summaryrefslogtreecommitdiffstats
path: root/third_party/aom
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom')
-rw-r--r--third_party/aom/.clang-format9
-rw-r--r--third_party/aom/.cmake-format.py102
-rw-r--r--third_party/aom/.mailmap113
-rw-r--r--third_party/aom/AUTHORS319
-rw-r--r--third_party/aom/CHANGELOG828
-rw-r--r--third_party/aom/CMakeLists.txt1035
-rw-r--r--third_party/aom/LICENSE27
-rw-r--r--third_party/aom/PATENTS108
-rw-r--r--third_party/aom/README.md677
-rw-r--r--third_party/aom/Sample.cfg35
-rw-r--r--third_party/aom/aom/aom.h127
-rw-r--r--third_party/aom/aom/aom_codec.h577
-rw-r--r--third_party/aom/aom/aom_decoder.h257
-rw-r--r--third_party/aom/aom/aom_encoder.h1144
-rw-r--r--third_party/aom/aom/aom_external_partition.h452
-rw-r--r--third_party/aom/aom/aom_frame_buffer.h84
-rw-r--r--third_party/aom/aom/aom_image.h448
-rw-r--r--third_party/aom/aom/aom_integer.h68
-rw-r--r--third_party/aom/aom/aomcx.h2205
-rw-r--r--third_party/aom/aom/aomdx.h604
-rw-r--r--third_party/aom/aom/exports_com42
-rw-r--r--third_party/aom/aom/exports_dec8
-rw-r--r--third_party/aom/aom/exports_enc17
-rw-r--r--third_party/aom/aom/exports_test4
-rw-r--r--third_party/aom/aom/internal/aom_codec_internal.h418
-rw-r--r--third_party/aom/aom/internal/aom_image_internal.h93
-rw-r--r--third_party/aom/aom/src/aom_codec.c199
-rw-r--r--third_party/aom/aom/src/aom_decoder.c137
-rw-r--r--third_party/aom/aom/src/aom_encoder.c333
-rw-r--r--third_party/aom/aom/src/aom_image.c401
-rw-r--r--third_party/aom/aom/src/aom_integer.c105
-rw-r--r--third_party/aom/aom_dsp/aom_convolve.c261
-rw-r--r--third_party/aom/aom_dsp/aom_dsp.cmake510
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_common.h99
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_rtcd.c18
-rwxr-xr-xthird_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl1798
-rw-r--r--third_party/aom/aom_dsp/aom_filter.h56
-rw-r--r--third_party/aom/aom_dsp/aom_simd.h36
-rw-r--r--third_party/aom/aom_dsp/aom_simd_inline.h24
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon.c349
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c460
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c408
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c154
-rw-r--r--third_party/aom/aom_dsp/arm/avg_neon.c309
-rw-r--r--third_party/aom/aom_dsp/arm/avg_pred_neon.c221
-rw-r--r--third_party/aom/aom_dsp/arm/avg_sve.c62
-rw-r--r--third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c492
-rw-r--r--third_party/aom/aom_dsp/arm/blend_neon.h125
-rw-r--r--third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c124
-rw-r--r--third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c106
-rw-r--r--third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h65
-rw-r--r--third_party/aom/aom_dsp/arm/dot_sve.h42
-rw-r--r--third_party/aom/aom_dsp/arm/fwd_txfm_neon.c304
-rw-r--r--third_party/aom/aom_dsp/arm/hadamard_neon.c325
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_avg_neon.c125
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c190
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c97
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c473
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c105
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c363
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c213
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c2730
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c1265
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c354
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c211
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c369
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_quantize_neon.c431
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sad_neon.c509
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c617
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sse_neon.c284
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sse_sve.c215
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c1497
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_neon.c502
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c92
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_sve.c430
-rw-r--r--third_party/aom/aom_dsp/arm/intrapred_neon.c3110
-rw-r--r--third_party/aom/aom_dsp/arm/loopfilter_neon.c1045
-rw-r--r--third_party/aom/aom_dsp/arm/masked_sad4d_neon.c562
-rw-r--r--third_party/aom/aom_dsp/arm/masked_sad_neon.c244
-rw-r--r--third_party/aom/aom_dsp/arm/mem_neon.h1253
-rw-r--r--third_party/aom/aom_dsp/arm/obmc_sad_neon.c250
-rw-r--r--third_party/aom/aom_dsp/arm/obmc_variance_neon.c290
-rw-r--r--third_party/aom/aom_dsp/arm/reinterpret_neon.h33
-rw-r--r--third_party/aom/aom_dsp/arm/sad_neon.c873
-rw-r--r--third_party/aom/aom_dsp/arm/sad_neon_dotprod.c530
-rw-r--r--third_party/aom/aom_dsp/arm/sadxd_neon.c514
-rw-r--r--third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c289
-rw-r--r--third_party/aom/aom_dsp/arm/sse_neon.c210
-rw-r--r--third_party/aom/aom_dsp/arm/sse_neon_dotprod.c223
-rw-r--r--third_party/aom/aom_dsp/arm/subpel_variance_neon.c1103
-rw-r--r--third_party/aom/aom_dsp/arm/subtract_neon.c166
-rw-r--r--third_party/aom/aom_dsp/arm/sum_neon.h311
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_neon.c574
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c154
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_sve.c402
-rw-r--r--third_party/aom/aom_dsp/arm/transpose_neon.h1263
-rw-r--r--third_party/aom/aom_dsp/arm/variance_neon.c470
-rw-r--r--third_party/aom/aom_dsp/arm/variance_neon_dotprod.c314
-rw-r--r--third_party/aom/aom_dsp/avg.c573
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.c55
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.h44
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.c137
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.h65
-rw-r--r--third_party/aom/aom_dsp/bitreader.c41
-rw-r--r--third_party/aom/aom_dsp/bitreader.h232
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.c116
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.h53
-rw-r--r--third_party/aom/aom_dsp/bitwriter.c40
-rw-r--r--third_party/aom/aom_dsp/bitwriter.h110
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.c141
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.h55
-rw-r--r--third_party/aom/aom_dsp/blend.h45
-rw-r--r--third_party/aom/aom_dsp/blend_a64_hmask.c71
-rw-r--r--third_party/aom/aom_dsp/blend_a64_mask.c349
-rw-r--r--third_party/aom/aom_dsp/blend_a64_vmask.c73
-rw-r--r--third_party/aom/aom_dsp/blk_sse_sum.c26
-rw-r--r--third_party/aom/aom_dsp/butteraugli.c109
-rw-r--r--third_party/aom/aom_dsp/butteraugli.h23
-rw-r--r--third_party/aom/aom_dsp/entcode.c49
-rw-r--r--third_party/aom/aom_dsp/entcode.h41
-rw-r--r--third_party/aom/aom_dsp/entdec.c247
-rw-r--r--third_party/aom/aom_dsp/entdec.h81
-rw-r--r--third_party/aom/aom_dsp/entenc.c374
-rw-r--r--third_party/aom/aom_dsp/entenc.h108
-rw-r--r--third_party/aom/aom_dsp/fastssim.c488
-rw-r--r--third_party/aom/aom_dsp/fft.c220
-rw-r--r--third_party/aom/aom_dsp/fft_common.h1056
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c368
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_detect.c167
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_detect.h80
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_match.c259
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_match.h41
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/disflow.c823
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/disflow.h106
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/flow_estimation.c60
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/flow_estimation.h95
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/ransac.c484
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/ransac.h35
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c80
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c104
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c558
-rw-r--r--third_party/aom/aom_dsp/fwd_txfm.c245
-rw-r--r--third_party/aom/aom_dsp/grain_params.h158
-rw-r--r--third_party/aom/aom_dsp/grain_table.c358
-rw-r--r--third_party/aom/aom_dsp/grain_table.h102
-rw-r--r--third_party/aom/aom_dsp/intrapred.c793
-rw-r--r--third_party/aom/aom_dsp/intrapred_common.h59
-rw-r--r--third_party/aom/aom_dsp/loopfilter.c997
-rw-r--r--third_party/aom/aom_dsp/mathutils.h145
-rw-r--r--third_party/aom/aom_dsp/noise_model.c1692
-rw-r--r--third_party/aom/aom_dsp/noise_model.h328
-rw-r--r--third_party/aom/aom_dsp/noise_util.c225
-rw-r--r--third_party/aom/aom_dsp/noise_util.h68
-rw-r--r--third_party/aom/aom_dsp/odintrin.c541
-rw-r--r--third_party/aom/aom_dsp/odintrin.h81
-rw-r--r--third_party/aom/aom_dsp/prob.h144
-rw-r--r--third_party/aom/aom_dsp/psnr.c454
-rw-r--r--third_party/aom/aom_dsp/psnr.h96
-rw-r--r--third_party/aom/aom_dsp/psnrhvs.c282
-rw-r--r--third_party/aom/aom_dsp/pyramid.c414
-rw-r--r--third_party/aom/aom_dsp/pyramid.h127
-rw-r--r--third_party/aom/aom_dsp/quantize.c472
-rw-r--r--third_party/aom/aom_dsp/quantize.h127
-rw-r--r--third_party/aom/aom_dsp/recenter.h61
-rw-r--r--third_party/aom/aom_dsp/rect.h35
-rw-r--r--third_party/aom/aom_dsp/sad.c389
-rw-r--r--third_party/aom/aom_dsp/sad_av1.c266
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics.h346
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_c.h898
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h659
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics.h377
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_c.h963
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h806
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h754
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics.h234
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_c.h966
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h489
-rw-r--r--third_party/aom/aom_dsp/sse.c59
-rw-r--r--third_party/aom/aom_dsp/ssim.c481
-rw-r--r--third_party/aom/aom_dsp/ssim.h104
-rw-r--r--third_party/aom/aom_dsp/subtract.c54
-rw-r--r--third_party/aom/aom_dsp/sum_squares.c90
-rw-r--r--third_party/aom/aom_dsp/txfm_common.h155
-rw-r--r--third_party/aom/aom_dsp/variance.c1234
-rw-r--r--third_party/aom/aom_dsp/variance.h127
-rw-r--r--third_party/aom/aom_dsp/vmaf.c192
-rw-r--r--third_party/aom/aom_dsp/vmaf.h41
-rw-r--r--third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c244
-rw-r--r--third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c633
-rw-r--r--third_party/aom/aom_dsp/x86/aom_asm_stubs.c95
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c256
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c308
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm613
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm367
-rw-r--r--third_party/aom/aom_dsp/x86/aom_quantize_avx.c282
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c1441
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c569
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c847
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm615
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm870
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm295
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm267
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_avx2.c897
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_sse2.c700
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_sse4.c59
-rw-r--r--third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h32
-rw-r--r--third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h49
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c36
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c1374
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c1560
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c285
-rw-r--r--third_party/aom/aom_dsp/x86/blend_mask_sse4.h237
-rw-r--r--third_party/aom/aom_dsp/x86/blend_sse4.h191
-rw-r--r--third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c185
-rw-r--r--third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c138
-rw-r--r--third_party/aom/aom_dsp/x86/common_avx2.h147
-rw-r--r--third_party/aom/aom_dsp/x86/convolve.h204
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_avx2.h922
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_common_intrin.h102
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse2.h122
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse4_1.h53
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_ssse3.h50
-rw-r--r--third_party/aom/aom_dsp/x86/fft_avx2.c74
-rw-r--r--third_party/aom/aom_dsp/x86/fft_sse2.c173
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h529
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c39
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h160
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm379
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c456
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c732
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c1248
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c351
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c439
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm259
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c984
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c66
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c1698
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c294
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c208
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm344
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_avx2.c720
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm524
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm1024
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c266
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_avx2.c904
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm318
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c735
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse4.c216
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm608
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_avx2.c4707
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2.c1411
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse4.c1307
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_ssse3.c2997
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_utils.h205
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_x86.h38
-rw-r--r--third_party/aom/aom_dsp/x86/inv_wht_sse2.asm107
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_sad_sse2.c238
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c161
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_avx2.c1016
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_sse2.c2973
-rw-r--r--third_party/aom/aom_dsp/x86/lpf_common_sse2.h721
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c266
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c389
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c400
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h33
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c1067
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h92
-rw-r--r--third_party/aom/aom_dsp/x86/mem_sse2.h167
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h58
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h54
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_avx2.c271
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_sse4.c269
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_avx2.c191
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_sse4.c382
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_avx2.c274
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_sse2.c125
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_ssse3.c192
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm302
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_x86.h202
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_avx2.c326
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_sse2.asm437
-rw-r--r--third_party/aom/aom_dsp/x86/sad_avx2.c219
-rw-r--r--third_party/aom/aom_dsp/x86/sad_impl_avx2.c181
-rw-r--r--third_party/aom/aom_dsp/x86/sad_sse2.asm432
-rw-r--r--third_party/aom/aom_dsp/x86/sse_avx2.c389
-rw-r--r--third_party/aom/aom_dsp/x86/sse_sse4.c355
-rw-r--r--third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm222
-rw-r--r--third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm1470
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_avx2.c109
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_sse2.asm147
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_avx2.c326
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.c478
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.h28
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms.h134
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms_avx2.h79
-rw-r--r--third_party/aom/aom_dsp/x86/transpose_sse2.h424
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_avx2.h357
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_sse2.h33
-rw-r--r--third_party/aom/aom_dsp/x86/variance_avx2.c961
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_avx2.c924
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_ssse3.c129
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c802
-rw-r--r--third_party/aom/aom_mem/aom_mem.c82
-rw-r--r--third_party/aom/aom_mem/aom_mem.cmake29
-rw-r--r--third_party/aom/aom_mem/aom_mem.h80
-rw-r--r--third_party/aom/aom_mem/include/aom_mem_intrnl.h29
-rw-r--r--third_party/aom/aom_ports/aarch32_cpudetect.c86
-rw-r--r--third_party/aom/aom_ports/aarch64_cpudetect.c188
-rw-r--r--third_party/aom/aom_ports/aom_once.h83
-rw-r--r--third_party/aom/aom_ports/aom_ports.cmake96
-rw-r--r--third_party/aom/aom_ports/aom_timer.h113
-rw-r--r--third_party/aom/aom_ports/arm.h45
-rw-r--r--third_party/aom/aom_ports/arm_cpudetect.h54
-rw-r--r--third_party/aom/aom_ports/bitops.h122
-rw-r--r--third_party/aom/aom_ports/emmintrin_compat.h56
-rw-r--r--third_party/aom/aom_ports/float.asm33
-rw-r--r--third_party/aom/aom_ports/mem.h102
-rw-r--r--third_party/aom/aom_ports/mem_ops.h228
-rw-r--r--third_party/aom/aom_ports/mem_ops_aligned.h173
-rw-r--r--third_party/aom/aom_ports/msvc.h75
-rw-r--r--third_party/aom/aom_ports/ppc.h30
-rw-r--r--third_party/aom/aom_ports/ppc_cpudetect.c82
-rw-r--r--third_party/aom/aom_ports/sanitizer.h38
-rw-r--r--third_party/aom/aom_ports/x86.h402
-rw-r--r--third_party/aom/aom_ports/x86_abi_support.asm416
-rw-r--r--third_party/aom/aom_scale/aom_scale.cmake37
-rw-r--r--third_party/aom/aom_scale/aom_scale.h23
-rw-r--r--third_party/aom/aom_scale/aom_scale_rtcd.c18
-rw-r--r--third_party/aom/aom_scale/aom_scale_rtcd.pl55
-rw-r--r--third_party/aom/aom_scale/generic/aom_scale.c506
-rw-r--r--third_party/aom/aom_scale/generic/gen_scalers.c201
-rw-r--r--third_party/aom/aom_scale/generic/yv12config.c312
-rw-r--r--third_party/aom/aom_scale/generic/yv12extend.c517
-rw-r--r--third_party/aom/aom_scale/yv12config.h214
-rw-r--r--third_party/aom/aom_util/aom_thread.c240
-rw-r--r--third_party/aom/aom_util/aom_thread.h236
-rw-r--r--third_party/aom/aom_util/aom_util.cmake34
-rw-r--r--third_party/aom/aom_util/debug_util.c293
-rw-r--r--third_party/aom/aom_util/debug_util.h69
-rw-r--r--third_party/aom/aom_util/endian_inl.h109
-rw-r--r--third_party/aom/aomedia_logo_200.pngbin0 -> 7052 bytes
-rw-r--r--third_party/aom/apps/aomdec.c1088
-rw-r--r--third_party/aom/apps/aomenc.c2688
-rw-r--r--third_party/aom/apps/aomenc.h59
-rw-r--r--third_party/aom/av1/arg_defs.c702
-rw-r--r--third_party/aom/av1/arg_defs.h246
-rw-r--r--third_party/aom/av1/av1.cmake715
-rw-r--r--third_party/aom/av1/av1_cx_iface.c4712
-rw-r--r--third_party/aom/av1/av1_cx_iface.h37
-rw-r--r--third_party/aom/av1/av1_dx_iface.c1777
-rw-r--r--third_party/aom/av1/av1_iface_common.h148
-rw-r--r--third_party/aom/av1/common/alloccommon.c506
-rw-r--r--third_party/aom/av1/common/alloccommon.h65
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.c4217
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.h154
-rw-r--r--third_party/aom/av1/common/arm/av1_txfm_neon.c30
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_hmask_neon.c102
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_vmask_neon.c112
-rw-r--r--third_party/aom/av1/common/arm/cdef_block_neon.c1355
-rw-r--r--third_party/aom/av1/common/arm/cfl_neon.c589
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon.c2719
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon.h1164
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c675
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c614
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.c1659
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.h538
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon_dotprod.c793
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon_i8mm.c702
-rw-r--r--third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c2031
-rw-r--r--third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c273
-rw-r--r--third_party/aom/av1/common/arm/highbd_convolve_neon.c2120
-rw-r--r--third_party/aom/av1/common/arm/highbd_convolve_neon.h148
-rw-r--r--third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c552
-rw-r--r--third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c5994
-rw-r--r--third_party/aom/av1/common/arm/highbd_reconinter_neon.c327
-rw-r--r--third_party/aom/av1/common/arm/highbd_reconintra_neon.c241
-rw-r--r--third_party/aom/av1/common/arm/highbd_warp_plane_neon.c317
-rw-r--r--third_party/aom/av1/common/arm/highbd_warp_plane_neon.h424
-rw-r--r--third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c403
-rw-r--r--third_party/aom/av1/common/arm/reconinter_neon.c217
-rw-r--r--third_party/aom/av1/common/arm/reconintra_neon.c392
-rw-r--r--third_party/aom/av1/common/arm/resize_neon.c1178
-rw-r--r--third_party/aom/av1/common/arm/selfguided_neon.c1595
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_neon.c276
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_neon.h367
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c291
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_sve.c284
-rw-r--r--third_party/aom/av1/common/arm/wiener_convolve_neon.c348
-rw-r--r--third_party/aom/av1/common/av1_common_int.h1882
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.c1841
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.h61
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d_cfg.h45
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm2d.c484
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.c2099
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.h150
-rw-r--r--third_party/aom/av1/common/av1_rtcd.c18
-rw-r--r--third_party/aom/av1/common/av1_rtcd_defs.pl655
-rw-r--r--third_party/aom/av1/common/av1_txfm.c278
-rw-r--r--third_party/aom/av1/common/av1_txfm.h256
-rw-r--r--third_party/aom/av1/common/blockd.c100
-rw-r--r--third_party/aom/av1/common/blockd.h1612
-rw-r--r--third_party/aom/av1/common/cdef.c466
-rw-r--r--third_party/aom/av1/common/cdef.h112
-rw-r--r--third_party/aom/av1/common/cdef_block.c426
-rw-r--r--third_party/aom/av1/common/cdef_block.h65
-rw-r--r--third_party/aom/av1/common/cdef_block_simd.h844
-rw-r--r--third_party/aom/av1/common/cfl.c434
-rw-r--r--third_party/aom/av1/common/cfl.h294
-rw-r--r--third_party/aom/av1/common/common.h61
-rw-r--r--third_party/aom/av1/common/common_data.c43
-rw-r--r--third_party/aom/av1/common/common_data.h432
-rw-r--r--third_party/aom/av1/common/convolve.c1508
-rw-r--r--third_party/aom/av1/common/convolve.h132
-rw-r--r--third_party/aom/av1/common/debugmodes.c113
-rw-r--r--third_party/aom/av1/common/entropy.c178
-rw-r--r--third_party/aom/av1/common/entropy.h182
-rw-r--r--third_party/aom/av1/common/entropymode.c1094
-rw-r--r--third_party/aom/av1/common/entropymode.h218
-rw-r--r--third_party/aom/av1/common/entropymv.c67
-rw-r--r--third_party/aom/av1/common/entropymv.h104
-rw-r--r--third_party/aom/av1/common/enums.h651
-rw-r--r--third_party/aom/av1/common/filter.h320
-rw-r--r--third_party/aom/av1/common/frame_buffers.c98
-rw-r--r--third_party/aom/av1/common/frame_buffers.h60
-rw-r--r--third_party/aom/av1/common/idct.c322
-rw-r--r--third_party/aom/av1/common/idct.h51
-rw-r--r--third_party/aom/av1/common/mv.h337
-rw-r--r--third_party/aom/av1/common/mvref_common.c1501
-rw-r--r--third_party/aom/av1/common/mvref_common.h342
-rw-r--r--third_party/aom/av1/common/obmc.h89
-rw-r--r--third_party/aom/av1/common/obu_util.c133
-rw-r--r--third_party/aom/av1/common/obu_util.h47
-rw-r--r--third_party/aom/av1/common/ppc/cfl_ppc.c152
-rw-r--r--third_party/aom/av1/common/pred_common.c501
-rw-r--r--third_party/aom/av1/common/pred_common.h377
-rw-r--r--third_party/aom/av1/common/quant_common.c12876
-rw-r--r--third_party/aom/av1/common/quant_common.h84
-rw-r--r--third_party/aom/av1/common/reconinter.c1169
-rw-r--r--third_party/aom/av1/common/reconinter.h489
-rw-r--r--third_party/aom/av1/common/reconinter_template.inc267
-rw-r--r--third_party/aom/av1/common/reconintra.c1798
-rw-r--r--third_party/aom/av1/common/reconintra.h158
-rw-r--r--third_party/aom/av1/common/resize.c1452
-rw-r--r--third_party/aom/av1/common/resize.h146
-rw-r--r--third_party/aom/av1/common/restoration.c1494
-rw-r--r--third_party/aom/av1/common/restoration.h471
-rw-r--r--third_party/aom/av1/common/scale.c57
-rw-r--r--third_party/aom/av1/common/scale.h87
-rw-r--r--third_party/aom/av1/common/scan.c2038
-rw-r--r--third_party/aom/av1/common/scan.h54
-rw-r--r--third_party/aom/av1/common/seg_common.c91
-rw-r--r--third_party/aom/av1/common/seg_common.h113
-rw-r--r--third_party/aom/av1/common/thread_common.c1250
-rw-r--r--third_party/aom/av1/common/thread_common.h345
-rw-r--r--third_party/aom/av1/common/tile_common.c249
-rw-r--r--third_party/aom/av1/common/tile_common.h75
-rw-r--r--third_party/aom/av1/common/timing.c92
-rw-r--r--third_party/aom/av1/common/timing.h55
-rw-r--r--third_party/aom/av1/common/token_cdfs.h3555
-rw-r--r--third_party/aom/av1/common/txb_common.c364
-rw-r--r--third_party/aom/av1/common/txb_common.h463
-rw-r--r--third_party/aom/av1/common/warped_motion.c918
-rw-r--r--third_party/aom/av1/common/warped_motion.h97
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c228
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c498
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c2254
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h71
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c2904
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h247
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse2.h321
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.c22
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.h72
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_avx2.c357
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_sse2.c40
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_sse4.c40
-rw-r--r--third_party/aom/av1/common/x86/cdef_block_ssse3.c40
-rw-r--r--third_party/aom/av1/common/x86/cfl_avx2.c495
-rw-r--r--third_party/aom/av1/common/x86/cfl_simd.h246
-rw-r--r--third_party/aom/av1/common/x86/cfl_sse2.c89
-rw-r--r--third_party/aom/av1/common/x86/cfl_ssse3.c397
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_avx2.c161
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c547
-rw-r--r--third_party/aom/av1/common/x86/convolve_avx2.c916
-rw-r--r--third_party/aom/av1/common/x86/convolve_sse2.c500
-rw-r--r--third_party/aom/av1/common/x86/filterintra_sse4.c350
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c200
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c421
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c414
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c4239
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c5830
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c849
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c381
-rw-r--r--third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h132
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c656
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c636
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c245
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c202
-rw-r--r--third_party/aom/av1/common/x86/intra_edge_sse4.c322
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_avx2.c1124
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_sse2.c606
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_ssse3.c230
-rw-r--r--third_party/aom/av1/common/x86/reconinter_avx2.c624
-rw-r--r--third_party/aom/av1/common/x86/reconinter_sse4.c154
-rw-r--r--third_party/aom/av1/common/x86/reconinter_ssse3.c120
-rw-r--r--third_party/aom/av1/common/x86/resize_ssse3.c974
-rw-r--r--third_party/aom/av1/common/x86/selfguided_avx2.c724
-rw-r--r--third_party/aom/av1/common/x86/selfguided_sse4.c662
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_avx2.c1210
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse4.c908
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_avx2.c242
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_sse2.c199
-rw-r--r--third_party/aom/av1/decoder/accounting.c140
-rw-r--r--third_party/aom/av1/decoder/accounting.h82
-rw-r--r--third_party/aom/av1/decoder/decodeframe.c5369
-rw-r--r--third_party/aom/av1/decoder/decodeframe.h84
-rw-r--r--third_party/aom/av1/decoder/decodemv.c1586
-rw-r--r--third_party/aom/av1/decoder/decodemv.h33
-rw-r--r--third_party/aom/av1/decoder/decoder.c538
-rw-r--r--third_party/aom/av1/decoder/decoder.h452
-rw-r--r--third_party/aom/av1/decoder/decodetxb.c381
-rw-r--r--third_party/aom/av1/decoder/decodetxb.h34
-rw-r--r--third_party/aom/av1/decoder/detokenize.c78
-rw-r--r--third_party/aom/av1/decoder/detokenize.h29
-rw-r--r--third_party/aom/av1/decoder/dthread.h51
-rw-r--r--third_party/aom/av1/decoder/grain_synthesis.c1461
-rw-r--r--third_party/aom/av1/decoder/grain_synthesis.h66
-rw-r--r--third_party/aom/av1/decoder/inspection.c162
-rw-r--r--third_party/aom/av1/decoder/inspection.h91
-rw-r--r--third_party/aom/av1/decoder/obu.c1101
-rw-r--r--third_party/aom/av1/decoder/obu.h31
-rw-r--r--third_party/aom/av1/encoder/allintra_vis.c1055
-rw-r--r--third_party/aom/av1/encoder/allintra_vis.h46
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.c175
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.h37
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.c657
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.h332
-rw-r--r--third_party/aom/av1/encoder/aq_variance.c220
-rw-r--r--third_party/aom/av1/encoder/aq_variance.h35
-rw-r--r--third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c61
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_error_neon.c95
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_error_sve.c109
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c3090
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c146
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c115
-rw-r--r--third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c360
-rw-r--r--third_party/aom/av1/encoder/arm/neon/cnn_neon.c1144
-rw-r--r--third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c646
-rw-r--r--third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c2619
-rw-r--r--third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c1207
-rw-r--r--third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c49
-rw-r--r--third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c562
-rw-r--r--third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c73
-rw-r--r--third_party/aom/av1/encoder/arm/neon/ml_neon.c339
-rw-r--r--third_party/aom/av1/encoder/arm/neon/pickrst_neon.c1217
-rw-r--r--third_party/aom/av1/encoder/arm/neon/pickrst_neon.h188
-rw-r--r--third_party/aom/av1/encoder/arm/neon/quantize_neon.c928
-rw-r--r--third_party/aom/av1/encoder/arm/neon/rdopt_neon.c459
-rw-r--r--third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c288
-rw-r--r--third_party/aom/av1/encoder/arm/neon/shift_neon.h49
-rw-r--r--third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c548
-rw-r--r--third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c299
-rw-r--r--third_party/aom/av1/encoder/arm/neon/txfm_neon.h26
-rw-r--r--third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c131
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.c1885
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.h49
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h19
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm2d.c423
-rw-r--r--third_party/aom/av1/encoder/av1_ml_partition_models.h179
-rw-r--r--third_party/aom/av1/encoder/av1_noise_estimate.c296
-rw-r--r--third_party/aom/av1/encoder/av1_noise_estimate.h50
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.c917
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.h224
-rw-r--r--third_party/aom/av1/encoder/av1_temporal_denoiser.c805
-rw-r--r--third_party/aom/av1/encoder/av1_temporal_denoiser.h134
-rw-r--r--third_party/aom/av1/encoder/bitstream.c4248
-rw-r--r--third_party/aom/av1/encoder/bitstream.h137
-rw-r--r--third_party/aom/av1/encoder/block.h1515
-rw-r--r--third_party/aom/av1/encoder/blockiness.c140
-rw-r--r--third_party/aom/av1/encoder/cnn.c1189
-rw-r--r--third_party/aom/av1/encoder/cnn.h191
-rw-r--r--third_party/aom/av1/encoder/compound_type.c1678
-rw-r--r--third_party/aom/av1/encoder/compound_type.h52
-rw-r--r--third_party/aom/av1/encoder/context_tree.c311
-rw-r--r--third_party/aom/av1/encoder/context_tree.h142
-rw-r--r--third_party/aom/av1/encoder/cost.c46
-rw-r--r--third_party/aom/av1/encoder/cost.h51
-rw-r--r--third_party/aom/av1/encoder/deltaq4_model.c7776
-rw-r--r--third_party/aom/av1/encoder/dwt.c146
-rw-r--r--third_party/aom/av1/encoder/dwt.h27
-rw-r--r--third_party/aom/av1/encoder/enc_enums.h268
-rw-r--r--third_party/aom/av1/encoder/encode_strategy.c1767
-rw-r--r--third_party/aom/av1/encoder/encode_strategy.h138
-rw-r--r--third_party/aom/av1/encoder/encodeframe.c2408
-rw-r--r--third_party/aom/av1/encoder/encodeframe.h55
-rw-r--r--third_party/aom/av1/encoder/encodeframe_utils.c1775
-rw-r--r--third_party/aom/av1/encoder/encodeframe_utils.h595
-rw-r--r--third_party/aom/av1/encoder/encodemb.c866
-rw-r--r--third_party/aom/av1/encoder/encodemb.h180
-rw-r--r--third_party/aom/av1/encoder/encodemv.c345
-rw-r--r--third_party/aom/av1/encoder/encodemv.h110
-rw-r--r--third_party/aom/av1/encoder/encoder.c5409
-rw-r--r--third_party/aom/av1/encoder/encoder.h4512
-rw-r--r--third_party/aom/av1/encoder/encoder_alloc.h531
-rw-r--r--third_party/aom/av1/encoder/encoder_utils.c1503
-rw-r--r--third_party/aom/av1/encoder/encoder_utils.h1141
-rw-r--r--third_party/aom/av1/encoder/encodetxb.c886
-rw-r--r--third_party/aom/av1/encoder/encodetxb.h276
-rw-r--r--third_party/aom/av1/encoder/ethread.c3469
-rw-r--r--third_party/aom/av1/encoder/ethread.h133
-rw-r--r--third_party/aom/av1/encoder/extend.c163
-rw-r--r--third_party/aom/av1/encoder/extend.h29
-rw-r--r--third_party/aom/av1/encoder/external_partition.c98
-rw-r--r--third_party/aom/av1/encoder/external_partition.h58
-rw-r--r--third_party/aom/av1/encoder/firstpass.c1600
-rw-r--r--third_party/aom/av1/encoder/firstpass.h603
-rw-r--r--third_party/aom/av1/encoder/global_motion.c575
-rw-r--r--third_party/aom/av1/encoder/global_motion.h157
-rw-r--r--third_party/aom/av1/encoder/global_motion_facade.c450
-rw-r--r--third_party/aom/av1/encoder/global_motion_facade.h58
-rw-r--r--third_party/aom/av1/encoder/gop_structure.c867
-rw-r--r--third_party/aom/av1/encoder/gop_structure.h95
-rw-r--r--third_party/aom/av1/encoder/grain_test_vectors.h781
-rw-r--r--third_party/aom/av1/encoder/hash.c126
-rw-r--r--third_party/aom/av1/encoder/hash.h53
-rw-r--r--third_party/aom/av1/encoder/hash_motion.c503
-rw-r--r--third_party/aom/av1/encoder/hash_motion.h103
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.c370
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.h40
-rw-r--r--third_party/aom/av1/encoder/interp_search.c801
-rw-r--r--third_party/aom/av1/encoder/interp_search.h205
-rw-r--r--third_party/aom/av1/encoder/intra_mode_search.c1739
-rw-r--r--third_party/aom/av1/encoder/intra_mode_search.h329
-rw-r--r--third_party/aom/av1/encoder/intra_mode_search_utils.h690
-rw-r--r--third_party/aom/av1/encoder/k_means_template.h151
-rw-r--r--third_party/aom/av1/encoder/level.c1397
-rw-r--r--third_party/aom/av1/encoder/level.h221
-rw-r--r--third_party/aom/av1/encoder/lookahead.c222
-rw-r--r--third_party/aom/av1/encoder/lookahead.h138
-rw-r--r--third_party/aom/av1/encoder/mcomp.c3998
-rw-r--r--third_party/aom/av1/encoder/mcomp.h398
-rw-r--r--third_party/aom/av1/encoder/mcomp_structs.h109
-rw-r--r--third_party/aom/av1/encoder/misc_model_weights.h696
-rw-r--r--third_party/aom/av1/encoder/ml.c171
-rw-r--r--third_party/aom/av1/encoder/ml.h85
-rw-r--r--third_party/aom/av1/encoder/mode_prune_model_weights.h185
-rw-r--r--third_party/aom/av1/encoder/model_rd.h270
-rw-r--r--third_party/aom/av1/encoder/motion_search_facade.c1071
-rw-r--r--third_party/aom/av1/encoder/motion_search_facade.h145
-rw-r--r--third_party/aom/av1/encoder/mv_prec.c429
-rw-r--r--third_party/aom/av1/encoder/mv_prec.h52
-rw-r--r--third_party/aom/av1/encoder/nonrd_opt.c933
-rw-r--r--third_party/aom/av1/encoder/nonrd_opt.h575
-rw-r--r--third_party/aom/av1/encoder/nonrd_pickmode.c3537
-rw-r--r--third_party/aom/av1/encoder/optical_flow.c1113
-rw-r--r--third_party/aom/av1/encoder/optical_flow.h76
-rw-r--r--third_party/aom/av1/encoder/palette.c975
-rw-r--r--third_party/aom/av1/encoder/palette.h215
-rw-r--r--third_party/aom/av1/encoder/partition_cnn_weights.h2139
-rw-r--r--third_party/aom/av1/encoder/partition_model_weights.h5646
-rw-r--r--third_party/aom/av1/encoder/partition_search.c6263
-rw-r--r--third_party/aom/av1/encoder/partition_search.h81
-rw-r--r--third_party/aom/av1/encoder/partition_strategy.c2573
-rw-r--r--third_party/aom/av1/encoder/partition_strategy.h265
-rw-r--r--third_party/aom/av1/encoder/pass2_strategy.c4488
-rw-r--r--third_party/aom/av1/encoder/pass2_strategy.h149
-rw-r--r--third_party/aom/av1/encoder/pickcdef.c958
-rw-r--r--third_party/aom/av1/encoder/pickcdef.h261
-rw-r--r--third_party/aom/av1/encoder/picklpf.c339
-rw-r--r--third_party/aom/av1/encoder/picklpf.h165
-rw-r--r--third_party/aom/av1/encoder/pickrst.c2217
-rw-r--r--third_party/aom/av1/encoder/pickrst.h126
-rw-r--r--third_party/aom/av1/encoder/pustats.h198
-rw-r--r--third_party/aom/av1/encoder/random.h85
-rw-r--r--third_party/aom/av1/encoder/ratectrl.c3587
-rw-r--r--third_party/aom/av1/encoder/ratectrl.h864
-rw-r--r--third_party/aom/av1/encoder/rc_utils.h469
-rw-r--r--third_party/aom/av1/encoder/rd.c1580
-rw-r--r--third_party/aom/av1/encoder/rd.h390
-rw-r--r--third_party/aom/av1/encoder/rdopt.c6598
-rw-r--r--third_party/aom/av1/encoder/rdopt.h327
-rw-r--r--third_party/aom/av1/encoder/rdopt_data_defs.h294
-rw-r--r--third_party/aom/av1/encoder/rdopt_utils.h797
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.c701
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.h94
-rw-r--r--third_party/aom/av1/encoder/saliency_map.c1414
-rw-r--r--third_party/aom/av1/encoder/saliency_map.h28
-rw-r--r--third_party/aom/av1/encoder/segmentation.c54
-rw-r--r--third_party/aom/av1/encoder/segmentation.h38
-rw-r--r--third_party/aom/av1/encoder/sorting_network.h140
-rw-r--r--third_party/aom/av1/encoder/sparse_linear_solver.c472
-rw-r--r--third_party/aom/av1/encoder/sparse_linear_solver.h67
-rw-r--r--third_party/aom/av1/encoder/speed_features.c2715
-rw-r--r--third_party/aom/av1/encoder/speed_features.h2025
-rw-r--r--third_party/aom/av1/encoder/superres_scale.c423
-rw-r--r--third_party/aom/av1/encoder/superres_scale.h28
-rw-r--r--third_party/aom/av1/encoder/svc_layercontext.c701
-rw-r--r--third_party/aom/av1/encoder/svc_layercontext.h325
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.c1520
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.h458
-rw-r--r--third_party/aom/av1/encoder/thirdpass.c877
-rw-r--r--third_party/aom/av1/encoder/thirdpass.h197
-rw-r--r--third_party/aom/av1/encoder/tokenize.c396
-rw-r--r--third_party/aom/av1/encoder/tokenize.h159
-rw-r--r--third_party/aom/av1/encoder/tpl_model.c2511
-rw-r--r--third_party/aom/av1/encoder/tpl_model.h794
-rw-r--r--third_party/aom/av1/encoder/tune_butteraugli.c313
-rw-r--r--third_party/aom/av1/encoder/tune_butteraugli.h45
-rw-r--r--third_party/aom/av1/encoder/tune_vmaf.c1112
-rw-r--r--third_party/aom/av1/encoder/tune_vmaf.h63
-rw-r--r--third_party/aom/av1/encoder/tx_prune_model_weights.h3422
-rw-r--r--third_party/aom/av1/encoder/tx_search.c3830
-rw-r--r--third_party/aom/av1/encoder/tx_search.h226
-rw-r--r--third_party/aom/av1/encoder/txb_rdopt.c659
-rw-r--r--third_party/aom/av1/encoder/txb_rdopt.h160
-rw-r--r--third_party/aom/av1/encoder/txb_rdopt_utils.h236
-rw-r--r--third_party/aom/av1/encoder/var_based_part.c1914
-rw-r--r--third_party/aom/av1/encoder/var_based_part.h104
-rw-r--r--third_party/aom/av1/encoder/wedge_utils.c125
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c1409
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c3010
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c336
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h96
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c2673
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h253
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c137
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c195
-rw-r--r--third_party/aom/av1/encoder/x86/av1_k_means_avx2.c132
-rw-r--r--third_party/aom/av1/encoder/x86/av1_k_means_sse2.c124
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_avx2.c414
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_sse2.c289
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm204
-rw-r--r--third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm222
-rw-r--r--third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c328
-rw-r--r--third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h144
-rw-r--r--third_party/aom/av1/encoder/x86/cnn_avx2.c532
-rw-r--r--third_party/aom/av1/encoder/x86/dct_sse2.asm82
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_avx2.c122
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse2.c505
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse4.c84
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c210
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_sse2.c75
-rw-r--r--third_party/aom/av1/encoder/x86/error_sse2.asm88
-rw-r--r--third_party/aom/av1/encoder/x86/hash_sse42.c53
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c64
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c74
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c3132
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c2629
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c466
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c341
-rw-r--r--third_party/aom/av1/encoder/x86/ml_avx2.c240
-rw-r--r--third_party/aom/av1/encoder/x86/ml_sse3.c336
-rw-r--r--third_party/aom/av1/encoder/x86/ml_sse3.h29
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_avx2.c2348
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_sse4.c1483
-rw-r--r--third_party/aom/av1/encoder/x86/rdopt_avx2.c254
-rw-r--r--third_party/aom/av1/encoder/x86/rdopt_sse4.c272
-rw-r--r--third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c347
-rw-r--r--third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c67
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_avx2.c647
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_sse2.c320
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_avx2.c215
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_sse2.c254
-rw-r--r--third_party/aom/av1/exports_com2
-rw-r--r--third_party/aom/av1/exports_dec3
-rw-r--r--third_party/aom/av1/exports_enc2
-rw-r--r--third_party/aom/av1/exports_ident2
-rw-r--r--third_party/aom/av1/exports_test2
-rw-r--r--third_party/aom/av1/ratectrl_rtc.cc375
-rw-r--r--third_party/aom/av1/ratectrl_rtc.h127
-rw-r--r--third_party/aom/build/cmake/aom_config.c.template13
-rw-r--r--third_party/aom/build/cmake/aom_config_defaults.cmake235
-rw-r--r--third_party/aom/build/cmake/aom_configure.cmake489
-rw-r--r--third_party/aom/build/cmake/aom_experiment_deps.cmake24
-rw-r--r--third_party/aom/build/cmake/aom_install.cmake98
-rw-r--r--third_party/aom/build/cmake/aom_optimization.cmake279
-rw-r--r--third_party/aom/build/cmake/compiler_flags.cmake385
-rw-r--r--third_party/aom/build/cmake/compiler_tests.cmake179
-rw-r--r--third_party/aom/build/cmake/cpu.cmake108
-rw-r--r--third_party/aom/build/cmake/dist.cmake64
-rw-r--r--third_party/aom/build/cmake/exports.cmake76
-rw-r--r--third_party/aom/build/cmake/exports_sources.cmake35
-rw-r--r--third_party/aom/build/cmake/generate_aom_config_templates.cmake92
-rw-r--r--third_party/aom/build/cmake/generate_exports.cmake69
-rw-r--r--third_party/aom/build/cmake/pkg_config.cmake69
-rwxr-xr-xthird_party/aom/build/cmake/rtcd.pl430
-rw-r--r--third_party/aom/build/cmake/sanitizers.cmake46
-rw-r--r--third_party/aom/build/cmake/toolchains/android.cmake53
-rw-r--r--third_party/aom/build/cmake/toolchains/arm-ios-common.cmake24
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-ios.cmake23
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-linux-clang.cmake30
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake40
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-macos.cmake16
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake36
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-ios.cmake31
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake46
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake39
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7s-ios.cmake31
-rw-r--r--third_party/aom/build/cmake/toolchains/i686-linux-gcc.cmake34
-rw-r--r--third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake24
-rw-r--r--third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake36
-rw-r--r--third_party/aom/build/cmake/toolchains/riscv-linux-gcc.cmake36
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake28
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-linux.cmake20
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-macos.cmake19
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake40
-rw-r--r--third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake25
-rw-r--r--third_party/aom/build/cmake/toolchains/x86_64-macos.cmake16
-rw-r--r--third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake37
-rw-r--r--third_party/aom/build/cmake/util.cmake173
-rw-r--r--third_party/aom/build/cmake/version.cmake67
-rwxr-xr-xthird_party/aom/build/cmake/version.pl114
-rw-r--r--third_party/aom/codereview.settings4
-rw-r--r--third_party/aom/common/args.c248
-rw-r--r--third_party/aom/common/args.h40
-rw-r--r--third_party/aom/common/args_helper.c221
-rw-r--r--third_party/aom/common/args_helper.h79
-rw-r--r--third_party/aom/common/av1_config.c511
-rw-r--r--third_party/aom/common/av1_config.h86
-rw-r--r--third_party/aom/common/ivf_dec.cmake28
-rw-r--r--third_party/aom/common/ivfdec.c112
-rw-r--r--third_party/aom/common/ivfdec.h30
-rw-r--r--third_party/aom/common/ivfenc.c52
-rw-r--r--third_party/aom/common/ivfenc.h34
-rw-r--r--third_party/aom/common/md5_utils.c257
-rw-r--r--third_party/aom/common/md5_utils.h49
-rw-r--r--third_party/aom/common/obudec.c512
-rw-r--r--third_party/aom/common/obudec.h48
-rw-r--r--third_party/aom/common/rawenc.c99
-rw-r--r--third_party/aom/common/rawenc.h32
-rw-r--r--third_party/aom/common/tools_common.c636
-rw-r--r--third_party/aom/common/tools_common.h208
-rw-r--r--third_party/aom/common/video_common.h25
-rw-r--r--third_party/aom/common/video_reader.c135
-rw-r--r--third_party/aom/common/video_reader.h60
-rw-r--r--third_party/aom/common/video_writer.c83
-rw-r--r--third_party/aom/common/video_writer.h47
-rw-r--r--third_party/aom/common/warnings.c97
-rw-r--r--third_party/aom/common/warnings.h34
-rw-r--r--third_party/aom/common/webmdec.cc248
-rw-r--r--third_party/aom/common/webmdec.h71
-rw-r--r--third_party/aom/common/webmenc.cc242
-rw-r--r--third_party/aom/common/webmenc.h71
-rw-r--r--third_party/aom/common/y4menc.c108
-rw-r--r--third_party/aom/common/y4menc.h39
-rw-r--r--third_party/aom/common/y4minput.c1222
-rw-r--r--third_party/aom/common/y4minput.h82
-rw-r--r--third_party/aom/doc/AlgorithmDescription.md799
-rw-r--r--third_party/aom/doc/dev_guide/av1_decoder.dox11
-rw-r--r--third_party/aom/doc/dev_guide/av1_encoder.dox1617
-rw-r--r--third_party/aom/doc/dev_guide/av1encoderflow.pngbin0 -> 97167 bytes
-rw-r--r--third_party/aom/doc/dev_guide/av1partitions.pngbin0 -> 115004 bytes
-rw-r--r--third_party/aom/doc/dev_guide/coeff_coding.pngbin0 -> 17955 bytes
-rw-r--r--third_party/aom/doc/dev_guide/filter_flow.pngbin0 -> 30616 bytes
-rw-r--r--third_party/aom/doc/dev_guide/filter_thr.pngbin0 -> 12969 bytes
-rw-r--r--third_party/aom/doc/dev_guide/genericcodecflow.pngbin0 -> 46815 bytes
-rw-r--r--third_party/aom/doc/dev_guide/gf_group.pngbin0 -> 121402 bytes
-rw-r--r--third_party/aom/doc/dev_guide/partition.pngbin0 -> 32428 bytes
-rw-r--r--third_party/aom/doc/dev_guide/tplgfgroupdiagram.pngbin0 -> 31598 bytes
-rw-r--r--third_party/aom/doc/img/edge_direction.svg6319
-rw-r--r--third_party/aom/doc/img/equ_dir_search.svg206
-rw-r--r--third_party/aom/doc/img/equ_dual_self_guided.svg71
-rw-r--r--third_party/aom/doc/img/equ_dual_self_para.svg69
-rw-r--r--third_party/aom/doc/img/equ_edge_direction.svg121
-rw-r--r--third_party/aom/doc/img/equ_guided_filter.svg53
-rw-r--r--third_party/aom/doc/img/equ_wiener_filter.svg51
-rw-r--r--third_party/aom/doc/img/inter_motion_field.svg219
-rw-r--r--third_party/aom/doc/img/inter_obmc.svg61
-rw-r--r--third_party/aom/doc/img/inter_spatial_mvp.svg215
-rw-r--r--third_party/aom/doc/img/inter_tmvp_positions.svg99
-rw-r--r--third_party/aom/doc/img/inter_tx_partition.svg87
-rw-r--r--third_party/aom/doc/img/intra_cfl.svg193
-rw-r--r--third_party/aom/doc/img/intra_directional.svg192
-rw-r--r--third_party/aom/doc/img/intra_paeth.svg181
-rw-r--r--third_party/aom/doc/img/intra_recursive.svg710
-rw-r--r--third_party/aom/doc/img/intra_tx_partition.svg142
-rw-r--r--third_party/aom/doc/img/loop_restoration.svg114
-rw-r--r--third_party/aom/doc/img/partition_codingblock.svg225
-rw-r--r--third_party/aom/doc/img/primary_tap.svg1589
-rw-r--r--third_party/aom/doc/img/quant_ac.svg1
-rw-r--r--third_party/aom/doc/img/quant_dc.svg1
-rw-r--r--third_party/aom/doc/img/scc_intrabc.svg348
-rw-r--r--third_party/aom/doc/img/secondary_tap.svg857
-rw-r--r--third_party/aom/doc/img/tx_basis.svg1
-rw-r--r--third_party/aom/doc/img/tx_cands_large.svg1
-rw-r--r--third_party/aom/doc/img/tx_cands_small.svg1
-rw-r--r--third_party/aom/doc/img/tx_chroma.svg1
-rw-r--r--third_party/aom/doc/img/tx_partition.svg1
-rw-r--r--third_party/aom/doc/img/tx_set.svg1
-rw-r--r--third_party/aom/docs.cmake345
-rw-r--r--third_party/aom/examples/analyzer.cc722
-rw-r--r--third_party/aom/examples/aom_cx_set_ref.c392
-rw-r--r--third_party/aom/examples/av1_dec_fuzzer.cc67
-rw-r--r--third_party/aom/examples/av1_dec_fuzzer.dict5
-rwxr-xr-xthird_party/aom/examples/build_av1_dec_fuzzer.sh70
-rw-r--r--third_party/aom/examples/decode_to_md5.c130
-rw-r--r--third_party/aom/examples/decode_with_drops.c144
-rw-r--r--third_party/aom/examples/encoder_util.c136
-rw-r--r--third_party/aom/examples/encoder_util.h40
-rw-r--r--third_party/aom/examples/inspect.c963
-rw-r--r--third_party/aom/examples/lightfield_bitstream_parsing.c415
-rw-r--r--third_party/aom/examples/lightfield_decoder.c381
-rw-r--r--third_party/aom/examples/lightfield_encoder.c525
-rw-r--r--third_party/aom/examples/lightfield_tile_list_decoder.c232
-rw-r--r--third_party/aom/examples/lossless_encoder.c137
-rw-r--r--third_party/aom/examples/noise_model.c434
-rw-r--r--third_party/aom/examples/photon_noise_table.c398
-rw-r--r--third_party/aom/examples/scalable_decoder.c184
-rw-r--r--third_party/aom/examples/scalable_encoder.c288
-rw-r--r--third_party/aom/examples/set_maps.c219
-rw-r--r--third_party/aom/examples/simple_decoder.c145
-rw-r--r--third_party/aom/examples/simple_encoder.c259
-rw-r--r--third_party/aom/examples/svc_encoder_rtc.cc2062
-rw-r--r--third_party/aom/examples/twopass_encoder.c254
-rw-r--r--third_party/aom/keywords.dox51
-rw-r--r--third_party/aom/libs.doxy_template2447
-rw-r--r--third_party/aom/mainpage.dox68
-rw-r--r--third_party/aom/stats/aomstats.c112
-rw-r--r--third_party/aom/stats/aomstats.h44
-rw-r--r--third_party/aom/stats/rate_hist.c301
-rw-r--r--third_party/aom/stats/rate_hist.h41
-rw-r--r--third_party/aom/test/accounting_test.cc75
-rw-r--r--third_party/aom/test/acm_random.h88
-rw-r--r--third_party/aom/test/active_map_test.cc97
-rw-r--r--third_party/aom/test/allintra_end_to_end_test.cc145
-rw-r--r--third_party/aom/test/altref_test.cc215
-rw-r--r--third_party/aom/test/aom_image_test.cc62
-rw-r--r--third_party/aom/test/aom_integer_test.cc177
-rw-r--r--third_party/aom/test/aom_mem_test.cc34
-rwxr-xr-xthird_party/aom/test/aomcx_set_ref.sh58
-rwxr-xr-xthird_party/aom/test/aomdec.sh219
-rwxr-xr-xthird_party/aom/test/aomenc.sh306
-rw-r--r--third_party/aom/test/aq_segment_test.cc110
-rw-r--r--third_party/aom/test/arf_freq_test.cc218
-rwxr-xr-xthird_party/aom/test/av1_c_vs_simd_encode.sh566
-rw-r--r--third_party/aom/test/av1_common_int_test.cc22
-rw-r--r--third_party/aom/test/av1_config_test.cc164
-rw-r--r--third_party/aom/test/av1_convolve_scale_test.cc561
-rw-r--r--third_party/aom/test/av1_convolve_test.cc2447
-rw-r--r--third_party/aom/test/av1_encoder_parms_get_to_decoder.cc160
-rw-r--r--third_party/aom/test/av1_ext_tile_test.cc212
-rw-r--r--third_party/aom/test/av1_external_partition_test.cc702
-rw-r--r--third_party/aom/test/av1_fwd_txfm1d_test.cc108
-rw-r--r--third_party/aom/test/av1_fwd_txfm2d_test.cc692
-rw-r--r--third_party/aom/test/av1_highbd_iht_test.cc376
-rw-r--r--third_party/aom/test/av1_horz_only_frame_superres_test.cc385
-rw-r--r--third_party/aom/test/av1_inv_txfm1d_test.cc157
-rw-r--r--third_party/aom/test/av1_inv_txfm2d_test.cc406
-rw-r--r--third_party/aom/test/av1_k_means_test.cc295
-rw-r--r--third_party/aom/test/av1_key_value_api_test.cc133
-rw-r--r--third_party/aom/test/av1_nn_predict_test.cc228
-rw-r--r--third_party/aom/test/av1_quantize_test.cc264
-rw-r--r--third_party/aom/test/av1_round_shift_array_test.cc131
-rw-r--r--third_party/aom/test/av1_softmax_test.cc122
-rw-r--r--third_party/aom/test/av1_temporal_denoiser_test.cc140
-rw-r--r--third_party/aom/test/av1_txfm_test.cc398
-rw-r--r--third_party/aom/test/av1_txfm_test.h161
-rw-r--r--third_party/aom/test/av1_wedge_utils_test.cc411
-rw-r--r--third_party/aom/test/avg_test.cc1150
-rw-r--r--third_party/aom/test/avif_progressive_test.cc279
-rwxr-xr-xthird_party/aom/test/best_encode.sh101
-rw-r--r--third_party/aom/test/binary_codes_test.cc83
-rw-r--r--third_party/aom/test/blend_a64_mask_1d_test.cc342
-rw-r--r--third_party/aom/test/blend_a64_mask_test.cc649
-rw-r--r--third_party/aom/test/block_test.cc209
-rw-r--r--third_party/aom/test/boolcoder_test.cc173
-rw-r--r--third_party/aom/test/borders_test.cc82
-rw-r--r--third_party/aom/test/cdef_test.cc962
-rw-r--r--third_party/aom/test/cfl_test.cc597
-rw-r--r--third_party/aom/test/cnn_test.cc2661
-rw-r--r--third_party/aom/test/codec_factory.h178
-rw-r--r--third_party/aom/test/coding_path_sync.cc212
-rw-r--r--third_party/aom/test/comp_avg_pred_test.cc249
-rw-r--r--third_party/aom/test/comp_avg_pred_test.h757
-rw-r--r--third_party/aom/test/comp_mask_pred_test.cc856
-rw-r--r--third_party/aom/test/convolve_test.cc922
-rw-r--r--third_party/aom/test/corner_match_test.cc145
-rw-r--r--third_party/aom/test/cpu_speed_test.cc175
-rw-r--r--third_party/aom/test/cpu_used_firstpass_test.cc129
-rw-r--r--third_party/aom/test/datarate_test.cc712
-rw-r--r--third_party/aom/test/datarate_test.h223
-rw-r--r--third_party/aom/test/decode_api_test.cc62
-rw-r--r--third_party/aom/test/decode_multithreaded_test.cc182
-rw-r--r--third_party/aom/test/decode_perf_test.cc246
-rw-r--r--third_party/aom/test/decode_scalability_test.cc121
-rw-r--r--third_party/aom/test/decode_test_driver.cc114
-rw-r--r--third_party/aom/test/decode_test_driver.h165
-rwxr-xr-xthird_party/aom/test/decode_to_md5.sh77
-rwxr-xr-xthird_party/aom/test/decode_with_drops.sh68
-rw-r--r--third_party/aom/test/deltaq_mode_test.cc209
-rw-r--r--third_party/aom/test/disflow_test.cc122
-rw-r--r--third_party/aom/test/divu_small_test.cc41
-rw-r--r--third_party/aom/test/dr_prediction_test.cc542
-rw-r--r--third_party/aom/test/dropframe_encode_test.cc62
-rwxr-xr-xthird_party/aom/test/dump_obu.sh77
-rw-r--r--third_party/aom/test/ec_test.cc154
-rw-r--r--third_party/aom/test/encode_api_test.cc659
-rw-r--r--third_party/aom/test/encode_perf_test.cc183
-rw-r--r--third_party/aom/test/encode_small_width_height_test.cc246
-rw-r--r--third_party/aom/test/encode_test_driver.cc302
-rw-r--r--third_party/aom/test/encode_test_driver.h286
-rw-r--r--third_party/aom/test/encodemb_test.cc245
-rw-r--r--third_party/aom/test/encodetxb_test.cc289
-rw-r--r--third_party/aom/test/end_to_end_psnr_test.cc212
-rw-r--r--third_party/aom/test/end_to_end_qmpsnr_test.cc193
-rw-r--r--third_party/aom/test/end_to_end_ssim_test.cc189
-rw-r--r--third_party/aom/test/error_block_test.cc319
-rw-r--r--third_party/aom/test/error_resilience_test.cc465
-rw-r--r--third_party/aom/test/ethread_test.cc577
-rwxr-xr-xthird_party/aom/test/examples.sh37
-rw-r--r--third_party/aom/test/external_frame_buffer_test.cc547
-rw-r--r--third_party/aom/test/fdct4x4_test.cc124
-rw-r--r--third_party/aom/test/fft_test.cc268
-rw-r--r--third_party/aom/test/film_grain_table_test.cc381
-rw-r--r--third_party/aom/test/filterintra_test.cc197
-rw-r--r--third_party/aom/test/firstpass_test.cc166
-rw-r--r--third_party/aom/test/force_key_frame_test.cc93
-rw-r--r--third_party/aom/test/forced_max_frame_width_height_test.cc280
-rw-r--r--third_party/aom/test/frame_parallel_enc_test.cc197
-rw-r--r--third_party/aom/test/frame_size_tests.cc388
-rw-r--r--third_party/aom/test/function_equivalence_test.h68
-rw-r--r--third_party/aom/test/fwht4x4_test.cc223
-rw-r--r--third_party/aom/test/gf_pyr_height_test.cc155
-rwxr-xr-xthird_party/aom/test/gviz_api.py1087
-rw-r--r--third_party/aom/test/hadamard_test.cc547
-rw-r--r--third_party/aom/test/hash_test.cc141
-rw-r--r--third_party/aom/test/hbd_metrics_test.cc239
-rw-r--r--third_party/aom/test/hiprec_convolve_test.cc76
-rw-r--r--third_party/aom/test/hiprec_convolve_test_util.cc380
-rw-r--r--third_party/aom/test/hiprec_convolve_test_util.h90
-rw-r--r--third_party/aom/test/horver_correlation_test.cc154
-rw-r--r--third_party/aom/test/horz_superres_test.cc409
-rw-r--r--third_party/aom/test/i420_video_source.h34
-rw-r--r--third_party/aom/test/intra_edge_test.cc351
-rw-r--r--third_party/aom/test/intrabc_test.cc172
-rw-r--r--third_party/aom/test/intrapred_test.cc488
-rw-r--r--third_party/aom/test/invalid_file_test.cc169
-rw-r--r--third_party/aom/test/ivf_video_source.h114
-rw-r--r--third_party/aom/test/kf_test.cc401
-rw-r--r--third_party/aom/test/level_test.cc188
-rwxr-xr-xthird_party/aom/test/lightfield_test.sh115
-rw-r--r--third_party/aom/test/log2_test.cc51
-rw-r--r--third_party/aom/test/loopfilter_control_test.cc198
-rw-r--r--third_party/aom/test/lossless_test.cc230
-rw-r--r--third_party/aom/test/lpf_test.cc824
-rw-r--r--third_party/aom/test/masked_sad_test.cc617
-rw-r--r--third_party/aom/test/masked_variance_test.cc712
-rw-r--r--third_party/aom/test/md5_helper.h76
-rw-r--r--third_party/aom/test/metadata_test.cc332
-rw-r--r--third_party/aom/test/metrics_template.html422
-rw-r--r--third_party/aom/test/minmax_test.cc244
-rw-r--r--third_party/aom/test/monochrome_test.cc213
-rw-r--r--third_party/aom/test/motion_vector_test.cc103
-rw-r--r--third_party/aom/test/mv_cost_test.cc125
-rw-r--r--third_party/aom/test/noise_model_test.cc1372
-rw-r--r--third_party/aom/test/obmc_sad_test.cc333
-rw-r--r--third_party/aom/test/obmc_variance_test.cc571
-rw-r--r--third_party/aom/test/pickrst_test.cc750
-rw-r--r--third_party/aom/test/postproc_filters_test.cc140
-rw-r--r--third_party/aom/test/quant_test.cc188
-rw-r--r--third_party/aom/test/quantize_func_test.cc795
-rw-r--r--third_party/aom/test/ratectrl_rtc_test.cc505
-rw-r--r--third_party/aom/test/ratectrl_test.cc39
-rw-r--r--third_party/aom/test/rd_test.cc87
-rw-r--r--third_party/aom/test/reconinter_test.cc372
-rw-r--r--third_party/aom/test/register_state_check.h136
-rw-r--r--third_party/aom/test/resize_test.cc1136
-rw-r--r--third_party/aom/test/rt_end_to_end_test.cc208
-rwxr-xr-xthird_party/aom/test/run_encodes.sh39
-rw-r--r--third_party/aom/test/sad_test.cc3353
-rw-r--r--third_party/aom/test/sb_multipass_test.cc152
-rw-r--r--third_party/aom/test/sb_qp_sweep_test.cc147
-rw-r--r--third_party/aom/test/scalability_test.cc81
-rw-r--r--third_party/aom/test/scan_test.cc133
-rw-r--r--third_party/aom/test/screen_content_test.cc135
-rw-r--r--third_party/aom/test/segment_binarization_sync.cc61
-rw-r--r--third_party/aom/test/selfguided_filter_test.cc435
-rwxr-xr-xthird_party/aom/test/set_maps.sh52
-rw-r--r--third_party/aom/test/sharpness_test.cc143
-rw-r--r--third_party/aom/test/simd_avx2_test.cc15
-rw-r--r--third_party/aom/test/simd_cmp_avx2.cc15
-rw-r--r--third_party/aom/test/simd_cmp_impl.h2175
-rw-r--r--third_party/aom/test/simd_cmp_sse2.cc18
-rw-r--r--third_party/aom/test/simd_cmp_sse4.cc18
-rw-r--r--third_party/aom/test/simd_cmp_ssse3.cc18
-rw-r--r--third_party/aom/test/simd_impl.h1140
-rw-r--r--third_party/aom/test/simd_sse2_test.cc18
-rw-r--r--third_party/aom/test/simd_sse4_test.cc18
-rw-r--r--third_party/aom/test/simd_ssse3_test.cc18
-rwxr-xr-xthird_party/aom/test/simple_decoder.sh58
-rwxr-xr-xthird_party/aom/test/simple_encoder.sh53
-rw-r--r--third_party/aom/test/sse_sum_test.cc182
-rw-r--r--third_party/aom/test/still_picture_test.cc95
-rw-r--r--third_party/aom/test/subtract_test.cc292
-rw-r--r--third_party/aom/test/sum_squares_test.cc928
-rw-r--r--third_party/aom/test/svc_datarate_test.cc2675
-rw-r--r--third_party/aom/test/svc_encoder_rtc.sh85
-rw-r--r--third_party/aom/test/temporal_filter_test.cc788
-rw-r--r--third_party/aom/test/test-data.sha1575
-rw-r--r--third_party/aom/test/test.cmake647
-rw-r--r--third_party/aom/test/test_aom_rc.cc17
-rw-r--r--third_party/aom/test/test_data_download_worker.cmake46
-rw-r--r--third_party/aom/test/test_data_util.cmake665
-rw-r--r--third_party/aom/test/test_intra_pred_speed.cc1742
-rw-r--r--third_party/aom/test/test_libaom.cc91
-rw-r--r--third_party/aom/test/test_runner.cmake28
-rw-r--r--third_party/aom/test/test_vector_test.cc173
-rw-r--r--third_party/aom/test/test_vectors.cc268
-rw-r--r--third_party/aom/test/test_vectors.h26
-rw-r--r--third_party/aom/test/tile_config_test.cc363
-rw-r--r--third_party/aom/test/tile_independence_test.cc170
-rw-r--r--third_party/aom/test/time_stamp_test.cc107
-rwxr-xr-xthird_party/aom/test/tools_common.sh520
-rw-r--r--third_party/aom/test/tpl_model_test.cc529
-rw-r--r--third_party/aom/test/transform_test_base.h368
-rwxr-xr-xthird_party/aom/test/twopass_encoder.sh54
-rw-r--r--third_party/aom/test/util.h60
-rw-r--r--third_party/aom/test/variance_test.cc4370
-rw-r--r--third_party/aom/test/video_source.h282
-rwxr-xr-xthird_party/aom/test/visual_metrics.py466
-rw-r--r--third_party/aom/test/warp_filter_test.cc93
-rw-r--r--third_party/aom/test/warp_filter_test_util.cc505
-rw-r--r--third_party/aom/test/warp_filter_test_util.h102
-rw-r--r--third_party/aom/test/webm_video_source.h107
-rw-r--r--third_party/aom/test/webmenc_test.cc69
-rw-r--r--third_party/aom/test/wiener_test.cc1390
-rw-r--r--third_party/aom/test/y4m_test.cc287
-rw-r--r--third_party/aom/test/y4m_video_source.h125
-rw-r--r--third_party/aom/test/yuv_video_source.h126
-rw-r--r--third_party/aom/third_party/SVT-AV1/EbMemory_AVX2.h110
-rw-r--r--third_party/aom/third_party/SVT-AV1/EbMemory_SSE4_1.h38
-rw-r--r--third_party/aom/third_party/SVT-AV1/LICENSE.md32
-rw-r--r--third_party/aom/third_party/SVT-AV1/PATENTS.md107
-rw-r--r--third_party/aom/third_party/SVT-AV1/README.libaom14
-rw-r--r--third_party/aom/third_party/SVT-AV1/convolve_2d_avx2.h1199
-rw-r--r--third_party/aom/third_party/SVT-AV1/convolve_avx2.h3335
-rw-r--r--third_party/aom/third_party/SVT-AV1/synonyms.h31
-rw-r--r--third_party/aom/third_party/fastfeat/LICENSE30
-rw-r--r--third_party/aom/third_party/fastfeat/README.libaom44
-rw-r--r--third_party/aom/third_party/fastfeat/fast.c67
-rw-r--r--third_party/aom/third_party/fastfeat/fast.h56
-rw-r--r--third_party/aom/third_party/fastfeat/fast_9.c5947
-rw-r--r--third_party/aom/third_party/fastfeat/nonmax.c174
-rw-r--r--third_party/aom/third_party/googletest/README.libaom38
-rw-r--r--third_party/aom/third_party/googletest/src/.clang-format4
-rw-r--r--third_party/aom/third_party/googletest/src/CMakeLists.txt34
-rw-r--r--third_party/aom/third_party/googletest/src/CONTRIBUTORS65
-rw-r--r--third_party/aom/third_party/googletest/src/LICENSE28
-rw-r--r--third_party/aom/third_party/googletest/src/README.md141
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/CMakeLists.txt218
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/README.md40
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/cmake/gmock.pc.in10
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in10
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h2298
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h159
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h514
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h5610
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h662
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h91
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h277
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h2083
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock.h96
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md18
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h7
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h37
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h40
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h476
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h139
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h279
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock-all.cc46
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc155
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc250
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock-matchers.cc462
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc781
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock.cc223
-rw-r--r--third_party/aom/third_party/googletest/src/googlemock/src/gmock_main.cc72
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt322
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/README.md217
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/Config.cmake.in9
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/gtest.pc.in9
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in10
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake342
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/libgtest.la.in21
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h237
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h345
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h956
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h218
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h510
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h1048
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h248
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h190
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h331
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h2297
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h279
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h60
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md44
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h68
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h42
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h37
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h306
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h210
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h1570
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h956
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h116
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h2413
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h177
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h186
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc49
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-assertion-result.cc77
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc1620
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc367
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h1212
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-matchers.cc98
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc1394
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc553
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc105
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc104
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest.cc6795
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc53
-rw-r--r--third_party/aom/third_party/libwebm/AUTHORS.TXT5
-rw-r--r--third_party/aom/third_party/libwebm/Android.mk23
-rw-r--r--third_party/aom/third_party/libwebm/LICENSE.TXT30
-rw-r--r--third_party/aom/third_party/libwebm/PATENTS.TXT23
-rw-r--r--third_party/aom/third_party/libwebm/README.libaom20
-rw-r--r--third_party/aom/third_party/libwebm/common/file_util.cc93
-rw-r--r--third_party/aom/third_party/libwebm/common/file_util.h44
-rw-r--r--third_party/aom/third_party/libwebm/common/hdr_util.cc220
-rw-r--r--third_party/aom/third_party/libwebm/common/hdr_util.h71
-rw-r--r--third_party/aom/third_party/libwebm/common/webmids.h193
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc4230
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h1924
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h28
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc743
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h115
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc92
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h51
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc8100
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvparser.h1147
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc135
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvreader.h45
-rw-r--r--third_party/aom/third_party/libyuv/LICENSE29
-rw-r--r--third_party/aom/third_party/libyuv/README.libaom37
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/basic_types.h68
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/compare.h111
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert.h526
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h1611
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_from.h185
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h311
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h122
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h195
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h900
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate.h182
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h37
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h223
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/row.h4384
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale.h204
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h76
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale_row.h1367
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale_uv.h38
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/version.h16
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/video_common.h206
-rw-r--r--third_party/aom/third_party/libyuv/source/compare.cc440
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_common.cc104
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_gcc.cc360
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_neon.cc96
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_neon64.cc94
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_win.cc241
-rw-r--r--third_party/aom/third_party/libyuv/source/convert.cc2514
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_argb.cc4125
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_from.cc713
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_from_argb.cc2163
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_jpeg.cc602
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_to_argb.cc382
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_to_i420.cc272
-rw-r--r--third_party/aom/third_party/libyuv/source/cpu_id.cc280
-rw-r--r--third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc585
-rw-r--r--third_party/aom/third_party/libyuv/source/mjpeg_validate.cc71
-rw-r--r--third_party/aom/third_party/libyuv/source/planar_functions.cc4107
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate.cc609
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_any.cc79
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_argb.cc243
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_common.cc106
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_gcc.cc374
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_mips.cc484
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_neon.cc418
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_neon64.cc443
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_win.cc252
-rw-r--r--third_party/aom/third_party/libyuv/source/row_any.cc1562
-rw-r--r--third_party/aom/third_party/libyuv/source/row_common.cc3849
-rw-r--r--third_party/aom/third_party/libyuv/source/row_gcc.cc7175
-rw-r--r--third_party/aom/third_party/libyuv/source/row_mips.cc911
-rw-r--r--third_party/aom/third_party/libyuv/source/row_neon.cc3039
-rw-r--r--third_party/aom/third_party/libyuv/source/row_neon64.cc3387
-rw-r--r--third_party/aom/third_party/libyuv/source/row_win.cc6237
-rw-r--r--third_party/aom/third_party/libyuv/source/scale.cc1935
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_any.cc615
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_argb.cc1091
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_common.cc1564
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_gcc.cc1464
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_mips.cc654
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_neon.cc1016
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_neon64.cc1152
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_uv.cc891
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_win.cc1391
-rw-r--r--third_party/aom/third_party/libyuv/source/video_common.cc62
-rw-r--r--third_party/aom/third_party/vector/LICENSE19
-rw-r--r--third_party/aom/third_party/vector/README.libaom16
-rw-r--r--third_party/aom/third_party/vector/vector.c540
-rw-r--r--third_party/aom/third_party/vector/vector.h138
-rw-r--r--third_party/aom/third_party/x86inc/LICENSE18
-rw-r--r--third_party/aom/third_party/x86inc/README.libaom19
-rw-r--r--third_party/aom/third_party/x86inc/x86inc.asm1923
-rw-r--r--third_party/aom/tools/aggregate_entropy_stats.py39
-rw-r--r--third_party/aom/tools/aom_entropy_optimizer.c761
-rw-r--r--third_party/aom/tools/auto_refactor/auto_refactor.py919
-rw-r--r--third_party/aom/tools/auto_refactor/av1_preprocess.py113
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/decl_status_code.c31
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/func_in_out.c208
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/global_variable.c27
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/parse_lvalue.c46
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/simple_code.c64
-rw-r--r--third_party/aom/tools/auto_refactor/c_files/struct_code.c49
-rw-r--r--third_party/aom/tools/auto_refactor/test_auto_refactor.py675
-rwxr-xr-xthird_party/aom/tools/cpplint.py6244
-rw-r--r--third_party/aom/tools/diff.py132
-rw-r--r--third_party/aom/tools/dump_obu.cc168
-rw-r--r--third_party/aom/tools/frame_size_variation_analyzer.py74
-rwxr-xr-xthird_party/aom/tools/gen_authors.sh10
-rwxr-xr-xthird_party/aom/tools/gen_constrained_tokenset.py120
-rw-r--r--third_party/aom/tools/gop_bitrate/analyze_data.py18
-rwxr-xr-xthird_party/aom/tools/gop_bitrate/encode_all_script.sh13
-rw-r--r--third_party/aom/tools/gop_bitrate/python/bitrate_accuracy.py185
-rw-r--r--third_party/aom/tools/inspect-cli.js39
-rw-r--r--third_party/aom/tools/inspect-post.js1
-rwxr-xr-xthird_party/aom/tools/intersect-diffs.py78
-rwxr-xr-xthird_party/aom/tools/lint-hunks.py150
-rw-r--r--third_party/aom/tools/obu_parser.cc190
-rw-r--r--third_party/aom/tools/obu_parser.h27
-rw-r--r--third_party/aom/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py154
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc580
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_graph.cc943
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_graph.h160
-rwxr-xr-xthird_party/aom/tools/wrap-commit-msg.py72
-rw-r--r--third_party/aom/usage.dox109
-rw-r--r--third_party/aom/usage_cx.dox9
-rw-r--r--third_party/aom/usage_dx.dox22
1345 files changed, 730066 insertions, 0 deletions
diff --git a/third_party/aom/.clang-format b/third_party/aom/.clang-format
new file mode 100644
index 0000000000..a8bc4967c3
--- /dev/null
+++ b/third_party/aom/.clang-format
@@ -0,0 +1,9 @@
+---
+Language: Cpp
+BasedOnStyle: Google
+AllowShortCaseLabelsOnASingleLine: true
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+PointerAlignment: Right
+SortIncludes: false
diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py
new file mode 100644
index 0000000000..c79a6ad604
--- /dev/null
+++ b/third_party/aom/.cmake-format.py
@@ -0,0 +1,102 @@
+# Generated with cmake-format 0.5.1
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, then dangle the closing
+# parenthesis on its own line
+dangle_parens = False
+
+# What character to use for bulleted lists
+bullet_char = '*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = '.'
+
+# What style line endings to use in the output.
+line_ending = u'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = u'lower'
+
+# Format keywords consistently as 'lower' or 'upper' case
+keyword_case = u'unchanged'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+ "foo": {
+ "flags": [
+ "BAR",
+ "BAZ"
+ ],
+ "kwargs": {
+ "HEADERS": "*",
+ "DEPENDS": "*",
+ "SOURCES": "*"
+ }
+ }
+}
+
+# A list of command names which should always be wrapped
+always_wrap = []
+
+# Specify the order of wrapping algorithms during successive reflow attempts
+algorithm_order = [0, 1, 2, 3, 4]
+
+# If true, the argument lists which are known to be sortable will be sorted
+# lexicographically
+autosort = False
+
+# enable comment markup parsing and reflow
+enable_markup = True
+
+# If comment markup is enabled, don't reflow the first comment block in
+# each listfile. Use this to preserve formatting of your
+# copyright/license statements.
+first_comment_is_literal = True
+
+# If comment markup is enabled, don't reflow any comment block which matches this
+# (regex) pattern. Default is `None` (disabled).
+literal_comment_pattern = None
+
+# Regular expression to match preformat fences in comments
+# default=r'^\s*([`~]{3}[`~]*)(.*)$'
+fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$'
+
+# Regular expression to match rulers in comments
+# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
+ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
+
+# If true, emit the unicode byte-order mark (BOM) at the start of the file
+emit_byteorder_mark = False
+
+# If a comment line starts with at least this many consecutive hash characters,
+# then don't lstrip() them off. This allows for lazy hash rulers where the first
+# hash char is not separated by space
+hashruler_min_length = 10
+
+# If true, then insert a space between the first hash char and remaining hash
+# chars in a hash ruler, and normalize its length to fill the column
+canonicalize_hashrulers = True
+
+# Specify the encoding of the input file. Defaults to utf-8.
+input_encoding = u'utf-8'
+
+# Specify the encoding of the output file. Defaults to utf-8. Note that cmake
+# only claims to support utf-8 so be careful when using anything else
+output_encoding = u'utf-8'
+
+# A dictionary containing any per-command configuration overrides. Currently
+# only `command_case` is supported.
+per_command = {}
diff --git a/third_party/aom/.mailmap b/third_party/aom/.mailmap
new file mode 100644
index 0000000000..6d6e6302bc
--- /dev/null
+++ b/third_party/aom/.mailmap
@@ -0,0 +1,113 @@
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
+Adrian Grange <agrange@google.com>
+Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
+Alexander Bokov <alexanderbokov@google.com>
+Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Andrey Norkin <anorkin@netflix.com>
+Angie Chiang <angiebird@google.com>
+Arild Fuldseth <arilfuld@cisco.com> <arild.fuldseth@gmail.com>
+Arild Fuldseth <arilfuld@cisco.com> <arilfuld@cisco.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com> Aasaipriya C <100778@ittiam.com>
+Apurve Pandey <apurve.pandey@ittiam.com>
+Apurve Kumar Pandey <apurve.pandey@ittiam.com> Apurve Pandey
+Bohan Li <bohanli@google.com>
+Changjun Yang <changjun.yang@intel.com>
+Chi Yo Tsai <chiyotsai@google.com>
+Chi Yo Tsai <chiyotsai@google.com> <chiyotsai@dhcp-100-106-128-213.corp.google.com>
+Chm <chm@rock-chips.com>
+Damon Shen <yjshen@google.com>
+Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
+Deb Mukherjee <debargha@google.com>
+Elliott Karpilovsky <elliottk@google.com>
+Emil Keyder <emilkeyder@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Frederic Barbier <frederic.barbier@allegrodvt.com> <fbarbier.contact@gmail.com>
+Fyodor Kyslov <kyslov@google.com>
+Grant Hsu <grant.hsu@cidana.com> <grant.hsu@gmail.com>
+Guillaume Martres <smarter@ubuntu.com>
+Guillaume Martres <smarter@ubuntu.com> <gmartres@google.com>
+Guillaume Martres <smarter@ubuntu.com> <gmartres@mozilla.com>
+Guillaume Martres <smarter@ubuntu.com> <smarter3@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
+Hui Su <huisu@google.com>
+Iole Moccagatta <iole.moccagatta@gmail.com>
+Jacky Chen <jackychen@google.com>
+James Zern <jzern@google.com> <jzern@google.cOm>
+Jean-Marc Valin <jmvalin@jmvalin.ca> <jmvalin@mozilla.com>
+Jim Bankoski <jimbankoski@google.com>
+Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Kyle Siefring <siekyleb@amazon.com>
+Kyle Siefring <siekyleb@amazon.com> <kylesiefring@gmail.com>
+Lin Zheng <linzhen@google.com>
+Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
+Luc Trudeau <luc@trud.ca>
+Luc Trudeau <luc@trud.ca> <ltrudeau@mozilla.com>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
+Michael Bebenita <mbebenita@gmail.com> <mbebenita@mozilla.com>
+Michael Horowitz <mhoro@webrtc.org> <mhoro@google.com>
+Mingliang Chen <mlchen@google.com>
+Monty Montgomery <cmontgomery@mozilla.com>
+Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
+Mudassir Galaganath <mudassir.galaganath@ittiam.com> Mudassir Galagnath
+Nathan E. Egge <negge@mozilla.com>
+Nathan E. Egge <negge@mozilla.com> <negge@dgql.org>
+Onur Guleryuz <oguleryuz@google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Pascal Massimino <pascal.massimino@gmail.com> <skal@google.com>
+Paul Wilkins <paulwilkins@google.com>
+Peng Bin <binpengsmail@gmail.com>
+Peng Bin <binpengsmail@gmail.com> <pengbin@kingsoft.com>
+Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
+Rachel Barker <rachelbarker@google.com> David Barker <david.barker@argondesign.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Remya Prakasan <remya.prakasan@ittiam.com>
+Roger Zhou <youzhou@microsoft.com>
+Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
+Ryan Lei <ryanlei@fb.com> <ryan.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <ryan.z.lei@intel.com>
+Ryan Lei <ryanlei@fb.com> <zlei3@ZLEI3-DESK.amr.corp.intel.com>
+Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
+Sai Deng <sdeng@google.com>
+Sami Pietilä <samipietila@google.com>
+Sarah Parker <sarahparker@google.com>
+Susanna D'Souza <susannad@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
+Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Thomas Davies Thomas <thdavies@cisco.com>
+Timothy B. Terriberry <tterribe@xiph.org>
+Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
+Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
+Tom Finegan <tomfinegan@google.com>
+Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Tristan Matthews <tmatth@videolan.org> <le.businessman@gmail.com>
+Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com> <vdziumenko@luxoft.corp-partner.google.com>
+Wei-Ting Lin <weitinglin@google.com>
+Wei-Ting Lin <weitinglin@google.com> <weitingco@gmail.com>
+Wenyao Liu <wenyao.liu@cidana.com>
+Will Bresnahan <bill.wresnahan@gmail.com>
+Yaowu Xu <yaowu@google.com> <Yaowu Xu>
+Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
+Zhipin Deng <zhipin.deng@intel.com>
+Zoe Liu <zoeliu@gmail.com> <zoeliu@google.com>
diff --git a/third_party/aom/AUTHORS b/third_party/aom/AUTHORS
new file mode 100644
index 0000000000..ade7a1a5d0
--- /dev/null
+++ b/third_party/aom/AUTHORS
@@ -0,0 +1,319 @@
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+Aamir Anis <aanis@google.com>
+Aaron Watry <awatry@gmail.com>
+Aasaipriya Chandran <aasaipriya.c@ittiam.com>
+Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adrian Grange <agrange@google.com>
+Ahmad Sharif <asharif@google.com>
+Akshata Jadhav <akshata.jadhav@ittiam.com>
+Alexander Bokov <alexanderbokov@google.com>
+Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Aℓex Converse <aconverse@google.com>
+Alexis Ballier <aballier@gentoo.org>
+Alex Peterson <petersonab@google.com>
+Alok Ahuja <waveletcoeff@gmail.com>
+Alpha Lam <hclam@google.com>
+A.Mahfoodh <ab.mahfoodh@gmail.com>
+Ami Fischman <fischman@chromium.org>
+Andoni Morales Alastruey <ylatuya@gmail.com>
+Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
+Andrey Norkin <anorkin@netflix.com>
+Angie Chiang <angiebird@google.com>
+Aniket Dhok <aniket.dhok@ittiam.com>
+Aniket Wanare <Aniket.wanare@ittiam.com>
+Ankur Saxena <ankurs@nvidia.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
+Apurve Kumar Pandey <apurve.pandey@ittiam.com>
+Arild Fuldseth <arilfuld@cisco.com>
+Aron Rosenberg <arosenberg@logitech.com>
+Arpad Panyik <Arpad.Panyik@arm.com>
+Arun Singh Negi <arun.negi@ittiam.com>
+Attila Nagy <attilanagy@google.com>
+Balaji Anandapadmanaban <balaji.anandapadmanaban@arm.com>
+Bohan Li <bohanli@google.com>
+Brennan Shacklett <bshacklett@mozilla.com>
+Brion Vibber <bvibber@wikimedia.org>
+Bruno Berthier <bruno.berthier@allegrodvt.com>
+Casey Smalley <casey.smalley@arm.com>
+Changjun Yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
+Cheng Chen <chengchen@google.com>
+Cherma Rajan A <cherma.rajan@ittiam.com>
+Chethan Kumar R E <chethan.kumar@ittiam.com>
+Chi Yo Tsai <chiyotsai@google.com>
+Chm <chm@rock-chips.com>
+Christian Duvivier <cduvivier@google.com>
+Christopher Degawa <christopher.degawa@intel.com>
+Cyril Concolato <cconcolato@netflix.com>
+Dake He <dkhe@google.com>
+Damon Shen <yjshen@google.com>
+Dandan Ding <vickyddding@gmail.com>
+Daniele Castagna <dcastagna@chromium.org>
+Daniel Kang <ddkang@google.com>
+Daniel Max Valenzuela <daniel.vt@samsung.com>
+Danil Chapovalov <danilchap@google.com>
+David Major <dmajor@mozilla.com>
+David Michael Barr <b@rr-dav.id.au>
+David Turner <david.turner@argondesign.com>
+Deb Mukherjee <debargha@google.com>
+Deepa K G <deepa.kg@ittiam.com>
+Di Chen <chendixi@google.com>
+Diksha Singh <diksha.singh@ittiam.com>
+Dim Temp <dimtemp0@gmail.com>
+Dmitry Kovalev <dkovalev@google.com>
+Dominic Symes <dominic.symes@arm.com>
+Dragan Mrdjan <dmrdjan@mips.com>
+Ed Baker <edward.baker@intel.com>
+Edward Hervey <edward@centricular.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Elliott Karpilovsky <elliottk@google.com>
+Emil Keyder <emilkeyder@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
+Fabio Pedretti <fabio.ped@libero.it>
+Fangwen Fu <fangwen.fu@intel.com>
+Fergus Simpson <afergs@google.com>
+Frank Bossen <fbossen@gmail.com>
+Frank Galligan <fgalligan@google.com>
+Frederic Barbier <frederic.barbier@allegrodvt.com>
+Fredrik Söderquist <fs@opera.com>
+Fritz Koenig <frkoenig@google.com>
+Fyodor Kyslov <kyslov@google.com>
+Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
+Geza Lore <gezalore@gmail.com>
+Ghislain MARY <ghislainmary2@gmail.com>
+Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Grant Hsu <grant.hsu@cidana.com>
+Guillaume Martres <smarter@ubuntu.com>
+Guillermo Ballester Valor <gbvalor@gmail.com>
+Hamsalekha S <hamsalekha.s@ittiam.com>
+Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
+Harish Mahendrakar <harish.mahendrakar@ittiam.com>
+Henrik Lundin <hlundin@google.com>
+Hien Ho <hienho@google.com>
+Hirokazu Honda <hiroh@google.com>
+Hui Su <huisu@google.com>
+Ilie Halip <ilie.halip@gmail.com>
+Ilya Brailovskiy <brailovs@lab126.com>
+Imdad Sardharwalla <imdad.sardharwalla@argondesign.com>
+Iole Moccagatta <iole.moccagatta@gmail.com>
+Ivan Krasin <krasin@chromium.org>
+Ivan Maltz <ivanmaltz@google.com>
+Ivan Rosales <rosalesi@google.com>
+Jacek Caban <cjacek@gmail.com>
+Jack Haughton <jack.haughton@argondesign.com>
+Jacky Chen <jackychen@google.com>
+James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
+James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
+Jan Kratochvil <jan.kratochvil@redhat.com>
+Janne Salonen <jsalonen@google.com>
+Jayasanker J <jayasanker.j@ittiam.com>
+Jayashri Murugan <jayashri.murugan@ittiam.com>
+Jean-Marc Valin <jmvalin@jmvalin.ca>
+Jean-Yves Avenard <jyavenard@mozilla.com>
+Jeff Faust <jfaust@google.com>
+Jeff Muizelaar <jmuizelaar@mozilla.com>
+Jeff Petkau <jpet@chromium.org>
+Jerome Jiang <jianj@google.com>
+Jia Jia <jia.jia@linaro.org>
+Jian Zhou <zhoujian@google.com>
+Jim Bankoski <jimbankoski@google.com>
+Jingning Han <jingning@google.com>
+Joe Young <joeyoung@google.com>
+Joey Parrish <joeyparrish@google.com>
+Johann Koenig <johannkoenig@google.com>
+John Koleszar <jkoleszar@google.com>
+Johnny Klonaris <google@jawknee.com>
+John Stark <jhnstrk@gmail.com>
+Jonathan Matthews <jonathan.matthews@argondesign.com>
+Jonathan Wright <jonathan.wright@arm.com>
+Joshua Bleecher Snyder <josh@treelinelabs.com>
+Joshua Litt <joshualitt@google.com>
+Josh Verdejo <joverdejo@google.com>
+Julia Robson <juliamrobson@gmail.com>
+Justin Clift <justin@salasaga.org>
+Justin Lebar <justin.lebar@gmail.com>
+Katsuhisa Yuasa <berupon@gmail.com>
+Kavi Ramamurthy <kavii@google.com>
+KO Myung-Hun <komh@chollian.net>
+Konstantinos Margaritis <konma@vectorcamp.gr>
+Krishna Malladi <kmalladi@google.com>
+Kwanghoon Son <kwangson@yahoo.com>
+Kyle Siefring <siekyleb@amazon.com>
+Larisa Markeeva <lmarkeeva@google.com>
+Lauren Partin <lpartin@google.com>
+Lawrence Velázquez <larryv@macports.org>
+leolzhao <leolzhao@tencent.com>
+Leon Kollar <Leon.Kollar@arm.com>
+L. E. Segovia <amy@amyspark.me>
+Lester Lu <kslu@google.com>
+liang zhao <leolzhao@tencent.com>
+Linfeng Zhang <linfengz@google.com>
+Link.Meng <monthev@gmail.com>
+Lin Zheng <linzhen@google.com>
+Logan Goldberg <logangw@google.com>
+Lokeshwar Reddy B <lokeshwar.reddy@ittiam.com>
+Lou Quillio <louquillio@google.com>
+Luca Barbato <lu_zero@gentoo.org>
+Luca Versari <veluca@google.com>
+Luc Trudeau <luc@trud.ca>
+Madhu Peringassery Krishnan <mpkrishnan@tencent.com>
+Makoto Kato <makoto.kt@gmail.com>
+Mans Rullgard <mans@mansr.com>
+Marco Paniconi <marpan@google.com>
+Mark Horvath <mark.horvath@arm.com>
+Mark Mentovai <mark@chromium.org>
+Mark Wachsler <wachsler@google.com>
+Martin Ettl <ettl.martin78@googlemail.com>
+Martin Storsjo <martin@martin.st>
+Maryla <maryla@google.com>
+Matthew Heaney <matthewjheaney@chromium.org>
+Matthieu Vaudano <matthieu.vaudano@allegrodvt.com>
+Mattias Hansson <mattias.hansson@arm.com>
+Maxym Dmytrychenko <maxim.d33@gmail.com>
+Michael Bebenita <mbebenita@gmail.com>
+Michael Horowitz <mhoro@webrtc.org>
+Michael Kohler <michaelkohler@live.com>
+Michelle Findlay-Olynyk <mfo@google.com>
+Mike Frysinger <vapier@chromium.org>
+Mike Hommey <mhommey@mozilla.com>
+Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
+Mingliang Chen <mlchen@google.com>
+Mirko Bonadei <mbonadei@google.com>
+Monty Montgomery <cmontgomery@mozilla.com>
+Morton Jonuschat <yabawock@gmail.com>
+Mudassir Galaganath <mudassir.galaganath@ittiam.com>
+Mufaddal Chakera <mufaddal.chakera@ittiam.com>
+Narayan Kalaburgi <narayan.kalaburgi@ittiam.com>
+Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
+Neha Mary Thomas <neha.marythomas@ittiam.com>
+Neil Birkbeck <birkbeck@google.com>
+Nico Weber <thakis@chromium.org>
+Nithya V S <nithya.vs@ittiam.com>
+Ola Hugosson <ola.hugosson@arm.com>
+Oleg Nalivayko <o13g86@gmail.com>
+Onur Guleryuz <oguleryuz@google.com>
+Parag Salasakar <img.mips1@gmail.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Patrik Westin <patrik.westin@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Pavel Frolov <pavel.frolov@vicuesoft.com>
+Pavol Rusnak <stick@gk2.sk>
+Paweł Hajdan <phajdan@google.com>
+Peng Bin <binpengsmail@gmail.com>
+Pengchong Jin <pengchong@google.com>
+Peter Boström <pbos@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
+Peter Kasting <pkasting@chromium.org>
+Philip Jägenstedt <philipj@opera.com>
+Priit Laes <plaes@plaes.org>
+Qiu Jianlin <jianlin.qiu@intel.com>
+Rachel Barker <rachelbarker@google.com>
+Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Rafaël Carré <funman@videolan.org>
+Ralph Giles <giles@xiph.org>
+Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
+Ravi Chaudhary <ravi.chaudhary@ittiam.com>
+Remya Prakasan <remya.prakasan@ittiam.com>
+Remy Foray <remy.foray@allegrodvt.com>
+Rob Bradford <rob@linux.intel.com>
+Robert-André Mauchin <zebob.m@gmail.com>
+Robert Chin <robertchin@google.com>
+Roger Zhou <youzhou@microsoft.com>
+Rohit Athavale <rathaval@xilinx.com>
+Ronald S. Bultje <rsbultje@gmail.com>
+Rostislav Pehlivanov <rpehlivanov@mozilla.com>
+Ruiling Song <ruiling.song@intel.com>
+Rui Ueyama <ruiu@google.com>
+Rupert Swarbrick <rupert.swarbrick@argondesign.com>
+Ryan Lei <ryanlei@fb.com>
+Ryan Overbeck <rover@google.com>
+Sachin Kumar Garg <sachin.kumargarg@ittiam.com>
+Sai Deng <sdeng@google.com>
+Salome Thirot <salome.thirot@arm.com>
+Sami Boukortt <sboukortt@google.com>
+Sami Pietilä <samipietila@google.com>
+Samuel Thibault <samuel.thibault@ens-lyon.org>
+Samuthirika S <samuthirika.s@ittiam.com>
+Sarah Parker <sarahparker@google.com>
+Sasi Inguva <isasi@google.com>
+Satheesh Kumar <satheesh.kumar@ittiam.com>
+Satish Kumar Suman <satish.suman@ittiam.com>
+Scott Graham <scottmg@chromium.org>
+Scott LaVarnway <slavarnway@google.com>
+Sean DuBois <sean@siobud.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sean Purser-Haskell <seanhaskell@google.com>
+Sebastien Alaiwan <sebastien.alaiwan@allegrodvt.com>
+Sergey Kolomenkin <kolomenkin@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
+S Hamsalekha <hamsalekha.s@ittiam.com>
+Shimon Doodkin <helpmepro1@gmail.com>
+Shunyao Li <shunyaoli@google.com>
+SmilingWolf <lupo996@gmail.com>
+Soo-Chul Han <shan@vidyo.com>
+Stanislav Vitvitskyy <vitvitskyy@google.com>
+Stefan Holmer <holmer@google.com>
+Steinar Midtskogen <stemidts@cisco.com>
+Steve Lhomme <robux4@gmail.com>
+Suman Sunkara <sunkaras@google.com>
+Susanna D'Souza <susannad@google.com>
+Taekhyun Kim <takim@nvidia.com>
+Takanori MATSUURA <t.matsuu@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
+Tarek AMARA <amatarek@justin.tv>
+Tarundeep Singh <tarundeep.singh@ittiam.com>
+Tero Rintaluoma <teror@google.com>
+Thijs Vermeir <thijsvermeir@gmail.com>
+Thomas Daede <tdaede@mozilla.com>
+Thomas Davies Thomas <thdavies@cisco.com>
+Tim Kopp <tkopp@google.com>
+Timothy B. Terriberry <tterribe@xiph.org>
+Timo Witte <timo.witte@gmail.com>
+Todd Nguyen <toddnguyen@google.com>
+Tom Anderson <thomasanderson@google.com>
+Tom Finegan <tomfinegan@google.com>
+Tristan Matthews <tmatth@videolan.org>
+Umang Saini <umang.saini@ittiam.com>
+Urvang Joshi <urvang@google.com>
+Venkat Sanampudi <sanampudi.venkatarao@ittiam.com>
+Victoria Zhislina <niva213@gmail.com>
+Vignesh Venkatasubramanian <vigneshv@google.com>
+Vikas Prasad <vikas.prasad@ittiam.com>
+Vincent Rabaud <vrabaud@google.com>
+Vishesh <vishesh.garg@ittiam.com>
+Vishnu Teja Manyam <vishnu.teja@ittiam.com>
+Vitalii Dziumenko <vdziumenko@luxoft.com>
+Wan-Teh Chang <wtc@google.com>
+Wei-Ting Lin <weitinglin@google.com>
+Wenyao Liu <wenyao.liu@cidana.com>
+Will Bresnahan <bill.wresnahan@gmail.com>
+Xiaoqing Zhu <xzhu@netflix.com>
+Xing Jin <ddvfinite@gmail.com>
+Xin Zhao <xinzzhao@tencent.com>
+Yannis Guyon <yguyon@google.com>
+Yaowu Xu <yaowu@google.com>
+Yeqing Wu <yeqing_wu@apple.com>
+Yi Luo <luoyi@google.com>
+Yingying Ma <yingying.ma@intel.com>
+Yongzhe Wang <yongzhe@google.com>
+Yuan Tong <tongyuan200097@gmail.com>
+Yu-Chen (Eric) Sun <ycsun@fb.com>
+Yue Chen <yuec@google.com>
+Yunqing Wang <yunqingwang@google.com>
+Yury Gitman <yuryg@google.com>
+Yushin Cho <ycho@mozilla.com>
+Zhijie Yang <zhijie.yang@broadcom.com>
+Zhipin Deng <zhipin.deng@intel.com>
+Zoe Liu <zoeliu@gmail.com>
diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG
new file mode 100644
index 0000000000..b243837d3c
--- /dev/null
+++ b/third_party/aom/CHANGELOG
@@ -0,0 +1,828 @@
+2023-11-30 v3.8.0
+ This release includes new codec interfaces, compression efficiency and
+ perceptual improvements, speedup and memory optimizations and many bug
+ fixes. This release is ABI compatible with the last release.
+
+ - New Features
+ * New codec controls:
+ * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Set the maximum number of
+ consecutive frame drops allowed for the frame dropper in 1 pass
+ CBR mode.
+ * Run-time CPU feature detection for all Arm platforms:
+ CRC, DotProd, I8MM and SVE CPU feature presence is detected at run
+ time and code paths making use of these features are selected
+ dynamically. These code paths provide meaningful performance gains
+ for standard bitdepth RTC and VoD encoding: up to 10% and 20%
+ respectively, over the Armv8.0-A baseline build.
+ * RTC: Frame-dropper support added to the rate control library.
+ * RTC Rate control improvements for low bitrate and for SVC.
+
+ - Compression Efficiency Improvements
+ * Improved accuracy of cost estimation for loop restoration and
+ global motion.
+ * Improved selection of loop restoration unit size - full search up
+ to (non-realtime) speed 2, retuned static selection at higher
+ speeds.
+ * RTC Screen content mode: 3-5% bdrate gains across speeds 7 - 10.
+ * Good-quality mode: 0.2 - 0.5% bdrate gains across speeds 1 - 4.
+
+ - Perceptual Quality Improvements
+ * RTC Screen: Improved visual quality for scrolling.
+ * RTC: Improved color quality for both screen and video mode.
+
+ - Speedup and Memory Optimizations
+ * Good-quality, single-thread encoder speedups:
+ o 15% improvement for speed 5.
+ o 12% improvement for speed 6.
+ * Arm standard bitdepth VoD (--good):
+ o 8% speedup for speeds 0 and 1.
+ o 20% speedup for speed 2.
+ o 27% speedup for speed 3.
+ o 30% speedup for speed 4.
+ o 38% speedup for speeds 5 and 6.
+ * Arm high bitdepth VoD (--good):
+ o 206% speedup for speeds 0 and 1.
+ o 180% speedup for speed 2.
+ o 51% speedup for speeds 3 and 4.
+ o 68% speedup for speed 5.
+ o 72% speedup for speed 6.
+ * RTC Screen content: 2-6% speedup across speeds 7-10.
+ * RTC: 2-3% speedup for temporal layers.
+ * RTC: Speedups to reference scaling in nonrd pickmode.
+ * Good-quality mode: Simplified global motion estimation, saving
+ ~1200 lines of code and 1KB of tables while improving quality.
+
+ - Bug Fixes
+ * Fixes to improve libaom stability in case of memory allocation
+ failures.
+ * Fixes to SIMD functions (x86 AVX2/SSE2 and ARM Neon).
+ * b/310457427, b/310766628: Bug fixes to only use rec_sse in CBR
+ mode.
+
+2023-11-17 v3.7.1
+ This release includes several bug fixes. This release is ABI
+ compatible with the last release. See
+ https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the
+ commits in this release.
+
+ - Bug Fixes
+ * aomedia:3349: heap overflow when increasing resolution
+ * aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on
+ aom/av1/encoder/motion_search_facade.c
+ * aomedia:3489: Detect encoder and image high bit depth mismatch
+ * aomedia:3491: heap-buffer-overflow on frame size change (CVE-2023-6879)
+ * b/303023614: Segfault at encoding time for high bit depth images
+
+2023-08-10 v3.7.0
+ This release includes new codec interfaces, compression efficiency and
+ perceptual improvements, speedup and memory optimizations and many bug fixes.
+ This release is ABI compatible with the last release.
+
+ - New Features
+ * New codec controls:
+ * AV1E_SET_QUANTIZER_ONE_PASS: Set quantizer for each frame.
+ * AV1E_ENABLE_RATE_GUIDE_DELTAQ: enable the rate distribution guided delta
+ quantization in all intra mode. The "enable-rate-guide-deltaq" option is
+ added for this control.
+ * AV1E_SET_RATE_DISTRIBUTION_INFO: set the input file for rate
+ distribution used in all intra mode. The "rate-distribution-info" option
+ is added for this control.
+ * AV1E_GET_LUMA_CDEF_STRENGTH
+ * AV1E_SET_BITRATE_ONE_PASS_CBR
+ * AOM_SCALING_MODE is extended to include 2/3 and 1/3 scaling.
+ * aom_tune_metric is extended to include AOM_TUNE_VMAF_SALIENCY_MAP.
+ The "tune" option is extended to include "vmaf_saliency_map".
+ * SVC example encoder svc_encoder_rtc is able to use the rate control
+ library.
+ * Loopfilter level and CDEF filter level are supported by RTC rate control
+ library.
+ * New speed (--cpu-used) 11, intended for RTC screen sharing, added for
+ faster encoding with ~3% bdrate loss with 16% IC (instruction count)
+ speedup compared to speed 10.
+
+ - Compression Efficiency Improvements
+ * Improved VoD encoding performance
+ * 0.1-0.6% BDrate gains for encoding speeds 2 to 6
+ * Rate control accuracy improvement in VBR mode
+ * RTC encoding improvements
+ * Screen content mode: 10-19% BDrate gains for speeds 6 - 10
+ * Temporal layers video mode, for speed 10:
+ * 2 temporal layers on low resolutions: 13-15% BDrate gain
+ * 3 temporal layers on VGA/HD: 3-4% BDrate gain
+
+ - Perceptual Quality Improvements
+ * Fixed multiple block and color artifacts for RTC screen content by
+ * Incorporating color into RD cost for IDTX
+ * Reducing thresholds for palette mode in non RD mode
+ * Allowing more palette mode testing
+ * Improved color sensitivity for altref in non-RD mode.
+ * Reduced video flickering for temporal layer encoding.
+
+ - Speedup and Memory Optimizations
+ * Speed up the VoD encoder
+ * 2-5% for encoding speed 2 to 4
+ * 9-15% for encoding speed 5 to 6
+ * ARM
+ * Standard bitdepth
+ * speed 5: +31%
+ * speed 4: +2%
+ * speed 3: +9%
+ * speed 2: +157%
+ * High bitdepth
+ * speed 5: +85%
+ * RTC speedups
+ * Screen content mode
+ * 15% IC speedup for speeds 6-8
+ * ARM: 7% for speed 9, 3% for speed 10
+ * Temporal layers video mode
+ * 7% speedup for 3 temporal layers on VGA/HD, for speed 10
+ * Single layer video
+ * x86: 2% IC speedup for speeds 7-10
+ * ARM: 2-4% speedup across speeds 5-10
+
+ - Other improvements
+ * VoD: Major improvements to global motion estimation, now enabled up to
+ speed 4
+ * RTC
+ * Fixes to make lossless coding work.
+ * Fixes to make frame dropper (--drop_frames) work for single and temporal
+ layers.
+ * Improvements to RPS (reference picture selection) recovery frames.
+ * Improvements to rate control for temporal layers.
+ * libwebm is updated to libwebm-1.0.0.29-9-g1930e3c
+
+ - Bug Fixes
+ * aomedia:3261 Assertion failed when encoding av1 with film grain and
+ '--monochrome' flag
+ * aomedia:3276 ensure all allocations are checked (partial fix)
+ * aomedia:3451 The libaom library calls exit()
+ * aomedia:3450 enable -Wshadow for C++ sources
+ * aomedia:3449 Test Seg Faults After
+ b459af3e345be402db052a143fcc5383d4b74cbd
+ * aomedia:3416 prune unused symbols / restrict symbol visibility
+ * aomedia:3443 Jenkins failure:
+ UninstantiatedParameterizedTestSuite<EstimateNoiseTest>
+ * aomedia:3434 realtime failures with CONFIG_BITSTREAM_DEBUG=1
+ * aomedia:3433 DeltaqModeTest crash w/row_mt=0
+ * aomedia:3429 Encoder crash when turn on both ExternalResize and
+ g_threads > 2
+ * aomedia:3438 Build failure with
+ `-DSANITIZE=address -DBUILD_SHARED_LIBS=ON` when using clang.
+ * aomedia:3435 Block artifacts when scrolling with AV1 in screen sharing
+ scenarios
+ * aomedia:3170 vmaf tune presets produce extreme glitches in one scene
+ * aomedia:3401 Building shared libaom with MSVC results in a race condition
+ with the export library
+ * aomedia:3420 Floating point exception in av1_tpl_get_frame_importance()
+ * aomedia:3424 heap-buffer-overflow in ScaleFilterCols_16_C() (SIGABRT)
+ * aomedia:3417 examples/svc_encoder_rtc.c is using internal macros and
+ functions
+ * aomedia:3372 SEGV in assign_frame_buffer_p av1_common_int.h
+ * aomedia:3130 'cpu-features.h' file not found on Android NDK 22
+ * aomedia:3415 Encoder/decoder mismatch for svc_encoder_rtc running
+ 1 SL 3 TL
+ * aomedia:3412 Lossless Mode Fails Loopback Bit Test
+ * aomedia:3409 The use of AV1_VAR_OFFS in av1/encoder/var_based_part.c is
+ incorrect for high bit depths
+ * aomedia:3403 test_libaom fails with error message
+ "feenableexcept() failed" on Linux arm
+ * aomedia:3370 Random color block at fast motion area
+ * aomedia:3393 Assertion failure in av1_convolve_2d_sr_c()
+ * aomedia:3392 Strong artifacting for high bit-depth real-time
+ * aomedia:3376 aomenc --threads=10 --deltaq-mode=3 crashes after
+ "Allintra: multi-threading of calculating differential contrast"
+ * aomedia:3380 Crashes and ASan and TSan errors in deltaq-mode=3
+ multithreading code
+ * chromium:1410766 heap-buffer-overflow in aom_yv12_copy_v_c
+ * Cannot set level via AV1E_SET_TARGET_SEQ_LEVEL_IDX
+ * Encoding failure due to the use of loop restoration with unintended use of
+ lossless mode.
+ * Signed integer overflow in scan_past_frames
+ * Signed integer overflow in update_a_sep_sym
+ * Flickering in AV1 1440p/2160p HDR transcodes
+ * Fixed artifacts with screen share at encoder speed 10
+ * Fixed prediction setup for IDTX
+
+2023-05-08 v3.6.1
+ This release includes several bug fixes. This release is ABI
+ compatible with the last release. See
+ https://aomedia.googlesource.com/aom/+log/v3.6.0..v3.6.1 for all the
+ commits in this release.
+
+ - Bug Fixes
+ * aomedia:2871: Guard the support of the 7.x and 8.x levels for AV1
+ under the CONFIG_CWG_C013 config flag, and only output the 7.x and
+ 8.x levels when explicitly requested.
+ * aomedia:3382: Choose sb_size by ppi instead of svc.
+ * aomedia:3384: Fix fullpel search limits.
+ * aomedia:3388: Replace left shift of xq_active by multiplication.
+ * aomedia:3389: Fix MV clamping in av1_mv_pred.
+ * aomedia:3390: set_ld_layer_depth: cap max_layer_depth to
+ MAX_ARF_LAYERS.
+ * aomedia:3418: Fix MV clamping in av1_int_pro_motion_estimation.
+ * aomedia:3429: Move lpf thread data init to lpf_pipeline_mt_init().
+ * b:266719111: Fix undefined behavior in Arm Neon code.
+ * b:269840681: nonrd_opt: align scan tables.
+ * rtc: Fix is_key_frame setting in variance partition.
+ * Build: Fix build with clang-cl and Visual Studio.
+ * Build: Fix module definition file for MinGW/MSYS.
+
+2023-02-03 v3.6.0
+ This release includes compression efficiency and perceptual quality
+ improvements, speedup and memory optimizations, and some new features.
+ This release is ABI compatible with the last release.
+
+ - New Features
+ * New values 20-27 (corresponding to levels 7.0-7.3 and 8.0-8.3) for
+ the encoder control AV1E_SET_TARGET_SEQ_LEVEL_IDX (note that the
+ proposal to add the new levels is still in draft status). The
+ original special value 24 (keep level stats only for level
+ monitoring) is renumbered as 32.
+ * New encoder control AV1E_SET_SKIP_POSTPROC_FILTERING to skip the
+ application of post-processing filters on reconstructed frame in
+ all intra mode.
+ * New encoder option "kf-max-pyr-height": Maximum height of pyramid
+ structure used for the GOP starting with a key frame (-1 to 5).
+ * Make SVC work for screen content.
+ * Rate control improvements to reduce frame-size spikes for screen
+ content coding.
+ * RISC-V architecture support with gcc toolchain.
+
+ - Compression Efficiency Improvements
+ * Peak compression efficiency in VOD setting is improved by 1%.
+ * 0.7% - 2.2% RTC encoding BDrate gains for real time speed 8 to 10.
+ * 15% RTC encoding BDrate gains for screen content speed 10.
+
+ - Perceptual Quality Improvements
+ * Resolved a visual quality issue that was reported for high
+ resolution clips (2K) for speed 4 and above in VOD use case.
+ * Visual quality improvements to screen content coding.
+ * Quality improvements to temporal layer RTC coding.
+
+ - Speedup and Memory Optimizations
+ * RTC single-thread encoder speedup:
+ o ~6% instruction count reduction for speed 5 and 6.
+ o ~15% instruction count reduction for speed 7.
+ o ~10% instruction count reduction for speed 8 to 10 (>=360p
+ resolutions).
+ * RTC multi-thread encoder speedup (beyond single-thread speedup):
+ o 5-8% encode time reduction for speed 7 to 10.
+ * RTC screen-content encoder speedup:
+ o 11% instruction count reduction for speed 9 and 10 (>=720p
+ resolutions).
+ * ~5% reduction in heap memory requirements for RTC, speed 6 to 10.
+ * AVIF:
+ o 4-5% speedup for speed 9 in still-picture encoding mode.
+ o 3-4% heap memory reduction in still-picture encoding mode for
+ 360p-720p resolutions with multiple threads.
+
+ - Bug Fixes
+ * Added a workaround for an AV1 specification bug which makes
+ TRANSLATION type global motion models unusable.
+ * Fixed AddressSanitizer global-buffer-overflow errors in
+ av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c.
+ * Fixed AddressSanitizer heap-buffer-overflow error in
+ av1_wiener_convolve_add_src_neon().
+ * chromium:1393384 Avoid scene detection on spatial resize.
+ * aomedia:3308 Remove color artifacts under high motion.
+ * aomedia:3310 Avoid out of memory failures with Visual Studio 2017,
+ 2019, and 2022 for Win32 x86 builds.
+ * aomedia:3346 Make SVC work properly for screen content.
+ * aomedia:3348 Fix a bug where an uninitialized search_site is used.
+ * aomedia:3365 Work around what seems like a Visual Studio 2022
+ compiler optimization bug.
+ * aomedia:3369 Incorrect PSNR values reported by libaom for 12-bit
+ encode.
+
+2022-08-31 v3.5.0
+ This release includes speedup and memory optimizations, and new APIs and
+ features. It is ABI compatible with the last release.
+
+ - New Features
+ * Support for frame parallel encode for larger number of threads. --fp-mt
+ flag is available for all build configurations.
+ * New codec control AV1E_GET_NUM_OPERATING_POINTS
+
+ - Speedup and Memory Optimizations
+ * Speed-up multithreaded encoding for good quality mode for larger number of
+ threads through frame parallel encoding:
+ o 30-34% encode time reduction for 1080p, 16 threads, 1x1 tile
+ configuration (tile_rows x tile_columns)
+ o 18-28% encode time reduction for 1080p, 16 threads, 2x4 tile
+ configuration
+ o 18-20% encode time reduction for 2160p, 32 threads, 2x4 tile
+ configuration
+ * 16-20% speed-up for speed=6 to 8 in still-picture encoding mode
+ * 5-6% heap memory reduction for speed=6 to 10 in real-time encoding mode
+ * Improvements to the speed for speed=7, 8 in real-time encoding mode
+ * Improvements to the speed for speed=9, 10 in real-time screen encoding
+ mode
+ * Optimizations to improve multi-thread efficiency in real-time encoding
+ mode
+ * 10-15% speed up for SVC with temporal layers
+ * SIMD optimizations:
+ o Improve av1_quantize_fp_32x32_neon() 1.05x to 1.24x faster
+ o Add aom_highbd_quantize_b{,_32x32,_64x64}_adaptive_neon() 3.15x to 5.6x
+ faster than "C"
+ o Improve av1_quantize_fp_64x64_neon() 1.17x to 1.66x faster
+ o Add aom_quantize_b_avx2() 1.4x to 1.7x faster than aom_quantize_b_avx()
+ o Add aom_quantize_b_32x32_avx2() 1.4x to 2.3x faster than
+ aom_quantize_b_32x32_avx()
+ o Add aom_quantize_b_64x64_avx2() 2.0x to 2.4x faster than
+ aom_quantize_b_64x64_ssse3()
+ o Add aom_highbd_quantize_b_32x32_avx2() 9.0x to 10.5x faster than
+ aom_highbd_quantize_b_32x32_c()
+ o Add aom_highbd_quantize_b_64x64_avx2() 7.3x to 9.7x faster than
+ aom_highbd_quantize_b_64x64_c()
+ o Improve aom_highbd_quantize_b_avx2() 1.07x to 1.20x faster
+ o Improve av1_quantize_fp_avx2() 1.13x to 1.49x faster
+ o Improve av1_quantize_fp_32x32_avx2() 1.07x to 1.54x faster
+ o Improve av1_quantize_fp_64x64_avx2() 1.03x to 1.25x faster
+ o Improve av1_quantize_lp_avx2() 1.07x to 1.16x faster
+
+ - Bug fixes including but not limited to
+ * aomedia:3206 Assert that skip_width > 0 for deconvolve function
+ * aomedia:3278 row_mt enc: Delay top-right sync when intraBC is enabled
+ * aomedia:3282 blend_a64_*_neon: fix bus error in armv7
+ * aomedia:3283 FRAME_PARALLEL: Propagate border size to all cpis
+ * aomedia:3283 RESIZE_MODE: Fix incorrect strides being used for motion
+ search
+ * aomedia:3286 rtc-svc: Fix to dynamic_enable spatial layers
+ * aomedia:3289 rtc-screen: Fix to skipping inter-mode test in nonrd
+ * aomedia:3289 rtc-screen: Fix for skip newmv on flat blocks
+ * aomedia:3299 Fix build failure with CONFIG_TUNE_VMAF=1
+ * aomedia:3296 Fix the conflict of --enable-tx-size-search=0 with nonrd mode;
+ --enable-tx-size-search will be ignored in non-rd pick mode
+ * aomedia:3304 Fix off-by-one error of max w/h in validate_config
+ * aomedia:3306 Do not use pthread_setname_np on GNU/Hurd
+ * aomedia:3325 row-multithreading produces invalid bitstream in some cases
+ * chromium:1346938, chromium:1338114
+ * compiler_flags.cmake: fix flag detection w/cmake 3.17-3.18.2
+ * tools/*.py: update to python3
+ * aom_configure.cmake: detect PIE and set CONFIG_PIC
+ * test/simd_cmp_impl: use explicit types w/CompareSimd*
+ * rtc: Fix to disable segm for aq-mode=3
+ * rtc: Fix to color_sensitivity in variance partition
+ * rtc-screen: Fix bsize in model rd computation for intra chroma
+ * Fixes to ensure the correct behavior of the encoder algorithms (like
+ segmentation, computation of statistics, etc.)
+
+2022-06-17 v3.4.0
+ This release includes compression efficiency and perceptual quality
+ improvements, speedup and memory optimizations, and some new features.
+ There are no ABI or API breaking changes in this release.
+
+ - New Features
+ * New --dist-metric flag with "qm-psnr" value to use quantization
+ matrices in the distortion computation for RD search. The default
+ value is "psnr".
+ * New command line option "--auto-intra-tools-off=1" to make
+ all-intra encoding faster for high bit rate under
+ "--deltaq-mode=3" mode.
+ * New rate control library aom_av1_rc for real-time hardware
+ encoders. Supports CBR for both one spatial layer and SVC.
+ * New image format AOM_IMG_FMT_NV12 can be used as input to the
+ encoder. The presence of AOM_IMG_FMT_NV12 can be detected at
+ compile time by checking if the macro AOM_HAVE_IMG_FMT_NV12 is
+ defined.
+ * New codec controls for the encoder:
+ o AV1E_SET_AUTO_INTRA_TOOLS_OFF. Only in effect if
+ --deltaq-mode=3.
+ o AV1E_SET_RTC_EXTERNAL_RC
+ o AV1E_SET_FP_MT. Only supported if libaom is built with
+ -DCONFIG_FRAME_PARALLEL_ENCODE=1.
+ o AV1E_GET_TARGET_SEQ_LEVEL_IDX
+ * New key-value pairs for the key-value API:
+ o --auto-intra-tools-off=0 (default) or 1. Only in effect if
+ --deltaq-mode=3.
+ o --strict-level-conformance=0 (default) or 1
+ o --fp-mt=0 (default) or 1. Only supported if libaom is built
+ with -DCONFIG_FRAME_PARALLEL_ENCODE=1.
+ * New aomenc options (not supported by the key-value API):
+ o --nv12
+
+ - Compression Efficiency Improvements
+ * Correctly calculate SSE for high bitdepth in skip mode, 0.2% to
+ 0.6% coding gain.
+ * RTC at speed 9/10: BD-rate gain of ~4/5%
+ * RTC screen content coding: many improvements for real-time screen
+ at speed 10 (quality, speedup, and rate control), up to high
+ resolutions (1080p).
+ * RTC-SVC: fixes to make intra-only frames work for spatial layers.
+ * RTC-SVC: quality improvements for temporal layers.
+ * AV1 RT: A new passive rate control strategy for screen content, an
+ average of 7.5% coding gain, with some clips of 20+%. The feature
+ is turned off by default due to higher bit rate variation.
+
+ - Perceptual Quality Improvements
+ * RTC: Visual quality improvements for high speeds (9/10)
+ * Improvements in coding quality for all intra mode
+
+ - Speedup and Memory Optimizations
+ * ~10% speedup in good quality mode encoding.
+ * ~7% heap memory reduction in good quality encoding mode for speed
+ 5 and 6.
+ * Ongoing improvements to intra-frame encoding performance on Arm
+ * Faster encoding speed for "--deltaq-mode=3" mode.
+ * ~10% speedup for speed 5/6, ~15% speedup for speed 7/8, and
+ ~10% speedup for speed 9/10 in real time encoding mode
+ * ~20% heap memory reduction in still-picture encoding mode for
+ 360p-720p resolutions with multiple threads
+ * ~13% speedup for speed 6 and ~12% speedup for speed 9 in
+ still-picture encoding mode.
+ * Optimizations to improve multi-thread efficiency for still-picture
+ encoding mode.
+
+ - Bug Fixes
+ * b/204460717: README.md: replace master with main
+ * b/210677928: libaom disable_order is surprising for
+ max_reference_frames=3
+ * b/222461449: -DCONFIG_TUNE_BUTTERAUGLI=1 broken
+ * b/227207606: write_greyscale writes incorrect chroma in highbd
+ mode
+ * b/229955363: Integer-overflow in linsolve_wiener
+ * https://crbug.com/aomedia/2032
+ * https://crbug.com/aomedia/2397
+ * https://crbug.com/aomedia/2563
+ * https://crbug.com/aomedia/2815
+ * https://crbug.com/aomedia/3009
+ * https://crbug.com/aomedia/3018
+ * https://crbug.com/aomedia/3045
+ * https://crbug.com/aomedia/3101
+ * https://crbug.com/aomedia/3130
+ * https://crbug.com/aomedia/3173
+ * https://crbug.com/aomedia/3184
+ * https://crbug.com/aomedia/3187
+ * https://crbug.com/aomedia/3190
+ * https://crbug.com/aomedia/3195
+ * https://crbug.com/aomedia/3197
+ * https://crbug.com/aomedia/3201
+ * https://crbug.com/aomedia/3202
+ * https://crbug.com/aomedia/3204
+ * https://crbug.com/aomedia/3205
+ * https://crbug.com/aomedia/3207
+ * https://crbug.com/aomedia/3208
+ * https://crbug.com/aomedia/3209
+ * https://crbug.com/aomedia/3213
+ * https://crbug.com/aomedia/3214
+ * https://crbug.com/aomedia/3219
+ * https://crbug.com/aomedia/3222
+ * https://crbug.com/aomedia/3223
+ * https://crbug.com/aomedia/3225
+ * https://crbug.com/aomedia/3226
+ * https://crbug.com/aomedia/3228
+ * https://crbug.com/aomedia/3232
+ * https://crbug.com/aomedia/3236
+ * https://crbug.com/aomedia/3237
+ * https://crbug.com/aomedia/3238
+ * https://crbug.com/aomedia/3240
+ * https://crbug.com/aomedia/3243
+ * https://crbug.com/aomedia/3244
+ * https://crbug.com/aomedia/3246
+ * https://crbug.com/aomedia/3248
+ * https://crbug.com/aomedia/3250
+ * https://crbug.com/aomedia/3251
+ * https://crbug.com/aomedia/3252
+ * https://crbug.com/aomedia/3255
+ * https://crbug.com/aomedia/3257
+ * https://crbug.com/aomedia/3259
+ * https://crbug.com/aomedia/3260
+ * https://crbug.com/aomedia/3267
+ * https://crbug.com/aomedia/3268
+ * https://crbug.com/aomedia/3269
+ * https://crbug.com/aomedia/3276
+ * https://crbug.com/aomedia/3278
+ * https://crbug.com/chromium/1290068
+ * https://crbug.com/chromium/1303237
+ * https://crbug.com/chromium/1304990
+ * https://crbug.com/chromium/1321141
+ * https://crbug.com/chromium/1321388
+ * https://crbug.com/oss-fuzz/44846
+ * https://crbug.com/oss-fuzz/44856
+ * https://crbug.com/oss-fuzz/44862
+ * https://crbug.com/oss-fuzz/44904
+ * https://crbug.com/oss-fuzz/45056
+
+2022-01-28 v3.3.0
+ This release includes compression efficiency and perceptual quality
+ improvements, speedup and memory optimizations, some new features, and
+ several bug fixes.
+
+ - New Features
+ * AV1 RT: Introducing CDEF search level 5
+ * Changed real time speed 4 to behave the same as real time speed 5
+ * Add --deltaq-strength
+ * rtc: Allow scene-change and overshoot detection for svc
+ * rtc: Intra-only frame for svc
+ * AV1 RT: Option 2 for codec control AV1E_SET_ENABLE_CDEF to disable
+ CDEF on non-ref frames
+ * New codec controls AV1E_SET_LOOPFILTER_CONTROL and
+ AOME_GET_LOOPFILTER_LEVEL
+ * Improvements to three pass encoding
+
+ - Compression Efficiency Improvements
+ * Overall compression gains: 0.6%
+
+ - Perceptual Quality Improvements
+ * Improves the perceptual quality of high QP encoding for delta-q mode 4
+ * Auto select noise synthesis level for all intra
+
+ - Speedup and Memory Optimizations
+ * Added many SSE2 optimizations.
+ * Good quality 2-pass encoder speedups:
+ o Speed 2: 9%
+ o Speed 3: 12.5%
+ o Speed 4: 8%
+ o Speed 5: 3%
+ o Speed 6: 4%
+ * Real time mode encoder speedups:
+ o Speed 5: 2.6% BDRate gain, 4% speedup
+ o Speed 6: 3.5% BDRate gain, 4% speedup
+ o Speed 9: 1% BDRate gain, 3% speedup
+ o Speed 10: 3% BDRate gain, neutral speedup
+ * All intra encoding speedups (AVIF):
+ o Single thread - speed 6: 8%
+ o Single thread - speed 9: 15%
+ o Multi thread(8) - speed 6: 14%
+ o Multi thread(8) - speed 9: 34%
+
+ - Bug Fixes
+ * Issue 3163: Segmentation fault when using --enable-keyframe-filtering=2
+ * Issue 2436: Integer overflow in av1_warp_affine_c()
+ * Issue 3226: armv7 build failure due to gcc-11
+ * Issue 3195: Bug report on libaom (AddressSanitizer: heap-buffer-overflow)
+ * Issue 3191: Bug report on libaom (AddressSanitizer: SEGV on unknown
+ address)
+ * Issue 3176: Some SSE2/SADx4AvgTest.* tests fail on Windows
+ * Issue 3175: Some SSE2/SADSkipTest.* tests fail on Windows
+
+2021-10-13 v3.2.0
+ This release includes compression efficiency and perceptual quality
+ improvements, speedup and memory optimizations, as well as some new
+ features.
+
+ - New Features
+ * Introduced speeds 7, 8, and 9 for all intra mode.
+ * Introduced speed 10 for real time mode.
+ * Introduced an API that allows external partition decisions.
+ * SVC: added support for compound prediction.
+ * SVC: added support for fixed SVC modes.
+
+ - Compression Efficiency Improvements
+ * Intra-mode search improvement.
+ * Improved real time (RT) mode BDrate savings by ~5% (RT speed 5)
+ and ~12% (RT speed 6). The improvement was measured on the video
+ conference set.
+ * Improved real time mode for nonrd path (speed 7, 8, 9): BDrate
+ gains of ~3-5%.
+ * Rate control and RD adjustments based on ML research in VP9.
+ Gains of ~0.5-1.0% for HD.
+
+ - Perceptual Quality Improvements
+ * Added a new mode --deltaq-mode=3 to improve perceptual quality
+ based on a differential contrast model for still images.
+ * Added a new mode --deltaq-mode=4 to improve perceptual quality
+ based on user rated cq_level data set for still images.
+ * Weighting of some intra mode and partition size choices to better
+ manage and retain texture.
+
+ - Speedup and Memory Optimizations
+ * Further improved 2-pass good quality encoder speed:
+ o Speed 2 speedup: 18%
+ o Speed 3 speedup: 22%
+ o Speed 4 speedup: 37%
+ o Speed 5 speedup: 30%
+ o Speed 6 speedup: 20%
+ * Optimized the real time encoder (measured on the video conference
+ set):
+ o RT speed 5 speedup: 110%
+ o RT speed 6 speedup: 77%
+
+ - Bug Fixes
+ * Issue 3069: Fix one-pass mode keyframe placement off-by-one error.
+ * Issue 3156: Fix a bug in av1_quantize_lp AVX2 optimization.
+
+2021-09-29 v3.1.3
+ This release includes several bug fixes.
+
+ - Bug fixes:
+ The following four cmake changes should help the people building
+ libaom using MSVC.
+ 1. exports: use CMAKE_SHARED_LIBRARY_PREFIX to determine lib name
+ https://aomedia-review.googlesource.com/c/aom/+/142342
+ 2. aom_install: Install lib dlls to bindir
+ https://aomedia-review.googlesource.com/c/aom/+/146546
+ 3. aom_install: use relpath for install
+ https://aomedia-review.googlesource.com/c/aom/+/146550
+ 4. aom_install: don't exclude msvc from install
+ https://aomedia-review.googlesource.com/c/aom/+/146547
+
+ aom/aom_encoder.h: remove configure option reference
+ https://aomedia-review.googlesource.com/c/aom/+/146743
+
+ Issue 3113: Tests for detecting chroma subsampling in
+ av1_copy_and_extend_frame() do not work when y_width or y_height is
+ 1
+
+ Issue 3115: image2yuvconfig() should calculate uv_crop_width and
+ uv_crop_height from y_crop_width and y_crop_height
+
+ Issue 3140: rc_overshoot_pct is documented as having a range of
+ 0-1000, but is range checked against 0-100
+
+ Issue 3147: Build failure on Apple M1 arm64
+
+2021-07-20 v3.1.2
+ This release includes several bug fixes.
+
+ - Bug fixes:
+ exports.cmake: use APPLE and WIN32 and use def for mingw-w64
+ https://aomedia-review.googlesource.com/c/aom/+/139882
+
+ Issue 2993: Incorrect spatial_id when decoding base layer of
+ multi-layer stream
+
+ Issue 3080: Chroma Resampling by Encoder on Y4M Input Files Tagged
+ as C420mpeg2
+
+ Issue 3081: Use of uninitialized value $version_extra in
+ concatenation (.) or string at aom/build/cmake/version.pl line 88.
+
+2021-06-08 v3.1.1
+ This release includes several bug fixes.
+
+ - Bug fixes:
+ Issue 2965: Cherry-picked the following four commits for the
+ tune=butteraugli mode.
+ 1. Add libjxl to pkg_config if enabled:
+ https://aomedia-review.googlesource.com/c/aom/+/136044
+ 2. Declare set_mb_butteraugli_rdmult_scaling static:
+ https://aomedia-review.googlesource.com/c/aom/+/134506
+ 3. Add color range detection in tune=butteraugli mode:
+ https://aomedia-review.googlesource.com/c/aom/+/135521
+ 4. Enable tune=butteraugli in all-intra mode:
+ https://aomedia-review.googlesource.com/c/aom/+/136082
+
+ Issue 3021: Fix vmaf model initialization error when not set to
+ tune=vmaf
+
+ Issue 3050: Compilation fails with -DCONFIG_TUNE_VMAF=1
+
+ Issue 3054: Consistent crash on near-static screen content, keyframe
+ related
+
+2021-05-03 v3.1.0
+ This release adds an "all intra" mode to the encoder, which significantly
+ speeds up the encoding of AVIF still images at speed 6.
+
+ - Upgrading:
+ All intra mode for encoding AVIF still images and AV1 all intra videos:
+ AOM_USAGE_ALL_INTRA (2) can be passed as the 'usage' argument to
+ aom_codec_enc_config_default().
+
+ New encoder control IDs added:
+ - AV1E_SET_ENABLE_DIAGONAL_INTRA: Enable diagonal (D45 to D203) intra
+ prediction modes (0: false, 1: true (default)). Also available as
+ "enable-diagonal-intra" for the aom_codec_set_option() function.
+
+ New aom_tune_metric enum value: AOM_TUNE_BUTTERAUGLI. The new aomenc option
+ --tune=butteraugli was added to optimize the encoder's perceptual quality by
+ optimizing the Butteraugli metric. Install libjxl (JPEG XL) and then pass
+ -DCONFIG_TUNE_BUTTERAUGLI=1 to the cmake command to enable it.
+
+ Addition of support for libvmaf 2.x.
+
+ - Enhancements:
+ Heap memory consumption for encoding AVIF still images is significantly
+ reduced.
+
+ - Bug fixes:
+ Issue 2601: third_party/libaom fails licensecheck
+
+ Issue 2950: Conditional expression for rc->this_key_frame_forced is always
+ true in find_next_key_frame()
+
+ Issue 2988: "make install" installs the aom.h header twice
+
+ Issue 2992: Incorrectly printing the temporal_id twice in dump_obu tool
+
+ Issue 2998:
+
+ Issue 2999:
+
+ Issue 3000:
+
+2021-02-24 v3.0.0
+ This release includes compression efficiency improvement, speed improvement
+ for realtime mode, as well as some new APIs.
+
+ - Upgrading:
+ Support for PSNR calculation based on stream bit-depth.
+
+ New encoder control IDs added:
+ - AV1E_SET_ENABLE_RECT_TX
+ - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP
+ - AV1E_GET_BASELINE_GF_INTERVAL
+ - AV1E_SET_ENABLE_DNL_DENOISING
+
+ New decoder control IDs added:
+ - AOMD_GET_FWD_KF_PRESENT
+ - AOMD_GET_FRAME_FLAGS
+ - AOMD_GET_ALTREF_PRESENT
+ - AOMD_GET_TILE_INFO
+ - AOMD_GET_SCREEN_CONTENT_TOOLS_INFO
+ - AOMD_GET_STILL_PICTURE
+ - AOMD_GET_SB_SIZE
+ - AOMD_GET_SHOW_EXISTING_FRAME_FLAG
+ - AOMD_GET_S_FRAME_INFO
+
+ New aom_tune_content enum value: AOM_CONTENT_FILM
+
+ New aom_tune_metric enum value: AOM_TUNE_VMAF_NEG_MAX_GAIN
+
+ Coefficient and mode update can be turned off via
+ AV1E_SET_{COEFF/MODE}_COST_UPD_FREQ.
+
+ New key & value API added, available with aom_codec_set_option() function.
+
+ Scaling API expanded to include 1/4, 3/4 and 1/8.
+
+ - Enhancements:
+ Better multithreading performance with realtime mode.
+
+ New speed 9 setting for faster realtime encoding.
+
+ Smaller binary size with low bitdepth and realtime only build.
+
+ Temporal denoiser and its optimizations on x86 and Neon.
+
+ Optimizations for scaling.
+
+ Faster encoding with speed settings 2 to 6 for good encoding mode.
+
+ Improved documentation throughout the library, with function level
+ documentation, tree view and support for the dot tool.
+
+ - Bug fixes:
+ Aside from those mentioned in v2.0.1 and v2.0.2, this release includes the
+ following bug fixes:
+
+ Issue 2940: Segfault when encoding with --use-16bit-internal and --limit > 1
+
+ Issue 2941: Decoder mismatch with --rt --bit-depth=10 and --cpu-used=8
+
+ Issue 2895: mingw-w64 i686 gcc fails to build
+
+ Issue 2874: Separate ssse3 functions from sse2 file.
+
+2021-02-09 v2.0.2
+ This release includes several bug fixes.
+
+ - Bug fixes:
+ Issue 2643: Modify the assertion in temporal filter intrinsics.
+
+ Issue 2648: Fix unit test ThreadTestLarge.EncoderResultTest/49
+ assertion failure.
+
+ Issue 2869: Add -Wimplicit-function-declaration as C flag only.
+
+ Issue 2878: Avoid memset in the av1_filter_intra_predictor module
+ functions.
+
+ Issue 2903: Fix a typo bug in apply_temporal_filter_planewise.
+
+ Call av1_setup_frame_size() when dropping a frame in the
+ encode_frame_to_data_rate() function in av1/encoder/encoder.c.
+
+2020-11-25 v2.0.1
+ This release includes two bug fixes.
+
+ - Bug fixes:
+ Issue 2723: Fix crash in chroma_check() when generating a monochrome
+ encoded stream in real-time mode.
+
+ Issue 2833: Fix crash on some input when reduced still picture header is
+ used in real-time mode and speed >=7.
+
+2020-05-07 v2.0.0 "Applejack"
+ First official release of libaom.
+ This release includes new real-time mode and SVC support.
+
+ - Upgrading:
+ AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are
+ removed.
+
+ AOM_SET_DBG_* is removed.
+
+ Multi-resolution encoding is removed.
+
+ put_frame and put_slice callbacks are removed.
+
+ - Enhancements:
+ Full-sweep document update for codec controls.
+
+2018-06-28 v1.0.0
+ AOMedia Codec Workgroup Approved version 1.0
+
+2016-04-07 v0.1.0 "AOMedia Codec 1"
+ This release is the first Alliance for Open Media codec.
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
new file mode 100644
index 0000000000..76944e6917
--- /dev/null
+++ b/third_party/aom/CMakeLists.txt
@@ -0,0 +1,1035 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Builds configured with CONFIG_TFLITE need CMake 3.11 or newer; every other
+# configuration only needs CMake 3.9.
+if(NOT CONFIG_TFLITE)
+  cmake_minimum_required(VERSION 3.9)
+else()
+  cmake_minimum_required(VERSION 3.11)
+endif()
+
+# AOM_ROOT is the directory that holds this CMakeLists.txt and AOM_CONFIG_DIR
+# is the corresponding build directory; paths below are expressed relative to
+# one of the two.
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+# Reject in-source builds: stop the configure step with a hint when the source
+# and build directories are the same path.
+if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
+ message(
+ FATAL_ERROR "Building from within the aom source tree is not supported.\n"
+ "Hint: Run these commands\n"
+ "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
+ "And re-run CMake from the aom_build directory.")
+endif()
+
+# Declare the project and enable the C and C++ languages.
+project(AOM C CXX)
+
+# GENERATED source property global visibility.
+# The policy is only set when the running CMake knows about CMP0118.
+if(POLICY CMP0118)
+ cmake_policy(SET CMP0118 NEW)
+endif()
+
+# Default to a Release build when the user selected neither a build type nor a
+# set of configuration types. Emscripten builds are left without a default.
+if(NOT EMSCRIPTEN)
+ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+ set(CMAKE_BUILD_TYPE
+ "Release"
+ CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel"
+ FORCE)
+ endif()
+endif()
+
+# Warn (without aborting) when the MSVC compiler version is below 1920.
+if(MSVC AND MSVC_VERSION LESS 1920)
+ message(
+ WARNING
+ "MSVC versions prior to 2019 (v16) are not supported and may generate"
+ " incorrect code!")
+endif()
+
+# Library version info. Update LT_CURRENT, LT_REVISION and LT_AGE when making a
+# public release by following the guidelines in the libtool document:
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+#
+# c=<current>, r=<revision>, a=<age>
+#
+# libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# We set SO_FILE_VERSION = [c-a].a.r
+#
+# With the values below SO_VERSION evaluates to 3 (11 - 8) and SO_FILE_VERSION
+# to "3.8.0". The LT_* helper variables are unset afterwards, so only
+# SO_VERSION and SO_FILE_VERSION remain defined.
+set(LT_CURRENT 11)
+set(LT_REVISION 0)
+set(LT_AGE 8)
+math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}")
+set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}")
+unset(LT_CURRENT)
+unset(LT_REVISION)
+unset(LT_AGE)
+
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
+
+# Load the per-component CMake modules. ivf_dec.cmake is only loaded when
+# CONFIG_THREE_PASS is enabled.
+# NOTE(review): helper commands used further down (add_rtcd_build_step,
+# setup_*_targets, extract_version_string, ...) are not defined in this file
+# and are presumably provided by these modules -- confirm before reordering.
+include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
+if(CONFIG_THREE_PASS)
+ include("${AOM_ROOT}/common/ivf_dec.cmake")
+endif()
+include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
+include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
+include("${AOM_ROOT}/aom_ports/aom_ports.cmake")
+include("${AOM_ROOT}/aom_scale/aom_scale.cmake")
+include("${AOM_ROOT}/aom_util/aom_util.cmake")
+include("${AOM_ROOT}/av1/av1.cmake")
+include("${AOM_ROOT}/build/cmake/aom_install.cmake")
+include("${AOM_ROOT}/build/cmake/sanitizers.cmake")
+include("${AOM_ROOT}/build/cmake/util.cmake")
+include("${AOM_ROOT}/test/test.cmake")
+
+# Sources of the aom_rtcd object library: the *_rtcd.h headers located in the
+# build tree, plus the Perl definition files, the matching C sources and the
+# rtcd.pl script from the source tree.
+list(APPEND AOM_RTCD_SOURCES
+ "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+ "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c"
+ "${AOM_ROOT}/build/cmake/rtcd.pl")
+
+# Bundled libwebm sources (common helpers, mkvmuxer and mkvparser) found under
+# third_party/libwebm.
+list(APPEND AOM_LIBWEBM_SOURCES
+ "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc"
+ "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h"
+ "${AOM_ROOT}/third_party/libwebm/common/webmids.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h")
+
+# Bundled libyuv headers and sources found under third_party/libyuv. They are
+# compiled into the "yuv" object library when CONFIG_LIBYUV or
+# CONFIG_TUNE_BUTTERAUGLI is enabled.
+list(APPEND AOM_LIBYUV_SOURCES
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h"
+ "${AOM_ROOT}/third_party/libyuv/source/convert_argb.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_any.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_common.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_win.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_uv.cc")
+
+# Sources of the core aom library: the aom_config.[ch] pair located in the
+# build tree, the headers under aom/ and their implementations in aom/src.
+list(APPEND AOM_SOURCES
+ "${AOM_CONFIG_DIR}/config/aom_config.c"
+ "${AOM_CONFIG_DIR}/config/aom_config.h"
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_external_partition.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/aom/aomdx.h"
+ "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+ "${AOM_ROOT}/aom/internal/aom_image_internal.h"
+ "${AOM_ROOT}/aom/src/aom_codec.c"
+ "${AOM_ROOT}/aom/src/aom_decoder.c"
+ "${AOM_ROOT}/aom/src/aom_encoder.c"
+ "${AOM_ROOT}/aom/src/aom_image.c"
+ "${AOM_ROOT}/aom/src/aom_integer.c")
+
+# Sources of the aom_common_app_util object library, which is linked into both
+# the decoder and the encoder applications.
+list(APPEND AOM_COMMON_APP_UTIL_SOURCES
+ "${AOM_ROOT}/av1/arg_defs.c"
+ "${AOM_ROOT}/av1/arg_defs.h"
+ "${AOM_ROOT}/common/args_helper.c"
+ "${AOM_ROOT}/common/args_helper.h"
+ "${AOM_ROOT}/common/args.c"
+ "${AOM_ROOT}/common/args.h"
+ "${AOM_ROOT}/common/av1_config.c"
+ "${AOM_ROOT}/common/av1_config.h"
+ "${AOM_ROOT}/common/md5_utils.c"
+ "${AOM_ROOT}/common/md5_utils.h"
+ "${AOM_ROOT}/common/tools_common.c"
+ "${AOM_ROOT}/common/tools_common.h"
+ "${AOM_ROOT}/common/video_common.h"
+ "${AOM_ROOT}/common/rawenc.c"
+ "${AOM_ROOT}/common/rawenc.h"
+ "${AOM_ROOT}/common/y4menc.c"
+ "${AOM_ROOT}/common/y4menc.h"
+ "${AOM_ROOT}/common/ivfdec.c"
+ "${AOM_ROOT}/common/ivfdec.h")
+
+# Sources of the aom_decoder_app_util object library (created only when
+# CONFIG_AV1_DECODER is enabled).
+list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/obudec.c"
+ "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c"
+ "${AOM_ROOT}/common/video_reader.h")
+
+# Sources of the aom_encoder_app_util object library (created only when
+# CONFIG_AV1_ENCODER is enabled).
+list(APPEND AOM_ENCODER_APP_UTIL_SOURCES
+ "${AOM_ROOT}/common/ivfenc.c"
+ "${AOM_ROOT}/common/ivfenc.h"
+ "${AOM_ROOT}/common/video_writer.c"
+ "${AOM_ROOT}/common/video_writer.h"
+ "${AOM_ROOT}/common/warnings.c"
+ "${AOM_ROOT}/common/warnings.h"
+ "${AOM_ROOT}/common/y4minput.c"
+ "${AOM_ROOT}/common/y4minput.h"
+ "${AOM_ROOT}/examples/encoder_util.h"
+ "${AOM_ROOT}/examples/encoder_util.c")
+
+# Sources of the aom_encoder_stats object library (created only when
+# ENABLE_EXAMPLES is set).
+list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
+ "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
+ "${AOM_ROOT}/stats/rate_hist.h")
+
+# The aom_version target consists solely of the aom_version.h header located
+# in the build tree.
+list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h")
+
+# WebM container reader and writer glue, kept in separate lists.
+list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc"
+ "${AOM_ROOT}/common/webmdec.h")
+
+list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc"
+ "${AOM_ROOT}/common/webmenc.h")
+
+# Header search path: both the source root and the build directory are added,
+# followed by the apps, common, examples and stats directories.
+include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps
+ ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats)
+
+# Targets
+#
+# aom_version: library target whose only listed source is aom_version.h.
+# NOTE(review): add_no_op_source_file_to_target is defined elsewhere;
+# presumably it adds a placeholder source so the header-only target can be
+# built -- confirm against its definition.
+add_library(aom_version ${AOM_VERSION_SOURCES})
+add_no_op_source_file_to_target(aom_version c)
+# Build rule for aom_version.h: run version.cmake in script mode (-P) with the
+# directories and the git/perl executables passed in as -D definitions.
+add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h"
+ COMMAND ${CMAKE_COMMAND} ARGS
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake"
+ COMMENT "Writing aom_version.h"
+ VERBATIM)
+
+# aom_version_check runs the same version.cmake script as a custom target, so
+# the script is invoked whenever this target is built.
+add_custom_target(aom_version_check
+ COMMAND ${CMAKE_COMMAND}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake"
+ COMMENT "Updating version info if necessary."
+ VERBATIM)
+
+if(BUILD_SHARED_LIBS AND NOT MSVC)
+ # Generate version file immediately for non-MSVC shared builds: The version
+ # string is needed for the aom target.
+ # execute_process runs at configure time, unlike the two build-time rules
+ # above.
+ execute_process(COMMAND ${CMAKE_COMMAND}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake")
+endif()
+
+# Run the version check before aom_version is built.
+add_dependencies(aom_version aom_version_check)
+
+# TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd
+# source.
+#
+# One rtcd build step per component (aom_dsp, aom_scale, av1). Each call passes
+# the Perl definition file, the header path in the build tree, the C source
+# and a symbol name.
+# NOTE(review): add_rtcd_build_step is defined elsewhere; the argument roles
+# above are read off the file names -- confirm against its definition.
+add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd")
+add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd")
+add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd")
+
+# Object library holding the rtcd sources; it is built after aom_version.
+add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES})
+add_dependencies(aom_rtcd aom_version)
+
+# The encoder statistics object library is only created when the examples are
+# enabled; it is also recorded in AOM_LIB_TARGETS.
+if(ENABLE_EXAMPLES)
+  add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
+  list(APPEND AOM_LIB_TARGETS aom_encoder_stats)
+endif()
+
+# The Xcode generator cannot take a library composed solely of objects (see
+# https://gitlab.kitware.com/cmake/cmake/-/issues/17500), so with Xcode the
+# sources are handed to the aom target directly. Every other generator
+# compiles them into the aom_obj object library first.
+if(NOT XCODE)
+  add_library(aom_obj OBJECT ${AOM_SOURCES})
+  list(APPEND AOM_LIB_TARGETS aom_obj)
+  set(target_objs_aom $<TARGET_OBJECTS:aom_obj>)
+else()
+  set(target_objs_aom ${AOM_SOURCES})
+endif()
+add_library(aom ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
+
+# Shared-library builds additionally produce a static archive (aom_static)
+# from the same objects; its output file is also named "aom".
+if(BUILD_SHARED_LIBS)
+ add_library(aom_static STATIC ${target_objs_aom} $<TARGET_OBJECTS:aom_rtcd>)
+ set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom)
+ if(MSVC OR (WIN32 AND NOT MINGW))
+ # Fix race condition between the import library and the static library.
+ # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter
+ # sets MSVC and MINGW both to FALSE).
+ set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll")
+ endif()
+
+ if(NOT MSVC)
+ # Extract version string and set VERSION/SOVERSION for the aom target.
+ # NOTE(review): aom_version_triple is computed here but not referenced
+ # again in the visible part of this file; VERSION/SOVERSION below come
+ # from SO_FILE_VERSION/SO_VERSION. Confirm whether it is used further down.
+ extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h"
+ aom_version_triple)
+
+ # Strip any trailing version information, if present.
+ # Everything from the first "-" onwards is dropped.
+ string(FIND "${aom_version_triple}" "-" dash_pos)
+ if(NOT dash_pos EQUAL -1)
+ string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple)
+ endif()
+
+ # cmake-format: off
+ # VERSION is embedded in the .so file name.
+ # libaom.so -> libaom.so.SOVERSION
+ # libaom.so.SOVERSION -> libaom.so.VERSION
+ # libaom.so.VERSION
+ # cmake-format: on
+ set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION})
+ set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION})
+ endif()
+endif()
+
+# Link against "m" on every platform except Windows and Apple.
+if(NOT (WIN32 OR APPLE))
+  target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m)
+  if(BUILD_SHARED_LIBS)
+    # aom_static only exists in shared-library builds.
+    target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m)
+  endif()
+endif()
+
+# Rate control library built from av1/ratectrl_rtc.*; only available for
+# encoder-enabled builds that do not build shared libraries.
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
+  list(APPEND AOM_AV1_RC_SOURCES
+       "${AOM_ROOT}/av1/ratectrl_rtc.h"
+       "${AOM_ROOT}/av1/ratectrl_rtc.cc")
+  add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES})
+  target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom)
+  if(NOT (WIN32 OR APPLE))
+    target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m)
+  endif()
+  # Link with the C++ linker driver.
+  set_property(TARGET aom_av1_rc PROPERTY LINKER_LANGUAGE CXX)
+endif()
+
+# Extend the list of object and static library targets. aom_av1_rc and
+# aom_static are appended only in the configurations that create them.
+list(APPEND AOM_LIB_TARGETS aom_rtcd aom_mem aom_scale aom)
+if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS)
+  list(APPEND AOM_LIB_TARGETS aom_av1_rc)
+endif()
+if(BUILD_SHARED_LIBS)
+  list(APPEND AOM_LIB_TARGETS aom_static)
+endif()
+
+# Setup dependencies.
+# Call the per-component setup_*_targets() commands; the ivf_dec one is only
+# called when CONFIG_THREE_PASS is enabled, matching the conditional include
+# of ivf_dec.cmake earlier in this file.
+# NOTE(review): these commands are defined outside this file (presumably in
+# the component .cmake modules included earlier) -- confirm what each one
+# creates before changing the call order.
+if(CONFIG_THREE_PASS)
+ setup_ivf_dec_targets()
+endif()
+setup_aom_dsp_targets()
+setup_aom_mem_targets()
+setup_aom_ports_targets()
+setup_aom_util_targets()
+setup_aom_scale_targets()
+setup_av1_targets()
+
+# aom_rtcd has to build first: make every library target in AOM_LIB_TARGETS,
+# other than aom_rtcd itself, depend on it.
+foreach(lib_target ${AOM_LIB_TARGETS})
+  if("${lib_target}" STREQUAL "aom_rtcd")
+    continue()
+  endif()
+  add_dependencies(${lib_target} aom_rtcd)
+endforeach()
+
+# Generate a C file containing the function usage_exit(). Users of the
+# aom_common_app_util library must define this function. This is a convenience
+# to allow omission of the function from applications that might want to use
+# other pieces of the util support without defining usage_exit().
+#
+# The generated function simply calls exit(EXIT_FAILURE). file(WRITE) runs on
+# every configure, so the file is rewritten each time CMake is run.
+# NOTE(review): AOM_GEN_SRC_DIR is not set in this file; presumably it comes
+# from one of the included modules -- confirm.
+file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "#include <stdlib.h>\n\n#include \"common/tools_common.h\"\n\n"
+ "void usage_exit(void) { exit(EXIT_FAILURE); }\n")
+
+#
+# Application and application support targets.
+#
+# Object libraries shared by the example, test and tool executables. Each
+# library is assigned to the "examples" folder, which IDE generators use to
+# group projects (USE_FOLDERS is enabled earlier in this file).
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
+  add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+  # Name the target explicitly: nothing earlier in this file sets ${example},
+  # and with an empty target list set_property() applies FOLDER to nothing.
+  set_property(TARGET aom_common_app_util PROPERTY FOLDER examples)
+  if(CONFIG_AV1_DECODER)
+    add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+    set_property(TARGET aom_decoder_app_util PROPERTY FOLDER examples)
+    # obudec depends on internal headers that require *rtcd.h
+    add_dependencies(aom_decoder_app_util aom_rtcd)
+  endif()
+  if(CONFIG_AV1_ENCODER)
+    add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
+    set_property(TARGET aom_encoder_app_util PROPERTY FOLDER examples)
+  endif()
+endif()
+
+# Decoder application and examples; built only when both the AV1 decoder and
+# the examples are enabled. Every executable links the common and the decoder
+# app-util object libraries.
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+ add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+
+ # Optional analyzer GUI; links against the wxWidgets libraries and is added
+ # to both the app and the decoder example target lists.
+ if(CONFIG_ANALYZER)
+ add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
+ list(APPEND AOM_APP_TARGETS analyzer)
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer)
+ endif()
+
+ # Optional inspect tool; gets extra link flags when building with
+ # Emscripten.
+ if(CONFIG_INSPECTION)
+ add_executable(inspect "${AOM_ROOT}/examples/inspect.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect)
+
+ if(EMSCRIPTEN)
+ add_preproc_definition(_POSIX_SOURCE)
+ append_link_flag_to_target("inspect" "--emrun")
+ append_link_flag_to_target("inspect" "-s USE_PTHREADS=0")
+ append_link_flag_to_target("inspect" "-s WASM=1")
+ append_link_flag_to_target("inspect" "-s MODULARIZE=1")
+ append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1")
+ append_link_flag_to_target(
+ "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'")
+ append_link_flag_to_target("inspect"
+ "-s EXPORT_NAME=\"\'DecoderModule\'\"")
+ append_link_flag_to_target("inspect" "--memory-init-file 0")
+
+ # No default build type is set for Emscripten earlier in this file, so
+ # CMAKE_BUILD_TYPE can still be empty here.
+ if("${CMAKE_BUILD_TYPE}" STREQUAL "")
+
+ # Default to -O3 when no build type is specified.
+ append_compiler_flag("-O3")
+ endif()
+
+ em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js")
+ endif()
+ endif()
+
+ # Maintain a list of decoder example targets.
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops
+ scalable_decoder simple_decoder)
+
+ # Add decoder examples to the app targets list.
+ # NOTE(review): analyzer was already appended to AOM_APP_TARGETS above and
+ # is also part of AOM_DECODER_EXAMPLE_TARGETS, so it ends up in the list
+ # twice when CONFIG_ANALYZER is on -- confirm that consumers tolerate this.
+ list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS})
+endif()
+
+# Build the bundled libyuv sources as the "yuv" object library when either
+# CONFIG_LIBYUV or CONFIG_TUNE_BUTTERAUGLI is enabled, and add its include
+# directory to the header search path.
+if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI)
+ add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
+ if(NOT MSVC)
+ # Disable shadowing warnings for the third-party code (flag is not passed
+ # to MSVC).
+ target_compile_options(yuv PRIVATE -Wno-shadow)
+ endif()
+ include_directories("${AOM_ROOT}/third_party/libyuv/include")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ if(ENABLE_EXAMPLES)
+ add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>
+ $<TARGET_OBJECTS:aom_encoder_stats>)
+ add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(photon_noise_table
+ "${AOM_ROOT}/examples/photon_noise_table.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+ # Maintain a list of encoder example targets.
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
+ photon_noise_table set_maps simple_encoder scalable_encoder
+ twopass_encoder)
+
+ if(NOT BUILD_SHARED_LIBS)
+ add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc)
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc)
+ endif()
+ endif()
+
+ if(ENABLE_TOOLS)
+ if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS)
+
+ # TODO(tomfinegan): Sort out why a simple link command with
+ # aom_entropy_optimizer.c won't work on macos, but dragging in all the
+ # helper machinery allows the link to succeed.
+ add_executable(aom_entropy_optimizer
+ "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+ # Maintain a list of encoder tool targets.
+ list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer)
+ endif()
+ endif()
+
+ # Add encoder examples and tools to the targets list.
+ list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
+ ${AOM_ENCODER_TOOL_TARGETS})
+
+ # Butteraugli tuning: locate libjxl (and its dependencies) and link it into
+ # the aom library. Two discovery paths are supported:
+ #
+ # * STATIC_LINK_JXL set, or pkg-config unavailable: search for the static
+ # archives and the butteraugli.h header directly with find_library() /
+ # find_path(), and fail the configuration if any of them is missing.
+ # * otherwise: use pkg-config modules libjxl and libhwy.
+ if(CONFIG_TUNE_BUTTERAUGLI)
+ find_package(PkgConfig)
+ # Use find_library() with STATIC_LINK_JXL for static build since
+ # pkg_check_modules() with LIBJXL_STATIC is not working.
+ if(STATIC_LINK_JXL OR NOT PKG_CONFIG_FOUND)
+ find_library(LIBJXL_LIBRARIES libjxl.a)
+ find_library(LIBHWY_LIBRARIES libhwy.a)
+ find_library(LIBSKCMS_LIBRARIES libskcms.a)
+ find_library(LIBBROTLICOMMON_LIBRARIES libbrotlicommon-static.a)
+ find_library(LIBBROTLIENC_LIBRARIES libbrotlienc-static.a)
+ find_library(LIBBROTLIDEC_LIBRARIES libbrotlidec-static.a)
+ find_path(LIBJXL_INCLUDE_DIRS butteraugli.h PATH_SUFFIXES jxl)
+ if(LIBJXL_LIBRARIES
+ AND LIBHWY_LIBRARIES
+ AND LIBSKCMS_LIBRARIES
+ AND LIBBROTLICOMMON_LIBRARIES
+ AND LIBBROTLIENC_LIBRARIES
+ AND LIBBROTLIDEC_LIBRARIES
+ AND LIBJXL_INCLUDE_DIRS)
+ # message() concatenates its arguments without a separator, so every
+ # fragment except the last must end in a space to keep paths apart.
+ message(STATUS "Found JXL library: ${LIBJXL_LIBRARIES} "
+ "${LIBHWY_LIBRARIES} ${LIBSKCMS_LIBRARIES} "
+ "${LIBBROTLICOMMON_LIBRARIES} ${LIBBROTLIENC_LIBRARIES} "
+ "${LIBBROTLIDEC_LIBRARIES}")
+ message(STATUS "Found JXL include: ${LIBJXL_INCLUDE_DIRS}")
+ else()
+ message(FATAL_ERROR "JXL library not found.")
+ endif()
+ target_link_libraries(aom
+ PRIVATE ${LIBJXL_LIBRARIES} ${LIBHWY_LIBRARIES}
+ ${LIBSKCMS_LIBRARIES}
+ ${LIBBROTLIENC_LIBRARIES}
+ ${LIBBROTLIDEC_LIBRARIES}
+ ${LIBBROTLICOMMON_LIBRARIES})
+ target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS})
+ else()
+ pkg_check_modules(LIBJXL REQUIRED libjxl)
+ target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS} ${LIBJXL_LIBRARIES})
+ target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS})
+ if(LIBJXL_CFLAGS)
+ append_compiler_flag("${LIBJXL_CFLAGS}")
+ endif()
+ pkg_check_modules(LIBHWY REQUIRED libhwy)
+ target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS} ${LIBHWY_LIBRARIES})
+ # pkg_check_modules(LIBHWY ...) publishes its include paths as
+ # LIBHWY_INCLUDE_DIRS (prefix + _INCLUDE_DIRS).
+ target_include_directories(aom_dsp_encoder
+ PRIVATE ${LIBHWY_INCLUDE_DIRS})
+ if(LIBHWY_CFLAGS)
+ append_compiler_flag("${LIBHWY_CFLAGS}")
+ endif()
+ endif()
+
+ # Force the C++ linker for aom (and aom_static in shared-library builds).
+ set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
+ if(BUILD_SHARED_LIBS)
+ set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
+ endif()
+
+ # Fold the yuv object library into the aom library target(s).
+ list(APPEND AOM_LIB_TARGETS yuv)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:yuv>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:yuv>)
+ endif()
+ endif()
+
+ if(CONFIG_TFLITE)
+ include(FetchContent)
+
+ set(TFLITE_TAG "v2.6.1")
+
+ message(STATUS "Fetching TFLite ${TFLITE_TAG}...")
+
+ # static linking makes life with TFLite much easier
+ set(TFLITE_C_BUILD_SHARED_LIBS OFF)
+
+ # We don't care about comparing against these delegates (yet), and disabling
+ # it reduces compile time meaningfully
+ set(TFLITE_ENABLE_RUY OFF)
+ set(TFLITE_ENABLE_XNNPACK OFF)
+
+ fetchcontent_declare(tflite
+ GIT_REPOSITORY https://github.com/tensorflow/tensorflow
+ GIT_TAG ${TFLITE_TAG}
+ GIT_SHALLOW TRUE)
+
+ fetchcontent_getproperties(tflite)
+ if(NOT tflite_POPULATED)
+ fetchcontent_populate(tflite)
+ # Some of the subprojects (e.g. Eigen) are very noisy and emit status
+ # messages all the time. Temporarily ignore status messages while adding
+ # this to silence it. Ugly but effective.
+ set(OLD_CMAKE_MESSAGE_LOG_LEVEL ${CMAKE_MESSAGE_LOG_LEVEL})
+ set(CMAKE_MESSAGE_LOG_LEVEL WARNING)
+ add_subdirectory(${tflite_SOURCE_DIR}/tensorflow/lite/c
+ ${tflite_BINARY_DIR})
+ set(CMAKE_MESSAGE_LOG_LEVEL ${OLD_CMAKE_MESSAGE_LOG_LEVEL})
+ endif()
+
+ # Disable some noisy warnings in tflite
+ target_compile_options(tensorflow-lite PRIVATE -w)
+
+ # tensorflowlite_c is implicitly declared by this FetchContent
+ include_directories(${tflite_SOURCE_DIR})
+ target_link_libraries(aom PRIVATE tensorflow-lite)
+ endif()
+
+ if(CONFIG_TUNE_VMAF)
+ find_package(PkgConfig)
+ if(PKG_CONFIG_FOUND)
+ pkg_check_modules(VMAF REQUIRED libvmaf)
+ if(BUILD_SHARED_LIBS)
+ target_link_libraries(aom_static
+ PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES})
+ endif()
+ target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES})
+ target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+ if(VMAF_CFLAGS)
+ foreach(flag "${VMAF_CFLAGS}")
+ append_compiler_flag("${flag}")
+ endforeach()
+ endif()
+ else()
+ message(FATAL_ERROR "CONFIG_TUNE_VMAF error: pkg-config not found.")
+ endif()
+ set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX)
+ if(BUILD_SHARED_LIBS)
+ set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX)
+ endif()
+ endif()
+endif()
+
+if(ENABLE_EXAMPLES)
+
+ # Maintain a separate variable listing only the examples to facilitate
+ # installation of example programs into an examples sub directory of
+ # $AOM_DIST_DIR/bin when building the dist target.
+ list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}
+ ${AOM_ENCODER_EXAMPLE_TARGETS})
+endif()
+
+if(ENABLE_TOOLS)
+ if(CONFIG_AV1_DECODER)
+ add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/tools/dump_obu.cc"
+ "${AOM_ROOT}/tools/obu_parser.cc"
+ "${AOM_ROOT}/tools/obu_parser.h"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+
+ list(APPEND AOM_TOOL_TARGETS dump_obu)
+ list(APPEND AOM_APP_TARGETS dump_obu)
+
+ # Maintain a separate variable listing only the tools to facilitate
+ # installation of tool programs into a tools sub directory of
+ # $AOM_DIST_DIR/bin when building the dist target.
+ list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS}
+ ${AOM_ENCODER_TOOL_TARGETS})
+ endif()
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref)
+ list(APPEND AOM_APP_TARGETS aom_cx_set_ref)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER)
+ add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder)
+ list(APPEND AOM_APP_TARGETS lightfield_encoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_tile_list_decoder
+ "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder)
+ list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder)
+ list(APPEND AOM_APP_TARGETS lightfield_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_bitstream_parsing
+ "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing)
+ list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing)
+endif()
+
+foreach(aom_app ${AOM_APP_TARGETS})
+ target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom)
+endforeach()
+
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
+ if(CONFIG_LIBYUV)
+ # Add to existing targets.
+ foreach(aom_app ${AOM_APP_TARGETS})
+ target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
+ set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+ endforeach()
+ endif()
+
+ if(CONFIG_WEBM_IO)
+ add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES})
+ include_directories("${AOM_ROOT}/third_party/libwebm")
+ target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS)
+ target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS)
+
+ if(NOT MSVC)
+ target_compile_options(webm PRIVATE -Wno-shadow)
+ endif()
+
+ # Add to existing targets.
+ if(CONFIG_AV1_DECODER)
+ target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES})
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES})
+ endif()
+
+ foreach(aom_app ${AOM_APP_TARGETS})
+ target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>)
+ set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+ endforeach()
+ endif()
+endif()
+
+if(ENABLE_TESTS)
+
+ # Create test_libaom target and the targets it depends on.
+ setup_aom_test_targets()
+endif()
+
+# When HAVE_PTHREAD_H and CONFIG_MULTITHREAD are both set, link the imported
+# Threads::Threads target (provided by find_package(Threads)) into aom, and
+# also into aom_static for BUILD_SHARED_LIBS builds.
+# NOTE(review): find_package(Threads) is not REQUIRED here; presumably the
+# HAVE_PTHREAD_H check guarantees it succeeds -- confirm.
+if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
+ find_package(Threads)
+ target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads)
+ if(BUILD_SHARED_LIBS)
+ target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads)
+ endif()
+endif()
+
+if(XCODE)
+
+ # TODO(tomfinegan): Make sure target has no C++ files before doing this as
+ # it's not necessary in that case.
+ if(CONFIG_LIBYUV OR CONFIG_WEBM_IO)
+
+ # The Xcode generator does not obey LINKER_LANGUAGE. Because of the issue
+ # what looks like a C++ file needs to be in any target that Xcode will link
+ # when the target contains a C++ dependency. Without this Xcode will try to
+ # link with the C linker, which always ends badly when a dependency actually
+ # includes C++.
+
+ # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched
+ # here, it really is the Xcode generator's fault, or just a deficiency in
+ # Xcode itself.
+ foreach(aom_app ${AOM_APP_TARGETS})
+ add_no_op_source_file_to_target("${aom_app}" "cc")
+ endforeach()
+ endif()
+endif()
+
+if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
+
+ # For historical purposes place the example binaries in the example directory.
+ file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples")
+
+ foreach(target ${AOM_EXAMPLE_TARGETS})
+ if(NOT "${target}" MATCHES "aomdec\|aomenc")
+ set_target_properties(${target}
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${AOM_CONFIG_DIR}/examples")
+ endif()
+ endforeach()
+
+ if(ENABLE_TOOLS AND AOM_TOOL_TARGETS)
+
+ # The same expectation is true for tool targets.
+ file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools")
+ set_target_properties(${AOM_TOOL_TARGETS}
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${AOM_CONFIG_DIR}/tools")
+ endif()
+endif()
+
+if(BUILD_SHARED_LIBS)
+ # Don't use -Wl,-z,defs with Clang's sanitizers.
+ #
+ # Clang's AddressSanitizer documentation says "When linking shared libraries,
+ # the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link
+ # errors (don't use it with AddressSanitizer)." See
+ # https://clang.llvm.org/docs/AddressSanitizer.html#usage.
+ if(NOT WIN32
+ AND NOT APPLE
+ AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE))
+ # The -z defs linker option reports unresolved symbol references from object
+ # files when building a shared library.
+ if("${CMAKE_VERSION}" VERSION_LESS "3.13")
+ # target_link_options() is not available before CMake 3.13.
+ target_link_libraries(aom PRIVATE -Wl,-z,defs)
+ else()
+ target_link_options(aom PRIVATE LINKER:-z,defs)
+ endif()
+ endif()
+
+ include("${AOM_ROOT}/build/cmake/exports.cmake")
+ setup_exports_target()
+endif()
+
+# Do not allow implicit vector type conversions on Clang builds (this is already
+# the default on GCC builds).
+if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ append_compiler_flag("-flax-vector-conversions=none")
+endif()
+
+# Handle user supplied compile and link flags last to ensure they're obeyed.
+set_user_flags()
+
+# Aomedia documentation rule.
+#
+# When ENABLE_DOCS is set, look for Doxygen and require version 1.8.10 or
+# newer. The detected version is folded into a single integer,
+# DOXYGEN_VERSION_VALUE = major * 1000000 + minor * 1000 + patch, which stays
+# at 0 when no version could be determined. If Doxygen is missing or too old,
+# ENABLE_DOCS is switched off with a message instead of failing the build.
+set(DOXYGEN_VERSION_VALUE 0)
+if(ENABLE_DOCS)
+ find_package(Doxygen)
+ if(DOXYGEN_FOUND)
+ # Check if Doxygen version is >= minimum required version(i.e. 1.8.10).
+ set(MINIMUM_DOXYGEN_VERSION 1008010)
+
+ if(DOXYGEN_VERSION)
+ # Strip SHA1 from version string if present.
+ string(REGEX
+ REPLACE "^([0-9]+\\.[0-9]+\\.[0-9]+).*" "\\1" DOXYGEN_VERSION
+ "${DOXYGEN_VERSION}")
+ # Replace dots with semicolons to create a list.
+ string(REGEX REPLACE "\\." ";" DOXYGEN_VERSION_LIST "${DOXYGEN_VERSION}")
+ # Parse version components from the list.
+ list(GET DOXYGEN_VERSION_LIST 0 DOXYGEN_MAJOR)
+ list(GET DOXYGEN_VERSION_LIST 1 DOXYGEN_MINOR)
+ list(GET DOXYGEN_VERSION_LIST 2 DOXYGEN_PATCH)
+
+ # Construct a version value for comparison. This is done only when a
+ # version string was available: with the components unset, math(EXPR)
+ # would receive a malformed expression and abort the configuration.
+ math(EXPR DOXYGEN_MAJOR "${DOXYGEN_MAJOR}*1000000")
+ math(EXPR DOXYGEN_MINOR "${DOXYGEN_MINOR}*1000")
+ math(EXPR DOXYGEN_VERSION_VALUE
+ "${DOXYGEN_MAJOR} + ${DOXYGEN_MINOR} + ${DOXYGEN_PATCH}")
+ endif()
+
+ # An unknown version leaves DOXYGEN_VERSION_VALUE at 0 and is therefore
+ # treated the same as a too-old Doxygen.
+ if(${DOXYGEN_VERSION_VALUE} LESS ${MINIMUM_DOXYGEN_VERSION})
+ set(DOXYGEN_FOUND NO)
+ endif()
+ endif()
+
+ if(DOXYGEN_FOUND)
+ include("${AOM_ROOT}/docs.cmake")
+ setup_documentation_targets()
+ else()
+ message(
+ "--- Cannot find doxygen(version 1.8.10 or newer), ENABLE_DOCS turned off."
+ )
+ set(ENABLE_DOCS OFF)
+ endif()
+endif()
+
+# Aomedia dist rule.
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+ list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomdec>)
+endif()
+if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES)
+ list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomenc>)
+endif()
+
+if(ENABLE_EXAMPLES)
+ foreach(example ${AOM_EXAMPLE_TARGETS})
+ list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
+ set_property(TARGET ${example} PROPERTY FOLDER examples)
+ endforeach()
+endif()
+
+if(ENABLE_TOOLS)
+ foreach(tool ${AOM_TOOL_TARGETS})
+ list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
+ set_property(TARGET ${tool} PROPERTY FOLDER tools)
+ endforeach()
+endif()
+
+if(NOT AOM_DIST_DIR)
+ set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist")
+endif()
+
+add_custom_target(dist
+ COMMAND ${CMAKE_COMMAND}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_DIST_DIR=${AOM_DIST_DIR}
+ -DAOM_DIST_APPS="${AOM_DIST_APPS}"
+ -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}"
+ -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}"
+ -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}"
+ -DAOM_DIST_LIBS=$<TARGET_FILE:aom>
+ -DENABLE_DOCS=${ENABLE_DOCS} -P
+ "${AOM_ROOT}/build/cmake/dist.cmake"
+ DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS}
+ ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS}
+ ${AOM_TOOL_TARGETS})
+
+if(ENABLE_DOCS)
+ add_dependencies(dist docs)
+endif()
+
+# Collect all variables containing libaom source files.
+get_cmake_property(all_cmake_vars VARIABLES)
+foreach(var ${all_cmake_vars})
+ if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_"
+ AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST"
+ AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER")
+ list(APPEND aom_source_vars ${var})
+ endif()
+endforeach()
+
+if(NOT CONFIG_AV1_DECODER)
+ list(FILTER aom_source_vars EXCLUDE REGEX "_DECODER_")
+endif()
+
+# Libaom_srcs.txt generation.
+set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt")
+file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n")
+
+# Static source file list first.
+foreach(aom_source_var ${aom_source_vars})
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+ if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder")
+ continue()
+ endif()
+ file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+ endif()
+ endforeach()
+endforeach()
+
+file(APPEND "${libaom_srcs_txt_file}"
+ "# Files below this line are generated by the libaom build system.\n")
+foreach(aom_source_var ${aom_source_vars})
+ foreach(file ${${aom_source_var}})
+ if("${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}")
+ file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+ endif()
+ endforeach()
+endforeach()
+
+# Libaom_srcs.gni generation.
+set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni")
+file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+ if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+ string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+ file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n")
+ endif()
+
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file
+ "${file}")
+ if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder")
+ continue()
+ endif()
+ file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n")
+ endif()
+ endforeach()
+
+ if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+ file(APPEND "${libaom_srcs_gni_file}" "]\n")
+ endif()
+endforeach()
+
+file(APPEND "${libaom_srcs_gni_file}"
+ "\n# Files below this line are generated by the libaom build system.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+ if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+ string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+ file(APPEND "${libaom_srcs_gni_file}"
+ "\n${aom_source_var_lowercase}_gen = [\n")
+ endif()
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_ROOT}")
+ string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom"
+ file "${file}")
+ file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n")
+ endif()
+ endforeach()
+
+ if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+ file(APPEND "${libaom_srcs_gni_file}" "]\n")
+ endif()
+endforeach()
+
+# Generate aom.pc and setup install rule.
+setup_aom_install_targets()
diff --git a/third_party/aom/LICENSE b/third_party/aom/LICENSE
new file mode 100644
index 0000000000..fc340c3764
--- /dev/null
+++ b/third_party/aom/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016, Alliance for Open Media. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/third_party/aom/PATENTS b/third_party/aom/PATENTS
new file mode 100644
index 0000000000..fc4de9edf8
--- /dev/null
+++ b/third_party/aom/PATENTS
@@ -0,0 +1,108 @@
+Alliance for Open Media Patent License 1.0
+
+1. License Terms.
+
+1.1. Patent License. Subject to the terms and conditions of this License, each
+ Licensor, on behalf of itself and successors in interest and assigns,
+ grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+ no-charge, royalty-free, irrevocable (except as expressly stated in this
+ License) patent license to its Necessary Claims to make, use, sell, offer
+ for sale, import or distribute any Implementation.
+
+1.2. Conditions.
+
+1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
+ sell, offer for sale, import or distribute an Implementation under
+ Section 1.1, Licensee must make its Necessary Claims available under
+ this License, and must reproduce this License with any Implementation
+ as follows:
+
+ a. For distribution in source code, by including this License in the
+ root directory of the source code with its Implementation.
+
+ b. For distribution in any other form (including binary, object form,
+ and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+ GDSII, etc.)), by including this License in the documentation, legal
+ notices, and/or other written materials provided with the
+ Implementation.
+
+1.2.2. Additional Conditions. This license is directly from Licensor to
+ Licensee. Licensee acknowledges as a condition of benefiting from it
+ that no rights from Licensor are received from suppliers, distributors,
+ or otherwise in connection with this License.
+
+1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
+ initiates patent litigation or files, maintains, or voluntarily
+ participates in a lawsuit against another entity or any person asserting
+ that any Implementation infringes Necessary Claims, any patent licenses
+ granted under this License directly to the Licensee are immediately
+ terminated as of the date of the initiation of action unless 1) that suit
+ was in response to a corresponding suit regarding an Implementation first
+ brought against an initiating entity, or 2) that suit was brought to
+ enforce the terms of this License (including intervention in a third-party
+ action by a Licensee).
+
+1.4. Disclaimers. The Reference Implementation and Specification are provided
+ "AS IS" and without warranty. The entire risk as to implementing or
+ otherwise using the Reference Implementation or Specification is assumed
+ by the implementer and user. Licensor expressly disclaims any warranties
+ (express, implied, or otherwise), including implied warranties of
+ merchantability, non-infringement, fitness for a particular purpose, or
+ title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+ ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+ INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+ ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+ OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+ NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. Definitions.
+
+2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
+ Controls, is Controlled by, or is under common Control of that party.
+
+2.2. Control. "Control" means direct or indirect control of more than 50% of
+ the voting power to elect directors of that corporation, or for any other
+ entity, the power to direct management of such entity.
+
+2.3. Decoder. "Decoder" means any decoder that conforms fully with all
+ non-optional portions of the Specification.
+
+2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
+ be decoded by a Decoder only to the extent it produces such a bitstream.
+
+2.5. Final Deliverable. "Final Deliverable" means the final version of a
+ deliverable approved by the Alliance for Open Media as a Final
+ Deliverable.
+
+2.6. Implementation. "Implementation" means any implementation, including the
+ Reference Implementation, that is an Encoder and/or a Decoder. An
+ Implementation also includes components of an Implementation only to the
+ extent they are used as part of an Implementation.
+
+2.7. License. "License" means this license.
+
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
+ rights granted under this License.
+
+2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
+ for sale, imports or distributes any Implementation, or (ii) a person
+ or entity that has a licensing obligation to the Implementation as a
+ result of its membership and/or participation in the Alliance for Open
+ Media working group that developed the Specification.
+
+2.10. Necessary Claims. "Necessary Claims" means all claims of patents or
+ patent applications, (a) that currently or at any time in the future,
+ are owned or controlled by the Licensor, and (b) (i) would be an
+ Essential Claim as defined by the W3C Policy as of February 5, 2004
+ (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+ as if the Specification was a W3C Recommendation; or (ii) are infringed
+ by the Reference Implementation.
+
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
+ and/or Decoder released by the Alliance for Open Media as a Final
+ Deliverable.
+
+2.12. Specification. "Specification" means the specification designated by
+ the Alliance for Open Media as a Final Deliverable for which this
+ License was issued.
+
diff --git a/third_party/aom/README.md b/third_party/aom/README.md
new file mode 100644
index 0000000000..4e2eb2756c
--- /dev/null
+++ b/third_party/aom/README.md
@@ -0,0 +1,677 @@
+README.md {#LREADME}
+=========
+# AV1 Codec Library
+
+## Contents
+1. [Building the lib and applications](#building-the-library-and-applications)
+ - [Prerequisites](#prerequisites)
+ - [Get the code](#get-the-code)
+ - [Basics](#basic-build)
+ - [Configuration options](#configuration-options)
+ - [Dylib builds](#dylib-builds)
+ - [Debugging](#debugging)
+ - [Cross compiling](#cross-compiling)
+ - [Sanitizer support](#sanitizers)
+ - [MSVC builds](#microsoft-visual-studio-builds)
+ - [Xcode builds](#xcode-builds)
+ - [Emscripten builds](#emscripten-builds)
+ - [Extra Build Flags](#extra-build-flags)
+ - [Build with VMAF support](#build-with-vmaf)
+2. [Testing the library](#testing-the-av1-codec)
+ - [Basics](#testing-basics)
+ - [Unit tests](#unit-tests)
+ - [Example tests](#example-tests)
+ - [Encoder tests](#encoder-tests)
+ - [IDE hosted tests](#ide-hosted-tests)
+ - [Downloading test data](#downloading-the-test-data)
+ - [Adding a new test data file](#adding-a-new-test-data-file)
+ - [Additional test data](#additional-test-data)
+ - [Sharded testing](#sharded-testing)
+ - [Running tests directly](#running-test_libaom-directly)
+ - [Running tests via CMake](#running-the-tests-via-the-cmake-build)
+3. [Coding style](#coding-style)
+4. [Submitting patches](#submitting-patches)
+ - [Login cookie](#login-cookie)
+ - [Contributor agreement](#contributor-agreement)
+ - [Testing your code](#testing-your-code)
+ - [Commit message hook](#commit-message-hook)
+ - [Upload your change](#upload-your-change)
+ - [Incorporating Reviewer Comments](#incorporating-reviewer-comments)
+ - [Submitting your change](#submitting-your-change)
+ - [Viewing change status](#viewing-the-status-of-uploaded-changes)
+5. [Support](#support)
+6. [Bug reports](#bug-reports)
+
+## Building the library and applications {#building-the-library-and-applications}
+
+### Prerequisites {#prerequisites}
+
+ 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version
+ required.
+ 2. [Git](https://git-scm.com/).
+ 3. [Perl](https://www.perl.org/).
+ 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a
+ recent version of [nasm](http://www.nasm.us/). If you download yasm with
+ the intention to work with Visual Studio, please download win32.exe or
+ win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe.
+ 5. Building the documentation requires
+ [doxygen version 1.8.10 or newer](http://doxygen.org).
+ 6. Emscripten builds require the portable
+ [EMSDK](https://kripken.github.io/emscripten-site/index.html).
+
+### Get the code {#get-the-code}
+
+The AV1 library source code is stored in the Alliance for Open Media Git
+repository:
+
+~~~
+ $ git clone https://aomedia.googlesource.com/aom
+ # By default, the above command stores the source in the aom directory:
+ $ cd aom
+~~~
+
+### Basic build {#basic-build}
+
+CMake replaces the configure step typical of many projects. Running CMake will
+produce configuration and build files for the currently selected CMake
+generator. For most systems the default generator is Unix Makefiles. The basic
+form of a makefile build is the following:
+
+~~~
+ $ cmake path/to/aom
+ $ make
+~~~
+
+The above will generate a makefile build that produces the AV1 library and
+applications for the current host system after the make step completes
+successfully. The compiler chosen varies by host platform, but a general rule
+applies: On systems where cc and c++ are present in $PATH at the time CMake is
+run the generated build will use cc and c++ by default.
+
+### Configuration options {#configuration-options}
+
+The AV1 codec library has a great many configuration options. These come in two
+varieties:
+
+ 1. Build system configuration options. These have the form `ENABLE_FEATURE`.
+ 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`.
+
+Both types of options are set at the time CMake is run. The following example
+enables ccache and disables the AV1 encoder:
+
+~~~
+ $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0
+ $ make
+~~~
+
+The available configuration options are too numerous to list here. Build system
+configuration options can be found at the top of the CMakeLists.txt file found
+in the root of the AV1 repository, and AV1 codec configuration options can
+currently be found in the file `build/cmake/aom_config_defaults.cmake`.
+
+### Dylib builds {#dylib-builds}
+
+A dylib (shared object) build of the AV1 codec library can be enabled via the
+CMake built-in variable `BUILD_SHARED_LIBS`:
+
+~~~
+ $ cmake path/to/aom -DBUILD_SHARED_LIBS=1
+ $ make
+~~~
+
+This is currently only supported on non-Windows targets.
+
+### Debugging {#debugging}
+
+Depending on the generator used there are multiple ways of going about
+debugging AV1 components. For single configuration generators like the Unix
+Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient:
+
+~~~
+ $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug
+~~~
+
+For Xcode, mainly because configuration controls for Xcode builds are buried two
+configuration windows deep and must be set for each subproject within the Xcode
+IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug:
+
+~~~
+ $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug
+~~~
+
+For Visual Studio the in-IDE configuration controls should be used. Simply set
+the IDE project configuration to Debug to allow for stepping through the code.
+
+In addition to the above it can sometimes be useful to debug only C and C++
+code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to
+generic at generation time:
+
+~~~
+ $ cmake path/to/aom -DAOM_TARGET_CPU=generic
+~~~
+
+### Cross compiling {#cross-compiling}
+
+For the purposes of building the AV1 codec and applications and relative to the
+scope of this guide, all builds for architectures differing from the native host
+architecture will be considered cross compiles. The AV1 CMake build handles
+cross compiling via the use of toolchain files included in the AV1 repository.
+The toolchain files available at the time of this writing are:
+
+ - arm64-ios.cmake
+ - arm64-linux-clang.cmake
+ - arm64-linux-gcc.cmake
+ - arm64-mingw-gcc.cmake
+ - armv7-ios.cmake
+ - armv7-linux-gcc.cmake
+ - armv7-mingw-gcc.cmake
+ - armv7s-ios.cmake
+ - ppc-linux-gcc.cmake
+ - riscv-linux-gcc.cmake
+ - x86-ios-simulator.cmake
+ - x86-linux.cmake
+ - x86-macos.cmake
+ - x86-mingw-gcc.cmake
+ - x86\_64-ios-simulator.cmake
+ - x86\_64-mingw-gcc.cmake
+
+The following example demonstrates use of the x86-macos.cmake toolchain file on
+a x86\_64 MacOS host:
+
+~~~
+ $ cmake path/to/aom \
+ -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake
+ $ make
+~~~
+
+To build for an unlisted target creation of a new toolchain file is the best
+solution. The existing toolchain files can be used as a starting point for a new
+toolchain file since each one exposes the basic requirements for toolchain files
+as used in the AV1 codec build.
+
+As a temporary workaround, an unoptimized AV1 configuration that builds only C
+and C++ sources can be produced using the following commands:
+
+~~~
+ $ cmake path/to/aom -DAOM_TARGET_CPU=generic
+ $ make
+~~~
+
+In addition to the above it's important to note that the toolchain files
+suffixed with gcc behave differently than the others. These toolchain files
+attempt to obey the $CROSS environment variable.
+
+### Sanitizers {#sanitizers}
+
+Sanitizer integration is built into the CMake build system. To enable a
+sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to
+enable address sanitizer:
+
+~~~
+ $ cmake path/to/aom -DSANITIZE=address
+ $ make
+~~~
+
+Sanitizers available vary by platform, target, and compiler. Consult your
+compiler documentation to determine which, if any, are available.
+
+### Microsoft Visual Studio builds {#microsoft-visual-studio-builds}
+
+Building the AV1 codec library in Microsoft Visual Studio is supported. Visual
+Studio 2019 (16.0) or later is required. The following example demonstrates
+generating projects and a solution for the Microsoft IDE:
+
+~~~
+ # This does not require a bash shell; Command Prompt (cmd.exe) is fine.
+ # This assumes the build host is a Windows x64 computer.
+
+ # To create a Visual Studio 2022 solution for the x64 target:
+ $ cmake path/to/aom -G "Visual Studio 17 2022"
+
+ # To create a Visual Studio 2022 solution for the 32-bit x86 target:
+ $ cmake path/to/aom -G "Visual Studio 17 2022" -A Win32
+
+ # To create a Visual Studio 2019 solution for the x64 target:
+ $ cmake path/to/aom -G "Visual Studio 16 2019"
+
+ # To create a Visual Studio 2019 solution for the 32-bit x86 target:
+ $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32
+
+ # To build the solution:
+ $ cmake --build .
+~~~
+
+NOTE: The build system targets Windows 7 or later by compiling files with
+`-D_WIN32_WINNT=0x0601`.
+
+### Xcode builds {#xcode-builds}
+
+Building the AV1 codec library in Xcode is supported. The following example
+demonstrates generating an Xcode project:
+
+~~~
+ $ cmake path/to/aom -G Xcode
+~~~
+
+### Emscripten builds {#emscripten-builds}
+
+Building the AV1 codec library with Emscripten is supported. Typically this is
+used to hook into the AOMAnalyzer GUI application. These instructions focus on
+using the inspector with AOMAnalyzer, but all tools can be built with
+Emscripten.
+
+It is assumed here that you have already downloaded and installed the EMSDK,
+installed and activated at least one toolchain, and set up your environment
+appropriately using the emsdk\_env script.
+
+1. Build [AOM Analyzer](https://github.com/xiph/aomanalyzer).
+
+2. Configure the build:
+
+~~~
+ $ cmake path/to/aom \
+ -DENABLE_CCACHE=1 \
+ -DAOM_TARGET_CPU=generic \
+ -DENABLE_DOCS=0 \
+ -DENABLE_TESTS=0 \
+ -DCONFIG_ACCOUNTING=1 \
+ -DCONFIG_INSPECTION=1 \
+ -DCONFIG_MULTITHREAD=0 \
+ -DCONFIG_RUNTIME_CPU_DETECT=0 \
+ -DCONFIG_WEBM_IO=0 \
+ -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake
+~~~
+
+3. Build it: run make if that's your generator of choice:
+
+~~~
+ $ make inspect
+~~~
+
+4. Run the analyzer:
+
+~~~
+ # inspect.js is in the examples sub directory of the directory in which you
+ # executed cmake.
+ $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file
+~~~
+
+### Extra build flags {#extra-build-flags}
+
+Three variables allow for passing of additional flags to the build system.
+
+- AOM\_EXTRA\_C\_FLAGS
+- AOM\_EXTRA\_CXX\_FLAGS
+- AOM\_EXTRA\_EXE\_LINKER\_FLAGS
+
+The build system attempts to ensure the flags passed through the above variables
+are passed to tools last in order to allow for override of default behavior.
+These flags can be used, for example, to enable asserts in a release build:
+
+~~~
+ $ cmake path/to/aom \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DAOM_EXTRA_C_FLAGS=-UNDEBUG \
+ -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
+~~~
+
+### Build with VMAF support {#build-with-vmaf}
+
+After installing
+[libvmaf.a](https://github.com/Netflix/vmaf/tree/master/libvmaf),
+you can use it with the encoder:
+
+~~~
+ $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1
+~~~
+
+Please note that the default VMAF model
+("/usr/local/share/model/vmaf_v0.6.1.json")
+will be used unless you set the following flag when running the encoder:
+
+~~~
+ # --vmaf-model-path=path/to/model
+~~~
+
+## Testing the AV1 codec {#testing-the-av1-codec}
+
+### Testing basics {#testing-basics}
+
+There are several methods of testing the AV1 codec. All of these methods require
+the presence of the AV1 source code and a working build of the AV1 library and
+applications.
+
+#### 1. Unit tests: {#unit-tests}
+
+The unit tests can be run at build time:
+
+~~~
+ # Before running the make command the LIBAOM_TEST_DATA_PATH environment
+ # variable should be set to avoid downloading the test files to the
+ # cmake build configuration directory.
+ $ cmake path/to/aom
+ # Note: The AV1 CMake build creates many test targets. Running make
+ # with multiple jobs will speed up the test run significantly.
+ $ make runtests
+~~~
+
+#### 2. Example tests: {#example-tests}
+
+The example tests require a bash shell and can be run in the following manner:
+
+~~~
+ # See the note above about LIBAOM_TEST_DATA_PATH.
+ $ cmake path/to/aom
+ $ make
+ # It's best to build the testdata target using many make jobs.
+ # Running it like this will verify and download (if necessary)
+ # one at a time, which takes a while.
+ $ make testdata
+ $ path/to/aom/test/examples.sh --bin-path examples
+~~~
+
+#### 3. Encoder tests: {#encoder-tests}
+
+When making a change to the encoder run encoder tests to confirm that your
+change has a positive or negligible impact on encode quality. When running these
+tests the build configuration should be changed to enable internal encoder
+statistics:
+
+~~~
+ $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1
+ $ make
+~~~
+
+The repository contains scripts intended to make running these tests as simple
+as possible. The following example demonstrates creating a set of baseline clips
+for comparison to results produced after making your change to libaom:
+
+~~~
+ # This will encode all Y4M files in the current directory using the
+ # settings specified to create the encoder baseline statistical data:
+ $ cd path/to/test/inputs
+ # This command line assumes that run_encodes.sh, its helper script
+ # best_encode.sh, and the aomenc you intend to test are all within a
+ # directory in your PATH.
+ $ run_encodes.sh 200 500 50 baseline
+~~~
+
+After making your change and creating the baseline clips, you'll need to run
+encodes that include your change(s) to confirm that things are working as
+intended:
+
+~~~
+ # This will encode all Y4M files in the current directory using the
+ # settings specified to create the statistical data for your change:
+ $ cd path/to/test/inputs
+ # This command line assumes that run_encodes.sh, its helper script
+ # best_encode.sh, and the aomenc you intend to test are all within a
+ # directory in your PATH.
+ $ run_encodes.sh 200 500 50 mytweak
+~~~
+
+After creating both data sets you can use `test/visual_metrics.py` to generate a
+report that can be viewed in a web browser:
+
+~~~
+ $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \
+ > mytweak.html
+~~~
+
+You can view the report by opening mytweak.html in a web browser.
+
+
+### IDE hosted tests {#ide-hosted-tests}
+
+By default the generated project files created by CMake will not include the
+runtests and testdata rules when generating for IDEs like Microsoft Visual
+Studio and Xcode. This is done to avoid intolerably long build cycles in the
+IDEs-- IDE behavior is to build all targets when selecting the build project
+options in MSVS and Xcode. To enable the test rules in IDEs the
+`ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time:
+
+~~~
+ # This example uses Xcode. To get a list of the generators
+ # available, run cmake with the -G argument missing its
+ # value.
+ $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode
+~~~
+
+### Downloading the test data {#downloading-the-test-data}
+
+The fastest and easiest way to obtain the test data is to use CMake to generate
+a build using the Unix Makefiles generator, and then to build only the testdata
+rule. By default the test files will be downloaded to the current directory. The
+`LIBAOM_TEST_DATA_PATH` environment variable can be used to set a
+custom one.
+
+~~~
+ $ cmake path/to/aom -G "Unix Makefiles"
+ # 28 is used because there are 28 test files as of this writing.
+ $ make -j28 testdata
+~~~
+
+The above make command will only download and verify the test data.
+
+### Adding a new test data file {#adding-a-new-test-data-file}
+
+First, add the new test data file to the `aom-test-data` bucket of the
+`aomedia-testing` project on Google Cloud Platform. You may need to ask someone
+with the necessary access permissions to do this for you.
+
+NOTE: When a new test data file is added to the `aom-test-data` bucket, its
+"Public access" is initially "Not public". We need to change its
+"Public access" to "Public" by using the following
+[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command:
+~~~
+ $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name
+~~~
+This command grants the `AllUsers` group READ access to the file named
+"test-data-file-name" in the `aom-test-data` bucket.
+
+Once the new test data file has been added to `aom-test-data`, create a CL to
+add the name of the new test data file to `test/test_data_util.cmake` and add
+the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1
+checksum of a file can be calculated by running the `sha1sum` command on the
+file.)
+
+### Additional test data {#additional-test-data}
+
+The test data mentioned above is strictly intended for unit testing.
+
+Additional input data for testing the encoder can be obtained from:
+https://media.xiph.org/video/derf/
+
+### Sharded testing {#sharded-testing}
+
+The AV1 codec library unit tests are built upon gtest which supports sharding of
+test jobs. Sharded test runs can be achieved in a couple of ways.
+
+#### 1. Running test\_libaom directly: {#running-test_libaom-directly}
+
+~~~
+ # Set the environment variable GTEST_TOTAL_SHARDS to control the number of
+ # shards.
+ $ export GTEST_TOTAL_SHARDS=10
+ # (GTEST shard indexing is 0 based).
+ $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \
+ | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom
+~~~
+
+To create a test shard for each CPU core available on the current system set
+`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one.
+
+#### 2. Running the tests via the CMake build: {#running-the-tests-via-the-cmake-build}
+
+~~~
+ # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See
+ # the IDE hosted tests section above for more information. If the IDE
+ # supports building targets concurrently tests will be sharded by default.
+
+ # For make and ninja builds the -j parameter controls the number of shards
+ # at test run time. This example will run the tests using 10 shards via
+ # make.
+ $ make -j10 runtests
+~~~
+
+The maximum number of test targets that can run concurrently is determined by
+the number of CPUs on the system where the build is configured as detected by
+CMake. A system with 24 cores can run 24 test shards using a value of 24 with
+the `-j` parameter. When CMake is unable to detect the number of cores 10 shards
+is the default maximum value.
+
+## Coding style {#coding-style}
+
+We are using the Google C Coding Style defined by the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+The coding style used by this project is enforced with clang-format using the
+configuration contained in the
+[.clang-format](https://chromium.googlesource.com/webm/aom/+/main/.clang-format)
+file in the root of the repository.
+
+You can download clang-format using your system's package manager, or directly
+from [llvm.org](http://llvm.org/releases/download.html). You can also view the
+[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org.
+Output from clang-format varies by clang-format version, for best results your
+version should match the one used on Jenkins. You can find the clang-format
+version by reading the comment in the `.clang-format` file linked above.
+
+Before pushing changes for review you can format your code with:
+
+~~~
+ # Apply clang-format to modified .c, .h and .cc files
+ $ clang-format -i --style=file \
+ $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
+~~~
+
+Check the .clang-format file for the version used to generate it if there is any
+difference between your local formatting and the review system.
+
+Some Git installations have clang-format integration. Here are some examples:
+
+~~~
+ # Apply clang-format to all staged changes:
+ $ git clang-format
+
+ # Clang format all staged and unstaged changes:
+ $ git clang-format -f
+
+ # Clang format all staged and unstaged changes interactively:
+ $ git clang-format -f -p
+~~~
+
+## Submitting patches {#submitting-patches}
+
+We manage the submission of patches using the
+[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool
+implements a workflow on top of the Git version control system to ensure that
+all changes get peer reviewed and tested prior to their distribution.
+
+### Login cookie {#login-cookie}
+
+Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with
+your account (Gmail credentials, for example). Next, follow the
+`Generate Password` link at the top of the page. You’ll be given
+instructions for creating a cookie to use with our Git repos.
+
+You must also have a Gerrit account associated with your Google account. To do
+this visit the [Gerrit review server](https://aomedia-review.googlesource.com)
+and click "Sign in" (top right).
+
+### Contributor agreement {#contributor-agreement}
+
+You will be required to execute a
+[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia
+Project has the right to distribute your changes.
+
+Note: If you are pushing changes on behalf of an Alliance for Open Media member
+organization this step is not necessary.
+
+### Testing your code {#testing-your-code}
+
+The testing basics are covered in the [testing section](#testing-the-av1-codec)
+above.
+
+In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run
+through Jenkins instances upon upload to gerrit.
+
+### Commit message hook {#commit-message-hook}
+
+Gerrit requires that each submission include a unique Change-Id. You can assign
+one manually using git commit --amend, but it’s easier to automate it with the
+commit-msg hook provided by Gerrit.
+
+Copy commit-msg to the `.git/hooks` directory of your local repo. Here's an
+example:
+
+~~~
+ $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg
+
+ # Next, ensure that the downloaded commit-msg script is executable:
+ $ chmod u+x aom/.git/hooks/commit-msg
+~~~
+
+See the Gerrit
+[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html)
+for more information.
+
+### Upload your change {#upload-your-change}
+
+The command line to upload your patch looks like this:
+
+~~~
+ $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/main
+~~~
+
+### Incorporating reviewer comments {#incorporating-reviewer-comments}
+
+If you previously uploaded a change to Gerrit and the Approver has asked for
+changes, follow these steps:
+
+1. Edit the files to make the changes the reviewer has requested.
+2. Recommit your edits using the --amend flag, for example:
+
+~~~
+ $ git commit -a --amend
+~~~
+
+3. Use the same git push command as above to upload to Gerrit again for another
+ review cycle.
+
+In general, you should not rebase your changes when doing updates in response to
+review. Doing so can make it harder to follow the evolution of your change in
+the diff view.
+
+### Submitting your change {#submitting-your-change}
+
+Once your change has been Approved and Verified, you can “submit” it through the
+Gerrit UI. This will usually automatically rebase your change onto the branch
+specified.
+
+Sometimes this can’t be done automatically. If you run into this problem, you
+must rebase your changes manually:
+
+~~~
+ $ git fetch
+ $ git rebase origin/branchname
+~~~
+
+If there are any conflicts, resolve them as you normally would with Git. When
+you’re done, reupload your change.
+
+### Viewing the status of uploaded changes {#viewing-the-status-of-uploaded-changes}
+
+To check the status of a change that you uploaded, open
+[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My >
+Changes.
+
+## Support {#support}
+
+This library is an open source project supported by its community. Please
+email aomediacodec@jointdevelopment.kavi.com for help.
+
+## Bug reports {#bug-reports}
+
+Bug reports can be filed in the Alliance for Open Media
+[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list).
diff --git a/third_party/aom/Sample.cfg b/third_party/aom/Sample.cfg
new file mode 100644
index 0000000000..d5dbe66415
--- /dev/null
+++ b/third_party/aom/Sample.cfg
@@ -0,0 +1,35 @@
+#sample config file
+super_block_size = 128 # super block size. 0, 64 or 128
+max_partition_size = 128 # max partition size(8, 16, 32, 64, 128)
+min_partition_size = 4 # min partition size(4, 8, 16, 32, 64)
+disable_rect_partition_type = 0 # disable rectangle partition type
+disable_ab_partition_type = 0 # disable AB partition type
+disable_1to4_partition_type = 0 # disable 1 to 4 and 4 to 1 partition type
+disable_intra_angle_delta = 0 # disable intra angle delta
+disable_paeth_intra = 0 # disable paeth intra
+disable_smooth_intra = 0 # disable intra smooth mode
+disable_intra_edge_filter = 0 # disable intra edge filter
+disable_filter_intra = 0 # disable filter intra
+disable_intrabc = 0 # disable Intra Block Copy
+disable_cfl = 0 # disable chroma from luma prediction
+disable_palette = 0 # disable Palette
+disable_flip_idtx = 0 # disable flip and identity transform
+disable_tx_64x64 = 0 # disable 64x64 transform
+reduced_tx_type_set = 0 # use reduced transform type set
+reduced_reference_set = 0 # use reduced reference frame set
+disable_obmc = 0 # disable OBMC
+disable_warp_motion = 0 # disable Warped Motion
+disable_global_motion = 0 # disable global motion
+disable_ref_frame_mv = 0 # disable ref mv
+disable_dual_filter = 0 # disable dual interpolation filter
+disable_one_sided_comp = 0 # disable one sided compound mode
+disable_masked_comp = 0 # disable masked compound prediction
+disable_diff_wtd_comp = 0 # disable difference weighted compound mode
+disable_inter_inter_wedge = 0 # disable inter/inter wedge comp
+disable_dist_wtd_comp = 0 # disable distance weighted compound mode
+disable_inter_intra_comp = 0 # disable inter/intra compound mode.
+disable_inter_intra_wedge = 0 # disable inter/intra wedge comp
+disable_smooth_inter_intra = 0 # disable smooth inter/intra
+disable_cdef = 0 # disable CDEF filter
+disable_lr = 0 # disable Loop Restoration Filter
+disable_trellis_quant = 0 # disable trellis quantization \ No newline at end of file
diff --git a/third_party/aom/aom/aom.h b/third_party/aom/aom/aom.h
new file mode 100644
index 0000000000..0650a11f6b
--- /dev/null
+++ b/third_party/aom/aom/aom.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom AOM
+ * \ingroup codecs
+ * AOM is aom's newest video compression algorithm that uses motion
+ * compensated prediction, Discrete Cosine Transform (DCT) coding of the
+ * prediction error signal and context dependent entropy coding techniques
+ * based on arithmetic principles. It features:
+ * - YUV 4:2:0 image format
+ * - Macro-block based coding (16x16 luma plus two 8x8 chroma)
+ * - 1/4 (1/8) pixel accuracy motion compensated prediction
+ * - 4x4 DCT transform
+ * - 128 level linear quantizer
+ * - In loop deblocking filter
+ * - Context-based entropy coding
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides controls common to both the AOM encoder and decoder.
+ */
+#ifndef AOM_AOM_AOM_H_
+#define AOM_AOM_AOM_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Control functions
+ *
+ * This set of enumerators defines the control functions of the AOM interface.
+ * The range for common control IDs is 230-255 (max).
+ */
+enum aom_com_control_id {
+ /*!\brief Codec control function to get a pointer to a reference frame
+ *
+ * av1_ref_frame_t* parameter
+ */
+ AV1_GET_REFERENCE = 230,
+
+ /*!\brief Codec control function to write a frame into a reference buffer
+ *
+ * av1_ref_frame_t* parameter
+ */
+ AV1_SET_REFERENCE = 231,
+
+ /*!\brief Codec control function to get a copy of reference frame from the
+ * decoder
+ *
+ * av1_ref_frame_t* parameter
+ */
+ AV1_COPY_REFERENCE = 232,
+
+ /*!\brief Codec control function to get a pointer to the new frame
+ *
+ * aom_image_t* parameter
+ */
+ AV1_GET_NEW_FRAME_IMAGE = 233,
+
+ /*!\brief Codec control function to copy the new frame to an external buffer
+ *
+ * aom_image_t* parameter
+ */
+ AV1_COPY_NEW_FRAME_IMAGE = 234,
+
+ /*!\brief Start point of control IDs for aom_dec_control_id.
+ * Any new common control IDs should be added above.
+ */
+ AOM_DECODER_CTRL_ID_START = 256
+ // No common control IDs should be added after AOM_DECODER_CTRL_ID_START.
+};
+
+/*!\brief AV1 specific reference frame data struct
+ *
+ * Defines the data structure used to access AV1 reference frames.
+ */
+typedef struct av1_ref_frame {
+ int idx; /**< frame index to get (input) */
+ int use_external_ref; /**< Directly use external ref buffer (decoder only) */
+ aom_image_t img; /**< img structure to populate (output) */
+} av1_ref_frame_t;
+
+/*!\cond */
+/*!\brief aom decoder control function parameter type
+ *
+ * Defines the data type that each AOM decoder control function requires.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
+ */
+AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_GET_REFERENCE
+
+AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_SET_REFERENCE
+
+AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_COPY_REFERENCE
+
+AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
+
+AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE
+
+/*!\endcond */
+/*! @} - end defgroup aom */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_H_
diff --git a/third_party/aom/aom/aom_codec.h b/third_party/aom/aom/aom_codec.h
new file mode 100644
index 0000000000..d5b8790a98
--- /dev/null
+++ b/third_party/aom/aom/aom_codec.h
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// Internal implementation details
+///////////////////////////////////////////////////////////////////////////////
+//
+// There are two levels of interfaces used to access the AOM codec: the
+// aom_codec_iface and the aom_codec_ctx.
+//
+// 1. aom_codec_iface_t
+// (Related files: aom/aom_codec.h, aom/src/aom_codec.c,
+// aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c,
+// av1/av1_dx_iface.c)
+//
+// Used to initialize the codec context, which contains the configuration for
+// modifying the encoder/decoder during run-time. See the other
+// documentation in this header file for more details. For the most part,
+// users will call helper functions, such as aom_codec_iface_name,
+// aom_codec_get_caps, etc., to interact with it.
+//
+// The main purpose of the aom_codec_iface_t is to provide a way to generate
+// a default codec config, find out what capabilities the implementation has,
+// and create an aom_codec_ctx_t (which is actually used to interact with the
+// codec).
+//
+// Note that the implementations for the AV1 algorithm are located in
+// av1/av1_cx_iface.c and av1/av1_dx_iface.c
+//
+//
+// 2. aom_codec_ctx_t
+// (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c,
+// aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c)
+//
+// The actual interface between user code and the codec. It stores the name
+// of the codec, a pointer back to the aom_codec_iface_t that initialized it,
+// initialization flags, a config for either encoder or the decoder, and a
+// pointer to internal data.
+//
+// The codec is configured / queried through calls to aom_codec_control,
+// which takes a control ID (listed in aomcx.h and aomdx.h) and a parameter.
+// In the case of "getter" control IDs, the parameter is modified to have
+// the requested value; in the case of "setter" control IDs, the codec's
+// configuration is changed based on the parameter. Note that an
+// aom_codec_err_t is returned, indicating whether the operation succeeded.
+//
+// Note that for the encoder, the aom_codec_alg_priv_t points to the
+// aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder,
+// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored
+// here and also used in the core algorithm.
+//
+// At the end, aom_codec_destroy should be called for each initialized
+// aom_codec_ctx_t.
+
+/*!\defgroup codec Common Algorithm Interface
+ * This abstraction allows applications to easily support multiple video
+ * formats with minimal code duplication. This section describes the interface
+ * common to all codecs (both encoders and decoders).
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the codec algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video codec algorithm.
+ *
+ * An application instantiates a specific codec instance by using
+ * aom_codec_dec_init() or aom_codec_enc_init() and a pointer to the
+ * algorithm's interface structure:
+ * <pre>
+ * my_app.c:
+ * extern aom_codec_iface_t my_codec;
+ * {
+ * aom_codec_ctx_t algo;
+ * int threads = 4;
+ * aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ * res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ * }
+ * </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_AOM_CODEC_H_
+#define AOM_AOM_AOM_CODEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+
+/*!\brief Decorator indicating a function is deprecated */
+#ifndef AOM_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define AOM_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define AOM_DEPRECATED
+#else
+#define AOM_DEPRECATED
+#endif
+#endif /* AOM_DEPRECATED */
+
+#ifndef AOM_DECLSPEC_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
+#elif defined(_MSC_VER)
+/*!\brief \copydoc #AOM_DEPRECATED */
+#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated)
+#else
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
+#endif
+#endif /* AOM_DECLSPEC_DEPRECATED */
+
+/*!\brief Decorator indicating a function is potentially unused */
+#ifdef AOM_UNUSED
+#elif defined(__GNUC__) || defined(__clang__)
+#define AOM_UNUSED __attribute__((unused))
+#else
+#define AOM_UNUSED
+#endif
+
+/*!\brief Decorator indicating that given struct/union/enum is packed */
+#ifndef ATTRIBUTE_PACKED
+#if defined(__GNUC__) && __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((packed))
+#elif defined(_MSC_VER)
+#define ATTRIBUTE_PACKED
+#else
+#define ATTRIBUTE_PACKED
+#endif
+#endif /* ATTRIBUTE_PACKED */
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+
+/*!\brief Algorithm return codes */
+typedef enum {
+ /*!\brief Operation completed without error */
+ AOM_CODEC_OK,
+
+ /*!\brief Unspecified error */
+ AOM_CODEC_ERROR,
+
+ /*!\brief Memory operation failed */
+ AOM_CODEC_MEM_ERROR,
+
+ /*!\brief ABI version mismatch */
+ AOM_CODEC_ABI_MISMATCH,
+
+ /*!\brief Algorithm does not have required capability */
+ AOM_CODEC_INCAPABLE,
+
+ /*!\brief The given bitstream is not supported.
+ *
+ * The bitstream was unable to be parsed at the highest level. The decoder
+ * is unable to proceed. This error \ref SHOULD be treated as fatal to the
+ * stream. */
+ AOM_CODEC_UNSUP_BITSTREAM,
+
+ /*!\brief Encoded bitstream uses an unsupported feature
+ *
+ * The decoder does not implement a feature required by the encoder. This
+ * return code should only be used for features that prevent future
+ * pictures from being properly decoded. This error \ref MAY be treated as
+ * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
+ */
+ AOM_CODEC_UNSUP_FEATURE,
+
+ /*!\brief The coded data for this stream is corrupt or incomplete
+ *
+ * There was a problem decoding the current frame. This return code
+ * should only be used for failures that prevent future pictures from
+ * being properly decoded. This error \ref MAY be treated as fatal to the
+ * stream or \ref MAY be treated as fatal to the current GOP. If decoding
+ * is continued for the current GOP, artifacts may be present.
+ */
+ AOM_CODEC_CORRUPT_FRAME,
+
+ /*!\brief An application-supplied parameter is not valid.
+ *
+ */
+ AOM_CODEC_INVALID_PARAM,
+
+ /*!\brief An iterator reached the end of list.
+ *
+ */
+ AOM_CODEC_LIST_END
+
+} aom_codec_err_t;
+
+/*! \brief Codec capabilities bitfield
+ *
+ * Each codec advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
+ * or functionality, and are not required to be supported.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+typedef long aom_codec_caps_t;
+#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
+#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow for
+ * proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+typedef long aom_codec_flags_t;
+
+/*!\brief Time Stamp Type
+ *
+ * An integer, which when multiplied by the stream's time base, provides
+ * the absolute time of a sample.
+ */
+typedef int64_t aom_codec_pts_t;
+
+/*!\brief Codec interface structure.
+ *
+ * Contains function pointers and other data private to the codec
+ * implementation. This structure is opaque to the application. Common
+ * functions used with this structure:
+ * - aom_codec_iface_name(aom_codec_iface_t *iface): get the
+ * name of the codec
+ * - aom_codec_get_caps(aom_codec_iface_t *iface): returns
+ * the capabilities of the codec
+ * - aom_codec_enc_config_default: generate the default config for
+ * initializing the encoder (see documentation in aom_encoder.h)
+ * - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context
+ * structure (see documentation on aom_codec_ctx).
+ *
+ * To get access to the AV1 encoder and decoder, use aom_codec_av1_cx() and
+ * aom_codec_av1_dx().
+ */
+typedef const struct aom_codec_iface aom_codec_iface_t;
+
+/*!\brief Codec private data structure.
+ *
+ * Contains data private to the codec implementation. This structure is opaque
+ * to the application.
+ */
+typedef struct aom_codec_priv aom_codec_priv_t;
+
+/*!\brief Compressed Frame Flags
+ *
+ * This type represents a bitfield containing information about a compressed
+ * frame that may be useful to an application. The most significant 16 bits
+ * can be used by an algorithm to provide additional detail, for example to
+ * support frame types that are codec specific (MPEG-1 D-frames for example)
+ */
+typedef uint32_t aom_codec_frame_flags_t;
+#define AOM_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
+/*!\brief frame can be dropped without affecting the stream (no future frame
+ * depends on this one) */
+#define AOM_FRAME_IS_DROPPABLE 0x2u
+/*!\brief this is an INTRA_ONLY frame */
+#define AOM_FRAME_IS_INTRAONLY 0x10u
+/*!\brief this is an S-frame */
+#define AOM_FRAME_IS_SWITCH 0x20u
+/*!\brief this is an error-resilient frame */
+#define AOM_FRAME_IS_ERROR_RESILIENT 0x40u
+/*!\brief this is a key-frame dependent recovery-point frame */
+#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80u
+
+/*!\brief Iterator
+ *
+ * Opaque storage used for iterating over lists.
+ */
+typedef const void *aom_codec_iter_t;
+
+/*!\brief Codec context structure
+ *
+ * All codecs \ref MUST support this context structure fully. In general,
+ * this data should be considered private to the codec algorithm, and
+ * not be manipulated or examined by the calling application. Applications
+ * may reference the 'name' member to get a printable description of the
+ * algorithm.
+ */
+typedef struct aom_codec_ctx {
+ const char *name; /**< Printable interface name */
+ aom_codec_iface_t *iface; /**< Interface pointers */
+ aom_codec_err_t err; /**< Last returned error */
+ const char *err_detail; /**< Detailed info, if available */
+ aom_codec_flags_t init_flags; /**< Flags passed at init time */
+ union {
+ /** Decoder Configuration Pointer */
+ const struct aom_codec_dec_cfg *dec;
+ /** Encoder Configuration Pointer */
+ const struct aom_codec_enc_cfg *enc;
+ const void *raw; /**< Untyped alias of the configuration pointer */
+ } config; /**< Configuration pointer aliasing union */
+ aom_codec_priv_t *priv; /**< Algorithm private storage */
+} aom_codec_ctx_t;
+
+/*!\brief Bit depth for codec
+ *
+ * This enumeration determines the bit depth of the codec.
+ */
+typedef enum aom_bit_depth {
+ AOM_BITS_8 = 8, /**< 8 bits */
+ AOM_BITS_10 = 10, /**< 10 bits */
+ AOM_BITS_12 = 12, /**< 12 bits */
+} aom_bit_depth_t;
+
+/*!\brief Superblock size selection.
+ *
+ * Defines the superblock size used for encoding. The superblock size can
+ * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+ * selected by the encoder for each frame.
+ */
+typedef enum aom_superblock_size {
+ AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */
+ AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */
+ AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */
+} aom_superblock_size_t;
+
+/*
+ * Library Version Number Interface
+ *
+ * For example, see the following sample return values:
+ * aom_codec_version() (1<<16 | 2<<8 | 3)
+ * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba"
+ * aom_codec_version_extra_str() "rc1-16-gec6a1ba"
+ */
+
+/*!\brief Return the version information (as an integer)
+ *
+ * Returns a packed encoding of the library version number. This will only
+ * include the major.minor.patch component of the version number. Note that this
+ * encoded value should be accessed through the macros provided, as the encoding
+ * may change in the future.
+ *
+ */
+int aom_codec_version(void);
+
+/*!\brief Return the major version number */
+#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)
+
+/*!\brief Return the minor version number */
+#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff)
+
+/*!\brief Return the patch version number */
+#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff)
+
+/*!\brief Return the version information (as a string)
+ *
+ * Returns a printable string containing the full library version number. This
+ * may contain additional text following the three digit version number, to
+ * indicate release candidates, pre-release versions, etc.
+ *
+ */
+const char *aom_codec_version_str(void);
+
+/*!\brief Return the version information (as a string)
+ *
+ * Returns a printable "extra string". This is the component of the string
+ * returned by aom_codec_version_str() following the three digit version number.
+ *
+ */
+const char *aom_codec_version_extra_str(void);
+
+/*!\brief Return the build configuration
+ *
+ * Returns a printable string containing an encoded version of the build
+ * configuration. This may be useful to aom support.
+ *
+ */
+const char *aom_codec_build_config(void);
+
+/*!\brief Return the name for a given interface
+ *
+ * Returns a human readable string for name of the given codec interface.
+ *
+ * \param[in] iface Interface pointer
+ *
+ */
+const char *aom_codec_iface_name(aom_codec_iface_t *iface);
+
+/*!\brief Convert error number to printable string
+ *
+ * Returns a human readable string describing the given error number. The
+ * returned string will be one line and will not contain any newline
+ * characters.
+ *
+ *
+ * \param[in] err Error number.
+ *
+ */
+const char *aom_codec_err_to_string(aom_codec_err_t err);
+
+/*!\brief Retrieve error synopsis for codec context
+ *
+ * Returns a human readable string for the last error returned by the
+ * algorithm. The returned string will be one line and will not contain
+ * any newline characters.
+ *
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ *
+ */
+const char *aom_codec_error(const aom_codec_ctx_t *ctx);
+
+/*!\brief Retrieve detailed error information for codec context
+ *
+ * Returns a human readable string providing detailed information about
+ * the last error. The returned string is only valid until the next
+ * aom_codec_* function call (except aom_codec_error and
+ * aom_codec_error_detail) on the codec context.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ *
+ * \retval NULL
+ * No detailed information is available.
+ */
+const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx);
+
+/* REQUIRED FUNCTIONS
+ *
+ * The following functions are required to be implemented for all codecs.
+ * They represent the base case functionality expected of all codecs.
+ */
+
+/*!\brief Destroy a codec instance
+ *
+ * Destroys a codec context, freeing any associated memory buffers.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \retval #AOM_CODEC_OK
+ * The codec instance has been destroyed.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * ctx is a null pointer.
+ * \retval #AOM_CODEC_ERROR
+ * Codec context not initialized.
+ */
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx);
+
+/*!\brief Get the capabilities of an algorithm.
+ *
+ * Retrieves the capabilities bitfield from the algorithm's interface.
+ *
+ * \param[in] iface Pointer to the algorithm interface
+ *
+ */
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
+
+/*!\name Codec Control
+ *
+ * The aom_codec_control function exchanges algorithm specific data with the
+ * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is
+ * provided, which will type-check the parameter against the control ID before
+ * calling aom_codec_control - note that this macro requires the control ID
+ * to be directly encoded in it, e.g.,
+ * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8).
+ *
+ * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h
+ * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id).
+ * @{
+ */
+/*!\brief Algorithm Control
+ *
+ * aom_codec_control takes a context, a control ID, and a third parameter
+ * (with varying type). If the context is non-null and an error occurs,
+ * ctx->err will be set to the same value as the return value.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] ctrl_id Algorithm specific control identifier.
+ * Must be nonzero.
+ *
+ * \retval #AOM_CODEC_OK
+ * The control request was processed.
+ * \retval #AOM_CODEC_ERROR
+ * The control request was not processed.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * The control ID was zero, or the data was not valid.
+ */
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...);
+
+/*!\brief Key & Value API
+ *
+ * aom_codec_set_option() takes a context, a key (option name) and a value. If
+ * the context is non-null and an error occurs, ctx->err will be set to the same
+ * value as the return value.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] name The name of the option (key)
+ * \param[in] value The value of the option
+ *
+ * \retval #AOM_CODEC_OK
+ * The value of the option was set.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * The data was not valid.
+ * \retval #AOM_CODEC_ERROR
+ * The option was not successfully set.
+ */
+aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name,
+ const char *value);
+
+/*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible)
+ *
+ * This macro allows for type safe conversions across the variadic parameter
+ * to aom_codec_control(). However, it requires the explicit control ID
+ * be passed in (it cannot be passed in via a variable) -- otherwise a compiler
+ * error will occur. After the type checking, it calls aom_codec_control.
+ */
+#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \
+ aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/
+
+/*!\brief Creates type checking mechanisms for aom_codec_control
+ *
+ * It defines a static function with the correctly typed arguments as a wrapper
+ * to the type-unsafe aom_codec_control function. It also creates a typedef
+ * for each type.
+ */
+#define AOM_CTRL_USE_TYPE(id, typ) \
+ static aom_codec_err_t aom_codec_control_typechecked_##id( \
+ aom_codec_ctx_t *, int, typ) AOM_UNUSED; \
+ static aom_codec_err_t aom_codec_control_typechecked_##id( \
+ aom_codec_ctx_t *ctx, int ctrl, typ data) { \
+ return aom_codec_control(ctx, ctrl, data); \
+ } /**<\hideinitializer*/ \
+ typedef typ aom_codec_control_type_##id;
+/*!@} end Codec Control group */
+
+/*!\brief OBU types. */
+typedef enum ATTRIBUTE_PACKED {
+ OBU_SEQUENCE_HEADER = 1,
+ OBU_TEMPORAL_DELIMITER = 2,
+ OBU_FRAME_HEADER = 3,
+ OBU_TILE_GROUP = 4,
+ OBU_METADATA = 5,
+ OBU_FRAME = 6,
+ OBU_REDUNDANT_FRAME_HEADER = 7,
+ OBU_TILE_LIST = 8,
+ OBU_PADDING = 15,
+} OBU_TYPE;
+
+/*!\brief OBU metadata types. */
+typedef enum {
+ OBU_METADATA_TYPE_AOM_RESERVED_0 = 0,
+ OBU_METADATA_TYPE_HDR_CLL = 1,
+ OBU_METADATA_TYPE_HDR_MDCV = 2,
+ OBU_METADATA_TYPE_SCALABILITY = 3,
+ OBU_METADATA_TYPE_ITUT_T35 = 4,
+ OBU_METADATA_TYPE_TIMECODE = 5,
+} OBU_METADATA_TYPE;
+
+/*!\brief Returns string representation of OBU_TYPE.
+ *
+ * \param[in] type The OBU_TYPE to convert to string.
+ */
+const char *aom_obu_type_to_string(OBU_TYPE type);
+
+/*!@} - end defgroup codec*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_CODEC_H_
diff --git a/third_party/aom/aom/aom_decoder.h b/third_party/aom/aom/aom_decoder.h
new file mode 100644
index 0000000000..229cf7358f
--- /dev/null
+++ b/third_party/aom/aom/aom_decoder.h
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_DECODER_H_
+#define AOM_AOM_AOM_DECODER_H_
+
+/*!\defgroup decoder Decoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this decoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all decoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video decoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_codec.h" // IWYU pragma: export
+#include "aom/aom_frame_buffer.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_DECODER_ABI_VERSION \
+ (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+
+/*! \brief Decoder capabilities bitfield
+ *
+ * Each decoder advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
+ * or functionality, and are not required to be supported by a decoder.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+/*!\brief Can support external frame buffers */
+#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow for
+ * proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+
+/*!\brief Stream properties
+ *
+ * This structure is used to query or set properties of the decoded
+ * stream.
+ */
+typedef struct aom_codec_stream_info {
+ unsigned int w; /**< Width (or 0 for unknown/default) */
+ unsigned int h; /**< Height (or 0 for unknown/default) */
+ unsigned int is_kf; /**< Current frame is a keyframe */
+ unsigned int number_spatial_layers; /**< Number of spatial layers */
+ unsigned int number_temporal_layers; /**< Number of temporal layers */
+ unsigned int is_annexb; /**< Is Bitstream in Annex-B format */
+} aom_codec_stream_info_t;
+
+/* REQUIRED FUNCTIONS
+ *
+ * The following functions are required to be implemented for all decoders.
+ * They represent the base case functionality expected of all decoders.
+ */
+
+/*!\brief Initialization Configurations
+ *
+ * This structure is used to pass init time configuration options to the
+ * decoder.
+ */
+typedef struct aom_codec_dec_cfg {
+ unsigned int threads; /**< Maximum number of threads to use, default 1 */
+ unsigned int w; /**< Width */
+ unsigned int h; /**< Height */
+ unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */
+} aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */
+
+/*!\brief Initialize a decoder instance
+ *
+ * Initializes a decoder context using the given interface. Applications
+ * should call the aom_codec_dec_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this
+ * call is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known. May be NULL.
+ * \param[in] flags Bitfield of AOM_CODEC_USE_* flags
+ * \param[in] ver ABI version number. Must be set to
+ * AOM_DECODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ * The decoder algorithm has been initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_dec_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_dec_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_dec_init(ctx, iface, cfg, flags) \
+ aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)
+
+/*!\brief Parse stream info from a buffer
+ *
+ * Performs high level parsing of the bitstream. Construction of a decoder
+ * context is not necessary. Can be used to determine if the bitstream is
+ * of the proper format, and to extract information from the stream.
+ *
+ * \param[in] iface Pointer to the algorithm interface
+ * \param[in] data Pointer to a block of data to parse
+ * \param[in] data_sz Size of the data buffer
+ * \param[in,out] si Pointer to stream info to update. The is_annexb
+ * member \ref MUST be properly initialized. This
+ * function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One of the arguments is invalid, for example a NULL pointer.
+ * \retval #AOM_CODEC_UNSUP_BITSTREAM
+ * The decoder didn't recognize the coded data, or the
+ * buffer was too short.
+ */
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+ const uint8_t *data, size_t data_sz,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] si Pointer to stream info to update.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One of the arguments is invalid, for example a NULL pointer.
+ * \retval #AOM_CODEC_UNSUP_BITSTREAM
+ * The decoder couldn't parse the submitted data.
+ */
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Decode data
+ *
+ * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS
+ * (decode time stamp) order. Frames produced will always be in PTS
+ * (presentation time stamp) order.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] data Pointer to this block of new coded data.
+ * \param[in] data_sz Size of the coded data, in bytes.
+ * \param[in] user_priv Application specific data to associate with
+ * this frame.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ * and future pictures can be decoded without error. Otherwise,
+ * see the descriptions of the other error codes in ::aom_codec_err_t
+ * for recoverability capabilities.
+ */
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+ size_t data_sz, void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ * produced will always be in PTS (presentation time stamp) order.
+ */
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);
+
+/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+ *
+ * The following function is required to be implemented for all decoders
+ * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+ * Calling this function for codecs that don't advertise this capability
+ * will result in an error code being returned, usually AOM_CODEC_INCAPABLE.
+ * @{
+ */
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb_get Pointer to the get callback function
+ * \param[in] cb_release Pointer to the release callback function
+ * \param[in] cb_priv Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Algorithm not capable of using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers.
+ */
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+ aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+/*!@} - end defgroup cap_external_frame_buffer */
+
+/*!@} - end defgroup decoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_DECODER_H_
diff --git a/third_party/aom/aom/aom_encoder.h b/third_party/aom/aom/aom_encoder.h
new file mode 100644
index 0000000000..6a6254dafe
--- /dev/null
+++ b/third_party/aom/aom/aom_encoder.h
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_ENCODER_H_
+#define AOM_AOM_AOM_ENCODER_H_
+
+/*!\defgroup encoder Encoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this encoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all encoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the encoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video encoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_codec.h" // IWYU pragma: export
+#include "aom/aom_external_partition.h"
+
+/*!\brief Current ABI version number
+ *
+ * \hideinitializer
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ *
+ * Note: In the definition of AOM_ENCODER_ABI_VERSION, 3 is the value of
+ * AOM_EXT_PART_ABI_VERSION in libaom v3.2.0. The old value of
+ * AOM_EXT_PART_ABI_VERSION is used so as to not break the ABI version check in
+ * aom_codec_enc_init_ver() when an application compiled against libaom v3.2.0
+ * passes the old value of AOM_ENCODER_ABI_VERSION to aom_codec_enc_init_ver().
+ * The external partition API is still experimental. When it is declared stable,
+ * we will replace 3 with AOM_EXT_PART_ABI_VERSION in the definition of
+ * AOM_ENCODER_ABI_VERSION.
+ */
+#define AOM_ENCODER_ABI_VERSION \
+ (10 + AOM_CODEC_ABI_VERSION + /*AOM_EXT_PART_ABI_VERSION=*/3)
+
+/*! \brief Encoder capabilities bitfield
+ *
+ * Each encoder advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra
+ * interfaces or functionality, and are not required to be supported
+ * by an encoder.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
+
+/*! Can support input images at greater than 8 bitdepth.
+ */
+#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow
+ * for proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
+#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
+
+/*!\brief Generic fixed size buffer structure
+ *
+ * This structure is able to hold a reference to any fixed size buffer.
+ */
+typedef struct aom_fixed_buf {
+ void *buf; /**< Pointer to the data. Does NOT own the data! */
+ size_t sz; /**< Length of the buffer, in chars */
+} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
+
+/*!\brief Error Resilient flags
+ *
+ * These flags define which error resilient features to enable in the
+ * encoder. The flags are specified through the
+ * aom_codec_enc_cfg::g_error_resilient variable.
+ */
+typedef uint32_t aom_codec_er_flags_t;
+/*!\brief Improve resiliency against losses of whole frames */
+#define AOM_ERROR_RESILIENT_DEFAULT 0x1
+
+/*!\brief Encoder output packet variants
+ *
+ * This enumeration lists the different kinds of data packets that can be
+ * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY
+ * extend this list to provide additional functionality.
+ */
+enum aom_codec_cx_pkt_kind {
+ AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */
+ AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */
+ AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
+ AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
+ AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */
+};
+
+/*!\brief Encoder output packet
+ *
+ * This structure contains the different kinds of output data the encoder
+ * may produce while compressing a frame.
+ */
+typedef struct aom_codec_cx_pkt {
+ enum aom_codec_cx_pkt_kind kind; /**< packet variant */
+ union {
+ struct {
+ void *buf; /**< compressed data buffer */
+ size_t sz; /**< length of compressed data */
+ /*!\brief time stamp to show frame (in timebase units) */
+ aom_codec_pts_t pts;
+ /*!\brief duration to show frame (in timebase units) */
+ unsigned long duration;
+ aom_codec_frame_flags_t flags; /**< flags for this frame */
+ /*!\brief the partition id defines the decoding order of the partitions.
+ * Only applicable when "output partition" mode is enabled. First
+ * partition has id 0.*/
+ int partition_id;
+ /*!\brief size of the visible frame in this packet */
+ size_t vis_frame_size;
+ } frame; /**< data for compressed frame packet */
+ aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */
+ aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
+ struct aom_psnr_pkt {
+ unsigned int samples[4]; /**< Number of samples, total/y/u/v */
+ uint64_t sse[4]; /**< sum squared error, total/y/u/v */
+ double psnr[4]; /**< PSNR, total/y/u/v */
+ /*!\brief Number of samples, total/y/u/v when
+ * input bit-depth < stream bit-depth.*/
+ unsigned int samples_hbd[4];
+ /*!\brief sum squared error, total/y/u/v when
+ * input bit-depth < stream bit-depth.*/
+ uint64_t sse_hbd[4];
+ /*!\brief PSNR, total/y/u/v when
+ * input bit-depth < stream bit-depth.*/
+ double psnr_hbd[4];
+ } psnr; /**< data for PSNR packet */
+ aom_fixed_buf_t raw; /**< data for arbitrary packets */
+ } data; /**< packet data */
+} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
+
+/*!\brief Rational Number
+ *
+ * This structure holds a fractional value.
+ */
+typedef struct aom_rational {
+ int num; /**< fraction numerator */
+ int den; /**< fraction denominator */
+} aom_rational_t; /**< alias for struct aom_rational */
+
+/*!\brief Multi-pass Encoding Pass
+ *
+ * AOM_RC_LAST_PASS is kept for backward compatibility.
+ * If passes is not given and pass==2, the codec will assume passes=2.
+ * For new code, it is recommended to use AOM_RC_SECOND_PASS and set
+ * the "passes" member to 2 via the key & val API for two-pass encoding.
+ */
+enum aom_enc_pass {
+ AOM_RC_ONE_PASS = 0, /**< Single pass mode */
+ AOM_RC_FIRST_PASS = 1, /**< First pass of multi-pass mode */
+ AOM_RC_SECOND_PASS = 2, /**< Second pass of multi-pass mode */
+ AOM_RC_THIRD_PASS = 3, /**< Third pass of multi-pass mode */
+ AOM_RC_LAST_PASS = 2, /**< Final pass of two-pass mode */
+};
+
+/*!\brief Rate control mode */
+enum aom_rc_mode {
+ AOM_VBR, /**< Variable Bit Rate (VBR) mode */
+ AOM_CBR, /**< Constant Bit Rate (CBR) mode */
+ AOM_CQ, /**< Constrained Quality (CQ) mode */
+ AOM_Q, /**< Constant Quality (Q) mode */
+};
+
+/*!\brief Keyframe placement mode.
+ *
+ * This enumeration determines whether keyframes are placed automatically by
+ * the encoder or whether this behavior is disabled. Older releases of this
+ * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled.
+ * This name is confusing for this behavior, so the new symbols to be used
+ * are AOM_KF_AUTO and AOM_KF_DISABLED.
+ */
+enum aom_kf_mode {
+ AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */
+ AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */
+ AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
+};
+
+/*!\brief Frame super-resolution mode. */
+typedef enum {
+ /*!\brief Frame super-resolution is disabled for all frames. */
+ AOM_SUPERRES_NONE,
+ /*!\brief All frames are coded at the specified scale and super-resolved. */
+ AOM_SUPERRES_FIXED,
+ /*!\brief All frames are coded at a random scale and super-resolved. */
+ AOM_SUPERRES_RANDOM,
+ /*!\brief Super-resolution scale for each frame is determined based on the q
+ * index of that frame. */
+ AOM_SUPERRES_QTHRESH,
+ /*!\brief Full-resolution or super-resolution and the scale (in case of
+ * super-resolution) are automatically selected for each frame. */
+ AOM_SUPERRES_AUTO,
+} aom_superres_mode;
+
+/*!\brief Encoder Config Options
+ *
+ * This type allows enumerating and controlling the flags defined for encoder
+ * control via a config file at runtime.
+ */
+typedef struct cfg_options {
+ /*!\brief Indicate init by cfg file
+ * 0 or 1
+ */
+ unsigned int init_by_cfg_file;
+ /*!\brief Superblock size
+ * 0, 64 or 128
+ */
+ unsigned int super_block_size;
+ /*!\brief max partition size
+ * 8, 16, 32, 64, 128
+ */
+ unsigned int max_partition_size;
+ /*!\brief min partition size
+ * 8, 16, 32, 64, 128
+ */
+ unsigned int min_partition_size;
+ /*!\brief disable AB Shape partition type
+ *
+ */
+ unsigned int disable_ab_partition_type;
+ /*!\brief disable rectangular partition type
+ *
+ */
+ unsigned int disable_rect_partition_type;
+ /*!\brief disable 1:4/4:1 partition type
+ *
+ */
+ unsigned int disable_1to4_partition_type;
+ /*!\brief disable flip and identity transform type
+ *
+ */
+ unsigned int disable_flip_idtx;
+ /*!\brief disable CDEF filter
+ *
+ */
+ unsigned int disable_cdef;
+ /*!\brief disable Loop Restoration Filter
+ *
+ */
+ unsigned int disable_lr;
+ /*!\brief disable OBMC
+ *
+ */
+ unsigned int disable_obmc;
+ /*!\brief disable Warped Motion
+ *
+ */
+ unsigned int disable_warp_motion;
+ /*!\brief disable global motion
+ *
+ */
+ unsigned int disable_global_motion;
+ /*!\brief disable dist weighted compound
+ *
+ */
+ unsigned int disable_dist_wtd_comp;
+ /*!\brief disable diff weighted compound
+ *
+ */
+ unsigned int disable_diff_wtd_comp;
+ /*!\brief disable inter/intra compound
+ *
+ */
+ unsigned int disable_inter_intra_comp;
+ /*!\brief disable masked compound
+ *
+ */
+ unsigned int disable_masked_comp;
+ /*!\brief disable one sided compound
+ *
+ */
+ unsigned int disable_one_sided_comp;
+ /*!\brief disable Palette
+ *
+ */
+ unsigned int disable_palette;
+ /*!\brief disable Intra Block Copy
+ *
+ */
+ unsigned int disable_intrabc;
+ /*!\brief disable chroma from luma
+ *
+ */
+ unsigned int disable_cfl;
+ /*!\brief disable intra smooth mode
+ *
+ */
+ unsigned int disable_smooth_intra;
+ /*!\brief disable filter intra
+ *
+ */
+ unsigned int disable_filter_intra;
+ /*!\brief disable dual filter
+ *
+ */
+ unsigned int disable_dual_filter;
+ /*!\brief disable intra angle delta
+ *
+ */
+ unsigned int disable_intra_angle_delta;
+ /*!\brief disable intra edge filter
+ *
+ */
+ unsigned int disable_intra_edge_filter;
+ /*!\brief disable 64x64 transform
+ *
+ */
+ unsigned int disable_tx_64x64;
+ /*!\brief disable smooth inter/intra
+ *
+ */
+ unsigned int disable_smooth_inter_intra;
+ /*!\brief disable inter/inter wedge comp
+ *
+ */
+ unsigned int disable_inter_inter_wedge;
+ /*!\brief disable inter/intra wedge comp
+ *
+ */
+ unsigned int disable_inter_intra_wedge;
+ /*!\brief disable paeth intra
+ *
+ */
+ unsigned int disable_paeth_intra;
+ /*!\brief disable trellis quantization
+ *
+ */
+ unsigned int disable_trellis_quant;
+ /*!\brief disable ref frame MV
+ *
+ */
+ unsigned int disable_ref_frame_mv;
+ /*!\brief use reduced reference frame set
+ *
+ */
+ unsigned int reduced_reference_set;
+ /*!\brief use reduced transform type set
+ *
+ */
+ unsigned int reduced_tx_type_set;
+} cfg_options_t;
+
+/*!\brief Encoded Frame Flags
+ *
+ * This type indicates a bitfield to be passed to aom_codec_encode(), defining
+ * per-frame boolean values. By convention, bits common to all codecs will be
+ * named AOM_EFLAG_*, and bits specific to an algorithm will be named
+ * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
+ */
+typedef long aom_enc_frame_flags_t;
+/*!\brief Force this frame to be a keyframe */
+#define AOM_EFLAG_FORCE_KF (1 << 0)
+
+/*!\brief Encoder configuration structure
+ *
+ * This structure contains the encoder settings that have common representations
+ * across all codecs. This doesn't imply that all codecs support all features,
+ * however.
+ */
+typedef struct aom_codec_enc_cfg {
+ /*
+ * generic settings (g)
+ */
+
+ /*!\brief Algorithm specific "usage" value
+ *
+ * Algorithms may define multiple values for usage, which may convey the
+ * intent of how the application intends to use the stream. If this value
+ * is non-zero, consult the documentation for the codec to determine its
+ * meaning.
+ */
+ unsigned int g_usage;
+
+ /*!\brief Maximum number of threads to use
+ *
+ * For multi-threaded implementations, use no more than this number of
+ * threads. The codec may use fewer threads than allowed. The value
+ * 0 is equivalent to the value 1.
+ */
+ unsigned int g_threads;
+
+ /*!\brief Bitstream profile to use
+ *
+ * Some codecs support a notion of multiple bitstream profiles. Typically
+ * this maps to a set of features that are turned on or off. Often the
+ * profile to use is determined by the features of the intended decoder.
+ * Consult the documentation for the codec to determine the valid values
+ * for this parameter, or set to zero for a sane default.
+ */
+ unsigned int g_profile; /**< profile of bitstream to use */
+
+ /*!\brief Width of the frame
+ *
+ * This value identifies the presentation resolution of the frame,
+ * in pixels. Note that the frames passed as input to the encoder must
+ * have this resolution. Frames will be presented by the decoder in this
+ * resolution, independent of any spatial resampling the encoder may do.
+ */
+ unsigned int g_w;
+
+ /*!\brief Height of the frame
+ *
+ * This value identifies the presentation resolution of the frame,
+ * in pixels. Note that the frames passed as input to the encoder must
+ * have this resolution. Frames will be presented by the decoder in this
+ * resolution, independent of any spatial resampling the encoder may do.
+ */
+ unsigned int g_h;
+
+ /*!\brief Max number of frames to encode
+ *
+ * If force video mode is off (the default) and g_limit is 1, the encoder
+ * will encode a still picture (still_picture is set to 1 in the sequence
+ * header OBU). If in addition full_still_picture_hdr is 0 (the default),
+ * the encoder will use a reduced header (reduced_still_picture_header is
+ * set to 1 in the sequence header OBU) for the still picture.
+ */
+ unsigned int g_limit;
+
+ /*!\brief Forced maximum width of the frame
+ *
+ * If this value is non-zero then it is used to force the maximum frame
+ * width written in write_sequence_header().
+ */
+ unsigned int g_forced_max_frame_width;
+
+ /*!\brief Forced maximum height of the frame
+ *
+ * If this value is non-zero then it is used to force the maximum frame
+ * height written in write_sequence_header().
+ */
+ unsigned int g_forced_max_frame_height;
+
+ /*!\brief Bit-depth of the codec
+ *
+ * This value identifies the bit depth of the codec.
+ * Only certain bit depths are supported, as identified in the
+ * aom_bit_depth_t enum.
+ */
+ aom_bit_depth_t g_bit_depth;
+
+ /*!\brief Bit-depth of the input frames
+ *
+ * This value identifies the bit_depth of the input frames in bits.
+ * Note that the frames passed as input to the encoder must have
+ * this bit-depth.
+ */
+ unsigned int g_input_bit_depth;
+
+ /*!\brief Stream timebase units
+ *
+ * Indicates the smallest interval of time, in seconds, used by the stream.
+ * For fixed frame rate material, or variable frame rate material where
+ * frames are timed at a multiple of a given clock (ex: video capture),
+ * the \ref RECOMMENDED method is to set the timebase to the reciprocal
+ * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
+ * pts to correspond to the frame number, which can be handy. For
+ * re-encoding video from containers with absolute time timestamps, the
+ * \ref RECOMMENDED method is to set the timebase to that of the parent
+ * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
+ */
+ struct aom_rational g_timebase;
+
+ /*!\brief Enable error resilient modes.
+ *
+ * The error resilient bitfield indicates to the encoder which features
+ * it should enable to take measures for streaming over lossy or noisy
+ * links.
+ */
+ aom_codec_er_flags_t g_error_resilient;
+
+ /*!\brief Multi-pass Encoding Mode
+ *
+ * This value should be set to the current phase for multi-pass encoding.
+ * For single pass, set to #AOM_RC_ONE_PASS.
+ */
+ enum aom_enc_pass g_pass;
+
+ /*!\brief Allow lagged encoding
+ *
+ * If set, this value allows the encoder to consume a number of input
+ * frames before producing output frames. This allows the encoder to
+ * base decisions for the current frame on future frames. This does
+ * increase the latency of the encoding pipeline, so it is not appropriate
+ * in all situations (ex: realtime encoding).
+ *
+ * Note that this is a maximum value -- the encoder may produce frames
+ * sooner than the given limit. Set this value to 0 to disable this
+ * feature.
+ */
+ unsigned int g_lag_in_frames;
+
+ /*
+ * rate control settings (rc)
+ */
+
+ /*!\brief Temporal resampling configuration, if supported by the codec.
+ *
+ * Temporal resampling allows the codec to "drop" frames as a strategy to
+ * meet its target data rate. This can cause temporal discontinuities in
+ * the encoded video, which may appear as stuttering during playback. This
+ * trade-off is often acceptable, but for many applications is not. It can
+ * be disabled in these cases.
+ *
+ * Note that not all codecs support this feature. All aom AVx codecs do.
+ * For other codecs, consult the documentation for that algorithm.
+ *
+ * This threshold is described as a percentage of the target data buffer.
+ * When the data buffer falls below this percentage of fullness, a
+ * dropped frame is indicated. Set the threshold to zero (0) to disable
+ * this feature.
+ */
+ unsigned int rc_dropframe_thresh;
+
+ /*!\brief Mode for spatial resampling, if supported by the codec.
+ *
+ * Spatial resampling allows the codec to compress a lower resolution
+ * version of the frame, which is then upscaled by the decoder to the
+ * correct presentation resolution. This increases visual quality at
+ * low data rates, at the expense of CPU time on the encoder/decoder.
+ */
+ unsigned int rc_resize_mode;
+
+ /*!\brief Frame resize denominator.
+ *
+ * The denominator for resize to use, assuming 8 as the numerator.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_resize_denominator;
+
+ /*!\brief Keyframe resize denominator.
+ *
+ * The denominator for resize to use, assuming 8 as the numerator.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_resize_kf_denominator;
+
+ /*!\brief Frame super-resolution scaling mode.
+ *
+ * Similar to spatial resampling, frame super-resolution integrates
+ * upscaling after the encode/decode process. Taking control of upscaling and
+ * using restoration filters should allow it to outperform normal resizing.
+ */
+ aom_superres_mode rc_superres_mode;
+
+ /*!\brief Frame super-resolution denominator.
+ *
+ * The denominator for superres to use. If fixed it will only change if the
+ * cumulative scale change over resizing and superres is greater than 1/2;
+ * this forces superres to reduce scaling.
+ *
+ * Valid denominators are 8 to 16.
+ *
+ * Used only by AOM_SUPERRES_FIXED.
+ */
+ unsigned int rc_superres_denominator;
+
+ /*!\brief Keyframe super-resolution denominator.
+ *
+ * The denominator for superres to use. If fixed it will only change if the
+ * cumulative scale change over resizing and superres is greater than 1/2;
+ * this forces superres to reduce scaling.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_superres_kf_denominator;
+
+ /*!\brief Frame super-resolution q threshold.
+ *
+ * The q level threshold after which superres is used.
+ * Valid values are 1 to 63.
+ *
+ * Used only by AOM_SUPERRES_QTHRESH
+ */
+ unsigned int rc_superres_qthresh;
+
+ /*!\brief Keyframe super-resolution q threshold.
+ *
+ * The q level threshold after which superres is used for key frames.
+ * Valid values are 1 to 63.
+ *
+ * Used only by AOM_SUPERRES_QTHRESH
+ */
+ unsigned int rc_superres_kf_qthresh;
+
+ /*!\brief Rate control algorithm to use.
+ *
+ * Indicates whether the end usage of this stream is to be streamed over
+ * a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
+ * mode should be used, or whether it will be played back on a high
+ * bandwidth link, as from a local disk, where higher variations in
+ * bitrate are acceptable.
+ */
+ enum aom_rc_mode rc_end_usage;
+
+ /*!\brief Two-pass stats buffer.
+ *
+ * A buffer containing all of the stats packets produced in the first
+ * pass, concatenated.
+ */
+ aom_fixed_buf_t rc_twopass_stats_in;
+
+ /*!\brief first pass mb stats buffer.
+ *
+ * A buffer containing all of the first pass mb stats packets produced
+ * in the first pass, concatenated.
+ */
+ aom_fixed_buf_t rc_firstpass_mb_stats_in;
+
+ /*!\brief Target data rate
+ *
+ * Target bitrate to use for this stream, in kilobits per second.
+ */
+ unsigned int rc_target_bitrate;
+
+ /*
+ * quantizer settings
+ */
+
+ /*!\brief Minimum (Best Quality) Quantizer
+ *
+ * The quantizer is the most direct control over the quality of the
+ * encoded image. The range of valid values for the quantizer is codec
+ * specific. Consult the documentation for the codec to determine the
+ * values to use. To determine the range programmatically, call
+ * aom_codec_enc_config_default() with a usage value of 0.
+ */
+ unsigned int rc_min_quantizer;
+
+ /*!\brief Maximum (Worst Quality) Quantizer
+ *
+ * The quantizer is the most direct control over the quality of the
+ * encoded image. The range of valid values for the quantizer is codec
+ * specific. Consult the documentation for the codec to determine the
+ * values to use. To determine the range programmatically, call
+ * aom_codec_enc_config_default() with a usage value of 0.
+ */
+ unsigned int rc_max_quantizer;
+
+ /*
+ * bitrate tolerance
+ */
+
+ /*!\brief Rate control adaptation undershoot control
+ *
+ * This value controls the tolerance of the VBR algorithm to undershoot
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
+ *
+ * Valid values in the range 0-100.
+ */
+ unsigned int rc_undershoot_pct;
+
+ /*!\brief Rate control adaptation overshoot control
+ *
+ * This value controls the tolerance of the VBR algorithm to overshoot
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
+ *
+ * Valid values in the range 0-100.
+ */
+ unsigned int rc_overshoot_pct;
+
+ /*
+ * decoder buffer model parameters
+ */
+
+ /*!\brief Decoder Buffer Size
+ *
+ * This value indicates the amount of data that may be buffered by the
+ * decoding application. Note that this value is expressed in units of
+ * time (milliseconds). For example, a value of 5000 indicates that the
+ * client will buffer (at least) 5000ms worth of encoded data. Use the
+ * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
+ * necessary.
+ */
+ unsigned int rc_buf_sz;
+
+ /*!\brief Decoder Buffer Initial Size
+ *
+ * This value indicates the amount of data that will be buffered by the
+ * decoding application prior to beginning playback. This value is
+ * expressed in units of time (milliseconds). Use the target bitrate
+ * (#rc_target_bitrate) to convert to bits/bytes, if necessary.
+ */
+ unsigned int rc_buf_initial_sz;
+
+ /*!\brief Decoder Buffer Optimal Size
+ *
+ * This value indicates the amount of data that the encoder should try
+ * to maintain in the decoder's buffer. This value is expressed in units
+ * of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
+ * to convert to bits/bytes, if necessary.
+ */
+ unsigned int rc_buf_optimal_sz;
+
+ /*
+ * 2 pass rate control parameters
+ */
+
+ /*!\brief Two-pass mode CBR/VBR bias
+ *
+ * Bias, expressed on a scale of 0 to 100, for determining target size
+ * for the current frame. The value 0 indicates the optimal CBR mode
+ * value should be used. The value 100 indicates the optimal VBR mode
+ * value should be used. Values in between indicate which way the
+ * encoder should "lean."
+ */
+ unsigned int rc_2pass_vbr_bias_pct;
+
+ /*!\brief Two-pass mode per-GOP minimum bitrate
+ *
+ * This value, expressed as a percentage of the target bitrate, indicates
+ * the minimum bitrate to be used for a single GOP (aka "section")
+ */
+ unsigned int rc_2pass_vbr_minsection_pct;
+
+ /*!\brief Two-pass mode per-GOP maximum bitrate
+ *
+ * This value, expressed as a percentage of the target bitrate, indicates
+ * the maximum bitrate to be used for a single GOP (aka "section")
+ */
+ unsigned int rc_2pass_vbr_maxsection_pct;
+
+ /*
+ * keyframing settings (kf)
+ */
+
+ /*!\brief Option to enable forward reference key frame
+ *
+ */
+ int fwd_kf_enabled;
+
+ /*!\brief Keyframe placement mode
+ *
+ * This value indicates whether the encoder should place keyframes at a
+ * fixed interval, or determine the optimal placement automatically
+ * (as governed by the #kf_min_dist and #kf_max_dist parameters)
+ */
+ enum aom_kf_mode kf_mode;
+
+ /*!\brief Keyframe minimum interval
+ *
+ * This value, expressed as a number of frames, prevents the encoder from
+ * placing a keyframe nearer than kf_min_dist to the previous keyframe. At
+ * least kf_min_dist non-keyframes will be coded before the next
+ * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval.
+ */
+ unsigned int kf_min_dist;
+
+ /*!\brief Keyframe maximum interval
+ *
+ * This value, expressed as a number of frames, forces the encoder to code
+ * a keyframe if one has not been coded in the last kf_max_dist frames.
+ * A value of 0 implies all frames will be keyframes. Set kf_min_dist
+ * equal to kf_max_dist for a fixed interval.
+ */
+ unsigned int kf_max_dist;
+
+ /*!\brief sframe interval
+ *
+ * This value, expressed as a number of frames, forces the encoder to code
+ * an S-Frame every sframe_dist frames.
+ */
+ unsigned int sframe_dist;
+
+ /*!\brief sframe insertion mode
+ *
+ * This value must be set to 1 or 2, and tells the encoder how to insert
+ * S-Frames. It will only have an effect if sframe_dist != 0.
+ *
+ * If altref is enabled:
+ * - if sframe_mode == 1, the considered frame will be made into an
+ * S-Frame only if it is an altref frame
+ * - if sframe_mode == 2, the next altref frame will be made into an
+ * S-Frame.
+ *
+ * Otherwise: the considered frame will be made into an S-Frame.
+ */
+ unsigned int sframe_mode;
+
+ /*!\brief Tile coding mode
+ *
+ * This value indicates the tile coding mode.
+ * A value of 0 implies a normal non-large-scale tile coding. A value of 1
+ * implies a large-scale tile coding.
+ */
+ unsigned int large_scale_tile;
+
+ /*!\brief Monochrome mode
+ *
+ * If this is nonzero, the encoder will generate a monochrome stream
+ * with no chroma planes.
+ */
+ unsigned int monochrome;
+
+ /*!\brief full_still_picture_hdr
+ *
+ * If this is nonzero, the encoder will generate a full header
+ * (reduced_still_picture_header is set to 0 in the sequence header OBU) even
+ * for still picture encoding. If this is zero (the default), a reduced
+ * header (reduced_still_picture_header is set to 1 in the sequence header
+ * OBU) is used for still picture encoding. This flag has no effect when a
+ * regular video with more than a single frame is encoded.
+ */
+ unsigned int full_still_picture_hdr;
+
+ /*!\brief Bitstream syntax mode
+ *
+ * This value indicates the bitstream syntax mode.
+ * A value of 0 indicates the bitstream is saved as a Section 5 bitstream. A
+ * value of 1 indicates the bitstream is saved in Annex-B format.
+ */
+ unsigned int save_as_annexb;
+
+ /*!\brief Number of explicit tile widths specified
+ *
+ * This value indicates the number of tile widths specified.
+ * A value of 0 implies no tile widths are specified.
+ * Tile widths are given in the array tile_widths[]
+ */
+ int tile_width_count;
+
+ /*!\brief Number of explicit tile heights specified
+ *
+ * This value indicates the number of tile heights specified.
+ * A value of 0 implies no tile heights are specified.
+ * Tile heights are given in the array tile_heights[]
+ */
+ int tile_height_count;
+
+/*!\brief Maximum number of tile widths in tile widths array
+ *
+ * This define gives the maximum number of elements in the tile_widths array.
+ */
+#define MAX_TILE_WIDTHS 64 // maximum tile width array length
+
+ /*!\brief Array of specified tile widths
+ *
+ * This array specifies tile widths (and may be empty)
+ * The number of widths specified is given by tile_width_count
+ */
+ int tile_widths[MAX_TILE_WIDTHS];
+
+/*!\brief Maximum number of tile heights in tile heights array.
+ *
+ * This define gives the maximum number of elements in the tile_heights array.
+ */
+#define MAX_TILE_HEIGHTS 64 // maximum tile height array length
+
+ /*!\brief Array of specified tile heights
+ *
+ * This array specifies tile heights (and may be empty)
+ * The number of heights specified is given by tile_height_count
+ */
+ int tile_heights[MAX_TILE_HEIGHTS];
+
+ /*!\brief Whether encoder should use fixed QP offsets.
+ *
+ * If a value of 1 is provided, encoder will use fixed QP offsets for frames
+ * at different levels of the pyramid.
+ * If a value of 0 is provided, encoder will NOT use fixed QP offsets.
+ * Note: This option is only relevant for --end-usage=q.
+ */
+ unsigned int use_fixed_qp_offsets;
+
+ /*!\brief Deprecated and ignored. DO NOT USE.
+ *
+ * TODO(aomedia:3269): Remove fixed_qp_offsets in libaom v4.0.0.
+ */
+ int fixed_qp_offsets[5];
+
+ /*!\brief Options defined per config file
+ *
+ */
+ cfg_options_t encoder_cfg;
+} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
+
+/*!\brief Initialize an encoder instance
+ *
+ * Initializes an encoder context using the given interface. Applications
+ * should call the aom_codec_enc_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with -DCONFIG_MULTITHREAD=0, this call
+ * is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * If aom_codec_enc_init_ver() fails, it is not necessary to call
+ * aom_codec_destroy() on the encoder context.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known.
+ * \param[in] flags Bitfield of AOM_CODEC_USE_* flags
+ * \param[in] ver ABI version number. Must be set to
+ * AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ * The encoder algorithm has been initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_enc_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_enc_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_enc_init(ctx, iface, cfg, flags) \
+ aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
+
+/*!\brief Get the default configuration for a usage.
+ *
+ * Initializes an encoder configuration structure with default values. Supports
+ * the notion of "usages" so that an algorithm may offer different default
+ * settings depending on the user's intended goal. This function \ref SHOULD
+ * be called by all applications to initialize the configuration structure
+ * before specializing the configuration with application specific values.
+ *
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[out] cfg Configuration buffer to populate.
+ * \param[in] usage Algorithm specific usage value. For AV1, must be
+ * set to AOM_USAGE_GOOD_QUALITY (0),
+ * AOM_USAGE_REALTIME (1), or AOM_USAGE_ALL_INTRA (2).
+ *
+ * \retval #AOM_CODEC_OK
+ * The configuration was populated.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, or the usage value was not recognized.
+ */
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+ aom_codec_enc_cfg_t *cfg,
+ unsigned int usage);
+
+/*!\brief Set or change configuration
+ *
+ * Reconfigures an encoder instance according to the given configuration.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cfg Configuration buffer to use
+ *
+ * \retval #AOM_CODEC_OK
+ * The configuration was applied.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL. NOTE(review): presumably also returned when the
+ * configuration itself is rejected -- confirm against the implementation.
+ */
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+ const aom_codec_enc_cfg_t *cfg);
+
+/*!\brief Get global stream headers
+ *
+ * Retrieves a stream level global header packet, if supported by the codec.
+ * Calls to this function should be deferred until all configuration information
+ * has been passed to libaom. Otherwise the global header data may be
+ * invalidated by additional configuration changes.
+ *
+ * The AV1 implementation of this function returns an OBU. The OBU returned is
+ * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is
+ * set, and the buffer contains the obu_size field for the returned OBU.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \retval NULL
+ * Encoder does not support global header, or an error occurred while
+ * generating the global header.
+ *
+ * \retval Non-NULL
+ * Pointer to buffer containing global header packet. The caller owns the
+ * memory associated with this buffer, and must free the 'buf' member of the
+ * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned
+ * must be freed via call to free().
+ */
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
+
+/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
+#define AOM_USAGE_GOOD_QUALITY 0u
+/*!\brief usage parameter analogous to AV1 REALTIME mode. */
+#define AOM_USAGE_REALTIME 1u
+/*!\brief usage parameter analogous to AV1 all intra mode. */
+#define AOM_USAGE_ALL_INTRA 2u
+
+/*!\brief Encode a frame
+ *
+ * Encodes a video frame at the given "presentation time." The presentation
+ * time stamp (PTS) \ref MUST be strictly increasing.
+ *
+ * When the last frame has been passed to the encoder, this function should
+ * continue to be called in a loop, with the img parameter set to NULL. This
+ * will signal the end-of-stream condition to the encoder and allow it to
+ * encode any held buffers. Encoding is complete when aom_codec_encode() is
+ * called with img set to NULL and aom_codec_get_cx_data() returns no data.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] img Image data to encode, NULL to flush.
+ * Encoding sample values outside the range
+ * [0..(1<<img->bit_depth)-1] is undefined behavior.
+ * Note: Although img is declared as a const pointer,
+ * if AV1E_SET_DENOISE_NOISE_LEVEL is set to a nonzero
+ * value aom_codec_encode() modifies (denoises) the
+ * samples in img->planes[i].
+ * \param[in] pts Presentation time stamp, in timebase units. If img
+ * is NULL, pts is ignored.
+ * \param[in] duration Duration to show frame, in timebase units. If img
+ * is not NULL, duration must be nonzero. If img is
+ * NULL, duration is ignored.
+ * \param[in] flags Flags to use for encoding this frame.
+ *
+ * \retval #AOM_CODEC_OK
+ * The frame (or flush request) was submitted successfully.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, the image format is unsupported, etc.
+ */
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned long duration,
+ aom_enc_frame_flags_t flags);
+
+/*!\brief Set compressed data output buffer
+ *
+ * Sets the buffer that the codec should output the compressed data
+ * into. This call effectively sets the buffer pointer returned in the
+ * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
+ * appended into this buffer. The buffer is preserved across frames,
+ * so applications must periodically call this function after flushing
+ * the accumulated compressed data to disk or to the network to reset
+ * the pointer to the buffer's head.
+ *
+ * `pad_before` bytes will be skipped before writing the compressed
+ * data, and `pad_after` bytes will be appended to the packet. The size
+ * of the packet will be the sum of the size of the actual compressed
+ * data, pad_before, and pad_after. The padding bytes will be preserved
+ * (not overwritten).
+ *
+ * Note that calling this function does not guarantee that the returned
+ * compressed data will be placed into the specified buffer. In the
+ * event that the encoded data will not fit into the buffer provided,
+ * the returned packet \ref MAY point to an internal buffer, as it would
+ * if this call were never used. In this event, the output packet will
+ * NOT have any padding, and the application must free space and copy it
+ * to the proper place. This is of particular note in configurations
+ * that may output multiple packets for a single encoded frame (e.g., lagged
+ * encoding) or if the application does not reset the buffer periodically.
+ *
+ * Applications may restore the default behavior of the codec providing
+ * the compressed data buffer by calling this function with a NULL
+ * buffer.
+ *
+ * Applications \ref MUSTNOT call this function during iteration of
+ * aom_codec_get_cx_data().
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] buf Buffer to store compressed data into
+ * \param[in] pad_before Bytes to skip before writing compressed data
+ * \param[in] pad_after Bytes to skip after writing compressed data
+ *
+ * \retval #AOM_CODEC_OK
+ * The buffer was set successfully.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A required parameter was NULL or otherwise invalid. A NULL buf is not an
+ * error; it restores the default behavior described above.
+ */
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+ const aom_fixed_buf_t *buf,
+ unsigned int pad_before,
+ unsigned int pad_after);
+
+/*!\brief Encoded data iterator
+ *
+ * Iterates over a list of data packets to be passed from the encoder to the
+ * application. The different kinds of packets available are enumerated in
+ * #aom_codec_cx_pkt_kind.
+ *
+ * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's
+ * muxer. Multiple compressed frames may be in the list.
+ * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer.
+ *
+ * The application \ref MUST silently ignore any packet kinds that it does
+ * not recognize or support.
+ *
+ * The data buffers returned from this function are only guaranteed to be
+ * valid until the application makes another call to any aom_codec_* function.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an output data packet (compressed frame data,
+ * two-pass statistics, etc.) or NULL to signal end-of-list.
+ *
+ */
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+ aom_codec_iter_t *iter);
+
+/*!\brief Get Preview Frame
+ *
+ * Returns an image that can be used as a preview. Shows the image as it would
+ * exist at the decompressor. The application \ref MUSTNOT write into this
+ * image buffer.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \return Returns a pointer to a preview image, or NULL if no image is
+ * available.
+ *
+ */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
+
+/*!@} - end defgroup encoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_ENCODER_H_
diff --git a/third_party/aom/aom/aom_external_partition.h b/third_party/aom/aom/aom_external_partition.h
new file mode 100644
index 0000000000..c381f6e5e9
--- /dev/null
+++ b/third_party/aom/aom/aom_external_partition.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include <stdint.h>
+
+/*!\file
+ * \brief Provides function pointer definitions for the external partition.
+ *
+ * \note The external partition API should be considered experimental. Until the
+ * external partition API is declared stable, breaking changes may be made to
+ * this API in a future libaom release.
+ */
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures.
+ */
+#define AOM_EXT_PART_ABI_VERSION 8
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Abstract external partition model handler
+ */
+typedef void *aom_ext_part_model_t;
+
+/*!\brief Number of features to determine whether to skip partition none and
+ * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT".
+ */
+#define AOM_EXT_PART_SIZE_DIRECT_SPLIT 17
+
+/*!\brief Number of features to use simple motion search to prune out
+ * rectangular partition in some direction. The same as
+ * "FEATURE_SIZE_SMS_PRUNE_PART".
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_PART 25
+
+/*!\brief Number of features to prune split and rectangular partition
+ * after PARTITION_NONE.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_NONE 4
+
+/*!\brief Number of features to terminate partition after partition none using
+ * simple_motion_search features and the rate, distortion, and rdcost of
+ * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE".
+ */
+#define AOM_EXT_PART_SIZE_TERM_NONE 28
+
+/*!\brief Number of features to terminate partition after partition split.
+ */
+#define AOM_EXT_PART_SIZE_TERM_SPLIT 31
+
+/*!\brief Number of features to prune rectangular partition using stats
+ * collected after partition split.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_RECT 9
+
+/*!\brief Number of features to prune AB partition using stats
+ * collected after rectangular partition.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_AB 10
+
+/*!\brief Number of features to prune 4-way partition using stats
+ * collected after AB partition.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_4_WAY 18
+
+/*!\brief Decision mode of the external partition model.
+ * AOM_EXT_PART_WHOLE_TREE: the external partition model should provide the
+ * whole partition tree for the superblock.
+ *
+ * AOM_EXT_PART_RECURSIVE: the external partition model provides the partition
+ * decision of the current block only. The decision process starts from
+ * the superblock size, down to the smallest block size (4x4) recursively.
+ */
+typedef enum aom_ext_part_decision_mode {
+ AOM_EXT_PART_WHOLE_TREE = 0,
+ AOM_EXT_PART_RECURSIVE = 1,
+} aom_ext_part_decision_mode_t;
+
+/*!\brief Config information sent to the external partition model.
+ *
+ * For example, the maximum superblock size determined by the sequence header.
+ */
+typedef struct aom_ext_part_config {
+ int superblock_size; ///< super block size (either 64x64 or 128x128)
+} aom_ext_part_config_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected before NONE partition.
+ * Features "f" are used to determine:
+ * partition_none_allowed, partition_horz_allowed, partition_vert_allowed,
+ * do_rectangular_split, do_square_split
+ * Features "f_part2" are used to determine:
+ * prune_horz, prune_vert.
+ */
+typedef struct aom_partition_features_before_none {
+ /*! features to determine whether skip partition none and do split directly */
+ float f[AOM_EXT_PART_SIZE_DIRECT_SPLIT];
+ /*! features to determine whether to prune rectangular partition */
+ float f_part2[AOM_EXT_PART_SIZE_PRUNE_PART];
+} aom_partition_features_before_none_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after NONE partition.
+ */
+typedef struct aom_partition_features_none {
+ /*! features to prune split and rectangular partition */
+ float f[AOM_EXT_PART_SIZE_PRUNE_NONE];
+ /*! features to determine termination of partition */
+ float f_terminate[AOM_EXT_PART_SIZE_TERM_NONE];
+} aom_partition_features_none_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after SPLIT partition.
+ */
+typedef struct aom_partition_features_split {
+ /*! features to determine termination of partition */
+ float f_terminate[AOM_EXT_PART_SIZE_TERM_SPLIT];
+ /*! features to determine pruning rect partition */
+ float f_prune_rect[AOM_EXT_PART_SIZE_PRUNE_RECT];
+} aom_partition_features_split_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after RECTANGULAR partition.
+ */
+typedef struct aom_partition_features_rect {
+ /*! features to determine pruning AB partition */
+ float f[AOM_EXT_PART_SIZE_PRUNE_AB];
+} aom_partition_features_rect_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A,
+ * VERT_B.
+ */
+typedef struct aom_partition_features_ab {
+ /*! features to determine pruning 4-way partition */
+ float f[AOM_EXT_PART_SIZE_PRUNE_4_WAY];
+} aom_partition_features_ab_t;
+
+/*!\brief Feature id to tell the external model the current stage in partition
+ * pruning and what features to use to make decisions accordingly.
+ */
+typedef enum {
+ AOM_EXT_PART_FEATURE_BEFORE_NONE,
+ AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2,
+ AOM_EXT_PART_FEATURE_AFTER_NONE,
+ AOM_EXT_PART_FEATURE_AFTER_NONE_PART2,
+ AOM_EXT_PART_FEATURE_AFTER_SPLIT,
+ AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2,
+ AOM_EXT_PART_FEATURE_AFTER_RECT,
+ AOM_EXT_PART_FEATURE_AFTER_AB
+} AOM_EXT_PART_FEATURE_ID;
+
+/*!\brief Features collected from the tpl process.
+ *
+ * The tpl process collects information that helps measure the inter-frame
+ * dependency.
+ * The tpl process is computed in the unit of tpl_bsize_1d (16x16).
+ * Therefore, the max number of units inside a superblock is
+ * 128x128 / (16x16) = 64. Change it if the tpl process changes.
+ */
+typedef struct aom_sb_tpl_features {
+ int available; ///< If tpl stats are available
+ int tpl_unit_length; ///< The block length of tpl process
+ int num_units; ///< The number of units inside the current superblock
+ int64_t intra_cost[64]; ///< The intra cost of each unit
+ int64_t inter_cost[64]; ///< The inter cost of each unit
+ int64_t mc_dep_cost[64]; ///< The motion compensated dependency cost
+} aom_sb_tpl_features_t;
+
+/*!\brief Features collected from the simple motion process.
+ *
+ * The simple motion process collects information by applying motion compensated
+ * prediction on each block.
+ * The block size is 16x16, which could be changed. If it is changed, update
+ * comments and the array size here.
+ */
+typedef struct aom_sb_simple_motion_features {
+ int unit_length; ///< The block length of the simple motion process
+ int num_units; ///< The number of units inside the current superblock
+ int block_sse[64]; ///< Sum of squared error of each unit
+ int block_var[64]; ///< Variance of each unit
+} aom_sb_simple_motion_features_t;
+
+/*!\brief Features of each super block.
+ *
+ * Features collected for each super block before partition search.
+ */
+typedef struct aom_sb_features {
+ /*! Features from motion search */
+ aom_sb_simple_motion_features_t motion_features;
+ /*! Features from tpl process */
+ aom_sb_tpl_features_t tpl_features;
+} aom_sb_features_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ *
+ * The encoder sends these features to the external model through the
+ * aom_ext_part_send_features_fn_t callback.
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_features {
+ // Features for the current supervised multi-stage ML model.
+ /*! Feature ID to indicate active features */
+ AOM_EXT_PART_FEATURE_ID id;
+ /*! Features collected before NONE partition */
+ aom_partition_features_before_none_t before_part_none;
+ /*! Features collected after NONE partition */
+ aom_partition_features_none_t after_part_none;
+ /*! Features collected after SPLIT partition */
+ aom_partition_features_split_t after_part_split;
+ /*! Features collected after RECTANGULAR partition */
+ aom_partition_features_rect_t after_part_rect;
+ /*! Features collected after AB partition */
+ aom_partition_features_ab_t after_part_ab;
+
+ // Features for a new ML model.
+ aom_sb_features_t sb_features; ///< Features collected for the super block
+ int mi_row; ///< Mi_row position of the block
+ int mi_col; ///< Mi_col position of the block
+ int frame_width; ///< Frame width
+ int frame_height; ///< Frame height
+ int block_size; ///< As "BLOCK_SIZE" in av1/common/enums.h
+ /*!
+ * Valid partition types. A bitmask is used. "1" represents the
+ * corresponding type is valid. The bitmask follows the enum order for
+ * PARTITION_TYPE in "enums.h" to represent one partition type at a bit.
+ * For example, 0x01 stands for only PARTITION_NONE is valid,
+ * 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid.
+ */
+ int valid_partition_types;
+ int update_type; ///< Frame update type, defined in ratectrl.h
+ int qindex; ///< Quantization index, range: [0, 255]
+ int rdmult; ///< Rate-distortion multiplier
+ int pyramid_level; ///< The level of this frame in the hierarchical structure
+ int has_above_block; ///< Has above neighbor block
+ int above_block_width; ///< Width of the above block, -1 if not exist
+ int above_block_height; ///< Height of the above block, -1 if not exist
+ int has_left_block; ///< Has left neighbor block
+ int left_block_width; ///< Width of the left block, -1 if not exist
+ int left_block_height; ///< Height of the left block, -1 if not exist
+ /*!
+ * The following parameters are collected from applying simple motion search.
+ * Sum of squared error (SSE) and variance of motion compensated residual
+ * are good indicators of block partitioning.
+ * If a block is a square, we also apply motion search for its 4 sub blocks.
+ * If not a square, their values are -1.
+ * If a block is able to split horizontally, we apply motion search and get
+ * stats for horizontal blocks. If not, their values are -1.
+ * If a block is able to split vertically, we apply motion search and get
+ * stats for vertical blocks. If not, their values are -1.
+ */
+ unsigned int block_sse; ///< SSE of motion compensated residual
+ unsigned int block_var; ///< Variance of motion compensated residual
+ unsigned int sub_block_sse[4]; ///< SSE of sub blocks.
+ unsigned int sub_block_var[4]; ///< Variance of sub blocks.
+ unsigned int horz_block_sse[2]; ///< SSE of horz sub blocks
+ unsigned int horz_block_var[2]; ///< Variance of horz sub blocks
+ unsigned int vert_block_sse[2]; ///< SSE of vert sub blocks
+ unsigned int vert_block_var[2]; ///< Variance of vert sub blocks
+ /*!
+ * The following parameters are calculated from tpl model.
+ * If tpl model is not available, their values are -1.
+ */
+ int64_t tpl_intra_cost; ///< Intra cost, ref to "TplDepStats" in tpl_model.h
+ int64_t tpl_inter_cost; ///< Inter cost in tpl model
+ int64_t tpl_mc_dep_cost; ///< Motion compensated dependency cost in tpl model
+} aom_partition_features_t;
+
+/*!\brief Partition decisions received from the external model.
+ *
+ * The encoder receives partition decisions and encodes the superblock
+ * with the given partition type.
+ * The encoder receives them through the aom_ext_part_get_decision_fn_t
+ * callback.
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_decision {
+ // Decisions for directly set partition types
+ int is_final_decision; ///< The flag whether it's the final decision
+ int num_nodes; ///< The number of leaf nodes
+ int partition_decision[2048]; ///< Partition decisions
+ int current_decision; ///< Partition decision for the current block
+
+ // Decisions for partition type pruning
+ int terminate_partition_search; ///< Terminate further partition search
+ int partition_none_allowed; ///< Allow partition none type
+ int partition_rect_allowed[2]; ///< Allow rectangular partitions
+ int do_rectangular_split; ///< Try rectangular split partition
+ int do_square_split; ///< Try square split partition
+ int prune_rect_part[2]; ///< Prune rectangular partition
+ int horza_partition_allowed; ///< Allow HORZ_A partition
+ int horzb_partition_allowed; ///< Allow HORZ_B partition
+ int verta_partition_allowed; ///< Allow VERT_A partition
+ int vertb_partition_allowed; ///< Allow VERT_B partition
+ int partition_horz4_allowed; ///< Allow HORZ4 partition
+ int partition_vert4_allowed; ///< Allow VERT4 partition
+} aom_partition_decision_t;
+
+/*!\brief Encoding stats for the given partition decision.
+ *
+ * The encoding stats collected by encoding the superblock with the
+ * given partition types.
+ * The encoder sends the stats to the external model for training
+ * or inference through the aom_ext_part_send_partition_stats_fn_t callback.
+ */
+typedef struct aom_partition_stats {
+ int rate; ///< Rate cost of the block
+ int64_t dist; ///< Distortion of the block
+ int64_t rdcost; ///< Rate-distortion cost of the block
+} aom_partition_stats_t;
+
+/*!\brief Enum for return status.
+ */
+typedef enum aom_ext_part_status {
+ AOM_EXT_PART_OK = 0, ///< Status of success
+ AOM_EXT_PART_ERROR = 1, ///< Status of failure
+ AOM_EXT_PART_TEST = 2, ///< Status used for tests
+} aom_ext_part_status_t;
+
+/*!\brief Callback of creating an external partition model.
+ *
+ * The callback is invoked by the encoder to create an external partition
+ * model.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] part_config Config information pointer for model creation
+ * \param[out] ext_part_model Pointer to the model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)(
+ void *priv, const aom_ext_part_config_t *part_config,
+ aom_ext_part_model_t *ext_part_model);
+
+/*!\brief Callback of sending features to the external partition model.
+ *
+ * The callback is invoked by the encoder to send features to the external
+ * partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] part_features Pointer to the features
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)(
+ aom_ext_part_model_t ext_part_model,
+ const aom_partition_features_t *part_features);
+
+/*!\brief Callback of receiving partition decisions from the external
+ * partition model.
+ *
+ * The callback is invoked by the encoder to receive partition decisions from
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[out] ext_part_decision Pointer to the structure that receives the
+ * partition decisions
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)(
+ aom_ext_part_model_t ext_part_model,
+ aom_partition_decision_t *ext_part_decision);
+
+/*!\brief Callback of sending stats to the external partition model.
+ *
+ * The callback is invoked by the encoder to send encoding stats to
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] ext_part_stats Pointer to the encoding stats
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)(
+ aom_ext_part_model_t ext_part_model,
+ const aom_partition_stats_t *ext_part_stats);
+
+/*!\brief Callback of deleting the external partition model.
+ *
+ * The callback is invoked by the encoder to delete the external partition
+ * model.
+ *
+ * \param[in] ext_part_model The external model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)(
+ aom_ext_part_model_t ext_part_model);
+
+/*!\brief Callback function set for external partition model.
+ *
+ * Users can enable the external partition model by registering a set of
+ * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL
+ */
+typedef struct aom_ext_part_funcs {
+ /*!
+ * Create an external partition model.
+ */
+ aom_ext_part_create_model_fn_t create_model;
+
+ /*!
+ * Send features to the external partition model to make partition decisions.
+ */
+ aom_ext_part_send_features_fn_t send_features;
+
+ /*!
+ * Get partition decisions from the external partition model.
+ */
+ aom_ext_part_get_decision_fn_t get_partition_decision;
+
+ /*!
+ * Send stats of the current partition to the external model.
+ */
+ aom_ext_part_send_partition_stats_fn_t send_partition_stats;
+
+ /*!
+ * Delete the external partition model.
+ */
+ aom_ext_part_delete_model_fn_t delete_model;
+
+ /*!
+ * The decision mode of the model.
+ */
+ aom_ext_part_decision_mode_t decision_mode;
+
+ /*!
+ * Private data for the external partition model.
+ */
+ void *priv;
+} aom_ext_part_funcs_t;
+
+/*!@} - end defgroup aom_encoder*/
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_EXTERNAL_PARTITION_H_
diff --git a/third_party/aom/aom/aom_frame_buffer.h b/third_party/aom/aom/aom_frame_buffer.h
new file mode 100644
index 0000000000..0e80373ddd
--- /dev/null
+++ b/third_party/aom/aom/aom_frame_buffer.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_AOM_FRAME_BUFFER_H_
+
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief The maximum number of work buffers used by libaom.
+ * Support maximum 4 threads to decode video in parallel.
+ * Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
+ */
+#define AOM_MAXIMUM_WORK_BUFFERS 8
+
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
+ */
+#define AOM_MAXIMUM_REF_BUFFERS 8
+
+/*!\brief External frame buffer
+ *
+ * This structure holds allocated frame buffers used by the decoder.
+ */
+typedef struct aom_codec_frame_buffer {
+ uint8_t *data; /**< Pointer to the data buffer */
+ size_t size; /**< Size of data in bytes */
+ void *priv; /**< Frame's private data */
+} aom_codec_frame_buffer_t;
+
+/*!\brief get frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to retrieve data for the frame
+ * buffer in order for the decode call to complete. The callback must
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to aom_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the aom_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. On any failure the callback must return a value less than 0.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] min_size Size in bytes needed by the buffer
+ * \param[in,out] fb Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On any failure the callback must
+ * return a value less than 0.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] fb Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
+ aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_FRAME_BUFFER_H_
diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h
new file mode 100644
index 0000000000..d5f0c087e6
--- /dev/null
+++ b/third_party/aom/aom/aom_image.h
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the aom image descriptor and associated operations
+ *
+ */
+#ifndef AOM_AOM_AOM_IMAGE_H_
+#define AOM_AOM_AOM_IMAGE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/
+
+#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
+#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
+/** 0x400 used to signal alpha channel, skipping for backwards compatibility. */
+#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
+
+/*!\brief List of supported image formats */
+typedef enum aom_img_fmt {
+ AOM_IMG_FMT_NONE,
+ AOM_IMG_FMT_YV12 =
+ AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+ AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
+ AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
+ 3, /**< planar 4:2:0 format with aom color space */
+ AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
+ AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
+ AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+/*!\brief Allows detection of the presence of AOM_IMG_FMT_NV12 at compile time.
+ */
+#define AOM_HAVE_IMG_FMT_NV12 1
+ AOM_IMG_FMT_NV12 =
+ AOM_IMG_FMT_PLANAR | 7, /**< 4:2:0 with U and V interleaved */
+ AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+ AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH,
+ AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
+ AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
+} aom_img_fmt_t; /**< alias for enum aom_img_fmt */
+
+/*!\brief List of supported color primaries */
+typedef enum aom_color_primaries {
+ AOM_CICP_CP_RESERVED_0 = 0, /**< For future use */
+ AOM_CICP_CP_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_CP_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_CP_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_CP_BT_470_M = 4, /**< BT.470 System M (historical) */
+ AOM_CICP_CP_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_CP_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_CP_SMPTE_240 = 7, /**< SMPTE 240 */
+ AOM_CICP_CP_GENERIC_FILM =
+ 8, /**< Generic film (color filters using illuminant C) */
+ AOM_CICP_CP_BT_2020 = 9, /**< BT.2020, BT.2100 */
+ AOM_CICP_CP_XYZ = 10, /**< SMPTE 428 (CIE 1931 XYZ) */
+ AOM_CICP_CP_SMPTE_431 = 11, /**< SMPTE RP 431-2 */
+ AOM_CICP_CP_SMPTE_432 = 12, /**< SMPTE EG 432-1 */
+ AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */
+ AOM_CICP_CP_EBU_3213 = 22, /**< EBU Tech. 3213-E */
+ AOM_CICP_CP_RESERVED_23 = 23 /**< For future use (values 23 - 255) */
+} aom_color_primaries_t; /**< alias for enum aom_color_primaries */
+
+/*!\brief List of supported transfer functions */
+typedef enum aom_transfer_characteristics {
+ AOM_CICP_TC_RESERVED_0 = 0, /**< For future use */
+ AOM_CICP_TC_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_TC_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_TC_BT_470_M = 4, /**< BT.470 System M (historical) */
+ AOM_CICP_TC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_TC_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_TC_SMPTE_240 = 7, /**< SMPTE 240 M */
+ AOM_CICP_TC_LINEAR = 8, /**< Linear */
+ AOM_CICP_TC_LOG_100 = 9, /**< Logarithmic (100 : 1 range) */
+ AOM_CICP_TC_LOG_100_SQRT10 =
+ 10, /**< Logarithmic (100 * Sqrt(10) : 1 range) */
+ AOM_CICP_TC_IEC_61966 = 11, /**< IEC 61966-2-4 */
+ AOM_CICP_TC_BT_1361 = 12, /**< BT.1361 */
+ AOM_CICP_TC_SRGB = 13, /**< sRGB or sYCC */
+ AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */
+ AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */
+ AOM_CICP_TC_SMPTE_2084 = 16, /**< SMPTE ST 2084, ITU BT.2100 PQ */
+ AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */
+ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */
+ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */
+} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_characteristics */
+
+/*!\brief List of supported matrix coefficients */
+typedef enum aom_matrix_coefficients {
+ AOM_CICP_MC_IDENTITY = 0, /**< Identity matrix */
+ AOM_CICP_MC_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_MC_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_MC_FCC = 4, /**< US FCC 73.628 */
+ AOM_CICP_MC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_MC_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_MC_SMPTE_240 = 7, /**< SMPTE 240 M */
+ AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */
+ AOM_CICP_MC_BT_2020_NCL =
+ 9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */
+ AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */
+ AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */
+ AOM_CICP_MC_CHROMAT_NCL =
+ 12, /**< Chromaticity-derived non-constant luminance */
+ AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */
+ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */
+ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */
+} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */
+
+/*!\brief List of supported color range */
+typedef enum aom_color_range {
+ AOM_CR_STUDIO_RANGE = 0, /**<- Y [16..235], UV [16..240] (bit depth 8) */
+ /**<- Y [64..940], UV [64..960] (bit depth 10) */
+ /**<- Y [256..3760], UV [256..3840] (bit depth 12) */
+ AOM_CR_FULL_RANGE = 1 /**<- YUV/RGB [0..255] (bit depth 8) */
+ /**<- YUV/RGB [0..1023] (bit depth 10) */
+ /**<- YUV/RGB [0..4095] (bit depth 12) */
+} aom_color_range_t; /**< alias for enum aom_color_range */
+
+/*!\brief List of chroma sample positions */
+typedef enum aom_chroma_sample_position {
+ AOM_CSP_UNKNOWN = 0, /**< Unknown */
+ AOM_CSP_VERTICAL = 1, /**< Horizontally co-located with luma(0, 0)*/
+ /**< sample, between two vertical samples */
+ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */
+ AOM_CSP_RESERVED = 3 /**< Reserved value */
+} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position */
+
+/*!\brief List of insert flags for Metadata
+ *
+ * These flags control how the library treats metadata during encode.
+ *
+ * While encoding, when metadata is added to an aom_image via
+ * aom_img_add_metadata(), the flag passed along with the metadata will
+ * determine where the metadata OBU will be placed in the encoded OBU stream.
+ * Metadata will be emitted into the output stream within the next temporal unit
+ * if it satisfies the specified insertion flag.
+ *
+ * During decoding, when the library encounters a metadata OBU, it is always
+ * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image.
+ */
+typedef enum aom_metadata_insert_flags {
+ AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not a keyframe */
+ AOM_MIF_KEY_FRAME = 1, /**< Adds metadata only if it's a keyframe */
+ AOM_MIF_ANY_FRAME = 2 /**< Adds metadata to any type of frame */
+} aom_metadata_insert_flags_t;
+
+/*!\brief Array of aom_metadata structs for an image. */
+typedef struct aom_metadata_array aom_metadata_array_t;
+
+/*!\brief Metadata payload. */
+typedef struct aom_metadata {
+ uint32_t type; /**< Metadata type */
+ uint8_t *payload; /**< Metadata payload data */
+ size_t sz; /**< Metadata payload size */
+ aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */
+} aom_metadata_t;
+
+/**\brief Image Descriptor */
+typedef struct aom_image {
+ aom_img_fmt_t fmt; /**< Image Format */
+ aom_color_primaries_t cp; /**< CICP Color Primaries */
+ aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */
+ aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */
+ int monochrome; /**< Whether image is monochrome */
+ aom_chroma_sample_position_t csp; /**< chroma sample position */
+ aom_color_range_t range; /**< Color Range */
+
+ /* Image storage dimensions */
+ unsigned int w; /**< Stored image width */
+ unsigned int h; /**< Stored image height */
+ unsigned int bit_depth; /**< Stored image bit-depth */
+
+ /* Image display dimensions */
+ unsigned int d_w; /**< Displayed image width */
+ unsigned int d_h; /**< Displayed image height */
+
+ /* Image intended rendering dimensions */
+ unsigned int r_w; /**< Intended rendering image width */
+ unsigned int r_h; /**< Intended rendering image height */
+
+ /* Chroma subsampling info */
+ unsigned int x_chroma_shift; /**< subsampling order, X */
+ unsigned int y_chroma_shift; /**< subsampling order, Y */
+
+/* Image data pointers. */
+#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
+#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */
+#define AOM_PLANE_U 1 /**< U (Chroma) plane */
+#define AOM_PLANE_V 2 /**< V (Chroma) plane */
+ /* planes[AOM_PLANE_V] = NULL and stride[AOM_PLANE_V] = 0 when fmt ==
+ * AOM_IMG_FMT_NV12 */
+ unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */
+ int stride[3]; /**< stride between rows for each plane */
+ size_t sz; /**< data size */
+
+ int bps; /**< bits per sample (for packed formats) */
+
+ int temporal_id; /**< Temporal layer Id of image */
+ int spatial_id; /**< Spatial layer Id of image */
+
+ /*!\brief The following member may be set by the application to associate
+ * data with this image.
+ */
+ void *user_priv;
+
+ /* The following members should be treated as private. */
+ unsigned char *img_data; /**< private */
+ int img_data_owner; /**< private */
+ int self_allocd; /**< private */
+
+ aom_metadata_array_t
+ *metadata; /**< Metadata payloads associated with the image. */
+
+ void *fb_priv; /**< Frame buffer data associated with the image. */
+} aom_image_t; /**< alias for struct aom_image */
+
+/*!\brief Open a descriptor, allocating storage for the underlying image
+ *
+ * Returns a descriptor for storing an image of the given format. The
+ * storage for the image is allocated on the heap.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image (stride).
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align);
+
+/*!\brief Open a descriptor, using existing storage for the underlying image
+ *
+ * Returns a descriptor for storing an image of the given format. The
+ * storage for the image has been allocated elsewhere, and a descriptor is
+ * desired to "wrap" that storage.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of each row in the image
+ * (stride).
+ * \param[in] img_data Storage to use for the image
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
+ unsigned int d_h, unsigned int align,
+ unsigned char *img_data);
+
+/*!\brief Open a descriptor, allocating storage for the underlying image with a
+ * border
+ *
+ * Returns a descriptor for storing an image of the given format and its
+ * borders. The storage for the image is allocated on the heap.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image (stride).
+ * \param[in] size_align Alignment, in pixels, of the image width and height.
+ * \param[in] border A border that is padded on four sides of the image.
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ unsigned int size_align,
+ unsigned int border);
+
+/*!\brief Set the rectangle identifying the displayed portion of the image
+ *
+ * Updates the displayed rectangle (aka viewport) on the image surface to
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
+ *
+ * \param[in] img Image descriptor
+ * \param[in] x leftmost column
+ * \param[in] y topmost row
+ * \param[in] w width
+ * \param[in] h height
+ * \param[in] border A border that is padded on four sides of the image.
+ *
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
+ */
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
+ unsigned int w, unsigned int h, unsigned int border);
+
+/*!\brief Flip the image vertically (top for bottom)
+ *
+ * Adjusts the image descriptor's pointers and strides to make the image
+ * be referenced upside-down.
+ *
+ * \param[in] img Image descriptor
+ */
+void aom_img_flip(aom_image_t *img);
+
+/*!\brief Close an image descriptor
+ *
+ * Frees all allocated storage associated with an image descriptor.
+ *
+ * \param[in] img Image descriptor
+ */
+void aom_img_free(aom_image_t *img);
+
+/*!\brief Get the width of a plane
+ *
+ * Get the width of a plane of an image
+ *
+ * \param[in] img Image descriptor
+ * \param[in] plane Plane index
+ */
+int aom_img_plane_width(const aom_image_t *img, int plane);
+
+/*!\brief Get the height of a plane
+ *
+ * Get the height of a plane of an image
+ *
+ * \param[in] img Image descriptor
+ * \param[in] plane Plane index
+ */
+int aom_img_plane_height(const aom_image_t *img, int plane);
+
+/*!\brief Add metadata to image.
+ *
+ * Adds metadata to aom_image_t.
+ * Function makes a copy of the provided data parameter.
+ * Metadata insertion point is controlled by insert_flag.
+ *
+ * \param[in] img Image descriptor
+ * \param[in] type Metadata type
+ * \param[in] data Metadata contents
+ * \param[in] sz Metadata contents size
+ * \param[in] insert_flag Metadata insert flag
+ *
+ * \return Returns 0 on success. If img or data is NULL, sz is 0, or memory
+ * allocation fails, it returns -1.
+ */
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+ size_t sz, aom_metadata_insert_flags_t insert_flag);
+
+/*!\brief Return a metadata payload stored within the image metadata array.
+ *
+ * Gets the metadata (aom_metadata_t) at the indicated index in the image
+ * metadata array.
+ *
+ * \param[in] img Pointer to image descriptor to get metadata from
+ * \param[in] index Metadata index to get from metadata array
+ *
+ * \return Returns a const pointer to the selected metadata. If img and/or index
+ * is invalid, it returns NULL.
+ */
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+ size_t index);
+
+/*!\brief Return the number of metadata blocks within the image.
+ *
+ * Gets the number of metadata blocks contained within the provided image
+ * metadata array.
+ *
+ * \param[in] img Pointer to image descriptor to get metadata number
+ * from.
+ *
+ * \return Returns the size of the metadata array. If img or metadata is NULL,
+ * it returns 0.
+ */
+size_t aom_img_num_metadata(const aom_image_t *img);
+
+/*!\brief Remove metadata from image.
+ *
+ * Removes all metadata in image metadata list and sets metadata list pointer
+ * to NULL.
+ *
+ * \param[in] img Image descriptor
+ */
+void aom_img_remove_metadata(aom_image_t *img);
+
+/*!\brief Allocate memory for aom_metadata struct.
+ *
+ * Allocates storage for the metadata payload, sets its type and copies the
+ * payload data into the aom_metadata struct. A metadata payload buffer of size
+ * sz is allocated and sz bytes are copied from data into the payload buffer.
+ *
+ * \param[in] type Metadata type
+ * \param[in] data Metadata data pointer
+ * \param[in] sz Metadata size
+ * \param[in] insert_flag Metadata insert flag
+ *
+ * \return Returns the newly allocated aom_metadata struct. If data is NULL,
+ * sz is 0, or memory allocation fails, it returns NULL.
+ */
+aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data,
+ size_t sz,
+ aom_metadata_insert_flags_t insert_flag);
+
+/*!\brief Free metadata struct.
+ *
+ * Free metadata struct and its buffer.
+ *
+ * \param[in] metadata Metadata struct pointer
+ */
+void aom_img_metadata_free(aom_metadata_t *metadata);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_IMAGE_H_
diff --git a/third_party/aom/aom/aom_integer.h b/third_party/aom/aom/aom_integer.h
new file mode 100644
index 0000000000..ce65e98452
--- /dev/null
+++ b/third_party/aom/aom/aom_integer.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_INTEGER_H_
+#define AOM_AOM_AOM_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h> // IWYU pragma: export
+
+#if defined(_MSC_VER)
+#define AOM_FORCE_INLINE __forceinline
+#define AOM_INLINE __inline
+#else
+#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
+#define AOM_INLINE inline
+#endif
+
+/* Assume platforms have the C99 standard integer types. */
+
+#if defined(__cplusplus)
+#if !defined(__STDC_FORMAT_MACROS)
+#define __STDC_FORMAT_MACROS
+#endif
+#if !defined(__STDC_LIMIT_MACROS)
+#define __STDC_LIMIT_MACROS
+#endif
+#endif // __cplusplus
+
+#include <stdint.h> // IWYU pragma: export
+#include <inttypes.h> // IWYU pragma: export
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // __cplusplus
+
+// Returns size of uint64_t when encoded using LEB128.
+size_t aom_uleb_size_in_bytes(uint64_t value);
+
+// Returns 0 on success, -1 on decode failure.
+// On success, 'value' stores the decoded LEB128 value and 'length' stores
+// the number of bytes decoded.
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+ size_t *length);
+
+// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure.
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+ size_t *coded_size);
+
+// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1
+// upon failure.
+// Note: This will write exactly pad_to_size bytes; if the value cannot be
+// encoded in this many bytes, then this will fail.
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+ size_t pad_to_size, uint8_t *coded_value,
+ size_t *coded_size);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif // __cplusplus
+
+#endif // AOM_AOM_AOM_INTEGER_H_
diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h
new file mode 100644
index 0000000000..edd8cd5e7c
--- /dev/null
+++ b/third_party/aom/aom/aomcx.h
@@ -0,0 +1,2205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOMCX_H_
+#define AOM_AOM_AOMCX_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include "aom/aom.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_external_partition.h"
+
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
+ * aom Codec Interface.
+ *
+ * Several interfaces are excluded with CONFIG_REALTIME_ONLY build:
+ * Global motion
+ * Warped motion
+ * OBMC
+ * TPL model
+ * Loop restoration
+ *
+ * The following features are also disabled with CONFIG_REALTIME_ONLY:
+ * AV1E_SET_QUANT_B_ADAPT
+ * CNN
+ * 4X rectangular blocks
+ * 4X rectangular transform in intra prediction
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to encode raw AV1 streams.
+ *@{
+ */
+
+/*!\brief A single instance of the AV1 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_cx().
+ */
+extern aom_codec_iface_t aom_codec_av1_cx_algo;
+
+/*!\brief The interface to the AV1 encoder.
+ */
+extern aom_codec_iface_t *aom_codec_av1_cx(void);
+/*!@} - end algorithm interface member group */
+
+/*
+ * Algorithm Flags
+ */
+
+/*!\brief Don't reference the last frame
+ *
+ * When this flag is set, the encoder will not use the last frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST (1 << 16)
+/*!\brief Don't reference the last2 frame
+ *
+ * When this flag is set, the encoder will not use the last2 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last2 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST2 (1 << 17)
+/*!\brief Don't reference the last3 frame
+ *
+ * When this flag is set, the encoder will not use the last3 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last3 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST3 (1 << 18)
+/*!\brief Don't reference the golden frame
+ *
+ * When this flag is set, the encoder will not use the golden frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * golden frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_GF (1 << 19)
+
+/*!\brief Don't reference the alternate reference frame
+ *
+ * When this flag is set, the encoder will not use the alt ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF (1 << 20)
+/*!\brief Don't reference the bwd reference frame
+ *
+ * When this flag is set, the encoder will not use the bwd ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * bwd ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_BWD (1 << 21)
+/*!\brief Don't reference the alt2 reference frame
+ *
+ * When this flag is set, the encoder will not use the alt2 ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt2 ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF2 (1 << 22)
+
+/*!\brief Don't update the last frame
+ *
+ * When this flag is set, the encoder will not update the last frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_LAST (1 << 23)
+
+/*!\brief Don't update the golden frame
+ *
+ * When this flag is set, the encoder will not update the golden frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_GF (1 << 24)
+
+/*!\brief Don't update the alternate reference frame
+ *
+ * When this flag is set, the encoder will not update the alt ref frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_ARF (1 << 25)
+/*!\brief Disable entropy update
+ *
+ * When this flag is set, the encoder will not update its internal entropy
+ * model based on the entropy of this frame.
+ */
+#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26)
+/*!\brief Disable ref frame mvs
+ *
+ * When this flag is set, the encoder will not allow frames to
+ * be encoded using mfmv.
+ */
+#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27)
+/*!\brief Enable error resilient frame
+ *
+ * When this flag is set, the encoder will code frames as error
+ * resilient.
+ */
+#define AOM_EFLAG_ERROR_RESILIENT (1 << 28)
+/*!\brief Enable s frame mode
+ *
+ * When this flag is set, the encoder will code frames as an
+ * s frame.
+ */
+#define AOM_EFLAG_SET_S_FRAME (1 << 29)
+/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE
+ *
+ * When this flag is set, the encoder will set a frame's primary_ref_frame
+ * to PRIMARY_REF_NONE
+ */
+#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30)
+
+/*!\brief AVx encoder control functions
+ *
+ * This set of macros define the control functions available for AVx
+ * encoder interface.
+ * The range of encode control ID is 7-229(max).
+ *
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
+ */
+enum aome_enc_control_id {
+ /*!\brief Codec control function to set which reference frame encoder can use,
+ * int parameter.
+ */
+ AOME_USE_REFERENCE = 7,
+
+ /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t*
+ * parameter.
+ */
+ AOME_SET_ROI_MAP = 8,
+
+ /*!\brief Codec control function to pass an Active map to encoder,
+ * aom_active_map_t* parameter.
+ */
+ AOME_SET_ACTIVEMAP = 9,
+
+ /* NOTE: enum 10 unused */
+
+ /*!\brief Codec control function to set encoder scaling mode for the next
+ * frame to be coded, aom_scaling_mode_t* parameter.
+ */
+ AOME_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder spatial layer id, int
+ * parameter.
+ */
+ AOME_SET_SPATIAL_LAYER_ID = 12,
+
+ /*!\brief Codec control function to set encoder internal speed settings,
+ * int parameter
+ *
+ * Changes in this value influence the complexity of algorithms used in
+ * encoding process, values greater than 0 will increase encoder speed at
+ * the expense of quality.
+ *
+ * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest;
+ * quality improves as speed decreases (since more compression
+ * possibilities are explored).
+ *
+ * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In
+ * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed
+ * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also,
+ * AOM_USAGE_REALTIME treats 0..4 the same as 5.
+ */
+ AOME_SET_CPUUSED = 13,
+
+ /*!\brief Codec control function to enable automatic set and use arf frames,
+ * unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AOME_SET_ENABLEAUTOALTREF = 14,
+
+ /* NOTE: enum 15 unused */
+
+ /*!\brief Codec control function to set the sharpness parameter,
+ * unsigned int parameter.
+ *
+ * This parameter controls the level at which rate-distortion optimization of
+ * transform coefficients favours sharpness in the block.
+ *
+ * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip
+ * block optimization and will change rdmult in favour of block sharpness.
+ */
+ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16
+
+ /*!\brief Codec control function to set the threshold for MBs treated static,
+ * unsigned int parameter
+ */
+ AOME_SET_STATIC_THRESHOLD = 17,
+
+ /* NOTE: enum 18 unused */
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ */
+ AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2, // 19
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ */
+ AOME_GET_LAST_QUANTIZER_64 = 20,
+
+ /*!\brief Codec control function to set the max no of frames to create arf,
+ * unsigned int parameter
+ */
+ AOME_SET_ARNR_MAXFRAMES = 21,
+
+ /*!\brief Codec control function to set the filter strength for the arf,
+ * unsigned int parameter
+ */
+ AOME_SET_ARNR_STRENGTH = 22,
+
+ /* NOTE: enum 23 unused */
+
+ /*!\brief Codec control function to set visual tuning, aom_tune_metric (int)
+ * parameter
+ *
+ * The default is AOM_TUNE_PSNR.
+ */
+ AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2, // 24
+
+ /*!\brief Codec control function to set constrained / constant quality level,
+ * unsigned int parameter
+ *
+ * Valid range: 0..63
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage
+ * must be set to #AOM_CQ or #AOM_Q.
+ */
+ AOME_SET_CQ_LEVEL = 25,
+
+ /*!\brief Codec control function to set max data rate for intra frames,
+ * unsigned int parameter
+ *
+ * This value controls additional clamping on the maximum size of a
+ * keyframe. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allocate no more than 4.5 frames worth of bitrate
+ * to a keyframe, set this to 450.
+ */
+ AOME_SET_MAX_INTRA_BITRATE_PCT = 26,
+
+ /*!\brief Codec control function to set number of spatial layers, int
+ * parameter
+ */
+ AOME_SET_NUMBER_SPATIAL_LAYERS = 27,
+
+ /*!\brief Codec control function to set max data rate for inter frames,
+ * unsigned int parameter
+ *
+ * This value controls additional clamping on the maximum size of an
+ * inter frame. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allow no more than 4.5 frames worth of bitrate
+ * to an inter frame, set this to 450.
+ */
+ AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2, // 28
+
+ /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int
+ * parameter
+ *
+ * This value controls the amount of boost given to Golden Frame in
+ * CBR mode. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * the feature is off, i.e., no golden frame boost in CBR mode and
+ * average bitrate target is used.
+ *
+ * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+ * than average frame, set this to 100.
+ */
+ AV1E_SET_GF_CBR_BOOST_PCT = 29,
+
+ /* NOTE: enum 30 unused */
+
+ /*!\brief Codec control function to set lossless encoding mode, unsigned int
+ * parameter
+ *
+ * AV1 can operate in lossless encoding mode, in which the bitstream
+ * produced will be able to decode and reconstruct a perfect copy of
+ * input source.
+ *
+ * - 0 = normal coding mode, may be lossy (default)
+ * - 1 = lossless coding mode
+ */
+ AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2, // 31
+
+ /*!\brief Codec control function to enable the row based multi-threading
+ * of the encoder, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ROW_MT = 32,
+
+ /*!\brief Codec control function to set number of tile columns. unsigned int
+ * parameter
+ *
+ * In encoding and decoding, AV1 allows an input image frame to be partitioned
+ * into separate vertical tile columns, which can be encoded or decoded
+ * independently. This enables easy implementation of parallel encoding and
+ * decoding. The parameter for this control describes the number of tile
+ * columns (in log2 units), which has a valid range of [0, 6]:
+ * \verbatim
+ 0 = 1 tile column
+ 1 = 2 tile columns
+ 2 = 4 tile columns
+ .....
+ n = 2**n tile columns
+ \endverbatim
+ * By default, the value is 0, i.e. one single column tile for entire image.
+ */
+ AV1E_SET_TILE_COLUMNS = 33,
+
+ /*!\brief Codec control function to set number of tile rows, unsigned int
+ * parameter
+ *
+ * In encoding and decoding, AV1 allows an input image frame to be partitioned
+ * into separate horizontal tile rows, which can be encoded or decoded
+ * independently. The parameter for this control describes the number of tile
+ * rows (in log2 units), which has a valid range of [0, 6]:
+ * \verbatim
+ 0 = 1 tile row
+ 1 = 2 tile rows
+ 2 = 4 tile rows
+ .....
+ n = 2**n tile rows
+ \endverbatim
+ * By default, the value is 0, i.e. one single row tile for entire image.
+ */
+ AV1E_SET_TILE_ROWS = 34,
+
+ /*!\brief Codec control function to enable RDO modulated by frame temporal
+ * dependency, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ENABLE_TPL_MODEL = 35,
+
+ /*!\brief Codec control function to enable temporal filtering on key frame,
+ * unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable without overlay (default)
+ * - 2 = enable with overlay
+ */
+ AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature,
+ * unsigned int parameter
+ *
+ * AV1 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frames in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by encoder.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PARALLEL_DECODING = 37,
+
+ /*!\brief Codec control function to enable error_resilient_mode, int parameter
+ *
+ * AV1 has a bitstream feature to guarantee parsability of a frame
+ * by turning on the error_resilient_decoding mode, even though the
+ * reference buffers are unreliable or not received.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_ERROR_RESILIENT_MODE = 38,
+
+ /*!\brief Codec control function to enable s_frame_mode, int parameter
+ *
+ * AV1 has a bitstream feature to designate certain frames as S-frames,
+ * from where we can switch to a different stream,
+ * even though the reference buffers may not be exactly identical.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_S_FRAME_MODE = 39,
+
+ /*!\brief Codec control function to set adaptive quantization mode, unsigned
+ * int parameter
+ *
+ * AV1 has a segment based feature that allows encoder to adaptively change
+ * quantization parameter for each segment within a frame to improve the
+ * subjective quality. This control makes encoder operate in one of the
+ * several AQ modes supported.
+ *
+ * - 0 = disable (default)
+ * - 1 = variance
+ * - 2 = complexity
+ * - 3 = cyclic refresh
+ */
+ AV1E_SET_AQ_MODE = 40,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost, unsigned
+ * int parameter
+ *
+ * One AV1 encoder speed feature is to enable quality boost by lowering
+ * frame level Q periodically. This control function provides a means to
+ * turn on/off this feature.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PERIODIC_BOOST = 41,
+
+ /*!\brief Codec control function to set noise sensitivity, unsigned int
+ * parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable (Y only)
+ */
+ AV1E_SET_NOISE_SENSITIVITY = 42,
+
+ /*!\brief Codec control function to set content type, aom_tune_content
+ * parameter
+ *
+ * - AOM_CONTENT_DEFAULT = Regular video content (default)
+ * - AOM_CONTENT_SCREEN = Screen capture content
+ * - AOM_CONTENT_FILM = Film content
+ */
+ AV1E_SET_TUNE_CONTENT = 43,
+
+ /*!\brief Codec control function to set CDF update mode, unsigned int
+ * parameter
+ *
+ * - 0: no update
+ * - 1: update on every frame (default)
+ * - 2: selectively update
+ */
+ AV1E_SET_CDF_UPDATE_MODE = 44,
+
+ /*!\brief Codec control function to set color primaries info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240
+ * - 8 = Generic film (color filters using illuminant C)
+ * - 9 = BT.2020, BT.2100
+ * - 10 = SMPTE 428 (CIE 1931 XYZ)
+ * - 11 = SMPTE RP 431-2
+ * - 12 = SMPTE EG 432-1
+ * - 13..21 = For future use
+ * - 22 = EBU Tech. 3213-E
+ * - 23 = For future use
+ */
+ AV1E_SET_COLOR_PRIMARIES = 45,
+
+ /*!\brief Codec control function to set transfer function info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = Linear
+ * - 9 = Logarithmic (100 : 1 range)
+ * - 10 = Logarithmic (100 * Sqrt(10) : 1 range)
+ * - 11 = IEC 61966-2-4
+ * - 12 = BT.1361
+ * - 13 = sRGB or sYCC
+ * - 14 = BT.2020 10-bit systems
+ * - 15 = BT.2020 12-bit systems
+ * - 16 = SMPTE ST 2084, ITU BT.2100 PQ
+ * - 17 = SMPTE ST 428
+ * - 18 = BT.2100 HLG, ARIB STD-B67
+ * - 19 = For future use
+ */
+ AV1E_SET_TRANSFER_CHARACTERISTICS = 46,
+
+ /*!\brief Codec control function to set matrix coefficients info, int parameter
+ *
+ * - 0 = Identity matrix
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = US FCC 73.628
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = YCgCo
+ * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+ * - 10 = BT.2020 constant luminance
+ * - 11 = SMPTE ST 2085 YDzDx
+ * - 12 = Chromaticity-derived non-constant luminance
+ * - 13 = Chromaticity-derived constant luminance
+ * - 14 = BT.2100 ICtCp
+ * - 15 = For future use
+ */
+ AV1E_SET_MATRIX_COEFFICIENTS = 47,
+
+ /*!\brief Codec control function to set chroma 4:2:0 sample position info,
+ * aom_chroma_sample_position_t parameter
+ *
+ * AOM_CSP_UNKNOWN is default
+ */
+ AV1E_SET_CHROMA_SAMPLE_POSITION = 48,
+
+ /*!\brief Codec control function to set minimum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 4.
+ */
+ AV1E_SET_MIN_GF_INTERVAL = 49,
+
+ /*!\brief Codec control function to set maximum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 16.
+ */
+ AV1E_SET_MAX_GF_INTERVAL = 50,
+
+ /*!\brief Codec control function to get an active map back from the encoder,
+ aom_active_map_t* parameter
+ */
+ AV1E_GET_ACTIVEMAP = 51,
+
+ /*!\brief Codec control function to set color range bit, int parameter
+ *
+ * - 0 = Limited range, 16..235 or HBD equivalent (default)
+ * - 1 = Full range, 0..255 or HBD equivalent
+ */
+ AV1E_SET_COLOR_RANGE = 52,
+
+ /*!\brief Codec control function to set intended rendering image size,
+ * int32_t[2] parameter
+ *
+ * By default, this is identical to the image size in pixels.
+ */
+ AV1E_SET_RENDER_SIZE = 53,
+
+ /*!\brief Control to set target sequence level index for a certain operating
+ * point (OP), int parameter
+ * Possible values are in the form of "ABxy".
+ * - AB: OP index.
+ * - xy: Target level index for the OP. Possible values are:
+ * + 0~27: corresponding to level 2.0 ~ 8.3. Note:
+ * > Levels 2.2 (2), 2.3 (3), 3.2 (6), 3.3 (7), 4.2 (10) & 4.3 (11) are
+ * undefined.
+ * > Levels 7.x and 8.x (20~27) are in draft status, available under the
+ * config flag CONFIG_CWG_C013.
+ * + 31: maximum parameters level, no level-based constraints.
+ * + 32: keep level stats only for level monitoring.
+ *
+ * E.g.:
+ * - "0" means target level index 0 (2.0) for the 0th OP;
+ * - "109" means target level index 9 (4.1) for the 1st OP;
+ * - "1019" means target level index 19 (6.3) for the 10th OP.
+ *
+ * If the target level is not specified for an OP, the maximum parameters
+ * level of 31 is used as default.
+ */
+ AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54,
+
+ /*!\brief Codec control function to get sequence level index for each
+ * operating point. int* parameter. There can be at most 32 operating points.
+ * The results will be written into a provided integer array of sufficient
+ * size.
+ */
+ AV1E_GET_SEQ_LEVEL_IDX = 55,
+
+ /*!\brief Codec control function to set intended superblock size, unsigned int
+ * parameter
+ *
+ * By default, the superblock size is determined separately for each
+ * frame by the encoder.
+ */
+ AV1E_SET_SUPERBLOCK_SIZE = 56,
+
+ /*!\brief Codec control function to enable automatic set and use of
+ * bwd-pred frames, unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AOME_SET_ENABLEAUTOBWDREF = 57,
+
+ /*!\brief Codec control function to encode with CDEF, unsigned int parameter
+ *
+ * CDEF is the constrained directional enhancement filter which is an
+ * in-loop filter aiming to remove coding artifacts
+ *
+ * - 0 = disable
+ * - 1 = enable for all frames (default)
+ * - 2 = disable for non-reference frames
+ */
+ AV1E_SET_ENABLE_CDEF = 58,
+
+ /*!\brief Codec control function to encode with Loop Restoration Filter,
+ * unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ENABLE_RESTORATION = 59,
+
+ /*!\brief Codec control function to force video mode, unsigned int parameter
+ *
+ * - 0 = do not force video mode (default)
+ * - 1 = force video mode even for a single frame
+ */
+ AV1E_SET_FORCE_VIDEO_MODE = 60,
+
+ /*!\brief Codec control function to predict with OBMC mode, unsigned int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ENABLE_OBMC = 61,
+
+ /*!\brief Codec control function to encode without trellis quantization,
+ * unsigned int parameter
+ *
+ * - 0 = apply trellis quantization (default)
+ * - 1 = do not apply trellis quantization
+ * - 2 = disable trellis quantization in rd search
+ * - 3 = disable trellis quantization in estimate yrd
+ */
+ AV1E_SET_DISABLE_TRELLIS_QUANT = 62,
+
+ /*!\brief Codec control function to encode with quantisation matrices,
+ * unsigned int parameter
+ *
+ * AOM can operate with default quantisation matrices dependent on
+ * quantisation level and block type.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_ENABLE_QM = 63,
+
+ /*!\brief Codec control function to set the min quant matrix flatness,
+ * unsigned int parameter
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the minimum level of flatness from which the matrices
+ * are determined.
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ */
+ AV1E_SET_QM_MIN = 64,
+
+ /*!\brief Codec control function to set the max quant matrix flatness,
+ * unsigned int parameter
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the maximum level of flatness possible.
+ *
+ * By default, the encoder sets this maximum at the top of the
+ * available range.
+ */
+ AV1E_SET_QM_MAX = 65,
+
+ /*!\brief Codec control function to set the luma (Y) quant matrix flatness,
+ * unsigned int parameter
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for luma (Y).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ */
+ AV1E_SET_QM_Y = 66,
+
+ /*!\brief Codec control function to set the chroma (U) quant matrix flatness,
+ * unsigned int parameter
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for chroma (U).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ */
+ AV1E_SET_QM_U = 67,
+
+ /*!\brief Codec control function to set the chroma (V) quant matrix flatness,
+ * unsigned int parameter
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for chroma (V).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ */
+ AV1E_SET_QM_V = 68,
+
+ /* NOTE: enum 69 unused */
+
+ /*!\brief Codec control function to set a maximum number of tile groups,
+ * unsigned int parameter
+ *
+ * This will set the maximum number of tile groups. This will be
+ * overridden if an MTU size is set. The default value is 1.
+ */
+ AV1E_SET_NUM_TG = 70,
+
+ /*!\brief Codec control function to set an MTU size for a tile group, unsigned
+ * int parameter
+ *
+ * This will set the maximum number of bytes in a tile group. This can be
+ * exceeded only if a single tile is larger than this amount.
+ *
+ * By default, the value is 0, in which case a fixed number of tile groups
+ * is used.
+ */
+ AV1E_SET_MTU = 71,
+
+ /* NOTE: enum 72 unused */
+
+ /*!\brief Codec control function to enable/disable rectangular partitions, int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_RECT_PARTITIONS = 73,
+
+ /*!\brief Codec control function to enable/disable AB partitions, int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_AB_PARTITIONS = 74,
+
+ /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_1TO4_PARTITIONS = 75,
+
+ /*!\brief Codec control function to set min partition size, int parameter
+ *
+ * min_partition_size is applied to both width and height of the partition.
+ * i.e., both width and height of a partition can not be smaller than
+ * the min_partition_size, except the partition at the picture boundary.
+ *
+ * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for
+ * 4x4.
+ */
+ AV1E_SET_MIN_PARTITION_SIZE = 76,
+
+ /*!\brief Codec control function to set max partition size, int parameter
+ *
+ * max_partition_size is applied to both width and height of the partition.
+ * i.e, both width and height of a partition can not be larger than
+ * the max_partition_size.
+ *
+ * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 128 for
+ * 128x128.
+ */
+ AV1E_SET_MAX_PARTITION_SIZE = 77,
+
+ /*!\brief Codec control function to turn on / off intra edge filter
+ * at sequence level, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78,
+
+ /*!\brief Codec control function to turn on / off frame order hint (int
+ * parameter). Affects: joint compound mode, motion field motion vector,
+ * ref frame sign bias
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_ORDER_HINT = 79,
+
+ /*!\brief Codec control function to turn on / off 64-length transforms, int
+ * parameter
+ *
+ * This will enable or disable usage of length 64 transforms in any
+ * direction.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_TX64 = 80,
+
+ /*!\brief Codec control function to turn on / off flip and identity
+ * transforms, int parameter
+ *
+ * This will enable or disable usage of flip and identity transform
+ * types in any direction. If enabled, this includes:
+ * - FLIPADST_DCT
+ * - DCT_FLIPADST
+ * - FLIPADST_FLIPADST
+ * - ADST_FLIPADST
+ * - FLIPADST_ADST
+ * - IDTX
+ * - V_DCT
+ * - H_DCT
+ * - V_ADST
+ * - H_ADST
+ * - V_FLIPADST
+ * - H_FLIPADST
+ *
+ * Valid values:
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_FLIP_IDTX = 81,
+
+ /*!\brief Codec control function to turn on / off rectangular transforms, int
+ * parameter
+ *
+ * This will enable or disable usage of rectangular transforms. NOTE:
+ * Rectangular transforms only enabled when corresponding rectangular
+ * partitions are.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_RECT_TX = 82,
+
+ /*!\brief Codec control function to turn on / off dist-wtd compound mode
+ * at sequence level, int parameter
+ *
+ * This will enable or disable distance-weighted compound mode.
+ * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+ * to 0.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_DIST_WTD_COMP = 83,
+
+ /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
+ * at sequence level, int parameter
+ *
+ * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+ * to 0.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_REF_FRAME_MVS = 84,
+
+ /*!\brief Codec control function to set temporal mv prediction
+ * enabling/disabling at frame level, int parameter
+ *
+ * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is
+ * forced to 0.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ALLOW_REF_FRAME_MVS = 85,
+
+ /*!\brief Codec control function to turn on / off dual interpolation filter
+ * for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable
+ */
+ AV1E_SET_ENABLE_DUAL_FILTER = 86,
+
+ /*!\brief Codec control function to turn on / off delta quantization in chroma
+ * planes for a sequence, int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_ENABLE_CHROMA_DELTAQ = 87,
+
+ /*!\brief Codec control function to turn on / off masked compound usage
+ * (wedge and diff-wtd compound modes) for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_MASKED_COMP = 88,
+
+ /*!\brief Codec control function to turn on / off one sided compound usage
+ * for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_ONESIDED_COMP = 89,
+
+ /*!\brief Codec control function to turn on / off interintra compound
+ * for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_INTERINTRA_COMP = 90,
+
+ /*!\brief Codec control function to turn on / off smooth inter-intra
+ * mode for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91,
+
+ /*!\brief Codec control function to turn on / off difference weighted
+ * compound, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_DIFF_WTD_COMP = 92,
+
+ /*!\brief Codec control function to turn on / off interinter wedge
+ * compound, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_INTERINTER_WEDGE = 93,
+
+ /*!\brief Codec control function to turn on / off interintra wedge
+ * compound, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94,
+
+ /*!\brief Codec control function to turn on / off global motion usage
+ * for a sequence, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ENABLE_GLOBAL_MOTION = 95,
+
+ /*!\brief Codec control function to turn on / off warped motion usage
+ * at sequence level, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ENABLE_WARPED_MOTION = 96,
+
+ /*!\brief Codec control function to turn on / off warped motion usage
+ * at frame level, int parameter
+ *
+ * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is
+ * forced to 0.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ *
+ * \note Excluded from CONFIG_REALTIME_ONLY build.
+ */
+ AV1E_SET_ALLOW_WARPED_MOTION = 97,
+
+ /*!\brief Codec control function to turn on / off filter intra usage at
+ * sequence level, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_FILTER_INTRA = 98,
+
+ /*!\brief Codec control function to turn on / off smooth intra modes usage,
+ * int parameter
+ *
+ * This will enable or disable usage of smooth, smooth_h and smooth_v intra
+ * modes.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_SMOOTH_INTRA = 99,
+
+ /*!\brief Codec control function to turn on / off Paeth intra mode usage, int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_PAETH_INTRA = 100,
+
+ /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int
+ * parameter
+ *
+ * This will enable or disable usage of chroma-from-luma intra mode.
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_CFL_INTRA = 101,
+
+ /*!\brief Codec control function to turn on / off frame superresolution, int
+ * parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_SUPERRES = 102,
+
+ /*!\brief Codec control function to turn on / off overlay frames for
+ * filtered ALTREF frames, int parameter
+ *
+ * This will enable or disable coding of overlay frames for filtered ALTREF
+ * frames. When set to 0, overlay frames are not used but show existing frame
+ * is used to display the filtered ALTREF frame as is. As a result the decoded
+ * frame rate remains the same as the display frame rate. The default is 1.
+ */
+ AV1E_SET_ENABLE_OVERLAY = 103,
+
+ /*!\brief Codec control function to turn on/off palette mode, int parameter */
+ AV1E_SET_ENABLE_PALETTE = 104,
+
+ /*!\brief Codec control function to turn on/off intra block copy mode, int
+ parameter */
+ AV1E_SET_ENABLE_INTRABC = 105,
+
+ /*!\brief Codec control function to turn on/off intra angle delta, int
+ parameter */
+ AV1E_SET_ENABLE_ANGLE_DELTA = 106,
+
+ /*!\brief Codec control function to set the delta q mode, unsigned int
+ * parameter
+ *
+ * AV1 supports a delta q mode feature, that allows modulating q per
+ * superblock.
+ *
+ * - 0 = deltaq signaling off
+ * - 1 = use modulation to maximize objective quality (default)
+ * - 2 = use modulation for local test
+ * - 3 = use modulation for key frame perceptual quality optimization
+ * - 4 = use modulation for user rating based perceptual quality optimization
+ */
+ AV1E_SET_DELTAQ_MODE = 107,
+
+ /*!\brief Codec control function to turn on/off loopfilter modulation
+ * when delta q modulation is enabled, unsigned int parameter.
+ *
+ * \attention AV1 only supports loopfilter modulation when delta q
+ * modulation is enabled as well.
+ */
+ AV1E_SET_DELTALF_MODE = 108,
+
+ /*!\brief Codec control function to set the single tile decoding mode,
+ * unsigned int parameter
+ *
+ * \attention Only applicable if large scale tiling is on.
+ *
+ * - 0 = single tile decoding is off
+ * - 1 = single tile decoding is on (default)
+ */
+ AV1E_SET_SINGLE_TILE_DECODING = 109,
+
+ /*!\brief Codec control function to enable the extreme motion vector unit
+ * test, unsigned int parameter
+ *
+ * - 0 = off
+ * - 1 = MAX_EXTREME_MV
+ * - 2 = MIN_EXTREME_MV
+ *
+ * \note This is only used in motion vector unit test.
+ */
+ AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110,
+
+ /*!\brief Codec control function to signal picture timing info in the
+ * bitstream, aom_timing_info_type_t parameter. Default is
+ * AOM_TIMING_UNSPECIFIED.
+ */
+ AV1E_SET_TIMING_INFO_TYPE = 111,
+
+ /*!\brief Codec control function to add film grain parameters (one of several
+ * preset types) info in the bitstream, int parameter
+ *
+ Valid range: 0..16, 0 is unknown, 1..16 are test vectors
+ */
+ AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112,
+
+ /*!\brief Codec control function to set the path to the film grain parameters,
+ * const char* parameter
+ */
+ AV1E_SET_FILM_GRAIN_TABLE = 113,
+
+ /*!\brief Sets the noise level, int parameter */
+ AV1E_SET_DENOISE_NOISE_LEVEL = 114,
+
+ /*!\brief Sets the denoiser's block size, unsigned int parameter */
+ AV1E_SET_DENOISE_BLOCK_SIZE = 115,
+
+ /*!\brief Sets the chroma subsampling x value, unsigned int parameter */
+ AV1E_SET_CHROMA_SUBSAMPLING_X = 116,
+
+ /*!\brief Sets the chroma subsampling y value, unsigned int parameter */
+ AV1E_SET_CHROMA_SUBSAMPLING_Y = 117,
+
+ /*!\brief Control to use a reduced tx type set, int parameter */
+ AV1E_SET_REDUCED_TX_TYPE_SET = 118,
+
+ /*!\brief Control to use dct only for intra modes, int parameter */
+ AV1E_SET_INTRA_DCT_ONLY = 119,
+
+ /*!\brief Control to use dct only for inter modes, int parameter */
+ AV1E_SET_INTER_DCT_ONLY = 120,
+
+ /*!\brief Control to use default tx type only for intra modes, int parameter
+ */
+ AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121,
+
+ /*!\brief Control to use adaptive quantize_b, int parameter */
+ AV1E_SET_QUANT_B_ADAPT = 122,
+
+ /*!\brief Control to select maximum height for the GF group pyramid structure,
+ * unsigned int parameter
+ *
+ * Valid range: 0..5
+ */
+ AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123,
+
+ /*!\brief Control to select maximum reference frames allowed per frame, int
+ * parameter
+ *
+ * Valid range: 3..7
+ */
+ AV1E_SET_MAX_REFERENCE_FRAMES = 124,
+
+ /*!\brief Control to use reduced set of single and compound references, int
+ parameter */
+ AV1E_SET_REDUCED_REFERENCE_SET = 125,
+
+ /*!\brief Control to set frequency of the cost updates for coefficients,
+ * unsigned int parameter
+ *
+ * - 0 = update at SB level (default)
+ * - 1 = update at SB row level in tile
+ * - 2 = update at tile level
+ * - 3 = turn off
+ */
+ AV1E_SET_COEFF_COST_UPD_FREQ = 126,
+
+ /*!\brief Control to set frequency of the cost updates for mode, unsigned int
+ * parameter
+ *
+ * - 0 = update at SB level (default)
+ * - 1 = update at SB row level in tile
+ * - 2 = update at tile level
+ * - 3 = turn off
+ */
+ AV1E_SET_MODE_COST_UPD_FREQ = 127,
+
+ /*!\brief Control to set frequency of the cost updates for motion vectors,
+ * unsigned int parameter
+ *
+ * - 0 = update at SB level (default)
+ * - 1 = update at SB row level in tile
+ * - 2 = update at tile level
+ * - 3 = turn off
+ */
+ AV1E_SET_MV_COST_UPD_FREQ = 128,
+
+ /*!\brief Control to set bit mask that specifies which tier each of the 32
+ * possible operating points conforms to, unsigned int parameter
+ *
+ * - 0 = main tier (default)
+ * - 1 = high tier
+ */
+ AV1E_SET_TIER_MASK = 129,
+
+ /*!\brief Control to set minimum compression ratio, unsigned int parameter
+ * Take integer values. If non-zero, encoder will try to keep the compression
+ * ratio of each frame to be higher than the given value divided by 100.
+ * E.g. 850 means minimum compression ratio of 8.5.
+ */
+ AV1E_SET_MIN_CR = 130,
+
+ /* NOTE: enums 145-149 are no longer unused; they are assigned further below */
+
+ /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t*
+ * parameter
+ */
+ AV1E_SET_SVC_LAYER_ID = 131,
+
+ /*!\brief Codec control function to set SVC parameters, aom_svc_params_t*
+ * parameter
+ */
+ AV1E_SET_SVC_PARAMS = 132,
+
+ /*!\brief Codec control function to set reference frame config:
+ * the ref_idx and the refresh flags for each buffer slot.
+ * aom_svc_ref_frame_config_t* parameter
+ */
+ AV1E_SET_SVC_REF_FRAME_CONFIG = 133,
+
+ /*!\brief Codec control function to set the path to the VMAF model used when
+ * tuning the encoder for VMAF, const char* parameter
+ */
+ AV1E_SET_VMAF_MODEL_PATH = 134,
+
+ /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder,
+ * unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \note This is only used in lightfield example test.
+ */
+ AV1E_ENABLE_EXT_TILE_DEBUG = 135,
+
+ /*!\brief Codec control function to enable the superblock multipass unit test
+ * in AV1 to ensure that the encoder does not leak state between different
+ * passes. unsigned int parameter.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \note This is only used in sb_multipass unit test.
+ */
+ AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136,
+
+ /*!\brief Control to select minimum height for the GF group pyramid structure,
+ * unsigned int parameter
+ *
+ * Valid values: 0..5
+ */
+ AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137,
+
+ /*!\brief Control to set average complexity of the corpus in the case of
+ * single pass vbr based on LAP, unsigned int parameter
+ */
+ AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138,
+
+ /*!\brief Control to get baseline gf interval
+ */
+ AV1E_GET_BASELINE_GF_INTERVAL = 139,
+
+ /*!\brief Control to set encoding the denoised frame from denoise-noise-level
+ *
+ * - 0 = disabled/encode the original frame
+ * - 1 = enabled/encode the denoised frame (default)
+ */
+ AV1E_SET_ENABLE_DNL_DENOISING = 140,
+
+ /*!\brief Codec control function to turn on / off D45 to D203 intra mode
+ * usage, int parameter
+ *
+ * This will enable or disable usage of D45 to D203 intra modes, which are a
+ * subset of directional modes. This control has no effect if directional
+ * modes are disabled (AV1E_SET_ENABLE_DIRECTIONAL_INTRA set to 0).
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_DIAGONAL_INTRA = 141,
+
+ /*!\brief Control to set frequency of the cost updates for intrabc motion
+ * vectors, unsigned int parameter
+ *
+ * - 0 = update at SB level (default)
+ * - 1 = update at SB row level in tile
+ * - 2 = update at tile level
+ * - 3 = turn off
+ */
+ AV1E_SET_DV_COST_UPD_FREQ = 142,
+
+ /*!\brief Codec control to set the path for partition stats read and write.
+ * const char * parameter.
+ */
+ AV1E_SET_PARTITION_INFO_PATH = 143,
+
+ /*!\brief Codec control to use an external partition model
+ * A set of callback functions is passed through this control
+ * to let the encoder encode with given partitions.
+ */
+ AV1E_SET_EXTERNAL_PARTITION = 144,
+
+ /*!\brief Codec control function to turn on / off directional intra mode
+ * usage, int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_DIRECTIONAL_INTRA = 145,
+
+ /*!\brief Control to turn on / off transform size search.
+ * Note: it can not work with non RD pick mode in real-time encoding,
+ * where the max transform size is only 16x16.
+ * It will be ignored if non RD pick mode is set.
+ *
+ * - 0 = disable, transforms always have the largest possible size
+ * - 1 = enable, search for the best transform size for each block (default)
+ */
+ AV1E_SET_ENABLE_TX_SIZE_SEARCH = 146,
+
+ /*!\brief Codec control function to set reference frame compound prediction.
+ * aom_svc_ref_frame_comp_pred_t* parameter
+ */
+ AV1E_SET_SVC_REF_FRAME_COMP_PRED = 147,
+
+ /*!\brief Set --deltaq-mode strength.
+ *
+ * Valid range: [0, 1000]
+ */
+ AV1E_SET_DELTAQ_STRENGTH = 148,
+
+ /*!\brief Codec control to control loop filter
+ *
+ * - 0 = Loop filter is disabled for all frames
+ * - 1 = Loop filter is enabled for all frames
+ * - 2 = Loop filter is disabled for non-reference frames
+ * - 3 = Loop filter is disabled for the frames with low motion
+ */
+ AV1E_SET_LOOPFILTER_CONTROL = 149,
+
+ /*!\brief Codec control function to get the loopfilter chosen by the encoder,
+ * int* parameter
+ */
+ AOME_GET_LOOPFILTER_LEVEL = 150,
+
+ /*!\brief Codec control to automatically turn off several intra coding tools,
+ * unsigned int parameter
+ * - 0 = do not use the feature
+ * - 1 = enable the automatic decision to turn off several intra tools
+ */
+ AV1E_SET_AUTO_INTRA_TOOLS_OFF = 151,
+
+ /*!\brief Codec control function to set flag for rate control used by external
+ * encoders.
+ * - 1 = Enable rate control for external encoders. This will disable content
+ * dependency in rate control and cyclic refresh.
+ * - 0 = Default. Disable rate control for external encoders.
+ */
+ AV1E_SET_RTC_EXTERNAL_RC = 152,
+
+ /*!\brief Codec control function to enable frame parallel multi-threading
+ * of the encoder, unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FP_MT = 153,
+
+ /*!\brief Codec control to enable actual frame parallel encode or
+ * simulation of frame parallel encode in FPMT unit test, unsigned int
+ * parameter
+ *
+ * - 0 = simulate frame parallel encode
+ * - 1 = actual frame parallel encode (default)
+ *
+ * \note This is only used in FPMT unit test.
+ */
+ AV1E_SET_FP_MT_UNIT_TEST = 154,
+
+ /*!\brief Codec control function to get the target sequence level index for
+ * each operating point. int* parameter. There can be at most 32 operating
+ * points. The results will be written into a provided integer array of
+ * sufficient size. If a target level is not set, the result will be 31.
+ * Please refer to https://aomediacodec.github.io/av1-spec/#levels for more
+ * details on level definitions and indices.
+ */
+ AV1E_GET_TARGET_SEQ_LEVEL_IDX = 155,
+
+ /*!\brief Codec control function to get the number of operating points. int*
+ * parameter.
+ */
+ AV1E_GET_NUM_OPERATING_POINTS = 156,
+
+ /*!\brief Codec control function to skip the application of post-processing
+ * filters on reconstructed frame, unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::g_usage
+ * must be set to AOM_USAGE_ALL_INTRA.
+ */
+ AV1E_SET_SKIP_POSTPROC_FILTERING = 157,
+
+ /*!\brief Codec control function to enable the superblock level
+ * qp sweep in AV1 to ensure that end-to-end test runs well,
+ * unsigned int parameter.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \note This is only used in sb_qp_sweep unit test.
+ */
+ AV1E_ENABLE_SB_QP_SWEEP = 158,
+
+ /*!\brief Codec control to set quantizer for the next frame, int parameter.
+ *
+ * - Valid range [0, 63]
+ *
+ * This will turn off cyclic refresh. Only applicable to 1-pass.
+ */
+ AV1E_SET_QUANTIZER_ONE_PASS = 159,
+
+ /*!\brief Codec control to enable the rate distribution guided delta
+ * quantization in all intra mode, unsigned int parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ *
+ * \attention This feature requires --deltaq-mode=3, also an input file
+ * which contains rate distribution for each 16x16 block,
+ * passed in by --rate-distribution-info=rate_distribution.txt.
+ */
+ AV1E_ENABLE_RATE_GUIDE_DELTAQ = 160,
+
+ /*!\brief Codec control to set the input file for rate distribution used
+ * in all intra mode, const char * parameter
+ * The input should be the name of a text file, which
+ * contains (rows x cols) float values separated by space.
+ * Each float value represents the number of bits for each 16x16 block.
+ * rows = (frame_height + 15) / 16
+ * cols = (frame_width + 15) / 16
+ *
+ * \attention This feature requires --enable-rate-guide-deltaq=1.
+ */
+ AV1E_SET_RATE_DISTRIBUTION_INFO = 161,
+
+ /*!\brief Codec control to get the CDEF strength for Y / luma plane,
+ * int * parameter.
+ * Returns an integer array of CDEF_MAX_STRENGTHS elements.
+ */
+ AV1E_GET_LUMA_CDEF_STRENGTH = 162,
+
+ /*!\brief Codec control to set the target bitrate in kilobits per second,
+ * unsigned int parameter. For 1 pass CBR mode, single layer encoding.
+ * This control replaces the call aom_codec_enc_config_set(&codec, &cfg)
+ * when only target bitrate is changed, and so is much cheaper as it
+ * bypasses a lot of unneeded code checks.
+ */
+ AV1E_SET_BITRATE_ONE_PASS_CBR = 163,
+
+ /*!\brief Codec control to set the maximum number of consecutive frame drops
+ * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of
+ * zero has no effect.
+ */
+ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164,
+
+ /*!\brief Codec control to set the frame drop mode for SVC,
+ * unsigned int parameter. The valid values are constants of the
+ * AOM_SVC_FRAME_DROP_MODE enum: AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP.
+ */
+ AV1E_SET_SVC_FRAME_DROP_MODE = 165,
+
+ // Any new encoder control IDs should be added above.
+ // Maximum allowed encoder control ID is 229.
+ // No encoder control ID should be added below.
+};
+
+/*!\brief aom 1-D scaling mode
+ *
+ * This set of constants defines the 1-D aom scaling modes
+ */
+typedef enum aom_scaling_mode_1d {
+ AOME_NORMAL = 0,
+ AOME_FOURFIVE = 1,
+ AOME_THREEFIVE = 2,
+ AOME_THREEFOUR = 3,
+ AOME_ONEFOUR = 4,
+ AOME_ONEEIGHT = 5,
+ AOME_ONETWO = 6,
+ AOME_TWOTHREE = 7,
+ AOME_ONETHREE = 8
+} AOM_SCALING_MODE;
+
+/*!\brief Max number of segments
+ *
+ * This is the limit of number of segments allowed within a frame.
+ *
+ * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_SEGMENTS 8
+
+/*!\brief aom region of interest map
+ *
+ * This defines the data structure for the region of interest map
+ *
+ * TODO(yaowu): create a unit test for ROI map related APIs
+ *
+ */
+typedef struct aom_roi_map {
+ /*! An id between 0 and 7 for each 8x8 region within a frame. */
+ unsigned char *roi_map;
+ unsigned int rows; /**< Number of rows. */
+ unsigned int cols; /**< Number of columns. */
+ int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */
+ int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */
+ /*! Static breakout threshold for each segment. */
+ unsigned int static_threshold[AOM_MAX_SEGMENTS];
+} aom_roi_map_t;
+
+/*!\brief aom active region map
+ *
+ * This defines the data structure for the active region map
+ *
+ */
+
+typedef struct aom_active_map {
+ /*!\brief Specify on (1) or off (0) for each 16x16 region within a frame */
+ unsigned char *active_map;
+ unsigned int rows; /**< number of rows */
+ unsigned int cols; /**< number of cols */
+} aom_active_map_t;
+
+/*!\brief aom image scaling mode
+ *
+ * This defines the data structure for image scaling mode
+ *
+ */
+typedef struct aom_scaling_mode {
+ AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
+ AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
+} aom_scaling_mode_t;
+
+/*!\brief AV1 encoder content type */
+typedef enum {
+ AOM_CONTENT_DEFAULT,
+ AOM_CONTENT_SCREEN,
+ AOM_CONTENT_FILM,
+ AOM_CONTENT_INVALID
+} aom_tune_content;
+
+/*!\brief AV1 encoder timing info type signaling */
+typedef enum {
+ AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_EQUAL,
+ AOM_TIMING_DEC_MODEL
+} aom_timing_info_type_t;
+
+/*!\brief Model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum {
+ AOM_TUNE_PSNR = 0,
+ AOM_TUNE_SSIM = 1,
+ /* NOTE: enums 2 and 3 unused */
+ AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
+ AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
+ AOM_TUNE_VMAF_MAX_GAIN = 6,
+ AOM_TUNE_VMAF_NEG_MAX_GAIN = 7,
+ AOM_TUNE_BUTTERAUGLI = 8,
+ AOM_TUNE_VMAF_SALIENCY_MAP = 9,
+} aom_tune_metric;
+
+/*!\brief Distortion metric to use for RD optimization.
+ *
+ * Changes the encoder to use a different distortion metric for RD search. Note
+ * that this value operates on a "lower level" compared to aom_tune_metric - it
+ * affects the distortion metric inside a block, while aom_tune_metric only
+ * affects RD across blocks.
+ *
+ */
+typedef enum {
+ // Use PSNR for in-block rate-distortion optimization.
+ AOM_DIST_METRIC_PSNR,
+ // Use quantization matrix-weighted PSNR for in-block rate-distortion
+ // optimization. If --enable-qm=1 is not specified, this falls back to
+ // behaving in the same way as AOM_DIST_METRIC_PSNR.
+ AOM_DIST_METRIC_QM_PSNR,
+} aom_dist_metric;
+
+#define AOM_MAX_LAYERS 32 /**< Max number of layers */
+#define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
+#define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
+
+/*!\brief Struct for spatial and temporal layer ID */
+typedef struct aom_svc_layer_id {
+ int spatial_layer_id; /**< Spatial layer ID */
+ int temporal_layer_id; /**< Temporal layer ID */
+} aom_svc_layer_id_t;
+
+/*!\brief Parameter type for SVC
+ *
+ * In the arrays of size AOM_MAX_LAYERS, the index for spatial layer `sl` and
+ * temporal layer `tl` is sl * number_temporal_layers + tl.
+ *
+ */
+typedef struct aom_svc_params {
+ int number_spatial_layers; /**< Number of spatial layers */
+ int number_temporal_layers; /**< Number of temporal layers */
+ int max_quantizers[AOM_MAX_LAYERS]; /**< Max Q for each layer */
+ int min_quantizers[AOM_MAX_LAYERS]; /**< Min Q for each layer */
+ int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */
+ int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */
+ /*! Target bitrate for each layer, in kilobits per second */
+ int layer_target_bitrate[AOM_MAX_LAYERS];
+ /*! Frame rate factor for each temporal layer */
+ int framerate_factor[AOM_MAX_TS_LAYERS];
+} aom_svc_params_t;
+
+/*!\brief Parameters for setting ref frame config */
+typedef struct aom_svc_ref_frame_config {
+ // 7 references: The index 0 - 6 refers to the references:
+ // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6).
+ int reference[7]; /**< Reference flag for each of the 7 references. */
+ /*! Buffer slot index for each of 7 references indexed above. */
+ int ref_idx[7];
+ int refresh[8]; /**< Refresh flag for each of the 8 slots. */
+} aom_svc_ref_frame_config_t;
+
+/*!\brief Parameters for setting ref frame compound prediction */
+typedef struct aom_svc_ref_frame_comp_pred {
+ // Use compound prediction for the ref_frame pairs GOLDEN_LAST (0),
+ // LAST2_LAST (1), and ALTREF_LAST (2).
+ int use_comp_pred[3]; /**< Compound reference flag. */
+} aom_svc_ref_frame_comp_pred_t;
+
+/*!\brief Frame drop modes for spatial/quality layer SVC */
+typedef enum {
+ AOM_LAYER_DROP, /**< Any spatial layer can drop. */
+ AOM_FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */
+} AOM_SVC_FRAME_DROP_MODE;
+
+/*!\cond */
+/*!\brief Encoder control function parameter type
+ *
+ * Defines the data types that AOME/AV1E control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
+ */
+AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int)
+#define AOM_CTRL_AOME_USE_REFERENCE
+
+AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
+#define AOM_CTRL_AOME_SET_ROI_MAP
+
+AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AOME_SET_ACTIVEMAP
+
+AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
+#define AOM_CTRL_AOME_SET_SCALEMODE
+
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
+#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
+
+AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
+#define AOM_CTRL_AOME_SET_CPUUSED
+
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
+
+AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
+#define AOM_CTRL_AOME_SET_SHARPNESS
+
+AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
+#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
+
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
+
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
+
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
+
+AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
+#define AOM_CTRL_AOME_SET_TUNING
+
+AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
+#define AOM_CTRL_AOME_SET_CQ_LEVEL
+
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int)
+#define AOM_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_INTER_BITRATE_PCT
+// TODO(aomedia:3231): Deprecated AOME-prefixed name. Remove it.
+#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
+#define AOM_CTRL_AV1E_SET_LOSSLESS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int)
+#define AOM_CTRL_AV1E_SET_ROW_MT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int)
+#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int)
+#define AOM_CTRL_AV1E_SET_TILE_ROWS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TPL_MODEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_KEYFRAME_FILTERING, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_KEYFRAME_FILTERING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int)
+#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int)
+#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_AQ_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
+#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
+#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int)
+#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int)
+#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int)
+#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AV1E_GET_ACTIVEMAP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
+#define AOM_CTRL_AV1E_SET_COLOR_RANGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+#define AOM_CTRL_AV1E_SET_RENDER_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
+#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CDEF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FORCE_VIDEO_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_FORCE_VIDEO_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_OBMC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
+#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_QM
+
+// TODO(aomedia:3231): Remove these two lines.
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MIN
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MAX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_Y
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_U, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_U
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_V, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_V
+
+AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int)
+#define AOM_CTRL_AV1E_SET_NUM_TG
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
+#define AOM_CTRL_AV1E_SET_MTU
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX64
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_TX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_TX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int)
+#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CHROMA_DELTAQ, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CHROMA_DELTAQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int)
+#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OVERLAY, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_OVERLAY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTAQ_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTALF_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTALF_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */
+#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int)
+#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y
+
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int)
+#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int)
+#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int)
+#define AOM_CTRL_AV1E_SET_TIER_MASK
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_CR, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_CR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_LAYER_ID, aom_svc_layer_id_t *)
+#define AOM_CTRL_AV1E_SET_SVC_LAYER_ID
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AV1E_SET_SVC_LAYER_ID
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_PARAMS, aom_svc_params_t *)
+#define AOM_CTRL_AV1E_SET_SVC_PARAMS
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AV1E_SET_SVC_PARAMS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_CONFIG, aom_svc_ref_frame_config_t *)
+#define AOM_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG
+
+AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, unsigned int)
+#define AOM_CTRL_AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP
+
+AOM_CTRL_USE_TYPE(AV1E_GET_BASELINE_GF_INTERVAL, int *)
+#define AOM_CTRL_AV1E_GET_BASELINE_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DNL_DENOISING, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIAGONAL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIAGONAL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_DV_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_PARTITION_INFO_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_PARTITION_INFO_PATH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_EXTERNAL_PARTITION, aom_ext_part_funcs_t *)
+#define AOM_CTRL_AV1E_SET_EXTERNAL_PARTITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIRECTIONAL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX_SIZE_SEARCH, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX_SIZE_SEARCH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+ aom_svc_ref_frame_comp_pred_t *)
+#define AOM_CTRL_AV1E_SET_SVC_REF_FRAME_COMP_PRED
+// TODO(aomedia:3231): Deprecated. Remove it.
+#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_COMP_PRED
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_STRENGTH, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTAQ_STRENGTH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOOPFILTER_CONTROL, int)
+#define AOM_CTRL_AV1E_SET_LOOPFILTER_CONTROL
+
+AOM_CTRL_USE_TYPE(AOME_GET_LOOPFILTER_LEVEL, int *)
+#define AOM_CTRL_AOME_GET_LOOPFILTER_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_AUTO_INTRA_TOOLS_OFF, unsigned int)
+#define AOM_CTRL_AV1E_SET_AUTO_INTRA_TOOLS_OFF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_RTC_EXTERNAL_RC, int)
+#define AOM_CTRL_AV1E_SET_RTC_EXTERNAL_RC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FP_MT, unsigned int)
+#define AOM_CTRL_AV1E_SET_FP_MT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FP_MT_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FP_MT_UNIT_TEST
+
+AOM_CTRL_USE_TYPE(AV1E_GET_TARGET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_TARGET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_GET_NUM_OPERATING_POINTS, int *)
+#define AOM_CTRL_AV1E_GET_NUM_OPERATING_POINTS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SKIP_POSTPROC_FILTERING, unsigned int)
+#define AOM_CTRL_AV1E_SET_SKIP_POSTPROC_FILTERING
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_QP_SWEEP, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_SB_QP_SWEEP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QUANTIZER_ONE_PASS, int)
+#define AOM_CTRL_AV1E_SET_QUANTIZER_ONE_PASS
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_RATE_GUIDE_DELTAQ, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_RATE_GUIDE_DELTAQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_RATE_DISTRIBUTION_INFO, const char *)
+#define AOM_CTRL_AV1E_SET_RATE_DISTRIBUTION_INFO
+
+AOM_CTRL_USE_TYPE(AV1E_GET_LUMA_CDEF_STRENGTH, int *)
+#define AOM_CTRL_AV1E_GET_LUMA_CDEF_STRENGTH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_BITRATE_ONE_PASS_CBR, unsigned int)
+#define AOM_CTRL_AV1E_SET_BITRATE_ONE_PASS_CBR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_FRAME_DROP_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SVC_FRAME_DROP_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, int)
+#define AOM_CTRL_AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR
+
+/*!\endcond */
+/*! @} - end defgroup aom_encoder */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOMCX_H_
diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h
new file mode 100644
index 0000000000..02ea19597c
--- /dev/null
+++ b/third_party/aom/aom/aomdx.h
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
+ * \ingroup aom
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 within the aom Decoder
+ * interface.
+ */
+#ifndef AOM_AOM_AOMDX_H_
+#define AOM_AOM_AOMDX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Include controls common to both the encoder and decoder */
+#include "aom/aom.h"
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to decode AV1 streams.
+ * @{
+ */
+
+/*!\brief A single instance of the AV1 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer aom_codec_av1_dx().
+ */
+extern aom_codec_iface_t aom_codec_av1_dx_algo;
+/*!\brief The interface to the AV1 decoder.
+ */
+extern aom_codec_iface_t *aom_codec_av1_dx(void);
+
+/*!@} - end algorithm interface member group */
+
+/** Data structure that stores bit accounting for debug
+ */
+typedef struct Accounting Accounting;
+
+#ifndef AOM_INSPECTION_H_
+/** Callback that inspects decoder frame data.
+ */
+typedef void (*aom_inspect_cb)(void *decoder, void *ctx);
+
+#endif
+
+/*!\brief Structure to hold inspection callback and context.
+ *
+ * Defines a structure to hold the inspection callback function and calling
+ * context.
+ */
+typedef struct aom_inspect_init {
+ /*! Inspection callback. */
+ aom_inspect_cb inspect_cb;
+
+ /*! Inspection context. */
+ void *inspect_ctx;
+} aom_inspect_init;
+
+/*!\brief Structure to collect a buffer index when inspecting.
+ *
+ * Defines a structure to hold the buffer and return an index
+ * when calling decode from inspect. This enables us to decode
+ * non showable sub frames.
+ */
+typedef struct {
+ /*! Pointer for new position in compressed buffer after decoding 1 OBU. */
+ const unsigned char *buf;
+ /*! Index into reference buffer array to see result of decoding 1 OBU. */
+ int idx;
+ /*! Is a show existing frame. */
+ int show_existing;
+} Av1DecodeReturn;
+
+/*!\brief Structure to hold a tile's start address and size in the bitstream.
+ *
+ * Defines a structure to hold a tile's start address and size in the bitstream.
+ */
+typedef struct aom_tile_data {
+ /*! Tile data size. */
+ size_t coded_tile_data_size;
+ /*! Tile's start address. */
+ const void *coded_tile_data;
+ /*! Extra size information. */
+ size_t extra_size;
+} aom_tile_data;
+
+/*!\brief Max number of tile columns
+ *
+ * This is the limit of number of tile columns allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_COLS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_COLS 64
+/*!\brief Max number of tile rows
+ *
+ * This is the limit of number of tile rows allowed within a frame.
+ *
+ * Currently same as "MAX_TILE_ROWS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_TILE_ROWS 64
+
+/*!\brief Structure to hold information about tiles in a frame.
+ *
+ * Defines a structure to hold a frame's tile information, namely
+ * number of tile columns, number of tile_rows, and the width and
+ * height of each tile.
+ */
+typedef struct aom_tile_info {
+ /*! Indicates the number of tile columns. */
+ int tile_columns;
+ /*! Indicates the number of tile rows. */
+ int tile_rows;
+ /*! Indicates the tile widths in units of SB. */
+ int tile_widths[AOM_MAX_TILE_COLS];
+ /*! Indicates the tile heights in units of SB. */
+ int tile_heights[AOM_MAX_TILE_ROWS];
+ /*! Indicates the number of tile groups present in a frame. */
+ int num_tile_groups;
+} aom_tile_info;
+
+/*!\brief Structure to hold information about still image coding.
+ *
+ * Defines a structure to hold information regarding still picture
+ * and its header type.
+ */
+typedef struct aom_still_picture_info {
+ /*! Video is a single frame still picture */
+ int is_still_picture;
+ /*! Still picture uses the reduced header (per field name; TODO confirm) */
+ int is_reduced_still_picture_hdr;
+} aom_still_picture_info;
+
+/*!\brief Structure to hold information about S_FRAME.
+ *
+ * Defines a structure to hold information regarding S_FRAME
+ * and its position.
+ */
+typedef struct aom_s_frame_info {
+ /*! Indicates if current frame is S_FRAME */
+ int is_s_frame;
+ /*! Indicates if current S_FRAME is present at ALTREF frame */
+ int is_s_frame_at_altref;
+} aom_s_frame_info;
+
+/*!\brief Structure to hold information about screen content tools.
+ *
+ * Defines a structure to hold information about screen content
+ * tools, namely: allow_screen_content_tools, allow_intrabc, and
+ * force_integer_mv.
+ */
+typedef struct aom_screen_content_tools_info {
+ /*! Are screen content tools allowed */
+ int allow_screen_content_tools;
+ /*! Is intrabc allowed */
+ int allow_intrabc;
+ /*! Is integer mv forced */
+ int force_integer_mv;
+} aom_screen_content_tools_info;
+
+/*!\brief Structure to hold the external reference frame pointer.
+ *
+ * Define a structure to hold the external reference frame pointer.
+ */
+typedef struct av1_ext_ref_frame {
+ /*! Start pointer of external references. */
+ aom_image_t *img;
+ /*! Number of available external references. */
+ int num;
+} av1_ext_ref_frame_t;
+
+/*!\enum aom_dec_control_id
+ * \brief AOM decoder control functions
+ *
+ * This set of macros define the control functions available for the AOM
+ * decoder interface.
+ * The range for decoder control ID is >= 256.
+ *
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
+ */
+enum aom_dec_control_id {
+ /*!\brief Codec control function to get info on which reference frames were
+ * updated by the last decode, int* parameter
+ */
+ AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
+
+ /*!\brief Codec control function to check if the indicated frame is
+ corrupted, int* parameter
+ */
+ AOMD_GET_FRAME_CORRUPTED,
+
+ /*!\brief Codec control function to get info on which reference frames were
+ * used by the last decode, int* parameter
+ */
+ AOMD_GET_LAST_REF_USED,
+
+ /*!\brief Codec control function to get the dimensions that the current
+ * frame is decoded at, int* parameter
+ *
+ * This may be different to the intended display size for the frame as
+ * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE).
+ */
+ AV1D_GET_FRAME_SIZE,
+
+ /*!\brief Codec control function to get the current frame's intended display
+ * dimensions (as specified in the wrapper or frame header), int* parameter
+ *
+ * This may be different to the decoded dimensions of this frame (see
+ * AV1D_GET_FRAME_SIZE).
+ */
+ AV1D_GET_DISPLAY_SIZE,
+
+ /*!\brief Codec control function to get the bit depth of the stream,
+ * unsigned int* parameter
+ */
+ AV1D_GET_BIT_DEPTH,
+
+ /*!\brief Codec control function to get the image format of the stream,
+ * aom_img_fmt_t* parameter
+ */
+ AV1D_GET_IMG_FORMAT,
+
+ /*!\brief Codec control function to get the size of the tile, unsigned int*
+ * parameter
+ */
+ AV1D_GET_TILE_SIZE,
+
+ /*!\brief Codec control function to get the tile count in a tile list,
+ * unsigned int* parameter
+ */
+ AV1D_GET_TILE_COUNT,
+
+ /*!\brief Codec control function to set the byte alignment of the planes in
+ * the reference buffers, int parameter
+ *
+ * Valid values are power of 2, from 32 to 1024. A value of 0 sets
+ * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
+ * follows Y plane, and V plane directly follows U plane. Default value is 0.
+ */
+ AV1_SET_BYTE_ALIGNMENT,
+
+ /*!\brief Codec control function to invert the decoding order so that it runs
+ * from right to left, int parameter
+ *
+ * The function is used in a test to confirm the decoding independence of tile
+ * columns. The function may be used in applications where this order
+ * of decoding is desired.
+ *
+ * TODO(yaowu): Rework the unit test that uses this control, and in a future
+ * release, this test-only control shall be removed.
+ */
+ AV1_INVERT_TILE_DECODE_ORDER,
+
+ /*!\brief Codec control function to set the skip loop filter flag, int
+ * parameter
+ *
+ * Valid values are integers. The decoder will skip the loop filter
+ * when its value is set to nonzero. If the loop filter is skipped the
+ * decoder may accumulate decode artifacts. The default value is 0.
+ */
+ AV1_SET_SKIP_LOOP_FILTER,
+
+ /*!\brief Codec control function to retrieve a pointer to the Accounting
+ * struct, takes Accounting** as parameter
+ *
+ * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+ * The caller should ensure that AOM_CODEC_OK is returned before attempting
+ * to dereference the Accounting pointer.
+ *
+ * \attention When configured with -DCONFIG_ACCOUNTING=0, the default, this
+ * returns AOM_CODEC_INCAPABLE.
+ */
+ AV1_GET_ACCOUNTING,
+
+ /*!\brief Codec control function to get last decoded frame quantizer,
+ * int* parameter
+ *
+ * Returned value uses internal quantizer scale defined by the codec.
+ */
+ AOMD_GET_LAST_QUANTIZER,
+
+ /*!\brief Codec control function to set the range of tile decoding, int
+ * parameter
+ *
+ * A value that is greater than or equal to zero indicates only the specific
+ * row/column is decoded. A value that is -1 indicates the whole row/column
+ * is decoded. A special case is both values are -1 that means the whole
+ * frame is decoded.
+ */
+ AV1_SET_DECODE_TILE_ROW,
+ AV1_SET_DECODE_TILE_COL,
+
+ /*!\brief Codec control function to set the tile coding mode, unsigned int
+ * parameter
+ *
+ * - 0 = tiles are coded in normal tile mode
+ * - 1 = tiles are coded in large-scale tile mode
+ */
+ AV1_SET_TILE_MODE,
+
+ /*!\brief Codec control function to get the frame header information of an
+ * encoded frame, aom_tile_data* parameter
+ */
+ AV1D_GET_FRAME_HEADER_INFO,
+
+ /*!\brief Codec control function to get the start address and size of a
+ * tile in the coded bitstream, aom_tile_data* parameter.
+ */
+ AV1D_GET_TILE_DATA,
+
+ /*!\brief Codec control function to set the external references' pointers in
+ * the decoder, av1_ext_ref_frame_t* parameter.
+ *
+ * This is used while decoding the tile list OBU in large-scale tile coding
+ * mode.
+ */
+ AV1D_SET_EXT_REF_PTR,
+
+ /*!\brief Codec control function to enable the ext-tile software debug and
+ * testing code in the decoder, unsigned int parameter
+ */
+ AV1D_EXT_TILE_DEBUG,
+
+ /*!\brief Codec control function to enable the row based multi-threading of
+ * decoding, unsigned int parameter
+ *
+ * - 0 = disabled
+ * - 1 = enabled (default)
+ */
+ AV1D_SET_ROW_MT,
+
+ /*!\brief Codec control function to indicate whether bitstream is in
+ * Annex-B format, unsigned int parameter
+ */
+ AV1D_SET_IS_ANNEXB,
+
+ /*!\brief Codec control function to indicate which operating point to use,
+ * int parameter
+ *
+ * A scalable stream may define multiple operating points, each of which
+ * defines a set of temporal and spatial layers to be processed. The
+ * operating point index may take a value between 0 and
+ * operating_points_cnt_minus_1 (which is at most 31).
+ */
+ AV1D_SET_OPERATING_POINT,
+
+ /*!\brief Codec control function to indicate whether to output one frame per
+ * temporal unit (the default), or one frame per spatial layer, int parameter
+ *
+ * In a scalable stream, each temporal unit corresponds to a single "frame"
+ * of video, and within a temporal unit there may be multiple spatial layers
+ * with different versions of that frame.
+ * For video playback, only the highest-quality version (within the
+ * selected operating point) is needed, but for some use cases it is useful
+ * to have access to multiple versions of a frame when they are available.
+ */
+ AV1D_SET_OUTPUT_ALL_LAYERS,
+
+ /*!\brief Codec control function to set an aom_inspect_cb callback that is
+ * invoked each time a frame is decoded, aom_inspect_init* parameter
+ *
+ * \attention When configured with -DCONFIG_INSPECTION=0, the default, this
+ * returns AOM_CODEC_INCAPABLE.
+ */
+ AV1_SET_INSPECTION_CALLBACK,
+
+ /*!\brief Codec control function to set the skip film grain flag, int
+ * parameter
+ *
+ * Valid values are integers. The decoder will skip the film grain when its
+ * value is set to nonzero. The default value is 0.
+ */
+ AV1D_SET_SKIP_FILM_GRAIN,
+
+ /*!\brief Codec control function to check the presence of forward key frames,
+ * int* parameter
+ */
+ AOMD_GET_FWD_KF_PRESENT,
+
+ /*!\brief Codec control function to get the frame flags of the previous frame
+ * decoded, int* parameter
+ *
+ * This will return a flag of type aom_codec_frame_flags_t.
+ */
+ AOMD_GET_FRAME_FLAGS,
+
+ /*!\brief Codec control function to check the presence of altref frames, int*
+ * parameter
+ */
+ AOMD_GET_ALTREF_PRESENT,
+
+ /*!\brief Codec control function to get tile information of the previous frame
+ * decoded, aom_tile_info* parameter
+ *
+ * This will return a struct of type aom_tile_info.
+ */
+ AOMD_GET_TILE_INFO,
+
+ /*!\brief Codec control function to get screen content tools information,
+ * aom_screen_content_tools_info* parameter
+ *
+ * It returns a struct of type aom_screen_content_tools_info, which contains
+ * the header flags allow_screen_content_tools, allow_intrabc, and
+ * force_integer_mv.
+ */
+ AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+
+ /*!\brief Codec control function to get the still picture coding information,
+ * aom_still_picture_info* parameter
+ */
+ AOMD_GET_STILL_PICTURE,
+
+ /*!\brief Codec control function to get superblock size,
+ * aom_superblock_size_t* parameter
+ *
+ * It returns an enum, indicating the superblock size read from the sequence
+ * header(0 for BLOCK_64X64 and 1 for BLOCK_128X128)
+ */
+ AOMD_GET_SB_SIZE,
+
+ /*!\brief Codec control function to check if the previous frame
+ * decoded has show existing frame flag set, int* parameter
+ */
+ AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+
+ /*!\brief Codec control function to get the S_FRAME coding information,
+ * aom_s_frame_info* parameter
+ */
+ AOMD_GET_S_FRAME_INFO,
+
+ /*!\brief Codec control function to get the show frame flag, int* parameter
+ */
+ AOMD_GET_SHOW_FRAME_FLAG,
+
+ /*!\brief Codec control function to get the base q index of a frame, int*
+ * parameter
+ */
+ AOMD_GET_BASE_Q_IDX,
+
+ /*!\brief Codec control function to get the order hint of a frame, unsigned
+ * int* parameter
+ */
+ AOMD_GET_ORDER_HINT,
+
+ /*!\brief Codec control function to get the info of a 4x4 block.
+ * Parameters: int mi_row, int mi_col, and MB_MODE_INFO*.
+ *
+ * \note This only returns a shallow copy, so all pointer members should not
+ * be used.
+ */
+ AV1D_GET_MI_INFO,
+};
+
+/*!\cond */
+/*!\brief AOM decoder control function parameter type
+ *
+ * Defines the data types that AOMD control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the libaom library available,
+ * when the libaom version cannot be controlled.
+ */
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
+
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_USED
+
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_FRAME_SIZE
+
+AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
+
+AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
+#define AOM_CTRL_AV1D_GET_BIT_DEPTH
+
+AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *)
+#define AOM_CTRL_AV1D_GET_IMG_FORMAT
+
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
+#define AOM_CTRL_AV1D_GET_TILE_SIZE
+
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *)
+#define AOM_CTRL_AV1D_GET_TILE_COUNT
+
+// AV1_SET_BYTE_ALIGNMENT is declared in aom_dec_control_id with an int
+// parameter; give it the same typed wrapper and AOM_CTRL_X feature macro that
+// every other control ID in this header has.
+AOM_CTRL_USE_TYPE(AV1_SET_BYTE_ALIGNMENT, int)
+#define AOM_CTRL_AV1_SET_BYTE_ALIGNMENT
+
+AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
+#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
+
+AOM_CTRL_USE_TYPE(AV1_SET_SKIP_LOOP_FILTER, int)
+#define AOM_CTRL_AV1_SET_SKIP_LOOP_FILTER
+
+AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
+#define AOM_CTRL_AV1_GET_ACCOUNTING
+
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
+
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
+
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
+
+AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int)
+#define AOM_CTRL_AV1_SET_TILE_MODE
+
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO
+
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_TILE_DATA
+
+AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
+#define AOM_CTRL_AV1D_SET_EXT_REF_PTR
+
+AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+
+AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
+#define AOM_CTRL_AV1D_SET_ROW_MT
+
+AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
+#define AOM_CTRL_AV1D_SET_IS_ANNEXB
+
+AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
+#define AOM_CTRL_AV1D_SET_OPERATING_POINT
+
+AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int)
+#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS
+
+AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
+#define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
+
+AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
+#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FWD_KF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_FWD_KF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_FLAGS, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_FLAGS
+
+AOM_CTRL_USE_TYPE(AOMD_GET_ALTREF_PRESENT, int *)
+#define AOM_CTRL_AOMD_GET_ALTREF_PRESENT
+
+AOM_CTRL_USE_TYPE(AOMD_GET_TILE_INFO, aom_tile_info *)
+#define AOM_CTRL_AOMD_GET_TILE_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+ aom_screen_content_tools_info *)
+#define AOM_CTRL_AOMD_GET_SCREEN_CONTENT_TOOLS_INFO
+
+AOM_CTRL_USE_TYPE(AOMD_GET_STILL_PICTURE, aom_still_picture_info *)
+#define AOM_CTRL_AOMD_GET_STILL_PICTURE
+
+// The three feature macros below were originally spelled with an "AOMD_CTRL_"
+// prefix, which breaks the documented "AOM_CTRL_<control ID>" convention and
+// makes "#ifdef AOM_CTRL_<control ID>" checks fail for these controls. The
+// correctly named macros are defined here; the misspelled names are kept so
+// that code already testing them keeps compiling the same way.
+AOM_CTRL_USE_TYPE(AOMD_GET_SB_SIZE, aom_superblock_size_t *)
+#define AOM_CTRL_AOMD_GET_SB_SIZE
+#define AOMD_CTRL_AOMD_GET_SB_SIZE  // legacy misspelling
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_EXISTING_FRAME_FLAG, int *)
+#define AOM_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG
+#define AOMD_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG  // legacy misspelling
+
+AOM_CTRL_USE_TYPE(AOMD_GET_S_FRAME_INFO, aom_s_frame_info *)
+#define AOM_CTRL_AOMD_GET_S_FRAME_INFO
+#define AOMD_CTRL_AOMD_GET_S_FRAME_INFO  // legacy misspelling
+
+AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_FRAME_FLAG, int *)
+#define AOM_CTRL_AOMD_GET_SHOW_FRAME_FLAG
+
+AOM_CTRL_USE_TYPE(AOMD_GET_BASE_Q_IDX, int *)
+#define AOM_CTRL_AOMD_GET_BASE_Q_IDX
+
+AOM_CTRL_USE_TYPE(AOMD_GET_ORDER_HINT, unsigned int *)
+#define AOM_CTRL_AOMD_GET_ORDER_HINT
+
+// The AOM_CTRL_USE_TYPE macro can't be used with AV1D_GET_MI_INFO because
+// AV1D_GET_MI_INFO takes more than one parameter.
+#define AOM_CTRL_AV1D_GET_MI_INFO
+/*!\endcond */
+/*! @} - end defgroup aom_decoder */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOMDX_H_
diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com
new file mode 100644
index 0000000000..266e2943a3
--- /dev/null
+++ b/third_party/aom/aom/exports_com
@@ -0,0 +1,42 @@
+text aom_codec_build_config
+text aom_codec_control
+text aom_codec_destroy
+text aom_codec_err_to_string
+text aom_codec_error
+text aom_codec_error_detail
+text aom_codec_get_caps
+text aom_codec_iface_name
+text aom_codec_set_option
+text aom_codec_version
+text aom_codec_version_extra_str
+text aom_codec_version_str
+text aom_free
+text aom_img_add_metadata
+text aom_img_alloc
+text aom_img_alloc_with_border
+text aom_img_flip
+text aom_img_free
+text aom_img_get_metadata
+text aom_img_metadata_array_free
+text aom_img_metadata_array_alloc
+text aom_img_metadata_free
+text aom_img_metadata_alloc
+text aom_img_num_metadata
+text aom_img_plane_height
+text aom_img_plane_width
+text aom_img_remove_metadata
+text aom_img_set_rect
+text aom_img_wrap
+text aom_malloc
+text aom_rb_bytes_read
+text aom_rb_read_bit
+text aom_rb_read_literal
+text aom_rb_read_uvlc
+text aom_uleb_decode
+text aom_uleb_encode
+text aom_uleb_encode_fixed_size
+text aom_uleb_size_in_bytes
+text aom_wb_bytes_written
+text aom_wb_write_bit
+text aom_wb_write_literal
+text aom_wb_write_unsigned_literal
diff --git a/third_party/aom/aom/exports_dec b/third_party/aom/aom/exports_dec
new file mode 100644
index 0000000000..ffff023ddd
--- /dev/null
+++ b/third_party/aom/aom/exports_dec
@@ -0,0 +1,8 @@
+text aom_codec_dec_init_ver
+text aom_codec_decode
+text aom_codec_get_frame
+text aom_codec_get_stream_info
+text aom_codec_peek_stream_info
+text aom_codec_set_frame_buffer_functions
+text aom_obu_type_to_string
+text aom_read_obu_header
diff --git a/third_party/aom/aom/exports_enc b/third_party/aom/aom/exports_enc
new file mode 100644
index 0000000000..1473d9d2b5
--- /dev/null
+++ b/third_party/aom/aom/exports_enc
@@ -0,0 +1,17 @@
+text aom_codec_enc_config_default
+text aom_codec_enc_config_set
+text aom_codec_enc_init_ver
+text aom_codec_encode
+text aom_codec_get_cx_data
+text aom_codec_get_global_headers
+text aom_codec_get_preview_frame
+text aom_codec_set_cx_data_buf
+text aom_film_grain_table_append
+text aom_film_grain_table_free
+text aom_film_grain_table_write
+text aom_flat_block_finder_init
+text aom_flat_block_finder_run
+text aom_noise_model_init
+text aom_noise_model_get_grain_parameters
+text aom_noise_model_save_latest
+text aom_noise_model_update
diff --git a/third_party/aom/aom/exports_test b/third_party/aom/aom/exports_test
new file mode 100644
index 0000000000..452a532ce6
--- /dev/null
+++ b/third_party/aom/aom/exports_test
@@ -0,0 +1,4 @@
+text aom_copy_metadata_to_frame_buffer
+text aom_dsp_rtcd
+text aom_remove_metadata_from_frame_buffer
+text aom_scale_rtcd
diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h
new file mode 100644
index 0000000000..b854a889e0
--- /dev/null
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ * implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ * <pre>
+ * my_codec.c:
+ * aom_codec_iface_t my_codec = {
+ * "My Codec v1.0",
+ * AOM_CODEC_ALG_ABI_VERSION,
+ * ...
+ * };
+ * </pre>
+ *
+ * An application instantiates a specific decoder instance by using
+ * aom_codec_dec_init() and a pointer to the algorithm's interface structure:
+ * <pre>
+ * my_app.c:
+ * extern aom_codec_iface_t my_codec;
+ * {
+ * aom_codec_ctx_t algo;
+ * int threads = 4;
+ * aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ * res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ * }
+ * </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#include "../aom_decoder.h"
+#include "../aom_encoder.h"
+#include "common/args_helper.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
+
+typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by aom_codec_dec_init() and aom_codec_enc_init(), so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ * The input stream was recognized and decoder initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic aom_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ * The algorithm-specific decoder context was destroyed.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic aom_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in] data Pointer to a block of data to parse
+ * \param[in] data_sz Size of the data buffer
+ * \param[in,out] si Pointer to stream info to update. The is_annexb
+ * member \ref MUST be properly initialized. This
+ * function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
+ size_t data_sz,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] si Pointer to stream info to update
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
+ aom_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
+ *
+ * This function is called by the generic aom_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However, this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control IDs. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be NULL.
+ *
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] ctrl_id Algorithm specific control identifier
+ * \param[in,out] data Data to exchange with algorithm instance.
+ *
+ * \retval #AOM_CODEC_OK
+ * The internal state data was deserialized.
+ */
+typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
+ va_list ap);
+
+/*!\brief codec option setter function pointer prototype
+ * This function is used to set a codec option using a key (option name) & value
+ * pair.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] name A string of the option's name (key)
+ * \param[in] value A string of the value to be set to
+ *
+ * \retval #AOM_CODEC_OK
+ * The option is successfully set to the value
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * The data was not valid.
+ */
+typedef aom_codec_err_t (*aom_codec_set_option_fn_t)(aom_codec_alg_priv_t *ctx,
+ const char *name,
+ const char *value);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the aom_codec_control()
+ * function to determine which function to invoke. The special
+ * value defined by CTRL_MAP_END is used to indicate end-of-list, and must be
+ * present. It can be tested with the at_ctrl_map_end function. Note that
+ * ctrl_id values \ref MUST be non-zero.
+ */
+typedef const struct aom_codec_ctrl_fn_map {
+ int ctrl_id;
+ aom_codec_control_fn_t fn;
+} aom_codec_ctrl_fn_map_t;
+
+#define CTRL_MAP_END \
+ { 0, NULL }
+
+/* Returns nonzero when *e is the end-of-list sentinel written by
+ * CTRL_MAP_END, i.e. its control id is 0 and its handler is NULL. */
+static AOM_INLINE int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) {
+  /* A real entry always has a non-zero control id. */
+  if (e->ctrl_id != 0) return 0;
+  return e->fn == NULL;
+}
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. This function is called by the generic
+ * aom_codec_decode() wrapper function, so plugins implementing this interface
+ * may trust the input parameters to be properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] data Pointer to this block of new coded data.
+ * \param[in] data_sz Size of the coded data, in bytes.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ * and future pictures can be decoded without error. Otherwise,
+ * see the descriptions of the other error codes in ::aom_codec_err_t
+ * for recoverability capabilities.
+ */
+typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ size_t data_sz,
+ void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ * produced will always be in PTS (presentation time stamp) order.
+ */
+typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter);
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb_get Pointer to the get callback function
+ * \param[in] cb_release Pointer to the release callback function
+ * \param[in] cb_priv Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
+ aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+/*!\brief Encode function pointer prototype
+ *
+ * Algorithm-specific entry point that aom_codec_encode() calls with the
+ * instance's private data and the caller's img, pts, duration and flags.
+ */
+typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,
+ aom_codec_pts_t pts,
+ unsigned long duration,
+ aom_enc_frame_flags_t flags);
+/*!\brief Compressed data iterator function pointer prototype
+ *
+ * Algorithm-specific entry point that aom_codec_get_cx_data() calls to fetch
+ * a packet for the given iterator. The caller treats a NULL return as "no
+ * packet".
+ */
+typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
+ aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
+
+/*!\brief Encoder configuration update function pointer prototype
+ *
+ * Algorithm-specific entry point that aom_codec_enc_config_set() calls with
+ * the new configuration.
+ */
+typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
+ aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
+/*!\brief Global headers function pointer prototype
+ *
+ * Algorithm-specific entry point that aom_codec_get_global_headers() calls.
+ * The interface may leave this pointer NULL; the wrapper then reports
+ * #AOM_CODEC_INCAPABLE.
+ */
+typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
+ aom_codec_alg_priv_t *ctx);
+
+/*!\brief Preview frame function pointer prototype
+ *
+ * Algorithm-specific entry point that aom_codec_get_preview_frame() calls.
+ * The interface may leave this pointer NULL; the wrapper then reports
+ * #AOM_CODEC_INCAPABLE.
+ */
+typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
+ aom_codec_alg_priv_t *ctx);
+
+/*!\brief Codec algorithm interface
+ *
+ * Shared by decoders and encoders: the \c dec and \c enc members hold the
+ * direction-specific entry points, and \c caps tells the generic wrappers
+ * which of the two an implementation supports.
+ *
+ * All codec algorithms \ref MUST expose a variable of this type.
+ */
+struct aom_codec_iface {
+ const char *name; /**< Identification String */
+ int abi_version; /**< Implemented ABI version */
+ aom_codec_caps_t caps; /**< Codec capabilities */
+ aom_codec_init_fn_t init; /**< \copydoc ::aom_codec_init_fn_t */
+ aom_codec_destroy_fn_t destroy; /**< \copydoc ::aom_codec_destroy_fn_t */
+ aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+ /* Decoder entry points, used by the aom_decoder.c wrappers. */
+ struct aom_codec_dec_iface {
+ aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+ aom_codec_get_si_fn_t get_si; /**< \copydoc ::aom_codec_get_si_fn_t */
+ aom_codec_decode_fn_t decode; /**< \copydoc ::aom_codec_decode_fn_t */
+ aom_codec_get_frame_fn_t
+ get_frame; /**< \copydoc ::aom_codec_get_frame_fn_t */
+ aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
+ } dec;
+ /* Encoder entry points, used by the aom_encoder.c wrappers. */
+ struct aom_codec_enc_iface {
+ int cfg_count; /**< Number of entries in \c cfgs */
+ const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */
+ aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
+ aom_codec_get_cx_data_fn_t
+ get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+ aom_codec_enc_config_set_fn_t
+ cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+ aom_codec_get_global_headers_fn_t
+ get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+ aom_codec_get_preview_frame_fn_t
+ get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+ } enc;
+ /* Called by aom_codec_set_option() with the option name and value strings;
+  * may be NULL, in which case aom_codec_set_option() fails. */
+ aom_codec_set_option_fn_t set_option;
+};
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it. Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct aom_codec_priv {
+ /* Detail string that aom_codec_error_detail() returns while the context has
+  * private storage. */
+ const char *err_detail;
+ /* NOTE(review): presumably mirrors the flags passed at init time; the
+  * generic wrappers shown here only read ctx->init_flags -- confirm. */
+ aom_codec_flags_t init_flags;
+ /* State used by aom_codec_set_cx_data_buf() / aom_codec_get_cx_data(). */
+ struct {
+ /* Application-provided destination for compressed data (buf may be NULL). */
+ aom_fixed_buf_t cx_data_dst_buf;
+ /* Bytes reserved before the payload inside cx_data_dst_buf. */
+ unsigned int cx_data_pad_before;
+ /* Bytes reserved after the payload inside cx_data_dst_buf. */
+ unsigned int cx_data_pad_after;
+ /* Copy of the packet handed back when its data was moved to
+  * cx_data_dst_buf. */
+ aom_codec_cx_pkt_t cx_data_pkt;
+ } enc;
+};
+
+#define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating aom_codec_* data structures.
+ */
+/* Header of a fixed-capacity packet list. Only the first packet slot is part
+ * of this struct; aom_codec_pkt_list_decl() overlays it with a larger array.
+ */
+struct aom_codec_pkt_list {
+ unsigned int cnt; /* Number of packets currently stored */
+ unsigned int max; /* Capacity, set by aom_codec_pkt_list_init() */
+ struct aom_codec_cx_pkt pkts[1]; /* First packet slot */
+};
+
+/* Declares an anonymous union type holding a packet list header plus an
+ * 'alloc' view with n additional packet slots after the header.
+ */
+#define aom_codec_pkt_list_decl(n) \
+ union { \
+ struct aom_codec_pkt_list head; \
+ struct { \
+ struct aom_codec_pkt_list head; \
+ struct aom_codec_cx_pkt pkts[n]; \
+ } alloc; \
+ }
+
+/* Resets a list declared with aom_codec_pkt_list_decl(): the count becomes 0
+ * and max becomes the number of elements in alloc.pkts. Expands to a comma
+ * expression, so use it as a standalone statement.
+ */
+#define aom_codec_pkt_list_init(m) \
+ (m)->alloc.head.cnt = 0, \
+ (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+ const struct aom_codec_cx_pkt *);
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+ struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
+
+#include <stdio.h>
+#include <setjmp.h>
+
+/* Error state recorded by aom_set_error() and aom_internal_error(). */
+struct aom_internal_error_info {
+ aom_codec_err_t error_code; // Most recently recorded error code.
+ int has_detail; // Nonzero when 'detail' holds a formatted message.
+ char detail[ARG_ERR_MSG_MAX_LEN]; // NUL-terminated detail message.
+ int setjmp; // Boolean: whether 'jmp' is valid.
+ jmp_buf jmp; // Target that aom_internal_error() longjmp()s to when 'setjmp' is set.
+};
+
+// Expands to __attribute__((analyzer_noreturn)) when the compiler reports
+// support for it via __has_feature, and to nothing otherwise.
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef LIBAOM_FORMAT_PRINTF
+#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
+// Records the error code and error message. Does not call longjmp().
+void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error,
+ const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4);
+
+void aom_internal_error(struct aom_internal_error_info *info,
+ aom_codec_err_t error, const char *fmt, ...)
+ LIBAOM_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN;
+
+// Calls aom_internal_error() with the error code and error message in `src`.
+// `info` and `src` must not point to the same struct, i.e., self copy is
+// prohibited.
+void aom_internal_error_copy(struct aom_internal_error_info *info,
+ const struct aom_internal_error_info *src)
+ CLANG_ANALYZER_NORETURN;
+
+void aom_merge_corrupted_flag(int *corrupted, int value);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
diff --git a/third_party/aom/aom/internal/aom_image_internal.h b/third_party/aom/aom/internal/aom_image_internal.h
new file mode 100644
index 0000000000..1b04c9ec3f
--- /dev/null
+++ b/third_party/aom/aom/internal/aom_image_internal.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the internal functions associated with the aom image
+ * descriptor.
+ *
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
+
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Array of aom_metadata structs for an image. */
+struct aom_metadata_array {
+ size_t sz; /* Number of metadata structs in the list */
+ aom_metadata_t **metadata_array; /* Array of metadata structs */
+};
+
+/*!\brief Alloc memory for aom_metadata_array struct.
+ *
+ * Allocate memory for aom_metadata_array struct.
+ * If sz is 0 the aom_metadata_array struct's internal buffer list will be
+ * NULL, but the aom_metadata_array struct itself will still be allocated.
+ * Returns a pointer to the allocated struct or NULL on failure.
+ *
+ * \param[in] sz Size of internal metadata list buffer
+ */
+aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz);
+
+/*!\brief Free metadata array struct.
+ *
+ * Free metadata array struct and all metadata structs inside.
+ *
+ * \param[in] arr Metadata array struct pointer
+ */
+void aom_img_metadata_array_free(aom_metadata_array_t *arr);
+
+/*!\brief Callback used by aom_img_alloc_with_cb() to obtain image storage.
+ *
+ * \param[in] priv Caller-supplied private data (the cb_priv argument of
+ *                 aom_img_alloc_with_cb()).
+ * \param[in] size Amount of storage requested (presumably in bytes --
+ *                 confirm against the implementation).
+ */
+typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size);
+
+/*!\brief Open a descriptor, allocating storage for the underlying image by
+ * using the provided callback function.
+ *
+ * Returns a descriptor for storing an image of the given format. The storage
+ * for the image is allocated by using the provided callback function. Unlike
+ * aom_img_alloc(), the returned descriptor does not own the storage for the
+ * image. The caller is responsible for freeing the storage for the image.
+ *
+ * Note: If the callback function is invoked and succeeds,
+ * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if
+ * aom_img_alloc_with_cb() fails, the caller is assured that no storage was
+ * allocated.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image (stride).
+ * \param[in] alloc_cb Callback function used to allocate storage for the
+ * image.
+ * \param[in] cb_priv The first argument ('priv') for the callback
+ * function.
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ aom_alloc_img_data_cb_fn_t alloc_cb,
+ void *cb_priv);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_
diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c
new file mode 100644
index 0000000000..512fd28196
--- /dev/null
+++ b/third_party/aom/aom/src/aom_codec.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <assert.h>
+#include <stdarg.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_codec_internal.h"
+
+/* Returns the VERSION_PACKED value from the generated version header. */
+int aom_codec_version(void) {
+  const int packed_version = VERSION_PACKED;
+  return packed_version;
+}
+
+/* Returns the VERSION_STRING_NOSP string from the generated version header. */
+const char *aom_codec_version_str(void) {
+  const char *const version_text = VERSION_STRING_NOSP;
+  return version_text;
+}
+
+/* Returns the VERSION_EXTRA string from the generated version header. */
+const char *aom_codec_version_extra_str(void) {
+  const char *const extra_text = VERSION_EXTRA;
+  return extra_text;
+}
+
+/* Returns the interface's identification string, or a placeholder string when
+ * no interface is supplied. */
+const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
+  if (iface == NULL) return "<invalid interface>";
+  return iface->name;
+}
+
+/* Maps an error code to a static, human-readable description. */
+const char *aom_codec_err_to_string(aom_codec_err_t err) {
+  /* Used for any value that is not one of the enumerators handled below. */
+  const char *text = "Unrecognized error code";
+
+  switch (err) {
+    case AOM_CODEC_OK: text = "Success"; break;
+    case AOM_CODEC_ERROR: text = "Unspecified internal error"; break;
+    case AOM_CODEC_MEM_ERROR: text = "Memory allocation error"; break;
+    case AOM_CODEC_ABI_MISMATCH: text = "ABI version mismatch"; break;
+    case AOM_CODEC_INCAPABLE:
+      text = "Codec does not implement requested capability";
+      break;
+    case AOM_CODEC_UNSUP_BITSTREAM:
+      text = "Bitstream not supported by this decoder";
+      break;
+    case AOM_CODEC_UNSUP_FEATURE:
+      text = "Bitstream required feature not supported by this decoder";
+      break;
+    case AOM_CODEC_CORRUPT_FRAME: text = "Corrupt frame detected"; break;
+    case AOM_CODEC_INVALID_PARAM: text = "Invalid parameter"; break;
+    case AOM_CODEC_LIST_END: text = "End of iterated list"; break;
+  }
+
+  return text;
+}
+
+/* Describes the context's last error; a NULL context is reported as an
+ * invalid parameter. */
+const char *aom_codec_error(const aom_codec_ctx_t *ctx) {
+  if (ctx) {
+    return aom_codec_err_to_string(ctx->err);
+  }
+  return aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
+}
+
+/* Returns the detail string for the context's current error, or NULL when
+ * there is no context or no error recorded. The private storage's detail is
+ * used when private storage exists; otherwise the context's own copy. */
+const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx) {
+  if (ctx == NULL || !ctx->err) return NULL;
+
+  if (ctx->priv != NULL) return ctx->priv->err_detail;
+
+  return ctx->err_detail;
+}
+
+/* Calls the algorithm's destroy hook on the context's private data and clears
+ * the context's interface, name and private pointers. */
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
+  if (ctx == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  /* A context without an interface or private data cannot be destroyed. */
+  if (ctx->iface == NULL || ctx->priv == NULL) {
+    return ctx->err = AOM_CODEC_ERROR;
+  }
+
+  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
+  ctx->iface->destroy(alg_priv);
+
+  ctx->priv = NULL;
+  ctx->name = NULL;
+  ctx->iface = NULL;
+  return ctx->err = AOM_CODEC_OK;
+}
+
+/* Returns the interface's capability bits, or 0 when no interface is given. */
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
+  if (iface == NULL) return 0;
+  return iface->caps;
+}
+
+/* Dispatches a control request to the handler registered for ctrl_id in the
+ * interface's control map, forwarding the variadic arguments as a va_list.
+ * The handler's result is stored in ctx->err and returned. */
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+  if (ctx == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // A control ID of zero is never valid.
+  if (ctrl_id == 0) {
+    return ctx->err = AOM_CODEC_INVALID_PARAM;
+  }
+
+  if (ctx->iface == NULL || ctx->priv == NULL ||
+      ctx->iface->ctrl_maps == NULL) {
+    return ctx->err = AOM_CODEC_ERROR;
+  }
+
+  // Walk the (control ID, function pointer) table up to its CTRL_MAP_END
+  // sentinel looking for the requested ID.
+  aom_codec_ctrl_fn_map_t *handler = NULL;
+  for (aom_codec_ctrl_fn_map_t *cur = ctx->iface->ctrl_maps;
+       !at_ctrl_map_end(cur); ++cur) {
+    if (cur->ctrl_id == ctrl_id) {
+      handler = cur;
+      break;
+    }
+  }
+
+  if (handler == NULL) {
+    ctx->err = AOM_CODEC_ERROR;
+    ctx->priv->err_detail = "Invalid control ID";
+    return AOM_CODEC_ERROR;
+  }
+
+  va_list args;
+  va_start(args, ctrl_id);
+  ctx->err = handler->fn((aom_codec_alg_priv_t *)ctx->priv, args);
+  va_end(args);
+  return ctx->err;
+}
+
+/* Forwards a name/value option pair to the interface's set_option hook and
+ * records the hook's result in ctx->err. */
+aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name,
+                                     const char *value) {
+  if (ctx == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  /* The context must be initialized and the interface must provide the hook. */
+  if (ctx->iface == NULL || ctx->priv == NULL ||
+      ctx->iface->set_option == NULL) {
+    return ctx->err = AOM_CODEC_ERROR;
+  }
+
+  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
+  ctx->err = ctx->iface->set_option(alg_priv, name, value);
+  return ctx->err;
+}
+
+// Stores the error code in `info` and, when `fmt` is non-NULL, formats the
+// detail message into info->detail (always NUL-terminated).
+LIBAOM_FORMAT_PRINTF(3, 0)
+static void set_error(struct aom_internal_error_info *info,
+                      aom_codec_err_t error, const char *fmt, va_list ap) {
+  info->error_code = error;
+
+  if (fmt == NULL) {
+    info->has_detail = 0;
+    return;
+  }
+
+  const size_t capacity = sizeof(info->detail);
+  info->has_detail = 1;
+  vsnprintf(info->detail, capacity - 1, fmt, ap);
+  info->detail[capacity - 1] = '\0';
+}
+
+// Records the error code and formatted message in `info` without jumping.
+// Asserts that no setjmp target is armed in `info`.
+void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error,
+                   const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  set_error(info, error, fmt, args);
+  va_end(args);
+
+  assert(!info->setjmp);
+}
+
+// Records the error code and formatted message in `info`, then longjmp()s to
+// info->jmp with the error code if info->setjmp is set. Returns normally when
+// no jump target is armed.
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  set_error(info, error, fmt, args);
+  va_end(args);
+
+  if (!info->setjmp) return;
+
+  longjmp(info->jmp, info->error_code);
+}
+
+// Re-raises the error stored in `src` through aom_internal_error() on `info`.
+// `info` and `src` must be distinct objects.
+void aom_internal_error_copy(struct aom_internal_error_info *info,
+                             const struct aom_internal_error_info *src) {
+  assert(info != src);
+
+  if (src->has_detail) {
+    // Pass the stored message as data so it is never used as a format string.
+    aom_internal_error(info, src->error_code, "%s", src->detail);
+  } else {
+    aom_internal_error(info, src->error_code, NULL);
+  }
+}
+
+// ORs `value` into the flag that `corrupted` points to.
+void aom_merge_corrupted_flag(int *corrupted, int value) {
+  const int merged = *corrupted | value;
+  *corrupted = merged;
+}
+
+// Returns the enumerator name for an OBU type, or a placeholder string for
+// any value not listed below.
+const char *aom_obu_type_to_string(OBU_TYPE type) {
+  const char *name = "<Invalid OBU Type>";
+
+  switch (type) {
+    case OBU_SEQUENCE_HEADER: name = "OBU_SEQUENCE_HEADER"; break;
+    case OBU_TEMPORAL_DELIMITER: name = "OBU_TEMPORAL_DELIMITER"; break;
+    case OBU_FRAME_HEADER: name = "OBU_FRAME_HEADER"; break;
+    case OBU_REDUNDANT_FRAME_HEADER:
+      name = "OBU_REDUNDANT_FRAME_HEADER";
+      break;
+    case OBU_FRAME: name = "OBU_FRAME"; break;
+    case OBU_TILE_GROUP: name = "OBU_TILE_GROUP"; break;
+    case OBU_METADATA: name = "OBU_METADATA"; break;
+    case OBU_TILE_LIST: name = "OBU_TILE_LIST"; break;
+    case OBU_PADDING: name = "OBU_PADDING"; break;
+    default: break;
+  }
+
+  return name;
+}
diff --git a/third_party/aom/aom/src/aom_decoder.c b/third_party/aom/aom/src/aom_decoder.c
new file mode 100644
index 0000000000..49fff26352
--- /dev/null
+++ b/third_party/aom/aom/src/aom_decoder.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+/* Views the context's generic private storage as the algorithm-specific
+ * private type expected by the interface entry points. */
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  aom_codec_priv_t *const generic_priv = ctx->priv;
+  return (aom_codec_alg_priv_t *)generic_priv;
+}
+
+/* Initializes a decoder context for the given interface.
+ *
+ * Returns AOM_CODEC_ABI_MISMATCH when `ver` or the interface's ABI version
+ * does not match, AOM_CODEC_INVALID_PARAM when `ctx` or `iface` is NULL,
+ * AOM_CODEC_INCAPABLE when the interface is not a decoder, and otherwise the
+ * result of the interface's init hook. The result is also stored in ctx->err
+ * when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_dec_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t res;
+
+  /* Zero the context before any validation so that every failure path leaves
+   * it in a defined state. Otherwise a caller that passes an uninitialized
+   * context and then calls aom_codec_error_detail() or aom_codec_destroy()
+   * after an early failure would read indeterminate iface/priv pointers. */
+  if (ctx) memset(ctx, 0, sizeof(*ctx));
+
+  if (ver != AOM_DECODER_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.dec = cfg;
+
+    res = ctx->iface->init(ctx);
+    if (res) {
+      /* Keep the detail pointer reachable after the private data is gone. */
+      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+      aom_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Parses stream information from `data` without a decoder instance.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when any argument is NULL or data_sz is 0,
+ * AOM_CODEC_INCAPABLE when the interface is not a decoder or provides no
+ * peek_si hook, and otherwise the result of the interface's peek_si hook.
+ * si->w and si->h are reset to 0 before the hook is called.
+ */
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+                                           const uint8_t *data, size_t data_sz,
+                                           aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!iface || !data || !data_sz || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!(iface->caps & AOM_CODEC_CAP_DECODER) || !iface->dec.peek_si) {
+    /* An encoder-only interface has no peek_si; refuse instead of calling
+     * through a NULL function pointer. */
+    res = AOM_CODEC_INCAPABLE;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = iface->dec.peek_si(data, data_sz, si);
+  }
+
+  return res;
+}
+
+/* Retrieves stream information from an initialized decoder instance.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when `ctx` or `si` is NULL,
+ * AOM_CODEC_ERROR when the context is not initialized, AOM_CODEC_INCAPABLE
+ * when the context's interface is not a decoder or provides no get_si hook,
+ * and otherwise the result of the interface's get_si hook. The result is also
+ * stored in ctx->err when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+                                          aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!ctx || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv) {
+    res = AOM_CODEC_ERROR;
+  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_DECODER) ||
+             !ctx->iface->dec.get_si) {
+    /* E.g. a context set up by the encoder init path: its decoder hooks may
+     * be NULL, so refuse instead of calling through a NULL pointer. */
+    res = AOM_CODEC_INCAPABLE;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Passes a block of coded data to the decoder instance.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when `ctx` is NULL, AOM_CODEC_ERROR when
+ * the context is not initialized, AOM_CODEC_INCAPABLE when the context's
+ * interface is not a decoder or provides no decode hook, and otherwise the
+ * result of the interface's decode hook. The result is also stored in
+ * ctx->err when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+                                 size_t data_sz, void *user_priv) {
+  aom_codec_err_t res;
+
+  if (!ctx)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else if (!(ctx->iface->caps & AOM_CODEC_CAP_DECODER) ||
+           !ctx->iface->dec.decode)
+    /* E.g. a context set up by the encoder init path: its decoder hooks may
+     * be NULL, so refuse instead of calling through a NULL pointer. */
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Returns the next decoded frame for the iterator, or NULL when no frame is
+ * available, an argument is NULL, the context is not initialized, or the
+ * context's interface is not a decoder / provides no get_frame hook.
+ * ctx->err is not modified.
+ */
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
+  aom_image_t *img;
+
+  if (!ctx || !iter || !ctx->iface || !ctx->priv)
+    img = NULL;
+  else if (!(ctx->iface->caps & AOM_CODEC_CAP_DECODER) ||
+           !ctx->iface->dec.get_frame)
+    /* E.g. a context set up by the encoder init path: its decoder hooks may
+     * be NULL, so refuse instead of calling through a NULL pointer. */
+    img = NULL;
+  else
+    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+  return img;
+}
+
+/* Registers external frame buffer callbacks with the decoder instance.
+ *
+ * Every outcome except a NULL `ctx` is also recorded in ctx->err.
+ */
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (ctx == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  /* Both callbacks are mandatory. */
+  if (cb_get == NULL || cb_release == NULL) {
+    return ctx->err = AOM_CODEC_INVALID_PARAM;
+  }
+
+  if (ctx->iface == NULL || ctx->priv == NULL) {
+    return ctx->err = AOM_CODEC_ERROR;
+  }
+
+  if ((ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER) == 0) {
+    return ctx->err = AOM_CODEC_INCAPABLE;
+  }
+
+  return ctx->err = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get,
+                                              cb_release, cb_priv);
+}
diff --git a/third_party/aom/aom/src/aom_encoder.c b/third_party/aom/aom/src/aom_encoder.c
new file mode 100644
index 0000000000..70e0b75bcd
--- /dev/null
+++ b/third_party/aom/aom/src/aom_encoder.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include "config/aom_config.h"
+
+#if HAVE_FEXCEPT
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fenv.h>
+#endif
+
+#include <limits.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+/* Views the context's generic private storage as the algorithm-specific
+ * private type expected by the interface entry points. */
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  aom_codec_priv_t *const generic_priv = ctx->priv;
+  return (aom_codec_alg_priv_t *)generic_priv;
+}
+
+/* Initializes an encoder context for the given interface and configuration.
+ *
+ * Returns AOM_CODEC_ABI_MISMATCH for an unsupported `ver` or interface ABI,
+ * AOM_CODEC_INVALID_PARAM for NULL arguments or a high bit-depth config
+ * without AOM_CODEC_USE_HIGHBITDEPTH, AOM_CODEC_INCAPABLE when the interface
+ * lacks a requested capability, and otherwise the result of the interface's
+ * init hook. The result is also stored in ctx->err when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_enc_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t res;
+  // The value of AOM_ENCODER_ABI_VERSION in libaom v3.0.0 and v3.1.0 - v3.1.3.
+  //
+  // We are compatible with these older libaom releases. AOM_ENCODER_ABI_VERSION
+  // was incremented after these releases for two reasons:
+  // 1. AOM_ENCODER_ABI_VERSION takes contribution from
+  //    AOM_EXT_PART_ABI_VERSION. The external partition API is still
+  //    experimental, so it should not be considered as part of the stable ABI.
+  //    fd9ed8366 External partition: Define APIs
+  //    https://aomedia-review.googlesource.com/c/aom/+/135663
+  // 2. As a way to detect the presence of speeds 7-9 in all-intra mode. I (wtc)
+  //    suggested this change because I misunderstood how
+  //    AOM_ENCODER_ABI_VERSION was used.
+  //    bbdfa68d1 AllIntra: Redefine all-intra mode speed features for speed 7+
+  //    https://aomedia-review.googlesource.com/c/aom/+/140624
+  const int aom_encoder_abi_version_25 = 25;
+
+  // Zero the context before any validation so that every failure path leaves
+  // it in a defined state. Otherwise a caller that passes an uninitialized
+  // context and then calls aom_codec_error_detail() or aom_codec_destroy()
+  // after an early failure would read indeterminate iface/priv pointers, and
+  // the err_detail set in the bit-depth branch below could be masked by a
+  // stale ctx->priv.
+  if (ctx) memset(ctx, 0, sizeof(*ctx));
+
+  // TODO(bug aomedia:3228): Remove the check for aom_encoder_abi_version_25 in
+  // libaom v4.0.0.
+  if (ver != AOM_ENCODER_ABI_VERSION && ver != aom_encoder_abi_version_25)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface || !cfg)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
+    res = AOM_CODEC_INCAPABLE;
+  else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
+    res = AOM_CODEC_INCAPABLE;
+  else if ((flags & AOM_CODEC_USE_HIGHBITDEPTH) &&
+           !(iface->caps & AOM_CODEC_CAP_HIGHBITDEPTH)) {
+    res = AOM_CODEC_INCAPABLE;
+  } else if (cfg->g_bit_depth > 8 &&
+             (flags & AOM_CODEC_USE_HIGHBITDEPTH) == 0) {
+    res = AOM_CODEC_INVALID_PARAM;
+    ctx->err_detail =
+        "High bit-depth used without the AOM_CODEC_USE_HIGHBITDEPTH flag.";
+  } else {
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.enc = cfg;
+    res = ctx->iface->init(ctx);
+
+    if (res) {
+      // IMPORTANT: ctx->priv->err_detail must be null or point to a string
+      // that remains valid after ctx->priv is destroyed, such as a C string
+      // literal. This makes it safe to call aom_codec_error_detail() after
+      // aom_codec_enc_init_ver() failed.
+      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+      aom_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Copies the interface's built-in configuration for `usage` into `cfg` and
+ * resets cfg->encoder_cfg to its defaults.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when an argument is NULL or no built-in
+ * configuration matches `usage`, and AOM_CODEC_INCAPABLE when the interface
+ * is not an encoder.
+ */
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+                                             aom_codec_enc_cfg_t *cfg,
+                                             unsigned int usage) {
+  if (iface == NULL || cfg == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  if ((iface->caps & AOM_CODEC_CAP_ENCODER) == 0) return AOM_CODEC_INCAPABLE;
+
+  /* Find the first built-in configuration registered for this usage. */
+  int match = 0;
+  while (match < iface->enc.cfg_count &&
+         iface->enc.cfgs[match].g_usage != usage) {
+    ++match;
+  }
+  if (match >= iface->enc.cfg_count) return AOM_CODEC_INVALID_PARAM;
+
+  *cfg = iface->enc.cfgs[match];
+
+  /* default values */
+  memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg));
+  cfg->encoder_cfg.super_block_size = 0;  // Dynamic
+  cfg->encoder_cfg.max_partition_size = 128;
+  cfg->encoder_cfg.min_partition_size = 4;
+  cfg->encoder_cfg.disable_trellis_quant = 3;
+
+  return AOM_CODEC_OK;
+}
+
+#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+/* On X86, disable the x87 unit's internal 80 bit precision for better
+ * consistency with the SSE unit's 64 bit precision.
+ */
+#include "aom_ports/x86.h"
+#define FLOATING_POINT_SET_PRECISION \
+ unsigned short x87_orig_mode = x87_set_double_precision();
+#define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode);
+#else
+#define FLOATING_POINT_SET_PRECISION
+#define FLOATING_POINT_RESTORE_PRECISION
+#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
+
+#if HAVE_FEXCEPT && CONFIG_DEBUG
+#define FLOATING_POINT_SET_EXCEPTIONS \
+ const int float_excepts = \
+ feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW);
+#define FLOATING_POINT_RESTORE_EXCEPTIONS \
+ if (float_excepts != -1) { \
+ fedisableexcept(FE_ALL_EXCEPT); \
+ feenableexcept(float_excepts); \
+ }
+#else
+#define FLOATING_POINT_SET_EXCEPTIONS
+#define FLOATING_POINT_RESTORE_EXCEPTIONS
+#endif // HAVE_FEXCEPT && CONFIG_DEBUG
+
+/* clang-format off */
+/* FLOATING_POINT_INIT opens a `do {` block and applies the precision and
+ * exception settings; FLOATING_POINT_RESTORE undoes them and closes the block
+ * with `} while (0);`. The two macros must therefore always be used as a
+ * pair, in this order, within the same scope.
+ */
+#define FLOATING_POINT_INIT \
+ do { \
+ FLOATING_POINT_SET_PRECISION \
+ FLOATING_POINT_SET_EXCEPTIONS
+
+#define FLOATING_POINT_RESTORE \
+ FLOATING_POINT_RESTORE_EXCEPTIONS \
+ FLOATING_POINT_RESTORE_PRECISION \
+ } while (0);
+/* clang-format on */
+
+/* Submits one image (or NULL) to the encoder instance.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when `ctx` is NULL, when an image is given
+ * with a zero duration, or when the image's high bit-depth format flag does
+ * not match the flag the context was initialized with; AOM_CODEC_ERROR when
+ * the context is not initialized; AOM_CODEC_INCAPABLE when the interface is
+ * not an encoder; otherwise the result of the interface's encode hook. The
+ * result is also stored in ctx->err when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                                 aom_codec_pts_t pts, unsigned long duration,
+                                 aom_enc_frame_flags_t flags) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  if (ctx == NULL || (img != NULL && duration == 0)) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->iface == NULL || ctx->priv == NULL) {
+    res = AOM_CODEC_ERROR;
+  } else if ((ctx->iface->caps & AOM_CODEC_CAP_ENCODER) == 0) {
+    res = AOM_CODEC_INCAPABLE;
+  } else {
+    const int img_is_hbd =
+        img != NULL && (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0;
+    const int ctx_is_hbd = (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) != 0;
+
+    if (img != NULL && img_is_hbd != ctx_is_hbd) {
+      res = AOM_CODEC_INVALID_PARAM;
+    } else {
+      /* Run the encoder inside the normalized floating point environment
+       * set up by the FLOATING_POINT_* macros, where the platform needs it.
+       */
+      FLOATING_POINT_INIT
+      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags);
+      FLOATING_POINT_RESTORE
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Returns the next output packet from the encoder instance, or NULL.
+ *
+ * When `ctx` is non-NULL but a precondition fails (NULL iterator,
+ * uninitialized context, non-encoder interface), the reason is stored in
+ * ctx->err and NULL is returned. For frame packets, the compressed data may
+ * additionally be relocated into the buffer registered with
+ * aom_codec_set_cx_data_buf(); see the comments in the body.
+ */
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+ aom_codec_iter_t *iter) {
+ const aom_codec_cx_pkt_t *pkt = NULL;
+
+ if (ctx) {
+ if (!iter)
+ ctx->err = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv)
+ ctx->err = AOM_CODEC_ERROR;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else
+ pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
+ }
+
+ /* pkt can only be non-NULL here if all of the checks above passed, so ctx
+  * and ctx->priv are valid below. */
+ if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ // If the application has specified a destination area for the
+ // compressed data, and the codec has not placed the data there,
+ // and it fits, copy it.
+ aom_codec_priv_t *const priv = ctx->priv;
+ char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
+
+ if (dst_buf && pkt->data.raw.buf != dst_buf &&
+ pkt->data.raw.sz + priv->enc.cx_data_pad_before +
+ priv->enc.cx_data_pad_after <=
+ priv->enc.cx_data_dst_buf.sz) {
+ aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
+
+ // Only the payload is copied, at offset pad_before; the padding bytes
+ // themselves are not written here.
+ memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
+ pkt->data.raw.sz);
+ // Hand back a copy of the packet that points at the destination buffer
+ // and whose size includes both paddings.
+ *modified_pkt = *pkt;
+ modified_pkt->data.raw.buf = dst_buf;
+ modified_pkt->data.raw.sz +=
+ priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after;
+ pkt = modified_pkt;
+ }
+
+ // If the packet now lives at the start of the destination buffer, advance
+ // the buffer past it so the next packet is placed after this one.
+ if (dst_buf == pkt->data.raw.buf) {
+ priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
+ priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
+ }
+ }
+
+ return pkt;
+}
+
+/* Registers (or, with a NULL `buf`, clears) the application-provided
+ * destination buffer and padding used by aom_codec_get_cx_data().
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when the context is NULL or has no private
+ * storage; AOM_CODEC_OK otherwise.
+ */
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+                                          const aom_fixed_buf_t *buf,
+                                          unsigned int pad_before,
+                                          unsigned int pad_after) {
+  if (ctx == NULL || ctx->priv == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  aom_codec_priv_t *const priv = ctx->priv;
+
+  if (buf == NULL) {
+    /* No buffer: drop any previous registration, including its padding. */
+    priv->enc.cx_data_dst_buf.buf = NULL;
+    priv->enc.cx_data_dst_buf.sz = 0;
+    priv->enc.cx_data_pad_before = 0;
+    priv->enc.cx_data_pad_after = 0;
+    return AOM_CODEC_OK;
+  }
+
+  priv->enc.cx_data_dst_buf = *buf;
+  priv->enc.cx_data_pad_before = pad_before;
+  priv->enc.cx_data_pad_after = pad_after;
+  return AOM_CODEC_OK;
+}
+
+/* Returns the encoder's preview frame, or NULL.
+ *
+ * When `ctx` is non-NULL and the request cannot be served, the reason is
+ * stored in ctx->err: AOM_CODEC_ERROR for an uninitialized context,
+ * AOM_CODEC_INCAPABLE for a non-encoder interface or a missing hook.
+ */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) {
+  if (ctx == NULL) return NULL;
+
+  if (ctx->iface == NULL || ctx->priv == NULL) {
+    ctx->err = AOM_CODEC_ERROR;
+    return NULL;
+  }
+
+  if ((ctx->iface->caps & AOM_CODEC_CAP_ENCODER) == 0 ||
+      ctx->iface->enc.get_preview == NULL) {
+    ctx->err = AOM_CODEC_INCAPABLE;
+    return NULL;
+  }
+
+  return ctx->iface->enc.get_preview(get_alg_priv(ctx));
+}
+
+/* Returns the encoder's global headers buffer, or NULL.
+ *
+ * When `ctx` is non-NULL and the request cannot be served, the reason is
+ * stored in ctx->err: AOM_CODEC_ERROR for an uninitialized context,
+ * AOM_CODEC_INCAPABLE for a non-encoder interface or a missing hook.
+ */
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) {
+  if (ctx == NULL) return NULL;
+
+  if (ctx->iface == NULL || ctx->priv == NULL) {
+    ctx->err = AOM_CODEC_ERROR;
+    return NULL;
+  }
+
+  if ((ctx->iface->caps & AOM_CODEC_CAP_ENCODER) == 0 ||
+      ctx->iface->enc.get_glob_hdrs == NULL) {
+    ctx->err = AOM_CODEC_INCAPABLE;
+    return NULL;
+  }
+
+  return ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
+}
+
+/* Applies a new configuration to an initialized encoder instance.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when `ctx` or `cfg` is NULL or the context
+ * is not initialized, AOM_CODEC_INCAPABLE when the interface is not an
+ * encoder, and otherwise the result of the interface's cfg_set hook. The
+ * result is also stored in ctx->err when `ctx` is non-NULL.
+ */
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+                                         const aom_codec_enc_cfg_t *cfg) {
+  if (ctx == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  if (ctx->iface == NULL || ctx->priv == NULL || cfg == NULL) {
+    ctx->err = AOM_CODEC_INVALID_PARAM;
+  } else if ((ctx->iface->caps & AOM_CODEC_CAP_ENCODER) == 0) {
+    ctx->err = AOM_CODEC_INCAPABLE;
+  } else {
+    ctx->err = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
+  }
+
+  return ctx->err;
+}
+
+/* Appends a copy of `pkt` to the list. Returns 0 on success, or 1 when the
+ * list already holds `max` packets (the list is left unchanged). */
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list,
+                           const struct aom_codec_cx_pkt *pkt) {
+  if (list->cnt >= list->max) return 1;
+
+  list->pkts[list->cnt] = *pkt;
+  list->cnt++;
+  return 0;
+}
+
+/* Returns the packet the iterator currently refers to and advances the
+ * iterator, or returns NULL once all `cnt` packets have been visited. A NULL
+ * iterator value starts the walk at the first packet. */
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) {
+  if (*iter == NULL) {
+    *iter = list->pkts;
+  }
+
+  const aom_codec_cx_pkt_t *const cur = (const aom_codec_cx_pkt_t *)*iter;
+  const size_t index = (size_t)(cur - list->pkts);
+
+  /* Past the last stored packet: leave the iterator where it is. */
+  if (index >= list->cnt) return NULL;
+
+  *iter = cur + 1;
+  return cur;
+}
diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c
new file mode 100644
index 0000000000..8e94d5dd4f
--- /dev/null
+++ b/third_party/aom/aom/src/aom_image.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_mem/aom_mem.h"
+
+/* Rounds d up using the larger of the subsampling and size_align masks. */
+static INLINE unsigned int align_image_dimension(unsigned int d,
+                                                 unsigned int subsampling,
+                                                 unsigned int size_align) {
+  const unsigned int sub_mask = (1 << subsampling) - 1;
+  const unsigned int size_mask = size_align - 1;
+  const unsigned int mask = (size_mask > sub_mask) ? size_mask : sub_mask;
+  return (d + mask) & ~mask;
+}
+
+/* Common implementation behind aom_img_alloc(), aom_img_alloc_with_cb(),
+ * aom_img_wrap() and aom_img_alloc_with_border(). Sets up img (allocating the
+ * descriptor when img is NULL) for a d_w x d_h image of format fmt. A NULL
+ * img_data makes the helper obtain the pixel buffer from alloc_cb when it is
+ * set, else from aom_memalign(); a non-NULL img_data is used as the buffer.
+ * Returns NULL if a parameter is invalid or too large, the stride does not
+ * fit in an int, or an allocation fails. */
+static aom_image_t *img_alloc_helper(
+    aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+    unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
+    unsigned int border, unsigned char *img_data,
+    aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) {
+  /* NOTE: In this function, bit_depth is either 8 or 16 (if
+   * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */
+  unsigned int h, w, xcs, ycs, bps, bit_depth, stride_in_bytes;
+  uint64_t s; /* 64-bit so that the stride arithmetic below cannot wrap */
+
+  if (img != NULL) memset(img, 0, sizeof(aom_image_t));
+
+  /* Treat align==0 like align==1 */
+  if (!buf_align) buf_align = 1;
+  if (!stride_align) stride_align = 1;
+  if (!size_align) size_align = 1;
+
+  /* Validate alignments (each must be a power of 2) */
+  if ((buf_align & (buf_align - 1)) || (stride_align & (stride_align - 1)) ||
+      (size_align & (size_align - 1))) {
+    goto fail;
+  }
+
+  /* Impose maximum values on the inputs so that the size computations below
+   * cannot overflow: dimensions up to 2^27, alignments and border up to 2^16. */
+  if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 ||
+      stride_align > 65536 || size_align > 65536 || border > 65536) {
+    goto fail;
+  }
+
+  /* Get sample size for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
+    case AOM_IMG_FMT_I422: bps = 16; break;
+    case AOM_IMG_FMT_I444: bps = 24; break;
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: bps = 24; break;
+    case AOM_IMG_FMT_I42216: bps = 32; break;
+    case AOM_IMG_FMT_I44416: bps = 48; break;
+    default: bps = 16; break;
+  }
+
+  bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+
+  /* Get chroma shift values for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42216: xcs = 1; break;
+    default: xcs = 0; break;
+  }
+
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_NV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: ycs = 1; break;
+    default: ycs = 0; break;
+  }
+
+  /* Calculate storage sizes given the chroma subsampling */
+  w = align_image_dimension(d_w, xcs, size_align);
+  h = align_image_dimension(d_h, ycs, size_align);
+
+  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / bit_depth;
+  s = (s + 2 * border + stride_align - 1) & ~((uint64_t)stride_align - 1);
+  s = s * bit_depth / 8;
+  if (s > INT_MAX) goto fail; /* strides are signed: aom_img_flip negates them */
+  stride_in_bytes = (unsigned int)s;
+
+  /* Allocate the new image */
+  if (!img) {
+    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
+    if (!img) goto fail;
+    img->self_allocd = 1;
+  }
+
+  img->img_data = img_data;
+  if (!img_data) {
+    const uint64_t alloc_size =
+        (fmt & AOM_IMG_FMT_PLANAR)
+            ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth
+            : (uint64_t)(h + 2 * border) * stride_in_bytes;
+    const uint64_t padded_alloc_size = alloc_size + buf_align - 1;
+    if (padded_alloc_size != (size_t)padded_alloc_size) goto fail;
+    if (alloc_cb) {
+      img->img_data = (uint8_t *)alloc_cb(cb_priv, (size_t)padded_alloc_size);
+      if (img->img_data)
+        img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align);
+      img->img_data_owner = 0;
+    } else {
+      img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
+      img->img_data_owner = 1;
+    }
+    img->sz = (size_t)alloc_size;
+  }
+  if (!img->img_data) goto fail;
+
+  img->fmt = fmt;
+  img->bit_depth = bit_depth;
+  // aligned width and aligned height
+  img->w = w;
+  img->h = h;
+  img->x_chroma_shift = xcs;
+  img->y_chroma_shift = ycs;
+  img->bps = bps;
+
+  /* Calculate strides */
+  img->stride[AOM_PLANE_Y] = stride_in_bytes;
+  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
+  if (fmt == AOM_IMG_FMT_NV12) {
+    // Each row is a row of U and a row of V interleaved, so the stride is twice
+    // as long.
+    img->stride[AOM_PLANE_U] *= 2;
+    img->stride[AOM_PLANE_V] = 0;
+  }
+
+  /* Default viewport to entire image. (This aom_img_set_rect call always
+   * succeeds.) */
+  aom_img_set_rect(img, 0, 0, d_w, d_h, border);
+  return img;
+
+fail:
+  aom_img_free(img);
+  return NULL;
+}
+
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align) { /* align is used as both buf_align and stride_align */
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL,
+ NULL); /* size_align = 1, border = 0, no caller buffer, no alloc callback */
+}
+
+aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ aom_alloc_img_data_cb_fn_t alloc_cb,
+ void *cb_priv) { /* align is used as both buf_align and stride_align */
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL,
+ alloc_cb, cb_priv); /* size_align = 1, border = 0; the helper passes cb_priv to alloc_cb */
+}
+
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
+ unsigned int d_h, unsigned int stride_align,
+ unsigned char *img_data) {
+ /* Describes the existing buffer img_data rather than allocating one.
+ * buf_align = 1 is ignored by img_alloc_helper because img_data is not NULL. */
+ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data,
+ NULL, NULL);
+}
+
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ unsigned int size_align,
+ unsigned int border) { /* align is used as both buf_align and stride_align */
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border,
+ NULL, NULL, NULL); /* no caller buffer, no alloc callback */
+}
+
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
+ unsigned int w, unsigned int h, unsigned int border) { /* Selects the w x h viewport at (x, y); returns 0, or -1 if it does not fit. */
+ if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && /* x + w and y + h must not wrap */
+ y + h <= img->h) {
+ img->d_w = w;
+ img->d_h = h;
+ /* From here on x and y are offsets into the buffer, border included. */
+ x += border;
+ y += border;
+
+ /* Calculate plane pointers */
+ if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
+ img->planes[AOM_PLANE_PACKED] =
+ img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
+ } else {
+ const int bytes_per_sample =
+ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+ unsigned char *data = img->img_data;
+ /* Y plane first; data then advances past it, border rows included. */
+ img->planes[AOM_PLANE_Y] =
+ data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
+ data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y];
+ /* Chroma offsets: x, y and the border scaled down by the chroma shifts. */
+ unsigned int uv_border_h = border >> img->y_chroma_shift;
+ unsigned int uv_x = x >> img->x_chroma_shift;
+ unsigned int uv_y = y >> img->y_chroma_shift;
+ if (img->fmt == AOM_IMG_FMT_NV12) { /* single UV plane; the V pointer is unused */
+ img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample * 2 +
+ uv_y * img->stride[AOM_PLANE_U];
+ img->planes[AOM_PLANE_V] = NULL;
+ } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { /* U plane is laid out before V */
+ img->planes[AOM_PLANE_U] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
+ data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+ img->stride[AOM_PLANE_U];
+ img->planes[AOM_PLANE_V] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
+ } else { /* AOM_IMG_FMT_UV_FLIP: V plane is laid out before U */
+ img->planes[AOM_PLANE_V] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
+ data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+ img->stride[AOM_PLANE_V];
+ img->planes[AOM_PLANE_U] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
+ }
+ }
+ return 0;
+ }
+ return -1;
+}
+
+void aom_img_flip(aom_image_t *img) {
+ /* Flips the image vertically in place: each plane pointer moves to its last
+ * displayed row and the stride is negated. Note: in the pointer adjustment we
+ * want the rhs to be signed. Section 6.3.1.8 of the ISO C99 standard indicates
+ * that if the adjustment parameter is unsigned, the stride parameter will be
+ * promoted to unsigned, causing errors when the lhs is a larger type than rhs.
+ */
+ img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y];
+ img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];
+ /* The chroma planes are treated as d_h >> y_chroma_shift rows tall. */
+ img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
+ img->stride[AOM_PLANE_U];
+ img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];
+ /* V plane: same adjustment as for U. */
+ img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
+ img->stride[AOM_PLANE_V];
+ img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];
+}
+
+/* Releases img's metadata, its pixel buffer when img->img_data_owner is set,
+ * and img itself when img->self_allocd is set. A NULL img is ignored. */
+void aom_img_free(aom_image_t *img) {
+  if (img == NULL) return;
+  aom_img_remove_metadata(img);
+  if (img->img_data_owner && img->img_data) aom_free(img->img_data);
+  if (img->self_allocd) free(img);
+}
+
+/* Width of the given plane: d_w, or (d_w + 1) >> x_chroma_shift for a chroma
+ * plane (plane > 0) of a horizontally subsampled image. */
+int aom_img_plane_width(const aom_image_t *img, int plane) {
+  const int subsampled = plane > 0 && img->x_chroma_shift > 0;
+  return subsampled ? (img->d_w + 1) >> img->x_chroma_shift : img->d_w;
+}
+
+/* Height of the given plane: d_h, or (d_h + 1) >> y_chroma_shift for a chroma
+ * plane (plane > 0) of a vertically subsampled image. */
+int aom_img_plane_height(const aom_image_t *img, int plane) {
+  const int subsampled = plane > 0 && img->y_chroma_shift > 0;
+  return subsampled ? (img->d_h + 1) >> img->y_chroma_shift : img->d_h;
+}
+
+/* Returns a new metadata object owning a copy of data, or NULL on failure. */
+aom_metadata_t *aom_img_metadata_alloc(
+    uint32_t type, const uint8_t *data, size_t sz,
+    aom_metadata_insert_flags_t insert_flag) {
+  if (data == NULL || sz == 0) return NULL; /* empty payloads are rejected */
+  aom_metadata_t *const md = (aom_metadata_t *)malloc(sizeof(*md));
+  if (md != NULL) md->payload = (uint8_t *)malloc(sz);
+  if (md == NULL || md->payload == NULL) {
+    free(md); /* free(NULL) is a no-op */
+    return NULL;
+  }
+  memcpy(md->payload, data, sz);
+  md->type = type;
+  md->sz = sz;
+  md->insert_flag = insert_flag;
+  return md;
+}
+
+/* Frees metadata together with its payload; a NULL argument is ignored. */
+void aom_img_metadata_free(aom_metadata_t *metadata) {
+  if (metadata == NULL) return;
+  free(metadata->payload); /* free(NULL) is a no-op */
+  free(metadata);
+}
+
+/* Allocates a metadata array with sz zeroed slots (no slot storage when sz is
+ * 0). Returns NULL if either allocation fails. */
+aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) {
+  aom_metadata_array_t *const arr =
+      (aom_metadata_array_t *)calloc(1, sizeof(*arr));
+  if (arr == NULL || sz == 0) return arr;
+  arr->metadata_array =
+      (aom_metadata_t **)calloc(sz, sizeof(*arr->metadata_array));
+  if (arr->metadata_array == NULL) {
+    aom_img_metadata_array_free(arr);
+    return NULL;
+  }
+  arr->sz = sz;
+  return arr;
+}
+
+/* Frees every entry of arr, its slot storage and arr itself; NULL is ignored. */
+void aom_img_metadata_array_free(aom_metadata_array_t *arr) {
+  if (arr == NULL) return;
+  if (arr->metadata_array != NULL) {
+    aom_metadata_t **slot = arr->metadata_array;
+    aom_metadata_t **const end = slot + arr->sz;
+    while (slot != end) aom_img_metadata_free(*slot++);
+    free(arr->metadata_array);
+  }
+  free(arr);
+}
+
+/* Appends a new metadata entry (a copy of the sz bytes at data) to img,
+ * creating img's metadata array on first use. Returns 0 on success, or -1 if
+ * img is NULL, the entry cannot be created, or the array cannot grow. */
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+                         size_t sz, aom_metadata_insert_flags_t insert_flag) {
+  if (img == NULL) return -1;
+  if (img->metadata == NULL) img->metadata = aom_img_metadata_array_alloc(0);
+  aom_metadata_array_t *const list = img->metadata;
+  if (list == NULL) return -1;
+  aom_metadata_t *const entry =
+      aom_img_metadata_alloc(type, data, sz, insert_flag);
+  if (entry == NULL) return -1;
+  aom_metadata_t **const slots = (aom_metadata_t **)realloc(
+      list->metadata_array, (list->sz + 1) * sizeof(entry));
+  if (slots == NULL) {
+    aom_img_metadata_free(entry); /* the existing array is left untouched */
+    return -1;
+  }
+  slots[list->sz++] = entry;
+  list->metadata_array = slots;
+  return 0;
+}
+
+/* Frees img's metadata array, if any, and resets img->metadata to NULL. */
+void aom_img_remove_metadata(aom_image_t *img) {
+  if (img == NULL || img->metadata == NULL) return;
+  aom_img_metadata_array_free(img->metadata);
+  img->metadata = NULL;
+}
+
+/* Returns img's metadata entry at index, or NULL when img is NULL, carries no
+ * metadata, or index is out of range. */
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+                                           size_t index) {
+  const aom_metadata_array_t *const list = img ? img->metadata : NULL;
+
+  if (list == NULL || index >= list->sz) return NULL;
+  return list->metadata_array[index];
+}
+
+/* Number of metadata entries attached to img; 0 for a NULL img or no array. */
+size_t aom_img_num_metadata(const aom_image_t *img) {
+  return (img != NULL && img->metadata != NULL) ? img->metadata->sz : 0;
+}
diff --git a/third_party/aom/aom/src/aom_integer.c b/third_party/aom/aom/src/aom_integer.c
new file mode 100644
index 0000000000..7edfd0de87
--- /dev/null
+++ b/third_party/aom/aom/src/aom_integer.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+static const size_t kMaximumLeb128Size = 8; // Longest encoding, in bytes, that is read or written
+static const uint8_t kLeb128ByteMask = 0x7f; // Binary: 01111111 (the 7 value bits of a byte)
+
+// Disallow values larger than 32-bits to ensure consistent behavior on 32 and
+// 64 bit targets: value is typically used to determine buffer allocation size
+// when decoded.
+static const uint64_t kMaximumLeb128Value = UINT32_MAX;
+
+/* Bytes needed to LEB128-encode value: one per started 7-bit group, with a
+ * minimum of one byte for value == 0. */
+size_t aom_uleb_size_in_bytes(uint64_t value) {
+  size_t bytes = 1;
+  for (uint64_t rest = value >> 7; rest != 0; rest >>= 7) ++bytes;
+  return bytes;
+}
+
+/* Decodes one unsigned LEB128 value from the first bytes of buffer, reading at
+ * most min(available, kMaximumLeb128Size) bytes. On success *value holds the
+ * result, *length (if non-NULL) the number of bytes consumed, and 0 is
+ * returned. Returns -1 if buffer or value is NULL, no terminating byte is
+ * found in range, or the decoded value exceeds kMaximumLeb128Value. */
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+                    size_t *length) {
+  if (buffer == NULL || value == NULL) return -1;
+
+  const size_t limit =
+      (available < kMaximumLeb128Size) ? available : kMaximumLeb128Size;
+  *value = 0;
+  for (size_t i = 0; i < limit; ++i) {
+    const uint8_t byte = buffer[i];
+    *value |= (uint64_t)(byte & kLeb128ByteMask) << (7 * i);
+    if (byte & 0x80) continue; // Continuation bit set: more bytes follow.
+
+    if (length != NULL) *length = i + 1;
+    // Fail on values larger than 32 bits to ensure consistent behavior on
+    // 32 and 64 bit targets: value is typically used to determine buffer
+    // allocation size.
+    return (*value > kMaximumLeb128Value) ? -1 : 0;
+  }
+  // Ran out of readable bytes before reaching the terminating byte.
+  return -1;
+}
+
+/* Writes value to coded_value as unsigned LEB128 and stores the byte count in
+ * *coded_size. Returns 0 on success, or -1 (writing nothing) when value
+ * exceeds kMaximumLeb128Value, does not fit in `available` bytes, or an
+ * output pointer is NULL. */
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+                    size_t *coded_size) {
+  const size_t needed = aom_uleb_size_in_bytes(value);
+  if (coded_value == NULL || coded_size == NULL) return -1;
+  if (value > kMaximumLeb128Value || needed > kMaximumLeb128Size) return -1;
+  if (needed > available) return -1;
+
+  uint8_t *out = coded_value;
+  for (size_t left = needed; left > 0; --left, value >>= 7) {
+    // Every byte but the last carries the continuation bit.
+    *out++ = (uint8_t)((value & 0x7f) | (left > 1 ? 0x80 : 0));
+  }
+
+  *coded_size = needed;
+  return 0;
+}
+
+/* Encodes value as unsigned LEB128 using exactly pad_to_size bytes (the
+ * continuation bit is set on every byte except the last) and stores
+ * pad_to_size in *coded_size. Returns 0 on success, or -1 when value exceeds
+ * kMaximumLeb128Value or does not fit in pad_to_size bytes, pad_to_size
+ * exceeds `available` or kMaximumLeb128Size, or an output pointer is NULL. */
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+                               size_t pad_to_size, uint8_t *coded_value,
+                               size_t *coded_size) {
+  if (coded_value == NULL || coded_size == NULL) return -1;
+  if (value > kMaximumLeb128Value) return -1;
+  if (pad_to_size > available || pad_to_size > kMaximumLeb128Size) return -1;
+
+  // pad_to_size bytes carry 7 * pad_to_size value bits in total.
+  if (value >= (1ULL << (7 * pad_to_size))) return -1;
+
+  for (size_t i = 0; i < pad_to_size; ++i, value >>= 7) {
+    const int is_last = (i + 1 == pad_to_size);
+    // All bytes except the final one carry the continuation bit.
+    coded_value[i] = (uint8_t)((value & 0x7f) | (is_last ? 0 : 0x80));
+  }
+
+  // Every value bit must have been emitted by now.
+  assert(value == 0);
+
+  *coded_size = pad_to_size;
+  return 0;
+}
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
new file mode 100644
index 0000000000..254f6401c7
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int acc = 0, k = 0; /* acc: sum of a[k] * b[k] over the SUBPEL_TAPS taps */
+  for (; k < SUBPEL_TAPS; ++k) acc += b[k] * a[k];
+  return acc;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+                                      const int16_t *b) {
+  int acc = 0, k = 0; /* acc: sum of a[k * a_stride] * b[k] over the taps */
+  for (; k < SUBPEL_TAPS; ++k) acc += b[k] * a[k * a_stride];
+  return acc;
+}
+
+/* Horizontal filter pass over h rows. For each of the w outputs of a row a
+ * position starts at x0_q4 and advances by x_step_q4: its integer part
+ * (position >> SUBPEL_BITS) selects the source samples and its fractional
+ * part (position & SUBPEL_MASK) selects the kernel from x_filters. */
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  const uint8_t *row = src - (SUBPEL_TAPS / 2 - 1); /* column of the first tap */
+  for (int y = 0; y < h; ++y, row += src_stride, dst += dst_stride) {
+    for (int x = 0, pos_q4 = x0_q4; x < w; ++x, pos_q4 += x_step_q4) {
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      const int sum =
+          horz_scalar_product(row + (pos_q4 >> SUBPEL_BITS), kernel);
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+
+/* Vertical filter pass over w columns. For each of the h outputs of a column
+ * a position starts at y0_q4 and advances by y_step_q4: its integer part
+ * (position >> SUBPEL_BITS) selects the first source row and its fractional
+ * part (position & SUBPEL_MASK) selects the kernel from y_filters. Output
+ * pixels are written down the column, dst_stride apart. */
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  const uint8_t *col = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (int x = 0; x < w; ++x, ++col, ++dst) {
+    for (int y = 0, pos_q4 = y0_q4; y < h; ++y, pos_q4 += y_step_q4) {
+      const uint8_t *const top = col + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      const int sum = vert_scalar_product(top, src_stride, kernel);
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // Clears the low 8 address bits. NOTE: assumes a 256-byte aligned filter table.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base); // distance of f from base, in InterpKernel units
+}
+
+/* Horizontal-only convolution. The kernel table and the starting position
+ * within it are both recovered from the filter_x pointer. */
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4, int w,
+                           int h) {
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+
+  (void)filter_y;
+  (void)y_step_q4;
+  convolve_horiz(src, src_stride, dst, dst_stride, kernels,
+                 get_filter_offset(filter_x, kernels), x_step_q4, w, h);
+}
+
+/* Vertical-only convolution. The kernel table and the starting position
+ * within it are both recovered from the filter_y pointer. */
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4, int w,
+                          int h) {
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  convolve_vert(src, src_stride, dst, dst_stride, kernels,
+                get_filter_offset(filter_y, kernels), y_step_q4, w, h);
+}
+
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ // Two-pass 2-D convolution through a fixed-size intermediate buffer, temp:
+ // (1) Interpolate horizontally into temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Note: the fixed size of temp places limits on the parameters.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // When calling in frame scaling function, the smallest scaling factor is x1/4
+ // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+ // big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ // Parameter limits assumed by the sizing of temp (see above).
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+ // Pass 1 starts SUBPEL_TAPS / 2 - 1 rows early; pass 2 skips them in temp.
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
+}
+
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) { /* forwards every argument unchanged to aom_convolve8_c() */
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
+
+/* Copies h rows of w bytes from src to dst, stepping the pointers by their
+ * strides after each row. Each row is moved with memmove(). */
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, int w, int h) {
+  for (int row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
+    memmove(dst, src, w);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int acc = 0, k = 0; /* acc: sum of a[k * a_stride] * b[k] over the taps */
+  for (; k < SUBPEL_TAPS; ++k) acc += b[k] * a[k * a_stride];
+  return acc;
+}
+
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int acc = 0, k = 0; /* acc: sum of a[k] * b[k] over the SUBPEL_TAPS taps */
+  for (; k < SUBPEL_TAPS; ++k) acc += b[k] * a[k];
+  return acc;
+}
+
+/* Horizontal filter pass on 16-bit samples; src8 and dst8 are converted with
+ * CONVERT_TO_SHORTPTR first. For each of the w outputs of a row a position
+ * starts at x0_q4 and advances by x_step_q4, selecting the source samples
+ * (position >> SUBPEL_BITS) and the kernel (position & SUBPEL_MASK). */
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const InterpKernel *x_filters, int x0_q4,
+                                  int x_step_q4, int w, int h, int bd) {
+  const uint16_t *row = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  row -= SUBPEL_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y, row += src_stride, out += dst_stride) {
+    for (int x = 0, pos_q4 = x0_q4; x < w; ++x, pos_q4 += x_step_q4) {
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      const int sum =
+          highbd_horz_scalar_product(row + (pos_q4 >> SUBPEL_BITS), kernel);
+      out[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
+/* Vertical filter pass on 16-bit samples; src8 and dst8 are converted with
+ * CONVERT_TO_SHORTPTR first. For each of the h outputs of a column a position
+ * starts at y0_q4 and advances by y_step_q4, selecting the first source row
+ * (position >> SUBPEL_BITS) and the kernel (position & SUBPEL_MASK). */
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
+                                 const InterpKernel *y_filters, int y0_q4,
+                                 int y_step_q4, int w, int h, int bd) {
+  const uint16_t *col = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  col -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (int x = 0; x < w; ++x, ++col, ++out) {
+    for (int y = 0, pos_q4 = y0_q4; y < h; ++y, pos_q4 += y_step_q4) {
+      const uint16_t *const top = col + (pos_q4 >> SUBPEL_BITS) * src_stride;
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      const int sum = highbd_vert_scalar_product(top, src_stride, kernel);
+      out[y * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+    }
+  }
+}
+
+/* Horizontal-only high-bitdepth convolution. The kernel table and the
+ * starting position within it are both recovered from the filter_x pointer. */
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_x);
+  (void)filter_y;
+  (void)y_step_q4;
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, kernels,
+                        get_filter_offset(filter_x, kernels), x_step_q4, w, h, bd);
+}
+
+/* Vertical-only high-bitdepth convolution. The kernel table and the starting
+ * position within it are both recovered from the filter_y pointer. */
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 int h, int bd) {
+  const InterpKernel *const kernels = get_filter_base(filter_y);
+  (void)filter_x;
+  (void)x_step_q4;
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, kernels,
+                       get_filter_offset(filter_y, kernels), y_step_q4, w, h, bd);
+}
+
+/* Copies h rows of w 16-bit samples from src to dst; strides are in samples. */
+void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                int h) {
+  const size_t row_bytes = w * sizeof(src[0]);
+  for (int y = 0; y < h; ++y, src += src_stride, dst += dst_stride) {
+    memmove(dst, src, row_bytes);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
new file mode 100644
index 0000000000..653f690741
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -0,0 +1,510 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
+ return()
+endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
+set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
+
+list(APPEND AOM_DSP_COMMON_SOURCES
+ "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+ "${AOM_ROOT}/aom_dsp/aom_filter.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+ "${AOM_ROOT}/aom_dsp/blend.h"
+ "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+ "${AOM_ROOT}/aom_dsp/entcode.c"
+ "${AOM_ROOT}/aom_dsp/entcode.h"
+ "${AOM_ROOT}/aom_dsp/fft.c"
+ "${AOM_ROOT}/aom_dsp/fft_common.h"
+ "${AOM_ROOT}/aom_dsp/grain_params.h"
+ "${AOM_ROOT}/aom_dsp/intrapred.c"
+ "${AOM_ROOT}/aom_dsp/intrapred_common.h"
+ "${AOM_ROOT}/aom_dsp/loopfilter.c"
+ "${AOM_ROOT}/aom_dsp/odintrin.c"
+ "${AOM_ROOT}/aom_dsp/odintrin.h"
+ "${AOM_ROOT}/aom_dsp/prob.h"
+ "${AOM_ROOT}/aom_dsp/recenter.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/subtract.c"
+ "${AOM_ROOT}/aom_dsp/txfm_common.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_utils.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
+if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
+endif()
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_DSP_DECODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader.c"
+ "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c"
+ "${AOM_ROOT}/aom_dsp/entdec.h")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ list(APPEND AOM_DSP_ENCODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/avg.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter.h"
+ "${AOM_ROOT}/aom_dsp/blk_sse_sum.c"
+ "${AOM_ROOT}/aom_dsp/entenc.c"
+ "${AOM_ROOT}/aom_dsp/entenc.h"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.h"
+ "${AOM_ROOT}/aom_dsp/noise_model.c"
+ "${AOM_ROOT}/aom_dsp/noise_model.h"
+ "${AOM_ROOT}/aom_dsp/noise_util.c"
+ "${AOM_ROOT}/aom_dsp/noise_util.h"
+ "${AOM_ROOT}/aom_dsp/psnr.c"
+ "${AOM_ROOT}/aom_dsp/psnr.h"
+ "${AOM_ROOT}/aom_dsp/quantize.c"
+ "${AOM_ROOT}/aom_dsp/quantize.h"
+ "${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/sad_av1.c"
+ "${AOM_ROOT}/aom_dsp/sse.c"
+ "${AOM_ROOT}/aom_dsp/ssim.c"
+ "${AOM_ROOT}/aom_dsp/ssim.h"
+ "${AOM_ROOT}/aom_dsp/sum_squares.c"
+ "${AOM_ROOT}/aom_dsp/variance.c"
+ "${AOM_ROOT}/aom_dsp/variance.h")
+
+ # Flow estimation library
+ if(NOT CONFIG_REALTIME_ONLY)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/pyramid.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_detect.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_match.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/disflow.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/flow_estimation.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/ransac.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_sse4.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
+ endif()
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
+ "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
+ "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/masked_sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c")
+
+ if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SVE
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c")
+ endif()
+
+ if(CONFIG_INTERNAL_STATS)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
+ "${AOM_ROOT}/aom_dsp/psnrhvs.c")
+ endif()
+
+ if(CONFIG_TUNE_VMAF)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c"
+ "${AOM_ROOT}/aom_dsp/vmaf.h")
+ endif()
+
+ if(CONFIG_TUNE_BUTTERAUGLI)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/butteraugli.c"
+ "${AOM_ROOT}/aom_dsp/butteraugli.h")
+ endif()
+
+ if(CONFIG_REALTIME_ONLY)
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
+ endif()
+endif()
+
+# Creates aom_dsp build targets. Must not be called until after libaom target
+# has been created.
+function(setup_aom_dsp_targets)
+ add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) # Object library built from AOM_DSP_COMMON_SOURCES.
+ list(APPEND AOM_LIB_TARGETS aom_dsp_common)
+ create_no_op_source_file("aom_av1" "c" "no_op_source_file") # NOTE(review): helper defined elsewhere; presumably stores a placeholder source path in no_op_source_file - confirm.
+ add_library(aom_dsp OBJECT "${no_op_source_file}") # aom_dsp is created from that single placeholder source.
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>) # Add the common objects to the aom target.
+ if(BUILD_SHARED_LIBS) # aom_static gets the same objects when BUILD_SHARED_LIBS is set.
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+ endif()
+ list(APPEND AOM_LIB_TARGETS aom_dsp)
+
+ # Not all generators support libraries consisting only of object files. Add a
+ # source file to the aom_dsp target.
+ add_no_op_source_file_to_target("aom_dsp" "c")
+
+ if(CONFIG_AV1_DECODER) # Decoder-only DSP sources.
+ add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER) # Encoder-only DSP sources.
+ add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+ endif()
+ if(CONFIG_TUNE_VMAF) # The encoder objects need the VMAF headers in this configuration.
+ target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+ endif()
+ endif()
+
+ if(HAVE_SSE2) # SSE2: one assembly library plus intrinsics compiled with -msse2.
+ add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE2")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64") # Fold the x86_64-only assembly into the SSE2 encoder list.
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64})
+ endif()
+ add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE2")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3) # SSSE3: same layout as the SSE2 section above.
+ add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSSE3")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64") # The SSSE3 encoder assembly list is only populated on x86_64.
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
+ ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+ endif()
+ add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSSE3")
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1) # SSE4.1: intrinsics only, no assembly library.
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE4_1")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE4_1")
+ endif()
+ endif()
+
+ if(HAVE_AVX) # AVX: encoder intrinsics only.
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX")
+ endif()
+ endif()
+
+ if(HAVE_AVX2) # AVX2: common and encoder intrinsics.
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_AVX2")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX2")
+ endif()
+ endif()
+
+ if(HAVE_NEON) # Arm Neon: the compile flag comes from AOM_NEON_INTRIN_FLAG.
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON")
+ endif()
+ endif()
+
+ if(HAVE_NEON_DOTPROD) # Neon dot-product extension: common and encoder intrinsics.
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+ endif()
+ endif()
+
+ if(HAVE_NEON_I8MM) # Neon I8MM extension: common intrinsics only.
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
+ if(HAVE_SVE) # SVE: encoder intrinsics only.
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SVE")
+ endif()
+ endif()
+
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>) # Add the aom_dsp object library itself to the aom target.
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+ endif()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
new file mode 100644
index 0000000000..85dc0052e2
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) // Each argument may be evaluated twice; avoid side effects.
+#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) // Each argument may be evaluated twice; avoid side effects.
+#define AOMSIGN(x) ((x) < 0 ? -1 : 0) // -1 when x is negative, otherwise 0 (never +1).
+
+#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0])) // Element count as int; x must be an array, not a pointer.
+
+#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
+
+#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) // Note: also evaluates true for x == 0.
+
+/* Left shifting a negative value became undefined behavior in C99 (downgraded
+ from merely implementation-defined in C89). This should still compile to the
+ correct thing on any two's-complement machine, but avoid ubsan warnings.*/
+#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
+
+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+typedef uint8_t qm_val_t;
+#define AOM_QM_BITS 5
+
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
+// Saturates val to the 8-bit pixel range [0, 255].
+static INLINE uint8_t clip_pixel(int val) {
+  if (val > 255) return 255;
+  if (val < 0) return 0;
+  return (uint8_t)val;
+}
+
+// Limits value to the closed interval [low, high]. The lower bound is
+// checked first, so low wins if the bounds are inverted and value < low.
+static INLINE int clamp(int value, int low, int high) {
+  if (value < low) return low;
+  if (value > high) return high;
+  return value;
+}
+
+// 64-bit variant of clamp(): limits value to the closed interval [low, high],
+// checking the lower bound first.
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+  if (value < low) return low;
+  if (value > high) return high;
+  return value;
+}
+
+// Floating-point variant of clamp(): limits value to [low, high], checking
+// the lower bound first. If neither comparison is true (e.g. value is NaN),
+// value is returned unchanged.
+static INLINE double fclamp(double value, double low, double high) {
+  if (value < low) return low;
+  if (value > high) return high;
+  return value;
+}
+
+// Saturates val to the pixel range of the given bit depth: [0, 1023] for
+// bd == 10 and [0, 4095] for bd == 12. Every other bd value, including 8,
+// is handled as 8-bit and clamps to [0, 255].
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  int max_pixel = 255;
+  if (bd == 10) {
+    max_pixel = 1023;
+  } else if (bd == 12) {
+    max_pixel = 4095;
+  }
+  return (uint16_t)clamp(val, 0, max_pixel);
+}
+
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken: right shifting a negative value of a signed type is
+// implementation-defined in C, so this relies on an arithmetic shift.
+static INLINE unsigned int negative_to_zero(int value) {
+  // Shift the sign bit across the whole word: with a sign-extending shift
+  // this is all ones for a negative value and zero otherwise.
+  const int sign_fill = value >> (sizeof(value) * 8 - 1);
+  // Masking with the complement clears negative inputs and keeps the rest.
+  return value & ~sign_fill;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 0000000000..0265dd1ee5
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+// Sets up the aom_dsp RTCD state by handing setup_rtcd_internal() to
+// aom_once(). NOTE(review): aom_once() presumably runs the callback a single
+// time no matter how often this is called - confirm in aom_ports/aom_once.h.
+void aom_dsp_rtcd(void) {
+  aom_once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100755
index 0000000000..4b49605e53
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1798 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub aom_dsp_forward_decls() { # Prints the C banner comment and #include lines of the heredoc below.
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/; # Passes the sub's name to forward_decls. NOTE(review): forward_decls is not defined in the visible part of this file.
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+@block_widths = (4, 8, 16, 32, 64, 128);
+
+@encoder_block_sizes = ();
+foreach $w (@block_widths) {
+ foreach $h (@block_widths) {
+ push @encoder_block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w); # Keep square sizes and 2:1 / 1:2 rectangles only.
+ }
+}
+
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { # Unless CONFIG_REALTIME_ONLY is "yes", also add these 4:1 / 1:4 sizes.
+ push @encoder_block_sizes, [4, 16];
+ push @encoder_block_sizes, [16, 4];
+ push @encoder_block_sizes, [8, 32];
+ push @encoder_block_sizes, [32, 8];
+ push @encoder_block_sizes, [16, 64];
+ push @encoder_block_sizes, [64, 16];
+}
+
+@tx_dims = (4, 8, 16, 32, 64);
+@tx_sizes = ();
+foreach $w (@tx_dims) {
+ push @tx_sizes, [$w, $w]; # Square size for this dimension.
+ foreach $h (@tx_dims) {
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); # 2:1 and 1:2 rectangles.
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w)); # 4:1 and 1:4 rectangles.
+ }
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
+
+#
+# Intra prediction
+#
+
+foreach (@tx_sizes) {
+ ($w, $h) = @$_;
+ foreach $pred_name (@pred_names) {
+ add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+ "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+ "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ }
+ }
+}
+
+specialize qw/aom_dc_top_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_v_predictor_4x4 neon sse2/;
+specialize qw/aom_v_predictor_4x8 neon sse2/;
+specialize qw/aom_v_predictor_4x16 neon sse2/;
+specialize qw/aom_v_predictor_8x4 neon sse2/;
+specialize qw/aom_v_predictor_8x8 neon sse2/;
+specialize qw/aom_v_predictor_8x16 neon sse2/;
+specialize qw/aom_v_predictor_8x32 neon sse2/;
+specialize qw/aom_v_predictor_16x4 neon sse2/;
+specialize qw/aom_v_predictor_16x8 neon sse2/;
+specialize qw/aom_v_predictor_16x16 neon sse2/;
+specialize qw/aom_v_predictor_16x32 neon sse2/;
+specialize qw/aom_v_predictor_16x64 neon sse2/;
+specialize qw/aom_v_predictor_32x8 neon sse2/;
+specialize qw/aom_v_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_h_predictor_4x4 neon sse2/;
+specialize qw/aom_h_predictor_4x8 neon sse2/;
+specialize qw/aom_h_predictor_4x16 neon sse2/;
+specialize qw/aom_h_predictor_8x4 neon sse2/;
+specialize qw/aom_h_predictor_8x8 neon sse2/;
+specialize qw/aom_h_predictor_8x16 neon sse2/;
+specialize qw/aom_h_predictor_8x32 neon sse2/;
+specialize qw/aom_h_predictor_16x4 neon sse2/;
+specialize qw/aom_h_predictor_16x8 neon sse2/;
+specialize qw/aom_h_predictor_16x16 neon sse2/;
+specialize qw/aom_h_predictor_16x32 neon sse2/;
+specialize qw/aom_h_predictor_16x64 neon sse2/;
+specialize qw/aom_h_predictor_32x8 neon sse2/;
+specialize qw/aom_h_predictor_32x16 neon sse2/;
+specialize qw/aom_h_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_h_predictor_32x64 neon sse2/;
+specialize qw/aom_h_predictor_64x16 neon sse2/;
+specialize qw/aom_h_predictor_64x32 neon sse2/;
+specialize qw/aom_h_predictor_64x64 neon sse2/;
+
+specialize qw/aom_paeth_predictor_4x4 ssse3 neon/;  # aom_paeth_predictor_*: every size lists ssse3 and neon; avx2 is listed only where noted below
+specialize qw/aom_paeth_predictor_4x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_4x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x32 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x8 ssse3 avx2 neon/;  # first size with avx2
+specialize qw/aom_paeth_predictor_16x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x8 ssse3 neon/;  # no avx2 for 32x8
+specialize qw/aom_paeth_predictor_32x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2 neon/;
+
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;  # aom_smooth_predictor_*: all 19 sizes list exactly neon and ssse3
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/;  # aom_smooth_v_predictor_*: all 19 sizes list exactly neon and ssse3
+specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 neon ssse3/;  # aom_smooth_h_predictor_*: all 19 sizes list exactly neon and ssse3
+specialize qw/aom_smooth_h_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/;
+
+# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+# by multiply and shift.
+specialize qw/aom_dc_predictor_4x4 neon sse2/;  # aom_dc_predictor_*: every size lists neon and sse2; avx2 is listed only where noted below
+specialize qw/aom_dc_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/;  # avx2 is listed from here to the end of the group
+specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/;  # the 64-wide entries appear as 64x64, 64x32, 64x16 (descending height), unlike the other groups
+specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {  # the aom_highbd_*_predictor specializations below run only when this config value is "yes"
+ specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/;  # v group: neon on every size; sse2 only on 4x4, 4x8, 8x4, 8x8, 8x16, 16x8, 16x16, 16x32, 32x16, 32x32
+ specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_4x16 neon/;
+ specialize qw/aom_highbd_v_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x32 neon/;
+ specialize qw/aom_highbd_v_predictor_16x4 neon/;
+ specialize qw/aom_highbd_v_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x64 neon/;
+ specialize qw/aom_highbd_v_predictor_32x8 neon/;
+ specialize qw/aom_highbd_v_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_32x64 neon/;
+ specialize qw/aom_highbd_v_predictor_64x16 neon/;
+ specialize qw/aom_highbd_v_predictor_64x32 neon/;
+ specialize qw/aom_highbd_v_predictor_64x64 neon/;
+
+ # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+ # by multiply and shift.
+ specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;  # dc group: same sse2/neon size pattern as the v group
+ specialize qw/aom_highbd_dc_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x64 neon/;
+ # aom_highbd_h_predictor_*: same sse2/neon size pattern as the v and dc groups.
+ specialize qw/aom_highbd_h_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_4x16 neon/;
+ specialize qw/aom_highbd_h_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x32 neon/;
+ specialize qw/aom_highbd_h_predictor_16x4 neon/;
+ specialize qw/aom_highbd_h_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x64 neon/;
+ specialize qw/aom_highbd_h_predictor_32x8 neon/;
+ specialize qw/aom_highbd_h_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_32x64 neon/;
+ specialize qw/aom_highbd_h_predictor_64x16 neon/;
+ specialize qw/aom_highbd_h_predictor_64x32 neon/;
+ specialize qw/aom_highbd_h_predictor_64x64 neon/;
+ # aom_highbd_dc_128_predictor_*: same sse2/neon size pattern as the groups above.
+ specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x64 neon/;
+ # aom_highbd_dc_left_predictor_*: same sse2/neon size pattern as the groups above.
+ specialize qw/aom_highbd_dc_left_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x64 neon/;
+ # aom_highbd_dc_top_predictor_*: same sse2/neon size pattern as the groups above.
+ specialize qw/aom_highbd_dc_top_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x64 neon/;
+ # aom_highbd_paeth_predictor_*: from here to the closing brace every entry lists neon only.
+ specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_4x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x64 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x64 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x64 neon/;
+ # aom_highbd_smooth_predictor_*: neon only.
+ specialize qw/aom_highbd_smooth_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x64 neon/;
+ # aom_highbd_smooth_v_predictor_*: neon only.
+ specialize qw/aom_highbd_smooth_v_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x64 neon/;
+ # aom_highbd_smooth_h_predictor_*: neon only.
+ specialize qw/aom_highbd_smooth_h_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x64 neon/;
+}  # end of the CONFIG_AV1_HIGHBITDEPTH intra predictor specializations
+#
+# Sub Pixel Filters
+# Prototypes taking uint8_t src/dst buffers, followed by their specializations.
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";  # no specialize line for aom_convolve8 appears in this section
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
+add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/aom_convolve_copy neon sse2 avx2/;
+specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";  # $avx2_ssse3 is assigned outside this chunk; presumably an extra arch name or empty - TODO confirm
+specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";  # same parameter list as aom_convolve8
+specialize qw/aom_scaled_2d ssse3 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {  # aom_highbd_convolve_* entries are declared only when this config value is "yes"
+ add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
+ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";  # NOTE(review): src/dst are uint8_t* here but uint16_t* in aom_highbd_convolve_copy - presumably intentional; confirm
+ specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";  # same parameter list as the horiz variant, including the trailing bd argument
+ specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
+}
+
+#
+# Loopfilter
+# Each aom_lpf_* prototype is followed by its specialization. The plain and _quad variants take one blimit/limit/thresh set, the _dual variants take two. Every entry lists sse2 and neon; avx2 appears only where noted.
+add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_14_quad avx2 sse2 neon/;  # the only vertical entry in this section that lists avx2
+
+add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_8_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_4_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_14_quad sse2 avx2 neon/;  # lists avx2
+
+add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_6_quad sse2 avx2 neon/;  # lists avx2
+
+add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_8_quad sse2 avx2 neon/;  # lists avx2
+
+add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_4_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";  # vertical_6 _dual/_quad are declared here, after the horizontal entries, rather than next to aom_lpf_vertical_6
+specialize qw/aom_lpf_vertical_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_6_quad sse2 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {  # aom_highbd_lpf_* entries: uint16_t *s plus a trailing int bd; all list neon and sse2, and the 14/8/4 _dual variants also list avx2
+ add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_14 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_14_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_6 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_6_dual neon sse2/;  # no avx2 for the 6-tap dual variant
+
+ add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_14 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd";
+ specialize qw/aom_highbd_lpf_horizontal_14_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_6 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_6_dual neon sse2/;  # no avx2 for the 6-tap dual variant
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4_dual neon sse2 avx2/;
+}
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+# Everything in this section is declared only when CONFIG_AV1_ENCODER is "yes".
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 neon sse2/;
+
+ add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride";  # differs from aom_fdct4x4 only in the output type (int16_t instead of tran_low_t)
+ specialize qw/aom_fdct4x4_lp neon sse2/;
+
+ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes"){
+ # 8x8 DCT transform for psnr-hvs. Unlike other transforms, it isn't compatible
+ # with AV1 scan orders because it does two transposes.
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64";  # $ssse3_x86_64 is assigned outside this chunk; presumably an extra arch name or empty - TODO confirm
+ # High bit depth
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
+ }
+ }
+ # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+ add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";  # no specialize line follows the 2x2 FFT
+
+ add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft4x4_float sse2/;
+
+ add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft32x32_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";  # no specialize line follows the 2x2 IFFT
+
+ add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft4x4_float sse2/;
+
+ add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft32x32_float avx2 sse2/;
+} # CONFIG_AV1_ENCODER
+
+#
+# Quantization
+# Declared only when CONFIG_AV1_ENCODER is "yes"; all six prototypes below share one parameter list.
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b sse2 neon avx avx2/, "$ssse3_x86_64";  # $ssse3_x86_64 is assigned outside this chunk; presumably an extra arch name or empty - TODO confirm
+
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32 neon avx avx2/, "$ssse3_x86_64";  # unlike aom_quantize_b, no sse2 entry
+
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_64x64 neon ssse3 avx2/;  # lists ssse3 directly rather than via $ssse3_x86_64
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {  # the *_adaptive quantizers are skipped when CONFIG_REALTIME_ONLY is "yes"
+ add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_adaptive sse2 avx2/;
+
+ add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32_adaptive sse2/;
+
+ add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_64x64_adaptive sse2/;
+ }
+} # CONFIG_AV1_ENCODER
+
+# High bit-depth quantizers. Declared only when both the encoder and high
+# bit-depth support are configured.
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ # Every prototype in this group uses the same argument list.
+ my $highbd_quantize_args = "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+ foreach my $suffix ("", "_32x32", "_64x64") {
+ add_proto "void", "aom_highbd_quantize_b${suffix}", $highbd_quantize_args;
+ specialize "aom_highbd_quantize_b${suffix}", qw/sse2 avx2 neon/;
+ }
+
+ # The adaptive variants are left out of realtime-only builds.
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach my $suffix ("", "_32x32") {
+ add_proto "void", "aom_highbd_quantize_b${suffix}_adaptive", $highbd_quantize_args;
+ specialize "aom_highbd_quantize_b${suffix}_adaptive", qw/sse2 avx2 neon/;
+ }
+
+ # The 64x64 adaptive variant has no avx2 entry.
+ add_proto "void", "aom_highbd_quantize_b_64x64_adaptive", $highbd_quantize_args;
+ specialize "aom_highbd_quantize_b_64x64_adaptive", qw/sse2 neon/;
+ }
+} # CONFIG_AV1_ENCODER && CONFIG_AV1_HIGHBITDEPTH
+
+#
+# Alpha blending with mask
+#
+# The d16 variant reads CONV_BUF_TYPE sources and takes a ConvolveParams
+# pointer; the other three read uint8_t sources.
+add_proto "void", "aom_lowbd_blend_a64_d16_mask", "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params";
+specialize "aom_lowbd_blend_a64_d16_mask", qw/sse4_1 avx2 neon/;
+
+add_proto "void", "aom_blend_a64_mask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
+add_proto "void", "aom_blend_a64_hmask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+add_proto "void", "aom_blend_a64_vmask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+
+specialize qw/aom_blend_a64_mask sse4_1 neon avx2/;
+specialize qw/aom_blend_a64_hmask sse4_1 neon/;
+specialize qw/aom_blend_a64_vmask sse4_1 neon/;
+
+# High bit-depth blend prototypes; each argument list ends with a bit-depth
+# parameter (bd).
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto "void", "aom_highbd_blend_a64_mask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd";
+ add_proto "void", "aom_highbd_blend_a64_hmask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+ add_proto "void", "aom_highbd_blend_a64_vmask", "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+ add_proto "void", "aom_highbd_blend_a64_d16_mask", "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
+
+ specialize qw/aom_highbd_blend_a64_mask sse4_1 neon/;
+ specialize qw/aom_highbd_blend_a64_hmask sse4_1 neon/;
+ specialize qw/aom_highbd_blend_a64_vmask sse4_1 neon/;
+ specialize qw/aom_highbd_blend_a64_d16_mask sse4_1 neon avx2/;
+}
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ #
+ # Block subtraction
+ #
+ add_proto "void", "aom_subtract_block", "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+ specialize "aom_subtract_block", qw/neon sse2 avx2/;
+
+ # aom_sse returns an int64_t computed over two uint8_t buffers.
+ add_proto qw/int64_t aom_sse/, "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
+ specialize "aom_sse", qw/sse4_1 avx2 neon neon_dotprod/;
+
+ # aom_get_blk_sse_sum reports its results through x_sum and x2_sum.
+ add_proto qw/void aom_get_blk_sse_sum/, "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
+ specialize "aom_get_blk_sse_sum", qw/sse2 avx2 neon sve/;
+
+ # High bit-depth versions of the block subtraction and SSE prototypes.
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto "void", "aom_highbd_subtract_block", "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+ specialize "aom_highbd_subtract_block", qw/sse2 neon/;
+
+ add_proto qw/int64_t aom_highbd_sse/, "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+ specialize "aom_highbd_sse", qw/sse4_1 avx2 neon sve/;
+ }
+
+ #
+ # Sum of Squares
+ #
+ # All four prototypes return uint64_t.
+ add_proto "uint64_t", "aom_sum_squares_2d_i16", "const int16_t *src, int stride, int width, int height";
+ specialize "aom_sum_squares_2d_i16", qw/sse2 avx2 neon sve/;
+
+ add_proto "uint64_t", "aom_sum_squares_i16", "const int16_t *src, uint32_t N";
+ specialize "aom_sum_squares_i16", qw/sse2 neon sve/;
+
+ add_proto "uint64_t", "aom_var_2d_u8", "uint8_t *src, int src_stride, int width, int height";
+ specialize "aom_var_2d_u8", qw/sse2 avx2 neon neon_dotprod/;
+
+ # NOTE(review): the u16 variant is also declared with a uint8_t *src -- confirm this is intended.
+ add_proto "uint64_t", "aom_var_2d_u16", "uint8_t *src, int src_stride, int width, int height";
+ specialize "aom_var_2d_u16", qw/sse2 avx2 neon sve/;
+
+ #
+ # Single block SAD / Single block Avg SAD
+ #
+ # Four prototypes are declared for each [width, height] pair in
+ # @encoder_block_sizes: plain, skip, avg and distance-weighted avg.
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $blk = "${w}x${h}";
+ my $sad_args = "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad${blk}", $sad_args;
+ add_proto qw/unsigned int/, "aom_sad_skip_${blk}", $sad_args;
+ add_proto qw/unsigned int/, "aom_sad${blk}_avg", "${sad_args}, const uint8_t *second_pred";
+ add_proto qw/unsigned int/, "aom_dist_wtd_sad${blk}_avg", "${sad_args}, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
+ }
+
+ add_proto "uint64_t", "aom_sum_sse_2d_i16", "const int16_t *src, int src_stride, int width, int height, int *sum";
+ specialize "aom_sum_sse_2d_i16", qw/avx2 neon sse2 sve/;
+
+ # SIMD versions of aom_sad<W>x<H>. Consecutive block sizes that share an
+ # ISA list are grouped into one loop.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16/) {
+ specialize "aom_sad${blk}", qw/avx2 sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/16x32 16x16 16x8/) {
+ specialize "aom_sad${blk}", qw/sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_sad${blk}", qw/sse2 neon/;
+ }
+
+ specialize "aom_sad4x16", qw/sse2 neon/;
+ specialize "aom_sad16x4", qw/sse2 neon neon_dotprod/;
+ specialize "aom_sad8x32", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_sad${blk}", qw/sse2 neon neon_dotprod/;
+ }
+
+ # SIMD versions of aom_sad_skip_<W>x<H>. The 8x4, 4x4 and 16x4 sizes have
+ # no sse2 entry.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16/) {
+ specialize "aom_sad_skip_${blk}", qw/avx2 sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/16x32 16x16 16x8/) {
+ specialize "aom_sad_skip_${blk}", qw/sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x16 8x8/) {
+ specialize "aom_sad_skip_${blk}", qw/sse2 neon/;
+ }
+ specialize "aom_sad_skip_8x4", qw/neon/;
+ specialize "aom_sad_skip_4x8", qw/sse2 neon/;
+ specialize "aom_sad_skip_4x4", qw/neon/;
+
+ specialize "aom_sad_skip_4x16", qw/sse2 neon/;
+ specialize "aom_sad_skip_16x4", qw/neon neon_dotprod/;
+ specialize "aom_sad_skip_8x32", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_sad_skip_${blk}", qw/sse2 neon neon_dotprod/;
+ }
+
+ # SIMD versions of aom_sad<W>x<H>_avg.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16/) {
+ specialize "aom_sad${blk}_avg", qw/avx2 sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/16x32 16x16 16x8/) {
+ specialize "aom_sad${blk}_avg", qw/sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_sad${blk}_avg", qw/sse2 neon/;
+ }
+
+ specialize "aom_sad4x16_avg", qw/sse2 neon/;
+ specialize "aom_sad16x4_avg", qw/sse2 neon neon_dotprod/;
+ specialize "aom_sad8x32_avg", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_sad${blk}_avg", qw/sse2 neon neon_dotprod/;
+ }
+
+ # SIMD versions of aom_dist_wtd_sad<W>x<H>_avg.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_dist_wtd_sad${blk}_avg", qw/sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_dist_wtd_sad${blk}_avg", qw/sse2 neon/;
+ }
+
+ # These six sizes are specialized only when the build is not realtime-only.
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ specialize "aom_dist_wtd_sad4x16_avg", qw/sse2 neon/;
+ specialize "aom_dist_wtd_sad16x4_avg", qw/sse2 neon neon_dotprod/;
+ specialize "aom_dist_wtd_sad8x32_avg", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_dist_wtd_sad${blk}_avg", qw/sse2 neon neon_dotprod/;
+ }
+ }
+
+ # High bit-depth single block SAD / skip SAD / avg SAD.
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ # Prototypes for every [width, height] pair in @encoder_block_sizes. Inside
+ # this loop the plain and _avg SAD also get an sse2 entry unless either
+ # dimension is 128 or the width is 4.
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $blk = "${w}x${h}";
+ my $sad_args = "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad${blk}", $sad_args;
+ add_proto qw/unsigned int/, "aom_highbd_sad_skip_${blk}", $sad_args;
+ add_proto qw/unsigned int/, "aom_highbd_sad${blk}_avg", "${sad_args}, const uint8_t *second_pred";
+ unless ($w == 128 || $h == 128 || $w == 4) {
+ specialize "aom_highbd_sad${blk}", "sse2";
+ specialize "aom_highbd_sad${blk}_avg", "sse2";
+ }
+ add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${blk}_avg", "${sad_args}, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ }
+
+ # aom_highbd_sad<W>x<H>
+ foreach my $blk (qw/128x128 128x64 64x128/) {
+ specialize "aom_highbd_sad${blk}", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad${blk}", qw/avx2 sse2 neon/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_highbd_sad${blk}", qw/sse2 neon/;
+ }
+
+ specialize "aom_highbd_sad4x16", qw/sse2 neon/;
+ specialize "aom_highbd_sad16x4", qw/avx2 sse2 neon/;
+ specialize "aom_highbd_sad8x32", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_highbd_sad${blk}", qw/avx2 sse2 neon/;
+ }
+
+ # aom_highbd_sad_skip_<W>x<H>
+ foreach my $blk (qw/128x128 128x64 64x128/) {
+ specialize "aom_highbd_sad_skip_${blk}", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad_skip_${blk}", qw/avx2 sse2 neon/;
+ }
+ specialize "aom_highbd_sad_skip_16x4", qw/neon/;
+ specialize "aom_highbd_sad_skip_8x16", qw/sse2 neon/;
+ specialize "aom_highbd_sad_skip_8x4", qw/neon/;
+ specialize "aom_highbd_sad_skip_8x8", qw/sse2 neon/;
+ specialize "aom_highbd_sad_skip_4x8", qw/sse2 neon/;
+ specialize "aom_highbd_sad_skip_4x4", qw/neon/;
+
+ foreach my $blk (qw/4x16 8x32/) {
+ specialize "aom_highbd_sad_skip_${blk}", qw/sse2 neon/;
+ }
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_highbd_sad_skip_${blk}", qw/avx2 sse2 neon/;
+ }
+
+ # aom_highbd_sad<W>x<H>_avg
+ foreach my $blk (qw/128x128 128x64 64x128/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/avx2 sse2 neon/;
+ }
+ foreach my $blk (qw/8x16 8x8/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/neon/;
+ }
+ foreach my $blk (qw/8x4 4x8 4x4/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/sse2 neon/;
+ }
+
+ foreach my $blk (qw/4x16 8x32/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/sse2 neon/;
+ }
+ foreach my $blk (qw/16x4 16x64 32x8 64x16/) {
+ specialize "aom_highbd_sad${blk}_avg", qw/avx2 sse2 neon/;
+ }
+ }
+ #
+ # Masked SAD
+ #
+ # One prototype per [width, height] pair in @encoder_block_sizes, each with
+ # ssse3, avx2 and neon versions.
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $fn = "aom_masked_sad${w}x${h}";
+ add_proto qw/unsigned int/, $fn, "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize($fn, qw/ssse3 avx2 neon/);
+ }
+
+ # High bit-depth counterpart with the same ISA list.
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $fn = "aom_highbd_masked_sad${w}x${h}";
+ add_proto qw/unsigned int/, $fn, "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize($fn, qw/ssse3 avx2 neon/);
+ }
+ }
+
+ #
+ # OBMC SAD
+ #
+ # Declared only when the build is not realtime-only. The 128x32 and 32x128
+ # sizes receive a prototype but no specialize call.
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ next if ($w == 128 && $h == 32) || ($w == 32 && $h == 128);
+ specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ next if ($w == 128 && $h == 32) || ($w == 32 && $h == 128);
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
+ }
+ }
+ }
+
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ # The x4d, x3d and skip x4d prototypes share one argument list; the masked
+ # x4d prototype has its own.
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $blk = "${w}x${h}";
+ my $multi_args = "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto "void", "aom_sad${blk}x4d", $multi_args;
+ add_proto "void", "aom_sad${blk}x3d", $multi_args;
+ add_proto "void", "aom_sad_skip_${blk}x4d", $multi_args;
+ add_proto "void", "aom_masked_sad${blk}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
+ }
+
+ # SIMD versions of aom_sad<W>x<H>x4d.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_sad${blk}x4d", qw/avx2 sse2 neon neon_dotprod/;
+ }
+
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_sad${blk}x4d", qw/sse2 neon/;
+ }
+
+ foreach my $blk (qw/64x16 32x8 16x64 16x4/) {
+ specialize "aom_sad${blk}x4d", qw/avx2 sse2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x32 4x16/) {
+ specialize "aom_sad${blk}x4d", qw/sse2 neon/;
+ }
+
+ # SIMD versions of aom_sad_skip_<W>x<H>x4d. The 16x4, 8x4 and 4x4 sizes
+ # have no x86 entries.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 64x16 32x64 32x32 32x16 32x8/) {
+ specialize "aom_sad_skip_${blk}x4d", qw/avx2 sse2 neon neon_dotprod/;
+ }
+
+ foreach my $blk (qw/16x64 16x32 16x16 16x8/) {
+ specialize "aom_sad_skip_${blk}x4d", qw/avx2 sse2 neon neon_dotprod/;
+ }
+ specialize "aom_sad_skip_16x4x4d", qw/neon neon_dotprod/;
+ foreach my $blk (qw/8x32 8x16 8x8/) {
+ specialize "aom_sad_skip_${blk}x4d", qw/sse2 neon/;
+ }
+ specialize "aom_sad_skip_8x4x4d", qw/neon/;
+ foreach my $blk (qw/4x16 4x8/) {
+ specialize "aom_sad_skip_${blk}x4d", qw/sse2 neon/;
+ }
+ specialize "aom_sad_skip_4x4x4d", qw/neon/;
+
+ # SIMD versions of aom_sad<W>x<H>x3d. None of them has an sse2 entry.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_sad${blk}x3d", qw/avx2 neon neon_dotprod/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_sad${blk}x3d", qw/neon/;
+ }
+
+ foreach my $blk (qw/64x16 32x8 16x64/) {
+ specialize "aom_sad${blk}x3d", qw/avx2 neon neon_dotprod/;
+ }
+ specialize "aom_sad16x4x3d", qw/neon neon_dotprod/;
+ foreach my $blk (qw/8x32 4x16/) {
+ specialize "aom_sad${blk}x3d", qw/neon/;
+ }
+
+ # SIMD versions of aom_masked_sad<W>x<H>x4d. Every size uses the same
+ # ssse3/neon list, and each size appears exactly once: the previous listing
+ # repeated the 4x16, 32x8 and 64x16 entries.
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 64x16 32x64 32x32 32x16 32x8 16x64 16x32 16x16 16x8/) {
+ specialize "aom_masked_sad${blk}x4d", qw/ssse3 neon/;
+ }
+
+ foreach my $blk (qw/8x16 8x8 8x4 4x16 4x8 4x4/) {
+ specialize "aom_masked_sad${blk}x4d", qw/ssse3 neon/;
+ }
+
+ foreach my $blk (qw/16x4 8x32/) {
+ specialize "aom_masked_sad${blk}x4d", qw/ssse3 neon/;
+ }
+ #
+ # High bit-depth multi-block SAD, comparing a reference to N independent blocks
+ #
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ # Prototypes for every [width, height] pair in @encoder_block_sizes. Inside
+ # this loop the x4d function also gets an sse2 entry unless either
+ # dimension is 128.
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @{$_};
+ my $blk = "${w}x${h}";
+ my $multi_args = "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto "void", "aom_highbd_sad${blk}x4d", $multi_args;
+ add_proto "void", "aom_highbd_sad${blk}x3d", $multi_args;
+ add_proto "void", "aom_highbd_sad_skip_${blk}x4d", $multi_args;
+ unless ($w == 128 || $h == 128) {
+ specialize "aom_highbd_sad${blk}x4d", "sse2";
+ }
+ }
+
+ # aom_highbd_sad<W>x<H>x4d
+ foreach my $blk (qw/128x128 128x64 64x128/) {
+ specialize "aom_highbd_sad${blk}x4d", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad${blk}x4d", qw/sse2 avx2 neon/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_highbd_sad${blk}x4d", qw/sse2 neon/;
+ }
+
+ specialize "aom_highbd_sad4x16x4d", qw/sse2 neon/;
+ specialize "aom_highbd_sad16x4x4d", qw/avx2 sse2 neon/;
+ specialize "aom_highbd_sad8x32x4d", qw/sse2 neon/;
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_highbd_sad${blk}x4d", qw/avx2 sse2 neon/;
+ }
+
+ # aom_highbd_sad_skip_<W>x<H>x4d
+ foreach my $blk (qw/128x128 128x64 64x128/) {
+ specialize "aom_highbd_sad_skip_${blk}x4d", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad_skip_${blk}x4d", qw/avx2 sse2 neon/;
+ }
+ specialize "aom_highbd_sad_skip_16x4x4d", qw/neon/;
+ foreach my $blk (qw/8x16 8x8/) {
+ specialize "aom_highbd_sad_skip_${blk}x4d", qw/sse2 neon/;
+ }
+ specialize "aom_highbd_sad_skip_8x4x4d", qw/neon/;
+ specialize "aom_highbd_sad_skip_4x8x4d", qw/sse2 neon/;
+ specialize "aom_highbd_sad_skip_4x4x4d", qw/neon/;
+
+ foreach my $blk (qw/4x16 8x32/) {
+ specialize "aom_highbd_sad_skip_${blk}x4d", qw/sse2 neon/;
+ }
+ foreach my $blk (qw/32x8 16x64 64x16/) {
+ specialize "aom_highbd_sad_skip_${blk}x4d", qw/avx2 sse2 neon/;
+ }
+
+ # aom_highbd_sad<W>x<H>x3d
+ foreach my $blk (qw/128x128 128x64 64x128 64x64 64x32 32x64 32x32 32x16 16x32 16x16 16x8/) {
+ specialize "aom_highbd_sad${blk}x3d", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/8x16 8x8 8x4 4x8 4x4/) {
+ specialize "aom_highbd_sad${blk}x3d", qw/neon/;
+ }
+
+ foreach my $blk (qw/64x16 32x8 16x64 16x4/) {
+ specialize "aom_highbd_sad${blk}x3d", qw/avx2 neon/;
+ }
+ foreach my $blk (qw/8x32 4x16/) {
+ specialize "aom_highbd_sad${blk}x3d", qw/neon/;
+ }
+ }
+ #
+ # Avg
+ #
+ # The 8x8 and 4x4 averages share an argument list and an ISA list.
+ foreach my $blk (qw/8x8 4x4/) {
+ add_proto qw/unsigned int/, "aom_avg_${blk}", "const uint8_t *, int p";
+ specialize "aom_avg_${blk}", qw/sse2 neon/;
+ }
+
+ # aom_avg_8x8_quad writes its results through the avg pointer.
+ add_proto "void", "aom_avg_8x8_quad", "const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg";
+ specialize "aom_avg_8x8_quad", qw/avx2 sse2 neon/;
+
+ # aom_minmax_8x8 writes its results through the min and max pointers.
+ add_proto "void", "aom_minmax_8x8", "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize "aom_minmax_8x8", qw/sse2 neon/;
+
+ # High bit-depth average / min-max prototypes; neon is the only SIMD
+ # version listed here.
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach my $blk (qw/8x8 4x4/) {
+ add_proto qw/unsigned int/, "aom_highbd_avg_${blk}", "const uint8_t *, int p";
+ specialize "aom_highbd_avg_${blk}", qw/neon/;
+ }
+ add_proto "void", "aom_highbd_minmax_8x8", "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize "aom_highbd_minmax_8x8", qw/neon/;
+ }
+
+ # Row and column variants; they differ only in the output buffer name
+ # (hbuf vs. vbuf).
+ add_proto "void", "aom_int_pro_row", "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize "aom_int_pro_row", qw/avx2 sse2 neon/;
+
+ add_proto "void", "aom_int_pro_col", "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize "aom_int_pro_col", qw/avx2 sse2 neon/;
+
+ add_proto "int", "aom_vector_var", "const int16_t *ref, const int16_t *src, int bwl";
+ specialize "aom_vector_var", qw/avx2 sse4_1 neon sve/;
+ # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+ #specialize qw/aom_vector_var neon sse2/;
+
+ #
+ # Hadamard transform and SATD for implementing the temporal dependency model
+ #
+ # These variants write tran_low_t coefficients.
+ foreach my $blk (qw/4x4 8x8/) {
+ add_proto "void", "aom_hadamard_${blk}", "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize "aom_hadamard_${blk}", qw/sse2 neon/;
+ }
+
+ foreach my $blk (qw/16x16 32x32/) {
+ add_proto "void", "aom_hadamard_${blk}", "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize "aom_hadamard_${blk}", qw/avx2 sse2 neon/;
+ }
+
+ # The _lp variants write int16_t coefficients instead.
+ add_proto "void", "aom_hadamard_lp_8x8", "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize "aom_hadamard_lp_8x8", qw/sse2 neon/;
+
+ foreach my $variant (qw/16x16 8x8_dual/) {
+ add_proto "void", "aom_hadamard_lp_${variant}", "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize "aom_hadamard_lp_${variant}", qw/sse2 avx2 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_8x8 avx2 neon/;
+
+ add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_16x16 avx2 neon/;
+
+ add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_32x32 avx2 neon/;
+ }
+ add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/aom_satd neon sse2 avx2/;
+
+ add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
+ specialize qw/aom_satd_lp sse2 avx2 neon/;
+
+
+ #
+ # Structural Similarity (SSIM)
+ #
+ add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+
+ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ }
+} # CONFIG_AV1_ENCODER
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ #
+ # Specialty Variance
+ #
+ add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
+ specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/;
+
+ add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
+ specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/;
+
+ add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_mse16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x8 sse2 neon neon_dotprod/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # high-bitdepth MSE, one set of prototypes per bit depth
+ foreach $bd (8, 10, 12) { # $bd is spliced into each function name
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; # sse2 is listed for the square sizes only
+ specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/;
+ specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/;
+ } # foreach $bd
+
+ specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/; # extra specialize calls for the 8-bit variants only
+ specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
+ } # CONFIG_AV1_HIGHBITDEPTH
+
+ #
+ # Sum of squares (aom_get_mb_ss) -- NOTE(review): heading inferred from the function name; confirm
+ #
+ add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
+ specialize qw/aom_get_mb_ss sse2 neon/;
+
+ #
+ # Variance / Subpixel Variance / Subpixel Avg Variance
+ #
+ add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+ specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/;
+
+ add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
+ specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
+
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
+ }
+ specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x4 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/;
+
+ specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { # 1:4 and 4:1 block sizes; not specialized in realtime-only builds
+ specialize qw/aom_variance4x16 neon neon_dotprod sse2/;
+ specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance8x32 neon neon_dotprod sse2/;
+ specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
+
+ specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/; # avx2 is listed only for the 16-wide sizes in this group
+ specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
+ } # !CONFIG_REALTIME_ONLY
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 neon ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 neon ssse3/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ }
+ }
+
+ specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x8 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x4 neon sve/;
+ specialize qw/aom_highbd_12_variance4x8 neon sve/;
+ specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/;
+
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x4 neon sve/;
+ specialize qw/aom_highbd_10_variance4x8 neon sve/;
+ specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/;
+
+ specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x4 neon sve/;
+ specialize qw/aom_highbd_8_variance4x8 neon sve/;
+ specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { # 1:4 and 4:1 block sizes; not specialized in realtime-only builds
+ foreach $bd (8, 10, 12) {
+ my $avx2 = ($bd == 10) ? "avx2" : ""; # avx2 is passed for the 10-bit variants only; otherwise an empty string
+ specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/; # no x86 flavor listed for 16x4
+ specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/; # no x86 flavor listed for 4x16
+ } # foreach $bd
+ } # !CONFIG_REALTIME_ONLY
+
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4" , qw/neon/;
+ }
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+ }
+ #
+ # Masked Variance / Masked Subpixel Variance
+ #
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_8_", "_10_", "_12_") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
+ }
+ }
+ }
+
+ #
+ # OBMC Variance / OBMC Subpixel Variance
+ #
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { # OBMC prototypes are declared only in non-realtime builds
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_; # each entry is a [width, height] pair
+ add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2 neon/;
+ specialize "aom_obmc_sub_pixel_variance${w}x${h}", qw/sse4_1 neon/;
+ } # foreach block size
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_8_", "_10_", "_12_") { # here $bd carries its own underscores
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+ specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/; # neon is the only flavor listed for the sub-pixel highbd case
+ } # foreach block size
+ } # foreach $bd
+ } # CONFIG_AV1_HIGHBITDEPTH
+ } # !CONFIG_REALTIME_ONLY
+
+ #
+ # Comp Avg
+ #
+ add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+ specialize qw/aom_comp_avg_pred avx2 neon/;
+
+ add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_pred neon/;
+
+ add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
+
+ add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h";
+ specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/;
+ }
+
+ add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
+ }
+
+ # Flow estimation library
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/double av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2";
+ specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
+
+ add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
+ specialize qw/aom_compute_flow_at_point sse4_1 neon/;
+ }
+
+} # CONFIG_AV1_ENCODER
+
+1;
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
new file mode 100644
index 0000000000..00686ac388
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7  // 1 << 7 == 128, the row sum of bilinear_filters_2t
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)  // 0xF
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)  // 16
+#define SUBPEL_TAPS 8  // entries per InterpKernel
+
+#define SCALE_SUBPEL_BITS 10
+#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)  // 1024
+#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)  // 0x3FF
+#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)  // 10 - 4 = 6
+#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)  // 32
+
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)  // 0x3F
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)  // 0x3FFF
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)  // 14 - 6 = 8
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))  // 128
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];  // one 8-coefficient kernel
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)  // 8 rows in the table below
+
+// 2 tap bilinear filters: row k is { 128 - 16 * k, 16 * k } for k = 0..7
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_FILTER_H_
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
new file mode 100644
index 0000000000..69da8f21b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
+
+#define SIMD_CHECK 1 // Sanity checks in C equivalents
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
new file mode 100644
index 0000000000..b4b1b35637
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+// Clamps value to [min, max]. Arguments may be evaluated more than once.
+#define SIMD_CLAMP(value, min, max) \
+ ((value) > (max) ? (max) : (value) < (min) ? (min) : (value))
+
+#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
new file mode 100644
index 0000000000..7441108b01
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filter) {
+  /* 8-tap filter of four lanes; the sums are returned without rounding. */
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 0);
+  acc = vmla_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmla_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmla_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmla_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmla_lane_s16(acc, s7, taps_hi, 3);
+  /* The two middle taps (3 and 4) are folded in last, with saturation. */
+  acc = vqadd_s16(acc, vmul_lane_s16(s3, taps_lo, 3));
+  acc = vqadd_s16(acc, vmul_lane_s16(s4, taps_hi, 0));
+  return acc;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filter) {
+  /* 8-tap filter of eight lanes, rounded and narrowed to unsigned 8-bit. */
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
+  /* The two middle taps (3 and 4) are folded in last, with saturation. */
+  acc = vqaddq_s16(acc, vmulq_lane_s16(s3, taps_lo, 3));
+  acc = vqaddq_s16(acc, vmulq_lane_s16(s4, taps_hi, 0));
+  return vqrshrun_n_s16(acc, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_x);
+ /* Horizontal 8-tap filter of a w x h block; dst alignment is asserted. */
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ /* Only filter_x is applied; the step sizes and filter_y are ignored. */
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+ /* Back up 3 pixels (SUBPEL_TAPS / 2 - 1) to the first tap's position. */
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (h == 4) {
+ uint8x8_t t0, t1, t2, t3, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ /* Prime s0..s6 from a transposed 8x4 load, widened to 16 bits. */
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ src += 7;
+ /* Each pass filters 4 more columns; the w != 0 exit assumes w % 4 == 0. */
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ transpose_elems_inplace_u8_4x4(&d01, &d23);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);
+ /* Slide the window: s4..s10 become s0..s6 for the next 4 columns. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ /* Other heights are filtered 8 rows per pass (h -= 8 until h <= 0). */
+ if (w == 4) {
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+ &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
+ /* NOTE(review): each store presumably writes rows r and r + 4. */
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
+ store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);
+
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ uint8x8_t d4, d5, d6, d7;
+ int16x8_t s11, s12, s13, s14;
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+ /* General case: 8 columns per inner pass; assumes w % 8 == 0. */
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+ /* Slide the window: s8..s14 become s0..s6 for the next 8 columns. */
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_y);
+ /* Vertical 8-tap filter of a w x h block; dst alignment is asserted. */
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ /* Only filter_y is applied; the step sizes and filter_x are ignored. */
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+ /* Back up 3 rows (SUBPEL_TAPS / 2 - 1) to the first tap's position. */
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ /* Prime s0..s6 with the first seven rows; only the low 4 lanes are kept. */
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+
+ src += 7 * src_stride;
+ /* Each pass filters 4 more rows; the h != 0 exit assumes h % 4 == 0. */
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+ /* Slide the window: s4..s10 become s0..s6 for the next 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ /* Columns are handled in strips of 8 (w -= 8); rows 4 at a time. */
+ do {
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ height = h;
+ s = src + 7 * src_stride;
+ d = dst;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+ /* Slide the window: s4..s10 become s0..s6 for the next 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000000..ac0a6efd00
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+/* Byte indices selecting 4-byte windows that each start one byte later. */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+/* Byte indices interleaving four 8-byte rows: 0, 8, 16, 24, then 1, 9, ... */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+/* Indices below 16 keep existing columns; 16 and above insert new ones. */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x2_t permute_tbl) {
+  /* Offset the unsigned samples by 'range_limit' so they can be viewed as
+   * signed bytes for the signed dot product (callers pass 128, mapping
+   * [0, 255] onto [-128, 127]). */
+  const int8x16_t shifted = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Gather the 4-byte windows for taps 0-3 and taps 4-7 of each output:
+   * { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+   * { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+   */
+  const int8x16_t window_lo = vqtbl1q_s8(shifted, permute_tbl.val[0]);
+  const int8x16_t window_hi = vqtbl1q_s8(shifted, permute_tbl.val[1]);
+
+  /* Seeding with 'correction' compensates for the range offset above. */
+  int32x4_t acc = vdotq_lane_s32(correction, window_lo, filter, 0);
+  acc = vdotq_lane_s32(acc, window_hi, filter, 1);
+
+  /* Saturate to 16 bits; the caller performs the final rounding shift. */
+  return vqmovn_s32(acc);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+                                         const int8x8_t filter,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  /* Offset the unsigned samples by 'range_limit' so they can be viewed as
+   * signed bytes for the signed dot product (callers pass 128, mapping
+   * [0, 255] onto [-128, 127]).
+   */
+  const int8x16_t shifted = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Gather three sets of 4-byte sliding windows:
+   * { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+   * { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+   * { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+   */
+  const int8x16_t window0 = vqtbl1q_s8(shifted, permute_tbl.val[0]);
+  const int8x16_t window1 = vqtbl1q_s8(shifted, permute_tbl.val[1]);
+  const int8x16_t window2 = vqtbl1q_s8(shifted, permute_tbl.val[2]);
+
+  /* Seeding with 'correction' compensates for the range offset above.
+   * Outputs 0-3 use windows 0 and 1; outputs 4-7 use windows 1 and 2. */
+  int32x4_t acc_lo = vdotq_lane_s32(correction, window0, filter, 0);
+  int32x4_t acc_hi = vdotq_lane_s32(correction, window1, filter, 0);
+  acc_lo = vdotq_lane_s32(acc_lo, window1, filter, 1);
+  acc_hi = vdotq_lane_s32(acc_hi, window2, filter, 1);
+
+  /* Saturate to 16 bits, then round, shift and saturate to unsigned 8-bit. */
+  const int16x8_t packed =
+      vcombine_s16(vqmovn_s32(acc_lo), vqmovn_s32(acc_hi));
+  return vqrshrun_n_s16(packed, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128); /* cancelled by correction */
+ uint8x16_t s0, s1, s2, s3;
+ /* Horizontal 8-tap pass; vmovn_s16 assumes every tap fits in int8. */
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ /* Only filter_x is applied; the step sizes and filter_y are ignored. */
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+ /* Back up 3 pixels (SUBPEL_TAPS / 2 - 1) to the first tap's position. */
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+ /* Four rows per pass, four output pixels per row. */
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ /* d01 packs the results of rows 0-1 and d23 those of rows 2-3. */
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+ /* Four rows per outer pass; 8 pixels per row per inner pass (w % 8 == 0). */
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Interleave the first four bytes of each input row into one vector:
+   *   a0: 00, 01, 02, 03, XX, XX, XX, XX
+   *   a1: 10, 11, 12, 13, XX, XX, XX, XX
+   *   a2: 20, 21, 22, 23, XX, XX, XX, XX
+   *   a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *   b:  00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * Callers pass 'dot_prod_tran_concat_tbl' as 'permute_tbl'; taking it as a
+   * parameter lets them load the table once and reuse it across calls.
+   */
+  int8x16x2_t rows;
+  rows.val[0] = vcombine_s8(a0, a1);
+  rows.val[1] = vcombine_s8(a2, a3);
+
+  *b = vqtbl2q_s8(rows, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b0,
+                                        int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Interleave four 8-byte rows into two vectors of four columns each:
+   *   a0: 00, 01, 02, 03, 04, 05, 06, 07
+   *   a1: 10, 11, 12, 13, 14, 15, 16, 17
+   *   a2: 20, 21, 22, 23, 24, 25, 26, 27
+   *   a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *   b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *   b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * Callers pass 'dot_prod_tran_concat_tbl' as 'permute_tbl'; taking it as a
+   * parameter lets them load the table once and reuse it across calls.
+   */
+  int8x16x2_t rows;
+  rows.val[0] = vcombine_s8(a0, a1);
+  rows.val[1] = vcombine_s8(a2, a3);
+
+  *b0 = vqtbl2q_s8(rows, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(rows, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* The caller has already range-shifted and permuted the samples:
+   * 'samples_lo' feeds taps 0-3 and 'samples_hi' feeds taps 4-7. Seeding the
+   * accumulator with 'correction' compensates for the range shift.
+   */
+  int32x4_t acc = vdotq_lane_s32(correction, samples_lo, filter, 0);
+  acc = vdotq_lane_s32(acc, samples_hi, filter, 1);
+
+  /* Saturate to 16 bits; the caller rounds, shifts and packs. */
+  return vqmovn_s32(acc);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+                                                 const int8x16_t samples0_hi,
+                                                 const int8x16_t samples1_lo,
+                                                 const int8x16_t samples1_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filter) {
+  /* The caller has already range-shifted and permuted the samples. The
+   * 'samples0' pair produces outputs 0-3 and the 'samples1' pair outputs 4-7;
+   * '_lo' feeds taps 0-3 and '_hi' feeds taps 4-7. Seeding both accumulators
+   * with 'correction' compensates for the range shift.
+   */
+  int32x4_t acc0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+  int32x4_t acc1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+  acc0 = vdotq_lane_s32(acc0, samples0_hi, filter, 1);
+  acc1 = vdotq_lane_s32(acc1, samples1_hi, filter, 1);
+
+  /* Saturate each sum to 16 bits, then apply the rounding shift by FILTER_BITS
+   * and saturate to unsigned 8-bit. */
+  const int16x8_t packed =
+      vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+  return vqrshrun_n_s16(packed, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128); /* cancelled by correction */
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+ /* Vertical 8-tap pass; vmovn_s16 assumes every tap fits in int8. */
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ /* Only filter_y is applied; the step sizes and filter_x are ignored. */
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+ /* Back up 3 rows (SUBPEL_TAPS / 2 - 1) to the first tap's position. */
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+ /* Offset samples into [-128, 127] for the 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+ /* s7..s9 are placeholders; blocks built from them are rebuilt in the loop. */
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+ /* Each pass filters 4 more rows; the h != 0 exit assumes h % 4 == 0. */
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+ /* Columns are handled in strips of 8 (w -= 8); rows 4 at a time. */
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+ /* Offset samples into [-128, 127] for the 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+ /* s7..s9 are placeholders; blocks built from them are rebuilt in the loop. */
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+ /* Each pass filters 4 more rows; assumes h % 4 == 0. */
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filter);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filter);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filter);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..c314c0a192
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+/* Byte indices for vqtbl1q_u8 on one 16-byte row of input samples. Each
+ * 16-byte table row gathers four overlapping 4-sample windows whose start
+ * positions advance by one sample, so every 32-bit lane of the permuted vector
+ * lines up with one group of four filter taps for the dot product. The
+ * horizontal filter loads the first two rows for 4-wide output and all three
+ * rows for 8-wide output.
+ */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+/* Byte indices for vqtbl2q_u8 on four 8-byte rows packed into two 16-byte
+ * vectors (row r occupies bytes 8 * r .. 8 * r + 7). The first table row
+ * interleaves columns 0-3 of the four input rows, the second columns 4-7,
+ * i.e. a 4-row transpose with the result rows concatenated.
+ */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+/* Byte indices for vqtbl2q_u8 on a pair of transposed 4x4 blocks. Indices
+ * 0-15 select from the first (older) block and indices 16-31 from the second
+ * (newly loaded) block. The vertical filter uses the three table rows to build
+ * the blocks that start one, two and three input rows later.
+ */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+/* Apply an 8-tap filter to one row of unsigned samples and return four 16-bit
+ * sums (saturating narrow from 32 bits). Rounding, shifting and packing to
+ * 8 bits are left to the caller.
+ */
+static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x2_t permute_tbl) {
+  /* Gather the sliding windows consumed by the two groups of four taps:
+   *   taps 0-3 <- { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+   *   taps 4-7 <- { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+   */
+  const uint8x16_t window_taps0123 = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  const uint8x16_t window_taps4567 = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  int32x4_t acc = vdupq_n_s32(0);
+  acc = vusdotq_lane_s32(acc, window_taps0123, filter, 0);
+  acc = vusdotq_lane_s32(acc, window_taps4567, filter, 1);
+
+  return vqmovn_s32(acc);
+}
+
+/* Apply an 8-tap filter to one row of unsigned samples and return eight
+ * output pixels, rounded and shifted right by FILTER_BITS with unsigned
+ * saturation.
+ */
+static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
+                                          const int8x8_t filter,
+                                          const uint8x16x3_t permute_tbl) {
+  /* Sliding 4-sample windows:
+   *   win_a: { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+   *   win_b: { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+   *   win_c: { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+   */
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  /* Outputs 0-3 combine win_a (taps 0-3) with win_b (taps 4-7). */
+  int32x4_t acc_lo = vdupq_n_s32(0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_a, filter, 0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_b, filter, 1);
+
+  /* Outputs 4-7 combine win_b (taps 0-3) with win_c (taps 4-7). */
+  int32x4_t acc_hi = vdupq_n_s32(0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_b, filter, 0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_c, filter, 1);
+
+  /* Saturating narrow to 16 bits, then round, shift and narrow to 8 bits. */
+  const int16x8_t acc16 = vcombine_s16(vqmovn_s32(acc_lo), vqmovn_s32(acc_hi));
+  return vqrshrun_n_s16(acc16, FILTER_BITS);
+}
+
+/* Horizontal 8-tap convolution using the unsigned-by-signed dot product.
+ *
+ * Only filter_x is used; x_step_q4, filter_y and y_step_q4 are ignored. The
+ * taps are narrowed to 8 bits with a plain (non-saturating) narrow. Rows are
+ * processed four at a time; when w is not 4, columns are processed eight at a
+ * time and the column loop only terminates when w is a multiple of 8.
+ */
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  const int8x8_t x_taps = vmovn_s16(vld1q_s16(filter_x));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  /* Step back so the filter window is centred on the output position. */
+  src -= ((SUBPEL_TAPS / 2) - 1);
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t row0, row1, row2, row3;
+      load_u8_16x4(src, src_stride, &row0, &row1, &row2, &row3);
+
+      const int16x4_t sum0 = convolve8_4_usdot(row0, x_taps, perm_tbl);
+      const int16x4_t sum1 = convolve8_4_usdot(row1, x_taps, perm_tbl);
+      const int16x4_t sum2 = convolve8_4_usdot(row2, x_taps, perm_tbl);
+      const int16x4_t sum3 = convolve8_4_usdot(row3, x_taps, perm_tbl);
+
+      /* Two 4-pixel rows share one 8-byte result vector. */
+      const uint8x8_t out01 =
+          vqrshrun_n_s16(vcombine_s16(sum0, sum1), FILTER_BITS);
+      const uint8x8_t out23 =
+          vqrshrun_n_s16(vcombine_s16(sum2, sum3), FILTER_BITS);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, out01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, out23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      int x = 0;
+
+      do {
+        uint8x16_t row0, row1, row2, row3;
+        load_u8_16x4(src + x, src_stride, &row0, &row1, &row2, &row3);
+
+        const uint8x8_t out0 = convolve8_8_usdot(row0, x_taps, perm_tbl);
+        const uint8x8_t out1 = convolve8_8_usdot(row1, x_taps, perm_tbl);
+        const uint8x8_t out2 = convolve8_8_usdot(row2, x_taps, perm_tbl);
+        const uint8x8_t out3 = convolve8_8_usdot(row3, x_taps, perm_tbl);
+
+        store_u8_8x4(dst + x, dst_stride, out0, out1, out2, out3);
+
+        x += 8;
+      } while (x != w);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+/* Transpose the low four bytes of four rows and concatenate the result:
+ *
+ *   a0: 00, 01, 02, 03, XX, XX, XX, XX
+ *   a1: 10, 11, 12, 13, XX, XX, XX, XX
+ *   a2: 20, 21, 22, 23, XX, XX, XX, XX
+ *   a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ *   b:  00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * 'permute_tbl' is expected to hold the first 16 bytes of
+ * 'dot_prod_tran_concat_tbl'. It is passed in rather than loaded here because
+ * this helper is called many times from the same parent function.
+ */
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  uint8x16x2_t packed_rows;
+
+  packed_rows.val[0] = vcombine_u8(a0, a1);
+  packed_rows.val[1] = vcombine_u8(a2, a3);
+
+  *b = vqtbl2q_u8(packed_rows, permute_tbl);
+}
+
+/* Transpose four 8-byte rows and concatenate the result into two vectors:
+ *
+ *   a0: 00, 01, 02, 03, 04, 05, 06, 07
+ *   a1: 10, 11, 12, 13, 14, 15, 16, 17
+ *   a2: 20, 21, 22, 23, 24, 25, 26, 27
+ *   a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ *   b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *   b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * 'permute_tbl' is expected to hold 'dot_prod_tran_concat_tbl'. It is passed
+ * in rather than loaded here because this helper is called many times from
+ * the same parent function.
+ */
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b0, uint8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  uint8x16x2_t packed_rows;
+
+  packed_rows.val[0] = vcombine_u8(a0, a1);
+  packed_rows.val[1] = vcombine_u8(a2, a3);
+
+  *b0 = vqtbl2q_u8(packed_rows, permute_tbl.val[0]); /* Columns 0-3. */
+  *b1 = vqtbl2q_u8(packed_rows, permute_tbl.val[1]); /* Columns 4-7. */
+}
+
+/* Dot-product core for four outputs. The caller supplies samples that are
+ * already permuted: 'samples_lo' is multiplied by taps 0-3 and 'samples_hi'
+ * by taps 4-7. Returns 16-bit sums (saturating narrow); rounding and packing
+ * to 8 bits are left to the caller.
+ */
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+                                                  const uint8x16_t samples_hi,
+                                                  const int8x8_t filter) {
+  const int32x4_t taps0123 =
+      vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+  const int32x4_t all_taps = vusdotq_lane_s32(taps0123, samples_hi, filter, 1);
+
+  return vqmovn_s32(all_taps);
+}
+
+/* Dot-product core for eight outputs. The caller supplies samples that are
+ * already permuted: the 'samples0' pair produces outputs 0-3 and the
+ * 'samples1' pair outputs 4-7, with '_lo' multiplied by taps 0-3 and '_hi' by
+ * taps 4-7. The result is rounded, shifted right by FILTER_BITS and saturated
+ * to 8 bits.
+ */
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+                                                  const uint8x16_t samples0_hi,
+                                                  const uint8x16_t samples1_lo,
+                                                  const uint8x16_t samples1_hi,
+                                                  const int8x8_t filter) {
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  /* Outputs 0-3. */
+  int32x4_t acc_first = vusdotq_lane_s32(zero, samples0_lo, filter, 0);
+  acc_first = vusdotq_lane_s32(acc_first, samples0_hi, filter, 1);
+
+  /* Outputs 4-7. */
+  int32x4_t acc_second = vusdotq_lane_s32(zero, samples1_lo, filter, 0);
+  acc_second = vusdotq_lane_s32(acc_second, samples1_hi, filter, 1);
+
+  const int16x8_t acc16 =
+      vcombine_s16(vqmovn_s32(acc_first), vqmovn_s32(acc_second));
+  return vqrshrun_n_s16(acc16, FILTER_BITS);
+}
+
+/* Vertical 8-tap convolution using the unsigned-by-signed dot product.
+ *
+ * Only filter_y is used; filter_x, x_step_q4 and y_step_q4 are ignored. The
+ * taps are narrowed to 8 bits with a plain (non-saturating) narrow.
+ *
+ * Four output rows are produced per iteration. The row loops count down by 4
+ * until exactly 0, so h must be a positive multiple of 4; when w is not 4 the
+ * column loop counts down by 8 until exactly 0, so w must then be a positive
+ * multiple of 8.
+ *
+ * Input rows are kept as transposed 4-row blocks (sNNNN = rows N..N+3) so that
+ * each 32-bit lane holds the four vertically adjacent samples consumed by one
+ * half of the filter. Each iteration loads four new rows and derives the
+ * intermediate blocks from the previous block with table lookups.
+ */
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h) {
+  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  /* Step back so the filter window is centred on the output row. */
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     *
+     * Only the blocks made entirely of already-loaded rows are built here.
+     * s4567, s5678 and s6789 are produced inside the loop by merging s3456
+     * with the newly loaded s78910 block.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    /* Process the block in 8-pixel-wide column strips. */
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       *
+       * Only the blocks made entirely of already-loaded rows are built here.
+       * The s4567, s5678 and s6789 blocks are produced inside the loop by
+       * merging s3456 with the newly loaded s78910 block.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filter);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filter);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filter);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 0000000000..325d6f29ff
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+/* Copy a w x h block of bytes from src to dst.
+ *
+ * src_stride and dst_stride are the distances in bytes between the starts of
+ * consecutive rows. Widths that are a multiple of 16, or exactly 8, 4 or 2,
+ * use dedicated fast paths; any other positive width falls back to a per-row
+ * memcpy so that exactly w bytes are always copied per row. Nothing is copied
+ * when w or h is not positive.
+ */
+void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  const uint8_t *src1;
+  uint8_t *dst1;
+  int y;
+
+  if (!(w & 0x0F)) {
+    /* Whole 16-byte vectors per row. */
+    for (y = 0; y < h; ++y) {
+      src1 = src;
+      dst1 = dst;
+      for (int x = 0; x < (w >> 4); ++x) {
+        vst1q_u8(dst1, vld1q_u8(src1));
+        src1 += 16;
+        dst1 += 16;
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (w == 8) {
+    for (y = 0; y < h; ++y) {
+      vst1_u8(dst, vld1_u8(src));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (w == 4) {
+    for (y = 0; y < h; ++y) {
+      memcpy(dst, src, sizeof(uint32_t));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (w == 2) {
+    for (y = 0; y < h; ++y) {
+      memcpy(dst, src, sizeof(uint16_t));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else if (w > 0) {
+    /* Generic width (e.g. 24, 12, 6 or odd): the fixed-size paths above would
+     * copy only part of each row, so copy the exact row length instead.
+     */
+    for (y = 0; y < h; ++y) {
+      memcpy(dst, src, (size_t)w);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Copy a w x h block of 16-bit samples from src to dst. Strides are in units
+ * of uint16_t.
+ *
+ * The copy width is chosen by range: w < 8 copies 4 samples per row, w == 8
+ * copies 8, 8 < w < 32 copies 16, w == 32 copies 32 and anything larger is
+ * copied in chunks of 64. The three narrowest paths handle two rows per
+ * iteration and stop only when the row counter reaches exactly 0, so h must
+ * be a positive even number there; the other paths require h >= 1.
+ */
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  if (w < 8) {  // 4 samples per row, two rows at a time
+    do {
+      const uint16x4_t row_a = vld1_u16(src);
+      const uint16x4_t row_b = vld1_u16(src + src_stride);
+      src += 2 * src_stride;
+
+      vst1_u16(dst, row_a);
+      vst1_u16(dst + dst_stride, row_b);
+      dst += 2 * dst_stride;
+
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {  // 8 samples per row, two rows at a time
+    do {
+      const uint16x8_t row_a = vld1q_u16(src);
+      const uint16x8_t row_b = vld1q_u16(src + src_stride);
+      src += 2 * src_stride;
+
+      vst1q_u16(dst, row_a);
+      vst1q_u16(dst + dst_stride, row_b);
+      dst += 2 * dst_stride;
+
+      h -= 2;
+    } while (h != 0);
+  } else if (w < 32) {  // 16 samples per row, two rows at a time
+    do {
+      const uint16x8_t row_a_lo = vld1q_u16(src);
+      const uint16x8_t row_a_hi = vld1q_u16(src + 8);
+      const uint16x8_t row_b_lo = vld1q_u16(src + src_stride);
+      const uint16x8_t row_b_hi = vld1q_u16(src + src_stride + 8);
+      src += 2 * src_stride;
+
+      vst1q_u16(dst, row_a_lo);
+      vst1q_u16(dst + 8, row_a_hi);
+      vst1q_u16(dst + dst_stride, row_b_lo);
+      vst1q_u16(dst + dst_stride + 8, row_b_hi);
+      dst += 2 * dst_stride;
+
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 32) {  // 32 samples per row, one row at a time
+    do {
+      uint16x8_t chunk[4];
+      int i;
+
+      for (i = 0; i < 4; ++i) chunk[i] = vld1q_u16(src + 8 * i);
+      for (i = 0; i < 4; ++i) vst1q_u16(dst + 8 * i, chunk[i]);
+
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else {  // 64 samples per chunk, as many chunks as needed to cover w
+    do {
+      int x = 0;
+
+      do {
+        uint16x8_t chunk[8];
+        int i;
+
+        for (i = 0; i < 8; ++i) chunk[i] = vld1q_u16(src + x + 8 * i);
+        for (i = 0; i < 8; ++i) vst1q_u16(dst + x + 8 * i, chunk[i]);
+
+        x += 64;
+      } while (x < w);
+
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..2e79b2ef69
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+// Rounded average of a 4x4 block of 8-bit pixels: (sum + 8) >> 4.
+// 'stride' is the distance in bytes between consecutive rows of 'p'.
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+  // Each load packs two 4-pixel rows into one 8-byte vector.
+  const uint8x8_t rows01 = load_unaligned_u8(p, stride);
+  const uint8x8_t rows23 = load_unaligned_u8(p + 2 * stride, stride);
+
+  const uint16x8_t pair_sums = vaddl_u8(rows01, rows23);
+  const uint32_t total = horizontal_add_u16x8(pair_sums);
+
+  return (total + (1 << 3)) >> 4;
+}
+
+// Rounded average of an 8x8 block of 8-bit pixels: (sum + 32) >> 6.
+// 'stride' is the distance in bytes between consecutive rows of 'p'.
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+  // Widen while adding the first two rows, then accumulate the other six.
+  uint16x8_t column_sums = vaddl_u8(vld1_u8(p), vld1_u8(p + stride));
+
+  for (int row = 2; row < 8; ++row) {
+    column_sums = vaddw_u8(column_sums, vld1_u8(p + row * stride));
+  }
+
+  const uint32_t total = horizontal_add_u16x8(column_sums);
+  return (total + (1 << 5)) >> 6;
+}
+
+// Compute the rounded averages of the four 8x8 quadrants of the 16x16 block
+// whose top-left pixel is at column 'x16_idx', row 'y16_idx' of 's' (row pitch
+// 'p'). Results are stored in raster order: avg[0] top-left, avg[1] top-right,
+// avg[2] bottom-left, avg[3] bottom-right.
+void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  const uint8_t *const top_half = s + y16_idx * p + x16_idx;
+  const uint8_t *const bottom_half = s + (y16_idx + 8) * p + x16_idx;
+
+  avg[0] = aom_avg_8x8_neon(top_half, p);
+  avg[1] = aom_avg_8x8_neon(top_half + 8, p);
+  avg[2] = aom_avg_8x8_neon(bottom_half, p);
+  avg[3] = aom_avg_8x8_neon(bottom_half + 8, p);
+}
+
+// Sum of absolute values of 'length' 16-bit coefficients. Coefficients are
+// consumed 16 at a time and the loop stops only when the remaining count is
+// exactly 0, so 'length' must be a positive multiple of 16.
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
+  int32x4_t sum_a = vdupq_n_s32(0);
+  int32x4_t sum_b = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t mag_a = vabsq_s16(vld1q_s16(coeff));
+    const int16x8_t mag_b = vabsq_s16(vld1q_s16(coeff + 8));
+
+    // Pairwise widen to 32 bits while accumulating.
+    sum_a = vpadalq_s16(sum_a, mag_a);
+    sum_b = vpadalq_s16(sum_b, mag_b);
+
+    coeff += 16;
+    length -= 16;
+  } while (length != 0);
+
+  return horizontal_add_s32x4(vaddq_s32(sum_a, sum_b));
+}
+
+// Column projection: for each of the 'width' columns of 'ref', sum the pixels
+// over 'height' rows and store the sum shifted right by 'norm_factor' in
+// hbuf[column]. Columns are handled 16 at a time and rows 4 at a time (see
+// the asserts).
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  assert(width % 16 == 0);
+  assert(height % 4 == 0);
+
+  // A negative shift count makes vshlq_s16 shift right.
+  const int16x8_t right_shift = vdupq_n_s16(-norm_factor);
+
+  int col = 0;
+  do {
+    const uint8_t *row = ref + col;
+    // Two independent accumulator pairs: rows {0, 1} and rows {2, 3} of each
+    // group of four.
+    uint16x8_t acc01_lo = vdupq_n_u16(0);
+    uint16x8_t acc01_hi = vdupq_n_u16(0);
+    uint16x8_t acc23_lo = vdupq_n_u16(0);
+    uint16x8_t acc23_hi = vdupq_n_u16(0);
+
+    int rows_left = height;
+    do {
+      const uint8x16_t r0 = vld1q_u8(row + 0 * ref_stride);
+      const uint8x16_t r1 = vld1q_u8(row + 1 * ref_stride);
+      const uint8x16_t r2 = vld1q_u8(row + 2 * ref_stride);
+      const uint8x16_t r3 = vld1q_u8(row + 3 * ref_stride);
+
+      acc01_lo =
+          vaddq_u16(acc01_lo, vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)));
+      acc01_hi =
+          vaddq_u16(acc01_hi, vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)));
+      acc23_lo =
+          vaddq_u16(acc23_lo, vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)));
+      acc23_hi =
+          vaddq_u16(acc23_hi, vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)));
+
+      row += 4 * ref_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+
+    const uint16x8_t total_lo = vaddq_u16(acc01_lo, acc23_lo);
+    const uint16x8_t total_hi = vaddq_u16(acc01_hi, acc23_hi);
+
+    vst1q_s16(hbuf + col,
+              vshlq_s16(vreinterpretq_s16_u16(total_lo), right_shift));
+    vst1q_s16(hbuf + col + 8,
+              vshlq_s16(vreinterpretq_s16_u16(total_hi), right_shift));
+
+    col += 16;
+  } while (col < width);
+}
+
+// Row projection: for each of the 'height' rows of 'ref', sum the first
+// 'width' pixels and store the sum shifted right by 'norm_factor' in
+// vbuf[row]. Rows are handled 4 at a time and columns 16 at a time (see the
+// asserts).
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  assert(width % 16 == 0);
+  assert(height % 4 == 0);
+
+  // A negative shift count makes vshl_s16 shift right.
+  const int16x4_t right_shift = vdup_n_s16(-norm_factor);
+
+  int row = 0;
+  do {
+    uint16x8_t row_sums[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                               vdupq_n_u16(0) };
+
+    int col = 0;
+    do {
+      // Pairwise add neighbouring bytes into the 16-bit accumulators.
+      row_sums[0] =
+          vpadalq_u8(row_sums[0], vld1q_u8(ref + 0 * ref_stride + col));
+      row_sums[1] =
+          vpadalq_u8(row_sums[1], vld1q_u8(ref + 1 * ref_stride + col));
+      row_sums[2] =
+          vpadalq_u8(row_sums[2], vld1q_u8(ref + 2 * ref_stride + col));
+      row_sums[3] =
+          vpadalq_u8(row_sums[3], vld1q_u8(ref + 3 * ref_stride + col));
+      col += 16;
+    } while (col < width);
+
+    // Reduce each of the four vectors to a single total, one per lane.
+    const uint16x4_t totals = vmovn_u32(horizontal_add_4d_u16x8(row_sums));
+    vst1_s16(vbuf + row, vshl_s16(vreinterpret_s16_u16(totals), right_shift));
+
+    ref += 4 * ref_stride;
+    row += 4;
+  } while (row < height);
+}
+
+// Sum of absolute values of 'length' transform coefficients.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
+// Coefficients are consumed 16 at a time and the loop stops only when the
+// remaining count is exactly 0.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  int32x4_t sum_even = zero;
+  int32x4_t sum_odd = zero;
+
+  do {
+    const int32x4_t c0 = vld1q_s32(coeff + 0);
+    const int32x4_t c1 = vld1q_s32(coeff + 4);
+    const int32x4_t c2 = vld1q_s32(coeff + 8);
+    const int32x4_t c3 = vld1q_s32(coeff + 12);
+
+    // Absolute-difference-accumulate against zero adds |c| to the running sum.
+    sum_even = vabaq_s32(sum_even, c0, zero);
+    sum_odd = vabaq_s32(sum_odd, c1, zero);
+    sum_even = vabaq_s32(sum_even, c2, zero);
+    sum_odd = vabaq_s32(sum_odd, c3, zero);
+
+    coeff += 16;
+    length -= 16;
+  } while (length != 0);
+
+  // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+  return horizontal_add_s32x4(vaddq_s32(sum_even, sum_odd));
+}
+
+// Variance-style measure of the difference between two vectors of
+// (4 << bwl) 16-bit values:
+//   sum(diff^2) - (|sum(diff)|^2 >> (bwl + 2)),  where diff = ref - src.
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
+  assert(bwl >= 2 && bwl <= 5);
+  int remaining = 4 << bwl;
+
+  // First group of 8 initialises the accumulators.
+  // delta: dynamic range [-510, 510] 10 (signed) bits.
+  int16x8_t delta = vsubq_s16(vld1q_s16(ref), vld1q_s16(src));
+  // delta_sum: dynamic range 16 * delta -> [-8160, 8160], 14 (signed) bits.
+  int16x8_t delta_sum = delta;
+  // Squares: dynamic range 2 * 16 * delta^2 -> [0, 8,323,200], 24 (signed)
+  // bits.
+  int32x4_t sq_lo = vmull_s16(vget_low_s16(delta), vget_low_s16(delta));
+  int32x4_t sq_hi = vmull_s16(vget_high_s16(delta), vget_high_s16(delta));
+  remaining -= 8;
+
+  do {
+    ref += 8;
+    src += 8;
+
+    delta = vsubq_s16(vld1q_s16(ref), vld1q_s16(src));
+    delta_sum = vaddq_s16(delta_sum, delta);
+
+    sq_lo = vmlal_s16(sq_lo, vget_low_s16(delta), vget_low_s16(delta));
+    sq_hi = vmlal_s16(sq_hi, vget_high_s16(delta), vget_high_s16(delta));
+
+    remaining -= 8;
+  } while (remaining != 0);
+
+  // Dynamic range [0, 65280], 16 (unsigned) bits.
+  const uint32_t mean_abs = abs(horizontal_add_s16x8(delta_sum));
+  const int32_t sse = horizontal_add_s32x4(vaddq_s32(sq_lo, sq_hi));
+
+  // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+  return sse - ((mean_abs * mean_abs) >> (bwl + 2));
+}
+
+// Find the smallest and largest absolute difference between corresponding
+// pixels of two 8x8 blocks.
+//   a, a_stride: first block and its row pitch in bytes.
+//   b, b_stride: second block and its row pitch in bytes.
+//   min, max:    outputs, each in the range [0, 255].
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                         int b_stride, int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = load_u8_8x2(a + 0 * a_stride, a_stride);
+  const uint8x16_t a23 = load_u8_8x2(a + 2 * a_stride, a_stride);
+  const uint8x16_t a45 = load_u8_8x2(a + 4 * a_stride, a_stride);
+  const uint8x16_t a67 = load_u8_8x2(a + 6 * a_stride, a_stride);
+
+  const uint8x16_t b01 = load_u8_8x2(b + 0 * b_stride, b_stride);
+  const uint8x16_t b23 = load_u8_8x2(b + 2 * b_stride, b_stride);
+  const uint8x16_t b45 = load_u8_8x2(b + 4 * b_stride, b_stride);
+  const uint8x16_t b67 = load_u8_8x2(b + 6 * b_stride, b_stride);
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Element-wise max and min between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+#if AOM_ARCH_AARCH64
+  // The across-vector reductions return a scalar. Assign it as an integer so
+  // the result does not depend on which byte of the int is written.
+  *max = vmaxvq_u8(ab07_max);
+  *min = vminvq_u8(ab07_min);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  // NOTE(review): this writes only the byte at the lowest address of each
+  // int, so it yields the intended value on little-endian targets only.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_pred_neon.c b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..b17f7fca7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+// Rounded average of two predictions: comp_pred = (pred + ref + 1) >> 1.
+//
+// 'pred' and 'comp_pred' are stored contiguously (row pitch == width), while
+// 'ref' uses 'ref_stride'. Widths above 8 are processed 16 pixels at a time
+// and the column loop stops only at exactly 'width', so such widths must be
+// multiples of 16. Width 8 handles two rows per iteration and width 4 handles
+// four.
+void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  if (width > 8) {
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t p = vld1q_u8(pred + x);
+        const uint8x16_t r = vld1q_u8(ref + x);
+
+        vst1q_u8(comp_pred + x, vrhaddq_u8(p, r));
+        x += 16;
+      } while (x != width);
+
+      ref += ref_stride;
+      pred += width;
+      comp_pred += width;
+    } while (--height != 0);
+    return;
+  }
+
+  if (width == 8) {
+    int row_pairs = height / 2;
+
+    do {
+      // Two contiguous prediction rows against two strided reference rows.
+      const uint8x16_t p = vld1q_u8(pred);
+      const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+      vst1q_u8(comp_pred, vrhaddq_u8(p, r));
+
+      ref += 2 * ref_stride;
+      pred += 16;
+      comp_pred += 16;
+    } while (--row_pairs != 0);
+    return;
+  }
+
+  assert(width == 4);
+  int row_quads = height / 4;
+
+  do {
+    // Four contiguous prediction rows against four strided reference rows.
+    const uint8x16_t p = vld1q_u8(pred);
+    const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
+    vst1q_u8(comp_pred, vrhaddq_u8(p, r));
+
+    ref += 4 * ref_stride;
+    pred += 16;
+    comp_pred += 16;
+  } while (--row_quads != 0);
+}
+
+// Distance-weighted average of two predictions, using the forward and
+// backward offsets from 'jcp_param' as the weights passed to the
+// dist_wtd_avg_* helpers.
+//
+// 'pred' and 'comp_pred' are stored contiguously (row pitch == width), while
+// 'ref' uses 'ref_stride'. Widths above 8 are processed 16 pixels at a time
+// and the column loop stops only at exactly 'width'. Widths 8 and 4 both
+// handle two rows per iteration.
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+                                     int width, int height, const uint8_t *ref,
+                                     int ref_stride,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+  if (width > 8) {
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t p = vld1q_u8(pred + x);
+        const uint8x16_t r = vld1q_u8(ref + x);
+
+        vst1q_u8(comp_pred + x,
+                 dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset));
+        x += 16;
+      } while (x != width);
+
+      ref += ref_stride;
+      pred += width;
+      comp_pred += width;
+    } while (--height != 0);
+    return;
+  }
+
+  int row_pairs = height / 2;
+
+  if (width == 8) {
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+      vst1q_u8(comp_pred, dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset));
+
+      ref += 2 * ref_stride;
+      pred += 16;
+      comp_pred += 16;
+    } while (--row_pairs != 0);
+    return;
+  }
+
+  assert(width == 4);
+  const uint8x8_t fwd_offset_lo = vget_low_u8(fwd_offset);
+  const uint8x8_t bck_offset_lo = vget_low_u8(bck_offset);
+
+  do {
+    const uint8x8_t p = vld1_u8(pred);
+    const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+    vst1_u8(comp_pred, dist_wtd_avg_u8x8(r, p, fwd_offset_lo, bck_offset_lo));
+
+    ref += 2 * ref_stride;
+    pred += 8;
+    comp_pred += 8;
+  } while (--row_pairs != 0);
+}
+
+// Blend 'ref' and 'pred' under a per-pixel mask with alpha_blend_a64_*.
+//
+// The mask is applied with 'ref' as the first blend source and 'pred' as the
+// second; a non-zero 'invert_mask' swaps the two. 'pred' and 'comp_pred' are
+// stored contiguously (row pitch == width), 'ref' uses 'ref_stride' and
+// 'mask' uses 'mask_stride'. Widths above 8 are processed 16 pixels at a time
+// and the column loop stops only at exactly 'width'. Width 8 handles one row
+// per iteration and width 4 handles two.
+void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask) {
+  const uint8_t *src0;
+  const uint8_t *src1;
+  int src_stride0;
+  int src_stride1;
+
+  if (invert_mask) {
+    src0 = pred;
+    src_stride0 = width;
+    src1 = ref;
+    src_stride1 = ref_stride;
+  } else {
+    src0 = ref;
+    src_stride0 = ref_stride;
+    src1 = pred;
+    src_stride1 = width;
+  }
+
+  if (width > 8) {
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t s0 = vld1q_u8(src0 + x);
+        const uint8x16_t s1 = vld1q_u8(src1 + x);
+        const uint8x16_t m0 = vld1q_u8(mask + x);
+
+        vst1q_u8(comp_pred + x, alpha_blend_a64_u8x16(m0, s0, s1));
+        x += 16;
+      } while (x != width);
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += width;
+    } while (--height != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      const uint8x8_t s0 = vld1_u8(src0);
+      const uint8x8_t s1 = vld1_u8(src1);
+      const uint8x8_t m0 = vld1_u8(mask);
+
+      vst1_u8(comp_pred, alpha_blend_a64_u8x8(m0, s0, s1));
+
+      src0 += src_stride0;
+      src1 += src_stride1;
+      mask += mask_stride;
+      comp_pred += 8;
+    } while (--height != 0);
+    return;
+  }
+
+  assert(width == 4);
+  int row_pairs = height / 2;
+
+  do {
+    // Each load packs two 4-pixel rows into one 8-byte vector.
+    const uint8x8_t s0 = load_unaligned_u8(src0, src_stride0);
+    const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
+    const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+    vst1_u8(comp_pred, alpha_blend_a64_u8x8(m0, s0, s1));
+
+    src0 += 2 * src_stride0;
+    src1 += 2 * src_stride1;
+    mask += 2 * mask_stride;
+    comp_pred += 8;
+  } while (--row_pairs != 0);
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_sve.c b/third_party/aom/aom_dsp/arm/avg_sve.c
new file mode 100644
index 0000000000..bbf5a9447c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_sve.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+ // Returns sum((ref - src)^2) minus the squared sum of differences scaled
+ // down by (bwl + 2) bits, over 4 << bwl 16-bit elements. Sixteen elements
+ // are consumed per outer iteration, split across two accumulator pairs.
+ int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) {
+   assert(bwl >= 2 && bwl <= 5);
+
+   // One running difference sum and one running squared-difference sum per
+   // 8-lane half of each 16-element step.
+   int16x8_t diff_sum[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+   int64x2_t sq_sum[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+   int remaining = 4 << bwl;
+   do {
+     for (int half = 0; half < 2; ++half) {
+       const int16x8_t r = vld1q_s16(ref + 8 * half);
+       const int16x8_t s = vld1q_s16(src + 8 * half);
+       const int16x8_t d = vsubq_s16(r, s);
+
+       // 16-bit accumulation of the raw differences.
+       diff_sum[half] = vaddq_s16(diff_sum[half], d);
+       // 64-bit accumulation of d * d via the 16-bit dot product.
+       sq_sum[half] = aom_sdotq_s16(sq_sum[half], d, d);
+     }
+
+     ref += 16;
+     src += 16;
+     remaining -= 16;
+   } while (remaining != 0);
+
+   // Fold both halves, then reduce across lanes.
+   const int16x8_t diff_total = vaddq_s16(diff_sum[0], diff_sum[1]);
+   const uint32_t mean_abs = abs(vaddlvq_s16(diff_total));
+   const int64x2_t sq_total = vaddq_s64(sq_sum[0], sq_sum[1]);
+   const int64_t sse = vaddvq_s64(sq_total);
+
+   // The square is evaluated in unsigned 32-bit arithmetic.
+   const uint32_t mean_sq = mean_abs * mean_abs;
+   return (int)(sse - (mean_sq >> (bwl + 2)));
+ }
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 0000000000..1bc3b80310
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+ // Blends eight 16-bit intermediate values: m * a + (MAX_ALPHA - m) * b is
+ // formed in 32 bits, shifted down by AOM_BLEND_A64_ROUND_BITS (truncating),
+ // reduced by round_offset with unsigned saturation, then rounded and
+ // narrowed with saturation to eight bytes.
+ uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+                                     uint16x8_t round_offset) {
+   const uint16x8_t max_alpha = vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA);
+   const uint16x8_t m_complement = vsubq_u16(max_alpha, m);
+
+   // Low four lanes.
+   uint32x4_t acc_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+   acc_lo = vmlal_u16(acc_lo, vget_low_u16(m_complement), vget_low_u16(b));
+
+   // High four lanes.
+   uint32x4_t acc_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
+   acc_hi = vmlal_u16(acc_hi, vget_high_u16(m_complement), vget_high_u16(b));
+
+   const uint16x8_t scaled =
+       vcombine_u16(vshrn_n_u32(acc_lo, AOM_BLEND_A64_ROUND_BITS),
+                    vshrn_n_u32(acc_hi, AOM_BLEND_A64_ROUND_BITS));
+
+   const uint16x8_t unbiased = vqsubq_u16(scaled, round_offset);
+
+   return vqrshrn_n_u16(unbiased,
+                        2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+ }
+ /*
+  * Blends two CONV_BUF_TYPE buffers (src0, src1) into the 8-bit dst under a
+  * per-pixel mask. Every group of eight outputs is produced by
+  * alpha_blend_a64_d16_u16x8() using the rounding offset derived below.
+  *
+  * subw / subh choose how mask bytes map onto output pixels:
+  *   subw == 0, subh == 0: one mask byte per output pixel.
+  *   subw == 1, subh == 1: rounded average of a 2x2 group of mask bytes.
+  *   subw == 1, subh == 0: rounded average of two horizontally adjacent bytes.
+  *   any other combination: rounded average of two vertically adjacent bytes.
+  *
+  * Within each case, widths of 8 or more are handled eight pixels at a time;
+  * the remaining width (4, given the asserts) is handled two rows at a time.
+  * conv_params is not read.
+  */
+ void aom_lowbd_blend_a64_d16_mask_neon(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ (void)conv_params;  // Intentionally unused.
+
+ const int bd = 8;  // This is the 8-bit (low bit-depth) path.
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t offset_vec = vdupq_n_u16(round_offset);  // Same offset in all eight lanes.
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {  // Mask is at output resolution.
+ if (w >= 8) {  // Eight outputs per inner iteration.
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));  // Widen mask bytes to 16 bits.
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // Narrow path: two 4-pixel rows share one vector.
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 1) {  // Mask subsampled in both directions.
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);  // 16 mask bytes from each of two rows.
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;  // Two mask rows feed each output row.
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);  // Four mask rows of 8 bytes cover two 4-pixel output rows.
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {  // Mask subsampled horizontally only.
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {  // Remaining case: mask rows are averaged in vertical pairs.
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);  // Mask rows 0 and 2.
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);  // Mask rows 1 and 3.
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+ }
+ /*
+  * Blends the 8-bit buffers src0 and src1 into dst under a per-pixel mask,
+  * using alpha_blend_a64_u8x16() / alpha_blend_a64_u8x8().
+  *
+  * subw / subh choose how mask bytes map onto output pixels:
+  *   both zero: one mask byte per output pixel.
+  *   both one: rounded average of a 2x2 group of mask bytes.
+  *   subw == 1, subh == 0: rounded average of two horizontally adjacent bytes.
+  *   any other combination: rounded average of two vertically adjacent bytes.
+  *
+  * Each case has three width paths: w > 8 (sixteen pixels per step), w == 8
+  * (one 8-pixel row per step) and a narrow path that packs two 4-pixel rows
+  * into one vector.
+  *
+  * NOTE(review): the asserts only require w >= 1 and h >= 1, but the narrow
+  * path always moves 4-pixel rows two at a time and steps h by 2 - presumably
+  * callers never pass w < 4 or an odd h on that path; confirm.
+  */
+ void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if ((subw | subh) == 0) {  // Mask is at output resolution.
+ if (w > 8) {  // Sixteen outputs per inner iteration.
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {  // One full 8-pixel row per iteration.
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // Narrow path: two 4-pixel rows share one vector.
+ do {
+ uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {  // Mask subsampled in both directions.
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);  // 32 mask bytes from each of two rows.
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+ uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;  // Two mask rows feed each output row.
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);  // Four mask rows of 8 bytes cover two 4-pixel output rows.
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {  // Mask subsampled horizontally only.
+ if (w > 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t m1 = vld1_u8(mask + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {  // Remaining case: mask rows are averaged in vertical pairs.
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);  // Mask rows 0 and 2.
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);  // Mask rows 1 and 3.
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+ }
diff --git a/third_party/aom/aom_dsp/arm/blend_neon.h b/third_party/aom/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000000..c8a03224e4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
+ // Blends sixteen bytes: each lane is m * a + (MAX_ALPHA - m) * b computed in
+ // 16 bits, then rounded and narrowed by AOM_BLEND_A64_ROUND_BITS.
+ static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+                                                uint8x16_t b) {
+   const uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+   const uint8x16_t m_complement = vsubq_u8(max_alpha, m);
+
+   // Low eight lanes.
+   uint16x8_t acc_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+   acc_lo = vmlal_u8(acc_lo, vget_low_u8(m_complement), vget_low_u8(b));
+
+   // High eight lanes.
+   uint16x8_t acc_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+   acc_hi = vmlal_u8(acc_hi, vget_high_u8(m_complement), vget_high_u8(b));
+
+   return vcombine_u8(vrshrn_n_u16(acc_lo, AOM_BLEND_A64_ROUND_BITS),
+                      vrshrn_n_u16(acc_hi, AOM_BLEND_A64_ROUND_BITS));
+ }
+
+ // Eight-lane variant of the byte blend: m * a + (MAX_ALPHA - m) * b in
+ // 16 bits, rounded and narrowed by AOM_BLEND_A64_ROUND_BITS.
+ static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+                                              uint8x8_t b) {
+   const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+   const uint8x8_t m_complement = vsub_u8(max_alpha, m);
+
+   const uint16x8_t weighted_a = vmull_u8(m, a);
+   const uint16x8_t weighted_sum = vmlal_u8(weighted_a, m_complement, b);
+
+   return vrshrn_n_u16(weighted_sum, AOM_BLEND_A64_ROUND_BITS);
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Blends eight 16-bit lanes: a * m + b * (MAX_ALPHA - m) computed in 32 bits,
+ // then rounded and narrowed by AOM_BLEND_A64_ROUND_BITS.
+ static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+                                                uint16x8_t b) {
+   const uint16x8_t max_alpha = vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA);
+   const uint16x8_t m_complement = vsubq_u16(max_alpha, m);
+
+   // Low four lanes.
+   uint32x4_t acc_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+   acc_lo = vmlal_u16(acc_lo, vget_low_u16(b), vget_low_u16(m_complement));
+
+   // High four lanes.
+   uint32x4_t acc_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+   acc_hi = vmlal_u16(acc_hi, vget_high_u16(b), vget_high_u16(m_complement));
+
+   return vcombine_u16(vrshrn_n_u32(acc_lo, AOM_BLEND_A64_ROUND_BITS),
+                       vrshrn_n_u32(acc_hi, AOM_BLEND_A64_ROUND_BITS));
+ }
+
+ // Four-lane 16-bit blend: m * a + (MAX_ALPHA - m) * b computed in 32 bits,
+ // then rounded and narrowed by AOM_BLEND_A64_ROUND_BITS.
+ static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+                                                uint16x4_t b) {
+   const uint16x4_t max_alpha = vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA);
+   const uint16x4_t m_complement = vsub_u16(max_alpha, m);
+
+   const uint32x4_t weighted_a = vmull_u16(m, a);
+   const uint32x4_t weighted_sum = vmlal_u16(weighted_a, m_complement, b);
+
+   return vrshrn_n_u32(weighted_sum, AOM_BLEND_A64_ROUND_BITS);
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // Lane-wise rounding halving add of two 8-byte vectors.
+ static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+   const uint8x8_t rounded_avg = vrhadd_u8(a, b);
+   return rounded_avg;
+ }
+
+ // Lane-wise rounding halving add of two 16-byte vectors.
+ static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+   const uint8x16_t rounded_avg = vrhaddq_u8(a, b);
+   return rounded_avg;
+ }
+
+ // Sums adjacent byte pairs of a (low half of the result) and b (high half),
+ // then halves each sum with rounding.
+ static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+   const uint8x8_t pair_sums = vpadd_u8(a, b);
+   return vrshr_n_u8(pair_sums, 1);
+ }
+
+ // Sums adjacent byte pairs of a (low half of the result) and b (high half),
+ // then halves each sum with rounding. Without AArch64 the 128-bit pairwise
+ // add is assembled from 64-bit pairwise adds.
+ static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+ #if AOM_ARCH_AARCH64
+   const uint8x16_t pair_sums = vpaddq_u8(a, b);
+ #else
+   const uint8x8_t pair_sums_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+   const uint8x8_t pair_sums_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+   const uint8x16_t pair_sums = vcombine_u8(pair_sums_a, pair_sums_b);
+ #endif  // AOM_ARCH_AARCH64
+   return vrshrq_n_u8(pair_sums, 1);
+ }
+
+ // Pairwise-sums (a, c) and (b, d), adds the two results with unsigned
+ // saturation, then divides by four with rounding.
+ static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+                                                   uint8x8_t c, uint8x8_t d) {
+   const uint8x8_t pairs_ac = vpadd_u8(a, c);
+   const uint8x8_t pairs_bd = vpadd_u8(b, d);
+   const uint8x8_t quad_sums = vqadd_u8(pairs_ac, pairs_bd);
+   return vrshr_n_u8(quad_sums, 2);
+ }
+
+ // 16-byte counterpart of avg_blend_pairwise_u8x8_4: pairwise-sums (a, c) and
+ // (b, d), adds the results with unsigned saturation, then divides by four
+ // with rounding. Without AArch64 the 128-bit pairwise adds are assembled
+ // from 64-bit ones.
+ static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+                                                     uint8x16_t c,
+                                                     uint8x16_t d) {
+ #if AOM_ARCH_AARCH64
+   const uint8x16_t pairs_ac = vpaddq_u8(a, c);
+   const uint8x16_t pairs_bd = vpaddq_u8(b, d);
+ #else
+   const uint8x8_t pairs_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+   const uint8x8_t pairs_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+   const uint8x16_t pairs_ac = vcombine_u8(pairs_a, pairs_c);
+
+   const uint8x8_t pairs_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+   const uint8x8_t pairs_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+   const uint8x16_t pairs_bd = vcombine_u8(pairs_b, pairs_d);
+ #endif  // AOM_ARCH_AARCH64
+   return vrshrq_n_u8(vqaddq_u8(pairs_ac, pairs_bd), 2);
+ }
+
+#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000000..f2ada93e95
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of a
+ // 4-wide, bh-tall block of int16 values, two rows per iteration.
+ static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+                                             int bh, int *x_sum,
+                                             int64_t *x2_sum) {
+   int32x4_t acc_sum = vdupq_n_s32(0);
+   int32x4_t acc_sq = vdupq_n_s32(0);
+   int rows_left = bh;
+
+   do {
+     const int16x4_t row0 = vld1_s16(data);
+     const int16x4_t row1 = vld1_s16(data + stride);
+
+     // Pairwise-widen both rows into the 32-bit running sum.
+     acc_sum = vpadalq_s16(acc_sum, vcombine_s16(row0, row1));
+
+     acc_sq = vmlal_s16(acc_sq, row0, row0);
+     acc_sq = vmlal_s16(acc_sq, row1, row1);
+
+     data += 2 * stride;
+     rows_left -= 2;
+   } while (rows_left != 0);
+
+   *x_sum = horizontal_add_s32x4(acc_sum);
+   *x2_sum = horizontal_long_add_s32x4(acc_sq);
+ }
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of an
+ // 8-wide, bh-tall block of int16 values, one row per iteration.
+ static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+                                             int bh, int *x_sum,
+                                             int64_t *x2_sum) {
+   int32x4_t acc_sum = vdupq_n_s32(0);
+   int32x4_t acc_sq = vdupq_n_s32(0);
+   int rows_left = bh;
+
+   // Overflow note carried over from the original author: with 12-bit input,
+   // up to 127 squares fit in a signed 32-bit lane, and with bh at most 32
+   // the 32-bit square accumulator is not expected to overflow.
+
+   do {
+     const int16x8_t row = vld1q_s16(data);
+     const int16x4_t row_lo = vget_low_s16(row);
+     const int16x4_t row_hi = vget_high_s16(row);
+
+     acc_sum = vpadalq_s16(acc_sum, row);
+
+     acc_sq = vmlal_s16(acc_sq, row_lo, row_lo);
+     acc_sq = vmlal_s16(acc_sq, row_hi, row_hi);
+
+     data += stride;
+   } while (--rows_left != 0);
+
+   *x_sum = horizontal_add_s32x4(acc_sum);
+   *x2_sum = horizontal_long_add_s32x4(acc_sq);
+ }
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of a
+ // bw-wide, bh-tall block of int16 values, eight columns at a time. Squares
+ // are gathered in 32-bit lanes for a limited batch of rows and then folded
+ // into a 64-bit accumulator.
+ static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+                                               int bw, int bh, int *x_sum,
+                                               int64_t *x2_sum) {
+   int32x4_t acc_sum = vdupq_n_s32(0);
+   int64x2_t acc_sq = vdupq_n_s64(0);
+
+   // Batch size carried over from the original author's analysis: with 12-bit
+   // input, 127 squares fit in one signed 32-bit lane, so (127 * 4) / bw rows
+   // can be gathered across four lanes before widening to 64 bits.
+   const int rows_per_batch = (127 * 4) / bw;
+   int batch_end = bh > rows_per_batch ? rows_per_batch : bh;
+
+   int row = 0;
+   do {
+     int32x4_t batch_sq = vdupq_n_s32(0);
+
+     do {
+       int col = 0;
+       do {
+         const int16x8_t v = vld1q_s16(data + col);
+         const int16x4_t v_lo = vget_low_s16(v);
+         const int16x4_t v_hi = vget_high_s16(v);
+
+         acc_sum = vpadalq_s16(acc_sum, v);
+
+         batch_sq = vmlal_s16(batch_sq, v_lo, v_lo);
+         batch_sq = vmlal_s16(batch_sq, v_hi, v_hi);
+
+         col += 8;
+       } while (col != bw);
+
+       data += stride;
+       ++row;
+     } while (row < batch_end && row < bh);
+
+     // Fold this batch into the 64-bit total and open the next batch.
+     acc_sq = vpadalq_s32(acc_sq, batch_sq);
+     batch_end += rows_per_batch;
+   } while (row < bh);
+
+   *x_sum = horizontal_add_s32x4(acc_sum);
+   *x2_sum = horizontal_add_s64x2(acc_sq);
+ }
+
+ // Writes the sum and the sum of squares of a bw x bh block of int16 values
+ // to *x_sum and *x2_sum, dispatching on the block width.
+ void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+                               int *x_sum, int64_t *x2_sum) {
+   switch (bw) {
+     case 4:
+       get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+       break;
+     case 8:
+       get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+       break;
+     default:
+       // The wide kernel walks each row in steps of eight columns.
+       assert(bw % 8 == 0);
+       get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+       break;
+   }
+ }
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
new file mode 100644
index 0000000000..18bdc5dbfe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of a
+ // 4-wide, bh-tall block of int16 values, two rows per iteration. Squares go
+ // straight into 64-bit lanes through the 16-bit dot product.
+ static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+   int32x4_t acc_sum = vdupq_n_s32(0);
+   int64x2_t acc_sq = vdupq_n_s64(0);
+   int rows_left = bh;
+
+   do {
+     const int16x4_t row0 = vld1_s16(data);
+     const int16x4_t row1 = vld1_s16(data + stride);
+     const int16x8_t two_rows = vcombine_s16(row0, row1);
+
+     acc_sum = vpadalq_s16(acc_sum, two_rows);
+     acc_sq = aom_sdotq_s16(acc_sq, two_rows, two_rows);
+
+     data += 2 * stride;
+     rows_left -= 2;
+   } while (rows_left != 0);
+
+   *x_sum = vaddvq_s32(acc_sum);
+   *x2_sum = vaddvq_s64(acc_sq);
+ }
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of an
+ // 8-wide, bh-tall block of int16 values. Two rows are handled per
+ // iteration, each with its own accumulator pair.
+ static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride,
+                                            int bh, int *x_sum,
+                                            int64_t *x2_sum) {
+   int32x4_t acc_sum_even = vdupq_n_s32(0);
+   int32x4_t acc_sum_odd = vdupq_n_s32(0);
+   int64x2_t acc_sq_even = vdupq_n_s64(0);
+   int64x2_t acc_sq_odd = vdupq_n_s64(0);
+   int rows_left = bh;
+
+   do {
+     const int16x8_t row_even = vld1q_s16(data);
+     const int16x8_t row_odd = vld1q_s16(data + stride);
+
+     acc_sum_even = vpadalq_s16(acc_sum_even, row_even);
+     acc_sum_odd = vpadalq_s16(acc_sum_odd, row_odd);
+
+     acc_sq_even = aom_sdotq_s16(acc_sq_even, row_even, row_even);
+     acc_sq_odd = aom_sdotq_s16(acc_sq_odd, row_odd, row_odd);
+
+     data += 2 * stride;
+     rows_left -= 2;
+   } while (rows_left != 0);
+
+   *x_sum = vaddvq_s32(vaddq_s32(acc_sum_even, acc_sum_odd));
+   *x2_sum = vaddvq_s64(vaddq_s64(acc_sq_even, acc_sq_odd));
+ }
+
+ // Accumulates the sum (*x_sum) and the sum of squares (*x2_sum) of a
+ // bw-wide, bh-tall block of int16 values, sixteen columns per step, with one
+ // accumulator pair for each 8-lane half.
+ static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride,
+                                              int bw, int bh, int *x_sum,
+                                              int64_t *x2_sum) {
+   int32x4_t acc_sum_a = vdupq_n_s32(0);
+   int32x4_t acc_sum_b = vdupq_n_s32(0);
+   int64x2_t acc_sq_a = vdupq_n_s64(0);
+   int64x2_t acc_sq_b = vdupq_n_s64(0);
+   int rows_left = bh;
+
+   do {
+     int col = 0;
+     do {
+       const int16x8_t va = vld1q_s16(data + col);
+       const int16x8_t vb = vld1q_s16(data + col + 8);
+
+       acc_sum_a = vpadalq_s16(acc_sum_a, va);
+       acc_sum_b = vpadalq_s16(acc_sum_b, vb);
+
+       acc_sq_a = aom_sdotq_s16(acc_sq_a, va, va);
+       acc_sq_b = aom_sdotq_s16(acc_sq_b, vb, vb);
+
+       col += 16;
+     } while (col != bw);
+
+     data += stride;
+   } while (--rows_left != 0);
+
+   *x_sum = vaddvq_s32(vaddq_s32(acc_sum_a, acc_sum_b));
+   *x2_sum = vaddvq_s64(vaddq_s64(acc_sq_a, acc_sq_b));
+ }
+
+ // Writes the sum and the sum of squares of a bw x bh block of int16 values
+ // to *x_sum and *x2_sum, dispatching on the block width.
+ void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+   switch (bw) {
+     case 4:
+       get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum);
+       break;
+     case 8:
+       get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum);
+       break;
+     default:
+       // The wide kernel walks each row in steps of sixteen columns.
+       assert(bw % 16 == 0);
+       get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum);
+       break;
+   }
+ }
diff --git a/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000000..19c9b04c57
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+ // Weighted average of eight bytes: (a * wta + b * wtb) computed in 16 bits,
+ // then rounded and narrowed by DIST_PRECISION_BITS.
+ static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+                                           uint8x8_t wta, uint8x8_t wtb) {
+   const uint16x8_t weighted_a = vmull_u8(a, wta);
+   const uint16x8_t weighted_sum = vmlal_u8(weighted_a, b, wtb);
+
+   return vrshrn_n_u16(weighted_sum, DIST_PRECISION_BITS);
+ }
+
+ // Weighted average of four 16-bit lanes: (a * wta + b * wtb) computed in
+ // 32 bits, then rounded and narrowed by DIST_PRECISION_BITS.
+ static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+                                             uint16x4_t wta, uint16x4_t wtb) {
+   const uint32x4_t weighted_a = vmull_u16(a, wta);
+   const uint32x4_t weighted_sum = vmlal_u16(weighted_a, b, wtb);
+
+   return vrshrn_n_u32(weighted_sum, DIST_PRECISION_BITS);
+ }
+
+ // Weighted average of sixteen bytes: (a * wta + b * wtb) computed in
+ // 16 bits per half, then rounded and narrowed by DIST_PRECISION_BITS.
+ static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+                                             uint8x16_t wta, uint8x16_t wtb) {
+   // Low eight lanes.
+   uint16x8_t acc_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+   acc_lo = vmlal_u8(acc_lo, vget_low_u8(b), vget_low_u8(wtb));
+
+   // High eight lanes.
+   uint16x8_t acc_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+   acc_hi = vmlal_u8(acc_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+   return vcombine_u8(vrshrn_n_u16(acc_lo, DIST_PRECISION_BITS),
+                      vrshrn_n_u16(acc_hi, DIST_PRECISION_BITS));
+ }
+
+ // Weighted average of eight 16-bit lanes: (a * wta + b * wtb) computed in
+ // 32 bits per half, then rounded and narrowed by DIST_PRECISION_BITS.
+ static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+                                             uint16x8_t wta, uint16x8_t wtb) {
+   // Low four lanes.
+   uint32x4_t acc_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+   acc_lo = vmlal_u16(acc_lo, vget_low_u16(b), vget_low_u16(wtb));
+
+   // High four lanes.
+   uint32x4_t acc_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+   acc_hi = vmlal_u16(acc_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+   return vcombine_u16(vrshrn_n_u32(acc_lo, DIST_PRECISION_BITS),
+                       vrshrn_n_u32(acc_hi, DIST_PRECISION_BITS));
+ }
+
+#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/dot_sve.h b/third_party/aom/aom_dsp/arm/dot_sve.h
new file mode 100644
index 0000000000..cf49f23606
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dot_sve.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_
+#define AOM_AOM_DSP_ARM_DOT_SVE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+
+ // Unsigned 16-bit dot product accumulated into 64-bit lanes. The three Neon
+ // vectors are placed in SVE vectors whose remaining contents are undefined,
+ // svdot_u64 is applied, and the low 128 bits are read back.
+ static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x,
+                                        uint16x8_t y) {
+   const svuint64_t sve_acc = svset_neonq_u64(svundef_u64(), acc);
+   const svuint16_t sve_x = svset_neonq_u16(svundef_u16(), x);
+   const svuint16_t sve_y = svset_neonq_u16(svundef_u16(), y);
+
+   return svget_neonq_u64(svdot_u64(sve_acc, sve_x, sve_y));
+ }
+
+ // Signed 16-bit dot product accumulated into 64-bit lanes. The three Neon
+ // vectors are placed in SVE vectors whose remaining contents are undefined,
+ // svdot_s64 is applied, and the low 128 bits are read back.
+ static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+   const svint64_t sve_acc = svset_neonq_s64(svundef_s64(), acc);
+   const svint16_t sve_x = svset_neonq_s16(svundef_s16(), x);
+   const svint16_t sve_y = svset_neonq_s16(svundef_s16(), y);
+
+   return svget_neonq_s64(svdot_s64(sve_acc, sve_x, sve_y));
+ }
+
+#endif // AOM_AOM_DSP_ARM_DOT_SVE_H_
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000000..a4d6322f24
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// Shared core of the 4x4 forward DCT. Loads four rows of four int16 samples
+// (consecutive rows are `stride` elements apart), scales them by 16, then runs
+// two butterfly passes with a 4x4 transpose after the first pass only. The
+// results are returned through input_0..input_3; the callers apply the final
+// scaling and store the coefficients.
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+ int16x4_t *input_0, int16x4_t *input_1,
+ int16x4_t *input_2, int16x4_t *input_3) {
+ // Load each row and multiply it by 16 (left shift by 4).
+ *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ // A 64-bit 1 reinterpreted as four s16 lanes; presumably this leaves 1 in
+ // lane 0 and 0 elsewhere so that only the first element is bumped.
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ *input_0 = vadd_s16(*input_0, one);
+ }
+
+ // Two 1-D passes over the block; each pass consumes the previous outputs.
+ for (int i = 0; i < 2; ++i) {
+ // Rows 3 and 2 are packed in reverse order so that a single 8-lane add/sub
+ // yields in_0 +/- in_3 in the low half and in_1 +/- in_2 in the high half.
+ const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+ const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
+
+ // fdct_round_shift
+ // (rounding right shift by DCT_CONST_BITS, narrowing back to 16 bits)
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
+
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ // Only transpose the first pass
+ if (i == 0) {
+ transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
+ }
+
+ // Feed this pass's outputs to the next pass (or back to the caller).
+ *input_0 = out_0;
+ *input_1 = out_1;
+ *input_2 = out_2;
+ *input_3 = out_3;
+ }
+}
+
+// 4x4 forward DCT writing tran_low_t coefficients. The shared helper performs
+// the input scaling and both transform passes; this wrapper applies the final
+// (x + 1) >> 2 scaling and stores the 16 results contiguously.
+void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  int16x4_t row0, row1, row2, row3;
+  aom_fdct4x4_helper(input, stride, &row0, &row1, &row2, &row3);
+
+  // Not a true rounding shift: the bias is 1 although the shift is 2.
+  const int16x8_t bias = vdupq_n_s16(1);
+  const int16x8_t rows01 =
+      vshrq_n_s16(vaddq_s16(vcombine_s16(row0, row1), bias), 2);
+  const int16x8_t rows23 =
+      vshrq_n_s16(vaddq_s16(vcombine_s16(row2, row3), bias), 2);
+
+  store_s16q_to_tran_low(final_output, rows01);
+  store_s16q_to_tran_low(final_output + 8, rows23);
+}
+
+// 4x4 forward DCT writing 16-bit coefficients. The shared helper performs the
+// input scaling and both transform passes; this wrapper applies the final
+// (x + 1) >> 2 scaling and stores the 16 results contiguously.
+void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
+                         int stride) {
+  int16x4_t row0, row1, row2, row3;
+  aom_fdct4x4_helper(input, stride, &row0, &row1, &row2, &row3);
+
+  // Not a true rounding shift: the bias is 1 although the shift is 2.
+  const int16x8_t bias = vdupq_n_s16(1);
+  const int16x8_t rows01 =
+      vshrq_n_s16(vaddq_s16(vcombine_s16(row0, row1), bias), 2);
+  const int16x8_t rows23 =
+      vshrq_n_s16(vaddq_s16(vcombine_s16(row2, row3), bias), 2);
+
+  vst1q_s16(final_output, rows01);
+  vst1q_s16(final_output + 8, rows23);
+}
+
+// 8x8 forward DCT producing 16-bit coefficients. Loads eight rows of eight
+// int16 samples (rows are `stride` elements apart), pre-scales them by 4, runs
+// two passes of the 1-D transform with an 8x8 transpose at the end of each
+// pass, then divides every coefficient by two and stores the 64 results
+// contiguously (8 per row) to final_output.
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ // stage 1
+ // Load each row and multiply it by 4 (left shift by 2).
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ // Two passes; the transposed outputs of one pass are the inputs of the next.
+ for (int i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ // Butterfly of mirrored rows: sums feed the even outputs, differences the
+ // odd outputs.
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ // The products are formed in 32 bits, split into low/high vector halves.
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ {
+ // Rounding shift by DCT_CONST_BITS, narrowing back to 16 bits. The even
+ // output rows are packed two half-rows per vector; the transpose below
+ // untangles this layout.
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ // Odd half: v_x0..v_x3 are reused as scratch from here on.
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ // Rotations that produce the four odd output rows.
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ // Two rounds of vtrn (32-bit, then 16-bit) suffice because of the packed
+ // half-row layout documented below.
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from aom_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ // n >> 15 is -1 for negative lanes and 0 otherwise; the halving subtract
+ // below performs the "- sign, then >> 1" part in one instruction.
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000000..d0f59227db
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// One 4-point Hadamard pass applied lane-wise to four rows, in place. The
+// first butterfly stage uses halving add/sub, the second stage plain add/sub.
+static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
+                                         int16x4_t *a2, int16x4_t *a3) {
+  // Stage 1: halved sum and difference of each adjacent row pair.
+  const int16x4_t half_sum01 = vhadd_s16(*a0, *a1);
+  const int16x4_t half_dif01 = vhsub_s16(*a0, *a1);
+  const int16x4_t half_sum23 = vhadd_s16(*a2, *a3);
+  const int16x4_t half_dif23 = vhsub_s16(*a2, *a3);
+
+  // Stage 2: combine the two pairs and write the rows back.
+  *a0 = vadd_s16(half_sum01, half_sum23);
+  *a1 = vadd_s16(half_dif01, half_dif23);
+  *a2 = vsub_s16(half_sum01, half_sum23);
+  *a3 = vsub_s16(half_dif01, half_dif23);
+}
+
+// 4x4 Hadamard transform: a pass over the loaded rows, a 4x4 transpose, a
+// second pass, then the 16 coefficients are stored contiguously.
+void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  int16x4_t row[4];
+  for (int r = 0; r < 4; ++r) {
+    row[r] = vld1_s16(src_diff + r * src_stride);
+  }
+
+  hadamard_4x4_one_pass(&row[0], &row[1], &row[2], &row[3]);
+  transpose_elems_inplace_s16_4x4(&row[0], &row[1], &row[2], &row[3]);
+  hadamard_4x4_one_pass(&row[0], &row[1], &row[2], &row[3]);
+
+  for (int r = 0; r < 4; ++r) {
+    store_s16_to_tran_low(coeff + 4 * r, row[r]);
+  }
+}
+
+// One 8-point Hadamard pass applied lane-wise to eight rows, in place. Three
+// add/sub butterfly stages with no intermediate scaling; the rows are written
+// back in the order given by the final assignments.
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  // Stage 1: sum and difference of each adjacent row pair.
+  const int16x8_t sum01 = vaddq_s16(*a0, *a1);
+  const int16x8_t dif01 = vsubq_s16(*a0, *a1);
+  const int16x8_t sum23 = vaddq_s16(*a2, *a3);
+  const int16x8_t dif23 = vsubq_s16(*a2, *a3);
+  const int16x8_t sum45 = vaddq_s16(*a4, *a5);
+  const int16x8_t dif45 = vsubq_s16(*a4, *a5);
+  const int16x8_t sum67 = vaddq_s16(*a6, *a7);
+  const int16x8_t dif67 = vsubq_s16(*a6, *a7);
+
+  // Stage 2: combine pairs within the top (rows 0-3) and bottom (rows 4-7)
+  // halves.
+  const int16x8_t t0 = vaddq_s16(sum01, sum23);
+  const int16x8_t t1 = vaddq_s16(dif01, dif23);
+  const int16x8_t t2 = vsubq_s16(sum01, sum23);
+  const int16x8_t t3 = vsubq_s16(dif01, dif23);
+  const int16x8_t t4 = vaddq_s16(sum45, sum67);
+  const int16x8_t t5 = vaddq_s16(dif45, dif67);
+  const int16x8_t t6 = vsubq_s16(sum45, sum67);
+  const int16x8_t t7 = vsubq_s16(dif45, dif67);
+
+  // Stage 3: combine the two halves and write the rows back.
+  *a0 = vaddq_s16(t0, t4);
+  *a1 = vsubq_s16(t2, t6);
+  *a2 = vsubq_s16(t0, t4);
+  *a3 = vaddq_s16(t2, t6);
+  *a4 = vaddq_s16(t3, t7);
+  *a5 = vsubq_s16(t3, t7);
+  *a6 = vsubq_s16(t1, t5);
+  *a7 = vaddq_s16(t1, t5);
+}
+
+// 8x8 Hadamard transform writing tran_low_t coefficients: a pass over the
+// loaded rows, an 8x8 transpose, a second pass, then a contiguous store of the
+// eight result rows.
+void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  int16x8_t row[8];
+  for (int r = 0; r < 8; ++r) {
+    row[r] = vld1q_s16(src_diff + r * src_stride);
+  }
+
+  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
+                       &row[6], &row[7]);
+  transpose_elems_inplace_s16_8x8(&row[0], &row[1], &row[2], &row[3], &row[4],
+                                  &row[5], &row[6], &row[7]);
+  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
+                       &row[6], &row[7]);
+
+  // Skip the second transpose because it is not required.
+  for (int r = 0; r < 8; ++r) {
+    store_s16q_to_tran_low(coeff + 8 * r, row[r]);
+  }
+}
+
+// 8x8 Hadamard transform writing 16-bit coefficients: a pass over the loaded
+// rows, an 8x8 transpose, a second pass, then a contiguous store of the eight
+// result rows.
+void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  int16x8_t row[8];
+  for (int r = 0; r < 8; ++r) {
+    row[r] = vld1q_s16(src_diff + r * src_stride);
+  }
+
+  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
+                       &row[6], &row[7]);
+  transpose_elems_inplace_s16_8x8(&row[0], &row[1], &row[2], &row[3], &row[4],
+                                  &row[5], &row[6], &row[7]);
+  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
+                       &row[6], &row[7]);
+
+  // Skip the second transpose because it is not required.
+  for (int r = 0; r < 8; ++r) {
+    vst1q_s16(coeff + 8 * r, row[r]);
+  }
+}
+
+// Transforms two horizontally adjacent 8x8 blocks: the second block starts 8
+// samples to the right in src_diff and its 64 coefficients follow the first
+// block's in coeff.
+void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff,
+                                   ptrdiff_t src_stride, int16_t *coeff) {
+  aom_hadamard_lp_8x8_neon(src_diff, src_stride, coeff);
+  aom_hadamard_lp_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+}
+
+// 16x16 Hadamard transform with 16-bit coefficients. Each 8x8 quadrant is
+// transformed into its own 64-entry slice of coeff, then one more butterfly
+// stage combines the four slices in place, eight coefficients at a time.
+void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  // Quadrant order: top left, top right, bottom left, bottom right.
+  aom_hadamard_lp_8x8_neon(src_diff, src_stride, coeff);
+  aom_hadamard_lp_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  aom_hadamard_lp_8x8_neon(src_diff + 8 * src_stride, src_stride,
+                           coeff + 128);
+  aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+                           coeff + 192);
+
+  for (int idx = 0; idx < 64; idx += 8) {
+    int16_t *const q0 = coeff + idx;
+    int16_t *const q1 = q0 + 64;
+    int16_t *const q2 = q0 + 128;
+    int16_t *const q3 = q0 + 192;
+
+    const int16x8_t v0 = vld1q_s16(q0);
+    const int16x8_t v1 = vld1q_s16(q1);
+    const int16x8_t v2 = vld1q_s16(q2);
+    const int16x8_t v3 = vld1q_s16(q3);
+
+    // Halving add/sub first, then plain add/sub.
+    const int16x8_t half_sum01 = vhaddq_s16(v0, v1);
+    const int16x8_t half_dif01 = vhsubq_s16(v0, v1);
+    const int16x8_t half_sum23 = vhaddq_s16(v2, v3);
+    const int16x8_t half_dif23 = vhsubq_s16(v2, v3);
+
+    vst1q_s16(q0, vaddq_s16(half_sum01, half_sum23));
+    vst1q_s16(q1, vaddq_s16(half_dif01, half_dif23));
+    vst1q_s16(q2, vsubq_s16(half_sum01, half_sum23));
+    vst1q_s16(q3, vsubq_s16(half_dif01, half_dif23));
+  }
+}
+
+// 16x16 Hadamard transform writing tran_low_t coefficients. Each 8x8 quadrant
+// is transformed into its own 64-entry slice of coeff, then one more
+// butterfly stage combines the four slices in place.
+void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  // Output position p (in units of four coefficients) within a 16-wide row
+  // receives the result computed from input group kSrcGroup[p]; i.e. the
+  // second and third quarters of every row are swapped.
+  static const int kSrcGroup[4] = { 0, 2, 1, 3 };
+
+  // Quadrant order: top left, top right, bottom left, bottom right.
+  aom_hadamard_8x8_neon(src_diff, src_stride, coeff);
+  aom_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  aom_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, coeff + 128);
+  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+                        coeff + 192);
+
+  // Each iteration handles entire rows (16 samples from each slice) because
+  // the quarter swap is needed to match AVX2 output (i.e.,
+  // aom_hadamard_16x16_avx2). See the for loop at the end of
+  // aom_hadamard_16x16_c.
+  for (int row = 0; row < 4; ++row) {
+    tran_low_t *const base = coeff + 16 * row;
+    int32x4_t out[4][4];  // out[input group][slice]
+
+    // Compute every result for this row before storing anything.
+    for (int g = 0; g < 4; ++g) {
+      const int32x4_t v0 = vld1q_s32(base + 4 * g + 0);
+      const int32x4_t v1 = vld1q_s32(base + 4 * g + 64);
+      const int32x4_t v2 = vld1q_s32(base + 4 * g + 128);
+      const int32x4_t v3 = vld1q_s32(base + 4 * g + 192);
+
+      // Halving add/sub first, then plain add/sub.
+      const int32x4_t half_sum01 = vhaddq_s32(v0, v1);
+      const int32x4_t half_dif01 = vhsubq_s32(v0, v1);
+      const int32x4_t half_sum23 = vhaddq_s32(v2, v3);
+      const int32x4_t half_dif23 = vhsubq_s32(v2, v3);
+
+      out[g][0] = vaddq_s32(half_sum01, half_sum23);
+      out[g][1] = vaddq_s32(half_dif01, half_dif23);
+      out[g][2] = vsubq_s32(half_sum01, half_sum23);
+      out[g][3] = vsubq_s32(half_dif01, half_dif23);
+    }
+
+    for (int slice = 0; slice < 4; ++slice) {
+      for (int p = 0; p < 4; ++p) {
+        vst1q_s32(base + 64 * slice + 4 * p, out[kSrcGroup[p]][slice]);
+      }
+    }
+  }
+}
+
+// 32x32 Hadamard transform. Each 16x16 quadrant is transformed into its own
+// 256-entry slice of coeff, then one more butterfly stage combines the four
+// slices in place, four coefficients at a time.
+void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  // Quadrant order: top left, top right, bottom left, bottom right.
+  aom_hadamard_16x16_neon(src_diff, src_stride, coeff);
+  aom_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+  aom_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+                          coeff + 512);
+  aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+                          coeff + 768);
+
+  for (int idx = 0; idx < 256; idx += 4) {
+    tran_low_t *const q0 = coeff + idx;
+    tran_low_t *const q1 = q0 + 256;
+    tran_low_t *const q2 = q0 + 512;
+    tran_low_t *const q3 = q0 + 768;
+
+    const int32x4_t v0 = vld1q_s32(q0);
+    const int32x4_t v1 = vld1q_s32(q1);
+    const int32x4_t v2 = vld1q_s32(q2);
+    const int32x4_t v3 = vld1q_s32(q3);
+
+    // First stage is scaled down by 4 (arithmetic shift right by 2).
+    const int32x4_t sum01 = vshrq_n_s32(vaddq_s32(v0, v1), 2);
+    const int32x4_t dif01 = vshrq_n_s32(vsubq_s32(v0, v1), 2);
+    const int32x4_t sum23 = vshrq_n_s32(vaddq_s32(v2, v3), 2);
+    const int32x4_t dif23 = vshrq_n_s32(vsubq_s32(v2, v3), 2);
+
+    vst1q_s32(q0, vaddq_s32(sum01, sum23));
+    vst1q_s32(q1, vaddq_s32(dif01, dif23));
+    vst1q_s32(q2, vsubq_s32(sum01, sum23));
+    vst1q_s32(q3, vsubq_s32(dif01, dif23));
+  }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..47d5dae012
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+// Rounded average of a 4x4 block of 16-bit samples: the 16 values are summed
+// and the total is divided by 16 with a rounding bias of 8.
+uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(a);
+  uint16x4_t r0, r1, r2, r3;
+
+  load_u16_4x4(src, a_stride, &r0, &r1, &r2, &r3);
+
+  // Column-wise sums in 16-bit lanes, reduced across lanes afterwards.
+  const uint16x4_t col_sum = vadd_u16(vadd_u16(vadd_u16(r0, r1), r2), r3);
+
+  return (horizontal_add_u16x4(col_sum) + (1 << 3)) >> 4;
+}
+
+// Rounded average of an 8x8 block of 16-bit samples: the 64 values are summed
+// and the total is divided by 64 with a rounding bias of 32.
+uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(a);
+  uint16x8_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+  load_u16_8x8(src, a_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  // Column-wise sums in 16-bit lanes, accumulated row by row.
+  uint16x8_t col_sum = vaddq_u16(r0, r1);
+  col_sum = vaddq_u16(col_sum, r2);
+  col_sum = vaddq_u16(col_sum, r3);
+  col_sum = vaddq_u16(col_sum, r4);
+  col_sum = vaddq_u16(col_sum, r5);
+  col_sum = vaddq_u16(col_sum, r6);
+  col_sum = vaddq_u16(col_sum, r7);
+
+  return (horizontal_add_u16x8(col_sum) + (1 << 5)) >> 6;
+}
+
+// Computes the smallest and largest absolute difference between two 8x8
+// blocks of 16-bit samples and writes them to *min and *max.
+//   s8, d8: block pointers, converted with CONVERT_TO_SHORTPTR.
+//   p, dp:  row strides of s8 and d8, in 16-bit elements.
+void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+ // Eight rows of eight samples from the first block.
+ const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+ const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+ const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+ const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+ const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+ const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+ const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+ const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+ // Eight rows of eight samples from the second block.
+ const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+ const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+ const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+ const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+ const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+ const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+ const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+ const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+ // Per-sample absolute differences, row by row.
+ const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+ const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+ const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+ const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+ const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+ const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+ const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+ const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+ // Tree reduction across rows: max07 holds the per-column maximum.
+ const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+ const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+ const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+ // Same tree reduction for the per-column minimum.
+ const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t min0123 = vminq_u16(min01, min23);
+ const uint16x8_t min4567 = vminq_u16(min45, min67);
+ const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if AOM_ARCH_AARCH64
+ // Across-vector reductions give the final scalar results directly.
+ *max = (int)vmaxvq_u16(max07);
+ *min = (int)vminvq_u16(min07);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+ uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ // NOTE(review): with four lanes, two pairwise passes look sufficient; the
+ // third pass appears redundant but harmless - confirm before removing.
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ // NOTE(review): this writes only the first two bytes of each int, which is
+ // the low-order half only on a little-endian target - presumably the only
+ // configuration this path is built for; confirm.
+ vst1_lane_u16((uint16_t *)max, ab_max, 0);
+ vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..531309b025
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+// Rounded average of two high-bitdepth predictions: every output sample is
+// the rounding halving add (vrhadd) of the matching pred and ref samples.
+//
+// comp_pred8, pred8 and ref8 are converted to 16-bit sample pointers with
+// CONVERT_TO_SHORTPTR. comp_pred and pred advance by `width` samples per row;
+// ref advances by `ref_stride` samples per row.
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                   int width, int height, const uint8_t *ref8,
+                                   int ref_stride) {
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  int rows_left = height;
+
+  if (width < 8) {
+    // Narrow blocks: one 64-bit vector (4 samples) per row.
+    assert(width == 4);
+    do {
+      const uint16x4_t avg4 =
+          vrhadd_u16(vld1_u16(pred_row), vld1_u16(ref_row));
+      vst1_u16(out_row, avg4);
+
+      out_row += width;
+      pred_row += width;
+      ref_row += ref_stride;
+    } while (--rows_left != 0);
+    return;
+  }
+
+  if (width == 8) {
+    // Exactly one 128-bit vector per row, no column loop needed.
+    do {
+      const uint16x8_t avg8 =
+          vrhaddq_u16(vld1q_u16(pred_row), vld1q_u16(ref_row));
+      vst1q_u16(out_row, avg8);
+
+      out_row += width;
+      pred_row += width;
+      ref_row += ref_stride;
+    } while (--rows_left != 0);
+    return;
+  }
+
+  // width > 8: walk each row in steps of 8 samples.
+  do {
+    for (int col = 0; col < width; col += 8) {
+      const uint16x8_t pred_vec = vld1q_u16(pred_row + col);
+      const uint16x8_t ref_vec = vld1q_u16(ref_row + col);
+
+      vst1q_u16(out_row + col, vrhaddq_u16(pred_vec, ref_vec));
+    }
+
+    out_row += width;
+    pred_row += width;
+    ref_row += ref_stride;
+  } while (--rows_left != 0);
+}
+
+// Masked blend of two high-bitdepth predictions into comp_pred.
+//
+// `invert_mask` selects which input is passed as the first blend operand:
+// ref when it is zero, pred otherwise. pred and comp_pred advance by `width`
+// samples per row, ref by `ref_stride`, and mask by `mask_stride` bytes.
+// NOTE(review): alpha_blend_a64_u16x8()/u16x4() come from blend_neon.h;
+// presumably the mask weights the first operand and its complement the
+// second -- confirm against that header.
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  const uint16_t *const pred = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  const uint16_t *first_row;
+  const uint16_t *second_row;
+  int first_stride;
+  int second_stride;
+  if (invert_mask) {
+    first_row = pred;
+    first_stride = width;
+    second_row = ref;
+    second_stride = ref_stride;
+  } else {
+    first_row = ref;
+    first_stride = ref_stride;
+    second_row = pred;
+    second_stride = width;
+  }
+
+  if (width < 8) {
+    assert(width == 4);
+
+    do {
+      const uint16x4_t first_vec = vld1_u16(first_row);
+      const uint16x4_t second_vec = vld1_u16(second_row);
+      // Load 4 mask bytes and widen them to 16 bits.
+      const uint16x8_t mask_wide = vmovl_u8(load_unaligned_u8_4x1(mask));
+      const uint16x4_t mask_vec = vget_low_u16(mask_wide);
+
+      vst1_u16(out_row,
+               alpha_blend_a64_u16x4(mask_vec, first_vec, second_vec));
+
+      first_row += first_stride;
+      second_row += second_stride;
+      mask += mask_stride;
+      out_row += 4;
+    } while (--height != 0);
+    return;
+  }
+
+  // width >= 8: process each row 8 samples at a time.
+  do {
+    for (int col = 0; col < width; col += 8) {
+      const uint16x8_t first_vec = vld1q_u16(first_row + col);
+      const uint16x8_t second_vec = vld1q_u16(second_row + col);
+      const uint16x8_t mask_vec = vmovl_u8(vld1_u8(mask + col));
+
+      vst1q_u16(out_row + col,
+                alpha_blend_a64_u16x8(mask_vec, first_vec, second_vec));
+    }
+
+    first_row += first_stride;
+    second_row += second_stride;
+    mask += mask_stride;
+    out_row += width;
+  } while (--height != 0);
+}
+
+// Distance-weighted compound average of two high-bitdepth predictions.
+//
+// jcp_param->fwd_offset and jcp_param->bck_offset are broadcast to vectors
+// and handed, together with the ref and pred samples (in that order), to
+// dist_wtd_avg_u16x8()/dist_wtd_avg_u16x4(). comp_pred and pred advance by
+// `width` samples per row, ref by `ref_stride`.
+// NOTE(review): the exact weighting formula lives in dist_wtd_avg_neon.h.
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t fwd_weights = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck_weights = vdupq_n_u16(jcp_param->bck_offset);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  if (width < 8) {
+    assert(width == 4);
+    do {
+      const uint16x4_t pred_vec = vld1_u16(pred_row);
+      const uint16x4_t ref_vec = vld1_u16(ref_row);
+
+      // The 4-lane helper takes 64-bit weight vectors.
+      const uint16x4_t weighted =
+          dist_wtd_avg_u16x4(ref_vec, pred_vec, vget_low_u16(fwd_weights),
+                             vget_low_u16(bck_weights));
+      vst1_u16(out_row, weighted);
+
+      out_row += width;
+      pred_row += width;
+      ref_row += ref_stride;
+    } while (--height != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      const uint16x8_t pred_vec = vld1q_u16(pred_row);
+      const uint16x8_t ref_vec = vld1q_u16(ref_row);
+
+      const uint16x8_t weighted =
+          dist_wtd_avg_u16x8(ref_vec, pred_vec, fwd_weights, bck_weights);
+      vst1q_u16(out_row, weighted);
+
+      out_row += width;
+      pred_row += width;
+      ref_row += ref_stride;
+    } while (--height != 0);
+    return;
+  }
+
+  // width > 8: step across each row 8 samples at a time.
+  do {
+    for (int col = 0; col < width; col += 8) {
+      const uint16x8_t pred_vec = vld1q_u16(pred_row + col);
+      const uint16x8_t ref_vec = vld1q_u16(ref_row + col);
+
+      const uint16x8_t weighted =
+          dist_wtd_avg_u16x8(ref_vec, pred_vec, fwd_weights, bck_weights);
+      vst1q_u16(out_row + col, weighted);
+    }
+
+    out_row += width;
+    pred_row += width;
+    ref_row += ref_stride;
+  } while (--height != 0);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..8b03e91ac3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+// Blend two high-bitdepth blocks using a horizontal mask: the same `w` mask
+// bytes are applied to every row (the mask pointer is never advanced).
+//
+// dst_8/src0_8/src1_8 are converted to 16-bit sample pointers with
+// CONVERT_TO_SHORTPTR; strides are in samples. `bd` is only validated by an
+// assert here and forwarded to the C fallback.
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (w >= 8) {
+    // One row per iteration, 8 samples per column step.
+    do {
+      for (int col = 0; col < w; col += 8) {
+        const uint16x8_t mask_vec = vmovl_u8(vld1_u8(mask + col));
+        const uint16x8_t a = vld1q_u16(src0 + col);
+        const uint16x8_t b = vld1q_u16(src1 + col);
+
+        vst1q_u16(dst + col, alpha_blend_a64_u16x8(mask_vec, a, b));
+      }
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  if (w == 4) {
+    // Two 4-sample rows share one 128-bit vector; the mask is loaded once.
+    const uint16x8_t mask_vec = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+    do {
+      const uint16x8_t a = load_unaligned_u16_4x2(src0, src0_stride);
+      const uint16x8_t b = load_unaligned_u16_4x2(src1, src1_stride);
+
+      store_u16x4_strided_x2(dst, dst_stride,
+                             alpha_blend_a64_u16x8(mask_vec, a, b));
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+    return;
+  }
+
+  if (w == 2 && h >= 8) {
+    // Two 2-sample rows share one 64-bit vector; the mask is loaded once.
+    const uint16x8_t mask_wide = vmovl_u8(load_unaligned_dup_u8_2x4(mask));
+    const uint16x4_t mask_vec = vget_low_u16(mask_wide);
+    do {
+      const uint16x4_t a = load_unaligned_u16_2x2(src0, src0_stride);
+      const uint16x4_t b = load_unaligned_u16_2x2(src1, src1_stride);
+
+      store_u16x2_strided_x2(dst, dst_stride,
+                             alpha_blend_a64_u16x4(mask_vec, a, b));
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+    return;
+  }
+
+  // Every other size is delegated to the C implementation.
+  aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                               src1_stride, mask, w, h, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000000..90b44fcc5e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) /* Generates alpha_<bd>_blend_a64_d16_u16x8() and highbd_<bd>_blend_a64_d16_mask_neon() for one bit depth. */ \
+ static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( /* Blends 8 lanes: m weights a, (AOM_BLEND_A64_MAX_ALPHA - m) weights b. */ \
+ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \
+ const uint16x8_t m_inv = \
+ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \
+ \
+ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), /* Accumulators start at round_offset, then add m * a. */ \
+ vget_low_u16(m), vget_low_u16(a)); \
+ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_high_u16(m), vget_high_u16(a)); \
+ \
+ blend_u32_lo = /* Add m_inv * b to the same accumulators. */ \
+ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \
+ blend_u32_hi = \
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \
+ \
+ uint16x4_t blend_u16_lo = /* Rounding right shift with saturating narrow to unsigned 16 bits. */ \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ uint16x4_t blend_u16_hi = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ \
+ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \
+ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); /* Clamp to the largest bd-bit value. */ \
+ \
+ return blend_u16; \
+ } \
+ \
+ static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( /* Row/column driver; subw/subh select how mask bytes are loaded and combined. */ \
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \
+ int subh) { \
+ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \
+ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \
+ int32x4_t offset = /* Negated round_offset scaled by 2^AOM_BLEND_A64_ROUND_BITS; seeds the blend accumulators. */ \
+ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \
+ \
+ if ((subw | subh) == 0) { /* Neither flag set: one mask byte per output sample. */ \
+ if (w >= 8) { /* 8 output samples per inner iteration, one row per outer iteration. */ \
+ do { \
+ int i = 0; \
+ do { \
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { /* w < 8: 4-wide loads, two rows per iteration, so h must be even. */ \
+ do { \
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if ((subw & subh) == 1) { /* Both flags set: two mask rows and 16 mask bytes per row feed each 8 outputs. */ \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( /* Helper from blend_neon.h; presumably averages each 2x2 group of mask bytes -- confirm. */ \
+ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \
+ vget_high_u8(m1))); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { /* w < 8: four mask rows feed two 4-wide output rows per iteration. */ \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = \
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if (subw == 1 && subh == 0) { /* 16 bytes from a single mask row feed each 8 outputs. */ \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 2 * i); \
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { /* w < 8: one 8-byte mask row per 4-wide output row, two rows per iteration. */ \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else { /* Remaining case (subw == 0, subh == 1 if both flags are 0/1): two mask rows per output row. */ \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { /* w < 8: mask rows 0/2 and 1/3 are paired so two output rows are blended at once. */ \
+ do { \
+ uint8x8_t m0_2 = \
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+ uint8x8_t m1_3 = \
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } \
+ }
+
+// 12-bit depth: defines highbd_12_blend_a64_d16_mask_neon(), instantiated
+// with round0_bits = ROUND0_BITS + 2.
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10-bit depth: defines highbd_10_blend_a64_d16_mask_neon(), instantiated
+// with round0_bits = ROUND0_BITS.
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8-bit depth: defines highbd_8_blend_a64_d16_mask_neon(), instantiated
+// with round0_bits = ROUND0_BITS.
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
+
+// Public entry point for the high-bitdepth d16 mask blend: validates the
+// block dimensions and forwards to the worker generated for the requested
+// bit depth. Any `bd` other than 12 or 10 is served by the 8-bit worker.
+// `conv_params` is unused.
+void aom_highbd_blend_a64_d16_mask_neon(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  (void)conv_params;
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  switch (bd) {
+    case 12:
+      highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride,
+                                        src1, src1_stride, mask, mask_stride,
+                                        w, h, subw, subh);
+      break;
+    case 10:
+      highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride,
+                                        src1, src1_stride, mask, mask_stride,
+                                        w, h, subw, subh);
+      break;
+    default:
+      highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride,
+                                       src1, src1_stride, mask, mask_stride,
+                                       w, h, subw, subh);
+      break;
+  }
+}
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,  // Blends src0/src1 into dst under an 8-bit mask; subw/subh select how mask bytes are loaded.
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ (void)bd;  // bd is only read by the assert below.
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // In-place blending requires matching strides.
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if ((subw | subh) == 0) {  // Neither flag set: one mask byte per output sample.
+ if (w >= 8) {  // 8 output samples per inner iteration, one row per outer iteration.
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // w < 8: 4-wide loads, two rows per iteration, so h must be even.
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {  // Both flags set: two mask rows and 16 mask bytes per row feed each 8 outputs.
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));  // Helper from blend_neon.h; presumably averages each 2x2 group of mask bytes -- confirm.
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // w < 8: four mask rows feed two 4-wide output rows per iteration.
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {  // 16 bytes from a single mask row feed each 8 outputs.
+ if (w >= 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // w < 8: one 8-byte mask row per 4-wide output row, two rows per iteration.
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {  // Remaining case (subw == 0, subh == 1 if both flags are 0/1): two mask rows per output row.
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {  // w < 8: mask rows 0/2 and 1/3 are paired so two output rows are blended at once.
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000000..1292e20342
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+// Blend two high-bitdepth blocks using a vertical mask: one mask byte per
+// row, applied to every sample in that row.
+//
+// dst_8/src0_8/src1_8 are converted to 16-bit sample pointers with
+// CONVERT_TO_SHORTPTR; strides are in samples. `bd` is only validated by an
+// assert here and forwarded to the C fallback.
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+                                     const uint8_t *src0_8,
+                                     uint32_t src0_stride,
+                                     const uint8_t *src1_8,
+                                     uint32_t src1_stride, const uint8_t *mask,
+                                     int w, int h, int bd) {
+  (void)bd;
+
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (w >= 8) {
+    do {
+      // Broadcast this row's mask byte, widened to 16 bits, to all 8 lanes.
+      const uint16x8_t row_mask = vmovl_u8(vdup_n_u8(mask[0]));
+      for (int col = 0; col < w; col += 8) {
+        const uint16x8_t a = vld1q_u16(src0 + col);
+        const uint16x8_t b = vld1q_u16(src1 + col);
+
+        vst1q_u16(dst + col, alpha_blend_a64_u16x8(row_mask, a, b));
+      }
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  if (w == 4) {
+    // Two rows per iteration: the low half carries mask[0], the high half
+    // carries mask[1].
+    do {
+      const uint16x4_t upper_row_mask = vdup_n_u16((uint16_t)mask[0]);
+      const uint16x4_t lower_row_mask = vdup_n_u16((uint16_t)mask[1]);
+      const uint16x8_t pair_mask =
+          vcombine_u16(upper_row_mask, lower_row_mask);
+      const uint16x8_t a = load_unaligned_u16_4x2(src0, src0_stride);
+      const uint16x8_t b = load_unaligned_u16_4x2(src1, src1_stride);
+
+      store_u16x4_strided_x2(dst, dst_stride,
+                             alpha_blend_a64_u16x8(pair_mask, a, b));
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+    return;
+  }
+
+  if (w == 2 && h >= 8) {
+    do {
+      // Load two mask bytes into lane 0, interleave the vector with itself so
+      // each byte appears twice, then widen: lanes become {m0, m0, m1, m1}.
+      uint16x4_t mask_pair = vdup_n_u16(0);
+      mask_pair = vld1_lane_u16((uint16_t *)mask, mask_pair, 0);
+      const uint8x8_t mask_bytes = vreinterpret_u8_u16(mask_pair);
+      const uint8x8_t doubled = vzip_u8(mask_bytes, mask_bytes).val[0];
+      const uint16x4_t pair_mask = vget_low_u16(vmovl_u8(doubled));
+
+      const uint16x4_t a = load_unaligned_u16_2x2(src0, src0_stride);
+      const uint16x4_t b = load_unaligned_u16_2x2(src1, src1_stride);
+
+      store_u16x2_strided_x2(dst, dst_stride,
+                             alpha_blend_a64_u16x4(pair_mask, a, b));
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+    return;
+  }
+
+  // Every other size is delegated to the C implementation.
+  aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                               src1_stride, mask, w, h, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000000..e25438c9b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+// Applies an 8-tap filter to four lanes at once. The result is the widened
+// 32-bit sum of s[k] * y_filter[k] for k = 0..7; no rounding or narrowing is
+// performed here.
+static INLINE int32x4_t highbd_convolve8_4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  // Widening multiply for tap 0, then widening multiply-accumulate for the
+  // remaining taps in ascending order.
+  int32x4_t acc = vmull_lane_s16(s0, taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
+  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
+
+  return acc;
+}
+
+// 4-lane 8-tap filter followed by a saturating, rounding right shift by
+// FILTER_BITS that narrows the result to unsigned 16-bit lanes.
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+  const int32x4_t acc =
+      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+  const uint16x4_t narrowed = vqrshrun_n_s32(acc, FILTER_BITS);
+
+  return narrowed;
+}
+
+// Horizontal 8-tap filter producing four outputs. s0 and s1 are treated as
+// one contiguous 16-element span; output lane i is the filtered sum of span
+// elements i .. i + 7.
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  // 8-lane windows of the span starting at element offsets 1, 2 and 3.
+  const int16x8_t win1 = vextq_s16(s0, s1, 1);
+  const int16x8_t win2 = vextq_s16(s0, s1, 2);
+  const int16x8_t win3 = vextq_s16(s0, s1, 3);
+
+  // The low halves of the windows begin at offsets 0..3 (taps 0..3) and the
+  // high halves begin at offsets 4..7 (taps 4..7).
+  return highbd_convolve8_4_s32(
+      vget_low_s16(s0), vget_low_s16(win1), vget_low_s16(win2),
+      vget_low_s16(win3), vget_high_s16(s0), vget_high_s16(win1),
+      vget_high_s16(win2), vget_high_s16(win3), x_filter_0_7);
+}
+
+// Four-output horizontal 8-tap filter, then a saturating, rounding right
+// shift by FILTER_BITS narrowing to unsigned 16-bit lanes.
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  const int32x4_t acc = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+  const uint16x4_t narrowed = vqrshrun_n_s32(acc, FILTER_BITS);
+
+  return narrowed;
+}
+
+// Applies an 8-tap filter to eight lanes. The widened 32-bit sums for lanes
+// 0-3 are written to *sum0 and those for lanes 4-7 to *sum1; no rounding or
+// narrowing is performed here.
+static INLINE void highbd_convolve8_8_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    int32x4_t *sum0, int32x4_t *sum1) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  // Lanes 0-3 of every input row.
+  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(s0), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
+  *sum0 = acc_lo;
+
+  // Lanes 4-7 of every input row.
+  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(s0), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);
+  *sum1 = acc_hi;
+}
+
+// Horizontal 8-tap filter producing eight outputs. s0 and s0_hi are treated
+// as one contiguous 16-element span; the eight 8-lane windows starting at
+// element offsets 0..7 are built with vextq_s16 and handed to
+// highbd_convolve8_8_s32, which writes the unrounded 32-bit sums for lanes
+// 0-3 to *sum0 and for lanes 4-7 to *sum1.
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+ const int16x8_t s0_hi,
+ const int16x8_t x_filter_0_7,
+ int32x4_t *sum0,
+ int32x4_t *sum1) {
+ // sN holds span elements N .. N + 7, i.e. the samples multiplied by tap N.
+ const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+ const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+ const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+ const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+ const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+ const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+ const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+ sum1);
+}
+
+// Eight-output horizontal 8-tap filter, then a saturating, rounding right
+// shift by FILTER_BITS narrowing both halves to unsigned 16-bit lanes.
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+  int32x4_t acc_lo, acc_hi;
+  highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &acc_lo, &acc_hi);
+
+  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, FILTER_BITS);
+  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, FILTER_BITS);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// 8-lane 8-tap filter followed by a saturating, rounding right shift by
+// FILTER_BITS narrowing both halves to unsigned 16-bit lanes.
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+  int32x4_t acc_lo, acc_hi;
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &acc_lo,
+                         &acc_hi);
+
+  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, FILTER_BITS);
+  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, FILTER_BITS);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Horizontal 8-tap convolution of a w x h block of 16-bit pixels.
+//
+// src_ptr must already point at the first sample covered by tap 0 of the
+// first output pixel. x_filter_ptr points at the eight filter taps. Each
+// result is rounded by FILTER_BITS, saturated to unsigned 16 bits and then
+// clamped to (1 << bd) - 1.
+//
+// Only the unscaled case is implemented: the kernel always advances exactly
+// one source pixel per output pixel, so x_step_q4 must be 16 (the value the
+// public wrapper checks for before calling this function).
+//
+// The w == 4 path handles two rows per iteration; the wider path handles
+// 4 rows x 8 columns per inner iteration.
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+                                       ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                       ptrdiff_t dst_stride,
+                                       const int16_t *x_filter_ptr,
+                                       int x_step_q4, int w, int h, int bd) {
+  assert(w >= 4 && h >= 4);
+  // Make the unit-step requirement explicit instead of carrying a dead
+  // position accumulator that never influenced the source address.
+  assert(x_step_q4 == 16);
+  (void)x_step_q4;
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+  if (w == 4) {
+    const int16_t *s = (const int16_t *)src_ptr;
+    uint16_t *d = dst_ptr;
+
+    do {
+      // s0/s1 form the 16-element span of the first row, s2/s3 of the second.
+      int16x8_t s0, s1, s2, s3;
+      load_s16_8x2(s, src_stride, &s0, &s2);
+      load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+      uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+      uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+      // Clamp both rows to the bit-depth maximum with a single vector min.
+      uint16x8_t d01 = vcombine_u16(d0, d1);
+      d01 = vminq_u16(d01, max);
+
+      vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+      vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+      s += 2 * src_stride;
+      d += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    int height = h;
+
+    do {
+      int width = w;
+      const int16_t *s = (const int16_t *)src_ptr;
+      uint16_t *d = dst_ptr;
+
+      // Prime the sliding window with the first 8 samples of each of 4 rows.
+      int16x8_t s0, s2, s4, s6;
+      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
+      s += 8;
+
+      do {
+        // Next 8 samples of each row; together with the previous 8 they give
+        // the 16-element span needed for 8 outputs.
+        int16x8_t s1, s3, s5, s7;
+        load_s16_8x4(s, src_stride, &s1, &s3, &s5, &s7);
+
+        uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+        uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+        uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+        uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+        d0 = vminq_u16(d0, max);
+        d1 = vminq_u16(d1, max);
+        d2 = vminq_u16(d2, max);
+        d3 = vminq_u16(d3, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Reuse the just-loaded vectors as the leading half of the next span.
+        s0 = s1;
+        s2 = s3;
+        s4 = s5;
+        s6 = s7;
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      height -= 4;
+    } while (height > 0);
+  }
+}
+
+// Public entry point for the high-bitdepth horizontal 8-tap convolution.
+// Any step other than 16 is delegated to the C implementation; otherwise the
+// NEON kernel is used and the vertical filter arguments are ignored.
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+    return;
+  }
+
+  (void)filter_y;
+  (void)y_step_q4;
+
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+
+  // Back up so the kernel starts at the sample covered by the first tap.
+  src16 -= SUBPEL_TAPS / 2 - 1;
+  highbd_convolve_horiz_neon(src16, src_stride, dst16, dst_stride, filter_x,
+                             x_step_q4, w, h, bd);
+}
+
+// Vertical 8-tap convolution of a w x h block of 16-bit pixels.
+//
+// src_ptr must already point at the row covered by tap 0 of the first output
+// row. y_filter_ptr points at the eight filter taps. Each result is rounded
+// by FILTER_BITS, saturated to unsigned 16 bits and clamped to
+// (1 << bd) - 1.
+//
+// Both paths keep a sliding window of seven source rows (s0..s6), load four
+// new rows (s7..s10) per iteration and emit four output rows. The w == 4
+// path works on 4-lane vectors; the wider path walks the block in 8-column
+// strips.
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *y_filter_ptr, int w, int h,
+ int bd) {
+ assert(w >= 4 && h >= 4);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first seven rows.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ // Output row k uses source rows k .. k + 7.
+ uint16x4_t d0 =
+ highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ // Pair rows up so the bit-depth clamp needs only two vector mins.
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ uint16x8_t d23 = vcombine_u16(d2, d3);
+
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+ vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+ vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+ // Slide the seven-row window down by four rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ // Outer loop: one 8-column strip per iteration.
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first seven rows of this strip.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ // Output row k uses source rows k .. k + 7.
+ uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+ s7, y_filter);
+ uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+ s8, y_filter);
+ uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+ s9, y_filter);
+ uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+ s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the seven-row window down by four rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+// Public entry point for the high-bitdepth vertical 8-tap convolution.
+// Any step other than 16 is delegated to the C implementation; otherwise the
+// NEON kernel is used and the horizontal filter arguments are ignored.
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+                                    uint8_t *dst8, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+                                x_step_q4, filter_y, y_step_q4, w, h, bd);
+    return;
+  }
+
+  (void)filter_x;
+  (void)x_step_q4;
+
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+
+  // Back up so the kernel starts at the row covered by the first tap.
+  src16 -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+  highbd_convolve_vert_neon(src16, src_stride, dst16, dst_stride, filter_y, w,
+                            h, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..d28617c67e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+// In-place 8-point butterfly network over the eight vectors a0..a7, applied
+// independently to each of the 8 lanes. All three add/sub stages stay in
+// 16-bit lanes (non-saturating vaddq_s16 / vsubq_s16), so the caller is
+// responsible for the inputs being small enough not to overflow.
+// Note that the results are written back in a permuted order
+// (a0, a2, a7, a6, a3, a1, a4, a5), not in natural index order.
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ // Stage 1: butterflies on adjacent pairs.
+ int16x8_t b0 = vaddq_s16(*a0, *a1);
+ int16x8_t b1 = vsubq_s16(*a0, *a1);
+ int16x8_t b2 = vaddq_s16(*a2, *a3);
+ int16x8_t b3 = vsubq_s16(*a2, *a3);
+ int16x8_t b4 = vaddq_s16(*a4, *a5);
+ int16x8_t b5 = vsubq_s16(*a4, *a5);
+ int16x8_t b6 = vaddq_s16(*a6, *a7);
+ int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ // Stage 2: butterflies at distance two.
+ int16x8_t c0 = vaddq_s16(b0, b2);
+ int16x8_t c2 = vsubq_s16(b0, b2);
+ int16x8_t c1 = vaddq_s16(b1, b3);
+ int16x8_t c3 = vsubq_s16(b1, b3);
+ int16x8_t c4 = vaddq_s16(b4, b6);
+ int16x8_t c6 = vsubq_s16(b4, b6);
+ int16x8_t c5 = vaddq_s16(b5, b7);
+ int16x8_t c7 = vsubq_s16(b5, b7);
+
+ // Stage 3: butterflies at distance four, stored in permuted output order.
+ *a0 = vaddq_s16(c0, c4);
+ *a2 = vsubq_s16(c0, c4);
+ *a7 = vaddq_s16(c1, c5);
+ *a6 = vsubq_s16(c1, c5);
+ *a3 = vaddq_s16(c2, c6);
+ *a1 = vsubq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+}
+
+// Second 8-point butterfly pass over four lanes. The first stage widens the
+// 16-bit inputs to 32 bits (vaddl_s16 / vsubl_s16); the remaining stages run
+// in 32-bit lanes. The eight 4-lane results d0..d7 are stored contiguously
+// at coeff[0..31] (d0 at coeff + 0, d1 at coeff + 4, ... d7 at coeff + 28).
+// The intermediate-to-output mapping uses the same permutation as the first
+// pass (d0, d2, d7, d6, d3, d1, d4, d5).
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+ int16x4_t a2, int16x4_t a3,
+ int16x4_t a4, int16x4_t a5,
+ int16x4_t a6, int16x4_t a7,
+ tran_low_t *coeff) {
+ // Stage 1: widening butterflies on adjacent pairs.
+ int32x4_t b0 = vaddl_s16(a0, a1);
+ int32x4_t b1 = vsubl_s16(a0, a1);
+ int32x4_t b2 = vaddl_s16(a2, a3);
+ int32x4_t b3 = vsubl_s16(a2, a3);
+ int32x4_t b4 = vaddl_s16(a4, a5);
+ int32x4_t b5 = vsubl_s16(a4, a5);
+ int32x4_t b6 = vaddl_s16(a6, a7);
+ int32x4_t b7 = vsubl_s16(a6, a7);
+
+ // Stage 2: butterflies at distance two.
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+ int32x4_t c4 = vaddq_s32(b4, b6);
+ int32x4_t c6 = vsubq_s32(b4, b6);
+ int32x4_t c5 = vaddq_s32(b5, b7);
+ int32x4_t c7 = vsubq_s32(b5, b7);
+
+ // Stage 3: butterflies at distance four, named in permuted output order.
+ int32x4_t d0 = vaddq_s32(c0, c4);
+ int32x4_t d2 = vsubq_s32(c0, c4);
+ int32x4_t d7 = vaddq_s32(c1, c5);
+ int32x4_t d6 = vsubq_s32(c1, c5);
+ int32x4_t d3 = vaddq_s32(c2, c6);
+ int32x4_t d1 = vsubq_s32(c2, c6);
+ int32x4_t d4 = vaddq_s32(c3, c7);
+ int32x4_t d5 = vsubq_s32(c3, c7);
+
+ // NOTE(review): vst1q_s32 requires tran_low_t to be a 32-bit signed
+ // integer type here - confirm against its definition.
+ vst1q_s32(coeff + 0, d0);
+ vst1q_s32(coeff + 4, d1);
+ vst1q_s32(coeff + 8, d2);
+ vst1q_s32(coeff + 12, d3);
+ vst1q_s32(coeff + 16, d4);
+ vst1q_s32(coeff + 20, d5);
+ vst1q_s32(coeff + 24, d6);
+ vst1q_s32(coeff + 28, d7);
+}
+
+// 8x8 Hadamard transform of the residual block at src_diff (row pitch
+// src_stride, in elements). Writes 64 coefficients to coeff.
+void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  int16x8_t r0 = vld1q_s16(src_diff + 0 * src_stride);
+  int16x8_t r1 = vld1q_s16(src_diff + 1 * src_stride);
+  int16x8_t r2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t r3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t r4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t r5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t r6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t r7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  // The first pass fits in 16-bit elements (4095*8 = 32760).
+  hadamard_highbd_col8_first_pass(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  transpose_elems_inplace_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  // The second pass has to widen to 32-bit elements, so it is run on four
+  // columns at a time: low halves first, then high halves. No second
+  // transpose is performed because it is not required.
+  hadamard_highbd_col4_second_pass(
+      vget_low_s16(r0), vget_low_s16(r1), vget_low_s16(r2), vget_low_s16(r3),
+      vget_low_s16(r4), vget_low_s16(r5), vget_low_s16(r6), vget_low_s16(r7),
+      coeff);
+
+  hadamard_highbd_col4_second_pass(
+      vget_high_s16(r0), vget_high_s16(r1), vget_high_s16(r2),
+      vget_high_s16(r3), vget_high_s16(r4), vget_high_s16(r5),
+      vget_high_s16(r6), vget_high_s16(r7), coeff + 32);
+}
+
+// 16x16 Hadamard transform built from four 8x8 transforms followed by one
+// combining butterfly stage. Writes 256 coefficients to coeff; the four 8x8
+// results occupy coeff[0..63], [64..127], [128..191] and [192..255] before
+// being combined in place.
+void aom_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 16x16 to 8x32 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+ // Bottom left.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+ coeff + 128);
+ // Bottom right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+ coeff + 192);
+
+ // Combine the four sub-blocks, four coefficients per iteration
+ // (16 iterations x 4 lanes = 64 positions per sub-block).
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 64);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 128);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 192);
+
+ // Halving add/sub: (a +/- b) >> 1 computed without intermediate overflow.
+ int32x4_t b0 = vhaddq_s32(a0, a1);
+ int32x4_t b1 = vhsubq_s32(a0, a1);
+ int32x4_t b2 = vhaddq_s32(a2, a3);
+ int32x4_t b3 = vhsubq_s32(a2, a3);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 64, c1);
+ vst1q_s32(coeff + 4 * i + 128, c2);
+ vst1q_s32(coeff + 4 * i + 192, c3);
+ }
+}
+
+// 32x32 Hadamard transform built from four 16x16 transforms followed by one
+// combining butterfly stage. Writes 1024 coefficients to coeff; the four
+// 16x16 results occupy coeff[0..255], [256..511], [512..767] and
+// [768..1023] before being combined in place.
+void aom_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 32x32 to 16x64 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+ // Bottom left.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+ coeff + 512);
+ // Bottom right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+ coeff + 768);
+
+ // Combine the four sub-blocks, four coefficients per iteration
+ // (64 iterations x 4 lanes = 256 positions per sub-block).
+ for (int i = 0; i < 64; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 256);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 512);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 768);
+
+ // Unlike the 16x16 stage, this stage scales by an arithmetic shift of 2
+ // applied after a full-width add/sub.
+ int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+ int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+ int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+ int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 256, c1);
+ vst1q_s32(coeff + 4 * i + 512, c2);
+ vst1q_s32(coeff + 4 * i + 768, c3);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..dc47974c68
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,2730 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// DC
+
+// Writes the 4-lane vector dc to the first four pixels of each of h rows.
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x4_t dc) {
+  for (int y = 0; y < h; ++y) {
+    uint16_t *const row = dst + y * stride;
+    vst1_u16(row, dc);
+  }
+}
+
+// Writes the 8-lane vector dc to the first eight pixels of each of h rows.
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x8_t dc) {
+  for (int y = 0; y < h; ++y) {
+    uint16_t *const row = dst + y * stride;
+    vst1q_u16(row, dc);
+  }
+}
+
+// Fills a 16-pixel-wide, h-row block with dc (two 8-lane stores per row).
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int y = 0; y < h; ++y) {
+    uint16_t *const row = dst + y * stride;
+    for (int x = 0; x < 16; x += 8) {
+      vst1q_u16(row + x, dc);
+    }
+  }
+}
+
+// Fills a 32-pixel-wide, h-row block with dc (four 8-lane stores per row).
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int y = 0; y < h; ++y) {
+    uint16_t *const row = dst + y * stride;
+    for (int x = 0; x < 32; x += 8) {
+      vst1q_u16(row + x, dc);
+    }
+  }
+}
+
+// Fills a 64-pixel-wide, h-row block with dc (eight 8-lane stores per row).
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int y = 0; y < h; ++y) {
+    uint16_t *const row = dst + y * stride;
+    for (int x = 0; x < 64; x += 8) {
+      vst1q_u16(row + x, dc);
+    }
+  }
+}
+
+// Sums the eight 16-bit lanes of a into a 32-bit total and returns that
+// total replicated in all four 32-bit lanes.
+static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
+ // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so
+ // promote first.
+ const uint32x4_t b = vpaddlq_u16(a);
+#if AOM_ARCH_AARCH64
+ // Two rounds of pairwise add on the duplicated vector leave the full sum
+ // in every lane.
+ const uint32x4_t c = vpaddq_u32(b, b);
+ return vpaddq_u32(c, c);
+#else
+ // No 128-bit pairwise add is used on this path: fold high onto low, finish
+ // with a 64-bit pairwise add, then duplicate the result into both halves.
+ const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
+ const uint32x2_t d = vpadd_u32(c, c);
+ return vcombine_u32(d, d);
+#endif
+}
+
+// Loads four edge pixels into the low half of an 8-lane vector with the high
+// half zeroed, so 4-wide edges can share the 8-lane accumulation path.
+static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
+  // No summing is needed for a single 4-lane load. Widening it costs nothing
+  // with a sane compiler, because vld1 already zeroes the top half of the
+  // vector as part of the load.
+  const uint16x4_t pixels = vld1_u16(left);
+  const uint16x4_t zeros = vdup_n_u16(0);
+  return vcombine_u16(pixels, zeros);
+}
+
+// Loads eight edge pixels. A single vector is already its own partial sum;
+// the helper exists so w=8 / h=8 need no special casing.
+static INLINE uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
+  const uint16x8_t pixels = vld1q_u16(left);
+  return pixels;
+}
+
+// Loads 16 edge pixels and reduces them lane-wise to one 8-lane partial sum.
+static INLINE uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
+  const uint16x8_t first = vld1q_u16(left);       // up to 12 bits
+  const uint16x8_t second = vld1q_u16(left + 8);
+  const uint16x8_t partial = vaddq_u16(first, second);  // up to 13 bits
+  return partial;
+}
+
+// Loads 32 edge pixels and reduces them lane-wise to one 8-lane partial sum
+// by adding the partial sums of the two 16-pixel halves.
+static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
+  const uint16x8_t first16 = highbd_dc_load_partial_sum_16(left);  // 13 bits
+  const uint16x8_t second16 = highbd_dc_load_partial_sum_16(left + 16);
+  return vaddq_u16(first16, second16);  // up to 14 bits
+}
+
+// Loads 64 edge pixels and reduces them lane-wise to one 8-lane partial sum
+// by adding the partial sums of the two 32-pixel halves.
+static INLINE uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
+  const uint16x8_t first32 = highbd_dc_load_partial_sum_32(left);  // 14 bits
+  const uint16x8_t second32 = highbd_dc_load_partial_sum_32(left + 32);
+  return vaddq_u16(first32, second32);  // up to 15 bits
+}
+
+// Defines aom_highbd_dc_predictor_<w>x<h>_neon(). The generated function
+// adds the lane-wise partial sums of the above and left edges, reduces them
+// to a single 32-bit total, applies a rounding right shift by `shift`, and
+// fills the w x h block with the result. bd is unused. The store always
+// uses an 8-lane vector, so the macro is only suitable for w >= 8.
+#define HIGHBD_DC_PREDICTOR(w, h, shift) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \
+ const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \
+ const uint32x4_t sum = \
+ horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
+ }
+
+// DC predictor for 4x4 blocks: fills the block with the rounded average of
+// the four above and four left pixels. bd is unused.
+void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  // The rectangular cases widen the shorter edge to uint16x8 in order to
+  // accumulate. With 4x4 there is no shorter edge to widen, so the whole
+  // computation is done in uint16x4 instead.
+  (void)bd;
+  const uint16x4_t top = vld1_u16(above);  // up to 12 bits
+  const uint16x4_t side = vld1_u16(left);
+  const uint16x4_t pairs = vpadd_u16(top, side);      // up to 13 bits
+  const uint16x4_t halves = vpadd_u16(pairs, pairs);  // up to 14 bits
+  // After the third pairwise add every lane holds the sum of all 8 pixels.
+  const uint16x4_t total = vpadd_u16(halves, halves);
+  // Rounding shift by 3 divides by the 8 contributing pixels.
+  highbd_dc_store_4xh(dst, stride, 4, vrshr_n_u16(total, 3));
+}
+
+// Square blocks: the shift is log2(w + h), e.g. 8 + 8 = 16 = 1 << 4.
+HIGHBD_DC_PREDICTOR(8, 8, 4)
+HIGHBD_DC_PREDICTOR(16, 16, 5)
+HIGHBD_DC_PREDICTOR(32, 32, 6)
+HIGHBD_DC_PREDICTOR(64, 64, 7)
+
+#undef HIGHBD_DC_PREDICTOR
+
+// Computes ((num >> shift1) * multiplier) >> shift2 using int arithmetic.
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier, int shift2) {
+  const int reduced = num >> shift1;
+  const int scaled = reduced * multiplier;
+  return scaled >> shift2;
+}
+
+// Fixed-point reciprocals scaled by 2^HIGHBD_DC_SHIFT2 (2^17 = 131072):
+// 0xAAAB = 43691 ~= 131072 / 3 and 0x6667 = 26215 ~= 131072 / 5.
+// They are used for blocks whose sides are in a 1:2 and 1:4 ratio
+// respectively, where w + h is 3 or 5 times a power of two.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+#define HIGHBD_DC_SHIFT2 17
+
+// Returns the DC value for a bw x bh rectangular block from the total edge
+// sum: adds the rounding term (bw + bh) / 2, shifts right by shift1 to
+// remove the power-of-two factor, then multiplies by the fixed-point
+// reciprocal and shifts right by HIGHBD_DC_SHIFT2.
+// NOTE(review): multiplier is uint32_t here but is passed to an int
+// parameter of divide_using_multiply_shift; the values used in this file
+// fit in int.
+static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
+ uint32_t multiplier) {
+ return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
+ HIGHBD_DC_SHIFT2);
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+// Defines aom_highbd_dc_predictor_<w>x<h>_neon() for non-square blocks.
+// The generated function adds the lane-wise partial sums of both edges in
+// 16-bit lanes, reduces them to a scalar with horizontal_add_u16x8, turns
+// the sum into the DC value with highbd_dc_predictor_rect(), and fills the
+// block. bd is unused.
+//   q     - empty for 4-wide blocks (4-lane vdup_n_u16), `q` otherwise
+//           (8-lane vdupq_n_u16), matching the store helper's vector type.
+//   shift - first-stage right shift passed to highbd_dc_predictor_rect.
+//   mult  - fixed-point reciprocal passed to highbd_dc_predictor_rect.
+#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
+ uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
+ uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
+ int sum = horizontal_add_u16x8(sum_vec); \
+ int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
+ }
+
+// In every instantiation below, shift is log2 of the shorter side, and the
+// multiplier is _1X2 when the sides differ by a factor of 2 and _1X4 when
+// they differ by a factor of 4.
+HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+
+#undef HIGHBD_DC_PREDICTOR_RECT
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// Defines aom_highbd_dc_128_predictor_<w>x<h>_neon(). The generated function
+// ignores both edges and fills the w x h block with 0x80 << (bd - 8), which
+// equals 1 << (bd - 1), i.e. half of the 2^bd value range.
+//   q - empty for 4-wide blocks (4-lane vdup_n_u16), `q` otherwise (8-lane
+//       vdupq_n_u16), matching the store helper's vector type.
+// bd is read by the fill value, so it is deliberately not cast to void.
+#define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
+  void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    (void)above;                                                \
+    (void)left;                                                 \
+    highbd_dc_store_##w##xh(dst, stride, (h),                   \
+                            vdup##q##_n_u16(0x80 << (bd - 8))); \
+  }
+
+HIGHBD_DC_PREDICTOR_128(4, 4, )
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+
+#undef HIGHBD_DC_PREDICTOR_128
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// Sums four edge pixels. The total ends up in the two low 32-bit lanes of
+// the result; the two high lanes are zero.
+static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
+  const uint16x4_t pixels = vld1_u16(left);                // up to 12 bits
+  const uint16x4_t pair_sums = vpadd_u16(pixels, pixels);  // up to 13 bits
+  const uint32x2_t total = vpaddl_u16(pair_sums);
+  return vcombine_u32(total, vdup_n_u32(0));
+}
+
+// Sums eight edge pixels and broadcasts the 32-bit total to every lane.
+static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
+  const uint16x8_t pixels = vld1q_u16(left);
+  return horizontal_add_and_broadcast_long_u16x8(pixels);
+}
+
+// Sums 16 edge pixels and broadcasts the 32-bit total to every lane.
+static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
+  const uint16x8_t partial = highbd_dc_load_partial_sum_16(left);
+  return horizontal_add_and_broadcast_long_u16x8(partial);
+}
+
+// Sums 32 edge pixels and broadcasts the 32-bit total to every lane.
+static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
+  const uint16x8_t partial = highbd_dc_load_partial_sum_32(left);
+  return horizontal_add_and_broadcast_long_u16x8(partial);
+}
+
+// Sums 64 edge pixels and broadcasts the 32-bit total to every lane.
+static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
+  const uint16x8_t partial = highbd_dc_load_partial_sum_64(left);
+  return horizontal_add_and_broadcast_long_u16x8(partial);
+}
+
+// Defines aom_highbd_dc_left_predictor_<w>x<h>_neon(). The generated
+// function sums the h left-edge pixels, applies a rounding right shift by
+// `shift`, and fills the w x h block with lane 0 of the result. above and bd
+// are unused.
+//   q - empty for 4-wide blocks (vdup_lane_u16), `q` otherwise
+//       (vdupq_lane_u16), matching the store helper's vector type.
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_highbd_dc_left_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ const uint32x4_t sum = highbd_dc_load_sum_##h(left); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+// In every instantiation below, shift is log2(h).
+DC_PREDICTOR_LEFT(4, 4, 2, )
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(8, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 8, 3, q)
+DC_PREDICTOR_LEFT(8, 16, 4, q)
+DC_PREDICTOR_LEFT(8, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 16, 4, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(64, 64, 6, q)
+
+#undef DC_PREDICTOR_LEFT
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// Defines aom_highbd_dc_top_predictor_<w>x<h>_neon(). The generated function
+// sums the w above-edge pixels, applies a rounding right shift by `shift`,
+// and fills the w x h block with lane 0 of the result. left and bd are
+// unused.
+//   q - empty for 4-wide blocks (vdup_lane_u16), `q` otherwise
+//       (vdupq_lane_u16), matching the store helper's vector type.
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_highbd_dc_top_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ (void)left; \
+ const uint32x4_t sum = highbd_dc_load_sum_##w(above); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+// In every instantiation below, shift is log2(w).
+DC_PREDICTOR_TOP(4, 4, 2, )
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, q)
+DC_PREDICTOR_TOP(8, 8, 3, q)
+DC_PREDICTOR_TOP(8, 16, 3, q)
+DC_PREDICTOR_TOP(8, 32, 3, q)
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 16, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 32, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+DC_PREDICTOR_TOP(64, 64, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+// Generates aom_highbd_v_predictor_<BW>x<BH>_neon(): a thin wrapper that
+// forwards to vertical<BW>xh_neon() with the block height. `left` and `bd`
+// are not used.
+#define HIGHBD_V_NXM(BW, BH) \
+  void aom_highbd_v_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    (void)left; \
+    vertical##BW##xh_neon(dst, stride, above, BH); \
+  }
+
+// Loads 16 consecutive uint16_t values starting at `ptr` into a pair of
+// 128-bit vectors (val[0] = ptr[0..7], val[1] = ptr[8..15]).
+static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
+  // Two adjacent vld1q loads: Clang/gcc uses ldp here.
+  const uint16x8x2_t pair = { { vld1q_u16(ptr), vld1q_u16(ptr + 8) } };
+  return pair;
+}
+
+// Stores the two 128-bit vectors of `x` to ptr[0..7] and ptr[8..15].
+static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
+  uint16_t *const second_half = ptr + 8;
+  vst1q_u16(ptr, x.val[0]);
+  vst1q_u16(second_half, x.val[1]);
+}
+
+// Copies the 4 pixels at `above` into every row of a 4 x height block.
+// Rows are written two at a time, so the loop only terminates for an even,
+// non-zero `height`.
+static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *const above, int height) {
+  const uint16x4_t top = vld1_u16(above);
+  int rows_left = height;
+  do {
+    uint16_t *const second_row = dst + stride;
+    vst1_u16(dst, top);
+    vst1_u16(second_row, top);
+    dst += stride << 1;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Copies the 8 pixels at `above` into every row of an 8 x height block.
+// Rows are written two at a time, so the loop only terminates for an even,
+// non-zero `height`.
+static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *const above, int height) {
+  const uint16x8_t top = vld1q_u16(above);
+  int rows_left = height;
+  do {
+    uint16_t *const second_row = dst + stride;
+    vst1q_u16(dst, top);
+    vst1q_u16(second_row, top);
+    dst += stride << 1;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Copies the 16 pixels at `above` into every row of a 16 x height block.
+// Rows are written two at a time, so the loop only terminates for an even,
+// non-zero `height`.
+static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  const uint16x8x2_t top = load_uint16x8x2(above);
+  int rows_left = height;
+  do {
+    uint16_t *const second_row = dst + stride;
+    store_uint16x8x2(dst, top);
+    store_uint16x8x2(second_row, top);
+    dst += stride << 1;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Loads 32 consecutive uint16_t values starting at `ptr` into four 128-bit
+// vectors (val[i] = ptr[8 * i .. 8 * i + 7]).
+static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
+  // Adjacent vld1q loads: Clang/gcc uses ldp here.
+  const uint16x8x4_t quad = { { vld1q_u16(ptr), vld1q_u16(ptr + 8),
+                                vld1q_u16(ptr + 16), vld1q_u16(ptr + 24) } };
+  return quad;
+}
+
+// Stores the four 128-bit vectors of `x` to 32 consecutive uint16_t values
+// starting at `ptr`, in ascending address order.
+static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
+  for (int i = 0; i < 4; ++i) {
+    vst1q_u16(ptr + 8 * i, x.val[i]);
+  }
+}
+
+// Copies the 32 pixels at `above` into every row of a 32 x height block.
+// Rows are written two at a time, so the loop only terminates for an even,
+// non-zero `height`.
+static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  const uint16x8x4_t top = load_uint16x8x4(above);
+  int rows_left = height;
+  do {
+    uint16_t *const second_row = dst + stride;
+    store_uint16x8x4(dst, top);
+    store_uint16x8x4(second_row, top);
+    dst += stride << 1;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Copies the 64 pixels at `above` into every row of a 64 x height block.
+// The row is held as two 32-pixel halves. Rows are written two at a time, so
+// the loop only terminates for an even, non-zero `height`.
+static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const above, int height) {
+  const uint16x8x4_t top_lo = load_uint16x8x4(above);
+  const uint16x8x4_t top_hi = load_uint16x8x4(above + 32);
+  int rows_left = height;
+  do {
+    uint16_t *const second_row = dst + stride;
+    store_uint16x8x4(dst, top_lo);
+    store_uint16x8x4(dst + 32, top_hi);
+    store_uint16x8x4(second_row, top_lo);
+    store_uint16x8x4(second_row + 32, top_hi);
+    dst += stride << 1;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+HIGHBD_V_NXM(4, 4)
+HIGHBD_V_NXM(4, 8)
+HIGHBD_V_NXM(4, 16)
+
+HIGHBD_V_NXM(8, 4)
+HIGHBD_V_NXM(8, 8)
+HIGHBD_V_NXM(8, 16)
+HIGHBD_V_NXM(8, 32)
+
+HIGHBD_V_NXM(16, 4)
+HIGHBD_V_NXM(16, 8)
+HIGHBD_V_NXM(16, 16)
+HIGHBD_V_NXM(16, 32)
+HIGHBD_V_NXM(16, 64)
+
+HIGHBD_V_NXM(32, 8)
+HIGHBD_V_NXM(32, 16)
+HIGHBD_V_NXM(32, 32)
+HIGHBD_V_NXM(32, 64)
+
+HIGHBD_V_NXM(64, 16)
+HIGHBD_V_NXM(64, 32)
+HIGHBD_V_NXM(64, 64)
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+// Writes a 4x4 block: row r is filled with lane r of `left`.
+static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                      uint16x4_t left) {
+  uint16_t *row = dst;
+  vst1_u16(row, vdup_lane_u16(left, 0));
+  row += stride;
+  vst1_u16(row, vdup_lane_u16(left, 1));
+  row += stride;
+  vst1_u16(row, vdup_lane_u16(left, 2));
+  row += stride;
+  vst1_u16(row, vdup_lane_u16(left, 3));
+}
+
+// Writes an 8x4 block: row r is filled with lane r of `left`.
+static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
+                                      uint16x4_t left) {
+  uint16_t *row = dst;
+  vst1q_u16(row, vdupq_lane_u16(left, 0));
+  row += stride;
+  vst1q_u16(row, vdupq_lane_u16(left, 1));
+  row += stride;
+  vst1q_u16(row, vdupq_lane_u16(left, 2));
+  row += stride;
+  vst1q_u16(row, vdupq_lane_u16(left, 3));
+}
+
+// Writes the 8-lane vector `left` twice, covering dst[0..15].
+static INLINE void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
+  for (int x = 0; x < 16; x += 8) {
+    vst1q_u16(dst + x, left);
+  }
+}
+
+// Writes a 16x4 block: row r is filled with lane r of `left`.
+static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  uint16_t *row = dst;
+  highbd_h_store_16x1(row, vdupq_lane_u16(left, 0));
+  row += stride;
+  highbd_h_store_16x1(row, vdupq_lane_u16(left, 1));
+  row += stride;
+  highbd_h_store_16x1(row, vdupq_lane_u16(left, 2));
+  row += stride;
+  highbd_h_store_16x1(row, vdupq_lane_u16(left, 3));
+}
+
+// Writes the 8-lane vector `left` four times, covering dst[0..31].
+static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
+  for (int x = 0; x < 32; x += 8) {
+    vst1q_u16(dst + x, left);
+  }
+}
+
+// Writes a 32x4 block: row r is filled with lane r of `left`.
+static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  uint16_t *row = dst;
+  highbd_h_store_32x1(row, vdupq_lane_u16(left, 0));
+  row += stride;
+  highbd_h_store_32x1(row, vdupq_lane_u16(left, 1));
+  row += stride;
+  highbd_h_store_32x1(row, vdupq_lane_u16(left, 2));
+  row += stride;
+  highbd_h_store_32x1(row, vdupq_lane_u16(left, 3));
+}
+
+// Writes the 8-lane vector `left` eight times, covering dst[0..63].
+static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
+  for (int x = 0; x < 64; x += 8) {
+    vst1q_u16(dst + x, left);
+  }
+}
+
+// Writes a 64x4 block: row r is filled with lane r of `left`.
+static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  uint16_t *row = dst;
+  highbd_h_store_64x1(row, vdupq_lane_u16(left, 0));
+  row += stride;
+  highbd_h_store_64x1(row, vdupq_lane_u16(left, 1));
+  row += stride;
+  highbd_h_store_64x1(row, vdupq_lane_u16(left, 2));
+  row += stride;
+  highbd_h_store_64x1(row, vdupq_lane_u16(left, 3));
+}
+
+// Horizontal predictor, 4x4: every row is filled with its left neighbour.
+// `above` and `bd` are unused.
+void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t left_px = vld1_u16(left);
+  (void)bd;
+  (void)above;
+  highbd_h_store_4x4(dst, stride, left_px);
+}
+
+// Horizontal predictor, 4x8: every row is filled with its left neighbour.
+// The block is written as two 4x4 halves. `above` and `bd` are unused.
+void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t left_px = vld1q_u16(left);
+  uint16_t *const lower_half = dst + 4 * stride;
+  (void)bd;
+  (void)above;
+  highbd_h_store_4x4(dst, stride, vget_low_u16(left_px));
+  highbd_h_store_4x4(lower_half, stride, vget_high_u16(left_px));
+}
+
+// Horizontal predictor, 8x4: every row is filled with its left neighbour.
+// `above` and `bd` are unused.
+void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t left_px = vld1_u16(left);
+  (void)bd;
+  (void)above;
+  highbd_h_store_8x4(dst, stride, left_px);
+}
+
+// Horizontal predictor, 8x8: every row is filled with its left neighbour.
+// The block is written as two 8x4 halves. `above` and `bd` are unused.
+void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t left_px = vld1q_u16(left);
+  uint16_t *const lower_half = dst + 4 * stride;
+  (void)bd;
+  (void)above;
+  highbd_h_store_8x4(dst, stride, vget_low_u16(left_px));
+  highbd_h_store_8x4(lower_half, stride, vget_high_u16(left_px));
+}
+
+// Horizontal predictor, 16x4: every row is filled with its left neighbour.
+// `above` and `bd` are unused.
+void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x4_t left_px = vld1_u16(left);
+  (void)bd;
+  (void)above;
+  highbd_h_store_16x4(dst, stride, left_px);
+}
+
+// Horizontal predictor, 16x8: every row is filled with its left neighbour.
+// The block is written as two 16x4 halves. `above` and `bd` are unused.
+void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x8_t left_px = vld1q_u16(left);
+  uint16_t *const lower_half = dst + 4 * stride;
+  (void)bd;
+  (void)above;
+  highbd_h_store_16x4(dst, stride, vget_low_u16(left_px));
+  highbd_h_store_16x4(lower_half, stride, vget_high_u16(left_px));
+}
+
+// Horizontal predictor, 32x8: every row is filled with its left neighbour.
+// The block is written as two 32x4 halves. `above` and `bd` are unused.
+void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x8_t left_px = vld1q_u16(left);
+  uint16_t *const lower_half = dst + 4 * stride;
+  (void)bd;
+  (void)above;
+  highbd_h_store_32x4(dst, stride, vget_low_u16(left_px));
+  highbd_h_store_32x4(lower_half, stride, vget_high_u16(left_px));
+}
+
+// Generates aom_highbd_h_predictor_<W>x<H>_neon() for heights that are a
+// multiple of 16. Each pass handles 16 rows: the left column is read with a
+// pair of adjacent 8-lane loads (to get LDP instructions) and written out
+// through four highbd_h_store_<W>x4() calls. `above` and `bd` are unused.
+#define HIGHBD_H_WXH_LARGE(W, H) \
+  void aom_highbd_h_predictor_##W##x##H##_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    (void)above; \
+    for (int rows_done = 0; rows_done < (H); rows_done += 16) { \
+      const uint16x8_t left_lo = vld1q_u16(left); \
+      const uint16x8_t left_hi = vld1q_u16(left + 8); \
+      highbd_h_store_##W##x4(dst, stride, vget_low_u16(left_lo)); \
+      highbd_h_store_##W##x4(dst + 4 * stride, stride, \
+                             vget_high_u16(left_lo)); \
+      highbd_h_store_##W##x4(dst + 8 * stride, stride, \
+                             vget_low_u16(left_hi)); \
+      highbd_h_store_##W##x4(dst + 12 * stride, stride, \
+                             vget_high_u16(left_hi)); \
+      dst += 16 * stride; \
+      left += 16; \
+    } \
+  }
+
+HIGHBD_H_WXH_LARGE(4, 16)
+HIGHBD_H_WXH_LARGE(8, 16)
+HIGHBD_H_WXH_LARGE(8, 32)
+HIGHBD_H_WXH_LARGE(16, 16)
+HIGHBD_H_WXH_LARGE(16, 32)
+HIGHBD_H_WXH_LARGE(16, 64)
+HIGHBD_H_WXH_LARGE(32, 16)
+HIGHBD_H_WXH_LARGE(32, 32)
+HIGHBD_H_WXH_LARGE(32, 64)
+HIGHBD_H_WXH_LARGE(64, 16)
+HIGHBD_H_WXH_LARGE(64, 32)
+HIGHBD_H_WXH_LARGE(64, 64)
+
+#undef HIGHBD_H_WXH_LARGE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
+// Paeth predictor for blocks that are 4 or 8 pixels wide.
+//
+// For every output pixel the candidate (left, top or top-left neighbour) with
+// the smallest distance is selected, with ties resolved in the order left,
+// top, top-left. With base = top + left - top_left the three distances are
+//   |base - left|     == |top - top_left|             (left_dist)
+//   |base - top|      == |left - top_left|            (top_dist)
+//   |base - top_left| == |top + left - 2 * top_left|  (top_left_dist)
+// which is how they are evaluated below.
+//
+//   dest, stride - output block and its row pitch in uint16_t elements.
+//   top_row      - row above the block; top_row[-1] is the top-left pixel.
+//   left_column  - column to the left of the block, `height` entries.
+//   width        - 4 or 8; any value other than 4 takes the 8-wide path.
+//   height       - number of rows to produce.
+//
+// NOTE(review): the sums top + left and 2 * top_left are kept in 16-bit lanes,
+// so this presumably relies on pixel values of at most 15 bits - confirm.
+static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint16x8_t top;
+ if (width == 4) {
+ // Only 4 pixels are loaded; the upper lanes are zero and never stored.
+ top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
+ } else { // width == 8
+ top = vld1q_u16(top_row);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+
+ const uint16x8_t left_dist = vabdq_u16(top, top_left);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ vst1_u16(dest, vget_low_u16(result));
+ } else { // width == 8
+ vst1q_u16(dest, result);
+ }
+ dest += stride;
+ }
+}
+
+// Generates aom_highbd_paeth_predictor_<W>x<H>_neon() for the 4- and 8-wide
+// block sizes by forwarding to highbd_paeth_4or8_x_h_neon(). `bd` is unused.
+#define HIGHBD_PAETH_NXM(W, H) \
+  void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+HIGHBD_PAETH_NXM(4, 4)
+HIGHBD_PAETH_NXM(4, 8)
+HIGHBD_PAETH_NXM(4, 16)
+HIGHBD_PAETH_NXM(8, 4)
+HIGHBD_PAETH_NXM(8, 8)
+HIGHBD_PAETH_NXM(8, 16)
+HIGHBD_PAETH_NXM(8, 32)
+
+// The generator macro is not needed past this point; drop it like the other
+// predictor generator macros in this file.
+#undef HIGHBD_PAETH_NXM
+
+// Picks, per lane, the Paeth candidate from the three precomputed comparison
+// masks:
+//   left      where left_le_top && left_le_top_left,
+//   top       otherwise where top_le_top_left,
+//   top_left  in all remaining lanes.
+static INLINE uint16x8_t select_paeth(const uint16x8_t top,
+                                      const uint16x8_t left,
+                                      const uint16x8_t top_left,
+                                      const uint16x8_t left_le_top,
+                                      const uint16x8_t left_le_top_left,
+                                      const uint16x8_t top_le_top_left) {
+  // Lanes where left is at least as close as both other candidates.
+  const uint16x8_t pick_left = vandq_u16(left_le_top, left_le_top_left);
+  // Lanes that resolve to either left or top; everything else is top_left.
+  const uint16x8_t pick_left_or_top = vorrq_u16(pick_left, top_le_top_left);
+  // Provisional choice between left and top. Lanes that should really be
+  // top_left hold 'top' here and are replaced by the final select.
+  const uint16x8_t left_or_top = vbslq_u16(pick_left, left, top);
+  return vbslq_u16(pick_left_or_top, left_or_top, top_left);
+}
+
+// Computes and stores the Paeth prediction for the 8 pixels at column offset
+// 8 * num of the current row. The macro reads these names from the enclosing
+// scope: top[], top_left, top_left_x2, left, top_dist and dest.
+// The argument is parenthesised so that an expression such as `i + 1` can be
+// passed safely.
+#define PAETH_PREDICTOR(num) \
+  do { \
+    const uint16x8_t left_dist = vabdq_u16(top[(num)], top_left); \
+    const uint16x8_t top_left_dist = \
+        vabdq_u16(vaddq_u16(top[(num)], left), top_left_x2); \
+    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \
+    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \
+    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \
+    const uint16x8_t result = \
+        select_paeth(top[(num)], left, top_left, left_le_top, \
+                     left_le_top_left, top_le_top_left); \
+    vst1q_u16(dest + ((num) * 8), result); \
+  } while (0)
+
+// Loads the 8 pixels at column offset 8 * num from `top_row`, which must be
+// in scope at the point of use. The argument is parenthesised so that an
+// expression can be passed safely.
+#define LOAD_TOP_ROW(num) vld1q_u16(top_row + ((num) * 8))
+
+// Paeth predictor for blocks that are 16, 32 or 64 pixels wide.
+//
+// The top row is loaded once into up to eight 8-lane vectors; each output row
+// is then produced 8 pixels at a time through PAETH_PREDICTOR(), which reads
+// the locals top[], top_left, top_left_x2, left, top_dist and dest.
+//
+//   dest, stride - output block and its row pitch in uint16_t elements.
+//   top_row      - row above the block; top_row[-1] is the top-left pixel.
+//   left_column  - column to the left of the block, `height` entries.
+//   width        - 16 handles two vectors per row, any larger value four, and
+//                  exactly 64 all eight.
+//   height       - number of rows to produce.
+static INLINE void highbd_paeth16_plus_x_h_neon(
+ uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
+ const uint16_t *const left_column, int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ // Only the first width / 8 entries are initialised and used.
+ uint16x8_t top[8];
+ top[0] = LOAD_TOP_ROW(0);
+ top[1] = LOAD_TOP_ROW(1);
+ if (width > 16) {
+ top[2] = LOAD_TOP_ROW(2);
+ top[3] = LOAD_TOP_ROW(3);
+ if (width == 64) {
+ top[4] = LOAD_TOP_ROW(4);
+ top[5] = LOAD_TOP_ROW(5);
+ top[6] = LOAD_TOP_ROW(6);
+ top[7] = LOAD_TOP_ROW(7);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+ // |left - top_left| is the same for every column of this row.
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ PAETH_PREDICTOR(0);
+ PAETH_PREDICTOR(1);
+ if (width > 16) {
+ PAETH_PREDICTOR(2);
+ PAETH_PREDICTOR(3);
+ if (width == 64) {
+ PAETH_PREDICTOR(4);
+ PAETH_PREDICTOR(5);
+ PAETH_PREDICTOR(6);
+ PAETH_PREDICTOR(7);
+ }
+ }
+ dest += stride;
+ }
+}
+
+// Generates aom_highbd_paeth_predictor_<W>x<H>_neon() for block widths of 16
+// and above by forwarding to highbd_paeth16_plus_x_h_neon(). `bd` is unused.
+#define HIGHBD_PAETH_NXM_WIDE(W, H) \
+  void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+HIGHBD_PAETH_NXM_WIDE(16, 4)
+HIGHBD_PAETH_NXM_WIDE(16, 8)
+HIGHBD_PAETH_NXM_WIDE(16, 16)
+HIGHBD_PAETH_NXM_WIDE(16, 32)
+HIGHBD_PAETH_NXM_WIDE(16, 64)
+HIGHBD_PAETH_NXM_WIDE(32, 8)
+HIGHBD_PAETH_NXM_WIDE(32, 16)
+HIGHBD_PAETH_NXM_WIDE(32, 32)
+HIGHBD_PAETH_NXM_WIDE(32, 64)
+HIGHBD_PAETH_NXM_WIDE(64, 16)
+HIGHBD_PAETH_NXM_WIDE(64, 32)
+HIGHBD_PAETH_NXM_WIDE(64, 64)
+
+// The generator macro is not needed past this point; drop it like the other
+// predictor generator macros in this file.
+#undef HIGHBD_PAETH_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH
+
+// Negates every byte of `v`. For a 16-bit lane holding a value in [1, 255]
+// the high byte is zero and stays zero, while the low byte becomes
+// (256 - v) mod 256, so the lane as a whole reads 256 - v.
+static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
+  const int8x8_t as_bytes = vreinterpret_s8_u16(v);
+  const int8x8_t negated = vneg_s8(as_bytes);
+  return vreinterpret_u16_s8(negated);
+}
+
+// Smooth predictor for blocks 4 pixels wide. Every output pixel is a weighted
+// blend of the pixel above, the pixel to the left, the top-right pixel
+// (top_row[3]) and the bottom-left pixel (left_column[height - 1]). Weights
+// come from smooth_weights_u16 and the sum is reduced with a rounding shift by
+// SMOOTH_WEIGHT_LOG2_SCALE + 1.
+static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *const top_row,
+                                          const uint16_t *const left_column,
+                                          const int height) {
+  // The row weights for this height start at offset height - 4 in the table;
+  // the column weights for width 4 start at offset 0.
+  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
+  const uint16x4_t col_weights = vld1_u16(smooth_weights_u16);
+  const uint16x4_t top = vld1_u16(top_row);
+  const uint16x4_t bl_v = vdup_n_u16(left_column[height - 1]);
+
+  // (256 - col_weight) * top_right does not depend on the row.
+  const uint32x4_t tr_term = vmull_n_u16(negate_s8(col_weights), top_row[3]);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint16_t w_y = row_weights[y];
+    // Accumulate the top, left and bottom-left contributions in turn.
+    uint32x4_t acc = vmlal_n_u16(tr_term, top, w_y);
+    acc = vmlal_n_u16(acc, col_weights, left_column[y]);
+    acc = vmlal_n_u16(acc, bl_v, 256 - w_y);
+    vst1_u16(row, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE + 1));
+  }
+}
+
+// Shared by the 8xH and [16|32|64]xH smooth predictors: finishes and stores
+// 8 output pixels of one row.
+//   weighted_corners_low/high - precomputed bottom-left + top-right terms for
+//                               pixels 0..3 and 4..7.
+//   top_vals, weights_x       - pixels above and column weights, split into
+//                               low (val[0]) and high (val[1]) halves.
+//   left_y, weight_y          - left neighbour and row weight of this row.
+static INLINE void highbd_calculate_pred8(
+    uint16_t *dst, const uint32x4_t weighted_corners_low,
+    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
+    const uint16x4x2_t weights_x, const uint16_t left_y,
+    const uint16_t weight_y) {
+  // Pixels 0..3: add top * weight_y, then weights_x * left_y.
+  uint32x4_t acc_lo =
+      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+  acc_lo = vmlal_n_u16(acc_lo, weights_x.val[0], left_y);
+  vst1_u16(dst, vrshrn_n_u32(acc_lo, SMOOTH_WEIGHT_LOG2_SCALE + 1));
+
+  // Pixels 4..7: same accumulation on the high halves.
+  uint32x4_t acc_hi =
+      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+  acc_hi = vmlal_n_u16(acc_hi, weights_x.val[1], left_y);
+  vst1_u16(dst + 4, vrshrn_n_u32(acc_hi, SMOOTH_WEIGHT_LOG2_SCALE + 1));
+}
+
+// Smooth predictor for blocks 8 pixels wide. The top-right (top_row[7]) and
+// bottom-left (left_column[height - 1]) contributions are combined per row
+// and handed to highbd_calculate_pred8(), which adds the top and left terms
+// and stores the 8 output pixels.
+static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *const top_row,
+                                   const uint16_t *const left_column,
+                                   const int height) {
+  // Row weights start at offset height - 4; the column weights for width 8
+  // occupy table entries 4..11.
+  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
+  const uint16x4x2_t col_weights = { { vld1_u16(smooth_weights_u16 + 4),
+                                       vld1_u16(smooth_weights_u16 + 8) } };
+  const uint16x4x2_t top = { { vld1_u16(top_row), vld1_u16(top_row + 4) } };
+  const uint16x4_t bl_v = vdup_n_u16(left_column[height - 1]);
+  const uint16_t tr = top_row[7];
+
+  // (256 - col_weight) * top_right does not depend on the row.
+  const uint32x4_t tr_term_lo = vmull_n_u16(negate_s8(col_weights.val[0]), tr);
+  const uint32x4_t tr_term_hi = vmull_n_u16(negate_s8(col_weights.val[1]), tr);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint16_t w_y = row_weights[y];
+    const uint32x4_t bl_term = vmull_n_u16(bl_v, 256 - w_y);
+    highbd_calculate_pred8(row, vaddq_u32(bl_term, tr_term_lo),
+                           vaddq_u32(bl_term, tr_term_hi), top, col_weights,
+                           left_column[y], w_y);
+  }
+}
+
+// Generates aom_highbd_smooth_predictor_<BW>x<BH>_neon() for the 4- and
+// 8-wide block sizes by forwarding to highbd_smooth_<BW>xh_neon() with the
+// block height. `bd` is unused.
+#define HIGHBD_SMOOTH_NXM(BW, BH) \
+  void aom_highbd_smooth_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_NXM(4, 4)
+HIGHBD_SMOOTH_NXM(4, 8)
+HIGHBD_SMOOTH_NXM(8, 4)
+HIGHBD_SMOOTH_NXM(8, 8)
+HIGHBD_SMOOTH_NXM(4, 16)
+HIGHBD_SMOOTH_NXM(8, 16)
+HIGHBD_SMOOTH_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_NXM
+
+// For width 16 and above.
+//
+// Generates static highbd_smooth_<W>xh_neon(): the smooth predictor for a
+// block W pixels wide, processed 8 columns at a time.
+//  - weights_y points at smooth_weights_u16 + height - 4; the column weights
+//    are read from smooth_weights_u16 + W - 4.
+//  - The top-right term negate_s8(weights_x) * top_row[W - 1] is precomputed
+//    per 8-column group because it does not vary with y.
+//  - Per row, the bottom-left term left_column[height - 1] *
+//    (256 - weights_y[y]) is added to the top-right term, and
+//    highbd_calculate_pred8() accumulates the top and left contributions and
+//    stores 8 pixels.
+#define HIGHBD_SMOOTH_PREDICTOR(W) \
+ static void highbd_smooth_##W##xh_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t top_right = top_row[(W)-1]; \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ /* Precompute weighted values that don't vary with |y|. */ \
+ uint32x4_t weighted_tr_low[(W) >> 3]; \
+ uint32x4_t weighted_tr_high[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4_t weights_x_low = \
+ vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+ weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \
+ const uint16x4_t weights_x_high = \
+ vld1_u16(smooth_weights_u16 + (W) + x); \
+ weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \
+ vld1_u16(top_row + x + 4) } }; \
+ const uint32x4_t weighted_corners_low = \
+ vaddq_u32(weighted_bl, weighted_tr_low[i]); \
+ const uint32x4_t weighted_corners_high = \
+ vaddq_u32(weighted_bl, weighted_tr_high[i]); \
+ /* Accumulate weighted edge values and store. */ \
+ const uint16x4x2_t weights_x = { \
+ { vld1_u16(smooth_weights_u16 + (W)-4 + x), \
+ vld1_u16(smooth_weights_u16 + (W) + x) } \
+ }; \
+ highbd_calculate_pred8(dst_x, weighted_corners_low, \
+ weighted_corners_high, top_vals, weights_x, \
+ left_column[y], weights_y[y]); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_PREDICTOR(16)
+HIGHBD_SMOOTH_PREDICTOR(32)
+HIGHBD_SMOOTH_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_PREDICTOR
+
+// Generates aom_highbd_smooth_predictor_<BW>x<BH>_neon() for block widths of
+// 16 and above by forwarding to highbd_smooth_<BW>xh_neon() with the block
+// height. `bd` is unused.
+#define HIGHBD_SMOOTH_NXM_WIDE(BW, BH) \
+  void aom_highbd_smooth_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_NXM_WIDE
+
+// Vertical smooth predictor for blocks 4 pixels wide: each row blends the row
+// above with the bottom-left pixel (left_column[height - 1]) using that row's
+// weight, then applies a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE.
+static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *const top_row,
+                                     const uint16_t *const left_column,
+                                     const int height) {
+  // Row weights for this height start at offset height - 4 in the table.
+  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
+  const uint16x4_t bl_v = vdup_n_u16(left_column[height - 1]);
+  const uint16x4_t top = vld1_u16(top_row);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint16_t w_y = row_weights[y];
+    uint32x4_t acc = vmull_n_u16(bl_v, 256 - w_y);
+    acc = vmlal_n_u16(acc, top, w_y);
+    vst1_u16(row, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE));
+  }
+}
+
+// Vertical smooth predictor for blocks 8 pixels wide: each row blends the row
+// above with the bottom-left pixel (left_column[height - 1]) using that row's
+// weight, then applies a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE. The row
+// is processed as two 4-pixel halves.
+static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
+                                     const uint16_t *const top_row,
+                                     const uint16_t *const left_column,
+                                     const int height) {
+  // Row weights for this height start at offset height - 4 in the table.
+  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
+  const uint16x4_t bl_v = vdup_n_u16(left_column[height - 1]);
+  const uint16x4_t top_lo = vld1_u16(top_row);
+  const uint16x4_t top_hi = vld1_u16(top_row + 4);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint16_t w_y = row_weights[y];
+    // The bottom-left term is shared by both halves of the row.
+    const uint32x4_t bl_term = vmull_n_u16(bl_v, 256 - w_y);
+
+    const uint32x4_t sum_lo = vmlal_n_u16(bl_term, top_lo, w_y);
+    vst1_u16(row, vrshrn_n_u32(sum_lo, SMOOTH_WEIGHT_LOG2_SCALE));
+
+    const uint32x4_t sum_hi = vmlal_n_u16(bl_term, top_hi, w_y);
+    vst1_u16(row + 4, vrshrn_n_u32(sum_hi, SMOOTH_WEIGHT_LOG2_SCALE));
+  }
+}
+
+// Generates aom_highbd_smooth_v_predictor_<BW>x<BH>_neon() for the 4- and
+// 8-wide block sizes by forwarding to highbd_smooth_v_<BW>xh_neon() with the
+// block height. `bd` is unused.
+#define HIGHBD_SMOOTH_V_NXM(BW, BH) \
+  void aom_highbd_smooth_v_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_v_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_V_NXM(4, 4)
+HIGHBD_SMOOTH_V_NXM(4, 8)
+HIGHBD_SMOOTH_V_NXM(4, 16)
+HIGHBD_SMOOTH_V_NXM(8, 4)
+HIGHBD_SMOOTH_V_NXM(8, 8)
+HIGHBD_SMOOTH_V_NXM(8, 16)
+HIGHBD_SMOOTH_V_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_V_NXM
+
+// For width 16 and above.
+//
+// Generates static highbd_smooth_v_<W>xh_neon(): the vertical smooth predictor
+// for a block W pixels wide.
+//  - The top row is loaded once into W / 8 pairs of 4-lane vectors.
+//  - weights_y points at smooth_weights_u16 + height - 4.
+//  - Per row, left_column[height - 1] * (256 - weights_y[y]) is computed once,
+//    then top * weights_y[y] is accumulated onto it for each 4-pixel group and
+//    the result is stored after a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE.
+#define HIGHBD_SMOOTH_V_PREDICTOR(W) \
+ static void highbd_smooth_v_##W##xh_neon( \
+ uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ uint16x4x2_t top_vals[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ top_vals[i].val[0] = vld1_u16(top_row + x); \
+ top_vals[i].val[1] = vld1_u16(top_row + x + 4); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const uint32x4_t weighted_top_low = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \
+ vst1_u16(dst_x, \
+ vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+ const uint32x4_t weighted_top_high = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \
+ vst1_u16(dst_x + 4, \
+ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_V_PREDICTOR(16)
+HIGHBD_SMOOTH_V_PREDICTOR(32)
+HIGHBD_SMOOTH_V_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_V_PREDICTOR
+
+// Generates aom_highbd_smooth_v_predictor_<BW>x<BH>_neon() for block widths
+// of 16 and above by forwarding to highbd_smooth_v_<BW>xh_neon() with the
+// block height. `bd` is unused.
+#define HIGHBD_SMOOTH_V_NXM_WIDE(BW, BH) \
+  void aom_highbd_smooth_v_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_v_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_V_NXM_WIDE
+
+// Horizontal smooth predictor for blocks 4 pixels wide: each row blends its
+// left neighbour with the top-right pixel (top_row[3]) using the column
+// weights at the start of smooth_weights_u16, then applies a rounding shift
+// by SMOOTH_WEIGHT_LOG2_SCALE.
+static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *const top_row,
+                                            const uint16_t *const left_column,
+                                            const int height) {
+  const uint16x4_t col_weights = vld1_u16(smooth_weights_u16);
+  // (256 - col_weight) * top_right is identical for every row.
+  const uint32x4_t tr_term = vmull_n_u16(negate_s8(col_weights), top_row[3]);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint32x4_t sum = vmlal_n_u16(tr_term, col_weights, left_column[y]);
+    vst1_u16(row, vrshrn_n_u32(sum, SMOOTH_WEIGHT_LOG2_SCALE));
+  }
+}
+
+// Horizontal smooth predictor for blocks 8 pixels wide: each row blends its
+// left neighbour with the top-right pixel (top_row[7]) using the column
+// weights at smooth_weights_u16[4..11], then applies a rounding shift by
+// SMOOTH_WEIGHT_LOG2_SCALE. The row is processed as two 4-pixel halves.
+static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *const top_row,
+                                            const uint16_t *const left_column,
+                                            const int height) {
+  const uint16_t tr = top_row[7];
+  const uint16x4_t col_weights_lo = vld1_u16(smooth_weights_u16 + 4);
+  const uint16x4_t col_weights_hi = vld1_u16(smooth_weights_u16 + 8);
+
+  // (256 - col_weight) * top_right is identical for every row.
+  const uint32x4_t tr_term_lo = vmull_n_u16(negate_s8(col_weights_lo), tr);
+  const uint32x4_t tr_term_hi = vmull_n_u16(negate_s8(col_weights_hi), tr);
+
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += stride) {
+    const uint16_t left_px = left_column[y];
+
+    const uint32x4_t sum_lo = vmlal_n_u16(tr_term_lo, col_weights_lo, left_px);
+    vst1_u16(row, vrshrn_n_u32(sum_lo, SMOOTH_WEIGHT_LOG2_SCALE));
+
+    const uint32x4_t sum_hi = vmlal_n_u16(tr_term_hi, col_weights_hi, left_px);
+    vst1_u16(row + 4, vrshrn_n_u32(sum_hi, SMOOTH_WEIGHT_LOG2_SCALE));
+  }
+}
+
+// Generates aom_highbd_smooth_h_predictor_<BW>x<BH>_neon() for the 4- and
+// 8-wide block sizes by forwarding to highbd_smooth_h_<BW>xh_neon() with the
+// block height. `bd` is unused.
+#define HIGHBD_SMOOTH_H_NXM(BW, BH) \
+  void aom_highbd_smooth_h_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_h_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_H_NXM(4, 4)
+HIGHBD_SMOOTH_H_NXM(4, 8)
+HIGHBD_SMOOTH_H_NXM(4, 16)
+HIGHBD_SMOOTH_H_NXM(8, 4)
+HIGHBD_SMOOTH_H_NXM(8, 8)
+HIGHBD_SMOOTH_H_NXM(8, 16)
+HIGHBD_SMOOTH_H_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_H_NXM
+
+// For width 16 and above.
+//
+// Generates static highbd_smooth_h_<W>xh_neon(): the horizontal smooth
+// predictor for a block W pixels wide, processed 8 columns at a time.
+//  - The column weights are read from smooth_weights_u16 + W - 4 and cached,
+//    together with the row-invariant top-right term
+//    negate_s8(weights_x) * top_row[W - 1].
+//  - Per row, weights_x * left_column[y] is accumulated onto the cached term
+//    and stored after a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE.
+// The generated helpers are file-local (static), matching the SMOOTH and
+// SMOOTH_V generators, so they do not leak unprototyped external symbols.
+#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
+  static void highbd_smooth_h_##W##xh_neon( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+      const uint16_t *const left_column, const int height) { \
+    const uint16_t top_right = top_row[(W)-1]; \
+ \
+    uint16x4_t weights_x_low[(W) >> 3]; \
+    uint16x4_t weights_x_high[(W) >> 3]; \
+    uint32x4_t weighted_tr_low[(W) >> 3]; \
+    uint32x4_t weighted_tr_high[(W) >> 3]; \
+    for (int i = 0; i < (W) >> 3; ++i) { \
+      const int x = i << 3; \
+      weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+      weighted_tr_low[i] = \
+          vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \
+      weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \
+      weighted_tr_high[i] = \
+          vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \
+    } \
+ \
+    for (int y = 0; y < height; ++y) { \
+      uint16_t *dst_x = dst; \
+      const uint16_t left_y = left_column[y]; \
+      for (int i = 0; i < (W) >> 3; ++i) { \
+        const uint32x4_t weighted_left_low = \
+            vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \
+        vst1_u16(dst_x, \
+                 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+        const uint32x4_t weighted_left_high = \
+            vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \
+        vst1_u16(dst_x + 4, \
+                 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+        dst_x += 8; \
+      } \
+      dst += stride; \
+    } \
+  }
+
+HIGHBD_SMOOTH_H_PREDICTOR(16)
+HIGHBD_SMOOTH_H_PREDICTOR(32)
+HIGHBD_SMOOTH_H_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_H_PREDICTOR
+
+// Generates aom_highbd_smooth_h_predictor_<BW>x<BH>_neon() for block widths
+// of 16 and above by forwarding to highbd_smooth_h_<BW>xh_neon() with the
+// block height. `bd` is unused.
+#define HIGHBD_SMOOTH_H_NXM_WIDE(BW, BH) \
+  void aom_highbd_smooth_h_predictor_##BW##x##BH##_neon( \
+      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    (void)bd; \
+    highbd_smooth_h_##BW##xh_neon(dst, y_stride, above, left, BH); \
+  }
+
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// Z1
+
+// Lane-index tables (step 1 and step 2) used to build per-lane comparison
+// masks in the directional predictors. They are only ever read, so they are
+// declared const.
+static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
+
+// Blends two 4-lane vectors as (a0 * (64 - shift) + a1 * shift + 32) >> 6.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
+                                                               uint16x4_t a1,
+                                                               int shift) {
+  // The C implementation of the z1 predictor uses (32 - shift) and a right
+  // shift by 5. Here the weights are doubled and the final shift is by 6, so
+  // callers can pass a shift that has not been halved and skip that extra
+  // right shift by 1.
+  uint32x4_t blend = vmull_n_u16(a1, shift);
+  blend = vmlal_n_u16(blend, a0, 64 - shift);
+  return vrshrn_n_u32(blend, 6);
+}
+
+// 8-lane version of highbd_dr_z1_apply_shift_x4(): the low and high halves
+// are blended separately and recombined.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
+                                                               uint16x8_t a1,
+                                                               int shift) {
+  const uint16x4_t blended_lo =
+      highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift);
+  const uint16x4_t blended_hi =
+      highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift);
+  return vcombine_u16(blended_lo, blended_hi);
+}
+
+// Zone-1 directional predictor for the case where the above row has not been
+// upsampled.
+//
+// For row r the sub-pixel position is x = (r + 1) * dx in 1/64 units:
+// base = x >> 6 is the integer offset into `above` and the remaining bits
+// give the blend weight between above[base + i] and above[base + i + 1].
+// Output positions whose source index reaches max_base_x = bw + bh - 1 are
+// filled with above[max_base_x]; once base itself reaches max_base_x all
+// remaining rows are filled with that value and the function returns.
+//
+//   dst, stride - output block and its row pitch in uint16_t elements.
+//   bw, bh      - block width and height (asserted to be multiples of 4).
+//   above       - row above the block; indices up to max_base_x are read
+//                 directly.
+//   dx          - per-row step in 1/64 pixel units (asserted to be > 0).
+//
+// NOTE(review): the vector loads can start at above[base + c + 1] and span 8
+// lanes, which may extend past above[max_base_x]; presumably the caller's
+// buffer is padded accordingly - confirm.
+static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh,
+ const uint16_t *above,
+ int dx) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx > 0);
+
+ const int max_base_x = (bw + bh) - 1;
+ const int above_max = above[max_base_x];
+
+ // Lane indices 0..7 (0..3 for the 4-wide path) used to mask off lanes whose
+ // source index is at or beyond max_base_x.
+ const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
+ const int16x4_t iota1x4 = vget_low_s16(iota1x8);
+
+ int x = dx;
+ int r = 0;
+ do {
+ const int base = x >> 6;
+ if (base >= max_base_x) {
+ // Every remaining row lies entirely past the end of the edge: fill with
+ // the last edge pixel.
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above_max, bw);
+ dst += stride;
+ }
+ return;
+ }
+
+ // The C implementation of the z1 predictor when not upsampling uses:
+ // ((x & 0x3f) >> 1)
+ // The right shift is unnecessary here since we instead shift by +1 later,
+ // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
+ const int shift = x & 0x3e;
+
+ if (bw == 4) {
+ const uint16x4_t a0 = vld1_u16(&above[base]);
+ const uint16x4_t a1 = vld1_u16(&above[base + 1]);
+ const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
+ // Lane i is valid while base + i < max_base_x; other lanes get above_max.
+ const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
+ const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
+ vst1_u16(dst, res);
+ } else {
+ int c = 0;
+ do {
+ const uint16x8_t a0 = vld1q_u16(&above[base + c]);
+ const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
+ const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
+ // Lane i is valid while base + c + i < max_base_x; other lanes get
+ // above_max.
+ const uint16x8_t cmp =
+ vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
+ const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
+ vst1q_u16(dst + c, res);
+ c += 8;
+ } while (c < bw);
+ }
+
+ dst += stride;
+ x += dx;
+ } while (++r < bh);
+}
+
+// Zone-1 directional prediction for the case where `above` has been upsampled.
+// Positions are converted to indices with a shift of 5 instead of 6, and
+// horizontally adjacent output samples read `above` entries two apart, which
+// is why de-interleaving loads and the stride-two ramp are used.
+static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
+                                                   ptrdiff_t stride, int bw,
+                                                   int bh,
+                                                   const uint16_t *above,
+                                                   int dx) {
+  assert(bw % 4 == 0);
+  assert(bh % 4 == 0);
+  assert(dx > 0);
+
+  const int last_idx = ((bw + bh) - 1) << 1;
+  const int edge_px = above[last_idx];
+
+  const int16x8_t lane_q = vld1q_s16(iota2_s16);
+  const int16x4_t lane_d = vget_low_s16(lane_q);
+
+  int row = 0;
+  int pos = dx;
+  do {
+    const int first = pos >> 5;
+    if (first >= last_idx) {
+      // Every remaining row lies entirely past the end of the edge, so fill
+      // them with the final edge sample and stop.
+      for (int rows_left = bh - row; rows_left > 0; --rows_left) {
+        aom_memset16(dst, edge_px, bw);
+        dst += stride;
+      }
+      return;
+    }
+
+    // The C version uses (((x << 1) & 0x3f) >> 1) as the fractional weight.
+    // The blend helper works with doubled weights, so the right shift is
+    // skipped and the mask becomes 0x3e to discard the bit that the shift
+    // would have dropped.
+    const int frac = (pos << 1) & 0x3e;
+
+    if (bw != 4) {
+      int col = 0;
+      do {
+        // val[0] receives the even-offset samples and val[1] the odd-offset
+        // ones, i.e. each lane's sample and its right-hand neighbour.
+        const uint16x8x2_t pairs = vld2q_u16(&above[first + 2 * col]);
+        const uint16x8_t blended =
+            highbd_dr_z1_apply_shift_x8(pairs.val[0], pairs.val[1], frac);
+        // Lane i is kept only while (first + 2 * col + 2 * i) < last_idx.
+        const uint16x8_t in_range =
+            vcgtq_s16(vdupq_n_s16(last_idx - first - 2 * col), lane_q);
+        vst1q_u16(dst + col,
+                  vbslq_u16(in_range, blended, vdupq_n_u16(edge_px)));
+        col += 8;
+      } while (col < bw);
+    } else {
+      const uint16x4x2_t pairs = vld2_u16(&above[first]);
+      const uint16x4_t blended =
+          highbd_dr_z1_apply_shift_x4(pairs.val[0], pairs.val[1], frac);
+      // Lane i is kept only while (first + 2 * i) < last_idx.
+      const uint16x4_t in_range =
+          vcgt_s16(vdup_n_s16(last_idx - first), lane_d);
+      vst1_u16(dst, vbsl_u16(in_range, blended, vdup_n_u16(edge_px)));
+    }
+
+    dst += stride;
+    pos += dx;
+  } while (++row < bh);
+}
+
+// Directional prediction, zone 1: 0 < angle < 90.
+// Only `above` and `dx` drive the prediction; `left`, `dy` and `bd` are
+// accepted but unused (`dy` is asserted to be 1). The work is forwarded to
+// the helper matching `upsample_above`.
+void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+  (void)bd;
+  assert(dy == 1);
+
+  if (!upsample_above) {
+    highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
+    return;
+  }
+  highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
+}
+
+// -----------------------------------------------------------------------------
+// Z2
+
+#if AOM_ARCH_AARCH64
+// Byte-permute vectors for merging a four-lane result computed from `left`
+// (Y) with one computed from `above` (X). The row index is the number of
+// leading lanes taken from Y; the remaining lanes are filled with X starting
+// from X0:
+// X0, X1, X2, X3
+// Y0, X0, X1, X2
+// Y0, Y1, X0, X1
+// Y0, Y1, Y2, X0
+// Y0, Y1, Y2, Y3
+// The table lookup is given Y as its first register and X as its second, so
+// byte indices 0..7 select from Y and 8..15 select from X.
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
+ { 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 2, 3, 8, 9, 10, 11 },
+ { 0, 1, 2, 3, 4, 5, 8, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7 },
+};
+// clang-format on
+
+// Byte-permute vectors for merging an eight-lane result computed from `left`
+// (Y) with one computed from `above` (X). The row index is the number of
+// leading lanes taken from Y; the remaining lanes are filled with X starting
+// from X0:
+// X0, X1, X2, X3, X4, X5, X6, X7
+// Y0, X0, X1, X2, X3, X4, X5, X6
+// Y0, Y1, X0, X1, X2, X3, X4, X5
+// Y0, Y1, Y2, X0, X1, X2, X3, X4
+// Y0, Y1, Y2, Y3, X0, X1, X2, X3
+// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
+// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
+// The table lookup is given Y as its first register and X as its second, so
+// byte indices 0..15 select from Y and 16..31 select from X.
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
+ { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+};
+// clang-format on
+
+// Lane masks for four-lane vectors: row `n` has its first `n` lanes set to all
+// ones and the remaining lanes cleared. The table-lookup helpers AND their
+// result with row `n` so that only the first `n` lanes keep a value.
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
+ { 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+// Lane masks for eight-lane vectors: row `n` has its first `n` lanes set to
+// all ones and the remaining lanes cleared. The table-lookup helpers AND their
+// result with row `n` so that only the first `n` lanes keep a value.
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
+ { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+// Picks four 16-bit samples out of the eight held in `left_data` with a byte
+// table lookup. `indices` are sample indices relative to the `left` array and
+// `base` is the index that lane 0 of `left_data` corresponds to. Lanes at
+// position `n` and above are cleared in the result.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
+    const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
+  // Rebase so that index 0 refers to the first lane of `left_data`.
+  const uint8x8_t rebased =
+      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+  // The lookup works on bytes: copy the even-numbered byte of every 16-bit
+  // index into both byte slots of its lane ...
+  const uint8x8_t per_byte = vtrn1_u8(rebased, rebased);
+  // ... double it to turn a sample index into a byte offset, and add 1 to the
+  // odd byte slot so each lane addresses the two bytes of one sample.
+  const uint8x8_t byte_pair = vreinterpret_u8_u16(vdup_n_u16(0x0100));
+  const uint8x8_t tbl_idx = vadd_u8(vshl_n_u8(per_byte, 1), byte_pair);
+  const uint8x8_t picked =
+      vqtbl1_u8(vreinterpretq_u8_u16(left_data), tbl_idx);
+  const uint16x4_t keep = vld1_u16(z2_y_iter_masks_u16x4[n]);
+  return vand_u16(vreinterpret_u16_u8(picked), keep);
+}
+
+// Picks four 16-bit samples out of the sixteen held in the two registers of
+// `left_data` with a byte table lookup. `indices` are sample indices relative
+// to the `left` array and `base` is the index that lane 0 of `left_data`
+// corresponds to. Lanes at position `n` and above are cleared in the result.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+    const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
+  // Rebase so that index 0 refers to the first lane of `left_data`.
+  const uint8x8_t rebased =
+      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+  // The lookup works on bytes: copy the even-numbered byte of every 16-bit
+  // index into both byte slots of its lane ...
+  const uint8x8_t per_byte = vtrn1_u8(rebased, rebased);
+  // ... double it to turn a sample index into a byte offset, and add 1 to the
+  // odd byte slot so each lane addresses the two bytes of one sample.
+  const uint8x8_t byte_pair = vreinterpret_u8_u16(vdup_n_u16(0x0100));
+  const uint8x8_t tbl_idx = vadd_u8(vshl_n_u8(per_byte, 1), byte_pair);
+  uint8x16x2_t table;
+  table.val[0] = vreinterpretq_u8_u16(left_data.val[0]);
+  table.val[1] = vreinterpretq_u8_u16(left_data.val[1]);
+  const uint8x8_t picked = vqtbl2_u8(table, tbl_idx);
+  const uint16x4_t keep = vld1_u16(z2_y_iter_masks_u16x4[n]);
+  return vand_u16(vreinterpret_u16_u8(picked), keep);
+}
+
+// Picks eight 16-bit samples out of the eight held in `left_data` with a byte
+// table lookup. `indices` are sample indices relative to the `left` array and
+// `base` is the index that lane 0 of `left_data` corresponds to. Lanes at
+// position `n` and above are cleared in the result.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+    const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
+  // Rebase so that index 0 refers to the first lane of `left_data`.
+  const uint8x16_t rebased =
+      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+  // The lookup works on bytes, so the uint16x8 indices become uint8x16
+  // indices: copy the even-numbered byte of every 16-bit index into both byte
+  // slots of its lane ...
+  const uint8x16_t per_byte = vtrn1q_u8(rebased, rebased);
+  // ... double it to turn a sample index into a byte offset, and add 1 to the
+  // odd byte slot so each lane addresses the two bytes of one sample.
+  const uint8x16_t byte_pair = vreinterpretq_u8_u16(vdupq_n_u16(0x0100));
+  const uint8x16_t tbl_idx = vaddq_u8(vshlq_n_u8(per_byte, 1), byte_pair);
+  const uint8x16_t picked =
+      vqtbl1q_u8(vreinterpretq_u8_u16(left_data), tbl_idx);
+  const uint16x8_t keep = vld1q_u16(z2_y_iter_masks_u16x8[n]);
+  return vandq_u16(vreinterpretq_u16_u8(picked), keep);
+}
+
+// Picks eight 16-bit samples out of the sixteen held in the two registers of
+// `left_data` with a byte table lookup. `indices` are sample indices relative
+// to the `left` array and `base` is the index that lane 0 of `left_data`
+// corresponds to. Lanes at position `n` and above are cleared in the result.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+    const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
+  // Rebase so that index 0 refers to the first lane of `left_data`.
+  const uint8x16_t rebased =
+      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+  // The lookup works on bytes, so the uint16x8 indices become uint8x16
+  // indices: copy the even-numbered byte of every 16-bit index into both byte
+  // slots of its lane ...
+  const uint8x16_t per_byte = vtrn1q_u8(rebased, rebased);
+  // ... double it to turn a sample index into a byte offset, and add 1 to the
+  // odd byte slot so each lane addresses the two bytes of one sample.
+  const uint8x16_t byte_pair = vreinterpretq_u8_u16(vdupq_n_u16(0x0100));
+  const uint8x16_t tbl_idx = vaddq_u8(vshlq_n_u8(per_byte, 1), byte_pair);
+  uint8x16x2_t table;
+  table.val[0] = vreinterpretq_u8_u16(left_data.val[0]);
+  table.val[1] = vreinterpretq_u8_u16(left_data.val[1]);
+  const uint8x16_t picked = vqtbl2q_u8(table, tbl_idx);
+  const uint16x8_t keep = vld1q_u16(z2_y_iter_masks_u16x8[n]);
+  return vandq_u16(vreinterpretq_u16_u8(picked), keep);
+}
+#endif // AOM_ARCH_AARCH64
+
+// Gathers up to four pairs of adjacent samples from `left`, one pair per lane
+// of `indices`. Each pair is fetched with a single 32-bit load starting at
+// `left + index`; the pairs are then split so that val[0] holds the first
+// sample of every pair and val[1] the second (in memory order).
+// Only the first `n` lanes (1 <= n <= 4) are loaded; lanes at position `n` and
+// above are left as zero in both returned vectors.
+static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
+ const uint16_t *left, const int16x4_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 4);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x2_t ret0_u32 = vdup_n_u32(0);
+ uint32x2_t ret1_u32 = vdup_n_u32(0);
+
+ // Use a single vget_lane_u64 to minimize vector to general purpose register
+ // transfers and then mask off the bits we actually want.
+ const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
+ // The (int16_t) casts restore the sign of each extracted index.
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ // NOTE(review): each load goes through a `uint32_t *` formed from a
+ // `uint16_t *`, so the address is only known to be 2-byte aligned - confirm
+ // that every supported target/compiler tolerates this.
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+ if (n > 3) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+ }
+ }
+ }
+ return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
+ vreinterpret_u16_u32(ret1_u32));
+}
+
+// Gathers up to eight pairs of adjacent samples from `left`, one pair per lane
+// of `indices`. Each pair is fetched with a single 32-bit load starting at
+// `left + index`; the pairs are then split so that val[0] holds the first
+// sample of every pair and val[1] the second (in memory order).
+// Only the first `n` lanes (1 <= n <= 8) are loaded; lanes at position `n` and
+// above are left as zero in both returned vectors.
+static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
+ const uint16_t *left, const int16x8_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 8);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x4_t ret0_u32 = vdupq_n_u32(0);
+ uint32x4_t ret1_u32 = vdupq_n_u32(0);
+
+ // Use a pair of vget_lane_u64 to minimize vector to general purpose register
+ // transfers and then mask off the bits we actually want.
+ const uint64_t indices0123 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
+ const uint64_t indices4567 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
+ // The (int16_t) casts restore the sign of each extracted index.
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+ const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
+ const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
+ const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
+ const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ // NOTE(review): each load goes through a `uint32_t *` formed from a
+ // `uint16_t *`, so the address is only known to be 2-byte aligned - confirm
+ // that every supported target/compiler tolerates this.
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+ if (n > 3) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+ if (n > 4) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+ if (n > 5) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+ if (n > 6) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+ if (n > 7) {
+ ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
+ ret1_u32, 3);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
+ vreinterpretq_u16_u32(ret1_u32));
+}
+
+// Builds one four-lane row from two partial results: the first `base_shift`
+// lanes come from `out_y` (same lane positions) and the remaining lanes are
+// filled with `out_x` starting from its lane 0. `base_shift` must be in
+// [0, 4].
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
+    uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
+  assert(base_shift >= 0);
+  assert(base_shift <= 4);
+#if AOM_ARCH_AARCH64
+  // AArch64: one load of the permute vector plus one table lookup does the
+  // whole merge. `out_y` is the first table register and `out_x` the second.
+  const uint8x8_t shuffle = vld1_u8(z2_merge_shuffles_u16x4[base_shift]);
+  uint8x8x2_t table;
+  table.val[0] = vreinterpret_u8_u16(out_y);
+  table.val[1] = vreinterpret_u8_u16(out_x);
+  return vreinterpret_u16_u8(vtbl2_u8(table, shuffle));
+#else
+  // Elsewhere: start from `out_y` and overwrite the trailing lanes one by one.
+  uint16x4_t merged = out_y;
+  for (int lane = base_shift; lane < 4; ++lane) {
+    merged[lane] = out_x[lane - base_shift];
+  }
+  return merged;
+#endif
+}
+
+// Builds one eight-lane row from two partial results: the first `base_shift`
+// lanes come from `out_y` (same lane positions) and the remaining lanes are
+// filled with `out_x` starting from its lane 0. `base_shift` must be in
+// [0, 8].
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
+    uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
+  assert(base_shift >= 0);
+  assert(base_shift <= 8);
+#if AOM_ARCH_AARCH64
+  // AArch64: one load of the permute vector plus one table lookup does the
+  // whole merge. `out_y` is the first table register and `out_x` the second.
+  const uint8x16_t shuffle = vld1q_u8(z2_merge_shuffles_u16x8[base_shift]);
+  uint8x16x2_t table;
+  table.val[0] = vreinterpretq_u8_u16(out_y);
+  table.val[1] = vreinterpretq_u8_u16(out_x);
+  return vreinterpretq_u16_u8(vqtbl2q_u8(table, shuffle));
+#else
+  // Elsewhere: start from `out_y` and overwrite the trailing lanes one by one.
+  uint16x8_t merged = out_y;
+  for (int lane = base_shift; lane < 8; ++lane) {
+    merged[lane] = out_x[lane - base_shift];
+  }
+  return merged;
+#endif
+}
+
+// Per-lane blend of two sample vectors: a0 is weighted by (32 - shift) and a1
+// by shift, then the sum is rounded and narrowed with a right shift by 5.
+// Unlike the z1 helper, every lane carries its own `shift` value.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
+    uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
+  const uint16x4_t weight_a1 = vreinterpret_u16_s16(shift);
+  const uint16x4_t weight_a0 = vsub_u16(vdup_n_u16(32), weight_a1);
+  const uint32x4_t weighted_a0 = vmull_u16(a0, weight_a0);
+  const uint32x4_t blended = vmlal_u16(weighted_a0, a1, weight_a1);
+  return vrshrn_n_u32(blended, 5);
+}
+
+// Eight-lane form of highbd_dr_prediction_z2_apply_shift_x4: the low and high
+// halves are blended separately and then joined back into one vector.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
+    uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
+  const uint16x4_t blended_lo = highbd_dr_prediction_z2_apply_shift_x4(
+      vget_low_u16(a0), vget_low_u16(a1), vget_low_s16(shift));
+  const uint16x4_t blended_hi = highbd_dr_prediction_z2_apply_shift_x4(
+      vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift));
+  return vcombine_u16(blended_lo, blended_hi);
+}
+
+// Computes four horizontally adjacent zone-2 output samples for row `r`,
+// starting at column `c` (no edge upsampling). `above0` / `above1` are
+// pre-loaded vectors starting at above[-1] and above[0]; `left` is indexed as
+// `left + 1 + base_y`.
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
+    const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
+    const uint16_t *left, int dx, int dy, int r, int c) {
+  const int16x4_t lane = vld1_s16(iota1_s16);
+  const int16x4_t frac_mask = vdup_n_s16(0x3F);
+
+  // Positions of lane 0 in 1/64-sample units along `above` (x) and `left` (y).
+  const int x_first = (c << 6) - (r + 1) * dx;
+  const int y_first = (r << 6) - (c + 1) * dy;
+
+  const int16x4_t x_pos = vadd_s16(vdup_n_s16(x_first), vshl_n_s16(lane, 6));
+  const int16x4_t frac_x = vshr_n_s16(vand_s16(x_pos, frac_mask), 1);
+
+  // Number of leading lanes that have to be predicted from `left`.
+  const int left_lanes = ((((r + 1) * dx) - 1) >> 6) - c;
+
+  // Case 1 (left_lanes <= 0): every lane is predicted from `above`, loaded at
+  // the position this row/column maps to.
+  if (left_lanes <= 0) {
+    const uint16_t *src = above + (x_first >> 6);
+    return highbd_dr_prediction_z2_apply_shift_x4(vld1_u16(src),
+                                                  vld1_u16(src + 1), frac_x);
+  }
+
+  const int16x4_t y_pos = vsub_s16(vdup_n_s16(y_first), vmul_n_s16(lane, dy));
+  const int16x4_t frac_y = vshr_n_s16(vand_s16(y_pos, frac_mask), 1);
+  const int16x4_t idx_y = vshr_n_s16(y_pos, 6);
+
+  // Case 3 (left_lanes >= vector length): every lane is predicted from `left`.
+  if (left_lanes >= 4) {
+    const uint16x4x2_t pairs =
+        highbd_dr_prediction_z2_gather_left_x4(left + 1, idx_y, 4);
+    return highbd_dr_prediction_z2_apply_shift_x4(pairs.val[0], pairs.val[1],
+                                                  frac_y);
+  }
+
+  // Case 2: the first `left_lanes` lanes come from `left`; the rest come from
+  // the pre-loaded `above[-1]` / `above[0]` vectors, shifted across to the end
+  // of the row by the merge.
+  const uint16x4x2_t pairs =
+      highbd_dr_prediction_z2_gather_left_x4(left + 1, idx_y, left_lanes);
+  const uint16x4_t from_left = highbd_dr_prediction_z2_apply_shift_x4(
+      pairs.val[0], pairs.val[1], frac_y);
+  const uint16x4_t from_above =
+      highbd_dr_prediction_z2_apply_shift_x4(above0, above1, frac_x);
+  return highbd_dr_prediction_z2_merge_x4(from_above, from_left, left_lanes);
+}
+
+// Computes eight horizontally adjacent zone-2 output samples for row `r`,
+// starting at column `c` (no edge upsampling). `above0` / `above1` are
+// pre-loaded vectors starting at above[-1] and above[0]; `left` is indexed as
+// `left + 1 + base_y`.
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
+    const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
+    const uint16_t *left, int dx, int dy, int r, int c) {
+  const int16x8_t lane = vld1q_s16(iota1_s16);
+  const int16x8_t frac_mask = vdupq_n_s16(0x3F);
+
+  // Positions of lane 0 in 1/64-sample units along `above` (x) and `left` (y).
+  const int x_first = (c << 6) - (r + 1) * dx;
+  const int y_first = (r << 6) - (c + 1) * dy;
+
+  const int16x8_t x_pos =
+      vaddq_s16(vdupq_n_s16(x_first), vshlq_n_s16(lane, 6));
+  const int16x8_t frac_x = vshrq_n_s16(vandq_s16(x_pos, frac_mask), 1);
+
+  // Number of leading lanes that have to be predicted from `left`.
+  const int left_lanes = ((((r + 1) * dx) - 1) >> 6) - c;
+
+  // Case 1 (left_lanes <= 0): every lane is predicted from `above`, loaded at
+  // the position this row/column maps to.
+  if (left_lanes <= 0) {
+    const uint16_t *src = above + (x_first >> 6);
+    return highbd_dr_prediction_z2_apply_shift_x8(vld1q_u16(src),
+                                                  vld1q_u16(src + 1), frac_x);
+  }
+
+  const int16x8_t y_pos =
+      vsubq_s16(vdupq_n_s16(y_first), vmulq_n_s16(lane, dy));
+  const int16x8_t frac_y = vshrq_n_s16(vandq_s16(y_pos, frac_mask), 1);
+  const int16x8_t idx_y = vshrq_n_s16(y_pos, 6);
+
+  // Case 3 (left_lanes >= vector length): every lane is predicted from `left`.
+  if (left_lanes >= 8) {
+    const uint16x8x2_t pairs =
+        highbd_dr_prediction_z2_gather_left_x8(left + 1, idx_y, 8);
+    return highbd_dr_prediction_z2_apply_shift_x8(pairs.val[0], pairs.val[1],
+                                                  frac_y);
+  }
+
+  // Case 2: the first `left_lanes` lanes come from `left`; the rest come from
+  // the pre-loaded `above[-1]` / `above[0]` vectors, shifted across to the end
+  // of the row by the merge.
+  const uint16x8x2_t pairs =
+      highbd_dr_prediction_z2_gather_left_x8(left + 1, idx_y, left_lanes);
+  const uint16x8_t from_left = highbd_dr_prediction_z2_apply_shift_x8(
+      pairs.val[0], pairs.val[1], frac_y);
+  const uint16x8_t from_above =
+      highbd_dr_prediction_z2_apply_shift_x8(above0, above1, frac_x);
+  return highbd_dr_prediction_z2_merge_x8(from_above, from_left, left_lanes);
+}
+
+// Generates `highbd_dr_prediction_z2_<bw>x<bh>_neon`, a zone-2 directional
+// predictor for one fixed block size. The generated function asserts that
+// neither edge is upsampled; `bd` is unused.
+//
+// The function first copies `left[-1 .. bh - 1]` into a local array so that
+// index 0 of the copy corresponds to `left[-1]`, and pre-loads the vectors
+// starting at `above[-1]` and `above[0]` (only four lanes are loaded when
+// `bw == 4`, the upper half is zero). Each row is then produced by the x4 step
+// helper (`bw == 4`) or by the x8 step helper in chunks of eight columns.
+//
+// Left array is accessed from -1 through `bh - 1` inclusive.
+// Above array is accessed from -1 through `bw - 1` inclusive.
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \
+ static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int upsample_above, int upsample_left, int dx, \
+ int dy, int bd) { \
+ (void)bd; \
+ (void)upsample_above; \
+ (void)upsample_left; \
+ assert(!upsample_above); \
+ assert(!upsample_left); \
+ assert(bw % 4 == 0); \
+ assert(bh % 4 == 0); \
+ assert(dx > 0); \
+ assert(dy > 0); \
+ \
+ uint16_t left_data[bh + 1]; \
+ memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \
+ \
+ uint16x8_t a0, a1; \
+ if (bw == 4) { \
+ a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \
+ a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \
+ } else { \
+ a0 = vld1q_u16(above - 1); \
+ a1 = vld1q_u16(above + 0); \
+ } \
+ \
+ int r = 0; \
+ do { \
+ if (bw == 4) { \
+ vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \
+ above, vget_low_u16(a0), vget_low_u16(a1), \
+ left_data, dx, dy, r, 0)); \
+ } else { \
+ int c = 0; \
+ do { \
+ vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \
+ above, a0, a1, left_data, dx, dy, r, c)); \
+ c += 8; \
+ } while (c < bw); \
+ } \
+ dst += stride; \
+ } while (++r < bh); \
+ }
+
+// Instantiate the generic zone-2 predictor for each block size listed here.
+// The 4x4, 4x8, 8x4 and 8x8 sizes are not generated by the macro; they have
+// hand-written functions that also handle the upsample flags.
+HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
+
+#undef HIGHBD_DR_PREDICTOR_Z2_WXH
+
+// Function-pointer type matching the signature shared by the size-specific
+// zone-2 predictors in this file (block dimensions are implied by the
+// function, not passed as arguments).
+// NOTE(review): presumably used to select a predictor by block size; the
+// selection code is not visible here - confirm.
+typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd);
+
+// Zone-2 directional predictor for 4x4 blocks. Unlike the macro-generated
+// sizes it accepts `upsample_above` / `upsample_left`. `bd` is unused.
+// Each of the four rows is produced in one of three ways, chosen by
+// `base_shift`: entirely from `above`, entirely from `left`, or as a merge of
+// the two.
+static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ // An enabled upsample flag removes one bit from the shift that converts a
+ // position into a sample index.
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ // The two `upsample_above` terms cancel, so this always evaluates to -64.
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+ // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+ // Keep the needed part of `left` in registers so rows can fetch their
+ // samples with table lookups. `left_data1` starts one sample after
+ // `left_data0`, so the same lookup index yields a sample and its successor.
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
+ // Number of leading lanes of this row that are predicted from `left`.
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ // Whole row comes from `above`.
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ // De-interleaving load: val[0] / val[1] hold alternating samples.
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Mixed row: the first `base_shift` lanes come from `left`, the rest from
+ // `above`.
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ const int16x4_t shift_x0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
+ 1);
+ uint16x4_t a0, a1;
+ if (upsample_above) {
+ // The load start (-2 or -1) is chosen by the parity of `base_x0`.
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ // Whole row comes from `left`.
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+// Zone-2 directional predictor for blocks 4 wide and 8 high. Unlike the
+// macro-generated sizes it accepts `upsample_above` / `upsample_left`. `bd` is
+// unused. Each of the eight rows is produced in one of three ways, chosen by
+// `base_shift`: entirely from `above`, entirely from `left`, or as a merge of
+// the two.
+static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ // An enabled upsample flag removes one bit from the shift that converts a
+ // position into a sample index.
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ // The two `upsample_above` terms cancel, so this always evaluates to -64.
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ // Keep the needed part of `left` in registers so rows can fetch their
+ // samples with table lookups. `left_data1` starts one sample after
+ // `left_data0`, so the same lookup index yields a sample and its successor.
+ uint16x8x2_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {
+ // Number of leading lanes of this row that are predicted from `left`.
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ // Whole row comes from `above`.
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ // De-interleaving load: val[0] / val[1] hold alternating samples.
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Mixed row: the first `base_shift` lanes come from `left`, the rest from
+ // `above`.
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data0, base_y0123, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data1, base_y0123, left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ // The load start (-2 or -1) is chosen by the parity of `base_x0`.
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ // Whole row comes from `left`.
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+// Zone-2 directional predictor for blocks 8 wide and 4 high. Unlike the
+// macro-generated sizes it accepts `upsample_above` / `upsample_left`. `bd` is
+// unused. Each of the four rows is produced in one of three ways, chosen by
+// `base_shift`: entirely from `above`, entirely from `left`, or as a merge of
+// the two.
+static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ // An enabled upsample flag removes one bit from the shift that converts a
+ // position into a sample index.
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ // The two `upsample_above` terms cancel, so this always evaluates to -64.
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+ // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+ // Keep the needed part of `left` in registers so rows can fetch their
+ // samples with table lookups. `left_data1` starts one sample after
+ // `left_data0`, so the same lookup index yields a sample and its successor.
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
+ // Number of leading lanes of this row that are predicted from `left`.
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ // Whole row comes from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ // De-interleaving load: val[0] / val[1] hold alternating samples.
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {
+ // Mixed row: the first `base_shift` lanes come from `left`, the rest from
+ // `above`.
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ // The load start (-2 or -1) is chosen by the parity of `base_x0`.
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {
+ // Whole row comes from `left`.
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ // A negative count makes vshl shift right, i.e. y >> frac_bits_y.
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ // Index of `left` held in lane 0 of `left_data0`.
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {  // Zone-2 predictor specialised for 8x8 blocks.
+ (void)bd;  // Unused here; the parameter is only silenced.
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));  // Always -64: the two terms sum to 6.
+
+ // If `upsample_left` then we need -2 through 14 inclusive from `left`,
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8x2_t left_data0, left_data1;  // `left` samples loaded once, outside the row loop.
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };  // Upper half is zero padding.
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {  // One 8-pixel output row per iteration.
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));  // (c << 6) - x0 for columns c = 0..7.
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {  // This row reads only from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);  // De-interleaving load: even samples, odd samples.
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {  // This row reads from both `left` and `above`.
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));  // (r << 6) - (c + 1) * dy.
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));  // Negative count: arithmetic right shift.
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;  // Matches the offset left_data0 was loaded from.
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));  // Start offset follows the parity of base_x0.
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors. NOTE(review): presumably base_shift selects how many lanes come from out_y; confirm in the merge helper.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {  // This row reads only from `left`.
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;  // Advance to the next output row.
+ }
+}
+
+// Dispatch table of zone-2 predictors, indexed as
+// [get_msb(block width)][get_msb(block height)]. Entries for unsupported
+// block sizes are NULL. The table is only ever read, so it is declared const.
+static const highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
+    &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
+    NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
+    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
+    &highbd_dr_prediction_z2_8x32_neon, NULL },
+  { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
+    &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
+    &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
+  { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
+    &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
+    &highbd_dr_prediction_z2_32x64_neon },
+  { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
+    &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
+};
+
+// Directional prediction, zone 2: 90 < angle < 180
+//
+// Looks up the size-specific predictor by get_msb() of the block width and
+// height and forwards every remaining argument to it unchanged.
+void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy,
+                                      int bd) {
+  const int width_index = get_msb(bw);
+  const int height_index = get_msb(bh);
+  highbd_dr_prediction_z2_ptr f =
+      dr_predictor_z2_arr_neon[width_index][height_index];
+  // A NULL entry means this block size has no zone-2 implementation.
+  assert(f != NULL);
+  f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
+}
+
+// -----------------------------------------------------------------------------
+// Z3
+
+// Both the lane to use and the shift amount must be immediates.
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { /* Reads max_base_y and left_max from the scope of the expansion. */ \
+ uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \
+ val = vmlal_lane_u16(val, (in1), (s1), (lane)); /* in0*s0[lane] + in1*s1[lane] */ \
+ const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \
+ const uint16x4_t res = vrshrn_n_u32(val, (shift)); /* rounding narrow */ \
+ *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, /* lanes at or past max_base_y get left_max */ \
+ vdup_n_u16(left_max)); \
+ } while (0)
+
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { /* 8-lane variant; reads max_base_y and left_max from the expansion scope. */ \
+ uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \
+ val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
+ uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
+ val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
+ const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
+ const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), /* rounding narrow of both halves */ \
+ vrshrn_n_u32(val_hi, (shift))); \
+ *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, /* lanes at or past max_base_y get left_max */ \
+ vdupq_n_u16(left_max)); \
+ } while (0)
+
+// Zone-3 directional prediction for the case where `left` is not upsampled.
+//
+// The block is produced four columns at a time: each column is interpolated
+// from `left` into a row vector, the vectors are transposed, and the result is
+// stored to `dst`.
+//
+//   dst, stride  Output block and its row pitch in pixels.
+//   bw, bh       Block size. bw must be a multiple of 4; bh must be 4 or a
+//                multiple of 8.
+//   left         Left edge samples. left[bw + bh - 1] is read as the fill
+//                value; vector loads start below that index but may extend
+//                past it (assumes the caller's buffer is padded - TODO
+//                confirm).
+//   dy           Positive step along `left` per column, in 1/64 sample units.
+static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
+                                                   ptrdiff_t stride, int bw,
+                                                   int bh, const uint16_t *left,
+                                                   int dy) {
+  assert(bw % 4 == 0);
+  assert(bh % 4 == 0);
+  assert(dy > 0);
+
+  // Factor out left + 1 to give the compiler a better chance of recognising
+  // that the offsets used for the loads from left and left + 1 are otherwise
+  // identical.
+  const uint16_t *left1 = left + 1;
+
+  const int max_base_y = (bw + bh - 1);
+  const int left_max = left[max_base_y];
+  const int frac_bits = 6;
+
+  const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
+  const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
+
+  // The C implementation of the z3 predictor when not upsampling uses:
+  // ((y & 0x3f) >> 1)
+  // The right shift is unnecessary here since we instead shift by +1 later,
+  // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
+  const uint16x4_t shift_mask = vdup_n_u16(0x3e);
+
+  if (bh == 4) {
+    int y = dy;
+    int c = 0;
+    do {
+      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+      // multiply instructions.
+      const uint16x4_t shifts1 =
+          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+      const int base0 = (y + 0 * dy) >> frac_bits;
+      const int base1 = (y + 1 * dy) >> frac_bits;
+      const int base2 = (y + 2 * dy) >> frac_bits;
+      const int base3 = (y + 3 * dy) >> frac_bits;
+      uint16x4_t out[4];
+      // A column whose base is already at or past max_base_y is filled with
+      // left_max without loading from `left`.
+      if (base0 >= max_base_y) {
+        out[0] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l00 = vld1_u16(left + base0);
+        const uint16x4_t l01 = vld1_u16(left1 + base0);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
+                                       shifts0, shifts1, 0, 6);
+      }
+      if (base1 >= max_base_y) {
+        out[1] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l10 = vld1_u16(left + base1);
+        const uint16x4_t l11 = vld1_u16(left1 + base1);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
+                                       shifts0, shifts1, 1, 6);
+      }
+      if (base2 >= max_base_y) {
+        out[2] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l20 = vld1_u16(left + base2);
+        const uint16x4_t l21 = vld1_u16(left1 + base2);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
+                                       shifts0, shifts1, 2, 6);
+      }
+      if (base3 >= max_base_y) {
+        out[3] = vdup_n_u16(left_max);
+      } else {
+        const uint16x4_t l30 = vld1_u16(left + base3);
+        const uint16x4_t l31 = vld1_u16(left1 + base3);
+        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
+                                       shifts0, shifts1, 3, 6);
+      }
+      transpose_array_inplace_u16_4x4(out);
+      for (int r2 = 0; r2 < 4; ++r2) {
+        vst1_u16(dst + r2 * stride + c, out[r2]);
+      }
+      y += 4 * dy;
+      c += 4;
+    } while (c < bw);
+  } else {
+    // The row loop below advances eight rows per pass and stores all eight.
+    assert(bh % 8 == 0);
+
+    int y = dy;
+    int c = 0;
+    do {
+      int r = 0;
+      do {
+        // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
+        // multiply instructions.
+        const uint16x4_t shifts1 =
+            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+        const int base0 = ((y + 0 * dy) >> frac_bits) + r;
+        const int base1 = ((y + 1 * dy) >> frac_bits) + r;
+        const int base2 = ((y + 2 * dy) >> frac_bits) + r;
+        const int base3 = ((y + 3 * dy) >> frac_bits) + r;
+        uint16x8_t out[4];
+        if (base0 >= max_base_y) {
+          out[0] = vdupq_n_u16(left_max);
+        } else {
+          const uint16x8_t l00 = vld1q_u16(left + base0);
+          const uint16x8_t l01 = vld1q_u16(left1 + base0);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
+                                         shifts0, shifts1, 0, 6);
+        }
+        if (base1 >= max_base_y) {
+          out[1] = vdupq_n_u16(left_max);
+        } else {
+          const uint16x8_t l10 = vld1q_u16(left + base1);
+          const uint16x8_t l11 = vld1q_u16(left1 + base1);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
+                                         shifts0, shifts1, 1, 6);
+        }
+        if (base2 >= max_base_y) {
+          out[2] = vdupq_n_u16(left_max);
+        } else {
+          const uint16x8_t l20 = vld1q_u16(left + base2);
+          const uint16x8_t l21 = vld1q_u16(left1 + base2);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
+                                         shifts0, shifts1, 2, 6);
+        }
+        if (base3 >= max_base_y) {
+          out[3] = vdupq_n_u16(left_max);
+        } else {
+          const uint16x8_t l30 = vld1q_u16(left + base3);
+          const uint16x8_t l31 = vld1q_u16(left1 + base3);
+          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
+                                         shifts0, shifts1, 3, 6);
+        }
+        transpose_array_inplace_u16_4x8(out);
+        // The low halves of the four vectors are stored to rows r..r+3 and
+        // the high halves to rows r+4..r+7.
+        for (int r2 = 0; r2 < 4; ++r2) {
+          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
+        }
+        for (int r2 = 0; r2 < 4; ++r2) {
+          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
+        }
+        r += 8;
+      } while (r < bh);
+      y += 4 * dy;
+      c += 4;
+    } while (c < bw);
+  }
+}
+
+static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh, const uint16_t *left,
+ int dy) {  // Zone-3 predictor used when upsample_left is set.
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dy > 0);
+
+ const int max_base_y = (bw + bh - 1) << 1;  // Doubled relative to the non-upsampled predictor.
+ const int left_max = left[max_base_y];  // Fill value for lanes at or past max_base_y.
+ const int frac_bits = 5;
+
+ const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
+ const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
+ const uint16x4_t iota2x4 = vget_low_u16(iota2x8);
+
+ // The C implementation of the z3 predictor when upsampling uses:
+ // (((x << 1) & 0x3f) >> 1)
+ // The two shifts are unnecessary here since the lowest bit is guaranteed to
+ // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
+ // needing the shifts at all.
+ const uint16x4_t shift_mask = vdup_n_u16(0x1F);
+
+ if (bh == 4) {
+ int y = dy;
+ int c = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);  // Weights sum to 32, matching the shift of 5 below.
+ const int base0 = (y + 0 * dy) >> frac_bits;
+ const int base1 = (y + 1 * dy) >> frac_bits;
+ const int base2 = (y + 2 * dy) >> frac_bits;
+ const int base3 = (y + 3 * dy) >> frac_bits;
+ const uint16x4x2_t l0 = vld2_u16(left + base0);  // De-interleaving load: val[0] = even offsets, val[1] = odd offsets.
+ const uint16x4x2_t l1 = vld2_u16(left + base1);
+ const uint16x4x2_t l2 = vld2_u16(left + base2);
+ const uint16x4x2_t l3 = vld2_u16(left + base3);
+ uint16x4_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x4(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + r2 * stride + c, out[r2]);
+ }
+ y += 4 * dy;  // Advance four columns.
+ c += 4;
+ } while (c < bw);
+ } else {
+ assert(bh % 8 == 0);  // The row loop below advances eight rows per pass.
+
+ int y = dy;
+ int c = 0;
+ do {
+ int r = 0;
+ do {
+ // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
+ const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);  // Row offset r is doubled in the index.
+ const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
+ const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
+ const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
+ const uint16x8x2_t l0 = vld2q_u16(left + base0);
+ const uint16x8x2_t l1 = vld2q_u16(left + base1);
+ const uint16x8x2_t l2 = vld2q_u16(left + base2);
+ const uint16x8x2_t l3 = vld2q_u16(left + base3);
+ uint16x8_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x8(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));  // Rows r..r+3.
+ }
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));  // Rows r+4..r+7.
+ }
+ r += 8;
+ } while (r < bh);
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+//
+// Only `left` and `dy` drive the prediction; `above`, `dx` and `bd` are
+// accepted but not used. Dispatches on `upsample_left` to the matching
+// implementation.
+void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_left,
+                                      int dx, int dy, int bd) {
+  (void)above;
+  (void)dx;
+  (void)bd;
+  assert(bw % 4 == 0);
+  assert(bh % 4 == 0);
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (!upsample_left) {
+    highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
+    return;
+  }
+  highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
+}
+
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8
diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..77727b7665
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// Clamps every lane of |val| from below by |low|, then from above by |high|.
+static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
+                                  const int16x4_t high) {
+  const int16x4_t floored = vmax_s16(val, low);
+  return vmin_s16(floored, high);
+}
+
+// Clamps each signed lane of |val| to [0, (1 << bitdepth) - 1] and returns the
+// result as unsigned pixel values.
+static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val,
+                                                       int bitdepth) {
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bitdepth) - 1);
+  // Raise negative lanes to zero while still signed, then view as unsigned.
+  const int16x8_t non_negative = vmaxq_s16(val, vdupq_n_s16(0));
+  return vminq_u16(vreinterpretq_u16_s16(non_negative), pixel_max);
+}
+
+// High edge variance test:
+//   (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+// The two halves of |abd_p0p1_q0q1| are compared against |thresh| and OR'd
+// together into one four-lane mask.
+static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1,
+                             const uint16_t thresh) {
+  const uint16x8_t limit = vdupq_n_u16(thresh);
+  const uint16x8_t exceeds = vcgtq_u16(abd_p0p1_q0q1, limit);
+  const uint16x4_t low_half = vget_low_u16(exceeds);
+  const uint16x4_t high_half = vget_high_u16(exceeds);
+  return vorr_u16(low_half, high_half);
+}
+
+// Outer edge test, evaluated per lane:
+//   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+static INLINE uint16x4_t outer_threshold(const uint16x4_t p1,
+                                         const uint16x4_t p0,
+                                         const uint16x4_t q0,
+                                         const uint16x4_t q1,
+                                         const uint16_t outer_thresh) {
+  // abs(p0 - q0) * 2
+  const uint16x4_t doubled_p0q0 = vshl_n_u16(vabd_u16(p0, q0), 1);
+  // abs(p1 - q1) / 2
+  const uint16x4_t halved_p1q1 = vshr_n_u16(vabd_u16(p1, q1), 1);
+  const uint16x4_t limit = vdup_n_u16(outer_thresh);
+  return vcle_u16(vadd_u16(doubled_p0q0, halved_p1q1), limit);
+}
+
+// Per-lane decision on whether the 4-tap filter applies:
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   outer_threshold()
+// |outer_mask| is the precomputed outer_threshold() result.
+static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  const uint16x8_t within =
+      vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  // Both halves of the vector must pass the inner threshold.
+  const uint16x4_t both_halves =
+      vand_u16(vget_low_u16(within), vget_high_u16(within));
+  return vand_u16(both_halves, outer_mask);
+}
+
+// Per-lane decision on whether the 6-tap filter applies:
+//   abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+//   abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+//   outer_threshold()
+// |outer_mask| is the precomputed outer_threshold() result.
+static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16x8_t abd_p1p2_q1q2,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  // Testing the largest difference against the threshold covers both inputs.
+  const uint16x8_t largest = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t within = vcleq_u16(largest, vdupq_n_u16(inner_thresh));
+  const uint16x4_t both_halves =
+      vand_u16(vget_low_u16(within), vget_high_u16(within));
+  return vand_u16(both_halves, outer_mask);
+}
+
+// Per-lane decision on whether the 8-tap filter applies:
+//   abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+//   abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+//   abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+//   outer_threshold()
+// |outer_mask| is the precomputed outer_threshold() result.
+static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1,
+                                       const uint16x8_t abd_p1p2_q1q2,
+                                       const uint16x8_t abd_p2p3_q2q3,
+                                       const uint16_t inner_thresh,
+                                       const uint16x4_t outer_mask) {
+  // Testing the largest difference against the threshold covers all inputs.
+  const uint16x8_t largest =
+      vmaxq_u16(vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2), abd_p2p3_q2q3);
+  const uint16x8_t within = vcleq_u16(largest, vdupq_n_u16(inner_thresh));
+  const uint16x4_t both_halves =
+      vand_u16(vget_low_u16(within), vget_high_u16(within));
+  return vand_u16(both_halves, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN_masks functions.
+
+// Computes the two masks that drive the 4-tap filter.
+//   *needs_filter4_mask: lanes where needs_filter4() holds.
+//   *hev_mask: lanes where hev() and needs_filter4() both hold, i.e. the lanes
+//              that take the filter2() variant.
+static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                                 const uint16_t hev_thresh,
+                                 const uint16x4_t outer_mask,
+                                 const uint16_t inner_thresh,
+                                 uint16x4_t *const hev_mask,
+                                 uint16x4_t *const needs_filter4_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // The raw hev() result also covers lanes that needs_filter4() rejects, so it
+  // is narrowed by the filter mask before being returned.
+  const uint16x4_t raw_hev = hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t filter_mask =
+      needs_filter4(abd_p0p1_q0q1, inner_thresh, outer_mask);
+
+  *needs_filter4_mask = filter_mask;
+  *hev_mask = vand_u16(raw_hev, filter_mask);
+}
+
+// Flatness test over three samples on each side of the edge:
+//   abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+//   abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| is 1 << (bitdepth - 8), i.e. 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1,
+                                  const uint16x8_t abd_p0p2_q0q2,
+                                  const int bitdepth) {
+  const uint16x8_t flat_limit = vdupq_n_u16(1 << (bitdepth - 8));
+  const uint16x8_t largest = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t within = vcleq_u16(largest, flat_limit);
+  return vand_u16(vget_low_u16(within), vget_high_u16(within));
+}
+
+// Computes the three masks that drive the 6-tap filter: the hev() result, the
+// is_flat3() result and the needs_filter6() result. Unlike filter4_masks(),
+// the hev and flat masks are stored without being AND'd with the filter mask.
+static INLINE void filter6_masks(
+    const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0,
+    const uint16_t hev_thresh, const uint16x4_t outer_mask,
+    const uint16_t inner_thresh, const int bitdepth,
+    uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask,
+    uint16x4_t *const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  const uint16x8_t abd_p0p2_q0q2 = vabdq_u16(p0q0, p2q2);
+  const uint16x8_t abd_p1p2_q1q2 = vabdq_u16(p1q1, p2q2);
+
+  *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = is_flat3(abd_p0p1_q0q1, abd_p0p2_q0q2, bitdepth);
+  *needs_filter6_mask =
+      needs_filter6(abd_p0p1_q0q1, abd_p1p2_q1q2, inner_thresh, outer_mask);
+}
+
+// Flatness test over three consecutive samples on each side of the edge.
+// is_flat4 uses N=1, IsFlatOuter4 uses N=4.
+//   abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+//   abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+//   abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| is 1 << (bitdepth - 8), i.e. 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0,
+                                  const uint16x8_t abd_pn1p0_qn1q0,
+                                  const uint16x8_t abd_pn2p0_qn2q0,
+                                  const int bitdepth) {
+  const uint16x8_t flat_limit = vdupq_n_u16(1 << (bitdepth - 8));
+  const uint16x8_t largest =
+      vmaxq_u16(vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0), abd_pn2p0_qn2q0);
+  const uint16x8_t within = vcleq_u16(largest, flat_limit);
+  return vand_u16(vget_low_u16(within), vget_high_u16(within));
+}
+
+// Computes the three masks that drive the 8-tap filter: the hev() result, the
+// needs_filter8() result, and the is_flat4() result restricted to lanes where
+// needs_filter8() holds.
+static INLINE void filter8_masks(
+    const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+    const uint16x8_t p0q0, const uint16_t hev_thresh,
+    const uint16x4_t outer_mask, const uint16_t inner_thresh,
+    const int bitdepth, uint16x4_t *const needs_filter8_mask,
+    uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  const uint16x8_t abd_p0p2_q0q2 = vabdq_u16(p0q0, p2q2);
+  const uint16x8_t abd_p0p3_q0q3 = vabdq_u16(p0q0, p3q3);
+  const uint16x8_t abd_p1p2_q1q2 = vabdq_u16(p1q1, p2q2);
+  const uint16x8_t abd_p2p3_q2q3 = vabdq_u16(p2q2, p3q3);
+
+  *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+
+  const uint16x4_t flat =
+      is_flat4(abd_p0p1_q0q1, abd_p0p2_q0q2, abd_p0p3_q0q3, bitdepth);
+  const uint16x4_t needs_filter = needs_filter8(
+      abd_p0p1_q0q1, abd_p1p2_q1q2, abd_p2p3_q2q3, inner_thresh, outer_mask);
+  *needs_filter8_mask = needs_filter;
+
+  // |is_flat4_mask| picks between the filter8 result and the source value.
+  // is_flat4() can occasionally hold where needs_filter8() does not, and
+  // filter4 never touches p2q2, so the flat mask is restricted to the lanes
+  // that actually need filtering.
+  *is_flat4_mask = vand_u16(flat, needs_filter);
+}
+
+// -----------------------------------------------------------------------------
+// filterN functions.
+
+// Calculate filter4() or filter2() based on |hev_mask|.
+static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ int bitdepth, uint16x8_t *const p1q1_result,  // out: filtered p1 (low half) and q1 (high half)
+ uint16x8_t *const p0q0_result) {  // out: filtered p0 (low half) and q0 (high half)
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);  // High half of p0q0 followed by low half of p1q1.
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1);
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);  // Zero in lanes where hev_mask is clear.
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+ // TODO(review): revisit this section; it may carry unnecessary tricks that
+ // exist to accommodate 8x8 as the smallest 8bpp vector.
+
+ // We can not shift with rounding because the clamp comes *before* the shift:
+ // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+ // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four =
+ clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));  // +a3 for the low half, -a3 for the high half.
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+ // The low half (p0) is adjusted by +a2 and the high half (q0) by -a1.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+ *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth);  // Clamp back to the valid pixel range.
+ *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth);
+}
+
+// Applies the 4-tap loop filter across a horizontal edge at |s|. Four pixels
+// are read from each of the rows p1, p0 (above the edge) and q0, q1 (at and
+// below it), filtered where the masks allow, and written back in place.
+// |blimit|, |limit| and |thresh| point at 8-bit thresholds that are scaled up
+// to bitdepth |bd|.
+void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  uint16_t *const row_p1 = s - 2 * pitch;
+  uint16_t *const row_p0 = s - pitch;
+  uint16_t *const row_q0 = s;
+  uint16_t *const row_q1 = s + pitch;
+
+  const uint16x4_t p1 = vld1_u16(row_p1);
+  const uint16x4_t p0 = vld1_u16(row_p0);
+  const uint16x4_t q0 = vld1_u16(row_q0);
+  const uint16x4_t q1 = vld1_u16(row_q1);
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+
+  const uint16x4_t outer_mask = outer_threshold(p1, p0, q0, q1, outer_thresh);
+  const uint16x8_t p0q0 = vcombine_u16(p0, q0);
+  const uint16x8_t p1q1 = vcombine_u16(p1, q1);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+                &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+  // Early exit when no lane passes the filter mask.
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    return;
+  }
+#endif  // AOM_ARCH_AARCH64
+
+  // Duplicate each four-lane mask so it covers both halves of the packed
+  // p/q vectors.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  const uint16x8_t p0q1 = vcombine_u16(p0, q1);
+  uint16x8_t filtered_p1q1;
+  uint16x8_t filtered_p0q0;
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &filtered_p1q1, &filtered_p0q0);
+
+  // filter4() already took the hev mask into account for p0/q0.
+  const uint16x8_t p0q0_output =
+      vbslq_u16(needs_filter4_mask_8, filtered_p0q0, p0q0);
+
+  // p1/q1 keep their source values in lanes where hev() holds. The XOR works
+  // because |hev_mask| was AND'd with |needs_filter4_mask| earlier.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, filtered_p1q1, p1q1);
+
+  vst1_u16(row_p1, vget_low_u16(p1q1_output));
+  vst1_u16(row_p0, vget_low_u16(p0q0_output));
+  vst1_u16(row_q0, vget_high_u16(p0q0_output));
+  vst1_u16(row_q1, vget_high_u16(p1q1_output));
+}
+
+// Filters two adjacent 4-pixel segments of a horizontal edge, each with its
+// own set of thresholds, by calling the single-segment filter twice.
+void aom_highbd_lpf_horizontal_4_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  // The second segment starts four pixels to the right of the first.
+  uint16_t *const second_segment = s + 4;
+
+  aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_4_neon(second_segment, pitch, blimit1, limit1,
+                                   thresh1, bd);
+}
+
+// Applies the 4-tap loop filter across a vertical edge at |s|. Four rows of
+// four pixels (two on each side of the edge) are loaded and transposed so that
+// each vector holds one of p1, p0, q0, q1 for all four rows; the result is
+// filtered, transposed back and stored in place. |blimit|, |limit| and
+// |thresh| point at 8-bit thresholds that are scaled up to bitdepth |bd|.
+void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  // Step back two pixels so each row load starts at that row's p1.
+  uint16_t *const row0 = s - 2;
+  uint16_t *const row1 = row0 + pitch;
+  uint16_t *const row2 = row0 + pitch * 2;
+  uint16_t *const row3 = row0 + pitch * 3;
+
+  uint16x4_t px[4] = { vld1_u16(row0), vld1_u16(row1), vld1_u16(row2),
+                       vld1_u16(row3) };
+  // After the transpose px[0..3] are used as p1, p0, q0 and q1.
+  transpose_array_inplace_u16_4x4(px);
+
+  // Adjust thresholds to bitdepth.
+  const int outer_thresh = *blimit << (bd - 8);
+  const int inner_thresh = *limit << (bd - 8);
+  const int hev_thresh = *thresh << (bd - 8);
+
+  const uint16x4_t outer_mask =
+      outer_threshold(px[0], px[1], px[2], px[3], outer_thresh);
+  const uint16x8_t p0q0 = vcombine_u16(px[1], px[2]);
+  const uint16x8_t p1q1 = vcombine_u16(px[0], px[3]);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+                &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+  // Early exit when no lane passes the filter mask.
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    return;
+  }
+#endif  // AOM_ARCH_AARCH64
+
+  // Duplicate each four-lane mask so it covers both halves of the packed
+  // p/q vectors.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  const uint16x8_t p0q1 = vcombine_u16(px[1], px[3]);
+  uint16x8_t filtered_p1q1;
+  uint16x8_t filtered_p0q0;
+  filter4(p0q0, p0q1, p1q1, hev_mask, bd, &filtered_p1q1, &filtered_p0q0);
+
+  // filter4() already took the hev mask into account for p0/q0.
+  const uint16x8_t p0q0_output =
+      vbslq_u16(needs_filter4_mask_8, filtered_p0q0, p0q0);
+
+  // p1/q1 keep their source values in lanes where hev() holds. The XOR works
+  // because |hev_mask| was AND'd with |needs_filter4_mask| earlier.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, filtered_p1q1, p1q1);
+
+  // Reassemble in p1, p0, q0, q1 order and transpose back to row layout.
+  uint16x4_t output[4] = {
+    vget_low_u16(p1q1_output),
+    vget_low_u16(p0q0_output),
+    vget_high_u16(p0q0_output),
+    vget_high_u16(p1q1_output),
+  };
+  transpose_array_inplace_u16_4x4(output);
+
+  vst1_u16(row0, output[0]);
+  vst1_u16(row1, output[1]);
+  vst1_u16(row2, output[2]);
+  vst1_u16(row3, output[3]);
+}
+
+ // Runs aom_highbd_lpf_vertical_4_neon() twice down the same edge: once at |s|
+ // with the first threshold set (blimit0/limit0/thresh0) and once four rows
+ // further down (|s| + 4 * |pitch|) with the second set
+ // (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_vertical_4_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const lower_rows = s + 4 * pitch;
+
+   aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_vertical_4_neon(lower_rows, pitch, blimit1, limit1, thresh1,
+                                  bd);
+ }
+
+ // Computes the 6-tap filter results for the p1/q1 and p0/q0 sample pairs.
+ //
+ // Every input packs a p-side vector in its low half and the matching q-side
+ // vector in its high half (the callers build them with vcombine_u16()), so one
+ // instruction stream filters both sides of the edge. A single running |sum| is
+ // built for p1/q1 and then adjusted in place to obtain p0/q0, which is why the
+ // statement order below matters. Each output is |sum| rounded and shifted
+ // right by 3.
+ static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
+ // vextq_u16(x, x, 4) swaps the two halves, giving each side access to the
+ // sample from the opposite side of the edge.
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+ }
+
+ // Applies the high bitdepth 6-tap loop filter across a horizontal edge.
+ //
+ // |s| addresses the q0 row. Rows p2..q2 are read from |s| - 3 * |pitch| through
+ // |s| + 2 * |pitch|, four samples per row; only rows p1, p0, q0 and q1 are
+ // written back. |blimit|, |limit| and |thresh| point at 8-bit thresholds that
+ // are shifted left by (|bd| - 8) before use.
+ void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
+   uint16_t *const dst_p2 = s - 3 * pitch;
+   uint16_t *const dst_p1 = s - 2 * pitch;
+   uint16_t *const dst_p0 = s - pitch;
+   uint16_t *const dst_q0 = s;
+   uint16_t *const dst_q1 = s + pitch;
+   uint16_t *const dst_q2 = s + 2 * pitch;
+
+   // p2, p1, p0, q0, q1, q2
+   const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
+                               vld1_u16(dst_p0), vld1_u16(dst_q0),
+                               vld1_u16(dst_q1), vld1_u16(dst_q2) };
+
+   // Adjust thresholds to bitdepth.
+   const int outer_thresh = *blimit << (bd - 8);
+   const int inner_thresh = *limit << (bd - 8);
+   const int hev_thresh = *thresh << (bd - 8);
+   const uint16x4_t outer_mask =
+       outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+   uint16x4_t hev_mask;
+   uint16x4_t needs_filter_mask;
+   uint16x4_t is_flat3_mask;
+   // Pack p-side samples into the low half and q-side into the high half.
+   const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+   const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+   const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+   filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+                 &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+ #if AOM_ARCH_AARCH64
+   if (vaddv_u16(needs_filter_mask) == 0) {
+     // None of the values will be filtered.
+     return;
+   }
+ #endif  // AOM_ARCH_AARCH64
+
+   // Copy the masks to the high bits for packed comparisons later.
+   const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+   const uint16x8_t needs_filter_mask_8 =
+       vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+   uint16x8_t f4_p1q1;
+   uint16x8_t f4_p0q0;
+   // ZIP1 p0q0, p1q1 may perform better here.
+   const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+   filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+   // Where hev() holds, filter4 leaves p1/q1 at their input values.
+   f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+   uint16x8_t p0q0_output, p1q1_output;
+   // |is_flat3_mask| controls whether the needed filter is filter4 or filter6.
+   // If it is all zero, filter6 output is never selected, so filter6() is not
+   // evaluated at all. (On AArch64 |needs_filter_mask| is known to be nonzero
+   // here because of the early return above; on other targets the final
+   // vbslq_u16() with |needs_filter_mask_8| keeps unfiltered lanes intact.)
+   const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+   if (vget_lane_u64(need_filter6, 0) == 0) {
+     // filter6() does not apply, but filter4() applies to one or more values.
+     p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+     p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+   } else {
+     const uint16x8_t is_flat3_mask_8 =
+         vcombine_u16(is_flat3_mask, is_flat3_mask);
+     uint16x8_t f6_p1q1, f6_p0q0;
+     filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+     p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+     p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+     p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+     p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+   }
+
+   // p2 and q2 are inputs only; they are never modified by this filter.
+   vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+   vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+   vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+   vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ }
+
+ // Runs aom_highbd_lpf_horizontal_6_neon() twice along the same edge: once at
+ // |s| with the first threshold set (blimit0/limit0/thresh0) and once at
+ // |s| + 4 with the second set (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_horizontal_6_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const second_half = s + 4;
+
+   aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_horizontal_6_neon(second_half, pitch, blimit1, limit1,
+                                    thresh1, bd);
+ }
+
+ // Applies the high bitdepth 6-tap loop filter across a vertical edge.
+ //
+ // Four rows are processed, starting at |s| and spaced |pitch| elements apart.
+ // Each row is loaded as 8 samples beginning at |s| - 3 (p2), of which the
+ // first six (p2..q2) are used; p1, p0, q0 and q1 are written back starting at
+ // |s| - 2. |blimit|, |limit| and |thresh| point at 8-bit thresholds that are
+ // shifted left by (|bd| - 8) before use.
+ void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
+                                     const uint8_t *blimit, const uint8_t *limit,
+                                     const uint8_t *thresh, int bd) {
+   // Left side of the filter window.
+   uint16_t *const dst = s - 3;
+   uint16_t *const dst_0 = dst;
+   uint16_t *const dst_1 = dst + pitch;
+   uint16_t *const dst_2 = dst + 2 * pitch;
+   uint16_t *const dst_3 = dst + 3 * pitch;
+
+   // Overread by 2 values. These overreads become the high halves of src_raw[2]
+   // and src_raw[3] after transpose.
+   uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
+                             vld1q_u16(dst_2), vld1q_u16(dst_3) };
+   transpose_array_inplace_u16_4x8(src_raw);
+   // p2, p1, p0, q0, q1, q2
+   const uint16x4_t src[6] = {
+     vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+     vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+     vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+   };
+
+   // Adjust thresholds to bitdepth.
+   const int outer_thresh = *blimit << (bd - 8);
+   const int inner_thresh = *limit << (bd - 8);
+   const int hev_thresh = *thresh << (bd - 8);
+   const uint16x4_t outer_mask =
+       outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+   uint16x4_t hev_mask;
+   uint16x4_t needs_filter_mask;
+   uint16x4_t is_flat3_mask;
+   // Pack p-side samples into the low half and q-side into the high half.
+   const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+   const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+   const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+   filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+                 &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+ #if AOM_ARCH_AARCH64
+   if (vaddv_u16(needs_filter_mask) == 0) {
+     // None of the values will be filtered.
+     return;
+   }
+ #endif  // AOM_ARCH_AARCH64
+
+   // Copy the masks to the high bits for packed comparisons later.
+   const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+   const uint16x8_t needs_filter_mask_8 =
+       vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+   uint16x8_t f4_p1q1;
+   uint16x8_t f4_p0q0;
+   // ZIP1 p0q0, p1q1 may perform better here.
+   const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+   filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+   // Where hev() holds, filter4 leaves p1/q1 at their input values.
+   f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+   uint16x8_t p0q0_output, p1q1_output;
+   // |is_flat3_mask| controls whether the needed filter is filter4 or filter6.
+   // If it is all zero, filter6 output is never selected, so filter6() is not
+   // evaluated at all. (On AArch64 |needs_filter_mask| is known to be nonzero
+   // here because of the early return above; on other targets the final
+   // vbslq_u16() with |needs_filter_mask_8| keeps unfiltered lanes intact.)
+   const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+   if (vget_lane_u64(need_filter6, 0) == 0) {
+     // filter6() does not apply, but filter4() applies to one or more values.
+     p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+     p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+   } else {
+     const uint16x8_t is_flat3_mask_8 =
+         vcombine_u16(is_flat3_mask, is_flat3_mask);
+     uint16x8_t f6_p1q1, f6_p0q0;
+     filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+     p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+     p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+     p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+     p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+   }
+
+   // Unpack to column order p1, p0, q0, q1 and transpose back into rows.
+   uint16x4_t output[4] = {
+     vget_low_u16(p1q1_output),
+     vget_low_u16(p0q0_output),
+     vget_high_u16(p0q0_output),
+     vget_high_u16(p1q1_output),
+   };
+   transpose_array_inplace_u16_4x4(output);
+
+   // dst_n starts at p2, so adjust to p1.
+   vst1_u16(dst_0 + 1, output[0]);
+   vst1_u16(dst_1 + 1, output[1]);
+   vst1_u16(dst_2 + 1, output[2]);
+   vst1_u16(dst_3 + 1, output[3]);
+ }
+
+ // Runs aom_highbd_lpf_vertical_6_neon() twice down the same edge: once at |s|
+ // with the first threshold set (blimit0/limit0/thresh0) and once four rows
+ // further down (|s| + 4 * |pitch|) with the second set
+ // (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_vertical_6_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const lower_rows = s + 4 * pitch;
+
+   aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_vertical_6_neon(lower_rows, pitch, blimit1, limit1, thresh1,
+                                  bd);
+ }
+
+ // Computes the 8-tap filter results for the p2/q2, p1/q1 and p0/q0 pairs.
+ //
+ // Every input packs a p-side vector in its low half and the matching q-side
+ // vector in its high half, so both sides of the edge are filtered together.
+ // A single running |sum| is built for p2/q2 and then adjusted in place for
+ // p1/q1 and p0/q0, which is why the statement order below matters. Each output
+ // is |sum| rounded and shifted right by 3.
+ static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t *const p2q2_output,
+ uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ // vextq_u16(x, x, 4) swaps the two halves, giving each side access to the
+ // sample from the opposite side of the edge.
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+ // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+ }
+
+ // Applies the high bitdepth 8-tap loop filter across a horizontal edge.
+ //
+ // |s| addresses the q0 row. Rows p3..q3 are read from |s| - 4 * |pitch| through
+ // |s| + 3 * |pitch|, four samples per row; rows p2..q2 are written back.
+ // |blimit|, |limit| and |thresh| point at 8-bit thresholds that are shifted
+ // left by (|bd| - 8) before use.
+ void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
+   // Row pointers in the order p3, p2, p1, p0, q0, q1, q2, q3.
+   uint16_t *const rows[8] = { s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+                               s - pitch,     s,             s + pitch,
+                               s + 2 * pitch, s + 3 * pitch };
+
+   uint16x4_t px[8];
+   for (int i = 0; i < 8; ++i) {
+     px[i] = vld1_u16(rows[i]);
+   }
+
+   // Scale the 8-bit thresholds to the working bitdepth.
+   const int bd_shift = bd - 8;
+   const int outer_thresh = *blimit << bd_shift;
+   const int inner_thresh = *limit << bd_shift;
+   const int hev_thresh = *thresh << bd_shift;
+
+   // Pack p-side samples into the low half and q-side into the high half.
+   const uint16x8_t p3q3 = vcombine_u16(px[0], px[7]);
+   const uint16x8_t p2q2 = vcombine_u16(px[1], px[6]);
+   const uint16x8_t p1q1 = vcombine_u16(px[2], px[5]);
+   const uint16x8_t p0q0 = vcombine_u16(px[3], px[4]);
+
+   const uint16x4_t outer_mask =
+       outer_threshold(px[2], px[3], px[4], px[5], outer_thresh);
+   uint16x4_t hev_mask, needs_filter_mask, is_flat4_mask;
+   filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+                 bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+ #if AOM_ARCH_AARCH64
+   // Bail out early when no lane requests filtering.
+   if (vaddv_u16(needs_filter_mask) == 0) return;
+ #endif  // AOM_ARCH_AARCH64
+
+   // Duplicate the 4-lane masks into both halves to match the packed layout.
+   const uint16x8_t hev_wide = vcombine_u16(hev_mask, hev_mask);
+   const uint16x8_t needs_filter_wide =
+       vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+   uint16x8_t f4_p1q1, f4_p0q0;
+   // ZIP1 p0q0, p1q1 may perform better here.
+   const uint16x8_t p0q1 = vcombine_u16(px[3], px[5]);
+   filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+   // Where hev() holds, filter4 leaves p1/q1 at their input values.
+   f4_p1q1 = vbslq_u16(hev_wide, p1q1, f4_p1q1);
+
+   // Start from the filter4-only selection; upgrade lanes to filter8 output
+   // only if at least one lane has |is_flat4_mask| set.
+   uint16x8_t p2q2_result = p2q2;
+   uint16x8_t p1q1_result = f4_p1q1;
+   uint16x8_t p0q0_result = f4_p0q0;
+   if (vget_lane_u64(vreinterpret_u64_u16(is_flat4_mask), 0) != 0) {
+     const uint16x8_t is_flat4_wide =
+         vcombine_u16(is_flat4_mask, is_flat4_mask);
+     uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+     filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+     p2q2_result = vbslq_u16(is_flat4_wide, f8_p2q2, p2q2);
+     p1q1_result = vbslq_u16(is_flat4_wide, f8_p1q1, f4_p1q1);
+     p0q0_result = vbslq_u16(is_flat4_wide, f8_p0q0, f4_p0q0);
+   }
+   // Lanes that need no filtering keep their original p1/q1 and p0/q0 values.
+   p1q1_result = vbslq_u16(needs_filter_wide, p1q1_result, p1q1);
+   p0q0_result = vbslq_u16(needs_filter_wide, p0q0_result, p0q0);
+
+   // Write back rows p2..q2; p3 and q3 are inputs only.
+   vst1_u16(rows[1], vget_low_u16(p2q2_result));
+   vst1_u16(rows[2], vget_low_u16(p1q1_result));
+   vst1_u16(rows[3], vget_low_u16(p0q0_result));
+   vst1_u16(rows[4], vget_high_u16(p0q0_result));
+   vst1_u16(rows[5], vget_high_u16(p1q1_result));
+   vst1_u16(rows[6], vget_high_u16(p2q2_result));
+ }
+
+ // Runs aom_highbd_lpf_horizontal_8_neon() twice along the same edge: once at
+ // |s| with the first threshold set (blimit0/limit0/thresh0) and once at
+ // |s| + 4 with the second set (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_horizontal_8_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const second_half = s + 4;
+
+   aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_horizontal_8_neon(second_half, pitch, blimit1, limit1,
+                                    thresh1, bd);
+ }
+
+ // Returns |a| with the four lanes of its low half in reversed order. The high
+ // half is passed through unchanged.
+ static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) {
+   const uint16x4_t low_reversed = vrev64_u16(vget_low_u16(a));
+   const uint16x4_t high = vget_high_u16(a);
+   return vcombine_u16(low_reversed, high);
+ }
+
+ // Applies the high bitdepth 8-tap loop filter across a vertical edge.
+ //
+ // Four rows are processed, starting at |s| and spaced |pitch| elements apart.
+ // Each row is loaded and stored as 8 samples beginning at |s| - 4.
+ // |blimit|, |limit| and |thresh| point at 8-bit thresholds that are shifted
+ // left by (|bd| - 8) before use.
+ void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst = s - 4;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // As loaded, src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+ // To get desired pairs after transpose, one half should be reversed.
+ uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+
+ // After the transpose below (see the p0q0..p3q3 assignments further down):
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ loop_filter_transpose_u16_4x8q(src);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ // Arguments are passed in the order p1, p0, q0, q1.
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+ #if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+ #endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ // Where hev() holds, keep the original p1/q1 instead of the filter4 result.
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ // NOTE(review): the early return only exists under AOM_ARCH_AARCH64; on other
+ // targets the vbslq_u16() selects with |needs_filter_mask_8| keep unfiltered
+ // lanes intact.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() does not apply, but filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ // p3q3 is passed through unmodified; it is included only because whole
+ // 8-sample rows are stored below.
+ uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ transpose_array_inplace_u16_4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, reverse_low_half(output[0]));
+ vst1q_u16(dst_1, reverse_low_half(output[1]));
+ vst1q_u16(dst_2, reverse_low_half(output[2]));
+ vst1q_u16(dst_3, reverse_low_half(output[3]));
+ }
+
+ // Runs aom_highbd_lpf_vertical_8_neon() twice down the same edge: once at |s|
+ // with the first threshold set (blimit0/limit0/thresh0) and once four rows
+ // further down (|s| + 4 * |pitch|) with the second set
+ // (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_vertical_8_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const lower_rows = s + 4 * pitch;
+
+   aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_vertical_8_neon(lower_rows, pitch, blimit1, limit1, thresh1,
+                                  bd);
+ }
+
+ // Computes the 14-tap filter results for the six pairs p5/q5 down to p0/q0.
+ //
+ // Every input packs a p-side vector in its low half and the matching q-side
+ // vector in its high half, so both sides of the edge are filtered together.
+ // A single running |sum| is built for p5/q5 and then adjusted in place for
+ // each following pair (drop the terms that leave the window, add the ones that
+ // enter), which is why the statement order below matters. Each output is
+ // |sum| rounded and shifted right by 4.
+ static INLINE void filter14(
+ const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4,
+ const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p5q5_output,
+ uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output,
+ uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ // 7 * x is formed as (8 * x) - x.
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ // vextq_u16(x, x, 4) swaps the two halves, giving each side access to the
+ // sample from the opposite side of the edge.
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+ }
+
+ // Applies the high bitdepth 14-tap loop filter across a horizontal edge.
+ //
+ // |s| addresses the q0 row. Rows p6..q6 are read from |s| - 7 * |pitch| through
+ // |s| + 6 * |pitch|, four samples per row; rows p5..q5 are written back.
+ // Per lane, the result is taken from filter4(), filter8() or filter14()
+ // depending on the masks computed below, or left unchanged.
+ // |blimit|, |limit| and |thresh| point at 8-bit thresholds that are shifted
+ // left by (|bd| - 8) before use.
+ void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p6 = s - 7 * pitch;
+ uint16_t *const dst_p5 = s - 6 * pitch;
+ uint16_t *const dst_p4 = s - 5 * pitch;
+ uint16_t *const dst_p3 = s - 4 * pitch;
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+ uint16_t *const dst_q3 = s + 3 * pitch;
+ uint16_t *const dst_q4 = s + 4 * pitch;
+ uint16_t *const dst_q5 = s + 5 * pitch;
+ uint16_t *const dst_q6 = s + 6 * pitch;
+
+ // src[0..6] = p6..p0, src[7..13] = q0..q6.
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)
+ };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ // Pack p-side samples into the low half and q-side into the high half.
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+ #if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+ #endif // AOM_ARCH_AARCH64
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ // Where hev() holds, keep the original p1/q1 instead of the filter4 result.
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ // NOTE(review): the early return only exists under AOM_ARCH_AARCH64; on other
+ // targets the vbslq_u16() selects with |needs_filter_mask_8| keep unfiltered
+ // lanes intact.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ // Selection is layered from the widest filter to the narrowest: filter14
+ // over filter8, then filter8 over filter4 (or the input for p2/q2), then
+ // the overall needs-filter mask over the original input.
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ // Write back rows p5..q5; p6 and q6 are inputs only.
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+ }
+
+ // Runs aom_highbd_lpf_horizontal_14_neon() twice along the same edge: once at
+ // |s| with the first threshold set (blimit0/limit0/thresh0) and once at
+ // |s| + 4 with the second set (blimit1/limit1/thresh1).
+ void aom_highbd_lpf_horizontal_14_dual_neon(
+     uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+     const uint8_t *thresh1, int bd) {
+   uint16_t *const second_half = s + 4;
+
+   aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+   aom_highbd_lpf_horizontal_14_neon(second_half, pitch, blimit1, limit1,
+                                     thresh1, bd);
+ }
+
+ // Treats |ab| and |cd| as pairs of 64-bit halves (a, b) and (c, d) and returns
+ // the rearrangement val[0] = (a, c), val[1] = (d, b).
+ static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab,
+                                           const uint16x8_t cd) {
+   const uint64x2_t ab64 = vreinterpretq_u64_u16(ab);
+   const uint64x2_t cd64 = vreinterpretq_u64_u16(cd);
+   uint16x8x2_t result;
+ #if AOM_ARCH_AARCH64
+   // (a, c): low halves of both inputs.
+   result.val[0] = vreinterpretq_u16_u64(vtrn1q_u64(ab64, cd64));
+   // (d, b): high halves of both inputs, |cd| first.
+   result.val[1] = vreinterpretq_u16_u64(vtrn2q_u64(cd64, ab64));
+ #else
+   // (a, c): overwrite b in |ab| with c.
+   const uint64_t c = vgetq_lane_u64(cd64, 0);
+   result.val[0] = vreinterpretq_u16_u64(vsetq_lane_u64(c, ab64, 1));
+   // (d, b): overwrite a in |ab| with d.
+   const uint64_t d = vgetq_lane_u64(cd64, 1);
+   result.val[1] = vreinterpretq_u16_u64(vsetq_lane_u64(d, ab64, 0));
+ #endif  // AOM_ARCH_AARCH64
+   return result;
+ }
+
+// Applies the high bitdepth loop filter across a vertical edge located at |s|.
+//
+// Four rows, |pitch| elements apart, are processed. For each row 16 pixels are
+// loaded starting 8 pixels before |s|, transposed so that the filter helpers
+// can work on packed pXqX vectors, and then transposed back and stored to the
+// same 16 positions. Per-lane masks select between the unfiltered input and
+// the filter4(), filter8() and filter14() outputs.
+//
+// |blimit|, |limit| and |thresh| each point to an 8-bit threshold that is
+// shifted left by (|bd| - 8) before use.
+void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ // The edge sits in the middle of the 16 pixels handled per row.
+ uint16_t *const dst = s - 8;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ // Pack matching p and q values together: the high half of the p vector
+ // becomes the low half of the result and the low half of the q vector
+ // becomes the high half.
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+ // The early exit uses vaddv_u16(), so it is only compiled for AArch64. Other
+ // targets always run the filters and rely on the masks below.
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+ // The outer taps come from the remaining halves: low half of the p vector
+ // and high half of the q vector.
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ // Where |hev_mask| is set, p1/q1 keep their input values.
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ // Reinterpreting the four 16-bit lanes as one 64-bit value tests all lanes
+ // for zero with a single scalar compare.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ // Selection order per lane: filter14 output, else filter8 output, else
+ // filter4 output (p1/p0 only), else the unfiltered input.
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ // p7q7 and p6q6 are passed through unmodified; no filter writes them.
+ const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
+ transpose_array_inplace_u16_4x8(output_p);
+ uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
+ transpose_array_inplace_u16_4x8(output_q);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+// Runs aom_highbd_lpf_vertical_14_neon() twice on the same vertical edge:
+// once at |s| with the first threshold set and once 4 rows further down
+// (|s| + 4 * |pitch|) with the second threshold set.
+void aom_highbd_lpf_vertical_14_dual_neon(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  uint16_t *const second_segment = s + 4 * pitch;
+
+  aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_14_neon(second_segment, pitch, blimit1, limit1,
+                                  thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000000..9262d818e9
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+// Adds the masked absolute differences of 8 pixels to |sad|.
+//
+// The 8-bit mask at |m| is widened to 16 bits and passed together with the 8
+// samples at |a| and |b| to alpha_blend_a64_u16x8(). The absolute difference
+// between that blend and the 8 samples at |src| is added lane-wise to |sad|.
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+                                             const uint16_t *src,
+                                             const uint16_t *a,
+                                             const uint16_t *b,
+                                             const uint8_t *m) {
+  const uint16x8_t mask_u16 = vmovl_u8(vld1_u8(m));
+  const uint16x8_t pred =
+      alpha_blend_a64_u16x8(mask_u16, vld1q_u16(a), vld1q_u16(b));
+  const uint16x8_t abs_diff = vabdq_u16(pred, vld1q_u16(src));
+
+  return vaddq_u16(sad, abs_diff);
+}
+
+// Adds the masked absolute differences of 16 pixels to |sad| by processing
+// two consecutive groups of 8 pixels.
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  const uint16x8_t first_half = masked_sad_8x1_neon(sad, src, a, b, m);
+  return masked_sad_8x1_neon(first_half, src + 8, a + 8, b + 8, m + 8);
+}
+
+// Adds the masked absolute differences of 32 pixels to |sad| by processing
+// two consecutive groups of 16 pixels.
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+                                              const uint16_t *src,
+                                              const uint16_t *a,
+                                              const uint16_t *b,
+                                              const uint8_t *m) {
+  const uint16x8_t first_half = masked_sad_16x1_neon(sad, src, a, b, m);
+  return masked_sad_16x1_neon(first_half, src + 16, a + 16, b + 16, m + 16);
+}
+
+// Masked SAD for 128-pixel-wide high bitdepth blocks.
+//
+// |src8|, |a8| and |b8| are converted with CONVERT_TO_SHORTPTR() and read as
+// 16-bit samples; |m| is the 8-bit blend mask. Rows are consumed in groups of
+// four. Within a group each 32-pixel column strip has its own 16-bit
+// accumulator, which is widened into a 32-bit accumulator once the group is
+// done. |height| must therefore be a positive multiple of 4.
+//
+// Returns the sum of all masked absolute differences.
+static INLINE unsigned int masked_sad_128xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // The outer loop only terminates when |height| reaches exactly zero in steps
+  // of 4; any other value would read past the end of the block.
+  assert(height > 0 && height % 4 == 0);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+
+  do {
+    // Fresh 16-bit accumulators for every group of 4 rows.
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                         vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+      sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+      sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    // Widen: pairwise add the 16-bit lanes into the 32-bit accumulators.
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+    sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+    height -= 4;
+  } while (height != 0);
+
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+  sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+  return horizontal_add_u32x4(sad_u32[0]);
+}
+
+// Masked SAD for 64-pixel-wide high bitdepth blocks.
+//
+// |src8|, |a8| and |b8| are converted with CONVERT_TO_SHORTPTR() and read as
+// 16-bit samples; |m| is the 8-bit blend mask. Rows are consumed in groups of
+// four. Within a group each 32-pixel column strip has its own 16-bit
+// accumulator, which is widened into a 32-bit accumulator once the group is
+// done. |height| must therefore be a positive multiple of 4.
+//
+// Returns the sum of all masked absolute differences.
+static INLINE unsigned int masked_sad_64xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // The outer loop only terminates when |height| reaches exactly zero in steps
+  // of 4; any other value would read past the end of the block.
+  assert(height > 0 && height % 4 == 0);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  do {
+    // Fresh 16-bit accumulators for every group of 4 rows.
+    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+    for (int h = 0; h < 4; ++h) {
+      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    // Widen: pairwise add the 16-bit lanes into the 32-bit accumulators.
+    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+// Masked SAD for 32-pixel-wide high bitdepth blocks.
+//
+// |src8|, |a8| and |b8| are converted with CONVERT_TO_SHORTPTR() and read as
+// 16-bit samples; |m| is the 8-bit blend mask. Rows are consumed in groups of
+// four: a group is summed in one 16-bit accumulator which is then widened
+// into the 32-bit accumulator. |height| must therefore be a positive multiple
+// of 4.
+//
+// Returns the sum of all masked absolute differences.
+static INLINE unsigned int masked_sad_32xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // The outer loop only terminates when |height| reaches exactly zero in steps
+  // of 4; any other value would read past the end of the block.
+  assert(height > 0 && height % 4 == 0);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    // Fresh 16-bit accumulator for every group of 4 rows.
+    uint16x8_t sad = vdupq_n_u16(0);
+    for (int h = 0; h < 4; ++h) {
+      sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    // Widen: pairwise add the 16-bit lanes into the 32-bit accumulator.
+    sad_u32 = vpadalq_u16(sad_u32, sad);
+    height -= 4;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
+// Masked SAD for 16-pixel-wide high bitdepth blocks taller than the "small"
+// helper supports.
+//
+// |src8|, |a8| and |b8| are converted with CONVERT_TO_SHORTPTR() and read as
+// 16-bit samples; |m| is the 8-bit blend mask. Rows are consumed in groups of
+// eight: a group is summed in one 16-bit accumulator which is then widened
+// into the 32-bit accumulator. |height| must therefore be a positive multiple
+// of 8.
+//
+// Returns the sum of all masked absolute differences.
+static INLINE unsigned int masked_sad_16xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // The outer loop only terminates when |height| reaches exactly zero in steps
+  // of 8; any other value would read past the end of the block.
+  assert(height > 0 && height % 8 == 0);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    // Fresh 16-bit accumulator for every group of 8 rows.
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 8; ++h) {
+      sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    // Widen: pairwise add the 16-bit lanes into the 32-bit accumulator.
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 8;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Masked SAD for 8-pixel-wide high bitdepth blocks taller than the "small"
+// helper supports.
+//
+// |src8|, |a8| and |b8| are converted with CONVERT_TO_SHORTPTR() and read as
+// 16-bit samples; |m| is the 8-bit blend mask. Rows are consumed in groups of
+// sixteen: a group is summed in one 16-bit accumulator which is then widened
+// into the 32-bit accumulator. |height| must therefore be a positive multiple
+// of 16.
+//
+// Returns the sum of all masked absolute differences.
+static INLINE unsigned int masked_sad_8xh_large_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // The outer loop only terminates when |height| reaches exactly zero in steps
+  // of 16; any other value would read past the end of the block.
+  assert(height > 0 && height % 16 == 0);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+  do {
+    // Fresh 16-bit accumulator for every group of 16 rows.
+    uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+    for (int h = 0; h < 16; ++h) {
+      sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+      src += src_stride;
+      a += a_stride;
+      b += b_stride;
+      m += m_stride;
+    }
+
+    // Widen: pairwise add the 16-bit lanes into the 32-bit accumulator.
+    sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+    height -= 16;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(sad_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Masked SAD for 16-pixel-wide high bitdepth blocks of at most 8 rows, using
+// a single 16-bit accumulator for the whole block.
+static INLINE unsigned int masked_sad_16xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // With 12-bit samples a uint16x8_t accumulator holds at most 128 absolute
+  // differences without overflowing, i.e. 8 rows of 16 pixels. Taller blocks
+  // need an intermediate widening to 32 bits.
+  assert(height <= 8);
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  do {
+    acc = masked_sad_16x1_neon(acc, src_row, a_row, b_row, m);
+
+    src_row += src_stride;
+    a_row += a_stride;
+    b_row += b_stride;
+    m += m_stride;
+    --height;
+  } while (height != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// Masked SAD for 8-pixel-wide high bitdepth blocks of at most 16 rows, using
+// a single 16-bit accumulator for the whole block.
+static INLINE unsigned int masked_sad_8xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // With 12-bit samples a uint16x8_t accumulator holds at most 128 absolute
+  // differences without overflowing, i.e. 16 rows of 8 pixels. Taller blocks
+  // need an intermediate widening to 32 bits.
+  assert(height <= 16);
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  do {
+    acc = masked_sad_8x1_neon(acc, src_row, a_row, b_row, m);
+
+    src_row += src_stride;
+    a_row += a_stride;
+    b_row += b_stride;
+    m += m_stride;
+    --height;
+  } while (height != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// Masked SAD for 4-pixel-wide high bitdepth blocks of at most 16 rows, using
+// a single 64-bit-wide vector of 16-bit accumulators for the whole block.
+static INLINE unsigned int masked_sad_4xh_small_neon(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int height) {
+  // With 12-bit samples a uint16x4_t accumulator holds at most 64 absolute
+  // differences without overflowing, i.e. 16 rows of 4 pixels. Taller blocks
+  // would need an intermediate widening to 32 bits.
+  assert(height <= 16);
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+
+  uint16x4_t acc = vdup_n_u16(0);
+  do {
+    // Only the low four lanes of the widened mask are used.
+    const uint16x4_t mask_u16 =
+        vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+    const uint16x4_t a_px = load_unaligned_u16_4x1(a_row);
+    const uint16x4_t b_px = load_unaligned_u16_4x1(b_row);
+    const uint16x4_t src_px = load_unaligned_u16_4x1(src_row);
+
+    const uint16x4_t pred = alpha_blend_a64_u16x4(mask_u16, a_px, b_px);
+    acc = vadd_u16(acc, vabd_u16(pred, src_px));
+
+    src_row += src_stride;
+    a_row += a_stride;
+    b_row += b_stride;
+    m += m_stride;
+    --height;
+  } while (height != 0);
+
+  return horizontal_add_u16x4(acc);
+}
+
+// Defines aom_highbd_masked_sad<w>x<h>_neon() on top of the matching
+// masked_sad_<w>xh_small_neon() helper. |second_pred| is passed with a stride
+// of |w|. When |invert_mask| is non-zero the roles of |ref| and |second_pred|
+// in the blend are swapped.
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    const uint8_t *const blend_a = invert_mask ? second_pred : ref;           \
+    const int blend_a_stride = invert_mask ? w : ref_stride;                  \
+    const uint8_t *const blend_b = invert_mask ? ref : second_pred;           \
+    const int blend_b_stride = invert_mask ? ref_stride : w;                  \
+    return masked_sad_##w##xh_small_neon(src, src_stride, blend_a,            \
+                                         blend_a_stride, blend_b,             \
+                                         blend_b_stride, msk, msk_stride, h); \
+  }
+
+// Defines aom_highbd_masked_sad<w>x<h>_neon() on top of the matching
+// masked_sad_<w>xh_large_neon() helper. |second_pred| is passed with a stride
+// of |w|. When |invert_mask| is non-zero the roles of |ref| and |second_pred|
+// in the blend are swapped.
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h)                                \
+  unsigned int aom_highbd_masked_sad##w##x##h##_neon(                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                      \
+    const uint8_t *const blend_a = invert_mask ? second_pred : ref;           \
+    const int blend_a_stride = invert_mask ? w : ref_stride;                  \
+    const uint8_t *const blend_b = invert_mask ? ref : second_pred;           \
+    const int blend_b_stride = invert_mask ? ref_stride : w;                  \
+    return masked_sad_##w##xh_large_neon(src, src_stride, blend_a,            \
+                                         blend_a_stride, blend_b,             \
+                                         blend_b_stride, msk, msk_stride, h); \
+  }
+
+// Instantiate aom_highbd_masked_sad<w>x<h>_neon() for each block size. The
+// SMALL macro is used where the whole block fits the single 16-bit accumulator
+// of the corresponding *_small_neon helper; every other size uses the LARGE
+// macro. The sizes inside the #if block are omitted from realtime-only builds.
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000000..28699e6f41
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Adds the OBMC SAD contribution of 8 pixels to |*sum|.
+//
+// The eight 32-bit mask values at |mask| are narrowed to 16 bits and
+// multiplied with the reference pixels in |ref|. The absolute difference
+// between each product and the matching 32-bit value at |wsrc| is then
+// accumulated into |*sum| after a rounding right shift by 12.
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+                                                const int32_t *mask,
+                                                const int32_t *wsrc,
+                                                uint32x4_t *sum) {
+  const int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+  const int16x4_t weight_lo = vmovn_s32(vld1q_s32(mask));
+  const int16x4_t weight_hi = vmovn_s32(vld1q_s32(mask + 4));
+
+  const int32x4_t weighted_lo = vmull_s16(vget_low_s16(ref_s16), weight_lo);
+  const int32x4_t weighted_hi = vmull_s16(vget_high_s16(ref_s16), weight_hi);
+
+  const int32x4_t diff_lo = vabdq_s32(vld1q_s32(wsrc), weighted_lo);
+  const int32x4_t diff_hi = vabdq_s32(vld1q_s32(wsrc + 4), weighted_hi);
+
+  uint32x4_t acc = *sum;
+  acc = vrsraq_n_u32(acc, vreinterpretq_u32_s32(diff_lo), 12);
+  acc = vrsraq_n_u32(acc, vreinterpretq_u32_s32(diff_hi), 12);
+  *sum = acc;
+}
+
+// OBMC SAD for 4-pixel-wide high bitdepth blocks. Each iteration consumes two
+// rows of |ref| and 8 entries of |wsrc| and |mask|, so |height| / 2 iterations
+// are run.
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *row = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t acc = vdupq_n_u32(0);
+  int row_pairs = height / 2;
+
+  do {
+    const uint16x8_t two_rows = load_unaligned_u16_4x2(row, ref_stride);
+    highbd_obmc_sad_8x1_s16_neon(two_rows, mask, wsrc, &acc);
+
+    row += 2 * ref_stride;
+    wsrc += 8;
+    mask += 8;
+    --row_pairs;
+  } while (row_pairs != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// OBMC SAD for 8-pixel-wide high bitdepth blocks: one row of |ref| and 8
+// entries of |wsrc| and |mask| are consumed per iteration.
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+                                                    int ref_stride,
+                                                    const int32_t *wsrc,
+                                                    const int32_t *mask,
+                                                    int height) {
+  const uint16_t *row = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  do {
+    highbd_obmc_sad_8x1_s16_neon(vld1q_u16(row), mask, wsrc, &acc);
+
+    row += ref_stride;
+    wsrc += 8;
+    mask += 8;
+    --height;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// OBMC SAD for high bitdepth blocks processed in 16-pixel steps per row.
+// |wsrc| and |mask| are read contiguously (they advance by 16 per step and
+// are not re-based per row), while |ref| advances by |ref_stride| per row.
+// Two accumulators are used, one for each 8-pixel half of a step.
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int width, int height) {
+  const uint16_t *row = CONVERT_TO_SHORTPTR(ref);
+  uint32x4_t acc_lo = vdupq_n_u32(0);
+  uint32x4_t acc_hi = vdupq_n_u32(0);
+
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t ref_lo = vld1q_u16(row + x);
+      highbd_obmc_sad_8x1_s16_neon(ref_lo, mask, wsrc, &acc_lo);
+
+      const uint16x8_t ref_hi = vld1q_u16(row + x + 8);
+      highbd_obmc_sad_8x1_s16_neon(ref_hi, mask + 8, wsrc + 8, &acc_hi);
+
+      x += 16;
+      wsrc += 16;
+      mask += 16;
+    } while (x < width);
+
+    row += ref_stride;
+    --height;
+  } while (height != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
+}
+
+// OBMC SAD for 16-pixel-wide blocks: forwards to the generic helper with the
+// width fixed at 16.
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  const int block_width = 16;
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, block_width,
+                                    h);
+}
+
+// OBMC SAD for 32-pixel-wide high bitdepth blocks. Every row is split into
+// four groups of 8 pixels, each with its own 32-bit accumulator; the
+// accumulators are combined after the last row.
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int height) {
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  const uint16_t *row = CONVERT_TO_SHORTPTR(ref);
+
+  do {
+    // Load the whole row first, then accumulate each 8-pixel group.
+    const uint16x8_t px[4] = { vld1q_u16(row), vld1q_u16(row + 8),
+                               vld1q_u16(row + 16), vld1q_u16(row + 24) };
+
+    for (int k = 0; k < 4; ++k) {
+      highbd_obmc_sad_8x1_s16_neon(px[k], mask + 8 * k, wsrc + 8 * k, &acc[k]);
+    }
+
+    wsrc += 32;
+    mask += 32;
+    row += ref_stride;
+    --height;
+  } while (height != 0);
+
+  const uint32x4_t acc01 = vaddq_u32(acc[0], acc[1]);
+  const uint32x4_t acc23 = vaddq_u32(acc[2], acc[3]);
+
+  return horizontal_add_u32x4(vaddq_u32(acc01, acc23));
+}
+
+// OBMC SAD for 64-pixel-wide blocks: forwards to the generic helper with the
+// width fixed at 64.
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+                                                     int ref_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     int h) {
+  const int block_width = 64;
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, block_width,
+                                    h);
+}
+
+// OBMC SAD for 128-pixel-wide blocks: forwards to the generic helper with the
+// width fixed at 128.
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+                                                      int ref_stride,
+                                                      const int32_t *wsrc,
+                                                      const int32_t *mask,
+                                                      int h) {
+  const int block_width = 128;
+  return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, block_width,
+                                    h);
+}
+
+// Defines aom_highbd_obmc_sad<w>x<h>_neon(), which calls the width-specific
+// helper highbd_obmc_sad_<w>xh_neon() with the height fixed at |h|.
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h)                                    \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_neon(                       \
+      const uint8_t *ref, int ref_stride, const int32_t *wsrc,            \
+      const int32_t *mask) {                                              \
+    const unsigned int block_sad =                                        \
+        highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h);     \
+    return block_sad;                                                     \
+  }
+
+// Instantiate aom_highbd_obmc_sad<w>x<h>_neon() for each block size. The sizes
+// inside the #if block are omitted from realtime-only builds.
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000000..d59224619b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Accumulates the OBMC error of 8 pixels into |*sum| and its square into
+// |*sse|.
+//
+// The eight 32-bit mask values at |mask| are narrowed to 16 bits and
+// multiplied with the prediction pixels in |pre|; the product is subtracted
+// from the matching 32-bit value at |wsrc| and the difference is rounded and
+// shifted right by 12 before being accumulated.
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     uint32x4_t *sse,
+                                                     int32x4_t *sum) {
+  const int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+  const int16x4_t weight_lo = vmovn_s32(vld1q_s32(mask));
+  const int16x4_t weight_hi = vmovn_s32(vld1q_s32(mask + 4));
+
+  const int32x4_t weighted_lo = vmull_s16(vget_low_s16(pre_s16), weight_lo);
+  const int32x4_t weighted_hi = vmull_s16(vget_high_s16(pre_s16), weight_hi);
+
+  int32x4_t err_lo = vsubq_s32(vld1q_s32(wsrc), weighted_lo);
+  int32x4_t err_hi = vsubq_s32(vld1q_s32(wsrc + 4), weighted_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from
+  // zero, whereas vrshrq_n_s32 rounds ties upwards. The two only disagree
+  // exactly at the negative rounding breakpoints, so adding the sign bit
+  // (i.e. -1 for negative values) beforehand moves those breakpoints into the
+  // correct rounding region.
+  err_lo = vsraq_n_s32(err_lo, err_lo, 31);
+  err_hi = vsraq_n_s32(err_hi, err_hi, 31);
+  const int32x4_t rounded_lo = vrshrq_n_s32(err_lo, 12);
+  const int32x4_t rounded_hi = vrshrq_n_s32(err_hi, 12);
+
+  *sum = vaddq_s32(vaddq_s32(*sum, rounded_lo), rounded_hi);
+
+  const uint32x4_t bits_lo = vreinterpretq_u32_s32(rounded_lo);
+  const uint32x4_t bits_hi = vreinterpretq_u32_s32(rounded_hi);
+  *sse = vmlaq_u32(*sse, bits_lo, bits_lo);
+  *sse = vmlaq_u32(*sse, bits_hi, bits_hi);
+}
+
+// For 12-bit data, we can only accumulate up to 256 elements in the unsigned
+// 32-bit elements (4095*4095*256 = 4292870400) before we have to accumulate
+// into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64, 64x128,
+// 128x64, 128x128 are processed in a different helper function.
+//
+// Computes the OBMC error sum (|*sum|) and sum of squared errors (|*sse|) for
+// a |width| x |h| block. Rows are processed in 16-pixel steps; |wsrc| and
+// |mask| are read contiguously while |pre| advances by |pre_stride| per row.
+// The squared error is accumulated in 32-bit lanes for at most |h_limit| rows
+// at a time and then widened into 64-bit lanes.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+    int64_t *sum) {
+  uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+  do {
+    // 'h_limit' is the number of 'w'-width rows we can process before our
+    // 32-bit accumulator overflows. After hitting this limit we accumulate
+    // into 64-bit elements. The chunk size is clamped to the rows still
+    // remaining so that a final partial chunk (|h| not a multiple of
+    // |h_limit|) neither reads past the block nor skips over h == 0.
+    const int h_tmp = h > h_limit ? h_limit : h;
+    uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+    int j = 0;
+
+    do {
+      int i = 0;
+
+      do {
+        uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+        highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+                                          &sum_s32);
+
+        uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+        highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+                                          &sum_s32);
+
+        i += 16;
+        wsrc += 16;
+        mask += 16;
+      } while (i < width);
+
+      pre_ptr += pre_stride;
+      j++;
+    } while (j < h_tmp);
+
+    // Widen: pairwise add the 32-bit lanes into the 64-bit accumulator.
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+    sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+    h -= h_tmp;
+  } while (h != 0);
+
+  *sse = horizontal_add_u64x2(sse_u64);
+  *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+// 128-wide variant of the xlarge helper: 32-bit squared-error lanes are
+// flushed to 64 bits every 16 rows.
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 128;
+  const int rows_per_flush = 16;
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                   rows_per_flush, sse, sum);
+}
+
+// 64-wide variant of the xlarge helper: 32-bit squared-error lanes are
+// flushed to 64 bits every 32 rows.
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 64;
+  const int rows_per_flush = 32;
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                   rows_per_flush, sse, sum);
+}
+
+// 32-wide variant of the xlarge helper: 32-bit squared-error lanes are
+// flushed to 64 bits every 64 rows.
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 32;
+  const int rows_per_flush = 64;
+  highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                   rows_per_flush, sse, sum);
+}
+
+// Computes the OBMC error sum (|*sum|) and sum of squared errors (|*sse|) for
+// a |width| x |h| block, processing each row in 16-pixel steps. The squared
+// error stays in 32-bit lanes for the whole block and is only widened in the
+// final horizontal reduction. |wsrc| and |mask| are read contiguously while
+// |pre| advances by |pre_stride| per row.
+static INLINE void highbd_obmc_variance_large_neon(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+  uint16_t *row = CONVERT_TO_SHORTPTR(pre);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+  uint32x4_t sse_acc = vdupq_n_u32(0);
+
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t px_lo = vld1q_u16(row + x);
+      highbd_obmc_variance_8x1_s16_neon(px_lo, wsrc, mask, &sse_acc, &sum_acc);
+
+      const uint16x8_t px_hi = vld1q_u16(row + x + 8);
+      highbd_obmc_variance_8x1_s16_neon(px_hi, wsrc + 8, mask + 8, &sse_acc,
+                                        &sum_acc);
+
+      wsrc += 16;
+      mask += 16;
+      x += 16;
+    } while (x < width);
+
+    row += pre_stride;
+    --h;
+  } while (h != 0);
+
+  *sum = horizontal_long_add_s32x4(sum_acc);
+  *sse = horizontal_long_add_u32x4(sse_acc);
+}
+
+// 128-wide OBMC variance: forwards to the generic large helper.
+static INLINE void highbd_obmc_variance_neon_128xh(
+    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 128;
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                  sse, sum);
+}
+
+// 64-wide OBMC variance: forwards to the generic large helper.
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  const int block_width = 64;
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                  sse, sum);
+}
+
+// 32-wide OBMC variance: forwards to the generic large helper.
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  const int block_width = 32;
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                  sse, sum);
+}
+
+// 16-wide OBMC variance: forwards to the generic large helper.
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+                                                  int pre_stride,
+                                                  const int32_t *wsrc,
+                                                  const int32_t *mask, int h,
+                                                  uint64_t *sse, int64_t *sum) {
+  const int block_width = 16;
+  highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, block_width, h,
+                                  sse, sum);
+}
+
+// OBMC error sum and sum of squared errors for 8-pixel-wide blocks: one row
+// of |pre8| and 8 entries of |wsrc| and |mask| are consumed per iteration.
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  uint16_t *row = CONVERT_TO_SHORTPTR(pre8);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+  uint32x4_t sse_acc = vdupq_n_u32(0);
+
+  do {
+    highbd_obmc_variance_8x1_s16_neon(vld1q_u16(row), wsrc, mask, &sse_acc,
+                                      &sum_acc);
+
+    row += pre_stride;
+    wsrc += 8;
+    mask += 8;
+    --h;
+  } while (h != 0);
+
+  *sum = horizontal_long_add_s32x4(sum_acc);
+  *sse = horizontal_long_add_u32x4(sse_acc);
+}
+
+// OBMC error sum and sum of squared errors for 4-pixel-wide blocks. Two rows
+// of |pre8| and 8 entries of |wsrc| and |mask| are consumed per iteration, so
+// |h| must be even.
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+                                                 int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask, int h,
+                                                 uint64_t *sse, int64_t *sum) {
+  assert(h % 2 == 0);
+  uint16_t *row = CONVERT_TO_SHORTPTR(pre8);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+  uint32x4_t sse_acc = vdupq_n_u32(0);
+
+  do {
+    const uint16x8_t two_rows = load_unaligned_u16_4x2(row, pre_stride);
+    highbd_obmc_variance_8x1_s16_neon(two_rows, wsrc, mask, &sse_acc,
+                                      &sum_acc);
+
+    h -= 2;
+    row += 2 * pre_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (h != 0);
+
+  *sum = horizontal_long_add_s32x4(sum_acc);
+  *sse = horizontal_long_add_u32x4(sse_acc);
+}
+
+// 8-bit variant: narrows the 64-bit totals to the output types without any
+// scaling.
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                               int *sum, unsigned int *sse) {
+  const int narrowed_sum = (int)sum64;
+  const unsigned int narrowed_sse = (unsigned int)sse64;
+
+  *sum = narrowed_sum;
+  *sse = narrowed_sse;
+}
+
+// 10-bit variant: scales the totals with ROUND_POWER_OF_TWO() (2 bits for the
+// sum, 4 bits for the squared error) before narrowing to the output types.
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  const int64_t scaled_sum = ROUND_POWER_OF_TWO(sum64, 2);
+  const uint64_t scaled_sse = ROUND_POWER_OF_TWO(sse64, 4);
+
+  *sum = (int)scaled_sum;
+  *sse = (unsigned int)scaled_sse;
+}
+
+// 12-bit variant: scales the totals with ROUND_POWER_OF_TWO() (4 bits for the
+// sum, 8 bits for the squared error) before narrowing to the output types.
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+                                                int *sum, unsigned int *sse) {
+  const int64_t scaled_sum = ROUND_POWER_OF_TWO(sum64, 4);
+  const uint64_t scaled_sse = ROUND_POWER_OF_TWO(sse64, 8);
+
+  *sum = (int)scaled_sum;
+  *sse = (unsigned int)scaled_sse;
+}
+
+// Defines aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon(). The 64-bit totals
+// from highbd_obmc_variance_neon_<w>xh() are converted by the bitdepth-specific
+// cast helper, the scaled squared error is stored in |*sse|, and the function
+// returns *sse - sum^2 / (w * h).
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth)                         \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
+      const int32_t *mask, unsigned int *sse) {                               \
+    uint64_t sse_wide;                                                        \
+    int64_t sum_wide;                                                         \
+    int sum_scaled;                                                           \
+    highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h,         \
+                                      &sse_wide, &sum_wide);                  \
+    highbd_##bitdepth##_obmc_variance_cast(sum_wide, sse_wide, &sum_scaled,   \
+                                           sse);                              \
+    const int64_t sum_squared = (int64_t)sum_scaled * sum_scaled;             \
+    return *sse - (unsigned int)(sum_squared / (w * h));                      \
+  }
+
+// Same as HIGHBD_OBMC_VARIANCE_WXH_NEON, except that the 64-bit totals come
+// from highbd_obmc_variance_xlarge_neon_<w>xh() instead of
+// highbd_obmc_variance_neon_<w>xh(). The generated function name and
+// signature are identical.
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth)                  \
+  unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
+      const int32_t *mask, unsigned int *sse) {                               \
+    uint64_t sse_wide;                                                        \
+    int64_t sum_wide;                                                         \
+    int sum_narrow;                                                           \
+    highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h,  \
+                                             &sse_wide, &sum_wide);           \
+    highbd_##bitdepth##_obmc_variance_cast(sum_wide, sse_wide, &sum_narrow,   \
+                                           sse);                              \
+    const int64_t sum_squared = (int64_t)sum_narrow * sum_narrow;             \
+    return *sse - (unsigned int)(sum_squared / (w * h));                      \
+  }
+
+// 8-bit: defines aom_highbd_8_obmc_variance<W>x<H>_neon for each size below.
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit: defines aom_highbd_10_obmc_variance<W>x<H>_neon for each size below.
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit: sizes of 2048 pixels or more (32x64, 64x32 and up) use _XLARGE_.
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..6149c9f13e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/quantize.h"
+
+// Returns the sum of the four 32-bit lanes of `a`, truncated to 32 bits.
+// AArch64 has a single across-vector add; the 32-bit Arm path widens to two
+// 64-bit pair sums first and adds those.
+static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+  const uint32_t total = vaddvq_u32(a);
+  return total;
+#else
+  const uint64x2_t pair_sums = vpaddlq_u32(a);
+  const uint64_t total =
+      vgetq_lane_u64(pair_sums, 0) + vgetq_lane_u64(pair_sums, 1);
+  return (uint32_t)total;
+#endif
+}
+
+// Quantizes four consecutive coefficients.
+//
+// Reads 4 values from coeff_ptr, writes 4 quantized values to qcoeff_ptr and
+// 4 dequantized values to dqcoeff_ptr. The v_*_s32 arguments carry the
+// per-lane quantizer constants; log_scale is the extra left shift applied
+// before the quant_shift multiply and the right shift applied to the
+// dequantized value.
+//
+// Returns a narrowed per-lane mask that is all-ones where the quantized
+// magnitude is greater than zero (used by the callers to locate the eob).
+static INLINE uint16x4_t
+quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+           int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32,
+           int32x4_t v_quant_shift_s32, int log_scale) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t shift_up = vdupq_n_s32(log_scale);
+  // vshlq_s32 shifts right when the per-lane count is negative.
+  const int32x4_t shift_down = vnegq_s32(shift_up);
+  const int32x4_t qm_weight = vdupq_n_s32((1 << AOM_QM_BITS));
+
+  const int32x4_t coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t magnitude = vabsq_s32(coeff);
+  // All-ones in lanes holding a negative coefficient, zero elsewhere.
+  const int32x4_t sign = vreinterpretq_s32_u32(vcltq_s32(coeff, zero));
+  // Lanes whose magnitude is below zbin are forced to zero further down.
+  const int32x4_t keep =
+      vreinterpretq_s32_u32(vcgeq_s32(magnitude, v_zbin_s32));
+
+  // tmp = abs_coeff + round
+  const int32x4_t rounded = vaddq_s32(magnitude, v_round_s32);
+  // tmpw32 = tmp * wt, with the flat weight 1 << AOM_QM_BITS
+  const int32x4_t weighted = vmulq_s32(rounded, qm_weight);
+  // tmp2 = (tmpw32 * quant) >> 16; v_quant_s32 arrives pre-shifted by the
+  // caller so the doubling high-half multiply lands on that result.
+  const int32x4_t scaled = vqdmulhq_s32(weighted, v_quant_s32);
+  // tmp3 = (((tmp2 + tmpw32) << log_scale) * quant_shift) high half
+  const int32x4_t pre_shift =
+      vshlq_s32(vaddq_s32(scaled, weighted), shift_up);
+  const int32x4_t shifted = vqdmulhq_s32(pre_shift, v_quant_shift_s32);
+  // abs_qcoeff = keep ? tmp3 >> AOM_QM_BITS : 0
+  const int32x4_t abs_q = vandq_s32(keep, vshrq_n_s32(shifted, AOM_QM_BITS));
+  // abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale
+  const int32x4_t abs_dq =
+      vshlq_s32(vmulq_s32(abs_q, v_dequant_s32), shift_down);
+
+  // Re-apply the sign: (x ^ sign) - sign negates x in the all-ones lanes.
+  const int32x4_t q = vsubq_s32(veorq_s32(abs_q, sign), sign);
+  const int32x4_t dq = vsubq_s32(veorq_s32(abs_dq, sign), sign);
+
+  vst1q_s32(qcoeff_ptr, q);
+  vst1q_s32(dqcoeff_ptr, dq);
+
+  // Non-zero mask, narrowed to 16 bits per lane.
+  return vmovn_u32(vcgtq_s32(abs_q, zero));
+}
+
+// Updates the running per-lane eob maximum with 8 more positions.
+// For every lane selected by v_mask the candidate is iscan[lane] + 1; for
+// every other lane it is 0. Returns the lane-wise maximum of v_eobmax and the
+// candidates.
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t pos_plus1 = vaddq_s16(vld1q_s16(iscan), vdupq_n_s16(1));
+  const int16x8_t candidates = vbslq_s16(v_mask, pos_plus1, vdupq_n_s16(0));
+  return vmaxq_s16(v_eobmax, candidates);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Updates the running per-lane maximum (and, when SKIP_EOB_FACTOR_ADJUST is
+// non-zero, minimum) scan position over the lanes selected by v_mask.
+// Unselected lanes contribute -1 to the maximum and (int16_t)n_coeffs to the
+// minimum, so they never win. Unlike get_max_lane_eob(), positions are used
+// as-is, without the +1.
+static INLINE void get_min_max_lane_eob(const int16_t *iscan,
+                                        int16x8_t *v_eobmin,
+                                        int16x8_t *v_eobmax, uint16x8_t v_mask,
+                                        intptr_t n_coeffs) {
+  const int16x8_t positions = vld1q_s16(iscan);
+  const int16x8_t max_candidates =
+      vbslq_s16(v_mask, positions, vdupq_n_s16(-1));
+#if SKIP_EOB_FACTOR_ADJUST
+  const int16x8_t min_candidates =
+      vbslq_s16(v_mask, positions, vdupq_n_s16((int16_t)n_coeffs));
+  *v_eobmin = vminq_s16(*v_eobmin, min_candidates);
+#else
+  (void)v_eobmin;
+#endif
+  *v_eobmax = vmaxq_s16(*v_eobmax, max_candidates);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Reduces the 8 signed lanes of v_eobmax to their maximum and returns it
+// converted to uint16_t (so a maximum of -1 comes back as 65535).
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+  const int16_t widest = vmaxvq_s16(v_eobmax);
+  return (uint16_t)widest;
+#else
+  // 8 lanes -> 4 lanes, then two pairwise-max steps leave the overall
+  // maximum in every lane.
+  const int16x4_t max_of_halves =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int16x4_t max_of_pairs = vpmax_s16(max_of_halves, max_of_halves);
+  const int16x4_t max_of_all = vpmax_s16(max_of_pairs, max_of_pairs);
+  return (uint16_t)vget_lane_s16(max_of_all, 0);
+#endif
+}
+
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+// Reduces the 8 signed lanes of v_eobmin to their minimum and returns it
+// converted to uint16_t.
+static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
+#if AOM_ARCH_AARCH64
+  const int16_t smallest = vminvq_s16(v_eobmin);
+  return (uint16_t)smallest;
+#else
+  // 8 lanes -> 4 lanes, then two pairwise-min steps leave the overall
+  // minimum in every lane.
+  const int16x4_t min_of_halves =
+      vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin));
+  const int16x4_t min_of_pairs = vpmin_s16(min_of_halves, min_of_halves);
+  const int16x4_t min_of_all = vpmin_s16(min_of_pairs, min_of_pairs);
+  return (uint16_t)vget_lane_s16(min_of_all, 0);
+#endif
+}
+#endif  // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+
+// Shared body of the aom_highbd_quantize_b*_neon() entry points.
+//
+// Quantizes n_coeffs coefficients from coeff_ptr into qcoeff_ptr/dqcoeff_ptr
+// and stores the end-of-block position in *eob_ptr. The first lane of each
+// 4-entry constant load is applied to coefficient 0 (DC); the second lane is
+// broadcast and applied to every other coefficient (AC). log_scale selects
+// the extra scaling (0, 1 or 2 from the callers in this file). scan is unused.
+//
+// n_coeffs must be greater than 8 (asserted); coefficients are processed in
+// groups of 8.
+static void highbd_quantize_b_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const int log_scale) {
+  (void)scan;
+  assert(n_coeffs > 8);
+
+  // round and zbin are used unchanged when log_scale is 0; otherwise they are
+  // multiplied by 2^(15 - log_scale) with the rounding, doubling high-half
+  // multiply, i.e. divided by 2^log_scale with rounding.
+  const uint16x4_t use_scaled =
+      vcgt_s16(vdup_n_s16(log_scale), vdup_n_s16(0));
+  const int16_t scale_q15 = (int16_t)(1 << (15 - log_scale));
+  const int16x4_t round_raw = vld1_s16(round_ptr);
+  const int16x4_t zbin_raw = vld1_s16(zbin_ptr);
+  const int16x4_t round_s16 =
+      vbsl_s16(use_scaled, vqrdmulh_n_s16(round_raw, scale_q15), round_raw);
+  const int16x4_t zbin_s16 =
+      vbsl_s16(use_scaled, vqrdmulh_n_s16(zbin_raw, scale_q15), zbin_raw);
+
+  // Constants for the first group of four (DC in lane 0), widened to 32 bits.
+  // quant and quant_shift are pre-shifted left by 15 for quantize_4().
+  const int32x4_t dc_round = vmovl_s16(round_s16);
+  const int32x4_t dc_zbin = vmovl_s16(zbin_s16);
+  const int32x4_t dc_dequant = vmovl_s16(vld1_s16(dequant_ptr));
+  const int32x4_t dc_quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+  const int32x4_t dc_quant_shift =
+      vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+
+  // AC constants: lane 1 of each vector broadcast to all four lanes.
+  const int32x4_t ac_round = vdupq_lane_s32(vget_low_s32(dc_round), 1);
+  const int32x4_t ac_zbin = vdupq_lane_s32(vget_low_s32(dc_zbin), 1);
+  const int32x4_t ac_dequant = vdupq_lane_s32(vget_low_s32(dc_dequant), 1);
+  const int32x4_t ac_quant = vdupq_lane_s32(vget_low_s32(dc_quant), 1);
+  const int32x4_t ac_quant_shift =
+      vdupq_lane_s32(vget_low_s32(dc_quant_shift), 1);
+
+  // Pre-scan: walk backwards in groups of 8 and drop every trailing group in
+  // which no magnitude reaches the AC zbin.
+  intptr_t non_zero_count = n_coeffs;
+  do {
+    const tran_low_t *group = coeff_ptr + non_zero_count - 8;
+    const int32x4_t abs_hi = vabsq_s32(vld1q_s32(group + 4));
+    const int32x4_t abs_lo = vabsq_s32(vld1q_s32(group));
+    const uint32x4_t hit_hi = vcgeq_s32(abs_hi, ac_zbin);
+    const uint32x4_t hit_lo = vcgeq_s32(abs_lo, ac_zbin);
+    if (sum_abs_coeff(hit_hi) + sum_abs_coeff(hit_lo) != 0) break;
+    non_zero_count -= 8;
+  } while (non_zero_count > 0);
+
+  // The dropped tail is zero in both outputs.
+  const intptr_t zero_tail = n_coeffs - non_zero_count;
+  memset(qcoeff_ptr + non_zero_count, 0, zero_tail * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr + non_zero_count, 0, zero_tail * sizeof(*dqcoeff_ptr));
+
+  // First group of 8 is always quantized: DC + 3 AC, then 4 more AC.
+  uint16x4_t nz_lo =
+      quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, dc_quant, dc_dequant,
+                 dc_round, dc_zbin, dc_quant_shift, log_scale);
+  uint16x4_t nz_hi =
+      quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, ac_quant,
+                 ac_dequant, ac_round, ac_zbin, ac_quant_shift, log_scale);
+  int16x8_t eob_lanes = get_max_lane_eob(iscan, vdupq_n_s16(-1),
+                                         vcombine_u16(nz_lo, nz_hi));
+
+  // Remaining groups of 8, all with AC constants.
+  for (intptr_t offset = 8; offset < non_zero_count; offset += 8) {
+    nz_lo = quantize_4(coeff_ptr + offset, qcoeff_ptr + offset,
+                       dqcoeff_ptr + offset, ac_quant, ac_dequant, ac_round,
+                       ac_zbin, ac_quant_shift, log_scale);
+    nz_hi = quantize_4(coeff_ptr + offset + 4, qcoeff_ptr + offset + 4,
+                       dqcoeff_ptr + offset + 4, ac_quant, ac_dequant,
+                       ac_round, ac_zbin, ac_quant_shift, log_scale);
+    eob_lanes = get_max_lane_eob(iscan + offset, eob_lanes,
+                                 vcombine_u16(nz_lo, nz_hi));
+  }
+
+  *eob_ptr = get_max_eob(eob_lanes);
+}
+
+// Public entry point: forwards every argument unchanged to
+// highbd_quantize_b_neon() with log_scale fixed at 0.
+void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 0;
+
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, log_scale);
+}
+
+// Public 32x32 entry point: forwards every argument unchanged to
+// highbd_quantize_b_neon() with log_scale fixed at 1.
+void aom_highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, log_scale);
+}
+
+// Public 64x64 entry point: forwards every argument unchanged to
+// highbd_quantize_b_neon() with log_scale fixed at 2.
+void aom_highbd_quantize_b_64x64_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+
+  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                         eob_ptr, scan, iscan, log_scale);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Shared body of the aom_highbd_quantize_b*_adaptive_neon() entry points.
+//
+// Quantizes n_coeffs coefficients from coeff_ptr into qcoeff_ptr/dqcoeff_ptr
+// and stores the end-of-block count in *eob_ptr. It differs from
+// highbd_quantize_b_neon() in two ways:
+//  - the pre-scan threshold is the AC zbin plus an EOB_FACTOR-derived margin;
+//  - when SKIP_EOB_FACTOR_ADJUST is non-zero and exactly one coefficient
+//    quantized to +/-1, that coefficient is re-tested against a wider
+//    threshold and may be zeroed, which empties the block.
+//
+// coeff_ptr, qcoeff_ptr, dqcoeff_ptr and iscan are never advanced: each group
+// of 8 is addressed by offset, because the final adjustment indexes the
+// buffers with rc = scan[eob], an absolute coefficient position.
+//
+// n_coeffs must be greater than 8 (asserted). scan is only read when
+// SKIP_EOB_FACTOR_ADJUST is non-zero.
+static void highbd_quantize_b_adaptive_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const int log_scale) {
+  (void)scan;
+  const int16x4_t v_quant = vld1_s16(quant_ptr);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_zero = vdup_n_s16(0);
+  // round and zbin are only rescaled when log_scale is greater than zero.
+  const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+  const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+  const int16x4_t v_round_log_scale =
+      vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t v_round =
+      vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+  const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+  const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+  const int16x4_t v_zbin_log_scale =
+      vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t v_zbin =
+      vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+  int32x4_t v_round_s32 = vmovl_s16(v_round);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+  int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+  int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+  int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
+
+  assert(n_coeffs > 8);
+  // Pre-scan pass: walk backwards in groups of 8 and drop every trailing
+  // group in which no magnitude reaches the AC zbin plus the prescan margin.
+  const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+  const int prescan_add_1 =
+      ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS);
+  const int32x4_t v_zbin_prescan =
+      vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1));
+  intptr_t non_zero_count = n_coeffs;
+  intptr_t i = n_coeffs;
+  do {
+    const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+    const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+    const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+    const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+    const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan);
+    const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan);
+    // If every coefficient of the group is below the threshold, discard it.
+    if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+      non_zero_count -= 8;
+    } else {
+      break;
+    }
+    i -= 8;
+  } while (i > 0);
+
+  // The discarded tail is zero in both outputs.
+  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+  memset(qcoeff_ptr + non_zero_count, 0,
+         remaining_zcoeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr + non_zero_count, 0,
+         remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+  // DC and first 3 AC
+  v_mask_lo =
+      quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+                 v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+  v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+  v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+  // 4 more AC
+  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                         v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+                         v_quant_shift_s32, log_scale);
+
+  get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
+                       vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+
+  // Remaining groups of 8. Address them by offset so that the base pointers
+  // stay valid for the rc-indexed adjustment below.
+  for (intptr_t offset = 8; offset < non_zero_count; offset += 8) {
+    v_mask_lo = quantize_4(coeff_ptr + offset, qcoeff_ptr + offset,
+                           dqcoeff_ptr + offset, v_quant_s32, v_dequant_s32,
+                           v_round_s32, v_zbin_s32, v_quant_shift_s32,
+                           log_scale);
+    v_mask_hi = quantize_4(coeff_ptr + offset + 4, qcoeff_ptr + offset + 4,
+                           dqcoeff_ptr + offset + 4, v_quant_s32,
+                           v_dequant_s32, v_round_s32, v_zbin_s32,
+                           v_quant_shift_s32, log_scale);
+
+    get_min_max_lane_eob(iscan + offset, &v_eobmin, &v_eobmax,
+                         vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+  }
+
+  // get_max_eob() hands the signed lane maximum back as uint16_t; convert it
+  // to signed again so that "no non-zero coefficient" is -1 rather than 65535.
+  int eob = (int16_t)get_max_eob(v_eobmax);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  const int first = get_min_eob(v_eobmin);
+  // first == eob means exactly one coefficient quantized to non-zero.
+  if (eob >= 0 && first == eob) {
+    // rc is an absolute position within the block.
+    const int rc = scan[eob];
+    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+      const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                             ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+      const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+      const qm_val_t wt = (1 << AOM_QM_BITS);
+      const int coeff = coeff_ptr[rc] * wt;
+      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+      const int prescan_add_val =
+          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+        eob = -1;
+      }
+    }
+  }
+#endif  // SKIP_EOB_FACTOR_ADJUST
+  *eob_ptr = eob + 1;
+}
+
+// Public adaptive entry point: forwards every argument unchanged to
+// highbd_quantize_b_adaptive_neon() with log_scale fixed at 0.
+void aom_highbd_quantize_b_adaptive_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 0;
+
+  highbd_quantize_b_adaptive_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                  quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                  dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                  iscan, log_scale);
+}
+
+// Public adaptive 32x32 entry point: forwards every argument unchanged to
+// highbd_quantize_b_adaptive_neon() with log_scale fixed at 1.
+void aom_highbd_quantize_b_32x32_adaptive_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+
+  highbd_quantize_b_adaptive_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                  quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                  dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                  iscan, log_scale);
+}
+
+// Public adaptive 64x64 entry point: forwards every argument unchanged to
+// highbd_quantize_b_adaptive_neon() with log_scale fixed at 2.
+void aom_highbd_quantize_b_64x64_adaptive_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+
+  highbd_quantize_b_adaptive_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                  quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                  dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                  iscan, log_scale);
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..d51f639de6
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Sum of absolute differences between a 4-wide, h-row source block and
+// reference block of 16-bit samples. Each row's four absolute differences are
+// widened and accumulated into 32-bit lanes, which are reduced at the end.
+// h must be at least 1 (the loop body runs before the counter is tested).
+static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t acc = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    acc = vabal_u16(acc, vld1_u16(src_row), vld1_u16(ref_row));
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// Sum of absolute differences between an 8-wide, h-row source block and
+// reference block of 16-bit samples, accumulated directly in 16-bit lanes.
+// NOTE(review): the 16-bit accumulator presumably relies on callers keeping h
+// small enough that a lane cannot overflow; the instantiations in this file
+// pass at most 16 rows. Confirm before using with taller blocks.
+// h must be at least 1.
+static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16x8_t acc = vdupq_n_u16(0);
+  int rows_left = h;
+
+  do {
+    acc = vabaq_u16(acc, vld1q_u16(src_row), vld1q_u16(ref_row));
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Sum of absolute differences between an 8-wide, h-row source block and
+// reference block of 16-bit samples. Unlike the _small_ variant, each row's
+// absolute differences are pairwise-added into 32-bit lanes.
+// h must be at least 1.
+static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t acc = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    const uint16x8_t abs_diff =
+        vabdq_u16(vld1q_u16(src_row), vld1q_u16(ref_row));
+    acc = vpadalq_u16(acc, abs_diff);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Sum of absolute differences between a 16-wide, h-row source block and
+// reference block of 16-bit samples. The two 8-sample halves of each row go
+// into separate 32-bit accumulators that are added together at the end.
+// h must be at least 1.
+static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
+                                                 int src_stride,
+                                                 const uint8_t *ref_ptr,
+                                                 int ref_stride, int h) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t acc_lo = vdupq_n_u32(0);
+  uint32x4_t acc_hi = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    const uint16x8_t abs_lo =
+        vabdq_u16(vld1q_u16(src_row), vld1q_u16(ref_row));
+    acc_lo = vpadalq_u16(acc_lo, abs_lo);
+
+    const uint16x8_t abs_hi =
+        vabdq_u16(vld1q_u16(src_row + 8), vld1q_u16(ref_row + 8));
+    acc_hi = vpadalq_u16(acc_hi, abs_hi);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
+}
+
+// Sum of absolute differences between a w-wide, h-row source block and
+// reference block of 16-bit samples. Rows are consumed 32 samples at a time
+// (four 8-sample vectors, each with its own 32-bit accumulator), so the
+// column loop always processes at least 32 samples and steps in multiples of
+// 32; the callers in this file pass w = 32, 64 or 128. h must be at least 1.
+static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int w, int h) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  int rows_left = h;
+
+  do {
+    int col = 0;
+    do {
+      // Four consecutive 8-sample vectors, one accumulator each.
+      for (int k = 0; k < 4; ++k) {
+        const uint16x8_t s = vld1q_u16(src_row + col + 8 * k);
+        const uint16x8_t r = vld1q_u16(ref_row + col + 8 * k);
+        acc[k] = vpadalq_u16(acc[k], vabdq_u16(s, r));
+      }
+
+      col += 32;
+    } while (col < w);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    rows_left--;
+  } while (rows_left != 0);
+
+  const uint32x4_t acc01 = vaddq_u32(acc[0], acc[1]);
+  const uint32x4_t acc23 = vaddq_u32(acc[2], acc[3]);
+
+  return horizontal_add_u32x4(vaddq_u32(acc01, acc23));
+}
+
+// 128-wide SAD: highbd_sadwxh_large_neon() with the width fixed at 128.
+static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
+                                                      int src_stride,
+                                                      const uint8_t *ref_ptr,
+                                                      int ref_stride, int h) {
+  const int width = 128;
+
+  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
+                                  width, h);
+}
+
+// 64-wide SAD: highbd_sadwxh_large_neon() with the width fixed at 64.
+static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint8_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  const int width = 64;
+
+  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
+                                  width, h);
+}
+
+// 32-wide SAD: highbd_sadwxh_large_neon() with the width fixed at 32.
+static INLINE unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint8_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  const int width = 32;
+
+  return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride,
+                                  width, h);
+}
+
+// Defines aom_highbd_sad<bw>x<bh>_neon() as a call to
+// highbd_sad<bw>xh_small_neon() with the row count fixed at bh.
+#define HBD_SAD_WXH_SMALL_NEON(bw, bh)                                      \
+  unsigned int aom_highbd_sad##bw##x##bh##_neon(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return highbd_sad##bw##xh_small_neon(src, src_stride, ref, ref_stride,  \
+                                         (bh));                             \
+  }
+
+// Defines aom_highbd_sad<bw>x<bh>_neon() as a call to
+// highbd_sad<bw>xh_large_neon() with the row count fixed at bh.
+#define HBD_SAD_WXH_LARGE_NEON(bw, bh)                                      \
+  unsigned int aom_highbd_sad##bw##x##bh##_neon(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return highbd_sad##bw##xh_large_neon(src, src_stride, ref, ref_stride,  \
+                                         (bh));                             \
+  }
+
+// Each line below defines one aom_highbd_sad<W>x<H>_neon() entry point on top
+// of the matching highbd_sad<W>xh_small_neon / highbd_sad<W>xh_large_neon
+// kernel.
+HBD_SAD_WXH_SMALL_NEON(4, 4)
+HBD_SAD_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_SMALL_NEON(8, 4)
+HBD_SAD_WXH_SMALL_NEON(8, 8)
+HBD_SAD_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_LARGE_NEON(16, 8)
+HBD_SAD_WXH_LARGE_NEON(16, 16)
+HBD_SAD_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_LARGE_NEON(32, 16)
+HBD_SAD_WXH_LARGE_NEON(32, 32)
+HBD_SAD_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_LARGE_NEON(64, 32)
+HBD_SAD_WXH_LARGE_NEON(64, 64)
+HBD_SAD_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_LARGE_NEON(128, 64)
+HBD_SAD_WXH_LARGE_NEON(128, 128)
+
+// These sizes are only compiled when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_LARGE_NEON(16, 4)
+HBD_SAD_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Defines aom_highbd_sad_skip_<bw>x<bh>_neon(): runs
+// highbd_sad<bw>xh_small_neon() over every second row (both strides doubled,
+// bh / 2 rows) and returns twice the result.
+#define HBD_SAD_SKIP_WXH_SMALL_NEON(bw, bh)                                 \
+  unsigned int aom_highbd_sad_skip_##bw##x##bh##_neon(                      \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return 2 * highbd_sad##bw##xh_small_neon(src, 2 * src_stride, ref,      \
+                                             2 * ref_stride, (bh) / 2);     \
+  }
+
+// Defines aom_highbd_sad_skip_<bw>x<bh>_neon(): runs
+// highbd_sad<bw>xh_large_neon() over every second row (both strides doubled,
+// bh / 2 rows) and returns twice the result.
+#define HBD_SAD_SKIP_WXH_LARGE_NEON(bw, bh)                                 \
+  unsigned int aom_highbd_sad_skip_##bw##x##bh##_neon(                      \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride) {                                                     \
+    return 2 * highbd_sad##bw##xh_large_neon(src, 2 * src_stride, ref,      \
+                                             2 * ref_stride, (bh) / 2);     \
+  }
+
+// Each line below defines one aom_highbd_sad_skip_<W>x<H>_neon() entry point.
+// Because the skip variants only visit H / 2 rows, 8x32 can use the _small_
+// 8-wide kernel here (16 rows).
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)
+
+// These sizes are only compiled when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 4-wide compound SAD: each reference row is first averaged (rounding
+// halving add) with the matching row of second_pred, then compared with the
+// source row. second_pred is read contiguously, 4 samples per row.
+// h must be at least 1.
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t acc = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    const uint16x4_t blended =
+        vrhadd_u16(vld1_u16(ref_row), vld1_u16(pred_row));
+    acc = vabal_u16(acc, vld1_u16(src_row), blended);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    pred_row += 4;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// 8-wide compound SAD: each reference row is first averaged (rounding
+// halving add) with the matching row of second_pred, then compared with the
+// source row; the absolute differences are pairwise-added into 32-bit lanes.
+// second_pred is read contiguously, 8 samples per row. h must be at least 1.
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t acc = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    const uint16x8_t blended =
+        vrhaddq_u16(vld1q_u16(ref_row), vld1q_u16(pred_row));
+    const uint16x8_t abs_diff = vabdq_u16(vld1q_u16(src_row), blended);
+    acc = vpadalq_u16(acc, abs_diff);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    pred_row += 8;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// 16-wide compound SAD: each reference row is first averaged (rounding
+// halving add) with the matching row of second_pred, then compared with the
+// source row. The two 8-sample halves of a row use separate 32-bit
+// accumulators that are added together at the end. second_pred is read
+// contiguously, 16 samples per row. h must be at least 1.
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t acc_lo = vdupq_n_u32(0);
+  uint32x4_t acc_hi = vdupq_n_u32(0);
+  int rows_left = h;
+
+  do {
+    // Samples 0..7 of the row.
+    const uint16x8_t blended_lo =
+        vrhaddq_u16(vld1q_u16(ref_row), vld1q_u16(pred_row));
+    const uint16x8_t abs_lo = vabdq_u16(vld1q_u16(src_row), blended_lo);
+    acc_lo = vpadalq_u16(acc_lo, abs_lo);
+
+    // Samples 8..15 of the row.
+    const uint16x8_t blended_hi =
+        vrhaddq_u16(vld1q_u16(ref_row + 8), vld1q_u16(pred_row + 8));
+    const uint16x8_t abs_hi = vabdq_u16(vld1q_u16(src_row + 8), blended_hi);
+    acc_hi = vpadalq_u16(acc_hi, abs_hi);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    pred_row += 16;
+    rows_left--;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
+}
+
+// SAD of a w-wide, h-tall high-bitdepth source block against the rounded
+// average of the reference block and second_pred. second_pred is read packed
+// (it advances w samples per row). Every inner pass consumes 32 samples split
+// over four independent accumulators, so the columns actually read are w
+// rounded up to a multiple of 32. Both loops are test-at-bottom: at least one
+// row and one 32-sample group are always processed.
+static INLINE uint32_t highbd_sadwxh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int w, int h, const uint8_t *second_pred) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  int rows_left = h;
+
+  do {
+    int col = 0;
+    do {
+      const uint16_t *const s = src_row + col;
+      const uint16_t *const r = ref_row + col;
+      const uint16_t *const p = pred_row + col;
+
+      const uint16x8_t comp0 = vrhaddq_u16(vld1q_u16(r), vld1q_u16(p));
+      acc[0] = vpadalq_u16(acc[0], vabdq_u16(vld1q_u16(s), comp0));
+
+      const uint16x8_t comp1 =
+          vrhaddq_u16(vld1q_u16(r + 8), vld1q_u16(p + 8));
+      acc[1] = vpadalq_u16(acc[1], vabdq_u16(vld1q_u16(s + 8), comp1));
+
+      const uint16x8_t comp2 =
+          vrhaddq_u16(vld1q_u16(r + 16), vld1q_u16(p + 16));
+      acc[2] = vpadalq_u16(acc[2], vabdq_u16(vld1q_u16(s + 16), comp2));
+
+      const uint16x8_t comp3 =
+          vrhaddq_u16(vld1q_u16(r + 24), vld1q_u16(p + 24));
+      acc[3] = vpadalq_u16(acc[3], vabdq_u16(vld1q_u16(s + 24), comp3));
+
+      col += 32;
+    } while (col < w);
+
+    src_row += src_stride;
+    ref_row += ref_stride;
+    pred_row += w;
+    rows_left--;
+  } while (rows_left != 0);
+
+  // Merge the four partial sums pairwise before the final lane reduction.
+  const uint32x4_t acc01 = vaddq_u32(acc[0], acc[1]);
+  const uint32x4_t acc23 = vaddq_u32(acc[2], acc[3]);
+  return horizontal_add_u32x4(vaddq_u32(acc01, acc23));
+}
+
+// 128-wide averaging SAD: runs the generic width kernel with w fixed at 128.
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int width = 128;
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, width,
+                                h, second_pred);
+}
+
+// 64-wide averaging SAD: runs the generic width kernel with w fixed at 64.
+static INLINE unsigned int highbd_sad64xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int width = 64;
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, width,
+                                h, second_pred);
+}
+
+// 32-wide averaging SAD: runs the generic width kernel with w fixed at 32.
+static INLINE unsigned int highbd_sad32xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int width = 32;
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, width,
+                                h, second_pred);
+}
+
+// Defines the exported aom_highbd_sad<w>x<h>_avg_neon() entry point. It
+// forwards to the width-specific kernel highbd_sad<w>xh_avg_neon() with the
+// block height h passed as a constant.
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+  uint32_t aom_highbd_sad##w##x##h##_avg_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) { \
+    const uint32_t sad = highbd_sad##w##xh_avg_neon( \
+        src, src_stride, ref, ref_stride, (h), second_pred); \
+    return sad; \
+  }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)  // 4-wide: expands to calls of highbd_sad4xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)  // 8-wide: highbd_sad8xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)  // 16-wide: highbd_sad16xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)  // 32-wide: highbd_sad32xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)  // 64-wide: highbd_sad64xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)  // 128-wide: highbd_sad128xh_avg_neon
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY  // sizes below exist only when this flag is 0
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000000..85ca6732a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Four-reference SAD for 4-wide, h-tall high-bitdepth blocks. The source block
+// is compared with each of ref_ptr[0..3] (all sharing ref_stride) and the four
+// totals are stored to res[0..3]. The loop is test-at-bottom, so one row is
+// processed even if h is not positive.
+static INLINE void highbd_sad4xhx4d_small_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *const ref16_d = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x4_t s = vld1_u16(src16 + row * src_stride);
+
+    // Widening absolute-difference accumulate, one accumulator per reference.
+    acc[0] = vabal_u16(acc[0], s, vld1_u16(ref16_a + ref_off));
+    acc[1] = vabal_u16(acc[1], s, vld1_u16(ref16_b + ref_off));
+    acc[2] = vabal_u16(acc[2], s, vld1_u16(ref16_c + ref_off));
+    acc[3] = vabal_u16(acc[3], s, vld1_u16(ref16_d + ref_off));
+
+    row++;
+  } while (row < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc));
+}
+
+// Four-reference SAD for 8-wide, h-tall high-bitdepth blocks, accumulating in
+// 16-bit lanes and widening to 32 bits only once after the loop. Each lane
+// receives one absolute difference per row, so h times the largest
+// per-sample difference has to fit in 16 bits.
+// NOTE(review): presumably this is why the kernel is only used for short
+// blocks -- confirm against the instantiations and the supported bit depths.
+static INLINE void highbd_sad8xhx4d_small_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *const ref16_d = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint16x8_t acc16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                          vdupq_n_u16(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x8_t s = vld1q_u16(src16 + row * src_stride);
+
+    acc16[0] = vabaq_u16(acc16[0], s, vld1q_u16(ref16_a + ref_off));
+    acc16[1] = vabaq_u16(acc16[1], s, vld1q_u16(ref16_b + ref_off));
+    acc16[2] = vabaq_u16(acc16[2], s, vld1q_u16(ref16_c + ref_off));
+    acc16[3] = vabaq_u16(acc16[3], s, vld1q_u16(ref16_d + ref_off));
+
+    row++;
+  } while (row < h);
+
+  // Pairwise-widen each 16-bit accumulator to 32 bits before reducing.
+  uint32x4_t acc32[4] = { vpaddlq_u16(acc16[0]), vpaddlq_u16(acc16[1]),
+                          vpaddlq_u16(acc16[2]), vpaddlq_u16(acc16[3]) };
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc32));
+}
+
+// Adds the absolute differences of eight 16-bit sample pairs to *sad_sum,
+// pairwise-widening them into its four 32-bit lanes.
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+                             uint32x4_t *const sad_sum) {
+  *sad_sum = vpadalq_u16(*sad_sum, vabdq_u16(src, ref));
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Four-reference SAD for 8-wide, h-tall high-bitdepth blocks using 32-bit
+// accumulators (via sad8_neon) for every row. Totals for ref_ptr[0..3] are
+// stored to res[0..3]. At least one row is always processed.
+static INLINE void highbd_sad8xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *const ref16_d = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x8_t s = vld1q_u16(src16 + row * src_stride);
+
+    sad8_neon(s, vld1q_u16(ref16_a + ref_off), &acc[0]);
+    sad8_neon(s, vld1q_u16(ref16_b + ref_off), &acc[1]);
+    sad8_neon(s, vld1q_u16(ref16_c + ref_off), &acc[2]);
+    sad8_neon(s, vld1q_u16(ref16_d + ref_off), &acc[3]);
+
+    row++;
+  } while (row < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc));
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Four-reference SAD for 16-wide, h-tall high-bitdepth blocks. The low and
+// high 8-sample halves of each row feed separate 32-bit accumulators that are
+// summed after the loop; totals for ref_ptr[0..3] are stored to res[0..3].
+static INLINE void highbd_sad16xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *const ref16_d = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t acc_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t acc_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const uint16_t *const s = src16 + row * src_stride;
+    const int ref_off = row * ref_stride;
+
+    // Samples 0..7 of the row.
+    const uint16x8_t s_lo = vld1q_u16(s);
+    sad8_neon(s_lo, vld1q_u16(ref16_a + ref_off), &acc_lo[0]);
+    sad8_neon(s_lo, vld1q_u16(ref16_b + ref_off), &acc_lo[1]);
+    sad8_neon(s_lo, vld1q_u16(ref16_c + ref_off), &acc_lo[2]);
+    sad8_neon(s_lo, vld1q_u16(ref16_d + ref_off), &acc_lo[3]);
+
+    // Samples 8..15 of the row.
+    const uint16x8_t s_hi = vld1q_u16(s + 8);
+    sad8_neon(s_hi, vld1q_u16(ref16_a + ref_off + 8), &acc_hi[0]);
+    sad8_neon(s_hi, vld1q_u16(ref16_b + ref_off + 8), &acc_hi[1]);
+    sad8_neon(s_hi, vld1q_u16(ref16_c + ref_off + 8), &acc_hi[2]);
+    sad8_neon(s_hi, vld1q_u16(ref16_d + ref_off + 8), &acc_hi[3]);
+
+    row++;
+  } while (row < h);
+
+  uint32x4_t total[4] = { vaddq_u32(acc_lo[0], acc_hi[0]),
+                          vaddq_u32(acc_lo[1], acc_hi[1]),
+                          vaddq_u32(acc_lo[2], acc_hi[2]),
+                          vaddq_u32(acc_lo[3], acc_hi[3]) };
+  vst1q_u32(res, horizontal_add_4d_u32x4(total));
+}
+
+// Four-reference SAD for w-wide, h-tall high-bitdepth blocks. Each inner pass
+// handles 32 samples as four groups of 8, alternating between the "lo" and
+// "hi" accumulator sets, so the columns read are w rounded up to a multiple
+// of 32. Totals for ref_ptr[0..3] are stored to res[0..3]. Both loops are
+// test-at-bottom and therefore run at least once.
+static INLINE void highbd_sadwxhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int w, int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *const ref16_d = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t acc_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t acc_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const uint16_t *const s_row = src16 + row * src_stride;
+    const uint16_t *const a_row = ref16_a + row * ref_stride;
+    const uint16_t *const b_row = ref16_b + row * ref_stride;
+    const uint16_t *const c_row = ref16_c + row * ref_stride;
+    const uint16_t *const d_row = ref16_d + row * ref_stride;
+    int col = 0;
+
+    do {
+      const uint16_t *const s = s_row + col;
+      const uint16_t *const ra = a_row + col;
+      const uint16_t *const rb = b_row + col;
+      const uint16_t *const rc = c_row + col;
+      const uint16_t *const rd = d_row + col;
+
+      const uint16x8_t s0 = vld1q_u16(s);
+      sad8_neon(s0, vld1q_u16(ra), &acc_lo[0]);
+      sad8_neon(s0, vld1q_u16(rb), &acc_lo[1]);
+      sad8_neon(s0, vld1q_u16(rc), &acc_lo[2]);
+      sad8_neon(s0, vld1q_u16(rd), &acc_lo[3]);
+
+      const uint16x8_t s1 = vld1q_u16(s + 8);
+      sad8_neon(s1, vld1q_u16(ra + 8), &acc_hi[0]);
+      sad8_neon(s1, vld1q_u16(rb + 8), &acc_hi[1]);
+      sad8_neon(s1, vld1q_u16(rc + 8), &acc_hi[2]);
+      sad8_neon(s1, vld1q_u16(rd + 8), &acc_hi[3]);
+
+      const uint16x8_t s2 = vld1q_u16(s + 16);
+      sad8_neon(s2, vld1q_u16(ra + 16), &acc_lo[0]);
+      sad8_neon(s2, vld1q_u16(rb + 16), &acc_lo[1]);
+      sad8_neon(s2, vld1q_u16(rc + 16), &acc_lo[2]);
+      sad8_neon(s2, vld1q_u16(rd + 16), &acc_lo[3]);
+
+      const uint16x8_t s3 = vld1q_u16(s + 24);
+      sad8_neon(s3, vld1q_u16(ra + 24), &acc_hi[0]);
+      sad8_neon(s3, vld1q_u16(rb + 24), &acc_hi[1]);
+      sad8_neon(s3, vld1q_u16(rc + 24), &acc_hi[2]);
+      sad8_neon(s3, vld1q_u16(rd + 24), &acc_hi[3]);
+
+      col += 32;
+    } while (col < w);
+
+    row++;
+  } while (row < h);
+
+  uint32x4_t total[4] = { vaddq_u32(acc_lo[0], acc_hi[0]),
+                          vaddq_u32(acc_lo[1], acc_hi[1]),
+                          vaddq_u32(acc_lo[2], acc_hi[2]),
+                          vaddq_u32(acc_lo[3], acc_hi[3]) };
+  vst1q_u32(res, horizontal_add_4d_u32x4(total));
+}
+
+// 128-wide four-reference SAD: generic width kernel with w fixed at 128.
+static INLINE void highbd_sad128xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 128;
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// 64-wide four-reference SAD: generic width kernel with w fixed at 64.
+static INLINE void highbd_sad64xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 64;
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// 32-wide four-reference SAD: generic width kernel with w fixed at 32.
+static INLINE void highbd_sad32xhx4d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 32;
+  highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// Defines the exported aom_highbd_sad<w>x<h>x4d_neon() entry point on top of
+// the "small" kernel highbd_sad<w>xhx4d_small_neon(), with the block height h
+// passed as a constant row count.
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \
+  void aom_highbd_sad##w##x##h##x4d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    const int rows = (h); \
+    highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \
+                                    sad_array, rows); \
+  }
+
+// Defines the exported aom_highbd_sad<w>x<h>x4d_neon() entry point on top of
+// the "large" kernel highbd_sad<w>xhx4d_large_neon(), with the block height h
+// passed as a constant row count.
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \
+  void aom_highbd_sad##w##x##h##x4d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    const int rows = (h); \
+    highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \
+                                    sad_array, rows); \
+  }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)  // 4-wide: highbd_sad4xhx4d_small_neon
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)  // 8-wide, 16-bit accumulating kernel
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)  // 16-wide: highbd_sad16xhx4d_large_neon
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)  // 32-wide and up: generic width kernel
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY  // sizes below exist only when this flag is 0
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)  // 32 rows: 32-bit accumulating 8-wide kernel
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Defines aom_highbd_sad_skip_<w>x<h>x4d_neon() on top of the "small" kernel.
+// Only every other row is measured: both strides are doubled and the row
+// count is h >> 1. Each of the four totals is then shifted left by one,
+// presumably to scale the result back to the full block height.
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \
+                                    2 * ref_stride, sad_array, ((h) >> 1)); \
+    for (int k = 0; k < 4; ++k) { \
+      sad_array[k] <<= 1; \
+    } \
+  }
+
+// Defines aom_highbd_sad_skip_<w>x<h>x4d_neon() on top of the "large" kernel.
+// Only every other row is measured: both strides are doubled and the row
+// count is h >> 1. Each of the four totals is then shifted left by one,
+// presumably to scale the result back to the full block height.
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \
+  void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \
+                                    2 * ref_stride, sad_array, ((h) >> 1)); \
+    for (int k = 0; k < 4; ++k) { \
+      sad_array[k] <<= 1; \
+    } \
+  }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)  // 4-wide; kernel runs over h >> 1 rows
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)  // 8-wide, 16-bit accumulating kernel
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)  // 16-wide: highbd_sad16xhx4d_large_neon
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)  // 32-wide and up: generic width kernel
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY  // sizes below exist only when this flag is 0
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)  // small kernel sees 32 >> 1 = 16 rows
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Three-reference SAD for 4-wide, h-tall high-bitdepth blocks. Only
+// ref_ptr[0..2] are read and only res[0..2] are written; the fourth slot of
+// both arrays is left untouched. At least one row is always processed.
+static INLINE void highbd_sad4xhx3d_small_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t acc[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x4_t s = vld1_u16(src16 + row * src_stride);
+
+    // Widening absolute-difference accumulate, one accumulator per reference.
+    acc[0] = vabal_u16(acc[0], s, vld1_u16(ref16_a + ref_off));
+    acc[1] = vabal_u16(acc[1], s, vld1_u16(ref16_b + ref_off));
+    acc[2] = vabal_u16(acc[2], s, vld1_u16(ref16_c + ref_off));
+
+    row++;
+  } while (row < h);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(acc[k]);
+  }
+}
+
+// Three-reference SAD for 8-wide, h-tall high-bitdepth blocks, accumulating in
+// 16-bit lanes and widening to 32 bits only after the loop. Each lane receives
+// one absolute difference per row, so h times the largest per-sample
+// difference has to fit in 16 bits. Only res[0..2] are written.
+// NOTE(review): presumably this limits the kernel to short blocks -- confirm
+// against the instantiations and the supported bit depths.
+static INLINE void highbd_sad8xhx3d_small_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint16x8_t acc16[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x8_t s = vld1q_u16(src16 + row * src_stride);
+
+    acc16[0] = vabaq_u16(acc16[0], s, vld1q_u16(ref16_a + ref_off));
+    acc16[1] = vabaq_u16(acc16[1], s, vld1q_u16(ref16_b + ref_off));
+    acc16[2] = vabaq_u16(acc16[2], s, vld1q_u16(ref16_c + ref_off));
+
+    row++;
+  } while (row < h);
+
+  // Pairwise-widen to 32 bits, then reduce each accumulator to a scalar.
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(vpaddlq_u16(acc16[k]));
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Three-reference SAD for 8-wide, h-tall high-bitdepth blocks using 32-bit
+// accumulators (via sad8_neon) for every row. Only ref_ptr[0..2] are read and
+// only res[0..2] are written. At least one row is always processed.
+static INLINE void highbd_sad8xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t acc[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const int ref_off = row * ref_stride;
+    const uint16x8_t s = vld1q_u16(src16 + row * src_stride);
+
+    sad8_neon(s, vld1q_u16(ref16_a + ref_off), &acc[0]);
+    sad8_neon(s, vld1q_u16(ref16_b + ref_off), &acc[1]);
+    sad8_neon(s, vld1q_u16(ref16_c + ref_off), &acc[2]);
+
+    row++;
+  } while (row < h);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(acc[k]);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Three-reference SAD for 16-wide, h-tall high-bitdepth blocks. The low and
+// high 8-sample halves of each row feed separate 32-bit accumulators that are
+// summed after the loop. Only res[0..2] are written.
+static INLINE void highbd_sad16xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t acc_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t acc_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const uint16_t *const s = src16 + row * src_stride;
+    const int ref_off = row * ref_stride;
+
+    // Samples 0..7 of the row.
+    const uint16x8_t s_lo = vld1q_u16(s);
+    sad8_neon(s_lo, vld1q_u16(ref16_a + ref_off), &acc_lo[0]);
+    sad8_neon(s_lo, vld1q_u16(ref16_b + ref_off), &acc_lo[1]);
+    sad8_neon(s_lo, vld1q_u16(ref16_c + ref_off), &acc_lo[2]);
+
+    // Samples 8..15 of the row.
+    const uint16x8_t s_hi = vld1q_u16(s + 8);
+    sad8_neon(s_hi, vld1q_u16(ref16_a + ref_off + 8), &acc_hi[0]);
+    sad8_neon(s_hi, vld1q_u16(ref16_b + ref_off + 8), &acc_hi[1]);
+    sad8_neon(s_hi, vld1q_u16(ref16_c + ref_off + 8), &acc_hi[2]);
+
+    row++;
+  } while (row < h);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(vaddq_u32(acc_lo[k], acc_hi[k]));
+  }
+}
+
+// Three-reference SAD for w-wide, h-tall high-bitdepth blocks. Each inner pass
+// handles 32 samples as four groups of 8, alternating between the "lo" and
+// "hi" accumulator sets, so the columns read are w rounded up to a multiple
+// of 32. Only ref_ptr[0..2] are read and only res[0..2] are written. Both
+// loops are test-at-bottom and therefore run at least once.
+static INLINE void highbd_sadwxhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int w, int h) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *const ref16_a = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *const ref16_b = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *const ref16_c = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+  uint32x4_t acc_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t acc_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  int row = 0;
+
+  do {
+    const uint16_t *const s_row = src16 + row * src_stride;
+    const uint16_t *const a_row = ref16_a + row * ref_stride;
+    const uint16_t *const b_row = ref16_b + row * ref_stride;
+    const uint16_t *const c_row = ref16_c + row * ref_stride;
+    int col = 0;
+
+    do {
+      const uint16_t *const s = s_row + col;
+      const uint16_t *const ra = a_row + col;
+      const uint16_t *const rb = b_row + col;
+      const uint16_t *const rc = c_row + col;
+
+      const uint16x8_t s0 = vld1q_u16(s);
+      sad8_neon(s0, vld1q_u16(ra), &acc_lo[0]);
+      sad8_neon(s0, vld1q_u16(rb), &acc_lo[1]);
+      sad8_neon(s0, vld1q_u16(rc), &acc_lo[2]);
+
+      const uint16x8_t s1 = vld1q_u16(s + 8);
+      sad8_neon(s1, vld1q_u16(ra + 8), &acc_hi[0]);
+      sad8_neon(s1, vld1q_u16(rb + 8), &acc_hi[1]);
+      sad8_neon(s1, vld1q_u16(rc + 8), &acc_hi[2]);
+
+      const uint16x8_t s2 = vld1q_u16(s + 16);
+      sad8_neon(s2, vld1q_u16(ra + 16), &acc_lo[0]);
+      sad8_neon(s2, vld1q_u16(rb + 16), &acc_lo[1]);
+      sad8_neon(s2, vld1q_u16(rc + 16), &acc_lo[2]);
+
+      const uint16x8_t s3 = vld1q_u16(s + 24);
+      sad8_neon(s3, vld1q_u16(ra + 24), &acc_hi[0]);
+      sad8_neon(s3, vld1q_u16(rb + 24), &acc_hi[1]);
+      sad8_neon(s3, vld1q_u16(rc + 24), &acc_hi[2]);
+
+      col += 32;
+    } while (col < w);
+
+    row++;
+  } while (row < h);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(vaddq_u32(acc_lo[k], acc_hi[k]));
+  }
+}
+
+// 128-wide three-reference SAD: generic width kernel with w fixed at 128.
+static INLINE void highbd_sad128xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 128;
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// 64-wide three-reference SAD: generic width kernel with w fixed at 64.
+static INLINE void highbd_sad64xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 64;
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// 32-wide three-reference SAD: generic width kernel with w fixed at 32.
+static INLINE void highbd_sad32xhx3d_large_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+    int ref_stride, uint32_t res[4], int h) {
+  const int width = 32;
+  highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+                              width, h);
+}
+
+// Defines the exported aom_highbd_sad<w>x<h>x3d_neon() entry point on top of
+// the "small" kernel highbd_sad<w>xhx3d_small_neon(), with the block height h
+// passed as a constant row count.
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \
+  void aom_highbd_sad##w##x##h##x3d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    const int rows = (h); \
+    highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \
+                                    sad_array, rows); \
+  }
+
+// Defines the exported aom_highbd_sad<w>x<h>x3d_neon() entry point on top of
+// the "large" kernel highbd_sad<w>xhx3d_large_neon(), with the block height h
+// passed as a constant row count.
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \
+  void aom_highbd_sad##w##x##h##x3d_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) { \
+    const int rows = (h); \
+    highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \
+                                    sad_array, rows); \
+  }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)  // 4-wide: highbd_sad4xhx3d_small_neon
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)  // 8-wide, 16-bit accumulating kernel
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)  // 16-wide: highbd_sad16xhx3d_large_neon
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)  // 32-wide and up: generic width kernel
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY  // sizes below exist only when this flag is 0
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)  // 32 rows: 32-bit accumulating 8-wide kernel
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_neon.c b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000000..184e9f9bef
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Loads 8 pixels from `src` and `ref`, squares their absolute differences and
+// stores the results, overwriting whatever the accumulators held: lanes 0-3 go
+// to *sse_acc0 and lanes 4-7 to *sse_acc1.
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+                                            const uint16_t *ref,
+                                            uint32x4_t *sse_acc0,
+                                            uint32x4_t *sse_acc1) {
+  const uint16x8_t diff = vabdq_u16(vld1q_u16(src), vld1q_u16(ref));
+  const uint16x4_t diff_lo = vget_low_u16(diff);
+  const uint16x4_t diff_hi = vget_high_u16(diff);
+
+  // Widening multiply: 16-bit difference squared into 32-bit lanes.
+  *sse_acc0 = vmull_u16(diff_lo, diff_lo);
+  *sse_acc1 = vmull_u16(diff_hi, diff_hi);
+}
+
+// Loads 8 pixels from `src` and `ref`, squares their absolute differences and
+// adds the results to the running accumulators: lanes 0-3 into *sse_acc0 and
+// lanes 4-7 into *sse_acc1.
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint32x4_t *sse_acc0,
+                                       uint32x4_t *sse_acc1) {
+  const uint16x8_t diff = vabdq_u16(vld1q_u16(src), vld1q_u16(ref));
+  const uint16x4_t diff_lo = vget_low_u16(diff);
+  const uint16x4_t diff_hi = vget_high_u16(diff);
+
+  // Widening multiply-accumulate into the 32-bit lanes.
+  *sse_acc0 = vmlal_u16(*sse_acc0, diff_lo, diff_lo);
+  *sse_acc1 = vmlal_u16(*sse_acc1, diff_hi, diff_hi);
+}
+
+// Sum of squared differences for a block 128 pixels wide and `height` rows
+// tall. Each row is processed as sixteen groups of 8 pixels; group i feeds the
+// accumulator pair sse[2 * (i % 8)] / sse[2 * (i % 8) + 1].
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int height) {
+  uint32x4_t sse[16];
+
+  // First row, groups 0-7: these writes initialise all 16 accumulators.
+  for (int i = 0; i < 8; ++i) {
+    highbd_sse_8x1_init_neon(src + i * 8, ref + i * 8, &sse[2 * i],
+                             &sse[2 * i + 1]);
+  }
+  // First row, groups 8-15: accumulate on top of the initial values.
+  for (int i = 8; i < 16; ++i) {
+    const int acc = 2 * (i - 8);
+    highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[acc], &sse[acc + 1]);
+  }
+
+  src += src_stride;
+  ref += ref_stride;
+
+  // Remaining height - 1 rows.
+  while (--height != 0) {
+    for (int i = 0; i < 16; ++i) {
+      const int acc = 2 * (i & 7);
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[acc], &sse[acc + 1]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x16(sse);
+}
+
+// Sum of squared differences for a block 64 pixels wide and `height` rows
+// tall. Each row is processed as eight groups of 8 pixels; group i feeds the
+// accumulator pair sse[2 * (i % 4)] / sse[2 * (i % 4) + 1].
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+
+  // First row, groups 0-3: these writes initialise all 8 accumulators.
+  for (int i = 0; i < 4; ++i) {
+    highbd_sse_8x1_init_neon(src + i * 8, ref + i * 8, &sse[2 * i],
+                             &sse[2 * i + 1]);
+  }
+  // First row, groups 4-7: accumulate on top of the initial values.
+  for (int i = 4; i < 8; ++i) {
+    const int acc = 2 * (i - 4);
+    highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[acc], &sse[acc + 1]);
+  }
+
+  src += src_stride;
+  ref += ref_stride;
+
+  // Remaining height - 1 rows.
+  while (--height != 0) {
+    for (int i = 0; i < 8; ++i) {
+      const int acc = 2 * (i & 3);
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[acc], &sse[acc + 1]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+// Sum of squared differences for a block 32 pixels wide and `height` rows
+// tall. Each row is processed as four groups of 8 pixels; group i feeds the
+// accumulator pair sse[2 * i] / sse[2 * i + 1].
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+
+  // First row initialises all 8 accumulators.
+  for (int i = 0; i < 4; ++i) {
+    highbd_sse_8x1_init_neon(src + i * 8, ref + i * 8, &sse[2 * i],
+                             &sse[2 * i + 1]);
+  }
+
+  src += src_stride;
+  ref += ref_stride;
+
+  // Remaining height - 1 rows.
+  while (--height != 0) {
+    for (int i = 0; i < 4; ++i) {
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[2 * i],
+                          &sse[2 * i + 1]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_u32x4_x8(sse);
+}
+
+// Sum of squared differences for a block 16 pixels wide and `height` rows
+// tall. The left 8 pixels of each row accumulate into sse[0]/sse[1] and the
+// right 8 pixels into sse[2]/sse[3].
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[4];
+
+  // First row initialises the accumulators.
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 8, ref + 8, &sse[2], &sse[3]);
+
+  // Remaining height - 1 rows.
+  for (int rows_left = height - 1; rows_left != 0; --rows_left) {
+    src += src_stride;
+    ref += ref_stride;
+
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 8, ref + 8, &sse[2], &sse[3]);
+  }
+
+  return horizontal_long_add_u32x4_x4(sse);
+}
+
+// Sum of squared differences for a block 8 pixels wide and `height` rows
+// tall. Lanes 0-3 of each row accumulate into sse[0], lanes 4-7 into sse[1].
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint32x4_t sse[2];
+
+  // First row initialises the accumulators.
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+  // Remaining height - 1 rows.
+  for (int rows_left = height - 1; rows_left != 0; --rows_left) {
+    src += src_stride;
+    ref += ref_stride;
+
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+  }
+
+  return horizontal_long_add_u32x4_x2(sse);
+}
+
+// Sum of squared differences for a block 4 pixels wide and `height` rows
+// tall, using a single 4-lane 32-bit accumulator.
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  // The first row is handled outside the loop so the accumulator can be
+  // created with a plain widening multiply.
+  uint16x4_t diff = vabd_u16(vld1_u16(src), vld1_u16(ref));
+  uint32x4_t acc = vmull_u16(diff, diff);
+
+  // Remaining height - 1 rows.
+  for (int rows_left = height - 1; rows_left != 0; --rows_left) {
+    src += src_stride;
+    ref += ref_stride;
+
+    diff = vabd_u16(vld1_u16(src), vld1_u16(ref));
+    acc = vmlal_u16(acc, diff, diff);
+  }
+
+  return horizontal_long_add_u32x4(acc);
+}
+
+// Generic sum of squared differences for a block of arbitrary `width` and
+// `height`. Each row is walked in steps of 8 pixels and every step is reduced
+// to a scalar that is added to a 64-bit total.
+//
+// Note that a full 8-lane vector is always loaded, even for the final partial
+// step of a row (w < 8): the lanes past the end of the row are read from
+// memory and then masked to zero in both inputs, so they add nothing to the
+// result.
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ // { 0, 1, 2, 3, 4, 5, 6, 7 }
+ uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+ // Lane i is all-ones when i < (width & 7), i.e. for the valid lanes of the
+ // final partial step. When width is a multiple of 8 the mask is all-zero,
+ // but it is never applied because w never drops below 8 inside the loop.
+ uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
+ uint64_t sse = 0;
+
+ do {
+ // w counts the pixels still to be processed in the current row.
+ int w = width;
+ int offset = 0;
+
+ do {
+ uint16x8_t s = vld1q_u16(src + offset);
+ uint16x8_t r = vld1q_u16(ref + offset);
+
+ if (w < 8) {
+ // Mask out-of-range elements.
+ s = vandq_u16(s, remainder_mask);
+ r = vandq_u16(r, remainder_mask);
+ }
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ // Square both halves and fold them into a single 4-lane 32-bit vector.
+ uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ offset += 8;
+ w -= 8;
+ } while (w > 0);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return sse;
+}
+
+// Public entry point: sum of squared differences between two high bit-depth
+// blocks. The byte pointers are converted with CONVERT_TO_SHORTPTR() and the
+// call is routed to a kernel specialised for widths 4, 8, 16, 32, 64 and 128;
+// every other width goes through the generic highbd_sse_wxh_neon().
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+                            const uint8_t *ref8, int ref_stride, int width,
+                            int height) {
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_neon(src16, src_stride, ref16, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_neon(src16, src_stride, ref16, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_neon(src16, src_stride, ref16, ref_stride,
+                                  height);
+    case 32:
+      return highbd_sse_32xh_neon(src16, src_stride, ref16, ref_stride,
+                                  height);
+    case 64:
+      return highbd_sse_64xh_neon(src16, src_stride, ref16, ref_stride,
+                                  height);
+    case 128:
+      return highbd_sse_128xh_neon(src16, src_stride, ref16, ref_stride,
+                                   height);
+    default: break;
+  }
+
+  return highbd_sse_wxh_neon(src16, src_stride, ref16, ref_stride, width,
+                             height);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
new file mode 100644
index 0000000000..b267da5cfb
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Loads 8 pixels from `src` and `ref`, takes their absolute differences and
+// folds the differences multiplied by themselves into *sse through
+// aom_udotq_u16() (presumably a 16-bit dot product accumulating into 64-bit
+// lanes - confirm against dot_sve.h).
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint64x2_t *sse) {
+  const uint16x8_t diff = vabdq_u16(vld1q_u16(src), vld1q_u16(ref));
+
+  *sse = aom_udotq_u16(*sse, diff, diff);
+}
+
+// Sum of squared differences for a block 128 pixels wide and `height` rows
+// tall. Each row is processed as sixteen groups of 8 pixels, distributed
+// round-robin over four 64-bit accumulators (group i uses sse[i % 4]).
+static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+                        vdupq_n_u64(0) };
+
+  do {
+    for (int i = 0; i < 16; ++i) {
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[i & 3]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  // Pairwise reduction of the four accumulators, then across the two lanes.
+  const uint64x2_t sum01 = vaddq_u64(sse[0], sse[1]);
+  const uint64x2_t sum23 = vaddq_u64(sse[2], sse[3]);
+  return vaddvq_u64(vaddq_u64(sum01, sum23));
+}
+
+// Sum of squared differences for a block 64 pixels wide and `height` rows
+// tall. Each row is processed as eight groups of 8 pixels, distributed
+// round-robin over four 64-bit accumulators (group i uses sse[i % 4]).
+static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+                        vdupq_n_u64(0) };
+
+  do {
+    for (int i = 0; i < 8; ++i) {
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[i & 3]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  // Pairwise reduction of the four accumulators, then across the two lanes.
+  const uint64x2_t sum01 = vaddq_u64(sse[0], sse[1]);
+  const uint64x2_t sum23 = vaddq_u64(sse[2], sse[3]);
+  return vaddvq_u64(vaddq_u64(sum01, sum23));
+}
+
+// Sum of squared differences for a block 32 pixels wide and `height` rows
+// tall. Each of the four 8-pixel groups in a row has its own 64-bit
+// accumulator.
+static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+                        vdupq_n_u64(0) };
+
+  do {
+    for (int i = 0; i < 4; ++i) {
+      highbd_sse_8x1_neon(src + i * 8, ref + i * 8, &sse[i]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  // Pairwise reduction of the four accumulators, then across the two lanes.
+  const uint64x2_t sum01 = vaddq_u64(sse[0], sse[1]);
+  const uint64x2_t sum23 = vaddq_u64(sse[2], sse[3]);
+  return vaddvq_u64(vaddq_u64(sum01, sum23));
+}
+
+// Sum of squared differences for a block 16 pixels wide and `height` rows
+// tall. The left and right 8-pixel halves of each row use separate
+// accumulators.
+static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint64x2_t sse_left = vdupq_n_u64(0);
+  uint64x2_t sse_right = vdupq_n_u64(0);
+
+  do {
+    highbd_sse_8x1_neon(src, ref, &sse_left);
+    highbd_sse_8x1_neon(src + 8, ref + 8, &sse_right);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  return vaddvq_u64(vaddq_u64(sse_left, sse_right));
+}
+
+// Sum of squared differences for a block 8 pixels wide and `height` rows
+// tall. Two rows are consumed per iteration, each with its own accumulator;
+// the loop ends when `height` reaches exactly 0 after being reduced by 2 per
+// iteration.
+static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride,
+                                         const uint16_t *ref, int ref_stride,
+                                         int height) {
+  uint64x2_t sse_row0 = vdupq_n_u64(0);
+  uint64x2_t sse_row1 = vdupq_n_u64(0);
+
+  do {
+    highbd_sse_8x1_neon(src, ref, &sse_row0);
+    highbd_sse_8x1_neon(src + src_stride, ref + ref_stride, &sse_row1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    height -= 2;
+  } while (height != 0);
+
+  return vaddvq_u64(vaddq_u64(sse_row0, sse_row1));
+}
+
+// Sum of squared differences for a block 4 pixels wide and `height` rows
+// tall. Two rows are packed into one 8-lane vector per iteration by
+// load_unaligned_u16_4x2(); the loop ends when `height` reaches exactly 0
+// after being reduced by 2 per iteration.
+static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride,
+                                         const uint16_t *ref, int ref_stride,
+                                         int height) {
+  uint64x2_t acc = vdupq_n_u64(0);
+
+  do {
+    const uint16x8_t src_rows = load_unaligned_u16_4x2(src, src_stride);
+    const uint16x8_t ref_rows = load_unaligned_u16_4x2(ref, ref_stride);
+    const uint16x8_t diff = vabdq_u16(src_rows, ref_rows);
+
+    acc = aom_udotq_u16(acc, diff, diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    height -= 2;
+  } while (height != 0);
+
+  return vaddvq_u64(acc);
+}
+
+// Generic sum of squared differences for a block of arbitrary `width` and
+// `height`, written with SVE intrinsics. Each row is walked in steps of one
+// SVE vector of 16-bit lanes; a predicate limits the final step of a row to
+// the lanes that are still inside the row.
+static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ svuint64_t sse = svdup_n_u64(0);
+ // Number of 16-bit lanes in one SVE vector.
+ uint64_t step = svcnth();
+
+ do {
+ int w = 0;
+ const uint16_t *src_ptr = src;
+ const uint16_t *ref_ptr = ref;
+
+ do {
+ // Lane i is active while w + i < width.
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svuint16_t s = svld1_u16(pred, src_ptr);
+ svuint16_t r = svld1_u16(pred, ref_ptr);
+
+ // The _z form zeroes inactive lanes, so they add nothing to the dot
+ // product below.
+ svuint16_t abs_diff = svabd_u16_z(pred, s, r);
+
+ sse = svdot_u64(sse, abs_diff, abs_diff);
+
+ src_ptr += step;
+ ref_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ // Reduce all 64-bit lanes to a scalar.
+ return svaddv_u64(svptrue_b64(), sse);
+}
+
+// Public entry point: sum of squared differences between two high bit-depth
+// blocks. The byte pointers are converted with CONVERT_TO_SHORTPTR() and the
+// call is routed to a kernel specialised for widths 4, 8, 16, 32, 64 and 128;
+// every other width goes through the generic highbd_sse_wxh_sve().
+int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride,
+                           const uint8_t *ref8, int ref_stride, int width,
+                           int height) {
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_sve(src16, src_stride, ref16, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_sve(src16, src_stride, ref16, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_sve(src16, src_stride, ref16, ref_stride, height);
+    case 32:
+      return highbd_sse_32xh_sve(src16, src_stride, ref16, ref_stride, height);
+    case 64:
+      return highbd_sse_64xh_sve(src16, src_stride, ref16, ref_stride, height);
+    case 128:
+      return highbd_sse_128xh_sve(src16, src_stride, ref16, ref_stride,
+                                  height);
+    default: break;
+  }
+
+  return highbd_sse_wxh_sve(src16, src_stride, ref16, ref_stride, width,
+                            height);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..686fa5f226
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common factor (16), such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
+
+// Bilinear filter for a block exactly 4 pixels wide and `dst_height` rows
+// tall. Every output pixel is
+//   (src[x] * (8 - filter_offset) + src[x + pixel_step] * filter_offset + 4) >> 3
+// and output rows are written back to back (4 values per row).
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint16x4_t weight0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t weight1 = vdup_n_u16(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
+    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+    // Weighted sum of the two taps, then a rounding shift by 3 (divide by 8).
+    const uint16x4_t weighted = vmla_u16(vmul_u16(px0, weight0), px1, weight1);
+    vst1_u16(dst_ptr, vrshr_n_u16(weighted, 3));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+  } while (--rows_left != 0);
+}
+
+// Bilinear filter for a block whose width is a multiple of 8, any height.
+// Every output pixel is
+//   (src[x] * (8 - filter_offset) + src[x + pixel_step] * filter_offset + 4) >> 3
+// and output rows are written back to back (`dst_width` values per row).
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+                                                uint16_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint16x8_t weight0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t weight1 = vdupq_n_u16(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t px0 = vld1q_u16(src_ptr + col);
+      const uint16x8_t px1 = vld1q_u16(src_ptr + col + pixel_step);
+
+      // Weighted sum of the two taps, then a rounding shift by 3.
+      const uint16x8_t weighted =
+          vmlaq_u16(vmulq_u16(px0, weight0), px1, weight1);
+      vst1q_u16(dst_ptr + col, vrshrq_n_u16(weighted, 3));
+
+      col += 8;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--rows_left != 0);
+}
+
+// Fixed-width wrappers around highbd_var_filter_block2d_bil_large(). They give
+// every supported width a function named highbd_var_filter_block2d_bil_w<W>
+// with the same parameter list as the w4 version, so the variance macros in
+// this file can select the filter by pasting the width into the name.
+
+// Width 8.
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+
+// Width 16.
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+
+// Width 32.
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+
+// Width 64.
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+// Width 128.
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 128, dst_height, filter_offset);
+}
+
+// Half-pel filter: every output pixel is the rounded average of src[x] and
+// src[x + pixel_step]. Output rows are written back to back (`dst_width`
+// values per row).
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int rows_left = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t px0 = vld1q_u16(src_ptr + col);
+      const uint16x8_t px1 = vld1q_u16(src_ptr + col + pixel_step);
+
+      vst1q_u16(dst_ptr + col, vrhaddq_u16(px0, px1));
+
+      col += 8;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--rows_left != 0);
+}
+
+// Defines aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() using two
+// bilinear passes:
+//   1) a horizontal pass (pixel_step 1, filter xoffset) that writes h + 1 rows
+//      into tmp0;
+//   2) a vertical pass over tmp0 (pixel_step w, filter yoffset) that writes
+//      the final w x h block into tmp1.
+// The variance of tmp1 against `ref` is then returned through
+// aom_highbd_<bitdepth>_variance<w>x<h>().
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+// Defines aom_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() with
+// dedicated paths chosen from xoffset (horizontal) and yoffset (vertical):
+//   offset == 0: that filter pass is skipped;
+//   offset == 4: that pass uses highbd_var_filter_block2d_avg();
+//   otherwise:   that pass uses highbd_var_filter_block2d_bil_w<w>().
+// When a vertical pass follows a horizontal one, the horizontal pass produces
+// h + 1 rows so the vertical pass can read one row below each output row.
+// highbd_var_filter_block2d_avg() asserts that the width is a multiple of 16,
+// so this macro is only instantiated for w >= 16.
+// NOTE(review): in the xoffset == 4 branches tmp1 is declared with
+// w * (h + 1) elements although the second pass writes only h rows of w
+// values; w * h looks sufficient - confirm before changing.
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+// Widths 4 and 8 use the generic two-pass macro; widths 16 and above use the
+// macro that is specialised on the xoffset/yoffset values.
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// Block sizes compiled only when CONFIG_REALTIME_ONLY is disabled.
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+// Widths 4 and 8 use the generic two-pass macro; widths 16 and above use the
+// macro that is specialised on the xoffset/yoffset values.
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// Block sizes compiled only when CONFIG_REALTIME_ONLY is disabled.
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+// Widths 4 and 8 use the generic two-pass macro; widths 16 and above use the
+// macro that is specialised on the xoffset/yoffset values.
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+// Block sizes compiled only when CONFIG_REALTIME_ONLY is disabled.
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Bilinear filter fused with aom_highbd_comp_avg_pred for blocks exactly 4
+// pixels wide: each filtered pixel is additionally rounding-averaged with the
+// matching pixel of `second_pred`, which is read as a packed array (4 values
+// per row).
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const uint16x4_t weight0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t weight1 = vdup_n_u16(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint16x4_t px0 = load_unaligned_u16_4x1(src_ptr);
+    const uint16x4_t px1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    const uint16x4_t pred = vld1_u16(second_pred);
+
+    // Weighted sum of the two taps, then a rounding shift by 3 (divide by 8).
+    const uint16x4_t weighted = vmla_u16(vmul_u16(px0, weight0), px1, weight1);
+    const uint16x4_t filtered = vrshr_n_u16(weighted, 3);
+
+    vst1_u16(dst_ptr, vrhadd_u16(filtered, pred));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--rows_left != 0);
+}
+
+// Bilinear filter fused with aom_highbd_comp_avg_pred for large blocks: each
+// filtered pixel is additionally rounding-averaged with the matching pixel of
+// `second_pred`, which is consumed sequentially, 8 values per step.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred) {
+  const uint16x8_t weight0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t weight1 = vdupq_n_u16(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t px0 = vld1q_u16(src_ptr + col);
+      const uint16x8_t px1 = vld1q_u16(src_ptr + col + pixel_step);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      // Weighted sum of the two taps, then a rounding shift by 3.
+      const uint16x8_t weighted =
+          vmlaq_u16(vmulq_u16(px0, weight0), px1, weight1);
+      const uint16x8_t filtered = vrshrq_n_u16(weighted, 3);
+
+      vst1q_u16(dst_ptr + col, vrhaddq_u16(filtered, pred));
+
+      col += 8;
+      second_pred += 8;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--rows_left != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Width-32 entry point: forwards to the shared wide-block kernel with the
+// block width fixed at 32.
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const int block_width = 32;
+
+  highbd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred);
+}
+
+// Width-64 entry point: forwards to the shared wide-block kernel with the
+// block width fixed at 64.
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const int block_width = 64;
+
+  highbd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred);
+}
+
+// Width-128 entry point: forwards to the shared wide-block kernel with the
+// block width fixed at 128.
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const int block_width = 128;
+
+  highbd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred);
+}
+
+// Averaging sub-pixel filter fused with aom_highbd_comp_avg_pred. Each output
+// pixel is the rounding average of second_pred and the rounding average of
+// the source pixel with the one pixel_step elements after it. dst_ptr and
+// second_pred are walked as packed dst_width-wide rows.
+static void highbd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred) {
+  // This kernel is only meant for widths that are whole multiples of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+
+    do {
+      const uint16_t *const px = src_ptr + col;
+      const uint16x8_t tap0 = vld1q_u16(px);
+      const uint16x8_t tap1 = vld1q_u16(px + pixel_step);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      // First average the two source taps, then average with the predictor.
+      const uint16x8_t filtered = vrhaddq_u16(tap0, tap1);
+      vst1q_u16(dst_ptr + col, vrhaddq_u16(filtered, pred));
+
+      second_pred += 8;
+      col += 8;
+    } while (col < dst_width);
+
+    dst_ptr += dst_width;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// aom_highbd_comp_avg_pred for blocks of width >= 16: writes the rounding
+// average of each source pixel and the matching second_pred pixel. No
+// sub-pixel filtering is applied. dst_ptr and second_pred are walked as packed
+// dst_width-wide rows; src_ptr advances by src_stride per row.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                            int src_stride, int dst_width, int dst_height,
+                            const uint16_t *second_pred) {
+  // Only widths that are whole multiples of 16 are supported here.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+
+    do {
+      const uint16x8_t pixels = vld1q_u16(src_ptr + col);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      vst1q_u16(dst_ptr + col, vrhaddq_u16(pixels, pred));
+
+      second_pred += 8;
+      col += 8;
+    } while (col < dst_width);
+
+    dst_ptr += dst_width;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Generic sub-pixel compound-average variance, instantiated per bit depth and
+// block size. Defines
+// aom_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon(), which:
+//  1. bilinearly filters (h + 1) rows of src into tmp0 using xoffset and a
+//     pixel step of 1,
+//  2. bilinearly filters tmp0 using yoffset and a pixel step of w (one tmp0
+//     row), averaging the result with second_pred into tmp1,
+//  3. returns aom_highbd_<bitdepth>_variance<w>x<h>() of tmp1 (stride w)
+//     against ref, forwarding the sse pointer.
+// Both passes always run; there is no shortcut for particular offset values.
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+// Specialized sub-pixel compound-average variance. Defines the same
+// aom_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() entry point as
+// HBD_SUBPEL_AVG_VARIANCE_WXH_NEON, but dispatches on xoffset and yoffset:
+//  - offset 0: no filtering is done in that direction,
+//  - offset 4: an averaging kernel (..._block2d_avg) is used in place of the
+//    bilinear filter,
+//  - any other offset: the bilinear filter is used.
+// In every branch the final stage is the one fused with the average against
+// second_pred, and the result is passed to
+// aom_highbd_<bitdepth>_variance<w>x<h>() together with ref and sse.
+// highbd_avg_pred and highbd_avg_pred_var_filter_block2d_avg assert that the
+// width is a multiple of 16, so this macro is only suitable for w >= 16.
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// Instantiate the sub-pixel compound-average variance functions for each bit
+// depth and block size. Widths 4 and 8 use the generic two-pass macro; widths
+// of 16 and above use the specialized macro. The sizes listed under
+// !CONFIG_REALTIME_ONLY are compiled out when CONFIG_REALTIME_ONLY is set.
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Generic masked sub-pixel variance. Defines
+// aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon(), which:
+//  1. bilinearly filters (h + 1) rows of src into tmp0 using xoffset and a
+//     pixel step of 1,
+//  2. bilinearly filters tmp0 into tmp1 using yoffset and a pixel step of w,
+//  3. calls aom_highbd_comp_mask_pred_neon() with tmp2 as its first argument
+//     and second_pred, tmp1, msk, msk_stride and invert_mask as inputs,
+//  4. returns aom_highbd_<bitdepth>_variance<w>x<h>() of tmp2 (stride w)
+//     against ref, forwarding the sse pointer.
+// Both filter passes always run, whatever the offsets are.
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
+ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \
+ w, ref, ref_stride, sse); \
+ }
+
+// Specialized masked sub-pixel variance. Defines the same
+// aom_highbd_<bitdepth>_masked_sub_pixel_variance<w>x<h>_neon() entry point
+// as HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON, but dispatches on xoffset and
+// yoffset:
+//  - offset 0: no filtering is done in that direction (with both offsets 0,
+//    src is handed to aom_highbd_comp_mask_pred_neon() directly),
+//  - offset 4: highbd_var_filter_block2d_avg is used in place of the bilinear
+//    filter,
+//  - any other offset: the bilinear filter is used.
+// In every branch the filtered block, second_pred and the mask arguments are
+// passed to aom_highbd_comp_mask_pred_neon(), and the buffer given as its
+// first argument is then passed to aom_highbd_<bitdepth>_variance<w>x<h>()
+// together with ref and sse.
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ if (xoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
+ w, h, src, src_stride, msk, msk_stride, \
+ invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
+ w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
+ src_stride, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// Instantiate the masked sub-pixel variance functions for each bit depth and
+// block size. Widths 4 and 8 use the generic two-pass macro; widths of 16 and
+// above use the specialized macro. The sizes listed under
+// !CONFIG_REALTIME_ONLY are compiled out when CONFIG_REALTIME_ONLY is set.
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
+// Generic OBMC sub-pixel variance. Defines
+// aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon(), which:
+//  1. bilinearly filters (h + 1) rows of pre into tmp0 using xoffset and a
+//     pixel step of 1,
+//  2. bilinearly filters tmp0 into tmp1 using yoffset and a pixel step of w,
+//  3. returns aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() of tmp1
+//     (stride w), forwarding wsrc, mask and sse unchanged.
+// Both filter passes always run, whatever the offsets are.
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ }
+
+// Specialized OBMC sub-pixel variance. Defines the same
+// aom_highbd_<bitdepth>_obmc_sub_pixel_variance<w>x<h>_neon() entry point as
+// HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON, but dispatches on xoffset and
+// yoffset:
+//  - offset 0: no filtering is done in that direction (with both offsets 0,
+//    pre and pre_stride go straight to the OBMC variance function),
+//  - offset 4: highbd_var_filter_block2d_avg is used in place of the bilinear
+//    filter,
+//  - any other offset: the bilinear filter is used.
+// The filtered block (stride w) is then passed to
+// aom_highbd_<bitdepth>_obmc_variance<w>x<h>_neon() with wsrc, mask and sse.
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ pre, pre_stride, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
+ pre_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+// Instantiate the OBMC sub-pixel variance functions for each bit depth and
+// block size. Widths 4 and 8, plus 16x4 and 16x8, use the generic two-pass
+// macro; every other size with width >= 16 uses the specialized macro.
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Distance-weighted compound prediction without sub-pixel filtering: every
+// 8-pixel group of the source is combined with the matching group of
+// second_pred by dist_wtd_avg_u16x8(), using the forward/backward offsets
+// from jcp_param. dst_ptr and second_pred are walked as packed
+// dst_width-wide rows; src_ptr advances by src_stride per row.
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                                     int src_stride, int dst_width,
+                                     int dst_height,
+                                     const uint16_t *second_pred,
+                                     const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // Only widths that are whole multiples of 16 are supported here.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck = vdupq_n_u16(jcp_param->bck_offset);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+
+    do {
+      const uint16x8_t pixels = vld1q_u16(src_ptr + col);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      vst1q_u16(dst_ptr + col, dist_wtd_avg_u16x8(pixels, pred, fwd, bck));
+
+      col += 8;
+      second_pred += 8;
+    } while (col < dst_width);
+
+    dst_ptr += dst_width;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Averaging sub-pixel filter fused with the distance-weighted compound
+// average: the source pixel and the one pixel_step elements after it are
+// rounding-averaged, and that result is combined with second_pred by
+// dist_wtd_avg_u16x8() using the offsets from jcp_param. dst_ptr and
+// second_pred are walked as packed dst_width-wide rows.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // Only widths that are whole multiples of 16 are supported here.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint16x8_t fwd = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck = vdupq_n_u16(jcp_param->bck_offset);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+
+    do {
+      const uint16_t *const px = src_ptr + col;
+      const uint16x8_t tap0 = vld1q_u16(px);
+      const uint16x8_t tap1 = vld1q_u16(px + pixel_step);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      const uint16x8_t filtered = vrhaddq_u16(tap0, tap1);
+      vst1q_u16(dst_ptr + col, dist_wtd_avg_u16x8(filtered, pred, fwd, bck));
+
+      col += 8;
+      second_pred += 8;
+    } while (col < dst_width);
+
+    dst_ptr += dst_width;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Width-4 bilinear filter fused with the distance-weighted compound average.
+// Each row of four pixels is filtered as
+//   (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
+// with s1 lying pixel_step elements after s0, then combined with four
+// second_pred pixels by dist_wtd_avg_u16x4() using the offsets in jcp_param.
+// dst_ptr and second_pred both advance by 4 per row.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x4_t weight0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t weight1 = vdup_n_u16(filter_offset);
+  const uint16x4_t fwd = vdup_n_u16(jcp_param->fwd_offset);
+  const uint16x4_t bck = vdup_n_u16(jcp_param->bck_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint16x4_t tap0 = load_unaligned_u16_4x1(src_ptr);
+    const uint16x4_t tap1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+    const uint16x4_t pred = vld1_u16(second_pred);
+
+    // Weighted sum of both taps, then a rounding shift right by 3.
+    const uint16x4_t filtered =
+        vrshr_n_u16(vmla_u16(vmul_u16(tap0, weight0), tap1, weight1), 3);
+
+    vst1_u16(dst_ptr, dist_wtd_avg_u16x4(filtered, pred, fwd, bck));
+
+    second_pred += 4;
+    dst_ptr += 4;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Bilinear filter fused with aom_dist_wtd_comp_avg_pred for wide blocks.
+// Eight pixels are handled per step, so dst_width is expected to be a
+// multiple of 8. Each group is filtered as
+//   (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
+// with s1 lying pixel_step elements after s0, then combined with second_pred
+// by dist_wtd_avg_u16x8() using the offsets in jcp_param. dst_ptr and
+// second_pred are walked as packed dst_width-wide rows.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint16x8_t weight0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t weight1 = vdupq_n_u16(filter_offset);
+  const uint16x8_t fwd = vdupq_n_u16(jcp_param->fwd_offset);
+  const uint16x8_t bck = vdupq_n_u16(jcp_param->bck_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+
+    do {
+      const uint16_t *const px = src_ptr + col;
+      const uint16x8_t tap0 = vld1q_u16(px);
+      const uint16x8_t tap1 = vld1q_u16(px + pixel_step);
+      const uint16x8_t pred = vld1q_u16(second_pred);
+
+      // Weighted sum of both taps, then a rounding shift right by 3.
+      const uint16x8_t filtered = vrshrq_n_u16(
+          vmlaq_u16(vmulq_u16(tap0, weight0), tap1, weight1), 3);
+
+      vst1q_u16(dst_ptr + col, dist_wtd_avg_u16x8(filtered, pred, fwd, bck));
+
+      col += 8;
+      second_pred += 8;
+    } while (col < dst_width);
+
+    dst_ptr += dst_width;
+    src_ptr += src_stride;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Width-8 entry point for the bilinear + distance-weighted average kernel:
+// forwards to the shared wide-block implementation with the width fixed at 8.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int block_width = 8;
+
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred, jcp_param);
+}
+
+// Width-16 entry point for the bilinear + distance-weighted average kernel:
+// forwards to the shared wide-block implementation with the width fixed at
+// 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int block_width = 16;
+
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred, jcp_param);
+}
+
+// Width-32 entry point for the bilinear + distance-weighted average kernel:
+// forwards to the shared wide-block implementation with the width fixed at
+// 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int block_width = 32;
+
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred, jcp_param);
+}
+
+// Bilinear filter fused with the distance-weighted average against
+// 'second_pred', for blocks having width 64. Forwards to the kernel that walks
+// each row 8 pixels at a time, with the width fixed at 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int block_width = 64;
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred, jcp_param);
+}
+
+// Bilinear filter fused with the distance-weighted average against
+// 'second_pred', for blocks having width 128. Forwards to the kernel that
+// walks each row 8 pixels at a time, with the width fixed at 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int block_width = 128;
+  highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+      src_ptr, dst_ptr, src_stride, pixel_step, block_width, dst_height,
+      filter_offset, second_pred, jcp_param);
+}
+
+// Instantiates
+// aom_highbd_<bitdepth>_dist_wtd_sub_pixel_avg_variance<w>x<h>_neon().
+// Pass 1 runs highbd_var_filter_block2d_bil_w<w> over h + 1 source rows with
+// pixel step 1 and 'xoffset'. Pass 2 runs the distance-weighted averaging
+// bilinear filter over that intermediate with pixel step w and 'yoffset',
+// blending in 'second_pred'. The w x h result is handed to the matching
+// aom_highbd_<bitdepth>_variance<w>x<h> function together with 'ref_ptr'.
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)              \
+  unsigned int                                                                 \
+      aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+          const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+          const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,               \
+          const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t first_pass[w * (h + 1)];                                          \
+    uint16_t blended[w * h];                                                   \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                      \
+    uint16_t *const pred16 = CONVERT_TO_SHORTPTR(second_pred);                 \
+    highbd_var_filter_block2d_bil_w##w(src16, first_pass, source_stride, 1,    \
+                                       h + 1, xoffset);                        \
+    highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w(                      \
+        first_pass, blended, w, w, h, yoffset, pred16, jcp_param);             \
+    return aom_highbd_##bitdepth##_variance##w##x##h(                          \
+        CONVERT_TO_BYTEPTR(blended), w, ref_ptr, ref_stride, sse);             \
+  }
+
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit: generic macro for widths 4 and 8, specialized macro for wider blocks.
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit: same block-size split between the two macros as for 8-bit.
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit: same block-size split between the two macros as for 8-bit.
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..18b8efff4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// Accumulates the sum of differences ('*sum') and the sum of squared
+// differences ('*sse') between a 4-wide source block and reference block.
+// Two rows are consumed per iteration, so 'h' must be a non-zero multiple
+// of 2.
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int32x4_t sq_acc = vdupq_n_s32(0);
+  int16x8_t diff_acc = vdupq_n_s16(0);
+
+  int rows_left = h;
+  do {
+    const uint16x8_t src_rows = load_unaligned_u16_4x2(src_ptr, src_stride);
+    const uint16x8_t ref_rows = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+
+    // The u16 difference is reinterpreted as s16 to recover its sign.
+    const int16x8_t delta =
+        vreinterpretq_s16_u16(vsubq_u16(src_rows, ref_rows));
+    const int16x4_t delta_lo = vget_low_s16(delta);
+    const int16x4_t delta_hi = vget_high_s16(delta);
+
+    diff_acc = vaddq_s16(diff_acc, delta);
+    sq_acc = vmlal_s16(sq_acc, delta_lo, delta_lo);
+    sq_acc = vmlal_s16(sq_acc, delta_hi, delta_hi);
+
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  *sum = horizontal_add_s16x8(diff_acc);
+  *sse = horizontal_add_s32x4(sq_acc);
+}
+
+// For 8-bit and 10-bit data every block size can be handled with 32-bit
+// accumulation, because the squared differences are spread over two int32x4
+// accumulators (1023*1023*128*32 = 4286582784 for a 128x128 block).
+//
+// Each row is walked 8 pixels at a time, so 'w' is expected to be a multiple
+// of 8. Writes the sum of differences to '*sum' and the sum of squared
+// differences to '*sse'.
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+                                              int src_stride,
+                                              const uint16_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              uint64_t *sse, int64_t *sum) {
+  int32x4_t diff_acc = vdupq_n_s32(0);
+  int32x4_t sq_acc_lo = vdupq_n_s32(0);
+  int32x4_t sq_acc_hi = vdupq_n_s32(0);
+
+  int rows_left = h;
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t src_px = vld1q_u16(src_ptr + col);
+      const uint16x8_t ref_px = vld1q_u16(ref_ptr + col);
+      const int16x8_t delta = vreinterpretq_s16_u16(vsubq_u16(src_px, ref_px));
+      const int16x4_t delta_lo = vget_low_s16(delta);
+      const int16x4_t delta_hi = vget_high_s16(delta);
+
+      diff_acc = vpadalq_s16(diff_acc, delta);
+      sq_acc_lo = vmlal_s16(sq_acc_lo, delta_lo, delta_lo);
+      sq_acc_hi = vmlal_s16(sq_acc_hi, delta_hi, delta_hi);
+
+      col += 8;
+    } while (col < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  // The two halves are combined as unsigned 32-bit lanes before the final
+  // reduction.
+  const uint32x4_t sq_total = vaddq_u32(vreinterpretq_u32_s32(sq_acc_lo),
+                                        vreinterpretq_u32_s32(sq_acc_hi));
+  *sum = horizontal_add_s32x4(diff_acc);
+  *sse = horizontal_long_add_u32x4(sq_total);
+}
+
+// Sum / sum-of-squares accumulation for 8-wide blocks:
+// highbd_variance_large_neon with the width fixed at 8.
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  const int block_width = 8;
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
+                             sse, sum);
+}
+
+// Sum / sum-of-squares accumulation for 16-wide blocks:
+// highbd_variance_large_neon with the width fixed at 16.
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  const int block_width = 16;
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
+                             sse, sum);
+}
+
+// Sum / sum-of-squares accumulation for 32-wide blocks:
+// highbd_variance_large_neon with the width fixed at 32.
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  const int block_width = 32;
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
+                             sse, sum);
+}
+
+// Sum / sum-of-squares accumulation for 64-wide blocks:
+// highbd_variance_large_neon with the width fixed at 64.
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  const int block_width = 64;
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
+                             sse, sum);
+}
+
+// Sum / sum-of-squares accumulation for 128-wide blocks:
+// highbd_variance_large_neon with the width fixed at 128.
+static INLINE void highbd_variance_128xh_neon(const uint16_t *src,
+                                              int src_stride,
+                                              const uint16_t *ref,
+                                              int ref_stride, int h,
+                                              uint64_t *sse, int64_t *sum) {
+  const int block_width = 128;
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
+                             sse, sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are
+// processed in a different helper function.
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+//
+// Rows are handled in batches of at most 'h_limit' rows. Within a batch the
+// squared differences go into two int32x4 accumulators; at the end of each
+// batch these are folded into a 64-bit accumulator. The sum of differences
+// stays in 32-bit lanes throughout. Results are written to '*sse' and '*sum'.
+static INLINE void highbd_variance_xlarge_neon(
+    const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+    int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+        const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+        const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+        sum_s32 = vpadalq_s16(sum_s32, diff);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+        j += 8;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+    h_tmp += h_limit;
+    // Clamp the next batch to the block height so that a height which is not
+    // a multiple of 'h_limit' never reads rows beyond the block.
+    if (h_tmp > h) h_tmp = h;
+  } while (i < h);
+
+  *sum = horizontal_add_s32x4(sum_s32);
+  *sse = (uint64_t)horizontal_add_s64x2(sse_s64);
+}
+
+// 32-wide blocks with 64-bit SSE accumulation: the 32-bit partial sums are
+// flushed every 32 rows.
+static INLINE void highbd_variance_32xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 32;
+  const int rows_per_batch = 32;
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, block_width, h,
+                              rows_per_batch, sse, sum);
+}
+
+// 64-wide blocks with 64-bit SSE accumulation: the 32-bit partial sums are
+// flushed every 16 rows.
+static INLINE void highbd_variance_64xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 64;
+  const int rows_per_batch = 16;
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, block_width, h,
+                              rows_per_batch, sse, sum);
+}
+
+// 128-wide blocks with 64-bit SSE accumulation: the 32-bit partial sums are
+// flushed every 8 rows.
+static INLINE void highbd_variance_128xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  const int block_width = 128;
+  const int rows_per_batch = 8;
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, block_width, h,
+                              rows_per_batch, sse, sum);
+}
+
+// Defines aom_highbd_8_variance<w>x<h>_neon(). The raw sum of squared
+// differences is stored to '*sse' unscaled, and the return value is
+// *sse - sum^2 / (w * h) computed in unsigned 32-bit arithmetic.
+#define HBD_VARIANCE_WXH_8_NEON(w, h)                                     \
+  uint32_t aom_highbd_8_variance##w##x##h##_neon(                         \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    uint64_t sse64 = 0;                                                   \
+    int64_t sum64 = 0;                                                    \
+    highbd_variance_##w##xh_neon(src16, src_stride, ref16, ref_stride, h, \
+                                 &sse64, &sum64);                         \
+    *sse = (uint32_t)sse64;                                               \
+    const int sum32 = (int)sum64;                                         \
+    const int64_t sum_sq = (int64_t)sum32 * sum32;                        \
+    return *sse - (uint32_t)(sum_sq / (w * h));                           \
+  }
+
+// Defines aom_highbd_10_variance<w>x<h>_neon(). The SSE is scaled with
+// ROUND_POWER_OF_TWO(…, 4) and the sum with ROUND_POWER_OF_TWO(…, 2) before
+// the variance is formed; a negative variance is clamped to 0.
+#define HBD_VARIANCE_WXH_10_NEON(w, h)                                    \
+  uint32_t aom_highbd_10_variance##w##x##h##_neon(                        \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    uint64_t sse64 = 0;                                                   \
+    int64_t sum64 = 0;                                                    \
+    highbd_variance_##w##xh_neon(src16, src_stride, ref16, ref_stride, h, \
+                                 &sse64, &sum64);                         \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                        \
+    const int sum32 = (int)ROUND_POWER_OF_TWO(sum64, 2);                  \
+    const int64_t variance =                                              \
+        (int64_t)(*sse) - (((int64_t)sum32 * sum32) / (w * h));           \
+    if (variance < 0) return 0;                                           \
+    return (uint32_t)variance;                                            \
+  }
+
+// Defines aom_highbd_12_variance<w>x<h>_neon() on top of the 32-bit
+// accumulating helpers. The SSE is scaled with ROUND_POWER_OF_TWO(…, 8) and
+// the sum with ROUND_POWER_OF_TWO(…, 4) before the variance is formed; a
+// negative variance is clamped to 0.
+#define HBD_VARIANCE_WXH_12_NEON(w, h)                                    \
+  uint32_t aom_highbd_12_variance##w##x##h##_neon(                        \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    uint64_t sse64 = 0;                                                   \
+    int64_t sum64 = 0;                                                    \
+    highbd_variance_##w##xh_neon(src16, src_stride, ref16, ref_stride, h, \
+                                 &sse64, &sum64);                         \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                        \
+    const int sum32 = (int)ROUND_POWER_OF_TWO(sum64, 4);                  \
+    const int64_t variance =                                              \
+        (int64_t)(*sse) - (((int64_t)sum32 * sum32) / (w * h));           \
+    if (variance < 0) return 0;                                           \
+    return (uint32_t)variance;                                            \
+  }
+
+// Defines aom_highbd_12_variance<w>x<h>_neon() on top of the helpers that
+// accumulate the SSE in 64-bit elements. Scaling and clamping are the same as
+// in HBD_VARIANCE_WXH_12_NEON: SSE by ROUND_POWER_OF_TWO(…, 8), sum by
+// ROUND_POWER_OF_TWO(…, 4), negative variance clamped to 0.
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h)                              \
+  uint32_t aom_highbd_12_variance##w##x##h##_neon(                         \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                  \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                  \
+    uint64_t sse64 = 0;                                                    \
+    int64_t sum64 = 0;                                                     \
+    highbd_variance_##w##xh_xlarge_neon(src16, src_stride, ref16,          \
+                                        ref_stride, h, &sse64, &sum64);    \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                         \
+    const int sum32 = (int)ROUND_POWER_OF_TWO(sum64, 4);                   \
+    const int64_t variance =                                               \
+        (int64_t)(*sse) - (((int64_t)sum32 * sum32) / (w * h));            \
+    if (variance < 0) return 0;                                            \
+    return (uint32_t)variance;                                             \
+  }
+
+// 8-bit: every block size uses the 32-bit accumulating helpers.
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+HBD_VARIANCE_WXH_8_NEON(64, 128)
+
+HBD_VARIANCE_WXH_8_NEON(128, 64)
+HBD_VARIANCE_WXH_8_NEON(128, 128)
+
+// 10-bit: every block size uses the 32-bit accumulating helpers.
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+HBD_VARIANCE_WXH_10_NEON(64, 128)
+
+HBD_VARIANCE_WXH_10_NEON(128, 64)
+HBD_VARIANCE_WXH_10_NEON(128, 128)
+
+// 12-bit: blocks of more than 1024 pixels use the 64-bit (XLARGE) variant.
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 128)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit: additional 1:4 and 4:1 block sizes.
+HBD_VARIANCE_WXH_8_NEON(4, 16)
+
+HBD_VARIANCE_WXH_8_NEON(8, 32)
+
+HBD_VARIANCE_WXH_8_NEON(16, 4)
+HBD_VARIANCE_WXH_8_NEON(16, 64)
+
+HBD_VARIANCE_WXH_8_NEON(32, 8)
+
+HBD_VARIANCE_WXH_8_NEON(64, 16)
+
+// 10-bit: additional 1:4 and 4:1 block sizes.
+HBD_VARIANCE_WXH_10_NEON(4, 16)
+
+HBD_VARIANCE_WXH_10_NEON(8, 32)
+
+HBD_VARIANCE_WXH_10_NEON(16, 4)
+HBD_VARIANCE_WXH_10_NEON(16, 64)
+
+HBD_VARIANCE_WXH_10_NEON(32, 8)
+
+HBD_VARIANCE_WXH_10_NEON(64, 16)
+
+// 12-bit: additional 1:4 and 4:1 block sizes (none exceeds 1024 pixels).
+HBD_VARIANCE_WXH_12_NEON(4, 16)
+
+HBD_VARIANCE_WXH_12_NEON(8, 32)
+
+HBD_VARIANCE_WXH_12_NEON(16, 4)
+HBD_VARIANCE_WXH_12_NEON(16, 64)
+
+HBD_VARIANCE_WXH_12_NEON(32, 8)
+
+HBD_VARIANCE_WXH_12_NEON(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+// Sum of squared absolute differences between a w x h source block and
+// reference block, kept in two uint32x4 accumulators. Rows are walked 8 pixels
+// at a time, so 'w' is expected to be a multiple of 8. The total is stored to
+// '*sse' and also returned.
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           unsigned int *sse) {
+  uint32x4_t sq_acc_lo = vdupq_n_u32(0);
+  uint32x4_t sq_acc_hi = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t src_px = vld1q_u16(src_ptr + col);
+      const uint16x8_t ref_px = vld1q_u16(ref_ptr + col);
+      const uint16x8_t abs_diff = vabdq_u16(src_px, ref_px);
+      const uint16x4_t abs_lo = vget_low_u16(abs_diff);
+      const uint16x4_t abs_hi = vget_high_u16(abs_diff);
+
+      sq_acc_lo = vmlal_u16(sq_acc_lo, abs_lo, abs_lo);
+      sq_acc_hi = vmlal_u16(sq_acc_hi, abs_hi, abs_hi);
+
+      col += 8;
+    } while (col < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  const uint32x4_t sq_total = vaddq_u32(sq_acc_lo, sq_acc_hi);
+  *sse = horizontal_add_u32x4(sq_total);
+  return *sse;
+}
+
+// Defines aom_highbd_{8,10,12}_mse<w>x<h>_neon(). All three call
+// highbd_mse_wxh_neon; the 10-bit and 12-bit versions then scale '*sse' with
+// ROUND_POWER_OF_TWO(…, 4) and ROUND_POWER_OF_TWO(…, 8) respectively. Each
+// function returns the value it leaves in '*sse'.
+#define HIGHBD_MSE_WXH_NEON(w, h)                                          \
+  uint32_t aom_highbd_8_mse##w##x##h##_neon(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                  \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                  \
+    return highbd_mse_wxh_neon(src16, src_stride, ref16, ref_stride, w, h, \
+                               sse);                                       \
+  }                                                                        \
+                                                                           \
+  uint32_t aom_highbd_10_mse##w##x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                  \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                  \
+    highbd_mse_wxh_neon(src16, src_stride, ref16, ref_stride, w, h, sse);  \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                    \
+    return *sse;                                                           \
+  }                                                                        \
+                                                                           \
+  uint32_t aom_highbd_12_mse##w##x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                  \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                  \
+    highbd_mse_wxh_neon(src16, src_stride, ref16, ref_stride, w, h, sse);  \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                    \
+    return *sse;                                                           \
+  }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
+
+// Adds the squared absolute differences of two 8-element rows (s0 vs d0 and
+// s1 vs d1) to the 64-bit accumulator 'sum' and returns the updated
+// accumulator. The 16 squares are first gathered in one uint32x4 vector and
+// then pairwise-added into the 64-bit lanes.
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+                                                uint16x8_t s1, uint16x8_t d0,
+                                                uint16x8_t d1) {
+  const uint16x8_t abs_row0 = vabdq_u16(s0, d0);
+  const uint16x8_t abs_row1 = vabdq_u16(s1, d1);
+
+  const uint16x4_t row0_lo = vget_low_u16(abs_row0);
+  const uint16x4_t row0_hi = vget_high_u16(abs_row0);
+  const uint16x4_t row1_lo = vget_low_u16(abs_row1);
+  const uint16x4_t row1_hi = vget_high_u16(abs_row1);
+
+  uint32x4_t squares = vmull_u16(row0_lo, row0_lo);
+  squares = vmlal_u16(squares, row0_hi, row0_hi);
+  squares = vmlal_u16(squares, row1_lo, row1_lo);
+  squares = vmlal_u16(squares, row1_hi, row1_hi);
+
+  return vpadalq_u32(sum, squares);
+}
+
+// Returns the sum of squared differences between the w x h blocks at 'dst'
+// and 'src' (16-bit samples). Only w and h values of 4 or 8 are supported, as
+// asserted below. Each iteration feeds 16 samples per block to
+// mse_accumulate_u16_8x2: two rows when w == 8, four rows when w == 4.
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t acc = vdupq_n_u64(0);
+
+  if (w != 8) {
+    // w == 4: pack two 4-wide rows into each vector.
+    do {
+      const uint16x8_t dst_rows01 = load_unaligned_u16_4x2(dst, dstride);
+      const uint16x8_t dst_rows23 =
+          load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+      const uint16x8_t src_rows01 = load_unaligned_u16_4x2(src, sstride);
+      const uint16x8_t src_rows23 =
+          load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      acc = mse_accumulate_u16_8x2(acc, src_rows01, src_rows23, dst_rows01,
+                                   dst_rows23);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    // w == 8: one full row per vector.
+    do {
+      const uint16x8_t dst_row0 = vld1q_u16(dst);
+      const uint16x8_t dst_row1 = vld1q_u16(dst + dstride);
+      const uint16x8_t src_row0 = vld1q_u16(src);
+      const uint16x8_t src_row1 = vld1q_u16(src + sstride);
+
+      acc = mse_accumulate_u16_8x2(acc, src_row0, src_row1, dst_row0,
+                                   dst_row1);
+
+      dst += 2 * dstride;
+      src += 2 * sstride;
+      h -= 2;
+    } while (h != 0);
+  }
+
+  return horizontal_add_u64x2(acc);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..d56ae97571
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Sum of squared errors for an 8-wide block using the dot-product
+// instruction. Two rows per iteration are narrowed to bytes with vmovn_u16
+// and packed into one 16-byte vector, so the result is only exact when every
+// sample fits in 8 bits, and 'h' must be a non-zero multiple of 2. The total
+// is stored to '*sse' and also returned.
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h,
+                                                    unsigned int *sse) {
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int row_pairs_left = h / 2;
+  do {
+    const uint16x8_t src_row0 = vld1q_u16(src_ptr);
+    const uint16x8_t src_row1 = vld1q_u16(src_ptr + src_stride);
+    const uint16x8_t ref_row0 = vld1q_u16(ref_ptr);
+    const uint16x8_t ref_row1 = vld1q_u16(ref_ptr + ref_stride);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+
+    const uint8x16_t src_bytes =
+        vcombine_u8(vmovn_u16(src_row0), vmovn_u16(src_row1));
+    const uint8x16_t ref_bytes =
+        vcombine_u8(vmovn_u16(ref_row0), vmovn_u16(ref_row1));
+    const uint8x16_t abs_diff = vabdq_u8(src_bytes, ref_bytes);
+
+    sq_acc = vdotq_u32(sq_acc, abs_diff, abs_diff);
+  } while (--row_pairs_left != 0);
+
+  *sse = horizontal_add_u32x4(sq_acc);
+  return *sse;
+}
+
+// Sum of squared errors for a 16-wide block using the dot-product
+// instruction. Each row is narrowed to bytes with vmovn_u16 and packed into
+// one 16-byte vector, so the result is only exact when every sample fits in
+// 8 bits. The total is stored to '*sse' and also returned.
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h,
+                                                     unsigned int *sse) {
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint16x8_t src_left = vld1q_u16(src_ptr);
+    const uint16x8_t src_right = vld1q_u16(src_ptr + 8);
+    const uint16x8_t ref_left = vld1q_u16(ref_ptr);
+    const uint16x8_t ref_right = vld1q_u16(ref_ptr + 8);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    const uint8x16_t src_bytes =
+        vcombine_u8(vmovn_u16(src_left), vmovn_u16(src_right));
+    const uint8x16_t ref_bytes =
+        vcombine_u8(vmovn_u16(ref_left), vmovn_u16(ref_right));
+    const uint8x16_t abs_diff = vabdq_u8(src_bytes, ref_bytes);
+
+    sq_acc = vdotq_u32(sq_acc, abs_diff, abs_diff);
+  } while (--rows_left != 0);
+
+  *sse = horizontal_add_u32x4(sq_acc);
+  return *sse;
+}
+
+// Defines aom_highbd_8_mse<w>x<h>_neon_dotprod(), which converts the byte
+// pointers with CONVERT_TO_SHORTPTR and forwards to the width-specific
+// dot-product helper. The helper stores the result in '*sse' and returns it.
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                  \
+  uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                  \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                  \
+    return highbd_mse8_##w##xh_neon_dotprod(src16, src_stride, ref16,      \
+                                            ref_stride, h, sse);           \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..d0058bfa90
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/variance.h"
+
+// Width-4 variance kernel. Every iteration consumes two rows, which
+// load_unaligned_u16_4x2 packs into a single 8-lane vector. Differences are
+// summed in 16-bit lanes and their squares accumulated in 64-bit lanes; both
+// totals are reduced to scalars once the loop finishes. h drops by two per
+// iteration and the loop ends when it reaches zero.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int64x2_t sq_acc = vdupq_n_s64(0);
+  int16x8_t diff_acc = vdupq_n_s16(0);
+
+  do {
+    const uint16x8_t src_rows = load_unaligned_u16_4x2(src_ptr, src_stride);
+    const uint16x8_t ref_rows = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+
+    // The unsigned subtraction wraps modulo 2^16; viewing the bits as signed
+    // gives the signed difference as long as it fits in 16 bits.
+    const int16x8_t d = vreinterpretq_s16_u16(vsubq_u16(src_rows, ref_rows));
+    diff_acc = vaddq_s16(diff_acc, d);
+    sq_acc = aom_sdotq_s16(sq_acc, d, d);
+
+    h -= 2;
+  } while (h != 0);
+
+  *sum = vaddlvq_s16(diff_acc);
+  *sse = vaddvq_s64(sq_acc);
+}
+
+// Folds one row of eight pixels into the running totals: the signed
+// differences are pairwise-added into the 32-bit lanes of *sum and their
+// squares are accumulated into the 64-bit lanes of *sse.
+static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
+                                    int32x4_t *sum, int64x2_t *sse) {
+  const int16x8_t d =
+      vreinterpretq_s16_u16(vsubq_u16(vld1q_u16(src), vld1q_u16(ref)));
+
+  *sum = vpadalq_s16(*sum, d);
+  *sse = aom_sdotq_s16(*sse, d, d);
+}
+
+// Width-8 variance kernel: feeds one row per iteration to variance_8x1_sve
+// and reduces the vector accumulators to the scalar *sum and *sse at the end.
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int64x2_t sq_acc = vdupq_n_s64(0);
+  int32x4_t diff_acc = vdupq_n_s32(0);
+  int rows_left = h;
+
+  do {
+    variance_8x1_sve(src_ptr, ref_ptr, &diff_acc, &sq_acc);
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  } while (--rows_left != 0);
+
+  *sum = vaddlvq_s32(diff_acc);
+  *sse = vaddvq_s64(sq_acc);
+}
+
+// Width-16 variance kernel: each row is handled as two 8-pixel halves, each
+// with its own accumulator pair. The halves are merged just before the final
+// horizontal reduction into *sum and *sse.
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int32x4_t diff_lo = vdupq_n_s32(0);
+  int32x4_t diff_hi = vdupq_n_s32(0);
+  int64x2_t sq_lo = vdupq_n_s64(0);
+  int64x2_t sq_hi = vdupq_n_s64(0);
+  int rows_left = h;
+
+  do {
+    variance_8x1_sve(src_ptr, ref_ptr, &diff_lo, &sq_lo);
+    variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &diff_hi, &sq_hi);
+
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  } while (--rows_left != 0);
+
+  const int32x4_t diff_all = vaddq_s32(diff_lo, diff_hi);
+  const int64x2_t sq_all = vaddq_s64(sq_lo, sq_hi);
+  *sum = vaddlvq_s32(diff_all);
+  *sse = vaddvq_s64(sq_all);
+}
+
+// Variance kernel for wide blocks. Each row is walked in strides of 32
+// pixels; within a stride the four 8-pixel groups feed four independent
+// accumulator pairs. The inner loop always runs at least once and repeats
+// while the column offset is below w. The accumulators are merged pairwise
+// and reduced to the scalar *sum and *sse at the end.
+static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int w, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  int32x4_t diff_acc[4];
+  int64x2_t sq_acc[4];
+  for (int k = 0; k < 4; ++k) {
+    diff_acc[k] = vdupq_n_s32(0);
+    sq_acc[k] = vdupq_n_s64(0);
+  }
+
+  do {
+    int col = 0;
+    do {
+      for (int k = 0; k < 4; ++k) {
+        const int offset = col + 8 * k;
+        variance_8x1_sve(src_ptr + offset, ref_ptr + offset, &diff_acc[k],
+                         &sq_acc[k]);
+      }
+      col += 32;
+    } while (col < w);
+
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  } while (--h != 0);
+
+  const int32x4_t diff_01 = vaddq_s32(diff_acc[0], diff_acc[1]);
+  const int32x4_t diff_23 = vaddq_s32(diff_acc[2], diff_acc[3]);
+  *sum = vaddlvq_s32(vaddq_s32(diff_01, diff_23));
+
+  const int64x2_t sq_01 = vaddq_s64(sq_acc[0], sq_acc[1]);
+  const int64x2_t sq_23 = vaddq_s64(sq_acc[2], sq_acc[3]);
+  *sse = vaddvq_s64(vaddq_s64(sq_01, sq_23));
+}
+
+// Fixed-width entry point: forwards to highbd_variance_large_sve with w = 32.
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  const int block_width = 32;
+  highbd_variance_large_sve(src, src_stride, ref, ref_stride, block_width, h,
+                            sse, sum);
+}
+
+// Fixed-width entry point: forwards to highbd_variance_large_sve with w = 64.
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  const int block_width = 64;
+  highbd_variance_large_sve(src, src_stride, ref, ref_stride, block_width, h,
+                            sse, sum);
+}
+
+// Fixed-width entry point: forwards to highbd_variance_large_sve with w = 128.
+static INLINE void highbd_variance_128xh_sve(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  const int block_width = 128;
+  highbd_variance_large_sve(src, src_stride, ref, ref_stride, block_width, h,
+                            sse, sum);
+}
+
+// Emits aom_highbd_8_variance<w>x<h>_sve(). The width-specific kernel yields
+// 64-bit sse and sum totals; both are truncated to 32 bits, *sse receives the
+// sse, and the return value is *sse - sum * sum / (w * h) in unsigned 32-bit
+// arithmetic.
+#define HBD_VARIANCE_WXH_8_SVE(w, h)                                      \
+  uint32_t aom_highbd_8_variance##w##x##h##_sve(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint64_t sse_acc = 0;                                                 \
+    int64_t sum_acc = 0;                                                  \
+    highbd_variance_##w##xh_sve(CONVERT_TO_SHORTPTR(src_ptr), src_stride, \
+                                CONVERT_TO_SHORTPTR(ref_ptr), ref_stride, \
+                                h, &sse_acc, &sum_acc);                   \
+    *sse = (uint32_t)sse_acc;                                             \
+    const int sum = (int)sum_acc;                                         \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));             \
+  }
+
+// Emits aom_highbd_10_variance<w>x<h>_sve(). The kernel's 64-bit totals are
+// scaled down with rounding (sse by 4 bits, sum by 2 bits) before the variance
+// is formed in 64-bit arithmetic; a negative variance is returned as 0.
+#define HBD_VARIANCE_WXH_10_SVE(w, h)                                       \
+  uint32_t aom_highbd_10_variance##w##x##h##_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    uint64_t sse_acc = 0;                                                   \
+    int64_t sum_acc = 0;                                                    \
+    highbd_variance_##w##xh_sve(CONVERT_TO_SHORTPTR(src_ptr), src_stride,   \
+                                CONVERT_TO_SHORTPTR(ref_ptr), ref_stride,   \
+                                h, &sse_acc, &sum_acc);                     \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_acc, 4);                        \
+    const int sum = (int)ROUND_POWER_OF_TWO(sum_acc, 2);                    \
+    const int64_t var =                                                     \
+        (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));                 \
+    if (var < 0) return 0;                                                  \
+    return (uint32_t)var;                                                   \
+  }
+
+// Emits aom_highbd_12_variance<w>x<h>_sve(). The kernel's 64-bit totals are
+// scaled down with rounding (sse by 8 bits, sum by 4 bits) before the variance
+// is formed in 64-bit arithmetic; a negative variance is returned as 0.
+#define HBD_VARIANCE_WXH_12_SVE(w, h)                                       \
+  uint32_t aom_highbd_12_variance##w##x##h##_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    uint64_t sse_acc = 0;                                                   \
+    int64_t sum_acc = 0;                                                    \
+    highbd_variance_##w##xh_sve(CONVERT_TO_SHORTPTR(src_ptr), src_stride,   \
+                                CONVERT_TO_SHORTPTR(ref_ptr), ref_stride,   \
+                                h, &sse_acc, &sum_acc);                     \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_acc, 8);                        \
+    const int sum = (int)ROUND_POWER_OF_TWO(sum_acc, 4);                    \
+    const int64_t var =                                                     \
+        (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));                 \
+    if (var < 0) return 0;                                                  \
+    return (uint32_t)var;                                                   \
+  }
+
+// 8-bit variance functions: block sizes emitted in every build configuration.
+HBD_VARIANCE_WXH_8_SVE(4, 4)
+HBD_VARIANCE_WXH_8_SVE(4, 8)
+
+HBD_VARIANCE_WXH_8_SVE(8, 4)
+HBD_VARIANCE_WXH_8_SVE(8, 8)
+HBD_VARIANCE_WXH_8_SVE(8, 16)
+
+HBD_VARIANCE_WXH_8_SVE(16, 8)
+HBD_VARIANCE_WXH_8_SVE(16, 16)
+HBD_VARIANCE_WXH_8_SVE(16, 32)
+
+HBD_VARIANCE_WXH_8_SVE(32, 16)
+HBD_VARIANCE_WXH_8_SVE(32, 32)
+HBD_VARIANCE_WXH_8_SVE(32, 64)
+
+HBD_VARIANCE_WXH_8_SVE(64, 32)
+HBD_VARIANCE_WXH_8_SVE(64, 64)
+HBD_VARIANCE_WXH_8_SVE(64, 128)
+
+HBD_VARIANCE_WXH_8_SVE(128, 64)
+HBD_VARIANCE_WXH_8_SVE(128, 128)
+
+// 10-bit variance functions: block sizes emitted in every build configuration.
+HBD_VARIANCE_WXH_10_SVE(4, 4)
+HBD_VARIANCE_WXH_10_SVE(4, 8)
+
+HBD_VARIANCE_WXH_10_SVE(8, 4)
+HBD_VARIANCE_WXH_10_SVE(8, 8)
+HBD_VARIANCE_WXH_10_SVE(8, 16)
+
+HBD_VARIANCE_WXH_10_SVE(16, 8)
+HBD_VARIANCE_WXH_10_SVE(16, 16)
+HBD_VARIANCE_WXH_10_SVE(16, 32)
+
+HBD_VARIANCE_WXH_10_SVE(32, 16)
+HBD_VARIANCE_WXH_10_SVE(32, 32)
+HBD_VARIANCE_WXH_10_SVE(32, 64)
+
+HBD_VARIANCE_WXH_10_SVE(64, 32)
+HBD_VARIANCE_WXH_10_SVE(64, 64)
+HBD_VARIANCE_WXH_10_SVE(64, 128)
+
+HBD_VARIANCE_WXH_10_SVE(128, 64)
+HBD_VARIANCE_WXH_10_SVE(128, 128)
+
+// 12-bit variance functions: block sizes emitted in every build configuration.
+HBD_VARIANCE_WXH_12_SVE(4, 4)
+HBD_VARIANCE_WXH_12_SVE(4, 8)
+
+HBD_VARIANCE_WXH_12_SVE(8, 4)
+HBD_VARIANCE_WXH_12_SVE(8, 8)
+HBD_VARIANCE_WXH_12_SVE(8, 16)
+
+HBD_VARIANCE_WXH_12_SVE(16, 8)
+HBD_VARIANCE_WXH_12_SVE(16, 16)
+HBD_VARIANCE_WXH_12_SVE(16, 32)
+
+HBD_VARIANCE_WXH_12_SVE(32, 16)
+HBD_VARIANCE_WXH_12_SVE(32, 32)
+HBD_VARIANCE_WXH_12_SVE(32, 64)
+
+HBD_VARIANCE_WXH_12_SVE(64, 32)
+HBD_VARIANCE_WXH_12_SVE(64, 64)
+HBD_VARIANCE_WXH_12_SVE(64, 128)
+
+HBD_VARIANCE_WXH_12_SVE(128, 64)
+HBD_VARIANCE_WXH_12_SVE(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit: additional block sizes, compiled out when CONFIG_REALTIME_ONLY is set.
+HBD_VARIANCE_WXH_8_SVE(4, 16)
+
+HBD_VARIANCE_WXH_8_SVE(8, 32)
+
+HBD_VARIANCE_WXH_8_SVE(16, 4)
+HBD_VARIANCE_WXH_8_SVE(16, 64)
+
+HBD_VARIANCE_WXH_8_SVE(32, 8)
+
+HBD_VARIANCE_WXH_8_SVE(64, 16)
+
+// 10-bit: additional block sizes, compiled out when CONFIG_REALTIME_ONLY is set.
+HBD_VARIANCE_WXH_10_SVE(4, 16)
+
+HBD_VARIANCE_WXH_10_SVE(8, 32)
+
+HBD_VARIANCE_WXH_10_SVE(16, 4)
+HBD_VARIANCE_WXH_10_SVE(16, 64)
+
+HBD_VARIANCE_WXH_10_SVE(32, 8)
+
+HBD_VARIANCE_WXH_10_SVE(64, 16)
+
+// 12-bit: additional block sizes, compiled out when CONFIG_REALTIME_ONLY is set.
+HBD_VARIANCE_WXH_12_SVE(4, 16)
+
+HBD_VARIANCE_WXH_12_SVE(8, 32)
+
+HBD_VARIANCE_WXH_12_SVE(16, 4)
+HBD_VARIANCE_WXH_12_SVE(16, 64)
+
+HBD_VARIANCE_WXH_12_SVE(32, 8)
+
+HBD_VARIANCE_WXH_12_SVE(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef HBD_VARIANCE_WXH_8_SVE
+#undef HBD_VARIANCE_WXH_10_SVE
+#undef HBD_VARIANCE_WXH_12_SVE
+
+// Sum of squared differences over a w x h block, eight pixels per step. The
+// squared absolute differences are accumulated in 64-bit lanes; the reduced
+// total is truncated to 32 bits, stored in *sse and returned. The column loop
+// always runs once and repeats while the offset is below w.
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+                                          int src_stride,
+                                          const uint16_t *ref_ptr,
+                                          int ref_stride, int w, int h,
+                                          unsigned int *sse) {
+  uint64x2_t acc = vdupq_n_u64(0);
+
+  do {
+    int col = 0;
+    do {
+      const uint16x8_t abs_diff =
+          vabdq_u16(vld1q_u16(src_ptr + col), vld1q_u16(ref_ptr + col));
+      acc = aom_udotq_u16(acc, abs_diff, abs_diff);
+      col += 8;
+    } while (col < w);
+
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  } while (--h != 0);
+
+  const uint32_t total = (uint32_t)vaddvq_u64(acc);
+  *sse = total;
+  return total;
+}
+
+// Emits the 8-, 10- and 12-bit aom_highbd_*_mse<w>x<h>_sve() entry points.
+// All three call highbd_mse_wxh_sve(), which leaves the raw total in *sse.
+// The 10-bit variant then rounds *sse down by 4 bits and the 12-bit variant
+// by 8 bits; each function returns the final *sse.
+#define HIGHBD_MSE_WXH_SVE(w, h)                                          \
+  uint32_t aom_highbd_8_mse##w##x##h##_sve(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    highbd_mse_wxh_sve(src16, src_stride, ref16, ref_stride, w, h, sse);  \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t aom_highbd_10_mse##w##x##h##_sve(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    highbd_mse_wxh_sve(src16, src_stride, ref16, ref_stride, w, h, sse);  \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                   \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t aom_highbd_12_mse##w##x##h##_sve(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);                 \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_ptr);                 \
+    highbd_mse_wxh_sve(src16, src_stride, ref16, ref_stride, w, h, sse);  \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                   \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_SVE(8, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(16, 16)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Returns the sum of squared differences between dst and src for a block
+// whose width and height are each 4 or 8 (asserted). Width 8 consumes two
+// rows per iteration; width 4 consumes four rows per iteration, packed two at
+// a time by load_unaligned_u16_4x2.
+uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src,
+                                      int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t acc = vdupq_n_u64(0);
+
+  if (w != 8) {
+    // w == 4
+    do {
+      const uint16x8_t dst_01 = load_unaligned_u16_4x2(dst, dstride);
+      const uint16x8_t dst_23 =
+          load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+      const uint16x8_t src_01 = load_unaligned_u16_4x2(src, sstride);
+      const uint16x8_t src_23 =
+          load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      const uint16x8_t err_01 = vabdq_u16(src_01, dst_01);
+      const uint16x8_t err_23 = vabdq_u16(src_23, dst_23);
+      acc = aom_udotq_u16(acc, err_01, err_01);
+      acc = aom_udotq_u16(acc, err_23, err_23);
+
+      src += 4 * sstride;
+      dst += 4 * dstride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint16x8_t err_0 = vabdq_u16(vld1q_u16(src), vld1q_u16(dst));
+      const uint16x8_t err_1 =
+          vabdq_u16(vld1q_u16(src + sstride), vld1q_u16(dst + dstride));
+      acc = aom_udotq_u16(acc, err_0, err_0);
+      acc = aom_udotq_u16(acc, err_1, err_1);
+
+      src += 2 * sstride;
+      dst += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+
+  return vaddvq_u64(acc);
+}
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..d8dc60c1fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,3110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/reinterpret_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// Loads an edge with load_u8_4x1 and reduces it with one widening and one
+// plain pairwise add. The result sits in the low half of the returned vector;
+// the high half is zero.
+static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
+  const uint16x4_t pairs = vpaddl_u8(load_u8_4x1(in));
+  const uint16x4_t total = vpadd_u16(pairs, pairs);
+  const uint16x4_t zeros = vdup_n_u16(0);
+  return vcombine_u16(total, zeros);
+}
+
+// Writes dc to h consecutive rows via store_u8_4x1, stepping by stride.
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x8_t dc) {
+  uint8_t *row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    store_u8_4x1(row, dc);
+    row += stride;
+  }
+}
+
+// DC prediction for 4x4: adds the above and left edge sums, divides by 8
+// with a rounding shift by 3, and fills the block with the result.
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t total =
+      vaddq_u16(dc_load_sum_4(left), dc_load_sum_4(above));
+  const uint8x8_t avg = vrshrn_n_u16(total, 3);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(avg, 0));
+}
+
+// DC-left prediction for 4x4: only the left edge is read. Its sum is divided
+// by 4 with a rounding shift by 2 and broadcast over the block.
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_4(left), 2);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(avg, 0));
+}
+
+// DC-top prediction for 4x4: only the above edge is read. Its sum is divided
+// by 4 with a rounding shift by 2 and broadcast over the block.
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_4(above), 2);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(avg, 0));
+}
+
+// DC-128 prediction for 4x4: fills the block with the constant 0x80 and
+// reads neither edge.
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  (void)above;
+  dc_store_4xh(dst, stride, 4, vdup_n_u8(0x80));
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// Loads eight edge pixels and fully reduces them. Used only by the
+// single-edge predictors: when both above and left are needed the reduction
+// is done once on the combined vector instead of twice. The total ends up in
+// every lane of the low half; the high half is zero.
+static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
+  uint16x4_t partial = vpaddl_u8(vld1_u8(in));
+  partial = vpadd_u16(partial, partial);
+  partial = vpadd_u16(partial, partial);
+  return vcombine_u16(partial, vdup_n_u16(0));
+}
+
+// Sums the eight lanes of a and returns the total replicated in every lane.
+static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
+#if !AOM_ARCH_AARCH64
+  // Fold the two halves together, then pairwise-add twice so the total lands
+  // in every lane before the halves are recombined.
+  const uint16x4_t folded = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+  const uint16x4_t pairs = vpadd_u16(folded, folded);
+  const uint16x4_t total = vpadd_u16(pairs, pairs);
+  return vcombine_u16(total, total);
+#else
+  // vdupq_n_u16(vaddvq_u16(a)) would save an instruction on AArch64, but addv
+  // is usually slightly more expensive than a pairwise add, and having to
+  // broadcast its result again seems to cancel any gain.
+  const uint16x8_t step1 = vpaddq_u16(a, a);
+  const uint16x8_t step2 = vpaddq_u16(step1, step1);
+  return vpaddq_u16(step2, step2);
+#endif
+}
+
+// Writes the 8-byte vector dc to h consecutive rows, stepping by stride.
+static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x8_t dc) {
+  uint8_t *row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    vst1_u8(row, dc);
+    row += stride;
+  }
+}
+
+// DC prediction for 8x8: widening-adds the above and left edges, reduces the
+// combined vector once, divides by 16 with a rounding shift by 4, and fills
+// the block with the result.
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges = vaddl_u8(vld1_u8(left), vld1_u8(above));
+  const uint16x8_t total = horizontal_add_and_broadcast_u16x8(edges);
+  const uint8x8_t avg = vrshrn_n_u16(total, 4);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(avg, 0));
+}
+
+// DC-left prediction for 8x8: only the left edge is read. Its sum is divided
+// by 8 with a rounding shift by 3 and broadcast over the block.
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_8(left), 3);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(avg, 0));
+}
+
+// DC-top prediction for 8x8: only the above edge is read. Its sum is divided
+// by 8 with a rounding shift by 3 and broadcast over the block.
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_8(above), 3);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(avg, 0));
+}
+
+// DC-128 prediction for 8x8: fills the block with the constant 0x80 and
+// reads neither edge.
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  (void)above;
+  dc_store_8xh(dst, stride, 8, vdup_n_u8(0x80));
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// Loads 16 edge pixels and pairwise-adds them into eight 16-bit lanes. The
+// rest of the reduction is left to horizontal_add_and_broadcast_u16x8 so that
+// it runs once, not twice, when both above and left are being summed.
+static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
+  const uint8x16_t edge = vld1q_u8(in);
+  const uint16x8_t pair_sums = vpaddlq_u8(edge);
+  return pair_sums;
+}
+
+// Full sum of 16 edge pixels, replicated in every lane of the result.
+static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+  const uint16x8_t partial = dc_load_partial_sum_16(in);
+  return horizontal_add_and_broadcast_u16x8(partial);
+}
+
+// Writes the 16-byte vector dc to h consecutive rows, stepping by stride.
+static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  uint8_t *row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    vst1q_u8(row, dc);
+    row += stride;
+  }
+}
+
+// DC prediction for 16x16: adds the partial sums of both edges, reduces once,
+// divides by 32 with a rounding shift by 5, and fills the block.
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t partial =
+      vaddq_u16(dc_load_partial_sum_16(left), dc_load_partial_sum_16(above));
+  const uint16x8_t total = horizontal_add_and_broadcast_u16x8(partial);
+  const uint8x8_t avg = vrshrn_n_u16(total, 5);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(avg, 0));
+}
+
+// DC-left prediction for 16x16: only the left edge is read. Its sum is
+// divided by 16 with a rounding shift by 4 and broadcast over the block.
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_16(left), 4);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(avg, 0));
+}
+
+// DC-top prediction for 16x16: only the above edge is read. Its sum is
+// divided by 16 with a rounding shift by 4 and broadcast over the block.
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_16(above), 4);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(avg, 0));
+}
+
+// DC-128 prediction for 16x16: fills the block with the constant 0x80 and
+// reads neither edge.
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  (void)above;
+  dc_store_16xh(dst, stride, 16, vdupq_n_u8(0x80));
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// Loads 32 edge pixels and folds them into eight 16-bit lanes: the first 16
+// bytes are pairwise-added, then the second 16 are pairwise-accumulated on
+// top. The rest of the reduction is left to
+// horizontal_add_and_broadcast_u16x8 so that it runs once, not twice, when
+// both above and left are being summed.
+static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
+  const uint16x8_t first_half = vpaddlq_u8(vld1q_u8(in));
+  return vpadalq_u8(first_half, vld1q_u8(in + 16));
+}
+
+// Full sum of 32 edge pixels, replicated in every lane of the result.
+static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+  const uint16x8_t partial = dc_load_partial_sum_32(in);
+  return horizontal_add_and_broadcast_u16x8(partial);
+}
+
+// Writes dc twice per row (bytes 0-15 and 16-31) for h consecutive rows.
+static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  uint8_t *row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    vst1q_u8(row, dc);
+    vst1q_u8(row + 16, dc);
+    row += stride;
+  }
+}
+
+// DC prediction for 32x32: adds the partial sums of both edges, reduces once,
+// divides by 64 with a rounding shift by 6, and fills the block.
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t partial =
+      vaddq_u16(dc_load_partial_sum_32(left), dc_load_partial_sum_32(above));
+  const uint16x8_t total = horizontal_add_and_broadcast_u16x8(partial);
+  const uint8x8_t avg = vrshrn_n_u16(total, 6);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(avg, 0));
+}
+
+// DC-left prediction for 32x32: only the left edge is read. Its sum is
+// divided by 32 with a rounding shift by 5 and broadcast over the block.
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_32(left), 5);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(avg, 0));
+}
+
+// DC-top prediction for 32x32: only the above edge is read. Its sum is
+// divided by 32 with a rounding shift by 5 and broadcast over the block.
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_32(above), 5);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(avg, 0));
+}
+
+// DC-128 prediction for 32x32: fills the block with the constant 0x80 and
+// reads neither edge.
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  (void)above;
+  dc_store_32xh(dst, stride, 32, vdupq_n_u8(0x80));
+}
+
+//------------------------------------------------------------------------------
+// DC 64x64
+
+// Loads 64 edge pixels and folds them into eight 16-bit lanes. Each 32-byte
+// half is reduced with a pairwise add followed by a pairwise accumulate, and
+// the two halves are then added. The rest of the reduction is left to
+// horizontal_add_and_broadcast_u16x8 so that it runs once, not twice, when
+// both above and left are being summed.
+static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
+  const uint16x8_t lower =
+      vpadalq_u8(vpaddlq_u8(vld1q_u8(in)), vld1q_u8(in + 16));
+  const uint16x8_t upper =
+      vpadalq_u8(vpaddlq_u8(vld1q_u8(in + 32)), vld1q_u8(in + 48));
+  return vaddq_u16(lower, upper);
+}
+
+// Full sum of 64 edge pixels, replicated in every lane of the result.
+static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+  const uint16x8_t partial = dc_load_partial_sum_64(in);
+  return horizontal_add_and_broadcast_u16x8(partial);
+}
+
+// Writes dc four times per row (64 bytes) for h consecutive rows.
+static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  uint8_t *row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    for (int x = 0; x < 64; x += 16) {
+      vst1q_u8(row + x, dc);
+    }
+    row += stride;
+  }
+}
+
+// DC prediction for 64x64: adds the partial sums of both edges, reduces once,
+// divides by 128 with a rounding shift by 7, and fills the block.
+void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t partial =
+      vaddq_u16(dc_load_partial_sum_64(left), dc_load_partial_sum_64(above));
+  const uint16x8_t total = horizontal_add_and_broadcast_u16x8(partial);
+  const uint8x8_t avg = vrshrn_n_u16(total, 7);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(avg, 0));
+}
+
+// DC-left prediction for 64x64: only the left edge is read. Its sum is
+// divided by 64 with a rounding shift by 6 and broadcast over the block.
+void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_64(left), 6);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(avg, 0));
+}
+
+// DC-top prediction for 64x64: only the above edge is read. Its sum is
+// divided by 64 with a rounding shift by 6 and broadcast over the block.
+void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_64(above), 6);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(avg, 0));
+}
+
+// DC-128 prediction for 64x64: fills the block with the constant 0x80 and
+// reads neither edge.
+void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  (void)above;
+  dc_store_64xh(dst, stride, 64, vdupq_n_u8(0x80));
+}
+
+//------------------------------------------------------------------------------
+// DC rectangular cases
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+// Computes ((num >> shift1) * multiplier) >> shift2 in int arithmetic, i.e. a
+// division carried out as a shift, a multiply and another shift.
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier, int shift2) {
+  return ((num >> shift1) * multiplier) >> shift2;
+}
+
+// Turns an edge-pixel sum into the DC value for a bw x bh block. Half of
+// (bw + bh) is added as a rounding term before the shift/multiply/shift
+// division; the result is asserted to fit in 8 bits.
+static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+                                        int shift1, int multiplier) {
+  const uint32_t rounded_sum = sum + ((bw + bh) >> 1);
+  const int expected_dc = divide_using_multiply_shift(rounded_sum, shift1,
+                                                      multiplier, DC_SHIFT2);
+  assert(expected_dc < (1 << 8));
+  return expected_dc;
+}
+
+#undef DC_SHIFT2
+
+// DC prediction for 4x8: widening-adds the above and left edge vectors, sums
+// the lanes, and derives the DC value with calculate_dc_from_sum.
+// NOTE(review): presumably load_u8_4x1 leaves its unused lanes zero,
+// otherwise they would leak into the sum - confirm in mem_neon.h.
+void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges = vaddl_u8(load_u8_4x1(above), vld1_u8(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(4, 8, total, 2, DC_MULTIPLIER_1X2);
+  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc_val));
+}
+
+// DC prediction for 8x4: widening-adds the above and left edge vectors, sums
+// the lanes, and derives the DC value with calculate_dc_from_sum.
+// NOTE(review): presumably load_u8_4x1 leaves its unused lanes zero,
+// otherwise they would leak into the sum - confirm in mem_neon.h.
+void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges = vaddl_u8(vld1_u8(above), load_u8_4x1(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(8, 4, total, 2, DC_MULTIPLIER_1X2);
+  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc_val));
+}
+
+// DC prediction for 4x16: pairwise-adds the 16 left pixels into 16-bit lanes,
+// widening-adds the above vector on top, sums the lanes, and derives the DC
+// value with calculate_dc_from_sum.
+void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t left_pairs = vpaddlq_u8(vld1q_u8(left));
+  const uint16x8_t edges = vaddw_u8(left_pairs, load_u8_4x1(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(4, 16, total, 2, DC_MULTIPLIER_1X4);
+  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc_val));
+}
+
+// DC prediction for 16x4: pairwise-adds the 16 above pixels into 16-bit
+// lanes, widening-adds the left vector on top, sums the lanes, and derives
+// the DC value with calculate_dc_from_sum.
+void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t above_pairs = vpaddlq_u8(vld1q_u8(above));
+  const uint16x8_t edges = vaddw_u8(above_pairs, load_u8_4x1(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(16, 4, total, 2, DC_MULTIPLIER_1X4);
+  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 8x16: pairwise-adds the 16 left pixels into 16-bit lanes,
+// widening-adds the 8 above pixels on top, sums the lanes, and derives the DC
+// value with calculate_dc_from_sum.
+void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t left_pairs = vpaddlq_u8(vld1q_u8(left));
+  const uint16x8_t edges = vaddw_u8(left_pairs, vld1_u8(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(8, 16, total, 3, DC_MULTIPLIER_1X2);
+  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc_val));
+}
+
+// DC prediction for 16x8: pairwise-adds the 16 above pixels into 16-bit
+// lanes, widening-adds the 8 left pixels on top, sums the lanes, and derives
+// the DC value with calculate_dc_from_sum.
+void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t above_pairs = vpaddlq_u8(vld1q_u8(above));
+  const uint16x8_t edges = vaddw_u8(above_pairs, vld1_u8(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(16, 8, total, 3, DC_MULTIPLIER_1X2);
+  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 8x32: widening-adds the 8 above pixels onto the partial
+// sum of the 32 left pixels, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddw_u8(dc_load_partial_sum_32(left), vld1_u8(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(8, 32, total, 3, DC_MULTIPLIER_1X4);
+  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc_val));
+}
+
+// DC prediction for 32x8: widening-adds the 8 left pixels onto the partial
+// sum of the 32 above pixels, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddw_u8(dc_load_partial_sum_32(above), vld1_u8(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(32, 8, total, 3, DC_MULTIPLIER_1X4);
+  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 16x32: adds the partial sums of the left (32) and above
+// (16) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_32(left), dc_load_partial_sum_16(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(16, 32, total, 4, DC_MULTIPLIER_1X2);
+  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 32x16: adds the partial sums of the left (16) and above
+// (32) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_16(left), dc_load_partial_sum_32(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(32, 16, total, 4, DC_MULTIPLIER_1X2);
+  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 16x64: adds the partial sums of the left (64) and above
+// (16) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_64(left), dc_load_partial_sum_16(above));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(16, 64, total, 4, DC_MULTIPLIER_1X4);
+  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 64x16: adds the partial sums of the above (64) and left
+// (16) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_64(above), dc_load_partial_sum_16(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(64, 16, total, 4, DC_MULTIPLIER_1X4);
+  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 32x64: adds the partial sums of the above (32) and left
+// (64) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_32(above), dc_load_partial_sum_64(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(32, 64, total, 5, DC_MULTIPLIER_1X2);
+  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc_val));
+}
+
+// DC prediction for 64x32: adds the partial sums of the above (64) and left
+// (32) edges, sums the lanes, and derives the DC value with
+// calculate_dc_from_sum.
+void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t edges =
+      vaddq_u16(dc_load_partial_sum_64(above), dc_load_partial_sum_32(left));
+  const uint32_t total = horizontal_add_u16x8(edges);
+  const uint32_t dc_val =
+      calculate_dc_from_sum(64, 32, total, 5, DC_MULTIPLIER_1X2);
+  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc_val));
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+// Emits aom_dc_128_predictor_<w>x<h>_neon() for a rectangular block: both
+// edges are ignored and the block is filled with 0x80. q is empty to select
+// the 64-bit vdup_n_u8 or "q" to select the 128-bit vdupq_n_u8.
+#define DC_PREDICTOR_128(w, h, q)                                            \
+  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                             const uint8_t *above,           \
+                                             const uint8_t *left) {          \
+    (void)left;                                                              \
+    (void)above;                                                             \
+    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
+  }
+
+DC_PREDICTOR_128(4, 8, )
+DC_PREDICTOR_128(4, 16, )
+DC_PREDICTOR_128(8, 4, )
+DC_PREDICTOR_128(8, 16, )
+DC_PREDICTOR_128(8, 32, )
+DC_PREDICTOR_128(16, 4, q)
+DC_PREDICTOR_128(16, 8, q)
+DC_PREDICTOR_128(16, 32, q)
+DC_PREDICTOR_128(16, 64, q)
+DC_PREDICTOR_128(32, 8, q)
+DC_PREDICTOR_128(32, 16, q)
+DC_PREDICTOR_128(32, 64, q)
+DC_PREDICTOR_128(64, 16, q)
+DC_PREDICTOR_128(64, 32, q)
+
+#undef DC_PREDICTOR_128
+
+// Emits aom_dc_left_predictor_<w>x<h>_neon() for a rectangular block: the h
+// left-edge pixels are summed with dc_load_sum_<h>, divided with a rounding
+// shift by `shift`, and the result is broadcast over the block. Every
+// instantiation below passes shift = log2(h). q is empty or "q" to select the
+// 64-bit or 128-bit broadcast.
+#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
+  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                              const uint8_t *above,           \
+                                              const uint8_t *left) {          \
+    (void)above;                                                              \
+    const uint8x8_t avg = vrshrn_n_u16(dc_load_sum_##h(left), (shift));       \
+    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(avg, 0));            \
+  }
+
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(8, 4, 2, )
+DC_PREDICTOR_LEFT(8, 16, 4, )
+DC_PREDICTOR_LEFT(8, 32, 5, )
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+
+#undef DC_PREDICTOR_LEFT
+
+// Generates aom_dc_top_predictor_<bw>x<bh>_neon(). The generated function
+// ignores `left`, sums the top row with dc_load_sum_<bw>(), applies a
+// rounding right shift by `shift_bits` while narrowing to 8 bits, and stores
+// lane 0 of the result to every pixel through dc_store_<bw>xh(). In every
+// instantiation below `shift_bits` equals log2(bw).
+#define DC_PREDICTOR_TOP(bw, bh, shift_bits, vq)                             \
+  void aom_dc_top_predictor_##bw##x##bh##_neon(                              \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *above,                  \
+      const uint8_t *left) {                                                 \
+    const uint16x8_t top_sum = dc_load_sum_##bw(above);                      \
+    const uint8x8_t dc_narrow = vrshrn_n_u16(top_sum, (shift_bits));         \
+    (void)left;                                                              \
+    dc_store_##bw##xh(dst, stride, (bh), vdup##vq##_lane_u8(dc_narrow, 0));  \
+  }
+
+// Shift 2: 4-wide blocks.
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+// Shift 3: 8-wide blocks.
+DC_PREDICTOR_TOP(8, 4, 3, )
+DC_PREDICTOR_TOP(8, 16, 3, )
+DC_PREDICTOR_TOP(8, 32, 3, )
+// Shift 4: 16-wide blocks.
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+// Shift 5: 32-wide blocks.
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+// Shift 6: 64-wide blocks.
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+
+// Stores the vector `d0` with store_u8_4x1() to each of `h` consecutive rows
+// starting at `dst`, rows being `stride` bytes apart.
+static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+                               uint8x8_t d0) {
+  uint8_t *row = dst;
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    store_u8_4x1(row, d0);
+    row += stride;
+  }
+}
+
+// Stores the 8 bytes of `d0` to each of `h` consecutive rows starting at
+// `dst`, rows being `stride` bytes apart.
+static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+                               uint8x8_t d0) {
+  uint8_t *row = dst;
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    vst1_u8(row, d0);
+    row += stride;
+  }
+}
+
+// Stores the 16 bytes of `d0` to each of `h` consecutive rows starting at
+// `dst`, rows being `stride` bytes apart.
+static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0) {
+  uint8_t *row = dst;
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    vst1q_u8(row, d0);
+    row += stride;
+  }
+}
+
+// Stores the 32-byte row formed by `d0` (bytes 0-15) and `d1` (bytes 16-31)
+// to each of `h` consecutive rows starting at `dst`.
+static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0, uint8x16_t d1) {
+  for (int r = 0; r < h; ++r) {
+    uint8_t *const row = dst + r * stride;
+    vst1q_u8(row, d0);
+    vst1q_u8(row + 16, d1);
+  }
+}
+
+// Stores the 64-byte row formed by `d0`..`d3` (16 bytes each, in that order)
+// to each of `h` consecutive rows starting at `dst`.
+static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
+                                uint8x16_t d3) {
+  for (int r = 0; r < h; ++r) {
+    uint8_t *const row = dst + r * stride;
+    vst1q_u8(row, d0);
+    vst1q_u8(row + 16, d1);
+    vst1q_u8(row + 32, d2);
+    vst1q_u8(row + 48, d3);
+  }
+}
+
+// Vertical prediction, 4x4: the row loaded from `above` with load_u8_4x1() is
+// replicated to 4 rows. `left` is unused.
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = load_u8_4x1(above);
+  (void)left;
+  v_store_4xh(dst, stride, 4, top_row);
+}
+
+// Vertical prediction, 8x8: the 8 bytes at `above` are replicated to 8 rows.
+// `left` is unused.
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = vld1_u8(above);
+  (void)left;
+  v_store_8xh(dst, stride, 8, top_row);
+}
+
+// Vertical prediction, 16x16: the 16 bytes at `above` are replicated to 16
+// rows. `left` is unused.
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top_row = vld1q_u8(above);
+  (void)left;
+  v_store_16xh(dst, stride, 16, top_row);
+}
+
+// Vertical prediction, 32x32: the 32 bytes at `above` are replicated to 32
+// rows. `left` is unused.
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_32xh(dst, stride, 32, vld1q_u8(above), vld1q_u8(above + 16));
+}
+
+// Vertical prediction, 4x8: the row loaded from `above` with load_u8_4x1() is
+// replicated to 8 rows. `left` is unused.
+void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = load_u8_4x1(above);
+  (void)left;
+  v_store_4xh(dst, stride, 8, top_row);
+}
+
+// Vertical prediction, 4x16: the row loaded from `above` with load_u8_4x1()
+// is replicated to 16 rows. `left` is unused.
+void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = load_u8_4x1(above);
+  (void)left;
+  v_store_4xh(dst, stride, 16, top_row);
+}
+
+// Vertical prediction, 8x4: the 8 bytes at `above` are replicated to 4 rows.
+// `left` is unused.
+void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = vld1_u8(above);
+  (void)left;
+  v_store_8xh(dst, stride, 4, top_row);
+}
+
+// Vertical prediction, 8x16: the 8 bytes at `above` are replicated to 16
+// rows. `left` is unused.
+void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = vld1_u8(above);
+  (void)left;
+  v_store_8xh(dst, stride, 16, top_row);
+}
+
+// Vertical prediction, 8x32: the 8 bytes at `above` are replicated to 32
+// rows. `left` is unused.
+void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top_row = vld1_u8(above);
+  (void)left;
+  v_store_8xh(dst, stride, 32, top_row);
+}
+
+// Vertical prediction, 16x4: the 16 bytes at `above` are replicated to 4
+// rows. `left` is unused.
+void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top_row = vld1q_u8(above);
+  (void)left;
+  v_store_16xh(dst, stride, 4, top_row);
+}
+
+// Vertical prediction, 16x8: the 16 bytes at `above` are replicated to 8
+// rows. `left` is unused.
+void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top_row = vld1q_u8(above);
+  (void)left;
+  v_store_16xh(dst, stride, 8, top_row);
+}
+
+// Vertical prediction, 16x32: the 16 bytes at `above` are replicated to 32
+// rows. `left` is unused.
+void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top_row = vld1q_u8(above);
+  (void)left;
+  v_store_16xh(dst, stride, 32, top_row);
+}
+
+// Vertical prediction, 16x64: the 16 bytes at `above` are replicated to 64
+// rows. `left` is unused.
+void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top_row = vld1q_u8(above);
+  (void)left;
+  v_store_16xh(dst, stride, 64, top_row);
+}
+
+// Vertical prediction, 32x8: the 32 bytes at `above` are replicated to 8
+// rows. `left` is unused.
+void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_32xh(dst, stride, 8, vld1q_u8(above), vld1q_u8(above + 16));
+}
+
+// Vertical prediction, 32x16: the 32 bytes at `above` are replicated to 16
+// rows. `left` is unused.
+void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_32xh(dst, stride, 16, vld1q_u8(above), vld1q_u8(above + 16));
+}
+
+// Vertical prediction, 32x64: the 32 bytes at `above` are replicated to 64
+// rows. `left` is unused.
+void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_32xh(dst, stride, 64, vld1q_u8(above), vld1q_u8(above + 16));
+}
+
+// Vertical prediction, 64x16: the 64 bytes at `above` are replicated to 16
+// rows. `left` is unused.
+void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top[4] = { vld1q_u8(above), vld1q_u8(above + 16),
+                              vld1q_u8(above + 32), vld1q_u8(above + 48) };
+  (void)left;
+  v_store_64xh(dst, stride, 16, top[0], top[1], top[2], top[3]);
+}
+
+// Vertical prediction, 64x32: the 64 bytes at `above` are replicated to 32
+// rows. `left` is unused.
+void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top[4] = { vld1q_u8(above), vld1q_u8(above + 16),
+                              vld1q_u8(above + 32), vld1q_u8(above + 48) };
+  (void)left;
+  v_store_64xh(dst, stride, 32, top[0], top[1], top[2], top[3]);
+}
+
+// Vertical prediction, 64x64: the 64 bytes at `above` are replicated to 64
+// rows. `left` is unused.
+void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top[4] = { vld1q_u8(above), vld1q_u8(above + 16),
+                              vld1q_u8(above + 32), vld1q_u8(above + 48) };
+  (void)left;
+  v_store_64xh(dst, stride, 64, top[0], top[1], top[2], top[3]);
+}
+
+// -----------------------------------------------------------------------------
+
+// Writes 8 rows with store_u8_4x1(): row r receives lane r of `d0` broadcast
+// across the vector.
+static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  store_u8_4x1(dst, vdup_lane_u8(d0, 0));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 1));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 2));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 3));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 4));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 5));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 6));
+  dst += stride;
+  store_u8_4x1(dst, vdup_lane_u8(d0, 7));
+}
+
+// Writes 8 rows of 8 bytes: row r is filled with lane r of `d0`.
+static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1_u8(dst, vdup_lane_u8(d0, 0));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 1));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 2));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 3));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 4));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 5));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 6));
+  dst += stride;
+  vst1_u8(dst, vdup_lane_u8(d0, 7));
+}
+
+// Writes 8 rows of 16 bytes: row r is filled with lane r of `d0`.
+static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1q_u8(dst, vdupq_lane_u8(d0, 0));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 1));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 2));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 3));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 4));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 5));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 6));
+  dst += stride;
+  vst1q_u8(dst, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
+}
+
+// Horizontal prediction, 4x4: row r receives lane r of the vector loaded from
+// `left` with load_u8_4x1(). `above` is unused.
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t left_col = load_u8_4x1(left);
+  uint8_t *const row1 = dst + stride;
+  uint8_t *const row2 = row1 + stride;
+  uint8_t *const row3 = row2 + stride;
+  (void)above;
+  store_u8_4x1(dst, vdup_lane_u8(left_col, 0));
+  store_u8_4x1(row1, vdup_lane_u8(left_col, 1));
+  store_u8_4x1(row2, vdup_lane_u8(left_col, 2));
+  store_u8_4x1(row3, vdup_lane_u8(left_col, 3));
+}
+
+// Horizontal prediction, 8x8: row r is filled with left[r]. `above` is
+// unused.
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_store_8x8(dst, stride, vld1_u8(left));
+}
+
+// Horizontal prediction, 16x16: the 16 left pixels are loaded once, then
+// rows 0-7 are written from the low half and rows 8-15 from the high half.
+// `above` is unused.
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col = vld1q_u8(left);
+  const uint8x8_t rows_0_7 = vget_low_u8(left_col);
+  const uint8x8_t rows_8_15 = vget_high_u8(left_col);
+  (void)above;
+  h_store_16x8(dst, stride, rows_0_7);
+  h_store_16x8(dst + 8 * stride, stride, rows_8_15);
+}
+
+// Horizontal prediction, 32x32: all 32 left pixels are loaded first, then the
+// block is written in four groups of 8 rows. `above` is unused.
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col[2] = { vld1q_u8(left), vld1q_u8(left + 16) };
+  (void)above;
+  for (int half = 0; half < 2; ++half) {
+    uint8_t *const rows = dst + (16 * half) * stride;
+    h_store_32x8(rows, stride, vget_low_u8(left_col[half]));
+    h_store_32x8(rows + 8 * stride, stride, vget_high_u8(left_col[half]));
+  }
+}
+
+// Horizontal prediction, 4x8: 8 left pixels are loaded and handed to
+// h_store_4x8(), which writes one row per lane. `above` is unused.
+void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_store_4x8(dst, stride, vld1_u8(left));
+}
+
+// Horizontal prediction, 4x16: 16 left pixels are loaded once; rows 0-7 come
+// from the low half and rows 8-15 from the high half. `above` is unused.
+void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col = vld1q_u8(left);
+  const uint8x8_t rows_0_7 = vget_low_u8(left_col);
+  const uint8x8_t rows_8_15 = vget_high_u8(left_col);
+  (void)above;
+  h_store_4x8(dst, stride, rows_0_7);
+  h_store_4x8(dst + 8 * stride, stride, rows_8_15);
+}
+
+// Horizontal prediction, 8x4: row r is filled with lane r of the vector
+// loaded from `left` with load_u8_4x1(). `above` is unused.
+void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t left_col = load_u8_4x1(left);
+  uint8_t *const row1 = dst + stride;
+  uint8_t *const row2 = row1 + stride;
+  uint8_t *const row3 = row2 + stride;
+  (void)above;
+  vst1_u8(dst, vdup_lane_u8(left_col, 0));
+  vst1_u8(row1, vdup_lane_u8(left_col, 1));
+  vst1_u8(row2, vdup_lane_u8(left_col, 2));
+  vst1_u8(row3, vdup_lane_u8(left_col, 3));
+}
+
+// Horizontal prediction, 8x16: 16 left pixels are loaded once; rows 0-7 come
+// from the low half and rows 8-15 from the high half. `above` is unused.
+void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col = vld1q_u8(left);
+  const uint8x8_t rows_0_7 = vget_low_u8(left_col);
+  const uint8x8_t rows_8_15 = vget_high_u8(left_col);
+  (void)above;
+  h_store_8x8(dst, stride, rows_0_7);
+  h_store_8x8(dst + 8 * stride, stride, rows_8_15);
+}
+
+// Horizontal prediction, 8x32: all 32 left pixels are loaded first, then the
+// block is written in four groups of 8 rows. `above` is unused.
+void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col[2] = { vld1q_u8(left), vld1q_u8(left + 16) };
+  (void)above;
+  for (int half = 0; half < 2; ++half) {
+    uint8_t *const rows = dst + (16 * half) * stride;
+    h_store_8x8(rows, stride, vget_low_u8(left_col[half]));
+    h_store_8x8(rows + 8 * stride, stride, vget_high_u8(left_col[half]));
+  }
+}
+
+// Horizontal prediction, 16x4: row r is filled with lane r of the vector
+// loaded from `left` with load_u8_4x1(). `above` is unused.
+void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t left_col = load_u8_4x1(left);
+  uint8_t *const row1 = dst + stride;
+  uint8_t *const row2 = row1 + stride;
+  uint8_t *const row3 = row2 + stride;
+  (void)above;
+  vst1q_u8(dst, vdupq_lane_u8(left_col, 0));
+  vst1q_u8(row1, vdupq_lane_u8(left_col, 1));
+  vst1q_u8(row2, vdupq_lane_u8(left_col, 2));
+  vst1q_u8(row3, vdupq_lane_u8(left_col, 3));
+}
+
+// Horizontal prediction, 16x8: row r is filled with left[r]. `above` is
+// unused.
+void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_store_16x8(dst, stride, vld1_u8(left));
+}
+
+// Horizontal prediction, 16x32: all 32 left pixels are loaded first, then the
+// block is written in four groups of 8 rows. `above` is unused.
+void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col[2] = { vld1q_u8(left), vld1q_u8(left + 16) };
+  (void)above;
+  for (int half = 0; half < 2; ++half) {
+    uint8_t *const rows = dst + (16 * half) * stride;
+    h_store_16x8(rows, stride, vget_low_u8(left_col[half]));
+    h_store_16x8(rows + 8 * stride, stride, vget_high_u8(left_col[half]));
+  }
+}
+
+// Horizontal prediction, 16x64: all 64 left pixels are loaded first, then the
+// block is written in eight groups of 8 rows. `above` is unused.
+void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col[4] = { vld1q_u8(left), vld1q_u8(left + 16),
+                                   vld1q_u8(left + 32), vld1q_u8(left + 48) };
+  (void)above;
+  for (int part = 0; part < 4; ++part) {
+    uint8_t *const rows = dst + (16 * part) * stride;
+    h_store_16x8(rows, stride, vget_low_u8(left_col[part]));
+    h_store_16x8(rows + 8 * stride, stride, vget_high_u8(left_col[part]));
+  }
+}
+
+// Horizontal prediction, 32x8: row r is filled with left[r]. `above` is
+// unused.
+void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_store_32x8(dst, stride, vld1_u8(left));
+}
+
+// Horizontal prediction, 32x16: 16 left pixels are loaded once; rows 0-7 come
+// from the low half and rows 8-15 from the high half. `above` is unused.
+void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col = vld1q_u8(left);
+  const uint8x8_t rows_0_7 = vget_low_u8(left_col);
+  const uint8x8_t rows_8_15 = vget_high_u8(left_col);
+  (void)above;
+  h_store_32x8(dst, stride, rows_0_7);
+  h_store_32x8(dst + 8 * stride, stride, rows_8_15);
+}
+
+// Horizontal prediction, 32x64: all 64 left pixels are loaded first, then the
+// block is written in eight groups of 8 rows. `above` is unused.
+void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col[4] = { vld1q_u8(left), vld1q_u8(left + 16),
+                                   vld1q_u8(left + 32), vld1q_u8(left + 48) };
+  (void)above;
+  for (int part = 0; part < 4; ++part) {
+    uint8_t *const rows = dst + (16 * part) * stride;
+    h_store_32x8(rows, stride, vget_low_u8(left_col[part]));
+    h_store_32x8(rows + 8 * stride, stride, vget_high_u8(left_col[part]));
+  }
+}
+
+// Horizontal prediction, 64x16: 16 left pixels are loaded once; rows 0-7 come
+// from the low half and rows 8-15 from the high half. `above` is unused.
+void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t left_col = vld1q_u8(left);
+  const uint8x8_t rows_0_7 = vget_low_u8(left_col);
+  const uint8x8_t rows_8_15 = vget_high_u8(left_col);
+  (void)above;
+  h_store_64x8(dst, stride, rows_0_7);
+  h_store_64x8(dst + 8 * stride, stride, rows_8_15);
+}
+
+// Horizontal prediction, 64x32: processed as two bands of 16 rows. Each band
+// loads its 16 left pixels and then writes two groups of 8 rows. `above` is
+// unused.
+void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  for (int band = 0; band < 2; ++band) {
+    const uint8x16_t left_col = vld1q_u8(left + 16 * band);
+    uint8_t *const rows = dst + (16 * band) * stride;
+    h_store_64x8(rows, stride, vget_low_u8(left_col));
+    h_store_64x8(rows + 8 * stride, stride, vget_high_u8(left_col));
+  }
+}
+
+// Horizontal prediction, 64x64: processed as four bands of 16 rows. Each band
+// loads its 16 left pixels and then writes two groups of 8 rows. `above` is
+// unused.
+void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  for (int band = 0; band < 4; ++band) {
+    const uint8x16_t left_col = vld1q_u8(left + 16 * band);
+    uint8_t *const rows = dst + (16 * band) * stride;
+    h_store_64x8(rows, stride, vget_low_u8(left_col));
+    h_store_64x8(rows + 8 * stride, stride, vget_high_u8(left_col));
+  }
+}
+
+/* ---------------------P R E D I C T I O N Z 1--------------------------- */
+
+// Low bit depth functions
+// Lane-select masks for the zone-1 kernels below: BaseMask[n] has its first
+// n bytes set to 0xff and the remaining 32 - n bytes set to 0, for n = 0..32.
+// The kernels load a row of this table and pass it to vbsl so that the first
+// n lanes keep the interpolated pixels and the rest take a fallback value.
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+// Computes W rows of zone-1 prediction into dst[], one uint8x8_t per row.
+// Row r is interpolated between neighbouring `above` samples at position
+// x = (r + 1) * dx, where x carries (6 - upsample_above) fractional bits.
+// Only the first min(H, samples remaining before max_base_x) lanes of a row
+// keep the interpolated value; the other lanes are set to above[max_base_x].
+// The callers in this file pass H = block width (4 or 8) and W = row count.
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
+ int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // For each row the following terms are formed in 16-bit lanes:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ // Fallback value for every lane at or beyond max_base_x.
+ const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ // This row and all later rows lie entirely past max_base_x.
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // whole row is the fallback value
+ }
+ return;
+ }
+
+ // Never keep more lanes than the block is wide; also bounds the BaseMask
+ // row index.
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint8x8x2_t a01_128;
+ uint16x8_t shift;
+ if (upsample_above) {
+ // De-interleaving load: val[0] = even samples, val[1] = odd samples.
+ a01_128 = vld2_u8(above + base);
+ shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
+ } else {
+ a01_128.val[0] = vld1_u8(above + base);
+ a01_128.val[1] = vld1_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ // The difference may be negative; the unsigned 16-bit arithmetic wraps and
+ // the final sum is still correct modulo 2^16.
+ uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+
+ // Keep the first base_max_diff lanes, fill the rest with the fallback.
+ uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
+ dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
+
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for a block 4 pixels wide and N rows tall.
+// The rows are first computed into a local array by
+// dr_prediction_z1_HxW_internal_neon_64() and then written to `dst`, one row
+// every `stride` bytes. The local array holds 16 rows, so N must not exceed
+// 16.
+static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t dstvec[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
+                                        dx);
+  for (int i = 0; i < N; i++) {
+    // dst + stride * i is only guaranteed to be byte-aligned, so it must not
+    // be cast to uint32_t * for vst1_lane_u32. Use store_u8_4x1(), the helper
+    // the 4-wide V and H predictors in this file already use for row stores.
+    store_u8_4x1(dst + stride * i, dstvec[i]);
+  }
+}
+
+// Zone-1 directional prediction for a block 8 pixels wide and N rows tall.
+// Rows are computed into a local array (32 entries, so N must not exceed 32)
+// and then stored to `dst`, one 8-byte row every `stride` bytes.
+static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  uint8x8_t rows[32];
+  int r;
+
+  dr_prediction_z1_HxW_internal_neon_64(8, N, rows, above, upsample_above, dx);
+  for (r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    vst1_u8(out, rows[r]);
+  }
+}
+
+// 128-bit counterpart of dr_prediction_z1_HxW_internal_neon_64(): computes W
+// rows of zone-1 prediction into dst[], one uint8x16_t per row. Row r is
+// interpolated between neighbouring `above` samples at position
+// x = (r + 1) * dx, where x carries (6 - upsample_above) fractional bits.
+// Only the first min(H, samples remaining before max_base_x) lanes of a row
+// keep the interpolated value; the other lanes are set to above[max_base_x].
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
+ int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // For each row the following terms are formed in 16-bit lanes:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ // Fallback value for every lane at or beyond max_base_x.
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ // This row and all later rows lie entirely past max_base_x.
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // whole row is the fallback value
+ }
+ return;
+ }
+
+ // Never keep more lanes than the block is wide; also bounds the BaseMask
+ // row index.
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint16x8_t shift;
+ uint8x16_t a0_128, a1_128;
+ if (upsample_above) {
+ // De-interleave 16 bytes: low half of a0_128 = even samples, high half =
+ // odd samples. a1_128 then holds the odd samples in its low half and
+ // zeros in its high half.
+ // NOTE(review): only lanes 0-7 pair an even sample with its odd
+ // neighbour here; lanes 8-15 are not a meaningful interpolation.
+ // Presumably upsampling is never requested with H > 8 - confirm against
+ // the callers.
+ uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
+ a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
+ a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
+ shift = vdupq_n_u16(x & 0x1f);
+ } else {
+ a0_128 = vld1q_u8(above + base);
+ a1_128 = vld1q_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ // Differences may be negative; the unsigned 16-bit arithmetic wraps and the
+ // final sums are still correct modulo 2^16.
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
+ // Keep the first base_max_diff lanes, fill the rest with the fallback.
+ uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
+ dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
+
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for a block 16 pixels wide and N rows tall.
+// Rows are computed into a local array (64 entries, so N must not exceed 64)
+// and then stored to `dst`, one 16-byte row every `stride` bytes.
+static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  uint8x16_t rows[64];
+  int r;
+
+  dr_prediction_z1_HxW_internal_neon(16, N, rows, above, upsample_above, dx);
+  for (r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    vst1q_u8(out, rows[r]);
+  }
+}
+
+// Computes N rows of 32-wide zone-1 prediction into dstvec[], two 16-byte
+// vectors per row. No upsampling is supported here, so x always carries 6
+// fractional bits. Row r is interpolated between neighbouring `above` samples
+// at position x = (r + 1) * dx; lanes at or beyond max_base_x are set to
+// above[max_base_x].
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
+ int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // For each row the following terms are formed in 16-bit lanes:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ // Fallback value for every lane at or beyond max_base_x.
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ // This row and all later rows lie entirely past max_base_x.
+ for (int i = r; i < N; ++i) {
+ dstvec[i].val[0] = a_mbase_x; // save 32 values
+ dstvec[i].val[1] = a_mbase_x;
+ }
+ return;
+ }
+ // At most 32 lanes per row; also bounds the BaseMask row index.
+ if (base_max_diff > 32) base_max_diff = 32;
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
+
+ // Process the row as two 16-lane halves.
+ uint8x16_t res16[2];
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ // The whole half is past max_base_x: skip the loads.
+ res16[jj] = a_mbase_x;
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+
+ res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+ }
+ }
+
+ // A 32-byte BaseMask row covers both halves: keep the first base_max_diff
+ // lanes, fill the rest with the fallback.
+ uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
+ uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
+ dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
+ dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for a block 32 pixels wide and N rows tall.
+// Rows are computed into a local array (64 entries, so N must not exceed 64)
+// and then stored to `dst` as two 16-byte halves per row.
+static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int dx) {
+  uint8x16x2_t rows[64];
+  int r;
+
+  dr_prediction_z1_32xN_internal_neon(N, rows, above, dx);
+  for (r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    vst1q_u8(out, rows[r].val[0]);
+    vst1q_u8(out + 16, rows[r].val[1]);
+  }
+}
+
+// Zone-1 directional prediction for a block 64 pixels wide and N rows tall,
+// written straight to `dst` (no intermediate row array). No upsampling is
+// supported, so x always carries 6 fractional bits. Row r is interpolated
+// between neighbouring `above` samples at position x = (r + 1) * dx; pixels
+// whose source index is at or beyond max_base_x are set to above[max_base_x].
+static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // For each row the following terms are formed in 16-bit lanes:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ // Fallback value for every pixel at or beyond max_base_x.
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+ const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ // This row and all later rows lie entirely past max_base_x.
+ for (int i = r; i < N; ++i) {
+ vst1q_u8(dst, a_mbase_x);
+ vst1q_u8(dst + 16, a_mbase_x);
+ vst1q_u8(dst + 32, a_mbase_x);
+ vst1q_u8(dst + 48, a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
+ // Per-lane source index for the first 16 pixels: base + {0, 1, ..., 15}.
+ uint8x16_t base_inc128 =
+ vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0F0E0D0C0B0A0908)));
+
+ // Process the row in four 16-pixel chunks.
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ // The whole chunk is past max_base_x: skip the loads.
+ vst1q_u8(dst + j, a_mbase_x);
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
+ // Lane mask: all-ones where the source index is below max_base_x
+ // (saturating subtract is non-zero), zero elsewhere.
+ uint8x16_t mask128 =
+ vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
+ uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
+ vst1q_u8(dst + j, res128);
+
+ // Advance the per-lane indices to the next 16-pixel chunk.
+ base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+// Dispatches on the block width `bw` to the matching width-specific kernel,
+// with `bh` as the row count. `left` and `dy` are unused, and the 32- and
+// 64-wide kernels do not take `upsample_above`. Any other width is a no-op.
+void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int dx, int dy) {
+  (void)dy;
+  (void)left;
+
+  if (bw == 4) {
+    dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 8) {
+    dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 16) {
+    dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 32) {
+    dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx);
+  } else if (bw == 64) {
+    dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx);
+  }
+}
+
+/* ---------------------P R E D I C T I O N Z 2--------------------------- */
+
+#if !AOM_ARCH_AARCH64
+// Byte masks used by the non-AArch64 path of dr_prediction_z2_NxW_left_neon.
+// Row k keeps the first 4 * (k + 1) bytes of a 16-byte vector (0xff) and
+// clears the rest (0); the row is selected there as offset_diff / 4.
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff }
+};
+#endif // !AOM_ARCH_AARCH64
+
+// Fetches the two above-edge sample vectors and the interpolation weights for
+// one row of a 4-wide zone-2 block.
+//   above, base_x  : edge pointer and starting offset for this row.
+//   upsample_above : non-zero when the above edge is upsampled.
+//   dx, y          : only their product y * dx is used (truncated to 16 bits).
+//   a0_x, a1_x     : out, the a[x] and a[x+1] samples.
+//   shift0         : out, per-lane weight in [0, 31].
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y,
+ uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
+ // Lanes are { 0, 64, 128, 192 }, i.e. the column index << 6. Every lane is a
+ // multiple of 64, so it does not affect the low bits kept by the masks below.
+ uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
+ uint16x4_t ydx = vdup_n_u16(y * dx);
+ if (upsample_above) {
+ // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
+ // only load either 16 or 32.
+ uint8x8_t v_tmp = vld1_u8(above + base_x);
+ // De-interleave: even bytes become a[x], odd bytes become a[x+1]; the upper
+ // four lanes of each output are zero.
+ *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
+ *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
+ *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ } else {
+ // load_u8_4x1 presumably loads four bytes into the low half of the vector
+ // (defined outside this chunk).
+ *a0_x = load_u8_4x1(above + base_x);
+ *a1_x = load_u8_4x1(above + base_x + 1);
+ // Halving subtract: ((r6 - ydx) >> 1) & 0x1f.
+ *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ }
+}
+
+// Fetches the two left-edge sample vectors and the interpolation weights for
+// row r of a 4-wide zone-2 block.
+//   left_vals / left : AArch64 takes a two-vector lookup table built by the
+//                      caller; other targets take the raw left pointer.
+//   a0_y, a1_y       : out, samples at base_y and base_y + 1, one per 16-bit
+//                      lane.
+//   shift1           : out, per-lane weight in [0, 31].
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
+ uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
+ int16x4_t dy64 = vdup_n_s16(dy);
+ int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
+ // Negative shift count: vshl_s16 then shifts right by frac_bits_y.
+ int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
+ int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
+ int16x4_t v_r6 = vdup_n_s16(r << 6);
+ // y_c = (r << 6) - { 1, 2, 3, 4 } * dy
+ int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
+ int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
+
+ // Values in base_y_c64 range from -2 through 14 inclusive.
+ base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
+
+#if AOM_ARCH_AARCH64
+ // The +2 / +3 bias turns base_y and base_y + 1 into non-negative table
+ // indices (the caller's table starts at left - 2).
+ uint8x8_t left_idx0 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
+ uint8x8_t left_idx1 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
+
+ // The high byte of every 16-bit index lane is 0, so the high byte of each
+ // looked-up lane holds table entry 0 rather than zero. It is the same in
+ // a0_y and a1_y.
+ *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
+ *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
+#else // !AOM_ARCH_AARCH64
+ DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+
+ // Scalar gather: spill the indices, then load one byte per even lane so each
+ // sample sits in the low byte of a zeroed 16-bit lane.
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a0_y_u8 = vdup_n_u8(0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
+
+ // Same gather again for base_y + 1.
+ base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a1_y_u8 = vdup_n_u8(0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
+
+ *a0_y = vreinterpret_u16_u8(a0_y_u8);
+ *a1_y = vreinterpret_u16_u8(a1_y_u8);
+#endif // AOM_ARCH_AARCH64
+
+ // Weight is the low five bits of y_c when upsampled, else bits 1..5.
+ if (upsample_left) {
+ *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
+ } else {
+ *shift1 =
+ vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
+ }
+}
+
+// Computes eight predicted pixels for one row of an 8-wide zone-2 block from
+// the above edge: (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5.
+//   above, base_x  : edge pointer and starting offset for this row.
+//   upsample_above : non-zero when the above edge is upsampled.
+//   dx, y          : only their product y * dx is used (truncated to 16 bits).
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ uint16x8_t ydx = vdupq_n_u16(y * dx);
+ // Lanes are { 3, 4, 5, 6, 7, 8, 0, 0 } << 6. Every lane is a multiple of 64,
+ // so r6 does not affect the low bits kept by the masks below.
+ uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
+
+ uint16x8_t shift0;
+ uint8x8_t a0_x0;
+ uint8x8_t a1_x0;
+ if (upsample_above) {
+ // De-interleaving load: even bytes are a[x], odd bytes are a[x+1].
+ uint8x8x2_t v_tmp = vld2_u8(above + base_x);
+ a0_x0 = v_tmp.val[0];
+ a1_x0 = v_tmp.val[1];
+ shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ } else {
+ a0_x0 = vld1_u8(above + base_x);
+ a1_x0 = vld1_u8(above + base_x + 1);
+ // Halving subtract: ((r6 - ydx) >> 1) & 0x1f.
+ shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ }
+
+ uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
+ return vshrn_n_u16(res, 5);
+}
+
+// Computes eight predicted pixels for row r of an 8-wide zone-2 block from the
+// left edge: (a[y] * 32 + 16 + (a[y+1] - a[y]) * shift) >> 5.
+//   left_vals / left : AArch64 takes a three-vector lookup table built by the
+//                      caller; other targets take the raw left pointer.
+//   min_base_y       : lower clamp applied to the computed edge index.
+//   frac_bits_y      : right-shift that turns y_c into an edge index.
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x3_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+ int16x8_t dy128 = vdupq_n_s16(dy);
+ // Negative shift count: vshlq_s16 then shifts right by frac_bits_y.
+ int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+ int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
+
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ // y_c = (r << 6) - { 1, ..., 8 } * dy
+ int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
+ int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
+
+ // Values in base_y_c128 range from -2 through 31 inclusive.
+ base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
+
+#if AOM_ARCH_AARCH64
+ // The +2 / +3 bias turns base_y and base_y + 1 into non-negative table
+ // indices (the caller's table starts at left - 2).
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
+ // Keep only the low byte of each 16-bit index: the low half holds the eight
+ // a0 indices and the high half the eight a1 indices.
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+ uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
+ uint8x8_t a0_x1 = vget_low_u8(a01_x);
+ uint8x8_t a1_x1 = vget_high_u8(a01_x);
+#else // !AOM_ARCH_AARCH64
+ // load_u8_gather_s16_x8 is defined outside this chunk; presumably it loads
+ // one byte per lane from base pointer + index.
+ uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
+ uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
+#endif // AOM_ARCH_AARCH64
+
+ // Weight is the low five bits of y_c when upsampled, else bits 1..5.
+ uint16x8_t shift1;
+ if (upsample_left) {
+ shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
+ } else {
+ shift1 = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
+ }
+
+ uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
+ return vshrn_n_u16(res, 5);
+}
+
+// Computes 16 predicted pixels (columns j .. j + 15) for one row of a wide
+// zone-2 block from the above edge, with no edge upsampling:
+// (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5.
+//   above, base_x : edge pointer and the row's starting offset.
+//   dx, y         : only their product y * dx is used (truncated to 16 bits).
+//   j             : first column of this 16-pixel group.
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
+ const uint8_t *above, int dx, int base_x, int y, int j) {
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8_t j256 = vdupq_n_u16(j);
+ uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
+
+ const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
+ const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
+ // (column << 6) is a multiple of 64, so after the 0x3f mask below the weight
+ // depends only on the low six bits of -(y * dx).
+ uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
+ uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
+ uint16x8_t shift0 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shift1 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
+ // a[x+1] - a[x]
+ uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
+ uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
+ // a[x] * 32 + 16
+ uint16x8_t a32_0 =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t a32_1 =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
+ uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+// Computes 16 predicted pixels (columns j .. j + 15) for row r of a wide
+// zone-2 block from the left edge, with no edge upsampling:
+// (a[y] * 32 + 16 + (a[y+1] - a[y]) * shift) >> 5.
+//   left_vals0/1 / left : AArch64 takes two four-vector lookup tables built by
+//                         the caller (one for a[y], one for a[y+1]); other
+//                         targets take the raw left pointer.
+//   dy                  : vertical step per column.
+//   r, j                : row index and first column of this 16-pixel group.
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
+#else
+ const uint8_t *left,
+#endif
+ int dy, int r, int j) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_y = -1;
+
+ int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
+ // NOTE(review): with an arithmetic shift -1 >> 1 is -1, i.e. 0xFFFF once
+ // reinterpreted as unsigned, which would make the vminq_u16 below a no-op.
+ int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
+ int16x8_t dy256 = vdupq_n_s16(dy);
+ uint16x8_t j256 = vdupq_n_u16(j);
+
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
+ vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
+
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+ // c256_{0,1}: one-based column number (j + 1 .. j + 16) of each lane.
+ int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
+ int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
+ int16x8_t mul16_lo = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ int16x8_t mul16_hi = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ // y_c = (r << 6) - column * dy
+ int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
+ int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
+
+ // Integer part of y_c is the left-edge index, clamped below at min_base_y.
+ int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
+ int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
+
+ base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
+ base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
+
+#if !AOM_ARCH_AARCH64
+ // The last column's index is taken as the minimum and the first column's as
+ // the maximum (asserted non-negative difference below).
+ int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
+ int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
+ int16_t offset_diff = max_y - min_y;
+
+ uint8x8_t a0_y0;
+ uint8x8_t a0_y1;
+ uint8x8_t a1_y0;
+ uint8x8_t a1_y1;
+ if (offset_diff < 16) {
+ // Avoid gathers where the data we want is close together in memory.
+ // We don't need this for AArch64 since we can already use TBL to cover the
+ // full range of possible values.
+ assert(offset_diff >= 0);
+ int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
+
+ // Per-lane offsets relative to min_y, narrowed to bytes for use as table
+ // indices.
+ int16x8x2_t base_y_offset;
+ base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
+ base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
+
+ int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
+ vqmovn_s16(base_y_offset.val[1]));
+
+ // One 16-byte window starting at min_y (and at min_y + 1) covers every
+ // lane; bytes beyond the selected mask row are zeroed.
+ uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
+ uint8x16_t a0_y128 = vld1q_u8(left + min_y);
+ uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
+ a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
+ a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
+
+ // Permute the windows with two 8-lane table lookups each.
+ uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8_t v_index_high =
+ vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8x2_t v_tmp, v_res;
+ v_tmp.val[0] = vget_low_u8(a0_y128);
+ v_tmp.val[1] = vget_high_u8(a0_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+ v_tmp.val[0] = vget_low_u8(a1_y128);
+ v_tmp.val[1] = vget_high_u8(a1_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+
+ a0_y0 = vget_low_u8(a0_y128);
+ a0_y1 = vget_high_u8(a0_y128);
+ a1_y0 = vget_low_u8(a1_y128);
+ a1_y1 = vget_high_u8(a1_y128);
+ } else {
+ // Indices are too spread out for one window: fall back to the gather
+ // helper (defined outside this chunk).
+ a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
+ a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
+ a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
+ a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
+ }
+#else
+ // Values in left_idx{0,1} range from 0 through 63 inclusive.
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
+ // Keep only the low byte of each 16-bit index, giving 16 byte indices.
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+ // The same indices are used on both tables; the caller builds left_vals1 one
+ // byte further along the left edge than left_vals0.
+ uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
+ uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
+
+ uint8x8_t a0_y0 = vget_low_u8(a0_y01);
+ uint8x8_t a0_y1 = vget_high_u8(a0_y01);
+ uint8x8_t a1_y0 = vget_low_u8(a1_y01);
+ uint8x8_t a1_y1 = vget_high_u8(a1_y01);
+#endif // !AOM_ARCH_AARCH64
+
+ // Weight is bits 1..5 of y_c.
+ uint16x8_t shifty_lo = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shifty_hi = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
+
+ // a[x+1] - a[x]
+ uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
+ uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
+ // a[x] * 32 + 16
+ uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
+ uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
+
+ uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
+ uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
+
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+// Zone-2 prediction for a block that is 4 pixels wide and N rows tall.
+// Each row is taken from the above edge, from the left edge, or from a blend
+// of both, depending on how many leading columns project left of min_base_x.
+//   N                  : number of rows to write.
+//   dst, stride        : output pointer and row pitch in bytes.
+//   above, left        : edge sample pointers.
+//   upsample_above/left: 0 or 1; sets the fractional precision of each edge.
+//   dx, dy             : horizontal / vertical step of the prediction angle.
+static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 14 directly to avoid over-read.
+ // Table index k holds left[k - 2] for k in [0, 17]. This reads
+ // left[-2 .. 15] regardless of N; assumes the caller's buffer covers that
+ // range -- TODO confirm.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
+ const uint8x16x2_t left_vals = { { left_m2, left_14 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ // Count of leading columns that must come from the left edge.
+ const int base_min_diff =
+ (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
+ upsample_above;
+
+ if (base_min_diff <= 0) {
+ // Whole row comes from the above edge.
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint8x8_t a0_x = a0_x_u8;
+ uint8x8_t a1_x = a1_x_u8;
+
+ uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res =
+ vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ // NOTE(review): 32-bit store through a cast of dst; dst's alignment is not
+ // visible from here.
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
+ } else if (base_min_diff < 4) {
+ // Mixed row: compute the above result in lanes 0-3 and the left result in
+ // lanes 4-7 of one vector, then blend per pixel.
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint16x8_t a0_x = vmovl_u8(a0_x_u8);
+ uint16x8_t a1_x = vmovl_u8(a1_x_u8);
+
+ uint16x4_t a0_y;
+ uint16x4_t a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
+ a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
+
+ uint16x8_t shift = vcombine_u16(shift0, shift1);
+ uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ // Move the left-edge result (bytes 4-7) down to bytes 0-3.
+ uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
+
+ // BaseMask is defined outside this chunk; presumably row k has its first k
+ // bytes set so those pixels take the left-edge value.
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
+ } else {
+ // Whole row comes from the left edge.
+ uint16x4_t a0_y, a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x]
+ uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16
+ uint16x4_t res = vmla_u16(a32, diff, shift1);
+ uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+// Zone-2 prediction for a block that is 8 pixels wide and N rows tall.
+// Each row is taken from the above edge, from the left edge, or from a
+// per-pixel blend of both, depending on base_min_diff.
+//   N                  : number of rows to write.
+//   dst, stride        : output pointer and row pitch in bytes.
+//   above, left        : edge sample pointers.
+//   upsample_above/left: 0 or 1; sets the fractional precision of each edge.
+//   dx, dy             : horizontal / vertical step of the prediction angle.
+static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 30 directly to avoid over-read.
+ // Table index k holds left[k - 2] for k in [0, 33]. This reads
+ // left[-2 .. 31] regardless of N; assumes the caller's buffer covers that
+ // range -- TODO confirm.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
+ const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
+ const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ // Count of leading columns that must come from the left edge.
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+
+ if (base_min_diff <= 0) {
+ // Whole row comes from the above edge.
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ vst1_u8(dst, resx);
+ } else if (base_min_diff < 8) {
+ // Mixed row: compute both candidates and select per pixel.
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ // BaseMask is defined outside this chunk; presumably row k has its first k
+ // bytes set so those pixels take the left-edge value.
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t resxy = vbsl_u8(mask, resy, resx);
+ vst1_u8(dst, resxy);
+ } else {
+ // Whole row comes from the left edge.
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ vst1_u8(dst, resy);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+// Zone-2 prediction for blocks processed 16 columns at a time (the dispatcher
+// routes every width other than 4 and 8 here). No edge upsampling is applied.
+//   H, W        : block height and width; W is stepped in units of 16.
+//   dst, stride : output pointer and row pitch in bytes.
+//   above, left : edge sample pointers.
+//   dx, dy      : horizontal / vertical step of the prediction angle.
+static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+
+#if AOM_ARCH_AARCH64
+ // Two 64-byte lookup tables: left_vals0[k] = left[k - 1] and
+ // left_vals1[k] = left[k], so one index vector yields both a[y] and a[y+1].
+ // This reads left[-1 .. 63] regardless of H; assumes the caller's buffer
+ // covers that range -- TODO confirm.
+ const uint8x16_t left_m1 = vld1q_u8(left - 1);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_32 = vld1q_u8(left + 32);
+ const uint8x16_t left_48 = vld1q_u8(left + 48);
+ const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
+ const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
+ const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
+ const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
+ const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
+#define LEFT left_vals0, left_vals1
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < H; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> 6;
+ for (int j = 0; j < W; j += 16) {
+ // Count of leading columns in this 16-pixel group that must come from the
+ // left edge.
+ const int base_min_diff = min_base_x - base_x - j;
+
+ if (base_min_diff <= 0) {
+ // Whole group comes from the above edge.
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ vst1q_u8(dst + j, resx);
+ } else if (base_min_diff < 16) {
+ // Mixed group: compute both candidates and select per pixel. BaseMask is
+ // defined outside this chunk.
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
+ uint8x16_t resxy = vbslq_u8(mask, resy, resx);
+ vst1q_u8(dst + j, resxy);
+ } else {
+ // Whole group comes from the left edge.
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ vst1q_u8(dst + j, resy);
+ }
+ } // for j
+ dst += stride;
+ }
+#undef LEFT
+}
+
+// Directional prediction, zone 2 (90 < angle < 180).
+// Both edges are read. Widths 4 and 8 have dedicated kernels that honour the
+// upsample flags; every other width goes to the generic HxW kernel, which is
+// not passed the upsample flags. dx and dy are asserted to be positive.
+void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int upsample_left, int dx,
+                               int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  if (bw == 4) {
+    dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
+                              upsample_left, dx, dy);
+  } else if (bw == 8) {
+    dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
+                              upsample_left, dx, dy);
+  } else {
+    dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
+  }
+}
+
+/* ---------------------P R E D I C T I O N Z 3--------------------------- */
+
+// Interleaves four 16-byte vectors x[0..3], first at byte then at 16-bit
+// granularity, producing two vector pairs d[0] and d[1].
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
+                                                         uint8x16x2_t *d) {
+  const uint8x16x2_t zip01 = vzipq_u8(x[0], x[1]);
+  const uint8x16x2_t zip23 = vzipq_u8(x[2], x[3]);
+
+  const uint16x8_t lo01 = vreinterpretq_u16_u8(zip01.val[0]);
+  const uint16x8_t lo23 = vreinterpretq_u16_u8(zip23.val[0]);
+  const uint16x8_t hi01 = vreinterpretq_u16_u8(zip01.val[1]);
+  const uint16x8_t hi23 = vreinterpretq_u16_u8(zip23.val[1]);
+
+  d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(lo01, lo23));
+  d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(hi01, hi23));
+}
+
+// Interleaves the low halves of four 8-byte vectors x[0..3], first at byte
+// then at 16-bit granularity, and writes the resulting vector pair to *d.
+// Only the first result of each byte-level zip is used.
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
+                                                        uint8x8x2_t *d) {
+  const uint8x8x2_t zip01 = vzip_u8(x[0], x[1]);
+  const uint8x8x2_t zip23 = vzip_u8(x[2], x[3]);
+
+  const uint16x4_t lo01 = vreinterpret_u16_u8(zip01.val[0]);
+  const uint16x4_t lo23 = vreinterpret_u16_u8(zip23.val[0]);
+
+  *d = aom_reinterpret_u8_u16_x2(vzip_u16(lo01, lo23));
+}
+
+// Interleaves four 8-byte vectors x[0..3], first at byte then at 16-bit
+// granularity, producing two vector pairs d[0] and d[1].
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
+                                                        uint8x8x2_t *d) {
+  const uint8x8x2_t zip01 = vzip_u8(x[0], x[1]);
+  const uint8x8x2_t zip23 = vzip_u8(x[2], x[3]);
+
+  const uint16x4_t lo01 = vreinterpret_u16_u8(zip01.val[0]);
+  const uint16x4_t lo23 = vreinterpret_u16_u8(zip23.val[0]);
+  const uint16x4_t hi01 = vreinterpret_u16_u8(zip01.val[1]);
+  const uint16x4_t hi23 = vreinterpret_u16_u8(zip23.val[1]);
+
+  d[0] = aom_reinterpret_u8_u16_x2(vzip_u16(lo01, lo23));
+  d[1] = aom_reinterpret_u8_u16_x2(vzip_u16(hi01, hi23));
+}
+
+// Transposes one 16x16 byte tile between two strided buffers: loads 16 rows
+// pitchSrc bytes apart, runs the register transpose from transpose_neon.h, and
+// stores 16 rows pitchDst bytes apart.
+static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
+                                         uint8_t *dst, ptrdiff_t pitchDst) {
+  uint8x16_t rows_in[16];
+  uint8x16_t rows_out[16];
+
+  const uint8_t *src_row = src;
+  for (int k = 0; k < 16; ++k, src_row += pitchSrc) {
+    rows_in[k] = vld1q_u8(src_row);
+  }
+
+  transpose_arrays_u8_16x16(rows_in, rows_out);
+
+  uint8_t *dst_row = dst;
+  for (int k = 0; k < 16; ++k, dst_row += pitchDst) {
+    vst1q_u8(dst_row, rows_out[k]);
+  }
+}
+
+// Transposes a buffer in 16x16 tiles. The tile at source row i, column j is
+// written to destination row j, column i; i steps over [0, width) and j over
+// [0, height), both in units of 16.
+static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
+                                           ptrdiff_t pitchSrc, uint8_t *dst,
+                                           ptrdiff_t pitchDst, int width,
+                                           int height) {
+  for (int out_row = 0; out_row < height; out_row += 16) {
+    const uint8_t *src_col = src + out_row;
+    uint8_t *dst_row = dst + out_row * pitchDst;
+    for (int out_col = 0; out_col < width; out_col += 16) {
+      z3_transpose_arrays_u8_16x16(src_col + out_col * pitchSrc, pitchSrc,
+                                   dst_row + out_col, pitchDst);
+    }
+  }
+}
+
+// Zone-3 4x4: runs the shared z1 helper on the left edge into four vectors,
+// transposes them, and stores the result starting at rows 0 and 2.
+static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t pred[4];
+  uint8x8x2_t transposed;
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 4, pred, left, upsample_left, dy);
+  z3_transpose_arrays_u8_4x4(pred, &transposed);
+
+  uint8_t *const rows01 = dst + stride * 0;
+  uint8_t *const rows23 = dst + stride * 2;
+  store_u8x4_strided_x2(rows01, stride, transposed.val[0]);
+  store_u8x4_strided_x2(rows23, stride, transposed.val[1]);
+}
+
+// Zone-3 8x8: runs the shared z1 helper on the left edge into eight vectors,
+// transposes them, and stores all eight rows.
+static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t pred[8];
+  uint8x8_t out[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 8, pred, left, upsample_left, dy);
+  transpose_arrays_u8_8x8(pred, out);
+  store_u8_8x8(dst, stride, out[0], out[1], out[2], out[3], out[4], out[5],
+               out[6], out[7]);
+}
+
+// Zone-3 4x8: runs the shared z1 helper (8, 4) on the left edge, transposes
+// the four vectors, and stores the result starting at rows 0, 2, 4 and 6.
+static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t pred[4];
+  uint8x8x2_t out[2];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 4, pred, left, upsample_left, dy);
+  z3_transpose_arrays_u8_8x4(pred, out);
+
+  for (int k = 0; k < 2; ++k) {
+    store_u8x4_strided_x2(dst + stride * (4 * k), stride, out[k].val[0]);
+    store_u8x4_strided_x2(dst + stride * (4 * k + 2), stride, out[k].val[1]);
+  }
+}
+
+// Zone-3 8x4: runs the shared z1 helper (4, 8) on the left edge, applies the
+// 8x8 transpose, and stores only the first four output vectors.
+static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  uint8x8_t pred[8];
+  uint8x8_t out[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 8, pred, left, upsample_left, dy);
+  transpose_arrays_u8_8x8(pred, out);
+  store_u8_8x4(dst, stride, out[0], out[1], out[2], out[3]);
+}
+
+// Zone-3 8x16: runs the shared z1 helper (16, 8) on the left edge, transposes
+// the eight 16-byte vectors into sixteen 8-byte rows, and stores them.
+static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t pred[8];
+  uint8x8_t out[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 8, pred, left, upsample_left, dy);
+  transpose_arrays_u8_16x8(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 16; ++k, row += stride) {
+    vst1_u8(row, out[k]);
+  }
+}
+
+// Zone-3 16x8: runs the shared z1 helper (8, 16) on the left edge, transposes
+// the sixteen 8-byte vectors into eight 16-byte rows, and stores them.
+static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t pred[16];
+  uint8x16_t out[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 16, pred, left, upsample_left, dy);
+  transpose_arrays_u8_8x16(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 8; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+  }
+}
+
+// Zone-3 4x16: runs the shared z1 helper (16, 4) on the left edge, transposes
+// the four 16-byte vectors, and stores the result starting at rows 0, 4, 8
+// and 12.
+static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x16_t pred[4];
+  uint8x16x2_t out[2];
+
+  dr_prediction_z1_HxW_internal_neon(16, 4, pred, left, upsample_left, dy);
+  z3_transpose_arrays_u8_16x4(pred, out);
+
+  for (int k = 0; k < 2; ++k) {
+    store_u8x4_strided_x4(dst + stride * (8 * k), stride, out[k].val[0]);
+    store_u8x4_strided_x4(dst + stride * (8 * k + 4), stride, out[k].val[1]);
+  }
+}
+
+// Zone-3 16x4: runs the shared z1 helper (4, 16) on the left edge, applies the
+// 8x16 transpose, and stores only the first four 16-byte rows.
+static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t pred[16];
+  uint8x16_t out[8];
+
+  dr_prediction_z1_HxW_internal_neon_64(4, 16, pred, left, upsample_left, dy);
+  transpose_arrays_u8_8x16(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 4; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+  }
+}
+
+// Zone-3 8x32: runs the 32-wide z1 helper for 8 lines on the left edge, zeroes
+// entries 8..15 so the 32x16 transpose has a full input, then stores the low
+// 8 bytes of each of the 32 output rows. upsample_left is unused.
+static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  (void)upsample_left;
+  uint8x16x2_t pred[16];
+  uint8x16_t out[32];
+  const uint8x16_t zeros = vdupq_n_u8(0);
+
+  dr_prediction_z1_32xN_internal_neon(8, pred, left, dy);
+  for (int k = 8; k < 16; ++k) {
+    pred[k].val[0] = zeros;
+    pred[k].val[1] = zeros;
+  }
+  transpose_arrays_u8_32x16(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 32; ++k, row += stride) {
+    vst1_u8(row, vget_low_u8(out[k]));
+  }
+}
+
+// Zone-3 32x8: runs the shared z1 helper (8, 32) on the left edge, transposes
+// the two groups of sixteen 8-byte vectors separately, and stores each row as
+// two adjacent 16-byte halves.
+static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  uint8x8_t pred[32];
+  uint8x16_t out[16];
+
+  dr_prediction_z1_HxW_internal_neon_64(8, 32, pred, left, upsample_left, dy);
+  transpose_arrays_u8_8x16(pred, out);
+  transpose_arrays_u8_8x16(pred + 16, out + 8);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 8; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+    vst1q_u8(row + 16, out[k + 8]);
+  }
+}
+
+// Zone-3 16x16: runs the shared z1 helper (16, 16) on the left edge,
+// transposes the sixteen 16-byte vectors, and stores the sixteen rows.
+static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t pred[16];
+  uint8x16_t out[16];
+
+  dr_prediction_z1_HxW_internal_neon(16, 16, pred, left, upsample_left, dy);
+  transpose_arrays_u8_16x16(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 16; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+  }
+}
+
+// Zone-3 32x32: runs the 32-wide z1 helper for 32 lines on the left edge,
+// transposes the two groups of 16 entries separately, and stores each row as
+// two adjacent 16-byte halves. upsample_left is unused.
+static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  uint8x16x2_t pred[32];
+  uint8x16_t out[64];
+
+  dr_prediction_z1_32xN_internal_neon(32, pred, left, dy);
+  transpose_arrays_u8_32x16(pred, out);
+  transpose_arrays_u8_32x16(pred + 16, out + 32);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 32; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+    vst1q_u8(row + 16, out[k + 32]);
+  }
+}
+
+// Zone-3 64x64: runs the 64-wide z1 kernel on the left edge into a 64x64
+// scratch buffer (pitch 64), then transposes it into dst in 16x16 tiles.
+// upsample_left is unused.
+static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  DECLARE_ALIGNED(16, uint8_t, scratch[64 * 64]);
+
+  dr_prediction_z1_64xN_neon(64, scratch, 64, left, dy);
+  z3_transpose_arrays_u8_16nx16n(scratch, 64, dst, stride, 64, 64);
+}
+
+// Zone-3 16x32: runs the 32-wide z1 helper for 16 lines on the left edge,
+// transposes into 32 vectors of 16 bytes, and stores them as rows 0..31 in
+// order. upsample_left is unused.
+static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  uint8x16x2_t pred[16];
+  uint8x16_t out[32];
+
+  dr_prediction_z1_32xN_internal_neon(16, pred, left, dy);
+  transpose_arrays_u8_32x16(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 32; ++k, row += stride) {
+    vst1q_u8(row, out[k]);
+  }
+}
+
+// Zone-3 32x16: runs the shared z1 helper (16, 32) on the left edge, then
+// handles the 32 vectors as two groups of 16: each group is transposed and
+// written to its own 16-column slice of the 16 output rows.
+static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t pred[32];
+
+  dr_prediction_z1_HxW_internal_neon(16, 32, pred, left, upsample_left, dy);
+  for (int col = 0; col < 32; col += 16) {
+    uint8x16_t tile[16];
+    transpose_arrays_u8_16x16(pred + col, tile);
+
+    uint8_t *out = dst + col;
+    for (int row = 0; row < 16; ++row, out += stride) {
+      vst1q_u8(out, tile[row]);
+    }
+  }
+}
+
+// Zone-3 32x64: runs the 64-wide z1 kernel for 32 lines on the left edge into
+// a scratch buffer (pitch 64), then transposes it into dst in 16x16 tiles.
+// upsample_left is unused.
+static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  uint8_t scratch[64 * 32];
+
+  dr_prediction_z1_64xN_neon(32, scratch, 64, left, dy);
+  z3_transpose_arrays_u8_16nx16n(scratch, 64, dst, stride, 32, 64);
+}
+
+// Zone-3 64x32: runs the 32-wide z1 kernel for 64 lines on the left edge into
+// a scratch buffer (pitch 32), then transposes it into dst in 16x16 tiles.
+// upsample_left is unused.
+static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  uint8_t scratch[32 * 64];
+
+  dr_prediction_z1_32xN_neon(64, scratch, 32, left, dy);
+  z3_transpose_arrays_u8_16nx16n(scratch, 32, dst, stride, 64, 32);
+}
+
+// Zone-3 16x64: runs the 64-wide z1 kernel for 16 lines on the left edge into
+// a scratch buffer (pitch 64), then transposes it into dst in 16x16 tiles.
+// upsample_left is unused.
+static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  (void)upsample_left;
+  uint8_t scratch[64 * 16];
+
+  dr_prediction_z1_64xN_neon(16, scratch, 64, left, dy);
+  z3_transpose_arrays_u8_16nx16n(scratch, 64, dst, stride, 16, 64);
+}
+
+// Zone-3 64x16: runs the shared z1 helper (16, 64) on the left edge, then
+// handles the 64 vectors as four groups of 16: each group is transposed and
+// written to its own 16-column slice of the 16 output rows.
+static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8x16_t pred[64];
+
+  dr_prediction_z1_HxW_internal_neon(16, 64, pred, left, upsample_left, dy);
+  for (int col = 0; col < 64; col += 16) {
+    uint8x16_t tile[16];
+    transpose_arrays_u8_16x16(pred + col, tile);
+
+    uint8_t *out = dst + col;
+    for (int row = 0; row < 16; ++row, out += stride) {
+      vst1q_u8(out, tile[row]);
+    }
+  }
+}
+
+// Signature shared by every size-specialised zone-3 kernel: output pointer and
+// pitch, the left edge, the left-edge upsample flag, and the vertical step.
+typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy);
+
+// Zone-3 kernel dispatch table, indexed as [get_msb(bw)][get_msb(bh)] by
+// av1_dr_prediction_z3_neon. Going by the kernel names, rows 2..6 are widths
+// 4..64 and columns 2..6 are heights 4..64. NULL marks width/height pairs with
+// no kernel. The table is only read, so it is declared const.
+static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+  { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
+    dr_prediction_z3_4x16_neon, NULL, NULL },
+  { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
+    dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
+  { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
+    dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
+    dr_prediction_z3_16x64_neon },
+  { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
+    dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
+  { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
+    dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
+};
+
+// Directional prediction, zone 3 entry point. Only the left edge is read.
+// The kernel is looked up in dr_prediction_z3_arr by get_msb(bw) and
+// get_msb(bh); above and dx are unused. Debug builds assert dx == 1, dy > 0
+// and that a kernel exists for the requested size.
+void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  const dr_prediction_z3_fn predict =
+      dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
+  assert(predict != NULL);
+  predict(dst, stride, left, upsample_left, dy);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// Negates every byte lane as a signed value, i.e. (256 - v) modulo 256 for
+// each unsigned lane v.
+static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
+  const int8x8_t as_signed = vreinterpret_s8_u8(v);
+  const int8x8_t negated = vneg_s8(as_signed);
+  return vreinterpret_u8_s8(negated);
+}
+
+// SMOOTH predictor for 4-pixel-wide blocks.
+//
+// dst/stride:   output block and its row pitch in bytes.
+// top_row:      at least 4 pixels above the block; top_row[3] is used as the
+//               top-right reference.
+// left_column:  `height` pixels to the left of the block;
+//               left_column[height - 1] is used as the bottom-left reference.
+// height:       number of rows to produce, must be > 0.
+//
+// Each output pixel is the rounded, scaled average of a vertical blend
+// (w_y * top + (256 - w_y) * bottom_left) and a horizontal blend
+// (w_x * left + (256 - w_x) * top_right), with weights read from
+// smooth_weights.
+static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *const top_row,
+                            const uint8_t *const left_column,
+                            const int height) {
+  const uint8_t top_right = top_row[3];
+  const uint8_t bottom_left = left_column[height - 1];
+  // Row weights for this height start at smooth_weights[height - 4]; the
+  // column weights for width 4 start at smooth_weights[0].
+  const uint8_t *const weights_y = smooth_weights + height - 4;
+
+  const uint8x8_t top_v = load_u8_4x1(top_row);
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  const uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
+  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+  // (256 - w_x) * top_right does not depend on the row.
+  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+  assert(height > 0);
+  int y = 0;
+  do {
+    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+    const uint16x8_t weighted_top_bl =
+        vmlal_u8(weighted_bl, weights_y_v, top_v);
+    const uint16x8_t weighted_left_tr =
+        vmlal_u8(weighted_tr, weights_x_v, left_v);
+    // Maximum value of each parameter: 0xFF00
+    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+    const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+
+    // Write the low four bytes with the same helper paeth_4or8_x_h_neon uses,
+    // rather than storing through a uint8_t pointer cast to uint32_t *.
+    store_u8_4x1(dst, result);
+    dst += stride;
+  } while (++y != height);
+}
+
+// Combines the vertical and horizontal weighted sums of a SMOOTH pixel:
+// halving add of the two 16-bit sums, then a rounding narrowing shift by
+// SMOOTH_WEIGHT_LOG2_SCALE.
+static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
+                                       const uint16x8_t weighted_left_tr) {
+  // Maximum value of each parameter: 0xFF00
+  return vrshrn_n_u16(vhaddq_u16(weighted_top_bl, weighted_left_tr),
+                      SMOOTH_WEIGHT_LOG2_SCALE);
+}
+
+// Computes eight SMOOTH pixels of one row.
+//   vertical sum   = weights_y * top + scaled_weights_y * bottom_left
+//   horizontal sum = weighted_tr + weights_x * left
+// The two sums are then merged by calculate_pred().
+static INLINE uint8x8_t calculate_weights_and_pred(
+    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+    const uint8x8_t bottom_left, const uint8x8_t weights_x,
+    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+  const uint16x8_t horizontal_sum = vmlal_u8(weighted_tr, weights_x, left);
+  const uint16x8_t vertical_sum =
+      vmlal_u8(vmull_u8(weights_y, top), scaled_weights_y, bottom_left);
+  return calculate_pred(vertical_sum, horizontal_sum);
+}
+
+// SMOOTH predictor for 8-pixel-wide blocks of `height` rows (height > 0).
+// top_row[7] is used as the top-right reference and left_column[height - 1]
+// as the bottom-left reference.
+static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *const top_row,
+                            const uint8_t *const left_column,
+                            const int height) {
+  // Row weights for this height start at smooth_weights[height - 4]; the
+  // eight column weights start at smooth_weights[4].
+  const uint8_t *const row_weights = smooth_weights + height - 4;
+  const uint8x8_t col_weights = vld1_u8(smooth_weights + 4);
+
+  const uint8x8_t top = vld1_u8(top_row);
+  const uint8x8_t top_right = vdup_n_u8(top_row[7]);
+  const uint8x8_t bottom_left = vdup_n_u8(left_column[height - 1]);
+
+  // (256 - w_x) * top_right is identical for every row.
+  const uint16x8_t weighted_tr = vmull_u8(negate_s8(col_weights), top_right);
+
+  assert(height > 0);
+  int row = 0;
+  do {
+    const uint8x8_t w_y = vdup_n_u8(row_weights[row]);
+    const uint8x8_t left = vdup_n_u8(left_column[row]);
+    const uint8x8_t pred =
+        calculate_weights_and_pred(top, left, weighted_tr, bottom_left,
+                                   col_weights, negate_s8(w_y), w_y);
+
+    vst1_u8(dst, pred);
+    dst += stride;
+  } while (++row != height);
+}
+
+// Defines the public aom_smooth_predictor_<W>x<H>_neon entry point, which
+// forwards to smooth_<W>xh_neon with the height fixed to H.
+#define SMOOTH_NXM(W, H) \
+  void aom_smooth_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 4-wide blocks.
+SMOOTH_NXM(4, 4)
+SMOOTH_NXM(4, 8)
+SMOOTH_NXM(4, 16)
+// 8-wide blocks.
+SMOOTH_NXM(8, 4)
+SMOOTH_NXM(8, 8)
+SMOOTH_NXM(8, 16)
+SMOOTH_NXM(8, 32)
+
+#undef SMOOTH_NXM
+
+// Computes sixteen SMOOTH pixels of one row, as two 8-lane halves.
+//   vertical sum   = weighted_bl + weights_y * top
+//   horizontal sum = weights_x * left + scaled_weights_x * top_right
+// Each half is merged by calculate_pred() and the halves are recombined.
+static INLINE uint8x16_t calculate_weights_and_predq(
+    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x8_t weights_y, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+  const uint16x8_t vertical_lo =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t vertical_hi =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+
+  const uint16x8_t horizontal_lo =
+      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
+               vget_low_u8(scaled_weights_x), top_right);
+  const uint16x8_t horizontal_hi =
+      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
+               vget_high_u8(scaled_weights_x), top_right);
+
+  return vcombine_u8(calculate_pred(vertical_lo, horizontal_lo),
+                     calculate_pred(vertical_hi, horizontal_hi));
+}
+
+// 128-bit variant of negate_s8: 256 - v = vnegq_s8(v), per lane, modulo 256.
+static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+// SMOOTH predictor for widths 16, 32 and 64. W selects how many 16-pixel
+// column groups are computed per row (1, 2 or 4); entries of the local
+// arrays beyond that count are neither written nor read.
+#define SMOOTH_PREDICTOR(W) \
+  static void smooth_##W##xh_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+      const uint8_t *const left_column, const int height) { \
+    const uint8_t *const weights_y = smooth_weights + height - 4; \
+    const uint8_t *const weights_x = smooth_weights + (W)-4; \
+    const uint8x8_t top_right_v = vdup_n_u8(top_row[(W)-1]); \
+    const uint8x8_t bottom_left_v = vdup_n_u8(left_column[height - 1]); \
+ \
+    uint8x16_t top_v[4]; \
+    uint8x16_t weights_x_v[4]; \
+    uint8x16_t scaled_weights_x[4]; \
+ \
+    top_v[0] = vld1q_u8(top_row); \
+    weights_x_v[0] = vld1q_u8(weights_x); \
+    scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \
+    if ((W) > 16) { \
+      top_v[1] = vld1q_u8(top_row + 16); \
+      weights_x_v[1] = vld1q_u8(weights_x + 16); \
+      scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \
+    } \
+    if ((W) == 64) { \
+      top_v[2] = vld1q_u8(top_row + 32); \
+      top_v[3] = vld1q_u8(top_row + 48); \
+      weights_x_v[2] = vld1q_u8(weights_x + 32); \
+      weights_x_v[3] = vld1q_u8(weights_x + 48); \
+      scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \
+      scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \
+    } \
+ \
+    for (int row = 0; row < height; ++row) { \
+      const uint8x8_t left_v = vdup_n_u8(left_column[row]); \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[row]); \
+      /* (256 - w_y) * bottom_left is shared by every column group. */ \
+      const uint16x8_t weighted_bl = \
+          vmull_u8(negate_s8(weights_y_v), bottom_left_v); \
+ \
+      vst1q_u8(dst, calculate_weights_and_predq( \
+                        top_v[0], left_v, top_right_v, weights_y_v, \
+                        weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
+      if ((W) > 16) { \
+        vst1q_u8(dst + 16, \
+                 calculate_weights_and_predq( \
+                     top_v[1], left_v, top_right_v, weights_y_v, \
+                     weights_x_v[1], scaled_weights_x[1], weighted_bl)); \
+      } \
+      if ((W) == 64) { \
+        vst1q_u8(dst + 32, \
+                 calculate_weights_and_predq( \
+                     top_v[2], left_v, top_right_v, weights_y_v, \
+                     weights_x_v[2], scaled_weights_x[2], weighted_bl)); \
+        vst1q_u8(dst + 48, \
+                 calculate_weights_and_predq( \
+                     top_v[3], left_v, top_right_v, weights_y_v, \
+                     weights_x_v[3], scaled_weights_x[3], weighted_bl)); \
+      } \
+ \
+      dst += stride; \
+    } \
+  }
+
+SMOOTH_PREDICTOR(16)
+SMOOTH_PREDICTOR(32)
+SMOOTH_PREDICTOR(64)
+
+#undef SMOOTH_PREDICTOR
+
+// Defines the public aom_smooth_predictor_<W>x<H>_neon entry point for the
+// 16/32/64-wide kernels; it forwards to smooth_<W>xh_neon with height H.
+#define SMOOTH_NXM_WIDE(W, H) \
+  void aom_smooth_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 16-wide blocks.
+SMOOTH_NXM_WIDE(16, 4)
+SMOOTH_NXM_WIDE(16, 8)
+SMOOTH_NXM_WIDE(16, 16)
+SMOOTH_NXM_WIDE(16, 32)
+SMOOTH_NXM_WIDE(16, 64)
+// 32-wide blocks.
+SMOOTH_NXM_WIDE(32, 8)
+SMOOTH_NXM_WIDE(32, 16)
+SMOOTH_NXM_WIDE(32, 32)
+SMOOTH_NXM_WIDE(32, 64)
+// 64-wide blocks.
+SMOOTH_NXM_WIDE(64, 16)
+SMOOTH_NXM_WIDE(64, 32)
+SMOOTH_NXM_WIDE(64, 64)
+
+#undef SMOOTH_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// SMOOTH_V predictor for widths 4 and 8.
+//
+// Each output row is w_y * top + (256 - w_y) * bottom_left, narrowed with a
+// rounding shift by SMOOTH_WEIGHT_LOG2_SCALE. Row weights for this height
+// start at smooth_weights[height - 4]; left_column[height - 1] is used as the
+// bottom-left reference. height must be > 0.
+#define SMOOTH_V_PREDICTOR(W) \
+  static void smooth_v_##W##xh_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+      const uint8_t *const left_column, const int height) { \
+    const uint8_t bottom_left = left_column[height - 1]; \
+    const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+    uint8x8_t top_v; \
+    if ((W) == 4) { \
+      top_v = load_u8_4x1(top_row); \
+    } else { /* width == 8 */ \
+      top_v = vld1_u8(top_row); \
+    } \
+ \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+    assert(height > 0); \
+    int y = 0; \
+    do { \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ \
+      const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
+      const uint16x8_t weighted_top_bl = \
+          vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
+      const uint8x8_t pred = \
+          vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+      if ((W) == 4) { \
+        /* Same 4-byte store helper as paeth_4or8_x_h_neon, instead of */ \
+        /* storing through a uint8_t pointer cast to uint32_t *. */ \
+        store_u8_4x1(dst, pred); \
+      } else { /* width == 8 */ \
+        vst1_u8(dst, pred); \
+      } \
+      dst += stride; \
+    } while (++y != height); \
+  }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+// Defines the public aom_smooth_v_predictor_<W>x<H>_neon entry point for the
+// 4- and 8-wide kernels; it forwards to smooth_v_<W>xh_neon with height H.
+#define SMOOTH_V_NXM(W, H) \
+  void aom_smooth_v_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 4-wide blocks.
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+// 8-wide blocks.
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+// Computes sixteen SMOOTH_V pixels of one row: weighted_bl + weights_y * top
+// for each 8-lane half, each narrowed with a rounding shift by
+// SMOOTH_WEIGHT_LOG2_SCALE.
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+    const uint8x16_t top, const uint8x8_t weights_y,
+    const uint16x8_t weighted_bl) {
+  const uint16x8_t sum_lo = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t sum_hi =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  return vcombine_u8(vrshrn_n_u16(sum_lo, SMOOTH_WEIGHT_LOG2_SCALE),
+                     vrshrn_n_u16(sum_hi, SMOOTH_WEIGHT_LOG2_SCALE));
+}
+
+// SMOOTH_V predictor for widths 16, 32 and 64. W selects how many 16-pixel
+// column groups are computed per row (1, 2 or 4); top_v entries beyond that
+// count are neither written nor read. height must be > 0.
+#define SMOOTH_V_PREDICTOR(W) \
+  static void smooth_v_##W##xh_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+      const uint8_t *const left_column, const int height) { \
+    const uint8_t *const weights_y = smooth_weights + height - 4; \
+    const uint8x8_t bottom_left_v = vdup_n_u8(left_column[height - 1]); \
+ \
+    uint8x16_t top_v[4]; \
+    top_v[0] = vld1q_u8(top_row); \
+    if ((W) > 16) { \
+      top_v[1] = vld1q_u8(top_row + 16); \
+    } \
+    if ((W) == 64) { \
+      top_v[2] = vld1q_u8(top_row + 32); \
+      top_v[3] = vld1q_u8(top_row + 48); \
+    } \
+ \
+    assert(height > 0); \
+    int row = 0; \
+    do { \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[row]); \
+      /* (256 - w_y) * bottom_left is shared by every column group. */ \
+      const uint16x8_t weighted_bl = \
+          vmull_u8(negate_s8(weights_y_v), bottom_left_v); \
+ \
+      vst1q_u8(dst, calculate_vertical_weights_and_pred( \
+                        top_v[0], weights_y_v, weighted_bl)); \
+      if ((W) > 16) { \
+        vst1q_u8(dst + 16, calculate_vertical_weights_and_pred( \
+                               top_v[1], weights_y_v, weighted_bl)); \
+      } \
+      if ((W) == 64) { \
+        vst1q_u8(dst + 32, calculate_vertical_weights_and_pred( \
+                               top_v[2], weights_y_v, weighted_bl)); \
+        vst1q_u8(dst + 48, calculate_vertical_weights_and_pred( \
+                               top_v[3], weights_y_v, weighted_bl)); \
+      } \
+ \
+      dst += stride; \
+    } while (++row != height); \
+  }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+// Defines the public aom_smooth_v_predictor_<W>x<H>_neon entry point for the
+// 16/32/64-wide kernels; it forwards to smooth_v_<W>xh_neon with height H.
+#define SMOOTH_V_NXM_WIDE(W, H) \
+  void aom_smooth_v_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 16-wide blocks.
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+// 32-wide blocks.
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+// 64-wide blocks.
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// SMOOTH_H predictor for widths 4 and 8.
+//
+// Each output row is w_x * left + (256 - w_x) * top_right, narrowed with a
+// rounding shift by SMOOTH_WEIGHT_LOG2_SCALE. Column weights for this width
+// start at smooth_weights[W - 4]; top_row[W - 1] is used as the top-right
+// reference. height must be > 0.
+#define SMOOTH_H_PREDICTOR(W) \
+  static void smooth_h_##W##xh_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+      const uint8_t *const left_column, const int height) { \
+    const uint8_t top_right = top_row[(W)-1]; \
+ \
+    const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+    /* Over-reads for 4xN but still within the array. */ \
+    const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \
+    const uint8x8_t scaled_weights_x = negate_s8(weights_x); \
+    /* (256 - w_x) * top_right does not depend on the row. */ \
+    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
+ \
+    assert(height > 0); \
+    int y = 0; \
+    do { \
+      const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+      const uint16x8_t weighted_left_tr = \
+          vmlal_u8(weighted_tr, weights_x, left_v); \
+      const uint8x8_t pred = \
+          vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+      if ((W) == 4) { \
+        /* Same 4-byte store helper as paeth_4or8_x_h_neon, instead of */ \
+        /* storing through a uint8_t pointer cast to uint32_t *. */ \
+        store_u8_4x1(dst, pred); \
+      } else { /* width == 8 */ \
+        vst1_u8(dst, pred); \
+      } \
+      dst += stride; \
+    } while (++y != height); \
+  }
+
+SMOOTH_H_PREDICTOR(4)
+SMOOTH_H_PREDICTOR(8)
+
+#undef SMOOTH_H_PREDICTOR
+
+// Defines the public aom_smooth_h_predictor_<W>x<H>_neon entry point for the
+// 4- and 8-wide kernels; it forwards to smooth_h_<W>xh_neon with height H.
+#define SMOOTH_H_NXM(W, H) \
+  void aom_smooth_h_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 4-wide blocks.
+SMOOTH_H_NXM(4, 4)
+SMOOTH_H_NXM(4, 8)
+SMOOTH_H_NXM(4, 16)
+// 8-wide blocks.
+SMOOTH_H_NXM(8, 4)
+SMOOTH_H_NXM(8, 8)
+SMOOTH_H_NXM(8, 16)
+SMOOTH_H_NXM(8, 32)
+
+#undef SMOOTH_H_NXM
+
+// Computes sixteen SMOOTH_H pixels of one row:
+// weights_x * left + scaled_weights_x * top_right for each 8-lane half, each
+// narrowed with a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE.
+static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
+    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x) {
+  const uint16x8_t sum_lo =
+      vmlal_u8(vmull_u8(vget_low_u8(weights_x), left),
+               vget_low_u8(scaled_weights_x), top_right);
+  const uint16x8_t sum_hi =
+      vmlal_u8(vmull_u8(vget_high_u8(weights_x), left),
+               vget_high_u8(scaled_weights_x), top_right);
+
+  return vcombine_u8(vrshrn_n_u16(sum_lo, SMOOTH_WEIGHT_LOG2_SCALE),
+                     vrshrn_n_u16(sum_hi, SMOOTH_WEIGHT_LOG2_SCALE));
+}
+
+// SMOOTH_H predictor for widths 16, 32 and 64. W selects how many 16-pixel
+// column groups are computed per row (1, 2 or 4); entries of the local
+// arrays beyond that count are neither written nor read. height must be > 0.
+#define SMOOTH_H_PREDICTOR(W) \
+  static void smooth_h_##W##xh_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+      const uint8_t *const left_column, const int height) { \
+    const uint8_t *const weights_base = smooth_weights + (W)-4; \
+    const uint8x8_t top_right_v = vdup_n_u8(top_row[(W)-1]); \
+ \
+    uint8x16_t weights_x[4]; \
+    uint8x16_t scaled_weights_x[4]; \
+ \
+    weights_x[0] = vld1q_u8(weights_base); \
+    scaled_weights_x[0] = negate_s8q(weights_x[0]); \
+    if ((W) > 16) { \
+      weights_x[1] = vld1q_u8(weights_base + 16); \
+      scaled_weights_x[1] = negate_s8q(weights_x[1]); \
+    } \
+    if ((W) == 64) { \
+      weights_x[2] = vld1q_u8(weights_base + 32); \
+      weights_x[3] = vld1q_u8(weights_base + 48); \
+      scaled_weights_x[2] = negate_s8q(weights_x[2]); \
+      scaled_weights_x[3] = negate_s8q(weights_x[3]); \
+    } \
+ \
+    assert(height > 0); \
+    int row = 0; \
+    do { \
+      const uint8x8_t left_v = vdup_n_u8(left_column[row]); \
+ \
+      vst1q_u8(dst, calculate_horizontal_weights_and_pred( \
+                        left_v, top_right_v, weights_x[0], \
+                        scaled_weights_x[0])); \
+      if ((W) > 16) { \
+        vst1q_u8(dst + 16, calculate_horizontal_weights_and_pred( \
+                               left_v, top_right_v, weights_x[1], \
+                               scaled_weights_x[1])); \
+      } \
+      if ((W) == 64) { \
+        vst1q_u8(dst + 32, calculate_horizontal_weights_and_pred( \
+                               left_v, top_right_v, weights_x[2], \
+                               scaled_weights_x[2])); \
+        vst1q_u8(dst + 48, calculate_horizontal_weights_and_pred( \
+                               left_v, top_right_v, weights_x[3], \
+                               scaled_weights_x[3])); \
+      } \
+      dst += stride; \
+    } while (++row != height); \
+  }
+
+SMOOTH_H_PREDICTOR(16)
+SMOOTH_H_PREDICTOR(32)
+SMOOTH_H_PREDICTOR(64)
+
+#undef SMOOTH_H_PREDICTOR
+
+// Defines the public aom_smooth_h_predictor_<W>x<H>_neon entry point for the
+// 16/32/64-wide kernels; it forwards to smooth_h_<W>xh_neon with height H.
+#define SMOOTH_H_NXM_WIDE(W, H) \
+  void aom_smooth_h_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+  }
+
+// 16-wide blocks.
+SMOOTH_H_NXM_WIDE(16, 4)
+SMOOTH_H_NXM_WIDE(16, 8)
+SMOOTH_H_NXM_WIDE(16, 16)
+SMOOTH_H_NXM_WIDE(16, 32)
+SMOOTH_H_NXM_WIDE(16, 64)
+// 32-wide blocks.
+SMOOTH_H_NXM_WIDE(32, 8)
+SMOOTH_H_NXM_WIDE(32, 16)
+SMOOTH_H_NXM_WIDE(32, 32)
+SMOOTH_H_NXM_WIDE(32, 64)
+// 64-wide blocks.
+SMOOTH_H_NXM_WIDE(64, 16)
+SMOOTH_H_NXM_WIDE(64, 32)
+SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
+// PAETH predictor for widths 4 and 8. For every pixel the candidate (left,
+// top or top-left) closest to top + left - top_left is chosen, with ties
+// resolved in that order. top_row[-1] is read as the top-left pixel and
+// height must be > 0.
+static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+                                       const uint8_t *const top_row,
+                                       const uint8_t *const left_column,
+                                       int width, int height) {
+  const uint8_t corner = top_row[-1];
+  const uint8x8_t corner_v = vdup_n_u8(corner);
+  const uint16x8_t corner_x2 = vdupq_n_u16(corner + corner);
+
+  uint8x8_t above;
+  if (width == 4) {
+    above = load_u8_4x1(top_row);
+  } else {  // width == 8
+    above = vld1_u8(top_row);
+  }
+
+  assert(height > 0);
+  int row = 0;
+  do {
+    const uint8x8_t left_px = vdup_n_u8(left_column[row]);
+
+    // Distance of each candidate from base = top + left - top_left:
+    //   |base - left| = |top - top_left|, |base - top| = |left - top_left|,
+    //   |base - top_left| = |top + left - 2 * top_left|.
+    const uint8x8_t dist_left = vabd_u8(above, corner_v);
+    const uint8x8_t dist_top = vabd_u8(left_px, corner_v);
+    const uint16x8_t dist_corner =
+        vabdq_u16(vaddl_u8(above, left_px), corner_x2);
+
+    const uint8x8_t left_le_top = vcle_u8(dist_left, dist_top);
+    const uint8x8_t left_le_corner =
+        vmovn_u16(vcleq_u16(vmovl_u8(dist_left), dist_corner));
+    const uint8x8_t top_le_corner =
+        vmovn_u16(vcleq_u16(vmovl_u8(dist_top), dist_corner));
+
+    // Lanes where left wins; every other lane provisionally takes top.
+    const uint8x8_t pick_left = vand_u8(left_le_top, left_le_corner);
+    const uint8x8_t left_or_top = vbsl_u8(pick_left, left_px, above);
+
+    // Lanes where neither left nor top won are replaced by top_left.
+    const uint8x8_t keep_mask = vorr_u8(pick_left, top_le_corner);
+    const uint8x8_t pred = vbsl_u8(keep_mask, left_or_top, corner_v);
+
+    if (width == 4) {
+      store_u8_4x1(dest, pred);
+    } else {  // width == 8
+      vst1_u8(dest, pred);
+    }
+    dest += stride;
+  } while (++row != height);
+}
+
+// Defines the public aom_paeth_predictor_<W>x<H>_neon entry point for the
+// 4- and 8-wide sizes; it forwards to paeth_4or8_x_h_neon with W and H.
+#define PAETH_NXM(W, H) \
+  void aom_paeth_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+// 4-wide blocks.
+PAETH_NXM(4, 4)
+PAETH_NXM(4, 8)
+PAETH_NXM(4, 16)
+// 8-wide blocks.
+PAETH_NXM(8, 4)
+PAETH_NXM(8, 8)
+PAETH_NXM(8, 16)
+PAETH_NXM(8, 32)
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x16_t. The 16-bit TopLeft distances are saturated to 8 bits first.
+static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Picks, per lane, the PAETH candidate selected by the three comparison
+// masks:
+//   left      where left_le_top && left_le_top_left,
+//   top       else where top_le_top_left,
+//   top_left  otherwise.
+static INLINE uint8x16_t select_paeth(const uint8x16_t top,
+                                      const uint8x16_t left,
+                                      const uint8x16_t top_left,
+                                      const uint8x16_t left_le_top,
+                                      const uint8x16_t left_le_top_left,
+                                      const uint8x16_t top_le_top_left) {
+  const uint8x16_t pick_left = vandq_u8(left_le_top, left_le_top_left);
+  // Lanes not taking left provisionally take top; the ones that should be
+  // top_left are overwritten by the second select.
+  const uint8x16_t left_or_top = vbslq_u8(pick_left, left, top);
+  const uint8x16_t keep_mask = vorrq_u8(pick_left, top_le_top_left);
+  return vbslq_u8(keep_mask, left_or_top, top_left);
+}
+
+// Helper macros for paeth16_plus_x_h_neon. Each one expands to declarations
+// that refer to that function's locals by name (top[], left, top_dist,
+// top_left_x2, left_<num>_dist) and is invoked with a trailing semicolon.
+
+// Declares top_left_<num>_dist_low/high: |top + left - top_left_x2| for the
+// low and high eight lanes of top[num], kept at 16 bits. vget_low_u8(left)
+// feeds both halves; the caller builds `left` with vdupq_n_u8, so its two
+// halves are identical.
+#define TOP_LEFT_DIST(num) \
+  const uint16x8_t top_left_##num##_dist_low = \
+      vabdq_u16(vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), \
+                top_left_x2); \
+  const uint16x8_t top_left_##num##_dist_high = \
+      vabdq_u16(vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), \
+                top_left_x2)
+
+// Declares left_le_top_left_<num>: mask of lanes where the left distance is
+// <= the top-left distance (x_le_top_left with x = left).
+#define LEFT_LE_TOP_LEFT(num) \
+  const uint8x16_t left_le_top_left_##num = x_le_top_left( \
+      left_##num##_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+// Declares top_le_top_left_<num>: mask of lanes where the top distance is
+// <= the top-left distance (x_le_top_left with x = top).
+#define TOP_LE_TOP_LEFT(num) \
+  const uint8x16_t top_le_top_left_##num = x_le_top_left( \
+      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+// PAETH predictor for widths 16, 32 and 64, processed as 1, 2 or 4 groups of
+// sixteen pixels per row. top_row[-1] is read as the top-left pixel and
+// height must be > 0.
+//
+// The TOP_LEFT_DIST / LEFT_LE_TOP_LEFT / TOP_LE_TOP_LEFT macros expand to
+// declarations that reference the locals top, left, top_dist, top_left_x2
+// and left_<n>_dist by name, so those identifiers must keep their spelling.
+static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+                                         const uint8_t *const top_row,
+                                         const uint8_t *const left_column,
+                                         int width, int height) {
+  const uint8_t corner = top_row[-1];
+  const uint8x16_t corner_v = vdupq_n_u8(corner);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(corner + corner);
+
+  // Entries beyond the groups needed for `width` stay unset and unread.
+  uint8x16_t top[4];
+  top[0] = vld1q_u8(top_row);
+  if (width > 16) {
+    top[1] = vld1q_u8(top_row + 16);
+  }
+  if (width == 64) {
+    top[2] = vld1q_u8(top_row + 32);
+    top[3] = vld1q_u8(top_row + 48);
+  }
+
+  assert(height > 0);
+  int row = 0;
+  do {
+    const uint8x16_t left = vdupq_n_u8(left_column[row]);
+    // Distance of the `top` candidate; identical for every column group.
+    const uint8x16_t top_dist = vabdq_u8(left, corner_v);
+
+    // Columns 0..15.
+    const uint8x16_t left_0_dist = vabdq_u8(top[0], corner_v);
+    TOP_LEFT_DIST(0);
+    const uint8x16_t left_le_top_0 = vcleq_u8(left_0_dist, top_dist);
+    LEFT_LE_TOP_LEFT(0);
+    TOP_LE_TOP_LEFT(0);
+    vst1q_u8(dest, select_paeth(top[0], left, corner_v, left_le_top_0,
+                                left_le_top_left_0, top_le_top_left_0));
+
+    if (width > 16) {
+      // Columns 16..31.
+      const uint8x16_t left_1_dist = vabdq_u8(top[1], corner_v);
+      TOP_LEFT_DIST(1);
+      const uint8x16_t left_le_top_1 = vcleq_u8(left_1_dist, top_dist);
+      LEFT_LE_TOP_LEFT(1);
+      TOP_LE_TOP_LEFT(1);
+      vst1q_u8(dest + 16,
+               select_paeth(top[1], left, corner_v, left_le_top_1,
+                            left_le_top_left_1, top_le_top_left_1));
+    }
+
+    if (width == 64) {
+      // Columns 32..47.
+      const uint8x16_t left_2_dist = vabdq_u8(top[2], corner_v);
+      TOP_LEFT_DIST(2);
+      const uint8x16_t left_le_top_2 = vcleq_u8(left_2_dist, top_dist);
+      LEFT_LE_TOP_LEFT(2);
+      TOP_LE_TOP_LEFT(2);
+      vst1q_u8(dest + 32,
+               select_paeth(top[2], left, corner_v, left_le_top_2,
+                            left_le_top_left_2, top_le_top_left_2));
+
+      // Columns 48..63.
+      const uint8x16_t left_3_dist = vabdq_u8(top[3], corner_v);
+      TOP_LEFT_DIST(3);
+      const uint8x16_t left_le_top_3 = vcleq_u8(left_3_dist, top_dist);
+      LEFT_LE_TOP_LEFT(3);
+      TOP_LE_TOP_LEFT(3);
+      vst1q_u8(dest + 48,
+               select_paeth(top[3], left, corner_v, left_le_top_3,
+                            left_le_top_left_3, top_le_top_left_3));
+    }
+
+    dest += stride;
+  } while (++row != height);
+}
+
+// Defines the public aom_paeth_predictor_<W>x<H>_neon entry point for the
+// 16/32/64-wide sizes; it forwards to paeth16_plus_x_h_neon with W and H.
+#define PAETH_NXM_WIDE(W, H) \
+  void aom_paeth_predictor_##W##x##H##_neon( \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+      const uint8_t *left) { \
+    paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+  }
+
+// 16-wide blocks.
+PAETH_NXM_WIDE(16, 4)
+PAETH_NXM_WIDE(16, 8)
+PAETH_NXM_WIDE(16, 16)
+PAETH_NXM_WIDE(16, 32)
+PAETH_NXM_WIDE(16, 64)
+// 32-wide blocks.
+PAETH_NXM_WIDE(32, 8)
+PAETH_NXM_WIDE(32, 16)
+PAETH_NXM_WIDE(32, 32)
+PAETH_NXM_WIDE(32, 64)
+// 64-wide blocks.
+PAETH_NXM_WIDE(64, 16)
+PAETH_NXM_WIDE(64, 32)
+PAETH_NXM_WIDE(64, 64)
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..7c64be1253
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+// Builds the loop-filter mask from four tap vectors. Going by the pNqN
+// naming, each vector presumably packs four p samples in its low 32 bits and
+// four q samples in its high 32 bits. A lane is set when
+//   - every neighbouring-tap difference (p3q3/p2q2, p2q2/p1q1, p1q1/p0q0) is
+//     <= limit in both 32-bit halves, and
+//   - 2 * |lo(p0q0) - hi(p0q0)| + |lo(p1q1) - hi(p1q1)| / 2 <= blimit.
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+                                 uint8x8_t p0q0, const uint8_t blimit,
+                                 const uint8_t limit) {
+  // Largest step between neighbouring taps, compared against `limit`.
+  uint8x8_t max_step = vabd_u8(p3q3, p2q2);
+  max_step = vmax_u8(max_step, vabd_u8(p2q2, p1q1));
+  max_step = vmax_u8(max_step, vabd_u8(p1q1, p0q0));
+  const uint8x8_t within_limit = vcle_u8(max_step, vdup_n_u8(limit));
+
+  // AND each 32-bit half with the other so both sides must pass.
+  const uint8x8_t within_limit_swapped =
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(within_limit)));
+  const uint8x8_t limit_mask = vand_u8(within_limit, within_limit_swapped);
+
+  // val[0] = { lo(p0q0), lo(p1q1) }, val[1] = { hi(p0q0), hi(p1q1) }.
+  const uint32x2x2_t halves =
+      vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  const uint16x8_t cross_diff =
+      vmovl_u8(vabd_u8(vreinterpret_u8_u32(halves.val[0]),
+                       vreinterpret_u8_u32(halves.val[1])));
+
+  const uint16x4_t edge_sum = vadd_u16(vshl_n_u16(vget_low_u16(cross_diff), 1),
+                                       vshr_n_u16(vget_high_u16(cross_diff), 1));
+  const uint16x4_t within_blimit =
+      vcle_u16(edge_sum, vdup_n_u16((uint16_t)blimit));
+  // Narrow the four results and replicate them into both 32-bit halves.
+  const uint8x8_t blimit_mask =
+      vmovn_u16(vcombine_u16(within_blimit, within_blimit));
+
+  return vand_u8(limit_mask, blimit_mask);
+}
+
+// Two-tap variant of the loop-filter mask. A lane is set when
+//   - |p1q1 - p0q0| <= limit in both 32-bit halves, and
+//   - 2 * |lo(p0q0) - hi(p0q0)| + |lo(p1q1) - hi(p1q1)| / 2 <= blimit.
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+                                  const uint8_t blimit, const uint8_t limit) {
+  const uint8x8_t within_limit =
+      vcle_u8(vabd_u8(p1q1, p0q0), vdup_n_u8(limit));
+
+  // AND each 32-bit half with the other so both sides must pass.
+  const uint8x8_t within_limit_swapped =
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(within_limit)));
+  const uint8x8_t limit_mask = vand_u8(within_limit, within_limit_swapped);
+
+  // val[0] = { lo(p0q0), lo(p1q1) }, val[1] = { hi(p0q0), hi(p1q1) }.
+  const uint32x2x2_t halves =
+      vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  const uint16x8_t cross_diff =
+      vmovl_u8(vabd_u8(vreinterpret_u8_u32(halves.val[0]),
+                       vreinterpret_u8_u32(halves.val[1])));
+
+  const uint16x4_t edge_sum = vadd_u16(vshl_n_u16(vget_low_u16(cross_diff), 1),
+                                       vshr_n_u16(vget_high_u16(cross_diff), 1));
+  const uint16x4_t within_blimit = vcle_u16(edge_sum, vdup_n_u16(blimit));
+  // Narrow the four results and replicate them into both 32-bit halves.
+  const uint8x8_t blimit_mask =
+      vmovn_u16(vcombine_u16(within_blimit, within_blimit));
+
+  return vand_u8(limit_mask, blimit_mask);
+}
+
+// Flatness mask over three outer taps: a lane is set when p1q1, p2q2 and
+// p3q3 all differ from p0q0 by at most 1, in both 32-bit halves.
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+                                       uint8x8_t p1q1, uint8x8_t p0q0) {
+  // for bd==8 threshold is always 1
+  const uint8x8_t flat_thresh = vdup_n_u8(1);
+
+  uint8x8_t max_dev = vabd_u8(p1q1, p0q0);
+  max_dev = vmax_u8(max_dev, vabd_u8(p2q2, p0q0));
+  max_dev = vmax_u8(max_dev, vabd_u8(p3q3, p0q0));
+  const uint8x8_t is_flat = vcle_u8(max_dev, flat_thresh);
+
+  // AND each 32-bit half with the other so both sides must be flat.
+  const uint8x8_t is_flat_swapped =
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(is_flat)));
+  return vand_u8(is_flat, is_flat_swapped);
+}
+
+// Flatness mask over two outer taps: a lane is set when p1q1 and p2q2 both
+// differ from p0q0 by at most 1, in both 32-bit halves.
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+                                       uint8x8_t p0q0) {
+  // for bd==8 threshold is always 1
+  const uint8x8_t flat_thresh = vdup_n_u8(1);
+
+  const uint8x8_t max_dev =
+      vmax_u8(vabd_u8(p1q1, p0q0), vabd_u8(p2q2, p0q0));
+  const uint8x8_t is_flat = vcle_u8(max_dev, flat_thresh);
+
+  // AND each 32-bit half with the other so both sides must be flat.
+  const uint8x8_t is_flat_swapped =
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(is_flat)));
+  return vand_u8(is_flat, is_flat_swapped);
+}
+
+// Three-tap variant of the loop-filter mask. A lane is set when
+//   - |p2q2 - p1q1| and |p1q1 - p0q0| are both <= limit in both 32-bit
+//     halves, and
+//   - 2 * |lo(p0q0) - hi(p0q0)| + |lo(p1q1) - hi(p1q1)| / 2 <= blimit.
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+                                         uint8x8_t p0q0, const uint8_t blimit,
+                                         const uint8_t limit) {
+  // Largest step between neighbouring taps, compared against `limit`.
+  const uint8x8_t max_step =
+      vmax_u8(vabd_u8(p2q2, p1q1), vabd_u8(p1q1, p0q0));
+  const uint8x8_t within_limit = vcle_u8(max_step, vdup_n_u8(limit));
+
+  // AND each 32-bit half with the other so both sides must pass.
+  const uint8x8_t within_limit_swapped =
+      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(within_limit)));
+  const uint8x8_t limit_mask = vand_u8(within_limit, within_limit_swapped);
+
+  // val[0] = { lo(p0q0), lo(p1q1) }, val[1] = { hi(p0q0), hi(p1q1) }.
+  const uint32x2x2_t halves =
+      vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  const uint16x8_t cross_diff =
+      vmovl_u8(vabd_u8(vreinterpret_u8_u32(halves.val[0]),
+                       vreinterpret_u8_u32(halves.val[1])));
+
+  const uint16x4_t edge_sum = vadd_u16(vshl_n_u16(vget_low_u16(cross_diff), 1),
+                                       vshr_n_u16(vget_high_u16(cross_diff), 1));
+  const uint16x4_t within_blimit =
+      vcle_u16(edge_sum, vdup_n_u16((uint16_t)blimit));
+  // Narrow the four results and replicate them into both 32-bit halves.
+  const uint8x8_t blimit_mask =
+      vmovn_u16(vcombine_u16(within_blimit, within_blimit));
+
+  return vand_u8(limit_mask, blimit_mask);
+}
+/* Widest loop filter core: filters four positions across one edge.
+ * Each vector packs four p-side samples (low 32 bits) together with the four
+ * q-side samples at the same distance from the edge (high 32 bits).
+ * Candidate results of the 4-, 8- and 14-sample filters are all computed and
+ * then selected per byte using the mask, flat and flat2 conditions.
+ * p0q0 through p5q5 are updated in place; p6q6 is only read.
+ */
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+ uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+ out_f14_pq5;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ // Calculate filter masks: mask gates all filtering, flat and flat2 enable the wider filters
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
+ {
+ // filter 4: adjusts only the two samples nearest the edge on each side
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+ // XOR with 0x80 maps the unsigned samples onto signed 8-bit values
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+ // vtrn of a vector with itself replicates each 32-bit half: ps* = p|p, qs* = q|q
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask: |p1 - p0| > thresh or |q1 - q0| > thresh, OR-ed into both halves
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps: filter += 3 * (q0 - p0), widened to 16 bits then saturated back
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+ // two roundings of the filter value: (f + 4) >> 3 for q0, (f + 3) >> 3 for p0
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+ // p1 and q1 get half of filter1 (rounded), and only where hev is not set
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+ // repack as p|q: upper half of op*, then lower half of oq*
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ // reverse p and q (swap the 32-bit halves) so the far side can be added lane-wise
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+ {
+ // filter 8: rounded 8-sample sums (>> 3) for the three samples nearest the edge
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+ // after the next add: out = p3 + p2 + p1 + p0 + q0 in the p half (mirrored in the q half)
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ // filter 14: rounded 16-sample sums (>> 4); starts from the out left by filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+ uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+ uint8x8_t qp_rev;
+
+ out = vaddw_u8(out, *p4q4);
+ out = vaddw_u8(out, *p5q5);
+ out = vaddw_u8(out, *p6q6);
+
+ out_pq5 = vaddw_u8(out, *p4q4);
+ out_pq4 = vaddw_u8(out_pq5, *p3q3);
+ out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+ out_pq5 = vaddw_u8(out_pq5, *p5q5);
+ out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+ out_pq0 = vaddw_u8(out, *p1q1);
+ out_pq1 = vaddw_u8(out_pq0, *p2q2);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+ out_pq0 = vaddw_u8(out_pq0, *p0q0);
+ out_pq1 = vaddw_u8(out_pq1, *p0q0);
+
+ out_pq1 = vaddw_u8(out_pq1, *p6q6);
+ p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+ out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+ p6q6_temp = vaddw_u8(p6q6_2, *p6q6);  // 3 * p6q6
+ out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+ p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);  // 4 * p6q6
+ out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+ p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);  // 6 * p6q6
+ out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+
+ out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+ qp_sum = vaddl_u8(q2p2, q1p1);
+ out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+ out_pq0 = vaddw_u8(out_pq0, q0p0);
+
+ out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+ out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+ out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+ out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+ out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+ out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+ filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+ // apply narrowest first so each wider filter overrides where its condition holds
+ // filter4 outputs (chosen wherever filter 8 is not selected)
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+ // filter14 outputs (only where filter 8 is selected and flat2 is also set)
+ *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+ *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+ *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+ *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+ }
+}
+/* 8-sample loop filter core: filters four positions across one edge.
+ * Each vector packs four p-side samples (low 32 bits) together with the four
+ * q-side samples at the same distance from the edge (high 32 bits).
+ * Both the 4-sample and the 8-sample candidates are computed and selected per
+ * byte with the mask and flat conditions. p0q0, p1q1 and p2q2 are updated in
+ * place; p3q3 is only read.
+ */
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks: mask gates all filtering, flat enables the 8-sample filter
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ {
+ // filter 4: adjusts only the two samples nearest the edge on each side
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+ // XOR with 0x80 maps the unsigned samples onto signed 8-bit values
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+ // vtrn of a vector with itself replicates each 32-bit half: ps* = p|p, qs* = q|q
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask: |p1 - p0| > thresh or |q1 - q0| > thresh, OR-ed into both halves
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps: filter += 3 * (q0 - p0), widened to 16 bits then saturated back
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+ // two roundings of the filter value: (f + 4) >> 3 for q0, (f + 3) >> 3 for p0
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+ // p1 and q1 get half of filter1 (rounded), and only where hev is not set
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+ // repack as p|q: upper half of op*, then lower half of oq*
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 8: rounded 8-sample sums (>> 3) for the three samples nearest the edge
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ // reverse p and q (swap the 32-bit halves) so the far side can be added lane-wise
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+ // after the next add: out = p3 + p2 + p1 + p0 + q0 in the p half (mirrored in the q half)
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+ // apply the narrow filter first so the 8-sample result overrides it where selected
+ // filter4 outputs (chosen wherever filter 8 is not selected)
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+ }
+}
+/* 6-sample loop filter core: filters four positions across one edge.
+ * Each vector packs four p-side samples (low 32 bits) together with the four
+ * q-side samples at the same distance from the edge (high 32 bits).
+ * Both the 4-sample and the 6-sample candidates are computed and selected per
+ * byte with the mask and flat conditions. p0q0 and p1q1 are updated in place;
+ * p2q2 is only read.
+ */
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+ const uint8_t blimit, const uint8_t limit,
+ const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f6_pq0, out_f6_pq1;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks: mask gates all filtering, flat enables the 6-sample filter
+ mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+ {
+ // filter 4: adjusts only the two samples nearest the edge on each side
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+ // XOR with 0x80 maps the unsigned samples onto signed 8-bit values
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+ // vtrn of a vector with itself replicates each 32-bit half: ps* = p|p, qs* = q|q
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask: |p1 - p0| > thresh or |q1 - q0| > thresh, OR-ed into both halves
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps: filter += 3 * (q0 - p0), widened to 16 bits then saturated back
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+ // two roundings of the filter value: (f + 4) >> 3 for q0, (f + 3) >> 3 for p0
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+ // p1 and q1 get half of filter1 (rounded); vbic clears it where hev is set
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+ // repack as p|q: upper half of op*, then lower half of oq*
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 6: rounded 8-term weighted sums (>> 3) for the two samples nearest the edge
+ uint16x8_t out_pq0, out_pq1;
+ uint8x8_t pq_rev;
+
+ out = vaddl_u8(*p0q0, *p1q1);
+ out = vaddq_u16(out, out);
+ out = vaddw_u8(out, *p2q2);
+
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ out = vaddw_u8(out, pq_rev);
+ // out is now p2 + 2 * p1 + 2 * p0 + q0 in the p half (mirrored in the q half)
+ out_pq0 = vaddw_u8(out, pq_rev);
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ out_pq0 = vaddw_u8(out_pq0, pq_rev);
+
+ out_pq1 = vaddw_u8(out, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p2q2);
+
+ out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter6_cond;
+ filter6_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter6_cond);
+ // apply the narrow filter first so the 6-sample result overrides it where selected
+ // filter4 outputs (chosen wherever filter 6 is not selected)
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter6 outputs
+ *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+ }
+}
+/* 4-sample loop filter core: filters four positions across one edge.
+ * Each vector packs four p-side samples (low 32 bits) together with the four
+ * q-side samples at the same distance from the edge (high 32 bits).
+ * Both p1q1 and p0q0 are overwritten with the filtered result; where the
+ * mask is clear the filter value is zero, so the samples come back unchanged.
+ */
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ // Calculate filter mask (gates the whole filter, see the vand with mask_8x8 below)
+ mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+ // XOR with 0x80 maps the unsigned samples onto signed 8-bit values
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+ // vtrn of a vector with itself replicates each 32-bit half: ps* = p|p, qs* = q|q
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask: |p1 - p0| > thresh or |q1 - q0| > thresh, OR-ed into both halves
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps: filter += 3 * (q0 - p0), widened to 16 bits then saturated back
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+ // two roundings of the filter value: (f + 4) >> 3 for q0, (f + 3) >> 3 for p0
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+ // p1 and q1 get half of filter1 (rounded); vbic clears it where hev is set
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+ // repack as p|q: upper half of op*, then lower half of oq*
+ *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+/* Filters a vertical edge over 4 rows with lpf_14_neon.
+ * Loads 16 bytes per row starting 8 bytes to the left of src, rearranges them
+ * into the pNqN vectors expected by the filter core, filters, then undoes the
+ * rearrangement and stores all 16 bytes of each row back.
+ * blimit, limit and thresh are dereferenced once each.
+ */
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x16_t row0, row1, row2, row3;
+ uint8x8_t pxp3, p6p2, p5p1, p4p0;
+ uint8x8_t q0q4, q1q5, q2q6, q3qy;
+ uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+ uint32x2_t pq_rev;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+
+ // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ load_u8_16x4(src - 8, stride, &row0, &row1, &row2, &row3);
+ // left 8 columns; after the transpose each vector presumably holds two columns of 4 rows (see names)
+ pxp3 = vget_low_u8(row0);
+ p6p2 = vget_low_u8(row1);
+ p5p1 = vget_low_u8(row2);
+ p4p0 = vget_low_u8(row3);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ // right 8 columns, handled the same way
+ q0q4 = vget_high_u8(row0);
+ q1q5 = vget_high_u8(row1);
+ q2q6 = vget_high_u8(row2);
+ q3qy = vget_high_u8(row3);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+ // pair each p column with its q counterpart: swap the q vector halves, then vtrn the 32-bit lanes
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+ pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+ // undo the pairing; pxqx_p3q3.val[0] still holds the outer x and y columns that were never filtered
+ pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+
+ pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+ p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+ p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+ p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+ q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+ p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+ p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+ // rebuild full 16-byte rows: left half from the p side, right half from the q side
+ row0 = vcombine_u8(pxp3, q0q4);
+ row1 = vcombine_u8(p6p2, q1q5);
+ row2 = vcombine_u8(p5p1, q2q6);
+ row3 = vcombine_u8(p4p0, q3qy);
+
+ store_u8_16x4(src - 8, stride, row0, row1, row2, row3);
+}
+
+// Runs the single vertical 14 filter twice: at s with the first parameter set,
+// then 4 rows further down with the second parameter set.
+void aom_lpf_vertical_14_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  uint8_t *const second_rows = s + 4 * pitch;
+
+  aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_14_neon(second_rows, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual vertical 14 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE rows further down.
+void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE * pitch;
+
+  aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                thresh);
+  aom_lpf_vertical_14_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                                blimit, limit, thresh);
+}
+/* Filters a vertical edge over 4 rows with lpf_8_neon.
+ * Loads 8 bytes per row starting 4 bytes to the left of src, rearranges them
+ * into the pNqN vectors expected by the filter core, filters, then undoes the
+ * rearrangement and stores all 8 bytes of each row back.
+ * blimit, limit and thresh are dereferenced once each.
+ */
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p3q0, p2q1, p1q2, p0q3;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+ load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+ // after the transpose each vector presumably holds one p column and one q column (see names)
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+ // regroup so that each p column sits next to the q column at the same distance from the edge
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+ // val[1] comes out as q|p, so swap its halves to get the p|q order the filter expects
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+ // inverse of the regrouping above, applied to the filtered vectors
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
+}
+
+// Runs the single vertical 8 filter twice: at s with the first parameter set,
+// then 4 rows further down with the second parameter set.
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8_t *const second_rows = s + 4 * pitch;
+
+  aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_8_neon(second_rows, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual vertical 8 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE rows further down.
+void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE * pitch;
+
+  aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_8_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                               blimit, limit, thresh);
+}
+/* Filters a vertical edge over 4 rows with lpf_6_neon.
+ * Loads 8 bytes per row starting 4 bytes to the left of src; the outermost
+ * column on each side (px and qy in the layout comment) is not passed to the
+ * filter and is stored back as loaded. The remaining columns are rearranged
+ * into pNqN vectors, filtered, rearranged back and stored.
+ * blimit, limit and thresh are dereferenced once each.
+ */
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t pxq0, p2q1, p1q2, p0qy;
+ uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+ // row0: px p2 p1 p0 | q0 q1 q2 qy
+ // row1: px p2 p1 p0 | q0 q1 q2 qy
+ // row2: px p2 p1 p0 | q0 q1 q2 qy
+ // row3: px p2 p1 p0 | q0 q1 q2 qy
+ load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+ // after the transpose each vector presumably holds one p column and one q column (see names)
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ // regroup so that each p column sits next to the q column at the same distance from the edge
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+ // val[1] comes out as q|p, so swap its halves to get the p|q order the filter expects
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+ // inverse of the regrouping above; pxqy re-enters unchanged
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+// Runs the single vertical 6 filter twice: at s with the first parameter set,
+// then 4 rows further down with the second parameter set.
+void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8_t *const second_rows = s + 4 * pitch;
+
+  aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_neon(second_rows, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual vertical 6 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE rows further down.
+void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE * pitch;
+
+  aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_6_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                               blimit, limit, thresh);
+}
+/* Filters a vertical edge over 4 rows with lpf_4_neon.
+ * Loads 4 bytes per row starting 2 bytes to the left of src, rearranges them
+ * into the p1q1 and p0q0 vectors expected by the filter core, filters, then
+ * undoes the rearrangement and stores the 4 bytes of each row back.
+ * blimit, limit and thresh are dereferenced once each.
+ */
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p1p0, q0q1;
+ uint8x8_t p0q0, p1q1;
+
+ // row0: p1 p0 | q0 q1
+ // row1: p1 p0 | q0 q1
+ // row2: p1 p0 | q0 q1
+ // row3: p1 p0 | q0 q1
+ load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
+ // after the transpose the two vectors presumably hold the columns named p1|p0 and q0|q1
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+
+ p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+ // second step of the regrouping: swap the halves of p0|q1 and vtrn again to get p1|q1 and p0|q0
+ pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+ p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+ p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+ p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+ // back to p1|p0 and q1|q0, then swap the q halves to restore the q0|q1 order
+ p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+ p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+ q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+ // the two stores start at row 0 and row 1 and each step two rows at a time
+ store_u8x4_strided_x2(src - 2, 2 * stride, p1p0);
+ store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1);
+}
+
+// Runs the single vertical 4 filter twice: at s with the first parameter set,
+// then 4 rows further down with the second parameter set.
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8_t *const second_rows = s + 4 * pitch;
+
+  aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_4_neon(second_rows, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual vertical 4 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE rows further down.
+void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE * pitch;
+
+  aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                               thresh);
+  aom_lpf_vertical_4_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                               blimit, limit, thresh);
+}
+
+// Filters a horizontal edge over 4 columns with lpf_14_neon.
+// pq[k] pairs the row (k + 1) rows above src with the row k rows below src,
+// i.e. pq[0] is p0q0 and pq[6] is p6q6. The filter core never changes p6q6,
+// so only pq[0] .. pq[5] are written back.
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t pq[7];
+  int k;
+
+  // Same load order as distance from the edge decreases: p6q6 first, p0q0 last.
+  for (k = 6; k >= 0; --k) {
+    pq[k] = load_u8_4x2(src - (k + 1) * stride, (2 * k + 1) * stride);
+  }
+
+  lpf_14_neon(&pq[6], &pq[5], &pq[4], &pq[3], &pq[2], &pq[1], &pq[0], *blimit,
+              *limit, *thresh);
+
+  // Write back starting with the rows nearest the edge.
+  for (k = 0; k < 6; ++k) {
+    store_u8x4_strided_x2(src - (k + 1) * stride, (2 * k + 1) * stride, pq[k]);
+  }
+}
+
+// Runs the single horizontal 14 filter twice: at s with the first parameter
+// set, then 4 columns to the right with the second parameter set.
+void aom_lpf_horizontal_14_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  uint8_t *const second_cols = s + 4;
+
+  aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_14_neon(second_cols, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual horizontal 14 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE columns to the right.
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better
+// speed up.
+void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE;
+
+  aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit,
+                                  limit, thresh);
+  aom_lpf_horizontal_14_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                                  blimit, limit, thresh);
+}
+
+// Filters a horizontal edge over 4 columns with lpf_8_neon.
+// Each pNqN vector pairs the 4 bytes of the row (N + 1) rows above src (low
+// half) with the 4 bytes of the row N rows below src (high half).
+//
+// The rows are accessed through the byte-based load_u8_4x2 /
+// store_u8x4_strided_x2 helpers, exactly as the horizontal 14 and 4 variants
+// do, instead of casting src to uint32_t *: src carries no 4-byte alignment
+// guarantee and the cast also breaks the C effective-type (aliasing) rules.
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+  uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  // lpf_8_neon only reads p3q3, so those two rows do not need to be rewritten.
+  store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+  store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+  store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2);
+}
+
+// Runs the single horizontal 8 filter twice: at s with the first parameter
+// set, then 4 columns to the right with the second parameter set.
+void aom_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  uint8_t *const second_cols = s + 4;
+
+  aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_8_neon(second_cols, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual horizontal 8 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE columns to the right.
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better
+// speed up.
+void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE;
+
+  aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_8_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
+
+// Filters a horizontal edge over 4 columns with lpf_6_neon.
+// Each pNqN vector pairs the 4 bytes of the row (N + 1) rows above src (low
+// half) with the 4 bytes of the row N rows below src (high half).
+//
+// The rows are accessed through the byte-based load_u8_4x2 /
+// store_u8x4_strided_x2 helpers, exactly as the horizontal 14 and 4 variants
+// do, instead of casting src to uint32_t *: src carries no 4-byte alignment
+// guarantee and the cast also breaks the C effective-type (aliasing) rules.
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+  uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+  // lpf_6_neon only reads p2q2, so those two rows do not need to be rewritten.
+  store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+  store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+}
+
+// Runs the single horizontal 6 filter twice: at s with the first parameter
+// set, then 4 columns to the right with the second parameter set.
+void aom_lpf_horizontal_6_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  uint8_t *const second_cols = s + 4;
+
+  aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_neon(second_cols, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual horizontal 6 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE columns to the right.
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better
+// speed up.
+void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE;
+
+  aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_6_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
+
+// Filters a horizontal edge over 4 columns with lpf_4_neon.
+// near_rows pairs the row just above src with the row at src (p0q0);
+// far_rows pairs the row two above src with the row one below src (p1q1).
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t far_rows = load_u8_4x2(src - 2 * stride, 3 * stride);
+  uint8x8_t near_rows = load_u8_4x2(src - stride, stride);
+
+  lpf_4_neon(&far_rows, &near_rows, *blimit, *limit, *thresh);
+
+  // Both vectors are rewritten by the filter core, so both go back to memory.
+  store_u8x4_strided_x2(src - stride, stride, near_rows);
+  store_u8x4_strided_x2(src - 2 * stride, 3 * stride, far_rows);
+}
+
+// Runs the single horizontal 4 filter twice: at s with the first parameter
+// set, then 4 columns to the right with the second parameter set.
+void aom_lpf_horizontal_4_dual_neon(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
+  uint8_t *const second_cols = s + 4;
+
+  aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_4_neon(second_cols, pitch, blimit1, limit1, thresh1);
+}
+
+// Applies the dual horizontal 4 filter twice, reusing one parameter set for
+// every segment: first at s, then 2 * MI_SIZE columns to the right.
+// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better
+// speed up.
+void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  uint8_t *const second_pair = s + 2 * MI_SIZE;
+
+  aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_4_dual_neon(second_pair, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
diff --git a/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
new file mode 100644
index 0000000000..8f65b805ec
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, // Returns sad plus the pairwise-summed |blend - s0| of 16 pixels.
+ const uint8x16_t s0, // pixels the blend is compared against
+ const uint8x16_t a0, // input weighted by m0
+ const uint8x16_t b0, // input weighted by (AOM_BLEND_A64_MAX_ALPHA - m0)
+ const uint8x16_t m0) { // per-pixel blend weights
+ uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0)); // widening multiply: m0 * a0 in 16 bits
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0)); // += m0_inv * b0
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS); // rounding shift right, narrowing back to 8 bits
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+ uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0)); // each 16-bit lane gains two adjacent absolute differences (at most 2 * 255)
+}
+
+static INLINE void masked_inv_sadwxhx4d_large_neon( // Masked SAD of src against four refs for wide blocks; second_pred is the mask-weighted blend input, each ref the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) { // h_overflow: rows accumulated in 16 bits before widening to 32 bits
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), // 32-bit running totals, one vector per reference
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height; // first flush point: min(height, h_overflow)
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = 0; // rows processed so far
+ do { // one pass per group of at most h_overflow rows
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit partial sums, reset for every group
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do { // one row per iteration
+ int j = 0;
+ do { // 32 pixels per step: sum_lo takes bytes j..j+15, sum_hi takes bytes j+16..j+31
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset + j), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset + j), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset + j), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset + j), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, p1, vld1q_u8(ref[0] + ref_offset + j + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, p1, vld1q_u8(ref[1] + ref_offset + j + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, p1, vld1q_u8(ref[2] + ref_offset + j + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, p1, vld1q_u8(ref[3] + ref_offset + j + 16), m1);
+
+ j += 32;
+ } while (j < width); // reads whole 32-byte steps, so width is expected to be a multiple of 32 (callers in this file pass 64 or 128)
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width; // second_pred rows are stepped by width rather than by a separate stride
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]); // widen this group's 16-bit partial sums into the 32-bit totals
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow; // next flush point
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u32x4
+}
+
+static INLINE void masked_inv_sad128xhx4d_neon( // Width-128 instance of masked_inv_sadwxhx4d_large_neon.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32); // width 128; 16-bit partial sums are widened every 32 rows
+}
+
+static INLINE void masked_inv_sad64xhx4d_neon( // Width-64 instance of masked_inv_sadwxhx4d_large_neon.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64); // width 64; 16-bit partial sums are widened every 64 rows
+}
+
+static INLINE void masked_sadwxhx4d_large_neon( // Masked SAD of src against four refs for wide blocks; each ref is the mask-weighted blend input, second_pred the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) { // h_overflow: rows accumulated in 16 bits before widening to 32 bits
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), // 32-bit running totals, one vector per reference
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height; // first flush point: min(height, h_overflow)
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = 0; // rows processed so far
+ do { // one pass per group of at most h_overflow rows
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit partial sums, reset for every group
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do { // one row per iteration
+ int j = 0;
+ do { // 32 pixels per step: sum_lo takes bytes j..j+15, sum_hi takes bytes j+16..j+31
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(
+ sum_lo[0], s0, vld1q_u8(ref[0] + ref_offset + j), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(
+ sum_lo[1], s0, vld1q_u8(ref[1] + ref_offset + j), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(
+ sum_lo[2], s0, vld1q_u8(ref[2] + ref_offset + j), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(
+ sum_lo[3], s0, vld1q_u8(ref[3] + ref_offset + j), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + j + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + j + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + j + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + j + 16), p1, m1);
+
+ j += 32;
+ } while (j < width); // reads whole 32-byte steps, so width is expected to be a multiple of 32 (callers in this file pass 64 or 128)
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width; // second_pred rows are stepped by width rather than by a separate stride
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]); // widen this group's 16-bit partial sums into the 32-bit totals
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow; // next flush point
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u32x4
+}
+
+static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride, // Width-128 instance of masked_sadwxhx4d_large_neon.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32); // width 128; 16-bit partial sums are widened every 32 rows
+}
+
+static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride, // Width-64 instance of masked_sadwxhx4d_large_neon.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64); // width 64; 16-bit partial sums are widened every 64 rows
+}
+
+static INLINE void masked_inv_sad32xhx4d_neon( // 32-wide masked SAD against four refs; second_pred is the mask-weighted blend input, each ref the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums for bytes 0..15 of each row, one vector per reference
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums for bytes 16..31 of each row
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(sum_hi[0], s1, p1,
+ vld1q_u8(ref[0] + ref_offset + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(sum_hi[1], s1, p1,
+ vld1q_u8(ref[1] + ref_offset + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(sum_hi[2], s1, p1,
+ vld1q_u8(ref[2] + ref_offset + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(sum_hi[3], s1, p1,
+ vld1q_u8(ref[3] + ref_offset + 16), m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32; // second_pred rows are 32 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); // the 16-bit sums are only combined here, after the last row; writes res[0..3]
+}
+
+static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride, // 32-wide masked SAD against four refs; each ref is the mask-weighted blend input, second_pred the complement-weighted one.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums for bytes 0..15 of each row, one vector per reference
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums for bytes 16..31 of each row
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + 16), p1, m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32; // second_pred rows are 32 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); // the 16-bit sums are only combined here, after the last row; writes res[0..3]
+}
+
+static INLINE void masked_inv_sad16xhx4d_neon( // 16-wide masked SAD against four refs; second_pred is the mask-weighted blend input, each ref the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4]; // filled only after the row loop
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16; // second_pred rows are 16 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]); // pairwise widen 8 x u16 into 4 x u32
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u32x4
+}
+
+static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride, // 16-wide masked SAD against four refs; each ref is the mask-weighted blend input, second_pred the complement-weighted one.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4]; // filled only after the row loop
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16; // second_pred rows are 16 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]); // pairwise widen 8 x u16 into 4 x u32
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u32x4
+}
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0, // Returns sad plus |blend - s0| for 8 pixels, one pixel per 16-bit lane.
+ const uint8x8_t a0, // input weighted by m0
+ const uint8x8_t b0, // input weighted by (AOM_BLEND_A64_MAX_ALPHA - m0)
+ const uint8x8_t m0) { // per-pixel blend weights
+ uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16 = vmull_u8(m0, a0); // widening multiply: m0 * a0 in 16 bits
+ blend_u16 = vmlal_u8(blend_u16, m0_inv, b0); // += m0_inv * b0
+
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS); // rounding shift right, narrowing back to 8 bits
+ return vabal_u8(sad, blend_u8, s0); // widening absolute-difference accumulate (at most 255 added per lane)
+}
+
+static INLINE void masked_inv_sad8xhx4d_neon( // 8-wide masked SAD against four refs; second_pred is the mask-weighted blend input, each ref the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, p0, vld1_u8(ref[0] + ref_offset), m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, p0, vld1_u8(ref[1] + ref_offset), m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, p0, vld1_u8(ref[2] + ref_offset), m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, p0, vld1_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8; // second_pred rows are 8 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u16x8
+}
+
+static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride, // 8-wide masked SAD against four refs; each ref is the mask-weighted blend input, second_pred the complement-weighted one.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h; // rows remaining; the do-while below runs at least once, so h must be >= 1
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, vld1_u8(ref[0] + ref_offset), p0, m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, vld1_u8(ref[1] + ref_offset), p0, m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, vld1_u8(ref[2] + ref_offset), p0, m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, vld1_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8; // second_pred rows are 8 bytes apart (no separate stride)
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u16x8
+}
+
+static INLINE void masked_inv_sad4xhx4d_neon( // 4-wide masked SAD against four refs, two rows per iteration; second_pred is the mask-weighted blend input, each ref the complement-weighted one.
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h / 2; // iterations of two rows each; the do-while runs at least once, so h must be >= 2 (and even for every row to be covered)
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride); // presumably 4 bytes from each of two consecutive rows - confirm in load_unaligned_u8
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred); // 8 contiguous bytes, i.e. two second_pred rows at 4 bytes per row
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, p0, r0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, p0, r1, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, p0, r2, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, p0, r3, m0);
+
+ src += 2 * src_stride; // advance every input by two rows
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u16x8
+}
+
+static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride, // 4-wide masked SAD against four refs, two rows per iteration; each ref is the mask-weighted blend input, second_pred the complement-weighted one.
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // 16-bit sums, one vector per reference
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0; // byte offset shared by all four ref pointers
+ int i = h / 2; // iterations of two rows each; the do-while runs at least once, so h must be >= 2 (and even for every row to be covered)
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride); // presumably 4 bytes from each of two consecutive rows - confirm in load_unaligned_u8
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred); // 8 contiguous bytes, i.e. two second_pred rows at 4 bytes per row
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, r0, p0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, r1, p0, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, r2, p0, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, r3, p0, m0);
+
+ src += 2 * src_stride; // advance every input by two rows
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum)); // writes res[0..3]; presumably one total per reference - confirm in horizontal_add_4d_u16x8
+}
+
+#define MASKED_SAD4D_WXH_NEON(w, h) /* Defines aom_masked_sad<w>x<h>x4d_neon, forwarding to the width-specific helper with h as the row count. */ \
+ void aom_masked_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, uint32_t res[4]) { \
+ if (invert_mask) { /* nonzero: use the helper that applies the mask weight to second_pred */ \
+ masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
+ second_pred, msk, msk_stride, res, h); \
+ } else { /* zero: use the helper that applies the mask weight to ref */ \
+ masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, res, h); \
+ } \
+ }
+
+MASKED_SAD4D_WXH_NEON(4, 8) // each line below defines one aom_masked_sad<w>x<h>x4d_neon function
+MASKED_SAD4D_WXH_NEON(4, 4)
+
+MASKED_SAD4D_WXH_NEON(8, 16)
+MASKED_SAD4D_WXH_NEON(8, 8)
+MASKED_SAD4D_WXH_NEON(8, 4)
+
+MASKED_SAD4D_WXH_NEON(16, 32)
+MASKED_SAD4D_WXH_NEON(16, 16)
+MASKED_SAD4D_WXH_NEON(16, 8)
+
+MASKED_SAD4D_WXH_NEON(32, 64)
+MASKED_SAD4D_WXH_NEON(32, 32)
+MASKED_SAD4D_WXH_NEON(32, 16)
+
+MASKED_SAD4D_WXH_NEON(64, 128)
+MASKED_SAD4D_WXH_NEON(64, 64)
+MASKED_SAD4D_WXH_NEON(64, 32)
+
+MASKED_SAD4D_WXH_NEON(128, 128)
+MASKED_SAD4D_WXH_NEON(128, 64)
+
+#if !CONFIG_REALTIME_ONLY // the sizes below are compiled out when CONFIG_REALTIME_ONLY is set
+MASKED_SAD4D_WXH_NEON(4, 16)
+MASKED_SAD4D_WXH_NEON(16, 4)
+MASKED_SAD4D_WXH_NEON(8, 32)
+MASKED_SAD4D_WXH_NEON(32, 8)
+MASKED_SAD4D_WXH_NEON(16, 64)
+MASKED_SAD4D_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/masked_sad_neon.c b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
new file mode 100644
index 0000000000..9d263105e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, // Returns sad plus the pairwise-summed |blend - src| over 16 bytes.
+ const uint8_t *src, // 16 bytes the blend is compared against
+ const uint8_t *a, // first blend input (16 bytes read)
+ const uint8_t *b, // second blend input (16 bytes read)
+ const uint8_t *m) { // blend weights (16 bytes read)
+ uint8x16_t m0 = vld1q_u8(m);
+ uint8x16_t a0 = vld1q_u8(a);
+ uint8x16_t b0 = vld1q_u8(b);
+ uint8x16_t s0 = vld1q_u8(src);
+
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0); // presumably weights a0 by m0 and b0 by its complement - confirm in blend_neon.h
+
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0)); // each 16-bit lane gains two adjacent absolute differences (at most 2 * 255)
+}
+
+static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride, // Masked SAD of a 128-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Eight accumulator vectors are required to avoid overflow in the 128x128
+ // case.
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // sad[k] accumulates the 16-byte column group starting at byte 16 * k
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do { // one row per iteration; runs at least once, so height must be >= 1
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+ sad[4] = masked_sad_16x1_neon(sad[4], &src[64], &a[64], &b[64], &m[64]);
+ sad[5] = masked_sad_16x1_neon(sad[5], &src[80], &a[80], &b[80], &m[80]);
+ sad[6] = masked_sad_16x1_neon(sad[6], &src[96], &a[96], &b[96], &m[96]);
+ sad[7] = masked_sad_16x1_neon(sad[7], &src[112], &a[112], &b[112], &m[112]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) + // the 16-bit accumulators are reduced in pairs and the four results summed
+ horizontal_long_add_u16x8(sad[2], sad[3]) +
+ horizontal_long_add_u16x8(sad[4], sad[5]) +
+ horizontal_long_add_u16x8(sad[6], sad[7]);
+}
+
+static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride, // Masked SAD of a 64-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Four accumulator vectors are required to avoid overflow in the 64x128 case.
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), // sad[k] accumulates the 16-byte column group starting at byte 16 * k
+ vdupq_n_u16(0) };
+
+ do { // one row per iteration; runs at least once, so height must be >= 1
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) + // the 16-bit accumulators are reduced in pairs and the two results summed
+ horizontal_long_add_u16x8(sad[2], sad[3]);
+}
+
+static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride, // Masked SAD of a 32-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=64 without overflow.
+ assert(height <= 64);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do { // one row per iteration; runs at least once, so height must be >= 1
+ sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]); // bytes 0..15 of the row
+ sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]); // bytes 16..31, added to the same accumulator
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride, // Masked SAD of a 16-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do { // one row per iteration; runs at least once, so height must be >= 1
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride, // Masked SAD of an 8-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do { // one row per iteration; runs at least once, so height must be >= 1
+ uint8x8_t m0 = vld1_u8(m);
+ uint8x8_t a0 = vld1_u8(a);
+ uint8x8_t b0 = vld1_u8(b);
+ uint8x8_t s0 = vld1_u8(src);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0); // presumably weights a0 by m0 and b0 by its complement - confirm in blend_neon.h
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0)); // each 16-bit lane gains two adjacent absolute differences
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride, // Masked SAD of a 4-wide block: sum of |blend(m, a, b) - src| over height rows.
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Process two rows per loop iteration.
+ assert(height % 2 == 0);
+
+ // We could use a single accumulator up to height=256 without overflow.
+ assert(height <= 256);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do { // runs at least once and steps height by 2, so height must be >= 2
+ uint8x8_t m0 = load_unaligned_u8(m, m_stride); // presumably 4 bytes from each of two consecutive rows - confirm in load_unaligned_u8
+ uint8x8_t a0 = load_unaligned_u8(a, a_stride);
+ uint8x8_t b0 = load_unaligned_u8(b, b_stride);
+ uint8x8_t s0 = load_unaligned_u8(src, src_stride);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0); // presumably weights a0 by m0 and b0 by its complement - confirm in blend_neon.h
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0)); // each 16-bit lane gains two adjacent absolute differences
+
+ src += 2 * src_stride;
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ m += 2 * m_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+#define MASKED_SAD_WXH_NEON(width, height) /* Defines aom_masked_sad<width>x<height>_neon on top of masked_sad_<width>xh_neon. */ \
+ unsigned aom_masked_sad##width##x##height##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) /* a = ref, b = second_pred; second_pred is passed with a row stride of width */ \
+ return masked_sad_##width##xh_neon(src, src_stride, ref, ref_stride, \
+ second_pred, width, msk, msk_stride, \
+ height); \
+ else /* inverted mask: the a and b inputs (and their strides) are swapped */ \
+ return masked_sad_##width##xh_neon(src, src_stride, second_pred, width, \
+ ref, ref_stride, msk, msk_stride, \
+ height); \
+ }
+
+MASKED_SAD_WXH_NEON(4, 4) // each line below defines one aom_masked_sad<width>x<height>_neon function
+MASKED_SAD_WXH_NEON(4, 8)
+MASKED_SAD_WXH_NEON(8, 4)
+MASKED_SAD_WXH_NEON(8, 8)
+MASKED_SAD_WXH_NEON(8, 16)
+MASKED_SAD_WXH_NEON(16, 8)
+MASKED_SAD_WXH_NEON(16, 16)
+MASKED_SAD_WXH_NEON(16, 32)
+MASKED_SAD_WXH_NEON(32, 16)
+MASKED_SAD_WXH_NEON(32, 32)
+MASKED_SAD_WXH_NEON(32, 64)
+MASKED_SAD_WXH_NEON(64, 32)
+MASKED_SAD_WXH_NEON(64, 64)
+MASKED_SAD_WXH_NEON(64, 128)
+MASKED_SAD_WXH_NEON(128, 64)
+MASKED_SAD_WXH_NEON(128, 128)
+#if !CONFIG_REALTIME_ONLY // the sizes below are compiled out when CONFIG_REALTIME_ONLY is set
+MASKED_SAD_WXH_NEON(4, 16)
+MASKED_SAD_WXH_NEON(16, 4)
+MASKED_SAD_WXH_NEON(8, 32)
+MASKED_SAD_WXH_NEON(32, 8)
+MASKED_SAD_WXH_NEON(16, 64)
+MASKED_SAD_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..52c7a34e3e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/mem_neon.h
@@ -0,0 +1,1253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+// Support for xN Neon intrinsics is lacking in some compilers. The macros
+// below identify those toolchains so that fallback definitions can be provided.
+// ARM_32_BIT: set when targeting 32-bit Arm (GCC/Clang `__arm__` or MSVC
+// `_M_ARM`).
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT: 32-bit Clang up to major version 6 (up to 7 when
+// __ANDROID__ is defined). DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+ (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT // This includes clang-cl.
+#endif
+
+// GCC_32_BIT: any GCC (not Clang) build targeting 32-bit Arm.
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+// Fallback definitions of the multi-vector vld1q_*_xN loads, built from
+// consecutive single-vector loads. Which ones are defined depends on the
+// compiler detected above.
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+// Load 3 consecutive 16-byte vectors starting at `ptr`.
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+
+// Load 2 consecutive 16-byte vectors starting at `ptr`.
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+// Load 2 consecutive vectors of eight uint16_t starting at `ptr`.
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+ uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+ return res;
+}
+
+// Load 4 consecutive vectors of eight uint16_t starting at `ptr`.
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+
+#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
+// GCC major versions below 8 get the x2 (u8) and x4 (u16) fallbacks.
+#if __GNUC__ < 8
+
+// Load 2 consecutive 16-byte vectors starting at `ptr`.
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+// Load 4 consecutive vectors of eight uint16_t starting at `ptr`.
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+#endif // __GNUC__ < 8
+
+// GCC major versions below 9 additionally get the x3 (u8) fallback.
+#if __GNUC__ < 9
+// Load 3 consecutive 16-byte vectors starting at `ptr`.
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+#endif // __GNUC__ < 9
+#endif // defined(__GNUC__) && !defined(__clang__)
+
+// Store two rows of 8 bytes: `s0` at `s` and `s1` at `s + p`.
+static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1) {
+  vst1_u8(s, s0);
+  // `s` is a by-value copy, so no pointer update is needed after the last row
+  // (the previous trailing `s += p` was a dead store).
+  vst1_u8(s + p, s1);
+}
+
+// Load two rows of 8 bytes (at `s` and `s + p`) into one 128-bit vector: the
+// first row fills the low half, the second row the high half.
+static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
+  const uint8x8_t row0 = vld1_u8(s);
+  const uint8x8_t row1 = vld1_u8(s + p);
+  return vcombine_u8(row0, row1);
+}
+
+// Load four bytes into the low half of a uint8x8_t, zero the upper half.
+// The bytes are read through a `const uint32_t *` cast of `p`.
+// NOTE(review): this presumably requires `p` to be suitably aligned for a
+// 32-bit access; the load_unaligned_* helpers in this file use memcpy instead.
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ return ret;
+}
+
+// Load two rows of four bytes into one uint8x8_t: the row at `p` goes to the
+// low 32-bit lane and the row at `p + stride` to the high 32-bit lane.
+// NOTE(review): both rows are read through a `const uint32_t *` cast, so the
+// row addresses presumably need to be suitably aligned - confirm with callers.
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ p += stride;
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+ return ret;
+}
+
+// Load two rows of two uint16_t into one uint16x4_t: the row at `p` goes to
+// the low 32-bit lane and the row at `p + stride` (stride in uint16_t
+// elements) to the high 32-bit lane.
+// NOTE(review): both rows are read through a `const uint32_t *` cast, so the
+// row addresses presumably need to be suitably aligned - confirm with callers.
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+ uint16x4_t ret = vdup_n_u16(0);
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+ p += stride;
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
+ return ret;
+}
+
+// Load 8 rows of 8 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  *s1 = vld1_u8(s + p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+  *s7 = vld1_u8(s + 7 * p);
+}
+
+// Load 7 rows of 8 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6) {
+  *s0 = vld1_u8(s);
+  *s1 = vld1_u8(s + p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+}
+
+// Load 4 rows of 8 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  *s1 = vld1_u8(s + p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+}
+
+// Load 4 rows of four uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3) {
+  // Indexed addressing: `s` is a by-value copy, so the previous pointer bump
+  // after the final row was a dead store and has been dropped.
+  *s0 = vld1_u16(s);
+  *s1 = vld1_u16(s + p);
+  *s2 = vld1_u16(s + 2 * p);
+  *s3 = vld1_u16(s + 3 * p);
+}
+
+// Load 7 rows of four uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4, uint16x4_t *const s5,
+                                uint16x4_t *const s6) {
+  *s0 = vld1_u16(s);
+  *s1 = vld1_u16(s + p);
+  *s2 = vld1_u16(s + 2 * p);
+  *s3 = vld1_u16(s + 3 * p);
+  *s4 = vld1_u16(s + 4 * p);
+  *s5 = vld1_u16(s + 5 * p);
+  *s6 = vld1_u16(s + 6 * p);
+}
+
+// Load 2 rows of eight int16_t from `s` and `s + p` (p counted in int16_t
+// elements) into `*s0` and `*s1`.
+static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+}
+
+// Load 2 rows of eight uint16_t from `s` and `s + p` (p counted in uint16_t
+// elements) into `*s0` and `*s1`.
+static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1) {
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+}
+
+// Load 4 rows of eight uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3) {
+  // Indexed addressing: `s` is a by-value copy, so the previous pointer bump
+  // after the final row was a dead store and has been dropped.
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+}
+
+// Load 12 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+                                 int16x4_t *const s0, int16x4_t *const s1,
+                                 int16x4_t *const s2, int16x4_t *const s3,
+                                 int16x4_t *const s4, int16x4_t *const s5,
+                                 int16x4_t *const s6, int16x4_t *const s7,
+                                 int16x4_t *const s8, int16x4_t *const s9,
+                                 int16x4_t *const s10, int16x4_t *const s11) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+  *s6 = vld1_s16(s + 6 * p);
+  *s7 = vld1_s16(s + 7 * p);
+  *s8 = vld1_s16(s + 8 * p);
+  *s9 = vld1_s16(s + 9 * p);
+  *s10 = vld1_s16(s + 10 * p);
+  *s11 = vld1_s16(s + 11 * p);
+}
+
+// Load 11 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
+                                 int16x4_t *const s0, int16x4_t *const s1,
+                                 int16x4_t *const s2, int16x4_t *const s3,
+                                 int16x4_t *const s4, int16x4_t *const s5,
+                                 int16x4_t *const s6, int16x4_t *const s7,
+                                 int16x4_t *const s8, int16x4_t *const s9,
+                                 int16x4_t *const s10) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+  *s6 = vld1_s16(s + 6 * p);
+  *s7 = vld1_s16(s + 7 * p);
+  *s8 = vld1_s16(s + 8 * p);
+  *s9 = vld1_s16(s + 9 * p);
+  *s10 = vld1_s16(s + 10 * p);
+}
+
+// Load 11 rows of four uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
+                                 uint16x4_t *const s0, uint16x4_t *const s1,
+                                 uint16x4_t *const s2, uint16x4_t *const s3,
+                                 uint16x4_t *const s4, uint16x4_t *const s5,
+                                 uint16x4_t *const s6, uint16x4_t *const s7,
+                                 uint16x4_t *const s8, uint16x4_t *const s9,
+                                 uint16x4_t *const s10) {
+  *s0 = vld1_u16(s);
+  *s1 = vld1_u16(s + p);
+  *s2 = vld1_u16(s + 2 * p);
+  *s3 = vld1_u16(s + 3 * p);
+  *s4 = vld1_u16(s + 4 * p);
+  *s5 = vld1_u16(s + 5 * p);
+  *s6 = vld1_u16(s + 6 * p);
+  *s7 = vld1_u16(s + 7 * p);
+  *s8 = vld1_u16(s + 8 * p);
+  *s9 = vld1_u16(s + 9 * p);
+  *s10 = vld1_u16(s + 10 * p);
+}
+
+// Load 8 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6, int16x4_t *const s7) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+  *s6 = vld1_s16(s + 6 * p);
+  *s7 = vld1_s16(s + 7 * p);
+}
+
+// Load 7 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+  *s6 = vld1_s16(s + 6 * p);
+}
+
+// Load 6 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+}
+
+// Load 5 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+}
+
+// Load 5 rows of four uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3,
+                                uint16x4_t *const s4) {
+  // Indexed addressing: `s` is a by-value copy, so the previous pointer bump
+  // after the final row was a dead store and has been dropped.
+  *s0 = vld1_u16(s);
+  *s1 = vld1_u16(s + p);
+  *s2 = vld1_u16(s + 2 * p);
+  *s3 = vld1_u16(s + 3 * p);
+  *s4 = vld1_u16(s + 4 * p);
+}
+
+// Load 5 rows of 8 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4) {
+  *s0 = vld1_u8(s);
+  *s1 = vld1_u8(s + p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+}
+
+// Load 5 rows of eight uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3,
+                                uint16x8_t *const s4) {
+  // Indexed addressing: `s` is a by-value copy, so the previous pointer bump
+  // after the final row was a dead store and has been dropped.
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+  *s4 = vld1q_u16(s + 4 * p);
+}
+
+// Load 4 rows of four int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3) {
+  *s0 = vld1_s16(s);
+  *s1 = vld1_s16(s + p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+}
+
+// Store 8 rows of 8 bytes; `s<i>` is written to `s + i * p`.
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3, const uint8x8_t s4,
+                                const uint8x8_t s5, const uint8x8_t s6,
+                                const uint8x8_t s7) {
+  vst1_u8(s, s0);
+  vst1_u8(s + p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+  vst1_u8(s + 4 * p, s4);
+  vst1_u8(s + 5 * p, s5);
+  vst1_u8(s + 6 * p, s6);
+  vst1_u8(s + 7 * p, s7);
+}
+
+// Store 4 rows of 8 bytes; `s<i>` is written to `s + i * p`.
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3) {
+  vst1_u8(s, s0);
+  vst1_u8(s + p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+}
+
+// Store 4 rows of 16 bytes; `s<i>` is written to `s + i * p`.
+static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+                                 const uint8x16_t s1, const uint8x16_t s2,
+                                 const uint8x16_t s3) {
+  vst1q_u8(s, s0);
+  vst1q_u8(s + p, s1);
+  vst1q_u8(s + 2 * p, s2);
+  vst1q_u8(s + 3 * p, s3);
+}
+
+// Store 8 rows of eight uint16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in uint16_t elements).
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3,
+                                 const uint16x8_t s4, const uint16x8_t s5,
+                                 const uint16x8_t s6, const uint16x8_t s7) {
+  vst1q_u16(s, s0);
+  vst1q_u16(s + dst_stride, s1);
+  vst1q_u16(s + 2 * dst_stride, s2);
+  vst1q_u16(s + 3 * dst_stride, s3);
+  vst1q_u16(s + 4 * dst_stride, s4);
+  vst1q_u16(s + 5 * dst_stride, s5);
+  vst1q_u16(s + 6 * dst_stride, s6);
+  vst1q_u16(s + 7 * dst_stride, s7);
+}
+
+// Store 4 rows of four uint16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in uint16_t elements).
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2, const uint16x4_t s3) {
+  vst1_u16(s, s0);
+  vst1_u16(s + dst_stride, s1);
+  vst1_u16(s + 2 * dst_stride, s2);
+  vst1_u16(s + 3 * dst_stride, s3);
+}
+
+// Store 2 rows of eight uint16_t: `s0` at `s` and `s1` at `s + dst_stride`
+// (stride counted in uint16_t elements).
+static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1) {
+  uint16_t *const row1 = s + dst_stride;
+  vst1q_u16(s, s0);
+  vst1q_u16(row1, s1);
+}
+
+// Store 4 rows of eight uint16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in uint16_t elements).
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3) {
+  vst1q_u16(s, s0);
+  vst1q_u16(s + dst_stride, s1);
+  vst1q_u16(s + 2 * dst_stride, s2);
+  vst1q_u16(s + 3 * dst_stride, s3);
+}
+
+// Store 8 rows of eight int16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in int16_t elements).
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3,
+                                 const int16x8_t s4, const int16x8_t s5,
+                                 const int16x8_t s6, const int16x8_t s7) {
+  vst1q_s16(s, s0);
+  vst1q_s16(s + dst_stride, s1);
+  vst1q_s16(s + 2 * dst_stride, s2);
+  vst1q_s16(s + 3 * dst_stride, s3);
+  vst1q_s16(s + 4 * dst_stride, s4);
+  vst1q_s16(s + 5 * dst_stride, s5);
+  vst1q_s16(s + 6 * dst_stride, s6);
+  vst1q_s16(s + 7 * dst_stride, s7);
+}
+
+// Store 4 rows of four int16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in int16_t elements).
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x4_t s0, const int16x4_t s1,
+                                 const int16x4_t s2, const int16x4_t s3) {
+  vst1_s16(s, s0);
+  vst1_s16(s + dst_stride, s1);
+  vst1_s16(s + 2 * dst_stride, s2);
+  vst1_s16(s + 3 * dst_stride, s3);
+}
+
+// Store 4 rows of eight int16_t; `s<i>` is written to `s + i * dst_stride`
+// (stride counted in int16_t elements).
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3) {
+  vst1q_s16(s, s0);
+  vst1q_s16(s + dst_stride, s1);
+  vst1q_s16(s + 2 * dst_stride, s2);
+  vst1q_s16(s + 3 * dst_stride, s3);
+}
+
+// Load 11 rows of 8 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
+                                uint8x8_t *const s0, uint8x8_t *const s1,
+                                uint8x8_t *const s2, uint8x8_t *const s3,
+                                uint8x8_t *const s4, uint8x8_t *const s5,
+                                uint8x8_t *const s6, uint8x8_t *const s7,
+                                uint8x8_t *const s8, uint8x8_t *const s9,
+                                uint8x8_t *const s10) {
+  *s0 = vld1_u8(s);
+  *s1 = vld1_u8(s + p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+  *s7 = vld1_u8(s + 7 * p);
+  *s8 = vld1_u8(s + 8 * p);
+  *s9 = vld1_u8(s + 9 * p);
+  *s10 = vld1_u8(s + 10 * p);
+}
+
+// Load 10 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+  *s7 = vld1q_s16(s + 7 * p);
+  *s8 = vld1q_s16(s + 8 * p);
+  *s9 = vld1q_s16(s + 9 * p);
+}
+
+// Load 11 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9,
+                                 int16x8_t *const s10) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+  *s7 = vld1q_s16(s + 7 * p);
+  *s8 = vld1q_s16(s + 8 * p);
+  *s9 = vld1q_s16(s + 9 * p);
+  *s10 = vld1q_s16(s + 10 * p);
+}
+
+// Load 12 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+                                 int16x8_t *const s0, int16x8_t *const s1,
+                                 int16x8_t *const s2, int16x8_t *const s3,
+                                 int16x8_t *const s4, int16x8_t *const s5,
+                                 int16x8_t *const s6, int16x8_t *const s7,
+                                 int16x8_t *const s8, int16x8_t *const s9,
+                                 int16x8_t *const s10, int16x8_t *const s11) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+  *s7 = vld1q_s16(s + 7 * p);
+  *s8 = vld1q_s16(s + 8 * p);
+  *s9 = vld1q_s16(s + 9 * p);
+  *s10 = vld1q_s16(s + 10 * p);
+  *s11 = vld1q_s16(s + 11 * p);
+}
+
+// Load 11 rows of eight uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
+                                 uint16x8_t *const s0, uint16x8_t *const s1,
+                                 uint16x8_t *const s2, uint16x8_t *const s3,
+                                 uint16x8_t *const s4, uint16x8_t *const s5,
+                                 uint16x8_t *const s6, uint16x8_t *const s7,
+                                 uint16x8_t *const s8, uint16x8_t *const s9,
+                                 uint16x8_t *const s10) {
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+  *s4 = vld1q_u16(s + 4 * p);
+  *s5 = vld1q_u16(s + 5 * p);
+  *s6 = vld1q_u16(s + 6 * p);
+  *s7 = vld1q_u16(s + 7 * p);
+  *s8 = vld1q_u16(s + 8 * p);
+  *s9 = vld1q_u16(s + 9 * p);
+  *s10 = vld1q_u16(s + 10 * p);
+}
+
+// Load 8 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5,
+                                int16x8_t *const s6, int16x8_t *const s7) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+  *s7 = vld1q_s16(s + 7 * p);
+}
+
+// Load 7 rows of eight uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3,
+                                uint16x8_t *const s4, uint16x8_t *const s5,
+                                uint16x8_t *const s6) {
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+  *s4 = vld1q_u16(s + 4 * p);
+  *s5 = vld1q_u16(s + 5 * p);
+  *s6 = vld1q_u16(s + 6 * p);
+}
+
+// Load 7 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5,
+                                int16x8_t *const s6) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+}
+
+// Load 6 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+}
+
+// Load 5 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+}
+
+// Load 4 rows of eight int16_t; row i is read from `s + i * p` (p counted in
+// int16_t elements) into `*s<i>`.
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3) {
+  *s0 = vld1q_s16(s);
+  *s1 = vld1q_s16(s + p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed: the 4 bytes at
+// `buf` fill the low 32-bit lane and the 4 bytes at `buf + stride` the high
+// 32-bit lane. memcpy is used so no aligned 32-bit access is required.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+  uint32_t row0, row1;
+  memcpy(&row0, buf, 4);
+  memcpy(&row1, buf + stride, 4);
+  // Broadcast row 0 to both lanes, then overwrite lane 1 with row 1.
+  return vreinterpret_u8_u32(vset_lane_u32(row1, vdup_n_u32(row0), 1));
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed. Row i (4 bytes at
+// `buf + i * stride`) ends up in 32-bit lane i of the result.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t row0, row1, row2, row3;
+  uint32x4_t rows_u32;
+  // With a stride of 4 the rows are contiguous, so one 16-byte load suffices.
+  if (stride == 4) return vld1q_u8(buf);
+  const ptrdiff_t step = stride;
+  memcpy(&row0, buf, 4);
+  memcpy(&row1, buf + step, 4);
+  memcpy(&row2, buf + 2 * step, 4);
+  memcpy(&row3, buf + 3 * step, 4);
+  // Broadcast row 0, then replace lanes 1..3 with the remaining rows.
+  rows_u32 = vdupq_n_u32(row0);
+  rows_u32 = vsetq_lane_u32(row1, rows_u32, 1);
+  rows_u32 = vsetq_lane_u32(row2, rows_u32, 2);
+  rows_u32 = vsetq_lane_u32(row3, rows_u32, 3);
+  return vreinterpretq_u8_u32(rows_u32);
+}
+
+// Load 2 rows of 2 bytes without alignment requirements. The 2 bytes at `buf`
+// are broadcast to every 16-bit lane, then lane 1 is replaced with the 2 bytes
+// at `buf + stride`.
+static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
+  uint16_t row0, row1;
+  memcpy(&row0, buf, 2);
+  memcpy(&row1, buf + stride, 2);
+  return vreinterpret_u8_u16(vset_lane_u16(row1, vdup_n_u16(row0), 1));
+}
+
+// Load 4 bytes from `buf` (no alignment requirement) into the low 32-bit lane
+// of the result; the high 32-bit lane is zero.
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+  uint32_t row;
+  memcpy(&row, buf, 4);
+  return vreinterpret_u8_u32(vset_lane_u32(row, vdup_n_u32(0), 0));
+}
+
+// Load 4 bytes from `buf` (no alignment requirement) and duplicate them into
+// both 32-bit lanes of the result.
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+  uint32_t row;
+  memcpy(&row, buf, 4);
+  return vreinterpret_u8_u32(vdup_n_u32(row));
+}
+
+// Load 2 bytes from `buf` (no alignment requirement) and duplicate them into
+// all four 16-bit lanes of the result.
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+  uint16_t pair;
+  memcpy(&pair, buf, 2);
+  const uint16x4_t pair_u16 = vdup_n_u16(pair);
+  return vreinterpret_u8_u16(pair_u16);
+}
+
+// Load 2 rows of 4 bytes without alignment requirements: the row at `buf`
+// fills the low 32-bit lane and the row at `buf + stride` the high lane.
+// This is the same operation as load_unaligned_u8(), so delegate to it.
+static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
+  return load_unaligned_u8(buf, stride);
+}
+
+// Load 4 rows of 4 bytes without alignment requirements: rows 0-1 go to
+// `*tu0` and rows 2-3 to `*tu1`.
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+                                         uint8x8_t *tu0, uint8x8_t *tu1) {
+  const uint8_t *const rows23 = buf + 2 * stride;
+  *tu0 = load_unaligned_u8_4x2(buf, stride);
+  *tu1 = load_unaligned_u8_4x2(rows23, stride);
+}
+
+// Load 6 rows of 4 bytes without alignment requirements into three vectors of
+// two rows each: rows 0-1 -> `*tu0`, rows 2-3 -> `*tu1`, rows 4-5 -> `*tu2`.
+static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
+                                         uint8x8_t *tu0, uint8x8_t *tu1,
+                                         uint8x8_t *tu2) {
+  const uint8_t *const rows45 = buf + 4 * stride;
+  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+  *tu2 = load_unaligned_u8_4x2(rows45, stride);
+}
+
+// Load 8 rows of 4 bytes without alignment requirements into four vectors of
+// two rows each (`*tu0` .. `*tu3`, in row order).
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+                                         uint8x8_t *tu0, uint8x8_t *tu1,
+                                         uint8x8_t *tu2, uint8x8_t *tu3) {
+  const uint8_t *const lower_half = buf + 4 * stride;
+  load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+  load_unaligned_u8_4x4(lower_half, stride, tu2, tu3);
+}
+
+// Load 8 rows of 16 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s);
+  *s1 = vld1q_u8(s + p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+  *s4 = vld1q_u8(s + 4 * p);
+  *s5 = vld1q_u8(s + 5 * p);
+  *s6 = vld1q_u8(s + 6 * p);
+  *s7 = vld1q_u8(s + 7 * p);
+}
+
+// Load 4 rows of 16 bytes; row i is read from `s + i * p` into `*s<i>`.
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  *s1 = vld1q_u8(s + p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+}
+
+// Load 8 rows of eight uint16_t; row i is read from `s + i * p` (p counted in
+// uint16_t elements) into `*s<i>`.
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+  *s4 = vld1q_u16(s + 4 * p);
+  *s5 = vld1q_u16(s + 5 * p);
+  *s6 = vld1q_u16(s + 6 * p);
+  *s7 = vld1q_u16(s + 7 * p);
+}
+
+// Load 4 rows of sixteen uint16_t. Each row (p uint16_t elements apart) is
+// split over two vectors: row r goes to `*s<2r>` (elements 0-7) and
+// `*s<2r+1>` (elements 8-15).
+static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
+                                 uint16x8_t *const s0, uint16x8_t *const s1,
+                                 uint16x8_t *const s2, uint16x8_t *const s3,
+                                 uint16x8_t *const s4, uint16x8_t *const s5,
+                                 uint16x8_t *const s6, uint16x8_t *const s7) {
+  const uint16_t *const row1 = s + p;
+  const uint16_t *const row2 = s + 2 * p;
+  const uint16_t *const row3 = s + 3 * p;
+  *s0 = vld1q_u16(s);
+  *s1 = vld1q_u16(s + 8);
+  *s2 = vld1q_u16(row1);
+  *s3 = vld1q_u16(row1 + 8);
+  *s4 = vld1q_u16(row2);
+  *s5 = vld1q_u16(row2 + 8);
+  *s6 = vld1q_u16(row3);
+  *s7 = vld1q_u16(row3 + 8);
+}
+
+// Load 2 rows of two uint16_t without alignment requirements: the row at
+// `buf` fills the low 32-bit lane and the row at `buf + stride` (stride in
+// uint16_t elements) the high 32-bit lane.
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+                                                int stride) {
+  uint32_t row0, row1;
+  memcpy(&row0, buf, 4);
+  memcpy(&row1, buf + stride, 4);
+  return vreinterpret_u16_u32(vset_lane_u32(row1, vdup_n_u32(row0), 1));
+}
+
+// Load four uint16_t (8 bytes) from `buf` without alignment requirements.
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+  uint64_t row;
+  memcpy(&row, buf, 8);
+  return vreinterpret_u16_u64(vset_lane_u64(row, vdup_n_u64(0), 0));
+}
+
+// Load 2 rows of four uint16_t without alignment requirements: the row at
+// `buf` fills the low 64-bit lane and the row at `buf + stride` (stride in
+// uint16_t elements) the high 64-bit lane.
+static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
+                                                uint32_t stride) {
+  uint64_t row0, row1;
+  uint64x2_t rows_u64;
+  memcpy(&row0, buf, 8);
+  // `buf` is a by-value copy, so it is not advanced past the second row (the
+  // previous trailing `buf += stride` was a dead store).
+  memcpy(&row1, buf + stride, 8);
+  rows_u64 = vsetq_lane_u64(row0, vdupq_n_u64(0), 0);
+  rows_u64 = vsetq_lane_u64(row1, rows_u64, 1);
+  return vreinterpretq_u16_u64(rows_u64);
+}
+
+// Load 4 rows of four uint16_t without alignment requirements: rows 0-1 go to
+// `*tu0` and rows 2-3 to `*tu1`.
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+                                          uint16x8_t *tu0, uint16x8_t *tu1) {
+  const uint16_t *const rows23 = buf + 2 * stride;
+  *tu0 = load_unaligned_u16_4x2(buf, stride);
+  *tu1 = load_unaligned_u16_4x2(rows23, stride);
+}
+
+// Load 4 rows of four int32_t; rows are `p` int32_t elements apart and are
+// written, in order, to `*s1` .. `*s4`.
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
+                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+  const ptrdiff_t step = p;
+  *s1 = vld1q_s32(s);
+  *s2 = vld1q_s32(s + step);
+  *s3 = vld1q_s32(s + 2 * step);
+  *s4 = vld1q_s32(s + 3 * step);
+}
+
+// Store 4 rows of four int32_t; `s1` .. `s4` are written, in order, to rows
+// that are `p` int32_t elements apart starting at `s`.
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+  const ptrdiff_t step = p;
+  vst1q_s32(s, s1);
+  vst1q_s32(s + step, s2);
+  vst1q_s32(s + 2 * step, s3);
+  vst1q_s32(s + 3 * step, s4);
+}
+
+// Load 4 rows of four uint32_t; rows are `p` uint32_t elements apart and are
+// written, in order, to `*s1` .. `*s4`.
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
+                                uint32x4_t *s2, uint32x4_t *s3,
+                                uint32x4_t *s4) {
+  const ptrdiff_t step = p;
+  *s1 = vld1q_u32(s);
+  *s2 = vld1q_u32(s + step);
+  *s3 = vld1q_u32(s + 2 * step);
+  *s4 = vld1q_u32(s + 3 * step);
+}
+
+// Store 4 rows of four uint32_t; `s1` .. `s4` are written, in order, to rows
+// that are `p` uint32_t elements apart starting at `s`.
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+  const ptrdiff_t step = p;
+  vst1q_u32(s, s1);
+  vst1q_u32(s + step, s2);
+  vst1q_u32(s + 2 * step, s3);
+  vst1q_u32(s + 3 * step, s4);
+}
+
+// Load eight consecutive coefficients from `buf` with 32-bit vector loads and
+// narrow each to 16 bits (vmovn keeps the low 16 bits of every lane).
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+  const int16x4_t lo = vmovn_s32(vld1q_s32(buf));
+  const int16x4_t hi = vmovn_s32(vld1q_s32(buf + 4));
+  return vcombine_s16(lo, hi);
+}
+
+// Sign-extend the eight 16-bit lanes of `a` to 32 bits and store them as
+// eight consecutive coefficients at `buf`.
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+  vst1q_s32(buf, vmovl_s16(vget_low_s16(a)));
+  vst1q_s32(buf + 4, vmovl_s16(vget_high_s16(a)));
+}
+
+// Sign-extend the four 16-bit lanes of `a` to 32 bits and store them as four
+// consecutive coefficients at `buf`.
+static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
+  vst1q_s32(buf, vmovl_s16(a));
+}
+
+// Gather eight bytes: lane i of the result is `src[indices[i]]`, where the
+// offsets are the signed 16-bit lanes of `indices`.
+static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
+                                              int16x8_t indices) {
+  const int16x4_t idx_lo = vget_low_s16(indices);
+  const int16x4_t idx_hi = vget_high_s16(indices);
+  // Recent Clang and GCC versions recognise that this zero-broadcast is
+  // redundant. Loading and broadcasting element zero and then replacing the
+  // other lanes would also work, but is slower than a single-element load
+  // without broadcast on some micro-architectures.
+  uint8x8_t gathered = vdup_n_u8(0);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 0), gathered, 0);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 1), gathered, 1);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 2), gathered, 2);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_lo, 3), gathered, 3);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 0), gathered, 4);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 1), gathered, 5);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 2), gathered, 6);
+  gathered = vld1_lane_u8(src + vget_lane_s16(idx_hi, 3), gathered, 7);
+  return gathered;
+}
+
+// Helper macros that extract one lane from a vector and copy it to `dst` with
+// memcpy, so `dst` needs no particular alignment. They are #undef'd once the
+// store helpers that use them have been defined.
+// The `lane` parameter here must be an immediate.
+// Store 2 bytes: 16-bit lane `lane` of the uint8x8_t `src`.
+#define store_u8_2x1_lane(dst, src, lane) \
+ do { \
+ uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+ memcpy(dst, &a, 2); \
+ } while (0)
+
+// Store 4 bytes: 32-bit lane `lane` of the uint8x8_t `src`.
+#define store_u8_4x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+// Store two uint16_t: 32-bit lane `lane` of the uint16x4_t `src`.
+#define store_u16_2x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+// Store four uint16_t: 64-bit lane `lane` of the uint16x8_t `src`.
+#define store_u16_4x1_lane(dst, src, lane) \
+ do { \
+ uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
+
+// Store the low 16 bits (first two bytes) of `src` to `dst`; memcpy is used so
+// `dst` needs no particular alignment.
+static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
+  const uint16_t low16 = vget_lane_u16(vreinterpret_u16_u8(src), 0);
+  memcpy(dst, &low16, 2);
+}
+
+// Store the low 32 bits (first four bytes) of `src` to `dst`; memcpy is used
+// so `dst` needs no particular alignment.
+static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
+  const uint32_t low32 = vget_lane_u32(vreinterpret_u32_u8(src), 0);
+  memcpy(dst, &low32, 4);
+}
+
+// Store two blocks of 16 bits from a single vector: 16-bit lane 0 of `src`
+// goes to `dst` and lane 1 to `dst + dst_stride`.
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
+                                         uint8x8_t src) {
+  uint8_t *const row1 = dst + dst_stride;
+  store_u8_2x1_lane(dst, src, 0);
+  store_u8_2x1_lane(row1, src, 1);
+}
+
+// Store two blocks of 32 bits from a single vector: 32-bit lane 0 of `src`
+// goes to `dst` and lane 1 to `dst + stride`.
+static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
+                                         uint8x8_t src) {
+  uint8_t *const row1 = dst + stride;
+  store_u8_4x1_lane(dst, src, 0);
+  store_u8_4x1_lane(row1, src, 1);
+}
+
+// Store four blocks of 32 bits from a single vector: 32-bit lane i of `src`
+// is written to `dst + i * stride`.
+static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
+                                         uint8x16_t src) {
+  const uint8x8_t lo = vget_low_u8(src);
+  const uint8x8_t hi = vget_high_u8(src);
+  uint8_t *const row1 = dst + stride;
+  uint8_t *const row2 = dst + 2 * stride;
+  uint8_t *const row3 = dst + 3 * stride;
+  store_u8_4x1_lane(dst, lo, 0);
+  store_u8_4x1_lane(row1, lo, 1);
+  store_u8_4x1_lane(row2, hi, 0);
+  store_u8_4x1_lane(row3, hi, 1);
+}
+
+// Store the low 32 bits (first two uint16_t) of `src` to `dst`; memcpy is used
+// so `dst` needs no 32-bit alignment.
+static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
+  const uint32_t low32 = vget_lane_u32(vreinterpret_u32_u16(src), 0);
+  memcpy(dst, &low32, 4);
+}
+
+// Store two blocks of 32 bits from a single vector: 32-bit lane 0 of `src`
+// goes to `dst` and lane 1 to `dst + dst_stride` (stride in uint16_t
+// elements).
+static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
+                                          uint16x4_t src) {
+  uint16_t *const row1 = dst + dst_stride;
+  store_u16_2x1_lane(dst, src, 0);
+  store_u16_2x1_lane(row1, src, 1);
+}
+
+// Store two blocks of 64 bits from a single vector: 64-bit lane 0 of `src`
+// goes to `dst` and lane 1 to `dst + dst_stride` (stride in uint16_t
+// elements).
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+                                          uint16x8_t src) {
+  uint16_t *const row1 = dst + dst_stride;
+  store_u16_4x1_lane(dst, src, 0);
+  store_u16_4x1_lane(row1, src, 1);
+}
+
+#undef store_u8_2x1_lane
+#undef store_u8_4x1_lane
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
+#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
new file mode 100644
index 0000000000..a692cbb388
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask,
+                                         const int32_t *wsrc, uint32x4_t *sum) {
+  // Keep the even-indexed 16-bit elements of the eight 32-bit mask values so
+  // that one widening 16x16->32 multiply per half produces ref * mask.
+  const int16x8_t mask_a = vreinterpretq_s16_s32(vld1q_s32(mask));
+  const int16x8_t mask_b = vreinterpretq_s16_s32(vld1q_s32(mask + 4));
+  const int16x8_t m16 = vuzpq_s16(mask_a, mask_b).val[0];
+
+  const int32x4_t pred0 = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(m16));
+  const int32x4_t pred1 = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(m16));
+
+  // Absolute difference between the wsrc values and the ref * mask products.
+  const int32x4_t ad0 = vabdq_s32(vld1q_s32(wsrc), pred0);
+  const int32x4_t ad1 = vabdq_s32(vld1q_s32(wsrc + 4), pred1);
+
+  // Each difference is shifted right by 12 with rounding as it is accumulated;
+  // both halves feed the same accumulator.
+  uint32x4_t acc = *sum;
+  acc = vrsraq_n_u32(acc, vreinterpretq_u32_s32(ad0), 12);
+  *sum = vrsraq_n_u32(acc, vreinterpretq_u32_s32(ad1), 12);
+}
+
+#if AOM_ARCH_AARCH64
+
+// tbl indices for a double-width zero extension from 8->32 bits in a single
+// instruction rather than two: each group of four selects source byte i and
+// then three out-of-range indices (255 here), which tbl sets to zero.
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
+
+static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo,
+                                         uint32x4_t ref_u32_hi,
+                                         const int32_t *mask,
+                                         const int32_t *wsrc,
+                                         uint32x4_t sum[2]) {
+  // Product ref * mask for elements 0-3 and 4-7, computed on 32-bit lanes.
+  const int32x4_t pred0 =
+      vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), vld1q_s32(mask));
+  const int32x4_t pred1 =
+      vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), vld1q_s32(mask + 4));
+
+  // Absolute difference against the matching wsrc values.
+  const int32x4_t ad0 = vabdq_s32(vld1q_s32(wsrc), pred0);
+  const int32x4_t ad1 = vabdq_s32(vld1q_s32(wsrc + 4), pred1);
+
+  // Rounding shift right by 12 while accumulating, one accumulator per half.
+  sum[0] = vrsraq_n_u32(sum[0], vreinterpretq_u32_s32(ad0), 12);
+  sum[1] = vrsraq_n_u32(sum[1], vreinterpretq_u32_s32(ad1), 12);
+}
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+                                               int ref_stride,
+                                               const int32_t *wsrc,
+                                               const int32_t *mask, int width,
+                                               int height) {
+  // Index vectors for tbl: each one zero-extends four consecutive bytes of a
+  // 16-byte load to 32 bits in one instruction instead of two.
+  const uint8x16_t idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+  const uint8x16_t idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+  const uint8x16_t idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+  const uint8x16_t idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+  // acc[0] collects elements 0-3 of every group of eight, acc[1] elements
+  // 4-7; the two are added together at the end.
+  uint32x4_t acc[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int rows_left = height;
+  do {
+    int x = 0;
+    do {
+      // Consume 16 reference pixels as two groups of eight.
+      const uint8x16_t px = vld1q_u8(ref + x);
+      obmc_sad_8x1_s32_neon(vreinterpretq_u32_u8(vqtbl1q_u8(px, idx0)),
+                            vreinterpretq_u32_u8(vqtbl1q_u8(px, idx1)), mask,
+                            wsrc, acc);
+      obmc_sad_8x1_s32_neon(vreinterpretq_u32_u8(vqtbl1q_u8(px, idx2)),
+                            vreinterpretq_u32_u8(vqtbl1q_u8(px, idx3)),
+                            mask + 8, wsrc + 8, acc);
+
+      // wsrc and mask are walked contiguously; only ref uses a stride.
+      wsrc += 16;
+      mask += 16;
+      x += 16;
+    } while (x != width);
+
+    ref += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc[0], acc[1]));
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+                                               int ref_stride,
+                                               const int32_t *wsrc,
+                                               const int32_t *mask, int width,
+                                               int height) {
+  // This non-AArch64 path widens 8->16 bits with vmovl_u8 instead of using
+  // tbl, and runs the 16-bit kernel into a single accumulator vector.
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    int x = 0;
+    do {
+      // Consume 16 reference pixels as two zero-extended groups of eight.
+      const uint8x16_t px = vld1q_u8(ref + x);
+      const uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(px));
+      const uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(px));
+      obmc_sad_8x1_s16_neon(vreinterpretq_s16_u16(lo_u16), mask, wsrc, &acc);
+      obmc_sad_8x1_s16_neon(vreinterpretq_s16_u16(hi_u16), mask + 8, wsrc + 8,
+                            &acc);
+
+      wsrc += 16;
+      mask += 16;
+      x += 16;
+    } while (x != width);
+
+    ref += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);  // width fixed at 128, height = h
+}
+
+static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);  // width fixed at 64, height = h
+}
+
+static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h);  // width fixed at 32, height = h
+}
+
+static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);  // width fixed at 16, height = h
+}
+
+static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
+                                             const int32_t *wsrc,
+                                             const int32_t *mask, int height) {
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  // One row of eight pixels per iteration; wsrc and mask advance contiguously.
+  int rows_left = height;
+  do {
+    // Zero-extend the row to 16 bits for the widening-multiply kernel.
+    const uint16x8_t row_u16 = vmovl_u8(vld1_u8(ref));
+    obmc_sad_8x1_s16_neon(vreinterpretq_s16_u16(row_u16), mask, wsrc, &acc);
+
+    ref += ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
+                                             const int32_t *wsrc,
+                                             const int32_t *mask, int height) {
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  // Each iteration steps ref by 2 * ref_stride and consumes eight wsrc/mask
+  // entries, so the loop runs height / 2 times.
+  int pairs_left = height / 2;
+  do {
+    const uint16x8_t rows_u16 = vmovl_u8(load_unaligned_u8(ref, ref_stride));
+    obmc_sad_8x1_s16_neon(vreinterpretq_s16_u16(rows_u16), mask, wsrc, &acc);
+
+    ref += 2 * ref_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--pairs_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+#define OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+// Each use below defines aom_obmc_sad<w>x<h>_neon, which forwards to obmc_sad_<w>xh_neon with height h.
+OBMC_SAD_WXH_NEON(4, 4)
+OBMC_SAD_WXH_NEON(4, 8)
+OBMC_SAD_WXH_NEON(4, 16)
+
+OBMC_SAD_WXH_NEON(8, 4)
+OBMC_SAD_WXH_NEON(8, 8)
+OBMC_SAD_WXH_NEON(8, 16)
+OBMC_SAD_WXH_NEON(8, 32)
+
+OBMC_SAD_WXH_NEON(16, 4)
+OBMC_SAD_WXH_NEON(16, 8)
+OBMC_SAD_WXH_NEON(16, 16)
+OBMC_SAD_WXH_NEON(16, 32)
+OBMC_SAD_WXH_NEON(16, 64)
+
+OBMC_SAD_WXH_NEON(32, 8)
+OBMC_SAD_WXH_NEON(32, 16)
+OBMC_SAD_WXH_NEON(32, 32)
+OBMC_SAD_WXH_NEON(32, 64)
+
+OBMC_SAD_WXH_NEON(64, 16)
+OBMC_SAD_WXH_NEON(64, 32)
+OBMC_SAD_WXH_NEON(64, 64)
+OBMC_SAD_WXH_NEON(64, 128)
+
+OBMC_SAD_WXH_NEON(128, 64)
+OBMC_SAD_WXH_NEON(128, 128)
diff --git a/third_party/aom/aom_dsp/arm/obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
new file mode 100644
index 0000000000..50cd5f3b6a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
+                                              const int32_t *wsrc,
+                                              const int32_t *mask,
+                                              int32x4_t *ssev,
+                                              int32x4_t *sumv) {
+  // Narrowing the mask to 16 bits means pre only needs a single widening step
+  // and can be combined with a widening multiply. For 4xh and 8xh this was
+  // observed to be faster than double-widening pre: widening multiplies have
+  // better throughput on some micro-architectures, but for the larger block
+  // sizes that benefit is outweighed by the extra instruction needed to
+  // narrow the mask vectors first.
+
+  const int16x8_t mask_a = vreinterpretq_s16_s32(vld1q_s32(mask));
+  const int16x8_t mask_b = vreinterpretq_s16_s32(vld1q_s32(mask + 4));
+  const int16x8_t m16 = vuzpq_s16(mask_a, mask_b).val[0];
+
+  // diff = wsrc - pre * mask, via a widening multiply-subtract per half.
+  int32x4_t d0 =
+      vmlsl_s16(vld1q_s32(wsrc), vget_low_s16(pre_s16), vget_low_s16(m16));
+  int32x4_t d1 =
+      vmlsl_s16(vld1q_s32(wsrc + 4), vget_high_s16(pre_s16), vget_high_s16(m16));
+
+  // vrshrq_n_s32 rounds to nearest with ties rounded up, whereas
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds ties away from zero. The two
+  // differ only exactly at the rounding breakpoints, so -1 (the value shifted
+  // right arithmetically by 31) is added to every negative difference to move
+  // its breakpoint into the correct rounding region.
+  d0 = vsraq_n_s32(d0, d0, 31);
+  d1 = vsraq_n_s32(d1, d1, 31);
+  const int32x4_t r0 = vrshrq_n_s32(d0, 12);
+  const int32x4_t r1 = vrshrq_n_s32(d1, 12);
+
+  // The sum accumulates the rounded differences, the SSE their squares.
+  const int32x4_t sum_acc = vrsraq_n_s32(*sumv, d0, 12);
+  *sumv = vrsraq_n_s32(sum_acc, d1, 12);
+  const int32x4_t sse_acc = vmlaq_s32(*ssev, r0, r0);
+  *ssev = vmlaq_s32(sse_acc, r1, r1);
+}
+
+#if AOM_ARCH_AARCH64
+
+// tbl indices for a double-width zero extension from 8->32 bits in a single
+// instruction rather than two: each group of four selects source byte i and
+// then three out-of-range indices (255 here), which tbl sets to zero.
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
+
+static INLINE void obmc_variance_8x1_s32_neon(
+    int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc,
+    const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) {
+  // pre_lo / pre_hi arrive as 32-bit lanes, so a plain 32-bit
+  // multiply-subtract yields diff = wsrc - pre * mask for each half.
+  const int32x4_t m0 = vld1q_s32(mask);
+  const int32x4_t m1 = vld1q_s32(mask + 4);
+  int32x4_t d0 = vmlsq_s32(vld1q_s32(wsrc), pre_lo, m0);
+  int32x4_t d1 = vmlsq_s32(vld1q_s32(wsrc + 4), pre_hi, m1);
+
+  // vrshrq_n_s32 rounds to nearest with ties rounded up, whereas
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds ties away from zero. The two
+  // differ only exactly at the rounding breakpoints, so -1 (the value shifted
+  // right arithmetically by 31) is added to every negative difference to move
+  // its breakpoint into the correct rounding region.
+  d0 = vsraq_n_s32(d0, d0, 31);
+  d1 = vsraq_n_s32(d1, d1, 31);
+  const int32x4_t r0 = vrshrq_n_s32(d0, 12);
+  const int32x4_t r1 = vrshrq_n_s32(d1, 12);
+
+  // The sum accumulates the rounded differences, the SSE their squares.
+  const int32x4_t sum_acc = vrsraq_n_s32(*sumv, d0, 12);
+  *sumv = vrsraq_n_s32(sum_acc, d1, 12);
+  const int32x4_t sse_acc = vmlaq_s32(*ssev, r0, r0);
+  *ssev = vmlaq_s32(sse_acc, r1, r1);
+}
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask, int width,
+                                            int height, unsigned *sse,
+                                            int *sum) {
+  assert(width % 16 == 0);
+
+  // Index vectors for tbl: each one zero-extends four consecutive bytes of a
+  // 16-byte load to 32 bits in one instruction instead of two.
+  const uint8x16_t idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+  const uint8x16_t idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+  const uint8x16_t idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+  const uint8x16_t idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+  int32x4_t sse_acc = vdupq_n_s32(0);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+
+  int rows_left = height;
+  do {
+    int x = 0;
+    do {
+      // Consume 16 prediction pixels as two groups of eight.
+      const uint8x16_t px = vld1q_u8(pre + x);
+
+      const int32x4_t a_lo = vreinterpretq_s32_u8(vqtbl1q_u8(px, idx0));
+      const int32x4_t a_hi = vreinterpretq_s32_u8(vqtbl1q_u8(px, idx1));
+      obmc_variance_8x1_s32_neon(a_lo, a_hi, wsrc, mask, &sse_acc, &sum_acc);
+
+      const int32x4_t b_lo = vreinterpretq_s32_u8(vqtbl1q_u8(px, idx2));
+      const int32x4_t b_hi = vreinterpretq_s32_u8(vqtbl1q_u8(px, idx3));
+      obmc_variance_8x1_s32_neon(b_lo, b_hi, wsrc + 8, mask + 8, &sse_acc,
+                                 &sum_acc);
+
+      wsrc += 16;
+      mask += 16;
+      x += 16;
+    } while (x != width);
+
+    // Only pre is strided; wsrc and mask were advanced contiguously above.
+    pre += pre_stride;
+  } while (--rows_left != 0);
+
+  *sse = horizontal_add_s32x4(sse_acc);
+  *sum = horizontal_add_s32x4(sum_acc);
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask, int width,
+                                            int height, unsigned *sse,
+                                            int *sum) {
+  // Non-aarch64 targets do not have a 128-bit tbl instruction, so each half of
+  // the load is widened to 16 bits and fed to the widening kernel instead.
+
+  assert(width % 16 == 0);
+
+  int32x4_t sse_acc = vdupq_n_s32(0);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+
+  int rows_left = height;
+  do {
+    int x = 0;
+    do {
+      // Consume 16 prediction pixels as two zero-extended groups of eight.
+      const uint8x16_t px = vld1q_u8(pre + x);
+      const int16x8_t lo16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(px)));
+      const int16x8_t hi16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(px)));
+
+      obmc_variance_8x1_s16_neon(lo16, wsrc, mask, &sse_acc, &sum_acc);
+      obmc_variance_8x1_s16_neon(hi16, wsrc + 8, mask + 8, &sse_acc, &sum_acc);
+
+      wsrc += 16;
+      mask += 16;
+      x += 16;
+    } while (x != width);
+
+    // Only pre is strided; wsrc and mask were advanced contiguously above.
+    pre += pre_stride;
+  } while (--rows_left != 0);
+
+  *sse = horizontal_add_s32x4(sse_acc);
+  *sum = horizontal_add_s32x4(sum_acc);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum);  // width fixed at 128, height = h
+}
+
+static INLINE void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);  // width fixed at 64, height = h
+}
+
+static INLINE void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);  // width fixed at 32, height = h
+}
+
+static INLINE void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);  // width fixed at 16, height = h
+}
+
+static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask, int h,
+                                          unsigned *sse, int *sum) {
+  int32x4_t sse_acc = vdupq_n_s32(0);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+
+  // One row of eight pixels per iteration; h counts the rows still to do.
+  do {
+    const uint16x8_t row_u16 = vmovl_u8(vld1_u8(pre));
+    obmc_variance_8x1_s16_neon(vreinterpretq_s16_u16(row_u16), wsrc, mask,
+                               &sse_acc, &sum_acc);
+
+    pre += pre_stride;
+    wsrc += 8;
+    mask += 8;
+  } while (--h != 0);
+
+  *sse = horizontal_add_s32x4(sse_acc);
+  *sum = horizontal_add_s32x4(sum_acc);
+}
+
+static INLINE void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask, int h,
+                                          unsigned *sse, int *sum) {
+  assert(h % 2 == 0);
+
+  int32x4_t sse_acc = vdupq_n_s32(0);
+  int32x4_t sum_acc = vdupq_n_s32(0);
+
+  // Each iteration steps pre by 2 * pre_stride and takes two rows off h.
+  do {
+    const uint16x8_t rows_u16 = vmovl_u8(load_unaligned_u8(pre, pre_stride));
+    obmc_variance_8x1_s16_neon(vreinterpretq_s16_u16(rows_u16), wsrc, mask,
+                               &sse_acc, &sum_acc);
+
+    pre += 2 * pre_stride;
+    wsrc += 8;
+    mask += 8;
+    h -= 2;
+  } while (h != 0);
+
+  *sse = horizontal_add_s32x4(sse_acc);
+  *sum = horizontal_add_s32x4(sum_acc);
+}
+
+#define OBMC_VARIANCE_WXH_NEON(W, H) \
+ unsigned aom_obmc_variance##W##x##H##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned *sse) { \
+ int sum; \
+ obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, &sum); \
+ return *sse - (unsigned)(((int64_t)sum * sum) / (W * H)); \
+ }
+// Each use below defines aom_obmc_variance<W>x<H>_neon: it writes the SSE to *sse and returns sse - sum^2 / (W * H).
+OBMC_VARIANCE_WXH_NEON(4, 4)
+OBMC_VARIANCE_WXH_NEON(4, 8)
+OBMC_VARIANCE_WXH_NEON(8, 4)
+OBMC_VARIANCE_WXH_NEON(8, 8)
+OBMC_VARIANCE_WXH_NEON(8, 16)
+OBMC_VARIANCE_WXH_NEON(16, 8)
+OBMC_VARIANCE_WXH_NEON(16, 16)
+OBMC_VARIANCE_WXH_NEON(16, 32)
+OBMC_VARIANCE_WXH_NEON(32, 16)
+OBMC_VARIANCE_WXH_NEON(32, 32)
+OBMC_VARIANCE_WXH_NEON(32, 64)
+OBMC_VARIANCE_WXH_NEON(64, 32)
+OBMC_VARIANCE_WXH_NEON(64, 64)
+OBMC_VARIANCE_WXH_NEON(64, 128)
+OBMC_VARIANCE_WXH_NEON(128, 64)
+OBMC_VARIANCE_WXH_NEON(128, 128)
+OBMC_VARIANCE_WXH_NEON(4, 16)
+OBMC_VARIANCE_WXH_NEON(16, 4)
+OBMC_VARIANCE_WXH_NEON(8, 32)
+OBMC_VARIANCE_WXH_NEON(32, 8)
+OBMC_VARIANCE_WXH_NEON(16, 64)
+OBMC_VARIANCE_WXH_NEON(64, 16)
diff --git a/third_party/aom/aom_dsp/arm/reinterpret_neon.h b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
new file mode 100644
index 0000000000..f9702513ad
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \
+ static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \
+ aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \
+ const u##int##from_sz##x##from_count##x##n##_t src) { \
+ u##int##to_sz##x##to_count##x##n##_t ret; \
+ for (int i = 0; i < (n); ++i) { \
+ ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \
+ } \
+ return ret; \
+ }
+// Each use below defines a helper that applies vreinterpret[q] to every one of the n vectors in src.val[].
+REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // aom_reinterpret_u8_u16_x2: uint8x8x2_t from uint16x4x2_t
+REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // aom_reinterpretq_u8_u16_x2: uint8x16x2_t from uint16x8x2_t
+
+#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..46a1666331
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         int h) {
+  // Eight 16-bit accumulators are used to prevent overflow for large values
+  // of 'h', and to allow optimal UADALP instruction throughput on CPUs that
+  // have either 2 or 4 Neon pipes.
+  uint16x8_t acc[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Each row is handled as eight 16-byte column groups, with one
+    // accumulator per group.
+    // Columns 0-15.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, r0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, r1));
+
+    // Columns 32-47.
+    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    acc[2] = vpadalq_u8(acc[2], vabdq_u8(s2, r2));
+
+    // Columns 48-63.
+    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    acc[3] = vpadalq_u8(acc[3], vabdq_u8(s3, r3));
+
+    // Columns 64-79.
+    const uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    const uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    acc[4] = vpadalq_u8(acc[4], vabdq_u8(s4, r4));
+
+    // Columns 80-95.
+    const uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    const uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    acc[5] = vpadalq_u8(acc[5], vabdq_u8(s5, r5));
+
+    // Columns 96-111.
+    const uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    const uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    acc[6] = vpadalq_u8(acc[6], vabdq_u8(s6, r6));
+
+    // Columns 112-127.
+    const uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    const uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    acc[7] = vpadalq_u8(acc[7], vabdq_u8(s7, r7));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  // Fold the eight 16-bit accumulators into one 32-bit vector, widening
+  // pairwise as each one is added.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  total = vpadalq_u16(total, acc[1]);
+  total = vpadalq_u16(total, acc[2]);
+  total = vpadalq_u16(total, acc[3]);
+  total = vpadalq_u16(total, acc[4]);
+  total = vpadalq_u16(total, acc[5]);
+  total = vpadalq_u16(total, acc[6]);
+  total = vpadalq_u16(total, acc[7]);
+
+  return horizontal_add_u32x4(total);
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  // One 16-bit accumulator per 16-byte column group of the row.
+  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Columns 0-15.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, r0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, r1));
+
+    // Columns 32-47.
+    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    acc[2] = vpadalq_u8(acc[2], vabdq_u8(s2, r2));
+
+    // Columns 48-63.
+    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    acc[3] = vpadalq_u8(acc[3], vabdq_u8(s3, r3));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  // Fold the four 16-bit accumulators into one 32-bit vector, widening
+  // pairwise as each one is added.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  for (int k = 1; k < 4; ++k) {
+    total = vpadalq_u16(total, acc[k]);
+  }
+
+  return horizontal_add_u32x4(total);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t acc[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Columns 0-15.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, r0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, r1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(vaddq_u16(acc[0], acc[1]));
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    // Per-byte absolute difference of one 16-pixel row, then a pairwise
+    // widening add of adjacent bytes into the 16-bit accumulator.
+    const uint8x16_t abs_diff =
+        vabdq_u8(vld1q_u8(src_ptr), vld1q_u8(ref_ptr));
+    acc = vpadalq_u8(acc, abs_diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    // vabal_u8 widens |src - ref| to 16 bits and adds it to the accumulator.
+    const uint8x8_t src_row = vld1_u8(src_ptr);
+    const uint8x8_t ref_row = vld1_u8(ref_ptr);
+    acc = vabal_u8(acc, src_row, ref_row);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int pairs_left = h / 2;
+  do {
+    // Both pointers step by two strides per iteration, hence h / 2 passes.
+    const uint8x8_t src_rows = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t ref_rows = load_unaligned_u8(ref_ptr, ref_stride);
+    acc = vabal_u8(acc, src_rows, ref_rows);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+  } while (--pairs_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+#define SAD_WXH_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+// Each use below defines aom_sad<w>x<h>_neon, which forwards to sad<w>xh_neon with height h.
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON(4, 16)
+SAD_WXH_NEON(8, 32)
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 64)
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
+// Each use below defines aom_sad_skip_<w>x<h>_neon: SAD over h / 2 rows taken at doubled strides, then doubled.
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
+
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON(4, 16)
+SAD_SKIP_WXH_NEON(8, 32)
+SAD_SKIP_WXH_NEON(16, 4)
+SAD_SKIP_WXH_NEON(16, 64)
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON
+
+static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+                                             int src_stride,
+                                             const uint8_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             const uint8_t *second_pred) {
+  // Eight 16-bit accumulators are used to prevent overflow for large values
+  // of 'h', and to allow optimal UADALP instruction throughput on CPUs that
+  // have either 2 or 4 Neon pipes. The reference is combined with
+  // second_pred by a rounding halving add before the absolute difference
+  // against the source is taken.
+  uint16x8_t acc[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Each row is handled as eight 16-byte column groups, with one
+    // accumulator per group.
+    // Columns 0-15.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    const uint8x16_t p0 = vld1q_u8(second_pred);
+    const uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, avg0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    const uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, avg1));
+
+    // Columns 32-47.
+    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    const uint8x16_t avg2 = vrhaddq_u8(r2, p2);
+    acc[2] = vpadalq_u8(acc[2], vabdq_u8(s2, avg2));
+
+    // Columns 48-63.
+    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    const uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    const uint8x16_t avg3 = vrhaddq_u8(r3, p3);
+    acc[3] = vpadalq_u8(acc[3], vabdq_u8(s3, avg3));
+
+    // Columns 64-79.
+    const uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+    const uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+    const uint8x16_t p4 = vld1q_u8(second_pred + 64);
+    const uint8x16_t avg4 = vrhaddq_u8(r4, p4);
+    acc[4] = vpadalq_u8(acc[4], vabdq_u8(s4, avg4));
+
+    // Columns 80-95.
+    const uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+    const uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+    const uint8x16_t p5 = vld1q_u8(second_pred + 80);
+    const uint8x16_t avg5 = vrhaddq_u8(r5, p5);
+    acc[5] = vpadalq_u8(acc[5], vabdq_u8(s5, avg5));
+
+    // Columns 96-111.
+    const uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+    const uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+    const uint8x16_t p6 = vld1q_u8(second_pred + 96);
+    const uint8x16_t avg6 = vrhaddq_u8(r6, p6);
+    acc[6] = vpadalq_u8(acc[6], vabdq_u8(s6, avg6));
+
+    // Columns 112-127.
+    const uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+    const uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+    const uint8x16_t p7 = vld1q_u8(second_pred + 112);
+    const uint8x16_t avg7 = vrhaddq_u8(r7, p7);
+    acc[7] = vpadalq_u8(acc[7], vabdq_u8(s7, avg7));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--rows_left != 0);
+
+  // Fold the eight 16-bit accumulators into one 32-bit vector, widening
+  // pairwise as each one is added.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  total = vpadalq_u16(total, acc[1]);
+  total = vpadalq_u16(total, acc[2]);
+  total = vpadalq_u16(total, acc[3]);
+  total = vpadalq_u16(total, acc[4]);
+  total = vpadalq_u16(total, acc[5]);
+  total = vpadalq_u16(total, acc[6]);
+  total = vpadalq_u16(total, acc[7]);
+
+  return horizontal_add_u32x4(total);
+}
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  // One 16-bit accumulator per 16-byte column group. The reference is combined
+  // with second_pred by a rounding halving add before the difference is taken.
+  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Columns 0-15.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    const uint8x16_t p0 = vld1q_u8(second_pred);
+    const uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, avg0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    const uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, avg1));
+
+    // Columns 32-47.
+    const uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t p2 = vld1q_u8(second_pred + 32);
+    const uint8x16_t avg2 = vrhaddq_u8(r2, p2);
+    acc[2] = vpadalq_u8(acc[2], vabdq_u8(s2, avg2));
+
+    // Columns 48-63.
+    const uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+    const uint8x16_t p3 = vld1q_u8(second_pred + 48);
+    const uint8x16_t avg3 = vrhaddq_u8(r3, p3);
+    acc[3] = vpadalq_u8(acc[3], vabdq_u8(s3, avg3));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--rows_left != 0);
+
+  // Fold the four 16-bit accumulators into one 32-bit vector, widening pairwise.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  for (int k = 1; k < 4; ++k) {
+    total = vpadalq_u16(total, acc[k]);
+  }
+
+  return horizontal_add_u32x4(total);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t acc[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    // Columns 0-15: rounding average of ref and second_pred, then |src - avg|.
+    const uint8x16_t s0 = vld1q_u8(src_ptr);
+    const uint8x16_t r0 = vld1q_u8(ref_ptr);
+    const uint8x16_t p0 = vld1q_u8(second_pred);
+    const uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+    acc[0] = vpadalq_u8(acc[0], vabdq_u8(s0, avg0));
+
+    // Columns 16-31.
+    const uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    const uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+    acc[1] = vpadalq_u8(acc[1], vabdq_u8(s1, avg1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(vaddq_u16(acc[0], acc[1]));
+}
+
+// SAD between a 16-pixel-wide source block of height 'h' and the rounding
+// average of the reference block and 'second_pred' (16 bytes per row).
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t comp =
+        vrhaddq_u8(vld1q_u8(ref_ptr), vld1q_u8(second_pred));
+    // |src - avg| for 16 bytes, pairwise-added into eight 16-bit lanes.
+    acc = vpadalq_u8(acc, vabdq_u8(vld1q_u8(src_ptr), comp));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// SAD between an 8-pixel-wide source block of height 'h' and the rounding
+// average of the reference block and 'second_pred' (8 bytes per row).
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    const uint8x8_t comp = vrhadd_u8(vld1_u8(ref_ptr), vld1_u8(second_pred));
+    // Widening absolute-difference accumulate: one 16-bit lane per pixel.
+    acc = vabal_u8(acc, vld1_u8(src_ptr), comp);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// SAD between a 4-pixel-wide source block of height 'h' and the rounding
+// average of the reference block and 'second_pred'. Two rows are handled per
+// iteration (h / 2 iterations), consuming 8 bytes of 'second_pred' each time.
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int row_pairs_left = h / 2;
+  do {
+    // NOTE(review): load_unaligned_u8() presumably gathers two 4-byte rows
+    // into one 8-byte vector - confirm in mem_neon.h.
+    const uint8x8_t src = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t ref = load_unaligned_u8(ref_ptr, ref_stride);
+    const uint8x8_t pred = vld1_u8(second_pred);
+
+    acc = vabal_u8(acc, src, vrhadd_u8(ref, pred));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--row_pairs_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// Emits the public aom_sad<W>x<H>_avg_neon() entry point, which forwards to
+// the width-specific sad<W>xh_avg_neon() helper with the height passed as a
+// constant.
+#define SAD_WXH_AVG_NEON(width, height) \
+  unsigned int aom_sad##width##x##height##_avg_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride, const uint8_t *second_pred) { \
+    return sad##width##xh_avg_neon(src, src_stride, ref, ref_stride, \
+                                   (height), second_pred); \
+  }
+
+// 4-wide blocks.
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+// 8-wide blocks.
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+// 16-wide blocks.
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+// 32-wide blocks.
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+// 64-wide blocks.
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+SAD_WXH_AVG_NEON(64, 128)
+
+// 128-wide blocks.
+SAD_WXH_AVG_NEON(128, 64)
+SAD_WXH_AVG_NEON(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON(4, 16)
+SAD_WXH_AVG_NEON(8, 32)
+SAD_WXH_AVG_NEON(16, 4)
+SAD_WXH_AVG_NEON(16, 64)
+SAD_WXH_AVG_NEON(32, 8)
+SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON
+
+// SAD between a 128-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the forward/backward offsets in 'jcp_param'. 'second_pred' is read
+// contiguously, 128 bytes per row.
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  // Eight accumulators, one per 16-byte column group: this avoids overflow
+  // for large values of 'h' and gives good UADALP throughput on CPUs with
+  // either 2 or 4 Neon pipes.
+  uint16x8_t acc[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    for (int k = 0; k < 8; k++) {
+      const int x = 16 * k;
+      const uint8x16_t src = vld1q_u8(src_ptr + x);
+      const uint8x16_t ref = vld1q_u8(ref_ptr + x);
+      const uint8x16_t pred = vld1q_u8(second_pred + x);
+      const uint8x16_t comp =
+          dist_wtd_avg_u8x16(pred, ref, bck_offset, fwd_offset);
+      acc[k] = vpadalq_u8(acc[k], vabdq_u8(src, comp));
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--rows_left != 0);
+
+  // Combine the accumulators while widening to 32 bits.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  for (int k = 1; k < 8; k++) {
+    total = vpadalq_u16(total, acc[k]);
+  }
+
+  return horizontal_add_u32x4(total);
+}
+
+// SAD between a 64-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. 'second_pred' advances 64 bytes per row.
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  // One 16-bit accumulator per 16-byte column group.
+  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  int rows_left = h;
+  do {
+    for (int k = 0; k < 4; k++) {
+      const int x = 16 * k;
+      const uint8x16_t src = vld1q_u8(src_ptr + x);
+      const uint8x16_t ref = vld1q_u8(ref_ptr + x);
+      const uint8x16_t pred = vld1q_u8(second_pred + x);
+      const uint8x16_t comp =
+          dist_wtd_avg_u8x16(pred, ref, bck_offset, fwd_offset);
+      acc[k] = vpadalq_u8(acc[k], vabdq_u8(src, comp));
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--rows_left != 0);
+
+  // Combine the accumulators while widening to 32 bits.
+  uint32x4_t total = vpaddlq_u16(acc[0]);
+  for (int k = 1; k < 4; k++) {
+    total = vpadalq_u16(total, acc[k]);
+  }
+
+  return horizontal_add_u32x4(total);
+}
+
+// SAD between a 32-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. 'second_pred' advances 32 bytes per row.
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  uint16x8_t acc_lo = vdupq_n_u16(0);  // bytes 0..15 of each row
+  uint16x8_t acc_hi = vdupq_n_u16(0);  // bytes 16..31 of each row
+
+  int rows_left = h;
+  do {
+    const uint8x16_t comp_lo = dist_wtd_avg_u8x16(
+        vld1q_u8(second_pred), vld1q_u8(ref_ptr), bck_offset, fwd_offset);
+    acc_lo = vpadalq_u8(acc_lo, vabdq_u8(vld1q_u8(src_ptr), comp_lo));
+
+    const uint8x16_t comp_hi =
+        dist_wtd_avg_u8x16(vld1q_u8(second_pred + 16), vld1q_u8(ref_ptr + 16),
+                           bck_offset, fwd_offset);
+    acc_hi = vpadalq_u8(acc_hi, vabdq_u8(vld1q_u8(src_ptr + 16), comp_hi));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--rows_left != 0);
+
+  // The two halves are merged in 16 bits before the final reduction.
+  return horizontal_add_u16x8(vaddq_u16(acc_lo, acc_hi));
+}
+
+// SAD between a 16-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. 'second_pred' advances 16 bytes per row.
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t comp = dist_wtd_avg_u8x16(
+        vld1q_u8(second_pred), vld1q_u8(ref_ptr), bck_offset, fwd_offset);
+    // |src - blend| for 16 bytes, pairwise-added into eight 16-bit lanes.
+    acc = vpadalq_u8(acc, vabdq_u8(vld1q_u8(src_ptr), comp));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// SAD between an 8-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x8() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. 'second_pred' advances 8 bytes per row.
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int rows_left = h;
+  do {
+    const uint8x8_t comp = dist_wtd_avg_u8x8(
+        vld1_u8(second_pred), vld1_u8(ref_ptr), bck_offset, fwd_offset);
+    // Widening absolute-difference accumulate: one 16-bit lane per pixel.
+    acc = vabal_u8(acc, vld1_u8(src_ptr), comp);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// SAD between a 4-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x8() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. Two rows are handled per iteration
+// (h / 2 iterations), consuming 8 bytes of 'second_pred' each time.
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+  uint16x8_t acc = vdupq_n_u16(0);
+
+  int row_pairs_left = h / 2;
+  do {
+    // NOTE(review): load_unaligned_u8() presumably gathers two 4-byte rows
+    // into one 8-byte vector - confirm in mem_neon.h.
+    const uint8x8_t src = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t ref = load_unaligned_u8(ref_ptr, ref_stride);
+    const uint8x8_t pred = vld1_u8(second_pred);
+
+    const uint8x8_t comp =
+        dist_wtd_avg_u8x8(pred, ref, bck_offset, fwd_offset);
+    acc = vabal_u8(acc, src, comp);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--row_pairs_left != 0);
+
+  return horizontal_add_u16x8(acc);
+}
+
+// Emits the public aom_dist_wtd_sad<W>x<H>_avg_neon() entry point, which
+// forwards to the width-specific dist_wtd_sad<W>xh_avg_neon() helper with the
+// height passed as a constant.
+#define DIST_WTD_SAD_WXH_AVG_NEON(width, height) \
+  unsigned int aom_dist_wtd_sad##width##x##height##_avg_neon( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride, const uint8_t *second_pred, \
+      const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    return dist_wtd_sad##width##xh_avg_neon(src, src_stride, ref, \
+                                            ref_stride, (height), \
+                                            second_pred, jcp_param); \
+  }
+
+// 4-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+// 8-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+// 16-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+// 32-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+// 64-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+// 128-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..5504c6838e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// SAD between a 'w' x 'h' source block and a reference block, processing each
+// row in 32-byte steps. Absolute differences are summed with UDOT against an
+// all-ones vector, so every group of four bytes lands in one 32-bit lane.
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  // Two accumulators are enough for optimal throughput of the ABD, UDOT
+  // sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t acc0 = vdupq_n_u32(0);
+  uint32x4_t acc1 = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    int x = 0;
+    do {
+      const uint8x16_t abs_diff0 =
+          vabdq_u8(vld1q_u8(src_ptr + x), vld1q_u8(ref_ptr + x));
+      acc0 = vdotq_u32(acc0, abs_diff0, ones);
+
+      const uint8x16_t abs_diff1 =
+          vabdq_u8(vld1q_u8(src_ptr + x + 16), vld1q_u8(ref_ptr + x + 16));
+      acc1 = vdotq_u32(acc1, abs_diff1, ones);
+
+      x += 32;
+    } while (x < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc0, acc1));
+}
+
+// 128-wide specialisation of sadwxh_neon_dotprod().
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+                                                 int src_stride,
+                                                 const uint8_t *ref_ptr,
+                                                 int ref_stride, int h) {
+  const int block_width = 128;
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                             block_width, h);
+}
+
+// 64-wide specialisation of sadwxh_neon_dotprod().
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const int block_width = 64;
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                             block_width, h);
+}
+
+// 32-wide specialisation of sadwxh_neon_dotprod().
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const int block_width = 32;
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                             block_width, h);
+}
+
+// SAD between a 16-pixel-wide source block of height 'h' and a reference
+// block. Two rows are handled per iteration (h / 2 iterations), each feeding
+// its own UDOT accumulator.
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t acc_even = vdupq_n_u32(0);  // first row of each pair
+  uint32x4_t acc_odd = vdupq_n_u32(0);   // second row of each pair
+
+  int row_pairs_left = h / 2;
+  do {
+    const uint8x16_t abs_diff0 =
+        vabdq_u8(vld1q_u8(src_ptr), vld1q_u8(ref_ptr));
+    acc_even = vdotq_u32(acc_even, abs_diff0, ones);
+
+    const uint8x16_t abs_diff1 = vabdq_u8(vld1q_u8(src_ptr + src_stride),
+                                          vld1q_u8(ref_ptr + ref_stride));
+    acc_odd = vdotq_u32(acc_odd, abs_diff1, ones);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+  } while (--row_pairs_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
+}
+
+// Emits the public aom_sad<W>x<H>_neon_dotprod() entry point, which forwards
+// to the width-specific sad<W>xh_neon_dotprod() helper with the height passed
+// as a constant.
+#define SAD_WXH_NEON_DOTPROD(width, height) \
+  unsigned int aom_sad##width##x##height##_neon_dotprod( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride) { \
+    return sad##width##xh_neon_dotprod(src, src_stride, ref, ref_stride, \
+                                       (height)); \
+  }
+
+// 16-wide blocks.
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+// 32-wide blocks.
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+// 64-wide blocks.
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+// 128-wide blocks.
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+// Emits the public aom_sad_skip_<W>x<H>_neon_dotprod() entry point. It calls
+// the width-specific helper with both strides doubled and half the height
+// (i.e. every other row), then doubles the returned sum.
+#define SAD_SKIP_WXH_NEON_DOTPROD(width, height) \
+  unsigned int aom_sad_skip_##width##x##height##_neon_dotprod( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride) { \
+    return 2 * sad##width##xh_neon_dotprod(src, 2 * src_stride, ref, \
+                                           2 * ref_stride, (height) / 2); \
+  }
+
+// 16-wide blocks.
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+// 32-wide blocks.
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+// 64-wide blocks.
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+// 128-wide blocks.
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+// SAD between a 'w' x 'h' source block and the rounding average of the
+// reference block and 'second_pred', processing each row in 32-byte steps.
+// 'second_pred' is consumed contiguously (32 bytes per step, no row stride).
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  // Two accumulators are enough for optimal throughput of the ABD, UDOT
+  // sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t acc0 = vdupq_n_u32(0);
+  uint32x4_t acc1 = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    int x = 0;
+    do {
+      const uint8x16_t comp0 =
+          vrhaddq_u8(vld1q_u8(ref_ptr + x), vld1q_u8(second_pred));
+      acc0 = vdotq_u32(acc0, vabdq_u8(vld1q_u8(src_ptr + x), comp0), ones);
+
+      const uint8x16_t comp1 =
+          vrhaddq_u8(vld1q_u8(ref_ptr + x + 16), vld1q_u8(second_pred + 16));
+      acc1 =
+          vdotq_u32(acc1, vabdq_u8(vld1q_u8(src_ptr + x + 16), comp1), ones);
+
+      x += 32;
+      second_pred += 32;
+    } while (x < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc0, acc1));
+}
+
+// 128-wide specialisation of sadwxh_avg_neon_dotprod().
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int block_width = 128;
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                                 block_width, h, second_pred);
+}
+
+// 64-wide specialisation of sadwxh_avg_neon_dotprod().
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int block_width = 64;
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                                 block_width, h, second_pred);
+}
+
+// 32-wide specialisation of sadwxh_avg_neon_dotprod().
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const int block_width = 32;
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride,
+                                 block_width, h, second_pred);
+}
+
+// SAD between a 16-pixel-wide source block of height 'h' and the rounding
+// average of the reference block and 'second_pred'. Two rows are handled per
+// iteration (h / 2 iterations), consuming 32 bytes of 'second_pred' each time.
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t acc_even = vdupq_n_u32(0);  // first row of each pair
+  uint32x4_t acc_odd = vdupq_n_u32(0);   // second row of each pair
+
+  int row_pairs_left = h / 2;
+  do {
+    const uint8x16_t comp0 =
+        vrhaddq_u8(vld1q_u8(ref_ptr), vld1q_u8(second_pred));
+    acc_even = vdotq_u32(acc_even, vabdq_u8(vld1q_u8(src_ptr), comp0), ones);
+
+    const uint8x16_t comp1 = vrhaddq_u8(vld1q_u8(ref_ptr + ref_stride),
+                                        vld1q_u8(second_pred + 16));
+    acc_odd = vdotq_u32(
+        acc_odd, vabdq_u8(vld1q_u8(src_ptr + src_stride), comp1), ones);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 32;
+  } while (--row_pairs_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
+}
+
+// Emits the public aom_sad<W>x<H>_avg_neon_dotprod() entry point, which
+// forwards to the width-specific sad<W>xh_avg_neon_dotprod() helper with the
+// height passed as a constant.
+#define SAD_WXH_AVG_NEON_DOTPROD(width, height) \
+  unsigned int aom_sad##width##x##height##_avg_neon_dotprod( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride, const uint8_t *second_pred) { \
+    return sad##width##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, \
+                                           (height), second_pred); \
+  }
+
+// 16-wide blocks.
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+// 32-wide blocks.
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+// 64-wide blocks.
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+// 128-wide blocks.
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+// SAD between a 128-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. Absolute differences are summed with UDOT
+// against an all-ones vector. 'second_pred' advances 128 bytes per row.
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t ones = vdupq_n_u8(1);
+  // Eight accumulators (one per 16-byte column group) minimize the
+  // accumulation and loop-carried dependencies for better throughput.
+  uint32x4_t acc[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int rows_left = h;
+  do {
+    for (int k = 0; k < 8; k++) {
+      const int x = 16 * k;
+      const uint8x16_t src = vld1q_u8(src_ptr + x);
+      const uint8x16_t ref = vld1q_u8(ref_ptr + x);
+      const uint8x16_t pred = vld1q_u8(second_pred + x);
+      const uint8x16_t comp =
+          dist_wtd_avg_u8x16(pred, ref, bck_offset, fwd_offset);
+      acc[k] = vdotq_u32(acc[k], vabdq_u8(src, comp), ones);
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 128;
+  } while (--rows_left != 0);
+
+  // Unsigned 32-bit lane adds are modular, so the summation order does not
+  // affect the result.
+  uint32x4_t total = acc[0];
+  for (int k = 1; k < 8; k++) {
+    total = vaddq_u32(total, acc[k]);
+  }
+  return horizontal_add_u32x4(total);
+}
+
+// SAD between a 64-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. Absolute differences are summed with UDOT
+// against an all-ones vector. 'second_pred' advances 64 bytes per row.
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t ones = vdupq_n_u8(1);
+  // One 32-bit accumulator per 16-byte column group.
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int rows_left = h;
+  do {
+    for (int k = 0; k < 4; k++) {
+      const int x = 16 * k;
+      const uint8x16_t src = vld1q_u8(src_ptr + x);
+      const uint8x16_t ref = vld1q_u8(ref_ptr + x);
+      const uint8x16_t pred = vld1q_u8(second_pred + x);
+      const uint8x16_t comp =
+          dist_wtd_avg_u8x16(pred, ref, bck_offset, fwd_offset);
+      acc[k] = vdotq_u32(acc[k], vabdq_u8(src, comp), ones);
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--rows_left != 0);
+
+  // Unsigned 32-bit lane adds are modular, so the summation order does not
+  // affect the result.
+  uint32x4_t total = acc[0];
+  for (int k = 1; k < 4; k++) {
+    total = vaddq_u32(total, acc[k]);
+  }
+  return horizontal_add_u32x4(total);
+}
+
+// SAD between a 32-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. Absolute differences are summed with UDOT
+// against an all-ones vector. 'second_pred' advances 32 bytes per row.
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t acc_lo = vdupq_n_u32(0);  // bytes 0..15 of each row
+  uint32x4_t acc_hi = vdupq_n_u32(0);  // bytes 16..31 of each row
+
+  int rows_left = h;
+  do {
+    const uint8x16_t comp_lo = dist_wtd_avg_u8x16(
+        vld1q_u8(second_pred), vld1q_u8(ref_ptr), bck_offset, fwd_offset);
+    acc_lo = vdotq_u32(acc_lo, vabdq_u8(vld1q_u8(src_ptr), comp_lo), ones);
+
+    const uint8x16_t comp_hi =
+        dist_wtd_avg_u8x16(vld1q_u8(second_pred + 16), vld1q_u8(ref_ptr + 16),
+                           bck_offset, fwd_offset);
+    acc_hi =
+        vdotq_u32(acc_hi, vabdq_u8(vld1q_u8(src_ptr + 16), comp_hi), ones);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
+}
+
+// SAD between a 16-pixel-wide source block of height 'h' and the blend that
+// dist_wtd_avg_u8x16() produces from 'second_pred' and the reference block,
+// using the offsets in 'jcp_param'. Two rows are handled per iteration
+// (h / 2 iterations), consuming 32 bytes of 'second_pred' each time.
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t acc_even = vdupq_n_u32(0);  // first row of each pair
+  uint32x4_t acc_odd = vdupq_n_u32(0);   // second row of each pair
+
+  int row_pairs_left = h / 2;
+  do {
+    const uint8x16_t comp0 = dist_wtd_avg_u8x16(
+        vld1q_u8(second_pred), vld1q_u8(ref_ptr), bck_offset, fwd_offset);
+    acc_even = vdotq_u32(acc_even, vabdq_u8(vld1q_u8(src_ptr), comp0), ones);
+
+    const uint8x16_t comp1 = dist_wtd_avg_u8x16(
+        vld1q_u8(second_pred + 16), vld1q_u8(ref_ptr + ref_stride),
+        bck_offset, fwd_offset);
+    acc_odd = vdotq_u32(
+        acc_odd, vabdq_u8(vld1q_u8(src_ptr + src_stride), comp1), ones);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 32;
+  } while (--row_pairs_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
+}
+
+// Emits the public aom_dist_wtd_sad<W>x<H>_avg_neon_dotprod() entry point,
+// which forwards to the width-specific dist_wtd_sad<W>xh_avg_neon_dotprod()
+// helper with the height passed as a constant.
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(width, height) \
+  unsigned int aom_dist_wtd_sad##width##x##height##_avg_neon_dotprod( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, \
+      int ref_stride, const uint8_t *second_pred, \
+      const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    return dist_wtd_sad##width##xh_avg_neon_dotprod( \
+        src, src_stride, ref, ref_stride, (height), second_pred, jcp_param); \
+  }
+
+// 16-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+// 32-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+// 64-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+// 128-wide blocks.
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon.c b/third_party/aom/aom_dsp/arm/sadxd_neon.c
new file mode 100644
index 0000000000..e89e1c5a73
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Adds the absolute differences of 16 byte pairs to *sad_sum. vabdq_u8 forms
+// the per-byte |src - ref| and vpadalq_u8 adds adjacent pairs of those bytes
+// into the eight 16-bit lanes of the accumulator.
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint16x8_t *const sad_sum) {
+  *sad_sum = vpadalq_u8(*sad_sum, vabdq_u8(src, ref));
+}
+
+// SAD of one w x h source block against three reference blocks, handling 32
+// columns per inner step. Differences are first gathered in 16-bit
+// accumulators; after every h_overflow rows (or after the last row) those are
+// widened into 32-bit accumulators and restarted from zero. h_overflow is
+// presumably chosen by the caller so that the 16-bit lanes cannot wrap.
+// The three totals are written to res[0..2].
+static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
+                                        const uint8_t *const ref[3],
+                                        int ref_stride, uint32_t res[3], int w,
+                                        int h, int h_overflow) {
+  uint32x4_t acc32[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+
+  // Row index at which the current 16-bit batch ends.
+  int batch_end = h_overflow;
+  if (h <= h_overflow) batch_end = h;
+
+  int row = 0;
+  do {
+    uint16x8_t acc16_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0),
+                               vdupq_n_u16(0) };
+    uint16x8_t acc16_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0),
+                               vdupq_n_u16(0) };
+
+    do {
+      int col = 0;
+      do {
+        const uint8x16_t src_lo = vld1q_u8(src + col);
+        const uint8x16_t src_hi = vld1q_u8(src + col + 16);
+
+        sad16_neon(src_lo, vld1q_u8(ref0 + col), &acc16_lo[0]);
+        sad16_neon(src_lo, vld1q_u8(ref1 + col), &acc16_lo[1]);
+        sad16_neon(src_lo, vld1q_u8(ref2 + col), &acc16_lo[2]);
+
+        sad16_neon(src_hi, vld1q_u8(ref0 + col + 16), &acc16_hi[0]);
+        sad16_neon(src_hi, vld1q_u8(ref1 + col + 16), &acc16_hi[1]);
+        sad16_neon(src_hi, vld1q_u8(ref2 + col + 16), &acc16_hi[2]);
+
+        col += 32;
+      } while (col < w);
+
+      src += src_stride;
+      ref0 += ref_stride;
+      ref1 += ref_stride;
+      ref2 += ref_stride;
+      ++row;
+    } while (row < batch_end);
+
+    // Widen the finished batch into the 32-bit totals.
+    for (int k = 0; k < 3; ++k) {
+      acc32[k] = vpadalq_u16(acc32[k], acc16_lo[k]);
+      acc32[k] = vpadalq_u16(acc32[k], acc16_hi[k]);
+    }
+
+    batch_end += h_overflow;
+  } while (row < h);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(acc32[k]);
+  }
+}
+
+// 128-wide, three-reference SAD: delegates to the generic wide kernel with a
+// 16-bit accumulator flush every 32 rows.
+static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *const ref[3], int ref_stride,
+                                    uint32_t res[3], int h) {
+  const int block_width = 128;
+  const int rows_per_flush = 32;
+  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
+                       rows_per_flush);
+}
+
+// 64-wide, three-reference SAD: delegates to the generic wide kernel with a
+// 16-bit accumulator flush every 64 rows.
+static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[3], int ref_stride,
+                                   uint32_t res[3], int h) {
+  const int block_width = 64;
+  const int rows_per_flush = 64;
+  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
+                       rows_per_flush);
+}
+
+// SAD of a 32-wide, h-row source block against three references. Columns
+// 0-15 and 16-31 of each row feed separate 16-bit accumulators; each pair is
+// handed to horizontal_long_add_u16x8() to produce the value stored in
+// res[0..2].
+static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[3], int ref_stride,
+                                   uint32_t res[3], int h) {
+  uint16x8_t acc_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+  uint16x8_t acc_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_lo = vld1q_u8(src);
+    const uint8x16_t src_hi = vld1q_u8(src + 16);
+
+    sad16_neon(src_lo, vld1q_u8(ref0), &acc_lo[0]);
+    sad16_neon(src_lo, vld1q_u8(ref1), &acc_lo[1]);
+    sad16_neon(src_lo, vld1q_u8(ref2), &acc_lo[2]);
+
+    sad16_neon(src_hi, vld1q_u8(ref0 + 16), &acc_hi[0]);
+    sad16_neon(src_hi, vld1q_u8(ref1 + 16), &acc_hi[1]);
+    sad16_neon(src_hi, vld1q_u8(ref2 + 16), &acc_hi[2]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+  } while (--rows_left != 0);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_long_add_u16x8(acc_lo[k], acc_hi[k]);
+  }
+}
+
+// SAD of a 16-wide, h-row source block against three references. One 16-bit
+// accumulator per reference collects every row; horizontal_add_u16x8()
+// produces the value stored in res[0..2].
+static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[3], int ref_stride,
+                                   uint32_t res[3], int h) {
+  uint16x8_t acc[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_row = vld1q_u8(src);
+
+    sad16_neon(src_row, vld1q_u8(ref0), &acc[0]);
+    sad16_neon(src_row, vld1q_u8(ref1), &acc[1]);
+    sad16_neon(src_row, vld1q_u8(ref2), &acc[2]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+  } while (--rows_left != 0);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u16x8(acc[k]);
+  }
+}
+
+// SAD of an 8-wide, h-row source block against three references. The first
+// row initialises the 16-bit accumulators with a widening absolute
+// difference; the remaining h - 1 rows are added with vabal_u8. The loop body
+// always runs at least once, so callers must pass h >= 2.
+static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[3], int ref_stride,
+                                  uint32_t res[3], int h) {
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  uint16x8_t acc[3];
+
+  uint8x8_t src_row = vld1_u8(src);
+  acc[0] = vabdl_u8(src_row, vld1_u8(ref0));
+  acc[1] = vabdl_u8(src_row, vld1_u8(ref1));
+  acc[2] = vabdl_u8(src_row, vld1_u8(ref2));
+
+  int rows_left = h - 1;
+  do {
+    // Step to the next row before loading it.
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+
+    src_row = vld1_u8(src);
+    acc[0] = vabal_u8(acc[0], src_row, vld1_u8(ref0));
+    acc[1] = vabal_u8(acc[1], src_row, vld1_u8(ref1));
+    acc[2] = vabal_u8(acc[2], src_row, vld1_u8(ref2));
+  } while (--rows_left != 0);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u16x8(acc[k]);
+  }
+}
+
+// SAD of a 4-wide, h-row source block against three references; the totals
+// are written to res[0..2]. h must be even (asserted): every step calls
+// load_unaligned_u8() with the row stride and advances by two rows, so each
+// 8-byte vector presumably holds two consecutive 4-byte rows.
+//
+// The first row pair initialises the accumulators. The remaining pairs are
+// processed by a pre-decrement while loop, the same form sad4xhx4d_neon uses,
+// so that h == 2 performs no further loads. A do-while here would execute its
+// body once for h == 2 and then fail to terminate normally.
+static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[3], int ref_stride,
+                                  uint32_t res[3], int h) {
+  assert(h % 2 == 0);
+  uint16x8_t sum[3];
+
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+
+  sum[0] = vabdl_u8(s, r0);
+  sum[1] = vabdl_u8(s, r1);
+  sum[2] = vabdl_u8(s, r2);
+
+  src += 2 * src_stride;
+  int ref_offset = 2 * ref_stride;
+  // h / 2 row pairs in total; one has already been consumed above.
+  int i = h / 2;
+  while (--i != 0) {
+    s = load_unaligned_u8(src, src_stride);
+    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+
+    sum[0] = vabal_u8(sum[0], s, r0);
+    sum[1] = vabal_u8(sum[1], s, r1);
+    sum[2] = vabal_u8(sum[2], s, r2);
+
+    src += 2 * src_stride;
+    ref_offset += 2 * ref_stride;
+  }
+
+  res[0] = horizontal_add_u16x8(sum[0]);
+  res[1] = horizontal_add_u16x8(sum[1]);
+  res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+// Emits the public aom_sad<W>x<H>x3d_neon() entry point for one block size.
+// The generated function forwards to the sad<W>xhx3d_neon() kernel for that
+// width, passing the compile-time height. The parameters are declared with
+// four-element arrays although the 3D kernels only touch elements 0..2.
+#define SAD_WXH_3D_NEON(bw, bh)                                            \
+  void aom_sad##bw##x##bh##x3d_neon(const uint8_t *src, int src_stride,    \
+                                    const uint8_t *const ref[4],           \
+                                    int ref_stride, uint32_t res[4]) {     \
+    sad##bw##xhx3d_neon(src, src_stride, ref, ref_stride, res, (bh));      \
+  }
+
+// Width 4.
+SAD_WXH_3D_NEON(4, 4)
+SAD_WXH_3D_NEON(4, 8)
+
+// Width 8.
+SAD_WXH_3D_NEON(8, 4)
+SAD_WXH_3D_NEON(8, 8)
+SAD_WXH_3D_NEON(8, 16)
+
+// Width 16.
+SAD_WXH_3D_NEON(16, 8)
+SAD_WXH_3D_NEON(16, 16)
+SAD_WXH_3D_NEON(16, 32)
+
+// Width 32.
+SAD_WXH_3D_NEON(32, 16)
+SAD_WXH_3D_NEON(32, 32)
+SAD_WXH_3D_NEON(32, 64)
+
+// Width 64.
+SAD_WXH_3D_NEON(64, 32)
+SAD_WXH_3D_NEON(64, 64)
+SAD_WXH_3D_NEON(64, 128)
+
+// Width 128.
+SAD_WXH_3D_NEON(128, 64)
+SAD_WXH_3D_NEON(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON(4, 16)
+SAD_WXH_3D_NEON(8, 32)
+SAD_WXH_3D_NEON(16, 4)
+SAD_WXH_3D_NEON(16, 64)
+SAD_WXH_3D_NEON(32, 8)
+SAD_WXH_3D_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON
+
+// SAD of one w x h source block against four reference blocks, handling 32
+// columns per inner step. Differences are first gathered in 16-bit
+// accumulators; after every h_overflow rows (or after the last row) those are
+// widened into 32-bit accumulators and restarted from zero. h_overflow is
+// presumably chosen by the caller so that the 16-bit lanes cannot wrap.
+// The four totals are stored to res[0..3] with a single vector store.
+static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+                                        const uint8_t *const ref[4],
+                                        int ref_stride, uint32_t res[4], int w,
+                                        int h, int h_overflow) {
+  uint32x4_t acc32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                          vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+
+  // Row index at which the current 16-bit batch ends.
+  int batch_end = h_overflow;
+  if (h <= h_overflow) batch_end = h;
+
+  int row = 0;
+  do {
+    uint16x8_t acc16_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                               vdupq_n_u16(0) };
+    uint16x8_t acc16_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                               vdupq_n_u16(0) };
+
+    do {
+      int col = 0;
+      do {
+        const uint8x16_t src_lo = vld1q_u8(src + col);
+        const uint8x16_t src_hi = vld1q_u8(src + col + 16);
+
+        sad16_neon(src_lo, vld1q_u8(ref0 + col), &acc16_lo[0]);
+        sad16_neon(src_lo, vld1q_u8(ref1 + col), &acc16_lo[1]);
+        sad16_neon(src_lo, vld1q_u8(ref2 + col), &acc16_lo[2]);
+        sad16_neon(src_lo, vld1q_u8(ref3 + col), &acc16_lo[3]);
+
+        sad16_neon(src_hi, vld1q_u8(ref0 + col + 16), &acc16_hi[0]);
+        sad16_neon(src_hi, vld1q_u8(ref1 + col + 16), &acc16_hi[1]);
+        sad16_neon(src_hi, vld1q_u8(ref2 + col + 16), &acc16_hi[2]);
+        sad16_neon(src_hi, vld1q_u8(ref3 + col + 16), &acc16_hi[3]);
+
+        col += 32;
+      } while (col < w);
+
+      src += src_stride;
+      ref0 += ref_stride;
+      ref1 += ref_stride;
+      ref2 += ref_stride;
+      ref3 += ref_stride;
+      ++row;
+    } while (row < batch_end);
+
+    // Widen the finished batch into the 32-bit totals.
+    for (int k = 0; k < 4; ++k) {
+      acc32[k] = vpadalq_u16(acc32[k], acc16_lo[k]);
+      acc32[k] = vpadalq_u16(acc32[k], acc16_hi[k]);
+    }
+
+    batch_end += h_overflow;
+  } while (row < h);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc32));
+}
+
+// 128-wide, four-reference SAD: delegates to the generic wide kernel with a
+// 16-bit accumulator flush every 32 rows.
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *const ref[4], int ref_stride,
+                                    uint32_t res[4], int h) {
+  const int block_width = 128;
+  const int rows_per_flush = 32;
+  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
+                       rows_per_flush);
+}
+
+// 64-wide, four-reference SAD: delegates to the generic wide kernel with a
+// 16-bit accumulator flush every 64 rows.
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  const int block_width = 64;
+  const int rows_per_flush = 64;
+  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, block_width, h,
+                       rows_per_flush);
+}
+
+// SAD of a 32-wide, h-row source block against four references. Columns 0-15
+// and 16-31 of each row feed separate 16-bit accumulators; both sets are
+// passed to horizontal_long_add_4d_u16x8() and the resulting vector is stored
+// to res[0..3].
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t acc_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  uint16x8_t acc_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_lo = vld1q_u8(src);
+    const uint8x16_t src_hi = vld1q_u8(src + 16);
+
+    sad16_neon(src_lo, vld1q_u8(ref0), &acc_lo[0]);
+    sad16_neon(src_lo, vld1q_u8(ref1), &acc_lo[1]);
+    sad16_neon(src_lo, vld1q_u8(ref2), &acc_lo[2]);
+    sad16_neon(src_lo, vld1q_u8(ref3), &acc_lo[3]);
+
+    sad16_neon(src_hi, vld1q_u8(ref0 + 16), &acc_hi[0]);
+    sad16_neon(src_hi, vld1q_u8(ref1 + 16), &acc_hi[1]);
+    sad16_neon(src_hi, vld1q_u8(ref2 + 16), &acc_hi[2]);
+    sad16_neon(src_hi, vld1q_u8(ref3 + 16), &acc_hi[3]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  } while (--rows_left != 0);
+
+  vst1q_u32(res, horizontal_long_add_4d_u16x8(acc_lo, acc_hi));
+}
+
+// SAD of a 16-wide, h-row source block against four references. Each
+// reference has one 16-bit accumulator; after the last row these are widened
+// pairwise to 32 bits, reduced by horizontal_add_4d_u32x4() and stored to
+// res[0..3].
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t acc16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                          vdupq_n_u16(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_row = vld1q_u8(src);
+
+    sad16_neon(src_row, vld1q_u8(ref0), &acc16[0]);
+    sad16_neon(src_row, vld1q_u8(ref1), &acc16[1]);
+    sad16_neon(src_row, vld1q_u8(ref2), &acc16[2]);
+    sad16_neon(src_row, vld1q_u8(ref3), &acc16[3]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  } while (--rows_left != 0);
+
+  uint32x4_t acc32[4];
+  for (int k = 0; k < 4; ++k) {
+    acc32[k] = vpaddlq_u16(acc16[k]);
+  }
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc32));
+}
+
+// SAD of an 8-wide, h-row source block against four references. The first
+// row initialises the 16-bit accumulators with a widening absolute
+// difference; the remaining h - 1 rows are added with vabal_u8. The loop body
+// always runs at least once, so callers must pass h >= 2.
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+  uint16x8_t acc[4];
+
+  uint8x8_t src_row = vld1_u8(src);
+  acc[0] = vabdl_u8(src_row, vld1_u8(ref0));
+  acc[1] = vabdl_u8(src_row, vld1_u8(ref1));
+  acc[2] = vabdl_u8(src_row, vld1_u8(ref2));
+  acc[3] = vabdl_u8(src_row, vld1_u8(ref3));
+
+  int rows_left = h - 1;
+  do {
+    // Step to the next row before loading it.
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+
+    src_row = vld1_u8(src);
+    acc[0] = vabal_u8(acc[0], src_row, vld1_u8(ref0));
+    acc[1] = vabal_u8(acc[1], src_row, vld1_u8(ref1));
+    acc[2] = vabal_u8(acc[2], src_row, vld1_u8(ref2));
+    acc[3] = vabal_u8(acc[3], src_row, vld1_u8(ref3));
+  } while (--rows_left != 0);
+
+  vst1q_u32(res, horizontal_add_4d_u16x8(acc));
+}
+
+// SAD of a 4-wide, h-row source block against four references. Each step
+// calls load_unaligned_u8() with the row stride and advances by two rows, so
+// each 8-byte vector presumably holds two consecutive 4-byte rows. The first
+// row pair initialises the accumulators; the loop then handles the remaining
+// h / 2 - 1 pairs and is skipped entirely when h == 2.
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
+  const int src_step = 2 * src_stride;
+  const int ref_step = 2 * ref_stride;
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+  uint16x8_t acc[4];
+
+  uint8x8_t src_rows = load_unaligned_u8(src, src_stride);
+  acc[0] = vabdl_u8(src_rows, load_unaligned_u8(ref0, ref_stride));
+  acc[1] = vabdl_u8(src_rows, load_unaligned_u8(ref1, ref_stride));
+  acc[2] = vabdl_u8(src_rows, load_unaligned_u8(ref2, ref_stride));
+  acc[3] = vabdl_u8(src_rows, load_unaligned_u8(ref3, ref_stride));
+
+  for (int pairs_left = h / 2 - 1; pairs_left != 0; --pairs_left) {
+    // Step to the next row pair before loading it.
+    src += src_step;
+    ref0 += ref_step;
+    ref1 += ref_step;
+    ref2 += ref_step;
+    ref3 += ref_step;
+
+    src_rows = load_unaligned_u8(src, src_stride);
+    const uint8x8_t ref_rows0 = load_unaligned_u8(ref0, ref_stride);
+    const uint8x8_t ref_rows1 = load_unaligned_u8(ref1, ref_stride);
+    const uint8x8_t ref_rows2 = load_unaligned_u8(ref2, ref_stride);
+    const uint8x8_t ref_rows3 = load_unaligned_u8(ref3, ref_stride);
+
+    acc[0] = vabal_u8(acc[0], src_rows, ref_rows0);
+    acc[1] = vabal_u8(acc[1], src_rows, ref_rows1);
+    acc[2] = vabal_u8(acc[2], src_rows, ref_rows2);
+    acc[3] = vabal_u8(acc[3], src_rows, ref_rows3);
+  }
+
+  vst1q_u32(res, horizontal_add_4d_u16x8(acc));
+}
+
+// Emits the public aom_sad<W>x<H>x4d_neon() entry point for one block size.
+// The generated function forwards to the sad<W>xhx4d_neon() kernel for that
+// width, passing the compile-time height.
+#define SAD_WXH_4D_NEON(bw, bh)                                            \
+  void aom_sad##bw##x##bh##x4d_neon(const uint8_t *src, int src_stride,    \
+                                    const uint8_t *const ref[4],           \
+                                    int ref_stride, uint32_t res[4]) {     \
+    sad##bw##xhx4d_neon(src, src_stride, ref, ref_stride, res, (bh));      \
+  }
+
+// Width 4.
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+// Width 8.
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+// Width 16.
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+// Width 32.
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+// Width 64.
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+// Width 128.
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(8, 32)
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 64)
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON
+
+// Emits the public aom_sad_skip_<W>x<H>x4d_neon() entry point for one block
+// size. The generated function runs the sad<W>xhx4d_neon() kernel with both
+// strides doubled and half the height, i.e. on every other row, and then
+// doubles each of the four results.
+#define SAD_SKIP_WXH_4D_NEON(bw, bh)                                          \
+  void aom_sad_skip_##bw##x##bh##x4d_neon(const uint8_t *src, int src_stride, \
+                                          const uint8_t *const ref[4],        \
+                                          int ref_stride, uint32_t res[4]) {  \
+    sad##bw##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,        \
+                        ((bh) >> 1));                                         \
+    for (int k = 0; k < 4; ++k) {                                             \
+      res[k] <<= 1;                                                           \
+    }                                                                         \
+  }
+
+// Width 4.
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+
+// Width 8.
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+
+// Width 16.
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+
+// Width 32.
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
+
+// Width 64.
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+SAD_SKIP_WXH_4D_NEON(64, 128)
+
+// Width 128.
+SAD_SKIP_WXH_4D_NEON(128, 64)
+SAD_SKIP_WXH_4D_NEON(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON(4, 16)
+SAD_SKIP_WXH_4D_NEON(8, 32)
+SAD_SKIP_WXH_4D_NEON(16, 4)
+SAD_SKIP_WXH_4D_NEON(16, 64)
+SAD_SKIP_WXH_4D_NEON(32, 8)
+SAD_SKIP_WXH_4D_NEON(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
new file mode 100644
index 0000000000..3d11d1cb96
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Adds the absolute differences of 16 byte pairs to *sad_sum. The per-byte
+// |src - ref| values are dotted with a vector of ones, so vdotq_u32 adds each
+// group of four differences into the matching 32-bit accumulator lane.
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint32x4_t *const sad_sum) {
+  const uint8x16_t all_ones = vdupq_n_u8(1);
+  *sad_sum = vdotq_u32(*sad_sum, vabdq_u8(src, ref), all_ones);
+}
+
+// SAD of one w x h source block against three reference blocks, handling 32
+// columns per inner step. The two 16-byte halves of every step feed separate
+// 32-bit accumulators, which are added together and reduced with
+// horizontal_add_u32x4() at the end. Only res[0..2] are written.
+static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *const ref[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int w, int h) {
+  uint32x4_t acc_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  uint32x4_t acc_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+
+  int rows_left = h;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t src_lo = vld1q_u8(src + col);
+      const uint8x16_t src_hi = vld1q_u8(src + col + 16);
+
+      sad16_neon(src_lo, vld1q_u8(ref0 + col), &acc_lo[0]);
+      sad16_neon(src_lo, vld1q_u8(ref1 + col), &acc_lo[1]);
+      sad16_neon(src_lo, vld1q_u8(ref2 + col), &acc_lo[2]);
+
+      sad16_neon(src_hi, vld1q_u8(ref0 + col + 16), &acc_hi[0]);
+      sad16_neon(src_hi, vld1q_u8(ref1 + col + 16), &acc_hi[1]);
+      sad16_neon(src_hi, vld1q_u8(ref2 + col + 16), &acc_hi[2]);
+
+      col += 32;
+    } while (col < w);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+  } while (--rows_left != 0);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(vaddq_u32(acc_lo[k], acc_hi[k]));
+  }
+}
+
+// 128-wide, three-reference SAD: delegates to the generic wide dot-product
+// kernel.
+static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *const ref[4],
+                                            int ref_stride, uint32_t res[4],
+                                            int h) {
+  const int block_width = 128;
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// 64-wide, three-reference SAD: delegates to the generic wide dot-product
+// kernel.
+static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  const int block_width = 64;
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// 32-wide, three-reference SAD: delegates to the generic wide dot-product
+// kernel, whose inner column loop then runs exactly once per row.
+static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  const int block_width = 32;
+  sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// SAD of a 16-wide, h-row source block against three references using one
+// 32-bit accumulator per reference. Only res[0..2] are written.
+static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t acc[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_row = vld1q_u8(src);
+
+    sad16_neon(src_row, vld1q_u8(ref0), &acc[0]);
+    sad16_neon(src_row, vld1q_u8(ref1), &acc[1]);
+    sad16_neon(src_row, vld1q_u8(ref2), &acc[2]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+  } while (--rows_left != 0);
+
+  for (int k = 0; k < 3; ++k) {
+    res[k] = horizontal_add_u32x4(acc[k]);
+  }
+}
+
+// Emits the public aom_sad<W>x<H>x3d_neon_dotprod() entry point for one block
+// size. The generated function forwards to the sad<W>xhx3d_neon_dotprod()
+// kernel for that width, passing the compile-time height.
+#define SAD_WXH_3D_NEON_DOTPROD(bw, bh)                                       \
+  void aom_sad##bw##x##bh##x3d_neon_dotprod(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *const ref[4],        \
+      int ref_stride, uint32_t res[4]) {                                      \
+    sad##bw##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (bh)); \
+  }
+
+// Width 16.
+SAD_WXH_3D_NEON_DOTPROD(16, 8)
+SAD_WXH_3D_NEON_DOTPROD(16, 16)
+SAD_WXH_3D_NEON_DOTPROD(16, 32)
+
+// Width 32.
+SAD_WXH_3D_NEON_DOTPROD(32, 16)
+SAD_WXH_3D_NEON_DOTPROD(32, 32)
+SAD_WXH_3D_NEON_DOTPROD(32, 64)
+
+// Width 64.
+SAD_WXH_3D_NEON_DOTPROD(64, 32)
+SAD_WXH_3D_NEON_DOTPROD(64, 64)
+SAD_WXH_3D_NEON_DOTPROD(64, 128)
+
+// Width 128.
+SAD_WXH_3D_NEON_DOTPROD(128, 64)
+SAD_WXH_3D_NEON_DOTPROD(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON_DOTPROD(16, 4)
+SAD_WXH_3D_NEON_DOTPROD(16, 64)
+SAD_WXH_3D_NEON_DOTPROD(32, 8)
+SAD_WXH_3D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON_DOTPROD
+
+// SAD of one w x h source block against four reference blocks, handling 32
+// columns per inner step. The two 16-byte halves of every step feed separate
+// 32-bit accumulators; these are added together, reduced by
+// horizontal_add_4d_u32x4() and stored to res[0..3] with one vector store.
+static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *const ref[4],
+                                                int ref_stride, uint32_t res[4],
+                                                int w, int h) {
+  uint32x4_t acc_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t acc_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+
+  int rows_left = h;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t src_lo = vld1q_u8(src + col);
+      const uint8x16_t src_hi = vld1q_u8(src + col + 16);
+
+      sad16_neon(src_lo, vld1q_u8(ref0 + col), &acc_lo[0]);
+      sad16_neon(src_lo, vld1q_u8(ref1 + col), &acc_lo[1]);
+      sad16_neon(src_lo, vld1q_u8(ref2 + col), &acc_lo[2]);
+      sad16_neon(src_lo, vld1q_u8(ref3 + col), &acc_lo[3]);
+
+      sad16_neon(src_hi, vld1q_u8(ref0 + col + 16), &acc_hi[0]);
+      sad16_neon(src_hi, vld1q_u8(ref1 + col + 16), &acc_hi[1]);
+      sad16_neon(src_hi, vld1q_u8(ref2 + col + 16), &acc_hi[2]);
+      sad16_neon(src_hi, vld1q_u8(ref3 + col + 16), &acc_hi[3]);
+
+      col += 32;
+    } while (col < w);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  } while (--rows_left != 0);
+
+  uint32x4_t total[4];
+  for (int k = 0; k < 4; ++k) {
+    total[k] = vaddq_u32(acc_lo[k], acc_hi[k]);
+  }
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(total));
+}
+
+// 128-wide, four-reference SAD: delegates to the generic wide dot-product
+// kernel.
+static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *const ref[4],
+                                            int ref_stride, uint32_t res[4],
+                                            int h) {
+  const int block_width = 128;
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// 64-wide, four-reference SAD: delegates to the generic wide dot-product
+// kernel.
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  const int block_width = 64;
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// 32-wide, four-reference SAD: delegates to the generic wide dot-product
+// kernel, whose inner column loop then runs exactly once per row.
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  const int block_width = 32;
+  sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res,
+                               block_width, h);
+}
+
+// SAD of a 16-wide, h-row source block against four references using one
+// 32-bit accumulator per reference. The accumulators are reduced by
+// horizontal_add_4d_u32x4() and stored to res[0..3] with one vector store.
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t acc[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  const uint8_t *ref0 = ref[0];
+  const uint8_t *ref1 = ref[1];
+  const uint8_t *ref2 = ref[2];
+  const uint8_t *ref3 = ref[3];
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_row = vld1q_u8(src);
+
+    sad16_neon(src_row, vld1q_u8(ref0), &acc[0]);
+    sad16_neon(src_row, vld1q_u8(ref1), &acc[1]);
+    sad16_neon(src_row, vld1q_u8(ref2), &acc[2]);
+    sad16_neon(src_row, vld1q_u8(ref3), &acc[3]);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  } while (--rows_left != 0);
+
+  vst1q_u32(res, horizontal_add_4d_u32x4(acc));
+}
+
+// Emits the public aom_sad<W>x<H>x4d_neon_dotprod() entry point for one block
+// size. The generated function forwards to the sad<W>xhx4d_neon_dotprod()
+// kernel for that width, passing the compile-time height.
+#define SAD_WXH_4D_NEON_DOTPROD(bw, bh)                                       \
+  void aom_sad##bw##x##bh##x4d_neon_dotprod(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *const ref[4],        \
+      int ref_stride, uint32_t res[4]) {                                      \
+    sad##bw##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (bh)); \
+  }
+
+// Width 16.
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+// Width 32.
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+// Width 64.
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_WXH_4D_NEON_DOTPROD(64, 128)
+
+// Width 128.
+SAD_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_WXH_4D_NEON_DOTPROD(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_WXH_4D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+// Emits the public aom_sad_skip_<W>x<H>x4d_neon_dotprod() entry point for one
+// block size. The generated function runs the sad<W>xhx4d_neon_dotprod()
+// kernel with both strides doubled and half the height, i.e. on every other
+// row, and then doubles each of the four results.
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(bw, bh)                                  \
+  void aom_sad_skip_##bw##x##bh##x4d_neon_dotprod(                            \
+      const uint8_t *src, int src_stride, const uint8_t *const ref[4],        \
+      int ref_stride, uint32_t res[4]) {                                      \
+    sad##bw##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride,     \
+                                res, ((bh) >> 1));                            \
+    for (int k = 0; k < 4; ++k) {                                             \
+      res[k] <<= 1;                                                           \
+    }                                                                         \
+  }
+
+// Width 16.
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+// Width 32.
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+// Width 64.
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128)
+
+// Width 128.
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128)
+
+// Sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sse_neon.c b/third_party/aom/aom_dsp/arm/sse_neon.c
new file mode 100644
index 0000000000..ec8f0ee183
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Adds the squared differences of 16 source/reference bytes to *sse. The
+// absolute differences are squared with widening multiplies on the low and
+// high 8-byte halves, and the 16-bit products are pairwise-accumulated into
+// the four 32-bit lanes of *sse.
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+                                 uint32x4_t *sse) {
+  const uint8x16_t diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
+
+  const uint8x8_t diff_lo = vget_low_u8(diff);
+  const uint8x8_t diff_hi = vget_high_u8(diff);
+  const uint16x8_t sq_lo = vmull_u8(diff_lo, diff_lo);
+  const uint16x8_t sq_hi = vmull_u8(diff_hi, diff_hi);
+
+  *sse = vpadalq_u16(vpadalq_u16(*sse, sq_lo), sq_hi);
+}
+
+// Adds the squared differences of 8 source/reference bytes to *sse: the
+// absolute differences are squared with a widening multiply and the 16-bit
+// products are pairwise-accumulated into the 32-bit lanes of *sse.
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+                                uint32x4_t *sse) {
+  const uint8x8_t diff = vabd_u8(vld1_u8(src), vld1_u8(ref));
+  const uint16x8_t squares = vmull_u8(diff, diff);
+
+  *sse = vpadalq_u16(*sse, squares);
+}
+
+// Adds the squared differences of a 4x2 patch to *sse. load_unaligned_u8() is
+// given the row stride and presumably gathers two 4-byte rows into one 8-byte
+// vector; the rest matches the 8x1 case.
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+                                const uint8_t *ref, int ref_stride,
+                                uint32x4_t *sse) {
+  const uint8x8_t src_rows = load_unaligned_u8(src, src_stride);
+  const uint8x8_t ref_rows = load_unaligned_u8(ref, ref_stride);
+  const uint8x8_t diff = vabd_u8(src_rows, ref_rows);
+  const uint16x8_t squares = vmull_u8(diff, diff);
+
+  *sse = vpadalq_u16(*sse, squares);
+}
+
+// Sum of squared errors for widths without a dedicated kernel.
+//
+// When width % 8 is 0 or at least 5, rows are processed one at a time in
+// 8-column steps until the column index reaches width. When width % 8 is in
+// 1..4, rows are processed in pairs: 8-column steps on both rows while more
+// than four columns remain after the step, then one 4x2 step for the tail.
+// The paired path consumes height two rows at a time and stops only when the
+// counter reaches exactly zero.
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int width, int height) {
+  uint32x4_t acc = vdupq_n_u32(0);
+  const int tail = width & 0x07;
+  int rows_left = height;
+
+  if (tail == 0 || tail >= 5) {
+    do {
+      int col = 0;
+      do {
+        sse_8x1_neon(src + col, ref + col, &acc);
+        col += 8;
+      } while (col < width);
+
+      src += src_stride;
+      ref += ref_stride;
+    } while (--rows_left != 0);
+
+    return horizontal_add_u32x4(acc);
+  }
+
+  do {
+    const uint8_t *const src_next = src + src_stride;
+    const uint8_t *const ref_next = ref + ref_stride;
+
+    int col = 0;
+    do {
+      sse_8x1_neon(src + col, ref + col, &acc);
+      sse_8x1_neon(src_next + col, ref_next + col, &acc);
+      col += 8;
+    } while (col + 4 < width);
+
+    sse_4x2_neon(src + col, src_stride, ref + col, ref_stride, &acc);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// Sum of squared errors for a 128-wide block of the given height. Each row is
+// covered by eight 16-byte steps that alternate between two accumulators,
+// which are added together before the final horizontal reduction.
+static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      int height) {
+  uint32x4_t acc_even = vdupq_n_u32(0);
+  uint32x4_t acc_odd = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    for (int col = 0; col < 128; col += 32) {
+      sse_16x1_neon(src + col, ref + col, &acc_even);
+      sse_16x1_neon(src + col + 16, ref + col + 16, &acc_odd);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
+}
+
+// Sum of squared errors for a 64-wide block of the given height. Each row is
+// covered by four 16-byte steps that alternate between two accumulators,
+// which are added together before the final horizontal reduction.
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t acc_even = vdupq_n_u32(0);
+  uint32x4_t acc_odd = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    for (int col = 0; col < 64; col += 32) {
+      sse_16x1_neon(src + col, ref + col, &acc_even);
+      sse_16x1_neon(src + col + 16, ref + col + 16, &acc_odd);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_even, acc_odd));
+}
+
+// Sum of squared errors for a 32-wide block of the given height. The left and
+// right 16-byte halves of each row use separate accumulators, which are added
+// together before the final horizontal reduction.
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t acc_left = vdupq_n_u32(0);
+  uint32x4_t acc_right = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    sse_16x1_neon(src, ref, &acc_left);
+    sse_16x1_neon(src + 16, ref + 16, &acc_right);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_left, acc_right));
+}
+
+// Sum of squared errors for a 16-wide block of the given height. Rows are
+// taken two at a time, each of the pair feeding its own accumulator; the loop
+// stops only when the row counter reaches exactly zero.
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t acc_row0 = vdupq_n_u32(0);
+  uint32x4_t acc_row1 = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    sse_16x1_neon(src, ref, &acc_row0);
+    sse_16x1_neon(src + src_stride, ref + ref_stride, &acc_row1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_row0, acc_row1));
+}
+
+// Sum of squared errors for an 8-wide block of the given height, one row per
+// iteration into a single accumulator.
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    sse_8x1_neon(src, ref, &acc);
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// Sum of squared errors for a 4-wide block of the given height. Each
+// iteration handles a 4x2 patch, so the row counter drops by two per step and
+// the loop stops only when it reaches exactly zero.
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  const int src_step = 2 * src_stride;
+  const int ref_step = 2 * ref_stride;
+  uint32x4_t acc = vdupq_n_u32(0);
+
+  int rows_left = height;
+  do {
+    sse_4x2_neon(src, src_stride, ref, ref_stride, &acc);
+    src += src_step;
+    ref += ref_step;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(acc);
+}
+
+// Sum of squared errors between a width x height source block and reference
+// block. Widths 4, 8, 16, 32, 64 and 128 go to their dedicated kernels; any
+// other width falls through to the generic sse_wxh_neon() kernel.
+int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+                     int ref_stride, int width, int height) {
+  if (width == 4) {
+    return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 8) {
+    return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 16) {
+    return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 32) {
+    return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 64) {
+    return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 128) {
+    return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+  }
+  return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000000..979049780b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Adds the squared differences of 16 src/ref bytes into *sse: the absolute
+// difference vector is dot-multiplied with itself and accumulated.
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                         uint32x4_t *sse) {
+  const uint8x16_t diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
+
+  *sse = vdotq_u32(*sse, diff, diff);
+}
+
+// Adds the squared differences of 8 src/ref bytes into *sse: the absolute
+// difference vector is dot-multiplied with itself and accumulated.
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                        uint32x2_t *sse) {
+  const uint8x8_t diff = vabd_u8(vld1_u8(src), vld1_u8(ref));
+
+  *sse = vdot_u32(*sse, diff, diff);
+}
+
+// Adds the squared differences of the eight bytes that load_unaligned_u8()
+// gathers from src and ref (presumably two 4-byte rows, one stride apart) into
+// *sse.
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        uint32x2_t *sse) {
+  const uint8x8_t src_px = load_unaligned_u8(src, src_stride);
+  const uint8x8_t ref_px = load_unaligned_u8(ref, ref_stride);
+  const uint8x8_t diff = vabd_u8(src_px, ref_px);
+
+  *sse = vdot_u32(*sse, diff, diff);
+}
+
+// Generic-width kernel. Two rows are handled per outer iteration, eight
+// columns at a time, with one accumulator per row. When (width & 7) is in
+// 1..4 the last columns of each row pair are covered by a single 4x2 step
+// (accumulated with the first row) instead of another 8-wide step. The outer
+// loop only terminates for a positive, even height.
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int width, int height) {
+  uint32x2_t acc_row0 = vdup_n_u32(0);
+  uint32x2_t acc_row1 = vdup_n_u32(0);
+
+  const int rem = width & 0x07;
+  const int has_4x2_tail = (rem != 0) && (rem < 5);
+  // With a 4x2 tail, the 8-wide steps stop once at most four columns remain.
+  const int wide_limit = has_4x2_tail ? width - 4 : width;
+
+  int rows_left = height;
+  do {
+    int col = 0;
+    do {
+      sse_8x1_neon_dotprod(src + col, ref + col, &acc_row0);
+      sse_8x1_neon_dotprod(src + col + src_stride, ref + col + ref_stride,
+                           &acc_row1);
+      col += 8;
+    } while (col < wide_limit);
+
+    if (has_4x2_tail) {
+      sse_4x2_neon_dotprod(src + col, src_stride, ref + col, ref_stride,
+                           &acc_row0);
+    }
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vcombine_u32(acc_row0, acc_row1));
+}
+
+// 128-pixel-wide kernel: each row is split into eight 16-byte chunks that
+// alternate between two accumulators; one row is handled per iteration.
+static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int height) {
+  uint32x4_t acc[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+  int rows_left = height;
+
+  do {
+    for (int x = 0; x < 128; x += 32) {
+      sse_16x1_neon_dotprod(src + x, ref + x, &acc[0]);
+      sse_16x1_neon_dotprod(src + x + 16, ref + x + 16, &acc[1]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc[0], acc[1]));
+}
+
+// 64-pixel-wide kernel: each row is split into four 16-byte chunks that
+// alternate between two accumulators; one row is handled per iteration.
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t acc[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+  int rows_left = height;
+
+  do {
+    for (int x = 0; x < 64; x += 32) {
+      sse_16x1_neon_dotprod(src + x, ref + x, &acc[0]);
+      sse_16x1_neon_dotprod(src + x + 16, ref + x + 16, &acc[1]);
+    }
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc[0], acc[1]));
+}
+
+// 32-pixel-wide kernel: the two 16-byte halves of each row feed separate
+// accumulators; one row is handled per iteration.
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t acc_lo = vdupq_n_u32(0);
+  uint32x4_t acc_hi = vdupq_n_u32(0);
+  int rows_left = height;
+
+  do {
+    sse_16x1_neon_dotprod(src, ref, &acc_lo);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &acc_hi);
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_lo, acc_hi));
+}
+
+// 16-pixel-wide kernel: rows are consumed in pairs, each row of a pair feeding
+// its own accumulator. The loop only terminates for a positive, even height.
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t acc_row0 = vdupq_n_u32(0);
+  uint32x4_t acc_row1 = vdupq_n_u32(0);
+  int rows_left = height;
+
+  do {
+    sse_16x1_neon_dotprod(src, ref, &acc_row0);
+    sse_16x1_neon_dotprod(src + src_stride, ref + ref_stride, &acc_row1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(acc_row0, acc_row1));
+}
+
+// 8-pixel-wide kernel: rows are consumed in pairs with one 64-bit accumulator
+// per row; the two are concatenated before the final horizontal add. The loop
+// only terminates for a positive, even height.
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t acc_row0 = vdup_n_u32(0);
+  uint32x2_t acc_row1 = vdup_n_u32(0);
+  int rows_left = height;
+
+  do {
+    sse_8x1_neon_dotprod(src, ref, &acc_row0);
+    sse_8x1_neon_dotprod(src + src_stride, ref + ref_stride, &acc_row1);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x4(vcombine_u32(acc_row0, acc_row1));
+}
+
+// 4-pixel-wide kernel: one sse_4x2_neon_dotprod() step per pair of rows. The
+// loop only terminates for a positive, even height.
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t acc = vdup_n_u32(0);
+  int rows_left = height;
+
+  do {
+    sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &acc);
+
+    rows_left -= 2;
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  } while (rows_left != 0);
+
+  return horizontal_add_u32x2(acc);
+}
+
+// Entry point: picks the width-specialised dot-product kernel for widths 4, 8,
+// 16, 32, 64 and 128, and falls back to sse_wxh_neon_dotprod() for every other
+// width. The kernel's uint32_t result is widened to int64_t on return.
+int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int width,
+                             int height) {
+  if (width == 4) {
+    return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 8) {
+    return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 16) {
+    return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 32) {
+    return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 64) {
+    return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  if (width == 128) {
+    return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+  }
+  return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..2e6e738853
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+// Two-tap (bilinear) filter for 4-pixel-wide blocks. Every iteration blends
+// eight source bytes with the eight bytes pixel_step further on, using the
+// weights (8 - filter_offset) and filter_offset, applies a rounding shift by 3
+// and stores eight output bytes. dst rows are packed (stride 4). The loop only
+// terminates for a positive, even dst_height.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t nxt = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+
+    vst1_u8(dst_ptr, vrshrn_n_u16(sum, 3));
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Two-tap (bilinear) filter for 8-pixel-wide blocks, one row per iteration.
+// Each output byte is the rounded, 3-bit-shifted blend of a source byte and
+// the byte pixel_step further on, weighted by (8 - filter_offset) and
+// filter_offset. dst rows are packed (stride 8).
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = vld1_u8(src_ptr);
+    const uint8x8_t nxt = vld1_u8(src_ptr + pixel_step);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+
+    vst1_u8(dst_ptr, vrshrn_n_u16(sum, 3));
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Two-tap (bilinear) filter for wide blocks, processed 16 columns at a time.
+// Each output byte is the rounded, 3-bit-shifted blend of a source byte and
+// the byte pixel_step further on, weighted by (8 - filter_offset) and
+// filter_offset. dst rows are packed (stride dst_width).
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                         uint8_t *dst_ptr, int src_stride,
+                                         int pixel_step, int dst_width,
+                                         int dst_height, int filter_offset) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+
+      // The widening multiply-accumulate works on 8 lanes, so the low and
+      // high halves are blended separately and recombined afterwards.
+      const uint16x8_t sum_lo =
+          vmlal_u8(vmull_u8(vget_low_u8(cur), w0), vget_low_u8(nxt), w1);
+      const uint16x8_t sum_hi =
+          vmlal_u8(vmull_u8(vget_high_u8(cur), w0), vget_high_u8(nxt), w1);
+
+      vst1q_u8(dst_ptr + col,
+               vcombine_u8(vrshrn_n_u16(sum_lo, 3), vrshrn_n_u16(sum_hi, 3)));
+
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Bilinear filter for 16-pixel-wide blocks: forwards to
+// var_filter_block2d_bil_large() with dst_width fixed at 16 so that the
+// variance macros can select it by name (var_filter_block2d_bil_w##w).
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+
+// Bilinear filter for 32-pixel-wide blocks: forwards to
+// var_filter_block2d_bil_large() with dst_width fixed at 32.
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+
+// Bilinear filter for 64-pixel-wide blocks: forwards to
+// var_filter_block2d_bil_large() with dst_width fixed at 64.
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+// Bilinear filter for 128-pixel-wide blocks: forwards to
+// var_filter_block2d_bil_large() with dst_width fixed at 128.
+static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+ dst_height, filter_offset);
+}
+
+// Half-sample filter: every output byte is the rounding average of a source
+// byte and the byte pixel_step further on. Processes 16 columns at a time and
+// writes packed rows (stride dst_width).
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  // Only used for widths that are a non-zero multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+
+      vst1q_u8(dst_ptr + col, vrhaddq_u8(cur, nxt));
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Defines aom_sub_pixel_variance<w>x<h>_neon using two bilinear passes:
+//  1. horizontal (pixel_step 1) from src into tmp0, producing h + padding rows;
+//  2. vertical (pixel_step w, i.e. one tmp0 row) from tmp0 into tmp1, h rows.
+// The extra `padding` rows in tmp0 exist because the vertical pass reads the
+// row below each output row. The variance of tmp1 against ref is returned and
+// *sse is filled in by aom_variance<w>x<h>.
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+// Defines aom_sub_pixel_variance<w>x<h>_neon for blocks at least 16 wide,
+// specialised on the filter offsets: an offset of 0 skips that filter pass, an
+// offset of 4 uses the rounding-average kernel (var_filter_block2d_avg), and
+// any other offset uses the bilinear kernel. When both passes run, tmp0 holds
+// (h + padding) rows because the vertical pass reads the row below each output
+// row; the second-pass destination only ever receives h rows, so tmp1 is
+// sized w * h in every branch.
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// Instantiate aom_sub_pixel_variance<W>x<H>_neon for each block size. Width-4
+// blocks pass a padding of 2 because var_filter_block2d_bil_w4() consumes rows
+// in pairs and needs an even row count; all other widths pass 1. The
+// specialised variant is used wherever both dimensions are >= 16, plus 32x8.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+// The generator macros are no longer needed past this point.
+#undef SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
+
+// Bilinear filter for 4-pixel-wide blocks fused with the aom_comp_avg_pred
+// step: the filtered bytes are rounding-averaged with second_pred before being
+// stored. Eight output bytes are produced per iteration; the loop only
+// terminates for a positive, even dst_height.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t nxt = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+    const uint8x8_t filtered = vrshrn_n_u16(sum, 3);
+
+    vst1_u8(dst_ptr, vrhadd_u8(filtered, vld1_u8(second_pred)));
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Bilinear filter for 4-pixel-wide blocks fused with the
+// aom_dist_wtd_comp_avg_pred step: the filtered bytes are combined with
+// second_pred by dist_wtd_avg_u8x8() using jcp_param's forward/backward
+// offsets. Eight output bytes are produced per iteration; the loop only
+// terminates for a positive, even dst_height.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t nxt = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    const uint8x8_t pred = vld1_u8(second_pred);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+    const uint8x8_t filtered = vrshrn_n_u16(sum, 3);
+
+    vst1_u8(dst_ptr, dist_wtd_avg_u8x8(filtered, pred, fwd, bck));
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    rows_left -= 2;
+  } while (rows_left != 0);
+}
+
+// Bilinear filter for 8-pixel-wide blocks fused with the aom_comp_avg_pred
+// step: each filtered row is rounding-averaged with the matching 8 bytes of
+// second_pred before being stored. One row per iteration.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = vld1_u8(src_ptr);
+    const uint8x8_t nxt = vld1_u8(src_ptr + pixel_step);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+    const uint8x8_t filtered = vrshrn_n_u16(sum, 3);
+
+    vst1_u8(dst_ptr, vrhadd_u8(filtered, vld1_u8(second_pred)));
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+  } while (--rows_left > 0);
+}
+
+// Bilinear filter for 8-pixel-wide blocks fused with the
+// aom_dist_wtd_comp_avg_pred step: each filtered row is combined with the
+// matching 8 bytes of second_pred by dist_wtd_avg_u8x8() using jcp_param's
+// forward/backward offsets. One row per iteration.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x8_t fwd = vdup_n_u8(jcp_param->fwd_offset);
+  const uint8x8_t bck = vdup_n_u8(jcp_param->bck_offset);
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    const uint8x8_t cur = vld1_u8(src_ptr);
+    const uint8x8_t nxt = vld1_u8(src_ptr + pixel_step);
+    const uint8x8_t pred = vld1_u8(second_pred);
+    const uint16x8_t sum = vmlal_u8(vmull_u8(cur, w0), nxt, w1);
+    const uint8x8_t filtered = vrshrn_n_u16(sum, 3);
+
+    vst1_u8(dst_ptr, dist_wtd_avg_u8x8(filtered, pred, fwd, bck));
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+  } while (--rows_left > 0);
+}
+
+// Bilinear filter for wide blocks fused with the aom_comp_avg_pred step.
+// Columns are processed 16 at a time; second_pred is read sequentially,
+// 16 bytes per column step, and rounding-averaged with the filtered bytes.
+static void avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred) {
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+
+      // Blend the low and high 8-lane halves separately, then recombine.
+      const uint16x8_t sum_lo =
+          vmlal_u8(vmull_u8(vget_low_u8(cur), w0), vget_low_u8(nxt), w1);
+      const uint16x8_t sum_hi =
+          vmlal_u8(vmull_u8(vget_high_u8(cur), w0), vget_high_u8(nxt), w1);
+      const uint8x16_t filtered =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 3), vrshrn_n_u16(sum_hi, 3));
+
+      vst1q_u8(dst_ptr + col, vrhaddq_u8(filtered, vld1q_u8(second_pred)));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Bilinear filter for wide blocks fused with the aom_dist_wtd_comp_avg_pred
+// step. Columns are processed 16 at a time; second_pred is read sequentially,
+// 16 bytes per column step, and combined with the filtered bytes by
+// dist_wtd_avg_u8x16() using jcp_param's forward/backward offsets.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);
+  const uint8x8_t w0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t w1 = vdup_n_u8(filter_offset);
+  int rows_left = dst_height;
+
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+
+      // Blend the low and high 8-lane halves separately, then recombine.
+      const uint16x8_t sum_lo =
+          vmlal_u8(vmull_u8(vget_low_u8(cur), w0), vget_low_u8(nxt), w1);
+      const uint16x8_t sum_hi =
+          vmlal_u8(vmull_u8(vget_high_u8(cur), w0), vget_high_u8(nxt), w1);
+      const uint8x16_t filtered =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 3), vrshrn_n_u16(sum_hi, 3));
+
+      const uint8x16_t pred = vld1q_u8(second_pred);
+      vst1q_u8(dst_ptr + col, dist_wtd_avg_u8x16(filtered, pred, fwd, bck));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
+// Forwards to avg_pred_var_filter_block2d_bil_large() with dst_width = 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
+// Forwards to avg_pred_var_filter_block2d_bil_large() with dst_width = 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
+// Forwards to avg_pred_var_filter_block2d_bil_large() with dst_width = 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
+// Forwards to avg_pred_var_filter_block2d_bil_large() with dst_width = 128.
+static void avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16. Forwards to dist_wtd_avg_pred_var_filter_block2d_bil_large() with
+// dst_width = 16.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32. Forwards to dist_wtd_avg_pred_var_filter_block2d_bil_large() with
+// dst_width = 32.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64. Forwards to dist_wtd_avg_pred_var_filter_block2d_bil_large() with
+// dst_width = 64.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128. Forwards to dist_wtd_avg_pred_var_filter_block2d_bil_large() with
+// dst_width = 128.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Half-sample filter fused with the aom_comp_avg_pred step: each source byte
+// is rounding-averaged with the byte pixel_step further on, and that result is
+// rounding-averaged with second_pred. second_pred is read sequentially,
+// 16 bytes per column step.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+                                            uint8_t *dst_ptr, int src_stride,
+                                            int pixel_step, int dst_width,
+                                            int dst_height,
+                                            const uint8_t *second_pred) {
+  // Only used for widths that are a non-zero multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+      const uint8x16_t filtered = vrhaddq_u8(cur, nxt);
+
+      vst1q_u8(dst_ptr + col, vrhaddq_u8(filtered, vld1q_u8(second_pred)));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Half-sample filter fused with the aom_dist_wtd_comp_avg_pred step: each
+// source byte is rounding-averaged with the byte pixel_step further on, and
+// that result is combined with second_pred by dist_wtd_avg_u8x16() using
+// jcp_param's forward/backward offsets. second_pred is read sequentially,
+// 16 bytes per column step.
+static void dist_wtd_avg_pred_var_filter_block2d_avg(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // Only used for widths that are a non-zero multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t cur = vld1q_u8(src_ptr + col);
+      const uint8x16_t nxt = vld1q_u8(src_ptr + col + pixel_step);
+      const uint8x16_t pred = vld1q_u8(second_pred);
+      const uint8x16_t filtered = vrhaddq_u8(cur, nxt);
+
+      vst1q_u8(dst_ptr + col, dist_wtd_avg_u8x16(filtered, pred, fwd, bck));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Implementation of aom_comp_avg_pred for blocks having width >= 16: stores
+// the rounding average of each source byte and the matching second_pred byte.
+// second_pred is read sequentially, 16 bytes per column step; dst rows are
+// packed (stride dst_width).
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+                     int dst_width, int dst_height,
+                     const uint8_t *second_pred) {
+  // Only used for widths that are a non-zero multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t src_px = vld1q_u8(src_ptr + col);
+      const uint8x16_t pred = vld1q_u8(second_pred);
+
+      vst1q_u8(dst_ptr + col, vrhaddq_u8(src_px, pred));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16:
+// each source byte is combined with the matching second_pred byte by
+// dist_wtd_avg_u8x16() using jcp_param's forward/backward offsets.
+// second_pred is read sequentially, 16 bytes per column step; dst rows are
+// packed (stride dst_width).
+static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                              int src_stride, int dst_width, int dst_height,
+                              const uint8_t *second_pred,
+                              const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // Only used for widths that are a non-zero multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+  const uint8x16_t fwd = vdupq_n_u8(jcp_param->fwd_offset);
+  const uint8x16_t bck = vdupq_n_u8(jcp_param->bck_offset);
+
+  int rows_left = dst_height;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t src_px = vld1q_u8(src_ptr + col);
+      const uint8x16_t pred = vld1q_u8(second_pred);
+
+      vst1q_u8(dst_ptr + col, dist_wtd_avg_u8x16(src_px, pred, fwd, bck));
+
+      second_pred += 16;
+      col += 16;
+    } while (col < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+    --rows_left;
+  } while (rows_left != 0);
+}
+
+// Defines aom_sub_pixel_avg_variance<w>x<h>_neon using two passes:
+//  1. horizontal bilinear filter (pixel_step 1) from src into tmp0, producing
+//     h + padding rows;
+//  2. vertical bilinear filter (pixel_step w) fused with the average against
+//     second_pred, from tmp0 into tmp1, h rows.
+// The variance of tmp1 against ref is returned and *sse is filled in by
+// aom_variance<w>x<h>.
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+// Defines aom_sub_pixel_avg_variance<w>x<h>_neon for blocks at least 16 wide,
+// specialised on the filter offsets: an offset of 0 skips that filter pass, an
+// offset of 4 uses a rounding-average kernel, and any other offset uses the
+// bilinear kernel. The average against second_pred is always fused into the
+// last pass that runs. When both passes run, tmp0 holds (h + padding) rows
+// because the vertical pass reads the row below each output row; the
+// second-pass destination only ever receives h rows, so tmp1 is sized w * h
+// in every branch.
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// Instantiate aom_sub_pixel_avg_variance<W>x<H>_neon for each block size.
+// Width-4 blocks pass a padding of 2 because var_filter_block2d_bil_w4()
+// consumes rows in pairs and needs an even row count; all other widths pass 1.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+// The block sizes below are only built when realtime-only mode is disabled.
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+// The generator macros are no longer needed past this point.
+#undef SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
+
+// Generic dist-wtd sub-pixel average variance: both filter passes are always
+// run, whatever xoffset / yoffset are, before the variance against ref is taken.
+#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, uint32_t *sse, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    /* First pass output: (h + padding) rows of width w. */ \
+    uint8_t first_pass[w * (h + padding)]; \
+    /* Second pass output, combined with second_pred by the helper. */ \
+    uint8_t combined[w * h]; \
+    var_filter_block2d_bil_w##w(src, first_pass, source_stride, 1, \
+                                (h + padding), xoffset); \
+    dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+        first_pass, combined, w, w, h, yoffset, second_pred, jcp_param); \
+    return aom_variance##w##x##h(combined, w, ref, ref_stride, sse); \
+  }
+
+// Dist-wtd sub-pixel average variance with per-offset fast paths. For each
+// axis: offset 0 skips that filter pass, offset 4 calls the *_avg helper, and
+// any other offset calls the bilinear helper. The last pass that runs is the
+// one that also combines the result with second_pred.
+#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, unsigned int *sse, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    if (xoffset == 0) { \
+      /* No x pass: the y pass (if any) reads src with its own stride. */ \
+      uint8_t tmp[w * h]; \
+      if (yoffset == 0) { \
+        dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred, \
+                          jcp_param); \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+                                                 source_stride, w, h, \
+                                                 second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+      } else { \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+            src, tmp, source_stride, source_stride, h, yoffset, second_pred, \
+            jcp_param); \
+        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+      } \
+    } else if (xoffset == 4) { \
+      /* x pass output; (h + padding) rows when a y pass follows. */ \
+      uint8_t tmp0[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        dist_wtd_avg_pred_var_filter_block2d_avg( \
+            src, tmp0, source_stride, 1, w, h, second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        /* The y pass writes h rows only, so w * h is enough (matches the */ \
+        /* bilinear-x branch below). */ \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, \
+                               (h + padding)); \
+        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+                                                 second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, \
+                               (h + padding)); \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } \
+    } else { \
+      uint8_t tmp0[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+            src, tmp0, source_stride, 1, h, xoffset, second_pred, \
+            jcp_param); \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+                                    (h + padding), xoffset); \
+        dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+                                                 second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+                                    (h + padding), xoffset); \
+        dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+            tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } \
+    } \
+  }
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+// Blocks wider than 4 pass 1 (instead of 2) as the padding argument.
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+// Realtime mode doesn't use 4x rectangular blocks.
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+// The generator macros are not needed past this point.
+#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+
+#if !CONFIG_REALTIME_ONLY
+
+// Generic OBMC sub-pixel variance: always runs the bilinear helper twice
+// (xoffset pass, then yoffset pass) and hands the result to aom_obmc_variance.
+#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+    /* First pass output: (h + padding) rows of width w. */ \
+    uint8_t first_pass[w * (h + padding)]; \
+    /* Second pass output: the w x h block that is measured. */ \
+    uint8_t second_pass[w * h]; \
+    var_filter_block2d_bil_w##w(pre, first_pass, pre_stride, 1, h + padding, \
+                                xoffset); \
+    var_filter_block2d_bil_w##w(first_pass, second_pass, w, w, h, yoffset); \
+    return aom_obmc_variance##w##x##h(second_pass, w, wsrc, mask, sse); \
+  }
+
+// OBMC sub-pixel variance with per-offset fast paths. For each axis: offset 0
+// skips that filter pass, offset 4 calls var_filter_block2d_avg, and any other
+// offset calls the bilinear helper. With both offsets 0 the input is passed
+// straight to the OBMC variance function.
+#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+    if (xoffset == 0) { \
+      if (yoffset == 0) { \
+        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \
+                                                 sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp[w * h]; \
+        var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h); \
+        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+      } else { \
+        uint8_t tmp[w * h]; \
+        var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h, \
+                                    yoffset); \
+        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+      } \
+    } else if (xoffset == 4) { \
+      /* x pass output; (h + padding) rows when a y pass follows. */ \
+      uint8_t tmp0[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h); \
+        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+      } else if (yoffset == 4) { \
+        /* The y pass writes h rows only, so w * h is enough (matches the */ \
+        /* bilinear-x branch below). */ \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+      } \
+    } else { \
+      uint8_t tmp0[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset); \
+        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+                                    xoffset); \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+                                    xoffset); \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+      } \
+    } \
+  }
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+// Blocks wider than 4 pass 1 (instead of 2) as the padding argument.
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+// The generator macros are not needed past this point.
+#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
+#endif // !CONFIG_REALTIME_ONLY
+
+// Generic masked sub-pixel variance: always runs the bilinear helper twice
+// (xoffset pass, then yoffset pass), combines the result with second_pred via
+// aom_comp_mask_pred_neon, and measures variance against ref.
+#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+      const uint8_t *msk, int msk_stride, int invert_mask, \
+      unsigned int *sse) { \
+    /* First pass output: (h + padding) rows of width w. */ \
+    uint8_t first_pass[w * (h + padding)]; \
+    /* Second pass output. */ \
+    uint8_t second_pass[w * h]; \
+    /* Output of the mask-based combination with second_pred. */ \
+    uint8_t masked_pred[w * h]; \
+    var_filter_block2d_bil_w##w(src, first_pass, src_stride, 1, \
+                                (h + padding), xoffset); \
+    var_filter_block2d_bil_w##w(first_pass, second_pass, w, w, h, yoffset); \
+    aom_comp_mask_pred_neon(masked_pred, second_pred, w, h, second_pass, w, \
+                            msk, msk_stride, invert_mask); \
+    return aom_variance##w##x##h(masked_pred, w, ref, ref_stride, sse); \
+  }
+
+// Masked sub-pixel variance with per-offset fast paths. For each axis: offset
+// 0 skips that filter pass, offset 4 calls var_filter_block2d_avg, and any
+// other offset calls the bilinear helper. The filtered block is then combined
+// with second_pred by aom_comp_mask_pred_neon before the variance is taken.
+#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+      const uint8_t *msk, int msk_stride, int invert_mask, \
+      unsigned int *sse) { \
+    if (xoffset == 0) { \
+      uint8_t tmp0[w * h]; \
+      if (yoffset == 0) { \
+        /* No filtering at all: combine src directly. */ \
+        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, \
+                                msk, msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \
+                                    yoffset); \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } \
+    } else if (xoffset == 4) { \
+      /* x pass output; (h + padding) rows when a y pass follows. */ \
+      uint8_t tmp0[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp1[w * h]; \
+        uint8_t tmp2[w * h]; \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+      } else { \
+        uint8_t tmp1[w * h]; \
+        uint8_t tmp2[w * h]; \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+      } \
+    } else { \
+      if (yoffset == 0) { \
+        uint8_t tmp0[w * h]; \
+        uint8_t tmp1[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        uint8_t tmp0[w * (h + padding)]; \
+        uint8_t tmp1[w * h]; \
+        uint8_t tmp2[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset); \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+      } else { \
+        uint8_t tmp0[w * (h + padding)]; \
+        /* The y pass writes h rows only, so w * h is enough (same size */ \
+        /* as in the yoffset == 4 branch above). */ \
+        uint8_t tmp1[w * h]; \
+        uint8_t tmp2[w * h]; \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset); \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+                                msk_stride, invert_mask); \
+        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+      } \
+    } \
+  }
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+// Blocks wider than 4 pass 1 (instead of 2) as the padding argument.
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+#endif // !CONFIG_REALTIME_ONLY
+// The generator macros are not needed past this point.
+#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..a195c40d19
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// Writes diff[r][c] = src[r][c] - pred[r][c] as int16 for a rows x cols block.
+// The code path is chosen from cols: > 16 handles 32 pixels per step, > 8
+// handles 16, > 4 handles 8, and anything narrower is done one pixel at a time.
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
+                             ptrdiff_t diff_stride, const uint8_t *src,
+                             ptrdiff_t src_stride, const uint8_t *pred,
+                             ptrdiff_t pred_stride) {
+  int rows_left = rows;
+
+  if (cols > 16) {
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t s_a = vld1q_u8(src + x);
+        const uint8x16_t s_b = vld1q_u8(src + x + 16);
+        const uint8x16_t p_a = vld1q_u8(pred + x);
+        const uint8x16_t p_b = vld1q_u8(pred + x + 16);
+        // Widening u8 subtract; the u16 result is reinterpreted as s16 on
+        // store.
+        const uint16x8_t d_a_lo = vsubl_u8(vget_low_u8(s_a), vget_low_u8(p_a));
+        const uint16x8_t d_a_hi =
+            vsubl_u8(vget_high_u8(s_a), vget_high_u8(p_a));
+        const uint16x8_t d_b_lo = vsubl_u8(vget_low_u8(s_b), vget_low_u8(p_b));
+        const uint16x8_t d_b_hi =
+            vsubl_u8(vget_high_u8(s_b), vget_high_u8(p_b));
+        vst1q_s16(diff + x, vreinterpretq_s16_u16(d_a_lo));
+        vst1q_s16(diff + x + 8, vreinterpretq_s16_u16(d_a_hi));
+        vst1q_s16(diff + x + 16, vreinterpretq_s16_u16(d_b_lo));
+        vst1q_s16(diff + x + 24, vreinterpretq_s16_u16(d_b_hi));
+        x += 32;
+      } while (x < cols);
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  } else if (cols > 8) {
+    // One 16-pixel vector per row.
+    do {
+      const uint8x16_t s = vld1q_u8(src);
+      const uint8x16_t p = vld1q_u8(pred);
+      const uint16x8_t d_lo = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
+      const uint16x8_t d_hi = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
+      vst1q_s16(diff, vreinterpretq_s16_u16(d_lo));
+      vst1q_s16(diff + 8, vreinterpretq_s16_u16(d_hi));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  } else if (cols > 4) {
+    // One 8-pixel vector per row.
+    do {
+      const uint8x8_t s = vld1_u8(src);
+      const uint8x8_t p = vld1_u8(pred);
+      vst1q_s16(diff, vreinterpretq_s16_u16(vsubl_u8(s, p)));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  } else {
+    // Narrow blocks: plain scalar loop.
+    do {
+      int x = 0;
+      do {
+        diff[x] = src[x] - pred[x];
+      } while (++x < cols);
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth variant: src8 / pred8 are converted with CONVERT_TO_SHORTPTR
+// and subtracted as 16-bit samples. The paths for cols <= 8 consume two rows
+// per iteration and stop only when the row counter reaches exactly zero.
+void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff,
+                                    ptrdiff_t diff_stride, const uint8_t *src8,
+                                    ptrdiff_t src_stride, const uint8_t *pred8,
+                                    ptrdiff_t pred_stride) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int rows_left = rows;
+
+  if (cols > 16) {
+    // 16 samples per inner step, one row at a time.
+    do {
+      int x = 0;
+      do {
+        const uint16x8_t s_lo = vld1q_u16(src + x);
+        const uint16x8_t p_lo = vld1q_u16(pred + x);
+        const uint16x8_t d_lo = vsubq_u16(s_lo, p_lo);
+        const uint16x8_t s_hi = vld1q_u16(src + x + 8);
+        const uint16x8_t p_hi = vld1q_u16(pred + x + 8);
+        const uint16x8_t d_hi = vsubq_u16(s_hi, p_hi);
+        vst1q_s16(diff + x, vreinterpretq_s16_u16(d_lo));
+        vst1q_s16(diff + x + 8, vreinterpretq_s16_u16(d_hi));
+        x += 16;
+      } while (x < cols);
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  } else if (cols > 8) {
+    // Two 8-sample vectors per row.
+    do {
+      const uint16x8_t s_lo = vld1q_u16(src);
+      const uint16x8_t p_lo = vld1q_u16(pred);
+      const uint16x8_t d_lo = vsubq_u16(s_lo, p_lo);
+      const uint16x8_t s_hi = vld1q_u16(src + 8);
+      const uint16x8_t p_hi = vld1q_u16(pred + 8);
+      const uint16x8_t d_hi = vsubq_u16(s_hi, p_hi);
+      vst1q_s16(diff, vreinterpretq_s16_u16(d_lo));
+      vst1q_s16(diff + 8, vreinterpretq_s16_u16(d_hi));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--rows_left != 0);
+  } else if (cols > 4) {
+    // One 8-sample vector per row, two rows per iteration.
+    do {
+      const uint16x8_t s_row0 = vld1q_u16(src);
+      const uint16x8_t s_row1 = vld1q_u16(src + src_stride);
+      const uint16x8_t p_row0 = vld1q_u16(pred);
+      const uint16x8_t p_row1 = vld1q_u16(pred + pred_stride);
+      const uint16x8_t d_row0 = vsubq_u16(s_row0, p_row0);
+      const uint16x8_t d_row1 = vsubq_u16(s_row1, p_row1);
+      vst1q_s16(diff, vreinterpretq_s16_u16(d_row0));
+      vst1q_s16(diff + diff_stride, vreinterpretq_s16_u16(d_row1));
+      diff += diff_stride << 1;
+      pred += pred_stride << 1;
+      src += src_stride << 1;
+      rows_left -= 2;
+    } while (rows_left != 0);
+  } else {
+    // One 4-sample vector per row, two rows per iteration.
+    do {
+      const uint16x4_t s_row0 = vld1_u16(src);
+      const uint16x4_t s_row1 = vld1_u16(src + src_stride);
+      const uint16x4_t p_row0 = vld1_u16(pred);
+      const uint16x4_t p_row1 = vld1_u16(pred + pred_stride);
+      const uint16x4_t d_row0 = vsub_u16(s_row0, p_row0);
+      const uint16x4_t d_row1 = vsub_u16(s_row1, p_row1);
+      vst1_s16(diff, vreinterpret_s16_u16(d_row0));
+      vst1_s16(diff + diff_stride, vreinterpret_s16_u16(d_row1));
+      diff += diff_stride << 1;
+      pred += pred_stride << 1;
+      src += src_stride << 1;
+      rows_left -= 2;
+    } while (rows_left != 0);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/arm/sum_neon.h b/third_party/aom/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..30a108e70a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// Sum of the eight u8 lanes of a.
+static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  // Pairwise widen u8 -> u16 -> u32, then add the two remaining lanes.
+  const uint32x2_t partial = vpaddl_u16(vpaddl_u8(a));
+  return vget_lane_u32(partial, 0) + vget_lane_u32(partial, 1);
+#endif
+}
+
+// Sum of the eight s16 lanes of a, returned as int.
+static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_s16(a);
+#else
+  // Pairwise widen down to two 64-bit partial sums, then add their 32-bit
+  // views and read lane 0.
+  const int64x2_t partial = vpaddlq_s32(vpaddlq_s16(a));
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(partial));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(partial));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);
+#endif
+}
+
+// Sum of the four s32 lanes of a, returned as a 32-bit int.
+static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddvq_s32(a);
+#else
+  const int64x2_t partial = vpaddlq_s32(a);
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(partial));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(partial));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);
+#endif
+}
+
+// Sum of the two s64 lanes of a.
+static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddvq_s64(a);
+#else
+  const int64_t lane0 = vgetq_lane_s64(a, 0);
+  const int64_t lane1 = vgetq_lane_s64(a, 1);
+  return lane0 + lane1;
+#endif
+}
+
+// Sum of the two u64 lanes of a.
+static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddvq_u64(a);
+#else
+  const uint64_t lane0 = vgetq_lane_u64(a, 0);
+  const uint64_t lane1 = vgetq_lane_u64(a, 1);
+  return lane0 + lane1;
+#endif
+}
+
+// Sum of the four u32 lanes of a, accumulated in 64 bits.
+static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_u32(a);
+#else
+  const uint64x2_t partial = vpaddlq_u32(a);
+  const uint64_t lane0 = vgetq_lane_u64(partial, 0);
+  const uint64_t lane1 = vgetq_lane_u64(partial, 1);
+  return lane0 + lane1;
+#endif
+}
+
+// Sum of the four s32 lanes of a, accumulated in 64 bits.
+static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_s32(a);
+#else
+  const int64x2_t partial = vpaddlq_s32(a);
+  const int64_t lane0 = vgetq_lane_s64(partial, 0);
+  const int64_t lane1 = vgetq_lane_s64(partial, 1);
+  return lane0 + lane1;
+#endif
+}
+
+// Sum of the four u32 lanes of a, returned as a 32-bit value.
+static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t partial = vpaddlq_u32(a);
+  const uint32x2_t lo = vreinterpret_u32_u64(vget_low_u64(partial));
+  const uint32x2_t hi = vreinterpret_u32_u64(vget_high_u64(partial));
+  return vget_lane_u32(vadd_u32(lo, hi), 0);
+#endif
+}
+
+// Reduces four u32x4 vectors at once: lane i of the result is the sum of the
+// lanes of sum[i].
+static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+  const uint32x4_t pair01 = vpaddq_u32(sum[0], sum[1]);
+  const uint32x4_t pair23 = vpaddq_u32(sum[2], sum[3]);
+  return vpaddq_u32(pair01, pair23);
+#else
+  const uint32_t total0 = horizontal_add_u32x4(sum[0]);
+  const uint32_t total1 = horizontal_add_u32x4(sum[1]);
+  const uint32_t total2 = horizontal_add_u32x4(sum[2]);
+  const uint32_t total3 = horizontal_add_u32x4(sum[3]);
+  uint32x4_t out = vdupq_n_u32(0);
+  out = vsetq_lane_u32(total0, out, 0);
+  out = vsetq_lane_u32(total1, out, 1);
+  out = vsetq_lane_u32(total2, out, 2);
+  out = vsetq_lane_u32(total3, out, 3);
+  return out;
+#endif
+}
+
+// Reduces four s32x4 vectors at once: lane i of the result is the sum of the
+// lanes of sum[i].
+static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+  const int32x4_t pair01 = vpaddq_s32(sum[0], sum[1]);
+  const int32x4_t pair23 = vpaddq_s32(sum[2], sum[3]);
+  return vpaddq_s32(pair01, pair23);
+#else
+  const int total0 = horizontal_add_s32x4(sum[0]);
+  const int total1 = horizontal_add_s32x4(sum[1]);
+  const int total2 = horizontal_add_s32x4(sum[2]);
+  const int total3 = horizontal_add_s32x4(sum[3]);
+  int32x4_t out = vdupq_n_s32(0);
+  out = vsetq_lane_s32(total0, out, 0);
+  out = vsetq_lane_s32(total1, out, 1);
+  out = vsetq_lane_s32(total2, out, 2);
+  out = vsetq_lane_s32(total3, out, 3);
+  return out;
+#endif
+}
+
+// Sum of all sixteen u16 lanes of vec_lo and vec_hi, accumulated in 32 bits.
+static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+                                                 const uint16x8_t vec_hi) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+  // Widen each input to u32 by adding its low and high halves.
+  const uint32x4_t wide_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t wide_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint64x2_t partial = vpaddlq_u32(vaddq_u32(wide_lo, wide_hi));
+  const uint32x2_t lo = vreinterpret_u32_u64(vget_low_u64(partial));
+  const uint32x2_t hi = vreinterpret_u32_u64(vget_high_u64(partial));
+  return vget_lane_u32(vadd_u32(lo, hi), 0);
+#endif
+}
+
+// For i in 0..3, lane i of the result is the 32-bit sum of all lanes of
+// sum_lo[i] and sum_hi[i].
+static INLINE uint32x4_t horizontal_long_add_4d_u16x8(
+    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+  // Pairwise-widen each lo vector to u32, then accumulate the matching hi
+  // vector on top of it.
+  const uint32x4_t acc0 = vpadalq_u16(vpaddlq_u16(sum_lo[0]), sum_hi[0]);
+  const uint32x4_t acc1 = vpadalq_u16(vpaddlq_u16(sum_lo[1]), sum_hi[1]);
+  const uint32x4_t acc2 = vpadalq_u16(vpaddlq_u16(sum_lo[2]), sum_hi[2]);
+  const uint32x4_t acc3 = vpadalq_u16(vpaddlq_u16(sum_lo[3]), sum_hi[3]);
+#if AOM_ARCH_AARCH64
+  return vpaddq_u32(vpaddq_u32(acc0, acc1), vpaddq_u32(acc2, acc3));
+#else
+  const uint32x2_t half0 = vadd_u32(vget_low_u32(acc0), vget_high_u32(acc0));
+  const uint32x2_t half1 = vadd_u32(vget_low_u32(acc1), vget_high_u32(acc1));
+  const uint32x2_t half2 = vadd_u32(vget_low_u32(acc2), vget_high_u32(acc2));
+  const uint32x2_t half3 = vadd_u32(vget_low_u32(acc3), vget_high_u32(acc3));
+  return vcombine_u32(vpadd_u32(half0, half1), vpadd_u32(half2, half3));
+#endif
+}
+
+// Sum of the eight u16 lanes of a, accumulated in 32 bits.
+static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_u16(a);
+#else
+  const uint64x2_t partial = vpaddlq_u32(vpaddlq_u16(a));
+  const uint32x2_t lo = vreinterpret_u32_u64(vget_low_u64(partial));
+  const uint32x2_t hi = vreinterpret_u32_u64(vget_high_u64(partial));
+  return vget_lane_u32(vadd_u32(lo, hi), 0);
+#endif
+}
+
+// Lane i of the result is the sum of the lanes of sum[i]. All but the final
+// reduction step are done with 16-bit adds; only the last step widens to u32.
+static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+  const uint16x8_t pair01 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t pair23 = vpaddq_u16(sum[2], sum[3]);
+  return vpaddlq_u16(vpaddq_u16(pair01, pair23));
+#else
+  const uint16x4_t half0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t half1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t half2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t half3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x8_t quarters =
+      vcombine_u16(vpadd_u16(half0, half1), vpadd_u16(half2, half3));
+  return vpaddlq_u16(quarters);
+#endif
+}
+
+// Lane i of the result is the sum of the lanes of sum[i]. All but the final
+// reduction step are done with 16-bit adds; only the last step widens to s32.
+static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+  const int16x8_t pair01 = vpaddq_s16(sum[0], sum[1]);
+  const int16x8_t pair23 = vpaddq_s16(sum[2], sum[3]);
+  return vpaddlq_s16(vpaddq_s16(pair01, pair23));
+#else
+  const int16x4_t half0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0]));
+  const int16x4_t half1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1]));
+  const int16x4_t half2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2]));
+  const int16x4_t half3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3]));
+  const int16x8_t quarters =
+      vcombine_s16(vpadd_s16(half0, half1), vpadd_s16(half2, half3));
+  return vpaddlq_s16(quarters);
+#endif
+}
+
+// Sum of the two u32 lanes of a, returned as a 32-bit value.
+static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddv_u32(a);
+#else
+  // Widening pairwise add, then read lane 0 of the 32-bit view of the result.
+  const uint64x1_t total = vpaddl_u32(a);
+  const uint32x2_t total_as_u32 = vreinterpret_u32_u64(total);
+  return vget_lane_u32(total_as_u32, 0);
+#endif
+}
+
+// Sum of the two u32 lanes of a, accumulated in 64 bits.
+static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlv_u32(a);
+#else
+  const uint64x1_t total = vpaddl_u32(a);
+  return vget_lane_u64(total, 0);
+#endif
+}
+
+// Sum of the four u16 lanes of a, accumulated in 32 bits.
+static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+#if AOM_ARCH_AARCH64
+  return vaddlv_u16(a);
+#else
+  // Pairwise widen u16 -> u32 -> u64, then read lane 0 of the 32-bit view.
+  const uint64x1_t total = vpaddl_u32(vpaddl_u16(a));
+  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+#endif
+}
+
+// Pairwise add across two vectors: { a0+a1, a2+a3, b0+b1, b2+b3 }.
+static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+#if AOM_ARCH_AARCH64
+  return vpaddq_s32(a, b);
+#else
+  const int32x2_t a_pairs = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+  const int32x2_t b_pairs = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+  return vcombine_s32(a_pairs, b_pairs);
+#endif
+}
+
+// Pairwise add within one vector: { a0+a1, a2+a3 }.
+static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+#if AOM_ARCH_AARCH64
+  const int32x4_t pairs = vpaddq_s32(a, a);
+  return vget_low_s32(pairs);
+#else
+  const int32x2_t lo = vget_low_s32(a);
+  const int32x2_t hi = vget_high_s32(a);
+  return vpadd_s32(lo, hi);
+#endif
+}
+
+// 64-bit sum of every lane of a[0] and a[1].
+static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+  const uint64_t first = horizontal_long_add_u32x4(a[0]);
+  const uint64_t second = horizontal_long_add_u32x4(a[1]);
+  return first + second;
+}
+
+// 64-bit sum of every lane of a[0..3].
+static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+  // Widen the first vector, then pairwise-accumulate the remaining three.
+  uint64x2_t acc = vpaddlq_u32(a[0]);
+  for (int i = 1; i < 4; ++i) {
+    acc = vpadalq_u32(acc, a[i]);
+  }
+  return horizontal_add_u64x2(acc);
+}
+
+// 64-bit sum of every lane of a[0..7], using two accumulators (even and odd
+// indices) that are added together at the end.
+static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+  uint64x2_t acc_even = vpaddlq_u32(a[0]);
+  uint64x2_t acc_odd = vpaddlq_u32(a[1]);
+  for (int i = 2; i < 8; i += 2) {
+    acc_even = vpadalq_u32(acc_even, a[i]);
+    acc_odd = vpadalq_u32(acc_odd, a[i + 1]);
+  }
+  return horizontal_add_u64x2(vaddq_u64(acc_even, acc_odd));
+}
+
+// 64-bit sum of every lane of a[0..15], using two accumulators (even and odd
+// indices) that are added together at the end.
+static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+  uint64x2_t acc_even = vpaddlq_u32(a[0]);
+  uint64x2_t acc_odd = vpaddlq_u32(a[1]);
+  for (int i = 2; i < 16; i += 2) {
+    acc_even = vpadalq_u32(acc_even, a[i]);
+    acc_odd = vpadalq_u32(acc_odd, a[i + 1]);
+  }
+  return horizontal_add_u64x2(vaddq_u64(acc_even, acc_odd));
+}
+
+#endif // AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon.c b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..424b2b4445
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Sum of squares of a 4x4 block of int16_t samples whose rows are `stride`
+// elements apart.
+static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+                                                       int stride) {
+  const int16x4_t row0 = vld1_s16(src + 0 * stride);
+  const int16x4_t row1 = vld1_s16(src + 1 * stride);
+  const int16x4_t row2 = vld1_s16(src + 2 * stride);
+  const int16x4_t row3 = vld1_s16(src + 3 * stride);
+
+  // Widening multiply-accumulate: each 32-bit lane holds one column's total.
+  int32x4_t acc = vmull_s16(row0, row0);
+  acc = vmlal_s16(acc, row1, row1);
+  acc = vmlal_s16(acc, row2, row2);
+  acc = vmlal_s16(acc, row3, row3);
+
+  const uint32x4_t acc_u32 = vreinterpretq_u32_s32(acc);
+  return horizontal_long_add_u32x4(acc_u32);
+}
+
+// Sum of squares of a 4-wide block of int16_t samples, consumed four rows
+// per iteration. The loop only terminates when the row counter hits exactly
+// zero, so `height` has to be a positive multiple of 4.
+static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+                                                       int stride, int height) {
+  // Rows 0-1 of each group go to acc_top, rows 2-3 to acc_bottom.
+  int32x4_t acc_top = vdupq_n_s32(0);
+  int32x4_t acc_bottom = vdupq_n_s32(0);
+
+  int rows_left = height;
+  do {
+    const int16x4_t row0 = vld1_s16(src + 0 * stride);
+    const int16x4_t row1 = vld1_s16(src + 1 * stride);
+    const int16x4_t row2 = vld1_s16(src + 2 * stride);
+    const int16x4_t row3 = vld1_s16(src + 3 * stride);
+
+    acc_top = vmlal_s16(acc_top, row0, row0);
+    acc_top = vmlal_s16(acc_top, row1, row1);
+    acc_bottom = vmlal_s16(acc_bottom, row2, row2);
+    acc_bottom = vmlal_s16(acc_bottom, row3, row3);
+
+    src += 4 * stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+
+  const int32x4_t total = vaddq_s32(acc_top, acc_bottom);
+  return horizontal_long_add_u32x4(vreinterpretq_u32_s32(total));
+}
+
+// Sum of squares over a width x height block of int16_t samples, consumed in
+// tiles of 4 rows by 8 columns. Both loops are do-while loops, so at least
+// one tile is always read, and the row loop only stops when the counter hits
+// exactly zero. The public entry point aom_sum_squares_2d_i16_neon() only
+// routes here when width is a multiple of 8 and height a multiple of 4.
+static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height) {
+ // Running total, kept in 64-bit lanes.
+ uint64x2_t sum_squares = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ // 32-bit partial sums for the current strip of 4 rows: [0] collects the
+ // low four columns of every 8-wide load, [1] the high four.
+ int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3));
+ w += 8;
+ } while (w < width);
+
+ // Fold this strip's 32-bit partial sums into the 64-bit running total once
+ // per strip of 4 rows.
+ sum_squares = vpadalq_u32(
+ sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ return horizontal_add_u64x2(sum_squares);
+}
+
+// Sum of squares of a width x height block of int16_t samples. Picks the
+// NEON kernel that matches the block shape and falls back to the C
+// implementation for every other shape.
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+                                     int height) {
+  const int height_is_mult_of_4 = (height & 3) == 0;
+
+  // A 4-element row fills only half a SIMD register, so it needs dedicated
+  // kernels. Over 75% of all calls are with size == 4, which makes this the
+  // common case as well.
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_neon(src, stride);
+  }
+  if (LIKELY(width == 4 && height_is_mult_of_4)) {
+    return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
+  }
+  if (LIKELY((width & 7) == 0 && height_is_mult_of_4)) {
+    // Generic case: width is a multiple of 8, height a multiple of 4.
+    return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
+  }
+  return aom_sum_squares_2d_i16_c(src, stride, width, height);
+}
+
+// For a 4x4 block of int16_t samples: returns the sum of squares and adds the
+// plain sum of the samples to *sum (which is accumulated into, not reset).
+static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
+                                                   int stride, int *sum) {
+  const int16x4_t row0 = vld1_s16(src + 0 * stride);
+  const int16x4_t row1 = vld1_s16(src + 1 * stride);
+  const int16x4_t row2 = vld1_s16(src + 2 * stride);
+  const int16x4_t row3 = vld1_s16(src + 3 * stride);
+
+  // Squares: widening multiply-accumulate, one 32-bit lane per column.
+  int32x4_t sq_acc = vmull_s16(row0, row0);
+  sq_acc = vmlal_s16(sq_acc, row1, row1);
+  sq_acc = vmlal_s16(sq_acc, row2, row2);
+  sq_acc = vmlal_s16(sq_acc, row3, row3);
+
+  // Plain sum: widen to 32 bits while adding row pairs.
+  const int32x4_t rows01 = vaddl_s16(row0, row1);
+  const int32x4_t rows23 = vaddl_s16(row2, row3);
+  const int32x4_t all_rows = vaddq_s32(rows01, rows23);
+  *sum += horizontal_add_s32x4(all_rows);
+
+  return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sq_acc));
+}
+
+// For a 4-wide block of int16_t samples: returns the sum of squares and adds
+// the plain sum of the samples to *sum (accumulated into, not reset). Four
+// rows are consumed per iteration and the loop stops only when the row counter
+// reaches exactly zero, so `height` must be a positive multiple of 4.
+static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ // Two independent accumulators each for squares and for sums: index 0 takes
+ // rows 0-1 of every group of four rows, index 1 takes rows 2-3.
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
+
+ int h = height;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sse[0] = vmlal_s16(sse[0], s0, s0);
+ sse[0] = vmlal_s16(sse[0], s1, s1);
+ sse[1] = vmlal_s16(sse[1], s2, s2);
+ sse[1] = vmlal_s16(sse[1], s3, s3);
+
+ // Pairwise add adjacent 16-bit samples and accumulate in 32-bit lanes.
+ sum_acc[0] = vpadal_s16(sum_acc[0], s0);
+ sum_acc[0] = vpadal_s16(sum_acc[0], s1);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s2);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s3);
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1])));
+}
+
+// For a width x height block of int16_t samples: returns the sum of squares
+// and adds the plain sum of the samples to *sum (accumulated into, not reset).
+// The block is consumed in tiles of 4 rows by 8 columns; both loops are
+// do-while loops and the row loop stops only when its counter reaches exactly
+// zero. The public entry point aom_sum_sse_2d_i16_neon() only routes here when
+// width is a multiple of 8 and height a multiple of 4.
+static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ uint64x2_t sse = vdupq_n_u64(0);
+ // Sample sum for the whole block, kept in 32-bit lanes until the end.
+ int32x4_t sum_acc = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ // 32-bit partial sums of squares for the current strip of 4 rows: [0] takes
+ // the low four columns of each 8-wide load, [1] the high four.
+ int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3));
+
+ sum_acc = vpadalq_s16(sum_acc, s0);
+ sum_acc = vpadalq_s16(sum_acc, s1);
+ sum_acc = vpadalq_s16(sum_acc, s2);
+ sum_acc = vpadalq_s16(sum_acc, s3);
+
+ w += 8;
+ } while (w < width);
+
+ // Fold this strip's 32-bit squares into the 64-bit running total once per
+ // strip of 4 rows.
+ sse = vpadalq_u32(sse,
+ vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(sum_acc);
+ return horizontal_add_u64x2(sse);
+}
+
+// Returns the sum of squares of a width x height block of int16_t samples and
+// adds the plain sum of the samples to *sum. Picks the NEON kernel matching
+// the block shape; every other shape is handled by the C implementation.
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width,
+                                 int height, int *sum) {
+  const int height_is_mult_of_4 = (height & 3) == 0;
+
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_neon(src, stride, sum);
+  }
+  if (LIKELY(width == 4 && height_is_mult_of_4)) {
+    // width = 4, height is a multiple of 4.
+    return aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum);
+  }
+  if (LIKELY((width & 7) == 0 && height_is_mult_of_4)) {
+    // Generic case - width is multiple of 8, height is multiple of 4.
+    return aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum);
+  }
+  return aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+}
+
+// Sum of squares of `n` contiguous int16_t samples, four per iteration. The
+// first four samples are always read (do-while); whatever is left over after
+// the last full group of four is handed to the C implementation.
+static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
+                                                    uint32_t n) {
+  uint64x2_t acc = vdupq_n_u64(0);
+
+  int remaining = n;
+  do {
+    const int16x4_t samples = vld1_s16(src);
+    const uint32x4_t squares =
+        vreinterpretq_u32_s32(vmull_s16(samples, samples));
+
+    // Widen to 64 bits straight away so the running total cannot wrap.
+    acc = vpadalq_u32(acc, squares);
+
+    src += 4;
+    remaining -= 4;
+  } while (remaining >= 4);
+
+  uint64_t total = horizontal_add_u64x2(acc);
+  if (remaining > 0) {
+    total += aom_sum_squares_i16_c(src, remaining);
+  }
+  return total;
+}
+
+// Sum of squares of `n` contiguous int16_t samples, eight per iteration. The
+// first eight samples are always read (do-while); whatever is left over after
+// the last full group of eight is handed to the C implementation.
+static INLINE uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src,
+                                                    uint32_t n) {
+  // Separate 64-bit accumulators for the low and high halves of each load.
+  uint64x2_t acc_lo = vdupq_n_u64(0);
+  uint64x2_t acc_hi = vdupq_n_u64(0);
+
+  int remaining = n;
+  do {
+    const int16x8_t samples = vld1q_s16(src);
+    const int16x4_t lo = vget_low_s16(samples);
+    const int16x4_t hi = vget_high_s16(samples);
+
+    const uint32x4_t squares_lo = vreinterpretq_u32_s32(vmull_s16(lo, lo));
+    const uint32x4_t squares_hi = vreinterpretq_u32_s32(vmull_s16(hi, hi));
+
+    acc_lo = vpadalq_u32(acc_lo, squares_lo);
+    acc_hi = vpadalq_u32(acc_hi, squares_hi);
+
+    src += 8;
+    remaining -= 8;
+  } while (remaining >= 8);
+
+  uint64_t total = horizontal_add_u64x2(vaddq_u64(acc_lo, acc_hi));
+  if (remaining > 0) {
+    total += aom_sum_squares_i16_c(src, remaining);
+  }
+  return total;
+}
+
+// Sum of squares of `n` contiguous int16_t samples. Uses the 8-wide kernel
+// when n >= 8, the 4-wide kernel when 4 <= n < 8, and plain C below that.
+uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) {
+  uint64_t result;
+
+  // This function seems to be called only for values of N >= 64. See
+  // av1/encoder/compound_type.c.
+  if (LIKELY(n >= 8)) {
+    result = aom_sum_squares_i16_8xn_neon(src, n);
+  } else if (n >= 4) {
+    result = aom_sum_squares_i16_4xn_neon(src, n);
+  } else {
+    result = aom_sum_squares_i16_c(src, n);
+  }
+
+  return result;
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples whose
+// width is in [4, 8) and whose height is a positive even number (this is what
+// the aom_var_2d_u8_neon() dispatcher guarantees). Two rows are processed per
+// iteration.
+//
+// Each load_unaligned_u8() call covers four columns of the current row and the
+// same four columns of the next row (NOTE(review): behaviour of the helper in
+// mem_neon.h, confirm), so the column loop must advance by 4, not 8, and
+// leftover columns must be picked up from BOTH rows. Stepping by 8 silently
+// dropped columns 4..width-1 for widths 5, 6 and 7.
+static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
+                                              int width, int height) {
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x2_t sum_u32 = vdup_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit
+  // element before we need to accumulate to 32-bit elements. Since we're
+  // accumulating in uint16x4_t vectors, this means we can accumulate up to 4
+  // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4
+  // * 256) / width.
+  int h_limit = (4 * 256) / width;
+  int h_tmp = height > h_limit ? h_limit : height;
+
+  int h = 0;
+  do {
+    uint16x4_t sum_u16 = vdup_n_u16(0);
+    do {
+      uint8_t *src_ptr = src;
+      int w = width;
+      do {
+        // Columns [0, 4) of rows h and h + 1 packed into one 8-byte vector.
+        uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+        sum_u16 = vpadal_u8(sum_u16, s0);
+
+        uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+        sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+        src_ptr += 4;
+        w -= 4;
+      } while (w >= 4);
+
+      // Process the remaining columns of both rows using C.
+      while (w > 0) {
+        const int idx = width - w;
+        const uint8_t v0 = src[idx];
+        const uint8_t v1 = src[src_stride + idx];
+        sum += v0 + v1;
+        sse += v0 * v0 + v1 * v1;
+        w--;
+      }
+
+      src += 2 * src_stride;
+      h += 2;
+    } while (h < h_tmp && h < height);
+
+    // Flush the 16-bit partial sums before they can overflow.
+    sum_u32 = vpadal_u16(sum_u32, sum_u16);
+    h_tmp += h_limit;
+  } while (h < height);
+
+  sum += horizontal_long_add_u32x2(sum_u32);
+  sse += horizontal_long_add_u32x4(sse_u32);
+
+  return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples, one row
+// per iteration, 8 columns per vector step. Columns left over after the last
+// full group of 8 are handled in scalar code. The aom_var_2d_u8_neon()
+// dispatcher routes here for 8 <= width < 16.
+static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit
+ // element before we need to accumulate to 32-bit elements. Since we're
+ // accumulating in uint16x4_t vectors, this means we can accumulate up to 4
+ // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4
+ // * 256) / width.
+ int h_limit = (4 * 256) / width;
+ // h_tmp is the row index at which the 16-bit sums must next be flushed.
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ // 16-bit partial sums for the current batch of at most h_limit rows.
+ uint16x4_t sum_u16 = vdup_n_u16(0);
+ do {
+ uint8_t *src_ptr = src;
+ int w = width;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u16 = vpadal_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+ sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ // Widen the batch's 16-bit sums to 32 bits and start the next batch.
+ sum_u32 = vpadal_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples, one row
+// per iteration, 16 columns per vector step. Columns left over after the last
+// full group of 16 are handled in scalar code. The aom_var_2d_u8_neon()
+// dispatcher routes here for width >= 16.
+static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ // Squares are accumulated in 32-bit lanes for the whole block: [0] takes the
+ // low 8 bytes of every load, [1] the high 8 bytes.
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit
+ // element before we need to accumulate to 32-bit elements. Since we're
+ // accumulating in uint16x8_t vectors, this means we can accumulate up to 8
+ // rows of 256 elements. Therefore the limit can be computed as: h_limit = (8
+ // * 256) / width.
+ int h_limit = (8 * 256) / width;
+ // h_tmp is the row index at which the 16-bit sums must next be flushed.
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ // 16-bit partial sums for the current batch of at most h_limit rows.
+ uint16x8_t sum_u16 = vdupq_n_u16(0);
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u16 = vpadalq_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(s0));
+ uint16x8_t sse_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(s0));
+
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse_u16_lo);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse_u16_hi);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ // Widen the batch's 16-bit sums to 32 bits and start the next batch.
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+
+ return sse - sum * sum / (width * height);
+}
+
+// Computes sse - sum^2 / (width * height) for a block of 8-bit samples,
+// picking the widest NEON kernel the block width allows. The 4-wide kernel
+// consumes two rows at a time and is therefore only used for even heights;
+// everything else goes to the C implementation.
+uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
+                            int height) {
+  uint64_t result;
+
+  if (width >= 16) {
+    result = aom_var_2d_u8_16xh_neon(src, src_stride, width, height);
+  } else if (width >= 8) {
+    result = aom_var_2d_u8_8xh_neon(src, src_stride, width, height);
+  } else if (width >= 4 && height % 2 == 0) {
+    result = aom_var_2d_u8_4xh_neon(src, src_stride, width, height);
+  } else {
+    result = aom_var_2d_u8_c(src, src_stride, width, height);
+  }
+
+  return result;
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 16-bit samples, one
+// row per iteration, 4 columns per vector step. `src` is converted with
+// CONVERT_TO_SHORTPTR() and `src_stride` is applied to the resulting
+// uint16_t pointer. Columns left over after the last full group of 4 are
+// handled in scalar code. `height` must be positive: the row loop is a
+// do-while that stops only when the counter reaches zero.
+static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
+                                               int width, int height) {
+  uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x2_t sum_u32 = vdup_n_u32(0);
+  uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+  int h = height;
+  do {
+    int w = width;
+    uint16_t *src_ptr = src_u16;
+    do {
+      uint16x4_t s0 = vld1_u16(src_ptr);
+
+      sum_u32 = vpadal_u16(sum_u32, s0);
+
+      uint32x4_t sse_u32 = vmull_u16(s0, s0);
+
+      sse_u64 = vpadalq_u32(sse_u64, sse_u32);
+
+      src_ptr += 4;
+      w -= 4;
+    } while (w >= 4);
+
+    // Process remaining columns in the row using C.
+    while (w > 0) {
+      const int idx = width - w;
+      const uint16_t v = src_u16[idx];
+      sum += v;
+      // Widen before multiplying: a bare v * v is evaluated in signed int and
+      // overflows (undefined behaviour) once v exceeds 46340.
+      sse += (uint64_t)v * v;
+      w--;
+    }
+
+    src_u16 += src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x2(sum_u32);
+  sse += horizontal_add_u64x2(sse_u64);
+
+  return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 16-bit samples, one
+// row per iteration, 8 columns per vector step. `src` is converted with
+// CONVERT_TO_SHORTPTR() and `src_stride` is applied to the resulting
+// uint16_t pointer. Columns left over after the last full group of 8 are
+// handled in scalar code. `height` must be positive: the row loop is a
+// do-while that stops only when the counter reaches zero.
+static INLINE uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride,
+                                               int width, int height) {
+  uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+  // Separate 64-bit accumulators for the low and high halves of each load.
+  uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+  int h = height;
+  do {
+    int w = width;
+    uint16_t *src_ptr = src_u16;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr);
+
+      sum_u32 = vpadalq_u16(sum_u32, s0);
+
+      uint32x4_t sse_u32_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0));
+      uint32x4_t sse_u32_hi = vmull_u16(vget_high_u16(s0), vget_high_u16(s0));
+
+      sse_u64[0] = vpadalq_u32(sse_u64[0], sse_u32_lo);
+      sse_u64[1] = vpadalq_u32(sse_u64[1], sse_u32_hi);
+
+      src_ptr += 8;
+      w -= 8;
+    } while (w >= 8);
+
+    // Process remaining columns in the row using C.
+    while (w > 0) {
+      const int idx = width - w;
+      const uint16_t v = src_u16[idx];
+      sum += v;
+      // Widen before multiplying: a bare v * v is evaluated in signed int and
+      // overflows (undefined behaviour) once v exceeds 46340.
+      sse += (uint64_t)v * v;
+      w--;
+    }
+
+    src_u16 += src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x4(sum_u32);
+  sse += horizontal_add_u64x2(vaddq_u64(sse_u64[0], sse_u64[1]));
+
+  return sse - sum * sum / (width * height);
+}
+
+// Computes sse - sum^2 / (width * height) for a block of 16-bit samples,
+// using the 8-wide NEON kernel when width >= 8, the 4-wide kernel when
+// 4 <= width < 8, and the C implementation for narrower blocks.
+uint64_t aom_var_2d_u16_neon(uint8_t *src, int src_stride, int width,
+                             int height) {
+  uint64_t result;
+
+  if (width >= 8) {
+    result = aom_var_2d_u16_8xh_neon(src, src_stride, width, height);
+  } else if (width >= 4) {
+    result = aom_var_2d_u16_4xh_neon(src, src_stride, width, height);
+  } else {
+    result = aom_var_2d_u16_c(src, src_stride, width, height);
+  }
+
+  return result;
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
new file mode 100644
index 0000000000..44462a693c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples whose
+// width is in [4, 8) and whose height is a positive even number (this is what
+// the aom_var_2d_u8_neon_dotprod() dispatcher guarantees). Two rows are
+// processed per iteration.
+//
+// Each load_unaligned_u8() call covers four columns of the current row and the
+// same four columns of the next row (NOTE(review): behaviour of the helper in
+// mem_neon.h, confirm), so the column loop must advance by 4, not 8, and
+// leftover columns must be picked up from BOTH rows. Stepping by 8 silently
+// dropped columns 4..width-1 for widths 5, 6 and 7.
+static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+                                                      int src_stride, int width,
+                                                      int height) {
+  uint64_t sum = 0;
+  uint64_t sse = 0;
+  uint32x2_t sum_u32 = vdup_n_u32(0);
+  uint32x2_t sse_u32 = vdup_n_u32(0);
+
+  int h = height / 2;
+  do {
+    int w = width;
+    uint8_t *src_ptr = src;
+    do {
+      // Columns [0, 4) of two consecutive rows packed into one 8-byte vector.
+      uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+      // A dot product with all-ones sums the bytes.
+      sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+      sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+      src_ptr += 4;
+      w -= 4;
+    } while (w >= 4);
+
+    // Process the remaining columns of both rows using C.
+    while (w > 0) {
+      const int idx = width - w;
+      const uint8_t v0 = src[idx];
+      const uint8_t v1 = src[src_stride + idx];
+      sum += v0 + v1;
+      sse += v0 * v0 + v1 * v1;
+      w--;
+    }
+
+    src += 2 * src_stride;
+  } while (--h != 0);
+
+  sum += horizontal_long_add_u32x2(sum_u32);
+  sse += horizontal_long_add_u32x2(sse_u32);
+
+  return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples, one row
+// per iteration, 8 columns per vector step. Columns left over after the last
+// full group of 8 are handled in scalar code. The aom_var_2d_u8_neon_dotprod()
+// dispatcher routes here for 8 <= width < 16. `height` must be positive: the
+// row loop is a do-while that stops only when the counter reaches zero.
+static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ // Both totals live in 32-bit lanes for the whole block.
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ // A dot product with all-ones sums the bytes.
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 8-bit samples, one row
+// per iteration, 16 columns per vector step. Columns left over after the last
+// full group of 16 are handled in scalar code. The
+// aom_var_2d_u8_neon_dotprod() dispatcher routes here for width >= 16.
+// `height` must be positive: the row loop is a do-while that stops only when
+// the counter reaches zero.
+static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+ int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ // Both totals live in 32-bit lanes for the whole block.
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ // A dot product with all-ones sums the bytes.
+ sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
+
+ sse_u32 = vdotq_u32(sse_u32, s0, s0);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Computes sse - sum^2 / (width * height) for a block of 8-bit samples,
+// picking the widest dot-product kernel the block width allows. The 4-wide
+// kernel consumes two rows at a time and is therefore only used for even
+// heights; everything else goes to the C implementation.
+uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width,
+                                    int height) {
+  uint64_t result;
+
+  if (width >= 16) {
+    result = aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height);
+  } else if (width >= 8) {
+    result = aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height);
+  } else if (width >= 4 && height % 2 == 0) {
+    result = aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height);
+  } else {
+    result = aom_var_2d_u8_c(src, src_stride, width, height);
+  }
+
+  return result;
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_sve.c b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..724e43859e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Sum of squares of a 4-wide block of int16_t samples, two rows per
+// iteration. The loop stops only when the row counter reaches exactly zero,
+// so `height` has to be a positive even number.
+static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
+                                                      int stride, int height) {
+  int64x2_t acc = vdupq_n_s64(0);
+
+  do {
+    // Pack two 4-sample rows into a single 8-lane vector.
+    const int16x4_t top = vld1_s16(src);
+    const int16x4_t bottom = vld1_s16(src + stride);
+    const int16x8_t rows = vcombine_s16(top, bottom);
+
+    acc = aom_sdotq_s16(acc, rows, rows);
+
+    src += 2 * stride;
+    height -= 2;
+  } while (height != 0);
+
+  return (uint64_t)vaddvq_s64(acc);
+}
+
+// Sum of squares of an 8-wide block of int16_t samples, two rows per
+// iteration. The loop stops only when the row counter reaches exactly zero,
+// so `height` has to be a positive even number.
+static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height) {
+ // One 64-bit accumulator per row of the pair.
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src + 0 * stride);
+ int16x8_t s1 = vld1q_s16(src + 1 * stride);
+
+ // aom_sdotq_s16() comes from dot_sve.h; presumably a 16-bit dot product
+ // accumulated into 64-bit lanes - TODO confirm against that header.
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+// Sum of squares of a width x height block of int16_t samples, one row per
+// iteration and 16 columns per inner step. The column loop stops only when its
+// counter reaches exactly zero, so `width` has to be a positive multiple of 16
+// (the aom_sum_squares_2d_i16_sve() dispatcher checks width % 16 == 0), and
+// `height` has to be positive.
+static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
+ // One 64-bit accumulator per 8-sample half of each 16-column step.
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = width;
+ do {
+ int16x8_t s0 = vld1q_s16(src_ptr);
+ int16x8_t s1 = vld1q_s16(src_ptr + 8);
+
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src += stride;
+ } while (--height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+// Sum of squares of a width x height block of int16_t samples for arbitrary
+// widths, using predicated SVE loads so a partial final vector in each row is
+// masked by the predicate. One row is processed per outer iteration; `height`
+// must be positive because the row loop is a do-while that stops only when the
+// counter reaches zero.
+static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
+ svint64_t sum_squares = svdup_n_s64(0);
+ // Number of 16-bit lanes in one SVE vector on this machine.
+ uint64_t step = svcnth();
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = 0;
+ do {
+ // Lanes are active only while (w + lane) < width.
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svint16_t s0 = svld1_s16(pred, src_ptr);
+
+ sum_squares = svdot_s64(sum_squares, s0, s0);
+
+ src_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares);
+}
+
+// Sum of squares of a width x height block of int16_t samples. Picks the SVE
+// kernel matching the block shape.
+//
+// The width == 4 and width == 8 kernels consume two rows per iteration and
+// only terminate when the row counter reaches exactly zero, so they are used
+// for even heights only. An odd height would otherwise skip past zero and
+// read out of bounds; such blocks take the row-at-a-time predicated kernel
+// instead.
+uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width,
+                                    int height) {
+  const int height_is_even = (height & 1) == 0;
+
+  if (width == 4 && height_is_even) {
+    return aom_sum_squares_2d_i16_4xh_sve(src, stride, height);
+  }
+  if (width == 8 && height_is_even) {
+    return aom_sum_squares_2d_i16_8xh_sve(src, stride, height);
+  }
+  if (width % 16 == 0) {
+    return aom_sum_squares_2d_i16_large_sve(src, stride, width, height);
+  }
+  return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height);
+}
+
+// Sum of squares of `n` contiguous int16_t samples. Non-zero multiples of 64
+// use the vector loop (32 samples per iteration, four independent
+// accumulators); every other length is handled by the C implementation.
+uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) {
+  // This function seems to be called only for values of N >= 64. See
+  // av1/encoder/compound_type.c. Additionally, because N = width x height for
+  // width and height between the standard block sizes, N will also be a
+  // multiple of 64.
+  //
+  // n == 0 has to be excluded explicitly: it satisfies n % 64 == 0, but the
+  // do-while below would still execute once and `n -= 32` would wrap the
+  // unsigned counter, reading far beyond the buffer.
+  if (LIKELY(n != 0 && n % 64 == 0)) {
+    int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                         vdupq_n_s64(0) };
+
+    do {
+      int16x8_t s0 = vld1q_s16(src);
+      int16x8_t s1 = vld1q_s16(src + 8);
+      int16x8_t s2 = vld1q_s16(src + 16);
+      int16x8_t s3 = vld1q_s16(src + 24);
+
+      sum[0] = aom_sdotq_s16(sum[0], s0, s0);
+      sum[1] = aom_sdotq_s16(sum[1], s1, s1);
+      sum[2] = aom_sdotq_s16(sum[2], s2, s2);
+      sum[3] = aom_sdotq_s16(sum[3], s3, s3);
+
+      src += 32;
+      n -= 32;
+    } while (n != 0);
+
+    sum[0] = vaddq_s64(sum[0], sum[1]);
+    sum[2] = vaddq_s64(sum[2], sum[3]);
+    sum[0] = vaddq_s64(sum[0], sum[2]);
+    return vaddvq_s64(sum[0]);
+  }
+  return aom_sum_squares_i16_c(src, n);
+}
+
+// For a 4-wide block of int16_t samples: returns the sum of squares and adds
+// the plain sum of the samples to *sum (accumulated into, not reset). Two rows
+// are consumed per iteration and the loop stops only when the row counter
+// reaches exactly zero, so `height` has to be a positive even number.
+static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int64x2_t sse = vdupq_n_s64(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ // Two 4-sample rows packed into one 8-lane vector.
+ int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));
+
+ sse = aom_sdotq_s16(sse, s, s);
+
+ // Pairwise add adjacent 16-bit samples and accumulate in 32-bit lanes.
+ sum_s32 = vpadalq_s16(sum_s32, s);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(sum_s32);
+ return vaddvq_s64(sse);
+}
+
+// For an 8-wide block of int16_t samples: returns the sum of squares and adds
+// the plain sum of the samples to *sum (accumulated into, not reset). Two rows
+// are consumed per iteration and the loop stops only when the row counter
+// reaches exactly zero, so `height` has to be a positive even number.
+static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ // Index 0 accumulates the first row of each pair, index 1 the second.
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t s1 = vld1q_s16(src + stride);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+// For a width x height block of int16_t samples: returns the sum of squares
+// and adds the plain sum of the samples to *sum (accumulated into, not reset).
+// One row is processed per outer iteration in steps of 16 columns; every step
+// reads a full 16 samples, so `width` is expected to be a multiple of 16 (the
+// aom_sum_sse_2d_i16_sve() dispatcher checks width % 16 == 0). `height` must
+// be positive because the row loop stops only when the counter reaches zero.
+static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ // Index 0 accumulates the first 8 samples of each step, index 1 the next 8.
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int w = 0;
+ do {
+ int16x8_t s0 = vld1q_s16(src + w);
+ int16x8_t s1 = vld1q_s16(src + w + 8);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ w += 16;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+// Returns the sum of squares of a width x height block of int16_t samples and
+// adds the plain sum of the samples to *sum. Picks the SVE kernel matching
+// the block shape; every other shape is handled by the C implementation.
+//
+// The width == 4 and width == 8 kernels consume two rows per iteration and
+// only terminate when the row counter reaches exactly zero, so they are used
+// for even heights only. An odd height would otherwise skip past zero and
+// read out of bounds; such blocks take the C path.
+uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width,
+                                int height, int *sum) {
+  const int height_is_even = (height & 1) == 0;
+  uint64_t sse;
+
+  if (width == 4 && height_is_even) {
+    sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum);
+  } else if (width == 8 && height_is_even) {
+    sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum);
+  } else if (width % 16 == 0) {
+    sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum);
+  } else {
+    sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+  }
+
+  return sse;
+}
+
+// Returns sse - sum^2 / (width * height) for a 4-wide block of 16-bit samples.
+// `src` is converted with CONVERT_TO_SHORTPTR() and `src_stride` is applied to
+// the resulting uint16_t pointer. Two rows are consumed per iteration and the
+// loop stops only when the row counter reaches exactly zero, so `height` has
+// to be a positive even number. `width` is used only in the final division.
+static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ // Two 4-sample rows packed into one 8-lane vector.
+ uint16x8_t s0 =
+ vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride));
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ // aom_udotq_u16() comes from dot_sve.h; presumably a 16-bit dot product
+ // accumulated into 64-bit lanes - TODO confirm against that header.
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_u16 += 2 * src_stride;
+ h -= 2;
+ } while (h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 16-bit samples, one
+// row per iteration and 8 columns per inner step. `src` is converted with
+// CONVERT_TO_SHORTPTR() and `src_stride` is applied to the resulting uint16_t
+// pointer. The column loop stops only when its counter reaches exactly zero,
+// so `width` has to be a positive multiple of 8 (the aom_var_2d_u16_sve()
+// dispatcher calls this for width == 8), and `height` has to be positive.
+static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+
+ // Pairwise add adjacent 16-bit samples and accumulate in 32-bit lanes.
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 16-bit samples, one
+// row per iteration and 16 columns per inner step. `src` is converted with
+// CONVERT_TO_SHORTPTR() and `src_stride` is applied to the resulting uint16_t
+// pointer. The column loop stops only when its counter reaches exactly zero,
+// so `width` has to be a positive multiple of 16 (the aom_var_2d_u16_sve()
+// dispatcher calls this for width == 16), and `height` has to be positive.
+static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ // Index 0 accumulates the first 8 samples of each step, index 1 the next 8.
+ uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ // Merge the paired accumulators before the final horizontal reduction.
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Returns sse - sum^2 / (width * height) for a block of 16-bit samples, one
+// row per iteration and 32 columns per inner step. `src` is converted with
+// CONVERT_TO_SHORTPTR() and `src_stride` is applied to the resulting uint16_t
+// pointer. The column loop stops only when its counter reaches exactly zero,
+// so `width` has to be a positive multiple of 32 (the aom_var_2d_u16_sve()
+// dispatcher checks width % 32 == 0), and `height` has to be positive.
+static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ // One accumulator per 8-sample quarter of each 32-column step.
+ uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t s2 = vld1q_u16(src_ptr + 16);
+ uint16x8_t s3 = vld1q_u16(src_ptr + 24);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+ sum_u32[2] = vpadalq_u16(sum_u32[2], s2);
+ sum_u32[3] = vpadalq_u16(sum_u32[3], s3);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+ sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2);
+ sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3);
+
+ src_ptr += 32;
+ w -= 32;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ // Merge the four accumulators pairwise before the horizontal reduction.
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]);
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+ sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+// Computes sse - sum^2 / (width * height) for a block of 16-bit samples.
+// Widths 4, 8, 16 and multiples of 32 have dedicated SVE kernels; every other
+// shape is forwarded to aom_var_2d_u16_neon().
+//
+// The width == 4 kernel consumes two rows per iteration and only terminates
+// when the row counter reaches exactly zero, so it is used for even heights
+// only. An odd height would otherwise skip past zero and read out of bounds;
+// such blocks fall through to the NEON implementation, which walks one row at
+// a time.
+uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width,
+                            int height) {
+  if (width == 4 && height % 2 == 0) {
+    return aom_var_2d_u16_4xh_sve(src, src_stride, width, height);
+  }
+  if (width == 8) {
+    return aom_var_2d_u16_8xh_sve(src, src_stride, width, height);
+  }
+  if (width == 16) {
+    return aom_var_2d_u16_16xh_sve(src, src_stride, width, height);
+  }
+  if (width % 32 == 0) {
+    return aom_var_2d_u16_large_sve(src, src_stride, width, height);
+  }
+  return aom_var_2d_u16_neon(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/transpose_neon.h b/third_party/aom/aom_dsp/arm/transpose_neon.h
new file mode 100644
index 0000000000..8027018235
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/transpose_neon.h
@@ -0,0 +1,1263 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+static INLINE void transpose_elems_u8_8x8(
+ uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
+ uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
+ uint8x8_t *o7) {
+ // 8x8 byte transpose (*oN receives column N). Swap 8 bit elements, from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to (rows N and N+4 are first packed into one 128-bit vector):
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements (each 64-bit half is then one full output row):
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); // column 0
+ *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); // column 1
+ *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); // column 2
+ *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); // column 3
+ *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); // column 4
+ *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); // column 5
+ *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); // column 6
+ *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); // column 7
+}
+
+// In-place variant of transpose_elems_u8_8x8(): the eight rows are read into
+// locals first and the result is written back through the same pointers.
+static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
+                                                  uint8x8_t *a2, uint8x8_t *a3,
+                                                  uint8x8_t *a4, uint8x8_t *a5,
+                                                  uint8x8_t *a6,
+                                                  uint8x8_t *a7) {
+  const uint8x8_t row0 = *a0;
+  const uint8x8_t row1 = *a1;
+  const uint8x8_t row2 = *a2;
+  const uint8x8_t row3 = *a3;
+  const uint8x8_t row4 = *a4;
+  const uint8x8_t row5 = *a5;
+  const uint8x8_t row6 = *a6;
+  const uint8x8_t row7 = *a7;
+
+  transpose_elems_u8_8x8(row0, row1, row2, row3, row4, row5, row6, row7,
+                         a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Array flavour of transpose_elems_u8_8x8(): reads rows in[0..7] and stores
+// the transposed rows to out[0..7].
+static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
+                                           uint8x8_t *out) {
+  const uint8x8_t row0 = in[0];
+  const uint8x8_t row1 = in[1];
+  const uint8x8_t row2 = in[2];
+  const uint8x8_t row3 = in[3];
+  const uint8x8_t row4 = in[4];
+  const uint8x8_t row5 = in[5];
+  const uint8x8_t row6 = in[6];
+  const uint8x8_t row7 = in[7];
+
+  transpose_elems_u8_8x8(row0, row1, row2, row3, row4, row5, row6, row7,
+                         out, out + 1, out + 2, out + 3, out + 4, out + 5,
+                         out + 6, out + 7);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
+ uint8x16_t *d) { // 16 rows of 8 bytes (x[0..15]) -> 8 rows of 16 (d[0..7])
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]); // val[0]: cols 0-3, val[1]: cols 4-7
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+ uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
+ uint8x8x2_t w3 = vzip_u8(x[6], x[7]);
+
+ uint8x8x2_t w8 = vzip_u8(x[8], x[9]); // same interleave for rows 8-15
+ uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
+ uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
+ uint8x8x2_t w11 = vzip_u8(x[14], x[15]);
+
+ uint16x4x2_t w4 =
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+ uint16x4x2_t w5 =
+ vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+ uint16x4x2_t w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
+ uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
+ vreinterpret_u16_u8(w11.val[0]));
+
+ uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store output rows 0-3: input columns 0-3, rows 0-7 low | rows 8-15 high.
+ d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+
+ w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+ w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+ w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
+ w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
+ vreinterpret_u16_u8(w11.val[1]));
+
+ w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store output rows 4-7: input columns 4-7, same low/high row split.
+ d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
+ uint8x8_t *d) { // 8 rows of 16 bytes (x[0..7]) -> 16 rows of 8 (d[0..15])
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); // val[0]: cols 0-7, val[1]: cols 8-15
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0])); // rows 0-3, cols 0-7
+ uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0])); // rows 4-7, cols 0-7
+ uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1])); // rows 0-3, cols 8-15
+ uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1])); // rows 4-7, cols 8-15
+
+ uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+ vreinterpretq_u32_u16(w5.val[0])); // cols 0-3
+ uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
+ vreinterpretq_u32_u16(w7.val[0])); // cols 8-11
+ uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+ vreinterpretq_u32_u16(w5.val[1])); // cols 4-7
+ uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
+ vreinterpretq_u32_u16(w7.val[1])); // cols 12-15
+
+ d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0])); // d[N] = input column N
+ d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
+ d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
+ d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
+ d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
+ d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
+ d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
+ d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
+ d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
+ d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
+ d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
+ d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
+ d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
+ d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
+ d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
+ d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
+}
+
+// 64-bit transpose of two 128-bit vectors, returned as 16-bit lanes:
+//   val[0] = { low half of a0, low half of a1 }
+//   val[1] = { high half of a0, high half of a1 }
+// AArch64 builds use TRN1/TRN2 on 64-bit lanes; other builds recombine the
+// 64-bit halves by hand.
+static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+  uint16x8x2_t result;
+#if AOM_ARCH_AARCH64
+  const uint64x2_t first = vreinterpretq_u64_u32(a0);
+  const uint64x2_t second = vreinterpretq_u64_u32(a1);
+  result.val[0] = vreinterpretq_u16_u64(vtrn1q_u64(first, second));
+  result.val[1] = vreinterpretq_u16_u64(vtrn2q_u64(first, second));
+#else
+  const uint16x4_t first_lo = vreinterpret_u16_u32(vget_low_u32(a0));
+  const uint16x4_t second_lo = vreinterpret_u16_u32(vget_low_u32(a1));
+  const uint16x4_t first_hi = vreinterpret_u16_u32(vget_high_u32(a0));
+  const uint16x4_t second_hi = vreinterpret_u16_u32(vget_high_u32(a1));
+  result.val[0] = vcombine_u16(first_lo, second_lo);
+  result.val[1] = vcombine_u16(first_hi, second_hi);
+#endif
+  return result;
+}
+
+static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
+ uint8x16_t *d) { // 16x16 byte transpose: d[N] receives column N of x[0..15]
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); // val[0]: cols 0-7, val[1]: cols 8-15
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
+ uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
+ uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
+ uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);
+
+ uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0])); // rows 0-3
+ uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0])); // rows 4-7
+ uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
+ vreinterpretq_u16_u8(w5.val[0])); // rows 8-11
+ uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
+ vreinterpretq_u16_u8(w7.val[0])); // rows 12-15
+
+ uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0])); // cols 0-3, rows 0-7
+ uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0])); // cols 0-3, rows 8-15
+ uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1])); // cols 4-7, rows 0-7
+ uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1])); // cols 4-7, rows 8-15
+
+ uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[0] = vreinterpretq_u8_u16(d01.val[0]);
+ d[1] = vreinterpretq_u8_u16(d01.val[1]);
+ uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[2] = vreinterpretq_u8_u16(d23.val[0]);
+ d[3] = vreinterpretq_u8_u16(d23.val[1]);
+ uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[4] = vreinterpretq_u8_u16(d45.val[0]);
+ d[5] = vreinterpretq_u8_u16(d45.val[1]);
+ uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[6] = vreinterpretq_u8_u16(d67.val[0]);
+ d[7] = vreinterpretq_u8_u16(d67.val[1]);
+
+ // Upper half: input columns 8-15, taken from the val[1] halves of the zips.
+ w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1]));
+ w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1]));
+ w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
+ vreinterpretq_u16_u8(w5.val[1]));
+ w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
+ vreinterpretq_u16_u8(w7.val[1]));
+
+ w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0]));
+ w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0]));
+ w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1]));
+ w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1]));
+
+ d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[8] = vreinterpretq_u8_u16(d01.val[0]);
+ d[9] = vreinterpretq_u8_u16(d01.val[1]);
+ d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[10] = vreinterpretq_u8_u16(d23.val[0]);
+ d[11] = vreinterpretq_u8_u16(d23.val[1]);
+ d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[12] = vreinterpretq_u8_u16(d45.val[0]);
+ d[13] = vreinterpretq_u8_u16(d45.val[1]);
+ d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[14] = vreinterpretq_u8_u16(d67.val[0]);
+ d[15] = vreinterpretq_u8_u16(d67.val[1]);
+}
+
+// Transposes 16 rows of 32 bytes (x[0..15], each stored as two 16-byte halves)
+// by running the 16x16 transpose on each half: the val[0] halves fill
+// d[0..15] and the val[1] halves fill d[16..31].
+static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
+                                                       uint8x16_t *d) {
+  uint8x16_t left[16];
+  uint8x16_t right[16];
+  int row = 0;
+  while (row < 16) {
+    left[row] = x[row].val[0];
+    right[row] = x[row].val[1];
+    ++row;
+  }
+  transpose_arrays_u8_16x16(left, d);
+  transpose_arrays_u8_16x16(right, d + 16);
+}
+
+static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // In-place transpose of each 4x4 byte half. Swap 8 bit elements, from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]); // columns 0 and 4
+ *a1 = vreinterpret_u8_u16(c1.val[0]); // columns 1 and 5
+ *a2 = vreinterpret_u8_u16(c0.val[1]); // columns 2 and 6
+ *a3 = vreinterpret_u8_u16(c1.val[1]); // columns 3 and 7
+}
+
+static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
+ uint8x8_t *a1) {
+ // In-place 4x4 byte transpose, two rows per vector. Swap 16 bit, from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in (note: columns are not consecutive):
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0]; // columns 0 and 2
+ *a1 = d0.val[1]; // columns 1 and 3
+}
+
+static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7,
+ uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3) {
+ // Transpose the low 4 bytes of 8 rows. Swap 32 bit elements, going from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+ // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to (only val[0] is used below; the XX bytes end up in val[1]):
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *o0 = d0.val[0]; // column 0
+ *o1 = d0.val[1]; // column 1
+ *o2 = d1.val[0]; // column 2
+ *o3 = d1.val[1]; // column 3
+}
+
+static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
+ // In-place 4x4 transpose of 16-bit lanes. Input:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // b (16-bit swap of rows 0 and 1):
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c (16-bit swap of rows 2 and 3):
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d (32-bit swap of the val[0] halves):
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e (32-bit swap of the val[1] halves):
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+
+ // Output (d and e are interleaved to restore column order):
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
+ // In-place transpose of each 4x4 half of four 8-lane rows. 4x8 Input:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ // 8x4 Output (a[N] holds column N in its low half, column N+4 in its high):
+ // a[0]: 00 10 20 30 04 14 24 34
+ // a[1]: 01 11 21 31 05 15 25 35
+ // a[2]: 02 12 22 32 06 16 26 36
+ // a[3]: 03 13 23 33 07 17 27 37
+ a[0] = vreinterpretq_u16_u32(c0.val[0]);
+ a[1] = vreinterpretq_u16_u32(c1.val[0]);
+ a[2] = vreinterpretq_u16_u32(c0.val[1]);
+ a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q: p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34 p0q0
+// a[1]: 02 12 22 32 05 15 25 35 p1q1
+// a[2]: 01 11 21 31 06 16 26 36 p2q2
+// a[3]: 00 10 20 30 07 17 27 37 p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q: p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard 4x8 transpose (transpose_u16_4x8q; NOTE(review): presumably now
+// transpose_array_inplace_u16_4x8) gives the same reversals, but with the low
+// halves' order also restored relative to the high halves. This is preferable:
+// it reunites each source row's values, though post-processing is inevitable.
+static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // Reverse odd vectors to bring the appropriate items to the front of zips.
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // r0 : 03 13 01 11 07 17 05 15
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // r1 : 23 33 21 31 27 37 25 35
+ const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+ const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+ // Zip to complete the halves.
+ // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1
+ // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2
+ // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2
+ // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1
+ const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+ // Pair each p half with its matching q half via a 64-bit transpose:
+ // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3
+ // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1
+ // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0
+ // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
+ // The third row of c comes first here to swap p2 with q0.
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);
+
+ // 8x4 Output:
+ // a[0]: 03 13 23 33 04 14 24 34 p0q0
+ // a[1]: 02 12 22 32 05 15 25 35 p1q1
+ // a[2]: 01 11 21 31 06 16 26 36 p2q2
+ // a[3]: 00 10 20 30 07 17 27 37 p3q3
+ a[0] = d1.val[0]; // p0q0
+ a[1] = d0.val[1]; // p1q1
+ a[2] = d1.val[1]; // p2q2
+ a[3] = d0.val[0]; // p3q3
+}
+
+static INLINE void transpose_elems_u16_4x8(
+ const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
+ const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
+ const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
+ // Transpose 8 rows of 4 u16 into 4 rows of 8. Combine rows, going from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const uint16x8_t b0 = vcombine_u16(a0, a4);
+ const uint16x8_t b1 = vcombine_u16(a1, a5);
+ const uint16x8_t b2 = vcombine_u16(a2, a6);
+ const uint16x8_t b3 = vcombine_u16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
+ const uint16x8x2_t c1 = vtrnq_u16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpretq_u16_u32(d0.val[0]); // column 0
+ *o1 = vreinterpretq_u16_u32(d1.val[0]); // column 1
+ *o2 = vreinterpretq_u16_u32(d0.val[1]); // column 2
+ *o3 = vreinterpretq_u16_u32(d1.val[1]); // column 3
+}
+
+static INLINE void transpose_elems_s16_4x8(
+ const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
+ const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
+ // Transpose 8 rows of 4 s16 into 4 rows of 8. Combine rows, going from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ *o0 = vreinterpretq_s16_s32(d0.val[0]); // column 0
+ *o1 = vreinterpretq_s16_s32(d1.val[0]); // column 1
+ *o2 = vreinterpretq_s16_s32(d0.val[1]); // column 2
+ *o3 = vreinterpretq_s16_s32(d1.val[1]); // column 3
+}
+
+static INLINE void transpose_elems_inplace_u16_8x8(
+ uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
+ // In-place 8x8 transpose of 16-bit lanes. Swap 16 bit elements, from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0]; // column 0
+ *a1 = d1.val[0]; // column 1
+ *a2 = d2.val[0]; // column 2
+ *a3 = d3.val[0]; // column 3
+ *a4 = d0.val[1]; // column 4
+ *a5 = d1.val[1]; // column 5
+ *a6 = d2.val[1]; // column 6
+ *a7 = d3.val[1]; // column 7
+}
+
+// Signed counterpart of the 64-bit transpose helper, returned as 16-bit lanes:
+//   val[0] = { low half of a0, low half of a1 }
+//   val[1] = { high half of a0, high half of a1 }
+// AArch64 builds use TRN1/TRN2 on 64-bit lanes; other builds recombine the
+// 64-bit halves by hand.
+static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+  int16x8x2_t result;
+#if AOM_ARCH_AARCH64
+  const int64x2_t first = vreinterpretq_s64_s32(a0);
+  const int64x2_t second = vreinterpretq_s64_s32(a1);
+  result.val[0] = vreinterpretq_s16_s64(vtrn1q_s64(first, second));
+  result.val[1] = vreinterpretq_s16_s64(vtrn2q_s64(first, second));
+#else
+  const int16x4_t first_lo = vreinterpret_s16_s32(vget_low_s32(a0));
+  const int16x4_t second_lo = vreinterpret_s16_s32(vget_low_s32(a1));
+  const int16x4_t first_hi = vreinterpret_s16_s32(vget_high_s32(a0));
+  const int16x4_t second_hi = vreinterpret_s16_s32(vget_high_s32(a1));
+  result.val[0] = vcombine_s16(first_lo, second_lo);
+  result.val[1] = vcombine_s16(first_hi, second_hi);
+#endif
+  return result;
+}
+
+static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ // In-place 8x8 transpose of signed 16-bit lanes. Swap 16 bit, from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0]; // column 0
+ *a1 = d1.val[0]; // column 1
+ *a2 = d2.val[0]; // column 2
+ *a3 = d3.val[0]; // column 3
+ *a4 = d0.val[1]; // column 4
+ *a5 = d1.val[1]; // column 5
+ *a6 = d2.val[1]; // column 6
+ *a7 = d3.val[1]; // column 7
+}
+
+static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
+ int16x8_t *out) {
+ // 8x8 transpose of a[0..7] into out[0..7]. Swap 16 bit elements, from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0]; // column 0
+ out[1] = d1.val[0]; // column 1
+ out[2] = d2.val[0]; // column 2
+ out[3] = d3.val[0]; // column 3
+ out[4] = d0.val[1]; // column 4
+ out[5] = d1.val[1]; // column 5
+ out[6] = d2.val[1]; // column 6
+ out[7] = d3.val[1]; // column 7
+}
+
+// Transposes, in place, a 4x4 block of unsigned 16-bit elements held as four
+// row vectors.
+static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
+                                                   uint16x4_t *a1,
+                                                   uint16x4_t *a2,
+                                                   uint16x4_t *a3) {
+  // Stage 1 - exchange 16-bit lanes between adjacent rows:
+  //   a0: 00 01 02 03        rows01.val[0]: 00 10 02 12
+  //   a1: 10 11 12 13   =>   rows01.val[1]: 01 11 03 13
+  //   a2: 20 21 22 23        rows23.val[0]: 20 30 22 32
+  //   a3: 30 31 32 33        rows23.val[1]: 21 31 23 33
+  const uint16x4x2_t rows01 = vtrn_u16(*a0, *a1);
+  const uint16x4x2_t rows23 = vtrn_u16(*a2, *a3);
+
+  const uint32x2_t even_top = vreinterpret_u32_u16(rows01.val[0]);
+  const uint32x2_t even_bot = vreinterpret_u32_u16(rows23.val[0]);
+  const uint32x2_t odd_top = vreinterpret_u32_u16(rows01.val[1]);
+  const uint32x2_t odd_bot = vreinterpret_u32_u16(rows23.val[1]);
+
+  // Stage 2 - exchange 32-bit lanes:
+  //   even.val[0]: 00 10 20 30    even.val[1]: 02 12 22 32
+  //   odd.val[0]:  01 11 21 31    odd.val[1]:  03 13 23 33
+  const uint32x2x2_t even = vtrn_u32(even_top, even_bot);
+  const uint32x2x2_t odd = vtrn_u32(odd_top, odd_bot);
+
+  *a0 = vreinterpret_u16_u32(even.val[0]);
+  *a1 = vreinterpret_u16_u32(odd.val[0]);
+  *a2 = vreinterpret_u16_u32(even.val[1]);
+  *a3 = vreinterpret_u16_u32(odd.val[1]);
+}
+
+// Transposes, in place, a 4x4 block of signed 16-bit elements held as four
+// row vectors.
+static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
+                                                   int16x4_t *a2,
+                                                   int16x4_t *a3) {
+  // Stage 1 - exchange 16-bit lanes between adjacent rows:
+  //   a0: 00 01 02 03        rows01.val[0]: 00 10 02 12
+  //   a1: 10 11 12 13   =>   rows01.val[1]: 01 11 03 13
+  //   a2: 20 21 22 23        rows23.val[0]: 20 30 22 32
+  //   a3: 30 31 32 33        rows23.val[1]: 21 31 23 33
+  const int16x4x2_t rows01 = vtrn_s16(*a0, *a1);
+  const int16x4x2_t rows23 = vtrn_s16(*a2, *a3);
+
+  const int32x2_t even_top = vreinterpret_s32_s16(rows01.val[0]);
+  const int32x2_t even_bot = vreinterpret_s32_s16(rows23.val[0]);
+  const int32x2_t odd_top = vreinterpret_s32_s16(rows01.val[1]);
+  const int32x2_t odd_bot = vreinterpret_s32_s16(rows23.val[1]);
+
+  // Stage 2 - exchange 32-bit lanes:
+  //   even.val[0]: 00 10 20 30    even.val[1]: 02 12 22 32
+  //   odd.val[0]:  01 11 21 31    odd.val[1]:  03 13 23 33
+  const int32x2x2_t even = vtrn_s32(even_top, even_bot);
+  const int32x2x2_t odd = vtrn_s32(odd_top, odd_bot);
+
+  *a0 = vreinterpret_s16_s32(even.val[0]);
+  *a1 = vreinterpret_s16_s32(odd.val[0]);
+  *a2 = vreinterpret_s16_s32(even.val[1]);
+  *a3 = vreinterpret_s16_s32(odd.val[1]);
+}
+
+// Treats each input as two 64-bit halves and returns { low(a0) | low(a1),
+// high(a0) | high(a1) }, typed as vectors of 32-bit lanes. AArch64 uses the
+// 64-bit TRN1/TRN2 intrinsics; other targets build the same result from
+// half-vector extracts.
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t halves;
+#if AOM_ARCH_AARCH64
+  const int64x2_t first = vreinterpretq_s64_s32(a0);
+  const int64x2_t second = vreinterpretq_s64_s32(a1);
+  halves.val[0] = vreinterpretq_s32_s64(vtrn1q_s64(first, second));
+  halves.val[1] = vreinterpretq_s32_s64(vtrn2q_s64(first, second));
+#else
+  halves.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  halves.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
+  return halves;
+}
+
+// Transposes a 4x4 block of 32-bit elements. a0..a3 are the input rows (taken
+// by value); *o0..*o3 receive the transposed rows.
+static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
+                                           const int32x4_t a1,
+                                           const int32x4_t a2,
+                                           const int32x4_t a3, int32x4_t *o0,
+                                           int32x4_t *o1, int32x4_t *o2,
+                                           int32x4_t *o3) {
+  // Stage 1 - exchange 32-bit lanes between adjacent rows:
+  //   a0: 00 01 02 03        rows01.val[0]: 00 10 02 12
+  //   a1: 10 11 12 13   =>   rows01.val[1]: 01 11 03 13
+  //   a2: 20 21 22 23        rows23.val[0]: 20 30 22 32
+  //   a3: 30 31 32 33        rows23.val[1]: 21 31 23 33
+  const int32x4x2_t rows01 = vtrnq_s32(a0, a1);
+  const int32x4x2_t rows23 = vtrnq_s32(a2, a3);
+
+  // Stage 2 - exchange 64-bit halves:
+  //   even.val[0]: 00 10 20 30    even.val[1]: 02 12 22 32
+  //   odd.val[0]:  01 11 21 31    odd.val[1]:  03 13 23 33
+  const int32x4x2_t even = aom_vtrnq_s64_to_s32(rows01.val[0], rows23.val[0]);
+  const int32x4x2_t odd = aom_vtrnq_s64_to_s32(rows01.val[1], rows23.val[1]);
+
+  *o0 = even.val[0];
+  *o1 = odd.val[0];
+  *o2 = even.val[1];
+  *o3 = odd.val[1];
+}
+
+// In-place variant of transpose_elems_s32_4x4: the four rows are read first
+// and then overwritten with the transposed rows.
+static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                                   int32x4_t *a2,
+                                                   int32x4_t *a3) {
+  const int32x4_t row0 = *a0;
+  const int32x4_t row1 = *a1;
+  const int32x4_t row2 = *a2;
+  const int32x4_t row3 = *a3;
+
+  transpose_elems_s32_4x4(row0, row1, row2, row3, a0, a1, a2, a3);
+}
+
+// Array-based wrapper: transposes the 4x4 block in in[0..3] into out[0..3].
+static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
+                                            int32x4_t *out) {
+  int32x4_t *const dst0 = out + 0;
+  int32x4_t *const dst1 = out + 1;
+  int32x4_t *const dst2 = out + 2;
+  int32x4_t *const dst3 = out + 3;
+
+  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], dst0, dst1, dst2, dst3);
+}
+
+// Transposes a block whose dimensions are multiples of four by transposing
+// each 4x4 tile and moving it to its mirrored position. Tile (col, row) is
+// read from in[col * height + row * 4 ..] and written to
+// out[row * width + col * 4 ..].
+static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
+                                                        int32x4_t *out,
+                                                        const int width,
+                                                        const int height) {
+  const int tiles_down = height >> 2;
+  const int tiles_across = width >> 2;
+
+  for (int col = 0; col < tiles_across; ++col) {
+    const int32x4_t *const src_strip = in + col * height;
+    int32x4_t *const dst_strip = out + col * 4;
+
+    for (int row = 0; row < tiles_down; ++row) {
+      transpose_arrays_s32_4x4(src_strip + row * 4, dst_strip + row * width);
+    }
+  }
+}
+
+// Stamps out transpose_arrays_s32_<w>x<h>(in, out): a fixed-size wrapper that
+// forwards to transpose_arrays_s32_4nx4n with 'w' and 'h' as compile-time
+// constants. One wrapper is generated for each size listed below, after which
+// the helper macro is undefined again.
+#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \
+ static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
+ const int32x4_t *in, int32x4_t *out) { \
+ transpose_arrays_s32_4nx4n(in, out, w, h); \
+ }
+
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
+
+#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
+
+// Returns { a[0], b[0] }. Uses vtrn1q_s64 on AArch64 and an equivalent
+// combine of the two low halves elsewhere.
+static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
+  int64x2_t low_pair;
+#if AOM_ARCH_AARCH64
+  low_pair = vtrn1q_s64(a, b);
+#else
+  low_pair = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
+#endif
+  return low_pair;
+}
+
+// Returns { a[1], b[1] }. Uses vtrn2q_s64 on AArch64 and an equivalent
+// combine of the two high halves elsewhere.
+static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
+  int64x2_t high_pair;
+#if AOM_ARCH_AARCH64
+  high_pair = vtrn2q_s64(a, b);
+#else
+  high_pair = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
+#endif
+  return high_pair;
+}
+
+// Transposes eight rows of four 32-bit elements into four rows of eight,
+// each output row being stored as a pair of 4-lane vectors:
+//   [ A ]^T => [ A^T B^T ]
+//   [ B ]
+// where A is rows a0..a3 and B is rows a4..a7.
+static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
+                                           int32x4_t a2, int32x4_t a3,
+                                           int32x4_t a4, int32x4_t a5,
+                                           int32x4_t a6, int32x4_t a7,
+                                           int32x4x2_t *o0, int32x4x2_t *o1,
+                                           int32x4x2_t *o2, int32x4x2_t *o3) {
+  // A^T fills the left (val[0]) half of every output row.
+  transpose_elems_s32_4x4(a0, a1, a2, a3, &o0->val[0], &o1->val[0],
+                          &o2->val[0], &o3->val[0]);
+
+  // B^T fills the right (val[1]) half.
+  transpose_elems_s32_4x4(a4, a5, a6, a7, &o0->val[1], &o1->val[1],
+                          &o2->val[1], &o3->val[1]);
+}
+
+// Transposes, in place, an 8x8 block of 32-bit elements. Each row is a pair
+// of 4-lane vectors (val[0] = columns 0-3, val[1] = columns 4-7). The block is
+// split into four 4x4 quadrants which are transposed individually; the two
+// off-diagonal quadrants then trade places:
+//   [ A B ]^T => [ A^T C^T ]
+//   [ C D ]      [ B^T D^T ]
+static INLINE void transpose_elems_inplace_s32_8x8(
+    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
+    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
+  // Read all four quadrants before anything is written back.
+  int32x4_t tl0 = a0->val[0], tl1 = a1->val[0];  // A (top-left)
+  int32x4_t tl2 = a2->val[0], tl3 = a3->val[0];
+  int32x4_t tr0 = a0->val[1], tr1 = a1->val[1];  // B (top-right)
+  int32x4_t tr2 = a2->val[1], tr3 = a3->val[1];
+  int32x4_t bl0 = a4->val[0], bl1 = a5->val[0];  // C (bottom-left)
+  int32x4_t bl2 = a6->val[0], bl3 = a7->val[0];
+  int32x4_t br0 = a4->val[1], br1 = a5->val[1];  // D (bottom-right)
+  int32x4_t br2 = a6->val[1], br3 = a7->val[1];
+
+  transpose_elems_inplace_s32_4x4(&tl0, &tl1, &tl2, &tl3);  // A^T
+  transpose_elems_inplace_s32_4x4(&tr0, &tr1, &tr2, &tr3);  // B^T
+  transpose_elems_inplace_s32_4x4(&bl0, &bl1, &bl2, &bl3);  // C^T
+  transpose_elems_inplace_s32_4x4(&br0, &br1, &br2, &br3);  // D^T
+
+  // Top half of the result: [ A^T C^T ].
+  a0->val[0] = tl0;
+  a1->val[0] = tl1;
+  a2->val[0] = tl2;
+  a3->val[0] = tl3;
+
+  a0->val[1] = bl0;
+  a1->val[1] = bl1;
+  a2->val[1] = bl2;
+  a3->val[1] = bl3;
+
+  // Bottom half of the result: [ B^T D^T ].
+  a4->val[0] = tr0;
+  a5->val[0] = tr1;
+  a6->val[0] = tr2;
+  a7->val[0] = tr3;
+
+  a4->val[1] = br0;
+  a5->val[1] = br1;
+  a6->val[1] = br2;
+  a7->val[1] = br3;
+}
+
+// Array-based wrapper: transposes the 4x4 block of 16-bit elements in
+// in[0..3] and stores the result to out[0..3].
+static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
+                                            int16x4_t *const out) {
+  // Work on local copies so every input row is read before any output row is
+  // written.
+  int16x4_t row0 = in[0], row1 = in[1], row2 = in[2], row3 = in[3];
+
+  transpose_elems_inplace_s16_4x4(&row0, &row1, &row2, &row3);
+
+  out[0] = row0;
+  out[1] = row1;
+  out[2] = row2;
+  out[3] = row3;
+}
+
+// Transposes eight rows of four 16-bit elements (in[0..7]) into four rows of
+// eight elements (out[0..3]). With "rc" denoting row r, column c of the input:
+// out[0]: 00 10 20 30 40 50 60 70
+// out[1]: 01 11 21 31 41 51 61 71
+// out[2]: 02 12 22 32 42 52 62 72
+// out[3]: 03 13 23 33 43 53 63 73
+static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
+ int16x8_t *const out) {
+ // Step 1: interleave the 16-bit elements of each pair of adjacent rows, e.g.
+ // a0: 00 10 01 11 02 12 03 13
+ // Both paths produce the same vectors; the AArch64 path widens each row with
+ // a zero upper half so that a single vzip1q can be used.
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
+ vcombine_s16(in[1], vdup_n_s16(0)));
+ const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
+ vcombine_s16(in[3], vdup_n_s16(0)));
+ const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
+ vcombine_s16(in[5], vdup_n_s16(0)));
+ const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
+ vcombine_s16(in[7], vdup_n_s16(0)));
+#else
+ int16x4x2_t temp;
+ temp = vzip_s16(in[0], in[1]);
+ const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[2], in[3]);
+ const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[4], in[5]);
+ const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[6], in[7]);
+ const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+ // Step 2: interleave 32-bit pairs so each 64-bit half holds four elements of
+ // one input column:
+ // b02.val[0]: 00 10 20 30 | 01 11 21 31 (columns 0 and 1, rows 0-3)
+ // b02.val[1]: 02 12 22 32 | 03 13 23 33 (columns 2 and 3, rows 0-3)
+ // b13.val[0]: 40 50 60 70 | 41 51 61 71 (columns 0 and 1, rows 4-7)
+ // b13.val[1]: 42 52 62 72 | 43 53 63 73 (columns 2 and 3, rows 4-7)
+ const int32x4x2_t b02 =
+ vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+ const int32x4x2_t b13 =
+ vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+ // Step 3: join matching 64-bit halves of the rows 0-3 and rows 4-7 vectors.
+ // Even outputs take the low halves, odd outputs take the high halves.
+#if AOM_ARCH_AARCH64
+ out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+ out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+#else
+ // No 64-bit zip here: the inner vextq_s32(x, x, 2) swaps the two halves of x
+ // and the outer vextq_s32(p, q, 2) yields { high(p), low(q) }.
+ out[0] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+ out[2] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+ out[1] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+ out[3] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+// Transposes four rows of eight 16-bit elements (in[0..3]) into eight rows of
+// four elements (out[0..7]).
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+ int16x4_t *const out) {
+ // Swap 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+ vreinterpretq_u32_s16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+ vreinterpretq_u32_s16(b1.val[1]));
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+
+ // The low halves of the c vectors are output rows 0-3, the high halves are
+ // output rows 4-7.
+ out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+ out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
+}
+
+#endif // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..9e4e8c0cf0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Accumulates, over a 4-wide block of 'h' rows, the sum of (src - ref) into
+// *sum and the sum of (src - ref)^2 into *sse. Two rows are consumed per
+// iteration, so 'h' must be a non-zero multiple of two.
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride, int h,
+                                     uint32_t *sse, int *sum) {
+  // A 16-bit lane can absorb 32767 / 255 ~= 128 differences before it may
+  // overflow. The 8-lane accumulator takes two 4-wide rows per step, hence
+  // the limit of 256 rows.
+  assert(h <= 256);
+
+  int16x8_t diff_acc = vdupq_n_s16(0);
+  int32x4_t sq_acc = vdupq_n_s32(0);
+
+  int rows_left = h;
+  do {
+    // NOTE(review): load_unaligned_u8 is defined elsewhere; given the pointer
+    // advance below it presumably packs two 4-pixel rows into one vector.
+    const uint8x8_t src_px = load_unaligned_u8(src, src_stride);
+    const uint8x8_t ref_px = load_unaligned_u8(ref, ref_stride);
+    const int16x8_t delta = vreinterpretq_s16_u16(vsubl_u8(src_px, ref_px));
+    const int16x4_t delta_lo = vget_low_s16(delta);
+    const int16x4_t delta_hi = vget_high_s16(delta);
+
+    diff_acc = vaddq_s16(diff_acc, delta);
+    sq_acc = vmlal_s16(sq_acc, delta_lo, delta_lo);
+    sq_acc = vmlal_s16(sq_acc, delta_hi, delta_hi);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  *sum = horizontal_add_s16x8(diff_acc);
+  *sse = (uint32_t)horizontal_add_s32x4(sq_acc);
+}
+
+// Accumulates, over an 8-wide block of 'h' rows, the sum of (src - ref) into
+// *sum and the sum of (src - ref)^2 into *sse. 'h' must be non-zero.
+static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride, int h,
+                                     uint32_t *sse, int *sum) {
+  // Every row adds one difference to each 16-bit lane, and a lane can absorb
+  // 32767 / 255 ~= 128 of them before it may overflow.
+  assert(h <= 128);
+
+  int16x8_t diff_acc = vdupq_n_s16(0);
+  int32x4_t sq_lo = vdupq_n_s32(0);
+  int32x4_t sq_hi = vdupq_n_s32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x8_t src_px = vld1_u8(src);
+    const uint8x8_t ref_px = vld1_u8(ref);
+    const int16x8_t delta = vreinterpretq_s16_u16(vsubl_u8(src_px, ref_px));
+    const int16x4_t delta_lo = vget_low_s16(delta);
+    const int16x4_t delta_hi = vget_high_s16(delta);
+
+    diff_acc = vaddq_s16(diff_acc, delta);
+    sq_lo = vmlal_s16(sq_lo, delta_lo, delta_lo);
+    sq_hi = vmlal_s16(sq_hi, delta_hi, delta_hi);
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  *sum = horizontal_add_s16x8(diff_acc);
+  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sq_lo, sq_hi));
+}
+
+// Accumulates, over a 16-wide block of 'h' rows, the sum of (src - ref) into
+// *sum and the sum of (src - ref)^2 into *sse. 'h' must be non-zero.
+static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  // The two 8-lane sum accumulators each take one difference per lane per
+  // row; a 16-bit lane can absorb 32767 / 255 ~= 128 of them.
+  assert(h <= 128);
+
+  int16x8_t diff_acc_lo = vdupq_n_s16(0);
+  int16x8_t diff_acc_hi = vdupq_n_s16(0);
+  int32x4_t sq_even = vdupq_n_s32(0);
+  int32x4_t sq_odd = vdupq_n_s32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t src_px = vld1q_u8(src);
+    const uint8x16_t ref_px = vld1q_u8(ref);
+
+    // Differences for pixels 0-7 and 8-15 of the row.
+    const int16x8_t delta_lo = vreinterpretq_s16_u16(
+        vsubl_u8(vget_low_u8(src_px), vget_low_u8(ref_px)));
+    const int16x8_t delta_hi = vreinterpretq_s16_u16(
+        vsubl_u8(vget_high_u8(src_px), vget_high_u8(ref_px)));
+
+    diff_acc_lo = vaddq_s16(diff_acc_lo, delta_lo);
+    diff_acc_hi = vaddq_s16(diff_acc_hi, delta_hi);
+
+    sq_even = vmlal_s16(sq_even, vget_low_s16(delta_lo),
+                        vget_low_s16(delta_lo));
+    sq_odd = vmlal_s16(sq_odd, vget_high_s16(delta_lo),
+                       vget_high_s16(delta_lo));
+    sq_even = vmlal_s16(sq_even, vget_low_s16(delta_hi),
+                        vget_low_s16(delta_hi));
+    sq_odd = vmlal_s16(sq_odd, vget_high_s16(delta_hi),
+                       vget_high_s16(delta_hi));
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  *sum = horizontal_add_s16x8(vaddq_s16(diff_acc_lo, diff_acc_hi));
+  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sq_even, sq_odd));
+}
+
+// Accumulates, over a 'w' x 'h' block, the sum of (src - ref) into *sum and
+// the sum of (src - ref)^2 into *sse.
+//
+// 'w' is processed 16 pixels at a time and must be a non-zero multiple of 16;
+// 'h' must be non-zero. 'h_limit' is the number of 'w'-wide rows that can be
+// summed in 16-bit lanes before they may overflow: rows are handled in batches
+// of at most 'h_limit', and each batch is widened into a 32-bit accumulator.
+static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       int w, int h, int h_limit, uint32_t *sse,
+                                       int *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  // Index one past the last row of the current batch.
+  int batch_end = h > h_limit ? h_limit : h;
+
+  int row = 0;
+  do {
+    // Fresh 16-bit accumulators for every batch.
+    int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+    do {
+      int col = 0;
+      do {
+        const uint8x16_t s = vld1q_u8(src + col);
+        const uint8x16_t r = vld1q_u8(ref + col);
+
+        const int16x8_t diff_l =
+            vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+        const int16x8_t diff_h =
+            vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+        sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+        sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+        col += 16;
+      } while (col < w);
+
+      src += src_stride;
+      ref += ref_stride;
+      row++;
+    } while (row < batch_end);
+
+    // Widen the batch sums to 32 bits before the 16-bit lanes can overflow.
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+    // Advance to the next batch, never past the last row. Without the clamp a
+    // height that is not a multiple of 'h_limit' would make the final batch
+    // read rows beyond the block.
+    batch_end += h_limit;
+    if (batch_end > h) batch_end = h;
+  } while (row < h);
+
+  *sum = horizontal_add_s32x4(sum_s32);
+  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// 32-wide variance accumulation. A row contributes two values to each 16-bit
+// sum lane, so lanes are flushed to 32 bits every 64 rows.
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  const int width = 32;
+  const int rows_per_batch = 64;
+  variance_large_neon(src, src_stride, ref, ref_stride, width, h,
+                      rows_per_batch, sse, sum);
+}
+
+// 64-wide variance accumulation. A row contributes four values to each 16-bit
+// sum lane, so lanes are flushed to 32 bits every 32 rows.
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  const int width = 64;
+  const int rows_per_batch = 32;
+  variance_large_neon(src, src_stride, ref, ref_stride, width, h,
+                      rows_per_batch, sse, sum);
+}
+
+// 128-wide variance accumulation. A row contributes eight values to each
+// 16-bit sum lane, so lanes are flushed to 32 bits every 16 rows.
+static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       int h, uint32_t *sse, int *sum) {
+  const int width = 128;
+  const int rows_per_batch = 16;
+  variance_large_neon(src, src_stride, ref, ref_stride, width, h,
+                      rows_per_batch, sse, sum);
+}
+
+// Stamps out aom_variance<w>x<h>_neon(): runs the width-specific accumulator
+// variance_<w>xh_neon over 'h' rows, stores the sum of squared differences in
+// *sse and returns *sse - sum^2 >> shift. In every instantiation below 'shift'
+// equals log2(w * h), so the subtracted term is sum^2 divided by the number of
+// pixels in the block. The square is formed in 64 bits before shifting.
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+VARIANCE_WXH_NEON(4, 16, 6)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+VARIANCE_WXH_NEON(8, 32, 8)
+
+VARIANCE_WXH_NEON(16, 4, 6)
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+VARIANCE_WXH_NEON(16, 64, 10)
+
+VARIANCE_WXH_NEON(32, 8, 8)
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 16, 10)
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+VARIANCE_WXH_NEON(64, 128, 13)
+
+VARIANCE_WXH_NEON(128, 64, 13)
+VARIANCE_WXH_NEON(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON
+
+// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of
+// AVX2. Also, implement the NEON for variance computation present in this
+// function.
+//
+// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks.
+// Per-block results go to sse8x8[0..3], sum8x8[0..3] and var8x8[0..3]; the
+// block totals are added to *tot_sse and *tot_sum.
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       uint32_t *sse8x8, int *sum8x8,
+                                       unsigned int *tot_sse, int *tot_sum,
+                                       uint32_t *var8x8) {
+  // The four blocks sit side by side, i.e. one strip 32 pixels wide and 8
+  // rows tall.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int x = blk * 8;
+    variance_8xh_neon(src + x, src_stride, ref + x, ref_stride, 8,
+                      &sse8x8[blk], &sum8x8[blk]);
+  }
+
+  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+
+  // variance = sse - sum^2 / 64 for each 8x8 block.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int64_t blk_sum = sum8x8[blk];
+    var8x8[blk] = sse8x8[blk] - (uint32_t)((blk_sum * blk_sum) >> 6);
+  }
+}
+
+// Computes SSE and variance for two horizontally adjacent 16x16 blocks.
+// Per-block results go to sse16x16[0..1] and var16x16[0..1]; the block totals
+// are added to *tot_sse and *tot_sum.
+void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         uint32_t *sse16x16,
+                                         unsigned int *tot_sse, int *tot_sum,
+                                         uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+
+  // The two blocks sit side by side, i.e. one strip 32 pixels wide and 16
+  // rows tall.
+  for (int blk = 0; blk < 2; ++blk) {
+    const int x = blk * 16;
+    variance_16xh_neon(src + x, src_stride, ref + x, ref_stride, 16,
+                       &sse16x16[blk], &sum16x16[blk]);
+  }
+
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+
+  // variance = sse - sum^2 / 256 for each 16x16 block.
+  for (int blk = 0; blk < 2; ++blk) {
+    const int64_t blk_sum = sum16x16[blk];
+    var16x16[blk] = sse16x16[blk] - (uint32_t)((blk_sum * blk_sum) >> 8);
+  }
+}
+
+// Sum of squared differences between an 8-wide 'src' block and 'ref' block of
+// 'h' rows. The result is both stored to *sse and returned. Two rows are
+// consumed per iteration, so 'h' must be a non-zero multiple of two.
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+                                       const uint8_t *ref, int ref_stride,
+                                       unsigned int *sse, int h) {
+  // One accumulator per row of the pair keeps the two multiply-accumulate
+  // chains independent.
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  int i = h;
+  do {
+    const uint8x8_t s0 = vld1_u8(src);
+    src += src_stride;
+    const uint8x8_t s1 = vld1_u8(src);
+    src += src_stride;
+    const uint8x8_t r0 = vld1_u8(ref);
+    ref += ref_stride;
+    const uint8x8_t r1 = vld1_u8(ref);
+    ref += ref_stride;
+
+    // The widening subtract wraps in uint16; reinterpreting the lanes as
+    // signed recovers the differences in [-255, 255].
+    const int16x8_t diff0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
+    const int16x8_t diff1 = vreinterpretq_s16_u16(vsubl_u8(s1, r1));
+
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_low_s16(diff0), vget_low_s16(diff0));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_low_s16(diff1), vget_low_s16(diff1));
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_high_s16(diff0), vget_high_s16(diff0));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff1), vget_high_s16(diff1));
+
+    i -= 2;
+  } while (i != 0);
+
+  // Reduce once and reuse the value for both the out-parameter and the
+  // return value.
+  const unsigned int total = horizontal_add_u32x4(
+      vreinterpretq_u32_s32(vaddq_s32(sse_s32[0], sse_s32[1])));
+  *sse = total;
+  return total;
+}
+
+// Sum of squared differences between a 16-wide 'src' block and 'ref' block of
+// 'h' rows. The result is both stored to *sse and returned. Two rows are
+// consumed per iteration, so 'h' must be a non-zero multiple of two.
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        unsigned int *sse, int h) {
+  // Four accumulators - one per 8-pixel half of each of the two rows - keep
+  // the multiply-accumulate chains independent.
+  int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                           vdupq_n_s32(0) };
+
+  int i = h;
+  do {
+    const uint8x16_t s0 = vld1q_u8(src);
+    src += src_stride;
+    const uint8x16_t s1 = vld1q_u8(src);
+    src += src_stride;
+    const uint8x16_t r0 = vld1q_u8(ref);
+    ref += ref_stride;
+    const uint8x16_t r1 = vld1q_u8(ref);
+    ref += ref_stride;
+
+    // diff[0]/diff[1]: low/high half of the first row;
+    // diff[2]/diff[3]: low/high half of the second row.
+    // The widening subtract wraps in uint16; reinterpreting the lanes as
+    // signed recovers the differences in [-255, 255].
+    int16x8_t diff[4];
+    diff[0] =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s0), vget_low_u8(r0)));
+    diff[1] =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s0), vget_high_u8(r0)));
+    diff[2] =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1), vget_low_u8(r1)));
+    diff[3] =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1), vget_high_u8(r1)));
+
+    for (int k = 0; k < 4; k++) {
+      sse_s32[k] =
+          vmlal_s16(sse_s32[k], vget_low_s16(diff[k]), vget_low_s16(diff[k]));
+    }
+    for (int k = 0; k < 4; k++) {
+      sse_s32[k] = vmlal_s16(sse_s32[k], vget_high_s16(diff[k]),
+                             vget_high_s16(diff[k]));
+    }
+
+    i -= 2;
+  } while (i != 0);
+
+  const int32x4_t sse01 = vaddq_s32(sse_s32[0], sse_s32[1]);
+  const int32x4_t sse23 = vaddq_s32(sse_s32[2], sse_s32[3]);
+
+  // Reduce once and reuse the value for both the out-parameter and the
+  // return value.
+  const unsigned int total =
+      horizontal_add_u32x4(vreinterpretq_u32_s32(vaddq_s32(sse01, sse23)));
+  *sse = total;
+  return total;
+}
+
+// Stamps out aom_mse<w>x<h>_neon(): forwards to the width-specific helper
+// mse<w>xh_neon with the block height 'h' fixed, storing the sum of squared
+// differences in *sse and returning the same value.
+#define MSE_WXH_NEON(w, h) \
+ unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON(8, 8)
+MSE_WXH_NEON(8, 16)
+
+MSE_WXH_NEON(16, 8)
+MSE_WXH_NEON(16, 16)
+
+#undef MSE_WXH_NEON
+
+// Adds the squared differences of two 8-element groups - (s0 - d0) and
+// (s1 - d1), with 16-bit sources and 8-bit destinations - to the 64-bit
+// accumulator 'sum' and returns the updated accumulator.
+static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
+                                                   uint16x8_t s0, uint16x8_t s1,
+                                                   uint8x8_t d0, uint8x8_t d1) {
+  const int16x8_t err0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
+  const int16x8_t err1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
+
+  const int16x4_t err0_lo = vget_low_s16(err0);
+  const int16x4_t err0_hi = vget_high_s16(err0);
+  const int16x4_t err1_lo = vget_low_s16(err1);
+  const int16x4_t err1_hi = vget_high_s16(err1);
+
+  // Four squared errors land in each 32-bit lane ...
+  int32x4_t squares = vmull_s16(err0_lo, err0_lo);
+  squares = vmlal_s16(squares, err0_hi, err0_hi);
+  squares = vmlal_s16(squares, err1_lo, err1_lo);
+  squares = vmlal_s16(squares, err1_hi, err1_hi);
+
+  // ... which are then pairwise-added into the 64-bit running total.
+  return vpadalq_u32(sum, vreinterpretq_u32_s32(squares));
+}
+
+// Returns, as two 64-bit partial sums, the sum of squared differences between
+// a w x h block of 16-bit 'src' samples and 8-bit 'dst' samples. Only w and h
+// values of 4 or 8 are supported.
+static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t acc = vdupq_n_u64(0);
+  int rows_left = h;
+
+  if (w != 8) {
+    // 4-wide: four rows per pass. NOTE(review): the *_4x2 loaders are defined
+    // elsewhere and presumably pack two 4-element rows into one vector.
+    do {
+      const uint8x8_t d01 = load_unaligned_u8_4x2(dst, dstride);
+      const uint8x8_t d23 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
+      const uint16x8_t s01 = load_unaligned_u16_4x2(src, sstride);
+      const uint16x8_t s23 =
+          load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      acc = mse_accumulate_u16_u8_8x2(acc, s01, s23, d01, d23);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+
+    return acc;
+  }
+
+  // 8-wide: two full rows per pass.
+  do {
+    const uint8x8_t d_row0 = vld1_u8(dst);
+    const uint8x8_t d_row1 = vld1_u8(dst + dstride);
+    const uint16x8_t s_row0 = vld1q_u16(src);
+    const uint16x8_t s_row1 = vld1q_u16(src + sstride);
+
+    acc = mse_accumulate_u16_u8_8x2(acc, s_row0, s_row1, d_row0, d_row1);
+
+    dst += 2 * dstride;
+    src += 2 * sstride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  return acc;
+}
+
+// Returns the sum of squared differences between 16-bit 'src' and 8-bit 'dst'
+// for one block. Only called for block sizes 8x8, 8x4, 4x8 and 4x4.
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  const uint64x2_t partial_sums =
+      mse_wxh_16bit(dst, dstride, src, sstride, w, h);
+  return horizontal_add_u64x2(partial_sums);
+}
+
+// Returns the sum of squares of the 256 16-bit values starting at 'a'.
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+  int32x4_t sq_lo = vdupq_n_s32(0);
+  int32x4_t sq_hi = vdupq_n_s32(0);
+
+  // Eight coefficients per step; the low and high halves feed separate
+  // accumulators.
+  const int16_t *const end = a + 256;
+  for (const int16_t *p = a; p != end; p += 8) {
+    const int16x8_t coeffs = vld1q_s16(p);
+    const int16x4_t lo = vget_low_s16(coeffs);
+    const int16x4_t hi = vget_high_s16(coeffs);
+
+    sq_lo = vmlal_s16(sq_lo, lo, lo);
+    sq_hi = vmlal_s16(sq_hi, hi, hi);
+  }
+
+  return (uint32_t)horizontal_add_s32x4(vaddq_s32(sq_lo, sq_hi));
+}
+
+// Returns the sum of squared differences over a 16-pixel-wide strip of 'dst'
+// that is handled as 16 / w blocks of size w x h. 'dst' advances by 'w'
+// columns per block; 'src' stores the blocks back to back, each with a row
+// stride of 'w' (w * h samples per block). 'w' and 'h' must each be 4 or 8.
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                 int w, int h) {
+  // Validate before dividing by 'w': the per-block helper performs the same
+  // check, but only after 16 / w has already been evaluated.
+  assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+  uint64x2_t sum = vdupq_n_u64(0);
+
+  int num_blks = 16 / w;
+  do {
+    sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+    dst += w;
+    src += w * h;
+  } while (--num_blks != 0);
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..9fb52e1df7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Variance kernel for 4-wide blocks. Writes sum(src - ref) to *sum and
+// sum((src - ref)^2) to *sse. Four rows are consumed per iteration, so h must
+// be a non-zero multiple of 4 (presumably load_unaligned_u8q() gathers four
+// 4-byte rows into one vector -- confirm against mem_neon.h).
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int h, uint32_t *sse, int *sum) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t s_acc = vdupq_n_u32(0);
+  uint32x4_t r_acc = vdupq_n_u32(0);
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t s = load_unaligned_u8q(src, src_stride);
+    const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+    const uint8x16_t ad = vabdq_u8(s, r);
+
+    // Dot product with all-ones sums the pixel values.
+    s_acc = vdotq_u32(s_acc, s, ones);
+    r_acc = vdotq_u32(r_acc, r, ones);
+    // |s - r| dotted with itself accumulates squared differences.
+    sq_acc = vdotq_u32(sq_acc, ad, ad);
+
+    src += 4 * src_stride;
+    ref += 4 * ref_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+
+  const int32x4_t diff_acc =
+      vsubq_s32(vreinterpretq_s32_u32(s_acc), vreinterpretq_s32_u32(r_acc));
+  *sum = horizontal_add_s32x4(diff_acc);
+  *sse = horizontal_add_u32x4(sq_acc);
+}
+
+// Variance kernel for 8-wide blocks. Two 8-byte rows are packed into one
+// 16-byte vector per iteration, so h must be a non-zero multiple of 2.
+// Writes sum(src - ref) to *sum and sum((src - ref)^2) to *sse.
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int h, uint32_t *sse, int *sum) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t s_acc = vdupq_n_u32(0);
+  uint32x4_t r_acc = vdupq_n_u32(0);
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x8_t s_row0 = vld1_u8(src);
+    const uint8x8_t s_row1 = vld1_u8(src + src_stride);
+    const uint8x8_t r_row0 = vld1_u8(ref);
+    const uint8x8_t r_row1 = vld1_u8(ref + ref_stride);
+    const uint8x16_t s = vcombine_u8(s_row0, s_row1);
+    const uint8x16_t r = vcombine_u8(r_row0, r_row1);
+    const uint8x16_t ad = vabdq_u8(s, r);
+
+    s_acc = vdotq_u32(s_acc, s, ones);
+    r_acc = vdotq_u32(r_acc, r, ones);
+    sq_acc = vdotq_u32(sq_acc, ad, ad);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  const int32x4_t diff_acc =
+      vsubq_s32(vreinterpretq_s32_u32(s_acc), vreinterpretq_s32_u32(r_acc));
+  *sum = horizontal_add_s32x4(diff_acc);
+  *sse = horizontal_add_u32x4(sq_acc);
+}
+
+// Variance kernel for 16-wide blocks: one 16-byte row per iteration, h >= 1.
+// Writes sum(src - ref) to *sum and sum((src - ref)^2) to *sse.
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t s_acc = vdupq_n_u32(0);
+  uint32x4_t r_acc = vdupq_n_u32(0);
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src);
+    const uint8x16_t r = vld1q_u8(ref);
+    const uint8x16_t ad = vabdq_u8(s, r);
+
+    s_acc = vdotq_u32(s_acc, s, ones);
+    r_acc = vdotq_u32(r_acc, r, ones);
+    sq_acc = vdotq_u32(sq_acc, ad, ad);
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  const int32x4_t diff_acc =
+      vsubq_s32(vreinterpretq_s32_u32(s_acc), vreinterpretq_s32_u32(r_acc));
+  *sum = horizontal_add_s32x4(diff_acc);
+  *sse = horizontal_add_u32x4(sq_acc);
+}
+
+// Variance kernel for blocks wider than 16: each row is walked in 16-byte
+// steps until the column offset reaches w. At least one 16-byte step and one
+// row are always processed. Writes sum(src - ref) to *sum and
+// sum((src - ref)^2) to *sse.
+static INLINE void variance_large_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride, int w, int h,
+                                               uint32_t *sse, int *sum) {
+  const uint8x16_t ones = vdupq_n_u8(1);
+  uint32x4_t s_acc = vdupq_n_u32(0);
+  uint32x4_t r_acc = vdupq_n_u32(0);
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t s = vld1q_u8(src + col);
+      const uint8x16_t r = vld1q_u8(ref + col);
+      const uint8x16_t ad = vabdq_u8(s, r);
+
+      s_acc = vdotq_u32(s_acc, s, ones);
+      r_acc = vdotq_u32(r_acc, r, ones);
+      sq_acc = vdotq_u32(sq_acc, ad, ad);
+
+      col += 16;
+    } while (col < w);
+
+    src += src_stride;
+    ref += ref_stride;
+    --rows_left;
+  } while (rows_left != 0);
+
+  const int32x4_t diff_acc =
+      vsubq_s32(vreinterpretq_s32_u32(s_acc), vreinterpretq_s32_u32(r_acc));
+  *sum = horizontal_add_s32x4(diff_acc);
+  *sse = horizontal_add_u32x4(sq_acc);
+}
+
+// 32-wide variance kernel: forwards to variance_large_neon_dotprod(), w = 32.
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  enum { kBlockWidth = 32 };
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, kBlockWidth, h,
+                              sse, sum);
+}
+
+// 64-wide variance kernel: forwards to variance_large_neon_dotprod(), w = 64.
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  enum { kBlockWidth = 64 };
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, kBlockWidth, h,
+                              sse, sum);
+}
+
+// 128-wide variance kernel: forwards to variance_large_neon_dotprod(),
+// w = 128.
+static INLINE void variance_128xh_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride, int h,
+                                               uint32_t *sse, int *sum) {
+  enum { kBlockWidth = 128 };
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, kBlockWidth, h,
+                              sse, sum);
+}
+
+// Defines aom_variance<w>x<h>_neon_dotprod(): stores the sum of squared
+// differences in *sse and returns *sse - (sum * sum >> log2_n). In every
+// instantiation below log2_n equals log2(w * h).
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, log2_n)                               \
+  unsigned int aom_variance##w##x##h##_neon_dotprod(                          \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum_diff;                                                             \
+    variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse,   \
+                                  &sum_diff);                                 \
+    const int64_t sum_sq = (int64_t)sum_diff * sum_diff;                      \
+    return *sse - (uint32_t)(sum_sq >> log2_n);                               \
+  }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+VARIANCE_WXH_NEON_DOTPROD(4, 16, 6)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+VARIANCE_WXH_NEON_DOTPROD(8, 32, 8)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 4, 6)
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+VARIANCE_WXH_NEON_DOTPROD(16, 64, 10)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 8, 8)
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 16, 10)
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+VARIANCE_WXH_NEON_DOTPROD(64, 128, 13)
+
+VARIANCE_WXH_NEON_DOTPROD(128, 64, 13)
+VARIANCE_WXH_NEON_DOTPROD(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
+
+// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks
+// (column offsets 0, 8, 16, 24). Per-block results go to sse8x8[k], sum8x8[k]
+// and var8x8[k]; the four SSEs and sums are also added to *tot_sse and
+// *tot_sum.
+void aom_get_var_sse_sum_8x8_quad_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum,
+    uint32_t *var8x8) {
+  for (int blk = 0; blk < 4; ++blk) {
+    const int col = blk * 8;
+    variance_8xh_neon_dotprod(src + col, src_stride, ref + col, ref_stride, 8,
+                              &sse8x8[blk], &sum8x8[blk]);
+  }
+
+  uint32_t sse_total = 0;
+  int sum_total = 0;
+  for (int blk = 0; blk < 4; ++blk) {
+    sse_total += sse8x8[blk];
+    sum_total += sum8x8[blk];
+  }
+  *tot_sse += sse_total;
+  *tot_sum += sum_total;
+
+  // variance = sse - sum^2 / 64 for each 8x8 block.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
+    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
+  }
+}
+
+// Computes SSE and variance for two horizontally adjacent 16x16 blocks
+// (column offsets 0 and 16). Per-block results go to sse16x16[k] and
+// var16x16[k]; both SSEs and both sums are also added to *tot_sse and
+// *tot_sum.
+void aom_get_var_sse_sum_16x16_dual_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum,
+    uint32_t *var16x16) {
+  int blk_sum[2] = { 0 };
+
+  for (int blk = 0; blk < 2; ++blk) {
+    const int col = blk * 16;
+    variance_16xh_neon_dotprod(src + col, src_stride, ref + col, ref_stride, 16,
+                               &sse16x16[blk], &blk_sum[blk]);
+  }
+
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += blk_sum[0] + blk_sum[1];
+
+  // variance = sse - sum^2 / 256 for each 16x16 block.
+  for (int blk = 0; blk < 2; ++blk) {
+    const int64_t sum_sq = (int64_t)blk_sum[blk] * blk_sum[blk];
+    var16x16[blk] = sse16x16[blk] - (uint32_t)(sum_sq >> 8);
+  }
+}
+
+// Sum of squared differences over an 8-wide block of h rows. Two rows are
+// packed per iteration, so h must be a non-zero multiple of 2. The result is
+// both stored in *sse and returned.
+static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src,
+                                               int src_stride,
+                                               const uint8_t *ref,
+                                               int ref_stride,
+                                               unsigned int *sse, int h) {
+  uint32x4_t sq_acc = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x8_t s_row0 = vld1_u8(src);
+    const uint8x8_t s_row1 = vld1_u8(src + src_stride);
+    const uint8x8_t r_row0 = vld1_u8(ref);
+    const uint8x8_t r_row1 = vld1_u8(ref + ref_stride);
+    const uint8x16_t ad =
+        vabdq_u8(vcombine_u8(s_row0, s_row1), vcombine_u8(r_row0, r_row1));
+
+    sq_acc = vdotq_u32(sq_acc, ad, ad);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  const unsigned int total = horizontal_add_u32x4(sq_acc);
+  *sse = total;
+  return total;
+}
+
+// Sum of squared differences over a 16-wide block of h rows. Two rows are
+// handled per iteration with independent accumulators, so h must be a
+// non-zero multiple of 2. The result is both stored in *sse and returned.
+static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src,
+                                                int src_stride,
+                                                const uint8_t *ref,
+                                                int ref_stride,
+                                                unsigned int *sse, int h) {
+  uint32x4_t sq_acc_even = vdupq_n_u32(0);
+  uint32x4_t sq_acc_odd = vdupq_n_u32(0);
+
+  int rows_left = h;
+  do {
+    const uint8x16_t ad_even = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
+    const uint8x16_t ad_odd =
+        vabdq_u8(vld1q_u8(src + src_stride), vld1q_u8(ref + ref_stride));
+
+    sq_acc_even = vdotq_u32(sq_acc_even, ad_even, ad_even);
+    sq_acc_odd = vdotq_u32(sq_acc_odd, ad_odd, ad_odd);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    rows_left -= 2;
+  } while (rows_left != 0);
+
+  const unsigned int total =
+      horizontal_add_u32x4(vaddq_u32(sq_acc_even, sq_acc_odd));
+  *sse = total;
+  return total;
+}
+
+// Defines aom_mse<bw>x<bh>_neon_dotprod() as a call to the matching
+// mse<bw>xh_neon_dotprod() helper with the block height bh.
+#define MSE_WXH_NEON_DOTPROD(bw, bh)                                          \
+  unsigned int aom_mse##bw##x##bh##_neon_dotprod(                             \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    return mse##bw##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse,    \
+                                    bh);                                      \
+  }
+
+MSE_WXH_NEON_DOTPROD(8, 8)
+MSE_WXH_NEON_DOTPROD(8, 16)
+
+MSE_WXH_NEON_DOTPROD(16, 8)
+MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
new file mode 100644
index 0000000000..893f9c2f65
--- /dev/null
+++ b/third_party/aom/aom_dsp/avg.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+// Finds the smallest and largest absolute difference |s - d| over an 8x8
+// block. s has row stride p and d has row stride dp. *min starts at 255 and
+// *max at 0 before scanning.
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  *min = 255;
+  *max = 0;
+  for (int row = 0; row < 8; ++row) {
+    const uint8_t *s_row = s + row * p;
+    const uint8_t *d_row = d + row * dp;
+    for (int col = 0; col < 8; ++col) {
+      const int diff = abs(s_row[col] - d_row[col]);
+      if (diff < *min) *min = diff;
+      if (diff > *max) *max = diff;
+    }
+  }
+}
+
+// Rounded average of a 4x4 block of bytes with row stride p:
+// (sum of 16 pixels + 8) >> 4.
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+  int total = 0;
+  for (int row = 0; row < 4; ++row) {
+    const uint8_t *line = s + row * p;
+    for (int col = 0; col < 4; ++col) {
+      total += line[col];
+    }
+  }
+
+  return (total + 8) >> 4;
+}
+
+// Rounded average of an 8x8 block of bytes with row stride p:
+// (sum of 64 pixels + 32) >> 6.
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+  int total = 0;
+  for (int row = 0; row < 8; ++row) {
+    const uint8_t *line = s + row * p;
+    for (int col = 0; col < 8; ++col) {
+      total += line[col];
+    }
+  }
+
+  return (total + 32) >> 6;
+}
+
+// Averages the four 8x8 quadrants of the 16x16 block whose top-left corner is
+// (x16_idx, y16_idx). avg[k] receives quadrant k, with bit 0 of k selecting
+// the right half and bit 1 selecting the bottom half.
+void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                        int *avg) {
+  for (int k = 0; k < 4; k++) {
+    const int col = x16_idx + (k & 1) * 8;
+    const int row = y16_idx + (k >> 1) * 8;
+    avg[k] = aom_avg_8x8_c(s + row * p + col, p);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Rounded average of an 8x8 block of 16-bit samples. s8 is converted with
+// CONVERT_TO_SHORTPTR() and p is the row stride in samples.
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  const uint16_t *line = CONVERT_TO_SHORTPTR(s8);
+  int total = 0;
+  for (int row = 0; row < 8; ++row, line += p) {
+    for (int col = 0; col < 8; ++col) {
+      total += line[col];
+    }
+  }
+
+  return (total + 32) >> 6;
+}
+
+// Rounded average of a 4x4 block of 16-bit samples. s8 is converted with
+// CONVERT_TO_SHORTPTR() and p is the row stride in samples.
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  const uint16_t *line = CONVERT_TO_SHORTPTR(s8);
+  int total = 0;
+  for (int row = 0; row < 4; ++row, line += p) {
+    for (int col = 0; col < 4; ++col) {
+      total += line[col];
+    }
+  }
+
+  return (total + 8) >> 4;
+}
+
+// High-bitdepth variant of the 8x8 min/max scan: finds the smallest and
+// largest |s - d| over 16-bit samples obtained through CONVERT_TO_SHORTPTR().
+// *min starts at 65535 and *max at 0 before scanning.
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  const uint16_t *s_row = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *d_row = CONVERT_TO_SHORTPTR(d8);
+  *min = 65535;
+  *max = 0;
+  for (int row = 0; row < 8; ++row, s_row += p, d_row += dp) {
+    for (int col = 0; col < 8; ++col) {
+      const int diff = abs(s_row[col] - d_row[col]);
+      if (diff < *min) *min = diff;
+      if (diff > *max) *max = diff;
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// 4-point column transform over src_diff[k * src_stride], k = 0..3.
+// The first stage takes halved sums/differences of pairs (0,1) and (2,3);
+// the second stage combines them into coeff[0..3].
+static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
+                          int16_t *coeff) {
+  const int16_t r0 = src_diff[0 * src_stride];
+  const int16_t r1 = src_diff[1 * src_stride];
+  const int16_t r2 = src_diff[2 * src_stride];
+  const int16_t r3 = src_diff[3 * src_stride];
+
+  const int16_t half_sum01 = (r0 + r1) >> 1;
+  const int16_t half_dif01 = (r0 - r1) >> 1;
+  const int16_t half_sum23 = (r2 + r3) >> 1;
+  const int16_t half_dif23 = (r2 - r3) >> 1;
+
+  coeff[0] = half_sum01 + half_sum23;
+  coeff[1] = half_dif01 + half_dif23;
+  coeff[2] = half_sum01 - half_sum23;
+  coeff[3] = half_dif01 - half_dif23;
+}
+
+// 4x4 Hadamard transform. hadamard_col4() runs on each of the 4 source
+// columns, then on each stride-4 column of the intermediate buffer, and the
+// result is stored to coeff transposed.
+void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                        tran_low_t *coeff) {
+  int16_t pass1[16];
+  int16_t pass2[16];
+
+  // First pass. src_diff: 9 bit, dynamic range [-255, 255].
+  for (int c = 0; c < 4; ++c) {
+    hadamard_col4(src_diff + c, src_stride, pass1 + 4 * c);
+  }
+
+  // Second pass over the intermediate, read with a stride of 4.
+  for (int c = 0; c < 4; ++c) {
+    hadamard_col4(pass1 + c, 4, pass2 + 4 * c);
+  }
+
+  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_4x4_sse2).
+  for (int k = 0; k < 16; ++k) {
+    const int row = k >> 2;
+    const int col = k & 3;
+    coeff[k] = (tran_low_t)pass2[col * 4 + row];
+  }
+}
+
+// 8-point column transform. src_diff: first pass, 9 bit, dynamic range [-255, 255];
+// second pass, 12 bit, dynamic range [-2040, 2040]. Writes coeff[0..7] in a permuted order.
+static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1: sum of rows 0,1
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];  // stage 1: difference of rows 0,1
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;  // stage 2: combines b0..b3 (rows 0-3)
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;  // stage 2: combines b4..b7 (rows 4-7)
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;  // stage 3: outputs land at indices 0,7,3,4,2,6,1,5 in that order
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// 8x8 Hadamard transform. hadamard_col8() runs on each of the 8 source
+// columns, then on each stride-8 column of the intermediate buffer, and the
+// result is stored to coeff transposed.
+void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                        tran_low_t *coeff) {
+  int16_t pass1[64];
+  int16_t pass2[64];
+
+  // First pass. src_diff: 9 bit, dynamic range [-255, 255].
+  for (int c = 0; c < 8; ++c) {
+    hadamard_col8(src_diff + c, src_stride, pass1 + 8 * c);
+  }
+
+  // Second pass. pass1: 12 bit, dynamic range [-2040, 2040];
+  // pass2: 15 bit, dynamic range [-16320, 16320].
+  for (int c = 0; c < 8; ++c) {
+    hadamard_col8(pass1 + c, 8, pass2 + 8 * c);
+  }
+
+  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_8x8_sse2).
+  for (int k = 0; k < 64; ++k) {
+    const int row = k >> 3;
+    const int col = k & 7;
+    coeff[k] = (tran_low_t)pass2[col * 8 + row];
+  }
+}
+
+// Low-precision 8x8 Hadamard transform with int16_t output.
+// hadamard_col8() is applied to each of the 8 source columns (row stride
+// src_stride) and then to each stride-8 column of the intermediate buffer.
+// The 64 results are written to coeff transposed.
+void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                           int16_t *coeff) {
+  int16_t buffer[64];
+  int16_t buffer2[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (int idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (int idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
+    // dynamic range [-2040, 2040]
+    // buffer2: 15 bit
+    // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+
+  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_lp_8x8_sse2).
+  // This loop writes all 64 entries of coeff, so a separate straight copy of
+  // buffer2 beforehand would be a dead store and is not performed.
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      coeff[i * 8 + j] = buffer2[j * 8 + i];
+    }
+  }
+}
+
+// Runs aom_hadamard_lp_8x8_c() on two horizontally adjacent 8x8 blocks
+// (column offsets 0 and 8), writing 64 coefficients per block back to back.
+void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  aom_hadamard_lp_8x8_c(src_diff, src_stride, coeff);
+  aom_hadamard_lp_8x8_c(src_diff + 8, src_stride, coeff + 64);
+}
+
+// In place 16x16 2D Hadamard transform.
+// Four 8x8 transforms fill coeff[0..255] (64 coefficients per quadrant), a
+// halving butterfly then combines the quadrants element by element, and
+// finally two 4-wide column groups are swapped in every row of 16.
+void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                          tran_low_t *coeff) {
+  for (int blk = 0; blk < 4; ++blk) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    const int16_t *src_ptr = src_diff + blk_row * 8 * src_stride + blk_col * 8;
+    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + blk * 64);
+  }
+
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (int k = 0; k < 64; ++k) {
+    const tran_low_t a0 = coeff[k];
+    const tran_low_t a1 = coeff[k + 64];
+    const tran_low_t a2 = coeff[k + 128];
+    const tran_low_t a3 = coeff[k + 192];
+
+    // (a0 + a1): 16 bit, [-32640, 32640]
+    // b0-b3: 15 bit, dynamic range [-16320, 16320]
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
+
+    coeff[k] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[k + 64] = b1 + b3;
+    coeff[k + 128] = b0 - b2;
+    coeff[k + 192] = b1 - b3;
+  }
+
+  // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2).
+  // Note that to match SSE2 output, it does not need this step.
+  for (int row = 0; row < 16; ++row) {
+    tran_low_t *line = coeff + row * 16;
+    for (int j = 4; j < 8; ++j) {
+      const tran_low_t temp = line[j];
+      line[j] = line[j + 4];
+      line[j + 4] = temp;
+    }
+  }
+}
+
+// Low-precision (int16_t) 16x16 Hadamard transform. Four 8x8 low-precision
+// transforms fill coeff[0..255], then a halving butterfly combines the four
+// quadrants element by element.
+void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                             int16_t *coeff) {
+  for (int blk = 0; blk < 4; ++blk) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    const int16_t *src_ptr = src_diff + blk_row * 8 * src_stride + blk_col * 8;
+    aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + blk * 64);
+  }
+
+  for (int k = 0; k < 64; ++k) {
+    const int16_t a0 = coeff[k];
+    const int16_t a1 = coeff[k + 64];
+    const int16_t a2 = coeff[k + 128];
+    const int16_t a3 = coeff[k + 192];
+
+    // (a0 + a1): 16 bit, [-32640, 32640]
+    // b0-b3: 15 bit, dynamic range [-16320, 16320]
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    coeff[k] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[k + 64] = b1 + b3;
+    coeff[k + 128] = b0 - b2;
+    coeff[k + 192] = b1 - b3;
+  }
+}
+
+// 32x32 Hadamard transform. Four 16x16 transforms fill coeff[0..1023] (256
+// coefficients per quadrant), then a butterfly with a right shift of 2
+// combines the four quadrants element by element.
+void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                          tran_low_t *coeff) {
+  for (int blk = 0; blk < 4; ++blk) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    const int16_t *src_ptr =
+        src_diff + blk_row * 16 * src_stride + blk_col * 16;
+    aom_hadamard_16x16_c(src_ptr, src_stride, coeff + blk * 256);
+  }
+
+  // coeff: 16 bit, dynamic range [-32768, 32767]
+  for (int k = 0; k < 256; ++k) {
+    const tran_low_t a0 = coeff[k];
+    const tran_low_t a1 = coeff[k + 256];
+    const tran_low_t a2 = coeff[k + 512];
+    const tran_low_t a3 = coeff[k + 768];
+
+    // (a0 + a1): 17 bit, [-65536, 65535]
+    // b0-b3: 15 bit, dynamic range [-16384, 16383]
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    coeff[k] = b0 + b2;  // 16 bit, [-32768, 32767]
+    coeff[k + 256] = b1 + b3;
+    coeff[k + 512] = b0 - b2;
+    coeff[k + 768] = b1 - b3;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int16_t *coeff) {  // 8-point column transform over src_diff[k * src_stride], k = 0..7, kept in int16_t
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1: sum of rows 0,1
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];  // stage 1: difference of rows 0,1
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;  // stage 2: combines b0..b3 (rows 0-3)
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;  // stage 2: combines b4..b7 (rows 4-7)
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;  // stage 3: outputs land at indices 0,7,3,4,2,6,1,5 in that order
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// Second-pass 8-point column transform. src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit, so all intermediates and outputs are int32_t here.
+static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int32_t *coeff) {
+ int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1: sum of rows 0,1
+ int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];  // stage 1: difference of rows 0,1
+ int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int32_t c0 = b0 + b2;  // stage 2: combines b0..b3 (rows 0-3)
+ int32_t c1 = b1 + b3;
+ int32_t c2 = b0 - b2;
+ int32_t c3 = b1 - b3;
+ int32_t c4 = b4 + b6;  // stage 2: combines b4..b7 (rows 4-7)
+ int32_t c5 = b5 + b7;
+ int32_t c6 = b4 - b6;
+ int32_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;  // stage 3: same output permutation as the first pass (0,7,3,4,2,6,1,5)
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// High-bitdepth 8x8 Hadamard transform.
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped, so the second
+// pass output is copied to coeff without transposing.
+void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                               tran_low_t *coeff) {
+  int16_t pass1[64];
+  int32_t pass2[64];
+
+  for (int c = 0; c < 8; ++c) {
+    // src_diff: 13 bit
+    // pass1: 16 bit, dynamic range [-32760, 32760]
+    hadamard_highbd_col8_first_pass(src_diff + c, src_stride, pass1 + 8 * c);
+  }
+
+  for (int c = 0; c < 8; ++c) {
+    // pass1: 16 bit
+    // pass2: 19 bit, dynamic range [-262080, 262080]
+    hadamard_highbd_col8_second_pass(pass1 + c, 8, pass2 + 8 * c);
+  }
+
+  for (int k = 0; k < 64; ++k) {
+    coeff[k] = (tran_low_t)pass2[k];
+  }
+}
+
+// In place 16x16 2D Hadamard transform (high bitdepth).
+// Four 8x8 high-bitdepth transforms fill coeff[0..255], then a halving
+// butterfly combines the four quadrants element by element.
+void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  for (int blk = 0; blk < 4; ++blk) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    const int16_t *src_ptr = src_diff + blk_row * 8 * src_stride + blk_col * 8;
+    aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + blk * 64);
+  }
+
+  // coeff: 19 bit, dynamic range [-262080, 262080]
+  for (int k = 0; k < 64; ++k) {
+    const tran_low_t a0 = coeff[k];
+    const tran_low_t a1 = coeff[k + 64];
+    const tran_low_t a2 = coeff[k + 128];
+    const tran_low_t a3 = coeff[k + 192];
+
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
+
+    // new coeff dynamic range: 20 bit
+    coeff[k] = b0 + b2;
+    coeff[k + 64] = b1 + b3;
+    coeff[k + 128] = b0 - b2;
+    coeff[k + 192] = b1 - b3;
+  }
+}
+
+// High-bitdepth 32x32 Hadamard transform. Four 16x16 high-bitdepth transforms
+// fill coeff[0..1023], then a butterfly with a right shift of 2 combines the
+// four quadrants element by element.
+void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  for (int blk = 0; blk < 4; ++blk) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    const int16_t *src_ptr =
+        src_diff + blk_row * 16 * src_stride + blk_col * 16;
+    aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + blk * 256);
+  }
+
+  // coeff: 20 bit
+  for (int k = 0; k < 256; ++k) {
+    const tran_low_t a0 = coeff[k];
+    const tran_low_t a1 = coeff[k + 256];
+    const tran_low_t a2 = coeff[k + 512];
+    const tran_low_t a3 = coeff[k + 768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    // new coeff dynamic range: 20 bit
+    coeff[k] = b0 + b2;
+    coeff[k + 256] = b1 + b3;
+    coeff[k + 512] = b0 - b2;
+    coeff[k + 768] = b1 - b3;
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Sum of absolute values of `length` transform coefficients.
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
+int aom_satd_c(const tran_low_t *coeff, int length) {
+  int total = 0;
+  const tran_low_t *const end = coeff + length;
+  for (const tran_low_t *p = coeff; p < end; ++p) {
+    total += abs(*p);
+  }
+
+  // total: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+  return total;
+}
+
+// Sum of absolute values of `length` int16_t coefficients.
+int aom_satd_lp_c(const int16_t *coeff, int length) {
+  int total = 0;
+  const int16_t *const end = coeff + length;
+  for (const int16_t *p = coeff; p < end; ++p) {
+    total += abs(*p);
+  }
+
+  // total: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return total;
+}
+
+// Integer projection onto row vectors: hbuf[x] receives the sum of column x
+// of ref over `height` rows, shifted right by norm_factor.
+// height: value range {16, 32, 64, 128}.
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
+                       const int width, const int height, int norm_factor) {
+  assert(height >= 2);
+  for (int x = 0; x < width; ++x) {
+    int16_t *const out = hbuf + x;
+    const uint8_t *const column = ref + x;
+    *out = 0;
+    // *out: 14 bit, dynamic range [0, 32640].
+    for (int y = 0; y < height; ++y) {
+      *out += column[y * ref_stride];
+    }
+    // *out: 9 bit, dynamic range [0, 1020].
+    *out >>= norm_factor;
+  }
+}
+
+// Integer projection onto column vectors: vbuf[y] receives the sum of row y
+// of ref over `width` pixels, shifted right by norm_factor.
+// width: value range {16, 32, 64, 128}.
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
+                       const int width, const int height, int norm_factor) {
+  const uint8_t *row = ref;
+  for (int y = 0; y < height; ++y, row += ref_stride) {
+    // row_sum: 14 bit, dynamic range [0, 32640]
+    int16_t row_sum = 0;
+    for (int x = 0; x < width; ++x) {
+      row_sum += row[x];
+    }
+    vbuf[y] = row_sum >> norm_factor;
+  }
+}
+
+// Variance-like measure of (ref - src) over 4 << bwl elements:
+// sum(diff^2) - (sum(diff))^2 / (4 << bwl).
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
+  const int count = 4 << bwl;
+  int sq_sum = 0;
+  int diff_sum = 0;
+
+  for (int k = 0; k < count; ++k) {
+    // delta: dynamic range [-510, 510], 10 bits.
+    const int delta = ref[k] - src[k];
+    diff_sum += delta;       // diff_sum: dynamic range 16 bits.
+    sq_sum += delta * delta;  // sq_sum: dynamic range 26 bits.
+  }
+
+  // (diff_sum * diff_sum): dynamic range 31 bits.
+  // If count == 128, the sum can be 510 * 128 = 65280, and
+  // log2(65280 ** 2) ~= 31.99, so it needs to be casted to unsigned int to
+  // compute its square.
+  const unsigned int abs_sum = abs(diff_sum);
+  const int var = sq_sum - ((abs_sum * abs_sum) >> (bwl + 2));
+  return var;
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
new file mode 100644
index 0000000000..ee0ce62278
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/recenter.h"
+
+// Reads a quasi-uniformly coded value for an alphabet of size n. Returns 0
+// without reading when n <= 1. Otherwise reads l - 1 bits (l = get_msb(n) + 1);
+// values below m = 2^l - n are returned directly, larger ones consume one
+// extra bit.
+uint16_t aom_read_primitive_quniform_(aom_reader *r,
+                                      uint16_t n ACCT_STR_PARAM) {
+  if (n <= 1) return 0;
+
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
+  if (v < m) return v;
+
+  return (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
+}
+
+// Decode finite subexponential code that for a symbol v in [0, n-1] with
+// parameter k. Each step covers a = 2^b values starting at mk (b = k on the
+// first step, k + i - 1 afterwards). The tail is read quasi-uniformly once
+// n <= mk + 3 * a; otherwise a 0 bit selects a b-bit literal in the current
+// range and a 1 bit moves on to the next range.
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+                                       uint16_t k ACCT_STR_PARAM) {
+  int mk = 0;
+
+  for (int i = 0;; ++i) {
+    const int b = (i != 0) ? k + i - 1 : k;
+    const int a = 1 << b;
+
+    if (n <= mk + 3 * a) {
+      return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
+    }
+
+    if (!aom_read_bit(r, ACCT_STR_NAME)) {
+      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
+    }
+
+    mk += a;
+  }
+
+  assert(0);
+  return 0;
+}
+
+// Reads a finite subexponential code and maps it back through
+// inv_recenter_finite_nonneg() using the reference value ref.
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+                                          uint16_t ref ACCT_STR_PARAM) {
+  const uint16_t recentered =
+      aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME);
+  return inv_recenter_finite_nonneg(n, ref, recentered);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
new file mode 100644
index 0000000000..d218f0619f
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+
+#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
+ aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))  // forwards to the `_` function; ACCT_STR_NAME goes through ACCT_STR_ARG()
+#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
+ aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))  // same forwarding pattern, with parameter k
+#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
+ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))  // same forwarding pattern, with k and ref
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);  // reader for a quasi-uniform code over n symbols
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+ uint16_t k ACCT_STR_PARAM);  // reader for a finite subexponential code with parameter k
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref ACCT_STR_PARAM);  // as above, taking an additional reference value ref
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
new file mode 100644
index 0000000000..55ce8429d7
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is the number of bits used for the magnitude. The alphabet is of
+// size 2 * 2^mag_bits + 1, symmetric around 0. One bit signals zero vs.
+// non-zero; for a non-zero symbol one sign bit (1 = negative) follows, then
+// mag_bits bits holding |v| - 1.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+                                   unsigned int mag_bits) {
+  if (v == 0) {
+    aom_write_bit(w, 0);
+  } else {
+    const int x = abs(v);
+    const int s = v < 0;
+    aom_write_bit(w, 1);
+    aom_write_bit(w, s);
+    // Non-zero magnitudes start at 1, so x - 1 is what gets stored.
+    aom_write_literal(w, x - 1, mag_bits);
+  }
+}
+
+// Number of bits aom_write_primitive_symmetric() emits for v: 1 when v is
+// zero, otherwise 1 (non-zero flag) + 1 (sign) + mag_bits (magnitude).
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits) {
+  return (v == 0 ? 1 : mag_bits + 2);
+}
+
+// Quasi-uniform code for a value v in [0, n-1]. With l = get_msb(n) + 1 and
+// m = 2^l - n, values below m are sent with l - 1 bits; every other value
+// takes l bits (an (l - 1)-bit prefix followed by one extra bit).
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+  if (n > 1) {
+    const int num_bits = get_msb(n) + 1;
+    const int num_short = (1 << num_bits) - n;
+    if (v >= num_short) {
+      const int excess = v - num_short;
+      aom_write_literal(w, num_short + (excess >> 1), num_bits - 1);
+      aom_write_bit(w, excess & 1);
+    } else {
+      aom_write_literal(w, v, num_bits - 1);
+    }
+  }
+}
+
+// Bit count of the quasi-uniform code for v in [0, n-1]: nothing when n <= 1,
+// otherwise l - 1 bits for v below m = 2^l - n and l bits for the rest, where
+// l = get_msb(n) + 1.
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+  if (n <= 1) return 0;
+  const int num_bits = get_msb(n) + 1;
+  const int num_short = (1 << num_bits) - n;
+  if (v < num_short) return num_bits - 1;
+  return num_bits;
+}
+
+// Finite subexponential code for a symbol v in [0, n-1] with parameter k.
+// Intervals of width 2^k, 2^k, 2^(k+1), 2^(k+2), ... are visited in turn,
+// with mk the start of the current interval and a its width. A 1 bit skips
+// past the interval; a 0 bit is followed by the b-bit offset of v inside it.
+// Once n <= mk + 3 * a, the remainder is coded quasi-uniformly instead.
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                   uint16_t v) {
+  int mk = 0;
+  for (int i = 0;; ++i) {
+    const int b = (i == 0) ? k : k + i - 1;
+    const int a = 1 << b;
+    if (n <= mk + 3 * a) {
+      aom_write_primitive_quniform(w, n - mk, v - mk);
+      return;
+    }
+    const int above = (v >= mk + a);
+    aom_write_bit(w, above);
+    if (!above) {
+      aom_write_literal(w, v - mk, b);
+      return;
+    }
+    mk += a;
+  }
+}
+
+// Bit count of the finite subexponential code for v in [0, n-1] with
+// parameter k. Walks the same intervals as aom_write_primitive_subexpfin()
+// but only tallies bits.
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+  int count = 0;
+  int mk = 0;
+  for (int i = 0;; ++i) {
+    const int b = (i == 0) ? k : k + i - 1;
+    const int a = 1 << b;
+    if (n <= mk + 3 * a) {
+      return count + aom_count_primitive_quniform(n - mk, v - mk);
+    }
+    ++count;  // the skip / stay flag bit
+    if (v < mk + a) return count + b;
+    mk += a;
+  }
+}
+
+// Finite subexponential code for a symbol v in [0, n-1] with parameter k,
+// predicted from a reference ref that is also in [0, n-1]. The symbol is
+// first recentered around ref with recenter_finite_nonneg(); the result is
+// then written with the plain finite subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v) {
+  const uint16_t recentered = recenter_finite_nonneg(n, ref, v);
+  aom_write_primitive_subexpfin(w, n, k, recentered);
+}
+
+// Signed variant: v and ref are shifted up by n - 1 and written with the
+// unsigned reference code over an alphabet of 2 * n - 1 symbols.
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
+  const uint16_t scaled_n = (n << 1) - 1;
+  v += n - 1;
+  ref += n - 1;
+  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+// Bit count of the reference-based finite subexponential code: v is
+// recentered around ref, then counted as a plain finite subexponential code.
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+                                     uint16_t v) {
+  const uint16_t recentered = recenter_finite_nonneg(n, ref, v);
+  return aom_count_primitive_subexpfin(n, k, recentered);
+}
+
+// Bit count of the signed reference-based code: v and ref are shifted up by
+// n - 1 and counted over an alphabet of 2 * n - 1 symbols.
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+                                            int16_t v) {
+  const uint16_t scaled_n = (n << 1) - 1;
+  v += n - 1;
+  ref += n - 1;
+  return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
new file mode 100644
index 0000000000..5ec8662139
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int mag_bits);
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
+// parameter k based on a reference ref also in [-(n-1), n-1].
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v);
+
+// Functions that count bits for the above primitives
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
+int aom_count_primitive_quniform(uint16_t n, uint16_t v);
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v);
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.c b/third_party/aom/aom_dsp/bitreader.c
new file mode 100644
index 0000000000..4c70a91712
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitreader.h"
+
+// Points the reader at the `size` bytes starting at `buffer` and initializes
+// the entropy decoder on them. Returns 1 when a non-zero size comes with a
+// NULL buffer, 0 otherwise.
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) {
+  if (buffer == NULL && size != 0) return 1;
+
+  r->buffer = buffer;
+  r->buffer_end = buffer + size;
+  od_ec_dec_init(&r->ec, buffer, (uint32_t)size);
+#if CONFIG_ACCOUNTING
+  r->accounting = NULL;
+#endif
+  return 0;
+}
+
+// Returns the start of the buffer stored by aom_reader_init().
+const uint8_t *aom_reader_find_begin(aom_reader *r) {
+  return r->buffer;
+}
+
+// Returns the one-past-the-end pointer stored by aom_reader_init().
+const uint8_t *aom_reader_find_end(aom_reader *r) {
+  return r->buffer_end;
+}
+
+// Returns the entropy decoder position as reported by od_ec_dec_tell().
+uint32_t aom_reader_tell(const aom_reader *r) {
+  return od_ec_dec_tell(&r->ec);
+}
+
+// Returns the entropy decoder position as reported by od_ec_dec_tell_frac().
+uint32_t aom_reader_tell_frac(const aom_reader *r) {
+  const uint32_t position = od_ec_dec_tell_frac(&r->ec);
+  return position;
+}
+
+// Reports whether the decoder position, rounded up to whole bytes, lies past
+// the end of the buffer given to aom_reader_init().
+int aom_reader_has_overflowed(const aom_reader *r) {
+  const ptrdiff_t buffer_bytes = r->buffer_end - r->buffer;
+  const uint32_t bytes_consumed = (aom_reader_tell(r) + 7) >> 3;
+  return (ptrdiff_t)bytes_consumed > buffer_bytes;
+}
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
new file mode 100644
index 0000000000..29321f916e
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+// Optional bit-accounting plumbing. With CONFIG_ACCOUNTING enabled, functions
+// declared with ACCT_STR_PARAM take a trailing `const char *acct_str` label
+// and ACCT_STR_ARG(s) forwards one. Otherwise both expand to nothing, so the
+// label argument disappears from declarations and calls alike.
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+// Reader entry points. Each macro forwards to the underscore-suffixed function
+// of the same name, appending the accounting label through ACCT_STR_ARG.
+// NOTE(review): no aom_read_tree_ is defined in this header - confirm it
+// exists elsewhere or that aom_read_tree is unused.
+#define aom_read(r, prob, ACCT_STR_NAME) \
+ aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+ aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+ aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+ aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// State of one entropy-coded read session.
+struct aom_reader {
+ const uint8_t *buffer;  // start of the input, set by aom_reader_init()
+ const uint8_t *buffer_end;  // buffer + size, set by aom_reader_init()
+ od_ec_dec ec;  // entropy decoder state
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;  // NULL disables accounting records
+#endif
+ uint8_t allow_update_cdf;  // non-zero: aom_read_symbol_() adapts the CDF
+};
+
+typedef struct aom_reader aom_reader;
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size);
+
+const uint8_t *aom_reader_find_begin(aom_reader *r);
+
+const uint8_t *aom_reader_find_end(aom_reader *r);
+
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_reader_has_overflowed(const aom_reader *r);
+
+// Returns the position in the bit reader in bits.
+uint32_t aom_reader_tell(const aom_reader *r);
+
+// Returns the position in the bit reader in 1/8th bits.
+uint32_t aom_reader_tell_frac(const aom_reader *r);
+
+#if CONFIG_ACCOUNTING
+// Records, under the given label, how far the decoder position (in the units
+// of aom_reader_tell_frac()) has advanced since the previous record. Does
+// nothing when no Accounting object is attached.
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+  if (r->accounting == NULL) return;
+
+  const uint32_t tell_frac = aom_reader_tell_frac(r);
+  aom_accounting_record(r->accounting, ACCT_STR_NAME,
+                        tell_frac - r->accounting->last_tell_frac);
+  r->accounting->last_tell_frac = tell_frac;
+}
+
+// Bumps the accounting symbol counters: num_binary_syms for a binary symbol,
+// num_multi_syms otherwise. Does nothing when no Accounting object is
+// attached.
+static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
+  if (r->accounting == NULL) return;
+
+  if (is_binary) {
+    r->accounting->syms.num_binary_syms += 1;
+  } else {
+    r->accounting->syms.num_multi_syms += 1;
+  }
+}
+#endif
+
+// Decodes one binary symbol. `prob` is rescaled to the 15-bit probability
+// handed to od_ec_decode_bool_q15() (prob = 128 maps to 16384).
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+ int bit = od_ec_decode_bool_q15(&r->ec, p);
+
+#if CONFIG_BITSTREAM_DEBUG
+ // Cross-check against the next entry of the bitstream debug queue: it must
+ // be a 2-symbol CDF equal to {p, 32767} and carry the same bit.
+ {
+ int i;
+ int ref_bit, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = aom_bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
+ if (ref_nsymbs != 2) {
+ fprintf(stderr,
+ "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
+ "%d queue_r %d\n",
+ frame_idx, 2, ref_nsymbs, queue_r);
+ assert(0);
+ }
+ // The nsymbs test is repeated so this still reports when assert() is
+ // compiled out.
+ if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
+ (ref_cdf[1] != 32767)) {
+ fprintf(stderr,
+ "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
+ frame_idx, p, 32767, ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (bit != ref_bit) {
+ fprintf(stderr,
+ "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
+ "queue_r %d\n",
+ frame_idx, bit, ref_bit, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+#if CONFIG_ACCOUNTING
+ // A NULL label skips the record but the symbol is still counted as binary.
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, 1);
+#endif
+ return bit;
+}
+
+// Reads one bit coded with probability 128 (aom_prob_half).
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+  // The inner read gets a NULL label so the bit is recorded once, below.
+  const int bit = aom_read(r, 128, NULL);  // aom_prob_half
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+  return bit;
+}
+
+// Reads a `bits`-wide unsigned value, most significant bit first, one
+// aom_read_bit() at a time.
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
+  int literal = 0;
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    literal |= aom_read_bit(r, NULL) << shift;
+  }
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+  return literal;
+}
+
+// Decodes one symbol out of `nsymbs` using the given CDF, without adapting
+// the CDF. Returns the decoded symbol.
+static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int symb;
+ assert(cdf != NULL);
+ symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
+
+#if CONFIG_BITSTREAM_DEBUG
+ // Cross-check against the next entry of the bitstream debug queue: symbol
+ // count, every CDF entry and the decoded symbol must all match.
+ {
+ int i;
+ int cdf_error = 0;
+ int ref_symb, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = aom_bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
+ if (nsymbs != ref_nsymbs) {
+ fprintf(stderr,
+ "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
+ "queue_r %d\n",
+ frame_idx, nsymbs, ref_nsymbs, queue_r);
+ cdf_error = 0;
+ assert(0);
+ } else {
+ // Only compare entries when both sides agree on the symbol count.
+ for (i = 0; i < nsymbs; ++i)
+ if (cdf[i] != ref_cdf[i]) cdf_error = 1;
+ }
+ if (cdf_error) {
+ fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
+ cdf[0]);
+ for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
+ fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (symb != ref_symb) {
+ fprintf(
+ stderr,
+ "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+ frame_idx, symb, ref_symb, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+#if CONFIG_ACCOUNTING
+ // A NULL label skips the record; the symbol is counted as binary only when
+ // nsymbs == 2.
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+ return symb;
+}
+
+// Decodes one symbol with aom_read_cdf() and, when r->allow_update_cdf is
+// set, adapts the CDF to the decoded symbol through update_cdf().
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+                                   int nsymbs ACCT_STR_PARAM) {
+  const int symb = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+  if (r->allow_update_cdf) {
+    update_cdf(cdf, symb, nsymbs);
+  }
+  return symb;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
new file mode 100644
index 0000000000..d79feea6a3
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+// Number of bytes touched so far: the bit offset rounded up to whole bytes.
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
+  const uint32_t rounded_up_bits = rb->bit_offset + 7;
+  return rounded_up_bits >> 3;
+}
+
+// Reads the next bit (MSB-first within each byte) and advances the offset.
+// At the end of the buffer the error handler, if any, is invoked, the offset
+// is left unchanged and 0 is returned.
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+  const uint32_t bit_pos = rb->bit_offset;
+  const uint32_t byte_idx = bit_pos >> 3;
+  if (rb->bit_buffer + byte_idx >= rb->bit_buffer_end) {
+    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+    return 0;
+  }
+  const int shift = 7 - (int)(bit_pos & 0x7);
+  rb->bit_offset = bit_pos + 1;
+  return (rb->bit_buffer[byte_idx] >> shift) & 1;
+}
+
+// Reads a `bits`-wide value (at most 31 bits), most significant bit first.
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  assert(bits <= 31);
+  int value = 0;
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    value |= aom_rb_read_bit(rb) << shift;
+  }
+  return value;
+}
+
+// Reads a `bits`-wide unsigned value (at most 32 bits), most significant bit
+// first.
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+                                      int bits) {
+  assert(bits <= 32);
+  uint32_t value = 0;
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    const uint32_t bit = (uint32_t)aom_rb_read_bit(rb);
+    value |= bit << shift;
+  }
+  return value;
+}
+
+// Reads a (bits + 1)-bit field and sign-extends it to int: the field is
+// shifted up until its top bit sits in the top bit of `unsigned`, then
+// shifted back down as a signed value.
+// NOTE(review): relies on >> of a negative int being an arithmetic shift,
+// which C leaves implementation-defined.
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+ const int nbits = sizeof(unsigned) * 8 - bits - 1;
+ const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+ return ((int)value) >> nbits;
+}
+
+// Reads a variable-length unsigned code: a run of z zero bits, a terminating
+// one bit, then z value bits; the result is 2^z - 1 plus the value bits.
+// A run of 32 zero bits returns UINT32_MAX without reading further.
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+  int leading_zeros;
+  for (leading_zeros = 0; leading_zeros < 32; ++leading_zeros) {
+    if (aom_rb_read_bit(rb)) break;
+  }
+  // Maximum 32 bits.
+  if (leading_zeros == 32) return UINT32_MAX;
+  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+  const uint32_t base = (1u << leading_zeros) - 1;
+  return base + value;
+}
+
+// Reads a quasi-uniformly coded value in [0, n-1]. With l = get_msb(n) + 1
+// and m = 2^l - n, an (l - 1)-bit prefix below m is the value itself;
+// otherwise one more bit is read to complete it.
+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+                                               uint16_t n) {
+  if (n <= 1) return 0;
+  const int num_bits = get_msb(n) + 1;
+  const int num_short = (1 << num_bits) - n;
+  const int prefix = aom_rb_read_literal(rb, num_bits - 1);
+  if (prefix < num_short) return prefix;
+  return (prefix << 1) - num_short + aom_rb_read_bit(rb);
+}
+
+// Reads a finite subexponential code for a symbol in [0, n-1] with parameter
+// k. Intervals of width 2^k, 2^k, 2^(k+1), ... are visited in turn, with mk
+// the start of the current interval and a its width.
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+ uint16_t n, uint16_t k) {
+ int i = 0;
+ int mk = 0;
+
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+
+ // Tail of the alphabet: the remainder is coded quasi-uniformly.
+ if (n <= mk + 3 * a) {
+ return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+ }
+
+ // A 0 flag means the symbol is in this interval; its b-bit offset follows.
+ if (!aom_rb_read_bit(rb)) {
+ return aom_rb_read_literal(rb, b) + mk;
+ }
+
+ i = i + 1;
+ mk += a;
+ }
+
+ // Not reached: every path out of the loop above returns.
+ assert(0);
+ return 0;
+}
+
+// Reads a finite subexponential code and undoes the recentering around ref
+// with inv_recenter_finite_nonneg().
+static uint16_t aom_rb_read_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+  const uint16_t recentered = aom_rb_read_primitive_subexpfin(rb, n, k);
+  return inv_recenter_finite_nonneg(n, ref, recentered);
+}
+
+// Signed variant: ref is shifted up by n - 1, the symbol is read with the
+// unsigned reference code over 2 * n - 1 symbols, then shifted back down by
+// n - 1.
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+  const uint16_t scaled_n = (n << 1) - 1;
+  ref += n - 1;
+  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
new file mode 100644
index 0000000000..359fbe5194
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+// Cursor over a read-only byte buffer that is consumed one bit at a time.
+struct aom_read_bit_buffer {
+ const uint8_t *bit_buffer;  // first byte of the data
+ const uint8_t *bit_buffer_end;  // reads at or past this address fail
+ uint32_t bit_offset;  // index of the next bit, counted from bit_buffer
+
+ void *error_handler_data;  // argument passed to error_handler
+ aom_rb_error_handler error_handler;  // if set, called on a failed read
+};
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.c b/third_party/aom/aom_dsp/bitwriter.c
new file mode 100644
index 0000000000..4c27bb1fc3
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/bitwriter.h"
+
+// Starts an encode session: remembers the destination buffer, resets the
+// output position and initializes the entropy encoder.
+// NOTE(review): 62025 is the second argument of od_ec_enc_init(), presumably
+// an initial size - confirm against entenc.
+void aom_start_encode(aom_writer *w, uint8_t *source) {
+  w->pos = 0;
+  w->buffer = source;
+  od_ec_enc_init(&w->ec, 62025);
+}
+
+// Finishes the encode session: copies the encoder output into w->buffer, sets
+// w->pos to the byte count and releases the encoder. Returns the value of
+// od_ec_enc_tell(), or -1 if od_ec_enc_done() produced no data.
+int aom_stop_encode(aom_writer *w) {
+  uint32_t bytes;
+  unsigned char *const data = od_ec_enc_done(&w->ec, &bytes);
+  int nb_bits = -1;
+  if (data != NULL) {
+    nb_bits = od_ec_enc_tell(&w->ec);
+    memcpy(w->buffer, data, bytes);
+    w->pos = bytes;
+  }
+  od_ec_enc_clear(&w->ec);
+  return nb_bits;
+}
+
+// Returns the current value of od_ec_enc_tell() for this writer.
+int aom_tell_size(aom_writer *w) {
+  return od_ec_enc_tell(&w->ec);
+}
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
new file mode 100644
index 0000000000..6aedd8ceb9
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// State of one entropy-coded write session.
+struct aom_writer {
+ unsigned int pos;  // 0 after aom_start_encode(); bytes after aom_stop_encode()
+ uint8_t *buffer;  // destination that aom_stop_encode() copies into
+ od_ec_enc ec;  // entropy encoder state
+ uint8_t allow_update_cdf;  // non-zero: aom_write_symbol() adapts the CDF
+};
+
+typedef struct aom_writer aom_writer;
+
+// Cost accumulator reset by init_token_stats(). The per-position map is only
+// present in CONFIG_RD_DEBUG builds.
+typedef struct TOKEN_STATS {
+ int cost;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+} TOKEN_STATS;
+
+// Resets a TOKEN_STATS to zero, including the cost map when CONFIG_RD_DEBUG
+// is enabled.
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+  token_stats->cost = 0;
+#if CONFIG_RD_DEBUG
+  for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+    for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+      token_stats->txb_coeff_cost_map[row][col] = 0;
+    }
+  }
+#endif
+}
+
+void aom_start_encode(aom_writer *w, uint8_t *buffer);
+
+// Returns a negative number on error. Caller must check the return value and
+// handle error.
+int aom_stop_encode(aom_writer *w);
+
+int aom_tell_size(aom_writer *w);
+
+// Encodes one binary symbol. `probability` is rescaled to the 15-bit value
+// handed to od_ec_encode_bool_q15() (probability = 128 maps to 16384).
+static INLINE void aom_write(aom_writer *w, int bit, int probability) {
+ int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+ // Log the bit as a 2-symbol CDF so the reader can cross-check it.
+ aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+ bitstream_queue_push(bit, cdf, 2);
+#endif
+
+ od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+// Writes one bit coded with probability 128 (aom_prob_half).
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+  const int half_probability = 128;  // aom_prob_half
+  aom_write(w, bit, half_probability);
+}
+
+// Writes the low `bits` bits of `data`, most significant bit first.
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    aom_write_bit(w, 1 & (data >> shift));
+  }
+}
+
+// Encodes symbol `symb` out of `nsymbs` using the given CDF, without adapting
+// the CDF. In CONFIG_BITSTREAM_DEBUG builds the symbol is first pushed onto
+// the bitstream debug queue.
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+                                 const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+// Encodes one symbol with aom_write_cdf() and, when w->allow_update_cdf is
+// set, adapts the CDF to that symbol through update_cdf().
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                    int nsymbs) {
+  aom_write_cdf(w, symb, cdf, nsymbs);
+  if (w->allow_update_cdf) {
+    update_cdf(cdf, symb, nsymbs);
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 0000000000..7d0ab9486a
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+// Returns 1 when the write position sits on a byte boundary, 0 otherwise.
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+  const uint32_t bits_into_byte = wb->bit_offset % CHAR_BIT;
+  return bits_into_byte == 0;
+}
+
+// Number of bytes touched so far: whole bytes plus one for a partial byte.
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+  const uint32_t whole_bytes = wb->bit_offset / CHAR_BIT;
+  const uint32_t has_partial_byte = (wb->bit_offset % CHAR_BIT > 0);
+  return whole_bytes + has_partial_byte;
+}
+
+// Appends one bit at wb->bit_offset (MSB-first within each byte) and advances
+// the offset. Writing the first bit of a byte overwrites the whole byte, which
+// clears its remaining bits.
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int shift = CHAR_BIT - 1 - off % CHAR_BIT;
+  uint8_t *const byte = &wb->bit_buffer[off / CHAR_BIT];
+  if (shift == CHAR_BIT - 1) {
+    // Zero next char and write bit
+    *byte = bit << shift;
+  } else {
+    *byte &= ~(1 << shift);
+    *byte |= bit << shift;
+  }
+  wb->bit_offset = off + 1;
+}
+
+// Replaces the bit at wb->bit_offset and advances the offset. Unlike
+// aom_wb_write_bit(), bytes are never zeroed: all other bits of the byte keep
+// their existing values.
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int shift = CHAR_BIT - 1 - off % CHAR_BIT;
+  uint8_t *const byte = &wb->bit_buffer[off / CHAR_BIT];
+  *byte &= ~(1 << shift);
+  *byte |= bit << shift;
+  wb->bit_offset = off + 1;
+}
+
+// Writes the low `bits` bits of `data` (at most 31), most significant bit
+// first.
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  assert(bits <= 31);
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    aom_wb_write_bit(wb, (data >> shift) & 1);
+  }
+}
+
+// Writes the low `bits` bits of the unsigned `data` (at most 32), most
+// significant bit first.
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits) {
+  assert(bits <= 32);
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    aom_wb_write_bit(wb, (data >> shift) & 1);
+  }
+}
+
+// Overwrites `bits` bits with the low bits of `data`, most significant bit
+// first, using aom_wb_overwrite_bit() for each one.
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+                              int bits) {
+  for (int shift = bits - 1; shift >= 0; --shift) {
+    aom_wb_overwrite_bit(wb, (data >> shift) & 1);
+  }
+}
+
+// Writes a signed value as a (bits + 1)-bit field taken from the low bits of
+// `data`.
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+                                     int bits) {
+  const int field_width = bits + 1;
+  aom_wb_write_literal(wb, data, field_width);
+}
+
+// Writes v as a variable-length unsigned code: with m the index of the top
+// set bit of v + 1, m zero bits are written, followed by v + 1 in m + 1 bits.
+// v == UINT32_MAX cannot be coded: ++v wraps to 0 and the assert fires.
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+ int64_t shift_val = ++v;
+ int leading_zeroes = 1;
+
+ assert(shift_val > 0);
+
+ // Ends at 2 * m + 1, so the two halvings below give m and m + 1.
+ while (shift_val >>= 1) leading_zeroes += 2;
+
+ aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+ aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
+
+// Quasi-uniform code for a value v in [0, n-1]. With l = get_msb(n) + 1 and
+// m = 2^l - n, values below m are sent with l - 1 bits; every other value
+// takes l bits (an (l - 1)-bit prefix followed by one extra bit).
+static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+                                        uint16_t n, uint16_t v) {
+  if (n > 1) {
+    const int num_bits = get_msb(n) + 1;
+    const int num_short = (1 << num_bits) - n;
+    if (v >= num_short) {
+      const int excess = v - num_short;
+      aom_wb_write_literal(wb, num_short + (excess >> 1), num_bits - 1);
+      aom_wb_write_bit(wb, excess & 1);
+    } else {
+      aom_wb_write_literal(wb, v, num_bits - 1);
+    }
+  }
+}
+
+// Finite subexponential code for a symbol v in [0, n-1] with parameter k.
+// Intervals of width 2^k, 2^k, 2^(k+1), ... are visited in turn, with mk the
+// start of the current interval and a its width. A 1 bit skips past the
+// interval; a 0 bit is followed by the b-bit offset of v inside it. Once
+// n <= mk + 3 * a, the remainder is coded quasi-uniformly instead.
+static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
+                                         uint16_t n, uint16_t k, uint16_t v) {
+  int mk = 0;
+  for (int i = 0;; ++i) {
+    const int b = (i == 0) ? k : k + i - 1;
+    const int a = 1 << b;
+    if (n <= mk + 3 * a) {
+      wb_write_primitive_quniform(wb, n - mk, v - mk);
+      return;
+    }
+    const int above = (v >= mk + a);
+    aom_wb_write_bit(wb, above);
+    if (!above) {
+      aom_wb_write_literal(wb, v - mk, b);
+      return;
+    }
+    mk += a;
+  }
+}
+
+// Recenters v around ref with recenter_finite_nonneg() and writes the result
+// with the finite subexponential code.
+static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                            uint16_t n, uint16_t k,
+                                            uint16_t ref, uint16_t v) {
+  const uint16_t recentered = recenter_finite_nonneg(n, ref, v);
+  wb_write_primitive_subexpfin(wb, n, k, recentered);
+}
+
+// Signed variant: v and ref are shifted up by n - 1 and written with the
+// unsigned reference code over an alphabet of 2 * n - 1 symbols.
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+                                                uint16_t n, uint16_t k,
+                                                int16_t ref, int16_t v) {
+  const uint16_t scaled_n = (n << 1) - 1;
+  v += n - 1;
+  ref += n - 1;
+  wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
new file mode 100644
index 0000000000..fd10e01bb7
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
+#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Cursor over a byte buffer that is filled one bit at a time.
+struct aom_write_bit_buffer {
+ uint8_t *bit_buffer;  // destination bytes
+ uint32_t bit_offset;  // index of the next bit, counted from bit_buffer
+};
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+ uint32_t data, int bits);
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
+
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ int16_t ref, int16_t v);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
new file mode 100644
index 0000000000..fd87dc1810
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BLEND_H_
+#define AOM_AOM_DSP_BLEND_H_
+
+#include "aom_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the aom_blend_* functions in aom_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+
+#define AOM_BLEND_A64_ROUND_BITS 6
+#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
+
+// Computes (a * v0 + (AOM_BLEND_A64_MAX_ALPHA - a) * v1), rounded with
+// ROUND_POWER_OF_TWO by AOM_BLEND_A64_ROUND_BITS.
+// The argument `a` appears twice in the expansion, so it is evaluated twice:
+// pass an expression without side effects.
+#define AOM_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define AOM_BLEND_A256_ROUND_BITS 8
+#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
+
+// Same form as AOM_BLEND_A64 but with 8 rounding bits; `a` is likewise
+// evaluated twice.
+#define AOM_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+// Expands to ROUND_POWER_OF_TWO((v0) + (v1), 1), i.e. the sum of the two
+// inputs rounded by one bit.
+#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+// NOTE(review): DIFF_FACTOR is 1 << DIFF_FACTOR_LOG2 (16); its users are not
+// visible in this header -- confirm the intended meaning at the call sites.
+#define DIFF_FACTOR_LOG2 4
+#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
+
+#endif // AOM_AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000000..e9e38ef969
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blends src0 and src1 into dst with a horizontal mask: mask[] supplies one
+// alpha value per column, and the same alphas are applied to every row.
+// src0 or src1 may alias dst, provided the aliased strides match.
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+                           const uint8_t *src0, uint32_t src0_stride,
+                           const uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const out = dst + row * dst_stride;
+    const uint8_t *const in0 = src0 + row * src0_stride;
+    const uint8_t *const in1 = src1 + row * src1_stride;
+    for (int col = 0; col < w; ++col) {
+      out[col] = AOM_BLEND_A64(mask[col], in0[col], in1[col]);
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth variant of the horizontal-mask blend. The 8-bit pointers
+// are converted with CONVERT_TO_SHORTPTR and the samples are processed as
+// uint16_t. mask[] supplies one alpha per column, reused on every row.
+// bd is only checked by an assert.
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                  const uint8_t *src0_8, uint32_t src0_stride,
+                                  const uint8_t *src1_8, uint32_t src1_stride,
+                                  const uint8_t *mask, int w, int h, int bd) {
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+  (void)bd;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const out = dst + row * dst_stride;
+    const uint16_t *const in0 = src0 + row * src0_stride;
+    const uint16_t *const in1 = src1 + row * src1_stride;
+    for (int col = 0; col < w; ++col) {
+      out[col] = AOM_BLEND_A64(mask[col], in0[col], in1[col]);
+    }
+  }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
new file mode 100644
index 0000000000..35017fd737
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// NOTE(rachelbarker): The input and output of aom_blend_a64_d16_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d16 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
+// Blends one pair of intermediate-precision (d16) samples with alpha m,
+// subtracts round_offset, then rounds by round_bits and passes the result
+// through clip_pixel(). The blend itself uses a plain right shift so that
+// only the final step rounds.
+static inline uint8_t lowbd_d16_blend_px(int m, CONV_BUF_TYPE s0,
+                                         CONV_BUF_TYPE s1, int round_offset,
+                                         int round_bits) {
+  int32_t blended =
+      (m * (int32_t)s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)s1) >>
+      AOM_BLEND_A64_ROUND_BITS;
+  blended -= round_offset;
+  return clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
+}
+
+// 8-bit output blend of two CONV_BUF_TYPE sources under an alpha mask.
+// subw / subh select how the mask is sampled:
+//   subw == 0, subh == 0: one mask value per output pixel.
+//   subw == 1, subh == 1: rounded average of a 2x2 group of mask values.
+//   subw == 1, subh == 0: rounded average of two horizontally adjacent values.
+//   otherwise:            rounded average of two vertically adjacent values.
+void aom_lowbd_blend_a64_d16_mask_c(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = m_row[col];
+        out[col] =
+            lowbd_d16_blend_px(m, in0[col], in1[col], round_offset, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = ROUND_POWER_OF_TWO(m_top[2 * col] + m_bot[2 * col] +
+                                             m_top[2 * col + 1] +
+                                             m_bot[2 * col + 1],
+                                         2);
+        out[col] =
+            lowbd_d16_blend_px(m, in0[col], in1[col], round_offset, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_row[2 * col], m_row[2 * col + 1]);
+        out[col] =
+            lowbd_d16_blend_px(m, in0[col], in1[col], round_offset, round_bits);
+      }
+    }
+  } else {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_top[col], m_bot[col]);
+        out[col] =
+            lowbd_d16_blend_px(m, in0[col], in1[col], round_offset, round_bits);
+      }
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Blends one pair of intermediate-precision (d16) samples with alpha m,
+// subtracts round_offset, rounds by round_bits, maps negative results to
+// zero and caps the value at saturation_value.
+static inline uint16_t highbd_d16_blend_px(int m, CONV_BUF_TYPE s0,
+                                           CONV_BUF_TYPE s1, int round_offset,
+                                           int round_bits,
+                                           unsigned int saturation_value) {
+  int32_t blended = (m * s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * s1) >>
+                    AOM_BLEND_A64_ROUND_BITS;
+  blended -= round_offset;
+  const unsigned int v =
+      negative_to_zero(ROUND_POWER_OF_TWO(blended, round_bits));
+  return AOMMIN(v, saturation_value);
+}
+
+// High bit-depth blend of two CONV_BUF_TYPE sources under an alpha mask.
+// dst_8 is converted with CONVERT_TO_SHORTPTR and written as uint16_t.
+// subw / subh select how the mask is sampled:
+//   subw == 0, subh == 0: one mask value per output pixel.
+//   subw == 1, subh == 1: rounded average of a 2x2 group of mask values.
+//   subw == 1, subh == 0: rounded average of two horizontally adjacent values.
+//   otherwise:            rounded average of two vertically adjacent values.
+void aom_highbd_blend_a64_d16_mask_c(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  // Same values as (1 << bd) - 1 for bd of 10 and 12; any other bd is
+  // treated as 8-bit.
+  const unsigned int saturation_value = (bd == 10)   ? 1023
+                                        : (bd == 12) ? 4095
+                                                     : 255;
+
+  if (subw == 0 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = m_row[col];
+        out[col] = highbd_d16_blend_px(m, in0[col], in1[col], round_offset,
+                                       round_bits, saturation_value);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + row * (2 * mask_stride);
+      const uint8_t *const m_bot = m_top + mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = ROUND_POWER_OF_TWO(m_top[2 * col] + m_bot[2 * col] +
+                                             m_top[2 * col + 1] +
+                                             m_bot[2 * col + 1],
+                                         2);
+        out[col] = highbd_d16_blend_px(m, in0[col], in1[col], round_offset,
+                                       round_bits, saturation_value);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_row[2 * col], m_row[2 * col + 1]);
+        out[col] = highbd_d16_blend_px(m, in0[col], in1[col], round_offset,
+                                       round_bits, saturation_value);
+      }
+    }
+  } else {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const CONV_BUF_TYPE *const in0 = src0 + row * src0_stride;
+      const CONV_BUF_TYPE *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + row * (2 * mask_stride);
+      const uint8_t *const m_bot = m_top + mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_top[col], m_bot[col]);
+        out[col] = highbd_d16_blend_px(m, in0[col], in1[col], round_offset,
+                                       round_bits, saturation_value);
+      }
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// 8-bit blend of src0 and src1 under a 2-D alpha mask using AOM_BLEND_A64.
+// subw / subh select how the mask is sampled:
+//   subw == 0, subh == 0: one mask value per output pixel.
+//   subw == 1, subh == 1: rounded average of a 2x2 group of mask values.
+//   subw == 1, subh == 0: rounded average of two horizontally adjacent values.
+//   otherwise:            rounded average of two vertically adjacent values.
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+                          const uint8_t *src0, uint32_t src0_stride,
+                          const uint8_t *src1, uint32_t src1_stride,
+                          const uint8_t *mask, uint32_t mask_stride, int w,
+                          int h, int subw, int subh) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const uint8_t *const in0 = src0 + row * src0_stride;
+      const uint8_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = m_row[col];
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const uint8_t *const in0 = src0 + row * src0_stride;
+      const uint8_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = ROUND_POWER_OF_TWO(m_top[2 * col] + m_bot[2 * col] +
+                                             m_top[2 * col + 1] +
+                                             m_bot[2 * col + 1],
+                                         2);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const uint8_t *const in0 = src0 + row * src0_stride;
+      const uint8_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_row[2 * col], m_row[2 * col + 1]);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else {
+    for (int row = 0; row < h; ++row) {
+      uint8_t *const out = dst + row * dst_stride;
+      const uint8_t *const in0 = src0 + row * src0_stride;
+      const uint8_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_top[col], m_bot[col]);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth blend of src0 and src1 under a 2-D alpha mask using
+// AOM_BLEND_A64. The 8-bit pointers are converted with CONVERT_TO_SHORTPTR
+// and the samples are processed as uint16_t. bd is only checked by an assert.
+// subw / subh select how the mask is sampled:
+//   subw == 0, subh == 0: one mask value per output pixel.
+//   subw == 1, subh == 1: rounded average of a 2x2 group of mask values.
+//   subw == 1, subh == 0: rounded average of two horizontally adjacent values.
+//   otherwise:            rounded average of two vertically adjacent values.
+void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                 const uint8_t *src0_8, uint32_t src0_stride,
+                                 const uint8_t *src1_8, uint32_t src1_stride,
+                                 const uint8_t *mask, uint32_t mask_stride,
+                                 int w, int h, int subw, int subh, int bd) {
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+  (void)bd;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const uint16_t *const in0 = src0 + row * src0_stride;
+      const uint16_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = m_row[col];
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const uint16_t *const in0 = src0 + row * src0_stride;
+      const uint16_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = ROUND_POWER_OF_TWO(m_top[2 * col] + m_bot[2 * col] +
+                                             m_top[2 * col + 1] +
+                                             m_bot[2 * col + 1],
+                                         2);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const uint16_t *const in0 = src0 + row * src0_stride;
+      const uint16_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_row = mask + row * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_row[2 * col], m_row[2 * col + 1]);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  } else {
+    for (int row = 0; row < h; ++row) {
+      uint16_t *const out = dst + row * dst_stride;
+      const uint16_t *const in0 = src0 + row * src0_stride;
+      const uint16_t *const in1 = src1 + row * src1_stride;
+      const uint8_t *const m_top = mask + (2 * row) * mask_stride;
+      const uint8_t *const m_bot = mask + (2 * row + 1) * mask_stride;
+      for (int col = 0; col < w; ++col) {
+        const int m = AOM_BLEND_AVG(m_top[col], m_bot[col]);
+        out[col] = AOM_BLEND_A64(m, in0[col], in1[col]);
+      }
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000000..c938bb33af
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blends src0 and src1 into dst with a vertical mask: mask[] supplies one
+// alpha value per row, applied to every pixel of that row.
+// src0 or src1 may alias dst, provided the aliased strides match.
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
+                           const uint8_t *src0, uint32_t src0_stride,
+                           const uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (int row = 0; row < h; ++row) {
+    const int alpha = mask[row];
+    uint8_t *const out = dst + row * dst_stride;
+    const uint8_t *const in0 = src0 + row * src0_stride;
+    const uint8_t *const in1 = src1 + row * src1_stride;
+    for (int col = 0; col < w; ++col) {
+      out[col] = AOM_BLEND_A64(alpha, in0[col], in1[col]);
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth variant of the vertical-mask blend. The 8-bit pointers are
+// converted with CONVERT_TO_SHORTPTR and the samples are processed as
+// uint16_t. mask[] supplies one alpha per row. bd is only checked by an
+// assert.
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                  const uint8_t *src0_8, uint32_t src0_stride,
+                                  const uint8_t *src1_8, uint32_t src1_stride,
+                                  const uint8_t *mask, int w, int h, int bd) {
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+  (void)bd;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (int row = 0; row < h; ++row) {
+    const int alpha = mask[row];
+    uint16_t *const out = dst + row * dst_stride;
+    const uint16_t *const in0 = src0 + row * src0_stride;
+    const uint16_t *const in1 = src1 + row * src1_stride;
+    for (int col = 0; col < w; ++col) {
+      out[col] = AOM_BLEND_A64(alpha, in0[col], in1[col]);
+    }
+  }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/blk_sse_sum.c b/third_party/aom/aom_dsp/blk_sse_sum.c
new file mode 100644
index 0000000000..d76c3f87b9
--- /dev/null
+++ b/third_party/aom/aom_dsp/blk_sse_sum.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+// Computes, over a bw x bh block of int16_t samples whose rows are `stride`
+// elements apart, the sum of the samples (*x_sum) and the sum of their
+// squares (*x2_sum). Both outputs are always written, and are zero for an
+// empty block.
+void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh,
+                           int *x_sum, int64_t *x2_sum) {
+  int sum = 0;
+  int64_t sum_sq = 0;
+  const int16_t *row = data;
+  for (int r = 0; r < bh; ++r, row += stride) {
+    for (int c = 0; c < bw; ++c) {
+      const int sample = row[c];
+      sum += sample;
+      sum_sq += sample * sample;
+    }
+  }
+  *x_sum = sum;
+  *x2_sum = sum_sq;
+}
diff --git a/third_party/aom/aom_dsp/butteraugli.c b/third_party/aom/aom_dsp/butteraugli.c
new file mode 100644
index 0000000000..8d2a29f7a3
--- /dev/null
+++ b/third_party/aom/aom_dsp/butteraugli.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <jxl/butteraugli.h>
+
+#include "aom_dsp/butteraugli.h"
+#include "aom_mem/aom_mem.h"
+#include "third_party/libyuv/include/libyuv/convert_argb.h"
+
+// Computes a per-pixel Butteraugli distance map between `source` and
+// `distorted` and stores it in dist_map (row-major, `width` floats per row,
+// where width/height are source->y_crop_width / y_crop_height).
+//
+// Both frames are first converted to 8-bit ARGB with libyuv. 4:2:0, 4:2:2
+// and 4:4:4 subsampling are handled; any other layout is rejected, as is the
+// BT.709 + full-range combination.
+//
+// Returns 1 on success and 0 on failure (unsupported format, allocation
+// failure, or libjxl failing to create the API object, compute the result or
+// expose the distance map). dist_map is only written on success.
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         aom_matrix_coefficients_t matrix_coefficients,
+                         aom_color_range_t color_range, float *dist_map) {
+  (void)bit_depth;
+  assert(bit_depth == 8);
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  const struct YuvConstants *yuv_constants;
+  if (matrix_coefficients == AOM_CICP_MC_BT_709) {
+    if (color_range == AOM_CR_FULL_RANGE) return 0;
+    yuv_constants = &kYuvH709Constants;
+  } else {
+    yuv_constants = color_range == AOM_CR_FULL_RANGE ? &kYuvJPEGConstants
+                                                     : &kYuvI601Constants;
+  }
+
+  // Everything that needs releasing is declared before the first jump to
+  // `done`, so the single cleanup path below is always valid.
+  const int stride_argb = width * 4;
+  const size_t buffer_size = (size_t)height * stride_argb;
+  uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size);
+  uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size);
+  JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
+  JxlButteraugliApi *api = NULL;
+  JxlButteraugliResult *result = NULL;
+  const float *distmap = NULL;
+  uint32_t row_stride = 0;
+  int ok = 0;
+
+  if (!src_argb || !distorted_argb) goto done;
+
+  if (ss_x == 1 && ss_y == 1) {
+    I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 1 && ss_y == 0) {
+    I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else if (ss_x == 0 && ss_y == 0) {
+    I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                     source->uv_stride, source->v_buffer, source->uv_stride,
+                     src_argb, stride_argb, yuv_constants, width, height);
+    I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                     distorted->u_buffer, distorted->uv_stride,
+                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                     stride_argb, yuv_constants, width, height);
+  } else {
+    goto done;  // Unsupported chroma subsampling.
+  }
+
+  // The API object can fail to be created; it must not be used when NULL.
+  api = JxlButteraugliApiCreate(NULL);
+  if (api == NULL) goto done;
+  JxlButteraugliApiSetHFAsymmetry(api, 0.8f);
+
+  // The computation can fail as well; do not query a NULL result.
+  result = JxlButteraugliCompute(api, width, height, &pixel_format, src_argb,
+                                 buffer_size, &pixel_format, distorted_argb,
+                                 buffer_size);
+  if (result == NULL) goto done;
+
+  JxlButteraugliResultGetDistmap(result, &distmap, &row_stride);
+  if (distmap == NULL) goto done;
+
+  // libjxl's map uses its own row stride; repack it tightly into dist_map.
+  for (int j = 0; j < height; ++j) {
+    for (int i = 0; i < width; ++i) {
+      dist_map[j * width + i] = distmap[j * row_stride + i];
+    }
+  }
+  ok = 1;
+
+done:
+  if (api != NULL) JxlButteraugliApiDestroy(api);
+  if (result != NULL) JxlButteraugliResultDestroy(result);
+  aom_free(src_argb);
+  aom_free(distorted_argb);
+  return ok;
+}
diff --git a/third_party/aom/aom_dsp/butteraugli.h b/third_party/aom/aom_dsp/butteraugli.h
new file mode 100644
index 0000000000..5304092ccb
--- /dev/null
+++ b/third_party/aom/aom_dsp/butteraugli.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BUTTERAUGLI_H_
+#define AOM_AOM_DSP_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+
+// Returns a boolean that indicates success/failure.
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ aom_matrix_coefficients_t matrix_coefficients,
+ aom_color_range_t color_range, float *dist_map);
+
+#endif // AOM_AOM_DSP_BUTTERAUGLI_H_
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
new file mode 100644
index 0000000000..aad96c6fc6
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/entcode.h"
+
+/*Given the current total integer number of bits used and the current value of
+ rng, computes the fractional number of bits used to OD_BITRES precision.
+ This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
+ nbits_total: The number of whole bits currently used, i.e., the value
+ returned by od_ec_enc_tell() or od_ec_dec_tell().
+ rng: The current value of rng from either the encoder or decoder state.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
+  /*To handle the non-integral number of bits still left in the encoder/decoder
+     state, we compute the worst-case number of bits of val that must be
+     encoded to ensure that the value is inside the range for any possible
+     subsequent bits.
+    The computation here is independent of val itself (the decoder does not
+     even track that value), even though the real number of bits used after
+     od_ec_enc_done() may be 1 smaller if rng is a power of two and the
+     corresponding trailing bits of val are all zeros.
+    If we did try to track that special case, then coding a value with a
+     probability of 1/(1 << n) might sometimes appear to use more than n bits.
+    This may help explain the surprising result that a newly initialized
+     encoder or decoder claims to have used 1 bit.*/
+  const uint32_t scaled_bits = nbits_total << OD_BITRES;
+  int frac = 0;
+  int step;
+  /*Each pass squares rng and extracts one more bit of the fraction, most
+     significant bit first, for OD_BITRES passes in total.*/
+  for (step = 0; step < OD_BITRES; ++step) {
+    int carry;
+    rng = (rng * rng) >> 15;
+    carry = (int)(rng >> 16);
+    frac = (frac << 1) | carry;
+    rng >>= carry;
+  }
+  return scaled_bits - frac;
+}
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
new file mode 100644
index 0000000000..526ca598d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTCODE_H_
+#define AOM_AOM_DSP_ENTCODE_H_
+
+#include <limits.h>
+#include <stddef.h>
+#include "aom_dsp/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
+
+/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
+ on a larger type, you can speed up the decoder by using it here.*/
+typedef uint32_t od_ec_window;
+
+/*The size in bits of od_ec_window.*/
+#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define OD_BITRES (3)
+
+#define OD_ICDF AOM_ICDF
+
+/*See entcode.c for further documentation.*/
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
+ uint32_t rng);
+
+#endif // AOM_AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
new file mode 100644
index 0000000000..5bbcddae08
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+
+/*A range decoder.
+ This is an entropy decoder based upon \cite{Mar79}, which is itself a
+ rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+ It is very similar to arithmetic encoding, except that encoding is done with
+ digits in any base, instead of with bits, and so it is faster when using
+ larger bases (i.e.: a byte).
+ The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+ is the base, longer than the theoretical optimum, but to my knowledge there
+ is no published justification for this claim.
+ This only seems true when using near-infinite precision arithmetic so that
+ the process is carried out with no rounding errors.
+
+ An excellent description of implementation details is available at
+ http://www.arturocampos.com/ac_range.html
+ A recent work \cite{MNW98} which proposes several changes to arithmetic
+ encoding for efficiency actually re-discovers many of the principles
+ behind range encoding, and presents a good theoretical analysis of them.
+
+ End of stream is handled by writing out the smallest number of bits that
+ ensures that the stream will be correctly decoded regardless of the value of
+ any subsequent bits.
+ od_ec_dec_tell() can be used to determine how many bits were needed to decode
+ all the symbols thus far; other data can be packed in the remaining bits of
+ the input buffer.
+ @PHDTHESIS{Pas76,
+ author="Richard Clark Pasco",
+ title="Source coding algorithms for fast data compression",
+ school="Dept. of Electrical Engineering, Stanford University",
+ address="Stanford, CA",
+ month=May,
+ year=1976,
+ URL="http://www.richpasco.org/scaffdc.pdf"
+ }
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video & Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+#define OD_EC_LOTS_OF_BITS (0x4000)
+
+/*Refills the window (dec->dif) with whole bytes from the input buffer. The
+ return value of od_ec_dec_tell does not change across this call.*/
+static void od_ec_dec_refill(od_ec_dec *dec) {
+ int s; /*Bit position in dif at which the next byte is inserted.*/
+ od_ec_window dif;
+ int16_t cnt;
+ const unsigned char *bptr;
+ const unsigned char *end;
+ dif = dec->dif; /*Work on local copies; dif, cnt and bptr are stored back.*/
+ cnt = dec->cnt;
+ bptr = dec->bptr;
+ end = dec->end;
+ s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); /*Starting shift, derived from cnt.*/
+ for (; s >= 0 && bptr < end; s -= 8, bptr++) { /*One byte per pass.*/
+ /*Each time a byte is inserted into the window (dif), bptr advances and cnt
+ is incremented by 8, so the total number of consumed bits (the return
+ value of od_ec_dec_tell) does not change.*/
+ assert(s <= OD_EC_WINDOW_SIZE - 8);
+ dif ^= (od_ec_window)bptr[0] << s;
+ cnt += 8;
+ }
+ if (bptr >= end) {
+ /*We've reached the end of the buffer. It is perfectly valid for us to need
+ to fill the window with additional bits past the end of the buffer (and
+ this happens in normal operation). These bits should all just be taken
+ as zero. But we cannot increment bptr past 'end' (this is undefined
+ behavior), so we start to increment dec->tell_offs. We also don't want
+ to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS
+ and adjust dec->tell_offs so that the total number of unconsumed bits in
+ the window (dec->cnt - dec->tell_offs) does not change. This effectively
+ puts lots of zero bits into the window, and means we won't try to refill
+ it from the buffer for a very long time (at which point we'll put lots
+ of zero bits into the window again).*/
+ dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
+ cnt = OD_EC_LOTS_OF_BITS;
+ }
+ dec->dif = dif;
+ dec->cnt = cnt;
+ dec->bptr = bptr;
+}
+
+/*Renormalizes the decoder state after a symbol has been decoded.
+ The range is shifted left until 32768 <= rng < 65536, dif is shifted by the
+ same amount with 1 bits shifted in, and the window is refilled from the stream
+ when it runs short; the results are stored back in the decoder context.
+ dif: The new value of dif.
+ rng: The new value of the range.
+ ret: The value to return.
+ Return: ret, passed through so callers can tail-call this function.*/
+static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
+ int ret) {
+ assert(rng <= 65535U);
+ /*Leading zeros of rng viewed as a 16-bit value; that many bits of dec->dif
+ are consumed by this renormalization.*/
+ const int nshift = 16 - OD_ILOG_NZ(rng);
+ dec->rng = rng << nshift;
+ /*Adding 1 before the shift and subtracting it afterwards is equivalent to
+ shifting in 1's instead of 0's.*/
+ dec->dif = ((dif + 1) << nshift) - 1;
+ dec->cnt -= nshift;
+ if (dec->cnt < 0) od_ec_dec_refill(dec);
+ return ret;
+}
+
+/*Prepares a decoder context to read entropy-coded data from a buffer.
+ buf: The input buffer to use.
+ storage: The size in bytes of the input buffer.*/
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
+ uint32_t storage) {
+ const od_ec_window top_bit = (od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1);
+ dec->buf = dec->bptr = buf;
+ dec->end = &buf[storage];
+ dec->rng = 0x8000;
+ dec->dif = top_bit - 1;
+ dec->cnt = -15;
+ dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
+ od_ec_dec_refill(dec);
+}
+
+/*Decodes one binary symbol.
+ f: The probability that the bit is one, scaled by 32768.
+ Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
+ unsigned rng_next;
+ int bit;
+ assert(0 < f);
+ assert(f < 32768U);
+ od_ec_window dif = dec->dif;
+ const unsigned rng = dec->rng;
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < rng);
+ assert(32768U <= rng);
+ /*Scale the probability by the range, then add the minimum probability.*/
+ const unsigned split =
+ ((rng >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB;
+ const od_ec_window split_w = (od_ec_window)split << (OD_EC_WINDOW_SIZE - 16);
+ if (dif >= split_w) {
+ /*dif is at or above the split point: decode a 0.*/
+ rng_next = rng - split;
+ dif -= split_w;
+ bit = 0;
+ } else {
+ rng_next = split;
+ bit = 1;
+ }
+ return od_ec_dec_normalize(dec, dif, rng_next, bit);
+}
+
+/*Decodes a symbol given an inverse cumulative distribution function (CDF)
+ table in Q15.
+ icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
+ The values must be monotonically non-increasing, and icdf[nsyms - 1]
+ must be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.
+ Return: The decoded symbol s.*/
+int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
+ od_ec_window dif;
+ unsigned r; /*Current range; ends as the width kept for the decoded symbol.*/
+ unsigned c; /*Top 16 bits of dif, compared against the scaled bounds.*/
+ unsigned u; /*Scaled bound from the previous iteration (initially r).*/
+ unsigned v; /*Scaled bound for the symbol currently being tested.*/
+ int ret; /*Candidate symbol index; starts at -1 and is pre-incremented.*/
+ (void)nsyms;
+ dif = dec->dif;
+ r = dec->rng;
+ const int N = nsyms - 1; /*Index of the last symbol.*/
+
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ assert(32768U <= r);
+ assert(7 - EC_PROB_SHIFT >= 0);
+ c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+ v = r;
+ ret = -1;
+ do {
+ u = v;
+ v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB * (N - ret); /*EC_MIN_PROB for each symbol after ret.*/
+ } while (c < v); /*Stop at the first symbol whose bound v is <= c.*/
+ assert(v < u);
+ assert(u <= r);
+ r = u - v;
+ dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ return od_ec_dec_normalize(dec, dif, r, ret);
+}
+
+/*Reports how many bits the symbols decoded so far have "used".
+ The encoder and the decoder can both compute this same number, so it is
+ suitable for making coding decisions.
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_dec_tell(const od_ec_dec *dec) {
+ /*Bits read into the window (dec->dif) so far: 8 per byte between dec->buf
+ and dec->bptr.*/
+ const ptrdiff_t bits_read = (dec->bptr - dec->buf) * 8;
+ /*Of those, (dec->cnt - dec->tell_offs) remain unconsumed; drop them.*/
+ return (int)(bits_read - dec->cnt + dec->tell_offs);
+}
+
+/*Reports how many bits the symbols decoded so far have "used", with
+ fractional precision. The encoder and the decoder can both compute this same
+ number, so it is suitable for making coding decisions.
+ Return: The number of bits scaled by 2**OD_BITRES; always slightly larger
+ than the exact value (all rounding error is in the positive direction).*/
+uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
+ const uint32_t whole_bits = (uint32_t)od_ec_dec_tell(dec);
+ return od_ec_tell_frac(whole_bits, dec->rng);
+}
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
new file mode 100644
index 0000000000..c746167775
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTDEC_H_
+#define AOM_AOM_DSP_ENTDEC_H_
+#include <limits.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_dec od_ec_dec;
+
+#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
+#define OD_ACC_STR , char *acc_str
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
+#else // accounting disabled: the str argument is dropped
+#define OD_ACC_STR
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
+#endif // defined(OD_ACCOUNTING) && OD_ACCOUNTING
+
+/*The entropy decoder context.*/
+struct od_ec_dec {
+ /*The start of the current input buffer.*/
+ const unsigned char *buf;
+ /*An offset used to keep track of tell after reaching the end of the stream.
+ This is constant throughout most of the decoding process, but becomes
+ important once we hit the end of the buffer and stop incrementing bptr
+ (and instead pretend cnt has lots of bits).*/
+ int32_t tell_offs;
+ /*The end of the current input buffer.*/
+ const unsigned char *end;
+ /*The read pointer for the entropy-coded bits.*/
+ const unsigned char *bptr;
+ /*The difference between the high end of the current range, (low + rng), and
+ the coded value, minus 1.
+ This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
+ decoder only uses the top 16 bits of the window to decode the next symbol.
+ As we shift up during renormalization, if we don't have enough bits left in
+ the window to fill the top 16, we'll read in more bits of the coded
+ value.*/
+ od_ec_window dif;
+ /*The number of values in the current range (renormalized to >= 32768).*/
+ uint16_t rng;
+ /*The number of bits of data in the current value (refilled when < 0).*/
+ int16_t cnt;
+};
+
+/*See entdec.c for further documentation.*/
+
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
+ const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTDEC_H_
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
new file mode 100644
index 0000000000..591e0ad214
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if OD_MEASURE_EC_OVERHEAD
+#if !defined(M_LOG2E)
+#define M_LOG2E (1.4426950408889634073599246810019)
+#endif
+#define OD_LOG2(x) (M_LOG2E * log(x))
+#endif // OD_MEASURE_EC_OVERHEAD
+
+/*A range encoder.
+ See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
+
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video \& Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*Takes updated low and range values, renormalizes them so that
+ 32768 <= rng < 65536 (flushing bytes from low to the output buffer if
+ necessary), and stores them back in the encoder context.
+ low: The new value of low.
+ rng: The new value of the range.*/
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low,
+ unsigned rng) {
+ int d; /*Left shift needed to renormalize rng.*/
+ int c; /*enc->cnt on entry.*/
+ int s; /*c + d: the bit count after this renormalization.*/
+ if (enc->error) return; /*Do nothing once an error has been recorded.*/
+ c = enc->cnt;
+ assert(rng <= 65535U);
+ /*The number of leading zeros in the 16-bit binary representation of rng.*/
+ d = 16 - OD_ILOG_NZ(rng);
+ s = c + d;
+
+ /* We flush every time "low" cannot safely and efficiently accommodate any
+ more data. Overall, c must not exceed 63 at the time of byte flush out. To
+ facilitate this, "s" cannot exceed 56-bits because we have to keep 1 byte
+ for carry. Also, we need to subtract 16 because we want to keep room for
+ the next symbol worth "d"-bits (max 15). An alternate condition would be if
+ (e < d), where e = number of leading zeros in "low", indicating there is
+ not enough room to accommodate "rng" worth of "d"-bits in "low". However,
+ this approach needs additional computations: (i) compute "e", (ii) push
+ the leading 0x00's as a special case.
+ */
+ if (s >= 40) { // 56 - 16
+ unsigned char *out = enc->buf;
+ uint32_t storage = enc->storage;
+ uint32_t offs = enc->offs;
+ if (offs + 8 > storage) { // the write below always stores 8 bytes
+ storage = 2 * storage + 8;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1; // enc->buf and enc->storage are left unchanged
+ return;
+ }
+ enc->buf = out;
+ enc->storage = storage;
+ }
+ // Need to add 1 byte here since enc->cnt always counts 1 byte less
+ // (enc->cnt = -9) to ensure correct operation
+ uint8_t num_bytes_ready = (s >> 3) + 1;
+
+ // Update "c" to contain the number of non-ready bits in "low". Since "low"
+ // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take
+ // off the number of ready bits.
+ c += 24 - (num_bytes_ready << 3);
+
+ // Prepare "output" and update "low"
+ uint64_t output = low >> c;
+ low = low & (((uint64_t)1 << c) - 1);
+
+ // Prepare data and carry mask
+ uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3);
+ uint64_t carry = output & mask;
+
+ mask = mask - 0x01;
+ output = output & mask;
+
+ // Write data in a single operation
+ write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs,
+ num_bytes_ready);
+
+ // Update state of the encoder: enc->cnt to contain the number of residual
+ // bits
+ s = c + d - 24;
+ }
+ enc->low = low << d;
+ enc->rng = rng << d;
+ enc->cnt = s;
+}
+
+/*Resets the encoder state and allocates its initial output buffer.
+ size: The initial size of the buffer, in bytes.*/
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
+ unsigned char *const mem = (unsigned char *)malloc(sizeof(*mem) * size);
+ /*A NULL result only counts as a failure when bytes were requested.*/
+ const int alloc_failed = size > 0 && mem == NULL;
+ od_ec_enc_reset(enc);
+ enc->buf = mem;
+ enc->storage = alloc_failed ? 0 : size;
+ if (alloc_failed) enc->error = -1;
+}
+
+/*Returns the encoder to its initial state; buf and storage are not touched.*/
+void od_ec_enc_reset(od_ec_enc *enc) {
+ enc->error = 0;
+ enc->offs = 0;
+ /*cnt starts at -9 so that it crosses zero once one byte plus one carry bit
+ have been accumulated.*/
+ enc->cnt = -9;
+ enc->rng = 0x8000;
+ enc->low = 0;
+#if OD_MEASURE_EC_OVERHEAD
+ enc->nb_symbols = 0;
+ enc->entropy = 0;
+#endif
+}
+
+/*Frees the output buffer (enc->buf); no encoder field is modified.*/
+void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); }
+
+/*Encodes a symbol given its frequency in Q15.
+ fl: CDF_PROB_TOP minus the cumulative frequency of all symbols before s.
+ fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
+ including s.
+ s, nsyms: The symbol index and alphabet size (for the EC_MIN_PROB terms).*/
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
+ int nsyms) {
+ od_ec_enc_window l;
+ unsigned r;
+ unsigned u; /*Scaled bound derived from fl.*/
+ unsigned v; /*Scaled bound derived from fh.*/
+ l = enc->low;
+ r = enc->rng;
+ assert(32768U <= r);
+ assert(fh <= fl);
+ assert(fl <= 32768U);
+ assert(7 - EC_PROB_SHIFT >= 0);
+ const int N = nsyms - 1; /*Index of the last symbol.*/
+ if (fl < CDF_PROB_TOP) {
+ u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s - 1));
+ v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ l += r - u;
+ r = u - v;
+ } else { /*fl is not below CDF_PROB_TOP: low stays put, the range shrinks.*/
+ r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ }
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encodes one binary symbol.
+ val: The value to encode (0 or 1).
+ f: The probability that the val is one, scaled by 32768.*/
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
+ assert(0 < f);
+ assert(f < 32768U);
+ od_ec_enc_window low = enc->low;
+ const unsigned rng = enc->rng;
+ assert(32768U <= rng);
+ /*Scale the probability by the range, then add the minimum probability.*/
+ const unsigned split =
+ ((rng >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB;
+ /*A 1 moves low up by (rng - split) and keeps a range of split; a 0 leaves
+ low alone and keeps the remaining (rng - split).*/
+ if (val) low += rng - split;
+ od_ec_enc_normalize(enc, low, val ? split : rng - split);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encodes one symbol of a multi-symbol alphabet from its inverse CDF in Q15.
+ s: The index of the symbol to encode.
+ icdf: 32768 minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+ The values must be monotonically decreasing, and icdf[nsyms - 1] must
+ be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.*/
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
+ int nsyms) {
+ assert(s >= 0);
+ assert(s < nsyms);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ const unsigned fl = s > 0 ? icdf[s - 1] : OD_ICDF(0);
+ od_ec_encode_q15(enc, fl, icdf[s], s, nsyms);
+}
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+ have already been encoded.
+ This makes it possible to have a few flags up front, where it is easy for
+ decoders to access them without parsing the whole stream, even if their
+ values are not determined until late in the encoding process, without having
+ to buffer all the intermediate symbols in the encoder.
+ In order for this to work, at least nbits bits must have already been encoded
+ using probabilities that are an exact power of two.
+ The encoder can verify the number of encoded bits is sufficient, but cannot
+ check this latter condition.
+ val: The bits to encode (in the least nbits significant bits).
+ They will be decoded in order from most-significant to least.
+ nbits: The number of bits to overwrite.
+ This must be no more than 8.*/
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
+ int shift; /*Left shift that moves val into the top nbits of a byte.*/
+ unsigned mask; /*The top nbits bits of a byte.*/
+ assert(nbits >= 0);
+ assert(nbits <= 8);
+ assert(val < 1U << nbits);
+ shift = 8 - nbits;
+ mask = ((1U << nbits) - 1) << shift;
+ if (enc->offs > 0) {
+ /*The first byte has been finalized.*/
+ enc->buf[0] = (unsigned char)((enc->buf[0] & ~mask) | val << shift);
+ } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
+ /*The first byte has yet to be output.*/
+ enc->low = (enc->low & ~((od_ec_enc_window)mask << (16 + enc->cnt))) |
+ (od_ec_enc_window)val << (16 + enc->cnt + shift);
+ } else {
+ /*The encoder hasn't even encoded nbits of data yet.*/
+ enc->error = -1;
+ }
+}
+
+#if OD_MEASURE_EC_OVERHEAD
+#include <stdio.h>
+#endif
+
+/*Indicates that there are no more symbols to encode.
+ All remaining output bytes are flushed to the output buffer.
+ od_ec_enc_reset() should be called before using the encoder again.
+ nbytes: Returns the size of the encoded data in the returned buffer.
+ Return: A pointer to the start of the final buffer, or NULL if there was an
+ encoding error (in which case *nbytes is not written).*/
+unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
+ unsigned char *out;
+ uint32_t storage;
+ uint32_t offs;
+ od_ec_enc_window m;
+ od_ec_enc_window e;
+ od_ec_enc_window l;
+ int c;
+ int s;
+ if (enc->error) return NULL; /*An earlier failure makes the output invalid.*/
+#if OD_MEASURE_EC_OVERHEAD
+ {
+ uint32_t tell;
+ /* Don't count the 1 bit we lose to raw bits as overhead. */
+ tell = od_ec_enc_tell(enc) - 1;
+ fprintf(stderr, "overhead: %f%%\n",
+ 100 * (tell - enc->entropy) / enc->entropy);
+ fprintf(stderr, "efficiency: %f bits/symbol\n",
+ (double)tell / enc->nb_symbols);
+ }
+#endif
+
+ l = enc->low;
+ c = enc->cnt;
+ s = 10;
+ m = 0x3FFF;
+ e = ((l + m) & ~m) | (m + 1);
+ s += c;
+ offs = enc->offs;
+
+ /*Make sure there's enough room for the entropy-coded bits.*/
+ out = enc->buf;
+ storage = enc->storage;
+ const int s_bits = (s + 7) >> 3; /*s rounded up to whole bytes.*/
+ int b = OD_MAXI(s_bits, 0); /*Bytes still to be written (never negative).*/
+ if (offs + b > storage) {
+ storage = offs + b; /*Grow to exactly the size needed.*/
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1; /*enc->buf and enc->storage are left unchanged.*/
+ return NULL;
+ }
+ enc->buf = out;
+ enc->storage = storage;
+ }
+
+ /*We output the minimum number of bits that ensures that the symbols encoded
+ thus far will be decoded correctly regardless of the bits that follow.*/
+ if (s > 0) {
+ uint64_t n; /*Mask of the bits of e that are still to be written.*/
+ n = ((uint64_t)1 << (c + 16)) - 1;
+ do {
+ assert(offs < storage);
+ uint16_t val = (uint16_t)(e >> (c + 16));
+ out[offs] = (unsigned char)(val & 0x00FF);
+ if (val & 0x0100) { /*Bit 8 set: carry into the previous byte.*/
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ offs++;
+
+ e &= n;
+ s -= 8;
+ c -= 8;
+ n >>= 8;
+ } while (s > 0);
+ }
+ *nbytes = offs;
+
+ return out;
+}
+
+/*Reports how many bits the symbols encoded so far have "used".
+ The encoder and the decoder can both compute this same number, so it is
+ suitable for making coding decisions.
+ Warning: After an encoding error (i.e., a failure to allocate enough space
+ for the output buffer), the value can be smaller than one returned by an
+ earlier call, even though more data has been encoded since.
+ Return: The number of bits; always slightly larger than the exact value
+ (all rounding error is in the positive direction).*/
+int od_ec_enc_tell(const od_ec_enc *enc) {
+ /*cnt carries a built-in offset of -9; adding 10 cancels it and reserves 1
+ extra bit for terminating the stream.*/
+ const uint32_t flushed_bits = enc->offs * 8;
+ return (enc->cnt + 10) + flushed_bits;
+}
+
+/*Reports how many bits the symbols encoded so far have "used", with
+ fractional precision. The encoder and the decoder can both compute this same
+ number, so it is suitable for making coding decisions.
+ Warning: After an encoding error (i.e., a failure to allocate enough space
+ for the output buffer), the value can be smaller than one returned by an
+ earlier call, even though more data has been encoded since.
+ Return: The number of bits scaled by 2**OD_BITRES; always slightly larger
+ than the exact value (all rounding error is in the positive direction).*/
+uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
+ const uint32_t whole_bits = (uint32_t)od_ec_enc_tell(enc);
+ return od_ec_tell_frac(whole_bits, enc->rng);
+}
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
new file mode 100644
index 0000000000..1a38affb4f
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTENC_H_
+#define AOM_AOM_DSP_ENTENC_H_
+#include <stddef.h>
+#include "aom_dsp/entcode.h"
+#include "aom_util/endian_inl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint64_t od_ec_enc_window;
+
+typedef struct od_ec_enc od_ec_enc;
+
+#define OD_MEASURE_EC_OVERHEAD (0)
+
+/*The entropy encoder context.*/
+struct od_ec_enc {
+ /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(),
+ where all the arithmetic-coded data gets prepended to it.*/
+ unsigned char *buf;
+ /*The size of the buffer, in bytes.*/
+ uint32_t storage;
+ /*The offset at which the next entropy-coded byte will be written.*/
+ uint32_t offs;
+ /*The low end of the current range.*/
+ od_ec_enc_window low;
+ /*The number of values in the current range (renormalized to >= 32768).*/
+ uint16_t rng;
+ /*The number of bits of data in the current value (reset to -9).*/
+ int16_t cnt;
+ /*Nonzero if an error occurred (set to -1 on an allocation failure).*/
+ int error;
+#if OD_MEASURE_EC_OVERHEAD
+ double entropy;
+ int nb_symbols;
+#endif
+};
+
+/*See entenc.c for further documentation.*/
+
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
+void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
+void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
+
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
+ OD_ARG_NONNULL(1);
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
+
+void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
+ uint32_t *nbytes)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+
+// Adds a carry of 1 to buf[offs] and keeps rippling it to the preceding byte
+// for as long as the incremented byte wraps around to zero.
+static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) {
+ for (;;) {
+ const unsigned char before = buf[offs];
+ buf[offs--] = (unsigned char)(before + 1);
+ if (before != 0xFF) break;
+ }
+}
+
+// Stores the ready bytes of output at out[offs] in big-endian order (a full
+// 8 bytes are copied), applies any carry, and advances *enc_offs past them.
+static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
+ uint32_t offs, uint64_t output,
+ uint64_t carry,
+ uint32_t *enc_offs,
+ uint8_t num_bytes_ready) {
+ const int pad_bits = (8 - num_bytes_ready) << 3;
+ const uint64_t be_word = HToBE64(output << pad_bits);
+ memcpy(out + offs, &be_word, sizeof(be_word));
+ if (carry != 0) {
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ *enc_offs = offs + num_bytes_ready;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTENC_H_
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
new file mode 100644
index 0000000000..0ef0590e89
--- /dev/null
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+/* SSIM stabilising constants, (0.01 * peak)^2 for C1 and (0.03 * peak)^2 for
+ C2, with peak sample values of 255 (no suffix), 1023 (_10) and 4095 (_12). */
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+/* Value convert_ssim_db() returns when the score is within 1e-10 of the
+ weight. */
+#define MAX_SSIM_DB 100.0
+
+/* Function-like min/max; an argument may be evaluated twice. */
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+/* One scale of the pyramid. All three planes are w * h entries and are carved
+ out of the single allocation made by fs_ctx_init(). */
+struct fs_level {
+ /* Downsampled plane of the first image. */
+ uint32_t *im1;
+ /* Downsampled plane of the second image; stored directly after im1. */
+ uint32_t *im2;
+ /* Per-pixel score written by fs_calc_structure(). */
+ double *ssim;
+ /* Plane width in entries. */
+ int w;
+ /* Plane height in entries. */
+ int h;
+};
+
+/* Working state for one multi-scale comparison. */
+struct fs_ctx {
+ /* Array of nlevels levels; also the base of the whole allocation. */
+ fs_level *level;
+ /* Number of entries in level. */
+ int nlevels;
+ /* Scratch of 2 * (level-0 width + 8) * 8 entries, used for column sums by
+ fs_apply_luminance() and as the gradient ring buffers by
+ fs_calc_structure(). */
+ unsigned *col_buf;
+};
+
+/* Allocates, with a single malloc, all storage used by the context: the array
+ of _nlevels level headers, then for each level two uint32 image planes and
+ one double-precision ssim plane, and finally the shared column buffer.
+ Level 0 is ((_w + 1) >> 1) x ((_h + 1) >> 1); each following level halves
+ both dimensions again, rounding up.
+ Returns 0 on success and -1 if the allocation fails. */
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ /* Level headers plus the column buffer, which is sized from the level-0
+ width. */
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ /* First pass: only accumulate the number of bytes each level needs. */
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t)lh;
+ /* The two image planes, rounded up to a whole number of doubles... */
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ /* ...plus one double per pixel for the ssim plane. */
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *)malloc(data_size);
+ if (!data) return -1;
+ /* The level array sits at the start of the block, so fs_ctx_clear() can
+ free the whole block through _ctx->level. */
+ _ctx->level = (fs_level *)data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ /* Second pass: hand out the planes in the same order they were sized. */
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t)lh;
+ /* Here level_size covers only the two image planes (rounded up); the ssim
+ plane is skipped separately below. */
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint32_t *)data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *)data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ /* Whatever remains is the column buffer. */
+ _ctx->col_buf = (unsigned *)data;
+ return 0;
+}
+
+/* Frees everything fs_ctx_init() allocated. _ctx->level is the start of that
+ single allocation, so releasing it also releases the image planes, ssim
+ planes and column buffer carved out of it. */
+static void fs_ctx_clear(fs_ctx *_ctx) {
+ fs_level *base = _ctx->level;
+ free(base);
+}
+
+/* Builds level _l (_l >= 1) of both image pyramids from level _l - 1 by
+ summing each 2x2 block of the finer level (the sums are not normalised).
+ When the finer level has an odd width or height, its last column or row is
+ replicated to complete the final block. */
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint32_t *src1;
+ const uint32_t *src2;
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ /* Clamp to the last valid row, h2 - 1. Clamping to h2 would address the
+ row just past the end of the plane whenever h2 is odd. */
+ j1offs = FS_MINI(2 * j + 1, h2 - 1) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ /* Likewise clamp to the last valid column, w2 - 1, so an odd w2 does not
+ pull in the first sample of the following row. */
+ i1 = FS_MINI(i0 + 1, w2 - 1);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+/* Fills level 0 of both pyramids from the two input planes by summing each
+ 2x2 block of samples (the sums are not normalised). _w and _h are the plane
+ dimensions; the strides are used as sample-index multipliers. When
+ buf_is_hbd is non-zero the planes are accessed as 16-bit samples through
+ CONVERT_TO_SHORTPTR and every sample is shifted right by shift before
+ being summed. For odd _w or _h the last column or row is replicated. */
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+ int _s1ystride, const uint8_t *_src2,
+ int _s2ystride, int _w, int _h, uint32_t shift,
+ int buf_is_hbd) {
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ /* Clamp to the last valid row, _h - 1. Clamping to _h would read the row
+ below the picture whenever _h is odd. */
+ j1 = FS_MINI(j0 + 1, _h - 1);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ /* Likewise clamp to the last valid column, _w - 1. */
+ i1 = FS_MINI(i0 + 1, _w - 1);
+ if (!buf_is_hbd) {
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
+ } else {
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
+ }
+ }
+ }
+}
+
+/* Multiplies every entry of level _l's ssim plane by the luminance term
+ (2 * mux * muy + c1) / (mux * mux + muy * muy + c1), where mux and muy are
+ sums of im1 and im2 over an 8x8 window around the pixel, with out-of-range
+ rows and columns replaced by the nearest edge. bit_depth selects the
+ stabilising constant (the 8-bit one unless it is 10 or 12).
+ The sums are kept incrementally: col_buf holds, for each image, one running
+ 8-row sum per column, and mux/muy slide across those column sums. */
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ double ssim_c1 = SSIM_C1;
+
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ /* Column sums for output row 0: row 0 counted five times (itself plus the
+ four replicated rows above it) and rows 1..3, clamped to the last row. */
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ /* Window sums for column 0, built the same way as the column sums. */
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
+ if (i + 1 < w) {
+ /* Slide the window one column right: drop the oldest column and add
+ the new one. Each image must be updated from its own column sums;
+ muy previously used col_sums_x here, which corrupted it after the
+ first column. */
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+ muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ /* Slide the column sums one row down for the next output row. */
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
+
+/* Helper macros for fs_calc_structure(). They are not self-contained: they
+ read gx_buf, gy_buf, stride, i and j and write col_sums_gx2, col_sums_gy2
+ and col_sums_gxgy from the scope in which they are expanded.
+ (j + _joffs) & 7 selects a row of the 8-row gradient buffers and
+ i + _ioffs the column within that row. */
+
+/* Sets accumulator _col to the gradient products at the given offsets. */
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
+
+/* Adds the gradient products at the given offsets to accumulator _col. */
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
+
+/* Subtracts the gradient products at the given offsets from accumulator
+ _col. */
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
+
+/* Copies accumulator _col2 into accumulator _col1. */
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } while (0)
+
+/* Sets accumulator _col1 to half of accumulator _col2. */
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } while (0)
+
+/* Sets accumulator _col1 to twice accumulator _col2. */
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } while (0)
+
+/* Writes level _l's ssim plane with (2 * mugxgy + c2) / (mugx2 + mugy2 + c2),
+ where the three sums are accumulated from gradient estimates of im1 (gx) and
+ im2 (gy). bit_depth selects the stabilising constant (the 8-bit one unless
+ it is 10 or 12).
+ Gradient rows are produced into two 8-row ring buffers held in col_buf, and
+ output row j - 4 is emitted once gradient row j has been produced. The
+ eight col_sums_* accumulators are updated incrementally through the
+ FS_COL_* macros as the output column advances. */
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+ uint32_t *im1;
+ uint32_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ double ssim_c2 = SSIM_C2;
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ /* Each ring-buffer row is w + 8 entries; gradients are stored starting at
+ column 4, and the untouched entries stay zero from the memset below. */
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ /* g1 and g2 are the absolute differences along the two diagonals of
+ the 2x2 block at (i, j); they are combined as 4 * max + min. */
+ g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
+ g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
+ g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ /* No gradient exists for the last image row or below it: use zeros. */
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ /* Prime the accumulators for output column 0. Inside the loop below, i
+ serves both as the accumulator index and as the column offset read by
+ the FS_COL_* macros. */
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ /* Advance the eight accumulators by one column. Each step reads
+ accumulators that later steps overwrite, so the statement order
+ must be kept. */
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
+
+/* Returns the mean of level _l's ssim plane raised to the power
+ FS_WEIGHTS[_l]. */
+static double fs_average(fs_ctx *_ctx, int _l) {
+ const fs_level *lvl = &_ctx->level[_l];
+ const double *scores = lvl->ssim;
+ const int count = lvl->w * lvl->h;
+ double total = 0;
+ int k;
+ /* The plane is stored contiguously row by row, so one flat pass visits the
+ entries in the same order as a row/column double loop. */
+ for (k = 0; k < count; k++) total += scores[k];
+ return pow(total / count, FS_WEIGHTS[_l]);
+}
+
+/* Converts a score to decibels as 10 * log10(_weight / (_weight - _ssim)),
+ evaluated as a difference of logarithms. Returns MAX_SSIM_DB when the score
+ is within 1e-10 of _weight. */
+static double convert_ssim_db(double _ssim, double _weight) {
+ double gap;
+ assert(_weight >= _ssim);
+ gap = _weight - _ssim;
+ if (!(gap < 1e-10)) return 10 * (log10(_weight) - log10(gap));
+ return MAX_SSIM_DB;
+}
+
+/* Computes the multi-scale score of one plane pair: the product over all
+ FS_NLEVELS levels of fs_average(). The luminance term is applied at the
+ coarsest level only. Returns 99.0 if the working context cannot be
+ allocated. */
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift, int buf_is_hbd) {
+ fs_ctx ctx;
+ double score = 1;
+ int level;
+ if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS) != 0) return 99.0;
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
+ buf_is_hbd);
+ for (level = 0; level < FS_NLEVELS; level++) {
+ const int is_coarsest = (level == FS_NLEVELS - 1);
+ fs_calc_structure(&ctx, level, _bd);
+ if (is_coarsest) fs_apply_luminance(&ctx, level, _bd);
+ score *= fs_average(&ctx, level);
+ /* The next level is built only after this one has been scored. */
+ if (!is_coarsest) fs_downsample_level(&ctx, level + 1);
+ }
+ fs_ctx_clear(&ctx);
+ return score;
+}
+
+/* Scores dest against source plane by plane, storing the per-plane results in
+ *ssim_y, *ssim_u and *ssim_v, and returns the combined score
+ (0.8 * Y + 0.1 * (U + V)) converted to decibels. bd must be at least in_bd;
+ high bit depth samples are shifted right by bd - in_bd before use. Both
+ buffers are asserted to have the same flags. */
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
+ assert(bd >= in_bd);
+ assert(source->flags == dest->flags);
+ const int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const uint32_t bd_shift = bd - in_bd;
+
+ /* Luma. */
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
+ /* Both chroma planes share the uv stride and crop dimensions. */
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+ return convert_ssim_db((*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)), 1.0);
+}
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
new file mode 100644
index 0000000000..a44dbf77b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Writes the transpose of the n x n row-major matrix A into B, one output row
+// at a time.
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+ for (int row = 0; row < n; ++row) {
+ float *dst = B + row * n;
+ for (int col = 0; col < n; ++col) dst[col] = A[col * n + row];
+ }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region of [0, n/2]x[0, n/2] contains the real part of
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of transformed imaginary columns. This function assembles
+// the correct outputs while putting the real and imaginary components
+// next to each other.
+//
+// col_fft is the packed n x n input; output receives interleaved
+// (real, imaginary) pairs, the pair for (y, x) being stored at
+// output[2 * (y * n + x)]. Only columns 0..n/2 of each output row are written.
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+ int n) {
+ for (int y = 0; y <= n / 2; ++y) {
+ // y2 is the row holding the packed imaginary part for row y. y_extra is
+ // true only for 0 < y < n / 2; rows 0 and n / 2 have no imaginary row.
+ const int y2 = y + n / 2;
+ const int y_extra = y2 > n / 2 && y2 < n;
+
+ for (int x = 0; x <= n / 2; ++x) {
+ // Same convention for columns: x_extra is true only for 0 < x < n / 2.
+ const int x2 = x + n / 2;
+ const int x_extra = x2 > n / 2 && x2 < n;
+ output[2 * (y * n + x)] =
+ col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ if (y_extra) {
+ // Also fill the mirrored row n - y, using the same terms with the
+ // sign of every y2-row contribution flipped.
+ output[2 * ((n - y) * n + x)] =
+ col_fft[y * n + x] +
+ (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * ((n - y) * n + x) + 1] =
+ -(y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ }
+ }
+ }
+}
+
+// Generic 2d forward transform: runs the 1d transform over the input, vec_size
+// columns per call, transposes, runs it again, transposes back and finally
+// unpacks the result into output. temp and output are both used as scratch
+// along the way.
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size) {
+ int col;
+
+ // First pass: input -> output, then transposed into temp.
+ for (col = 0; col < n; col += vec_size) tform(&input[col], &output[col], n);
+ transpose(output, temp, n);
+
+ // Second pass: temp -> output, then transposed into temp again.
+ for (col = 0; col < n; col += vec_size) tform(&temp[col], &output[col], n);
+ transpose(output, temp, n);
+
+ unpack(temp, output, n);
+}
+
+// Scalar implementations of the store/add/sub/mul operations passed to the
+// GEN_FFT_* and GEN_IFFT_* macros below.
+static INLINE void store_float(float *output, float input) {
+ output[0] = input;
+}
+static INLINE float add_float(float a, float b) {
+ return a + b;
+}
+static INLINE float sub_float(float a, float b) {
+ return a - b;
+}
+static INLINE float mul_float(float a, float b) {
+ return a * b;
+}
+
+// Instantiate the scalar 1d forward transforms aom_fft1d_{2,4,8,16,32}_float.
+// With these arguments "load" is a plain pointer dereference (*) and
+// "constant" is a cast to float.
+GEN_FFT_2(void, float, float, float, *, store_float)
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float)
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+
+// C versions of the 2d forward transforms. Each one forwards to
+// aom_fft_2d_gen() with the matching scalar 1d transform, simple_transpose,
+// unpack_2d_output and a vec_size of 1 (one column per 1d transform call).
+
+// 2x2 forward transform.
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+// 4x4 forward transform.
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+// 8x8 forward transform.
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+// 16x16 forward transform.
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+// 32x32 forward transform.
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+// Generic 2d inverse transform. input holds interleaved (real, imaginary)
+// pairs, the pair for (y, x) being read from input[2 * (y * n + x)]; the final
+// result is written to output as an n x n matrix. Both temp and output are
+// used as scratch while the transform is assembled.
+// fft_single is used for columns 2..vec_size-1, fft_multi for the remaining
+// columns in steps of vec_size, and ifft_multi for the inverse passes.
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size) {
+ // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+ // and get real outputs.
+ // They are packed into columns 0 and 1 of output: real parts in rows
+ // 0..n/2, imaginary parts in rows n/2+1..n-1.
+ for (int y = 0; y <= n / 2; ++y) {
+ output[y * n] = input[2 * y * n];
+ output[y * n + 1] = input[2 * (y * n + n / 2)];
+ }
+ for (int y = n / 2 + 1; y < n; ++y) {
+ output[y * n] = input[2 * (y - n / 2) * n + 1];
+ output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+ }
+
+ for (int i = 0; i < 2; i += vec_size) {
+ ifft_multi(output + i, temp + i, n);
+ }
+
+ // For the other columns, since we don't have a full ifft for complex inputs
+ // we have to split them into the real and imaginary counterparts.
+ // Pack the real component, then the imaginary components.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + 1)] = input[2 * (y * n + x)];
+ }
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+ }
+ }
+ // Columns below vec_size that are not covered by the vectorized loop are
+ // transformed one at a time.
+ for (int y = 2; y < vec_size; y++) {
+ fft_single(output + y, temp + y, n);
+ }
+ // This is the part that can be sped up with SIMD
+ for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+ fft_multi(output + y, temp + y, n);
+ }
+
+ // Put the 0 and n/2 th results in the correct place.
+ for (int x = 0; x < n; ++x) {
+ output[x] = temp[x * n];
+ output[(n / 2) * n + x] = temp[x * n + 1];
+ }
+ // This rearranges and transposes.
+ for (int y = 1; y < n / 2; ++y) {
+ // Fill in the real columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + y * n] =
+ temp[(y + 1) + x * n] +
+ ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+ temp[(y + n / 2) + ((n - x) + n / 2) * n];
+ }
+ // Fill in the imag columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + (y + n / 2) * n] =
+ temp[(y + n / 2) + x * n] -
+ ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+ temp[(y + n / 2) + (n - x) * n];
+ }
+ }
+ // Final inverse pass over every column, followed by the transpose that
+ // leaves the result in output.
+ for (int y = 0; y < n; y += vec_size) {
+ ifft_multi(output + y, temp + y, n);
+ }
+ transpose(temp, output, n);
+}
+
+// Instantiate the scalar 1d inverse transforms aom_ifft1d_{2,4,8,16,32}_float
+// with the same scalar helpers as the forward transforms.
+// NOTE(review): the GEN_IFFT_* macros are presumably defined in fft_common.h
+// alongside GEN_FFT_* - confirm.
+GEN_IFFT_2(void, float, float, float, *, store_float)
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float)
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+
+// C versions of the 2d inverse transforms. Each one forwards to
+// aom_ifft_2d_gen() with a vec_size of 1, passing the same scalar forward
+// transform as both fft_single and fft_multi, the matching scalar inverse
+// transform as ifft_multi, and simple_transpose.
+
+// 2x2 inverse transform.
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+ aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+// 4x4 inverse transform.
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+ aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+// 8x8 inverse transform.
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+ aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+// 16x16 inverse transform.
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+// 32x32 inverse transform.
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
new file mode 100644
index 0000000000..3de1a045ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -0,0 +1,1056 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in] input Input buffer. See above for size restrictions.
+ * \param[out] output Output buffer. See above for size restrictions.
+ * \param[in] stride The spacing in number of elements between rows
+ * (or elements)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+ int stride);
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_2_float(const float *input, float *output, int stride);
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+void aom_ifft1d_2_float(const float *input, float *output, int stride);
+void aom_ifft1d_4_float(const float *input, float *output, int stride);
+void aom_ifft1d_8_float(const float *input, float *output, int stride);
+void aom_ifft1d_16_float(const float *input, float *output, int stride);
+void aom_ifft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+ int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in] input Input buffer to run the transform on (size n x n)
+ * \param[out] temp Working buffer for computing the transform (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square transform
+ * \param[in] tform Forward transform function
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] unpack Unpack function used to massage outputs to correct form
+ * \param[in] vec_size Vector size (the transform is done vec_size units at
+ * a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in] input Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp Working buffer for computations (size 2 x n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square transform
+ * \param[in] fft_single Forward transform function (non vectorized)
+ * \param[in] fft_multi Forward transform function (vectorized)
+ * \param[in] ifft_multi Inverse transform function (vectorized)
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] vec_size Vector size (the transform is done vec_size
+ * units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
+
+// Defines aom_fft1d_2_<suffix>: a 2-point forward transform that stores the
+// sum of the two inputs at output[0] and their difference at output[stride].
+// Unlike the larger sizes, it applies + and - to T_VEC values directly instead
+// of taking add/sub operations as parameters.
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+// Defines aom_fft1d_4_<suffix>: a 4-point real-to-complex forward transform.
+// Elements are read and written "stride" apart. Outputs 0..2 are the real
+// parts r_0, r_1, r_2 and output 3 is the imaginary part i_1, following the
+// packing described for aom_fft_1d_func_t. kWeight0 is zero and is only used
+// to negate a value by subtraction.
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w0 = add(i0, i2); \
+ const T_VEC w1 = sub(i0, i2); \
+ const T_VEC w2 = add(i1, i3); \
+ const T_VEC w3 = sub(i1, i3); \
+ store(output + 0 * stride, add(w0, w2)); \
+ store(output + 1 * stride, w1); \
+ store(output + 2 * stride, sub(w0, w2)); \
+ store(output + 3 * stride, sub(kWeight0, w3)); \
+ }
+
+// Defines aom_fft1d_8_<suffix>: an 8-point real-to-complex forward transform.
+// Elements are read and written "stride" apart. Outputs 0..4 are stored first
+// and outputs 5..7 after them, matching the "real parts, then imaginary parts"
+// packing described for aom_fft_1d_func_t. kWeight0 is zero (used to negate by
+// subtraction) and kWeight2 is 0.707107, approximately 1 / sqrt(2).
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w0 = add(i0, i4); \
+ const T_VEC w1 = sub(i0, i4); \
+ const T_VEC w2 = add(i2, i6); \
+ const T_VEC w3 = sub(i2, i6); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i1, i5); \
+ const T_VEC w8 = sub(i1, i5); \
+ const T_VEC w9 = add(i3, i7); \
+ const T_VEC w10 = sub(i3, i7); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ store(output + 0 * stride, add(w4, w11)); \
+ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 2 * stride, w5); \
+ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 4 * stride, sub(w4, w11)); \
+ store(output + 5 * stride, \
+ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
+ store(output + 6 * stride, sub(kWeight0, w12)); \
+ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
+ }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) /* Expands to the definition of aom_fft1d_16_<suffix>(): a fully unrolled 16-point 1-D transform written only in terms of the caller-supplied load/store/constant/add/sub/mul primitives. */ \
+ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { /* reads input[k * stride] and writes output[k * stride] for k = 0..15. NOTE(review): outputs 0..8 look like real parts and 9..15 like the imaginary parts of bins 1..7 -- confirm against the callers. */ \
+ const T_VEC kWeight0 = constant(0.0f); /* zero; sub(kWeight0, x) is how this body negates x */ \
+ const T_VEC kWeight2 = constant(0.707107f); /* ~cos(pi/4) == sin(pi/4) */ \
+ const T_VEC kWeight3 = constant(0.92388f); /* ~cos(pi/8) */ \
+ const T_VEC kWeight4 = constant(0.382683f); /* ~sin(pi/8) */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w0 = add(i0, i8); /* w0..w18 combine only the even-index inputs i0, i2, ..., i14 */ \
+ const T_VEC w1 = sub(i0, i8); \
+ const T_VEC w2 = add(i4, i12); \
+ const T_VEC w3 = sub(i4, i12); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i2, i10); \
+ const T_VEC w8 = sub(i2, i10); \
+ const T_VEC w9 = add(i6, i14); \
+ const T_VEC w10 = sub(i6, i14); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); /* sum of all even-index inputs */ \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i1, i9); /* w19..w37 repeat the w0..w18 pattern on the odd-index inputs i1, i3, ..., i15 */ \
+ const T_VEC w20 = sub(i1, i9); \
+ const T_VEC w21 = add(i5, i13); \
+ const T_VEC w22 = sub(i5, i13); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i3, i11); \
+ const T_VEC w27 = sub(i3, i11); \
+ const T_VEC w28 = add(i7, i15); \
+ const T_VEC w29 = sub(i7, i15); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); /* sum of all odd-index inputs */ \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ store(output + 0 * stride, add(w14, w33)); /* = sum of all 16 inputs */ \
+ store(output + 1 * stride, \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 3 * stride, \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+ store(output + 4 * stride, w15); /* = (i0 + i4 + i8 + i12) - (i2 + i6 + i10 + i14) */ \
+ store(output + 5 * stride, \
+ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
+ mul(kWeight3, w37[1])))); \
+ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 7 * stride, \
+ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
+ mul(kWeight4, w35[1])))); \
+ store(output + 8 * stride, sub(w14, w33)); /* = (sum of even-index inputs) - (sum of odd-index inputs) */ \
+ store(output + 9 * stride, \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+ store(output + 10 * stride, \
+ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
+ store(output + 11 * stride, \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+ store(output + 12 * stride, sub(kWeight0, w34)); /* = -((i1 + i5 + i9 + i13) - (i3 + i7 + i11 + i15)) */ \
+ store(output + 13 * stride, \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
+ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
+ store(output + 15 * stride, \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
+ }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) /* Expands to the definition of aom_fft1d_32_<suffix>(): a fully unrolled 32-point 1-D transform written only in terms of the caller-supplied load/store/constant/add/sub/mul primitives. */ \
+ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { /* reads input[k * stride] and writes output[k * stride] for k = 0..31. NOTE(review): outputs 0..16 look like real parts and 17..31 like the imaginary parts of bins 1..15 -- confirm against the callers. */ \
+ const T_VEC kWeight0 = constant(0.0f); /* zero; sub(kWeight0, x) is how this body negates x */ \
+ const T_VEC kWeight2 = constant(0.707107f); /* ~cos(pi/4) == sin(pi/4) */ \
+ const T_VEC kWeight3 = constant(0.92388f); /* ~cos(pi/8) */ \
+ const T_VEC kWeight4 = constant(0.382683f); /* ~sin(pi/8) */ \
+ const T_VEC kWeight5 = constant(0.980785f); /* ~cos(pi/16) */ \
+ const T_VEC kWeight6 = constant(0.19509f); /* ~sin(pi/16) */ \
+ const T_VEC kWeight7 = constant(0.83147f); /* ~cos(3*pi/16) */ \
+ const T_VEC kWeight8 = constant(0.55557f); /* ~sin(3*pi/16) */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w0 = add(i0, i16); /* w0..w46 combine only the even-index inputs i0, i2, ..., i30 */ \
+ const T_VEC w1 = sub(i0, i16); \
+ const T_VEC w2 = add(i8, i24); \
+ const T_VEC w3 = sub(i8, i24); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i4, i20); \
+ const T_VEC w8 = sub(i4, i20); \
+ const T_VEC w9 = add(i12, i28); \
+ const T_VEC w10 = sub(i12, i28); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); /* sum of the inputs whose index is a multiple of 4 */ \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i2, i18); \
+ const T_VEC w20 = sub(i2, i18); \
+ const T_VEC w21 = add(i10, i26); \
+ const T_VEC w22 = sub(i10, i26); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i6, i22); \
+ const T_VEC w27 = sub(i6, i22); \
+ const T_VEC w28 = add(i14, i30); \
+ const T_VEC w29 = sub(i14, i30); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); /* sum of the inputs whose index is 2 modulo 4 */ \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w38 = add(w14, w33); /* sum of all even-index inputs */ \
+ const T_VEC w39 = sub(w14, w33); \
+ const T_VEC w40[2] = { \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
+ }; \
+ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(sub(kWeight0, w12), \
+ mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w42[2] = { \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
+ }; \
+ const T_VEC w44[2] = { \
+ add(w18[0], \
+ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
+ }; \
+ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(w12, mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w46[2] = { \
+ add(w16[0], \
+ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
+ }; \
+ const T_VEC w47 = add(i1, i17); /* w47..w93 repeat the w0..w46 pattern on the odd-index inputs i1, i3, ..., i31 */ \
+ const T_VEC w48 = sub(i1, i17); \
+ const T_VEC w49 = add(i9, i25); \
+ const T_VEC w50 = sub(i9, i25); \
+ const T_VEC w51 = add(w47, w49); \
+ const T_VEC w52 = sub(w47, w49); \
+ const T_VEC w54 = add(i5, i21); \
+ const T_VEC w55 = sub(i5, i21); \
+ const T_VEC w56 = add(i13, i29); \
+ const T_VEC w57 = sub(i13, i29); \
+ const T_VEC w58 = add(w54, w56); \
+ const T_VEC w59 = sub(w54, w56); \
+ const T_VEC w61 = add(w51, w58); /* sum of the inputs whose index is 1 modulo 4 */ \
+ const T_VEC w62 = sub(w51, w58); \
+ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(sub(kWeight0, w50), \
+ mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(w50, mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w66 = add(i3, i19); \
+ const T_VEC w67 = sub(i3, i19); \
+ const T_VEC w68 = add(i11, i27); \
+ const T_VEC w69 = sub(i11, i27); \
+ const T_VEC w70 = add(w66, w68); \
+ const T_VEC w71 = sub(w66, w68); \
+ const T_VEC w73 = add(i7, i23); \
+ const T_VEC w74 = sub(i7, i23); \
+ const T_VEC w75 = add(i15, i31); \
+ const T_VEC w76 = sub(i15, i31); \
+ const T_VEC w77 = add(w73, w75); \
+ const T_VEC w78 = sub(w73, w75); \
+ const T_VEC w80 = add(w70, w77); /* sum of the inputs whose index is 3 modulo 4 */ \
+ const T_VEC w81 = sub(w70, w77); \
+ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(sub(kWeight0, w69), \
+ mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(w69, mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w85 = add(w61, w80); /* sum of all odd-index inputs */ \
+ const T_VEC w86 = sub(w61, w80); \
+ const T_VEC w87[2] = { \
+ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
+ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
+ }; \
+ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(sub(kWeight0, w59), \
+ mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w89[2] = { \
+ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
+ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
+ }; \
+ const T_VEC w91[2] = { \
+ add(w65[0], \
+ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+ sub(sub(kWeight0, w65[1]), \
+ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
+ }; \
+ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(w59, mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w93[2] = { \
+ add(w63[0], \
+ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+ sub(sub(kWeight0, w63[1]), \
+ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
+ }; \
+ store(output + 0 * stride, add(w38, w85)); /* = sum of all 32 inputs */ \
+ store(output + 1 * stride, \
+ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
+ store(output + 2 * stride, \
+ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
+ store(output + 3 * stride, \
+ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
+ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 5 * stride, \
+ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
+ store(output + 6 * stride, \
+ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
+ store(output + 7 * stride, \
+ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
+ store(output + 8 * stride, w39); /* = (inputs with index % 4 == 0) - (inputs with index % 4 == 2) */ \
+ store(output + 9 * stride, \
+ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
+ mul(kWeight5, w93[1])))); \
+ store(output + 10 * stride, \
+ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
+ mul(kWeight3, w92[1])))); \
+ store(output + 11 * stride, \
+ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
+ mul(kWeight7, w91[1])))); \
+ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 13 * stride, \
+ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
+ mul(kWeight8, w89[1])))); \
+ store(output + 14 * stride, \
+ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
+ mul(kWeight4, w88[1])))); \
+ store(output + 15 * stride, \
+ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
+ mul(kWeight6, w87[1])))); \
+ store(output + 16 * stride, sub(w38, w85)); /* = (sum of even-index inputs) - (sum of odd-index inputs) */ \
+ store(output + 17 * stride, \
+ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
+ store(output + 18 * stride, \
+ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
+ store(output + 19 * stride, \
+ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
+ store(output + 20 * stride, \
+ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
+ store(output + 21 * stride, \
+ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
+ store(output + 22 * stride, \
+ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
+ store(output + 23 * stride, \
+ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
+ store(output + 24 * stride, sub(kWeight0, w86)); /* = -((inputs with index % 4 == 1) - (inputs with index % 4 == 3)) */ \
+ store(output + 25 * stride, \
+ sub(sub(kWeight0, w46[1]), \
+ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
+ store(output + 26 * stride, \
+ sub(sub(kWeight0, w45[1]), \
+ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
+ store(output + 27 * stride, \
+ sub(sub(kWeight0, w44[1]), \
+ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
+ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
+ store(output + 29 * stride, \
+ sub(sub(kWeight0, w42[1]), \
+ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
+ store(output + 30 * stride, \
+ sub(sub(kWeight0, w41[1]), \
+ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
+ store(output + 31 * stride, \
+ sub(sub(kWeight0, w40[1]), \
+ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
+ }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) /* Expands to the definition of aom_ifft1d_2_<suffix>(): a 2-point 1-D butterfly built on the caller-supplied load/store primitives. */ \
+ ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { /* reads input[0] and input[stride]; writes output[0] and output[stride] */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); /* uses the built-in + operator on T_VEC: this macro takes no add/sub parameters */ \
+ store(output + 1 * stride, i0 - i1); /* likewise the built-in - operator; no scaling factor is applied */ \
+ }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) /* Expands to the definition of aom_ifft1d_4_<suffix>(): an unrolled 4-point 1-D transform built on the caller-supplied load/store/constant/add/sub primitives. */ \
+ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { /* reads input[k * stride] and writes output[k * stride] for k = 0..3; no multiply, so no scaling factor is applied here */ \
+ const T_VEC kWeight0 = constant(0.0f); /* zero; sub(kWeight0, x) is how this body negates x */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w2 = add(i0, i2); \
+ const T_VEC w3 = sub(i0, i2); \
+ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; /* w4[0] = 2 * i1; w4[1] (i3 - i3) is never read below */ \
+ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; /* w5[1] = -2 * i3; w5[0] (i1 - i1) is never read below */ \
+ store(output + 0 * stride, add(w2, w4[0])); /* = (i0 + i2) + 2 * i1 */ \
+ store(output + 1 * stride, add(w3, w5[1])); /* = (i0 - i2) - 2 * i3 */ \
+ store(output + 2 * stride, sub(w2, w4[0])); /* = (i0 + i2) - 2 * i1 */ \
+ store(output + 3 * stride, sub(w3, w5[1])); /* = (i0 - i2) + 2 * i3 */ \
+ }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) /* Expands to the definition of aom_ifft1d_8_<suffix>(): a fully unrolled 8-point 1-D transform written only in terms of the caller-supplied load/store/constant/add/sub/mul primitives. */ \
+ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { /* reads input[k * stride] and writes output[k * stride] for k = 0..7; the only multiplier used is kWeight2, so no overall scaling factor is applied here */ \
+ const T_VEC kWeight0 = constant(0.0f); /* zero; sub(kWeight0, x) is how this body negates x */ \
+ const T_VEC kWeight2 = constant(0.707107f); /* ~cos(pi/4) == sin(pi/4) */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w6 = add(i0, i4); /* w6..w13 combine only the even-index inputs i0, i2, i4, i6 */ \
+ const T_VEC w7 = sub(i0, i4); \
+ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; /* { 2 * i2, i6 - i6 } */ \
+ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; /* { i2 - i2, -2 * i6 } */ \
+ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; /* of w10..w13 only element [0] is read by the stores below */ \
+ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
+ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
+ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
+ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; /* w14..w21 combine only the odd-index inputs i1, i3, i5, i7 */ \
+ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
+ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
+ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
+ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; /* w18[0] = 2 * (i1 + i3) */ \
+ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
+ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
+ store(output + 0 * stride, add(w10[0], w18[0])); /* = i0 + i4 + 2 * (i1 + i2 + i3) */ \
+ store(output + 1 * stride, \
+ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
+ store(output + 2 * stride, add(w11[0], w19[1])); \
+ store(output + 3 * stride, \
+ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ store(output + 4 * stride, sub(w10[0], w18[0])); /* = i0 + i4 + 2 * i2 - 2 * (i1 + i3) */ \
+ store(output + 5 * stride, \
+ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
+ mul(kWeight2, w20[1])))); \
+ store(output + 6 * stride, sub(w11[0], w19[1])); \
+ store(output + 7 * stride, \
+ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) /* Expands to the definition of aom_ifft1d_16_<suffix>(): a fully unrolled 16-point 1-D transform written only in terms of the caller-supplied load/store/constant/add/sub/mul primitives. */ \
+ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { /* reads input[k * stride] and writes output[k * stride] for k = 0..15. NOTE(review): the two-element w arrays look like { real, imaginary } pairs -- confirm against the generator. */ \
+ const T_VEC kWeight0 = constant(0.0f); /* zero; sub(kWeight0, x) is how this body negates x */ \
+ const T_VEC kWeight2 = constant(0.707107f); /* ~cos(pi/4) == sin(pi/4) */ \
+ const T_VEC kWeight3 = constant(0.92388f); /* ~cos(pi/8) */ \
+ const T_VEC kWeight4 = constant(0.382683f); /* ~sin(pi/8) */ \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w14 = add(i0, i8); /* w14..w37 combine only the even-index inputs i0, i2, ..., i14 */ \
+ const T_VEC w15 = sub(i0, i8); \
+ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; /* { 2 * i4, i12 - i12 } */ \
+ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; /* { i4 - i4, -2 * i12 } */ \
+ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
+ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
+ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
+ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
+ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
+ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
+ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
+ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
+ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
+ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
+ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
+ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; /* of w30..w37 only element [0] is read by the stores below */ \
+ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
+ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+ const T_VEC w33[2] = { add(w20[0], \
+ sub(sub(kWeight0, mul(kWeight2, w28[0])), \
+ mul(kWeight2, w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
+ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
+ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; /* w38..w61 combine only the odd-index inputs i1, i3, ..., i15 */ \
+ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
+ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
+ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
+ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
+ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
+ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
+ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
+ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
+ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
+ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; /* w54[0] = 2 * (i1 + i3 + i5 + i7) */ \
+ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
+ const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+ const T_VEC w57[2] = { add(w44[0], \
+ sub(sub(kWeight0, mul(kWeight2, w52[0])), \
+ mul(kWeight2, w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
+ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
+ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ store(output + 0 * stride, add(w30[0], w54[0])); /* = i0 + i8 + 2 * (i1 + i2 + ... + i7); no scaling factor is applied */ \
+ store(output + 1 * stride, \
+ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
+ store(output + 2 * stride, \
+ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
+ store(output + 3 * stride, \
+ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
+ store(output + 4 * stride, add(w31[0], w55[1])); \
+ store(output + 5 * stride, \
+ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 6 * stride, \
+ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 7 * stride, \
+ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ store(output + 8 * stride, sub(w30[0], w54[0])); /* same two terms as output 0, subtracted instead of added */ \
+ store(output + 9 * stride, \
+ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
+ mul(kWeight4, w56[1])))); \
+ store(output + 10 * stride, \
+ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
+ mul(kWeight2, w58[1])))); \
+ store(output + 11 * stride, \
+ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
+ mul(kWeight3, w60[1])))); \
+ store(output + 12 * stride, sub(w31[0], w55[1])); /* same two terms as output 4, subtracted instead of added */ \
+ store(output + 13 * stride, \
+ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 14 * stride, \
+ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 15 * stride, \
+ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w30 = add(i0, i16); \
+ const T_VEC w31 = sub(i0, i16); \
+ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
+ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
+ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
+ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
+ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
+ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
+ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
+ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
+ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
+ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
+ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
+ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
+ const T_VEC w49[2] = { add(w36[0], \
+ sub(sub(kWeight0, mul(kWeight2, w44[0])), \
+ mul(kWeight2, w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
+ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
+ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
+ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
+ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
+ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
+ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
+ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
+ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
+ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
+ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
+ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
+ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
+ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
+ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
+ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
+ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
+ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
+ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
+ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
+ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
+ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
+ const T_VEC w73[2] = { add(w60[0], \
+ sub(sub(kWeight0, mul(kWeight2, w68[0])), \
+ mul(kWeight2, w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
+ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
+ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
+ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
+ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
+ const T_VEC w80[2] = { \
+ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
+ }; \
+ const T_VEC w81[2] = { \
+ add(w48[0], \
+ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
+ }; \
+ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
+ const T_VEC w83[2] = { add(w50[0], \
+ sub(sub(kWeight0, mul(kWeight2, w74[0])), \
+ mul(kWeight2, w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
+ const T_VEC w84[2] = { \
+ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
+ }; \
+ const T_VEC w85[2] = { \
+ add(w52[0], \
+ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
+ }; \
+ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
+ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
+ const T_VEC w88[2] = { \
+ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], \
+ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w89[2] = { \
+ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w92[2] = { \
+ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], \
+ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w93[2] = { \
+ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
+ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
+ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
+ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
+ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
+ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
+ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
+ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
+ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
+ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
+ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
+ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
+ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
+ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
+ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
+ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
+ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
+ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
+ const T_VEC w112[2] = { \
+ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
+ }; \
+ const T_VEC w113[2] = { \
+ add(w100[0], \
+ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
+ }; \
+ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
+ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
+ const T_VEC w116[2] = { \
+ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w117[2] = { \
+ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
+ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
+ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
+ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
+ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
+ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
+ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
+ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
+ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
+ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
+ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
+ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
+ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
+ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
+ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
+ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
+ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
+ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
+ const T_VEC w136[2] = { \
+ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
+ }; \
+ const T_VEC w137[2] = { \
+ add(w124[0], \
+ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
+ }; \
+ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
+ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
+ const T_VEC w140[2] = { \
+ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w141[2] = { \
+ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
+ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
+ const T_VEC w144[2] = { \
+ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
+ }; \
+ const T_VEC w145[2] = { \
+ add(w112[0], \
+ sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
+ }; \
+ const T_VEC w146[2] = { \
+ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
+ }; \
+ const T_VEC w147[2] = { \
+ add(w114[0], \
+ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
+ }; \
+ const T_VEC w148[2] = { \
+ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
+ }; \
+ const T_VEC w149[2] = { \
+ add(w116[0], \
+ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
+ }; \
+ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
+ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
+ const T_VEC w152[2] = { \
+ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], \
+ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w153[2] = { \
+ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w154[2] = { \
+ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w155[2] = { \
+ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w156[2] = { \
+ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], \
+ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
+ }; \
+ const T_VEC w157[2] = { \
+ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
+ }; \
+ store(output + 0 * stride, add(w78[0], w142[0])); \
+ store(output + 1 * stride, \
+ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
+ store(output + 2 * stride, \
+ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
+ store(output + 3 * stride, \
+ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
+ store(output + 4 * stride, \
+ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
+ store(output + 5 * stride, \
+ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
+ store(output + 6 * stride, \
+ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
+ store(output + 7 * stride, \
+ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
+ store(output + 8 * stride, add(w79[0], w143[1])); \
+ store(output + 9 * stride, \
+ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 10 * stride, \
+ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 11 * stride, \
+ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 12 * stride, \
+ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 13 * stride, \
+ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 14 * stride, \
+ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 15 * stride, \
+ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ store(output + 16 * stride, sub(w78[0], w142[0])); \
+ store(output + 17 * stride, \
+ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
+ mul(kWeight6, w144[1])))); \
+ store(output + 18 * stride, \
+ add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
+ mul(kWeight4, w146[1])))); \
+ store(output + 19 * stride, \
+ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
+ mul(kWeight8, w148[1])))); \
+ store(output + 20 * stride, \
+ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
+ mul(kWeight2, w150[1])))); \
+ store(output + 21 * stride, \
+ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
+ mul(kWeight7, w152[1])))); \
+ store(output + 22 * stride, \
+ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
+ mul(kWeight3, w154[1])))); \
+ store(output + 23 * stride, \
+ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
+ mul(kWeight5, w156[1])))); \
+ store(output + 24 * stride, sub(w79[0], w143[1])); \
+ store(output + 25 * stride, \
+ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 26 * stride, \
+ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 27 * stride, \
+ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 28 * stride, \
+ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 29 * stride, \
+ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 30 * stride, \
+ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 31 * stride, \
+ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ }
+
+#endif // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
new file mode 100644
index 0000000000..ee42be7393
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+ // Fills `kernel` with the four cubic interpolation weights for fractional
+ // position `x`.
+ //
+ // At x == 0 the weights are {0, 1, 0, 0} and at x == 1 they are {0, 0, 1, 0}.
+ //
+ // `x` is typically produced by an expression such as `u - floor(u)`, which is
+ // mathematically in [0, 1). Floating point rounding can nevertheless yield
+ // exactly 1; that value is accepted because the weights remain correct there.
+ static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+   assert(0 <= x && x <= 1);
+
+   const double x_sq = x * x;
+   const double x_cu = x_sq * x;
+
+   kernel[0] = -0.5 * x + x_sq - 0.5 * x_cu;
+   kernel[1] = 1.0 - 2.5 * x_sq + 1.5 * x_cu;
+   kernel[2] = 0.5 * x + 2.0 * x_sq - 1.5 * x_cu;
+   kernel[3] = -0.5 * x_sq + 0.5 * x_cu;
+ }
+
+ // Integer counterpart of get_cubic_kernel_dbl(): each of the four weights is
+ // multiplied by 2^DISFLOW_INTERP_BITS and rounded to an int with rint().
+ static INLINE void get_cubic_kernel_int(double x, int kernel[4]) {
+   double weights[4];
+   get_cubic_kernel_dbl(x, weights);
+
+   for (int tap = 0; tap < 4; ++tap) {
+     kernel[tap] = (int)rint(weights[tap] * (1 << DISFLOW_INTERP_BITS));
+   }
+ }
+
+// Compare two regions of width x height pixels, one rooted at position
+// (x, y) in src and the other at (x + u, y + v) in ref.
+// This function returns the sum of squared pixel differences between
+// the two regions.
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v, int16_t *dt) {
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int h_kernel[4];
+ int v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution.
+ const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1);
+ int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel));
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) {
+ uint8x16_t r = vld1q_u8(ref_start + i * stride);
+ uint16x8_t r0 = vmovl_u8(vget_low_u8(r));
+ uint16x8_t r1 = vmovl_u8(vget_high_u8(r));
+
+ int16x8_t s0 = vreinterpretq_s16_u16(r0);
+ int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3));
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3);
+
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+
+ int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6),
+ vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6));
+ vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum);
+ }
+
+ // Vertical convolution.
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel));
+ int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE);
+ int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE);
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3);
+
+ uint8x8_t s = vld1_u8(src + (i + y) * stride + x);
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3));
+
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ sum_lo = vrshrq_n_s32(sum_lo,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ sum_hi = vrshrq_n_s32(sum_hi,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16));
+ int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16));
+ vst1q_s16(dt + i * DISFLOW_PATCH_SIZE,
+ vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi)));
+ }
+}
+
+ // Computes the x-direction Sobel response for the patch whose top-left pixel
+ // is `src`, writing DISFLOW_PATCH_SIZE rows of 8 values each to `dst`
+ // (row pitch `dst_stride`).
+ //
+ // The filter is separable: {1, 0, -1} horizontally followed by {1, 2, 1}
+ // vertically. One pixel of context is read on every side of the patch.
+ static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+                                   int16_t *dst, int dst_stride) {
+   // Horizontal-pass output; two extra rows supply the vertical context.
+   int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+   // Start one row above and one column left of the patch.
+   const uint8_t *src_start = src - src_stride - 1;
+
+   // Horizontal pass. With taps {1, 0, -1} the convolution degenerates to
+   // (left neighbour - right neighbour), done as a widening subtract.
+   for (int row = 0; row < DISFLOW_PATCH_SIZE + 2; row++) {
+     const uint8x16_t px = vld1q_u8(src_start + row * src_stride);
+     const uint8x8_t left = vget_low_u8(px);
+     const uint8x8_t right = vget_low_u8(vextq_u8(px, px, 2));
+
+     vst1q_s16(tmp + row * DISFLOW_PATCH_SIZE,
+               vreinterpretq_s16_u16(vsubl_u8(left, right)));
+   }
+
+   // Vertical pass. {1, 2, 1} is {1, 1} applied twice, so each output is
+   // (above + mid) + (mid + below). The second pair sum becomes the first
+   // pair sum of the next row, which saves one addition per row.
+   int16x8_t mid = vld1q_s16(tmp + DISFLOW_PATCH_SIZE);
+   int16x8_t upper_pair = vaddq_s16(vld1q_s16(tmp), mid);
+   for (int row = 0; row < DISFLOW_PATCH_SIZE; row++) {
+     const int16x8_t below = vld1q_s16(tmp + (row + 2) * DISFLOW_PATCH_SIZE);
+     const int16x8_t lower_pair = vaddq_s16(mid, below);
+
+     vst1q_s16(dst + row * dst_stride, vaddq_s16(upper_pair, lower_pair));
+
+     upper_pair = lower_pair;
+     mid = below;
+   }
+ }
+
+ // Computes the y-direction Sobel response for the patch whose top-left pixel
+ // is `src`, writing DISFLOW_PATCH_SIZE rows of 8 values each to `dst`
+ // (row pitch `dst_stride`).
+ //
+ // The filter is separable: {1, 2, 1} horizontally followed by {1, 0, -1}
+ // vertically. One pixel of context is read on every side of the patch.
+ static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+                                   int16_t *dst, int dst_stride) {
+   // Horizontal-pass output; two extra rows supply the vertical context.
+   int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+   // Start one row above and one column left of the patch.
+   const uint8_t *src_start = src - src_stride - 1;
+
+   // Horizontal pass. {1, 2, 1} is {1, 1} applied twice, so each output is
+   // (left + centre) + (centre + right), using widening adds.
+   for (int row = 0; row < DISFLOW_PATCH_SIZE + 2; row++) {
+     const uint8x16_t px = vld1q_u8(src_start + row * src_stride);
+     const uint8x8_t left = vget_low_u8(px);
+     const uint8x8_t centre = vget_low_u8(vextq_u8(px, px, 1));
+     const uint8x8_t right = vget_low_u8(vextq_u8(px, px, 2));
+
+     const uint16x8_t left_pair = vaddl_u8(left, centre);
+     const uint16x8_t right_pair = vaddl_u8(centre, right);
+
+     vst1q_s16(tmp + row * DISFLOW_PATCH_SIZE,
+               vreinterpretq_s16_u16(vaddq_u16(left_pair, right_pair)));
+   }
+
+   // Vertical pass. All ten intermediate rows are loaded up front so that no
+   // row is fetched twice.
+   int16x8_t rows[10];
+   load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &rows[0], &rows[1], &rows[2],
+                 &rows[3], &rows[4], &rows[5], &rows[6], &rows[7], &rows[8],
+                 &rows[9]);
+
+   // With taps {1, 0, -1} each output is (row above - row below).
+   for (int row = 0; row < DISFLOW_PATCH_SIZE; row++) {
+     vst1q_s16(dst + row * dst_stride, vsubq_s16(rows[row], rows[row + 2]));
+   }
+ }
+
+// Computes the components of the system of equations used to solve for
+// a flow vector.
+//
+// The flow equations are a least-squares system, derived as follows:
+//
+// For each pixel in the patch, we calculate the current error `dt`,
+// and the x and y gradients `dx` and `dy` of the source patch.
+// This means that, to first order, the squared error for this pixel is
+//
+// (dt + u * dx + v * dy)^2
+//
+// where (u, v) are the incremental changes to the flow vector.
+//
+// We then want to find the values of u and v which minimize the sum
+// of the squared error across all pixels. Conveniently, this fits exactly
+// into the form of a least squares problem, with one equation
+//
+// u * dx + v * dy = -dt
+//
+// for each pixel.
+//
+// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE,
+// and absorbing the - sign elsewhere, this results in the least squares system
+//
+// M = |sum(dx * dx) sum(dx * dy)|
+// |sum(dx * dy) sum(dy * dy)|
+//
+// b = |sum(dx * dt)|
+// |sum(dy * dt)|
+ // Builds the regularized 2x2 matrix M from the gradient patches `dx` and `dy`
+ // (row pitches `dx_stride` / `dy_stride`, 8 values read per row) and writes
+ // its inverse to M_inv[0..3] in row-major order.
+ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+                                        const int16_t *dy, int dy_stride,
+                                        double *M_inv) {
+   // Per-lane running sums of dx*dx, dx*dy and dy*dy.
+   int32x4_t acc_xx = vdupq_n_s32(0);
+   int32x4_t acc_xy = vdupq_n_s32(0);
+   int32x4_t acc_yy = vdupq_n_s32(0);
+
+   for (int row = 0; row < DISFLOW_PATCH_SIZE; row++) {
+     const int16x8_t gx = vld1q_s16(dx + row * dx_stride);
+     const int16x8_t gy = vld1q_s16(dy + row * dy_stride);
+     const int16x4_t gx_lo = vget_low_s16(gx);
+     const int16x4_t gx_hi = vget_high_s16(gx);
+     const int16x4_t gy_lo = vget_low_s16(gy);
+     const int16x4_t gy_hi = vget_high_s16(gy);
+
+     acc_xx = vmlal_s16(acc_xx, gx_lo, gx_lo);
+     acc_xx = vmlal_s16(acc_xx, gx_hi, gx_hi);
+
+     acc_xy = vmlal_s16(acc_xy, gx_lo, gy_lo);
+     acc_xy = vmlal_s16(acc_xy, gx_hi, gy_hi);
+
+     acc_yy = vmlal_s16(acc_yy, gy_lo, gy_lo);
+     acc_yy = vmlal_s16(acc_yy, gy_hi, gy_hi);
+   }
+
+   // M is symmetric, so the off-diagonal accumulator is used for both slots.
+   int32x4_t sums[4] = { acc_xx, acc_xy, acc_xy, acc_yy };
+   const int32x4_t totals = horizontal_add_4d_s32x4(sums);
+
+   // Regularization: add k * I with k = 1 to M before inverting, which keeps
+   // the matrix invertible. The original authors note that the other entries
+   // are typically far larger than 1, so this barely perturbs the solution,
+   // and that it keeps every entry a whole number.
+   const double m00 = (double)vgetq_lane_s32(totals, 0) + 1;
+   const double m01 = (double)vgetq_lane_s32(totals, 1);
+   const double m10 = (double)vgetq_lane_s32(totals, 2);
+   const double m11 = (double)vgetq_lane_s32(totals, 3) + 1;
+
+   // Closed-form inverse of a 2x2 matrix.
+   double det = (m00 * m11) - (m01 * m10);
+   assert(det >= 1);
+   const double det_inv = 1 / det;
+
+   M_inv[0] = m11 * det_inv;
+   M_inv[1] = -m01 * det_inv;
+   M_inv[2] = -m10 * det_inv;
+   M_inv[3] = m00 * det_inv;
+ }
+
+ // Accumulates the products dx * dt and dy * dt over the patch and stores the
+ // two reduced totals to b[0] and b[1].
+ //
+ // Each of dx, dy and dt is read as DISFLOW_PATCH_SIZE rows of 8 values, using
+ // the corresponding row pitch argument.
+ static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
+                                        const int16_t *dy, int dy_stride,
+                                        const int16_t *dt, int dt_stride,
+                                        int *b) {
+   int32x4_t acc_x = vdupq_n_s32(0);
+   int32x4_t acc_y = vdupq_n_s32(0);
+
+   for (int row = 0; row < DISFLOW_PATCH_SIZE; row++) {
+     const int16x8_t gx = vld1q_s16(dx + row * dx_stride);
+     const int16x8_t gy = vld1q_s16(dy + row * dy_stride);
+     const int16x8_t err = vld1q_s16(dt + row * dt_stride);
+     const int16x4_t err_lo = vget_low_s16(err);
+     const int16x4_t err_hi = vget_high_s16(err);
+
+     acc_x = vmlal_s16(acc_x, vget_low_s16(gx), err_lo);
+     acc_x = vmlal_s16(acc_x, vget_high_s16(gx), err_hi);
+
+     acc_y = vmlal_s16(acc_y, vget_low_s16(gy), err_lo);
+     acc_y = vmlal_s16(acc_y, vget_high_s16(gy), err_hi);
+   }
+
+   // Reduce the two accumulators and write both totals with a single store.
+   const int32x4_t partial = horizontal_add_2d_s32(acc_x, acc_y);
+   vst1_s32(b, add_pairwise_s32x4(partial));
+ }
+
+ // NEON version of the per-point flow refinement.
+ //
+ // Refines the flow vector (*u, *v) for the patch rooted at (x, y) in `src`
+ // against `ref`. The gradients of the source patch and the inverse of the
+ // least-squares matrix are computed once; the loop then re-evaluates the
+ // error against `ref` for the current flow estimate and applies an update,
+ // for at most DISFLOW_MAX_ITR iterations.
+ //
+ // `src` and `ref` share `stride`. `*u` and `*v` are read as the initial
+ // estimate and updated in place.
+ void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref,
+                                     int x, int y, int width, int height,
+                                     int stride, double *u, double *v) {
+   int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+   int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+   int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+   double inv[4];
+   int rhs[2];
+
+   // The gradients depend only on the source patch, so they (and the matrix
+   // inverse derived from them) are computed a single time up front.
+   const uint8_t *src_patch = src + (y * stride + x);
+   sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+   sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+   compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, inv);
+
+   for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+     compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
+     compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
+                         DISFLOW_PATCH_SIZE, rhs);
+
+     // Solve the 2x2 system for the incremental flow update.
+     const double step_u = inv[0] * rhs[0] + inv[1] * rhs[1];
+     const double step_v = inv[2] * rhs[0] + inv[3] * rhs[1];
+
+     // The applied update is scaled and limited to [-2, 2] per component.
+     *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+     *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+     // Convergence is judged on the raw (unscaled, unclamped) step.
+     if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+       break;
+     }
+   }
+ }
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c
new file mode 100644
index 0000000000..284d1bd7b8
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+
+#define FAST_BARRIER 18
+
+ // Returns the size, in bytes, of a CornerList object.
+ size_t av1_get_corner_list_size(void) {
+   return sizeof(CornerList);
+ }
+
+ // Allocates a zero-initialized CornerList, marked as not yet valid.
+ //
+ // When CONFIG_MULTITHREAD is enabled the embedded mutex is initialized too.
+ //
+ // Returns NULL if the allocation fails or if the mutex cannot be
+ // initialized; in the latter case the memory is released first, so the
+ // caller never receives a list with an unusable mutex.
+ CornerList *av1_alloc_corner_list(void) {
+   CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners));
+   if (!corners) {
+     return NULL;
+   }
+
+   corners->valid = false;
+ #if CONFIG_MULTITHREAD
+   // pthread_mutex_init() reports failure through a nonzero return value.
+   // Locking or destroying a mutex that was never initialized is undefined,
+   // so treat this like an allocation failure.
+   if (pthread_mutex_init(&corners->mutex, NULL) != 0) {
+     aom_free(corners);
+     return NULL;
+   }
+ #endif  // CONFIG_MULTITHREAD
+   return corners;
+ }
+
+ // Runs the FAST-9 detector (with non-maximum suppression) on layer 0 of
+ // `pyr` and stores up to MAX_CORNERS results in `corners`.
+ //
+ // Corners are stored as interleaved (x, y) pairs and corners->num_corners is
+ // set to the number stored. Returns false if the detector reports a negative
+ // corner count, true otherwise.
+ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+   const uint8_t *const buf = pyr->layers[0].buffer;
+   const int width = pyr->layers[0].width;
+   const int height = pyr->layers[0].height;
+   const int stride = pyr->layers[0].stride;
+
+   int *scores = NULL;
+   int num_corners;
+   xy *const frame_corners_xy = aom_fast9_detect_nonmax(
+       buf, width, height, stride, FAST_BARRIER, &scores, &num_corners);
+   // NOTE(review): nothing is freed on this path; presumably the detector
+   // releases its own buffers when it reports failure - confirm.
+   if (num_corners < 0) return false;
+
+   if (num_corners > MAX_CORNERS) {
+     // There are more than MAX_CORNERS corners available, so keep only the
+     // sharpest ones, as these will be the most useful for flow estimation.
+
+     // Count how many corners have each possible score.
+     int histogram[256];
+     av1_zero(histogram);
+     for (int i = 0; i < num_corners; i++) {
+       assert(FAST_BARRIER <= scores[i] && scores[i] <= 255);
+       histogram[scores[i]]++;
+     }
+
+     // Walk down from the highest score, accepting whole buckets for as long
+     // as the running total stays within MAX_CORNERS. The first bucket that
+     // does not fit becomes the (exclusive) threshold; -1 means none was hit.
+     int found_corners = 0;
+     int bucket = 255;
+     while (bucket >= 0 &&
+            found_corners + histogram[bucket] <= MAX_CORNERS) {
+       found_corners += histogram[bucket];
+       bucket--;
+     }
+     const int threshold = bucket;
+     assert(threshold != -1 && "Failed to select a valid threshold");
+
+     // Copy every corner whose score is strictly above the threshold.
+     int copied_corners = 0;
+     for (int i = 0; i < num_corners; i++) {
+       if (scores[i] <= threshold) continue;
+
+       assert(copied_corners < MAX_CORNERS);
+       int *const out = &corners->corners[2 * copied_corners];
+       out[0] = frame_corners_xy[i].x;
+       out[1] = frame_corners_xy[i].y;
+       copied_corners++;
+     }
+     assert(copied_corners == found_corners);
+     corners->num_corners = copied_corners;
+   } else {
+     // Everything fits: use all detected corners. The copy is skipped when
+     // there is nothing to copy.
+     if (num_corners != 0) {
+       memcpy(corners->corners, frame_corners_xy,
+              sizeof(*frame_corners_xy) * num_corners);
+     }
+     corners->num_corners = num_corners;
+   }
+
+   free(scores);
+   free(frame_corners_xy);
+   return true;
+ }
+
+ // Ensures that `corners` holds the corner list for `pyr`, computing it only
+ // if it is not already marked valid.
+ //
+ // With CONFIG_MULTITHREAD, corners->mutex is held for the whole check and
+ // computation. Returns the resulting state of the `valid` flag.
+ bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+   assert(corners);
+
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_lock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+
+   bool is_valid = corners->valid;
+   if (!is_valid) {
+     is_valid = compute_corner_list(pyr, corners);
+     corners->valid = is_valid;
+   }
+
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_unlock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+   return is_valid;
+ }
+
+#ifndef NDEBUG
+ // Reports whether a corner list has already been computed.
+ //
+ // This exists mainly for debug assertions. The `valid` flag may only be read
+ // while corners->mutex is held, so a plain
+ //   assert(corners->valid);
+ // would be incorrect. Use this helper instead:
+ //   assert(aom_is_corner_list_valid(corners));
+ bool aom_is_corner_list_valid(CornerList *corners) {
+   assert(corners);
+
+   // Per the comments in the CornerList struct, the mutex must be taken
+   // before reading or writing the "valid" flag. It is also held while the
+   // corner list is being computed, so this read cannot observe a list that
+   // another thread is still filling in.
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_lock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+
+   const bool is_valid = corners->valid;
+
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_unlock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+
+   return is_valid;
+ }
+#endif
+
+ // Marks `corners` as not containing valid data, so that the next call to
+ // av1_compute_corner_list() recomputes it. A NULL pointer is ignored.
+ void av1_invalidate_corner_list(CornerList *corners) {
+   if (!corners) return;
+
+   // The flag may only be written while the mutex is held.
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_lock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+   corners->valid = false;
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_unlock(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+ }
+
+ // Destroys the mutex (when CONFIG_MULTITHREAD is enabled) and releases the
+ // memory of a list obtained from av1_alloc_corner_list(). A NULL pointer is
+ // ignored.
+ void av1_free_corner_list(CornerList *corners) {
+   if (!corners) return;
+
+ #if CONFIG_MULTITHREAD
+   pthread_mutex_destroy(&corners->mutex);
+ #endif  // CONFIG_MULTITHREAD
+   aom_free(corners);
+ }
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h
new file mode 100644
index 0000000000..d05846ce5d
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <memory.h>
+
+#include "aom_dsp/pyramid.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_CORNERS 4096
+
+ typedef struct corner_list {
+ #if CONFIG_MULTITHREAD
+ // Mutex which is used to prevent the corner list from being computed twice
+ // at the same time
+ //
+ // Semantics:
+ // * This mutex must be held whenever reading or writing the `valid` flag
+ //
+ // * This mutex must also be held while computing the corner list,
+ // to ensure that only one thread may do so at a time.
+ //
+ // * However, once you have read the valid flag and seen a true value,
+ // it is safe to drop the mutex and read from the remaining fields.
+ // This is because, once the corner list is computed, its contents
+ // will not be changed until the parent frame buffer is recycled,
+ // which will not happen until there are no more outstanding references
+ // to the frame buffer.
+ pthread_mutex_t mutex;
+ #endif // CONFIG_MULTITHREAD
+ // Flag indicating whether the corner list contains valid data
+ bool valid;
+ // Number of corners found
+ int num_corners;
+ // (x, y) coordinates of each corner, stored interleaved:
+ // corners[2 * i] is the x coordinate of corner i and corners[2 * i + 1]
+ // is its y coordinate.
+ int corners[2 * MAX_CORNERS];
+ } CornerList;
+
+ // Returns sizeof(CornerList).
+ size_t av1_get_corner_list_size(void);
+
+ // Allocates a zero-initialized CornerList with `valid` set to false.
+ // Returns NULL if the allocation fails.
+ CornerList *av1_alloc_corner_list(void);
+
+ // Computes the corner list from layer 0 of `pyr` unless `corners` is already
+ // marked valid, and returns the resulting value of the `valid` flag.
+ // `corners` must not be NULL.
+ bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
+
+ #ifndef NDEBUG
+ // Check if a corner list has already been computed.
+ // This is mostly a debug helper - as it is necessary to hold corners->mutex
+ // while reading the valid flag, we cannot just write:
+ // assert(corners->valid);
+ // This function allows the check to be correctly written as:
+ // assert(aom_is_corner_list_valid(corners));
+ bool aom_is_corner_list_valid(CornerList *corners);
+ #endif
+
+ // Clears the `valid` flag of `corners`. A NULL pointer is ignored.
+ void av1_invalidate_corner_list(CornerList *corners);
+
+ // Frees a list returned by av1_alloc_corner_list(). A NULL pointer is ignored.
+ void av1_free_corner_list(CornerList *corners);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
new file mode 100644
index 0000000000..cef719b68d
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/pyramid.h"
+#include "aom_scale/yv12config.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Scaled variance of the MATCH_SZ x MATCH_SZ window of `frame` centered at
+   (x, y).
+
+   Returns sumsq * MATCH_SZ_SQ - sum * sum, where `sum` and `sumsq` are the sums
+   of the window's pixels and of their squares. This is the population variance
+   of the window multiplied by MATCH_SZ_SQ * MATCH_SZ_SQ. All accumulation is
+   done in int; only the final value is converted to double.
+*/
+static double compute_variance(const unsigned char *frame, int stride, int x,
+                               int y) {
+  int pixel_sum = 0;
+  int square_sum = 0;
+  for (int row = 0; row < MATCH_SZ; ++row) {
+    const int row_offset = (row + y - MATCH_SZ_BY2) * stride;
+    for (int col = 0; col < MATCH_SZ; ++col) {
+      const int px = frame[row_offset + (col + x - MATCH_SZ_BY2)];
+      pixel_sum += px;
+      square_sum += px * px;
+    }
+  }
+  return (double)(square_sum * MATCH_SZ_SQ - pixel_sum * pixel_sum);
+}
+
+/* Scaled cross-correlation between the MATCH_SZ x MATCH_SZ window of frame1
+   centered at (x1, y1) and the window of frame2 centered at (x2, y2).
+
+   With N = MATCH_SZ_SQ, v1/v2 the pixels of the two windows, the result is
+
+     (N * sum(v1 * v2) - sum(v1) * sum(v2)) / sqrt(N * sum(v2 * v2) - sum(v2)^2)
+
+   which equals corr(frame1, frame2) * stddev(frame1) * N. The stddev(frame1)
+   factor is not divided out here; determine_correspondence() accounts for it
+   by comparing against sqrt(compute_variance()) of the frame1 window.
+
+   If the frame2 window is completely flat the denominator is zero, so the
+   floating-point division does not yield a finite value.
+*/
+double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1,
+                                       int x1, int y1,
+                                       const unsigned char *frame2, int stride2,
+                                       int x2, int y2) {
+  int total1 = 0;
+  int total2 = 0;
+  int squares2 = 0;
+  int products = 0;
+  for (int row = 0; row < MATCH_SZ; ++row) {
+    const int base1 = (row + y1 - MATCH_SZ_BY2) * stride1;
+    const int base2 = (row + y2 - MATCH_SZ_BY2) * stride2;
+    for (int col = 0; col < MATCH_SZ; ++col) {
+      const int a = frame1[base1 + (col + x1 - MATCH_SZ_BY2)];
+      const int b = frame2[base2 + (col + x2 - MATCH_SZ_BY2)];
+      total1 += a;
+      total2 += b;
+      squares2 += b * b;
+      products += a * b;
+    }
+  }
+  const int scaled_var2 = squares2 * MATCH_SZ_SQ - total2 * total2;
+  const int scaled_cov = products * MATCH_SZ_SQ - total1 * total2;
+  return scaled_cov / sqrt((double)scaled_var2);
+}
+
+/* Returns 1 if a MATCH_SZ x MATCH_SZ window centered at (pointx, pointy) lies
+   entirely inside a width x height frame, and 0 otherwise. */
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
+  // Window would extend past the left or top edge
+  if (pointx < MATCH_SZ_BY2 || pointy < MATCH_SZ_BY2) return 0;
+  // Window would extend past the right edge
+  if (pointx + MATCH_SZ_BY2 >= width) return 0;
+  // Window would extend past the bottom edge
+  if (pointy + MATCH_SZ_BY2 >= height) return 0;
+  return 1;
+}
+
+/* Returns 1 if the two points are no further apart than 1/16 of the larger
+   frame dimension, and 0 otherwise. The Euclidean distance is compared in
+   squared form, so no square root is needed. */
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+                                int point2y, int width, int height) {
+  const int longest_side = (width < height) ? height : width;
+  const int max_dist = longest_side >> 4;
+  const int dx = point1x - point2x;
+  const int dy = point1y - point2y;
+  return (dx * dx + dy * dy) <= max_dist * max_dist;
+}
+
+// Refine a list of correspondences in place, using two local search passes.
+//
+// Pass 1: for each correspondence, try every offset in
+// [-SEARCH_SZ_BY2, SEARCH_SZ_BY2]^2 around the reference point (rx, ry) and
+// move (rx, ry) by the offset whose window correlates best with the source
+// window at (x, y).
+//
+// Pass 2: the same search with the roles swapped. Offsets are tried around
+// the source point (x, y), scored against the reference window at the
+// (already refined) point (rx, ry), and (x, y) is moved.
+//
+// Candidates rejected by is_eligible_point() or is_eligible_distance() are
+// skipped. If no candidate scores above 0.0 the point is left unchanged,
+// since best_x and best_y stay at 0.
+//
+// width/height are the frame dimensions used for the eligibility checks;
+// src_stride/ref_stride are the row strides of src/ref.
+static void improve_correspondence(const unsigned char *src,
+ const unsigned char *ref, int width,
+ int height, int src_stride, int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {
+ int i;
+ // Pass 1: refine the reference-frame point of each correspondence
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ // For this algorithm, all points have integer coordinates.
+ // It's a little more efficient to convert them to ints once,
+ // before the inner loops
+ int x0 = (int)correspondences[i].x;
+ int y0 = (int)correspondences[i].y;
+ int rx0 = (int)correspondences[i].rx;
+ int ry0 = (int)correspondences[i].ry;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue;
+ if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref,
+ ref_stride, rx0 + x, ry0 + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += best_x;
+ correspondences[i].ry += best_y;
+ }
+ // Pass 2: refine the source-frame point, using the rx/ry values written by
+ // pass 1. Note that the argument order of the correlation call is swapped,
+ // so the reference window is the fixed template here.
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ int x0 = (int)correspondences[i].x;
+ int y0 = (int)correspondences[i].y;
+ int rx0 = (int)correspondences[i].rx;
+ int ry0 = (int)correspondences[i].ry;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue;
+ if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(
+ ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+// Match corners of `src` against corners of `ref`.
+//
+// src_corners / ref_corners hold interleaved (x, y) pairs, with
+// num_src_corners / num_ref_corners pairs respectively. For every source
+// corner, the eligible reference corner with the highest cross-correlation
+// score is selected, and the pair is accepted if that score exceeds the
+// THRESHOLD_NCC test below.
+//
+// Accepted pairs are written to `correspondences`, which receives at most one
+// entry per source corner, and are then refined by improve_correspondence().
+//
+// Returns the number of correspondences written.
+static int determine_correspondence(const unsigned char *src,
+ const int *src_corners, int num_src_corners,
+ const unsigned char *ref,
+ const int *ref_corners, int num_ref_corners,
+ int width, int height, int src_stride,
+ int ref_stride,
+ Correspondence *correspondences) {
+ // TODO(sarahparker) Improve this to include 2-way match
+ int i, j;
+ int num_correspondences = 0;
+ for (i = 0; i < num_src_corners; ++i) {
+ double best_match_ncc = 0.0;
+ double template_norm;
+ // Stays at -1 if no reference corner scores above 0.0
+ int best_match_j = -1;
+ if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width,
+ height))
+ continue;
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(
+ src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+ // but need to account for the normalization in
+ // av1_compute_cross_correlation.
+ template_norm = compute_variance(src, src_stride, src_corners[2 * i],
+ src_corners[2 * i + 1]);
+ // When no match was found, best_match_ncc is still 0.0, so this test can
+ // only pass with best_match_j == -1 if the right-hand side is negative.
+ // NOTE(review): assumes compute_variance() never returns a negative
+ // value - confirm.
+ if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+ correspondences[num_correspondences].x = src_corners[2 * i];
+ correspondences[num_correspondences].y = src_corners[2 * i + 1];
+ correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(src, ref, width, height, src_stride, ref_stride,
+ correspondences, num_correspondences);
+ return num_correspondences;
+}
+
+/* Estimate global motion between `src` and `ref` by matching corner features.
+
+   The image pyramid and corner list attached to each frame are computed (or
+   reused, if already present), corners of the two frames are matched by
+   determine_correspondence(), and the resulting correspondences are passed to
+   ransac() to fit `num_motion_models` models of the given `type` into
+   `motion_models`.
+
+   Returns the result of ransac(). If computing a pyramid or corner list, or
+   allocating the correspondence array, fails, *mem_alloc_failed is set to true
+   and false is returned. ransac() may also set *mem_alloc_failed.
+
+   Level 0 of the two pyramids is assumed to have identical dimensions (this
+   is only checked by assert).
+*/
+bool av1_compute_global_motion_feature_match(
+    TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
+    int bit_depth, MotionModel *motion_models, int num_motion_models,
+    bool *mem_alloc_failed) {
+  int num_correspondences;
+  Correspondence *correspondences;
+  ImagePyramid *src_pyramid = src->y_pyramid;
+  CornerList *src_corners = src->corners;
+  ImagePyramid *ref_pyramid = ref->y_pyramid;
+  CornerList *ref_corners = ref->corners;
+
+  // Precompute information we will need about each frame
+  if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  // The reference frame needs its own corner list: ref_corners->corners and
+  // ref_corners->num_corners are read by determine_correspondence() below.
+  if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+
+  const uint8_t *src_buffer = src_pyramid->layers[0].buffer;
+  const int src_width = src_pyramid->layers[0].width;
+  const int src_height = src_pyramid->layers[0].height;
+  const int src_stride = src_pyramid->layers[0].stride;
+
+  const uint8_t *ref_buffer = ref_pyramid->layers[0].buffer;
+  assert(ref_pyramid->layers[0].width == src_width);
+  assert(ref_pyramid->layers[0].height == src_height);
+  const int ref_stride = ref_pyramid->layers[0].stride;
+
+  // find correspondences between the two images
+  // At most one correspondence is produced per source corner, so
+  // num_corners entries are always enough.
+  correspondences = (Correspondence *)aom_malloc(src_corners->num_corners *
+                                                 sizeof(*correspondences));
+  if (!correspondences) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+  num_correspondences = determine_correspondence(
+      src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer,
+      ref_corners->corners, ref_corners->num_corners, src_width, src_height,
+      src_stride, ref_stride, correspondences);
+
+  bool result = ransac(correspondences, num_correspondences, type,
+                       motion_models, num_motion_models, mem_alloc_failed);
+
+  aom_free(correspondences);
+  return result;
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.h b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
new file mode 100644
index 0000000000..4435d2c767
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+bool av1_compute_global_motion_feature_match(
+ TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
+ int bit_depth, MotionModel *motion_models, int num_motion_models,
+ bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c
new file mode 100644
index 0000000000..147a8ab3b3
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Dense Inverse Search flow algorithm
+// Paper: https://arxiv.org/abs/1603.03590
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/pyramid.h"
+#include "aom_mem/aom_mem.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Amount to downsample the flow field by.
+// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate
+// one flow point for each 4x4 pixel region of the frame
+// Must be a power of 2
+#define DOWNSAMPLE_SHIFT 3
+#define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT)
+
+// Filters used when upscaling the flow field from one pyramid level
+// to another. See upscale_flow_component for details on kernel selection
+#define FLOW_UPSCALE_TAPS 4
+
+// Number of outermost flow field entries (on each edge) which can't be
+// computed, because the patch they correspond to extends outside of the
+// frame
+// The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is
+// (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries
+#define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT)
+
+// Number of extra padding entries on each side of the flow field.
+// These samples are added so that we do not need to apply clamping when
+// interpolating or upsampling the flow field
+#define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2)
+
+// When downsampling the flow field, each flow field entry covers a square
+// region of pixels in the image pyramid. This value is equal to the position
+// of the center of that region, as an offset from the top/left edge.
+//
+// Note: Using ((DOWNSAMPLE_FACTOR - 1) / 2) is equivalent to the more
+// natural expression ((DOWNSAMPLE_FACTOR / 2) - 1),
+// unless DOWNSAMPLE_FACTOR == 1 (ie, no downsampling), in which case
+// this gives the correct offset of 0 instead of -1.
+#define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2)
+
+// Filter taps used by upscale_flow_component(). Each row holds
+// FLOW_UPSCALE_TAPS coefficients which sum to 128 / 128 = 1.
+// Row 0 is used for the left (or top) output sample of each pair, and
+// row 1 for the right (or bottom) one.
+static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = {
+ // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively
+ { -3 / 128., 29 / 128., 111 / 128., -9 / 128. },
+ { -9 / 128., 111 / 128., 29 / 128., -3 / 128. }
+};
+
+// Fill kernel[0..3] with the cubic interpolation weights for a sample at
+// fractional position x between two input samples.
+//
+// At x == 0 the weights are { 0, 1, 0, 0 } and at x == 1 they are
+// { 0, 0, 1, 0 }, so the second and third taps correspond to the samples on
+// either side of the interpolated position.
+static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+  // Callers derive x from expressions like `u - floor(u)`, which is in
+  // [0, 1) mathematically but may round up to exactly 1 in floating point.
+  // x == 1 still interpolates correctly, so it is accepted here.
+  assert(0 <= x && x <= 1);
+
+  const double x_sq = x * x;
+  const double x_cu = x_sq * x;
+
+  kernel[0] = -0.5 * x + x_sq - 0.5 * x_cu;
+  kernel[1] = 1.0 - 2.5 * x_sq + 1.5 * x_cu;
+  kernel[2] = 0.5 * x + 2.0 * x_sq - 1.5 * x_cu;
+  kernel[3] = -0.5 * x_sq + 0.5 * x_cu;
+}
+
+// Integer version of get_cubic_kernel_dbl(): each weight is multiplied by
+// (1 << DISFLOW_INTERP_BITS) and rounded to an integer with rint().
+static INLINE void get_cubic_kernel_int(double x, int kernel[4]) {
+  double weights[4];
+  get_cubic_kernel_dbl(x, weights);
+
+  for (int tap = 0; tap < 4; ++tap) {
+    kernel[tap] = (int)rint(weights[tap] * (1 << DISFLOW_INTERP_BITS));
+  }
+}
+
+// Apply a 4-tap kernel to the four consecutive samples p[0..3] and return
+// the weighted sum.
+static INLINE double get_cubic_value_dbl(const double *p,
+ const double kernel[4]) {
+ return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
+ kernel[3] * p[3];
+}
+
+// Apply a 4-tap integer kernel to the four consecutive samples p[0..3] and
+// return the weighted sum. No rounding or shifting is applied here; the
+// result keeps the fixed-point scale of the kernel.
+static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) {
+  int acc = 0;
+  for (int tap = 0; tap < 4; ++tap) {
+    acc += kernel[tap] * p[tap];
+  }
+  return acc;
+}
+
+// Bicubic interpolation of a single value from a 2D array of doubles.
+//
+// `arr` points at the sample whose row and column correspond to the second
+// tap of each kernel; the 4x4 neighbourhood covering rows and columns
+// -1 .. 2 relative to `arr` is read. h_kernel is applied along each row
+// first, then v_kernel is applied down the four row results.
+static INLINE double bicubic_interp_one(const double *arr, int stride,
+                                        const double h_kernel[4],
+                                        const double v_kernel[4]) {
+  // One horizontally filtered value for each of the rows -1, 0, 1, 2
+  double row_values[4];
+  for (int row = 0; row < 4; ++row) {
+    const double *row_start = &arr[(row - 1) * stride - 1];
+    row_values[row] = get_cubic_value_dbl(row_start, h_kernel);
+  }
+
+  // Combine the four row results vertically
+  return get_cubic_value_dbl(row_values, v_kernel);
+}
+
+// Convert a dense flow field into a list of point correspondences, one per
+// usable corner.
+//
+// For each corner in `corners`, the flow field is sampled at the corner
+// position with bicubic interpolation, the sampled vector is refined once
+// more with aom_compute_flow_at_point() on level 0 of the pyramids, and the
+// pair (corner, corner + flow) is appended to `correspondences`.
+// Corners too close to the edge of the flow field are skipped.
+//
+// `correspondences` receives at most corners->num_corners entries.
+// Returns the number of entries written.
+static int determine_disflow_correspondence(const ImagePyramid *src_pyr,
+ const ImagePyramid *ref_pyr,
+ CornerList *corners,
+ const FlowField *flow,
+ Correspondence *correspondences) {
+ const int width = flow->width;
+ const int height = flow->height;
+ const int stride = flow->stride;
+
+ int num_correspondences = 0;
+ for (int i = 0; i < corners->num_corners; ++i) {
+ const int x0 = corners->corners[2 * i];
+ const int y0 = corners->corners[2 * i + 1];
+
+ // Offset points, to compensate for the fact that (say) a flow field entry
+ // at horizontal index i, is nominally associated with the pixel at
+ // horizontal coordinate (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET
+ // This offset must be applied before we split the coordinate into integer
+ // and fractional parts, in order for the interpolation to be correct.
+ const int x = x0 - UPSAMPLE_CENTER_OFFSET;
+ const int y = y0 - UPSAMPLE_CENTER_OFFSET;
+
+ // Split the pixel coordinates into integer flow field coordinates and
+ // an offset for interpolation
+ const int flow_x = x >> DOWNSAMPLE_SHIFT;
+ const double flow_sub_x =
+ (x & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR;
+ const int flow_y = y >> DOWNSAMPLE_SHIFT;
+ const double flow_sub_y =
+ (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR;
+
+ // Exclude points which would sample from the outer border of the flow
+ // field, as this would give lower-quality results.
+ //
+ // Note: As we never read from the border region at pyramid level 0, we
+ // can skip filling it in. If the conditions here are removed, or any
+ // other logic is added which reads from this border region, then
+ // compute_flow_field() will need to be modified to call
+ // fill_flow_field_borders() at pyramid level 0 to set up the correct
+ // border data.
+ if (flow_x < 1 || (flow_x + 2) >= width) continue;
+ if (flow_y < 1 || (flow_y + 2) >= height) continue;
+
+ double h_kernel[4];
+ double v_kernel[4];
+ get_cubic_kernel_dbl(flow_sub_x, h_kernel);
+ get_cubic_kernel_dbl(flow_sub_y, v_kernel);
+
+ double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x],
+ stride, h_kernel, v_kernel);
+ double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x],
+ stride, h_kernel, v_kernel);
+
+ // Refine the interpolated flow vector one last time
+ // The patch is positioned so that the corner sits DISFLOW_PATCH_CENTER
+ // pixels right of and below the patch's top-left pixel.
+ const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER;
+ const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER;
+ aom_compute_flow_at_point(
+ src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x,
+ patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height,
+ src_pyr->layers[0].stride, &flow_u, &flow_v);
+
+ // Use original points (without offsets) when filling in correspondence
+ // array
+ correspondences[num_correspondences].x = x0;
+ correspondences[num_correspondences].y = y0;
+ correspondences[num_correspondences].rx = x0 + flow_u;
+ correspondences[num_correspondences].ry = y0 + flow_v;
+ num_correspondences++;
+ }
+ return num_correspondences;
+}
+
+// Compute the right-hand side `b` of the flow equations for one patch.
+//
+// The DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE patch of `src` whose top-left
+// pixel is (x, y) is compared against the patch of `ref` rooted at
+// (x + u, y + v), which is sampled using separable cubic interpolation.
+// For each pixel, dt = (interpolated ref value) - (src value << 3), and the
+// two outputs are
+//   b[0] = sum(dx * dt)
+//   b[1] = sum(dy * dt)
+// where dx and dy are the patch gradients, stored with a row stride of
+// DISFLOW_PATCH_SIZE. Both src and ref are indexed with the same `stride`.
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
+ memset(b, 0, 2 * sizeof(*b));
+
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int h_kernel[4];
+ int v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ // Storage for intermediate values between the two convolution directions
+ // Holds rows -1 .. DISFLOW_PATCH_SIZE + 1 of horizontally filtered data;
+ // `tmp` points at row 0 so that row -1 can be addressed directly.
+ int tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+ int *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution
+ for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) {
+ const int y_w = y0 + i;
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int x_w = x0 + j;
+ int arr[4];
+
+ arr[0] = (int)ref[y_w * stride + (x_w - 1)];
+ arr[1] = (int)ref[y_w * stride + (x_w + 0)];
+ arr[2] = (int)ref[y_w * stride + (x_w + 1)];
+ arr[3] = (int)ref[y_w * stride + (x_w + 2)];
+
+ // Apply kernel and round, keeping 6 extra bits of precision.
+ //
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+ tmp[i * DISFLOW_PATCH_SIZE + j] = ROUND_POWER_OF_TWO(
+ get_cubic_value_int(arr, h_kernel), DISFLOW_INTERP_BITS - 6);
+ }
+ }
+
+ // Vertical convolution
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
+ const int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
+ p[2 * DISFLOW_PATCH_SIZE] };
+ const int result = get_cubic_value_int(arr, v_kernel);
+
+ // Apply kernel and round.
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
+ const int warped = ROUND_POWER_OF_TWO(result, round_bits);
+ // NOTE(review): the literal 3 presumably equals DISFLOW_DERIV_SCALE_LOG2,
+ // so that src_px has the same scale as `warped` - confirm.
+ const int src_px = src[(x + j) + (y + i) * stride] << 3;
+ const int dt = warped - src_px;
+ b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt;
+ b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt;
+ }
+ }
+}
+
+// Apply a separable 3x3 Sobel filter to the DISFLOW_PATCH_SIZE x
+// DISFLOW_PATCH_SIZE patch whose top-left pixel is src[0], writing one
+// int16_t value per pixel to dst (row stride dst_stride).
+//
+// dir != 0 applies the difference kernel horizontally and the smoothing
+// kernel vertically (x gradient); dir == 0 swaps the two (y gradient).
+//
+// Pixels one row and one column outside the patch on every side are read,
+// so src must have at least a 1-pixel readable border around the patch.
+static INLINE void sobel_filter(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride, int dir) {
+ // Horizontally filtered rows -1 .. DISFLOW_PATCH_SIZE; `tmp` points at
+ // row 0 so that row -1 can be addressed directly.
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+
+ // Sobel filter kernel
+ // This must have an overall scale factor equal to DISFLOW_DERIV_SCALE,
+ // in order to produce correctly scaled outputs.
+ // To work out the scale factor, we multiply two factors:
+ //
+ // * For the derivative filter (sobel_a), comparing our filter
+ // image[x - 1] - image[x + 1]
+ // to the standard form
+ // d/dx image[x] = image[x+1] - image[x]
+ // tells us that we're actually calculating -2 * d/dx image[x]
+ //
+ // * For the smoothing filter (sobel_b), all coefficients are positive
+ // so the scale factor is just the sum of the coefficients
+ //
+ // Thus we need to make sure that DISFLOW_DERIV_SCALE = 2 * sum(sobel_b)
+ // (and take care of the - sign from sobel_a elsewhere)
+ static const int16_t sobel_a[3] = { 1, 0, -1 };
+ static const int16_t sobel_b[3] = { 1, 2, 1 };
+ const int taps = 3;
+
+ // horizontal filter
+ const int16_t *h_kernel = dir ? sobel_a : sobel_b;
+
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src[y * src_stride + (x + k - 1)];
+ }
+ tmp[y * DISFLOW_PATCH_SIZE + x] = sum;
+ }
+ }
+
+ // vertical filter
+ const int16_t *v_kernel = dir ? sobel_b : sobel_a;
+
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ dst[y * dst_stride + x] = sum;
+ }
+ }
+}
+
+// Build the 2x2 matrix M of the least-squares system which is solved to
+// update a flow vector.
+//
+// Derivation: for each pixel of the patch we know the current error `dt` and
+// the gradients `dx`, `dy` of the source patch. To first order, changing the
+// flow vector by (u, v) changes the squared error of that pixel to
+//
+//   (dt + u * dx + v * dy)^2
+//
+// Minimizing the sum of this quantity over the patch is a least-squares
+// problem with one equation
+//
+//   u * dx + v * dy = -dt
+//
+// per pixel. Summing over the DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE window
+// (with the - sign absorbed elsewhere) gives the normal equations
+//
+//   M = |sum(dx * dx)  sum(dx * dy)|      b = |sum(dx * dt)|
+//       |sum(dx * dy)  sum(dy * dy)|          |sum(dy * dt)|
+//
+// This function computes M only, stored row by row in M[0..3]. The sums are
+// accumulated in int and converted to double at the end.
+//
+// Regularization: 1 is added to both diagonal entries (i.e. `k * I` with
+// k = 1) before returning, which guarantees that M is invertible. k = 1 is
+// small compared to typical entries (1e5 to 1e6, up to around 6e7 at the
+// time of writing) and keeps every entry a whole number, which is convenient
+// for integerized SIMD implementations.
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+                                       const int16_t *dy, int dy_stride,
+                                       double *M) {
+  int sum_xx = 0;
+  int sum_xy = 0;
+  int sum_yy = 0;
+
+  for (int row = 0; row < DISFLOW_PATCH_SIZE; row++) {
+    for (int col = 0; col < DISFLOW_PATCH_SIZE; col++) {
+      const int gx = dx[row * dx_stride + col];
+      const int gy = dy[row * dy_stride + col];
+      sum_xx += gx * gx;
+      sum_xy += gx * gy;
+      sum_yy += gy * gy;
+    }
+  }
+
+  // Regularize the diagonal (see above)
+  sum_xx += 1;
+  sum_yy += 1;
+
+  // M is symmetric, so the off-diagonal sum is computed once and stored twice
+  M[0] = (double)sum_xx;
+  M[1] = (double)sum_xy;
+  M[2] = (double)sum_xy;
+  M[3] = (double)sum_yy;
+}
+
+// Try to invert the matrix M
+// Note: Due to the nature of how a least-squares matrix is constructed, all of
+// the eigenvalues will be >= 0, and therefore det M >= 0 as well.
+// The regularization term `+ k * I` further ensures that det M >= k^2.
+// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1.
+// So we don't have to worry about non-invertible matrices here.
+//
+// M and M_inv are 2x2 matrices stored row by row in four doubles:
+//   | M[0] M[1] |
+//   | M[2] M[3] |
+// The det >= 1 precondition is only checked by assert.
+static INLINE void invert_2x2(const double *M, double *M_inv) {
+ double det = (M[0] * M[3]) - (M[1] * M[2]);
+ assert(det >= 1);
+ const double det_inv = 1 / det;
+
+ // Standard 2x2 inverse: swap the diagonal, negate the off-diagonal,
+ // and divide everything by the determinant
+ M_inv[0] = M[3] * det_inv;
+ M_inv[1] = -M[1] * det_inv;
+ M_inv[2] = -M[2] * det_inv;
+ M_inv[3] = M[0] * det_inv;
+}
+
+// Iteratively refine the flow vector (*u, *v) for the DISFLOW_PATCH_SIZE x
+// DISFLOW_PATCH_SIZE patch of `src` whose top-left pixel is (x, y).
+//
+// On entry (*u, *v) holds the initial estimate; on return it holds the
+// refined estimate. `src` and `ref` are indexed with the same `stride`, and
+// width/height are passed on to compute_flow_vector() for coordinate
+// clamping.
+//
+// The gradients and the (inverted) flow matrix depend only on the source
+// patch, so they are computed once; only the vector b is recomputed on each
+// of the (at most) DISFLOW_MAX_ITR iterations.
+void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x,
+ int y, int width, int height, int stride,
+ double *u, double *v) {
+ double M[4];
+ double M_inv[4];
+ int b[2];
+ int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+ // Compute gradients within this patch
+ const uint8_t *src_patch = &src[y * stride + x];
+ sobel_filter(src_patch, stride, dx, DISFLOW_PATCH_SIZE, 1);
+ sobel_filter(src_patch, stride, dy, DISFLOW_PATCH_SIZE, 0);
+
+ compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M);
+ invert_2x2(M, M_inv);
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+ b);
+
+ // Solve flow equations to find a better estimate for the flow vector
+ // at this point
+ const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+ const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+ // Each update is scaled by DISFLOW_STEP_SIZE and limited to [-2, 2]
+ *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+ *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+ // Note: the convergence test uses the unscaled, unclamped step
+ if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+ // Stop iteration when we're close to convergence
+ break;
+ }
+ }
+}
+
+// Replicate the computed interior of one flow field component into its
+// borders.
+//
+// The caller is expected to have filled in the rectangle with column indices
+// [FLOW_BORDER_INNER, width - FLOW_BORDER_INNER - 1] and row indices
+// [FLOW_BORDER_INNER, height - FLOW_BORDER_INNER - 1]. Everything outside
+// that rectangle, out to FLOW_BORDER_OUTER entries beyond the nominal
+// width x height area on each side, is overwritten with the nearest computed
+// value: first sideways within each computed row, then whole rows upwards
+// and downwards.
+static void fill_flow_field_borders(double *flow, int width, int height,
+                                    int stride) {
+  // Inclusive bounds of the already-computed rectangle
+  const int first_col = FLOW_BORDER_INNER;
+  const int last_col = (width - FLOW_BORDER_INNER - 1);
+  const int first_row = FLOW_BORDER_INNER;
+  const int last_row = (height - FLOW_BORDER_INNER - 1);
+
+  // Size of one full row, including the outer padding on both sides
+  const size_t padded_width = width + 2 * FLOW_BORDER_OUTER;
+  const size_t padded_bytes = padded_width * sizeof(*flow);
+
+  // Left: extend the first computed value of each row leftwards
+  for (int r = first_row; r <= last_row; r++) {
+    double *cur = flow + r * stride;
+    const double edge = cur[first_col];
+    for (int c = -FLOW_BORDER_OUTER; c < first_col; c++) {
+      cur[c] = edge;
+    }
+  }
+
+  // Right: extend the last computed value of each row rightwards
+  for (int r = first_row; r <= last_row; r++) {
+    double *cur = flow + r * stride;
+    const double edge = cur[last_col];
+    for (int c = last_col + 1; c < width + FLOW_BORDER_OUTER; c++) {
+      cur[c] = edge;
+    }
+  }
+
+  // Top: copy the first computed row (padding included) into every row
+  // above it
+  const double *top_src = flow + first_row * stride - FLOW_BORDER_OUTER;
+  for (int r = -FLOW_BORDER_OUTER; r < first_row; r++) {
+    double *dst = flow + r * stride - FLOW_BORDER_OUTER;
+    memcpy(dst, top_src, padded_bytes);
+  }
+
+  // Bottom: copy the last computed row (padding included) into every row
+  // below it
+  const double *bottom_src = flow + last_row * stride - FLOW_BORDER_OUTER;
+  for (int r = last_row + 1; r < height + FLOW_BORDER_OUTER; r++) {
+    double *dst = flow + r * stride - FLOW_BORDER_OUTER;
+    memcpy(dst, bottom_src, padded_bytes);
+  }
+}
+
+// Upscale one component of the flow field, from a size of
+// cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing
+// the result back into the same buffer. This function also scales the flow
+// vector by 2, so that when we move to the next pyramid level down, the implied
+// motion vector is the same.
+//
+// The temporary buffer tmpbuf must be large enough to hold an intermediate
+// array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and
+// below. In other words, indices from -FLOW_BORDER_OUTER * stride to
+// (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid.
+//
+// Note that the same stride is used for u before and after upscaling
+// and for the temporary buffer, for simplicity.
+//
+// The horizontal pass reads `flow` at columns j - 2 .. j + 2 for every
+// column j in [0, cur_width), so the left and right borders of `flow` must
+// already hold valid data when this function is called.
+//
+// A note on phasing:
+//
+// The flow fields at two adjacent pyramid levels are offset from each other,
+// and we need to account for this in the construction of the interpolation
+// kernels.
+//
+// Consider an 8x8 pixel patch at pyramid level n. This is split into four
+// patches at pyramid level n-1. Bringing these patches back up to pyramid level
+// n, each sub-patch covers 4x4 pixels, and between them they cover the same
+// 8x8 region.
+//
+// Therefore, at pyramid level n, two adjacent patches look like this:
+//
+// + - - - - - - - + - - - - - - - +
+// | | |
+// | x x | x x |
+// | | |
+// | # | # |
+// | | |
+// | x x | x x |
+// | | |
+// + - - - - - - - + - - - - - - - +
+//
+// where # marks the center of a patch at pyramid level n (the input to this
+// function), and x marks the center of a patch at pyramid level n-1 (the output
+// of this function).
+//
+// By counting pixels (marked by +, -, and |), we can see that the flow vectors
+// at pyramid level n-1 are offset relative to the flow vectors at pyramid
+// level n, by 1/4 of the larger (input) patch size. Therefore, our
+// interpolation kernels need to have phases of 0.25 and 0.75.
+//
+// In addition, in order to handle the frame edges correctly, we need to
+// generate one output vector to the left and one to the right of each input
+// vector, even though these must be interpolated using different source points.
+static void upscale_flow_component(double *flow, int cur_width, int cur_height,
+ int stride, double *tmpbuf) {
+ const int half_len = FLOW_UPSCALE_TAPS / 2;
+
+ // Check that the outer border is large enough to avoid needing to clamp
+ // the source locations
+ assert(half_len <= FLOW_BORDER_OUTER);
+
+ // Horizontal upscale and multiply by 2
+ for (int i = 0; i < cur_height; i++) {
+ for (int j = 0; j < cur_width; j++) {
+ // Left output pixel is 0.25 units to the left of the input pixel
+ double left = 0;
+ for (int k = -half_len; k < half_len; k++) {
+ left +=
+ flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len];
+ }
+ tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left;
+
+ // Right output pixel is 0.25 units to the right of the input pixel
+ double right = 0;
+ for (int k = -(half_len - 1); k < (half_len + 1); k++) {
+ right += flow[i * stride + (j + k)] *
+ flow_upscale_filter[1][k + (half_len - 1)];
+ }
+ tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right;
+ }
+ }
+
+ // Fill in top and bottom borders of tmpbuf
+ // (by replicating the first and last rows, so that the vertical pass below
+ // can read rows -FLOW_BORDER_OUTER .. cur_height + FLOW_BORDER_OUTER - 1)
+ const double *top_row = &tmpbuf[0];
+ for (int i = -FLOW_BORDER_OUTER; i < 0; i++) {
+ double *row = &tmpbuf[i * stride];
+ memcpy(row, top_row, 2 * cur_width * sizeof(*row));
+ }
+
+ const double *bottom_row = &tmpbuf[(cur_height - 1) * stride];
+ for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) {
+ double *row = &tmpbuf[i * stride];
+ memcpy(row, bottom_row, 2 * cur_width * sizeof(*row));
+ }
+
+ // Vertical upscale
+ // (no further multiplication by 2 here: the values in tmpbuf were already
+ // doubled by the horizontal pass)
+ int upscaled_width = cur_width * 2;
+ for (int i = 0; i < cur_height; i++) {
+ for (int j = 0; j < upscaled_width; j++) {
+ double top = 0;
+ for (int k = -half_len; k < half_len; k++) {
+ top +=
+ tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len];
+ }
+ flow[(2 * i) * stride + j] = top;
+
+ double bottom = 0;
+ for (int k = -(half_len - 1); k < (half_len + 1); k++) {
+ bottom += tmpbuf[(i + k) * stride + j] *
+ flow_upscale_filter[1][k + (half_len - 1)];
+ }
+ flow[(2 * i + 1) * stride + j] = bottom;
+ }
+ }
+}
+
+// Compute a coarse-to-fine flow field between two image pyramids, writing the
+// result into flow->u and flow->v.
+//
+// The caller must make sure that flow->u and flow->v start out zero-filled
+// (alloc_flow_field() allocates them with aom_calloc()).
+//
+// Returns true on success, or false if the temporary upscaling buffer could
+// not be allocated.
+static bool compute_flow_field(const ImagePyramid *src_pyr,
+                               const ImagePyramid *ref_pyr, FlowField *flow) {
+  assert(src_pyr->n_levels == ref_pyr->n_levels);
+
+  double *flow_u = flow->u;
+  double *flow_v = flow->v;
+
+  // Scratch buffer for upscale_flow_component(). It is only needed when there
+  // are at least two pyramid levels, as otherwise the loop below never runs.
+  double *tmpbuf0 = NULL;
+  double *tmpbuf = NULL;
+
+  if (src_pyr->n_levels >= 2) {
+    // This line must match the calculation of cur_flow_height below
+    const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT;
+
+    // Widen to size_t before multiplying, as alloc_flow_field() does
+    const size_t tmpbuf_size =
+        (size_t)(layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride;
+    tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0));
+    if (!tmpbuf0) {
+      // Nothing else has been allocated yet, so there is nothing to clean up
+      return false;
+    }
+    // Skip the top border rows, so that negative row indices are valid
+    tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride;
+  }
+
+  // Compute flow field from coarsest to finest level of the pyramid
+  //
+  // Note: We stop after refining pyramid level 1 and interpolating it to
+  // generate an initial flow field at level 0. We do *not* refine the dense
+  // flow field at level 0. Instead, we wait until we have generated
+  // correspondences by interpolating this flow field, and then refine the
+  // correspondences themselves. This is both faster and gives better output
+  // compared to refining the flow field at level 0 and then interpolating.
+  for (int level = src_pyr->n_levels - 1; level >= 1; --level) {
+    const PyramidLayer *cur_layer = &src_pyr->layers[level];
+    const int cur_width = cur_layer->width;
+    const int cur_height = cur_layer->height;
+    const int cur_stride = cur_layer->stride;
+
+    const uint8_t *src_buffer = cur_layer->buffer;
+    const uint8_t *ref_buffer = ref_pyr->layers[level].buffer;
+
+    const int cur_flow_width = cur_width >> DOWNSAMPLE_SHIFT;
+    const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT;
+    const int cur_flow_stride = flow->stride;
+
+    for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER;
+         i += 1) {
+      for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER;
+           j += 1) {
+        const int flow_field_idx = i * cur_flow_stride + j;
+
+        // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels,
+        // which is centered on the region covered by this flow field entry
+        const int patch_center_x =
+            (j << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET;  // In pixels
+        const int patch_center_y =
+            (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET;  // In pixels
+        const int patch_tl_x = patch_center_x - DISFLOW_PATCH_CENTER;
+        const int patch_tl_y = patch_center_y - DISFLOW_PATCH_CENTER;
+        assert(patch_tl_x >= 0);
+        assert(patch_tl_y >= 0);
+
+        aom_compute_flow_at_point(src_buffer, ref_buffer, patch_tl_x,
+                                  patch_tl_y, cur_width, cur_height, cur_stride,
+                                  &flow_u[flow_field_idx],
+                                  &flow_v[flow_field_idx]);
+      }
+    }
+
+    // Fill in the areas which we haven't explicitly computed, with copies
+    // of the outermost values which we did compute
+    fill_flow_field_borders(flow_u, cur_flow_width, cur_flow_height,
+                            cur_flow_stride);
+    fill_flow_field_borders(flow_v, cur_flow_width, cur_flow_height,
+                            cur_flow_stride);
+
+    // Upscale to the next finer level. The loop only visits levels >= 1, so
+    // there is always a finer level (level - 1) to upscale to.
+    const int upscale_flow_width = cur_flow_width << 1;
+    const int upscale_flow_height = cur_flow_height << 1;
+    const int upscale_stride = flow->stride;
+
+    upscale_flow_component(flow_u, cur_flow_width, cur_flow_height,
+                           cur_flow_stride, tmpbuf);
+    upscale_flow_component(flow_v, cur_flow_width, cur_flow_height,
+                           cur_flow_stride, tmpbuf);
+
+    // If we didn't fill in the rightmost column or bottommost row during
+    // upsampling (in order to keep the ratio to exactly 2), fill them
+    // in here by copying the next closest column/row
+    const PyramidLayer *next_layer = &src_pyr->layers[level - 1];
+    const int next_flow_width = next_layer->width >> DOWNSAMPLE_SHIFT;
+    const int next_flow_height = next_layer->height >> DOWNSAMPLE_SHIFT;
+
+    // Rightmost column
+    if (next_flow_width > upscale_flow_width) {
+      assert(next_flow_width == upscale_flow_width + 1);
+      for (int i = 0; i < upscale_flow_height; i++) {
+        const int index = i * upscale_stride + upscale_flow_width;
+        flow_u[index] = flow_u[index - 1];
+        flow_v[index] = flow_v[index - 1];
+      }
+    }
+
+    // Bottommost row
+    if (next_flow_height > upscale_flow_height) {
+      assert(next_flow_height == upscale_flow_height + 1);
+      for (int j = 0; j < next_flow_width; j++) {
+        const int index = upscale_flow_height * upscale_stride + j;
+        flow_u[index] = flow_u[index - upscale_stride];
+        flow_v[index] = flow_v[index - upscale_stride];
+      }
+    }
+  }
+
+  // tmpbuf0 is NULL here when the pyramid has fewer than two levels; the
+  // original code already relied on aom_free() accepting NULL in that case
+  aom_free(tmpbuf0);
+  return true;
+}
+
+// Allocate a FlowField sized for a frame of the given dimensions.
+//
+// The u and v planes share one zero-initialized allocation, and each plane
+// has FLOW_BORDER_OUTER extra entries on every side.
+// Returns NULL if either allocation fails.
+static FlowField *alloc_flow_field(int frame_width, int frame_height) {
+  FlowField *field = (FlowField *)aom_malloc(sizeof(FlowField));
+  if (!field) return NULL;
+
+  // Size of the bottom (largest) layer of the flow pyramid
+  field->width = frame_width >> DOWNSAMPLE_SHIFT;
+  field->height = frame_height >> DOWNSAMPLE_SHIFT;
+  field->stride = field->width + 2 * FLOW_BORDER_OUTER;
+
+  // Number of entries in one plane, including the borders
+  const size_t plane_size =
+      field->stride * (size_t)(field->height + 2 * FLOW_BORDER_OUTER);
+
+  field->buf0 = aom_calloc(2 * plane_size, sizeof(*field->buf0));
+  if (field->buf0 == NULL) {
+    aom_free(field);
+    return NULL;
+  }
+
+  // Point u at the first non-border entry of the first plane; v sits at the
+  // same position within the second plane
+  double *const origin =
+      field->buf0 + FLOW_BORDER_OUTER * field->stride + FLOW_BORDER_OUTER;
+  field->u = origin;
+  field->v = origin + plane_size;
+
+  return field;
+}
+
+// Release a FlowField created by alloc_flow_field(), including its shared
+// u/v buffer. Passing NULL is a no-op, matching free()-style semantics.
+static void free_flow_field(FlowField *flow) {
+  if (flow == NULL) return;
+  aom_free(flow->buf0);
+  aom_free(flow);
+}
+
+// Compute flow field between `src` and `ref`, and then use that flow to
+// compute a global motion model relating the two frames.
+//
+// Following the convention in flow_estimation.h, the flow vectors are computed
+// at fixed points in `src` and point to the corresponding locations in `ref`,
+// regardless of the temporal ordering of the frames.
+bool av1_compute_global_motion_disflow(TransformationType type,
+                                       YV12_BUFFER_CONFIG *src,
+                                       YV12_BUFFER_CONFIG *ref, int bit_depth,
+                                       MotionModel *motion_models,
+                                       int num_motion_models,
+                                       bool *mem_alloc_failed) {
+  ImagePyramid *const src_pyramid = src->y_pyramid;
+  CornerList *const src_corners = src->corners;
+  ImagePyramid *const ref_pyramid = ref->y_pyramid;
+
+  // Precompute the per-frame information we need: source pyramid, source
+  // corners, then reference pyramid. Evaluation stops at the first failure,
+  // and any failure here is reported as an allocation failure.
+  if (!aom_compute_pyramid(src, bit_depth, src_pyramid) ||
+      !av1_compute_corner_list(src_pyramid, src_corners) ||
+      !aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+
+  const int frame_width = src_pyramid->layers[0].width;
+  const int frame_height = src_pyramid->layers[0].height;
+  assert(ref_pyramid->layers[0].width == frame_width);
+  assert(ref_pyramid->layers[0].height == frame_height);
+
+  FlowField *flow = alloc_flow_field(frame_width, frame_height);
+  if (flow == NULL) {
+    *mem_alloc_failed = true;
+    return false;
+  }
+
+  bool result = false;
+  Correspondence *correspondences = NULL;
+
+  // Only allocate the correspondence array once the flow field is available
+  if (compute_flow_field(src_pyramid, ref_pyramid, flow)) {
+    correspondences =
+        aom_malloc(src_corners->num_corners * sizeof(*correspondences));
+  }
+
+  if (correspondences != NULL) {
+    // Find correspondences between the two images using the flow field,
+    // then fit the requested motion models to them
+    const int num_correspondences = determine_disflow_correspondence(
+        src_pyramid, ref_pyramid, src_corners, flow, correspondences);
+
+    result = ransac(correspondences, num_correspondences, type, motion_models,
+                    num_motion_models, mem_alloc_failed);
+  } else {
+    // Either the flow computation or the allocation above failed
+    *mem_alloc_failed = true;
+  }
+
+  aom_free(correspondences);
+  free_flow_field(flow);
+  return result;
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.h b/third_party/aom/aom_dsp/flow_estimation/disflow.h
new file mode 100644
index 0000000000..ef877b638c
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/disflow.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+
+#include <stdbool.h>
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/rect.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of pyramid levels in disflow computation
+#define DISFLOW_PYRAMID_LEVELS 12
+
+// Size of square patches in the disflow dense grid
+// Must be a power of 2
+#define DISFLOW_PATCH_SIZE_LOG2 3
+#define DISFLOW_PATCH_SIZE (1 << DISFLOW_PATCH_SIZE_LOG2)
+// Center point of square patch
+#define DISFLOW_PATCH_CENTER ((DISFLOW_PATCH_SIZE / 2) - 1)
+
+// Overall scale of the `dx`, `dy` and `dt` arrays in the disflow code
+// In other words, the various derivatives are calculated with an internal
+// precision of (8 + DISFLOW_DERIV_SCALE_LOG2) bits, from an 8-bit input.
+//
+// This must be carefully synchronized with the code in sobel_filter()
+// (which fills the dx and dy arrays) and compute_flow_error() (which
+// fills dt); see the comments in those functions for more details
+#define DISFLOW_DERIV_SCALE_LOG2 3
+#define DISFLOW_DERIV_SCALE (1 << DISFLOW_DERIV_SCALE_LOG2)
+
+// Scale factor applied to each step in the main refinement loop
+//
+// This should be <= 1.0 to avoid overshoot. Values below 1.0
+// may help in some cases, but slow convergence overall, so
+// will require careful tuning.
+// TODO(rachelbarker): Tune this value
+#define DISFLOW_STEP_SIZE 1.0
+
+// Step size at which we should terminate iteration
+// The idea here is that, if we take a step which is much smaller than 1px in
+// size, then the values won't change much from iteration to iteration, so
+// many future steps will also be small, and that won't have much effect
+// on the ultimate result. So we can terminate early.
+//
+// To look at it another way, when we take a small step, that means that
+// either we're near to convergence (so can stop), or we're stuck in a
+// shallow valley and will take many iterations to get unstuck.
+//
+// Solving the latter properly requires fancier methods, such as "gradient
+// descent with momentum". For now, we terminate to avoid wasting a ton of
+// time on points which are either nearly-converged or stuck.
+//
+// Terminating at 1/8 px seems to give good results for global motion estimation
+#define DISFLOW_STEP_SIZE_THRESOLD (1. / 8.)
+
+// Max number of iterations if warp convergence is not found
+#define DISFLOW_MAX_ITR 4
+
+// Internal precision of cubic interpolation filters
+// The limiting factor here is that:
+// * Before integerizing, the maximum value of any kernel tap is 1.0
+// * After integerizing, each tap must fit into an int16_t.
+// Thus the largest multiplier we can get away with is 2^14 = 16384,
+// as 2^15 = 32768 is too large to fit in an int16_t.
+#define DISFLOW_INTERP_BITS 14
+
+typedef struct {
+ // Start of allocation for u and v buffers
+ double *buf0;
+
+ // x and y directions of flow, per patch
+ double *u;
+ double *v;
+
+ // Sizes of the above arrays
+ int width;
+ int height;
+ int stride;
+} FlowField;
+
+bool av1_compute_global_motion_disflow(TransformationType type,
+ YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ MotionModel *motion_models,
+ int num_motion_models,
+ bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c
new file mode 100644
index 0000000000..0f47f86f55
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+// For each global motion method, how many pyramid levels should we allocate?
+// Note that this is a maximum, and fewer levels will be allocated if the frame
+// is not large enough to need all of the specified levels
+const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = {
+ 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH
+ 16, // GLOBAL_MOTION_METHOD_DISFLOW
+};
+
+// clang-format off
+const double kIdentityParams[MAX_PARAMDIM] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0
+};
+// clang-format on
+
+// Compute a global motion model between the given source and ref frames.
+//
+// As is standard for video codecs, the resulting model maps from (x, y)
+// coordinates in `src` to the corresponding points in `ref`, regardless
+// of the temporal order of the two frames.
+//
+// Returns true if global motion estimation succeeded, false if not.
+// The output models should only be used if this function succeeds.
+bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *ref, int bit_depth,
+                               GlobalMotionMethod gm_method,
+                               MotionModel *motion_models,
+                               int num_motion_models, bool *mem_alloc_failed) {
+  // Dispatch to the implementation of the requested estimation method
+  if (gm_method == GLOBAL_MOTION_METHOD_FEATURE_MATCH) {
+    return av1_compute_global_motion_feature_match(
+        type, src, ref, bit_depth, motion_models, num_motion_models,
+        mem_alloc_failed);
+  }
+
+  if (gm_method == GLOBAL_MOTION_METHOD_DISFLOW) {
+    return av1_compute_global_motion_disflow(type, src, ref, bit_depth,
+                                             motion_models, num_motion_models,
+                                             mem_alloc_failed);
+  }
+
+  // Any other value is a caller error; report failure in release builds
+  assert(0 && "Unknown global motion estimation type");
+  return false;
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h
new file mode 100644
index 0000000000..2dfae24980
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_H_
+
+#include "aom_dsp/pyramid.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_PARAMDIM 6
+#define MIN_INLIER_PROB 0.1
+
+/* clang-format off */
+enum {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ TRANS_TYPES,
+} UENUM1BYTE(TransformationType);
+/* clang-format on */
+
+// Number of parameters used by each transformation type, indexed by
+// TransformationType
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+
+// Available methods which can be used for global motion estimation
+typedef enum {
+ GLOBAL_MOTION_METHOD_FEATURE_MATCH,
+ GLOBAL_MOTION_METHOD_DISFLOW,
+ GLOBAL_MOTION_METHOD_LAST = GLOBAL_MOTION_METHOD_DISFLOW,
+ GLOBAL_MOTION_METHODS
+} GlobalMotionMethod;
+
+typedef struct {
+ double params[MAX_PARAMDIM];
+ int *inliers;
+ int num_inliers;
+} MotionModel;
+
+// Data structure to store a single correspondence point during global
+// motion search.
+//
+// A correspondence (x, y) -> (rx, ry) means that point (x, y) in the
+// source frame corresponds to point (rx, ry) in the ref frame.
+typedef struct {
+ double x, y;
+ double rx, ry;
+} Correspondence;
+
+// For each global motion method, how many pyramid levels should we allocate?
+// Note that this is a maximum, and fewer levels will be allocated if the frame
+// is not large enough to need all of the specified levels
+extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS];
+
+// Which global motion method should we use in practice?
+// Disflow is both faster and gives better results than feature matching in
+// practically all cases, so we use disflow by default
+static const GlobalMotionMethod default_global_motion_method =
+ GLOBAL_MOTION_METHOD_DISFLOW;
+
+extern const double kIdentityParams[MAX_PARAMDIM];
+
+// Compute a global motion model between the given source and ref frames.
+//
+// As is standard for video codecs, the resulting model maps from (x, y)
+// coordinates in `src` to the corresponding points in `ref`, regardless
+// of the temporal order of the two frames.
+//
+// Returns true if global motion estimation succeeded, false if not.
+// The output models should only be used if this function succeeds.
+bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ GlobalMotionMethod gm_method,
+ MotionModel *motion_models,
+ int num_motion_models, bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.c b/third_party/aom/aom_dsp/flow_estimation/ransac.c
new file mode 100644
index 0000000000..b88a07b023
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/ransac.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+// TODO(rachelbarker): Remove dependence on code in av1/encoder/
+#include "av1/encoder/random.h"
+
+#define MAX_MINPTS 4
+#define MINPTS_MULTIPLIER 5
+
+#define INLIER_THRESHOLD 1.25
+#define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD)
+#define NUM_TRIALS 20
+
+// Flag to enable functions for finding TRANSLATION type models.
+//
+// These modes are not considered currently due to a spec bug (see comments
+// in gm_get_motion_vector() in av1/common/mv.h). Thus we don't need to compile
+// the corresponding search functions, but it is nice to keep the source around
+// but disabled, for completeness.
+#define ALLOW_TRANSLATION_MODELS 0
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef bool (*IsDegenerateFunc)(double *p);
+typedef bool (*FindTransformationFunc)(int points, const double *points1,
+ const double *points2, double *params);
+typedef void (*ProjectPointsFunc)(const double *mat, const double *points,
+ double *proj, int n, int stride_points,
+ int stride_proj);
+
+// vtable-like structure which stores all of the information needed by RANSAC
+// for a particular model type
+typedef struct {
+ IsDegenerateFunc is_degenerate;
+ FindTransformationFunc find_transformation;
+ ProjectPointsFunc project_points;
+ int minpts;
+} RansacModelInfo;
+
+#if ALLOW_TRANSLATION_MODELS
+// Apply a pure translation (mat[0], mat[1]) to `n` points.
+// Consecutive input points are `stride_points` doubles apart, and consecutive
+// output points are `stride_proj` doubles apart.
+static void project_points_translation(const double *mat, const double *points,
+                                       double *proj, int n, int stride_points,
+                                       int stride_proj) {
+  for (int idx = 0; idx < n; ++idx) {
+    // Read both coordinates before writing either output
+    const double x = points[0];
+    const double y = points[1];
+    proj[0] = x + mat[0];
+    proj[1] = y + mat[1];
+    points += stride_points;
+    proj += stride_proj;
+  }
+}
+#endif  // ALLOW_TRANSLATION_MODELS
+
+// Apply the affine model `mat` to `n` points:
+//   x' = mat[2] * x + mat[3] * y + mat[0]
+//   y' = mat[4] * x + mat[5] * y + mat[1]
+// Consecutive input points are `stride_points` doubles apart, and consecutive
+// output points are `stride_proj` doubles apart.
+static void project_points_affine(const double *mat, const double *points,
+                                  double *proj, int n, int stride_points,
+                                  int stride_proj) {
+  for (int idx = 0; idx < n; ++idx) {
+    // Read both coordinates before writing either output
+    const double x = points[0];
+    const double y = points[1];
+    proj[0] = mat[2] * x + mat[3] * y + mat[0];
+    proj[1] = mat[4] * x + mat[5] * y + mat[1];
+    points += stride_points;
+    proj += stride_proj;
+  }
+}
+
+#if ALLOW_TRANSLATION_MODELS
+// Fit a translation model to `np` point pairs: the offset is the mean of
+// (pts2 - pts1), and the linear part is set to the identity.
+// Both arrays hold interleaved (x, y) coordinates. Always returns true.
+static bool find_translation(int np, const double *pts1, const double *pts2,
+                             double *params) {
+  double total_dx = 0;
+  double total_dy = 0;
+
+  for (int i = 0; i < np; ++i) {
+    const double dst_x = pts2[2 * i + 0];
+    const double dst_y = pts2[2 * i + 1];
+    const double src_x = pts1[2 * i + 0];
+    const double src_y = pts1[2 * i + 1];
+
+    total_dx += dst_x - src_x;
+    total_dy += dst_y - src_y;
+  }
+
+  // Mean offset
+  params[0] = total_dx / np;
+  params[1] = total_dy / np;
+  // Identity linear part
+  params[2] = 1;
+  params[3] = 0;
+  params[4] = 0;
+  params[5] = 1;
+  return true;
+}
+#endif  // ALLOW_TRANSLATION_MODELS
+
+// Least-squares fit of a 4-parameter rotation + zoom model to `np` point
+// pairs, where pts1 holds source points and pts2 the matching destination
+// points, both as interleaved (x, y) coordinates.
+//
+// On success, fills in all six entries of `params` and returns true.
+// Returns false if the least-squares system could not be solved.
+static bool find_rotzoom(int np, const double *pts1, const double *pts2,
+                         double *params) {
+  const int n = 4;        // Size of least-squares problem
+  double ata[4 * 4];      // Accumulator for A'A
+  double atb[4];          // Accumulator for A'b
+
+  least_squares_init(ata, atb, n);
+  for (int i = 0; i < np; ++i) {
+    const double dx = pts2[2 * i + 0];
+    const double dy = pts2[2 * i + 1];
+    const double sx = pts1[2 * i + 0];
+    const double sy = pts1[2 * i + 1];
+
+    // Equation for the x output: dx = p0 + p2 * sx + p3 * sy
+    double row_x[4] = { 1, 0, sx, sy };
+    least_squares_accumulate(ata, atb, row_x, dx, n);
+
+    // Equation for the y output: dy = p1 + p2 * sy - p3 * sx
+    double row_y[4] = { 0, 1, sy, -sx };
+    least_squares_accumulate(ata, atb, row_y, dy, n);
+  }
+
+  // Fill in params[0] .. params[3] with output model
+  const bool solved = least_squares_solve(ata, atb, params, n);
+  if (!solved) return false;
+
+  // The remaining two parameters are determined by the first four
+  params[4] = -params[3];
+  params[5] = params[2];
+
+  return true;
+}
+
+// Least-squares fit of a 6-parameter affine model to `np` point pairs, where
+// pts1 holds source points and pts2 the matching destination points, both as
+// interleaved (x, y) coordinates.
+//
+// On success, fills in `params` and returns true. Returns false if either
+// least-squares system could not be solved.
+static bool find_affine(int np, const double *pts1, const double *pts2,
+                        double *params) {
+  // Note: The least squares problem for affine models is 6-dimensional,
+  // but it splits into two independent 3-dimensional subproblems.
+  // Solving these two subproblems separately and recombining at the end
+  // results in less total computation than solving the 6-dimensional
+  // problem directly.
+  //
+  // Subproblem 0 covers the parameters which contribute to the x output of
+  // the model, and subproblem 1 those which contribute to the y output.
+
+  const int n = 3;         // Size of each least-squares problem
+  double ata[2][3 * 3];    // Accumulators for A'A
+  double atb[2][3];        // Accumulators for A'b
+  double sol[2][3];        // Output vectors
+
+  for (int s = 0; s < 2; ++s) {
+    least_squares_init(ata[s], atb[s], n);
+  }
+
+  for (int i = 0; i < np; ++i) {
+    const double dx = pts2[2 * i + 0];
+    const double dy = pts2[2 * i + 1];
+    const double sx = pts1[2 * i + 0];
+    const double sy = pts1[2 * i + 1];
+
+    // Both subproblems share the same row of A; only the target differs
+    double row[3] = { 1, sx, sy };
+    least_squares_accumulate(ata[0], atb[0], row, dx, n);
+    least_squares_accumulate(ata[1], atb[1], row, dy, n);
+  }
+
+  // Solve the x subproblem first, then the y subproblem
+  for (int s = 0; s < 2; ++s) {
+    if (!least_squares_solve(ata[s], atb[s], sol[s], n)) {
+      return false;
+    }
+  }
+
+  // Rearrange least squares result to form output model
+  params[0] = sol[0][0];
+  params[1] = sol[1][0];
+  params[2] = sol[0][1];
+  params[3] = sol[0][2];
+  params[4] = sol[1][1];
+  params[5] = sol[1][2];
+
+  return true;
+}
+
+// Bookkeeping for one candidate motion during the RANSAC search
+typedef struct {
+ // Number of valid entries in inlier_indices
+ int num_inliers;
+ double sse; // Sum of squared errors of inliers
+ // Indices, into the array of matched points, of the inliers of this motion
+ int *inlier_indices;
+} RANSAC_MOTION;
+
+// qsort()-compatible comparator for RANSAC_MOTION entries.
+//
+// A motion is better if it has more inliers; ties are broken in favour of
+// the lower sum of squared errors.
+// Returns -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+  // Keep the const qualifier when casting: the arguments are read-only
+  const RANSAC_MOTION *motion_a = (const RANSAC_MOTION *)arg_a;
+  const RANSAC_MOTION *motion_b = (const RANSAC_MOTION *)arg_b;
+
+  if (motion_a->num_inliers > motion_b->num_inliers) return -1;
+  if (motion_a->num_inliers < motion_b->num_inliers) return 1;
+  if (motion_a->sse < motion_b->sse) return -1;
+  if (motion_a->sse > motion_b->sse) return 1;
+  return 0;
+}
+
+// Returns true if `motion_a` ranks strictly ahead of `motion_b` under
+// compare_motions().
+static bool is_better_motion(const RANSAC_MOTION *motion_a,
+                             const RANSAC_MOTION *motion_b) {
+  const int order = compare_motions(motion_a, motion_b);
+  return order < 0;
+}
+
+// Gather `num_points` (x, y) pairs from `src` into `dest`.
+// The i'th output pair is the pair at position indices[i] in `src`; both
+// arrays hold interleaved coordinates.
+static void copy_points_at_indices(double *dest, const double *src,
+                                   const int *indices, int num_points) {
+  double *out = dest;
+  for (int i = 0; i < num_points; ++i, out += 2) {
+    const double *in = &src[indices[i] * 2];
+    out[0] = in[0];
+    out[1] = in[1];
+  }
+}
+
+// Core RANSAC search: fit up to `num_desired_motions` models of the kind
+// described by `model_info` to the `npoints` correspondences in
+// `matched_points`, and write the results to `motion_models`.
+//
+// Runs NUM_TRIALS trials. Each trial fits a model to `minpts` selected
+// correspondences and counts how many of all the correspondences it projects
+// to within INLIER_THRESHOLD of their matched position. The best candidates
+// are kept, and each kept model is then refitted using all of its inliers.
+//
+// Output slots with no usable model are set to the identity parameters with
+// num_inliers = 0. Each motion_models[i].inliers array is written with two
+// ints per inlier, so the caller must provide room for that.
+//
+// Returns true on success, false on error. If there are too few points, this
+// returns false without touching `motion_models` or `*mem_alloc_failed`.
+// If an allocation fails, this sets `*mem_alloc_failed` and returns false.
+static bool ransac_internal(const Correspondence *matched_points, int npoints,
+ MotionModel *motion_models, int num_desired_motions,
+ const RansacModelInfo *model_info,
+ bool *mem_alloc_failed) {
+ assert(npoints >= 0);
+ int i = 0;
+ int minpts = model_info->minpts;
+ bool ret_val = true;
+
+ // The seed depends only on the number of points
+ unsigned int seed = (unsigned int)npoints;
+
+ int indices[MAX_MINPTS] = { 0 };
+
+ // points1/points2: source/ref coordinates of the subset currently being
+ // fitted. corners1/corners2: source/ref coordinates of all correspondences.
+ double *points1, *points2;
+ double *corners1, *corners2;
+ double *projected_corners;
+
+ // Store information for the num_desired_motions best transformations found
+ // and the worst motion among them, as well as the motion currently under
+ // consideration.
+ RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+ RANSAC_MOTION current_motion;
+
+ // Store the parameters and the indices of the inlier points for the motion
+ // currently under consideration.
+ double params_this_motion[MAX_PARAMDIM];
+
+ // Too few points to search; note that this is not an allocation failure
+ if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+ return false;
+ }
+
+ int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts);
+
+ points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+ points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+ corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+ corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+ projected_corners =
+ (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2);
+ motions =
+ (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION));
+
+ // Allocate one large buffer which will be carved up to store the inlier
+ // indices for the current motion plus the num_desired_motions many
+ // output models
+ // This allows us to keep the allocation/deallocation logic simple, without
+ // having to (for example) check that `motions` is non-null before allocating
+ // the inlier arrays
+ int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints *
+ (num_desired_motions + 1));
+
+ if (!(points1 && points2 && corners1 && corners2 && projected_corners &&
+ motions && inlier_buffer)) {
+ ret_val = false;
+ *mem_alloc_failed = true;
+ goto finish_ransac;
+ }
+
+ // Once all our allocations are known-good, we can fill in our structures
+ worst_kept_motion = motions;
+
+ for (i = 0; i < num_desired_motions; ++i) {
+ motions[i].inlier_indices = inlier_buffer + i * npoints;
+ }
+ memset(&current_motion, 0, sizeof(current_motion));
+ current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints;
+
+ // Unpack the correspondences into flat, interleaved (x, y) arrays
+ for (i = 0; i < npoints; ++i) {
+ corners1[2 * i + 0] = matched_points[i].x;
+ corners1[2 * i + 1] = matched_points[i].y;
+ corners2[2 * i + 0] = matched_points[i].rx;
+ corners2[2 * i + 1] = matched_points[i].ry;
+ }
+
+ for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) {
+ // NOTE(review): presumably fills `indices` with minpts sample indices
+ // in the range [0, npoints) - confirm against lcg_pick()
+ lcg_pick(npoints, minpts, indices, &seed);
+
+ copy_points_at_indices(points1, corners1, indices, minpts);
+ copy_points_at_indices(points2, corners2, indices, minpts);
+
+ if (model_info->is_degenerate(points1)) {
+ continue;
+ }
+
+ if (!model_info->find_transformation(minpts, points1, points2,
+ params_this_motion)) {
+ continue;
+ }
+
+ // Project every source point through the trial model
+ model_info->project_points(params_this_motion, corners1, projected_corners,
+ npoints, 2, 2);
+
+ // Collect the points whose projection lands close enough to their match
+ current_motion.num_inliers = 0;
+ double sse = 0.0;
+ for (i = 0; i < npoints; ++i) {
+ double dx = projected_corners[i * 2] - corners2[i * 2];
+ double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1];
+ double squared_error = dx * dx + dy * dy;
+
+ if (squared_error < INLIER_THRESHOLD_SQUARED) {
+ current_motion.inlier_indices[current_motion.num_inliers++] = i;
+ sse += squared_error;
+ }
+ }
+
+ if (current_motion.num_inliers < min_inliers) {
+ // Reject models with too few inliers
+ continue;
+ }
+
+ current_motion.sse = sse;
+ if (is_better_motion(&current_motion, worst_kept_motion)) {
+ // This motion is better than the worst currently kept motion. Remember
+ // the inlier points and sse. The parameters for each kept motion
+ // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers;
+ worst_kept_motion->sse = current_motion.sse;
+
+ // Rather than copying the (potentially many) inlier indices from
+ // current_motion.inlier_indices to worst_kept_motion->inlier_indices,
+ // we can swap the underlying pointers.
+ //
+ // This is okay because the next time current_motion.inlier_indices
+ // is used will be in the next trial, where we ignore its previous
+ // contents anyway. And both arrays will be deallocated together at the
+ // end of this function, so there are no lifetime issues.
+ int *tmp = worst_kept_motion->inlier_indices;
+ worst_kept_motion->inlier_indices = current_motion.inlier_indices;
+ current_motion.inlier_indices = tmp;
+
+ // Determine the new worst kept motion and its num_inliers and sse.
+ for (i = 0; i < num_desired_motions; ++i) {
+ if (is_better_motion(worst_kept_motion, &motions[i])) {
+ worst_kept_motion = &motions[i];
+ }
+ }
+ }
+ }
+
+ // Sort the motions, best first.
+ qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+ // Recompute the motions using only the inliers.
+ for (i = 0; i < num_desired_motions; ++i) {
+ int num_inliers = motions[i].num_inliers;
+ if (num_inliers > 0) {
+ assert(num_inliers >= minpts);
+
+ copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+ num_inliers);
+ copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+ num_inliers);
+
+ if (!model_info->find_transformation(num_inliers, points1, points2,
+ motion_models[i].params)) {
+ // In the unlikely event that this model fitting fails,
+ // we don't have a good fallback. So just clear the output
+ // model and move on
+ memcpy(motion_models[i].params, kIdentityParams,
+ MAX_PARAMDIM * sizeof(*(motion_models[i].params)));
+ motion_models[i].num_inliers = 0;
+ continue;
+ }
+
+ // Populate inliers array
+ for (int j = 0; j < num_inliers; j++) {
+ int index = motions[i].inlier_indices[j];
+ const Correspondence *corr = &matched_points[index];
+ motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x);
+ motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y);
+ }
+ motion_models[i].num_inliers = num_inliers;
+ } else {
+ // No candidate was ever stored in this slot; output the identity model
+ memcpy(motion_models[i].params, kIdentityParams,
+ MAX_PARAMDIM * sizeof(*(motion_models[i].params)));
+ motion_models[i].num_inliers = 0;
+ }
+ }
+
+finish_ransac:
+ // Any of these may be NULL if we got here via an allocation failure
+ aom_free(inlier_buffer);
+ aom_free(motions);
+ aom_free(projected_corners);
+ aom_free(corners2);
+ aom_free(corners1);
+ aom_free(points2);
+ aom_free(points1);
+
+ return ret_val;
+}
+
+// Returns true if the three 2D points p1, p2 and p3 are (nearly) collinear,
+// i.e. the magnitude of the cross product of (p2 - p1) and (p3 - p1) is
+// below a small threshold.
+static bool is_collinear3(double *p1, double *p2, double *p3) {
+  static const double collinear_eps = 1e-3;
+
+  // Edge vectors from p1 to the other two points
+  const double ux = p2[0] - p1[0];
+  const double uy = p2[1] - p1[1];
+  const double wx = p3[0] - p1[0];
+  const double wy = p3[1] - p1[1];
+
+  const double cross = ux * wy - uy * wx;
+  return fabs(cross) < collinear_eps;
+}
+
+#if ALLOW_TRANSLATION_MODELS
+// A translation sample is degenerate if its first two points are too close
+// together (squared distance of at most 2).
+static bool is_degenerate_translation(double *p) {
+  const double dx = p[0] - p[2];
+  const double dy = p[1] - p[3];
+  return dx * dx + dy * dy <= 2;
+}
+#endif  // ALLOW_TRANSLATION_MODELS
+
+// A sample is degenerate if its first three points, stored as interleaved
+// (x, y) pairs in `p`, are collinear.
+static bool is_degenerate_affine(double *p) {
+  double *const first = &p[0];
+  double *const second = &p[2];
+  double *const third = &p[4];
+  return is_collinear3(first, second, third);
+}
+
+// Per-model-type function tables and minimum sample sizes, indexed by
+// TransformationType. Entries filled with NULLs belong to model types which
+// have no search functions compiled in; ransac() asserts that the requested
+// type is not one of these.
+static const RansacModelInfo ransac_model_info[TRANS_TYPES] = {
+ // IDENTITY
+ { NULL, NULL, NULL, 0 },
+// TRANSLATION
+#if ALLOW_TRANSLATION_MODELS
+ { is_degenerate_translation, find_translation, project_points_translation,
+ 3 },
+#else
+ { NULL, NULL, NULL, 0 },
+#endif
+ // ROTZOOM
+ { is_degenerate_affine, find_rotzoom, project_points_affine, 3 },
+ // AFFINE
+ { is_degenerate_affine, find_affine, project_points_affine, 3 },
+};
+
+// Fit up to `num_desired_motions` models of the given `type` to the supplied
+// correspondences, by looking up the matching entry of ransac_model_info and
+// handing off to ransac_internal().
+//
+// Returns true on success, false on error
+bool ransac(const Correspondence *matched_points, int npoints,
+            TransformationType type, MotionModel *motion_models,
+            int num_desired_motions, bool *mem_alloc_failed) {
+  // Only model types with compiled-in search functions may be requested
+#if ALLOW_TRANSLATION_MODELS
+  assert(type > IDENTITY && type < TRANS_TYPES);
+#else
+  assert(type > TRANSLATION && type < TRANS_TYPES);
+#endif  // ALLOW_TRANSLATION_MODELS
+
+  const RansacModelInfo *const model_info = &ransac_model_info[type];
+  return ransac_internal(matched_points, npoints, motion_models,
+                         num_desired_motions, model_info, mem_alloc_failed);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.h b/third_party/aom/aom_dsp/flow_estimation/ransac.h
new file mode 100644
index 0000000000..0529b6e13c
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/ransac.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+#include <stdbool.h>
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fits motion models of the given `type` to a set of point correspondences
+// using RANSAC (implemented in ransac.c).
+//
+// `type` must be a real model type: greater than IDENTITY (greater than
+// TRANSLATION when translation models are compiled out) and less than
+// TRANS_TYPES. ransac.c only checks this with an assertion.
+//
+// Returns true on success, false on error.
+// NOTE(review): presumably `motion_models` has room for `num_desired_motions`
+// entries and `*mem_alloc_failed` is set when an allocation fails - confirm
+// against ransac_internal().
+bool ransac(const Correspondence *matched_points, int npoints,
+ TransformationType type, MotionModel *motion_models,
+ int num_desired_motions, bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c
new file mode 100644
index 0000000000..87c76fa13b
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the
+correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+of each image, centered at (x1, y1) and (x2, y2) respectively.
+
+Each row of each window is fetched with a 16-byte unaligned load and masked
+down to its first 13 bytes, so 3 bytes past the end of every row are read
+(and then discarded).
+
+NOTE(review): if the frame2 window is completely flat then var2 == 0 and the
+final division is by zero - confirm that callers tolerate the resulting
+inf/NaN.
+*/
+double av1_compute_cross_correlation_avx2(const unsigned char *frame1,
+ int stride1, int x1, int y1,
+ const unsigned char *frame2,
+ int stride2, int x2, int y2) {
+ int i, stride1_i = 0, stride2_i = 0;
+ __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m256i zero = _mm256_setzero_si256();
+ __m128i v1, v2;
+
+ // Accumulators:
+ // sum_vec - pixel sums; frame1 in the low 128-bit lane, frame2 in the high
+ // sumsq2_vec - 8 32-bit partial sums of frame2 pixel squares
+ // cross_vec - 8 32-bit partial sums of frame1 * frame2 pixel products
+ sum_vec = zero;
+ sumsq2_vec = zero;
+ cross_vec = zero;
+
+ // Move both pointers from the window centre to the window's top-left pixel
+ frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ // Load one row from each frame, zero the bytes beyond the window, and
+ // widen the pixels to 16 bits for the multiply-accumulate steps
+ v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask);
+ v1_1 = _mm256_cvtepu8_epi16(v1);
+ v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask);
+ v2_1 = _mm256_cvtepu8_epi16(v2);
+
+ // Pack both 8-bit rows into one register (frame1 low, frame2 high) so that
+ // a single 'sad' against zero sums the pixels of both rows at once
+ v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+ sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+ sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+ cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+ stride1_i += stride1;
+ stride2_i += stride2;
+ }
+ // Within each 128-bit lane, add the upper 64-bit partial sum onto the lower
+ // one; the frame1 total then sits in 32-bit element 0 and the frame2 total
+ // in 32-bit element 4
+ __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+ sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+ int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+ int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+ // Interleave the two accumulators 64 bits at a time and add, so that each
+ // 128-bit lane holds { sumsq2 partials, cross partials }
+ __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+ __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+ temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+ // Fold the two 128-bit lanes together, then the two 32-bit halves of each
+ // 64-bit element. The sumsq2 total lands in element 0, cross in element 2.
+ __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+ int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+ int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+ // Variance of frame2 and covariance of the two windows, both scaled by
+ // MATCH_SZ_SQ^2 so that everything stays in integer arithmetic
+ int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+ int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+ return cov / sqrt((double)var2);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c
new file mode 100644
index 0000000000..b3cb5bc5fd
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+
+// Byte mask applied to each 16-byte row load: keeps the first 13 bytes
+// (one MATCH_SZ-wide row of the window) and zeroes the 3 trailing bytes that
+// were loaded past the end of the row.
+DECLARE_ALIGNED(16, static const uint8_t,
+ byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 0, 0, 0 };
+// The mask hard-codes the window width, so fail the build if MATCH_SZ changes
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+
+ Each row of each window is fetched with a 16-byte unaligned load and masked
+ down to its first 13 bytes, so 3 bytes past the end of every row are read
+ (and then discarded).
+
+ NOTE(review): if the frame2 window is completely flat then var2 == 0 and the
+ final division is by zero - confirm that callers tolerate the resulting
+ inf/NaN.
+*/
+double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1,
+ int stride1, int x1, int y1,
+ const unsigned char *frame2,
+ int stride2, int x2, int y2) {
+ int i;
+ // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0,
+ // 2)
+ __m128i sum1_vec = _mm_setzero_si128();
+ __m128i sum2_vec = _mm_setzero_si128();
+ // 4 32-bit partial sums of squares
+ __m128i sumsq2_vec = _mm_setzero_si128();
+ // 4 32-bit partial sums of frame1 * frame2 products
+ __m128i cross_vec = _mm_setzero_si128();
+
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m128i zero = _mm_setzero_si128();
+
+ // Move both pointers from the window centre to the window's top-left pixel
+ frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ // Load one row from each frame and zero the bytes beyond the window
+ const __m128i v1 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask);
+ const __m128i v2 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask);
+
+ // Using the 'sad' intrinsic here is a bit faster than adding
+ // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit
+ // conversion step later, for a net speedup of ~10%
+ sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero));
+ sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero));
+
+ // Widen the low (_l) and high (_r) 8 bytes of each row to 16 bits
+ const __m128i v1_l = _mm_cvtepu8_epi16(v1);
+ const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8));
+ const __m128i v2_l = _mm_cvtepu8_epi16(v2);
+ const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8));
+
+ sumsq2_vec = _mm_add_epi32(
+ sumsq2_vec,
+ _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r)));
+ cross_vec = _mm_add_epi32(
+ cross_vec,
+ _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r)));
+ }
+
+ // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec,
+ // cross_vec)
+ // as holding 4 32-bit elements each, which we want to sum horizontally.
+ // We do this by transposing and then summing vertically.
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec);
+ __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec);
+
+ __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3);
+
+ // After the transpose, each tmp_4..tmp_7 holds one partial sum of
+ // { sum1, sum2, sumsq2, cross } in that element order
+ __m128i res =
+ _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7));
+
+ int sum1 = _mm_extract_epi32(res, 0);
+ int sum2 = _mm_extract_epi32(res, 1);
+ int sumsq2 = _mm_extract_epi32(res, 2);
+ int cross = _mm_extract_epi32(res, 3);
+
+ // Variance of frame2 and covariance of the two windows, both scaled by
+ // MATCH_SZ_SQ^2 so that everything stays in integer arithmetic
+ int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
new file mode 100644
index 0000000000..d2b04c1973
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 3-Clause Clear License
+ * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
+ * License was not distributed with this source code in the LICENSE file, you
+ * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * aomedia.org/license/patent-license/.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Internal cross-check against C code
+// If you set this to 1 and compile in debug mode, then the outputs of the two
+// convolution stages will be checked against the plain C version of the code,
+// and an assertion will be fired if the results differ.
+#define CHECK_RESULTS 0
+
+// Fills `kernel` with the four taps of the cubic interpolation filter for a
+// fractional sample position `x`. The taps apply, in order, to the samples at
+// offsets -1, 0, +1 and +2 from the integer position.
+//
+// Note: Max sum(+ve coefficients) = 1.125 * scale
+static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+  // `x` comes from expressions such as `u_frac = u - floor(u)`, which is
+  // mathematically in [0, 1). Floating point rounding can nevertheless
+  // produce exactly 1, so the upper bound is inclusive here; the kernel is
+  // still correct at x == 1.
+  assert(0 <= x && x <= 1);
+
+  const double x2 = x * x;
+  const double x3 = x2 * x;
+
+  kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+  kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+  kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+  kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
+
+// Integer version of get_cubic_kernel_dbl(): each tap is scaled by
+// 2^DISFLOW_INTERP_BITS and rounded to the nearest int16_t.
+static INLINE void get_cubic_kernel_int(double x, int16_t kernel[4]) {
+  double kernel_dbl[4];
+  get_cubic_kernel_dbl(x, kernel_dbl);
+
+  for (int tap = 0; tap < 4; ++tap) {
+    kernel[tap] = (int16_t)rint(kernel_dbl[tap] * (1 << DISFLOW_INTERP_BITS));
+  }
+}
+
+#if CHECK_RESULTS
+// Reference (scalar) 4-tap filter: dot product of four samples with the
+// integer kernel. Only built for the SIMD cross-checks.
+static INLINE int get_cubic_value_int(const int *p, const int16_t kernel[4]) {
+  int acc = 0;
+  for (int tap = 0; tap < 4; ++tap) {
+    acc += kernel[tap] * p[tap];
+  }
+  return acc;
+}
+#endif  // CHECK_RESULTS
+
+// Compare two 8x8 (DISFLOW_PATCH_SIZE) regions, one rooted at position
+// (x, y) in src and the other at (x + u, y + v) in ref.
+//
+// The ref region is sampled at a fractional position using separable 4-tap
+// cubic interpolation. For each pixel, the difference dt between the
+// interpolated ref value and the src pixel is formed, and the function
+// outputs
+// b[0] = sum(dx * dt), b[1] = sum(dy * dt)
+// over the patch, where dx and dy are the caller-supplied gradient arrays
+// (row stride DISFLOW_PATCH_SIZE). Nothing is returned; the result is
+// written to b.
+//
+// `width` and `height` are only used to clamp the fetch position in ref.
+//
+// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation
+// instead of bicubic interpolation
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
+ // This function is written to do 8x8 convolutions only
+ assert(DISFLOW_PATCH_SIZE == 8);
+
+ // Accumulate 4 32-bit partial sums for each element of b
+ // These will be flattened at the end.
+ __m128i b0_acc = _mm_setzero_si128();
+ __m128i b1_acc = _mm_setzero_si128();
+#if CHECK_RESULTS
+ // Also keep a running sum using the C algorithm, for cross-checking
+ int c_result[2] = { 0 };
+#endif // CHECK_RESULTS
+
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int16_t h_kernel[4];
+ int16_t v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ // Storage for intermediate values between the two convolution directions
+ // This holds PATCH_SIZE + 3 rows: one above the patch and two below, as
+ // needed by the 4-tap vertical filter.
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ //
+ // NOTE(review): the row load below reads 16 bytes starting at x0 - 1, i.e.
+ // up to column x0 + 14, even though only the first 11 are used. Confirm
+ // that the frame border is wide enough for that over-read when x0 == width.
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution
+
+ // Prepare the kernel vectors
+ // We split the kernel into two vectors with kernel indices:
+ // 0, 1, 0, 1, 0, 1, 0, 1, and
+ // 2, 3, 2, 3, 2, 3, 2, 3
+ __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]);
+ __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]);
+
+ // Rounding offset for the shift by (DISFLOW_INTERP_BITS - 6) applied below
+ __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1));
+
+ // Rows -1 .. PATCH_SIZE + 1 are filtered, to feed the vertical 4-tap filter
+ for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) {
+ const int y_w = y0 + i;
+ const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)];
+ int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE];
+
+ // Load this row of pixels.
+ // For an 8x8 patch, we need to load the 8 image pixels + 3 extras,
+ // for a total of 11 pixels. Here we load 16 pixels, but only use
+ // the first 11.
+ __m128i row = _mm_loadu_si128((__m128i *)ref_row);
+
+ // Expand pixels to int16s
+ // (px_4to10_i16 actually holds pixels 4..11; only 4..10 are used)
+ __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row);
+ __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4));
+
+ // Relevant multiply instruction
+ // This multiplies pointwise, then sums in pairs.
+ //_mm_madd_epi16();
+
+ // Compute first four outputs
+ // input pixels 0, 1, 1, 2, 2, 3, 3, 4
+ // * kernel 0, 1, 0, 1, 0, 1, 0, 1
+ __m128i px0 =
+ _mm_unpacklo_epi16(px_0to7_i16, _mm_srli_si128(px_0to7_i16, 2));
+ // input pixels 2, 3, 3, 4, 4, 5, 5, 6
+ // * kernel 2, 3, 2, 3, 2, 3, 2, 3
+ __m128i px1 = _mm_unpacklo_epi16(_mm_srli_si128(px_0to7_i16, 4),
+ _mm_srli_si128(px_0to7_i16, 6));
+ // Convolve with kernel and sum 2x2 boxes to form first 4 outputs
+ __m128i sum0 = _mm_add_epi32(_mm_madd_epi16(px0, h_kernel_01),
+ _mm_madd_epi16(px1, h_kernel_23));
+
+ __m128i out0 = _mm_srai_epi32(_mm_add_epi32(sum0, round_const_h),
+ DISFLOW_INTERP_BITS - 6);
+
+ // Compute second four outputs
+ // Same arrangement as above, starting from input pixel 4
+ __m128i px2 =
+ _mm_unpacklo_epi16(px_4to10_i16, _mm_srli_si128(px_4to10_i16, 2));
+ __m128i px3 = _mm_unpacklo_epi16(_mm_srli_si128(px_4to10_i16, 4),
+ _mm_srli_si128(px_4to10_i16, 6));
+ __m128i sum1 = _mm_add_epi32(_mm_madd_epi16(px2, h_kernel_01),
+ _mm_madd_epi16(px3, h_kernel_23));
+
+ // Round by just enough bits that the result is
+ // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32
+ // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32
+ // as it does now
+ // This means shifting down so we have 6 extra bits, for a maximum value
+ // of +18360, which can occur if u_frac == 0.5 and the input pixels are
+ // {0, 255, 255, 0}.
+ __m128i out1 = _mm_srai_epi32(_mm_add_epi32(sum1, round_const_h),
+ DISFLOW_INTERP_BITS - 6);
+
+ // Narrow the eight 32-bit outputs to int16 and store one intermediate row
+ _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1));
+
+#if CHECK_RESULTS && !defined(NDEBUG)
+ // Cross-check
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int x_w = x0 + j;
+ int arr[4];
+
+ arr[0] = (int)ref[y_w * stride + (x_w - 1)];
+ arr[1] = (int)ref[y_w * stride + (x_w + 0)];
+ arr[2] = (int)ref[y_w * stride + (x_w + 1)];
+ arr[3] = (int)ref[y_w * stride + (x_w + 2)];
+
+ // Apply kernel and round, keeping 6 extra bits of precision.
+ //
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+ const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel),
+ DISFLOW_INTERP_BITS - 6);
+ (void)c_value; // Suppress warnings
+ assert(tmp_row[j] == c_value);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical convolution
+ // The shift removes the kernel scale plus the 6 extra bits kept by the
+ // horizontal pass, while leaving DISFLOW_DERIV_SCALE_LOG2 bits of precision
+ const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
+ __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1));
+
+ __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]);
+ __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]);
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE];
+
+ // Load 4 rows of 8 x 16-bit values
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row);
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+ __m128i px3 =
+ _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE));
+
+ // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... ,
+ // but each multiply expands its output to 32 bits. So we need to be
+ // a little clever about how we do this
+ // Interleaving two rows lets one madd compute a pair of taps per column:
+ // sum0 covers columns 0-3, sum1 covers columns 4-7.
+ __m128i sum0 = _mm_add_epi32(
+ _mm_madd_epi16(_mm_unpacklo_epi16(px0, px1), v_kernel_01),
+ _mm_madd_epi16(_mm_unpacklo_epi16(px2, px3), v_kernel_23));
+ __m128i sum1 = _mm_add_epi32(
+ _mm_madd_epi16(_mm_unpackhi_epi16(px0, px1), v_kernel_01),
+ _mm_madd_epi16(_mm_unpackhi_epi16(px2, px3), v_kernel_23));
+
+ __m128i sum0_rounded =
+ _mm_srai_epi32(_mm_add_epi32(sum0, round_const_v), round_bits);
+ __m128i sum1_rounded =
+ _mm_srai_epi32(_mm_add_epi32(sum1, round_const_v), round_bits);
+
+ __m128i warped = _mm_packs_epi32(sum0_rounded, sum1_rounded);
+ __m128i src_pixels_u8 =
+ _mm_loadl_epi64((__m128i *)&src[(y + i) * stride + x]);
+ // Scale the source pixels up by 3 bits to match the warped values.
+ // NOTE(review): the literal 3 presumably equals DISFLOW_DERIV_SCALE_LOG2 -
+ // confirm against disflow.h.
+ __m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3);
+
+ // Calculate delta from the target patch
+ __m128i dt = _mm_sub_epi16(warped, src_pixels);
+
+ // Load 8 elements each of dx and dy, to pair with the 8 elements of dt
+ // that we have just computed. Then compute 8 partial sums of dx * dt
+ // and dy * dt, implicitly sum to give 4 partial sums of each, and
+ // accumulate.
+ __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]);
+ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]);
+ b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt));
+ b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt));
+
+#if CHECK_RESULTS
+ int16_t dt_arr[8];
+ memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr));
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
+ int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
+ p[2 * DISFLOW_PATCH_SIZE] };
+ const int result = get_cubic_value_int(arr, v_kernel);
+
+ // Apply kernel and round.
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ const int c_warped = ROUND_POWER_OF_TWO(result, round_bits);
+ const int c_src_px = src[(x + j) + (y + i) * stride] << 3;
+ const int c_dt = c_warped - c_src_px;
+
+ assert(dt_arr[j] == c_dt);
+
+ c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+ c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Flatten the two sets of partial sums to find the final value of b
+ // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
+ // We need to do 6 additions in total; a `hadd` instruction can take care
+ // of four of them, leaving two scalar additions.
+ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
+ b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
+ b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
+
+#if CHECK_RESULTS
+ assert(b[0] == c_result[0]);
+ assert(b[1] == c_result[1]);
+#endif // CHECK_RESULTS
+}
+
+// Computes the horizontal gradient of one DISFLOW_PATCH_SIZE x
+// DISFLOW_PATCH_SIZE patch with a separable 3x3 filter: {1, 0, -1} across
+// each row, then {1, 2, 1} down each column.
+//
+// `src` points at the top-left pixel of the patch. One row above and one row
+// below the patch are read, and every row is fetched with a 16-byte load
+// starting one pixel to the left of the patch. The result is written to
+// `dst` as 8 int16 values per row, with rows `dst_stride` elements apart.
+static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ // Intermediate rows -1 .. PATCH_SIZE; `tmp` points at row 0
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+#if CHECK_RESULTS
+ const int taps = 3;
+#endif // CHECK_RESULTS
+
+ // Horizontal filter
+ // As the kernel is simply {1, 0, -1}, we implement this as simply
+ // out[x] = image[x-1] - image[x+1]
+ // rather than doing a "proper" convolution operation
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+
+ // Load pixels and expand to 16 bits
+ // px0 holds columns x - 1, px2 holds columns x + 1
+ __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1));
+ __m128i px0 = _mm_cvtepu8_epi16(row);
+ __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2));
+
+ __m128i out = _mm_sub_epi16(px0, px2);
+
+ // Store to intermediate array
+ _mm_storeu_si128((__m128i *)tmp_row, out);
+
+#if CHECK_RESULTS
+ // Cross-check
+ static const int16_t h_kernel[3] = { 1, 0, -1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src_row[x + k - 1];
+ }
+ (void)sum;
+ assert(tmp_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical filter
+ // Here the kernel is {1, 2, 1}, which can be implemented
+ // with simple sums rather than multiplies and adds.
+ // In order to minimize dependency chains, we evaluate in the order
+ // (image[y - 1] + image[y + 1]) + (image[y] << 1)
+ // This way, the first addition and the shift can happen in parallel
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+ int16_t *dst_row = dst + y * dst_stride;
+
+ // Rows y - 1, y and y + 1 of the horizontally-filtered data
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row);
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+
+ __m128i out =
+ _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1));
+
+ _mm_storeu_si128((__m128i *)dst_row, out);
+
+#if CHECK_RESULTS
+ static const int16_t v_kernel[3] = { 1, 2, 1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ (void)sum;
+ assert(dst_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+}
+
+// Computes the vertical gradient of one DISFLOW_PATCH_SIZE x
+// DISFLOW_PATCH_SIZE patch with a separable 3x3 filter: {1, 2, 1} across
+// each row, then {1, 0, -1} down each column.
+//
+// `src` points at the top-left pixel of the patch. One row above and one row
+// below the patch are read, and every row is fetched with a 16-byte load
+// starting one pixel to the left of the patch. The result is written to
+// `dst` as 8 int16 values per row, with rows `dst_stride` elements apart.
+static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ // Intermediate rows -1 .. PATCH_SIZE; `tmp` points at row 0
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+#if CHECK_RESULTS
+ const int taps = 3;
+#endif // CHECK_RESULTS
+
+ // Horizontal filter
+ // Here the kernel is {1, 2, 1}, which can be implemented
+ // with simple sums rather than multiplies and adds.
+ // In order to minimize dependency chains, we evaluate in the order
+ // (image[x - 1] + image[x + 1]) + (image[x] << 1)
+ // This way, the first addition and the shift can happen in parallel
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+
+ // Load pixels and expand to 16 bits
+ // px0, px1, px2 hold columns x - 1, x and x + 1 respectively
+ __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1));
+ __m128i px0 = _mm_cvtepu8_epi16(row);
+ __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1));
+ __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2));
+
+ __m128i out =
+ _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1));
+
+ // Store to intermediate array
+ _mm_storeu_si128((__m128i *)tmp_row, out);
+
+#if CHECK_RESULTS
+ // Cross-check
+ static const int16_t h_kernel[3] = { 1, 2, 1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src_row[x + k - 1];
+ }
+ (void)sum;
+ assert(tmp_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical filter
+ // As the kernel is simply {1, 0, -1}, we implement this as simply
+ // out[y] = image[y-1] - image[y+1]
+ // rather than doing a "proper" convolution operation
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+ int16_t *dst_row = dst + y * dst_stride;
+
+ // Rows y - 1 and y + 1 of the horizontally-filtered data
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+
+ __m128i out = _mm_sub_epi16(px0, px2);
+
+ _mm_storeu_si128((__m128i *)dst_row, out);
+
+#if CHECK_RESULTS
+ static const int16_t v_kernel[3] = { 1, 0, -1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ (void)sum;
+ assert(dst_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+}
+
+// Builds the regularized 2x2 least-squares matrix for a patch from its
+// gradient arrays and stores it, as doubles, in M (row-major):
+// M[0] = sum(dx * dx) + 1 M[1] = sum(dx * dy)
+// M[2] = sum(dx * dy) M[3] = sum(dy * dy) + 1
+// Each of the DISFLOW_PATCH_SIZE rows contributes 8 values, read from
+// dx + i * dx_stride and dy + i * dy_stride.
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ double *M) {
+ // Four 32-bit partial sums per matrix entry; acc[2] is left unused
+ __m128i acc[4] = { 0 };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]);
+ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]);
+
+ acc[0] = _mm_add_epi32(acc[0], _mm_madd_epi16(dx_row, dx_row));
+ acc[1] = _mm_add_epi32(acc[1], _mm_madd_epi16(dx_row, dy_row));
+ // Don't compute acc[2], as it should be equal to acc[1]
+ acc[3] = _mm_add_epi32(acc[3], _mm_madd_epi16(dy_row, dy_row));
+ }
+
+ // Condense sums
+ // acc[1] is fed in twice so that, after the second round of horizontal
+ // adds, result = { sum(acc[0]), sum(acc[1]), sum(acc[1]), sum(acc[3]) }
+ __m128i partial_sum_0 = _mm_hadd_epi32(acc[0], acc[1]);
+ __m128i partial_sum_1 = _mm_hadd_epi32(acc[1], acc[3]);
+ __m128i result = _mm_hadd_epi32(partial_sum_0, partial_sum_1);
+
+ // Apply regularization
+ // We follow the standard regularization method of adding `k * I` before
+ // inverting. This ensures that the matrix will be invertible.
+ //
+ // Setting the regularization strength k to 1 seems to work well here, as
+ // typical values coming from the other equations are very large (1e5 to
+ // 1e6, with an upper limit of around 6e7, at the time of writing).
+ // It also preserves the property that all matrix values are whole numbers,
+ // which is convenient for integerized SIMD implementation.
+ // (_mm_set_epi32 takes its arguments from element 3 down to element 0, so
+ // this adds 1 to elements 0 and 3, i.e. the diagonal.)
+ result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1));
+
+#if CHECK_RESULTS
+ int tmp[4] = { 0 };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
+ tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+ tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+ // Don't compute tmp[2], as it should be equal to tmp[1]
+ tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
+ }
+ }
+
+ // Apply regularization
+ tmp[0] += 1;
+ tmp[3] += 1;
+
+ tmp[2] = tmp[1];
+
+ assert(tmp[0] == _mm_extract_epi32(result, 0));
+ assert(tmp[1] == _mm_extract_epi32(result, 1));
+ assert(tmp[2] == _mm_extract_epi32(result, 2));
+ assert(tmp[3] == _mm_extract_epi32(result, 3));
+#endif // CHECK_RESULTS
+
+ // Convert results to doubles and store
+ // Each conversion handles the low two 32-bit elements, so the upper pair is
+ // shifted down for the second store
+ _mm_storeu_pd(M, _mm_cvtepi32_pd(result));
+ _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8)));
+}
+
+// Inverts the row-major 2x2 matrix M into M_inv.
+//
+// M is a regularized least-squares matrix: a least-squares matrix has only
+// non-negative eigenvalues, so det M >= 0, and the `+ k * I` term added in
+// compute_flow_matrix() (with k = 1) lifts that bound to det M >= k^2 = 1.
+// Singular input therefore cannot occur and is only guarded by an assertion.
+static INLINE void invert_2x2(const double *M, double *M_inv) {
+  const double m00 = M[0];
+  const double m01 = M[1];
+  const double m10 = M[2];
+  const double m11 = M[3];
+
+  double det = (m00 * m11) - (m01 * m10);
+  assert(det >= 1);
+  const double det_inv = 1 / det;
+
+  // Adjugate of M, scaled by 1 / det
+  M_inv[0] = m11 * det_inv;
+  M_inv[1] = -m01 * det_inv;
+  M_inv[2] = -m10 * det_inv;
+  M_inv[3] = m00 * det_inv;
+}
+
+// Iteratively refines the flow vector (*u, *v) for the patch whose top-left
+// pixel is (x, y) in `src`, matching it against `ref`.
+//
+// The patch gradients and the inverse of the (regularized) least-squares
+// matrix depend only on `src`, so they are computed once up front. Each
+// iteration then measures the mismatch at the current offset, solves for a
+// correction step, and applies it. At most DISFLOW_MAX_ITR iterations are
+// run. *u and *v are both input (initial estimate) and output.
+void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref,
+                                      int x, int y, int width, int height,
+                                      int stride, double *u, double *v) {
+  int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+  double M[4];
+  double M_inv[4];
+  int b[2];
+
+  // Gradients of the source patch
+  const uint8_t *const patch_origin = src + (y * stride + x);
+  sobel_filter_x(patch_origin, stride, dx, DISFLOW_PATCH_SIZE);
+  sobel_filter_y(patch_origin, stride, dy, DISFLOW_PATCH_SIZE);
+
+  // Normal-equations matrix and its inverse
+  compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M);
+  invert_2x2(M, M_inv);
+
+  int itr = 0;
+  while (itr < DISFLOW_MAX_ITR) {
+    compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+                        b);
+
+    // Solve M * step = b for the next correction to the flow vector
+    const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+    const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+
+    // Apply a damped step, limited to 2 in each direction
+    *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+    *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+    // The raw (unscaled, unclamped) step size decides convergence
+    if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+      break;
+    }
+    ++itr;
+  }
+}
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
new file mode 100644
index 0000000000..5503501d62
--- /dev/null
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Forward 4x4 DCT.
+//
+// input:  4x4 block of residuals; consecutive rows are `stride` elements
+//         apart.
+// output: 16 transform coefficients, written as a contiguous 4x4 array.
+// stride: distance, in elements, between rows of `input`.
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows.
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[4 * 4];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform passes
+ for (int pass = 0; pass < 2; ++pass) {
+ tran_high_t in_high[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_low_t temp[4];
+ for (int i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ // First pass: read column i of the caller's block, scaled up by 16.
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ // Only the very first sample of the block gets +1, and only if it is
+ // non-zero. NOTE(review): presumably a rounding bias - confirm intent.
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ ++input; // Next column
+ } else {
+ // Second pass: read column i of the intermediate buffer.
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low; // Next column (which is a transposed row)
+ }
+ // Transform.
+ // Butterfly: sums feed the even outputs (0, 2), differences feed the odd
+ // outputs (1, 3).
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp[0] = (tran_low_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64);
+ temp[2] = (tran_low_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64);
+ temp[1] = (tran_low_t)fdct_round_shift(step[2] * cospi_24_64 +
+ step[3] * cospi_8_64);
+ temp[3] = (tran_low_t)fdct_round_shift(-step[2] * cospi_8_64 +
+ step[3] * cospi_24_64);
+ // Only transpose the first pass.
+ if (pass == 0) {
+ // Transformed column i is stored as row i of the intermediate buffer.
+ out[0] = temp[0];
+ out[1] = temp[1];
+ out[2] = temp[2];
+ out[3] = temp[3];
+ out += 4;
+ } else {
+ // Second-pass results are stored down column i of `output`.
+ out[0 * 4] = temp[0];
+ out[1 * 4] = temp[1];
+ out[2 * 4] = temp[2];
+ out[3 * 4] = temp[3];
+ ++out;
+ }
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ // Final scaling: add 1 and shift right by 2 on every coefficient.
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j)
+ output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+}
+
+// Forward 4x4 DCT producing 16-bit coefficients. Same two-pass structure as
+// aom_fdct4x4_c, but the intermediate buffer, per-pass results and `output`
+// are int16_t and the working values are int32_t.
+//
+// input:  4x4 block of residuals; consecutive rows are `stride` elements
+//         apart.
+// output: 16 transform coefficients, written as a contiguous 4x4 array.
+// stride: distance, in elements, between rows of `input`.
+void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows.
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[4 * 4];
+ const int16_t *in_low = NULL;
+ int16_t *out = intermediate;
+ // Do the two transform passes
+ for (int pass = 0; pass < 2; ++pass) {
+ int32_t in_high[4]; // canbe16
+ int32_t step[4]; // canbe16
+ int16_t temp[4];
+ for (int i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ // First pass: read column i of the caller's block, scaled up by 16.
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ ++input;
+ // Only the very first sample of the block gets +1, and only if it is
+ // non-zero. NOTE(review): presumably a rounding bias - confirm intent.
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ } else {
+ // Second pass: read column i of the intermediate buffer.
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low;
+ }
+ // Transform.
+ // Butterfly: sums feed the even outputs (0, 2), differences feed the odd
+ // outputs (1, 3). Results are narrowed to int16_t by the casts below.
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp[0] = (int16_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64);
+ temp[2] = (int16_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64);
+ temp[1] = (int16_t)fdct_round_shift(step[2] * cospi_24_64 +
+ step[3] * cospi_8_64);
+ temp[3] = (int16_t)fdct_round_shift(-step[2] * cospi_8_64 +
+ step[3] * cospi_24_64);
+ // Only transpose the first pass.
+ if (pass == 0) {
+ // Transformed column i is stored as row i of the intermediate buffer.
+ out[0] = temp[0];
+ out[1] = temp[1];
+ out[2] = temp[2];
+ out[3] = temp[3];
+ out += 4;
+ } else {
+ // Second-pass results are stored down column i of `output`.
+ out[0 * 4] = temp[0];
+ out[1 * 4] = temp[1];
+ out[2 * 4] = temp[2];
+ out[3 * 4] = temp[3];
+ ++out;
+ }
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ // Final scaling: add 1 and shift right by 2 on every coefficient.
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j)
+ output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+}
+
+#if CONFIG_INTERNAL_STATS
+// Forward 8x8 DCT (only compiled when CONFIG_INTERNAL_STATS is set).
+//
+// input:        8x8 block of residuals; consecutive rows are `stride`
+//               elements apart.
+// final_output: 64 transform coefficients, written as a contiguous 8x8 array.
+// stride:       distance, in elements, between rows of `input`.
+void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *output = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Two passes of the same 1-D transform. Pass 0 reads columns of `input`
+ // and writes rows of `intermediate`; pass 1 reads columns of `intermediate`
+ // and writes rows of `final_output`.
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ // Column i of the caller's block; sums and differences scaled by 4.
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ // Column i of the intermediate buffer; no extra scaling.
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ // The sums s0..s3 produce the even-indexed outputs 0, 2, 4 and 6.
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ // The differences s4..s7 produce the odd-indexed outputs 1, 3, 5 and 7.
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+ output += 8;
+ }
+ // Set up in/out for the second pass.
+ in = intermediate;
+ output = final_output;
+ }
+
+ // Final scaling: halve every coefficient (integer division, which
+ // truncates toward zero).
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_AV1_HIGHBITDEPTH && CONFIG_INTERNAL_STATS
+// High-bitdepth entry point for the forward 8x8 DCT. It forwards its
+// arguments unchanged to aom_fdct8x8_c.
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ aom_fdct8x8_c(input, final_output, stride);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/grain_params.h b/third_party/aom/aom_dsp/grain_params.h
new file mode 100644
index 0000000000..5a28afc2a1
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_params.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters
+ *
+ */
+#ifndef AOM_AOM_DSP_GRAIN_PARAMS_H_
+#define AOM_AOM_DSP_GRAIN_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis
+ */
+typedef struct {
+ // This structure is compared element-by-element in the function
+ // aom_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
+ int apply_grain;
+
+ // Not compared by aom_check_grain_params_equiv.
+ int update_parameters;
+
+ // 8 bit values
+ // Each row is one scaling point stored as a pair of ints; num_y_points gives
+ // the number of rows in use.
+ int scaling_points_y[14][2];
+ int num_y_points; // value: 0..14
+
+ // 8 bit values
+ int scaling_points_cb[10][2];
+ int num_cb_points; // value: 0..10
+
+ // 8 bit values
+ int scaling_points_cr[10][2];
+ int num_cr_points; // value: 0..10
+
+ int scaling_shift; // values : 8..11
+
+ int ar_coeff_lag; // values: 0..3
+
+ // 8 bit values
+ // The chroma arrays have room for one more coefficient than the luma array.
+ int ar_coeffs_y[24];
+ int ar_coeffs_cb[25];
+ int ar_coeffs_cr[25];
+
+ // Shift value: AR coeffs range
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ int ar_coeff_shift; // values : 6..9
+
+ int cb_mult; // 8 bits
+ int cb_luma_mult; // 8 bits
+ int cb_offset; // 9 bits
+
+ int cr_mult; // 8 bits
+ int cr_luma_mult; // 8 bits
+ int cr_offset; // 9 bits
+
+ int overlap_flag;
+
+ int clip_to_restricted_range;
+
+ unsigned int bit_depth; // video bit depth
+
+ int chroma_scaling_from_luma;
+
+ int grain_scale_shift;
+
+ // Not compared by aom_check_grain_params_equiv.
+ uint16_t random_seed;
+ // This structure is compared element-by-element in the function
+ // aom_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
+} aom_film_grain_t;
+
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements which are ignored.
+ *
+ * \param[in] pa The first set of parameters to compare
+ * \param[in] pb The second set of parameters to compare
+ * \return Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int aom_check_grain_params_equiv(
+    const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+  if (pa->apply_grain != pb->apply_grain) return 0;
+  // Don't compare update_parameters
+
+  // Scaling points: only the first num_*_points rows are in use. Each row is
+  // an int[2], so sizeof(*scaling_points_*) already covers both values of a
+  // point and must not be doubled (doing so would compare unused rows and
+  // the struct members that follow the array).
+  if (pa->num_y_points != pb->num_y_points) return 0;
+  if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+             pa->num_y_points * sizeof(*pa->scaling_points_y)) != 0)
+    return 0;
+
+  if (pa->num_cb_points != pb->num_cb_points) return 0;
+  if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+             pa->num_cb_points * sizeof(*pa->scaling_points_cb)) != 0)
+    return 0;
+
+  if (pa->num_cr_points != pb->num_cr_points) return 0;
+  if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+             pa->num_cr_points * sizeof(*pa->scaling_points_cr)) != 0)
+    return 0;
+
+  if (pa->scaling_shift != pb->scaling_shift) return 0;
+  if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+  // Number of auto-regressive coefficients for the luma plane.
+  const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+  // The chroma planes carry one additional coefficient (ar_coeffs_cb/cr have
+  // 25 slots versus 24 for luma). Per the AV1 film grain syntax that extra
+  // coefficient is only present when luma scaling points exist, so it is only
+  // compared in that case.
+  const int num_pos_chroma = num_pos + (pa->num_y_points > 0 ? 1 : 0);
+  if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+             num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+             num_pos_chroma * sizeof(*pa->ar_coeffs_cb)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+             num_pos_chroma * sizeof(*pa->ar_coeffs_cr)) != 0)
+    return 0;
+
+  if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+  if (pa->cb_mult != pb->cb_mult) return 0;
+  if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+  if (pa->cb_offset != pb->cb_offset) return 0;
+
+  if (pa->cr_mult != pb->cr_mult) return 0;
+  if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+  if (pa->cr_offset != pb->cr_offset) return 0;
+
+  if (pa->overlap_flag != pb->overlap_flag) return 0;
+  if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+  if (pa->bit_depth != pb->bit_depth) return 0;
+  if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+  if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+  // Don't compare random_seed
+
+  return 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_PARAMS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
new file mode 100644
index 0000000000..3505f9f2c8
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ascii representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ * sY <num_y_points> <point_0_x> <point_0_y> ...
+ * sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ * sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ * cY <ar_coeff_y_0> ....
+ * cCb <ar_coeff_cb_0> ....
+ * cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <string.h>
+#include <stdio.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+// Signature expected at the start of a grain table file. The array is exactly
+// 8 bytes long, so it holds the 8 characters without a terminating NUL and
+// must only be used with an explicit length of 8.
+static const char kFileMagic[8] = "filmgrn1";
+
+// Reads one table entry from `file` into `entry`: the "E ..." header line
+// and, when its update-parms field is non-zero, the parameter lines that
+// follow it. Failures are reported through aom_internal_error() on
+// `error_info`.
+//
+// The point counts and the AR lag come straight from the file and are used
+// as loop bounds for the fixed-size arrays in aom_film_grain_t, so each of
+// them is range-checked before it is used. An out-of-range value is reset to
+// 0 before the error is reported so the entry never keeps a count that
+// exceeds its arrays.
+static void grain_table_entry_read(FILE *file,
+                                   struct aom_internal_error_info *error_info,
+                                   aom_film_grain_table_entry_t *entry) {
+  aom_film_grain_t *pars = &entry->params;
+  // random_seed is a uint16_t, hence the unsigned %hu conversion.
+  int num_read =
+      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hu %d\n", &entry->start_time,
+             &entry->end_time, &pars->apply_grain, &pars->random_seed,
+             &pars->update_parameters);
+  if (num_read == 0 && feof(file)) return;
+  if (num_read != 5) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry header. Read %d != 5", num_read);
+    return;
+  }
+  if (!pars->update_parameters) return;
+
+  num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+                    &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+                    &pars->grain_scale_shift, &pars->scaling_shift,
+                    &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+                    &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+                    &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+  if (num_read != 12) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry params. Read %d != 12", num_read);
+    return;
+  }
+
+  // 3 is the largest lag whose 2 * lag * (lag + 1) = 24 luma coefficients,
+  // plus one extra chroma coefficient, still fit in ar_coeffs_y[24] and
+  // ar_coeffs_cb/cr[25].
+  const int max_ar_coeff_lag = 3;
+  if (pars->ar_coeff_lag < 0 || pars->ar_coeff_lag > max_ar_coeff_lag) {
+    const int bad_lag = pars->ar_coeff_lag;
+    pars->ar_coeff_lag = 0;
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Invalid ar_coeff_lag %d (expected 0..%d)", bad_lag,
+                       max_ar_coeff_lag);
+    return;
+  }
+
+  // Capacities of the scaling point arrays, derived from their declarations.
+  const int max_y_points = (int)(sizeof(pars->scaling_points_y) /
+                                 sizeof(pars->scaling_points_y[0]));
+  const int max_cb_points = (int)(sizeof(pars->scaling_points_cb) /
+                                  sizeof(pars->scaling_points_cb[0]));
+  const int max_cr_points = (int)(sizeof(pars->scaling_points_cr) /
+                                  sizeof(pars->scaling_points_cr[0]));
+
+  // fscanf() returns EOF (a non-zero value) on input failure, so compare
+  // against the expected conversion count instead of testing for zero.
+  if (1 != fscanf(file, "\tsY %d ", &pars->num_y_points)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read num y points");
+    return;
+  }
+  if (pars->num_y_points < 0 || pars->num_y_points > max_y_points) {
+    const int bad_count = pars->num_y_points;
+    pars->num_y_points = 0;
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Invalid num y points %d (expected 0..%d)", bad_count,
+                       max_y_points);
+    return;
+  }
+  for (int i = 0; i < pars->num_y_points; ++i) {
+    if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
+                    &pars->scaling_points_y[i][1])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read y scaling points");
+      return;
+    }
+  }
+
+  if (1 != fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read num cb points");
+    return;
+  }
+  if (pars->num_cb_points < 0 || pars->num_cb_points > max_cb_points) {
+    const int bad_count = pars->num_cb_points;
+    pars->num_cb_points = 0;
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Invalid num cb points %d (expected 0..%d)", bad_count,
+                       max_cb_points);
+    return;
+  }
+  for (int i = 0; i < pars->num_cb_points; ++i) {
+    if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
+                    &pars->scaling_points_cb[i][1])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read cb scaling points");
+      return;
+    }
+  }
+
+  if (1 != fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read num cr points");
+    return;
+  }
+  if (pars->num_cr_points < 0 || pars->num_cr_points > max_cr_points) {
+    const int bad_count = pars->num_cr_points;
+    pars->num_cr_points = 0;
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Invalid num cr points %d (expected 0..%d)", bad_count,
+                       max_cr_points);
+    return;
+  }
+  for (int i = 0; i < pars->num_cr_points; ++i) {
+    if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
+                    &pars->scaling_points_cr[i][1])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read cr scaling points");
+      return;
+    }
+  }
+
+  // These header formats contain no conversions, so fscanf() returns 0 unless
+  // it hits end-of-file first (EOF is non-zero).
+  if (fscanf(file, "\n\tcY")) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read Y coeffs header (cY)");
+    return;
+  }
+  // n luma coefficients; the chroma lines below carry n + 1 each.
+  const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+  for (int i = 0; i < n; ++i) {
+    if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Y coeffs");
+      return;
+    }
+  }
+  if (fscanf(file, "\n\tcCb")) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read Cb coeffs header (cCb)");
+    return;
+  }
+  for (int i = 0; i <= n; ++i) {
+    if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Cb coeffs");
+      return;
+    }
+  }
+  if (fscanf(file, "\n\tcCr")) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read Cr coeffs header (cCr)");
+    return;
+  }
+  for (int i = 0; i <= n; ++i) {
+    if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Cr coeffs");
+      return;
+    }
+  }
+  // Skip the whitespace that ends the entry.
+  (void)fscanf(file, "\n");
+}
+
+// Writes one table entry to `file` as text: the "E ..." header line and, when
+// update_parameters is non-zero, the "p", "sY", "sCb", "sCr", "cY", "cCb" and
+// "cCr" lines holding the full parameter set.
+static void grain_table_entry_write(FILE *file,
+                                    aom_film_grain_table_entry_t *entry) {
+  const aom_film_grain_t *const grain = &entry->params;
+
+  fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
+          entry->end_time, grain->apply_grain, grain->random_seed,
+          grain->update_parameters);
+  // Entries that do not update the parameters consist of the header only.
+  if (!grain->update_parameters) return;
+
+  fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
+          grain->ar_coeff_lag, grain->ar_coeff_shift, grain->grain_scale_shift,
+          grain->scaling_shift, grain->chroma_scaling_from_luma,
+          grain->overlap_flag, grain->cb_mult, grain->cb_luma_mult,
+          grain->cb_offset, grain->cr_mult, grain->cr_luma_mult,
+          grain->cr_offset);
+
+  // Scaling points, one line per plane.
+  fprintf(file, "\tsY %d ", grain->num_y_points);
+  for (int k = 0; k < grain->num_y_points; ++k) {
+    const int *const point = grain->scaling_points_y[k];
+    fprintf(file, " %d %d", point[0], point[1]);
+  }
+  fprintf(file, "\n\tsCb %d", grain->num_cb_points);
+  for (int k = 0; k < grain->num_cb_points; ++k) {
+    const int *const point = grain->scaling_points_cb[k];
+    fprintf(file, " %d %d", point[0], point[1]);
+  }
+  fprintf(file, "\n\tsCr %d", grain->num_cr_points);
+  for (int k = 0; k < grain->num_cr_points; ++k) {
+    const int *const point = grain->scaling_points_cr[k];
+    fprintf(file, " %d %d", point[0], point[1]);
+  }
+
+  // AR coefficients: num_luma values for Y, num_luma + 1 for each of Cb, Cr.
+  const int num_luma = 2 * grain->ar_coeff_lag * (grain->ar_coeff_lag + 1);
+  fprintf(file, "\n\tcY");
+  for (int k = 0; k < num_luma; ++k) {
+    fprintf(file, " %d", grain->ar_coeffs_y[k]);
+  }
+  fprintf(file, "\n\tcCb");
+  for (int k = 0; k <= num_luma; ++k) {
+    fprintf(file, " %d", grain->ar_coeffs_cb[k]);
+  }
+  fprintf(file, "\n\tcCr");
+  for (int k = 0; k <= num_luma; ++k) {
+    fprintf(file, " %d", grain->ar_coeffs_cr[k]);
+  }
+  fprintf(file, "\n");
+}
+
+// TODO(https://crbug.com/aomedia/3228): Update this function to return an
+// integer status.
+// Appends the mapping [time_stamp, end_time) -> *grain to the table.
+// If the parameters are byte-identical to those of the current tail entry,
+// the tail's time range is widened instead of adding a new entry. If a new
+// entry is needed and its allocation fails, the table is left unchanged.
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+                                 int64_t end_time,
+                                 const aom_film_grain_t *grain) {
+  aom_film_grain_table_entry_t *const last = t->tail;
+
+  // Same parameters as the tail: just extend the tail's time range.
+  if (last && memcmp(grain, &last->params, sizeof(*grain)) == 0) {
+    last->end_time = AOMMAX(last->end_time, end_time);
+    last->start_time = AOMMIN(last->start_time, time_stamp);
+    return;
+  }
+
+  aom_film_grain_table_entry_t *fresh = aom_malloc(sizeof(*fresh));
+  if (!fresh) return;
+  memset(fresh, 0, sizeof(*fresh));
+
+  // Link the new entry at the end of the list.
+  if (last) last->next = fresh;
+  if (!t->head) t->head = fresh;
+  t->tail = fresh;
+
+  fresh->start_time = time_stamp;
+  fresh->end_time = end_time;
+  fresh->params = *grain;
+}
+
+// Looks up the first entry whose [start_time, end_time) range contains
+// time_stamp.
+//
+// grain: may be NULL. Otherwise it is zeroed on entry and, on a hit, receives
+//        a copy of the entry's parameters; unless time_stamp is 0, the
+//        random_seed the caller passed in is then restored.
+// erase: when non-zero, the range [time_stamp, end_time) is also removed from
+//        the table, which may delete, trim or split the matching entry.
+//
+// Returns 1 if a matching entry was found, 0 if none was found or if the
+// allocation needed to split an entry failed.
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain) {
+ aom_film_grain_table_entry_t *entry = t->head;
+ aom_film_grain_table_entry_t *prev_entry = NULL;
+ // Remember the caller's seed before the output struct is cleared.
+ uint16_t random_seed = grain ? grain->random_seed : 0;
+ if (grain) memset(grain, 0, sizeof(*grain));
+
+ while (entry) {
+ aom_film_grain_table_entry_t *next = entry->next;
+ if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+ if (grain) {
+ *grain = entry->params;
+ if (time_stamp != 0) grain->random_seed = random_seed;
+ }
+ if (!erase) return 1;
+
+ // Saved now because `entry` may be freed or have its end_time changed
+ // below.
+ const int64_t entry_end_time = entry->end_time;
+ if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+ // The erased range covers the whole entry: unlink and free it.
+ if (t->tail == entry) t->tail = prev_entry;
+ if (prev_entry) {
+ prev_entry->next = entry->next;
+ } else {
+ t->head = entry->next;
+ }
+ aom_free(entry);
+ } else if (time_stamp <= entry->start_time &&
+ end_time < entry->end_time) {
+ // The erased range covers the start of the entry: trim the front.
+ entry->start_time = end_time;
+ } else if (time_stamp > entry->start_time &&
+ end_time >= entry->end_time) {
+ // The erased range covers the end of the entry: trim the back.
+ entry->end_time = time_stamp;
+ } else {
+ // The erased range lies strictly inside the entry: split it in two,
+ // with a new entry holding the part after end_time.
+ aom_film_grain_table_entry_t *new_entry =
+ aom_malloc(sizeof(*new_entry));
+ if (!new_entry) return 0;
+ new_entry->next = entry->next;
+ new_entry->start_time = end_time;
+ new_entry->end_time = entry->end_time;
+ new_entry->params = entry->params;
+ entry->next = new_entry;
+ entry->end_time = time_stamp;
+ if (t->tail == entry) t->tail = new_entry;
+ }
+ // If segments aren't aligned, delete from the beginning of subsequent
+ // segments
+ if (end_time > entry_end_time) {
+ // Ignoring the return value here is safe since we're erasing from the
+ // beginning of subsequent entries.
+ aom_film_grain_table_lookup(t, entry_end_time, end_time, /*erase=*/1,
+ NULL);
+ }
+ return 1;
+ }
+ prev_entry = entry;
+ entry = next;
+ }
+ return 0;
+}
+
+// Reads a grain table from the text file `filename` and appends its entries
+// to `t`.
+//
+// `t` must be initialized (zeroed for an empty table). Entries are linked
+// after the table's current tail, so reading into a table that already holds
+// entries keeps those entries reachable from t->head.
+//
+// Returns error_info->error_code: AOM_CODEC_OK once the file was opened and
+// parsed without a reported error, otherwise the code recorded by
+// aom_internal_error(). An entry whose parsing failed is still linked into
+// the table, so the caller should release the table with
+// aom_film_grain_table_free() on failure.
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  FILE *file = fopen(filename, "rb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+                       filename);
+    return error_info->error_code;
+  }
+  error_info->error_code = AOM_CODEC_OK;
+
+  // Read in one extra character as there should be white space after
+  // the header.
+  char magic[9];
+  if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read (or invalid) file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+
+  // Continue from the existing tail (NULL for an empty table) so that new
+  // entries are chained onto the list instead of replacing t->tail while
+  // leaving the old tail's `next` pointer unset.
+  aom_film_grain_table_entry_t *prev_entry = t->tail;
+  while (!feof(file)) {
+    aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+    if (!entry) {
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                         "Unable to allocate grain table entry");
+      break;
+    }
+    memset(entry, 0, sizeof(*entry));
+    grain_table_entry_read(file, error_info, entry);
+    entry->next = NULL;
+
+    if (prev_entry) prev_entry->next = entry;
+    if (!t->head) t->head = entry;
+    t->tail = entry;
+    prev_entry = entry;
+
+    if (error_info->error_code != AOM_CODEC_OK) break;
+  }
+
+  fclose(file);
+  return error_info->error_code;
+}
+
+// Writes the grain table `t` to the text file `filename`: the file magic, a
+// newline, then every entry in list order.
+//
+// Returns error_info->error_code: AOM_CODEC_OK on success, or the code
+// recorded by aom_internal_error() when the file cannot be opened or when
+// writing or closing it fails.
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  error_info->error_code = AOM_CODEC_OK;
+
+  FILE *file = fopen(filename, "wb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
+                       filename);
+    return error_info->error_code;
+  }
+
+  if (!fwrite(kFileMagic, 8, 1, file)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to write file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+
+  fprintf(file, "\n");
+  aom_film_grain_table_entry_t *entry = t->head;
+  while (entry) {
+    grain_table_entry_write(file, entry);
+    entry = entry->next;
+  }
+
+  // Output is buffered, so a failure (e.g. a full disk) may only show up in
+  // the stream's error indicator or when fclose() flushes the last data.
+  // Check both, and close the file before reporting the error.
+  const int write_failed = ferror(file);
+  const int close_failed = fclose(file);
+  if (write_failed || close_failed) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to write file %s",
+                       filename);
+  }
+  return error_info->error_code;
+}
+
+// Releases every entry of the table and resets the table structure itself to
+// all zeroes (head and tail become NULL).
+void aom_film_grain_table_free(aom_film_grain_table_t *t) {
+  aom_film_grain_table_entry_t *cur = t->head;
+  while (cur != NULL) {
+    // Fetch the successor before the node is freed.
+    aom_film_grain_table_entry_t *const following = cur->next;
+    aom_free(cur);
+    cur = following;
+  }
+  memset(t, 0, sizeof(*t));
+}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
new file mode 100644
index 0000000000..49e84980ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief A table mapping from time to corresponding film grain parameters.
+ *
+ * In order to apply grain synthesis in the decoder, the film grain parameters
+ * need to be signalled in the encoder. The film grain parameters are time
+ * varying, and for two-pass encoding (and denoiser implementation flexibility)
+ * it is common to denoise the video and do parameter estimation before encoding
+ * the denoised video.
+ *
+ * The film grain table is used to provide this flexibility and is used as a
+ * parameter that is passed to the encoder.
+ *
+ * Further, if regraining is to be done in say a single pass mode, or in two
+ * pass within the encoder (before frames are added to the lookahead buffer),
+ * this data structure can be used to keep track of on-the-fly estimated grain
+ * parameters, that are then extracted from the table before the encoded frame
+ * is written.
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_params.h"
+#include "aom/internal/aom_codec_internal.h"
+
+// One node of the grain table's singly linked list: a set of film grain
+// parameters and the time range [start_time, end_time) it applies to.
+typedef struct aom_film_grain_table_entry_t {
+ aom_film_grain_t params;
+ int64_t start_time;
+ int64_t end_time;
+ struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+// The grain table: first and last node of the entry list. Both pointers are
+// NULL for an empty table.
+typedef struct {
+ aom_film_grain_table_entry_t *head;
+ aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in,out] table The grain table
+ * \param[in] time_stamp The start time stamp
+ * \param[in] end_time The end time stamp
+ * \param[in] grain The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+ int64_t time_stamp, int64_t end_time,
+ const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given time
+ *
+ * \param[in,out] t The grain table (only modified when erase is non-zero)
+ * \param[in] time_stamp The start time stamp
+ * \param[in] end_time The end time stamp
+ * \param[in] erase Whether the time segment can be deleted
+ * \param[out] grain The output grain parameters (may be NULL)
+ * \return Returns 1 if an entry containing time_stamp was found, 0 otherwise
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out] table The grain table
+ * \param[in] filename The file to read from
+ * \param[in,out] error_info Error info for tracking errors
+ * \return Returns the error code recorded in error_info (AOM_CODEC_OK when
+ * no error was reported)
+ */
+aom_codec_err_t aom_film_grain_table_read(
+ aom_film_grain_table_t *table, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in] t The grain table
+ * \param[in] filename The file to write to
+ * \param[in,out] error_info Error info for tracking errors
+ * \return Returns the error code recorded in error_info (AOM_CODEC_OK when
+ * no error was reported)
+ */
+aom_codec_err_t aom_film_grain_table_write(
+ const aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+/*!\brief Frees all entries of the grain table and resets the table to empty.
+ *
+ * \param[in,out] t The grain table
+ */
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
new file mode 100644
index 0000000000..6ec091f5f3
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {  // Vertical predictor: every row of the bw x bh block is a copy of above[0..bw-1].
+ int r;
+ (void)left;  // The left column is not used by this mode.
+
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw);
+ dst += stride;  // stride is the distance between output rows, in elements of dst.
+ }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {  // Horizontal predictor: row r of the bw x bh block is filled with left[r].
+ int r;
+ (void)above;  // The above row is not used by this mode.
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, left[r], bw);
+ dst += stride;
+ }
+}
+
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }  // Returns |a - b|.
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+ uint16_t top_left) {  // Paeth selection for one pixel; takes uint16_t so the 8-bit and high-bitdepth predictors can share it.
+ const int base = top + left - top_left;  // Gradient estimate; can be negative or exceed the inputs, hence int.
+ const int p_left = abs_diff(base, left);
+ const int p_top = abs_diff(base, top);
+ const int p_top_left = abs_diff(base, top_left);
+
+ // Return nearest to base of left, top and top_left. Ties prefer left, then top.
+ return (p_left <= p_top && p_left <= p_top_left) ? left
+ : (p_top <= p_top_left) ? top
+ : top_left;
+}
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Paeth predictor: pixel (r, c) is whichever of left[r], above[c], above[-1] is nearest to left[r] + above[c] - above[-1].
+ int r, c;
+ const uint8_t ytop_left = above[-1];  // Top-left neighbor; the caller must make above[-1] readable.
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+// Some basic checks on weights for smooth predictor. Expands to several assert() statements and reads `bw` and `bh` from the invoking function's scope.
+#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
+ pred_scale) \
+ assert(weights_w[0] < weights_scale); \
+ assert(weights_h[0] < weights_scale); \
+ assert(weights_scale - weights_w[bw - 1] < weights_scale); \
+ assert(weights_scale - weights_h[bh - 1] < weights_scale); \
+ assert(pred_scale < 31) // ensures no overflow when calculating predictor. NOTE(review): callers pass log2_scale + sizeof(*dst), a byte count rather than a bit count -- confirm the intended bound.
+
+#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))  // value / 2^bits with half of the divisor added before the shift; bits must be >= 1.
+
+static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Smooth predictor: pixel (r, c) is a weighted blend of above[c], left[r], left[bh - 1] and above[bw - 1].
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = smooth_weights + bw - 4;  // The sub-table for block dimension n starts at offset n - 4.
+ const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+ // The four weights of a pixel sum to 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE, hence the extra bit in log2_scale.
+ const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };  // The above/below pair uses the row weight; the left/right pair uses the column weight.
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);  // Normalize by the weight sum with rounding.
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Vertical-only smooth predictor: pixel (r, c) blends above[c] with left[bh - 1] using the weight of row r.
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = smooth_weights + bh - 4;  // Weights are selected by block height.
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));  // The same table is passed for both weight arguments; the macro indexes it with bw - 1 and bh - 1.
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };  // The two weights sum to scale.
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Horizontal-only smooth predictor: pixel (r, c) blends left[r] with above[bw - 1] using the weight of column c.
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = smooth_weights + bw - 4;  // Weights are selected by block width.
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));  // The same table is passed for both weight arguments; the macro indexes it with bw - 1 and bh - 1.
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };  // The two weights sum to scale.
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Fills the bw x bh block with the constant 128; neither neighbor array is read.
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, 128, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Fills the block with the rounded mean of left[0..bh-1]; above is not read.
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;  // Adding bh / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {  // Fills the block with the rounded mean of above[0..bw-1]; left is not read.
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;  // Adding bw / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {  // Fills the block with the rounded mean of the bw above pixels and the bh left pixels.
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;  // Number of neighbor pixels averaged.
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;  // Adding count / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {  // Computes ((num >> shift1) * multiplier) >> shift2, a division by a constant without a divide instruction.
+ const int interm = num >> shift1;  // Remove the power-of-two factor of the divisor first.
+ return interm * multiplier >> shift2;  // '*' binds tighter than '>>', so the product is shifted.
+}
+
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum_w_h = block width + block height.
+// - Shift 'sum_w_h' right until we reach an odd number. Let the number of
+// shifts for that block size be called 'shift1' (see the parameter in
+// dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
+// possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
+// block].
+// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
+// using the "Algorithm 1" in:
+// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+// shift will be 16, regardless of the block size.
+
+// Note: For low bitdepth, assembly code may be optimized by using smaller
+// constants for smaller block sizes, where the range of the 'sum' is
+// restricted to fewer bits.
+
+#define DC_MULTIPLIER_1X2 0x5556  // ceil(2^16 / 3); used when bw + bh == 3 << shift1.
+#define DC_MULTIPLIER_1X4 0x3334  // ceil(2^16 / 5); used when bw + bh == 5 << shift1.
+
+#define DC_SHIFT2 16  // Second shift paired with the two multipliers above.
+
+static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left, int shift1,
+ int multiplier) {  // DC predictor for rectangular blocks: averages above and left using a shift and multiply instead of a division.
+ int sum = 0;
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);  // Half of the pixel count is added first so the result rounds to nearest.
+ assert(expected_dc < (1 << 8));  // The average must fit in an 8-bit pixel.
+
+ for (int r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);  // bw + bh = 12 = 3 << 2
+}
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);  // bw + bh = 12 = 3 << 2
+}
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);  // bw + bh = 20 = 5 << 2
+}
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);  // bw + bh = 20 = 5 << 2
+}
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);  // bw + bh = 24 = 3 << 3
+}
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);  // bw + bh = 24 = 3 << 3
+}
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);  // bw + bh = 40 = 5 << 3
+}
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);  // bw + bh = 40 = 5 << 3
+}
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);  // bw + bh = 48 = 3 << 4
+}
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);  // bw + bh = 48 = 3 << 4
+}
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);  // bw + bh = 80 = 5 << 4
+}
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);  // bw + bh = 80 = 5 << 4
+}
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);  // bw + bh = 96 = 3 << 5
+}
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);  // bw + bh = 96 = 3 << 5
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit vertical predictor: every row of the block is a copy of above[0..bw-1].
+ int r;
+ (void)left;
+ (void)bd;  // The bit depth does not affect a plain copy.
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw * sizeof(uint16_t));  // The copy size is in bytes, hence the sizeof factor.
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit horizontal predictor: row r of the block is filled with left[r].
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, left[r], bw);  // Used where the 8-bit h_predictor() calls memset().
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit Paeth predictor; selection is done per pixel by paeth_predictor_single().
+ int r, c;
+ const uint16_t ytop_left = above[-1];  // Top-left neighbor; the caller must make above[-1] readable.
+ (void)bd;
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit smooth predictor: pixel (r, c) is a weighted blend of above[c], left[r], left[bh - 1] and above[bw - 1].
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = smooth_weights + bw - 4;  // The sub-table for block dimension n starts at offset n - 4.
+ const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+ // The four weights of a pixel sum to 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE, hence the extra bit in log2_scale.
+ const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };  // The above/below pair uses the row weight; the left/right pair uses the column weight.
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);  // Normalize by the weight sum with rounding.
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit vertical-only smooth predictor: blends above[c] with left[bh - 1] using the weight of row r.
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = smooth_weights + bh - 4;  // Weights are selected by block height.
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));  // The same table is passed for both weight arguments; the macro indexes it with bw - 1 and bh - 1.
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };  // The two weights sum to scale.
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // 16-bit horizontal-only smooth predictor: blends left[r] with above[bw - 1] using the weight of column c.
+ (void)bd;
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = smooth_weights + bw - 4;  // Weights are selected by block width.
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));  // The same table is passed for both weight arguments; the macro indexes it with bw - 1 and bh - 1.
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };  // The two weights sum to scale.
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // Fills the block with a constant derived from the bit depth; neither neighbor array is read.
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, 128 << (bd - 8), bw);  // 128 << (bd - 8) == 2^(bd - 1); the shift requires bd >= 8.
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // Fills the block with the rounded mean of left[0..bh-1]; above and bd are not read.
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;  // Adding bh / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {  // Fills the block with the rounded mean of above[0..bw-1]; left and bd are not read.
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;  // Adding bw / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {  // Fills the block with the rounded mean of the bw above pixels and the bh left pixels.
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;  // Number of neighbor pixels averaged.
+ (void)bd;
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;  // Adding count / 2 first makes the division round to nearest.
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
+// assume 2nd shift of 17 bits instead of 16.
+// Note: Strictly speaking, 2nd shift needs to be 17 only when:
+// - bit depth == 12, and
+// - bw + bh is divisible by 5 (as opposed to divisible by 3).
+// All other cases can use half the multipliers with a shift of 16 instead.
+// This special optimization can be used when writing assembly code.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB  // ceil(2^17 / 3)
+// Note: This constant is odd, but a smaller even constant (0x199a) with the
+// appropriate shift should work for neon in 8/10-bit.
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667  // ceil(2^17 / 5)
+
+#define HIGHBD_DC_SHIFT2 17
+
+static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd,
+ int shift1, uint32_t multiplier) {  // 16-bit DC predictor for rectangular blocks: averages above and left using a shift and multiply instead of a division.
+ int sum = 0;
+ (void)bd;  // bd is otherwise read only by the assert() below.
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);  // multiplier is converted from uint32_t to the helper's int parameter.
+ assert(expected_dc < (1 << bd));  // The average must fit in a bd-bit pixel.
+
+ for (int r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 12 = 3 << 2
+}
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 12 = 3 << 2
+}
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 20 = 5 << 2
+}
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 20 = 5 << 2
+}
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 24 = 3 << 3
+}
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 24 = 3 << 3
+}
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 40 = 5 << 3
+}
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 40 = 5 << 3
+}
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 48 = 3 << 4
+}
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 48 = 3 << 4
+}
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 80 = 5 << 4
+}
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);  // bw + bh = 80 = 5 << 4
+}
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 96 = 3 << 5
+}
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);  // bw + bh = 96 = 3 << 5
+}
+
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// This serves as a wrapper function, so that all the prediction functions
+// can be unified and accessed as a pointer array. Note that the boundary
+// above and left are not necessarily used all the time.
+#define intra_pred_sized(type, width, height) \
+ void aom_##type##_predictor_##width##x##height##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, width, height, above, left); \
+ }  // Defines aom_<type>_predictor_<width>x<height>_c(), forwarding to <type>_predictor() with the size as constants.
+
+#define intra_pred_highbd_sized(type, width, height) \
+ void aom_highbd_##type##_predictor_##width##x##height##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
+ }  // Defines aom_highbd_<type>_predictor_<width>x<height>_c(), forwarding to highbd_<type>_predictor() with the size as constants.
+
+/* clang-format off */
+#define intra_pred_rectangular(type) \
+ intra_pred_sized(type, 4, 8) \
+ intra_pred_sized(type, 8, 4) \
+ intra_pred_sized(type, 8, 16) \
+ intra_pred_sized(type, 16, 8) \
+ intra_pred_sized(type, 16, 32) \
+ intra_pred_sized(type, 32, 16) \
+ intra_pred_sized(type, 32, 64) \
+ intra_pred_sized(type, 64, 32) \
+ intra_pred_sized(type, 4, 16) \
+ intra_pred_sized(type, 16, 4) \
+ intra_pred_sized(type, 8, 32) \
+ intra_pred_sized(type, 32, 8) \
+ intra_pred_sized(type, 16, 64) \
+ intra_pred_sized(type, 64, 16) \
+ intra_pred_highbd_sized(type, 4, 8) \
+ intra_pred_highbd_sized(type, 8, 4) \
+ intra_pred_highbd_sized(type, 8, 16) \
+ intra_pred_highbd_sized(type, 16, 8) \
+ intra_pred_highbd_sized(type, 16, 32) \
+ intra_pred_highbd_sized(type, 32, 16) \
+ intra_pred_highbd_sized(type, 32, 64) \
+ intra_pred_highbd_sized(type, 64, 32) \
+ intra_pred_highbd_sized(type, 4, 16) \
+ intra_pred_highbd_sized(type, 16, 4) \
+ intra_pred_highbd_sized(type, 8, 32) \
+ intra_pred_highbd_sized(type, 32, 8) \
+ intra_pred_highbd_sized(type, 16, 64) \
+ intra_pred_highbd_sized(type, 64, 16)  // The fourteen 1:2 and 1:4 rectangular sizes, each in an 8-bit and a high-bitdepth variant.
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64) \
+ intra_pred_rectangular(type)  // Every size except the 8-bit 4x4; note that the high-bitdepth 4x4 is included here.
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_above_4x4(type)  // Adds the 8-bit 4x4 to intra_pred_above_4x4, giving every square and rectangular size in both variants.
+#define intra_pred_square(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64)  // Square sizes only, in both the 8-bit and high-bitdepth variants.
+
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(smooth)
+intra_pred_allsizes(smooth_v)
+intra_pred_allsizes(smooth_h)
+intra_pred_allsizes(paeth)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_square(dc)  // Square only: the rectangular dc predictors are written out by hand earlier in this file on top of the *_dc_predictor_rect() helpers.
+/* clang-format on */
+#undef intra_pred_allsizes  // Only this helper macro is undefined here; the other intra_pred_* macros stay defined.
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
new file mode 100644
index 0000000000..6172224be1
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
+#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
+
+#include "config/aom_config.h"
+
+// Weights are quadratic from '1' to '1 / block_size', scaled by
+// 2^SMOOTH_WEIGHT_LOG2_SCALE.
+#define SMOOTH_WEIGHT_LOG2_SCALE 8  // A stored weight w therefore represents w / 256.
+
+// Note these arrays are aligned to ensure NEON loads using a cast to uint32_t*
+// have sufficient alignment. Using 8 preserves the potential for an alignment
+// hint in load_weight_w8(). For that case, this could be increased to 16 to
+// allow an aligned load in x86.
+DECLARE_ALIGNED(8, static const uint8_t, smooth_weights[]) = {
+ // bs = 4 (offset 0; the sub-table for block size n starts at offset n - 4)
+ 255, 149, 85, 64,
+ // bs = 8 (offset 4)
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16 (offset 12)
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32 (offset 28)
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64 (offset 60)
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+ 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+};
+
+DECLARE_ALIGNED(8, static const uint16_t, smooth_weights_u16[]) = {
+ // block dimension = 4 (this table holds the same values as smooth_weights, stored as uint16_t)
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+ 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+};
+
+#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
new file mode 100644
index 0000000000..075f13689c
--- /dev/null
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+  return (int8_t)clamp(t, -0x80, 0x7f);  // saturate into [-128, 127]
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  // The signed range scales with bit depth: [-512, 511] for bd == 10,
+  // [-2048, 2047] for bd == 12, and [-128, 127] for 8 or any other value.
+  int half_range = 128;
+  if (bd == 10) half_range = 128 * 4;
+  if (bd == 12) half_range = 128 * 16;
+  return (int16_t)clamp(t, -half_range, half_range - 1);
+}
+#endif
+
+// should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {
+  // Any exceeded limit vetoes filtering; result is -1 (filter) or 0 (skip).
+  const int over_p = abs(p1 - p0) > limit;
+  const int over_q = abs(q1 - q0) > limit;
+  const int over_edge = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
+  return (over_p || over_q || over_edge) ? 0 : -1;
+}
+
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+                                 uint8_t q1, uint8_t q2, uint8_t q3) {
+  // Returns -1 (filter) only if every adjacent-sample step on each side is
+  // within `limit` and the cross-edge measure is within `blimit`; else 0.
+  const int steps_ok = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
+                       abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
+                       abs(q2 - q1) <= limit && abs(q3 - q2) <= limit;
+  const int edge_ok = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
+
+  if (steps_ok && edge_ok) return -1;
+  return 0;
+}
+
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                         uint8_t p2, uint8_t p1, uint8_t p0,
+                                         uint8_t q0, uint8_t q1, uint8_t q2) {
+  // 6-sample filter decision: -1 when every check passes, otherwise 0.
+  if (abs(p2 - p1) > limit) return 0;
+  if (abs(p1 - p0) > limit) return 0;
+  if (abs(q1 - q0) > limit) return 0;
+  if (abs(q2 - q1) > limit) return 0;
+  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) return 0;
+  return -1;
+}
+
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+                                       uint8_t p0, uint8_t q0, uint8_t q1,
+                                       uint8_t q2) {
+  // Flatness test: on each side, the samples must stay within `thresh` of
+  // the sample nearest the edge (p0 / q0). Returns -1 when flat, else 0.
+  const int p_flat = abs(p1 - p0) <= thresh && abs(p2 - p0) <= thresh;
+  const int q_flat = abs(q1 - q0) <= thresh && abs(q2 - q0) <= thresh;
+
+  return (p_flat && q_flat) ? -1 : 0;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+                                uint8_t q2, uint8_t q3) {
+  // -1 when p1..p3 are all within `thresh` of p0 and q1..q3 are all within
+  // `thresh` of q0; 0 as soon as any one of the six differences is larger.
+  const uint8_t near_edge[2] = { p0, q0 };
+  const uint8_t taps[2][3] = { { p1, p2, p3 }, { q1, q2, q3 } };
+  for (int side = 0; side < 2; ++side)
+    for (int k = 0; k < 3; ++k)
+      if (abs(taps[side][k] - near_edge[side]) > thresh) return 0;
+  return -1;
+}
+
+// is there high edge variance at the internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                              uint8_t q0, uint8_t q1) {
+  // -1 if either side's inner step (p1-p0 or q1-q0) exceeds `thresh`, else 0.
+  const int p_var = abs(p1 - p0) > thresh;
+  const int q_var = abs(q1 - q0) > thresh;
+  return (p_var || q_var) ? -1 : 0;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  // Narrow (4-tap) filter. Samples are re-centred to signed values by
+  // flipping the top bit (^ 0x80), adjusted with saturation, and flipped
+  // back when stored.
+  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+  const int8_t sp1 = (int8_t)(*op1 ^ 0x80);
+  const int8_t sp0 = (int8_t)(*op0 ^ 0x80);
+  const int8_t sq0 = (int8_t)(*oq0 ^ 0x80);
+  const int8_t sq1 = (int8_t)(*oq1 ^ 0x80);
+
+  // Outer-tap difference contributes only where edge variance is high.
+  const int8_t outer = signed_char_clamp(sp1 - sq1) & hev;
+
+  // Add the inner taps; `mask` zeroes the value where filtering is off.
+  const int8_t base = signed_char_clamp(outer + 3 * (sq0 - sp0)) & mask;
+
+  // The q side rounds with +4 and the p side with +3 before the >> 3.
+  const int8_t adj_q = signed_char_clamp(base + 4) >> 3;
+  const int8_t adj_p = signed_char_clamp(base + 3) >> 3;
+
+  // Half of the q-side adjustment (rounded) is applied to the outer
+  // samples, but only where edge variance is low.
+  const int8_t adj_outer = ROUND_POWER_OF_TWO(adj_q, 1) & ~hev;
+
+  // Inner pair first, then the outer pair.
+  *oq0 = (uint8_t)(signed_char_clamp(sq0 - adj_q) ^ 0x80);
+  *op0 = (uint8_t)(signed_char_clamp(sp0 + adj_p) ^ 0x80);
+  *oq1 = (uint8_t)(signed_char_clamp(sq1 - adj_outer) ^ 0x80);
+  *op1 = (uint8_t)(signed_char_clamp(sp1 + adj_outer) ^ 0x80);
+}
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+                            const uint8_t *blimit, const uint8_t *limit,
+                            const uint8_t *thresh) {
+  // Filters 4 adjacent columns across a horizontal edge. For each column,
+  // `s` addresses q0; p0 and p1 lie one and two pitches before it, q1 one
+  // pitch after it.
+  for (int col = 0; col < 4; ++col) {
+    uint8_t *const q0 = s + col;
+    uint8_t *const q1 = q0 + p;
+    uint8_t *const p0 = q0 - p;
+    uint8_t *const p1 = p0 - p;
+    const int8_t mask = filter_mask2(*limit, *blimit, *p1, *p0, *q0, *q1);
+
+    filter4(mask, *thresh, p1, p0, q0, q1);
+  }
+}
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  aom_lpf_horizontal_4_c(&s[0], p, blimit0, limit0, thresh0);  // columns 0..3
+  aom_lpf_horizontal_4_c(&s[4], p, blimit1, limit1, thresh1);  // columns 4..7
+}
+
+void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-column groups
+  // (16 columns in total).
+  for (int group = 0; group < 4; ++group)
+    aom_lpf_horizontal_4_c(s + 4 * group, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh) {
+  // Filters 4 consecutive rows across a vertical edge. Within a row, `s`
+  // addresses q0; p1 and p0 sit just before it and q1 just after it.
+  // Rows are `pitch` elements apart.
+  for (int row = 0; row < 4; ++row) {
+    uint8_t *const px = s + row * pitch;
+    const uint8_t p1 = px[-2], p0 = px[-1];
+    const uint8_t q0 = px[0], q1 = px[1];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+
+    // The filter updates the four samples in place.
+    filter4(mask, *thresh, px - 2, px - 1, px, px + 1);
+  }
+}
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  aom_lpf_vertical_4_c(&s[0], pitch, blimit0, limit0, thresh0);  // rows 0..3
+  aom_lpf_vertical_4_c(&s[4 * pitch], pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-row groups (16 rows in
+  // total); group g starts 4 * g rows below `s`.
+  for (int g = 0; g < 4; ++g)
+    aom_lpf_vertical_4_c(s + 4 * g * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
+                           uint8_t *op2, uint8_t *op1, uint8_t *op0,
+                           uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
+  // Unless both `flat` and `mask` are set, defer to the narrow filter.
+  if (!(flat && mask)) {
+    filter4(mask, thresh, op1, op0, oq0, oq1);
+    return;
+  }
+  // Read all six samples first; op1..oq1 are then overwritten in place.
+  const int p2 = *op2, p1 = *op1, p0 = *op0;
+  const int q0 = *oq0, q1 = *oq1, q2 = *oq2;
+  *op1 = ROUND_POWER_OF_TWO(3 * p2 + 2 * (p1 + p0) + q0, 3);
+  *op0 = ROUND_POWER_OF_TWO(p2 + 2 * (p1 + p0 + q0) + q1, 3);
+  *oq0 = ROUND_POWER_OF_TWO(p1 + 2 * (p0 + q0 + q1) + q2, 3);
+  *oq1 = ROUND_POWER_OF_TWO(p0 + 2 * (q0 + q1) + 3 * q2, 3);
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
+                           uint8_t *op3, uint8_t *op2, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+                           uint8_t *oq2, uint8_t *oq3) {
+  // Unless both `flat` and `mask` are set, defer to the narrow filter.
+  if (!(flat && mask)) {
+    filter4(mask, thresh, op1, op0, oq0, oq1);
+    return;
+  }
+  // Read all eight samples first; op2..oq2 are then overwritten in place.
+  const int p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const int q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
+  *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+  *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+  *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+  *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
+  *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
+}
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
+  // Filters 4 adjacent columns across a horizontal edge using three samples
+  // on each side. `s` addresses q0; consecutive rows are `p` elements apart.
+  for (int col = 0; col < 4; ++col) {
+    uint8_t *const q0 = s + col;
+    uint8_t *const q1 = q0 + p;
+    uint8_t *const q2 = q1 + p;
+    uint8_t *const p0 = q0 - p;
+    uint8_t *const p1 = p0 - p;
+    uint8_t *const p2 = p1 - p;
+
+    const int8_t mask =
+        filter_mask3_chroma(*limit, *blimit, *p2, *p1, *p0, *q0, *q1, *q2);
+    // Flatness uses a fixed threshold of 1.
+    const int8_t flat = flat_mask3_chroma(1, *p2, *p1, *p0, *q0, *q1, *q2);
+    filter6(mask, *thresh, flat, p2, p1, p0, q0, q1, q2);
+  }
+}
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  aom_lpf_horizontal_6_c(&s[0], p, blimit0, limit0, thresh0);  // columns 0..3
+  aom_lpf_horizontal_6_c(&s[4], p, blimit1, limit1, thresh1);  // columns 4..7
+}
+
+void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-column groups
+  // (16 columns in total).
+  for (int group = 0; group < 4; ++group)
+    aom_lpf_horizontal_6_c(s + 4 * group, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
+  // Filters 4 adjacent columns across a horizontal edge using four samples
+  // on each side. `s` addresses q0; consecutive rows are `p` elements apart.
+  for (int col = 0; col < 4; ++col) {
+    uint8_t *const q0 = s + col;
+    uint8_t *const q1 = q0 + p;
+    uint8_t *const q2 = q1 + p;
+    uint8_t *const q3 = q2 + p;
+    uint8_t *const p0 = q0 - p;
+    uint8_t *const p1 = p0 - p;
+    uint8_t *const p2 = p1 - p;
+    uint8_t *const p3 = p2 - p;
+    const int8_t mask = filter_mask(*limit, *blimit, *p3, *p2, *p1, *p0, *q0,
+                                    *q1, *q2, *q3);
+    const int8_t flat = flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3);
+    filter8(mask, *thresh, flat, p3, p2, p1, p0, q0, q1, q2, q3);
+  }
+}
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  aom_lpf_horizontal_8_c(&s[0], p, blimit0, limit0, thresh0);  // columns 0..3
+  aom_lpf_horizontal_8_c(&s[4], p, blimit1, limit1, thresh1);  // columns 4..7
+}
+
+void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0,
+                                 const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-column groups
+  // (16 columns in total).
+  for (int group = 0; group < 4; ++group)
+    aom_lpf_horizontal_8_c(s + 4 * group, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh) {
+  // Filters 4 consecutive rows across a vertical edge using three samples on
+  // each side; within a row, `s` addresses q0.
+  for (int row = 0; row < 4; ++row) {
+    uint8_t *const px = s + row * pitch;
+    const uint8_t p2 = px[-3], p1 = px[-2], p0 = px[-1];
+    const uint8_t q0 = px[0], q1 = px[1], q2 = px[2];
+    const int8_t mask =
+        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
+    // Flatness uses a fixed threshold of 1.
+    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+    filter6(mask, *thresh, flat, px - 3, px - 2, px - 1, px, px + 1, px + 2);
+  }
+}
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  aom_lpf_vertical_6_c(&s[0], pitch, blimit0, limit0, thresh0);  // rows 0..3
+  aom_lpf_vertical_6_c(&s[4 * pitch], pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-row groups (16 rows in
+  // total); group g starts 4 * g rows below `s`.
+  for (int g = 0; g < 4; ++g)
+    aom_lpf_vertical_6_c(s + 4 * g * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                          const uint8_t *limit, const uint8_t *thresh) {
+  // Filters 4 consecutive rows across a vertical edge using four samples on
+  // each side; within a row, `s` addresses q0.
+  for (int row = 0; row < 4; ++row) {
+    uint8_t *const px = s + row * pitch;
+    const uint8_t p3 = px[-4], p2 = px[-3], p1 = px[-2], p0 = px[-1];
+    const uint8_t q0 = px[0], q1 = px[1], q2 = px[2], q3 = px[3];
+    const int8_t mask =
+        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+    // Flatness uses a fixed threshold of 1.
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    filter8(mask, *thresh, flat, px - 4, px - 3, px - 2, px - 1, px, px + 1,
+            px + 2, px + 3);
+  }
+}
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  aom_lpf_vertical_8_c(&s[0], pitch, blimit0, limit0, thresh0);  // rows 0..3
+  aom_lpf_vertical_8_c(&s[4 * pitch], pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-row groups (16 rows in
+  // total); group g starts 4 * g rows below `s`.
+  for (int g = 0; g < 4; ++g)
+    aom_lpf_vertical_8_c(s + 4 * g * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
+                            int8_t flat2, uint8_t *op6, uint8_t *op5,
+                            uint8_t *op4, uint8_t *op3, uint8_t *op2,
+                            uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+                            uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+                            uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
+  // Unless `flat2`, `flat` and `mask` are all set, fall back to the
+  // 8-sample filter.
+  if (!(flat2 && flat && mask)) {
+    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+    return;
+  }
+
+  // Capture all 14 inputs before any in-place store.
+  const int p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
+            p0 = *op0;
+  const int q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5,
+            q6 = *oq6;
+
+  // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]. The weights in
+  // every row below sum to 16; op5..oq5 are rewritten, op6 and oq6 are only
+  // read.
+  *op5 = ROUND_POWER_OF_TWO(
+      7 * p6 + 2 * (p5 + p4) + p3 + p2 + p1 + p0 + q0, 4);
+  *op4 = ROUND_POWER_OF_TWO(
+      5 * p6 + 2 * (p5 + p4 + p3) + p2 + p1 + p0 + q0 + q1, 4);
+  *op3 = ROUND_POWER_OF_TWO(
+      4 * p6 + p5 + 2 * (p4 + p3 + p2) + p1 + p0 + q0 + q1 + q2, 4);
+  *op2 = ROUND_POWER_OF_TWO(
+      3 * p6 + p5 + p4 + 2 * (p3 + p2 + p1) + p0 + q0 + q1 + q2 + q3, 4);
+  *op1 = ROUND_POWER_OF_TWO(
+      2 * p6 + p5 + p4 + p3 + 2 * (p2 + p1 + p0) + q0 + q1 + q2 + q3 + q4, 4);
+  *op0 = ROUND_POWER_OF_TWO(
+      p6 + p5 + p4 + p3 + p2 + 2 * (p1 + p0 + q0) + q1 + q2 + q3 + q4 + q5, 4);
+  *oq0 = ROUND_POWER_OF_TWO(
+      p5 + p4 + p3 + p2 + p1 + 2 * (p0 + q0 + q1) + q2 + q3 + q4 + q5 + q6, 4);
+  *oq1 = ROUND_POWER_OF_TWO(
+      p4 + p3 + p2 + p1 + p0 + 2 * (q0 + q1 + q2) + q3 + q4 + q5 + 2 * q6, 4);
+  *oq2 = ROUND_POWER_OF_TWO(
+      p3 + p2 + p1 + p0 + q0 + 2 * (q1 + q2 + q3) + q4 + q5 + 3 * q6, 4);
+  *oq3 = ROUND_POWER_OF_TWO(
+      p2 + p1 + p0 + q0 + q1 + 2 * (q2 + q3 + q4) + q5 + 4 * q6, 4);
+  *oq4 = ROUND_POWER_OF_TWO(
+      p1 + p0 + q0 + q1 + q2 + 2 * (q3 + q4 + q5) + 5 * q6, 4);
+  *oq5 = ROUND_POWER_OF_TWO(
+      p0 + q0 + q1 + q2 + q3 + 2 * (q4 + q5) + 7 * q6, 4);
+}
+
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int count) {
+  // Applies the 14-sample filter to `4 * count` adjacent columns across a
+  // horizontal edge. For each column, `s` addresses q0; p0..p6 are found at
+  // -p..-7p and q1..q6 at +p..+6p.
+  const int num_cols = 4 * count;
+
+  for (int col = 0; col < num_cols; ++col) {
+    uint8_t *const c = s + col;
+    const uint8_t p6 = c[-7 * p], p5 = c[-6 * p], p4 = c[-5 * p];
+    const uint8_t p3 = c[-4 * p], p2 = c[-3 * p], p1 = c[-2 * p], p0 = c[-p];
+    const uint8_t q0 = c[0], q1 = c[p], q2 = c[2 * p], q3 = c[3 * p];
+    const uint8_t q4 = c[4 * p], q5 = c[5 * p], q6 = c[6 * p];
+    const int8_t mask =
+        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+    // `flat` tests p3..q3, `flat2` tests p6..p4 and q4..q6 against p0 / q0.
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+    filter14(mask, *thresh, flat, flat2, c - 7 * p, c - 6 * p, c - 5 * p,
+             c - 4 * p, c - 3 * p, c - 2 * p, c - p, c, c + p, c + 2 * p,
+             c + 3 * p, c + 4 * p, c + 5 * p, c + 6 * p);
+  }
+}
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, /*count=*/1);
+}
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  mb_lpf_horizontal_edge_w(&s[0], p, blimit0, limit0, thresh0, 1);  // 0..3
+  mb_lpf_horizontal_edge_w(&s[4], p, blimit1, limit1, thresh1, 1);  // 4..7
+}
+
+void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0) {
+  // One threshold set drives four consecutive 4-column groups
+  // (16 columns in total).
+  for (int group = 0; group < 4; ++group)
+    mb_lpf_horizontal_edge_w(s + 4 * group, p, blimit0, limit0, thresh0, 1);
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int count) {
+  // Applies the 14-sample filter to `count` consecutive rows across a
+  // vertical edge; rows are `p` apart and, in each row, `s` addresses q0.
+  for (int row = 0; row < count; ++row, s += p) {
+    const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4];
+    const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const uint8_t q4 = s[4], q5 = s[5], q6 = s[6];
+    const int8_t mask =
+        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+    // `flat` tests p3..q3, `flat2` tests p6..p4 and q4..q6 against p0 / q0.
+    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+    filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
+             s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
+  }
+}
+
+void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, /*count=*/4);
+}
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1) {
+  mb_lpf_vertical_edge_w(&s[0], pitch, blimit0, limit0, thresh0, 4);  // 0..3
+  mb_lpf_vertical_edge_w(&s[4 * pitch], pitch, blimit1, limit1, thresh1, 4);
+}
+
+void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0) {
+  // Four consecutive 4-row groups, all filtered with the same thresholds.
+  for (int g = 0; g < 4; ++g)
+    mb_lpf_vertical_edge_w(s + 4 * g * pitch, pitch, blimit0, limit0, thresh0,
+                           4);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+                                         uint16_t p1, uint16_t p0, uint16_t q0,
+                                         uint16_t q1, int bd) {
+  // Thresholds arrive at 8-bit scale; shift them up by (bd - 8) bits.
+  const int16_t limit16 = (uint16_t)limit << (bd - 8);
+  const int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  const int over_p = abs(p1 - p0) > limit16;
+  const int over_q = abs(q1 - q0) > limit16;
+  const int over_edge = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16;
+  return (over_p || over_q || over_edge) ? 0 : -1;
+}
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+                                        uint16_t p3, uint16_t p2, uint16_t p1,
+                                        uint16_t p0, uint16_t q0, uint16_t q1,
+                                        uint16_t q2, uint16_t q3, int bd) {
+  // Thresholds arrive at 8-bit scale; shift them up by (bd - 8) bits.
+  // Returns -1 (filter) when every check passes, otherwise 0.
+  const int16_t limit16 = (uint16_t)limit << (bd - 8);
+  const int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  const int steps_ok = abs(p3 - p2) <= limit16 && abs(p2 - p1) <= limit16 &&
+                       abs(p1 - p0) <= limit16 && abs(q1 - q0) <= limit16 &&
+                       abs(q2 - q1) <= limit16 && abs(q3 - q2) <= limit16;
+  const int edge_ok = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit16;
+
+  if (steps_ok && edge_ok) return -1;
+  return 0;
+}
+
+static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                                uint16_t p2, uint16_t p1,
+                                                uint16_t p0, uint16_t q0,
+                                                uint16_t q1, uint16_t q2,
+                                                int bd) {
+  // Thresholds arrive at 8-bit scale; -1 when all checks pass, else 0.
+  const int16_t limit16 = (uint16_t)limit << (bd - 8);
+  const int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  if (abs(p2 - p1) > limit16) return 0;
+  if (abs(p1 - p0) > limit16) return 0;
+  if (abs(q1 - q0) > limit16) return 0;
+  if (abs(q2 - q1) > limit16) return 0;
+  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) return 0;
+  return -1;
+}
+
+static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
+                                              uint16_t p1, uint16_t p0,
+                                              uint16_t q0, uint16_t q1,
+                                              uint16_t q2, int bd) {
+  // Flatness test with `thresh` scaled from 8-bit to `bd` bits: -1 when each
+  // side's samples stay within the threshold of p0 / q0, otherwise 0.
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  const int p_flat = abs(p1 - p0) <= thresh16 && abs(p2 - p0) <= thresh16;
+  const int q_flat = abs(q1 - q0) <= thresh16 && abs(q2 - q0) <= thresh16;
+
+  return (p_flat && q_flat) ? -1 : 0;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+                                       uint16_t p1, uint16_t p0, uint16_t q0,
+                                       uint16_t q1, uint16_t q2, uint16_t q3,
+                                       int bd) {
+  // -1 when p1..p3 are within the scaled threshold of p0 and q1..q3 within
+  // it of q0; 0 as soon as any one of the six differences is larger.
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  const uint16_t near_edge[2] = { p0, q0 };
+  const uint16_t taps[2][3] = { { p1, p2, p3 }, { q1, q2, q3 } };
+  for (int side = 0; side < 2; ++side)
+    for (int k = 0; k < 3; ++k)
+      if (abs(taps[side][k] - near_edge[side]) > thresh16) return 0;
+  return -1;
+}
+
+// Is there high edge variance internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no ?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+                                      uint16_t q0, uint16_t q1, int bd) {
+  // -1 if either inner step exceeds `thresh` scaled to `bd` bits, else 0.
+  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+  const int p_var = abs(p1 - p0) > thresh16;
+  const int q_var = abs(q1 - q0) > thresh16;
+  return (p_var || q_var) ? -1 : 0;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                  int bd) {
+  // Narrow (4-tap) filter for 16-bit samples. Values are re-centred around
+  // zero by subtracting 0x80 scaled to `bd` bits, adjusted with saturation,
+  // and offset back when stored.
+  const int centre = 0x80 << (bd - 8);
+  const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+  const int16_t sp1 = (int16_t)*op1 - centre;
+  const int16_t sp0 = (int16_t)*op0 - centre;
+  const int16_t sq0 = (int16_t)*oq0 - centre;
+  const int16_t sq1 = (int16_t)*oq1 - centre;
+
+  // Outer-tap difference contributes only where edge variance is high.
+  const int16_t outer = signed_char_clamp_high(sp1 - sq1, bd) & hev;
+
+  // Add the inner taps; `mask` zeroes the value where filtering is off.
+  const int16_t base =
+      signed_char_clamp_high(outer + 3 * (sq0 - sp0), bd) & mask;
+
+  // The q side rounds with +4 and the p side with +3 before the >> 3.
+  const int16_t adj_q = signed_char_clamp_high(base + 4, bd) >> 3;
+  const int16_t adj_p = signed_char_clamp_high(base + 3, bd) >> 3;
+
+  // Half of the q-side adjustment (rounded) is applied to the outer
+  // samples, but only where edge variance is low.
+  const int16_t adj_outer = ROUND_POWER_OF_TWO(adj_q, 1) & ~hev;
+
+  // Inner pair first, then the outer pair.
+  *oq0 = signed_char_clamp_high(sq0 - adj_q, bd) + centre;
+  *op0 = signed_char_clamp_high(sp0 + adj_p, bd) + centre;
+  *oq1 = signed_char_clamp_high(sq1 - adj_outer, bd) + centre;
+  *op1 = signed_char_clamp_high(sp1 + adj_outer, bd) + centre;
+}
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int bd) {
+  // Filters 4 adjacent columns of 16-bit samples across a horizontal edge.
+  // For each column, `s` addresses q0; p0 and p1 lie one and two pitches
+  // (`p` elements) before it and q1 one pitch after it.
+  for (int col = 0; col < 4; ++col) {
+    uint16_t *const q0 = s + col;
+    uint16_t *const q1 = q0 + p;
+    uint16_t *const p0 = q0 - p;
+    uint16_t *const p1 = p0 - p;
+
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, *p1, *p0, *q0, *q1, bd);
+
+    // Thresholds are passed at 8-bit scale; the helpers rescale by `bd`.
+    highbd_filter4(mask, *thresh, p1, p0, q0, q1, bd);
+  }
+}
+
+void aom_highbd_lpf_horizontal_4_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_4_c(&s[0], p, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_4_c(&s[4], p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int bd) {
+  // Filters 4 consecutive rows of 16-bit samples across a vertical edge.
+  // Within a row, `s` addresses q0; p1 and p0 sit just before it and q1
+  // just after it. Rows are `pitch` elements apart.
+  for (int row = 0; row < 4; ++row) {
+    uint16_t *const px = s + row * pitch;
+    const uint16_t p1 = px[-2], p0 = px[-1];
+    const uint16_t q0 = px[0], q1 = px[1];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+
+    // The filter updates the four samples in place.
+    highbd_filter4(mask, *thresh, px - 2, px - 1, px, px + 1, bd);
+  }
+}
+
+void aom_highbd_lpf_vertical_4_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  uint16_t *const next_rows = s + 4 * pitch;  // rows 4..7
+  aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_4_c(next_rows, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
+                                  uint16_t *op2, uint16_t *op1, uint16_t *op0,
+                                  uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+                                  int bd) {
+  // Unless both `flat` and `mask` are set, defer to the narrow filter.
+  if (!(flat && mask)) {
+    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+    return;
+  }
+  // Read all six samples first; op1..oq1 are then overwritten in place.
+  const int p2 = *op2, p1 = *op1, p0 = *op0;
+  const int q0 = *oq0, q1 = *oq1, q2 = *oq2;
+  *op1 = ROUND_POWER_OF_TWO(3 * p2 + 2 * (p1 + p0) + q0, 3);
+  *op0 = ROUND_POWER_OF_TWO(p2 + 2 * (p1 + p0 + q0) + q1, 3);
+  *oq0 = ROUND_POWER_OF_TWO(p1 + 2 * (p0 + q0 + q1) + q2, 3);
+  *oq1 = ROUND_POWER_OF_TWO(p0 + 2 * (q0 + q1) + 3 * q2, 3);
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
+                                  uint16_t *op3, uint16_t *op2, uint16_t *op1,
+                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+                                  uint16_t *oq2, uint16_t *oq3, int bd) {
+  // Unless both `flat` and `mask` are set, defer to the narrow filter.
+  if (!(flat && mask)) {
+    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+    return;
+  }
+  // Read all eight samples first; op2..oq2 are then overwritten in place.
+  const int p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const int q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  *op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
+  *op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+  *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+  *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+  *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3, 3);
+  *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + 3 * q3, 3);
+}
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int bd) {
+  // Filters 4 adjacent columns of 16-bit samples across a horizontal edge,
+  // four samples per side. `s` addresses q0; rows are `p` elements apart.
+  for (int col = 0; col < 4; ++col) {
+    uint16_t *const q0 = s + col;
+    uint16_t *const q1 = q0 + p;
+    uint16_t *const q2 = q1 + p;
+    uint16_t *const q3 = q2 + p;
+    uint16_t *const p0 = q0 - p;
+    uint16_t *const p1 = p0 - p;
+    uint16_t *const p2 = p1 - p;
+    uint16_t *const p3 = p2 - p;
+    const int8_t mask = highbd_filter_mask(*limit, *blimit, *p3, *p2, *p1,
+                                           *p0, *q0, *q1, *q2, *q3, bd);
+    const int8_t flat =
+        highbd_flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3, bd);
+    highbd_filter8(mask, *thresh, flat, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+  }
+}
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit, const uint8_t *thresh,
+                                   int bd) {
+  // Filters 4 adjacent columns of 16-bit samples across a horizontal edge,
+  // three samples per side. `s` addresses q0; rows are `p` elements apart.
+  for (int col = 0; col < 4; ++col) {
+    uint16_t *const q0 = s + col;
+    uint16_t *const q1 = q0 + p;
+    uint16_t *const q2 = q1 + p;
+    uint16_t *const p0 = q0 - p;
+    uint16_t *const p1 = p0 - p;
+    uint16_t *const p2 = p1 - p;
+
+    const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, *p2, *p1,
+                                                   *p0, *q0, *q1, *q2, bd);
+    const int8_t flat =
+        highbd_flat_mask3_chroma(1, *p2, *p1, *p0, *q0, *q1, *q2, bd);
+    highbd_filter6(mask, *thresh, flat, p2, p1, p0, q0, q1, q2, bd);
+  }
+}
+
+void aom_highbd_lpf_horizontal_6_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_6_c(&s[0], p, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_6_c(&s[4], p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_8_c(&s[0], p, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_8_c(&s[4], p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int bd) {
+  // Filters 4 consecutive rows of 16-bit samples across a vertical edge,
+  // three samples per side; within a row, `s` addresses q0.
+  for (int row = 0; row < 4; ++row) {
+    uint16_t *const px = s + row * pitch;
+    const uint16_t p2 = px[-3], p1 = px[-2], p0 = px[-1];
+    const uint16_t q0 = px[0], q1 = px[1], q2 = px[2];
+    const int8_t mask =
+        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
+    // Flatness uses a threshold of 1 at 8-bit scale.
+    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+    highbd_filter6(mask, *thresh, flat, px - 3, px - 2, px - 1, px, px + 1,
+                   px + 2, bd);
+  }
+}
+
+void aom_highbd_lpf_vertical_6_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  uint16_t *const next_rows = s + 4 * pitch;  // rows 4..7
+  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_6_c(next_rows, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh,
+                                 int bd) {
+  // Filters 4 consecutive rows of 16-bit samples across a vertical edge,
+  // four samples per side; within a row, `s` addresses q0.
+  for (int row = 0; row < 4; ++row) {
+    uint16_t *const px = s + row * pitch;
+    const uint16_t p3 = px[-4], p2 = px[-3], p1 = px[-2], p0 = px[-1];
+    const uint16_t q0 = px[0], q1 = px[1], q2 = px[2], q3 = px[3];
+    const int8_t mask =
+        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    // Flatness uses a threshold of 1 at 8-bit scale.
+    const int8_t flat =
+        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    highbd_filter8(mask, *thresh, flat, px - 4, px - 3, px - 2, px - 1, px,
+                   px + 1, px + 2, px + 3, bd);
+  }
+}
+
+void aom_highbd_lpf_vertical_8_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  uint16_t *const next_rows = s + 4 * pitch;  // rows 4..7
+  aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_8_c(next_rows, pitch, blimit1, limit1, thresh1, bd);
+}
+
+// Wide high-bitdepth edge filter over 14 samples (p6..p0 | q0..q6), each
+// passed by pointer.
+//
+// When flat2, flat and mask are all non-zero, the twelve samples op5..oq5 are
+// overwritten with ROUND_POWER_OF_TWO(weighted sum, 4); the weights of every
+// sum add up to 16. op6 and oq6 are only read, never written. In every other
+// case the call is forwarded to highbd_filter8() on the inner eight samples
+// (op3..oq3) and the outer samples are left untouched.
+//
+// `thresh` and `bd` are not used by the wide branch; they are only passed on
+// to highbd_filter8().
+static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op6, uint16_t *op5,
+ uint16_t *op4, uint16_t *op3, uint16_t *op2,
+ uint16_t *op1, uint16_t *op0, uint16_t *oq0,
+ uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
+ uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
+ int bd) {
+ if (flat2 && flat && mask) {
+ // Snapshot all inputs first: the outputs below alias the same memory, so
+ // every sum must be formed from the unfiltered values.
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+
+ // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+ // Near the ends of the window the taps that would fall outside it are
+ // folded onto p6 (or q6), which is why those samples carry weights up to 7.
+ *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+ 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+ 4);
+ } else {
+ // Not flat enough for the wide filter: fall back to the 8-sample filter.
+ highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
+
+// Wide (14-sample) horizontal-edge filter for high bitdepth. Walks 4 * count
+// adjacent columns; within a column the samples above and below the edge are
+// `p` elements apart. The mask and flat decisions use the four nearest
+// samples on each side, flat2 additionally inspects the three outermost
+// samples on each side, and highbd_filter14() receives pointers to all 14.
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh, int count,
+                                            int bd) {
+  const int num_cols = 4 * count;
+
+  for (int col = 0; col < num_cols; ++col, ++s) {
+    const uint16_t p3 = s[-4 * p];
+    const uint16_t p2 = s[-3 * p];
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const uint16_t q2 = s[2 * p];
+    const uint16_t q3 = s[3 * p];
+
+    const int8_t mask =
+        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat =
+        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat2 =
+        highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
+                          s[5 * p], s[6 * p], bd);
+
+    highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+                    s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
+  }
+}
+
+// 14-tap horizontal-edge high-bitdepth loop filter for a single 4-column
+// segment (count == 1 in highbd_mb_lpf_horizontal_edge_w()).
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  const int num_segments = 1;
+
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, num_segments,
+                                  bd);
+}
+
+// Filters two side-by-side 4-column segments of a horizontal edge: columns
+// 0-3 with the first parameter set, columns 4-7 (s + 4) with the second.
+void aom_highbd_lpf_horizontal_14_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  uint16_t *const right_segment = s + 4;
+
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+  highbd_mb_lpf_horizontal_edge_w(right_segment, p, blimit1, limit1, thresh1, 1,
+                                  bd);
+}
+
+// Wide (14-sample) vertical-edge filter for high bitdepth. Processes `count`
+// rows that are `p` elements apart; within a row the samples left and right of
+// the edge are contiguous. The mask and flat decisions use the four nearest
+// samples on each side, flat2 inspects the three outermost samples on each
+// side, and highbd_filter14() receives pointers to all 14.
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+                                          const uint8_t *blimit,
+                                          const uint8_t *limit,
+                                          const uint8_t *thresh, int count,
+                                          int bd) {
+  for (int row = 0; row < count; ++row, s += p) {
+    const uint16_t p3 = s[-4];
+    const uint16_t p2 = s[-3];
+    const uint16_t p1 = s[-2];
+    const uint16_t p0 = s[-1];
+    const uint16_t q0 = s[0];
+    const uint16_t q1 = s[1];
+    const uint16_t q2 = s[2];
+    const uint16_t q3 = s[3];
+
+    const int8_t mask =
+        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat =
+        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+    const int8_t flat2 =
+        highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
+
+    highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
+                    s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
+                    s + 6, bd);
+  }
+}
+
+// 14-tap vertical-edge high-bitdepth loop filter over 4 rows spaced `p`
+// elements apart.
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh,
+                                  int bd) {
+  const int num_rows = 4;
+
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, num_rows, bd);
+}
+
+// Filters two stacked 4-row segments of a vertical edge: rows 0-3 with the
+// first parameter set, rows 4-7 (s + 4 * pitch) with the second.
+void aom_highbd_lpf_vertical_14_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  const int rows_per_segment = 4;
+  uint16_t *const lower_segment = s + 4 * pitch;
+
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0,
+                                rows_per_segment, bd);
+  highbd_mb_lpf_vertical_edge_w(lower_segment, pitch, blimit1, limit1, thresh1,
+                                rows_per_segment, bd);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/mathutils.h b/third_party/aom/aom_dsp/mathutils.h
new file mode 100644
index 0000000000..cbb6cf491f
--- /dev/null
+++ b/third_party/aom/aom_dsp/mathutils.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MATHUTILS_H_
+#define AOM_AOM_DSP_MATHUTILS_H_
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+// Pivot / diagonal magnitudes below this value are treated as zero by
+// linsolve(), which then reports the system as unsolvable.
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+//
+// A is stored row-major with `stride` doubles between consecutive rows
+// (element (i, j) is A[i * stride + j]). The solve is done by Gaussian
+// elimination directly on the caller's buffers, so both A and b are
+// overwritten; pass copies if the originals are still needed.
+//
+// Returns 1 on success with the solution in x[0..n-1]. Returns 0 as soon as a
+// pivot or diagonal entry has magnitude below TINY_NEAR_ZERO; in that case x
+// may be only partially written.
+static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
+ int i, j, k;
+ double c;
+ // Forward elimination
+ for (k = 0; k < n - 1; k++) {
+ // Bring the largest magnitude to the diagonal position
+ // (one bottom-up pass of adjacent row swaps over rows k..n-1; b is swapped
+ // together with the rows of A).
+ for (i = n - 1; i > k; i--) {
+ if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+ for (j = 0; j < n; j++) {
+ c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+ // Eliminate column k from every row below row k.
+ for (i = k; i < n - 1; i++) {
+ if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+ c = A[(i + 1) * stride + k] / A[k * stride + k];
+ for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+ b[i + 1] -= c * b[k];
+ }
+ }
+ // Backward substitution
+ for (i = n - 1; i >= 0; i--) {
+ if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+ c = 0;
+ for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+ x[i] = (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+//
+// This process is split into three steps in order to avoid needing to
+// explicitly allocate the A matrix, which may be very large if there
+// are many equations to solve.
+//
+// The process for using this is (in pseudocode):
+//
+// Allocate mat (size n*n), y (size n), a (size n), x (size n)
+// least_squares_init(mat, y, n)
+// for each equation a . x = b {
+// least_squares_accumulate(mat, y, a, b, n)
+// }
+// least_squares_solve(mat, y, x, n)
+//
+// where:
+// * mat, y are accumulators for the values A'A and A'b respectively,
+// * a, b are the coefficients of each individual equation,
+// * x is the result vector
+// * and n is the problem size
+static INLINE void least_squares_init(double *mat, double *y, int n) {
+  // Zero both accumulators (mat: n * n doubles, y: n doubles) before any
+  // equation is added. sizeof comes first so the byte counts are computed in
+  // size_t; the earlier n * n * sizeof(double) form multiplied n * n in int,
+  // which can overflow for large n before it is widened.
+  memset(mat, 0, sizeof(*mat) * n * n);
+  memset(y, 0, sizeof(*y) * n);
+}
+
+// Round the given non-negative value to nearest integer
+// (adds 0.5f and truncates through the int cast, so exact halves round up).
+// The x >= 0 precondition is only enforced by assert(), i.e. it is not
+// checked when NDEBUG is defined.
+static AOM_FORCE_INLINE int iroundpf(float x) {
+ assert(x >= 0.0);
+ return (int)(x + 0.5f);
+}
+
+// Folds one equation a . x = b into the normal-equation accumulators:
+// mat (n x n, row-major) gains the outer product a * a', and y gains a * b.
+static INLINE void least_squares_accumulate(double *mat, double *y,
+                                            const double *a, double b, int n) {
+  for (int row = 0; row < n; ++row) {
+    double *const mat_row = mat + row * n;
+    for (int col = 0; col < n; ++col) mat_row[col] += a[row] * a[col];
+  }
+  for (int k = 0; k < n; ++k) y[k] += a[k] * b;
+}
+
+// Solves the accumulated system mat * x = y with linsolve(), treating mat as
+// a densely packed n x n matrix (row stride n). linsolve() works in place, so
+// mat and y are overwritten. Returns linsolve()'s result (1 on success, 0 if
+// the system could not be solved).
+static INLINE int least_squares_solve(double *mat, double *y, double *x,
+                                      int n) {
+  const int row_stride = n;
+  return linsolve(n, mat, row_stride, y, x);
+}
+
+// Matrix multiply: res = m1 * m2, all row-major.
+//   m1  is m1_rows x inner_dim
+//   m2  is inner_dim x m2_cols
+//   res is m1_rows x m2_cols (must not overlap the inputs)
+static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
+                                const int m1_rows, const int inner_dim,
+                                const int m2_cols) {
+  for (int r = 0; r < m1_rows; ++r) {
+    for (int c = 0; c < m2_cols; ++c) {
+      double acc = 0;
+      for (int k = 0; k < inner_dim; ++k) {
+        acc += m1[r * inner_dim + k] * m2[k * m2_cols + c];
+      }
+      res[r * m2_cols + c] = acc;
+    }
+  }
+}
+
+// Fast approximation of exp(y) that assembles the bit pattern of the result
+// instead of evaluating a series: y is scaled by 2^23 / ln(2) so that
+// y / ln(2) lands in the exponent field of a 32-bit float, the exponent bias
+// (B << 23) is added, a correction constant C is subtracted, and the integer
+// is reinterpreted as a float through the union.
+// NOTE(review): there is no range check on y; presumably callers keep it
+// small enough that (int32_t)(y * A) plus the offset does not overflow -
+// confirm at call sites.
+static AOM_INLINE float approx_exp(float y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+#define B \
+ 127 // Offset for the exponent according to IEEE floating point standard.
+#define C 60801 // Magic number controls the accuracy of approximation
+ union {
+ float as_float;
+ int32_t as_int32;
+ } container;
+ container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C);
+ return container.as_float;
+// The helper macros are local to this function; remove them again so the
+// single-letter names do not leak into files that include this header.
+#undef A
+#undef B
+#undef C
+}
+#endif // AOM_AOM_DSP_MATHUTILS_H_
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
new file mode 100644
index 0000000000..065ec9a106
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -0,0 +1,1692 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+
+// Number of parameters of the planar model fitted to each block by the flat
+// block finder: one coefficient each for y, x and the constant term.
+#define kLowPolyNumParams 3
+
+// Largest value of params.lag accepted by aom_noise_model_init().
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+//
+// The generated get_block_mean_<suffix>() averages the samples of the
+// block_size x block_size block whose top-left corner is (x_o, y_o), clipped
+// to the w x h image, so a block overhanging the right or bottom border is
+// averaged over its in-bounds samples only. `stride` is the distance between
+// rows in elements of INT_TYPE.
+// NOTE(review): if (x_o, y_o) lies on or past the right/bottom border no
+// sample is summed and the division by max_w * max_h is not meaningful
+// (0 / 0 when the clipped area is zero) - callers presumably never do this.
+#define GET_BLOCK_MEAN(INT_TYPE, suffix) \
+ static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+ int stride, int x_o, int y_o, \
+ int block_size) { \
+ const int max_h = AOMMIN(h - y_o, block_size); \
+ const int max_w = AOMMIN(w - x_o, block_size); \
+ double block_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ block_mean += data[(y_o + y) * stride + x_o + x]; \
+ } \
+ } \
+ return block_mean / (max_w * max_h); \
+ }
+
+// Instantiations: get_block_mean_lowbd() for 8-bit samples and
+// get_block_mean_highbd() for 16-bit samples.
+GET_BLOCK_MEAN(uint8_t, lowbd)
+GET_BLOCK_MEAN(uint16_t, highbd)
+
+// Dispatches to the 8-bit or 16-bit block-mean routine. When use_highbd is
+// non-zero, `data` is reinterpreted as an array of uint16_t samples.
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+                                    int stride, int x_o, int y_o,
+                                    int block_size, int use_highbd) {
+  if (!use_highbd) {
+    return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+  }
+  const uint16_t *const data16 = (const uint16_t *)data;
+  return get_block_mean_highbd(data16, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+//
+// The generated get_noise_var_<suffix>() treats (data - denoised) as the
+// noise signal and returns E[noise^2] - E[noise]^2 over the
+// block_size_x x block_size_y block at (x_o, y_o), clipped to the w x h
+// image. Both planes are indexed with the same `stride` (in elements of
+// INT_TYPE).
+// Note the parameter order: `stride` comes BEFORE `w` and `h` here, unlike
+// get_block_mean_<suffix>().
+#define GET_NOISE_VAR(INT_TYPE, suffix) \
+ static double get_noise_var_##suffix( \
+ const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+ int h, int x_o, int y_o, int block_size_x, int block_size_y) { \
+ const int max_h = AOMMIN(h - y_o, block_size_y); \
+ const int max_w = AOMMIN(w - x_o, block_size_x); \
+ double noise_var = 0; \
+ double noise_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ double noise = (double)data[(y_o + y) * stride + x_o + x] - \
+ denoised[(y_o + y) * stride + x_o + x]; \
+ noise_mean += noise; \
+ noise_var += noise * noise; \
+ } \
+ } \
+ noise_mean /= (max_w * max_h); \
+ return noise_var / (max_w * max_h) - noise_mean * noise_mean; \
+ }
+
+// Instantiations: get_noise_var_lowbd() for 8-bit samples and
+// get_noise_var_highbd() for 16-bit samples.
+GET_NOISE_VAR(uint8_t, lowbd)
+GET_NOISE_VAR(uint16_t, highbd)
+
+// Dispatches to the 8-bit or 16-bit noise-variance routine. When use_highbd
+// is non-zero, `data` and `denoised` are reinterpreted as uint16_t arrays.
+//
+// NOTE(review): the third to fifth arguments are forwarded positionally. They
+// are named (w, h, stride) here, but get_noise_var_lowbd/highbd declare them
+// as (stride, w, h). The code is only correct if callers actually pass
+// (stride, w, h) in those positions - confirm at the call sites and rename
+// the parameters accordingly.
+static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
+                                   int w, int h, int stride, int x_o, int y_o,
+                                   int block_size_x, int block_size_y,
+                                   int use_highbd) {
+  if (!use_highbd) {
+    return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
+                               block_size_x, block_size_y);
+  }
+  const uint16_t *const data16 = (const uint16_t *)data;
+  const uint16_t *const denoised16 = (const uint16_t *)denoised;
+  return get_noise_var_highbd(data16, denoised16, w, h, stride, x_o, y_o,
+                              block_size_x, block_size_y);
+}
+
+// Zeroes the matrix A (n x n) and the vectors b and x (n entries each) of an
+// already allocated system. Nothing is freed or resized.
+static void equation_system_clear(aom_equation_system_t *eqns) {
+  const int dim = eqns->n;
+
+  memset(eqns->A, 0, sizeof(*eqns->A) * dim * dim);
+  memset(eqns->b, 0, sizeof(*eqns->b) * dim);
+  memset(eqns->x, 0, sizeof(*eqns->x) * dim);
+}
+
+// Copies A, b and x from src into the buffers dst already owns. The element
+// counts are taken from dst->n.
+// NOTE(review): src is presumably the same size as dst - confirm at callers.
+static void equation_system_copy(aom_equation_system_t *dst,
+                                 const aom_equation_system_t *src) {
+  const int dim = dst->n;
+
+  memcpy(dst->A, src->A, sizeof(*dst->A) * dim * dim);
+  memcpy(dst->b, src->b, sizeof(*dst->b) * dim);
+  memcpy(dst->x, src->x, sizeof(*dst->x) * dim);
+}
+
+// Allocates and zeroes an n x n system (A, b, x) and records n.
+// Returns 1 on success. On allocation failure a message is printed, any
+// partial allocations are released, *eqns is zeroed and 0 is returned.
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+  double *const mat = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+  double *const rhs = (double *)aom_malloc(sizeof(*eqns->b) * n);
+  double *const sol = (double *)aom_malloc(sizeof(*eqns->x) * n);
+
+  if (mat == NULL || rhs == NULL || sol == NULL) {
+    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+    aom_free(mat);
+    aom_free(rhs);
+    aom_free(sol);
+    memset(eqns, 0, sizeof(*eqns));
+    return 0;
+  }
+
+  eqns->A = mat;
+  eqns->b = rhs;
+  eqns->x = sol;
+  eqns->n = n;
+  equation_system_clear(eqns);
+  return 1;
+}
+
+// Solves A x = b for eqns->x without disturbing eqns->A or eqns->b: linsolve()
+// works in place, so it is run on temporary copies.
+// Returns 1 on success, 0 if the copies cannot be allocated or linsolve()
+// reports failure.
+static int equation_system_solve(aom_equation_system_t *eqns) {
+  const int dim = eqns->n;
+  double *rhs_copy = (double *)aom_malloc(sizeof(*rhs_copy) * dim);
+  double *mat_copy = (double *)aom_malloc(sizeof(*mat_copy) * dim * dim);
+  int solved = 0;
+
+  if (mat_copy != NULL && rhs_copy != NULL) {
+    memcpy(mat_copy, eqns->A, sizeof(*eqns->A) * dim * dim);
+    memcpy(rhs_copy, eqns->b, sizeof(*eqns->b) * dim);
+    solved = linsolve(dim, mat_copy, eqns->n, rhs_copy, eqns->x) != 0;
+  } else {
+    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", dim, dim);
+  }
+
+  aom_free(rhs_copy);
+  aom_free(mat_copy);
+  return solved;
+}
+
+// Accumulates src into dest element-wise: dest->A += src->A and
+// dest->b += src->b. The dimension is taken from dest->n; x is not touched.
+static void equation_system_add(aom_equation_system_t *dest,
+                                aom_equation_system_t *src) {
+  const int dim = dest->n;
+
+  for (int row = 0; row < dim; ++row) {
+    const int row_start = row * dim;
+    for (int col = 0; col < dim; ++col) {
+      dest->A[row_start + col] += src->A[row_start + col];
+    }
+    dest->b[row] += src->b[row];
+  }
+}
+
+// Releases A, b and x and zeroes the whole structure. A NULL eqns is a no-op.
+static void equation_system_free(aom_equation_system_t *eqns) {
+  if (eqns != NULL) {
+    aom_free(eqns->A);
+    aom_free(eqns->b);
+    aom_free(eqns->x);
+    memset(eqns, 0, sizeof(*eqns));
+  }
+}
+
+// Resets a strength solver to the "no measurements" state: the running totals
+// go back to zero and the underlying equation system is cleared.
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+  solver->total = 0;
+  solver->num_equations = 0;
+  equation_system_clear(&solver->eqns);
+}
+
+// Merges the measurements accumulated in src into dest: the running totals
+// and the equation systems are summed.
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+                                      aom_noise_strength_solver_t *src) {
+  dest->total += src->total;
+  dest->num_equations += src->num_equations;
+  equation_system_add(&dest->eqns, &src->eqns);
+}
+
+// Return the number of coefficients required for the given parameters:
+//   diamond shape: lag * (lag + 1)
+//   square shape:  half of the (2 * lag + 1)^2 window, rounded down
+// Any other shape yields 0.
+static int num_coeffs(const aom_noise_model_params_t params) {
+  const int lag = params.lag;
+
+  if (params.shape == AOM_NOISE_SHAPE_DIAMOND) return lag * (lag + 1);
+  if (params.shape == AOM_NOISE_SHAPE_SQUARE) {
+    const int side = 2 * lag + 1;
+    return (side * side) / 2;
+  }
+  return 0;
+}
+
+// Initializes one noise state: an n-unknown equation system, unit AR gain,
+// zero observations, and a strength solver with 20 bins for the given bit
+// depth. Returns 1 on success, 0 if either allocation step fails.
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+  const int kNumBins = 20;
+
+  if (equation_system_init(&state->eqns, n)) {
+    state->ar_gain = 1.0;
+    state->num_observations = 0;
+    return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+                                          bit_depth);
+  }
+
+  fprintf(stderr, "Failed initialization noise state with size %d\n", n);
+  return 0;
+}
+
+// Fallback solution for a chroma system: every coefficient is set to zero
+// except the last one (the correlation with the luma channel), which is
+// solved from the last diagonal entry alone when that entry is large enough
+// in magnitude.
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+  const double kTolerance = 1e-6;
+  const int n = eqns->n;
+  const int last = n - 1;
+
+  memset(eqns->x, 0, sizeof(*eqns->x) * n);
+
+  const double diag = eqns->A[last * n + last];
+  if (fabs(diag) > kTolerance) {
+    eqns->x[last] = eqns->b[last] / diag;
+  }
+}
+
+// Allocates a zero-filled lookup table of num_points (x, y) pairs.
+// Returns 1 on success; returns 0 if lut is NULL, num_points is not positive,
+// or the allocation fails (lut->num_points is then left at 0).
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+  if (lut == NULL || num_points <= 0) return 0;
+
+  lut->num_points = 0;
+  lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
+  if (lut->points == NULL) return 0;
+
+  memset(lut->points, 0, sizeof(*lut->points) * num_points);
+  lut->num_points = num_points;
+  return 1;
+}
+
+// Releases the point array and zeroes the table. A NULL lut is a no-op.
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
+  if (lut != NULL) {
+    aom_free(lut->points);
+    memset(lut, 0, sizeof(*lut));
+  }
+}
+
+// Evaluates the piecewise-linear table at x. Inside the table the value is
+// interpolated linearly between the two surrounding points; outside it the
+// first or last y value is returned unchanged (constant extrapolation).
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x) {
+  const int last = lut->num_points - 1;
+
+  // Left of the first point: hold the first value.
+  if (x < lut->points[0][0]) return lut->points[0][1];
+
+  for (int k = 0; k < last; ++k) {
+    const double x_lo = lut->points[k][0];
+    const double x_hi = lut->points[k + 1][0];
+    if (!(x >= x_lo && x <= x_hi)) continue;
+
+    const double a = (x - x_lo) / (x_hi - x_lo);
+    return lut->points[k + 1][1] * a + lut->points[k][1] * (1.0 - a);
+  }
+
+  // Right of the last point: hold the last value.
+  return lut->points[last][1];
+}
+
+// Maps an intensity to a fractional bin position in [0, num_bins - 1]. The
+// input is first clamped to [min_intensity, max_intensity].
+static double noise_strength_solver_get_bin_index(
+    const aom_noise_strength_solver_t *solver, double value) {
+  const double span = solver->max_intensity - solver->min_intensity;
+  const double clamped =
+      fclamp(value, solver->min_intensity, solver->max_intensity);
+
+  return (solver->num_bins - 1) * (clamped - solver->min_intensity) / span;
+}
+
+// Reads the solved noise strength at intensity x by interpolating linearly
+// between the two bins that surround its fractional bin position.
+static double noise_strength_solver_get_value(
+    const aom_noise_strength_solver_t *solver, double x) {
+  const double pos = noise_strength_solver_get_bin_index(solver, x);
+  const int lo = (int)floor(pos);
+  const int hi = AOMMIN(solver->num_bins - 1, lo + 1);
+  const double a = pos - lo;
+  const double *const strength = solver->eqns.x;
+
+  return (1.0 - a) * strength[lo] + a * strength[hi];
+}
+
+// Adds one (block_mean, noise_std) observation to the solver. The observation
+// is split between the two bins surrounding block_mean with linear weights
+// (1 - a) and a, and the corresponding terms are accumulated into the normal
+// equations; the running total and the equation count are updated as well.
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
+  const int n = solver->num_bins;
+  const double pos = noise_strength_solver_get_bin_index(solver, block_mean);
+  const int lo = (int)floor(pos);
+  const int hi = AOMMIN(n - 1, lo + 1);
+  const double a = pos - lo;
+  double *const A = solver->eqns.A;
+  double *const b = solver->eqns.b;
+
+  A[lo * n + lo] += (1.0 - a) * (1.0 - a);
+  A[hi * n + lo] += a * (1.0 - a);
+  A[hi * n + hi] += a * a;
+  A[lo * n + hi] += a * (1.0 - a);
+  b[lo] += (1.0 - a) * noise_std;
+  b[hi] += a * noise_std;
+
+  solver->total += noise_std;
+  solver->num_equations++;
+}
+
+// Solves for the per-bin noise strengths (written to solver->eqns.x).
+//
+// Two regularizers are applied on top of the accumulated measurements: a
+// smoothness term between neighbouring bins, weighted in proportion to the
+// number of measurements, and a small term pulling every bin toward the
+// average measured strength.
+//
+// Returns 1 on success, 0 if memory cannot be allocated or the system cannot
+// be solved.
+//
+// NOTE(review): the smoothness term is applied to a copy of A, but the
+// average-strength term is added to solver->eqns.b in place, so it
+// accumulates if this function is called repeatedly on the same solver.
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
+  // Add regularization proportional to the number of constraints
+  const int n = solver->num_bins;
+  const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
+  int result = 0;
+  double mean = 0;
+
+  // Do this in a non-destructive manner so it is not confusing to the caller
+  double *old_A = solver->eqns.A;
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  if (!A) {
+    fprintf(stderr, "Unable to allocate copy of A\n");
+    return 0;
+  }
+  memcpy(A, old_A, sizeof(*A) * n * n);
+
+  for (int i = 0; i < n; ++i) {
+    const int i_lo = AOMMAX(0, i - 1);
+    const int i_hi = AOMMIN(n - 1, i + 1);
+    A[i * n + i_lo] -= kAlpha;
+    A[i * n + i] += 2 * kAlpha;
+    A[i * n + i_hi] -= kAlpha;
+  }
+
+  // Small regularization to give average noise strength. With no
+  // measurements the average is taken as 0: total / 0 would be NaN, and
+  // because b is updated in place that NaN would stay in the accumulated
+  // right-hand side for every later solve.
+  if (solver->num_equations > 0) {
+    mean = solver->total / solver->num_equations;
+  }
+  for (int i = 0; i < n; ++i) {
+    A[i * n + i] += 1.0 / 8192.;
+    solver->eqns.b[i] += mean / 8192.;
+  }
+
+  // Temporarily point the system at the regularized copy for the solve.
+  solver->eqns.A = A;
+  result = equation_system_solve(&solver->eqns);
+  solver->eqns.A = old_A;
+
+  aom_free(A);
+  return result;
+}
+
+// Initializes a strength solver with num_bins bins spanning the intensity
+// range [0, 2^bit_depth - 1] and allocates its num_bins x num_bins equation
+// system. Returns 1 on success, 0 if solver is NULL or allocation fails.
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth) {
+  if (solver == NULL) return 0;
+
+  memset(solver, 0, sizeof(*solver));
+  solver->total = 0;
+  solver->num_equations = 0;
+  solver->num_bins = num_bins;
+  solver->min_intensity = 0;
+  solver->max_intensity = (1 << bit_depth) - 1;
+
+  const int ok = equation_system_init(&solver->eqns, num_bins);
+  return ok;
+}
+
+// Releases the solver's equation system. A NULL solver is a no-op.
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+  if (solver != NULL) {
+    equation_system_free(&solver->eqns);
+  }
+}
+
+// Returns the intensity at the centre of bin i: bins 0 and num_bins - 1 map
+// to min_intensity and max_intensity, with equal spacing in between.
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i) {
+  const int n = solver->num_bins;
+  const double range = solver->max_intensity - solver->min_intensity;
+
+  return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+//
+// residual[i] is refreshed for every interior lut index i in [start, end);
+// the two end points of the lut are never touched.
+static void update_piecewise_linear_residual(
+    const aom_noise_strength_solver_t *solver,
+    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+  const double dx = 255. / solver->num_bins;
+  const int first = AOMMAX(start, 1);
+  const int limit = AOMMIN(end, lut->num_points - 1);
+
+  for (int i = first; i < limit; ++i) {
+    const double x_prev = lut->points[i - 1][0];
+    const double y_prev = lut->points[i - 1][1];
+    const double x_next = lut->points[i + 1][0];
+    const double y_next = lut->points[i + 1][1];
+
+    // Range of solver bins that can fall between the two neighbours.
+    const int bin_lo =
+        (int)floor(noise_strength_solver_get_bin_index(solver, x_prev));
+    const int bin_hi =
+        (int)ceil(noise_strength_solver_get_bin_index(solver, x_next));
+    const int lower = AOMMAX(0, bin_lo);
+    const int upper = AOMMIN(solver->num_bins - 1, bin_hi);
+
+    double sum_abs_err = 0;
+    for (int j = lower; j <= upper; ++j) {
+      const double x = aom_noise_strength_solver_get_center(solver, j);
+      if (x < x_prev || x >= x_next) continue;
+
+      const double a = (x - x_prev) / (x_next - x_prev);
+      const double estimate_y = y_prev * (1.0 - a) + y_next * a;
+      sum_abs_err += fabs(solver->eqns.x[j] - estimate_y);
+    }
+    residual[i] = sum_abs_err * dx;
+  }
+}
+
+// Fits a piecewise-linear lookup table to the solved noise strengths.
+//
+// The table starts with one point per solver bin. Interior points are then
+// removed greedily, always the one whose removal changes the curve least,
+// until the table has at most max_output_points points AND removing the next
+// point would exceed the tolerance. The two end points are never removed. A
+// negative max_output_points means "no limit" (num_bins).
+//
+// Returns 1 on success (lut is allocated and owned by the caller), 0 on
+// allocation failure (lut is left freed).
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_output_points,
+    aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+  // different bit-depths.
+  const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
+  if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
+    fprintf(stderr, "Failed to init lut\n");
+    return 0;
+  }
+  for (int i = 0; i < solver->num_bins; ++i) {
+    lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
+    lut->points[i][1] = solver->eqns.x[i];
+  }
+  if (max_output_points < 0) {
+    max_output_points = solver->num_bins;
+  }
+
+  // residual[i] is the cost of removing lut point i; it is indexed exactly
+  // like lut->points.
+  double *residual = (double *)aom_malloc(solver->num_bins * sizeof(*residual));
+  if (!residual) {
+    aom_noise_strength_lut_free(lut);
+    return 0;
+  }
+  memset(residual, 0, sizeof(*residual) * solver->num_bins);
+
+  update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
+
+  // Greedily remove points if there are too many or if it doesn't hurt local
+  // approximation (never remove the end points)
+  while (lut->num_points > 2) {
+    int min_index = 1;
+    for (int j = 1; j < lut->num_points - 1; ++j) {
+      if (residual[j] < residual[min_index]) {
+        min_index = j;
+      }
+    }
+    const double dx =
+        lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
+    const double avg_residual = residual[min_index] / dx;
+    if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
+      break;
+    }
+
+    const int num_remaining = lut->num_points - min_index - 1;
+    memmove(lut->points + min_index, lut->points + min_index + 1,
+            sizeof(lut->points[0]) * num_remaining);
+    // Shift the residuals together with the points. Without this every entry
+    // above min_index would keep describing the point that used to sit at
+    // that index, and later iterations would pick removal candidates from
+    // mismatched costs.
+    memmove(residual + min_index, residual + min_index + 1,
+            sizeof(residual[0]) * num_remaining);
+    lut->num_points--;
+
+    // Only the two neighbours of the removed point have new costs.
+    update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
+                                     min_index + 1);
+  }
+  aom_free(residual);
+  return 1;
+}
+
+// Prepares a flat-block finder for block_size x block_size blocks.
+//
+// Builds the design matrix A of a planar model (one row [yd, xd, 1] per pixel,
+// with yd and xd the pixel coordinates normalized around the block centre)
+// and the inverse of A'A, both of which are later used to fit and remove a
+// plane from each block.
+//
+// Returns 1 on success. Returns 0 if memory cannot be allocated or if A'A
+// cannot be inverted; in that case block_finder->A and block_finder->AtA_inv
+// are left NULL and nothing is leaked.
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+                               int block_size, int bit_depth, int use_highbd) {
+  const int n = block_size * block_size;
+  aom_equation_system_t eqns;
+  double *AtA_inv = 0;
+  double *A = 0;
+  int x = 0, y = 0, i = 0, j = 0;
+  block_finder->A = NULL;
+  block_finder->AtA_inv = NULL;
+
+  if (!equation_system_init(&eqns, kLowPolyNumParams)) {
+    fprintf(stderr, "Failed to init equation system for block_size=%d\n",
+            block_size);
+    return 0;
+  }
+
+  AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
+                                 sizeof(*AtA_inv));
+  A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
+  if (AtA_inv == NULL || A == NULL) {
+    fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
+            block_size);
+    aom_free(AtA_inv);
+    aom_free(A);
+    equation_system_free(&eqns);
+    return 0;
+  }
+
+  // Fill A row by row and accumulate A'A into eqns.A at the same time.
+  for (y = 0; y < block_size; ++y) {
+    const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
+    for (x = 0; x < block_size; ++x) {
+      const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
+      const double coords[3] = { yd, xd, 1 };
+      const int row = y * block_size + x;
+      A[kLowPolyNumParams * row + 0] = yd;
+      A[kLowPolyNumParams * row + 1] = xd;
+      A[kLowPolyNumParams * row + 2] = 1;
+
+      for (i = 0; i < kLowPolyNumParams; ++i) {
+        for (j = 0; j < kLowPolyNumParams; ++j) {
+          eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
+        }
+      }
+    }
+  }
+
+  // Lazy inverse using existing equation solver: solving A'A x = e_i gives
+  // column i of the inverse.
+  for (i = 0; i < kLowPolyNumParams; ++i) {
+    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+    eqns.b[i] = 1;
+    // A failed solve (allocation failure or singular A'A) used to be ignored,
+    // which left a stale column in AtA_inv while still reporting success.
+    if (!equation_system_solve(&eqns)) {
+      fprintf(stderr, "Failed to invert A'A for block_size=%d\n", block_size);
+      aom_free(AtA_inv);
+      aom_free(A);
+      equation_system_free(&eqns);
+      return 0;
+    }
+
+    for (j = 0; j < kLowPolyNumParams; ++j) {
+      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+    }
+  }
+  equation_system_free(&eqns);
+
+  // Publish the result only once everything has succeeded.
+  block_finder->A = A;
+  block_finder->AtA_inv = AtA_inv;
+  block_finder->block_size = block_size;
+  block_finder->normalization = (1 << bit_depth) - 1;
+  block_finder->use_highbd = use_highbd;
+  return 1;
+}
+
+// Releases the matrices owned by the finder and zeroes the structure. A NULL
+// block_finder is a no-op.
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+  if (block_finder != NULL) {
+    aom_free(block_finder->A);
+    aom_free(block_finder->AtA_inv);
+    memset(block_finder, 0, sizeof(*block_finder));
+  }
+}
+
+// Extracts the block at (offsx, offsy) and removes its best-fit plane.
+//
+// The block_size x block_size samples are read from `data` (as uint16_t when
+// the finder was set up for high bitdepth), with coordinates clamped to the
+// w x h image, and divided by the finder's normalization. A plane is then
+// fitted in the least-squares sense using the precomputed A and (A'A)^-1.
+// On return `plane` holds the fitted plane and `block` holds the samples
+// minus that plane; both must have room for block_size * block_size doubles.
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block) {
+  const int block_size = block_finder->block_size;
+  const int num_samples = block_size * block_size;
+  const int use_highbd = block_finder->use_highbd;
+  const double *const A = block_finder->A;
+  const double *const AtA_inv = block_finder->AtA_inv;
+  double Atb[kLowPolyNumParams];
+  double plane_coords[kLowPolyNumParams];
+
+  // Gather the normalized samples, replicating the border for coordinates
+  // that fall outside the image.
+  for (int yi = 0; yi < block_size; ++yi) {
+    const int y = clamp(offsy + yi, 0, h - 1);
+    double *const dst_row = block + yi * block_size;
+    for (int xi = 0; xi < block_size; ++xi) {
+      const int x = clamp(offsx + xi, 0, w - 1);
+      const int src = y * stride + x;
+      const double sample = use_highbd
+                                ? (double)((const uint16_t *)data)[src]
+                                : (double)data[src];
+      dst_row[xi] = sample / block_finder->normalization;
+    }
+  }
+
+  // plane = A * (A'A)^-1 * (A' * block)
+  multiply_mat(block, A, Atb, 1, num_samples, kLowPolyNumParams);
+  multiply_mat(AtA_inv, Atb, plane_coords, kLowPolyNumParams,
+               kLowPolyNumParams, 1);
+  multiply_mat(A, plane_coords, plane, num_samples, kLowPolyNumParams, 1);
+
+  for (int k = 0; k < num_samples; ++k) block[k] -= plane[k];
+}
+
+// Pairs a block with its flatness score so that the scores can be sorted
+// while still remembering which block each one belongs to.
+typedef struct {
+ int index; // Raster index of the block: by * num_blocks_w + bx.
+ float score; // Flatness score of that block.
+} index_and_score_t;
+
+// qsort() comparator ordering index_and_score_t entries by ascending score.
+// Returns -1, 0 or 1.
+static int compare_scores(const void *a, const void *b) {
+  const index_and_score_t *const lhs = (const index_and_score_t *)a;
+  const index_and_score_t *const rhs = (const index_and_score_t *)b;
+  const float diff = lhs->score - rhs->score;
+
+  return (diff > 0) - (diff < 0);
+}
+
+// Classifies every block_size x block_size block of the w x h image as flat
+// or not.
+//
+// flat_blocks must have room for one byte per block, i.e.
+// ceil(w / block_size) * ceil(h / block_size) entries in raster order. A
+// block that passes all threshold tests is written as 255, any other block as
+// 0; afterwards the lowest bit is additionally set for every block whose
+// score is at or above the 90th-percentile score.
+//
+// Returns the number of blocks with a non-zero flat_blocks entry, or -1 if
+// working memory cannot be allocated.
+//
+// NOTE(review): block_size <= 2 leaves the interior loops empty and divides
+// by (block_size - 2)^2 == 0; w == 0 or h == 0 yields zero blocks, in which
+// case scores[top_nth_percentile] reads from an empty allocation. Callers
+// presumably never pass such values - confirm.
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks) {
+ // The gradient-based features used in this code are based on:
+ // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+ // correlation for improved video denoising," 2012 19th, ICIP.
+ // The thresholds are more lenient to allow for correct grain modeling
+ // in extreme cases.
+ const int block_size = block_finder->block_size;
+ const int n = block_size * block_size;
+ const double kTraceThreshold = 0.15 / (32 * 32);
+ const double kRatioThreshold = 1.25;
+ const double kNormThreshold = 0.08 / (32 * 32);
+ const double kVarThreshold = 0.005 / (double)n;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ int num_flat = 0;
+ double *plane = (double *)aom_malloc(n * sizeof(*plane));
+ double *block = (double *)aom_malloc(n * sizeof(*block));
+ index_and_score_t *scores = (index_and_score_t *)aom_malloc(
+ num_blocks_w * num_blocks_h * sizeof(*scores));
+ if (plane == NULL || block == NULL || scores == NULL) {
+ fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(scores);
+ return -1;
+ }
+
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "score = [");
+#endif
+ for (int by = 0; by < num_blocks_h; ++by) {
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ // Compute gradient covariance matrix.
+ // `block` comes back with its best-fit plane already subtracted.
+ aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
+ bx * block_size, by * block_size,
+ plane, block);
+ double Gxx = 0, Gxy = 0, Gyy = 0;
+ double mean = 0;
+ double var = 0;
+
+ // Central differences need both neighbours, so only the interior
+ // (block_size - 2)^2 samples contribute.
+ for (int yi = 1; yi < block_size - 1; ++yi) {
+ for (int xi = 1; xi < block_size - 1; ++xi) {
+ const double gx = (block[yi * block_size + xi + 1] -
+ block[yi * block_size + xi - 1]) /
+ 2;
+ const double gy = (block[yi * block_size + xi + block_size] -
+ block[yi * block_size + xi - block_size]) /
+ 2;
+ Gxx += gx * gx;
+ Gxy += gx * gy;
+ Gyy += gy * gy;
+
+ const double value = block[yi * block_size + xi];
+ mean += value;
+ var += value * value;
+ }
+ }
+ mean /= (block_size - 2) * (block_size - 2);
+
+ // Normalize gradients by block_size.
+ Gxx /= ((block_size - 2) * (block_size - 2));
+ Gxy /= ((block_size - 2) * (block_size - 2));
+ Gyy /= ((block_size - 2) * (block_size - 2));
+ var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
+
+ {
+ // e1 >= e2 are the eigenvalues of the 2x2 gradient covariance matrix
+ // [Gxx Gxy; Gxy Gyy].
+ const double trace = Gxx + Gyy;
+ const double det = Gxx * Gyy - Gxy * Gxy;
+ const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
+ const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
+ const double norm = e1; // Spectral norm
+ const double ratio = (e1 / AOMMAX(e2, 1e-6));
+ const int is_flat = (trace < kTraceThreshold) &&
+ (ratio < kRatioThreshold) &&
+ (norm < kNormThreshold) && (var > kVarThreshold);
+ // The following weights are used to combine the above features to give
+ // a sigmoid score for flatness. If the input was normalized to [0,100]
+ // the magnitude of these values would be close to 1 (e.g., weights
+ // corresponding to variance would be a factor of 10000x smaller).
+ // The weights are given in the following order:
+ // [{var}, {ratio}, {trace}, {norm}, offset]
+ // with one of the most discriminative being simply the variance.
+ const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
+ double sum_weights = weights[0] * var + weights[1] * ratio +
+ weights[2] * trace + weights[3] * norm +
+ weights[4];
+ // clamp the value to [-25.0, 100.0] to prevent overflow
+ sum_weights = fclamp(sum_weights, -25.0, 100.0);
+ const float score = (float)(1.0 / (1 + exp(-sum_weights)));
+ flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
+ // Blocks at or below the variance threshold get score 0 so they never
+ // rank among the top-scored blocks.
+ scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
+ scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+ is_flat);
+#endif
+ num_flat += is_flat;
+ }
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "\n");
+#endif
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "];\n");
+#endif
+ // Find the top-scored blocks (most likely to be flat) and set the flat blocks
+ // to be the union of the thresholded results and the top 10th percentile of
+ // the scored results.
+ qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+ const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+ const float score_threshold = scores[top_nth_percentile].score;
+ for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+ if (scores[i].score >= score_threshold) {
+ // Count the block only if the threshold test had not already marked it.
+ num_flat += flat_blocks[scores[i].index] == 0;
+ flat_blocks[scores[i].index] |= 1;
+ }
+ }
+ aom_free(block);
+ aom_free(plane);
+ aom_free(scores);
+ return num_flat;
+}
+
+ // Initializes `model` from `params`.
+ //
+ // Validates the lag and bit depth, allocates the combined and latest noise
+ // state for each of the three channels (channels 1 and 2 get one coefficient
+ // more than channel 0) and builds the list of (x, y) neighbor offsets. The
+ // offsets cover rows -lag..0; on row 0 only positions left of the origin are
+ // used.
+ //
+ // Returns 1 on success. Returns 0 on failure, in which case `model` has been
+ // zeroed and owns no allocations.
+ int aom_noise_model_init(aom_noise_model_t *model,
+                          const aom_noise_model_params_t params) {
+   const int n = num_coeffs(params);
+   const int lag = params.lag;
+   const int bit_depth = params.bit_depth;
+
+   memset(model, 0, sizeof(*model));
+
+   if (lag < 1) {
+     fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+     return 0;
+   }
+   if (lag > kMaxLag) {
+     fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+             kMaxLag);
+     return 0;
+   }
+   if (bit_depth != 8 && bit_depth != 10 && bit_depth != 12) {
+     return 0;
+   }
+
+   memcpy(&model->params, &params, sizeof(params));
+
+   for (int ch = 0; ch < 3; ++ch) {
+     const int channel_coeffs = n + (ch > 0);
+     // Short-circuit evaluation keeps the combined state first, latest second.
+     if (!noise_state_init(&model->combined_state[ch], channel_coeffs,
+                           bit_depth) ||
+         !noise_state_init(&model->latest_state[ch], channel_coeffs,
+                           bit_depth)) {
+       fprintf(stderr, "Failed to allocate noise state for channel %d\n", ch);
+       aom_noise_model_free(model);
+       return 0;
+     }
+   }
+
+   model->n = n;
+   model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+   if (model->coords == NULL) {
+     aom_noise_model_free(model);
+     return 0;
+   }
+
+   int count = 0;
+   for (int y = -lag; y <= 0; ++y) {
+     // On the current row only samples strictly left of the origin are used.
+     const int last_x = (y == 0) ? -1 : lag;
+     for (int x = -lag; x <= last_x; ++x) {
+       int inside_shape;
+       switch (params.shape) {
+         case AOM_NOISE_SHAPE_DIAMOND: inside_shape = abs(x) <= y + lag; break;
+         case AOM_NOISE_SHAPE_SQUARE: inside_shape = 1; break;
+         default:
+           fprintf(stderr, "Invalid shape\n");
+           aom_noise_model_free(model);
+           return 0;
+       }
+       if (!inside_shape) continue;
+       model->coords[count][0] = x;
+       model->coords[count][1] = y;
+       ++count;
+     }
+   }
+   assert(count == n);
+   return 1;
+ }
+
+ // Releases everything owned by `model` (the coordinate list and, for each of
+ // the three channels, the AR and strength-solver equation systems of both the
+ // latest and the combined state), then zeroes the structure. A NULL `model`
+ // is ignored.
+ void aom_noise_model_free(aom_noise_model_t *model) {
+   if (model == NULL) return;
+
+   aom_free(model->coords);
+   for (int ch = 0; ch < 3; ++ch) {
+     aom_noise_state_t *const latest = &model->latest_state[ch];
+     aom_noise_state_t *const combined = &model->combined_state[ch];
+
+     equation_system_free(&latest->eqns);
+     equation_system_free(&combined->eqns);
+
+     equation_system_free(&latest->strength_solver.eqns);
+     equation_system_free(&combined->strength_solver.eqns);
+   }
+   memset(model, 0, sizeof(*model));
+ }
+
+ // Defines extract_ar_row_<suffix>() for samples of type INT_TYPE.
+ //
+ // For each of the num_coords offsets in coords, the function stores
+ // data - denoised at (x, y) + offset into buffer[i]. When both alt_data and
+ // alt_denoised are non-NULL it additionally stores in buffer[num_coords] the
+ // mean of (alt_data - alt_denoised) over the
+ // (1 << sub_log2[0]) x (1 << sub_log2[1]) samples that cover (x, y) in the
+ // alternate plane (e.g., luma). The return value is data - denoised at (x, y)
+ // itself.
+ #define EXTRACT_AR_ROW(INT_TYPE, suffix)                                    \
+   static double extract_ar_row_##suffix(                                    \
+       int(*coords)[2], int num_coords, const INT_TYPE *const data,          \
+       const INT_TYPE *const denoised, int stride, int sub_log2[2],          \
+       const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised,   \
+       int alt_stride, int x, int y, double *buffer) {                       \
+     for (int k = 0; k < num_coords; ++k) {                                  \
+       const int nx = x + coords[k][0];                                      \
+       const int ny = y + coords[k][1];                                      \
+       const int pos = ny * stride + nx;                                     \
+       buffer[k] = (double)data[pos] - denoised[pos];                        \
+     }                                                                       \
+     const int center = y * stride + x;                                      \
+     const double val = (double)data[center] - denoised[center];             \
+                                                                             \
+     if (!alt_data || !alt_denoised) return val;                             \
+                                                                             \
+     const int span_x = 1 << sub_log2[0];                                    \
+     const int span_y = 1 << sub_log2[1];                                    \
+     double sum_data = 0, sum_denoised = 0;                                  \
+     int count = 0;                                                          \
+     for (int oy = 0; oy < span_y; oy++) {                                   \
+       const int alt_y = (y << sub_log2[1]) + oy;                            \
+       for (int ox = 0; ox < span_x; ox++) {                                 \
+         const int alt_x = (x << sub_log2[0]) + ox;                          \
+         const int alt_pos = alt_y * alt_stride + alt_x;                     \
+         sum_data += alt_data[alt_pos];                                      \
+         sum_denoised += alt_denoised[alt_pos];                              \
+         count++;                                                            \
+       }                                                                     \
+     }                                                                       \
+     buffer[num_coords] = (sum_data - sum_denoised) / count;                 \
+     return val;                                                             \
+   }
+
+ EXTRACT_AR_ROW(uint8_t, lowbd)
+ EXTRACT_AR_ROW(uint16_t, highbd)
+
+ // Accumulates the AR normal equations of channel `c` (latest state) from all
+ // blocks marked non-zero in `flat_blocks`.
+ //
+ // For every usable sample inside a flat block, the neighborhood row is
+ // extracted and its outer product (scaled by 1 / ((1 << bit_depth) - 1)^2) is
+ // added to A, with the matching right-hand side added to b. A margin of `lag`
+ // samples is skipped at the top/left/right of a block unless the neighboring
+ // block on that side is flat as well.
+ //
+ // Returns 1 on success, 0 if the scratch row could not be allocated.
+ static int add_block_observations(
+     aom_noise_model_t *noise_model, int c, const uint8_t *const data,
+     const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
+     const uint8_t *const alt_data, const uint8_t *const alt_denoised,
+     int alt_stride, const uint8_t *const flat_blocks, int block_size,
+     int num_blocks_w, int num_blocks_h) {
+   aom_noise_state_t *const state = &noise_model->latest_state[c];
+   const int lag = noise_model->params.lag;
+   const int num_coords = noise_model->n;
+   const int n = state->eqns.n;
+   const double normalization = (1 << noise_model->params.bit_depth) - 1;
+   const double normalization_sq = normalization * normalization;
+   double *const A = state->eqns.A;
+   double *const b = state->eqns.b;
+
+   // One extra slot holds the correlation term with the alternate plane.
+   double *row = (double *)aom_malloc(sizeof(*row) * (num_coords + 1));
+   if (row == NULL) {
+     fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
+     return 0;
+   }
+
+   const int block_w = block_size >> sub_log2[0];
+   const int block_h = block_size >> sub_log2[1];
+   const int plane_w = w >> sub_log2[0];
+   const int plane_h = h >> sub_log2[1];
+
+   for (int by = 0; by < num_blocks_h; ++by) {
+     const int y_o = by * block_h;
+     for (int bx = 0; bx < num_blocks_w; ++bx) {
+       const int block_idx = by * num_blocks_w + bx;
+       if (!flat_blocks[block_idx]) continue;
+
+       const int x_o = bx * block_w;
+       const int flat_above = by > 0 && flat_blocks[block_idx - num_blocks_w];
+       const int flat_left = bx > 0 && flat_blocks[block_idx - 1];
+       const int flat_right =
+           bx + 1 < num_blocks_w && flat_blocks[block_idx + 1];
+
+       const int y_start = flat_above ? 0 : lag;
+       const int x_start = flat_left ? 0 : lag;
+       const int y_end = AOMMIN(plane_h - by * block_h, block_h);
+       const int x_end = AOMMIN(plane_w - bx * block_w - lag,
+                                flat_right ? block_w : (block_w - lag));
+
+       for (int y = y_start; y < y_end; ++y) {
+         for (int x = x_start; x < x_end; ++x) {
+           double val;
+           if (noise_model->params.use_highbd) {
+             val = extract_ar_row_highbd(
+                 noise_model->coords, num_coords, (const uint16_t *)data,
+                 (const uint16_t *)denoised, stride, sub_log2,
+                 (const uint16_t *)alt_data, (const uint16_t *)alt_denoised,
+                 alt_stride, x + x_o, y + y_o, row);
+           } else {
+             val = extract_ar_row_lowbd(noise_model->coords, num_coords, data,
+                                        denoised, stride, sub_log2, alt_data,
+                                        alt_denoised, alt_stride, x + x_o,
+                                        y + y_o, row);
+           }
+           for (int i = 0; i < n; ++i) {
+             double *const A_row = A + i * n;
+             for (int j = 0; j < n; ++j) {
+               A_row[j] += (row[i] * row[j]) / normalization_sq;
+             }
+             b[i] += (row[i] * val) / normalization_sq;
+           }
+           state->num_observations++;
+         }
+       }
+     }
+   }
+   aom_free(row);
+   return 1;
+ }
+
+ // Feeds one (block mean, noise strength) measurement per flat block into the
+ // strength solver of channel `c` (latest state).
+ //
+ // The block mean is taken from `alt_data` when it is non-NULL, otherwise from
+ // `data`. For c > 0 the part of the noise variance explained by correlation
+ // with channel 0 (coeffs[num_coords] times the channel-0 strength at that
+ // mean) is removed first, so the channel-0 strength solver must already have
+ // been solved. The remaining standard deviation is divided by the channel's
+ // AR gain before it is recorded.
+ static void add_noise_std_observations(
+     aom_noise_model_t *noise_model, int c, const double *coeffs,
+     const uint8_t *const data, const uint8_t *const denoised, int w, int h,
+     int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
+     const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
+     int num_blocks_h) {
+   const int num_coords = noise_model->n;
+   aom_noise_strength_solver_t *const channel_solver =
+       &noise_model->latest_state[c].strength_solver;
+   const aom_noise_strength_solver_t *const luma_solver =
+       &noise_model->latest_state[0].strength_solver;
+   const double luma_gain = noise_model->latest_state[0].ar_gain;
+   const double noise_gain = noise_model->latest_state[c].ar_gain;
+
+   const int block_w = block_size >> sub_log2[0];
+   const int block_h = block_size >> sub_log2[1];
+   const int plane_w = w >> sub_log2[0];
+   const int plane_h = h >> sub_log2[1];
+   const uint8_t *const mean_plane = alt_data ? alt_data : data;
+   const int mean_stride = alt_data ? alt_stride : stride;
+
+   for (int by = 0; by < num_blocks_h; ++by) {
+     const int y_o = by * block_h;
+     for (int bx = 0; bx < num_blocks_w; ++bx) {
+       const int x_o = bx * block_w;
+       if (!flat_blocks[by * num_blocks_w + bx]) continue;
+
+       const int num_samples_h = AOMMIN(plane_h - by * block_h, block_h);
+       const int num_samples_w = AOMMIN(plane_w - bx * block_w, block_w);
+       // Skip blocks that do not contain a reasonable number of samples.
+       if (num_samples_w * num_samples_h <= block_size) continue;
+
+       const double block_mean = get_block_mean(
+           mean_plane, w, h, mean_stride, x_o << sub_log2[0],
+           y_o << sub_log2[1], block_size, noise_model->params.use_highbd);
+       const double noise_var =
+           get_noise_var(data, denoised, stride, plane_w, plane_h, x_o, y_o,
+                         block_w, block_h, noise_model->params.use_highbd);
+
+       // Only the non-zero channels have a correlation term to remove.
+       double luma_strength = 0;
+       double corr = 0;
+       if (c > 0) {
+         luma_strength =
+             luma_gain * noise_strength_solver_get_value(luma_solver, block_mean);
+         corr = coeffs[num_coords];
+       }
+
+       // Chroma noise:
+       //    N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
+       // so uncorr_var = noise_var - (corr * luma_strength)^2. The floor of
+       // noise_var / 16 prevents fully correlated noise, which the synthesis
+       // cannot model.
+       const double uncorr_std = sqrt(
+           AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
+       // Undo the gain that running the IIR filter will introduce.
+       const double adjusted_strength = uncorr_std / noise_gain;
+       aom_noise_strength_solver_add_measurement(channel_solver, block_mean,
+                                                 adjusted_strength);
+     }
+   }
+ }
+
+ // Returns 1 when the latest (single-frame) noise estimate appears to differ
+ // from the combined (multi-frame) estimate, 0 otherwise. Two criteria are
+ // used: the normalized cross correlation of the AR coefficients falling below
+ // a threshold, or a weighted difference of the noise strength solutions
+ // exceeding a threshold. Only channel 0 is examined.
+ static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+   // Both thresholds are somewhat arbitrary and will likely need further tuning
+   // (or should be exported as parameters). The strength threshold scales with
+   // the bit depth.
+   const double kCoeffThreshold = 0.9;
+   const double kStrengthThreshold =
+       0.005 * (1 << (noise_model->params.bit_depth - 8));
+   for (int c = 0; c < 1; ++c) {
+     const aom_noise_state_t *const latest = &noise_model->latest_state[c];
+     const aom_noise_state_t *const combined = &noise_model->combined_state[c];
+
+     const double corr = aom_normalized_cross_correlation(
+         latest->eqns.x, combined->eqns.x, combined->eqns.n);
+     if (corr < kCoeffThreshold) return 1;
+
+     const double dx = 1.0 / latest->strength_solver.num_bins;
+     const aom_equation_system_t *const latest_eqns =
+         &latest->strength_solver.eqns;
+     const aom_equation_system_t *const combined_eqns =
+         &combined->strength_solver.eqns;
+     const int num_eqns = latest_eqns->n;
+
+     double weighted_diff = 0;
+     double total_weight = 0;
+     for (int col = 0; col < num_eqns; ++col) {
+       // The weight of an entry is the square root of its column sum in A.
+       double column_sum = 0;
+       for (int row = 0; row < num_eqns; ++row) {
+         column_sum += latest_eqns->A[row * num_eqns + col];
+       }
+       const double weight = sqrt(column_sum);
+       weighted_diff +=
+           weight * fabs(latest_eqns->x[col] - combined_eqns->x[col]);
+       total_weight += weight;
+     }
+     if (weighted_diff * dx / total_weight > kStrengthThreshold) return 1;
+   }
+   return 0;
+ }
+
+ // Solves the AR equation system of `state` and derives state->ar_gain from it.
+ //
+ // Returns the result of equation_system_solve(). When solving fails the gain
+ // is left at 1.0. `is_chroma` is 1 when the last unknown is the correlation
+ // with luma; that unknown is excluded from the variance estimates.
+ static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+   const int solved = equation_system_solve(&state->eqns);
+   state->ar_gain = 1.0;
+   if (!solved) return solved;
+
+   const int n = state->eqns.n;
+   const int num_ar_terms = state->eqns.n - is_chroma;
+
+   // `var` estimates the variance of the correlated noise as the mean of the
+   // diagonal of A (valid for both the least squares and the Yule-Walker
+   // formulation; with least squares the diagonal entries vary somewhat).
+   //
+   // `sum_covar` tracks E(Y^2) = <b, x> + E(X^2). For chroma, the part explained
+   // by the luma correlation is removed by subtracting the last column of A,
+   // scaled by the correlation estimate, from b:
+   //   E(y^2) = <b - A(:, end) * x(end), x>
+   double var = 0;
+   double sum_covar = 0;
+   for (int i = 0; i < num_ar_terms; ++i) {
+     var += state->eqns.A[i * n + i] / state->num_observations;
+
+     double bi = state->eqns.b[i];
+     if (is_chroma) {
+       bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+     }
+     sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+   }
+   var /= (n - is_chroma);
+
+   // The variance of the uncorrelated noise signal determines the gain of the
+   // AR filter; the gain is never allowed below 1.
+   const double noise_var = AOMMAX(var - sum_covar, 1e-6);
+   state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
+   return solved;
+ }
+
+ // Updates the noise model with the observations of one frame.
+ //
+ // `data` and `denoised` hold the source and denoised planes; processing stops
+ // at the first channel for which either pointer is NULL. `w` and `h` are
+ // shifted by `chroma_sub_log2` for channels 1 and 2. `flat_blocks` has one
+ // entry per block_size x block_size block; non-zero entries are used.
+ //
+ // Returns:
+ //   AOM_NOISE_STATUS_INVALID_ARGUMENT          block_size <= 1 or smaller than
+ //                                              2 * lag + 1
+ //   AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS  fewer than two flat blocks
+ //   AOM_NOISE_STATUS_INTERNAL_ERROR            allocation or solver failure
+ //   AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE      the channel-0 estimate differs
+ //                                              from the combined model; the
+ //                                              combined state is not updated
+ //   AOM_NOISE_STATUS_OK                        otherwise
+ aom_noise_status_t aom_noise_model_update(
+     aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+     const uint8_t *const denoised[3], int w, int h, int stride[3],
+     int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
+   int y_model_different = 0;
+   int num_blocks = 0;
+   int i = 0, channel = 0;
+
+   // block_size must be validated before it is used as a divisor below;
+   // otherwise block_size == 0 would divide by zero.
+   if (block_size <= 1) {
+     fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+     return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+   }
+
+   if (block_size < noise_model->params.lag * 2 + 1) {
+     fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
+             noise_model->params.lag * 2 + 1);
+     return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+   }
+
+   const int num_blocks_w = (w + block_size - 1) / block_size;
+   const int num_blocks_h = (h + block_size - 1) / block_size;
+
+   // Clear the latest equation system
+   for (i = 0; i < 3; ++i) {
+     equation_system_clear(&noise_model->latest_state[i].eqns);
+     noise_model->latest_state[i].num_observations = 0;
+     noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
+   }
+
+   // Check that we have enough flat blocks
+   for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
+     if (flat_blocks[i]) {
+       num_blocks++;
+     }
+   }
+
+   if (num_blocks <= 1) {
+     fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
+     return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
+   }
+
+   for (channel = 0; channel < 3; ++channel) {
+     int no_subsampling[2] = { 0, 0 };
+     // Channels 1 and 2 are additionally correlated against channel 0.
+     const uint8_t *alt_data = channel > 0 ? data[0] : 0;
+     const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
+     int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
+     const int is_chroma = channel != 0;
+     if (!data[channel] || !denoised[channel]) break;
+     if (!add_block_observations(noise_model, channel, data[channel],
+                                 denoised[channel], w, h, stride[channel], sub,
+                                 alt_data, alt_denoised, stride[0], flat_blocks,
+                                 block_size, num_blocks_w, num_blocks_h)) {
+       fprintf(stderr, "Adding block observation failed\n");
+       return AOM_NOISE_STATUS_INTERNAL_ERROR;
+     }
+
+     // A failed solve is fatal for channel 0 only; the other channels fall
+     // back to a default solution.
+     if (!ar_equation_system_solve(&noise_model->latest_state[channel],
+                                   is_chroma)) {
+       if (is_chroma) {
+         set_chroma_coefficient_fallback_soln(
+             &noise_model->latest_state[channel].eqns);
+       } else {
+         fprintf(stderr, "Solving latest noise equation system failed %d!\n",
+                 channel);
+         return AOM_NOISE_STATUS_INTERNAL_ERROR;
+       }
+     }
+
+     add_noise_std_observations(
+         noise_model, channel, noise_model->latest_state[channel].eqns.x,
+         data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
+         stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
+
+     if (!aom_noise_strength_solver_solve(
+             &noise_model->latest_state[channel].strength_solver)) {
+       fprintf(stderr, "Solving latest noise strength failed!\n");
+       return AOM_NOISE_STATUS_INTERNAL_ERROR;
+     }
+
+     // Compare the channel-0 estimate against the combined model (only when
+     // the combined model already holds equations) and remember the outcome.
+     if (channel == 0 &&
+         noise_model->combined_state[channel].strength_solver.num_equations >
+             0 &&
+         is_noise_model_different(noise_model)) {
+       y_model_different = 1;
+     }
+
+     // Don't update the combined stats if the y model is different. The latest
+     // state of the remaining channels is still computed.
+     if (y_model_different) continue;
+
+     noise_model->combined_state[channel].num_observations +=
+         noise_model->latest_state[channel].num_observations;
+     equation_system_add(&noise_model->combined_state[channel].eqns,
+                         &noise_model->latest_state[channel].eqns);
+     if (!ar_equation_system_solve(&noise_model->combined_state[channel],
+                                   is_chroma)) {
+       if (is_chroma) {
+         set_chroma_coefficient_fallback_soln(
+             &noise_model->combined_state[channel].eqns);
+       } else {
+         fprintf(stderr, "Solving combined noise equation system failed %d!\n",
+                 channel);
+         return AOM_NOISE_STATUS_INTERNAL_ERROR;
+       }
+     }
+
+     noise_strength_solver_add(
+         &noise_model->combined_state[channel].strength_solver,
+         &noise_model->latest_state[channel].strength_solver);
+
+     if (!aom_noise_strength_solver_solve(
+             &noise_model->combined_state[channel].strength_solver)) {
+       fprintf(stderr, "Solving combined noise strength failed!\n");
+       return AOM_NOISE_STATUS_INTERNAL_ERROR;
+     }
+   }
+
+   return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
+                            : AOM_NOISE_STATUS_OK;
+ }
+
+ // Replaces the combined (multi-frame) state of every channel with the latest
+ // (single-frame) state: both equation systems, the number of strength
+ // equations, the observation count and the AR gain are copied.
+ void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
+   for (int ch = 0; ch < 3; ch++) {
+     aom_noise_state_t *const combined = &noise_model->combined_state[ch];
+     aom_noise_state_t *const latest = &noise_model->latest_state[ch];
+
+     equation_system_copy(&combined->eqns, &latest->eqns);
+     equation_system_copy(&combined->strength_solver.eqns,
+                          &latest->strength_solver.eqns);
+     combined->strength_solver.num_equations =
+         latest->strength_solver.num_equations;
+     combined->num_observations = latest->num_observations;
+     combined->ar_gain = latest->ar_gain;
+   }
+ }
+
+ // Converts the combined noise model into film grain parameters.
+ //
+ // `film_grain` is zeroed (its random_seed is preserved) and then filled with
+ // the piecewise-linear scaling functions and the quantized AR coefficients.
+ // Returns 1 on success, 0 when params.lag > 3 or when fitting one of the
+ // scaling functions fails.
+ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                          aom_film_grain_t *film_grain) {
+   if (noise_model->params.lag > 3) {
+     fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
+     return 0;
+   }
+
+   // Reset everything except the caller's random seed.
+   const uint16_t saved_seed = film_grain->random_seed;
+   memset(film_grain, 0, sizeof(*film_grain));
+   film_grain->random_seed = saved_seed;
+
+   film_grain->apply_grain = 1;
+   film_grain->update_parameters = 1;
+   film_grain->ar_coeff_lag = noise_model->params.lag;
+
+   // Fit a piecewise-linear scaling function per channel. On failure, release
+   // the tables that were already fitted.
+   const int max_output_points[3] = { 14, 10, 10 };
+   aom_noise_strength_lut_t scaling_points[3];
+   for (int c = 0; c < 3; ++c) {
+     if (!aom_noise_strength_solver_fit_piecewise(
+             &noise_model->combined_state[c].strength_solver,
+             max_output_points[c], scaling_points + c)) {
+       for (int fitted = 0; fitted < c; ++fitted) {
+         aom_noise_strength_lut_free(scaling_points + fitted);
+       }
+       return 0;
+     }
+   }
+
+   // Both the domain and the range of the scaling functions in the film_grain
+   // are normalized to 8-bit (e.g., they are implicitly scaled during grain
+   // synthesis).
+   const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
+   double max_scaling_value = 1e-4;
+   for (int c = 0; c < 3; ++c) {
+     aom_noise_strength_lut_t *const lut = &scaling_points[c];
+     for (int i = 0; i < lut->num_points; ++i) {
+       lut->points[i][0] = AOMMIN(255, lut->points[i][0] / strength_divisor);
+       lut->points[i][1] = AOMMIN(255, lut->points[i][1] / strength_divisor);
+       max_scaling_value = AOMMAX(lut->points[i][1], max_scaling_value);
+     }
+   }
+
+   // Scaling_shift values are in the range [8,11]
+   const int max_scaling_value_log2 =
+       clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
+   film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
+   const double scale_factor = 1 << (8 - max_scaling_value_log2);
+
+   film_grain->num_y_points = scaling_points[0].num_points;
+   film_grain->num_cb_points = scaling_points[1].num_points;
+   film_grain->num_cr_points = scaling_points[2].num_points;
+
+   int(*film_grain_scaling[3])[2] = {
+     film_grain->scaling_points_y,
+     film_grain->scaling_points_cb,
+     film_grain->scaling_points_cr,
+   };
+   for (int c = 0; c < 3; c++) {
+     const aom_noise_strength_lut_t *const lut = &scaling_points[c];
+     for (int i = 0; i < lut->num_points; ++i) {
+       film_grain_scaling[c][i][0] = (int)(lut->points[i][0] + 0.5);
+       film_grain_scaling[c][i][1] =
+           clamp((int)(scale_factor * lut->points[i][1] + 0.5), 0, 255);
+     }
+   }
+   for (int c = 0; c < 3; ++c) {
+     aom_noise_strength_lut_free(scaling_points + c);
+   }
+
+   // Convert the ar_coeffs into 8-bit values
+   const int n_coeff = noise_model->combined_state[0].eqns.n;
+   double max_coeff = 1e-4, min_coeff = -1e-4;
+   double y_corr[2] = { 0, 0 };
+   double avg_luma_strength = 0;
+   for (int c = 0; c < 3; c++) {
+     aom_equation_system_t *const eqns = &noise_model->combined_state[c].eqns;
+     for (int i = 0; i < n_coeff; ++i) {
+       max_coeff = AOMMAX(max_coeff, eqns->x[i]);
+       min_coeff = AOMMIN(min_coeff, eqns->x[i]);
+     }
+
+     // Since the correlation between luma/chroma was computed in an already
+     // scaled space, we adjust it in the un-scaled space. This needs a
+     // weighted average of the strength for the channel, where each entry is
+     // weighted by the square root of its row sum in A.
+     const aom_equation_system_t *const strength_eqns =
+         &noise_model->combined_state[c].strength_solver.eqns;
+     const int num_bins = strength_eqns->n;
+     double average_strength = 0, total_weight = 0;
+     for (int row = 0; row < num_bins; ++row) {
+       double row_sum = 0;
+       for (int col = 0; col < num_bins; ++col) {
+         row_sum += strength_eqns->A[row * num_bins + col];
+       }
+       const double weight = sqrt(row_sum);
+       average_strength += strength_eqns->x[row] * weight;
+       total_weight += weight;
+     }
+     if (total_weight == 0) {
+       average_strength = 1;
+     } else {
+       average_strength /= total_weight;
+     }
+
+     if (c == 0) {
+       avg_luma_strength = average_strength;
+     } else {
+       y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
+       max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
+       min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
+     }
+   }
+
+   // Shift value: AR coeffs range (values 6-9)
+   // 6: [-2, 2),  7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
+   film_grain->ar_coeff_shift =
+       clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
+             6, 9);
+   const double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
+   int *ar_coeffs[3] = {
+     film_grain->ar_coeffs_y,
+     film_grain->ar_coeffs_cb,
+     film_grain->ar_coeffs_cr,
+   };
+   for (int c = 0; c < 3; ++c) {
+     const aom_equation_system_t *const eqns =
+         &noise_model->combined_state[c].eqns;
+     int *const out = ar_coeffs[c];
+     for (int i = 0; i < n_coeff; ++i) {
+       out[i] = clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
+     }
+     if (c > 0) {
+       // The extra chroma coefficient is the adjusted luma correlation.
+       out[n_coeff] =
+           clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
+     }
+   }
+
+   // At the moment, the noise modeling code assumes that the chroma scaling
+   // functions are a function of luma.
+   film_grain->cb_mult = 128;       // 8 bits
+   film_grain->cr_mult = 128;       // 8 bits
+   film_grain->cb_luma_mult = 192;  // 8 bits
+   film_grain->cr_luma_mult = 192;  // 8 bits
+   film_grain->cb_offset = 256;     // 9 bits
+   film_grain->cr_offset = 256;     // 9 bits
+
+   film_grain->chroma_scaling_from_luma = 0;
+   film_grain->grain_scale_shift = 0;
+   film_grain->overlap_flag = 1;
+   return 1;
+ }
+
+ // Multiplies the first n elements of b in place by the matching elements of
+ // a (b[i] *= a[i]). Nothing is done when n <= 0.
+ static void pointwise_multiply(const float *a, float *b, int n) {
+   for (int remaining = n; remaining > 0; --remaining, ++a, ++b) {
+     *b *= *a;
+   }
+ }
+
+ // Allocates and returns a block_size x block_size separable window whose
+ // value at (x, y) is cos((x + .5) * PI / block_size - PI / 2) *
+ // cos((y + .5) * PI / block_size - PI / 2). Returns NULL if the allocation
+ // fails; otherwise the caller owns the buffer and releases it with aom_free().
+ static float *get_half_cos_window(int block_size) {
+   float *window =
+       (float *)aom_malloc(block_size * block_size * sizeof(*window));
+   if (window == NULL) return NULL;
+
+   float *out = window;
+   for (int row = 0; row < block_size; ++row) {
+     const double row_weight = cos((.5 + row) * PI / block_size - PI / 2);
+     for (int col = 0; col < block_size; ++col, ++out) {
+       const double col_weight = cos((.5 + col) * PI / block_size - PI / 2);
+       *out = (float)(row_weight * col_weight);
+     }
+   }
+   return window;
+ }
+
+ // Defines dither_and_quantize_<suffix>() for output samples of type INT_TYPE.
+ //
+ // The function reads the (w >> chroma_sub_w) x (h >> chroma_sub_h) region of
+ // the float buffer `result` that starts one (subsampled) block in from the
+ // top-left corner, scales each value by block_normalization, rounds and
+ // clamps it to [0, block_normalization] and stores it in `denoised`. The
+ // quantization error of each sample is diffused into not-yet-visited
+ // neighbors of `result` with weights 7/16 (right), 3/16 (below-left),
+ // 5/16 (below) and 1/16 (below-right).
+ #define DITHER_AND_QUANTIZE(INT_TYPE, suffix)                                \
+   static void dither_and_quantize_##suffix(                                  \
+       float *result, int result_stride, INT_TYPE *denoised, int w, int h,    \
+       int stride, int chroma_sub_w, int chroma_sub_h, int block_size,        \
+       float block_normalization) {                                           \
+     const int plane_w = w >> chroma_sub_w;                                   \
+     const int plane_h = h >> chroma_sub_h;                                   \
+     const int pad_x = block_size >> chroma_sub_w;                            \
+     const int pad_y = block_size >> chroma_sub_h;                            \
+     for (int row = 0; row < plane_h; ++row) {                                \
+       const int has_below = row + 1 < plane_h;                               \
+       for (int col = 0; col < plane_w; ++col) {                              \
+         const int has_right = col + 1 < plane_w;                             \
+         const int idx = (row + pad_y) * result_stride + col + pad_x;         \
+         INT_TYPE quantized = (INT_TYPE)AOMMIN(                               \
+             AOMMAX(result[idx] * block_normalization + 0.5f, 0),             \
+             block_normalization);                                            \
+         const float err =                                                    \
+             -(((float)quantized) / block_normalization - result[idx]);       \
+         denoised[row * stride + col] = quantized;                            \
+         if (has_right) {                                                     \
+           result[idx + 1] += err * 7.0f / 16.0f;                             \
+         }                                                                    \
+         if (!has_below) continue;                                            \
+         if (col > 0) {                                                       \
+           result[idx + result_stride - 1] += err * 3.0f / 16.0f;             \
+         }                                                                    \
+         result[idx + result_stride] += err * 5.0f / 16.0f;                   \
+         if (has_right) {                                                     \
+           result[idx + result_stride + 1] += err * 1.0f / 16.0f;             \
+         }                                                                    \
+       }                                                                      \
+     }                                                                        \
+   }
+
+ DITHER_AND_QUANTIZE(uint8_t, lowbd)
+ DITHER_AND_QUANTIZE(uint16_t, highbd)
+
+ // Denoises up to three planes with a block-based frequency-domain filter.
+ //
+ // Each plane is processed in half-overlapped, windowed blocks: a block is
+ // extracted (together with its plane approximation), windowed, transformed,
+ // filtered against noise_psd[c], transformed back and accumulated into a
+ // padded float buffer, which is finally dithered and quantized into
+ // denoised[c]. Planes whose data[c] or denoised[c] pointer is NULL are
+ // skipped. `w` and `h` are shifted by `chroma_sub` for planes 1 and 2.
+ //
+ // Returns 1 on success. Returns 0 when block_size < 2, when the horizontal
+ // and vertical chroma subsampling differ, when a subsampled block would be
+ // narrower than 2 samples, or when any allocation / initialization fails.
+ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                           int w, int h, int stride[3], int chroma_sub[2],
+                           float *noise_psd[3], int block_size, int bit_depth,
+                           int use_highbd) {
+   float *plane = NULL, *block = NULL, *window_full = NULL,
+         *window_chroma = NULL;
+   double *block_d = NULL, *plane_d = NULL;
+   struct aom_noise_tx_t *tx_full = NULL;
+   struct aom_noise_tx_t *tx_chroma = NULL;
+   float *result = NULL;
+   int init_success = 1;
+   aom_flat_block_finder_t block_finder_full;
+   aom_flat_block_finder_t block_finder_chroma;
+   const float kBlockNormalization = (float)((1 << bit_depth) - 1);
+
+   // block_size is used as a divisor and its half as a loop step, so anything
+   // below 2 would divide by zero or never terminate.
+   if (block_size < 2) {
+     fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+     return 0;
+   }
+   if (chroma_sub[0] != chroma_sub[1]) {
+     fprintf(stderr,
+             "aom_wiener_denoise_2d doesn't handle different chroma "
+             "subsampling\n");
+     return 0;
+   }
+
+   const int num_blocks_w = (w + block_size - 1) / block_size;
+   const int num_blocks_h = (h + block_size - 1) / block_size;
+   // The accumulation buffer is padded by one block on every side.
+   const int result_stride = (num_blocks_w + 2) * block_size;
+   const int result_height = (num_blocks_h + 2) * block_size;
+   // Computed in size_t so that large frames cannot overflow int.
+   const size_t result_bytes =
+       (size_t)result_stride * (size_t)result_height * sizeof(*result);
+
+   init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+                                              bit_depth, use_highbd);
+   result = (float *)aom_malloc(result_bytes);
+   plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+   block =
+       (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+   block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+   plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+   window_full = get_half_cos_window(block_size);
+   tx_full = aom_noise_tx_malloc(block_size);
+
+   // Subsampled chroma needs its own (smaller) finder, window and transform;
+   // otherwise the full-size ones are shared.
+   if (chroma_sub[0] != 0) {
+     init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+                                                block_size >> chroma_sub[0],
+                                                bit_depth, use_highbd);
+     window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+     tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+   } else {
+     window_chroma = window_full;
+     tx_chroma = tx_full;
+   }
+
+   init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+                   (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+                   (window_full != NULL) && (window_chroma != NULL) &&
+                   (result != NULL);
+   // When initialization failed the loop is skipped entirely.
+   for (int c = init_success ? 0 : 3; c < 3; ++c) {
+     float *window_function = c == 0 ? window_full : window_chroma;
+     aom_flat_block_finder_t *block_finder = &block_finder_full;
+     const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+     const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+     const int block_w = block_size >> chroma_sub_w;
+     const int block_h = block_size >> chroma_sub_h;
+     struct aom_noise_tx_t *tx =
+         (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+     if (!data[c] || !denoised[c]) continue;
+     // A half-block step of zero would make the overlap loops spin forever.
+     if (block_w < 2 || block_h < 2) {
+       fprintf(stderr,
+               "block_size = %d is too small for the chroma subsampling\n",
+               block_size);
+       init_success = 0;
+       break;
+     }
+     if (c > 0 && chroma_sub[0] != 0) {
+       block_finder = &block_finder_chroma;
+     }
+     memset(result, 0, result_bytes);
+     // Do overlapped block processing (half overlapped). The block rows can
+     // easily be done in parallel
+     for (int offsy = 0; offsy < block_h; offsy += block_h / 2) {
+       for (int offsx = 0; offsx < block_w; offsx += block_w / 2) {
+         // Pad the boundary when processing each block-set.
+         for (int by = -1; by < num_blocks_h; ++by) {
+           for (int bx = -1; bx < num_blocks_w; ++bx) {
+             const int pixels_per_block = block_w * block_h;
+             aom_flat_block_finder_extract_block(
+                 block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+                 stride[c], bx * block_w + offsx, by * block_h + offsy, plane_d,
+                 block_d);
+             for (int j = 0; j < pixels_per_block; ++j) {
+               block[j] = (float)block_d[j];
+               plane[j] = (float)plane_d[j];
+             }
+             pointwise_multiply(window_function, block, pixels_per_block);
+             aom_noise_tx_forward(tx, block);
+             aom_noise_tx_filter(tx, noise_psd[c]);
+             aom_noise_tx_inverse(tx, block);
+
+             // Apply window function to the plane approximation (we will apply
+             // it to the sum of plane + block when composing the results).
+             pointwise_multiply(window_function, plane, pixels_per_block);
+
+             for (int y = 0; y < block_h; ++y) {
+               const int y_result = y + (by + 1) * block_h + offsy;
+               for (int x = 0; x < block_w; ++x) {
+                 const int x_result = x + (bx + 1) * block_w + offsx;
+                 result[y_result * result_stride + x_result] +=
+                     (block[y * block_w + x] + plane[y * block_w + x]) *
+                     window_function[y * block_w + x];
+               }
+             }
+           }
+         }
+       }
+     }
+     if (use_highbd) {
+       dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+                                  w, h, stride[c], chroma_sub_w, chroma_sub_h,
+                                  block_size, kBlockNormalization);
+     } else {
+       dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+                                 stride[c], chroma_sub_w, chroma_sub_h,
+                                 block_size, kBlockNormalization);
+     }
+   }
+   aom_free(result);
+   aom_free(plane);
+   aom_free(block);
+   aom_free(plane_d);
+   aom_free(block_d);
+   aom_free(window_full);
+
+   aom_noise_tx_free(tx_full);
+
+   aom_flat_block_finder_free(&block_finder_full);
+   // The chroma resources are separate objects only when chroma is subsampled;
+   // otherwise they alias the full-size ones released above.
+   if (chroma_sub[0] != 0) {
+     aom_flat_block_finder_free(&block_finder_chroma);
+     aom_free(window_chroma);
+     aom_noise_tx_free(tx_chroma);
+   }
+   return init_success;
+ }
+
+ struct aom_denoise_and_model_t {
+ int block_size;  // set once by aom_denoise_and_model_alloc()
+ int bit_depth;  // set once by aom_denoise_and_model_alloc()
+ float noise_level;  // set once by aom_denoise_and_model_alloc()
+
+ // Geometry of the frame the denoised and flat_blocks buffers are sized for
+ int width;
+ int height;
+ int y_stride;
+ int uv_stride;
+ int num_blocks_w;
+ int num_blocks_h;
+
+ // Owned buffers: noise_psd is allocated at creation; denoised and flat_blocks are reallocated when the frame geometry changes
+ float *noise_psd[3];  // block_size * block_size floats per channel
+ uint8_t *denoised[3];
+ uint8_t *flat_blocks;
+
+ aom_flat_block_finder_t flat_block_finder;
+ aom_noise_model_t noise_model;
+ };
+
+ // Allocates a zero-initialized denoise-and-model context, records the given
+ // bit depth, block size and noise level, and allocates one
+ // block_size x block_size noise PSD buffer per channel.
+ // Returns NULL (after releasing anything already allocated) on failure.
+ struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                             int block_size,
+                                                             float noise_level) {
+   struct aom_denoise_and_model_t *ctx =
+       (struct aom_denoise_and_model_t *)aom_malloc(sizeof(*ctx));
+   if (ctx == NULL) {
+     fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
+     return NULL;
+   }
+   memset(ctx, 0, sizeof(*ctx));
+
+   ctx->bit_depth = bit_depth;
+   ctx->block_size = block_size;
+   ctx->noise_level = noise_level;
+
+   // All three buffers are requested before the result is checked.
+   int all_allocated = 1;
+   for (int ch = 0; ch < 3; ++ch) {
+     ctx->noise_psd[ch] = (float *)aom_malloc(sizeof(*ctx->noise_psd[ch]) *
+                                              block_size * block_size);
+     if (ctx->noise_psd[ch] == NULL) all_allocated = 0;
+   }
+   if (!all_allocated) {
+     fprintf(stderr, "Unable to allocate noise PSD buffers\n");
+     aom_denoise_and_model_free(ctx);
+     return NULL;
+   }
+   return ctx;
+ }
+
+// Releases a context created by aom_denoise_and_model_alloc() together with
+// every buffer, the noise model and the flat block finder it owns.
+// Passing NULL is a no-op, matching the behavior of aom_noise_tx_free().
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+  if (!ctx) return;
+  aom_free(ctx->flat_blocks);
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    aom_free(ctx->noise_psd[i]);
+  }
+  aom_noise_model_free(&ctx->noise_model);
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  aom_free(ctx);
+}
+
+// (Re)allocates the frame-sized buffers of |ctx| and re-initializes its flat
+// block finder, noise model and default noise PSD whenever the luma size or
+// the plane strides of |sd| differ from the geometry cached in |ctx|.
+//
+// Returns 1 on success (including when nothing had to be done) and 0 on
+// failure. The cached geometry is only committed once every step succeeded, so
+// a failed call is retried by the next call instead of being mistaken for an
+// up-to-date context with missing buffers.
+static int denoise_and_model_realloc_if_necessary(
+    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+    return 1;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+
+  // Invalidate the cached geometry while the context is being rebuilt.
+  ctx->width = 0;
+  ctx->height = 0;
+  ctx->y_stride = 0;
+  ctx->uv_stride = 0;
+
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    ctx->denoised[i] = NULL;
+  }
+  aom_free(ctx->flat_blocks);
+  ctx->flat_blocks = NULL;
+
+  // High bit depth planes store two bytes per sample, hence the shift.
+  ctx->denoised[0] =
+      (uint8_t *)aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+  ctx->denoised[1] =
+      (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[2] =
+      (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+    fprintf(stderr, "Unable to allocate denoise buffers\n");
+    return 0;
+  }
+  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+  ctx->flat_blocks =
+      (uint8_t *)aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+  if (!ctx->flat_blocks) {
+    fprintf(stderr, "Unable to allocate flat_blocks buffer\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+                                  ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to init flat block finder\n");
+    return 0;
+  }
+
+  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                            ctx->bit_depth, use_highbd };
+  aom_noise_model_free(&ctx->noise_model);
+  if (!aom_noise_model_init(&ctx->noise_model, params)) {
+    fprintf(stderr, "Unable to init noise model\n");
+    return 0;
+  }
+
+  // Simply use a flat PSD (although we could use the flat blocks to estimate
+  // an actual noise PSD).
+  const float y_noise_level =
+      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+  const float uv_noise_level = aom_noise_psd_get_default_value(
+      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+  for (int i = 0; i < block_size * block_size; ++i) {
+    ctx->noise_psd[0][i] = y_noise_level;
+    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+  }
+
+  // Everything is in place: remember the geometry this context now matches.
+  ctx->width = sd->y_width;
+  ctx->height = sd->y_height;
+  ctx->y_stride = sd->y_stride;
+  ctx->uv_stride = sd->uv_stride;
+  return 1;
+}
+
+// TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer
+// are null pointers) correctly.
+//
+// Denoises the frame in |sd| into the context's own buffers, updates the noise
+// model from the raw/denoised pair and, when a noise estimate is available,
+// writes the grain parameters to |film_grain|. With |apply_denoise| set, the
+// denoised planes are copied back over the input planes. Returns 1 on success
+// and 0 on failure.
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *sd,
+                              aom_film_grain_t *film_grain, int apply_denoise) {
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+  // Writable plane pointers; high bit depth buffers are converted first.
+  uint8_t *planes[3] = { sd->y_buffer, sd->u_buffer, sd->v_buffer };
+  if (use_highbd) {
+    for (int c = 0; c < 3; ++c) {
+      planes[c] = (uint8_t *)CONVERT_TO_SHORTPTR(planes[c]);
+    }
+  }
+  const uint8_t *const data[3] = { planes[0], planes[1], planes[2] };
+
+  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+    fprintf(stderr, "Unable to realloc buffers\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+                            sd->y_height, strides[0], ctx->flat_blocks);
+
+  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+                             strides, chroma_sub_log2, ctx->noise_psd,
+                             block_size, ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to denoise image\n");
+    return 0;
+  }
+
+  const aom_noise_status_t status = aom_noise_model_update(
+      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+      block_size);
+
+  int have_noise_estimate;
+  switch (status) {
+    case AOM_NOISE_STATUS_OK: have_noise_estimate = 1; break;
+    case AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE:
+      aom_noise_model_save_latest(&ctx->noise_model);
+      have_noise_estimate = 1;
+      break;
+    default:
+      // Unable to update noise model; proceed if we have a previous estimate.
+      have_noise_estimate =
+          ctx->noise_model.combined_state[0].strength_solver.num_equations >
+          0;
+      break;
+  }
+
+  film_grain->apply_grain = 0;
+  if (!have_noise_estimate) return 1;
+
+  if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+    fprintf(stderr, "Unable to get grain parameters.\n");
+    return 0;
+  }
+  if (film_grain->random_seed == 0) film_grain->random_seed = 7391;
+
+  if (!apply_denoise) return 1;
+
+  // Overwrite the input planes with their denoised counterparts.
+  memcpy(planes[0], ctx->denoised[0],
+         (strides[0] * sd->y_height) << use_highbd);
+  if (!sd->monochrome) {
+    memcpy(planes[1], ctx->denoised[1],
+           (strides[1] * sd->uv_height) << use_highbd);
+    memcpy(planes[2], ctx->denoised[2],
+           (strides[2] * sd->uv_height) << use_highbd);
+  }
+  return 1;
+}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
new file mode 100644
index 0000000000..8228aeacfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_params.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+ double *A;  // NOTE(review): presumably the system matrix of A * x = b -- confirm in noise_model.c
+ double *b;  // NOTE(review): presumably the right-hand side vector -- confirm
+ double *x;  // NOTE(review): presumably the solution vector -- confirm
+ int n;  // NOTE(review): presumably the number of unknowns -- confirm
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds n points as (x, y) pairs, that store the curve.
+ */
+typedef struct {
+ double (*points)[2];
+ int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut. */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut The lut data.
+ * \param[in] x The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+ double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ * y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+ aom_equation_system_t eqns;  // Normal equations built from the observations
+ double min_intensity;  // Lower end of the intensity domain covered by the bins
+ double max_intensity;  // Upper end of the intensity domain covered by the bins
+ int num_bins;  // Number of evenly spaced bins
+ int num_equations;  // NOTE(review): presumably the number of measurements added so far -- confirm
+ double total;  // NOTE(review): meaning not visible from this header -- confirm in noise_model.c
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in] solver The noise solver to be initialized.
+ * \param[in] num_bins Number of bins to use in the internal representation.
+ * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+ int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in] i The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+ const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in] block_mean The average block intensity,
+ * \param[in] noise_std The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+ aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution
+ *
+ * \param[in] max_num_points The maximum number of output points
+ * \param[out] lut The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+ const aom_noise_strength_solver_t *solver, int max_num_points,
+ aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
+ */
+typedef struct {
+ double *AtA_inv;
+ double *A;
+ int num_params; // The number of parameters used for internal low-order model
+ int block_size; // The block size the finder was initialized with
+ double normalization; // Normalization factor (1 / (2^(bit_depth) - 1))
+ int use_highbd; // Whether input data should be interpreted as uint16
+} aom_flat_block_finder_t;
+
+/*!\brief Init the block_finder with the given block size, bit_depth */
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+ int block_size, int bit_depth, int use_highbd);
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
+
+/*!\brief Helper to extract a block and low order "planar" model. */
+void aom_flat_block_finder_extract_block(
+ const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+ int w, int h, int stride, int offsx, int offsy, double *plane,
+ double *block);
+
+/*!\brief Runs the flat block finder on the input data.
+ *
+ * Find flat blocks in the input image data. Returns a map of
+ * flat_blocks, where the value of flat_blocks map will be non-zero
+ * when a block is determined to be flat. A higher value indicates a bigger
+ * confidence in the decision.
+ */
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks);
+
+// The noise shape indicates the allowed coefficients in the AR model.
+enum {
+ AOM_NOISE_SHAPE_DIAMOND = 0,
+ AOM_NOISE_SHAPE_SQUARE = 1
+} UENUM1BYTE(aom_noise_shape);
+
+// The parameters of the noise model include the shape type, lag, the
+// bit depth of the input images provided, and whether the input images
+// will be using uint16 (or uint8) representation.
+typedef struct {
+ aom_noise_shape shape;
+ int lag;
+ int bit_depth;
+ int use_highbd;
+} aom_noise_model_params_t;
+
+/*!\brief State of a noise model estimate for a single channel.
+ *
+ * This contains a system of equations that can be used to solve
+ * for the auto-regressive coefficients as well as a noise strength
+ * solver that can be used to model noise strength as a function of
+ * intensity.
+ */
+typedef struct {
+ aom_equation_system_t eqns;
+ aom_noise_strength_solver_t strength_solver;
+ int num_observations; // The number of observations in the eqn system
+ double ar_gain; // The gain of the current AR filter
+} aom_noise_state_t;
+
+/*!\brief Complete model of noise for a planar video
+ *
+ * This includes a noise model for the latest frame and an aggregated
+ * estimate over all previous frames that had similar parameters.
+ */
+typedef struct {
+ aom_noise_model_params_t params;
+ aom_noise_state_t combined_state[3]; // Combined state per channel
+ aom_noise_state_t latest_state[3]; // Latest state per channel
+ int (*coords)[2]; // Offsets (x,y) of the coefficient samples
+ int n; // Number of parameters (size of coords)
+ int bit_depth;
+} aom_noise_model_t;
+
+/*!\brief Result of a noise model update. */
+enum {
+ AOM_NOISE_STATUS_OK = 0,
+ AOM_NOISE_STATUS_INVALID_ARGUMENT,
+ AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+ AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
+ AOM_NOISE_STATUS_INTERNAL_ERROR,
+} UENUM1BYTE(aom_noise_status_t);
+
+/*!\brief Initializes a noise model with the given parameters.
+ *
+ * Returns 0 on failure.
+ */
+int aom_noise_model_init(aom_noise_model_t *model,
+ const aom_noise_model_params_t params);
+void aom_noise_model_free(aom_noise_model_t *model);
+
+/*!\brief Updates the noise model with a new frame observation.
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise
+ * state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model The noise model to be updated
+ * \param[in] data Raw frame data
+ * \param[in] denoised Denoised frame data.
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] strides Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] flat_blocks A map to blocks that have been determined flat
+ * \param[in] block_size The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+ aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+ const uint8_t *const denoised[3], int w, int h, int strides[3],
+ int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ * grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+ aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in] data Raw frame data
+ * \param[out] denoised Denoised frame data
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] stride Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] noise_psd The power spectral density of the noise
+ * \param[in] block_size The size of blocks
+ * \param[in] bit_depth Bit depth of the image
+ * \param[in] use_highbd If true, uint8 pointers are interpreted as
+ * uint16 and stride is measured in uint16.
+ * This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub_log2[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in film_grain. Returns true on success. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in] ctx Struct allocated with
+ * aom_denoise_and_model_alloc that holds some
+ * buffers for denoising and the current noise
+ * estimate.
+ * \param[in,out] buf The raw input buffer to be denoised.
+ * \param[out] grain Output film grain parameters
+ * \param[in] apply_denoise Whether or not to apply the denoising to the
+ * frame that will be encoded
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+ YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain,
+ int apply_denoise);
+
+/*!\brief Allocates a context that can be used for denoising and noise modeling.
+ *
+ * \param[in] bit_depth Bit depth of buffers this will be run on.
+ * \param[in] block_size Block size for noise modeling and flat block
+ * estimation
+ * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
+ * higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+ int block_size,
+ float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
new file mode 100644
index 0000000000..3ded8cb099
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Returns the flat noise PSD level for a block_size x block_size transform:
+// (factor^2 / 10000) * block_size^2 / 8.
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  const float strength = factor * factor / 10000;
+  return strength * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+ float *tx_block;  // Transform coefficients, two floats per sample (2 * block_size^2 floats)
+ float *temp;  // Scratch buffer handed to fft/ifft; same size as tx_block
+ int block_size;  // Edge length of the square transform
+ void (*fft)(const float *, float *, float *);  // Forward transform, called as fft(input, temp, tx_block)
+ void (*ifft)(const float *, float *, float *);  // Inverse transform, called as ifft(tx_block, temp, output)
+};
+
+// Allocates a transform context for block_size x block_size blocks. Supported
+// sizes are 2, 4, 8, 16 and 32; any other size prints a message to stderr and
+// yields NULL. NULL is also returned when an allocation fails.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+  struct aom_noise_tx_t *tx =
+      (struct aom_noise_tx_t *)aom_malloc(sizeof(*tx));
+  if (tx == NULL) return NULL;
+  memset(tx, 0, sizeof(*tx));
+
+  // Select the forward/inverse transform pair matching the block size.
+  if (block_size == 2) {
+    tx->fft = aom_fft2x2_float;
+    tx->ifft = aom_ifft2x2_float;
+  } else if (block_size == 4) {
+    tx->fft = aom_fft4x4_float;
+    tx->ifft = aom_ifft4x4_float;
+  } else if (block_size == 8) {
+    tx->fft = aom_fft8x8_float;
+    tx->ifft = aom_ifft8x8_float;
+  } else if (block_size == 16) {
+    tx->fft = aom_fft16x16_float;
+    tx->ifft = aom_ifft16x16_float;
+  } else if (block_size == 32) {
+    tx->fft = aom_fft32x32_float;
+    tx->ifft = aom_ifft32x32_float;
+  } else {
+    aom_free(tx);
+    fprintf(stderr, "Unsupported block size %d\n", block_size);
+    return NULL;
+  }
+  tx->block_size = block_size;
+
+  // Both buffers hold two floats per sample and are 32-byte aligned.
+  const size_t buffer_bytes =
+      2 * sizeof(*tx->tx_block) * block_size * block_size;
+  tx->tx_block = (float *)aom_memalign(32, buffer_bytes);
+  tx->temp = (float *)aom_memalign(32, buffer_bytes);
+  if (tx->tx_block == NULL || tx->temp == NULL) {
+    aom_noise_tx_free(tx);
+    return NULL;
+  }
+
+  // Clear the buffers up front. Some outputs of the forward transform are
+  // real only (the imaginary component will never be touched)
+  memset(tx->tx_block, 0, buffer_bytes);
+  memset(tx->temp, 0, buffer_bytes);
+  return tx;
+}
+
+// Runs the forward transform on |data|, using the context's scratch buffer and
+// leaving the coefficients in the context's tx_block buffer.
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+  float *const scratch = noise_tx->temp;
+  float *const coeffs = noise_tx->tx_block;
+  noise_tx->fft(data, scratch, coeffs);
+}
+
+// Attenuates the buffered transform coefficients in place using the noise
+// power spectral density |psd| (one value per coefficient). A coefficient
+// whose power p clearly exceeds the noise power is scaled by (p - psd) / p;
+// every other coefficient is scaled by the fixed factor (kBeta - 1) / kBeta.
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+  const float kBeta = 1.1f;
+  const float kEps = 1e-6f;
+  const int num_coeffs = noise_tx->block_size * noise_tx->block_size;
+  for (int i = 0; i < num_coeffs; ++i) {
+    // Each coefficient is stored as two consecutive floats.
+    float *const coeff = noise_tx->tx_block + 2 * i;
+    const float c0 = AOMMAX((float)fabs(coeff[0]), 1e-8f);
+    const float c1 = AOMMAX((float)fabs(coeff[1]), 1e-8f);
+    const float power = c0 * c0 + c1 * c1;
+    float gain;
+    if (power > kBeta * psd[i] && power > 1e-6) {
+      gain = (power - psd[i]) / AOMMAX(power, kEps);
+    } else {
+      gain = (kBeta - 1.0f) / kBeta;
+    }
+    coeff[0] *= gain;
+    coeff[1] *= gain;
+  }
+}
+
+// Runs the inverse transform on the buffered coefficients, writes the result
+// to |data| and divides every output sample by block_size * block_size.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+  const int num_samples = noise_tx->block_size * noise_tx->block_size;
+  noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+  float *const end = data + num_samples;
+  for (float *sample = data; sample < end; ++sample) {
+    *sample /= num_samples;
+  }
+}
+
+// Adds the power (squared magnitude) of the buffered coefficients to |psd|.
+// Only columns 0 .. block_size / 2 of each row are accumulated.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+                             float *psd) {
+  const int size = noise_tx->block_size;
+  const int last_col = size / 2;
+  for (int row = 0; row < size; ++row) {
+    const float *coeff = noise_tx->tx_block + 2 * (row * size);
+    float *const psd_row = psd + row * size;
+    for (int col = 0; col <= last_col; ++col, coeff += 2) {
+      psd_row[col] += coeff[0] * coeff[0] + coeff[1] * coeff[1];
+    }
+  }
+}
+
+// Frees a transform context and both of its buffers. NULL is accepted and
+// ignored.
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+  if (noise_tx != NULL) {
+    aom_free(noise_tx->temp);
+    aom_free(noise_tx->tx_block);
+    aom_free(noise_tx);
+  }
+}
+
+// Returns dot(a, b) / (|a| * |b|) for two vectors of length n. No guard is
+// applied against a zero-length or all-zero vector.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n) {
+  double dot = 0;
+  double norm_a_sq = 0;
+  double norm_b_sq = 0;
+  int i = 0;
+  while (i < n) {
+    const double ai = a[i];
+    const double bi = b[i];
+    norm_a_sq += ai * ai;
+    norm_b_sq += bi * bi;
+    dot += ai * bi;
+    ++i;
+  }
+  const double norm_product = sqrt(norm_a_sq) * sqrt(norm_b_sq);
+  return dot / norm_product;
+}
+
+// Validates a w x h buffer of noise samples stored row by row.
+//
+// The mean and variance of every row and of every column are compared with the
+// mean and variance of the whole buffer. Returns 1 when all of them stay
+// within the thresholds below, and 0 otherwise (a message naming the first
+// offending row/column statistic is printed to stderr) or when the temporary
+// buffers cannot be allocated.
+int aom_noise_data_validate(const double *data, int w, int h) {
+  const double kVarianceThreshold = 2;
+  const double kMeanThreshold = 2;
+
+  int x = 0, y = 0;
+  int ret_value = 1;
+  double var = 0, mean = 0;
+  double *mean_x, *mean_y, *var_x, *var_y;
+
+  // Check that noise variance is not increasing in x or y
+  // and that the data is zero mean.
+  mean_x = (double *)aom_calloc(w, sizeof(*mean_x));
+  var_x = (double *)aom_calloc(w, sizeof(*var_x));
+  mean_y = (double *)aom_calloc(h, sizeof(*mean_y));
+  var_y = (double *)aom_calloc(h, sizeof(*var_y));
+  if (!(mean_x && var_x && mean_y && var_y)) {
+    aom_free(mean_x);
+    aom_free(mean_y);
+    aom_free(var_x);
+    aom_free(var_y);
+    return 0;
+  }
+
+  // Accumulate sums and sums of squares per column (x), per row (y) and for
+  // the whole buffer.
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const double d = data[y * w + x];
+      var_x[x] += d * d;
+      var_y[y] += d * d;
+      mean_x[x] += d;
+      mean_y[y] += d;
+      var += d * d;
+      mean += d;
+    }
+  }
+  mean /= (w * h);
+  var = var / (w * h) - mean * mean;
+
+  // Row y accumulated one sample per column, i.e. w samples.
+  for (y = 0; y < h; ++y) {
+    mean_y[y] /= w;
+    var_y[y] = var_y[y] / w - mean_y[y] * mean_y[y];
+    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  // Column x accumulated one sample per row, i.e. h samples.
+  for (x = 0; x < w; ++x) {
+    mean_x[x] /= h;
+    var_x[x] = var_x[x] / h - mean_x[x] * mean_x[x];
+    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  aom_free(mean_x);
+  aom_free(mean_y);
+  aom_free(var_x);
+  aom_free(var_y);
+
+  return ret_value;
+}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
new file mode 100644
index 0000000000..2284a171a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and does hold the transformed data (as
+// the user should not be manipulating the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the provided "data" and holds the result in the aom_noise_tx's
+// internal buffer. For compatibility with existing SIMD implementations,
+// "data" must be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+ const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+ float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// where a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+ int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/odintrin.c b/third_party/aom/aom_dsp/odintrin.c
new file mode 100644
index 0000000000..eb6d8d8771
--- /dev/null
+++ b/third_party/aom/aom_dsp/odintrin.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#include "aom_dsp/odintrin.h"
+
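+/*Constants for use with OD_DIVU_SMALL().
+ Row d-1 holds the {multiplier, addend} pair that OD_DIVU_SMALL() uses to
+ divide by d, for d in [1, OD_DIVU_DMAX].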
+ See \cite{Rob05} for details on computing these constants.
+ @INPROCEEDINGS{Rob05,
+ author="Arch D. Robison",
+ title="{N}-bit Unsigned Division via {N}-bit Multiply-Add",
+ booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic
+ (ARITH'05)",
+ pages="131--139",
+ address="Cape Cod, MA",
+ month=Jun,
+ year=2005
+ }*/
+uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = {
+ { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 },
+ { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 },
+ { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 },
+ { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 },
+ { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 },
+ { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 },
+ { 0x8D3DCB09, 0 }, { 0x88888889, 0 },
+ { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 },
+ { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 },
+ { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 },
+ { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 },
+ { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED },
+ { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 },
+ { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 },
+ { 0x8AD8F2FC, 0 }, { 0x88888889, 0 },
+ { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 },
+ { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 },
+ { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 },
+ { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 },
+ { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 },
+ { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 },
+ { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 },
+ { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 },
+ { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 },
+ { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED },
+ { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F },
+ { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 },
+ { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 },
+ { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 },
+ { 0x89AE408A, 0 }, { 0x88888889, 0 },
+ { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 },
+ { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 },
+ { 0x83126E98, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 },
+ { 0xF6603D99, 0 }, { 0xF4898D60, 0 },
+ { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEF2EB720, 0 }, { 0xED7303B6, 0 },
+ { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 },
+ { 0xE525982B, 0 }, { 0xE38E38E4, 0 },
+ { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 },
+ { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 },
+ { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 },
+ { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 },
+ { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 },
+ { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC6980C6A, 0 }, { 0xC565C87C, 0 },
+ { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 },
+ { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 },
+ { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 },
+ { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB30F6353, 0 }, { 0xB21642C9, 0 },
+ { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 },
+ { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 },
+ { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 },
+ { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 },
+ { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 },
+ { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 },
+ { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 },
+ { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 },
+ { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 },
+ { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 },
+ { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 },
+ { 0x891AC73B, 0 }, { 0x88888889, 0 },
+ { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x86D90545, 0 }, { 0x864B8A7E, 0 },
+ { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 },
+ { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 },
+ { 0x83993052, 0x83993052 }, { 0x83126E98, 0 },
+ { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF00FF01, 0 }, { 0xFE03F810, 0 },
+ { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB188566, 0 }, { 0xFA232CF3, 0 },
+ { 0xF92FB222, 0 }, { 0xF83E0F84, 0 },
+ { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 },
+ { 0xF57403D6, 0 }, { 0xF4898D60, 0 },
+ { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 },
+ { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 },
+ { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 },
+ { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 },
+ { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9396520, 0 }, { 0xE865AC7C, 0 },
+ { 0xE79372E3, 0 }, { 0xE6C2B449, 0 },
+ { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 },
+ { 0xE45932D8, 0 }, { 0xE38E38E4, 0 },
+ { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 },
+ { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 },
+ { 0xD9BA4257, 0 }, { 0xD901B204, 0 },
+ { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 },
+ { 0xD578E97D, 0 }, { 0xD4C77B04, 0 },
+ { 0xD417328A, 0 }, { 0xD3680D37, 0 },
+ { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 },
+ { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 },
+ { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC29786D, 0 }, { 0xCB8727C1, 0 },
+ { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 },
+ { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 },
+ { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 },
+ { 0xC73293D8, 0 }, { 0xC6980C6A, 0 },
+ { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 },
+ { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 },
+ { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 },
+ { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBCDD535E, 0 }, { 0xBC52640C, 0 },
+ { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 },
+ { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA },
+ { 0xB89BC36D, 0 }, { 0xB81702E1, 0 },
+ { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 },
+ { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 },
+ { 0xB2927C2A, 0 }, { 0xB21642C9, 0 },
+ { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 },
+ { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 },
+ { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 },
+ { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA392F36, 0 }, { 0xA9C84A48, 0 },
+ { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA79C7B17, 0 }, { 0xA72F053A, 0 },
+ { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 },
+ { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 },
+ { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 },
+ { 0xA36E71A3, 0 }, { 0xA3065E40, 0 },
+ { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA1D13986, 0 }, { 0xA16B312F, 0 },
+ { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA03C1689, 0 }, { 0x9FD809FE, 0 },
+ { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 },
+ { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 },
+ { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 },
+ { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 },
+ { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 },
+ { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 },
+ { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED },
+ { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 },
+ { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 },
+ { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 },
+ { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 },
+ { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 },
+ { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 },
+ { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x89F8746A, 0 }, { 0x89AE408A, 0 },
+ { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 },
+ { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 },
+ { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 },
+ { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 },
+ { 0x869222B2, 0 }, { 0x864B8A7E, 0 },
+ { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 },
+ { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 },
+ { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 },
+ { 0x8355ACE4, 0 }, { 0x83126E98, 0 },
+ { 0x82CF7504, 0 }, { 0x828CBFBF, 0 },
+ { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 },
+ { 0x814327E4, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80C121B3, 0 }, { 0x80808081, 0 },
+ { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 },
+ { 0xFE823CA6, 0 }, { 0xFE03F810, 0 },
+ { 0xFD863087, 0 }, { 0xFD08E551, 0 },
+ { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB93E673, 0 }, { 0xFB188566, 0 },
+ { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 },
+ { 0xF9A9342D, 0 }, { 0xF92FB222, 0 },
+ { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 },
+ { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 },
+ { 0xF6D7054E, 0 }, { 0xF6603D99, 0 },
+ { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 },
+ { 0xF4FE9083, 0 }, { 0xF4898D60, 0 },
+ { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 },
+ { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 },
+ { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 },
+ { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 },
+ { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 },
+ { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 },
+ { 0xEDE155F4, 0 }, { 0xED7303B6, 0 },
+ { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 },
+ { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 },
+ { 0xEB5159A0, 0 }, { 0xEAE56404, 0 },
+ { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 },
+ { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 },
+ { 0xE7FC600F, 0 }, { 0xE79372E3, 0 },
+ { 0xE72AE476, 0 }, { 0xE6C2B449, 0 },
+ { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 },
+ { 0xE58C544A, 0 }, { 0xE525982B, 0 },
+ { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 },
+ { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 },
+ { 0xE32942FF, 0 }, { 0xE2C4A689, 0 },
+ { 0xE260630B, 0 }, { 0xE1FC780F, 0 },
+ { 0xE198E520, 0 }, { 0xE135A9CA, 0 },
+ { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 },
+ { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 },
+ { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 },
+ { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 },
+ { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 },
+ { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 },
+ { 0xD95DD300, 0 }, { 0xD901B204, 0 },
+ { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 },
+ { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 },
+ { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 },
+ { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 },
+ { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 },
+ { 0xD46F3235, 0 }, { 0xD417328A, 0 },
+ { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 },
+ { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 },
+ { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E },
+ { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 },
+ { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 },
+ { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 },
+ { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 },
+ { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 },
+ { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC7B0200, 0 }, { 0xCC29786D, 0 },
+ { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 },
+ { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F },
+ { 0xCA95906C, 0 }, { 0xCA4587E7, 0 },
+ { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 },
+ { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 },
+ { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 },
+ { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 },
+ { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 },
+ { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 },
+ { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 },
+ { 0xC5B200C6, 0 }, { 0xC565C87C, 0 },
+ { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 },
+ { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 },
+ { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 },
+ { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 },
+ { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C },
+ { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 },
+ { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF589280, 0 }, { 0xBF112A8B, 0 },
+ { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 },
+ { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBD231803, 0 }, { 0xBCDD535E, 0 },
+ { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 },
+ { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 },
+ { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 },
+ { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 },
+ { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A },
+ { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA },
+ { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 },
+ { 0xB8594B41, 0 }, { 0xB81702E1, 0 },
+ { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 },
+ { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 },
+ { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 },
+ { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 },
+ { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 },
+ { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 },
+ { 0xB34E1884, 0 }, { 0xB30F6353, 0 },
+ { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 },
+ { 0xB25449D7, 0 }, { 0xB21642C9, 0 },
+ { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 },
+ { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 },
+ { 0xB068BE31, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 },
+ { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 },
+ { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 },
+ { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 },
+ { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 },
+ { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 },
+ { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 },
+ { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 },
+ { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 },
+ { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 },
+ { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 },
+ { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 },
+ { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 },
+ { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 },
+ { 0xA765AE44, 0 }, { 0xA72F053A, 0 },
+ { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 },
+ { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED },
+ { 0xA5B4449D, 0 }, { 0xA57EB503, 0 },
+ { 0xA54947FE, 0 }, { 0xA513FD6C, 0 },
+ { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 },
+ { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 },
+ { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 },
+ { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 },
+ { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 },
+ { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 },
+ { 0xA19E2540, 0 }, { 0xA16B312F, 0 },
+ { 0xA1385D35, 0 }, { 0xA105A933, 0 },
+ { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 },
+ { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 },
+ { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 },
+ { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D },
+ { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 },
+ { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 },
+ { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 },
+ { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 },
+ { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 },
+ { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B },
+ { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E },
+ { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 },
+ { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 },
+ { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 },
+ { 0x9A056A31, 0 }, { 0x99D722DB, 0 },
+ { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B },
+ { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 },
+ { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 },
+ { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 },
+ { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97874039, 0 }, { 0x975A7510, 0 },
+ { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 },
+ { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 },
+ { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x95747844, 0 }, { 0x9548E498, 0 },
+ { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94C6C187, 0 }, { 0x949B92DE, 0 },
+ { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 },
+ { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 },
+ { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x9370049C, 0 }, { 0x93459BE7, 0 },
+ { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 },
+ { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 },
+ { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 },
+ { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 },
+ { 0x917952AF, 0 }, { 0x91500915, 0x91500915 },
+ { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 },
+ { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 },
+ { 0x9031910A, 0 }, { 0x90090090, 0x90090090 },
+ { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 },
+ { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 },
+ { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 },
+ { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 },
+ { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 },
+ { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF },
+ { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C7C057D, 0 }, { 0x8C55841D, 0 },
+ { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 },
+ { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B4A451A, 0 }, { 0x8B246A88, 0 },
+ { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 },
+ { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 },
+ { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 },
+ { 0x89D3507D, 0 }, { 0x89AE408A, 0 },
+ { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F },
+ { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 },
+ { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD },
+ { 0x88ACFAEE, 0 }, { 0x88888889, 0 },
+ { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 },
+ { 0x881BA59E, 0 }, { 0x87F78088, 0 },
+ { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 },
+ { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC },
+ { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 },
+ { 0x86B58AA8, 0 }, { 0x869222B2, 0 },
+ { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 },
+ { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 },
+ { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 },
+ { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 },
+ { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 },
+ { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x8487A2D1, 0 }, { 0x84655D9C, 0 },
+ { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+ { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+ { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+ { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+ { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+ { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+ { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+ { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+ { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+ { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+ { 0x8163D283, 0 }, { 0x814327E4, 0 },
+ { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+ { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+ { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+ { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/third_party/aom/aom_dsp/odintrin.h b/third_party/aom/aom_dsp/odintrin.h
new file mode 100644
index 0000000000..9e4ba5029a
--- /dev/null
+++ b/third_party/aom/aom_dsp/odintrin.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AOM_DSP_ODINTRIN_H_
+#define AOM_AOM_DSP_ODINTRIN_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.
+ OD_DIVU_SMALL(_x, _d) multiplies _x (widened to 64 bits) by
+ OD_DIVU_SMALL_CONSTS[_d - 1][0], adds OD_DIVU_SMALL_CONSTS[_d - 1][1] and
+ shifts the sum right by 32 + OD_ILOG_NZ(_d) - 1 bits.
+ _d must be in [1, OD_DIVU_DMAX] because it indexes the table, and it is
+ evaluated more than once, so it must not have side effects.
+ NOTE(review): _x is presumably expected to fit in 32 bits - confirm.
+ OD_DIVU(_x, _d) takes this path when _d < OD_DIVU_DMAX and falls back to a
+ plain division otherwise; neither macro guards against _d == 0.*/
+#define OD_DIVU_SMALL(_x, _d) \
+ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \
+ 32) >> \
+ (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
+
+/*Enable special features for gcc and compatible compilers.
+ OD_GNUC_PREREQ(maj, min, pat) is nonzero when the compiler reports a GNU C
+ version of at least maj.min.pat, and 0 when the __GNUC__ version macros are
+ not all defined. Every argument is parenthesized so that an expression
+ argument cannot regroup against the >= comparison.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat) \
+  ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+   ((maj) << 16) + ((min) << 8) + (pat))  // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+/*Function attributes that are emitted when OD_GNUC_PREREQ(3, 4, 0) holds;
+ both macros expand to nothing otherwise.*/
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_WARN_UNUSED_RESULT
+#define OD_ARG_NONNULL(x)
+#endif
+
+/*OD_SIGNMASK(a) is -1 when a is negative and 0 otherwise.
+ OD_FLIPSIGNI(a, b) negates a when b is negative and leaves it unchanged
+ otherwise; a must be an integer because the result is formed with ^.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ODINTRIN_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
new file mode 100644
index 0000000000..5711a40a40
--- /dev/null
+++ b/third_party/aom/aom_dsp/prob.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t aom_cdf_prob;
+
+#define CDF_SIZE(x) ((x) + 1)
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+ probability (an "inverse" CDF).
+ This macro converts from one representation to the other (and is its own
+ inverse).
+ The AOM_CDFn(a0, ..., a(n-2)) macros below expand to the CDF_SIZE(n)
+ initializers of an n-symbol table: the n-1 given cumulative probabilities
+ in inverse form, AOM_ICDF(CDF_PROB_TOP) (which is 0) for the last symbol,
+ and a final 0 for the extra slot reserved by CDF_SIZE(). update_cdf() reads
+ that slot, cdf[nsymbs], as its adaptation counter.*/
+#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
+
+#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+ AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+ a14) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+
+// Returns num / den as an 8-bit probability: the ratio is scaled by 256,
+// rounded to nearest and clamped to [1, 255]. |den| must be nonzero.
+static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
+  assert(den != 0);
+  // Keep the quotient in 64 bits and clamp it explicitly. Narrowing it to int
+  // first and clamping with a right shift of a negative value only works while
+  // the quotient stays small; larger ratios must still saturate at 255.
+  const uint64_t p = ((uint64_t)num * 256 + (den >> 1)) / den;
+  if (p > 255) return 255;
+  if (p < 1) return 1;
+  return (uint8_t)p;
+}
+
+// Adapts the inverse-CDF table |cdf| after symbol |val| has been coded.
+// cdf[nsymbs] is read as the adaptation counter. Updated entries with an index
+// below |val| move towards CDF_PROB_TOP, the other updated entries decay
+// towards 0, and the counter is incremented while it is below 32.
+static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
+  assert(nsymbs < 17);
+  const int counter = cdf[nsymbs];
+
+  // The spec defines the adaptation rate as:
+  //   3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // where cdf[N] is |counter|. Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3}
+  // and 2 for nsymbs > 3, which gives:
+  //   4 + (counter > 15) + (counter > 31) + (nsymbs > 3).
+  // The counter is never incremented beyond 32, so
+  // (counter > 15) + (counter > 31) equals counter >> 4:
+  //   0 for counter in 0..15, 1 for counter in 16..31, 2 for counter == 32.
+  const int rate = 4 + (counter >> 4) + (nsymbs > 3);
+
+  // The body runs at least once and covers indices 0 .. nsymbs - 2.
+  int sym = 0;
+  do {
+    aom_cdf_prob *const entry = &cdf[sym];
+    if (sym >= val) {
+      *entry -= *entry >> rate;
+    } else {
+      *entry += (CDF_PROB_TOP - *entry) >> rate;
+    }
+    ++sym;
+  } while (sym < nsymbs - 1);
+
+  cdf[nsymbs] += (counter < 32);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
new file mode 100644
index 0000000000..cf0de29945
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_scale/yv12config.h"
+
+// Converts a sum of squared errors into PSNR in dB:
+//   10 * log10(samples * peak^2 / sse), capped at MAX_PSNR.
+// A non-positive |sse| yields MAX_PSNR.
+double aom_sse_to_psnr(double samples, double peak, double sse) {
+  if (!(sse > 0.0)) return MAX_PSNR;
+
+  const double db = 10.0 * log10(samples * peak * peak / sse);
+  return db > MAX_PSNR ? MAX_PSNR : db;
+}
+
+// Plain C sum of squared differences between two w x h blocks of 8-bit
+// samples. Each pointer advances by its own stride after every row.
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+                           int b_stride, int w, int h) {
+  int64_t total = 0;
+
+  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (int col = 0; col < w; ++col) {
+      const int d = a[col] - b[col];
+      total += d * d;
+    }
+  }
+  return total;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Plain C sum of squared differences between two w x h blocks of 16-bit
+// samples. |a8| and |b8| are turned into uint16_t pointers with
+// CONVERT_TO_SHORTPTR(); the strides are applied to those uint16_t pointers.
+//
+// The difference is widened to int64_t before it is squared, as
+// highbd_get_sse_shift() does, so the square cannot overflow int when two
+// samples differ by more than 46340.
+static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride,
+                                  const uint8_t *b8, int b_stride, int w,
+                                  int h) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t sse = 0;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const int64_t diff = (int64_t)a[j] - (int64_t)b[j];
+      sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Sum of squared differences over a width x height region of 8-bit samples.
+// Whole 16x16 tiles go through aom_sse(); the columns and rows left over when
+// the dimensions are not multiples of 16 go through encoder_sse().
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  const int rem_w = width % 16;
+  const int rem_h = height % 16;
+  const int tiled_w = width - rem_w;
+  const int tiled_h = height - rem_h;
+  int64_t acc = 0;
+
+  // Right-hand strip, full height.
+  if (rem_w > 0) {
+    acc += encoder_sse(&a[tiled_w], a_stride, &b[tiled_w], b_stride, rem_w,
+                       height);
+  }
+
+  // Bottom strip, excluding the columns already covered above.
+  if (rem_h > 0) {
+    acc += encoder_sse(&a[tiled_h * a_stride], a_stride,
+                       &b[tiled_h * b_stride], b_stride, tiled_w, rem_h);
+  }
+
+  // Whole 16x16 tiles.
+  for (int ty = 0; ty < height / 16; ++ty) {
+    for (int tx = 0; tx < width / 16; ++tx) {
+      acc += aom_sse(a + 16 * tx, a_stride, b + 16 * tx, b_stride, 16, 16);
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+
+  return acc;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Sum of squared differences between two width x height regions of 16-bit
+// samples, with every sample right-shifted by |input_shift| before the
+// subtraction.
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride, int width,
+                                    int height, unsigned int input_shift) {
+  const uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  int64_t acc = 0;
+
+  for (int r = 0; r < height; ++r, row_a += a_stride, row_b += b_stride) {
+    for (int c = 0; c < width; ++c) {
+      const int64_t d = (row_a[c] >> input_shift) - (row_b[c] >> input_shift);
+      acc += d * d;
+    }
+  }
+  return acc;
+}
+
+// High bit depth counterpart of get_sse(): whole 16x16 tiles go through
+// aom_highbd_sse(), the leftover right-hand columns and bottom rows go
+// through encoder_highbd_sse().
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  const int rem_w = width % 16;
+  const int rem_h = height % 16;
+  const int tiled_w = width - rem_w;
+  const int tiled_h = height - rem_h;
+  int64_t acc = 0;
+
+  // Right-hand strip, full height.
+  if (rem_w > 0) {
+    acc += encoder_highbd_sse(&a[tiled_w], a_stride, &b[tiled_w], b_stride,
+                              rem_w, height);
+  }
+
+  // Bottom strip, excluding the columns already covered above.
+  if (rem_h > 0) {
+    acc += encoder_highbd_sse(&a[tiled_h * a_stride], a_stride,
+                              &b[tiled_h * b_stride], b_stride, tiled_w,
+                              rem_h);
+  }
+
+  // Whole 16x16 tiles.
+  for (int ty = 0; ty < height / 16; ++ty) {
+    for (int tx = 0; tx < width / 16; ++tx) {
+      acc +=
+          aom_highbd_sse(a + 16 * tx, a_stride, b + 16 * tx, b_stride, 16, 16);
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return acc;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Returns aom_var_2d_u8() of the width x height luma window whose top-left
+// sample is (hstart, vstart), divided by the number of samples in the window.
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  const int offset = vstart * a->y_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u8(a->y_buffer + offset, a->y_stride, width, height) / area;
+}
+
+// Returns aom_var_2d_u8() of the width x height U-plane window whose top-left
+// sample is (hstart, vstart), divided by the number of samples in the window.
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  const int offset = vstart * a->uv_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u8(a->u_buffer + offset, a->uv_stride, width, height) /
+         area;
+}
+
+// Returns aom_var_2d_u8() of the width x height V-plane window whose top-left
+// sample is (hstart, vstart), divided by the number of samples in the window.
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+                       int vstart, int height) {
+  const int offset = vstart * a->uv_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u8(a->v_buffer + offset, a->uv_stride, width, height) /
+         area;
+}
+
+// SSE between the luma planes of |a| and |b| over the width x height window
+// whose top-left sample is (hstart, vstart) in both frames.
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
+                           int vstart, int height) {
+  const int a_offset = vstart * a->y_stride + hstart;
+  const int b_offset = vstart * b->y_stride + hstart;
+  return get_sse(a->y_buffer + a_offset, a->y_stride, b->y_buffer + b_offset,
+                 b->y_stride, width, height);
+}
+
+// SSE between the complete cropped luma planes of |a| and |b|, which must
+// have identical cropped dimensions.
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+
+  const int w = a->y_crop_width;
+  const int h = a->y_crop_height;
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, w, h);
+}
+
+// SSE between the U planes of |a| and |b| over the width x height window
+// whose top-left sample is (hstart, vstart) in both frames.
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
+                           int vstart, int height) {
+  const int a_offset = vstart * a->uv_stride + hstart;
+  const int b_offset = vstart * b->uv_stride + hstart;
+  return get_sse(a->u_buffer + a_offset, a->uv_stride, b->u_buffer + b_offset,
+                 b->uv_stride, width, height);
+}
+
+// SSE between the complete cropped U planes of |a| and |b|, which must have
+// identical cropped chroma dimensions.
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+
+  const int w = a->uv_crop_width;
+  const int h = a->uv_crop_height;
+  return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, w, h);
+}
+
+// SSE between the V planes of |a| and |b| over the width x height window
+// whose top-left sample is (hstart, vstart) in both frames.
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
+                           int vstart, int height) {
+  const int a_offset = vstart * a->uv_stride + hstart;
+  const int b_offset = vstart * b->uv_stride + hstart;
+  return get_sse(a->v_buffer + a_offset, a->uv_stride, b->v_buffer + b_offset,
+                 b->uv_stride, width, height);
+}
+
+// SSE between the complete cropped V planes of |a| and |b|, which must have
+// identical cropped chroma dimensions.
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+
+  const int w = a->uv_crop_width;
+  const int h = a->uv_crop_height;
+  return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, w, h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Returns aom_var_2d_u16() of the width x height luma window whose top-left
+// sample is (hstart, vstart), divided by the number of samples in the window.
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  const int offset = vstart * a->y_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u16(a->y_buffer + offset, a->y_stride, width, height) /
+         area;
+}
+
+// Returns aom_var_2d_u16() of the width x height U-plane window whose
+// top-left sample is (hstart, vstart), divided by the number of samples in
+// the window.
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  const int offset = vstart * a->uv_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u16(a->u_buffer + offset, a->uv_stride, width, height) /
+         area;
+}
+
+// Returns aom_var_2d_u16() of the width x height V-plane window whose
+// top-left sample is (hstart, vstart), divided by the number of samples in
+// the window.
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+                              int width, int vstart, int height) {
+  const int offset = vstart * a->uv_stride + hstart;
+  const int area = width * height;
+  return aom_var_2d_u16(a->v_buffer + offset, a->uv_stride, width, height) /
+         area;
+}
+
+// High bit depth SSE between the luma planes of |a| and |b| over the
+// width x height window whose top-left sample is (hstart, vstart).
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+                                  const YV12_BUFFER_CONFIG *b, int hstart,
+                                  int width, int vstart, int height) {
+  const int a_offset = vstart * a->y_stride + hstart;
+  const int b_offset = vstart * b->y_stride + hstart;
+  return highbd_get_sse(a->y_buffer + a_offset, a->y_stride,
+                        b->y_buffer + b_offset, b->y_stride, width, height);
+}
+
+// High bit depth SSE between the complete cropped luma planes of |a| and |b|.
+// Both frames must carry YV12_FLAG_HIGHBITDEPTH and have identical cropped
+// dimensions.
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  const int w = a->y_crop_width;
+  const int h = a->y_crop_height;
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, w,
+                        h);
+}
+
+// High bit depth SSE between the U planes of |a| and |b| over the
+// width x height window whose top-left sample is (hstart, vstart).
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+                                  const YV12_BUFFER_CONFIG *b, int hstart,
+                                  int width, int vstart, int height) {
+  const int a_offset = vstart * a->uv_stride + hstart;
+  const int b_offset = vstart * b->uv_stride + hstart;
+  return highbd_get_sse(a->u_buffer + a_offset, a->uv_stride,
+                        b->u_buffer + b_offset, b->uv_stride, width, height);
+}
+
+// High bit depth SSE between the complete cropped U planes of |a| and |b|.
+// Both frames must carry YV12_FLAG_HIGHBITDEPTH and have identical cropped
+// chroma dimensions.
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  const int w = a->uv_crop_width;
+  const int h = a->uv_crop_height;
+  return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+                        w, h);
+}
+
+// High bit depth SSE between the V planes of |a| and |b| over the
+// width x height window whose top-left sample is (hstart, vstart).
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+                                  const YV12_BUFFER_CONFIG *b, int hstart,
+                                  int width, int vstart, int height) {
+  const int a_offset = vstart * a->uv_stride + hstart;
+  const int b_offset = vstart * b->uv_stride + hstart;
+  return highbd_get_sse(a->v_buffer + a_offset, a->uv_stride,
+                        b->v_buffer + b_offset, b->uv_stride, width, height);
+}
+
+// High bit depth SSE between the complete cropped V planes of |a| and |b|.
+// Both frames must carry YV12_FLAG_HIGHBITDEPTH and have identical cropped
+// chroma dimensions.
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b) {
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  const int w = a->uv_crop_width;
+  const int h = a->uv_crop_height;
+  return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+                        w, h);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Returns the whole-plane SSE between |a| and |b| for |plane| (0 = Y, 1 = U,
+// 2 = V). When the library is built with CONFIG_AV1_HIGHBITDEPTH and |highbd|
+// is non-zero the aom_highbd_get_*_sse() variants are used; otherwise the
+// 8-bit ones are. Any other plane index trips an assert and yields 0.
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
+    switch (plane) {
+      case 0: return aom_highbd_get_y_sse(a, b);
+      case 1: return aom_highbd_get_u_sse(a, b);
+      case 2: return aom_highbd_get_v_sse(a, b);
+      default: assert(plane >= 0 && plane <= 2); return 0;
+    }
+  }
+#else
+  (void)highbd;
+#endif
+  // 8-bit path, shared by both build configurations.
+  switch (plane) {
+    case 0: return aom_get_y_sse(a, b);
+    case 1: return aom_get_u_sse(a, b);
+    case 2: return aom_get_v_sse(a, b);
+    default: assert(plane >= 0 && plane <= 2); return 0;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Fills |psnr| with per-plane and total SSE, sample counts and PSNR for the
+// frame pair |a| / |b|, which must have identical cropped dimensions.
+//
+// psnr->sse / samples / psnr are measured at the input bit depth: the peak is
+// derived from |in_bit_depth| and, for high bit depth buffers, samples are
+// right-shifted by (bit_depth - in_bit_depth) before they are compared.
+// Buffers without YV12_FLAG_HIGHBITDEPTH are compared with the 8-bit
+// get_sse().
+//
+// When |a| is a high bit depth buffer and in_bit_depth < bit_depth, the
+// *_hbd members are additionally filled from the unshifted samples with a
+// peak derived from |bit_depth|; otherwise they are left untouched.
+//
+// In all arrays index 0 is the total and indexes 1..3 are Y, U and V.
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+                          uint32_t bit_depth, uint32_t in_bit_depth) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+  // input_shift below is unsigned: an input bit depth above the stream bit
+  // depth would wrap it to a huge shift count.
+  assert(bit_depth >= in_bit_depth);
+  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+                           a->uv_crop_height };
+  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+#if CONFIG_LIBVMAF_PSNR_PEAK
+  double peak = (double)(255 << (in_bit_depth - 8));
+#else
+  double peak = (double)((1 << in_bit_depth) - 1);
+#endif  // CONFIG_LIBVMAF_PSNR_PEAK
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
+  // Pass 1: statistics at the input bit depth.
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
+                                   b_strides[i], w, h, input_shift);
+      } else {
+        sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+                             b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
+                    h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] =
+      aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+
+  // Compute PSNR based on stream bit depth
+  if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) {
+#if CONFIG_LIBVMAF_PSNR_PEAK
+    peak = (double)(255 << (bit_depth - 8));
+#else
+    peak = (double)((1 << bit_depth) - 1);
+#endif  // CONFIG_LIBVMAF_PSNR_PEAK
+    total_sse = 0;
+    total_samples = 0;
+    for (i = 0; i < 3; ++i) {
+      const int w = widths[i];
+      const int h = heights[i];
+      const uint32_t samples = w * h;
+      uint64_t sse;
+      sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+                           b_strides[i], w, h);
+      psnr->sse_hbd[1 + i] = sse;
+      psnr->samples_hbd[1 + i] = samples;
+      psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+      total_sse += sse;
+      total_samples += samples;
+    }
+
+    psnr->sse_hbd[0] = total_sse;
+    psnr->samples_hbd[0] = total_samples;
+    psnr->psnr_hbd[0] =
+        aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+  }
+}
+#endif
+
+// Fills psnr->sse / samples / psnr for the 8-bit frame pair |a| / |b| using a
+// peak value of 255. Index 0 receives the totals over the three planes and
+// indexes 1..3 receive the Y, U and V values. The *_hbd members are not
+// written.
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                   PSNR_STATS *psnr) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert(a->uv_crop_width == b->uv_crop_width);
+  assert(a->uv_crop_height == b->uv_crop_height);
+  static const double peak = 255.0;
+  const int plane_w[3] = { a->y_crop_width, a->uv_crop_width,
+                           a->uv_crop_width };
+  const int plane_h[3] = { a->y_crop_height, a->uv_crop_height,
+                           a->uv_crop_height };
+  const int stride_a[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const int stride_b[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  uint64_t sse_sum = 0;
+  uint32_t sample_sum = 0;
+
+  for (int plane = 0; plane < 3; ++plane) {
+    const int slot = plane + 1;  // slot 0 is reserved for the totals
+    const int w = plane_w[plane];
+    const int h = plane_h[plane];
+    const uint32_t count = w * h;
+    const uint64_t plane_sse = get_sse(a->buffers[plane], stride_a[plane],
+                                       b->buffers[plane], stride_b[plane], w, h);
+
+    psnr->sse[slot] = plane_sse;
+    psnr->samples[slot] = count;
+    psnr->psnr[slot] = aom_sse_to_psnr(count, peak, (double)plane_sse);
+
+    sse_sum += plane_sse;
+    sample_sum += count;
+  }
+
+  psnr->sse[0] = sse_sum;
+  psnr->samples[0] = sample_sum;
+  psnr->psnr[0] = aom_sse_to_psnr((double)sample_sum, peak, (double)sse_sum);
+}
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
new file mode 100644
index 0000000000..afe6e08856
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PSNR_H_
+#define AOM_AOM_DSP_PSNR_H_
+
+#include "aom_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// PSNR measurements for one pair of frames. In every array, index 0 holds
+// the total over all planes and indexes 1..3 hold the Y, U and V planes.
+typedef struct {
+ double psnr[4]; // total/y/u/v
+ uint64_t sse[4]; // total/y/u/v
+ uint32_t samples[4]; // total/y/u/v
+ double psnr_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+ uint64_t sse_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+ uint32_t samples_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+} PSNR_STATS;
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in] samples Number of samples
+ * \param[in] peak Max sample value
+ * \param[in] sse Sum of squared errors
+ */
+double aom_sse_to_psnr(double samples, double peak, double sse);
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int plane, int highbd);
+#if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
+#endif
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AOM_DSP_PSNR_H_
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
new file mode 100644
index 0000000000..966ba007ed
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+
+/* Runs aom_fdct8x8() on the 8x8 block |x| and then divides every output
+ * coefficient by 8 with rounding ((c + 4) >> 3). |ystride| is the row pitch
+ * of |y|; |xstride| is unused. */
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+                           int xstride) {
+  (void)xstride;
+  aom_fdct8x8(x, y, ystride);
+  for (int r = 0; r < 8; ++r) {
+    tran_low_t *const row = y + ystride * r;
+    for (int c = 0; c < 8; ++c) row[c] = (row[c] + 4) >> 3;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Runs aom_highbd_fdct8x8() on the 8x8 block |x| and then divides every
+ * output coefficient by 8 with rounding ((c + 4) >> 3). |ystride| is the row
+ * pitch of |y|; |xstride| is unused. */
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+                               int xstride) {
+  (void)xstride;
+  aom_highbd_fdct8x8(x, y, ystride);
+  for (int r = 0; r < 8; ++r) {
+    tran_low_t *const row = y + ystride * r;
+    for (int c = 0; c < 8; ++c) row[c] = (row[c] + 4) >> 3;
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG based matrix from the paper,
+ this one gives a slightly higher MOS agreement.*/
+static const double csf_y[8][8] = {
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
+static const double csf_cb420[8][8] = {
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
+static const double csf_cr420[8][8] = {
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
+
+static double convert_score_db(double _score, double _weight, int16_t pix_max) {
+ assert(_score * _weight >= 0.0);
+
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
+ return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
+}
+
+/* Computes the raw (not yet converted to dB) PSNR-HVS-M error score of one
+ * _w x _h plane.
+ *
+ * The plane is walked in 8x8 blocks whose origins advance by _step in both
+ * directions. Each block of src and dst is transformed with the 8x8 forward
+ * DCT, a masking threshold is derived from the source coefficients and the
+ * source gradients, and the coefficient differences that exceed the threshold
+ * are weighted by _csf and accumulated.
+ *
+ * When buf_is_hbd is non-zero the buffers are read as uint16_t samples (via
+ * CONVERT_TO_SHORTPTR) and every sample is right-shifted by _shift; otherwise
+ * they are read as bytes and _shift is unused. pix_max normalizes the
+ * gradients. When luma is non-zero the mean difference between the two planes
+ * (delt) is added to the dst samples before the transform and contributes
+ * 0.04 * delt^2 to the result. _par is unused.
+ *
+ * Returns the mean weighted squared error over all visited coefficients plus
+ * the delt term, or 0 when no block was visited. */
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t _shift, int buf_is_hbd, int16_t pix_max,
+ int luma) {
+ double ret;
+ const uint8_t *_src8 = src;
+ const uint8_t *_dst8 = dst;
+ const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
+ DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+ DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
+ double mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ float sum1;
+ float sum2;
+ float delt;
+ (void)_par;
+ ret = pixels = 0;
+ sum1 = sum2 = delt = 0.0f;
+ /* Sum all samples of both planes; the sums feed the mean offset delt. */
+ for (y = 0; y < _h; y++) {
+ for (x = 0; x < _w; x++) {
+ if (!buf_is_hbd) {
+ sum1 += _src8[y * _systride + x];
+ sum2 += _dst8[y * _dystride + x];
+ } else {
+ sum1 += _src16[y * _systride + x] >> _shift;
+ sum2 += _dst16[y * _dystride + x] >> _shift;
+ }
+ }
+ }
+ if (luma) delt = (sum1 - sum2) / (_w * _h);
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+ was also constructed from the JPEG matrices. I can not find any obvious
+ scheme of normalizing to produce their table, but if I multiply their
+ CSF by 0.3885746225901003 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.
+
+ Suggested in aomedia issue#2363:
+ 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509)
+ of the old JPEG based matrix from the paper. Since you are not using that,
+ divide by actual maximum coefficient. */
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]);
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ int n = 0;
+ double s_gx = 0;
+ double s_gy = 0;
+ double g = 0;
+ double s_gmean = 0;
+ double s_gvar = 0;
+ double s_mask = 0;
+ /* Load the 8x8 block from both planes; dst gets the mean offset added. */
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ if (!buf_is_hbd) {
+ dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+ } else {
+ dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+ dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+ }
+ dct_d[i * 8 + j] += (int)(delt + 0.5f);
+ }
+ }
+ /* Gradient statistics over the 6x6 interior of the source block: n counts
+ the positions whose gradient magnitude exceeds 0.1. */
+ for (i = 1; i < 7; i++) {
+ for (j = 1; j < 7; j++) {
+ s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+ dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 -
+ dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 -
+ dct_s[(i + 1) * 8 + j + 1] * 3) /
+ (pix_max * 16.f);
+ s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+ dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 -
+ dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 -
+ dct_s[(i + 1) * 8 + j + 1] * 3) /
+ (pix_max * 16.f);
+ g = sqrt(s_gx * s_gx + s_gy * s_gy);
+ if (g > 0.1f) n++;
+ s_gmean += g;
+ }
+ }
+ s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (!buf_is_hbd) {
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ } else {
+ hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+#else
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ /* Masking energy from the source coefficients; j starts at 1 on row 0 so
+ the DC coefficient is skipped. */
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 8.f;
+ /* Accumulate the CSF-weighted squared error; for every coefficient except
+ DC the error is first reduced by the masking threshold (floored at 0). */
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ double err;
+ err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ if (pixels <= 0) return 0;
+ ret /= pixels;
+ ret += 0.04 * delt * delt;
+ return ret;
+}
+
+/* Computes PSNR-HVS between |src| and |dst|.
+ *
+ * The raw per-plane scores from calc_psnrhvs() are stored in *y_psnrhvs,
+ * *u_psnrhvs and *v_psnrhvs. The return value is the combined score
+ * (0.8 * Y + 0.1 * (U + V)) converted to dB by convert_score_db().
+ *
+ * |bd| must be 8, 10 or 12 and at least |in_bd|; high bit depth samples are
+ * right-shifted by bd - in_bd. The peak sample value follows |in_bd|
+ * (255, 1023 or 4095). Both frames must have identical flags. */
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
+                   double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
+                   uint32_t bd, uint32_t in_bd) {
+  assert(bd == 8 || bd == 10 || bd == 12);
+  assert(bd >= in_bd);
+  assert(src->flags == dst->flags);
+
+  const double par = 1.0;
+  const int step = 7;
+  const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
+  const uint32_t bd_shift = bd - in_bd;
+
+  int16_t pix_max;
+  switch (in_bd) {
+    case 10: pix_max = 1023; break;
+    case 12: pix_max = 4095; break;
+    default: pix_max = 255; break;
+  }
+
+  *y_psnrhvs =
+      calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride,
+                   par, src->y_crop_width, src->y_crop_height, step, csf_y,
+                   bd_shift, buf_is_hbd, pix_max, 1);
+  *u_psnrhvs =
+      calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cb420, bd_shift, buf_is_hbd, pix_max, 0);
+  *v_psnrhvs =
+      calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cr420, bd_shift, buf_is_hbd, pix_max, 0);
+
+  /* Read the plane scores back through the output pointers. */
+  const double combined =
+      (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+  return convert_score_db(combined, 1.0, pix_max);
+}
diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c
new file mode 100644
index 0000000000..324a18baea
--- /dev/null
+++ b/third_party/aom/aom_dsp/pyramid.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/pyramid.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_util/aom_thread.h"
+
+// TODO(rachelbarker): Move needed code from av1/ to aom_dsp/
+#include "av1/common/resize.h"
+
+#include <assert.h>
+#include <string.h>
+
+// Lifecycle:
+// * Frame buffer alloc code calls aom_get_pyramid_alloc_size()
+// to work out how much space is needed for a given number of pyramid
+// levels. This is counted in the size checked against the max allocation
+// limit
+// * Then calls aom_alloc_pyramid() to actually create the pyramid
+// * Pyramid is initially marked as invalid (no data)
+// * Whenever pyramid is needed, we check the valid flag. If set, use existing
+// data. If not set, compute full pyramid
+// * Whenever frame buffer is reused, clear the valid flag
+// * Whenever frame buffer is resized, reallocate pyramid
+
+// Returns the number of bytes which aom_alloc_pyramid() will allocate for a
+// pyramid built with the same arguments: the ImagePyramid struct itself, the
+// per-level PyramidLayer array, and the pixel storage for every level which
+// needs its own buffer.
+//
+// `n_levels` is the requested number of levels; it is clamped for small
+// frames exactly as in aom_alloc_pyramid(). When `image_is_16bit` is false,
+// level 0 does not get a buffer of its own and so is not counted.
+size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels,
+                                  bool image_is_16bit) {
+  // Limit number of levels on small frames
+  const int msb = get_msb(AOMMIN(width, height));
+  const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+  n_levels = AOMMIN(n_levels, max_levels);
+
+  size_t alloc_size = 0;
+  alloc_size += sizeof(ImagePyramid);
+  alloc_size += n_levels * sizeof(PyramidLayer);
+
+  // Calculate how much memory is needed for downscaled frame buffers
+  size_t buffer_size = 0;
+
+  // Work out if we need to allocate a few extra bytes for alignment.
+  // aom_memalign() will ensure that the start of the allocation is aligned
+  // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel
+  // to be aligned, not the first byte of the allocation.
+  //
+  // In the loop below, we ensure that the stride of every image is a multiple
+  // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will
+  // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the
+  // first pixel in the first pyramid layer aligned properly, that will
+  // automatically mean that the first pixel of every row of every layer is
+  // properly aligned too.
+  //
+  // Thus all we need to consider is the first pixel in the first layer.
+  // This is located at offset
+  //   extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING
+  // bytes into the buffer. Since level_stride is a multiple of
+  // PYRAMID_ALIGNMENT, we can ignore that. So we need
+  //   extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT
+  //
+  // To solve this, we can round PYRAMID_PADDING up to the next multiple
+  // of PYRAMID_ALIGNMENT, then subtract the original value to calculate
+  // how many extra bytes are needed.
+  size_t first_px_offset =
+      (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+  size_t extra_bytes = first_px_offset - PYRAMID_PADDING;
+  buffer_size += extra_bytes;
+
+  // If the original image is stored in an 8-bit buffer, then we can point the
+  // lowest pyramid level at that buffer rather than allocating a new one.
+  int first_allocated_level = image_is_16bit ? 0 : 1;
+
+  for (int level = first_allocated_level; level < n_levels; level++) {
+    int level_width = width >> level;
+    int level_height = height >> level;
+
+    // Allocate padding for each layer
+    int padded_width = level_width + 2 * PYRAMID_PADDING;
+    int padded_height = level_height + 2 * PYRAMID_PADDING;
+
+    // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT
+    // This ensures that, as long as the top-left pixel in this pyramid level is
+    // properly aligned, then so will the leftmost pixel in every row of the
+    // pyramid level.
+    int level_stride =
+        (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+
+    // Multiply in size_t: for very large frames the product of two ints can
+    // exceed INT_MAX, which would make the reported size (and hence the
+    // caller's allocation-limit check) wrong.
+    buffer_size += (size_t)level_stride * (size_t)padded_height;
+  }
+
+  alloc_size += buffer_size;
+
+  return alloc_size;
+}
+
+// Allocates an image pyramid for a frame of the given dimensions.
+//
+// `n_levels` is the requested number of levels; it is reduced for small
+// frames. All levels which need their own storage share one aligned buffer
+// (`buffer_alloc`). When `image_is_16bit` is false, level 0 is left
+// unconfigured here and is pointed at the source frame when the pyramid is
+// filled in.
+//
+// The returned pyramid is marked as not valid (it contains no image data
+// yet). Returns NULL if any allocation fails or, when CONFIG_MULTITHREAD is
+// enabled, if the mutex cannot be initialized; nothing is leaked in either
+// case. Release the result with aom_free_pyramid().
+ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
+                                bool image_is_16bit) {
+  // Limit number of levels on small frames
+  const int msb = get_msb(AOMMIN(width, height));
+  const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+  n_levels = AOMMIN(n_levels, max_levels);
+
+  ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr));
+  if (!pyr) {
+    return NULL;
+  }
+
+  pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers));
+  if (!pyr->layers) {
+    aom_free(pyr);
+    return NULL;
+  }
+
+  pyr->valid = false;
+  pyr->n_levels = n_levels;
+
+  // Compute sizes and offsets for each pyramid level
+  // These are gathered up first, so that we can allocate all pyramid levels
+  // in a single buffer
+  size_t buffer_size = 0;
+  size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets));
+  if (!layer_offsets) {
+    aom_free(pyr->layers);
+    aom_free(pyr);
+    return NULL;
+  }
+
+  // Work out if we need to allocate a few extra bytes for alignment.
+  // aom_memalign() will ensure that the start of the allocation is aligned
+  // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel
+  // to be aligned, not the first byte of the allocation.
+  //
+  // In the loop below, we ensure that the stride of every image is a multiple
+  // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will
+  // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the
+  // first pixel in the first pyramid layer aligned properly, that will
+  // automatically mean that the first pixel of every row of every layer is
+  // properly aligned too.
+  //
+  // Thus all we need to consider is the first pixel in the first layer.
+  // This is located at offset
+  //   extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING
+  // bytes into the buffer. Since level_stride is a multiple of
+  // PYRAMID_ALIGNMENT, we can ignore that. So we need
+  //   extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT
+  //
+  // To solve this, we can round PYRAMID_PADDING up to the next multiple
+  // of PYRAMID_ALIGNMENT, then subtract the original value to calculate
+  // how many extra bytes are needed.
+  size_t first_px_offset =
+      (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+  size_t extra_bytes = first_px_offset - PYRAMID_PADDING;
+  buffer_size += extra_bytes;
+
+  // If the original image is stored in an 8-bit buffer, then we can point the
+  // lowest pyramid level at that buffer rather than allocating a new one.
+  int first_allocated_level = image_is_16bit ? 0 : 1;
+
+  for (int level = first_allocated_level; level < n_levels; level++) {
+    PyramidLayer *layer = &pyr->layers[level];
+
+    int level_width = width >> level;
+    int level_height = height >> level;
+
+    // Allocate padding for each layer
+    int padded_width = level_width + 2 * PYRAMID_PADDING;
+    int padded_height = level_height + 2 * PYRAMID_PADDING;
+
+    // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT
+    // This ensures that, as long as the top-left pixel in this pyramid level is
+    // properly aligned, then so will the leftmost pixel in every row of the
+    // pyramid level.
+    int level_stride =
+        (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+
+    // All size/offset arithmetic is done in size_t: for very large frames
+    // the product of two ints can exceed INT_MAX.
+    size_t level_alloc_start = buffer_size;
+    size_t level_start = level_alloc_start +
+                         (size_t)PYRAMID_PADDING * (size_t)level_stride +
+                         PYRAMID_PADDING;
+
+    buffer_size += (size_t)level_stride * (size_t)padded_height;
+
+    layer_offsets[level] = level_start;
+    layer->width = level_width;
+    layer->height = level_height;
+    layer->stride = level_stride;
+  }
+
+  pyr->buffer_alloc =
+      aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc));
+  if (!pyr->buffer_alloc) {
+    aom_free(pyr->layers);
+    aom_free(pyr);
+    aom_free(layer_offsets);
+    return NULL;
+  }
+
+  // Fill in pointers for each level
+  // If image is 8-bit, then the lowest level is left unconfigured for now,
+  // and will be set up properly when the pyramid is filled in
+  for (int level = first_allocated_level; level < n_levels; level++) {
+    PyramidLayer *layer = &pyr->layers[level];
+    layer->buffer = pyr->buffer_alloc + layer_offsets[level];
+  }
+
+  // The offsets were only needed to set up the layer pointers
+  aom_free(layer_offsets);
+
+#if CONFIG_MULTITHREAD
+  // Every other pyramid function locks this mutex unconditionally, so a
+  // pyramid whose mutex could not be initialized must not be handed out.
+  if (pthread_mutex_init(&pyr->mutex, NULL) != 0) {
+    aom_free(pyr->buffer_alloc);
+    aom_free(pyr->layers);
+    aom_free(pyr);
+    return NULL;
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  return pyr;
+}
+
+// Extend a pyramid level into its border by replicating the edge pixels.
+//
+// The image area must already be filled in, since the border is derived from
+// it. `img_buf` must point at the first pixel of the image area (not at the
+// start of the padded allocation); the callers in this file pass the
+// `buffer` pointer of the corresponding PyramidLayer. PYRAMID_PADDING pixels
+// are written on every side.
+static INLINE void fill_border(uint8_t *img_buf, const int width,
+                               const int height, const int stride) {
+  const int padded_width = width + 2 * PYRAMID_PADDING;
+
+  // Left and right borders: repeat the first / last pixel of each row
+  for (int y = 0; y < height; y++) {
+    uint8_t *const line = img_buf + y * stride;
+    memset(line - PYRAMID_PADDING, line[0], PYRAMID_PADDING);
+    memset(line + width, line[width - 1], PYRAMID_PADDING);
+  }
+
+  // Top border: repeat the first row, including the left/right borders
+  // which were filled in above
+  const uint8_t *const top_line = img_buf - PYRAMID_PADDING;
+  for (int above = PYRAMID_PADDING; above > 0; above--) {
+    memcpy(img_buf - above * stride - PYRAMID_PADDING, top_line, padded_width);
+  }
+
+  // Bottom border: likewise, repeat the last row
+  const uint8_t *const bottom_line =
+      img_buf + (height - 1) * stride - PYRAMID_PADDING;
+  for (int below = 0; below < PYRAMID_PADDING; below++) {
+    memcpy(img_buf + (height + below) * stride - PYRAMID_PADDING, bottom_line,
+           padded_width);
+  }
+}
+
+// Build every level of the pyramid for `frame`.
+//
+// Level 0 is an 8-bit view of the luma plane: for frames stored in a 16-bit
+// buffer the samples are shifted down by (bit_depth - 8) into the pyramid's
+// own buffer, otherwise level 0 simply points at the frame's luma buffer.
+// Each further level is produced from the one before it by
+// av1_resize_plane().
+//
+// Returns false if av1_resize_plane() reports failure, true otherwise.
+// This must only be called while holding frame_pyr->mutex.
+static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+                                ImagePyramid *frame_pyr) {
+  const int num_levels = frame_pyr->n_levels;
+  const int src_width = frame->y_crop_width;
+  const int src_height = frame->y_crop_height;
+  const int src_stride = frame->y_stride;
+  assert((src_width >> num_levels) >= 0);
+  assert((src_height >> num_levels) >= 0);
+
+  PyramidLayer *const base = &frame_pyr->layers[0];
+  if (!(frame->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    // 8-bit storage: no copy is needed, so just configure the first layer
+    // to refer to the original image buffer
+    base->buffer = frame->y_buffer;
+    base->width = src_width;
+    base->height = src_height;
+    base->stride = src_stride;
+  } else {
+    // 16-bit storage: downconvert into the buffer reserved for level 0
+    assert(base->width == src_width);
+    assert(base->height == src_height);
+
+    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(frame->y_buffer);
+    uint8_t *const dst8 = base->buffer;
+    const int dst_stride = base->stride;
+    const int shift = bit_depth - 8;
+
+    for (int row = 0; row < src_height; row++) {
+      const uint16_t *const in = src16 + row * src_stride;
+      uint8_t *const out = dst8 + row * dst_stride;
+      for (int col = 0; col < src_width; col++) {
+        out[col] = in[col] >> shift;
+      }
+    }
+
+    fill_border(dst8, src_width, src_height, dst_stride);
+  }
+
+  // Produce each remaining level by downsampling the level before it.
+  //
+  // The ratio is always exactly 2: the source region passed to the resizer
+  // is twice the size of the destination level, which clips the rightmost
+  // column / bottommost row of the previous level when its size is odd.
+  // There are two main reasons for this:
+  //
+  // 1) In the disflow code, when stepping from a higher pyramid level to a
+  //    lower pyramid level, we need to not just interpolate the flow field
+  //    but also to scale each flow vector by the upsampling ratio.
+  //    So it is much more convenient if this ratio is simply 2.
+  //
+  // 2) Up/downsampling by a factor of 2 can be implemented much more
+  //    efficiently than up/downsampling by a generic ratio.
+  //    TODO(rachelbarker): Use optimized downsample-by-2 function
+  for (int level = 1; level < num_levels; ++level) {
+    const PyramidLayer *const coarser_src = &frame_pyr->layers[level - 1];
+    const PyramidLayer *const target = &frame_pyr->layers[level];
+
+    const bool resized = av1_resize_plane(
+        coarser_src->buffer, target->height << 1, target->width << 1,
+        coarser_src->stride, target->buffer, target->height, target->width,
+        target->stride);
+    if (!resized) return false;
+
+    fill_border(target->buffer, target->width, target->height, target->stride);
+  }
+  return true;
+}
+
+// Fill out a downsampling pyramid for a given frame, unless that has
+// already been done.
+//
+// The top level (index 0) will always be an 8-bit copy of the input frame,
+// regardless of the input bit depth. Additional levels are then downscaled
+// by powers of 2.
+//
+// For small input frames, the number of levels actually constructed
+// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE
+// pixels along each side.
+//
+// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
+// we will still construct the top level.
+//
+// Returns the resulting state of the pyramid's `valid` flag: true if the
+// pyramid was already valid or has just been computed successfully, false
+// if computing it failed.
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+                         ImagePyramid *pyr) {
+  assert(pyr);
+
+  // Per the comments in the ImagePyramid struct, the mutex must be held
+  // whenever the "valid" flag is read or written, and for the whole time the
+  // pyramid is being computed, so that simultaneous calls from several
+  // threads behave properly
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pyr->mutex);
+#endif  // CONFIG_MULTITHREAD
+
+  bool is_valid = pyr->valid;
+  if (!is_valid) {
+    is_valid = fill_pyramid(frame, bit_depth, pyr);
+    pyr->valid = is_valid;
+  }
+
+  // If the result is true, the pyramid contents can from now on be read
+  // without holding the mutex
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pyr->mutex);
+#endif  // CONFIG_MULTITHREAD
+
+  return is_valid;
+}
+
+#ifndef NDEBUG
+// Report whether a pyramid currently holds computed data.
+// This is mostly a debug helper: because pyr->mutex has to be held while
+// the valid flag is read, it is not correct to write
+//   assert(pyr->valid);
+// directly. Write this instead:
+//   assert(aom_is_pyramid_valid(pyr));
+bool aom_is_pyramid_valid(ImagePyramid *pyr) {
+  assert(pyr);
+
+#if CONFIG_MULTITHREAD
+  // Per the comments in the ImagePyramid struct, the "valid" flag may only
+  // be read with the mutex held, so take it just for the duration of the read
+  pthread_mutex_lock(&pyr->mutex);
+  const bool is_valid = pyr->valid;
+  pthread_mutex_unlock(&pyr->mutex);
+  return is_valid;
+#else
+  return pyr->valid;
+#endif  // CONFIG_MULTITHREAD
+}
+#endif
+
+// Clear the valid flag of a pyramid, so that it is recomputed the next time
+// it is needed. This must be done whenever the corresponding frame buffer
+// is reused. Passing NULL is allowed and does nothing.
+void aom_invalidate_pyramid(ImagePyramid *pyr) {
+  if (!pyr) return;
+
+  // The valid flag may only be written with the mutex held
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pyr->mutex);
+#endif  // CONFIG_MULTITHREAD
+
+  pyr->valid = false;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pyr->mutex);
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Destroy a pyramid: its mutex (when CONFIG_MULTITHREAD is enabled), the
+// shared pixel buffer, the layer array and the struct itself.
+// Passing NULL is allowed and does nothing.
+void aom_free_pyramid(ImagePyramid *pyr) {
+  if (!pyr) return;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_destroy(&pyr->mutex);
+#endif  // CONFIG_MULTITHREAD
+
+  aom_free(pyr->buffer_alloc);
+  aom_free(pyr->layers);
+  aom_free(pyr);
+}
diff --git a/third_party/aom/aom_dsp/pyramid.h b/third_party/aom/aom_dsp/pyramid.h
new file mode 100644
index 0000000000..9442a1ff08
--- /dev/null
+++ b/third_party/aom/aom_dsp/pyramid.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PYRAMID_H_
+#define AOM_AOM_DSP_PYRAMID_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Minimum dimensions of a downsampled image
+#define MIN_PYRAMID_SIZE_LOG2 3
+#define MIN_PYRAMID_SIZE (1 << MIN_PYRAMID_SIZE_LOG2)
+
+// Size of border around each pyramid image, in pixels
+// Similarly to the border around regular image buffers, this border is filled
+// with copies of the outermost pixels of the frame, to allow for more efficient
+// convolution code
+// TODO(rachelbarker): How many pixels do we actually need here?
+// I think we only need 9 for disflow, but how many for corner matching?
+#define PYRAMID_PADDING 16
+
+// Byte alignment of each line within the image pyramids.
+// That is, the first pixel inside the image (ie, not in the border region),
+// on each row of each pyramid level, is aligned to this byte alignment.
+// This value must be a power of 2.
+#define PYRAMID_ALIGNMENT 32
+
+// A single level of an image pyramid
+typedef struct {
+ // Pointer to the first pixel of the image area of this level, ie. past the
+ // top and left border. For level 0 of a frame stored in an 8-bit buffer,
+ // this points directly at the frame's own luma buffer.
+ uint8_t *buffer;
+ // Size of the image area in pixels, not counting the border
+ int width;
+ int height;
+ // Offset, in bytes, from the start of one row of `buffer` to the start of
+ // the next row
+ int stride;
+} PyramidLayer;
+
+// Struct for an image pyramid
+//
+// Created by aom_alloc_pyramid() and released by aom_free_pyramid().
+typedef struct image_pyramid {
+#if CONFIG_MULTITHREAD
+ // Mutex which is used to prevent the pyramid being computed twice at the
+ // same time
+ //
+ // Semantics:
+ // * This mutex must be held whenever reading or writing the `valid` flag
+ //
+ // * This mutex must also be held while computing the image pyramid,
+ // to ensure that only one thread may do so at a time.
+ //
+ // * However, once you have read the valid flag and seen a true value,
+ // it is safe to drop the mutex and read from the remaining fields.
+ // This is because, once the image pyramid is computed, its contents
+ // will not be changed until the parent frame buffer is recycled,
+ // which will not happen until there are no more outstanding references
+ // to the frame buffer.
+ pthread_mutex_t mutex;
+#endif
+ // Flag indicating whether the pyramid contains valid data
+ // Starts out false; set by aom_compute_pyramid() and cleared again by
+ // aom_invalidate_pyramid()
+ bool valid;
+ // Number of allocated/filled levels in this pyramid
+ // This may be lower than the number requested at allocation time, as the
+ // level count is limited for small frames
+ int n_levels;
+ // Pointer to allocated buffer
+ // A single allocation, aligned to PYRAMID_ALIGNMENT, which is shared by all
+ // levels that have storage of their own
+ uint8_t *buffer_alloc;
+ // Data for each level
+ // The `buffer` pointers inside this array point into the region which
+ // is stored in the `buffer_alloc` field here
+ // (except for level 0 of an 8-bit frame, which points at the frame itself)
+ PyramidLayer *layers;
+} ImagePyramid;
+
+size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels,
+ bool image_is_16bit);
+
+ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
+ bool image_is_16bit);
+
+// Fill out a downsampling pyramid for a given frame.
+//
+// The top level (index 0) will always be an 8-bit copy of the input frame,
+// regardless of the input bit depth. Additional levels are then downscaled
+// by powers of 2.
+//
+// For small input frames, the number of levels actually constructed
+// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE
+// pixels along each side.
+//
+// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
+// we will still construct the top level.
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+ ImagePyramid *pyr);
+
+#ifndef NDEBUG
+// Check if a pyramid has already been computed.
+// This is mostly a debug helper - as it is necessary to hold pyr->mutex
+// while reading the valid flag, we cannot just write:
+// assert(pyr->valid);
+// This function allows the check to be correctly written as:
+// assert(aom_is_pyramid_valid(pyr));
+bool aom_is_pyramid_valid(ImagePyramid *pyr);
+#endif
+
+// Mark a pyramid as no longer containing valid data.
+// This must be done whenever the corresponding frame buffer is reused
+void aom_invalidate_pyramid(ImagePyramid *pyr);
+
+// Release the memory associated with a pyramid
+void aom_free_pyramid(ImagePyramid *pyr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_PYRAMID_H_
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
new file mode 100644
index 0000000000..e5c960b826
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+#if !CONFIG_REALTIME_ONLY
+// Quantizes `n_coeffs` coefficients from `coeff_ptr`, writing the quantized
+// values to `qcoeff_ptr`, the dequantized values to `dqcoeff_ptr`, and the
+// end-of-block position (scan index of the last nonzero quantized
+// coefficient plus one, or 0 if there is none) to `*eob_ptr`.
+//
+// The two-entry tables (`zbin_ptr`, `round_ptr`, `quant_ptr`,
+// `quant_shift_ptr`, `dequant_ptr`) are indexed by `rc != 0`: entry 0 is used
+// for raster position 0 and entry 1 for every other position.
+// `qm_ptr` / `iqm_ptr` are optional per-position weights; when NULL, a flat
+// weight of (1 << AOM_QM_BITS) is used. `iscan` is unused.
+//
+// Compared to aom_quantize_b_helper_c(), the zero bin used when trimming
+// trailing coefficients is widened by an amount derived from the dequantizer
+// and EOB_FACTOR.
+void aom_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ // Zero-bin thresholds, scaled down by log_scale, and their negations
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ // Any coefficient which is not explicitly quantized below stays zero
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Extra margin added to the zero bin during the pre-scan
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ // Walk backwards through the scan order, dropping trailing coefficients
+ // which lie strictly inside the widened zero bin. Stop at the first one
+ // which does not.
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ non_zero_count--;
+ else
+ break;
+ }
+
+ // Quantization pass: All coefficients with scan index >= non_zero_count are
+ // skippable. Note: non_zero_count can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ // Scan index of the first nonzero quantized coefficient, or -1 if none yet
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ // coeff_sign is the value AOMSIGN() returns for coeff; the xor/subtract
+ // pairs below use it to take the absolute value and later to restore the
+ // sign
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ // Coefficients inside the (unwidened) zero bin are left at zero
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ // Add the rounding offset, limiting the result to the int16_t range
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ // Dequantizer for this position: dequant * iwt, rounded and scaled back
+ // down by AOM_QM_BITS
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) {
+ eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = i;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ // If exactly one coefficient quantized to a nonzero value, and that value
+ // is +/-1, re-test it against a zero bin widened by the larger factor
+ // (EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST). If it falls inside, drop it, so
+ // that the whole block becomes zero.
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ // eob is the scan index of the last nonzero coefficient, or -1 if none
+ *eob_ptr = eob + 1;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Quantizes `n_coeffs` coefficients from `coeff_ptr`, writing the quantized
+// values to `qcoeff_ptr`, the dequantized values to `dqcoeff_ptr`, and the
+// end-of-block position (scan index of the last nonzero quantized
+// coefficient plus one, or 0 if there is none) to `*eob_ptr`.
+//
+// The two-entry tables (`zbin_ptr`, `round_ptr`, `quant_ptr`,
+// `quant_shift_ptr`, `dequant_ptr`) are indexed by `rc != 0`: entry 0 is used
+// for raster position 0 and entry 1 for every other position.
+// `qm_ptr` / `iqm_ptr` are optional per-position weights; when NULL, a flat
+// weight of (1 << AOM_QM_BITS) is used. `iscan` is unused.
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+ const int log_scale) {
+ // Zero-bin thresholds, scaled down by log_scale, and their negations
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ // Any coefficient which is not explicitly quantized below stays zero
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ // Walk backwards through the scan order, dropping trailing coefficients
+ // which lie strictly inside the zero bin. Stop at the first one which does
+ // not.
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ non_zero_count--;
+ else
+ break;
+ }
+
+ // Quantization pass: All coefficients with scan index >= non_zero_count are
+ // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ // coeff_sign is the value AOMSIGN() returns for coeff; the xor/subtract
+ // pairs below use it to take the absolute value and later to restore the
+ // sign
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ // Coefficients inside the zero bin are left at zero
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ // Add the rounding offset, limiting the result to the int16_t range
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ // Dequantizer for this position: dequant * iwt, rounded and scaled back
+ // down by AOM_QM_BITS
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) eob = i;
+ }
+ }
+ // eob is the scan index of the last nonzero coefficient, or -1 if none
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+// High bit depth counterpart of aom_quantize_b_adaptive_helper_c().
+//
+// Quantizes `n_coeffs` coefficients from `coeff_ptr`, writing the quantized
+// values to `qcoeff_ptr`, the dequantized values to `dqcoeff_ptr`, and the
+// end-of-block position (scan index of the last nonzero quantized
+// coefficient plus one, or 0 if there is none) to `*eob_ptr`.
+//
+// The two-entry tables are indexed by `rc != 0`: entry 0 is used for raster
+// position 0 and entry 1 for every other position. `qm_ptr` / `iqm_ptr` are
+// optional per-position weights; when NULL, a flat weight of
+// (1 << AOM_QM_BITS) is used. `iscan` is unused.
+//
+// Unlike the low bit depth version, the rounded coefficient is not clamped
+// to the int16_t range; the intermediate values are carried in 64 bits.
+void aom_highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ // Zero-bin thresholds, scaled down by log_scale, and their negations
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ (void)iscan;
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+
+ // Any coefficient which is not explicitly quantized below stays zero
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Extra margin added to the zero bin during the pre-scan
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ // Walk backwards through the scan order, dropping trailing coefficients
+ // which lie strictly inside the widened zero bin. Stop at the first one
+ // which does not.
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ non_zero_count--;
+ else
+ break;
+ }
+
+ // Quantization pass: All coefficients with scan index >= non_zero_count are
+ // skippable. Note: non_zero_count can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ // Scan index of the first nonzero quantized coefficient, or -1 if none yet
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ // coeff_sign is the value AOMSIGN() returns for coeff; the xor/subtract
+ // pairs below use it to take the absolute value and later to restore the
+ // sign
+ const int coeff_sign = AOMSIGN(coeff);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ // Coefficients inside the (unwidened) zero bin are left at zero
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ // tmp1: magnitude plus rounding offset; tmpw: the same after weighting
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ // Dequantizer for this position: dequant * iwt, rounded and scaled back
+ // down by AOM_QM_BITS
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) {
+ eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = eob;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ // If exactly one coefficient quantized to a nonzero value, and that value
+ // is +/-1, re-test it against a zero bin widened by the larger factor
+ // (EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST). If it falls inside, drop it, so
+ // that the whole block becomes zero.
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ // eob is the scan index of the last nonzero coefficient, or -1 if none
+ *eob_ptr = eob + 1;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// High bit depth counterpart of aom_quantize_b_helper_c().
+//
+// Quantizes `n_coeffs` coefficients from `coeff_ptr`, writing the quantized
+// values to `qcoeff_ptr`, the dequantized values to `dqcoeff_ptr`, and the
+// end-of-block position (scan index of the last nonzero quantized
+// coefficient plus one, or 0 if there is none) to `*eob_ptr`.
+//
+// The two-entry tables are indexed by `rc != 0`: entry 0 is used for raster
+// position 0 and entry 1 for every other position. `qm_ptr` / `iqm_ptr` are
+// optional per-position weights; when NULL, a flat weight of
+// (1 << AOM_QM_BITS) is used. `iscan` is unused.
+//
+// Rather than only trimming trailing coefficients, this version tests every
+// coefficient against the zero bin up front, and then quantizes only those
+// which fall outside it.
+void aom_highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ int i, eob = -1;
+ // Zero-bin thresholds, scaled down by log_scale, and their negations
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int dequant;
+ // Scan indices of the coefficients selected for quantization
+ // NOTE(review): writes to idx_arr are not bounds-checked, so this relies on
+ // n_coeffs <= 4096 - confirm that all callers guarantee this
+ int idx_arr[4096];
+ (void)iscan;
+ int idx = 0;
+
+ // Any coefficient which is not explicitly quantized below stays zero
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+ coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ // coeff_sign is the value AOMSIGN() returns for coeff; the xor/subtract
+ // pairs below use it to take the absolute value and later to restore the
+ // sign
+ const int coeff_sign = AOMSIGN(coeff);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ // tmp1: magnitude plus rounding offset; tmpw: the same after weighting
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ // Dequantizer for this position: dequant * iwt, rounded and scaled back
+ // down by AOM_QM_BITS
+ dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ // idx_arr is in increasing scan order, so the last assignment wins
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ // eob is the scan index of the last nonzero coefficient, or -1 if none
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+/* These functions should only be called when quantisation matrices
+ are not used. */
+// Adaptive quantizer without quantisation matrices: forwards all arguments to
+// aom_quantize_b_adaptive_helper_c() with qm_ptr = iqm_ptr = NULL and
+// log_scale = 0.
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 0);
+}
+
+// 32x32 adaptive quantizer without quantisation matrices: forwards all
+// arguments to aom_quantize_b_adaptive_helper_c() with qm_ptr = iqm_ptr = NULL
+// and log_scale = 1.
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 1);
+}
+
+// 64x64 adaptive quantizer without quantisation matrices: forwards all
+// arguments to aom_quantize_b_adaptive_helper_c() with qm_ptr = iqm_ptr = NULL
+// and log_scale = 2.
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 2);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth adaptive quantizer without quantisation matrices: forwards
+// all arguments to aom_highbd_quantize_b_adaptive_helper_c() with
+// qm_ptr = iqm_ptr = NULL and log_scale = 0.
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+// High-bitdepth 32x32 adaptive quantizer without quantisation matrices:
+// forwards all arguments to aom_highbd_quantize_b_adaptive_helper_c() with
+// qm_ptr = iqm_ptr = NULL and log_scale = 1.
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+// High-bitdepth 64x64 adaptive quantizer without quantisation matrices:
+// forwards all arguments to aom_highbd_quantize_b_adaptive_helper_c() with
+// qm_ptr = iqm_ptr = NULL and log_scale = 2.
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+// Quantizer without quantisation matrices: forwards all arguments to
+// aom_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL and log_scale = 0.
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+// 32x32 quantizer without quantisation matrices: forwards all arguments to
+// aom_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL and log_scale = 1.
+void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+// 64x64 quantizer without quantisation matrices: forwards all arguments to
+// aom_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL and log_scale = 2.
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth quantizer without quantisation matrices: forwards all
+// arguments to aom_highbd_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL
+// and log_scale = 0.
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 0);
+}
+
+// High-bitdepth 32x32 quantizer without quantisation matrices: forwards all
+// arguments to aom_highbd_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL
+// and log_scale = 1.
+void aom_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+// High-bitdepth 64x64 quantizer without quantisation matrices: forwards all
+// arguments to aom_highbd_quantize_b_helper_c() with qm_ptr = iqm_ptr = NULL
+// and log_scale = 2.
+void aom_highbd_quantize_b_64x64_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
new file mode 100644
index 0000000000..efe253ddb9
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_QUANTIZE_H_
+#define AOM_AOM_DSP_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
+void aom_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+ const int log_scale);
+
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/recenter.h b/third_party/aom/aom_dsp/recenter.h
new file mode 100644
index 0000000000..b3fd412907
--- /dev/null
+++ b/third_party/aom/aom_dsp/recenter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECENTER_H_
+#define AOM_AOM_DSP_RECENTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Undoes recentering of the non-negative code v around the reference r.
+// Codes above 2 * r are returned unchanged; otherwise an even code maps to
+// r + v / 2 and an odd code maps to r - (v + 1) / 2.
+static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1)) return v;
+  return (v & 1) ? r - ((v + 1) >> 1) : r + (v >> 1);
+}
+
+// Undoes recentering of a code v in [0, n-1] around a reference r that is
+// also in [0, n-1]. When 2 * r exceeds n the problem is mirrored about n - 1
+// before and after calling inv_recenter_nonneg().
+static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                                  uint16_t v) {
+  if ((r << 1) > n) {
+    const uint16_t mirrored_r = (uint16_t)(n - 1 - r);
+    return n - 1 - inv_recenter_nonneg(mirrored_r, v);
+  }
+  return inv_recenter_nonneg(r, v);
+}
+
+// Recenters the non-negative literal v around the reference r. Values above
+// 2 * r are returned unchanged; otherwise v >= r maps to the even code
+// 2 * (v - r) and v < r maps to the odd code 2 * (r - v) - 1.
+static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+  const int delta = v - r;
+  if (v > (r << 1)) return v;
+  return delta >= 0 ? (delta << 1) : ((-delta) << 1) - 1;
+}
+
+// Recenters a literal v in [0, n-1] around a reference r that is also in
+// [0, n-1]. When 2 * r exceeds n both values are mirrored about n - 1 before
+// calling recenter_nonneg().
+static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                              uint16_t v) {
+  if ((r << 1) > n) {
+    const uint16_t mirrored_r = (uint16_t)(n - 1 - r);
+    const uint16_t mirrored_v = (uint16_t)(n - 1 - v);
+    return recenter_nonneg(mirrored_r, mirrored_v);
+  }
+  return recenter_nonneg(r, v);
+}
+
+#endif // AOM_AOM_DSP_RECENTER_H_
diff --git a/third_party/aom/aom_dsp/rect.h b/third_party/aom/aom_dsp/rect.h
new file mode 100644
index 0000000000..11bdaca979
--- /dev/null
+++ b/third_party/aom/aom_dsp/rect.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECT_H_
+#define AOM_AOM_DSP_RECT_H_
+
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+
+// A rectangle of pixels.
+// Both axes are inclusive-exclusive, i.e. the point (top, left) lies inside
+// the rectangle but (bottom, right) does not.
+typedef struct {
+  int left;
+  int right;
+  int top;
+  int bottom;
+} PixelRect;
+
+// Horizontal extent of r, computed as right minus left.
+static INLINE int rect_width(const PixelRect *r) {
+  const int width = r->right - r->left;
+  return width;
+}
+
+// Vertical extent of r, computed as bottom minus top.
+static INLINE int rect_height(const PixelRect *r) {
+  const int height = r->bottom - r->top;
+  return height;
+}
+
+// Returns true when (x, y) satisfies left <= x < right and top <= y < bottom.
+static INLINE bool is_inside_rect(const int x, const int y,
+                                  const PixelRect *r) {
+  const bool in_columns = x >= r->left && x < r->right;
+  const bool in_rows = y >= r->top && y < r->bottom;
+  return in_columns && in_rows;
+}
+
+#endif // AOM_AOM_DSP_RECT_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
new file mode 100644
index 0000000000..8d69e3bf1c
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Accumulates |a - b| over a width x height block. Each buffer advances by
+ * its own stride after every row. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int width, int height) {
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int diff = a[col] - b[col];
+      total += abs(diff);
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total;
+}
+
+// Defines the C SAD functions for an m x n block:
+//  - aom_sad{m}x{n}_c: plain SAD between src and ref.
+//  - aom_sad{m}x{n}_avg_c: SAD between src and the m x n prediction that
+//    aom_comp_avg_pred() builds from second_pred and ref.
+//  - aom_dist_wtd_sad{m}x{n}_avg_c: same, with the prediction built by
+//    aom_dist_wtd_comp_avg_pred_c() using jcp_param.
+//  - aom_sad_skip_{m}x{n}_c: SAD over n / 2 rows with both strides doubled
+//    (every other row), with the result multiplied by 2.
+// The *_avg_c variants keep the prediction in an m * n stack buffer whose
+// stride is m.
+#define SADMXN(m, n) \
+ unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint8_t comp_pred[m * n]; \
+ aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \
+ ref_stride, jcp_param); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+ }
+
+// Calculate sad against 4 reference locations and store each in sad_array.
+// Defines aom_sad{m}x{n}x4d_c, which calls aom_sad{m}x{n}_c once per
+// reference, and aom_sad_skip_{m}x{n}x4d_c, which computes the SAD over
+// n / 2 rows with both strides doubled and multiplies each result by 2.
+#define SAD_MXNX4D(m, n) \
+ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = \
+ aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+ } \
+ } \
+ void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
+ }
+// Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+// aom_sad{m}x{n}x3d_c simply forwards to aom_sad{m}x{n}x4d (no _c suffix).
+// NOTE(review): the C x4d implementation reads all four ref_array entries and
+// writes all four sad_array entries, so callers presumably must supply four
+// valid references even for the 3d call - confirm against the callers.
+#define SAD_MXNX3D(m, n) \
+ void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \
+ }
+
+// 128x128
+SADMXN(128, 128)
+SAD_MXNX4D(128, 128)
+SAD_MXNX3D(128, 128)
+
+// 128x64
+SADMXN(128, 64)
+SAD_MXNX4D(128, 64)
+SAD_MXNX3D(128, 64)
+
+// 64x128
+SADMXN(64, 128)
+SAD_MXNX4D(64, 128)
+SAD_MXNX3D(64, 128)
+
+// 64x64
+SADMXN(64, 64)
+SAD_MXNX4D(64, 64)
+SAD_MXNX3D(64, 64)
+
+// 64x32
+SADMXN(64, 32)
+SAD_MXNX4D(64, 32)
+SAD_MXNX3D(64, 32)
+
+// 32x64
+SADMXN(32, 64)
+SAD_MXNX4D(32, 64)
+SAD_MXNX3D(32, 64)
+
+// 32x32
+SADMXN(32, 32)
+SAD_MXNX4D(32, 32)
+SAD_MXNX3D(32, 32)
+
+// 32x16
+SADMXN(32, 16)
+SAD_MXNX4D(32, 16)
+SAD_MXNX3D(32, 16)
+
+// 16x32
+SADMXN(16, 32)
+SAD_MXNX4D(16, 32)
+SAD_MXNX3D(16, 32)
+
+// 16x16
+SADMXN(16, 16)
+SAD_MXNX4D(16, 16)
+SAD_MXNX3D(16, 16)
+
+// 16x8
+SADMXN(16, 8)
+SAD_MXNX4D(16, 8)
+SAD_MXNX3D(16, 8)
+
+// 8x16
+SADMXN(8, 16)
+SAD_MXNX4D(8, 16)
+SAD_MXNX3D(8, 16)
+
+// 8x8
+SADMXN(8, 8)
+SAD_MXNX4D(8, 8)
+SAD_MXNX3D(8, 8)
+
+// 8x4
+SADMXN(8, 4)
+SAD_MXNX4D(8, 4)
+SAD_MXNX3D(8, 4)
+
+// 4x8
+SADMXN(4, 8)
+SAD_MXNX4D(4, 8)
+SAD_MXNX3D(4, 8)
+
+// 4x4
+SADMXN(4, 4)
+SAD_MXNX4D(4, 4)
+SAD_MXNX3D(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
+SADMXN(4, 16)
+SAD_MXNX4D(4, 16)
+SADMXN(16, 4)
+SAD_MXNX4D(16, 4)
+SADMXN(8, 32)
+SAD_MXNX4D(8, 32)
+SADMXN(32, 8)
+SAD_MXNX4D(32, 8)
+SADMXN(16, 64)
+SAD_MXNX4D(16, 64)
+SADMXN(64, 16)
+SAD_MXNX4D(64, 16)
+SAD_MXNX3D(4, 16)
+SAD_MXNX3D(16, 4)
+SAD_MXNX3D(8, 32)
+SAD_MXNX3D(32, 8)
+SAD_MXNX3D(16, 64)
+SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Sum of absolute differences between two width x height blocks of 16-bit
+// samples. a8 and b8 are converted with CONVERT_TO_SHORTPTR() before use and
+// the strides are applied to the converted uint16_t pointers.
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int width, int height) {
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int diff = a_row[col] - b_row[col];
+      total += abs(diff);
+    }
+    a_row += a_stride;
+    b_row += b_stride;
+  }
+  return total;
+}
+
+// Performs exactly the same computation as highbd_sad(); the separate name is
+// kept for the compound-prediction (*_avg_c) functions that call it.
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+                                       const uint8_t *b8, int b_stride,
+                                       int width, int height) {
+  return highbd_sad(a8, a_stride, b8, b_stride, width, height);
+}
+
+// Defines the high-bitdepth C SAD functions for an m x n block:
+//  - aom_highbd_sad{m}x{n}_c: plain SAD between src and ref.
+//  - aom_highbd_sad{m}x{n}_avg_c: SAD between src and the prediction that
+//    aom_highbd_comp_avg_pred() builds from second_pred and ref.
+//  - aom_highbd_dist_wtd_sad{m}x{n}_avg_c: same, with the prediction built by
+//    aom_highbd_dist_wtd_comp_avg_pred() using jcp_param.
+//  - aom_highbd_sad_skip_{m}x{n}_c: SAD over n / 2 rows with both strides
+//    doubled (every other row), with the result multiplied by 2.
+// The *_avg_c variants keep the prediction in an m * n uint16_t stack buffer
+// (stride m) that is passed on through CONVERT_TO_BYTEPTR().
+#define HIGHBD_SADMXN(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \
+ aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \
+ return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \
+ } \
+ unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t comp_pred[m * n]; \
+ uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \
+ aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, \
+ ref_stride, jcp_param); \
+ return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \
+ } \
+ unsigned int aom_highbd_sad_skip_##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+ }
+
+// Calculate the high-bitdepth sad against 4 reference locations and store
+// each in sad_array. Defines aom_highbd_sad{m}x{n}x4d_c, which calls
+// aom_highbd_sad{m}x{n}_c once per reference, and
+// aom_highbd_sad_skip_{m}x{n}x4d_c, which computes the SAD over n / 2 rows
+// with both strides doubled and multiplies each result by 2.
+#define HIGHBD_SAD_MXNX4D(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ } \
+ void aom_highbd_sad_skip_##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
+ }
+// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
+// aom_highbd_sad{m}x{n}x3d_c simply forwards to aom_highbd_sad{m}x{n}x4d (no
+// _c suffix).
+// NOTE(review): the C x4d implementation reads all four ref_array entries and
+// writes all four sad_array entries, so callers presumably must supply four
+// valid references even for the 3d call - confirm against the callers.
+#define HIGHBD_SAD_MXNX3D(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+
+// 128x128
+HIGHBD_SADMXN(128, 128)
+HIGHBD_SAD_MXNX4D(128, 128)
+HIGHBD_SAD_MXNX3D(128, 128)
+
+// 128x64
+HIGHBD_SADMXN(128, 64)
+HIGHBD_SAD_MXNX4D(128, 64)
+HIGHBD_SAD_MXNX3D(128, 64)
+
+// 64x128
+HIGHBD_SADMXN(64, 128)
+HIGHBD_SAD_MXNX4D(64, 128)
+HIGHBD_SAD_MXNX3D(64, 128)
+
+// 64x64
+HIGHBD_SADMXN(64, 64)
+HIGHBD_SAD_MXNX4D(64, 64)
+HIGHBD_SAD_MXNX3D(64, 64)
+
+// 64x32
+HIGHBD_SADMXN(64, 32)
+HIGHBD_SAD_MXNX4D(64, 32)
+HIGHBD_SAD_MXNX3D(64, 32)
+
+// 32x64
+HIGHBD_SADMXN(32, 64)
+HIGHBD_SAD_MXNX4D(32, 64)
+HIGHBD_SAD_MXNX3D(32, 64)
+
+// 32x32
+HIGHBD_SADMXN(32, 32)
+HIGHBD_SAD_MXNX4D(32, 32)
+HIGHBD_SAD_MXNX3D(32, 32)
+
+// 32x16
+HIGHBD_SADMXN(32, 16)
+HIGHBD_SAD_MXNX4D(32, 16)
+HIGHBD_SAD_MXNX3D(32, 16)
+
+// 16x32
+HIGHBD_SADMXN(16, 32)
+HIGHBD_SAD_MXNX4D(16, 32)
+HIGHBD_SAD_MXNX3D(16, 32)
+
+// 16x16
+HIGHBD_SADMXN(16, 16)
+HIGHBD_SAD_MXNX4D(16, 16)
+HIGHBD_SAD_MXNX3D(16, 16)
+
+// 16x8
+HIGHBD_SADMXN(16, 8)
+HIGHBD_SAD_MXNX4D(16, 8)
+HIGHBD_SAD_MXNX3D(16, 8)
+
+// 8x16
+HIGHBD_SADMXN(8, 16)
+HIGHBD_SAD_MXNX4D(8, 16)
+HIGHBD_SAD_MXNX3D(8, 16)
+
+// 8x8
+HIGHBD_SADMXN(8, 8)
+HIGHBD_SAD_MXNX4D(8, 8)
+HIGHBD_SAD_MXNX3D(8, 8)
+
+// 8x4
+HIGHBD_SADMXN(8, 4)
+HIGHBD_SAD_MXNX4D(8, 4)
+HIGHBD_SAD_MXNX3D(8, 4)
+
+// 4x8
+HIGHBD_SADMXN(4, 8)
+HIGHBD_SAD_MXNX4D(4, 8)
+HIGHBD_SAD_MXNX3D(4, 8)
+
+// 4x4
+HIGHBD_SADMXN(4, 4)
+HIGHBD_SAD_MXNX4D(4, 4)
+HIGHBD_SAD_MXNX3D(4, 4)
+
+HIGHBD_SADMXN(4, 16)
+HIGHBD_SAD_MXNX4D(4, 16)
+HIGHBD_SADMXN(16, 4)
+HIGHBD_SAD_MXNX4D(16, 4)
+HIGHBD_SADMXN(8, 32)
+HIGHBD_SAD_MXNX4D(8, 32)
+HIGHBD_SADMXN(32, 8)
+HIGHBD_SAD_MXNX4D(32, 8)
+HIGHBD_SADMXN(16, 64)
+HIGHBD_SAD_MXNX4D(16, 64)
+HIGHBD_SADMXN(64, 16)
+HIGHBD_SAD_MXNX4D(64, 16)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_SAD_MXNX3D(4, 16)
+HIGHBD_SAD_MXNX3D(16, 4)
+HIGHBD_SAD_MXNX3D(8, 32)
+HIGHBD_SAD_MXNX3D(32, 8)
+HIGHBD_SAD_MXNX3D(16, 64)
+HIGHBD_SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
new file mode 100644
index 0000000000..f3d5847bd5
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad_av1.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+// SAD between src and the per-pixel blend of a and b. Each predicted pixel is
+// AOM_BLEND_A64(m[x], a[x], b[x]); src, a, b and the mask m each advance by
+// their own stride after every row.
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+                                      const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride, int width,
+                                      int height) {
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int16_t blended = AOM_BLEND_A64(m[col], a[col], b[col]);
+      total += abs(blended - src[col]);
+    }
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  return total;
+}
+
+// Defines the C masked SAD functions for an m x n block:
+//  - aom_masked_sad{m}x{n}_c: SAD between src and the blend of ref and
+//    second_pred under mask msk. With invert_mask == 0 ref is passed as the
+//    first blend input and second_pred as the second; otherwise the two are
+//    swapped. second_pred always uses a stride of m.
+//  - aom_masked_sad{m}x{n}x4d_c: the same computation for each of the four
+//    references in ref[], with the results stored in sads[].
+#define MASKSADMxN(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n); \
+ else \
+ return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ } \
+ void aom_masked_sad##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, unsigned sads[4]) { \
+ if (!invert_mask) \
+ for (int i = 0; i < 4; i++) { \
+ sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
+ m, msk, msk_stride, m, n); \
+ } \
+ else \
+ for (int i = 0; i < 4; i++) { \
+ sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i], \
+ ref_stride, msk, msk_stride, m, n); \
+ } \
+ }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth SAD between src8 and the per-pixel blend of a8 and b8. The
+// three sample pointers are converted with CONVERT_TO_SHORTPTR(); the mask m
+// stays 8-bit. Each predicted pixel is AOM_BLEND_A64(m[x], a[x], b[x]).
+static INLINE unsigned int highbd_masked_sad(const uint8_t *src8,
+                                             int src_stride, const uint8_t *a8,
+                                             int a_stride, const uint8_t *b8,
+                                             int b_stride, const uint8_t *m,
+                                             int m_stride, int width,
+                                             int height) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const uint16_t blended = AOM_BLEND_A64(m[col], a_row[col], b_row[col]);
+      total += abs(blended - src_row[col]);
+    }
+    src_row += src_stride;
+    a_row += a_stride;
+    b_row += b_stride;
+    m += m_stride;
+  }
+  return total;
+}
+
+// Defines aom_highbd_masked_sad{m}x{n}_c: SAD between src8 and the blend of
+// ref8 and second_pred8 under mask msk. With invert_mask == 0 ref8 is passed
+// as the first blend input and second_pred8 as the second; otherwise the two
+// are swapped. second_pred8 always uses a stride of m.
+#define HIGHBD_MASKSADMXN(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_c( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+HIGHBD_MASKSADMXN(4, 16)
+HIGHBD_MASKSADMXN(16, 4)
+HIGHBD_MASKSADMXN(8, 32)
+HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+// pre: predictor being evaluated (advances by pre_stride per row)
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+// wsrc and mask are stored contiguously: both advance by width per row. Each
+// per-pixel term |wsrc - pre * mask| is rounded down by 12 bits before being
+// accumulated.
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    int width, int height) {
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted_diff = abs(wsrc[col] - pre[col] * mask[col]);
+      total += ROUND_POWER_OF_TWO(weighted_diff, 12);
+    }
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+  return total;
+}
+
+// Defines aom_obmc_sad{m}x{n}_c, which forwards to obmc_sad() with a fixed
+// width of m and height of n.
+#define OBMCSADMxN(m, n) \
+ unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+OBMCSADMxN(4, 16)
+OBMCSADMxN(16, 4)
+OBMCSADMxN(8, 32)
+OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of obmc_sad(): pre8 is converted with
+// CONVERT_TO_SHORTPTR() and advances by pre_stride per row, while wsrc and
+// mask advance by width per row. Each per-pixel term |wsrc - pre * mask| is
+// rounded down by 12 bits before being accumulated.
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int width,
+                                           int height) {
+  const uint16_t *pre_row = CONVERT_TO_SHORTPTR(pre8);
+  unsigned int total = 0;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted_diff = abs(wsrc[col] - pre_row[col] * mask[col]);
+      total += ROUND_POWER_OF_TWO(weighted_diff, 12);
+    }
+    pre_row += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+  return total;
+}
+
+// Defines aom_highbd_obmc_sad{m}x{n}_c, which forwards to highbd_obmc_sad()
+// with a fixed width of m and height of n.
+#define HIGHBD_OBMCSADMXN(m, n) \
+ unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+HIGHBD_OBMCSADMXN(4, 16)
+HIGHBD_OBMCSADMXN(16, 4)
+HIGHBD_OBMCSADMXN(8, 32)
+HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+/* clang-format on */
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 0000000000..218a7a6186
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+/* In this fallback header the public v128 type is an alias of c_v128. */
+typedef c_v128 v128;
+
+/* Accessors and constructors. Each wrapper passes its arguments unchanged to
+ * the c_v128_* function of the same name. */
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+ return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+ return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return c_v128_from_32(a, b, c, d);
+}
+
+/* Loads, stores and alignment. Each wrapper forwards to the c_v128_* function
+ * of the same name. */
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+ return c_v128_align(a, b, c);
+}
+
+/* Zero and broadcast constructors; forwarded to the c_v128_* equivalents. */
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
+
+/* Accumulator-style reductions (sad/ssd: init, accumulate, then *_sum to
+ * read the result) and single-call reductions (dotp, hadd). The accumulator
+ * types are the c_* types themselves; every wrapper forwards to the c_v128_*
+ * function of the same name. */
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+ return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) {
+ return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
+ return c_v128_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+ return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) {
+ return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
+ return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ return c_v128_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ return c_v128_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+/* Bitwise operations; forwarded to the c_v128_* equivalents. */
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
+
+/* Lane-wise add, subtract, pairwise add and absolute value; forwarded to the
+ * c_v128_* equivalents. */
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
+SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
+SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
+SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
+
+/* Multiplies and multiply-adds; forwarded to the c_v128_* equivalents. Note
+ * that v128_mul_s16 takes two v64 inputs and returns a v128. */
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return c_v128_mullo_s16(a, b);
+}
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return c_v128_mulhi_s16(a, b);
+}
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return c_v128_mullo_s32(a, b);
+}
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
+
+/* Byte sign-bit mask extraction and byte blend; forwarded to the c_v128_*
+ * equivalents. */
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+ return c_v128_blend_8(a, b, c);
+}
+
+/* Lane-wise averages, minima and maxima; forwarded to the c_v128_*
+ * equivalents. */
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+ return c_v128_rdavg_u16(a, b);
+}
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
+
+/* Zip (interleave) and unzip (de-interleave) operations; forwarded to the
+ * c_v128_* equivalents. The v128_zip_* variants take two v64 inputs. */
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+ return c_v128_unziplo_8(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return c_v128_unziphi_8(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+ return c_v128_unziplo_16(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return c_v128_unziphi_16(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return c_v128_unziplo_32(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return c_v128_unziphi_32(a, b);
+}
+/* Widening unpacks, narrowing packs and byte shuffle; forwarded to the
+ * c_v128_* equivalents. The v128_unpack_* variants without lo/hi take a v64
+ * input. */
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return c_v128_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return c_v128_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return c_v128_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return c_v128_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return c_v128_pack_s32_s16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+ return c_v128_pack_s32_u16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return c_v128_pack_s16_u8(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return c_v128_pack_s16_s8(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return c_v128_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return c_v128_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return c_v128_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return c_v128_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
+ return c_v128_shuffle_8(a, pattern);
+}
+
+/* Lane-wise comparisons for 8-, 16- and 32-bit lanes; forwarded to the
+ * c_v128_* equivalents. */
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return c_v128_cmpgt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return c_v128_cmplt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+ return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+ return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
+/* Shifts taking the shift count c as an ordinary argument; forwarded to the
+ * c_v128_* equivalents. */
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return c_v128_shl_8(a, c);
+}
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return c_v128_shr_u8(a, c);
+}
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return c_v128_shr_s8(a, c);
+}
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return c_v128_shl_16(a, c);
+}
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return c_v128_shr_u16(a, c);
+}
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return c_v128_shr_s16(a, c);
+}
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return c_v128_shl_32(a, c);
+}
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return c_v128_shr_u32(a, c);
+}
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return c_v128_shr_s32(a, c);
+}
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+ return c_v128_shr_s64(a, c);
+}
+
+/* The *_n_* shift family (whole-vector byte shifts and per-lane shifts). In
+ * this fallback header they are ordinary functions forwarding to the
+ * c_v128_*_n_* equivalents. */
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return c_v128_shr_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return c_v128_shl_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
+ return c_v128_shl_n_8(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
+ return c_v128_shl_n_16(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
+ return c_v128_shl_n_32(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+ return c_v128_shl_n_64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
+ return c_v128_shr_n_u8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
+ return c_v128_shr_n_u16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
+ return c_v128_shr_n_u32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+ return c_v128_shr_n_u64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
+ return c_v128_shr_n_s8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
+ return c_v128_shr_n_s16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
+ return c_v128_shr_n_s32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+ return c_v128_shr_n_s64(a, n);
+}
+
+/* 16-bit SAD accumulator (a plain uint32_t here): init, accumulate, then
+ * read with v128_sad_u16_sum(). All three forward to c_v128_sad_u16*. */
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
+ return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+ return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ return c_v128_sad_u16_sum(s);
+}
+
+/* Signed 16-bit SSD accumulator (a plain uint64_t here): init, accumulate,
+ * then read with v128_ssd_s16_sum(). All three forward to c_v128_ssd_s16*. */
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) {
+ return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return c_v128_ssd_s16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 0000000000..f5ca817fb6
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Vector type of the C implementation: a union exposing the same storage as
+ * unsigned or signed lanes of 8, 16, 32 or 64 bits, or as two c_v64 values.
+ * Throughout this file v64[0] is treated as the low half and v64[1] as the
+ * high half. */
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+/* Returns 32-bit lane 0 (a.u32[0]). */
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+/* Returns the low 64-bit half (a.v64[0]). */
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+/* Returns the high 64-bit half (a.v64[1]). */
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+/* Builds a vector from two 64-bit values: lo goes to u64[0], hi to u64[1]. */
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+  c_v128 result;
+  result.u64[0] = lo;
+  result.u64[1] = hi;
+  return result;
+}
+
+/* Builds a vector from two c_v64 halves: lo goes to v64[0], hi to v64[1]. */
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+  c_v128 result;
+  result.v64[0] = lo;
+  result.v64[1] = hi;
+  return result;
+}
+
+/* Builds a vector from four 32-bit values. The arguments are given from the
+ * highest lane to the lowest: a fills u32[3] and d fills u32[0]. */
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+                                  uint32_t d) {
+  c_v128 result;
+  result.u32[0] = d;
+  result.u32[1] = c;
+  result.u32[2] = b;
+  result.u32[3] = a;
+  return result;
+}
+
+/* Copies 16 bytes from p into a new vector using memcpy, so p is not
+ * dereferenced as a wider type. */
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+  c_v128 loaded;
+  memcpy(loaded.u8, p, sizeof(loaded.u8));
+  return loaded;
+}
+
+/* Loads 16 bytes from p. When SIMD_CHECK is non-zero and p is not a multiple
+ * of 16, prints an error to stderr and aborts; otherwise behaves like
+ * c_v128_load_unaligned(). */
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+  const int misaligned = ((uintptr_t)p & 15) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+    abort();
+  }
+  return c_v128_load_unaligned(p);
+}
+
+/* Copies the 16 bytes of a to p using memcpy. */
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+  memcpy(p, a.u8, sizeof(a.u8));
+}
+
+/* Stores the 16 bytes of a to p. When SIMD_CHECK is non-zero and p is not a
+ * multiple of 16, prints an error to stderr and aborts; otherwise behaves
+ * like c_v128_store_unaligned(). */
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+  const int misaligned = ((uintptr_t)p & 15) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+    abort();
+  }
+  c_v128_store_unaligned(p, a);
+}
+
+/* Returns a vector with both 64-bit lanes set to zero. */
+SIMD_INLINE c_v128 c_v128_zero(void) {
+  c_v128 zeroed;
+  zeroed.u64[0] = 0;
+  zeroed.u64[1] = 0;
+  return zeroed;
+}
+
+/* Both halves are set to c_v64_dup_8(x). */
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+  const c_v64 half = c_v64_dup_8(x);
+  return c_v128_from_v64(half, half);
+}
+
+/* Both halves are set to c_v64_dup_16(x). */
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+  const c_v64 half = c_v64_dup_16(x);
+  return c_v128_from_v64(half, half);
+}
+
+/* Both halves are set to c_v64_dup_32(x). */
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+  const c_v64 half = c_v64_dup_32(x);
+  return c_v128_from_v64(half, half);
+}
+
+/* Both 64-bit lanes are set to x. */
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+  return c_v128_from_64(x, x);
+}
+
+/* Adds the c_v64_dotp_su8() results of the high halves and of the low
+ * halves of a and b. */
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+ return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+ c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
+/* Adds the c_v64_dotp_s16() results of the high halves and of the low
+ * halves of a and b. */
+SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
+ return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
+ c_v64_dotp_s16(a.v64[0], b.v64[0]);
+}
+
+/* Dot product of the four signed 32-bit lanes. Each lane product is computed
+ * in 64 bits, narrowed to int32_t, and then accumulated in a 64-bit sum. */
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+  int64_t total = 0;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    const int32_t product = (int32_t)((int64_t)a.s32[lane] * b.s32[lane]);
+    total += (int64_t)product;
+  }
+  return total;
+}
+
+/* Adds the c_v64_hadd_u8() results of the high and low halves of a. */
+SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
+ return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
+}
+
+/* Accumulator for c_v128_sad_u8(): val is the running sum and count is the
+ * number of c_v128_sad_u8() calls made on this accumulator. */
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad128_internal;
+
+/* Returns a SAD accumulator with the sum and the call count both zero. */
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
+  c_sad128_internal acc;
+  acc.count = 0;
+  acc.val = 0;
+  return acc;
+}
+
+/* Adds the absolute differences of the 16 unsigned byte lanes of a and b to
+ * s.val, increments s.count, and returns the updated accumulator.
+ *
+ * Implementation dependent return value. Result must be finalised with
+ * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
+ * undefined: when SIMD_CHECK is non-zero, the call that takes s.count above
+ * 32 prints an error to stderr and aborts. */
+SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
+                                            c_v128 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  if (SIMD_CHECK && s.count > 32) {
+    // The check fires on the 33rd call, so report "more than 32".
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+    abort();
+  }
+  return s;
+}
+
+/* Returns the sum held in the SAD accumulator (s.val). */
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
+
+/* Accumulator for c_v128_ssd_u8(): a plain 32-bit running sum. */
+typedef uint32_t c_ssd128_internal;
+
+/* Returns a zeroed SSD accumulator. */
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }
+
+/* Adds the squared differences of the 16 unsigned byte lanes of a and b to
+ * the accumulator s and returns it. Implementation dependent return value;
+ * the result must be finalised with v128_ssd_u8_sum(). */
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
+                                            c_v128 b) {
+  int lane;
+  for (lane = 0; lane < 16; lane++) {
+    const int diff = a.u8[lane] - b.u8[lane];
+    s += diff * diff;
+  }
+  return s;
+}
+
+/* Returns the value of the SSD accumulator unchanged. */
+SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
+
+/* Bitwise operations. Each applies the matching c_v64 operation to the high
+ * halves (v64[1]) and to the low halves (v64[0]) of a and b independently. */
+SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
+ c_v64_or(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
+ c_v64_xor(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
+ c_v64_and(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
+ c_v64_andn(a.v64[0], b.v64[0]));
+}
+
+/* Additions on 8-, 16- and 32-bit lanes. Each applies the matching c_v64
+ * operation to the high halves and to the low halves of a and b
+ * independently. */
+SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
+ c_v64_add_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
+ c_v64_add_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+ c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+ c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
+ c_v64_sadd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
+ c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+/* Adds the two 64-bit lanes of a and b with wrap-around. */
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+  c_v128 sum;
+  int half;
+  for (half = 0; half < 2; half++) {
+    const uint64_t x = a.v64[half].u64;
+    const uint64_t y = b.v64[half].u64;
+    // Two's complement overflow: when x + y would wrap (x > ~y), produce the
+    // wrapped value without an overflowing addition (silences sanitizers).
+    if (x > ~y)
+      sum.u64[half] = x - ~y - 1;
+    else
+      sum.u64[half] = x + y;
+  }
+  return sum;
+}
+
+/* Pairwise add: 32-bit lane i of the result is the sum of signed 16-bit
+ * lanes 2*i and 2*i + 1 of a. */
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+  c_v128 sums;
+  int pair;
+  for (pair = 0; pair < 4; pair++) {
+    sums.s32[pair] = (int32_t)a.s16[2 * pair] + (int32_t)a.s16[2 * pair + 1];
+  }
+  return sums;
+}
+
+/* Pairwise add: 16-bit lane i of the result is the sum of unsigned byte
+ * lanes 2*i and 2*i + 1 of a. */
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+  c_v128 sums;
+  int pair;
+  for (pair = 0; pair < 8; pair++) {
+    sums.u16[pair] = (uint16_t)a.u8[2 * pair] + (uint16_t)a.u8[2 * pair + 1];
+  }
+  return sums;
+}
+
+/* Subtractions on 8-, 16- and 32-bit lanes. Each applies the matching c_v64
+ * operation to the high halves and to the low halves of a and b
+ * independently. */
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+/* Subtracts the two 64-bit lanes of b from those of a with wrap-around. */
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+  c_v128 diff;
+  int half;
+  for (half = 0; half < 2; half++) {
+    const uint64_t x = a.v64[half].u64;
+    const uint64_t y = b.v64[half].u64;
+    // Two's complement underflow: when x - y would wrap (x < y), add the
+    // negation of y (~y + 1) rather than subtracting (silences sanitizers).
+    if (x < y)
+      diff.u64[half] = x + ~y + 1;
+    else
+      diff.u64[half] = x - y;
+  }
+  return diff;
+}
+
+/* Applies c_v64_abs_s16() to each half of a. */
+SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
+}
+
+/* Applies c_v64_abs_s8() to each half of a. */
+SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
+}
+
+/* Multiplies two c_v64 inputs into a c_v128 by zipping the 16-bit lanes of
+ * c_v64_mulhi_s16(a, b) with those of c_v64_mullo_s16(a, b).
+ * NOTE(review): presumably each 32-bit result lane is the full product of the
+ * corresponding 16-bit input lanes; confirm against the c_v64 helpers. */
+SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
+  const c_v64 low_words = c_v64_mullo_s16(a, b);
+  const c_v64 high_words = c_v64_mulhi_s16(a, b);
+  const c_v64 upper = c_v64_ziphi_16(high_words, low_words);
+  const c_v64 lower = c_v64_ziplo_16(high_words, low_words);
+  return c_v128_from_v64(upper, lower);
+}
+
+/* Multiplies and multiply-adds. Each applies the matching c_v64 operation to
+ * the high halves and to the low halves of a and b independently. */
+SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
+ c_v64_mullo_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
+ c_v64_mulhi_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
+ c_v64_mullo_s32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
+ c_v64_madd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
+ c_v64_madd_us8(a.v64[0], b.v64[0]));
+}
+
+/* Averages. Each applies the matching c_v64 operation to the high halves and
+ * to the low halves of a and b independently. */
+SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
+ c_v64_avg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
+ c_v64_avg_u16(a.v64[0], b.v64[0]));
+}
+
+/* Byte-lane minima and maxima. Each applies the matching c_v64 operation to
+ * the high halves and to the low halves of a and b independently. */
+SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
+ c_v64_min_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
+ c_v64_max_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
+ c_v64_min_s8(a.v64[0], b.v64[0]));
+}
+
+/* Returns a 16-bit mask in which bit i is set when signed byte lane i of a
+ * is negative. */
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+  uint32_t mask = 0;
+  int lane;
+  for (lane = 0; lane < 16; lane++) {
+    if (a.s8[lane] < 0) mask |= (uint32_t)1 << lane;
+  }
+  return mask;
+}
+
+/* Byte-wise select: takes the byte from b where the corresponding signed
+ * byte of c is negative, and from a otherwise. */
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+  c_v128 out;
+  int lane;
+  for (lane = 0; lane < 16; lane++) {
+    if (c.s8[lane] < 0)
+      out.u8[lane] = b.u8[lane];
+    else
+      out.u8[lane] = a.u8[lane];
+  }
+  return out;
+}
+
+/* Signed 8- and 16-bit minima and maxima. Each applies the matching c_v64
+ * operation to the high halves and to the low halves of a and b
+ * independently. */
+SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
+ c_v64_max_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
+ c_v64_min_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
+ c_v64_max_s16(a.v64[0], b.v64[0]));
+}
+
+/* Lane-wise maximum of the four signed 32-bit lanes of a and b. */
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+  c_v128 out = b;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    // Start from b and take a only where it is strictly greater.
+    if (a.s32[lane] > b.s32[lane]) out.s32[lane] = a.s32[lane];
+  }
+  return out;
+}
+
+/* Lane-wise minimum of the four signed 32-bit lanes of a and b. */
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+  c_v128 out = a;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    // Start from a and take b only where a is strictly greater.
+    if (a.s32[lane] > b.s32[lane]) out.s32[lane] = b.s32[lane];
+  }
+  return out;
+}
+
+/* Zips of 128-bit inputs. The ziplo variants use only the low halves
+ * (v64[0]) of a and b, the ziphi variants only the high halves (v64[1]). For
+ * 8-, 16- and 32-bit lanes the result's high half is c_v64_ziphi_* and its
+ * low half is c_v64_ziplo_* of the selected halves. */
+SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
+ c_v64_ziplo_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
+ c_v64_ziplo_8(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
+ c_v64_ziplo_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
+ c_v64_ziplo_16(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
+ c_v64_ziplo_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
+ c_v64_ziplo_32(a.v64[1], b.v64[1]));
+}
+
+/* 64-bit zips: the result's high half comes from a and its low half from b,
+ * using the low halves of both inputs (ziplo) or the high halves (ziphi). */
+SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[1], b.v64[1]);
+}
+
+/* Zips of two c_v64 inputs into one c_v128: the high half is c_v64_ziphi_*
+ * (a, b) and the low half is c_v64_ziplo_*(a, b). */
+SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
+}
+
+/* Byte de-interleave helper.
+ * mode != 0: odd-indexed bytes; those of a fill result bytes 0..7 and those
+ *            of b fill result bytes 8..15.
+ * mode == 0: even-indexed bytes; those of b fill result bytes 0..7 and those
+ *            of a fill result bytes 8..15. */
+SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
+  const c_v128 *const lower_src = mode ? &a : &b;
+  const c_v128 *const upper_src = mode ? &b : &a;
+  const int first = mode ? 1 : 0;
+  c_v128 out;
+  int i;
+  for (i = 0; i < 8; i++) {
+    out.u8[i] = lower_src->u8[2 * i + first];
+    out.u8[8 + i] = upper_src->u8[2 * i + first];
+  }
+  return out;
+}
+
+/* Calls _c_v128_unzip_8(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v128_unzip_8(a, b, mode);
+}
+
+/* Calls _c_v128_unzip_8(b, a, mode), with the inputs swapped, using mode 0
+ * when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v128_unzip_8(b, a, mode);
+}
+
+/* 16-bit de-interleave helper.
+ * mode != 0: odd-indexed lanes; those of a fill result lanes 0..3 and those
+ *            of b fill result lanes 4..7.
+ * mode == 0: even-indexed lanes; those of b fill result lanes 0..3 and those
+ *            of a fill result lanes 4..7. */
+SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
+  const c_v128 *const lower_src = mode ? &a : &b;
+  const c_v128 *const upper_src = mode ? &b : &a;
+  const int first = mode ? 1 : 0;
+  c_v128 out;
+  int i;
+  for (i = 0; i < 4; i++) {
+    out.u16[i] = lower_src->u16[2 * i + first];
+    out.u16[4 + i] = upper_src->u16[2 * i + first];
+  }
+  return out;
+}
+
+/* Calls _c_v128_unzip_16(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v128_unzip_16(a, b, mode);
+}
+
+/* Calls _c_v128_unzip_16(b, a, mode), with the inputs swapped, using mode 0
+ * when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v128_unzip_16(b, a, mode);
+}
+
+/* 32-bit de-interleave helper.
+ * mode != 0: odd-indexed lanes; those of a fill result lanes 0..1 and those
+ *            of b fill result lanes 2..3.
+ * mode == 0: even-indexed lanes; those of b fill result lanes 0..1 and those
+ *            of a fill result lanes 2..3. */
+SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
+  const c_v128 *const lower_src = mode ? &a : &b;
+  const c_v128 *const upper_src = mode ? &b : &a;
+  const int first = mode ? 1 : 0;
+  c_v128 out;
+  int i;
+  for (i = 0; i < 2; i++) {
+    out.u32[i] = lower_src->u32[2 * i + first];
+    out.u32[2 + i] = upper_src->u32[2 * i + first];
+  }
+  return out;
+}
+
+/* Calls _c_v128_unzip_32(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v128_unzip_32(a, b, mode);
+}
+
+/* Calls _c_v128_unzip_32(b, a, mode), with the inputs swapped, using mode 0
+ * when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v128_unzip_32(b, a, mode);
+}
+
+/* 8-bit to 16-bit unpacks. The result's high half is c_v64_unpackhi_*_s16
+ * and its low half is c_v64_unpacklo_*_s16 of one 64-bit source: the c_v64
+ * argument itself (unpack), the low half of a (unpacklo) or the high half of
+ * a (unpackhi). */
+SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
+ c_v64_unpacklo_u8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
+ c_v64_unpacklo_u8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
+ c_v64_unpacklo_s8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
+ c_v64_unpacklo_s8_s16(a.v64[1]));
+}
+
+/* Narrowing packs. The result's high half is the c_v64 pack of a's two
+ * halves and its low half is the c_v64 pack of b's two halves; in both cases
+ * the high half (v64[1]) is passed as the first argument. */
+SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
+}
+
+/* 16->32 bit expansions: each pairs the c_v64 unpackhi/unpacklo results of one
+ * 64-bit source (the argument itself, or the v64[0] / v64[1] half of a). */
+SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
+  const c_v64 from_hi = c_v64_unpackhi_u16_s32(a);
+  const c_v64 from_lo = c_v64_unpacklo_u16_s32(a);
+  return c_v128_from_v64(from_hi, from_lo);
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
+  const c_v64 from_hi = c_v64_unpackhi_s16_s32(a);
+  const c_v64 from_lo = c_v64_unpacklo_s16_s32(a);
+  return c_v128_from_v64(from_hi, from_lo);
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
+  const c_v64 half = a.v64[0];
+  return c_v128_from_v64(c_v64_unpackhi_u16_s32(half),
+                         c_v64_unpacklo_u16_s32(half));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
+  const c_v64 half = a.v64[0];
+  return c_v128_from_v64(c_v64_unpackhi_s16_s32(half),
+                         c_v64_unpacklo_s16_s32(half));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
+  const c_v64 half = a.v64[1];
+  return c_v128_from_v64(c_v64_unpackhi_u16_s32(half),
+                         c_v64_unpacklo_u16_s32(half));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
+  const c_v64 half = a.v64[1];
+  return c_v128_from_v64(c_v64_unpackhi_s16_s32(half),
+                         c_v64_unpacklo_s16_s32(half));
+}
+
+/* Byte shuffle: output byte i is the byte of a selected by the low four bits
+ * of pattern byte i. On big-endian builds the selector is mirrored (15 - sel)
+ * before indexing. */
+SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
+  c_v128 out;
+  int i;
+  for (i = 0; i < 16; i++) {
+    const int sel = pattern.u8[i] & 15;
+    out.u8[i] = a.u8[CONFIG_BIG_ENDIAN ? 15 - sel : sel];
+  }
+
+  return out;
+}
+
+/* 8- and 16-bit comparisons: each runs the matching c_v64 comparison on the
+ * v64[1] halves and on the v64[0] halves, then recombines the two results. */
+SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmpgt_s8(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmpgt_s8(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmplt_s8(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmplt_s8(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmpeq_8(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmpeq_8(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmpgt_s16(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmpgt_s16(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmplt_s16(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmplt_s16(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
+  const c_v64 hi = c_v64_cmpeq_16(a.v64[1], b.v64[1]);
+  const c_v64 lo = c_v64_cmpeq_16(a.v64[0], b.v64[0]);
+  return c_v128_from_v64(hi, lo);
+}
+
+/* 32-bit lane comparisons. Each of the four s32 lanes of the result is -1
+ * where the predicate holds and 0 where it does not. */
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+  c_v128 mask;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    mask.s32[lane] = (a.s32[lane] > b.s32[lane]) ? -1 : 0;
+  }
+  return mask;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+  c_v128 mask;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    mask.s32[lane] = (a.s32[lane] < b.s32[lane]) ? -1 : 0;
+  }
+  return mask;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+  c_v128 mask;
+  int lane;
+  for (lane = 0; lane < 4; lane++) {
+    mask.s32[lane] = (a.s32[lane] == b.s32[lane]) ? -1 : 0;
+  }
+  return mask;
+}
+
+/* Shifts the whole 128-bit value left by n bytes using the 64-bit byte-shift
+ * helpers. n == 0 returns a unchanged. For n < 8 the bytes leaving v64[0] are
+ * OR-ed into v64[1]; for n >= 8 the result's v64[0] is zero and its v64[1] is
+ * the old v64[0] shifted by n - 8. */
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
+  if (n >= 8) {
+    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
+  } else {
+    const c_v64 carry = c_v64_shr_n_byte(a.v64[0], 8 - n);
+    const c_v64 hi = c_v64_or(c_v64_shl_n_byte(a.v64[1], n), carry);
+    const c_v64 lo = c_v64_shl_n_byte(a.v64[0], n);
+    return c_v128_from_v64(hi, lo);
+  }
+}
+
+/* Shifts the whole 128-bit value right by n bytes using the 64-bit byte-shift
+ * helpers. n == 0 returns a unchanged. For n < 8 the bytes leaving v64[1] are
+ * OR-ed into v64[0]; for n >= 8 the result's v64[1] is zero and its v64[0] is
+ * the old v64[1] shifted by n - 8. */
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+  if (n == 0) return a;
+  if (n >= 8) {
+    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
+  } else {
+    const c_v64 hi = c_v64_shr_n_byte(a.v64[1], n);
+    const c_v64 carry = c_v64_shl_n_byte(a.v64[1], 8 - n);
+    const c_v64 lo = c_v64_or(c_v64_shr_n_byte(a.v64[0], n), carry);
+    return c_v128_from_v64(hi, lo);
+  }
+}
+
+/* Returns (b >> c bytes) | (a << (16 - c) bytes); c == 0 yields b unchanged.
+ * When SIMD_CHECK is enabled, c > 15 is reported on stderr and the process
+ * aborts. */
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
+  if (SIMD_CHECK && c > 15) {
+    /* c is unsigned, so it is printed with %u rather than %d. */
+    fprintf(stderr, "Error: undefined alignment %u\n", c);
+    abort();
+  }
+  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
+           : b;
+}
+
+/* Per-lane shifts by a run-time count: each applies the matching c_v64 shift
+ * to the v64[1] and v64[0] halves of a and recombines them. */
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shl_8(a.v64[1], c);
+  const c_v64 lo = c_v64_shl_8(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_u8(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_u8(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_s8(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_s8(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shl_16(a.v64[1], c);
+  const c_v64 lo = c_v64_shl_16(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_u16(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_u16(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_s16(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_s16(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shl_32(a.v64[1], c);
+  const c_v64 lo = c_v64_shl_32(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_u32(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_u32(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
+  const c_v64 hi = c_v64_shr_s32(a.v64[1], c);
+  const c_v64 lo = c_v64_shr_s32(a.v64[0], c);
+  return c_v128_from_v64(hi, lo);
+}
+
+/* 64-bit lane shifts: both halves of a are shifted by c with the plain C
+ * shift operators (on the u64 member, or on s64 for the signed variant) and
+ * then recombined. */
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+  c_v64 hi = a.v64[1];
+  c_v64 lo = a.v64[0];
+  hi.u64 <<= c;
+  lo.u64 <<= c;
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+  c_v64 hi = a.v64[1];
+  c_v64 lo = a.v64[0];
+  hi.u64 >>= c;
+  lo.u64 >>= c;
+  return c_v128_from_v64(hi, lo);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+  c_v64 hi = a.v64[1];
+  c_v64 lo = a.v64[0];
+  hi.s64 >>= c;
+  lo.s64 >>= c;
+  return c_v128_from_v64(hi, lo);
+}
+
+/* "_n" shift variants. In this C implementation each one simply forwards to
+ * the corresponding run-time-count shift. */
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shl_8(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shl_16(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shl_32(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shl_64(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_u8(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_u16(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_u32(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_u64(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_s8(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_s16(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_s32(a, n);
+  return shifted;
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+  const c_v128 shifted = c_v128_shr_s64(a, n);
+  return shifted;
+}
+
+/* Accumulator type for the 16-bit sum of absolute differences. */
+typedef uint32_t c_sad128_internal_u16;
+
+/* Returns the initial (zero) accumulator. */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }
+
+/* Adds |a.u16[i] - b.u16[i]| for the eight 16-bit lanes to s. The return
+ * value is implementation dependent and must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+                                                 c_v128 a, c_v128 b) {
+  int lane;
+  for (lane = 0; lane < 8; lane++) {
+    if (a.u16[lane] > b.u16[lane])
+      s += a.u16[lane] - b.u16[lane];
+    else
+      s += b.u16[lane] - a.u16[lane];
+  }
+  return s;
+}
+
+/* Finaliser: in the C implementation the accumulator already is the sum. */
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+/* Accumulator type for the 16-bit sum of squared differences. */
+typedef uint64_t c_ssd128_internal_s16;
+
+/* Returns the initial (zero) accumulator. */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
+
+/* Adds the squared lane differences of the eight s16 lanes to s. Each
+ * difference is first truncated to int16_t and then squared in 32 bits. The
+ * return value is implementation dependent and must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+                                                 c_v128 a, c_v128 b) {
+  int lane;
+  for (lane = 0; lane < 8; lane++) {
+    const int32_t diff = (int32_t)(int16_t)(a.s16[lane] - b.s16[lane]);
+    s += diff * diff;
+  }
+  return s;
+}
+
+/* Finaliser: in the C implementation the accumulator already is the sum. */
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 0000000000..d20f979dd9
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+/* Returns the lowest 32 bits of a as an unsigned value. */
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+  const int low = _mm_cvtsi128_si32(a);
+  return (uint32_t)low;
+}
+
+/* Keeps the low 64 bits of a and fills the upper 64 bits from v64_zero(). */
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+  const v64 zero = v64_zero();
+  return _mm_unpacklo_epi64(a, zero);
+}
+
+/* Moves the upper 64 bits of a down by 8 bytes; zeros are shifted in. */
+SIMD_INLINE v64 v128_high_v64(v128 a) {
+  return _mm_srli_si128(a, 8);
+}
+
+/* Builds a 128-bit vector whose low 64 bits come from b and whose high 64
+ * bits come from a. */
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+  const v128 joined = _mm_unpacklo_epi64(b, a);
+  return joined;
+}
+
+/* Same as v128_from_v64, starting from two 64-bit integers. */
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+  const v64 hi = v64_from_64(a);
+  const v64 lo = v64_from_64(b);
+  return v128_from_v64(hi, lo);
+}
+
+/* Builds a vector from four 32-bit values, passed to _mm_set_epi32 in the
+ * same order (a first, d last). */
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  const int e3 = (int)a, e2 = (int)b, e1 = (int)c, e0 = (int)d;
+  return _mm_set_epi32(e3, e2, e1, e0);
+}
+
+/* Loads 128 bits from p with _mm_load_si128 (the aligned-load intrinsic).
+ * The cast keeps the const qualifier of p instead of discarding it; the
+ * intrinsic takes a pointer-to-const, so no qualifier needs to be dropped. */
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+  return _mm_load_si128((const __m128i *)p);
+}
+
+/* Loads 128 bits from p without an alignment requirement. Uses
+ * _mm_lddqu_si128 when __SSSE3__ is defined and _mm_loadu_si128 otherwise.
+ * The cast keeps the const qualifier of p instead of discarding it; both
+ * intrinsics take a pointer-to-const. */
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+  return _mm_lddqu_si128((const __m128i *)p);
+#else
+  return _mm_loadu_si128((const __m128i *)p);
+#endif
+}
+
+/* Stores a to p with _mm_store_si128 (the aligned-store intrinsic). */
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+  __m128i *const dst = (__m128i *)p;
+  _mm_store_si128(dst, a);
+}
+
+/* Stores a to p with _mm_storeu_si128 (the unaligned-store intrinsic). */
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+  __m128i *const dst = (__m128i *)p;
+  _mm_storeu_si128(dst, a);
+}
+
+// v128_align(a, b, c): when c is non-zero, combines b shifted right by c
+// bytes with a shifted left by 16 - c bytes; when c is zero, yields b.
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+// An inline function is used only for optimised, non-clang builds with
+// SSSE3; every other configuration defines v128_align as a macro.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+// No SSSE3: emulate the alignment with two byte shifts and an OR.
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
+#else
+// No SSSE3: emulate the alignment with two byte shifts and an OR.
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#endif
+
+/* Returns an all-zero vector. */
+SIMD_INLINE v128 v128_zero(void) {
+  return _mm_setzero_si128();
+}
+
+/* Broadcasts x to every 8-bit lane. */
+SIMD_INLINE v128 v128_dup_8(uint8_t x) {
+  const char lane = (char)x;
+  return _mm_set1_epi8(lane);
+}
+
+/* Broadcasts x to every 16-bit lane. */
+SIMD_INLINE v128 v128_dup_16(uint16_t x) {
+  const short lane = (short)x;
+  return _mm_set1_epi16(lane);
+}
+
+/* Broadcasts x to every 32-bit lane. */
+SIMD_INLINE v128 v128_dup_32(uint32_t x) {
+  const int lane = (int)x;
+  return _mm_set1_epi32(lane);
+}
+
+/* Broadcasts x to both 64-bit lanes, built from its two 32-bit halves. */
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+  const int32_t upper = (int32_t)(x >> 32);
+  const int32_t lower = (int32_t)x;
+  return _mm_set_epi32(upper, lower, upper, lower);
+}
+
+/* Lane-wise additions; each maps directly onto one SSE2 intrinsic. */
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) {
+  return _mm_add_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) {
+  return _mm_add_epi16(a, b);
+}
+
+/* Saturating unsigned 8-bit add (_mm_adds_epu8). */
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) {
+  return _mm_adds_epu8(a, b);
+}
+
+/* Saturating signed 8-bit add (_mm_adds_epi8). */
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) {
+  return _mm_adds_epi8(a, b);
+}
+
+/* Saturating signed 16-bit add (_mm_adds_epi16). */
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) {
+  return _mm_adds_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) {
+  return _mm_add_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) {
+  return _mm_add_epi64(a, b);
+}
+
+/* Adds adjacent signed 16-bit lanes into 32-bit lanes by multiply-adding
+ * with a vector of ones. */
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+  const v128 ones = _mm_set1_epi16(1);
+  return _mm_madd_epi16(a, ones);
+}
+
+/* Lane-wise subtractions (a - b); each maps directly onto one SSE2
+ * intrinsic. The "ssub" variants use the saturating forms. */
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) {
+  return _mm_sub_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) {
+  return _mm_subs_epu8(a, b);
+}
+
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) {
+  return _mm_subs_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) {
+  return _mm_sub_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) {
+  return _mm_subs_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) {
+  return _mm_subs_epu16(a, b);
+}
+
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) {
+  return _mm_sub_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) {
+  return _mm_sub_epi64(a, b);
+}
+
+/* Absolute value of each signed 16-bit lane. Without SSSE3 it is computed as
+ * max(a, 0 - a). */
+SIMD_INLINE v128 v128_abs_s16(v128 a) {
+#if defined(__SSSE3__)
+  return _mm_abs_epi16(a);
+#else
+  const v128 negated = _mm_sub_epi16(_mm_setzero_si128(), a);
+  return _mm_max_epi16(a, negated);
+#endif
+}
+
+/* Absolute value of each signed 8-bit lane. Without SSSE3 it is computed as
+ * (a + m) ^ m, where m is all-ones in the lanes where a < 0. */
+SIMD_INLINE v128 v128_abs_s8(v128 a) {
+#if defined(__SSSE3__)
+  return _mm_abs_epi8(a);
+#else
+  const v128 neg_mask = _mm_cmplt_epi8(a, _mm_setzero_si128());
+  const v128 adjusted = _mm_add_epi8(a, neg_mask);
+  return _mm_xor_si128(neg_mask, adjusted);
+#endif
+}
+
+/* Interleave ("zip") helpers. Each forwards to the matching _mm_unpack*
+ * intrinsic with the operands swapped, so lanes of b land in the lower
+ * position of every interleaved pair. */
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
+  const v128 zipped = _mm_unpacklo_epi8(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
+  const v128 zipped = _mm_unpackhi_epi8(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
+  const v128 zipped = _mm_unpacklo_epi16(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
+  const v128 zipped = _mm_unpackhi_epi16(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
+  const v128 zipped = _mm_unpacklo_epi32(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
+  const v128 zipped = _mm_unpackhi_epi32(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+  const v128 zipped = _mm_unpacklo_epi64(b, a);
+  return zipped;
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+  const v128 zipped = _mm_unpackhi_epi64(b, a);
+  return zipped;
+}
+
+/* Variants taking two 64-bit vectors; only the low halves are interleaved. */
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) {
+  return _mm_unpacklo_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) {
+  return _mm_unpacklo_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) {
+  return _mm_unpacklo_epi32(b, a);
+}
+
+/* Collects the high byte of every 16-bit lane: both inputs are shifted right
+ * arithmetically by 8 and packed, with b supplying the low half of the
+ * result and a the high half. */
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+  const v128 b_high_bytes = _mm_srai_epi16(b, 8);
+  const v128 a_high_bytes = _mm_srai_epi16(a, 8);
+  return _mm_packs_epi16(b_high_bytes, a_high_bytes);
+}
+
+/* Collects the low byte of every 16-bit lane. With SSSE3 a byte shuffle
+ * gathers the even bytes of each input into its low 64 bits, which are then
+ * joined (b low, a high). Otherwise each input is shifted left by one byte
+ * and v128_unziphi_8 is reused. */
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+  const v128 even_bytes = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+  const v128 even_bytes = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
+  const v128 from_b = _mm_shuffle_epi8(b, even_bytes);
+  const v128 from_a = _mm_shuffle_epi8(a, even_bytes);
+  return _mm_unpacklo_epi64(from_b, from_a);
+#else
+  const v128 a_moved = _mm_slli_si128(a, 1);
+  const v128 b_moved = _mm_slli_si128(b, 1);
+  return v128_unziphi_8(a_moved, b_moved);
+#endif
+}
+
+/* Collects the high 16 bits of every 32-bit lane: both inputs are shifted
+ * right arithmetically by 16 and packed, with b supplying the low half of
+ * the result and a the high half. */
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+  const v128 b_high_words = _mm_srai_epi32(b, 16);
+  const v128 a_high_words = _mm_srai_epi32(a, 16);
+  return _mm_packs_epi32(b_high_words, a_high_words);
+}
+
+/* Collects the low 16 bits of every 32-bit lane. With SSSE3 a byte shuffle
+ * gathers them into the low 64 bits of each input, which are then joined
+ * (b low, a high). Otherwise each input is shifted left by two bytes and
+ * v128_unziphi_16 is reused. */
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+  const v128 even_words = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+  const v128 even_words = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
+  const v128 from_b = _mm_shuffle_epi8(b, even_words);
+  const v128 from_a = _mm_shuffle_epi8(a, even_words);
+  return _mm_unpacklo_epi64(from_b, from_a);
+#else
+  const v128 a_moved = _mm_slli_si128(a, 2);
+  const v128 b_moved = _mm_slli_si128(b, 2);
+  return v128_unziphi_16(a_moved, b_moved);
+#endif
+}
+
+/* Picks 32-bit lanes 1 and 3 of b (low half of the result) and of a (high
+ * half) using the float shuffle on reinterpreted registers. */
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+  const __m128 fb = _mm_castsi128_ps(b);
+  const __m128 fa = _mm_castsi128_ps(a);
+  return _mm_castps_si128(_mm_shuffle_ps(fb, fa, _MM_SHUFFLE(3, 1, 3, 1)));
+}
+
+/* Picks 32-bit lanes 0 and 2 of b (low half of the result) and of a (high
+ * half) using the float shuffle on reinterpreted registers. */
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+  const __m128 fb = _mm_castsi128_ps(b);
+  const __m128 fa = _mm_castsi128_ps(a);
+  return _mm_castps_si128(_mm_shuffle_ps(fb, fa, _MM_SHUFFLE(2, 0, 2, 0)));
+}
+
+/* Unsigned 8->16 bit widening: bytes are interleaved with zero bytes. */
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(a, zero);
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(a, zero);
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpackhi_epi8(a, zero);
+}
+
+/* Signed 8->16 bit widening: each byte is duplicated into a 16-bit lane and
+ * the lane is then shifted right arithmetically by 8. */
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+  const v128 doubled = _mm_unpacklo_epi8(a, a);
+  return _mm_srai_epi16(doubled, 8);
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+  const v128 doubled = _mm_unpacklo_epi8(a, a);
+  return _mm_srai_epi16(doubled, 8);
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+  const v128 doubled = _mm_unpackhi_epi8(a, a);
+  return _mm_srai_epi16(doubled, 8);
+}
+
+/* Narrowing packs. The intrinsic operands are swapped so that b fills the
+ * low half of the result and a the high half. */
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+  const v128 packed = _mm_packs_epi32(b, a);
+  return packed;
+}
+
+/* Without SSE4.1 the two halves are packed separately with the v64 routine
+ * and recombined. */
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_packus_epi32(b, a);
+#else
+  const v64 packed_a = v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a));
+  const v64 packed_b = v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b));
+  return v128_from_v64(packed_a, packed_b);
+#endif
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+  const v128 packed = _mm_packus_epi16(b, a);
+  return packed;
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+  const v128 packed = _mm_packs_epi16(b, a);
+  return packed;
+}
+
+/* 16->32 bit widening. Unsigned variants interleave with zero; signed
+ * variants duplicate each 16-bit lane and shift right arithmetically by 16. */
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi16(a, zero);
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+  const v128 doubled = _mm_unpacklo_epi16(a, a);
+  return _mm_srai_epi32(doubled, 16);
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi16(a, zero);
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+  const v128 doubled = _mm_unpacklo_epi16(a, a);
+  return _mm_srai_epi32(doubled, 16);
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+  const v128 zero = _mm_setzero_si128();
+  return _mm_unpackhi_epi16(a, zero);
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+  const v128 doubled = _mm_unpackhi_epi16(a, a);
+  return _mm_srai_epi32(doubled, 16);
+}
+
+/* Byte shuffle of x by pattern. With SSSE3 this is _mm_shuffle_epi8; the
+ * fallback copies byte (pattern[i] & 15) of x into byte i of the result
+ * through byte pointers. */
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
+#if defined(__SSSE3__)
+  return _mm_shuffle_epi8(x, pattern);
+#else
+  v128 result;
+  unsigned char *src = (unsigned char *)&x;
+  unsigned char *sel = (unsigned char *)&pattern;
+  unsigned char *dst = (unsigned char *)&result;
+  int i;
+
+  for (i = 0; i < 16; i++) dst[i] = src[sel[i] & 15];
+
+  return result;
+#endif
+}
+
+/* Dot product of the signed bytes of a with the unsigned bytes of b. Both
+ * are widened to 16 bits, multiply-added into 32-bit lanes, and the four
+ * lanes are folded with 32-bit adds; the low 32 bits are returned as a
+ * signed value. */
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  const v128 hi_prod =
+      _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+  const v128 lo_prod =
+      _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+  v128 acc = v128_add_32(hi_prod, lo_prod);
+  acc = v128_add_32(acc, _mm_srli_si128(acc, 8));
+  acc = v128_add_32(acc, _mm_srli_si128(acc, 4));
+  return (int32_t)v128_low_u32(acc);
+}
+
+/* Dot product of the signed 16-bit lanes of a and b. _mm_madd_epi16 yields
+ * four 32-bit partial sums, which are then added in 64 bits: in-register
+ * with SSE4.1 on x86-64, otherwise by extracting each lane as a scalar. */
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+  const v128 partial = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+  const v128 lo_pair = _mm_cvtepi32_epi64(partial);
+  const v128 hi_pair = _mm_cvtepi32_epi64(_mm_srli_si128(partial, 8));
+  const v128 sum2 = _mm_add_epi64(lo_pair, hi_pair);
+  return _mm_cvtsi128_si64(_mm_add_epi64(sum2, _mm_srli_si128(sum2, 8)));
+#else
+  const int64_t lane0 = (int64_t)_mm_cvtsi128_si32(partial);
+  const int64_t lane1 = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(partial, 4));
+  const int64_t lane2 = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(partial, 8));
+  const int64_t lane3 =
+      (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(partial, 12));
+  return lane0 + lane1 + lane2 + lane3;
+#endif
+}
+
+/* Sum of all sixteen unsigned bytes of a: _mm_sad_epu8 against zero leaves
+ * one partial sum per 64-bit half, and the two are added. */
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+  const v128 halves = _mm_sad_epu8(a, _mm_setzero_si128());
+  return v64_low_u32(v128_low_v64(halves)) +
+         v64_low_u32(v128_high_v64(halves));
+}
+
+/* Accumulator type for the 8-bit sum of absolute differences. */
+typedef v128 sad128_internal;
+
+/* Returns the initial (all-zero) accumulator. */
+SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
+  return _mm_setzero_si128();
+}
+
+/* Adds the _mm_sad_epu8 result for a and b to s with 64-bit lane adds.
+   Implementation dependent return value. Result must be finalised with
+   v128_sad_u8_sum().
+   The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+  const v128 step = _mm_sad_epu8(a, b);
+  return _mm_add_epi64(s, step);
+}
+
+/* Finaliser: adds the upper 64-bit half of s onto the lower one and returns
+ * the low 32 bits. */
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+  const v128 upper = _mm_unpackhi_epi64(s, s);
+  return v128_low_u32(_mm_add_epi32(s, upper));
+}
+
+/* Accumulator type for the 8-bit sum of squared differences. */
+typedef int32_t ssd128_internal;
+
+/* Returns the initial (zero) accumulator. */
+SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; }
+
+/* Adds the sum of squared byte differences of a and b to s. The bytes are
+ * zero-extended to 16 bits, subtracted, squared and pair-summed with
+ * _mm_madd_epi16, and the resulting 32-bit lanes are folded to one scalar.
+ * Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+  const v128 zero = _mm_setzero_si128();
+  const v128 diff_lo =
+      _mm_sub_epi16(_mm_unpacklo_epi8(a, zero), _mm_unpacklo_epi8(b, zero));
+  const v128 diff_hi =
+      _mm_sub_epi16(_mm_unpackhi_epi8(a, zero), _mm_unpackhi_epi8(b, zero));
+  const v128 sq_lo = _mm_madd_epi16(diff_lo, diff_lo);
+  const v128 sq_hi = _mm_madd_epi16(diff_hi, diff_hi);
+  v128 acc = _mm_add_epi32(sq_lo, sq_hi);
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
+  return s + _mm_cvtsi128_si32(acc);
+}
+
+/* Finaliser: the accumulator already holds the sum. */
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
+
+/* Bitwise logic on whole registers. */
+SIMD_INLINE v128 v128_or(v128 a, v128 b) {
+  return _mm_or_si128(a, b);
+}
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) {
+  return _mm_xor_si128(a, b);
+}
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) {
+  return _mm_and_si128(a, b);
+}
+
+/* Computes a & ~b; _mm_andnot_si128 complements its first operand, hence the
+ * swapped argument order. */
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) {
+  return _mm_andnot_si128(b, a);
+}
+
+/* Multiplies the 16-bit lanes of two 64-bit vectors: the v64 mullo and mulhi
+ * results are interleaved with the v64 zip helpers and joined into one
+ * 128-bit vector. */
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+  const v64 low_words = v64_mullo_s16(a, b);
+  const v64 high_words = v64_mulhi_s16(a, b);
+  const v64 upper = v64_ziphi_16(high_words, low_words);
+  const v64 lower = v64_ziplo_16(high_words, low_words);
+  return v128_from_v64(upper, lower);
+}
+
+/* Low 16 bits of each 16x16 product (_mm_mullo_epi16). */
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+  const v128 product = _mm_mullo_epi16(a, b);
+  return product;
+}
+
+/* High 16 bits of each signed 16x16 product (_mm_mulhi_epi16). */
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+  const v128 product = _mm_mulhi_epi16(a, b);
+  return product;
+}
+
+/* Low 32 bits of each 32x32 product. Without SSE4.1, lanes 0/2 and lanes 1/3
+ * are multiplied separately with _mm_mul_epu32, the low 32 bits of each
+ * 64-bit product are gathered with a shuffle, and the two sets are
+ * interleaved back into lane order. */
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_mullo_epi32(a, b);
+#else
+  const v128 even = _mm_mul_epu32(a, b);
+  const v128 odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+  return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, 8),
+                            _mm_shuffle_epi32(odd, 8));
+#endif
+}
+
+/* Sums the four 32-bit lane products from v128_mullo_s32 (each already
+ * truncated to 32 bits) as a 64-bit value. */
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  const v128 prod = v128_mullo_s32(a, b);
+  const int64_t lane0 = (int64_t)_mm_cvtsi128_si32(prod);
+  const int64_t lane1 = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(prod, 4));
+  const int64_t lane2 = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(prod, 8));
+  const int64_t lane3 = (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(prod, 12));
+  return lane0 + lane1 + lane2 + lane3;
+}
+
+/* Multiplies signed 16-bit lanes and adds adjacent products into 32-bit
+ * lanes (_mm_madd_epi16). */
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+  return _mm_madd_epi16(a, b);
+}
+
+/* Multiply-add of the unsigned bytes of a with the signed bytes of b. With
+ * SSSE3 this is _mm_maddubs_epi16. The fallback widens a with zero extension
+ * and b with sign extension, multiply-adds each half, and packs the 32-bit
+ * sums to 16 bits with signed saturation. */
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+  return _mm_maddubs_epi16(a, b);
+#else
+  const v128 zero = _mm_setzero_si128();
+  const v128 lo_sum = _mm_madd_epi16(_mm_unpacklo_epi8(a, zero),
+                                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
+  const v128 hi_sum = _mm_madd_epi16(_mm_unpackhi_epi8(a, zero),
+                                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8));
+  return _mm_packs_epi32(lo_sum, hi_sum);
+#endif
+}
+
+/* Adds adjacent unsigned bytes into 16-bit lanes by multiply-adding with a
+ * vector of ones. */
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  const v128 ones = _mm_set1_epi8(1);
+  return v128_madd_us8(a, ones);
+}
+
+/* Unsigned 8-bit average (_mm_avg_epu8). */
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) {
+  return _mm_avg_epu8(a, b);
+}
+
+/* _mm_avg_epu8 result minus the low bit of a ^ b in each byte. */
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
+  const v128 avg = _mm_avg_epu8(a, b);
+  const v128 odd_bit = _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1));
+  return _mm_sub_epi8(avg, odd_bit);
+}
+
+/* _mm_avg_epu16 result minus the low bit of a ^ b in each 16-bit lane. */
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  const v128 avg = _mm_avg_epu16(a, b);
+  const v128 odd_bit = _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1));
+  return _mm_sub_epi16(avg, odd_bit);
+}
+
+/* Unsigned 16-bit average (_mm_avg_epu16). */
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) {
+  return _mm_avg_epu16(a, b);
+}
+
+/* Unsigned 8-bit minimum and maximum. */
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) {
+  return _mm_min_epu8(a, b);
+}
+
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) {
+  return _mm_max_epu8(a, b);
+}
+
+/* Signed 8-bit minimum. Without SSE4.1 a compare mask (a < b) selects a
+ * where set and b elsewhere. */
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_min_epi8(a, b);
+#else
+  const v128 a_is_less = _mm_cmplt_epi8(a, b);
+  const v128 from_b = _mm_andnot_si128(a_is_less, b);
+  const v128 from_a = _mm_and_si128(a_is_less, a);
+  return _mm_or_si128(from_b, from_a);
+#endif
+}
+
+/* Returns the _mm_movemask_epi8 bit mask of a as an unsigned value. */
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
+  return (uint32_t)_mm_movemask_epi8(a);
+}
+
+/* Per-byte select between a and b controlled by c. With SSE4.1 this is
+ * _mm_blendv_epi8(a, b, c). The fallback turns c into a full-byte mask with
+ * a signed "< 0" compare, then takes b where the mask is set and a elsewhere. */
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+  return _mm_blendv_epi8(a, b, c);
+#else
+  const v128 pick_b = _mm_cmplt_epi8(c, v128_zero());
+  const v128 from_b = v128_and(b, pick_b);
+  const v128 from_a = v128_andn(a, pick_b);
+  return v128_or(from_b, from_a);
+#endif
+}
+
+/* Signed 8-bit maximum. Without SSE4.1 a compare mask (b < a) selects a
+ * where set and b elsewhere. */
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_max_epi8(a, b);
+#else
+  const v128 a_is_greater = _mm_cmplt_epi8(b, a);
+  const v128 from_b = _mm_andnot_si128(a_is_greater, b);
+  const v128 from_a = _mm_and_si128(a_is_greater, a);
+  return _mm_or_si128(from_b, from_a);
+#endif
+}
+
+/* Signed 16-bit minimum and maximum. */
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) {
+  return _mm_min_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) {
+  return _mm_max_epi16(a, b);
+}
+
+/* Signed 32-bit minimum; the fallback uses the same mask-select scheme with
+ * an a < b compare. */
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_min_epi32(a, b);
+#else
+  const v128 a_is_less = _mm_cmplt_epi32(a, b);
+  const v128 from_b = _mm_andnot_si128(a_is_less, b);
+  const v128 from_a = _mm_and_si128(a_is_less, a);
+  return _mm_or_si128(from_b, from_a);
+#endif
+}
+
+/* Signed 32-bit maximum; the fallback uses the same mask-select scheme with
+ * a b < a compare. */
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_max_epi32(a, b);
+#else
+  const v128 a_is_greater = _mm_cmplt_epi32(b, a);
+  const v128 from_b = _mm_andnot_si128(a_is_greater, b);
+  const v128 from_a = _mm_and_si128(a_is_greater, a);
+  return _mm_or_si128(from_b, from_a);
+#endif
+}
+
+/* Lane-wise comparisons; each maps directly onto one SSE2 compare
+ * intrinsic. */
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) {
+  return _mm_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) {
+  return _mm_cmplt_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) {
+  return _mm_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+  const v128 mask = _mm_cmpgt_epi16(a, b);
+  return mask;
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+  const v128 mask = _mm_cmplt_epi16(a, b);
+  return mask;
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) {
+  return _mm_cmpeq_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  const v128 mask = _mm_cmpgt_epi32(a, b);
+  return mask;
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  const v128 mask = _mm_cmplt_epi32(a, b);
+  return mask;
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) {
+  return _mm_cmpeq_epi16(a, b);
+}
+
+/* 8-bit left shift by a run-time count: a 16-bit shift followed by a byte
+ * mask that clears the bits shifted in from the neighbouring byte. */
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+  const v128 keep = _mm_set1_epi8((char)(0xff << c));
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_and_si128(keep, _mm_sll_epi16(a, count));
+}
+
+/* 8-bit logical right shift: a 16-bit shift followed by a byte mask. */
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+  const v128 keep = _mm_set1_epi8((char)(0xff >> c));
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_and_si128(keep, _mm_srl_epi16(a, count));
+}
+
+/* 8-bit arithmetic right shift: each byte is duplicated into a 16-bit lane,
+ * shifted arithmetically by c + 8, and the lanes are packed back to bytes. */
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)(c + 8));
+  const __m128i lo = _mm_sra_epi16(_mm_unpacklo_epi8(a, a), count);
+  const __m128i hi = _mm_sra_epi16(_mm_unpackhi_epi8(a, a), count);
+  return _mm_packs_epi16(lo, hi);
+}
+
+/* Shifts by a run-time count: the count is moved into a vector register with
+ * _mm_cvtsi32_si128 and passed to the matching SSE2 shift intrinsic. */
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_sll_epi16(a, count);
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_srl_epi16(a, count);
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_sra_epi16(a, count);
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_sll_epi32(a, count);
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_srl_epi32(a, count);
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_sra_epi32(a, count);
+}
+
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_sll_epi64(a, count);
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  const v128 count = _mm_cvtsi32_si128((int)c);
+  return _mm_srl_epi64(a, count);
+}
+
+/* 64-bit arithmetic right shift. Each half is extracted, shifted as a scalar
+ * int64_t and the two results are reassembled, instead of using
+ * _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c)). */
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  // _mm_sra_epi64 is missing in gcc?
+  const int64_t hi = (int64_t)v64_u64(v128_high_v64(a));
+  const int64_t lo = (int64_t)v64_u64(v128_low_v64(a));
+  return v128_from_64((uint64_t)(hi >> c), (uint64_t)(lo >> c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. In every macro below, a is the vector and c the shift
+ count. */
+/* Whole-register byte shifts; the count is masked with 127. */
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
+/* 8-bit lane shifts are built from 16-bit shifts: the logical ones mask off
+ bits that crossed a byte boundary, the arithmetic one duplicates each byte
+ into a 16-bit lane, shifts by c + 8 and packs back to bytes. */
+#define v128_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
+/* 16-, 32- and 64-bit lane shifts map directly onto the immediate-count
+ intrinsics. */
+#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+/* The signed 64-bit shift reuses the v128_shr_s64() function. */
+#define v128_shr_n_s64(a, c) \
+ v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc?
+
+/* Accumulator type for the 16-bit sum of absolute differences. */
+typedef v128 sad128_internal_u16;
+
+/* Returns the initial (all-zero) accumulator. */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); }
+
+/* Adds |a - b| of the unsigned 16-bit lanes to s, kept as four 32-bit partial
+ * sums. With SSE4.1 the difference is max - min. Otherwise both inputs are
+ * XOR-ed with 0x8000 so a signed compare gives the unsigned ordering, and
+ * the resulting mask selects the larger and the smaller operand.
+ * Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+#if defined(__SSE4_1__)
+  const v128 diff = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else
+  const v128 bias = v128_dup_16(32768);
+  const v128 a_lt_b = v128_cmplt_s16(v128_xor(a, bias), v128_xor(b, bias));
+  const v128 larger = v128_or(v128_and(b, a_lt_b), v128_andn(a, a_lt_b));
+  const v128 smaller = v128_or(v128_and(a, a_lt_b), v128_andn(b, a_lt_b));
+  const v128 diff = v128_sub_16(larger, smaller);
+#endif
+  const v128 widened =
+      v128_add_32(v128_unpackhi_u16_s32(diff), v128_unpacklo_u16_s32(diff));
+  return v128_add_32(s, widened);
+}
+
+/* Finaliser: adds the four 32-bit lanes of s. */
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  const uint32_t lane0 = v128_low_u32(s);
+  const uint32_t lane1 = v128_low_u32(v128_shr_n_byte(s, 4));
+  const uint32_t lane2 = v128_low_u32(v128_shr_n_byte(s, 8));
+  const uint32_t lane3 = v128_low_u32(v128_shr_n_byte(s, 12));
+  return lane0 + lane1 + lane2 + lane3;
+}
+
+/* Accumulator type for the 16-bit sum of squared differences. */
+typedef v128 ssd128_internal_s16;
+
+/* Returns the initial (all-zero) accumulator. */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
+
+/* Adds the squared 16-bit lane differences of a and b to s. The differences
+ * are squared and pair-summed into 32-bit lanes, which are zero-extended to
+ * 64 bits before being accumulated.
+ * Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  const v128 diff = v128_sub_16(a, b);
+  const v128 squares = v128_madd_s16(diff, diff);
+  const v128 wide_hi = _mm_unpackhi_epi32(squares, v128_zero());
+  const v128 wide_lo = _mm_unpacklo_epi32(squares, v128_zero());
+  return v128_add_64(s, v128_add_64(wide_hi, wide_lo));
+}
+
+/* Finaliser: adds the two 64-bit halves of s. */
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  const uint64_t low = v64_u64(v128_low_v64(s));
+  const uint64_t high = v64_u64(v128_high_v64(s));
+  return low + high;
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 0000000000..17e36eed61
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+ return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
+
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+ return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
+ return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
+ return c_v256_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+ return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
+ return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) {
+ return c_v256_ssd_u8_sum(s);
+}
+
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) {
+ return c_v256_ssd_s16_init();
+}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) {
+ return c_v256_ssd_s16_sum(s);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return c_v256_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return c_v256_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return c_v256_blend_8(a, b, c);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return c_v256_rdavg_u16(a, b);
+}
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+ return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+ return c_v256_unziphi_64(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return c_v256_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return c_v256_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return c_v256_pack_s32_u16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+ return c_v256_wideshuffle_8(a, b, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return c_v256_cmplt_s32(a, b);
+}
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return c_v256_shr_s32(a, c);
+}
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+ return c_v256_shl_64(a, c);
+}
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+ return c_v256_shr_u64(a, c);
+}
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+ return c_v256_shr_s64(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
+ return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
+ return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
+ return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
+ return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
+ return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+ return c_v256_shl_n_64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
+ return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
+ return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
+ return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+ return c_v256_shr_n_u64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
+ return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
+ return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
+ return c_v256_shr_n_s32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+ return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+ return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+ return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
+ return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+ return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ return c_v256_sad_u16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 0000000000..60d0d53f6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+
+typedef union { /* Vector type of the plain-C implementation; all members overlay the same storage. */
+ uint8_t u8[32]; /* 32 unsigned 8-bit lanes */
+ uint16_t u16[16]; /* 16 unsigned 16-bit lanes */
+ uint32_t u32[8]; /* 8 unsigned 32-bit lanes */
+ uint64_t u64[4]; /* 4 unsigned 64-bit lanes */
+ int8_t s8[32]; /* 32 signed 8-bit lanes */
+ int16_t s16[16]; /* 16 signed 16-bit lanes */
+ int32_t s32[8]; /* 8 signed 32-bit lanes */
+ int64_t s64[4]; /* 4 signed 64-bit lanes */
+ c_v64 v64[4]; /* the same data viewed as four c_v64 values */
+ c_v128 v128[2]; /* the same data viewed as two c_v128 values; [0] is returned by c_v256_low_v128() */
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+/* Builds a c_v256 from two c_v128 values: lo is stored in v128[0] and hi in
+ * v128[1]. */
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+  c_v256 r;
+  r.v128[0] = lo;
+  r.v128[1] = hi;
+  return r;
+}
+
+/* Builds a c_v256 from four 64-bit values. The arguments are given from the
+ * highest-indexed lane down: a -> u64[3], b -> u64[2], c -> u64[1],
+ * d -> u64[0]. */
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+                                  uint64_t d) {
+  c_v256 r;
+  r.u64[0] = d;
+  r.u64[1] = c;
+  r.u64[2] = b;
+  r.u64[3] = a;
+  return r;
+}
+
+/* Builds a c_v256 from the u64 payloads of four c_v64 values. The arguments
+ * are given from the highest-indexed lane down: a -> u64[3] ... d -> u64[0]. */
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+  c_v256 r;
+  r.u64[0] = d.u64;
+  r.u64[1] = c.u64;
+  r.u64[2] = b.u64;
+  r.u64[3] = a.u64;
+  return r;
+}
+
+/* Loads 32 bytes from p, which may have any alignment, into a c_v256. */
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+  c_v256 r;
+  /* memcpy avoids any alignment assumption; sizeof(r.u8) is the 32 bytes. */
+  memcpy(r.u8, p, sizeof(r.u8));
+  return r;
+}
+
+/* Loads 32 bytes from p. When SIMD_CHECK is enabled, a pointer that is not a
+ * multiple of 32 is reported on stderr and the process is aborted. */
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+  const int misaligned = ((uintptr_t)p & 31) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+    abort();
+  }
+  return c_v256_load_unaligned(p);
+}
+
+/* Stores the 32 bytes of a to p, which may have any alignment. */
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+  memcpy(p, a.u8, sizeof(a.u8));
+}
+
+/* Stores the 32 bytes of a to p. When SIMD_CHECK is enabled, a pointer that
+ * is not a multiple of 32 is reported on stderr and the process is aborted. */
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+  const int misaligned = ((uintptr_t)p & 31) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+    abort();
+  }
+  c_v256_store_unaligned(p, a);
+}
+
+/* Returns a c_v256 whose four 64-bit lanes are all zero. */
+SIMD_INLINE c_v256 c_v256_zero(void) {
+  c_v256 r;
+  int i;
+  for (i = 0; i < 4; i++) r.u64[i] = 0;
+  return r;
+}
+
+/* Returns a c_v256 whose four v64 parts all equal c_v64_dup_8(x). */
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+  const c_v64 part = c_v64_dup_8(x);
+  c_v256 r;
+  int i;
+  for (i = 0; i < 4; i++) r.v64[i] = part;
+  return r;
+}
+
+/* Returns a c_v256 whose four v64 parts all equal c_v64_dup_16(x). */
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+  const c_v64 part = c_v64_dup_16(x);
+  c_v256 r;
+  int i;
+  for (i = 0; i < 4; i++) r.v64[i] = part;
+  return r;
+}
+
+/* Returns a c_v256 whose four v64 parts all equal c_v64_dup_32(x). */
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+  const c_v64 part = c_v64_dup_32(x);
+  c_v256 r;
+  int i;
+  for (i = 0; i < 4; i++) r.v64[i] = part;
+  return r;
+}
+
+/* Returns a c_v256 whose four 64-bit lanes all hold x. */
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 r;
+  int i;
+  for (i = 0; i < 4; i++) r.u64[i] = x;
+  return r;
+}
+
+/* Sum of c_v128_dotp_su8() applied to the upper and to the lower 128-bit
+ * halves of a and b. */
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  const int64_t upper = c_v128_dotp_su8(a.v128[1], b.v128[1]);
+  const int64_t lower = c_v128_dotp_su8(a.v128[0], b.v128[0]);
+  return upper + lower;
+}
+
+/* Sum of c_v128_dotp_s16() applied to the upper and to the lower 128-bit
+ * halves of a and b. */
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+  const int64_t upper = c_v128_dotp_s16(a.v128[1], b.v128[1]);
+  const int64_t lower = c_v128_dotp_s16(a.v128[0], b.v128[0]);
+  return upper + lower;
+}
+
+/* Sum of c_v128_dotp_s32() applied to the upper and to the lower 128-bit
+ * halves of a and b. */
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  const int64_t upper = c_v128_dotp_s32(a.v128[1], b.v128[1]);
+  const int64_t lower = c_v128_dotp_s32(a.v128[0], b.v128[0]);
+  return upper + lower;
+}
+
+/* Sum of c_v128_hadd_u8() over the upper and the lower 128-bit half of a. */
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+  const uint64_t upper = c_v128_hadd_u8(a.v128[1]);
+  const uint64_t lower = c_v128_hadd_u8(a.v128[0]);
+  return upper + lower;
+}
+
+typedef struct { /* Accumulator state threaded through c_v256_sad_u8(). */
+ uint32_t val; /* running sum of absolute byte differences */
+ int count; /* number of c_v256_sad_u8() calls accumulated so far */
+} c_sad256_internal;
+
+/* Returns a fresh SAD accumulator: zero sum, zero recorded calls. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+  c_sad256_internal acc;
+  acc.count = 0;
+  acc.val = 0;
+  return acc;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+   v256_sad_u8_sum().
+   Adds the absolute differences of the 32 unsigned bytes of a and b to
+   s.val and counts the call in s.count. This implementation accepts up to
+   32 calls per accumulator; when SIMD_CHECK is enabled the 33rd call prints
+   a diagnostic and aborts.
+   NOTE(review): this comment previously stated a limit of 16 calls while
+   the check below enforces 32 -- confirm which limit callers must respect. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+                                            c_v256 b) {
+  int c;
+  for (c = 0; c < 32; c++)
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  /* The guard fires once the count exceeds 32, i.e. on the 33rd call, so the
+     message says "more than 32 times". */
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+    abort();
+  }
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; }
+
+/* Adds the squared differences of the 32 unsigned bytes of a and b to the
+ * accumulator s. The return value is implementation dependent and must be
+ * finalised with v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+                                            c_v256 b) {
+  int lane;
+  for (lane = 0; lane < 32; lane++) {
+    /* The bytes promote to int, so the difference may be negative. */
+    const int diff = a.u8[lane] - b.u8[lane];
+    s += diff * diff;
+  }
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+ c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+ c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+ c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+ c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+ c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+ c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+ c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+ c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+ c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+ c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+ c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+ c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+/* Pairwise add: 16-bit lane i of the result is the sum of unsigned bytes
+ * 2*i and 2*i+1 of a. */
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+  c_v256 r;
+  int pair;
+  for (pair = 0; pair < 16; pair++) {
+    const uint16_t even = a.u8[2 * pair];
+    const uint16_t odd = a.u8[2 * pair + 1];
+    r.u16[pair] = even + odd;
+  }
+  return r;
+}
+
+/* Pairwise add: 32-bit lane i of the result is the sum of signed 16-bit
+ * lanes 2*i and 2*i+1 of a. */
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+  c_v256 r;
+  int pair;
+  for (pair = 0; pair < 8; pair++)
+    r.s32[pair] = (int32_t)a.s16[2 * pair] + (int32_t)a.s16[2 * pair + 1];
+  return r;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+ c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+ c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+ c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+ c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+ c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
+ c_v128_ssub_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+ c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
+}
+
+/* Combines c_v128_mulhi_s16(a, b) and c_v128_mullo_s16(a, b) into one c_v256
+ * by zipping them at 16-bit granularity: the low zip goes to v128[0] and the
+ * high zip to v128[1]. */
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+  const c_v128 prod_hi = c_v128_mulhi_s16(a, b);
+  const c_v128 prod_lo = c_v128_mullo_s16(a, b);
+  c_v256 r;
+  r.v128[0] = c_v128_ziplo_16(prod_hi, prod_lo);
+  r.v128[1] = c_v128_ziphi_16(prod_hi, prod_lo);
+  return r;
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+ c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+ c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+ c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+ c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+ c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+ c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+ c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+ c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+ c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+ c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+/* Returns a 32-bit mask in which bit i is set when signed byte i of a is
+ * negative. */
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
+  uint32_t mask = 0;
+  int lane;
+  for (lane = 0; lane < 32; lane++)
+    mask |= (uint32_t)(a.s8[lane] < 0) << lane;
+  return mask;
+}
+
+/* Per-byte select: takes the byte from b where the corresponding signed byte
+ * of c is negative, and from a otherwise. */
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+  c_v256 out;
+  int lane;
+  for (lane = 0; lane < 32; lane++) {
+    if (c.s8[lane] < 0)
+      out.u8[lane] = b.u8[lane];
+    else
+      out.u8[lane] = a.u8[lane];
+  }
+  return out;
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+ c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+ c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+ c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+ c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+ c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+ c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+ c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+ c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+ c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+ c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+ c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+ c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+ c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+/* Helper for c_v256_unziplo_8() and c_v256_unziphi_8().
+ * mode != 0: result bytes 0..15 are the odd-indexed bytes of a and result
+ *            bytes 16..31 are the odd-indexed bytes of b.
+ * mode == 0: result bytes 0..15 are the even-indexed bytes of b and result
+ *            bytes 16..31 are the even-indexed bytes of a. */
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+  const c_v256 *const first = mode ? &a : &b;
+  const c_v256 *const second = mode ? &b : &a;
+  const int phase = mode ? 1 : 0;
+  c_v256 out;
+  int k;
+  for (k = 0; k < 16; k++) {
+    out.u8[k] = first->u8[2 * k + phase];
+    out.u8[k + 16] = second->u8[2 * k + phase];
+  }
+  return out;
+}
+
+/* Calls _c_v256_unzip_8(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v256_unzip_8(a, b, mode);
+}
+
+/* Calls _c_v256_unzip_8(b, a, mode) -- note the swapped operands -- with
+ * mode 0 when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v256_unzip_8(b, a, mode);
+}
+
+/* Helper for c_v256_unziplo_16() and c_v256_unziphi_16().
+ * mode != 0: result lanes 0..7 are the odd-indexed 16-bit lanes of a and
+ *            result lanes 8..15 are the odd-indexed 16-bit lanes of b.
+ * mode == 0: result lanes 0..7 are the even-indexed 16-bit lanes of b and
+ *            result lanes 8..15 are the even-indexed 16-bit lanes of a. */
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+  const c_v256 *const first = mode ? &a : &b;
+  const c_v256 *const second = mode ? &b : &a;
+  const int phase = mode ? 1 : 0;
+  c_v256 out;
+  int k;
+  for (k = 0; k < 8; k++) {
+    out.u16[k] = first->u16[2 * k + phase];
+    out.u16[k + 8] = second->u16[2 * k + phase];
+  }
+  return out;
+}
+
+/* Calls _c_v256_unzip_16(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v256_unzip_16(a, b, mode);
+}
+
+/* Calls _c_v256_unzip_16(b, a, mode) -- note the swapped operands -- with
+ * mode 0 when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v256_unzip_16(b, a, mode);
+}
+
+/* Helper for c_v256_unziplo_32() and c_v256_unziphi_32().
+ * mode != 0: result lanes 0..3 are the odd-indexed 32-bit lanes of a and
+ *            result lanes 4..7 are the odd-indexed 32-bit lanes of b.
+ * mode == 0: result lanes 0..3 are the even-indexed 32-bit lanes of b and
+ *            result lanes 4..7 are the even-indexed 32-bit lanes of a. */
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+  const c_v256 *const first = mode ? &a : &b;
+  const c_v256 *const second = mode ? &b : &a;
+  const int phase = mode ? 1 : 0;
+  c_v256 out;
+  int k;
+  for (k = 0; k < 4; k++) {
+    out.u32[k] = first->u32[2 * k + phase];
+    out.u32[k + 4] = second->u32[2 * k + phase];
+  }
+  return out;
+}
+
+/* Calls _c_v256_unzip_32(a, b, mode) with mode 1 when CONFIG_BIG_ENDIAN is
+ * set and mode 0 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return _c_v256_unzip_32(a, b, mode);
+}
+
+/* Calls _c_v256_unzip_32(b, a, mode) -- note the swapped operands -- with
+ * mode 0 when CONFIG_BIG_ENDIAN is set and mode 1 otherwise. */
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+  const int mode = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return _c_v256_unzip_32(b, a, mode);
+}
+
+/* Helper for c_v256_unziplo_64() and c_v256_unziphi_64().
+ * mode != 0: result lanes 0..1 are 64-bit lanes 1 and 3 of a and result
+ *            lanes 2..3 are 64-bit lanes 1 and 3 of b.
+ * mode == 0: result lanes 0..1 are 64-bit lanes 0 and 2 of b and result
+ *            lanes 2..3 are 64-bit lanes 0 and 2 of a. */
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+  const c_v256 *const first = mode ? &a : &b;
+  const c_v256 *const second = mode ? &b : &a;
+  const int phase = mode ? 1 : 0;
+  c_v256 out;
+  int k;
+  for (k = 0; k < 2; k++) {
+    out.u64[k] = first->u64[2 * k + phase];
+    out.u64[k + 2] = second->u64[2 * k + phase];
+  }
+  return out;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+  /* Forward (a, b) unchanged; mode 1 when big-endian, mode 0 otherwise. */
+  return _c_v256_unzip_64(a, b, CONFIG_BIG_ENDIAN ? 1 : 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+  /* Forward swapped as (b, a); mode 0 when big-endian, mode 1 otherwise. */
+  return _c_v256_unzip_64(b, a, CONFIG_BIG_ENDIAN ? 0 : 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+ c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+ c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+ c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+ c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+ c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+ c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+ c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+ c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+ c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+ c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+  /* Result byte i is the byte of a selected by the low five bits of pattern
+     byte i; on big-endian builds the index is mirrored to 31 - index. */
+  c_v256 out;
+  int i;
+  for (i = 0; i < 32; i++) {
+    const int idx = pattern.u8[i] & 31;
+    out.u8[i] = a.u8[CONFIG_BIG_ENDIAN ? 31 - idx : idx];
+  }
+  return out;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  /* Pattern bytes below 32 select from b, all others select from a. The byte
+     position within the chosen vector is the low five bits of the pattern
+     byte (mirrored to 31 - position on big-endian builds). */
+  c_v256 out;
+  int i;
+  for (i = 0; i < 32; i++) {
+    const int sel = pattern.u8[i];
+    const int idx = sel & 31;
+    const int pos = CONFIG_BIG_ENDIAN ? 31 - idx : idx;
+    out.u8[i] = sel < 32 ? b.u8[pos] : a.u8[pos];
+  }
+  return out;
+}
+
+// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+  /* Each 128-bit half of a is shuffled with the matching half of pattern. */
+  const c_v128 hi =
+      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern));
+  const c_v128 lo =
+      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern));
+  return c_v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+  /* Byte-wise left shift of the whole 256-bit value, built from 128-bit
+     shifts: bytes leaving the low half are OR'd into the high half. */
+  c_v128 hi, lo;
+  if (n == 0) return a;
+  /* Exactly one half: the low half becomes the high half. */
+  if (n == 16) return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+  /* More than one half: only the low half survives, shifted by the rest. */
+  if (n > 16)
+    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+                            c_v128_zero());
+  /* 0 < n < 16 */
+  hi = c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+                 c_v128_shr_n_byte(a.v128[0], 16 - n));
+  lo = c_v128_shl_n_byte(a.v128[0], n);
+  return c_v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+  /* Byte-wise right shift of the whole 256-bit value, built from 128-bit
+     shifts: bytes leaving the high half are OR'd into the low half. */
+  c_v128 hi, lo;
+  if (n == 0) return a;
+  /* Exactly one half: the high half becomes the low half. */
+  if (n == 16) return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+  /* More than one half: only the high half survives, shifted by the rest. */
+  if (n > 16)
+    return c_v256_from_v128(c_v128_zero(),
+                            c_v128_shr_n_byte(a.v128[1], n - 16));
+  /* 0 < n < 16 */
+  hi = c_v128_shr_n_byte(a.v128[1], n);
+  lo = c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+                 c_v128_shl_n_byte(a.v128[1], 16 - n));
+  return c_v256_from_v128(hi, lo);
+}
+
+/* Returns b shifted right by c bytes OR'd with a shifted left by 32 - c
+   bytes; c == 0 returns b unchanged. When SIMD_CHECK is set, c > 31 prints
+   a diagnostic and aborts. */
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
+  if (SIMD_CHECK && c > 31) {
+    /* c is unsigned, so it is printed with %u. */
+    fprintf(stderr, "Error: undefined alignment %u\n", c);
+    abort();
+  }
+  if (c == 0) return b;
+  return c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+ c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+ c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+ c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+ c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+ c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+ c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+ c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+ c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+ c_v128_shr_s32(a.v128[0], c));
+}
+
+/* Shifts each of the four signed 64-bit lanes of a right by n bits using
+   the C >> operator on the s64 view. When SIMD_CHECK is set, n > 63 prints
+   a diagnostic and aborts. */
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  int i;
+  if (SIMD_CHECK && n > 63) {
+    /* n is unsigned, so it is printed with %u. */
+    fprintf(stderr, "Error: undefined s64 shift right %u\n", n);
+    abort();
+  }
+  for (i = 0; i < 4; i++) t.s64[i] = a.s64[i] >> n;
+  return t;
+}
+
+/* Shifts each of the four unsigned 64-bit lanes of a right by n bits. When
+   SIMD_CHECK is set, n > 63 prints a diagnostic and aborts. */
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  int i;
+  if (SIMD_CHECK && n > 63) {
+    /* The message names this operation (u64, not s64); n is unsigned. */
+    fprintf(stderr, "Error: undefined u64 shift right %u\n", n);
+    abort();
+  }
+  for (i = 0; i < 4; i++) t.u64[i] = a.u64[i] >> n;
+  return t;
+}
+
+/* Shifts each of the four unsigned 64-bit lanes of a left by n bits. When
+   SIMD_CHECK is set, n > 63 prints a diagnostic and aborts. */
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  int i;
+  if (SIMD_CHECK && n > 63) {
+    /* The message names this operation (a left shift); n is unsigned. */
+    fprintf(stderr, "Error: undefined u64 shift left %u\n", n);
+    abort();
+  }
+  for (i = 0; i < 4; i++) t.u64[i] = a.u64[i] << n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+ return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+ return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+ return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+ return c_v256_shl_64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+ return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+ return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+ return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+ return c_v256_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+ return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+ return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+ return c_v256_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+ return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+  /* A word is two bytes here: forward to the byte shift with 2 * n. */
+  const unsigned int bytes = 2 * n;
+  return c_v256_shr_n_byte(a, bytes);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+  /* A word is two bytes here: forward to the byte shift with 2 * n. */
+  const unsigned int bytes = 2 * n;
+  return c_v256_shl_n_byte(a, bytes);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; }
+
+/* Adds the absolute differences of the 16 unsigned 16-bit lanes of a and b
+   to the running total s and returns the new total.
+   Implementation dependent return value. Result must be finalised with
+   v256_sad_u16_sum(). */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+                                                 c_v256 a, c_v256 b) {
+  int lane;
+  for (lane = 0; lane < 16; lane++) {
+    if (a.u16[lane] > b.u16[lane])
+      s += a.u16[lane] - b.u16[lane];
+    else
+      s += b.u16[lane] - a.u16[lane];
+  }
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; }
+
+/* Adds the squared differences of the 16 signed 16-bit lanes of a and b to
+   the running total s and returns the new total. Each difference is
+   truncated to int16_t before it is squared.
+ * Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+                                                 c_v256 a, c_v256 b) {
+  int lane;
+  for (lane = 0; lane < 16; lane++) {
+    const int32_t diff = (int32_t)(int16_t)(a.s16[lane] - b.s16[lane]);
+    s += diff * diff;
+  }
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 0000000000..493130df83
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+
+#include "config/aom_config.h"
+
+#if HAVE_NEON
+#error "Do not use this file for Neon"
+#endif
+
+#if HAVE_SSE2
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+#else
+#include "aom_dsp/simd/v128_intrinsics.h"
+#endif
+
+/* 256-bit vector stored as two 128-bit halves. The accessors in this file
+   return val[0] as the low half and val[1] as the high half. */
+typedef struct {
+ v128 val[2];
+} v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) {
+  /* Take the low v64 of the vector and convert it with v64_u64(). */
+  const v64 low = v256_low_v64(a);
+  return v64_u64(low);
+}
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+  /* lo is stored in val[0], hi in val[1]. */
+  v256 out;
+  out.val[0] = lo;
+  out.val[1] = hi;
+  return out;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  /* (a, b) form the high 128-bit half, (c, d) the low one. */
+  const v128 hi = v128_from_64(a, b);
+  const v128 lo = v128_from_64(c, d);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  /* (a, b) form the high 128-bit half, (c, d) the low one. */
+  const v128 hi = v128_from_v64(a, b);
+  const v128 lo = v128_from_v64(c, d);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  /* The low half is read from p, the high half from the 16 bytes after. */
+  v256 out;
+  out.val[1] = v128_load_unaligned((uint8_t *)p + 16);
+  out.val[0] = v128_load_unaligned(p);
+  return out;
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  /* The low half is read from p, the high half from the 16 bytes after. */
+  v256 out;
+  out.val[1] = v128_load_aligned((uint8_t *)p + 16);
+  out.val[0] = v128_load_aligned(p);
+  return out;
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  /* Low half goes to p, high half to the 16 bytes that follow. */
+  uint8_t *const dst = (uint8_t *)p;
+  v128_store_unaligned(dst, v256_low_v128(a));
+  v128_store_unaligned(dst + 16, v256_high_v128(a));
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  /* Low half goes to p, high half to the 16 bytes that follow. */
+  uint8_t *const dst = (uint8_t *)p;
+  v128_store_aligned(dst, v256_low_v128(a));
+  v128_store_aligned(dst + 16, v256_high_v128(a));
+}
+
+SIMD_INLINE v256 v256_zero(void) {
+  /* Both halves come from v128_zero(). */
+  v256 out;
+  out.val[1] = v128_zero();
+  out.val[0] = v128_zero();
+  return out;
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+  /* One v128_dup_8(x) result is used for both halves. */
+  v256 out;
+  out.val[0] = out.val[1] = v128_dup_8(x);
+  return out;
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+  /* One v128_dup_16(x) result is used for both halves. */
+  v256 out;
+  out.val[0] = out.val[1] = v128_dup_16(x);
+  return out;
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+  /* One v128_dup_32(x) result is used for both halves. */
+  v256 out;
+  out.val[0] = out.val[1] = v128_dup_32(x);
+  return out;
+}
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  /* One v128_dup_64(x) result is used for both halves. */
+  v256 out;
+  out.val[0] = out.val[1] = v128_dup_64(x);
+  return out;
+}
+
+/* Sum of v128_dotp_su8() over the high halves and over the low halves of
+   a and b. */
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
+/* Sum of v128_dotp_s16() over the high halves and over the low halves of
+   a and b. */
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+/* Sum of v128_dotp_s32() over the high halves and over the low halves of
+   a and b. */
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
+}
+
+/* Sum of v128_hadd_u8() of the high half and of the low half of a. */
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
+}
+
+typedef struct {
+ sad128_internal val[2];
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
+  /* One 128-bit SAD accumulator per half, each freshly initialised. */
+  sad256_internal acc;
+  int half;
+  for (half = 1; half >= 0; half--) acc.val[half] = v128_sad_u8_init();
+  return acc;
+}
+
+/* Feeds each 128-bit half of a and b into the matching half accumulator of
+   s via v128_sad_u8() and returns the updated accumulator pair.
+   Implementation dependent return value. Result must be finalised with
+   v256_sad_u8_sum().
+   The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+  sad256_internal acc;
+  int half;
+  for (half = 1; half >= 0; half--)
+    acc.val[half] = v128_sad_u8(s.val[half], a.val[half], b.val[half]);
+  return acc;
+}
+
+/* Finalises a sad256_internal accumulator: adds v128_sad_u8_sum() of the
+   high-half accumulator to that of the low-half accumulator. */
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
+}
+
+typedef struct {
+ ssd128_internal val[2];
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
+  /* One 128-bit SSD accumulator per half, each freshly initialised. */
+  ssd256_internal acc;
+  int half;
+  for (half = 1; half >= 0; half--) acc.val[half] = v128_ssd_u8_init();
+  return acc;
+}
+
+/* Feeds each 128-bit half of a and b into the matching half accumulator of
+   s via v128_ssd_u8() and returns the updated accumulator pair.
+ * Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+  ssd256_internal acc;
+  int half;
+  for (half = 1; half >= 0; half--)
+    acc.val[half] = v128_ssd_u8(s.val[half], a.val[half], b.val[half]);
+  return acc;
+}
+
+/* Finalises an ssd256_internal accumulator: adds v128_ssd_u8_sum() of the
+   high-half accumulator to that of the low-half accumulator. */
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+ return v256_from_v128(v128_or(a.val[1], b.val[1]),
+ v128_or(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+ return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+ v128_xor(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+ return v256_from_v128(v128_and(a.val[1], b.val[1]),
+ v128_and(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+ return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+ v128_andn(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+ return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+ v128_add_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+ return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+ v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+ v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+ v128_sadd_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+ v128_sadd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+ return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+ v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+ return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+ v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+ return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+ v128_sub_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+ v128_ssub_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+ v128_ssub_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+ v128_sub_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+ v128_ssub_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+ v128_ssub_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+ v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+ v128_sub_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+ return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+ return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+  /* Combine the v128_mullo_s16() and v128_mulhi_s16() results by zipping
+     them as 16-bit lanes, with the mulhi result as the first zip operand. */
+  const v128 prod_lo = v128_mullo_s16(a, b);
+  const v128 prod_hi = v128_mulhi_s16(a, b);
+  const v128 upper = v128_ziphi_16(prod_hi, prod_lo);
+  const v128 lower = v128_ziplo_16(prod_hi, prod_lo);
+  return v256_from_v128(upper, lower);
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+ v128_mullo_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+ v128_mulhi_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+ v128_mullo_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+ v128_madd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+ v128_madd_us8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+ v128_avg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+ v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+ v128_rdavg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+ v128_avg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+ v128_min_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+ v128_max_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+ v128_min_s8(a.val[0], b.val[0]));
+}
+
+/* Combines the per-half byte masks: the high half's v128_movemask_8()
+   result is placed in bits 16 and up, the low half's in the bits below. */
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+  /* Convert to uint32_t before shifting so the << 16 is done in unsigned
+     arithmetic whatever type v128_movemask_8() returns. */
+  const uint32_t hi_bits = (uint32_t)v128_movemask_8(v256_high_v128(a));
+  const uint32_t lo_bits = (uint32_t)v128_movemask_8(v256_low_v128(a));
+  return (hi_bits << 16) | lo_bits;
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+ v128_blend_8(a.val[0], b.val[0], c.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+ v128_max_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+ v128_min_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+ v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+ v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+ v128_max_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+ v128_ziplo_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+ v128_ziplo_8(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+ v128_ziplo_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+ v128_ziplo_16(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+ v128_ziplo_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+ v128_ziplo_32(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+ v128_ziplo_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+ v128_ziplo_64(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+  /* Low half of a becomes the high half; low half of b stays low. */
+  v256 out;
+  out.val[1] = v256_low_v128(a);
+  out.val[0] = v256_low_v128(b);
+  return out;
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+  /* High half of a stays high; high half of b becomes the low half. */
+  v256 out;
+  out.val[1] = v256_high_v128(a);
+  out.val[0] = v256_high_v128(b);
+  return out;
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+ v128_unziplo_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+ v128_unziphi_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+ v128_unziplo_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+ v128_unziphi_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+ v128_unziplo_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+ v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  /* The result's high half holds the low 64 bits of each half of a, its low
+     half the low 64 bits of each half of b. */
+#if HAVE_SSE2
+  const __m128d a_lo = _mm_castsi128_pd(a.val[0]);
+  const __m128d a_hi = _mm_castsi128_pd(a.val[1]);
+  const __m128d b_lo = _mm_castsi128_pd(b.val[0]);
+  const __m128d b_hi = _mm_castsi128_pd(b.val[1]);
+  /* Selector 0 takes the low double of both shuffle operands. */
+  return v256_from_v128(_mm_castpd_si128(_mm_shuffle_pd(a_lo, a_hi, 0)),
+                        _mm_castpd_si128(_mm_shuffle_pd(b_lo, b_hi, 0)));
+#else
+  const v128 hi = v128_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]));
+  const v128 lo = v128_from_v64(v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+  return v256_from_v128(hi, lo);
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  /* The result's high half holds the high 64 bits of each half of a, its
+     low half the high 64 bits of each half of b. */
+#if HAVE_SSE2
+  const __m128d a_lo = _mm_castsi128_pd(a.val[0]);
+  const __m128d a_hi = _mm_castsi128_pd(a.val[1]);
+  const __m128d b_lo = _mm_castsi128_pd(b.val[0]);
+  const __m128d b_hi = _mm_castsi128_pd(b.val[1]);
+  /* Selector 3 takes the high double of both shuffle operands. */
+  return v256_from_v128(_mm_castpd_si128(_mm_shuffle_pd(a_lo, a_hi, 3)),
+                        _mm_castpd_si128(_mm_shuffle_pd(b_lo, b_hi, 3)));
+#else
+  const v128 hi =
+      v128_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]));
+  const v128 lo =
+      v128_from_v64(v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+  return v256_from_v128(hi, lo);
+#endif
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+ v128_unpacklo_u8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+ v128_unpacklo_u8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+ v128_unpacklo_s8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+ v128_unpacklo_s8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+ v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+ v128_pack_s32_u16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+ v128_pack_s16_u8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+ v128_pack_s16_s8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+ v128_unpacklo_u16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+ v128_unpacklo_s16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+ v128_unpacklo_u16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+ v128_unpacklo_s16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+ v128_cmpgt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+ v128_cmplt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+ v128_cmpeq_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+ v128_cmpgt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+ v128_cmplt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+ v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+ v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+ v128_cmplt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+ v128_cmpeq_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+  /* For each half of the pattern, two 128-bit shuffles are computed: one of
+     x.val[1] with the pattern rebased by 16 and one of x.val[0] with the
+     pattern as-is. v128_blend_8() then combines them using the mask
+     (pattern < 16, signed byte compare). */
+  const v128 k16 = v128_dup_8(16);
+  const v128 pat_hi = pattern.val[1];
+  const v128 pat_lo = pattern.val[0];
+  const v128 below16_hi = v128_cmplt_s8(pat_hi, k16);
+  const v128 below16_lo = v128_cmplt_s8(pat_lo, k16);
+  const v128 res_hi =
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pat_hi, k16)),
+                   v128_shuffle_8(x.val[0], pat_hi), below16_hi);
+  const v128 res_lo =
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pat_lo, k16)),
+                   v128_shuffle_8(x.val[0], pat_lo), below16_lo);
+  return v256_from_v128(res_hi, res_lo);
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+  const v128 k16 = v128_dup_8(16);
+  const v128 k32 = v128_dup_8(32);
+  const v128 k48 = v128_dup_8(48);
+  const v128 pat_hi = pattern.val[1];
+  const v128 pat_lo = pattern.val[0];
+  /* Candidate built from x: pattern rebased by 48 against x.val[1] and by
+     32 against x.val[0], blended on the mask (48 > pattern). */
+  const v128 x_hi =
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pat_hi, k48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pat_hi, k32)),
+                   v128_cmpgt_s8(k48, pat_hi));
+  const v128 x_lo =
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pat_lo, k48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pat_lo, k32)),
+                   v128_cmpgt_s8(k48, pat_lo));
+  /* Candidate built from y: pattern rebased by 16 against y.val[1] and used
+     as-is against y.val[0], blended on the mask (16 > pattern). */
+  const v128 y_hi =
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pat_hi, k16)),
+                   v128_shuffle_8(y.val[0], pat_hi),
+                   v128_cmpgt_s8(k16, pat_hi));
+  const v128 y_lo =
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pat_lo, k16)),
+                   v128_shuffle_8(y.val[0], pat_lo),
+                   v128_cmpgt_s8(k16, pat_lo));
+  const v256 from_x = v256_from_v128(x_hi, x_lo);
+  const v256 from_y = v256_from_v128(y_hi, y_lo);
+  /* Final blend between the two candidates on the mask (32 > pattern). */
+  return v256_blend_8(from_x, from_y,
+                      v256_cmpgt_s8(v256_from_v128(k32, k32), pattern));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  /* Each 128-bit half of a is shuffled with the matching half of pattern. */
+  const v128 hi = v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern));
+  const v128 lo = v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern));
+  return v256_from_v128(hi, lo);
+}
+
+// Variable-count shifts.  Each wrapper applies the 128-bit operation of the
+// same name to a.val[1] and a.val[0] with the same count c and recombines the
+// two halves with v256_from_v128().
+
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+  const v128 hi = v128_shl_8(a.val[1], c);
+  const v128 lo = v128_shl_8(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_u8(a.val[1], c);
+  const v128 lo = v128_shr_u8(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_s8(a.val[1], c);
+  const v128 lo = v128_shr_s8(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+  const v128 hi = v128_shl_16(a.val[1], c);
+  const v128 lo = v128_shl_16(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_u16(a.val[1], c);
+  const v128 lo = v128_shr_u16(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_s16(a.val[1], c);
+  const v128 lo = v128_shr_s16(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+  const v128 hi = v128_shl_32(a.val[1], c);
+  const v128 lo = v128_shl_32(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_u32(a.val[1], c);
+  const v128 lo = v128_shr_u32(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_s32(a.val[1], c);
+  const v128 lo = v128_shr_s32(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+  const v128 hi = v128_shl_64(a.val[1], c);
+  const v128 lo = v128_shl_64(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_u64(a.val[1], c);
+  const v128 lo = v128_shr_u64(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+  const v128 hi = v128_shr_s64(a.val[1], c);
+  const v128 lo = v128_shr_s64(a.val[0], c);
+  return v256_from_v128(hi, lo);
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+/* Whole-vector left shift by n bytes, built from 128-bit byte shifts.
+   For n < 16 the high half is a.val[1] shifted left by n OR-ed with
+   a.val[0] shifted right by 16 - n, and the low half is a.val[0] shifted
+   left by n.  Otherwise the low half is zero and the high half is a.val[0],
+   shifted left by n - 16 when n > 16.  The argument a is used with ".val",
+   so it must be a plain v256 object expression. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \
+ v128_shr_n_byte(a.val[0], 16 - (n))), \
+ v128_shl_n_byte(a.val[0], (n))) \
+ : v256_from_v128( \
+ (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
+ v128_zero()))
+
+/* Whole-vector right shift by n bytes, built from 128-bit byte shifts.
+   n == 0 returns a unchanged.  For 0 < n < 16 the low half is a.val[0]
+   shifted right by n OR-ed with a.val[1] shifted left by 16 - n, and the
+   high half is a.val[1] shifted right by n.  Otherwise the high half is
+   zero and the low half is a.val[1], shifted right by n - 16 when n > 16.
+   Every use of a macro parameter is parenthesised so that compound
+   arguments (for example "x & 1") are evaluated as a unit; the original
+   "n == 0" test mis-parsed such arguments. */
+#define v256_shr_n_byte(a, n) \
+  ((n) == 0 \
+       ? (a) \
+       : ((n) < 16 \
+              ? v256_from_v128(v128_shr_n_byte((a).val[1], (n)), \
+                               v128_or(v128_shr_n_byte((a).val[0], (n)), \
+                                       v128_shl_n_byte((a).val[1], 16 - (n)))) \
+              : v256_from_v128( \
+                    v128_zero(), \
+                    (n) > 16 ? v128_shr_n_byte((a).val[1], (n)-16) \
+                             : (a).val[1])))
+
+/* Combines b shifted right by c bytes with a shifted left by 32 - c bytes
+   (OR-ed together).  c == 0 yields b unchanged, which avoids requesting a
+   32-byte shift of a. */
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+/* Immediate-count lane shifts.  Each macro applies the 128-bit macro of the
+   same name to a.val[1] and a.val[0] with the same count n and recombines
+   the halves with v256_from_v128().  a is used with ".val", so it must be a
+   plain v256 object expression. */
+#define v256_shl_n_8(a, n) \
+ v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
+#define v256_shl_n_16(a, n) \
+ v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
+#define v256_shl_n_32(a, n) \
+ v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+ v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
+#define v256_shr_n_u8(a, n) \
+ v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
+#define v256_shr_n_u16(a, n) \
+ v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
+#define v256_shr_n_u32(a, n) \
+ v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+ v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
+#define v256_shr_n_s8(a, n) \
+ v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
+#define v256_shr_n_s16(a, n) \
+ v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
+#define v256_shr_n_s32(a, n) \
+ v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+ v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+/* Whole-vector shifts counted in 16-bit words: n words are 2 * n bytes. */
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+/* Accumulator for v256_sad_u16(): one 128-bit accumulator per half. */
+typedef struct {
+  sad128_internal_u16 val[2];
+} sad256_internal_u16;
+
+/* Returns an accumulator whose two halves both come from
+   v128_sad_u16_init(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
+  sad256_internal_u16 acc;
+  int half;
+  for (half = 1; half >= 0; half--) acc.val[half] = v128_sad_u16_init();
+  return acc;
+}
+
+/* Accumulates one step into s, half by half, via v128_sad_u16().
+   The return value is implementation dependent and must be finalised with
+   v256_sad_u16_sum().
+   The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  sad256_internal_u16 next;
+  int half;
+  for (half = 1; half >= 0; half--)
+    next.val[half] = v128_sad_u16(s.val[half], a.val[half], b.val[half]);
+  return next;
+}
+
+/* Finalises an accumulator: the sum of both halves' v128_sad_u16_sum(). */
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  const uint32_t hi_total = v128_sad_u16_sum(s.val[1]);
+  const uint32_t lo_total = v128_sad_u16_sum(s.val[0]);
+  return hi_total + lo_total;
+}
+
+/* Accumulator for v256_ssd_s16(): one 128-bit accumulator per half. */
+typedef struct {
+  ssd128_internal_s16 val[2];
+} ssd256_internal_s16;
+
+/* Returns an accumulator whose two halves both come from
+   v128_ssd_s16_init(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
+  ssd256_internal_s16 acc;
+  int half;
+  for (half = 1; half >= 0; half--) acc.val[half] = v128_ssd_s16_init();
+  return acc;
+}
+
+/* Accumulates one step into s, half by half, via v128_ssd_s16().
+ * The return value is implementation dependent and must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 next;
+  int half;
+  for (half = 1; half >= 0; half--)
+    next.val[half] = v128_ssd_s16(s.val[half], a.val[half], b.val[half]);
+  return next;
+}
+
+/* Finalises an accumulator: the sum of both halves' v128_ssd_s16_sum(). */
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  const uint64_t hi_total = v128_ssd_s16_sum(s.val[1]);
+  const uint64_t lo_total = v128_ssd_s16_sum(s.val[0]);
+  return hi_total + lo_total;
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 0000000000..894ddee167
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+
+#if !defined(__AVX2__)
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s name mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+ defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+// Returns the lowest 32 bits of a.
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+  const __m128i low_lane = _mm256_extracti128_si256(a, 0);
+  return (uint32_t)_mm_cvtsi128_si32(low_lane);
+}
+
+// Returns the lowest 64 bits of a; the upper 64 bits of the returned
+// register are taken from v64_zero().
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+  const __m128i low_lane = _mm256_extracti128_si256(a, 0);
+  return _mm_unpacklo_epi64(low_lane, v64_zero());
+}
+
+// Returns the lowest 64 bits of a as an integer.
+SIMD_INLINE uint64_t v256_low_u64(v256 a) {
+  const v64 low = v256_low_v64(a);
+  return v64_u64(low);
+}
+
+// Returns the low 128-bit lane of a.
+SIMD_INLINE v128 v256_low_v128(v256 a) {
+  const v128 low = _mm256_castsi256_si128(a);
+  return low;
+}
+
+// Returns the high 128-bit lane of a.
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+  const v128 high = _mm256_extracti128_si256(a, 1);
+  return high;
+}
+
+// Builds a v256 with a as the high 128-bit lane and b as the low one.
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+  // gcc seems to be missing _mm256_set_m128i()
+  const v256 low_only = _mm256_castsi128_si256(b);
+  return _mm256_inserti128_si256(low_only, a, 1);
+}
+
+// Builds a v256 from four 64-bit vectors: (a, b) form the high lane and
+// (c, d) the low lane, each pair combined by v128_from_v64().
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  const v128 high = v128_from_v64(a, b);
+  const v128 low = v128_from_v64(c, d);
+  return v256_from_v128(high, low);
+}
+
+// Builds a v256 from four 64-bit integers, passed to _mm256_set_epi64x() in
+// the order given.
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  const int64_t q3 = (int64_t)a;
+  const int64_t q2 = (int64_t)b;
+  const int64_t q1 = (int64_t)c;
+  const int64_t q0 = (int64_t)d;
+  return _mm256_set_epi64x(q3, q2, q1, q0);
+}
+
+// 256-bit load through _mm256_load_si256 (the aligned-load intrinsic).
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  const __m256i *src = (const __m256i *)p;
+  return _mm256_load_si256(src);
+}
+
+// 256-bit load through _mm256_loadu_si256 (the unaligned-load intrinsic).
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  const __m256i *src = (const __m256i *)p;
+  return _mm256_loadu_si256(src);
+}
+
+// 256-bit store through _mm256_store_si256 (the aligned-store intrinsic).
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  __m256i *dst = (__m256i *)p;
+  _mm256_store_si256(dst, a);
+}
+
+// 256-bit store through _mm256_storeu_si256 (the unaligned-store intrinsic).
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  __m256i *dst = (__m256i *)p;
+  _mm256_storeu_si256(dst, a);
+}
+
+// All-zero vector.
+SIMD_INLINE v256 v256_zero(void) {
+  const v256 zero = _mm256_setzero_si256();
+  return zero;
+}
+
+// Broadcast x to every 8-bit lane.
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+  const char lane = (char)x;
+  return _mm256_set1_epi8(lane);
+}
+
+// Broadcast x to every 16-bit lane.
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+  const short lane = (short)x;
+  return _mm256_set1_epi16(lane);
+}
+
+// Broadcast x to every 32-bit lane.
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+  const int lane = (int)x;
+  return _mm256_set1_epi32(lane);
+}
+
+// Broadcast x to every 64-bit lane.
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  const int64_t lane = (int64_t)x;
+  return _mm256_set1_epi64x(lane);
+}
+
+// Lane-wise additions.  Each wrapper forwards to the AVX2 intrinsic named in
+// its body; the "sadd" variants use the saturating (adds) intrinsics.
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+  const v256 sum = _mm256_add_epi8(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+  const v256 sum = _mm256_add_epi16(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+  const v256 sum = _mm256_adds_epu8(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+  const v256 sum = _mm256_adds_epi8(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+  const v256 sum = _mm256_adds_epi16(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+  const v256 sum = _mm256_add_epi32(a, b);
+  return sum;
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+  const v256 sum = _mm256_add_epi64(a, b);
+  return sum;
+}
+
+// Pairwise add of adjacent bytes, done as a multiply-add against a vector of
+// 8-bit ones.
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  const v256 ones = _mm256_set1_epi8(1);
+  return _mm256_maddubs_epi16(a, ones);
+}
+
+// Pairwise add of adjacent 16-bit lanes, done as a multiply-add against a
+// vector of 16-bit ones.
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+  const v256 ones = _mm256_set1_epi16(1);
+  return _mm256_madd_epi16(a, ones);
+}
+
+// Lane-wise subtractions (a - b) and absolute values.  Each wrapper forwards
+// to the AVX2 intrinsic named in its body; the "ssub" variants use the
+// saturating (subs) intrinsics.
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+  const v256 diff = _mm256_sub_epi8(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+  const v256 diff = _mm256_subs_epu8(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+  const v256 diff = _mm256_subs_epi8(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+  const v256 diff = _mm256_sub_epi16(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+  const v256 diff = _mm256_subs_epi16(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+  const v256 diff = _mm256_subs_epu16(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+  const v256 diff = _mm256_sub_epi32(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+  const v256 diff = _mm256_sub_epi64(a, b);
+  return diff;
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+  const v256 magnitude = _mm256_abs_epi16(a);
+  return magnitude;
+}
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+  const v256 magnitude = _mm256_abs_epi8(a);
+  return magnitude;
+}
+
+// AVX2 has no direct intrinsics to zip/unzip the 8, 16 or 32 bit lanes of
+// the lower or upper half of a 256 bit vector, because the unpack/pack
+// intrinsics treat the 256 bit input vector as two independent 128 bit
+// vectors.
+// Interleaves of the low ("ziplo") or high ("ziphi") halves of a and b.
+// Both inputs are first permuted with _MM_SHUFFLE(3, 1, 2, 0), which puts
+// 64-bit quarters 0 and 1 in the low halves of the two 128-bit lanes and
+// quarters 2 and 3 in the high halves; the per-lane unpack then interleaves
+// b (first operand) with a (second operand).
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi8(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi8(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi16(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi16(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi32(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi32(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi64(b_spread, a_spread);
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+  const v256 b_spread = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 a_spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi64(b_spread, a_spread);
+}
+
+// Result lanes: high = low lane of a, low = low lane of b.
+// _MM_SHUFFLE(0, 0, 0, 2) is the immediate 0x02.
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+  const v256 low_lanes =
+      _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 2));
+  return low_lanes;
+}
+
+// Result lanes: high = high lane of a, low = high lane of b.
+// _MM_SHUFFLE(0, 1, 0, 3) is the immediate 0x13.
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+  const v256 high_lanes =
+      _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 3));
+  return high_lanes;
+}
+
+// Full interleaves of two 128-bit vectors into one 256-bit vector: the
+// 128-bit "ziphi" result becomes the high lane, the "ziplo" result the low
+// lane.
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+  const v128 upper = v128_ziphi_8(a, b);
+  const v128 lower = v128_ziplo_8(a, b);
+  return v256_from_v128(upper, lower);
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+  const v128 upper = v128_ziphi_16(a, b);
+  const v128 lower = v128_ziplo_16(a, b);
+  return v256_from_v128(upper, lower);
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+  const v128 upper = v128_ziphi_32(a, b);
+  const v128 lower = v128_ziplo_32(a, b);
+  return v256_from_v128(upper, lower);
+}
+
+// De-interleaves.  The pack/shuffle intrinsics work per 128-bit lane, so each
+// result is fixed up with a final _MM_SHUFFLE(3, 1, 2, 0) permute of the
+// 64-bit quarters.
+
+// Keeps the high byte of every 16-bit lane of b and a: the arithmetic shift
+// brings it down, and the signed pack then narrows it unchanged.
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  const v256 b_top = _mm256_srai_epi16(b, 8);
+  const v256 a_top = _mm256_srai_epi16(a, 8);
+  const v256 packed = _mm256_packs_epi16(b_top, a_top);
+  return _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Moves the low byte of every 16-bit lane into the high byte position (a
+// one-byte per-lane shift) and reuses v256_unziphi_8().
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+  const v256 a_moved = _mm256_slli_si256(a, 1);
+  const v256 b_moved = _mm256_slli_si256(b, 1);
+  return v256_unziphi_8(a_moved, b_moved);
+}
+
+// Keeps the high 16 bits of every 32-bit lane of b and a.
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  const v256 b_top = _mm256_srai_epi32(b, 16);
+  const v256 a_top = _mm256_srai_epi32(a, 16);
+  const v256 packed = _mm256_packs_epi32(b_top, a_top);
+  return _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Moves the low 16 bits of every 32-bit lane into the high position (a
+// two-byte per-lane shift) and reuses v256_unziphi_16().
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+  const v256 a_moved = _mm256_slli_si256(a, 2);
+  const v256 b_moved = _mm256_slli_si256(b, 2);
+  return v256_unziphi_16(a_moved, b_moved);
+}
+
+// Selects 32-bit elements 1 and 3 of each lane of b, then of a.
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  const __m256 b_ps = _mm256_castsi256_ps(b);
+  const __m256 a_ps = _mm256_castsi256_ps(a);
+  const v256 picked = _mm256_castps_si256(
+      _mm256_shuffle_ps(b_ps, a_ps, _MM_SHUFFLE(3, 1, 3, 1)));
+  return _mm256_permute4x64_epi64(picked, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Selects 32-bit elements 0 and 2 of each lane of b, then of a.
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+  const __m256 b_ps = _mm256_castsi256_ps(b);
+  const __m256 a_ps = _mm256_castsi256_ps(a);
+  const v256 picked = _mm256_castps_si256(
+      _mm256_shuffle_ps(b_ps, a_ps, _MM_SHUFFLE(2, 0, 2, 0)));
+  return _mm256_permute4x64_epi64(picked, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Selects the high 64-bit element of each lane (all four selector bits set).
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  const __m256d b_pd = _mm256_castsi256_pd(b);
+  const __m256d a_pd = _mm256_castsi256_pd(a);
+  const v256 picked = _mm256_castpd_si256(_mm256_shuffle_pd(b_pd, a_pd, 15));
+  return _mm256_permute4x64_epi64(picked, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Selects the low 64-bit element of each lane (all four selector bits clear).
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  const __m256d b_pd = _mm256_castsi256_pd(b);
+  const __m256d a_pd = _mm256_castsi256_pd(a);
+  const v256 picked = _mm256_castpd_si256(_mm256_shuffle_pd(b_pd, a_pd, 0));
+  return _mm256_permute4x64_epi64(picked, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Zero-extends the sixteen bytes of a to 16-bit lanes.
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+  const v256 widened = _mm256_cvtepu8_epi16(a);
+  return widened;
+}
+
+// Zero-extends the low sixteen bytes of a: after the quarter permute the
+// per-lane unpack against zero puts a zero byte above every source byte.
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi8(spread, _mm256_setzero_si256());
+}
+
+// As v256_unpacklo_u8_s16(), for the high sixteen bytes of a.
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi8(spread, _mm256_setzero_si256());
+}
+
+// Widens a from the two 128-bit signed unpack helpers: the "hi" result forms
+// the high lane, the "lo" result the low lane.
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+  const v128 upper = v128_unpackhi_s8_s16(a);
+  const v128 lower = v128_unpacklo_s8_s16(a);
+  return v256_from_v128(upper, lower);
+}
+
+// Sign-extends the low sixteen bytes of a.  The permuted bytes are placed in
+// the high byte of each 16-bit lane and brought down by an arithmetic shift;
+// the low bytes (taken from the unpermuted a) are shifted out.
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 in_top_byte = _mm256_unpacklo_epi8(a, spread);
+  return _mm256_srai_epi16(in_top_byte, 8);
+}
+
+// As v256_unpacklo_s8_s16(), for the high sixteen bytes of a.
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 in_top_byte = _mm256_unpackhi_epi8(a, spread);
+  return _mm256_srai_epi16(in_top_byte, 8);
+}
+
+// Narrowing packs.  b is the first operand of the pack intrinsic and a the
+// second; because the intrinsic packs per 128-bit lane, the 64-bit quarters
+// are reordered afterwards with _MM_SHUFFLE(3, 1, 2, 0).
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+  const v256 per_lane = _mm256_packs_epi32(b, a);
+  return _mm256_permute4x64_epi64(per_lane, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  const v256 per_lane = _mm256_packus_epi32(b, a);
+  return _mm256_permute4x64_epi64(per_lane, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+  const v256 per_lane = _mm256_packus_epi16(b, a);
+  return _mm256_permute4x64_epi64(per_lane, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+  const v256 per_lane = _mm256_packs_epi16(b, a);
+  return _mm256_permute4x64_epi64(per_lane, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+// Zero-extends the eight 16-bit lanes of a to 32 bits.
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+  const v256 widened = _mm256_cvtepu16_epi32(a);
+  return widened;
+}
+
+// Sign-extends the eight 16-bit lanes of a to 32 bits.
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+  const v256 widened = _mm256_cvtepi16_epi32(a);
+  return widened;
+}
+
+// Zero-extends the low eight 16-bit lanes of a: quarter permute, then a
+// per-lane unpack against zero.
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpacklo_epi16(spread, _mm256_setzero_si256());
+}
+
+// Sign-extends the low eight 16-bit lanes of a.  The permuted values are
+// placed in the upper half of each 32-bit lane and brought down by an
+// arithmetic shift; the lower halves (from the unpermuted a) are shifted out.
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 in_top_half = _mm256_unpacklo_epi16(a, spread);
+  return _mm256_srai_epi32(in_top_half, 16);
+}
+
+// As v256_unpacklo_u16_s32(), for the high eight 16-bit lanes of a.
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  return _mm256_unpackhi_epi16(spread, _mm256_setzero_si256());
+}
+
+// As v256_unpacklo_s16_s32(), for the high eight 16-bit lanes of a.
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+  const v256 spread = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+  const v256 in_top_half = _mm256_unpackhi_epi16(a, spread);
+  return _mm256_srai_epi32(in_top_half, 16);
+}
+
+// Full 32-byte table lookup.  _mm256_shuffle_epi8 only indexes within a
+// 128-bit lane, so each half of a is broadcast to both lanes, shuffled with
+// the same pattern, and the result taken from the low-half shuffle where
+// 16 > pattern (signed compare) and from the high-half shuffle otherwise.
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+  const v256 hi_both = _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+  const v256 lo_both = _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+  const v256 from_hi = _mm256_shuffle_epi8(hi_both, pattern);
+  const v256 from_lo = _mm256_shuffle_epi8(lo_both, pattern);
+  const v256 below16 = _mm256_cmpgt_epi8(v256_dup_8(16), pattern);
+  return _mm256_blendv_epi8(from_hi, from_lo, below16);
+}
+
+// 64-byte table lookup over a and b.  Each of the four 128-bit quarters is
+// broadcast to both lanes and shuffled; signed compares of pattern against
+// 16, 32 and 48 then pick the quarter: b where 32 > pattern, a otherwise.
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  const v256 k32 = v256_dup_8(32);
+  const v256 idx_a = v256_sub_8(pattern, k32);
+
+  const v256 a_hi = _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1));
+  const v256 a_lo = _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0));
+  const v256 b_hi = _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3));
+  const v256 b_lo = _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2));
+
+  // Candidates from a: low half where 48 > pattern, high half otherwise.
+  const v256 from_a = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(a_hi, idx_a), _mm256_shuffle_epi8(a_lo, idx_a),
+      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+  // Candidates from b: low half where 16 > pattern, high half otherwise.
+  const v256 from_b = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(b_hi, pattern), _mm256_shuffle_epi8(b_lo, pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+
+  return _mm256_blendv_epi8(from_a, from_b, _mm256_cmpgt_epi8(k32, pattern));
+}
+
+// Per-lane byte shuffle: a direct wrapper around _mm256_shuffle_epi8.
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  const v256 shuffled = _mm256_shuffle_epi8(a, pattern);
+  return shuffled;
+}
+
+// Dot product of the bytes of a (widened as signed) with the bytes of b
+// (widened as unsigned).  The products are accumulated in 32-bit lanes and
+// the final 32-bit total is returned sign-extended.
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  const v256 prod_hi =
+      _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+  const v256 prod_lo =
+      _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+  const v256 acc = _mm256_add_epi32(prod_hi, prod_lo);
+  // Fold 8 -> 4 -> 2 -> 1 partial sums.
+  v128 sum = _mm_add_epi32(_mm256_extracti128_si256(acc, 0),
+                           _mm256_extracti128_si256(acc, 1));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+  return (int32_t)v128_low_u32(sum);
+}
+
+// Private helper shared by v256_dotp_s16() and v256_dotp_s32(): adds the
+// eight signed 32-bit lanes of r and returns the total as a 64-bit value.
+// Each lane is sign-extended to 64 bits before any addition.
+SIMD_INLINE int64_t v256_internal_sum_s32_as_s64(v256 r) {
+#if defined(__x86_64__)
+  v128 t;
+  // Widen both lanes to 4 x 64 bits and add them.
+  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
+  // The permute brings the high lane down (and zeroes the upper lane), so the
+  // low lane of the sum holds two 64-bit partial totals.
+  t = v256_low_v128(_mm256_add_epi64(
+      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+  // This branch avoids _mm_cvtsi128_si64 and adds the eight lanes one at a
+  // time in scalar code.
+  v128 l = v256_low_v128(r);
+  v128 h = v256_high_v128(r);
+  return (int64_t)_mm_cvtsi128_si32(l) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+         (int64_t)_mm_cvtsi128_si32(h) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+// Dot product of the signed 16-bit lanes of a and b.  _mm256_madd_epi16
+// multiplies and adds adjacent pairs into 32-bit lanes, which are then
+// summed in 64 bits.
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+  return v256_internal_sum_s32_as_s64(_mm256_madd_epi16(a, b));
+}
+
+// Dot product of the signed 32-bit lanes of a and b.  Each product is taken
+// with _mm256_mullo_epi32 (its low 32 bits only) before the 64-bit summation.
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return v256_internal_sum_s32_as_s64(_mm256_mullo_epi32(a, b));
+}
+
+// Sum of all 32 unsigned bytes of a.
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+  // A SAD against zero leaves one partial byte sum in each 64-bit quarter.
+  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+  v128 lo = v256_low_v128(t);
+  v128 hi = v256_high_v128(t);
+  lo = v128_add_32(lo, hi);
+  // Both halves of lo are v64 values, so read each with the v64 accessor
+  // (the original used the v128 accessor on the second one).
+  return v64_low_u32(v128_low_v64(lo)) + v64_low_u32(v128_high_v64(lo));
+}
+
+// Accumulator type for v256_sad_u8().
+typedef v256 sad256_internal;
+
+// Returns a zeroed accumulator.
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
+  const sad256_internal cleared = _mm256_setzero_si256();
+  return cleared;
+}
+
+/* Adds the _mm256_sad_epu8 result for (a, b) to the accumulator s.
+   The return value is implementation dependent and must be finalised with
+   v256_sad_u8_sum().
+   The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+  const v256 step = _mm256_sad_epu8(a, b);
+  return _mm256_add_epi64(s, step);
+}
+
+// Finalises a v256_sad_u8() accumulator: folds the four 64-bit partial sums
+// with 32-bit adds and returns the low 32 bits of the total.
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+  const v256 odd_quarters = _mm256_unpackhi_epi64(s, s);
+  const v256 folded = _mm256_add_epi32(s, odd_quarters);
+  const v128 total =
+      _mm_add_epi32(v256_high_v128(folded), v256_low_v128(folded));
+  return v128_low_u32(total);
+}
+
+// Accumulator type for v256_ssd_u8().
+typedef v256 ssd256_internal;
+
+// Returns a zeroed accumulator.
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
+  const ssd256_internal cleared = _mm256_setzero_si256();
+  return cleared;
+}
+
+/* Adds the squared byte differences of a and b to the accumulator s.
+ * The return value is implementation dependent and must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+  const v256 zero = _mm256_setzero_si256();
+  const v128 k32 = _mm_cvtsi32_si128(32);
+  // Widen the bytes to 16 bits (per lane) and take the differences.
+  const v256 diff_lo = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, zero),
+                                        _mm256_unpacklo_epi8(b, zero));
+  const v256 diff_hi = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, zero),
+                                        _mm256_unpackhi_epi8(b, zero));
+  // Square and pair-add into 32-bit lanes.
+  v256 sq_lo = _mm256_madd_epi16(diff_lo, diff_lo);
+  v256 sq_hi = _mm256_madd_epi16(diff_hi, diff_hi);
+  v256 partial;
+  // Fold each 128-bit lane so its lowest 32 bits hold the lane total.
+  sq_lo = _mm256_add_epi32(sq_lo, _mm256_srli_si256(sq_lo, 8));
+  sq_lo = _mm256_add_epi32(sq_lo, _mm256_srli_si256(sq_lo, 4));
+  sq_hi = _mm256_add_epi32(sq_hi, _mm256_srli_si256(sq_hi, 8));
+  sq_hi = _mm256_add_epi32(sq_hi, _mm256_srli_si256(sq_hi, 4));
+  // Keep only the low 32 bits of each 64-bit quarter (shift up, then down)
+  // before the 64-bit accumulation.
+  partial = _mm256_unpacklo_epi64(sq_lo, sq_hi);
+  partial = _mm256_srl_epi64(_mm256_sll_epi64(partial, k32), k32);
+  return _mm256_add_epi64(s, partial);
+}
+
+// Finalises a v256_ssd_u8() accumulator: folds the four 64-bit partial sums
+// with 32-bit adds and returns the low 32 bits of the total.
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+  const v256 odd_quarters = _mm256_unpackhi_epi64(s, s);
+  const v256 folded = _mm256_add_epi32(s, odd_quarters);
+  const v128 total =
+      _mm_add_epi32(v256_high_v128(folded), v256_low_v128(folded));
+  return v128_low_u32(total);
+}
+
+// Bitwise OR.
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+  const v256 bits = _mm256_or_si256(a, b);
+  return bits;
+}
+
+// Bitwise XOR.
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+  const v256 bits = _mm256_xor_si256(a, b);
+  return bits;
+}
+
+// Bitwise AND.
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+  const v256 bits = _mm256_and_si256(a, b);
+  return bits;
+}
+
+// a AND NOT b.  Note the swapped operands: _mm256_andnot_si256 inverts its
+// first argument.
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+  const v256 bits = _mm256_andnot_si256(b, a);
+  return bits;
+}
+
+// Widening multiply: the low and high 16 bits of each product are computed
+// separately and interleaved (high word above low word) into 32-bit lanes.
+SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
+  const v128 low_words = v128_mullo_s16(a, b);
+  const v128 high_words = v128_mulhi_s16(a, b);
+  const v128 upper = v128_ziphi_16(high_words, low_words);
+  const v128 lower = v128_ziplo_16(high_words, low_words);
+  return v256_from_v128(upper, lower);
+}
+
+// Low 16 bits of each 16-bit product.
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+  const v256 product = _mm256_mullo_epi16(a, b);
+  return product;
+}
+
+// High 16 bits of each signed 16-bit product.
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+  const v256 product = _mm256_mulhi_epi16(a, b);
+  return product;
+}
+
+// Low 32 bits of each 32-bit product.
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+  const v256 product = _mm256_mullo_epi32(a, b);
+  return product;
+}
+
+// Wrapper around _mm256_madd_epi16.
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+  const v256 product = _mm256_madd_epi16(a, b);
+  return product;
+}
+
+// Wrapper around _mm256_maddubs_epi16 (a is the first operand).
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+  const v256 product = _mm256_maddubs_epi16(a, b);
+  return product;
+}
+
+// Wrapper around _mm256_avg_epu8.
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+  const v256 mean = _mm256_avg_epu8(a, b);
+  return mean;
+}
+
+// _mm256_avg_epu8 result minus ((a ^ b) & 1) in every byte, i.e. one less
+// wherever a and b differ in their lowest bit.
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+  const v256 mean = _mm256_avg_epu8(a, b);
+  const v256 correction =
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1));
+  return _mm256_sub_epi8(mean, correction);
+}
+
+// 16-bit counterpart of v256_rdavg_u8().
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  const v256 mean = _mm256_avg_epu16(a, b);
+  const v256 correction =
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1));
+  return _mm256_sub_epi16(mean, correction);
+}
+
+// Wrapper around _mm256_avg_epu16.
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+  const v256 mean = _mm256_avg_epu16(a, b);
+  return mean;
+}
+
+// Lane-wise minimum/maximum, byte mask extraction and byte blend.  Each
+// wrapper forwards to the AVX2 intrinsic named in its body.
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+  const v256 picked = _mm256_min_epu8(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+  const v256 picked = _mm256_max_epu8(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+  const v256 picked = _mm256_min_epi8(a, b);
+  return picked;
+}
+
+// One bit per byte of a, as produced by _mm256_movemask_epi8.
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+  const int mask = _mm256_movemask_epi8(a);
+  return (uint32_t)mask;
+}
+
+// Byte-wise select between a and b under control of c via
+// _mm256_blendv_epi8.
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  const v256 picked = _mm256_blendv_epi8(a, b, c);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+  const v256 picked = _mm256_max_epi8(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+  const v256 picked = _mm256_min_epi16(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+  const v256 picked = _mm256_max_epi16(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+  const v256 picked = _mm256_min_epi32(a, b);
+  return picked;
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+  const v256 picked = _mm256_max_epi32(a, b);
+  return picked;
+}
+
+// Lane-wise compares.  The "cmplt" variants are the signed greater-than
+// intrinsic with its operands swapped.
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi8(a, b);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi8(b, a);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpeq_epi8(a, b);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi16(a, b);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi16(b, a);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpeq_epi16(a, b);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi32(a, b);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpgt_epi32(b, a);
+  return mask;
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  const v256 mask = _mm256_cmpeq_epi32(a, b);
+  return mask;
+}
+
+// Variable-count lane shifts.  The count c is passed to the intrinsics in
+// the low bits of a 128-bit register.
+
+// 8-bit left shift, built from a 16-bit shift plus a mask that clears the
+// bits shifted in from the neighbouring byte.
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+  const v256 keep = _mm256_set1_epi8((char)(0xff << c));
+  const v256 shifted = _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
+  return _mm256_and_si256(keep, shifted);
+}
+
+// 8-bit logical right shift, built like v256_shl_8().
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+  const v256 keep = _mm256_set1_epi8((char)(0xff >> c));
+  const v256 shifted = _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
+  return _mm256_and_si256(keep, shifted);
+}
+
+// 8-bit arithmetic right shift.  Every byte is duplicated into a 16-bit
+// lane, shifted right arithmetically by c + 8, and the results are packed
+// back to bytes.
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)(c + 8));
+  const v256 lo_words = _mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), count);
+  const v256 hi_words = _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), count);
+  return _mm256_packs_epi16(lo_words, hi_words);
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sll_epi16(a, count);
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_srl_epi16(a, count);
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sra_epi16(a, count);
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sll_epi32(a, count);
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_srl_epi32(a, count);
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sra_epi32(a, count);
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sll_epi64(a, count);
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_srl_epi64(a, count);
+}
+
+// 64-bit arithmetic right shift.  _mm256_sra_epi64 is used only when
+// __AVX512VL__ is defined; otherwise each 128-bit lane goes through
+// v128_shr_s64().
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512VL__)
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm256_sra_epi64(a, count);
+#else
+  const v128 upper = v128_shr_s64(v256_high_v128(a), c);
+  const v128 lower = v128_shr_s64(v256_low_v128(a), c);
+  return v256_from_v128(upper, lower);
+#endif
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+// _mm256_slli_si256 works on 128 bit lanes and can't be used
+/* Whole-vector left shift by n bytes.  For n < 16 the high lane is built
+   with v128_align() from the high and low lanes (offset 16 - n) and the low
+   lane is the low lane shifted left by n.  Otherwise the low lane is zero
+   and the high lane is the low lane shifted left by n - 16. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128( \
+ v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+ v128_shl_n_byte(v256_low_v128(a), n)) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
+
+// _mm256_srli_si256 works on 128 bit lanes and can't be used
+/* Whole-vector right shift by n bytes.  For n < 16 a per-lane alignr is
+   applied to a and a copy of a whose low lane holds a's high lane and whose
+   high lane is zero (permute control _MM_SHUFFLE(2, 0, 0, 1) == 0x81).
+   n == 16 moves the high lane to the low lane and zeroes the high lane.
+   For n > 16 the low lane is the high lane shifted right by n - 16 and the
+   high lane is zero. */
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 \
+ ? _mm256_alignr_epi8( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+ : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_shr_n_byte(v256_high_v128(a), (n)-16), 0)))
+
+// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
+/* Combines b shifted right by c bytes with a shifted left by 32 - c bytes
+   (OR-ed together).  c == 0 yields b unchanged, which avoids requesting a
+   32-byte shift of a. */
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+/* Immediate-count lane shifts.  The 8-bit variants have no direct intrinsic:
+   the unsigned ones shift 16-bit lanes and mask off the bits that crossed a
+   byte boundary; the signed one duplicates each byte into a 16-bit lane,
+   shifts arithmetically by c + 8 and packs back to bytes. */
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \
+ _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
+ _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c) \
+ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+/* The signed 64-bit case falls back to the variable-count function. */
+#define v256_shr_n_s64(a, c) \
+ v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc?
+/* Whole-vector shifts counted in 16-bit words: n words are 2 * n bytes. */
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+// Accumulator type for v256_sad_u16().
+typedef v256 sad256_internal_u16;
+
+// Returns a zeroed accumulator.
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
+  const sad256_internal_u16 cleared = v256_zero();
+  return cleared;
+}
+
+/* Adds the absolute differences of the unsigned 16-bit lanes of a and b to
+ * the 32-bit lanes of the accumulator s.
+ * The return value is implementation dependent and must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  v256 abs_diff;
+#if defined(__SSE4_1__)
+  // max - min of the unsigned lanes.
+  abs_diff = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
+  // Flipping the top bit turns the signed compare into an unsigned one; the
+  // resulting mask then selects the larger and smaller operand per lane.
+  abs_diff = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+                            v256_xor(b, v256_dup_16(32768)));
+  abs_diff = v256_sub_16(
+      v256_or(v256_and(b, abs_diff), v256_andn(a, abs_diff)),
+      v256_or(v256_and(a, abs_diff), v256_andn(b, abs_diff)));
+#endif
+  // Zero-extend to 32 bits before accumulating.
+  return v256_add_32(s, v256_add_32(v256_unpackhi_u16_s32(abs_diff),
+                                    v256_unpacklo_u16_s32(abs_diff)));
+}
+
+// Finalises a v256_sad_u16() accumulator: adds the two 128-bit lanes, then
+// the four resulting 32-bit lanes.
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  const v128 lanes = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+  const uint32_t e0 = v128_low_u32(lanes);
+  const uint32_t e1 = v128_low_u32(v128_shr_n_byte(lanes, 4));
+  const uint32_t e2 = v128_low_u32(v128_shr_n_byte(lanes, 8));
+  const uint32_t e3 = v128_low_u32(v128_shr_n_byte(lanes, 12));
+  return e0 + e1 + e2 + e3;
+}
+
+// Accumulator type for v256_ssd_s16().
+typedef v256 ssd256_internal_s16;
+
+// Returns a zeroed accumulator.
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
+  const ssd256_internal_s16 cleared = v256_zero();
+  return cleared;
+}
+
+/* Adds the squared differences of the 16-bit lanes of a and b to the 64-bit
+ * lanes of the accumulator s.
+ * The return value is implementation dependent and must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  const v256 diff = v256_sub_16(a, b);
+  // Square and pair-add into 32-bit lanes.
+  const v256 squares = v256_madd_s16(diff, diff);
+  // Interleave with zero to widen the 32-bit lanes to 64 bits.
+  const v256 wide_hi = _mm256_unpackhi_epi32(squares, v256_zero());
+  const v256 wide_lo = _mm256_unpacklo_epi32(squares, v256_zero());
+  return v256_add_64(s, v256_add_64(wide_hi, wide_lo));
+}
+
+// Finalises a v256_ssd_s16() accumulator: adds the two 128-bit lanes, then
+// the two resulting 64-bit halves.
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  const v128 lanes = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+  const uint64_t low_part = v64_u64(v128_low_v64(lanes));
+  const uint64_t high_part = v64_u64(v128_high_v64(lanes));
+  return low_part + high_part;
+}
+
+#endif
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
new file mode 100644
index 0000000000..7079949cd8
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
+
+/* In this fallback header v64 is the plain C vector type, and every v64_*
+ * function below forwards to the c_v64_* function of the same name. */
+typedef c_v64 v64;
+
+/* Extraction of 32-bit halves and construction from scalars. */
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return c_v64_from_16(a, b, c, d);
+}
+
+/* 32-bit scalar loads and stores; each forwards to the c_u32_* function of
+ * the same name. */
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ c_u32_store_aligned(p, a);
+}
+
+/* 64-bit vector loads, stores and byte alignment; each forwards to the
+ * c_v64_* function of the same name. */
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+ return c_v64_align(a, b, c);
+}
+
+/* Constant and broadcast constructors; forwarded to the C implementation. */
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+/* Lane-wise add, subtract and absolute value; forwarded to the C
+ * implementation. */
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+/* Lane permutations (zip, unzip, shuffle) and width conversions (unpack,
+ * pack); forwarded to the C implementation. */
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ return c_v64_pack_s32_s16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+ return c_v64_pack_s32_u16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ return c_v64_pack_s16_u8(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ return c_v64_pack_s16_s8(a, b);
+}
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return c_v64_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return c_v64_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return c_v64_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return c_v64_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
+ return c_v64_shuffle_8(a, pattern);
+}
+
+/* Reductions: sum of absolute differences, sum of squared differences, dot
+ * products and horizontal adds. The sad/ssd functions are used as a triple:
+ * _init() creates the accumulator, the middle function updates it and
+ * _sum() extracts the total. All forward to the C implementation. */
+SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) {
+ return c_v64_sad_u8_init();
+}
+SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) {
+ return c_v64_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) {
+ return c_v64_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) {
+ return c_v64_ssd_u8_init();
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) {
+ return c_v64_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) {
+ return c_v64_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
+
+/* Bitwise logic on the whole 64-bit value; forwarded to the C
+ * implementation. */
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
+
+/* Lane-wise multiplies and multiply-adds; forwarded to the C
+ * implementation. */
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
+
+/* Lane-wise averages, minima and maxima; forwarded to the C
+ * implementation. */
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
+
+/* Lane-wise comparisons producing per-lane masks; forwarded to the C
+ * implementation. */
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
+
+/* Lane-wise bit shifts, whole-vector byte shifts and the *_n_* shift
+ * variants; each forwards to the c_v64_* function of the same name. */
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
+ return c_v64_shr_u16(a, n);
+}
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
+ return c_v64_shr_s16(a, n);
+}
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
+ return c_v64_shr_u32(a, n);
+}
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
+ return c_v64_shr_s32(a, n);
+}
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
+ return c_v64_shr_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
+ return c_v64_shl_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return c_v64_shl_n_8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return c_v64_shr_n_u8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return c_v64_shr_n_s8(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return c_v64_shl_n_16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return c_v64_shr_n_u16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return c_v64_shr_n_s16(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return c_v64_shl_n_32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return c_v64_shr_n_u32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return c_v64_shr_n_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
new file mode 100644
index 0000000000..bfd6fe0710
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -0,0 +1,966 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+
+/* Note: This implements the intrinsics in plain, unoptimised C.
+ Intended for reference, porting or debugging. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+/* 64-bit vector modelled as a union so the same storage can be accessed as
+ * 8-, 16-, 32- or 64-bit lanes, unsigned (u*) or signed (s*). Which array
+ * index holds the least significant lane depends on the host byte order;
+ * the functions below consult CONFIG_BIG_ENDIAN where that matters. */
+typedef union {
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+ uint64_t u64;
+ int8_t s8[8];
+ int16_t s16[4];
+ int32_t s32[2];
+ int64_t s64;
+} c_v64;
+
+/* Returns the low 32-bit lane as unsigned: u32[1] when CONFIG_BIG_ENDIAN is
+ * set, u32[0] otherwise. */
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+  const int low_idx = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return a.u32[low_idx];
+}
+
+/* Returns the high 32-bit lane as unsigned: u32[0] when CONFIG_BIG_ENDIAN is
+ * set, u32[1] otherwise. */
+SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
+  const int high_idx = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return a.u32[high_idx];
+}
+
+/* Returns the low 32-bit lane as signed: s32[1] when CONFIG_BIG_ENDIAN is
+ * set, s32[0] otherwise. */
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+  const int low_idx = CONFIG_BIG_ENDIAN ? 1 : 0;
+  return a.s32[low_idx];
+}
+
+/* Returns the high 32-bit lane as signed: s32[0] when CONFIG_BIG_ENDIAN is
+ * set, s32[1] otherwise. */
+SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
+  const int high_idx = CONFIG_BIG_ENDIAN ? 0 : 1;
+  return a.s32[high_idx];
+}
+
+/* Builds a vector with x in the high 32-bit lane and y in the low one. */
+SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
+  const int high_idx = CONFIG_BIG_ENDIAN ? 0 : 1;
+  c_v64 r;
+  r.u32[high_idx] = x;
+  r.u32[1 - high_idx] = y;
+  return r;
+}
+
+/* Builds a vector whose 64-bit view equals x. */
+SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
+ c_v64 t;
+ t.u64 = x;
+ return t;
+}
+
+/* Returns the vector's 64-bit view as an unsigned integer. */
+SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
+
+/* Builds a vector from four 16-bit values, a being the most significant lane
+ * and d the least significant one. */
+SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
+                                uint16_t d) {
+  /* Values listed from least to most significant lane. */
+  const uint16_t ordered[4] = { d, c, b, a };
+  c_v64 r;
+  int i;
+  /* With CONFIG_BIG_ENDIAN the least significant lane is array index 3. */
+  for (i = 0; i < 4; i++) r.u16[CONFIG_BIG_ENDIAN ? 3 - i : i] = ordered[i];
+  return r;
+}
+
+/* Reads 4 bytes from p into a uint32_t one byte at a time, so p does not
+ * need any particular alignment. The source is accessed through a
+ * const-qualified pointer; it is never written. */
+SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
+  uint32_t t;
+  const uint8_t *pp = (const uint8_t *)p;
+  uint8_t *q = (uint8_t *)&t;
+  int c;
+  for (c = 0; c < 4; c++) q[c] = pp[c];
+  return t;
+}
+
+/* Writes the 4 bytes of a to p one byte at a time, so p does not need any
+ * particular alignment. */
+SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
+  uint8_t *dst = (uint8_t *)p;
+  uint8_t *src = (uint8_t *)&a;
+  int i = 0;
+  while (i < 4) {
+    dst[i] = src[i];
+    ++i;
+  }
+}
+
+/* Loads a uint32_t from p. When SIMD_CHECK is enabled, an address that is
+ * not a multiple of 4 is reported on stderr and the program aborts. */
+SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
+  const int misaligned = ((uintptr_t)p & 3) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
+    abort();
+  }
+  return c_u32_load_unaligned(p);
+}
+
+/* Stores a to p. When SIMD_CHECK is enabled, an address that is not a
+ * multiple of 4 is reported on stderr and the program aborts. */
+SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
+  const int misaligned = ((uintptr_t)p & 3) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
+    abort();
+  }
+  c_u32_store_unaligned(p, a);
+}
+
+/* Reads 8 bytes from p into a c_v64 one byte at a time, so p does not need
+ * any particular alignment. The source is accessed through a
+ * const-qualified pointer; it is never written. */
+SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
+  c_v64 t;
+  const uint8_t *pp = (const uint8_t *)p;
+  uint8_t *q = (uint8_t *)&t;
+  int c;
+  for (c = 0; c < 8; c++) q[c] = pp[c];
+  return t;
+}
+
+/* Loads a c_v64 from p. When SIMD_CHECK is enabled, an address that is not
+ * a multiple of 8 is reported on stderr and the program aborts. */
+SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
+  const int misaligned = ((uintptr_t)p & 7) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
+    abort();
+  }
+  return c_v64_load_unaligned(p);
+}
+
+/* Writes the 8 bytes of a to p one byte at a time, so p does not need any
+ * particular alignment. */
+SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
+  uint8_t *dst = (uint8_t *)p;
+  uint8_t *src = (uint8_t *)&a;
+  int i = 0;
+  while (i < 8) {
+    dst[i] = src[i];
+    ++i;
+  }
+}
+
+/* Stores a to p. When SIMD_CHECK is enabled, an address that is not a
+ * multiple of 8 is reported on stderr and the program aborts. */
+SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
+  const int misaligned = ((uintptr_t)p & 7) != 0;
+  if (SIMD_CHECK && misaligned) {
+    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
+    abort();
+  }
+  c_v64_store_unaligned(p, a);
+}
+
+/* Returns a vector with all 64 bits clear. */
+SIMD_INLINE c_v64 c_v64_zero(void) {
+  c_v64 r;
+  r.u64 = 0;
+  return r;
+}
+
+/* Returns a vector with x in each of the eight 8-bit lanes. */
+SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.u8[i] = x;
+  return r;
+}
+
+/* Returns a vector with x in each of the four 16-bit lanes. */
+SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.u16[i] = x;
+  return r;
+}
+
+/* Returns a vector with x in both 32-bit lanes. */
+SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
+  c_v64 r;
+  r.u32[1] = x;
+  r.u32[0] = x;
+  return r;
+}
+
+/* Adds the 8-bit lanes of a and b; each result is truncated to 8 bits. */
+SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int sum = a.u8[i] + b.u8[i];
+    r.u8[i] = (uint8_t)sum;
+  }
+  return r;
+}
+
+/* Adds the 16-bit lanes of a and b; each result is truncated to 16 bits. */
+SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int sum = a.u16[i] + b.u16[i];
+    r.u16[i] = (uint16_t)sum;
+  }
+  return r;
+}
+
+/* Adds the unsigned 8-bit lanes of a and b, clamping each sum with
+ * SIMD_CLAMP to [0, 255]. */
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int sum = (int16_t)a.u8[i] + (int16_t)b.u8[i];
+    r.u8[i] = SIMD_CLAMP(sum, 0, 255);
+  }
+  return r;
+}
+
+/* Adds the signed 8-bit lanes of a and b, clamping each sum with SIMD_CLAMP
+ * to [-128, 127]. */
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int sum = (int16_t)a.s8[i] + (int16_t)b.s8[i];
+    r.s8[i] = SIMD_CLAMP(sum, -128, 127);
+  }
+  return r;
+}
+
+/* Adds the signed 16-bit lanes of a and b, clamping each sum with
+ * SIMD_CLAMP to [-32768, 32767]. */
+SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int32_t sum = (int32_t)a.s16[i] + (int32_t)b.s16[i];
+    r.s16[i] = SIMD_CLAMP(sum, -32768, 32767);
+  }
+  return r;
+}
+
+/* Adds the two 32-bit lanes of a and b; each sum is formed in 64 bits and
+ * truncated back to 32 bits. */
+SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 2; i++) {
+    const uint64_t wide = (uint64_t)a.u32[i] + b.u32[i];
+    r.u32[i] = (uint32_t)wide;
+  }
+  return r;
+}
+
+/* Subtracts the 8-bit lanes of b from a; results are truncated to 8 bits. */
+SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int diff = a.u8[i] - b.u8[i];
+    r.u8[i] = (uint8_t)diff;
+  }
+  return r;
+}
+
+/* Subtracts the unsigned 8-bit lanes of b from a; a lane that would go
+ * below zero yields 0. */
+SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int diff = a.u8[i] - b.u8[i];
+    r.u8[i] = diff < 0 ? 0 : diff;
+  }
+  return r;
+}
+
+/* Subtracts the signed 8-bit lanes of b from a, clamping each difference
+ * with SIMD_CLAMP to [-128, 127]. */
+SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int16_t diff = (int16_t)a.s8[i] - (int16_t)b.s8[i];
+    r.s8[i] = SIMD_CLAMP(diff, -128, 127);
+  }
+  return r;
+}
+
+/* Subtracts the 16-bit lanes of b from a; results are truncated to 16
+ * bits. */
+SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int diff = a.u16[i] - b.u16[i];
+    r.u16[i] = (uint16_t)diff;
+  }
+  return r;
+}
+
+/* Subtracts the signed 16-bit lanes of b from a, clamping each difference
+ * with SIMD_CLAMP to [-32768, 32767]. */
+SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int32_t diff = (int32_t)a.s16[i] - (int32_t)b.s16[i];
+    r.s16[i] = SIMD_CLAMP(diff, -32768, 32767);
+  }
+  return r;
+}
+
+/* Subtracts the unsigned 16-bit lanes of b from a; a lane that would go
+ * below zero yields 0. */
+SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int32_t diff = (int32_t)a.u16[i] - (int32_t)b.u16[i];
+    r.u16[i] = diff < 0 ? 0 : diff;
+  }
+  return r;
+}
+
+/* Subtracts the two 32-bit lanes of b from a; each difference is formed in
+ * 64 bits and truncated back to 32 bits. */
+SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 2; i++) {
+    const int64_t wide = (int64_t)a.u32[i] - b.u32[i];
+    r.u32[i] = (uint32_t)wide;
+  }
+  return r;
+}
+
+/* Replaces each 16-bit lane, read as signed, by its magnitude. The negation
+ * is done on the promoted value and truncated back to 16 bits. */
+SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int raw = a.u16[i];
+    r.u16[i] = (uint16_t)((int16_t)a.u16[i] > 0 ? raw : -raw);
+  }
+  return r;
+}
+
+/* Replaces each 8-bit lane, read as signed, by its magnitude. The negation
+ * is done on the promoted value and truncated back to 8 bits. */
+SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int raw = a.u8[i];
+    r.u8[i] = (uint8_t)((int8_t)a.u8[i] > 0 ? raw : -raw);
+  }
+  return r;
+}
+
+/* Interleaves four bytes of a with four bytes of b: result index 2k + 1
+ * comes from a and index 2k from b. A non-zero mode takes source indices
+ * 4..7, mode 0 takes indices 0..3. */
+SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
+  const int base = mode ? 4 : 0;
+  c_v64 r;
+  int k;
+  for (k = 0; k < 4; k++) {
+    r.u8[2 * k + 1] = a.u8[base + k];
+    r.u8[2 * k] = b.u8[base + k];
+  }
+  return r;
+}
+
+/* Interleaves the low halves of a and b; operand order and array half are
+ * swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_8(b, a, 1);
+  return _c_v64_zip_8(a, b, 0);
+}
+
+/* Interleaves the high halves of a and b; operand order and array half are
+ * swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_8(b, a, 0);
+  return _c_v64_zip_8(a, b, 1);
+}
+
+/* Interleaves two 16-bit lanes of a with two of b: result index 2k + 1
+ * comes from a and index 2k from b. A non-zero mode takes source indices
+ * 2..3, mode 0 takes indices 0..1. */
+SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
+  const int base = mode ? 2 : 0;
+  c_v64 r;
+  int k;
+  for (k = 0; k < 2; k++) {
+    r.u16[2 * k + 1] = a.u16[base + k];
+    r.u16[2 * k] = b.u16[base + k];
+  }
+  return r;
+}
+
+/* Interleaves the low halves of a and b; operand order and array half are
+ * swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_16(b, a, 1);
+  return _c_v64_zip_16(a, b, 0);
+}
+
+/* Interleaves the high halves of a and b; operand order and array half are
+ * swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_16(b, a, 0);
+  return _c_v64_zip_16(a, b, 1);
+}
+
+/* Combines one 32-bit lane of a (into result index 1) with the same lane of
+ * b (into result index 0). A non-zero mode picks source index 1, mode 0
+ * picks index 0. */
+SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
+  const int src = mode ? 1 : 0;
+  c_v64 r;
+  r.u32[1] = a.u32[src];
+  r.u32[0] = b.u32[src];
+  return r;
+}
+
+/* Combines the low 32-bit lanes of a and b; operand order and lane index
+ * are swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_32(b, a, 1);
+  return _c_v64_zip_32(a, b, 0);
+}
+
+/* Combines the high 32-bit lanes of a and b; operand order and lane index
+ * are swapped when CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_zip_32(b, a, 0);
+  return _c_v64_zip_32(a, b, 1);
+}
+
+/* De-interleaves bytes. With a non-zero mode the odd-indexed bytes are
+ * gathered: those of b fill result indices 4..7 and those of a fill 0..3.
+ * With mode 0 the even-indexed bytes are gathered: those of a fill 4..7 and
+ * those of b fill 0..3. */
+SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
+  const c_v64 upper_src = mode ? b : a;
+  const c_v64 lower_src = mode ? a : b;
+  const int parity = mode ? 1 : 0;
+  c_v64 r;
+  int k;
+  for (k = 0; k < 4; k++) {
+    r.u8[4 + k] = upper_src.u8[2 * k + parity];
+    r.u8[k] = lower_src.u8[2 * k + parity];
+  }
+  return r;
+}
+
+/* Low de-interleave of a and b; only the mode depends on
+ * CONFIG_BIG_ENDIAN. */
+SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_unzip_8(a, b, 1);
+  return _c_v64_unzip_8(a, b, 0);
+}
+
+/* High de-interleave of a and b (operands passed swapped); only the mode
+ * depends on CONFIG_BIG_ENDIAN. */
+SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_unzip_8(b, a, 0);
+  return _c_v64_unzip_8(b, a, 1);
+}
+
+/* De-interleaves 16-bit lanes. With a non-zero mode the odd-indexed lanes
+ * are gathered: those of b fill result indices 2..3 and those of a fill
+ * 0..1. With mode 0 the even-indexed lanes are gathered: those of a fill
+ * 2..3 and those of b fill 0..1. */
+SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
+  const c_v64 upper_src = mode ? b : a;
+  const c_v64 lower_src = mode ? a : b;
+  const int parity = mode ? 1 : 0;
+  c_v64 r;
+  int k;
+  for (k = 0; k < 2; k++) {
+    r.u16[2 + k] = upper_src.u16[2 * k + parity];
+    r.u16[k] = lower_src.u16[2 * k + parity];
+  }
+  return r;
+}
+
+/* Low de-interleave of a and b; only the mode depends on
+ * CONFIG_BIG_ENDIAN. */
+SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_unzip_16(a, b, 1);
+  return _c_v64_unzip_16(a, b, 0);
+}
+
+/* High de-interleave of a and b (operands passed swapped); only the mode
+ * depends on CONFIG_BIG_ENDIAN. */
+SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
+  if (CONFIG_BIG_ENDIAN) return _c_v64_unzip_16(b, a, 0);
+  return _c_v64_unzip_16(b, a, 1);
+}
+
+/* Widens the low four unsigned bytes of a to 16-bit lanes (array indices
+ * 0..3, or 4..7 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 4 : 0;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = (int16_t)a.u8[first + i];
+  return r;
+}
+
+/* Widens the high four unsigned bytes of a to 16-bit lanes (array indices
+ * 4..7, or 0..3 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 0 : 4;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = (int16_t)a.u8[first + i];
+  return r;
+}
+
+/* Sign-extends the low four signed bytes of a to 16-bit lanes (array
+ * indices 0..3, or 4..7 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 4 : 0;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = (int16_t)a.s8[first + i];
+  return r;
+}
+
+/* Sign-extends the high four signed bytes of a to 16-bit lanes (array
+ * indices 4..7, or 0..3 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 0 : 4;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = (int16_t)a.s8[first + i];
+  return r;
+}
+
+/* Narrows the signed 32-bit lanes of a and b to signed 16-bit lanes, clamped
+ * with SIMD_CLAMP to [-32768, 32767]. The lanes of a fill result indices
+ * 2..3 and those of b fill 0..1; the operands trade places when
+ * CONFIG_BIG_ENDIAN is set. */
+SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
+  const c_v64 upper = CONFIG_BIG_ENDIAN ? b : a;
+  const c_v64 lower = CONFIG_BIG_ENDIAN ? a : b;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 2; i++) {
+    r.s16[2 + i] = SIMD_CLAMP(upper.s32[i], -32768, 32767);
+    r.s16[i] = SIMD_CLAMP(lower.s32[i], -32768, 32767);
+  }
+  return r;
+}
+
+/* Narrows the signed 32-bit lanes of a and b to unsigned 16-bit lanes,
+ * clamped with SIMD_CLAMP to [0, 65535]. Lane placement is the same as in
+ * c_v64_pack_s32_s16(). */
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+  const c_v64 upper = CONFIG_BIG_ENDIAN ? b : a;
+  const c_v64 lower = CONFIG_BIG_ENDIAN ? a : b;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 2; i++) {
+    r.u16[2 + i] = SIMD_CLAMP(upper.s32[i], 0, 65535);
+    r.u16[i] = SIMD_CLAMP(lower.s32[i], 0, 65535);
+  }
+  return r;
+}
+
+/* Narrows the signed 16-bit lanes of a and b to unsigned bytes, clamped
+ * with SIMD_CLAMP to [0, 255]. The lanes of a fill result indices 4..7 and
+ * those of b fill 0..3; the operands trade places when CONFIG_BIG_ENDIAN is
+ * set. */
+SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
+  const c_v64 upper = CONFIG_BIG_ENDIAN ? b : a;
+  const c_v64 lower = CONFIG_BIG_ENDIAN ? a : b;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    r.u8[4 + i] = SIMD_CLAMP(upper.s16[i], 0, 255);
+    r.u8[i] = SIMD_CLAMP(lower.s16[i], 0, 255);
+  }
+  return r;
+}
+
+/* Narrows the signed 16-bit lanes of a and b to signed bytes, clamped with
+ * SIMD_CLAMP to [-128, 127]. Lane placement is the same as in
+ * c_v64_pack_s16_u8(). */
+SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
+  const c_v64 upper = CONFIG_BIG_ENDIAN ? b : a;
+  const c_v64 lower = CONFIG_BIG_ENDIAN ? a : b;
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    r.s8[4 + i] = SIMD_CLAMP(upper.s16[i], -128, 127);
+    r.s8[i] = SIMD_CLAMP(lower.s16[i], -128, 127);
+  }
+  return r;
+}
+
+/* Widens the low two unsigned 16-bit lanes of a to 32-bit lanes (array
+ * indices 0..1, or 2..3 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 2 : 0;
+  c_v64 r;
+  r.s32[1] = a.u16[first + 1];
+  r.s32[0] = a.u16[first];
+  return r;
+}
+
+/* Sign-extends the low two signed 16-bit lanes of a to 32-bit lanes (array
+ * indices 0..1, or 2..3 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 2 : 0;
+  c_v64 r;
+  r.s32[1] = a.s16[first + 1];
+  r.s32[0] = a.s16[first];
+  return r;
+}
+
+/* Widens the high two unsigned 16-bit lanes of a to 32-bit lanes (array
+ * indices 2..3, or 0..1 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 0 : 2;
+  c_v64 r;
+  r.s32[1] = a.u16[first + 1];
+  r.s32[0] = a.u16[first];
+  return r;
+}
+
+/* Sign-extends the high two signed 16-bit lanes of a to 32-bit lanes (array
+ * indices 2..3, or 0..1 when CONFIG_BIG_ENDIAN is set). */
+SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
+  const int first = CONFIG_BIG_ENDIAN ? 0 : 2;
+  c_v64 r;
+  r.s32[1] = a.s16[first + 1];
+  r.s32[0] = a.s16[first];
+  return r;
+}
+
+/* Byte shuffle: result byte c is the byte of a selected by the low three
+ * bits of pattern byte c (counted from the other end of the array when
+ * CONFIG_BIG_ENDIAN is set). When SIMD_CHECK is enabled, a pattern byte
+ * above 7 is reported on stderr and the program aborts. */
+SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
+  c_v64 r;
+  int lane;
+  for (lane = 0; lane < 8; lane++) {
+    const int sel = pattern.u8[lane];
+    if (SIMD_CHECK && sel > 7) {
+      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", sel,
+              lane);
+      abort();
+    }
+    r.u8[lane] = a.u8[CONFIG_BIG_ENDIAN ? 7 - (sel & 7) : sel & 7];
+  }
+  return r;
+}
+
+/* Dot product of the signed bytes of a with the unsigned bytes of b. */
+SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
+  int acc = 0;
+  int i;
+  /* Eight products of magnitude at most 128 * 255 cannot overflow an int. */
+  for (i = 7; i >= 0; i--) acc += a.s8[i] * b.u8[i];
+  return acc;
+}
+
+/* Dot product of the signed 16-bit lanes of a and b.
+ * Each pair of products is summed in 64-bit arithmetic and then narrowed to
+ * int32_t before the two pair sums are added. Summing in plain int would
+ * overflow (undefined behaviour) when both products of a pair are
+ * (-32768) * (-32768). Narrowing keeps the value identical to int
+ * arithmetic for every input that did not overflow, and is the same idiom
+ * c_v64_mullo_s32() uses. */
+SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
+  const int64_t hi_pair =
+      (int64_t)a.s16[3] * b.s16[3] + (int64_t)a.s16[2] * b.s16[2];
+  const int64_t lo_pair =
+      (int64_t)a.s16[1] * b.s16[1] + (int64_t)a.s16[0] * b.s16[0];
+  return (int64_t)(int32_t)hi_pair + (int64_t)(int32_t)lo_pair;
+}
+
+/* Returns the sum of the eight unsigned bytes of a. */
+SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
+  int total = 0;
+  int i;
+  for (i = 7; i >= 0; i--) total += a.u8[i];
+  return total;
+}
+
+/* Returns the sum of the four signed 16-bit lanes of a. */
+SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
+  int total = 0;
+  int i;
+  for (i = 3; i >= 0; i--) total += a.s16[i];
+  return total;
+}
+
+/* Accumulator for c_v64_sad_u8(): val holds the running sum and count the
+ * number of c_v64_sad_u8() calls made on it so far. */
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad64_internal;
+
+/* Returns an accumulator with both the sum and the call count at zero. */
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
+ c_sad64_internal t;
+ t.val = t.count = 0;
+ return t;
+}
+
+/* Adds the absolute differences of the eight unsigned bytes of a and b to
+   the accumulator s and bumps its call count.
+   Implementation dependent return value. Result must be finalised with
+   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
+   undefined; when SIMD_CHECK is enabled, exceeding that limit is reported
+   on stderr and the program aborts. */
+SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
+                                          c_v64 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  s.count++;
+  /* The check fires on the 33rd call, so the message says "more than". */
+  if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined "
+            "result\n");
+    abort();
+  }
+  return s;
+}
+
+/* Returns the total accumulated by c_v64_sad_u8(). */
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
+
+/* Accumulator for c_v64_ssd_u8(): the running sum of squared differences. */
+typedef uint32_t c_ssd64_internal;
+
+/* Returns a zero accumulator to start a c_v64_ssd_u8() sequence. */
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }
+
+/* Adds the squared differences of the eight unsigned bytes of a and b to s.
+ * Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
+                                          c_v64 b) {
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int diff = a.u8[i] - b.u8[i];
+    s += diff * diff;
+  }
+  return s;
+}
+
+/* Returns the total accumulated by c_v64_ssd_u8(). */
+SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
+
+/* Bitwise OR of a and b. */
+SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
+  c_v64 r = a;
+  r.u64 |= b.u64;
+  return r;
+}
+
+/* Bitwise XOR of a and b. */
+SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
+  c_v64 r = a;
+  r.u64 ^= b.u64;
+  return r;
+}
+
+/* Bitwise AND of a and b. */
+SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
+  c_v64 r = a;
+  r.u64 &= b.u64;
+  return r;
+}
+
+/* Bitwise AND of a with the complement of b, i.e. a & ~b. */
+SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
+  c_v64 r = a;
+  r.u64 &= ~b.u64;
+  return r;
+}
+
+/* Multiplies the signed 16-bit lanes of a and b and keeps the low 16 bits
+ * of each product. */
+SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int product = a.s16[i] * b.s16[i];
+    r.s16[i] = (int16_t)product;
+  }
+  return r;
+}
+
+/* Multiplies the signed 16-bit lanes of a and b and keeps each product
+ * shifted right by 16 bits. */
+SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int product = a.s16[i] * b.s16[i];
+    r.s16[i] = product >> 16;
+  }
+  return r;
+}
+
+/* Multiplies the signed 32-bit lanes of a and b in 64-bit arithmetic and
+ * narrows each product back to 32 bits. */
+SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 2; i++) {
+    const int64_t product = (int64_t)a.s32[i] * b.s32[i];
+    r.s32[i] = (int32_t)product;
+  }
+  return r;
+}
+
+/* Multiplies the signed 16-bit lanes of a and b and adds adjacent products,
+ * giving two 32-bit lanes.
+ * Each sum is formed in 64-bit arithmetic and narrowed to int32_t. Summing
+ * in plain int would overflow (undefined behaviour) when both products of a
+ * pair are (-32768) * (-32768). For every other input the result is the
+ * same as before; this is the idiom c_v64_mullo_s32() uses. */
+SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  t.s32[0] = (int32_t)((int64_t)a.s16[0] * b.s16[0] +
+                       (int64_t)a.s16[1] * b.s16[1]);
+  t.s32[1] = (int32_t)((int64_t)a.s16[2] * b.s16[2] +
+                       (int64_t)a.s16[3] * b.s16[3]);
+  return t;
+}
+
+/* Multiplies the unsigned bytes of a by the signed bytes of b, adds
+ * adjacent products and clamps each sum with SIMD_CLAMP to
+ * [-32768, 32767], giving four 16-bit lanes. */
+SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int32_t pair_sum =
+        a.u8[2 * i] * b.s8[2 * i] + a.u8[2 * i + 1] * b.s8[2 * i + 1];
+    r.s16[i] = SIMD_CLAMP(pair_sum, -32768, 32767);
+  }
+  return r;
+}
+
+/* Average of the unsigned byte lanes of a and b, rounded up: (a + b + 1) >> 1. */
+SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int sum = a.u8[i] + b.u8[i];
+    r.u8[i] = (sum + 1) >> 1;
+  }
+  return r;
+}
+
+/* Average of the unsigned byte lanes of a and b, rounded down: (a + b) >> 1. */
+SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) {
+    const int sum = a.u8[i] + b.u8[i];
+    r.u8[i] = sum >> 1;
+  }
+  return r;
+}
+
+/* Average of the unsigned 16-bit lanes of a and b, rounded down. */
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int sum = a.u16[i] + b.u16[i];
+    r.u16[i] = sum >> 1;
+  }
+  return r;
+}
+
+/* Average of the unsigned 16-bit lanes of a and b, rounded up. */
+SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) {
+    const int sum = a.u16[i] + b.u16[i];
+    r.u16[i] = (sum + 1) >> 1;
+  }
+  return r;
+}
+
+/* Lane-wise minimum of the unsigned bytes of a and b. */
+SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.u8[i] = b.u8[i] < a.u8[i] ? b.u8[i] : a.u8[i];
+  return r;
+}
+
+/* Lane-wise maximum of the unsigned bytes of a and b. */
+SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.u8[i] = b.u8[i] < a.u8[i] ? a.u8[i] : b.u8[i];
+  return r;
+}
+
+/* Lane-wise minimum of the signed bytes of a and b. */
+SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.s8[i] = b.s8[i] < a.s8[i] ? b.s8[i] : a.s8[i];
+  return r;
+}
+
+/* Lane-wise maximum of the signed bytes of a and b. */
+SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.s8[i] = b.s8[i] < a.s8[i] ? a.s8[i] : b.s8[i];
+  return r;
+}
+
+/* Lane-wise minimum of the signed 16-bit lanes of a and b. */
+SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++)
+    r.s16[i] = b.s16[i] < a.s16[i] ? b.s16[i] : a.s16[i];
+  return r;
+}
+
+/* Lane-wise maximum of the signed 16-bit lanes of a and b. */
+SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++)
+    r.s16[i] = b.s16[i] < a.s16[i] ? a.s16[i] : b.s16[i];
+  return r;
+}
+
+/* Per signed byte lane: all bits set where a > b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.s8[i] = a.s8[i] > b.s8[i] ? -1 : 0;
+  return r;
+}
+
+/* Per signed byte lane: all bits set where a < b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.s8[i] = a.s8[i] < b.s8[i] ? -1 : 0;
+  return r;
+}
+
+/* Per byte lane: all bits set where a == b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 8; i++) r.s8[i] = a.u8[i] == b.u8[i] ? -1 : 0;
+  return r;
+}
+
+/* Per signed 16-bit lane: all bits set where a > b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = a.s16[i] > b.s16[i] ? -1 : 0;
+  return r;
+}
+
+/* Per signed 16-bit lane: all bits set where a < b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = a.s16[i] < b.s16[i] ? -1 : 0;
+  return r;
+}
+
+/* Per 16-bit lane: all bits set where a == b, zero elsewhere. */
+SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
+  c_v64 r;
+  int i;
+  for (i = 0; i < 4; i++) r.s16[i] = a.u16[i] == b.u16[i] ? -1 : 0;
+  return r;
+}
+
+/* Shifts every 8-bit lane of a left by n bits. When SIMD_CHECK is enabled,
+ * n > 7 is reported on stderr and the program aborts. The count is unsigned,
+ * so it is printed with %u. */
+SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 7) {
+    fprintf(stderr, "Error: Undefined u8 shift left %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
+  return t;
+}
+
+/* Shifts every unsigned 8-bit lane of a right by n bits (zero fill). When
+ * SIMD_CHECK is enabled, n > 7 is reported on stderr and the program
+ * aborts. */
+SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 7) {
+    fprintf(stderr, "Error: Undefined u8 shift right %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
+  return t;
+}
+
+/* Shifts every signed 8-bit lane of a right by n bits using the compiler's
+ * signed right shift. When SIMD_CHECK is enabled, n > 7 is reported on
+ * stderr and the program aborts. */
+SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 7) {
+    fprintf(stderr, "Error: Undefined s8 shift right %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
+  return t;
+}
+
+/* Shifts every 16-bit lane of a left by n bits. When SIMD_CHECK is enabled,
+ * n > 15 is reported on stderr and the program aborts. The count is
+ * unsigned, so it is printed with %u. */
+SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 15) {
+    fprintf(stderr, "Error: Undefined u16 shift left %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
+  return t;
+}
+
+/* Shifts every unsigned 16-bit lane of a right by n bits (zero fill). When
+ * SIMD_CHECK is enabled, n > 15 is reported on stderr and the program
+ * aborts. */
+SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 15) {
+    fprintf(stderr, "Error: Undefined u16 shift right %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
+  return t;
+}
+
+/* Shifts every signed 16-bit lane of a right by n bits using the compiler's
+ * signed right shift. When SIMD_CHECK is enabled, n > 15 is reported on
+ * stderr and the program aborts. */
+SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
+  c_v64 t;
+  int c;
+  if (SIMD_CHECK && n > 15) {
+    fprintf(stderr, "Error: undefined s16 shift right %u\n", n);
+    abort();
+  }
+  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
+  return t;
+}
+
+/* Shifts both 32-bit lanes of a left by n bits. When SIMD_CHECK is enabled,
+ * n > 31 is reported on stderr and the program aborts. The count is
+ * unsigned, so it is printed with %u. */
+SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
+  c_v64 t;
+  if (SIMD_CHECK && n > 31) {
+    fprintf(stderr, "Error: undefined u32 shift left %u\n", n);
+    abort();
+  }
+  t.u32[1] = a.u32[1] << n;
+  t.u32[0] = a.u32[0] << n;
+  return t;
+}
+
+/* Shifts both unsigned 32-bit lanes of a right by n bits (zero fill). When
+ * SIMD_CHECK is enabled, n > 31 is reported on stderr and the program
+ * aborts. */
+SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
+  c_v64 t;
+  if (SIMD_CHECK && n > 31) {
+    fprintf(stderr, "Error: undefined u32 shift right %u\n", n);
+    abort();
+  }
+  t.u32[1] = a.u32[1] >> n;
+  t.u32[0] = a.u32[0] >> n;
+  return t;
+}
+
+/* Shifts both signed 32-bit lanes of a right by n bits using the compiler's
+ * signed right shift. When SIMD_CHECK is enabled, n > 31 is reported on
+ * stderr and the program aborts. */
+SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
+  c_v64 t;
+  if (SIMD_CHECK && n > 31) {
+    fprintf(stderr, "Error: undefined s32 shift right %u\n", n);
+    abort();
+  }
+  t.s32[1] = a.s32[1] >> n;
+  t.s32[0] = a.s32[0] >> n;
+  return t;
+}
+
+/* Shifts the whole 64-bit value right by i bytes. Shifting a 64-bit value
+ * by 64 bits or more is undefined in C, so i > 7 is handled explicitly and
+ * yields zero (every byte shifted out). */
+SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
+  c_v64 t;
+  t.u64 = i > 7 ? 0 : x.u64 >> i * 8;
+  return t;
+}
+
+/* Shifts the whole 64-bit value left by i bytes. Shifting a 64-bit value by
+ * 64 bits or more is undefined in C, so i > 7 is handled explicitly and
+ * yields zero (every byte shifted out). */
+SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
+  c_v64 t;
+  t.u64 = i > 7 ? 0 : x.u64 << i * 8;
+  return t;
+}
+
+/* Combines b shifted right by c bytes with a shifted left by 8 - c bytes;
+ * returns b unchanged when c is 0. When SIMD_CHECK is enabled, c > 7 is
+ * reported on stderr and the program aborts. The count is unsigned, so it
+ * is printed with %u. */
+SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
+  if (SIMD_CHECK && c > 7) {
+    fprintf(stderr, "Error: undefined alignment %u\n", c);
+    abort();
+  }
+  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
+}
+
+/* The *_n_* shift variants below have the same signatures as the plain
+ * shifts in this C implementation and simply forward to them. */
+SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
+ return c_v64_shl_8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
+ return c_v64_shr_u8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
+ return c_v64_shr_s8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
+ return c_v64_shl_16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
+ return c_v64_shr_u16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
+ return c_v64_shr_s16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
+ return c_v64_shl_32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
+ return c_v64_shr_u32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
+ return c_v64_shr_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
new file mode 100644
index 0000000000..ec27a6bf42
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+
+#include <string.h>
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+/* A v64 is carried in an SSE register; the helpers in this file read and
+ * store only its low 64 bits. */
+typedef __m128i v64;
+
+/* Returns bits 0..31 of a as an unsigned value. */
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+/* Returns bits 32..63 of a as an unsigned value: the register is shifted
+ * right by 4 bytes and its low 32 bits are read. */
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+/* Returns bits 0..31 of a as a signed value. */
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
+
+/* Returns bits 32..63 of a as a signed value. */
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+/* Builds a v64 from four 16-bit values: d becomes the lowest 16-bit lane and
+ * a the highest. Each value is widened through int16_t into a 32-bit lane so
+ * that the signed-saturating pack never saturates and the bit patterns are
+ * kept. */
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return _mm_packs_epi32(
+ _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
+ _mm_setzero_si128());
+}
+
+/* Builds a v64 with y in bits 0..31 and x in bits 32..63; the upper half of
+ * the register is zero. */
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
+}
+
+/* Builds a v64 from a 64-bit scalar. On x86-64 a single 64-bit move is used;
+ * otherwise the value is assembled from its two 32-bit halves. */
+SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+ return _mm_cvtsi64_si128((int64_t)x);
+#else
+ return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
+#endif
+}
+
+/* Returns the low 64 bits of x as a scalar, reassembled from its two 32-bit
+ * halves. */
+SIMD_INLINE uint64_t v64_u64(v64 x) {
+  const uint64_t lo = (uint64_t)v64_low_u32(x);
+  const uint64_t hi = (uint64_t)v64_high_u32(x);
+  return lo | (hi << 32);
+}
+
+/* Reads a 32-bit value through a direct uint32_t dereference of p.
+ * NOTE(review): this relies on p being suitably aligned for uint32_t, as the
+ * name says; the cast also drops the const qualifier. */
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+/* Reads a 32-bit value from p, which may have any alignment.
+ * memcpy is used instead of dereferencing a cast pointer: a misaligned
+ * uint32_t dereference is undefined behaviour in C, and the copy also keeps
+ * the const qualifier of p. */
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+  uint32_t value;
+  memcpy(&value, p, sizeof(value));
+  return value;
+}
+
+/* Writes a to p through a direct uint32_t store.
+ * NOTE(review): this relies on p being suitably aligned for uint32_t, as the
+ * name says. */
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+/* Writes the 32-bit value a to p, which may have any alignment.
+ * memcpy is used instead of storing through a cast pointer, because a
+ * misaligned uint32_t store is undefined behaviour in C. */
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+  memcpy(p, &a, sizeof(a));
+}
+
+/* Loads 64 bits from p into the low half of the register. Uses the same
+ * intrinsic as v64_load_unaligned(). */
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+/* Loads 64 bits from p into the low half of the register. */
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+/* Stores the low 64 bits of a to p. Uses the same intrinsic as
+ * v64_store_unaligned(). */
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+/* Stores the low 64 bits of a to p. */
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+/* v64_align(a, b, c): returns b shifted right by c bytes with the low bytes
+ * of a shifted in at the top; c == 0 returns b.
+ * The first form is only used for optimised non-clang builds; the second
+ * form computes the same result on 64-bit scalars.
+ * Macro arguments are parenthesised so that compound expressions expand with
+ * the intended precedence. Note that c and b may be evaluated more than
+ * once. */
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#define v64_align(a, b, c) \
+  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64((b), (a)), (c)) : (b))
+#else
+#define v64_align(a, b, c)                                                 \
+  ((c) ? v64_from_64((v64_u64((b)) >> (c) * 8) |                           \
+                     (v64_u64((a)) << (8 - (c)) * 8))                      \
+       : (b))
+#endif
+
+/* Returns an all-zero vector. */
+SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
+
+/* Broadcasts the byte x to every 8-bit lane. */
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
+
+/* Broadcasts x to every 16-bit lane. */
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
+
+/* Broadcasts x to every 32-bit lane. */
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
+
+/* Lane-wise 8-bit addition with wrap-around. */
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
+
+/* Lane-wise 16-bit addition with wrap-around. */
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
+
+/* Lane-wise unsigned 8-bit addition with saturation. */
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+/* Lane-wise signed 8-bit addition with saturation. */
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
+/* Lane-wise signed 16-bit addition with saturation. */
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
+
+/* Lane-wise 32-bit addition with wrap-around. */
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
+
+/* Lane-wise 8-bit subtraction (a - b) with wrap-around. */
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
+
+/* Lane-wise unsigned 8-bit subtraction (a - b) with saturation. */
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
+
+/* Lane-wise signed 8-bit subtraction (a - b) with saturation. */
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
+
+/* Lane-wise 16-bit subtraction (a - b) with wrap-around. */
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
+
+/* Lane-wise signed 16-bit subtraction (a - b) with saturation. */
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
+
+/* Lane-wise unsigned 16-bit subtraction (a - b) with saturation. */
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
+
+/* Lane-wise 32-bit subtraction (a - b) with wrap-around. */
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
+
+/* Lane-wise absolute value of signed 16-bit elements. Without SSSE3 the
+ * result is computed as max(a, 0 - a). */
+SIMD_INLINE v64 v64_abs_s16(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+/* Lane-wise absolute value of signed 8-bit elements. Without SSSE3, a mask
+ * that is all-ones in negative lanes is added and then XORed, which negates
+ * exactly those lanes. */
+SIMD_INLINE v64 v64_abs_s8(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+/* Interleaves the low bytes of b and a; b's bytes take the even positions. */
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+/* Interleaves bytes 4..7 of b and a: the 16-byte interleave is shifted right
+ * by 8 bytes so that its upper half becomes the result. */
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
+}
+
+/* Interleaves the low 16-bit lanes of b and a; b's lanes come first. */
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+/* Interleaves 16-bit lanes 2..3 of b and a (upper half of the 16-byte
+ * interleave). */
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
+}
+
+/* Returns the low 32-bit lane of b followed by the low 32-bit lane of a. */
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+/* Returns 32-bit lane 1 of b followed by 32-bit lane 1 of a (upper half of
+ * the 16-byte interleave). */
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
+}
+
+/* Narrows the signed 32-bit lanes of b (low result lanes) and a (high result
+ * lanes) to 16 bits with signed saturation. */
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi32(t, t);
+}
+
+/* Narrows the signed 32-bit lanes of b (low result lanes) and a (high result
+ * lanes) to 16 bits with unsigned saturation. Without SSE4.1 each lane is
+ * extracted, clamped to [0, 65535] in scalar code and repacked. */
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi32(t, t);
+#else
+ const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
+ const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
+ const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
+ const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
+ return v64_from_16(ah, al, bh, bl);
+#endif
+}
+
+/* Narrows the signed 16-bit lanes of b (low result bytes) and a (high result
+ * bytes) to 8 bits with unsigned saturation. */
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi16(t, t);
+}
+
+/* Narrows the signed 16-bit lanes of b (low result bytes) and a (high result
+ * bytes) to 8 bits with signed saturation. */
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi16(t, t);
+}
+
+/* Collects the odd-indexed bytes of b (low result bytes) and of a (high
+ * result bytes). Without SSSE3, each 16-bit lane is shifted right by 8 so
+ * that the odd byte remains, and the lanes are then packed back to bytes. */
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0d0b0907050301LL));
+#else
+ return _mm_packus_epi16(
+ _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
+ _mm_setzero_si128());
+#endif
+}
+
+/* Collects the even-indexed bytes of b (low result bytes) and of a (high
+ * result bytes). Without SSSE3, both inputs are shifted left by one byte so
+ * that even bytes become odd, and v64_unziphi_8() is reused. */
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0e0c0a0806040200LL));
+#else
+ return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+/* Collects the odd-indexed 16-bit lanes of b (low result lanes) and of a
+ * (high result lanes). Without SSSE3, each 32-bit lane is arithmetically
+ * shifted right by 16 and the lanes are packed with signed saturation,
+ * which leaves the 16-bit values unchanged. */
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0e0b0a07060302LL));
+#else
+ return _mm_packs_epi32(
+ _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
+ _mm_setzero_si128());
+#endif
+}
+
+/* Collects the even-indexed 16-bit lanes of b (low result lanes) and of a
+ * (high result lanes). Without SSSE3, both inputs are shifted left by two
+ * bytes and v64_unziphi_16() is reused. */
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0d0c090805040100LL));
+#else
+ return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+/* Zero-extends bytes 0..3 of a to four 16-bit lanes. */
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+/* Zero-extends bytes 4..7 of a to four 16-bit lanes. */
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
+}
+
+/* Sign-extends bytes 0..3 of a to four 16-bit lanes: each byte is duplicated
+ * into a 16-bit lane, which is then arithmetically shifted right by 8. */
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+/* Sign-extends bytes 4..7 of a to four 16-bit lanes. */
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
+}
+
+/* Zero-extends 16-bit lanes 0..1 of a to two 32-bit lanes. */
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+/* Sign-extends 16-bit lanes 0..1 of a to two 32-bit lanes: each value is
+ * placed in the top half of a 32-bit lane, which is then arithmetically
+ * shifted right by 16. */
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
+}
+
+/* Zero-extends 16-bit lanes 2..3 of a to two 32-bit lanes. */
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
+}
+
+/* Sign-extends 16-bit lanes 2..3 of a to two 32-bit lanes. */
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return _mm_srli_si128(
+ _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
+}
+
+/* Byte shuffle: result byte k is the byte of x selected by byte k of
+ * pattern. With SSSE3 this is a single shuffle instruction; otherwise the 8
+ * result bytes are gathered one at a time in scalar code.
+ * NOTE(review): the scalar path indexes x directly with the pattern byte, so
+ * it presumably expects pattern values that stay inside the vector - confirm
+ * against callers. */
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+#if defined(__SSSE3__)
+  return _mm_shuffle_epi8(x, pattern);
+#else
+  /* Start from zero so the upper 8 bytes, which the loop never writes, are
+   * not returned uninitialised. */
+  v64 output = _mm_setzero_si128();
+  unsigned char *input = (unsigned char *)&x;
+  unsigned char *index = (unsigned char *)&pattern;
+  unsigned char *selected = (unsigned char *)&output;
+  int counter;
+
+  for (counter = 0; counter < 8; counter++) {
+    selected[counter] = input[index[counter]];
+  }
+
+  return output;
+#endif
+}
+
+/* Dot product of the 8 signed bytes of a with the 8 unsigned bytes of b.
+ * Both inputs are widened to 16 bits, multiplied and pair-summed, and the
+ * four 32-bit partial sums are then folded into one. */
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
+  const __m128i a_s16 = _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+  const __m128i b_u16 = _mm_unpacklo_epi8(b, _mm_setzero_si128());
+  __m128i acc = _mm_madd_epi16(a_s16, b_u16);
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
+  return (int32_t)v64_low_u32(acc);
+}
+
+/* Dot product of the four signed 16-bit lanes of a and b. The multiply-add
+ * yields two 32-bit pair sums in the low 64 bits, which are widened to
+ * 64 bits before being added so the final sum cannot wrap. */
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
+ __m128i r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ __m128i x = _mm_cvtepi32_epi64(r);
+ return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(r);
+#endif
+}
+
+/* Sums the 8 unsigned bytes of a, using a sum of absolute differences
+ * against zero. */
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
+ return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
+}
+
+/* Sums the four signed 16-bit lanes of a, as a dot product with all-ones. */
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return v64_dotp_s16(a, v64_dup_16(1));
+}
+
+/* Accumulator type for the v64_sad_u8_*() functions. */
+typedef v64 sad64_internal;
+
+/* Returns a zeroed SAD accumulator. */
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }
+
+/* Adds the sum of absolute differences of the unsigned bytes of a and b to
+ the accumulator s.
+ Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+/* Finalises a SAD accumulator: the total is its low 32 bits. */
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
+
+/* Accumulator type for the v64_ssd_u8_*() functions. */
+typedef v64 ssd64_internal;
+
+/* Returns a zeroed SSD accumulator. */
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }
+
+/* Adds the sum of squared differences of the unsigned bytes of a and b to
+ * the accumulator s.
+ * Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+  const v64 zero = v64_zero();
+  /* Widen each half to 16 bits and take the per-lane differences. */
+  const v64 lo_diff = v64_sub_16(v64_ziplo_8(zero, a), v64_ziplo_8(zero, b));
+  const v64 hi_diff = v64_sub_16(v64_ziphi_8(zero, a), v64_ziphi_8(zero, b));
+  /* Square and pair-sum, then fold the two 32-bit partial sums into one. */
+  const v64 squares = v64_add_32(_mm_madd_epi16(lo_diff, lo_diff),
+                                 _mm_madd_epi16(hi_diff, hi_diff));
+  const v64 folded = _mm_add_epi32(squares, _mm_srli_si128(squares, 4));
+  return _mm_add_epi64(s, v64_ziplo_32(zero, folded));
+}
+
+/* Finalises an SSD accumulator: the total is its low 32 bits. The parameter
+ * uses the SSD accumulator type (it was declared with the SAD one; both are
+ * typedefs of v64, so callers are unaffected). */
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
+
+/* Bitwise OR. */
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
+
+/* Bitwise exclusive OR. */
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
+
+/* Bitwise AND. */
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
+
+/* Returns a AND (NOT b); note the swapped operand order of the intrinsic. */
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
+
+/* Lane-wise 16-bit multiply, keeping the low 16 bits of each product. */
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
+
+/* Lane-wise signed 16-bit multiply, keeping the high 16 bits of each
+ * product. */
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
+
+/* Lane-wise 32-bit multiply, keeping the low 32 bits of each product.
+ * Without SSE4.1 the two lanes are multiplied separately with the 32x32->64
+ * unsigned multiply and the low halves of the products are interleaved. */
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_mul_epu32(a, b),
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
+#endif
+}
+
+/* Multiplies signed 16-bit lanes and adds adjacent products into 32-bit
+ * lanes. */
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
+
+/* Multiplies the unsigned bytes of a by the signed bytes of b and adds
+ * adjacent products into signed-saturated 16-bit lanes. Without SSSE3 both
+ * inputs are widened to 16 bits, multiply-added into 32-bit lanes, and
+ * packed back to 16 bits with signed saturation. */
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
+ return _mm_packs_epi32(t, t);
+#endif
+}
+
+/* Lane-wise unsigned 8-bit average, rounding up: (a + b + 1) >> 1. */
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
+
+/* Lane-wise unsigned 8-bit average, rounding down: the rounding-up average
+ * is reduced by 1 in lanes where a + b is odd (low bit of a XOR b set). */
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
+}
+
+/* Lane-wise unsigned 16-bit average, rounding down: the rounding-up average
+ * is reduced by 1 in lanes where a + b is odd (low bit of a XOR b set). */
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+ return _mm_sub_epi16(_mm_avg_epu16(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
+}
+
+/* Lane-wise unsigned 16-bit average, rounding up: (a + b + 1) >> 1. */
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
+
+/* Lane-wise minimum of unsigned bytes. */
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
+
+/* Lane-wise maximum of unsigned bytes. */
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
+
+/* Lane-wise minimum of signed bytes. Without SSE4.1 a compare mask selects
+ * a where a < b and b elsewhere. */
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+/* Lane-wise maximum of signed bytes. Without SSE4.1 a compare mask selects
+ * a where b < a and b elsewhere. */
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+/* Lane-wise minimum of signed 16-bit elements. */
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
+
+/* Lane-wise maximum of signed 16-bit elements. */
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
+
+/* Signed 8-bit compare: lanes where a > b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
+
+/* Signed 8-bit compare: lanes where a < b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
+
+/* 8-bit compare: lanes where a == b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
+
+/* Signed 16-bit compare: lanes where a > b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
+
+/* Signed 16-bit compare: lanes where a < b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
+
+/* 16-bit compare: lanes where a == b become all-ones, others zero. */
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
+
+/* Shifts every 8-bit lane left by c bits. The shift is done on 16-bit lanes;
+ * the mask then clears the bits that crossed over from the neighbouring
+ * byte. */
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+  const __m128i keep = _mm_set1_epi8((char)(0xff << c));
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm_and_si128(keep, _mm_sll_epi16(a, count));
+}
+
+/* Logically shifts every 8-bit lane right by c bits. The shift is done on
+ * 16-bit lanes; the mask then clears the bits that crossed over from the
+ * neighbouring byte. */
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+  const __m128i keep = _mm_set1_epi8((char)(0xff >> c));
+  const __m128i count = _mm_cvtsi32_si128((int)c);
+  return _mm_and_si128(keep, _mm_srl_epi16(a, count));
+}
+
+/* Arithmetically shifts every signed 8-bit lane right by c bits. Each byte
+ * is duplicated into a 16-bit lane and shifted by c + 8, then the lanes are
+ * packed back to bytes. Only the low 64 bits of the result are meaningful. */
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+  const __m128i widened = _mm_unpacklo_epi8(a, a);
+  const __m128i count = _mm_cvtsi32_si128((int)(c + 8));
+  return _mm_packs_epi16(_mm_sra_epi16(widened, count), a);
+}
+
+/* Shifts every 16-bit lane left by c bits (run-time shift count). */
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* Logically shifts every 16-bit lane right by c bits (run-time count). */
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* Arithmetically shifts every 16-bit lane right by c bits (run-time
+ * count). */
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* Shifts every 32-bit lane left by c bits (run-time shift count). */
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* Logically shifts every 32-bit lane right by c bits (run-time count). */
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* Arithmetically shifts every 32-bit lane right by c bits (run-time
+ * count). */
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+   to enforce that.
+   Every macro argument is parenthesised so that a compound expression passed
+   as the shift count (e.g. "x ? 1 : 2") expands with the intended
+   precedence. Note that a and c may be evaluated more than once. */
+#define v64_shl_n_byte(a, c) _mm_slli_si128((a), (c))
+#define v64_shr_n_byte(a, c) \
+  _mm_srli_si128(_mm_unpacklo_epi64((a), (a)), (c) + 8)
+#define v64_shl_n_8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16((a), (c)))
+#define v64_shr_n_u8(a, c) \
+  _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16((a), (c)))
+#define v64_shr_n_s8(a, c) \
+  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8((a), (a)), (c) + 8), (a))
+#define v64_shl_n_16(a, c) _mm_slli_epi16((a), (c))
+#define v64_shr_n_u16(a, c) _mm_srli_epi16((a), (c))
+#define v64_shr_n_s16(a, c) _mm_srai_epi16((a), (c))
+#define v64_shl_n_32(a, c) _mm_slli_epi32((a), (c))
+#define v64_shr_n_u32(a, c) _mm_srli_epi32((a), (c))
+#define v64_shr_n_s32(a, c) _mm_srai_epi32((a), (c))
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
new file mode 100644
index 0000000000..bfe76edc39
--- /dev/null
+++ b/third_party/aom/aom_dsp/sse.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+/* Returns the sum of squared differences between two 8-bit pixel blocks.
+ * a, b:               top-left pixels of the two blocks.
+ * a_stride, b_stride: distance in pixels between consecutive rows.
+ * width, height:      block dimensions in pixels. */
+int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                  int b_stride, int width, int height) {
+  int64_t total = 0;
+  const uint8_t *row_a = a;
+  const uint8_t *row_b = b;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int32_t delta = abs(row_a[col] - row_b[col]);
+      total += delta * delta;
+    }
+    row_a += a_stride;
+    row_b += b_stride;
+  }
+  return total;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/* High-bit-depth variant of the sum of squared differences. a8 and b8 are
+ * turned into 16-bit sample pointers with CONVERT_TO_SHORTPTR(); strides are
+ * applied to those 16-bit pointers. */
+int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                         int b_stride, int width, int height) {
+  int64_t total = 0;
+  uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int32_t delta = (int32_t)(row_a[col]) - (int32_t)(row_b[col]);
+      total += delta * delta;
+    }
+    row_a += a_stride;
+    row_b += b_stride;
+  }
+  return total;
+}
+#endif
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
new file mode 100644
index 0000000000..35d493b038
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_INTERNAL_STATS
+/* Accumulates the SSIM statistics of a 16x16 block into the five output
+ * counters (they are added to, not reset):
+ *   sum_s, sum_r        sums of the source / reference pixels
+ *   sum_sq_s, sum_sq_r  sums of the squared source / reference pixels
+ *   sum_sxr             sum of source * reference products
+ * sp and rp are the row strides of s and r. */
+void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+                            uint32_t *sum_s, uint32_t *sum_r,
+                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                            uint32_t *sum_sxr) {
+  for (int row = 0; row < 16; ++row) {
+    for (int col = 0; col < 16; ++col) {
+      const int src = s[col];
+      const int ref = r[col];
+      *sum_s += src;
+      *sum_r += ref;
+      *sum_sq_s += src * src;
+      *sum_sq_r += ref * ref;
+      *sum_sxr += src * ref;
+    }
+    s += sp;
+    r += rp;
+  }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+/* Accumulates the SSIM statistics of an 8x8 block into the five output
+ * counters (they are added to, not reset):
+ *   sum_s, sum_r        sums of the source / reference pixels
+ *   sum_sq_s, sum_sq_r  sums of the squared source / reference pixels
+ *   sum_sxr             sum of source * reference products
+ * sp and rp are the row strides of s and r. */
+void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+  for (int row = 0; row < 8; ++row) {
+    for (int col = 0; col < 8; ++col) {
+      const int src = s[col];
+      const int ref = r[col];
+      *sum_s += src;
+      *sum_r += ref;
+      *sum_sq_s += src * src;
+      *sum_sq_r += ref * ref;
+      *sum_sxr += src * ref;
+    }
+    s += sp;
+    r += rp;
+  }
+}
+
+// SSIM stabilising constants, pre-scaled by 64^2, for 8-, 10- and 12-bit
+// samples.
+static const int64_t cc1 = 26634; // 64^2 * (.01 * 255)^2
+static const int64_t cc2 = 239708; // 64^2 * (.03 * 255)^2
+static const int64_t cc1_10 = 428658; // 64^2 * (.01 * 1023)^2
+static const int64_t cc2_10 = 3857925; // 64^2 * (.03 * 1023)^2
+static const int64_t cc1_12 = 6868593; // 64^2 * (.01 * 4095)^2
+static const int64_t cc2_12 = 61817334; // 64^2 * (.03 * 4095)^2
+
+// Computes the SSIM score of one window from its accumulated statistics.
+// count is the number of pixels in the window and bd the bit depth (8, 10 or
+// 12), which selects the stabilising constants. Any other bit depth trips an
+// assert and yields 0.
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+                         uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+                         uint32_t bd) {
+  int64_t k1, k2;
+  switch (bd) {
+    case 8:
+      k1 = cc1;
+      k2 = cc2;
+      break;
+    case 10:
+      k1 = cc1_10;
+      k2 = cc2_10;
+      break;
+    case 12:
+      k1 = cc1_12;
+      k2 = cc2_12;
+      break;
+    default:
+      assert(0);
+      // Return similarity as zero for unsupported bit-depth values.
+      return 0;
+  }
+
+  // Scale the constants by the number of pixels.
+  const int64_t c1 = (k1 * count * count) >> 12;
+  const int64_t c2 = (k2 * count * count) >> 12;
+
+  const double ssim_n = (2.0 * sum_s * sum_r + c1) *
+                        (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
+
+  const double ssim_d =
+      ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+      ((double)count * sum_sq_s - (double)sum_s * sum_s +
+       (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
+
+  return ssim_n / ssim_d;
+}
+
+// SSIM of one 8-bit 8x8 window: gathers the window statistics with
+// aom_ssim_parms_8x8() and scores them with 64 pixels at bit depth 8.
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+// Mean SSIM of an 8-bit plane.
+// An 8x8 window is evaluated at every position of the 4x4 pixel grid where
+// it fits entirely inside the plane. The windows therefore overlap block
+// boundaries, which penalizes blocking artifacts.
+// NOTE(review): when width < 8 or height < 8 no window fits and the final
+// division is 0.0 / 0 - confirm that callers never pass such planes.
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
+  const uint8_t *row1 = img1;
+  const uint8_t *row2 = img2;
+  double ssim_total = 0;
+  int windows = 0;
+
+  for (int y = 0; y <= height - 8; y += 4) {
+    for (int x = 0; x <= width - 8; x += 4) {
+      ssim_total += ssim_8x8(row1 + x, stride_img1, row2 + x, stride_img2);
+      windows++;
+    }
+    // Advance both planes to the next row of the 4x4 grid.
+    row1 += stride_img1 * 4;
+    row2 += stride_img2 * 4;
+  }
+  return ssim_total / windows;
+}
+
+#if CONFIG_INTERNAL_STATS
+// Computes the frame SSIM of an 8-bit frame: the per-plane scores from
+// aom_ssim2() are combined with weights 0.8 (plane 0) and 0.1 for each of
+// the other two planes. *weight is always set to 1.
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim) {
+  double plane_ssim[3];
+  for (int plane = 0; plane < 3; ++plane) {
+    // Planes 1 and 2 share the second entry of the stride/size arrays.
+    const int uv = plane > 0;
+    plane_ssim[plane] = aom_ssim2(
+        source->buffers[plane], dest->buffers[plane], source->strides[uv],
+        dest->strides[uv], source->crop_widths[uv], source->crop_heights[uv]);
+  }
+
+  *weight = 1;
+  *fast_ssim = plane_ssim[0] * .8 + .1 * (plane_ssim[1] + plane_ssim[2]);
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Re working out the math ->
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+// Standard SSIM of one window from the sums stored in sv; n is the number of
+// pixels in the window. Returns the product of the luminance term l and the
+// contrast/structure term v.
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ // NOTE(review): the products of the uint32_t sums below are formed in
+ // 32-bit unsigned arithmetic before being widened; this presumably relies
+ // on the sums of an 8-bit 8x8 window being small enough - confirm.
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side. check out 2 sets
+// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60
+// 2*250*252/ (250^2+252^2) => .99999997
+//
+// As a result in this tweaked version of the calculation in which the
+// luminance is taken as percentage off from peak possible.
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ // Luminance term based on the difference of the two window means relative
+ // to the 8-bit peak value 255.
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+// Accumulates the statistics of one 8x8 window into the sum fields of sv by
+// calling aom_ssim_parms_8x8(). The fields are added to, so sv should start
+// zeroed.
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+// Computes frame-level SSIM metrics for an 8-bit plane and, optionally, a
+// temporal SSIM-inconsistency score against the previous frame.
+//
+// img1, img2:             the two planes to compare, with their pitches.
+// width, height:          plane size in pixels.
+// sv2:                    per-sample-point state of the previous call; it is
+//                         read as the "old" frame and then overwritten with
+//                         this frame's values. One entry is used for every
+//                         4x4 grid position visited by the loops below.
+// m:                      receives ssim, ssim2, dssim and ssimc.
+// do_inconsistency:       non-zero enables the inconsistency computation.
+//
+// Returns the inconsistency total (also stored in m->ssimc).
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ // We can sample points as frequently as we like start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+ // Not sure there's a great way to handle the edge pixels
+ // in ssim when using a window. Seems biased against edge pixels
+ // however you handle this. This uses only samples that are
+ // fully in the frame.
+ // (Sample points whose window does not fit keep all-zero sums.)
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+ // dssim is calculated to use as an actual error metric and
+ // is scaled up to the same range as sum square error.
+ // Since we are subsampling every 16th point maybe this should be
+ // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term3 -> measures inconsistency in ssim scores between frames
+ // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+ // we are scaling the error to SSE for use in a psnr calculation
+ // 1.0 = 4x4x255x255 the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+ // The constants have to be non 0 to avoid potential divide by 0
+ // issues other than that they affect kind of a weighting between
+ // the terms. No testing of what the right terms should be has been
+ // done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+ // This measures how consistent the local mean are between two
+ // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+ // This measures how consistent the ssims of two
+ // consecutive frames is. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+ // This converts the consistency metric to an inconsistency metric
+ // ( so we can scale it like psnr to something like sum square error.
+ // The reason for the variance and mean terms is the assumption that
+ // if there are big changes in the source we should penalize
+ // inconsistency in ssim scores a bit less as it will be less visible
+ // to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ // Remember this frame's values for the next call.
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ // NOTE(review): adding 0 has no effect on the total.
+ old_ssim_total += 0;
+ }
+
+ // NOTE(review): the loops visit every 4x4 grid position, including partial
+ // ones at the right/bottom edge, while norm uses the truncated width / 4
+ // and height / 4; these differ when a dimension is not a multiple of 4.
+ // dssim_total is stored without normalisation.
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ // With no accumulated "old" ssim the inconsistency is reported as 0.
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Accumulates the SSIM statistics of an 8x8 block of 16-bit samples into the
+// five output counters (they are added to, not reset):
+//   sum_s, sum_r        sums of the source / reference samples
+//   sum_sq_s, sum_sq_r  sums of the squared source / reference samples
+//   sum_sxr             sum of source * reference products
+// sp and rp are the row strides of s and r, in samples.
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      // Widen to uint32_t before multiplying: uint16_t operands promote to
+      // signed int, whose product overflows (undefined behaviour) for
+      // samples of 46341 and above.
+      const uint32_t src = s[j];
+      const uint32_t ref = r[j];
+      *sum_s += src;
+      *sum_r += ref;
+      *sum_sq_s += src * src;
+      *sum_sq_r += ref * ref;
+      *sum_sxr += src * ref;
+    }
+  }
+}
+
+// SSIM of one 8x8 window of 16-bit samples at bit depth bd. The sums are
+// scaled down before scoring: plain sums by shift bits, sums of products by
+// 2 * shift bits.
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+// Mean SSIM of a high-bit-depth plane: an 8x8 window is evaluated at every
+// position of the 4x4 pixel grid where it fits entirely inside the plane.
+// img1 and img2 are converted to 16-bit sample pointers with
+// CONVERT_TO_SHORTPTR() per window; bd and shift are passed through to
+// highbd_ssim_8x8().
+// NOTE(review): when width < 8 or height < 8 no window fits and the final
+// division is 0.0 / 0 - confirm that callers never pass such planes.
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift) {
+  const uint8_t *row1 = img1;
+  const uint8_t *row2 = img2;
+  double ssim_total = 0;
+  int windows = 0;
+
+  for (int y = 0; y <= height - 8; y += 4) {
+    for (int x = 0; x <= width - 8; x += 4) {
+      ssim_total +=
+          highbd_ssim_8x8(CONVERT_TO_SHORTPTR(row1 + x), stride_img1,
+                          CONVERT_TO_SHORTPTR(row2 + x), stride_img2, bd, shift);
+      windows++;
+    }
+    // Advance both planes to the next row of the 4x4 grid.
+    row1 += stride_img1 * 4;
+    row2 += stride_img2 * 4;
+  }
+  return ssim_total / windows;
+}
+
+#if CONFIG_INTERNAL_STATS
+// Computes the frame SSIM of a high-bit-depth frame.
+// weight[0] / fast_ssim[0] are computed at the input bit depth in_bd, with
+// the statistics scaled down by (bd - in_bd) bits. When bd > in_bd, a second
+// result at the stream bit depth bd (no scaling) is written to weight[1] /
+// fast_ssim[1]; both arrays therefore need room for two entries.
+// Planes are combined with weights 0.8 (plane 0) and 0.1 for each of the
+// other two.
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd, double *fast_ssim) {
+ assert(bd >= in_bd);
+ uint32_t shift = bd - in_bd;
+
+ double abc[3];
+ for (int i = 0; i < 3; ++i) {
+ // Planes 1 and 2 share the second entry of the stride/size arrays.
+ const int is_uv = i > 0;
+ abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv],
+ source->crop_heights[is_uv], in_bd, shift);
+ }
+
+ weight[0] = 1;
+ fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+
+ if (bd > in_bd) {
+ // Compute SSIM based on stream bit depth
+ shift = 0;
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv],
+ source->crop_heights[is_uv], bd, shift);
+ }
+
+ weight[1] = 1;
+ fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if CONFIG_INTERNAL_STATS
+// Frame SSIM entry point. When high-bit-depth support is compiled in and
+// is_hbd is set, the work is done by aom_highbd_calc_ssim(); otherwise by
+// aom_lowbd_calc_ssim(), and the bit-depth arguments are unused.
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+ const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+ const uint32_t in_bit_depth, int is_hbd, double *weight,
+ double *frame_ssim2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+ frame_ssim2);
+ return;
+ }
+#else
+ (void)bit_depth;
+ (void)in_bit_depth;
+ (void)is_hbd;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2);
+}
+#endif // CONFIG_INTERNAL_STATS
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
new file mode 100644
index 0000000000..fb92556a8c
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_scale/yv12config.h"
+
+// Per-region statistics used for calculating ssim, ssim2, dssim, and ssimc.
+// Every accumulator below covers one 8x8 region.
+typedef struct {
+ // Source sum (over the 8x8 region).
+ uint32_t sum_s;
+
+ // Reference sum (over the 8x8 region).
+ uint32_t sum_r;
+
+ // Source "sum squared" (over the 8x8 region).
+ // NOTE(review): presumably the sum of squared samples rather than the
+ // square of sum_s -- confirm against the code that fills this struct.
+ uint32_t sum_sq_s;
+
+ // Reference "sum squared" (over the 8x8 region); same caveat as sum_sq_s.
+ uint32_t sum_sq_r;
+
+ // Sum of source times reference (over the 8x8 region).
+ uint32_t sum_sxr;
+
+ // Calculated ssim score between source and reference.
+ double ssim;
+} Ssimv;
+
+// Metrics collected on a per-frame basis. The formulas behind each field live
+// in the implementation file, not in this header.
+typedef struct {
+ // SSIM consistency error metric (see the implementation for an explanation).
+ double ssimc;
+
+ // Standard ssim.
+ double ssim;
+
+ // Revised ssim (see the implementation for an explanation).
+ double ssim2;
+
+ // ssim restated as an error metric like sse.
+ double dssim;
+
+ // dssim converted to decibels.
+ double dssimd;
+
+ // ssimc converted to decibels.
+ double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ double *fast_ssim);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd, double *fast_ssim);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+ const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+ const uint32_t in_bit_depth, int is_hbd, double *weight,
+ double *frame_ssim2);
+#endif // CONFIG_INTERNAL_STATS
+
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+ int stride_img2, int width, int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width, int height,
+ uint32_t bd, uint32_t shift);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
new file mode 100644
index 0000000000..4f47e553d4
--- /dev/null
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// Writes the element-wise difference src - pred of a rows x cols block of
+// 8-bit samples into diff. Each buffer is stepped by its own stride, counted
+// in elements.
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+                          ptrdiff_t diff_stride, const uint8_t *src,
+                          ptrdiff_t src_stride, const uint8_t *pred,
+                          ptrdiff_t pred_stride) {
+  for (int row = 0; row < rows; ++row) {
+    int16_t *const out = diff + row * diff_stride;
+    const uint8_t *const s = src + row * src_stride;
+    const uint8_t *const p = pred + row * pred_stride;
+
+    for (int col = 0; col < cols; ++col) {
+      out[col] = s[col] - p[col];
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of aom_subtract_block_c(): src8 and pred8 are
+// converted with CONVERT_TO_SHORTPTR and read as uint16_t samples, so the
+// source and prediction strides are applied in 16-bit elements.
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+                                 ptrdiff_t diff_stride, const uint8_t *src8,
+                                 ptrdiff_t src_stride, const uint8_t *pred8,
+                                 ptrdiff_t pred_stride) {
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+
+  for (int row = 0; row < rows; ++row) {
+    for (int col = 0; col < cols; ++col) {
+      diff[col] = src_row[col] - pred_row[col];
+    }
+
+    src_row += src_stride;
+    pred_row += pred_stride;
+    diff += diff_stride;
+  }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
new file mode 100644
index 0000000000..f58defaa11
--- /dev/null
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Returns the sum of squares of every sample in a width x height block of
+// int16_t values whose rows are src_stride elements apart.
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+                                  int height) {
+  uint64_t total = 0;
+  const int16_t *row = src;
+
+  for (int y = 0; y < height; ++y, row += src_stride) {
+    for (int x = 0; x < width; ++x) {
+      const int16_t sample = row[x];
+      total += sample * sample;
+    }
+  }
+
+  return total;
+}
+
+// Returns the sum of squares of the n int16_t values starting at src.
+//
+// n == 0 yields 0 without touching src. A do/while would run its body before
+// testing n, wrap the counter to UINT32_MAX and read far past the buffer.
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+  uint64_t ss = 0;
+
+  for (; n != 0; --n) {
+    const int16_t v = *src++;
+    // |v| <= 32768, so the int product is at most 2^30 and cannot overflow.
+    ss += v * v;
+  }
+
+  return ss;
+}
+
+// Returns ss - s * s / (width * height) for a width x height block of 8-bit
+// samples, where s is the sum of the samples and ss the sum of their squares.
+// Rows are src_stride bytes apart. The division is an integer division.
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+  uint64_t sum = 0;
+  uint64_t sum_sq = 0;
+  const uint8_t *row = src;
+
+  for (int y = 0; y < height; ++y, row += src_stride) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t sample = row[x];
+      sum_sq += sample * sample;
+      sum += sample;
+    }
+  }
+
+  return (sum_sq - sum * sum / (width * height));
+}
+
+// Returns ss - s * s / (width * height) for a width x height block of 16-bit
+// samples, where s is the sum of the samples and ss the sum of their squares.
+// src is converted with CONVERT_TO_SHORTPTR and read as uint16_t, so
+// src_stride is applied in 16-bit elements. The division is an integer
+// division; width * height must be non-zero.
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint16_t v = srcp[c];
+      // Square in 64 bits. A plain v * v is evaluated in signed int after
+      // integer promotion and overflows (undefined behaviour) once v > 46340.
+      ss += (uint64_t)v * v;
+      s += v;
+    }
+    srcp += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
+
+// Returns the sum of squares of a width x height block of int16_t samples and
+// adds the plain sum of those samples to *sum. *sum is accumulated into, not
+// reset, so the caller chooses its starting value.
+uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width,
+                              int height, int *sum) {
+  int64_t sum_sq = 0;
+  const int16_t *row = src;
+
+  for (int y = 0; y < height; ++y, row += src_stride) {
+    for (int x = 0; x < width; ++x) {
+      const int16_t sample = row[x];
+      sum_sq += sample * sample;
+      *sum += sample;
+    }
+  }
+
+  return sum_sq;
+}
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
new file mode 100644
index 0000000000..67d9e90ca9
--- /dev/null
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
+#define AOM_AOM_DSP_TXFM_COMMON_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// Block transform sizes.
+//
+// Enumerators are numbered from 0 in declaration order: the five square sizes
+// come first, followed by the rectangular ones. The two counters at the end
+// rely on that ordering.
+// NOTE(review): UENUM1BYTE is defined outside this file; presumably it makes
+// TX_SIZE a one-byte type -- confirm.
+enum {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X64, // 32x64 transform
+ TX_64X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X64, // 16x64 transform
+ TX_64X16, // 64x16 transform
+ TX_SIZES_ALL, // Count of all sizes above, rectangular ones included
+ TX_SIZES = TX_4X8, // Count of square sizes only (first rectangular entry)
+ TX_SIZES_LARGEST = TX_64X64, // Largest square transform
+ TX_INVALID = 255 // Invalid transform size
+} UENUM1BYTE(TX_SIZE);
+
+// 2-D transform types. Each name is <vertical>_<horizontal>; the V_* and H_*
+// entries apply a 1-D transform in one direction and identity in the other.
+// Enumerators are numbered from 0 in declaration order, so TX_TYPES is the
+// number of real transform types.
+enum {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal
+ DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal
+ FLIPADST_FLIPADST, // FLIPADST in both directions
+ ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal
+ FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal
+ IDTX, // Identity in both directions
+ V_DCT, // DCT in vertical, identity in horizontal
+ H_DCT, // Identity in vertical, DCT in horizontal
+ V_ADST, // ADST in vertical, identity in horizontal
+ H_ADST, // Identity in vertical, ADST in horizontal
+ V_FLIPADST, // FLIPADST in vertical, identity in horizontal
+ H_FLIPADST, // Identity in vertical, FLIPADST in horizontal
+ TX_TYPES, // Number of transform types listed above
+ // Either DCT or ADST in each direction.
+ // NOTE(review): 0x000F sets bits 0..3, which line up with the first four
+ // enumerators if this is used as a mask of (1 << tx_type) -- confirm usage.
+ DCT_ADST_TX_MASK = 0x000F,
+ TX_TYPE_INVALID = 255, // Invalid transform type
+} UENUM1BYTE(TX_TYPE);
+
+// Sets of transform types, listed from the smallest set to the full set of
+// 16. The bracketed numbers in each comment count the transform types that
+// the named group contributes. EXT_TX_SET_TYPES is the number of sets.
+enum {
+ // DCT only
+ EXT_TX_SET_DCTONLY,
+ // DCT + Identity only
+ EXT_TX_SET_DCT_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1)
+ EXT_TX_SET_DTT4_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_TYPES
+} UENUM1BYTE(TxSetType);
+
+// Parameter bundle describing one transform. The first group of fields is
+// used by both forward and inverse transforms; eob is read by inverse
+// transforms only.
+typedef struct txfm_param {
+ // for both forward and inverse transforms
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int lossless;
+ // NOTE(review): presumably the sample bit depth -- confirm with callers.
+ int bd;
+ // are the pixel buffers octets or shorts? This should collapse to
+ // bd==8 implies !is_hbd, but that's not certain right now.
+ int is_hbd;
+ TxSetType tx_set_type;
+ // for inverse transforms only
+ // NOTE(review): presumably the end-of-block coefficient position -- confirm.
+ int eob;
+} TxfmParam;
+
+// Fixed-point trigonometric constants, scaled by 16384 (1 << DCT_CONST_BITS).
+// The cosine table can be regenerated with:
+// for (int i = 1; i< 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64), so no separate sine table is needed.
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+// sinpi_k_9 = 16384 * sqrt(2) * sin(k*Pi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+// Sqrt2 = 16384 * sqrt(2); InvSqrt2 = 16384 / sqrt(2), the same value as
+// cospi_16_64.
+static const tran_high_t Sqrt2 = 23170;
+static const tran_high_t InvSqrt2 = 11585;
+
+// Rounds input down by DCT_CONST_BITS bits using ROUND_POWER_OF_TWO.
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+}
+
+#endif // AOM_AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
new file mode 100644
index 0000000000..f02c3077ae
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.c
@@ -0,0 +1,1234 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+
+// Returns the sum of squares of the 256 int16_t values starting at a. The
+// total is accumulated in an unsigned int.
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+  unsigned int total = 0;
+  const int16_t *const end = a + 256;
+
+  for (const int16_t *p = a; p != end; ++p) {
+    total += (*p) * (*p);
+  }
+
+  return total;
+}
+
+// Computes, over a w x h block, the sum of the differences a - b (*sum) and
+// the sum of their squares (*sse). Both outputs are overwritten.
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
+  *sum = 0;
+  *sse = 0;
+
+  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (int col = 0; col < w; ++col) {
+      const int delta = a[col] - b[col];
+      *sse += delta * delta;
+      *sum += delta;
+    }
+  }
+}
+
+// Returns the sum of squared differences between two w x h blocks of 8-bit
+// samples. The sum of differences that variance() also produces is discarded.
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h) {
+  int unused_sum;
+  uint32_t total_sse;
+
+  variance(a, a_stride, b, b_stride, w, h, &total_sse, &unused_sum);
+  return total_sse;
+}
+
+// First pass of the 2-D separable bilinear filter (8-bit input).
+//
+// For each of output_height rows this writes output_width values to b. Each
+// value blends a sample with the sample pixel_step elements after it, weighted
+// by filter[0] and filter[1], then rounds the result down by FILTER_BITS.
+// Results are stored as uint16_t for the next pass. Input rows are
+// src_pixels_per_line apart; output rows are packed output_width apart.
+// pixel_step is 1 for a horizontal filter and the row stride for a vertical
+// one. The two taps are expected to sum to FILTER_WEIGHT.
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const uint8_t *filter) {
+  // Elements to skip at the end of each input row to reach the next one.
+  const unsigned int row_skip = src_pixels_per_line - output_width;
+
+  for (unsigned int row = 0; row < output_height;
+       ++row, a += row_skip, b += output_width) {
+    for (unsigned int col = 0; col < output_width; ++col, ++a) {
+      const int tap0 = (int)a[0] * filter[0];
+      const int tap1 = (int)a[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+  }
+}
+
+// Second pass of the 2-D separable bilinear filter.
+//
+// Takes the 16-bit intermediates written by the first pass. For each of
+// output_height rows it writes output_width 8-bit values to b. Each value
+// blends a sample with the sample pixel_step elements after it, weighted by
+// filter[0] and filter[1], then rounds the result down by FILTER_BITS. Input
+// rows are src_pixels_per_line apart; output rows are packed output_width
+// apart. pixel_step is 1 for a horizontal filter and the row stride for a
+// vertical one. The two taps are expected to sum to FILTER_WEIGHT.
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  // Elements to skip at the end of each input row to reach the next one.
+  const unsigned int row_skip = src_pixels_per_line - output_width;
+
+  for (unsigned int row = 0; row < output_height;
+       ++row, a += row_skip, b += output_width) {
+    for (unsigned int col = 0; col < output_width; ++col, ++a) {
+      const int tap0 = (int)a[0] * filter[0];
+      const int tap1 = (int)a[pixel_step] * filter[1];
+      b[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+  }
+}
+
+// Defines aom_variance<W>x<H>_c(). The generated function runs variance() over
+// a WxH block, stores the sum of squared differences in *sse, and returns
+// *sse - sum * sum / (W * H). The squared sum is formed in 64 bits before the
+// division and then truncated to 32 bits.
+#define VAR(W, H) \
+ uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+// Defines aom_sub_pixel_variance<W>x<H>_c(). The generated function:
+// 1. filters a horizontally (pixel_step 1) with bilinear_filters_2t[xoffset],
+// producing H + 1 rows of W intermediates in fdata3;
+// 2. filters fdata3 vertically (pixel_step W) with
+// bilinear_filters_2t[yoffset], producing the HxW 8-bit block temp2;
+// 3. returns aom_variance<W>x<H>_c() of temp2 (stride W) against b.
+// Both scratch buffers are stack arrays sized from W and H.
+#define SUBPIX_VAR(W, H) \
+ uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+ }
+
+// Defines two compound-prediction variants of the sub-pixel variance:
+//
+// aom_sub_pixel_avg_variance<W>x<H>_c(): builds the bilinear-filtered block
+// temp2 as SUBPIX_VAR does, combines it with second_pred through
+// aom_comp_avg_pred() into temp3, and returns aom_variance<W>x<H>_c() of temp3
+// against b.
+//
+// aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_c(): same filtering, but combines
+// through aom_dist_wtd_comp_avg_pred() using jcp_param.
+//
+// NOTE(review): the combining helpers, and the final variance call of the
+// dist_wtd variant, are referenced without the _c suffix. Presumably these
+// resolve to the run-time-dispatched implementations rather than the C
+// reference ones -- confirm this is intended.
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ } \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks,
+// i.e. a strip 32 samples wide and 8 rows tall.
+//
+// sse8x8[k], sum8x8[k] and var8x8[k] are overwritten for k = 0..3. *tot_sse
+// and *tot_sum are accumulated into, not reset, so the caller chooses their
+// starting values.
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    uint32_t *sse8x8, int *sum8x8,
+                                    unsigned int *tot_sse, int *tot_sum,
+                                    uint32_t *var8x8) {
+  // Block k starts 8 * k samples to the right of block 0.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int x_off = blk * 8;
+    variance(a + x_off, a_stride, b + x_off, b_stride, 8, 8, &sse8x8[blk],
+             &sum8x8[blk]);
+  }
+
+  // Fold the per-block results into the running totals for the strip.
+  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+
+  // Per-block variance: sse - sum^2 / 64, with the division done as >> 6.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
+    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
+  }
+}
+
+// Computes SSE and variance for two horizontally adjacent 16x16 blocks, i.e. a
+// strip 32 samples wide and 16 rows tall.
+//
+// sse16x16[k] and var16x16[k] are overwritten for k = 0..1. *tot_sse and
+// *tot_sum are accumulated into, not reset, so the caller chooses their
+// starting values.
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      uint32_t *sse16x16, unsigned int *tot_sse,
+                                      int *tot_sum, uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+
+  // Block k starts 16 * k samples to the right of block 0.
+  for (int blk = 0; blk < 2; ++blk) {
+    const int x_off = blk * 16;
+    variance(src_ptr + x_off, source_stride, ref_ptr + x_off, ref_stride, 16,
+             16, &sse16x16[blk], &sum16x16[blk]);
+  }
+
+  // Fold the per-block results into the running totals for the strip.
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+
+  // Per-block variance: sse - sum^2 / 256, with the division done as >> 8.
+  for (int blk = 0; blk < 2; ++blk) {
+    const int64_t sum_sq = (int64_t)sum16x16[blk] * sum16x16[blk];
+    var16x16[blk] = sse16x16[blk] - (uint32_t)(sum_sq >> 8);
+  }
+}
+
+/* Defines aom_mse<W>x<H>_c(). Identical to the variance functions except that
+ * it does not subtract sum^2 / (W * H): it returns the sum of squared
+ * differences in addition to storing it in *sse. The sum computed by
+ * variance() is discarded.
+ */
+#define MSE(W, H) \
+ uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+/* All three forms of the variance (plain, sub-pixel, sub-pixel averaged) are
+ * available in the same sizes: VARIANCES(W, H) expands VAR, SUBPIX_VAR and
+ * SUBPIX_AVG_VAR for one WxH block size. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+
+// Block sizes that are always built.
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+// Realtime-only builds omit the 1:4 and 4:1 aspect-ratio block sizes below.
+#if !CONFIG_REALTIME_ONLY
+VARIANCES(4, 16)
+VARIANCES(16, 4)
+VARIANCES(8, 32)
+VARIANCES(32, 8)
+VARIANCES(16, 64)
+VARIANCES(64, 16)
+#endif
+
+// MSE is only provided for these four block sizes.
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+// Writes the rounded average of pred and ref into comp_pred for a
+// width x height block. comp_pred and pred are packed (rows are width apart);
+// ref rows are ref_stride apart.
+void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                         int height, const uint8_t *ref, int ref_stride) {
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int pair_sum = pred[col] + ref[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+
+    ref += ref_stride;
+    pred += width;
+    comp_pred += width;
+  }
+}
+
+// Writes a weighted blend of pred and ref into comp_pred for a width x height
+// block. pred is weighted by jcp_param->bck_offset and ref by
+// jcp_param->fwd_offset; the weighted sum is rounded down by
+// DIST_PRECISION_BITS and truncated to 8 bits. comp_pred and pred are packed
+// (rows are width apart); ref rows are ref_stride apart.
+void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                                  int width, int height, const uint8_t *ref,
+                                  int ref_stride,
+                                  const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int ref_weight = jcp_param->fwd_offset;
+  const int pred_weight = jcp_param->bck_offset;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted = pred[col] * pred_weight + ref[col] * ref_weight;
+      const int rounded = ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
+      comp_pred[col] = (uint8_t)rounded;
+    }
+
+    ref += ref_stride;
+    pred += width;
+    comp_pred += width;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth core: over a w x h block of 16-bit samples (reached through
+// CONVERT_TO_SHORTPTR) computes the sum of differences a - b (*sum) and the
+// sum of their squares (*sse) in 64-bit accumulators. Differences are first
+// summed per row in a 32-bit accumulator and then added to the 64-bit total.
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint64_t *sse, int64_t *sum) {
+  const uint16_t *row_a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *row_b = CONVERT_TO_SHORTPTR(b8);
+  uint64_t sse_acc = 0;
+  int64_t sum_acc = 0;
+
+  for (int y = 0; y < h; ++y, row_a += a_stride, row_b += b_stride) {
+    int32_t row_sum = 0;
+    for (int x = 0; x < w; ++x) {
+      const int delta = row_a[x] - row_b[x];
+      row_sum += delta;
+      sse_acc += (uint32_t)(delta * delta);
+    }
+    sum_acc += row_sum;
+  }
+
+  *sum = sum_acc;
+  *sse = sse_acc;
+}
+
+// Returns the 64-bit sum of squared differences between two w x h blocks of
+// high-bitdepth samples. The sum of differences that highbd_variance64() also
+// produces is discarded.
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int w, int h) {
+  int64_t unused_sum;
+  uint64_t total_sse;
+
+  highbd_variance64(a, a_stride, b, b_stride, w, h, &total_sse, &unused_sum);
+  return total_sse;
+}
+
+// 8-bit variant: runs highbd_variance64() and narrows the 64-bit SSE and sum
+// to 32 bits without any scaling.
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint32_t *sse, int *sum) {
+  int64_t wide_sum = 0;
+  uint64_t wide_sse = 0;
+
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sse = (uint32_t)wide_sse;
+  *sum = (int)wide_sum;
+}
+
+// 10-bit variant: runs highbd_variance64(), then rounds the SSE down by 4 bits
+// and the sum down by 2 bits before narrowing both to 32 bits.
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+                               const uint8_t *b8, int b_stride, int w, int h,
+                               uint32_t *sse, int *sum) {
+  int64_t wide_sum = 0;
+  uint64_t wide_sse = 0;
+
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 4);
+  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
+}
+
+// 12-bit variant: runs highbd_variance64(), then rounds the SSE down by 8 bits
+// and the sum down by 4 bits before narrowing both to 32 bits.
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+                               const uint8_t *b8, int b_stride, int w, int h,
+                               uint32_t *sse, int *sum) {
+  int64_t wide_sum = 0;
+  uint64_t wide_sse = 0;
+
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 8);
+  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
+}
+
+// Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(). Each generated function
+// calls the matching highbd_*_variance() helper, leaves the (scaled) SSE in
+// *sse, and returns *sse - sum * sum / (W * H).
+//
+// The 8-bit variant returns that difference directly as a uint32_t. The 10-
+// and 12-bit variants form it in int64_t and return 0 when it is negative.
+// NOTE(review): presumably the clamp is needed because SSE and sum are
+// rounded independently in the 10/12-bit helpers -- confirm.
+#define HIGHBD_VAR(W, H) \
+ uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(). Each generated function calls
+// the matching highbd_*_variance() helper and returns the (scaled) SSE it
+// stored in *sse. The sum of differences is computed but discarded.
+#define HIGHBD_MSE(W, H) \
+ uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+// High-bitdepth first pass of the 2-D separable bilinear filter.
+//
+// src_ptr8 is converted with CONVERT_TO_SHORTPTR and read as uint16_t samples.
+// For each of output_height rows this writes output_width values to
+// output_ptr. Each value blends a sample with the sample pixel_step elements
+// away, weighted by filter[0] and filter[1], then rounds the result down by
+// FILTER_BITS. Input rows are src_pixels_per_line apart; output rows are
+// packed output_width apart.
+void aom_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  // Elements to skip at the end of each input row to reach the next one.
+  const unsigned int row_skip = src_pixels_per_line - output_width;
+
+  for (unsigned int row = 0; row < output_height;
+       ++row, src_ptr += row_skip, output_ptr += output_width) {
+    for (unsigned int col = 0; col < output_width; ++col, ++src_ptr) {
+      const int tap0 = (int)src_ptr[0] * filter[0];
+      const int tap1 = (int)src_ptr[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+  }
+}
+
+// High-bitdepth second pass of the 2-D separable bilinear filter.
+//
+// Takes the 16-bit intermediates from the first pass. For each of
+// output_height rows it writes output_width 16-bit values to output_ptr. Each
+// value blends a sample with the sample pixel_step elements after it, weighted
+// by filter[0] and filter[1], then rounds the result down by FILTER_BITS.
+// Input rows are src_pixels_per_line apart; output rows are packed
+// output_width apart.
+void aom_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, unsigned int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
+  // Elements to skip at the end of each input row to reach the next one.
+  const unsigned int row_skip = src_pixels_per_line - output_width;
+
+  for (unsigned int row = 0; row < output_height;
+       ++row, src_ptr += row_skip, output_ptr += output_width) {
+    for (unsigned int col = 0; col < output_width; ++col, ++src_ptr) {
+      const int tap0 = (int)src_ptr[0] * filter[0];
+      const int tap1 = (int)src_ptr[pixel_step] * filter[1];
+      output_ptr[col] = ROUND_POWER_OF_TWO(tap0 + tap1, FILTER_BITS);
+    }
+  }
+}
+
+// Defines aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(). Each generated
+// function:
+// 1. filters src horizontally (pixel_step 1) with
+// bilinear_filters_2t[xoffset], producing H + 1 rows of W intermediates;
+// 2. filters those vertically (pixel_step W) with
+// bilinear_filters_2t[yoffset], producing the HxW 16-bit block temp2;
+// 3. returns the matching aom_highbd_*_variance<W>x<H>_c() of temp2 (wrapped
+// with CONVERT_TO_BYTEPTR, stride W) against dst.
+// The three bodies differ only in which bit-depth variance function they call.
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ }
+
+/*
+ * HIGHBD_VARIANCES(W, H) expands to HIGHBD_VAR, HIGHBD_SUBPIX_VAR and
+ * HIGHBD_SUBPIX_AVG_VAR for one W x H block size, so all three forms of the
+ * variance are available in the same sizes.
+ */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+// Block sizes instantiated in every build configuration.
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+// Realtime mode doesn't use 4:1 / 1:4 rectangular blocks, so these sizes are
+// only instantiated when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_VARIANCES(4, 16)
+HIGHBD_VARIANCES(16, 4)
+HIGHBD_VARIANCES(8, 32)
+HIGHBD_VARIANCES(32, 8)
+HIGHBD_VARIANCES(16, 64)
+HIGHBD_VARIANCES(64, 16)
+#endif
+
+// HIGHBD_MSE is instantiated for these four block sizes only.
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+// Averages two high-bitdepth predictions sample by sample.
+//
+// All three byte pointers are converted with CONVERT_TO_SHORTPTR and accessed
+// as uint16_t samples. For each of the `height` rows, `width` samples of
+// `pred8` and `ref8` are summed and halved with rounding
+// (ROUND_POWER_OF_TWO(sum, 1)); the result is written to `comp_pred8`.
+// `comp_pred8` and `pred8` advance by `width` samples per row, `ref8` by
+// `ref_stride`.
+void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+                                int width, int height, const uint8_t *ref8,
+                                int ref_stride) {
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+  uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int pair_sum = pred_row[col] + ref_row[col];
+      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
+    }
+    // Output and pred rows are packed at `width`; only ref has its own stride.
+    out_row += width;
+    pred_row += width;
+    ref_row += ref_stride;
+  }
+}
+
+// Distance-weighted blend of two high-bitdepth predictions.
+//
+// Buffers are converted with CONVERT_TO_SHORTPTR and accessed as uint16_t.
+// Each output sample is
+//   ROUND_POWER_OF_TWO(pred * bck_offset + ref * fwd_offset,
+//                      DIST_PRECISION_BITS)
+// with both weights read from `jcp_param`. `comp_pred8` and `pred8` advance
+// by `width` samples per row, `ref8` by `ref_stride`.
+void aom_highbd_dist_wtd_comp_avg_pred_c(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const int ref_weight = jcp_param->fwd_offset;
+  const int pred_weight = jcp_param->bck_offset;
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+  uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted_sum =
+          pred_row[col] * pred_weight + ref_row[col] * ref_weight;
+      const int rounded = ROUND_POWER_OF_TWO(weighted_sum, DIST_PRECISION_BITS);
+      out_row[col] = (uint16_t)rounded;
+    }
+    out_row += width;
+    pred_row += width;
+    ref_row += ref_stride;
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Blends `pred` and `ref` under a per-pixel mask into `comp_pred`.
+//
+// Each output pixel is AOM_BLEND_A64(mask, first, second). With
+// invert_mask == 0 the first operand is `ref` and the second is `pred`; a
+// non-zero invert_mask swaps them. `comp_pred` and `pred` rows are `width`
+// apart, `ref` rows are `ref_stride` apart and `mask` rows are `mask_stride`
+// apart.
+void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                          int height, const uint8_t *ref, int ref_stride,
+                          const uint8_t *mask, int mask_stride,
+                          int invert_mask) {
+  const uint8_t *first = ref;
+  const uint8_t *second = pred;
+  int first_stride = ref_stride;
+  int second_stride = width;
+
+  if (invert_mask) {
+    first = pred;
+    second = ref;
+    first_stride = width;
+    second_stride = ref_stride;
+  }
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      comp_pred[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
+    }
+    comp_pred += width;
+    first += first_stride;
+    second += second_stride;
+    mask += mask_stride;
+  }
+}
+
+// MASK_SUBPIX_VAR(W, H) defines aom_masked_sub_pixel_variance<W>x<H>_c().
+// The generated function:
+//   1. runs the two-pass bilinear filter over `src`, selecting the filter
+//      taps with bilinear_filters_2t[xoffset] and bilinear_filters_2t[yoffset]
+//      (the first pass produces H + 1 rows for the second pass to consume);
+//   2. blends the filtered W x H block with `second_pred` through
+//      aom_comp_mask_pred_c(), using `msk`, `msk_stride` and `invert_mask`;
+//   3. returns aom_variance<W>x<H>_c() of the blended block against `ref`,
+//      which also stores the sum of squared errors through `sse`.
+// All intermediate buffers are stack arrays sized from W and H.
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
+ }
+
+// Block sizes instantiated in every build configuration.
+MASK_SUBPIX_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 8)
+MASK_SUBPIX_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 16)
+MASK_SUBPIX_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 32)
+MASK_SUBPIX_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 64)
+MASK_SUBPIX_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 128)
+MASK_SUBPIX_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 128)
+
+// Realtime mode doesn't use 4:1 / 1:4 rectangular blocks, so these sizes are
+// only instantiated when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+MASK_SUBPIX_VAR(4, 16)
+MASK_SUBPIX_VAR(16, 4)
+MASK_SUBPIX_VAR(8, 32)
+MASK_SUBPIX_VAR(32, 8)
+MASK_SUBPIX_VAR(16, 64)
+MASK_SUBPIX_VAR(64, 16)
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of aom_comp_mask_pred_c().
+//
+// `comp_pred8`, `pred8` and `ref8` are converted with CONVERT_TO_SHORTPTR and
+// accessed as uint16_t; `mask` stays 8-bit. Each output sample is
+// AOM_BLEND_A64(mask, first, second) where (first, second) is (ref, pred)
+// when invert_mask == 0 and (pred, ref) otherwise. `comp_pred8` and `pred8`
+// advance by `width` samples per row, `ref8` by `ref_stride`, `mask` by
+// `mask_stride`.
+void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+                                 int width, int height, const uint8_t *ref8,
+                                 int ref_stride, const uint8_t *mask,
+                                 int mask_stride, int invert_mask) {
+  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
+
+  // The operand order depends only on invert_mask, so pick it once up front
+  // instead of testing the flag for every sample.
+  const uint16_t *first = invert_mask ? pred_row : ref_row;
+  const uint16_t *second = invert_mask ? ref_row : pred_row;
+  const int first_stride = invert_mask ? width : ref_stride;
+  const int second_stride = invert_mask ? ref_stride : width;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      out_row[col] = AOM_BLEND_A64(mask[col], first[col], second[col]);
+    }
+    out_row += width;
+    first += first_stride;
+    second += second_stride;
+    mask += mask_stride;
+  }
+}
+
+// HIGHBD_MASK_SUBPIX_VAR(W, H) defines three functions,
+// aom_highbd_{8,10,12}_masked_sub_pixel_variance<W>x<H>_c(). They share the
+// same steps and differ only in which aom_highbd_{8,10,12}_variance<W>x<H>_c()
+// computes the final result:
+//   1. two-pass bilinear filtering of `src` into a uint16_t W x H block, with
+//      taps chosen by bilinear_filters_2t[xoffset] / [yoffset] (the first
+//      pass produces H + 1 rows);
+//   2. masked blend of that block with `second_pred` through
+//      aom_highbd_comp_mask_pred_c();
+//   3. variance of the blended block against `ref`; the sum of squared
+//      errors is stored through `sse`.
+// The uint16_t temporaries are wrapped with CONVERT_TO_BYTEPTR before being
+// passed to functions that take uint8_t pointers.
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ }
+
+// Block sizes instantiated in every build configuration.
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+// The 4:1 / 1:4 sizes are only instantiated when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASK_SUBPIX_VAR(4, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
+// OBMC_VAR(W, H) defines aom_obmc_variance<W>x<H>_c(): it runs
+// obmc_variance() over a W x H block, which stores the sum of squared errors
+// through `sse`, and returns sse - sum^2 / (W * H). The square of the sum is
+// formed in 64 bits before the division.
+#define OBMC_VAR(W, H) \
+  unsigned int aom_obmc_variance##W##x##H##_c( \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *mask, unsigned int *sse) { \
+    int err_sum; \
+    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum); \
+    const int64_t err_sum_sq = (int64_t)err_sum * err_sum; \
+    const unsigned int mean_term = (unsigned int)(err_sum_sq / (W * H)); \
+    return *sse - mean_term; \
+  }
+
+// OBMC_SUBPIX_VAR(W, H) defines aom_obmc_sub_pixel_variance<W>x<H>_c(): the
+// prediction `pre` is run through the two-pass bilinear filter (taps chosen
+// by bilinear_filters_2t[xoffset] / [yoffset]; the first pass produces H + 1
+// rows) into a packed W x H stack buffer, and the result is handed to
+// aom_obmc_variance<W>x<H>_c() with a stride of W.
+#define OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
+ }
+
+// Instantiate the OBMC variance and sub-pixel variance functions for every
+// block size. Each OBMC_SUBPIX_VAR(W, H) calls the function generated by the
+// OBMC_VAR(W, H) of the same size, so the two are always emitted as a pair.
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+
+// 4:1 / 1:4 sizes. No separate guard is needed here: the whole OBMC section
+// already sits inside #if !CONFIG_REALTIME_ONLY.
+OBMC_VAR(4, 16)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_VAR(16, 4)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_VAR(8, 32)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_VAR(32, 8)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_VAR(16, 64)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_VAR(64, 16)
+OBMC_SUBPIX_VAR(64, 16)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
+// 8-bit-depth flavour of the high-bitdepth OBMC accumulation: runs
+// highbd_obmc_variance64() and narrows the 64-bit totals to int /
+// unsigned int without any scaling.
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask, int w, int h,
+                                        unsigned int *sse, int *sum) {
+  uint64_t wide_sse;
+  int64_t wide_sum;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
+                         &wide_sum);
+  *sum = (int)wide_sum;
+  *sse = (unsigned int)wide_sse;
+}
+
+// 10-bit flavour: runs highbd_obmc_variance64(), then scales the totals down
+// with rounding before narrowing them - the sum by 2 bits and the sum of
+// squares by 4 bits.
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  uint64_t wide_sse;
+  int64_t wide_sum;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
+                         &wide_sum);
+  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 4);
+}
+
+// 12-bit flavour: runs highbd_obmc_variance64(), then scales the totals down
+// with rounding before narrowing them - the sum by 4 bits and the sum of
+// squares by 8 bits.
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  uint64_t wide_sse;
+  int64_t wide_sum;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
+                         &wide_sum);
+  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 8);
+}
+
+// HIGHBD_OBMC_VAR(W, H) defines aom_highbd_{8,10,12}_obmc_variance<W>x<H>_c().
+// Each calls the matching highbd[_10|_12]_obmc_variance() helper, which
+// stores the (possibly scaled) sum of squared errors through `sse`, and then
+// returns sse - sum^2 / (W * H).
+// The 8-bit variant returns the unsigned difference directly. The 10- and
+// 12-bit variants form the difference in int64_t and clamp negative results
+// to 0 (presumably because sum and sse are rounded independently by the
+// helpers, so the difference can dip below zero).
+#define HIGHBD_OBMC_VAR(W, H) \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// HIGHBD_OBMC_SUBPIX_VAR(W, H) defines
+// aom_highbd_{8,10,12}_obmc_sub_pixel_variance<W>x<H>_c(). All three filter
+// `pre` with the two-pass high-bitdepth bilinear filter (taps chosen by
+// bilinear_filters_2t[xoffset] / [yoffset]; the first pass produces H + 1
+// rows) into a packed uint16_t W x H stack buffer, then pass that buffer,
+// wrapped with CONVERT_TO_BYTEPTR and with stride W, to the
+// aom_highbd_{8,10,12}_obmc_variance<W>x<H>_c() of the same bit depth.
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ }
+
+// Instantiate the high-bitdepth OBMC variance and sub-pixel variance
+// functions for every block size. Each HIGHBD_OBMC_SUBPIX_VAR(W, H) calls the
+// functions generated by the HIGHBD_OBMC_VAR(W, H) of the same size, so the
+// two are always emitted as a pair.
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+
+// 4:1 / 1:4 sizes. No separate guard is needed here: the whole OBMC section
+// already sits inside #if !CONFIG_REALTIME_ONLY.
+HIGHBD_OBMC_VAR(4, 16)
+HIGHBD_OBMC_SUBPIX_VAR(4, 16)
+HIGHBD_OBMC_VAR(16, 4)
+HIGHBD_OBMC_SUBPIX_VAR(16, 4)
+HIGHBD_OBMC_VAR(8, 32)
+HIGHBD_OBMC_SUBPIX_VAR(8, 32)
+HIGHBD_OBMC_VAR(32, 8)
+HIGHBD_OBMC_SUBPIX_VAR(32, 8)
+HIGHBD_OBMC_VAR(16, 64)
+HIGHBD_OBMC_SUBPIX_VAR(16, 64)
+HIGHBD_OBMC_VAR(64, 16)
+HIGHBD_OBMC_SUBPIX_VAR(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+// Returns the sum of squared differences between an 8-bit block `dst` and a
+// 16-bit block `src` over a w x h area. Despite the "mse" in the name, the
+// sum is not divided by the number of samples.
+//
+//   dst, dstride : 8-bit samples and their row pitch (in samples)
+//   src, sstride : 16-bit samples and their row pitch (in samples)
+//   w, h         : block dimensions; a zero dimension yields 0
+//
+// Each difference fits in an int, but its square does not when the magnitude
+// exceeds 46340 (src spans the full uint16_t range), so the square is formed
+// in 64 bits to avoid signed overflow.
+uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
+                             int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      const int e = (int)dst[i * dstride + j] - (int)src[i * sstride + j];
+      sum += (uint64_t)((int64_t)e * e);
+    }
+  }
+  return sum;
+}
+
+// Sums aom_mse_wxh_16bit_c() over the 16 / w sub-blocks that make up a
+// 16-column strip.
+//
+// The sub-blocks sit side by side in `dst` (each one starts `w` columns after
+// the previous, rows `dstride` apart) and back to back in `src` (each one is
+// a packed w x h block with row pitch `w`, `w * h` samples after the previous).
+// NOTE(review): `w` must be non-zero (16 / w), and is presumably expected to
+// divide 16 - confirm against callers.
+//
+// The running total is kept as uint64_t to match both the callee's return
+// type and this function's own return type.
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
+                              int h) {
+  uint16_t *src_temp = src;
+  uint8_t *dst_temp = dst;
+  const int num_blks = 16 / w;
+  uint64_t sum = 0;
+  for (int i = 0; i < num_blks; i++) {
+    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
+    dst_temp += w;
+    src_temp += (w * h);
+  }
+  return sum;
+}
+
+// Returns the sum of squared differences between two 16-bit blocks over a
+// w x h area. Despite the "mse" in the name, the sum is not divided by the
+// number of samples.
+//
+//   dst, dstride : first block and its row pitch (in samples)
+//   src, sstride : second block and its row pitch (in samples)
+//   w, h         : block dimensions; a zero dimension yields 0
+//
+// Each difference fits in an int, but its square does not when the magnitude
+// exceeds 46340 (both inputs span the full uint16_t range), so the square is
+// formed in 64 bits to avoid signed overflow.
+uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int w, int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; i++) {
+    for (int j = 0; j < w; j++) {
+      const int e = (int)dst[i * dstride + j] - (int)src[i * sstride + j];
+      sum += (uint64_t)((int64_t)e * e);
+    }
+  }
+  return sum;
+}
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
new file mode 100644
index 0000000000..6603d312b8
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VARIANCE_H_
+#define AOM_AOM_DSP_VARIANCE_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+// Function-pointer types for the per-block-size kernels collected in
+// aom_variance_fn_ptr_t. Every buffer argument is paired with its stride.
+
+// Two-buffer kernel returning unsigned int; type of the `sdf` and `sdsf`
+// members.
+typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+
+// Like aom_sad_fn_t with an additional `second_pred` buffer; type of `sdaf`.
+typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *second_pred);
+
+// Copy routine from `a` to `b` taking a count `n`; not referenced by
+// aom_variance_fn_ptr_t.
+typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
+ int b_stride, int n);
+
+// Kernel taking one buffer `a` and an array of `b` buffers that share
+// `b_stride`; results go to `sad_array`. Type of `sdx4df`, `sdx3df` and
+// `sdsx4df`.
+typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride, unsigned int *sad_array);
+
+// Two-buffer kernel that also writes through `sse`; type of `vf`.
+typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+// aom_variance_fn_t with `xoffset` / `yoffset` applied to `a`; type of `svf`.
+typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+// aom_subpixvariance_fn_t with an additional `second_pred`; type of `svaf`.
+typedef unsigned int (*aom_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred);
+
+// aom_sad_avg_fn_t with DIST_WTD_COMP_PARAMS weights; type of `jsdaf`.
+typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+
+// aom_subp_avg_variance_fn_t with DIST_WTD_COMP_PARAMS weights; type of
+// `jsvaf`.
+typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
+// Masked kernel: takes `second_pred`, a mask with its stride and an
+// `invert_mask` flag; type of `msdf`.
+typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+// Masked sub-pixel variance; same parameter list as the
+// aom_masked_sub_pixel_variance<W>x<H>_c() functions. Type of `msvf`.
+typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+// OBMC kernel over a prediction plus 32-bit `wsrc` / `msk` arrays (which
+// carry no stride argument); type of `osdf`.
+typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk);
+// OBMC variance; same parameter list as aom_obmc_variance<W>x<H>_c(). Type
+// of `ovf`.
+typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
+ int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk,
+ unsigned int *sse);
+// OBMC sub-pixel variance; same parameter list as
+// aom_obmc_sub_pixel_variance<W>x<H>_c(). Type of `osvf`.
+typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
+ const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
+
+// Table of function pointers bundling the SAD / variance kernels declared
+// above. Presumably one table is filled in per block size - confirm against
+// the code that populates it.
+typedef struct aom_variance_vtable {
+ aom_sad_fn_t sdf;
+ // Same as normal sad, but downsample the rows by a factor of 2.
+ aom_sad_fn_t sdsf;
+ // Variant taking a second predictor.
+ aom_sad_avg_fn_t sdaf;
+ aom_variance_fn_t vf;
+ aom_subpixvariance_fn_t svf;
+ aom_subp_avg_variance_fn_t svaf;
+ // Multi-reference variants: one source against an array of references.
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
+ // Same as sadx4, but downsample the rows by a factor of 2.
+ aom_sad_multi_d_fn_t sdsx4df;
+ // Masked variants.
+ aom_masked_sad_fn_t msdf;
+ aom_masked_subpixvariance_fn_t msvf;
+ // OBMC variants.
+ aom_obmc_sad_fn_t osdf;
+ aom_obmc_variance_fn_t ovf;
+ aom_obmc_subpixvariance_fn_t osvf;
+ // Distance-weighted variants (take DIST_WTD_COMP_PARAMS).
+ aom_dist_wtd_sad_avg_fn_t jsdaf;
+ aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
+} aom_variance_fn_ptr_t;
+
+// First pass of the two-pass high-bitdepth bilinear filter. Takes the source
+// as a uint8_t pointer (`src_ptr8`) and writes uint16_t output. The
+// sub-pixel variance macros in variance.c call it with pixel_step = 1 and
+// H + 1 output rows so that the second pass has one extra row to read.
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+// Second pass: consumes the uint16_t output of the first pass and writes the
+// final uint16_t block. The macros in variance.c call it with
+// src_pixels_per_line = pixel_step = W.
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+// SSE between two buffers over an arbitrary w x h area; defined outside this
+// header.
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h);
+
+// High-bitdepth counterpart of aom_sse_odd_size(); returns a 64-bit total.
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/vmaf.c b/third_party/aom/aom_dsp/vmaf.c
new file mode 100644
index 0000000000..a40e00cb23
--- /dev/null
+++ b/third_party/aom/aom_dsp/vmaf.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/vmaf.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "aom_dsp/blend.h"
+
+// Prints "Fatal error: <message>" to stderr and terminates the process with
+// EXIT_FAILURE. Never returns, so callers need no cleanup after it.
+static void vmaf_fatal_error(const char *message) {
+ fprintf(stderr, "Fatal error: %s\n", message);
+ exit(EXIT_FAILURE);
+}
+
+// Loads the VMAF model at `model_path` into `*vmaf_model`.
+//
+// Does nothing when `*vmaf_model` is already non-NULL, so repeated calls are
+// harmless. The model is loaded under the name "vmaf" with score clipping
+// disabled (VMAF_MODEL_FLAG_DISABLE_CLIP). A load failure is fatal.
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) {
+  if (*vmaf_model == NULL) {
+    VmafModelConfig load_cfg;
+    load_cfg.name = "vmaf";
+    load_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP;
+
+    const int err = vmaf_model_load_from_path(vmaf_model, &load_cfg, model_path);
+    if (err) {
+      vmaf_fatal_error("Failed to load VMAF model.");
+    }
+  }
+}
+
+// Releases a model obtained from aom_init_vmaf_model() by forwarding it to
+// vmaf_model_destroy().
+void aom_close_vmaf_model(VmafModel *vmaf_model) {
+ vmaf_model_destroy(vmaf_model);
+}
+
+// Copies the luma plane of `src` into plane 0 of the VMAF picture `dst`.
+//
+// Only y_width x y_height samples of the Y plane are copied. When
+// bit_depth > 8 the source buffer is converted with CONVERT_TO_SHORTPTR and
+// copied as uint16_t samples; dst->stride[0] is halved to step the uint16_t
+// destination pointer. Otherwise samples are copied as bytes.
+static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src,
+                         VmafPicture *dst) {
+  const int luma_w = src->y_width;
+  const int luma_h = src->y_height;
+
+  if (bit_depth <= 8) {
+    uint8_t *from = src->y_buffer;
+    uint8_t *to = (uint8_t *)dst->data[0];
+
+    for (int y = 0; y < luma_h; ++y) {
+      memcpy(to, from, luma_w * sizeof(to[0]));
+      from += src->y_stride;
+      to += dst->stride[0];
+    }
+    return;
+  }
+
+  uint16_t *from16 = CONVERT_TO_SHORTPTR(src->y_buffer);
+  uint16_t *to16 = dst->data[0];
+
+  for (int y = 0; y < luma_h; ++y) {
+    memcpy(to16, from16, luma_w * sizeof(to16[0]));
+    from16 += src->y_stride;
+    to16 += dst->stride[0] / 2;
+  }
+}
+
+// Creates a VMAF context in `*vmaf_context` and attaches the feature
+// extractors required by `vmaf_model`.
+//
+//   vmaf_context : receives the new context (created by vmaf_init()).
+//   vmaf_model   : model whose features are overloaded and then registered.
+//   cal_vmaf_neg : when true, the "float_vif" and "adm" features are
+//                  overloaded with an enhancement gain limit of "1.0".
+//
+// The "float_motion" feature is always overloaded with motion_force_zero=1.
+// Any libvmaf failure is fatal (the process exits).
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+                           bool cal_vmaf_neg) {
+  // TODO(sdeng): make them CLI arguments.
+  VmafConfiguration cfg;
+  // The struct is passed to vmaf_init() by value. Zero it first so that any
+  // field not set explicitly below (e.g. one added by a newer libvmaf
+  // release) is zero rather than indeterminate.
+  memset(&cfg, 0, sizeof(cfg));
+  cfg.log_level = VMAF_LOG_LEVEL_NONE;
+  cfg.n_threads = 0;
+  cfg.n_subsample = 0;
+  cfg.cpumask = 0;
+
+  if (vmaf_init(vmaf_context, cfg)) {
+    vmaf_fatal_error("Failed to init VMAF context.");
+  }
+
+  if (cal_vmaf_neg) {
+    VmafFeatureDictionary *vif_feature = NULL;
+    if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set vif_enhn_gain_limit.");
+    }
+    if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) {
+      vmaf_fatal_error("Failed to use feature float_vif.");
+    }
+
+    VmafFeatureDictionary *adm_feature = NULL;
+    if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit",
+                                    "1.0")) {
+      vmaf_fatal_error("Failed to set adm_enhn_gain_limit.");
+    }
+    // NOTE(review): the feature overloaded here is "adm" while the error text
+    // says "float_adm" - confirm which name is intended.
+    if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) {
+      vmaf_fatal_error("Failed to use feature float_adm.");
+    }
+  }
+
+  VmafFeatureDictionary *motion_force_zero = NULL;
+  if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero",
+                                  "1")) {
+    vmaf_fatal_error("Failed to set motion_force_zero.");
+  }
+  if (vmaf_model_feature_overload(vmaf_model, "float_motion",
+                                  motion_force_zero)) {
+    vmaf_fatal_error("Failed to use feature float_motion.");
+  }
+
+  if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
+    vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
+  }
+}
+
+// Closes a context created by aom_init_vmaf_context(). A non-zero result
+// from vmaf_close() is fatal.
+void aom_close_vmaf_context(VmafContext *vmaf_context) {
+  const int close_failed = vmaf_close(vmaf_context);
+  if (close_failed) {
+    vmaf_fatal_error("Failed to close VMAF context.");
+  }
+}
+
+// Computes the VMAF score of a single frame.
+//
+//   vmaf_model   : model used for scoring.
+//   source       : reference frame; its luma dimensions size both pictures.
+//   distorted    : frame being scored against `source`.
+//   bit_depth    : sample bit depth of both frames.
+//   cal_vmaf_neg : forwarded to aom_init_vmaf_context().
+//   vmaf         : receives the score.
+//
+// A temporary context is created, fed the one frame pair at index 0, flushed,
+// queried and closed again. Every libvmaf failure is fatal, including a
+// failure to retrieve the score, so `*vmaf` is always written on return.
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   bool cal_vmaf_neg, double *vmaf) {
+  VmafContext *vmaf_context;
+  aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg);
+  const int frame_index = 0;
+  VmafPicture ref, dist;
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height) ||
+      vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+  if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+                         /*picture index=*/frame_index)) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  // Passing NULL pictures flushes the context (same call as
+  // aom_flush_vmaf_context()).
+  if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+
+  // Check the result the same way aom_calc_vmaf_at_index() does, so a scoring
+  // failure cannot leave *vmaf unset.
+  if (vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index)) {
+    vmaf_fatal_error("Failed to calc VMAF scores.");
+  }
+  aom_close_vmaf_context(vmaf_context);
+}
+
+// Feeds one source/distorted frame pair into an existing VMAF context under
+// `frame_index`.
+//
+// Two YUV420P VMAF pictures are allocated with the luma dimensions of
+// `source`, filled from the two frames with copy_picture(), passed to
+// vmaf_read_pictures() and then unreferenced. Allocation or read failures
+// are fatal.
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+                         const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                         int frame_index) {
+  VmafPicture ref, dist;
+
+  // vmaf_fatal_error() never returns, so checking the two allocations one
+  // after the other is equivalent to a single combined test.
+  if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+                         source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+  if (vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+                         source->y_width, source->y_height)) {
+    vmaf_fatal_error("Failed to alloc VMAF pictures.");
+  }
+
+  copy_picture(bit_depth, source, &ref);
+  copy_picture(bit_depth, distorted, &dist);
+
+  const int read_failed = vmaf_read_pictures(vmaf_context, &ref, &dist,
+                                             /*picture index=*/frame_index);
+  if (read_failed) {
+    vmaf_fatal_error("Failed to read VMAF pictures.");
+  }
+
+  vmaf_picture_unref(&ref);
+  vmaf_picture_unref(&dist);
+}
+
+// Returns the VMAF score that `vmaf_context` holds for `frame_index`,
+// evaluated with `vmaf_model`. A failure to retrieve the score is fatal.
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+                              int frame_index) {
+  double score;
+  const int score_failed =
+      vmaf_score_at_index(vmaf_context, vmaf_model, &score, frame_index);
+  if (score_failed) {
+    vmaf_fatal_error("Failed to calc VMAF scores.");
+  }
+  return score;
+}
+
+// Flushes `vmaf_context` by calling vmaf_read_pictures() with NULL pictures.
+// A failure is fatal.
+void aom_flush_vmaf_context(VmafContext *vmaf_context) {
+  const int flush_failed = vmaf_read_pictures(vmaf_context, NULL, NULL, 0);
+  if (flush_failed) {
+    vmaf_fatal_error("Failed to flush context.");
+  }
+}
diff --git a/third_party/aom/aom_dsp/vmaf.h b/third_party/aom/aom_dsp/vmaf.h
new file mode 100644
index 0000000000..b539cf8b76
--- /dev/null
+++ b/third_party/aom/aom_dsp/vmaf.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VMAF_H_
+#define AOM_AOM_DSP_VMAF_H_
+
+#include <libvmaf/libvmaf.h>
+#include <stdbool.h>
+
+#include "aom_scale/yv12config.h"
+
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+ bool cal_vmaf_neg);
+void aom_close_vmaf_context(VmafContext *vmaf_context);
+// NOTE(review): presumably loads the model found at model_path - confirm.
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path);
+void aom_close_vmaf_model(VmafModel *vmaf_model);
+// Scores one source/distorted pair and writes the result to *vmaf.
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ bool cal_vmaf_neg, double *vmaf);
+// Copies one source/distorted pair into libvmaf as picture frame_index.
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ int frame_index);
+// Returns the score libvmaf reports for picture frame_index.
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+ int frame_index);
+// Flushes the context by issuing a read call with NULL pictures.
+void aom_flush_vmaf_context(VmafContext *vmaf_context);
+
+#endif // AOM_AOM_DSP_VMAF_H_
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..b3dede75d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+                                      const int16_t *round_ptr, __m256i *round,
+                                      const int16_t *quant_ptr, __m256i *quant,
+                                      const int16_t *dequant_ptr,
+                                      __m256i *dequant,
+                                      const int16_t *shift_ptr,
+                                      __m256i *shift) {
+  // 0x54 keeps 64-bit lane 0 and replicates lane 1 into lanes 1..3.
+  __m128i v = _mm_load_si128((const __m128i *)zbin_ptr);
+  *zbin = _mm256_permute4x64_epi64(_mm256_castsi128_si256(v), 0x54);
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+  v = _mm_load_si128((const __m128i *)round_ptr);
+  *round = _mm256_permute4x64_epi64(_mm256_castsi128_si256(v), 0x54);
+  v = _mm_load_si128((const __m128i *)quant_ptr);
+  *quant = _mm256_permute4x64_epi64(_mm256_castsi128_si256(v), 0x54);
+  v = _mm_load_si128((const __m128i *)dequant_ptr);
+  *dequant = _mm256_permute4x64_epi64(_mm256_castsi128_si256(v), 0x54);
+  v = _mm_load_si128((const __m128i *)shift_ptr);
+  *shift = _mm256_permute4x64_epi64(_mm256_castsi128_si256(v), 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  const __m256i *const lo8 = (const __m256i *)coeff_ptr;        // 1st chunk
+  const __m256i *const hi8 = (const __m256i *)(coeff_ptr + 8);  // 2nd chunk
+  return _mm256_packs_epi32(_mm256_load_si256(lo8), _mm256_load_si256(hi8));
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i picked = _mm256_setzero_si256();  // Stays zero if no bit is set.
+  if (_mm256_movemask_epi8(*cmp_mask) != 0) {
+    picked = _mm256_and_si256(  // iscan bits selected by *cmp_mask
+        *cmp_mask, _mm256_loadu_si256((const __m256i *)iscan_ptr));
+    *is_found = 1;
+  }
+  *mask = _mm256_max_epi16(*mask, picked);  // Running signed 16-bit maximum.
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Zero-extend the 16-bit inputs to 32 bits and scale by 2^AOM_QM_BITS.
+  const __m256i wide_lo =
+      _mm256_slli_epi32(_mm256_unpacklo_epi16(*qcoeff, zero), AOM_QM_BITS);
+  const __m256i wide_hi =
+      _mm256_slli_epi32(_mm256_unpackhi_epi16(*qcoeff, zero), AOM_QM_BITS);
+  const __m256i gt_lo = _mm256_cmpgt_epi32(wide_lo, threshold[0]);
+  const __m256i gt_hi = _mm256_cmpgt_epi32(wide_hi, threshold[1]);
+  __m256i above =  // 16-bit lanes again; 0xd8 swaps the middle 64-bit lanes.
+      _mm256_permute4x64_epi64(_mm256_packs_epi32(gt_lo, gt_hi), 0xd8);
+  update_mask1_avx2(&above, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+                                         const __m256i *quant,
+                                         const __m256i *shift) {
+  // Per lane: r = sat(x + round); q = (hi16(r * quant) + r); out = hi16(q * shift).
+  const __m256i rounded = _mm256_adds_epi16(*coeff, *round);
+  const __m256i scaled = _mm256_mulhi_epi16(rounded, *quant);
+  const __m256i summed = _mm256_add_epi16(scaled, rounded);
+  *coeff = _mm256_mulhi_epi16(summed, *shift);
+}
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+  return _mm256_mullo_epi16(dequant, qcoeff);  // Low 16 bits of each product.
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  const __m256i sign = _mm256_srai_epi16(coeff_vals, 15);  // 0 or -1 per lane
+  const __m256i lo = _mm256_unpacklo_epi16(coeff_vals, sign);
+  const __m256i hi = _mm256_unpackhi_epi16(coeff_vals, sign);
+  _mm256_store_si256((__m256i *)coeff_ptr, lo);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), hi);
+}
+
+void aom_quantize_b_adaptive_avx2(  // Processes 16 coefficients per step.
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;  // Coefficients 0..15 are handled before the loop.
+ int non_zero_count = 0;  // Derived from mask0 once all groups are done.
+ int non_zero_count_prescan_add_zero = 0;  // Derived from mask1.
+ int is_found0 = 0, is_found1 = 0;  // Set when mask0 / mask1 gets an entry.
+ int eob = -1;  // Stays -1 if no nonzero qcoeff is found.
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff, qcoeff;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];  // [0] from table entry 0 (DC), [1] from entry 1 (AC).
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];  // [0]: thresh[0] in lane 0 only; [1]: all thresh[1].
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;  // Scan index of the first nonzero qcoeff.
+#endif
+
+ // Load the five 8-entry tables; zbin comes back already decremented by 1.
+ load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff = load_coefficients_avx2(coeff_ptr);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);  // Undo pack order.
+ update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];  // Later groups have no DC lane.
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ round = _mm256_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ // Reinsert signs
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ // Zero every lane that did not exceed the zbin comparison.
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);  // DC entry used above.
+ store_coefficients_avx2(coeff, dqcoeff_ptr);
+ }
+
+ // AC only loop: the same steps for every further group of 16.
+ while (index < n_coeffs) {
+ coeff = load_coefficients_avx2(coeff_ptr + index);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+ update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {  // Nothing exceeds zbin.
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr + index);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ store_coefficients_avx2(coeff, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {  // Fold the two 128-bit halves before reducing.
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+ // Clear scan positions [non_zero_count, non_zero_count_prescan_add_zero).
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ // eob: last scan position below non_zero_count with a nonzero qcoeff.
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;  // Stored as a count; 0 if none was found.
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Try implementing the following loop with intrinsics by
+ // combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff0 = qcoeff_ptr[rc];
+ if (qcoeff0) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // First == last.
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff0 = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff0);
+ const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // Drop the lone +/-1 and report an empty block.
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..503b9b4682
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(  // Processes 16 coefficients per step.
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;  // Coefficients 0..15 are handled before the loop.
+ int non_zero_count = 0;  // Derived from mask0 once all groups are done.
+ int non_zero_count_prescan_add_zero = 0;  // Derived from mask1.
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;  // Stays -1 if no nonzero qcoeff is found.
+ const __m128i zero = _mm_setzero_si128();
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];  // [0] from table entry 0 (DC), [1..3] from entry 1 (AC).
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];  // [0]: {t0, t1, t1, t1}; [1]: {t1, t1, t1, t1}.
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;  // Scan index of the first nonzero qcoeff.
+#endif
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs(): SSE2 has no 16-bit absolute-value instruction.
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+ // mask0 tracking is fed the zbin + prescan_add thresholds built above.
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ // mask1 tracking is fed the plain zbin comparison results.
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // Later groups have no DC lane.
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ // The first eight lanes used the DC-bearing tables; switch to AC now.
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Zero every lane that did not exceed the zbin comparison.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // Switch DC to AC.
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+ }
+
+ // AC only loop: the same steps for every further group of 16.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // Nothing exceeds zbin.
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+ // Clear scan positions [non_zero_count, non_zero_count_prescan_add_zero).
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ // eob: last scan position below non_zero_count with a nonzero qcoeff.
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;  // Stored as a count; 0 if none was found.
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Try implementing the following loop with intrinsics by
+ // combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // First == last.
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // Drop the lone +/-1 and report an empty block.
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_32x32_adaptive_sse2(  // Variant with log_scale == 1.
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;  // Coefficients 0..15 are handled before the loop.
+ const int log_scale = 1;  // zbin and round are shifted right by this.
+ int non_zero_count = 0;  // Derived from mask0 once all groups are done.
+ int non_zero_count_prescan_add_zero = 0;  // Derived from mask1.
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;  // Stays -1 if no nonzero qcoeff is found.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);  // Rounding offset.
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+ // Scalar copies of the rounded-down-scaled zbin entries 0 (DC) and 1 (AC).
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];  // [0] from entry 0 (DC), [1..3] from entry 1 (AC).
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];  // [0]: {t0, t1, t1, t1}; [1]: {t1, t1, t1, t1}.
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;  // Scan index of the first nonzero qcoeff.
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding: (x + 1) >> 1 for zbin and round.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);  // So that "> zbin" means ">= scaled zbin".
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+ // mask0 tracking is fed the zbin + prescan_add thresholds built above.
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ // mask1 tracking is fed the plain zbin comparison results.
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // Later groups have no DC lane.
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Zero every lane that did not exceed the zbin comparison.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // Switch DC to AC.
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop: the same steps for every further group of 16.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // Nothing exceeds zbin.
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+ // Clear scan positions [non_zero_count, non_zero_count_prescan_add_zero).
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ // eob: last scan position below non_zero_count with a nonzero qcoeff.
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;  // Stored as a count; 0 if none was found.
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Try implementing the following loop with intrinsics by
+ // combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // First == last.
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // Drop the lone +/-1 and report an empty block.
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_64x64_adaptive_sse2(  // Variant with log_scale == 2.
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;  // Coefficients 0..15 are handled before the loop.
+ const int log_scale = 2;  // zbin and round are shifted right by this.
+ int non_zero_count = 0;  // Derived from mask0 once all groups are done.
+ int non_zero_count_prescan_add_zero = 0;  // Derived from mask1.
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;  // Stays -1 if no nonzero qcoeff is found.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);  // Rounding offset.
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+ // Scalar copies of the rounded-down-scaled zbin entries 0 (DC) and 1 (AC).
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];  // [0] from entry 0 (DC), [1..3] from entry 1 (AC).
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];  // [0]: {t0, t1, t1, t1}; [1]: {t1, t1, t1, t1}.
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;  // Scan index of the first nonzero qcoeff.
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding: (x + 2) >> 2 for zbin and round.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);  // So that "> zbin" means ">= scaled zbin".
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+ // mask0 tracking is fed the zbin + prescan_add thresholds built above.
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ // mask1 tracking is fed the plain zbin comparison results.
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // Later groups have no DC lane.
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);  // Switch DC to AC.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Zero every lane that did not exceed the zbin comparison.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // Switch DC to AC.
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop: the same steps for every further group of 16.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // Nothing exceeds zbin.
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+ // Clear scan positions [non_zero_count, non_zero_count_prescan_add_zero).
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ // eob: last scan position below non_zero_count with a nonzero qcoeff.
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;  // Stored as a count; 0 if none was found.
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Try implementing the following loop with intrinsics by
+ // combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // First == last.
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // Drop the lone +/-1 and report an empty block.
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
new file mode 100644
index 0000000000..b08ec2546b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+
+#if HAVE_SSE2 // everything in this file is compiled only when HAVE_SSE2 is set
+filter8_1dfunction aom_filter_block1d16_v8_sse2; // declarations only: no kernel body is defined in this file (presumably they live in the x86 .asm sources -- confirm)
+filter8_1dfunction aom_filter_block1d16_h8_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_sse2;
+filter8_1dfunction aom_filter_block1d16_v4_sse2; // NOTE(review): the digit after v/h presumably is the tap count (8, 4, 2) -- confirm
+filter8_1dfunction aom_filter_block1d16_h4_sse2;
+
+filter8_1dfunction aom_filter_block1d8_h4_sse2;
+filter8_1dfunction aom_filter_block1d8_v4_sse2;
+filter8_1dfunction aom_filter_block1d4_h4_sse2;
+filter8_1dfunction aom_filter_block1d4_v4_sse2;
+
+filter8_1dfunction aom_filter_block1d16_v2_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_sse2;
+
+// Prototypes of the two functions that the FUN_CONV_1D lines below are
+// expected to generate (the macro itself is not defined in this file):
+// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) // empty sixth argument is intentional; its meaning is set by the macro definition (not visible here)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) // source expression starts three rows above src, presumably to cover the rows above the output row -- confirm
+
+#if CONFIG_AV1_HIGHBITDEPTH // high-bitdepth declarations exist only in high-bitdepth builds
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; // declarations only, as for the 8-bit kernels above
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+
+// Prototypes of the two functions that the HIGH_FUN_CONV_1D lines below are
+// expected to generate (note the extra trailing bit-depth argument bd):
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) // same three-row upward offset as the 8-bit vertical expansion
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
new file mode 100644
index 0000000000..a1043828fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Copies one 128-byte row from src to dst as four 32-byte vectors. The whole
+// row is read into registers before the first byte is written; both the loads
+// and the stores are the unaligned variants.
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m256i row[4];
+  for (int i = 0; i < 4; ++i) {
+    row[i] = _mm256_loadu_si256((const __m256i *)(src + i * 32));
+  }
+  for (int i = 0; i < 4; ++i) {
+    _mm256_storeu_si256((__m256i *)(dst + i * 32), row[i]);
+  }
+}
+
+// Copies a w x h block of 8-bit pixels from src to dst. Rows are processed in
+// pairs and every loop stops only when the remaining row count is exactly
+// zero, so h has to be a positive even number. Any width other than 2, 4, 8,
+// 16, 32 or 64 takes the 128-wide path.
+void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  // Debug-build check of the destination alignment for blocks 16 or wider.
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  switch (w) {
+    case 2:
+    case 4:
+      // Narrow rows: byte copies of w pixels, first row then second row.
+      do {
+        memmove(dst, src, w * sizeof(*src));
+        memmove(dst + dst_stride, src + src_stride, w * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 8:
+      // Both rows are loaded (low 64 bits each) before either is stored.
+      do {
+        const __m128i top = _mm_loadl_epi64((const __m128i *)src);
+        const __m128i bot =
+            _mm_loadl_epi64((const __m128i *)(src + src_stride));
+        src += 2 * src_stride;
+        _mm_storel_epi64((__m128i *)dst, top);
+        _mm_storel_epi64((__m128i *)(dst + dst_stride), bot);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 16:
+      // Unaligned loads, aligned 16-byte stores.
+      do {
+        const __m128i top = _mm_loadu_si128((const __m128i *)src);
+        const __m128i bot =
+            _mm_loadu_si128((const __m128i *)(src + src_stride));
+        src += 2 * src_stride;
+        _mm_store_si128((__m128i *)dst, top);
+        _mm_store_si128((__m128i *)(dst + dst_stride), bot);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 32:
+      do {
+        const __m256i top = _mm256_loadu_si256((const __m256i *)src);
+        const __m256i bot =
+            _mm256_loadu_si256((const __m256i *)(src + src_stride));
+        src += 2 * src_stride;
+        _mm256_storeu_si256((__m256i *)dst, top);
+        _mm256_storeu_si256((__m256i *)(dst + dst_stride), bot);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 64:
+      // Two 32-byte vectors per row; all four are loaded before any store.
+      do {
+        const uint8_t *const src1 = src + src_stride;
+        uint8_t *const dst1 = dst + dst_stride;
+        const __m256i top_lo = _mm256_loadu_si256((const __m256i *)(src + 0));
+        const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(src + 32));
+        const __m256i bot_lo = _mm256_loadu_si256((const __m256i *)(src1 + 0));
+        const __m256i bot_hi =
+            _mm256_loadu_si256((const __m256i *)(src1 + 32));
+        src += 2 * src_stride;
+        _mm256_storeu_si256((__m256i *)(dst + 0), top_lo);
+        _mm256_storeu_si256((__m256i *)(dst + 32), top_hi);
+        _mm256_storeu_si256((__m256i *)(dst1 + 0), bot_lo);
+        _mm256_storeu_si256((__m256i *)(dst1 + 32), bot_hi);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    default:
+      // Remaining widths are copied as 128-byte rows.
+      do {
+        copy_128(src, dst);
+        copy_128(src + src_stride, dst + dst_stride);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// Copies one row of 64 16-bit pixels (128 bytes) as four 256-bit vectors of
+// 16 pixels each. All loads are issued before the first store; loads and
+// stores are the unaligned variants.
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+  __m256i row[4];
+  for (int i = 0; i < 4; ++i) {
+    row[i] = _mm256_loadu_si256((const __m256i *)(src + i * 16));
+  }
+  for (int i = 0; i < 4; ++i) {
+    _mm256_storeu_si256((__m256i *)(dst + i * 16), row[i]);
+  }
+}
+
+// Copies one row of 128 16-bit pixels (256 bytes) as eight 256-bit vectors of
+// 16 pixels each. All loads are issued before the first store; loads and
+// stores are the unaligned variants.
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+  __m256i row[8];
+  for (int i = 0; i < 8; ++i) {
+    row[i] = _mm256_loadu_si256((const __m256i *)(src + i * 16));
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    _mm256_storeu_si256((__m256i *)(dst + i * 16), row[i]);
+  }
+}
+
+// Copies a w x h block of 16-bit pixels from src to dst. Strides are counted
+// in pixels (uint16_t units). Rows are processed in pairs and every loop
+// stops only when the remaining row count is exactly zero, so h has to be a
+// positive even number. Supported widths: 2, 4, 8, 16, 32, 64 and 128.
+void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  // The w == 8 path stores with _mm_store_si128(), which requires a 16-byte
+  // aligned address on every row, so the check has to cover w == 8 as well.
+  // dst_stride is in uint16_t units: a multiple of 8 pixels is a multiple of
+  // 16 bytes, which keeps every row start aligned.
+  if (w >= 8) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 8));
+  }
+
+  if (w == 2) {
+    // Two pixels = 4 bytes per row.
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    // Four pixels = 8 bytes per row: low half of an XMM register.
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    // Eight pixels = one XMM register per row; the stores are aligned.
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    // Sixteen pixels = one YMM register per row.
+    do {
+      __m256i s[2];
+      s[0] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      s[1] = _mm256_loadu_si256((__m256i *)src);
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[0]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    // Two YMM registers per row; both rows are loaded before any store.
+    do {
+      __m256i s[4];
+      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    assert(w == 128);
+    do {
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
new file mode 100644
index 0000000000..e78845e97c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Copies one 128-byte row from src to dst as eight 16-byte vectors. The whole
+// row is loaded (unaligned loads) before the first store. The stores are
+// aligned, so dst must sit on a 16-byte boundary.
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m128i row[8];
+  for (int i = 0; i < 8; ++i) {
+    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
+  }
+  for (int i = 0; i < 8; ++i) {
+    _mm_store_si128((__m128i *)(dst + i * 16), row[i]);
+  }
+}
+
+// Copies a w x h block of 8-bit pixels from src to dst. Rows are processed in
+// pairs and every loop stops only when the remaining row count is exactly
+// zero, so h has to be a positive even number. Any width other than 2, 4, 8,
+// 16, 32 or 64 takes the 128-wide path. Every path for w >= 16 uses aligned
+// 16-byte stores.
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+  // Debug-build check of the alignment the aligned stores below depend on.
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  switch (w) {
+    case 2:
+    case 4:
+      // Narrow rows: byte copies of w pixels, first row then second row.
+      do {
+        memmove(dst, src, w * sizeof(*src));
+        memmove(dst + dst_stride, src + src_stride, w * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 8:
+      // Both rows are loaded (low 64 bits each) before either is stored.
+      do {
+        const __m128i top = _mm_loadl_epi64((const __m128i *)src);
+        const __m128i bot =
+            _mm_loadl_epi64((const __m128i *)(src + src_stride));
+        src += 2 * src_stride;
+        _mm_storel_epi64((__m128i *)dst, top);
+        _mm_storel_epi64((__m128i *)(dst + dst_stride), bot);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 16:
+      do {
+        const __m128i top = _mm_loadu_si128((const __m128i *)src);
+        const __m128i bot =
+            _mm_loadu_si128((const __m128i *)(src + src_stride));
+        src += 2 * src_stride;
+        _mm_store_si128((__m128i *)dst, top);
+        _mm_store_si128((__m128i *)(dst + dst_stride), bot);
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 32:
+      // Two 16-byte vectors per row; both rows are loaded before any store.
+      do {
+        __m128i top[2], bot[2];
+        for (int i = 0; i < 2; ++i) {
+          top[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
+        }
+        for (int i = 0; i < 2; ++i) {
+          bot[i] =
+              _mm_loadu_si128((const __m128i *)(src + src_stride + i * 16));
+        }
+        src += 2 * src_stride;
+        for (int i = 0; i < 2; ++i) {
+          _mm_store_si128((__m128i *)(dst + i * 16), top[i]);
+        }
+        for (int i = 0; i < 2; ++i) {
+          _mm_store_si128((__m128i *)(dst + dst_stride + i * 16), bot[i]);
+        }
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 64:
+      // Four 16-byte vectors per row; both rows are loaded before any store.
+      do {
+        __m128i top[4], bot[4];
+        for (int i = 0; i < 4; ++i) {
+          top[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
+        }
+        for (int i = 0; i < 4; ++i) {
+          bot[i] =
+              _mm_loadu_si128((const __m128i *)(src + src_stride + i * 16));
+        }
+        src += 2 * src_stride;
+        for (int i = 0; i < 4; ++i) {
+          _mm_store_si128((__m128i *)(dst + i * 16), top[i]);
+        }
+        for (int i = 0; i < 4; ++i) {
+          _mm_store_si128((__m128i *)(dst + dst_stride + i * 16), bot[i]);
+        }
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    default:
+      // Remaining widths are copied as 128-byte rows.
+      do {
+        copy_128(src, dst);
+        copy_128(src + src_stride, dst + dst_stride);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+  }
+}
+
+// Copies one row of 64 16-bit pixels (128 bytes) as eight 128-bit vectors of
+// 8 pixels each. The whole row is loaded (unaligned loads) before the first
+// store. The stores are aligned, so dst must sit on a 16-byte boundary.
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+  __m128i row[8];
+  for (int i = 0; i < 8; ++i) {
+    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
+  }
+  for (int i = 0; i < 8; ++i) {
+    _mm_store_si128((__m128i *)(dst + i * 8), row[i]);
+  }
+}
+
+// Copies one row of 128 16-bit pixels (256 bytes) as sixteen 128-bit vectors
+// of 8 pixels each. The whole row is loaded (unaligned loads) before the first
+// store. The stores are aligned, so dst must sit on a 16-byte boundary.
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+  __m128i row[16];
+  for (int i = 0; i < 16; ++i) {
+    row[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
+  }
+  for (int i = 0; i < 16; ++i) {
+    _mm_store_si128((__m128i *)(dst + i * 8), row[i]);
+  }
+}
+
+// Copies a w x h block of 16-bit pixels from src to dst. Strides are counted
+// in pixels (uint16_t units). Rows are processed in pairs and every loop
+// stops only when the remaining row count is exactly zero, so h has to be a
+// positive even number. Any width other than 2, 4, 8, 16, 32 or 64 takes the
+// 128-wide path.
+void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
+                                   int h) {
+  // Every path for w >= 8 stores with _mm_store_si128(), which requires a
+  // 16-byte aligned address on every row, so the check has to start at
+  // w == 8. dst_stride is in uint16_t units: a multiple of 8 pixels is a
+  // multiple of 16 bytes, which keeps every row start aligned.
+  if (w >= 8) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 8));
+  }
+
+  if (w == 2) {
+    // Two pixels = 4 bytes per row. Copy exactly those bytes: an 8-byte
+    // vector load would read 4 bytes past the block, and a store through an
+    // int pointer would alias the uint16_t buffer. This also matches the
+    // AVX2 version of this function.
+    do {
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      memmove(dst, src, 2 * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    // Four pixels = 8 bytes per row: low half of an XMM register.
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    // Eight pixels = one XMM register per row.
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    // Two XMM registers per row; both rows are loaded before any store.
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    // Four XMM registers per row; both rows are loaded before any store.
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    // Remaining widths are copied as 128-pixel rows.
+    do {
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      highbd_copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..d392225906
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -0,0 +1,613 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+;HIGH_GET_FILTERS_4: reads the eight 16-bit taps at arg(5) and bps at arg(6), then fills the caller-%defined slots k0k6, k2k5, k3k4, k1k7, krd, max and min. Overwrites rcx, rdx and xmm0-xmm7.
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040 ;rounding constant 64 = half of the 1 << 7 scale removed by "psrad 7" in HIGH_APPLY_FILTER_4
+
+ movdqa xmm7, [rdx] ;load filters: eight 16-bit taps k0..k7 (movdqa needs a 16-byte aligned address)
+ pshuflw xmm0, xmm7, 0b ;k0 replicated across the four low words
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8 ;shift k4..k7 down into the low quadword so pshuflw can select them
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6 ;interleave words: k0,k6,k0,k6,... (operand layout for pmaddwd)
+ punpcklwd xmm2, xmm5 ;k2,k5 pairs
+ punpcklwd xmm3, xmm4 ;k3,k4 pairs
+ punpcklwd xmm1, xmm7 ;k1,k7 pairs
+
+ movdqa k0k6, xmm0 ;store the tap pairs in the stack slots
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx ;rcx still holds 64
+ pshufd xmm6, xmm6, 0 ;copy it to all four dwords
+ movdqa krd, xmm6 ;rounding term added to each 32-bit sum
+
+ ;Compute max and min values of a pixel: max = (1 << bps) - 1 in every word, min = 0
+ mov rdx, 0x00010001 ;two 16-bit words, each equal to 1
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx ;shift count for psllw
+ pshufd xmm0, xmm0, 0b ;every word = 1
+ movdqa xmm2, xmm0 ;keep a copy of the ones
+ psllw xmm0, xmm1 ;every word = 1 << bps
+ psubw xmm0, xmm2 ;every word = (1 << bps) - 1
+ pxor xmm1, xmm1 ;all zero
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+;HIGH_APPLY_FILTER_4 avg: expects the samples for taps 0..7 in the low four words of xmm0..xmm7; writes four filtered, clamped 16-bit results to [rdi]. A nonzero %1 averages them with the words already at [rdi].
+%macro HIGH_APPLY_FILTER_4 1
+ punpcklwd xmm0, xmm6 ;interleave tap-0 and tap-6 samples word by word, matching the k0k6 layout
+ punpcklwd xmm1, xmm7 ;tap-1 / tap-7 samples
+ punpcklwd xmm2, xmm5 ;tap-2 / tap-5 samples
+ punpcklwd xmm3, xmm4 ;tap-3 / tap-4 samples
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors; each dword = s0*k0 + s6*k6
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3 ;k3/k4 products are added last (see the overflow note at the top of the file)
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift: arithmetic right shift by 7
+ packssdw xmm0, xmm0 ;pack to word with signed saturation
+
+ ;clamp the values to [min, max]
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi] ;averaging variant: fetch the four words currently in the destination
+ pavgw xmm0, xmm1 ;rounded average of new and existing words
+%endif
+ movq [rdi], xmm0 ;store four 16-bit results (8 bytes)
+%endm
+;HIGH_GET_FILTERS: sets rsi = arg(0) and rdi = arg(2), then fills the caller-%defined slots k0k1, k6k7, k2k5, k3k4, krd, max and min from the taps at arg(5) and bps at arg(6). Overwrites rcx, rdx and xmm0-xmm7.
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040 ;rounding constant 64 = half of the 1 << 7 scale removed by "psrad 7" in HIGH_APPLY_FILTER_8
+
+ movdqa xmm7, [rdx] ;load filters: eight 16-bit taps k0..k7 (movdqa needs a 16-byte aligned address)
+ pshuflw xmm0, xmm7, 0b ;k0 in the four low words (high quadword is copied unchanged)
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4 in the four high words (low quadword is copied unchanged)
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2 ;duplicate the low quadword: k2 in all eight words
+ punpcklqdq xmm3, xmm3 ;k3 in all eight words
+ punpcklwd xmm0, xmm1 ;low words interleaved: k0,k1 pairs
+ punpckhwd xmm6, xmm7 ;high words interleaved: k6,k7 pairs
+ punpckhwd xmm2, xmm5 ;k2,k5 pairs
+ punpckhwd xmm3, xmm4 ;k3,k4 pairs
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx ;rcx still holds 64
+ pshufd xmm6, xmm6, 0 ;copy it to all four dwords
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel: max = (1 << bps) - 1 in every word, min = 0
+ mov rdx, 0x00010001 ;two 16-bit words, each equal to 1
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx ;shift count for psllw
+ pshufd xmm0, xmm0, 0b ;every word = 1
+ movdqa xmm2, xmm0 ;keep a copy of the ones
+ psllw xmm0, xmm1 ;every word = 1 << bps
+ psubw xmm0, xmm2 ;every word = (1 << bps) - 1
+ pxor xmm1, xmm1 ;all zero
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+;LOAD_VERT_8 ofs: loads 16 bytes from each of eight consecutive rows at byte offset %1 into xmm0..xmm7 (row n -> xmmn). The callers in this file set rax = row pitch in bytes and rdx = 3 * rax. Side effect: rsi advances by one row.
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6 (rdx * 2 = six rows)
+ lea rsi, [rsi + rax] ;rsi now points at row 1; the offsets below are relative to it
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+;HIGH_APPLY_FILTER_8 avg, ofs: expects eight 16-bit samples for each of taps 0..7 in xmm0..xmm7; writes eight filtered, clamped words to [rdi + %2]. A nonzero %1 averages them with the words already there. Uses the temp stack slot; overwrites xmm0-xmm7.
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4 ;spill the tap-4 samples; xmm4 is needed as scratch
+ movdqa xmm4, xmm0 ;copy of the tap-0 samples
+ punpcklwd xmm0, xmm1 ;low four tap-0/tap-1 sample pairs
+ punpckhwd xmm4, xmm1 ;high four tap-0/tap-1 sample pairs
+ movdqa xmm1, xmm6 ;copy of the tap-6 samples
+ punpcklwd xmm6, xmm7 ;low four tap-6/tap-7 pairs
+ punpckhwd xmm1, xmm7 ;high four tap-6/tap-7 pairs
+ movdqa xmm7, xmm2 ;copy of the tap-2 samples
+ punpcklwd xmm2, xmm5 ;low four tap-2/tap-5 pairs
+ punpckhwd xmm7, xmm5 ;high four tap-2/tap-5 pairs
+
+ movdqu xmm5, temp ;reload the tap-4 samples
+ movdqu temp, xmm4 ;spill the high tap-0/tap-1 pairs
+ movdqa xmm4, xmm3 ;copy of the tap-3 samples
+ punpcklwd xmm3, xmm5 ;low four tap-3/tap-4 pairs
+ punpckhwd xmm4, xmm5 ;high four tap-3/tap-4 pairs
+ movdqu xmm5, temp ;xmm5 = high tap-0/tap-1 pairs
+
+ pmaddwd xmm0, k0k1 ;low half: s0*k0 + s1*k1 per dword
+ pmaddwd xmm5, k0k1 ;high half
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6 ;accumulate the low four results
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3 ;k3/k4 products are added last (see the overflow note at the top of the file)
+ paddd xmm5, xmm1 ;accumulate the high four results
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word (signed saturation), low results first
+
+ ;clamp the values to [min, max]
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2] ;averaging variant: fetch the eight words currently in the destination
+ pavgw xmm0, xmm1 ;rounded average of new and existing words
+%endif
+ movdqu [rdi + %2], xmm0 ;store eight 16-bit results
+%endm
+
+SECTION .text
+
+;void aom_highbd_filter_block1d4_v8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; top row of the 8-row window used for the first output row
+; src_pitch, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; four are stored per output row
+; out_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d4_v8_sse2)
+sym(aom_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx ;rbx carries the output stride below, so it is saved here and restored in the epilog
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7 ;seven 16-byte scratch slots, named by the %defines below
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx] ;output bytes per line
+ lea rdx, [rax + rax * 2] ;rdx = 3 source rows in bytes
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0 (four samples)
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax] ;step to row 1; this is also the advance to the next output row
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_v8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; top row of the 8-row window used for the first output row
+; src_pitch, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; eight are stored per output row
+; out_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d8_v8_sse2)
+sym(aom_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx ;rbx carries the output stride below, so it is saved here and restored in the epilog
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8 ;eight 16-byte scratch slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx] ;output bytes per line
+ lea rdx, [rax + rax * 2] ;rdx = 3 source rows in bytes, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx] ;next output row (rsi was already advanced one row inside LOAD_VERT_8)
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_v8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; top row of the 8-row window used for the first output row
+; src_pitch, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; sixteen are stored per output row, as two groups of eight
+; out_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d16_v8_sse2)
+sym(aom_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx ;rbx carries the output stride below, so it is saved here and restored in the epilog
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8 ;eight 16-byte scratch slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx] ;output bytes per line
+ lea rdx, [rax + rax * 2] ;rdx = 3 source rows in bytes, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax ;undo the one-row advance made by LOAD_VERT_8 so the second half reads the same rows
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx ;next output row (rsi is left one row further by the second LOAD_VERT_8)
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d4_h8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; each row is read from 3 samples before src_ptr to 8 samples after it (two 16-byte loads)
+; src_pixels_per_line, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; four are stored per output row
+; output_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d4_h8_sse2)
+sym(aom_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7 ;seven 16-byte scratch slots, named by the %defines below
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx] ;output bytes per line
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src: eight samples starting 3 samples (6 bytes) before src
+ movdqu xmm4, [rsi + 2] ;eight samples starting 1 sample after src
+ movdqa xmm1, xmm0 ;the copies below are byte-shifted so register n holds the tap-n samples in its low words
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2 ;tap 1: starts at src - 2 samples
+ psrldq xmm6, 4 ;tap 6: starts at src + 3 samples
+ psrldq xmm7, 6 ;tap 7: starts at src + 4 samples
+ psrldq xmm2, 4 ;tap 2: starts at src - 1 sample
+ psrldq xmm3, 6 ;tap 3: starts at src
+ psrldq xmm5, 2 ;tap 5: starts at src + 2 samples
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_h8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; each row is read from 3 samples before src_ptr to 11 samples after it
+; src_pixels_per_line, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; eight are stored per output row
+; output_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d8_h8_sse2)
+sym(aom_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8 ;eight 16-byte scratch slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx] ;output bytes per line
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src: tap 0 window starts 3 samples before src
+ movdqu xmm1, [rsi - 4] ;tap 1
+ movdqu xmm2, [rsi - 2] ;tap 2
+ movdqu xmm3, [rsi] ;tap 3
+ movdqu xmm4, [rsi + 2] ;tap 4
+ movdqu xmm5, [rsi + 4] ;tap 5
+ movdqu xmm6, [rsi + 6] ;tap 6
+ movdqu xmm7, [rsi + 8] ;tap 7
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_h8_sse2
+;(
+; src_ptr, arg(0): 16-bit samples; each row is read from 3 samples before src_ptr to 19 samples after it
+; src_pixels_per_line, arg(1): read as 32 bits; source stride in samples (doubled to bytes below)
+; output_ptr, arg(2): 16-bit samples; sixteen are stored per output row, as two groups of eight
+; output_pitch, arg(3): read as 32 bits; destination stride in samples (doubled to bytes below)
+; output_height, arg(4): read as 32 bits; rows to produce, must be >= 1 (the count is tested after each row)
+; filter, arg(5): eight 16-bit taps, loaded with movdqa; bps, arg(6): bit depth used for the clamp maximum
+;)
+globalsym(aom_highbd_filter_block1d16_h8_sse2)
+sym(aom_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8 ;eight 16-byte scratch slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx] ;output bytes per line
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src for output samples 0..7: tap 0 window starts 3 samples before src
+ movdqu xmm1, [rsi - 4] ;tap 1
+ movdqu xmm2, [rsi - 2] ;tap 2
+ movdqu xmm3, [rsi] ;tap 3
+ movdqu xmm4, [rsi + 2] ;tap 4
+ movdqu xmm5, [rsi + 4] ;tap 5
+ movdqu xmm6, [rsi + 6] ;tap 6
+ movdqu xmm7, [rsi + 8] ;tap 7
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src for output samples 8..15: the same windows shifted by 16 bytes
+ movdqu xmm1, [rsi + 12] ;tap 1
+ movdqu xmm2, [rsi + 14] ;tap 2
+ movdqu xmm3, [rsi + 16] ;tap 3
+ movdqu xmm4, [rsi + 18] ;tap 4
+ movdqu xmm5, [rsi + 20] ;tap 5
+ movdqu xmm6, [rsi + 22] ;tap 6
+ movdqu xmm7, [rsi + 24] ;tap 7
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8 ;release the scratch slots
+ pop rsp ;presumably the pre-alignment rsp saved by ALIGN_STACK (macro not defined in this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..db4cad9bcb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0 ;load arguments and constants for the 4-wide 2-tap filters
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040 ;64, the rounding term added before the >> 7
+;build the tap pair from words 3 and 4 of the filter table
+ movdqa xmm3, [rdx] ;load filters (aligned load: the table must be 16-byte aligned)
+ pshuflw xmm4, xmm3, 11111111b ;k3 in each of the low four words
+ psrldq xmm3, 8 ;move words 4..7 down to the low half
+ pshuflw xmm3, xmm3, 0b ;k4 in each of the low four words
+ punpcklwd xmm4, xmm3 ;k3k4 in every dword
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0 ;64 in every dword
+;clamp bounds: max = (1 << bps) - 1 in every word, min = 0
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx ;shift count for psllw
+ pshufd xmm5, xmm5, 0b ;1 in every word
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+;strides stay in samples here; HIGH_APPLY_FILTER_4 scales them by 2 when it steps the pointers
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1 ;%1 nonzero: average the result with what [rdi] already holds
+;in: xmm0/xmm1 = the two 4-sample inputs; xmm2..xmm5, rax, rdx, rcx as set by HIGH_GET_PARAM_4
+ punpcklwd xmm0, xmm1 ;interleave the two inputs sample by sample
+ pmaddwd xmm0, xmm4 ;a*k3 + b*k4 in every dword
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values to [min, max] = [xmm2, xmm5]
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0 ;store four 16-bit results
+ lea rsi, [rsi + 2*rax] ;next source row (2 bytes per sample)
+ lea rdi, [rdi + 2*rdx] ;next output row
+ dec rcx ;every use of this macro is followed by jnz, which tests this result
+%endm
+
+%macro HIGH_GET_PARAM 0 ;setup for the 8/16-wide filters; the caller must %define max and min first
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040 ;64, the rounding term added before the >> 7
+
+ movdqa xmm6, [rdx] ;load filters (aligned load: the table must be 16-byte aligned)
+;xmm7 = words 3 and 4 of the filter table, interleaved across the whole register
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8 ;move the k4 copies down to the low half
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0 ;64 in every dword
+;clamp bounds: max = (1 << bps) - 1 in every word, min = 0
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm3, rdx
+ movq xmm5, rcx ;shift count for psllw
+ pshufd xmm3, xmm3, 0b ;1 in every word
+ movdqa xmm1, xmm3
+ psllw xmm3, xmm5
+ psubw xmm3, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+;the bounds live in memory so xmm3/xmm5 are free for the filter macros (movdqa: aligned slots required)
+ movdqa max, xmm3
+ movdqa min, xmm5
+;strides stay in samples here; the HIGH_APPLY_FILTER_* macros scale them by 2
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1 ;in: xmm0/xmm1 = two 8-sample inputs; %1 nonzero: average with [rdi]
+ movdqa xmm6, xmm0 ;xmm6 is scratch here; the taps are in xmm7
+ punpckhwd xmm6, xmm1 ;interleave the upper four sample pairs
+ punpcklwd xmm0, xmm1 ;interleave the lower four sample pairs
+ pmaddwd xmm6, xmm7 ;a*k3 + b*k4 in every dword
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values to [min, max]
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+;step both pointers one row (2 bytes per sample)
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx ;every use of this macro is followed by jnz, which tests this result
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1 ;in: xmm0/xmm1 = inputs for samples 0..7, xmm2/xmm3 = inputs for samples 8..15
+ movdqa xmm5, xmm0 ;xmm5/xmm6 are scratch here; the taps are in xmm7
+ movdqa xmm6, xmm2
+ punpckhwd xmm5, xmm1 ;interleave the upper four sample pairs of each half
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1 ;interleave the lower four sample pairs of each half
+ punpcklwd xmm2, xmm3
+;a*k3 + b*k4 in every dword
+ pmaddwd xmm5, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm5, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm5, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm5 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values to [min, max]
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+ pminsw xmm2, max
+ pmaxsw xmm2, min
+
+%if %1
+ movdqu xmm1, [rdi] ;%1 nonzero: average with the existing output
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+;step both pointers one row (2 bytes per sample)
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx ;every use of this macro is followed by jnz, which tests this result
+%endm
+
+SECTION .text
+
+globalsym(aom_highbd_filter_block1d4_v2_sse2)
+sym(aom_highbd_filter_block1d4_v2_sse2): ;4-wide vertical 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+;args as read by HIGH_GET_PARAM_4: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;four samples of the current row
+ movq xmm1, [rsi + 2*rax] ;the same columns one row down
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_v2_sse2)
+sym(aom_highbd_filter_block1d8_v2_sse2): ;8-wide vertical 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+;two 16-byte stack slots hold the clamp bounds that HIGH_GET_PARAM writes
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+;args as read by HIGH_GET_PARAM: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;eight samples of the current row
+ movdqu xmm1, [rsi + 2*rax] ;the same columns one row down
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ add rsp, 16 * 2 ;release the two stack slots
+ pop rsp ;restore the stack pointer that ALIGN_STACK presumably saved
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_v2_sse2)
+sym(aom_highbd_filter_block1d16_v2_sse2): ;16-wide vertical 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+;two 16-byte stack slots hold the clamp bounds that HIGH_GET_PARAM writes
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+;args as read by HIGH_GET_PARAM: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;current row, samples 0..7
+ movdqu xmm2, [rsi + 16] ;current row, samples 8..15
+ movdqu xmm1, [rsi + 2*rax] ;next row, samples 0..7
+ movdqu xmm3, [rsi + 2*rax + 16] ;next row, samples 8..15
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ add rsp, 16 * 2 ;release the two stack slots
+ pop rsp ;restore the stack pointer that ALIGN_STACK presumably saved
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d4_h2_sse2)
+sym(aom_highbd_filter_block1d4_h2_sse2): ;4-wide horizontal 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+;args as read by HIGH_GET_PARAM_4: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;loads 8 samples (16 bytes); only the first 5 feed the result
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2 ;the same row shifted left by one sample
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_h2_sse2)
+sym(aom_highbd_filter_block1d8_h2_sse2): ;8-wide horizontal 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+;two 16-byte stack slots hold the clamp bounds that HIGH_GET_PARAM writes
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+;args as read by HIGH_GET_PARAM: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;samples 0..7 of the row
+ movdqu xmm1, [rsi + 2] ;samples 1..8 (right-hand neighbours)
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ add rsp, 16 * 2 ;release the two stack slots
+ pop rsp ;restore the stack pointer that ALIGN_STACK presumably saved
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_h2_sse2)
+sym(aom_highbd_filter_block1d16_h2_sse2): ;16-wide horizontal 2-tap filter, result stored without averaging
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+;two 16-byte stack slots hold the clamp bounds that HIGH_GET_PARAM writes
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+;args as read by HIGH_GET_PARAM: src, src pitch, dst, dst pitch, height, filter, bps
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;samples 0..7 of the row
+ movdqu xmm1, [rsi + 2] ;samples 1..8 (right-hand neighbours)
+ movdqu xmm2, [rsi + 16] ;samples 8..15
+ movdqu xmm3, [rsi + 18] ;samples 9..16
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop ;tests the dec rcx at the end of the macro; height must be >= 1
+
+ add rsp, 16 * 2 ;release the two stack slots
+ pop rsp ;restore the stack pointer that ALIGN_STACK presumably saved
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_quantize_avx.c b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
new file mode 100644
index 0000000000..b2d6d4b76d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+// Multiplies eight signed 16-bit quantized coefficients by the matching
+// 16-bit dequantizer values and writes the full-width products as two
+// 16-byte vectors: products 0-3 at dqcoeff, products 4-7 at dqcoeff + 4.
+// Both stores are aligned stores, so dqcoeff must be 16-byte aligned.
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               tran_low_t *dqcoeff) {
+  // Lower and upper 16 bits of every signed 16x16 product.
+  const __m128i prod_lo16 = _mm_mullo_epi16(qcoeff, dequant);
+  const __m128i prod_hi16 = _mm_mulhi_epi16(qcoeff, dequant);
+
+  // Interleaving the two halves rebuilds the 32-bit products in lane order.
+  _mm_store_si128((__m128i *)(dqcoeff),
+                  _mm_unpacklo_epi16(prod_lo16, prod_hi16));
+  _mm_store_si128((__m128i *)(dqcoeff + 4),
+                  _mm_unpackhi_epi16(prod_lo16, prod_hi16));
+}
+
+// Quantizes and dequantizes n_coeffs coefficients, 16 at a time, and reports
+// the end-of-block position through *eob_ptr.
+//
+// The first 16 coefficients are always processed, and the loop advances in
+// steps of 16, so n_coeffs is expected to be a multiple of 16 and at least 16.
+// The quantizer parameters arrive with a DC value in the low 64 bits and an AC
+// value in the high 64 bits; after the first eight coefficients each register
+// is switched to AC only. Groups in which no coefficient exceeds zbin are
+// zero-filled with aligned 256-bit stores, so qcoeff_ptr and dqcoeff_ptr must
+// be 32-byte aligned. scan is unused; iscan is forwarded to scan_for_eob().
+void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
+  const __m128i zero128 = _mm_setzero_si128();
+  const __m256i zero256 = _mm256_setzero_si256();
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff_a, coeff_b;  // signed input, coefficients 0-7 and 8-15
+  __m128i q_a, q_b;          // magnitudes, later the signed quantized values
+  __m128i keep_a, keep_b;    // all-ones lanes where |coeff| > zbin
+  __m128i any_kept;
+  __m128i eob_max = zero128;
+  int pos;
+
+  (void)scan;
+
+  *eob_ptr = 0;
+
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // DC plus the first 15 AC coefficients.
+  coeff_a = load_tran_low(coeff_ptr);
+  coeff_b = load_tran_low(coeff_ptr + 8);
+  q_a = _mm_abs_epi16(coeff_a);
+  q_b = _mm_abs_epi16(coeff_b);
+
+  keep_a = _mm_cmpgt_epi16(q_a, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  keep_b = _mm_cmpgt_epi16(q_b, zbin);
+  any_kept = _mm_or_si128(keep_a, keep_b);
+
+  if (!_mm_test_all_zeros(any_kept, any_kept)) {
+    // The first eight use the DC/AC mix, the second eight AC only.
+    calculate_qcoeff(&q_a, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&q_b, round, quant, shift);
+
+    // Put the signs back, then clear every lane that failed the zbin test.
+    q_a = _mm_and_si128(_mm_sign_epi16(q_a, coeff_a), keep_a);
+    q_b = _mm_and_si128(_mm_sign_epi16(q_b, coeff_b), keep_b);
+
+    store_tran_low(q_a, qcoeff_ptr);
+    store_tran_low(q_b, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store(q_a, dequant, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store(q_b, dequant, dqcoeff_ptr + 8);
+
+    eob_max = scan_for_eob(&q_a, &q_b, keep_a, keep_b, iscan, 0, zero128);
+  } else {
+    // Nothing survives: zero all 16 outputs.
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero256);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero256);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero256);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero256);
+
+    // *eob_ptr is already 0.
+    if (n_coeffs == 16) return;
+
+    // The remaining groups are AC only.
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  }
+
+  // AC only loop.
+  for (pos = 16; pos < n_coeffs; pos += 16) {
+    coeff_a = load_tran_low(coeff_ptr + pos);
+    coeff_b = load_tran_low(coeff_ptr + pos + 8);
+    q_a = _mm_abs_epi16(coeff_a);
+    q_b = _mm_abs_epi16(coeff_b);
+
+    keep_a = _mm_cmpgt_epi16(q_a, zbin);
+    keep_b = _mm_cmpgt_epi16(q_b, zbin);
+    any_kept = _mm_or_si128(keep_a, keep_b);
+
+    if (_mm_test_all_zeros(any_kept, any_kept)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + pos), zero256);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + pos), zero256);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + pos + 8), zero256);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + pos + 8), zero256);
+    } else {
+      __m128i eob_here;
+
+      calculate_qcoeff(&q_a, round, quant, shift);
+      calculate_qcoeff(&q_b, round, quant, shift);
+
+      q_a = _mm_and_si128(_mm_sign_epi16(q_a, coeff_a), keep_a);
+      q_b = _mm_and_si128(_mm_sign_epi16(q_b, coeff_b), keep_b);
+
+      store_tran_low(q_a, qcoeff_ptr + pos);
+      store_tran_low(q_b, qcoeff_ptr + pos + 8);
+
+      calculate_dqcoeff_and_store(q_a, dequant, dqcoeff_ptr + pos);
+      calculate_dqcoeff_and_store(q_b, dequant, dqcoeff_ptr + pos + 8);
+
+      eob_here = scan_for_eob(&q_a, &q_b, keep_a, keep_b, iscan, pos, zero128);
+      eob_max = _mm_max_epi16(eob_max, eob_here);
+    }
+  }
+
+  *eob_ptr = accumulate_eob(eob_max);
+}
+
+// 32x32 variant of the 16-at-a-time quantizer: zbin and round are halved with
+// rounding, and the log-scale helpers are called with log_scale = 1.
+//
+// The first 16 coefficients are always processed, and the loop advances in
+// steps of 16, so n_coeffs is expected to be a multiple of 16 and at least 16.
+// The five parameter tables are read with aligned 128-bit loads and the
+// zero-fill uses aligned 256-bit stores, so the tables must be 16-byte aligned
+// and qcoeff_ptr / dqcoeff_ptr 32-byte aligned. scan is unused; iscan is
+// forwarded to scan_for_eob(). *eob_ptr is written once, at the end.
+void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
+  const __m128i zero128 = _mm_setzero_si128();
+  const __m128i ones16 = _mm_set1_epi16(1);
+  const __m256i zero256 = _mm256_setzero_si256();
+  const int log_scale = 1;
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff_a, coeff_b;  // signed input, coefficients 0-7 and 8-15
+  __m128i q_a, q_b;          // magnitudes, later the signed quantized values
+  __m128i keep_a, keep_b;    // all-ones lanes where |coeff| > zbin
+  __m128i any_kept;
+  __m128i eob_max = zero128;
+  int pos;
+
+  (void)scan;
+
+  // Setup global values.
+  // zbin = ((zbin + 1) >> 1) - 1: halved with rounding, then reduced by one
+  // so that the strict "greater than" compare acts as "greater or equal".
+  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  zbin = _mm_srli_epi16(_mm_add_epi16(zbin, ones16), 1);
+  zbin = _mm_sub_epi16(zbin, ones16);
+
+  // round = (round + 1) >> 1.
+  round = _mm_load_si128((const __m128i *)round_ptr);
+  round = _mm_srli_epi16(_mm_add_epi16(round, ones16), 1);
+
+  quant = _mm_load_si128((const __m128i *)quant_ptr);
+  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+  // DC plus the first 15 AC coefficients.
+  coeff_a = load_tran_low(coeff_ptr);
+  coeff_b = load_tran_low(coeff_ptr + 8);
+  q_a = _mm_abs_epi16(coeff_a);
+  q_b = _mm_abs_epi16(coeff_b);
+
+  keep_a = _mm_cmpgt_epi16(q_a, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+  keep_b = _mm_cmpgt_epi16(q_b, zbin);
+  any_kept = _mm_or_si128(keep_a, keep_b);
+
+  if (!_mm_test_all_zeros(any_kept, any_kept)) {
+    // The first eight use the DC/AC mix, the second eight AC only.
+    calculate_qcoeff_log_scale(&q_a, round, quant, &shift, &log_scale);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff_log_scale(&q_b, round, quant, &shift, &log_scale);
+
+    // Put the signs back, then clear every lane that failed the zbin test.
+    q_a = _mm_and_si128(_mm_sign_epi16(q_a, coeff_a), keep_a);
+    q_b = _mm_and_si128(_mm_sign_epi16(q_b, coeff_b), keep_b);
+
+    store_tran_low(q_a, qcoeff_ptr);
+    store_tran_low(q_b, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_log_scale(q_a, dequant, zero128, dqcoeff_ptr,
+                                          &log_scale);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_log_scale(q_b, dequant, zero128,
+                                          dqcoeff_ptr + 8, &log_scale);
+
+    eob_max = scan_for_eob(&q_a, &q_b, keep_a, keep_b, iscan, 0, zero128);
+  } else {
+    // Nothing survives: zero all 16 outputs.
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), zero256);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero256);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero256);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero256);
+
+    // The remaining groups are AC only.
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  }
+
+  // AC only loop.
+  for (pos = 16; pos < n_coeffs; pos += 16) {
+    coeff_a = load_tran_low(coeff_ptr + pos);
+    coeff_b = load_tran_low(coeff_ptr + pos + 8);
+    q_a = _mm_abs_epi16(coeff_a);
+    q_b = _mm_abs_epi16(coeff_b);
+
+    keep_a = _mm_cmpgt_epi16(q_a, zbin);
+    keep_b = _mm_cmpgt_epi16(q_b, zbin);
+    any_kept = _mm_or_si128(keep_a, keep_b);
+
+    if (_mm_test_all_zeros(any_kept, any_kept)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + pos), zero256);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + pos), zero256);
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + pos + 8), zero256);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + pos + 8), zero256);
+    } else {
+      __m128i eob_here;
+
+      calculate_qcoeff_log_scale(&q_a, round, quant, &shift, &log_scale);
+      calculate_qcoeff_log_scale(&q_b, round, quant, &shift, &log_scale);
+
+      q_a = _mm_and_si128(_mm_sign_epi16(q_a, coeff_a), keep_a);
+      q_b = _mm_and_si128(_mm_sign_epi16(q_b, coeff_b), keep_b);
+
+      store_tran_low(q_a, qcoeff_ptr + pos);
+      store_tran_low(q_b, qcoeff_ptr + pos + 8);
+
+      calculate_dqcoeff_and_store_log_scale(q_a, dequant, zero128,
+                                            dqcoeff_ptr + pos, &log_scale);
+      calculate_dqcoeff_and_store_log_scale(q_b, dequant, zero128,
+                                            dqcoeff_ptr + pos + 8, &log_scale);
+
+      eob_here = scan_for_eob(&q_a, &q_b, keep_a, keep_b, iscan, pos, zero128);
+      eob_max = _mm_max_epi16(eob_max, eob_here);
+    }
+  }
+
+  *eob_ptr = accumulate_eob(eob_max);
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..22f2e696d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1441 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_ports/mem.h"
+
+#if defined(__clang__)  // MM256_BROADCASTSI128_SI256(x): pick the broadcast intrinsic name/signature this compiler offers
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang version check (the first branch takes &(x), so x must be an lvalue there)
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)  // gcc <= 4.6: _mm_ name, pointer argument
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7  // gcc 4.7: _mm_ name, value argument
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc version check
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
+
+// Writes the low 32 bits of each 128-bit lane of *a: lane 0 to output_ptr and
+// lane 1 to output_ptr + stride.
+//
+// The bytes are copied with memcpy rather than stored through an int pointer,
+// so the destination needs no 4-byte alignment and the uint8_t buffer is not
+// accessed through an incompatible lvalue type.
+//
+// NOTE: output_ptr is declared const although it is written through; the
+// signature is kept unchanged for the existing callers.
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  uint8_t *const dst = (uint8_t *)output_ptr;
+  const int row0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+  const int row1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+
+  memcpy(dst, &row0, sizeof(row0));
+  memcpy(dst + stride, &row1, sizeof(row1));
+}
+
+// Loads 8 bytes from lo into the low 128-bit lane and 8 bytes from hi into
+// the high lane. The upper 64 bits of each lane are zero
+// (_mm_loadl_epi64 zero-extends).
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+  const __m128i lane0 = _mm_loadl_epi64((const __m128i *)(lo));
+  const __m128i lane1 = _mm_loadl_epi64((const __m128i *)(hi));
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lane0), lane1, 1);
+}
+
+// Writes the low 8 bytes of each 128-bit lane of *a: lane 0 to output_ptr and
+// lane 1 to output_ptr + stride.
+// NOTE: output_ptr is declared const although it is written through.
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+                                    const ptrdiff_t stride, const __m256i *a) {
+  const __m128i lane0 = _mm256_castsi256_si128(*a);
+  const __m128i lane1 = _mm256_extractf128_si256(*a, 1);
+
+  _mm_storel_epi64((__m128i *)output_ptr, lane0);
+  _mm_storel_epi64((__m128i *)(output_ptr + stride), lane1);
+}
+
+// Loads 16 unaligned bytes from lo into the low 128-bit lane and 16 unaligned
+// bytes from hi into the high lane.
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+  const __m128i lane0 = _mm_loadu_si128((const __m128i *)(lo));
+  const __m128i lane1 = _mm_loadu_si128((const __m128i *)(hi));
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lane0), lane1, 1);
+}
+
+// Writes the two 128-bit lanes of *a: lane 0 to output_ptr and lane 1 to
+// output_ptr + stride. Both stores are aligned stores, so both addresses must
+// be 16-byte aligned.
+// NOTE: output_ptr is declared const although it is written through.
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+                                   const ptrdiff_t stride, const __m256i *a) {
+  const __m128i lane0 = _mm256_castsi256_si128(*a);
+  const __m128i lane1 = _mm256_extractf128_si256(*a, 1);
+
+  _mm_store_si128((__m128i *)output_ptr, lane0);
+  _mm_store_si128((__m128i *)(output_ptr + stride), lane1);
+}
+
+// Horizontal convolution of a 4-pixel-wide block that uses only the middle
+// four taps (filter[2..5]) of the 8-entry filter.
+//
+// src_ptr / src_pixels_per_line:  8-bit source and its row stride.
+// output_ptr / output_pitch:      8-bit destination and its row stride.
+// output_height:                  number of rows; may be odd.
+// filter:                         eight 16-bit taps.
+//
+// Rows are processed two at a time, one per 128-bit lane, and a trailing odd
+// row is handled separately. The taps are halved and packed to signed bytes,
+// so the result is rounded with +32 and shifted right by 6. Every row load
+// reads 16 bytes starting 3 pixels left of the current position.
+static void aom_filter_block1d4_h4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the taps so that they fit in signed bytes
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both halves of the 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // replicate packed taps 2, 3, 4 and 5 into every 32 bits
+  firstFilters =
+      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+  // byte-shuffle pattern that arranges the source pixels for the multiply
+  // (aligned load: the table must be 32-byte aligned)
+  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load two source rows, one per 128-bit lane
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // arrange the source pixels
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+    // add adjacent 16-bit sums to complete the 4-tap sum
+    srcRegFilt32b1_1 =
+        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    // round, then shift each 16-bit value right by 6
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of rows is odd, process the last row on its own
+  // (only 4 bytes are written)
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // arrange the source pixels
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+    // add adjacent 16-bit sums to complete the 4-tap sum
+    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+    // round, then shift each 16-bit value right by 6
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 4 bytes; memcpy avoids a misaligned, type-punned int store into
+    // the uint8_t buffer
+    const int pixels = _mm_cvtsi128_si32(srcRegFilt1_1);
+    memcpy(output_ptr, &pixels, sizeof(pixels));
+  }
+}
+
+// Horizontal 8-tap convolution of a 4-pixel-wide block.
+//
+// src_ptr / src_pixels_per_line:  8-bit source and its row stride.
+// output_ptr / output_pitch:      8-bit destination and its row stride.
+// output_height:                  number of rows; may be odd.
+// filter:                         eight 16-bit taps.
+//
+// Rows are processed two at a time, one per 128-bit lane, and a trailing odd
+// row is handled separately. The taps are halved and packed to signed bytes,
+// so the result is rounded with +32 and shifted right by 6. Every row load
+// reads 16 bytes starting 3 pixels left of the current position.
+static void aom_filter_block1d4_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg32, filt1Reg, filt2Reg;
+  __m256i firstFilters, secondFilters;
+  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+  __m256i srcReg32b1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+  src_ptr -= 3;
+  addFilterReg32 = _mm256_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the taps so that they fit in signed bytes
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both halves of the 128 bit register.
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+  // have the same data in both lanes of a 256 bit register
+  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the first 32 bits (packed taps 0..3)
+  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+  // duplicate only the second 32 bits (packed taps 4..7)
+  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+  // byte-shuffle patterns that arrange the source pixels for the two
+  // multiplies (aligned loads: the table must be 32-byte aligned)
+  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pixels_per_line << 1;
+  dst_stride = output_pitch << 1;
+  for (i = output_height; i > 1; i -= 2) {
+    // load two source rows, one per 128-bit lane
+    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // arrange the source pixels for the first four taps
+    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+    // arrange the source pixels for the last four taps
+    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+    // add the two partial sums with saturation
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+    // add adjacent 16-bit sums to complete the 8-tap sum
+    srcRegFilt32b1_1 =
+        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    // round, then shift each 16-bit value right by 6
+    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+    srcRegFilt32b1_1 =
+        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+    output_ptr += dst_stride;
+  }
+
+  // if the number of rows is odd, process the last row on its own
+  // (only 4 bytes are written)
+  if (i > 0) {
+    __m128i srcReg1, srcRegFilt1_1;
+    __m128i srcRegFilt2;
+
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    // arrange the source pixels for the first four taps
+    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt1_1 =
+        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+    // arrange the source pixels for the last four taps
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+    // multiply byte pairs by the taps and add each pair
+    srcRegFilt2 =
+        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+    // add the partial sums, then add adjacent 16-bit sums
+    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+    // round, then shift each 16-bit value right by 6
+    srcRegFilt1_1 =
+        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink to 8 bit each 16 bits
+    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+    // save 4 bytes; memcpy avoids a misaligned, type-punned int store into
+    // the uint8_t buffer
+    const int pixels = _mm_cvtsi128_si32(srcRegFilt1_1);
+    memcpy(output_ptr, &pixels, sizeof(pixels));
+  }
+}
+
+// Horizontal convolution of an 8-pixel-wide block that uses only the middle
+// four taps (filter[2..5]) of the 8-entry filter.
+//
+// Rows are processed two at a time, one per 128-bit lane, and a trailing odd
+// row is handled separately. The taps are halved and packed to signed bytes,
+// so the result is rounded with +32 and shifted right by 6. Every row load
+// reads 16 bytes starting 3 pixels left of the current position.
+static void aom_filter_block1d8_h4_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  const __m256i round32 = _mm256_set1_epi16(32);
+  // Two rows are consumed and produced per loop iteration.
+  const ptrdiff_t src_step = src_pixels_per_line << 1;
+  const ptrdiff_t dst_step = output_pitch << 1;
+  unsigned int rows_left = output_height;
+
+  // Halve the taps, pack them to signed bytes (the same eight bytes in both
+  // halves), then copy them into both 128-bit lanes.
+  __m128i taps8 = _mm_loadu_si128((const __m128i *)filter);
+  taps8 = _mm_srai_epi16(taps8, 1);
+  taps8 = _mm_packs_epi16(taps8, taps8);
+  const __m256i taps = MM256_BROADCASTSI128_SI256(taps8);
+
+  // Packed taps (2,3) and (4,5), each pair replicated into every 16 bits.
+  const __m256i taps23 = _mm256_shuffle_epi8(taps, _mm256_set1_epi16(0x302u));
+  const __m256i taps45 = _mm256_shuffle_epi8(taps, _mm256_set1_epi16(0x504u));
+
+  // Byte-shuffle patterns that arrange the source pixels for the two
+  // multiplies (aligned loads: the table must be 32-byte aligned).
+  const __m256i shuf23 =
+      _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  const __m256i shuf45 =
+      _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+  src_ptr -= 3;
+
+  while (rows_left > 1) {
+    // Two source rows, one per lane.
+    const __m256i rows =
+        xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+    // Multiply byte pairs by the taps and add each pair.
+    const __m256i sum23 =
+        _mm256_maddubs_epi16(_mm256_shuffle_epi8(rows, shuf23), taps23);
+    const __m256i sum45 =
+        _mm256_maddubs_epi16(_mm256_shuffle_epi8(rows, shuf45), taps45);
+
+    // Saturating sum of the two halves, then round and shift right by 6.
+    __m256i res = _mm256_adds_epi16(sum23, sum45);
+    res = _mm256_adds_epi16(res, round32);
+    res = _mm256_srai_epi16(res, 6);
+
+    // Shrink each 16-bit value to 8 bits.
+    res = _mm256_packus_epi16(res, res);
+
+    xx_storeu2_epi64(output_ptr, output_pitch, &res);
+
+    src_ptr += src_step;
+    output_ptr += dst_step;
+    rows_left -= 2;
+  }
+
+  // Odd row count: one row remains, and only 8 bytes are written.
+  if (rows_left > 0) {
+    const __m128i row = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+    const __m128i sum23 = _mm_maddubs_epi16(
+        _mm_shuffle_epi8(row, _mm256_castsi256_si128(shuf23)),
+        _mm256_castsi256_si128(taps23));
+    const __m128i sum45 = _mm_maddubs_epi16(
+        _mm_shuffle_epi8(row, _mm256_castsi256_si128(shuf45)),
+        _mm256_castsi256_si128(taps45));
+
+    // Saturating sum of the two halves, then round and shift right by 6.
+    __m128i res = _mm_adds_epi16(sum23, sum45);
+    res = _mm_adds_epi16(res, _mm256_castsi256_si128(round32));
+    res = _mm_srai_epi16(res, 6);
+
+    // Shrink each 16-bit value to 8 bits.
+    res = _mm_packus_epi16(res, _mm_setzero_si128());
+
+    // Save 8 bytes.
+    _mm_storel_epi64((__m128i *)output_ptr, res);
+  }
+}
+
+static void aom_filter_block1d8_h8_avx2(  // horizontal 8-tap filter, 8 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // NOTE(review): presumably centres the 8 taps; confirm with filt_global_avx2
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap
+ // saturate the 16-bit taps to signed 8-bit values; the same eight bytes
+ // end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 0-1 (taps 0 and 1) into every 16-bit element
+ // of the 256-bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // broadcast packed bytes 6-7 (taps 6 and 7) into every 16-bit element
+ // of the 256-bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load two consecutive source rows (src_ptr and the row below it)
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack to unsigned 8 bit with saturation; the low lane holds the first
+ // row's result and the high lane holds the second row's result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, one row is left:
+ // filter it with 128-bit operations and store only 8 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // saturating 16-bit add of all four partial sums
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // pack the eight 16-bit sums to unsigned bytes with saturation; the
+ // results occupy the low 64 bits and the upper 64 bits are zero
+ // because the second pack operand is zero
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d16_h4_avx2(  // horizontal filter using taps 2-5 only, 16 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // same 3-byte back-off as the 8-tap variant, although only taps 2-5 are used
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap
+ // saturate the 16-bit taps to signed 8-bit values; the same eight bytes
+ // end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load two consecutive source rows (src_ptr and the row below it)
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // load the same two rows again, starting 8 bytes further right
+ // (this overlaps the bytes fetched by the earlier load)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // pack both halves to unsigned 8 bit with saturation; the low lane holds
+ // the first row's 16 results and the high lane holds the second row's
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, one row is left:
+ // filter it and store only 16 bytes
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));  // NOTE(review): reads 32 bytes; only bytes 0-23 are used
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);  // low lane = bytes 0-15, high lane = bytes 8-23
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+ // pack to unsigned 8 bit with saturation; each lane then holds its eight
+ // results twice, so 64-bit elements 0 and 2 are moved into the low
+ // 128 bits to form the 16 output bytes
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
+static void aom_filter_block1d16_h8_avx2(  // horizontal 8-tap filter, 16 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;  // NOTE(review): presumably centres the 8 taps; confirm with filt_global_avx2
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap
+ // saturate the 16-bit taps to signed 8-bit values; the same eight bytes
+ // end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 0-1 (taps 0 and 1) into every 16-bit element
+ // of the 256-bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // broadcast packed bytes 6-7 (taps 6 and 7) into every 16-bit element
+ // of the 256-bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load two consecutive source rows (src_ptr and the row below it)
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // load the same two rows again, starting 8 bytes further right
+ // (this overlaps the bytes fetched by the earlier load)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // saturating 16-bit add of all four partial sums
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // pack both halves to unsigned 8 bit with saturation; the low lane holds
+ // the first row's 16 results and the high lane holds the second row's
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, one row is left:
+ // filter it with 128-bit operations and store only 16 bytes
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // saturating 16-bit add of all four partial sums (first 8 outputs)
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // load 16 bytes starting 8 bytes further right
+ // (this overlaps the last 8 bytes of the earlier load)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // shuffle source bytes with the masks paired with taps 0-1 and taps 6-7
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // shuffle source bytes with the masks paired with taps 2-3 and taps 4-5
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // saturating 16-bit add of all four partial sums (second 8 outputs)
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
+
+ // pack to unsigned 8 bit with saturation; the low 8 bytes come from
+ // the first group of sums and the high 8 bytes from the second group,
+ // giving the 16 output bytes of this row
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes (aligned store: output_ptr must be 16-byte aligned)
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_v4_avx2(  // vertical filter using taps 2-5 only, 8 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ // halve every tap, then saturate to signed 8-bit values; the same eight
+ // bytes end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // srcReg34 = high lane of srcReg23 followed by the low lane of srcReg4x
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load 8 bytes from each of the next two rows (src_pitch * 5 and * 6)
+ // and pair each with the row above it in one 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // interleave the bytes of each pair of vertically adjacent rows
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply each byte pair by its tap pair and add the two products
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // saturating 16-bit add of the two partial sums
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // pack to unsigned 8 bit with saturation; the low lane holds the first
+ // output row and the high lane the second (each lane carries its eight
+ // bytes twice because both pack operands are resReglo)
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // carry the newest interleaved pair and the last loaded row forward
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }  // rows are written two at a time; there is no tail for an odd output_height
+}
+
+static void aom_filter_block1d8_v8_avx2(  // vertical 8-tap filter, 8 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ // halve every tap, then saturate to signed 8-bit values; the same eight
+ // bytes end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 0-1 (taps 0 and 1) into every 16-bit element
+ // of the 256-bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // broadcast packed bytes 6-7 (taps 6 and 7) into every 16-bit element
+ // of the 256-bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load the first 7 source rows (src_pitch * 0 .. src_pitch * 6)
+ srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // build the in-between pairs: high lane of one register, low lane of the next
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // interleave the bytes of every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load 8 bytes from each of the next two rows (src_pitch * 7 and * 8)
+ // and pair each with the row above it in one 256-bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // interleave the bytes of the two newest row pairs;
+ // srcReg32b4 is kept for the next iteration
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // saturating 16-bit add of the two partial sums
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // saturating 16-bit add of all four partial sums
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+ // pack to unsigned 8 bit with saturation; the low lane holds the first
+ // output row and the high lane the second (upper 8 bytes of each lane
+ // are zero because the second pack operand is zero)
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // shift the interleaved row pairs down by two rows; srcReg32b7 keeps the last loaded row
+ srcReg32b10 = srcReg32b11;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+ // load 8 bytes of the one extra row needed for the final output row
+ srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // interleave the row kept in srcReg32b7 with the newly loaded row
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply each byte pair by its tap pair and add the two products
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+ // saturating 16-bit add of the two partial sums
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ // taps 2-3 applied to the low lane of srcReg32b11
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+
+ // taps 4-5 applied to the low lane of srcReg32b2
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // saturating 16-bit add of all four partial sums
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+ // pack the eight 16-bit sums to unsigned bytes with saturation; the
+ // results occupy the low 64 bits (second pack operand is zero)
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+static void aom_filter_block1d16_v4_avx2(  // vertical filter using taps 2-5 only, 16 output bytes per row
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;  // rows left to produce
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);  // rounding term added before the >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // eight 16-bit taps
+ // halve every tap, then saturate to signed 8-bit values; the same eight
+ // bytes end up in both 64-bit halves of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // broadcast packed bytes 2-3 (taps 2 and 3) into every 16-bit element
+ // of the 256-bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // broadcast packed bytes 4-5 (taps 4 and 5) into every 16-bit element
+ // of the 256-bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // srcReg34 = high lane of srcReg23 followed by the low lane of srcReg4x
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load 16 bytes from each of the next two rows (src_pitch * 5 and * 6)
+ // and pair each with the row above it in one 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // interleave the bytes of each pair of adjacent rows (low and high 8 columns)
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // low 8 columns: multiply each byte pair by its tap pair and add the products
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // saturating 16-bit add of the two partial sums
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // high 8 columns: multiply each byte pair by its tap pair and add the products
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // saturating 16-bit add of the two partial sums
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // round: saturating add of 32, then arithmetic shift right by 6
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // pack both halves to unsigned 8 bit with saturation; the low lane
+ // holds the first output row's 16 bytes and the high lane holds the
+ // second output row's 16 bytes
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // carry the newest interleaved pairs and the last loaded row forward
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }  // rows are written two at a time; there is no tail for an odd output_height
+}
+
+static void aom_filter_block1d16_v8_avx2(  // 8-tap vertical filter, 16 pixels wide
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ // rows src_ptr + k * src_pitch, k = 0..7, feed the first output row
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // halve the 16 bit coefficients, then pack them to signed 8 bit so that
+ // both 64 bit halves of the 128 bit register hold the same 8 taps.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte, taps 0 and 1)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte, taps 6 and 7)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // double both strides: every loop iteration produces two output rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load the first 7 rows (16 bytes each, src_pitch apart)
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // interleave the bytes of every two consecutive registers except the last
+ // one (unpacklo = columns 0..7, unpackhi = columns 8..15 of each lane)
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // same interleave for the following row pairs; kept across loop iterations
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {  // two output rows per iteration
+ // load rows 7 and 8 and pair each with the row above it:
+ // srcReg32b7 = rows (6, 7), srcReg32b8 = rows (7, 8)
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // interleave the bytes of the two newest row pairs
+ // (srcReg32b4 = columns 0..7, srcReg32b7 = columns 8..15)
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // columns 0..7: multiply adjacent bytes by taps 0,1 and taps 6,7
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // columns 0..7: multiply adjacent bytes by taps 2,3 and taps 4,5
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // columns 8..15: multiply adjacent bytes by taps 0,1 and taps 6,7
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // columns 8..15: multiply adjacent bytes by taps 2,3 and taps 4,5
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation; the first
+ // lane contains the first convolve result (row) and the second lane
+ // contains the second convolve result (row)
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // slide the window down two rows: reuse the interleaved pairs next time
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {  // odd output_height: exactly one row is left
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load row 7, the last row needed for the remaining output row
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // interleave rows 6 and 7 (the low lane of srcReg32b7 holds row 6)
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // taps 0,1 and taps 6,7 for columns 0..7 and for columns 8..15
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+ // taps 2,3 for columns 0..7 and for columns 8..15
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // taps 4,5 for columns 0..7 and for columns 8..15
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation; the low
+ // 8 bytes come from srcRegFilt1 (columns 0..7) and the high 8 bytes
+ // from srcRegFilt3 (columns 8..15)
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // store 16 bytes (aligned store: output_ptr must be 16 byte aligned)
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+static void aom_filter_block1d4_v4_avx2(  // 4-tap (taps 2..5) vertical filter, 4 pixels per row
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // halve the 16 bit coefficients, then pack them to signed 8 bit so that
+ // both 64 bit halves of the 128 bit register hold the same 8 taps.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+ // repeat bytes 2, 3, 4, 5 (taps 2..5) in every 32 bit element
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // double both strides: every loop iteration produces two output rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ // rows 2, 3 and 4 (8 bytes each) are loaded before the loop
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {  // an odd trailing row is not produced
+ // load rows 5 and 6 (8 bytes each) and pair each with the row above it:
+ // srcReg45 = rows (4, 5), srcReg56 = rows (5, 6)
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // interleave the bytes of every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ // gather the 4 row bytes of each pixel into one 32 bit element
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply adjacent bytes by the taps: two partial sums per pixel
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+ // add the two partial sums of each pixel (saturating horizontal add)
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation; the first
+ // lane contains the first convolve result (row) and the second lane
+ // contains the second convolve result (row)
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // slide the window down two rows: reuse the interleaved pair next time
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3  // variants without an AVX2 kernel are aliased to the SSSE3 ones
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#endif // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
new file mode 100644
index 0000000000..5c36b68727
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
@@ -0,0 +1,569 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_ports/mem.h"
+
+void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,  // 4-tap (taps 2..5) horizontal filter, 16 pixels wide
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
+ srcRegFilt32b2_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;  // back up 3 pixels; taps 2..5 are then read at byte offsets 2..5
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve the coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {  // one output row per iteration
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ // even output pixels 0, 2, 4, 6: widen bytes at offsets 2.. and 4.. to 16 bit
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+ // odd output pixels 1, 3, 5, 7: same computation one byte further right
+ __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
+ __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+ // interleave even and odd sums back into pixel order, pack to 16 bit
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // load the 16 bytes that start 8 pixels further right (overlapping the
+ // previous load) and repeat the computation for output pixels 8..15
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ ss_2 = _mm_srli_si128(srcReg32b2, 2);
+ ss_4 = _mm_srli_si128(srcReg32b2, 4);
+ ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
+
+ ss_1 = _mm_srli_si128(srcReg32b2, 3);
+ ss_3 = _mm_srli_si128(srcReg32b2, 5);
+ ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
+
+ res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation: the low 8
+ // bytes are output pixels 0..7 and the high 8 bytes are pixels 8..15
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+ // aligned store: output_ptr must be 16 byte aligned on every row
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,  // 4-tap (taps 2..5) vertical filter, 16 pixels wide
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve the coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // double both strides: every loop iteration produces two output rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ // rows 2, 3 and 4: interleave neighbouring rows and widen to 16 bit
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
+ __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
+ __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {  // an odd trailing row is not produced
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // columns 0..7: multiply row pairs (2,3) and (3,4) by taps 2,3 and row
+ // pairs (4,5) and (5,6) by taps 4,5; pack the 32 bit sums to 16 bit
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // columns 8..15: the same tap products as above
+
+ tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
+ resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
+ resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
+ __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
+ resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
+ __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
+ resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation:
+ // resReg23_45 is the first output row of this iteration and
+ // resReg34_56 the second (columns 0..7 low, columns 8..15 high)
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+ // aligned stores: both output rows must be 16 byte aligned
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // slide the window down two rows: rows (4,5)/(5,6) become (2,3)/(3,4)
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg23_hi_1 = resReg45_hi_1;
+ resReg23_hi_2 = resReg45_hi_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ resReg34_hi_1 = resReg56_hi_1;
+ resReg34_hi_2 = resReg56_hi_2;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,  // 4-tap (taps 2..5) horizontal filter, 8 pixels wide
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;  // back up 3 pixels; taps 2..5 are then read at byte offsets 2..5
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve the coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {  // one output row per iteration
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+ // even output pixels 0, 2, 4, 6: widen bytes at offsets 2.. and 4.. to 16 bit
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+ // odd output pixels 1, 3, 5, 7: same computation one byte further right
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+ ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+ // interleave even and odd sums back into pixel order, pack to 16 bit
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation: the low 8
+ // bytes are the 8 output pixels, the high 8 bytes are zero
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+ // store the low 8 bytes (8 pixels)
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,  // 4-tap (taps 2..5) vertical filter, 8 pixels wide
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve the coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // double both strides: every loop iteration produces two output rows
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+ // rows 2, 3 and 4; 16 bytes are loaded per row but only the low 8 are used
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {  // an odd trailing row is not produced
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply row pairs (2,3) and (3,4) by taps 2,3 and row pairs (4,5)
+ // and (5,6) by taps 4,5; pack the 32 bit sums to 16 bit
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // round: add 32, then arithmetic shift right by 6 (taps were halved above)
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+
+ // pack each 16 bit value to unsigned 8 bit with saturation:
+ // resReg23_45 is the first output row of this iteration and
+ // resReg34_56 the second (8 pixels in the low 8 bytes of each)
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+ // store the low 8 bytes (8 pixels) of each output row
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // slide the window down two rows: rows (4,5)/(5,6) become (2,3)/(3,4)
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ srcReg4 = srcReg6;
+ }
+}
+
+// 4-tap horizontal filter producing 4 output pixels per row.
+//
+// Only taps 2..5 of the 8-entry kernel in `filter` are used. For every one of
+// the `output_height` rows, output pixel p is
+//   clamp_u8((sum_{k=2..5} (filter[k] >> 1) * src_ptr[p + k - 3] + 32) >> 6).
+// Each row reads 16 bytes starting at src_ptr - 3 and writes exactly 4 bytes
+// at output_ptr; output_ptr needs no particular alignment.
+void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+                                 ptrdiff_t src_pixels_per_line,
+                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1;
+  __m128i srcReg32b1;
+  unsigned int i;
+  // back up 3 pixels; taps 2..5 are then read at byte offsets 2..5
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the coefficients
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+
+    // widen the source bytes to 16 bit
+    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+
+    // 32 bit element p now holds the two source pixels that meet taps 2,3
+    // (ss_1_1) and taps 4,5 (ss_1_2) for output pixel p, p = 0..3
+    __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
+    __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
+
+    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+    __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
+    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+    srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    // round: add 32, then arithmetic shift right by 6
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // pack to unsigned 8 bit with saturation; the 4 output pixels end up in
+    // the low 4 bytes
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    src_ptr += src_pixels_per_line;
+
+    // Store 4 pixels. memcpy replaces the former `*((int *)output_ptr) = ...`:
+    // nothing guarantees that output_ptr is 4-byte aligned, so the cast was a
+    // type-punned, possibly misaligned store (undefined behavior in C).
+    const int pixels = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+    memcpy(output_ptr, &pixels, sizeof(pixels));
+
+    output_ptr += output_pitch;
+  }
+}
+
+// 4-tap vertical filter producing 4 output pixels per row.
+//
+// Only taps 2..5 of the 8-entry kernel in `filter` are used. Output row r,
+// column p is
+//   clamp_u8((sum_{k=2..5} (filter[k] >> 1) * src_ptr[(r + k) * src_pitch + p]
+//             + 32) >> 6).
+// Rows are produced two at a time; when output_height is odd the last row is
+// not written. Each source row is read as 8 bytes, and each output row
+// receives exactly 4 bytes; output_ptr needs no particular alignment.
+void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
+                                 uint32_t output_height,
+                                 const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+  __m128i resReg23_34, resReg45_56;
+  __m128i resReg23_34_45_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  __m128i tmp_0, tmp_1;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // halve the coefficients
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
+
+  // double both strides: every loop iteration produces two output rows
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  // rows 2, 3 and 4: interleave neighbouring rows and widen to 16 bit
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
+
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+    // taps 2,3 on row pairs (2,3) and (3,4); the low 4 words of the packed
+    // result belong to the first output row, the high 4 words to the second
+    tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
+    tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
+    resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
+
+    __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
+    __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
+
+    // taps 4,5 on row pairs (4,5) and (5,6)
+    tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
+    tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
+    resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
+
+    // add and saturate the results together
+    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
+
+    // round: add 32, then arithmetic shift right by 6
+    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
+    resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
+
+    // pack to unsigned 8 bit with saturation: bytes 0..3 are the first
+    // output row, bytes 4..7 the second
+    resReg23_34_45_56 =
+        _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
+
+    src_ptr += src_stride;
+
+    // Store 4 pixels per row. memcpy replaces the former
+    // `*((int *)ptr) = ...`: nothing guarantees that the output rows are
+    // 4-byte aligned, so the cast was a type-punned, possibly misaligned
+    // store (undefined behavior in C).
+    const int first_row = _mm_cvtsi128_si32(resReg23_34_45_56);
+    const int second_row =
+        _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
+    memcpy(output_ptr, &first_row, sizeof(first_row));
+    memcpy(output_ptr + out_pitch, &second_row, sizeof(second_row));
+
+    output_ptr += dst_stride;
+
+    // slide the window down two rows: rows (4,5)/(5,6) become (2,3)/(3,4)
+    resReg23 = resReg45;
+    resReg34 = resReg56;
+    srcReg4 = srcReg6;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..245fda1e94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,847 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {  // _mm_shuffle_epi8 masks: four 32-byte groups of adjacent-byte pairs starting at byte 0, 2, 4 and 6; each 16-byte pattern is stored twice
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {  // _mm_shuffle_epi8 mask: for output n (0..3) selects source bytes n+2..n+5; the 16-byte pattern is stored twice
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
+static void aom_filter_block1d4_h4_ssse3(  // horizontal 4-tap filter (taps 2..5 of the 8-entry kernel); writes 4 bytes per output row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,  // source row, source row stride, destination row
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ src_ptr -= 3;  // the filtd4 mask starts at byte 2, so the first byte used is the incoming src_ptr[-1]
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // unaligned load of the 8 taps
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap; the final shift below is 6 bits
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));  // bytes 2..5 (taps 2..5) repeated four times
+ filt1Reg = _mm_load_si128((__m128i const *)(filtd4));  // first 16 bytes of the shuffle mask
+
+ for (i = output_height; i > 0; i -= 1) {
+ // load 16 source bytes of the current row (unaligned)
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // gather the 4 source bytes needed by each of the 4 outputs
+ srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply by the taps and add adjacent pairs: two partial sums per output
+ srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());  // add the two partial sums: one 16-bit result per output
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to unsigned 8-bit with saturation; only the
+ // low 4 bytes of the packed register are stored below
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);  // 4-byte store through an int pointer
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d4_v4_ssse3(  // vertical 4-tap filter (taps 2..5), 4 pixels wide, two output rows per loop pass
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,  // source (rows 2..6 from here are read), source row stride, destination
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
+ srcReg6, srcReg56;
+ __m128i srcReg23_34_lo, srcReg45_56_lo;
+ __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
+ __m128i resReglo, resReghi;
+ __m128i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // halve the taps, then convert the 16 bit (short) taps to 8 bit (byte);
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));  // bytes 2..5 (taps 2..5) repeated four times
+
+ // multiply the source and destination strides by two (two rows per pass)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));  // 8-byte load; only the low 4 bytes are used
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);  // 4 bytes of row 2 followed by 4 bytes of row 3
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+ // 4 bytes of row 3 followed by 4 bytes of row 4
+ srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);
+
+ srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);  // low half: rows 2/3 byte-interleaved, high half: rows 3/4
+
+ for (i = output_height; i > 1; i -= 2) {  // stops at i <= 1, so a trailing odd row is not processed
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);
+
+ // low half: rows 4/5 byte-interleaved, high half: rows 5/6
+ srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);  // per column: rows 2,3,4,5 (first output row)
+ srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);  // per column: rows 3,4,5,6 (second output row)
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+ resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);
+
+ resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());  // add the two partial sums of each column
+ resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm_srai_epi16(resReglo, 6);
+ resReghi = _mm_srai_epi16(resReghi, 6);
+
+ // pack the 16-bit results to unsigned 8-bit with saturation;
+ // resReglo holds the first output row and resReghi the second,
+ // each in its low 4 bytes
+ resReglo = _mm_packus_epi16(resReglo, resReglo);
+ resReghi = _mm_packus_epi16(resReghi, resReghi);
+
+ src_ptr += src_stride;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);  // 4-byte stores through an int pointer
+ *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);
+
+ output_ptr += dst_stride;
+
+ // carry the rows that the next pass needs again
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d8_h4_ssse3(  // horizontal 4-tap filter (taps 2..5 of the 8-entry kernel); writes 8 bytes per output row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,  // source row, source row stride, destination row
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;  // the shuffle masks below start at byte 2, so the first byte used is the incoming src_ptr[-1]
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap; the final shift below is 6 bits
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across the 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across the 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));  // byte pairs (2,3),(3,4),...,(9,10)
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));  // byte pairs (4,5),(5,6),...,(11,12)
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);  // 16 source bytes of the current row
+
+ // arrange the byte pairs that meet taps 2/3 and taps 4/5
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);  // saturating sum of both tap pairs: 8 results
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to unsigned 8-bit with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);  // store the low 8 bytes
+
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d8_v4_ssse3(  // vertical 4-tap filter (taps 2..5), 8 pixels wide, two output rows per loop pass
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,  // source (rows 2..6 from here are read), source row stride, destination
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+ __m128i resReg23, resReg34, resReg45, resReg56;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // halve the taps, then convert the 16 bit (short) taps to 8 bit (byte);
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two (two rows per pass)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));  // 8 bytes of row 2
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);  // rows 2/3 byte-interleaved
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+ // rows 3/4 byte-interleaved
+ srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {  // stops at i <= 1, so a trailing odd row is not processed
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
+ resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
+ resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
+ resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45 = _mm_adds_epi16(resReg23, resReg45);  // first output row (rows 2..5)
+ resReg34_56 = _mm_adds_epi16(resReg34, resReg56);  // second output row (rows 3..6)
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
+ resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
+ resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
+ resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
+
+ // pack the 16-bit results to unsigned 8-bit with saturation;
+ // resReg23_45 holds the first output row and resReg34_56 the second,
+ // each in its low 8 bytes
+ resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // carry the interleaved rows that the next pass needs again
+ srcReg23 = srcReg45;
+ srcReg34 = srcReg56;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d16_h4_ssse3(  // horizontal 4-tap filter (taps 2..5 of the 8-entry kernel); writes 16 bytes per output row
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,  // source row, source row stride, destination row
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;  // the shuffle masks below start at byte 2, so the first byte used is the incoming src_ptr[-1]
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);  // halve every tap; the final shift below is 6 bits
+ // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across the 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across the 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));  // byte pairs (2,3),(3,4),...,(9,10)
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));  // byte pairs (4,5),(5,6),...,(11,12)
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);  // source bytes feeding outputs 0..7
+
+ // arrange the byte pairs that meet taps 2/3 and taps 4/5
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);  // outputs 0..7 as 16-bit sums
+
+ // load the 16 bytes starting 8 bytes further on, feeding outputs 8..15
+ // (the first 8 of them overlap the earlier load)
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // arrange the byte pairs that meet taps 2/3 and taps 4/5
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // pack to unsigned 8-bit with saturation: outputs 0..7 land in the low
+ // 8 bytes and outputs 8..15 in the high 8 bytes
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);  // aligned store: output_ptr must be 16-byte aligned on every row
+
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d16_v4_ssse3(  // vertical 4-tap filter (taps 2..5), 16 pixels wide, two output rows per loop pass
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,  // source (rows 2..6 from here are read), source row stride, destination
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {  // destination row stride, number of rows, 8 int16 taps
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);  // rounding term for the final >> 6
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // halve the taps, then convert the 16 bit (short) taps to 8 bit (byte);
+ // the same 8 bytes end up in both 64-bit halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two (two rows per pass)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));  // 16 bytes of row 2
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);  // rows 2/3 interleaved, columns 0..7
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);  // rows 2/3 interleaved, columns 8..15
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+ // rows 3/4 byte-interleaved, low and high 8 columns
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {  // stops at i <= 1, so a trailing odd row is not processed
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // same multiply-add for columns 8..15
+
+ resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+ // round (add 32) and arithmetic-shift right by 6 bits in each 16-bit lane
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+ // pack to unsigned 8-bit with saturation: columns 0..7 in the low
+ // 8 bytes, columns 8..15 in the high 8 bytes; resReg23_45 is the first
+ // output row and resReg34_56 the second
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));  // aligned stores: both destination rows must be 16-byte aligned
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // carry the interleaved rows that the next pass needs again
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(  // convenience wrapper: prepare the taps, then convolve
+ const __m128i *const s, const int16_t *const filter) {  // s: source registers passed through to convolve8_8_ssse3; filter: int16 taps
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);  // fills f[0..3] from the taps (helper is not defined in this file)
+ return convolve8_8_ssse3(s, f);  // helper is not defined in this file; result is packed from 16-bit lanes by the callers
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,  // filters one output column position for 8 consecutive rows
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,  // receives 8 bytes, one per source row
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);  // presumably 8 rows of 8 bytes each (helper defined elsewhere)
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);  // result layout is shown above (row digit first, then column digit)
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+ // pack the 16-bit results to unsigned 8-bit with saturation
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,  // reads an 8x8 byte block, transposes it and writes it to dst
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);  // in-place: same array is input and output
+ store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,  // scaled horizontal pass, processed in 8x8 tiles
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,  // indexed by the sub-pixel phase (x_q4 & SUBPEL_MASK)
+ const int x0_q4, const int x_step_q4,  // start position and per-output step, fixed point
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);  // one 8x8 tile, stored column by column before the transpose
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;  // back up to the first tap position
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));  // NOTE(review): yields h + 8 (not h) when h is already a multiple of 8 - confirm the extra band is intended
+
+ do {
+ int x_q4 = x0_q4;  // restart the horizontal position for every band of 8 rows
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  // integer part of the position
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];  // kernel for the fractional part
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;  // whole-pixel position: copy the pixel of each of the 8 rows unfiltered
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];  // + 3 presumably undoes the src adjustment above (SUBPEL_TAPS == 8 assumed)
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+ // transpose the 8x8 filters values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);  // y is a positive multiple of 8 here for h >= 0, so this reaches exactly 0
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,  // filters one output column position for 4 consecutive rows
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,  // receives 4 bytes, one per source row
+ const int16_t *const filter) {
+ __m128i s[4];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);  // presumably 4 rows of 8 bytes each (helper defined elsewhere)
+ transpose_16bit_4x4(s, s);  // in-place: same array is input and output
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+ // pack the 16-bit results to unsigned 8-bit with saturation
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);  // 4-byte store through an int pointer
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,  // reads a 4x4 byte block, transposes it and writes it to dst
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);  // whole transposed block returned in one register
+ s[1] = _mm_srli_si128(s[0], 4);  // shift right by 4, 8 and 12 bytes so each
+ s[2] = _mm_srli_si128(s[0], 8);  // register starts with the next 4-byte group
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,  // scaled horizontal pass, processed in 4x4 tiles
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,  // indexed by the sub-pixel phase (x_q4 & SUBPEL_MASK)
+ const int x0_q4, const int x_step_q4,  // start position and per-output step, fixed point
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);  // one 4x4 tile, stored column by column before the transpose
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;  // back up to the first tap position
+
+ for (y = 0; y < h; y += 4) {  // a full band of 4 rows is processed even when h is not a multiple of 4
+ int x_q4 = x0_q4;  // restart the horizontal position for every band
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  // integer part of the position
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];  // kernel for the fractional part
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;  // whole-pixel position: copy the pixel of each of the 4 rows unfiltered
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];  // + 3 presumably undoes the src adjustment above (SUBPEL_TAPS == 8 assumed)
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+ // transpose the 4x4 filters values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,  // s[0..7]: eight source rows; returns the filtered row packed to bytes
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // interleave the low 8 bytes of each pair of rows (row digit first):
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+ // pack the 16-bit results to unsigned 8-bit with saturation
+ return _mm_packus_epi16(temp, temp);  // the same 8 bytes appear in both halves
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,  // vertically filters one output row, 4 pixels wide
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);  // presumably 8 rows of 4 bytes each (helper defined elsewhere)
+ temp = filter_vert_kernel(s, filter);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);  // 4-byte store through an int pointer
+}
+
+static void scaledconvolve_vert_w4(  // scaled vertical pass for the narrow (4-wide) case, one output row per iteration
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {  // start position and per-row step are fixed point
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up to the row of the first tap
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];  // integer part selects the source row
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];  // kernel for the fractional part
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);  // writes 4 bytes regardless of w
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);  // whole-pixel row: copy w bytes from 3 rows below src_y
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,  // vertically filters one output row, 8 pixels wide
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);  // presumably 8 rows of 8 bytes each (helper defined elsewhere)
+ temp = filter_vert_kernel(s, filter);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(  // scaled vertical pass for the 8-wide case, one output row per iteration
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {  // start position and per-row step are fixed point
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up to the row of the first tap
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];  // integer part selects the source row
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];  // kernel for the fractional part
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);  // writes 8 bytes
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);  // whole-pixel row: copy w bytes from 3 rows below src_y
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,  // vertically filters one output row, w pixels wide, 16 at a time
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
+ int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);  // taps are prepared once and reused for every 16-column chunk
+
+ for (i = 0; i < w; i += 16) {
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);  // presumably 8 rows of 16 bytes, unaligned (helper defined elsewhere)
+
+ // interleave each pair of rows: _lo from the low 8 bytes, _hi from the high 8
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+ // pack to unsigned 8-bit with saturation: temp_lo fills the low 8 bytes
+ // and temp_hi the high 8 bytes of the result
+ temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+ src += 16;  // advance to the next 16 columns
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);  // aligned store: &dst[i] must be 16-byte aligned
+ }
+}
+
+static void scaledconvolve_vert_w16(  // scaled vertical pass for widths handled in 16-column chunks, one output row per iteration
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {  // start position and per-row step are fixed point
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up to the row of the first tap
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];  // integer part selects the source row
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];  // kernel for the fractional part
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);  // whole-pixel row: copy w bytes from 3 rows below src_y
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,  // scaled 2-D convolution: horizontal pass into temp, then vertical pass into dst
+ ptrdiff_t dst_stride, const InterpKernel *filter,  // the same kernel table is used for both passes
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,  // start positions and steps, fixed point
+ int w, int h) {  // output block size, at most 64x64 (asserted below)
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --(((64 - 1) * 32 + 15) >> 4) + 8 = 134, so 135 rows are sufficient.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+ // When calling in frame scaling function, the smallest scaling factor is x1/4
+ // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+ // big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);  // intermediate rows with a fixed stride of 64 bytes
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;  // source rows the vertical pass will read
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));  // steps above 32 are only accepted for h <= 32
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {  // horizontal pass; starts SUBPEL_TAPS / 2 - 1 rows above src to cover the vertical taps
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {  // vertical pass; the temp offset matches the rows skipped above
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {  // every other width uses the 4-wide kernel
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;  // 8-tap kernels: declared here only, no definition in this file
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;  // 2-tap kernels: declared here only, no definition in this file
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// Prototypes of the two functions the FUN_CONV_1D lines below are meant to define:
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)  // macro is not defined in this file; presumably dispatches to the *_h8/_h4/_h2 kernels
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)  // vertical kernels receive a pointer 3 rows above src
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..640c5b2416
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0 ;build the paired-tap constants used by APPLY_FILTER_4; writes rdx, rcx, xmm0-xmm7
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040 ;two 16-bit words of 64: rounding term added before the >>7
+
+ movdqa xmm7, [rdx] ;load filters (movdqa: the pointer must be 16-byte aligned)
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8 ;bring taps 4..7 down into the low qword
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1 ;low qword = k0 x4, high qword = k1 x4
+ punpcklqdq xmm2, xmm3 ;k2 | k3
+ punpcklqdq xmm5, xmm4 ;k5 | k4 (order is swapped on purpose, see k5k4)
+ punpcklqdq xmm6, xmm7 ;k6 | k7
+
+ movdqa k0k1, xmm0 ;k0k1..zero are stack slots %defined by the invoking function
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0 ;broadcast the low dword: 64 in every word
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7 ;all-zero vector used to widen bytes to words
+%endm
+
+%macro APPLY_FILTER_4 1 ;8-tap filter of 4 pixels: xmmN low dword = input for tap N; %1 != 0 averages with dst; stores 4 bytes at [rdi]
+ punpckldq xmm0, xmm1 ;two rows in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4 ;5 then 4, matching the k5k4 constant
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1 ;low 4 words = taps 0+1+6+7
+ paddsw xmm0, xmm2 ;+ tap 2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5 ;+ tap 5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2 ;+ tap 3 (added late, see the overflow note at the top of the file)
+ paddsw xmm0, xmm5 ;+ tap 4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi] ;current destination pixels
+ pavgb xmm0, xmm1 ;rounded average with the filtered result
+%endif
+ movd [rdi], xmm0 ;only the low 4 bytes are meaningful and stored
+%endm
+
+%macro GET_FILTERS 0 ;load src/dst pointers and broadcast each of the 8 taps to its own stack slot; writes rdx, rsi, rdi, rcx, xmm0-xmm7
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040 ;two 16-bit words of 64: rounding term added before the >>7
+
+ movdqa xmm7, [rdx] ;load filters (movdqa: the pointer must be 16-byte aligned)
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0 ;low-half taps: duplicate the low 4 words to fill all 8
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4 ;high-half taps: duplicate the high 4 words to fill all 8
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0 ;broadcast the low dword: 64 in every word
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7 ;all-zero vector used to widen bytes to words
+%endm
+
+%macro LOAD_VERT_8 1 ;load 8 bytes from each of 8 consecutive rows at byte offset %1; expects rax = pitch, rdx = 3 * pitch
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax] ;side effect: rsi is left one row further down
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2 ;8-tap filter of 8 pixels: xmmN low qword = input for tap N; %1 != 0 averages with dst; %2 = byte offset into [rdi]
+ punpcklbw xmm0, zero ;widen each row's 8 bytes to words
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0 ;multiply each row by its tap
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1 ;saturating sum, order 0,1,6,7,2,5,3,4
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3 ;taps 3 and 4 go last (see the overflow note at the top of the file)
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2] ;current destination pixels
+ pavgb xmm0, xmm1 ;rounded average with the filtered result
+%endif
+ movq [rdi + %2], xmm0 ;store 8 output bytes
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2 - vertical 8-tap filter, 4 pixels per row
+;(
+; unsigned char *src_ptr, ;arg(0): topmost of the 8 rows read for the first output row
+; unsigned int src_pitch, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 4 bytes are written per output row
+; unsigned int out_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d4_v8_sse2)
+sym(aom_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;six 16-byte slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax] ;step down one row; the loads below are relative to the new rsi
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2 - vertical 8-tap filter, 8 pixels per row
+;(
+; unsigned char *src_ptr, ;arg(0): topmost of the 8 rows read for the first output row
+; unsigned int src_pitch, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 8 bytes are written per output row
+; unsigned int out_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d8_v8_sse2)
+sym(aom_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;ten 16-byte slots, named by the %defines below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pitch, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx] ;next output row (rsi was already advanced inside LOAD_VERT_8)
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2 - vertical 8-tap filter, 16 pixels per row as two 8-pixel halves
+;(
+; unsigned char *src_ptr, ;arg(0): topmost of the 8 rows read for the first output row
+; unsigned int src_pitch, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 16 bytes are written per output row
+; unsigned int out_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d16_v8_sse2)
+sym(aom_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;ten 16-byte slots, named by the %defines below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pitch, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax ;undo the one-row advance done by LOAD_VERT_8 before loading the right half
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx ;next output row (rsi keeps the advance from the second LOAD_VERT_8)
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2 - horizontal 8-tap filter, 4 pixels per row
+;(
+; unsigned char *src_ptr, ;arg(0): output pixel position; 16 bytes are read starting 3 bytes to its left
+; unsigned int src_pixels_per_line, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 4 bytes are written per output row
+; unsigned int output_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d4_h8_sse2)
+sym(aom_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;six 16-byte slots, named by the %defines below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src (unaligned 16 bytes, starting 3 pixels to the left)
+
+ movdqa xmm1, xmm0 ;one copy of the row per tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;xmmN is shifted right N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2 - horizontal 8-tap filter, 8 pixels per row
+;(
+; unsigned char *src_ptr, ;arg(0): output pixel position; 16 bytes are read starting 3 bytes to its left
+; unsigned int src_pixels_per_line, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 8 bytes are written per output row
+; unsigned int output_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d8_h8_sse2)
+sym(aom_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;ten 16-byte slots, named by the %defines below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src (unaligned 16 bytes, starting 3 pixels to the left)
+
+ movdqa xmm1, xmm0 ;one copy of the row per tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;xmmN is shifted right N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2 - horizontal 8-tap filter, 16 pixels per row as two 8-pixel halves
+;(
+; unsigned char *src_ptr, ;arg(0): output pixel position; bytes from src_ptr-3 to src_ptr+20 are read per row
+; unsigned int src_pixels_per_line, ;arg(1): byte step between source rows
+; unsigned char *output_ptr, ;arg(2): 16 bytes are written per output row
+; unsigned int output_pitch, ;arg(3): byte step between output rows
+; unsigned int output_height, ;arg(4): row count; the loop body runs before it is tested
+; short *filter ;arg(5): 8 taps, read with an aligned 16-byte load
+;)
+globalsym(aom_filter_block1d16_h8_sse2)
+sym(aom_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;ten 16-byte slots, named by the %defines below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src for output pixels 0..7
+
+ movdqa xmm1, xmm0 ;one copy of the row per tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;xmmN is shifted right N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src for output pixels 8..15 (8 bytes past the first load)
+
+ movdqa xmm1, xmm0 ;same per-tap copies for the right half
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably restores the rsp saved by ALIGN_STACK - see x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..e5fafb0302
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to keep the intermediate sums in range.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0 ;expects m4 = the 8 16-bit taps (loaded by the invoking macro); defines and fills k0k1..k6k7 and krd
+ ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 +
+ ; pmaddubsw has a higher latency on some platforms, this might be eased by
+ ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4 ;narrow the taps to signed bytes (with saturation) for pmaddubsw
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0 ;copy the low qword up so the tap pair fills all 16 bytes
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if AOM_ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)] ;x86-64: rounding constant lives in a register, slots 4/5 become scratch
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15 ;1 in every word
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6 ;32-bit: rounding constant is kept on the stack
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if AOM_ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1 ;emit the 4-wide horizontal 8-tap kernel filter_block1d4_%1; %1 is h8, h8_avg or h8_add_src
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq] ;aligned load of the 8 16-bit taps
+ packsswb m4, m4 ;narrow the taps to signed bytes for pmaddubsw
+%if AOM_ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd ;bias by one so the flags after "sub heightd, 2" tell even from odd
+
+.loop:
+ ;Do two rows at once
+ ;NOTE(review): the body runs before height is tested, so height == 1 would
+ movu m4, [srcq - 3] ;still write two rows - presumably callers never pass that
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4 ;duplicate every byte so palignr can form (p[i], p[i+1]) pairs
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2 ;gather the high-qword partial sums of both rows
+ punpcklqdq m0, m2 ;gather the low-qword partial sums of both rows
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4 ;x = k0k1 + k4k5
+ paddsw m1, m5 ;y = k2k3 + k6k7
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1 ;z = signed SAT(x + y)
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+ psrldq m1, m0, 4 ;second row's 4 pixels
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done ;counter is exactly 0 only when one row is left
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1 ;emit the 8-wide horizontal 8-tap kernel filter_block1d8_%1; %1 is h8, h8_avg or h8_add_src
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq] ;aligned load of the 8 16-bit taps, consumed by SETUP_LOCAL_VARS
+ SETUP_LOCAL_VARS
+ dec heightd ;bias by one so the flags after "sub heightd, 2" tell even from odd
+
+.loop:
+ ;Do two rows at once
+ ;NOTE(review): the body runs before height is tested, so height == 1 would
+ movu m0, [srcq - 3] ;still write two rows - presumably callers never pass that
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0 ;duplicate every byte so palignr can form (p[i], p[i+1]) pairs
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4 ;same byte duplication for the second row
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3 ;x = k0k1 + k4k5 (first row)
+ paddsw m2, m5 ;y = k2k3 + k6k7
+ paddsw m1, m2 ;z = signed SAT(x + y)
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7 ;y = k2k3 + k6k7 (second row)
+ paddsw m6, m5 ;x = k0k1 + k4k5
+ paddsw m6, m0 ;z = signed SAT(x + y)
+ paddsw m6, krd
+ psraw m6, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ paddsw m1, m4
+ paddsw m6, m5
+%endif
+ packuswb m1, m6 ;low qword = first row, high qword = second row
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done ;counter is exactly 0 only when one row is left
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, h8_add_src
+ pxor m6, m6
+ movu m5, [srcq]
+ punpcklbw m5, m6
+ paddsw m1, m5
+%endif
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1 ;emit the 16-wide horizontal 8-tap kernel filter_block1d16_%1; one row per loop iteration
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq] ;aligned load of the 8 16-bit taps, consumed by SETUP_LOCAL_VARS
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3] ;m0..m3: byte pairs feeding the even output pixels
+ movu m4, [srcq - 2] ;m4..m7: the same, one byte later, for the odd output pixels
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2 ;x = k0k1 + k4k5 (even pixels)
+ paddsw m1, m3 ;y = k2k3 + k6k7
+ paddsw m0, m1 ;z = signed SAT(x + y)
+ paddsw m4, m6 ;same three sums for the odd pixels
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+%ifidn %1, h8_add_src
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ pcmpeqb m2, m2 ;all ones
+ psrlw m2, 8 ;even_byte_mask
+%else
+ mova m2, [GLOBAL(even_byte_mask)]
+%endif
+ movu m5, [srcq]
+ mova m7, m5
+ pand m5, m2 ;even source bytes as words
+ psrlw m7, 8 ;odd source bytes as words
+ paddsw m0, m5
+ paddsw m4, m7
+%endif
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4 ;interleave even and odd results back into pixel order
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0 ;aligned store: dst rows must be 16-byte aligned
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8
+SUBPIX_HFILTER8 h8
+SUBPIX_HFILTER4 h8
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2 ;emit the vertical 8-tap kernel filter_block1d%2_%1; %1 = v8, v8_avg or v8_add_src, %2 = width (8 or 4)
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq] ;aligned load of the 8 16-bit taps, consumed by SETUP_LOCAL_VARS
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd ;bias by one so the flags after "sub heightd, 2" tell even from odd
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq ;src1q = srcq + one row
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4 ;x = k0k1 + k4k5 (first row)
+ paddsw m2, m6 ;y = k2k3 + k6k7
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2 ;z = signed SAT(x + y)
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5 ;x for the second row
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+
+ paddsw m3, m7 ;y for the second row
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, v8_add_src
+ movu m4, [src1q]
+ punpcklbw m4, m6
+ paddsw m1, m4
+%endif
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done ;counter is exactly 0 only when one row is left
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; AOM_ARCH_X86_64 only; not assembled while X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON is defined as 1
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2 ;slide the row-pair window down two rows for the next pass
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; AOM_ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1 ;emit the 16-wide vertical 8-tap kernel filter_block1d16_%1; %1 = v8, v8_avg or v8_add_src
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq] ;aligned load of the 8 16-bit taps, consumed by SETUP_LOCAL_VARS
+ SETUP_LOCAL_VARS
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq] ;src1q = srcq + one row
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A (loads with + 8 belong to the right 8 columns)
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6 ;y = k2k3 + k6k7 (left half)
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4 ;x = k0k1 + k4k5
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2 ;z = signed SAT(x + y)
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2 ;y for the right half
+ paddsw m3, m1 ;x for the right half
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+ mova m5, m4
+ punpcklbw m4, m6
+ punpckhbw m5, m6
+ paddsw m0, m4
+ paddsw m3, m5
+%endif
+ packuswb m0, m3 ;low qword = left 8 pixels, high qword = right 8 pixels
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0 ;aligned store: dst rows must be 16-byte aligned
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; AOM_ARCH_X86_64 only; not assembled while X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON is defined as 1
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; AOM_ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8
+SUBPIX_VFILTER v8, 8
+SUBPIX_VFILTER v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..90dd55a4be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; GET_PARAM_4 - argument and constant setup for the 4-pixel-wide kernels.
+; On exit:
+;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
+;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
+;   rcx  = arg(4) output_height     (rax/rdx/rcx sign-extended from 32 bits)
+;   xmm4 = filter word 3 (k3) in words 0-3, filter word 4 (k4) in words 4-7
+;   xmm3 = 0x0040 (64) in every word: rounding term added before the >> 7
+;   xmm2 = all zero, used to widen bytes to words
+; rdx and rcx serve as scratch first (filter pointer, rounding constant) and
+; only receive their final values in the last two instructions.
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040 ;two 16-bit lanes, each holding 64
+
+ movdqa xmm3, [rdx] ;load filters (aligned 16-byte load)
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0 ;replicate the low dword (64, 64) to all lanes
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+; APPLY_FILTER_4 avg - filter one row of 4 pixels, then step to the next row.
+; In:   xmm0 (low dword) = 4 pixels to be weighted by k3
+;       xmm1 (low dword) = 4 pixels to be weighted by k4
+;       xmm2 = 0, xmm3 = rounding, xmm4 = k3|k4 (as set up by GET_PARAM_4)
+; Out:  4 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; when %1 is non-zero the result is first averaged (pavgb)
+;       with the 4 bytes already at [rdi]
+;       rsi += rax, rdi += rdx, rcx -= 1
+; Clobbers xmm0, xmm1. The closing dec is the last flag-writing instruction
+; (lea leaves flags alone), so callers branch on ZF right after the macro.
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8 ;bring the k4 products down to the low qword
+ paddsw xmm0, xmm1 ;p0*k3 + p1*k4 (signed saturating add)
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+; GET_PARAM - argument and constant setup for the 8- and 16-wide kernels.
+; On exit:
+;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
+;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
+;   rcx  = arg(4) output_height     (rax/rdx/rcx sign-extended from 32 bits)
+;   xmm6 = filter word 3 (k3) in all 8 words
+;   xmm7 = filter word 4 (k4) in all 8 words
+;   xmm4 = 0x0040 (64) in every word: rounding term added before the >> 7
+;   xmm5 = all zero, used to widen bytes to words
+; rdx and rcx serve as scratch first (filter pointer, rounding constant).
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040 ;two 16-bit lanes, each holding 64
+
+ movdqa xmm7, [rdx] ;load filters (aligned 16-byte load)
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6 ;low words are all k3 -> k3 in all 8 words
+ punpckhwd xmm7, xmm7 ;high words are all k4 -> k4 in all 8 words
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+; APPLY_FILTER_8 avg - filter one row of 8 pixels, then step to the next row.
+; In:   xmm0 (low 8 bytes) = pixels to be weighted by k3 (xmm6)
+;       xmm1 (low 8 bytes) = pixels to be weighted by k4 (xmm7)
+;       xmm4 = rounding, xmm5 = 0 (as set up by GET_PARAM)
+; Out:  8 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; averaged with the old destination when %1 is non-zero
+;       rsi += rax, rdi += rdx, rcx -= 1 (ZF from the dec is left for the
+;       caller's jnz)
+; Clobbers xmm0, xmm1.
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+; APPLY_FILTER_16 avg - filter one row of 16 pixels, then step to next row.
+; In:   xmm0 and xmm2 = the same 16 pixels to be weighted by k3 (xmm6)
+;       xmm1 and xmm3 = the same 16 pixels to be weighted by k4 (xmm7)
+;       xmm4 = rounding, xmm5 = 0 (as set up by GET_PARAM)
+;       The low 8 pixels are widened from xmm0/xmm1, the high 8 from
+;       xmm2/xmm3, which is why the caller must supply both copies.
+; Out:  16 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; averaged with the old destination when %1 is non-zero
+;       rsi += rax, rdi += rdx, rcx -= 1 (ZF from the dec is left for the
+;       caller's jnz)
+; Clobbers xmm0-xmm3.
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+; aom_filter_block1d4_v2_sse2
+; 2-tap filter between vertically adjacent rows, 4 pixels per row.
+; Arguments (read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 4 bytes at [rsi] and 4 bytes one row below ([rsi + rax])
+; and writes 4 bytes to [rdi]. The loop is bottom-tested, so one row is
+; always processed before the height counter is checked.
+globalsym(aom_filter_block1d4_v2_sse2)
+sym(aom_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax] ;same columns, next row
+
+ APPLY_FILTER_4 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d8_v2_sse2
+; 2-tap filter between vertically adjacent rows, 8 pixels per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 8 bytes at [rsi] and 8 bytes one row below ([rsi + rax])
+; and writes 8 bytes to [rdi]. The loop is bottom-tested.
+; xmm6/xmm7 are written by GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
+; pair around the body.
+globalsym(aom_filter_block1d8_v2_sse2)
+sym(aom_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d16_v2_sse2
+; 2-tap filter between vertically adjacent rows, 16 pixels per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 16 bytes at [rsi] and 16 bytes one row below
+; ([rsi + rax]) with unaligned loads and writes 16 bytes to [rdi].
+; The loop is bottom-tested.
+globalsym(aom_filter_block1d16_v2_sse2)
+sym(aom_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0 ;second copy: APPLY_FILTER_16 widens the high half
+ movdqa xmm3, xmm1 ;from xmm2/xmm3
+
+ APPLY_FILTER_16 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d4_h2_sse2
+; 2-tap filter between horizontally adjacent pixels, 4 outputs per row.
+; Arguments (read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4).
+; Note: a full 16 bytes are loaded from [rsi] on every row although only
+; the first 5 contribute to the result. The loop is bottom-tested.
+globalsym(aom_filter_block1d4_h2_sse2)
+sym(aom_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1 ;xmm1 byte i = src[i + 1]
+
+ APPLY_FILTER_4 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d8_h2_sse2
+; 2-tap filter between horizontally adjacent pixels, 8 outputs per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4).
+; Note: a full 16 bytes are loaded from [rsi] on every row although only
+; the first 9 contribute to the result. The loop is bottom-tested.
+globalsym(aom_filter_block1d8_h2_sse2)
+sym(aom_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1 ;xmm1 byte i = src[i + 1]
+
+ APPLY_FILTER_8 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d16_h2_sse2
+; 2-tap filter between horizontally adjacent pixels, 16 outputs per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4); the shifted
+; operand is obtained with a second unaligned load at [rsi + 1], so 17
+; source bytes are read per row. The loop is bottom-tested.
+globalsym(aom_filter_block1d16_h2_sse2)
+sym(aom_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1] ;same row, one pixel to the right
+ movdqa xmm2, xmm0 ;second copy: APPLY_FILTER_16 widens the high half
+ movdqa xmm3, xmm1 ;from xmm2/xmm3
+
+ APPLY_FILTER_16 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..253bc26d38
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,267 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; GET_PARAM_4 - argument and constant setup for the 4-pixel-wide kernels.
+; On exit:
+;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
+;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
+;   rcx  = arg(4) output_height     (rax/rdx/rcx sign-extended from 32 bits)
+;   xmm3 = byte pair (k3, k4) repeated in words 0-3. The 16-bit taps are
+;          narrowed to signed bytes with saturation by packsswb.
+;   xmm2 = 0x0100 in every word; pmulhrsw by this value yields
+;          (x + 64) >> 7
+; rdx and ecx serve as scratch first (filter pointer, rounding constant).
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm3, [rdx] ;load filters (aligned 16-byte load)
+ psrldq xmm3, 6 ;drop words 0-2 so k3 becomes word 0, k4 word 1
+ packsswb xmm3, xmm3 ;words -> signed bytes: byte 0 = k3, byte 1 = k4
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+; APPLY_FILTER_4 avg - filter one row of 4 pixels, then step to the next row.
+; In:   xmm0 (low dword) = 4 pixels to be weighted by k3
+;       xmm1 (low dword) = 4 pixels to be weighted by k4
+;       xmm3 = (k3, k4) byte pairs, xmm2 = 0x0100 words (from GET_PARAM_4)
+; Out:  4 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; averaged (pavgb) with the old destination bytes when
+;       %1 is non-zero
+;       rsi += rax, rdi += rdx, rcx -= 1 (ZF from the dec is left for the
+;       caller's jnz; lea does not touch flags)
+; Clobbers xmm0, xmm1.
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1 ;interleave: p0[0], p1[0], p0[1], p1[1], ...
+ pmaddubsw xmm0, xmm3 ;unsigned pixel * signed tap, adjacent pairs summed
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+; GET_PARAM - argument and constant setup for the 8- and 16-wide kernels.
+; On exit:
+;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
+;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
+;   rcx  = arg(4) output_height     (rax/rdx/rcx sign-extended from 32 bits)
+;   xmm7 = byte pair (k3, k4) repeated in all 8 words. The 16-bit taps are
+;          narrowed to signed bytes with saturation by packsswb.
+;   xmm6 = 0x0100 in every word; pmulhrsw by this value yields
+;          (x + 64) >> 7
+; rdx and ecx serve as scratch first (filter pointer, rounding constant).
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters (aligned 16-byte load)
+ psrldq xmm7, 6 ;drop words 0-2 so k3 becomes word 0, k4 word 1
+ packsswb xmm7, xmm7 ;words -> signed bytes: byte 0 = k3, byte 1 = k4
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7 ;spread the pair from 4 words to all 8
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+; APPLY_FILTER_8 avg - filter one row of 8 pixels, then step to the next row.
+; In:   xmm0 (low 8 bytes) = pixels to be weighted by k3
+;       xmm1 (low 8 bytes) = pixels to be weighted by k4
+;       xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words (from GET_PARAM)
+; Out:  8 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; averaged with the old destination when %1 is non-zero
+;       rsi += rax, rdi += rdx, rcx -= 1 (ZF from the dec is left for the
+;       caller's jnz)
+; Clobbers xmm0, xmm1.
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1 ;interleave: p0[0], p1[0], p0[1], p1[1], ...
+ pmaddubsw xmm0, xmm7 ;unsigned pixel * signed tap, adjacent pairs summed
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+; APPLY_FILTER_16 avg - filter one row of 16 pixels, then step to next row.
+; In:   xmm0 and xmm2 = the same 16 pixels to be weighted by k3
+;       xmm1          = the 16 pixels to be weighted by k4
+;       xmm7 = (k3, k4) byte pairs, xmm6 = 0x0100 words (from GET_PARAM)
+;       xmm0 supplies the low 8 pixels and xmm2 the high 8, which is why
+;       the caller must provide the copy in xmm2.
+; Out:  16 bytes at [rdi] = (p0*k3 + p1*k4 + 64) >> 7, packed with unsigned
+;       saturation; averaged with the old destination when %1 is non-zero
+;       rsi += rax, rdi += rdx, rcx -= 1 (ZF from the dec is left for the
+;       caller's jnz)
+; Clobbers xmm0, xmm1 (only when %1 is non-zero), xmm2.
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1 ;low 8 pixel pairs
+ punpckhbw xmm2, xmm1 ;high 8 pixel pairs
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+; aom_filter_block1d4_v2_ssse3
+; 2-tap filter between vertically adjacent rows, 4 pixels per row.
+; Arguments (read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 4 bytes at [rsi] and 4 bytes one row below ([rsi + rax])
+; and writes 4 bytes to [rdi]. The loop is bottom-tested, so one row is
+; always processed before the height counter is checked.
+globalsym(aom_filter_block1d4_v2_ssse3)
+sym(aom_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax] ;same columns, next row
+
+ APPLY_FILTER_4 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d8_v2_ssse3
+; 2-tap filter between vertically adjacent rows, 8 pixels per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 8 bytes at [rsi] and 8 bytes one row below ([rsi + rax])
+; and writes 8 bytes to [rdi]. The loop is bottom-tested.
+; xmm6/xmm7 are written by GET_PARAM, hence the SAVE_XMM 7 / RESTORE_XMM
+; pair around the body.
+globalsym(aom_filter_block1d8_v2_ssse3)
+sym(aom_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d16_v2_ssse3
+; 2-tap filter between vertically adjacent rows, 16 pixels per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Each pass reads 16 bytes at [rsi] and 16 bytes one row below
+; ([rsi + rax]) with unaligned loads and writes 16 bytes to [rdi].
+; The loop is bottom-tested.
+globalsym(aom_filter_block1d16_v2_ssse3)
+sym(aom_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0 ;copy used by APPLY_FILTER_16 for the high 8 pixels
+
+ APPLY_FILTER_16 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d4_h2_ssse3
+; 2-tap filter between horizontally adjacent pixels, 4 outputs per row.
+; Arguments (read by GET_PARAM_4): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4).
+; Note: a full 16 bytes are loaded from [rsi] on every row although only
+; the first 5 contribute to the result. The loop is bottom-tested.
+globalsym(aom_filter_block1d4_h2_ssse3)
+sym(aom_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1 ;xmm1 byte i = src[i + 1]
+
+ APPLY_FILTER_4 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d8_h2_ssse3
+; 2-tap filter between horizontally adjacent pixels, 8 outputs per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4).
+; Note: a full 16 bytes are loaded from [rsi] on every row although only
+; the first 9 contribute to the result. The loop is bottom-tested.
+globalsym(aom_filter_block1d8_h2_ssse3)
+sym(aom_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1 ;xmm1 byte i = src[i + 1]
+
+ APPLY_FILTER_8 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; aom_filter_block1d16_h2_ssse3
+; 2-tap filter between horizontally adjacent pixels, 16 outputs per row.
+; Arguments (read by GET_PARAM): arg(0) src_ptr, arg(1) pixels_per_line,
+; arg(2) output_ptr, arg(3) out_pitch, arg(4) output_height, arg(5) filter.
+; Output pixel i combines src[i] (k3) and src[i + 1] (k4); the shifted
+; operand is obtained with a second unaligned load at [rsi + 1], so 17
+; source bytes are read per row. The loop is bottom-tested.
+globalsym(aom_filter_block1d16_h2_ssse3)
+sym(aom_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1] ;same row, one pixel to the right
+ movdqa xmm2, xmm0 ;copy used by APPLY_FILTER_16 for the high 8 pixels
+
+ APPLY_FILTER_16 0
+ jnz .loop ;ZF comes from the dec rcx ending the macro
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..49fcd72098
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_ports/mem.h"
+
+// Widens the sixteen signed 16-bit lanes of `in` to 32 bits.
+// `zero` is expected to be an all-zero vector supplied by the caller.
+// The unpack intrinsics work per 128-bit half, so *out_lo receives
+// elements 0-3 and 8-11 and *out_hi receives elements 4-7 and 12-15.
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+                                                   __m256i *out_lo,
+                                                   __m256i *out_hi) {
+  // 0xFFFF in every lane holding a negative value, 0x0000 elsewhere: exactly
+  // the upper 16 bits needed for the widened result.
+  const __m256i upper_halves = _mm256_cmpgt_epi16(zero, in);
+  const __m256i widened_lo = _mm256_unpacklo_epi16(in, upper_halves);
+  const __m256i widened_hi = _mm256_unpackhi_epi16(in, upper_halves);
+  *out_lo = widened_lo;
+  *out_hi = widened_hi;
+}
+
+// One pass of an 8-point Hadamard butterfly over the eight registers in[0..7],
+// computed independently in every 16-bit lane (wrap-around arithmetic).
+//   iter == 0: the butterfly results are additionally transposed as an 8x8
+//              matrix of 16-bit values. All unpack steps act per 128-bit
+//              half, so the two halves are transposed independently.
+//   otherwise: the butterfly results are written straight back to in[].
+// The result is stored in place. Note the permuted destination indices
+// (0, 7, 3, 4, 2, 6, 1, 5) used for the final butterfly stage in both paths.
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ // Stage 1: sums and differences of neighbouring inputs.
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ // Stage 2: combine stage-1 results that are two apart.
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ // Stage 3: combine stage-2 results that are four apart.
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ // Transpose, step 1: interleave 16-bit elements of register pairs.
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ // Transpose, step 2: interleave 32-bit groups.
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ // Transpose, step 3: interleave 64-bit groups and write back.
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ // Stage 3 only, written directly to the output slots (no transpose).
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
+// Low-precision (16-bit) 8x8 Hadamard transform of two horizontally adjacent
+// blocks at once. Eight rows of 16 int16_t are read from src_diff (rows are
+// src_stride elements apart); the low 128-bit half of each row belongs to the
+// first block and the high half to the second. The 64 coefficients of the
+// first block are written to coeff[0..63], those of the second block to
+// coeff[64..127].
+void aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff,
+                                   ptrdiff_t src_stride, int16_t *coeff) {
+  __m256i rows[8];
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm256_loadu_si256((const __m256i *)(src_diff + r * src_stride));
+  }
+
+  // First pass includes the transpose, second pass does not.
+  hadamard_col8x2_avx2(rows, 0);
+  hadamard_col8x2_avx2(rows, 1);
+
+  // Low halves of consecutive register pairs: first block.
+  for (int r = 0; r < 8; r += 2) {
+    _mm256_storeu_si256((__m256i *)coeff,
+                        _mm256_permute2x128_si256(rows[r], rows[r + 1], 0x20));
+    coeff += 16;
+  }
+  // High halves of consecutive register pairs: second block.
+  for (int r = 0; r < 8; r += 2) {
+    _mm256_storeu_si256((__m256i *)coeff,
+                        _mm256_permute2x128_si256(rows[r], rows[r + 1], 0x31));
+    coeff += 16;
+  }
+}
+
+// 16x16 Hadamard transform built from 8x8 transforms plus one combining stage.
+//   src_diff/src_stride: 16x16 block of 16-bit residuals.
+//   coeff:    destination for the 256 coefficients.
+//   is_final: non-zero -> results are written through store_tran_low() as
+//             tran_low_t; zero -> results are written as packed int16_t into
+//             the same buffer (coeff reinterpreted as int16_t *), for use as
+//             an intermediate by a larger transform.
+// The 8x8 results are staged in a local 32-byte-aligned buffer.
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ // Each call transforms one 8-row band: two side-by-side 8x8 blocks, i.e.
+ // 128 coefficients per call.
+ for (idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride,
+ t_coeff + (idx * 64 * 2));
+ }
+
+ // Combine the four 64-coefficient blocks, 16 coefficients per iteration.
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ // Halve the intermediates before the second butterfly.
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ if (is_final) {
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+ coeff += 16;
+ } else {
+ _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+ coeff16 += 16;
+ }
+ t_coeff += 16;
+ }
+}
+
+// Public 16x16 Hadamard entry point: runs hadamard_16x16_avx2() with
+// is_final = 1, so the 256 coefficients are written to coeff as tran_low_t.
+void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+// Low-precision (16-bit) 16x16 Hadamard transform. The four 8x8 transforms
+// are written straight into coeff and then combined in place, so no scratch
+// buffer is needed. coeff receives 256 int16_t values.
+void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  // Top and bottom 8-row bands; each call yields two 8x8 blocks (128 values).
+  aom_hadamard_lp_8x8_dual_avx2(src_diff, src_stride, coeff);
+  aom_hadamard_lp_8x8_dual_avx2(src_diff + 8 * src_stride, src_stride,
+                                coeff + 128);
+
+  // Combine the four 64-coefficient blocks, 16 coefficients at a time.
+  for (int off = 0; off < 64; off += 16) {
+    int16_t *const q0 = coeff + off;
+    int16_t *const q1 = q0 + 64;
+    int16_t *const q2 = q0 + 128;
+    int16_t *const q3 = q0 + 192;
+
+    const __m256i in0 = _mm256_loadu_si256((const __m256i *)q0);
+    const __m256i in1 = _mm256_loadu_si256((const __m256i *)q1);
+    const __m256i in2 = _mm256_loadu_si256((const __m256i *)q2);
+    const __m256i in3 = _mm256_loadu_si256((const __m256i *)q3);
+
+    // First butterfly, halved before the second one.
+    const __m256i sum01 = _mm256_srai_epi16(_mm256_add_epi16(in0, in1), 1);
+    const __m256i dif01 = _mm256_srai_epi16(_mm256_sub_epi16(in0, in1), 1);
+    const __m256i sum23 = _mm256_srai_epi16(_mm256_add_epi16(in2, in3), 1);
+    const __m256i dif23 = _mm256_srai_epi16(_mm256_sub_epi16(in2, in3), 1);
+
+    _mm256_storeu_si256((__m256i *)q0, _mm256_add_epi16(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q1, _mm256_add_epi16(dif01, dif23));
+    _mm256_storeu_si256((__m256i *)q2, _mm256_sub_epi16(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q3, _mm256_sub_epi16(dif01, dif23));
+  }
+}
+
+// 32x32 Hadamard transform: four 16x16 transforms written as int16_t into a
+// local buffer, followed by one combining stage whose first butterfly is done
+// in 32-bit precision. coeff receives 1024 values via store_tran_low().
+void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m256i b0, b1, b2, b3;
+ const __m256i zero = _mm256_setzero_si256();
+ // One 16x16 sub-block per iteration (idx >> 1 selects the row of blocks,
+ // idx & 1 the column); is_final = 0 keeps the output as packed int16_t.
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_avx2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ // Combine the four 256-coefficient blocks, 16 coefficients per iteration.
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+ // Scale the 32-bit intermediates down by 4 before narrowing.
+ b0_lo = _mm256_srai_epi32(b0_lo, 2);
+ b1_lo = _mm256_srai_epi32(b1_lo, 2);
+ b2_lo = _mm256_srai_epi32(b2_lo, 2);
+ b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm256_srai_epi32(b0_hi, 2);
+ b1_hi = _mm256_srai_epi32(b1_hi, 2);
+ b2_hi = _mm256_srai_epi32(b2_hi, 2);
+ b3_hi = _mm256_srai_epi32(b3_hi, 2);
+
+ // Narrow back to 16 bits with signed saturation. The pack works per
+ // 128-bit half, mirroring the per-half unpack used for the widening, so
+ // the original element order is restored.
+ b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm256_packs_epi32(b3_lo, b3_hi);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// One pass of an 8-point Hadamard butterfly over the eight registers in[0..7],
+// computed independently in every 32-bit lane (eight lanes per register).
+//   iter == 0: the butterfly results are additionally transposed as an 8x8
+//              matrix of 32-bit values (32-bit unpack, 64-bit unpack, then a
+//              128-bit half exchange).
+//   otherwise: the butterfly results are written straight back to in[].
+// The result is stored in place. Note the permuted destination indices
+// (0, 7, 3, 4, 2, 6, 1, 5) used for the final butterfly stage in both paths.
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ // Stage 1: sums and differences of neighbouring inputs.
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ // Stage 2: combine stage-1 results that are two apart.
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ // Stage 3: combine stage-2 results that are four apart.
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ // Transpose, step 1: interleave 32-bit elements of register pairs.
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ // Transpose, step 2: interleave 64-bit groups.
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ // Transpose, step 3: 0x20 joins the two low 128-bit halves, 0x31 the two
+ // high halves.
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ // Stage 3 only, written directly to the output slots (no transpose).
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
+// 8x8 Hadamard transform computed with 32-bit intermediates.
+// Reads eight rows of eight int16_t from src_diff (rows are src_stride
+// elements apart) and writes 64 coefficients to coeff, eight per 256-bit
+// store.
+void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  __m256i rows[8];
+
+  // Load each row and sign-extend it from 16 to 32 bits per element.
+  for (int r = 0; r < 8; ++r) {
+    const __m128i packed =
+        _mm_loadu_si128((const __m128i *)(src_diff + r * src_stride));
+    rows[r] = _mm256_cvtepi16_epi32(packed);
+  }
+
+  // First pass includes the transpose, second pass does not.
+  highbd_hadamard_col8_avx2(rows, 0);
+  highbd_hadamard_col8_avx2(rows, 1);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm256_storeu_si256((__m256i *)(coeff + 8 * r), rows[r]);
+  }
+}
+
+// 16x16 Hadamard transform with 32-bit intermediates: four 8x8 transforms
+// written directly into coeff, then combined in place. coeff receives 256
+// values.
+void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  // Sub-blocks in raster order: top-left, top-right, bottom-left,
+  // bottom-right; block k lands at coeff + 64 * k.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    aom_highbd_hadamard_8x8_avx2(
+        src_diff + blk_row * 8 * src_stride + blk_col * 8, src_stride,
+        coeff + blk * 64);
+  }
+
+  // Combine the four 64-coefficient blocks, 8 coefficients at a time.
+  for (int off = 0; off < 64; off += 8) {
+    tran_low_t *const q0 = coeff + off;
+    tran_low_t *const q1 = q0 + 64;
+    tran_low_t *const q2 = q0 + 128;
+    tran_low_t *const q3 = q0 + 192;
+
+    const __m256i in0 = _mm256_loadu_si256((const __m256i *)q0);
+    const __m256i in1 = _mm256_loadu_si256((const __m256i *)q1);
+    const __m256i in2 = _mm256_loadu_si256((const __m256i *)q2);
+    const __m256i in3 = _mm256_loadu_si256((const __m256i *)q3);
+
+    // First butterfly, halved before the second one.
+    const __m256i sum01 = _mm256_srai_epi32(_mm256_add_epi32(in0, in1), 1);
+    const __m256i dif01 = _mm256_srai_epi32(_mm256_sub_epi32(in0, in1), 1);
+    const __m256i sum23 = _mm256_srai_epi32(_mm256_add_epi32(in2, in3), 1);
+    const __m256i dif23 = _mm256_srai_epi32(_mm256_sub_epi32(in2, in3), 1);
+
+    _mm256_storeu_si256((__m256i *)q0, _mm256_add_epi32(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q1, _mm256_add_epi32(dif01, dif23));
+    _mm256_storeu_si256((__m256i *)q2, _mm256_sub_epi32(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q3, _mm256_sub_epi32(dif01, dif23));
+  }
+}
+
+// 32x32 Hadamard transform with 32-bit intermediates: four 16x16 transforms
+// written directly into coeff, then combined in place. coeff receives 1024
+// values.
+void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  // Sub-blocks in raster order: top-left, top-right, bottom-left,
+  // bottom-right; block k lands at coeff + 256 * k.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int blk_row = blk >> 1;
+    const int blk_col = blk & 0x01;
+    aom_highbd_hadamard_16x16_avx2(
+        src_diff + blk_row * 16 * src_stride + blk_col * 16, src_stride,
+        coeff + blk * 256);
+  }
+
+  // Combine the four 256-coefficient blocks, 8 coefficients at a time.
+  for (int off = 0; off < 256; off += 8) {
+    tran_low_t *const q0 = coeff + off;
+    tran_low_t *const q1 = q0 + 256;
+    tran_low_t *const q2 = q0 + 512;
+    tran_low_t *const q3 = q0 + 768;
+
+    const __m256i in0 = _mm256_loadu_si256((const __m256i *)q0);
+    const __m256i in1 = _mm256_loadu_si256((const __m256i *)q1);
+    const __m256i in2 = _mm256_loadu_si256((const __m256i *)q2);
+    const __m256i in3 = _mm256_loadu_si256((const __m256i *)q3);
+
+    // First butterfly, divided by four before the second one.
+    const __m256i sum01 = _mm256_srai_epi32(_mm256_add_epi32(in0, in1), 2);
+    const __m256i dif01 = _mm256_srai_epi32(_mm256_sub_epi32(in0, in1), 2);
+    const __m256i sum23 = _mm256_srai_epi32(_mm256_add_epi32(in2, in3), 2);
+    const __m256i dif23 = _mm256_srai_epi32(_mm256_sub_epi32(in2, in3), 2);
+
+    _mm256_storeu_si256((__m256i *)q0, _mm256_add_epi32(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q1, _mm256_add_epi32(dif01, dif23));
+    _mm256_storeu_si256((__m256i *)q2, _mm256_sub_epi32(sum01, sum23));
+    _mm256_storeu_si256((__m256i *)q3, _mm256_sub_epi32(dif01, dif23));
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Sum of absolute values of `length` coefficients, read eight at a time as
+// 32-bit lanes. The loop always consumes whole groups of eight, so a length
+// that is not a multiple of 8 reads past coeff[length - 1].
+int aom_satd_avx2(const tran_low_t *coeff, int length) {
+  __m256i total = _mm256_setzero_si256();
+
+  for (int pos = 0; pos < length; pos += 8) {
+    const __m256i vals = _mm256_loadu_si256((const __m256i *)(coeff + pos));
+    total = _mm256_add_epi32(total, _mm256_abs_epi32(vals));
+  }
+
+  // Horizontal reduction of the eight 32-bit partial sums: fold 64-bit
+  // halves, then 32-bit neighbours, then the two 128-bit halves.
+  const __m256i folded64 =
+      _mm256_add_epi32(total, _mm256_srli_si256(total, 8));
+  const __m256i folded32 =
+      _mm256_add_epi32(folded64, _mm256_srli_epi64(folded64, 32));
+  const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(folded32),
+                                       _mm256_extractf128_si256(folded32, 1));
+  return _mm_cvtsi128_si32(sum128);
+}
+
+// Sum of absolute values of `length` 16-bit coefficients, read sixteen at a
+// time. The loop always consumes whole groups of sixteen, so a length that is
+// not a multiple of 16 reads past coeff[length - 1].
+int aom_satd_lp_avx2(const int16_t *coeff, int length) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  __m256i total = _mm256_setzero_si256();
+
+  for (int pos = 0; pos < length; pos += 16) {
+    const __m256i vals = _mm256_loadu_si256((const __m256i *)(coeff + pos));
+    // madd against 1 adds neighbouring 16-bit magnitudes into 32-bit lanes.
+    const __m256i pair_sums = _mm256_madd_epi16(_mm256_abs_epi16(vals), ones);
+    total = _mm256_add_epi32(total, pair_sums);
+  }
+
+  // Horizontal reduction of the eight 32-bit partial sums: fold 64-bit
+  // halves, then 32-bit neighbours, then the two 128-bit halves.
+  const __m256i folded64 =
+      _mm256_add_epi32(total, _mm256_srli_si256(total, 8));
+  const __m256i folded32 =
+      _mm256_add_epi32(folded64, _mm256_srli_epi64(folded64, 32));
+  const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(folded32),
+                                       _mm256_extractf128_si256(folded32, 1));
+  return _mm_cvtsi128_si32(sum128);
+}
+
+// Builds a 256-bit vector from two independent unaligned 16-byte loads:
+// `lo` fills bits 0-127 and `hi` fills bits 128-255.
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+  const __m128i lower = _mm_loadu_si128((const __m128i *)lo);
+  const __m128i upper = _mm_loadu_si128((const __m128i *)hi);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lower), upper, 1);
+}
+
+// Rounded averages of the four 8x8 blocks that make up the 16x16 area whose
+// top-left pixel is s[y16_idx * p + x16_idx] (p is the row pitch in bytes).
+//   avg[0] = top-left, avg[1] = top-right,
+//   avg[2] = bottom-left, avg[3] = bottom-right.
+void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  const uint8_t *upper = s + y16_idx * p + x16_idx;
+  const uint8_t *lower = upper + 8 * p;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc = zero;
+
+  // Row r of the upper band sits in the low 128 bits and row r of the lower
+  // band in the high 128 bits. SAD against zero adds each group of 8 bytes
+  // into one 64-bit lane, i.e. one partial sum per 8x8 block.
+  for (int r = 0; r < 8; ++r) {
+    const __m256i row_pair = xx_loadu2_mi128(lower + r * p, upper + r * p);
+    acc = _mm256_add_epi16(acc, _mm256_sad_epu8(row_pair, zero));
+  }
+
+  // (sum + 32) >> 6: divide the 64-pixel sum by 64 with rounding.
+  acc = _mm256_add_epi32(acc, _mm256_set1_epi32(32));
+  acc = _mm256_srli_epi32(acc, 6);
+
+  const __m128i upper_avgs = _mm256_castsi256_si128(acc);
+  const __m128i lower_avgs = _mm256_extracti128_si256(acc, 1);
+  avg[0] = _mm_cvtsi128_si32(upper_avgs);
+  avg[1] = _mm_extract_epi32(upper_avgs, 2);
+  avg[2] = _mm_cvtsi128_si32(lower_avgs);
+  avg[3] = _mm_extract_epi32(lower_avgs, 2);
+}
+
+// Column sums of a width x height block of 8-bit pixels: hbuf[x] is the sum
+// of column x over all rows, arithmetically shifted right by norm_factor.
+// Widths that are a multiple of 32 are handled here; widths that are only a
+// multiple of 16 are forwarded to aom_int_pro_row_sse2().
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // The SIMD code requires width to be a multiple of 16 and height to be a
+  // multiple of 2; other sizes would need additional handling.
+  assert(width % 16 == 0 && height % 2 == 0);
+
+  if (width % 32 != 0) {
+    if (width % 16 == 0) {
+      aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+    }
+    return;
+  }
+
+  const __m256i zero = _mm256_setzero_si256();
+  for (int col = 0; col < width; col += 32) {
+    const uint8_t *row = ref + col;
+    int16_t *out = hbuf + col;
+    __m256i sum_lo = zero;
+    __m256i sum_hi = zero;
+    int y = 0;
+    // Two rows per trip; the loop is bottom-tested like the original.
+    do {
+      for (int k = 0; k < 2; ++k) {
+        const __m256i pixels = _mm256_loadu_si256((const __m256i *)row);
+        sum_lo = _mm256_add_epi16(sum_lo, _mm256_unpacklo_epi8(pixels, zero));
+        sum_hi = _mm256_add_epi16(sum_hi, _mm256_unpackhi_epi8(pixels, zero));
+        row += ref_stride;
+      }
+      y += 2;
+    } while (y < height);
+
+    sum_lo = _mm256_srai_epi16(sum_lo, norm_factor);
+    sum_hi = _mm256_srai_epi16(sum_hi, norm_factor);
+
+    // The unpacks work per 128-bit half, so columns 0-7 / 8-15 sit in the low
+    // halves of sum_lo / sum_hi and columns 16-23 / 24-31 in the high halves.
+    _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(sum_lo));
+    _mm_storeu_si128((__m128i *)(out + 8), _mm256_castsi256_si128(sum_hi));
+    _mm_storeu_si128((__m128i *)(out + 16),
+                     _mm256_extractf128_si256(sum_lo, 1));
+    _mm_storeu_si128((__m128i *)(out + 24),
+                     _mm256_extractf128_si256(sum_hi, 1));
+  }
+}
+
+// Loads 32 unaligned bytes from each of four consecutive rows, starting at
+// ref1 and stepping by `stride`, into src[0..3].
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+                                     const int stride) {
+  for (int row = 0; row < 4; ++row) {
+    src[row] = _mm256_loadu_si256((const __m256i *)(ref1 + (row * stride)));
+  }
+}
+
+#define CALC_TOT_SAD_AND_STORE /* Uses r0..r3, vbuf and norm_factor from the \
+ * expansion site. Each of r0..r3 carries one row's partial sums in the low \
+ * 16 bits of its four 64-bit groups. The byte shifts below slot the four \
+ * rows side by side, the groups are then added together, and four 16-bit \
+ * row totals (each >> norm_factor) are stored to vbuf. */ \
+ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \
+ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \
+ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \
+ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \
+ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \
+ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \
+ \
+ const __m128i results0 = _mm_add_epi16( \
+ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+ const __m128i results1 = \
+ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \
+ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride,
+ const int height,
+ int norm_factor) { /* Row sums for a 16-pixel-wide block: each row's 16
+ * bytes are added up (SAD against zero) and the total, shifted right by
+ * norm_factor, is written as one int16 to vbuf. The do/while below always
+ * reads eight rows and writes eight results per pass.
+ * NOTE(review): presumably callers guarantee height is a positive multiple
+ * of 8 - confirm against the call sites.
+ */
+ const __m256i zero = _mm256_setzero_si256();
+ int ht = 0;
+ // Post sad operation, the data is present in lower 16-bit of each 64-bit lane
+ // and higher 16-bits are Zero. Here, we are processing 8 rows at a time to
+ // utilize the higher 16-bits efficiently.
+ do {
+ __m256i src_00 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ src_00 = _mm256_inserti128_si256(
+ src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+ __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+ src_01 = _mm256_inserti128_si256(
+ src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+ __m256i src_10 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+ src_10 = _mm256_inserti128_si256(
+ src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+ __m256i src_11 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+ src_11 = _mm256_inserti128_si256(
+ src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+ // s00 x x x s01 x x x | s40 x x x s41 x x x
+ const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+ // s10 x x x s11 x x x | s50 x x x s51 x x x
+ const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+ // s20 x x x s21 x x x | s60 x x x s61 x x x
+ const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+ // s30 x x x s31 x x x | s70 x x x s71 x x x
+ const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+ // s00 s10 x x x x x x | s40 s50 x x x x x x
+ const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+ // s01 s11 x x x x x x | s41 s51 x x x x x x
+ const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+ // s20 s30 x x x x x x | s60 s70 x x x x x x
+ const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+ // s21 s31 x x x x x x | s61 s71 x x x x x x
+ const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+ // s0 s1 x x x x x x | s4 s5 x x x x x x
+ const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+ // s2 s3 x x x x x x | s6 s7 x x x x x x
+ const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+ // s0 s1 s2 s3 s4 s5 s6 s7 (permute 0x08 moves 64-bit group 2 next to group 0)
+ const __m128i results = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+ _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+ vbuf += 8;
+ ref += (ref_stride << 3);
+ ht += 8;
+ } while (ht < height);
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) { /* Writes one int16 per row to vbuf:
+ * the sum of that row's `width` pixels, shifted right by norm_factor.
+ * Widths 128, 64 and 32 are processed four rows per iteration and finished
+ * by CALC_TOT_SAD_AND_STORE; width 16 is forwarded to
+ * aom_int_pro_col_16wd_avx2. Any other width writes nothing.
+ */
+ assert(width % 16 == 0);
+ if (width == 128) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[16];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+ load_from_src_buf(ref + 64, &src[8], ref_stride);
+ load_from_src_buf(ref + 96, &src[12], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+ _mm256_sad_epu8(src[4], zero));
+ const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+ _mm256_sad_epu8(src[12], zero));
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+ _mm256_sad_epu8(src[5], zero));
+ const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+ _mm256_sad_epu8(src[13], zero));
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+ _mm256_sad_epu8(src[6], zero));
+ const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+ _mm256_sad_epu8(src[14], zero));
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+ _mm256_sad_epu8(src[7], zero));
+ const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+ _mm256_sad_epu8(src[15], zero));
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 64) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[8];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+ const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+ const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+ const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+ const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 32) {
+ assert(height % 2 == 0); // NOTE(review): the loop below reads and writes 4 rows per pass, so height % 4 == 2 would overrun - confirm callers.
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[4];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+
+ // s00 x x x s01 x x x s02 x x x s03 x x x
+ const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+ // s10 x x x s11 x x x s12 x x x s13 x x x
+ const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+ // s20 x x x s21 x x x s22 x x x s23 x x x
+ const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+ // s30 x x x s31 x x x s32 x x x s33 x x x
+ const __m256i r3 = _mm256_sad_epu8(src[3], zero);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 16) {
+ aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
+ }
+}
+
+// Accumulates statistics for 64 consecutive int16 elements. The per-lane
+// differences (ref - src) are added to the sixteen 16-bit lanes of *mean,
+// and the pairwise sums of squared differences are added to the eight
+// 32-bit lanes of *sse.
+static inline void calc_vector_mean_sse_64wd(const int16_t *ref,
+                                             const int16_t *src, __m256i *mean,
+                                             __m256i *sse) {
+  __m256i mean_acc = *mean;
+  __m256i sse_acc = *sse;
+
+  // Four vectors of 16 elements cover the 64-wide span.
+  for (int offset = 0; offset < 64; offset += 16) {
+    const __m256i src_vals =
+        _mm256_loadu_si256((const __m256i *)(src + offset));
+    const __m256i ref_vals =
+        _mm256_loadu_si256((const __m256i *)(ref + offset));
+    const __m256i delta = _mm256_sub_epi16(ref_vals, src_vals);
+
+    mean_acc = _mm256_add_epi16(mean_acc, delta);
+    sse_acc = _mm256_add_epi32(sse_acc, _mm256_madd_epi16(delta, delta));
+  }
+
+  *mean = mean_acc;
+  *sse = sse_acc;
+}
+
+#define CALC_VAR_FROM_MEAN_SSE(mean, sse) /* Reduces the 16-bit lanes of \
+ * `mean` and the 32-bit lanes of `sse` to scalar totals and assigns \
+ * var = sse_total - ((|mean_total| * |mean_total|) >> (bwl + 2)). \
+ * `var` and `bwl` come from the expansion site; `mean` is overwritten, so \
+ * it must be a modifiable __m256i. */ \
+ { \
+ mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \
+ mean = _mm256_hadd_epi32(mean, sse); \
+ mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \
+ const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \
+ _mm256_extractf128_si256(mean, 1)); \
+ /*(mean * mean): dynamic range 31 bits.*/ \
+ const int mean_int = _mm_extract_epi32(result, 0); \
+ const int sse_int = _mm_extract_epi32(result, 2); \
+ const unsigned int mean_abs = abs(mean_int); \
+ var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \
+ }
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) { /*
+ * Over the first (4 << bwl) elements, returns
+ * sum(d * d) - ((sum(d))^2 >> (bwl + 2)) with d = ref[i] - src[i],
+ * as evaluated by CALC_VAR_FROM_MEAN_SSE. Only widths 16, 32, 64 and 128
+ * have a code path; any other width leaves the result at 0.
+ */
+ const int width = 4 << bwl;
+ assert(width % 16 == 0 && width <= 128);
+ int var = 0;
+
+ // Instead of having a loop over width 16, considered loop unrolling to avoid
+ // some addition operations.
+ if (width == 128) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse); // passed as (src, ref) to a helper declared (ref, src): the diff sign flips, but the mean is squared via abs() and the SSE squares each diff
+ calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 64) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse); // same (src, ref) order as the 128-wide path
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 32) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1);
+ __m256i mean = _mm256_add_epi16(diff0, diff1);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref);
+ __m256i mean = _mm256_sub_epi16(ref_line, src_line); // one vector holds all 16 diffs, so it doubles as the mean accumulator
+ const __m256i sse = _mm256_madd_epi16(mean, mean);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ }
+ return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..9ab9143eee
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_ports/mem.h"
+
+// Widens the eight 16-bit lanes of `in` to 32 bits. The upper half of every
+// widened lane is the result of (in < zero), i.e. all ones for lanes that
+// compare below `zero`, which yields sign extension when `zero` is an
+// all-zero vector. Lanes 0-3 go to *out_lo and lanes 4-7 to *out_hi.
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+                                                   __m128i *out_lo,
+                                                   __m128i *out_hi) {
+  const __m128i upper_words = _mm_cmplt_epi16(in, zero);
+  const __m128i widened_lo = _mm_unpacklo_epi16(in, upper_words);
+  const __m128i widened_hi = _mm_unpackhi_epi16(in, upper_words);
+  *out_lo = widened_lo;
+  *out_hi = widened_hi;
+}
+
+// Returns (a ^ sign) - sign for each 32-bit lane: lanes where `sign` is all
+// ones come back negated, lanes where `sign` is zero come back unchanged.
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  const __m128i flipped = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(flipped, sign);
+}
+
+// Finds the smallest and largest absolute difference between corresponding
+// pixels of two 8x8 blocks: `s` with row stride `p` and `d` with row stride
+// `dp`. The results are written to *min and *max.
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i largest = zero;
+  __m128i smallest = zero;
+
+  for (int row = 0; row < 8; ++row) {
+    // Widen eight pixels of each block to 16 bits.
+    const __m128i src_px = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i *)(s + row * p)), zero);
+    const __m128i dst_px = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i *)(d + row * dp)), zero);
+    // |src - dst| as max(delta, -delta).
+    const __m128i delta = _mm_subs_epi16(src_px, dst_px);
+    const __m128i abs_delta =
+        _mm_max_epi16(delta, _mm_subs_epi16(zero, delta));
+
+    if (row == 0) {
+      // The first row seeds both running extremes.
+      largest = abs_delta;
+      smallest = abs_delta;
+    } else {
+      largest = _mm_max_epi16(largest, abs_delta);
+      smallest = _mm_min_epi16(smallest, abs_delta);
+    }
+  }
+
+  // Fold the eight 16-bit lanes down into lane 0.
+  largest = _mm_max_epi16(largest, _mm_srli_si128(largest, 8));
+  largest = _mm_max_epi16(largest, _mm_srli_epi64(largest, 32));
+  largest = _mm_max_epi16(largest, _mm_srli_epi64(largest, 16));
+  *max = _mm_extract_epi16(largest, 0);
+
+  smallest = _mm_min_epi16(smallest, _mm_srli_si128(smallest, 8));
+  smallest = _mm_min_epi16(smallest, _mm_srli_epi64(smallest, 32));
+  smallest = _mm_min_epi16(smallest, _mm_srli_epi64(smallest, 16));
+  *min = _mm_extract_epi16(smallest, 0);
+}
+
+// Returns the rounded average, (sum + 32) >> 6, of the 8x8 block of pixels
+// at `s` with row stride `p`.
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i total = zero;
+
+  // Two 8-byte rows share one register; SAD against zero sums each row.
+  for (int row = 0; row < 8; row += 2) {
+    const __m128i even_row = _mm_loadl_epi64((const __m128i *)(s + row * p));
+    const __m128i row_pair =
+        loadh_epi64((const __m128i *)(s + (row + 1) * p), even_row);
+    total = _mm_add_epi16(total, _mm_sad_epu8(row_pair, zero));
+  }
+
+  // Combine the two 64-bit halves.
+  total = _mm_add_epi16(total, _mm_srli_si128(total, 8));
+  const unsigned int sum = _mm_cvtsi128_si32(total);
+  return (sum + 32) >> 6;
+}
+
+// Computes the rounded averages, (sum + 32) >> 6, of two horizontally
+// adjacent 8x8 blocks: eight rows of 16 pixels starting at `s` with row
+// stride `p`. avg[0] is the left block and avg[1] the right block.
+void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i totals = zero;
+
+  // SAD against zero leaves the left block's row sum in the low 64 bits and
+  // the right block's row sum in the high 64 bits.
+  for (int row = 0; row < 8; ++row) {
+    const __m128i pixels = _mm_loadu_si128((const __m128i *)(s + row * p));
+    totals = _mm_add_epi16(totals, _mm_sad_epu8(pixels, zero));
+  }
+
+  // (sum + 32) >> 6
+  totals = _mm_srli_epi32(_mm_add_epi32(totals, _mm_set1_epi32(32)), 6);
+  avg[0] = _mm_cvtsi128_si32(totals);
+  avg[1] = _mm_extract_epi16(totals, 4);
+}
+
+// Computes the rounded averages of the four 8x8 blocks of the 16x16 region
+// that starts at column x16_idx, row y16_idx of `s` (row stride `p`).
+// avg[0..1] cover the upper eight rows and avg[2..3] the lower eight rows.
+void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                           int *avg) {
+  const uint8_t *upper_rows = s + y16_idx * p + x16_idx;
+  const uint8_t *lower_rows = upper_rows + 8 * p;
+  calc_avg_8x8_dual_sse2(upper_rows, p, avg);
+  calc_avg_8x8_dual_sse2(lower_rows, p, avg + 2);
+}
+
+// Returns the rounded average, (sum + 8) >> 4, of the 4x4 block of pixels
+// at `s` with row stride `p`.
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+  // Fetch each 4-byte row with memcpy. `s` is only guaranteed to be byte
+  // aligned, so reading it through a `const int *` would be a misaligned,
+  // type-punned access (undefined behaviour). Compilers lower these copies
+  // to plain 32-bit loads.
+  int rows[4];
+  for (int r = 0; r < 4; ++r) {
+    memcpy(&rows[r], s + r * p, sizeof(rows[r]));
+  }
+
+  const __m128i zero = _mm_setzero_si128();
+  // Rows 0/1 and rows 2/3 each fill the low 8 bytes of a register.
+  __m128i s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(rows[0]),
+                                  _mm_cvtsi32_si128(rows[1]));
+  __m128i s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(rows[2]),
+                                  _mm_cvtsi32_si128(rows[3]));
+  // SAD against zero adds up the eight bytes of each register.
+  s0 = _mm_sad_epu8(s0, zero);
+  s1 = _mm_sad_epu8(s1, zero);
+  s0 = _mm_add_epi16(s0, s1);
+
+  const unsigned int sum = _mm_cvtsi128_si32(s0);
+  return (sum + 8) >> 4;
+}
+
+// One pass of a 4-point Hadamard butterfly across in[0..3], halving
+// (arithmetic >> 1) after the first add/sub stage. When iter is 0 the low
+// four 16-bit lanes of the results are additionally transposed, so that
+// in[k] starts with the k-th lane of each result.
+static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
+  const __m128i sum01 = _mm_srai_epi16(_mm_add_epi16(in[0], in[1]), 1);
+  const __m128i diff01 = _mm_srai_epi16(_mm_sub_epi16(in[0], in[1]), 1);
+  const __m128i sum23 = _mm_srai_epi16(_mm_add_epi16(in[2], in[3]), 1);
+  const __m128i diff23 = _mm_srai_epi16(_mm_sub_epi16(in[2], in[3]), 1);
+
+  in[0] = _mm_add_epi16(sum01, sum23);
+  in[1] = _mm_add_epi16(diff01, diff23);
+  in[2] = _mm_sub_epi16(sum01, sum23);
+  in[3] = _mm_sub_epi16(diff01, diff23);
+
+  if (iter != 0) return;
+
+  // 4x4 transpose of the low four lanes.
+  const __m128i pairs01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i pairs23 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i quads_lo = _mm_unpacklo_epi32(pairs01, pairs23);
+  const __m128i quads_hi = _mm_unpackhi_epi32(pairs01, pairs23);
+  in[0] = quads_lo;
+  in[1] = _mm_srli_si128(quads_lo, 8);
+  in[2] = quads_hi;
+  in[3] = _mm_srli_si128(quads_hi, 8);
+}
+
+// 4x4 Hadamard transform of the residual block at src_diff (row stride
+// src_stride, in elements). The 16 results are written to coeff through
+// store_tran_low, eight values per call.
+void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  __m128i rows[4];
+  for (int r = 0; r < 4; ++r) {
+    rows[r] = _mm_loadl_epi64((const __m128i *)(src_diff + r * src_stride));
+  }
+
+  // The first pass transposes, so the second pass runs along the other axis.
+  hadamard_col4_sse2(rows, 0);
+  hadamard_col4_sse2(rows, 1);
+
+  store_tran_low(_mm_unpacklo_epi64(rows[0], rows[1]), coeff);
+  store_tran_low(_mm_unpacklo_epi64(rows[2], rows[3]), coeff + 8);
+}
+
+static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { /* One pass of
+ * an 8-point Hadamard butterfly across in[0..7], applied to every 16-bit
+ * lane independently, with no intermediate scaling. With iter == 0 the
+ * eight results are also transposed as an 8x8 matrix of 16-bit values, so
+ * that a second call works along the other dimension. Otherwise the
+ * results are written straight back to in[] in the same permuted order.
+ */
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1); // stage 1: butterflies on neighbouring pairs
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2); // stage 2: butterflies at distance 2
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4); // stage 3: b[k] matches in[k] of the else branch
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1); // transpose step 1: interleave 16-bit lanes
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1); // transpose step 2: interleave 32-bit pairs
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1); // transpose step 3: in[k] = lane k of b0..b7
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4); // stage 3 written directly, no transpose
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+// 8x8 Hadamard transform of the residual block at src_diff (row stride
+// src_stride, in elements). Rows are read with aligned 128-bit loads. When
+// is_final is nonzero the 64 results go to coeff via store_tran_low;
+// otherwise they are stored as raw int16 values with aligned 128-bit
+// stores, treating coeff as an int16_t buffer.
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+                                     ptrdiff_t src_stride, tran_low_t *coeff,
+                                     int is_final) {
+  __m128i rows[8];
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm_load_si128((const __m128i *)(src_diff + r * src_stride));
+  }
+
+  // The first pass transposes, so the second pass runs along the other axis.
+  hadamard_col8_sse2(rows, 0);
+  hadamard_col8_sse2(rows, 1);
+
+  if (is_final) {
+    for (int r = 0; r < 8; ++r) {
+      store_tran_low(rows[r], coeff + 8 * r);
+    }
+  } else {
+    int16_t *coeff16 = (int16_t *)coeff;
+    for (int r = 0; r < 8; ++r) {
+      _mm_store_si128((__m128i *)(coeff16 + 8 * r), rows[r]);
+    }
+  }
+}
+
+// Public 8x8 Hadamard entry point: runs the shared 8x8 kernel in "final"
+// mode, so the results are written to coeff through store_tran_low.
+void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  const int is_final = 1;
+  hadamard_8x8_sse2(src_diff, src_stride, coeff, is_final);
+}
+
+// 8x8 Hadamard transform with 16-bit output: reads eight rows of the
+// residual block at src_diff (row stride src_stride, in elements) with
+// aligned loads and writes the 64 results to coeff with aligned stores.
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+                                        ptrdiff_t src_stride, int16_t *coeff) {
+  __m128i rows[8];
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm_load_si128((const __m128i *)(src_diff + r * src_stride));
+  }
+
+  // The first pass transposes, so the second pass runs along the other axis.
+  hadamard_col8_sse2(rows, 0);
+  hadamard_col8_sse2(rows, 1);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm_store_si128((__m128i *)(coeff + 8 * r), rows[r]);
+  }
+}
+
+// Public entry point for the 8x8 Hadamard transform with int16 output;
+// forwards directly to the inline kernel.
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+// Transforms two horizontally adjacent 8x8 residual blocks. The second
+// block starts eight columns to the right of src_diff, and its 64 results
+// follow the first block's in coeff.
+void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff,
+                                   ptrdiff_t src_stride, int16_t *coeff) {
+  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+  hadamard_lp_8x8_sse2(src_diff + 8, src_stride, coeff + 64);
+}
+
+// 16x16 Hadamard transform with int16 output. Each 8x8 quadrant is
+// transformed into its own 64-entry section of coeff, then the four
+// sections are combined in place with one more butterfly stage that halves
+// (arithmetic >> 1) after its first add/sub.
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  // Quadrant order: top-left, top-right, bottom-left, bottom-right.
+  for (int quad = 0; quad < 4; ++quad) {
+    const int16_t *quad_src =
+        src_diff + (quad >> 1) * 8 * src_stride + (quad & 0x01) * 8;
+    hadamard_lp_8x8_sse2(quad_src, src_stride, coeff + quad * 64);
+  }
+
+  for (int pos = 0; pos < 64; pos += 8) {
+    int16_t *q0 = coeff + pos;
+    int16_t *q1 = q0 + 64;
+    int16_t *q2 = q0 + 128;
+    int16_t *q3 = q0 + 192;
+
+    const __m128i c0 = _mm_load_si128((const __m128i *)q0);
+    const __m128i c1 = _mm_load_si128((const __m128i *)q1);
+    const __m128i c2 = _mm_load_si128((const __m128i *)q2);
+    const __m128i c3 = _mm_load_si128((const __m128i *)q3);
+
+    const __m128i sum01 = _mm_srai_epi16(_mm_add_epi16(c0, c1), 1);
+    const __m128i diff01 = _mm_srai_epi16(_mm_sub_epi16(c0, c1), 1);
+    const __m128i sum23 = _mm_srai_epi16(_mm_add_epi16(c2, c3), 1);
+    const __m128i diff23 = _mm_srai_epi16(_mm_sub_epi16(c2, c3), 1);
+
+    _mm_store_si128((__m128i *)q0, _mm_add_epi16(sum01, sum23));
+    _mm_store_si128((__m128i *)q1, _mm_add_epi16(diff01, diff23));
+    _mm_store_si128((__m128i *)q2, _mm_sub_epi16(sum01, sum23));
+    _mm_store_si128((__m128i *)q3, _mm_sub_epi16(diff01, diff23));
+  }
+}
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) { /* 16x16 Hadamard transform. Each 8x8 quadrant of the
+ * input is transformed into its own 64-entry section of temp_coeff, then
+ * the four sections are combined with one more butterfly stage that halves
+ * (>> 1) after its first add/sub. With is_final nonzero the results are
+ * written to coeff through store_tran_low_offset_4. Otherwise they are
+ * stored as raw int16 values, treating coeff as an int16_t buffer.
+ */
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; // quadrant order: top-left, top-right, bottom-left, bottom-right
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ if (is_final) {
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 64);
+ store_tran_low_offset_4(coeff2, coeff + 128);
+ store_tran_low_offset_4(coeff3, coeff + 192);
+ coeff += 4;
+ } else {
+ _mm_store_si128((__m128i *)coeff16, coeff0);
+ _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+ _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+ coeff16 += 8;
+ }
+
+ t_coeff += 8;
+ // Increment the pointer additionally by 0 and 8 in alternate
+ // iterations(instead of 8) to ensure the coherency with the implementation
+ // of store_tran_low_offset_4()
+ coeff += (((idx >> 3) & 1) << 3); // also runs when !is_final, where coeff is not read again (stores go through coeff16)
+ }
+}
+
+// Public 16x16 Hadamard entry point: runs the shared 16x16 kernel in
+// "final" mode, so the results are written to coeff in tran_low_t form.
+void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  const int is_final = 1;
+  hadamard_16x16_sse2(src_diff, src_stride, coeff, is_final);
+}
+
+// Computes a 32x32 Hadamard transform of src_diff into coeff.
+//
+// The block is split into four 16x16 quadrants, each transformed by
+// hadamard_16x16_sse2() into its own 256-entry slice of an int16_t scratch
+// buffer. The four slices are then combined with one more butterfly stage:
+// the first add/sub is done in 32 bits, shifted right by 2 and packed back to
+// 16 bits with signed saturation; the second add/sub is done in 16 bits and
+// written out through store_tran_low_offset_4().
+void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m128i b0, b1, b2, b3;
+ const __m128i zero = _mm_setzero_si128();
+ // idx >> 1 selects the upper/lower 16 rows and idx & 1 the left/right 16
+ // columns of the source block.
+ // NOTE(review): with is_final == 0 the callee presumably treats the
+ // tran_low_t pointer as int16_t storage -- confirm in hadamard_16x16_sse2().
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_sse2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ // Each iteration combines 8 coefficients taken at the same position in each
+ // of the four 256-entry quadrant results.
+ for (idx = 0; idx < 256; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ // First butterfly stage, carried out in 32-bit lanes.
+ b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+ // Arithmetic shift right by 2 before narrowing.
+ b0_lo = _mm_srai_epi32(b0_lo, 2);
+ b1_lo = _mm_srai_epi32(b1_lo, 2);
+ b2_lo = _mm_srai_epi32(b2_lo, 2);
+ b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm_srai_epi32(b0_hi, 2);
+ b1_hi = _mm_srai_epi32(b1_hi, 2);
+ b2_hi = _mm_srai_epi32(b2_hi, 2);
+ b3_hi = _mm_srai_epi32(b3_hi, 2);
+
+ // Narrow back to 16 bits with signed saturation.
+ b0 = _mm_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm_packs_epi32(b3_lo, b3_hi);
+
+ // Second butterfly stage in 16-bit lanes; each result goes to its own
+ // 256-entry quarter of the output.
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low_offset_4(coeff2, coeff + 512);
+ store_tran_low_offset_4(coeff3, coeff + 768);
+
+ // Increment the pointer by 4 and 12 in alternate iterations(instead of 8)
+ // to ensure the coherency with the implementation of
+ // store_tran_low_offset_4()
+ coeff += (4 + (((idx >> 3) & 1) << 3));
+ t_coeff += 8;
+ }
+}
+
+// Accumulates invert_sign_32_sse2(x, sign(x)) over `length` 32-bit
+// coefficients (the original names this value abs_coeff) and returns the
+// total. Four coefficients are consumed per step using aligned 128-bit loads.
+int aom_satd_sse2(const tran_low_t *coeff, int length) {
+  __m128i total = _mm_setzero_si128();
+
+  for (int pos = 0; pos < length; pos += 4) {
+    const __m128i vals = _mm_load_si128((const __m128i *)(coeff + pos));
+    // All-ones in every lane that holds a negative value, zero elsewhere.
+    const __m128i sign_mask = _mm_srai_epi32(vals, 31);
+    total = _mm_add_epi32(total, invert_sign_32_sse2(vals, sign_mask));
+  }
+
+  // Horizontal reduction: fold the upper 64 bits onto the lower 64 bits, then
+  // fold lane 1 onto lane 0.
+  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
+  total = _mm_add_epi32(total, _mm_srli_epi64(total, 32));
+
+  return _mm_cvtsi128_si32(total);
+}
+
+// Returns the sum of max(x, -x) over `length` 16-bit coefficients. Sixteen
+// coefficients are consumed per step using unaligned loads, and the 16-bit
+// magnitudes are widened to 32-bit partial sums with a multiply-add by 1.
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i ones = _mm_set1_epi16(1);
+  __m128i total = zeros;
+
+  for (int pos = 0; pos < length; pos += 16) {
+    const __m128i first = _mm_loadu_si128((const __m128i *)(coeff + pos));
+    const __m128i second = _mm_loadu_si128((const __m128i *)(coeff + pos + 8));
+    // abs(x) computed as max(x, 0 - x).
+    const __m128i mag_first = _mm_max_epi16(first, _mm_sub_epi16(zeros, first));
+    const __m128i mag_second =
+        _mm_max_epi16(second, _mm_sub_epi16(zeros, second));
+    // madd against 1 adds neighbouring 16-bit values into 32-bit lanes.
+    total = _mm_add_epi32(total, _mm_madd_epi16(mag_first, ones));
+    total = _mm_add_epi32(total, _mm_madd_epi16(mag_second, ones));
+  }
+
+  // Horizontal reduction of the four 32-bit partial sums into lane 0.
+  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
+  total = _mm_add_epi32(total, _mm_srli_epi64(total, 32));
+
+  return _mm_cvtsi128_si32(total);
+}
+
+// For each of the `width` columns of ref, sums the column over `height` rows
+// in 16-bit lanes, arithmetically shifts the sum right by norm_factor and
+// stores it to hbuf.
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // The kernel works on 16 columns and 2 rows at a time, so width must be a
+  // multiple of 16 and height a multiple of 2. Other sizes are not supported.
+  assert(width % 16 == 0 && height % 2 == 0);
+  const __m128i zeros = _mm_setzero_si128();
+
+  for (int col = 0; col < width; col += 16) {
+    const uint8_t *row_ptr = ref + col;
+    __m128i sum_lo = zeros;  // columns col .. col + 7
+    __m128i sum_hi = zeros;  // columns col + 8 .. col + 15
+    int rows_done = 0;
+    do {
+      // Two rows per pass of the outer do/while.
+      for (int k = 0; k < 2; ++k) {
+        const __m128i px = _mm_loadu_si128((const __m128i *)row_ptr);
+        sum_lo = _mm_add_epi16(sum_lo, _mm_unpacklo_epi8(px, zeros));
+        sum_hi = _mm_add_epi16(sum_hi, _mm_unpackhi_epi8(px, zeros));
+        row_ptr += ref_stride;
+      }
+      rows_done += 2;
+    } while (rows_done < height);
+
+    _mm_storeu_si128((__m128i *)(hbuf + col),
+                     _mm_srai_epi16(sum_lo, norm_factor));
+    _mm_storeu_si128((__m128i *)(hbuf + col + 8),
+                     _mm_srai_epi16(sum_hi, norm_factor));
+  }
+}
+
+// For each of the `height` rows of ref, sums the first `width` bytes of the
+// row, shifts the sum right by norm_factor and stores it to vbuf[row].
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+                          const int ref_stride, const int width,
+                          const int height, int norm_factor) {
+  // Rows are consumed 16 bytes at a time, so width must be a multiple of 16.
+  assert(width % 16 == 0);
+  const __m128i zeros = _mm_setzero_si128();
+
+  for (int row = 0; row < height; ++row) {
+    const uint8_t *const line = ref + (row * ref_stride);
+    __m128i acc = zeros;
+    for (int col = 0; col < width; col += 16) {
+      const __m128i px = _mm_loadu_si128((const __m128i *)(line + col));
+      // SAD against zero yields the byte sum of each 8-byte half.
+      acc = _mm_add_epi16(acc, _mm_sad_epu8(px, zeros));
+    }
+
+    // Combine the two per-half sums and extract the total from lane 0.
+    acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 8));
+    vbuf[row] = _mm_cvtsi128_si32(acc) >> norm_factor;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 0000000000..b83b43122a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Returns sum(d[i]^2) - (sum(d[i]))^2 / width for d = ref - src, where
+// width = 4 << bwl elements are read from each input.
+//
+// Documented input ranges:
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
+  const int width = 4 << bwl;
+  assert(width % 16 == 0);
+
+  const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+  __m128i mean = _mm_setzero_si128();  // 16-bit partial sums of d
+  __m128i sse = _mm_setzero_si128();   // 32-bit partial sums of d * d
+
+  // 16 elements (two 128-bit vectors) per iteration.
+  for (int i = 0; i < width; i += 16) {
+    const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+    const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+    const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+    __m128i diff = _mm_sub_epi16(ref_line, src_line);
+    const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+    __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+    const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+    diff = _mm_add_epi16(diff, diff2);
+    diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+    sse = _mm_add_epi32(sse, diff_sqr);
+    mean = _mm_add_epi16(mean, diff);
+
+    src += 16;
+    ref += 16;
+  }
+
+  // Widen the eight 16-bit partial sums to four 32-bit ones: m0 m1 m2 m3
+  mean = _mm_madd_epi16(mean, k_one_epi16);
+  // m0+m1 m2+m3 s0+s1 s2+s3
+  __m128i result = _mm_hadd_epi32(mean, sse);
+  // Add the vector shifted right by one 32-bit lane. The complete sums end up
+  // in lanes 0 and 2; lanes 1 and 3 hold values that are not used:
+  // m0+m1+m2+m3 x s0+s1+s2+s3 x
+  // (_mm_srli_si128 is the same byte shift as _mm_bsrli_si128; it is used here
+  // to match the spelling used by the SSE2 kernels.)
+  result = _mm_add_epi32(result, _mm_srli_si128(result, 4));
+
+  // With the documented input ranges |mean| <= 510 * 128 = 65280, so
+  // (mean * mean) <= 4261478400, which needs an unsigned 32-bit product.
+  const int mean_int = _mm_extract_epi32(result, 0);
+  const int sse_int = _mm_extract_epi32(result, 2);
+  const unsigned int mean_abs = abs(mean_int);
+  // >> (bwl + 2) divides by width, since width == 1 << (bwl + 2).
+  const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
+  return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..85896e2768
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Loads two unaligned 256-bit vectors from a and a + 8 and packs their 32-bit
+// lanes down to 16 bits with signed saturation. The pack operates within each
+// 128-bit lane, so the result is not in plain sequential order.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+  const __m256i first = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i second = _mm256_loadu_si256((const __m256i *)(a + 8));
+  const __m256i packed = _mm256_packs_epi32(first, second);
+  return packed;
+}
+
+// Sign-extends the sixteen 16-bit values in a to 32 bits and writes them as
+// two unaligned 256-bit stores at b and b + 8. The unpacks work within each
+// 128-bit lane, so the first store carries values 0-3 and 8-11 and the second
+// carries values 4-7 and 12-15.
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  // Multiplying by 1 splits the 32-bit product into the value itself (low
+  // word) and its sign extension (high word).
+  const __m256i low_words = _mm256_mullo_epi16(a, ones);
+  const __m256i high_words = _mm256_mulhi_epi16(a, ones);
+  _mm256_storeu_si256((__m256i *)b,
+                      _mm256_unpacklo_epi16(low_words, high_words));
+  _mm256_storeu_si256((__m256i *)(b + 8),
+                      _mm256_unpackhi_epi16(low_words, high_words));
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..ff77760b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Reads eight values starting at a (two aligned 128-bit loads, at a and
+// a + 4) and narrows the 32-bit lanes to 16 bits with signed saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+  const __m128i first_four = _mm_load_si128((const __m128i *)a);
+  const __m128i last_four = _mm_load_si128((const __m128i *)(a + 4));
+  return _mm_packs_epi32(first_four, last_four);
+}
+
+// Sign-extends the eight 16-bit values in a to 32 bits: *a_1 receives values
+// 0-3 and *a_2 receives values 4-7.
+static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) {
+  const __m128i ones = _mm_set1_epi16(1);
+  // a * 1 as a 32-bit product: the low word is the value itself and the high
+  // word is its sign extension.
+  const __m128i low_words = _mm_mullo_epi16(a, ones);
+  const __m128i high_words = _mm_mulhi_epi16(a, ones);
+  *a_1 = _mm_unpacklo_epi16(low_words, high_words);
+  *a_2 = _mm_unpackhi_epi16(low_words, high_words);
+}
+
+// Writes the eight 16-bit values in a to b as sign-extended 32-bit values
+// (see unpack_trans()), using two aligned 128-bit stores at b and b + 4.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+  __m128i first_four, last_four;
+  unpack_trans(a, &first_four, &last_four);
+  _mm_store_si128((__m128i *)b, first_four);
+  _mm_store_si128((__m128i *)(b + 4), last_four);
+}
+// Same as store_tran_low(), except that the second group of four values is
+// written at b + 8 rather than b + 4, leaving b[4..7] untouched. This matches
+// the output layout of the AVX2 implementation.
+static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) {
+  __m128i first_four, last_four;
+  unpack_trans(a, &first_four, &last_four);
+  _mm_store_si128((__m128i *)b, first_four);
+  _mm_store_si128((__m128i *)(b + 8), last_four);
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000000..e0289abe12
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+// Blend driven by a 1-D horizontal mask. Implemented by forwarding to the 2-D
+// mask kernel aom_blend_a64_mask_sse4_1() with a mask stride of 0.
+// NOTE(review): the two trailing 0 arguments are presumably subw/subh (no
+// mask subsampling) -- confirm against the callee's prototype.
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, 0, w, h, 0, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of the horizontal-mask blend; only compiled when
+// CONFIG_AV1_HIGHBITDEPTH is set. Forwards to
+// aom_highbd_blend_a64_mask_sse4_1() with a mask stride of 0 and passes bd
+// through unchanged.
+// NOTE(review): the two 0 arguments after w, h are presumably subw/subh --
+// confirm against the callee's prototype.
+void aom_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+ src1_8, src1_stride, mask, 0, w, h, 0, 0,
+ bd);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 0000000000..dfbab324d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,1374 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blends sixteen 16-bit samples from src0/src1 with the 16-bit weights in *m0
+// and writes 16 bytes to dst. Per sample the kernel evaluates
+//   (src0 * m + src1 * (maxval - m) - round_offset) >> shift
+// in 32 bits, then narrows with signed (32->16) and unsigned (16->8)
+// saturation.
+static INLINE void blend_a64_d16_mask_w16_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+    int shift) {
+  const __m256i a = yy_loadu_256(src0);
+  const __m256i b = yy_loadu_256(src1);
+  const __m256i inv_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+
+  // Interleave samples and weights so that one madd yields
+  // src0 * m + src1 * (maxval - m) per 32-bit lane.
+  const __m256i sum_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(a, b),
+                                           _mm256_unpacklo_epi16(*m0, inv_m0));
+  const __m256i sum_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(a, b),
+                                           _mm256_unpackhi_epi16(*m0, inv_m0));
+
+  const __m256i out_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_lo, *v_round_offset), shift);
+  const __m256i out_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_hi, *v_round_offset), shift);
+
+  const __m256i out_w = _mm256_packs_epi32(out_lo, out_hi);
+  // The packs work per 128-bit lane; the 0xd8 permute gathers the two valid
+  // 64-bit groups into the low 128 bits in pixel order.
+  const __m256i out_b =
+      _mm256_permute4x64_epi64(_mm256_packus_epi16(out_w, out_w), 0xd8);
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(out_b));
+}
+
+// 32-pixel variant of the d16 blend kernel: *m0 weights samples 0-15 and *m1
+// weights samples 16-31. Per sample the kernel evaluates
+//   (src0 * m + src1 * (maxval - m) - round_offset) >> shift
+// in 32 bits, narrows with saturation and writes 32 bytes to dst.
+static INLINE void blend_a64_d16_mask_w32_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+    const __m256i *v_maxval, int shift) {
+  const __m256i a_lo16 = yy_loadu_256(src0);
+  const __m256i a_hi16 = yy_loadu_256(src0 + 16);
+  const __m256i b_lo16 = yy_loadu_256(src1);
+  const __m256i b_hi16 = yy_loadu_256(src1 + 16);
+  const __m256i inv_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i inv_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+
+  // Weighted sums for samples 0-15.
+  __m256i first_lo =
+      _mm256_madd_epi16(_mm256_unpacklo_epi16(a_lo16, b_lo16),
+                        _mm256_unpacklo_epi16(*m0, inv_m0));
+  __m256i first_hi =
+      _mm256_madd_epi16(_mm256_unpackhi_epi16(a_lo16, b_lo16),
+                        _mm256_unpackhi_epi16(*m0, inv_m0));
+  // Weighted sums for samples 16-31.
+  __m256i second_lo =
+      _mm256_madd_epi16(_mm256_unpacklo_epi16(a_hi16, b_hi16),
+                        _mm256_unpacklo_epi16(*m1, inv_m1));
+  __m256i second_hi =
+      _mm256_madd_epi16(_mm256_unpackhi_epi16(a_hi16, b_hi16),
+                        _mm256_unpackhi_epi16(*m1, inv_m1));
+
+  first_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(first_lo, *v_round_offset), shift);
+  first_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(first_hi, *v_round_offset), shift);
+  second_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(second_lo, *v_round_offset), shift);
+  second_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(second_hi, *v_round_offset), shift);
+
+  const __m256i first_w = _mm256_packs_epi32(first_lo, first_hi);
+  const __m256i second_w = _mm256_packs_epi32(second_lo, second_hi);
+  // The packs work per 128-bit lane; the 0xd8 permute restores pixel order.
+  const __m256i out_b =
+      _mm256_permute4x64_epi64(_mm256_packus_epi16(first_w, second_w), 0xd8);
+  _mm256_storeu_si256((__m256i *)dst, out_b);
+}
+
+// Blends h rows of 16 pixels with a mask that has one byte per output pixel;
+// the mask bytes are zero-extended to 16 bits and used as weights directly.
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m256i alpha = _mm256_cvtepu8_epi16(xx_loadu_128(mask));
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &alpha, round_offset,
+                                &alpha_max, shift);
+    // Advance every pointer to the next row.
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  }
+}
+
+// Blends h rows of w pixels, 32 pixels per step, with a mask that has one
+// byte per output pixel. Each 32-byte mask load is split into two vectors of
+// sixteen zero-extended 16-bit weights.
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i mask_bytes = yy_loadu_256(mask + col);
+      const __m256i alpha_lo =
+          _mm256_cvtepu8_epi16(_mm256_castsi256_si128(mask_bytes));
+      const __m256i alpha_hi =
+          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(mask_bytes, 1));
+
+      blend_a64_d16_mask_w32_avx2(dst + col, src0 + col, src1 + col,
+                                  &alpha_lo, &alpha_hi, round_offset,
+                                  &alpha_max, shift);
+    }
+    // Advance every pointer to the next row.
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  }
+}
+
+// Blends h rows of 16 pixels with a mask that is subsampled by 2 in both
+// directions: every weight is the rounded average, (sum + 2) >> 2, of a 2x2
+// group of mask bytes taken from two consecutive mask rows.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i byte_ones = _mm256_set1_epi8(1);
+  const __m256i round_two = _mm256_set1_epi16(2);
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m256i top = yy_loadu_256(mask);
+    const __m256i bottom = yy_loadu_256(mask + mask_stride);
+
+    // Vertical sum with unsigned saturation, then sum horizontal byte pairs
+    // into 16-bit lanes.
+    const __m256i col_sum = _mm256_adds_epu8(top, bottom);
+    const __m256i quad_sum = _mm256_maddubs_epi16(col_sum, byte_ones);
+    const __m256i alpha =
+        _mm256_srli_epi16(_mm256_add_epi16(quad_sum, round_two), 2);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &alpha, round_offset,
+                                &alpha_max, shift);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    // Two mask rows are consumed per output row.
+    mask += mask_stride << 1;
+  }
+}
+
+// Blends h rows of w pixels, 32 pixels per step, with a mask that is
+// subsampled by 2 in both directions: every weight is the rounded average,
+// (sum + 2) >> 2, of a 2x2 group of mask bytes. Each step reads 64 mask bytes
+// from each of two consecutive mask rows.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i byte_ones = _mm256_set1_epi8(1);
+  const __m256i round_two = _mm256_set1_epi16(2);
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i top_a = yy_loadu_256(mask + 2 * col);
+      const __m256i top_b = yy_loadu_256(mask + 2 * col + 32);
+      const __m256i bot_a = yy_loadu_256(mask + mask_stride + 2 * col);
+      const __m256i bot_b = yy_loadu_256(mask + mask_stride + 2 * col + 32);
+
+      // Vertical sums with unsigned saturation.
+      const __m256i col_sum_a = _mm256_adds_epu8(top_a, bot_a);
+      const __m256i col_sum_b = _mm256_adds_epu8(top_b, bot_b);
+      // Horizontal pair sums into 16-bit lanes.
+      const __m256i quad_sum_a = _mm256_maddubs_epi16(col_sum_a, byte_ones);
+      const __m256i quad_sum_b = _mm256_maddubs_epi16(col_sum_b, byte_ones);
+      const __m256i alpha_lo =
+          _mm256_srli_epi16(_mm256_add_epi16(quad_sum_a, round_two), 2);
+      const __m256i alpha_hi =
+          _mm256_srli_epi16(_mm256_add_epi16(quad_sum_b, round_two), 2);
+
+      blend_a64_d16_mask_w32_avx2(dst + col, src0 + col, src1 + col,
+                                  &alpha_lo, &alpha_hi, round_offset,
+                                  &alpha_max, shift);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    // Two mask rows are consumed per output row.
+    mask += mask_stride << 1;
+  }
+}
+
+// Blends h rows of w pixels, 16 pixels per step, with a mask that is
+// subsampled by 2 horizontally only: every weight is the rounded average of
+// two horizontally adjacent mask bytes.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i byte_ones = _mm256_set1_epi8(1);
+  const __m256i word_zeros = _mm256_setzero_si256();
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 16) {
+      const __m256i mask_bytes = yy_loadu_256(mask + 2 * col);
+      // Sum each horizontal byte pair into a 16-bit lane ...
+      const __m256i pair_sum = _mm256_maddubs_epi16(mask_bytes, byte_ones);
+      // ... and halve with rounding: avg(x, 0) == (x + 1) >> 1.
+      const __m256i alpha = _mm256_avg_epu16(pair_sum, word_zeros);
+
+      blend_a64_d16_mask_w16_avx2(dst + col, src0 + col, src1 + col, &alpha,
+                                  round_offset, &alpha_max, shift);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  }
+}
+
+// Blends h rows of w pixels, 32 pixels per step, with a mask that is
+// subsampled by 2 horizontally only: every weight is the rounded average of
+// two horizontally adjacent mask bytes. Each step reads 64 mask bytes.
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i byte_ones = _mm256_set1_epi8(1);
+  const __m256i word_zeros = _mm256_setzero_si256();
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i mask_a = yy_loadu_256(mask + 2 * col);
+      const __m256i mask_b = yy_loadu_256(mask + 2 * col + 32);
+      // Sum each horizontal byte pair into a 16-bit lane.
+      const __m256i pair_sum_a = _mm256_maddubs_epi16(mask_a, byte_ones);
+      const __m256i pair_sum_b = _mm256_maddubs_epi16(mask_b, byte_ones);
+      // Halve with rounding: avg(x, 0) == (x + 1) >> 1.
+      const __m256i alpha_lo = _mm256_avg_epu16(pair_sum_a, word_zeros);
+      const __m256i alpha_hi = _mm256_avg_epu16(pair_sum_b, word_zeros);
+
+      blend_a64_d16_mask_w32_avx2(dst + col, src0 + col, src1 + col,
+                                  &alpha_lo, &alpha_hi, round_offset,
+                                  &alpha_max, shift);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  }
+}
+
+// Blends h rows of w pixels, 16 pixels per step, with a mask that is
+// subsampled by 2 vertically only: every weight is the rounded average of two
+// vertically adjacent mask bytes.
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i byte_zeros = _mm_setzero_si128();
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 16) {
+      const __m128i top = xx_loadu_128(mask + col);
+      const __m128i bottom = xx_loadu_128(mask + mask_stride + col);
+
+      // Saturating vertical sum, halved with rounding via avg(x, 0).
+      const __m128i avg_bytes =
+          _mm_avg_epu8(_mm_adds_epu8(top, bottom), byte_zeros);
+      const __m256i alpha = _mm256_cvtepu8_epi16(avg_bytes);
+
+      blend_a64_d16_mask_w16_avx2(dst + col, src0 + col, src1 + col, &alpha,
+                                  round_offset, &alpha_max, shift);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    // Two mask rows are consumed per output row.
+    mask += mask_stride << 1;
+  }
+}
+
+// Blends h rows of w pixels, 32 pixels per step, with a mask that is
+// subsampled by 2 vertically only: every weight is the rounded average of two
+// vertically adjacent mask bytes.
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const __m256i *round_offset, int shift) {
+  const __m256i alpha_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i byte_zeros = _mm256_setzero_si256();
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i top = yy_loadu_256(mask + col);
+      const __m256i bottom = yy_loadu_256(mask + mask_stride + col);
+
+      // Saturating vertical sum, halved with rounding via avg(x, 0).
+      const __m256i avg_bytes =
+          _mm256_avg_epu8(_mm256_adds_epu8(top, bottom), byte_zeros);
+      // Zero-extend each 16-byte half into sixteen 16-bit weights.
+      const __m256i alpha_lo =
+          _mm256_cvtepu8_epi16(_mm256_castsi256_si128(avg_bytes));
+      const __m256i alpha_hi =
+          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(avg_bytes, 1));
+
+      blend_a64_d16_mask_w32_avx2(dst + col, src0 + col, src1 + col,
+                                  &alpha_lo, &alpha_hi, round_offset,
+                                  &alpha_max, shift);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    // Two mask rows are consumed per output row.
+    mask += mask_stride << 1;
+  }
+}
+
+// Low-bitdepth (bd = 8) masked blend of two CONV_BUF_TYPE sources into an
+// 8-bit destination. Computes the rounding offset and total shift from
+// conv_params, then dispatches on the mask subsampling flags (subw, subh) and
+// on the block width:
+//   w == 4, w == 8 -> SSE4.1 kernels (128-bit round offset)
+//   w == 16        -> AVX2 w16 kernels
+//   otherwise      -> AVX2 w32 kernels, which step across the row 32 pixels
+//                     at a time
+// Any (subw, subh) combination other than (0,0), (1,1) and (1,0) is handled
+// by the final else branch, which uses the subw0/subh1 kernels.
+void aom_lowbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ // Constant subtracted from every weighted sum before the final shift (see
+ // the w16/w32 kernels). It is pre-scaled by AOM_BLEND_A64_ROUND_BITS
+ // because the weighted sums carry that many extra fractional bits.
+ // NOTE(review): presumably this removes the offset carried by the
+ // compound-convolve intermediates and applies rounding -- confirm against
+ // the C reference implementation.
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ // Total right shift applied by the kernels after subtracting round_offset.
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ // The same offset broadcast for the 128-bit (SSE4.1) and 256-bit (AVX2)
+ // kernels.
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ // Mask has one entry per output pixel.
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 1) {
+ // Mask is subsampled by 2 horizontally and vertically.
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ // Mask is subsampled by 2 horizontally only.
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else {
+ // Remaining case: mask is subsampled by 2 vertically only.
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ }
+}
+
+// Blends 16 bytes from src0 with 16 bytes from src1 using the byte weights in
+// *v_m0_b (for src0) and *v_m1_b (for src1), rounds the 16-bit products with
+// yy_roundn_epu16(bits) and returns the 16 result bytes in the low 128 bits
+// of the returned vector. Only the low 8 bytes of each 128-bit lane of the
+// weight vectors are used.
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {
+  // Permute 0xd8 places source bytes 0-7 in the low half of the lower lane
+  // and bytes 8-15 in the low half of the upper lane, which is what the
+  // per-lane unpacklo below consumes.
+  const __m256i px0 =
+      _mm256_permute4x64_epi64(_mm256_castsi128_si256(xx_loadu_128(src0)), 0xd8);
+  const __m256i px1 =
+      _mm256_permute4x64_epi64(_mm256_castsi128_si256(xx_loadu_128(src1)), 0xd8);
+
+  // Interleave pixels and weights so that maddubs yields
+  // src0 * m0 + src1 * m1 per 16-bit lane.
+  const __m256i weighted = _mm256_maddubs_epi16(
+      _mm256_unpacklo_epi8(px0, px1), _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+  const __m256i rounded = yy_roundn_epu16(weighted, bits);
+  const __m256i packed = _mm256_packus_epi16(rounded, rounded);
+  // Undo the lane split so the 16 results are contiguous in the low 128 bits.
+  return _mm256_permute4x64_epi64(packed, 0xd8);
+}
+
+// Blends 32 bytes from src0 with 32 bytes from src1 using the byte weights in
+// *v_m0_b (for src0) and *v_m1_b (for src1), rounds the 16-bit products with
+// yy_roundn_epu16(bits) and returns the 32 result bytes.
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+                                       const __m256i *v_m0_b,
+                                       const __m256i *v_m1_b,
+                                       const int32_t bits) {
+  const __m256i px0 = yy_loadu_256(src0);
+  const __m256i px1 = yy_loadu_256(src1);
+
+  // Interleave pixels and weights so that maddubs yields
+  // src0 * m0 + src1 * m1 per 16-bit lane. unpacklo/unpackhi and the final
+  // pack all work per 128-bit lane, so pixel order is preserved.
+  const __m256i weighted_lo = _mm256_maddubs_epi16(
+      _mm256_unpacklo_epi8(px0, px1), _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m256i weighted_hi = _mm256_maddubs_epi16(
+      _mm256_unpackhi_epi8(px0, px1), _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+  const __m256i rounded_lo = yy_roundn_epu16(weighted_lo, bits);
+  const __m256i rounded_hi = yy_roundn_epu16(weighted_hi, bits);
+  return _mm256_packus_epi16(rounded_lo, rounded_hi);
+}
+
+// Blends h rows of 16 8-bit pixels with a mask that is subsampled by 2 in
+// both directions. For each output pixel the weight is the sum of a 2x2 group
+// of mask bytes rounded by yy_roundn_epu16(., 2); the complementary weight is
+// AOM_BLEND_A64_MAX_ALPHA minus that value. h must be at least 1 (do/while).
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i low_byte_of_word = _mm256_set1_epi16(0xFF);
+  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m256i top = yy_loadu_256(mask);
+    const __m256i bottom = yy_loadu_256(mask + mask_stride);
+    // Vertical byte sum of the two mask rows.
+    const __m256i col_sum = _mm256_add_epi8(top, bottom);
+    // Split into even and odd bytes, each widened to a 16-bit lane, and add
+    // them to obtain the 2x2 sum.
+    const __m256i even = _mm256_and_si256(col_sum, low_byte_of_word);
+    const __m256i odd =
+        _mm256_and_si256(_mm256_srli_si256(col_sum, 1), low_byte_of_word);
+    const __m256i quad_sum = _mm256_add_epi16(even, odd);
+
+    const __m256i m0_w = yy_roundn_epu16(quad_sum, 2);
+    const __m256i m0 = _mm256_packus_epi16(m0_w, m0_w);
+    const __m256i m1 = _mm256_sub_epi8(alpha_max, m0);
+
+    const __m256i blended =
+        blend_16_u8_avx2(src0, src1, &m0, &m1, AOM_BLEND_A64_ROUND_BITS);
+
+    xx_storeu_128(dst, _mm256_castsi256_si128(blended));
+    // Two mask rows are consumed per output row.
+    mask += 2 * mask_stride;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  } while (--h);
+}
+
+// 8-bit blend for widths stepped in units of 32 pixels, with the mask
+// subsampled 2x in both directions. Every group of 32 output pixels reads 64
+// mask bytes from each of two mask rows.
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_low_byte_w = _mm256_set1_epi16(0xFF);
+  const __m256i v_alpha_max_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    for (int col = 0; col < w; col += 32) {
+      const uint8_t *const m_top = mask + 2 * col;
+      const uint8_t *const m_bot = mask + mask_stride + 2 * col;
+
+      // Bytewise sums of the two rows, first and second 32 mask columns.
+      const __m256i v_rows_lo_b =
+          _mm256_add_epi8(yy_loadu_256(m_top), yy_loadu_256(m_bot));
+      const __m256i v_rows_hi_b =
+          _mm256_add_epi8(yy_loadu_256(m_top + 32), yy_loadu_256(m_bot + 32));
+
+      // Add each even column to its odd neighbour as 16-bit words.
+      const __m256i v_even_lo_w = _mm256_and_si256(v_rows_lo_b, v_low_byte_w);
+      const __m256i v_odd_lo_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rows_lo_b, 1), v_low_byte_w);
+      const __m256i v_quad_lo_w = _mm256_add_epi16(v_even_lo_w, v_odd_lo_w);
+
+      const __m256i v_even_hi_w = _mm256_and_si256(v_rows_hi_b, v_low_byte_w);
+      const __m256i v_odd_hi_w =
+          _mm256_and_si256(_mm256_srli_si256(v_rows_hi_b, 1), v_low_byte_w);
+      const __m256i v_quad_hi_w = _mm256_add_epi16(v_even_hi_w, v_odd_hi_w);
+
+      const __m256i v_w0_lo_w = yy_roundn_epu16(v_quad_lo_w, 2);
+      const __m256i v_w0_hi_w = yy_roundn_epu16(v_quad_hi_w, 2);
+
+      // packus interleaves the inputs per 128-bit lane; the 0xd8 permute
+      // puts the 64-bit quarters back in column order.
+      const __m256i v_w0_b = _mm256_permute4x64_epi64(
+          _mm256_packus_epi16(v_w0_lo_w, v_w0_hi_w), 0xd8);
+      const __m256i v_w1_b = _mm256_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m256i v_out_b = blend_32_u8_avx2(
+          src0 + col, src1 + col, &v_w0_b, &v_w1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + col, v_out_b);
+    }
+    mask += 2 * mask_stride;
+    src1 += src1_stride;
+    src0 += src0_stride;
+    dst += dst_stride;
+  } while (--h);
+}
+
+// 8-bit blend with the mask subsampled 2x in both directions. Widths 4 and 8
+// are handled here with 128-bit vectors; width 16 and every other width are
+// forwarded to the dedicated 256-bit helpers.
+static INLINE void blend_a64_mask_sx_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_split_ctl_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_alpha_max_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_round_w =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  if (w == 4) {
+    do {
+      // 8 mask bytes from each of two rows yield 4 weights.
+      const __m128i v_row0_b = xx_loadl_64(mask);
+      const __m128i v_row1_b = xx_loadl_64(mask + mask_stride);
+      const __m128i v_rows_b = _mm_add_epi8(v_row0_b, v_row1_b);
+      // NOTE(review): the shuffle table is defined outside this block;
+      // presumably it gathers even columns into the low half and odd
+      // columns into the high half so the halves can be added below.
+      const __m128i v_split_b = _mm_shuffle_epi8(v_rows_b, v_split_ctl_b);
+      const __m128i v_half0_w = _mm_cvtepu8_epi16(v_split_b);
+      const __m128i v_half1_w =
+          _mm_cvtepu8_epi16(_mm_srli_si128(v_split_b, 8));
+      const __m128i v_quad_w = _mm_add_epi16(v_half0_w, v_half1_w);
+      const __m128i v_w0_w = xx_roundn_epu16(v_quad_w, 2);
+      const __m128i v_w0_b = _mm_packus_epi16(v_w0_w, v_w0_w);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m128i v_out_b =
+          blend_4_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_32(dst, v_out_b);
+
+      mask += 2 * mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 8) {
+    do {
+      // 16 mask bytes from each of two rows yield 8 weights.
+      const __m128i v_row0_b = xx_loadu_128(mask);
+      const __m128i v_row1_b = xx_loadu_128(mask + mask_stride);
+      const __m128i v_rows_b = _mm_add_epi8(v_row0_b, v_row1_b);
+      const __m128i v_split_b = _mm_shuffle_epi8(v_rows_b, v_split_ctl_b);
+      const __m128i v_half0_w = _mm_cvtepu8_epi16(v_split_b);
+      const __m128i v_half1_w =
+          _mm_cvtepu8_epi16(_mm_srli_si128(v_split_b, 8));
+      const __m128i v_quad_w = _mm_add_epi16(v_half0_w, v_half1_w);
+      const __m128i v_w0_w = xx_roundn_epu16(v_quad_w, 2);
+      const __m128i v_w0_b = _mm_packus_epi16(v_w0_w, v_w0_w);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m128i v_out_b =
+          blend_8_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_64(dst, v_out_b);
+
+      mask += 2 * mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 16) {
+    blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h);
+  } else {
+    blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                   src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+// 16-pixel-wide 8-bit blend with the mask subsampled 2x horizontally only:
+// each weight is the rounded average (avg_epu8) of two adjacent mask bytes.
+// One mask row is consumed per output row.
+static INLINE void blend_a64_mask_sx_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m256i v_low_byte_w = _mm256_set1_epi16(0xff);
+  const __m256i v_alpha_max_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m256i v_row_b = yy_loadu_256(mask);
+    // Average every byte with its right-hand neighbour; only the results at
+    // even byte positions are kept by the mask below.
+    const __m256i v_next_b = _mm256_srli_si256(v_row_b, 1);
+    const __m256i v_avg_b = _mm256_avg_epu8(v_row_b, v_next_b);
+
+    const __m256i v_w0_w = _mm256_and_si256(v_avg_b, v_low_byte_w);
+    const __m256i v_w0_b =
+        _mm256_packus_epi16(v_w0_w, _mm256_setzero_si256());
+    const __m256i v_w1_b = _mm256_sub_epi8(v_alpha_max_b, v_w0_b);
+
+    const __m256i v_out_b = blend_16_u8_avx2(src0, src1, &v_w0_b, &v_w1_b,
+                                             AOM_BLEND_A64_ROUND_BITS);
+
+    // Only the low 128 bits of the blend result are written.
+    xx_storeu_128(dst, _mm256_castsi256_si128(v_out_b));
+
+    mask += mask_stride;
+    src1 += src1_stride;
+    src0 += src0_stride;
+    dst += dst_stride;
+  } while (--h);
+}
+
+// 8-bit blend for widths stepped in units of 32 pixels, with the mask
+// subsampled 2x horizontally only. Every group of 32 output pixels reads 64
+// bytes from a single mask row.
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_split_ctl_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+  const __m256i v_alpha_max_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    for (int col = 0; col < w; col += 32) {
+      const uint8_t *const m_row = mask + 2 * col;
+      const __m256i v_raw_lo_b = yy_loadu_256(m_row);
+      const __m256i v_raw_hi_b = yy_loadu_256(m_row + 32);
+
+      // NOTE(review): the shuffle table is defined outside this block;
+      // presumably it separates even and odd columns into the two 64-bit
+      // halves of each lane so they can be averaged against each other.
+      const __m256i v_split_lo_b =
+          _mm256_shuffle_epi8(v_raw_lo_b, v_split_ctl_b);
+      const __m256i v_split_hi_b =
+          _mm256_shuffle_epi8(v_raw_hi_b, v_split_ctl_b);
+      const __m256i v_avg_lo_b = _mm256_avg_epu8(
+          v_split_lo_b, _mm256_srli_si256(v_split_lo_b, 8));
+      const __m256i v_avg_hi_b = _mm256_avg_epu8(
+          v_split_hi_b, _mm256_srli_si256(v_split_hi_b, 8));
+
+      // Combine the low 64 bits of every lane, then restore column order
+      // across lanes with the 0xd8 permute.
+      const __m256i v_w0_b = _mm256_permute4x64_epi64(
+          _mm256_unpacklo_epi64(v_avg_lo_b, v_avg_hi_b), 0xd8);
+      const __m256i v_w1_b = _mm256_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m256i v_out_b = blend_32_u8_avx2(
+          src0 + col, src1 + col, &v_w0_b, &v_w1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + col, v_out_b);
+    }
+    mask += mask_stride;
+    src1 += src1_stride;
+    src0 += src0_stride;
+    dst += dst_stride;
+  } while (--h);
+}
+
+// 8-bit blend with the mask subsampled 2x horizontally only. Widths 4 and 8
+// are handled here with 128-bit vectors; width 16 and every other width are
+// forwarded to the dedicated 256-bit helpers.
+static INLINE void blend_a64_mask_sx_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_split_ctl_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+  const __m128i v_alpha_max_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_round_w =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  if (w == 4) {
+    do {
+      // 8 mask bytes yield 4 weights: average the two 64-bit halves of the
+      // shuffled row against each other.
+      const __m128i v_raw_b = xx_loadl_64(mask);
+      const __m128i v_split_b = _mm_shuffle_epi8(v_raw_b, v_split_ctl_b);
+      const __m128i v_first_b = _mm_unpacklo_epi64(v_split_b, v_split_b);
+      const __m128i v_second_b = _mm_unpackhi_epi64(v_split_b, v_split_b);
+      const __m128i v_w0_b = _mm_avg_epu8(v_first_b, v_second_b);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m128i v_out_b =
+          blend_4_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_32(dst, v_out_b);
+
+      mask += mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 8) {
+    do {
+      // 16 mask bytes yield 8 weights.
+      const __m128i v_raw_b = xx_loadu_128(mask);
+      const __m128i v_split_b = _mm_shuffle_epi8(v_raw_b, v_split_ctl_b);
+      const __m128i v_first_b = _mm_unpacklo_epi64(v_split_b, v_split_b);
+      const __m128i v_second_b = _mm_unpackhi_epi64(v_split_b, v_split_b);
+      const __m128i v_w0_b = _mm_avg_epu8(v_first_b, v_second_b);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m128i v_out_b =
+          blend_8_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_64(dst, v_out_b);
+
+      mask += mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 16) {
+    blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h);
+  } else {
+    blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+// 16-pixel-wide 8-bit blend with the mask subsampled 2x vertically only:
+// each weight is the rounded average (avg_epu8) of the mask bytes in two
+// consecutive mask rows. One output row is produced per iteration while two
+// mask rows are consumed.
+static INLINE void blend_a64_mask_sy_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h) {
+  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    // The weights are bytes, so the complement must be taken bytewise. A
+    // 16-bit subtract would let a borrow from the low byte corrupt the high
+    // byte whenever a mask byte exceeds AOM_BLEND_A64_MAX_ALPHA.
+    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+    xx_storeu_128(dst, v_res_b);
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+// 8-bit blend for widths stepped in units of 32 pixels, with the mask
+// subsampled 2x vertically only. Each weight is the rounded average of the
+// mask bytes at the same column in two consecutive mask rows.
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_alpha_max_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i v_row0_b = yy_loadu_256(mask + col);
+      const __m256i v_row1_b = yy_loadu_256(mask + col + mask_stride);
+      const __m256i v_w0_b = _mm256_avg_epu8(v_row0_b, v_row1_b);
+      const __m256i v_w1_b = _mm256_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m256i v_out_b = blend_32_u8_avx2(
+          src0 + col, src1 + col, &v_w0_b, &v_w1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + col, v_out_b);
+    }
+    mask += 2 * mask_stride;
+    src1 += src1_stride;
+    src0 += src0_stride;
+    dst += dst_stride;
+  } while (--h);
+}
+
+// 8-bit blend with the mask subsampled 2x vertically only. Widths 4 and 8
+// are handled here with 128-bit vectors; width 16 and every other width are
+// forwarded to the dedicated helpers.
+static INLINE void blend_a64_mask_sy_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_round_w =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  const __m128i v_alpha_max_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+
+  if (w == 4) {
+    do {
+      // Average 4 mask bytes from each of two consecutive rows.
+      const __m128i v_row0_b = xx_loadl_32(mask);
+      const __m128i v_row1_b = xx_loadl_32(mask + mask_stride);
+      const __m128i v_w0_b = _mm_avg_epu8(v_row0_b, v_row1_b);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+      const __m128i v_out_b =
+          blend_4_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_32(dst, v_out_b);
+
+      mask += 2 * mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 8) {
+    do {
+      // Average 8 mask bytes from each of two consecutive rows.
+      const __m128i v_row0_b = xx_loadl_64(mask);
+      const __m128i v_row1_b = xx_loadl_64(mask + mask_stride);
+      const __m128i v_w0_b = _mm_avg_epu8(v_row0_b, v_row1_b);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+      const __m128i v_out_b =
+          blend_8_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_64(dst, v_out_b);
+
+      mask += 2 * mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 16) {
+    blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h);
+  } else {
+    blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+// 8-bit blend for widths stepped in units of 32 pixels with a mask that is
+// not subsampled: one mask byte per output pixel, one mask row per output
+// row.
+static INLINE void blend_a64_mask_w32n_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m256i v_alpha_max_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  do {
+    for (int col = 0; col < w; col += 32) {
+      const __m256i v_w0_b = yy_loadu_256(mask + col);
+      const __m256i v_w1_b = _mm256_sub_epi8(v_alpha_max_b, v_w0_b);
+
+      const __m256i v_out_b = blend_32_u8_avx2(
+          src0 + col, src1 + col, &v_w0_b, &v_w1_b, AOM_BLEND_A64_ROUND_BITS);
+
+      yy_storeu_256(dst + col, v_out_b);
+    }
+    mask += mask_stride;
+    src1 += src1_stride;
+    src0 += src0_stride;
+    dst += dst_stride;
+  } while (--h);
+}
+
+// 8-bit blend with a mask that is not subsampled. Widths 4, 8 and 16 are
+// handled here with 128-bit vectors; every other width is forwarded to the
+// 32-pixel helper.
+static INLINE void blend_a64_mask_avx2(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+  const __m128i v_alpha_max_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_round_w =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  if (w == 4) {
+    do {
+      const __m128i v_w0_b = xx_loadl_32(mask);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+      const __m128i v_out_b =
+          blend_4_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_32(dst, v_out_b);
+
+      mask += mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 8) {
+    do {
+      const __m128i v_w0_b = xx_loadl_64(mask);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+      const __m128i v_out_b =
+          blend_8_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storel_64(dst, v_out_b);
+
+      mask += mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else if (w == 16) {
+    do {
+      const __m128i v_w0_b = xx_loadu_128(mask);
+      const __m128i v_w1_b = _mm_sub_epi8(v_alpha_max_b, v_w0_b);
+      const __m128i v_out_b =
+          blend_16_u8(src0, src1, &v_w0_b, &v_w1_b, &v_round_w);
+
+      xx_storeu_128(dst, v_out_b);
+
+      mask += mask_stride;
+      src1 += src1_stride;
+      src0 += src0_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else {
+    blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+                             src1_stride, mask, mask_stride, w, h);
+  }
+}
+
+// Public AVX2 entry point for the 8-bit masked blend. Blocks whose width or
+// height is 1 or 2 go to the C implementation; all others are dispatched on
+// the subw/subh mask subsampling flags to the matching AVX2 routine.
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+                             const uint8_t *src0, uint32_t src0_stride,
+                             const uint8_t *src1, uint32_t src1_stride,
+                             const uint8_t *mask, uint32_t mask_stride, int w,
+                             int h, int subw, int subh) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  // For power-of-two sizes, a set bit 0 or 1 in (h | w) means that w or h
+  // is 1 or 2, i.e. w <= 2 || h <= 2.
+  if (UNLIKELY((h | w) & 3)) {
+    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                         mask, mask_stride, w, h, subw, subh);
+    return;
+  }
+
+  if (subw & subh) {
+    blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, w, h);
+    return;
+  }
+  if (subw) {
+    blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, w, h);
+    return;
+  }
+  if (subh) {
+    blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, w, h);
+    return;
+  }
+  blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                      mask, mask_stride, w, h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_avx2()
+//////////////////////////////////////////////////////////////////////////////
+
+// Blends a 4x4 block of 16-bit intermediate values from src0 and src1 and
+// writes 4 rows of 4 uint16_t pixels to dst.
+//
+// *mask0 holds the 16 src0 weights as 16-bit words (rows 0-1 in the low
+// 128-bit lane, rows 2-3 in the high lane); the src1 weights are
+// *mask_max - *mask0. Per pixel the result is
+//   clip((s0 * m0 + s1 * m1 - round_offset) >> shift, clip_low, clip_high).
+static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
+    const __m256i *round_offset, int shift, const __m256i *clip_low,
+    const __m256i *clip_high, const __m256i *mask_max) {
+  // Load 4x u16 pixels from each of 4 rows from each source. The rows are
+  // fetched with 64-bit vector loads rather than by dereferencing the
+  // pointers as int64_t, which would drop const and is undefined for
+  // pointers that are not 8-byte aligned.
+  const __m128i s0_r01 = _mm_unpacklo_epi64(
+      xx_loadl_64(src0 + 0 * src0_stride), xx_loadl_64(src0 + 1 * src0_stride));
+  const __m128i s0_r23 = _mm_unpacklo_epi64(
+      xx_loadl_64(src0 + 2 * src0_stride), xx_loadl_64(src0 + 3 * src0_stride));
+  const __m128i s1_r01 = _mm_unpacklo_epi64(
+      xx_loadl_64(src1 + 0 * src1_stride), xx_loadl_64(src1 + 1 * src1_stride));
+  const __m128i s1_r23 = _mm_unpacklo_epi64(
+      xx_loadl_64(src1 + 2 * src1_stride), xx_loadl_64(src1 + 3 * src1_stride));
+  const __m256i s0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(s0_r01), s0_r23, 1);
+  const __m256i s1 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(s1_r01), s1_r23, 1);
+
+  // Generate the inverse mask
+  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
+
+  // Multiply each mask by the respective source, building the full 32-bit
+  // products from their low and high 16-bit halves.
+  const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
+  const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
+  const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
+  const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
+  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
+  // lanes. Later, packs does the same again, which cancels this out with no
+  // need for a permute. The intermediate values being reordered makes no
+  // difference.
+
+  const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
+  const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
+  const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
+  const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
+
+  const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
+  const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
+
+  // Remove the offset and scale down with an arithmetic shift.
+  const __m256i roundh =
+      _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
+  const __m256i roundl =
+      _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
+
+  // Signed saturating pack to 16 bits, then clip to the valid pixel range.
+  const __m256i pack = _mm256_packs_epi32(roundl, roundh);
+  const __m256i clip =
+      _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
+
+  // _mm256_extract_epi64 is not available on 32-bit x86, so split the
+  // register into 128-bit halves and store 64 bits at a time instead.
+  const __m128i cliph = _mm256_extracti128_si256(clip, 1);
+  xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
+  xx_storel_64(dst + 2 * dst_stride, cliph);
+  const __m128i clipl = _mm256_castsi256_si128(clip);
+  xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
+  xx_storel_64(dst + 0 * dst_stride, clipl);
+}
+
+// High bit-depth d16 blend for 4-pixel-wide blocks with a mask that is not
+// subsampled. Processes 4 rows per iteration; the loop ends when h reaches 0
+// after being decremented by 4.
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift, const __m256i *clip_low,
+    const __m256i *clip_high, const __m256i *mask_max) {
+  do {
+    // Load 4x u8 mask values from each of 4 rows and pad each to u16. The
+    // rows are fetched with xx_loadl_32() rather than by dereferencing the
+    // const byte pointer as int32_t, which is undefined for pointers that
+    // are not 4-byte aligned.
+    const __m128i mask_r01 = _mm_unpacklo_epi32(
+        xx_loadl_32(mask + 0 * mask_stride), xx_loadl_32(mask + 1 * mask_stride));
+    const __m128i mask_r23 = _mm_unpacklo_epi32(
+        xx_loadl_32(mask + 2 * mask_stride), xx_loadl_32(mask + 3 * mask_stride));
+    // Rows 0..3 end up in ascending order, 4 bytes each.
+    const __m128i mask08 = _mm_unpacklo_epi64(mask_r01, mask_r23);
+    const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
+
+    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, &mask0, round_offset, shift,
+                                      clip_low, clip_high, mask_max);
+
+    dst += dst_stride * 4;
+    src0 += src0_stride * 4;
+    src1 += src1_stride * 4;
+    mask += mask_stride * 4;
+  } while (h -= 4);
+}
+
+// High bit-depth d16 blend for 4-pixel-wide blocks with the mask subsampled
+// 2x in both directions. Each iteration reads 8 mask rows of 8 bytes and
+// produces 4 output rows; the loop ends when h reaches 0 after being
+// decremented by 4.
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+    uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift, const __m256i *clip_low,
+    const __m256i *clip_high, const __m256i *mask_max) {
+  const __m256i one_b = _mm256_set1_epi8(1);
+  const __m256i two_w = _mm256_set1_epi16(2);
+  do {
+    // Load 8 pixels from each of 8 rows of mask, even rows in one register
+    // and odd rows in the other. The rows are fetched with xx_loadl_64()
+    // rather than by dereferencing the const byte pointer as int64_t, which
+    // is undefined for pointers that are not 8-byte aligned.
+    const __m128i m02 = _mm_unpacklo_epi64(xx_loadl_64(mask + 0 * mask_stride),
+                                           xx_loadl_64(mask + 2 * mask_stride));
+    const __m128i m46 = _mm_unpacklo_epi64(xx_loadl_64(mask + 4 * mask_stride),
+                                           xx_loadl_64(mask + 6 * mask_stride));
+    const __m128i m13 = _mm_unpacklo_epi64(xx_loadl_64(mask + 1 * mask_stride),
+                                           xx_loadl_64(mask + 3 * mask_stride));
+    const __m128i m57 = _mm_unpacklo_epi64(xx_loadl_64(mask + 5 * mask_stride),
+                                           xx_loadl_64(mask + 7 * mask_stride));
+    const __m256i m0246 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(m02), m46, 1);
+    const __m256i m1357 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(m13), m57, 1);
+
+    // (Saturating) add together rows, then use madd against a vector of
+    // ones to add adjacent pixels. Finally, divide each value by 4 (with
+    // rounding).
+    const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
+    const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
+    const __m256i mask0 =
+        _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
+
+    highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+                                      src1_stride, &mask0, round_offset, shift,
+                                      clip_low, clip_high, mask_max);
+
+    dst += dst_stride * 4;
+    src0 += src0_stride * 4;
+    src1 += src1_stride * 4;
+    mask += mask_stride * 8;
+  } while (h -= 4);
+}
+
+// Blends one register of sixteen 16-bit values for the 8-wide kernel below.
+// Per element the result is
+//   clip((s0 * m0 + s1 * (mask_max - m0) - round_offset) >> shift,
+//        clip_low, clip_high).
+static INLINE __m256i highbd_blend_a64_d16_mask_w8_2rows_avx2(
+    const __m256i *s0, const __m256i *s1, const __m256i *mask0,
+    const __m256i *round_offset, int shift, const __m256i *clip_low,
+    const __m256i *clip_high, const __m256i *mask_max) {
+  // Inverse mask
+  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
+
+  // Full 32-bit products assembled from their low and high 16-bit halves.
+  // unpack interleaves within each 128-bit lane; the matching per-lane
+  // packs below restores the original element order without a permute.
+  const __m256i prod0_lo16 = _mm256_mullo_epi16(*mask0, *s0);
+  const __m256i prod0_hi16 = _mm256_mulhi_epu16(*mask0, *s0);
+  const __m256i prod1_lo16 = _mm256_mullo_epi16(mask1, *s1);
+  const __m256i prod1_hi16 = _mm256_mulhi_epu16(mask1, *s1);
+
+  const __m256i sum_l =
+      _mm256_add_epi32(_mm256_unpacklo_epi16(prod0_lo16, prod0_hi16),
+                       _mm256_unpacklo_epi16(prod1_lo16, prod1_hi16));
+  const __m256i sum_h =
+      _mm256_add_epi32(_mm256_unpackhi_epi16(prod0_lo16, prod0_hi16),
+                       _mm256_unpackhi_epi16(prod1_lo16, prod1_hi16));
+
+  // Remove the offset and scale down with an arithmetic shift.
+  const __m256i scaled_l =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_l, *round_offset), shift);
+  const __m256i scaled_h =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_h, *round_offset), shift);
+
+  // Pack each i32 down to an i16 with saturation, then clip to valid range.
+  const __m256i packed = _mm256_packs_epi32(scaled_l, scaled_h);
+  return _mm256_min_epi16(_mm256_max_epi16(packed, *clip_low), *clip_high);
+}
+
+// Blends an 8x4 block of 16-bit intermediate values from src0 and src1 and
+// writes 4 rows of 8 uint16_t pixels to dst. *mask0a carries the 16-bit src0
+// weights for rows 0-1 and *mask0b those for rows 2-3.
+static INLINE void highbd_blend_a64_d16_mask_w8_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+    const __m256i *mask0b, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  // Load 8x u16 pixels from each of 4 rows from each source, two rows per
+  // register.
+  const __m256i s0_rows01 =
+      yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
+  const __m256i s0_rows23 =
+      yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
+  const __m256i s1_rows01 =
+      yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
+  const __m256i s1_rows23 =
+      yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
+
+  const __m256i out_rows01 = highbd_blend_a64_d16_mask_w8_2rows_avx2(
+      &s0_rows01, &s1_rows01, mask0a, round_offset, shift, clip_low, clip_high,
+      mask_max);
+  const __m256i out_rows23 = highbd_blend_a64_d16_mask_w8_2rows_avx2(
+      &s0_rows23, &s1_rows23, mask0b, round_offset, shift, clip_low, clip_high,
+      mask_max);
+
+  // Store 8x u16 pixels to each of 4 rows in the destination
+  yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, out_rows01);
+  yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, out_rows23);
+}
+
+// High bit-depth d16 blend for 8-pixel-wide blocks with a mask that is not
+// subsampled. Processes 4 rows per iteration; the loop ends when h reaches 0
+// after being decremented by 4.
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+    int mask_stride, int h, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  do {
+    // Load 8x u8 pixels from each of 4 rows in the mask. The rows are
+    // fetched with xx_loadl_64() rather than by dereferencing the const byte
+    // pointer as a 64-bit integer, which is undefined for pointers that are
+    // not 8-byte aligned.
+    //
+    // Within each pair the later row goes in the low 64 bits and the earlier
+    // row in the high 64 bits. NOTE(review): this presumably mirrors the
+    // (hi, lo) argument order of yy_loadu2_128() used for the sources in
+    // highbd_blend_a64_d16_mask_w8_avx2() - confirm against its definition.
+    const __m128i mask0a8 = _mm_unpacklo_epi64(
+        xx_loadl_64(mask + mask_stride), xx_loadl_64(mask));
+    const __m128i mask0b8 = _mm_unpacklo_epi64(
+        xx_loadl_64(mask + 3 * mask_stride), xx_loadl_64(mask + 2 * mask_stride));
+    const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
+    const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
+
+    highbd_blend_a64_d16_mask_w8_avx2(
+        dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+        round_offset, shift, clip_low, clip_high, mask_max);
+
+    dst += dst_stride * 4;
+    src0 += src0_stride * 4;
+    src1 += src1_stride * 4;
+    mask += mask_stride * 4;
+  } while (h -= 4);
+}
+
+// High bit-depth d16 blend for 8-pixel-wide blocks with the mask subsampled
+// 2x in both directions. Each iteration reads 8 mask rows of 16 bytes and
+// produces 4 output rows; the loop ends when h reaches 0 after being
+// decremented by 4.
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+    int mask_stride, int h, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  const __m256i v_ones_b = _mm256_set1_epi8(1);
+  const __m256i v_half_w = _mm256_set1_epi16(2);
+  do {
+    // Output rows 0-1: mask rows 0/2 paired with mask rows 1/3.
+    const __m256i rows_02 =
+        yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
+    const __m256i rows_13 =
+        yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
+    // Output rows 2-3: mask rows 4/6 paired with mask rows 5/7.
+    const __m256i rows_46 =
+        yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
+    const __m256i rows_57 =
+        yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
+
+    // (Saturating) add vertically adjacent rows, then use madd against a
+    // vector of ones to add horizontally adjacent pixels.
+    const __m256i vsum_a = _mm256_adds_epu8(rows_02, rows_13);
+    const __m256i vsum_b = _mm256_adds_epu8(rows_46, rows_57);
+    const __m256i quad_a = _mm256_maddubs_epi16(vsum_a, v_ones_b);
+    const __m256i quad_b = _mm256_maddubs_epi16(vsum_b, v_ones_b);
+
+    // Divide each sum of four by 4 (with rounding).
+    const __m256i weights_a =
+        _mm256_srli_epi16(_mm256_add_epi16(quad_a, v_half_w), 2);
+    const __m256i weights_b =
+        _mm256_srli_epi16(_mm256_add_epi16(quad_b, v_half_w), 2);
+
+    highbd_blend_a64_d16_mask_w8_avx2(
+        dst, dst_stride, src0, src0_stride, src1, src1_stride, &weights_a,
+        &weights_b, round_offset, shift, clip_low, clip_high, mask_max);
+
+    mask += mask_stride * 8;
+    src1 += src1_stride * 4;
+    src0 += src0_stride * 4;
+    dst += dst_stride * 4;
+  } while (h -= 4);
+}
+
+// Blends one row of sixteen 16-bit values for the 16-wide kernel below.
+// Per element the result is
+//   clip((s0 * m0 + s1 * (mask_max - m0) - round_offset) >> shift,
+//        clip_low, clip_high).
+static INLINE __m256i highbd_blend_a64_d16_mask_w16_row_avx2(
+    const __m256i *s0, const __m256i *s1, const __m256i *mask0,
+    const __m256i *round_offset, int shift, const __m256i *clip_low,
+    const __m256i *clip_high, const __m256i *mask_max) {
+  // Inverse mask
+  const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
+
+  // Full 32-bit products assembled from their low and high 16-bit halves.
+  // unpack interleaves within each 128-bit lane; the matching per-lane
+  // packs below restores the original element order without a permute.
+  const __m256i prod0_lo16 = _mm256_mullo_epi16(*mask0, *s0);
+  const __m256i prod0_hi16 = _mm256_mulhi_epu16(*mask0, *s0);
+  const __m256i prod1_lo16 = _mm256_mullo_epi16(mask1, *s1);
+  const __m256i prod1_hi16 = _mm256_mulhi_epu16(mask1, *s1);
+
+  const __m256i sum_l =
+      _mm256_add_epi32(_mm256_unpacklo_epi16(prod0_lo16, prod0_hi16),
+                       _mm256_unpacklo_epi16(prod1_lo16, prod1_hi16));
+  const __m256i sum_h =
+      _mm256_add_epi32(_mm256_unpackhi_epi16(prod0_lo16, prod0_hi16),
+                       _mm256_unpackhi_epi16(prod1_lo16, prod1_hi16));
+
+  // Remove the offset and scale down with an arithmetic shift.
+  const __m256i scaled_l =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_l, *round_offset), shift);
+  const __m256i scaled_h =
+      _mm256_srai_epi32(_mm256_sub_epi32(sum_h, *round_offset), shift);
+
+  // Signed saturating pack from i32 to i16, then clip to the valid range.
+  const __m256i packed = _mm256_packs_epi32(scaled_l, scaled_h);
+  return _mm256_min_epi16(_mm256_max_epi16(packed, *clip_low), *clip_high);
+}
+
+// Blends a 16x2 block of 16-bit intermediate values from src0 and src1 and
+// writes 2 rows of 16 uint16_t pixels to dst. *mask0a carries the 16-bit
+// src0 weights for the first row and *mask0b those for the second row.
+static INLINE void highbd_blend_a64_d16_mask_w16_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+    const __m256i *mask0b, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  // Load 16x pixels from each of 2 rows from each source
+  const __m256i s0_row0 = yy_loadu_256(src0);
+  const __m256i s0_row1 = yy_loadu_256(src0 + src0_stride);
+  const __m256i s1_row0 = yy_loadu_256(src1);
+  const __m256i s1_row1 = yy_loadu_256(src1 + src1_stride);
+
+  const __m256i out_row0 = highbd_blend_a64_d16_mask_w16_row_avx2(
+      &s0_row0, &s1_row0, mask0a, round_offset, shift, clip_low, clip_high,
+      mask_max);
+  const __m256i out_row1 = highbd_blend_a64_d16_mask_w16_row_avx2(
+      &s0_row1, &s1_row1, mask0b, round_offset, shift, clip_low, clip_high,
+      mask_max);
+
+  // Store 16 pixels to each of the 2 destination rows
+  yy_storeu_256(dst, out_row0);
+  yy_storeu_256(dst + dst_stride, out_row1);
+}
+
+// High bit-depth d16 blend for widths stepped in units of 16 pixels with a
+// mask that is not subsampled. Two rows are processed per outer iteration.
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+    int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  for (int row = 0; row < h; row += 2) {
+    for (int col = 0; col < w; col += 16) {
+      // Load 16x u8 alpha-mask values from each of two rows and pad to u16
+      const __m256i weights_row0 =
+          _mm256_cvtepu8_epi16(xx_loadu_128(mask + col));
+      const __m256i weights_row1 =
+          _mm256_cvtepu8_epi16(xx_loadu_128(mask + mask_stride + col));
+
+      highbd_blend_a64_d16_mask_w16_avx2(
+          dst + col, dst_stride, src0 + col, src0_stride, src1 + col,
+          src1_stride, &weights_row0, &weights_row1, round_offset, shift,
+          clip_low, clip_high, mask_max);
+    }
+    mask += mask_stride * 2;
+    src1 += src1_stride * 2;
+    src0 += src0_stride * 2;
+    dst += dst_stride * 2;
+  }
+}
+
+// High bit-depth d16 blend for widths stepped in units of 16 pixels with the
+// mask subsampled 2x in both directions. Each outer iteration reads 4 mask
+// rows and produces 2 output rows.
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+    uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+    int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+    const __m256i *clip_low, const __m256i *clip_high,
+    const __m256i *mask_max) {
+  const __m256i v_ones_b = _mm256_set1_epi8(1);
+  const __m256i v_half_w = _mm256_set1_epi16(2);
+  for (int row = 0; row < h; row += 2) {
+    for (int col = 0; col < w; col += 16) {
+      // Load 32x u8 alpha-mask values from each of four rows
+      const __m256i mrow0 = yy_loadu_256(mask + 0 * mask_stride + 2 * col);
+      const __m256i mrow1 = yy_loadu_256(mask + 1 * mask_stride + 2 * col);
+      const __m256i mrow2 = yy_loadu_256(mask + 2 * mask_stride + 2 * col);
+      const __m256i mrow3 = yy_loadu_256(mask + 3 * mask_stride + 2 * col);
+
+      // (Saturating) add pairs of rows, then use madd against a vector of
+      // ones to add horizontally adjacent values.
+      const __m256i quad_top =
+          _mm256_maddubs_epi16(_mm256_adds_epu8(mrow0, mrow1), v_ones_b);
+      const __m256i quad_bot =
+          _mm256_maddubs_epi16(_mm256_adds_epu8(mrow2, mrow3), v_ones_b);
+
+      // Divide down each sum of four with rounding
+      const __m256i weights_row0 =
+          _mm256_srli_epi16(_mm256_add_epi16(quad_top, v_half_w), 2);
+      const __m256i weights_row1 =
+          _mm256_srli_epi16(_mm256_add_epi16(quad_bot, v_half_w), 2);
+
+      highbd_blend_a64_d16_mask_w16_avx2(
+          dst + col, dst_stride, src0 + col, src0_stride, src1 + col,
+          src1_stride, &weights_row0, &weights_row1, round_offset, shift,
+          clip_low, clip_high, mask_max);
+    }
+    mask += mask_stride * 4;
+    src1 += src1_stride * 2;
+    src0 += src0_stride * 2;
+    dst += dst_stride * 2;
+  }
+}
+
+// Public AVX2 entry point for the high bit-depth d16 masked blend. AVX2
+// kernels exist for a mask that is not subsampled (subw == 0 && subh == 0)
+// and for one subsampled in both directions (subw == 1 && subh == 1); every
+// other combination is handled by the C implementation.
+void aom_highbd_blend_a64_d16_mask_avx2(
+    uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+  // Offset subtracted from every weighted sum before the final shift.
+  const int32_t round_offset =
+      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+       (1 << (round_bits - 1)))
+      << AOM_BLEND_A64_ROUND_BITS;
+  const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
+
+  const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
+  const __m256i clip_low = _mm256_setzero_si256();
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  const int mask_full_res = (subw == 0 && subh == 0);
+  const int mask_sub_both = (subw == 1 && subh == 1);
+
+  if (!mask_full_res && !mask_sub_both) {
+    // Sub-sampling in only one axis doesn't seem to happen very much, so fall
+    // back to the vanilla C implementation instead of having all the optimised
+    // code for these.
+    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, w, h, subw,
+                                    subh, conv_params, bd);
+    return;
+  }
+
+  if (w == 4) {
+    if (mask_full_res) {
+      highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    } else {
+      highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    }
+  } else if (w == 8) {
+    if (mask_full_res) {
+      highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    } else {
+      highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    }
+  } else {  // >= 16
+    if (mask_full_res) {
+      highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    } else {
+      highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+          dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+          mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+          &mask_max);
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 0000000000..58a7345ec2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,1560 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) { /*
+ * 8-bit blend kernel for 4-pixel-wide blocks, mask not sub-sampled.
+ * Per row: read the mask weights m, form the complementary weights
+ * (AOM_BLEND_A64_MAX_ALPHA - m), blend src0/src1 through blend_4_u8() and
+ * store the row to dst. All four pointers advance by their own stride.
+ * The do/while body always runs once, so h must be >= 1.
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_4_u8().
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);  // Weights for src0 (presumably a 32-bit load = 4 mask bytes).
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) { /*
+ * 8-bit blend kernel for 8-pixel-wide blocks, mask not sub-sampled.
+ * Same row loop as the 4-wide kernel but using the 64-bit load/store
+ * helpers and blend_8_u8(). h must be >= 1 (do/while).
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_8_u8().
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);  // Weights for src0 (presumably a 64-bit load = 8 mask bytes).
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for widths handled 16 pixels at a time, mask not
+ * sub-sampled. The inner loop steps c by 16 until c >= w, so a width that
+ * is not a multiple of 16 would touch bytes past column w; the dispatcher
+ * only routes multiples of 16 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_16_u8().
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0_b = xx_loadu_128(mask + c);  // 16 src0 weights for columns c..c+15.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 4-pixel-wide blocks with a horizontally
+ * sub-sampled mask: the mask row is twice as wide as the output row and
+ * each output weight is the rounded average of two neighbouring mask
+ * bytes. h must be >= 1 (do/while).
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);  // Byte-shuffle table; presumably gathers even bytes into one 64-bit half and odd bytes into the other - confirm at its definition.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_4_u8().
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);  // 8 mask bytes cover 4 output pixels.
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);  // Low 64-bit half of the shuffled bytes, duplicated.
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);  // High 64-bit half of the shuffled bytes, duplicated.
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 8-pixel-wide blocks with a horizontally
+ * sub-sampled mask: 16 mask bytes are read per row and reduced to 8
+ * weights by averaging the two 64-bit halves of the shuffled register.
+ * h must be >= 1 (do/while).
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);  // Byte-shuffle table; presumably separates even and odd bytes - confirm at its definition.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_8_u8().
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);  // 16 mask bytes cover 8 output pixels.
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);  // Low 64-bit half of the shuffled bytes, duplicated.
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);  // High 64-bit half of the shuffled bytes, duplicated.
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for widths handled 16 pixels at a time with a
+ * horizontally sub-sampled mask. Each group of 16 output pixels consumes
+ * 32 mask bytes starting at mask + 2 * c. The dispatcher only routes
+ * widths that are multiples of 16 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);  // Byte-shuffle table; presumably separates even and odd bytes - confirm at its definition.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_16_u8().
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);  // Mask bytes for output columns c..c+7.
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);  // Mask bytes for output columns c+8..c+15.
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);  // Low halves of both shuffled registers.
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);  // High halves of both shuffled registers.
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 4-pixel-wide blocks with a vertically sub-sampled
+ * mask: each output row uses the rounded average of two consecutive mask
+ * rows, and the mask pointer advances two rows per output row.
+ * h counts output rows and must be >= 1 (do/while).
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_4_u8().
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);  // Upper mask row of the pair.
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);  // Lower mask row of the pair.
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 8-pixel-wide blocks with a vertically sub-sampled
+ * mask: weights are the rounded average of two consecutive mask rows and
+ * the mask pointer advances two rows per output row.
+ * h counts output rows and must be >= 1 (do/while).
+ */
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_8_u8().
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);  // Upper mask row of the pair.
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);  // Lower mask row of the pair.
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for widths handled 16 pixels at a time with a
+ * vertically sub-sampled mask. Every 16-column group averages the mask
+ * bytes of two consecutive mask rows. The dispatcher only routes widths
+ * that are multiples of 16 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_16_u8().
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = xx_loadu_128(mask + c);  // Upper mask row of the pair.
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);  // Lower mask row of the pair.
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);  // (a + b + 1) >> 1 per byte.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;  // Two mask rows are consumed per output row.
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 4-pixel-wide blocks with the mask sub-sampled in
+ * both directions: each output weight is derived from a 2x2 group of mask
+ * bytes (two rows summed bytewise, then horizontal neighbours summed as
+ * 16-bit words, then reduced with xx_roundn_epu16(..., 2)).
+ * h counts output rows and must be >= 1 (do/while).
+ */
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);  // Byte-shuffle table; presumably separates even and odd bytes - confirm at its definition.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_4_u8().
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);  // Upper mask row: 8 bytes for 4 output pixels.
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);  // Lower mask row.
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);  // Vertical sum in 8 bits; NOTE(review): wraps unless mask values are small enough (presumably <= AOM_BLEND_A64_MAX_ALPHA) - confirm.
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);  // Low 8 shuffled bytes widened to 16 bits.
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));  // High 8 shuffled bytes widened to 16 bits.
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);  // Horizontal sum, completing the 2x2 total.
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);  // Presumably a rounded shift right by 2 (divide by 4) - confirm in synonyms.h.
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);  // Back to bytes for the 8-bit blend helper.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for 8-pixel-wide blocks with the mask sub-sampled in
+ * both directions: 16 mask bytes from each of two rows are reduced to 8
+ * weights, one per 2x2 group.
+ * h counts output rows and must be >= 1 (do/while).
+ */
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);  // Byte-shuffle table; presumably separates even and odd bytes - confirm at its definition.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_8_u8().
+ (void)w;  // Unused: width is implied by the kernel; kept for the common blend_fn signature.
+
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);  // Upper mask row: 16 bytes for 8 output pixels.
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);  // Lower mask row.
+
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);  // Vertical sum in 8 bits; NOTE(review): relies on mask values small enough not to wrap - confirm.
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);  // Low 8 shuffled bytes widened to 16 bits.
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));  // High 8 shuffled bytes widened to 16 bits.
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);  // Horizontal sum, completing the 2x2 total.
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);  // Presumably a rounded shift right by 2 (divide by 4) - confirm in synonyms.h.
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);  // Back to bytes for the 8-bit blend helper.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) { /*
+ * 8-bit blend kernel for widths handled 16 pixels at a time with the mask
+ * sub-sampled in both directions. Each group of 16 output pixels consumes
+ * 32 mask bytes from each of two mask rows. Even and odd bytes are
+ * isolated with an AND mask instead of a shuffle. The dispatcher only
+ * routes widths that are multiples of 16 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);  // 0x00FF in every 16-bit lane: keeps the low byte of each word.
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));  // Rounding constant handed to blend_16_u8().
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);  // Upper row, output columns c..c+7.
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);  // Upper row, output columns c+8..c+15.
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);  // Lower row, output columns c..c+7.
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);  // Lower row, output columns c+8..c+15.
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);  // Vertical sums in 8 bits; NOTE(review): relies on mask values small enough not to wrap - confirm.
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);  // Even-indexed bytes as 16-bit words.
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);  // Odd-indexed bytes as 16-bit words.
+ const __m128i v_rvsbh_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);  // 2x2 totals, columns c..c+7.
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);  // 2x2 totals, columns c+8..c+15.
+
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);  // Presumably a rounded shift right by 2 (divide by 4) - confirm in synonyms.h.
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);  // 16 byte weights for src0.
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);  // Complementary weights for src1.
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;  // Two mask rows are consumed per output row.
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) { /*
+ * SSE4.1 entry point for the 8-bit A64 mask blend. Selects one of the
+ * static kernels in this file from the block width and from whether the
+ * mask is sub-sampled horizontally (subw != 0) and/or vertically
+ * (subh != 0). Blocks whose width or height is below 4 are forwarded to
+ * aom_blend_a64_mask_c(). w and h are asserted to be powers of two >= 1.
+ */
+ typedef void (*blend_fn)(
+ uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: width_index X subx X suby
+ static const blend_fn blend[3][2][2] = {
+ { // w % 16 == 0
+ { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
+ { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
+ { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
+ { // w == 8
+ { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
+ { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // Blending in place requires matching strides.
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2); for powers of two, a low bit set in (h | w) means one of them is 1 or 2.
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subw, subh);
+ } else {
+ blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,  // (w >> 2) & 3 maps 4 -> 1, 8 -> 2 and any larger power of two -> 0.
+ src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { /*
+ * Generic high-bitdepth (16-bit pixel) blend for 4-pixel-wide blocks, mask
+ * not sub-sampled. The per-row arithmetic is delegated to the `blend`
+ * callback so the same loop serves the b10 and b12 variants. Mask bytes
+ * are widened to 16-bit lanes before the complementary weights are formed.
+ * Strides are applied to uint16_t pointers, i.e. in pixels.
+ * h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);  // 4 mask bytes (32-bit load).
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);  // Zero-extend to 16-bit weights for src0.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);  // 4 x 16-bit pixels.
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, unsubsampled kernel bound to blend_4_b10 (dispatch slot for bd 8 or 10).
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, unsubsampled kernel bound to blend_4_b12 (dispatch slot for bd 12).
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) { /*
+ * Generic high-bitdepth (16-bit pixel) blend for widths handled 8 pixels
+ * at a time, mask not sub-sampled. The inner loop steps c by 8 until
+ * c >= w; the dispatcher only routes multiples of 8 here. The per-group
+ * arithmetic is delegated to the `blend` callback.
+ * h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = xx_loadl_64(mask + c);  // 8 mask bytes for columns c..c+7.
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);  // Zero-extend to 16-bit weights for src0.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);  // 8 x 16-bit pixels.
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, unsubsampled kernel bound to blend_8_b10 (dispatch slot for bd 8 or 10).
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, unsubsampled kernel bound to blend_8_b12 (dispatch slot for bd 12).
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for 4-pixel-wide blocks with a horizontally
+ * sub-sampled mask. Every byte is averaged with its right-hand neighbour;
+ * keeping only the low byte of each 16-bit lane then leaves
+ * avg(mask[2i], mask[2i+1]) as the 16-bit weight for output pixel i.
+ * h must be >= 1 (do/while).
+ */
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);  // 0x00FF in every 16-bit lane.
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);  // 8 mask bytes cover 4 output pixels.
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));  // Byte i becomes (m[i] + m[i+1] + 1) >> 1.
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);  // Keep even-indexed averages as 16-bit weights.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);  // 4 x 16-bit pixels.
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, horizontally sub-sampled kernel bound to blend_4_b10.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, horizontally sub-sampled kernel bound to blend_4_b12.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for widths handled 8 pixels at a time with a
+ * horizontally sub-sampled mask. Each group of 8 output pixels consumes 16
+ * mask bytes starting at mask + 2 * c; adjacent byte pairs are averaged
+ * and kept as 16-bit weights. The dispatcher only routes multiples of 8
+ * here. h must be >= 1 (do/while).
+ */
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);  // 0x00FF in every 16-bit lane.
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);  // 16 mask bytes cover 8 output pixels.
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));  // Byte i becomes (m[i] + m[i+1] + 1) >> 1.
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);  // Keep even-indexed averages as 16-bit weights.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, horizontally sub-sampled kernel bound to blend_8_b10.
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, horizontally sub-sampled kernel bound to blend_8_b12.
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for 4-pixel-wide blocks with a vertically
+ * sub-sampled mask: the weights are the rounded byte average of two
+ * consecutive mask rows, widened to 16 bits. The mask pointer advances two
+ * rows per output row. h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);  // Upper mask row of the pair.
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);  // Lower mask row of the pair.
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);  // (a + b + 1) >> 1 per byte.
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);  // Zero-extend to 16-bit weights for src0.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);  // 4 x 16-bit pixels.
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, vertically sub-sampled kernel bound to blend_4_b10.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, vertically sub-sampled kernel bound to blend_4_b12.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for widths handled 8 pixels at a time with a
+ * vertically sub-sampled mask. Each 8-column group averages the mask bytes
+ * of two consecutive mask rows and widens them to 16-bit weights. The
+ * dispatcher only routes multiples of 8 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadl_64(mask + c);  // Upper mask row of the pair.
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);  // Lower mask row of the pair.
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);  // (a + b + 1) >> 1 per byte.
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);  // Zero-extend to 16-bit weights for src0.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;  // Two mask rows are consumed per output row.
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, vertically sub-sampled kernel bound to blend_8_b10.
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, vertically sub-sampled kernel bound to blend_8_b12.
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for 4-pixel-wide blocks with the mask
+ * sub-sampled in both directions. Two mask rows are summed bytewise, then
+ * even- and odd-indexed bytes are summed as 16-bit words to give one 2x2
+ * total per output pixel, which xx_roundn_epu16(..., 2) reduces to the
+ * final weight. h must be >= 1 (do/while).
+ */
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);  // 0x00FF in every 16-bit lane.
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);  // Upper mask row: 8 bytes for 4 output pixels.
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);  // Lower mask row.
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);  // Vertical sum in 8 bits; NOTE(review): relies on mask values small enough not to wrap - confirm.
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);  // Even-indexed bytes as 16-bit words.
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);  // Odd-indexed bytes as 16-bit words.
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);  // 2x2 totals.
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);  // Presumably a rounded shift right by 2 (divide by 4) - confirm in synonyms.h.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);  // 4 x 16-bit pixels.
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, 2x2 sub-sampled kernel bound to blend_4_b10.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 4-wide, 2x2 sub-sampled kernel bound to blend_4_b12.
+ (void)w;  // Unused: the kernel is fixed at width 4.
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) { /*
+ * Generic high-bitdepth blend for widths handled 8 pixels at a time with
+ * the mask sub-sampled in both directions. Each group of 8 output pixels
+ * consumes 16 mask bytes from each of two mask rows and reduces them to
+ * eight 2x2-derived 16-bit weights. The dispatcher only routes multiples
+ * of 8 here. h must be >= 1 (do/while).
+ */
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);  // 0x00FF in every 16-bit lane.
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);  // Upper mask row: 16 bytes for 8 output pixels.
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);  // Lower mask row.
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);  // Vertical sum in 8 bits; NOTE(review): relies on mask values small enough not to wrap - confirm.
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);  // Even-indexed bytes as 16-bit words.
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);  // Odd-indexed bytes as 16-bit words.
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);  // 2x2 totals.
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);  // Presumably a rounded shift right by 2 (divide by 4) - confirm in synonyms.h.
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);  // Complementary weights for src1.
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;  // Two mask rows are consumed per output row.
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, 2x2 sub-sampled kernel bound to blend_8_b10.
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {  // blend_fn-shaped wrapper: 8n-wide, 2x2 sub-sampled kernel bound to blend_8_b12.
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h,
+ int subw, int subh, int bd) { /*
+ * SSE4.1 entry point for the high-bitdepth A64 mask blend. Selects a
+ * static kernel from the bit depth (12 versus 8/10), the block width
+ * (4 versus multiples of 8) and whether the mask is sub-sampled
+ * horizontally (subw != 0) and/or vertically (subh != 0). Blocks whose
+ * width or height is below 4 are forwarded to
+ * aom_highbd_blend_a64_mask_c(). The *_8 pointers are converted with
+ * CONVERT_TO_SHORTPTR() before the 16-bit kernels are called.
+ */
+ typedef void (*blend_fn)(
+ uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: bd_index X width_index X subw X subh
+ static const blend_fn blend[2][2][2][2] = {
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
+ { blend_a64_mask_b10_sx_w8n_sse4_1,
+ blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
+ { blend_a64_mask_b10_sx_w4_sse4_1,
+ blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
+ { // bd == 12
+ { // w % 8 == 0
+ { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
+ { blend_a64_mask_b12_sx_w8n_sse4_1,
+ blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
+ { blend_a64_mask_b12_sx_w4_sse4_1,
+ blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));  // Blending in place requires matching strides.
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2); for powers of two, a low bit set in (h | w) means one of them is 1 or 2.
+ aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](  // (w >> 2) & 1 maps 4 -> 1 and any larger power of two -> 0.
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, w, h);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void blend_a64_d16_mask_w16_sse41(  // Blends 16 src0/src1 values with per-pixel weights m0/m1 and stores 16 bytes to dst.
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);  // weight for src1, pixels 0..7
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);  // weight for src1, pixels 8..15
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),  // interleave (s0, s1) and (m, max - m):
+ _mm_unpacklo_epi16(*m0, max_minus_m0));  // madd yields s0 * m + s1 * (max - m) per 32-bit lane
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);  // remove the offset, then arithmetic shift right
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);  // signed-saturating narrow 32 -> 16 bit
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);  // unsigned-saturating narrow 16 -> 8 bit (clamps to 0..255)
+
+ _mm_storeu_si128((__m128i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(  // Blends h rows in 16-column steps; one mask byte per output pixel.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {  // always handles a full 16 columns, so w is expected to be a multiple of 16
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);  // zero-extend mask bytes 0..7 to 16 bit
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));  // zero-extend mask bytes 8..15 to 16 bit
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(  // Blends h rows in 16-column steps; each weight is the rounded mean of a 2x2 mask group.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);  // 32 mask bytes from the current mask row cover 16 output pixels
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);  // same columns, next mask row
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);  // vertical pair sums (saturating byte add)
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);  // multiply by 1 and add adjacent bytes: horizontal pair sums in 16-bit lanes
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);  // (sum of 4 + 2) >> 2
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;  // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(  // Blends h rows in 16-column steps; each weight is the rounded mean of 2 horizontally adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);  // 32 mask bytes cover 16 output pixels
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);  // multiply by 1 and add adjacent bytes: horizontal pair sums
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);  // average with zero = (sum + 1) >> 1
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;  // one mask row per output row (no vertical subsampling)
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(  // Blends h rows in 16-column steps; each weight is the rounded mean of 2 vertically adjacent mask bytes.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);  // current mask row
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);  // next mask row, same columns
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);  // (row0 + row1 + 1) >> 1 per byte; the add saturates at 255
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);  // zero-extend bytes 0..7 to 16 bit
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));  // zero-extend bytes 8..15 to 16 bit
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;  // two mask rows are consumed per output row
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(  // Entry point: computes rounding constants, then dispatches on (subw, subh) and width.
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;  // this low-bit-depth entry point fixes the bit depth at 8
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ const int round_offset =  // constant the kernels subtract from the weighted sum before shifting
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;  // total right shift applied after the subtraction
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);  // broadcast to all four 32-bit lanes
+
+ if (subw == 0 && subh == 0) {  // mask has the same resolution as the output
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(  // w4/w8 kernels are defined outside this chunk
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:  // w >= 16, given the asserts above
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {  // mask subsampled 2x in both directions
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {  // mask subsampled 2x horizontally only
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else {  // every other combination; for subw/subh in {0, 1} this is subw == 0 && subh == 1
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_sse4_1()
+//////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(  // Blends a 4-wide, 4-row tile of 16-bit samples using pre-widened 16-bit masks.
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+ // Load 4 pixels from each of 4 rows from each source
+ const __m128i s0a =  // _mm_set_epi64x(hi, lo): row 0 lands in the high 64 bits, row 1 in the low 64 bits
+ _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
+ const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),  // row 2 high, row 3 low
+ *(int64_t *)(src0 + 3 * src0_stride));
+ const __m128i s1a =
+ _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
+ const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 3 * src1_stride));
+
+ // Generate the inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply each mask by the respective source
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);  // upper 16 bits of each unsigned 16x16 product
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);  // lower 16 bits of each product
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);  // re-join halves into full 32-bit products
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);  // s0 * m + s1 * (max - m)
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =  // remove the offset, then arithmetic shift right
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);  // signed-saturating narrow 32 -> 16 bit, low lanes first
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+
+ const __m128i clipa =  // clamp to [clip_low, clip_high]
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
+ xx_storel_64(dst, _mm_srli_si128(clipa, 8));  // row 0 sits in the high half, so shift it down before storing
+ xx_storel_64(dst + dst_stride, clipa);  // row 1 sits in the low half
+ xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
+ xx_storel_64(dst + 3 * dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(  // 4-wide blend, full-resolution mask; processes 4 rows per iteration.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ do {
+ const __m128i mask0a8 =  // 4 mask bytes per row: row 1 in bits 0..31, row 0 in bits 32..63
+ _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
+ const __m128i mask0b8 =  // rows 2 and 3, same layout (row 3 lowest)
+ _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 3 * mask_stride));
+ const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);  // zero-extend: row 1 fills the low 64 bits, row 0 the high 64 bits
+ const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);  // terminates only if h is a positive multiple of 4
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(  // 4-wide blend, mask subsampled 2x both ways; 4 output rows (8 mask rows) per iteration.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
+ // Load 8 pixels from each of 8 rows of mask,
+ // (saturating) add together rows then use madd to add adjacent pixels
+ // Finally, divide each value by 4 (with rounding)
+ const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),  // mask row 0 in the high half, row 2 in the low half
+ *(int64_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),  // mask row 1 high, row 3 low
+ *(int64_t *)(mask + 3 * mask_stride));
+ const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);  // high half: rows 0+1 (output row 0); low half: rows 2+3 (output row 1)
+ const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);  // (sum of 4 + 2) >> 2
+ const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 7 * mask_stride));
+ const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
+ const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
+ &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;  // 8 mask rows consumed for 4 output rows
+ } while (h -= 4);  // terminates only if h is a positive multiple of 4
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1(  // Blends an 8-wide, 2-row tile of 16-bit samples using pre-widened 16-bit masks.
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *max_mask) {
+ // Load 8x pixels from each of 2 rows from each source
+ const __m128i s0a = xx_loadu_128(src0);  // "a" = first row
+ const __m128i s0b = xx_loadu_128(src0 + src0_stride);  // "b" = second row
+ const __m128i s1a = xx_loadu_128(src1);
+ const __m128i s1b = xx_loadu_128(src1 + src1_stride);
+
+ // Generate inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b);
+
+ // Multiply sources by respective masks
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);  // upper 16 bits of each unsigned 16x16 product
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);  // lower 16 bits of each product
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);  // re-join halves into full 32-bit products
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);  // s0 * m + s1 * (max - m)
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =  // remove the offset, then arithmetic shift right
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);  // signed-saturating narrow 32 -> 16 bit, low lanes first
+ const __m128i clipa =  // clamp to [clip_low, clip_high]
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
+ xx_storeu_128(dst, clipa);
+ xx_storeu_128(dst + dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(  // 8-wide blend, full-resolution mask; processes 2 rows per iteration.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ do {
+ const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));  // zero-extend the low 8 loaded mask bytes to 16 bit (first row)
+ const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));  // second row
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 2;
+ } while (h -= 2);  // terminates only if h is a positive even number
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(  // 8-wide blend, mask subsampled 2x both ways; 2 output rows (4 mask rows) per iteration.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
+ const __m128i mask_thisrowa = xx_loadu_128(mask);  // 16 mask bytes cover 8 output pixels
+ const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
+ const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);  // mask rows for the second output row
+ const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
+ const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);  // vertical pair sums (saturating byte add)
+ const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
+ const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);  // multiply by 1 and add adjacent bytes: horizontal pair sums
+ const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
+ const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);  // (sum of 4 + 2) >> 2
+ const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
+ &mask_sb, round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 4;  // 4 mask rows consumed for 2 output rows
+ } while (h -= 2);  // terminates only if h is a positive even number
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1(  // Blends one row of 16 samples; mask0l weights pixels 0..7, mask0h pixels 8..15.
+ uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *round_offset, int shift, const __m128i *mask0l,
+ const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+ // Load 16x u16 pixels for this row from each src
+ const __m128i s0l = xx_loadu_128(src0);
+ const __m128i s0h = xx_loadu_128(src0 + 8);
+ const __m128i s1l = xx_loadu_128(src1);
+ const __m128i s1h = xx_loadu_128(src1 + 8);
+
+ // Calculate inverse masks
+ const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
+ const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);
+
+ const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);  // upper 16 bits of each unsigned 16x16 product (pixels 8..15)
+ const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);  // lower 16 bits of each product
+ const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);  // re-join halves into full 32-bit products
+ const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);
+
+ const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
+ const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
+ const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
+ const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);
+
+ const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);  // s0 * m + s1 * (max - m), pixels 12..15
+ const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);  // pixels 8..11
+
+ const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);  // same computation for pixels 0..7
+ const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
+ const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
+ const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);
+
+ const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
+ const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
+ const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
+ const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);
+
+ const __m128i mullh = _mm_add_epi32(mul2h, mul3h);  // pixels 4..7
+ const __m128i mulll = _mm_add_epi32(mul2l, mul3l);  // pixels 0..3
+
+ const __m128i reshh =  // remove the offset, then arithmetic shift right
+ _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
+ const __m128i reshl =
+ _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
+ const __m128i reslh =
+ _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
+ const __m128i resll =
+ _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);
+
+ // Signed saturating pack from i32 to i16:
+ const __m128i packh = _mm_packs_epi32(reshl, reshh);
+ const __m128i packl = _mm_packs_epi32(resll, reslh);
+
+ // Clip the values to the valid range
+ const __m128i cliph =
+ _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
+ const __m128i clipl =
+ _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);
+
+ // Store 16 pixels
+ xx_storeu_128(dst, clipl);
+ xx_storeu_128(dst + 8, cliph);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(  // Blends h rows in 16-column steps; one mask byte per output pixel.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {  // always handles a full 16 columns, so w is expected to be a multiple of 16
+ // Load 16x u8 alpha-mask values and pad to u16
+ const __m128i masks_u8 = xx_loadu_128(mask + j);
+ const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);  // mask bytes 0..7
+ const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));  // mask bytes 8..15
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ }
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(  // Blends h rows in 16-column steps; each weight is the rounded mean of a 2x2 mask group.
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);  // 32 mask bytes from the current mask row cover 16 output pixels
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);  // same columns, next mask row
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);  // vertical pair sums (saturating byte add)
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);  // multiply by 1 and add adjacent bytes: horizontal pair sums
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);  // (sum of 4 + 2) >> 2, pixels 0..7
+ const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);  // pixels 8..15
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride * 2;  // two mask rows are consumed per output row
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_sse4_1(  // Entry point: computes rounding/clipping constants, then dispatches on (subw, subh) and width.
+ uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);  // kernels write uint16_t samples; the macro is defined outside this chunk
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int32_t round_offset =  // constant the kernels subtract from the weighted sum before shifting
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;  // total right shift applied after the subtraction
+
+ const __m128i clip_low = _mm_setzero_si128();
+ const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);  // largest value representable in bd bits
+ const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {  // mask has the same resolution as the output
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {  // mask subsampled 2x in both directions
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+ } else {
+ // Sub-sampling in only one axis doesn't seem to happen very much, so fall
+ // back to the vanilla C implementation instead of having all the optimised
+ // code for these.
+ aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,  // note: takes the original dst8, not the converted dst
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, conv_params, bd);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000000..75fb1c5a94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+// Blends a 4-pixel-wide block of 8-bit pixels one row at a time. Each row
+// consumes a single mask byte: src0 is weighted by *mask and src1 by
+// AOM_BLEND_A64_MAX_ALPHA - *mask. `w` is unused. The loop tests its exit
+// condition after the first row, so `h` must be at least 1.
+static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                                      const uint8_t *src0, uint32_t src0_stride,
+                                      const uint8_t *src1, uint32_t src1_stride,
+                                      const uint8_t *mask, int w, int h) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    // Broadcast this row's weight and its complement to every 16-bit lane.
+    const __m128i weight0 = _mm_set1_epi16(*mask);
+    const __m128i weight1 = _mm_sub_epi16(alpha_max, weight0);
+    const __m128i blended_w = blend_4(src0, src1, &weight0, &weight1);
+
+    // Narrow the 16-bit results to bytes with unsigned saturation and store.
+    xx_storel_32(dst, _mm_packus_epi16(blended_w, blended_w));
+
+    ++mask;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// Blends an 8-pixel-wide block of 8-bit pixels one row at a time. Each row
+// consumes a single mask byte: src0 is weighted by *mask and src1 by
+// AOM_BLEND_A64_MAX_ALPHA - *mask. `w` is unused. The loop tests its exit
+// condition after the first row, so `h` must be at least 1.
+static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                                      const uint8_t *src0, uint32_t src0_stride,
+                                      const uint8_t *src1, uint32_t src1_stride,
+                                      const uint8_t *mask, int w, int h) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    // Broadcast this row's weight and its complement to every 16-bit lane.
+    const __m128i weight0 = _mm_set1_epi16(*mask);
+    const __m128i weight1 = _mm_sub_epi16(alpha_max, weight0);
+    const __m128i blended_w = blend_8(src0, src1, &weight0, &weight1);
+
+    // Narrow the 16-bit results to bytes with unsigned saturation and store.
+    xx_storel_64(dst, _mm_packus_epi16(blended_w, blended_w));
+
+    ++mask;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// Blends a block of 8-bit pixels whose rows are walked in 16-pixel steps
+// (the column loop advances by 16 while the column index is below `w`).
+// One mask byte is consumed per row: src0 is weighted by *mask and src1 by
+// AOM_BLEND_A64_MAX_ALPHA - *mask. The row loop tests its exit condition
+// after the first row, so `h` must be at least 1.
+static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                                        const uint8_t *src0,
+                                        uint32_t src0_stride,
+                                        const uint8_t *src1,
+                                        uint32_t src1_stride,
+                                        const uint8_t *mask, int w, int h) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  do {
+    // The row weight is constant across all columns of the row.
+    const __m128i weight0 = _mm_set1_epi16(*mask);
+    const __m128i weight1 = _mm_sub_epi16(alpha_max, weight0);
+    int col = 0;
+
+    while (col < w) {
+      // Two 8-pixel halves are blended before anything is stored.
+      const __m128i lo_w = blend_8(src0 + col, src1 + col, &weight0, &weight1);
+      const __m128i hi_w =
+          blend_8(src0 + col + 8, src1 + col + 8, &weight0, &weight1);
+
+      xx_storeu_128(dst + col, _mm_packus_epi16(lo_w, hi_w));
+      col += 16;
+    }
+
+    ++mask;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 entry point for the 8-bit vertical-mask blend. Selects a kernel from
+// the low four bits of `w` and forwards every argument unchanged. Widths 1 and
+// 2 are routed to the C implementation; table slots that no power-of-two
+// width can select are NULL. The asserts spell out the caller contract:
+// in-place operation requires matching strides, and `w`/`h` are powers of two
+// that are at least 1.
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                                const uint8_t *src0, uint32_t src0_stride,
+                                const uint8_t *src1, uint32_t src1_stride,
+                                const uint8_t *mask, int w, int h) {
+  typedef void (*vmask_kernel)(uint8_t * dst, uint32_t dst_stride,
+                               const uint8_t *src0, uint32_t src0_stride,
+                               const uint8_t *src1, uint32_t src1_stride,
+                               const uint8_t *mask, int w, int h);
+
+  // Indexed by (w & 0xf).
+  static const vmask_kernel kernels[9] = {
+    /* 0: w % 16 == 0 */ blend_a64_vmask_w16n_sse4_1,
+    /* 1: w == 1      */ aom_blend_a64_vmask_c,
+    /* 2: w == 2      */ aom_blend_a64_vmask_c,
+    /* 3: invalid     */ NULL,
+    /* 4: w == 4      */ blend_a64_vmask_w4_sse4_1,
+    /* 5: invalid     */ NULL,
+    /* 6: invalid     */ NULL,
+    /* 7: invalid     */ NULL,
+    /* 8: w == 8      */ blend_a64_vmask_w8_sse4_1,
+  };
+  const int width_index = w & 0xf;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  kernels[width_index](dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                       mask, w, h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+// Shared row loop for 4-pixel-wide blocks of 16-bit pixels. `blend` is the
+// bit-depth specific kernel that turns one row of src0/src1 plus the two
+// weights into blended 16-bit lanes. One mask byte is consumed per row; the
+// loop tests its exit condition after the first row, so `h` must be >= 1.
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, blend_unit_fn blend) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  do {
+    // Row weight for src0 and its complement for src1.
+    const __m128i weight0 = _mm_set1_epi16(*mask);
+    const __m128i weight1 = _mm_sub_epi16(alpha_max, weight0);
+
+    xx_storel_64(dst, blend(src0, src1, weight0, weight1));
+
+    ++mask;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// 4-wide 16-bit vertical-mask blend bound to the blend_4_b10 kernel.
+// `w` exists only so the signature matches the dispatch table; it is unused.
+static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+                                          const uint16_t *src0,
+                                          uint32_t src0_stride,
+                                          const uint16_t *src1,
+                                          uint32_t src1_stride,
+                                          const uint8_t *mask, int w, int h) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride,    //
+                               src0, src0_stride,  //
+                               src1, src1_stride,  //
+                               mask, h, blend_4_b10);
+}
+
+// 4-wide 16-bit vertical-mask blend bound to the blend_4_b12 kernel.
+// `w` exists only so the signature matches the dispatch table; it is unused.
+static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+                                          const uint16_t *src0,
+                                          uint32_t src0_stride,
+                                          const uint16_t *src1,
+                                          uint32_t src1_stride,
+                                          const uint8_t *mask, int w, int h) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride,    //
+                               src0, src0_stride,  //
+                               src1, src1_stride,  //
+                               mask, h, blend_4_b12);
+}
+
+// Shared row/column loop for blocks of 16-bit pixels walked in 8-pixel steps
+// (the column loop advances by 8 while the column index is below `w`).
+// `blend` is the bit-depth specific kernel. One mask byte is consumed per
+// row; the row loop tests its exit condition after the first row, so `h`
+// must be at least 1.
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  do {
+    // The row weight is constant across all columns of the row.
+    const __m128i weight0 = _mm_set1_epi16(*mask);
+    const __m128i weight1 = _mm_sub_epi16(alpha_max, weight0);
+    int col = 0;
+
+    while (col < w) {
+      xx_storeu_128(dst + col, blend(src0 + col, src1 + col, weight0, weight1));
+      col += 8;
+    }
+
+    ++mask;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// 16-bit vertical-mask blend for rows walked in 8-pixel steps, bound to the
+// blend_8_b10 kernel.
+static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+                                           const uint16_t *src0,
+                                           uint32_t src0_stride,
+                                           const uint16_t *src1,
+                                           uint32_t src1_stride,
+                                           const uint8_t *mask, int w, int h) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride,    //
+                                src0, src0_stride,  //
+                                src1, src1_stride,  //
+                                mask, w, h, blend_8_b10);
+}
+
+// 16-bit vertical-mask blend for rows walked in 8-pixel steps, bound to the
+// blend_8_b12 kernel.
+static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+                                           const uint16_t *src0,
+                                           uint32_t src0_stride,
+                                           const uint16_t *src1,
+                                           uint32_t src1_stride,
+                                           const uint8_t *mask, int w, int h) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride,    //
+                                src0, src0_stride,  //
+                                src1, src1_stride,  //
+                                mask, w, h, blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+// SSE4.1 entry point for the high-bit-depth vertical-mask blend. When either
+// dimension has one of its two low bits set (i.e. is 1 or 2 given the
+// power-of-two contract) the C implementation is used. Otherwise the byte
+// pointers are converted with CONVERT_TO_SHORTPTR and a kernel is picked by
+// bit depth (12 vs. everything else) and by whether w == 4.
+void aom_highbd_blend_a64_vmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int w, int h, int bd) {
+  typedef void (*highbd_vmask_kernel)(uint16_t * dst, uint32_t dst_stride,
+                                      const uint16_t *src0,
+                                      uint32_t src0_stride,
+                                      const uint16_t *src1,
+                                      uint32_t src1_stride,
+                                      const uint8_t *mask, int w, int h);
+
+  // kernels[bd == 12][(w >> 2) & 1]: column 0 is the 8-pixel-step kernel,
+  // column 1 the 4-wide kernel.
+  static const highbd_vmask_kernel kernels[2][2] = {
+    { blend_a64_vmask_b10_w8n_sse4_1, blend_a64_vmask_b10_w4_sse4_1 },
+    { blend_a64_vmask_b12_w8n_sse4_1, blend_a64_vmask_b12_w4_sse4_1 },
+  };
+
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  // Small blocks (w <= 2 || h <= 2) are left to the C code.
+  if (UNLIKELY((h | w) & 3)) {
+    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                                 src1_stride, mask, w, h, bd);
+    return;
+  }
+
+  {
+    const int bd_index = (bd == 12);
+    const int width_index = (w >> 2) & 1;
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0_16 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1_16 = CONVERT_TO_SHORTPTR(src1_8);
+
+    kernels[bd_index][width_index](dst16, dst_stride, src0_16, src0_stride,
+                                   src1_16, src1_stride, mask, w, h);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 0000000000..c071fdcfc4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blends four CONV_BUF_TYPE samples from src0/src1 with per-pixel weights and
+// stores four 8-bit results. Per pixel it computes
+//   (src0 * m + src1 * (maxval - m) - round_offset) >> shift
+// in 32 bits, then narrows with signed (32->16) and unsigned (16->8)
+// saturation. Only the low four 16-bit lanes of *m are used.
+static INLINE void blend_a64_d16_mask_w4_sse41(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+    int shift) {
+  const __m128i inv_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i pix0 = xx_loadl_64(src0);
+  const __m128i pix1 = xx_loadl_64(src1);
+
+  // Interleave pixels and weights so one madd yields s0*m + s1*(max - m).
+  const __m128i pix_pairs = _mm_unpacklo_epi16(pix0, pix1);
+  const __m128i weight_pairs = _mm_unpacklo_epi16(*m, inv_m);
+  const __m128i weighted = _mm_madd_epi16(pix_pairs, weight_pairs);
+
+  const __m128i scaled =
+      _mm_srai_epi32(_mm_sub_epi32(weighted, *v_round_offset), shift);
+  const __m128i as_i16 = _mm_packs_epi32(scaled, scaled);
+
+  xx_storel_32(dst, _mm_packus_epi16(as_i16, as_i16));
+}
+
+// Blends eight CONV_BUF_TYPE samples from src0/src1 with per-pixel weights
+// and stores eight 8-bit results. Per pixel it computes
+//   (src0 * m + src1 * (maxval - m) - round_offset) >> shift
+// in 32 bits, then narrows with signed (32->16) and unsigned (16->8)
+// saturation.
+static INLINE void blend_a64_d16_mask_w8_sse41(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+    int shift) {
+  const __m128i inv_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i pix0 = xx_loadu_128(src0);
+  const __m128i pix1 = xx_loadu_128(src1);
+
+  // Low and high halves are handled separately because madd widens to 32 bits.
+  const __m128i weighted_lo = _mm_madd_epi16(_mm_unpacklo_epi16(pix0, pix1),
+                                             _mm_unpacklo_epi16(*m, inv_m));
+  const __m128i weighted_hi = _mm_madd_epi16(_mm_unpackhi_epi16(pix0, pix1),
+                                             _mm_unpackhi_epi16(*m, inv_m));
+
+  const __m128i scaled_lo =
+      _mm_srai_epi32(_mm_sub_epi32(weighted_lo, *v_round_offset), shift);
+  const __m128i scaled_hi =
+      _mm_srai_epi32(_mm_sub_epi32(weighted_hi, *v_round_offset), shift);
+
+  const __m128i as_i16 = _mm_packs_epi32(scaled_lo, scaled_hi);
+  const __m128i as_u8 = _mm_packus_epi16(as_i16, as_i16);
+
+  _mm_storel_epi64((__m128i *)(dst), as_u8);
+}
+
+// 4-wide d16 mask blend with a full-resolution mask: each output row reads
+// four mask bytes, widens them to 16 bits and blends one row through
+// blend_a64_d16_mask_w4_sse41. Runs `h` rows (none if h <= 0).
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i m = _mm_cvtepu8_epi16(xx_loadl_32(mask));
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride;
+  }
+}
+
+// 8-wide d16 mask blend with a full-resolution mask: each output row reads
+// eight mask bytes, widens them to 16 bits and blends one row through
+// blend_a64_d16_mask_w8_sse41. Runs `h` rows (none if h <= 0).
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i m = _mm_cvtepu8_epi16(xx_loadl_64(mask));
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride;
+  }
+}
+
+// 4-wide d16 mask blend with the mask subsampled in both directions. Each
+// output weight is the rounded average of a 2x2 group of mask bytes:
+// two mask rows are added, horizontally adjacent sums are paired, then
+// (sum + 2) >> 2. The mask pointer advances two rows per output row.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i ones_b = _mm_set1_epi8(1);
+  const __m128i round_w = _mm_set1_epi16(2);
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i mask_top = xx_loadl_64(mask);
+    const __m128i mask_bot = xx_loadl_64(mask + mask_stride);
+
+    // Vertical sum per column, then sum of each horizontal byte pair.
+    const __m128i col_sum = _mm_adds_epu8(mask_top, mask_bot);
+    const __m128i box_sum = _mm_maddubs_epi16(col_sum, ones_b);
+    const __m128i m = _mm_srli_epi16(_mm_add_epi16(box_sum, round_w), 2);
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride << 1;
+  }
+}
+
+// 8-wide d16 mask blend with the mask subsampled in both directions. Each
+// output weight is the rounded average of a 2x2 group of mask bytes:
+// two 16-byte mask rows are added, horizontally adjacent sums are paired,
+// then (sum + 2) >> 2. The mask pointer advances two rows per output row.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i ones_b = _mm_set1_epi8(1);
+  const __m128i round_w = _mm_set1_epi16(2);
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i mask_top = xx_loadu_128(mask);
+    const __m128i mask_bot = xx_loadu_128(mask + mask_stride);
+
+    // Vertical sum per column, then sum of each horizontal byte pair.
+    const __m128i col_sum = _mm_adds_epu8(mask_top, mask_bot);
+    const __m128i box_sum = _mm_maddubs_epi16(col_sum, ones_b);
+    const __m128i m = _mm_srli_epi16(_mm_add_epi16(box_sum, round_w), 2);
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride << 1;
+  }
+}
+
+// 4-wide d16 mask blend with the mask subsampled horizontally only. Each
+// output weight is the rounded average of two horizontally adjacent mask
+// bytes: the pair is summed with maddubs and halved with a rounding average
+// against zero.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i ones_b = _mm_set1_epi8(1);
+  const __m128i zero = _mm_setzero_si128();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i mask_row = xx_loadl_64(mask);
+    const __m128i pair_sum = _mm_maddubs_epi16(mask_row, ones_b);
+    const __m128i m = _mm_avg_epu16(pair_sum, zero);  // (sum + 1) >> 1
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride;
+  }
+}
+
+// 8-wide d16 mask blend with the mask subsampled horizontally only. Each
+// output weight is the rounded average of two horizontally adjacent mask
+// bytes: the pair is summed with maddubs and halved with a rounding average
+// against zero.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i ones_b = _mm_set1_epi8(1);
+  const __m128i zero = _mm_setzero_si128();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i mask_row = xx_loadu_128(mask);
+    const __m128i pair_sum = _mm_maddubs_epi16(mask_row, ones_b);
+    const __m128i m = _mm_avg_epu16(pair_sum, zero);  // (sum + 1) >> 1
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride;
+  }
+}
+// 4-wide d16 mask blend with the mask subsampled vertically only. Each
+// output weight is the rounded average of two vertically adjacent mask
+// bytes: the two rows are added and halved with a rounding average against
+// zero, then widened to 16 bits. The mask pointer advances two rows per
+// output row.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zeros = _mm_setzero_si128();
+  for (int i = 0; i < h; ++i) {
+    // A 4-wide block consumes exactly four mask bytes per row (the w4 kernel
+    // only reads the low four weight lanes), so load 32 bits rather than 64
+    // to avoid touching bytes beyond the row.
+    const __m128i m_i0 = xx_loadl_32(mask);
+    const __m128i m_i1 = xx_loadl_32(mask + mask_stride);
+    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+                                shift);
+    mask += mask_stride << 1;
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+  }
+}
+
+// 8-wide d16 mask blend with the mask subsampled vertically only. Each
+// output weight is the rounded average of two vertically adjacent mask
+// bytes: the two rows are added and halved with a rounding average against
+// zero, then widened to 16 bits. The mask pointer advances two rows per
+// output row.
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m128i *round_offset, int shift) {
+  const __m128i alpha_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i zero = _mm_setzero_si128();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    const __m128i mask_top = xx_loadl_64(mask);
+    const __m128i mask_bot = xx_loadl_64(mask + mask_stride);
+    const __m128i col_sum = _mm_adds_epu8(mask_top, mask_bot);
+    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(col_sum, zero));
+
+    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &alpha_max,
+                                shift);
+
+    src0 += src0_stride;
+    src1 += src1_stride;
+    dst += dst_stride;
+    mask += mask_stride << 1;
+  }
+}
+#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000000..8d9b325101
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = { /* Byte indices 0..15:
+   the even positions (0, 2, ..., 14) first, then the odd positions
+   (1, 3, ..., 15); the same 16-entry row is stored twice.
+   NOTE(review): presumably a byte-shuffle control that de-interleaves
+   even/odd bytes -- no user is visible in this header, confirm against the
+   callers. */
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+// Blends four 8-bit pixels: widens src0/src1 to 16 bits, forms
+// src0 * m0 + src1 * m1 per lane and passes the sum through
+// xx_roundn_epu16 with AOM_BLEND_A64_ROUND_BITS. Returns 16-bit lanes; the
+// caller is responsible for narrowing.
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
+  const __m128i pix0_w = _mm_cvtepu8_epi16(xx_loadl_32(src0));
+  const __m128i pix1_w = _mm_cvtepu8_epi16(xx_loadl_32(src1));
+
+  const __m128i term0 = _mm_mullo_epi16(pix0_w, *v_m0_w);
+  const __m128i term1 = _mm_mullo_epi16(pix1_w, *v_m1_w);
+
+  return xx_roundn_epu16(_mm_add_epi16(term0, term1), AOM_BLEND_A64_ROUND_BITS);
+}
+
+// Blends eight 8-bit pixels: widens src0/src1 to 16 bits, forms
+// src0 * m0 + src1 * m1 per lane and passes the sum through
+// xx_roundn_epu16 with AOM_BLEND_A64_ROUND_BITS. Returns 16-bit lanes; the
+// caller is responsible for narrowing.
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
+  const __m128i pix0_w = _mm_cvtepu8_epi16(xx_loadl_64(src0));
+  const __m128i pix1_w = _mm_cvtepu8_epi16(xx_loadl_64(src1));
+
+  const __m128i term0 = _mm_mullo_epi16(pix0_w, *v_m0_w);
+  const __m128i term1 = _mm_mullo_epi16(pix1_w, *v_m1_w);
+
+  return xx_roundn_epu16(_mm_add_epi16(term0, term1), AOM_BLEND_A64_ROUND_BITS);
+}
+
+// Blends four 8-bit pixels with byte-wide weights. Pixels and weights are
+// interleaved so a single maddubs yields src0 * m0 + src1 * m1 per pixel;
+// the result is scaled with mulhrs against *rounding and packed back to
+// bytes with unsigned saturation (duplicated in both halves of the return).
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
+  const __m128i pix0_b = xx_loadl_32(src0);
+  const __m128i pix1_b = xx_loadl_32(src1);
+
+  const __m128i pix_pairs = _mm_unpacklo_epi8(pix0_b, pix1_b);
+  const __m128i weight_pairs = _mm_unpacklo_epi8(*v_m0_b, *v_m1_b);
+  const __m128i weighted_w = _mm_maddubs_epi16(pix_pairs, weight_pairs);
+
+  const __m128i scaled_w = _mm_mulhrs_epi16(weighted_w, *rounding);
+  return _mm_packus_epi16(scaled_w, scaled_w);
+}
+
+// Blends eight 8-bit pixels with byte-wide weights. Pixels and weights are
+// interleaved so a single maddubs yields src0 * m0 + src1 * m1 per pixel;
+// the result is scaled with mulhrs against *rounding and packed back to
+// bytes with unsigned saturation (duplicated in both halves of the return).
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                 const __m128i *rounding) {
+  const __m128i pix0_b = xx_loadl_64(src0);
+  const __m128i pix1_b = xx_loadl_64(src1);
+
+  const __m128i pix_pairs = _mm_unpacklo_epi8(pix0_b, pix1_b);
+  const __m128i weight_pairs = _mm_unpacklo_epi8(*v_m0_b, *v_m1_b);
+  const __m128i weighted_w = _mm_maddubs_epi16(pix_pairs, weight_pairs);
+
+  const __m128i scaled_w = _mm_mulhrs_epi16(weighted_w, *rounding);
+  return _mm_packus_epi16(scaled_w, scaled_w);
+}
+
+// Blends sixteen 8-bit pixels with byte-wide weights. The low and high eight
+// pixels are interleaved with their weights separately, each half goes
+// through maddubs (src0 * m0 + src1 * m1) and mulhrs against *rounding, and
+// the two halves are packed back into sixteen bytes with unsigned
+// saturation.
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
+                                  const __m128i *rounding) {
+  const __m128i pix0_b = xx_loadu_128(src0);
+  const __m128i pix1_b = xx_loadu_128(src1);
+
+  const __m128i weighted_lo_w =
+      _mm_maddubs_epi16(_mm_unpacklo_epi8(pix0_b, pix1_b),
+                        _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+  const __m128i weighted_hi_w =
+      _mm_maddubs_epi16(_mm_unpackhi_epi8(pix0_b, pix1_b),
+                        _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+  const __m128i scaled_lo_w = _mm_mulhrs_epi16(weighted_lo_w, *rounding);
+  const __m128i scaled_hi_w = _mm_mulhrs_epi16(weighted_hi_w, *rounding);
+
+  return _mm_packus_epi16(scaled_lo_w, scaled_hi_w);
+}
+
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w); /* Common signature of the
+   16-bit blend kernels in this header (blend_4_b10, blend_8_b10,
+   blend_4_b12, blend_8_b12): two pixel pointers plus the 16-bit-lane
+   weights for src0 and src1, returning the blended 16-bit lanes. */
+
+// Blends four 16-bit pixels using 16-bit arithmetic only: forms
+// src0 * m0 + src1 * m1 per lane (low 16 bits of each product) and passes
+// the sum through xx_roundn_epu16 with AOM_BLEND_A64_ROUND_BITS.
+// NOTE(review): the "b10" name suggests inputs of at most 10 bits so the
+// 16-bit sum cannot wrap -- confirm against the dispatcher.
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i pix0_w = xx_loadl_64(src0);
+  const __m128i pix1_w = xx_loadl_64(src1);
+
+  const __m128i term0 = _mm_mullo_epi16(pix0_w, v_m0_w);
+  const __m128i term1 = _mm_mullo_epi16(pix1_w, v_m1_w);
+
+  return xx_roundn_epu16(_mm_add_epi16(term0, term1), AOM_BLEND_A64_ROUND_BITS);
+}
+
+// Blends eight 16-bit pixels using 16-bit arithmetic only: forms
+// src0 * m0 + src1 * m1 per lane (low 16 bits of each product) and passes
+// the sum through xx_roundn_epu16 with AOM_BLEND_A64_ROUND_BITS.
+// NOTE(review): the "b10" name suggests inputs of at most 10 bits so the
+// 16-bit sum cannot wrap -- confirm against the dispatcher.
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i pix0_w = xx_loadu_128(src0);
+  const __m128i pix1_w = xx_loadu_128(src1);
+
+  const __m128i term0 = _mm_mullo_epi16(pix0_w, v_m0_w);
+  const __m128i term1 = _mm_mullo_epi16(pix1_w, v_m1_w);
+
+  return xx_roundn_epu16(_mm_add_epi16(term0, term1), AOM_BLEND_A64_ROUND_BITS);
+}
+
+// Blends four 16-bit pixels with a 32-bit intermediate. Pixels and weights
+// are interleaved so madd produces src0 * m0 + src1 * m1 in 32 bits; the sum
+// is shifted right by AOM_BLEND_A64_ROUND_BITS - 1, packed to 16 bits with
+// signed saturation, and the remaining bit is handled by xx_round_epu16.
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i pix0_w = xx_loadl_64(src0);
+  const __m128i pix1_w = xx_loadl_64(src1);
+
+  // Pair up (m0, m1) and (s0, s1) so each 32-bit lane holds one pixel.
+  const __m128i weight_pairs = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i pix_pairs = _mm_unpacklo_epi16(pix0_w, pix1_w);
+
+  const __m128i weighted_d = _mm_madd_epi16(pix_pairs, weight_pairs);
+
+  // Leave one fractional bit for the final rounding step.
+  const __m128i prescaled_d =
+      _mm_srli_epi32(weighted_d, AOM_BLEND_A64_ROUND_BITS - 1);
+  const __m128i prescaled_w = _mm_packs_epi32(prescaled_d, prescaled_d);
+
+  return xx_round_epu16(prescaled_w);
+}
+
+// Blends eight 16-bit pixels with a 32-bit intermediate. Pixels and weights
+// are interleaved (low and high halves separately) so madd produces
+// src0 * m0 + src1 * m1 in 32 bits; each sum is shifted right by
+// AOM_BLEND_A64_ROUND_BITS - 1, the halves are packed to 16 bits with signed
+// saturation, and the remaining bit is handled by xx_round_epu16.
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i pix0_w = xx_loadu_128(src0);
+  const __m128i pix1_w = xx_loadu_128(src1);
+
+  // Low four pixels.
+  const __m128i weighted_lo_d =
+      _mm_madd_epi16(_mm_unpacklo_epi16(pix0_w, pix1_w),
+                     _mm_unpacklo_epi16(v_m0_w, v_m1_w));
+  // High four pixels.
+  const __m128i weighted_hi_d =
+      _mm_madd_epi16(_mm_unpackhi_epi16(pix0_w, pix1_w),
+                     _mm_unpackhi_epi16(v_m0_w, v_m1_w));
+
+  // Leave one fractional bit for the final rounding step.
+  const __m128i prescaled_lo_d =
+      _mm_srli_epi32(weighted_lo_d, AOM_BLEND_A64_ROUND_BITS - 1);
+  const __m128i prescaled_hi_d =
+      _mm_srli_epi32(weighted_hi_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+  const __m128i prescaled_w = _mm_packs_epi32(prescaled_lo_d, prescaled_hi_d);
+
+  return xx_round_epu16(prescaled_w);
+}
+
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
new file mode 100644
index 0000000000..fdf7de3f4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Horizontally reduces the two accumulators and ADDS the totals to the
+// outputs: `regx_sum` holds eight 32-bit partial sums added into *x_sum,
+// `regx2_sum` holds four 64-bit partial sums of squares added into *x2_sum.
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+                                      int *x_sum, int64_t *x2_sum) {
+  // Fold the upper 128-bit lane onto the lower one, then fold within the
+  // lane until element 0 holds the total.
+  const __m256i sum_swapped = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+  __m256i sum_d = _mm256_add_epi32(sum_swapped, regx_sum);
+  sum_d = _mm256_add_epi32(sum_d, _mm256_srli_si256(sum_d, 8));
+  sum_d = _mm256_add_epi32(sum_d, _mm256_srli_si256(sum_d, 4));
+
+  // Same reduction for the 64-bit squares (one fewer step).
+  const __m256i sq_swapped =
+      _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+  __m256i sq_q = _mm256_add_epi64(sq_swapped, regx2_sum);
+  sq_q = _mm256_add_epi64(sq_q, _mm256_srli_si256(sq_q, 8));
+
+  *x_sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_d));
+
+  {
+    const __m128i sq_low = _mm256_castsi256_si128(sq_q);
+#if AOM_ARCH_X86_64
+    *x2_sum += _mm_cvtsi128_si64(sq_low);
+#else
+    // No 64-bit register move on 32-bit targets: go through memory.
+    int64_t sq_total;
+    _mm_storel_epi64((__m128i *)&sq_total, sq_low);
+    *x2_sum += sq_total;
+#endif
+  }
+}
+
+// Adds the sum and the sum of squares of a 4-wide block of int16 values to
+// *x_sum / *x2_sum. Four rows are packed into one 256-bit register per
+// iteration, so bh >> 2 groups are processed.
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const __m256i ones_w = _mm256_set1_epi16(1);
+  const __m256i zero = _mm256_setzero_si256();
+  const int16_t *src = data;
+  __m256i sum_d = zero;
+  __m256i sq_d = zero;
+
+  for (int groups_left = bh >> 2; groups_left > 0; --groups_left) {
+    // Rows 0/1 fill the low 128 bits, rows 2/3 the high 128 bits.
+    const __m128i r0 = _mm_loadl_epi64((__m128i const *)(src));
+    const __m128i r1 = _mm_loadl_epi64((__m128i const *)(src + stride));
+    const __m128i r2 = _mm_loadl_epi64((__m128i const *)(src + 2 * stride));
+    const __m128i r3 = _mm_loadl_epi64((__m128i const *)(src + 3 * stride));
+    const __m128i rows01 = _mm_unpacklo_epi64(r0, r1);
+    const __m128i rows23 = _mm_unpacklo_epi64(r2, r3);
+    const __m256i pixels =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(rows01), rows23, 1);
+
+    // madd against ones sums adjacent pairs; madd against itself sums their
+    // squares.
+    sum_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, ones_w), sum_d);
+    sq_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, pixels), sq_d);
+    src += 4 * stride;
+  }
+
+  // Zero-extend the 32-bit square sums to 64 bits before the final reduction
+  // so that it cannot overflow.
+  {
+    const __m256i sq_lo_q = _mm256_unpacklo_epi32(sq_d, zero);
+    const __m256i sq_hi_q = _mm256_unpackhi_epi32(sq_d, zero);
+    const __m256i sq_q = _mm256_add_epi64(sq_lo_q, sq_hi_q);
+
+    accumulate_sse_sum(sum_d, sq_q, x_sum, x2_sum);
+  }
+}
+
+// Adds the sum and the sum of squares of an 8-wide block of int16 values to
+// *x_sum / *x2_sum. Two rows are packed into one 256-bit register per
+// iteration, so bh >> 1 row pairs are processed.
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const __m256i ones_w = _mm256_set1_epi16(1);
+  const __m256i zero = _mm256_setzero_si256();
+  const int16_t *src = data;
+  __m256i sum_d = zero;
+  __m256i sq_d = zero;
+
+  for (int pairs_left = bh >> 1; pairs_left > 0; --pairs_left) {
+    // First row in the low 128 bits, next row in the high 128 bits.
+    const __m128i row_a = _mm_loadu_si128((__m128i const *)(src));
+    const __m128i row_b = _mm_loadu_si128((__m128i const *)(src + stride));
+    const __m256i pixels =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(row_a), row_b, 1);
+
+    sum_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, ones_w), sum_d);
+    sq_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, pixels), sq_d);
+    src += 2 * stride;
+  }
+
+  // Zero-extend the 32-bit square sums to 64 bits before the final reduction.
+  {
+    const __m256i sq_lo_q = _mm256_unpacklo_epi32(sq_d, zero);
+    const __m256i sq_hi_q = _mm256_unpackhi_epi32(sq_d, zero);
+    const __m256i sq_q = _mm256_add_epi64(sq_lo_q, sq_hi_q);
+
+    accumulate_sse_sum(sum_d, sq_q, x_sum, x2_sum);
+  }
+}
+
+// Adds the sum and the sum of squares of a block of int16 values to
+// *x_sum / *x2_sum. The block is processed as `loop_count` vertical strips
+// of 16 columns, each `bh` rows tall. Squares are accumulated in 32-bit
+// lanes across ALL strips and rows of one call and only widened to 64 bits
+// at the end, so the caller must keep loop_count * bh small enough.
+static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh,
+                                     int *x_sum, int64_t *x2_sum,
+                                     int loop_count) {
+  const __m256i ones_w = _mm256_set1_epi16(1);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sum_d = zero;
+  __m256i sq_d = zero;
+
+  for (int strip = 0; strip < loop_count; ++strip) {
+    const int16_t *src = data + 16 * strip;
+
+    for (int rows_left = bh; rows_left > 0; --rows_left) {
+      const __m256i pixels = _mm256_lddqu_si256((__m256i const *)(src));
+
+      sum_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, ones_w), sum_d);
+      sq_d = _mm256_add_epi32(_mm256_madd_epi16(pixels, pixels), sq_d);
+      src += stride;
+    }
+  }
+
+  // Zero-extend the 32-bit square sums to 64 bits before the final reduction.
+  {
+    const __m256i sq_lo_q = _mm256_unpacklo_epi32(sq_d, zero);
+    const __m256i sq_hi_q = _mm256_unpackhi_epi32(sq_d, zero);
+    const __m256i sq_q = _mm256_add_epi64(sq_lo_q, sq_hi_q);
+
+    accumulate_sse_sum(sum_d, sq_q, x_sum, x2_sum);
+  }
+}
+
+// Computes the sum (*x_sum) and the sum of squares (*x2_sum) of a bw x bh
+// block of int16 values. Both outputs are zeroed first. Widths 4, 8, 16, 32
+// and 64 with a height that is a multiple of 4 use the AVX2 kernels; every
+// other shape is forwarded to aom_get_blk_sse_sum_c.
+void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) != 0) {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    return;
+  }
+
+  switch (bw) {
+    // For smaller block widths, compute multiple rows simultaneously.
+    case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break;
+    case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break;
+    case 16:
+    case 32:
+    case 64: {
+      // sse_sum_wd16_avx2 keeps the squares in 32-bit lanes for the whole
+      // call; each lane receives one madd result per strip per row. The
+      // 64-wide case was already limited to 32 rows per call (4 strips x 32
+      // rows = 128 accumulations), so apply that same bound to every width
+      // and walk the block in row chunks. Heights up to 64 produce exactly
+      // the calls made before; taller blocks are no longer truncated
+      // (64-wide) or at risk of lane overflow (32-wide).
+      const int strips = bw >> 4;
+      const int max_rows = 128 / strips;  // 128, 64 or 32 rows per call
+      for (int row = 0; row < bh; row += max_rows) {
+        const int rows = (bh - row < max_rows) ? (bh - row) : max_rows;
+        sse_sum_wd16_avx2(data + row * stride, stride, rows, x_sum, x2_sum,
+                          strips);
+      }
+      break;
+    }
+
+    default:
+      aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+      break;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
new file mode 100644
index 0000000000..bf89427872
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Accumulates the sum and the sum of squares of a 4-wide, bh-high block of
+// int16_t samples.
+//
+//   data    first sample of the block
+//   stride  distance between consecutive rows, in elements
+//   bh      block height; rows are consumed in pairs, so bh >> 1 iterations
+//           are executed
+//   x_sum   the block sum is added to *x_sum
+//   x2_sum  the block sum of squares is added to *x2_sum
+//
+// Both outputs are accumulated into (not overwritten), matching
+// sse_sum_wd8_sse2(); the caller is expected to initialise them.
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const int16_t *data_tmp = data;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows (8 pixels) at a time.
+    load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+    // madd against 1 adds adjacent samples; madd against itself adds
+    // adjacent squares. Both yield four 32-bit partial results.
+    sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+    sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+    regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+    regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+    data_tmp += 2 * stride;
+  }
+
+  // Fold the four 32-bit partial sums into lane 0.
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum += _mm_cvtsi128_si32(regx_sum);
+  // Zero-extend the 32-bit squared partial sums to 64 bits before the final
+  // horizontal add.
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if AOM_ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    // _mm_cvtsi128_si64 is not used on this path; store the low 64 bits
+    // through memory instead.
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+// Accumulates the sum and the sum of squares of a block that is
+// (8 * loop_cycles) samples wide and bh rows high.
+//
+//   data         first sample of the block
+//   stride       distance between consecutive rows, in elements
+//   bh           number of rows
+//   x_sum        the block sum is added to *x_sum
+//   x2_sum       the block sum of squares is added to *x2_sum
+//   loop_cycles  number of 8-sample-wide column strips to process
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum,
+                                    int loop_cycles) {
+  const __m128i ones = _mm_set1_epi16(1);
+  __m128i sum32 = _mm_setzero_si128();
+  __m128i sse32 = sum32;
+
+  // Walk the block one 8-wide strip at a time, top to bottom.
+  for (int strip = 0; strip < loop_cycles; ++strip) {
+    const int16_t *row = data + (8 * strip);
+    for (int y = 0; y < bh; ++y, row += stride) {
+      // One row of the strip: 8 samples.
+      const __m128i px = _mm_loadu_si128((__m128i const *)(row));
+      sum32 = _mm_add_epi32(_mm_madd_epi16(px, ones), sum32);
+      sse32 = _mm_add_epi32(_mm_madd_epi16(px, px), sse32);
+    }
+  }
+
+  // Horizontal reduction of the four 32-bit sums.
+  sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 8));
+  sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 4));
+  *x_sum += _mm_cvtsi128_si32(sum32);
+
+  // Widen the squared sums to 64 bits, then reduce to a single value.
+  __m128i sse64 = _mm_add_epi64(_mm_unpacklo_epi32(sse32, _mm_setzero_si128()),
+                                _mm_unpackhi_epi32(sse32, _mm_setzero_si128()));
+  sse64 = _mm_add_epi64(sse64, _mm_srli_si128(sse64, 8));
+#if AOM_ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(sse64);
+#else
+  {
+    int64_t low64;
+    _mm_storel_epi64((__m128i *)&low64, sse64);
+    *x2_sum += low64;
+  }
+#endif
+}
+
+// SSE2 version of aom_get_blk_sse_sum_c().
+//
+// Writes the sum of the samples to *x_sum and the sum of their squares to
+// *x2_sum for the bw x bh block of int16_t values starting at 'data', where
+// consecutive rows are 'stride' elements apart. Both outputs are reset to
+// zero before accumulation.
+//
+// The vector kernels are used only when bh is a multiple of 4 and bw is one
+// of 4, 8, 16, 32 or 64; every other size is forwarded to the C version.
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) == 0) {
+    switch (bw) {
+      case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+      case 8:
+      case 16:
+        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+        break;
+      // For widths 32 and 64, the registers may overflow. So compute a
+      // limited number of rows at a time: at most 32 rows per call for
+      // width 32 and at most 16 rows per call for width 64. The last chunk
+      // is clamped to the rows that remain, so any bh that is a multiple of
+      // 4 is covered exactly.
+      case 32:
+      case 64: {
+        const int max_rows = (bw == 32) ? 32 : 16;
+        for (int row = 0; row < bh; row += max_rows) {
+          const int rows_left = bh - row;
+          const int rows = (rows_left < max_rows) ? rows_left : max_rows;
+          sse_sum_wd8_sse2(data + row * stride, stride, rows, x_sum, x2_sum,
+                           bw >> 3);
+        }
+        break;
+      }
+
+      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    }
+  } else {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 0000000000..96fe4ebb67
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Transposes a 16x16 matrix of 16-bit elements held in in[0..15] (one row
+// per 256-bit register) and writes the transposed rows to out[0..15].
+//
+// The transpose runs in four stages: 16-bit, 32-bit and 64-bit interleaves
+// that operate inside each 128-bit lane, followed by a 128-bit lane permute
+// that combines the upper and lower eight rows. In the layout comments below
+// each two-digit entry is <source row><source column> in hexadecimal.
+//
+// Note: in and out could have the same value (every element of in[] is read
+// into a temporary before the first write to out[]).
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // Contents of tr0_0 .. tr0_f after the 16-bit interleave:
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // Contents of tr1_0 .. tr1_f after the 32-bit interleave:
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // Contents of tr0_0 .. tr0_f after the 64-bit interleave:
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ // Selector 0x20 joins the low 128-bit lanes of the two sources (output
+ // rows 0-7); selector 0x31 joins their high lanes (output rows 8-15).
+ out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
new file mode 100644
index 0000000000..4ca214f469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// Function type taking a source pointer and pitch, an output pointer and
+// pitch, an output height and a pointer to filter coefficients.
+// NOTE(review): presumably the type of the aom_filter_block1d* kernels that
+// FUN_CONV_1D calls; their declarations are not part of this header.
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
+// Defines the function aom_convolve8_<name>_<opt>().
+//
+// Macro arguments:
+//   name       pasted into the generated function name and into the name of
+//              the C fallback, aom_convolve8_<name>_c
+//   step_q4    expression asserted to equal 16 inside the generated function
+//   filter     expression yielding the filter coefficients; elements [0]..[7]
+//              are inspected to pick a kernel family
+//   dir, avg, opt
+//              pasted into the kernel names
+//              aom_filter_block1d{16,8,4}_<dir>{4,8,2}_<avg><opt>
+//   src_start  source pointer expression passed to the 4- and 8-tap kernels
+//              (the 2-tap kernels receive 'src' itself)
+//
+// Kernel selection:
+//   - elements 0, 1, 6 and 7 all zero and (filter[2] | filter[5]) non-zero:
+//     the "4" kernels;
+//   - otherwise, (filter[0] | filter[1] | filter[2]) non-zero: the "8"
+//     kernels;
+//   - otherwise: the "2" kernels.
+//
+// The width is consumed in columns of 16, then 8, then 4 pixels, advancing
+// src and dst after each column. Any width left over (w != 0) is handed to
+// aom_convolve8_<name>_c with the advanced src/dst pointers.
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of filter8_1dfunction: operates on uint16_t
+// samples and takes an additional 'bd' argument.
+// NOTE(review): presumably the type of the aom_highbd_filter_block1d*
+// kernels that HIGH_FUN_CONV_1D calls; their declarations are not part of
+// this header.
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+// Defines the function aom_highbd_convolve8_<name>_<opt>().
+//
+// The macro arguments have the same roles as in FUN_CONV_1D; the kernels
+// called are aom_highbd_filter_block1d{16,8,4}_<dir>{4,8,2}_<avg><opt> and
+// receive 'bd' as an extra trailing argument. src8 and dst8 are converted
+// with CONVERT_TO_SHORTPTR before use.
+//
+// Unlike FUN_CONV_1D there are no asserts: the vector kernels run only when
+// step_q4 == 16 and filter[3] != 128. In every other case w is left
+// untouched, so the whole block goes to aom_highbd_convolve8_<name>_c. The
+// same fallback also processes any width left over after the 16/8/4-pixel
+// columns.
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ aom_highbd_convolve8_##name##_c( \
+ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..f5a382ce4e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+// filters for 16
+//
+// Byte-index tables, each aligned to 32 bytes. Every 32-byte row stores the
+// same 16 indices twice, once per 128-bit half of a 256-bit register.
+// NOTE(review): presumably consumed as byte-shuffle control masks by the
+// convolve_lowbd_x* helpers; confirm against their definitions.
+//
+// filt_global_avx2 holds four such rows back to back: overlapping byte pairs
+// (k, k + 1) starting at k = 0, 2, 4 and 6 respectively.
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// Two rows of overlapping byte quadruples (k, k + 1, k + 2, k + 3): the first
+// row starts at k = 0..3, the second at k = 4..7.
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+// One row of overlapping byte quadruples starting at k = 2..5.
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
+// Bytes 3..10 in the even positions, 255 in the odd positions.
+// NOTE(review): as a byte-shuffle mask, 255 (high bit set) would produce a
+// zero byte, widening bytes 3..10 to 16 bits; confirm at the use site.
+DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
+};
+
+// filt1..filt4_global_avx2 hold the same values as rows 0..3 of
+// filt_global_avx2, as separate 32-byte tables.
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+ 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// Horizontal pass using convolve_lowbd_x_4tap(). Statement macro: it expands
+// in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, im_h, src_ptr, src_stride, coeffs_h, filt,
+// round_const_h, round_shift_h, im_block, im_stride.
+// Names it declares in the enclosing scope: data_1, res.
+//
+// The loop handles two source rows per iteration while i < im_h - 2: 16
+// bytes of row i go into the low 128-bit half and 16 bytes of row i + 1 into
+// the high half. The filter is applied with coeffs_h + 1, round_const_h is
+// added, the result is arithmetically shifted right by round_shift_h and
+// stored with an aligned 256-bit store at im_block[i * im_stride], which
+// therefore has to be 32-byte aligned. After the loop the row at the final
+// value of i is processed on its own: only the low half of data_1 is loaded,
+// yet a full 256-bit store is still issued.
+#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+// Vertical pass using convolve_4tap() with coeffs_v + 1. Statement macro: it
+// expands in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, w, h, im_block, im_stride, coeffs_v,
+// sum_round_v, sum_shift_v, round_const_v, round_shift_v, dst, dst_stride.
+// Names it declares in the enclosing scope: s, src_0 .. src_3.
+//
+// Rows 0..3 of im_block are loaded up front and interleaved pairwise into
+// s[0..1] (low halves) and s[3..4] (high halves). Each loop iteration loads
+// rows i + 4 and i + 5 into s[2] / s[5] and filters the low and high halves
+// separately. Two round-and-shift steps follow (sum_round_v / sum_shift_v,
+// then round_const_v / round_shift_v), then a saturating pack to 16 and to
+// 8 bits, and finally a write of two output rows (i and i + 1). The
+// per-row store is 8 bytes when w - j > 4, 4 bytes when w == 4 and 2 bytes
+// otherwise. The s[] window then slides down by one row pair.
+#define CONVOLVE_SR_VERTICAL_FILTER_4TAP \
+ __m256i s[6]; \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ s[5] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1); \
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+// Horizontal pass using convolve_lowbd_x_6tap() with coeffs_h. Statement
+// macro: it expands in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, im_h, src_ptr, src_stride, coeffs_h, filt,
+// round_const_h, round_shift_h, im_block, im_stride.
+// Names it declares in the enclosing scope: data_1, res.
+//
+// Same structure as CONVOLVE_SR_HORIZONTAL_FILTER_4TAP: two source rows per
+// iteration while i < im_h - 2 (row i in the low 128-bit half, row i + 1 in
+// the high half), then one trailing row at the final value of i. Every
+// result is rounded with round_const_h / round_shift_h and written with an
+// aligned 256-bit store to im_block[i * im_stride].
+#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+// Vertical pass using convolve_6tap() with coeffs_v. Statement macro: it
+// expands in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, w, h, im_block, im_stride, coeffs_v,
+// sum_round_v, sum_shift_v, round_const_v, round_shift_v, dst, dst_stride.
+// Names it declares in the enclosing scope: s, src_0 .. src_3.
+//
+// Rows 0..3 of im_block are loaded up front and interleaved pairwise into
+// s[0..1] (low halves) and s[3..4] (high halves). Each loop iteration loads
+// rows i + 4 and i + 5 into s[2] / s[5] and filters s[0..2] and s[3..5].
+// Two round-and-shift steps follow, then a saturating pack to 16 and to 8
+// bits, and a write of two output rows (8, 4 or 2 bytes per row depending
+// on w and j). s[] is declared with 8 elements but only s[0..5] are used
+// here.
+#define CONVOLVE_SR_VERTICAL_FILTER_6TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ \
+ s[2] = _mm256_unpacklo_epi16(s6, s7); \
+ s[5] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_6tap(s, coeffs_v); \
+ __m256i res_b = convolve_6tap(s + 3, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+// Horizontal pass using convolve_lowbd_x() with coeffs_h. Statement macro:
+// it expands in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, im_h, src_ptr, src_stride, coeffs_h, filt,
+// round_const_h, round_shift_h, im_block, im_stride.
+// Names it declares in the enclosing scope: data_1, res.
+//
+// Same structure as the 4- and 6-tap horizontal macros: two source rows per
+// iteration while i < im_h - 2 (row i in the low 128-bit half, row i + 1 in
+// the high half), then one trailing row at the final value of i. Every
+// result is rounded with round_const_h / round_shift_h and written with an
+// aligned 256-bit store to im_block[i * im_stride].
+#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+// Vertical pass using convolve() with coeffs_v. Statement macro: it expands
+// in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, w, h, im_block, im_stride, coeffs_v,
+// sum_round_v, sum_shift_v, round_const_v, round_shift_v, dst, dst_stride.
+// Names it declares in the enclosing scope: s, src_0 .. src_5.
+//
+// Rows 0..5 of im_block are loaded up front and interleaved pairwise into
+// s[0..2] (low halves) and s[4..6] (high halves). Each loop iteration loads
+// rows i + 6 and i + 7 into s[3] / s[7] and filters s[0..3] and s[4..7].
+// Two round-and-shift steps follow, then a saturating pack to 16 and to 8
+// bits, and a write of two output rows (8, 4 or 2 bytes per row depending
+// on w and j). The s[] window then slides down by one row pair.
+#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve(s, coeffs_v); \
+ __m256i res_b = convolve(s + 4, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ }
+
+// Horizontal pass using convolve_12taps() with coeffs_h. Statement macro: it
+// expands in the caller's scope and is not wrapped in braces.
+//
+// Names expected in scope: i, j, w, im_h, src_ptr, src_stride, coeffs_h,
+// round_const_h12, round_shift_h12, im_block, im_stride.
+// Names it declares in the enclosing scope: v_zero, s (12 elements, of which
+// s[0..5] are written here).
+//
+// In both branches the source bytes are zero-extended to 16 bits, each
+// 16-bit value is duplicated by unpacking a register with itself, and six
+// shifted views are built with _mm256_alignr_epi8 before filtering. The
+// 32-bit results are rounded with round_const_h12 / round_shift_h12 and
+// packed to 16 bits with signed saturation.
+//
+//   w <= 4: two rows per iteration (i advances by 2 while i < im_h). Row i
+//           occupies the low 128-bit lane and row i + 1 the high lane. Per
+//           row, four 16-bit results are stored when w > 2 and two
+//           otherwise.
+//   w > 4:  one row per iteration. The low lane holds the 16 bytes at
+//           offset j and the high lane the 16 bytes at offset j + 4. The
+//           low 64 bits of both lanes are gathered with
+//           _mm256_permute4x64_epi64(..., 0x88) and eight 16-bit results
+//           are written with an aligned 128-bit store, so
+//           im_block + i * im_stride has to be 16-byte aligned.
+#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \
+ const __m256i v_zero = _mm256_setzero_si256(); \
+ __m256i s[12]; \
+ if (w <= 4) { \
+ for (i = 0; i < im_h; i += 2) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256(_mm_loadu_si128( \
+ (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \
+ if (w > 2) { \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \
+ res_1); \
+ } else { \
+ uint32_t horiz_2; \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \
+ im_block[i * im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \
+ im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ } \
+ } \
+ } else { \
+ for (i = 0; i < im_h; i++) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], \
+ _mm256_extracti128_si256( \
+ _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \
+ } \
+ }
+/* Vertical 12-tap filter pass: filters the 16-bit rows of im_block with
+ * convolve_12taps() and writes two rows of 8-bit results to dst per
+ * iteration.
+ *
+ * The macro expands in place and relies on these names from the enclosing
+ * scope: s (at least 12 __m256i entries), im_block, im_stride, i, j, w, h,
+ * coeffs_v, sum_round_v, sum_shift_v, round_const_v, round_shift_v, dst and
+ * dst_stride.
+ *
+ * s[0..5] hold row pairs interleaved from the low half of each lane and
+ * s[6..11] the matching high halves. After every iteration the window is
+ * moved down by one entry. The low 128-bit lane of the packed result goes to
+ * dst row i and the high lane to dst row i + 1; 8, 4 or 2 bytes are written
+ * per row depending on w and j.
+ *
+ * NOTE(review): every 256-bit load starts at a single im_block row, so the
+ * high lane presumably picks up the following row (im_stride of 8 int16
+ * entries) - confirm against the caller.
+ */
+#define CONVOLVE_SR_VERTICAL_FILTER_12TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \
+ __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \
+ __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \
+ __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ s[3] = _mm256_unpacklo_epi16(src_6, src_7); \
+ s[4] = _mm256_unpacklo_epi16(src_8, src_9); \
+ \
+ s[6] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[7] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[8] = _mm256_unpackhi_epi16(src_4, src_5); \
+ s[9] = _mm256_unpackhi_epi16(src_6, src_7); \
+ s[10] = _mm256_unpackhi_epi16(src_8, src_9); \
+ /* each iteration writes dst rows i and i + 1 */ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
+ \
+ s[5] = _mm256_unpacklo_epi16(s6, s7); \
+ s[11] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_12taps(s, coeffs_v); \
+ __m256i res_b = convolve_12taps(s + 6, coeffs_v); \
+ /* first rounding stage */ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ /* second rounding stage */ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ /* saturate to signed 16 bits, then to unsigned 8 bits */ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ /* slide the tap window down for the next pair of output rows */ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ \
+ s[6] = s[7]; \
+ s[7] = s[8]; \
+ s[8] = s[9]; \
+ s[9] = s[10]; \
+ s[10] = s[11]; \
+ }
+/* Horizontal 8-tap filter pass: filters two source rows per iteration (one
+ * per 128-bit lane) with convolve_lowbd_x() and writes the rounded 16-bit
+ * results to im_block with an aligned 256-bit store.
+ *
+ * The macro expands in place and relies on these names from the enclosing
+ * scope: i, im_h, src_h, src_stride, coeffs_x, filt, round_const_h,
+ * round_shift_h, im_block and im_stride. src_h is advanced by two rows per
+ * iteration.
+ *
+ * The second row is loaded only when i + 1 < im_h. Otherwise the high lane
+ * keeps whatever _mm256_castsi128_si256() left there, and all 32 bytes are
+ * still stored.
+ */
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \
+ do { \
+ for (i = 0; i < im_h; i += 2) { \
+ __m256i data = \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \
+ if (i + 1 < im_h) \
+ data = _mm256_inserti128_si256( \
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
+ src_h += (src_stride << 1); \
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \
+ /* 16-bit rounding: add round_const_h, arithmetic shift right */ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
+ round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ } while (0)
+/* Vertical 8-tap filter pass producing two output rows per iteration.
+ *
+ * The macro expands in place and relies on these names from the enclosing
+ * scope: im_block, im_stride, i, j, w, h, coeffs_y, round_const_v,
+ * round_shift_v, offset_const, do_average, dst, dst_stride, dst0,
+ * dst_stride0, wt, use_dist_wtd_comp_avg, rounding_const and
+ * rounding_shift. The tap window s[8] is declared inside the macro.
+ *
+ * When w - j > 4 both halves of each lane are filtered (res_a and res_b);
+ * otherwise only res_a is computed and packed with itself.
+ *
+ * do_average != 0: the two rows loaded from dst are combined with the new
+ *   result by comp_avg(), rounded by convolve_rounding(), packed to 8 bits
+ *   and written to dst0 (8 bytes per row, or 4 bytes on the narrow path).
+ * do_average == 0: the offset 16-bit result is written to dst with aligned
+ *   128-bit stores on both paths.
+ */
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \
+ do { \
+ __m256i s[8]; \
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(s0, s1); \
+ s[1] = _mm256_unpacklo_epi16(s2, s3); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(s0, s1); \
+ s[5] = _mm256_unpackhi_epi16(s2, s3); \
+ s[6] = _mm256_unpackhi_epi16(s4, s5); \
+ /* each iteration produces output rows i and i + 1 */ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = \
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = \
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ const __m256i res_a = convolve(s, coeffs_y); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ /* wide path: the high halves are filtered as well */ \
+ if (w - j > 4) { \
+ const __m256i res_b = convolve(s + 4, coeffs_y); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \
+ _mm_storel_epi64( \
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } else { \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \
+ _mm_cvtsi128_si32(res_1); \
+ \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } \
+ /* slide the tap window down for the next pair of output rows */ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ } \
+ } while (0)
+
+// Prepares the packed 8-bit coefficient pairs consumed by convolve_lowbd().
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK. Every tap is halved first; coeffs[k] (k = 0..3)
+// then receives the low bytes of taps 2k and 2k + 1, repeated in every
+// 16-bit lane. coeffs must have room for 4 entries.
+static INLINE void prepare_coeffs_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  // The load only reads the kernel, so the pointer stays const-qualified.
+  const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result. Since all filter coefficients are even, this change will not
+  // affect the end result.
+  // _mm_test_all_zeros(a, mask) is 1 iff (a & mask) == 0, i.e. iff no tap has
+  // its lowest bit set.
+  assert(_mm_test_all_zeros(coeffs_8, _mm_set1_epi16(1)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // Each control word selects source bytes (4k, 4k + 2): the low bytes of two
+  // neighbouring 16-bit taps.
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
+
+// Prepares the packed 8-bit coefficient pairs consumed by
+// convolve_lowbd_6tap(), using taps 1..6 of the selected kernel.
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK. Every tap is halved first; coeffs[0..2] then
+// receive the low bytes of taps (1, 2), (3, 4) and (5, 6), repeated in every
+// 16-bit lane. Only three entries are written; a fourth entry, if the caller
+// provides one, is left untouched.
+static INLINE void prepare_coeffs_6t_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [3] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  // The load only reads the kernel, so the pointer stays const-qualified.
+  const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result. Since all filter coefficients are even, this change will not
+  // affect the end result.
+  // _mm_test_all_zeros(a, mask) is 1 iff (a & mask) == 0, i.e. iff no tap has
+  // its lowest bit set.
+  assert(_mm_test_all_zeros(coeffs_8, _mm_set1_epi16(1)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // Each control word selects source bytes (4k + 2, 4k + 4): the low bytes of
+  // two neighbouring 16-bit taps, starting at tap 1.
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
+// Prepares the 16-bit coefficient pairs consumed by convolve_6tap(), using
+// taps 1..6 of the selected kernel.
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK. coeffs[0..2] receive the tap pairs (1, 2), (3, 4)
+// and (5, 6), each repeated in every 32-bit lane. Only three entries are
+// written; a fourth entry, if the caller provides one, is left untouched.
+static INLINE void prepare_coeffs_6t(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [3] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  // Read taps 0..7 and drop tap 0 with a 2-byte shift. Loading 16 bytes from
+  // filter + 1 instead would also touch filter[8], one element past taps
+  // 0..7; the three pairs used below are identical either way.
+  const __m128i taps_0_7 = _mm_loadu_si128((const __m128i *)filter);
+  const __m128i coeff_8 = _mm_srli_si128(taps_0_7, 2);
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 1 2 1 2 1 2 1 2
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 3 4 3 4 3 4 3 4
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 5 6 5 6 5 6 5 6
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
+// Prepares the four 16-bit coefficient pairs consumed by convolve().
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK and eight taps are read from it. coeffs[k]
+// (k = 0..3) receives (tap[2k], tap[2k + 1]) repeated in every 32-bit lane.
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  // The load only reads the kernel, so the pointer stays const-qualified.
+  const __m128i coeff_8 = _mm_loadu_si128((const __m128i *)filter);
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+// Prepares the six 16-bit coefficient pairs consumed by convolve_12taps().
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK and twelve taps are read from it. coeffs[k]
+// (k = 0..5) receives (tap[2k], tap[2k + 1]) repeated in every 32-bit lane,
+// so the caller must provide room for 6 entries, not 4.
+static INLINE void prepare_coeffs_12taps(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [6] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  // Taps 0..7. The loads only read the kernel, so the pointer stays
+  // const-qualified.
+  __m128i coeff_8 = _mm_loadu_si128((const __m128i *)filter);
+  __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+  // Taps 8..11 are loaded into the low 64 bits (coeffs 8 9 10 11 0 0 0 0) and
+  // then copied to every 64-bit lane.
+  coeff_8 = _mm_loadl_epi64((const __m128i *)(filter + 8));
+  coeff = _mm256_broadcastq_epi64(coeff_8);
+  coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00);  // coeffs 8 9 8 9 8 9 8 9
+  coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55);  // coeffs 10 11 10 11.. 10 11
+}
+
+// 8-tap low-bitdepth dot product. s[k] and coeffs[k] (k = 0..3) are combined
+// with _mm256_maddubs_epi16 (bytes of s unsigned, bytes of coeffs signed) and
+// the four partial sums are added with wrapping 16-bit adds.
+// Output order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+                                     const __m256i *const coeffs) {
+  const __m256i tap01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i tap23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i tap45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i tap67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  // Grouping kept as (01 + 45) + (23 + 67).
+  const __m256i sum_0145 = _mm256_add_epi16(tap01, tap45);
+  const __m256i sum_2367 = _mm256_add_epi16(tap23, tap67);
+  return _mm256_add_epi16(sum_0145, sum_2367);
+}
+
+// 6-tap low-bitdepth dot product. s[k] and coeffs[k] (k = 0..2) are combined
+// with _mm256_maddubs_epi16 and the three partial sums are added with
+// wrapping 16-bit adds.
+// Output order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+                                          const __m256i *const coeffs) {
+  const __m256i pair0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i pair1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i pair2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+  // Grouping kept as (pair0 + pair2) + pair1.
+  const __m256i outer = _mm256_add_epi16(pair0, pair2);
+  return _mm256_add_epi16(outer, pair1);
+}
+
+// 4-tap low-bitdepth dot product. s[0..1] and coeffs[0..1] are combined with
+// _mm256_maddubs_epi16 and the two partial sums are added with a wrapping
+// 16-bit add.
+// Output order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
+                                          const __m256i *const coeffs) {
+  const __m256i first_pair = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i second_pair = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  return _mm256_add_epi16(second_pair, first_pair);
+}
+
+// 6-tap dot product on 16-bit data. s[k] and coeffs[k] (k = 0..2) are
+// combined with _mm256_madd_epi16 and the 32-bit partial sums are added.
+static INLINE __m256i convolve_6tap(const __m256i *const s,
+                                    const __m256i *const coeffs) {
+  const __m256i pair0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i pair1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i pair2 = _mm256_madd_epi16(s[2], coeffs[2]);
+
+  // Grouping kept as (pair0 + pair1) + pair2.
+  const __m256i head = _mm256_add_epi32(pair0, pair1);
+  return _mm256_add_epi32(head, pair2);
+}
+
+// 12-tap dot product on 16-bit data. s[k] and coeffs[k] (k = 0..5) are
+// combined with _mm256_madd_epi16 and the six 32-bit partial sums are added.
+static INLINE __m256i convolve_12taps(const __m256i *const s,
+                                      const __m256i *const coeffs) {
+  const __m256i pair0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i pair1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i pair2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i pair3 = _mm256_madd_epi16(s[3], coeffs[3]);
+  const __m256i pair4 = _mm256_madd_epi16(s[4], coeffs[4]);
+  const __m256i pair5 = _mm256_madd_epi16(s[5], coeffs[5]);
+
+  // Grouping kept as (4 + 5) + ((0 + 1) + (2 + 3)).
+  const __m256i sum_01 = _mm256_add_epi32(pair0, pair1);
+  const __m256i sum_23 = _mm256_add_epi32(pair2, pair3);
+  const __m256i sum_45 = _mm256_add_epi32(pair4, pair5);
+  const __m256i sum_0123 = _mm256_add_epi32(sum_01, sum_23);
+  return _mm256_add_epi32(sum_45, sum_0123);
+}
+
+// 8-tap dot product on 16-bit data. s[k] and coeffs[k] (k = 0..3) are
+// combined with _mm256_madd_epi16 and the four 32-bit partial sums are added.
+static INLINE __m256i convolve(const __m256i *const s,
+                               const __m256i *const coeffs) {
+  const __m256i pair0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i pair1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i pair2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i pair3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+  // Grouping kept as (0 + 1) + (2 + 3).
+  const __m256i sum_01 = _mm256_add_epi32(pair0, pair1);
+  const __m256i sum_23 = _mm256_add_epi32(pair2, pair3);
+  return _mm256_add_epi32(sum_01, sum_23);
+}
+
+// 4-tap dot product on 16-bit data: madd of s[0..1] with coeffs[0..1],
+// followed by one 32-bit add.
+static INLINE __m256i convolve_4tap(const __m256i *const s,
+                                    const __m256i *const coeffs) {
+  const __m256i first_pair = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i second_pair = _mm256_madd_epi16(s[1], coeffs[1]);
+  return _mm256_add_epi32(first_pair, second_pair);
+}
+
+// Builds the four byte-shuffled views of data selected by filt[0..3] and
+// feeds them to the 8-tap low-bitdepth dot product convolve_lowbd().
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+                                       const __m256i *const coeffs,
+                                       const __m256i *const filt) {
+  __m256i views[4];
+  for (int k = 0; k < 4; ++k) {
+    views[k] = _mm256_shuffle_epi8(data, filt[k]);
+  }
+  return convolve_lowbd(views, coeffs);
+}
+
+// Builds the three byte-shuffled views of data selected by filt[0..2] and
+// feeds them to the 6-tap low-bitdepth dot product convolve_lowbd_6tap().
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+                                            const __m256i *const coeffs,
+                                            const __m256i *const filt) {
+  // Exactly three entries: convolve_lowbd_6tap() reads s[0..2] only.
+  __m256i s[3];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+  return convolve_lowbd_6tap(s, coeffs);
+}
+
+// Builds the two byte-shuffled views of data selected by filt[0..1] and
+// feeds them to the 4-tap low-bitdepth dot product convolve_lowbd_4tap().
+static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
+                                            const __m256i *const coeffs,
+                                            const __m256i *const filt) {
+  __m256i views[2];
+  for (int k = 0; k < 2; ++k) {
+    views[k] = _mm256_shuffle_epi8(data, filt[k]);
+  }
+  return convolve_lowbd_4tap(views, coeffs);
+}
+
+// Writes *res to dst, or the 32-bit average (dst + *res) >> 1 when do_average
+// is non-zero. dst is accessed with aligned 256-bit loads and stores, so it
+// must be 32-byte aligned.
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+                                         const __m256i *const res,
+                                         const int do_average) {
+  __m256i *const out = (__m256i *)dst;
+  if (!do_average) {
+    _mm256_store_si256(out, *res);
+    return;
+  }
+  const __m256i sum = _mm256_add_epi32(_mm256_load_si256(out), *res);
+  _mm256_store_si256(out, _mm256_srai_epi32(sum, 1));
+}
+
+// Combines two 16-bit predictions.
+//
+// use_dist_wtd_comp_avg == 0: returns (*data_ref_0 + *res_unsigned) >> 1 per
+// 16-bit lane (arithmetic shift).
+// Otherwise: the two inputs are interleaved, multiplied pairwise by *wt with
+// _mm256_madd_epi16, shifted right by DIST_PRECISION_BITS and packed back to
+// 16 bits with signed saturation. *wt presumably carries the (ref, res)
+// weight pair in every 32-bit lane - confirm against the caller.
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+                               const __m256i *const res_unsigned,
+                               const __m256i *const wt,
+                               const int use_dist_wtd_comp_avg) {
+  if (!use_dist_wtd_comp_avg) {
+    const __m256i sum = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+    return _mm256_srai_epi16(sum, 1);
+  }
+
+  const __m256i pairs_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+  const __m256i pairs_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+  const __m256i scaled_lo = _mm256_srai_epi32(
+      _mm256_madd_epi16(pairs_lo, *wt), DIST_PRECISION_BITS);
+  const __m256i scaled_hi = _mm256_srai_epi32(
+      _mm256_madd_epi16(pairs_hi, *wt), DIST_PRECISION_BITS);
+
+  return _mm256_packs_epi32(scaled_lo, scaled_hi);
+}
+
+// Returns ((*res_unsigned - *offset_const) + *round_const) >> round_shift,
+// computed per 16-bit lane with an arithmetic shift.
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+                                        const __m256i *const offset_const,
+                                        const __m256i *const round_const,
+                                        const int round_shift) {
+  const __m256i without_offset = _mm256_sub_epi16(*res_unsigned, *offset_const);
+  const __m256i biased = _mm256_add_epi16(without_offset, *round_const);
+  return _mm256_srai_epi16(biased, round_shift);
+}
+
+// Combines two predictions held in 32-bit lanes.
+//
+// use_dist_wtd_comp_avg == 0: returns (*data_ref_0 + *res_unsigned) >> 1.
+// Otherwise: returns
+// (*data_ref_0 * *wt0 + *res_unsigned * *wt1) >> DIST_PRECISION_BITS,
+// using the low 32 bits of each product. Both shifts are arithmetic.
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+                                      const __m256i *const res_unsigned,
+                                      const __m256i *const wt0,
+                                      const __m256i *const wt1,
+                                      const int use_dist_wtd_comp_avg) {
+  if (!use_dist_wtd_comp_avg) {
+    const __m256i sum = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+    return _mm256_srai_epi32(sum, 1);
+  }
+
+  const __m256i ref_term = _mm256_mullo_epi32(*data_ref_0, *wt0);
+  const __m256i res_term = _mm256_mullo_epi32(*res_unsigned, *wt1);
+  const __m256i weighted = _mm256_add_epi32(ref_term, res_term);
+  return _mm256_srai_epi32(weighted, DIST_PRECISION_BITS);
+}
+
+// Returns ((*res_unsigned - *offset_const) + *round_const) >> round_shift,
+// computed per 32-bit lane with an arithmetic shift.
+static INLINE __m256i highbd_convolve_rounding(
+    const __m256i *const res_unsigned, const __m256i *const offset_const,
+    const __m256i *const round_const, const int round_shift) {
+  const __m256i without_offset = _mm256_sub_epi32(*res_unsigned, *offset_const);
+  const __m256i biased = _mm256_add_epi32(without_offset, *round_const);
+  return _mm256_srai_epi32(biased, round_shift);
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 0000000000..9e8662af46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+// This header must be included after the x86 intrinsics headers it relies on.
+
+// Writes *res to dst, or the 32-bit average (dst + *res) >> 1 when do_average
+// is non-zero. dst is accessed with aligned 128-bit loads and stores, so it
+// must be 16-byte aligned.
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+                             const int do_average) {
+  __m128i *const out = (__m128i *)dst;
+  if (!do_average) {
+    _mm_store_si128(out, *res);
+    return;
+  }
+  const __m128i sum = _mm_add_epi32(_mm_load_si128(out), *res);
+  _mm_store_si128(out, _mm_srai_epi32(sum, 1));
+}
+
+// Prepares the six 16-bit coefficient pairs consumed by convolve_12tap().
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK and twelve taps are read from it. coeffs[k]
+// (k = 0..5) receives (tap[2k], tap[2k + 1]) repeated in every 32-bit lane.
+static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params,
+                                        int subpel_q4,
+                                        __m128i *coeffs /* [6] */) {
+  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+
+  // Taps 0..7. The loads only read the kernel, so the pointer stays
+  // const-qualified.
+  __m128i coeffs_y = _mm_loadu_si128((const __m128i *)y_filter);
+
+  coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0x00);  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[1] = _mm_shuffle_epi32(coeffs_y, 0x55);  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[2] = _mm_shuffle_epi32(coeffs_y, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[3] = _mm_shuffle_epi32(coeffs_y, 0xff);  // coeffs 6 7 6 7 6 7 6 7
+
+  // Taps 8..11 in the low 64 bits; the upper 64 bits are zero.
+  coeffs_y = _mm_loadl_epi64((const __m128i *)(y_filter + 8));
+
+  coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0x00);  // coeffs 8 9 8 9 8 9 8 9
+  coeffs[5] =
+      _mm_shuffle_epi32(coeffs_y, 0x55);  // coeffs 10 11 10 11 10 11 10 11
+}
+
+// 12-tap dot product on 16-bit data. s[k] and coeffs[k] (k = 0..5) are
+// combined with _mm_madd_epi16 and the six 32-bit partial sums are added.
+static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) {
+  const __m128i pair0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i pair1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i pair2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i pair3 = _mm_madd_epi16(s[3], coeffs[3]);
+  const __m128i pair4 = _mm_madd_epi16(s[4], coeffs[4]);
+  const __m128i pair5 = _mm_madd_epi16(s[5], coeffs[5]);
+
+  // Grouping kept as (4 + 5) + ((0 + 1) + (2 + 3)).
+  const __m128i sum_01 = _mm_add_epi32(pair0, pair1);
+  const __m128i sum_23 = _mm_add_epi32(pair2, pair3);
+  const __m128i sum_45 = _mm_add_epi32(pair4, pair5);
+  return _mm_add_epi32(sum_45, _mm_add_epi32(sum_01, sum_23));
+}
+
+// Zero-extends the bytes of s[0..3] to 16 bits and runs convolve_12tap().
+// The first four tap pairs come from the low halves of s[0..3]; the last two
+// come from the high halves of s[2] and s[3]. zero is the all-zero register
+// supplied by the caller. The trailing layout notes are the original
+// author's description of the expected input arrangement.
+static INLINE __m128i convolve_lo_x_12tap(const __m128i *s,
+                                          const __m128i *coeffs,
+                                          const __m128i zero) {
+  __m128i widened[6];
+  for (int k = 0; k < 4; ++k) {
+    // k = 0: 0 1 1 2 2 3 3 4     k = 1: 2 3 3 4 4 5 5 6
+    // k = 2: 4 5 5 6 6 7 7 8     k = 3: 6 7 7 8 8 9 9 10
+    widened[k] = _mm_unpacklo_epi8(s[k], zero);
+  }
+  widened[4] = _mm_unpackhi_epi8(s[2], zero);  // 8 9 9 10 10 11 11 12
+  widened[5] = _mm_unpackhi_epi8(s[3], zero);  // 10 11 11 12 12 13 13 14
+  return convolve_12tap(widened, coeffs);
+}
+
+// Zero-extends the low 8 bytes of s[0], s[2], ..., s[10] to 16 bits and runs
+// convolve_12tap() on them. Odd-indexed entries of s are not read.
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[6];
+  for (int k = 0; k < 6; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[2 * k], zero);
+  }
+  return convolve_12tap(widened, coeffs);
+}
+
+// Zero-extends the high 8 bytes of s[0], s[2], ..., s[10] to 16 bits and runs
+// convolve_12tap() on them. Odd-indexed entries of s are not read.
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[6];
+  for (int k = 0; k < 6; ++k) {
+    widened[k] = _mm_unpackhi_epi8(s[2 * k], zero);
+  }
+  return convolve_12tap(widened, coeffs);
+}
+#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..36b7d62b98
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+#include "config/aom_scale_rtcd.h"
+
+// Note:
+// This header must be included after the x86 intrinsics headers it relies on.
+// Prepares the four 16-bit coefficient pairs consumed by convolve().
+//
+// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
+// subpel_q4 & SUBPEL_MASK and eight taps are read from it. coeffs[k]
+// (k = 0..3) receives (tap[2k], tap[2k + 1]) repeated in every 32-bit lane.
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  // The load only reads the kernel, so the pointer stays const-qualified.
+  const __m128i coeff = _mm_loadu_si128((const __m128i *)filter);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+// 8-tap dot product on 16-bit data. s[k] and coeffs[k] (k = 0..3) are
+// combined with _mm_madd_epi16 and the four 32-bit partial sums are added.
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  const __m128i pair0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i pair1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i pair2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i pair3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+  // Grouping kept as (0 + 1) + (2 + 3).
+  const __m128i sum_01 = _mm_add_epi32(pair0, pair1);
+  const __m128i sum_23 = _mm_add_epi32(pair2, pair3);
+  return _mm_add_epi32(sum_01, sum_23);
+}
+
+// Zero-extends the low 8 bytes of s[0..3] to 16 bits and runs convolve().
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Zero-extends the low 8 bytes of s[0], s[2], s[4] and s[6] to 16 bits and
+// runs convolve() on them. Odd-indexed entries of s are not read.
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Zero-extends the high 8 bytes of s[0], s[2], s[4] and s[6] to 16 bits and
+// runs convolve() on them. Odd-indexed entries of s are not read.
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpackhi_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Combines two 16-bit predictions.
+//
+// use_dist_wtd_avg == 0: returns (*data_ref_0 + *res_unsigned) >> 1 per
+// 16-bit lane (arithmetic shift).
+// Otherwise: the two inputs are interleaved, multiplied pairwise by *wt with
+// _mm_madd_epi16, shifted right by DIST_PRECISION_BITS and packed back to
+// 16 bits with signed saturation. *wt presumably carries the (ref, res)
+// weight pair in every 32-bit lane - confirm against the caller.
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+                               const __m128i *const res_unsigned,
+                               const __m128i *const wt,
+                               const int use_dist_wtd_avg) {
+  if (!use_dist_wtd_avg) {
+    const __m128i sum = _mm_add_epi16(*data_ref_0, *res_unsigned);
+    return _mm_srai_epi16(sum, 1);
+  }
+
+  const __m128i pairs_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+  const __m128i pairs_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+  const __m128i scaled_lo =
+      _mm_srai_epi32(_mm_madd_epi16(pairs_lo, *wt), DIST_PRECISION_BITS);
+  const __m128i scaled_hi =
+      _mm_srai_epi32(_mm_madd_epi16(pairs_hi, *wt), DIST_PRECISION_BITS);
+
+  return _mm_packs_epi32(scaled_lo, scaled_hi);
+}
+
+// Returns ((*res_unsigned - *offset_const) + *round_const) >> round_shift,
+// computed per 16-bit lane with an arithmetic shift.
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+                                        const __m128i *const offset_const,
+                                        const __m128i *const round_const,
+                                        const int round_shift) {
+  const __m128i without_offset = _mm_sub_epi16(*res_unsigned, *offset_const);
+  const __m128i biased = _mm_add_epi16(without_offset, *round_const);
+  return _mm_srai_epi16(biased, round_shift);
+}
+
+// Returns ((*res_unsigned - *offset_const) + *round_const) >> round_shift,
+// computed per 32-bit lane with an arithmetic shift.
+static INLINE __m128i highbd_convolve_rounding_sse2(
+    const __m128i *const res_unsigned, const __m128i *const offset_const,
+    const __m128i *const round_const, const int round_shift) {
+  const __m128i without_offset = _mm_sub_epi32(*res_unsigned, *offset_const);
+  const __m128i biased = _mm_add_epi32(without_offset, *round_const);
+  return _mm_srai_epi32(biased, round_shift);
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 0000000000..b1a3bb4664
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+
+// Note:
+// This header must be included after the x86 intrinsics headers it relies on.
+
+// Writes *res to dst, or, when do_average is non-zero, the weighted blend
+// (dst * *wt0 + *res * *wt1) >> DIST_PRECISION_BITS per 32-bit lane. dst is
+// accessed with aligned 128-bit loads and stores, so it must be 16-byte
+// aligned.
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+                                  const __m128i *const res,
+                                  const __m128i *const wt0,
+                                  const __m128i *const wt1,
+                                  const int do_average) {
+  __m128i *const out = (__m128i *)dst;
+  if (!do_average) {
+    _mm_store_si128(out, *res);
+    return;
+  }
+  const __m128i prev_term = _mm_mullo_epi32(_mm_load_si128(out), *wt0);
+  const __m128i new_term = _mm_mullo_epi32(*res, *wt1);
+  const __m128i weighted = _mm_add_epi32(prev_term, new_term);
+  _mm_store_si128(out, _mm_srai_epi32(weighted, DIST_PRECISION_BITS));
+}
+
+// Combines two predictions held in 32-bit lanes.
+//
+// use_dist_wtd_avg == 0: returns (*data_ref_0 + *res_unsigned) >> 1.
+// Otherwise: returns
+// (*data_ref_0 * *wt0 + *res_unsigned * *wt1) >> DIST_PRECISION_BITS,
+// using the low 32 bits of each product. Both shifts are arithmetic.
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+                                             const __m128i *const res_unsigned,
+                                             const __m128i *const wt0,
+                                             const __m128i *const wt1,
+                                             const int use_dist_wtd_avg) {
+  if (!use_dist_wtd_avg) {
+    const __m128i sum = _mm_add_epi32(*data_ref_0, *res_unsigned);
+    return _mm_srai_epi32(sum, 1);
+  }
+
+  const __m128i ref_term = _mm_mullo_epi32(*data_ref_0, *wt0);
+  const __m128i res_term = _mm_mullo_epi32(*res_unsigned, *wt1);
+  const __m128i weighted = _mm_add_epi32(ref_term, res_term);
+  return _mm_srai_epi32(weighted, DIST_PRECISION_BITS);
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_ssse3.h b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..b1abead146
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <tmmintrin.h> // SSSE3
+
+// Loads eight 16-bit filter taps from `filter` (aligned 128-bit load) and
+// fills f[0..3]. Each f[i] holds the low bytes of taps 2*i and 2*i + 1,
+// repeated in every 16-bit lane of the vector.
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i taps = _mm_load_si128((const __m128i *)filter);
+  // Byte-shuffle masks: the low byte of each 16-bit lane selects the low byte
+  // of the even tap, the high byte selects the low byte of the odd tap.
+  const __m128i pick_0_1 = _mm_set1_epi16(0x0200u);
+  const __m128i pick_2_3 = _mm_set1_epi16(0x0604u);
+  const __m128i pick_4_5 = _mm_set1_epi16(0x0a08u);
+  const __m128i pick_6_7 = _mm_set1_epi16(0x0e0cu);
+  f[0] = _mm_shuffle_epi8(taps, pick_0_1);
+  f[1] = _mm_shuffle_epi8(taps, pick_2_3);
+  f[2] = _mm_shuffle_epi8(taps, pick_4_5);
+  f[3] = _mm_shuffle_epi8(taps, pick_6_7);
+}
+
+// Multiplies the byte pairs in s[0..3] by the byte pairs in f[0..3]
+// (_mm_maddubs_epi16), sums the four partial results, adds the rounding
+// offset 64 and shifts each 16-bit lane right by 7.
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+                                        const __m128i *const f) {
+  const __m128i round_offset = _mm_set1_epi16(1 << 6);
+  // One multiply-add of adjacent elements per source/filter pair.
+  const __m128i p0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i p1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i p2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i p3 = _mm_maddubs_epi16(s[3], f[3]);
+
+  // Pairing p0 with p2 and p1 with p3 is the only order that stays in range
+  // for all filters, so only the final addition needs to saturate. The
+  // rounding offset is folded into the first pair to avoid a second
+  // saturating add.
+  const __m128i even_sum =
+      _mm_add_epi16(_mm_add_epi16(p0, p2), round_offset);
+  const __m128i odd_sum = _mm_add_epi16(p1, p3);
+
+  // Saturating final add, then >> 7 in each 16-bit lane.
+  return _mm_srai_epi16(_mm_adds_epi16(even_sum, odd_sum), 7);
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 0000000000..3f5a9bbeff
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+// SSE2 helpers defined in fft_sse2.c; the AVX2 wrappers in this file pass
+// them to aom_fft_2d_gen() / aom_ifft_2d_gen().
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+// The wrappers below refer to the generated functions as aom_fft1d_8_avx2,
+// aom_fft1d_16_avx2 and aom_fft1d_32_avx2.
+// NOTE(review): the GEN_FFT_* macros are presumably provided by
+// aom_dsp/fft_common.h -- confirm.
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+// 8x8 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the AVX2 1-D
+// transform and the SSE2 transpose/unpack helpers.
+// NOTE(review): the trailing 8 looks like the vector width in floats (one
+// __m256) -- confirm against aom_fft_2d_gen().
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// 16x16 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the AVX2
+// 1-D transform and the SSE2 transpose/unpack helpers.
+// NOTE(review): the trailing 8 looks like the vector width in floats (one
+// __m256) -- confirm against aom_fft_2d_gen().
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// 32x32 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the AVX2
+// 1-D transform and the SSE2 transpose/unpack helpers.
+// NOTE(review): the trailing 8 looks like the vector width in floats (one
+// __m256) -- confirm against aom_fft_2d_gen().
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+// The wrappers below refer to the generated functions as aom_ifft1d_8_avx2,
+// aom_ifft1d_16_avx2 and aom_ifft1d_32_avx2.
+// NOTE(review): the GEN_IFFT_* macros are presumably provided by
+// aom_dsp/fft_common.h -- confirm.
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+// 8x8 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_8_float), the AVX2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 8 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+// 16x16 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_16_float), the AVX2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 8 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+// 32x32 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_32_float), the AVX2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 8 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 0000000000..bdd235bcd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+// Transposes a single 4x4 tile of floats. Source rows are read from A with
+// a row pitch of lda floats; the transposed rows are written to B with a
+// row pitch of ldb floats. All accesses use the aligned _mm_load_ps /
+// _mm_store_ps intrinsics, and all four loads happen before any store.
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+                                const int ldb) {
+  __m128 r0 = _mm_load_ps(A + 0 * lda);
+  __m128 r1 = _mm_load_ps(A + 1 * lda);
+  __m128 r2 = _mm_load_ps(A + 2 * lda);
+  __m128 r3 = _mm_load_ps(A + 3 * lda);
+
+  // In-register 4x4 transpose; r0..r3 are updated in place.
+  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+
+  _mm_store_ps(B + 0 * ldb, r0);
+  _mm_store_ps(B + 1 * ldb, r1);
+  _mm_store_ps(B + 2 * ldb, r2);
+  _mm_store_ps(B + 3 * ldb, r3);
+}
+
+// Referenced by fft_avx2.c.
+void aom_transpose_float_sse2(const float *A, float *B, int n);
+
+// Writes the transpose of the n x n float matrix A into B, one 4x4 tile at a
+// time: the tile at (row, col) in A lands at (col, row) in B. The loops step
+// by 4, so n is expected to be a multiple of 4.
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+  for (int row = 0; row < n; row += 4) {
+    const float *const src_row = A + row * n;
+    for (int col = 0; col < n; col += 4) {
+      transpose4x4(src_row + col, B + col * n + row, n, n);
+    }
+  }
+}
+
+// Referenced by fft_avx2.c.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);
+
+// Expands the packed n x n result in `packed` into interleaved
+// (real, imaginary) pairs in `output`: element (r, c) is written to
+// output[2 * (r * n + c)] and output[2 * (r * n + c) + 1]. For every row,
+// only columns 0..n/2 are written here.
+// NOTE(review): the aligned _mm_load_ps/_mm_store_ps calls below assume
+// `packed` and `output` are 16-byte aligned -- confirm against callers.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ // The entries at (0, 0), (n2, 0), (0, n2) and (n2, n2) are copied straight
+ // from `packed` and get a zero imaginary part.
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ // Rows 0 and n2, columns 1..n2-1: real part from column c, imaginary part
+ // from column c + n2 of the same packed row.
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ // Columns 0 and n2 of row r: real part from packed row r, imaginary part
+ // from packed row r + n2.
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ // Columns 1..3 (fewer when n2 < 4) are done in scalar code; the vector
+ // loop below starts at column 4.
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ // Same formulas as the scalar loop, four columns per iteration.
+ // unpacklo/unpackhi interleave the real and imaginary lanes for storing.
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ // Output row r2 = r + n2 is built from packed rows r3 = n - r2 and
+ // r3 + n2, using the same formulas as above but with the sign of every
+ // term read from packed row r3 + n2 flipped.
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ // Vector version of the loop above for columns 4..n2-1.
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
+
+// Generate definitions for 1d forward transforms using float and __m128.
+// The wrappers below refer to the generated functions as aom_fft1d_4_sse2,
+// aom_fft1d_8_sse2, aom_fft1d_16_sse2 and aom_fft1d_32_sse2. The size-4
+// generator takes no multiply intrinsic.
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+// 4x4 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the SSE2 1-D
+// transform, transpose and unpack helpers.
+// NOTE(review): the trailing 4 looks like the vector width in floats (one
+// __m128) -- confirm against aom_fft_2d_gen().
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// 8x8 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the SSE2 1-D
+// transform, transpose and unpack helpers.
+// NOTE(review): the trailing 4 looks like the vector width in floats (one
+// __m128) -- confirm against aom_fft_2d_gen().
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// 16x16 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the SSE2
+// 1-D transform, transpose and unpack helpers.
+// NOTE(review): the trailing 4 looks like the vector width in floats (one
+// __m128) -- confirm against aom_fft_2d_gen().
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// 32x32 forward 2-D float FFT: forwards to aom_fft_2d_gen() with the SSE2
+// 1-D transform, transpose and unpack helpers.
+// NOTE(review): the trailing 4 looks like the vector width in floats (one
+// __m128) -- confirm against aom_fft_2d_gen().
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128.
+// The wrappers below refer to the generated functions as aom_ifft1d_4_sse2,
+// aom_ifft1d_8_sse2, aom_ifft1d_16_sse2 and aom_ifft1d_32_sse2. The size-4
+// generator takes no multiply intrinsic.
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+// 4x4 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_4_float), the SSE2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 4 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+// 8x8 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_8_float), the SSE2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 4 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+// 16x16 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_16_float), the SSE2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 4 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+// 32x32 inverse 2-D float FFT: forwards to aom_ifft_2d_gen(), handing it the
+// scalar forward transform (aom_fft1d_32_float), the SSE2 forward and inverse
+// 1-D transforms and the SSE2 transpose helper.
+// NOTE(review): the trailing 4 looks like the vector width in floats --
+// confirm against aom_ifft_2d_gen().
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..7ee8ba330e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+//
+// ADD_EPI16 / SUB_EPI16 select the 16-bit add/subtract used by the transform
+// stages below. The including file must define DCT_HIGH_BIT_DEPTH first.
+#if DCT_HIGH_BIT_DEPTH
+// High bit-depth: saturating arithmetic.
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+// Low bit-depth: plain (wrapping) arithmetic.
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+// Shared core of the 4x4 forward DCT entry points in this file. Loads four
+// rows of four int16 samples from `input` (rows are `stride` elements apart),
+// scales them by 16, runs the vertical and then the horizontal 1-D passes,
+// and leaves the 16 results in *in0 and *in1 (eight 16-bit lanes each) for
+// the caller to store. The last rounding shift uses DCT_CONST_BITS + 2, so
+// the final scaling by 1/4 is already applied to the values returned here.
+static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
+ __m128i *in1) {
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ // Lane 0 is compared against 0 and lanes 1..7 against 1 (see the mask
+ // computation below); bias_b then adds 1 to lane 0 only.
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+
+ // Load inputs.
+ // Rows 0 and 3 share *in0 and rows 1 and 2 share *in1, which pairs up the
+ // operands of the first add/subtract stage.
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ *in1 = _mm_unpacklo_epi64(
+ *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ *in0 = _mm_unpacklo_epi64(
+ *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ // multiply by 16 to give some extra precision
+ *in0 = _mm_slli_epi16(*in0, 4);
+ *in1 = _mm_slli_epi16(*in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero, all
+ // other comparison will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a);
+ *in0 = _mm_add_epi16(*in0, mask);
+ *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b);
+ }
+ // There are 4 total stages, alternating between an add/subtract stage
+ // followed by a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ *in0 = _mm_shuffle_epi32(x0, 0xD8);
+ *in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(*in0, *in1);
+ const __m128i t1 = SUB_EPI16(*in0, *in1);
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+ // but this combines the final right-shift as well to save operations
+ // This unusual rounding operation is there to maintain bit-accurate
+ // compatibility with the C version of this function, which has two
+ // rounding steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ *in0 = _mm_packs_epi32(w0, w2);
+ *in1 = _mm_packs_epi32(w1, w3);
+ }
+ }
+}
+
+// 4x4 forward DCT writing tran_low_t coefficients.
+//
+// The 2D transform is 4 vertical 1D transforms followed by 4 horizontal 1D
+// transforms, with the multiplies and adds as given by Chen, Smith and
+// Fralick ('77); all of that work happens in FDCT4x4_2D_HELPER. In the
+// helper's comments the 16 inputs are referred to as i0 through iF (raster
+// order), intermediate values as a0, b0, c0 through f (in-place computations
+// mapped to input locations), and the outputs as o0 through oF.
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i first_half, second_half;
+  FDCT4x4_2D_HELPER(input, stride, &first_half, &second_half);
+
+  // The (v + 1) >> 2 post-condition is already folded into the helper's last
+  // add and right-shift. Two stores are enough because output rows 1/3 sit
+  // directly after rows 0/2 in each vector.
+  storeu_output(&first_half, output);
+  storeu_output(&second_half, output + 8);
+}
+
+// 4x4 forward DCT writing int16_t coefficients: runs FDCT4x4_2D_HELPER and
+// stores its two result vectors (eight coefficients each) back to back with
+// unaligned 128-bit stores.
+void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
+  __m128i first_half, second_half;
+  __m128i *const out_lo = (__m128i *)output;
+  __m128i *const out_hi = (__m128i *)(output + 8);
+  FDCT4x4_2D_HELPER(input, stride, &first_half, &second_half);
+  _mm_storeu_si128(out_lo, first_half);
+  _mm_storeu_si128(out_hi, second_half);
+}
+
+#if CONFIG_INTERNAL_STATS
+// 8x8 forward DCT. Loads eight rows of eight int16 samples (rows `stride`
+// elements apart, aligned 128-bit loads), pre-scales them by 4, runs the
+// same 1-D column transform twice with a transpose after each pass, halves
+// the results (rounding toward zero) and writes them via store_output().
+// When DCT_HIGH_BIT_DEPTH is set, any detected 16-bit overflow abandons the
+// SIMD path and calls aom_highbd_fdct8x8_c() on the original input instead.
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ // This first-stage check is only made on the second pass.
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // Listed in the order the vectors are paired below:
+ // tr1_0, tr1_4, tr1_2, tr1_6, tr1_1, tr1_5, tr1_3, tr1_7.
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ // (n >> 15 is -1 for negative n, so negative values are incremented
+ // before the shift and the quotient rounds toward zero.)
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..0e4fb80468
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
+#define DCT_HIGH_BIT_DEPTH 0 // first inclusion of the template header below is done with DCT_HIGH_BIT_DEPTH == 0
+#define FDCT4x4_2D_HELPER fdct4x4_helper
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOTE(review): presumably expands one function per FDCT* name defined above -- confirm in the impl header
+#undef FDCT4x4_2D_HELPER
+#undef FDCT4x4_2D
+#undef FDCT4x4_2D_LP
+#undef FDCT8x8_2D
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#undef DCT_HIGH_BIT_DEPTH
+#define DCT_HIGH_BIT_DEPTH 1 // second inclusion: same template header, now with DCT_HIGH_BIT_DEPTH == 1
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 // only the 8x8 name is defined for this pass; the 4x4 names were #undef'd above
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT8x8_2D
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..78ea98522e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { // per 64-bit lane: a.lo32 * b.lo32 + a.hi32 * b.hi32, built from unsigned 32x32->64 multiplies
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b); // products of the low (even-indexed) 32-bit elements
+ a = _mm_srli_epi64(a, 32); // move the high (odd-indexed) 32-bit elements down to the low position
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b); // products of the odd-indexed elements
+ return _mm_add_epi64(buf0, buf1); // two 64-bit sums
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { // narrows four 64-bit lanes (two from a, two from b) to four 32-bit lanes by keeping each lane's low 32 bits
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); // 32-bit elements 0 and 2 of a land in the low 64 bits
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); // same selection for b
+ return _mm_unpacklo_epi64(buf0, buf1); // result = { a[0], a[2], b[0], b[2] } as 32-bit elements; no saturation is applied
+}
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0, // returns nonzero if any 16-bit lane of *preg0 or *preg1 equals 0x7fff or 0x8000
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff); // largest int16 value
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000); // smallest int16 value
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1); // merge the per-lane hit masks of both registers
+ return _mm_movemask_epi8(cmp0); // byte mask; callers only need zero vs. nonzero
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0, // returns nonzero if any 16-bit lane of the four registers equals 0x7fff or 0x8000
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff); // largest int16 value
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000); // smallest int16 value
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); // merge all four per-lane hit masks
+ return _mm_movemask_epi8(cmp0); // byte mask; callers only need zero vs. nonzero
+}
+
+static INLINE int check_epi16_overflow_x8( // returns nonzero if any of the eight registers holds a 16-bit lane equal to 0x7fff or 0x8000
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1; // both terms are movemask results, so the sum is zero only when both are zero
+}
+
+static INLINE int check_epi16_overflow_x12( // returns nonzero if any of the twelve registers holds a 16-bit lane equal to 0x7fff or 0x8000
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); // third group is skipped only when res0 already reports a hit
+ return res0 + res1; // meaningful as zero vs. nonzero only
+}
+
+static INLINE int check_epi16_overflow_x16( // returns nonzero if any of the sixteen registers holds a 16-bit lane equal to 0x7fff or 0x8000
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) { // later groups are examined only while no hit has been recorded in res0
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); // skipped only when res1 is already nonzero
+ }
+ return res0 + res1; // meaningful as zero vs. nonzero only
+}
+
+static INLINE int check_epi16_overflow_x32( // returns nonzero if any of the 32 registers holds a 16-bit lane equal to 0x7fff or 0x8000
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) { // each nesting level is entered only while the alternating accumulator is still zero, so a skip always implies a hit was already found
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1; // meaningful as zero vs. nonzero only
+}
+
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { // sign-extends the eight int16 lanes of *poutput to 32 bits and writes them to dst_ptr with aligned stores
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); // 0xffff in lanes holding a negative value, 0 elsewhere
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); // interleaving value and sign word yields sign-extended 32-bit lanes 0..3
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); // sign-extended lanes 4..7
+ _mm_store_si128((__m128i *)(dst_ptr), out0); // aligned store: dst_ptr must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1); // NOTE(review): the +4 step per 16-byte store assumes tran_low_t is 32 bits wide -- confirm
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { // same as store_output but with unaligned stores, so dst_ptr needs no 16-byte alignment
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); // 0xffff in lanes holding a negative value, 0 elsewhere
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); // sign-extended 32-bit lanes 0..3
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); // sign-extended 32-bit lanes 4..7
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); // NOTE(review): the +4 step per 16-byte store assumes tran_low_t is 32 bits wide -- confirm
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..06879040b0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,379 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170 ; 2 * 11585 in every word; used as the pmulhrsw multiplier in fdct8x8
+pd_8192: times 4 dd 8192 ; 1 << 13, the rounding term added before each "psrad x, 14"
+
+%macro TRANSFORM_COEFFS 2 ; emits two 8-word tables for pmaddwd on interleaved (a, b) word pairs
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 ; pmaddwd gives a * %1 + b * %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 ; pmaddwd gives a * %2 - b * %1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585 ; pw_11585_11585 / pw_11585_m11585
+TRANSFORM_COEFFS 15137, 6270 ; pair used under the "sin(pi / 8), cos(pi / 8)" step
+TRANSFORM_COEFFS 16069, 3196 ; pair used under the "sin(pi / 16), cos(pi / 16)" step
+TRANSFORM_COEFFS 9102, 13623 ; pair used under the "sin(3 * pi / 16), cos(3 * pi / 16)" step
+
+%macro STORE_OUTPUT 2 ; index, result -- sign-extends the 8 words of m<result> to dwords and stores them at outputq + 4*index; clobbers m11, m12 and m<result>
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11 ; m11 = 0
+ pcmpgtw m11, m%2 ; m11 = (0 > x) per word: 0xffff for negative lanes
+ movdqa m12, m%2 ; keep a copy because punpcklwd overwrites m<result>
+ punpcklwd m%2, m11 ; low four words widened to dwords
+ punpckhwd m12, m11 ; high four words widened to dwords
+ mova [outputq + 4*%1 + 0], m%2 ; index is scaled by 4 bytes per output element
+ mova [outputq + 4*%1 + 16], m12 ; second half goes 16 bytes further
+%endmacro
+
+SECTION .text
+
+%if AOM_ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride ; 8x8 forward DCT: 3 args (input, output, stride), 5 GPRs, XMM m0-m12
+
+ mova m8, [GLOBAL(pd_8192)] ; m8 = rounding term for the "psrad x, 14" steps
+ mova m12, [GLOBAL(pw_11585x2)] ; m12 = pmulhrsw multiplier
+
+ lea r3, [2 * strideq] ; r3 = byte offset of one row (stride counted in 2-byte elements)
+ lea r4, [4 * strideq] ; r4 = byte offset of two rows
+ mova m0, [inputq] ; rows 0..7 are loaded into m0..m7, eight words each
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7 ; three-operand form: m10 = m0 + m7
+ psubw m0, m7 ; m0 = m0 - m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12 ; rounded multiply by 23170 / 32768
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9 ; interleave word pairs so pmaddwd can form a*c0 + b*c1
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8 ; add 8192 then shift right by 14: rounded scaling of the 32-bit sums
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9 ; back to words with signed saturation
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7 ; 16-bit interleave
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3 ; 32-bit interleave
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3 ; 64-bit interleave completes the 8x8 word transpose
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ; stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3 ; this pass forms the 11585 sum/difference with pmaddwd on 32-bit intermediates instead of pmulhrsw
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12 ; last uses of m12 as the multiplier; STORE_OUTPUT later reuses m12 as scratch
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8 ; last group that needs m8 as the rounding term; m8 is reused as scratch below
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
+ psraw m1, m4, 15 ; divide by two, rounding toward zero: n / 2 = (n - (n >> 15)) >> 1; this is the sign word (0 or -1)
+ psraw m6, m5, 15
+ psraw m8, m3, 15 ; m8 no longer holds pd_8192 from here on
+ psraw m11, m0, 15
+
+ psubw m4, m1 ; n - sign adds 1 to negative values
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15 ; same halving for the remaining four rows
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ STORE_OUTPUT 0, 4
+ STORE_OUTPUT 8, 5
+ STORE_OUTPUT 16, 3
+ STORE_OUTPUT 24, 0
+ STORE_OUTPUT 32, 10
+ STORE_OUTPUT 40, 9
+ STORE_OUTPUT 48, 7
+ STORE_OUTPUT 56, 2
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..05c87bcff9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void highbd_load_b_values_avx2( // loads eight int16 entries from each quantizer table and widens them to eight int32 lanes
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift) {
+ *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); // aligned 128-bit load: each table pointer must be 16-byte aligned
+ *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); // store zbin - 1 so a later "greater than" compare acts as "greater or equal to zbin"
+ *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ *dequant =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
+}
+
+static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, // folds the iscan values of the lanes selected by *cmp_mask into the running per-lane maximum *mask
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i temp_mask = _mm256_setzero_si256();
+ if (_mm256_movemask_epi8(*cmp_mask)) { // at least one lane is selected
+ __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); // sixteen int16 values, unaligned load
+ temp_mask = _mm256_and_si256(*cmp_mask, iscan); // keep iscan only in the selected lanes
+ *is_found = 1; // only ever set here, never cleared
+ }
+ *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, // compares 16 values (scaled by 1 << AOM_QM_BITS) against threshold[0..1] and records the iscan of those that exceed it
+ __m256i *threshold,
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i coeff[2], cmp_mask0, cmp_mask1;
+ coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); // first eight values use threshold[0]
+ coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
+ cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); // next eight values use threshold[1]
+ cmp_mask0 =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); // packs works per 128-bit lane; the 0xd8 permute puts the sixteen 16-bit masks back in coefficient order
+ highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, // *p = low 32 bits of ((x * y) >> shift) for each of the eight 32-bit lanes, using 64-bit products
+ __m256i *p, const int shift) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y); // signed 32x32->64 products of the even lanes
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32); // move odd lanes of x into the even position
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32); // same for y
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); // signed products of the odd lanes
+
+ prod_lo = _mm256_srli_epi64(prod_lo, shift);
+ prod_hi = _mm256_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32); // put the odd-lane results back into the odd 32-bit positions
+ *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); // 0xaa: odd lanes from prod_hi, even lanes from prod_lo; *p is written last, so p may alias x
+}
+
+static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, // in place: t = coeff + round; coeff = ((((t * quant) >> 16) + t) * shift) >> (16 - log_scale)
+ const __m256i *round,
+ const __m256i *quant,
+ const __m256i *shift,
+ const int *log_scale) {
+ __m256i tmp, qcoeff;
+ qcoeff = _mm256_add_epi32(*coeff, *round);
+ highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); // tmp = (t * quant) >> 16
+ qcoeff = _mm256_add_epi32(tmp, qcoeff); // tmp + t
+ highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); // result overwrites *coeff
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, // returns qcoeff * dequant per 32-bit lane
+ __m256i dequant) {
+ return _mm256_mullo_epi32(qcoeff, dequant); // keeps only the low 32 bits of each product
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( // returns sign(qcoeff) * ((|qcoeff| * dequant) >> log_scale) per 32-bit lane
+ __m256i qcoeff, __m256i dequant, const int log_scale) {
+ __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
+ highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); // input and output alias; the helper reads x before it writes p
+ return _mm256_sign_epi32(abs_coeff, qcoeff); // reapply the sign of qcoeff (lanes where qcoeff is 0 become 0)
+}
+
+static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, // writes two 256-bit vectors to coeff_ptr with aligned stores
+ __m256i coeff1,
+ tran_low_t *coeff_ptr) {
+ _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); // aligned store: coeff_ptr must be 32-byte aligned
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); // NOTE(review): the +8 step per 32-byte store assumes tran_low_t is 32 bits wide -- confirm
+}
+
+void aom_highbd_quantize_b_adaptive_avx2( // quantizes n_coeffs coefficients 16 at a time (log_scale 0), writes qcoeff/dqcoeff and the end-of-block position
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16; // the first 16 coefficients are handled before the loop
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero; // mask0 tracks the raised threshold, mask1 tracks plain zbin
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];
+ const int log_scale = 0;
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) { // index 0 and 1 of the tables; element 0 is used for the first coefficient only (see the blend below)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; // minus 1 because the vector compare is strictly "greater than"
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); // lane 0 keeps thresh[0]; lanes 1..7 take thresh[1]
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
+ &quant, dequant_ptr, &dequant, quant_shift_ptr,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); // aligned load: coeff_ptr must be 32-byte aligned
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin); // drops table element 0 so every lane holds a value from elements 1..7
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1]; // from here on all lanes use thresh[1]
+ if (_mm256_movemask_epi8(cmp_mask) == 0) { // nothing in this group exceeds zbin: emit zeros
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round); // still drop element 0 of each table for the loop below
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); // first eight still use the vectors that contain element 0
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) { // 16 coefficients per iteration
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) { // whole group below zbin: store zeros and move on
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); // restore signs
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0); // zero the lanes that did not exceed zbin
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1)); // reduce the two 128-bit halves to one
+ non_zero_count = calculate_non_zero_count(temp_mask0); // helper is defined outside this file section
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { // clear scan positions at or beyond the count derived from the raised threshold
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) { // last scan position that still holds a nonzero qcoeff
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1; // 0 when no nonzero coefficient remains
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) { // first scan position holding a nonzero qcoeff
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { // exactly one nonzero coefficient in the block
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); // rc != 0 picks table entry 0 for position 0 and entry 1 otherwise
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { // lone +/-1 below the stricter threshold: drop it and report an empty block
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_avx2( // same flow as aom_highbd_quantize_b_adaptive_avx2 but with log_scale 1: zbin/round are halved with rounding and dqcoeff is shifted right by 1
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16; // the first 16 coefficients are handled before the loop
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const int log_scale = 1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero; // mask0 tracks the raised threshold, mask1 tracks plain zbin
+ __m128i temp_mask0, temp_mask1;
+ const __m256i one = _mm256_set1_epi32(1);
+ const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
+ int prescan_add[2];
+ int thresh[2];
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; // scalar copies of the scaled zbin values
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; // minus 1 because the vector compare is strictly "greater than"
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); // lane 0 keeps thresh[0]; lanes 1..7 take thresh[1]
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); // aligned 128-bit loads: table pointers must be 16-byte aligned
+ round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ shift =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));
+
+ // Shift with rounding.
+ zbin = _mm256_add_epi32(zbin, log_scale_vec); // adds log_scale (1 here) and then shifts right by 1, i.e. (x + 1) >> 1
+ round = _mm256_add_epi32(round, log_scale_vec);
+ zbin = _mm256_srli_epi32(zbin, log_scale);
+ round = _mm256_srli_epi32(round, log_scale);
+ zbin = _mm256_sub_epi32(zbin, one); // zbin - 1 so the "greater than" compare acts as "greater or equal to zbin"
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); // aligned load: coeff_ptr must be 32-byte aligned
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); // copy the upper 128-bit half (table elements 4..7) into both halves, dropping element 0
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1]; // from here on all lanes use thresh[1]
+ if (_mm256_movemask_epi8(cmp_mask) == 0) { // nothing in this group exceeds zbin: emit zeros
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_permute2x128_si256(round, round, 0x11); // still drop element 0 of each table for the loop below
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); // first eight still use the vectors that contain element 0
+ round = _mm256_permute2x128_si256(round, round, 0x11);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) { // 16 coefficients per iteration
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) { // whole group below zbin: store zeros and move on
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); // restore signs
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0); // zero the lanes that did not exceed zbin
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1)); // reduce the two 128-bit halves to one
+ non_zero_count = calculate_non_zero_count(temp_mask0); // helper is defined outside this file section
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { // clear scan positions at or beyond the count derived from the raised threshold
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) { // last scan position that still holds a nonzero qcoeff
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1; // 0 when no nonzero coefficient remains
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) { // first scan position holding a nonzero qcoeff
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { // exactly one nonzero coefficient in the block
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); // rc != 0 picks table entry 0 for position 0 and entry 1 otherwise
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { // lone +/-1 below the stricter threshold: drop it and report an empty block
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..ae31116e9d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+// Returns (a ^ sign) - sign per 64-bit lane: lanes whose sign mask is all ones are negated, lanes whose mask is zero pass through.
+static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
+ const __m128i flipped = _mm_xor_si128(a, sign);  // bitwise NOT where the mask is set
+ return _mm_sub_epi64(flipped, sign);  // subtracting -1 completes the two's-complement negate
+}
+// Per 32-bit lane: *p = low 32 bits of the 64-bit product (x * y) shifted right by `shift`; x is multiplied as unsigned, y as signed. p may alias x.
+static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
+ __m128i *p, const int shift) {
+ __m128i sign = _mm_srai_epi32(*y, 31);  // all ones in lanes where y < 0
+ __m128i sign_lo = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0));  // 64-bit signs of lanes 0 and 2, matching prod_lo
+ __m128i sign_hi = _mm_shuffle_epi32(sign, _MM_SHUFFLE(3, 3, 1, 1));  // 64-bit signs of lanes 1 and 3, matching prod_hi
+ __m128i abs_y = invert_sign_32_sse2(*y, sign);  // presumably |y|; helper is defined outside this file
+ __m128i prod_lo = _mm_mul_epu32(*x, abs_y);  // 64-bit products of lanes 0 and 2
+ __m128i prod_hi = _mm_srli_epi64(*x, 32);  // bring lanes 1 and 3 down into the even slots
+ const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
+ prod_hi = _mm_mul_epu32(prod_hi, mult_hi);  // 64-bit products of lanes 1 and 3
+ prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);  // re-apply the sign of y
+ prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
+
+ prod_lo = _mm_srli_epi64(prod_lo, shift);
+ const __m128i mask = _mm_set_epi32(0, -1, 0, -1);  // keeps the low 32 bits of each 64-bit lane
+ prod_lo = _mm_and_si128(prod_lo, mask);
+ prod_hi = _mm_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm_slli_epi64(prod_hi, 32);  // move the lane 1/3 results back to the odd slots
+ *p = _mm_or_si128(prod_lo, prod_hi);
+}
+// In place, per 32-bit lane: with r = *coeff + *round, *coeff = ((r + ((r * quant) >> 16)) * shift) >> (16 - *log_scale).
+static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
+ const __m128i *quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ const __m128i rounded = _mm_add_epi32(*coeff, *round);
+ __m128i scaled;
+ highbd_mul_shift_sse2(&rounded, quant, &scaled, 16);  // (r * quant) >> 16
+ const __m128i biased = _mm_add_epi32(scaled, rounded);
+ highbd_mul_shift_sse2(&biased, shift, coeff, 16 - *log_scale);  // result overwrites *coeff
+}
+// Folds the iscan values of the 16-bit lanes selected by *cmp_mask0 into the running maximum *mask; sets *is_found when any lane is selected.
+static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i selected = _mm_setzero_si128();
+ const int any_lane_set = _mm_movemask_epi8(*cmp_mask0) != 0;
+ if (any_lane_set) {
+ const __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));  // aligned load of 8 iscan entries
+ selected = _mm_and_si128(*cmp_mask0, iscan0);  // keep iscan only where the compare mask is set
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(selected, *mask);
+}
+// Flags lanes where (qcoeff << AOM_QM_BITS) > threshold and folds their iscan values into *mask through highbd_update_mask1.
+static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ // Scale both input vectors by 2^AOM_QM_BITS before comparing them with the thresholds.
+ const __m128i weighted0 = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ const __m128i weighted1 = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
+
+ const __m128i above0 = _mm_cmpgt_epi32(weighted0, threshold[0]);
+ const __m128i above1 = _mm_cmpgt_epi32(weighted1, threshold[1]);
+
+ // Narrow the eight 32-bit compare results to eight 16-bit lanes so they line up with the int16 iscan entries.
+ __m128i above = _mm_packs_epi32(above0, above1);
+ highbd_update_mask1(&above, iscan_ptr, is_found, mask);
+}
+// Dequantizes four 32-bit lanes: strips the sign of qcoeff, computes (magnitude * dequant) >> log_scale, then re-applies the sign.
+static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
+ const int log_scale) {
+ const __m128i sign_mask = _mm_srai_epi32(qcoeff, 31);  // all ones in negative lanes
+ __m128i magnitude = invert_sign_32_sse2(qcoeff, sign_mask);
+ highbd_mul_shift_sse2(&magnitude, &dequant, &magnitude, log_scale);  // multiply and shift in place
+ return invert_sign_32_sse2(magnitude, sign_mask);
+}
+// SSE2 adaptive quantizer for high-bitdepth coefficients with log_scale = 0: processes n_coeffs coefficients 8 at a time, writes qcoeff_ptr and dqcoeff_ptr, and stores the end-of-block position in *eob_ptr.
+void aom_highbd_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;  // the first 8 coefficients are handled before the loop
+ const int log_scale = 0;
+ int non_zero_count = 0;  // derived from mask0 (lanes above the prescan threshold)
+ int non_zero_count_prescan_add_zero = 0;  // derived from mask1 (lanes above zbin)
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {  // i = 0: DC entry, i = 1: AC entry
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;  // -1 so the strict cmpgt below acts as >=
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);  // {DC, AC, AC, AC}
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);  // {AC, AC, AC, AC}
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);  // sign-extend the low four int16 entries to 32 bits
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+ zbin = _mm_sub_epi32(zbin, one);  // so cmpgt(x, zbin) means x >= original zbin
+
+ // Do DC and first 7 AC (8 coefficients, two vectors of four 32-bit lanes).
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // from here on both threshold vectors hold only the AC value
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // no lane reached zbin: store zeros and skip the arithmetic
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);  // switch the remaining parameters from DC to AC
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);  // lane 0 still uses the DC parameters
+
+ round = _mm_unpackhi_epi64(round, round);  // switch to AC before the second vector
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // switch dequant from DC to AC
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // whole group below zbin: emit zeros
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);  // zero the lanes that did not reach zbin
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);  // NOTE(review): helper is external; presumably max iscan + 1 - confirm
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {  // clear scan positions at or past non_zero_count
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {  // last nonzero coefficient in scan order
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {  // first nonzero coefficient in scan order
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // exactly one nonzero coefficient
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // drop the lone +/-1 coefficient and report an empty block
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+// SSE2 adaptive quantizer for high-bitdepth coefficients with log_scale = 1: processes n_coeffs coefficients 8 at a time, writes qcoeff_ptr and dqcoeff_ptr, and stores the end-of-block position in *eob_ptr.
+void aom_highbd_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;  // the first 8 coefficients are handled before the loop
+ const int log_scale = 1;
+ int non_zero_count = 0;  // derived from mask0 (lanes above the prescan threshold)
+ int non_zero_count_prescan_add_zero = 0;  // derived from mask1 (lanes above zbin)
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {  // i = 0: DC entry, i = 1: AC entry
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;  // -1 so the strict cmpgt below acts as >=
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);  // {DC, AC, AC, AC}
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);  // {AC, AC, AC, AC}
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);  // sign-extend the low four int16 entries to 32 bits
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);  // adds log_scale; for log_scale = 1 this equals the rounding offset (1 << log_scale) >> 1
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);  // so cmpgt(x, zbin) means x >= scaled zbin
+
+ // Do DC and first 7 AC (8 coefficients, two vectors of four 32-bit lanes).
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // from here on both threshold vectors hold only the AC value
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // no lane reached zbin: store zeros and skip the arithmetic
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);  // switch the remaining parameters from DC to AC
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);  // lane 0 still uses the DC parameters
+
+ round = _mm_unpackhi_epi64(round, round);  // switch to AC before the second vector
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // switch dequant from DC to AC
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // whole group below zbin: emit zeros
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);  // zero the lanes that did not reach zbin
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);  // NOTE(review): helper is external; presumably max iscan + 1 - confirm
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {  // clear scan positions at or past non_zero_count
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {  // last nonzero coefficient in scan order
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {  // first nonzero coefficient in scan order
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // exactly one nonzero coefficient
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // drop the lone +/-1 coefficient and report an empty block
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+// SSE2 adaptive quantizer for high-bitdepth coefficients with log_scale = 2: processes n_coeffs coefficients 8 at a time, writes qcoeff_ptr and dqcoeff_ptr, and stores the end-of-block position in *eob_ptr.
+void aom_highbd_quantize_b_64x64_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;  // the first 8 coefficients are handled before the loop
+ const int log_scale = 2;
+ int non_zero_count = 0;  // derived from mask0 (lanes above the prescan threshold)
+ int non_zero_count_prescan_add_zero = 0;  // derived from mask1 (lanes above zbin)
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {  // i = 0: DC entry, i = 1: AC entry
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;  // -1 so the strict cmpgt below acts as >=
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);  // {DC, AC, AC, AC}
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);  // {AC, AC, AC, AC}
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);  // sign-extend the low four int16 entries to 32 bits
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);  // adds log_scale; for log_scale = 2 this equals the rounding offset (1 << log_scale) >> 1
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);  // so cmpgt(x, zbin) means x >= scaled zbin
+
+ // Do DC and first 7 AC (8 coefficients, two vectors of four 32-bit lanes).
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];  // from here on both threshold vectors hold only the AC value
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // no lane reached zbin: store zeros and skip the arithmetic
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);  // switch the remaining parameters from DC to AC
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);  // lane 0 still uses the DC parameters
+
+ round = _mm_unpackhi_epi64(round, round);  // switch to AC before the second vector
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);  // switch dequant from DC to AC
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {  // whole group below zbin: emit zeros
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);  // zero the lanes that did not reach zbin
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);  // NOTE(review): helper is external; presumably max iscan + 1 - confirm
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {  // clear scan positions at or past non_zero_count
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {  // last nonzero coefficient in scan order
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {  // first nonzero coefficient in scan order
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // exactly one nonzero coefficient
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;  // drop the lone +/-1 coefficient and report an empty block
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..11e45778c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// -----------------------------------------------------------------------------
+// Single-reference convolve (vertical and horizontal)
+
+static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,  // byte-shuffle control, same pattern in both 16-byte halves:
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,  // selects overlapping 16-bit word pairs (0,1) (1,2) (2,3) (3,4)
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9,  // same idea, two words further on:
+ 8, 9, 10, 11, 10, 11, 12, 13,  // word pairs (2,3) (3,4) (4,5) (5,6)
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,  // SSSE3 horizontal variant, declared here; the AVX2 x path below calls it for 12-tap filters
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,  // SSSE3 vertical variant, declared here; the AVX2 y path below calls it for 12-tap filters
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd);
+
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,  // Vertical-only convolution of 16-bit pixels (AVX2 path).
+ uint16_t *dst, int dst_stride, int w, int h,  // writes a w x h block to dst, two rows per inner iteration
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {  // bd picks the clamp maximum (see clip_pixel)
+ if (filter_params_y->taps == 12) {  // 12-tap kernels are forwarded to the SSSE3 implementation
+ av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;  // number of rows above the current row that the kernel reads
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;  // topmost row touched by the filter
+
+ __m256i s[8], coeffs_y[4];  // s[0..3]: row pairs for the low 4 columns, s[4..7]: for the high 4 columns
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);  // half of the divisor, for round-to-nearest
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // max pixel value: 10-bit, 12-bit, otherwise 8-bit
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);  // project helper; presumably packs the selected kernel taps - confirm in convolve_avx2.h
+
+ for (j = 0; j < w; j += 8) {  // one strip of up to 8 columns per iteration
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;  // most recently loaded row, carried into the next row pair
+ __m256i s01 = _mm256_permute2x128_si256(  // row 0 in the low half, row 1 in the high half
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(  // rows 1 | 2
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(  // rows 2 | 3
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(  // rows 3 | 4
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(  // rows 4 | 5
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(  // rows 5 | 6
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);  // interleave vertically adjacent pixels, low 4 columns
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);  // same interleave for the high 4 columns
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {  // low 128-bit half yields row i, high half yields row i + 1
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(  // carried row | newly loaded row 7
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(  // row 8 becomes the carried row for the next iteration
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(  // row 7 | row 8
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);  // 32-bit sums for the low 4 columns of both rows
+
+ __m256i res_a_round = _mm256_sra_epi32(  // add rounding constant, then arithmetic shift by FILTER_BITS
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {  // more than 4 columns remain: compute the high 4 columns and store 8 pixels
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);  // saturating narrow to signed 16-bit
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);  // clamp to [0, clip_pixel]
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {  // 4-wide block: 64-bit store (4 pixels) per row
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {  // remaining case: 32-bit store (2 pixels) per row
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32(&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];  // slide the row-pair window down by two rows
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,  // Horizontal-only convolution of 16-bit pixels (AVX2 path).
+ uint16_t *dst, int dst_stride, int w, int h,  // writes a w x h block to dst, two rows per inner iteration
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {  // conv_params supplies round_0 / round_1; bd picks the clamp maximum
+ if (filter_params_x->taps == 12) {  // 12-tap kernels are forwarded to the SSSE3 implementation
+ av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params,
+ bd);
+ return;
+ }
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;  // number of pixels left of the current pixel that the kernel reads
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[4], coeffs_x[4];
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));  // first rounding stage: shift by round_0
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;  // second rounding stage: the rest of FILTER_BITS
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // max pixel value: 10-bit, 12-bit, otherwise 8-bit
+ const __m256i zero = _mm256_setzero_si256();
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);  // project helper; presumably packs the selected kernel taps - confirm in convolve_avx2.h
+
+ for (j = 0; j < w; j += 8) {  // one strip of up to 8 columns per iteration
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);  // 16 pixels of row i
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);  // 16 pixels of row i + 1
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);  // first 8 pixels of row i | row i + 1
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);  // next 8 pixels of row i | row i + 1
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);  // windows starting at pixel 0, 2, 4, 6 (byte offsets 0/4/8/12)
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);  // windows starting at pixel 1, 3, 5, 7 (byte offsets 2/6/10/14)
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),  // second rounding stage
+ round_shift_bits);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);  // saturating narrow to signed 16-bit
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);  // interleave even/odd results back into pixel order
+ res = _mm256_min_epi16(res, clip_pixel);  // clamp to [0, clip_pixel]
+ res = _mm256_max_epi16(res, zero);
+
+ if (w - j > 4) {  // more than 4 columns remain: store 8 pixels per row
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else if (w == 4) {  // 4-wide block: 64-bit store (4 pixels) per row
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else {  // remaining case: 32-bit store (2 pixels) per row
+ xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ }
+ }
+ }
+}
+
+#define CONV8_ROUNDING_BITS (7)  // right-shift applied to filter sums below, after adding 1 << (CONV8_ROUNDING_BITS - 1)
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,  // byte-shuffle control, identical in both 16-byte halves:
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,  // overlapping 16-bit word pairs (0,1) (1,2) (2,3) (3,4)
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,  // word pairs (2,3) (3,4) (4,5) (5,6)
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,  // word pairs (3,4) (4,5) (5,6) (6,7); used by the 2-tap path
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };  // 32-bit element indices for _mm256_permutevar8x32_epi32: elements 2..5 into both halves
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {  // Rearranges 16 loaded pixels (*s) into four vectors of adjacent-pixel pairs.
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);  // 32-bit elements 2..5 of *s (pixels 4..11) copied into both halves
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by 8x2 and 16x1 block
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,  // Packs two 16-pixel loads into x[0..7]:
+ __m256i *x /*x[8]*/) {  // x[0..3] and x[4..7] are each one 4-vector input for filter_8x1_pixels
+ __m256i pp[8];
+ pack_pixels(s0, pp);  // pp[0..3] from s0
+ pack_pixels(s1, &pp[4]);  // pp[4..7] from s1
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);  // 0x20: low half of each operand
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];  // the second group reuses the middle vectors of the first
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);  // 0x31: high half of each operand
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {  // Single-row variant: loads 16 pixels from src and fills x[0..3].
+ __m256i pp[8];  // only pp[0..3] are written and read here
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);  // 0x30: low half of first operand, high half of second
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,  // Loads 16 pixels from each of two consecutive rows
+ __m256i *x) {  // and packs them into x[0..7] via pack_16_pixels.
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);  // row at src
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));  // row at src + stride
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {  // Packs one row for a 16-wide block into x[0..7].
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);  // pixels 0..15
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));  // pixels 8..23 (overlapping load, 8 pixels further on)
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {  // Loads 8 int16 taps and broadcasts each tap pair across a vector.
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);  // same 8 taps in both 128-bit halves
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);  // shuffle control: bytes 0..3  -> taps 0,1
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);  // bytes 4..7  -> taps 2,3
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);  // bytes 8..11 -> taps 4,5
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);  // bytes 12..15 -> taps 6,7
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
+static INLINE void pack_filters_4tap(const int16_t *filter,  // Loads 8 int16 taps but keeps only the middle four (taps 2..5).
+ __m256i *f /*f[4]*/) {  // only f[0] and f[1] are written
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(h);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ f[0] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,  // 8-tap multiply-accumulate: sums the four pairwise products,
+ const __m256i *fil /*fil[4]*/,  // rounds and shifts right by CONV8_ROUNDING_BITS.
+ __m256i *y) {  // *y receives eight 32-bit results
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);  // outer tap pairs
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);  // middle tap pairs
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);  // the middle products are added smaller-first, then larger
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,  // Narrows eight 32-bit results to 16 bits, clamps to *mask,
+ uint16_t *dst) {  // and stores 8 pixels at dst.
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);  // unsigned saturation: negative sums become 0
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));  // upper clamp (callers pass (1 << bd) - 1)
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,  // Narrows and clamps two result vectors, then stores 8 pixels
+ const __m256i *mask, uint16_t *dst,  // to dst and 8 pixels to dst + pitch.
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);  // per 128-bit half: 4 values of y0 followed by 4 of y1, unsigned-saturated
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));  // low half -> first row
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));  // high half -> second row
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,  // Narrows and clamps two result vectors and stores
+ const __m256i *mask, uint16_t *dst) {  // all 16 pixels contiguously at dst.
+ __m256i a = _mm256_packus_epi32(*y0, *y1);  // unsigned-saturating narrow to 16 bits
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void aom_highbd_filter_block1d8_h8_avx2(  // 8-tap horizontal filter over an 8-pixel-wide block of `height` rows.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,  // pitches are in uint16_t elements
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {  // filter: 8 int16 taps; bd: bit depth for the clamp
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value for bit depth bd
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;  // begin reading 3 pixels left of the output position
+ while (height > 1) {  // two rows per pass; pre-tested so height < 2 never runs the pair path or wraps the unsigned counter
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ }
+
+ if (height > 0) {  // odd leftover row
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h8_avx2(  // 8-tap horizontal filter over a 16-pixel-wide block, one row per pass.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value for bit depth bd
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;  // begin reading 3 pixels left of the output position
+ do {  // NOTE(review): body runs before the test, so this presumably relies on height >= 1 - confirm against callers
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);  // signal[0..3] -> res0
+ filter_8x1_pixels(&signal[4], ff, &res1);  // signal[4..7] -> res1
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d4_h4_avx2(  // 4-tap horizontal filter over a 4-pixel-wide block of `height` rows.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,  // pitches are in uint16_t elements
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {  // filter: 8 int16 taps, of which taps 2..5 are used
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // max pixel value: 10-bit, 12-bit, otherwise 8-bit
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,  // overlapping 16-bit word pairs (0,1) (1,2) (2,3) (3,4)
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;  // the + 2 in the loads below makes the effective start 1 pixel left of the output
+ for (i = 0; i + 1 < height; i += 2) {  // row pairs; written without `height - 2`, which wraps for height < 2 (uint32_t)
+ __m256i row0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ __m256i row1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));
+
+ s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);  // row i in the low half, row i + 1 in the high half
+ s[1] = _mm256_alignr_epi8(s[0], s[0], 4);  // same data rotated by 4 bytes (2 pixels) within each half
+
+ s[0] = _mm256_shuffle_epi8(s[0], mask);
+ s[1] = _mm256_shuffle_epi8(s[1], mask);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);  // saturating narrow to signed 16-bit
+ res = _mm256_min_epi16(res, clip_pixel);  // clamp to [0, clip_pixel]
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],  // 4 pixels of row i
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],  // 4 pixels of row i + 1
+ _mm256_extracti128_si256(res, 1));
+ }
+ if (height % 2 != 0) {  // odd leftover row (also the whole job when height == 1)
+ i = height - 1;
+ const __m256i row0_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ const __m256i row0_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);  // same row twice, high half offset by 4 pixels
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],  // only the low half (4 pixels) is stored
+ _mm256_castsi256_si128(res));
+ }
+}
+
+static void aom_highbd_filter_block1d8_h4_avx2(  // 4-tap horizontal filter over an 8-pixel-wide block of `height` rows.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,  // pitches are in uint16_t elements
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {  // filter: 8 int16 taps, of which taps 2..5 are used
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i = 0;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // max pixel value: 10-bit, 12-bit, otherwise 8-bit
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11,  // interleaves words 0..3 with words 4..7 in each half,
+ 4, 5, 12, 13, 6, 7, 14, 15,  // i.e. even-pixel results with odd-pixel results
+ 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;  // the + 2 in the loads below makes the effective start 1 pixel left of the output
+
+ /* Horizontal filter */
+
+ for (i = 0; i + 1 < height; i += 2) {  // row pairs; written without `height - 2`, which wraps for height < 2 (uint32_t)
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);  // first 8 loaded pixels of row i | row i + 1
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);  // next 8 loaded pixels of row i | row i + 1
+
+ // even pixels
+ s[0] = r0;
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+
+ __m256i res_even = convolve_4tap(s, ff);
+ res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
+ CONV8_ROUNDING_BITS);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+
+ __m256i res_odd = convolve_4tap(s, ff);
+ res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
+ CONV8_ROUNDING_BITS);
+
+ __m256i res = _mm256_packs_epi32(res_even, res_odd);  // per half: 4 even results then 4 odd results
+ res = _mm256_shuffle_epi8(res, mask);  // restore pixel order
+
+ res = _mm256_min_epi16(res, clip_pixel);  // clamp to [0, clip_pixel]
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res, 1));
+ }
+
+ if (height % 2 != 0) {  // odd leftover row (also the whole job when height == 1)
+ i = height - 1;
+ const __m256i row0_0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);  // only the low 128 bits are used below
+ const __m256i row0_1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);  // only the low 128 bits are used below
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);  // same row twice, high half offset by 4 pixels
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],  // pixels 0..3
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],  // pixels 4..7
+ _mm256_extracti128_si256(res, 1));
+ }
+}
+
+static void aom_highbd_filter_block1d16_h4_avx2(  // 16-wide 4-tap horizontal filter, done as two independent 8-wide passes.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,  // columns 0..7
+ height, filter, bd);
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,  // columns 8..15
+ dst_pitch, height, filter, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {  // Loads 8 int16 taps and broadcasts taps 3 and 4 into every 32-bit element of f[0].
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);  // same 8 taps in both 128-bit halves
+ const __m256i p = _mm256_set1_epi32(0x09080706);  // shuffle control: bytes 6..9 = taps 3,4
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
+// the difference is s0/s1 specifies first and second rows or,
+// first 16 samples and 8-sample shifted 16 samples
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,  // Builds adjacent-pixel pairs for the 2-tap filter
+ __m256i *sig) {  // from two 16-pixel loads; writes sig[0] and sig[1].
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);  // word pairs (3,4)..(6,7) of each half
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);  // 32-bit elements 2..5 (pixels 4..11) into both halves
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);  // pairs (7,8)..(10,11) of the original load
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);  // low halves: s0 part | s1 part
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,  // Loads 16 pixels from each of two consecutive rows
+ const ptrdiff_t pitch, __m256i *sig) {  // and packs them for the 2-tap filter.
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);  // row at src
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));  // row at src + pitch
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,  // Packs one row of a 16-wide block for the 2-tap filter.
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);  // pixels 0..15
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));  // pixels 8..23 (overlapping load)
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,  // Single-row 2-tap packing: loads 16 pixels and
+ __m256i *sig /*sig[2]*/) {  // writes only sig[0].
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);  // word pairs (3,4)..(6,7)
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);  // bring pixels 4..11 into both halves
+ r0 = _mm256_shuffle_epi8(r0, sf2);  // pairs (7,8)..(10,11) of the original load
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);  // low half of x0 | low half of r0
+}
+
+// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,  // 2-tap multiply-accumulate on sig[0] and sig[1],
+ __m256i *y0, __m256i *y1) {  // rounded and shifted by CONV8_ROUNDING_BITS into *y0 / *y1.
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);  // each 32-bit element = pixel pair dotted with the tap pair
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,  // Single-vector form of filter_16_2t_pixels:
+ __m256i *y0) {  // filters sig[0] only.
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void aom_highbd_filter_block1d8_h2_avx2(  // 2-tap horizontal filter over an 8-pixel-wide block of `height` rows.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,  // pitches are in uint16_t elements
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {  // filter: 8 int16 taps, of which taps 3 and 4 are used
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value for bit depth bd
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;  // the packing helpers select pixels starting at offset 3, i.e. the output position
+ while (height > 1) {  // two rows per pass; pre-tested so height < 2 never runs the pair path or wraps the unsigned counter
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ }
+
+ if (height > 0) {  // odd leftover row
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h2_avx2(  // 2-tap horizontal filter over a 16-pixel-wide block, one row per pass.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value for bit depth bd
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;  // the packing helper selects pixels starting at offset 3, i.e. the output position
+ do {  // NOTE(review): body runs before the test, so this presumably relies on height >= 1 - confirm against callers
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {  // Primes the vertical-filter window with rows 0..6 (8 pixels each).
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);  // s_k now holds row k (low half) and row k + 1 (high half)
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);  // sig[0..2]: interleaved row pairs for the low 4 columns
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);  // sig[4..6]: same for the high 4 columns
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;  // row 6 is kept for pack_8x9_pixels to pair with row 7
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,  // Loads rows 7 and 8 and completes the window:
+ __m256i *sig) {  // fills sig[3] / sig[7] and refreshes the carried row in sig[8].
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);  // carried row | row 7
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);  // row 7 | row 8
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;  // row 8 becomes the carried row for the next call
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,  // Runs the 8-tap filter on both halves of the window:
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);  // sig[0..3] -> *y0
+ filter_8x1_pixels(&sig[4], f, y1);  // sig[4..7] -> *y1
+}
+
+static INLINE void update_pixels(__m256i *sig) {  // Slides the window by one slot: sig[0..2] <- sig[1..3], sig[4..6] <- sig[5..7].
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void aom_highbd_filter_block1d8_v8_avx2(  // 8-tap vertical filter over an 8-pixel-wide block, two output rows per pass.
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;  // signal[0..7]: filter window, signal[8]: carried row
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value for bit depth bd
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);  // rows 0..6
+
+ do {  // NOTE(review): height is unsigned and drops by 2, so an odd or zero height would wrap - presumably callers pass even, nonzero heights; confirm
+ pack_8x9_pixels(src_ptr, src_pitch, signal);  // rows 7 and 8 relative to the current src_ptr
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {  // Primes the 16-wide vertical-filter window with rows 0..6.
+ __m256i u0, u1, u2, u3;
+ // load 0-6 rows
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);  // sig[0..3] / sig[4..7]: window for the low 8 columns
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);  // sig[8..11] / sig[12..15]: window for the high 8 columns
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);  // rows 2, 3 low
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);  // rows 2, 3 high
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);  // rows 3, 4 low
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);  // rows 3, 4 high
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);  // rows 4, 5 low
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);  // rows 4, 5 high
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);  // rows 5, 6 low
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);  // rows 5, 6 high
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;  // row 6 is kept for pack_16x9_pixels to pair with row 7
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,  // Loads rows 7 and 8 and completes the 16-wide window:
+ __m256i *sig) {  // fills sig[3], sig[7], sig[11], sig[15] and refreshes sig[16].
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);  // carried row, row 7: low halves
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);  // carried row, row 7: high halves
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);  // rows 7, 8: low halves
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);  // rows 7, 8: high halves
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;  // row 8 becomes the carried row for the next call
+}
+// Filters four 4-register groups of sig and packs them into two 16-bit rows.
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];  // one 32-bit result register per group of four pixels
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);  // group i = sig[4i .. 4i+3]
+ }
+
+ {
+ // unsigned-saturating pack of 32-bit results to 16 bits, per 128-bit lane
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);  // low lanes of both
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);  // high lanes of both
+ }
+}
+// Clamps two 16-pixel rows to *mask (signed 16-bit min) and stores them.
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);  // upper clamp only
+ _mm256_storeu_si256((__m256i *)dst, p);  // first row, unaligned store
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);  // second row
+}
+// Applies update_pixels() to both halves of the 16-register signal window.
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);  // NOTE(review): update_pixels is defined outside this view
+ update_pixels(&sig[8]);  // presumably slides each group by one slot; confirm
+}
+// Vertical filter over a 16-pixel-wide column, producing two rows per pass.
+static void aom_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;  // signal[16] carries the last loaded row
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value
+
+ __m256i ff[4];
+ pack_filters(filter, ff);  // defined outside this view
+
+ pack_16x9_init(src_ptr, src_pitch, signal);  // rows 0-6
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);  // rows 7-8
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;  // advance two rows
+ dst_ptr += dst_pitch << 1;
+ height -= 2;  // unsigned: an odd or zero height would wrap around
+ } while (height > 0);
+}
+// 4-tap vertical filter over a 4-pixel-wide column, two output rows per pass.
+static void aom_highbd_filter_block1d4_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ uint32_t i;
+ __m256i s[2], ff[2];
+
+ pack_filters_4tap(filter, ff);  // defined outside this view
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter: only rows 2 and later (relative to src_ptr) are read */
+ {
+ __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);  // rows 2/3 | rows 3/4 interleaved
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);  // rows 4/5 | rows 5/6 interleaved
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);  // clamp to
+ res_16bit = _mm256_max_epi32(res_16bit, zero);  // [0, clip_pixel] in 32 bits
+ res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));  // low lane -> output row i
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));  // high lane -> output row i + 1
+
+ s[0] = s[1];  // slide the window down two rows
+ s4 = s6;  // last loaded row seeds the next iteration
+ }
+ }
+}
+// 4-tap vertical filter over an 8-pixel-wide column, two output rows per pass.
+static void aom_highbd_filter_block1d8_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i s[4], ff[2];  // s[0..1]: pixels 0-3, s[2..3]: pixels 4-7
+ uint32_t i;
+ pack_filters_4tap(filter, ff);  // defined outside this view
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter: only rows 2 and later (relative to src_ptr) are read */
+ {
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);  // pixels 0-3: rows 2/3 | rows 3/4
+ s[2] = _mm256_unpackhi_epi16(s23, s34);  // pixels 4-7: rows 2/3 | rows 3/4
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);  // pixels 0-3: rows 4/5 | rows 5/6
+ s[3] = _mm256_unpackhi_epi16(s45, s56);  // pixels 4-7: rows 4/5 | rows 5/6
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ const __m256i res_b = convolve_4tap(s + 2, ff);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);  // clamp to
+ res_16bit = _mm256_max_epi16(res_16bit, zero);  // [0, clip_pixel] in 16 bits
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));  // low lane -> output row i
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));  // high lane -> output row i + 1
+
+ s[0] = s[1];  // slide the window down two rows
+ s[2] = s[3];
+ s4 = s6;  // last loaded row seeds the next iteration
+ }
+ }
+}
+// 16-wide 4-tap vertical filter, done as two independent 8-wide passes.
+static void aom_highbd_filter_block1d16_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);  // columns 0-7
+
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
+ dst_pitch, height, filter, bd);  // columns 8-15
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+// Loads the first 16-pixel row into sig[2], the "previous row" slot.
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+// Interleaves the saved row (sig[2]) with the next row into sig[0] and sig[1].
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the row one pitch below src
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);  // low four pixels of each lane
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);  // high four pixels of each lane
+ sig[2] = u;  // the new row is the saved row for the next call
+}
+// Thin wrapper that forwards its arguments to filter_16_2t_pixels().
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);  // defined outside this view
+}
+// 2-tap vertical filter over a 16-pixel-wide column, one output row per pass.
+static void aom_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;  // signal[2] holds the previously loaded row
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);  // defined outside this view
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);  // defined outside this view
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;  // unsigned: a zero height on entry would wrap around
+ } while (height > 0);
+}
+// Broadcasts the 16-bit pair filter[3], filter[4] to every 32-bit lane of f[0].
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ const __m128i p = _mm_set1_epi32(0x09080706);  // byte indices 6, 7, 8, 9
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+// Loads the first 8-pixel row into sig[2], the "previous row" slot.
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+// Interleaves the saved row (sig[2]) with the next row into sig[0] and sig[1].
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the row one pitch below src
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);  // pixels 0-3, (previous, next) pairs
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);  // pixels 4-7, (previous, next) pairs
+ sig[2] = u;  // the new row is the saved row for the next call
+}
+// Multiply-adds two interleaved registers with *f, then rounds and shifts.
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);  // one 32-bit sum per 16-bit pair
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);  // add half of the divisor before shifting
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+// Packs eight 32-bit results to 16 bits, clamps to *mask and stores one row.
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);  // unsigned saturation to 16 bits
+ res = _mm_min_epi16(res, *mask);  // signed 16-bit min against the mask
+ _mm_storeu_si128((__m128i *)dst, res);  // unaligned 8-pixel store
+}
+// 2-tap vertical filter over an 8-pixel-wide column, one output row per pass.
+static void aom_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;  // signal[2] holds the previously loaded row
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // largest pixel value
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;  // unsigned: a zero height on entry would wrap around
+ } while (height > 0);
+}
+// The 4-wide h8/h2/v8/v2 avx2 names below are aliases of the sse2 functions.
+void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
+#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
+#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
+#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+// NOTE(review): HIGH_FUN_CONV_1D is defined outside this view; vert gets src - 3 rows.
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
new file mode 100644
index 0000000000..a2bb283222
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+// 4-tap vertical filter (coefficients 2-5) on a 4-wide column, 2 rows per pass.
+void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // largest pixel value
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the shift by 7
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // two-row strides: each loop iteration consumes and produces two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);  // rows 2/3 interleaved
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);  // rows 3/4 interleaved
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);  // rows 4/5 interleaved
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);  // rows 5/6 interleaved
+
+ // multiply each interleaved row pair by its coefficient pair and add
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);  // first output row
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);  // second output row
+
+ // add 64, then arithmetic-shift each 32-bit value right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+
+ // saturate each 32-bit value to 16 bits; resReg23_45 holds the first
+ // output row and resReg34_56 the second (the upper four 16-bit
+ // elements come from the zero operand)
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());  // clamp to
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);  // [0, max]
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45));  // 4 pixels, row 0
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));  // row 1
+
+ dst_ptr += dst_stride;
+
+ // carry the two newest row pairs and the last row into the next pass
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+// 4-tap horizontal filter (coefficients 2-5) producing 4 pixels per row.
+void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;  // loads below use src_ptr + 2, i.e. one pixel left of the input
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the shift by 7
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // largest pixel value
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);  // shifted by 1 pixel
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);  // shifted by 2 pixels
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);  // shifted by 3 pixels
+ __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1);  // pairs (k, k+1)
+ __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1);  // pairs (k+2, k+3)
+
+ ss_23 = _mm_madd_epi16(ss_23, secondFilters);
+ ss_45 = _mm_madd_epi16(ss_45, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45);  // four 32-bit sums
+
+ // add 64, then arithmetic-shift each 32-bit value right by 7 bits
+ srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64);
+ srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);  // clamp to [0, max]
+
+ src_ptr += src_pitch;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1);  // low 4 pixels only
+
+ dst_ptr += dst_pitch;
+ }
+}
+// 4-tap vertical filter (coefficients 2-5) on an 8-wide column, 2 rows per pass.
+void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // largest pixel value
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the shift by 7
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // two-row strides: each loop iteration consumes and produces two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);  // pixels 0-3, rows 2/3
+ srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);  // pixels 4-7, rows 2/3
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);  // pixels 0-3, rows 3/4
+ srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);  // pixels 4-7, rows 3/4
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);  // rows 4/5
+ srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);  // rows 5/6
+ srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);
+
+ // pixels 0-3: multiply each row pair by its coefficient pair and add
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+ // pixels 4-7: multiply each row pair by its coefficient pair and add
+
+ resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);
+
+ resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);
+
+ // add 64, then arithmetic-shift each 32-bit value right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
+ resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+ resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
+ resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);
+
+ // saturate each 32-bit value to 16 bits; resReg23_45 holds the first
+ // output row (pixels 0-3 then 4-7) and resReg34_56 holds the second
+ // output row
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());  // clamp to
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);  // [0, max]
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)dst_ptr, (resReg23_45));  // aligned store:
+ _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));  // 16B rows
+
+ dst_ptr += dst_stride;
+
+ // carry the two newest row pairs and the last row into the next pass
+ srcReg23_lo = srcReg45_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_lo = srcReg56_lo;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+// 4-tap horizontal filter (coefficients 2-5) producing 8 pixels per row.
+void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;  // loads below use src_ptr + 2, i.e. one pixel left of the input
+ addFilterReg64 = _mm_set1_epi32(64);  // rounding term for the shift by 7
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);  // 8 coefficients
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // largest pixel value
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));  // 4 px later
+
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);  // shifted by 2 pixels
+ __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
+ __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);  // 8 consecutive pixels
+
+ __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);  // sums for outputs 0, 2, 4, 6
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);  // shifted by 1 pixel
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);  // shifted by 3 pixels
+ __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
+ __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
+ __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
+ __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);
+
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);  // sums for outputs 1, 3, 5, 7
+
+ __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+
+ // add 64, then arithmetic-shift each 32-bit value right by 7 bits
+ res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
+ res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
+ res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
+ res_hi_1 = _mm_srai_epi32(res_hi_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);  // outputs 0-7
+
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);  // clamp to [0, max]
+
+ src_ptr += src_pitch;
+
+ _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);  // aligned 16B store
+
+ dst_ptr += dst_pitch;
+ }
+}
+// 16-wide 4-tap vertical filter, done as two independent 8-wide passes.
+void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);  // columns 0-7
+ aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);  // columns 8-15
+}
+// 16-wide 4-tap horizontal filter, done as two independent 8-wide passes.
+void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);  // columns 0-7
+ aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);  // columns 8-15
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 0000000000..31c3c31b3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ if (filter_params_y->taps == 12) {
+ __m128i s[24], coeffs_y[6];
+
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+ __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
+ __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[4] = _mm_unpacklo_epi16(s8, s9);
+
+ s[6] = _mm_unpackhi_epi16(s0, s1);
+ s[7] = _mm_unpackhi_epi16(s2, s3);
+ s[8] = _mm_unpackhi_epi16(s4, s5);
+ s[9] = _mm_unpackhi_epi16(s6, s7);
+ s[10] = _mm_unpackhi_epi16(s8, s9);
+
+ s[12] = _mm_unpacklo_epi16(s1, s2);
+ s[13] = _mm_unpacklo_epi16(s3, s4);
+ s[14] = _mm_unpacklo_epi16(s5, s6);
+ s[15] = _mm_unpacklo_epi16(s7, s8);
+ s[16] = _mm_unpacklo_epi16(s9, s10);
+
+ s[18] = _mm_unpackhi_epi16(s1, s2);
+ s[19] = _mm_unpackhi_epi16(s3, s4);
+ s[20] = _mm_unpackhi_epi16(s5, s6);
+ s[21] = _mm_unpackhi_epi16(s7, s8);
+ s[22] = _mm_unpackhi_epi16(s9, s10);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
+ __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));
+
+ s[5] = _mm_unpacklo_epi16(s10, s11);
+ s[11] = _mm_unpackhi_epi16(s10, s11);
+
+ s[17] = _mm_unpacklo_epi16(s11, s12);
+ s[23] = _mm_unpackhi_epi16(s11, s12);
+
+ const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+
+ s[12] = s[13];
+ s[13] = s[14];
+ s[14] = s[15];
+ s[15] = s[16];
+ s[16] = s[17];
+
+ s[18] = s[19];
+ s[19] = s[20];
+ s[20] = s[21];
+ s[21] = s[22];
+ s[22] = s[23];
+
+ s10 = s12;
+ }
+ }
+ } else {
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+ }
+}
+
+// High-bitdepth horizontal-only subpel convolution (SSSE3).
+//
+// Filters every one of the h source rows along x and writes w x h clamped
+// 16-bit pixels to dst. Columns are handled in strips of 8 (outer j loop) and
+// rows one at a time (inner i loop). There are two paths: a 12-tap path and a
+// default path that uses four coefficient registers (presumably filters of up
+// to 8 taps -- confirm against prepare_coeffs()).
+//
+// Rounding happens in two stages: first by conv_params->round_0, then by
+// FILTER_BITS - round_0. The result is clamped to [0, max], where max is 1023
+// for bd == 10, 4095 for bd == 12 and 255 otherwise.
+//
+// Every row iteration loads 16 (24 in the 12-tap path) consecutive samples
+// starting at src_ptr[i * src_stride + j], independent of how many output
+// columns remain, so the source must be readable that far.
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ // Step back so the filter window is positioned around the output pixel.
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ // First rounding stage: add half, then shift right by round_0.
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ // Second rounding stage covers the remaining FILTER_BITS - round_0 bits.
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ if (filter_params_x->taps == 12) {
+ __m128i s[6], coeffs_x[6];
+
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ // Three consecutive 8-sample loads cover the 12-tap window.
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+ const __m128i row02 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+ // even pixels
+ // s[k] is the row shifted by 2 * k samples (4 * k bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+ s[4] = _mm_alignr_epi8(row02, row01, 0);
+ s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+ __m128i res_even = convolve_12tap(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+
+ // odd pixels
+ // Same windows as above, shifted by one extra sample (2 bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+ s[4] = _mm_alignr_epi8(row02, row01, 2);
+ s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+ __m128i res_odd = convolve_12tap(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ // Narrow to 16 bit with signed saturation, then interleave the even
+ // and odd results.
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ // Store 8 pixels when more than 4 columns remain in this strip, 4
+ // pixels when w == 4, otherwise 2 pixels (presumably w == 2 --
+ // confirm against callers).
+ // NOTE(review): the 2-pixel store writes through an int * cast of a
+ // uint16_t pointer; confirm alignment/aliasing is acceptable.
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ } else {
+ __m128i s[4], coeffs_x[4];
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ // Two consecutive 8-sample loads cover the filter window.
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ // s[k] is the row shifted by 2 * k samples (4 * k bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ // Same windows as above, shifted by one extra sample (2 bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ // Second rounding stage for both halves.
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ // Narrow to 16 bit with signed saturation, then interleave the even
+ // and odd results.
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ // Store 8 pixels when more than 4 columns remain in this strip, 4
+ // pixels when w == 4, otherwise 2 pixels (presumably w == 2 --
+ // confirm against callers).
+ // NOTE(review): the 2-pixel store writes through an int * cast of a
+ // uint16_t pointer; confirm alignment/aliasing is acceptable.
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..91b3d126ca
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
@@ -0,0 +1,259 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+; Rounding constants added before the final shift of each DC average.
+; pw_4 and pw_8 hold eight 16-bit words each.
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+; NOTE: despite the pw_ prefix, pw_16 and pw_32 hold four 32-bit dwords; they
+; are added with paddd in the 16x16 and 32x32 DC predictors.
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+; highbd_dc_predictor_4x4(dst, stride, above, left)
+; Fills a 4x4 block of 16-bit pixels with the rounded average of four samples
+; from above and four from left: (sum + 4) >> 3.
+; Rows are addressed as dstq + row * strideq * 2 (stride scaled by 2 bytes per
+; sample). goffset is a scratch register for GET_GOT (presumably PIC access to
+; the rodata constants -- confirm against the GET_GOT macro).
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ ; Load 4 words from each edge and add them lane-wise.
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ ; Horizontal sum: fold words 2..3 onto 0..1, then word 1 onto word 0.
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ ; Round and divide by 8, then broadcast word 0 to the low four words.
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ ; Rows 0 and 1, advance two rows, rows 2 and 3.
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+; highbd_dc_predictor_8x8(dst, stride, above, left)
+; Fills an 8x8 block of 16-bit pixels with the rounded average of eight
+; samples from above and eight from left: (sum + 8) >> 4.
+; above and left are read with aligned loads (mova) and dst is written with
+; aligned stores, so all three must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ ; above/left are no longer needed after the loads; DEFINE_ARGS renames the
+ ; argument registers (presumably reusing those two as stride3 and one).
+ DEFINE_ARGS dst, stride, stride3, one
+ ; m3 = eight words of 1, used with pmaddwd to add adjacent word pairs.
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ ; Three pair-sum steps (pmaddwd by ones) with packssdw in between reduce the
+ ; eight word sums to a single total in the low dword.
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ ; Round and divide by 16, then broadcast word 0 to all eight words.
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ ; Rows 0..3 (stride3q*2 addresses row 3), advance four rows, rows 4..7.
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+; highbd_dc_predictor_16x16(dst, stride, above, left)
+; Fills a 16x16 block of 16-bit pixels with the rounded average of sixteen
+; samples from above and sixteen from left: (sum + 16) >> 5.
+; All loads and stores are aligned (mova), so above, left and every dst row
+; must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ ; The edge pointers are dead from here on; their registers get new names.
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ ; Loop counter: 4 iterations of 4 rows each.
+ mov lines4d, 4
+ ; Lane-wise sum of the four 8-word vectors.
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ ; Fold the high half onto the low half (still 16-bit lanes).
+ movhlps m2, m0
+ paddw m0, m2
+ ; Widen the four partial sums to dwords and finish the reduction in 32 bit.
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ ; Round and divide by 32 (pw_16 holds dwords), then broadcast the low word.
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ ; Four rows of 32 bytes per iteration; stride3q*2 addresses the fourth row.
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; highbd_dc_predictor_32x32(dst, stride, above, left)
+; Fills a 32x32 block of 16-bit pixels with the rounded average of 32 samples
+; from above and 32 from left: (sum + 32) >> 6.
+; All loads and stores are aligned (mova), so above, left and every dst row
+; must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ ; Lane-wise sum of the eight 8-word vectors (four from each edge).
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ ; The edge pointers are dead from here on; their registers get new names.
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ ; Loop counter: 8 iterations of 4 rows each.
+ mov lines4d, 8
+ ; Fold the high half onto the low half (still 16-bit lanes).
+ movhlps m2, m0
+ paddw m0, m2
+ ; Widen the four partial sums to dwords and finish the reduction in 32 bit.
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ ; Round and divide by 64 (pw_32 holds dwords), then broadcast the low word.
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ ; Four rows of 64 bytes per iteration; stride3q*2 addresses the fourth row.
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; highbd_v_predictor_4x4(dst, stride, above)
+; Copies the four 16-bit samples at above (8 bytes) into each of the four
+; destination rows. Rows are addressed as dstq + row * strideq * 2.
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ ; Rows 0 and 1, advance two rows, rows 2 and 3.
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+; highbd_v_predictor_8x8(dst, stride, above)
+; Copies the eight 16-bit samples at above (16 bytes) into each of the eight
+; destination rows. Uses aligned loads/stores (mova), so above and every dst
+; row must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ ; above is dead after the load; its register is renamed to stride3.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ ; Rows 0..3 (stride3q*2 addresses row 3), advance four rows, rows 4..7.
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+; highbd_v_predictor_16x16(dst, stride, above)
+; Copies the sixteen 16-bit samples at above (32 bytes) into each of the
+; sixteen destination rows. Uses aligned loads/stores (mova), so above and
+; every dst row must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ ; above is dead after the loads; registers are renamed for the store loop.
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ ; Loop counter: 4 iterations of 4 rows each.
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+; highbd_v_predictor_32x32(dst, stride, above)
+; Copies the 32 16-bit samples at above (64 bytes) into each of the 32
+; destination rows. Uses aligned loads/stores (mova), so above and every dst
+; row must be 16-byte aligned.
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ ; above is dead after the loads; registers are renamed for the store loop.
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ ; Loop counter: 8 iterations of 4 rows each.
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
new file mode 100644
index 0000000000..6a2e915ed7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+// H_PRED 4x4: every pixel of output row r is set to left[r].
+// Reads four 16-bit samples from |left|; |above| and |bd| are unused.
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  // Low 64 bits hold left[0..3]; each shuffle replicates one of them across
+  // the low four lanes.
+  const __m128i edge = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i r0 = _mm_shufflelo_epi16(edge, 0x0);
+  const __m128i r1 = _mm_shufflelo_epi16(edge, 0x55);
+  const __m128i r2 = _mm_shufflelo_epi16(edge, 0xaa);
+  const __m128i r3 = _mm_shufflelo_epi16(edge, 0xff);
+  (void)bd;
+  (void)above;
+  _mm_storel_epi64((__m128i *)(dst), r0);
+  _mm_storel_epi64((__m128i *)(dst + stride), r1);
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), r2);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), r3);
+}
+
+// H_PRED 4x8: two stacked 4x4 predictions, the second one using left[4..7]
+// and starting four rows further down.
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  uint16_t *const lower_half = dst + (stride << 2);
+  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+  aom_highbd_h_predictor_4x4_sse2(lower_half, stride, above, left + 4, bd);
+}
+
+// H_PRED 8x4: every pixel of output row r is set to left[r].
+// |left| is read with an aligned 128-bit load and each row is written with an
+// aligned 128-bit store, so both must be 16-byte aligned. |above| and |bd|
+// are unused.
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i edge = _mm_load_si128((const __m128i *)left);
+  // Replicate one left sample across the low four lanes...
+  const __m128i r0 = _mm_shufflelo_epi16(edge, 0x0);
+  const __m128i r1 = _mm_shufflelo_epi16(edge, 0x55);
+  const __m128i r2 = _mm_shufflelo_epi16(edge, 0xaa);
+  const __m128i r3 = _mm_shufflelo_epi16(edge, 0xff);
+  (void)bd;
+  (void)above;
+  // ...then duplicate the low half into the high half and store the row.
+  _mm_store_si128((__m128i *)(dst), _mm_unpacklo_epi64(r0, r0));
+  _mm_store_si128((__m128i *)(dst + stride), _mm_unpacklo_epi64(r1, r1));
+  _mm_store_si128((__m128i *)(dst + 2 * stride), _mm_unpacklo_epi64(r2, r2));
+  _mm_store_si128((__m128i *)(dst + 3 * stride), _mm_unpacklo_epi64(r3, r3));
+}
+
+// H_PRED 8x8: every pixel of output row r is set to left[r].
+// |left| is read with an aligned 128-bit load and each row is written with an
+// aligned 128-bit store, so both must be 16-byte aligned. |above| and |bd|
+// are unused.
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i edge = _mm_load_si128((const __m128i *)left);
+  // left[0..3] replicated within the low 64 bits.
+  const __m128i lo0 = _mm_shufflelo_epi16(edge, 0x0);
+  const __m128i lo1 = _mm_shufflelo_epi16(edge, 0x55);
+  const __m128i lo2 = _mm_shufflelo_epi16(edge, 0xaa);
+  const __m128i lo3 = _mm_shufflelo_epi16(edge, 0xff);
+  // left[4..7] replicated within the high 64 bits.
+  const __m128i hi0 = _mm_shufflehi_epi16(edge, 0x0);
+  const __m128i hi1 = _mm_shufflehi_epi16(edge, 0x55);
+  const __m128i hi2 = _mm_shufflehi_epi16(edge, 0xaa);
+  const __m128i hi3 = _mm_shufflehi_epi16(edge, 0xff);
+  uint16_t *out = dst;
+  (void)bd;
+  (void)above;
+
+  // Rows 0..3: duplicate the low half into both halves.
+  _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(lo0, lo0));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(lo1, lo1));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(lo2, lo2));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpacklo_epi64(lo3, lo3));
+  out += stride;
+
+  // Rows 4..7: duplicate the high half into both halves.
+  _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(hi0, hi0));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(hi1, hi1));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(hi2, hi2));
+  out += stride;
+  _mm_store_si128((__m128i *)out, _mm_unpackhi_epi64(hi3, hi3));
+}
+
+// H_PRED 8x16: two stacked 8x8 predictions, the second one using left[8..15]
+// and starting eight rows further down.
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  uint16_t *const lower_half = dst + (stride << 3);
+  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+  aom_highbd_h_predictor_8x8_sse2(lower_half, stride, above, left + 8, bd);
+}
+
+// Writes one 16-pixel row made of the low 64 bits of *row repeated four
+// times, then advances *dst by one row. Uses aligned stores.
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i pixels = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, pixels);
+  _mm_store_si128((__m128i *)(out + 8), pixels);
+  *dst = out + stride;
+}
+
+// Writes one 16-pixel row made of the high 64 bits of *row repeated four
+// times, then advances *dst by one row. Uses aligned stores.
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i pixels = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, pixels);
+  _mm_store_si128((__m128i *)(out + 8), pixels);
+  *dst = out + stride;
+}
+
+// H_PRED for a 16-wide, 8-row band: row r is filled with left[r].
+// |left| is read with an aligned 128-bit load.
+static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *left) {
+  const __m128i edge = _mm_load_si128((const __m128i *)left);
+  __m128i bcast;
+
+  // Rows 0..3 come from the low four lanes of the edge vector.
+  bcast = _mm_shufflelo_epi16(edge, 0x0);
+  h_store_16_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0x55);
+  h_store_16_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0xaa);
+  h_store_16_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0xff);
+  h_store_16_unpacklo(&dst, stride, &bcast);
+
+  // Rows 4..7 come from the high four lanes.
+  bcast = _mm_shufflehi_epi16(edge, 0x0);
+  h_store_16_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0x55);
+  h_store_16_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0xaa);
+  h_store_16_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0xff);
+  h_store_16_unpackhi(&dst, stride, &bcast);
+}
+
+// H_PRED 16x8: a single 16x8 band; |above| and |bd| are unused.
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+  h_predictor_16x8(dst, stride, left);
+}
+
+// H_PRED 16x16: two stacked 16x8 bands, each consuming eight left samples.
+// |above| and |bd| are unused.
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+
+  h_predictor_16x8(dst, stride, left);
+  h_predictor_16x8(dst + (stride << 3), stride, left + 8);
+}
+
+// H_PRED 16x32: four stacked 16x8 bands, each consuming eight left samples.
+// |above| and |bd| are unused.
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int bands_left = 4;
+  (void)bd;
+  (void)above;
+
+  while (bands_left-- > 0) {
+    h_predictor_16x8(dst, stride, left);
+    dst += stride << 3;
+    left += 8;
+  }
+}
+
+// Writes one 32-pixel row made of the low 64 bits of *row repeated eight
+// times, then advances *dst by one row. Uses aligned stores.
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i pixels = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, pixels);
+  _mm_store_si128((__m128i *)(out + 8), pixels);
+  _mm_store_si128((__m128i *)(out + 16), pixels);
+  _mm_store_si128((__m128i *)(out + 24), pixels);
+  *dst = out + stride;
+}
+
+// Writes one 32-pixel row made of the high 64 bits of *row repeated eight
+// times, then advances *dst by one row. Uses aligned stores.
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  uint16_t *const out = *dst;
+  const __m128i pixels = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)out, pixels);
+  _mm_store_si128((__m128i *)(out + 8), pixels);
+  _mm_store_si128((__m128i *)(out + 16), pixels);
+  _mm_store_si128((__m128i *)(out + 24), pixels);
+  *dst = out + stride;
+}
+
+// H_PRED for a 32-wide, 8-row band: row r is filled with left[r].
+// |left| is read with an aligned 128-bit load.
+static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *left) {
+  const __m128i edge = _mm_load_si128((const __m128i *)left);
+  __m128i bcast;
+
+  // Rows 0..3 come from the low four lanes of the edge vector.
+  bcast = _mm_shufflelo_epi16(edge, 0x0);
+  h_store_32_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0x55);
+  h_store_32_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0xaa);
+  h_store_32_unpacklo(&dst, stride, &bcast);
+  bcast = _mm_shufflelo_epi16(edge, 0xff);
+  h_store_32_unpacklo(&dst, stride, &bcast);
+
+  // Rows 4..7 come from the high four lanes.
+  bcast = _mm_shufflehi_epi16(edge, 0x0);
+  h_store_32_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0x55);
+  h_store_32_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0xaa);
+  h_store_32_unpackhi(&dst, stride, &bcast);
+  bcast = _mm_shufflehi_epi16(edge, 0xff);
+  h_store_32_unpackhi(&dst, stride, &bcast);
+}
+
+// H_PRED 32x16: two stacked 32x8 bands, each consuming eight left samples.
+// |above| and |bd| are unused.
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+
+  h_predictor_32x8(dst, stride, left);
+  h_predictor_32x8(dst + (stride << 3), stride, left + 8);
+}
+
+// H_PRED 32x32: four stacked 32x8 bands, each consuming eight left samples.
+// |above| and |bd| are unused.
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int bands_left = 4;
+  (void)bd;
+  (void)above;
+
+  while (bands_left-- > 0) {
+    h_predictor_32x8(dst, stride, left);
+    dst += stride << 3;
+    left += 8;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP, DC_LEFT, DC_128
+
+// 4x4
+
+// Returns a vector whose lane 0 holds ref[0] + ref[1] + ref[2] + ref[3]
+// (16-bit arithmetic). Only the low 64 bits of |ref| are read.
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+  const __m128i v = _mm_loadl_epi64((const __m128i *)ref);
+  // Fold lanes 2..3 onto lanes 0..1.
+  const __m128i pair_sums = _mm_add_epi16(v, _mm_shufflelo_epi16(v, 0xe));
+  // Fold lane 1 onto lane 0.
+  const __m128i lane1 = _mm_shufflelo_epi16(pair_sums, 0x1);
+  return _mm_add_epi16(pair_sums, lane1);
+}
+
+// Fills a 4x4 block with the 16-bit value held in lane 0 of *dc.
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
+  _mm_storel_epi64((__m128i *)(dst), fill);
+  _mm_storel_epi64((__m128i *)(dst + stride), fill);
+  _mm_storel_epi64((__m128i *)(dst + 2 * stride), fill);
+  _mm_storel_epi64((__m128i *)(dst + 3 * stride), fill);
+}
+
+// DC_LEFT 4x4: fills the block with (left[0] + ... + left[3] + 2) >> 2.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(2);
+  const __m128i biased = _mm_add_epi16(dc_sum_4(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)bd;
+  (void)above;
+  dc_store_4x4(dst, stride, &avg);
+}
+
+// DC_TOP 4x4: fills the block with (above[0] + ... + above[3] + 2) >> 2.
+// |left| and |bd| are unused.
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(2);
+  const __m128i biased = _mm_add_epi16(dc_sum_4(above), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)bd;
+  (void)left;
+  dc_store_4x4(dst, stride, &avg);
+}
+
+// DC_128 4x4: fills the block with the mid-range value 1 << (bd - 1).
+// |above| and |left| are unused.
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int mid = 1 << (bd - 1);
+  const __m128i fill = _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid), 0x0);
+  (void)left;
+  (void)above;
+  dc_store_4x4(dst, stride, &fill);
+}
+
+// -----------------------------------------------------------------------------
+// 4x8
+
+// Fills a 4-wide, 8-row block with the 16-bit value held in lane 0 of *dc.
+static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
+  int rows_left = 8;
+  do {
+    _mm_storel_epi64((__m128i *)dst, fill);
+    dst += stride;
+  } while (--rows_left);
+}
+
+// Shared with DC 8xh
+// Returns a vector whose lane 0 holds the 16-bit sum of ref[0..7].
+// |ref| is read with an aligned 128-bit load.
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+  const __m128i v = _mm_load_si128((const __m128i *)ref);
+  // Fold the upper four lanes onto the lower four.
+  const __m128i sum4 = _mm_add_epi16(v, _mm_srli_si128(v, 8));
+  // Fold lanes 2..3 onto lanes 0..1.
+  const __m128i sum2 = _mm_add_epi16(sum4, _mm_shufflelo_epi16(sum4, 0xe));
+  // Fold lane 1 onto lane 0.
+  return _mm_add_epi16(sum2, _mm_shufflelo_epi16(sum2, 0x1));
+}
+
+// DC_LEFT 4x8: fills the block with (sum of left[0..7] + 4) >> 3.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(4);
+  const __m128i biased = _mm_add_epi16(dc_sum_8(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)bd;
+  (void)above;
+  dc_store_4x8(dst, stride, &avg);
+}
+
+// DC_TOP 4x8: fills the block with (sum of above[0..3] + 2) >> 2.
+// |left| and |bd| are unused.
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(2);
+  const __m128i biased = _mm_add_epi16(dc_sum_4(above), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)bd;
+  (void)left;
+  dc_store_4x8(dst, stride, &avg);
+}
+
+// DC_128 4x8: fills the block with the mid-range value 1 << (bd - 1).
+// |above| and |left| are unused.
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int mid = 1 << (bd - 1);
+  const __m128i fill = _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid), 0x0);
+  (void)left;
+  (void)above;
+  dc_store_4x8(dst, stride, &fill);
+}
+
+// -----------------------------------------------------------------------------
+// 8xh
+
+// Fills an 8-wide block of |height| rows with the 16-bit value held in lane 0
+// of *dc. Rows are written with aligned 128-bit stores.
+static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                const __m128i *dc) {
+  const __m128i low4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(low4, low4);
+  while (height-- > 0) {
+    _mm_store_si128((__m128i *)dst, fill);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// DC_TOP for 8-wide blocks: fills |height| rows with
+// (sum of above[0..7] + 4) >> 3.
+static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+                                        int height, const uint16_t *above) {
+  const __m128i rounding = _mm_cvtsi32_si128(4);
+  const __m128i biased = _mm_add_epi16(dc_sum_8(above), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  dc_store_8xh(dst, stride, height, &avg);
+}
+
+// DC_TOP 8x4: delegates to the shared 8-wide helper with a height of 4.
+// |left| and |bd| are unused.
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int height = 4;
+  (void)bd;
+  (void)left;
+  dc_top_predictor_8xh(dst, stride, height, above);
+}
+
+// DC_TOP 8x8: delegates to the shared 8-wide helper with a height of 8.
+// |left| and |bd| are unused.
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int height = 8;
+  (void)bd;
+  (void)left;
+  dc_top_predictor_8xh(dst, stride, height, above);
+}
+
+// DC_TOP 8x16: delegates to the shared 8-wide helper with a height of 16.
+// |left| and |bd| are unused.
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const int height = 16;
+  (void)bd;
+  (void)left;
+  dc_top_predictor_8xh(dst, stride, height, above);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// DC_LEFT 8x4: fills the block with (sum of left[0..3] + 2) >> 2.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(2);
+  const __m128i biased = _mm_add_epi16(dc_sum_4(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 2);
+  (void)bd;
+  (void)above;
+  dc_store_8xh(dst, stride, 4, &avg);
+}
+
+// DC_LEFT 8x8: fills the block with (sum of left[0..7] + 4) >> 3.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(4);
+  const __m128i biased = _mm_add_epi16(dc_sum_8(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)bd;
+  (void)above;
+  dc_store_8xh(dst, stride, 8, &avg);
+}
+
+// Shared with DC 16xh
+// Returns a vector whose lane 0 holds the 16-bit sum of ref[0..15], built
+// from two 8-sample partial sums.
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+  const __m128i first8 = dc_sum_8(ref);
+  const __m128i second8 = dc_sum_8(ref + 8);
+  const __m128i total = _mm_add_epi16(first8, second8);
+  return total;
+}
+
+// DC_LEFT 8x16: fills the block with (sum of left[0..15] + 8) >> 4.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i biased = _mm_add_epi16(dc_sum_16(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)bd;
+  (void)above;
+  dc_store_8xh(dst, stride, 16, &avg);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// DC_128 for 8-wide blocks: fills |height| rows with 1 << (bd - 1).
+static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+                                        int height, int bd) {
+  const int mid = 1 << (bd - 1);
+  const __m128i fill = _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid), 0x0);
+  dc_store_8xh(dst, stride, height, &fill);
+}
+
+// DC_128 8x4: delegates to the shared 8-wide helper with a height of 4.
+// |above| and |left| are unused.
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int height = 4;
+  (void)left;
+  (void)above;
+  dc_128_predictor_8xh(dst, stride, height, bd);
+}
+
+// DC_128 8x8: delegates to the shared 8-wide helper with a height of 8.
+// |above| and |left| are unused.
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const int height = 8;
+  (void)left;
+  (void)above;
+  dc_128_predictor_8xh(dst, stride, height, bd);
+}
+
+// DC_128 8x16: delegates to the shared 8-wide helper with a height of 16.
+// |above| and |left| are unused.
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const int height = 16;
+  (void)left;
+  (void)above;
+  dc_128_predictor_8xh(dst, stride, height, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 16xh
+
+// Fills a 16-wide block of |height| rows with the 16-bit value held in lane 0
+// of *dc. Each row is written with two aligned 128-bit stores.
+static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                 const __m128i *dc) {
+  const __m128i low4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(low4, low4);
+  while (height-- > 0) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+// DC_LEFT 16x8: fills the block with (sum of left[0..7] + 4) >> 3.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(4);
+  const __m128i biased = _mm_add_epi16(dc_sum_8(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 3);
+  (void)bd;
+  (void)above;
+  dc_store_16xh(dst, stride, 8, &avg);
+}
+
+// DC_LEFT 16x16: fills the block with (sum of left[0..15] + 8) >> 4.
+// |above| and |bd| are unused.
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i biased = _mm_add_epi16(dc_sum_16(left), rounding);
+  const __m128i avg = _mm_srli_epi16(biased, 4);
+  (void)bd;
+  (void)above;
+  dc_store_16xh(dst, stride, 16, &avg);
+}
+
+// Shared with 32xh
+// Returns a vector whose low 32-bit lane holds the sum of ref[0..31].
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+  const __m128i first16 = dc_sum_16(ref);
+  const __m128i second16 = dc_sum_16(ref + 16);
+  const __m128i zero = _mm_setzero_si128();
+  // With 12-bit samples the total no longer fits in 16 bits, so widen both
+  // partial sums to 32 bits before the final addition.
+  const __m128i first32 = _mm_unpacklo_epi16(first16, zero);
+  const __m128i second32 = _mm_unpacklo_epi16(second16, zero);
+  return _mm_add_epi32(first32, second32);
+}
+
+// DC_LEFT predictor for a 16x32 block. Every output pixel is
+// (dc_sum_32(left) + 16) >> 5, computed in 32-bit lanes; the low 16 bits of
+// lane 0 are what gets stored. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+  const __m128i rounding = _mm_cvtsi32_si128(16);
+  const __m128i left_total = dc_sum_32(left);
+  const __m128i mean = _mm_srli_epi32(_mm_add_epi32(left_total, rounding), 5);
+  dc_store_16xh(dst, stride, 32, &mean);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+// DC_TOP predictor for a 16x8 block. Every output pixel is
+// (dc_sum_16(above) + 8) >> 4, taken from 16-bit lane 0 of the result.
+// `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i above_total = dc_sum_16(above);
+  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(above_total, rounding), 4);
+  dc_store_16xh(dst, stride, 8, &mean);
+}
+
+// DC_TOP predictor for a 16x16 block. Every output pixel is
+// (dc_sum_16(above) + 8) >> 4, taken from 16-bit lane 0 of the result.
+// `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i above_total = dc_sum_16(above);
+  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(above_total, rounding), 4);
+  dc_store_16xh(dst, stride, 16, &mean);
+}
+
+// DC_TOP predictor for a 16x32 block. Every output pixel is
+// (dc_sum_16(above) + 8) >> 4, taken from 16-bit lane 0 of the result.
+// `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i above_total = dc_sum_16(above);
+  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(above_total, rounding), 4);
+  dc_store_16xh(dst, stride, 32, &mean);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+// DC_128 predictor for a 16x8 block: every pixel becomes 1 << (bd - 1).
+// Neither `above` nor `left` is read.
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  (void)left;
+  (void)above;
+  const int mid_value = 1 << (bd - 1);
+  // Spread the value across the low four 16-bit lanes before storing.
+  const __m128i mid_lo4 =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
+  dc_store_16xh(dst, stride, 8, &mid_lo4);
+}
+
+// DC_128 predictor for a 16x16 block: every pixel becomes 1 << (bd - 1).
+// Neither `above` nor `left` is read.
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)left;
+  (void)above;
+  const int mid_value = 1 << (bd - 1);
+  // Spread the value across the low four 16-bit lanes before storing.
+  const __m128i mid_lo4 =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
+  dc_store_16xh(dst, stride, 16, &mid_lo4);
+}
+
+// DC_128 predictor for a 16x32 block: every pixel becomes 1 << (bd - 1).
+// Neither `above` nor `left` is read.
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)left;
+  (void)above;
+  const int mid_value = 1 << (bd - 1);
+  // Spread the value across the low four 16-bit lanes before storing.
+  const __m128i mid_lo4 =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
+  dc_store_16xh(dst, stride, 32, &mid_lo4);
+}
+
+// -----------------------------------------------------------------------------
+// 32xh
+
+// Replicates 16-bit lane 0 of *dc into all eight lanes and writes that vector
+// over a block 32 pixels wide and `height` rows tall (four aligned 16-byte
+// stores per row, so each row start must be 16-byte aligned).
+static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
+                                 const __m128i *dc) {
+  // Lane 0 -> lanes 0..3, then copy the low 64 bits into the high 64 bits.
+  const __m128i lo4 = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i fill = _mm_unpacklo_epi64(lo4, lo4);
+  uint16_t *row = dst;
+  int rows_left;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    __m128i *out = (__m128i *)row;  // out[k] covers pixels 8k .. 8k + 7
+    _mm_store_si128(out + 0, fill);
+    _mm_store_si128(out + 1, fill);
+    _mm_store_si128(out + 2, fill);
+    _mm_store_si128(out + 3, fill);
+    row += stride;
+  }
+}
+
+// DC_LEFT predictor for a 32x16 block. Every output pixel is
+// (dc_sum_16(left) + 8) >> 4, taken from 16-bit lane 0 of the result.
+// `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+  const __m128i rounding = _mm_cvtsi32_si128(8);
+  const __m128i left_total = dc_sum_16(left);
+  const __m128i mean = _mm_srli_epi16(_mm_add_epi16(left_total, rounding), 4);
+  dc_store_32xh(dst, stride, 16, &mean);
+}
+
+// DC_LEFT predictor for a 32x32 block. Every output pixel is
+// (dc_sum_32(left) + 16) >> 5, computed in 32-bit lanes; the low 16 bits of
+// lane 0 are what gets stored. `above` and `bd` are not used.
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  (void)bd;
+  (void)above;
+  const __m128i rounding = _mm_cvtsi32_si128(16);
+  const __m128i left_total = dc_sum_32(left);
+  const __m128i mean = _mm_srli_epi32(_mm_add_epi32(left_total, rounding), 5);
+  dc_store_32xh(dst, stride, 32, &mean);
+}
+
+// DC_TOP predictor for a 32x16 block. Every output pixel is
+// (dc_sum_32(above) + 16) >> 5, computed in 32-bit lanes; the low 16 bits of
+// lane 0 are what gets stored. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i rounding = _mm_cvtsi32_si128(16);
+  const __m128i above_total = dc_sum_32(above);
+  const __m128i mean = _mm_srli_epi32(_mm_add_epi32(above_total, rounding), 5);
+  dc_store_32xh(dst, stride, 16, &mean);
+}
+
+// DC_128 predictor for a 32x16 block: every pixel becomes 1 << (bd - 1).
+// Neither `above` nor `left` is read.
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)left;
+  (void)above;
+  const int mid_value = 1 << (bd - 1);
+  // Spread the value across the low four 16-bit lanes before storing.
+  const __m128i mid_lo4 =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
+  dc_store_32xh(dst, stride, 16, &mid_lo4);
+}
+
+// DC_TOP predictor for a 32x32 block. Every output pixel is
+// (dc_sum_32(above) + 16) >> 5, computed in 32-bit lanes; the low 16 bits of
+// lane 0 are what gets stored. `left` and `bd` are not used.
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i rounding = _mm_cvtsi32_si128(16);
+  const __m128i above_total = dc_sum_32(above);
+  const __m128i mean = _mm_srli_epi32(_mm_add_epi32(above_total, rounding), 5);
+  dc_store_32xh(dst, stride, 32, &mean);
+}
+
+// DC_128 predictor for a 32x32 block: every pixel becomes 1 << (bd - 1).
+// Neither `above` nor `left` is read.
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  (void)left;
+  (void)above;
+  const int mid_value = 1 << (bd - 1);
+  // Spread the value across the low four 16-bit lanes before storing.
+  const __m128i mid_lo4 =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
+  dc_store_32xh(dst, stride, 32, &mid_lo4);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+// Vertical predictor for a 4x8 block: the four pixels at `above` are copied
+// into each of the 8 destination rows using 64-bit loads/stores. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top_row = _mm_loadl_epi64((const __m128i *)above);
+  int row;
+  for (row = 0; row < 8; ++row) {
+    _mm_storel_epi64((__m128i *)(dst + row * stride), top_row);
+  }
+}
+
+// Vertical predictor for an 8x4 block: the eight pixels at `above` are copied
+// into each of the 4 destination rows. Aligned 16-byte loads/stores are used,
+// so `above` and every row start must be 16-byte aligned. `left` and `bd` are
+// not used.
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top_row = _mm_load_si128((const __m128i *)above);
+  int row;
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(dst + row * stride), top_row);
+  }
+}
+
+// Vertical predictor for an 8x16 block: the eight pixels at `above` are copied
+// into each of the 16 destination rows. Aligned 16-byte loads/stores are used,
+// so `above` and every row start must be 16-byte aligned. `left` and `bd` are
+// not used.
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top_row = _mm_load_si128((const __m128i *)above);
+  int row;
+  for (row = 0; row < 16; ++row) {
+    _mm_store_si128((__m128i *)(dst + row * stride), top_row);
+  }
+}
+
+// Vertical predictor for a 16x8 block: the sixteen pixels at `above` are
+// copied into each of the 8 destination rows. Aligned 16-byte loads/stores are
+// used, so `above` and every row start must be 16-byte aligned. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
+  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
+  int row;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top_lo);
+    _mm_store_si128((__m128i *)(dst + 8), top_hi);
+  }
+}
+
+// Vertical predictor for a 16x32 block: the sixteen pixels at `above` are
+// copied into each of the 32 destination rows. Aligned 16-byte loads/stores
+// are used, so `above` and every row start must be 16-byte aligned. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top_lo = _mm_load_si128((const __m128i *)above);
+  const __m128i top_hi = _mm_load_si128((const __m128i *)(above + 8));
+  int row;
+  for (row = 0; row < 32; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top_lo);
+    _mm_store_si128((__m128i *)(dst + 8), top_hi);
+  }
+}
+
+// Vertical predictor for a 32x16 block: the thirty-two pixels at `above` are
+// copied into each of the 16 destination rows. Aligned 16-byte loads/stores
+// are used, so `above` and every row start must be 16-byte aligned. `left` and
+// `bd` are not used.
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  (void)left;
+  const __m128i top0 = _mm_load_si128((const __m128i *)above);
+  const __m128i top1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i top2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i top3 = _mm_load_si128((const __m128i *)(above + 24));
+  int row;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, top0);
+    _mm_store_si128((__m128i *)(dst + 8), top1);
+    _mm_store_si128((__m128i *)(dst + 16), top2);
+    _mm_store_si128((__m128i *)(dst + 24), top3);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+// DC predictor for a 4x8 block. The fill value is
+// (dc_sum_4(above) + dc_sum_8(left) + 6) / 12, where the combined sum is read
+// from bits 16..31 of the vector (16-bit lane 1). All 8 rows receive the same
+// four pixels via 64-bit stores. `bd` is not used.
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i above_total = dc_sum_4(above);
+  const __m128i left_total = dc_sum_8(left);
+  const __m128i both = _mm_add_epi16(above_total, left_total);
+  const uint32_t packed = (uint32_t)_mm_cvtsi128_si32(both);
+  const uint32_t mean = ((packed >> 16) + 6) / 12;
+  const __m128i fill = _mm_set1_epi16((int16_t)mean);
+  int row;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_storel_epi64((__m128i *)dst, fill);
+  }
+}
+
+// DC predictor for an 8x4 block. The fill value is
+// (dc_sum_8(above) + dc_sum_4(left) + 6) / 12, where the combined sum is read
+// from bits 16..31 of the vector (16-bit lane 1). All 4 rows receive the same
+// eight pixels via aligned 16-byte stores. `bd` is not used.
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i left_total = dc_sum_4(left);
+  const __m128i above_total = dc_sum_8(above);
+  const __m128i both = _mm_add_epi16(above_total, left_total);
+  const uint32_t packed = (uint32_t)_mm_cvtsi128_si32(both);
+  const uint32_t mean = ((packed >> 16) + 6) / 12;
+  const __m128i fill = _mm_set1_epi16((int16_t)mean);
+  int row;
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(dst + row * stride), fill);
+  }
+}
+
+// DC predictor for an 8x16 block. Both edge sums are zero-extended to 32 bits,
+// added, and the fill value is (total + 12) / 24. All 16 rows receive the same
+// eight pixels via aligned 16-byte stores. `bd` is not used.
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i left_total16 = dc_sum_16(left);
+  const __m128i above_total16 = dc_sum_8(above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_total = _mm_unpacklo_epi16(left_total16, zero);
+  const __m128i above_total = _mm_unpacklo_epi16(above_total16, zero);
+  const uint32_t total =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left_total, above_total));
+  const __m128i fill = _mm_set1_epi16((int16_t)((total + 12) / 24));
+  int row;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+  }
+}
+
+// DC predictor for a 16x8 block. Both edge sums are zero-extended to 32 bits,
+// added, and the fill value is (total + 12) / 24. All 8 rows receive the same
+// sixteen pixels via aligned 16-byte stores. `bd` is not used.
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i left_total16 = dc_sum_8(left);
+  const __m128i above_total16 = dc_sum_16(above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_total = _mm_unpacklo_epi16(left_total16, zero);
+  const __m128i above_total = _mm_unpacklo_epi16(above_total16, zero);
+  const uint32_t total =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left_total, above_total));
+  const __m128i fill = _mm_set1_epi16((int16_t)((total + 12) / 24));
+  int row;
+  for (row = 0; row < 8; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+  }
+}
+
+// DC predictor for a 16x32 block. dc_sum_32(left) is already 32-bit; the
+// 16-bit dc_sum_16(above) is zero-extended before the add. The fill value is
+// (total + 24) / 48 and is written to all 32 rows with aligned 16-byte stores.
+// `bd` is not used.
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i left_total = dc_sum_32(left);
+  const __m128i above_total16 = dc_sum_16(above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i above_total = _mm_unpacklo_epi16(above_total16, zero);
+  const uint32_t total =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left_total, above_total));
+  const __m128i fill = _mm_set1_epi16((int16_t)((total + 24) / 48));
+  int row;
+  for (row = 0; row < 32; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+  }
+}
+
+// DC predictor for a 32x16 block. dc_sum_32(above) is already 32-bit; the
+// 16-bit dc_sum_16(left) is zero-extended before the add. The fill value is
+// (total + 24) / 48 and is written to all 16 rows with aligned 16-byte stores.
+// `bd` is not used.
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  (void)bd;
+  const __m128i left_total16 = dc_sum_16(left);
+  const __m128i above_total = dc_sum_32(above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_total = _mm_unpacklo_epi16(left_total16, zero);
+  const uint32_t total =
+      (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(left_total, above_total));
+  const __m128i fill = _mm_set1_epi16((int16_t)((total + 24) / 48));
+  int row;
+  for (row = 0; row < 16; ++row, dst += stride) {
+    _mm_store_si128((__m128i *)dst, fill);
+    _mm_store_si128((__m128i *)(dst + 8), fill);
+    _mm_store_si128((__m128i *)(dst + 16), fill);
+    _mm_store_si128((__m128i *)(dst + 24), fill);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
new file mode 100644
index 0000000000..c954da94e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
+
+void aom_highbd_lpf_vertical_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {  // AVX2-named entry point; contains no AVX2 code of its own
+ aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);  // every argument is forwarded unchanged to the SSE2 version
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..ea7dc6a9e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Clamps every signed 16-bit lane of *pixel into [*min, *max]. The upper bound
+// is applied first, then the lower bound. *min and *max are expected to be
+// distinct objects from *pixel.
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+                                         __m128i *pixel) {
+  const __m128i capped = _mm_min_epi16(*pixel, *max);
+  *pixel = _mm_max_epi16(capped, *min);
+}
+
+// Per-lane |a - b| for unsigned 16-bit values. Each saturating subtraction is
+// zero in the lanes where it would go negative, so OR-ing both directions
+// leaves the absolute difference.
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+  const __m128i a_minus_b = _mm_subs_epu16(a, b);
+  const __m128i b_minus_a = _mm_subs_epu16(b, a);
+  return _mm_or_si128(a_minus_b, b_minus_a);
+}
+
+// Prepares the loop-filter thresholds for bit depth `bd`. Each of bl, l and t
+// is read with an aligned 16-byte load; its low 8 bytes are widened to 16-bit
+// lanes and shifted left by (bd - 8). *t80_out receives 1 << (bd - 1) in every
+// 16-bit lane.
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+                             const uint8_t *t, int bd, __m128i *blt,
+                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
+  const __m128i zero = _mm_setzero_si128();
+  const int scale_bits = bd - 8;
+
+  const __m128i bl16 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+  *blt = _mm_slli_epi16(bl16, scale_bits);
+
+  const __m128i l16 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+  *lt = _mm_slli_epi16(l16, scale_bits);
+
+  const __m128i t16 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+  *thr = _mm_slli_epi16(t16, scale_bits);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+// Dual-edge variant of get_limit(). For each threshold kind, the low 8 bytes
+// of the "0" and "1" inputs are widened to 16 bits; the low four lanes of the
+// result come from input 0 and the high four lanes from input 1, then all
+// lanes are shifted left by (bd - 8). *t80_out receives 1 << (bd - 1) in every
+// 16-bit lane. All six inputs are read with aligned 16-byte loads.
+static INLINE void get_limit_dual(
+    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+    __m128i *t80_out) {
+  const __m128i zero = _mm_setzero_si128();
+  const int scale_bits = bd - 8;
+  __m128i first, second;
+
+  first = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+  second = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+  *blt_out = _mm_slli_epi16(_mm_unpacklo_epi64(first, second), scale_bits);
+
+  first = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+  second = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+  *lt_out = _mm_slli_epi16(_mm_unpacklo_epi64(first, second), scale_bits);
+
+  first = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+  second = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+  *thr_out = _mm_slli_epi16(_mm_unpacklo_epi64(first, second), scale_bits);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+// Loads `size` rows on each side of the position `s` using unaligned 16-byte
+// loads: p[k] is read from s - (k + 1) * pitch and q[k] from s + k * pitch.
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+                                     __m128i *p, __m128i *q) {
+  int k;
+  for (k = 0; k < size; ++k) {
+    const uint16_t *p_row = s - (k + 1) * pitch;
+    const uint16_t *q_row = s + k * pitch;
+    p[k] = _mm_loadu_si128((const __m128i *)p_row);
+    q[k] = _mm_loadu_si128((const __m128i *)q_row);
+  }
+}
+
+// Computes the per-lane filter mask from p[0..3] and q[0..3].
+// A lane of *mask is all ones when both of the following hold, zero otherwise:
+//   2 * |p0 - q0| + |p1 - q1| / 2 <= *bl   (saturating unsigned arithmetic)
+//   |p[i] - p[i-1]| <= *l and |q[i] - q[i-1]| <= *l for i = 1..3
+// The maxima are taken with the signed 16-bit max, as in the scalar-reference
+// formulation "mask |= (cond) * -1; return ~mask".
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+                                           const __m128i *l, const __m128i *bl,
+                                           __m128i *mask) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i all_ones = _mm_set1_epi16((short)0xFFFF);
+
+  const __m128i d_p0q0 = abs_diff16(p[0], q[0]);
+  const __m128i d_p1q1 = abs_diff16(p[1], q[1]);
+  const __m128i edge = _mm_adds_epu16(_mm_adds_epu16(d_p0q0, d_p0q0),
+                                      _mm_srli_epi16(d_p1q1, 1));
+
+  // All ones where the edge measure exceeds *bl.
+  const __m128i over_blimit = _mm_xor_si128(
+      _mm_cmpeq_epi16(_mm_subs_epu16(edge, *bl), zero), all_ones);
+  // Turn "exceeds" into the value limit + 1 so it survives the max/compare
+  // against *l below.
+  __m128i worst = _mm_and_si128(over_blimit, _mm_adds_epu16(*l, one));
+
+  int k;
+  for (k = 1; k < 4; ++k) {
+    worst = _mm_max_epi16(worst, abs_diff16(p[k], p[k - 1]));
+    worst = _mm_max_epi16(worst, abs_diff16(q[k], q[k - 1]));
+  }
+
+  *mask = _mm_cmpeq_epi16(_mm_subs_epu16(worst, *l), zero);  // return ~mask
+}
+
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+ __m128i *p1p0, __m128i *q1q0,
+ __m128i *abs_p1p0, __m128i *l,
+ __m128i *bl, __m128i *t,
+ __m128i *hev, __m128i *mask) {  // outputs: *p1p0, *q1q0, *abs_p1p0, *hev, *mask; pq[0..x-1] are read
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
+ __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+ __m128i max, max01, h;
+
+ *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);  // low halves of pq[0], pq[1] (p side when pq[i] packs p[i] low / q[i] high)
+ *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);  // high halves of pq[0], pq[1]
+
+ abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);  // low half: |p0 - q0|, high half: |p1 - q1|
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);  // saturating doubling
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);  // keep low half only, high half zeroed
+
+ abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);  // move the high half down to the low half
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
+
+ max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);  // zero where the sum is <= *bl
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);  // all ones where the sum exceeds *bl
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // The all-ones flag is replaced by limit + 1 so taking maximums continues to work:
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ *abs_p1p0 = abs_diff16(pq[0], pq[1]);  // |pq[0] - pq[1]| in both halves
+ abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);  // high half moved down
+ max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);  // low half: larger of the two halves' differences
+ // mask |= (abs(*p1 - *p0) > limit) * -1;
+ // mask |= (abs(*q1 - *q0) > limit) * -1;
+ h = _mm_subs_epu16(max01, *t);  // zero where max01 <= *t
+
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);  // all ones where max01 exceeds *t
+ // replicate for the further "merged variables" usage
+ *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+ max = _mm_max_epi16(max, max01);
+ int i;
+ for (i = 2; i < x; ++i) {  // fold in the remaining neighbouring-row differences
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));  // combine high half into low half
+
+ max = _mm_subs_epu16(max, *l);  // zero where the running maximum is <= *l
+ *mask = _mm_cmpeq_epi16(max, zero); // ~mask: all ones where no limit was exceeded
+}
+
+// Sets each lane of *flat to all ones when the largest of |pq[i] - pq[0]| for
+// i in [start, end) is <= *th, and to zero otherwise. The high 64-bit half of
+// the running maximum is folded into the low half before the comparison.
+// pq[start] and pq[start + 1] are always read, regardless of `end`.
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+                                      int start, int end, __m128i *flat) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i worst = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+                                abs_diff16(pq[start + 1], pq[0]));
+  int k = start + 2;
+  while (k < end) {
+    worst = _mm_max_epi16(worst, abs_diff16(pq[k], pq[0]));
+    ++k;
+  }
+  worst = _mm_max_epi16(worst, _mm_srli_si128(worst, 8));
+
+  *flat = _mm_cmpeq_epi16(_mm_subs_epu16(worst, *th), zero);
+}
+
+// Sets each lane of *flat to all ones when the largest of |p[i] - p[0]| and
+// |q[i] - q[0]| for i in [start, end) is <= *th, and to zero otherwise.
+// p[start] and q[start] are always read, regardless of `end`.
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+                                           const __m128i *q, int start, int end,
+                                           __m128i *flat) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i worst =
+      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+  int k = start + 1;
+  while (k < end) {
+    worst = _mm_max_epi16(worst, abs_diff16(p[k], p[0]));
+    worst = _mm_max_epi16(worst, abs_diff16(q[k], q[0]));
+    ++k;
+  }
+
+  *flat = _mm_cmpeq_epi16(_mm_subs_epu16(worst, *th), zero);
+}
+
+// Computes two flatness masks with threshold 1 << (bd - 8):
+//   *flat  checks pq[1..3] against pq[0];
+//   *flat2 checks pq[4..6] against pq[0].
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+                                          __m128i *flat2, int bd) {
+  const __m128i threshold = _mm_slli_epi16(_mm_set1_epi16(1), bd - 8);
+  flat_mask_internal(&threshold, pq, 1, 4, flat);
+  flat_mask_internal(&threshold, pq, 4, 7, flat2);
+}
+
+// Dual-edge flatness masks with threshold 1 << (bd - 8):
+//   *flat  checks p[1..3] against p[0] and q[1..3] against q[0];
+//   *flat2 checks p[4..6] against p[0] and q[4..6] against q[0].
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+                                               const __m128i *q, __m128i *flat,
+                                               __m128i *flat2, int bd) {
+  const __m128i threshold = _mm_slli_epi16(_mm_set1_epi16(1), bd - 8);
+  flat_mask_internal_dual(&threshold, p, q, 1, 4, flat);
+  flat_mask_internal_dual(&threshold, p, q, 4, 7, flat2);
+}
+
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0, __m128i *t80,
+ int bd) {  // outputs *qs1qs0 / *ps1ps0; inputs use the packed halves built by highbd_hev_filter_mask_x_sse2
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);  // ((1 << bd) - 1) - t80: upper clamp bound after the t80 offset
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);  // 0 - t80: lower clamp bound after the t80 offset
+
+ const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);  // arguments run high lane -> low lane: low half = 4, high half = 3
+ __m128i ps1ps0_work, qs1qs0_work, work;
+ __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+ ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);  // re-centre pixels by subtracting t80 (saturating)
+ qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+ work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);  // p - q in both halves
+ pixel_clamp(&pmin, &pmax, &work);
+ filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);  // high-half difference moved to the low half, kept only where hev is set
+
+ filt = _mm_subs_epi16(filt, work);  // three saturating subtractions of (p - q) ...
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);  // ... i.e. filt + 3 * (q - p)
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm_and_si128(filt, *mask);
+ filt = _mm_unpacklo_epi64(filt, filt);  // copy the low half into the high half
+
+ filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+ pixel_clamp(&pmin, &pmax, &filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
+
+ filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);  // low half (the "+4" result) in both halves
+
+ // rounded halving: (filt + 1) >> 1, then cleared where hev is set
+ filt = _mm_adds_epi16(filt, one);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(*hev, filt);
+
+ filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);  // low half: "+3" result, high half: halved value
+ filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);  // low half: "+4" result, high half: halved value
+
+ qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);  // q side is decreased
+ ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);  // p side is increased
+
+ pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+ *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);  // add t80 back to undo the re-centring
+ *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+ __m128i *qs, const __m128i *mask,
+ const __m128i *th, int bd,
+ __m128i *t80) {  // reads p[0..1], q[0..1]; writes ps[0..1], qs[0..1]
+ __m128i ps0 = _mm_subs_epi16(p[0], *t80);  // re-centre pixels by subtracting t80 (saturating)
+ __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+ __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+ __m128i qs1 = _mm_subs_epi16(q[1], *t80);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);  // ((1 << bd) - 1) - t80: upper clamp bound
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);  // 0 - t80: lower clamp bound
+ __m128i filter = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
+
+ // hev_filter: all ones where max(|p1 - p0|, |q1 - q0|) exceeds *th
+ __m128i hev;
+ const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+ const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+ __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ h = _mm_subs_epu16(h, *th);  // zero where the maximum is <= *th
+ const __m128i ffff = _mm_cmpeq_epi16(h, h);  // comparing a value with itself yields all ones
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
+ filter = _mm_and_si128(filter, hev);  // the (ps1 - qs1) term is kept only where hev is set
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ filter = _mm_adds_epi16(filter, x);  // three saturating additions: filter + 3 * (qs0 - ps0)
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm_and_si128(filter, *mask);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t4 = _mm_set1_epi16(4);
+ __m128i filter1 = _mm_adds_epi16(filter, t4);
+ __m128i filter2 = _mm_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm_srai_epi16(filter1, 3);  // clamp(filter + 4) >> 3 (arithmetic shift)
+ filter2 = _mm_srai_epi16(filter2, 3);  // clamp(filter + 3) >> 3 (arithmetic shift)
+ qs0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
+ qs[0] = _mm_adds_epi16(qs0, *t80);  // add t80 back to undo the re-centring
+ ps[0] = _mm_adds_epi16(ps0, *t80);
+ filter = _mm_adds_epi16(filter1, one);  // rounded halving of filter1: (filter1 + 1) >> 1
+ filter = _mm_srai_epi16(filter, 1);
+ filter = _mm_andnot_si128(hev, filter);  // outer taps are adjusted only where hev is clear
+ qs1 = _mm_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
+ qs[1] = _mm_adds_epi16(qs1, *t80);
+ ps[1] = _mm_adds_epi16(ps1, *t80);
+}
+
+// 14-tap high-bitdepth loop filter core for a single edge segment.
+//
+// Only the low 64 bits (four 16-bit lanes) of each p[i] / q[i] are consumed:
+// every pair is merged into pq[i] with p[i] in the low half and q[i] in the
+// high half, and all further arithmetic is done on the merged registers.
+//
+//   p, q          - 7 input registers each (p[0..6], q[0..6] are read).
+//   pq            - 7-entry output/scratch array. On return pq[0] and pq[1]
+//                   hold filtered values; pq[2] is blended with the flat
+//                   result and pq[0..5] with the wide-flat result where the
+//                   corresponding masks are set; pq[6] is only the merged
+//                   input.
+//   blt, lt, thr  - byte pointers forwarded to get_limit() (defined
+//                   elsewhere), which produces blimit/limit/thresh/t80.
+//   bd            - bit depth, forwarded to the helpers.
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+ __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+ const unsigned char *lt, const unsigned char *thr, int bd) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i t80;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+ // Merge: low 64 bits = p[i], high 64 bits = q[i].
+ for (i = 0; i < 7; i++) {
+ pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+ }
+ __m128i mask, hevhev;
+ __m128i p1p0, q1q0, abs_p1p0;
+
+ // Helper (defined elsewhere) fills p1p0/q1q0/abs_p1p0 and the hev and
+ // filter masks from the first 4 merged registers.
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hevhev, &mask);
+
+ __m128i ps0ps1, qs0qs1;
+ // filter4
+ highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
+
+ __m128i flat, flat2;
+ highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
+
+ // flat is only honoured where the filter mask is set, flat2 only where flat is.
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ // flat and wide flat calculations
+
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ // The test below is true when at least one 16-bit lane of flat is non-zero.
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ // sum_lp: pq[0..2] summed; sum_p: pq[0..5] summed (p side in the low half,
+ // q side in the high half).
+ __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+ __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ // Shift the q-side sums down into the low half.
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ // Low half now holds p-side + q-side totals plus the rounding term
+ // (8 for the later >> 4, 4 for the later >> 3).
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+ sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+ sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+ // Running sums for the wide filter: drop the farthest sample of the
+ // opposite side at each step.
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+ work0 = _mm_add_epi16(sum_p3, pq[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ // Re-merge p/q results (low halves only) and apply the >> 3.
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+ work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+ // Non-zero when at least one lane of flat2 is set; the wide-filter work
+ // is skipped entirely otherwise.
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ // sum_p6 grows by one more pq[6] per output (3x, 4x, 5x, 6x below).
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ // All sums above used the unfiltered pq[]; only now are pq[0]/pq[1]
+ // overwritten with the filter4 results (re-merged as p-low / q-high).
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ // Per-lane select: flat_pq where flat is set, existing pq otherwise.
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ // No lane is flat: only the filter4 outputs replace pq[0] and pq[1].
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+ }
+}
+
+// Applies the 14-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Reads 64 bits from each of the seven rows above `s` and the seven rows
+// starting at `s` (rows are `pitch` uint16_t elements apart), runs
+// highbd_lpf_internal_14_sse2() on them, and writes 64 bits back to the six
+// rows nearest the edge on each side.
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
+  __m128i above[7], below[7], merged[7];
+  int row;
+
+  // Gather the input rows; index 0 is the row closest to the edge.
+  for (row = 0; row < 7; ++row) {
+    above[row] = _mm_loadl_epi64((__m128i *)(s - (row + 1) * pitch));
+    below[row] = _mm_loadl_epi64((__m128i *)(s + row * pitch));
+  }
+
+  highbd_lpf_internal_14_sse2(above, below, merged, blimit, limit, thresh, bd);
+
+  // Each merged register carries the "above" row in its low 64 bits and the
+  // "below" row in its high 64 bits.
+  for (row = 0; row < 6; ++row) {
+    const __m128i out = merged[row];
+    _mm_storel_epi64((__m128i *)(s - (row + 1) * pitch), out);
+    _mm_storel_epi64((__m128i *)(s + row * pitch), _mm_srli_si128(out, 8));
+  }
+}
+
+// 14-tap high-bitdepth loop filter core operating on full 128-bit registers
+// (eight 16-bit lanes per row), using two sets of limits.
+//
+//   p, q   - in/out arrays; p[0..6] and q[0..6] are read. On return p[0], p[1],
+//            q[0], q[1] always hold filtered values; p[2]/q[2] are blended
+//            with the flat result and p[0..5]/q[0..5] with the wide-flat
+//            result in the lanes where the respective masks are set.
+//   blt0, lt0, thr0, blt1, lt1, thr1
+//          - byte pointers forwarded to get_limit_dual() (defined elsewhere),
+//            which produces blimit/limit/thresh/t80.
+//   bd     - bit depth, forwarded to the helpers.
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+    const uint8_t *thr1, int bd) {
+  __m128i blimit, limit, thresh, t80;
+  const __m128i zero = _mm_setzero_si128();
+
+  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+                 &t80);
+  __m128i mask;
+  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+  __m128i flat, flat2;
+  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+  // flat is only honoured where the filter mask is set, flat2 only where
+  // flat is.
+  flat = _mm_and_si128(flat, mask);
+  flat2 = _mm_and_si128(flat2, flat);
+  __m128i ps[2], qs[2];
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+  // flat and wide flat calculations
+
+  // if flat ==0 then flat2 is zero as well and we don't need any calc below
+  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+    __m128i flat_p[3], flat_q[3];
+    __m128i flat2_p[6], flat2_q[6];
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
+    __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+    sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
+    __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
+    sum_q = _mm_add_epi16(sum_q, sum_lq);
+    // sum_p_0 = 8 + p[0..5] + q[0..5]; sum_lp = 4 + p[0..2] + q[0..2].
+    // The constants are the rounding terms for the >> 4 and >> 3 below.
+    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
+    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+    flat_p[0] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
+    flat_q[0] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
+    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
+    __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
+    __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+    // Running sums: each step drops the farthest remaining sample of the
+    // opposite side.
+    sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+    __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+    flat_p[1] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
+    flat_q[1] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
+
+    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
+    flat_p[2] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
+    flat_q[2] =
+        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
+
+    // Non-zero when at least one lane of flat2 is set; the wide-filter work
+    // is skipped entirely otherwise.
+    int flat2_mask =
+        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+    if (flat2_mask) {
+      flat2_p[0] = _mm_srli_epi16(
+          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+                                               _mm_add_epi16(p[1], q[0]))),
+          4);
+      flat2_q[0] = _mm_srli_epi16(
+          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+                                               _mm_add_epi16(p[0], q[1]))),
+          4);
+
+      flat2_p[1] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+          4);
+      flat2_q[1] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[4]);
+      sum_q = _mm_sub_epi16(sum_q, p[4]);
+      flat2_p[2] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+          4);
+      flat2_q[2] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[3]);
+      sum_q = _mm_sub_epi16(sum_q, p[3]);
+      flat2_p[3] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+          4);
+      flat2_q[3] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[2]);
+      sum_q = _mm_sub_epi16(sum_q, p[2]);
+      flat2_p[4] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+          4);
+      flat2_q[4] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+          4);
+      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+      sum_p = _mm_sub_epi16(sum_p, q[1]);
+      sum_q = _mm_sub_epi16(sum_q, p[1]);
+      flat2_p[5] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_p,
+              _mm_add_epi16(sum_p6,
+                            _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+          4);
+      flat2_q[5] = _mm_srli_epi16(
+          _mm_add_epi16(
+              sum_q,
+              _mm_add_epi16(sum_q6,
+                            _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+          4);
+    }
+    // highbd_filter8
+    // Rows 0 and 1: per-lane select between the filter4 output (ps/qs) and
+    // the flat result. All flat/flat2 sums above have already been computed
+    // from the unmodified p[]/q[], so overwriting p[i]/q[i] here is safe.
+    // This blend is done exactly once; repeating it would recompute the same
+    // values because the and/andnot/or operations are idempotent.
+    int i;
+    for (i = 0; i < 2; i++) {
+      ps[i] = _mm_andnot_si128(flat, ps[i]);
+      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+      p[i] = _mm_or_si128(ps[i], flat_p[i]);
+      qs[i] = _mm_andnot_si128(flat, qs[i]);
+      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+      q[i] = _mm_or_si128(qs[i], flat_q[i]);
+    }
+    p[2] = _mm_andnot_si128(flat, p[2]);
+    // p2 remains unchanged if !(flat && mask)
+    flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+    // when (flat && mask)
+    p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
+    q[2] = _mm_andnot_si128(flat, q[2]);
+    flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+    q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
+
+    // highbd_filter16
+    if (flat2_mask) {
+      for (i = 0; i < 6; i++) {
+        // p[i] remains unchanged if !(flat2 && flat && mask)
+        p[i] = _mm_andnot_si128(flat2, p[i]);
+        flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+        // get values for when (flat2 && flat && mask)
+        p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
+        q[i] = _mm_andnot_si128(flat2, q[i]);
+        flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+        q[i] = _mm_or_si128(q[i], flat2_q[i]);
+      }
+    }
+  } else {
+    // No lane is flat: only the filter4 outputs are written back.
+    p[0] = ps[0];
+    q[0] = qs[0];
+    p[1] = ps[1];
+    q[1] = qs[1];
+  }
+}
+
+// Dual-limit 14-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Rows are fetched by load_highbd_pixel() (defined elsewhere; called with a
+// count of 7), filtered in place by highbd_lpf_internal_14_dual_sse2(), and
+// the six rows nearest the edge on each side are stored back as full
+// 128-bit unaligned writes.
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i above[7], below[7];
+  int row = 0;
+
+  load_highbd_pixel(s, 7, pitch, above, below);
+
+  highbd_lpf_internal_14_dual_sse2(above, below, _blimit0, _limit0, _thresh0,
+                                   _blimit1, _limit1, _thresh1, bd);
+
+  while (row < 6) {
+    uint16_t *const dst_above = s - (row + 1) * pitch;
+    uint16_t *const dst_below = s + row * pitch;
+    _mm_storeu_si128((__m128i *)dst_above, above[row]);
+    _mm_storeu_si128((__m128i *)dst_below, below[row]);
+    ++row;
+  }
+}
+
+// 6-tap high-bitdepth loop filter core for a single edge segment.
+//
+// Only the low 64 bits of *p2..*q2 contribute: each p/q pair is merged into
+// one register (p in the low half, q in the high half).
+//
+//   p2..q2     - input rows (read only).
+//   p1p0_out   - receives the filtered p rows: low 64 bits = p0 row,
+//                high 64 bits = p1 row.
+//   q1q0_out   - receives the filtered q rows: low 64 bits = q0 row,
+//                high 64 bits = q1 row.
+//   _blimit, _limit, _thresh, bd
+//              - forwarded to get_limit() (defined elsewhere).
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+ const uint8_t *_limit, const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[3];
+ __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+ __m128i flat_p1p0, flat_q0q1;
+
+ // Merge: low 64 bits = p row, high 64 bits = q row.
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ // The outputs are written straight to the caller; the flat branch below
+ // blends over them only in lanes where flat is set.
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask
+ // Max over |x2 - x0| and abs_p1p0, then over the p and q halves.
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ // Lanes whose maximum is <= (1 << (bd - 8)) become all-ones.
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ // Pack op0 (low half) with op1 (high half), then apply the >> 3.
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ // Pack oq0 (low half) with oq1 (high half), then apply the >> 3.
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
+
+ // Per-lane select: flat result where flat is set, filter4 result otherwise.
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+// 6-tap high-bitdepth loop filter core operating on full 128-bit registers
+// (eight 16-bit lanes per row), using two sets of limits.
+//
+//   p2, q2         - input rows (read only).
+//   p1, p0, q0, q1 - in/out rows; overwritten with the filtered values.
+//   _blimit0.._thresh1, bd
+//                  - forwarded to get_limit_dual() (defined elsewhere).
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+ const unsigned char *_thresh0, const unsigned char *_blimit1,
+ const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat, work;
+ __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+ __m128i op1, op0, oq0, oq1;
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ // All-ones constant, used below to invert a compare result.
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p2p1 = abs_diff16(*p2, *p1);
+ abs_p1p0 = abs_diff16(*p1, *p0);
+ abs_q1q0 = abs_diff16(*q1, *q0);
+ abs_q2q1 = abs_diff16(*q2, *q1);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ // Fold the neighbouring-sample differences into the same maximum; lanes
+ // whose maximum does not exceed limit0 end up all-ones.
+ mask = _mm_max_epi16(abs_q2q1, mask);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_max_epi16(mask, abs_p2p1);
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ // flat_mask
+ // `work` (max of abs_p1p0 and abs_q1q0) is reused from the mask computation.
+ flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+ flat = _mm_max_epi16(flat, work);
+
+ // Lanes whose maximum is <= (1 << (bd - 8)) become all-ones.
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+ _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ op1 = _mm_srli_epi16(workp_shft0, 3);
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
+ workp_a =
+ _mm_add_epi16(workp_a,
+ workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+ op0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+ *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+ workp_b = _mm_add_epi16(*q1, *q2);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+ *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_shft1 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ oq1 = _mm_srli_epi16(workp_shft1, 3);
+
+ // Per-lane select: flat result where flat is set, filter4 result
+ // otherwise. All sums above were taken before *p0..*q1 are overwritten.
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ // No lane is flat: return the filter4 outputs unchanged.
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+// Applies the 6-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Reads 64 bits from each of the three rows above `s` and the three rows
+// starting at `s` (rows are `p` uint16_t elements apart) and writes 64 bits
+// back to the two rows nearest the edge on each side.
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i *const row_p2 = (__m128i *)(s - 3 * p);
+  __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+  __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+  __m128i *const row_q0 = (__m128i *)(s + 0 * p);
+  __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+  __m128i *const row_q2 = (__m128i *)(s + 2 * p);
+
+  __m128i in_p2 = _mm_loadl_epi64(row_p2);
+  __m128i in_p1 = _mm_loadl_epi64(row_p1);
+  __m128i in_p0 = _mm_loadl_epi64(row_p0);
+  __m128i in_q0 = _mm_loadl_epi64(row_q0);
+  __m128i in_q1 = _mm_loadl_epi64(row_q1);
+  __m128i in_q2 = _mm_loadl_epi64(row_q2);
+  __m128i out_p1p0, out_q1q0;
+
+  highbd_lpf_internal_6_sse2(&in_p2, &in_p1, &in_p0, &in_q0, &in_q1, &in_q2,
+                             &out_p1p0, &out_q1q0, _blimit, _limit, _thresh,
+                             bd);
+
+  // Each output packs the row nearest the edge in its low 64 bits and the
+  // next row out in its high 64 bits.
+  _mm_storel_epi64(row_p1, _mm_srli_si128(out_p1p0, 8));
+  _mm_storel_epi64(row_p0, out_p1p0);
+  _mm_storel_epi64(row_q0, out_q1q0);
+  _mm_storel_epi64(row_q1, _mm_srli_si128(out_q1q0, 8));
+}
+
+// Dual-limit 6-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Reads a full 128-bit register (unaligned) from each of the three rows above
+// `s` and the three rows starting at `s`, and writes back the two rows
+// nearest the edge on each side.
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i *const row_p2 = (__m128i *)(s - 3 * p);
+  __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+  __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+  __m128i *const row_q0 = (__m128i *)(s + 0 * p);
+  __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+  __m128i *const row_q2 = (__m128i *)(s + 2 * p);
+
+  __m128i px2 = _mm_loadu_si128(row_p2);
+  __m128i px1 = _mm_loadu_si128(row_p1);
+  __m128i px0 = _mm_loadu_si128(row_p0);
+  __m128i qx0 = _mm_loadu_si128(row_q0);
+  __m128i qx1 = _mm_loadu_si128(row_q1);
+  __m128i qx2 = _mm_loadu_si128(row_q2);
+
+  // The core filter updates px1, px0, qx0 and qx1 in place.
+  highbd_lpf_internal_6_dual_sse2(&px2, &px1, &px0, &qx0, &qx1, &qx2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  _mm_storeu_si128(row_p1, px1);
+  _mm_storeu_si128(row_p0, px0);
+  _mm_storeu_si128(row_q0, qx0);
+  _mm_storeu_si128(row_q1, qx1);
+}
+
+// 8-tap high-bitdepth loop filter core for a single edge segment.
+//
+// Only the low 64 bits of the inputs contribute to the stored results: the
+// mask logic works on merged registers (p in the low half, q in the high
+// half) and the filter sums are re-merged with _mm_unpacklo_epi64.
+//
+//   p3, q3, p1, q1, p0, q0 - input rows (read only).
+//   p2, q2                 - in/out rows; rewritten only when some lane is
+//                            flat. *p2 then holds the blended p2 row in its
+//                            low half and the blended q2 row in its high
+//                            half; *q2 holds the blended q2 row in its low
+//                            half.
+//   q1q0_out, p1p0_out     - filtered rows, low 64 bits = row nearest the
+//                            edge (q0 / p0), high 64 bits = q1 / p1.
+//   _blimit, _limit, _thresh, bd
+//                          - forwarded to get_limit() (defined elsewhere).
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[4];
+ __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
+
+ // Merge: low 64 bits = p row, high 64 bits = q row.
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+ pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+ __m128i abs_p1p0;
+
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ // The outputs are written straight to the caller; the flat branch below
+ // blends over them only in lanes where flat is set.
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask4
+ // Max over |x2 - x0|, |x3 - x0| and abs_p1p0, then over the p and q halves.
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ // Lanes whose maximum is <= (1 << (bd - 8)) become all-ones.
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // True when at least one 16-bit lane of flat is non-zero.
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+ // workp_a / workp_b are running sums: each step below removes one sample
+ // and adds the next, so the statement order matters.
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ // Pack op0 (low half) with op1 (high half), then apply the >> 3.
+ flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ // Pack oq0 (low half) with oq1 (high half), then apply the >> 3.
+ flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ // Pack op2 (low half) with oq2 (high half), then apply the >> 3.
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
+
+ // Per-lane select: flat result where flat is set, filter4 result otherwise.
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+ // p2/q2: blend against the merged originals, then split the q half out.
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
+}
+
+// 8-tap high-bitdepth loop filter core operating on full 128-bit registers
+// (eight 16-bit lanes per row), using two sets of limits.
+//
+//   p3, q3         - input rows (read only).
+//   p1, p0, q0, q1 - in/out rows; always overwritten with filtered values.
+//   p2, q2         - in/out rows; rewritten only when some lane is flat.
+//   _blimit0.._thresh1, bd
+//                  - forwarded to get_limit_dual() (defined elsewhere).
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+ const unsigned char *_limit0, const unsigned char *_thresh0,
+ const unsigned char *_blimit1, const unsigned char *_limit1,
+ const unsigned char *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat;
+ __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+ __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ // All-ones constant, used below to invert a compare result.
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
+
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ // Fold the neighbouring-sample differences into the same maximum; lanes
+ // whose maximum does not exceed limit0 end up all-ones.
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+ work1 =
+ _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat
+ work0 = _mm_max_epi16(work0, work1);
+ work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+ work2 = _mm_max_epi16(work2, work0);
+ mask = _mm_max_epi16(work2, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ // Flat mask: work1 (max of |p1-p0| and |q1-q0|) is reused from above.
+ flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+ flat = _mm_max_epi16(work1, flat);
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+ flat = _mm_max_epi16(work0, flat);
+
+ // Lanes whose maximum is <= (1 << (bd - 8)) become all-ones.
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b;
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+ // workp_a / workp_b are running sums: each step below removes one sample
+ // and adds the next, so the statement order matters.
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // Per-lane select: flat result where flat is set, filter4 result (or
+ // the original row for p2/q2) otherwise. The inputs are overwritten only
+ // after every sum above has been formed.
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ // No lane is flat: return the filter4 outputs; *p2 / *q2 are untouched.
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+// Applies the 8-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Reads 64 bits from each of the four rows above `s` and the four rows
+// starting at `s` (rows are `p` uint16_t elements apart) and writes 64 bits
+// back to the three rows nearest the edge on each side.
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i *const row_p3 = (__m128i *)(s - 4 * p);
+  __m128i *const row_p2 = (__m128i *)(s - 3 * p);
+  __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+  __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+  __m128i *const row_q0 = (__m128i *)(s + 0 * p);
+  __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+  __m128i *const row_q2 = (__m128i *)(s + 2 * p);
+  __m128i *const row_q3 = (__m128i *)(s + 3 * p);
+
+  __m128i in_p3 = _mm_loadl_epi64(row_p3);
+  __m128i in_q3 = _mm_loadl_epi64(row_q3);
+  __m128i io_p2 = _mm_loadl_epi64(row_p2);
+  __m128i io_q2 = _mm_loadl_epi64(row_q2);
+  __m128i in_p1 = _mm_loadl_epi64(row_p1);
+  __m128i in_q1 = _mm_loadl_epi64(row_q1);
+  __m128i in_p0 = _mm_loadl_epi64(row_p0);
+  __m128i in_q0 = _mm_loadl_epi64(row_q0);
+  __m128i out_q1q0, out_p1p0;
+
+  // io_p2 / io_q2 may be updated in place by the core filter.
+  highbd_lpf_internal_8_sse2(&in_p3, &in_q3, &io_p2, &io_q2, &in_p1, &in_q1,
+                             &in_p0, &in_q0, &out_q1q0, &out_p1p0, _blimit,
+                             _limit, _thresh, bd);
+
+  // out_p1p0 / out_q1q0 pack the row nearest the edge in the low 64 bits and
+  // the next row out in the high 64 bits.
+  _mm_storel_epi64(row_p2, io_p2);
+  _mm_storel_epi64(row_p1, _mm_srli_si128(out_p1p0, 8));
+  _mm_storel_epi64(row_p0, out_p1p0);
+  _mm_storel_epi64(row_q0, out_q1q0);
+  _mm_storel_epi64(row_q1, _mm_srli_si128(out_q1q0, 8));
+  _mm_storel_epi64(row_q2, io_q2);
+}
+
+// Dual-limit 8-tap high-bitdepth loop filter across a horizontal edge.
+//
+// Reads a full 128-bit register (unaligned) from each of the four rows above
+// `s` and the four rows starting at `s`, and writes back the three rows
+// nearest the edge on each side.
+void aom_highbd_lpf_horizontal_8_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i *const row_p3 = (__m128i *)(s - 4 * p);
+  __m128i *const row_p2 = (__m128i *)(s - 3 * p);
+  __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+  __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+  __m128i *const row_q0 = (__m128i *)(s + 0 * p);
+  __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+  __m128i *const row_q2 = (__m128i *)(s + 2 * p);
+  __m128i *const row_q3 = (__m128i *)(s + 3 * p);
+
+  __m128i px3 = _mm_loadu_si128(row_p3);
+  __m128i qx3 = _mm_loadu_si128(row_q3);
+  __m128i px2 = _mm_loadu_si128(row_p2);
+  __m128i qx2 = _mm_loadu_si128(row_q2);
+  __m128i px1 = _mm_loadu_si128(row_p1);
+  __m128i qx1 = _mm_loadu_si128(row_q1);
+  __m128i px0 = _mm_loadu_si128(row_p0);
+  __m128i qx0 = _mm_loadu_si128(row_q0);
+
+  // The core filter updates px2..qx2 in place; px3 / qx3 are inputs only.
+  highbd_lpf_internal_8_dual_sse2(&px3, &qx3, &px2, &qx2, &px1, &qx1, &px0,
+                                  &qx0, _blimit0, _limit0, _thresh0, _blimit1,
+                                  _limit1, _thresh1, bd);
+
+  _mm_storeu_si128(row_p2, px2);
+  _mm_storeu_si128(row_p1, px1);
+  _mm_storeu_si128(row_p0, px0);
+  _mm_storeu_si128(row_q0, qx0);
+  _mm_storeu_si128(row_q1, qx1);
+  _mm_storeu_si128(row_q2, qx2);
+}
+
+// 4-tap high-bitdepth loop filter core for a single edge segment.
+//
+// The low 64 bits of *p0/*q0 and *p1/*q1 are merged (p in the low half, q in
+// the high half) and handed to the shared mask and filter4 helpers (both
+// defined elsewhere), which write the results to q1q0_out and p1p0_out.
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int bd) {
+  __m128i blimit_v, limit_v, thresh_v, t80_v;
+  __m128i merged[2];
+  __m128i in_p1p0, in_q1q0, absdiff_p1p0;
+  __m128i hev_mask, filter_mask;
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit_v, &limit_v, &thresh_v,
+            &t80_v);
+
+  merged[0] = _mm_unpacklo_epi64(*p0, *q0);
+  merged[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+  highbd_hev_filter_mask_x_sse2(merged, 2, &in_p1p0, &in_q1q0, &absdiff_p1p0,
+                                &limit_v, &blimit_v, &thresh_v, &hev_mask,
+                                &filter_mask);
+
+  highbd_filter4_sse2(&in_p1p0, &in_q1q0, &hev_mask, &filter_mask, q1q0_out,
+                      p1p0_out, &t80_v, bd);
+}
+
+// 4-tap high-bitdepth loop filter core operating on full 128-bit registers
+// (eight 16-bit lanes per row), using two sets of limits.
+//
+// Builds the filter mask from the differences between the four input rows
+// and passes it, with the rows, to highbd_filter4_dual_sse2() (defined
+// elsewhere). The input rows are not modified; results are returned through
+// ps and qs.
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i diff_p0q0 = abs_diff16(*q0, *p0);
+  const __m128i diff_p1q1 = abs_diff16(*q1, *p1);
+  const __m128i diff_p1p0 = abs_diff16(*p1, *p0);
+  const __m128i diff_q1q0 = abs_diff16(*q1, *q0);
+
+  // Comparing a register with itself sets every bit.
+  const __m128i all_ones = _mm_cmpeq_epi16(zero, zero);
+  const __m128i one = _mm_set1_epi16(1);
+
+  __m128i blimit_v, limit_v, thresh_v, t80_v;
+  __m128i in_p[2], in_q[2];
+  __m128i edge_metric, inner_max, mask;
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit_v, &limit_v, &thresh_v, &t80_v);
+
+  // filter_mask and hev_mask
+  inner_max = _mm_max_epi16(diff_p1p0, diff_q1q0);
+
+  // |p0 - q0| * 2 + |p1 - q1| / 2, with unsigned saturation.
+  edge_metric = _mm_adds_epu16(_mm_adds_epu16(diff_p0q0, diff_p0q0),
+                               _mm_srli_epi16(diff_p1q1, 1));
+
+  // All-ones in lanes where the edge metric exceeds blimit.
+  mask = _mm_subs_epu16(edge_metric, blimit_v);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), all_ones);
+  // Turn those lanes into (limit + 1) so the max / subtract below still
+  // rejects them.
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
+  mask = _mm_max_epi16(inner_max, mask);
+
+  // Final mask: all-ones in lanes whose maximum does not exceed limit.
+  mask = _mm_subs_epu16(mask, limit_v);
+  mask = _mm_cmpeq_epi16(mask, zero);
+
+  in_p[0] = *p0;
+  in_p[1] = *p1;
+  in_q[0] = *q0;
+  in_q[1] = *q1;
+
+  highbd_filter4_dual_sse2(in_p, in_q, ps, qs, &mask, &thresh_v, bd, &t80_v);
+}
+
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ // Filters 4 pixels (64 bits) per row across the edge between rows s - p
+ // and s; rows s - 2p and s + p are the outer taps.
+ __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+ __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+ __m128i *const row_q0 = (__m128i *)(s - 0 * p);
+ __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+ __m128i in_p1 = _mm_loadl_epi64(row_p1);
+ __m128i in_p0 = _mm_loadl_epi64(row_p0);
+ __m128i in_q0 = _mm_loadl_epi64(row_q0);
+ __m128i in_q1 = _mm_loadl_epi64(row_q1);
+ __m128i out_p1p0, out_q1q0;
+
+ highbd_lpf_internal_4_sse2(&in_p1, &in_p0, &in_q0, &in_q1, &out_q1q0,
+ &out_p1p0, _blimit, _limit, _thresh, bd);
+
+ // Low 64 bits go to the inner rows, the upper 64 bits to the outer rows.
+ _mm_storel_epi64(row_p1, _mm_srli_si128(out_p1p0, 8));
+ _mm_storel_epi64(row_p0, out_p1p0);
+ _mm_storel_epi64(row_q0, out_q1q0);
+ _mm_storel_epi64(row_q1, _mm_srli_si128(out_q1q0, 8));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ // Filters 8 pixels (128 bits) per row; rows s - 2p and s - p lie on one
+ // side of the edge, rows s and s + p on the other.
+ __m128i *const row_p1 = (__m128i *)(s - 2 * p);
+ __m128i *const row_p0 = (__m128i *)(s - 1 * p);
+ __m128i *const row_q0 = (__m128i *)(s - 0 * p);
+ __m128i *const row_q1 = (__m128i *)(s + 1 * p);
+ __m128i in_p1 = _mm_loadu_si128(row_p1);
+ __m128i in_p0 = _mm_loadu_si128(row_p0);
+ __m128i in_q0 = _mm_loadu_si128(row_q0);
+ __m128i in_q1 = _mm_loadu_si128(row_q1);
+ __m128i out_p[2], out_q[2];
+
+ highbd_lpf_internal_4_dual_sse2(&in_p1, &in_p0, &in_q0, &in_q1, out_p, out_q,
+ _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ // Element 0 is written next to the edge, element 1 to the outer row.
+ _mm_storeu_si128(row_p1, out_p[1]);
+ _mm_storeu_si128(row_p0, out_p[0]);
+ _mm_storeu_si128(row_q0, out_q[0]);
+ _mm_storeu_si128(row_q1, out_q[1]);
+}
+
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ // Works on 4 pixels per row, starting two pixels left of s, over 4 rows:
+ // load, transpose, filter, transpose back, store.
+ uint16_t *const base = s - 2;
+ __m128i row0, row1, row2, row3;
+ __m128i t0, t1, t2, t3;
+ __m128i flt_p1p0, flt_q1q0;
+ __m128i flt_p1, flt_q1;
+
+ row0 = _mm_loadl_epi64((__m128i *)(base + 0 * p));
+ row1 = _mm_loadl_epi64((__m128i *)(base + 1 * p));
+ row2 = _mm_loadl_epi64((__m128i *)(base + 2 * p));
+ row3 = _mm_loadl_epi64((__m128i *)(base + 3 * p));
+
+ highbd_transpose4x8_8x4_low_sse2(&row0, &row1, &row2, &row3, &t0, &t1, &t2,
+ &t3);
+ highbd_lpf_internal_4_sse2(&t0, &t1, &t2, &t3, &flt_q1q0, &flt_p1p0, blimit,
+ limit, thresh, bd);
+
+ // Split off the upper 64 bits so that four separate inputs can be
+ // transposed from 8x4 back to 4x8.
+ flt_p1 = _mm_srli_si128(flt_p1p0, 8);
+ flt_q1 = _mm_srli_si128(flt_q1q0, 8);
+ highbd_transpose4x8_8x4_low_sse2(&flt_p1, &flt_p1p0, &flt_q1q0, &flt_q1, &t0,
+ &t1, &t2, &t3);
+
+ _mm_storel_epi64((__m128i *)(base + 0 * p), t0);
+ _mm_storel_epi64((__m128i *)(base + 1 * p), t1);
+ _mm_storel_epi64((__m128i *)(base + 2 * p), t2);
+ _mm_storel_epi64((__m128i *)(base + 3 * p), t3);
+}
+
+void aom_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i ps[2], qs[2];
+ // Load 4 pixels (s-2 .. s+1) from each of 8 consecutive rows.
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+ // Transpose; d0..d3 are handed to the filter as p1, p0, q0, q1.
+ highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3);
+ // Filter with the thresholds of both edges; results come back in ps/qs.
+ highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+ // Transpose back into 8 rows d0..d7, inputs given in p1, p0, q0, q1 order.
+ highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+ &d3, &d4, &d5, &d6, &d7);
+ // Store 4 pixels per row back to where they were loaded from.
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ // Reads 8 pixels per row starting three pixels left of s (4 rows), filters
+ // the transposed data and writes 4 pixels per row starting at s - 2.
+ uint16_t *const src = s - 3;
+ uint16_t *const dst = s - 2;
+ __m128i row0, row1, row2, row3;
+ __m128i t[8];
+ __m128i flt_p1p0, flt_q1q0;
+ __m128i p_hi, q_hi;
+
+ row0 = _mm_loadu_si128((__m128i *)(src + 0 * p));
+ row1 = _mm_loadu_si128((__m128i *)(src + 1 * p));
+ row2 = _mm_loadu_si128((__m128i *)(src + 2 * p));
+ row3 = _mm_loadu_si128((__m128i *)(src + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&row0, &row1, &row2, &row3, &t[0], &t[1], &t[2],
+ &t[3], &t[4], &t[5], &t[6], &t[7]);
+ highbd_lpf_internal_6_sse2(&t[0], &t[1], &t[2], &t[3], &t[4], &t[5],
+ &flt_p1p0, &flt_q1q0, blimit, limit, thresh, bd);
+
+ // The upper 64 bits of each packed result are needed as separate inputs
+ // for the transpose back.
+ p_hi = _mm_srli_si128(flt_p1p0, 8);
+ q_hi = _mm_srli_si128(flt_q1q0, 8);
+ highbd_transpose4x8_8x4_low_sse2(&p_hi, &flt_p1p0, &flt_q1q0, &q_hi, &t[0],
+ &t[1], &t[2], &t[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * p), t[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * p), t[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * p), t[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * p), t[3]);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0, p1, q1, p2, q2;
+ // Load 8 pixels starting at s - 3 from each of 8 consecutive rows.
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+ // First six transpose outputs are p2..q2; d6/d7 take the last two.
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+ &p0, &q0, &q1, &q2, &d6, &d7);
+ // Filter through the pointers; p1, p0, q0, q1 are read back afterwards.
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+ // Only p1, p0, q0, q1 are transposed back, giving 8 rows d0..d7.
+ highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+ // Store 4 pixels per row starting at s - 2.
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i p2, p1, p0, p3, q0;
+ __m128i q1q0, p1p0;
+ // Load 8 pixels (s-4 .. s+3) from each of 4 consecutive rows.
+ p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+ p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+ p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
+ // Transpose the four loaded rows into d0..d7.
+ highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ // Loop filtering
+ highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+ &p1p0, blimit, limit, thresh, bd);
+ // p0/q0 are reused here for the upper 64 bits of p1p0/q1q0.
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+ // Transpose back; d0..d3 are overwritten with the four rows to store.
+ highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+ &d1, &d2, &d3);
+ // Write all 8 pixels of each row.
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ // Load 8 pixels (s-4 .. s+3) from each of 8 consecutive rows.
+ x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+ // Transpose the 8x8 block into d0..d7.
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3, &d4, &d5, &d6, &d7);
+ // Arguments pair up from the outside in: (d0,d7), (d1,d6), (d2,d5), (d3,d4).
+ highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+ blimit0, limit0, thresh0, blimit1, limit1,
+ thresh1, bd);
+ // Transpose d0..d7 back into rows x0..x7.
+ highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+ &x2, &x3, &x4, &x5, &x6, &x7);
+ // Store the 8 rows back to where they were loaded from.
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
+
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i q[7], p[7], pq[7];
+ __m128i p6, p5, p4, p3;
+ __m128i p6_2, p5_2, p4_2, p3_2;
+ __m128i d0, d1, d2, d3;
+ __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
+ // Left half: 8 pixels (s-8 .. s-1) from each of 4 consecutive rows.
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ // Transposed outputs: d0 (kept aside), then p[6] down to p[0].
+ highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+ &p[3], &p[2], &p[1], &p[0]);
+ // Right half: 8 pixels (s .. s+7) from the same 4 rows.
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ // Transposed outputs: q[0] up to q[6], then d7_2 (kept aside).
+ highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+ &q[3], &q[4], &q[5], &q[6], &d7_2);
+ // Filter; the results used below are read from pq[0..5].
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+ // Transpose the left half back from d0, p[6] and pq[5] .. pq[0].
+ highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+ &pq[1], &pq[0], &d0, &d1, &d2, &d3);
+ // q[i] = upper 64 bits of pq[i] for i = 0..5.
+ q[0] = _mm_srli_si128(pq[0], 8);
+ q[1] = _mm_srli_si128(pq[1], 8);
+ q[2] = _mm_srli_si128(pq[2], 8);
+ q[3] = _mm_srli_si128(pq[3], 8);
+ q[4] = _mm_srli_si128(pq[4], 8);
+ q[5] = _mm_srli_si128(pq[5], 8);
+ // Transpose the right half back from q[0] .. q[6] and d7_2.
+ highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+ &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+ // Store both 8-pixel halves of each of the 4 rows.
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i q[7], p[7];
+ __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+ __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+ __m128i d0, d7;
+ __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+ // Left half: 8 pixels (s-8 .. s-1) from each of 8 consecutive rows.
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+ // Transposed outputs: d0 (kept aside), then p[6] down to p[0].
+ highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+ &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+ // Right half: 8 pixels (s .. s+7) from the same 8 rows.
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+ // Transposed outputs: q[0] up to q[6], then d7 (kept aside).
+ highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+ &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+ &q[6], &d7);
+ // Filter; p[] and q[] are read back afterwards for the output transposes.
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+ // Transpose the left half back from d0 and p[6] .. p[0].
+ highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+ // Store the left 8 pixels of each row.
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+ // Transpose the right half back (the dN_out temporaries are reused).
+ highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+ // Store the right 8 pixels of each row.
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..950465cf46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ // Sign-extends the eight int16 values in *p to int32: values 0..3 land in
+ // the low 128-bit lane of *qp and values 4..7 in the high lane.
+ const __m128i src = *p;
+ const __m128i sign_bits = _mm_srai_epi16(src, 15);
+ const __m128i first4 = _mm_unpacklo_epi16(src, sign_bits);
+ const __m128i last4 = _mm_unpackhi_epi16(src, sign_bits);
+ const __m256i widened = _mm256_castsi128_si256(first4);
+ *qp = _mm256_insertf128_si256(widened, last4, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ // For each of the five parameter vectors, copy the high 128-bit lane into
+ // both lanes (selector 0x11 picks the first source's high lane twice).
+ __m256i *cur = qp;
+ __m256i *const end = qp + 5;
+ while (cur != end) {
+ *cur = _mm256_permute2x128_si256(*cur, *cur, 0x11);
+ ++cur;
+ }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr, __m256i *qp,
+ int log_scale) {
+ // qp[] layout: 0 = zbin, 1 = round, 2 = quant, 3 = dequant, 4 = quant_shift.
+ // Eight int16 entries are read from every table and widened to int32.
+ const int16_t *const tables[5] = { zbin_ptr, round_ptr, quant_ptr,
+ dequant_ptr, quant_shift_ptr };
+ int t;
+ for (t = 0; t < 5; ++t) {
+ const __m128i raw = _mm_loadu_si128((const __m128i *)tables[t]);
+ init_one_qp(&raw, &qp[t]);
+ }
+ if (log_scale > 0) {
+ // zbin and round are scaled down by log_scale bits with rounding:
+ // (v + (1 << (log_scale - 1))) >> log_scale.
+ const __m256i half = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_srai_epi32(_mm256_add_epi32(qp[0], half), log_scale);
+ qp[1] = _mm256_srai_epi32(_mm256_add_epi32(qp[1], half), log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Multiplies the eight int32 lanes of *x by those of *y, shifts each 64-bit
+// product right by 16 and returns the low 32 bits of each shifted product.
+static INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);  // even lanes: 64-bit products
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);  // odd lanes of x moved down
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);  // odd lanes of y
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);  // odd lanes: 64-bit products
+
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);  // keep the low 32 bits only
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);  // move back into the odd lanes
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);  // 32 -> 16 bit
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);  // low 128 bits: masks 0..7
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));  // 8 x int16
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm);  // minus -1
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm);  // else 0
+ return _mm256_max_epi16(eobmax, nz_iscan);  // per-lane running maximum
+}
+
+// Get the max eob from the lower 128 bits (eight int16 lanes).
+static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);  // upper 64 bits of the lane -> low
+ eob = _mm256_max_epi16(eob, eob_s);  // 8 candidates reduced to 4
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);  // words 2,3 -> words 0,1
+ eob = _mm256_max_epi16(eob, eob_s);  // 4 candidates reduced to 2
+ eob_s = _mm256_shufflelo_epi16(eob, 1);  // word 1 -> word 0
+ eob = _mm256_max_epi16(eob, eob_s);  // maximum now in word 0
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+}
+
+static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);  // even lanes: 64-bit products
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);  // odd lanes of x moved down
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);  // odd lanes of y
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);  // odd lanes: 64-bit products
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);  // >> (16 - log_scale)
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);  // keep the low 32 bits only
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);  // move back into the odd lanes
+ return _mm256_or_si256(prod_lo, prod_hi);  // eight int32 results
+}
+
+static AOM_FORCE_INLINE void quantize_logscale(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);  // 8 coeffs
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);  // |c| > qp[0]
+ // Fast path: no lane passes the zbin test, so all 8 outputs are zero.
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;  // *eob is left unchanged
+ }
+ // (|coeff| + round), forced to 0 in lanes that failed the zbin test.
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i abs_q =
+ mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);  // reapply coeff's sign
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ // Store the 8 quantized and 8 dequantized values.
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+ // Fold the scan positions of the non-zero lanes into the running maximum.
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+static AOM_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);  // 8 coeffs
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);  // |c| > qp[0]
+ // Fast path: no lane passes the zbin test, so all 8 outputs are zero.
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;  // *eob is left unchanged
+ }
+ // (|coeff| + round), forced to 0 in lanes that failed the zbin test.
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);  // * quant >> 16
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);  // * quant_shift >> 16
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);  // * dequant
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);  // reapply coeff's sign
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ // Store the 8 quantized and 8 dequantized values.
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+ // Fold the scan positions of the non-zero lanes into the running maximum.
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // Quantizes the block in groups of 8 coefficients (log_scale 0) and stores
+ // the resulting end-of-block value in *eob_ptr. `scan` is unused.
+ const int step = 8;
+ __m256i qp[5];
+ __m256i eob = _mm256_setzero_si256();
+ intptr_t offset;
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ // The first group uses the parameters exactly as init_qp() loaded them.
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ // All later groups use the values update_qp() copies from the upper lane.
+ update_qp(qp);
+
+ for (offset = step; offset < n_coeffs; offset += step) {
+ quantize(qp, coeff_ptr + offset, iscan + offset, qcoeff_ptr + offset,
+ dqcoeff_ptr + offset, &eob);
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // Quantizes the block in groups of 8 coefficients with log_scale = 1 and
+ // stores the resulting end-of-block value in *eob_ptr. `scan` is unused.
+ (void)scan;
+ // Signed on purpose: it is subtracted from the signed remaining count
+ // n_coeffs, whose sign terminates the loop below. This also matches the
+ // other quantize_b AVX2 entry points.
+ const int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ // The first group uses the parameters exactly as init_qp() loaded them.
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ // All later groups use the values update_qp() copies from the upper lane.
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ // Reduce the per-lane maxima to the single end-of-block value.
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // Quantizes the block in groups of 8 coefficients with log_scale = 2 and
+ // stores the resulting end-of-block value in *eob_ptr. `scan` is unused.
+ const int step = 8;
+ __m256i qp[5];
+ __m256i eob = _mm256_setzero_si256();
+ intptr_t offset;
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2);
+
+ // The first group uses the parameters exactly as init_qp() loaded them.
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
+
+ // All later groups use the values update_qp() copies from the upper lane.
+ update_qp(qp);
+
+ for (offset = step; offset < n_coeffs; offset += step) {
+ quantize_logscale(qp, coeff_ptr + offset, iscan + offset,
+ qcoeff_ptr + offset, dqcoeff_ptr + offset, &eob, 2);
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..3b0c42c4f5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ // zbins[0]: lane 0 = zbin_ptr[0], lanes 1..3 = zbin_ptr[1]; zbins[1]: all [1].
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+ // nzbins = 0 - zbins, i.e. the negated thresholds.
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+ // Outputs start as all zero; only lanes that pass the zbin test are written.
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+ // Walk backwards and drop trailing groups of 4 lying strictly inside +-zbin.
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));  // aligned load
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)  // all four lanes are inside the dead zone
+ non_zero_regs--;
+ else
+ break;
+ }
+ // Only the first non_zero_regs groups of 4 are quantized.
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+ // |coeff| = (coeff ^ sign) - sign, with sign = 0 or -1 per lane.
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);  // |coeff| >= zbin
+ test = _mm_movemask_epi8(tmp1);  // 4 mask bits per 32-bit lane
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+ // Scalar quantization of each lane that reached its zbin threshold.
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {  // lowest mask bit of lane j
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;  // stays 0 when nothing quantized to non-zero
+}
+
+void aom_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // Two passes: a SIMD pre-scan records the positions whose coefficient is
+ // not strictly inside (-zbin, zbin); a scalar pass quantizes only those.
+ const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ __m128i upper[2], lower[2];
+ int selected[1024];
+ int num_selected = 0;
+ int best_iscan = -1;
+ int group, n;
+ (void)scan;
+
+ // Index 0 is used for the first group of four, index 1 for all others.
+ upper[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);
+ upper[1] = _mm_set1_epi32(zbin_rest);
+ lower[0] = _mm_sub_epi32(_mm_setzero_si128(), upper[0]);
+ lower[1] = _mm_sub_epi32(_mm_setzero_si128(), upper[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (group = 0; group < n_coeffs / 4; group++) {
+ const int sel = (group != 0);
+ const __m128i vals =
+ _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
+ const __m128i inside = _mm_and_si128(_mm_cmplt_epi32(vals, upper[sel]),
+ _mm_cmpgt_epi32(vals, lower[sel]));
+ const int bits = _mm_movemask_epi8(inside);
+ int lane;
+ // Four mask bits per 32-bit lane; all clear means "outside the dead zone".
+ for (lane = 0; lane < 4; lane++) {
+ if (!(bits & (0xf << (4 * lane)))) {
+ selected[num_selected++] = group * 4 + lane;
+ }
+ }
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: num_selected can be zero.
+ for (n = 0; n < num_selected; n++) {
+ const int rc = selected[n];
+ const int tbl = (rc != 0);
+ const int coeff = coeff_ptr[rc];
+ const int sign = AOMSIGN(coeff);
+ const int magnitude = (coeff ^ sign) - sign;
+ const int64_t rounded =
+ magnitude + ROUND_POWER_OF_TWO(round_ptr[tbl], 1);
+ const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
+ const uint32_t abs_q = (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_q ^ (uint32_t)sign) - sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[tbl] / 2;
+ if (abs_q && iscan[rc] > best_iscan) best_iscan = iscan[rc];
+ }
+ *eob_ptr = best_iscan + 1;
+}
+
+void aom_highbd_quantize_b_64x64_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // Two passes: a SIMD pre-scan records the positions whose coefficient is
+ // not strictly inside (-zbin, zbin); a scalar pass quantizes only those.
+ const int zbin_first = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+ const int zbin_rest = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+ __m128i upper[2], lower[2];
+ int selected[1024];
+ int num_selected = 0;
+ int best_iscan = -1;
+ int group, n;
+ (void)scan;
+
+ // Index 0 is used for the first group of four, index 1 for all others.
+ upper[0] = _mm_set_epi32(zbin_rest, zbin_rest, zbin_rest, zbin_first);
+ upper[1] = _mm_set1_epi32(zbin_rest);
+ lower[0] = _mm_sub_epi32(_mm_setzero_si128(), upper[0]);
+ lower[1] = _mm_sub_epi32(_mm_setzero_si128(), upper[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (group = 0; group < n_coeffs / 4; group++) {
+ const int sel = (group != 0);
+ const __m128i vals =
+ _mm_load_si128((const __m128i *)(coeff_ptr + group * 4));
+ const __m128i inside = _mm_and_si128(_mm_cmplt_epi32(vals, upper[sel]),
+ _mm_cmpgt_epi32(vals, lower[sel]));
+ const int bits = _mm_movemask_epi8(inside);
+ int lane;
+ // Four mask bits per 32-bit lane; all clear means "outside the dead zone".
+ for (lane = 0; lane < 4; lane++) {
+ if (!(bits & (0xf << (4 * lane)))) {
+ selected[num_selected++] = group * 4 + lane;
+ }
+ }
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: num_selected can be zero.
+ for (n = 0; n < num_selected; n++) {
+ const int rc = selected[n];
+ const int tbl = (rc != 0);
+ const int coeff = coeff_ptr[rc];
+ const int sign = AOMSIGN(coeff);
+ const int magnitude = (coeff ^ sign) - sign;
+ const int64_t rounded =
+ magnitude + ROUND_POWER_OF_TWO(round_ptr[tbl], 2);
+ const int64_t scaled = ((rounded * quant_ptr[tbl]) >> 16) + rounded;
+ const uint32_t abs_q = (uint32_t)((scaled * quant_shift_ptr[tbl]) >> 14);
+ qcoeff_ptr[rc] = (int)(abs_q ^ (uint32_t)sign) - sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[tbl] / 4;
+ if (abs_q && iscan[rc] > best_iscan) best_iscan = iscan[rc];
+ }
+ *eob_ptr = best_iscan + 1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..03839b493c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,344 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD step for two 4-sample rows of src against ref1..ref4; dword partial sums live in m4..m7 (first == 1 overwrites them, otherwise they are added to)
+%macro HIGH_PROCESS_4x2x4 5-6 0 ; %6 (advance_at_end) defaults to 0
+ movh m0, [srcq +%2*2] ; first src row into the low half of m0; offsets are in samples, hence *2 bytes
+%if %1 == 1 ; first step: build the accumulators m4..m7 directly
+ movu m4, [ref1q+%3*2] ; first row of each reference (the high half is replaced below)
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2] ; second src row into the high half of m0
+ movhps m4, [ref1q+%5*2] ; second row of each reference into the high halves
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4 ; m3 = src - ref1 with unsigned saturation (0 where ref1 > src)
+ psubusw m2, m5
+ psubusw m4, m0 ; m4 = ref1 - src with unsigned saturation
+ psubusw m5, m0
+ por m4, m3 ; one of the two differences is 0, so the OR is |src - ref1|
+ por m5, m2
+ pmaddwd m4, m1 ; m1 is loaded by HIGH_SADNXN4D with 1 in every word: adds adjacent word pairs into dwords
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6 ; same absolute-difference sequence for ref3 / ref4
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else ; later steps: compute in scratch m2/m3, then add into m4..m7
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2] ; second src row into the high half of m0
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3 ; m2 = |src - ref1|
+ pmaddwd m2, m1
+ paddd m4, m2 ; accumulate ref1 partial sums
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2 ; accumulate ref2 partial sums
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2 ; accumulate ref3 partial sums
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2 ; accumulate ref4 partial sums
+%endif
+%if %6 == 1 ; advance_at_end: step every pointer by stride*4 bytes, i.e. two rows of 16-bit samples
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- SAD step for two 8-sample groups of src against ref1..ref4; dword partial sums live in m4..m7 (first == 1 overwrites them with the first group, otherwise both groups are added)
+%macro HIGH_PROCESS_8x2x4 5-6 0 ; %6 (advance_at_end) defaults to 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2] ; mova is the aligned move: src is presumably 16-byte aligned here -- TODO confirm against callers
+%if %1 == 1 ; first step: build the accumulators m4..m7 directly
+ movu m4, [ref1q+%3*2] ; references are loaded unaligned
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4 ; m3 = src - ref1 with unsigned saturation
+ psubusw m2, m5
+ psubusw m4, m0 ; m4 = ref1 - src with unsigned saturation
+ psubusw m5, m0
+ por m4, m3 ; one of the two differences is 0, so the OR is |src - ref1|
+ por m5, m2
+ pmaddwd m4, m1 ; m1 is loaded by HIGH_SADNXN4D with 1 in every word: adds adjacent word pairs into dwords
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6 ; same sequence for ref3 / ref4
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else ; later steps: compute in scratch m2/m3, then add into m4..m7
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3 ; m2 = |src - ref1|
+ mova m3, m0 ; reload the src copy for the next reference
+ pmaddwd m2, m1
+ paddd m4, m2 ; accumulate ref1 partial sums
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2 ; accumulate ref2 partial sums
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2 ; accumulate ref3 partial sums
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2 ; accumulate ref4 partial sums
+%endif
+
+ ; 2nd 8 px (always accumulated, whatever the value of "first")
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2] ; last memory read of this step
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1 ; advance_at_end: step every pointer by stride*4 bytes, i.e. two rows of 16-bit samples
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3 ; the remaining ref4 work uses registers only, so it can follow the pointer update
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 16-sample rows, built from two 8-wide steps
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) ; first row: samples 0-7 and 8-15, no pointer advance
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 ; second row: always accumulates; advances only if %6 == 1
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 32-sample rows, built from two 16-wide steps
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) ; first row: samples 0-15 and 16-31, no pointer advance
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 ; second row: always accumulates; advances only if %6 == 1
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -- two 64-sample rows, built from two 32-wide steps
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) ; first row: samples 0-31 and 32-63, no pointer advance
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 ; second row: always accumulates; advances only if %6 == 1
+%endmacro
+
+; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref[4], int ref_stride,
+;                                uint32_t res[4]);
+; Computes the SAD of one source block against four reference blocks and
+; stores the four 32-bit sums to res[0..3].
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, if 2, then skip every other row
+; Register use:
+;   m1      word multiplier (1 in every word) consumed by pmaddwd
+;   m4..m7  running dword sums for ref1..ref4
+;   m0, m2, m3  scratch
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
+%if AOM_ARCH_X86_64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                            res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                            ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%else ; %3 == 2, downsample
+%if AOM_ARCH_X86_64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                                  res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                                  ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%endif ; sad/skip
+
+; set m1: borrow the src register to broadcast 0x00010001 (1 in every word)
+  push                srcq
+  mov                 srcd, 0x00010001
+  movd                  m1, srcd
+  pshufd                m1, m1, 0x0
+  pop                 srcq
+
+%if %3 == 2 ; skip rows: doubling both strides visits every other row
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif ; skip rows
+  movsxdifnidn src_strideq, src_strided
+  movsxdifnidn ref_strideq, ref_strided
+; ref1q arrives holding the address of the ref[4] pointer array; it is read
+; last because loading ref[0] overwrites that address.
+  mov                ref2q, [ref1q+gprsize*1]
+  mov                ref3q, [ref1q+gprsize*2]
+  mov                ref4q, [ref1q+gprsize*3]
+  mov                ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+  shl                 srcq, 1
+  shl                ref2q, 1
+  shl                ref3q, 1
+  shl                ref4q, 1
+  shl                ref1q, 1
+
+; Every HIGH_PROCESS step covers two rows: one initialising step, num_rep
+; accumulating steps, then a final step that leaves the pointers alone.
+  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 2 ;  Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+; Drop the helper definition so it does not outlive this expansion (this
+; directive previously named "rep", which is never defined).
+%undef num_rep
+  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
+  ; Fold the upper two dwords of each accumulator onto the lower two.
+  movhlps               m0, m4
+  movhlps               m1, m5
+  movhlps               m2, m6
+  movhlps               m3, m7
+  paddd                 m4, m0
+  paddd                 m5, m1
+  paddd                 m6, m2
+  paddd                 m7, m3
+  ; Interleave ref1/ref2 and ref3/ref4, fold once more, then pack the four
+  ; totals into m4 in ref1..ref4 order.
+  punpckldq             m4, m5
+  punpckldq             m6, m7
+  movhlps               m0, m4
+  movhlps               m1, m6
+  paddd                 m4, m0
+  paddd                 m6, m1
+  punpcklqdq            m4, m6
+%if %3 == 2 ; skip rows: only half the rows were summed, so double the result
+  pslld                 m4, 1
+%endif
+  ; x86-32 loaded only four arguments, so fetch the res pointer now; r4 was
+  ; ref2 there, which is no longer needed.
+  movifnidn             r4, r4mp
+  movu                [r4], m4
+  RET
+%endmacro
+
+
+INIT_XMM sse2 ; x86inc setup for the SSE2/XMM instantiations that follow
+HIGH_SADNXN4D 64, 64 ; regular SAD: every row is summed
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+HIGH_SADNXN4D 4, 16
+HIGH_SADNXN4D 16, 4
+HIGH_SADNXN4D 8, 32
+HIGH_SADNXN4D 32, 8
+HIGH_SADNXN4D 16, 64
+HIGH_SADNXN4D 64, 16
+
+HIGH_SADNXN4D 64, 64, 2 ; skip variants: every other row is summed and the result doubled
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
+HIGH_SADNXN4D 4, 16, 2
+HIGH_SADNXN4D 8, 32, 2
+HIGH_SADNXN4D 32, 8, 2
+HIGH_SADNXN4D 16, 64, 2
+HIGH_SADNXN4D 64, 16, 2
+
+; Current code cannot handle the case when the height is downsampled to 2: HIGH_SADNXN4D always emits at least two 2-row steps, and num_rep = (height-8)/4 would be negative for height 4
+; HIGH_SADNXN4D 16, 4, 2
+; HIGH_SADNXN4D 8, 4, 2
+; HIGH_SADNXN4D 4, 4, 2
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..6c78eeeefb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
+// Returns the sum of the eight 32-bit lanes of *v.
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+  // Within each 128-bit half, fold the upper 64 bits onto the lower 64 bits.
+  const __m256i folded64 = _mm256_add_epi32(_mm256_srli_si256(*v, 8), *v);
+
+  // Fold the upper 128-bit half onto the lower one.
+  const __m128i folded128 =
+      _mm_add_epi32(_mm256_extracti128_si256(folded64, 1),
+                    _mm256_castsi256_si128(folded64));
+
+  // Fold lane 1 onto lane 0; lane 0 now holds the complete total.
+  const __m128i total =
+      _mm_add_epi32(folded128, _mm_srli_si128(folded128, 4));
+
+  return (unsigned int)_mm_cvtsi128_si32(total);
+}
+
+// Adds the absolute differences between four source vectors s[0..3] and four
+// reference vectors r[0..3] (16 samples of 16 bits each) to the eight 32-bit
+// partial sums in *sad_acc. Both arrays are used as scratch and overwritten.
+// NOTE(review): the four |s - r| values are summed in 16-bit lanes before
+// widening, which presumably relies on samples of at most 12 bits -- confirm.
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+                                            __m256i *sad_acc) {
+  const __m256i zero = _mm256_setzero_si256();
+  int k;
+
+  for (k = 0; k < 4; ++k) {
+    s[k] = _mm256_abs_epi16(_mm256_sub_epi16(s[k], r[k]));
+  }
+
+  // 16-bit sum of the four absolute-difference vectors.
+  s[0] = _mm256_add_epi16(
+      _mm256_add_epi16(_mm256_add_epi16(s[0], s[1]), s[2]), s[3]);
+
+  // Zero-extend to 32 bits, then merge the two halves.
+  r[0] = _mm256_unpacklo_epi16(s[0], zero);
+  r[1] = _mm256_unpackhi_epi16(s[0], zero);
+  r[0] = _mm256_add_epi32(r[0], r[1]);
+
+  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+// Accumulates the SAD of a 16x4 block into *sad_acc. With a null sec_ptr this
+// is the regular SAD; otherwise each reference row is first averaged with the
+// matching row of sec_ptr, whose rows are packed 16 samples apart.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+                           const uint16_t *ref_ptr, int ref_stride,
+                           const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int row;
+
+  for (row = 0; row < 4; ++row) {
+    s[row] =
+        _mm256_loadu_si256((const __m256i *)(src_ptr + row * src_stride));
+    r[row] =
+        _mm256_loadu_si256((const __m256i *)(ref_ptr + row * ref_stride));
+  }
+
+  if (sec_ptr) {
+    for (row = 0; row < 4; ++row) {
+      const __m256i second =
+          _mm256_loadu_si256((const __m256i *)(sec_ptr + 16 * row));
+      r[row] = _mm256_avg_epu16(r[row], second);
+    }
+  }
+  highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
+
+// SAD of a 16-wide, N-row block, gathered four rows at a time.
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  __m256i acc = _mm256_setzero_si256();
+  int rows_done = 0;
+
+  while (rows_done < N) {
+    sad16x4(src_row, src_stride, ref_row, ref_stride, NULL, &acc);
+    src_row += src_stride << 2;
+    ref_row += ref_stride << 2;
+    rows_done += 4;
+  }
+  return (unsigned int)get_sad_from_mm256_epi32(&acc);
+}
+
+// Accumulates the SAD of a 32x4 block into *sad_acc, two rows per pass. When
+// sec_ptr is non-null the reference is first averaged with sec_ptr, which is
+// read as packed rows of 32 samples.
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int pass, k;
+
+  for (pass = 0; pass < 2; ++pass) {
+    // k = 0, 1: left/right half of the first row; k = 2, 3: second row.
+    for (k = 0; k < 4; ++k) {
+      const int col = (k & 1) * 16;
+      s[k] = _mm256_loadu_si256(
+          (const __m256i *)(src_ptr + (k >> 1) * src_stride + col));
+      r[k] = _mm256_loadu_si256(
+          (const __m256i *)(ref_ptr + (k >> 1) * ref_stride + col));
+    }
+
+    if (sec_ptr) {
+      for (k = 0; k < 4; ++k) {
+        const __m256i second =
+            _mm256_loadu_si256((const __m256i *)(sec_ptr + 16 * k));
+        r[k] = _mm256_avg_epu16(r[k], second);
+      }
+      sec_ptr += 32 << 1;  // two packed rows of 32 samples
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+
+    src_ptr += src_stride << 1;
+    ref_ptr += ref_stride << 1;
+  }
+}
+
+// SAD of a 32-wide, N-row block, gathered four rows at a time.
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const int log2_rows_per_call = 2;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  __m256i acc = _mm256_setzero_si256();
+  int rows_done = 0;
+
+  while (rows_done < N) {
+    sad32x4(src_row, src_stride, ref_row, ref_stride, NULL, &acc);
+    src_row += src_stride << log2_rows_per_call;
+    ref_row += ref_stride << log2_rows_per_call;
+    rows_done += 4;
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// Accumulates the SAD of a 64x2 block into *sad_acc, one full row per pass.
+// When sec_ptr is non-null the reference is first averaged with sec_ptr,
+// which is read as packed rows of 64 samples.
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int row, k;
+
+  for (row = 0; row < 2; ++row) {
+    // Four vectors of 16 samples cover the 64-sample row.
+    for (k = 0; k < 4; ++k) {
+      s[k] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16 * k));
+      r[k] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16 * k));
+    }
+    if (sec_ptr) {
+      for (k = 0; k < 4; ++k) {
+        const __m256i second =
+            _mm256_loadu_si256((const __m256i *)(sec_ptr + 16 * k));
+        r[k] = _mm256_avg_epu16(r[k], second);
+      }
+      sec_ptr += 64;
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+// SAD of a 64-wide, N-row block, gathered two rows at a time.
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const int log2_rows_per_call = 1;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  __m256i acc = _mm256_setzero_si256();
+  int rows_done = 0;
+
+  while (rows_done < N) {
+    sad64x2(src_row, src_stride, ref_row, ref_stride, NULL, &acc);
+    src_row += src_stride << log2_rows_per_call;
+    ref_row += ref_stride << log2_rows_per_call;
+    rows_done += 2;
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// Accumulates the SAD of one 128-sample row into *sad_acc, 64 samples per
+// pass. When sec_ptr is non-null the reference is first averaged with the
+// matching 128 samples of sec_ptr.
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+                     const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int half, k;
+
+  for (half = 0; half < 2; ++half) {
+    // Four vectors of 16 samples cover one 64-sample half row.
+    for (k = 0; k < 4; ++k) {
+      s[k] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16 * k));
+      r[k] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16 * k));
+    }
+    if (sec_ptr) {
+      for (k = 0; k < 4; ++k) {
+        const __m256i second =
+            _mm256_loadu_si256((const __m256i *)(sec_ptr + 16 * k));
+        r[k] = _mm256_avg_epu16(r[k], second);
+      }
+      sec_ptr += 64;
+    }
+    highbd_sad16x4_core_avx2(s, r, sad_acc);
+    src_ptr += 64;
+    ref_ptr += 64;
+  }
+}
+
+// SAD of a 128-wide, N-row block, gathered one row at a time.
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
+    int N, const uint8_t *src, int src_stride, const uint8_t *ref,
+    int ref_stride) {
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  __m256i acc = _mm256_setzero_si256();
+  int row;
+
+  for (row = 0; row < N; ++row) {
+    sad128x1(src_row, ref_row, NULL, &acc);
+    src_row += src_stride;
+    ref_row += ref_stride;
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// Defines aom_highbd_sad<m>x<n>_avx2(): SAD of an m x n block, computed by
+// the width-m row helper over n rows.
+#define HIGHBD_SADMXN_AVX2(m, n)                                         \
+  unsigned int aom_highbd_sad##m##x##n##_avx2(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref,            \
+      int ref_stride) {                                                  \
+    const unsigned int block_sad =                                       \
+        aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
+    return block_sad;                                                    \
+  }
+
+// Defines aom_highbd_sad_skip_<m>x<n>_avx2(): SAD over every other row
+// (n / 2 rows at twice the stride), doubled to approximate the full block.
+#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n)                                   \
+  unsigned int aom_highbd_sad_skip_##m##x##n##_avx2(                     \
+      const uint8_t *src, int src_stride, const uint8_t *ref,            \
+      int ref_stride) {                                                  \
+    const unsigned int sampled_sad = aom_highbd_sad##m##xN_avx2(         \
+        (n / 2), src, 2 * src_stride, ref, 2 * ref_stride);              \
+    return 2 * sampled_sad;                                              \
+  }
+
+HIGHBD_SADMXN_AVX2(16, 4)  // regular SAD, 16-wide blocks
+HIGHBD_SADMXN_AVX2(16, 8)
+HIGHBD_SADMXN_AVX2(16, 16)
+HIGHBD_SADMXN_AVX2(16, 32)
+HIGHBD_SADMXN_AVX2(16, 64)
+
+HIGHBD_SADMXN_AVX2(32, 8)  // regular SAD, 32-wide blocks
+HIGHBD_SADMXN_AVX2(32, 16)
+HIGHBD_SADMXN_AVX2(32, 32)
+HIGHBD_SADMXN_AVX2(32, 64)
+
+HIGHBD_SADMXN_AVX2(64, 16)  // regular SAD, 64-wide blocks
+HIGHBD_SADMXN_AVX2(64, 32)
+HIGHBD_SADMXN_AVX2(64, 64)
+HIGHBD_SADMXN_AVX2(64, 128)
+
+HIGHBD_SADMXN_AVX2(128, 64)  // regular SAD, 128-wide blocks
+HIGHBD_SADMXN_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 8)  // row-skipping SAD; no 16x4 variant is instantiated
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 8)  // calls the 32-wide helper with N = 4, i.e. a single four-row pass
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 128)
+
+// SAD between a 16x4 source block and the average of ref and second_pred.
+unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred) {
+  __m256i acc = _mm256_setzero_si256();
+
+  sad16x4(CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(ref),
+          ref_stride, CONVERT_TO_SHORTPTR(second_pred), &acc);
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// SAD between a 16x8 source block and the average of ref and second_pred,
+// computed as two 16x4 strips.
+unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred) {
+  uint16_t *const src_top = CONVERT_TO_SHORTPTR(src);
+  uint16_t *const ref_top = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *const sec_top = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int strip;
+
+  for (strip = 0; strip < 2; ++strip) {
+    // Strip 1 starts four rows down; second_pred rows are 16 samples wide,
+    // so four of them span 64 samples.
+    sad16x4(src_top + strip * (src_stride << 2), src_stride,
+            ref_top + strip * (ref_stride << 2), ref_stride,
+            sec_top + strip * 64, &acc);
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// 16x16 averaged SAD, computed as the sum of two 16x8 halves.
+// NOTE(review): the offsets below are applied to the uint8_t handles in
+// sample units; presumably CONVERT_TO_SHORTPTR makes one handle step equal
+// one 16-bit sample -- confirm against its definition.
+unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 3;  // each half covers 8 rows
+  const uint32_t upper = aom_highbd_sad16x8_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad16x8_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (16 << log2_half_rows));
+  return upper + lower;
+}
+
+// 16x32 averaged SAD, computed as the sum of two 16x16 halves.
+unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 4;  // each half covers 16 rows
+  const uint32_t upper = aom_highbd_sad16x16_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad16x16_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (16 << log2_half_rows));
+  return upper + lower;
+}
+
+// 16x64 averaged SAD, computed as the sum of two 16x32 halves.
+unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 5;  // each half covers 32 rows
+  const uint32_t upper = aom_highbd_sad16x32_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad16x32_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (16 << log2_half_rows));
+  return upper + lower;
+}
+
+// SAD between a 32x8 source block and the average of ref and second_pred,
+// computed as two strips of four rows.
+unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred) {
+  const int log2_strip_rows = 2;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *sec_row = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int strip;
+
+  for (strip = 0; strip < 2; ++strip) {
+    sad32x4(src_row, src_stride, ref_row, ref_stride, sec_row, &acc);
+    src_row += src_stride << log2_strip_rows;
+    ref_row += ref_stride << log2_strip_rows;
+    sec_row += 32 << log2_strip_rows;  // second_pred rows are 32 samples wide
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// SAD between a 32x16 source block and the average of ref and second_pred,
+// computed as four strips of four rows.
+unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_strip_rows = 2;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *sec_row = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int strip;
+
+  for (strip = 0; strip < 4; ++strip) {
+    sad32x4(src_row, src_stride, ref_row, ref_stride, sec_row, &acc);
+    src_row += src_stride << log2_strip_rows;
+    ref_row += ref_stride << log2_strip_rows;
+    sec_row += 32 << log2_strip_rows;  // second_pred rows are 32 samples wide
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// 32x32 averaged SAD, computed as the sum of two 32x16 halves.
+unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 4;  // each half covers 16 rows
+  const uint32_t upper = aom_highbd_sad32x16_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad32x16_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (32 << log2_half_rows));
+  return upper + lower;
+}
+
+// 32x64 averaged SAD, computed as the sum of two 32x32 halves.
+unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 5;  // each half covers 32 rows
+  const uint32_t upper = aom_highbd_sad32x32_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad32x32_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (32 << log2_half_rows));
+  return upper + lower;
+}
+
+// SAD between a 64x16 source block and the average of ref and second_pred,
+// computed as eight strips of two rows.
+unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_strip_rows = 1;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *sec_row = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int strip;
+
+  for (strip = 0; strip < 8; ++strip) {
+    sad64x2(src_row, src_stride, ref_row, ref_stride, sec_row, &acc);
+    src_row += src_stride << log2_strip_rows;
+    ref_row += ref_stride << log2_strip_rows;
+    sec_row += 64 << log2_strip_rows;  // second_pred rows are 64 samples wide
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// SAD between a 64x32 source block and the average of ref and second_pred,
+// computed as sixteen strips of two rows.
+unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_strip_rows = 1;
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *sec_row = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int strip;
+
+  for (strip = 0; strip < 16; ++strip) {
+    sad64x2(src_row, src_stride, ref_row, ref_stride, sec_row, &acc);
+    src_row += src_stride << log2_strip_rows;
+    ref_row += ref_stride << log2_strip_rows;
+    sec_row += 64 << log2_strip_rows;  // second_pred rows are 64 samples wide
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// 64x64 averaged SAD, computed as the sum of two 64x32 halves.
+unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred) {
+  const int log2_half_rows = 5;  // each half covers 32 rows
+  const uint32_t upper = aom_highbd_sad64x32_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad64x32_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (64 << log2_half_rows));
+  return upper + lower;
+}
+
+// 64x128 averaged SAD, computed as the sum of two 64x64 halves.
+unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
+                                           const uint8_t *ref, int ref_stride,
+                                           const uint8_t *second_pred) {
+  const int log2_half_rows = 6;  // each half covers 64 rows
+  const uint32_t upper = aom_highbd_sad64x64_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const uint32_t lower = aom_highbd_sad64x64_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (64 << log2_half_rows));
+  return upper + lower;
+}
+
+// SAD between a 128x64 source block and the average of ref and second_pred,
+// computed one row at a time.
+unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
+                                           const uint8_t *ref, int ref_stride,
+                                           const uint8_t *second_pred) {
+  uint16_t *src_row = CONVERT_TO_SHORTPTR(src);
+  uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref);
+  uint16_t *sec_row = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i acc = _mm256_setzero_si256();
+  int row;
+
+  for (row = 0; row < 64; ++row) {
+    sad128x1(src_row, ref_row, sec_row, &acc);
+    src_row += src_stride;
+    ref_row += ref_stride;
+    sec_row += 16 << 3;  // one packed second_pred row of 128 samples
+  }
+  return get_sad_from_mm256_epi32(&acc);
+}
+
+// 128x128 averaged SAD, computed as the sum of two 128x64 halves.
+unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            const uint8_t *second_pred) {
+  const int log2_half_rows = 6;  // each half covers 64 rows
+  const unsigned int upper = aom_highbd_sad128x64_avg_avx2(
+      src, src_stride, ref, ref_stride, second_pred);
+  const unsigned int lower = aom_highbd_sad128x64_avg_avx2(
+      src + (src_stride << log2_half_rows), src_stride,
+      ref + (ref_stride << log2_half_rows), ref_stride,
+      second_pred + (128 << log2_half_rows));
+  return upper + lower;
+}
+
+// SAD 4D
+// Reduces four accumulators v[0..3], each holding eight 32-bit partial sums,
+// to one total apiece and stores the four totals to res[0..3].
+static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
+                                               uint32_t *res) {
+  // Presumably keeps the low 32 bits of every 64-bit lane -- confirm against
+  // yy_set1_64_from_32i.
+  const __m256i mask = yy_set1_64_from_32i(~0);
+  __m256i pair_sum[4];
+  __m256i sums01, sums23, merged;
+  __m128i sad;
+  int k;
+
+  // Add each odd 32-bit lane onto the even lane below it, then clear the
+  // lanes that the mask removes.
+  for (k = 0; k < 4; ++k) {
+    pair_sum[k] = _mm256_and_si256(
+        _mm256_add_epi32(_mm256_srli_si256(v[k], 4), v[k]), mask);
+  }
+
+  // Shift the sums of v[1] / v[3] up one lane and merge them with the sums of
+  // v[0] / v[2].
+  sums01 = _mm256_or_si256(pair_sum[0], _mm256_slli_si256(pair_sum[1], 4));
+  sums23 = _mm256_or_si256(pair_sum[2], _mm256_slli_si256(pair_sum[3], 4));
+
+  // Add the low and high 64-bit halves, which orders the lanes as
+  // v[0], v[1], v[2], v[3] within each 128-bit half.
+  merged = _mm256_add_epi32(_mm256_unpacklo_epi64(sums01, sums23),
+                            _mm256_unpackhi_epi64(sums01, sums23));
+
+  // Add the two 128-bit halves to obtain the four totals.
+  sad = _mm_add_epi32(_mm256_extractf128_si256(merged, 1),
+                      _mm256_castsi256_si128(merged));
+  _mm_storeu_si128((__m128i *)res, sad);
+}
+
+// Converts the four reference handles in ref8[] to 16-bit sample pointers.
+static void convert_pointers(const uint8_t *const ref8[],
+                             const uint16_t *ref[]) {
+  int k;
+  for (k = 0; k < 4; ++k) {
+    ref[k] = CONVERT_TO_SHORTPTR(ref8[k]);
+  }
+}
+
+// Clears the four SAD accumulators s[0..3].
+static void init_sad(__m256i *s) {
+  const __m256i zero = _mm256_setzero_si256();
+  int k;
+  for (k = 0; k < 4; ++k) {
+    s[k] = zero;
+  }
+}
+
+// Computes the SAD of one M x N source block against the first D references
+// in ref_array and writes four totals to sad_array. Accumulators for
+// references at index D and above stay zero, so those outputs are written
+// as 0.
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+    int M, int N, int D, const uint8_t *src, int src_stride,
+    const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
+  // Rows consumed per helper call: 1 for M == 128, 2 for M == 64, else 4.
+  const int log2_rows_per_call = (M < 128) + (M < 64);
+  const int rows_per_call = 1 << log2_rows_per_call;
+  const uint16_t *const src_base = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *ref_cursor[4];
+  __m256i acc[4];
+  int cand, row;
+
+  init_sad(acc);
+  convert_pointers(ref_array, ref_cursor);
+
+  for (cand = 0; cand < D; ++cand) {
+    // Every candidate is compared against the same source block.
+    const uint16_t *src_cursor = src_base;
+    for (row = 0; row < N; row += rows_per_call) {
+      switch (M) {
+        case 128:
+          sad128x1(src_cursor, ref_cursor[cand], NULL, &acc[cand]);
+          break;
+        case 64:
+          sad64x2(src_cursor, src_stride, ref_cursor[cand], ref_stride, NULL,
+                  &acc[cand]);
+          break;
+        case 32:
+          sad32x4(src_cursor, src_stride, ref_cursor[cand], ref_stride, NULL,
+                  &acc[cand]);
+          break;
+        case 16:
+          sad16x4(src_cursor, src_stride, ref_cursor[cand], ref_stride, NULL,
+                  &acc[cand]);
+          break;
+        default: assert(0); break;
+      }
+      src_cursor += src_stride << log2_rows_per_call;
+      ref_cursor[cand] += ref_stride << log2_rows_per_call;
+    }
+  }
+  get_4d_sad_from_mm256_epi32(acc, sad_array);
+}
+
+// Defines aom_highbd_sad<m>x<n>x4d_avx2(): SAD of one source block against
+// four references.
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n)                                        \
+  void aom_highbd_sad##m##x##n##x4d_avx2(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                              \
+    aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array,           \
+                             ref_stride, sad_array);                        \
+  }
+
+// Defines aom_highbd_sad_skip_<m>x<n>x4d_avx2(): as above, but sums every
+// other row (n / 2 rows at twice the stride) and doubles each result.
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n)                                   \
+  void aom_highbd_sad_skip_##m##x##n##x4d_avx2(                             \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                              \
+    int k;                                                                  \
+    aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+                             2 * ref_stride, sad_array);                    \
+    for (k = 0; k < 4; ++k) {                                               \
+      sad_array[k] <<= 1;                                                   \
+    }                                                                       \
+  }
+
+// Defines aom_highbd_sad<m>x<n>x3d_avx2(): SAD against the first three
+// references only; all four sad_array entries are still written.
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n)                                        \
+  void aom_highbd_sad##m##x##n##x3d_avx2(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                              \
+    aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array,           \
+                             ref_stride, sad_array);                        \
+  }
+
+HIGHBD_SAD_MXNX4D_AVX2(16, 4)  // four-reference SAD, all rows
+HIGHBD_SAD_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8)  // four-reference SAD over every other row; no 16x4 variant is instantiated
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)  // three-reference SAD, all rows
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..3dc4e4e0a2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,524 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments for HIGH_SAD_FN (shared function prologue of the SAD kernels below)
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Base number of general purpose registers: 5, or 7 (adds src_stride3/ref_stride3)
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7 ; 4 mandatory args; the optional 5th defaults to 7
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%elif %4 == 1 ; avg: takes second_pred as a fifth argument
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if AOM_ARCH_X86_64 ; n_rows is not in the name list above, so alias it by hand to the extra register
+%define n_rowsd r7d
+%else ; x86-32: no extra register requested; the row counter lives in memory at r0m (x86inc's stack slot for argument 0)
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
+ movsxdifnidn src_strideq, src_strided ; x86inc helper: sign-extend the 32-bit stride to pointer width when needed
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3] ; src_stride3 = 3 * src_stride
+ lea ref_stride3q, [ref_strideq*3] ; ref_stride3 = 3 * ref_stride
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0 ; %1 = height, %2 = function type (0 sad [default], 1 avg, 2 skip rows)
+ HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1 ; one row is processed per loop iteration
+%endif
+ pxor m0, m0 ; m0 = SAD accumulator (dword lanes)
+ pxor m6, m6 ; m6 = zero, used to widen words to dwords
+
+.loop:
+ ; first half of each row
+ movu m1, [refq] ; 4 x 16 bytes = 32 ref samples of 16 bits each
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; avg variant: ref = rounded average of ref and second_pred
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1 ; max(src - ref, 0)
+ psubusw m1, [srcq] ; max(ref - src, 0)
+ por m1, m5 ; one of the two is zero, so OR gives |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2 ; combine absolute differences while still in word lanes
+ paddw m3, m4
+ movhlps m2, m1 ; bring the upper 4 words down ...
+ movhlps m4, m3
+ paddw m1, m2 ; ... and fold them onto the lower 4
+ paddw m3, m4
+ punpcklwd m1, m6 ; zero-extend the 4 low word sums to dwords
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2] ; advance one row: stride is scaled by 2 bytes per sample
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0 ; horizontal add of the four dword partial sums ...
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1 ; ... total ends up in the low dword of m0
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0 ; return the SAD in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc: use XMM registers and give the functions below the _sse2 suffix
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
+
+; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0 ; %1 = height, %2 = function type (0 sad [default], 1 avg, 2 skip rows)
+ HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1 ; one row is processed per loop iteration
+%endif
+ pxor m0, m0 ; m0 = SAD accumulator (dword lanes)
+ pxor m6, m6 ; m6 = zero, used to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; one full row: 4 x 16 bytes = 32 samples of 16 bits each
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; avg variant: ref = rounded average of ref and second_pred
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1 ; max(src - ref, 0)
+ psubusw m1, [srcq] ; max(ref - src, 0)
+ por m1, m5 ; OR of the two saturating differences = |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2 ; combine absolute differences in word lanes
+ paddw m3, m4
+ movhlps m2, m1 ; fold the upper 4 words onto the lower 4
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6 ; zero-extend the 4 low word sums to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2] ; advance one row (2 bytes per sample)
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0 ; horizontal add of the four dword partial sums ...
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1 ; ... total ends up in the low dword of m0
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0 ; return the SAD in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc: use XMM registers and give the functions below the _sse2 suffix
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2
+
+; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0 ; %1 = height, %2 = function type (0 sad [default], 1 avg, 2 skip rows)
+ HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2 ; two rows are processed per loop iteration
+%endif
+ pxor m0, m0 ; m0 = SAD accumulator (dword lanes)
+ pxor m6, m6 ; m6 = zero, used to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; first row of the pair: 2 x 16 bytes = 16 samples
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2] ; second row of the pair, one stride further on
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; avg variant: ref = rounded average of ref and second_pred
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1 ; max(src - ref, 0)
+ psubusw m1, [srcq] ; max(ref - src, 0)
+ por m1, m5 ; OR of the two saturating differences = |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2 ; combine absolute differences in word lanes
+ paddw m3, m4
+ movhlps m2, m1 ; fold the upper 4 words onto the lower 4
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6 ; zero-extend the 4 low word sums to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4] ; advance two rows (2 bytes per sample)
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0 ; horizontal add of the four dword partial sums ...
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1 ; ... total ends up in the low dword of m0
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0 ; return the SAD in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc: use XMM registers and give the functions below the _sse2 suffix
+HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
+HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2
+
+; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0 ; %1 = height, %2 = function type (0 sad [default], 1 avg, 2 skip rows)
+ HIGH_SAD_FN 8, %1, 7, %2, 8
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4 ; four rows are processed per loop iteration
+%endif
+ pxor m0, m0 ; m0 = SAD accumulator (dword lanes)
+ pxor m6, m6 ; m6 = zero, used to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; rows 0..3 of this group, 8 samples (16 bytes) each
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; avg variant: ref = rounded average of ref and second_pred
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m7, m1 ; keep a copy of ref; m7 is the extra (8th) xmm register this macro requests
+ movu m5, [srcq]
+ psubusw m1, m5 ; max(ref - src, 0)
+ psubusw m5, m7 ; max(src - ref, 0)
+ por m1, m5 ; OR of the two saturating differences = |src - ref|
+
+ mova m7, m2
+ movu m5, [srcq+src_strideq*2]
+ psubusw m2, m5
+ psubusw m5, m7
+ por m2, m5
+
+ mova m7, m3
+ movu m5, [srcq+src_strideq*4]
+ psubusw m3, m5
+ psubusw m5, m7
+ por m3, m5
+
+ mova m7, m4
+ movu m5, [srcq+src_stride3q*2]
+ psubusw m4, m5
+ psubusw m5, m7
+ por m4, m5
+
+ paddw m1, m2 ; combine absolute differences in word lanes
+ paddw m3, m4
+ movhlps m2, m1 ; fold the upper 4 words onto the lower 4
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6 ; zero-extend the 4 low word sums to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8] ; advance four rows (2 bytes per sample)
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0 ; horizontal add of the four dword partial sums ...
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1 ; ... total ends up in the low dword of m0
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0 ; return the SAD in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc: use XMM registers and give the functions below the _sse2 suffix
+HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD8XN 4, 2 ; highbd_sad_skip_8x4_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD4XN 1-2 0 ; %1 = height, %2 = function type (0 sad [default], 1 avg, 2 skip rows)
+ HIGH_SAD_FN 4, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4 ; four rows are processed per loop iteration
+%endif
+ pxor m0, m0 ; m0 = SAD accumulator (dword lanes)
+ pxor m6, m6 ; m6 = zero, used to widen words to dwords
+
+.loop:
+ movq m1, [refq] ; rows 0..3 of this group, 4 samples (8 bytes) each
+ movq m2, [refq+ref_strideq*2]
+ movq m3, [refq+ref_strideq*4]
+ movq m4, [refq+ref_stride3q*2]
+ punpcklwd m1, m3 ; m1 = rows 0 and 2 interleaved word by word
+ punpcklwd m2, m4 ; m2 = rows 1 and 3 interleaved word by word
+%if %2 == 1
+ movq m3, [second_predq+8*0] ; second_pred is read as 4 consecutive 8-byte rows, interleaved the same way
+ movq m5, [second_predq+8*2]
+ punpcklwd m3, m5
+ movq m4, [second_predq+8*1]
+ movq m5, [second_predq+8*3]
+ punpcklwd m4, m5
+ lea second_predq, [second_predq+8*4]
+ pavgw m1, m3 ; avg variant: ref = rounded average of ref and second_pred
+ pavgw m2, m4
+%endif
+ movq m5, [srcq] ; src rows 0 and 2, interleaved to match m1
+ movq m3, [srcq+src_strideq*4]
+ punpcklwd m5, m3
+ movdqa m3, m1
+ psubusw m1, m5 ; max(ref - src, 0)
+ psubusw m5, m3 ; max(src - ref, 0)
+ por m1, m5 ; OR of the two saturating differences = |src - ref|
+ movq m5, [srcq+src_strideq*2] ; src rows 1 and 3, interleaved to match m2
+ movq m4, [srcq+src_stride3q*2]
+ punpcklwd m5, m4
+ movdqa m4, m2
+ psubusw m2, m5
+ psubusw m5, m4
+ por m2, m5
+ paddw m1, m2 ; combine absolute differences in word lanes
+ movdqa m2, m1
+ punpcklwd m1, m6 ; zero-extend the low 4 words to dwords
+ punpckhwd m2, m6 ; zero-extend the high 4 words to dwords
+ lea refq, [refq+ref_strideq*8] ; advance four rows (2 bytes per sample)
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0 ; horizontal add of the four dword partial sums ...
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1 ; ... total ends up in the low dword of m0
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0 ; return the SAD in eax
+ RET
+%endmacro
+
+INIT_XMM sse2 ; x86inc: use XMM registers and give the functions below the _sse2 suffix
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
+HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
+HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..c0ccc182b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1024 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8 ; eight words of 8: rounding term added before the >> 4 in the bilinear filter paths
+bilin_filter_m_sse2: times 8 dw 16 ; table of tap pairs; each 32-byte entry = 8 copies of tap A then 8 copies of tap B, with A + B = 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8 ; the 8/8 entry: both taps written in one line
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse -- accumulates into sum/sse and clobbers the four src/dst registers
+ psubw %3, %4 ; diff2 = src2 - dst2 (word lanes)
+ psubw %1, %2 ; diff1 = src1 - dst1
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3 ; diff2 squared, adjacent pairs summed into dwords
+ paddw %4, %2 ; word-wise diff1 + diff2
+ pmaddwd %1, %1 ; diff1 squared, adjacent pairs summed into dwords
+ movhlps %2, %4 ; upper 4 words of the diff sums ...
+ paddd %6, %3 ; sse += squares of diff2
+ paddw %4, %2 ; ... folded onto the lower 4 words
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1 ; sse += squares of diff1
+ paddd %5, %4 ; sum += sign-extended diff sums
+
+%endmacro
+
+%macro STORE_AND_RET 0 ; reduce the m6 (sum) and m7 (sse) accumulators, store sse through the sse pointer, return sum in eax
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ movhlps m3, m7 ; bring the upper two dwords down ...
+ movhlps m4, m6
+ paddd m7, m3 ; ... and add them onto the lower two
+ paddd m6, m4
+ pshufd m3, m7, 0x1 ; move dword 1 into lane 0
+ pshufd m4, m6, 0x1
+ paddd m7, m3 ; lane 0 now holds the total of all four dwords
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0 ; srcq += 2 * src_stride in both branches (strides count 2-byte samples elsewhere in this file)
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; x86-32 PIC: the src_stride register is reused as a temp in the two-filter path, so read the stride from its stack slot
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading filter - this is same as in 8-bit depth
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8 ; first argument 8: takes the less-than-16 code paths; second argument omitted
+SUBPEL_VARIANCE 16 ; first argument 16: takes the equal-to-16 code paths; second argument omitted
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1 ; second argument 1: additionally assembles the blocks marked avg (pavgw with secq)
+SUBPEL_VARIANCE 16, 1 ; second argument 1: additionally assembles the blocks marked avg (pavgw with secq)
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000000..3c3253bdf9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);  // signature shared by all subtract_WxH kernels
+
+/* Writes diff = src - pred for a block 4 samples wide and 4 rows tall.
+ * Each row is one 64-bit load or store holding four 16-bit lanes; the lanes
+ * are subtracted without saturation (_mm_sub_epi16). All rows of src and
+ * pred are read before the first row of diff is written. */
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  // Source rows: four samples in the low 64 bits of each register.
+  const __m128i s0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+  const __m128i s2 = _mm_loadl_epi64((const __m128i *)(src + 2 * src_stride));
+  const __m128i s3 = _mm_loadl_epi64((const __m128i *)(src + 3 * src_stride));
+
+  // Predictor rows, loaded the same way.
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pred_stride));
+  const __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pred_stride));
+  const __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pred_stride));
+  const __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pred_stride));
+
+  // Lane-wise 16-bit residual for every row.
+  const __m128i d0 = _mm_sub_epi16(s0, p0);
+  const __m128i d1 = _mm_sub_epi16(s1, p1);
+  const __m128i d2 = _mm_sub_epi16(s2, p2);
+  const __m128i d3 = _mm_sub_epi16(s3, p3);
+
+  // Only the low 64 bits (four residuals) of each row reach memory.
+  _mm_storel_epi64((__m128i *)(diff + 0 * diff_stride), d0);
+  _mm_storel_epi64((__m128i *)(diff + 1 * diff_stride), d1);
+  _mm_storel_epi64((__m128i *)(diff + 2 * diff_stride), d2);
+  _mm_storel_epi64((__m128i *)(diff + 3 * diff_stride), d3);
+}
+
+/* Writes diff = src - pred for a block 4 samples wide and 8 rows tall.
+ * A row is moved with one 64-bit load or store, i.e. four 16-bit lanes;
+ * _mm_sub_epi16 subtracts lane by lane without saturation. Every src and
+ * pred row is loaded before the first diff row is stored, and each pointer
+ * advances by its own stride (counted in 16-bit elements) per row. */
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  // Source rows 0..7: the low 64 bits of each register hold the four
+  // samples; _mm_loadl_epi64 zeroes the upper half.
+  const __m128i s0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+  const __m128i s2 = _mm_loadl_epi64((const __m128i *)(src + 2 * src_stride));
+  const __m128i s3 = _mm_loadl_epi64((const __m128i *)(src + 3 * src_stride));
+  const __m128i s4 = _mm_loadl_epi64((const __m128i *)(src + 4 * src_stride));
+  const __m128i s5 = _mm_loadl_epi64((const __m128i *)(src + 5 * src_stride));
+  const __m128i s6 = _mm_loadl_epi64((const __m128i *)(src + 6 * src_stride));
+  const __m128i s7 = _mm_loadl_epi64((const __m128i *)(src + 7 * src_stride));
+
+  // Predictor rows 0..7, loaded the same way.
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pred_stride));
+  const __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pred_stride));
+  const __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pred_stride));
+  const __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pred_stride));
+  const __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pred_stride));
+  const __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pred_stride));
+  const __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pred_stride));
+  const __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pred_stride));
+
+  // Residual rows. Only the low four lanes are meaningful; the upper
+  // lanes are 0 - 0.
+  const __m128i d0 = _mm_sub_epi16(s0, p0);
+  const __m128i d1 = _mm_sub_epi16(s1, p1);
+  const __m128i d2 = _mm_sub_epi16(s2, p2);
+  const __m128i d3 = _mm_sub_epi16(s3, p3);
+  const __m128i d4 = _mm_sub_epi16(s4, p4);
+  const __m128i d5 = _mm_sub_epi16(s5, p5);
+  const __m128i d6 = _mm_sub_epi16(s6, p6);
+  const __m128i d7 = _mm_sub_epi16(s7, p7);
+
+  // Store the low 64 bits of each residual row; the upper half is never
+  // written to memory.
+  _mm_storel_epi64((__m128i *)(diff + 0 * diff_stride), d0);
+  _mm_storel_epi64((__m128i *)(diff + 1 * diff_stride), d1);
+  _mm_storel_epi64((__m128i *)(diff + 2 * diff_stride), d2);
+  _mm_storel_epi64((__m128i *)(diff + 3 * diff_stride), d3);
+  _mm_storel_epi64((__m128i *)(diff + 4 * diff_stride), d4);
+  _mm_storel_epi64((__m128i *)(diff + 5 * diff_stride), d5);
+  _mm_storel_epi64((__m128i *)(diff + 6 * diff_stride), d6);
+  _mm_storel_epi64((__m128i *)(diff + 7 * diff_stride), d7);
+}
+
+/* Writes diff = src - pred for a block 8 samples wide and 4 rows tall.
+ * One unaligned 128-bit load or store covers a full row of eight 16-bit
+ * lanes. All src and pred rows are read before diff is written. */
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  const __m128i s0 = _mm_loadu_si128((const __m128i *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((const __m128i *)(src + 1 * src_stride));
+  const __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 2 * src_stride));
+  const __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 3 * src_stride));
+
+  const __m128i p0 = _mm_loadu_si128((const __m128i *)(pred + 0 * pred_stride));
+  const __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + 1 * pred_stride));
+  const __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * pred_stride));
+  const __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * pred_stride));
+
+  // Lane-wise 16-bit difference (_mm_sub_epi16 does not saturate).
+  const __m128i d0 = _mm_sub_epi16(s0, p0);
+  const __m128i d1 = _mm_sub_epi16(s1, p1);
+  const __m128i d2 = _mm_sub_epi16(s2, p2);
+  const __m128i d3 = _mm_sub_epi16(s3, p3);
+
+  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), d0);
+  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), d1);
+  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), d2);
+  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), d3);
+}
+
+/* Writes diff = src - pred for a block 8 samples wide and 8 rows tall.
+ * One unaligned 128-bit load or store covers a full row of eight 16-bit
+ * lanes. All src and pred rows are read before diff is written. */
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  const __m128i s0 = _mm_loadu_si128((const __m128i *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((const __m128i *)(src + 1 * src_stride));
+  const __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 2 * src_stride));
+  const __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 3 * src_stride));
+  const __m128i s4 = _mm_loadu_si128((const __m128i *)(src + 4 * src_stride));
+  const __m128i s5 = _mm_loadu_si128((const __m128i *)(src + 5 * src_stride));
+  const __m128i s6 = _mm_loadu_si128((const __m128i *)(src + 6 * src_stride));
+  const __m128i s7 = _mm_loadu_si128((const __m128i *)(src + 7 * src_stride));
+
+  const __m128i p0 = _mm_loadu_si128((const __m128i *)(pred + 0 * pred_stride));
+  const __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + 1 * pred_stride));
+  const __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * pred_stride));
+  const __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * pred_stride));
+  const __m128i p4 = _mm_loadu_si128((const __m128i *)(pred + 4 * pred_stride));
+  const __m128i p5 = _mm_loadu_si128((const __m128i *)(pred + 5 * pred_stride));
+  const __m128i p6 = _mm_loadu_si128((const __m128i *)(pred + 6 * pred_stride));
+  const __m128i p7 = _mm_loadu_si128((const __m128i *)(pred + 7 * pred_stride));
+
+  // Lane-wise 16-bit difference (_mm_sub_epi16 does not saturate).
+  const __m128i d0 = _mm_sub_epi16(s0, p0);
+  const __m128i d1 = _mm_sub_epi16(s1, p1);
+  const __m128i d2 = _mm_sub_epi16(s2, p2);
+  const __m128i d3 = _mm_sub_epi16(s3, p3);
+  const __m128i d4 = _mm_sub_epi16(s4, p4);
+  const __m128i d5 = _mm_sub_epi16(s5, p5);
+  const __m128i d6 = _mm_sub_epi16(s6, p6);
+  const __m128i d7 = _mm_sub_epi16(s7, p7);
+
+  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), d0);
+  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), d1);
+  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), d2);
+  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), d3);
+  _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), d4);
+  _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), d5);
+  _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), d6);
+  _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), d7);
+}
+
+#define STACK_V(h, fun) /* fun on the top block, then h rows further down */ \
+  do { /* reads diff, src, pred and their strides from the enclosing scope */ \
+    fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+    fun(diff + diff_stride * (h), diff_stride, src + src_stride * (h), \
+        src_stride, pred + pred_stride * (h), pred_stride); \
+  } while (0)
+
+#define STACK_H(w, fun) /* fun on the left block, then w samples right */ \
+  do { /* reads diff, src, pred and their strides from the enclosing scope */ \
+    fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+    fun(diff + (w), diff_stride, src + (w), src_stride, pred + (w), pred_stride); \
+  } while (0)
+
+#define SUBTRACT_FUN(size) /* header of subtract_<size>; a body must follow */ \
+  static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
+                              const uint16_t *src, ptrdiff_t src_stride, \
+                              const uint16_t *pred, ptrdiff_t pred_stride)
+
+SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }  // two 8x8, second one 8 rows down
+SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }  // two 8x8, second one 8 samples right
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }  // two 16x8, second one 8 rows down
+SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }  // two 16x16, 16 rows apart
+SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }  // two 16x16, 16 samples apart
+SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }  // two 32x16, 16 rows apart
+SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }  // two 32x32, 32 rows apart
+SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }  // two 32x32, 32 samples apart
+SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }  // two 64x32, 32 rows apart
+SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }  // two 64x64, 64 rows apart
+SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }  // two 64x64, 64 samples apart
+SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }  // two 128x64, 64 rows apart
+SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }  // two 4x8, second one 8 rows down
+SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }  // two 8x4, second one 8 samples right
+SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }  // two 8x16, 16 rows apart
+SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }  // two 16x8, 16 samples apart
+SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }  // two 16x32, 32 rows apart
+SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }  // two 32x16, 32 samples apart
+
+/* Maps a block size to its fixed-size subtract kernel. Kernels are named
+ * subtract_<cols>x<rows> (width first) while the arguments arrive as
+ * (rows, cols). An unlisted size trips assert(0) in debug builds and
+ * yields NULL when asserts are compiled out. */
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  // 4 rows
+  if (rows == 4 && cols == 4) return subtract_4x4;
+  if (rows == 4 && cols == 8) return subtract_8x4;
+  if (rows == 4 && cols == 16) return subtract_16x4;
+  // 8 rows
+  if (rows == 8 && cols == 4) return subtract_4x8;
+  if (rows == 8 && cols == 8) return subtract_8x8;
+  if (rows == 8 && cols == 16) return subtract_16x8;
+  if (rows == 8 && cols == 32) return subtract_32x8;
+  // 16 rows
+  if (rows == 16 && cols == 4) return subtract_4x16;
+  if (rows == 16 && cols == 8) return subtract_8x16;
+  if (rows == 16 && cols == 16) return subtract_16x16;
+  if (rows == 16 && cols == 32) return subtract_32x16;
+  if (rows == 16 && cols == 64) return subtract_64x16;
+  // 32 rows
+  if (rows == 32 && cols == 8) return subtract_8x32;
+  if (rows == 32 && cols == 16) return subtract_16x32;
+  if (rows == 32 && cols == 32) return subtract_32x32;
+  if (rows == 32 && cols == 64) return subtract_64x32;
+  // 64 rows
+  if (rows == 64 && cols == 16) return subtract_16x64;
+  if (rows == 64 && cols == 32) return subtract_32x64;
+  if (rows == 64 && cols == 64) return subtract_64x64;
+  if (rows == 64 && cols == 128) return subtract_128x64;
+  // 128 rows
+  if (rows == 128 && cols == 64) return subtract_64x128;
+  if (rows == 128 && cols == 128) return subtract_128x128;
+
+  // No kernel exists for the requested size.
+  assert(0);
+  return NULL;
+}
+
+/* Public entry point: picks the kernel for rows x cols and runs it on the
+ * 16-bit pointers that CONVERT_TO_SHORTPTR derives from src8 and pred8. */
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+                                    ptrdiff_t diff_stride, const uint8_t *src8,
+                                    ptrdiff_t src_stride, const uint8_t *pred8,
+                                    ptrdiff_t pred_stride) {
+  const SubtractWxHFuncType kernel = getSubtractFunc(rows, cols);
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *const pred16 = CONVERT_TO_SHORTPTR(pred8);
+  kernel(diff, diff_stride, src16, src_stride, pred16, pred_stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 0000000000..b4ff91d856
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,904 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);  // sse and sum are non-const: presumably outputs
+
+static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+ const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
+ int dst_stride, uint32_t *sse) {
+ const __m256i filter1 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
+ bilinear_filters_2t[xoffset][0]);
+ const __m256i filter2 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
+ bilinear_filters_2t[yoffset][0]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const int bitshift = 0x40;
+ (void)pixel_step;
+ unsigned int i, j, prev = 0, curr = 2;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
+ uint16_t *src_ptr_ref = src_ptr;
+ uint16_t *dst_ptr_ref = dst_ptr;
+ int64_t sum_long = 0;
+ uint64_t sse_long = 0;
+ unsigned int rshift = 0, inc = 1;
+ __m256i rbias = _mm256_set1_epi32(bitshift);
+ __m256i opointer[8];
+ unsigned int range;
+ if (xoffset == 0) {
+ if (yoffset == 0) { // xoffset==0 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ }
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==0 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==0 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (xoffset == 4) {
+ if (yoffset == 0) { // xoffset==4 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==4 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==4 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4
+
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ unsigned int nloop = 16 / inc;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < nloop; ++i) {
+ prev = curr;
+ curr = !curr;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+
+ int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
+
+ return (var > 0) ? var : 0;
+}
+
+// Computes, for one 8x8 block of 16-bit samples, the sum of the differences
+// (src - ref) and the sum of the squared differences. The raw totals are
+// written to *sum and *sse; no rounding or scaling is applied here.
+//   src, ref               top-left sample of each block
+//   src_stride, ref_stride row pitch, in 16-bit samples
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ // v_sum_d holds sixteen 16-bit partial sums, v_sse_d eight 32-bit ones.
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ // Two rows of 8 samples are handled per iteration (4 iterations in total).
+ for (int i = 0; i < 8; i += 2) {
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+ // Row i goes into the low 128 bits, row i + 1 into the high 128 bits.
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ // madd multiplies each difference by itself and adds adjacent pairs,
+ // giving 32-bit sums of two squares.
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ // Each 16-bit lane accumulates 4 differences over the whole loop.
+ // NOTE(review): assumes those 4 differences fit in int16 - confirm for the
+ // bit depths this kernel is used with.
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
+ // Sign-extend the 16-bit partial sums to 32 bits and fold the two halves.
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+ // Interleave sum and sse dwords so that one reduction handles both:
+ // after the adds below, even dwords carry sums and odd dwords carry sse.
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ // Dword 0 now holds the total sum, dword 1 the total sse.
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+// Computes, for one 16x16 block of 16-bit samples, the sum of the differences
+// (src - ref) and the sum of the squared differences. The raw totals are
+// written to *sum and *sse; no rounding or scaling is applied here.
+//   src, ref               top-left sample of each block
+//   src_stride, ref_stride row pitch, in 16-bit samples
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ // One full row of 16 samples is handled per iteration.
+ for (int i = 0; i < 16; ++i) {
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ // Squares each difference and adds adjacent pairs into 32-bit lanes.
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ // Each 16-bit lane accumulates 16 differences over the whole loop.
+ // NOTE(review): assumes 16 differences fit in int16 - confirm for the bit
+ // depths this kernel is used with.
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ // Multiplying by 1 with madd adds adjacent signed 16-bit sums into 32 bits.
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+ // Interleave sum and sse dwords so that one reduction handles both:
+ // after the adds below, even dwords carry sums and odd dwords carry sse.
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ // Dword 0 now holds the total sum, dword 1 the total sse.
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+// Tiles a w x h region into block_size x block_size squares, runs var_fn on
+// every tile and accumulates the per-tile sum and sse. The totals are then
+// rounded down in precision: the sum by 2 bits and the sse by 4 bits.
+//   src, ref               top-left sample of each region
+//   src_stride, ref_stride row pitch, in 16-bit samples
+//   sse, sum               receive the rounded totals
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  uint64_t total_sse = 0;
+  int32_t total_sum = 0;
+
+  for (int row = 0; row < h; row += block_size) {
+    for (int col = 0; col < w; col += block_size) {
+      const uint16_t *const src_tile = src + src_stride * row + col;
+      const uint16_t *const ref_tile = ref + ref_stride * row + col;
+      unsigned int tile_sse;
+      int tile_sum;
+
+      var_fn(src_tile, src_stride, ref_tile, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+
+  *sum = ROUND_POWER_OF_TWO(total_sum, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(total_sse, 4);
+}
+
+// Emits aom_highbd_10_variance<w>x<h>_avx2(). The generated function gathers
+// the rounded sum and sse through highbd_10_variance_avx2() using the
+// block_size x block_size kernel, stores the sse through the sse pointer, and
+// returns sse - ((sum * sum) >> shift), clamped so it never goes below zero.
+#define VAR_FN(w, h, block_size, shift) \
+  uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
+    int sum; \
+    highbd_10_variance_avx2( \
+        src16, src_stride, ref16, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+    const int64_t mean_sq = ((int64_t)sum * sum) >> shift; \
+    const int64_t var = (int64_t)(*sse) - mean_sq; \
+    if (var < 0) return 0; \
+    return (uint32_t)var; \
+  }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(16, 64, 16, 10)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 32, 8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef VAR_FN
+
+// Forward declarations of the 8-wide SSE2 sub-pixel variance functions that
+// the AVX2 wrappers below fall back to for 8x8 and 8x16 blocks.
+// NOTE(review): the definitions are not in this file; presumably they come
+// from highbd_variance_sse2.c - confirm.
+#define SSE2_HEIGHT(H) \
+ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);
+
+SSE2_HEIGHT(8)
+SSE2_HEIGHT(16)
+
+// Macro names are case-sensitive: the name here must match the #define above
+// exactly, otherwise the helper macro stays defined for the rest of the file.
+#undef SSE2_HEIGHT
+
+// Emits aom_highbd_10_sub_pixel_variance<W>x<H>_avx2(). The 8x16 and 8x8
+// sizes are forwarded unchanged to the SSE2 functions of the same size; every
+// other size goes to aom_highbd_var_filter_block2d_bil_avx2(). W and H are
+// compile-time constants, so only one of the returns is ever reachable in a
+// given instantiation.
+#define HIGHBD_SUBPIX_VAR(W, H) \
+  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+    if (W == 8) { \
+      if (H == 16) { \
+        return aom_highbd_10_sub_pixel_variance8x16_sse2( \
+            src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+      } \
+      if (H == 8) { \
+        return aom_highbd_10_sub_pixel_variance8x8_sse2( \
+            src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+      } \
+    } \
+    return aom_highbd_var_filter_block2d_bil_avx2( \
+        src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+  }
+
+HIGHBD_SUBPIX_VAR(128, 128)
+HIGHBD_SUBPIX_VAR(128, 64)
+HIGHBD_SUBPIX_VAR(64, 128)
+HIGHBD_SUBPIX_VAR(64, 64)
+HIGHBD_SUBPIX_VAR(64, 32)
+HIGHBD_SUBPIX_VAR(32, 64)
+HIGHBD_SUBPIX_VAR(32, 32)
+HIGHBD_SUBPIX_VAR(32, 16)
+HIGHBD_SUBPIX_VAR(16, 32)
+HIGHBD_SUBPIX_VAR(16, 16)
+HIGHBD_SUBPIX_VAR(16, 8)
+HIGHBD_SUBPIX_VAR(8, 16)
+HIGHBD_SUBPIX_VAR(8, 8)
+
+#undef HIGHBD_SUBPIX_VAR
+
+// Returns the sum of squared differences between a 4-wide, h-high block of
+// 16-bit samples in dst and src.
+//   dstride, sstride  row pitch of dst / src, in 16-bit samples
+//   h                 number of rows; the loop always reads rows i..i+3, so h
+//                     is expected to be a multiple of 4
+uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ // Running total, kept as four 64-bit lanes.
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ for (int i = 0; i < h; i += 4) {
+ // Gather four 4-sample rows of dst (64 bits each) into one 256-bit vector.
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ dst1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ // 0x20 selects the low 128 bits of each operand: rows 0-1 | rows 2-3.
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ // Same gathering for the matching four rows of src.
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ src1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // Absolute 16-bit difference of the 16 sample pairs.
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // Zero-extend the differences to 32 bits (src_16x16 / dst_16x16 are reused
+ // as scratch from here on).
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ // Each 32-bit lane is (diff, 0), so madd yields diff * diff per lane.
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ // Zero-extend the 32-bit squares to 64 bits before accumulating.
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+ // Fold the four 64-bit lanes down to one and store it into sum.
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+// Returns the sum of squared differences between an 8-wide, h-high block of
+// 16-bit samples in dst and src.
+//   dstride, sstride  row pitch of dst / src, in 16-bit samples
+//   h                 number of rows; the loop always reads rows i and i+1,
+//                     so h is expected to be even
+uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ // Running total, kept as four 64-bit lanes.
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+ for (int i = 0; i < h; i += 2) {
+ // Two 8-sample rows of dst: row i in the low half, row i + 1 in the high
+ // half (0x20 selects the low 128 bits of each operand).
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
+ dst1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ // Same layout for the matching two rows of src.
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+ src1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // Absolute 16-bit difference of the 16 sample pairs.
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // Zero-extend the differences to 32 bits (src_16x16 / dst_16x16 are reused
+ // as scratch from here on).
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ // Each 32-bit lane is (diff, 0), so madd yields diff * diff per lane.
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ // Zero-extend the 32-bit squares to 64 bits before accumulating.
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ // Fold the four 64-bit lanes down to one and store it into sum.
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+// Dispatches to the 4-wide or 8-wide sum-of-squared-differences kernel
+// according to w. Debug builds assert that w and h are each 4 or 8; with
+// asserts compiled out, any other width returns (uint64_t)-1.
+uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  if (w == 4) {
+    return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+  }
+  if (w == 8) {
+    return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+  }
+  assert(0 && "unsupported width");
+  return -1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..ec6c7e9fa7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,318 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------
+; Sum of differences and sum of squared differences over one 16x16 block
+; of 16-bit samples (src - ref). Results are stored through SSE and Sum.
+; Strides arrive in samples and are doubled to bytes below.
+; Arguments are fetched via the arg() macro from x86_abi_support.asm.
+; NOTE(review): no result is loaded into rax before ret; the caller
+; visible in highbd_variance_sse2.c ignores the return value.
+;-----------------------------------------------------------------------
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+; const uint16_t * src_ptr,
+; int source_stride,
+; const uint16_t * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc16x16var_sse2)
+sym(aom_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6 ; ABI helper macros (x86_abi_support.asm)
+ SAVE_XMM 7
+ push rbx ; rbx is used as prefetch scratch
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ ; First four rows of src (both 16-byte halves of each row)
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ ; First four rows of ref
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16 ; rows remaining
+
+ ; Each iteration handles two rows of 16 samples (four 16-byte loads
+ ; from each of src and ref).
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ ; Prefetch the two rows that the next iteration will read
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5 ; xmm5 = 16-bit diff sums for this row pair
+
+ psubw xmm1, xmm2 ; row 0, samples 0-7: src - ref
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1 ; squares, adjacent pairs added to dwords
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2 ; row 0, samples 8-15
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2 ; row 1, samples 0-7
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2 ; row 1, samples 8-15
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ ; Build a sign mask for xmm5 and use it to widen the words to dwords
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0 ; xmm1 = (sum > 0)
+ pcmpeqw xmm2, xmm0 ; xmm2 = (sum == 0)
+ por xmm1, xmm2 ; xmm1 = (sum >= 0)
+ pcmpeqw xmm1, xmm0 ; xmm1 = all ones where sum < 0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1 ; sign-extend low four words
+ punpckhwd xmm2, xmm1 ; sign-extend high four words
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax] ; advance src by two rows
+ lea rdi, [rdi + 2*rdx] ; advance ref by two rows
+ sub rcx, 2
+ jnz .var16loop
+
+ ; Horizontal reduction: fold the four dwords of xmm6 (sse) and of
+ ; xmm7 (sum) into the low dword of each register.
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6 ; *SSE = total of squared diffs
+ movd DWORD PTR [rax], xmm7 ; *Sum = total of diffs
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;-----------------------------------------------------------------------
+; Sum of differences and sum of squared differences over one 8x8 block
+; of 16-bit samples (src - ref). Results are stored through SSE and Sum.
+; Strides arrive in samples and are doubled to bytes below.
+; Arguments are fetched via the arg() macro from x86_abi_support.asm.
+; NOTE(review): no result is loaded into rax before ret; the caller
+; visible in highbd_variance_sse2.c ignores the return value.
+;-----------------------------------------------------------------------
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+; const uint16_t * src_ptr,
+; int source_stride,
+; const uint16_t * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc8x8var_sse2)
+sym(aom_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6 ; ABI helper macros (x86_abi_support.asm)
+ SAVE_XMM 7
+ push rbx ; rbx is used as prefetch scratch
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ ; First four rows of src
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ ; First four rows of ref
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8 ; rows remaining
+
+ ; Each iteration handles four rows of 8 samples (one 16-byte load per
+ ; row from each of src and ref).
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ ; Prefetch the four rows starting four rows below the current one
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5 ; xmm5 = 16-bit diff sums for these 4 rows
+
+ psubw xmm1, xmm2 ; row 0: src - ref
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1 ; squares, adjacent pairs added to dwords
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax] ; step to rows 2 and 3
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2 ; row 1
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2 ; row 2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2 ; row 3
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ ; Build a sign mask for xmm5 and use it to widen the words to dwords
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0 ; xmm1 = (sum > 0)
+ pcmpeqw xmm2, xmm0 ; xmm2 = (sum == 0)
+ por xmm1, xmm2 ; xmm1 = (sum >= 0)
+ pcmpeqw xmm1, xmm0 ; xmm1 = all ones where sum < 0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1 ; sign-extend low four words
+ punpckhwd xmm2, xmm1 ; sign-extend high four words
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax] ; step past rows 2 and 3
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ ; Horizontal reduction: fold the four dwords of xmm6 (sse) and of
+ ; xmm7 (sum) into the low dword of each register.
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6 ; *SSE = total of squared diffs
+ movd DWORD PTR [rax], xmm7 ; *Sum = total of diffs
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..e897aab645
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+
+// Signature of a per-tile kernel: it reports the sum of squared differences
+// through sse and the sum of differences through sum. The helpers in this file
+// call it as a plain statement and never use the uint32_t return value.
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+// 8x8 kernel, passed as var_fn with block_size 8.
+// NOTE(review): presumably implemented in highbd_variance_impl_sse2.asm -
+// confirm.
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+// 16x16 kernel, passed as var_fn with block_size 16.
+// NOTE(review): presumably implemented in highbd_variance_impl_sse2.asm -
+// confirm.
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+// Tiles a w x h region into block_size x block_size squares, runs var_fn on
+// every tile and adds the per-tile results straight into *sse and *sum, which
+// are zeroed first. No rounding is applied.
+//   src, ref               top-left sample of each region
+//   src_stride, ref_stride row pitch, in 16-bit samples
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride, int w,
+                                   int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  *sse = 0;
+  *sum = 0;
+
+  for (int row = 0; row < h; row += block_size) {
+    for (int col = 0; col < w; col += block_size) {
+      const uint16_t *const src_tile = src + src_stride * row + col;
+      const uint16_t *const ref_tile = ref + ref_stride * row + col;
+      unsigned int tile_sse;
+      int tile_sum;
+
+      var_fn(src_tile, src_stride, ref_tile, ref_stride, &tile_sse, &tile_sum);
+      *sse += tile_sse;
+      *sum += tile_sum;
+    }
+  }
+}
+
+// Tiles a w x h region into block_size x block_size squares, runs var_fn on
+// every tile and accumulates the per-tile sum and sse. The totals are then
+// rounded down in precision: the sum by 2 bits and the sse by 4 bits.
+//   src, ref               top-left sample of each region
+//   src_stride, ref_stride row pitch, in 16-bit samples
+//   sse, sum               receive the rounded totals
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  uint64_t total_sse = 0;
+  int32_t total_sum = 0;
+
+  for (int row = 0; row < h; row += block_size) {
+    for (int col = 0; col < w; col += block_size) {
+      const uint16_t *const src_tile = src + src_stride * row + col;
+      const uint16_t *const ref_tile = ref + ref_stride * row + col;
+      unsigned int tile_sse;
+      int tile_sum;
+
+      var_fn(src_tile, src_stride, ref_tile, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+
+  *sum = ROUND_POWER_OF_TWO(total_sum, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(total_sse, 4);
+}
+
+// Tiles a w x h region into block_size x block_size squares, runs var_fn on
+// every tile and accumulates the per-tile sum and sse. The totals are then
+// rounded down in precision: the sum by 4 bits and the sse by 8 bits.
+//   src, ref               top-left sample of each region
+//   src_stride, ref_stride row pitch, in 16-bit samples
+//   sse, sum               receive the rounded totals
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  uint64_t total_sse = 0;
+  int32_t total_sum = 0;
+
+  for (int row = 0; row < h; row += block_size) {
+    for (int col = 0; col < w; col += block_size) {
+      const uint16_t *const src_tile = src + src_stride * row + col;
+      const uint16_t *const ref_tile = ref + ref_stride * row + col;
+      unsigned int tile_sse;
+      int tile_sum;
+
+      var_fn(src_tile, src_stride, ref_tile, ref_stride, &tile_sse, &tile_sum);
+      total_sse += tile_sse;
+      total_sum += tile_sum;
+    }
+  }
+
+  *sum = ROUND_POWER_OF_TWO(total_sum, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(total_sse, 8);
+}
+
+// Emits the 8-, 10- and 12-bit aom_highbd_*_variance<w>x<h>_sse2() entry
+// points. Each one gathers sum and sse through the matching
+// highbd_*_variance_sse2() helper using the block_size x block_size kernel,
+// leaves the sse in *sse, and returns sse - ((sum * sum) >> shift).
+// The 8-bit variant does the subtraction in uint32_t; the 10- and 12-bit
+// variants do it in int64_t and clamp a negative result to zero.
+#define VAR_FN(w, h, block_size, shift) \
+  uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
+    int sum; \
+    highbd_8_variance_sse2( \
+        src16, src_stride, ref16, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    const uint32_t mean_sq = (uint32_t)(((int64_t)sum * sum) >> shift); \
+    return *sse - mean_sq; \
+  } \
+  \
+  uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
+    int sum; \
+    highbd_10_variance_sse2( \
+        src16, src_stride, ref16, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    const int64_t mean_sq = ((int64_t)sum * sum) >> shift; \
+    const int64_t var = (int64_t)(*sse) - mean_sq; \
+    if (var < 0) return 0; \
+    return (uint32_t)var; \
+  } \
+  \
+  uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse) { \
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
+    int sum; \
+    highbd_12_variance_sse2( \
+        src16, src_stride, ref16, ref_stride, w, h, sse, &sum, \
+        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+    const int64_t mean_sq = ((int64_t)sum * sum) >> shift; \
+    const int64_t var = (int64_t)(*sse) - mean_sq; \
+    if (var < 0) return 0; \
+    return (uint32_t)var; \
+  }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(64, 16, 16, 10)
+
+#undef VAR_FN
+
+// 16x16 sum of squared differences via highbd_8_variance_sse2(). The value is
+// stored in *sse and also returned; the block sum is computed but discarded.
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                        const uint8_t *ref8, int ref_stride,
+                                        unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_8_variance_sse2(src16, src_stride, ref16, ref_stride, 16, 16, sse,
+                         &discarded_sum, aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// 16x16 sum of squared differences via highbd_10_variance_sse2(), which
+// applies its own rounding. The value is stored in *sse and also returned;
+// the block sum is computed but discarded.
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_10_variance_sse2(src16, src_stride, ref16, ref_stride, 16, 16, sse,
+                          &discarded_sum, aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// 16x16 sum of squared differences via highbd_12_variance_sse2(), which
+// applies its own rounding. The value is stored in *sse and also returned;
+// the block sum is computed but discarded.
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_12_variance_sse2(src16, src_stride, ref16, ref_stride, 16, 16, sse,
+                          &discarded_sum, aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+// 8x8 sum of squared differences via highbd_8_variance_sse2(). The value is
+// stored in *sse and also returned; the block sum is computed but discarded.
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_8_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
+                         &discarded_sum, aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// 8x8 sum of squared differences via highbd_10_variance_sse2(), which applies
+// its own rounding. The value is stored in *sse and also returned; the block
+// sum is computed but discarded.
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_10_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
+                          &discarded_sum, aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// 8x8 sum of squared differences via highbd_12_variance_sse2(), which applies
+// its own rounding. The value is stored in *sse and also returned; the block
+// sum is computed but discarded.
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
+  int discarded_sum;  // the helper requires somewhere to write the sum
+
+  highbd_12_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
+                          &discarded_sum, aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// Prototypes for the sub-pixel variance kernels implemented in
+// highbd_subpel_variance_impl_sse2.asm, declared here for widths 8 and 16.
+// Each kernel takes the block height as a parameter, reports the sum of
+// squared errors through sse and returns an int that the callers in this file
+// accumulate as the sum of errors.
+// The two trailing pointer parameters are unused placeholders required for
+// PIC-enabled builds; callers pass NULL for both.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt) \
+ DECL(16, opt)
+
+DECLS(sse2)
+
+#undef DECLS
+#undef DECL
+
+// FN(w, h, wf, wlog2, hlog2, opt, cast) expands to the three public wrappers
+// aom_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>().
+//   w, h          block width and height.
+//   wf            width of the column kernel the block is tiled with.
+//   wlog2, hlog2  shift amounts; se * se is shifted right by wlog2 + hlog2.
+//   cast          cast applied to se before the se * se product.
+// Every wrapper calls the wf-wide kernel on one, two or four adjacent columns
+// (offsets 0, wf, 2 * wf, 3 * wf). When w > 64 the column pass is repeated
+// once more with src/dst advanced by 64 pixels. The kernel's return value is
+// summed into se and its sse output into the squared-error total; the total
+// is stored through sse_ptr and total - ((se * se) >> (wlog2 + hlog2)) is
+// returned.
+//   8-bit : 32-bit sse accumulator, result returned without clamping.
+//   10-bit: 64-bit sse accumulator; se is rounded down by 2 bits and sse by
+//           4 bits (ROUND_POWER_OF_TWO); a negative variance returns 0.
+//   12-bit: as 10-bit but with 4 / 8 bit rounding, and the kernel is fed at
+//           most 16 rows per call (start_row steps by 16).
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ unsigned int sse = 0; \
+ unsigned int sse2; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse; \
+ if (w > wf) { \
+ uint32_t sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ uint16_t *src_tmp = src + (start_row * src_stride); \
+ uint16_t *dst_tmp = dst + (start_row * dst_stride); \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src_tmp += wd_64 * 64; \
+ dst_tmp += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 2 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 3 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// Instantiates the sub-pixel variance wrappers for every block size listed,
+// from 128x128 down to 8x4. Width-8 blocks use the 8-wide kernel (wf = 8);
+// all wider blocks are tiled with the 16-wide kernel (wf = 16). Arguments are
+// (w, h, wf, log2(w), log2(h), opt, cast).
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+// Prototypes for the 16-wide and 8-wide sub-pixel *average* variance kernels
+// (aom_highbd_sub_pixel_avg_variance{16,8}xh_sse2). Compared with the plain
+// kernels they take an extra `sec` buffer and its stride.
+// The two unused parameters are placeholders for PIC-enabled builds; the
+// wrappers below always pass NULL for them.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt) \
+ DECL(16, opt) \
+ DECL(8, opt)
+
+DECLS(sse2)
+#undef DECL
+#undef DECLS
+
+// FN(w, h, wf, wlog2, hlog2, opt, cast) expands to the three public wrappers
+// aom_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt>().
+// Macro parameters have the same roles as in the non-averaging FN: w/h block
+// size, wf kernel width, wlog2 + hlog2 the shift applied to se * se, cast the
+// widening applied to se before multiplying.
+// Each wrapper calls the wf-wide averaging kernel on one, two or four
+// adjacent columns (offsets 0, wf, 2 * wf, 3 * wf applied to src, dst and
+// sec alike). The second predictor `sec` is addressed with a stride of w.
+//   8-bit : 32-bit sse accumulator, variance returned without clamping.
+//   10-bit: 32-bit sse accumulator; se rounded down by 2 bits, sse by 4 bits;
+//           a negative variance returns 0.
+//   12-bit: 64-bit sse accumulator, kernel called on slices of at most 16
+//           rows; se rounded down by 4 bits, sse by 8 bits; negative -> 0.
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + wf + (start_row * dst_stride), dst_stride, \
+ sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// Instantiates the sub-pixel average variance wrappers for the block sizes
+// listed (64x64 down to 8x4; no 128-wide or 128-tall entries here). Width-8
+// blocks use the 8-wide kernel, all wider blocks the 16-wide kernel.
+// Arguments are (w, h, wf, log2(w), log2(h), opt, cast).
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+// Weighted blend of eight 16-bit lanes:
+//   result = (p0 * w0 + p1 * w1 + r) >> DIST_PRECISION_BITS
+// The products keep only their low 16 bits (_mm_mullo_epi16) and both
+// additions saturate as unsigned 16-bit values. The eight results are written
+// to `result` with an unaligned 128-bit store.
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+                                                    const __m128i *w0,
+                                                    const __m128i *w1,
+                                                    const __m128i *r,
+                                                    void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  const __m128i weighted0 = _mm_mullo_epi16(*p0, *w0);
+  const __m128i weighted1 = _mm_mullo_epi16(*p1, *w1);
+  const __m128i biased =
+      _mm_adds_epu16(_mm_adds_epu16(weighted0, weighted1), *r);
+
+  xx_storeu_128(result, _mm_srli_epi16(biased, DIST_PRECISION_BITS));
+}
+
+// Builds the distance-weighted compound prediction
+//   comp_pred = (ref * fwd_offset + pred * bck_offset + round)
+//               >> DIST_PRECISION_BITS
+// for a width x height block. pred and comp_pred are walked contiguously
+// (8 samples per step); ref is read with a row pitch of ref_stride.
+// Widths of 8 or more are processed 8 samples at a time, one row after the
+// other. Narrower widths (asserted to be a multiple of 4) take 4 samples from
+// each of two consecutive ref rows per step.
+void aom_highbd_dist_wtd_comp_avg_pred_sse2(
+    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+    const uint8_t *ref8, int ref_stride,
+    const DIST_WTD_COMP_PARAMS *jcp_param) {
+  const __m128i w0 = _mm_set1_epi16((int16_t)jcp_param->fwd_offset);
+  const __m128i w1 = _mm_set1_epi16((int16_t)jcp_param->bck_offset);
+  const __m128i r =
+      _mm_set1_epi16((int16_t)((1 << DIST_PRECISION_BITS) >> 1));
+  uint16_t *out = CONVERT_TO_SHORTPTR(comp_pred8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width < 8) {
+    // Read 4 pixels two rows at a time
+    assert(!(width & 3));
+    for (int row = 0; row < height; row += 2) {
+      const __m128i upper = xx_loadl_64(ref);
+      const __m128i lower = xx_loadl_64(ref + ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(upper, lower);
+      __m128i p1 = xx_loadu_128(pred);
+
+      highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, out);
+
+      out += 8;
+      pred += 8;
+      ref += 2 * ref_stride;
+    }
+    return;
+  }
+
+  // Read 8 pixels one row at a time
+  assert(!(width & 7));
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 8) {
+      __m128i p0 = xx_loadu_128(ref);
+      __m128i p1 = xx_loadu_128(pred);
+
+      highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, out);
+
+      out += 8;
+      pred += 8;
+      ref += 8;
+    }
+    // ref advanced by `width` inside the row; step to the next row start.
+    ref += ref_stride - width;
+  }
+}
+
+// Returns the sum of squared differences between a 4-wide, h-tall block of
+// 16-bit samples at src (row pitch sstride) and at dst (row pitch dstride).
+// Two rows are handled per iteration, so rows i and i + 1 are both read for
+// every even i < h.
+// NOTE(review): the difference is formed with a wrapping 16-bit subtract and
+// squared as a signed 16-bit value; this presumably relies on |src - dst|
+// fitting in int16_t - confirm against the callers.
+uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i acc = _mm_setzero_si128();
+
+  for (int row = 0; row < h; row += 2) {
+    // Pack two 4-sample rows of each input into one register.
+    const __m128i d_top =
+        _mm_loadl_epi64((__m128i const *)(&dst[(row + 0) * dstride]));
+    const __m128i d_bot =
+        _mm_loadl_epi64((__m128i const *)(&dst[(row + 1) * dstride]));
+    const __m128i d = _mm_unpacklo_epi64(d_top, d_bot);
+
+    const __m128i s_top =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 0) * sstride]));
+    const __m128i s_bot =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 1) * sstride]));
+    const __m128i s = _mm_unpacklo_epi64(s_top, s_bot);
+
+    const __m128i diff = _mm_sub_epi16(s, d);
+
+    // Pair every difference with a zero so madd yields diff * diff + 0.
+    const __m128i diff_lo = _mm_unpacklo_epi16(diff, zero);
+    const __m128i diff_hi = _mm_unpackhi_epi16(diff, zero);
+    const __m128i sq_lo = _mm_madd_epi16(diff_lo, diff_lo);
+    const __m128i sq_hi = _mm_madd_epi16(diff_hi, diff_hi);
+
+    // Zero-extend the 32-bit squares to 64 bits and add them up.
+    __m128i partial = _mm_add_epi64(_mm_unpacklo_epi32(sq_lo, zero),
+                                    _mm_unpackhi_epi32(sq_lo, zero));
+    partial = _mm_add_epi64(partial, _mm_unpacklo_epi32(sq_hi, zero));
+    partial = _mm_add_epi64(partial, _mm_unpackhi_epi32(sq_hi, zero));
+    acc = _mm_add_epi64(acc, partial);
+  }
+
+  // Fold the upper 64-bit lane onto the lower one and return it.
+  uint64_t total = 0;
+  xx_storel_64(&total, _mm_add_epi64(acc, _mm_srli_si128(acc, 8)));
+  return total;
+}
+
+// Returns the sum of squared differences between an 8-wide, h-tall block of
+// 16-bit samples at src (row pitch sstride) and at dst (row pitch dstride),
+// one row of 8 samples per iteration (unaligned loads).
+// NOTE(review): the difference is formed with a wrapping 16-bit subtract and
+// squared as a signed 16-bit value; this presumably relies on |src - dst|
+// fitting in int16_t - confirm against the callers.
+uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int h) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i acc = _mm_setzero_si128();
+
+  for (int row = 0; row < h; row++) {
+    const __m128i d = _mm_loadu_si128((__m128i *)&dst[row * dstride]);
+    const __m128i s = _mm_loadu_si128((__m128i *)&src[row * sstride]);
+    const __m128i diff = _mm_sub_epi16(s, d);
+
+    // Pair every difference with a zero so madd yields diff * diff + 0.
+    const __m128i diff_lo = _mm_unpacklo_epi16(diff, zero);
+    const __m128i diff_hi = _mm_unpackhi_epi16(diff, zero);
+    const __m128i sq_lo = _mm_madd_epi16(diff_lo, diff_lo);
+    const __m128i sq_hi = _mm_madd_epi16(diff_hi, diff_hi);
+
+    // Zero-extend the 32-bit squares to 64 bits and add them up.
+    __m128i partial = _mm_add_epi64(_mm_unpacklo_epi32(sq_lo, zero),
+                                    _mm_unpackhi_epi32(sq_lo, zero));
+    partial = _mm_add_epi64(partial, _mm_unpacklo_epi32(sq_hi, zero));
+    partial = _mm_add_epi64(partial, _mm_unpackhi_epi32(sq_hi, zero));
+    acc = _mm_add_epi64(acc, partial);
+  }
+
+  // Fold the upper 64-bit lane onto the lower one and return it.
+  uint64_t total = 0;
+  xx_storel_64(&total, _mm_add_epi64(acc, _mm_srli_si128(acc, 8)));
+  return total;
+}
+
+// Width dispatcher for the 16-bit sum-of-squared-differences helpers: w == 4
+// goes to the 4xh routine, w == 8 to the 8xh routine. Debug builds assert
+// that w and h are each 4 or 8. With assertions compiled out, any other
+// width returns (uint64_t)-1.
+uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                       uint16_t *src, int sstride, int w,
+                                       int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  if (w == 4) {
+    return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+  }
+  if (w == 8) {
+    return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+  }
+  assert(0 && "unsupported width");
+  return -1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000000..df5449a9df
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
+// Computes, for a 4x4 block of 16-bit samples, the sum of (a - b) into *sum
+// and the sum of (a - b)^2 into *sse. a8/b8 are converted with
+// CONVERT_TO_SHORTPTR and read with row pitches a_stride/b_stride.
+// Both totals are reduced in 32-bit lanes and widened only when stored.
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+                                         const uint8_t *b8, int b_stride,
+                                         uint64_t *sse, int64_t *sum) {
+  const __m128i ones = _mm_set1_epi16((int16_t)1);
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  const __m128i a_row0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+  const __m128i a_row1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+  const __m128i a_row2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+  const __m128i a_row3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+  const __m128i b_row0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+  const __m128i b_row1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+  const __m128i b_row2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+  const __m128i b_row3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+  // Interleave row pairs so each register carries eight samples; a and b use
+  // the same interleave, so lanes still line up for the subtraction.
+  const __m128i diff01 = _mm_sub_epi16(_mm_unpacklo_epi16(a_row0, a_row1),
+                                       _mm_unpacklo_epi16(b_row0, b_row1));
+  const __m128i diff23 = _mm_sub_epi16(_mm_unpacklo_epi16(a_row2, a_row3),
+                                       _mm_unpacklo_epi16(b_row2, b_row3));
+
+  // Sum of differences: madd against 1 adds adjacent pairs into 32 bits,
+  // then three horizontal adds collapse everything into lane 0.
+  __m128i total = _mm_hadd_epi32(_mm_madd_epi16(diff01, ones),
+                                 _mm_madd_epi16(diff23, ones));
+  total = _mm_hadd_epi32(total, total);
+  total = _mm_hadd_epi32(total, total);
+
+  // Sum of squared differences, reduced the same way.
+  __m128i squares = _mm_hadd_epi32(_mm_madd_epi16(diff01, diff01),
+                                   _mm_madd_epi16(diff23, diff23));
+  squares = _mm_hadd_epi32(squares, squares);
+  squares = _mm_hadd_epi32(squares, squares);
+
+  *sse = (uint64_t)_mm_extract_epi32(squares, 0);
+  *sum = (int64_t)_mm_extract_epi32(total, 0);
+}
+
+// 4x4 variance with no bit-depth scaling: stores the sum of squared
+// differences in *sse and returns sse - sum^2 / 16, clamped at 0.
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+                                         const uint8_t *b, int b_stride,
+                                         uint32_t *sse) {
+  uint64_t sse64;
+  int64_t sum;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &sse64, &sum);
+  *sse = (uint32_t)sse64;
+
+  const int64_t var = (int64_t)*sse - ((sum * sum) >> 4);
+  if (var < 0) return 0;
+  return (uint32_t)var;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+// 4x4 sub-pixel variance, 8-bit flavour. src is filtered in two bilinear
+// passes (bilinear_filters_2t[xoffset], then bilinear_filters_2t[yoffset])
+// into a local 4x4 buffer, which is compared against dst through
+// aom_highbd_8_variance4x4().
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  const uint32_t var = aom_highbd_8_variance4x4(
+      CONVERT_TO_BYTEPTR(second_pass), 4, dst, dst_stride, sse);
+  return var;
+}
+
+// 4x4 sub-pixel variance, 10-bit flavour. src is filtered in two bilinear
+// passes (bilinear_filters_2t[xoffset], then bilinear_filters_2t[yoffset])
+// into a local 4x4 buffer, which is compared against dst through
+// aom_highbd_10_variance4x4().
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  const uint32_t var = aom_highbd_10_variance4x4(
+      CONVERT_TO_BYTEPTR(second_pass), 4, dst, dst_stride, sse);
+  return var;
+}
+
+// 4x4 sub-pixel variance, 12-bit flavour. src is filtered in two bilinear
+// passes (bilinear_filters_2t[xoffset], then bilinear_filters_2t[yoffset])
+// into a local 4x4 buffer, which is compared against dst through
+// aom_highbd_12_variance4x4().
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  const uint32_t var = aom_highbd_12_variance4x4(
+      CONVERT_TO_BYTEPTR(second_pass), 4, dst, dst_stride, sse);
+  return var;
+}
+
+// Sub-pixel average
+
+// 4x4 sub-pixel average variance, 8-bit flavour. src is bilinearly filtered
+// in two passes, the result is combined with second_pred by
+// aom_highbd_comp_avg_pred() into a 16-byte-aligned buffer, and that buffer
+// is compared against dst through aom_highbd_8_variance4x4().
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse,
+    const uint8_t *second_pred) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, averaged[4 * 4]);
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(averaged), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(second_pass), 4);
+
+  const uint32_t var = aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(averaged),
+                                                4, dst, dst_stride, sse);
+  return var;
+}
+
+// 4x4 sub-pixel average variance, 10-bit flavour. src is bilinearly filtered
+// in two passes, the result is combined with second_pred by
+// aom_highbd_comp_avg_pred() into a 16-byte-aligned buffer, and that buffer
+// is compared against dst through aom_highbd_10_variance4x4().
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse,
+    const uint8_t *second_pred) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, averaged[4 * 4]);
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(averaged), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(second_pass), 4);
+
+  const uint32_t var = aom_highbd_10_variance4x4(
+      CONVERT_TO_BYTEPTR(averaged), 4, dst, dst_stride, sse);
+  return var;
+}
+
+// 4x4 sub-pixel average variance, 12-bit flavour. src is bilinearly filtered
+// in two passes, the result is combined with second_pred by
+// aom_highbd_comp_avg_pred() into a 16-byte-aligned buffer, and that buffer
+// is compared against dst through aom_highbd_12_variance4x4().
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, uint32_t *sse,
+    const uint8_t *second_pred) {
+  uint16_t first_pass[(4 + 1) * 4];
+  uint16_t second_pass[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, averaged[4 * 4]);
+
+  aom_highbd_var_filter_block2d_bil_first_pass(
+      src, first_pass, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+  aom_highbd_var_filter_block2d_bil_second_pass(
+      first_pass, second_pass, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);
+
+  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(averaged), second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(second_pass), 4);
+
+  const uint32_t var = aom_highbd_12_variance4x4(
+      CONVERT_TO_BYTEPTR(averaged), 4, dst, dst_stride, sse);
+  return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..0eb632326b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -0,0 +1,608 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+; Read-only constants, addressed through GLOBAL() by the predictors below.
+SECTION_RODATA
+pb_1: times 16 db 1
+; pw_N: rounding term added before the shift in dc_predictor_NxN, which
+; averages N above bytes plus N left bytes (pw_N = N = half of 2N).
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+; dc_128: sixteen bytes of 128, the fill value of the dc_128 predictors.
+dc_128: times 16 db 128
+; pw2_N: rounding term for the single-edge (dc_top / dc_left) predictors of
+; size N, which average N bytes (pw2_N = N / 2).
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+; dc_predictor_4x4(dst, stride, above, left)
+; Fills the 4x4 byte block at dst (rows `stride` bytes apart) with the rounded
+; mean of the 4 bytes at above and the 4 bytes at left: (sum + 4) >> 3.
+; goffset is the scratch register handed to GET_GOT so that GLOBAL() can
+; address the constants (macros defined outside this chunk).
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+; m0 = above[0..3] : left[0..3]; psadbw against zero sums those 8 bytes.
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+; Broadcast the average word, then narrow words to bytes for the stores.
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_left_predictor_4x4(dst, stride, above, left)
+; Fills the 4x4 byte block at dst with the rounded mean of the 4 bytes at
+; left: (sum + 2) >> 2. `above` is not read.
+; Only dst and stride are auto-loaded (num_args = 2); left is fetched
+; explicitly with movifnidn before GET_GOT.
+; NOTE(review): presumably placed before GET_GOT because that macro may
+; adjust the stack on 32-bit PIC builds - confirm against x86inc.
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+; psadbw against zero sums the 4 loaded bytes (upper bytes of m0 are zero).
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+; Broadcast the average word, then narrow words to bytes for the stores.
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_top_predictor_4x4(dst, stride, above, left)
+; Fills the 4x4 byte block at dst with the rounded mean of the 4 bytes at
+; above: (sum + 2) >> 2. `left` is not read (only 3 args are loaded).
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+; psadbw against zero sums the 4 loaded bytes (upper bytes of m0 are zero).
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+; Broadcast the average word, then narrow words to bytes for the stores.
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_predictor_8x8(dst, stride, above, left)
+; Fills the 8x8 byte block at dst with the rounded mean of the 8 bytes at
+; above and the 8 bytes at left: (sum + 8) >> 4.
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+; above/left are consumed; their registers are reused (stride3 = 3 * stride).
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+; Duplicate the average byte into a word, then broadcast it to 8 bytes.
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_top_predictor_8x8(dst, stride, above, left)
+; Fills the 8x8 byte block at dst with the rounded mean of the 8 bytes at
+; above: (sum + 4) >> 3. `left` is not read (only 3 args are loaded).
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+; above is consumed; its register is reused as stride3 = 3 * stride.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+; Duplicate the average byte into a word, then broadcast it to 8 bytes.
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_left_predictor_8x8(dst, stride, above, left)
+; Fills the 8x8 byte block at dst with the rounded mean of the 8 bytes at
+; left: (sum + 4) >> 3. `above` is not read.
+; Only dst and stride are auto-loaded (num_args = 2); left is fetched
+; explicitly with movifnidn before GET_GOT.
+; NOTE(review): presumably placed before GET_GOT because that macro may
+; adjust the stack on 32-bit PIC builds - confirm against x86inc.
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+; From here on the third register is stride3 = 3 * stride.
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+; Duplicate the average byte into a word, then broadcast it to 8 bytes.
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+; dc_128_predictor_4x4(dst, stride, above, left)
+; Fills the 4x4 byte block at dst with the constant 128. Neither above nor
+; left is read; only dst and stride are loaded (num_args = 2).
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+; Four bytes of 128, stored once per row.
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+; dc_128_predictor_8x8(dst, stride, above, left)
+; Fills the 8x8 byte block at dst with the constant 128. Neither above nor
+; left is read; only dst and stride are loaded (num_args = 2).
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+; Eight bytes of 128, stored once per row.
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+; dc_predictor_16x16(dst, stride, above, left)
+; Fills the 16x16 byte block at dst with the rounded mean of the 16 bytes at
+; above and the 16 bytes at left: (sum + 16) >> 5.
+; NOTE(review): loads and stores use mova, so above, left and every dst row
+; are presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+; above/left are consumed; reuse their registers as 3 * stride and the
+; loop counter (4 iterations of 4 rows).
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+; psadbw leaves one partial sum per 64-bit half; movhlps folds the high
+; half onto the low one.
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+; dc_top_predictor_16x16(dst, stride, above, left)
+; Fills the 16x16 byte block at dst with the rounded mean of the 16 bytes at
+; above: (sum + 8) >> 4. `left` is not read, so only three arguments are
+; loaded, matching dc_top_predictor_4x4 / dc_top_predictor_8x8; the fourth
+; register is used purely as the loop counter.
+; NOTE(review): loads and stores use mova, so above and every dst row are
+; presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 3, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+; above is consumed; reuse its register as 3 * stride. lines4 counts 4
+; iterations of 4 rows.
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+; psadbw leaves one partial sum per 64-bit half; movhlps folds the high
+; half onto the low one.
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; dc_left_predictor_16x16(dst, stride, above, left)
+; Fills the 16x16 byte block at dst with the rounded mean of the 16 bytes at
+; left: (sum + 8) >> 4. `above` is loaded as an argument but never read.
+; NOTE(review): loads and stores use mova, so left and every dst row are
+; presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+; left is consumed; reuse the argument registers as 3 * stride and the
+; loop counter (4 iterations of 4 rows).
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+; psadbw leaves one partial sum per 64-bit half; movhlps folds the high
+; half onto the low one.
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; dc_128_predictor_16x16(dst, stride, above, left)
+; Fills the 16x16 byte block at dst with the constant 128. Neither above nor
+; left is read and only m0 is used, so the declaration loads two arguments
+; and one vector register, matching dc_128_predictor_4x4 / 8x8.
+; NOTE(review): stores use mova, so every dst row is presumably required to
+; be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+; stride3 = 3 * stride; lines4 counts 4 iterations of 4 rows.
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+; dc_predictor_32x32(dst, stride, above, left)
+; Fills the 32x32 byte block at dst with the rounded mean of the 32 bytes at
+; above and the 32 bytes at left: (sum + 32) >> 6.
+; NOTE(review): loads and stores use mova, so above, left and every dst row
+; are presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+; above/left are consumed; reuse their registers as 3 * stride and the
+; loop counter (8 iterations of 4 rows).
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+; Each psadbw leaves one partial sum per 64-bit half; add the four
+; registers, then fold the high half onto the low one with movhlps.
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; dc_top_predictor_32x32(dst, stride, above, left)
+; Fills the 32x32 byte block at dst with the rounded mean of the 32 bytes at
+; above: (sum + 16) >> 5. `left` is not read and only m0-m2 are used, so the
+; declaration loads three arguments and three vector registers, in line with
+; the smaller dc_top predictors.
+; NOTE(review): loads and stores use mova, so above and every dst row are
+; presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 3, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+; above is consumed; reuse its register as 3 * stride. lines4 counts 8
+; iterations of 4 rows.
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+; Each psadbw leaves one partial sum per 64-bit half; add both registers,
+; then fold the high half onto the low one with movhlps.
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+; dc_left_predictor_32x32(dst, stride, above, left)
+; Fills the 32x32 byte block at dst with the rounded mean of the 32 bytes at
+; left: (sum + 16) >> 5. `above` is loaded as an argument but never read.
+; NOTE(review): five vector registers are declared but only m0-m2 are
+; referenced in this routine.
+; NOTE(review): loads and stores use mova, so left and every dst row are
+; presumably required to be 16-byte aligned - confirm.
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+; left is consumed; reuse the argument registers as 3 * stride and the
+; loop counter (8 iterations of 4 rows).
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+; Each psadbw leaves one partial sum per 64-bit half; add both registers,
+; then fold the high half onto the low one with movhlps.
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+; Broadcast the average word to all 8 words, then narrow to 16 bytes.
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset ; fill 32x32 at dst with the constant vector dc_128; above/left are not read
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4 ; re-label the argument registers for the store loop
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ mov lines4d, 8 ; 8 iterations x 4 rows = 32 rows
+ mova m0, [GLOBAL(dc_128)] ; dc_128 is defined outside this view; presumably 16 bytes of 128
+.loop: ; each pass writes four 32-byte rows
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above ; copy the 4 bytes at above into each of 4 rows of dst
+ movd m0, [aboveq] ; m0 = above[0..3]
+ movd [dstq ], m0 ; row 0
+ movd [dstq+strideq], m0 ; row 1
+ lea dstq, [dstq+strideq*2] ; advance two rows
+ movd [dstq ], m0 ; row 2
+ movd [dstq+strideq], m0 ; row 3
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above ; copy the 8 bytes at above into each of 8 rows of dst
+ movq m0, [aboveq] ; m0 = above[0..7]
+ DEFINE_ARGS dst, stride, stride3 ; above is loaded; its register becomes stride3
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ movq [dstq ], m0 ; rows 0..3
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ movq [dstq ], m0 ; rows 4..7
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above ; copy the 16 bytes at above into each of 16 rows of dst
+ mova m0, [aboveq] ; m0 = above[0..15]
+ DEFINE_ARGS dst, stride, stride3, nlines4 ; above is loaded; re-label the registers for the store loop
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ mov nlines4d, 4 ; 4 iterations x 4 rows = 16 rows
+.loop: ; each pass writes four 16-byte rows
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above ; copy the 32 bytes at above into each of 32 rows of dst
+ mova m0, [aboveq] ; m0 = above[0..15]
+ mova m1, [aboveq+16] ; m1 = above[16..31]
+ DEFINE_ARGS dst, stride, stride3, nlines4 ; above is loaded; re-label the registers for the store loop
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ mov nlines4d, 8 ; 8 iterations x 4 rows = 32 rows
+.loop: ; each pass writes four 32-byte rows (m0 = left half, m1 = right half)
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left ; row r of the 4x4 block is filled with left[r]
+ movifnidn leftq, leftmp ; only dst/stride are auto-loaded; fetch left unless it is already in its register
+ movd m0, [leftq] ; m0 = l1 l2 l3 l4 (bytes)
+ punpcklbw m0, m0 ; each byte doubled
+ punpcklbw m0, m0 ; each byte x4: dword r = left[r] repeated 4 times
+ pshufd m1, m0, 0x1 ; low dword of m1 = l2 x4
+ movd [dstq ], m0 ; row 0 = l1 x4
+ movd [dstq+strideq], m1 ; row 1 = l2 x4
+ pshufd m2, m0, 0x2 ; low dword of m2 = l3 x4
+ lea dstq, [dstq+strideq*2] ; advance two rows
+ pshufd m3, m0, 0x3 ; low dword of m3 = l4 x4
+ movd [dstq ], m2 ; row 2
+ movd [dstq+strideq], m3 ; row 3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left ; row r of the 8x8 block is filled with left[r]
+ movifnidn leftq, leftmp ; only dst/stride are auto-loaded; fetch left unless it is already in its register
+ mov lineq, -2 ; counts up to zero: 2 iterations x 4 rows = 8 rows
+ DEFINE_ARGS dst, stride, line, left, stride3 ; add the stride3 register name
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+ movq m0, [leftq ] ; m0 = l1..l8 (bytes)
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop: ; each pass emits four rows from the low four words of m0
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa ; word 2 replicated: third row of this group
+ pshuflw m2, m0, 0xff ; word 3 replicated: fourth row of this group
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq ; sets ZF on the final pass
+ lea dstq, [dstq+strideq*4] ; lea leaves flags untouched, so jnz still tests the inc
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left ; row r of the 16x16 block is filled with left[r]
+ movifnidn leftq, leftmp ; only dst/stride are auto-loaded; fetch left unless it is already in its register
+ mov lineq, -4 ; counts up to zero: 4 iterations x 4 rows = 16 rows
+ DEFINE_ARGS dst, stride, line, left, stride3 ; add the stride3 register name
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+.loop: ; each pass consumes 4 left bytes and writes 4 rows
+ movd m0, [leftq] ; next four left pixels
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa ; l3 repeated 16 times
+ pshufd m2, m0, 0xff ; l4 repeated 16 times
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq ; sets ZF on the final pass
+ lea leftq, [leftq+4 ] ; lea leaves flags untouched, so jnz still tests the inc
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left ; row r of the 32x32 block is filled with left[r]
+ movifnidn leftq, leftmp ; only dst/stride are auto-loaded; fetch left unless it is already in its register
+ mov lineq, -8 ; counts up to zero: 8 iterations x 4 rows = 32 rows
+ DEFINE_ARGS dst, stride, line, left, stride3 ; add the stride3 register name
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride
+.loop: ; each pass consumes 4 left bytes and writes 4 rows of 32 bytes
+ movd m0, [leftq] ; next four left pixels
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa ; l3 repeated 16 times
+ pshufd m2, m0, 0xff ; l4 repeated 16 times
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq ; sets ZF on the final pass
+ lea leftq, [leftq+4 ] ; lea leaves flags untouched, so jnz still tests the inc
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
new file mode 100644
index 0000000000..242a548df9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -0,0 +1,4707 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Adds up the 64 bytes starting at |ref| (unaligned loads). The total is left
+// in the low 16-bit element of each 128-bit lane of the returned vector; the
+// remaining elements are by-products of the reduction.
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+  const __m256i *const src = (const __m256i *)ref;
+  const __m256i zero = _mm256_setzero_si256();
+  // SAD against zero yields one 64-bit byte-sum per group of 8 input bytes.
+  const __m256i sad_first = _mm256_sad_epu8(_mm256_loadu_si256(src), zero);
+  const __m256i sad_second = _mm256_sad_epu8(_mm256_loadu_si256(src + 1), zero);
+  const __m256i partial = _mm256_add_epi64(sad_first, sad_second);
+  // Fold the two 128-bit lanes onto each other ...
+  const __m256i swapped = _mm256_permute2x128_si256(partial, partial, 1);
+  const __m256i lanes = _mm256_add_epi64(swapped, partial);
+  // ... then fold the upper quadword of each lane onto the lower one.
+  return _mm256_add_epi16(lanes, _mm256_unpackhi_epi64(lanes, lanes));
+}
+
+// Adds up the 32 bytes starting at |ref| (unaligned load). The total is left
+// in the low 16-bit element of each 128-bit lane of the returned vector.
+static INLINE __m256i dc_sum_32(const uint8_t *ref) {
+  const __m256i pixels = _mm256_loadu_si256((const __m256i *)ref);
+  // SAD against zero yields one 64-bit byte-sum per group of 8 input bytes.
+  const __m256i partial = _mm256_sad_epu8(pixels, _mm256_setzero_si256());
+  // Fold the two 128-bit lanes, then the two quadwords within each lane.
+  const __m256i swapped = _mm256_permute2x128_si256(partial, partial, 1);
+  const __m256i lanes = _mm256_add_epi64(swapped, partial);
+  return _mm256_add_epi16(lanes, _mm256_unpackhi_epi64(lanes, lanes));
+}
+
+// Writes the 32-byte vector |*r| to |height| consecutive rows of |dst|, rows
+// being |stride| bytes apart. Stores are unaligned.
+static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  uint8_t *out = dst;
+  int rows_left = height;
+  while (rows_left > 0) {
+    _mm256_storeu_si256((__m256i *)out, *r);
+    out += stride;
+    --rows_left;
+  }
+}
+
+// Writes a 64-byte row made of |*r0| (bytes 0..31) and |*r1| (bytes 32..63)
+// to |height| consecutive rows of |dst|, rows being |stride| bytes apart.
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+                                    int height, uint8_t *dst,
+                                    ptrdiff_t stride) {
+  uint8_t *out = dst;
+  int rows_left = height;
+  while (rows_left > 0) {
+    __m256i *const row = (__m256i *)out;
+    _mm256_storeu_si256(row, *r0);
+    _mm256_storeu_si256(row + 1, *r1);
+    out += stride;
+    --rows_left;
+  }
+}
+
+// Writes the 32-byte vector |*r| twice per row (bytes 0..31 and 32..63) for
+// |height| consecutive rows of |dst|, rows being |stride| bytes apart.
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  uint8_t *out = dst;
+  int rows_left = height;
+  while (rows_left > 0) {
+    __m256i *const row = (__m256i *)out;
+    _mm256_storeu_si256(row, *r);
+    _mm256_storeu_si256(row + 1, *r);
+    out += stride;
+    --rows_left;
+  }
+}
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {  // byte-index tables, presumably byte-shuffle controls for 8 x 16-bit pixels (users are outside this view)
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },  // row 0: identity
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },  // row k: pixels moved up by k slots, pixel 0 (bytes 0,1) repeated in the k vacated leading slots
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },  // row 7: pixel 0 in slots 0..6, pixel 1 in slot 7... see bytes: all slots hold bytes 0,1
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {  // byte-index tables, presumably byte-shuffle controls for 16-bit pixels (users are outside this view)
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },  // row 0: even-indexed pixels (0,2,4,6) first, then odd-indexed (1,3,5,7)
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },  // rows 1-3: offset variants with pixel 0 padded in front; NOTE(review): exact intent not visible here
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {  // 32-byte rows of byte indices over 16 x 16-bit pixels; NOTE(review): rows are 32 bytes but the table is only 16-byte aligned
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,  // row 0, first half: even-indexed pixels
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },  // row 0, second half: odd-indexed pixels
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,  // rows 1-7: offset variants with pixel 0 padded in front; exact intent not visible here
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
+ 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
+ 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
+ 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
+};
+
+static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {  // row n (0..16): first n 16-bit elements are 0xffff, the remaining 16-n are 0
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // n = 0: nothing selected
+ { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+ 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+ 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }  // n = 16: everything selected
+};
+
+static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {  // transposes the low four 16-bit elements of x[0..15] (16 rows x 4 cols) into d[0..7]
+ __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+
+ r0 = _mm_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13 (row col)
+ r1 = _mm_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
+ r2 = _mm_unpacklo_epi16(x[4], x[5]);  // same pattern for rows 4,5
+ r3 = _mm_unpacklo_epi16(x[6], x[7]);  // rows 6,7
+
+ r4 = _mm_unpacklo_epi16(x[8], x[9]);  // rows 8..15 follow the same scheme
+ r5 = _mm_unpacklo_epi16(x[10], x[11]);
+ r6 = _mm_unpacklo_epi16(x[12], x[13]);
+ r7 = _mm_unpacklo_epi16(x[14], x[15]);
+
+ r8 = _mm_unpacklo_epi32(r0, r1);  // 00 10 20 30 01 11 21 31
+ r9 = _mm_unpackhi_epi32(r0, r1);  // 02 12 22 32 03 13 23 33
+ r10 = _mm_unpacklo_epi32(r2, r3);  // 40 50 60 70 41 51 61 71
+ r11 = _mm_unpackhi_epi32(r2, r3);  // 42 52 62 72 43 53 63 73
+
+ r12 = _mm_unpacklo_epi32(r4, r5);  // cols 0,1 of rows 8..11
+ r13 = _mm_unpackhi_epi32(r4, r5);  // cols 2,3 of rows 8..11
+ r14 = _mm_unpacklo_epi32(r6, r7);  // cols 0,1 of rows 12..15
+ r15 = _mm_unpackhi_epi32(r6, r7);  // cols 2,3 of rows 12..15
+
+ r0 = _mm_unpacklo_epi64(r8, r9);  // col 0 rows 0..3 | col 2 rows 0..3
+ r1 = _mm_unpackhi_epi64(r8, r9);  // col 1 rows 0..3 | col 3 rows 0..3
+ r2 = _mm_unpacklo_epi64(r10, r11);  // col 0 rows 4..7 | col 2 rows 4..7
+ r3 = _mm_unpackhi_epi64(r10, r11);  // col 1 rows 4..7 | col 3 rows 4..7
+
+ r4 = _mm_unpacklo_epi64(r12, r13);  // col 0 rows 8..11 | col 2 rows 8..11
+ r5 = _mm_unpackhi_epi64(r12, r13);  // col 1 rows 8..11 | col 3 rows 8..11
+ r6 = _mm_unpacklo_epi64(r14, r15);  // col 0 rows 12..15 | col 2 rows 12..15
+ r7 = _mm_unpackhi_epi64(r14, r15);  // col 1 rows 12..15 | col 3 rows 12..15
+
+ d[0] = _mm_unpacklo_epi64(r0, r2);  // col 0, rows 0..7
+ d[1] = _mm_unpacklo_epi64(r4, r6);  // col 0, rows 8..15
+ d[2] = _mm_unpacklo_epi64(r1, r3);  // col 1, rows 0..7
+ d[3] = _mm_unpacklo_epi64(r5, r7);  // col 1, rows 8..15
+
+ d[4] = _mm_unpackhi_epi64(r0, r2);  // col 2, rows 0..7
+ d[5] = _mm_unpackhi_epi64(r4, r6);  // col 2, rows 8..15
+ d[6] = _mm_unpackhi_epi64(r1, r3);  // col 3, rows 0..7
+ d[7] = _mm_unpackhi_epi64(r5, r7);  // col 3, rows 8..15
+}
+
+static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {  // reads rows x[0..3]; lane comments below are "row col" per 128-bit lane
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
+ w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 04 14 24 34 05 15 25 35
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 04 14 24 34
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 05 15 25 35
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 06 16 26 36 07 17 27 37
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 06 16 26 36
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 07 17 27 37
+}
+
+static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {  // transposes rows x[0..7] into d[0..7], independently per 128-bit lane ("row col" notation)
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {  // 16x16 transpose of 16-bit elements: two per-lane 8x16 passes, then a 128-bit lane exchange
+ __m256i w0, w1, w2, w3, ww0, ww1;
+ __m256i dd[16];  // per-lane transposed rows, before the lane exchange
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // rows 0..7, per-lane cols 0..3
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // rows 0..3, per-lane cols 0,1
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // rows 4..7, per-lane cols 0,1
+
+ dd[0] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 0 of rows 0..7
+ dd[1] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 1 of rows 0..7
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // rows 0..3, per-lane cols 2,3
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // rows 4..7, per-lane cols 2,3
+
+ dd[2] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 2 of rows 0..7
+ dd[3] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 3 of rows 0..7
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // rows 0..7, per-lane cols 4..7
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // rows 0..3, per-lane cols 4,5
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // rows 4..7, per-lane cols 4,5
+
+ dd[4] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 4 of rows 0..7
+ dd[5] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 5 of rows 0..7
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // rows 0..3, per-lane cols 6,7
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // rows 4..7, per-lane cols 6,7
+
+ dd[6] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 6 of rows 0..7
+ dd[7] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 7 of rows 0..7
+
+ w0 = _mm256_unpacklo_epi16(x[8], x[9]);  // second pass: same network on rows 8..15
+ w1 = _mm256_unpacklo_epi16(x[10], x[11]);
+ w2 = _mm256_unpacklo_epi16(x[12], x[13]);
+ w3 = _mm256_unpacklo_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[8] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 0 of rows 8..15
+ dd[9] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 1 of rows 8..15
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[10] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 2 of rows 8..15
+ dd[11] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 3 of rows 8..15
+
+ w0 = _mm256_unpackhi_epi16(x[8], x[9]);  // rows 8..15, per-lane cols 4..7
+ w1 = _mm256_unpackhi_epi16(x[10], x[11]);
+ w2 = _mm256_unpackhi_epi16(x[12], x[13]);
+ w3 = _mm256_unpackhi_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[12] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 4 of rows 8..15
+ dd[13] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 5 of rows 8..15
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[14] = _mm256_unpacklo_epi64(ww0, ww1);  // per-lane col 6 of rows 8..15
+ dd[15] = _mm256_unpackhi_epi64(ww0, ww1);  // per-lane col 7 of rows 8..15
+
+ for (int i = 0; i < 8; i++) {  // lane exchange: pair dd[i] (rows 0..7) with dd[i + 8] (rows 8..15)
+ d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);  // low lanes of both: col i, rows 0..15
+ d[i + 8] = _mm256_insertf128_si256(dd[i + 8],  // high lanes of both: col i + 8, rows 0..15
+ _mm256_extracti128_si256(dd[i], 1), 0);
+ }
+}
+
+// DC prediction, 32x32: every output byte is the rounded mean of the 32 bytes
+// of |above| and the 32 bytes of |left|.
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i top_total = dc_sum_32(above);
+  const __m256i edge_total = _mm256_add_epi16(dc_sum_32(left), top_total);
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased = _mm256_add_epi16(edge_total, _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 32, dst, stride);
+}
+
+// DC_TOP prediction, 32x32: every output byte is the rounded mean of the 32
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 32 contributing pixels: (sum + 16) >> 5.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_32(above), _mm256_set1_epi16(16));
+  const __m256i mean = _mm256_srai_epi16(biased, 5);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 32, dst, stride);
+}
+
+// DC_LEFT prediction, 32x32: every output byte is the rounded mean of the 32
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 32 contributing pixels: (sum + 16) >> 5.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_32(left), _mm256_set1_epi16(16));
+  const __m256i mean = _mm256_srai_epi16(biased, 5);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 32, dst, stride);
+}
+
+// DC_128 prediction, 32x32: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_32xh(&mid_level, 32, dst, stride);
+}
+
+// Vertical prediction, 32x32: each of the 32 rows is a copy of the 32 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
+  row_store_32xh(&top_row, 32, dst, stride);
+}
+
+// There are 32 rows altogether. One call writes four consecutive rows starting
+// at |dst| and the four rows 16 rows further down: on the first call rows
+// 0,1,2,3 and 16,17,18,19, on the next 4,5,6,7 and 20,21,22,23, and so on, so
+// four calls cover all 32 rows.
+// |row| is byte-shuffled with controls 0, 4, 8 and 12; the low 128-bit lane of
+// each result feeds the upper row and the high lane feeds the lower row.
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
+                                        ptrdiff_t stride) {
+  const __m256i step = _mm256_set1_epi8(4);
+  __m256i ctrl = _mm256_setzero_si256();
+  uint8_t *upper = dst;
+  uint8_t *lower = dst + (stride << 4);
+
+  for (int r = 0; r < 4; ++r) {
+    const __m256i picked = _mm256_shuffle_epi8(*row, ctrl);
+    // Duplicate the low lane (upper row) and the high lane (lower row).
+    const __m256i upper_px = _mm256_permute2x128_si256(picked, picked, 0);
+    const __m256i lower_px = _mm256_permute2x128_si256(picked, picked, 0x11);
+    _mm256_storeu_si256((__m256i *)upper, upper_px);
+    _mm256_storeu_si256((__m256i *)lower, lower_px);
+    upper += stride;
+    lower += stride;
+    ctrl = _mm256_add_epi8(ctrl, step);
+  }
+}
+
+// Horizontal prediction, 32x32: row r is filled with left[r]; |above| is
+// ignored. The 32 left bytes are widened so that every byte appears four times
+// in a row, and each widened vector is handed to h_predictor_32x8line(), which
+// writes four rows plus the four rows 16 further down.
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
+  // First doubling: every left byte appears twice.
+  const __m256i doubled[2] = { _mm256_unpacklo_epi8(left_col, left_col),
+                               _mm256_unpackhi_epi8(left_col, left_col) };
+  uint8_t *out = dst;
+
+  for (int half = 0; half < 2; ++half) {
+    // Second doubling: every left byte appears four times.
+    __m256i quad = _mm256_unpacklo_epi8(doubled[half], doubled[half]);
+    h_predictor_32x8line(&quad, out, stride);
+    out += stride << 2;
+
+    quad = _mm256_unpackhi_epi8(doubled[half], doubled[half]);
+    h_predictor_32x8line(&quad, out, stride);
+    out += stride << 2;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Rectangle
+// DC prediction, 32x16: every output byte is the rounded mean of the 32 bytes
+// of |above| and the 16 bytes of |left| (48 pixels, so a real division).
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i top_total = dc_sum_32_sse2(above);
+  const __m128i edge_total = _mm_add_epi16(top_total, dc_sum_16_sse2(left));
+  const uint16_t total = (uint16_t)_mm_cvtsi128_si32(edge_total);
+  const uint16_t biased = (uint16_t)(total + 24);
+  const uint16_t mean = (uint16_t)(biased / 48);
+  const __m256i fill = _mm256_set1_epi8((int8_t)mean);
+  row_store_32xh(&fill, 16, dst, stride);
+}
+
+// DC prediction, 32x64: every output byte is the rounded mean of the 32 bytes
+// of |above| and the 64 bytes of |left| (96 pixels, so a real division).
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i top_total = dc_sum_32(above);
+  const __m256i edge_total = _mm256_add_epi16(dc_sum_64(left), top_total);
+  const uint16_t total =
+      (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(edge_total));
+  const uint16_t biased = (uint16_t)(total + 48);
+  const uint16_t mean = (uint16_t)(biased / 96);
+  const __m256i fill = _mm256_set1_epi8((int8_t)mean);
+  row_store_32xh(&fill, 64, dst, stride);
+}
+
+// DC prediction, 64x64: every output byte is the rounded mean of the 64 bytes
+// of |above| and the 64 bytes of |left|.
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i top_total = dc_sum_64(above);
+  const __m256i edge_total = _mm256_add_epi16(dc_sum_64(left), top_total);
+  const uint16_t total =
+      (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(edge_total));
+  const uint16_t biased = (uint16_t)(total + 64);
+  const uint16_t mean = (uint16_t)(biased / 128);
+  const __m256i fill = _mm256_set1_epi8((int8_t)mean);
+  row_store_64xh(&fill, 64, dst, stride);
+}
+
+// DC prediction, 64x32: every output byte is the rounded mean of the 64 bytes
+// of |above| and the 32 bytes of |left| (96 pixels, so a real division).
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i top_total = dc_sum_64(above);
+  const __m256i edge_total = _mm256_add_epi16(dc_sum_32(left), top_total);
+  const uint16_t total =
+      (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(edge_total));
+  const uint16_t biased = (uint16_t)(total + 48);
+  const uint16_t mean = (uint16_t)(biased / 96);
+  const __m256i fill = _mm256_set1_epi8((int8_t)mean);
+  row_store_64xh(&fill, 32, dst, stride);
+}
+
+// DC prediction, 64x16: every output byte is the rounded mean of the 64 bytes
+// of |above| and the 16 bytes of |left| (80 pixels, so a real division).
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i top_total = dc_sum_64(above);
+  // Only the low 32 bits are consumed below, so a plain cast is enough.
+  const __m256i left_total = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+  const __m256i edge_total = _mm256_add_epi16(left_total, top_total);
+  const uint16_t total =
+      (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(edge_total));
+  const uint16_t biased = (uint16_t)(total + 40);
+  const uint16_t mean = (uint16_t)(biased / 80);
+  const __m256i fill = _mm256_set1_epi8((int8_t)mean);
+  row_store_64xh(&fill, 16, dst, stride);
+}
+
+// DC_TOP prediction, 32x16: every output byte is the rounded mean of the 32
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 32 contributing pixels: (sum + 16) >> 5.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_32(above), _mm256_set1_epi16(16));
+  const __m256i mean = _mm256_srai_epi16(biased, 5);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 16, dst, stride);
+}
+
+// DC_TOP prediction, 32x64: every output byte is the rounded mean of the 32
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 32 contributing pixels: (sum + 16) >> 5.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_32(above), _mm256_set1_epi16(16));
+  const __m256i mean = _mm256_srai_epi16(biased, 5);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 64, dst, stride);
+}
+
+// DC_TOP prediction, 64x64: every output byte is the rounded mean of the 64
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_64(above), _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_64xh(&fill, 64, dst, stride);
+}
+
+// DC_TOP prediction, 64x32: every output byte is the rounded mean of the 64
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_64(above), _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_64xh(&fill, 32, dst, stride);
+}
+
+// DC_TOP prediction, 64x16: every output byte is the rounded mean of the 64
+// bytes of |above|; |left| is ignored.
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_64(above), _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_64xh(&fill, 16, dst, stride);
+}
+
+// DC_LEFT prediction, 32x16: every output byte is the rounded mean of the 16
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 16 contributing pixels: (sum + 8) >> 4, computed in 128-bit registers.
+  const __m128i biased =
+      _mm_add_epi16(dc_sum_16_sse2(left), _mm_set1_epi16(8));
+  const __m128i mean = _mm_srai_epi16(biased, 4);
+  // An all-zero shuffle control replicates byte 0 across the register.
+  const __m128i fill128 = _mm_shuffle_epi8(mean, _mm_setzero_si128());
+  // Mirror the 16 bytes into both halves of a 256-bit row.
+  const __m256i fill =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(fill128), fill128, 1);
+  row_store_32xh(&fill, 16, dst, stride);
+}
+
+// DC_LEFT prediction, 32x64: every output byte is the rounded mean of the 64
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_64(left), _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_32xh(&fill, 64, dst, stride);
+}
+
+// DC_LEFT prediction, 64x64: every output byte is the rounded mean of the 64
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 64 contributing pixels: (sum + 32) >> 6.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_64(left), _mm256_set1_epi16(32));
+  const __m256i mean = _mm256_srai_epi16(biased, 6);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_64xh(&fill, 64, dst, stride);
+}
+
+// DC_LEFT prediction, 64x32: every output byte is the rounded mean of the 32
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 32 contributing pixels: (sum + 16) >> 5.
+  const __m256i biased =
+      _mm256_add_epi16(dc_sum_32(left), _mm256_set1_epi16(16));
+  const __m256i mean = _mm256_srai_epi16(biased, 5);
+  // An all-zero shuffle control replicates byte 0 of each 128-bit lane.
+  const __m256i fill = _mm256_shuffle_epi8(mean, _mm256_setzero_si256());
+  row_store_64xh(&fill, 32, dst, stride);
+}
+
+// DC_LEFT prediction, 64x16: every output byte is the rounded mean of the 16
+// bytes of |left|; |above| is ignored.
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  // 16 contributing pixels: (sum + 8) >> 4, computed in 128-bit registers.
+  const __m128i biased =
+      _mm_add_epi16(dc_sum_16_sse2(left), _mm_set1_epi16(8));
+  const __m128i mean = _mm_srai_epi16(biased, 4);
+  // An all-zero shuffle control replicates byte 0 across the register.
+  const __m128i fill128 = _mm_shuffle_epi8(mean, _mm_setzero_si128());
+  // Mirror the 16 bytes into both halves of a 256-bit vector.
+  const __m256i fill =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(fill128), fill128, 1);
+  row_store_64xh(&fill, 16, dst, stride);
+}
+
+// DC_128 prediction, 32x16: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_32xh(&mid_level, 16, dst, stride);
+}
+
+// DC_128 prediction, 32x64: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_32xh(&mid_level, 64, dst, stride);
+}
+
+// DC_128 prediction, 64x64: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_64xh(&mid_level, 64, dst, stride);
+}
+
+// DC_128 prediction, 64x32: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_64xh(&mid_level, 32, dst, stride);
+}
+
+// DC_128 prediction, 64x16: fills the block with the constant byte 0x80.
+// Neither edge is read.
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
+  (void)left;
+  (void)above;
+  row_store_64xh(&mid_level, 16, dst, stride);
+}
+
+// Vertical prediction, 32x16: each of the 16 rows is a copy of the 32 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
+  row_store_32xh(&top_row, 16, dst, stride);
+}
+
+// Vertical prediction, 32x64: each of the 64 rows is a copy of the 32 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
+  row_store_32xh(&top_row, 64, dst, stride);
+}
+
+// Vertical prediction, 64x64: each of the 64 rows is a copy of the 64 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i *const top = (const __m256i *)above;
+  const __m256i top_lo = _mm256_loadu_si256(top);
+  const __m256i top_hi = _mm256_loadu_si256(top + 1);
+  row_store_32x2xh(&top_lo, &top_hi, 64, dst, stride);
+}
+
+// Vertical prediction, 64x32: each of the 32 rows is a copy of the 64 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i *const top = (const __m256i *)above;
+  const __m256i top_lo = _mm256_loadu_si256(top);
+  const __m256i top_hi = _mm256_loadu_si256(top + 1);
+  row_store_32x2xh(&top_lo, &top_hi, 32, dst, stride);
+}
+
+// Vertical prediction, 64x16: each of the 16 rows is a copy of the 64 bytes
+// of |above|; |left| is ignored.
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  const __m256i *const top = (const __m256i *)above;
+  const __m256i top_lo = _mm256_loadu_si256(top);
+  const __m256i top_hi = _mm256_loadu_si256(top + 1);
+  row_store_32x2xh(&top_lo, &top_hi, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
+// Paeth selection for 16 pixels held as 16-bit elements. With
+// base = top + left - topleft, each output element is whichever of left, top
+// and topleft is nearest to base; ties favour left, then top.
+// Returns the 16 selected pixels as 16-bit elements (__m256i).
+static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
+                                 const __m256i *topleft) {
+  const __m256i base =
+      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
+
+  const __m256i dist_left = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
+  const __m256i dist_top = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
+  const __m256i dist_tl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
+
+  // left loses as soon as it is strictly farther than top or than topleft.
+  const __m256i left_loses =
+      _mm256_or_si256(_mm256_cmpgt_epi16(dist_left, dist_top),
+                      _mm256_cmpgt_epi16(dist_left, dist_tl));
+  // Between the other two, topleft wins only when top is strictly farther.
+  const __m256i take_tl = _mm256_cmpgt_epi16(dist_top, dist_tl);
+
+  const __m256i top_or_tl =
+      _mm256_or_si256(_mm256_andnot_si256(take_tl, *top),
+                      _mm256_and_si256(take_tl, *topleft));
+  const __m256i kept_left = _mm256_andnot_si256(left_loses, *left);
+
+  return _mm256_or_si256(_mm256_and_si256(left_loses, top_or_tl), kept_left);
+}
+
+// Runs paeth_pred() on 16 pixels and narrows the 16-bit results to bytes.
+// Returns the 16 8-bit pixels of one row (__m128i).
+static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
+                                      const __m256i *topleft) {
+  const __m256i wide = paeth_pred(left, top, topleft);
+  // Bring the upper 128 bits down so the pack sees pixels 0..7 and 8..15 in
+  // the same lane.
+  const __m256i upper = _mm256_permute4x64_epi64(wide, 0xe);
+  return _mm256_castsi256_si128(_mm256_packus_epi16(wide, upper));
+}
+
+// Loads 16 bytes from |above| with an aligned load (so |above| must be
+// 16-byte aligned) and zero-extends them to sixteen 16-bit elements.
+static INLINE __m256i get_top_vector(const uint8_t *above) {
+  const __m128i top_bytes = _mm_load_si128((const __m128i *)above);
+  // Zero-extension keeps byte order: bytes 0..7 land in the low 128-bit lane
+  // and bytes 8..15 in the high lane.
+  return _mm256_cvtepu8_epi16(top_bytes);
+}
+
+// Paeth prediction, 16 wide x 8 high. Reads above[-1] (top-left pixel),
+// above[0..15] and left[0..7]; each row is written with an aligned 16-byte
+// store, so |dst| rows must be 16-byte aligned.
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
+  const __m256i left_bytes =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(left8), left8, 1);
+  const __m256i top_left = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i top = get_top_vector(above);
+  const __m256i next = _mm256_set1_epi16(1);
+  // Shuffle control 0x80nn per 16-bit element: the low byte picks left[nn],
+  // the high byte (bit 7 set) yields zero, i.e. left[nn] zero-extended.
+  __m256i pick = _mm256_set1_epi16((short)0x8000);
+
+  for (int rows_left = 8; rows_left > 0; --rows_left) {
+    const __m256i left16 = _mm256_shuffle_epi8(left_bytes, pick);
+    const __m128i out = paeth_16x1_pred(&left16, &top, &top_left);
+
+    _mm_store_si128((__m128i *)dst, out);
+    dst += stride;
+    pick = _mm256_add_epi16(pick, next);
+  }
+}
+
+// Load 16 bytes starting at `left` and duplicate them into both 128-bit lanes
+// of the result. An unaligned load is used so the helper does not fault when
+// `left` is not 16-byte aligned; the result is the same for aligned pointers.
+static INLINE __m256i get_left_vector(const uint8_t *left) {
+  const __m128i x = _mm_loadu_si128((const __m128i *)left);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+}
+
+// Paeth predictor for a 16x16 block.
+// Reads above[-1] (top-left), above[0..15] and left[0..15]; writes 16 bytes
+// per row to dst with an aligned 128-bit store.
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i left_dup = get_left_vector(left);
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+  const __m256i top_row = get_top_vector(above);
+  // Per 16-bit lane: low byte picks left[row], high byte 0x80 yields zero.
+  __m256i sel = _mm256_set1_epi16((short)0x8000);
+
+  for (int row = 0; row < 16; ++row) {
+    const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+    const __m128i out = paeth_16x1_pred(&left16, &top_row, &topleft);
+    _mm_store_si128((__m128i *)dst, out);
+
+    sel = _mm256_add_epi16(sel, step);
+    dst += stride;
+  }
+}
+
+// Paeth predictor for a block 16 pixels wide and 32 rows tall.
+// The left column is consumed in two groups of 16 pixels; for each group the
+// shuffle selector is reset so that it walks left[0..15] of that group.
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m256i left_dup = get_left_vector(left);
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+  const __m256i top_row = get_top_vector(above);
+
+  for (int half = 0; half < 2; ++half) {
+    if (half) left_dup = get_left_vector(left + 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+      const __m128i out = paeth_16x1_pred(&left16, &top_row, &topleft);
+      _mm_store_si128((__m128i *)dst, out);
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth predictor for a block 16 pixels wide and 64 rows tall.
+// The left column is consumed in four groups of 16 pixels.
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+  const __m256i top_row = get_top_vector(above);
+
+  for (int group = 0; group < 4; ++group) {
+    const __m256i left_dup = get_left_vector(left + group * 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+      const __m128i out = paeth_16x1_pred(&left16, &top_row, &topleft);
+      _mm_store_si128((__m128i *)dst, out);
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth-predict 32 pixels (two groups of 16) and narrow them to bytes;
+// returns one row of 32 8-bit pixels (__m256i).
+static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
+                                      const __m256i *top1,
+                                      const __m256i *topleft) {
+  // First 16 pixels: after the pack, the low 128 bits hold the 16 bytes.
+  const __m256i lo16 = paeth_pred(left, top0, topleft);
+  const __m256i lo8 =
+      _mm256_packus_epi16(lo16, _mm256_permute4x64_epi64(lo16, 0xe));
+
+  // Second 16 pixels, same treatment.
+  const __m256i hi16 = paeth_pred(left, top1, topleft);
+  const __m256i hi8 =
+      _mm256_packus_epi16(hi16, _mm256_permute4x64_epi64(hi16, 0xe));
+
+  // 0x20: low half of lo8 followed by low half of hi8.
+  return _mm256_permute2x128_si256(lo8, hi8, 0x20);
+}
+
+// Paeth predictor for a block 32 pixels wide and 16 rows tall.
+// Each row is produced as one 256-bit vector and written with an unaligned
+// store.
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i left_dup = get_left_vector(left);
+  const __m256i top_lo = get_top_vector(above);
+  const __m256i top_hi = get_top_vector(above + 16);
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+  // Per 16-bit lane: low byte picks left[row], high byte 0x80 yields zero.
+  __m256i sel = _mm256_set1_epi16((short)0x8000);
+
+  for (int row = 0; row < 16; ++row) {
+    const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+    const __m256i out = paeth_32x1_pred(&left16, &top_lo, &top_hi, &topleft);
+    _mm256_storeu_si256((__m256i *)dst, out);
+
+    sel = _mm256_add_epi16(sel, step);
+    dst += stride;
+  }
+}
+
+// Paeth predictor for a 32x32 block.
+// Each row is produced as two 16-byte halves (aligned 128-bit stores); the
+// left column is consumed in two groups of 16 pixels.
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m256i left_dup = get_left_vector(left);
+  const __m256i top_lo = get_top_vector(above);
+  const __m256i top_hi = get_top_vector(above + 16);
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+
+  for (int half = 0; half < 2; ++half) {
+    if (half) left_dup = get_left_vector(left + 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+
+      const __m128i out_lo = paeth_16x1_pred(&left16, &top_lo, &topleft);
+      const __m128i out_hi = paeth_16x1_pred(&left16, &top_hi, &topleft);
+
+      _mm_store_si128((__m128i *)dst, out_lo);
+      _mm_store_si128((__m128i *)(dst + 16), out_hi);
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth predictor for a block 32 pixels wide and 64 rows tall.
+// Each row is produced as two 16-byte halves (aligned 128-bit stores); the
+// left column is consumed in four groups of 16 pixels.
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i top_lo = get_top_vector(above);
+  const __m256i top_hi = get_top_vector(above + 16);
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+
+  for (int group = 0; group < 4; ++group) {
+    const __m256i left_dup = get_left_vector(left + group * 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+
+      const __m128i out_lo = paeth_16x1_pred(&left16, &top_lo, &topleft);
+      const __m128i out_hi = paeth_16x1_pred(&left16, &top_hi, &topleft);
+
+      _mm_store_si128((__m128i *)dst, out_lo);
+      _mm_store_si128((__m128i *)(dst + 16), out_hi);
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth predictor for a block 64 pixels wide and 32 rows tall.
+// Each row is produced as four 16-byte pieces (aligned 128-bit stores); the
+// left column is consumed in two groups of 16 pixels.
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i top[4] = { get_top_vector(above), get_top_vector(above + 16),
+                           get_top_vector(above + 32),
+                           get_top_vector(above + 48) };
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+
+  for (int group = 0; group < 2; ++group) {
+    const __m256i left_dup = get_left_vector(left + group * 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+      __m128i out[4];
+      for (int col = 0; col < 4; ++col) {
+        out[col] = paeth_16x1_pred(&left16, &top[col], &topleft);
+      }
+      for (int col = 0; col < 4; ++col) {
+        _mm_store_si128((__m128i *)(dst + 16 * col), out[col]);
+      }
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth predictor for a 64x64 block.
+// Each row is produced as four 16-byte pieces (aligned 128-bit stores); the
+// left column is consumed in four groups of 16 pixels.
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i top[4] = { get_top_vector(above), get_top_vector(above + 16),
+                           get_top_vector(above + 32),
+                           get_top_vector(above + 48) };
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+
+  for (int group = 0; group < 4; ++group) {
+    const __m256i left_dup = get_left_vector(left + group * 16);
+    // Per 16-bit lane: low byte picks the left pixel, high byte 0x80 -> zero.
+    __m256i sel = _mm256_set1_epi16((short)0x8000);
+    for (int row = 0; row < 16; ++row) {
+      const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+      __m128i out[4];
+      for (int col = 0; col < 4; ++col) {
+        out[col] = paeth_16x1_pred(&left16, &top[col], &topleft);
+      }
+      for (int col = 0; col < 4; ++col) {
+        _mm_store_si128((__m128i *)(dst + 16 * col), out[col]);
+      }
+
+      sel = _mm256_add_epi16(sel, step);
+      dst += stride;
+    }
+  }
+}
+
+// Paeth predictor for a block 64 pixels wide and 16 rows tall.
+// Each row is produced as four 16-byte pieces (aligned 128-bit stores).
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i top[4] = { get_top_vector(above), get_top_vector(above + 16),
+                           get_top_vector(above + 32),
+                           get_top_vector(above + 48) };
+  const __m256i topleft = _mm256_set1_epi16((int16_t)above[-1]);
+  const __m256i step = _mm256_set1_epi16(1);
+  const __m256i left_dup = get_left_vector(left);
+  // Per 16-bit lane: low byte picks left[row], high byte 0x80 yields zero.
+  __m256i sel = _mm256_set1_epi16((short)0x8000);
+
+  for (int row = 0; row < 16; ++row) {
+    const __m256i left16 = _mm256_shuffle_epi8(left_dup, sel);
+    __m128i out[4];
+    for (int col = 0; col < 4; ++col) {
+      out[col] = paeth_16x1_pred(&left16, &top[col], &topleft);
+    }
+    for (int col = 0; col < 4; ++col) {
+      _mm_store_si128((__m128i *)(dst + 16 * col), out[col]);
+    }
+
+    sel = _mm256_add_epi16(sel, step);
+    dst += stride;
+  }
+}
+
+// Pack four 2-bit selectors (c0 in bits 1:0 ... c3 in bits 7:6) into one
+// 8-bit value; presumably the immediate of a 4 x 64-bit permute.
+// Arguments and the whole expansion are parenthesized so the macro is safe
+// inside larger expressions and with compound arguments.
+#define PERM4x64(c0, c1, c2, c3) \
+  ((c0) + ((c1) << 2) + ((c2) << 4) + ((c3) << 6))
+// Pack two 4-bit selectors (c0 in bits 3:0, c1 in bits 7:4) into one 8-bit
+// value; presumably the immediate of a 2 x 128-bit permute.
+#define PERM2x128(c0, c1) ((c0) + ((c1) << 4))
+
+// Zone-1 directional prediction for a 4-wide block of N rows, computed with
+// 16-bit lanes (the wrapper in this file selects this variant when bd < 12).
+// For row r, x = (r + 1) * dx; base = x >> frac_bits indexes into `above`, and
+// the low bits of x give the interpolation weight `shift` (0..31).
+// Lanes whose source index is >= max_base_x take above[max_base_x] instead.
+// One __m128i is written to dst[r] per row; per the in-code comments only the
+// first 4 values of each row are meaningful.
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+ __m128i a0_128, a1_128;
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ // NOTE(review): HighbdEvenOddMaskx4 is defined outside this view;
+ // presumably it gathers even-indexed samples into the low 64 bits and
+ // odd-indexed samples into the high 64 bits. a1 is then taken from the
+ // high 64 bits of the shuffled vector (the a1 load above is overwritten).
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
+ a1_128 = _mm_srli_si128(a0_128, 8);
+
+ base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
+ _mm256_set1_epi16(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
+ base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_128);
+ a1 = _mm256_castsi128_si256(a1_128);
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res1 = _mm256_castsi256_si128(res);
+
+ // Keep the interpolated value only where the source index is below
+ // max_base_x; other lanes get above[max_base_x].
+ mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for a 4-wide block of N rows, computed with
+// 32-bit lanes (the wrapper in this file selects this variant when bd >= 12).
+// For row r, x = (r + 1) * dx; base = x >> frac_bits indexes into `above`, and
+// the low bits of x give the interpolation weight `shift` (0..31).
+// Lanes whose source index is >= max_base_x take above[max_base_x] instead.
+// One __m128i of 16-bit results is written to dst[r] per row.
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ // Widen 8 consecutive 16-bit samples to 32 bits.
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ // Reorder to elements 0,2,4,6 (low 128 bits) and 1,3,5,7 (high 128
+ // bits); a1 then becomes the odd-indexed samples.
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+ base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ // Narrow the low four 32-bit results back to 16 bits.
+ res1 = _mm256_castsi256_si128(res);
+ res1 = _mm_packus_epi32(res1, res1);
+
+ mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
+ mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+// Zone-1 prediction for a 4-wide, N-row high-bitdepth block.
+// Rows are computed into a local buffer by the 16-bit kernel (bd < 12) or the
+// 32-bit kernel (otherwise), then the low 64 bits (4 pixels) of each row are
+// stored to dst.
+static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
+                                             ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             int upsample_above, int dx,
+                                             int bd) {
+  __m128i rows[16];
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, rows, above,
+                                                    upsample_above, dx);
+  } else {
+    highbd_dr_prediction_z1_4xN_internal_avx2(N, rows, above, upsample_above,
+                                              dx);
+  }
+
+  for (int r = 0; r < N; ++r) {
+    uint16_t *const out = dst + stride * r;
+    _mm_storel_epi64((__m128i *)out, rows[r]);
+  }
+}
+
+// Zone-1 directional prediction for an 8-wide block of N rows, computed with
+// 32-bit lanes (the wrapper in this file selects this variant when bd >= 12).
+// For row r, x = (r + 1) * dx; base = x >> frac_bits indexes into `above`, and
+// the low bits of x give the interpolation weight `shift` (0..31).
+// Lanes whose source index is >= max_base_x take above[max_base_x] instead.
+// One __m128i holding 8 16-bit results is written to dst[r] per row.
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a0_1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ // Widen 8 consecutive 16-bit samples to 32 bits.
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ // Split samples base..base+7 into even-indexed (low 128 bits) and
+ // odd-indexed (high 128 bits) elements.
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+
+ // Same split for samples base+8..base+15.
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a0_1 = _mm256_permutevar8x32_epi32(
+ a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
+
+ // a0 = the 8 even-indexed samples, a1 = the 8 odd-indexed samples.
+ a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
+ a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
+ base_inc256 =
+ _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ // Narrow the eight 32-bit results to 16 bits in the low 128 bits.
+ res1 = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
+ mask256 = _mm256_packs_epi32(
+ mask256, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(mask256, 1))); // goto 16 bit
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for an 8-wide block of N rows, computed with
+// 16-bit lanes (the wrapper in this file selects this variant when bd < 12).
+// For row r, x = (r + 1) * dx; base = x >> frac_bits indexes into `above`, and
+// the low bits of x give the interpolation weight `shift` (0..31).
+// Lanes whose source index is >= max_base_x take above[max_base_x] instead.
+// Only the low 128 bits of the 256-bit intermediates carry data; one __m128i
+// of 8 results is written to dst[r] per row.
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
+ if (upsample_above) {
+ // NOTE(review): HighbdEvenOddMaskx is defined outside this view;
+ // presumably its first 16 bytes select the even-indexed samples and the
+ // next 16 bytes the odd-indexed samples out of the 16 samples held in
+ // a0_x128:a1_x128. Each shuffle is done on both source vectors and the
+ // two results are blended where the table's byte index is > 15.
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
+ atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp2 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ atmp3 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ mask =
+ _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
+ base + 8, base + 10, base + 12, base + 14,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7, 0,
+ 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_x128);
+ a1 = _mm256_castsi128_si256(a1_x128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ // Keep the interpolated value only where the source index is below
+ // max_base_x; only the low 128 bits are stored.
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+// Zone-1 prediction for an 8-wide, N-row high-bitdepth block.
+// Rows are computed into a local buffer by the 16-bit kernel (bd < 12) or the
+// 32-bit kernel (otherwise), then each 128-bit row (8 pixels) is stored to
+// dst with an unaligned store.
+static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
+                                             ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             int upsample_above, int dx,
+                                             int bd) {
+  __m128i rows[32];
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, rows, above,
+                                                    upsample_above, dx);
+  } else {
+    highbd_dr_prediction_z1_8xN_internal_avx2(N, rows, above, upsample_above,
+                                              dx);
+  }
+
+  for (int r = 0; r < N; ++r) {
+    uint16_t *const out = dst + stride * r;
+    _mm_storeu_si128((__m128i *)out, rows[r]);
+  }
+}
+
+// Zone-1 directional prediction for a 16-wide block of N rows, computed with
+// 32-bit lanes (the wrapper in this file selects this variant when bd >= 12).
+// upsample_above is ignored. For row r, x = (r + 1) * dx; base = x >> 6
+// indexes into `above` and ((x & 0x3f) >> 1) is the interpolation weight.
+// Each row is built from two groups of 8 pixels; the second group is only
+// interpolated when more than 8 valid source positions remain. Lanes whose
+// source index is >= max_base_x take above[max_base_x] instead.
+// One __m256i of 16 results is written to dstvec[r] per row.
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ // First 8 pixels of the row, widened to 32 bits.
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+
+ // Second 8 pixels: skipped (filled with above[max_base_x]) when at most 8
+ // valid source positions remain.
+ int mdif = max_base_x - base;
+ if (mdif > 8) {
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a1_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for a 16-wide block of N rows, computed with
+// 16-bit lanes (the wrapper in this file selects this variant when bd < 12).
+// upsample_above is ignored. For row r, x = (r + 1) * dx; base = x >> 6
+// indexes into `above` and ((x & 0x3f) >> 1) is the interpolation weight.
+// Lanes whose source index is >= max_base_x take above[max_base_x] instead.
+// One __m256i of 16 results is written to dstvec[r] per row.
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ // 16 consecutive samples and their right-hand neighbours.
+ a0 = _mm256_loadu_si256((__m256i *)(above + base));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ // Keep the interpolated value only where the source index is below
+ // max_base_x.
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ x += dx;
+ }
+}
+
+// Zone-1 prediction for a 16-wide, N-row high-bitdepth block.
+// Rows are computed into a local buffer by the 16-bit kernel (bd < 12) or the
+// 32-bit kernel (otherwise), then each 256-bit row (16 pixels) is stored to
+// dst with an unaligned store.
+static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
+                                              ptrdiff_t stride,
+                                              const uint16_t *above,
+                                              int upsample_above, int dx,
+                                              int bd) {
+  __m256i rows[64];
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, rows, above,
+                                                     upsample_above, dx);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(N, rows, above, upsample_above,
+                                               dx);
+  }
+
+  for (int r = 0; r < N; ++r) {
+    uint16_t *const out = dst + stride * r;
+    _mm256_storeu_si256((__m256i *)out, rows[r]);
+  }
+}
+
+// Zone-1 directional prediction for a 32-wide block of N rows, computed with
+// 32-bit lanes (the wrapper in this file selects this variant when bd >= 12).
+// upsample_above is ignored. For row r, x = (r + 1) * dx; base = x >> 6
+// indexes into `above` and ((x & 0x3f) >> 1) is the interpolation weight.
+// Each row is produced as two 16-pixel halves: the left half goes to
+// dstvec[r], the right half to dstvec[r + N], so dstvec must hold 2 * N
+// vectors. Lanes whose source index is >= max_base_x take above[max_base_x].
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
+    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+  (void)upsample_above;
+  const int frac_bits = 6;
+  const int max_base_x = ((32 + N) - 1);
+
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
+  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+  a16 = _mm256_set1_epi32(16);
+  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+  max_base_x256 = _mm256_set1_epi16(max_base_x);
+  // The fraction mask is applied to 32-bit lanes of x, so it is built with
+  // 32-bit lanes as well (matching the other 32-bit kernels in this file).
+  c3f = _mm256_set1_epi32(0x3f);
+
+  int x = dx;
+  for (int r = 0; r < N; r++) {
+    __m256i b, res[2], res1;
+
+    int base = x >> frac_bits;
+    // Once base passes the end of the edge, this and all remaining rows are
+    // filled with above[max_base_x].
+    if (base >= max_base_x) {
+      for (int i = r; i < N; ++i) {
+        dstvec[i] = a_mbase_x;  // save 32 values
+        dstvec[i + N] = a_mbase_x;
+      }
+      return;
+    }
+
+    __m256i shift =
+        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+    // j = 0: left 16 pixels of the row, j = 16: right 16 pixels.
+    for (int j = 0; j < 32; j += 16) {
+      int mdif = max_base_x - (base + j);
+      if (mdif <= 0) {
+        res1 = a_mbase_x;
+      } else {
+        a0 = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128((__m128i *)(above + base + j)));
+        a1 = _mm256_cvtepu16_epi32(
+            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+
+        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
+        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
+        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
+        b = _mm256_mullo_epi32(diff, shift);
+
+        res[0] = _mm256_add_epi32(a32, b);
+        res[0] = _mm256_srli_epi32(res[0], 5);
+        res[0] = _mm256_packus_epi32(
+            res[0],
+            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+        // The upper 8 pixels of this half are only interpolated when more
+        // than 8 valid source positions remain.
+        if (mdif > 8) {
+          a0_1 = _mm256_cvtepu16_epi32(
+              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
+          a1_1 = _mm256_cvtepu16_epi32(
+              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
+
+          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
+          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
+          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
+          b = _mm256_mullo_epi32(diff, shift);
+
+          res[1] = _mm256_add_epi32(a32, b);
+          res[1] = _mm256_srli_epi32(res[1], 5);
+          res[1] = _mm256_packus_epi32(
+              res[1],
+              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+        } else {
+          res[1] = a_mbase_x;
+        }
+        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+                                       1);  // 16 16bit values
+        base_inc256 = _mm256_setr_epi16(
+            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+            base + j + 13, base + j + 14, base + j + 15);
+
+        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+      }
+      if (!j) {
+        dstvec[r] = res1;
+      } else {
+        dstvec[r + N] = res1;
+      }
+    }
+    x += dx;
+  }
+}
+
+// Zone-1 directional prediction for a 32-wide block of N rows, computed with
+// 16-bit lanes (the wrapper in this file selects this variant when bd < 12).
+// upsample_above is ignored. For row r, x = (r + 1) * dx; base = x >> 6
+// indexes into `above` and ((x & 0x3f) >> 1) is the interpolation weight.
+// Each row is produced as two 16-pixel halves: the left half goes to
+// dstvec[r], the right half to dstvec[r + N], so dstvec must hold 2 * N
+// vectors. Lanes whose source index is >= max_base_x take above[max_base_x].
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ // Once base passes the end of the edge, this and all remaining rows are
+ // filled with above[max_base_x].
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec[i + N] = a_mbase_x;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ // j = 0: left 16 pixels of the row, j = 16: right 16 pixels.
+ for (int j = 0; j < 32; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ res = a_mbase_x;
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ // Keep the interpolated value only where the source index is below
+ // max_base_x.
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ }
+ if (!j) {
+ dstvec[r] = res;
+ } else {
+ dstvec[r + N] = res;
+ }
+ }
+ x += dx;
+ }
+}
+
+// Zone-1 prediction for a 32-wide, N-row high-bitdepth block.
+// Rows are computed into a local buffer by the 16-bit kernel (bd < 12) or the
+// 32-bit kernel (otherwise). The buffer holds the left 16 pixels of row r at
+// index r and the right 16 pixels at index r + N; both halves are stored to
+// dst with unaligned stores.
+static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
+                                              ptrdiff_t stride,
+                                              const uint16_t *above,
+                                              int upsample_above, int dx,
+                                              int bd) {
+  __m256i rows[128];
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, rows, above,
+                                                     upsample_above, dx);
+  } else {
+    highbd_dr_prediction_z1_32xN_internal_avx2(N, rows, above, upsample_above,
+                                               dx);
+  }
+
+  const __m256i *const right_half = rows + N;
+  for (int r = 0; r < N; ++r) {
+    uint16_t *const out = dst + stride * r;
+    _mm256_storeu_si256((__m256i *)out, rows[r]);
+    _mm256_storeu_si256((__m256i *)(out + 16), right_half[r]);
+  }
+}
+/* Zone-1 directional predictor for a 64-wide block with N rows, using 32-bit
+ * intermediate lanes. av1_highbd_dr_prediction_z1_avx2() picks this variant
+ * when bd >= 12.
+ *
+ * For row r the position is x = dx * (r + 1) with 6 fractional bits:
+ *   base  = x >> 6
+ *   shift = (x & 0x3f) >> 1
+ *   dst[c] = (above[base + c] * 32 + 16 +
+ *             (above[base + c + 1] - above[base + c]) * shift) >> 5
+ * Columns with base + c >= max_base_x are filled with above[max_base_x]. Once
+ * base itself reaches max_base_x, every remaining row is filled with that
+ * value and the function returns.
+ *
+ * N:              number of rows to produce.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above:          reference row; above[max_base_x] (max_base_x = 64 + N - 1)
+ *                 is read as the padding pixel, and the 8-element vector
+ *                 loads can read a few elements past that index (those lanes
+ *                 are replaced by the padding pixel before the store).
+ * upsample_above: ignored.
+ * dx:             per-row increment of x.
+ */
+static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // four stores fill the 64-pixel row
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) { // the whole 16-pixel group is at or past max_base_x
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu16_epi32(a0_128);
+ a1 = _mm256_cvtepu16_epi32(a1_128);
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); // low 128 bits = 8 packed 16-bit results
+ if (mdif > 8) { // pixels 8..15 of the group still need interpolation
+ a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
+ a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
+ a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
+ a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); // set where index < max_base_x
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); // other lanes take the padding pixel
+ _mm256_storeu_si256((__m256i *)(dst + j), res1);
+ }
+ }
+ x += dx;
+ }
+}
+/* Zone-1 directional predictor for a 64-wide block with N rows, computed
+ * entirely in 16-bit lanes. av1_highbd_dr_prediction_z1_avx2() picks this
+ * variant when bd < 12.
+ *
+ * For row r the position is x = dx * (r + 1) with 6 fractional bits:
+ *   base  = x >> 6
+ *   shift = (x & 0x3f) >> 1
+ *   dst[c] = (above[base + c] * 32 + 16 +
+ *             (above[base + c + 1] - above[base + c]) * shift) >> 5
+ * Columns with base + c >= max_base_x are filled with above[max_base_x]. Once
+ * base itself reaches max_base_x, every remaining row is filled with that
+ * value and the function returns.
+ *
+ * N:              number of rows to produce.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above:          reference row; above[max_base_x] (max_base_x = 64 + N - 1)
+ *                 is read as the padding pixel, and the 16-element vector
+ *                 loads can read past that index (those lanes are replaced
+ *                 by the padding pixel before the store).
+ * upsample_above: ignored.
+ * dx:             per-row increment of x.
+ */
+static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // four stores fill the 64-pixel row
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); // (x & 0x3f) >> 1 in every lane
+
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) { // the whole 16-pixel group is at or past max_base_x
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); // set where index < max_base_x
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256); // other lanes take the padding pixel
+ _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+//
+// Entry point that forwards to the width-specific zone-1 kernel chosen by bw
+// (4, 8, 16, 32 or 64); any other width leaves dst untouched. `left` and `dy`
+// are not used. For bw == 64 the 16-bit or 32-bit kernel is selected here from
+// bd; the narrower kernels receive bd and decide for themselves.
+void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+
+  if (bw == 4) {
+    highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
+                                     dx, bd);
+  } else if (bw == 8) {
+    highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
+                                     dx, bd);
+  } else if (bw == 16) {
+    highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
+                                      dx, bd);
+  } else if (bw == 32) {
+    highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
+                                      dx, bd);
+  } else if (bw == 64) {
+    if (bd >= 12) {
+      highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+                                              upsample_above, dx);
+    } else {
+      highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+                                        upsample_above, dx);
+    }
+  }
+}
+
+// Transposes one 16x16 tile of uint16_t values: reads 16 rows of 16 elements
+// from src (row pitch pitchSrc, in elements), hands them to
+// highbd_transpose16x16_avx2(), and writes the 16 resulting rows to dst (row
+// pitch pitchDst, in elements). Loads and stores are unaligned.
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+                                      uint16_t *dst, ptrdiff_t pitchDst) {
+  __m256i rows_in[16];
+  __m256i rows_out[16];
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    const uint16_t *const in = src + row * pitchSrc;
+    rows_in[row] = _mm256_loadu_si256((__m256i *)in);
+  }
+
+  highbd_transpose16x16_avx2(rows_in, rows_out);
+
+  for (row = 0; row < 16; ++row) {
+    uint16_t *const out = dst + row * pitchDst;
+    _mm256_storeu_si256((__m256i *)out, rows_out[row]);
+  }
+}
+
+// Transposes a uint16_t image in 16x16 tiles. The source is walked as `width`
+// rows by `height` columns and the destination as `height` rows by `width`
+// columns; both loops advance in steps of 16, so a partial tile is still
+// processed as a full 16x16 tile.
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+                             uint16_t *dst, ptrdiff_t pitchDst, int width,
+                             int height) {
+  for (int dst_row = 0; dst_row < height; dst_row += 16) {
+    for (int dst_col = 0; dst_col < width; dst_col += 16) {
+      const uint16_t *tile_in = src + dst_col * pitchSrc + dst_row;
+      uint16_t *tile_out = dst + dst_row * pitchDst + dst_col;
+      highbd_transpose_TX_16X16(tile_in, pitchSrc, tile_out, pitchDst);
+    }
+  }
+}
+/* Zone-2 directional predictor for a 4-wide block with N rows, using 32-bit
+ * lanes. Every output pixel is interpolated from a pair of neighbouring edge
+ * pixels p[i], p[i + 1] taken from above[] or from left[]:
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ *
+ * Per row r (y = r + 1):
+ *  - base_x = (-y * dx) >> frac_bits_x.
+ *  - base_shift is added to the load address when base_x lies left of
+ *    min_base_x - 1; the loaded lanes are then rearranged through
+ *    HighbdLoadMaskx[base_shift] (or HighbdEvenOddMaskx4[base_shift] when
+ *    upsample_above is set). Both tables are defined outside this block.
+ *  - When base_shift > 3 nothing is loaded from above[]; the x operands are
+ *    zero.
+ *  - The left[] path runs only when base_x < min_base_x. For columns
+ *    c = 1..4 it forms y_c = (r << 6) - c * dy and base_y = y_c >> frac_bits_y;
+ *    lanes with base_y < min_base_y are forced to index 0. left[base_y] and
+ *    left[base_y + 1] are gathered with scalar reads.
+ *  - The above[] operands sit in the low 128 bits and the left[] operands in
+ *    the high 128 bits of the same registers, so one arithmetic pass covers
+ *    both; the two halves are then packed to 16 bits and blended with
+ *    HighbdBaseMask[base_min_diff] (base_min_diff clamped to 0..4).
+ *    NOTE(review): HighbdBaseMask is defined elsewhere; presumably entry k
+ *    selects the left-based result for the first k pixels -- confirm.
+ *
+ * N:              number of rows; 4 pixels are stored per row.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above, left:    edge pixel arrays; negative indices are read.
+ * upsample_above, upsample_left: used as shift counts for the fractional
+ *                 precision and the minimum base index.
+ * dx, dy:         position steps; dx must be > 0 (asserted).
+ */
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm_set1_epi32(0x3f);
+ min_base_y128 = _mm_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) { // x-path operands are zeroed instead of loaded
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8); // upper 8 bytes of the shuffled vector
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(
+ _mm_slli_epi32(
+ _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2); // same vector moved down by one 16-bit element
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int, base_y_c[4]);
+ r6 = _mm_set1_epi32(r << 6);
+ dy128 = _mm_set1_epi32(dy);
+ c1234 = _mm_setr_epi32(1, 2, 3, 4);
+ y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+ base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); // lanes below min_base_y become index 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]]);
+ a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi32(
+ _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); // left-edge operands go in the high 128 bits
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resx = _mm_packus_epi32(resx, resx);
+
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi32(resy, resy);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy); // writes the low 64 bits: four 16-bit pixels
+ dst += stride;
+ }
+}
+/* Zone-2 directional predictor for a 4-wide block with N rows, computed in
+ * 16-bit lanes. Every output pixel is interpolated from a pair of
+ * neighbouring edge pixels p[i], p[i + 1] taken from above[] or from left[]:
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ *
+ * Per row r (y = r + 1):
+ *  - base_x = (-y * dx) >> frac_bits_x.
+ *  - base_shift is added to the load address when base_x lies left of
+ *    min_base_x - 1; the loaded lanes are then rearranged through
+ *    HighbdLoadMaskx[base_shift] (or HighbdEvenOddMaskx4[base_shift] when
+ *    upsample_above is set). Both tables are defined outside this block.
+ *  - When base_shift > 3 nothing is loaded from above[]; the x operands are
+ *    zero.
+ *  - The left[] path runs only when base_x < min_base_x. For columns
+ *    c = 1..4 it forms y_c = (r << 6) - c * dy and base_y = y_c >> frac_bits_y
+ *    in 16-bit lanes; lanes with base_y < min_base_y are forced to index 0.
+ *    left[base_y] and left[base_y + 1] are gathered with scalar reads.
+ *  - The above[] operands sit in the low 128 bits and the left[] operands in
+ *    the high 128 bits of the same registers, so one arithmetic pass covers
+ *    both; the halves are blended with HighbdBaseMask[base_min_diff]
+ *    (base_min_diff clamped to 0..4).
+ *    NOTE(review): HighbdBaseMask is defined elsewhere; presumably entry k
+ *    selects the left-based result for the first k pixels -- confirm.
+ *
+ * N:              number of rows; 4 pixels are stored per row.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above, left:    edge pixel arrays; negative indices are read.
+ * upsample_above, upsample_left: used as shift counts for the fractional
+ *                 precision and the minimum base index.
+ * dx, dy:         position steps; dx must be > 0 (asserted).
+ */
+static void highbd_dr_prediction_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) { // x-path operands are zeroed instead of loaded
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8); // upper 8 bytes of the shuffled vector
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2); // same vector moved down by one 16-bit element
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); // lanes below min_base_y become index 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
+ 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); // left-edge operands go in the high 128 bits
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy); // writes the low 64 bits: four 16-bit pixels
+ dst += stride;
+ }
+}
+/* Zone-2 directional predictor for an 8-wide block with N rows, using 32-bit
+ * lanes. Every output pixel is interpolated from a pair of neighbouring edge
+ * pixels p[i], p[i + 1] taken from above[] or from left[]:
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ *
+ * Unlike the 4-wide variants, the above[] result (resx) and the left[] result
+ * (resy) are computed in two separate passes of eight 32-bit lanes each and
+ * packed to 16 bits before being blended.
+ *
+ * Per row r (y = r + 1):
+ *  - base_x = (-y * dx) >> frac_bits_x.
+ *  - base_shift is added to the load address when base_x lies left of
+ *    min_base_x - 1. When base_shift > 7 the above[] pass is skipped and
+ *    resx is zero.
+ *  - With upsample_above set, two 8-element loads are combined through
+ *    HighbdEvenOddMaskx[base_shift]: shuffle indices greater than 15 take
+ *    their byte from the second load. The second 16 bytes of the table entry
+ *    produce the p[i + 1] operand. Without upsampling, the two operands are
+ *    loads one element apart, rearranged through HighbdLoadMaskx[base_shift].
+ *    Both tables are defined outside this block.
+ *  - The left[] pass runs only when base_x < min_base_x. For columns c = 1..8
+ *    it forms y_c = (r << 6) - c * dy and base_y = y_c >> frac_bits_y; lanes
+ *    with base_y < min_base_y are forced to index 0. Otherwise resy = resx.
+ *  - resx and resy are blended with HighbdBaseMask[base_min_diff]
+ *    (base_min_diff clamped to 0..8).
+ *    NOTE(review): HighbdBaseMask is defined elsewhere; presumably entry k
+ *    selects the left-based result for the first k pixels -- confirm.
+ *
+ * N:              number of rows; 8 pixels are stored per row.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above, left:    edge pixel arrays; negative indices are read.
+ * upsample_above, upsample_left: used as shift counts for the fractional
+ *                 precision and the minimum base index.
+ * dx, dy:         position steps.
+ */
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
+ __m256i diff;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm256_set1_epi32(0x3f);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) { // above[] pass skipped for this row
+ resx = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15)); // indices > 15 refer to the second load
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, (4 << 6) - y * dx,
+ (5 << 6) - y * dx, (6 << 6) - y * dx,
+ (7 << 6) - y * dx),
+ c3f),
+ 1);
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); // 8 packed 16-bit results
+ }
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int, base_y_c[8]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ dy256 = _mm256_set1_epi32(dy);
+ c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); // lanes below min_base_y become index 0
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ if (upsample_left) {
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+ }
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = resx; // left[] pass not run; the blend then yields resx regardless of the mask
+ }
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+/* Zone-2 directional predictor for an 8-wide block with N rows, computed in
+ * 16-bit lanes. Every output pixel is interpolated from a pair of
+ * neighbouring edge pixels p[i], p[i + 1] taken from above[] or from left[]:
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ *
+ * The eight above[] operands occupy the low 128 bits and the eight left[]
+ * operands the high 128 bits of the same 256-bit registers, so one arithmetic
+ * pass produces both candidate results.
+ *
+ * Per row r (y = r + 1):
+ *  - base_x = (-y * dx) >> frac_bits_x.
+ *  - base_shift is added to the load address when base_x lies left of
+ *    min_base_x - 1. When base_shift > 7 nothing is loaded from above[]; the
+ *    x operands are zero.
+ *  - With upsample_above set, two 8-element loads are combined through
+ *    HighbdEvenOddMaskx[base_shift]: shuffle indices greater than 15 take
+ *    their byte from the second load. The second 16 bytes of the table entry
+ *    produce the p[i + 1] operand. Without upsampling, the two operands are
+ *    loads one element apart, rearranged through HighbdLoadMaskx[base_shift].
+ *    Both tables are defined outside this block.
+ *  - The left[] path runs only when base_x < min_base_x. For columns c = 1..8
+ *    it forms y_c = (r << 6) - c * dy and base_y = y_c >> frac_bits_y in
+ *    16-bit lanes; lanes with base_y < min_base_y are forced to index 0.
+ *  - The two halves are blended with HighbdBaseMask[base_min_diff]
+ *    (base_min_diff clamped to 0..8).
+ *    NOTE(review): HighbdBaseMask is defined elsewhere; presumably entry k
+ *    selects the left-based result for the first k pixels -- confirm.
+ *
+ * N:              number of rows; 8 pixels are stored per row.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above, left:    edge pixel arrays; negative indices are read.
+ * upsample_above, upsample_left: used as shift counts for the fractional
+ *                 precision and the minimum base index.
+ * dx, dy:         position steps.
+ */
+static void highbd_dr_prediction_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i c3f, min_base_y128;
+ __m256i a0_x, a1_x, diff, a32, a16;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) { // x-path operands are zeroed instead of loaded
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15)); // indices > 15 refer to the second load
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); // lanes below min_base_y become index 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+ left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); // left-edge operands go in the high 128 bits
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+/* Zone-2 directional predictor for a block of H rows by W columns, using
+ * 32-bit lanes. Every output pixel is interpolated from a pair of
+ * neighbouring edge pixels p[i], p[i + 1] taken from above[] or from left[]:
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ *
+ * Columns are processed 16 at a time (j advances by 16). Each group of 16 is
+ * computed as two halves of eight 32-bit lanes, packed to 16 bits and joined
+ * into one 256-bit vector, once for the above[] candidate (resx) and once for
+ * the left[] candidate (resy).
+ *
+ * Per row r (y = r + 1) and column group j:
+ *  - base_x = ((j << 6) - y * dx) >> 6.
+ *  - base_shift / base_shift8 are added to the load address of the first /
+ *    second half when its base index lies left of min_base_x - 1, and select
+ *    the HighbdLoadMaskx entry used to rearrange the loaded lanes (table
+ *    defined outside this block). A half whose shift exceeds 7 is not loaded
+ *    and its resx half is zero.
+ *  - The left[] pass runs only when base_x < min_base_x. For columns
+ *    c = j + 1 .. j + 16 it forms y_c = (r << 6) - c * dy and base_y = y_c >> 6;
+ *    lanes with base_y < min_base_y are forced to index 0. Otherwise resy is
+ *    zero.
+ *  - resx and resy are blended with a 256-bit load of
+ *    HighbdBaseMask[base_min_diff] (base_min_diff clamped to 0..16).
+ *    NOTE(review): HighbdBaseMask is defined elsewhere; presumably entry k
+ *    selects the left-based result for the first k pixels -- confirm.
+ *
+ * H, W:           block height and width; 16 pixels are stored per j step.
+ * dst, stride:    output block and its row pitch in uint16_t elements.
+ * above, left:    edge pixel arrays; negative indices are read.
+ * upsample_above, upsample_left: ignored.
+ * dx, dy:         position steps.
+ */
+static void highbd_dr_prediction_32bit_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
+ __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ DECLARE_ALIGNED(32, int, base_y_c[16]);
+
+ a16 = _mm256_set1_epi32(16);
+ c1 = _mm256_srli_epi32(a16, 4); // 16 >> 4 = 1 in every lane
+ c8 = _mm256_srli_epi32(a16, 1); // 16 >> 1 = 8 in every lane
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+ c3f = _mm256_set1_epi32(0x3f);
+ dy256 = _mm256_set1_epi32(dy);
+ c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ c1234 = _mm256_add_epi32(c0123, c1); // 1..8
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, ydx;
+ __m256i resx[2], resy[2];
+ __m256i resxy, j256, r6;
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi32(j);
+ int y = r + 1;
+ ydx = _mm256_set1_epi32(y * dx);
+
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1);
+ }
+ int base_min_diff = (min_base_x - base_x);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) { // first half: above[] pass skipped
+ resx[0] = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); // low 128 bits = 8 packed 16-bit results
+ }
+ int base_shift8 = 0;
+ if ((base_x + 8) < (min_base_x - 1)) {
+ base_shift8 = (min_base_x - (base_x + 8) - 1);
+ }
+ if (base_shift8 > 7) { // second half: above[] pass skipped
+ resx[1] = _mm256_setzero_si256();
+ } else {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+
+ a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
+ a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
+
+ r6 = _mm256_slli_epi32(
+ _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ resx[1] = _mm256_add_epi32(a32, b);
+ resx[1] = _mm256_srli_epi32(resx[1], 5);
+ resx[1] = _mm256_packus_epi32(
+ resx[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
+ }
+ resx[0] =
+ _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
+ 1); // 16 16bit values
+
+ // y calc
+ resy[0] = _mm256_setzero_si256();
+ if ((base_x < min_base_x)) {
+ __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ c256 = _mm256_add_epi32(j256, c1234); // columns j+1 .. j+8
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); // lanes below min_base_y become index 0
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+ c256 = _mm256_add_epi32(c256, c8); // columns j+9 .. j+16
+ y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
+ left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]));
+ a1_y = _mm256_cvtepu16_epi32(
+ _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
+ left[base_y_c[10] + 1], left[base_y_c[11] + 1],
+ left[base_y_c[12] + 1], left[base_y_c[13] + 1],
+ left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[1] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ resy[0] =
+ _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
+ 1); // 16 16bit values
+ }
+
+ resxy = _mm256_blendv_epi8(resx[0], resy[0],
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_HxW_avx2( // zone-2 kernel using 16-bit lane arithmetic; writes H rows of W pixels to dst, 16 pixels per inner step
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1; // lowest index read from above[]
+ const int min_base_y = -1; // lowest index read from left[]
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // Each output pixel is a 2-tap interpolation between neighbouring edge
+ // samples; the intermediate terms computed per lane are:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16, c3f, c1;
+ __m256i diff, min_base_y256, dy256, c1234, c0123;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]); // scratch for the per-lane left[] indices (aligned store below)
+
+ a16 = _mm256_set1_epi16(16); // rounding term
+ c1 = _mm256_srli_epi16(a16, 4); // 16 >> 4 == 1 in every lane
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f); // mask for the 6 fractional bits
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1); // 1..16
+
+ for (int r = 0; r < H; r++) { // one output row per iteration
+ __m256i b, res, shift;
+ __m256i resx, resy, ydx;
+ __m256i resxy, j256, r6;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((short)(y * dx)); // y * dx truncated to 16 bits
+
+ for (int j = 0; j < W; j += 16) { // 16 columns per iteration
+ j256 = _mm256_set1_epi16(j);
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x; // above[] index for column j
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x)-1); // how far base_x lies below min_base_x - 1
+ }
+ int base_min_diff = (min_base_x - base_x); // clamped to [0, 16] below; selects the blend mask row
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 8) { // low 8 lanes: load starts at base_x + base_shift
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); // HighbdLoadMaskx is defined elsewhere; presumably re-positions the loaded samples by base_shift lanes -- TODO confirm
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ } else { // no load is done for the low half in this case
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ }
+
+ int base_shift1 = 0;
+ if (base_shift > 8) {
+ base_shift1 = base_shift - 8;
+ }
+ if (base_shift1 < 8) { // high 8 lanes: same scheme, offset by 8 samples
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
+ }
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); // (j + lane) << 6
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); // per-lane weight: (((j + lane) << 6) - y * dx) & 0x3f, halved
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
+
+ // y calc
+ resy = _mm256_setzero_si256();
+ __m256i a0_y, a1_y, shifty;
+ if ((base_x < min_base_x)) { // left-edge result is only computed when base_x is below min_base_x
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234); // column number + 1 per lane
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1)); // unsigned clamp of (col + 1) * dy to 0x7fff (0xffff >> 1)
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); // arithmetic shift keeps the sign of negative positions
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); // lanes whose index is below min_base_y
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); // those lanes get index 0 instead
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16( // scalar gather of left[base_y] for the 16 lanes
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1); // advance every index by one
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16( // scalar gather of left[base_y + 1]
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); // per-lane weight for the left edge
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ resy = _mm256_srli_epi16(res, 5);
+ }
+
+ resxy = _mm256_blendv_epi8(resx, resy,
+ *(__m256i *)HighbdBaseMask[base_min_diff]); // takes resy where the mask byte has its top bit set, resx elsewhere; HighbdBaseMask is defined elsewhere
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+//
+// Picks the kernel by block width (4, 8, or the generic HxW path for every
+// other width) and by bit depth: bd < 12 uses the plain kernels, any other
+// bd uses the `_32bit_` kernels. Both dx and dy must be positive.
+void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int upsample_left, int dx, int dy,
+                                      int bd) {
+  // bd selects the kernel in every case below, so it must not be marked as
+  // unused.
+  assert(dx > 0);
+  assert(dy > 0);
+  switch (bw) {
+    case 4:
+      if (bd < 12) {
+        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
+      break;
+    case 8:
+      if (bd < 12) {
+        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
+      break;
+    default:
+      if (bd < 12) {
+        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                                         upsample_above, upsample_left, dx, dy);
+      } else {
+        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                                               upsample_above, upsample_left,
+                                               dx, dy);
+      }
+      break;
+  }
+}
+
+// Directional prediction, zone 3 functions
+
+// 4x4 block: runs the zone-1 4xN kernel over `left` (the `_32bit_` variant
+// when bd >= 12), transposes the four result vectors and stores the low
+// 64 bits of each one as an output row.
+static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *left,
+                                             int upsample_left, int dy,
+                                             int bd) {
+  __m128i cols[4], rows[4];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_4xN_internal_avx2(4, cols, left, upsample_left,
+                                              dy);
+  }
+
+  highbd_transpose4x8_8x4_low_sse2(&cols[0], &cols[1], &cols[2], &cols[3],
+                                   &rows[0], &rows[1], &rows[2], &rows[3]);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// 8x8 block: zone-1 8xN kernel over `left` (the `_32bit_` variant when
+// bd >= 12), then an 8x8 transpose; each transposed vector is one full
+// output row.
+static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *left,
+                                             int upsample_left, int dy,
+                                             int bd) {
+  __m128i cols[8], rows[8];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_8xN_internal_avx2(8, cols, left, upsample_left,
+                                              dy);
+  }
+
+  highbd_transpose8x8_sse2(cols + 0, cols + 1, cols + 2, cols + 3, cols + 4,
+                           cols + 5, cols + 6, cols + 7, rows + 0, rows + 1,
+                           rows + 2, rows + 3, rows + 4, rows + 5, rows + 6,
+                           rows + 7);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// 4x8 block: four vectors from the zone-1 8xN kernel (the `_32bit_` variant
+// when bd >= 12) are transposed into eight vectors; the low 64 bits of each
+// are stored as one output row.
+static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *left,
+                                             int upsample_left, int dy,
+                                             int bd) {
+  __m128i cols[4], rows[8];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_8xN_internal_avx2(4, cols, left, upsample_left,
+                                              dy);
+  }
+
+  highbd_transpose4x8_8x4_sse2(cols + 0, cols + 1, cols + 2, cols + 3,
+                               rows + 0, rows + 1, rows + 2, rows + 3,
+                               rows + 4, rows + 5, rows + 6, rows + 7);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// 8x4 block: eight vectors from the zone-1 4xN kernel (the `_32bit_` variant
+// when bd >= 12) are transposed into four vectors, each stored as a full
+// 8-pixel output row.
+static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *left,
+                                             int upsample_left, int dy,
+                                             int bd) {
+  __m128i cols[8], rows[4];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_4xN_internal_avx2(8, cols, left, upsample_left,
+                                              dy);
+  }
+
+  highbd_transpose8x8_low_sse2(cols + 0, cols + 1, cols + 2, cols + 3,
+                               cols + 4, cols + 5, cols + 6, cols + 7,
+                               rows + 0, rows + 1, rows + 2, rows + 3);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// 8x16 block: eight 256-bit vectors from the zone-1 16xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed; the low 128 bits of
+// vector k become row k and the high 128 bits become row k + 8.
+static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m256i cols[8], rows[8];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(8, cols, left, upsample_left,
+                                               dy);
+  }
+
+  highbd_transpose8x16_16x8_avx2(cols, rows);
+
+  for (int k = 0; k < 8; ++k) {
+    _mm_storeu_si128((__m128i *)(dst + k * stride),
+                     _mm256_castsi256_si128(rows[k]));
+    _mm_storeu_si128((__m128i *)(dst + (k + 8) * stride),
+                     _mm256_extracti128_si256(rows[k], 1));
+  }
+}
+
+// 16x8 block: sixteen vectors from the zone-1 8xN kernel (the `_32bit_`
+// variant when bd >= 12) are transposed as two independent 8x8 tiles; the
+// first tile fills columns 0..7 and the second columns 8..15 of each row.
+static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m128i cols[16], rows[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_8xN_internal_avx2(16, cols, left, upsample_left,
+                                              dy);
+  }
+
+  for (int tile = 0; tile < 16; tile += 8) {
+    __m128i *in = cols + tile;
+    __m128i *out = rows + tile;
+    highbd_transpose8x8_sse2(in + 0, in + 1, in + 2, in + 3, in + 4, in + 5,
+                             in + 6, in + 7, out + 0, out + 1, out + 2,
+                             out + 3, out + 4, out + 5, out + 6, out + 7);
+  }
+
+  for (int r = 0; r < 8; ++r) {
+    uint16_t *row = dst + r * stride;
+    _mm_storeu_si128((__m128i *)row, rows[r]);
+    _mm_storeu_si128((__m128i *)(row + 8), rows[r + 8]);
+  }
+}
+
+// 4x16 block: four 256-bit vectors from the zone-1 16xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed. Each transposed vector k
+// carries four output rows (k, k + 4, k + 8, k + 12), one per 64-bit
+// quarter.
+static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m256i cols[4], rows[4];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(4, cols, left, upsample_left,
+                                               dy);
+  }
+
+  highbd_transpose4x16_avx2(cols, rows);
+
+  for (int k = 0; k < 4; ++k) {
+    // Shift each 128-bit lane right by 8 bytes so its upper quarter can be
+    // written with a 64-bit store.
+    const __m256i upper = _mm256_bsrli_epi128(rows[k], 8);
+
+    _mm_storel_epi64((__m128i *)(dst + k * stride),
+                     _mm256_castsi256_si128(rows[k]));
+    _mm_storel_epi64((__m128i *)(dst + (k + 4) * stride),
+                     _mm256_castsi256_si128(upper));
+    _mm_storel_epi64((__m128i *)(dst + (k + 8) * stride),
+                     _mm256_extracti128_si256(rows[k], 1));
+    _mm_storel_epi64((__m128i *)(dst + (k + 12) * stride),
+                     _mm256_extracti128_si256(upper, 1));
+  }
+}
+
+// 16x4 block: sixteen vectors from the zone-1 4xN kernel (the `_32bit_`
+// variant when bd >= 12) are transposed into eight vectors; vectors 2r and
+// 2r + 1 hold the left and right halves of output row r.
+static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m128i cols[16], halves[8];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_4xN_internal_avx2(16, cols, left, upsample_left,
+                                              dy);
+  }
+
+  highbd_transpose16x4_8x8_sse2(cols, halves);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), halves[2 * r]);
+    _mm_storeu_si128((__m128i *)(dst + r * stride + 8), halves[2 * r + 1]);
+  }
+}
+
+// 8x32 block: sixteen 256-bit vectors from the zone-1 32xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed as two groups of eight.
+// For k in 0..7, the first group yields rows k (low half) and k + 8 (high
+// half); the second group yields rows k + 16 and k + 24.
+static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m256i cols[16], rows[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_32xN_internal_avx2(8, cols, left, upsample_left,
+                                               dy);
+  }
+
+  highbd_transpose8x16_16x8_avx2(cols, rows);
+  highbd_transpose8x16_16x8_avx2(cols + 8, rows + 8);
+
+  for (int k = 0; k < 8; ++k) {
+    _mm_storeu_si128((__m128i *)(dst + k * stride),
+                     _mm256_castsi256_si128(rows[k]));
+    _mm_storeu_si128((__m128i *)(dst + (k + 8) * stride),
+                     _mm256_extracti128_si256(rows[k], 1));
+    _mm_storeu_si128((__m128i *)(dst + (k + 16) * stride),
+                     _mm256_castsi256_si128(rows[k + 8]));
+    _mm_storeu_si128((__m128i *)(dst + (k + 24) * stride),
+                     _mm256_extracti128_si256(rows[k + 8], 1));
+  }
+}
+
+// 32x8 block: thirty-two vectors from the zone-1 8xN kernel (the `_32bit_`
+// variant when bd >= 12) are transposed as four 8x8 tiles; tile t supplies
+// columns 8t..8t+7 of every output row.
+static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
+                                              const uint16_t *left,
+                                              int upsample_left, int dy,
+                                              int bd) {
+  __m128i cols[32], rows[32];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, cols, left,
+                                                    upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_8xN_internal_avx2(32, cols, left, upsample_left,
+                                              dy);
+  }
+
+  for (int tile = 0; tile < 32; tile += 8) {
+    __m128i *in = cols + tile;
+    __m128i *out = rows + tile;
+    highbd_transpose8x8_sse2(in + 0, in + 1, in + 2, in + 3, in + 4, in + 5,
+                             in + 6, in + 7, out + 0, out + 1, out + 2,
+                             out + 3, out + 4, out + 5, out + 6, out + 7);
+  }
+
+  for (int r = 0; r < 8; ++r) {
+    uint16_t *row = dst + r * stride;
+    _mm_storeu_si128((__m128i *)row, rows[r]);
+    _mm_storeu_si128((__m128i *)(row + 8), rows[r + 8]);
+    _mm_storeu_si128((__m128i *)(row + 16), rows[r + 16]);
+    _mm_storeu_si128((__m128i *)(row + 24), rows[r + 24]);
+  }
+}
+
+// 16x16 block: sixteen 256-bit vectors from the zone-1 16xN kernel (the
+// `_32bit_` variant when bd >= 12), one 16x16 transpose, one 256-bit store
+// per output row.
+static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  __m256i cols[16], rows[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(16, cols, left, upsample_left,
+                                               dy);
+  }
+
+  highbd_transpose16x16_avx2(cols, rows);
+
+  for (int r = 0; r < 16; ++r) {
+    _mm256_storeu_si256((__m256i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// 32x32 block: sixty-four 256-bit vectors from the zone-1 32xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed as four 16x16 tiles.
+// Tile q (input vectors 16q..16q+15) lands at row offset 16 * (q / 2) and
+// column offset 16 * (q % 2) of the destination.
+static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  __m256i cols[64], tile[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_32xN_internal_avx2(32, cols, left, upsample_left,
+                                               dy);
+  }
+
+  for (int q = 0; q < 4; ++q) {
+    const int row_off = (q >> 1) * 16;
+    const int col_off = (q & 1) * 16;
+
+    highbd_transpose16x16_avx2(cols + 16 * q, tile);
+    for (int r = 0; r < 16; ++r) {
+      _mm256_storeu_si256(
+          (__m256i *)(dst + (r + row_off) * stride + col_off), tile[r]);
+    }
+  }
+}
+
+// 64x64 block: the zone-1 64xN kernel (the `_32bit_` variant when bd >= 12)
+// writes 64 rows with pitch 64 into a scratch buffer, which is then
+// transposed into dst by the generic highbd_transpose().
+static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  DECLARE_ALIGNED(16, uint16_t, scratch[64 * 64]);
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(64, scratch, 64, left,
+                                            upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_64xN_avx2(64, scratch, 64, left, upsample_left,
+                                      dy);
+  }
+
+  highbd_transpose(scratch, 64, dst, stride, 64, 64);
+}
+
+// 16x32 block: thirty-two 256-bit vectors from the zone-1 32xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed in four groups of eight.
+// For each 16-row band, row `top` takes the low halves of vectors `top`
+// and `top + 8`; row `top + 8` takes the high halves of the same two
+// vectors.
+static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  __m256i cols[32], rows[32];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_32xN_internal_avx2(16, cols, left, upsample_left,
+                                               dy);
+  }
+
+  for (int g = 0; g < 32; g += 8) {
+    highbd_transpose8x16_16x8_avx2(cols + g, rows + g);
+  }
+
+  for (int band = 0; band < 32; band += 16) {
+    for (int k = 0; k < 8; ++k) {
+      const int top = band + k;
+      const int bottom = top + 8;
+      const __m128i top_hi = _mm256_extracti128_si256(rows[top], 1);
+
+      _mm_storeu_si128((__m128i *)(dst + top * stride),
+                       _mm256_castsi256_si128(rows[top]));
+      _mm_storeu_si128((__m128i *)(dst + top * stride + 8),
+                       _mm256_castsi256_si128(rows[bottom]));
+      // Replace the low half of rows[bottom] with the high half of
+      // rows[top] and write the whole 16-pixel row at once.
+      _mm256_storeu_si256((__m256i *)(dst + bottom * stride),
+                          _mm256_inserti128_si256(rows[bottom], top_hi, 0));
+    }
+  }
+}
+
+// 32x16 block: thirty-two 256-bit vectors from the zone-1 16xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed as two 16x16 tiles; the
+// tile built from vectors c..c+15 fills columns c..c+15 of the 16 rows.
+static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  __m256i cols[32], tile[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(32, cols, left, upsample_left,
+                                               dy);
+  }
+
+  for (int c = 0; c < 32; c += 16) {
+    highbd_transpose16x16_avx2(cols + c, tile);
+    for (int r = 0; r < 16; ++r) {
+      _mm256_storeu_si256((__m256i *)(dst + r * stride + c), tile[r]);
+    }
+  }
+}
+
+// 32x64 block: the zone-1 64xN kernel (the `_32bit_` variant when bd >= 12)
+// writes 32 rows with pitch 64 into a scratch buffer, which is then
+// transposed into dst by the generic highbd_transpose().
+static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  // Declared with the same alignment as the scratch buffers of the sibling
+  // 64x64, 64x32 and 16x64 kernels.
+  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 32]);
+  if (bd < 12) {
+    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+  } else {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
+                                            dy);
+  }
+  highbd_transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+// 64x32 block: the zone-1 32xN wrapper writes 64 rows with pitch 32 into a
+// scratch buffer, which is then transposed into dst. bd is passed straight
+// through to the callee; no bit-depth branch is taken here.
+static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  DECLARE_ALIGNED(16, uint16_t, scratch[32 * 64]);
+
+  highbd_dr_prediction_z1_32xN_avx2(64, scratch, 32, left, upsample_left, dy,
+                                    bd);
+  highbd_transpose(scratch, 32, dst, stride, 64, 32);
+}
+
+// 16x64 block: the zone-1 64xN kernel (the `_32bit_` variant when bd >= 12)
+// writes 16 rows with pitch 64 into a scratch buffer, which is then
+// transposed into dst by the generic highbd_transpose().
+static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  DECLARE_ALIGNED(16, uint16_t, scratch[64 * 16]);
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_64xN_avx2(16, scratch, 64, left,
+                                            upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_64xN_avx2(16, scratch, 64, left, upsample_left,
+                                      dy);
+  }
+
+  highbd_transpose(scratch, 64, dst, stride, 16, 64);
+}
+
+// 64x16 block: sixty-four 256-bit vectors from the zone-1 16xN kernel (the
+// `_32bit_` variant when bd >= 12) are transposed as four 16x16 tiles; the
+// tile built from vectors c..c+15 fills columns c..c+15 of the 16 rows.
+static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
+                                               const uint16_t *left,
+                                               int upsample_left, int dy,
+                                               int bd) {
+  __m256i cols[64], tile[16];
+
+  if (bd >= 12) {
+    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, cols, left,
+                                                     upsample_left, dy);
+  } else {
+    highbd_dr_prediction_z1_16xN_internal_avx2(64, cols, left, upsample_left,
+                                               dy);
+  }
+
+  for (int c = 0; c < 64; c += 16) {
+    highbd_transpose16x16_avx2(cols + c, tile);
+    for (int r = 0; r < 16; ++r) {
+      _mm256_storeu_si256((__m256i *)(dst + r * stride + c), tile[r]);
+    }
+  }
+}
+
+// Zone-3 entry point: forwards to the kernel specialised for a bw x bh
+// block. `above` is ignored and `dx` is only checked by an assert (it must
+// be 1). Size combinations with no matching case below write nothing.
+void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_left,
+                                      int dx, int dy, int bd) {
+  (void)above;
+  (void)dx;
+
+  assert(dx == 1);
+  assert(dy > 0);
+
+  if (bw == bh) {
+    // Square blocks.
+    switch (bw) {
+      case 4:
+        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
+                                         bd);
+        break;
+      case 8:
+        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
+                                         bd);
+        break;
+      case 16:
+        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+      case 32:
+        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+      case 64:
+        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+    }
+  } else if (bw < bh) {
+    if (2 * bw == bh) {
+      // Tall blocks whose height is twice the width.
+      switch (bw) {
+        case 4:
+          highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+          break;
+        case 8:
+          highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
+                                            dy, bd);
+          break;
+        case 16:
+          highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
+                                             dy, bd);
+          break;
+        case 32:
+          highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
+                                             dy, bd);
+          break;
+      }
+    } else {
+      // Every other tall shape, keyed by width: 4x16, 8x32, 16x64.
+      switch (bw) {
+        case 4:
+          highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
+                                            dy, bd);
+          break;
+        case 8:
+          highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
+                                            dy, bd);
+          break;
+        case 16:
+          highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
+                                             dy, bd);
+          break;
+      }
+    }
+  } else if (2 * bh == bw) {
+    // Wide blocks whose width is twice the height, keyed by height.
+    switch (bh) {
+      case 4:
+        highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy,
+                                         bd);
+        break;
+      case 8:
+        highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy,
+                                          bd);
+        break;
+      case 16:
+        highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+      case 32:
+        highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+    }
+  } else {
+    // Every other wide shape, keyed by height: 16x4, 32x8, 64x16.
+    switch (bh) {
+      case 4:
+        highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy,
+                                          bd);
+        break;
+      case 8:
+        highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy,
+                                          bd);
+        break;
+      case 16:
+        highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
+                                           dy, bd);
+        break;
+    }
+  }
+}
+
+// Low bit depth functions. BaseMask[n] (n = 0..32) has its first n bytes set to 0xff and the remaining bytes 0; the kernels below pass a row to *_blendv_epi8 as the per-byte selector.
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { // 32-byte aligned; rows are read as __m128i or __m256i
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // n = 0: no byte selected
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // n = 16: whole low 128-bit half selected
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, // n = 32: every byte selected
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( // 8-bit zone-1 kernel: fills dst[0..W-1], one 16-byte vector per iteration of the r loop
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, // H caps how many leading bytes of each vector come from the interpolation; W is the number of vectors
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above; // index of the above[] sample used as the fill value
+
+ assert(dx > 0);
+ // Each output pixel is a 2-tap interpolation between neighbouring edge
+ // samples; the intermediate terms computed per lane are:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi16(16); // rounding term
+ a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]); // fill value broadcast to all 16 bytes
+ c3f = _mm256_set1_epi16(0x3f); // mask for the fractional position bits
+
+ int x = dx; // fixed-point position, advanced by dx per output vector
+ for (int r = 0; r < W; r++) {
+ __m256i b, res, shift;
+ __m128i res1, a0_128, a1_128;
+
+ int base = x >> frac_bits; // integer sample index into above[]
+ int base_max_diff = (max_base_x - base) >> upsample_above; // samples left before max_base_x
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // every remaining vector is all above[max_base_x]
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H; // keeps the BaseMask row index at or below H
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base)); // unaligned 16-byte loads starting at base and base + 1
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); // EvenOddMaskx is defined elsewhere; presumably separates even and odd samples -- TODO confirm
+ a1_128 = _mm_srli_si128(a0_128, 8); // a1 becomes the upper 8 bytes of the shuffled a0, replacing the load above
+
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); // weight: low 6 bits of x, halved; same for every lane
+ }
+ a0 = _mm256_cvtepu8_epi16(a0_128); // widen to 16-bit lanes for the arithmetic
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // goto 8 bit
+ res1 = _mm256_castsi256_si128(res); // 16 8bit values
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); // first base_max_diff bytes from res1, the rest are the fill value
+ x += dx;
+ }
+}
+
+// 4-wide, N-row 8-bit zone-1 block: the shared kernel produces one vector
+// per row and the low 32 bits (4 pixels) of each vector are written to dst.
+static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  __m128i rows[16];
+
+  dr_prediction_z1_HxW_internal_avx2(4, N, rows, above, upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    const int pixels = _mm_cvtsi128_si32(rows[r]);
+    *(int *)(dst + stride * r) = pixels;
+  }
+}
+
+// 8-wide, N-row 8-bit zone-1 block: the shared kernel produces one vector
+// per row and the low 64 bits (8 pixels) of each vector are written to dst.
+static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, int upsample_above,
+                                      int dx) {
+  __m128i rows[32];
+
+  dr_prediction_z1_HxW_internal_avx2(8, N, rows, above, upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    __m128i *out = (__m128i *)(dst + stride * r);
+    _mm_storel_epi64(out, rows[r]);
+  }
+}
+
+// 16-wide, N-row 8-bit zone-1 block: the shared kernel produces one vector
+// per row and the whole 16-byte vector is written to dst.
+static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  __m128i rows[64];
+
+  dr_prediction_z1_HxW_internal_avx2(16, N, rows, above, upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    __m128i *out = (__m128i *)(dst + stride * r);
+    _mm_storeu_si128(out, rows[r]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( // 8-bit zone-1 kernel for 32-pixel rows: fills dstvec[0..N-1], one 32-byte vector per row
+ int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1); // index of the above[] sample used as the fill value
+
+ // Each output pixel is a 2-tap interpolation between neighbouring edge
+ // samples; the intermediate terms computed per lane are:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+
+ a16 = _mm256_set1_epi16(16); // rounding term
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); // fill value broadcast to all 32 bytes
+ c3f = _mm256_set1_epi16(0x3f); // mask for the 6 fractional bits
+
+ int x = dx; // fixed-point position, advanced by dx per row
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res16[2];
+ __m128i a0_128, a1_128;
+
+ int base = x >> frac_bits; // integer sample index into above[]
+ int base_max_diff = (max_base_x - base); // samples left before max_base_x
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32; // BaseMask has rows 0..32
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); // weight: low 6 bits of x, halved; same for every lane
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) { // two 16-pixel halves per row
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x; // this half lies entirely past max_base_x: no load is done
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+ a0 = _mm256_cvtepu8_epi16(a0_128); // widen to 16-bit lanes for the arithmetic
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res16[jj] = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+ }
+ }
+ res16[1] =
+ _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
+ 1); // 32 8bit values
+
+ dstvec[r] = _mm256_blendv_epi8(
+ a_mbase_x, res16[1],
+ *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
+ x += dx;
+ }
+}
+
+// Predicts a block that is 32 pixels wide and N rows tall. The rows are first
+// produced into a 64-entry stack buffer by
+// dr_prediction_z1_32xN_internal_avx2() and then written to dst, one unaligned
+// 32-byte store per row, advancing by stride.
+static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above, int upsample_above,
+                                       int dx) {
+  __m256i rows[64];
+  dr_prediction_z1_32xN_internal_avx2(N, rows, above, upsample_above, dx);
+
+  uint8_t *out = dst;
+  for (int r = 0; r < N; ++r, out += stride) {
+    _mm256_storeu_si256((__m256i *)out, rows[r]);
+  }
+}
+
+static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6; // x carries 6 fractional bits
+ const int max_base_x = ((64 + N) - 1); // index of the padding pixel
+ // Writes N rows of 64 predicted 8-bit pixels straight to dst, one row per
+ // stride. Row r uses x = (r + 1) * dx, base = x >> 6, shift = (x & 0x3f) >> 1.
+ // Per pixel, the 16-bit intermediates are:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+ __m128i max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi16(16); // rounding term
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); // padding pixel x32
+ max_base_x128 = _mm_set1_epi8(max_base_x); // compared against byte indices below
+ c3f = _mm256_set1_epi16(0x3f); // mask for the 6 fractional bits
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+ int base = x >> frac_bits;
+ if (base >= max_base_x) { // this and all later rows are pure padding
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ // the row is produced in four chunks of 16 pixels
+ __m128i a0_128, a1_128, res128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) { // chunk starts at or past max_base_x: store padding
+ _mm_storeu_si128((__m128i *)(dst + j),
+ _mm256_castsi256_si128(a_mbase_x));
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu8_epi16(a0_128); // widen 16 bytes to 16-bit lanes
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+ // byte k of base_inc128 is the edge index base + j + k, truncated to 8 bits
+ base_inc128 =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+ // mask is set where max_base_x - index > 0 (unsigned saturating subtract)
+ mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
+ _mm_setzero_si128());
+ res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
+ _mm256_castsi256_si128(res), mask128); // unmasked bytes get the padding pixel
+ _mm_storeu_si128((__m128i *)(dst + j), res128);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Zone 1 directional intra prediction (0 < angle < 90), AVX2 entry point.
+// Selects the fixed-width kernel that matches bw; for any other width nothing
+// is written. left and dy are not used here.
+void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+
+  if (bw == 4) {
+    dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 8) {
+    dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 16) {
+    dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 32) {
+    dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 64) {
+    dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
+  }
+}
+
+static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ // Zone 2 predictor for blocks 4 pixels wide and N rows tall; writes to dst.
+ assert(dx > 0);
+ // Each row combines values interpolated from above[] (low 64 bits of the
+ // working vectors) with values interpolated from left[] (high 64 bits):
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0_x, a1_x, a32, a16, diff;
+ __m128i c3f, min_base_y128, c1234, dy128;
+
+ a16 = _mm_set1_epi16(16); // rounding term
+ c3f = _mm_set1_epi16(0x3f); // mask for the 6 fractional bits
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+ dy128 = _mm_set1_epi16(dy);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x; // integer part of this row's start position along above[]
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+ // base_min_diff is now clamped to 0..4; it indexes BaseMask further down
+ if (base_shift > 3) { // the above[] part of this row is computed from zero inputs
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6); // lane c holds c << 6 (a multiple of 64)
+ // EvenOddMaskx / LoadMaskx are byte-shuffle tables defined outside this block
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 1); // neighbour pixel: same bytes moved down by one
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_x128);
+ a1_x = _mm_cvtepu8_epi16(a1_x128);
+ }
+ // y calc: left[] part, evaluated only when base_x < min_base_x
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c128, base_y_c128, mask128, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2); // lanes 0..3 become 1,2,3,4
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); // indices below min_base_y become 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+ // scalar gather of left[base_y] and left[base_y + 1] for the 4 columns
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); // a16 >> 4 is 1 in every lane
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y); // low 64 bits: above[] part, high 64 bits: left[] part
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+ // after packing, bytes 0-3 come from 16-bit lanes 0-3 and bytes 4-7 from lanes 4-7
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4); // bring bytes 4-7 down to bytes 0-3
+ // NOTE(review): BaseMask is defined elsewhere; presumably entry k takes the first k bytes from resy - confirm
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy); // write the 4 pixels of this row
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ // Zone 2 predictor for blocks 8 pixels wide and N rows tall; writes to dst.
+ // Each row combines values interpolated from above[] (low 128 bits of the
+ // working vectors) with values interpolated from left[] (high 128 bits):
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i diff, a32, a16;
+ __m256i a0_x, a1_x;
+ __m128i a0_x128, a1_x128, min_base_y128, c3f;
+ __m128i c1234, dy128;
+
+ a16 = _mm256_set1_epi16(16); // rounding term
+ c3f = _mm_set1_epi16(0x3f); // mask for the 6 fractional bits
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x; // integer part of this row's start position along above[]
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+ // base_min_diff is now clamped to 0..8; it indexes BaseMask further down
+ if (base_shift > 7) { // the above[] part of this row is computed from zero inputs
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); // every lane is a multiple of 64
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_srli_si128(a0_x128, 1); // neighbour pixel: loaded bytes moved down by one
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(
+ _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
+ }
+ a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
+ a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
+ }
+
+ // y calc: left[] part, evaluated only when base_x < min_base_x
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); // indices below min_base_y become 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+ // scalar gather of left[base_y] and left[base_y + 1] for the 8 columns
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c128 = _mm_add_epi16(
+ base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); // a16 >> 4 is 1 in every lane
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ // low 128 bits keep the above[] part, high 128 bits receive the left[] part
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ // resx: low 128 bits packed to bytes; resy: high 128 bits packed to bytes
+ resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
+ _mm256_castsi256_si128(res));
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi16(resy, resy);
+ // NOTE(review): BaseMask is defined elsewhere; presumably entry k takes the first k bytes from resy - confirm
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy); // write the 8 pixels of this row
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+ // Zone 2 predictor for H rows of W pixels; each row is written in 16-byte chunks (assumes W % 16 == 0 - TODO confirm)
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
+ __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
+ __m128i a0_x128, a1_x128;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ a16 = _mm256_set1_epi16(16); // rounding term
+ c1 = _mm256_srli_epi16(a16, 4); // 16 >> 4 = 1 in every lane
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f); // mask for the 6 fractional bits
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1); // 1..16
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, j256, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x; // integer part of this row's start position along above[]
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi16(j);
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+ // base_min_diff is now clamped to 0..16; it indexes BaseMask at the end of the chunk
+ if (base_shift < 16) { // above[] part of this chunk
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu8_epi16(a0_x128);
+ a1_x = _mm256_cvtepu8_epi16(a1_x128);
+
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); // (column index) << 6
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc: left[] part, evaluated only when base_x < min_base_x
+ if (base_x < min_base_x) {
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1)); // 0xffff >> 1 = 0x7fff: unsigned cap on the product
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+ // indices below min_base_y are raised to min_base_y
+ base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
+ int16_t min_y = (int16_t)_mm_extract_epi16(
+ _mm256_extracti128_si256(base_y_c256, 1), 7); // index in lane 15
+ int16_t max_y =
+ (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); // index in lane 0
+ int16_t offset_diff = max_y - min_y;
+ // narrow index span: load the left[] bytes once and select them with a byte shuffle
+ if (offset_diff < 16) {
+ __m256i min_y256 = _mm256_set1_epi16(min_y);
+
+ __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+ __m128i base_y_offset128 =
+ _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+ _mm256_extracti128_si256(base_y_offset, 1));
+ // NOTE(review): LoadMaskz2 is defined elsewhere; presumably it enables the dwords that cover the span - confirm
+ __m128i a0_y128 = _mm_maskload_epi32(
+ (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ __m128i a1_y128 =
+ _mm_maskload_epi32((int *)(left + min_y + 1),
+ *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+ a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+ a0_y = _mm256_cvtepu8_epi16(a0_y128);
+ a1_y = _mm256_cvtepu8_epi16(a1_y128);
+ } else { // wide index span: gather left[] one element at a time
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); // lanes flagged in mask256 get index 0
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1); // step to the next left[] element
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ }
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resy = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); // NOTE(review): BaseMask is defined elsewhere - confirm which bytes it selects
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Zone 2 directional intra prediction (90 < angle < 180), AVX2 entry point.
+// Widths 4 and 8 have dedicated kernels; every other width is handed to the
+// generic HxW kernel. dx and dy are asserted to be positive.
+void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_above, int upsample_left, int dx,
+                               int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  if (bw == 4) {
+    dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
+                              upsample_left, dx, dy);
+  } else if (bw == 8) {
+    dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
+                              upsample_left, dx, dy);
+  } else {
+    dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+                              upsample_above, upsample_left, dx, dy);
+  }
+}
+
+// z3 functions
+static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m256i w10, w11, w12, w13, w14, w15;
+ // 16x16 byte transpose in each 128-bit lane: d[i] = byte i of x[0], ..., x[15]
+ w0 = _mm256_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi8(x[6], x[7]);
+ // same byte interleave for inputs 8..15
+ w8 = _mm256_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm256_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm256_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm256_unpacklo_epi8(x[14], x[15]);
+ // 16-bit interleave: groups of one byte from four consecutive inputs
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+ // 32-bit interleave: groups of one byte from eight consecutive inputs
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // d[0..3]: bytes 0..3 of every input (per 128-bit lane)
+ d[0] = _mm256_unpacklo_epi64(w6, w14);
+ d[1] = _mm256_unpackhi_epi64(w6, w14);
+ d[2] = _mm256_unpacklo_epi64(w7, w15);
+ d[3] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // d[4..7]: bytes 4..7 of every input (per 128-bit lane)
+ d[4] = _mm256_unpacklo_epi64(w6, w14);
+ d[5] = _mm256_unpackhi_epi64(w6, w14);
+ d[6] = _mm256_unpacklo_epi64(w7, w15);
+ d[7] = _mm256_unpackhi_epi64(w7, w15);
+
+ // upper half: bytes 8..15 of each 128-bit lane, same sequence as above
+ w0 = _mm256_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm256_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm256_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm256_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // d[8..11]: bytes 8..11 of every input (per 128-bit lane)
+ d[8] = _mm256_unpacklo_epi64(w6, w14);
+ d[9] = _mm256_unpackhi_epi64(w6, w14);
+ d[10] = _mm256_unpacklo_epi64(w7, w15);
+ d[11] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // d[12..15]: bytes 12..15 of every input (per 128-bit lane)
+ d[12] = _mm256_unpacklo_epi64(w6, w14);
+ d[13] = _mm256_unpackhi_epi64(w6, w14);
+ d[14] = _mm256_unpacklo_epi64(w7, w15);
+ d[15] = _mm256_unpackhi_epi64(w7, w15);
+}
+
+// Zone 3 prediction of a 4x4 block. The zone 1 kernel is run on the left edge
+// with step dy, its output is passed through transpose4x8_8x4_low_sse2(), and
+// the low 4 bytes of each resulting vector are written as one row of dst.
+static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i pred[4], out[4];
+
+  dr_prediction_z1_HxW_internal_avx2(4, 4, pred, left, upsample_left, dy);
+  transpose4x8_8x4_low_sse2(pred, pred + 1, pred + 2, pred + 3, out, out + 1,
+                            out + 2, out + 3);
+
+  for (int row = 0; row < 4; ++row) {
+    *(int *)(dst + stride * row) = _mm_cvtsi128_si32(out[row]);
+  }
+}
+
+// Zone 3 prediction of an 8x8 block. The zone 1 kernel is run on the left
+// edge with step dy and its output is passed through transpose8x8_sse2(),
+// which fills four vectors; each of them supplies two consecutive rows of dst
+// (low 8 bytes first, then high 8 bytes).
+static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i pred[8], out[8];
+
+  dr_prediction_z1_HxW_internal_avx2(8, 8, pred, left, upsample_left, dy);
+  transpose8x8_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4, pred + 5,
+                    pred + 6, pred + 7, out, out + 1, out + 2, out + 3);
+
+  for (int k = 0; k < 4; ++k) {
+    uint8_t *const even_row = dst + (2 * k) * stride;
+    _mm_storel_epi64((__m128i *)even_row, out[k]);
+    _mm_storel_epi64((__m128i *)(even_row + stride),
+                     _mm_srli_si128(out[k], 8));
+  }
+}
+
+// Zone 3 prediction of a block 4 wide and 8 tall. The zone 1 kernel output
+// for the left edge is passed through transpose4x8_8x4_sse2(), and the low
+// 4 bytes of each of the eight resulting vectors form one row of dst.
+static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i pred[4], out[8];
+
+  dr_prediction_z1_HxW_internal_avx2(8, 4, pred, left, upsample_left, dy);
+  transpose4x8_8x4_sse2(pred, pred + 1, pred + 2, pred + 3, out, out + 1,
+                        out + 2, out + 3, out + 4, out + 5, out + 6, out + 7);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 8; ++i, row += stride) {
+    *(int *)row = _mm_cvtsi128_si32(out[i]);
+  }
+}
+
+// Zone 3 prediction of a block 8 wide and 4 tall. The zone 1 kernel output
+// for the left edge is passed through transpose8x8_low_sse2(), and the low
+// 8 bytes of each of the four resulting vectors form one row of dst.
+static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *left, int upsample_left,
+                                      int dy) {
+  __m128i pred[8], out[4];
+
+  dr_prediction_z1_HxW_internal_avx2(4, 8, pred, left, upsample_left, dy);
+  transpose8x8_low_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4, pred + 5,
+                        pred + 6, pred + 7, out, out + 1, out + 2, out + 3);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 4; ++i, row += stride) {
+    _mm_storel_epi64((__m128i *)row, out[i]);
+  }
+}
+
+// Zone 3 prediction of a block 8 wide and 16 tall. The zone 1 kernel output
+// for the left edge is passed through transpose8x16_16x8_sse2(); vector i then
+// supplies row i (low 8 bytes) and row i + 8 (high 8 bytes) of dst.
+static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i pred[8], out[8];
+
+  dr_prediction_z1_HxW_internal_avx2(16, 8, pred, left, upsample_left, dy);
+  transpose8x16_16x8_sse2(&pred[0], &pred[1], &pred[2], &pred[3], &pred[4],
+                          &pred[5], &pred[6], &pred[7], &out[0], &out[1],
+                          &out[2], &out[3], &out[4], &out[5], &out[6],
+                          &out[7]);
+
+  uint8_t *upper = dst;
+  uint8_t *lower = dst + 8 * stride;
+  for (int i = 0; i < 8; ++i, upper += stride, lower += stride) {
+    _mm_storel_epi64((__m128i *)upper, out[i]);
+    _mm_storel_epi64((__m128i *)lower, _mm_srli_si128(out[i], 8));
+  }
+}
+
+// Zone 3 prediction of a block 16 wide and 8 tall. The zone 1 kernel output
+// for the left edge is passed through transpose16x8_8x16_sse2(), and each of
+// the eight resulting vectors is stored as one 16-byte row of dst.
+static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i pred[16], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(8, 16, pred, left, upsample_left, dy);
+  transpose16x8_8x16_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
+                          pred + 10, pred + 11, pred + 12, pred + 13,
+                          pred + 14, pred + 15, out, out + 1, out + 2,
+                          out + 3, out + 4, out + 5, out + 6, out + 7);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 8; ++i, row += stride) {
+    _mm_storeu_si128((__m128i *)row, out[i]);
+  }
+}
+
+// Zone 3 prediction of a block 4 wide and 16 tall. The zone 1 kernel output
+// for the left edge is passed through transpose4x16_sse2(), and the low
+// 4 bytes of each of the sixteen resulting vectors form one row of dst.
+static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i pred[4], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(16, 4, pred, left, upsample_left, dy);
+  transpose4x16_sse2(pred, out);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 16; ++i, row += stride) {
+    *(int *)row = _mm_cvtsi128_si32(out[i]);
+  }
+}
+
+// Zone 3 prediction of a block 16 wide and 4 tall. The zone 1 kernel output
+// for the left edge is passed through transpose16x8_8x16_sse2(); only the
+// first four of its eight output vectors are stored, one 16-byte row each.
+static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i pred[16], out[8];
+
+  dr_prediction_z1_HxW_internal_avx2(4, 16, pred, left, upsample_left, dy);
+
+  // Outputs 4..7 are cleared before the transpose call, as in the original.
+  out[4] = _mm_setzero_si128();
+  out[5] = _mm_setzero_si128();
+  out[6] = _mm_setzero_si128();
+  out[7] = _mm_setzero_si128();
+
+  transpose16x8_8x16_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
+                          pred + 10, pred + 11, pred + 12, pred + 13,
+                          pred + 14, pred + 15, out, out + 1, out + 2,
+                          out + 3, out + 4, out + 5, out + 6, out + 7);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 4; ++i, row += stride) {
+    _mm_storeu_si128((__m128i *)row, out[i]);
+  }
+}
+
+// Zone 3 prediction of a block 8 wide and 32 tall. The 32-wide zone 1 kernel
+// produces 8 vectors from the left edge; the other 8 transpose inputs are
+// zeroed. After transpose16x32_avx2(), the low 8 bytes of each low lane give
+// rows 0..15 and the low 8 bytes of each high lane give rows 16..31.
+static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m256i pred[16], out[16];
+
+  dr_prediction_z1_32xN_internal_avx2(8, pred, left, upsample_left, dy);
+  for (int k = 8; k < 16; ++k) {
+    pred[k] = _mm256_setzero_si256();
+  }
+  transpose16x32_avx2(pred, out);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 16; ++k, row += stride) {
+    _mm_storel_epi64((__m128i *)row, _mm256_castsi256_si128(out[k]));
+  }
+  // row now points at row 16.
+  for (int k = 0; k < 16; ++k, row += stride) {
+    _mm_storel_epi64((__m128i *)row, _mm256_extracti128_si256(out[k], 1));
+  }
+}
+
+// Zone 3 prediction of a block 32 wide and 8 tall. The zone 1 kernel produces
+// 32 vectors from the left edge; each group of 16 is passed through
+// transpose16x8_8x16_sse2() into 8 output vectors. Row i of dst is the pair
+// (output i of the first group, output i of the second group).
+static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *left, int upsample_left,
+                                       int dy) {
+  __m128i pred[32], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(8, 32, pred, left, upsample_left, dy);
+
+  for (int half = 0; half < 2; ++half) {
+    __m128i *const in = pred + 16 * half;
+    __m128i *const res = out + 8 * half;
+    transpose16x8_8x16_sse2(in, in + 1, in + 2, in + 3, in + 4, in + 5, in + 6,
+                            in + 7, in + 8, in + 9, in + 10, in + 11, in + 12,
+                            in + 13, in + 14, in + 15, res, res + 1, res + 2,
+                            res + 3, res + 4, res + 5, res + 6, res + 7);
+  }
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 8; ++i, row += stride) {
+    _mm_storeu_si128((__m128i *)row, out[i]);
+    _mm_storeu_si128((__m128i *)(row + 16), out[i + 8]);
+  }
+}
+
+// Zone 3 prediction of a 16x16 block. The zone 1 kernel output for the left
+// edge is passed through transpose16x16_sse2(), and each of the sixteen
+// resulting vectors is stored as one 16-byte row of dst.
+static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[16], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(16, 16, pred, left, upsample_left, dy);
+  transpose16x16_sse2(pred, out);
+
+  uint8_t *row = dst;
+  for (int i = 0; i < 16; ++i, row += stride) {
+    _mm_storeu_si128((__m128i *)row, out[i]);
+  }
+}
+
+// Zone 3 prediction of a 32x32 block. The 32-wide zone 1 kernel produces 32
+// vectors from the left edge; each group of 16 goes through
+// transpose16x32_avx2(). Low lanes are stored to rows 0..15 and high lanes to
+// rows 16..31; the first group fills bytes 0..15 of a row, the second group
+// bytes 16..31.
+static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m256i pred[32], out[32];
+
+  dr_prediction_z1_32xN_internal_avx2(32, pred, left, upsample_left, dy);
+  transpose16x32_avx2(&pred[0], &out[0]);
+  transpose16x32_avx2(&pred[16], &out[16]);
+
+  uint8_t *row = dst;
+  for (int k = 0; k < 16; ++k, row += stride) {
+    _mm_storeu_si128((__m128i *)row, _mm256_castsi256_si128(out[k]));
+    _mm_storeu_si128((__m128i *)(row + 16),
+                     _mm256_castsi256_si128(out[k + 16]));
+  }
+  // row now points at row 16.
+  for (int k = 0; k < 16; ++k, row += stride) {
+    _mm_storeu_si128((__m128i *)row, _mm256_extracti128_si256(out[k], 1));
+    _mm_storeu_si128((__m128i *)(row + 16),
+                     _mm256_extracti128_si256(out[k + 16], 1));
+  }
+}
+
+// Zone 3 prediction of a 64x64 block. The 64-wide zone 1 kernel writes its
+// result for the left edge into a 16-byte-aligned scratch buffer with a pitch
+// of 64, which transpose() then copies into dst.
+static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  DECLARE_ALIGNED(16, uint8_t, scratch[64 * 64]);
+
+  dr_prediction_z1_64xN_avx2(64, scratch, 64, left, upsample_left, dy);
+  transpose(scratch, 64, dst, stride, 64, 64);
+}
+
+// Zone 3 prediction of a block 16 wide and 32 tall. The 32-wide zone 1 kernel
+// produces 16 vectors from the left edge; after transpose16x32_avx2(), vector
+// j supplies row j (low lane) and row j + 16 (high lane) of dst.
+static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m256i pred[16], out[16];
+
+  dr_prediction_z1_32xN_internal_avx2(16, pred, left, upsample_left, dy);
+  transpose16x32_avx2(pred, out);
+
+  uint8_t *upper = dst;
+  uint8_t *lower = dst + 16 * stride;
+  for (int j = 0; j < 16; ++j, upper += stride, lower += stride) {
+    _mm_storeu_si128((__m128i *)upper, _mm256_castsi256_si128(out[j]));
+    _mm_storeu_si128((__m128i *)lower, _mm256_extracti128_si256(out[j], 1));
+  }
+}
+
+// Zone 3 prediction of a block 32 wide and 16 tall. The zone 1 kernel produces
+// 32 vectors from the left edge; each group of 16 is passed through
+// transpose16x16_sse2() and stored as a 16-byte-wide column strip of dst.
+static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[32], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(16, 32, pred, left, upsample_left, dy);
+
+  for (int col = 0; col < 32; col += 16) {
+    transpose16x16_sse2(&pred[col], out);
+    uint8_t *row = dst + col;
+    for (int j = 0; j < 16; ++j, row += stride) {
+      _mm_storeu_si128((__m128i *)row, out[j]);
+    }
+  }
+}
+
+// Zone 3 prediction of a block 32 wide and 64 tall. The 64-wide zone 1 kernel
+// writes 32 rows for the left edge into a scratch buffer with a pitch of 64,
+// which transpose() then copies into dst.
+static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t scratch[64 * 32];
+
+  dr_prediction_z1_64xN_avx2(32, scratch, 64, left, upsample_left, dy);
+  transpose(scratch, 64, dst, stride, 32, 64);
+}
+
+// Zone 3 prediction of a block 64 wide and 32 tall. The 32-wide zone 1 kernel
+// writes 64 rows for the left edge into a scratch buffer with a pitch of 32,
+// which transpose() then copies into dst.
+static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t scratch[32 * 64];
+
+  dr_prediction_z1_32xN_avx2(64, scratch, 32, left, upsample_left, dy);
+  transpose(scratch, 32, dst, stride, 64, 32);
+}
+
+// Zone 3 prediction of a block 16 wide and 64 tall. The 64-wide zone 1 kernel
+// writes 16 rows for the left edge into a scratch buffer with a pitch of 64,
+// which transpose() then copies into dst.
+static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  uint8_t scratch[64 * 16];
+
+  dr_prediction_z1_64xN_avx2(16, scratch, 64, left, upsample_left, dy);
+  transpose(scratch, 64, dst, stride, 16, 64);
+}
+
+// Zone 3 prediction of a block 64 wide and 16 tall. The zone 1 kernel produces
+// 64 vectors from the left edge; each group of 16 is passed through
+// transpose16x16_sse2() and stored as a 16-byte-wide column strip of dst.
+static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[64], out[16];
+
+  dr_prediction_z1_HxW_internal_avx2(16, 64, pred, left, upsample_left, dy);
+
+  for (int col = 0; col < 64; col += 16) {
+    transpose16x16_sse2(&pred[col], out);
+    uint8_t *row = dst + col;
+    for (int j = 0; j < 16; ++j, row += stride) {
+      _mm_storeu_si128((__m128i *)row, out[j]);
+    }
+  }
+}
+
+// Zone 3 directional intra prediction, AVX2 entry point. Chooses the kernel
+// for the bw x bh block shape; only the left edge and dy are passed on, above
+// and dx are unused (dx is asserted to be 1, dy to be positive). Shapes with
+// no matching branch below leave dst untouched.
+void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left,
+                               int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  // Square blocks.
+  if (bw == bh) {
+    if (bw == 4) {
+      dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+    } else if (bw == 8) {
+      dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+    } else if (bw == 16) {
+      dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
+    } else if (bw == 32) {
+      dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
+    } else if (bw == 64) {
+      dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
+    }
+    return;
+  }
+
+  // Tall blocks: height equal to twice the width selects the 1:2 kernel; any
+  // other height selects the 1:4 kernel for that width.
+  if (bw < bh) {
+    const int tall_2x = (bw + bw == bh);
+    if (bw == 4) {
+      if (tall_2x) {
+        dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
+      } else {
+        dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
+      }
+    } else if (bw == 8) {
+      if (tall_2x) {
+        dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
+      } else {
+        dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
+      }
+    } else if (bw == 16) {
+      if (tall_2x) {
+        dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
+      } else {
+        dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
+      }
+    } else if (bw == 32 && tall_2x) {
+      dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
+    }
+    return;
+  }
+
+  // Wide blocks: width equal to twice the height selects the 2:1 kernel; any
+  // other width selects the 4:1 kernel for that height.
+  const int wide_2x = (bh + bh == bw);
+  if (bh == 4) {
+    if (wide_2x) {
+      dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
+    } else {
+      dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
+    }
+  } else if (bh == 8) {
+    if (wide_2x) {
+      dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
+    } else {
+      dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
+    }
+  } else if (bh == 16) {
+    if (wide_2x) {
+      dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
+    } else {
+      dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
+    }
+  } else if (bh == 32 && wide_2x) {
+    dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 0000000000..61e29731c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,1411 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "config/aom_dsp_rtcd.h"
+
+ // Writes the 32-bit pattern `dc` at the start of successive rows of `dst`.
+ // Rows are emitted two per pass, so the loop keeps going until the running
+ // row count reaches `height`.
+ static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+   int rows_done = 0;
+   while (rows_done < height) {
+     uint8_t *const next_row = dst + stride;
+     *(uint32_t *)dst = dc;
+     *(uint32_t *)next_row = dc;
+     dst = next_row + stride;
+     rows_done += 2;
+   }
+ }
+
+ // Stores the low 8 bytes of `*row` at the start of each of `height` rows.
+ static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+   for (int rows_left = height; rows_left > 0; --rows_left) {
+     _mm_storel_epi64((__m128i *)dst, *row);
+     dst += stride;
+   }
+ }
+
+ // Stores all 16 bytes of `*row` at the start of each of `height` rows.
+ // _mm_store_si128 is the aligned store, so each row start must be 16-byte
+ // aligned.
+ static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+   int y = 0;
+   while (y < height) {
+     _mm_store_si128((__m128i *)dst, *row);
+     dst += stride;
+     ++y;
+   }
+ }
+
+ // Fills `height` rows of 32 bytes each by storing `*row` twice per row
+ // (aligned 16-byte stores at offsets 0 and 16).
+ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+   for (int y = 0; y < height; ++y, dst += stride) {
+     __m128i *const out = (__m128i *)dst;
+     _mm_store_si128(out, *row);
+     _mm_store_si128(out + 1, *row);
+   }
+ }
+
+ // Fills `height` rows of 64 bytes each by storing `*row` four times per row
+ // (aligned 16-byte stores at offsets 0, 16, 32 and 48).
+ static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+   int y = 0;
+   while (y < height) {
+     for (int x = 0; x < 64; x += 16) {
+       _mm_store_si128((__m128i *)(dst + x), *row);
+     }
+     dst += stride;
+     ++y;
+   }
+ }
+
+ // Returns ref[0] + ref[1] + ref[2] + ref[3] in the low 64-bit lane of the
+ // result; the upper lane is zero.
+ //
+ // Exactly four bytes are loaded. An 8-byte _mm_loadl_epi64 load would also
+ // read ref[4..7], which lies past the end of a 4-pixel edge. The 32-bit load
+ // through a cast follows the same pattern this file uses for 4-wide rows.
+ static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+   const __m128i zero = _mm_setzero_si128();
+   // Bytes 0..3 come from ref; bytes 4..15 of the register are cleared.
+   const __m128i x = _mm_cvtsi32_si128((int)*(const uint32_t *)ref);
+   // With every other byte zero, the SAD against zero over the low 8 bytes is
+   // simply the sum of the four pixels, so no widening step is needed.
+   return _mm_sad_epu8(x, zero);
+ }
+
+ // Returns the sum of ref[0..7] in the low 64-bit lane. The 8-byte load zeroes
+ // the upper half of the register, so the upper lane of the result is zero.
+ static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+   const __m128i eight_bytes = _mm_loadl_epi64((__m128i const *)ref);
+   return _mm_sad_epu8(eight_bytes, _mm_setzero_si128());
+ }
+
+ // Sums the 64 bytes at `ref` (four aligned 16-byte loads). Each load is
+ // reduced with a SAD against zero, the partial sums are accumulated with
+ // 16-bit adds, and the upper lane is finally folded into the lower one so the
+ // total sits in the low lane.
+ static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+   const __m128i zero = _mm_setzero_si128();
+   __m128i total = zero;
+   for (int offset = 0; offset < 64; offset += 16) {
+     const __m128i chunk = _mm_load_si128((__m128i const *)(ref + offset));
+     total = _mm_add_epi16(total, _mm_sad_epu8(chunk, zero));
+   }
+   return _mm_add_epi16(total, _mm_unpackhi_epi64(total, total));
+ }
+
+ #define DC_MULTIPLIER_1X2 0x5556  // 0x5556 == ceil(2^16 / 3): multiply then >> 16 ~ divide by 3
+ #define DC_MULTIPLIER_1X4 0x3334  // 0x3334 == ceil(2^16 / 5): multiply then >> 16 ~ divide by 5
+
+ #define DC_SHIFT2 16  // right shift applied after the multiply in divide_using_multiply_shift()
+
+ // Computes ((num >> shift1) * multiplier) >> DC_SHIFT2, i.e. a division
+ // implemented as a power-of-two shift followed by a fixed-point reciprocal
+ // multiply.
+ static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                               int multiplier) {
+   const int scaled = (num >> shift1) * multiplier;
+   return scaled >> DC_SHIFT2;
+ }
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+ // DC prediction for 4x8: adds dc_sum_4(above) and dc_sum_8(left), biases the
+ // total by 6, divides via (x >> 2) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and writes
+ // the resulting byte to 8 rows of 4 pixels.
+ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_8(left), dc_sum_4(above));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 6;
+   const uint32_t dc = divide_using_multiply_shift(biased, 2, DC_MULTIPLIER_1X2);
+   const __m128i splat = _mm_set1_epi8((int8_t)dc);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(splat), 8, dst, stride);
+ }
+
+ // DC prediction for 4x16: adds dc_sum_4(above) and dc_sum_16_sse2(left) (the
+ // latter is declared outside this chunk), biases by 10, divides via
+ // (x >> 2) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 16 rows of 4 pixels.
+ void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_16_sse2(left), dc_sum_4(above));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 10;
+   const uint32_t dc = divide_using_multiply_shift(biased, 2, DC_MULTIPLIER_1X4);
+   const __m128i splat = _mm_set1_epi8((int8_t)dc);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(splat), 16, dst, stride);
+ }
+
+ // DC prediction for 8x4: adds dc_sum_8(above) and dc_sum_4(left), biases by
+ // 6, divides via (x >> 2) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 4 rows of
+ // 8 pixels.
+ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_8(above), dc_sum_4(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 6;
+   const uint32_t dc = divide_using_multiply_shift(biased, 2, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_8xh(&fill, 4, dst, stride);
+ }
+
+ // DC prediction for 8x16: adds dc_sum_8(above) and dc_sum_16_sse2(left)
+ // (declared outside this chunk), biases by 12, divides via
+ // (x >> 3) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 16 rows of 8 pixels.
+ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_8(above), dc_sum_16_sse2(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 12;
+   const uint32_t dc = divide_using_multiply_shift(biased, 3, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_8xh(&fill, 16, dst, stride);
+ }
+
+ // DC prediction for 8x32: adds dc_sum_8(above) and dc_sum_32_sse2(left)
+ // (declared outside this chunk), biases by 20, divides via
+ // (x >> 3) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 32 rows of 8 pixels.
+ void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_8(above), dc_sum_32_sse2(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 20;
+   const uint32_t dc = divide_using_multiply_shift(biased, 3, DC_MULTIPLIER_1X4);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_8xh(&fill, 32, dst, stride);
+ }
+
+ // DC prediction for 16x4: adds dc_sum_16_sse2(above) (declared outside this
+ // chunk) and dc_sum_4(left), biases by 10, divides via
+ // (x >> 2) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 4 rows of 16 pixels.
+ void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_16_sse2(above), dc_sum_4(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 10;
+   const uint32_t dc = divide_using_multiply_shift(biased, 2, DC_MULTIPLIER_1X4);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_16xh(&fill, 4, dst, stride);
+ }
+
+ // DC prediction for 16x8: adds dc_sum_16_sse2(above) (declared outside this
+ // chunk) and dc_sum_8(left), biases by 12, divides via
+ // (x >> 3) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 8 rows of 16 pixels.
+ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_16_sse2(above), dc_sum_8(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 12;
+   const uint32_t dc = divide_using_multiply_shift(biased, 3, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_16xh(&fill, 8, dst, stride);
+ }
+
+ // DC prediction for 16x32: adds dc_sum_16_sse2(above) and
+ // dc_sum_32_sse2(left) (both declared outside this chunk), biases by 24,
+ // divides via (x >> 4) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 32 rows of
+ // 16 pixels.
+ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum =
+       _mm_add_epi16(dc_sum_32_sse2(left), dc_sum_16_sse2(above));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 24;
+   const uint32_t dc = divide_using_multiply_shift(biased, 4, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_16xh(&fill, 32, dst, stride);
+ }
+
+ // DC prediction for 16x64: adds dc_sum_16_sse2(above) (declared outside this
+ // chunk) and dc_sum_64(left), biases by 40, divides via
+ // (x >> 4) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 64 rows of 16 pixels.
+ void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_64(left), dc_sum_16_sse2(above));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 40;
+   const uint32_t dc = divide_using_multiply_shift(biased, 4, DC_MULTIPLIER_1X4);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_16xh(&fill, 64, dst, stride);
+ }
+
+ // DC prediction for 32x8: adds dc_sum_32_sse2(above) (declared outside this
+ // chunk) and dc_sum_8(left), biases by 20, divides via
+ // (x >> 3) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 8 rows of 32 pixels.
+ void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_32_sse2(above), dc_sum_8(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 20;
+   const uint32_t dc = divide_using_multiply_shift(biased, 3, DC_MULTIPLIER_1X4);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_32xh(&fill, 8, dst, stride);
+ }
+
+ // DC prediction for 32x16: adds dc_sum_32_sse2(above) and
+ // dc_sum_16_sse2(left) (both declared outside this chunk), biases by 24,
+ // divides via (x >> 4) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 16 rows of
+ // 32 pixels.
+ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum =
+       _mm_add_epi16(dc_sum_32_sse2(above), dc_sum_16_sse2(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 24;
+   const uint32_t dc = divide_using_multiply_shift(biased, 4, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_32xh(&fill, 16, dst, stride);
+ }
+
+ // DC prediction for 32x64: adds dc_sum_32_sse2(above) (declared outside this
+ // chunk) and dc_sum_64(left), biases by 48, divides via
+ // (x >> 5) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 64 rows of 32 pixels.
+ void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_32_sse2(above), dc_sum_64(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 48;
+   const uint32_t dc = divide_using_multiply_shift(biased, 5, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_32xh(&fill, 64, dst, stride);
+ }
+
+ // DC prediction for 64x64: (dc_sum_64(above) + dc_sum_64(left) + 64) / 128,
+ // replicated over 64 rows of 64 pixels. The divisor is a power of two, so the
+ // unsigned division is written as a shift by 7.
+ void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_64(above), dc_sum_64(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 64;
+   const uint32_t dc = biased >> 7;
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_64xh(&fill, 64, dst, stride);
+ }
+
+ // DC prediction for 64x32: adds dc_sum_64(above) and dc_sum_32_sse2(left)
+ // (declared outside this chunk), biases by 48, divides via
+ // (x >> 5) * DC_MULTIPLIER_1X2 >> DC_SHIFT2 and fills 32 rows of 64 pixels.
+ void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_64(above), dc_sum_32_sse2(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 48;
+   const uint32_t dc = divide_using_multiply_shift(biased, 5, DC_MULTIPLIER_1X2);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_64xh(&fill, 32, dst, stride);
+ }
+
+ // DC prediction for 64x16: adds dc_sum_64(above) and dc_sum_16_sse2(left)
+ // (declared outside this chunk), biases by 40, divides via
+ // (x >> 4) * DC_MULTIPLIER_1X4 >> DC_SHIFT2 and fills 16 rows of 64 pixels.
+ void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+   const __m128i edge_sum = _mm_add_epi16(dc_sum_64(above), dc_sum_16_sse2(left));
+   const uint32_t biased = (uint32_t)_mm_cvtsi128_si32(edge_sum) + 40;
+   const uint32_t dc = divide_using_multiply_shift(biased, 4, DC_MULTIPLIER_1X4);
+   const __m128i fill = _mm_set1_epi8((int8_t)dc);
+   dc_store_64xh(&fill, 16, dst, stride);
+ }
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+ // DC_TOP for 4x8: every pixel is (dc_sum_4(above) + 2) >> 2; `left` is unused.
+ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(2);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_4(above), bias), 2);
+   // Broadcast the low word to words 0..3, then narrow the words to bytes.
+   const __m128i mean_words = _mm_shufflelo_epi16(mean, 0);
+   const __m128i mean_bytes = _mm_packus_epi16(mean_words, mean_words);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(mean_bytes), 8, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 4x16: every pixel is (dc_sum_4(above) + 2) >> 2; `left` is
+ // unused.
+ void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(2);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_4(above), bias), 2);
+   // Broadcast the low word to words 0..3, then narrow the words to bytes.
+   const __m128i mean_words = _mm_shufflelo_epi16(mean, 0);
+   const __m128i mean_bytes = _mm_packus_epi16(mean_words, mean_words);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(mean_bytes), 16, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 8x4: every pixel is (dc_sum_8(above) + 4) >> 3; `left` is unused.
+ void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(above), bias), 3);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 4, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 8x16: every pixel is (dc_sum_8(above) + 4) >> 3; `left` is
+ // unused.
+ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(above), bias), 3);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 16, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 8x32: every pixel is (dc_sum_8(above) + 4) >> 3; `left` is
+ // unused.
+ void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(above), bias), 3);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 32, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 16x4: every pixel is (dc_sum_16_sse2(above) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(above), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 4, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 16x8: every pixel is (dc_sum_16_sse2(above) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(above), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 8, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 16x32: every pixel is (dc_sum_16_sse2(above) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(above), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 32, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 16x64: every pixel is (dc_sum_16_sse2(above) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(above), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 64, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 32x8: every pixel is (dc_sum_32_sse2(above) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(above), bias), 5);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 8, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 32x16: every pixel is (dc_sum_32_sse2(above) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(above), bias), 5);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 16, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 32x64: every pixel is (dc_sum_32_sse2(above) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `left` is unused.
+ void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(above), bias), 5);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 64, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 64x64: every pixel is (dc_sum_64(above) + 32) >> 6; `left` is
+ // unused.
+ void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(above), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 64, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 64x32: every pixel is (dc_sum_64(above) + 32) >> 6; `left` is
+ // unused.
+ void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(above), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 32, dst, stride);
+   (void)left;
+ }
+
+ // DC_TOP for 64x16: every pixel is (dc_sum_64(above) + 32) >> 6; `left` is
+ // unused.
+ void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(above), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 16, dst, stride);
+   (void)left;
+ }
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+ // DC_LEFT for 4x8: every pixel is (dc_sum_8(left) + 4) >> 3; `above` is
+ // unused.
+ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(left), bias), 3);
+   // Broadcast the low word to words 0..3, then narrow the words to bytes.
+   const __m128i mean_words = _mm_shufflelo_epi16(mean, 0);
+   const __m128i mean_bytes = _mm_packus_epi16(mean_words, mean_words);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(mean_bytes), 8, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 4x16: every pixel is (dc_sum_16_sse2(left) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(left), bias), 4);
+   // Broadcast the low word to words 0..3, then narrow the words to bytes.
+   const __m128i mean_words = _mm_shufflelo_epi16(mean, 0);
+   const __m128i mean_bytes = _mm_packus_epi16(mean_words, mean_words);
+   dc_store_4xh((uint32_t)_mm_cvtsi128_si32(mean_bytes), 16, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 8x4: every pixel is (dc_sum_4(left) + 2) >> 2; `above` is
+ // unused.
+ void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(2);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_4(left), bias), 2);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 4, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 8x16: every pixel is (dc_sum_16_sse2(left) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(left), bias), 4);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 16, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 8x32: every pixel is (dc_sum_32_sse2(left) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(left), bias), 5);
+   // Duplicate byte 0 into both halves of word 0, then broadcast that word
+   // across the low 8 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i fill = _mm_shufflelo_epi16(doubled, 0);
+   dc_store_8xh(&fill, 32, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 16x4: every pixel is (dc_sum_4(left) + 2) >> 2; `above` is
+ // unused.
+ void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(2);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_4(left), bias), 2);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 4, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 16x8: every pixel is (dc_sum_8(left) + 4) >> 3; `above` is
+ // unused.
+ void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(left), bias), 3);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 8, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 16x32: every pixel is (dc_sum_32_sse2(left) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(left), bias), 5);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 32, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 16x64: every pixel is (dc_sum_64(left) + 32) >> 6; `above` is
+ // unused.
+ void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(left), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_16xh(&fill, 64, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 32x8: every pixel is (dc_sum_8(left) + 4) >> 3; `above` is
+ // unused.
+ void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(4);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_8(left), bias), 3);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 8, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 32x16: every pixel is (dc_sum_16_sse2(left) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(left), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 16, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 32x64: every pixel is (dc_sum_64(left) + 32) >> 6; `above` is
+ // unused.
+ void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(left), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_32xh(&fill, 64, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 64x64: every pixel is (dc_sum_64(left) + 32) >> 6; `above` is
+ // unused.
+ void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(32);
+   const __m128i mean = _mm_srai_epi16(_mm_add_epi16(dc_sum_64(left), bias), 6);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 64, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 64x32: every pixel is (dc_sum_32_sse2(left) + 16) >> 5, where
+ // dc_sum_32_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(16);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_32_sse2(left), bias), 5);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 32, dst, stride);
+   (void)above;
+ }
+
+ // DC_LEFT for 64x16: every pixel is (dc_sum_16_sse2(left) + 8) >> 4, where
+ // dc_sum_16_sse2 is declared outside this chunk; `above` is unused.
+ void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+   const __m128i bias = _mm_set1_epi16(8);
+   const __m128i mean =
+       _mm_srai_epi16(_mm_add_epi16(dc_sum_16_sse2(left), bias), 4);
+   // Byte 0 -> word 0 -> low 8 bytes -> all 16 bytes.
+   const __m128i doubled = _mm_unpacklo_epi8(mean, mean);
+   const __m128i low_half = _mm_shufflelo_epi16(doubled, 0);
+   const __m128i fill = _mm_unpacklo_epi64(low_half, low_half);
+   dc_store_64xh(&fill, 16, dst, stride);
+   (void)above;
+ }
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+ // DC_128 for 4x8: ignores both edges and fills 8 rows of 4 pixels with 0x80.
+ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   const uint32_t four_x_0x80 = 0x80808080;
+   dc_store_4xh(four_x_0x80, 8, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 4x16: ignores both edges and fills 16 rows of 4 pixels with 0x80.
+ void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const uint32_t four_x_0x80 = 0x80808080;
+   dc_store_4xh(four_x_0x80, 16, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 8x4: ignores both edges and fills 4 rows of 8 pixels with 0x80.
+ void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_8xh(&fill, 4, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 8x16: ignores both edges and fills 16 rows of 8 pixels with 0x80.
+ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_8xh(&fill, 16, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 8x32: ignores both edges and fills 32 rows of 8 pixels with 0x80.
+ void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_8xh(&fill, 32, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 16x4: ignores both edges and fills 4 rows of 16 pixels with 0x80.
+ void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_16xh(&fill, 4, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 16x8: ignores both edges and fills 8 rows of 16 pixels with 0x80.
+ void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_16xh(&fill, 8, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 16x32: ignores both edges and fills 32 rows of 16 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_16xh(&fill, 32, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 16x64: ignores both edges and fills 64 rows of 16 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_16xh(&fill, 64, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 32x8: ignores both edges and fills 8 rows of 32 pixels with 0x80.
+ void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_32xh(&fill, 8, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 32x16: ignores both edges and fills 16 rows of 32 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_32xh(&fill, 16, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 32x64: ignores both edges and fills 64 rows of 32 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_32xh(&fill, 64, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 64x64: ignores both edges and fills 64 rows of 64 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_64xh(&fill, 64, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 64x32: ignores both edges and fills 32 rows of 64 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_64xh(&fill, 32, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+ // DC_128 for 64x16: ignores both edges and fills 16 rows of 64 pixels with
+ // 0x80.
+ void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i fill = _mm_set1_epi8((int8_t)0x80);
+   dc_store_64xh(&fill, 16, dst, stride);
+   (void)left;
+   (void)above;
+ }
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+ // V_PRED for 4x8: copies the first 4 bytes of `above` into each of 8 rows;
+ // `left` is unused.
+ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const uint32_t top_row = *(const uint32_t *)above;
+   dc_store_4xh(top_row, 8, dst, stride);
+ }
+
+ // V_PRED for 4x16: copies the first 4 bytes of `above` into each of 16 rows;
+ // `left` is unused.
+ void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const uint32_t top_row = *(const uint32_t *)above;
+   dc_store_4xh(top_row, 16, dst, stride);
+ }
+
+ // V_PRED for 8x4: copies the first 8 bytes of `above` into each of 4 rows;
+ // `left` is unused.
+ void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
+   dc_store_8xh(&top_row, 4, dst, stride);
+ }
+
+ // V_PRED for 8x16: copies the first 8 bytes of `above` into each of 16 rows;
+ // `left` is unused.
+ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
+   dc_store_8xh(&top_row, 16, dst, stride);
+ }
+
+ // V_PRED for 8x32: copies the first 8 bytes of `above` into each of 32 rows;
+ // `left` is unused.
+ void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_loadl_epi64((__m128i const *)above);
+   dc_store_8xh(&top_row, 32, dst, stride);
+ }
+
+ // V_PRED for 16x4: copies 16 bytes of `above` into each of 4 rows; `left` is
+ // unused. The load is the aligned form, so `above` must be 16-byte aligned.
+ void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_load_si128((__m128i const *)above);
+   dc_store_16xh(&top_row, 4, dst, stride);
+ }
+
+ // V_PRED for 16x8: copies 16 bytes of `above` into each of 8 rows; `left` is
+ // unused. The load is the aligned form, so `above` must be 16-byte aligned.
+ void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_load_si128((__m128i const *)above);
+   dc_store_16xh(&top_row, 8, dst, stride);
+ }
+
+ // V_PRED for 16x32: copies 16 bytes of `above` into each of 32 rows; `left`
+ // is unused. The load is the aligned form, so `above` must be 16-byte aligned.
+ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_load_si128((__m128i const *)above);
+   dc_store_16xh(&top_row, 32, dst, stride);
+ }
+
+ // V_PRED for 16x64: copies 16 bytes of `above` into each of 64 rows; `left`
+ // is unused. The load is the aligned form, so `above` must be 16-byte aligned.
+ void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+   (void)left;
+   const __m128i top_row = _mm_load_si128((__m128i const *)above);
+   dc_store_16xh(&top_row, 64, dst, stride);
+ }
+
+ // Copies the 32 bytes at `above` into each of `height` rows of `dst`. Both
+ // the loads and the stores are the aligned 16-byte forms.
+ static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, int height) {
+   const __m128i *const src = (const __m128i *)above;
+   const __m128i lo = _mm_load_si128(src);
+   const __m128i hi = _mm_load_si128(src + 1);
+   int y = 0;
+   while (y < height) {
+     _mm_store_si128((__m128i *)dst, lo);
+     _mm_store_si128((__m128i *)(dst + 16), hi);
+     dst += stride;
+     ++y;
+   }
+ }
+
+// V_PRED 32x8: 8 rows, each a copy of the 32 pixels at |above|.
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const int rows = 8;
+  (void)left;  // unused by vertical prediction
+  v_predictor_32xh(dst, stride, above, rows);
+}
+
+// V_PRED 32x16: 16 rows, each a copy of the 32 pixels at |above|.
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 16;
+  (void)left;  // unused by vertical prediction
+  v_predictor_32xh(dst, stride, above, rows);
+}
+
+// V_PRED 32x64: 64 rows, each a copy of the 32 pixels at |above|.
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 64;
+  (void)left;  // unused by vertical prediction
+  v_predictor_32xh(dst, stride, above, rows);
+}
+
+// Copies the 64 bytes at |above| into |height| consecutive rows of |dst|.
+// Both the loads and the stores are the aligned variants, so |above| and each
+// row start of |dst| have to be 16-byte aligned.
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
+  const __m128i q0 = _mm_load_si128((const __m128i *)above);
+  const __m128i q1 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i q2 = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i q3 = _mm_load_si128((const __m128i *)(above + 48));
+  for (int rows_left = height; rows_left > 0; --rows_left) {
+    _mm_store_si128((__m128i *)dst, q0);
+    _mm_store_si128((__m128i *)(dst + 16), q1);
+    _mm_store_si128((__m128i *)(dst + 32), q2);
+    _mm_store_si128((__m128i *)(dst + 48), q3);
+    dst += stride;
+  }
+}
+
+// V_PRED 64x64: 64 rows, each a copy of the 64 pixels at |above|.
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 64;
+  (void)left;  // unused by vertical prediction
+  v_predictor_64xh(dst, stride, above, rows);
+}
+
+// V_PRED 64x32: 32 rows, each a copy of the 64 pixels at |above|.
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 32;
+  (void)left;  // unused by vertical prediction
+  v_predictor_64xh(dst, stride, above, rows);
+}
+
+// V_PRED 64x16: 16 rows, each a copy of the 64 pixels at |above|.
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 16;
+  (void)left;  // unused by vertical prediction
+  v_predictor_64xh(dst, stride, above, rows);
+}
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;  // H_PRED 4x8: row r is left[r] repeated 4 times; |above| is unused.
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);  // left[0..7]
+ left_col = _mm_unpacklo_epi8(left_col, left_col);  // double each byte: 16-bit word r = left[r], left[r]
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);  // broadcast word 0 across the low 8 bytes
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);  // word 1
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);  // word 2
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);  // word 3
+ *(int *)dst = _mm_cvtsi128_si32(row0);  // low 4 bytes form one output row
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ left_col = _mm_unpackhi_epi64(left_col, left_col);  // move words 4..7 (rows 4..7) into the low half
+ row0 = _mm_shufflelo_epi16(left_col, 0);  // row 4
+ row1 = _mm_shufflelo_epi16(left_col, 0x55);  // row 5
+ row2 = _mm_shufflelo_epi16(left_col, 0xaa);  // row 6
+ row3 = _mm_shufflelo_epi16(left_col, 0xff);  // row 7
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;  // H_PRED 4x16: row r is left[r] repeated 4 times; |above| is unused.
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);  // aligned load of left[0..15]
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);  // left[0..7], each byte doubled
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);  // left[8..15], each byte doubled
+ // Rows 0-3: broadcast 16-bit words 0..3 of the low half, store 4 bytes each.
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ // Rows 4-7: shift words 4..7 of the low half down, then repeat the pattern.
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ // Rows 8-11: words 0..3 of the high half.
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ // Rows 12-15: words 4..7 of the high half.
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+// H_PRED 8x4: row r is left[r] repeated 8 times. Eight bytes are loaded from
+// |left| even though only the first four are used. |above| is unused.
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
+  // Doubling each byte makes 16-bit word r hold left[r] twice, so a word
+  // broadcast over the low half yields 8 copies of left[r].
+  const __m128i pairs = _mm_unpacklo_epi8(l8, l8);
+  const __m128i r0 = _mm_shufflelo_epi16(pairs, 0);
+  const __m128i r1 = _mm_shufflelo_epi16(pairs, 0x55);
+  const __m128i r2 = _mm_shufflelo_epi16(pairs, 0xaa);
+  const __m128i r3 = _mm_shufflelo_epi16(pairs, 0xff);
+  _mm_storel_epi64((__m128i *)dst, r0);
+  _mm_storel_epi64((__m128i *)(dst + stride), r1);
+  _mm_storel_epi64((__m128i *)(dst + stride + stride), r2);
+  _mm_storel_epi64((__m128i *)(dst + stride + stride + stride), r3);
+}
+
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int count) {  // H_PRED for 8-wide blocks: |count| passes of 16 rows, row r = left[r] x 8.
+ (void)above;
+ for (int i = 0; i < count; ++i) {
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);  // aligned load of 16 left pixels
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);  // pixels 0..7, each byte doubled
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);  // pixels 8..15, each byte doubled
+ // Rows 0-3 of this pass: broadcast words 0..3 over the low 8 bytes.
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ // Rows 4-7: shift words 4..7 of the low half down first.
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ // Rows 8-11: words 0..3 of the high half.
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ // Rows 12-15: words 4..7 of the high half.
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ left += 16;  // next 16 left pixels for the following pass
+ }
+}
+
+// H_PRED 8x16: a single 16-row pass of h_predictor_8x16xc().
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const int passes_of_16_rows = 1;
+  h_predictor_8x16xc(dst, stride, above, left, passes_of_16_rows);
+}
+
+// H_PRED 8x32: two 16-row passes of h_predictor_8x16xc().
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const int passes_of_16_rows = 2;
+  h_predictor_8x16xc(dst, stride, above, left, passes_of_16_rows);
+}
+
+// Writes row[0..h-1] to |h| successive 16-byte rows of |dst| using aligned
+// stores.
+static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  for (int r = 0; r < h; ++r) {
+    _mm_store_si128((__m128i *)(dst + r * stride), row[r]);
+  }
+}
+
+// For k = 0..3, fills row[k] with eight copies of 16-bit word k of |*x|
+// (words are taken from the low 64 bits of |*x|).
+static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
+  const __m128i src = *x;
+  // Broadcast one word over the low half, then mirror the low half upward.
+  const __m128i w0 = _mm_shufflelo_epi16(src, 0);
+  const __m128i w1 = _mm_shufflelo_epi16(src, 0x55);
+  const __m128i w2 = _mm_shufflelo_epi16(src, 0xaa);
+  const __m128i w3 = _mm_shufflelo_epi16(src, 0xff);
+  row[0] = _mm_unpacklo_epi64(w0, w0);
+  row[1] = _mm_unpacklo_epi64(w1, w1);
+  row[2] = _mm_unpacklo_epi64(w2, w2);
+  row[3] = _mm_unpacklo_epi64(w3, w3);
+}
+
+// For k = 0..3, fills row[k] with eight copies of 16-bit word 4 + k of |*x|
+// (words are taken from the high 64 bits of |*x|).
+static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
+  const __m128i src = *x;
+  // Broadcast one word over the high half, then mirror the high half downward.
+  const __m128i w4 = _mm_shufflehi_epi16(src, 0);
+  const __m128i w5 = _mm_shufflehi_epi16(src, 0x55);
+  const __m128i w6 = _mm_shufflehi_epi16(src, 0xaa);
+  const __m128i w7 = _mm_shufflehi_epi16(src, 0xff);
+  row[0] = _mm_unpackhi_epi64(w4, w4);
+  row[1] = _mm_unpackhi_epi64(w5, w5);
+  row[2] = _mm_unpackhi_epi64(w6, w6);
+  row[3] = _mm_unpackhi_epi64(w7, w7);
+}
+
+// 16-wide H_PRED, rows 0-3 of an 8-row group.
+// Only the low 8 bytes of |*left| are consumed (byte-doubled pixels 0..3):
+// xxxxxxxx33221100
+static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i four_rows[4];
+  repeat_low_4pixels(left, four_rows);
+  h_pred_store_16xh(four_rows, 4, dst, stride);
+}
+
+// 16-wide H_PRED, rows 4-7 of an 8-row group.
+// Only the high 8 bytes of |*left| are consumed (byte-doubled pixels 4..7):
+// 77665544xxxxxxxx
+static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i four_rows[4];
+  repeat_high_4pixels(left, four_rows);
+  h_pred_store_16xh(four_rows, 4, dst, stride);
+}
+
+// H_PRED 16x4: row r is left[r] repeated 16 times. Eight bytes are loaded from
+// |left| although only the first four feed the output. |above| is unused.
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i doubled = _mm_unpacklo_epi8(l8, l8);  // one 16-bit word per row
+  h_prediction_16x8_1(&doubled, dst, stride);
+}
+
+// H_PRED 16x8: row r is left[r] repeated 16 times. |above| is unused.
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i l8 = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i doubled = _mm_unpacklo_epi8(l8, l8);  // one 16-bit word per row
+  uint8_t *const rows_4_7 = dst + (stride << 2);
+  h_prediction_16x8_1(&doubled, dst, stride);
+  h_prediction_16x8_2(&doubled, rows_4_7, stride);
+}
+
+// H_PRED for 16-wide blocks: 16 rows per pass, |count| passes. The loop is a
+// do/while, so one pass is always executed. Each pass does an aligned 16-byte
+// load from |left|.
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int count) {
+  int pass = 0;
+  do {
+    const __m128i l16 = _mm_load_si128((const __m128i *)left);
+    const __m128i rows_0_7 = _mm_unpacklo_epi8(l16, l16);
+    const __m128i rows_8_15 = _mm_unpackhi_epi8(l16, l16);
+
+    h_prediction_16x8_1(&rows_0_7, dst, stride);
+    dst += stride << 2;
+    h_prediction_16x8_2(&rows_0_7, dst, stride);
+    dst += stride << 2;
+    h_prediction_16x8_1(&rows_8_15, dst, stride);
+    dst += stride << 2;
+    h_prediction_16x8_2(&rows_8_15, dst, stride);
+    dst += stride << 2;
+
+    left += 16;
+  } while (++pass < count);
+}
+
+// H_PRED 16x32: two 16-row passes of h_predictor_16xh(). |above| is unused.
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int passes_of_16_rows = 2;
+  (void)above;
+  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
+}
+
+// H_PRED 16x64: four 16-row passes of h_predictor_16xh(). |above| is unused.
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int passes_of_16_rows = 4;
+  (void)above;
+  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
+}
+
+// Writes |h| rows of 32 bytes to |dst| with aligned stores; row r holds
+// row[r] twice (bytes 0-15 and 16-31).
+static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  for (int r = 0; r < h; ++r) {
+    uint8_t *const out = dst + r * stride;
+    _mm_store_si128((__m128i *)out, row[r]);
+    _mm_store_si128((__m128i *)(out + 16), row[r]);
+  }
+}
+
+// 32-wide H_PRED, rows 0-3 of an 8-row group.
+// Only the low 8 bytes of |*left| are consumed (byte-doubled pixels 0..3):
+// xxxxxxxx33221100
+static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i four_rows[4];
+  repeat_low_4pixels(left, four_rows);
+  h_pred_store_32xh(four_rows, 4, dst, stride);
+}
+
+// 32-wide H_PRED, rows 4-7 of an 8-row group.
+// Only the high 8 bytes of |*left| are consumed (byte-doubled pixels 4..7):
+// 77665544xxxxxxxx
+static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
+                                       ptrdiff_t stride) {
+  __m128i four_rows[4];
+  repeat_high_4pixels(left, four_rows);
+  h_pred_store_32xh(four_rows, 4, dst, stride);
+}
+
+// H_PRED 32x8: row r is left[r] repeated 32 times. |above| is unused.
+// An aligned 16-byte load is issued on |left| although only the low 8 bytes
+// are used.
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i l16 = _mm_load_si128((const __m128i *)left);
+  const __m128i rows_0_7 = _mm_unpacklo_epi8(l16, l16);
+  uint8_t *const lower_half = dst + (stride << 2);
+
+  h_prediction_32x8_1(&rows_0_7, dst, stride);
+  h_prediction_32x8_2(&rows_0_7, lower_half, stride);
+}
+
+// H_PRED 32x16: row r is left[r] repeated 32 times. |above| is unused.
+// |left| is read with one aligned 16-byte load.
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i l16 = _mm_load_si128((const __m128i *)left);
+  const __m128i rows_0_7 = _mm_unpacklo_epi8(l16, l16);
+  const __m128i rows_8_15 = _mm_unpackhi_epi8(l16, l16);
+
+  h_prediction_32x8_1(&rows_0_7, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_2(&rows_0_7, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_1(&rows_8_15, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_2(&rows_8_15, dst, stride);
+}
+
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {  // H_PRED, 32 wide: row r = left[r] x 32, written 4 rows per iteration.
+ int i = height >> 2;  // iteration count; do/while(--i) needs height >= 4 (the visible caller passes 64)
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);  // 4 left pixels read as one 32-bit word
+ left4 = _mm_unpacklo_epi8(left4, left4);  // each pixel x2
+ left4 = _mm_unpacklo_epi8(left4, left4);  // each pixel x4: 32-bit lane k = left[k] x 4
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);  // broadcast lane 0 -> 16 copies of left[0]
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);  // lane 1
+ _mm_store_si128((__m128i *)dst, r0);  // aligned stores: rows of dst must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);  // lane 2
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);  // lane 3
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ left += 4;  // next 4 left pixels
+ dst += stride * 4;  // next 4 output rows
+ } while (--i);
+}
+
+// H_PRED 32x64: h_predictor_32xh() over 64 rows. |above| is unused.
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 64;
+  (void)above;
+  h_predictor_32xh(dst, stride, left, rows);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {  // H_PRED, 64 wide: row r = left[r] x 64, written 4 rows per iteration.
+ int i = height >> 2;  // iteration count; do/while(--i) needs height >= 4 (callers pass 16, 32 or 64)
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);  // 4 left pixels read as one 32-bit word
+ left4 = _mm_unpacklo_epi8(left4, left4);  // each pixel x2
+ left4 = _mm_unpacklo_epi8(left4, left4);  // each pixel x4: 32-bit lane k = left[k] x 4
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);  // broadcast lane 0 -> 16 copies of left[0]
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);  // lane 1
+ _mm_store_si128((__m128i *)dst, r0);  // aligned stores: rows of dst must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + 32), r0);
+ _mm_store_si128((__m128i *)(dst + 48), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);  // lane 2
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);  // lane 3
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+ left += 4;  // next 4 left pixels
+ dst += stride * 4;  // next 4 output rows
+ } while (--i);
+}
+
+// H_PRED 64x64: h_predictor_64xh() over 64 rows. |above| is unused.
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 64;
+  (void)above;
+  h_predictor_64xh(dst, stride, left, rows);
+}
+
+// H_PRED 64x32: h_predictor_64xh() over 32 rows. |above| is unused.
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 32;
+  (void)above;
+  h_predictor_64xh(dst, stride, left, rows);
+}
+
+// H_PRED 64x16: h_predictor_64xh() over 16 rows. |above| is unused.
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int rows = 16;
+  (void)above;
+  h_predictor_64xh(dst, stride, left, rows);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse4.c b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
new file mode 100644
index 0000000000..9de8bf3c0f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
@@ -0,0 +1,1307 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Low bit depth functions
+static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {  // Byte-select masks for _mm_blendv_epi8, indexed by a valid-pixel count n in 0..32.
+ { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // Mask[0][n]: first min(n, 16) bytes are 0xff (pixels 0..15 of a row)
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff } },
+ {  // Mask[1][n]: first max(n - 16, 0) bytes are 0xff (pixels 16..31 of a row)
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,  // H: pixels per row (<= 16); W: number of rows written to dst[]
+ int dx) {  // dx: per-row step along |above|, in 1/64 pixel units before upsampling
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;  // index of the last usable |above| sample
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);  // padding value: last usable sample in all 16 bytes
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ __m128i b, res, res1, shift;
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;  // integer sample position for this row
+ int base_max_diff = (max_base_x - base) >> upsample_above;  // pixels of this row that still lie inside |above|
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // this row and all later ones are pure padding
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H;
+ a0_above = _mm_loadu_si128((__m128i *)(above + base));  // always reads 16 bytes regardless of H
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);  // presumably even samples to the low 8 bytes, odd to the high 8 - confirm against EvenOddMaskx
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 5-bit interpolation weight
+ }
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1);
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);  // keep the first base_max_diff pixels, pad the rest
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for 4-wide blocks: computes N rows into a
+// scratch array (room for 16) and writes the low 4 bytes of each to |dst|.
+static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        int upsample_above, int dx) {
+  __m128i rows[16];
+  dr_prediction_z1_HxW_internal_sse4_1(4, N, rows, above, upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    *(int *)out = _mm_cvtsi128_si32(rows[r]);
+  }
+}
+
+// Zone-1 directional prediction for 8-wide blocks: computes N rows into a
+// scratch array (room for 32) and writes the low 8 bytes of each to |dst|.
+static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        int upsample_above, int dx) {
+  __m128i rows[32];
+  dr_prediction_z1_HxW_internal_sse4_1(8, N, rows, above, upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    _mm_storel_epi64((__m128i *)out, rows[r]);
+  }
+}
+
+// Zone-1 directional prediction for 16-wide blocks: computes N rows into a
+// scratch array (room for 64) and writes each with an unaligned 16-byte store.
+static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *above,
+                                         int upsample_above, int dx) {
+  __m128i rows[64];
+  dr_prediction_z1_HxW_internal_sse4_1(16, N, rows, above, upsample_above,
+                                       dx);
+
+  for (int r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    _mm_storeu_si128((__m128i *)out, rows[r]);
+  }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
+ int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,  // dstvec[r]: pixels 0..15 of row r; dstvec_h[r]: pixels 16..31
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);  // index of the last usable |above| sample
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);  // padding value: last usable sample in all 16 bytes
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, res16[2];
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;  // integer sample position for this row
+ int base_max_diff = (max_base_x - base);  // pixels of this row that still lie inside |above|
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec_h[i] = a_mbase_x;
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 5-bit interpolation weight
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {  // two 16-pixel halves of the row
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;  // this half is entirely past the edge
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values
+ }
+ }
+
+ dstvec[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[0],
+ *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values
+
+ dstvec_h[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[1],
+ *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values
+ x += dx;
+ }
+}
+
+// Zone-1 directional prediction for 32-wide blocks: computes the left and
+// right 16-pixel halves of N rows into scratch arrays (room for 64 rows) and
+// writes them to |dst| with unaligned stores.
+static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *above,
+                                         int upsample_above, int dx) {
+  __m128i left_half[64];
+  __m128i right_half[64];
+  dr_prediction_z1_32xN_internal_sse4_1(N, left_half, right_half, above,
+                                        upsample_above, dx);
+
+  for (int r = 0; r < N; ++r) {
+    uint8_t *const out = dst + stride * r;
+    _mm_storeu_si128((__m128i *)out, left_half[r]);
+    _mm_storeu_si128((__m128i *)(out + 16), right_half[r]);
+  }
+}
+
+static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {  // Zone-1 prediction for 64-wide blocks; writes N rows straight to |dst|.
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);  // index of the last usable |above| sample
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+ __m128i max_base, base_inc, mask;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);  // padding value: last usable sample in all 16 bytes
+ max_base = _mm_set1_epi8(max_base_x);  // max_base_x broadcast to every byte, for the per-pixel mask below
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m128i b, res, res1;
+ int base = x >> frac_bits;  // integer sample position for this row
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm_storeu_si128((__m128i *)dst, a_mbase_x); // whole 64-pixel row is padding
+ _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m128i shift =
+ _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 element
+
+ __m128i a0_above, a1_above, res_val;
+ for (int j = 0; j < 64; j += 16) {  // four 16-pixel groups per row
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);  // group lies entirely past the edge
+ } else {
+ a0_above =
+ _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 element
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1); // 16 8bit values
+
+ base_inc =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+
+ mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
+ _mm_setzero_si128());  // 0xff where the sample index is below max_base_x
+ res_val = _mm_blendv_epi8(a_mbase_x, res, mask);  // interpolated pixel where the mask is set, padding elsewhere
+ _mm_storeu_si128((__m128i *)(dst + j), res_val);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+//
+// Forwards the request to the SSE4.1 kernel specialised for the block width
+// `bw` (4, 8, 16, 32 or 64). Any other width trips the assert and, in builds
+// where asserts are compiled out, leaves dst untouched.
+void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_above, int dx, int dy) {
+  // Only the above edge and dx are consumed by the zone 1 kernels.
+  (void)left;
+  (void)dy;
+
+  if (bw == 4) {
+    dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 8) {
+    dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 16) {
+    dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 32) {
+    dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+  } else if (bw == 64) {
+    dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+  } else {
+    assert(0 && "Invalid block size");
+  }
+}
+/*
+ * Zone-2 directional prediction kernel for blocks that are 4 pixels wide and
+ * N rows tall (av1_dr_prediction_z2_sse4_1 calls it for bw == 4).
+ *
+ * For every row the four output pixels are interpolated twice with
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5
+ * once from the above edge (position driven by dx) and, when the row reaches
+ * past min_base_x, once from the left edge (position driven by dy). Both
+ * candidates are computed in a single register (above operands in the low
+ * 64 bits, left operands in the high 64 bits) and blended per byte with
+ * Mask[0][base_min_diff]. One 4-byte row is written per iteration, then dst
+ * advances by stride.
+ *
+ * upsample_above / upsample_left are used as shift counts; they set the
+ * minimum edge index, -(1 << upsample), and the number of fractional bits,
+ * 6 - upsample. NOTE(review): presumably they are only ever 0 or 1 - confirm
+ * against the caller.
+ */
+static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0_x, a1_x, a32, diff;
+
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);  // lanes start at 0 despite the name
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i a16 = _mm_set1_epi16(16);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_above, a1_above;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;  // integer above-edge position for this row
+ int base_shift = 0;  // stays 0 unless base_x lies more than one step below min_base_x
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {  // clamp to [0, 4]; the value selects the blend mask below
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {  // above-edge operands are zeroed instead of loaded for this row
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6);  // lanes are multiples of 64, so they vanish under the 0x3f mask below
+
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);  // p[i + 1] operands taken from the upper 8 shuffled bytes
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 1);  // p[i + 1] operands: the same bytes moved down by one
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c, base_y_c_reg, mask, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2);  // drop one 16-bit lane: 1, 2, 3, 4, 0, 0, 0, 0
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));  // (r << 6) - column * dy
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);  // lanes whose index is below min_base_y
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);  // those lanes are forced to index 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));  // 16 >> 4 == 1: next left index
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y);  // low 64 bits: above operands, high 64 bits: left operands
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4);  // bring bytes 4..7 (the high-half results) down to bytes 0..3
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);  // resy where the mask byte has its top bit set
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy);  // store the 4 pixels of this row
+ dst += stride;
+ }
+}
+/*
+ * Zone-2 directional prediction kernel for blocks that are 8 pixels wide and
+ * N rows tall (av1_dr_prediction_z2_sse4_1 calls it for bw == 8).
+ *
+ * Per row, resx holds 8 pixels interpolated from the above edge (or zero
+ * when base_shift > 7). If base_x < min_base_x, a second set of 8 pixels is
+ * interpolated from the left edge (resy) and the two are blended per byte
+ * with Mask[0][base_min_diff]; otherwise resx is stored unchanged. Each row
+ * writes 8 bytes, then dst advances by stride.
+ *
+ * Both interpolations use
+ *   (p[i] * 32 + 16 + (p[i + 1] - p[i]) * shift) >> 5.
+ * upsample_above / upsample_left are used as shift counts; they set the
+ * minimum edge index, -(1 << upsample), and the number of fractional bits,
+ * 6 - upsample. NOTE(review): presumably they are only ever 0 or 1 - confirm
+ * against the caller.
+ */
+static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i diff, a32;
+ __m128i a0_x, a1_x, a0_y, a1_y;
+ __m128i a0_above, a1_above;
+
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);  // 1-based column numbers
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;  // integer above-edge position for this row
+ int base_shift = 0;  // stays 0 unless base_x lies more than one step below min_base_x
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {  // clamp to [0, 8]; the value selects the blend mask below
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {  // the above edge is not read for this row
+ resx = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);  // lanes are multiples of 64, so they vanish under the 0x3f mask below
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);  // p[i + 1] operands taken from the upper 8 shuffled bytes
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a1_above = _mm_srli_si128(a0_above, 1);  // neighbours p[i + 1]: shifted before both are shuffled
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+ resx = _mm_packus_epi16(res, res);
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c, base_y_c_reg, mask;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));  // (r << 6) - column * dy
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);  // lanes whose index is below min_base_y
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);  // those lanes are forced to index 0
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));  // 16 >> 4 == 1: next left index
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ resy = _mm_packus_epi16(res1, res1);
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);  // resy where the mask byte has its top bit set
+ _mm_storel_epi64((__m128i *)dst, resxy);
+ } else {
+ _mm_storel_epi64((__m128i *)dst, resx);  // row predicted from the above edge only
+ }
+
+ dst += stride;
+ }
+}
+/*
+ * Generic zone-2 directional prediction kernel: H rows, W columns, processed
+ * in chunks of 16 pixels per row (av1_dr_prediction_z2_sse4_1 uses it for
+ * every width other than 4 and 8).
+ *
+ * For each 16-pixel chunk, resx is interpolated from the above edge (or set
+ * to zero when base_shift >= 16) and resy from the left edge (or set to zero
+ * when base_x >= min_base_x); the two are blended per byte with
+ * Mask[0][base_min_diff] and 16 bytes are stored at dst + j.
+ *
+ * Left-edge pixels are fetched in one of two ways: if the 16 indices span
+ * fewer than 16 positions they come from a single unaligned load followed by
+ * a byte shuffle, otherwise each pixel is gathered individually through
+ * base_y_c.
+ *
+ * upsample_above and upsample_left are ignored; the constants below
+ * correspond to both being 0.
+ */
+static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
+ __m128i diff, shifty, shifty_h;
+ __m128i a0_above, a1_above;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c1 = _mm_srli_epi16(a16, 4);  // 16 >> 4: every lane holds 1
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i dy256 = _mm_set1_epi16(dy);  // dy in all 8 lanes of a 128-bit register, despite the name
+ const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ const __m128i c1234 = _mm_add_epi16(c0123, c1);  // 1..8
+ const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);  // 9..16
+
+ for (int r = 0; r < H; r++) {
+ __m128i b, res, res1, shift, reg_j, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x;  // integer above-edge position for column 0 of this row
+ for (int j = 0; j < W; j += 16) {
+ reg_j = _mm_set1_epi16(j);
+ int base_shift = 0;  // stays 0 unless base_x + j lies more than one step below min_base_x
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {  // clamp to [0, 16]; the value selects the blend mask below
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 16) {  // otherwise the above edge is not read for this chunk
+ a0_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm_cvtepu8_epi16(a0_above);  // low 8 pixels of the chunk
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);  // multiples of 64, so they vanish under the 0x3f mask below
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 16 16-bit values
+
+ a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));  // high 8 pixels of the chunk
+ a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
+
+ resx = _mm_packus_epi16(res, res1);
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc
+ if (base_x < min_base_x) {  // note: this test uses base_x alone, it does not depend on j
+ __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
+ __m128i mask, mask_h, mul16, mul16_h;
+ r6 = _mm_set1_epi16(r << 6);
+ c_reg = _mm_add_epi16(reg_j, c1234);  // 1-based column numbers j + 1 .. j + 8
+ c_reg_h = _mm_add_epi16(reg_j, c1234_h);  // j + 9 .. j + 16
+ mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),  // column * dy, capped (unsigned) at 0x7fff
+ _mm_srli_epi16(min_y_base, 1));  // 0xffff >> 1 == 0x7fff
+ mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
+ _mm_srli_epi16(min_y_base, 1));
+ y_reg = _mm_sub_epi16(r6, mul16);  // (r << 6) - column * dy
+ y_reg_h = _mm_sub_epi16(r6, mul16_h);
+
+ base_y = _mm_srai_epi16(y_reg, frac_bits_y);
+ base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y);  // lanes whose index is below min_base_y
+ mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
+
+ base_y = _mm_blendv_epi8(base_y, min_y_base, mask);  // raise those lanes to min_base_y
+ base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
+ int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);  // index for the last column of the chunk
+ int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);  // index for the first column of the chunk
+ int16_t offset_diff = max_y - min_y;
+
+ if (offset_diff < 16) {  // indices fit in one 16-byte window: single load plus byte shuffle
+ __m128i min_y_reg = _mm_set1_epi16(min_y);
+
+ __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
+ __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
+ __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);  // per-pixel byte offsets from min_y
+
+ __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
+ __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
+ __m128i LoadMask =
+ _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
+
+ a0_mask = _mm_and_si128(a0_mask, LoadMask);
+ a1_mask = _mm_and_si128(a1_mask, LoadMask);
+
+ a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);  // pick left[base_y] for each of the 16 columns
+ a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);  // pick left[base_y + 1]
+ a0_y = _mm_cvtepu8_epi16(a0_mask);
+ a1_y = _mm_cvtepu8_epi16(a1_mask);
+ a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
+ a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
+ } else {  // indices are spread wider: gather every left pixel individually
+ base_y = _mm_andnot_si128(mask, base_y);  // lanes that were clamped above are forced to index 0
+ base_y_h = _mm_andnot_si128(mask_h, base_y_h);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ base_y = _mm_add_epi16(base_y, c1);  // step to the next left index for the second operand
+ base_y_h = _mm_add_epi16(base_y_h, c1);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ }
+ shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
+ shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 16 16-bit values
+
+ diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty_h);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
+ resy = _mm_packus_epi16(res, res1);
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);  // resy where the mask byte has its top bit set
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+//
+// Widths 4 and 8 have dedicated kernels; every other width goes to the
+// generic kernel, which walks each row in 16-pixel chunks. Both dx and dy
+// must be positive.
+void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_above, int upsample_left, int dx,
+                                 int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  if (bw == 4) {
+    dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+  } else if (bw == 8) {
+    dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
+                                upsample_left, dx, dy);
+  } else {
+    dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
+                                upsample_above, upsample_left, dx, dy);
+  }
+}
+
+// z3 functions
+// Zone-3 prediction of a 4x4 block: run the zone-1 kernel on the left edge
+// with step dy, transpose the result, then write four 4-byte rows.
+static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[4], rows[4];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 4, pred, left, upsample_left, dy);
+  transpose4x8_8x4_low_sse2(pred, pred + 1, pred + 2, pred + 3, rows,
+                            rows + 1, rows + 2, rows + 3);
+
+  // The low 32 bits of each transposed register are one output row.
+  for (int r = 0; r < 4; ++r) {
+    *(int *)(dst + stride * r) = _mm_cvtsi128_si32(rows[r]);
+  }
+}
+
+// Zone-3 prediction of an 8x8 block: run the zone-1 kernel on the left edge
+// with step dy, transpose, then write eight 8-byte rows.
+static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[8], rows[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 8, pred, left, upsample_left, dy);
+  transpose8x8_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4, pred + 5,
+                    pred + 6, pred + 7, rows, rows + 1, rows + 2, rows + 3);
+
+  // Each of the four transposed registers carries two consecutive rows: the
+  // low 8 bytes go to the even row, the high 8 bytes to the odd row.
+  for (int i = 0; i < 4; ++i) {
+    uint8_t *const even_row = dst + (2 * i) * stride;
+    _mm_storel_epi64((__m128i *)even_row, rows[i]);
+    _mm_storel_epi64((__m128i *)(even_row + stride),
+                     _mm_srli_si128(rows[i], 8));
+  }
+}
+
+// Zone-3 prediction of a 4-wide, 8-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose, then write eight 4-byte rows.
+static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[4], rows[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 4, pred, left, upsample_left, dy);
+  transpose4x8_8x4_sse2(pred, pred + 1, pred + 2, pred + 3, rows, rows + 1,
+                        rows + 2, rows + 3, rows + 4, rows + 5, rows + 6,
+                        rows + 7);
+
+  for (int r = 0; r < 8; ++r) {
+    const int row_bits = _mm_cvtsi128_si32(rows[r]);
+    *(int *)(dst + stride * r) = row_bits;
+  }
+}
+
+// Zone-3 prediction of an 8-wide, 4-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose, then write four 8-byte rows.
+static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *left, int upsample_left,
+                                        int dy) {
+  __m128i pred[8], rows[4];
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 8, pred, left, upsample_left, dy);
+  transpose8x8_low_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                        pred + 5, pred + 6, pred + 7, rows, rows + 1,
+                        rows + 2, rows + 3);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// Zone-3 prediction of an 8-wide, 16-tall block: run the zone-1 kernel on
+// the left edge with step dy, transpose, then write sixteen 8-byte rows.
+static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred[8], rows[8];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 8, pred, left, upsample_left, dy);
+  transpose8x16_16x8_sse2(&pred[0], &pred[1], &pred[2], &pred[3], &pred[4],
+                          &pred[5], &pred[6], &pred[7], &rows[0], &rows[1],
+                          &rows[2], &rows[3], &rows[4], &rows[5], &rows[6],
+                          &rows[7]);
+
+  // Register k holds row k in its low half and row k + 8 in its high half.
+  for (int k = 0; k < 8; ++k) {
+    const __m128i upper = _mm_srli_si128(rows[k], 8);
+    _mm_storel_epi64((__m128i *)(dst + k * stride), rows[k]);
+    _mm_storel_epi64((__m128i *)(dst + (k + 8) * stride), upper);
+  }
+}
+
+// Zone-3 prediction of a 16-wide, 8-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose, then write eight 16-byte rows.
+static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred[16], rows[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 16, pred, left, upsample_left, dy);
+  transpose16x8_8x16_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
+                          pred + 10, pred + 11, pred + 12, pred + 13,
+                          pred + 14, pred + 15, rows, rows + 1, rows + 2,
+                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);
+
+  // Only the first eight entries of rows[] are produced and stored.
+  for (int r = 0; r < 8; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// Zone-3 prediction of a 4-wide, 16-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose, then write sixteen 4-byte rows.
+static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred[4], rows[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 4, pred, left, upsample_left, dy);
+  transpose4x16_sse2(pred, rows);
+
+  for (int r = 0; r < 16; ++r) {
+    const int row_bits = _mm_cvtsi128_si32(rows[r]);
+    *(int *)(dst + stride * r) = row_bits;
+  }
+}
+
+// Zone-3 prediction of a 16-wide, 4-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose, then write four 16-byte rows.
+static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred[16], rows[8];
+  const __m128i zero = _mm_setzero_si128();
+
+  dr_prediction_z1_HxW_internal_sse4_1(4, 16, pred, left, upsample_left, dy);
+
+  // rows[4..7] are never stored; they are cleared before the transpose call.
+  rows[4] = zero;
+  rows[5] = zero;
+  rows[6] = zero;
+  rows[7] = zero;
+
+  transpose16x8_8x16_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
+                          pred + 10, pred + 11, pred + 12, pred + 13,
+                          pred + 14, pred + 15, rows, rows + 1, rows + 2,
+                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
+  }
+}
+
+// Zone-3 prediction of an 8-wide, 32-tall block. The 32xN zone-1 kernel fills
+// two register arrays from the left edge; each is padded with zero registers
+// to 16 entries, transposed as a 16x16 tile, and written as 8-byte rows
+// (rows 0..15 from the first array, rows 16..31 from the second).
+static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred_lo[16], rows_lo[16], pred_hi[16], rows_hi[16];
+  const __m128i zero = _mm_setzero_si128();
+
+  dr_prediction_z1_32xN_internal_sse4_1(8, pred_lo, pred_hi, left,
+                                        upsample_left, dy);
+
+  // Only entries 0..7 were produced above; pad the rest of each tile.
+  for (int k = 8; k < 16; ++k) {
+    pred_lo[k] = zero;
+    pred_hi[k] = zero;
+  }
+
+  transpose16x16_sse2(pred_lo, rows_lo);
+  transpose16x16_sse2(pred_hi, rows_hi);
+
+  for (int r = 0; r < 16; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + r * stride), rows_lo[r]);
+  }
+  for (int r = 0; r < 16; ++r) {
+    _mm_storel_epi64((__m128i *)(dst + (r + 16) * stride), rows_hi[r]);
+  }
+}
+
+// Zone-3 prediction of a 32-wide, 8-tall block: run the zone-1 kernel on the
+// left edge with step dy, transpose the two groups of 16 registers
+// separately, then write each row as two 16-byte halves.
+static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                         const uint8_t *left, int upsample_left,
+                                         int dy) {
+  __m128i pred[32], rows[16];
+  __m128i *const pred_r = pred + 16;  // registers for the right 16 columns
+  __m128i *const rows_r = rows + 8;   // transposed right half of each row
+
+  dr_prediction_z1_HxW_internal_sse4_1(8, 32, pred, left, upsample_left, dy);
+
+  transpose16x8_8x16_sse2(pred, pred + 1, pred + 2, pred + 3, pred + 4,
+                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
+                          pred + 10, pred + 11, pred + 12, pred + 13,
+                          pred + 14, pred + 15, rows, rows + 1, rows + 2,
+                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);
+  transpose16x8_8x16_sse2(pred_r, pred_r + 1, pred_r + 2, pred_r + 3,
+                          pred_r + 4, pred_r + 5, pred_r + 6, pred_r + 7,
+                          pred_r + 8, pred_r + 9, pred_r + 10, pred_r + 11,
+                          pred_r + 12, pred_r + 13, pred_r + 14, pred_r + 15,
+                          rows_r, rows_r + 1, rows_r + 2, rows_r + 3,
+                          rows_r + 4, rows_r + 5, rows_r + 6, rows_r + 7);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
+    _mm_storeu_si128((__m128i *)(dst + r * stride + 16), rows_r[r]);
+  }
+}
+
+// Zone-3 prediction of a 16x16 block: run the zone-1 kernel on the left edge
+// with step dy, transpose the 16x16 tile, then write sixteen 16-byte rows.
+static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i pred[16], rows[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 16, pred, left, upsample_left, dy);
+  transpose16x16_sse2(pred, rows);
+
+  for (int r = 0; r < 16; ++r) {
+    __m128i *const out = (__m128i *)(dst + r * stride);
+    _mm_storeu_si128(out, rows[r]);
+  }
+}
+
+// Zone-3 prediction of a 32x32 block. The 32xN zone-1 kernel fills two arrays
+// of 32 registers from the left edge; each array is transposed as two 16x16
+// tiles. The first array supplies rows 0..15, the second rows 16..31, and
+// within a row the second tile of each array supplies columns 16..31.
+static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i pred_top[32], rows_top[32], pred_bot[32], rows_bot[32];
+
+  dr_prediction_z1_32xN_internal_sse4_1(32, pred_top, pred_bot, left,
+                                        upsample_left, dy);
+
+  transpose16x16_sse2(&pred_top[0], &rows_top[0]);
+  transpose16x16_sse2(&pred_bot[0], &rows_bot[0]);
+  transpose16x16_sse2(&pred_top[16], &rows_top[16]);
+  transpose16x16_sse2(&pred_bot[16], &rows_bot[16]);
+
+  for (int r = 0; r < 16; ++r) {
+    uint8_t *const row = dst + r * stride;
+    _mm_storeu_si128((__m128i *)row, rows_top[r]);
+    _mm_storeu_si128((__m128i *)(row + 16), rows_top[r + 16]);
+  }
+  for (int r = 0; r < 16; ++r) {
+    uint8_t *const row = dst + (r + 16) * stride;
+    _mm_storeu_si128((__m128i *)row, rows_bot[r]);
+    _mm_storeu_si128((__m128i *)(row + 16), rows_bot[r + 16]);
+  }
+}
+
+// Zone-3 prediction of a 64x64 block: predict into a 64x64 scratch buffer
+// with the 64-wide zone-1 kernel fed from the left edge, then transpose the
+// scratch buffer into dst.
+static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  enum { kScratchStride = 64, kScratchRows = 64 };
+  uint8_t scratch[kScratchStride * kScratchRows];
+
+  dr_prediction_z1_64xN_sse4_1(kScratchRows, scratch, kScratchStride, left,
+                               upsample_left, dy);
+  transpose(scratch, kScratchStride, dst, stride, 64, 64);
+}
+
+// Zone-3 prediction of a 16-wide, 32-tall block. The 32xN zone-1 kernel fills
+// two arrays of 16 registers from the left edge; after a 16x16 transpose the
+// first array supplies rows 0..15 and the second rows 16..31.
+static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i pred_top[16], rows_top[16], pred_bot[16], rows_bot[16];
+
+  dr_prediction_z1_32xN_internal_sse4_1(16, pred_top, pred_bot, left,
+                                        upsample_left, dy);
+  transpose16x16_sse2(pred_top, rows_top);
+  transpose16x16_sse2(pred_bot, rows_bot);
+
+  // Write row r and row r + 16 in the same iteration.
+  for (int r = 0; r < 16; ++r) {
+    _mm_storeu_si128((__m128i *)(dst + r * stride), rows_top[r]);
+    _mm_storeu_si128((__m128i *)(dst + (r + 16) * stride), rows_bot[r]);
+  }
+}
+
+// Zone-3 prediction of a 32-wide, 16-tall block: run the zone-1 kernel on the
+// left edge with step dy, then transpose and store one 16x16 tile per
+// 16-column strip of the destination.
+static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i pred[32], tile[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 32, pred, left, upsample_left, dy);
+
+  for (int strip = 0; strip < 2; ++strip) {
+    const int col = 16 * strip;  // first destination column of this strip
+    transpose16x16_sse2(&pred[col], tile);
+    for (int r = 0; r < 16; ++r) {
+      _mm_storeu_si128((__m128i *)(dst + r * stride + col), tile[r]);
+    }
+  }
+}
+
+// Zone-3 prediction of a 32-wide, 64-tall block: predict 32 rows of 64
+// pixels into a scratch buffer with the 64-wide zone-1 kernel fed from the
+// left edge, then transpose the scratch buffer into dst.
+static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  enum { kScratchStride = 64, kScratchRows = 32 };
+  uint8_t scratch[kScratchStride * kScratchRows];
+
+  dr_prediction_z1_64xN_sse4_1(kScratchRows, scratch, kScratchStride, left,
+                               upsample_left, dy);
+  transpose(scratch, kScratchStride, dst, stride, 32, 64);
+}
+
+// Zone-3 prediction of a 64-wide, 32-tall block: predict 64 rows of 32
+// pixels into a scratch buffer with the 32-wide zone-1 kernel fed from the
+// left edge, then transpose the scratch buffer into dst.
+static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  enum { kScratchStride = 32, kScratchRows = 64 };
+  uint8_t scratch[kScratchStride * kScratchRows];
+
+  dr_prediction_z1_32xN_sse4_1(kScratchRows, scratch, kScratchStride, left,
+                               upsample_left, dy);
+  transpose(scratch, kScratchStride, dst, stride, 64, 32);
+}
+
+// Zone-3 prediction of a 16-wide, 64-tall block: predict 16 rows of 64
+// pixels into a scratch buffer with the 64-wide zone-1 kernel fed from the
+// left edge, then transpose the scratch buffer into dst.
+static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  enum { kScratchStride = 64, kScratchRows = 16 };
+  uint8_t scratch[kScratchStride * kScratchRows];
+
+  dr_prediction_z1_64xN_sse4_1(kScratchRows, scratch, kScratchStride, left,
+                               upsample_left, dy);
+  transpose(scratch, kScratchStride, dst, stride, 16, 64);
+}
+
+// Zone-3 prediction of a 64-wide, 16-tall block: run the zone-1 kernel on the
+// left edge with step dy, then transpose and store one 16x16 tile per
+// 16-column strip of the destination.
+static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *left,
+                                          int upsample_left, int dy) {
+  __m128i pred[64], tile[16];
+
+  dr_prediction_z1_HxW_internal_sse4_1(16, 64, pred, left, upsample_left, dy);
+
+  for (int strip = 0; strip < 4; ++strip) {
+    const int col = 16 * strip;  // first destination column of this strip
+    transpose16x16_sse2(&pred[col], tile);
+    for (int r = 0; r < 16; ++r) {
+      _mm_storeu_si128((__m128i *)(dst + r * stride + col), tile[r]);
+    }
+  }
+}
+
+// Directional prediction, zone 3: only the left edge and dy are consumed
+// (above and dx are ignored; dx is expected to be 1).
+//
+// Dispatch is by block shape:
+//   - square blocks:        selected by bw;
+//   - tall blocks (bw < bh): the 1:2 kernels when bh == 2 * bw, otherwise
+//                            the 1:4 kernels, selected by bw;
+//   - wide blocks (bw > bh): the 2:1 kernels when bw == 2 * bh, otherwise
+//                            the 4:1 kernels, selected by bh.
+// A size with no kernel trips the assert and, in builds where asserts are
+// compiled out, leaves dst untouched.
+void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                 const uint8_t *above, const uint8_t *left,
+                                 int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  // Square blocks.
+  if (bw == bh) {
+    if (bw == 4) {
+      dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bw == 8) {
+      dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bw == 16) {
+      dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bw == 32) {
+      dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bw == 64) {
+      dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
+    } else {
+      assert(0 && "Invalid block size");
+    }
+    return;
+  }
+
+  // Tall blocks.
+  if (bw < bh) {
+    if (bw + bw == bh) {
+      if (bw == 4) {
+        dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
+      } else if (bw == 8) {
+        dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
+      } else if (bw == 16) {
+        dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
+      } else if (bw == 32) {
+        dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
+      } else {
+        assert(0 && "Invalid block size");
+      }
+    } else {
+      if (bw == 4) {
+        dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
+      } else if (bw == 8) {
+        dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
+      } else if (bw == 16) {
+        dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
+      } else {
+        assert(0 && "Invalid block size");
+      }
+    }
+    return;
+  }
+
+  // Wide blocks.
+  if (bh + bh == bw) {
+    if (bh == 4) {
+      dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bh == 8) {
+      dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bh == 16) {
+      dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bh == 32) {
+      dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
+    } else {
+      assert(0 && "Invalid block size");
+    }
+  } else {
+    if (bh == 4) {
+      dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bh == 8) {
+      dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
+    } else if (bh == 16) {
+      dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
+    } else {
+      assert(0 && "Invalid block size");
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 0000000000..fd48260c6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,2997 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+ #include <string.h>
+ #include <tmmintrin.h>
+
+ #include "config/aom_dsp_rtcd.h"
+
+ #include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
+ // Paeth prediction for one row of 8 pixels held as 16-bit lanes.
+ // Per lane, returns whichever of left / top / top-left is closest to
+ // (top + left - topleft). Ties prefer left, then top.
+ static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
+                                      const __m128i *topleft) {
+   const __m128i l = *left;
+   const __m128i t = *top;
+   const __m128i tl = *topleft;
+   const __m128i estimate = _mm_sub_epi16(_mm_add_epi16(t, l), tl);
+
+   // Absolute distance from the estimate to each candidate.
+   const __m128i dist_l = _mm_abs_epi16(_mm_sub_epi16(estimate, l));
+   const __m128i dist_t = _mm_abs_epi16(_mm_sub_epi16(estimate, t));
+   const __m128i dist_tl = _mm_abs_epi16(_mm_sub_epi16(estimate, tl));
+
+   // not_left: lanes where top or top-left is strictly closer than left.
+   const __m128i not_left = _mm_or_si128(_mm_cmpgt_epi16(dist_l, dist_t),
+                                         _mm_cmpgt_epi16(dist_l, dist_tl));
+   // use_tl: lanes where top-left is strictly closer than top.
+   const __m128i use_tl = _mm_cmpgt_epi16(dist_t, dist_tl);
+
+   const __m128i top_or_tl = _mm_or_si128(_mm_andnot_si128(use_tl, t),
+                                          _mm_and_si128(use_tl, tl));
+   return _mm_or_si128(_mm_andnot_si128(not_left, l),
+                       _mm_and_si128(not_left, top_or_tl));
+ }
+
+ // Paeth predictor for a 4x4 block.
+ //   dst/stride: output block and its row pitch in bytes.
+ //   above:      top neighbour row; above[-1] is the top-left pixel.
+ //   left:       left neighbour column (4 entries used).
+ // Only the 4 bytes that belong to each edge are read, and each output row is
+ // written with memcpy so no int-typed access is made through a uint8_t
+ // pointer (which would break strict aliasing / alignment rules).
+ void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   int left4, above4;
+   memcpy(&left4, left, sizeof(left4));
+   memcpy(&above4, above, sizeof(above4));
+   const __m128i l = _mm_cvtsi32_si128(left4);
+   const __m128i t16 =
+       _mm_unpacklo_epi8(_mm_cvtsi32_si128(above4), _mm_setzero_si128());
+   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i one = _mm_set1_epi16(1);
+   // pshufb control: low byte selects left[i], high byte (0x80) yields zero,
+   // i.e. left[i] is broadcast zero-extended to every 16-bit lane.
+   __m128i rep = _mm_set1_epi16((short)0x8000);
+
+   for (int i = 0; i < 4; ++i) {
+     const __m128i l16 = _mm_shuffle_epi8(l, rep);
+     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+     const int packed = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+
+     memcpy(dst, &packed, sizeof(packed));
+     dst += stride;
+     rep = _mm_add_epi16(rep, one);
+   }
+ }
+
+ // Paeth predictor for a 4x8 block (4 wide, 8 rows).
+ //   above: top neighbour row; above[-1] is the top-left pixel.
+ //   left:  left neighbour column (8 entries used).
+ // Reads only 4 bytes of 'above' and writes rows with memcpy instead of an
+ // int-typed store through a uint8_t pointer.
+ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   int above4;
+   memcpy(&above4, above, sizeof(above4));
+   const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+   const __m128i t16 =
+       _mm_unpacklo_epi8(_mm_cvtsi32_si128(above4), _mm_setzero_si128());
+   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i one = _mm_set1_epi16(1);
+   // pshufb control broadcasting left[i] (zero-extended) to all 16-bit lanes.
+   __m128i rep = _mm_set1_epi16((short)0x8000);
+
+   for (int i = 0; i < 8; ++i) {
+     const __m128i l16 = _mm_shuffle_epi8(l, rep);
+     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+     const int packed = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+
+     memcpy(dst, &packed, sizeof(packed));
+     dst += stride;
+     rep = _mm_add_epi16(rep, one);
+   }
+ }
+
+ // Paeth predictor for a 4x16 block (4 wide, 16 rows).
+ //   above: top neighbour row; above[-1] is the top-left pixel.
+ //   left:  left neighbour column (16 entries), loaded with an aligned load
+ //          exactly as before.
+ // The 4-byte 'above' load and the 4-byte row stores go through memcpy rather
+ // than int-pointer casts of uint8_t pointers.
+ void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   int above4;
+   memcpy(&above4, above, sizeof(above4));
+   const __m128i l = _mm_load_si128((const __m128i *)left);
+   const __m128i t16 =
+       _mm_unpacklo_epi8(_mm_cvtsi32_si128(above4), _mm_setzero_si128());
+   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i one = _mm_set1_epi16(1);
+   // pshufb control broadcasting left[i] (zero-extended) to all 16-bit lanes.
+   __m128i rep = _mm_set1_epi16((short)0x8000);
+
+   for (int i = 0; i < 16; ++i) {
+     const __m128i l16 = _mm_shuffle_epi8(l, rep);
+     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+     const int packed = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+
+     memcpy(dst, &packed, sizeof(packed));
+     dst += stride;
+     rep = _mm_add_epi16(rep, one);
+   }
+ }
+
+ // Paeth predictor for an 8x4 block (8 wide, 4 rows).
+ //   above: top neighbour row (8 entries); above[-1] is the top-left pixel.
+ //   left:  left neighbour column (4 entries used).
+ // Only the 4 left pixels that belong to the block are read (previously an
+ // 8-byte load was issued for a 4-entry column).
+ void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   int left4;
+   memcpy(&left4, left, sizeof(left4));
+   const __m128i l = _mm_cvtsi32_si128(left4);
+   const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+   const __m128i t16 = _mm_unpacklo_epi8(t, _mm_setzero_si128());
+   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i one = _mm_set1_epi16(1);
+   // pshufb control broadcasting left[i] (zero-extended) to all 16-bit lanes.
+   __m128i rep = _mm_set1_epi16((short)0x8000);
+
+   for (int i = 0; i < 4; ++i) {
+     const __m128i l16 = _mm_shuffle_epi8(l, rep);
+     const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+     dst += stride;
+     rep = _mm_add_epi16(rep, one);
+   }
+ }
+
+ // Paeth predictor for an 8x8 block; above[-1] is the top-left pixel.
+ void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+   const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+   const __m128i top16 = _mm_unpacklo_epi8(
+       _mm_loadl_epi64((const __m128i *)above), _mm_setzero_si128());
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 8; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+     const __m128i pred = paeth_8x1_pred(&left16, &top16, &corner16);
+
+     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(pred, pred));
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for an 8x16 block (8 wide, 16 rows); above[-1] is the
+ // top-left pixel. 'left' is read with an aligned 16-byte load.
+ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i left_col = _mm_load_si128((const __m128i *)left);
+   const __m128i top16 = _mm_unpacklo_epi8(
+       _mm_loadl_epi64((const __m128i *)above), _mm_setzero_si128());
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 16; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+     const __m128i pred = paeth_8x1_pred(&left16, &top16, &corner16);
+
+     _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(pred, pred));
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for an 8x32 block (8 wide, 32 rows); above[-1] is the
+ // top-left pixel. The left column is consumed in two aligned 16-byte groups.
+ void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i top16 = _mm_unpacklo_epi8(
+       _mm_loadl_epi64((const __m128i *)above), _mm_setzero_si128());
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (const uint8_t *lp = left; lp != left + 32; lp += 16) {
+     const __m128i left_col = _mm_load_si128((const __m128i *)lp);
+     // pshufb control restarts at byte 0 of every 16-row group.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int rows_left = 16; rows_left > 0; --rows_left) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+       const __m128i pred = paeth_8x1_pred(&left16, &top16, &corner16);
+
+       _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(pred, pred));
+       pick = _mm_add_epi16(pick, next_row);
+       dst += stride;
+     }
+   }
+ }
+
+ // Paeth prediction for 16 pixels of one row. top0 / top1 hold the low / high
+ // eight top pixels as 16-bit lanes; the two 8-lane results are packed (with
+ // unsigned saturation) into 16 bytes.
+ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
+                                       const __m128i *top1,
+                                       const __m128i *topleft) {
+   return _mm_packus_epi16(paeth_8x1_pred(left, top0, topleft),
+                           paeth_8x1_pred(left, top1, topleft));
+ }
+
+ // Paeth predictor for a 16x4 block (16 wide, 4 rows).
+ //   above: top neighbour row (aligned 16-byte load); above[-1] is top-left.
+ //   left:  left neighbour column (4 entries).
+ // The 4 left pixels are fetched with memcpy instead of dereferencing the
+ // uint8_t pointer as int (strict-aliasing / alignment hazard).
+ void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   int left4;
+   memcpy(&left4, left, sizeof(left4));
+   const __m128i l = _mm_cvtsi32_si128(left4);
+   const __m128i t = _mm_load_si128((const __m128i *)above);
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+   const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+   const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i one = _mm_set1_epi16(1);
+   // pshufb control broadcasting left[i] (zero-extended) to all 16-bit lanes.
+   __m128i rep = _mm_set1_epi16((short)0x8000);
+
+   for (int i = 0; i < 4; ++i) {
+     const __m128i l16 = _mm_shuffle_epi8(l, rep);
+     const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+     _mm_store_si128((__m128i *)dst, row);
+     dst += stride;
+     rep = _mm_add_epi16(rep, one);
+   }
+ }
+
+ // Paeth predictor for a 16x8 block (16 wide, 8 rows); above[-1] is the
+ // top-left pixel. Rows are written with aligned 16-byte stores.
+ void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+   const __m128i top8 = _mm_load_si128((const __m128i *)above);
+   const __m128i top_lo = _mm_unpacklo_epi8(top8, zero);
+   const __m128i top_hi = _mm_unpackhi_epi8(top8, zero);
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 8; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+
+     _mm_store_si128((__m128i *)dst,
+                     paeth_16x1_pred(&left16, &top_lo, &top_hi, &corner16));
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for a 16x16 block; above[-1] is the top-left pixel.
+ // Both edges are read with aligned 16-byte loads.
+ void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i left_col = _mm_load_si128((const __m128i *)left);
+   const __m128i top8 = _mm_load_si128((const __m128i *)above);
+   const __m128i top_lo = _mm_unpacklo_epi8(top8, zero);
+   const __m128i top_hi = _mm_unpackhi_epi8(top8, zero);
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 16; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+
+     _mm_store_si128((__m128i *)dst,
+                     paeth_16x1_pred(&left16, &top_lo, &top_hi, &corner16));
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for a 16x32 block (16 wide, 32 rows); above[-1] is the
+ // top-left pixel. The left column is processed as two 16-row halves.
+ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i top8 = _mm_load_si128((const __m128i *)above);
+   const __m128i top_lo = _mm_unpacklo_epi8(top8, zero);
+   const __m128i top_hi = _mm_unpackhi_epi8(top8, zero);
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (int half = 0; half < 2; ++half) {
+     const __m128i left_col =
+         _mm_load_si128((const __m128i *)(left + 16 * half));
+     // pshufb control restarts at byte 0 for each half.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int r = 0; r < 16; ++r) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+
+       _mm_store_si128((__m128i *)dst,
+                       paeth_16x1_pred(&left16, &top_lo, &top_hi, &corner16));
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 16x64 block (16 wide, 64 rows); above[-1] is the
+ // top-left pixel. The left column is walked in four 16-byte groups.
+ void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i top8 = _mm_load_si128((const __m128i *)above);
+   const __m128i top_lo = _mm_unpacklo_epi8(top8, zero);
+   const __m128i top_hi = _mm_unpackhi_epi8(top8, zero);
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (const uint8_t *lp = left; lp != left + 64; lp += 16) {
+     const __m128i left_col = _mm_load_si128((const __m128i *)lp);
+     // pshufb control restarts at byte 0 of every 16-row group.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int rows_left = 16; rows_left > 0; --rows_left) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+
+       _mm_store_si128((__m128i *)dst,
+                       paeth_16x1_pred(&left16, &top_lo, &top_hi, &corner16));
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 32x8 block (32 wide, 8 rows); above[-1] is the
+ // top-left pixel. Each row is produced as two 16-pixel halves.
+ void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i above_lo = _mm_load_si128((const __m128i *)above);
+   const __m128i above_hi = _mm_load_si128((const __m128i *)(above + 16));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[4] = { _mm_unpacklo_epi8(above_lo, zero),
+                            _mm_unpackhi_epi8(above_lo, zero),
+                            _mm_unpacklo_epi8(above_hi, zero),
+                            _mm_unpackhi_epi8(above_hi, zero) };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 8; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+     const __m128i out0 = paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+     const __m128i out1 = paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+
+     _mm_store_si128((__m128i *)dst, out0);
+     _mm_store_si128((__m128i *)(dst + 16), out1);
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for a 32x16 block (32 wide, 16 rows); above[-1] is the
+ // top-left pixel. Each row is produced as two 16-pixel halves.
+ void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i above_lo = _mm_load_si128((const __m128i *)above);
+   const __m128i above_hi = _mm_load_si128((const __m128i *)(above + 16));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[4] = { _mm_unpacklo_epi8(above_lo, zero),
+                            _mm_unpackhi_epi8(above_lo, zero),
+                            _mm_unpacklo_epi8(above_hi, zero),
+                            _mm_unpackhi_epi8(above_hi, zero) };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   const __m128i left_col = _mm_load_si128((const __m128i *)left);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 16; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+     const __m128i out0 = paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+     const __m128i out1 = paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+
+     _mm_store_si128((__m128i *)dst, out0);
+     _mm_store_si128((__m128i *)(dst + 16), out1);
+     pick = _mm_add_epi16(pick, next_row);
+     dst += stride;
+   }
+ }
+
+ // Paeth predictor for a 32x32 block; above[-1] is the top-left pixel.
+ // The left column is processed as two 16-row halves, and every row is
+ // produced as two 16-pixel halves.
+ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i above_lo = _mm_load_si128((const __m128i *)above);
+   const __m128i above_hi = _mm_load_si128((const __m128i *)(above + 16));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[4] = { _mm_unpacklo_epi8(above_lo, zero),
+                            _mm_unpackhi_epi8(above_lo, zero),
+                            _mm_unpacklo_epi8(above_hi, zero),
+                            _mm_unpackhi_epi8(above_hi, zero) };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (int half = 0; half < 2; ++half) {
+     const __m128i left_col =
+         _mm_load_si128((const __m128i *)(left + 16 * half));
+     // pshufb control restarts at byte 0 for each half.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int r = 0; r < 16; ++r) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+       const __m128i out0 =
+           paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+       const __m128i out1 =
+           paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+
+       _mm_store_si128((__m128i *)dst, out0);
+       _mm_store_si128((__m128i *)(dst + 16), out1);
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 32x64 block (32 wide, 64 rows); above[-1] is the
+ // top-left pixel. The left column is walked in four 16-byte groups.
+ void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i above_lo = _mm_load_si128((const __m128i *)above);
+   const __m128i above_hi = _mm_load_si128((const __m128i *)(above + 16));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[4] = { _mm_unpacklo_epi8(above_lo, zero),
+                            _mm_unpackhi_epi8(above_lo, zero),
+                            _mm_unpacklo_epi8(above_hi, zero),
+                            _mm_unpackhi_epi8(above_hi, zero) };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (const uint8_t *lp = left; lp != left + 64; lp += 16) {
+     const __m128i left_col = _mm_load_si128((const __m128i *)lp);
+     // pshufb control restarts at byte 0 of every 16-row group.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int rows_left = 16; rows_left > 0; --rows_left) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+       const __m128i out0 =
+           paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+       const __m128i out1 =
+           paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+
+       _mm_store_si128((__m128i *)dst, out0);
+       _mm_store_si128((__m128i *)(dst + 16), out1);
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 64x32 block (64 wide, 32 rows); above[-1] is the
+ // top-left pixel. Each row is produced as four 16-pixel groups; the left
+ // column is walked in two 16-byte groups.
+ void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i a0 = _mm_load_si128((const __m128i *)above);
+   const __m128i a1 = _mm_load_si128((const __m128i *)(above + 16));
+   const __m128i a2 = _mm_load_si128((const __m128i *)(above + 32));
+   const __m128i a3 = _mm_load_si128((const __m128i *)(above + 48));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[8] = {
+     _mm_unpacklo_epi8(a0, zero), _mm_unpackhi_epi8(a0, zero),
+     _mm_unpacklo_epi8(a1, zero), _mm_unpackhi_epi8(a1, zero),
+     _mm_unpacklo_epi8(a2, zero), _mm_unpackhi_epi8(a2, zero),
+     _mm_unpacklo_epi8(a3, zero), _mm_unpackhi_epi8(a3, zero)
+   };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (const uint8_t *lp = left; lp != left + 32; lp += 16) {
+     const __m128i left_col = _mm_load_si128((const __m128i *)lp);
+     // pshufb control restarts at byte 0 of every 16-row group.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int rows_left = 16; rows_left > 0; --rows_left) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+       const __m128i out0 =
+           paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+       const __m128i out1 =
+           paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+       const __m128i out2 =
+           paeth_16x1_pred(&left16, &top[4], &top[5], &corner16);
+       const __m128i out3 =
+           paeth_16x1_pred(&left16, &top[6], &top[7], &corner16);
+
+       _mm_store_si128((__m128i *)dst, out0);
+       _mm_store_si128((__m128i *)(dst + 16), out1);
+       _mm_store_si128((__m128i *)(dst + 32), out2);
+       _mm_store_si128((__m128i *)(dst + 48), out3);
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 64x64 block; above[-1] is the top-left pixel.
+ // Each row is produced as four 16-pixel groups; the left column is walked
+ // in four 16-byte groups.
+ void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i a0 = _mm_load_si128((const __m128i *)above);
+   const __m128i a1 = _mm_load_si128((const __m128i *)(above + 16));
+   const __m128i a2 = _mm_load_si128((const __m128i *)(above + 32));
+   const __m128i a3 = _mm_load_si128((const __m128i *)(above + 48));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[8] = {
+     _mm_unpacklo_epi8(a0, zero), _mm_unpackhi_epi8(a0, zero),
+     _mm_unpacklo_epi8(a1, zero), _mm_unpackhi_epi8(a1, zero),
+     _mm_unpacklo_epi8(a2, zero), _mm_unpackhi_epi8(a2, zero),
+     _mm_unpacklo_epi8(a3, zero), _mm_unpackhi_epi8(a3, zero)
+   };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+
+   for (const uint8_t *lp = left; lp != left + 64; lp += 16) {
+     const __m128i left_col = _mm_load_si128((const __m128i *)lp);
+     // pshufb control restarts at byte 0 of every 16-row group.
+     __m128i pick = _mm_set1_epi16((short)0x8000);
+     for (int rows_left = 16; rows_left > 0; --rows_left) {
+       const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+       const __m128i out0 =
+           paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+       const __m128i out1 =
+           paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+       const __m128i out2 =
+           paeth_16x1_pred(&left16, &top[4], &top[5], &corner16);
+       const __m128i out3 =
+           paeth_16x1_pred(&left16, &top[6], &top[7], &corner16);
+
+       _mm_store_si128((__m128i *)dst, out0);
+       _mm_store_si128((__m128i *)(dst + 16), out1);
+       _mm_store_si128((__m128i *)(dst + 32), out2);
+       _mm_store_si128((__m128i *)(dst + 48), out3);
+       dst += stride;
+       pick = _mm_add_epi16(pick, next_row);
+     }
+   }
+ }
+
+ // Paeth predictor for a 64x16 block (64 wide, 16 rows); above[-1] is the
+ // top-left pixel. Each row is produced as four 16-pixel groups.
+ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i a0 = _mm_load_si128((const __m128i *)above);
+   const __m128i a1 = _mm_load_si128((const __m128i *)(above + 16));
+   const __m128i a2 = _mm_load_si128((const __m128i *)(above + 32));
+   const __m128i a3 = _mm_load_si128((const __m128i *)(above + 48));
+   // top[2k], top[2k + 1]: 16-bit expansion of above[16k .. 16k + 15].
+   const __m128i top[8] = {
+     _mm_unpacklo_epi8(a0, zero), _mm_unpackhi_epi8(a0, zero),
+     _mm_unpacklo_epi8(a1, zero), _mm_unpackhi_epi8(a1, zero),
+     _mm_unpacklo_epi8(a2, zero), _mm_unpackhi_epi8(a2, zero),
+     _mm_unpacklo_epi8(a3, zero), _mm_unpackhi_epi8(a3, zero)
+   };
+   const __m128i corner16 = _mm_set1_epi16((int16_t)above[-1]);
+   const __m128i next_row = _mm_set1_epi16(1);
+   const __m128i left_col = _mm_load_si128((const __m128i *)left);
+   // pshufb control: broadcasts left[row], zero-extended, to each 16-bit lane.
+   __m128i pick = _mm_set1_epi16((short)0x8000);
+
+   for (int rows_left = 16; rows_left > 0; --rows_left) {
+     const __m128i left16 = _mm_shuffle_epi8(left_col, pick);
+     const __m128i out0 = paeth_16x1_pred(&left16, &top[0], &top[1], &corner16);
+     const __m128i out1 = paeth_16x1_pred(&left16, &top[2], &top[3], &corner16);
+     const __m128i out2 = paeth_16x1_pred(&left16, &top[4], &top[5], &corner16);
+     const __m128i out3 = paeth_16x1_pred(&left16, &top[6], &top[7], &corner16);
+
+     _mm_store_si128((__m128i *)dst, out0);
+     _mm_store_si128((__m128i *)(dst + 16), out1);
+     _mm_store_si128((__m128i *)(dst + 32), out2);
+     _mm_store_si128((__m128i *)(dst + 48), out3);
+     dst += stride;
+     pick = _mm_add_epi16(pick, next_row);
+   }
+ }
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+ // Loads the edge pixels for the 4-wide smooth predictors.
+ //   above:  top row (4 entries used); above[3] is the right predictor.
+ //   left:   left column ('height' entries); left[height - 1] is the bottom
+ //           predictor.
+ //   height: 4, 8, or anything else (treated as 16).
+ // Outputs:
+ //   pixels[0]: above and below_pred interleave vector
+ //   pixels[1]: left vector
+ //   pixels[2]: right_pred vector
+ // 4-byte groups are fetched with memcpy rather than by dereferencing the
+ // uint8_t pointers as int, which avoids strict-aliasing and alignment
+ // assumptions on the inputs.
+ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+                                  int height, __m128i *pixels) {
+   int above4;
+   memcpy(&above4, above, sizeof(above4));
+   __m128i d = _mm_cvtsi32_si128(above4);
+
+   if (height == 4) {
+     int left4;
+     memcpy(&left4, left, sizeof(left4));
+     pixels[1] = _mm_cvtsi32_si128(left4);
+   } else if (height == 8) {
+     pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+   } else {
+     pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+   }
+
+   pixels[2] = _mm_set1_epi16((int16_t)above[3]);
+
+   const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+   const __m128i zero = _mm_setzero_si128();
+   d = _mm_unpacklo_epi8(d, zero);
+   pixels[0] = _mm_unpacklo_epi16(d, bp);
+ }
+
+ // Loads the smooth-predictor weights for 4-wide blocks.
+ //   height: 4, 8 or 16 (other values leave the height-4 weights in place).
+ // Outputs:
+ //   weight_h[0]: weight_h vector
+ //   weight_h[1]: scale - weight_h vector
+ //   weight_h[2]: same as [0], second half for height = 16 only
+ //   weight_h[3]: same as [1], second half for height = 16 only
+ //   weight_w[0]: weights_w and scale - weights_w interleave vector
+ // The first four weights are copied with memcpy: smooth_weights is indexed
+ // as a byte table elsewhere in this function, so it must not be read through
+ // an int pointer.
+ static INLINE void load_weight_w4(int height, __m128i *weight_h,
+                                   __m128i *weight_w) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+   int weights4;
+   memcpy(&weights4, smooth_weights, sizeof(weights4));
+   const __m128i t = _mm_cvtsi32_si128(weights4);
+   weight_h[0] = _mm_unpacklo_epi8(t, zero);
+   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+   weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+   if (height == 8) {
+     // Weights for a dimension of 8 start at table offset 4.
+     const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
+     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+   } else if (height == 16) {
+     // Weights for a dimension of 16 start at table offset 12.
+     const __m128i weight =
+         _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
+     weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+     weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+   }
+ }
+
+ // Writes 'h' rows (at most 8 per call) of a 4-wide smooth prediction.
+ //   pixel:       edge vectors as produced by load_pixel_w4().
+ //   wh:          wh[0] = vertical weights, wh[1] = scale - weights, for the
+ //                rows handled by this call.
+ //   ww:          ww[0] = interleaved horizontal weights.
+ //   second_half: non-zero to start at left[8] instead of left[0].
+ // Each row is stored with memcpy instead of an int-typed store through the
+ // uint8_t destination pointer (strict-aliasing / alignment hazard).
+ static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+                                    const __m128i *ww, int h, uint8_t *dst,
+                                    ptrdiff_t stride, int second_half) {
+   const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+   const __m128i one = _mm_set1_epi16(1);
+   const __m128i inc = _mm_set1_epi16(0x202);
+   // Gathers byte 0 of each 32-bit lane (bytes 0, 4, 8, 12).
+   const __m128i gat = _mm_set1_epi32(0xc080400);
+   // pshufb control broadcasting one left pixel, zero-extended to 16 bits.
+   __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+                             : _mm_set1_epi16((short)0x8000);
+   // pshufb control broadcasting one 16-bit weight lane; 'inc' advances it
+   // by one lane (two bytes) per row.
+   __m128i d = _mm_set1_epi16(0x100);
+
+   for (int i = 0; i < h; ++i) {
+     // Vertical term: above * w + bottom_pred * (scale - w).
+     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
+
+     // Horizontal term: left * w + right_pred * (scale - w).
+     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
+     b = _mm_unpacklo_epi16(b, pixel[2]);
+     __m128i sum = _mm_madd_epi16(b, ww[0]);
+
+     sum = _mm_add_epi32(s, sum);
+     sum = _mm_add_epi32(sum, round);
+     sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+     sum = _mm_shuffle_epi8(sum, gat);
+     const int packed = _mm_cvtsi128_si32(sum);
+     memcpy(dst, &packed, sizeof(packed));
+     dst += stride;
+
+     rep = _mm_add_epi16(rep, one);
+     d = _mm_add_epi16(d, inc);
+   }
+ }
+
+ // Smooth predictor for a 4x4 block: load weights and edges, then emit the
+ // four rows in a single pass.
+ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   enum { kHeight = 4 };
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[3];
+
+   load_weight_w4(kHeight, weights_h, weights_w);
+   load_pixel_w4(above, left, kHeight, edge);
+   smooth_pred_4xh(edge, weights_h, weights_w, kHeight, dst, stride, 0);
+ }
+
+ // Smooth predictor for a 4x8 block: load weights and edges, then emit the
+ // eight rows in a single pass.
+ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   enum { kHeight = 8 };
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[3];
+
+   load_weight_w4(kHeight, weights_h, weights_w);
+   load_pixel_w4(above, left, kHeight, edge);
+   smooth_pred_4xh(edge, weights_h, weights_w, kHeight, dst, stride, 0);
+ }
+
+ // Smooth predictor for a 4x16 block, emitted as two 8-row passes: rows 0-7
+ // use weights_h[0..1], rows 8-15 use weights_h[2..3] and the upper half of
+ // the left column.
+ void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[3];
+
+   load_weight_w4(16, weights_h, weights_w);
+   load_pixel_w4(above, left, 16, edge);
+
+   uint8_t *const lower_half = dst + (stride << 3);
+   smooth_pred_4xh(edge, &weights_h[0], weights_w, 8, dst, stride, 0);
+   smooth_pred_4xh(edge, &weights_h[2], weights_w, 8, lower_half, stride, 1);
+ }
+
+ // Loads the edge pixels for the 8-wide smooth predictors.
+ //   above:  top row (8 entries used); above[7] is the right predictor.
+ //   left:   left column ('height' entries); left[height - 1] is the bottom
+ //           predictor.
+ //   height: 4, 8, 16, or anything else (treated as 32).
+ // Outputs:
+ //   pixels[0]: above and below_pred interleave vector, first half
+ //   pixels[1]: above and below_pred interleave vector, second half
+ //   pixels[2]: left vector
+ //   pixels[3]: right_pred vector
+ // Written only on the height-32 path:
+ //   pixels[4]: above and below_pred interleave vector, first half
+ //   pixels[5]: above and below_pred interleave vector, second half
+ //   pixels[6]: left vector + 16
+ //   pixels[7]: right_pred vector
+ // For height 4 the left pixels are fetched with memcpy rather than by
+ // dereferencing the uint8_t pointer as int (strict-aliasing / alignment
+ // hazard).
+ static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+                                  int height, __m128i *pixels) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+   __m128i d = _mm_loadl_epi64((const __m128i *)above);
+   d = _mm_unpacklo_epi8(d, zero);
+   pixels[0] = _mm_unpacklo_epi16(d, bp);
+   pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+   pixels[3] = _mm_set1_epi16((int16_t)above[7]);
+
+   if (height == 4) {
+     int left4;
+     memcpy(&left4, left, sizeof(left4));
+     pixels[2] = _mm_cvtsi32_si128(left4);
+   } else if (height == 8) {
+     pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+   } else if (height == 16) {
+     pixels[2] = _mm_load_si128((const __m128i *)left);
+   } else {
+     pixels[2] = _mm_load_si128((const __m128i *)left);
+     pixels[4] = pixels[0];
+     pixels[5] = pixels[1];
+     pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+     pixels[7] = pixels[3];
+   }
+ }
+
+ // Loads the smooth-predictor weights for 8-wide blocks.
+ // Outputs (16-bit lanes):
+ //   weight_h[2k]:     vertical weights for rows 8k .. 8k + 7
+ //   weight_h[2k + 1]: scale - weight_h[2k]
+ //     (k = 0 for height 4 and 8, k = 0..1 for height 16, k = 0..3 for 32)
+ //   weight_w[0]: weights_w and scale - weights_w interleaved, first half
+ //   weight_w[1]: weights_w and scale - weights_w interleaved, second half
+ static INLINE void load_weight_w8(int height, __m128i *weight_h,
+                                   __m128i *weight_w) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i scale =
+       _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+   // Offset 0 holds the 4-entry weights, offset 4 the 8-entry weights.
+   const __m128i raw =
+       _mm_loadu_si128((const __m128i *)&smooth_weights[height < 8 ? 0 : 4]);
+   weight_h[0] = _mm_unpacklo_epi8(raw, zero);
+   weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+
+   // Horizontal weights always come from the 8-entry set. For height 4 the
+   // load above started at offset 0, so skip four bytes to reach them.
+   __m128i w_fwd = weight_h[0];
+   __m128i w_inv = weight_h[1];
+   if (height == 4) {
+     w_fwd = _mm_unpacklo_epi8(_mm_srli_si128(raw, 4), zero);
+     w_inv = _mm_sub_epi16(scale, w_fwd);
+   }
+   weight_w[0] = _mm_unpacklo_epi16(w_fwd, w_inv);
+   weight_w[1] = _mm_unpackhi_epi16(w_fwd, w_inv);
+
+   // Taller blocks replace the vertical weights with their own set, which
+   // starts at table offset (height - 4) and is consumed 16 rows at a time.
+   if (height == 16 || height == 32) {
+     const int groups = height >> 4;
+     for (int g = 0; g < groups; ++g) {
+       const __m128i rows16 = _mm_loadu_si128(
+           (const __m128i *)&smooth_weights[height - 4 + 16 * g]);
+       __m128i *const out = weight_h + 4 * g;
+       out[0] = _mm_unpacklo_epi8(rows16, zero);
+       out[1] = _mm_sub_epi16(scale, out[0]);
+       out[2] = _mm_unpackhi_epi8(rows16, zero);
+       out[3] = _mm_sub_epi16(scale, out[2]);
+     }
+   }
+ }
+
+ // Writes 'h' rows (at most 8 per call) of an 8-wide smooth prediction.
+ //   pixels:      edge vectors laid out as by load_pixel_w8().
+ //   wh:          wh[0] = vertical weights, wh[1] = scale - weights, for the
+ //                rows handled by this call.
+ //   ww:          interleaved horizontal weights, low / high four columns.
+ //   second_half: non-zero to start at byte 8 of the left vector.
+ static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
+                                    const __m128i *ww, int h, uint8_t *dst,
+                                    ptrdiff_t stride, int second_half) {
+   const __m128i rounding = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+   const __m128i left_step = _mm_set1_epi16(1);
+   const __m128i weight_step = _mm_set1_epi16(0x202);
+   // Gathers the even bytes (0, 2, ..., 14) into the low 8 bytes.
+   const __m128i even_bytes = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+   // pshufb control broadcasting one left pixel, zero-extended to 16 bits.
+   __m128i left_sel =
+       _mm_set1_epi16(second_half ? (short)0x8008 : (short)0x8000);
+   // pshufb control broadcasting one 16-bit weight lane.
+   __m128i weight_sel = _mm_set1_epi16(0x100);
+
+   for (int row = 0; row < h; ++row) {
+     // Vertical term: above * w + bottom_pred * (scale - w).
+     const __m128i w_row =
+         _mm_unpacklo_epi16(_mm_shuffle_epi8(wh[0], weight_sel),
+                            _mm_shuffle_epi8(wh[1], weight_sel));
+     const __m128i vert_lo = _mm_madd_epi16(pixels[0], w_row);
+     const __m128i vert_hi = _mm_madd_epi16(pixels[1], w_row);
+
+     // Horizontal term: left * w + right_pred * (scale - w).
+     const __m128i left_right = _mm_unpacklo_epi16(
+         _mm_shuffle_epi8(pixels[2], left_sel), pixels[3]);
+     const __m128i horz_lo = _mm_madd_epi16(left_right, ww[0]);
+     const __m128i horz_hi = _mm_madd_epi16(left_right, ww[1]);
+
+     __m128i lo = _mm_add_epi32(_mm_add_epi32(vert_lo, horz_lo), rounding);
+     __m128i hi = _mm_add_epi32(_mm_add_epi32(vert_hi, horz_hi), rounding);
+     lo = _mm_srai_epi32(lo, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+     hi = _mm_srai_epi32(hi, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+     const __m128i packed =
+         _mm_shuffle_epi8(_mm_packus_epi16(lo, hi), even_bytes);
+     _mm_storel_epi64((__m128i *)dst, packed);
+
+     dst += stride;
+     left_sel = _mm_add_epi16(left_sel, left_step);
+     weight_sel = _mm_add_epi16(weight_sel, weight_step);
+   }
+ }
+
+ // Smooth predictor for an 8x4 block: load weights and edges, then emit the
+ // four rows in a single pass.
+ void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   enum { kHeight = 4 };
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[4];
+
+   load_weight_w8(kHeight, weights_h, weights_w);
+   load_pixel_w8(above, left, kHeight, edge);
+   smooth_pred_8xh(edge, weights_h, weights_w, kHeight, dst, stride, 0);
+ }
+
+ // Smooth predictor for an 8x8 block: load weights and edges, then emit the
+ // eight rows in a single pass.
+ void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above, const uint8_t *left) {
+   enum { kHeight = 8 };
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[4];
+
+   load_weight_w8(kHeight, weights_h, weights_w);
+   load_pixel_w8(above, left, kHeight, edge);
+   smooth_pred_8xh(edge, weights_h, weights_w, kHeight, dst, stride, 0);
+ }
+
+ // Smooth predictor for an 8x16 block, emitted as two 8-row passes: rows 0-7
+ // use weights_h[0..1], rows 8-15 use weights_h[2..3] and the upper half of
+ // the left vector.
+ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   __m128i weights_h[4], weights_w[2];
+   __m128i edge[4];
+
+   load_weight_w8(16, weights_h, weights_w);
+   load_pixel_w8(above, left, 16, edge);
+
+   uint8_t *const lower_half = dst + (stride << 3);
+   smooth_pred_8xh(edge, &weights_h[0], weights_w, 8, dst, stride, 0);
+   smooth_pred_8xh(edge, &weights_h[2], weights_w, 8, lower_half, stride, 1);
+ }
+
+ // Smooth predictor for an 8x32 block, emitted as four 8-row passes.
+ // Pass q uses weights_h[2q .. 2q + 1]; passes 0-1 read edge[0..3] (left rows
+ // 0-15) and passes 2-3 read edge[4..7] (left rows 16-31). Odd passes take
+ // the upper 8 bytes of their left vector.
+ void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+   __m128i weights_h[8], weights_w[2];
+   __m128i edge[8];
+
+   load_weight_w8(32, weights_h, weights_w);
+   load_pixel_w8(above, left, 32, edge);
+
+   const ptrdiff_t pass_pitch = stride << 3;
+   for (int q = 0; q < 4; ++q) {
+     smooth_pred_8xh(&edge[(q >> 1) * 4], &weights_h[q * 2], weights_w, 8,
+                     dst + q * pass_pitch, stride, q & 1);
+   }
+ }
+
+// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
+// (available in 2019+) or greater is specified; __restrict can be used in that
+// case. This should be moved to rtcd and used consistently between the
+// function declarations and definitions to avoid warnings in Visual Studio
+// when defining LIBAOM_RESTRICT to restrict or __restrict.
+#if defined(_MSC_VER)
+#define LIBAOM_RESTRICT
+#else
+#define LIBAOM_RESTRICT restrict
+#endif
+
+ // Loads 4 bytes from |src| (no alignment requirement) into the low 32 bits
+ // of an XMM value; the remaining bits are zero.
+ static AOM_FORCE_INLINE __m128i Load4(const void *src) {
+   // _mm_loadu_si32 would express this directly, and newer compilers (for
+   // example clang 8.0.0) turn both forms into the same movss instruction.
+   // It is banned here until compiler support for it is widespread, so the
+   // bytes are copied through an int instead.
+   int bits;
+   memcpy(&bits, src, sizeof(bits));
+   return _mm_cvtsi32_si128(bits);
+ }
+
+// Loads eight bytes from |a| into the low half of the returned vector.
+static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
+  const __m128i *const src = (const __m128i *)a;
+  return _mm_loadl_epi64(src);
+}
+
+// Loads sixteen bytes from |a| using the unaligned-load intrinsic.
+static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
+  const __m128i *const src = (const __m128i *)a;
+  return _mm_loadu_si128(src);
+}
+
+// Writes the low 32 bits of |x| to |dst| through memcpy, so |dst| needs no
+// particular alignment.
+static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
+  const int low_word = _mm_cvtsi128_si32(x);
+  memcpy(dst, &low_word, sizeof(low_word));
+}
+
+// Writes the low eight bytes of |v| to |a|.
+static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
+  __m128i *const out = (__m128i *)a;
+  _mm_storel_epi64(out, v);
+}
+
+// Writes all sixteen bytes of |v| to |a| using the unaligned-store intrinsic.
+static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
+  __m128i *const out = (__m128i *)a;
+  _mm_storeu_si128(out, v);
+}
+
+// Zero-extends the low eight bytes of |x| to eight 16-bit lanes by
+// interleaving them with zero bytes.
+static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(x, zero);
+}
+
+// Zero-extends the low four bytes of |x| to four 32-bit lanes: bytes are
+// first widened to 16 bits, then the low four 16-bit lanes to 32 bits.
+static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i widened16 = _mm_unpacklo_epi8(x, zero);
+  return _mm_unpacklo_epi16(widened16, zero);
+}
+
+// Zero-extends the low four 16-bit lanes of |x| to four 32-bit lanes.
+static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi16(x, zero);
+}
+
+// Generic SMOOTH predictor for a |width| x |height| block.
+//
+// With S = 1 << SMOOTH_WEIGHT_LOG2_SCALE, each output pixel is
+//   (w_y * top[x] + (S - w_y) * bottom_left +
+//    w_x * left[y] + (S - w_x) * top_right + S) >>
+//       (1 + SMOOTH_WEIGHT_LOG2_SCALE)
+// where w_y = smooth_weights[height - 4 + y], w_x = smooth_weights[width - 4 +
+// x], bottom_left = left_column[height - 1] and top_right = top_row[width - 1].
+//
+// The inner loop loads and stores eight pixels per iteration, so |width| is
+// presumably required to be a multiple of 8 (the callers visible next to this
+// function pass 16, 32 and 64) -- TODO confirm.
+// NOTE(review): this function has external linkage (no |static|) although it
+// looks like a file-local helper -- confirm whether other files reference it.
+void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column, int width,
+ int height) {
+ const uint8_t *const sm_weights_h = smooth_weights + height - 4;
+ const uint8_t *const sm_weights_w = smooth_weights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ // Only lane 0 holds the bottom-left pixel; the other lanes are zero.
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
+ // Rounding term for the final shift by (1 + SMOOTH_WEIGHT_LOG2_SCALE).
+ const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ // Because |bottom_left| is zero outside lane 0, only the low 16-bit lane of
+ // the product is non-zero.
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ // Every 32-bit lane becomes the 16-bit pair (weights_y, left_y), matching
+ // the (top, weights_x) pairs it is multiplied with by madd below.
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_row + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ // Interleave bytes as top[x], weights_x[x], top[x + 1], ... and widen to
+ // 16 bits so each 32-bit lane holds one (top, weights_x) pair.
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+ // Here opposite weights and pixels are multiplied, where the order of
+ // interleaving is indicated in the names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ // Two saturating packs narrow the eight 32-bit results to eight bytes in
+ // the low half of the vector.
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
+
+// 16x4 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 16 and
+// height 4.
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int width = 16;
+  const int height = 4;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 16x8 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 16 and
+// height 8.
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int width = 16;
+  const int height = 8;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 16x16 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 16
+// and height 16.
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 16;
+  const int height = 16;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 16x32 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 16
+// and height 32.
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 16;
+  const int height = 32;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 16x64 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 16
+// and height 64.
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 16;
+  const int height = 64;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 32x8 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 32 and
+// height 8.
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int width = 32;
+  const int height = 8;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 32x16 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 32
+// and height 16.
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 32;
+  const int height = 16;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 32x32 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 32
+// and height 32.
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 32;
+  const int height = 32;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 32x64 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 32
+// and height 64.
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 32;
+  const int height = 64;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 64x16 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 64
+// and height 16.
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 64;
+  const int height = 16;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 64x32 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 64
+// and height 32.
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 64;
+  const int height = 32;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// 64x64 SMOOTH predictor: forwards to smooth_predictor_wxh() with width 64
+// and height 64.
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int width = 64;
+  const int height = 64;
+  smooth_predictor_wxh(dst, stride, above, left, width, height);
+}
+
+// -----------------------------------------------------------------------------
+// Smooth horizontal/vertical helper functions.
+
+// Computes sixteen predicted pixels as two groups of eight 16-bit lanes,
+//   (pixelsN * weightsN + scaled_cornerN + round) >> 8,
+// and stores them to |dst| as sixteen bytes.
+//
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+static AOM_FORCE_INLINE void write_smooth_directional_sum16(
+    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
+    const __m128i weights1, const __m128i weights2,
+    const __m128i scaled_corner1, const __m128i scaled_corner2,
+    const __m128i round) {
+  const __m128i sum_lo =
+      _mm_add_epi16(scaled_corner1, _mm_mullo_epi16(pixels1, weights1));
+  const __m128i sum_hi =
+      _mm_add_epi16(scaled_corner2, _mm_mullo_epi16(pixels2, weights2));
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i out_lo = _mm_srli_epi16(_mm_add_epi16(sum_lo, round), 8);
+  const __m128i out_hi = _mm_srli_epi16(_mm_add_epi16(sum_hi, round), 8);
+  StoreUnaligned16(dst, _mm_packus_epi16(out_lo, out_hi));
+}
+
+// Returns pixels * weights + scaled_corner per 16-bit lane (low 16 bits of
+// each product, wrapping add); no rounding or shift is applied.
+static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
+    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
+  return _mm_add_epi16(scaled_corner, _mm_mullo_epi16(pixels, weights));
+}
+
+// Computes eight predicted pixels, (pixels * weights + scaled_corner + round)
+// >> 8 per 16-bit lane, and stores them to |dst| as eight bytes. All vector
+// operands are passed by pointer.
+static AOM_FORCE_INLINE void write_smooth_directional_sum8(
+    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
+    const __m128i *scaled_corner, const __m128i *round) {
+  const __m128i rounded = _mm_add_epi16(
+      smooth_directional_sum8(*pixels, *weights, *scaled_corner), *round);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i out = _mm_srli_epi16(rounded, 8);
+  StoreLo8(dst, _mm_packus_epi16(out, out));
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// Builds the pixel vector for the 4-wide vertical smooth predictors:
+// pixels[0] holds the 16-bit sequence above[0], bl, above[1], bl, above[2],
+// bl, above[3], bl, where bl = left[height - 1].
+static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
+    const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
+    const int height, __m128i *pixels) {
+  const __m128i top_row16 = cvtepu8_epi16(Load4(above));
+  const __m128i corner = _mm_set1_epi16(left[height - 1]);
+  pixels[0] = _mm_unpacklo_epi16(top_row16, corner);
+}
+
+// Fills |weights| with vectors that alternate between weights read from
+// |weight_array| (widened to 16 bits) and their inverted (256-w) counterparts:
+// weights[0]/weights[1] for heights 4 and 8, plus weights[2]/weights[3] for
+// any other height (read as sixteen weights). This is precomputed by the
+// compiler when the weights table is visible to this module. Removing this
+// visibility can cut speed by up to half in both 4xH and 8xH transforms.
+static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
+    const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
+    __m128i *weights) {
+  const __m128i k256 = _mm_set1_epi16(256);
+
+  switch (height) {
+    case 4:
+      // The four weights for height 4 start at offset 0.
+      weights[0] = cvtepu8_epi16(Load4(weight_array));
+      weights[1] = _mm_sub_epi16(k256, weights[0]);
+      break;
+    case 8:
+      // The eight weights for height 8 start at offset 4.
+      weights[0] = cvtepu8_epi16(LoadLo8(weight_array + 4));
+      weights[1] = _mm_sub_epi16(k256, weights[0]);
+      break;
+    default: {
+      // Sixteen weights starting at offset 12, split into two halves.
+      const __m128i packed = LoadUnaligned16(weight_array + 12);
+      weights[0] = cvtepu8_epi16(packed);
+      weights[1] = _mm_sub_epi16(k256, weights[0]);
+      weights[2] = _mm_unpackhi_epi8(packed, _mm_setzero_si128());
+      weights[3] = _mm_sub_epi16(k256, weights[2]);
+      break;
+    }
+  }
+}
+
+// Writes |height| rows of a 4-wide vertical smooth prediction to |dst|.
+//
+// pixel[0]  : 16-bit lanes top_row[0], corner, top_row[1], corner, ...
+// weight[0] : per-row weights as 16-bit lanes (lane y is used for row y).
+// weight[1] : the matching inverted weights.
+// |height| is presumably at most 8, since one vector supplies the per-row
+// weights (the callers in this file pass 4 or 8) -- TODO confirm.
+static AOM_FORCE_INLINE void write_smooth_vertical4xh(
+    const __m128i *pixel, const __m128i *weight, const int height,
+    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32(128);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  // Shuffle control gathering the low byte of each 32-bit lane into bytes
+  // 0..3. Deliberately not named cvtepu8_epi32: that would shadow the
+  // file-scope helper function of the same name.
+  const __m128i gather_low_bytes = _mm_set1_epi32(0xC080400);
+  // Selects 16-bit lane 0 for every output lane; advanced by one lane per row.
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int y = 0; y < height; ++y) {
+    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+    const __m128i alternate_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+    // The madd instruction yields four results of the form:
+    // (top_row[x] * weight[y] + corner * inverted_weight[y])
+    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, 8);
+    sum = _mm_shuffle_epi8(sum, gather_low_bytes);
+    Store4(dst, sum);
+    dst += stride;
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+// SMOOTH_V predictor for a 4x4 block: loads the interleaved top/bottom-left
+// pixels and the height-4 weights, then writes four rows.
+void aom_smooth_v_predictor_4x4_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const int height = 4;
+  __m128i pixels;
+  __m128i weights[2];
+
+  load_smooth_vertical_pixels4(top_row, left_column, height, &pixels);
+  load_smooth_vertical_weights4(smooth_weights, height, weights);
+  write_smooth_vertical4xh(&pixels, weights, height, dst, stride);
+}
+
+// SMOOTH_V predictor for a 4x8 block: loads the interleaved top/bottom-left
+// pixels and the height-8 weights, then writes eight rows.
+void aom_smooth_v_predictor_4x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const int height = 8;
+  __m128i pixels;
+  __m128i weights[2];
+
+  load_smooth_vertical_pixels4(top_row, left_column, height, &pixels);
+  load_smooth_vertical_weights4(smooth_weights, height, weights);
+  write_smooth_vertical4xh(&pixels, weights, height, dst, stride);
+}
+
+// SMOOTH_V predictor for a 4x16 block, written as two groups of eight rows.
+void aom_smooth_v_predictor_4x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  __m128i pixels;
+  __m128i weights[4];
+
+  load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
+  load_smooth_vertical_weights4(smooth_weights, 16, weights);
+
+  // Group |half| uses the weight/inverted-weight pair at weights[2 * half].
+  for (int half = 0; half < 2; ++half) {
+    write_smooth_vertical4xh(&pixels, &weights[2 * half], 8, dst, stride);
+    dst += 8 * stride;
+  }
+}
+
+// SMOOTH_V predictor for an 8x4 block. Row y is
+//   (top[x] * w[y] + (scale - w[y]) * left_column[3] + round) >> 8
+// with w[] being the four weights at the start of smooth_weights.
+void aom_smooth_v_predictor_8x4_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i corner = _mm_set1_epi16(left_column[3]);
+  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+  const __m128i scaled_corner =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+
+  for (int row = 0; row < 4; ++row) {
+    // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+    const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_corner_y = _mm_shuffle_epi8(scaled_corner, y_select);
+    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_corner_y,
+                                  &round);
+    dst += stride;
+  }
+}
+
+// SMOOTH_V predictor for an 8x8 block. Row y is
+//   (top[x] * w[y] + (scale - w[y]) * left_column[7] + round) >> 8
+// with w[] being the eight weights at smooth_weights + 4.
+void aom_smooth_v_predictor_8x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i corner = _mm_set1_epi16(left_column[7]);
+  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+  const __m128i scaled_corner =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+
+  for (int row = 0; row < 8; ++row) {
+    // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+    const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_corner_y = _mm_shuffle_epi8(scaled_corner, y_select);
+    write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_corner_y,
+                                  &round);
+    dst += stride;
+  }
+}
+
+// SMOOTH_V predictor for an 8x16 block. The sixteen weights at
+// smooth_weights + 12 are split into two groups of eight rows; the bottom-left
+// pixel is left_column[15].
+void aom_smooth_v_predictor_8x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i corner = _mm_set1_epi16(left_column[15]);
+  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+  const __m128i packed = LoadUnaligned16(smooth_weights + 12);
+  const __m128i group_weights[2] = { cvtepu8_epi16(packed),
+                                     cvtepu8_epi16(
+                                         _mm_srli_si128(packed, 8)) };
+
+  for (int group = 0; group < 2; ++group) {
+    const __m128i weights = group_weights[group];
+    const __m128i scaled_corner =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+    for (int row = 0; row < 8; ++row) {
+      // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+      const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+      const __m128i scaled_corner_y =
+          _mm_shuffle_epi8(scaled_corner, y_select);
+      write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_corner_y,
+                                    &round);
+      dst += stride;
+    }
+  }
+}
+
+// SMOOTH_V predictor for an 8x32 block. The thirty-two weights starting at
+// smooth_weights + 28 are consumed sixteen at a time, each chunk being split
+// into two groups of eight rows; the bottom-left pixel is left_column[31].
+void aom_smooth_v_predictor_8x32_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i corner = _mm_set1_epi16(left_column[31]);
+  const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+
+  for (int chunk = 0; chunk < 32; chunk += 16) {
+    const __m128i packed = LoadUnaligned16(smooth_weights + 28 + chunk);
+    const __m128i group_weights[2] = { cvtepu8_epi16(packed),
+                                       _mm_unpackhi_epi8(packed, zero) };
+    for (int group = 0; group < 2; ++group) {
+      const __m128i weights = group_weights[group];
+      const __m128i scaled_corner =
+          _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+      for (int row = 0; row < 8; ++row) {
+        // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+        const __m128i y_select =
+            _mm_set1_epi32(0x01000100 + row * 0x02020202);
+        const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+        const __m128i scaled_corner_y =
+            _mm_shuffle_epi8(scaled_corner, y_select);
+        write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_corner_y,
+                                      &round);
+        dst += stride;
+      }
+    }
+  }
+}
+
+// SMOOTH_V predictor for a 16x4 block. Row y is
+//   (top[x] * w[y] + (scale - w[y]) * left_column[3] + 128) >> 8
+// with w[] being the four weights at the start of smooth_weights.
+void aom_smooth_v_predictor_16x4_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i corner = _mm_set1_epi16(left_column[3]);
+  const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+  const __m128i scaled_corner =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+
+  for (int row = 0; row < 4; ++row) {
+    // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+    const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_corner_y = _mm_shuffle_epi8(scaled_corner, y_select);
+    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                   scaled_corner_y, scaled_corner_y, round);
+    dst += stride;
+  }
+}
+
+// SMOOTH_V predictor for a 16x8 block. Row y is
+//   (top[x] * w[y] + (scale - w[y]) * left_column[7] + 128) >> 8
+// with w[] being the eight weights at smooth_weights + 4.
+void aom_smooth_v_predictor_16x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i corner = _mm_set1_epi16(left_column[7]);
+  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+  const __m128i scaled_corner =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+
+  for (int row = 0; row < 8; ++row) {
+    // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+    const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_corner_y = _mm_shuffle_epi8(scaled_corner, y_select);
+    write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                   scaled_corner_y, scaled_corner_y, round);
+    dst += stride;
+  }
+}
+
+// SMOOTH_V predictor for a 16x16 block. The sixteen weights at
+// smooth_weights + 12 are split into two groups of eight rows; the bottom-left
+// pixel is left_column[15].
+void aom_smooth_v_predictor_16x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i corner = _mm_set1_epi16(left_column[15]);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  const __m128i packed = LoadUnaligned16(smooth_weights + 12);
+  const __m128i group_weights[2] = { cvtepu8_epi16(packed),
+                                     _mm_unpackhi_epi8(packed, zero) };
+
+  for (int group = 0; group < 2; ++group) {
+    const __m128i weights = group_weights[group];
+    const __m128i scaled_corner =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+    for (int row = 0; row < 8; ++row) {
+      // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+      const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+      const __m128i scaled_corner_y =
+          _mm_shuffle_epi8(scaled_corner, y_select);
+      write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                     scaled_corner_y, scaled_corner_y, round);
+      dst += stride;
+    }
+  }
+}
+
+// SMOOTH_V predictor for a 16x32 block. The thirty-two weights starting at
+// smooth_weights + 28 are consumed sixteen at a time, each chunk being split
+// into two groups of eight rows; the bottom-left pixel is left_column[31].
+void aom_smooth_v_predictor_16x32_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i corner = _mm_set1_epi16(left_column[31]);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+
+  for (int chunk = 0; chunk < 32; chunk += 16) {
+    const __m128i packed = LoadUnaligned16(smooth_weights + 28 + chunk);
+    const __m128i group_weights[2] = { cvtepu8_epi16(packed),
+                                       _mm_unpackhi_epi8(packed, zero) };
+    for (int group = 0; group < 2; ++group) {
+      const __m128i weights = group_weights[group];
+      const __m128i scaled_corner =
+          _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+      for (int row = 0; row < 8; ++row) {
+        // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+        const __m128i y_select =
+            _mm_set1_epi32(0x01000100 + row * 0x02020202);
+        const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+        const __m128i scaled_corner_y =
+            _mm_shuffle_epi8(scaled_corner, y_select);
+        write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y,
+                                       weights_y, scaled_corner_y,
+                                       scaled_corner_y, round);
+        dst += stride;
+      }
+    }
+  }
+}
+
+// SMOOTH_V predictor for a 16x64 block. The sixty-four weights starting at
+// smooth_weights + 60 are consumed sixteen at a time, each chunk being split
+// into two groups of eight rows; the bottom-left pixel is left_column[63].
+void aom_smooth_v_predictor_16x64_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i corner = _mm_set1_epi16(left_column[63]);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+
+  for (int chunk = 0; chunk < 64; chunk += 16) {
+    const __m128i packed = LoadUnaligned16(smooth_weights + 60 + chunk);
+    const __m128i group_weights[2] = { cvtepu8_epi16(packed),
+                                       _mm_unpackhi_epi8(packed, zero) };
+    for (int group = 0; group < 2; ++group) {
+      const __m128i weights = group_weights[group];
+      const __m128i scaled_corner =
+          _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+      for (int row = 0; row < 8; ++row) {
+        // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+        const __m128i y_select =
+            _mm_set1_epi32(0x01000100 + row * 0x02020202);
+        const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+        const __m128i scaled_corner_y =
+            _mm_shuffle_epi8(scaled_corner, y_select);
+        write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y,
+                                       weights_y, scaled_corner_y,
+                                       scaled_corner_y, round);
+        dst += stride;
+      }
+    }
+  }
+}
+
+// SMOOTH_V predictor for a 32x8 block. Each row is written as two 16-pixel
+// halves sharing the same row weight; the weights are the eight entries at
+// smooth_weights + 4 and the bottom-left pixel is left_column[7].
+void aom_smooth_v_predictor_32x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i corner = _mm_set1_epi16(left_column[7]);
+  const __m128i top_left16 = LoadUnaligned16(top_row);
+  const __m128i top_right16 = LoadUnaligned16(top_row + 16);
+  const __m128i top1 = cvtepu8_epi16(top_left16);
+  const __m128i top2 = _mm_unpackhi_epi8(top_left16, zero);
+  const __m128i top3 = cvtepu8_epi16(top_right16);
+  const __m128i top4 = _mm_unpackhi_epi8(top_right16, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+  const __m128i scaled_corner =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights), corner);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+  for (int row = 0; row < 8; ++row) {
+    // Shuffle control that broadcasts 16-bit lane |row| to every lane.
+    const __m128i y_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_corner_y = _mm_shuffle_epi8(scaled_corner, y_select);
+    write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+                                   scaled_corner_y, scaled_corner_y, round);
+    write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+                                   scaled_corner_y, scaled_corner_y, round);
+    dst += stride;
+  }
+}
+
+ // SMOOTH_V prediction for a 32x16 block (SSSE3).
+ // Reads 32 pixels from |top_row|, the bottom-left pixel |left_column[15]| and
+ // 16 row weights from smooth_weights + 12, then writes 16 rows of 32 pixels
+ // through write_smooth_directional_sum16().
+ void aom_smooth_v_predictor_32x16_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+   // Top row widened to 16-bit lanes: four registers of eight pixels each.
+   const __m128i top_bytes_lo = LoadUnaligned16(top_row);
+   const __m128i top_bytes_hi = LoadUnaligned16(top_row + 16);
+   const __m128i top_a = cvtepu8_epi16(top_bytes_lo);
+   const __m128i top_b = _mm_unpackhi_epi8(top_bytes_lo, zero);
+   const __m128i top_c = cvtepu8_epi16(top_bytes_hi);
+   const __m128i top_d = _mm_unpackhi_epi8(top_bytes_hi, zero);
+
+   // Row weights widened to 16 bits, eight rows per register.
+   const __m128i weight_bytes = LoadUnaligned16(smooth_weights + 12);
+   const __m128i row_weights[2] = { cvtepu8_epi16(weight_bytes),
+                                    _mm_unpackhi_epi8(weight_bytes, zero) };
+
+   for (int group = 0; group < 2; ++group) {
+     const __m128i group_weights = row_weights[group];
+     // (scale - weight) * bottom_left for the eight rows of this group.
+     const __m128i group_scaled_bottom_left =
+         _mm_mullo_epi16(_mm_sub_epi16(scale, group_weights), bottom_left);
+     for (int row = 0; row < 8; ++row) {
+       // pshufb control replicating 16-bit lane |row| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + row * 0x02020202);
+       const __m128i w = _mm_shuffle_epi8(group_weights, broadcast_row);
+       const __m128i sbl =
+           _mm_shuffle_epi8(group_scaled_bottom_left, broadcast_row);
+       write_smooth_directional_sum16(dst, top_a, top_b, w, w, sbl, sbl, round);
+       write_smooth_directional_sum16(dst + 16, top_c, top_d, w, w, sbl, sbl,
+                                      round);
+       dst += stride;
+     }
+   }
+ }
+
+ // SMOOTH_V prediction for a 32x32 block (SSSE3).
+ // Reads 32 pixels from |top_row|, the bottom-left pixel |left_column[31]| and
+ // 32 row weights from smooth_weights + 28, then writes 32 rows of 32 pixels
+ // through write_smooth_directional_sum16().
+ void aom_smooth_v_predictor_32x32_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+   // Top row widened to 16-bit lanes: four registers of eight pixels each.
+   const __m128i top_bytes_lo = LoadUnaligned16(top_row);
+   const __m128i top_bytes_hi = LoadUnaligned16(top_row + 16);
+   const __m128i top_a = cvtepu8_epi16(top_bytes_lo);
+   const __m128i top_b = _mm_unpackhi_epi8(top_bytes_lo, zero);
+   const __m128i top_c = cvtepu8_epi16(top_bytes_hi);
+   const __m128i top_d = _mm_unpackhi_epi8(top_bytes_hi, zero);
+
+   // Row weights widened to 16 bits, eight rows per register, in row order.
+   const __m128i weight_bytes_lo = LoadUnaligned16(smooth_weights + 28);
+   const __m128i weight_bytes_hi = LoadUnaligned16(smooth_weights + 44);
+   const __m128i row_weights[4] = {
+     cvtepu8_epi16(weight_bytes_lo), _mm_unpackhi_epi8(weight_bytes_lo, zero),
+     cvtepu8_epi16(weight_bytes_hi), _mm_unpackhi_epi8(weight_bytes_hi, zero)
+   };
+
+   for (int group = 0; group < 4; ++group) {
+     const __m128i group_weights = row_weights[group];
+     // (scale - weight) * bottom_left for the eight rows of this group.
+     const __m128i group_scaled_bottom_left =
+         _mm_mullo_epi16(_mm_sub_epi16(scale, group_weights), bottom_left);
+     for (int row = 0; row < 8; ++row) {
+       // pshufb control replicating 16-bit lane |row| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + row * 0x02020202);
+       const __m128i w = _mm_shuffle_epi8(group_weights, broadcast_row);
+       const __m128i sbl =
+           _mm_shuffle_epi8(group_scaled_bottom_left, broadcast_row);
+       write_smooth_directional_sum16(dst, top_a, top_b, w, w, sbl, sbl, round);
+       write_smooth_directional_sum16(dst + 16, top_c, top_d, w, w, sbl, sbl,
+                                      round);
+       dst += stride;
+     }
+   }
+ }
+
+ // SMOOTH_V prediction for a 32x64 block (SSSE3).
+ // Reads 32 pixels from |top_row|, the bottom-left pixel |left_column[63]| and
+ // 64 row weights from smooth_weights + 60 (fetched 16 at a time), then writes
+ // 64 rows of 32 pixels through write_smooth_directional_sum16().
+ void aom_smooth_v_predictor_32x64_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+   // Top row widened to 16-bit lanes: four registers of eight pixels each.
+   const __m128i top_bytes_lo = LoadUnaligned16(top_row);
+   const __m128i top_bytes_hi = LoadUnaligned16(top_row + 16);
+   const __m128i top_a = cvtepu8_epi16(top_bytes_lo);
+   const __m128i top_b = _mm_unpackhi_epi8(top_bytes_lo, zero);
+   const __m128i top_c = cvtepu8_epi16(top_bytes_hi);
+   const __m128i top_d = _mm_unpackhi_epi8(top_bytes_hi, zero);
+
+   const uint8_t *const weights_base = smooth_weights + 60;
+   // Each outer iteration covers 16 rows; each half covers eight of them.
+   for (int first_row = 0; first_row < 64; first_row += 16) {
+     const __m128i weight_bytes = LoadUnaligned16(weights_base + first_row);
+     const __m128i half_weights[2] = { cvtepu8_epi16(weight_bytes),
+                                       _mm_unpackhi_epi8(weight_bytes, zero) };
+     for (int half = 0; half < 2; ++half) {
+       const __m128i group_weights = half_weights[half];
+       // (scale - weight) * bottom_left for the eight rows of this half.
+       const __m128i group_scaled_bottom_left =
+           _mm_mullo_epi16(_mm_sub_epi16(scale, group_weights), bottom_left);
+       for (int row = 0; row < 8; ++row) {
+         // pshufb control replicating 16-bit lane |row| into every lane.
+         const __m128i broadcast_row =
+             _mm_set1_epi32(0x01000100 + row * 0x02020202);
+         const __m128i w = _mm_shuffle_epi8(group_weights, broadcast_row);
+         const __m128i sbl =
+             _mm_shuffle_epi8(group_scaled_bottom_left, broadcast_row);
+         write_smooth_directional_sum16(dst, top_a, top_b, w, w, sbl, sbl,
+                                        round);
+         write_smooth_directional_sum16(dst + 16, top_c, top_d, w, w, sbl, sbl,
+                                        round);
+         dst += stride;
+       }
+     }
+   }
+ }
+
+ // SMOOTH_V prediction for a 64x16 block (SSSE3).
+ // Reads 64 pixels from |top_row|, the bottom-left pixel |left_column[15]| and
+ // 16 row weights from smooth_weights + 12, then writes 16 rows of 64 pixels
+ // through write_smooth_directional_sum16() (four calls per row).
+ void aom_smooth_v_predictor_64x16_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+   // top[2 * i] / top[2 * i + 1] hold pixels 16 * i .. 16 * i + 15 of the top
+   // row, widened to 16-bit lanes.
+   __m128i top[8];
+   for (int i = 0; i < 4; ++i) {
+     const __m128i top_bytes = LoadUnaligned16(top_row + 16 * i);
+     top[2 * i] = cvtepu8_epi16(top_bytes);
+     top[2 * i + 1] = _mm_unpackhi_epi8(top_bytes, zero);
+   }
+
+   // Row weights widened to 16 bits, eight rows per register.
+   const __m128i weight_bytes = LoadUnaligned16(smooth_weights + 12);
+   const __m128i row_weights[2] = { cvtepu8_epi16(weight_bytes),
+                                    _mm_unpackhi_epi8(weight_bytes, zero) };
+
+   for (int group = 0; group < 2; ++group) {
+     const __m128i group_weights = row_weights[group];
+     // (scale - weight) * bottom_left for the eight rows of this group.
+     const __m128i group_scaled_bottom_left =
+         _mm_mullo_epi16(_mm_sub_epi16(scale, group_weights), bottom_left);
+     for (int row = 0; row < 8; ++row) {
+       // pshufb control replicating 16-bit lane |row| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + row * 0x02020202);
+       const __m128i w = _mm_shuffle_epi8(group_weights, broadcast_row);
+       const __m128i sbl =
+           _mm_shuffle_epi8(group_scaled_bottom_left, broadcast_row);
+       for (int i = 0; i < 4; ++i) {
+         write_smooth_directional_sum16(dst + 16 * i, top[2 * i],
+                                        top[2 * i + 1], w, w, sbl, sbl, round);
+       }
+       dst += stride;
+     }
+   }
+ }
+
+ // SMOOTH_V prediction for a 64x32 block (SSSE3).
+ // Reads 64 pixels from |top_row|, the bottom-left pixel |left_column[31]| and
+ // 32 row weights from smooth_weights + 28, then writes 32 rows of 64 pixels
+ // through write_smooth_directional_sum16() (four calls per row).
+ void aom_smooth_v_predictor_64x32_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+   // top[2 * i] / top[2 * i + 1] hold pixels 16 * i .. 16 * i + 15 of the top
+   // row, widened to 16-bit lanes.
+   __m128i top[8];
+   for (int i = 0; i < 4; ++i) {
+     const __m128i top_bytes = LoadUnaligned16(top_row + 16 * i);
+     top[2 * i] = cvtepu8_epi16(top_bytes);
+     top[2 * i + 1] = _mm_unpackhi_epi8(top_bytes, zero);
+   }
+
+   // Row weights widened to 16 bits, eight rows per register, in row order.
+   const __m128i weight_bytes_lo = LoadUnaligned16(smooth_weights + 28);
+   const __m128i weight_bytes_hi = LoadUnaligned16(smooth_weights + 44);
+   const __m128i row_weights[4] = {
+     cvtepu8_epi16(weight_bytes_lo), _mm_unpackhi_epi8(weight_bytes_lo, zero),
+     cvtepu8_epi16(weight_bytes_hi), _mm_unpackhi_epi8(weight_bytes_hi, zero)
+   };
+
+   for (int group = 0; group < 4; ++group) {
+     const __m128i group_weights = row_weights[group];
+     // (scale - weight) * bottom_left for the eight rows of this group.
+     const __m128i group_scaled_bottom_left =
+         _mm_mullo_epi16(_mm_sub_epi16(scale, group_weights), bottom_left);
+     for (int row = 0; row < 8; ++row) {
+       // pshufb control replicating 16-bit lane |row| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + row * 0x02020202);
+       const __m128i w = _mm_shuffle_epi8(group_weights, broadcast_row);
+       const __m128i sbl =
+           _mm_shuffle_epi8(group_scaled_bottom_left, broadcast_row);
+       for (int i = 0; i < 4; ++i) {
+         write_smooth_directional_sum16(dst + 16 * i, top[2 * i],
+                                        top[2 * i + 1], w, w, sbl, sbl, round);
+       }
+       dst += stride;
+     }
+   }
+ }
+
+ // SMOOTH_V prediction for a 64x64 block (SSSE3).
+ //
+ //   dst, stride  destination buffer and the byte distance between its rows.
+ //   top_row      64 pixels above the block.
+ //   left_column  pixels left of the block; only left_column[63] (the
+ //                bottom-left pixel) is read.
+ //
+ // The 64 row weights start at smooth_weights + 60 and are fetched 16 at a
+ // time. For each row the row's weight and its (scale - weight) * bottom_left
+ // term are broadcast to all lanes and passed to
+ // write_smooth_directional_sum16(), once per 16 output pixels.
+ void aom_smooth_v_predictor_64x64_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i zero = _mm_setzero_si128();
+   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+   // Widen the 64 top pixels to 16-bit lanes, eight pixels per register.
+   const __m128i top_lolo = LoadUnaligned16(top_row);
+   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+   const __m128i top1 = cvtepu8_epi16(top_lolo);
+   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+   const __m128i top3 = cvtepu8_epi16(top_lohi);
+   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+   const __m128i top5 = cvtepu8_epi16(top_hilo);
+   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+   const __m128i top7 = cvtepu8_epi16(top_hihi);
+   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   // Half of |scale|, derived from the same macro so the two cannot drift
+   // apart (the other block sizes spell it the same way).
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   const uint8_t *weights_base_ptr = smooth_weights + 60;
+   // Each outer iteration covers 16 rows; each half covers eight of them.
+   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+     const __m128i half_weights[2] = { cvtepu8_epi16(weights),
+                                       _mm_unpackhi_epi8(weights, zero) };
+     for (int half = 0; half < 2; ++half) {
+       const __m128i weights_8 = half_weights[half];
+       const __m128i scaled_bottom_left_8 =
+           _mm_mullo_epi16(_mm_sub_epi16(scale, weights_8), bottom_left);
+       // 0x01000100 is a pshufb control that copies 16-bit lane 0 into every
+       // lane; each += 0x02020202 advances it to the next 16-bit lane.
+       for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F;
+            y_mask += 0x02020202) {
+         const __m128i y_select = _mm_set1_epi32(y_mask);
+         const __m128i weights_y = _mm_shuffle_epi8(weights_8, y_select);
+         const __m128i scaled_bottom_left_y =
+             _mm_shuffle_epi8(scaled_bottom_left_8, y_select);
+         write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+                                        scaled_bottom_left_y,
+                                        scaled_bottom_left_y, round);
+         write_smooth_directional_sum16(dst + 16, top3, top4, weights_y,
+                                        weights_y, scaled_bottom_left_y,
+                                        scaled_bottom_left_y, round);
+         write_smooth_directional_sum16(dst + 32, top5, top6, weights_y,
+                                        weights_y, scaled_bottom_left_y,
+                                        scaled_bottom_left_y, round);
+         write_smooth_directional_sum16(dst + 48, top7, top8, weights_y,
+                                        weights_y, scaled_bottom_left_y,
+                                        scaled_bottom_left_y, round);
+         dst += stride;
+       }
+     }
+   }
+ }
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+ // Produces one 4-pixel row of a SMOOTH_H prediction and stores it at |dst|.
+ //
+ //   left_y            the row's left pixel, repeated in the four 32-bit lanes.
+ //   weights           per-column weights, one per 32-bit lane.
+ //   scaled_top_right  per-column term added to the weighted left pixel.
+ //   round             rounding term added before the shift.
+ //
+ // The product uses a 16-bit low multiply while the additions and the shift
+ // are 32-bit; only byte 0 of every 32-bit lane reaches the output.
+ static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
+     uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
+     const __m128i *scaled_top_right, const __m128i *round) {
+   // pshufb control gathering byte 0 of each 32-bit lane (bytes 0, 4, 8, 12).
+   const __m128i take_low_bytes = _mm_set1_epi32(0x0C080400);
+   const __m128i left_term = _mm_mullo_epi16(*left_y, *weights);
+   const __m128i blended = _mm_add_epi32(*scaled_top_right, left_term);
+   const __m128i rounded = _mm_add_epi32(blended, *round);
+   // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+   const __m128i shifted = _mm_srli_epi32(rounded, 8);
+   Store4(dst, _mm_shuffle_epi8(shifted, take_low_bytes));
+ }
+
+ // SMOOTH_H prediction for a 4x4 block (SSSE3).
+ //
+ //   dst, stride  destination buffer and the byte distance between its rows.
+ //   top_row      pixels above the block; only top_row[3] (the top-right
+ //                pixel) is read.
+ //   left_column  four pixels left of the block, one per output row.
+ //
+ // Four column weights are loaded from smooth_weights. Each row is produced by
+ // write_smooth_horizontal_sum4() from that row's left pixel, the weights and
+ // the precomputed (scale - weight) * top_right term.
+ void aom_smooth_h_predictor_4x4_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i top_right = _mm_set1_epi32(top_row[3]);
+   const __m128i left = cvtepu8_epi32(Load4(left_column));
+   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+   // Everything in this path works on 32-bit lanes, so |scale| and |round| are
+   // broadcast as 32-bit values. A 16-bit broadcast would leave a stray copy
+   // of the constant in the upper half of every lane.
+   const __m128i scale = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+   // The 16-bit multiply is sufficient: (scale - weight) * top_right is at
+   // most 256 * 255, and the upper half of every lane is zero on both sides.
+   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+   const __m128i round = _mm_set1_epi32(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   // _mm_shuffle_epi32 needs an immediate, hence one explicit call per row:
+   // 0x00 / 0x55 / 0xaa / 0xff replicate lane 0 / 1 / 2 / 3.
+   __m128i left_y = _mm_shuffle_epi32(left, 0);
+   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                                &round);
+   dst += stride;
+   left_y = _mm_shuffle_epi32(left, 0x55);
+   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                                &round);
+   dst += stride;
+   left_y = _mm_shuffle_epi32(left, 0xaa);
+   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                                &round);
+   dst += stride;
+   left_y = _mm_shuffle_epi32(left, 0xff);
+   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+                                &round);
+ }
+
+ // SMOOTH_H prediction for a 4x8 block (SSSE3).
+ //
+ //   dst, stride  destination buffer and the byte distance between its rows.
+ //   top_row      pixels above the block; only top_row[3] (the top-right
+ //                pixel) is read.
+ //   left_column  eight pixels left of the block, one per output row.
+ //
+ // Four column weights are loaded from smooth_weights. Rows are produced four
+ // at a time by write_smooth_horizontal_sum4().
+ void aom_smooth_h_predictor_4x8_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i top_right = _mm_set1_epi32(top_row[3]);
+   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+   // Everything in this path works on 32-bit lanes, so |scale| and |round| are
+   // broadcast as 32-bit values. A 16-bit broadcast would leave a stray copy
+   // of the constant in the upper half of every lane.
+   const __m128i scale = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+   // The 16-bit multiply is sufficient: (scale - weight) * top_right is at
+   // most 256 * 255, and the upper half of every lane is zero on both sides.
+   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+   const __m128i round = _mm_set1_epi32(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   for (int y = 0; y < 8; y += 4) {
+     // Left pixels of rows y .. y + 3, one per 32-bit lane.
+     const __m128i left = cvtepu8_epi32(Load4(left_column + y));
+     uint8_t *const row = dst + y * stride;
+     // _mm_shuffle_epi32 needs an immediate, hence one explicit call per row:
+     // 0x00 / 0x55 / 0xaa / 0xff replicate lane 0 / 1 / 2 / 3.
+     __m128i left_y = _mm_shuffle_epi32(left, 0);
+     write_smooth_horizontal_sum4(row, &left_y, &weights, &scaled_top_right,
+                                  &round);
+     left_y = _mm_shuffle_epi32(left, 0x55);
+     write_smooth_horizontal_sum4(row + stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+     left_y = _mm_shuffle_epi32(left, 0xaa);
+     write_smooth_horizontal_sum4(row + 2 * stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+     left_y = _mm_shuffle_epi32(left, 0xff);
+     write_smooth_horizontal_sum4(row + 3 * stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+   }
+ }
+
+ // SMOOTH_H prediction for a 4x16 block (SSSE3).
+ //
+ //   dst, stride  destination buffer and the byte distance between its rows.
+ //   top_row      pixels above the block; only top_row[3] (the top-right
+ //                pixel) is read.
+ //   left_column  sixteen pixels left of the block, one per output row.
+ //
+ // Four column weights are loaded from smooth_weights. Rows are produced four
+ // at a time by write_smooth_horizontal_sum4().
+ void aom_smooth_h_predictor_4x16_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i top_right = _mm_set1_epi32(top_row[3]);
+   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+   // Everything in this path works on 32-bit lanes, so |scale| and |round| are
+   // broadcast as 32-bit values. A 16-bit broadcast would leave a stray copy
+   // of the constant in the upper half of every lane.
+   const __m128i scale = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+   // The 16-bit multiply is sufficient: (scale - weight) * top_right is at
+   // most 256 * 255, and the upper half of every lane is zero on both sides.
+   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+   const __m128i round = _mm_set1_epi32(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   for (int y = 0; y < 16; y += 4) {
+     // Left pixels of rows y .. y + 3, one per 32-bit lane.
+     const __m128i left = cvtepu8_epi32(Load4(left_column + y));
+     uint8_t *const row = dst + y * stride;
+     // _mm_shuffle_epi32 needs an immediate, hence one explicit call per row:
+     // 0x00 / 0x55 / 0xaa / 0xff replicate lane 0 / 1 / 2 / 3.
+     __m128i left_y = _mm_shuffle_epi32(left, 0);
+     write_smooth_horizontal_sum4(row, &left_y, &weights, &scaled_top_right,
+                                  &round);
+     left_y = _mm_shuffle_epi32(left, 0x55);
+     write_smooth_horizontal_sum4(row + stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+     left_y = _mm_shuffle_epi32(left, 0xaa);
+     write_smooth_horizontal_sum4(row + 2 * stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+     left_y = _mm_shuffle_epi32(left, 0xff);
+     write_smooth_horizontal_sum4(row + 3 * stride, &left_y, &weights,
+                                  &scaled_top_right, &round);
+   }
+ }
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+ // SMOOTH_H prediction for an 8x4 block (SSSE3).
+ // Reads the top-right pixel |top_row[7]|, four pixels from |left_column| and
+ // eight column weights from smooth_weights + 4, then writes four rows of
+ // eight pixels through write_smooth_directional_sum8().
+ void aom_smooth_h_predictor_8x4_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   const __m128i top_right = _mm_set1_epi16(top_row[7]);
+   const __m128i left = cvtepu8_epi16(Load4(left_column));
+   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+   // (scale - weight) * top_right, one value per output column.
+   const __m128i scaled_top_right =
+       _mm_mullo_epi16(_mm_sub_epi16(scale, weights), top_right);
+   for (int y = 0; y < 4; ++y) {
+     // pshufb control replicating 16-bit lane |y| of |left| into every lane.
+     const __m128i broadcast_row = _mm_set1_epi32(0x01000100 + y * 0x02020202);
+     const __m128i left_y = _mm_shuffle_epi8(left, broadcast_row);
+     write_smooth_directional_sum8(dst + y * stride, &left_y, &weights,
+                                   &scaled_top_right, &round);
+   }
+ }
+
+ // SMOOTH_H prediction for an 8x8 block (SSSE3).
+ // Reads the top-right pixel |top_row[7]|, eight pixels from |left_column| and
+ // eight column weights from smooth_weights + 4, then writes eight rows of
+ // eight pixels through write_smooth_directional_sum8().
+ void aom_smooth_h_predictor_8x8_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   const __m128i top_right = _mm_set1_epi16(top_row[7]);
+   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+   // (scale - weight) * top_right, one value per output column.
+   const __m128i scaled_top_right =
+       _mm_mullo_epi16(_mm_sub_epi16(scale, weights), top_right);
+   for (int y = 0; y < 8; ++y) {
+     // pshufb control replicating 16-bit lane |y| of |left| into every lane.
+     const __m128i broadcast_row = _mm_set1_epi32(0x01000100 + y * 0x02020202);
+     const __m128i left_y = _mm_shuffle_epi8(left, broadcast_row);
+     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+                                   &round);
+     dst += stride;
+   }
+ }
+
+ // SMOOTH_H prediction for an 8x16 block (SSSE3).
+ // Reads the top-right pixel |top_row[7]|, sixteen pixels from |left_column|
+ // (eight at a time) and eight column weights from smooth_weights + 4, then
+ // writes sixteen rows of eight pixels through write_smooth_directional_sum8().
+ void aom_smooth_h_predictor_8x16_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   const __m128i top_right = _mm_set1_epi16(top_row[7]);
+   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+   // (scale - weight) * top_right, one value per output column.
+   const __m128i scaled_top_right =
+       _mm_mullo_epi16(_mm_sub_epi16(scale, weights), top_right);
+   for (int first_row = 0; first_row < 16; first_row += 8) {
+     // Left pixels of rows first_row .. first_row + 7, widened to 16 bits.
+     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + first_row));
+     for (int y = 0; y < 8; ++y) {
+       // pshufb control replicating 16-bit lane |y| of |left| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + y * 0x02020202);
+       const __m128i left_y = _mm_shuffle_epi8(left, broadcast_row);
+       write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+                                     &round);
+       dst += stride;
+     }
+   }
+ }
+
+ // SMOOTH_H prediction for an 8x32 block (SSSE3).
+ // Reads the top-right pixel |top_row[7]|, 32 pixels from |left_column| (eight
+ // at a time) and eight column weights from smooth_weights + 4, then writes 32
+ // rows of eight pixels through write_smooth_directional_sum8().
+ void aom_smooth_h_predictor_8x32_ssse3(
+     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+     const uint8_t *LIBAOM_RESTRICT top_row,
+     const uint8_t *LIBAOM_RESTRICT left_column) {
+   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+   const __m128i top_right = _mm_set1_epi16(top_row[7]);
+   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+   // (scale - weight) * top_right, one value per output column.
+   const __m128i scaled_top_right =
+       _mm_mullo_epi16(_mm_sub_epi16(scale, weights), top_right);
+   for (int first_row = 0; first_row < 32; first_row += 8) {
+     // Left pixels of rows first_row .. first_row + 7, widened to 16 bits.
+     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + first_row));
+     for (int y = 0; y < 8; ++y) {
+       // pshufb control replicating 16-bit lane |y| of |left| into every lane.
+       const __m128i broadcast_row =
+           _mm_set1_epi32(0x01000100 + y * 0x02020202);
+       const __m128i left_y = _mm_shuffle_epi8(left, broadcast_row);
+       write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+                                     &round);
+       dst += stride;
+     }
+   }
+ }
+
+// Smooth-horizontal intra predictor, 16 columns x 4 rows (SSSE3).
+// The 16 column weights start at smooth_weights + 12 and are widened to two
+// vectors of eight 16-bit values; each row is written by
+// write_smooth_directional_sum16().
+void aom_smooth_h_predictor_16x4_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i weights_u8 = LoadUnaligned16(smooth_weights + 12);
+  const __m128i weights_lo = cvtepu8_epi16(weights_u8);
+  const __m128i weights_hi = cvtepu8_epi16(_mm_srli_si128(weights_u8, 8));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
+  // (scale - weight) * top_right for columns 0-7 and 8-15.
+  const __m128i scaled_top_right_lo =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_lo), top_right);
+  const __m128i scaled_top_right_hi =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_hi), top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i left = cvtepu8_epi16(Load4(left_column));
+  for (int row = 0; row < 4; ++row) {
+    // Broadcast 16-bit element |row| of |left| to all lanes.
+    const __m128i row_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+    write_smooth_directional_sum16(dst, left_y, left_y, weights_lo, weights_hi,
+                                   scaled_top_right_lo, scaled_top_right_hi,
+                                   round);
+    dst += stride;
+  }
+}
+
+// Smooth-horizontal intra predictor, 16 columns x 8 rows (SSSE3).
+// The 16 column weights start at smooth_weights + 12 and are widened to two
+// vectors of eight 16-bit values; each row is written by
+// write_smooth_directional_sum16().
+void aom_smooth_h_predictor_16x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i weights_u8 = LoadUnaligned16(smooth_weights + 12);
+  const __m128i weights_lo = cvtepu8_epi16(weights_u8);
+  const __m128i weights_hi = cvtepu8_epi16(_mm_srli_si128(weights_u8, 8));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
+  // (scale - weight) * top_right for columns 0-7 and 8-15.
+  const __m128i scaled_top_right_lo =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_lo), top_right);
+  const __m128i scaled_top_right_hi =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_hi), top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+  for (int row = 0; row < 8; ++row) {
+    // Broadcast 16-bit element |row| of |left| to all lanes.
+    const __m128i row_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+    write_smooth_directional_sum16(dst, left_y, left_y, weights_lo, weights_hi,
+                                   scaled_top_right_lo, scaled_top_right_hi,
+                                   round);
+    dst += stride;
+  }
+}
+
+// Smooth-horizontal intra predictor, 16 columns x 16 rows (SSSE3).
+// The 16 column weights start at smooth_weights + 12 and are widened to two
+// vectors of eight 16-bit values; each row is written by
+// write_smooth_directional_sum16().
+void aom_smooth_h_predictor_16x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i weights_u8 = LoadUnaligned16(smooth_weights + 12);
+  const __m128i weights_lo = cvtepu8_epi16(weights_u8);
+  const __m128i weights_hi = cvtepu8_epi16(_mm_srli_si128(weights_u8, 8));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
+  // (scale - weight) * top_right for columns 0-7 and 8-15.
+  const __m128i scaled_top_right_lo =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_lo), top_right);
+  const __m128i scaled_top_right_hi =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_hi), top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Two groups of eight left pixels cover the 16 rows.
+  for (int left_offset = 0; left_offset < 16; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      write_smooth_directional_sum16(dst, left_y, left_y, weights_lo,
+                                     weights_hi, scaled_top_right_lo,
+                                     scaled_top_right_hi, round);
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 16 columns x 32 rows (SSSE3).
+// The 16 column weights start at smooth_weights + 12 and are widened to two
+// vectors of eight 16-bit values; each row is written by
+// write_smooth_directional_sum16().
+void aom_smooth_h_predictor_16x32_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i weights_u8 = LoadUnaligned16(smooth_weights + 12);
+  const __m128i weights_lo = cvtepu8_epi16(weights_u8);
+  const __m128i weights_hi = cvtepu8_epi16(_mm_srli_si128(weights_u8, 8));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
+  // (scale - weight) * top_right for columns 0-7 and 8-15.
+  const __m128i scaled_top_right_lo =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_lo), top_right);
+  const __m128i scaled_top_right_hi =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_hi), top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Four groups of eight left pixels cover the 32 rows.
+  for (int left_offset = 0; left_offset < 32; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      write_smooth_directional_sum16(dst, left_y, left_y, weights_lo,
+                                     weights_hi, scaled_top_right_lo,
+                                     scaled_top_right_hi, round);
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 16 columns x 64 rows (SSSE3).
+// The 16 column weights start at smooth_weights + 12 and are widened to two
+// vectors of eight 16-bit values; each row is written by
+// write_smooth_directional_sum16().
+void aom_smooth_h_predictor_16x64_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i weights_u8 = LoadUnaligned16(smooth_weights + 12);
+  const __m128i weights_lo = cvtepu8_epi16(weights_u8);
+  const __m128i weights_hi = cvtepu8_epi16(_mm_srli_si128(weights_u8, 8));
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[15]);
+  // (scale - weight) * top_right for columns 0-7 and 8-15.
+  const __m128i scaled_top_right_lo =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_lo), top_right);
+  const __m128i scaled_top_right_hi =
+      _mm_mullo_epi16(_mm_sub_epi16(scale, weights_hi), top_right);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Eight groups of eight left pixels cover the 64 rows.
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      write_smooth_directional_sum16(dst, left_y, left_y, weights_lo,
+                                     weights_hi, scaled_top_right_lo,
+                                     scaled_top_right_hi, round);
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 32 columns x 8 rows (SSSE3).
+// The 32 column weights start at smooth_weights + 28. They are widened into
+// four vectors of eight 16-bit values, and each row is written as two
+// 16-pixel halves by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_32x8_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
+  __m128i weights[4];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[4];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 2; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 28 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 4; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+  for (int row = 0; row < 8; ++row) {
+    // Broadcast 16-bit element |row| of |left| to all lanes.
+    const __m128i row_select = _mm_set1_epi32(0x01000100 + row * 0x02020202);
+    const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+    for (int x = 0; x < 32; x += 16) {
+      const int k = x >> 3;  // index of the first weight vector for column x
+      write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                     weights[k + 1], scaled_top_right[k],
+                                     scaled_top_right[k + 1], round);
+    }
+    dst += stride;
+  }
+}
+
+// Smooth-horizontal intra predictor, 32 columns x 16 rows (SSSE3).
+// The 32 column weights start at smooth_weights + 28. They are widened into
+// four vectors of eight 16-bit values, and each row is written as two
+// 16-pixel halves by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_32x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
+  __m128i weights[4];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[4];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 2; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 28 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 4; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Two groups of eight left pixels cover the 16 rows.
+  for (int left_offset = 0; left_offset < 16; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 32; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 32 columns x 32 rows (SSSE3).
+// The 32 column weights start at smooth_weights + 28. They are widened into
+// four vectors of eight 16-bit values, and each row is written as two
+// 16-pixel halves by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_32x32_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
+  __m128i weights[4];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[4];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 2; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 28 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 4; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Four groups of eight left pixels cover the 32 rows.
+  for (int left_offset = 0; left_offset < 32; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 32; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 32 columns x 64 rows (SSSE3).
+// The 32 column weights start at smooth_weights + 28. They are widened into
+// four vectors of eight 16-bit values, and each row is written as two
+// 16-pixel halves by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_32x64_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[31]);
+  __m128i weights[4];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[4];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 2; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 28 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 4; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Eight groups of eight left pixels cover the 64 rows.
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 32; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 64 columns x 16 rows (SSSE3).
+// The 64 column weights start at smooth_weights + 60. They are widened into
+// eight vectors of eight 16-bit values, and each row is written as four
+// 16-pixel pieces by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_64x16_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
+  __m128i weights[8];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[8];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 4; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 60 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 8; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Two groups of eight left pixels cover the 16 rows.
+  for (int left_offset = 0; left_offset < 16; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 64; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 64 columns x 32 rows (SSSE3).
+// The 64 column weights start at smooth_weights + 60. They are widened into
+// eight vectors of eight 16-bit values, and each row is written as four
+// 16-pixel pieces by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_64x32_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
+  __m128i weights[8];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[8];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 4; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 60 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 8; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Four groups of eight left pixels cover the 32 rows.
+  for (int left_offset = 0; left_offset < 32; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 64; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
+
+// Smooth-horizontal intra predictor, 64 columns x 64 rows (SSSE3).
+// The 64 column weights start at smooth_weights + 60. They are widened into
+// eight vectors of eight 16-bit values, and each row is written as four
+// 16-pixel pieces by write_smooth_directional_sum16().
+void aom_smooth_h_predictor_64x64_ssse3(
+    uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+    const uint8_t *LIBAOM_RESTRICT top_row,
+    const uint8_t *LIBAOM_RESTRICT left_column) {
+  const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i top_right = _mm_set1_epi16(top_row[63]);
+  __m128i weights[8];           // weights[k] covers columns 8k .. 8k + 7
+  __m128i scaled_top_right[8];  // (scale - weights[k]) * top_right
+  for (int i = 0; i < 4; ++i) {
+    const __m128i w8 = LoadUnaligned16(smooth_weights + 60 + 16 * i);
+    weights[2 * i] = cvtepu8_epi16(w8);
+    weights[2 * i + 1] = cvtepu8_epi16(_mm_srli_si128(w8, 8));
+  }
+  for (int k = 0; k < 8; ++k) {
+    scaled_top_right[k] =
+        _mm_mullo_epi16(_mm_sub_epi16(scale, weights[k]), top_right);
+  }
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+  // Eight groups of eight left pixels cover the 64 rows.
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+    for (int row = 0; row < 8; ++row) {
+      // Broadcast 16-bit element |row| of |left| to all lanes.
+      const __m128i row_select =
+          _mm_set1_epi32(0x01000100 + row * 0x02020202);
+      const __m128i left_y = _mm_shuffle_epi8(left, row_select);
+      for (int x = 0; x < 64; x += 16) {
+        const int k = x >> 3;  // first weight vector for column x
+        write_smooth_directional_sum16(dst + x, left_y, left_y, weights[k],
+                                       weights[k + 1], scaled_top_right[k],
+                                       scaled_top_right[k + 1], round);
+      }
+      dst += stride;
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_utils.h b/third_party/aom/aom_dsp/x86/intrapred_utils.h
new file mode 100644
index 0000000000..502574673e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_utils.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+/* Byte-index table, 8 rows of 16 entries. In row k each 8-byte half starts
+ * with k zero entries; the low half then continues k, k+2, k+4, ... and the
+ * high half continues k+1, k+3, k+5, ... (row 0 is "even bytes, then odd
+ * bytes").
+ * NOTE(review): presumably consumed as byte-shuffle control masks by the
+ * directional intra predictors; no user is visible in this header. */
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+ { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+ { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+ { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+ { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+ { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+/* Byte-index table, 16 rows of 16 entries. Row k is k copies of index 0
+ * followed by 0, 1, ..., 15 - k, so used as a byte-shuffle control it would
+ * move the source up by k bytes and replicate byte 0 into the vacated slots.
+ * NOTE(review): the shuffle usage is an assumption; confirm with the callers,
+ * which are outside this header. */
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+ { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+ { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+/* 8 rows of eight 32-bit lanes, 32-byte aligned. Row k has its first k + 1
+ * lanes set to -1 (all bits set) and the remaining lanes cleared.
+ * NOTE(review): presumably a per-lane keep/clear mask for 256-bit loads or
+ * ANDs in the AVX2 predictors; confirm with the callers. */
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+/* Transposes a 4x16 block of bytes. x[0..3] are the four 16-byte input rows.
+ * On return the low 4 bytes of d[i] (i = 0..15) hold column i of the input,
+ * i.e. row i of the 16x4 result. The upper bytes of each d[i] are by-products
+ * of the interleave/shift steps and should be ignored by the caller.
+ * d must provide room for 16 vectors. */
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(x[0], x[1]); // rows 0/1 interleaved, columns 0-7
+ w1 = _mm_unpacklo_epi8(x[2], x[3]); // rows 2/3 interleaved, columns 0-7
+ w2 = _mm_unpackhi_epi8(x[0], x[1]); // rows 0/1 interleaved, columns 8-15
+ w3 = _mm_unpackhi_epi8(x[2], x[3]); // rows 2/3 interleaved, columns 8-15
+ // After the 16-bit interleave every 4-byte group is one complete column.
+ ww0 = _mm_unpacklo_epi16(w0, w1); // columns 0-3
+ ww1 = _mm_unpacklo_epi16(w2, w3); // columns 8-11
+ ww2 = _mm_unpackhi_epi16(w0, w1); // columns 4-7
+ ww3 = _mm_unpackhi_epi16(w2, w3); // columns 12-15
+ // The 32-bit interleave pairs column c with column c + 8.
+ w0 = _mm_unpacklo_epi32(ww0, ww1); // columns 0, 8, 1, 9
+ w2 = _mm_unpacklo_epi32(ww2, ww3); // columns 4, 12, 5, 13
+ w1 = _mm_unpackhi_epi32(ww0, ww1); // columns 2, 10, 3, 11
+ w3 = _mm_unpackhi_epi32(ww2, ww3); // columns 6, 14, 7, 15
+ // d[c] for c = 0..3 now holds columns c, c + 8, c + 4, c + 12 in that order.
+ d[0] = _mm_unpacklo_epi64(w0, w2);
+ d[1] = _mm_unpackhi_epi64(w0, w2);
+ d[2] = _mm_unpacklo_epi64(w1, w3);
+ d[3] = _mm_unpackhi_epi64(w1, w3);
+ // An 8-byte shift brings columns 4-7 down to the low 4 bytes.
+ d[4] = _mm_srli_si128(d[0], 8);
+ d[5] = _mm_srli_si128(d[1], 8);
+ d[6] = _mm_srli_si128(d[2], 8);
+ d[7] = _mm_srli_si128(d[3], 8);
+ // A 4-byte shift brings columns 8-11 down to the low 4 bytes.
+ d[8] = _mm_srli_si128(d[0], 4);
+ d[9] = _mm_srli_si128(d[1], 4);
+ d[10] = _mm_srli_si128(d[2], 4);
+ d[11] = _mm_srli_si128(d[3], 4);
+ // A 12-byte shift brings columns 12-15 down to the low 4 bytes.
+ d[12] = _mm_srli_si128(d[0], 12);
+ d[13] = _mm_srli_si128(d[1], 12);
+ d[14] = _mm_srli_si128(d[2], 12);
+ d[15] = _mm_srli_si128(d[3], 12);
+}
+/* Transposes a 16x16 block of bytes held in registers: x[0..15] are the input
+ * rows and d[i] receives column i of the input (row i of the transpose).
+ * The work is split by input column range: the low 8 bytes of every row
+ * produce d[0..7], the high 8 bytes produce d[8..15]. Within each half the
+ * same 8 -> 16 -> 32 -> 64 bit interleave ladder is applied. x is only read. */
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(x[0], x[1]); // lower half: input columns 0-7
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1); // columns 0-3 of rows 0-3
+ w5 = _mm_unpacklo_epi16(w2, w3); // columns 0-3 of rows 4-7
+ w12 = _mm_unpacklo_epi16(w8, w9); // columns 0-3 of rows 8-11
+ w13 = _mm_unpacklo_epi16(w10, w11); // columns 0-3 of rows 12-15
+
+ w6 = _mm_unpacklo_epi32(w4, w5); // columns 0-1 of rows 0-7
+ w7 = _mm_unpackhi_epi32(w4, w5); // columns 2-3 of rows 0-7
+ w14 = _mm_unpacklo_epi32(w12, w13); // columns 0-1 of rows 8-15
+ w15 = _mm_unpackhi_epi32(w12, w13); // columns 2-3 of rows 8-15
+
+ // Output rows 0-3 (input columns 0-3)
+ d[0] = _mm_unpacklo_epi64(w6, w14);
+ d[1] = _mm_unpackhi_epi64(w6, w14);
+ d[2] = _mm_unpacklo_epi64(w7, w15);
+ d[3] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1); // columns 4-7 of rows 0-3
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Output rows 4-7 (input columns 4-7)
+ d[4] = _mm_unpacklo_epi64(w6, w14);
+ d[5] = _mm_unpackhi_epi64(w6, w14);
+ d[6] = _mm_unpacklo_epi64(w7, w15);
+ d[7] = _mm_unpackhi_epi64(w7, w15);
+
+ // upper half: input columns 8-15, same ladder as above
+ w0 = _mm_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Output rows 8-11 (input columns 8-11)
+ d[8] = _mm_unpacklo_epi64(w6, w14);
+ d[9] = _mm_unpackhi_epi64(w6, w14);
+ d[10] = _mm_unpacklo_epi64(w7, w15);
+ d[11] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Output rows 12-15 (input columns 12-15)
+ d[12] = _mm_unpacklo_epi64(w6, w14);
+ d[13] = _mm_unpackhi_epi64(w6, w14);
+ d[14] = _mm_unpacklo_epi64(w7, w15);
+ d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+// Transposes one 16x16 byte tile from memory to memory. Rows of the source
+// tile are pitchSrc bytes apart, rows of the destination tile pitchDst bytes
+// apart. Unaligned loads and stores are used, so neither pointer needs any
+// particular alignment.
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+                               uint8_t *dst, ptrdiff_t pitchDst) {
+  __m128i tile_in[16];
+  __m128i tile_out[16];
+  int row;
+
+  // Pull the 16 source rows into registers.
+  for (row = 0; row < 16; ++row) {
+    const uint8_t *const src_row = src + row * pitchSrc;
+    tile_in[row] = _mm_loadu_si128((__m128i *)src_row);
+  }
+
+  transpose16x16_sse2(tile_in, tile_out);
+
+  // Write the transposed rows back out.
+  for (row = 0; row < 16; ++row) {
+    uint8_t *const dst_row = dst + row * pitchDst;
+    _mm_storeu_si128((__m128i *)dst_row, tile_out[row]);
+  }
+}
+
+// Transposes a byte image tile by tile using 16x16 blocks. `width` and
+// `height` bound the destination column and row offsets respectively (the
+// source is indexed with the two swapped); both are walked in steps of 16, so
+// any partial tile is still processed as a full 16x16 block.
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+                      ptrdiff_t pitchDst, int width, int height) {
+  int out_row, out_col;
+  for (out_row = 0; out_row < height; out_row += 16) {
+    for (out_col = 0; out_col < width; out_col += 16) {
+      // The destination tile at (out_row, out_col) comes from the source tile
+      // at (out_col, out_row).
+      const uint8_t *const src_tile = src + out_col * pitchSrc + out_row;
+      uint8_t *const dst_tile = dst + out_row * pitchDst + out_col;
+      transpose_TX_16X16(src_tile, pitchSrc, dst_tile, pitchDst);
+    }
+  }
+}
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
diff --git a/third_party/aom/aom_dsp/x86/intrapred_x86.h b/third_party/aom/aom_dsp/x86/intrapred_x86.h
new file mode 100644
index 0000000000..b13f575a76
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_x86.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+// Sums the 16 bytes at `ref` (aligned load, so `ref` must be 16-byte aligned).
+// The total ends up in the low 16-bit lane of the returned vector.
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+  const __m128i pixels = _mm_load_si128((__m128i const *)ref);
+  // A SAD against zero produces one byte-sum per 64-bit half.
+  const __m128i half_sums = _mm_sad_epu8(pixels, _mm_setzero_si128());
+  // Fold the upper half's sum onto the lower one.
+  const __m128i upper_sum = _mm_unpackhi_epi64(half_sums, half_sums);
+  return _mm_add_epi16(half_sums, upper_sum);
+}
+
+// Sums the 32 bytes at `ref` (two aligned 16-byte loads, so `ref` must be
+// 16-byte aligned). The total ends up in the low 16-bit lane of the result.
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i first16 = _mm_load_si128((__m128i const *)ref);
+  const __m128i second16 = _mm_load_si128((__m128i const *)(ref + 16));
+  // Each SAD against zero yields one byte-sum per 64-bit half.
+  const __m128i half_sums = _mm_add_epi16(_mm_sad_epu8(first16, zero),
+                                          _mm_sad_epu8(second16, zero));
+  // Fold the upper half's sum onto the lower one.
+  return _mm_add_epi16(half_sums, _mm_unpackhi_epi64(half_sums, half_sums));
+}
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..0bc841a7a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,107 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0 ; takes no arguments, emits no instructions
+ ; a c d b to a b c d (what m0..m3 name before and after the macro)
+ SWAP 1, 3, 2 ; x86inc SWAP re-maps the m1/m2/m3 names at assembly time
+%endmacro
+
+%macro TRANSFORM_COLS 0 ; one 1-D pass on 16-bit lanes; clobbers m4 and m5
+ ; input (only the low four words of each register are carried through):
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2 ; a += c
+ psubw m3, m1 ; d -= b
+
+ ; wide subtract: (a - d) is formed in 32 bits so it cannot wrap at 16 bits
+ punpcklwd m4, m0 ; a -> high word of each dword (low word is stale m4)
+ punpcklwd m5, m3 ; d -> high word of each dword (low word is stale m5)
+ psrad m4, 16 ; sign-extend a; the stale low words are shifted out
+ psrad m5, 16 ; sign-extend d
+ psubd m4, m5 ; a - d
+ psrad m4, 1 ; (a - d) >> 1
+ packssdw m4, m4 ; e = (a - d) >> 1, narrowed back to 16 bits
+
+ psubw m5, m4, m1 ; new b = e - b (3-operand form, result lands in m5)
+ psubw m4, m2 ; new c = e - c (result lands in m4)
+ psubw m0, m5 ; a -= new b
+ paddw m3, m4 ; d += new c
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0 ; 4x4 int16 transpose; rows in the low halves of m0..m3
+ punpcklwd m0, m2 ; rows 0 and 2 interleaved word by word
+ punpcklwd m1, m3 ; rows 1 and 3 interleaved word by word
+ mova m2, m0 ; keep a copy, m0 is overwritten next
+ punpcklwd m0, m1 ; m0 = column 0 (low half), column 1 (high half)
+ punpckhwd m2, m1 ; m2 = column 2 (low half), column 3 (high half)
+ pshufd m1, m0, 0x0e ; m1 low half = column 1
+ pshufd m3, m2, 0x0e ; m3 low half = column 3
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0 ; in: m0 = rows 0,1  m1 = rows 2,3 (8 words each)
+ mova m3, m0 ; keep a copy, m0 is overwritten next
+ punpcklwd m0, m1 ; rows 0 and 2 interleaved word by word
+ punpckhwd m3, m1 ; rows 1 and 3 interleaved word by word
+ mova m2, m0 ; keep a copy, m0 is overwritten next
+ punpcklwd m0, m3 ; m0 = column 0 (low half), column 1 (high half)
+ punpckhwd m2, m3 ; m2 = column 2 (low half), column 3 (high half)
+ pshufd m1, m0, 0x0e ; m1 low half = column 1
+ pshufd m3, m2, 0x0e ; m3 low half = column 3
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero (all are m-register numbers)
+ movd m%3, [outputq] ; 4 pixels of the current row
+ movd m%4, [outputq + strideq] ; 4 pixels of the next row
+ punpcklbw m%3, m%5 ; widen pixels to 16 bits (interleave with the zero reg)
+ punpcklbw m%4, m%5
+ paddw m%1, m%3 ; src1 += pixels of the current row
+ paddw m%2, m%4 ; src2 += pixels of the next row
+ packuswb m%1, m%5 ; saturate to 0..255 and narrow back to bytes
+ packuswb m%2, m%5
+ movd [outputq], m%1 ; write 4 pixels back; src1/src2 are clobbered
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2 ; x86inc: m0..m7 map to xmm registers, symbol gets the _sse2 suffix
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride ; 3 args, 3 GPRs, 7 xmm regs
+ mova m0, [inputq + 0] ; 4 dword coefficients (aligned load: input must be 16-byte aligned)
+ packssdw m0, [inputq + 16] ; m0 = first 8 coefficients narrowed to int16 with saturation
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48] ; m1 = last 8 coefficients narrowed to int16 with saturation
+ psraw m0, 2 ; arithmetic >> 2 on every coefficient before the transform
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE ; columns of the 4x4 block -> low halves of m0..m3
+ REORDER_INPUTS
+ TRANSFORM_COLS ; first 1-D pass
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS ; second 1-D pass, along the other dimension
+
+ pxor m4, m4 ; zero register for the widen/narrow steps in ADD_STORE_4P_2X
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4 ; add m0/m1 to output rows 0 and 1
+ lea outputq, [outputq + 2 * strideq] ; NOTE(review): strideq is used 64-bit wide with no sign-extension visible here - confirm the caller/ABI provides it
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4 ; add m2/m3 to output rows 2 and 3
+
+ RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
new file mode 100644
index 0000000000..16d2f4be7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+// Sum of absolute differences between two 4-pixel-wide blocks. Four rows are
+// consumed per iteration, so `height` is effectively rounded up to a multiple
+// of four. `width` is only checked; it exists for signature parity.
+static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride,
+                                const uint8_t *b, int b_stride, int width,
+                                int height) {
+  assert(width == 4);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; row += 4) {
+    // Pack four 4-byte rows of each block into a single 128-bit register.
+    const __m128i a_rows01 = _mm_unpacklo_epi32(xx_loadl_32(a + 0 * a_stride),
+                                                xx_loadl_32(a + 1 * a_stride));
+    const __m128i a_rows23 = _mm_unpacklo_epi32(xx_loadl_32(a + 2 * a_stride),
+                                                xx_loadl_32(a + 3 * a_stride));
+    const __m128i a_block = _mm_unpacklo_epi64(a_rows01, a_rows23);
+
+    const __m128i b_rows01 = _mm_unpacklo_epi32(xx_loadl_32(b + 0 * b_stride),
+                                                xx_loadl_32(b + 1 * b_stride));
+    const __m128i b_rows23 = _mm_unpacklo_epi32(xx_loadl_32(b + 2 * b_stride),
+                                                xx_loadl_32(b + 3 * b_stride));
+    const __m128i b_block = _mm_unpacklo_epi64(b_rows01, b_rows23);
+
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(a_block, b_block));
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+  // The accumulator carries two 32-bit partial SADs, at bits [0:31] and
+  // [64:95]; add them to get the block total.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+
+// Sum of absolute differences between two 8-pixel-wide blocks. Two rows are
+// consumed per iteration. `width` is only checked; it exists for signature
+// parity with the other sad<W>xh helpers.
+static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride,
+                                const uint8_t *b, int b_stride, int width,
+                                int height) {
+  assert(width == 8);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; row += 2) {
+    // Two 8-byte rows of each block share one 128-bit register.
+    const __m128i a_rows = _mm_unpacklo_epi64(xx_loadl_64(a + 0 * a_stride),
+                                              xx_loadl_64(a + 1 * a_stride));
+    const __m128i b_rows = _mm_unpacklo_epi64(xx_loadl_64(b + 0 * b_stride),
+                                              xx_loadl_64(b + 1 * b_stride));
+
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(a_rows, b_rows));
+
+    a += 2 * a_stride;
+    b += 2 * b_stride;
+  }
+
+  // Combine the partial sums held in the two 64-bit halves.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+
+// Sum of absolute differences between two 16-pixel-wide blocks, one row per
+// iteration, using unaligned loads. `width` is only checked.
+static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int width,
+                                 int height) {
+  assert(width == 16);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    const __m128i a_row = xx_loadu_128(a);
+    const __m128i b_row = xx_loadu_128(b);
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(a_row, b_row));
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the two 64-bit halves.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+
+// Sum of absolute differences between two 32-pixel-wide blocks. Every row is
+// handled as two unaligned 16-byte chunks. `width` is only checked.
+static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int width,
+                                 int height) {
+  assert(width == 32);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int chunk = 0; chunk < 2; ++chunk) {
+      const __m128i a_chunk = xx_loadu_128(a + chunk * 16);
+      const __m128i b_chunk = xx_loadu_128(b + chunk * 16);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(a_chunk, b_chunk));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the two 64-bit halves.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+
+// Sum of absolute differences between two 64-pixel-wide blocks. Every row is
+// handled as four unaligned 16-byte chunks. `width` is only checked.
+static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int width,
+                                 int height) {
+  assert(width == 64);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int chunk = 0; chunk < 4; ++chunk) {
+      const __m128i a_chunk = xx_loadu_128(a + chunk * 16);
+      const __m128i b_chunk = xx_loadu_128(b + chunk * 16);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(a_chunk, b_chunk));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the two 64-bit halves.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+
+// Sum of absolute differences between two 128-pixel-wide blocks. Every row is
+// handled as eight unaligned 16-byte chunks. `width` is only checked.
+static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride, int width,
+                                  int height) {
+  assert(width == 128);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int chunk = 0; chunk < 8; ++chunk) {
+      const __m128i a_chunk = xx_loadu_128(a + chunk * 16);
+      const __m128i b_chunk = xx_loadu_128(b + chunk * 16);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(a_chunk, b_chunk));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the two 64-bit halves.
+  const int low_part = _mm_cvtsi128_si32(acc);
+  const int high_part = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+  return (unsigned int)(low_part + high_part);
+}
+/* Defines aom_dist_wtd_sad<m>x<n>_avg_sse2(). The generated function calls
+ * aom_dist_wtd_comp_avg_pred() to combine `ref` (stride ref_stride) with
+ * `second_pred` into comp_pred, an m * n byte array on the stack that is then
+ * read with stride m, and returns sad<m>xh_sse2() of `src` against it.
+ * Note the stack cost: 128x128 instantiates a 16 KiB local array. */
+#define DIST_WTD_SADMXN_SSE2(m, n) \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
+ }
+
+DIST_WTD_SADMXN_SSE2(128, 128)
+DIST_WTD_SADMXN_SSE2(128, 64)
+DIST_WTD_SADMXN_SSE2(64, 128)
+DIST_WTD_SADMXN_SSE2(64, 64)
+DIST_WTD_SADMXN_SSE2(64, 32)
+DIST_WTD_SADMXN_SSE2(32, 64)
+DIST_WTD_SADMXN_SSE2(32, 32)
+DIST_WTD_SADMXN_SSE2(32, 16)
+DIST_WTD_SADMXN_SSE2(16, 32)
+DIST_WTD_SADMXN_SSE2(16, 16)
+DIST_WTD_SADMXN_SSE2(16, 8)
+DIST_WTD_SADMXN_SSE2(8, 16)
+DIST_WTD_SADMXN_SSE2(8, 8)
+DIST_WTD_SADMXN_SSE2(8, 4)
+DIST_WTD_SADMXN_SSE2(4, 8)
+DIST_WTD_SADMXN_SSE2(4, 4)
+#if !CONFIG_REALTIME_ONLY  // these six shapes are compiled out of realtime-only builds
+DIST_WTD_SADMXN_SSE2(4, 16)
+DIST_WTD_SADMXN_SSE2(16, 4)
+DIST_WTD_SADMXN_SSE2(8, 32)
+DIST_WTD_SADMXN_SSE2(32, 8)
+DIST_WTD_SADMXN_SSE2(16, 64)
+DIST_WTD_SADMXN_SSE2(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 0000000000..dd798ca54a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+/* Declarations only: both bilinear passes are defined in another translation
+ * unit. In this file they are called from DIST_WTD_SUBPIX_AVG_VAR, the first
+ * pass with pixel_step 1 producing 16-bit intermediates for H + 1 rows. */
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+/* Second pass: called with pixel_step equal to the block width, it turns the
+ * 16-bit intermediates back into 8-bit pixels for H rows. */
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+// Blends 16 pixel pairs and stores the 16 resulting bytes to `result`
+// (unaligned store). For each byte position the output is
+//   (p0 * w_even + p1 * w_odd + r) >> DIST_PRECISION_BITS
+// saturated to 8 bits, where w_even / w_odd are the signed bytes found at the
+// even / odd positions of *w and r is the 16-bit rounding term in *r.
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  // Interleave so each 16-bit lane holds one (p0, p1) byte pair, matching
+  // the (w_even, w_odd) layout expected by the multiply-add.
+  const __m128i pairs_lo = _mm_unpacklo_epi8(*p0, *p1);
+  const __m128i pairs_hi = _mm_unpackhi_epi8(*p0, *p1);
+
+  const __m128i weighted_lo =
+      _mm_add_epi16(_mm_maddubs_epi16(pairs_lo, *w), *r);
+  const __m128i weighted_hi =
+      _mm_add_epi16(_mm_maddubs_epi16(pairs_hi, *w), *r);
+
+  const __m128i out_lo = _mm_srai_epi16(weighted_lo, DIST_PRECISION_BITS);
+  const __m128i out_hi = _mm_srai_epi16(weighted_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(out_lo, out_hi));
+}
+
+// Builds a distance-weighted compound prediction:
+//   comp_pred[i] = (ref[i] * fwd_offset + pred[i] * bck_offset + round)
+//                  >> DIST_PRECISION_BITS, saturated to 8 bits.
+//
+// comp_pred: output, written contiguously (row stride == width).
+// pred:      second predictor, read contiguously (row stride == width).
+// ref:       first predictor, rows are ref_stride bytes apart.
+// jcp_param: supplies fwd_offset (weight of ref) and bck_offset (weight of
+//            pred); both are narrowed to int8_t.
+//
+// Supported shapes, enforced by assertions only:
+//   width >= 16 : width must be a multiple of 16
+//   width == 8  : height must be even (two rows per iteration)
+//   width == 4  : height must be a multiple of 4 (four rows per iteration)
+void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                      int width, int height, const uint8_t *ref,
+                                      int ref_stride,
+                                      const DIST_WTD_COMP_PARAMS *jcp_param) {
+  int i;
+  const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+  const int8_t w1 = (int8_t)jcp_param->bck_offset;
+  // _mm_set_epi8 lists bytes from most to least significant, so w0 lands in
+  // the even byte positions (paired with ref) and w1 in the odd ones (pred).
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      // ref already advanced by width inside the row; skip the row padding.
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels two rows at a time
+    assert(!(width & 7));
+    // Two rows are consumed per iteration, so the height (not the width,
+    // which the assertion above already covers) has to be even.
+    assert(!(height & 1));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      // pred is contiguous, so one 16-byte load covers both 8-pixel rows.
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels four rows at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (i = 0; i < height; i += 4) {
+      const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride;
+      const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride;
+      const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride;
+      const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride;
+
+      // Gather four strided 4-pixel rows of ref into one register.
+      __m128i p0 =
+          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+                        row3[0], row3[1], row3[2], row3[3]);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+/* Defines aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_ssse3(). The generated
+ * function (1) runs the first bilinear pass over `a` with pixel_step 1 and
+ * filter bilinear_filters_2t[xoffset], producing H + 1 rows of 16-bit data in
+ * fdata3; (2) runs the second pass with pixel_step W and filter
+ * bilinear_filters_2t[yoffset] into the 8-bit buffer temp2; (3) blends temp2
+ * with `second_pred` via aom_dist_wtd_comp_avg_pred_ssse3() into temp3; and
+ * (4) returns aom_variance<W>x<H>() of temp3 (stride W) against `b`, which
+ * also receives `sse`. All three scratch buffers live on the stack. */
+#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
+ jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+DIST_WTD_SUBPIX_AVG_VAR(128, 128)
+DIST_WTD_SUBPIX_AVG_VAR(128, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 128)
+DIST_WTD_SUBPIX_AVG_VAR(64, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 64)
+DIST_WTD_SUBPIX_AVG_VAR(32, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 32)
+DIST_WTD_SUBPIX_AVG_VAR(16, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 16)
+DIST_WTD_SUBPIX_AVG_VAR(8, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 8)
+DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+
+#if !CONFIG_REALTIME_ONLY  // these six shapes are compiled out of realtime-only builds
+DIST_WTD_SUBPIX_AVG_VAR(4, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 4)
+DIST_WTD_SUBPIX_AVG_VAR(8, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 8)
+DIST_WTD_SUBPIX_AVG_VAR(16, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..6e77742e3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "config/aom_dsp_rtcd.h"
+/* Control mask for _mm256_shuffle_epi8 (used by the flat-filter path below).
+ * Index 128 has its top bit set, which makes the shuffle write a zero byte,
+ * so within each 128-bit lane the selected source bytes are zero-extended to
+ * 16-bit values: the low lane widens bytes 0-7, the high lane bytes 8-15. */
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+/* Loop filter across a horizontal edge, 16 pixels wide, touching up to three
+ * rows on each side (p2..p0 above `s`, q0..q2 from `s` downwards; rows are `p`
+ * bytes apart). Only element [0] of _blimit0, _limit0 and _thresh0 is read.
+ * Per pixel column: `mask` decides whether the edge is filtered at all, `hev`
+ * selects the high-edge-variance variant of the narrow filter, and `flat`
+ * selects the 6-tap smoothing filter computed in 16-bit lanes with AVX2.
+ * Returns without writing when no column passes `mask`. */
+void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i mask, flat;
+
+ const __m128i thresh_v = // thresh replicated into all 16 bytes
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v = // limit replicated into all 16 bytes
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v = // blimit replicated into all 16 bytes
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero); // all bits set
+
+ p256_2 = // each row: 16 bytes loaded once and copied to both 128-bit lanes
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+
+ p2 = _mm256_castsi256_si128(p256_2); // 128-bit views of the same rows
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 = // |a - b| built from two saturating subtractions
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); // reused below for hev and flat
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); // |p0 - q0| * 2, saturating
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); // |p1 - q1| / 2 per byte
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where the sum exceeds blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where every test stayed within its limit
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // no column to filter
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where max(|p1-p0|, |q1-q0|) > thresh
+
+ __m128i ps1 = _mm_xor_si128(p1, t80); // xor 0x80: unsigned pixel -> signed offset form
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p1, flat_p0, flat_q0, flat_q1;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a); // three saturating adds: filt += 3 * (qs0 - ps0)
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1); // sign mask; there is no 8-bit arithmetic shift
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0); // top three bits to refill for negative bytes
+ filter1 = _mm_and_si128(filter1, t1f); // drop bits shifted in from the neighbouring byte
+ filter1 = _mm_or_si128(filter1, work_a); // filter1 = (filt + 4) >> 3, signed per byte
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a); // filter2 = (filt + 3) >> 3, signed per byte
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a); // filt = (filter1 + 1) >> 1, signed per byte
+ filt = _mm_andnot_si128(hev, filt); // outer taps are adjusted only where hev is clear
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where all four differences are <= 1
+ flat = _mm_and_si128(flat, mask);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // at least one flat column
+ const __m256i four = _mm256_set1_epi16(4); // rounding term for the >> 3 below
+ __m256i pixetFilter, add, res;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter); // widen each row to sixteen 16-bit lanes
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+
+ pixetFilter = _mm256_slli_epi16(
+ _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
+ pixetFilter =
+ _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
+ pixetFilter = _mm256_add_epi16(four, pixetFilter); // 3*p2 + 2*p1 + 2*p0 + q0 + 4
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p1 = _mm256_castsi256_si128( // 168 = qwords {0, 2, 2, 2}: gathers both lanes' packed bytes
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p1 = _mm_andnot_si128(flat, ps1); // blend: smoothed value where flat, narrow result elsewhere
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
+ _mm256_sub_epi16(q256_0, p256_2));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add); // p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
+ _mm256_sub_epi16(q256_1, p256_1));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add); // p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add); // p0 + 2*q0 + 2*q1 + 3*q2 + 4
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2); // p2 is written back unmodified
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2); // q2 is written back unmodified
+ } else { // no flat column: only the four rows changed by the narrow filter are stored
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+/* 8-tap loop filter across a horizontal edge, applied to 16 adjacent columns.
+ *
+ * s         : address of the first byte of row q0. Rows p3..p0 are read from
+ *             s - 4 * p .. s - 1 * p and rows q0..q3 from s .. s + 3 * p,
+ *             16 bytes per row.
+ * p         : distance in bytes between consecutive rows.
+ * _blimit0, _limit0, _thresh0 : only element [0] of each array is read; the
+ *             byte is broadcast to all 16 lanes.
+ *
+ * Returns without storing anything when the filter mask is zero in every
+ * lane. Otherwise stores rows p2..q2 when at least one lane is flat, or only
+ * rows p1..q1 (the 4-tap result) when no lane is flat.
+ */
+void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, flat;
+ // Broadcast the three threshold bytes; ff is an all-ones constant used to invert compares.
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ // Load the eight rows; each 16-byte row is copied into both 128-bit lanes.
+ p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+ // 128-bit views of the same rows, used by the byte-wise mask and 4-tap code.
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ // Build the per-lane filter mask (0xff = filter this column, 0 = leave it).
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ // flat starts as max(|p1 - p0|, |q1 - q0|); it is reused for hev further down.
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // (line above) mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // (line above) folds abs(p1 - p0) and abs(q1 - q0) into the running maximum;
+ // the comparison against limit happens once, after all differences are merged.
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the merged maximum is <= limit
+ }
+ // Every mask lane is zero: nothing to filter, leave memory untouched.
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+ // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // Flip the sign bit so the pixels can be treated as signed bytes.
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+ // filt = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), all with signed saturation, then & mask.
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ // filter1 = filt + 4 and filter2 = filt + 3 (saturating).
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+ // Signed byte >> 3: logical 16-bit shift, keep the low 5 bits, OR 0xe0 into negative lanes.
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ // Same signed >> 3 for filter2, which is then added to p0.
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ // filt = (filter1 + 1) >> 1 (signed), zeroed where hev is set, applied to p1 and q1.
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // flat = 0xff where |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0| are all <= 1, and mask is set.
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // The 8-tap path runs only when at least one lane is flat.
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
+ sum_q, res_p, res_q;
+ // NOTE(review): filt_loopfilter_avx2 is defined outside this chunk; the 16-bit sums below presumably rely on this shuffle widening bytes to words - confirm.
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ // Shared base sum: 4 + p2 + p1 + p0 + q0 + q1 + q2. Each output below adjusts it incrementally.
+ p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+ // p0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3 where flat, else the 4-tap ps0. permute 168 moves qwords 0 and 2 of the packed bytes into the low 128 bits.
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+ // q0 = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3 where flat, else the 4-tap qs0.
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+ // p1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3 where flat, else the 4-tap ps1.
+ sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+ // q1 = (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3 where flat, else the 4-tap qs1.
+ sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+ // p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3 where flat, else the unmodified p2.
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+ // q2 = (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3 where flat, else the unmodified q2.
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+ // Write back the six rows p2..q2.
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ } else { // no flat lane: only the 4-tap results for p1..q1 are written
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+// Transposes a 16x16 block of bytes.
+//
+// Sixteen rows of 16 bytes are read starting at in0 (row stride in_p) and the
+// transposed rows are written starting at out (row stride out_p). Input rows
+// i and i + 8 share one 256-bit register, so each unpack stage works on both
+// halves of the block at once.
+//
+// is_store_avx2 != 0: output rows 2k and 2k + 1 are written together with a
+//   single 32-byte store at out + 2k * out_p, i.e. the second row of each pair
+//   lands 16 bytes after the first (this equals row 2k + 1 only for
+//   out_p == 16).
+// is_store_avx2 == 0: every output row is written with its own 16-byte store;
+//   all even rows are stored first, then all odd rows.
+static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
+                                                unsigned char *out, int out_p,
+                                                int is_store_avx2) {
+  __m256i src[8], bytes[8], words[8], dwords[8], rows[8];
+  int i, j;
+
+  // src[i] = { input row i (low lane), input row i + 8 (high lane) }.
+  for (i = 0; i < 8; ++i) {
+    const __m128i top = _mm_loadu_si128((__m128i *)(in0 + in_p * i));
+    const __m128i bottom = _mm_loadu_si128((__m128i *)(in0 + in_p * (i + 8)));
+    src[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(top), bottom, 0x1);
+  }
+
+  // Stage 1: interleave bytes of neighbouring rows (0|1, 2|3, 4|5, 6|7).
+  for (i = 0; i < 8; i += 2) {
+    bytes[i] = _mm256_unpacklo_epi8(src[i], src[i + 1]);
+    bytes[i + 1] = _mm256_unpackhi_epi8(src[i], src[i + 1]);
+  }
+
+  // Stage 2: interleave 16-bit pairs inside each group of four registers.
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 2; ++j) {
+      words[i + 2 * j] = _mm256_unpacklo_epi16(bytes[i + j], bytes[i + j + 2]);
+      words[i + 2 * j + 1] =
+          _mm256_unpackhi_epi16(bytes[i + j], bytes[i + j + 2]);
+    }
+  }
+
+  // Stage 3: interleave 32-bit groups between the two halves of the register
+  // set.
+  for (i = 0; i < 4; ++i) {
+    dwords[2 * i] = _mm256_unpacklo_epi32(words[i], words[i + 4]);
+    dwords[2 * i + 1] = _mm256_unpackhi_epi32(words[i], words[i + 4]);
+  }
+
+  // Swap the two middle qwords so that rows[k] holds output row 2k in its low
+  // lane and output row 2k + 1 in its high lane.
+  for (i = 0; i < 8; ++i) {
+    rows[i] = _mm256_permute4x64_epi64(dwords[i], 0xd8);
+  }
+
+  if (is_store_avx2) {
+    for (i = 0; i < 8; ++i) {
+      _mm256_storeu_si256((__m256i *)(out + ((2 * i) * out_p)), rows[i]);
+    }
+    return;
+  }
+
+  // Even output rows come from the low lanes ...
+  for (i = 0; i < 8; ++i) {
+    _mm_storeu_si128((__m128i *)(out + ((2 * i) * out_p)),
+                     _mm256_castsi256_si128(rows[i]));
+  }
+  // ... and odd output rows from the high lanes.
+  for (i = 0; i < 8; ++i) {
+    _mm_storeu_si128((__m128i *)(out + ((2 * i + 1) * out_p)),
+                     _mm256_extracti128_si256(rows[i], 1));
+  }
+}
+/* 14-tap loop filter across a horizontal edge, applied to 16 adjacent columns.
+ *
+ * s         : address of the first byte of row q0. Rows p3..p0 are read from
+ *             s - 4 * p .. s - 1 * p and rows q0..q3 from s .. s + 3 * p,
+ *             16 bytes per row. Rows p6..p4 (s - 7 * p .. s - 5 * p) and
+ *             q4..q6 (s + 4 * p .. s + 6 * p) are read only when at least one
+ *             lane is flat.
+ * p         : distance in bytes between consecutive rows.
+ * _blimit0, _limit0, _thresh0 : only element [0] of each array is read; the
+ *             byte is broadcast to all 16 lanes.
+ *
+ * Stores, depending on the per-lane decisions:
+ *   - nothing, when the filter mask is zero in every lane;
+ *   - rows p1..q1 (4-tap result), when no lane is flat;
+ *   - rows p2..q2, when some lane is flat but no lane is flat2;
+ *   - rows p5..q5, when some lane is flat2.
+ */
+void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m128i mask, flat;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ // Load rows p3..q3; each 16-byte row is copied into both 128-bit lanes.
+ __m256i p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ __m256i p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ __m256i p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ __m256i p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ __m256i q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ __m256i q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ __m256i q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ __m256i q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+ // 128-bit views of the same rows, used by the byte-wise mask and 4-tap code.
+ __m128i p3 = _mm256_castsi256_si128(p256_3);
+ __m128i p2 = _mm256_castsi256_si128(p256_2);
+ __m128i p1 = _mm256_castsi256_si128(p256_1);
+ __m128i p0 = _mm256_castsi256_si128(p256_0);
+ __m128i q0 = _mm256_castsi256_si128(q256_0);
+ __m128i q1 = _mm256_castsi256_si128(q256_1);
+ __m128i q2 = _mm256_castsi256_si128(q256_2);
+ __m128i q3 = _mm256_castsi256_si128(q256_3);
+ // Build the per-lane filter mask (0xff = filter this column, 0 = leave it).
+ {
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+ // flat starts as max(|p1 - p0|, |q1 - q0|); it is reused for hev further down.
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // (line above) mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // (line above) folds abs(p1 - p0) and abs(q1 - q0) into the running maximum;
+ // the comparison against limit happens once, after all differences are merged.
+ __m128i work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the merged maximum is <= limit
+ }
+ // Every mask lane is zero: nothing to filter, leave memory untouched.
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t4 = _mm_add_epi8(one, t3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t7f = _mm_sub_epi8(t80, one);
+ // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
+ __m128i hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // Flip the sign bit so the pixels can be treated as signed bytes.
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ // filt = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), all with signed saturation, then & mask.
+ __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ __m128i work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ // filter1 = filt + 4 and filter2 = filt + 3 (saturating).
+ __m128i filter1 = _mm_adds_epi8(filt, t4);
+ __m128i filter2 = _mm_adds_epi8(filt, t3);
+ // Signed byte >> 3: logical 16-bit shift, keep the low 5 bits, OR 0xe0 into negative lanes.
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ // Same signed >> 3 for filter2, which is then added to p0.
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ // filt = (filter1 + 1) >> 1 (signed), zeroed where hev is set, applied to p1 and q1.
+ filt = _mm_adds_epi8(filter1, one);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // The *q*256 registers below pair a p row (low 128 bits) with the matching q row (high 128 bits).
+ // Derive flat: 0xff where |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0| are all <= 1, and mask is set
+ __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
+ __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
+ __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
+ const __m256i ps0qs0256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
+ const __m256i ps1qs1256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
+ const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p2q2256));
+ const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p3q3256));
+ const __m256i max0_256 = _mm256_max_epu8(work01, work02);
+ const __m128i max1_256 =
+ _mm_max_epu8(_mm256_castsi256_si128(max0_256),
+ _mm256_extractf128_si256(max0_256, 1));
+ flat = _mm_max_epu8(max1_256, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // The 8-tap and 14-tap paths run only when at least one lane is flat.
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i flat256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ // The outer rows p4..p6 and q4..q6 are loaded only on this path.
+ __m256i p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+ __m256i q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+ __m256i p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ __m256i q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ __m256i p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ __m256i q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+
+ // Derive flat2: 0xff where |p4-p0|, |p5-p0|, |p6-p0| and the q-side equivalents are all <= 1, and flat is set
+ __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
+ __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
+ const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
+ const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p4q4256));
+ const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p5q5256));
+ const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p6q6256));
+ __m256i flat2_256 = _mm256_max_epu8(work1, work2);
+ flat2_256 = _mm256_max_epu8(flat2_256, work3);
+ __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
+ _mm256_extractf128_si256(flat2_256, 1));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ // NOTE(review): filt_loopfilter_avx2 is defined outside this chunk; the 16-bit sums below presumably rely on this shuffle widening bytes to words - confirm.
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ // 8-tap base sum: 4 + p2 + p1 + p0 + q0 + q1 + q2. Each output below adjusts it incrementally.
+ const __m256i p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ const __m256i q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ __m256i pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+ // packus(res_p, res_q) followed by permute 0xd8 yields { p result (low 128 bits), q result (high 128 bits) }.
+ // Derive p0 and q0: p0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3 where flat, else 4-tap ps0; q0 mirrored
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ __m256i res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ __m256i res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ __m256i flat_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
+ flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
+ p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+
+ // Derive p1 and q1: p1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3 where flat, else 4-tap ps1; q1 mirrored
+ __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ __m256i flat_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
+ flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
+ p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+
+ // Derive p2 and q2: p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3 where flat, else unmodified p2; q2 mirrored
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ __m256i flat_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
+ flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
+ p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { // some lane needs the 14-tap filter
+ flat2_256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ // 14-tap base sum, built in three steps: p5 + p4 + p3 + p2 + p1 + 2*p0 + 2*q0 + q1 + q2 + q3 + q4 + q5 + 8.
+ __m256i pixelFilter_p =
+ _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
+ __m256i pixelFilter_q =
+ _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixelFilter_q = pixelFilter_p;
+ // Each 14-tap result replaces the 8-tap/4-tap value only in lanes where flat2 is set; the q side mirrors the p side.
+ // Derive p0 and q0: p0 = (p6 + p5 + p4 + p3 + p2 + 2*p1 + 2*p0 + 2*q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4
+ pixelFilter_p =
+ _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ pixelFilter_q =
+ _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
+ flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
+ p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);
+
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ // Derive p1 and q1: p1 = (2*p6 + p5 + p4 + p3 + 2*p2 + 2*p1 + 2*p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
+ _mm256_sub_epi16(p256_2, q256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
+ flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
+ p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ // Derive p2 and q2: p2 = (3*p6 + p5 + p4 + 2*p3 + 2*p2 + 2*p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
+ _mm256_sub_epi16(p256_3, p256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
+ _mm256_sub_epi16(q256_3, q256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
+ flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
+ p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ // Derive p3 and q3: p3 = (4*p6 + p5 + 2*p4 + 2*p3 + 2*p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
+ _mm256_sub_epi16(p256_4, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
+ _mm256_sub_epi16(q256_4, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p3q3 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
+ flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
+ p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
+ p3 = _mm256_castsi256_si128(p3q3256);
+ q3 = _mm256_extractf128_si256(p3q3256, 1);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ // Derive p4 and q4: p4 = (5*p6 + 2*p5 + 2*p4 + 2*p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
+ _mm256_sub_epi16(p256_5, p256_2));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
+ _mm256_sub_epi16(q256_5, q256_2));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p4q4 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
+ flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
+ p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
+ _mm_storeu_si128((__m128i *)(s - 5 * p),
+ _mm256_castsi256_si128(p4q4256));
+ _mm_storeu_si128((__m128i *)(s + 4 * p),
+ _mm256_extractf128_si256(p4q4256, 1));
+
+ // Derive p5 and q5: p5 = (7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
+ _mm256_sub_epi16(p256_6, p256_3));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
+ _mm256_sub_epi16(q256_6, q256_3));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p5q5 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
+ flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
+ p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
+ _mm_storeu_si128((__m128i *)(s - 6 * p),
+ _mm256_castsi256_si128(p5q5256));
+ _mm_storeu_si128((__m128i *)(s + 5 * p),
+ _mm256_extractf128_si256(p5q5256, 1));
+ } else { // no flat2 lane: write the 8-tap/4-tap blend for rows p2..q2
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+ } else { // no flat lane: only the 4-tap results for p1..q1 are written
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+// 14-tap loop filter across a vertical edge covering 16 rows.
+//
+// The 16x16 block that starts 8 bytes to the left of s (row stride pitch) is
+// transposed into a 16-byte-aligned scratch tile, filtered there as a
+// horizontal edge located at scratch row 8, and then transposed back over the
+// original block.
+void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
+                                   const uint8_t *_blimit0,
+                                   const uint8_t *_limit0,
+                                   const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, scratch[16 * 16]);
+  unsigned char *const block = s - 8;
+  unsigned char *const edge_row = scratch + 8 * 16;
+
+  // Picture -> scratch: rows are contiguous (stride 16), so the transpose may
+  // use its 32-byte store path.
+  trans_store_16x16_lpf_vert14(block, pitch, scratch, 16, 1);
+
+  // Filter the transposed data in place.
+  aom_lpf_horizontal_14_quad_avx2(edge_row, 16, _blimit0, _limit0, _thresh0);
+
+  // Scratch -> picture: one 16-byte store per row because pitch is arbitrary.
+  trans_store_16x16_lpf_vert14(scratch, 16, block, pitch, 0);
+}
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 0000000000..cdf24c332a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,2973 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Per-byte absolute difference of two vectors of unsigned 8-bit values.
+// An unsigned saturating subtraction clamps at zero, so in every byte at most
+// one of the two differences is non-zero and OR-ing them gives |a - b|.
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+  const __m128i a_minus_b = _mm_subs_epu8(a, b);
+  const __m128i b_minus_a = _mm_subs_epu8(b, a);
+  return _mm_or_si128(a_minus_b, b_minus_a);
+}
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them to 4x8 independently while flipping the second matrix horizontally.
+// Used for 14 taps pq pairs creation (layout comments read "row column").
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,  // in: 4 rows of 16 bytes each
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {  // out qNpN: column 7-N in bytes 0-3, column 8+N in bytes 4-7, rest is xx
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+ // After the 16-bit interleave every 32-bit lane holds the 4 rows of one column.
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+ // Pair column c with column 15 - c; the byte shifts move the wanted lane to where the unpack picks it up.
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+ ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+ ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+}
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them independently while flipping the second matrix horizontally. Used for 14
+// taps filter pq pairs inverse: builds 4 full 16-byte rows from 8 pq registers.
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,  // in: bytes 0-3 of each xN are the p part,
+ __m128i *x2, __m128i *x3,  //     bytes 4-7 the q part
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,  // out: row r = byte r of x0..x7 (p part),
+ __m128i *pq2, __m128i *pq3) {  //      then byte 4+r of x7..x0 (q part)
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+ // p half: registers are taken in ascending order x0..x7.
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // q half: registers are taken in descending order x7..x0 (the horizontal flip); indices below are relative to that order.
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // Join the 8 p bytes (low half) with the 8 q bytes (high half) of each row.
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,  // in: 32-bit lane 0 = p0 / q0 pixels, lane 1 = p1 / q1 pixels
+ __m128i *hev, __m128i *mask,  // in: per-byte masks computed by the caller
+ __m128i *qs1qs0, __m128i *ps1ps0) {  // out: filtered pixels, same lane layout as the inputs
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);  // lane 0 = 4 (filter1), lane 1 = 3 (filter2)
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);  // all ones, i.e. -1 in every byte
+ /* Flip the sign bit so the unsigned pixels can go through signed saturating arithmetic. */
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);  // lane 0 = ps0 - qs0, lane 1 = ps1 - qs1
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);  // lane 0 = (ps1 - qs1) & hev
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);  // copy lane 0 into lane 1 so +4 and +3 are applied side by side
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter2filter1 =
+ _mm_unpacklo_epi8(filter2filter1, filter2filter1); // widen to 16 bit (byte duplicated into both halves)
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 (8 for the widening + 3) */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter); // widen to 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round: >> 1 (8 for the widening + 1) */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);  // lanes: [filter1, filter, filter2, filter]
+ hev1 = _mm_srli_si128(filter2filter1, 8);  // lanes: [filter2, filter]; hev1 is only a temporary here
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,  // in: low 64 bits = p0 / q0 pixels, high 64 bits = p1 / q1 pixels
+ __m128i *hev, __m128i *mask,  // in: per-byte masks computed by the caller
+ __m128i *qs1qs0,  // out: filtered q pixels, same half layout as q1q0
+ __m128i *ps1ps0) {  // out: filtered p pixels, same half layout as p1p0
+ const __m128i t3t4 =
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);  // low half = 4 (filter1), high half = 3 (filter2)
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);  // all ones, i.e. -1 in every byte
+ /* Flip the sign bit so the unsigned pixels can go through signed saturating arithmetic. */
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);  // low half = ps0 - qs0, high half = ps1 - qs1
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);  // low half = (ps1 - qs1) & hev
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi64(filter, filter);  // copy the low half up so +4 and +3 are applied side by side
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);  // widen the filter + 3 half to 16 bit
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // widen the filter + 4 half to 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter);  // low half = filter1, high half = filter2
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter);  // widen the filter1 + 1 half to 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+
+ hev1 = _mm_unpackhi_epi64(filter2filter1, filter);  // [filter2, filter]; hev1 is only a temporary here
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);  // [filter1, filter]
+
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(  // builds the hev and filter masks for 4 pixels, then runs filter4_sse2
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,  // pixels: low 32 bits of each; limit: lane 0 is tested against the blimit sum, lane 1 against the flatness maximum
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {  // thresh: 16-bit lanes; outputs: lane 0 = q0 / p0, lane 1 = q1 / p1
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);  // lanes: [p1, q1, ..]
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);  // lanes: [p0, q0, ..]
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // lanes: [p0, p1, q0, q1]
+ q1q0 = _mm_srli_si128(p1p0, 8);  // lanes: [q0, q1, 0, 0]
+
+ /* abs(q1 - q0), abs(p1 - p0) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));  // lane 0 = max(abs(p1 - p0), abs(q1 - q0))
+ hev = _mm_unpacklo_epi8(flat, zero);  // widen to 16 bit for the signed compare below
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);  // replicate lane 0 into lane 1
+
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);  // 8 for the widening + 1 for the halving
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);  // lanes: [blimit sum, flatness maximum, ..] to line up with *limit
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);  // 0xff where the value did not exceed its limit
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));  // lane 0 = both tests passed
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(  // builds the hev and filter masks for 8 pixels, then runs filter4_dual_sse2
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,  // pixels: low 64 bits of each; limit: low half is tested against the blimit sum, high half against the flatness maximum
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {  // thresh: 16-bit lanes; outputs: low half = q0 / p0, high half = q1 / p1
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);  // halves: [p1, q1]
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);  // halves: [p0, q0]
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);  // halves: [p0, p1]
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);  // halves: [q0, q1]
+
+ /* abs(q1 - q0), abs(p1 - p0) */
+ __m128i flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));  // low half = max(abs(p1 - p0), abs(q1 - q0))
+ hev = _mm_unpacklo_epi8(flat, zero);  // widen to 16 bit for the signed compare below
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);  // same 8 mask bytes in both halves
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);  // 8 for the widening + 1 for the halving
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi64(mask, flat);  // halves: [blimit sum, flatness maximum] to line up with *limit
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);  // 0xff where the value did not exceed its limit
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));  // low half = both tests passed
+
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+// 4-tap loop filter over four consecutive rows, 4 pixels wide.
+// s addresses the q0 row; p1 and p0 are the two rows before it and q1 the row
+// after it, with rows p bytes apart. The four rows are rewritten in place.
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+                               const uint8_t *_blimit, const uint8_t *_limit,
+                               const uint8_t *_thresh) {
+  // Fetch the four rows handled by the filter.
+  __m128i row_p1 = xx_loadl_32(s - 2 * p);
+  __m128i row_p0 = xx_loadl_32(s - 1 * p);
+  __m128i row_q0 = xx_loadl_32(s - 0 * p);
+  __m128i row_q1 = xx_loadl_32(s + 1 * p);
+
+  // blimit and limit interleaved per 32-bit lane: [blimit, limit, ...].
+  const __m128i blimit_v = _mm_loadl_epi64((const __m128i *)_blimit);
+  const __m128i limit_v = _mm_loadl_epi64((const __m128i *)_limit);
+  __m128i blimit_limit = _mm_unpacklo_epi32(blimit_v, limit_v);
+
+  // thresh zero-extended from bytes to 16-bit lanes.
+  __m128i thresh16 = _mm_unpacklo_epi8(
+      _mm_loadl_epi64((const __m128i *)_thresh), _mm_setzero_si128());
+
+  __m128i q_out, p_out;
+  lpf_internal_4_sse2(&row_p1, &row_p0, &row_q0, &row_q1, &blimit_limit,
+                      &thresh16, &q_out, &p_out);
+
+  // Lane 0 of each result is the row next to the edge (p0 / q0), lane 1 the
+  // outer row (p1 / q1).
+  xx_storel_32(s - 1 * p, p_out);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p_out, 4));
+  xx_storel_32(s + 0 * p, q_out);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q_out, 4));
+}
+
+// 4-tap loop filter over four consecutive columns, 4 rows high.
+// On every row the filter works on the 4 pixels starting 2 bytes before s;
+// rows are p bytes apart. The pixels are transposed into column registers,
+// filtered, transposed back and stored in place.
+void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+                             const uint8_t *_blimit, const uint8_t *_limit,
+                             const uint8_t *_thresh) {
+  uint8_t *const left = s - 2;  // first byte touched on each row
+
+  // blimit and limit interleaved per 32-bit lane: [blimit, limit, ...].
+  __m128i blimit_limit =
+      _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  // thresh zero-extended from bytes to 16-bit lanes.
+  __m128i thresh16 = _mm_unpacklo_epi8(
+      _mm_loadl_epi64((const __m128i *)_thresh), _mm_setzero_si128());
+
+  // Read 8 bytes from each of the 4 rows.
+  __m128i r0 = _mm_loadl_epi64((__m128i *)(left + 0 * p));
+  __m128i r1 = _mm_loadl_epi64((__m128i *)(left + 1 * p));
+  __m128i r2 = _mm_loadl_epi64((__m128i *)(left + 2 * p));
+  __m128i r3 = _mm_loadl_epi64((__m128i *)(left + 3 * p));
+
+  // Rows -> pixel columns p1, p0, q0, q1.
+  __m128i p1, p0, q0, q1;
+  transpose4x8_8x4_low_sse2(&r0, &r1, &r2, &r3, &p1, &p0, &q0, &q1);
+
+  __m128i p1p0, q1q0;
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &blimit_limit, &thresh16, &q1q0,
+                      &p1p0);
+
+  // Transpose 8x4 to 4x8: pull the outer columns out of lane 1 first.
+  p1 = _mm_srli_si128(p1p0, 4);
+  q1 = _mm_srli_si128(q1q0, 4);
+
+  __m128i o0, o1, o2, o3;
+  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &o0, &o1, &o2, &o3);
+
+  xx_storel_32(left + 0 * p, o0);
+  xx_storel_32(left + 1 * p, o1);
+  xx_storel_32(left + 2 * p, o2);
+  xx_storel_32(left + 3 * p, o3);
+}
+
+// Stores the two lowest 32-bit lanes of x to a pair of rows around s: lane 0
+// goes to row -(num + 1) and lane 1 to row +num, rows being p bytes apart.
+// NOTE(review): xx_storel_32 is presumed to write only the low 32 bits of its
+// vector argument - confirm against synonyms.h.
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+  uint8_t *const row_before = s - (num + 1) * p;
+  uint8_t *const row_after = s + num * p;
+  const __m128i lane1 = _mm_srli_si128(x, 4);
+
+  xx_storel_32(row_before, x);
+  xx_storel_32(row_after, lane1);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(  // 14-tap loop filter core on 8 pixels; results are written back through the qNpN pointers
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,  // each qNpN: low 64 bits = pN pixels, high 64 bits = qN pixels; q6p6 is only read
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,  // blimit / limit / thresh: byte vectors, compared with unsigned saturating subtracts
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);  // halves: [p0, p1]
+ q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);  // halves: [q0, q1]
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ __m128i fe, ff, work;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);  // halves: [abs(p1 - p0), abs(q1 - q0)]
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all ones
+ abs_p0q0 = abs_diff(p1p0, q1q0);  // halves: [abs(p0 - q0), abs(p1 - q1)]
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);  // 0xff where the maximum exceeds thresh
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);  // per-byte / 2: the low bit is cleared first so the 16-bit shift cannot leak across bytes
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));  // fold the q half onto the p half
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);  // 0xff where nothing exceeded limit
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);  // back to the [p | q] layout
+ qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+ // loopfilter done
+
+ __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);  // 0xff where every difference is <= 1
+ flat = _mm_and_si128(flat, mask);
+
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {  // true when any byte of flat is non-zero
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+
+ const __m128i eight = _mm_set1_epi16(8);  // rounding term for the >> 4 results
+ const __m128i four = _mm_set1_epi16(4);  // rounding term for the >> 3 results
+ __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p6, sum_q6;
+ __m128i sum_p3, sum_q3, res_p, res_q;
+
+ p6_16 = _mm_unpacklo_epi8(*q6p6, zero);  // widen all pixels to 16 bit so the sums cannot overflow a byte
+ p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+ pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));  // 8 + (p5 + .. + p0) + (q0 + .. + q5)
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));  // 4 + (p2 + p1 + p0) + (q0 + q1 + q2)
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+ _mm_add_epi16(p1_16, q0_16))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+ _mm_add_epi16(p0_16, q1_16))),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);  // wide-filter result for p0 (low half) and q0 (high half)
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);  // narrow-filter result for p0 / q0
+
+ sum_p6 = _mm_add_epi16(p6_16, p6_16);
+ sum_q6 = _mm_add_epi16(q6_16, q6_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);  // from here on the running sums are per side: each drops one far-side pixel per step
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);  // same mask for the p and q halves
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);  // select: keep the input where flat is clear, take the narrow-filter value where it is set
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);  // for rows 1 and 0 the fallback is the 4-tap result
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {  // true when any byte of flat2 is non-zero
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);  // one more copy of the outermost pixel per step
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);  // same mask for the p and q halves
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);  // select: wide-filter value where flat2 is set, current value elsewhere
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;  // flat is zero everywhere: only the 4-tap results are written back
+ *q1p1 = qs1ps1;
+ }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
+
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // if flat ==0 then flat2 is zero as well and we don't need any calc below
+ // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
+// Runs the 14-tap loop filter across a horizontal edge for a strip that is
+// four bytes wide.
+//
+// s addresses the first row on the q side of the edge and p is the distance
+// in bytes between consecutive rows. Four bytes are read from each of the
+// seven rows on either side (s - 7 * p through s + 6 * p). After filtering,
+// row pairs 0..5 are handed to store_buffer_horz_8().
+// _blimit, _limit and _thresh are each fetched with an aligned 16-byte load.
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+
+  // rows[k] pairs the k-th row on the p side (32-bit lane 0) with the k-th
+  // row on the q side (32-bit lane 1).
+  __m128i rows[7];
+  for (int k = 0; k < 7; ++k) {
+    const __m128i p_side = xx_loadl_32(s - (k + 1) * p);
+    const __m128i q_side = xx_loadl_32(s + k * p);
+    rows[k] = _mm_unpacklo_epi32(p_side, q_side);
+  }
+
+  lpf_internal_14_sse2(&rows[6], &rows[5], &rows[4], &rows[3], &rows[2],
+                       &rows[1], &rows[0], &blimit, &limit, &thresh);
+
+  store_buffer_horz_8(rows[0], p, 0, s);
+  store_buffer_horz_8(rows[1], p, 1, s);
+  store_buffer_horz_8(rows[2], p, 2, s);
+  store_buffer_horz_8(rows[3], p, 3, s);
+  store_buffer_horz_8(rows[4], p, 4, s);
+  store_buffer_horz_8(rows[5], p, 5, s);
+}
+/* 6-tap loop-filter kernel for the "dual" layout, where every row register
+ * carries its pixels in the low 64 bits (8 bytes).
+ *
+ * Inputs:  *p2, *p1, *p0, *q0, *q1, *q2 - one row each; only the low 64 bits
+ *          are read and these registers are not written.
+ *          *blimit, *limit, *thresh - byte thresholds, applied lane-wise.
+ * Outputs: *p1p0 - low 64 bits hold p0, high 64 bits hold p1.
+ *          *q1q0 - low 64 bits hold q0, high 64 bits hold q1.
+ *
+ * abs_diff() and filter4_dual_sse2() are defined outside this block. The
+ * lane annotations below assume abs_diff() returns the per-byte |a - b|
+ * (TODO confirm against its definition).
+ */
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+ // "qNpN" registers: pN row in the low 64 bits, qN row in the high 64 bits.
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+ // Regroup by side of the edge: *p1p0 = {p0, p1}, *q1q0 = {q0, q1}.
+ *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe); // every bit set
+
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0); // low: |p1 - p0|, high: |q1 - q0|
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); // |q1 - q0| moved to the low half
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0); // low: |p0 - q0|, high: |p1 - q1|
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); // |p1 - q1| moved to the low half
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); // keep |p0 - q0|, clear the high half
+
+ // SSE2 has no unsigned byte comparison, so "X > limit" is detected
+ // indirectly: take the lane-wise maximum of every abs(x - y) input
+ // (and of the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 test result), then
+ // saturating-subtract the limit. A non-zero remainder means at least
+ // one input exceeded the limit; a zero remainder means every input
+ // was within the limit and the mask byte ends up set.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); // flat is used as scratch here
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where the max exceeds thresh
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); // 2 * |p0 - q0|, saturating
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); // |p1 - q1| / 2; bit 0 is cleared so the 16-bit shift cannot leak into the neighbouring byte
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where the sum exceeds blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1); // low: |p2 - p1|, high: |q2 - q1|
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); // fold the high (q-side) half onto the low half
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the folded maximum is <= limit
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // presumably writes its result through the last two pointers - TODO confirm
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); // fold the high half onto the low half
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where the folded maximum is <= 1
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // 0xffff would mean every flat byte is zero
+ const __m128i four = _mm_set1_epi16(4); // rounding term for the >> 3 below
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero); // widen the low 8 bytes of each row to 16-bit lanes
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+ _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ p2_16); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); // low 8 bytes: op0, high 8 bytes: op1
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+ p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_add_epi16(q1_16, q2_16);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+ p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); // low 8 bytes: oq0, high 8 bytes: oq1
+ // Byte-wise select: where flat is set take the values above, else keep what is already in *q1q0 / *p1p0.
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+/* 6-tap loop-filter kernel for a single edge; rows are handled as 32-bit
+ * lanes (4 bytes per row).
+ *
+ * Inputs:  *p2, *p1, *p0, *q0, *q1, *q2 - one row each. They are interleaved
+ *          with _mm_unpacklo_epi32, so 32-bit lane 0 of a "qNpN" register is
+ *          the low 4 bytes of the p row and lane 1 the low 4 bytes of the q
+ *          row. These registers are not written.
+ *          *blimit, *limit, *thresh - byte thresholds, applied lane-wise.
+ * Outputs: *p1p0 - bytes 0-3 hold p0, bytes 4-7 hold p1.
+ *          *q1q0 - bytes 0-3 hold q0, bytes 4-7 hold q1.
+ *
+ * abs_diff() and filter4_sse2() are defined outside this block. The lane
+ * annotations below assume abs_diff() returns the per-byte |a - b|
+ * (TODO confirm against its definition).
+ */
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+ // "qNpN" registers: pN row in 32-bit lane 0, qN row in lane 1.
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+ // Regroup by side of the edge: *p1p0 = {p0, p1}, *q1q0 = {q0, q1} in lanes 0/1.
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe); // every bit set
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0); // lane 0: |p1 - p0|, lane 1: |q1 - q0|
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); // |q1 - q0| moved to lane 0
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0); // lane 0: |p0 - q0|, lane 1: |p1 - q1|
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); // |p1 - q1| moved to lane 0
+
+ // SSE2 has no unsigned byte comparison, so "X > limit" is detected
+ // indirectly: take the lane-wise maximum of every abs(x - y) input
+ // (and of the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 test result), then
+ // saturating-subtract the limit. A non-zero remainder means at least
+ // one input exceeded the limit; a zero remainder means every input
+ // was within the limit and the mask byte ends up set.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); // flat is used as scratch here
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where the max exceeds thresh
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); // 2 * |p0 - q0| in lane 0, saturating
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); // |p1 - q1| / 2; bit 0 is cleared so the 16-bit shift cannot leak into the neighbouring byte
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero); // lane 0 is kept, lane 1 becomes zero
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where the sum exceeds blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1); // lane 0: |p2 - p1|, lane 1: |q2 - q1|
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); // fold lane 1 (q side) onto lane 0
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the folded maximum is <= limit
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // presumably writes its result through the last two pointers - TODO confirm
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); // fold lane 1 onto lane 0
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where the folded maximum is <= 1
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat); // lane 0 now fills all four 32-bit lanes
+ }
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // 0xffff would mean every flat byte is zero
+ const __m128i four = _mm_set1_epi16(4); // rounding term for the >> 3 below
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero); // 16-bit words 0-3: p2, words 4-7: q2
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero); // words 0-3: p1, words 4-7: q1
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero); // words 0-3: p0, words 4-7: q0
+ q0_16 = _mm_srli_si128(pq0_16, 8); // q0 words moved to the low half
+ q2_16 = _mm_srli_si128(pq2_16, 8); // q2 words moved to the low half
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b); // low half: op0 sum, high half: op1 sum
+ workp_b = _mm_srli_epi16(workp_b, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); // bytes 0-3: op0, bytes 4-7: op1
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8); // q1 + q2 in the low half
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b); // low half: oq0 sum, high half: oq1 sum
+ workp_a = _mm_srli_epi16(workp_a, 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); // bytes 0-3: oq0, bytes 4-7: oq1
+ // Byte-wise select: where flat is set take the values above, else keep what is already in *q1q0 / *p1p0.
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
+// Applies the 6-tap loop filter across a horizontal edge for a strip that is
+// four bytes wide.
+//
+// s addresses the first row on the q side of the edge (q0) and p is the
+// distance in bytes between consecutive rows. Four bytes are read from each
+// of the rows s - 3 * p .. s + 2 * p; only the p1, p0, q0 and q1 rows are
+// written back.
+// _blimit, _limit and _thresh are each fetched with an aligned 16-byte load.
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh) {
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+  // The threshold buffers are read-only: cast to a const vector pointer so
+  // the qualifier of the parameters is not discarded (matches the 8- and
+  // 14-tap wrappers).
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  p2 = xx_loadl_32(s - 3 * p);
+  p1 = xx_loadl_32(s - 2 * p);
+  p0 = xx_loadl_32(s - 1 * p);
+  q0 = xx_loadl_32(s - 0 * p);
+  q1 = xx_loadl_32(s + 1 * p);
+  q2 = xx_loadl_32(s + 2 * p);
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  // p1p0 / q1q0 carry the row nearest the edge in bytes 0-3 and the next row
+  // in bytes 4-7.
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+}
+
+// Applies the 6-tap loop filter across a horizontal edge for a strip that is
+// eight bytes wide, using two sets of thresholds.
+//
+// s addresses the first row on the q side of the edge (q0) and p is the
+// distance in bytes between consecutive rows. Eight bytes are read from each
+// of the rows s - 3 * p .. s + 2 * p; only the p1, p0, q0 and q1 rows are
+// written back.
+// Each threshold pointer is fetched with an aligned 16-byte load; the "0" and
+// "1" vectors are then interleaved in 32-bit units, so bytes 0-3 come from
+// set 0 and bytes 4-7 from set 1.
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  // The threshold buffers are read-only: cast to a const vector pointer so
+  // the qualifier of the parameters is not discarded.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+
+  p2 = _mm_loadl_epi64((const __m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((const __m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((const __m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((const __m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((const __m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+
+  lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                           &limit, &thresh);
+
+  // p1p0 / q1q0 carry the row nearest the edge in the low 64 bits and the
+  // next row in the high 64 bits.
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+/* 8-tap loop-filter kernel for a single edge; rows are handled as 32-bit
+ * lanes (4 bytes per row).
+ *
+ * Inputs:  *p3..*p0, *q0..*q3 - one row each. They are interleaved with
+ *          _mm_unpacklo_epi32, so 32-bit lane 0 of a "qNpN" register is the
+ *          low 4 bytes of the p row and lane 1 the low 4 bytes of the q row.
+ *          *blimit, *limit, *thresh - byte thresholds, applied lane-wise.
+ * Outputs: *p1p0_out - bytes 0-3 hold p0, bytes 4-7 hold p1.
+ *          *q1q0_out - bytes 0-3 hold q0, bytes 4-7 hold q1.
+ *          *p2, *q2  - rewritten only when the flat mask has a non-zero
+ *          byte: *p2 then holds {p2, q2} in lanes 0/1 and *q2 is that
+ *          register shifted right by 4 bytes.
+ *
+ * abs_diff() and filter4_sse2() are defined outside this block. The lane
+ * annotations below assume abs_diff() returns the per-byte |a - b|
+ * (TODO confirm against its definition).
+ */
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+ // "qNpN" registers: pN row in 32-bit lane 0, qN row in lane 1.
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // lanes 0..3: p0, p1, q0, q1
+ q1q0 = _mm_srli_si128(p1p0, 8); // lanes 0..1: q0, q1
+
+ // filter_mask and hev_mask
+
+ // SSE2 has no unsigned byte comparison, so "X > limit" is detected
+ // indirectly: take the lane-wise maximum of every abs(x - y) input (and
+ // of the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 test result), then
+ // saturating-subtract the limit. A non-zero remainder means at least one
+ // input exceeded the limit; a zero remainder means every input was
+ // within the limit and the mask byte ends up set.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe); // every bit set
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0); // lane 0: |p1 - p0|, lane 1: |q1 - q0|
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); // |q1 - q0| moved to lane 0
+
+ abs_p0q0 = abs_diff(p1p0, q1q0); // lane 0: |p0 - q0|, lane 1: |p1 - q1|
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); // |p1 - q1| moved to lane 0
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); // flat is used as scratch here
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where the max exceeds thresh
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); // 2 * |p0 - q0| in lane 0, saturating
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); // |p1 - q1| / 2; bit 0 is cleared so the 16-bit shift cannot leak into the neighbouring byte
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero); // lane 0 is kept, lane 1 becomes zero
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where the sum exceeds blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); // fold lane 1 (q side) onto lane 0
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the folded maximum is <= limit
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); // presumably writes its result through the last two pointers - TODO confirm
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); // fold lane 1 onto lane 0
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where the folded maximum is <= 1
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat); // lane 0 now fills all four 32-bit lanes
+
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // 0xffff would mean every flat byte is zero
+ const __m128i four = _mm_set1_epi16(4); // rounding term for the >> 3 below
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero); // widen the low 8 bytes of each row to 16-bit lanes
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); // p3 * 2 + p2 + p1 + p0 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b); // p3 * 3 + p2 * 2 + p1 + p0 + q0 + 4 (not shifted yet)
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b); // p3 * 2 + p2 + p1 * 2 + p0 + q0 + q1 + 4
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b); // p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + 4
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c); // low half: op0 sum, high half: op1 sum
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); // bytes 0-3: op0, bytes 4-7: op1
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b); // p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + 4
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b); // p1 + p0 + q0 + q1 * 2 + q2 + q3 * 2 + 4
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d); // low half: oq0 sum, high half: oq1 sum
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); // bytes 0-3: oq0, bytes 4-7: oq1
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b); // p0 + q0 + q1 + q2 * 2 + q3 * 3 + 4 (not shifted yet)
+
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); // low half: op2 sum, high half: oq2 sum
+ workp_c = _mm_srli_epi16(workp_c, 3);
+
+ opq2 = _mm_packus_epi16(workp_c, workp_c); // bytes 0-3: op2, bytes 4-7: oq2
+ // Byte-wise select between the 8-tap values (flat set) and the existing rows / filter4 output.
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4); // q2 part moved down to lane 0
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+/* 8-tap loop-filter kernel for the "dual" layout, where every row register
+ * carries its pixels in the low 64 bits (8 bytes).
+ *
+ * Inputs:  *p3..*p0, *q0..*q3 - one row each; only the low 64 bits are read.
+ *          *blimit, *limit, *thresh - byte thresholds, applied lane-wise.
+ * Outputs: *p1p0_out - low 64 bits hold p0, high 64 bits hold p1.
+ *          *q1q0_out - low 64 bits hold q0, high 64 bits hold q1.
+ *          *p2, *q2  - rewritten only when the flat mask has a non-zero
+ *          byte: *p2 then holds p2 in the low and q2 in the high 64 bits,
+ *          and *q2 is that register shifted right by 8 bytes.
+ *
+ * abs_diff() and filter4_dual_sse2() are defined outside this block. The
+ * lane annotations below assume abs_diff() returns the per-byte |a - b|
+ * (TODO confirm against its definition).
+ */
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+ // "qNpN" registers: pN row in the low 64 bits, qN row in the high 64 bits.
+ q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+ // Regroup by side of the edge: p1p0 = {p0, p1}, q1q0 = {q0, q1}.
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ {
+ // filter_mask and hev_mask
+
+ // SSE2 has no unsigned byte comparison, so "X > limit" is detected
+ // indirectly: take the lane-wise maximum of every abs(x - y) input (and
+ // of the abs(p0 - q0) * 2 + abs(p1 - q1) / 2 test result), then
+ // saturating-subtract the limit. A non-zero remainder means at least one
+ // input exceeded the limit; a zero remainder means every input was
+ // within the limit and the mask byte ends up set.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe); // every bit set
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0); // low: |p1 - p0|, high: |q1 - q0|
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); // |q1 - q0| moved to the low half
+
+ abs_p0q0 = abs_diff(p1p0, q1q0); // low: |p0 - q0|, high: |p1 - q1|
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); // |p1 - q1| moved to the low half
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); // |p0 - q0| duplicated into both halves
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0); // flat is used as scratch here
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where the max exceeds thresh
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); // 2 * |p0 - q0|, saturating
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); // |p1 - q1| / 2; bit 0 is cleared so the 16-bit shift cannot leak into the neighbouring byte
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where the sum exceeds blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); // fold the high (q-side) half onto the low half
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the folded maximum is <= limit
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); // presumably writes its result through the last two pointers - TODO confirm
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); // fold the high half onto the low half
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where the folded maximum is <= 1
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // 0xffff would mean every flat byte is zero
+ const __m128i four = _mm_set1_epi16(4); // rounding term for the >> 3 below
+
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero); // widen the low 8 bytes of each row to 16-bit lanes
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); // p3 * 2 + p2 + p1 + p0 + 4
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p3 * 3 + p2 * 2 + p1 + p0 + q0 + 4) >> 3
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p3 * 2 + p2 + p1 * 2 + p0 + q0 + q1 + 4) >> 3
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + 4) >> 3
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); // low 8 bytes: op0, high 8 bytes: op1
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + 4) >> 3
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p1 + p0 + q0 + q1 * 2 + q2 + q3 * 2 + 4) >> 3
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); // low 8 bytes: oq0, high 8 bytes: oq1
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // (p0 + q0 + q1 + q2 * 2 + q3 * 3 + 4) >> 3
+
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); // low 8 bytes: op2, high 8 bytes: oq2
+ // Byte-wise select between the 8-tap values (flat set) and the existing rows / filter4 output.
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8); // q2 part moved down to the low half
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+// Runs the 8-tap loop filter across a horizontal edge for a strip that is
+// four bytes wide.
+//
+// s addresses the first row on the q side of the edge (q0) and p is the
+// distance in bytes between consecutive rows. Four bytes are read from each
+// of the rows s - 4 * p .. s + 3 * p, and four bytes are written to each of
+// the rows s - 3 * p .. s + 2 * p.
+// _blimit, _limit and _thresh are each fetched with an aligned 16-byte load.
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh) {
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+
+  // Rows listed outward from the edge, p side then q side.
+  __m128i p0 = xx_loadl_32(s - 1 * p);
+  __m128i p1 = xx_loadl_32(s - 2 * p);
+  __m128i p2 = xx_loadl_32(s - 3 * p);
+  __m128i p3 = xx_loadl_32(s - 4 * p);
+  __m128i q0 = xx_loadl_32(s + 0 * p);
+  __m128i q1 = xx_loadl_32(s + 1 * p);
+  __m128i q2 = xx_loadl_32(s + 2 * p);
+  __m128i q3 = xx_loadl_32(s + 3 * p);
+
+  __m128i p1p0, q1q0;
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &blimit, &limit, &thresh);
+
+  // p1p0 / q1q0 carry the row nearest the edge in bytes 0-3 and the next row
+  // in bytes 4-7.
+  const __m128i p1_row = _mm_srli_si128(p1p0, 4);
+  const __m128i q1_row = _mm_srli_si128(q1q0, 4);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, p1_row);
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, q1_row);
+  xx_storel_32(s - 3 * p, p2);
+  xx_storel_32(s + 2 * p, q2);
+}
+
+// Runs the 14-tap loop filter across a horizontal edge for a strip that is
+// eight bytes wide, using two sets of thresholds.
+//
+// s addresses the first row on the q side of the edge and p is the distance
+// in bytes between consecutive rows. Eight bytes are read from each of the
+// seven rows on either side (s - 7 * p through s + 6 * p) and eight bytes are
+// written to each of the six rows nearest the edge on either side.
+// Each threshold pointer is fetched with an aligned 16-byte load; the "0" and
+// "1" vectors are then interleaved in 32-bit units.
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0,
+                                     const unsigned char *_blimit1,
+                                     const unsigned char *_limit1,
+                                     const unsigned char *_thresh1) {
+  const __m128i blimit_a = _mm_load_si128((const __m128i *)_blimit0);
+  const __m128i blimit_b = _mm_load_si128((const __m128i *)_blimit1);
+  const __m128i limit_a = _mm_load_si128((const __m128i *)_limit0);
+  const __m128i limit_b = _mm_load_si128((const __m128i *)_limit1);
+  const __m128i thresh_a = _mm_load_si128((const __m128i *)_thresh0);
+  const __m128i thresh_b = _mm_load_si128((const __m128i *)_thresh1);
+
+  __m128i blimit = _mm_unpacklo_epi32(blimit_a, blimit_b);
+  __m128i limit = _mm_unpacklo_epi32(limit_a, limit_b);
+  __m128i thresh = _mm_unpacklo_epi32(thresh_a, thresh_b);
+
+  // rows[k] pairs the k-th row on the p side (low 64 bits) with the k-th row
+  // on the q side (high 64 bits).
+  __m128i rows[7];
+  for (int k = 0; k < 7; ++k) {
+    const __m128i p_side = _mm_loadl_epi64((__m128i *)(s - (k + 1) * p));
+    const __m128i q_side = _mm_loadl_epi64((__m128i *)(s + k * p));
+    rows[k] = _mm_unpacklo_epi64(p_side, q_side);
+  }
+
+  lpf_internal_14_dual_sse2(&rows[6], &rows[5], &rows[4], &rows[3], &rows[2],
+                            &rows[1], &rows[0], &blimit, &limit, &thresh);
+
+  // Write back pairs 0..5: low half to the p-side row, high half to the
+  // q-side row.
+  for (int k = 0; k < 6; ++k) {
+    _mm_storel_epi64((__m128i *)(s - (k + 1) * p), rows[k]);
+    _mm_storel_epi64((__m128i *)(s + k * p), _mm_srli_si128(rows[k], 8));
+  }
+}
+
+// Applies the 8-tap loop filter across a horizontal edge for a strip that is
+// eight bytes wide, using two sets of thresholds.
+//
+// s addresses the first row on the q side of the edge (q0) and p is the
+// distance in bytes between consecutive rows. Eight bytes are read from each
+// of the rows s - 4 * p .. s + 3 * p, and eight bytes are written to each of
+// the rows s - 3 * p .. s + 2 * p.
+// Each threshold pointer is fetched with an aligned 16-byte load; the "0" and
+// "1" vectors are then interleaved in 32-bit units, so bytes 0-3 come from
+// set 0 and bytes 4-7 from set 1.
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  // The threshold buffers are read-only: cast to a const vector pointer so
+  // the qualifier of the parameters is not discarded.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0;
+
+  p3 = _mm_loadl_epi64((const __m128i *)(s - 4 * p));
+  p2 = _mm_loadl_epi64((const __m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((const __m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((const __m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((const __m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((const __m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+  q3 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
+
+  lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                           &blimit, &limit, &thresh);
+
+  // p1p0 / q1q0 carry the row nearest the edge in the low 64 bits and the
+  // next row in the high 64 bits.
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+// Runs the 4-tap loop filter across a horizontal edge for a strip that is
+// eight bytes wide, using two sets of thresholds.
+//
+// s addresses the first row on the q side of the edge (q0) and p is the
+// distance in bytes between consecutive rows. Eight bytes are read from and
+// written to each of the rows s - 2 * p .. s + p.
+// The blimit/limit pointers are fetched with aligned 16-byte loads; the
+// thresh pointers are fetched with 8-byte loads and widened to 16-bit lanes.
+void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  const __m128i zero = _mm_setzero_si128();
+
+  // l: interleaved blimit0/blimit1 in the low 64 bits, interleaved
+  // limit0/limit1 in the high 64 bits.
+  const __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+  // t: the first four bytes of each thresh buffer zero-extended to 16 bits,
+  // set 0 in the low 64 bits and set 1 in the high 64 bits.
+  const __m128i thresh0_bytes = _mm_loadl_epi64((const __m128i *)_thresh0);
+  const __m128i thresh1_bytes = _mm_loadl_epi64((const __m128i *)_thresh1);
+  __m128i t = _mm_unpacklo_epi64(_mm_unpacklo_epi8(thresh0_bytes, zero),
+                                 _mm_unpacklo_epi8(thresh1_bytes, zero));
+
+  __m128i qs1qs0, ps1ps0;
+  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+  // Low 64 bits go to the row nearest the edge, high 64 bits to the next row.
+  const __m128i ps1 = _mm_srli_si128(ps1ps0, 8);
+  const __m128i qs1 = _mm_srli_si128(qs1qs0, 8);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), ps1);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), qs1);
+}
+
+// Runs the 4-tap dual loop filter over eight rows on the columns around `s`.
+//
+// Eight rows of 8 bytes starting at s - 2 are loaded and transposed into the
+// four vectors p1, p0, q0, q1, filtered by lpf_internal_4_dual_sse2(),
+// transposed back, and 4 bytes per row are stored at s - 2 + r * p for
+// r = 0..7. Only the first four bytes of each blimit/limit/thresh set end up
+// in the parameter vectors.
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  uint8_t *const left = s - 2;  // first byte accessed in row 0
+  const __m128i zero = _mm_setzero_si128();
+
+  // limits = { blimit0[0..3], blimit1[0..3], limit0[0..3], limit1[0..3] }.
+  const __m128i blimit01 =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit01 =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  __m128i limits = _mm_unpacklo_epi64(blimit01, limit01);
+
+  // thresholds = thresh0[0..3] then thresh1[0..3], zero-extended to 16 bits.
+  const __m128i thresh0_w =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+  const __m128i thresh1_w =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+  __m128i thresholds = _mm_unpacklo_epi64(thresh0_w, thresh1_w);
+
+  // Load 8 bytes from each of the eight rows.
+  __m128i in[8];
+  for (int r = 0; r < 8; ++r) {
+    in[r] = _mm_loadl_epi64((__m128i *)(left + r * p));
+  }
+
+  __m128i p1, p0, q0, q1;
+  transpose8x8_low_sse2(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                        &in[7], &p1, &p0, &q0, &q1);
+
+  __m128i qs1qs0, ps1ps0;
+  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &limits, &thresholds, &qs1qs0,
+                           &ps1ps0);
+
+  // Split the high halves of the packed results into their own vectors.
+  __m128i ps1 = _mm_srli_si128(ps1ps0, 8);
+  __m128i qs1 = _mm_srli_si128(qs1qs0, 8);
+
+  __m128i out[8];
+  transpose4x8_8x4_sse2(&ps1, &ps1ps0, &qs1qs0, &qs1, &out[0], &out[1],
+                        &out[2], &out[3], &out[4], &out[5], &out[6], &out[7]);
+
+  // Write 4 bytes back to every row, top to bottom.
+  for (int r = 0; r < 8; ++r) {
+    xx_storel_32(left + r * p, out[r]);
+  }
+}
+
+// Runs the 6-tap loop filter over four rows on the columns around `s`.
+//
+// Four rows of 8 bytes starting at s - 3 are loaded and transposed into
+// column vectors; d0..d5 are handed to lpf_internal_6_sse2(). The packed
+// results are transposed back and 4 bytes per row are stored at
+// s - 2 + r * p for r = 0..3.
+//
+//   s        pointer into row 0; bytes s - 3 .. s + 4 of each row are read.
+//   p        distance in bytes between consecutive rows.
+//   _blimit, _limit, _thresh
+//            read with aligned 16-byte loads (_mm_load_si128): each must be
+//            16-byte aligned and reference 16 readable bytes.
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x2, x1, x0, x3;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  // The inputs are read-only, so the casts keep the const qualifier.
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+  // d6 and d7 are produced by the transpose but not consumed by the filter.
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+
+  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  // Move the second 32-bit lane of each packed result into its own vector.
+  p0 = _mm_srli_si128(p1p0, 4);
+  q0 = _mm_srli_si128(q1q0, 4);
+
+  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
+}
+
+// Runs the 6-tap dual loop filter over eight rows on the columns around `s`.
+//
+// Eight rows of 8 bytes starting at s - 3 are loaded and transposed into
+// packed column pairs; six columns are handed to lpf_internal_6_dual_sse2().
+// The packed results are transposed back and 4 bytes per row are stored at
+// s - 2 + r * p for r = 0..7.
+//
+//   s   pointer into row 0; bytes s - 3 .. s + 4 of each row are read.
+//   p   distance in bytes between consecutive rows.
+//   _blimit0/_limit0/_thresh0, _blimit1/_limit1/_thresh1
+//       the two parameter sets, read with aligned 16-byte loads
+//       (_mm_load_si128): each must be 16-byte aligned and reference 16
+//       readable bytes.
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  // Interleave the low 32-bit lanes of the two sets:
+  // { set0[0..3], set1[0..3], set0[4..7], set1[4..7] }.
+  // The inputs are read-only, so the casts keep the const qualifier.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  // Split out the high halves the filter takes as separate vectors. d6d7 is
+  // not consumed by the 6-tap filter, so its high half is not extracted.
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+
+  lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+                           &blimit, &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+                        &d6, &d7);
+
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+// Runs the 8-tap loop filter over four rows on the columns around `s`.
+//
+// Four rows of 8 bytes starting at s - 4 are loaded and transposed into eight
+// column vectors, filtered by lpf_internal_8_sse2(), transposed back and
+// stored as 8 bytes per row at s - 4 + r * p for r = 0..3. The three
+// parameter pointers are read with aligned 16-byte loads.
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  unsigned char *const left = s - 4;  // first byte accessed in row 0
+
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  __m128i row[4];
+  for (int r = 0; r < 4; ++r) {
+    row[r] = _mm_loadl_epi64((__m128i *)(left + r * p));
+  }
+
+  __m128i col[8];
+  transpose4x8_8x4_sse2(&row[0], &row[1], &row[2], &row[3], &col[0], &col[1],
+                        &col[2], &col[3], &col[4], &col[5], &col[6], &col[7]);
+
+  // Loop filtering. Columns are passed in outside-in pairs.
+  __m128i q1q0, p1p0;
+  lpf_internal_8_sse2(&col[0], &col[7], &col[1], &col[6], &col[2], &col[5],
+                      &col[3], &col[4], &q1q0, &p1p0, &blimit, &limit,
+                      &thresh);
+
+  // Move the second 32-bit lane of each packed result into its own vector.
+  __m128i p_hi = _mm_srli_si128(p1p0, 4);
+  __m128i q_hi = _mm_srli_si128(q1q0, 4);
+
+  // col[0] and col[1] serve as both inputs and outputs of this transpose.
+  transpose8x8_low_sse2(&col[0], &col[1], &p_hi, &p1p0, &q1q0, &q_hi, &col[6],
+                        &col[7], &col[0], &col[1], &col[2], &col[3]);
+
+  for (int r = 0; r < 4; ++r) {
+    _mm_storel_epi64((__m128i *)(left + r * p), col[r]);
+  }
+}
+
+// Runs the 8-tap dual loop filter over eight rows on the columns around `s`.
+//
+// Eight rows of 8 bytes starting at s - 4 are loaded and transposed into
+// packed column pairs, filtered by lpf_internal_8_dual_sse2(), transposed
+// back and stored as 8 bytes per row at s - 4 + r * p for r = 0..7.
+//
+//   s   pointer into row 0; bytes s - 4 .. s + 3 of each row are accessed.
+//   p   distance in bytes between consecutive rows.
+//   _blimit0/_limit0/_thresh0, _blimit1/_limit1/_thresh1
+//       the two parameter sets, read with aligned 16-byte loads
+//       (_mm_load_si128): each must be 16-byte aligned and reference 16
+//       readable bytes.
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  // Interleave the low 32-bit lanes of the two sets:
+  // { set0[0..3], set1[0..3], set0[4..7], set1[4..7] }.
+  // The inputs are read-only, so the casts keep the const qualifier.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d1, d3, d5, d7;
+  __m128i q1q0, p1p0;
+  __m128i p1, q1;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  // Split the high half of every packed pair into its own vector.
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  // Columns are passed in outside-in pairs.
+  lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+                           &q1q0, &p1p0, &blimit, &limit, &thresh);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+
+  // d0d1 and d6d7 serve as both inputs and outputs of this transpose.
+  transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+                    &d2d3, &d4d5, &d6d7);
+
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
+}
+
+// Runs the 14-tap loop filter over four rows on the columns around `s`.
+//
+// Four rows of 16 bytes starting at s - 8 are loaded (unaligned) and
+// rearranged by transpose_pq_14_sse2() into packed q/p vectors; q6p6..q0p0
+// are handed to lpf_internal_14_sse2(). transpose_pq_14_inv_sse2() rebuilds
+// the four rows, which are stored back as 16 bytes each at s - 8 + r * p.
+//
+//   s        pointer into row 0; bytes s - 8 .. s + 7 of each row are
+//            accessed.
+//   p        distance in bytes between consecutive rows.
+//   _blimit, _limit, _thresh
+//            read with aligned 16-byte loads (_mm_load_si128): each must be
+//            16-byte aligned and reference 16 readable bytes.
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+                              const unsigned char *_blimit,
+                              const unsigned char *_limit,
+                              const unsigned char *_thresh) {
+  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x6, x5, x4, x3;
+  __m128i pq0, pq1, pq2, pq3;
+  // The inputs are read-only, so the casts keep the const qualifier.
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+
+  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+                       &q5p5, &q6p6, &q7p7);
+
+  // q7p7 is not passed to the filter; it goes straight to the inverse
+  // transpose below.
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                           &q0p0, &pq0, &pq1, &pq2, &pq3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
+}
+
+// Runs the 14-tap dual loop filter over eight rows on the columns around `s`.
+//
+// Eight rows of 16 bytes starting at s - 8 are loaded (unaligned), transposed
+// into packed column pairs and regrouped into q/p vectors for
+// lpf_internal_14_dual_sse2(). The filtered vectors are transposed back and
+// stored as 16 bytes per row at s - 8 + r * p for r = 0..7. The six
+// parameter pointers are read with aligned 16-byte loads.
+void aom_lpf_vertical_14_dual_sse2(
+    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1) {
+  unsigned char *const left = s - 8;  // first byte accessed in row 0
+
+  // Interleave the low 32-bit lanes of the two sets:
+  // { set0[0..3], set1[0..3], set0[4..7], set1[4..7] }.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i rows[8];
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm_loadu_si128((__m128i *)(left + r * p));
+  }
+
+  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+  transpose8x16_16x8_sse2(&rows[0], &rows[1], &rows[2], &rows[3], &rows[4],
+                          &rows[5], &rows[6], &rows[7], &d0d1, &d2d3, &d4d5,
+                          &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+  // Each filter input takes the low half of one column pair and the high
+  // half of another.
+  __m128i q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+  __m128i q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+  __m128i q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+  __m128i q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+  __m128i q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+  __m128i q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+  __m128i q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+  __m128i q7 = _mm_srli_si128(d14d15, 8);
+
+  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                            &blimit, &limit, &thresh);
+
+  // High halves of the filtered vectors, as separate transpose inputs.
+  __m128i q0_hi = _mm_srli_si128(q0p0, 8);
+  __m128i q1_hi = _mm_srli_si128(q1p1, 8);
+  __m128i q2_hi = _mm_srli_si128(q2p2, 8);
+  __m128i q3_hi = _mm_srli_si128(q3p3, 8);
+  __m128i q4_hi = _mm_srli_si128(q4p4, 8);
+  __m128i q5_hi = _mm_srli_si128(q5p5, 8);
+  __m128i q6_hi = _mm_srli_si128(q6p6, 8);
+
+  // d0d1 and q7 were not touched by the filter and are passed through as is.
+  __m128i out[8];
+  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                          &q0p0, &q0_hi, &q1_hi, &q2_hi, &q3_hi, &q4_hi,
+                          &q5_hi, &q6_hi, &q7, &out[0], &out[1], &out[2],
+                          &out[3], &out[4], &out[5], &out[6], &out[7]);
+
+  for (int r = 0; r < 8; ++r) {
+    _mm_storeu_si128((__m128i *)(left + r * p), out[r]);
+  }
+}
+
+// Returns *total + *a1 + *a2 - *s1 - *s2, computed independently in each
+// 16-bit lane with wrap-around arithmetic.
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+                                       const __m128i *const a1,
+                                       const __m128i *const a2,
+                                       const __m128i *const s1,
+                                       const __m128i *const s2) {
+  const __m128i incoming = _mm_add_epi16(*a1, *a2);
+  const __m128i outgoing = _mm_add_epi16(*s1, *s2);
+  return _mm_sub_epi16(_mm_add_epi16(*total, incoming), outgoing);
+}
+
+// Shifts the 16-bit sums in *f8_lo / *f8_hi right by 3, packs them to 16
+// unsigned bytes with saturation, and returns that value in the bits where
+// *flat is set and *other_filt in the bits where it is clear.
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+                                   const __m128i *const other_filt,
+                                   const __m128i *const f8_lo,
+                                   const __m128i *const f8_hi) {
+  const __m128i lo = _mm_srli_epi16(*f8_lo, 3);
+  const __m128i hi = _mm_srli_epi16(*f8_hi, 3);
+  const __m128i filtered = _mm_packus_epi16(lo, hi);
+  const __m128i kept = _mm_andnot_si128(*flat, *other_filt);
+  return _mm_or_si128(kept, _mm_and_si128(*flat, filtered));
+}
+
+// Shifts the 16-bit sums in *f_lo / *f_hi right by 4, packs them to 16
+// unsigned bytes with saturation, and returns that value in the bits where
+// *flat is set and *other_filt in the bits where it is clear.
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+                                    const __m128i *const other_filt,
+                                    const __m128i *const f_lo,
+                                    const __m128i *const f_hi) {
+  const __m128i lo = _mm_srli_epi16(*f_lo, 4);
+  const __m128i hi = _mm_srli_epi16(*f_hi, 4);
+  const __m128i filtered = _mm_packus_epi16(lo, hi);
+  const __m128i kept = _mm_andnot_si128(*flat, *other_filt);
+  return _mm_or_si128(kept, _mm_and_si128(*flat, filtered));
+}
+
+void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) { /*
+ * Filters 16 bytes of each row around `s` in place.
+ *
+ * Loads 16 bytes from each of the 14 rows s - 7 * p .. s + 6 * p, derives
+ * three per-byte masks (mask, flat, flat2) and blends three results:
+ *   - filter4 (rows s - 2 * p .. s + 1 * p), gated by `mask`;
+ *   - filter8 (rows s - 3 * p .. s + 2 * p), selected where `flat` is set;
+ *   - the wide filter (rows s - 6 * p .. s + 5 * p), selected where `flat2`
+ *     is set.
+ * Returns without storing anything when `mask` is zero for all 16 bytes.
+ *
+ * s is the first byte of the row loaded as q0; p is the distance in bytes
+ * between consecutive rows. _blimit0, _limit0 and _thresh0 are read with
+ * aligned 16-byte loads (_mm_load_si128), so they must be 16-byte aligned;
+ * the pixel rows use unaligned loads and stores.
+ */
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat, flat2;
+ __m128i p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q6, q5;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe); // clears bit 0 of every byte
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero); // all ones
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); /* per-byte
+ * halving: bit 0 is cleared first so the 16-bit shift cannot move a bit
+ * from the upper byte of a lane into the lower byte */
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // 0xff where sum > blimit
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero); // 0xff where the running maximum <= limit
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // mask is zero in all 16 bytes
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); // p4/q4 term: folded into flat2 below, not into flat
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero); // 0xff where every difference above is <= 1
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80); // flip bit 7: unsigned byte -> signed (value - 128)
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // 0xff where max(|p1-p0|, |q1-q0|) > thresh
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1); /* per-byte arithmetic shift right
+ * by 3, built from a 16-bit logical shift: bits that crossed a byte
+ * boundary are masked off with 0x1f and negative bytes get their top
+ * three bits refilled with 0xe0 */
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2); // same shift-by-3 sequence for filter2
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt); // per-byte arithmetic shift right by 1 (masks 0x7f / 0x80)
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt); // outer taps are adjusted only where hev is clear
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // at least one byte has flat set
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); // bytes 0..7 widened to 16 bits
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); // bytes 8..15 widened to 16 bits
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi; // running sums; filter8_mask() shifts them right by 3
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); // = 3*p3 + 2*p2 + p1 + p0 + q0 + 4
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); // update the sum: add q1 and p1, drop p2 and p3
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { // at least one byte has flat2 set
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+
+ __m128i f_lo; // running sums; filter16_mask() shifts them right by 4
+ __m128i f_hi;
+
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo); // 7 * p6
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(f_lo, eight); // = 7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0 + 8
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(f_hi, eight);
+
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); /* p5 is overwritten here,
+ * but the widened *_lo / *_hi copies taken above still hold the
+ * unfiltered rows and are what the later sum updates use */
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); // falls back to the filter8 result where flat2 is clear
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ } else { // flat2 is zero everywhere: store only the six filter8 rows
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ }
+ } else { // flat is zero everywhere: store only the four filter4 rows
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+// Loop-filters a horizontal edge for 16 adjacent 8-bit pixel columns at once,
+// using the "filter4" path everywhere the mask is set and the wider "filter6"
+// path in columns that are additionally flagged as flat.
+//
+// s         row loaded as q0; rows s - 3 * p .. s + 2 * p are read (p2..q2)
+//           and rows s - 2 * p .. s + 1 * p (p1..q1) may be written.
+// p         byte distance between consecutive rows.
+// _blimit0, _limit0, _thresh0
+//           16-byte threshold vectors. They are read with _mm_load_si128, so
+//           each pointer must be 16-byte aligned.
+//
+// Returns without storing anything when the filter mask is zero in all 16
+// columns.
+void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ // Six rows of 16 pixels: three before the row at s (p2, p1, p0) and three
+ // starting at it (q0, q1, q2). Unaligned loads, so s and p are unconstrained.
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ // Build the per-column filter mask (0xff = filter this column).
+ // NOTE(review): abs_diff is defined outside this chunk; presumably it returns
+ // the per-byte unsigned absolute difference -- confirm.
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ // Kept outside this scope: reused below for both the flat and hev tests.
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ // 2 * |p0 - q0|, saturating at 255.
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ // Per-byte |p1 - q1| >> 1. There is no 8-bit shift, so bit 0 of every byte
+ // is cleared first to keep the 16-bit shift from leaking into its neighbour.
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ // Saturating subtract leaves a non-zero byte only where the sum > blimit;
+ // the cmpeq/xor pair turns that into 0xff (exceeded) or 0x00.
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // Fold the neighbour differences into the same vector with unsigned max so
+ // a single comparison against limit covers all of them.
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
+ mask = _mm_max_epu8(work, mask);
+ // 0xff where the folded maximum is <= limit, 0x00 otherwise.
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // Every mask byte is zero: no column needs filtering, leave memory untouched.
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // flat = 0xff where |p2 - p0|, |q2 - q0|, |p1 - p0| and |q1 - q0| are all
+ // <= 1 and the column also passed the mask test.
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ // Flip the top bit to move the pixels from unsigned [0, 255] to signed
+ // [-128, 127] so the saturating signed byte arithmetic below applies.
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // filt = ((p1 - q1) & hev) + 3 * (q0 - p0), saturating, then masked.
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // filter1 >>= 3 as a per-byte arithmetic shift: logical 16-bit shift, keep
+ // the low five bits of each byte, then OR 0xe0 into bytes that were negative
+ // to restore the sign bits.
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ // q0 -= filter1, then back to the unsigned pixel domain.
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ // Same emulated arithmetic shift for filter2.
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ // p0 += filter2, then back to the unsigned pixel domain.
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ // filt = (filter1 + 1) >> 1 (arithmetic, per byte), applied to p1/q1 only in
+ // columns where hev is NOT set.
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter6
+ // Taken when at least one column is flat.
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // Widen the pixels to 16 bits (low and high 8 columns separately) so the
+ // weighted sums below have headroom.
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ __m128i f8_lo, f8_hi;
+
+ // Running sum for the p1 output: 3 * p2 + 2 * p1 + 2 * p0 + q0 + 4.
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
+ _mm_add_epi16(p2_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
+ _mm_add_epi16(p1_lo, p0_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
+ _mm_add_epi16(p2_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
+ _mm_add_epi16(p1_hi, p0_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ // NOTE(review): filter8_mask and filter_add2_sub2 are defined outside this
+ // chunk. Presumably filter8_mask picks the narrowed (sum >> 3) value where
+ // flat is set and the filter4 result elsewhere, and filter_add2_sub2
+ // returns total + arg2 + arg3 - arg4 - arg5 -- confirm against the helpers.
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ // Slide the window for p0: add q0, q1 and drop two p2 terms.
+ f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ // Slide the window for q0: add q1, q2 and drop p1, p2.
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ // Slide the window for q1: add two q2 terms and drop p0, p1.
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ } else {
+ // No flat column: the filter4 results are final.
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+// Loop-filters a horizontal edge for 16 adjacent 8-bit pixel columns at once
+// using only the "filter4" path.
+//
+// s         row loaded as q0; rows s - 2 * p .. s + 1 * p (p1, p0, q0, q1) are
+//           read and, unless the function returns early, all four are stored.
+// p         byte distance between consecutive rows.
+// _blimit0, _limit0, _thresh0
+//           16-byte threshold vectors. They are read with _mm_load_si128, so
+//           each pointer must be 16-byte aligned.
+//
+// Returns without storing anything when the filter mask is zero in all 16
+// columns.
+void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev;
+ __m128i p1, p0, q0, q1;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ // Two rows on either side of the edge, 16 pixels each (unaligned loads).
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+
+ // Build the per-column filter mask (0xff = filter this column).
+ // NOTE(review): abs_diff is defined outside this chunk; presumably it returns
+ // the per-byte unsigned absolute difference -- confirm.
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ // Kept outside this scope: reused below for the hev test.
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ // 2 * |p0 - q0|, saturating at 255.
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ // Per-byte |p1 - q1| >> 1: bit 0 is cleared first so the 16-bit shift
+ // cannot leak a bit into the neighbouring byte.
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ // 0xff where the sum exceeds blimit, 0x00 otherwise.
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ // Fold the neighbour differences in with unsigned max so one comparison
+ // against limit covers everything.
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ // 0xff where the folded maximum is <= limit, 0x00 otherwise.
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // Every mask byte is zero: no column needs filtering, leave memory untouched.
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ // Flip the top bit to move the pixels from unsigned [0, 255] to signed
+ // [-128, 127] so the saturating signed byte arithmetic below applies.
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // filt = ((p1 - q1) & hev) + 3 * (q0 - p0), saturating, then masked.
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // filter1 >>= 3 as a per-byte arithmetic shift: logical 16-bit shift, keep
+ // the low five bits of each byte, then OR 0xe0 into bytes that were negative
+ // to restore the sign bits.
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ // q0 -= filter1, then back to the unsigned pixel domain.
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ // Same emulated arithmetic shift for filter2.
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ // p0 += filter2, then back to the unsigned pixel domain.
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ // filt = (filter1 + 1) >> 1 (arithmetic, per byte), applied to p1/q1 only in
+ // columns where hev is NOT set.
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // Write the four filtered rows back (unaligned stores).
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+}
+
+// Filters a vertical edge over 16 rows by transposing the 16x16 pixel block
+// around it into an aligned scratch buffer, running the horizontal 14-tap
+// filter entry point on that buffer, and transposing the result back.
+//
+// s       pixel at (first row, first column right of the edge); columns
+//         s - 8 .. s + 7 of 16 rows are read and rewritten.
+// pitch   byte distance between consecutive rows of the image.
+// _blimit0, _limit0, _thresh0 are forwarded unchanged to the horizontal
+// filter.
+void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
+                                   const uint8_t *_blimit0,
+                                   const uint8_t *_limit0,
+                                   const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+  // Image rows 0..7 and 8..15, starting eight columns left of the edge.
+  unsigned char *const left_top = s - 8;
+  unsigned char *const left_bottom = left_top + 8 * pitch;
+  // Second half of the 16-byte-stride scratch buffer; after the forward
+  // transposes it starts with the data taken from column s.
+  unsigned char *const scratch_edge = t_dst + 8 * 16;
+
+  // Forward transpose: left eight columns first, then the right eight.
+  transpose_16x8(left_top, left_bottom, pitch, t_dst, 16);
+  transpose_16x8(s, s + 8 * pitch, pitch, scratch_edge, 16);
+
+  // In the scratch buffer the edge is horizontal.
+  aom_lpf_horizontal_14_quad(scratch_edge, 16, _blimit0, _limit0, _thresh0);
+
+  // Inverse transpose: image rows 0..7 first, then rows 8..15.
+  transpose_16x8(t_dst, scratch_edge, 16, left_top, pitch);
+  transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, left_bottom, pitch);
+}
+
+// Filters a vertical edge over 16 rows: the eight columns s - 4 .. s + 3 are
+// transposed into an aligned 16-byte-stride scratch buffer, the horizontal
+// 8-tap filter entry point runs on that buffer, and the result is transposed
+// back over the same image area.
+//
+// s       pixel at (first row, first column right of the edge).
+// pitch   byte distance between consecutive rows of the image.
+// _blimit0, _limit0, _thresh0 are forwarded unchanged to the horizontal
+// filter.
+void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+  // Four columns left of the edge, for image rows 0..7 and 8..15.
+  uint8_t *const left = s - 4;
+  uint8_t *const left_lower = left + pitch * 8;
+
+  // Forward transpose (16 rows x 8 columns -> 8 rows x 16 columns).
+  transpose_16x8(left, left_lower, pitch, t_dst, 16);
+
+  // Scratch row 4 holds what was image column s, i.e. the q0 row.
+  aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+  // Inverse transpose back into the image.
+  transpose_16x8_to_8x16(t_dst, 16, left, pitch);
+}
+
+// Filters a vertical edge over 16 rows: the eight columns s - 4 .. s + 3 are
+// transposed into an aligned 16-byte-stride scratch buffer, the horizontal
+// 6-tap filter entry point runs on that buffer, and the result is transposed
+// back over the same image area.
+//
+// s       pixel at (first row, first column right of the edge).
+// pitch   byte distance between consecutive rows of the image.
+// _blimit0, _limit0, _thresh0 are forwarded unchanged to the horizontal
+// filter.
+void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+  // Four columns left of the edge, for image rows 0..7 and 8..15.
+  uint8_t *const left = s - 4;
+  uint8_t *const left_lower = left + pitch * 8;
+
+  // Forward transpose: (w x h) 8x16 in the image becomes 16x8 in t_dst.
+  transpose_16x8(left, left_lower, pitch, t_dst, 16);
+
+  // Scratch row 4 holds what was image column s, i.e. the q0 row.
+  aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+  // Inverse transpose: (w x h) 16x8 in t_dst back to 8x16 in the image.
+  transpose_16x8_to_8x16(t_dst, 16, left, pitch);
+}
+
+// Filters a vertical edge over 16 rows: the eight columns s - 4 .. s + 3 are
+// transposed into an aligned 16-byte-stride scratch buffer, the SSE2
+// horizontal 4-tap filter runs on that buffer, and the result is transposed
+// back over the same image area.
+//
+// s       pixel at (first row, first column right of the edge).
+// pitch   byte distance between consecutive rows of the image.
+// _blimit0, _limit0, _thresh0 are forwarded unchanged to the horizontal
+// filter.
+void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
+                                  const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+  // Four columns left of the edge, for image rows 0..7 and 8..15.
+  uint8_t *const left = s - 4;
+  uint8_t *const left_lower = left + pitch * 8;
+
+  // Forward transpose (16 rows x 8 columns -> 8 rows x 16 columns).
+  transpose_16x8(left, left_lower, pitch, t_dst, 16);
+
+  // Scratch row 4 holds what was image column s, i.e. the q0 row. The SSE2
+  // implementation is called directly here.
+  aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
+                                 _thresh0);
+
+  // Inverse transpose back into the image.
+  transpose_16x8_to_8x16(t_dst, 16, left, pitch);
+}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
new file mode 100644
index 0000000000..45464e80b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+// Unaligned 8-byte stores of one half of a 16-byte value (presumably an
+// __m128i -- the macros only require that v is an lvalue, since its address
+// is taken). Going through memcpy places no alignment requirement on dst.
+// NOTE(review): memcpy is declared in <string.h>, which is not among the
+// includes visible above -- confirm every includer provides it.
+//
+// mm_storelu: copy bytes 0..7 (the low half) of v to dst.
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+// mm_storehu: copy bytes 8..15 (the high half) of v to dst.
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
+// Transposes a 6x6 block of 16-bit samples.
+//
+// x0..x5  input rows; only lanes 0..5 of each are meaningful.
+// d0..d5  output rows; lanes 0..5 of dN hold column N of the input, lanes 6..7
+//         are don't-care.
+//
+// The input and output pointers must not alias: x0..x5 are read again after
+// d0..d2 have been written.
+//
+// Lane diagrams list lane 0 first; "rc" means row r, column c of the input.
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *d0, __m128i *d1,
+                                            __m128i *d2, __m128i *d3,
+                                            __m128i *d4, __m128i *d5) {
+  __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+  // 00 01 02 03 04 05 xx xx
+  // 10 11 12 13 14 15 xx xx
+  // 20 21 22 23 24 25 xx xx
+  // 30 31 32 33 34 35 xx xx
+  // 40 41 42 43 44 45 xx xx
+  // 50 51 52 53 54 55 xx xx
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
+  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
+  // Column 1 needs "41 51" in lanes 4..5. They sit in lanes 2..3 of w2, so w2
+  // must move UP by two lanes (4 bytes) before the high halves are combined.
+  // The previous right shift put "43 53" there instead, duplicating column 3.
+  *d1 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 42 52
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  *d2 = _mm_unpacklo_epi64(ww0,
+                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 43 53
+
+  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
+  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
+  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
+
+  // Here the right shift is correct: it brings "43 53" from lanes 6..7 down
+  // to lanes 4..5, which is where the high-half unpack picks them up.
+  *d3 = _mm_unpackhi_epi64(ww0,
+                           _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53 00 00
+
+  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
+  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
+  *d5 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
+}
+
+// Transposes columns 0..3 of four rows of 16-bit samples.
+//
+// x0..x3  input rows (only lanes 0..3 are used).
+// d0..d3  dN receives column N in lanes 0..3; lanes 4..7 are zero.
+//
+// All inputs are read before any output is written.
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                                    __m128i *x2, __m128i *x3,
+                                                    __m128i *d0, __m128i *d1,
+                                                    __m128i *d2, __m128i *d3) {
+  // Interleave the rows pairwise.
+  const __m128i rows01 =
+      _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  const __m128i rows23 =
+      _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+
+  // Each 64-bit half now holds one finished column.
+  const __m128i cols01 =
+      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
+  const __m128i cols23 =
+      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33
+
+  // Split the halves apart, padding the upper 64 bits with zeros.
+  const __m128i zeros = _mm_setzero_si128();
+  *d0 = _mm_unpacklo_epi64(cols01, zeros);  // 00 10 20 30 00 00 00 00
+  *d1 = _mm_unpackhi_epi64(cols01, zeros);  // 01 11 21 31 00 00 00 00
+  *d2 = _mm_unpacklo_epi64(cols23, zeros);  // 02 12 22 32 00 00 00 00
+  *d3 = _mm_unpackhi_epi64(cols23, zeros);  // 03 13 23 33 00 00 00 00
+}
+
+// Transposes columns 4..7 of four rows of 16-bit samples.
+//
+// x0..x3  input rows (only lanes 4..7 are used).
+// d4..d7  dN receives column N in lanes 0..3; lanes 4..7 are zero.
+//
+// All inputs are read before any output is written.
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+                                                     __m128i *x2, __m128i *x3,
+                                                     __m128i *d4, __m128i *d5,
+                                                     __m128i *d6, __m128i *d7) {
+  // Interleave the upper halves of the rows pairwise.
+  const __m128i rows01 =
+      _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  const __m128i rows23 =
+      _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+
+  // Each 64-bit half now holds one finished column.
+  const __m128i cols45 =
+      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
+  const __m128i cols67 =
+      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37
+
+  // Split the halves apart, padding the upper 64 bits with zeros.
+  const __m128i zeros = _mm_setzero_si128();
+  *d4 = _mm_unpacklo_epi64(cols45, zeros);  // 04 14 24 34 00 00 00 00
+  *d5 = _mm_unpackhi_epi64(cols45, zeros);  // 05 15 25 35 00 00 00 00
+  *d6 = _mm_unpacklo_epi64(cols67, zeros);  // 06 16 26 36 00 00 00 00
+  *d7 = _mm_unpackhi_epi64(cols67, zeros);  // 07 17 27 37 00 00 00 00
+}
+
+// Transposes four rows of eight 16-bit samples into eight rows of four.
+//
+// The input (x) and output (d) pointers must be different objects: the inputs
+// are not copied to locals, and x0..x3 are read again by the second helper
+// after the first helper has already written d0..d3.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+ // Columns 0..3 go to d0..d3, columns 4..7 to d4..d7.
+ highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+ highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
+}
+
+// Produces the first four rows of the transpose of an 8x8 block of 16-bit
+// samples, i.e. columns 0..3 of the input.
+//
+// input
+//   x0 00 01 02 03 04 05 06 07
+//   x1 10 11 12 13 14 15 16 17
+//   x2 20 21 22 23 24 25 26 27
+//   x3 30 31 32 33 34 35 36 37
+//   x4 40 41 42 43 44 45 46 47
+//   x5 50 51 52 53 54 55 56 57
+//   x6 60 61 62 63 64 65 66 67
+//   x7 70 71 72 73 74 75 76 77
+// output
+//   d0 00 10 20 30 40 50 60 70
+//   d1 01 11 21 31 41 51 61 71
+//   d2 02 12 22 32 42 52 62 72
+//   d3 03 13 23 33 43 53 63 73
+//
+// All inputs are read before any output is written.
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *x4, __m128i *x5,
+                                                __m128i *x6, __m128i *x7,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3) {
+  // Stage 1: interleave the low halves of neighbouring rows.
+  const __m128i r01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  const __m128i r23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  const __m128i r45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  const __m128i r67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
+
+  // Stage 2: gather four rows per 64-bit half.
+  const __m128i top_c01 =
+      _mm_unpacklo_epi32(r01, r23);  // 00 10 20 30 01 11 21 31
+  const __m128i bot_c01 =
+      _mm_unpacklo_epi32(r45, r67);  // 40 50 60 70 41 51 61 71
+  const __m128i top_c23 =
+      _mm_unpackhi_epi32(r01, r23);  // 02 12 22 32 03 13 23 33
+  const __m128i bot_c23 =
+      _mm_unpackhi_epi32(r45, r67);  // 42 52 62 72 43 53 63 73
+
+  // Stage 3: join the top and bottom halves of each column.
+  *d0 = _mm_unpacklo_epi64(top_c01, bot_c01);  // 00 10 20 30 40 50 60 70
+  *d1 = _mm_unpackhi_epi64(top_c01, bot_c01);  // 01 11 21 31 41 51 61 71
+  *d2 = _mm_unpacklo_epi64(top_c23, bot_c23);  // 02 12 22 32 42 52 62 72
+  *d3 = _mm_unpackhi_epi64(top_c23, bot_c23);  // 03 13 23 33 43 53 63 73
+}
+
+// Produces the last four rows of the transpose of an 8x8 block of 16-bit
+// samples, i.e. columns 4..7 of the input.
+//
+// input
+//   x0 00 01 02 03 04 05 06 07
+//   x1 10 11 12 13 14 15 16 17
+//   x2 20 21 22 23 24 25 26 27
+//   x3 30 31 32 33 34 35 36 37
+//   x4 40 41 42 43 44 45 46 47
+//   x5 50 51 52 53 54 55 56 57
+//   x6 60 61 62 63 64 65 66 67
+//   x7 70 71 72 73 74 75 76 77
+// output
+//   d4 04 14 24 34 44 54 64 74
+//   d5 05 15 25 35 45 55 65 75
+//   d6 06 16 26 36 46 56 66 76
+//   d7 07 17 27 37 47 57 67 77
+//
+// All inputs are read before any output is written.
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+                                                 __m128i *x2, __m128i *x3,
+                                                 __m128i *x4, __m128i *x5,
+                                                 __m128i *x6, __m128i *x7,
+                                                 __m128i *d4, __m128i *d5,
+                                                 __m128i *d6, __m128i *d7) {
+  // Stage 1: interleave the high halves of neighbouring rows.
+  const __m128i r01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  const __m128i r23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+  const __m128i r45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
+  const __m128i r67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
+
+  // Stage 2: gather four rows per 64-bit half.
+  const __m128i top_c45 =
+      _mm_unpacklo_epi32(r01, r23);  // 04 14 24 34 05 15 25 35
+  const __m128i bot_c45 =
+      _mm_unpacklo_epi32(r45, r67);  // 44 54 64 74 45 55 65 75
+  const __m128i top_c67 =
+      _mm_unpackhi_epi32(r01, r23);  // 06 16 26 36 07 17 27 37
+  const __m128i bot_c67 =
+      _mm_unpackhi_epi32(r45, r67);  // 46 56 66 76 47 57 67 77
+
+  // Stage 3: join the top and bottom halves of each column.
+  *d4 = _mm_unpacklo_epi64(top_c45, bot_c45);  // 04 14 24 34 44 54 64 74
+  *d5 = _mm_unpackhi_epi64(top_c45, bot_c45);  // 05 15 25 35 45 55 65 75
+  *d6 = _mm_unpacklo_epi64(top_c67, bot_c67);  // 06 16 26 36 46 56 66 76
+  *d7 = _mm_unpackhi_epi64(top_c67, bot_c67);  // 07 17 27 37 47 57 67 77
+}
+
+// Transposes an 8x8 block of 16-bit samples: dN receives column N of the rows
+// x0..x7.
+//
+// The input (x) and output (d) pointers must be different objects: the inputs
+// are not copied to locals, and x0..x7 are read again by the second helper
+// after the first helper has already written d0..d3.
+static INLINE void highbd_transpose8x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // Columns 0..3 go to d0..d3, columns 4..7 to d4..d7.
+ highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+ highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// Runs two independent 8x8 transposes of 16-bit samples.
+//
+// Every x and d argument points at an array of two __m128i. Element [0] of
+// x0..x7 is transposed into element [0] of d0..d7, then element [1] of x0..x7
+// into element [1] of d0..d7.
+//
+// The input and output arrays must not overlap: the inputs are not copied to
+// locals, so writing an output can change a value that is read later.
+static INLINE void highbd_transpose8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  // First 8x8 block: element [0] of every array.
+  highbd_transpose8x8_sse2(&x0[0], &x1[0], &x2[0], &x3[0], &x4[0], &x5[0],
+                           &x6[0], &x7[0], &d0[0], &d1[0], &d2[0], &d3[0],
+                           &d4[0], &d5[0], &d6[0], &d7[0]);
+  // Second 8x8 block: element [1] of every array.
+  highbd_transpose8x8_sse2(&x0[1], &x1[1], &x2[1], &x3[1], &x4[1], &x5[1],
+                           &x6[1], &x7[1], &d0[1], &d1[1], &d2[1], &d3[1],
+                           &d4[1], &d5[1], &d6[1], &d7[1]);
+}
+
+// Low bit depth functions
+// Transposes columns 0..3 of four rows of 8-bit samples.
+//
+// input (only bytes 0..7 of each row are used)
+//   x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+//   x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+//   x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+//   x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+// output (only bytes 0..3 of each row are meaningful)
+//   d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+//
+// d1..d3 are derived by reading *d0 back, so d0 must not alias d1 or d2.
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                             __m128i *x2, __m128i *x3,
+                                             __m128i *d0, __m128i *d1,
+                                             __m128i *d2, __m128i *d3) {
+  // Interleave the rows pairwise, byte by byte.
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
+
+  // All four columns, packed four bytes each:
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  *d0 = _mm_unpacklo_epi16(rows01, rows23);
+
+  // Shift each later column down to byte 0 of its own register.
+  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
+  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
+  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
+}
+
+// Transposes four rows of eight 8-bit samples into eight rows of four.
+//
+// input (only bytes 0..7 of each row are used)
+//   x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+//   x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+//   x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+//   x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+// output (only bytes 0..3 of each row are meaningful)
+//   d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d4 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d5 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d6 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+//   d7 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+//
+// All inputs are read before any output is written.
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *d0, __m128i *d1,
+                                         __m128i *d2, __m128i *d3, __m128i *d4,
+                                         __m128i *d5, __m128i *d6,
+                                         __m128i *d7) {
+  // Interleave the rows pairwise, byte by byte.
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
+
+  // Four finished columns per register, four bytes each.
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i cols0123 = _mm_unpacklo_epi16(rows01, rows23);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  const __m128i cols4567 = _mm_unpackhi_epi16(rows01, rows23);
+
+  // Move each column down to byte 0 of its own output register.
+  *d0 = cols0123;                      // 00 10 20 30 ...
+  *d1 = _mm_srli_si128(cols0123, 4);   // 01 11 21 31 ...
+  *d2 = _mm_srli_si128(cols0123, 8);   // 02 12 22 32 ...
+  *d3 = _mm_srli_si128(cols0123, 12);  // 03 13 23 33 ...
+
+  *d4 = cols4567;                      // 04 14 24 34 ...
+  *d5 = _mm_srli_si128(cols4567, 4);   // 05 15 25 35 ...
+  *d6 = _mm_srli_si128(cols4567, 8);   // 06 16 26 36 ...
+  *d7 = _mm_srli_si128(cols4567, 12);  // 07 17 27 37 ...
+}
+
+// Produces the first four rows of the transpose of an 8x8 block of 8-bit
+// samples, i.e. columns 0..3 of the input.
+//
+// input (only bytes 0..7 of each row are used)
+//   x0 00 01 02 03 04 05 06 07
+//   x1 10 11 12 13 14 15 16 17
+//   x2 20 21 22 23 24 25 26 27
+//   x3 30 31 32 33 34 35 36 37
+//   x4 40 41 42 43 44 45 46 47
+//   x5 50 51 52 53 54 55 56 57
+//   x6 60 61 62 63 64 65 66 67
+//   x7 70 71 72 73 74 75 76 77
+// output (only bytes 0..7 of each row are meaningful)
+//   d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+//   d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+//   d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+//   d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+//
+// d1 is derived by reading *d0 back and d3 by reading *d2 back.
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *x4, __m128i *x5,
+                                         __m128i *x6, __m128i *x7, __m128i *d0,
+                                         __m128i *d1, __m128i *d2,
+                                         __m128i *d3) {
+  // Stage 1: interleave neighbouring rows byte by byte.
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);
+
+  // Stage 2: columns 0..3 for rows 0..3 and for rows 4..7.
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i top = _mm_unpacklo_epi16(r01, r23);
+  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  const __m128i bottom = _mm_unpacklo_epi16(r45, r67);
+
+  // Stage 3: two full columns per register; the odd one is shifted down.
+  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d0 = _mm_unpacklo_epi32(top, bottom);
+  *d1 = _mm_srli_si128(*d0, 8);
+  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d2 = _mm_unpackhi_epi32(top, bottom);
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+// Transposes an 8x8 block of 8-bit samples, packing two output rows into each
+// result register.
+//
+// input (only bytes 0..7 of each row are used)
+//   x0 00 01 02 03 04 05 06 07
+//   x1 10 11 12 13 14 15 16 17
+//   x2 20 21 22 23 24 25 26 27
+//   x3 30 31 32 33 34 35 36 37
+//   x4 40 41 42 43 44 45 46 47
+//   x5 50 51 52 53 54 55 56 57
+//   x6 60 61 62 63 64 65 66 67
+//   x7 70 71 72 73 74 75 76 77
+// output
+//   d0d1 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+//   d2d3 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+//   d4d5 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+//   d6d7 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+//
+// All inputs are read before any output is written.
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                     __m128i *x3, __m128i *x4, __m128i *x5,
+                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
+                                     __m128i *d2d3, __m128i *d4d5,
+                                     __m128i *d6d7) {
+  // Stage 1: interleave neighbouring rows byte by byte.
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);
+
+  // Stage 2: four rows per column group.
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i top_c0123 = _mm_unpacklo_epi16(r01, r23);
+  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  const __m128i bot_c0123 = _mm_unpacklo_epi16(r45, r67);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  const __m128i top_c4567 = _mm_unpackhi_epi16(r01, r23);
+  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+  const __m128i bot_c4567 = _mm_unpackhi_epi16(r45, r67);
+
+  // Stage 3: two complete columns per output register.
+  *d0d1 = _mm_unpacklo_epi32(top_c0123, bot_c0123);
+  *d2d3 = _mm_unpackhi_epi32(top_c0123, bot_c0123);
+  *d4d5 = _mm_unpacklo_epi32(top_c4567, bot_c4567);
+  *d6d7 = _mm_unpackhi_epi32(top_c4567, bot_c4567);
+}
+
+// Transposes sixteen rows of eight 8-bit samples into eight rows of sixteen.
+//
+// x0..x15  input rows; only bytes 0..7 of each are used.
+// d0..d7   dN receives column N: bytes 0..7 come from rows x0..x7 and bytes
+//          8..15 from rows x8..x15.
+//
+// All inputs are read before any output is written.
+static INLINE void transpose16x8_8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+  // Stage 1: byte-interleave neighbouring rows ("top" = rows 0..7,
+  // "bot" = rows 8..15).
+  const __m128i top_r01 = _mm_unpacklo_epi8(*x0, *x1);
+  const __m128i top_r23 = _mm_unpacklo_epi8(*x2, *x3);
+  const __m128i top_r45 = _mm_unpacklo_epi8(*x4, *x5);
+  const __m128i top_r67 = _mm_unpacklo_epi8(*x6, *x7);
+
+  const __m128i bot_r01 = _mm_unpacklo_epi8(*x8, *x9);
+  const __m128i bot_r23 = _mm_unpacklo_epi8(*x10, *x11);
+  const __m128i bot_r45 = _mm_unpacklo_epi8(*x12, *x13);
+  const __m128i bot_r67 = _mm_unpacklo_epi8(*x14, *x15);
+
+  // Stage 2: groups of four rows, split by column range (0..3 / 4..7).
+  const __m128i top_a_lo = _mm_unpacklo_epi16(top_r01, top_r23);
+  const __m128i top_b_lo = _mm_unpacklo_epi16(top_r45, top_r67);
+  const __m128i bot_a_lo = _mm_unpacklo_epi16(bot_r01, bot_r23);
+  const __m128i bot_b_lo = _mm_unpacklo_epi16(bot_r45, bot_r67);
+
+  const __m128i top_a_hi = _mm_unpackhi_epi16(top_r01, top_r23);
+  const __m128i top_b_hi = _mm_unpackhi_epi16(top_r45, top_r67);
+  const __m128i bot_a_hi = _mm_unpackhi_epi16(bot_r01, bot_r23);
+  const __m128i bot_b_hi = _mm_unpackhi_epi16(bot_r45, bot_r67);
+
+  // Stage 3: eight rows of two adjacent columns per register.
+  const __m128i top_c01 = _mm_unpacklo_epi32(top_a_lo, top_b_lo);
+  const __m128i top_c23 = _mm_unpackhi_epi32(top_a_lo, top_b_lo);
+  const __m128i bot_c01 = _mm_unpacklo_epi32(bot_a_lo, bot_b_lo);
+  const __m128i bot_c23 = _mm_unpackhi_epi32(bot_a_lo, bot_b_lo);
+
+  const __m128i top_c45 = _mm_unpacklo_epi32(top_a_hi, top_b_hi);
+  const __m128i top_c67 = _mm_unpackhi_epi32(top_a_hi, top_b_hi);
+  const __m128i bot_c45 = _mm_unpacklo_epi32(bot_a_hi, bot_b_hi);
+  const __m128i bot_c67 = _mm_unpackhi_epi32(bot_a_hi, bot_b_hi);
+
+  // Stage 4: stitch rows 0..7 and rows 8..15 of each column together.
+  *d0 = _mm_unpacklo_epi64(top_c01, bot_c01);
+  *d1 = _mm_unpackhi_epi64(top_c01, bot_c01);
+  *d2 = _mm_unpacklo_epi64(top_c23, bot_c23);
+  *d3 = _mm_unpackhi_epi64(top_c23, bot_c23);
+
+  *d4 = _mm_unpacklo_epi64(top_c45, bot_c45);
+  *d5 = _mm_unpackhi_epi64(top_c45, bot_c45);
+  *d6 = _mm_unpacklo_epi64(top_c67, bot_c67);
+  *d7 = _mm_unpackhi_epi64(top_c67, bot_c67);
+}
+
+// Transposes eight rows of sixteen 8-bit samples into sixteen columns of
+// eight, packed two columns per output register.
+//
+// x0..x7  input rows, all 16 bytes used.
+// Outputs, in parameter order: the k-th output register (k = 0..7) holds
+// input column k in bytes 0..7 and input column k + 8 in bytes 8..15, each
+// listed from row x0 to row x7. Note that this pairing (k with k + 8) is what
+// the unpack sequence below produces; it is not the adjacent-column pairing
+// the parameter names might suggest.
+//
+// All inputs are read before any output is written.
+static INLINE void transpose8x16_16x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+    __m128i *d12d13, __m128i *d14d15) {
+  // Stage 1: byte-interleave neighbouring rows ("left" = columns 0..7,
+  // "right" = columns 8..15).
+  const __m128i left_r01 = _mm_unpacklo_epi8(*x0, *x1);
+  const __m128i left_r23 = _mm_unpacklo_epi8(*x2, *x3);
+  const __m128i left_r45 = _mm_unpacklo_epi8(*x4, *x5);
+  const __m128i left_r67 = _mm_unpacklo_epi8(*x6, *x7);
+
+  const __m128i right_r01 = _mm_unpackhi_epi8(*x0, *x1);
+  const __m128i right_r23 = _mm_unpackhi_epi8(*x2, *x3);
+  const __m128i right_r45 = _mm_unpackhi_epi8(*x4, *x5);
+  const __m128i right_r67 = _mm_unpackhi_epi8(*x6, *x7);
+
+  // Stage 2: groups of four rows; "_lo" keeps the first four columns of each
+  // side, "_hi" the last four.
+  const __m128i left_a_lo = _mm_unpacklo_epi16(left_r01, left_r23);
+  const __m128i left_b_lo = _mm_unpacklo_epi16(left_r45, left_r67);
+  const __m128i right_a_lo = _mm_unpacklo_epi16(right_r01, right_r23);
+  const __m128i right_b_lo = _mm_unpacklo_epi16(right_r45, right_r67);
+
+  const __m128i left_a_hi = _mm_unpackhi_epi16(left_r01, left_r23);
+  const __m128i left_b_hi = _mm_unpackhi_epi16(left_r45, left_r67);
+  const __m128i right_a_hi = _mm_unpackhi_epi16(right_r01, right_r23);
+  const __m128i right_b_hi = _mm_unpackhi_epi16(right_r45, right_r67);
+
+  // Stage 3: all eight rows of two adjacent columns per register.
+  const __m128i c00_01 = _mm_unpacklo_epi32(left_a_lo, left_b_lo);
+  const __m128i c02_03 = _mm_unpackhi_epi32(left_a_lo, left_b_lo);
+  const __m128i c08_09 = _mm_unpacklo_epi32(right_a_lo, right_b_lo);
+  const __m128i c10_11 = _mm_unpackhi_epi32(right_a_lo, right_b_lo);
+
+  const __m128i c04_05 = _mm_unpacklo_epi32(left_a_hi, left_b_hi);
+  const __m128i c06_07 = _mm_unpackhi_epi32(left_a_hi, left_b_hi);
+  const __m128i c12_13 = _mm_unpacklo_epi32(right_a_hi, right_b_hi);
+  const __m128i c14_15 = _mm_unpackhi_epi32(right_a_hi, right_b_hi);
+
+  // Stage 4: pair column k (low half) with column k + 8 (high half).
+  *d0d1 = _mm_unpacklo_epi64(c00_01, c08_09);    // col 0 | col 8
+  *d2d3 = _mm_unpackhi_epi64(c00_01, c08_09);    // col 1 | col 9
+  *d4d5 = _mm_unpacklo_epi64(c02_03, c10_11);    // col 2 | col 10
+  *d6d7 = _mm_unpackhi_epi64(c02_03, c10_11);    // col 3 | col 11
+
+  *d8d9 = _mm_unpacklo_epi64(c04_05, c12_13);    // col 4 | col 12
+  *d10d11 = _mm_unpackhi_epi64(c04_05, c12_13);  // col 5 | col 13
+  *d12d13 = _mm_unpacklo_epi64(c06_07, c14_15);  // col 6 | col 14
+  *d14d15 = _mm_unpackhi_epi64(c06_07, c14_15);  // col 7 | col 15
+}
+/* Transposes sixteen 8-byte rows into eight 16-byte rows. Rows 0-7 are read
+ * from in0 and rows 8-15 from in1, both with row stride in_p. Output row k
+ * (k = 0..7, row stride out_p) is column k of the in0 rows followed by
+ * column k of the in1 rows. Stores use unaligned 16-byte writes.
+ */
+static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+ // in0 rows: load 8 bytes per row; x0..x3 are reused for interleaved pairs.
+ x0 = _mm_loadl_epi64((__m128i *)in0);
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
+ x3 = _mm_unpacklo_epi8(x6, x7);
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // in1 rows: same pattern; x8..x11 are reused for interleaved pairs.
+ x8 = _mm_loadl_epi64((__m128i *)in1);
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+ x8 = _mm_unpacklo_epi8(x8, x9);
+ x5 = _mm_unpacklo_epi16(x2, x3);
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
+ x9 = _mm_unpacklo_epi8(x10, x11);
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
+ x10 = _mm_unpacklo_epi8(x12, x13);
+ x12 = _mm_unpacklo_epi16(x8, x9);
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
+ x11 = _mm_unpacklo_epi8(x14, x15);
+ x13 = _mm_unpacklo_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+ // Columns 4-7 come from the high halves of the interleaved row pairs.
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+/* Transposes an 8-row by 16-column byte block at src (row stride in_p) into
+ * 16 rows of 8 bytes at dst (row stride out_p); output row k is input
+ * column k. The letter diagrams below show the low (a..h) and high (A..H)
+ * 8-column halves side by side rather than literal register contents.
+ * mm_storelu/mm_storehu are defined elsewhere; presumably they store the low
+ * and high 8 bytes of a register -- TODO confirm.
+ */
+static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p,
+ unsigned char *dst, int out_p) {
+ // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
+ // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
+ // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2
+ // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3
+ // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4
+ // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5
+ // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6
+ // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
+ const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));
+ // Stage 1: interleave bytes of adjacent rows (lo = a..h, hi = A..H).
+ // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1
+ // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1
+ // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3
+ // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3
+ // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5
+ // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5
+ // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7
+ // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7
+ const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
+ const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
+ const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
+ const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
+ const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
+ const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
+ const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
+ const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);
+ // Stage 2: interleave 16-bit pairs to gather four rows per column.
+ // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3
+ // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3
+ // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3
+ // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3
+ // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7
+ // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7
+ // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7
+ // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7
+ const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
+ const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
+ const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
+ const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
+ const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
+ const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
+ const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
+ const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);
+ // Stage 3: interleave 32-bit groups; each register holds two full columns.
+ // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7
+ // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7
+ // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7
+ // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7
+ // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7
+ // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7
+ // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7
+ // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7
+ const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
+ const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
+ const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
+ const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
+ const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
+ const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
+ const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
+ const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);
+ // Write the 16 output rows: two rows (low half, high half) per register.
+ mm_storelu(dst, x_s30);
+ mm_storehu(dst + (1 * out_p), x_s30);
+ mm_storelu(dst + (2 * out_p), x_s31);
+ mm_storehu(dst + (3 * out_p), x_s31);
+ mm_storelu(dst + (4 * out_p), x_s32);
+ mm_storehu(dst + (5 * out_p), x_s32);
+ mm_storelu(dst + (6 * out_p), x_s33);
+ mm_storehu(dst + (7 * out_p), x_s33);
+ mm_storelu(dst + (8 * out_p), x_s34);
+ mm_storehu(dst + (9 * out_p), x_s34);
+ mm_storelu(dst + (10 * out_p), x_s35);
+ mm_storehu(dst + (11 * out_p), x_s35);
+ mm_storelu(dst + (12 * out_p), x_s36);
+ mm_storehu(dst + (13 * out_p), x_s36);
+ mm_storelu(dst + (14 * out_p), x_s37);
+ mm_storehu(dst + (15 * out_p), x_s37);
+}
+/* Transposes num_8x8_to_transpose independent 8x8 byte blocks. Block i is
+ * read from src[i] (row stride in_p) and written to dst[i] (row stride
+ * out_p). The do/while body runs at least once, so src[0] and dst[0] are
+ * used even if num_8x8_to_transpose is not positive.
+ * mm_storelu/mm_storehu are defined elsewhere; presumably they store the low
+ * and high 8 bytes of a register -- TODO confirm.
+ */
+static INLINE void transpose_8xn(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+ // Load row pairs; x0..x3 are then reused for the interleaved results.
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+ // Output rows 0-3 (input columns 0-3).
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
+ mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
+ mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
+ // Output rows 4-7 (input columns 4-7).
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
+ mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
+ mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 0000000000..799ce9ef44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+/* Adds the masked SAD of one 16-pixel group for reference <idx> to res<idx>.
+ * Expands inside masked_sadx4d_ssse3 and relies on that function's locals:
+ * ref<idx>, x, b, m, m_inv, src and the scratch registers a, data_l/r,
+ * mask_l/r, pred_l/r and pred. Reference and b bytes are interleaved and
+ * multiplied by the interleaved (m, m_inv) weights with _mm_maddubs_epi16,
+ * passed through xx_roundn_epu16 (defined elsewhere; presumably a rounding
+ * right shift by AOM_BLEND_A64_ROUND_BITS), packed back to bytes and compared
+ * against src with _mm_sad_epu8.
+ */
+#define MASK_SAD16XH_ONE_REF(idx) \
+ a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \
+ data_l = _mm_unpacklo_epi8(a, b); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpackhi_epi8(a, b); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+/* Computes four masked SADs at once: each of the four references in a_ptr[]
+ * is blended with b_ptr using the mask m_ptr, and the sum of absolute
+ * differences against src_ptr is accumulated. The four totals are written
+ * to sad_array[0..3] with one unaligned 16-byte store.
+ * The inner loop consumes 16 pixels per step, so width is expected to be a
+ * multiple of 16. When inv_mask is nonzero the mask and its complement
+ * (mask_max - mask) swap roles.
+ */
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, int inv_mask,
+ unsigned sad_array[4]) {
+ int x, y;
+ __m128i a;
+ __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const uint8_t *ref0 = a_ptr[0];
+ const uint8_t *ref1 = a_ptr[1];
+ const uint8_t *ref2 = a_ptr[2];
+ const uint8_t *ref3 = a_ptr[3];
+ // Local ref0..ref3 are advanced per row; the caller's a_ptr[] is untouched.
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+ // The same src/b/mask registers are shared by all four references.
+ MASK_SAD16XH_ONE_REF(0)
+ MASK_SAD16XH_ONE_REF(1)
+ MASK_SAD16XH_ONE_REF(2)
+ MASK_SAD16XH_ONE_REF(3)
+ }
+
+ src_ptr += src_stride;
+ ref0 += a_stride;
+ ref1 += a_stride;
+ ref2 += a_stride;
+ ref3 += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+ // Low 64 bits of res0 now hold sad0, sad1; low 64 bits of res2 sad2, sad3.
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+/* Adds the masked SAD of two 8-pixel rows for reference <idx> to res<idx>.
+ * Expands inside aom_masked_sad8xhx4d_ssse3 and relies on its locals:
+ * ref<idx>, a_stride, b0, b1, m, m_inv, src and the scratch registers. The
+ * "l" half handles the first row (ref<idx> with b0 and the low 8 mask
+ * bytes); the "r" half handles the second row (ref<idx> + a_stride with b1
+ * and the high 8 mask bytes). It declares a<idx>0 and a<idx>1, so it can be
+ * expanded only once per scope for a given idx.
+ */
+#define MASK_SAD8XH_ONE_REF(idx) \
+ const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \
+ const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \
+ data_l = _mm_unpacklo_epi8(a##idx##0, b0); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpacklo_epi8(a##idx##1, b1); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+/* Masked SAD of an 8-pixel-wide block against four references at once.
+ * Each reference in ref_array[] is blended with b_ptr using the mask m_ptr;
+ * the four SAD totals against src_ptr are written to sad_array[0..3].
+ * Two rows are processed per iteration (y advances by 2), so height is
+ * expected to be even. When inv_mask is nonzero the mask and its complement
+ * swap roles.
+ */
+void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ // Each iteration packs two 8-pixel rows into one 16-byte register.
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
+ const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride));
+ const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr);
+ const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride));
+ __m128i m_copy = _mm_unpacklo_epi64(m0, m1);
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+ // Accumulate the two-row SAD for each of the four references.
+ MASK_SAD8XH_ONE_REF(0)
+ MASK_SAD8XH_ONE_REF(1)
+ MASK_SAD8XH_ONE_REF(2)
+ MASK_SAD8XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+/* Adds the masked SAD of two 4-pixel rows for reference <idx> to res<idx>.
+ * Expands inside aom_masked_sad4xhx4d_ssse3 and relies on its locals:
+ * ref<idx>, a_stride, b, m, m_inv, src and the scratch registers a, data,
+ * mask and pred. The two rows occupy the low 8 bytes of "a"; the prediction
+ * is packed with zeros so only the low 64-bit SAD lane is nonzero.
+ * NOTE(review): the 4-byte loads go through an int pointer cast of a
+ * uint8_t pointer; confirm this is acceptable for unaligned rows.
+ */
+#define MASK_SAD4XH_ONE_REF(idx) \
+ a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \
+ _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+ data = _mm_unpacklo_epi8(a, b); \
+ mask = _mm_unpacklo_epi8(m, m_inv); \
+ pred = _mm_maddubs_epi16(data, mask); \
+ pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+/* Masked SAD of a 4-pixel-wide block against four references at once.
+ * Each reference in ref_array[] is blended with b_ptr using the mask m_ptr;
+ * the four SAD totals against src_ptr are written to sad_array[0..3].
+ * Two rows are processed per iteration (y advances by 2), so height is
+ * expected to be even. When inv_mask is nonzero the mask and its complement
+ * swap roles.
+ */
+void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data, pred, mask;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ __m128i a;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ // Each iteration packs two 4-pixel rows into the low 8 bytes of a register.
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
+ const __m128i m_copy =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
+
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+ // Accumulate the two-row SAD for each of the four references.
+ MASK_SAD4XH_ONE_REF(0)
+ MASK_SAD4XH_ONE_REF(1)
+ MASK_SAD4XH_ONE_REF(2)
+ MASK_SAD4XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_unpacklo_epi32(res0, res1);
+ res2 = _mm_unpacklo_epi32(res2, res3);
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+/* Fixed-size entry points for the four-reference masked SAD.
+ * MASKSADMXN_SSSE3(m, n) forwards to masked_sadx4d_ssse3 with width m and
+ * height n; the 8- and 4-wide variants forward to the dedicated
+ * two-rows-per-iteration kernels. In every case the block width is passed
+ * as the stride of second_pred.
+ */
+#define MASKSADMXN_SSSE3(m, n) \
+ void aom_masked_sad##m##x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n, inv_mask, sad_array); \
+ }
+// 8-wide blocks of height n.
+#define MASKSAD8XN_SSSE3(n) \
+ void aom_masked_sad8x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 8, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+// 4-wide blocks of height n.
+#define MASKSAD4XN_SSSE3(n) \
+ void aom_masked_sad4x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 4, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+// Instantiate one function per supported block size.
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 0000000000..2c022555b5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+/* Masked SAD for blocks whose width is stepped in units of 32 pixels.
+ * Blends a_ptr and b_ptr with weights m and (mask_max - m), then returns the
+ * sum of absolute differences between the blend and src_ptr over
+ * width x height pixels. Multiplying by round_scale =
+ * 1 << (15 - AOM_BLEND_A64_ROUND_BITS) with _mm256_mulhrs_epi16 performs the
+ * rounding right shift by AOM_BLEND_A64_ROUND_BITS.
+ */
+static INLINE unsigned int masked_sad32xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 32) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 32 predicted pixels (16 per unpack half).
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // 'res' now holds four partial SADs, in 32-bit lanes 0, 2, 4 and 6.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+/* Builds a 256-bit register from two unaligned 16-byte loads: the bytes at
+ * lo fill the low 128-bit lane and the bytes at hi fill the high lane.
+ */
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+ __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+ __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+ __m256i a = _mm256_castsi128_si256(a0);
+ return _mm256_inserti128_si256(a, a1, 1);
+}
+/* Masked SAD for 16-pixel-wide blocks. Two rows are loaded per iteration
+ * (first row in the low 128-bit lane, second row in the high lane), so
+ * height is expected to be even. Blends a_ptr and b_ptr with weights m and
+ * (mask_max - m) and returns the SAD of the blend against src_ptr.
+ */
+static INLINE unsigned int masked_sad16xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 32 predicted pixels (two rows of 16).
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // 'res' now holds four partial SADs, in 32-bit lanes 0, 2, 4 and 6.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+/* Dispatches an m x n masked SAD to a width-specific kernel: widths 4 and 8
+ * use the SSSE3 helpers, width 16 uses masked_sad16xh_avx2 and every other
+ * width falls through to masked_sad32xh_avx2. second_pred is addressed with
+ * a stride equal to the block width m. When invert_mask is nonzero, ref and
+ * second_pred (with their strides) swap places in the kernel call.
+ */
+static INLINE unsigned int aom_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+/* Defines aom_masked_sad<m>x<n>_avx2, a fixed-size entry point that forwards
+ * to aom_masked_sad_avx2 with the block width m and height n.
+ */
+#define MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, invert_mask, m, n); \
+ }
+// Instantiate one function per supported block size.
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+/* High-bitdepth masked SAD for 8-pixel-wide blocks. The uint8_t pointers are
+ * converted with CONVERT_TO_SHORTPTR to uint16_t sample pointers; the mask
+ * stays 8-bit and is zero-extended to 16 bits. Two rows are processed per
+ * iteration (first row in the low 128-bit lane, second row in the high
+ * lane), so height is expected to be even.
+ */
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ // Zero-extend mask to 16 bits
+ const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(m_ptr)),
+ _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+ // Blend: a * m + b * m_inv in 32 bits, then rounding shift.
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+/* High-bitdepth masked SAD for blocks whose width is stepped in units of 16
+ * pixels. The uint8_t pointers are converted with CONVERT_TO_SHORTPTR to
+ * uint16_t sample pointers; the mask stays 8-bit and is zero-extended to
+ * 16 bits. One row is processed per outer iteration.
+ */
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m256i m =
+ _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+ // Blend: a * m + b * m_inv in 32 bits, then rounding shift.
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+/* Dispatches an m x n high-bitdepth masked SAD to a width-specific kernel:
+ * width 4 uses the SSSE3 helper, width 8 uses highbd_masked_sad8xh_avx2 and
+ * every other width falls through to highbd_masked_sad16xh_avx2.
+ * second_pred is addressed with a stride equal to the block width m. When
+ * invert_mask is nonzero, ref and second_pred (with their strides) swap
+ * places in the kernel call.
+ */
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+/* Defines aom_highbd_masked_sad<m>x<n>_avx2, a fixed-size entry point that
+ * forwards to aom_highbd_masked_sad_avx2 with block width m and height n.
+ */
+#define HIGHBD_MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+ second_pred8, msk, msk_stride, \
+ invert_mask, m, n); \
+ }
+// Instantiate one function per supported block size.
+HIGHBD_MASKSADMXN_AVX2(4, 4)
+HIGHBD_MASKSADMXN_AVX2(4, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 16)
+HIGHBD_MASKSADMXN_AVX2(32, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 32)
+HIGHBD_MASKSADMXN_AVX2(64, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 128)
+HIGHBD_MASKSADMXN_AVX2(128, 64)
+HIGHBD_MASKSADMXN_AVX2(128, 128)
+HIGHBD_MASKSADMXN_AVX2(4, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 0000000000..df3a8764e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+// Generic masked SAD kernel; 'width' must be a multiple of 16.
+// Declared here because the MASKSADMXN_SSSE3 wrappers below call it before
+// its definition.
+static INLINE unsigned int masked_sad_ssse3(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height);
+
+// Emits aom_masked_sad<W>x<H>_ssse3() on top of masked_sad_ssse3().
+// second_pred is read with a stride of W. A nonzero invert_mask swaps which
+// of ref / second_pred is passed as the first blend input.
+#define MASKSADMXN_SSSE3(W, H)                                               \
+  unsigned int aom_masked_sad##W##x##H##_ssse3(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,        \
+      int msk_stride, int invert_mask) {                                     \
+    return invert_mask                                                       \
+               ? masked_sad_ssse3(src, src_stride, second_pred, W, ref,      \
+                                  ref_stride, msk, msk_stride, W, H)         \
+               : masked_sad_ssse3(src, src_stride, ref, ref_stride,          \
+                                  second_pred, W, msk, msk_stride, W, H);    \
+  }
+
+// Emits aom_masked_sad8x<H>_ssse3() on top of aom_masked_sad8xh_ssse3().
+// second_pred is read with a stride of 8. A nonzero invert_mask swaps which
+// of ref / second_pred is passed as the first blend input.
+#define MASKSAD8XN_SSSE3(H)                                                  \
+  unsigned int aom_masked_sad8x##H##_ssse3(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,        \
+      int msk_stride, int invert_mask) {                                     \
+    return invert_mask                                                       \
+               ? aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8,    \
+                                         ref, ref_stride, msk, msk_stride,   \
+                                         H)                                  \
+               : aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,   \
+                                         second_pred, 8, msk, msk_stride,    \
+                                         H);                                 \
+  }
+
+// Emits aom_masked_sad4x<H>_ssse3() on top of aom_masked_sad4xh_ssse3().
+// second_pred is read with a stride of 4. A nonzero invert_mask swaps which
+// of ref / second_pred is passed as the first blend input.
+#define MASKSAD4XN_SSSE3(H)                                                  \
+  unsigned int aom_masked_sad4x##H##_ssse3(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,        \
+      int msk_stride, int invert_mask) {                                     \
+    return invert_mask                                                       \
+               ? aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4,    \
+                                         ref, ref_stride, msk, msk_stride,   \
+                                         H)                                  \
+               : aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,   \
+                                         second_pred, 4, msk, msk_stride,    \
+                                         H);                                 \
+  }
+
+// Widths that are a multiple of 16: routed through masked_sad_ssse3().
+MASKSADMXN_SSSE3(16, 4)
+MASKSADMXN_SSSE3(16, 8)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(64, 16)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(128, 128)
+// 8-wide blocks: routed through aom_masked_sad8xh_ssse3().
+MASKSAD8XN_SSSE3(4)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(32)
+// 4-wide blocks: routed through aom_masked_sad4xh_ssse3().
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(16)
+
+// Returns the sum of absolute differences between 'src' and the mask-weighted
+// blend of 'a' and 'b'. Each mask byte weights 'a'; its complement with
+// respect to (1 << AOM_BLEND_A64_ROUND_BITS) weights 'b'.
+// 'width' is consumed in steps of 16 pixels.
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  const __m128i max_weight = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  __m128i acc = _mm_setzero_si128();
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 16) {
+      const __m128i src_px = _mm_loadu_si128((const __m128i *)(src_ptr + col));
+      const __m128i a_px = _mm_loadu_si128((const __m128i *)(a_ptr + col));
+      const __m128i b_px = _mm_loadu_si128((const __m128i *)(b_ptr + col));
+      const __m128i w_a = _mm_loadu_si128((const __m128i *)(m_ptr + col));
+      const __m128i w_b = _mm_sub_epi8(max_weight, w_a);
+
+      // Interleave pixels and weights so that each 16-bit lane of the
+      // multiply-add holds a * w_a + b * w_b. That value is at most 64 * 255,
+      // which leaves room for the rounding step.
+      __m128i blend_lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(a_px, b_px),
+                                           _mm_unpacklo_epi8(w_a, w_b));
+      blend_lo = xx_roundn_epu16(blend_lo, AOM_BLEND_A64_ROUND_BITS);
+
+      __m128i blend_hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(a_px, b_px),
+                                           _mm_unpackhi_epi8(w_a, w_b));
+      blend_hi = xx_roundn_epu16(blend_hi, AOM_BLEND_A64_ROUND_BITS);
+
+      const __m128i pred = _mm_packus_epi16(blend_lo, blend_hi);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(pred, src_px));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+
+  // The two partial SADs live in 32-bit lanes 0 and 2 of 'acc'.
+  const __m128i upper = _mm_srli_si128(acc, 8);
+  return (unsigned int)(_mm_cvtsi128_si32(acc) + _mm_cvtsi128_si32(upper));
+}
+
+// Masked SAD for 8-pixel-wide blocks. Two rows are packed into each 128-bit
+// register, so 'height' is consumed two rows per iteration.
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
+  const __m128i max_weight = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  __m128i acc = _mm_setzero_si128();
+
+  for (int rows_left = height; rows_left > 0; rows_left -= 2) {
+    const __m128i a_row0 = _mm_loadl_epi64((const __m128i *)a_ptr);
+    const __m128i a_row1 = _mm_loadl_epi64((const __m128i *)(a_ptr + a_stride));
+    const __m128i b_row0 = _mm_loadl_epi64((const __m128i *)b_ptr);
+    const __m128i b_row1 = _mm_loadl_epi64((const __m128i *)(b_ptr + b_stride));
+
+    // Source and mask: row 0 in the low 8 bytes, row 1 in the high 8 bytes.
+    const __m128i src_rows = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)src_ptr),
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
+    const __m128i w_a = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)m_ptr),
+        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)));
+    const __m128i w_b = _mm_sub_epi8(max_weight, w_a);
+
+    // Row 0 uses the low half of the weights, row 1 the high half.
+    __m128i blend_row0 = _mm_maddubs_epi16(_mm_unpacklo_epi8(a_row0, b_row0),
+                                           _mm_unpacklo_epi8(w_a, w_b));
+    blend_row0 = xx_roundn_epu16(blend_row0, AOM_BLEND_A64_ROUND_BITS);
+
+    __m128i blend_row1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(a_row1, b_row1),
+                                           _mm_unpackhi_epi8(w_a, w_b));
+    blend_row1 = xx_roundn_epu16(blend_row1, AOM_BLEND_A64_ROUND_BITS);
+
+    const __m128i pred = _mm_packus_epi16(blend_row0, blend_row1);
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(pred, src_rows));
+
+    src_ptr += 2 * src_stride;
+    a_ptr += 2 * a_stride;
+    b_ptr += 2 * b_stride;
+    m_ptr += 2 * m_stride;
+  }
+
+  // Combine the partial SADs held in 32-bit lanes 0 and 2.
+  const __m128i upper = _mm_srli_si128(acc, 8);
+  return (unsigned int)(_mm_cvtsi128_si32(acc) + _mm_cvtsi128_si32(upper));
+}
+
+// Masked SAD for 4-pixel-wide blocks: the sum of absolute differences between
+// 'src' and the mask-weighted blend of 'a' and 'b'. Each mask byte weights
+// 'a'; its complement with respect to (1 << AOM_BLEND_A64_ROUND_BITS) weights
+// 'b'. 'height' is consumed two rows per iteration.
+//
+// The 4-byte row loads go through xx_loadl_32() instead of dereferencing a
+// pointer cast to 'int *'. The cast discarded the const qualifier and read the
+// byte buffers through an int lvalue, which also assumed 4-byte alignment.
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height) {
+  int y;
+  __m128i res = _mm_setzero_si128();
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+  for (y = 0; y < height; y += 2) {
+    // Load two rows at a time, this seems to be a bit faster
+    // than four rows at a time in this case.
+    const __m128i src = _mm_unpacklo_epi32(xx_loadl_32(src_ptr),
+                                           xx_loadl_32(&src_ptr[src_stride]));
+    const __m128i a =
+        _mm_unpacklo_epi32(xx_loadl_32(a_ptr), xx_loadl_32(&a_ptr[a_stride]));
+    const __m128i b =
+        _mm_unpacklo_epi32(xx_loadl_32(b_ptr), xx_loadl_32(&b_ptr[b_stride]));
+    const __m128i m =
+        _mm_unpacklo_epi32(xx_loadl_32(m_ptr), xx_loadl_32(&m_ptr[m_stride]));
+    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+    // Interleave pixels with weights so each 16-bit lane of the multiply-add
+    // holds a * m + b * m_inv for one pixel.
+    const __m128i data = _mm_unpacklo_epi8(a, b);
+    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
+    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
+    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
+
+    // Only the low 8 bytes carry pixels; the upper half is zero on both the
+    // prediction and the source, so it adds nothing to the SAD.
+    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
+    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+    src_ptr += src_stride * 2;
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  // At this point, the SAD is stored in lane 0 of 'res'
+  return (unsigned int)_mm_cvtsi128_si32(res);
+}
+
+// Generic high-bitdepth masked SAD kernel; 'width' must be a multiple of 8.
+// Declared here because the HIGHBD_MASKSADMXN_SSSE3 wrappers below call it
+// before its definition.
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *src8, int src_stride,
+    const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride,
+    const uint8_t *m_ptr, int m_stride,
+    int width, int height);
+
+// Emits aom_highbd_masked_sad<W>x<H>_ssse3() on top of
+// highbd_masked_sad_ssse3(). second_pred8 is read with a stride of W. A
+// nonzero invert_mask swaps which of ref8 / second_pred8 is passed as the
+// first blend input.
+#define HIGHBD_MASKSADMXN_SSSE3(W, H)                                        \
+  unsigned int aom_highbd_masked_sad##W##x##H##_ssse3(                       \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,              \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                     \
+    return invert_mask                                                       \
+               ? highbd_masked_sad_ssse3(src8, src_stride, second_pred8, W,  \
+                                         ref8, ref_stride, msk, msk_stride,  \
+                                         W, H)                               \
+               : highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
+                                         second_pred8, W, msk, msk_stride,   \
+                                         W, H);                              \
+  }
+
+// Emits aom_highbd_masked_sad4x<H>_ssse3() on top of
+// aom_highbd_masked_sad4xh_ssse3(). second_pred8 is read with a stride of 4.
+// A nonzero invert_mask swaps which of ref8 / second_pred8 is passed as the
+// first blend input.
+#define HIGHBD_MASKSAD4XN_SSSE3(H)                                           \
+  unsigned int aom_highbd_masked_sad4x##H##_ssse3(                           \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,              \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                     \
+    return invert_mask                                                       \
+               ? aom_highbd_masked_sad4xh_ssse3(src8, src_stride,            \
+                                                second_pred8, 4, ref8,       \
+                                                ref_stride, msk,             \
+                                                msk_stride, H)               \
+               : aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,      \
+                                                ref_stride, second_pred8, 4, \
+                                                msk, msk_stride, H);         \
+  }
+
+// Widths that are a multiple of 8: routed through highbd_masked_sad_ssse3().
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 4)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 8)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 16)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+// 4-wide blocks: routed through aom_highbd_masked_sad4xh_ssse3().
+HIGHBD_MASKSAD4XN_SSSE3(4)
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(16)
+
+// High-bitdepth masked SAD: 'src8', 'a8' and 'b8' are converted to 16-bit
+// sample pointers with CONVERT_TO_SHORTPTR, while the mask stays 8-bit.
+// Returns the sum of absolute differences between the source and the
+// mask-weighted blend of 'a' and 'b'. 'width' is consumed 8 samples at a time.
+static INLINE unsigned int highbd_masked_sad_ssse3(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_row = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_row = CONVERT_TO_SHORTPTR(b8);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i max_weight = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i rounding =
+      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  __m128i acc = zero;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 8) {
+      const __m128i src_px = _mm_loadu_si128((const __m128i *)(src_row + col));
+      const __m128i a_px = _mm_loadu_si128((const __m128i *)(a_row + col));
+      const __m128i b_px = _mm_loadu_si128((const __m128i *)(b_row + col));
+      // Widen the 8 mask bytes to 16 bits by interleaving with zero.
+      const __m128i w_a = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((const __m128i *)(m_ptr + col)), zero);
+      const __m128i w_b = _mm_sub_epi16(max_weight, w_a);
+
+      // 32-bit a * w_a + b * w_b for samples 0..3, then round and shift.
+      __m128i blend_lo = _mm_madd_epi16(_mm_unpacklo_epi16(a_px, b_px),
+                                        _mm_unpacklo_epi16(w_a, w_b));
+      blend_lo = _mm_srai_epi32(_mm_add_epi32(blend_lo, rounding),
+                                AOM_BLEND_A64_ROUND_BITS);
+
+      // Same for samples 4..7.
+      __m128i blend_hi = _mm_madd_epi16(_mm_unpackhi_epi16(a_px, b_px),
+                                        _mm_unpackhi_epi16(w_a, w_b));
+      blend_hi = _mm_srai_epi32(_mm_add_epi32(blend_hi, rounding),
+                                AOM_BLEND_A64_ROUND_BITS);
+
+      // The blended values are at most (2^bd)-1 < 2^15, so the signed
+      // saturating pack cannot clip them.
+      const __m128i pred = _mm_packs_epi32(blend_lo, blend_hi);
+
+      // No 16-bit SAD instruction exists: take |pred - src| per lane and use
+      // a multiply-add by 1 to fold pairs into four 32-bit partial sums.
+      const __m128i abs_diff = _mm_abs_epi16(_mm_sub_epi16(pred, src_px));
+      acc = _mm_add_epi32(acc, _mm_madd_epi16(abs_diff, ones));
+    }
+
+    src_row += src_stride;
+    a_row += a_stride;
+    b_row += b_stride;
+    m_ptr += m_stride;
+  }
+
+  // Fold the four 32-bit partial sums into lane 0.
+  acc = _mm_hadd_epi32(acc, acc);
+  acc = _mm_hadd_epi32(acc, acc);
+  const int total = _mm_cvtsi128_si32(acc);
+  return total;
+}
+
+// High-bitdepth masked SAD for 4-sample-wide blocks. 'src8', 'a8' and 'b8'
+// are converted to 16-bit sample pointers with CONVERT_TO_SHORTPTR, while the
+// mask stays 8-bit. Returns the sum of absolute differences between the
+// source and the mask-weighted blend of 'a' and 'b'. 'height' is consumed two
+// rows per iteration.
+//
+// The 4-byte mask rows are fetched with xx_loadl_32() rather than by
+// dereferencing 'm_ptr' through a 'const int *' cast. The cast read a byte
+// buffer through an int lvalue and assumed 4-byte alignment.
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int height) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+  int y;
+  __m128i res = _mm_setzero_si128();
+  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i round_const =
+      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (y = 0; y < height; y += 2) {
+    // Row 0 goes in the low 64 bits, row 1 in the high 64 bits.
+    const __m128i src = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((const __m128i *)src_ptr),
+        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+    const __m128i a =
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
+                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
+    const __m128i b =
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
+                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
+    // Gather two 4-byte mask rows, then zero-extend the mask to 16 bits
+    const __m128i m = _mm_unpacklo_epi8(
+        _mm_unpacklo_epi32(xx_loadl_32(m_ptr), xx_loadl_32(&m_ptr[m_stride])),
+        _mm_setzero_si128());
+    const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+    // 32-bit a * m + b * m_inv for row 0, then round and shift.
+    const __m128i data_l = _mm_unpacklo_epi16(a, b);
+    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+                            AOM_BLEND_A64_ROUND_BITS);
+
+    // Same for row 1.
+    const __m128i data_r = _mm_unpackhi_epi16(a, b);
+    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+                            AOM_BLEND_A64_ROUND_BITS);
+
+    // Multiply-add by 1 folds the 16-bit absolute differences into four
+    // 32-bit partial sums.
+    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+
+    src_ptr += src_stride * 2;
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  // Fold the four 32-bit partial sums into lane 0.
+  res = _mm_hadd_epi32(res, res);
+  res = _mm_hadd_epi32(res, res);
+  return (unsigned int)_mm_cvtsi128_si32(res);
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 0000000000..cffbd9672c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+// The prototypes below use uint8_t; include its definition here so this
+// header does not depend on what the including file happened to include first.
+#include <stdint.h>
+
+// Masked SAD for 8-pixel-wide blocks of 'height' rows (two rows are processed
+// per iteration). 'm_ptr' supplies the per-pixel blend weight applied to 'a';
+// 'b' receives the complementary weight.
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+// Masked SAD for 4-pixel-wide blocks of 'height' rows (two rows are processed
+// per iteration). Same weighting convention as the 8-wide variant.
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+// High-bitdepth masked SAD for 4-sample-wide blocks. 'src8', 'a8' and 'b8'
+// are converted with CONVERT_TO_SHORTPTR by the implementation; the mask is
+// read as 8-bit values.
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+                                            const uint8_t *a8, int a_stride,
+                                            const uint8_t *b8, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int height);
+
+#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 0000000000..0bf383fffd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1067 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// Forward declarations: the MASK_SUBPIX_VAR*_SSSE3 wrappers below call these
+// helpers before their definitions appear.
+
+// Bilinear sub-pixel filters. Each writes its result to 'dst'.
+// The generic version requires 'w' to be a multiple of 16.
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+                            int yoffset, uint8_t *dst, int w, int h);
+
+// 8-pixel-wide variant.
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+// 4-pixel-wide variant.
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+// Masked variance accumulators. Each reports the sum of squared differences
+// through 'sse' and the sum of differences through 'sum_'.
+// The generic version requires 'width' to be a multiple of 16.
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *a_ptr, int a_stride,
+                            const uint8_t *b_ptr, int b_stride,
+                            const uint8_t *m_ptr, int m_stride, int width,
+                            int height, unsigned int *sse, int *sum_);
+
+// 8-pixel-wide variant; 'a_ptr' and 'b_ptr' take no stride argument.
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+// 4-pixel-wide variant; 'a_ptr' and 'b_ptr' take no stride argument.
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+// Emits aom_masked_sub_pixel_variance<W>x<H>_ssse3(). The source block is
+// bilinear-filtered into a local (H + 1) x W buffer, then the masked variance
+// of 'ref' against the blend of that buffer and 'second_pred' is computed.
+// A nonzero invert_mask swaps the two blend inputs.
+#define MASK_SUBPIX_VAR_SSSE3(W, H)                                          \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(               \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
+      const uint8_t *msk, int msk_stride, int invert_mask,                   \
+      unsigned int *sse) {                                                   \
+    uint8_t filtered[(H + 1) * W];                                           \
+    int sum;                                                                 \
+                                                                             \
+    bilinear_filter(src, src_stride, xoffset, yoffset, filtered, W, H);      \
+                                                                             \
+    /* Both blend inputs have a stride of W, so only the pointers swap. */   \
+    const uint8_t *const blend_a = invert_mask ? second_pred : filtered;     \
+    const uint8_t *const blend_b = invert_mask ? filtered : second_pred;     \
+    masked_variance(ref, ref_stride, blend_a, W, blend_b, W, msk,            \
+                    msk_stride, W, H, sse, &sum);                            \
+                                                                             \
+    const int64_t sum_squared = (int64_t)sum * sum;                          \
+    return *sse - (uint32_t)(sum_squared / (W * H));                         \
+  }
+
+// Emits aom_masked_sub_pixel_variance8x<H>_ssse3(). The source block is
+// bilinear-filtered into a local (H + 1) x 8 buffer, then the masked variance
+// of 'ref' against the blend of that buffer and 'second_pred' is computed.
+// A nonzero invert_mask swaps the two blend inputs.
+#define MASK_SUBPIX_VAR8XH_SSSE3(H)                                          \
+  unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
+      const uint8_t *msk, int msk_stride, int invert_mask,                   \
+      unsigned int *sse) {                                                   \
+    uint8_t filtered[(H + 1) * 8];                                           \
+    int sum;                                                                 \
+                                                                             \
+    bilinear_filter8xh(src, src_stride, xoffset, yoffset, filtered, H);      \
+                                                                             \
+    const uint8_t *const blend_a = invert_mask ? second_pred : filtered;     \
+    const uint8_t *const blend_b = invert_mask ? filtered : second_pred;     \
+    masked_variance8xh(ref, ref_stride, blend_a, blend_b, msk, msk_stride,   \
+                       H, sse, &sum);                                        \
+                                                                             \
+    const int64_t sum_squared = (int64_t)sum * sum;                          \
+    return *sse - (uint32_t)(sum_squared / (8 * H));                         \
+  }
+
+// Emits aom_masked_sub_pixel_variance4x<H>_ssse3(). The source block is
+// bilinear-filtered into a local (H + 1) x 4 buffer, then the masked variance
+// of 'ref' against the blend of that buffer and 'second_pred' is computed.
+// A nonzero invert_mask swaps the two blend inputs.
+#define MASK_SUBPIX_VAR4XH_SSSE3(H)                                          \
+  unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
+      const uint8_t *msk, int msk_stride, int invert_mask,                   \
+      unsigned int *sse) {                                                   \
+    uint8_t filtered[(H + 1) * 4];                                           \
+    int sum;                                                                 \
+                                                                             \
+    bilinear_filter4xh(src, src_stride, xoffset, yoffset, filtered, H);      \
+                                                                             \
+    const uint8_t *const blend_a = invert_mask ? second_pred : filtered;     \
+    const uint8_t *const blend_b = invert_mask ? filtered : second_pred;     \
+    masked_variance4xh(ref, ref_stride, blend_a, blend_b, msk, msk_stride,   \
+                       H, sse, &sum);                                        \
+                                                                             \
+    const int64_t sum_squared = (int64_t)sum * sum;                          \
+    return *sse - (uint32_t)(sum_squared / (4 * H));                         \
+  }
+
+// Widths that are a multiple of 16: generic filter and variance helpers.
+MASK_SUBPIX_VAR_SSSE3(16, 4)
+MASK_SUBPIX_VAR_SSSE3(16, 8)
+MASK_SUBPIX_VAR_SSSE3(16, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 32)
+MASK_SUBPIX_VAR_SSSE3(16, 64)
+MASK_SUBPIX_VAR_SSSE3(32, 8)
+MASK_SUBPIX_VAR_SSSE3(32, 16)
+MASK_SUBPIX_VAR_SSSE3(32, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 16)
+MASK_SUBPIX_VAR_SSSE3(64, 32)
+MASK_SUBPIX_VAR_SSSE3(64, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 128)
+MASK_SUBPIX_VAR_SSSE3(128, 64)
+MASK_SUBPIX_VAR_SSSE3(128, 128)
+// 8-wide blocks.
+MASK_SUBPIX_VAR8XH_SSSE3(4)
+MASK_SUBPIX_VAR8XH_SSSE3(8)
+MASK_SUBPIX_VAR8XH_SSSE3(16)
+MASK_SUBPIX_VAR8XH_SSSE3(32)
+// 4-wide blocks.
+MASK_SUBPIX_VAR4XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(8)
+MASK_SUBPIX_VAR4XH_SSSE3(16)
+
+// Applies a 2-tap filter to 16 byte pairs (a[i], b[i]). 'filter' holds the two
+// taps repeated in every 16-bit lane. The products are summed per pair,
+// rounded by FILTER_BITS and packed back to 16 unsigned bytes.
+static INLINE __m128i filter_block(const __m128i a, const __m128i b,
+                                   const __m128i filter) {
+  const __m128i pairs_lo = _mm_unpacklo_epi8(a, b);
+  const __m128i pairs_hi = _mm_unpackhi_epi8(a, b);
+
+  const __m128i out_lo =
+      xx_roundn_epu16(_mm_maddubs_epi16(pairs_lo, filter), FILTER_BITS);
+  const __m128i out_hi =
+      xx_roundn_epu16(_mm_maddubs_epi16(pairs_hi, filter), FILTER_BITS);
+
+  return _mm_packus_epi16(out_lo, out_hi);
+}
+
+// Two-pass bilinear filter for blocks whose width is a multiple of 16.
+// The horizontal pass writes h + 1 rows of width 'w' into 'dst'; the vertical
+// pass then combines each row with the one below it, in place, leaving the
+// first h rows as the result.
+// Offsets 0 (copy) and 4 (plain average) have dedicated fast paths; any other
+// offset indexes bilinear_filters_2t.
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+                            int yoffset, uint8_t *dst, int w, int h) {
+  int row, col;
+
+  // ---- Horizontal pass ----
+  uint8_t *out_row = dst;
+  if (xoffset == 0) {
+    // No horizontal shift: straight copy.
+    for (row = 0; row < h + 1; ++row, src += src_stride, out_row += w) {
+      for (col = 0; col < w; col += 16) {
+        const __m128i px = _mm_loadu_si128((__m128i *)(src + col));
+        _mm_storeu_si128((__m128i *)(out_row + col), px);
+      }
+    }
+  } else if (xoffset == 4) {
+    // Half-pixel shift: average each pixel with its right-hand neighbour.
+    for (row = 0; row < h + 1; ++row, src += src_stride, out_row += w) {
+      for (col = 0; col < w; col += 16) {
+        const __m128i cur = _mm_loadu_si128((__m128i *)(src + col));
+        const __m128i next = _mm_loadu_si128((__m128i *)(src + col + 16));
+        const __m128i right = _mm_alignr_epi8(next, cur, 1);
+        _mm_storeu_si128((__m128i *)(out_row + col), _mm_avg_epu8(cur, right));
+      }
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[xoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+    for (row = 0; row < h + 1; ++row, src += src_stride, out_row += w) {
+      for (col = 0; col < w; col += 16) {
+        const __m128i cur = _mm_loadu_si128((__m128i *)(src + col));
+        const __m128i next = _mm_loadu_si128((__m128i *)(src + col + 16));
+        const __m128i right = _mm_alignr_epi8(next, cur, 1);
+        _mm_storeu_si128((__m128i *)(out_row + col),
+                         filter_block(cur, right, taps_vec));
+      }
+    }
+  }
+
+  // ---- Vertical pass ----
+  // With no vertical shift the horizontal output is already the result.
+  if (yoffset == 0) return;
+
+  uint8_t *line = dst;
+  if (yoffset == 4) {
+    // Half-pixel shift: average each row with the row below it.
+    for (row = 0; row < h; ++row, line += w) {
+      for (col = 0; col < w; col += 16) {
+        const __m128i upper = _mm_loadu_si128((__m128i *)(line + col));
+        const __m128i lower = _mm_loadu_si128((__m128i *)(line + col + w));
+        _mm_storeu_si128((__m128i *)(line + col), _mm_avg_epu8(upper, lower));
+      }
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[yoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+    for (row = 0; row < h; ++row, line += w) {
+      for (col = 0; col < w; col += 16) {
+        const __m128i upper = _mm_loadu_si128((__m128i *)(line + col));
+        const __m128i lower = _mm_loadu_si128((__m128i *)(line + col + w));
+        _mm_storeu_si128((__m128i *)(line + col),
+                         filter_block(upper, lower, taps_vec));
+      }
+    }
+  }
+}
+
+// Applies a 2-tap filter to two independent groups of 8 byte pairs taken from
+// the low halves of (a0, b0) and (a1, b1). The first group fills the low
+// 8 bytes of the result and the second the high 8 bytes.
+static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
+                                         const __m128i *a1, const __m128i *b1,
+                                         const __m128i *filter) {
+  const __m128i pairs0 = _mm_unpacklo_epi8(*a0, *b0);
+  const __m128i pairs1 = _mm_unpacklo_epi8(*a1, *b1);
+
+  const __m128i out0 =
+      xx_roundn_epu16(_mm_maddubs_epi16(pairs0, *filter), FILTER_BITS);
+  const __m128i out1 =
+      xx_roundn_epu16(_mm_maddubs_epi16(pairs1, *filter), FILTER_BITS);
+
+  return _mm_packus_epi16(out0, out1);
+}
+
+// Two-pass bilinear filter for 8-pixel-wide blocks. The horizontal pass
+// writes h + 1 rows of 8 bytes, packed back to back, into 'dst'; the vertical
+// pass then combines neighbouring rows in place.
+// Offsets 0 (copy) and 4 (plain average) have dedicated fast paths; any other
+// offset indexes bilinear_filters_2t and works on two rows per iteration.
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h) {
+  int row;
+
+  // ---- Horizontal pass ----
+  uint8_t *out = dst;
+  if (xoffset == 0) {
+    for (row = 0; row < h + 1; ++row, src += src_stride, out += 8) {
+      _mm_storel_epi64((__m128i *)out, _mm_loadl_epi64((__m128i *)src));
+    }
+  } else if (xoffset == 4) {
+    for (row = 0; row < h + 1; ++row, src += src_stride, out += 8) {
+      const __m128i cur = _mm_loadu_si128((__m128i *)src);
+      const __m128i right = _mm_srli_si128(cur, 1);
+      _mm_storel_epi64((__m128i *)out, _mm_avg_epu8(cur, right));
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[xoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+
+    // Rows 0 .. h-1, two at a time; one 16-byte store covers both rows.
+    for (row = 0; row < h; row += 2) {
+      const __m128i cur0 = _mm_loadu_si128((__m128i *)src);
+      const __m128i cur1 = _mm_loadu_si128((__m128i *)(src + src_stride));
+      const __m128i right0 = _mm_srli_si128(cur0, 1);
+      const __m128i right1 = _mm_srli_si128(cur1, 1);
+      const __m128i two_rows =
+          filter_block_2rows(&cur0, &right0, &cur1, &right1, &taps_vec);
+      _mm_storeu_si128((__m128i *)out, two_rows);
+
+      src += 2 * src_stride;
+      out += 16;
+    }
+
+    // The extra row (index h) is filtered on its own.
+    const __m128i last = _mm_loadu_si128((__m128i *)src);
+    const __m128i last_right = _mm_srli_si128(last, 1);
+    __m128i last_out = _mm_unpacklo_epi8(last, last_right);
+    last_out = _mm_maddubs_epi16(last_out, taps_vec);
+    last_out = xx_roundn_epu16(last_out, FILTER_BITS);
+    _mm_storel_epi64((__m128i *)out, _mm_packus_epi16(last_out, last_out));
+  }
+
+  // ---- Vertical pass ----
+  // With no vertical shift the horizontal output is already the result.
+  if (yoffset == 0) return;
+
+  uint8_t *line = dst;
+  if (yoffset == 4) {
+    for (row = 0; row < h; ++row, line += 8) {
+      const __m128i upper = _mm_loadl_epi64((__m128i *)line);
+      const __m128i lower = _mm_loadl_epi64((__m128i *)(line + 8));
+      _mm_storel_epi64((__m128i *)line, _mm_avg_epu8(upper, lower));
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[yoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+    for (row = 0; row < h; row += 2, line += 16) {
+      // Three consecutive rows yield two outputs: (r0, r1) and (r1, r2).
+      const __m128i r0 = _mm_loadl_epi64((__m128i *)line);
+      const __m128i r1 = _mm_loadl_epi64((__m128i *)(line + 8));
+      const __m128i r2 = _mm_loadl_epi64((__m128i *)(line + 16));
+      const __m128i two_rows =
+          filter_block_2rows(&r0, &r1, &r1, &r2, &taps_vec);
+      _mm_storeu_si128((__m128i *)line, two_rows);
+    }
+  }
+}
+
+// Two-pass bilinear filter for 4-pixel-wide blocks. The horizontal pass
+// writes h + 1 rows of 4 bytes, packed back to back, into 'dst'; the vertical
+// pass then combines neighbouring rows in place.
+// Offsets 0 (copy) and 4 (plain average) have dedicated fast paths; any other
+// offset indexes bilinear_filters_2t and works on four rows per iteration.
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h) {
+  int row;
+
+  // ---- Horizontal pass ----
+  uint8_t *out = dst;
+  if (xoffset == 0) {
+    for (row = 0; row < h + 1; ++row, src += src_stride, out += 4) {
+      xx_storel_32(out, xx_loadl_32((__m128i *)src));
+    }
+  } else if (xoffset == 4) {
+    for (row = 0; row < h + 1; ++row, src += src_stride, out += 4) {
+      const __m128i cur = _mm_loadl_epi64((__m128i *)src);
+      const __m128i right = _mm_srli_si128(cur, 1);
+      xx_storel_32(out, _mm_avg_epu8(cur, right));
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[xoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+
+    // Rows 0 .. h-1, four at a time; one 16-byte store covers all four.
+    for (row = 0; row < h; row += 4) {
+      const __m128i cur0 = _mm_loadl_epi64((__m128i *)src);
+      const __m128i cur1 = _mm_loadl_epi64((__m128i *)(src + src_stride));
+      const __m128i cur2 = _mm_loadl_epi64((__m128i *)(src + src_stride * 2));
+      const __m128i cur3 = _mm_loadl_epi64((__m128i *)(src + src_stride * 3));
+      const __m128i right0 = _mm_srli_si128(cur0, 1);
+      const __m128i right1 = _mm_srli_si128(cur1, 1);
+      const __m128i right2 = _mm_srli_si128(cur2, 1);
+      const __m128i right3 = _mm_srli_si128(cur3, 1);
+
+      // Pack rows pairwise: (row0 | row1) and (row2 | row3).
+      const __m128i cur01 = _mm_unpacklo_epi32(cur0, cur1);
+      const __m128i right01 = _mm_unpacklo_epi32(right0, right1);
+      const __m128i cur23 = _mm_unpacklo_epi32(cur2, cur3);
+      const __m128i right23 = _mm_unpacklo_epi32(right2, right3);
+      const __m128i four_rows =
+          filter_block_2rows(&cur01, &right01, &cur23, &right23, &taps_vec);
+      _mm_storeu_si128((__m128i *)out, four_rows);
+
+      src += 4 * src_stride;
+      out += 16;
+    }
+
+    // The extra row (index h) is filtered on its own.
+    const __m128i last = _mm_loadl_epi64((__m128i *)src);
+    const __m128i last_right = _mm_srli_si128(last, 1);
+    __m128i last_out = _mm_unpacklo_epi8(last, last_right);
+    last_out = _mm_maddubs_epi16(last_out, taps_vec);
+    last_out = xx_roundn_epu16(last_out, FILTER_BITS);
+    xx_storel_32(out, _mm_packus_epi16(last_out, last_out));
+  }
+
+  // ---- Vertical pass ----
+  // With no vertical shift the horizontal output is already the result.
+  if (yoffset == 0) return;
+
+  uint8_t *line = dst;
+  if (yoffset == 4) {
+    for (row = 0; row < h; ++row, line += 4) {
+      const __m128i upper = xx_loadl_32((__m128i *)line);
+      const __m128i lower = xx_loadl_32((__m128i *)(line + 4));
+      xx_storel_32(line, _mm_avg_epu8(upper, lower));
+    }
+  } else {
+    const uint8_t *taps = bilinear_filters_2t[yoffset];
+    const __m128i taps_vec = _mm_set1_epi16(taps[0] | (taps[1] << 8));
+    for (row = 0; row < h; row += 4, line += 16) {
+      // Five consecutive rows yield four outputs: each row paired with the
+      // one below it.
+      const __m128i r0 = xx_loadl_32((__m128i *)line);
+      const __m128i r1 = xx_loadl_32((__m128i *)(line + 4));
+      const __m128i r2 = xx_loadl_32((__m128i *)(line + 8));
+      const __m128i r3 = xx_loadl_32((__m128i *)(line + 12));
+      const __m128i r4 = xx_loadl_32((__m128i *)(line + 16));
+
+      const __m128i upper01 = _mm_unpacklo_epi32(r0, r1);
+      const __m128i lower01 = _mm_unpacklo_epi32(r1, r2);
+      const __m128i upper23 = _mm_unpacklo_epi32(r2, r3);
+      const __m128i lower23 = _mm_unpacklo_epi32(r3, r4);
+      const __m128i four_rows =
+          filter_block_2rows(&upper01, &lower01, &upper23, &lower23, &taps_vec);
+      _mm_storeu_si128((__m128i *)line, four_rows);
+    }
+  }
+}
+
+// Blends 16 pixels of 'a' and 'b' using mask 'm' ('m' weights 'a', its
+// complement with respect to (1 << AOM_BLEND_A64_ROUND_BITS) weights 'b'),
+// subtracts 'src', and adds the differences to '*sum' and their squares to
+// '*sum_sq'. Both accumulators hold four 32-bit partial totals.
+static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
+                                    const __m128i *b, const __m128i *m,
+                                    __m128i *sum, __m128i *sum_sq) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i max_weight = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i w_b = _mm_sub_epi8(max_weight, *m);
+
+  // Each 16-bit lane of the multiply-add holds a * m + b * w_b, which is at
+  // most 64 * 255 and so leaves room for the rounding step.
+  __m128i blend_lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(*a, *b),
+                                       _mm_unpacklo_epi8(*m, w_b));
+  blend_lo = xx_roundn_epu16(blend_lo, AOM_BLEND_A64_ROUND_BITS);
+
+  __m128i blend_hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(*a, *b),
+                                       _mm_unpackhi_epi8(*m, w_b));
+  blend_hi = xx_roundn_epu16(blend_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  // Widen the source to 16 bits and form signed differences.
+  const __m128i err_lo =
+      _mm_sub_epi16(blend_lo, _mm_unpacklo_epi8(*src, zero));
+  const __m128i err_hi =
+      _mm_sub_epi16(blend_hi, _mm_unpackhi_epi8(*src, zero));
+
+  // Sum of differences: add the halves, then fold lane pairs to 32 bits.
+  const __m128i err_pairs = _mm_madd_epi16(_mm_add_epi16(err_lo, err_hi), ones);
+  *sum = _mm_add_epi32(*sum, err_pairs);
+
+  // Sum of squared differences.
+  const __m128i sq_pairs = _mm_add_epi32(_mm_madd_epi16(err_lo, err_lo),
+                                         _mm_madd_epi16(err_hi, err_hi));
+  *sum_sq = _mm_add_epi32(*sum_sq, sq_pairs);
+}
+
+// Accumulates, over a width x height block (width a multiple of 16), the
+// differences between the masked blend of 'a'/'b' and 'src'. Writes the sum
+// of differences to '*sum_' and the sum of squared differences to '*sse'.
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *a_ptr, int a_stride,
+                            const uint8_t *b_ptr, int b_stride,
+                            const uint8_t *m_ptr, int m_stride, int width,
+                            int height, unsigned int *sse, int *sum_) {
+  __m128i diff_acc = _mm_setzero_si128();
+  __m128i sq_acc = _mm_setzero_si128();
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 16) {
+      const __m128i src_px = _mm_loadu_si128((const __m128i *)(src_ptr + col));
+      const __m128i a_px = _mm_loadu_si128((const __m128i *)(a_ptr + col));
+      const __m128i b_px = _mm_loadu_si128((const __m128i *)(b_ptr + col));
+      const __m128i weights = _mm_loadu_si128((const __m128i *)(m_ptr + col));
+      accumulate_block(&src_px, &a_px, &b_px, &weights, &diff_acc, &sq_acc);
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+
+  // Two horizontal adds leave the total difference in lane 0 and the total
+  // squared difference in lane 1.
+  const __m128i pairs = _mm_hadd_epi32(diff_acc, sq_acc);
+  const __m128i totals = _mm_hadd_epi32(pairs, pairs);
+  *sum_ = _mm_cvtsi128_si32(totals);
+  *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(totals, 4));
+}
+
+// 8-wide variant: each iteration handles two rows. 'src' and the mask are
+// read as two 8-byte rows (using their strides) and joined into one register;
+// 'a' and 'b' are read as 16 contiguous bytes, i.e. they are walked with an
+// implicit stride of 8. The loop advances by two rows per step.
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_) {
+  __m128i acc_sum = _mm_setzero_si128();
+  __m128i acc_sq = _mm_setzero_si128();
+
+  for (int row = 0; row < height; row += 2) {
+    const __m128i src_row0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+    const __m128i src_row1 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+    const __m128i msk_row0 = _mm_loadl_epi64((const __m128i *)m_ptr);
+    const __m128i msk_row1 =
+        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
+
+    __m128i src_px = _mm_unpacklo_epi64(src_row0, src_row1);
+    const __m128i a_px = _mm_loadu_si128((const __m128i *)a_ptr);
+    const __m128i b_px = _mm_loadu_si128((const __m128i *)b_ptr);
+    const __m128i m_px = _mm_unpacklo_epi64(msk_row0, msk_row1);
+    accumulate_block(&src_px, &a_px, &b_px, &m_px, &acc_sum, &acc_sq);
+
+    src_ptr += 2 * src_stride;
+    a_ptr += 16;
+    b_ptr += 16;
+    m_ptr += 2 * m_stride;
+  }
+
+  // Fold the partial sums into lane 0 and the partial squares into lane 1.
+  const __m128i pairs = _mm_hadd_epi32(acc_sum, acc_sq);
+  const __m128i totals = _mm_hadd_epi32(pairs, pairs);
+  *sum_ = _mm_cvtsi128_si32(totals);
+  *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(totals, 4));
+}
+
+// 4-wide variant: each iteration handles four rows. 'src' and the mask are
+// gathered as four 4-byte rows (using their strides); 'a' and 'b' are read as
+// 16 contiguous bytes, i.e. with an implicit stride of 4. The loop advances
+// by four rows per step.
+// NOTE(review): the row gathers read the byte buffers through an 'int'
+// lvalue (and cast away const); this relies on the compiler tolerating
+// unaligned, type-punned 32-bit reads.
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_) {
+  __m128i acc_sum = _mm_setzero_si128();
+  __m128i acc_sq = _mm_setzero_si128();
+
+  for (int row = 0; row < height; row += 4) {
+    __m128i src_px = _mm_setr_epi32(
+        *(int *)src_ptr, *(int *)&src_ptr[src_stride],
+        *(int *)&src_ptr[src_stride * 2], *(int *)&src_ptr[src_stride * 3]);
+    const __m128i a_px = _mm_loadu_si128((const __m128i *)a_ptr);
+    const __m128i b_px = _mm_loadu_si128((const __m128i *)b_ptr);
+    const __m128i m_px = _mm_setr_epi32(
+        *(int *)m_ptr, *(int *)&m_ptr[m_stride], *(int *)&m_ptr[m_stride * 2],
+        *(int *)&m_ptr[m_stride * 3]);
+    accumulate_block(&src_px, &a_px, &b_px, &m_px, &acc_sum, &acc_sq);
+
+    src_ptr += 4 * src_stride;
+    a_ptr += 16;
+    b_ptr += 16;
+    m_ptr += 4 * m_stride;
+  }
+
+  // Fold the partial sums into lane 0 and the partial squares into lane 1.
+  const __m128i pairs = _mm_hadd_epi32(acc_sum, acc_sq);
+  const __m128i totals = _mm_hadd_epi32(pairs, pairs);
+  *sum_ = _mm_cvtsi128_si32(totals);
+  *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(totals, 4));
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Forward declarations of the high-bitdepth helpers. They are defined after
+// the HIGHBD_MASK_SUBPIX_VAR*_SSSE3 instantiations that call them.
+//
+// Bilinear filter for widths that are a multiple of 8. Writes rows of 'w'
+// samples to 'dst'.
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h);
+
+// Bilinear filter for 4-wide blocks. Writes rows of 4 samples to 'dst'.
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h);
+
+// Masked variance accumulation for widths that are a multiple of 8.
+// The sum of squares is returned as a 64-bit value.
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_);
+
+// Masked variance accumulation for 4-wide blocks. 'a_ptr' and 'b_ptr' take no
+// stride, and the sum of squares is returned as a 32-bit int.
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_);
+
+// Defines aom_highbd_{8,10,12}_masked_sub_pixel_variance<W>x<H>_ssse3().
+// Each generated function:
+//  1. bilinearly filters 'src' at (xoffset, yoffset) into the local buffer
+//     'temp' with highbd_bilinear_filter(); 'temp' holds (H + 1) * W samples;
+//  2. calls highbd_masked_variance() with 'ref' as the source and with 'temp'
+//     and 'second_pred' (both with stride W) as the two blend inputs; their
+//     order is swapped when invert_mask is non-zero;
+//  3. writes the sum of squared differences to *sse and returns
+//     *sse - sum * sum / (W * H).
+// The 8-bit variant truncates the 64-bit sum of squares to 32 bits. The 10-
+// and 12-bit variants first scale the sum of squares and the sum with
+// ROUND_POWER_OF_TWO (by 4 and 2 bits, resp. 8 and 4 bits) and clamp a
+// negative result to 0.
+#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)sse64; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// Defines aom_highbd_{8,10,12}_masked_sub_pixel_variance4x<H>_ssse3(), the
+// 4-wide counterpart of HIGHBD_MASK_SUBPIX_VAR_SSSE3. Each generated function
+// filters 'src' into 'temp' ((H + 1) * 4 samples) with
+// highbd_bilinear_filter4xh(), then calls highbd_masked_variance4xh() with
+// 'ref' as the source and 'temp' / 'second_pred' as the blend inputs (swapped
+// when invert_mask is non-zero). The sum of squares is collected in a 32-bit
+// int here. The 10- and 12-bit variants scale it and the sum with
+// ROUND_POWER_OF_TWO (by 4 and 2 bits, resp. 8 and 4 bits) and clamp a
+// negative variance to 0; the 8-bit variant does neither.
+#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)sse_; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// Instantiate the 8/10/12-bit masked sub-pixel variance functions for each
+// block size. Widths of 8 and above use the generic macro; 4-wide blocks use
+// the dedicated 4xH macro (heights 4, 8 and 16).
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
+
+// Applies a 2-tap filter to eight 16-bit samples. 'a' and 'b' are interleaved
+// so that each 32-bit lane holds one (a, b) pair; _mm_madd_epi16 with 'filter'
+// (tap0 in the low half, tap1 in the high half of every lane) then yields
+// a * tap0 + b * tap1, which is rounded by FILTER_BITS and packed back to
+// 16 bits with signed saturation.
+static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
+                                          const __m128i filter) {
+  const __m128i pairs_lo = _mm_unpacklo_epi16(a, b);
+  const __m128i pairs_hi = _mm_unpackhi_epi16(a, b);
+
+  const __m128i out_lo =
+      xx_roundn_epu32(_mm_madd_epi16(pairs_lo, filter), FILTER_BITS);
+  const __m128i out_hi =
+      xx_roundn_epu32(_mm_madd_epi16(pairs_hi, filter), FILTER_BITS);
+
+  return _mm_packs_epi32(out_lo, out_hi);
+}
+
+// Two-pass bilinear filter for 16-bit samples, 8 samples per step ('w' is
+// stepped in units of 8).
+//   src, src_stride : input samples and their row stride
+//   xoffset, yoffset: sub-pixel offsets; 0 means "no filtering" and 4 is
+//                     handled as a plain average; other values index
+//                     bilinear_filters_2t
+//   dst             : output buffer with row stride 'w'; the horizontal pass
+//                     writes h + 1 rows into it and the vertical pass then
+//                     rewrites the first h rows in place
+// Note: for xoffset != 0 each step also loads the 8 samples at src[j + 8], so
+// the input is read up to 8 samples past column w - 1.
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ // No horizontal filtering: copy h + 1 rows into dst.
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ // Half-sample offset: average each sample with its right-hand neighbour.
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ // z = samples j+1 .. j+8 (x shifted by one 16-bit sample, refilled from y)
+ __m128i z = _mm_alignr_epi8(y, x, 2);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ // Both taps packed into every 32-bit lane: tap 0 low, tap 1 high.
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ // z = samples j+1 .. j+8, i.e. the right-hand neighbours of x
+ const __m128i z = _mm_alignr_epi8(y, x, 2);
+ const __m128i res = highbd_filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ // Half-sample offset: average each row with the row below it, in place.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ // Row i is overwritten using rows i and i + 1; row i + 1 is still
+ // unmodified at that point because rows are processed top to bottom.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = highbd_filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
+// 2-tap filter for two 4-sample rows at once. Only the low four 16-bit
+// samples of each input are used: (*a0, *b0) produce the first four outputs
+// and (*a1, *b1) the last four. Each pair is interleaved, multiplied and
+// summed against '*filter' with _mm_madd_epi16, rounded by FILTER_BITS and
+// packed to 16 bits with signed saturation.
+static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
+                                                const __m128i *b0,
+                                                const __m128i *a1,
+                                                const __m128i *b1,
+                                                const __m128i *filter) {
+  const __m128i row0_pairs = _mm_unpacklo_epi16(*a0, *b0);
+  const __m128i row1_pairs = _mm_unpacklo_epi16(*a1, *b1);
+
+  const __m128i row0_out =
+      xx_roundn_epu32(_mm_madd_epi16(row0_pairs, *filter), FILTER_BITS);
+  const __m128i row1_out =
+      xx_roundn_epu32(_mm_madd_epi16(row1_pairs, *filter), FILTER_BITS);
+
+  return _mm_packs_epi32(row0_out, row1_out);
+}
+
+// Two-pass bilinear filter for 4-wide blocks of 16-bit samples.
+//   src, src_stride : input samples and their row stride
+//   xoffset, yoffset: sub-pixel offsets; 0 means "no filtering" and 4 is
+//                     handled as a plain average; other values index
+//                     bilinear_filters_2t
+//   dst             : output buffer with a row stride of 4; the horizontal
+//                     pass writes h + 1 rows and the vertical pass rewrites
+//                     the first h rows in place
+// The general-case loops step two rows at a time, so they assume 'h' is even.
+// Note: for xoffset != 0 every row is read with a full 16-byte load (8
+// samples) although only the first 5 samples contribute to the result.
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ // No horizontal filtering: copy h + 1 rows of 4 samples.
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ // Half-sample offset: average each sample with its right-hand neighbour.
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ // z = x shifted down by one 16-bit sample
+ __m128i z = _mm_srli_si128(x, 2);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ // Both taps packed into every 32-bit lane: tap 0 low, tap 1 high.
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ // Rows 0 .. h-1, two rows per iteration, stored as 8 contiguous samples.
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 2);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 2);
+ const __m128i res =
+ highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 8;
+ }
+ // Process i = h separately
+ // (the extra row needed by the vertical pass; only 4 samples are stored)
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+
+ __m128i v0 = _mm_unpacklo_epi16(x, z);
+ v0 = _mm_madd_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ // Half-sample offset: average each row with the row below it, in place.
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ // Two output rows per iteration: rows i, i + 1 and i + 2 are loaded first,
+ // then rows i and i + 1 are overwritten with a single 16-byte store.
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i res =
+ highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 8;
+ }
+ }
+}
+
+// Accumulates, over a width x height region of 16-bit samples, the sum and
+// the sum of squares of (pred - src), where
+//   pred = (a * m + b * ((1 << AOM_BLEND_A64_ROUND_BITS) - m) + round)
+//          >> AOM_BLEND_A64_ROUND_BITS
+// and m is the 8-bit mask value for that pixel. Eight pixels are processed
+// per step ('width' is stepped in units of 8).
+// Outputs: *sum_ receives the 32-bit sum, *sse the 64-bit sum of squares.
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_) {
+ int x, y;
+ // Note on bit widths:
+ // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
+ // so this can be kept as four 32-bit values.
+ // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
+ // so this must be stored as two 64-bit values.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ // Widen the 8 mask bytes to 16 bits so they can be paired with a and b.
+ const __m128i m =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ // Calculate 8 predicted pixels.
+ // Interleaving (a, b) with (m, m_inv) lets _mm_madd_epi16 compute
+ // a * m + b * m_inv in each 32-bit lane.
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Zero-extend the source samples to 32 bits and form the differences.
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
+ // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
+ // So we can re-pack into 16-bit fields and use _mm_madd_epi16
+ // to calculate the squares and partially sum them.
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ // Then we want to sign-extend to 64 bits and accumulate
+ const __m128i sign = _mm_srai_epi32(prod, 31);
+ const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
+ const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ // Two horizontal adds against zero leave the total sum in lane 0.
+ sum = _mm_hadd_epi32(sum, zero);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ // Add the two 64-bit halves of sum_sq and store the low 64 bits.
+ sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
+ _mm_storel_epi64((__m128i *)sse, sum_sq);
+}
+
+// 4-wide counterpart of highbd_masked_variance(). Two rows are processed per
+// iteration: 'src' and the mask are gathered from two rows using their
+// strides, while 'a' and 'b' are read as 8 contiguous samples (an implicit
+// stride of 4). The loop advances by two rows per step.
+// Outputs: *sum_ receives the sum of (pred - src), *sse the sum of squares.
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_) {
+ int y;
+ // Note: For this function, h <= 16 (this file instantiates 4x4, 4x8 and
+ // 4x16 through HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3).
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
+ // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
+ // So we can safely pack sum_sq into 32-bit fields, which is slightly more
+ // convenient.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ // Two 4-sample source rows joined into one register.
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ // Four mask bytes from each of the two rows, widened to 16 bits.
+ // NOTE(review): the mask bytes are read through a 'const int' lvalue.
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
+ zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ // pred = (a * m + b * m_inv + round) >> AOM_BLEND_A64_ROUND_BITS
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ // Re-pack the differences to 16 bits so _mm_madd_epi16 can square and
+ // pairwise-add them.
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ sum_sq = _mm_add_epi32(sum_sq, prod);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 8;
+ b_ptr += 8;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ // After the two horizontal adds lane 0 holds the sum and lane 1 the sum of
+ // squares.
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Writes the masked blend of 'ref' and 'pred' to 'comp_pred'. 'pred' and
+// 'comp_pred' are walked with a stride equal to 'width'. The mask weight is
+// applied to 'ref' by default and to 'pred' when invert_mask is non-zero
+// (the first source passed to the helpers is the one multiplied by the mask).
+// Width 8 and width 16 have dedicated paths that process two rows per step;
+// every other width is processed one row at a time in 32-pixel chunks.
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                              int width, int height, const uint8_t *ref,
+                              int ref_stride, const uint8_t *mask,
+                              int mask_stride, int invert_mask) {
+  const uint8_t *src0 = ref;
+  const uint8_t *src1 = pred;
+  int stride0 = ref_stride;
+  int stride1 = width;
+  if (invert_mask) {
+    src0 = pred;
+    src1 = ref;
+    stride0 = width;
+    stride1 = ref_stride;
+  }
+  assert(height % 2 == 0);
+
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+    return;
+  }
+
+  int row = 0;
+  if (width == 16) {
+    // Two rows per iteration.
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += 2 * width;
+      src0 += 2 * stride0;
+      src1 += 2 * stride1;
+      mask += 2 * mask_stride;
+      row += 2;
+    } while (row < height);
+    return;
+  }
+
+  // Remaining widths: one row per iteration, 32 pixels per inner step.
+  do {
+    for (int col = 0; col < width; col += 32) {
+      comp_mask_pred_16_ssse3(src0 + col, src1 + col, mask + col, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + col + 16, src1 + col + 16,
+                              mask + col + 16, comp_pred + 16);
+      comp_pred += 32;
+    }
+    src0 += stride0;
+    src1 += stride1;
+    mask += mask_stride;
+    ++row;
+  } while (row < height);
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 0000000000..4faa098ace
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+// Blends 16 pixels: dst = round((src0 * a + src1 * (MAX_ALPHA - a)) >>
+// AOM_BLEND_A64_ROUND_BITS), where 'a' is the per-pixel mask byte.
+// 'mask' is read with an aligned load and 'dst' is written with an aligned
+// store, so both must be 16-byte aligned; src0 and src1 may be unaligned.
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i max_alpha = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  // Multiplying by 2^(15 - bits) with _mm_mulhrs_epi16 is a rounded right
+  // shift by 'bits'.
+  const __m128i round_mul =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  const __m128i px0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i px1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i alpha = _mm_load_si128((const __m128i *)(mask));
+  const __m128i inv_alpha = _mm_sub_epi8(max_alpha, alpha);
+
+  // Interleave pixels and weights so _mm_maddubs_epi16 produces
+  // px0 * alpha + px1 * inv_alpha per 16-bit lane.
+  const __m128i blend_lo = _mm_maddubs_epi16(
+      _mm_unpacklo_epi8(px0, px1), _mm_unpacklo_epi8(alpha, inv_alpha));
+  const __m128i blend_hi = _mm_maddubs_epi16(
+      _mm_unpackhi_epi8(px0, px1), _mm_unpackhi_epi8(alpha, inv_alpha));
+
+  const __m128i out_lo = _mm_mulhrs_epi16(blend_lo, round_mul);
+  const __m128i out_hi = _mm_mulhrs_epi16(blend_hi, round_mul);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(out_lo, out_hi));
+}
+
+// Masked blend for 8-wide blocks, two rows (A and B) per iteration.
+// For every pixel: out = round((src0 * a + src1 * (MAX_ALPHA - a)) >>
+// AOM_BLEND_A64_ROUND_BITS), with 'a' taken from 'mask'.
+// The two 8-pixel output rows are written with one aligned 16-byte store, so
+// 'comp_pred' is treated as having a stride of 8 and must be 16-byte aligned.
+// The loop is a do/while stepping by 2, so at least two rows are always
+// processed.
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ const uint8_t *mask,
+ int mask_stride) {
+ int i = 0;
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ // Multiplying by 2^(15 - bits) with _mm_mulhrs_epi16 is a rounded right
+ // shift by 'bits'.
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ // first row of the pair (A)
+ const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+ const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+ const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+ // second row of the pair (B)
+ const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+ const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+ // a = mask row A in the low 8 bytes, mask row B in the high 8 bytes
+ const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+ const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+ const __m128i ma = _mm_sub_epi8(alpha_max, a);
+ // Low halves of (a, ma) pair up with row A, high halves with row B.
+ const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+ const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+ const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+ const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+ const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+ const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+ const __m128i round = _mm_packus_epi16(roundA, roundB);
+ // comp_pred's stride == width == 8
+ _mm_store_si128((__m128i *)(comp_pred), round);
+ comp_pred += (8 << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+}
+
+#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..085a572cb1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Reads an int16_t from 'src' via memcpy, so 'src' needs no particular
+// alignment or effective type.
+static INLINE int16_t loadu_int16(const void *src) {
+  int16_t value;
+  memcpy(&value, src, sizeof(value));
+  return value;
+}
+
+// Reads an int32_t from 'src' via memcpy, so 'src' needs no particular
+// alignment or effective type.
+static INLINE int32_t loadu_int32(const void *src) {
+  int32_t value;
+  memcpy(&value, src, sizeof(value));
+  return value;
+}
+
+// Reads an int64_t from 'src' via memcpy, so 'src' needs no particular
+// alignment or effective type.
+static INLINE int64_t loadu_int64(const void *src) {
+  int64_t value;
+  memcpy(&value, src, sizeof(value));
+  return value;
+}
+
+// Stores the upper 64 bits of 's' to 'd' (counterpart of _mm_storel_epi64).
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+  const __m128 as_ps = _mm_castsi128_ps(s);
+  _mm_storeh_pi((__m64 *)d, as_ps);
+}
+
+// Returns 's' with its upper 64 bits replaced by the 8 bytes at 'src'; the
+// lower 64 bits of 's' are kept.
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  const __m128 low_kept = _mm_castsi128_ps(s);
+  const __m128 merged = _mm_loadh_pi(low_kept, (const __m64 *)src);
+  return _mm_castps_si128(merged);
+}
+
+// Gathers four 4-byte rows, 'byte_stride' bytes apart, into one register:
+// row 0 lands in the lowest 32 bits and row 3 in the highest.
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  const int8_t *const base = (const int8_t *)src;
+  const int32_t row0 = loadu_int32(base + 0 * byte_stride);
+  const int32_t row1 = loadu_int32(base + 1 * byte_stride);
+  const int32_t row2 = loadu_int32(base + 2 * byte_stride);
+  const int32_t row3 = loadu_int32(base + 3 * byte_stride);
+  return _mm_setr_epi32(row0, row1, row2, row3);
+}
+
+// Gathers two 8-byte rows, 'byte_stride' bytes apart, into one register:
+// row 0 in the low 64 bits and row 1 in the high 64 bits.
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  const __m128i row0 =
+      _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+  return loadh_epi64((int8_t *)src + 1 * byte_stride, row0);
+}
+
+// Writes four 8-byte rows from two registers: the low and high halves of
+// s[0] go to rows 0 and 1, those of s[1] to rows 2 and 3.
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+                                            uint8_t *const d,
+                                            const ptrdiff_t stride) {
+  for (int reg = 0; reg < 2; ++reg) {
+    uint8_t *const row = d + (2 * reg) * stride;
+    _mm_storel_epi64((__m128i *)row, s[reg]);
+    _mm_storeh_epi64((__m128i *)(row + stride), s[reg]);
+  }
+}
+
+// Writes the low 32 bits of s[0..3] to four rows of 'd', 'stride' bytes
+// apart.
+// The bytes are copied with memcpy instead of being stored through a casted
+// 'int *': 'd' is a byte buffer with no alignment guarantee, and writing it
+// through an int lvalue is both a potentially misaligned access and a
+// violation of C's effective-type (strict aliasing) rules. Compilers lower
+// the 4-byte memcpy to a single 32-bit store.
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  for (int i = 0; i < 4; ++i) {
+    const int row = _mm_cvtsi128_si32(s[i]);
+    memcpy(d + i * stride, &row, sizeof(row));
+  }
+}
+
+// Writes the four 32-bit lanes of 's' as four 4-byte rows of 'd', lane 0
+// first. Each lane is shifted down to the low 32 bits and handed to
+// store_8bit_4x4().
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+                                       const ptrdiff_t stride) {
+  __m128i rows[4];
+
+  rows[0] = s;
+  rows[1] = _mm_srli_si128(s, 4);
+  rows[2] = _mm_srli_si128(s, 8);
+  rows[3] = _mm_srli_si128(s, 12);
+  store_8bit_4x4(rows, d, stride);
+}
+
+// Loads four 4-byte rows, 'stride' bytes apart, into the low 32 bits of
+// d[0..3]; _mm_cvtsi32_si128 zeroes the remaining bits of each register.
+// Rows are read with loadu_int32() (memcpy based) rather than by
+// dereferencing a casted 'const int *': 's' is a byte buffer with no
+// alignment guarantee, and reading it through an int lvalue breaks C's
+// effective-type (strict aliasing) rules. This matches
+// load_8bit_4x4_to_1_reg_sse2() in this file.
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_cvtsi32_si128(loadu_int32(s + 0 * stride));
+  d[1] = _mm_cvtsi32_si128(loadu_int32(s + 1 * stride));
+  d[2] = _mm_cvtsi32_si128(loadu_int32(s + 2 * stride));
+  d[3] = _mm_cvtsi32_si128(loadu_int32(s + 3 * stride));
+}
+
+// Loads eight 4-byte rows into d[0..7] as two consecutive 4x4 loads.
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  const uint8_t *const lower_half = s + 4 * stride;
+  load_8bit_4x4(s, stride, d);
+  load_8bit_4x4(lower_half, stride, d + 4);
+}
+
+// Loads four 8-byte rows, 'stride' bytes apart, into the low 64 bits of
+// d[0..3].
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  for (int row = 0; row < 4; ++row) {
+    d[row] = _mm_loadl_epi64((const __m128i *)(s + row * stride));
+  }
+}
+
+// Loads four 16-byte rows, 'stride' bytes apart, into d[0..3] using
+// unaligned loads.
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  for (int row = 0; row < 4; ++row) {
+    d[row] = _mm_loadu_si128((const __m128i *)(s + row * stride));
+  }
+}
+
+// Loads eight 8-byte rows into d[0..7] as two consecutive 8x4 loads.
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  const uint8_t *const lower_half = s + 4 * stride;
+  load_8bit_8x4(s, stride, d);
+  load_8bit_8x4(lower_half, stride, d + 4);
+}
+
+// Loads eight 16-byte rows, 'stride' bytes apart, into d[0..7]. Uses aligned
+// loads, so every row address must be 16-byte aligned; loadu_8bit_16x8() is
+// the unaligned variant.
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+                                  const ptrdiff_t stride, __m128i *const d) {
+  for (int row = 0; row < 8; ++row) {
+    d[row] = _mm_load_si128((const __m128i *)(s + row * stride));
+  }
+}
+
+// Loads eight 16-byte rows into d[0..7] as two consecutive unaligned 16x4
+// loads.
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  const uint8_t *const lower_half = s + 4 * stride;
+  loadu_8bit_16x4(s, stride, d);
+  loadu_8bit_16x4(lower_half, stride, d + 4);
+}
+
+// Writes the low 64 bits of s[0..7] to eight 8-byte rows of 'd', 'stride'
+// bytes apart.
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  for (int row = 0; row < 8; ++row) {
+    _mm_storel_epi64((__m128i *)(d + row * stride), s[row]);
+  }
+}
+
+// Writes s[0..3] to four 16-byte rows of 'd', 'stride' bytes apart, using
+// unaligned stores.
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  for (int row = 0; row < 4; ++row) {
+    _mm_storeu_si128((__m128i *)(d + row * stride), s[row]);
+  }
+}
+
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 0000000000..210f466b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {  // For a 4-wide, h-row block: *sum = sum of rounded (wsrc - pre * mask), *sse = sum of their squares.
+ const int pre_step = pre_stride - 4;  // Together with n += 4 this moves pre + n down exactly one row per iteration.
+ int n = 0;  // Index into wsrc/mask, which are read contiguously (4 values per row).
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));  // 4 pixels of the current row. NOTE(review): read through an int pointer; alignment of pre + n is not established here - confirm.
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));  // Aligned loads: mask + n and wsrc + n must be 16-byte aligned.
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);  // Zero-extend the 4 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);  // Signed round-shift by 12 bits (see xx_roundn_epi32 in obmc_intrinsic_ssse3.h).
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);  // Low 32 bits of each square.
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;  // n is always a multiple of 4 here, so this runs on every iteration.
+ } while (n < 4 * h);  // One iteration per row.
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);  // Horizontal sums of the four 32-bit accumulator lanes.
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 0000000000..27398ffd62
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {  // Returns the 32-bit sum of the four 32-bit lanes of v_d.
+ v_d = _mm_hadd_epi32(v_d, v_d);  // Lanes become (d0+d1, d2+d3, d0+d1, d2+d3).
+ v_d = _mm_hadd_epi32(v_d, v_d);  // Every lane now holds d0+d1+d2+d3.
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {  // Returns the sum of the two 64-bit lanes of v_q.
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));  // The 8-byte shift moves the high lane down, so the low lane becomes low + high.
+#if AOM_ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);  // When AOM_ARCH_X86_64 is not set, the low 64 bits are extracted through memory instead.
+ return tmp;
+ }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {  // Returns the sum of the four signed 32-bit lanes of v_d, accumulated in 64 bits.
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());  // All-ones in negative lanes, zero elsewhere.
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);  // Interleaving value with its sign mask sign-extends lanes 0 and 1 to 64 bits.
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);  // Same for lanes 2 and 3.
+ return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {  // Per signed 32-bit lane: divide by 2^bits, rounding to nearest.
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);  // Half of 2^bits.
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);  // -1 in negative lanes, 0 elsewhere.
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);  // The extra -1 in negative lanes makes exact halves round away from zero under the flooring shift below.
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 0000000000..9d1b7d4968
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {  // Returns the sum of rounded |wsrc - pre * mask| over a 4-wide block, two rows per iteration.
+ int n = 0;  // Index into wsrc/mask; advances by 8 (two rows of 4) per iteration.
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+
+ do {
+ const __m128i v_p_b_0 = xx_loadl_32(pre);  // xx_loadl_32 comes from synonyms.h (not shown here); presumably a 4-byte load - confirm.
+ const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);  // Same, for the following row.
+ const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);  // Low 8 bytes = first 4 bytes of each of the two loads.
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));  // Unaligned loads of 8 consecutive mask / wsrc values.
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);  // Zero-extend the 8 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);  // Logical shift: the operand is an absolute value plus bias.
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+ pre += pre_stride << 1;  // Skip the two rows just consumed.
+ } while (n < 8 * (height >> 1));  // Runs height / 2 iterations (at least one, since this is a do-while).
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);  // Fold the upper 128 bits onto the lower, then sum the 4 remaining lanes.
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {  // Returns the sum of rounded |wsrc - pre * mask| over a width x height block, 8 pixels per iteration.
+ const int pre_step = pre_stride - width;  // Added to pre at each row end so that pre + n lands on the next row.
+ int n = 0;  // Linear index into wsrc/mask (width values per row, no padding).
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_b = xx_loadl_64(pre + n);  // xx_loadl_64 comes from synonyms.h (not shown here); presumably an 8-byte load - confirm.
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));  // Unaligned loads of 8 consecutive mask / wsrc values.
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);  // Zero-extend the 8 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if ((n & (width - 1)) == 0) pre += pre_step;  // width is a power of two, so this mask test detects the end of a row.
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);  // Fold the upper 128 bits onto the lower, then sum the 4 remaining lanes.
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define OBMCSADWXH(w, h) /* Defines aom_obmc_sad<w>x<h>_avx2() for one block size. */ \
+ unsigned int aom_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { /* w is a literal at every use below, so this test is constant. */ \
+ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
+ } else { /* All other widths go through the 8-pixels-per-step kernel. */ \
+ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)  // One exported function per (width, height) pair listed here.
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {  // 16-bit-pixel variant: sum of rounded |wsrc - pre * mask| over a 4-wide block, two rows per iteration.
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // pre8 is converted to a uint16_t pointer; pre_stride is then applied in uint16_t elements.
+ int n = 0;  // Index into wsrc/mask; advances by 8 (two rows of 4) per iteration.
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+ do {
+ const __m128i v_p_w_0 = xx_loadl_64(pre);  // xx_loadl_64 comes from synonyms.h (not shown here); presumably an 8-byte load (4 pixels) - confirm.
+ const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);  // Same, for the following row.
+ const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);  // Low half from the first load, high half from the second.
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));  // Unaligned loads of 8 consecutive mask / wsrc values.
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);  // Zero-extend the 8 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+
+ pre += pre_stride << 1;  // Skip the two rows just consumed.
+ } while (n < 8 * (height >> 1));  // Runs height / 2 iterations (at least one, since this is a do-while).
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);  // Fold the upper 128 bits onto the lower, then sum the 4 remaining lanes.
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {  // 16-bit-pixel variant: sum of rounded |wsrc - pre * mask| over a width x height block, 8 pixels per iteration.
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // pre8 is converted to a uint16_t pointer; strides below are in uint16_t elements.
+ const int pre_step = pre_stride - width;  // Added to pre at each row end so that pre + n lands on the next row.
+ int n = 0;  // Linear index into wsrc/mask (width values per row, no padding).
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));  // Unaligned 16-byte load = 8 pixels of 16 bits.
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));  // Unaligned loads of 8 consecutive mask / wsrc values.
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);  // Zero-extend the 8 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;  // True exactly when a full row of width pixels has been consumed.
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);  // Fold the upper 128 bits onto the lower, then sum the 4 remaining lanes.
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h) /* Defines aom_highbd_obmc_sad<w>x<h>_avx2() for one block size. */ \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { /* w is a literal at every use below, so this test is constant. */ \
+ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
+ } else { /* All other widths go through the 8-pixels-per-step kernel. */ \
+ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)  // One exported function per (width, height) pair listed here.
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 0000000000..542572c761
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {  // Returns the sum of rounded |wsrc - pre * mask| over a 4-wide block, one row per iteration.
+ const int pre_step = pre_stride - 4;  // Together with n += 4 this moves pre + n down exactly one row per iteration.
+ int n = 0;  // Index into wsrc/mask, which are read contiguously (4 values per row).
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);  // xx_loadl_32 / xx_load_128 come from synonyms.h (not shown here); presumably a 4-byte load and an aligned 16-byte load - confirm.
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);  // Zero-extend the 4 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);  // xx_roundn_epu32 is defined outside this view; presumably an unsigned round-shift by 12 bits - confirm.
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;  // n is always a multiple of 4 here, so this runs on every iteration.
+ } while (n < 4 * height);  // One iteration per row.
+
+ return xx_hsum_epi32_si32(v_sad_d);  // Horizontal sum of the four accumulator lanes.
+}
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {  // Returns the sum of rounded |wsrc - pre * mask| over a width x height block, 8 pixels (two groups of 4) per iteration.
+ const int pre_step = pre_stride - width;  // Added to pre at each row end so that pre + n lands on the next row.
+ int n = 0;  // Linear index into wsrc/mask (width values per row, no padding).
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);  // Group 1 = pixels n+4..n+7. The xx_load* helpers come from synonyms.h (not shown here).
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);  // Group 0 = pixels n..n+3.
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);  // Zero-extend each group of 4 pixels to 32-bit lanes.
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);  // xx_roundn_epu32 is defined outside this view; presumably an unsigned round-shift by 12 bits - confirm.
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);  // Both groups feed the same 4-lane accumulator.
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;  // True exactly when a full row of width pixels has been consumed.
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);  // Horizontal sum of the four accumulator lanes.
+}
+
+#define OBMCSADWXH(w, h) /* Defines aom_obmc_sad<w>x<h>_sse4_1() for one block size. */ \
+ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { /* w is a literal at every use below, so this test is constant. */ \
+ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
+ } else { /* All other widths go through the 8-pixels-per-step kernel. */ \
+ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)  // One exported function per (width, height) pair listed here.
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {  // 16-bit-pixel variant: sum of rounded |wsrc - pre * mask| over a 4-wide block, one row per iteration.
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // pre8 is converted to a uint16_t pointer; strides below are in uint16_t elements.
+ const int pre_step = pre_stride - 4;  // Together with n += 4 this moves pre + n down exactly one row per iteration.
+ int n = 0;  // Index into wsrc/mask, which are read contiguously (4 values per row).
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);  // xx_loadl_64 / xx_load_128 come from synonyms.h (not shown here); presumably an 8-byte load (4 pixels) and an aligned 16-byte load - confirm.
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);  // Zero-extend the 4 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);  // xx_roundn_epu32 is defined outside this view; presumably an unsigned round-shift by 12 bits - confirm.
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;  // n is always a multiple of 4 here, so this runs on every iteration.
+ } while (n < 4 * height);  // One iteration per row.
+
+ return xx_hsum_epi32_si32(v_sad_d);  // Horizontal sum of the four accumulator lanes.
+}
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {  // 16-bit-pixel variant: sum of rounded |wsrc - pre * mask| over a width x height block, 8 pixels (two groups of 4) per iteration.
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);  // pre8 is converted to a uint16_t pointer; strides below are in uint16_t elements.
+ const int pre_step = pre_stride - width;  // Added to pre at each row end so that pre + n lands on the next row.
+ int n = 0;  // Linear index into wsrc/mask (width values per row, no padding).
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);  // Group 1 = pixels n+4..n+7. The xx_load* helpers come from synonyms.h (not shown here).
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);  // Group 0 = pixels n..n+3.
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);  // Zero-extend each group of 4 pixels to 32-bit lanes.
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);  // xx_roundn_epu32 is defined outside this view; presumably an unsigned round-shift by 12 bits - confirm.
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);  // Both groups feed the same 4-lane accumulator.
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;  // True exactly when a full row of width pixels has been consumed.
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);  // Horizontal sum of the four accumulator lanes.
+}
+
+#define HBD_OBMCSADWXH(w, h) /* Defines aom_highbd_obmc_sad<w>x<h>_sse4_1() for one block size. */ \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { /* w is a literal at every use below, so this test is constant. */ \
+ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
+ } else { /* All other widths go through the 8-pixels-per-step kernel. */ \
+ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)  // One exported function per (width, height) pair listed here.
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 0000000000..c23d8c4eb0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {  // For a w x h block: *sum = sum of rounded (wsrc - pre * mask), *sse = sum of their squares; 8 pixels per inner step.
+ int n = 0, width, height = h;  // n indexes wsrc/mask linearly; width/height count down the remaining columns/rows.
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+ __m128i v_d;
+ const uint8_t *pre_temp;  // Walks along the current row while pre stays at the row start.
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);  // 8 pixels of the current row.
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));  // Unaligned loads of 8 consecutive mask / wsrc values.
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);  // Zero-extend the 8 pixels to 32-bit lanes.
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);  // -1 in negative lanes; same signed rounding scheme as xx_roundn_epi32 with bits = 12.
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);  // Lanes 0-3.
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);  // Lanes 4-7.
+
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);  // Saturating pack to 16 bits. NOTE(review): assumes the rounded differences fit in int16 - confirm.
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);  // Squares each 16-bit value and adds adjacent pairs.
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;  // Next row of pre; wsrc/mask continue contiguously through n.
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);  // Lanes: (sum01, sum23, sse01, sse23).
+ v_d = _mm_hadd_epi32(v_d, v_d);  // Lane 0 = total sum, lane 1 = total sse.
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));  // The 4-byte shift brings lane 1 down to lane 0.
+}
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {  // For a w x h block: *sum = sum of rounded (wsrc - pre * mask), *sse = sum of their squares; 16 pixels per inner step.
+ int n = 0, width, height = h;  // n indexes wsrc/mask linearly; width/height count down the remaining columns/rows.
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;  // Walks along the current row while pre stays at the row start.
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);  // Rounding bias for the shift by 12 below.
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);  // 16 pixels of the current row (unaligned load).
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));  // mask / wsrc values for pixels 0-7 ...
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));  // ... and for pixels 8-15.
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);  // Pixels 0-7 zero-extended to 32-bit lanes.
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));  // The 8-byte shift exposes pixels 8-15.
+
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);  // -1 in negative lanes; same signed rounding scheme as xx_roundn_epi32 with bits = 12.
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);  // Lane-wise sum of both halves, for the running sum.
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);  // Saturating pack to 16 bits; lane order is irrelevant because only the total of squares is kept. NOTE(review): assumes values fit in int16 - confirm.
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);  // Squares each 16-bit value and adds adjacent pairs.
+
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;  // Next row of pre; wsrc/mask continue contiguously through n.
+ height -= 1;
+ } while (height > 0);
+
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);  // Per 128-bit half: (sum01, sum23, sse01, sse23).
+ v_d = _mm256_hadd_epi32(v_d, v_d);  // Per 128-bit half: lane 0 = partial sum, lane 1 = partial sse.
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));  // Combine the two halves.
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(res0, 4));  // The 4-byte shift brings lane 1 down to lane 0.
+}
+
+#define OBMCVARWXH(W, H) /* Defines aom_obmc_variance<W>x<H>_avx2() for one block size. */ \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { /* Width 4 uses the helper from obmc_intrinsic_sse4.h. */ \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { /* Only width 8 takes the 8-pixels-per-step kernel. */ \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { /* Every remaining width listed below is 16 or larger. */ \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); /* sse - sum^2 / (W * H); the square is formed in 64 bits. */ \
+ }
+
+OBMCVARWXH(128, 128)  // One exported function per (width, height) pair listed here.
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 0000000000..89b050eb20
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter); /* Prototype only; the
+  * definition is not in this file. OBMC_SUBPIX_VAR below calls it with
+  * pixel_step 1 and H + 1 output rows to fill its 16-bit scratch buffer.
+  */
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter); /* Prototype only; the
+  * definition is not in this file. OBMC_SUBPIX_VAR below calls it with
+  * src_pixels_per_line and pixel_step both equal to W to turn the first-pass
+  * output into the 8-bit W x H block that is handed to the variance kernel.
+  */
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+// Generates aom_obmc_variance<W>x<H>_sse4_1(). Width 4 uses the dedicated
+// 4-wide kernel, every other width the 8-pixels-per-step kernel. The kernel
+// writes *sse and the signed sum of the differences; the function returns
+// sse - sum^2 / (W * H).
+#define OBMCVARWXH(W, H) \
+  unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *mask, unsigned int *sse) { \
+    int sum; \
+    if (W != 4) { \
+      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+    } else { \
+      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+    } \
+    const int64_t sum_squared = (int64_t)sum * sum; \
+    return *sse - (unsigned int)(sum_squared / (W * H)); \
+  }
+
+// One definition per block size.
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
+
+#include "config/aom_dsp_rtcd.h"
+
+// Generates aom_obmc_sub_pixel_variance<W>x<H>_sse4_1(): the prediction is
+// run through the two bilinear passes (filters picked by xoffset, then
+// yoffset) into a local W x H 8-bit buffer, and the OBMC variance of that
+// buffer (row pitch W) against wsrc/mask is returned.
+#define OBMC_SUBPIX_VAR(W, H) \
+  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+    /* The first pass produces one row more than the block height. */ \
+    uint16_t first_pass_out[(H + 1) * W]; \
+    uint8_t filtered_pre[H * W]; \
+    \
+    aom_var_filter_block2d_bil_first_pass_ssse3( \
+        pre, first_pass_out, pre_stride, 1, H + 1, W, \
+        bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3( \
+        first_pass_out, filtered_pre, W, W, H, W, \
+        bilinear_filters_2t[yoffset]); \
+    \
+    return aom_obmc_variance##W##x##H##_sse4_1(filtered_pre, W, wsrc, mask, \
+                                               sse); \
+  }
+
+// One definition per block size.
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void hbd_obmc_variance_w4(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+ const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum += xx_hsum_epi32_si64(v_sum_d);
+ *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+// 8-bit-depth variant of the high bit-depth OBMC accumulation: runs the
+// kernel matching the block width once over the whole block and narrows the
+// 64-bit totals to *sum / *sse without any scaling.
+static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask, int w, int h,
+                                          unsigned int *sse, int *sum) {
+  uint64_t sse_total = 0;
+  int64_t sum_total = 0;
+
+  if (w != 4) {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                          w, h);
+  } else {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                         h);
+  }
+
+  *sum = (int)sum_total;
+  *sse = (unsigned int)sse_total;
+}
+
+// 10-bit-depth variant: accumulates 64-bit totals, then scales them with
+// ROUND_POWER_OF_TWO (sum by 2 bits, sse by 4 bits) before narrowing to
+// *sum / *sse.
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  uint64_t sse_total = 0;
+  int64_t sum_total = 0;
+
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                         h);
+  } else if (w >= 128 && h >= 128) {
+    // The largest block is fed to the kernel 64 rows at a time, with the
+    // partial results collected in the 64-bit totals (presumably so the
+    // kernel's 32-bit lane accumulators cannot overflow -- confirm).
+    const int rows_per_call = 64;
+    int rows_left = h;
+
+    assert(w == 128 && h == 128);
+
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse_total,
+                            &sum_total, w, rows_per_call);
+      pre8 += rows_per_call * pre_stride;
+      wsrc += rows_per_call * w;
+      mask += rows_per_call * w;
+      rows_left -= rows_per_call;
+    } while (rows_left > 0);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                          w, h);
+  }
+
+  *sum = (int)ROUND_POWER_OF_TWO(sum_total, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_total, 4);
+}
+
+// 12-bit-depth variant: accumulates 64-bit totals, then scales them with
+// ROUND_POWER_OF_TWO (sum by 4 bits, sse by 8 bits) before narrowing to
+// *sum / *sse. Blocks wider than 4 are handed to the kernel in pieces of at
+// most 512 pixels.
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int w, int h,
+                                           unsigned int *sse, int *sum) {
+  // Largest number of pixels passed to one hbd_obmc_variance_w8n() call.
+  const int max_pixels_per_call = 512;
+  uint64_t sse_total = 0;
+  int64_t sum_total = 0;
+
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                         h);
+  } else if (w * h > max_pixels_per_call) {
+    const int rows_per_call = max_pixels_per_call / w;
+    int rows_left = h;
+
+    assert(max_pixels_per_call % w == 0);
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse_total,
+                            &sum_total, w, rows_per_call);
+      pre8 += rows_per_call * pre_stride;
+      wsrc += rows_per_call * w;
+      mask += rows_per_call * w;
+      rows_left -= rows_per_call;
+    } while (rows_left > 0);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse_total, &sum_total,
+                          w, h);
+  }
+
+  *sum = (int)ROUND_POWER_OF_TWO(sum_total, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_total, 8);
+}
+
+// Generates the 8-, 10- and 12-bit aom_highbd_*_obmc_variance<W>x<H>_sse4_1()
+// entry points. Each one fills *sse and a local sum through the matching
+// highbd_*_obmc_variance() helper and returns sse - sum^2 / (W * H). The 10-
+// and 12-bit versions do the subtraction in 64 bits and return 0 instead of
+// a negative result; the 8-bit version subtracts in unsigned arithmetic.
+#define HBD_OBMCVARWXH(W, H) \
+  unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *mask, unsigned int *sse) { \
+    int sum; \
+    highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+    const int64_t mean_term = ((int64_t)sum * sum) / (W * H); \
+    return *sse - (unsigned int)mean_term; \
+  } \
+  \
+  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *mask, unsigned int *sse) { \
+    int sum; \
+    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+    const int64_t var = \
+        (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+    if (var < 0) return 0; \
+    return (uint32_t)var; \
+  } \
+  \
+  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+      const int32_t *mask, unsigned int *sse) { \
+    int sum; \
+    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+    const int64_t var = \
+        (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+    if (var < 0) return 0; \
+    return (uint32_t)var; \
+  }
+
+// Three definitions (8/10/12 bit) per block size.
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+HBD_OBMCVARWXH(4, 16)
+HBD_OBMCVARWXH(16, 4)
+HBD_OBMCVARWXH(8, 32)
+HBD_OBMCVARWXH(32, 8)
+HBD_OBMCVARWXH(16, 64)
+HBD_OBMCVARWXH(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx2.c b/third_party/aom/aom_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..b808d46778
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx2.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+// Loads the five quantizer tables into 256-bit registers. Each table is read
+// with one aligned 128-bit load and spread with permute4x64(0x54), i.e. the
+// four 64-bit lanes of the result are {lane 0, lane 1, lane 1, lane 1} of the
+// loaded value. When log_scale > 0, zbin and round are first rounded and
+// shifted right by log_scale. zbin is returned decremented by one.
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+                                      const int16_t *round_ptr, __m256i *round,
+                                      const int16_t *quant_ptr, __m256i *quant,
+                                      const int16_t *dequant_ptr,
+                                      __m256i *dequant,
+                                      const int16_t *shift_ptr, __m256i *shift,
+                                      int log_scale) {
+  __m256i zbin_v = _mm256_permute4x64_epi64(
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)), 0x54);
+  if (log_scale > 0) {
+    const __m256i half = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    zbin_v = _mm256_srai_epi16(_mm256_add_epi16(zbin_v, half), log_scale);
+  }
+  // Storing zbin - 1 lets the callers test "abs(coeff) >= zbin" with a single
+  // greater-than compare (see quantize_b_logscale0_16 and
+  // quantize_b_logscale_16).
+  *zbin = _mm256_sub_epi16(zbin_v, _mm256_set1_epi16(1));
+
+  __m256i round_v = _mm256_permute4x64_epi64(
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)),
+      0x54);
+  if (log_scale > 0) {
+    const __m256i half = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    round_v = _mm256_srai_epi16(_mm256_add_epi16(round_v, half), log_scale);
+  }
+  *round = round_v;
+
+  // The remaining tables are used as loaded.
+  *quant = _mm256_permute4x64_epi64(
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)),
+      0x54);
+  *dequant = _mm256_permute4x64_epi64(
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)),
+      0x54);
+  *shift = _mm256_permute4x64_epi64(
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)),
+      0x54);
+}
+
+// Reads coefficients through two aligned 256-bit loads (at coeff_ptr and
+// coeff_ptr + 8) and narrows them to 16 bits with signed saturation.
+// _mm256_packs_epi32 works per 128-bit half, so the result is not in linear
+// order: each half holds the matching half of the first load followed by the
+// matching half of the second load.
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  const __m256i first = _mm256_load_si256((__m256i *)coeff_ptr);
+  const __m256i second = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  const __m256i packed = _mm256_packs_epi32(first, second);
+  return packed;
+}
+
+// Widens sixteen 16-bit values to 32 bits with sign extension and writes them
+// with two aligned 256-bit stores (at coeff_ptr and coeff_ptr + 8). The
+// interleave is done per 128-bit half, which undoes the lane order produced
+// by load_coefficients_avx2().
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  // All-ones for negative values, zero otherwise: the upper 16 bits of the
+  // sign-extended result.
+  const __m256i sign_bits = _mm256_srai_epi16(coeff_vals, 15);
+  const __m256i widened_lo = _mm256_unpacklo_epi16(coeff_vals, sign_bits);
+  const __m256i widened_hi = _mm256_unpackhi_epi16(coeff_vals, sign_bits);
+
+  _mm256_store_si256((__m256i *)coeff_ptr, widened_lo);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), widened_hi);
+}
+
+// Quantizes and dequantizes 16 coefficients without log-scale adjustment and
+// writes the results to qcoeff_ptr / dqcoeff_ptr. Returns a 16-bit-lane mask
+// that is all-ones where the quantized magnitude is greater than zero (lanes
+// are in the order produced by load_coefficients_avx2). *v_zbin is expected
+// to already hold zbin - 1.
+static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+    __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i above_zbin = _mm256_cmpgt_epi16(abs_coeff, *v_zbin);
+
+  // Fast path: nothing exceeds the zero bin, so every output is zero.
+  if (_mm256_movemask_epi8(above_zbin) == 0) {
+    _mm256_store_si256((__m256i *)qcoeff_ptr, zero);
+    _mm256_store_si256((__m256i *)dqcoeff_ptr, zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+    return zero;
+  }
+
+  // rounded = above_zbin ? abs_coeff + round (saturating) : 0
+  const __m256i rounded =
+      _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *v_round), above_zbin);
+  // abs_q = (((rounded * quant) >> 16) + rounded) * quant_shift >> 16,
+  // evaluated in 16-bit lanes with high-half multiplies.
+  const __m256i scaled =
+      _mm256_add_epi16(_mm256_mulhi_epi16(rounded, *v_quant), rounded);
+  const __m256i abs_q = _mm256_mulhi_epi16(scaled, *v_quant_shift);
+  const __m256i nonzero_mask = _mm256_cmpgt_epi16(abs_q, zero);
+
+  // Give the magnitude the sign of the input, then dequantize (low 16 bits of
+  // the product).
+  const __m256i qcoeff = _mm256_sign_epi16(abs_q, coeff);
+  const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *v_dequant);
+
+  store_coefficients_avx2(qcoeff, qcoeff_ptr);
+  store_coefficients_avx2(dqcoeff, dqcoeff_ptr);
+  return nonzero_mask;
+}
+
+// Updates the per-lane running maximum of (iscan + 1) over the lanes selected
+// by v_mask. v_mask lanes are expected to be 0 or -1 (callers pass a compare
+// result): subtracting -1 adds one, and the AND clears unselected lanes. The
+// 0xD8 permute swaps the two middle 64-bit lanes of the unaligned iscan load
+// so that it lines up with the lane order of load_coefficients_avx2().
+static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax,
+                                       __m256i v_mask) {
+  const __m256i iscan_lanes = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+  const __m256i candidate = _mm256_and_si256(
+      _mm256_sub_epi16(iscan_lanes, v_mask), v_mask);
+  return _mm256_max_epi16(v_eobmax, candidate);
+}
+
+// Horizontal signed maximum of the sixteen 16-bit lanes of eob256.
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+  // 16 lanes -> 8: fold the upper 128 bits onto the lower 128 bits.
+  __m128i best = _mm_max_epi16(_mm256_castsi256_si128(eob256),
+                               _mm256_extractf128_si256(eob256, 1));
+  // 8 -> 4: bring 32-bit elements 2 and 3 down to positions 0 and 1.
+  best = _mm_max_epi16(best, _mm_shuffle_epi32(best, 0xe));
+  // 4 -> 2: bring 16-bit elements 2 and 3 down to positions 0 and 1.
+  best = _mm_max_epi16(best, _mm_shufflelo_epi16(best, 0xe));
+  // 2 -> 1: exchange elements 0 and 1, so both end up holding the maximum.
+  best = _mm_max_epi16(best, _mm_shufflelo_epi16(best, 0x1));
+  return _mm_extract_epi16(best, 1);
+}
+
+// AVX2 quantizer without log-scale adjustment. Processes n_coeffs
+// coefficients in groups of 16, writes qcoeff_ptr and dqcoeff_ptr, and stores
+// in *eob_ptr the largest (iscan + 1) over the coefficients that quantized to
+// a non-zero value (0 if there are none). scan is unused.
+void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
+  __m256i zbin_v, round_v, quant_v, dequant_v, shift_v;
+  __m256i nz_mask;
+  __m256i eob_max = _mm256_setzero_si256();
+
+  (void)scan;
+
+  load_b_values_avx2(zbin_ptr, &zbin_v, round_ptr, &round_v, quant_ptr,
+                     &quant_v, dequant_ptr, &dequant_v, quant_shift_ptr,
+                     &shift_v, 0);
+
+  // First group: lane 0 of each parameter vector still differs from the
+  // others (DC and first 15 AC).
+  nz_mask = quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                    &quant_v, &dequant_v, &round_v, &zbin_v,
+                                    &shift_v);
+  eob_max = get_max_lane_eob(iscan, eob_max, nz_mask);
+
+  // From here on every lane uses the value from the odd 64-bit lanes.
+  zbin_v = _mm256_unpackhi_epi64(zbin_v, zbin_v);
+  round_v = _mm256_unpackhi_epi64(round_v, round_v);
+  quant_v = _mm256_unpackhi_epi64(quant_v, quant_v);
+  shift_v = _mm256_unpackhi_epi64(shift_v, shift_v);
+  dequant_v = _mm256_unpackhi_epi64(dequant_v, dequant_v);
+
+  for (intptr_t i = 16; i < n_coeffs; i += 16) {
+    nz_mask = quantize_b_logscale0_16(coeff_ptr + i, qcoeff_ptr + i,
+                                      dqcoeff_ptr + i, &quant_v, &dequant_v,
+                                      &round_v, &zbin_v, &shift_v);
+    eob_max = get_max_lane_eob(iscan + i, eob_max, nz_mask);
+  }
+
+  *eob_ptr = accumulate_eob256(eob_max);
+}
+
+// Quantizes and dequantizes 16 coefficients with a log-scale adjustment and
+// writes the results to qcoeff_ptr / dqcoeff_ptr. Returns a 16-bit-lane mask
+// that is all-ones where the quantized magnitude is greater than zero (lanes
+// are in the order produced by load_coefficients_avx2). *v_zbin is expected
+// to already hold the scaled zbin minus one.
+static AOM_FORCE_INLINE __m256i quantize_b_logscale_16(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+    __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift, int log_scale) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i above_zbin = _mm256_cmpgt_epi16(abs_coeff, *v_zbin);
+
+  // Fast path: nothing exceeds the zero bin, so every output is zero.
+  if (_mm256_movemask_epi8(above_zbin) == 0) {
+    _mm256_store_si256((__m256i *)qcoeff_ptr, zero);
+    _mm256_store_si256((__m256i *)dqcoeff_ptr, zero);
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+    return zero;
+  }
+
+  // rounded = above_zbin ? abs_coeff + round (saturating) : 0
+  const __m256i rounded =
+      _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *v_round), above_zbin);
+  const __m256i scaled =
+      _mm256_add_epi16(_mm256_mulhi_epi16(rounded, *v_quant), rounded);
+
+  // abs_q = 16 bits of (scaled * quant_shift) starting at bit
+  // (16 - log_scale), stitched together from the high and low halves of the
+  // product.
+  const __m256i q_prod_hi = _mm256_mulhi_epi16(scaled, *v_quant_shift);
+  const __m256i q_prod_lo = _mm256_mullo_epi16(scaled, *v_quant_shift);
+  const __m256i abs_q =
+      _mm256_or_si256(_mm256_slli_epi16(q_prod_hi, log_scale),
+                      _mm256_srli_epi16(q_prod_lo, 16 - log_scale));
+
+  // abs_dq = 16 bits of (abs_q * dequant) starting at bit log_scale,
+  // assembled the same way.
+  const __m256i dq_prod_hi = _mm256_mulhi_epi16(abs_q, *v_dequant);
+  const __m256i dq_prod_lo = _mm256_mullo_epi16(abs_q, *v_dequant);
+  const __m256i abs_dq =
+      _mm256_or_si256(_mm256_slli_epi16(dq_prod_hi, 16 - log_scale),
+                      _mm256_srli_epi16(dq_prod_lo, log_scale));
+
+  // Both magnitudes take the sign of the input coefficient.
+  const __m256i dqcoeff = _mm256_sign_epi16(abs_dq, coeff);
+  const __m256i qcoeff = _mm256_sign_epi16(abs_q, coeff);
+  const __m256i nonzero_mask = _mm256_cmpgt_epi16(abs_q, zero);
+
+  store_coefficients_avx2(qcoeff, qcoeff_ptr);
+  store_coefficients_avx2(dqcoeff, dqcoeff_ptr);
+  return nonzero_mask;
+}
+
+// Shared body of the log-scaled AVX2 quantizers. Processes n_coeffs
+// coefficients in groups of 16 with the given log_scale, writes qcoeff_ptr
+// and dqcoeff_ptr, and stores in *eob_ptr the largest (iscan + 1) over the
+// coefficients that quantized to a non-zero value (0 if there are none).
+static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *iscan, int log_scale) {
+  __m256i zbin_v, round_v, quant_v, dequant_v, shift_v;
+  __m256i nz_mask;
+  __m256i eob_max = _mm256_setzero_si256();
+
+  load_b_values_avx2(zbin_ptr, &zbin_v, round_ptr, &round_v, quant_ptr,
+                     &quant_v, dequant_ptr, &dequant_v, quant_shift_ptr,
+                     &shift_v, log_scale);
+
+  // First group: lane 0 of each parameter vector still differs from the
+  // others (DC and first 15 AC).
+  nz_mask = quantize_b_logscale_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                   &quant_v, &dequant_v, &round_v, &zbin_v,
+                                   &shift_v, log_scale);
+  eob_max = get_max_lane_eob(iscan, eob_max, nz_mask);
+
+  // From here on every lane uses the value from the odd 64-bit lanes.
+  zbin_v = _mm256_unpackhi_epi64(zbin_v, zbin_v);
+  round_v = _mm256_unpackhi_epi64(round_v, round_v);
+  quant_v = _mm256_unpackhi_epi64(quant_v, quant_v);
+  shift_v = _mm256_unpackhi_epi64(shift_v, shift_v);
+  dequant_v = _mm256_unpackhi_epi64(dequant_v, dequant_v);
+
+  for (intptr_t i = 16; i < n_coeffs; i += 16) {
+    nz_mask = quantize_b_logscale_16(coeff_ptr + i, qcoeff_ptr + i,
+                                     dqcoeff_ptr + i, &quant_v, &dequant_v,
+                                     &round_v, &zbin_v, &shift_v, log_scale);
+    eob_max = get_max_lane_eob(iscan + i, eob_max, nz_mask);
+  }
+
+  *eob_ptr = accumulate_eob256(eob_max);
+}
+
+// 32x32 entry point: the shared AVX2 quantizer with log_scale 1. scan is
+// unused.
+void aom_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+  (void)scan;
+  quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, iscan,
+                             log_scale);
+}
+
+// 64x64 entry point: the shared AVX2 quantizer with log_scale 2. scan is
+// unused.
+void aom_quantize_b_64x64_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+  (void)scan;
+  quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, iscan,
+                             log_scale);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..ebef1fbac2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) { /* SSE2 quantizer. Handles n_coeffs coefficients
+  * 16 at a time (two vectors of eight), writing qcoeff_ptr, dqcoeff_ptr and
+  * *eob_ptr. The first eight coefficients use the parameter vectors exactly
+  * as loaded; after that each vector is replaced by its upper 64 bits
+  * duplicated (_mm_unpackhi_epi64), which is what every later coefficient
+  * uses. scan_ptr is unused. The load/calculate/store/scan helpers are not
+  * defined in this file (presumably in aom_dsp/x86/quantize_x86.h -- confirm).
+  */
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16; // First coefficient handled by the AC-only loop below.
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan_ptr;
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round); // Same DC-to-AC switch as zbin.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant); // coeff0/coeff1 now hold the dequantized values.
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0); // Keep the per-lane maximum across groups.
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3.c b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..25980a055a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+// Quantizes eight 16-bit magnitudes in place for the 64x64 path:
+//   t      = saturating_add(*coeff, round)
+//   t      = t + ((t * quant) >> 16)
+//   *coeff = bits 14..29 of the 32-bit signed product (t * *shift)
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+                                          const __m128i quant,
+                                          const __m128i *shift) {
+  const __m128i rounded = _mm_adds_epi16(*coeff, round);
+  const __m128i scaled =
+      _mm_add_epi16(_mm_mulhi_epi16(rounded, quant), rounded);
+
+  // The low half of the product supplies result bits 0..1 (product bits
+  // 14..15); the high half supplies result bits 2..15 (product bits 16..29).
+  const __m128i from_low_half =
+      _mm_srli_epi16(_mm_mullo_epi16(scaled, *shift), 14);
+  const __m128i from_high_half =
+      _mm_slli_epi16(_mm_mulhi_epi16(scaled, *shift), 2);
+
+  *coeff = _mm_or_si128(from_low_half, from_high_half);
+}
+
+// Dequantizes eight signed 16-bit quantized values and stores them as eight
+// 32-bit tran_low_t values:
+//   dqcoeff[i] = sign(qcoeff[i]) * ((|qcoeff[i]| * dequant[i]) >> 2)
+// 'zero' must be an all-zero vector. Aligned 16-byte stores are used, so
+// 'dqcoeff' must be 16-byte aligned.
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i magnitude = _mm_abs_epi16(qcoeff);
+
+  // Rebuild the full 32-bit products from their 16-bit halves, then
+  // "divide" by 4.
+  const __m128i prod_lo = _mm_mullo_epi16(magnitude, dequant);
+  const __m128i prod_hi = _mm_mulhi_epi16(magnitude, dequant);
+  const __m128i abs_dq_0 =
+      _mm_srli_epi32(_mm_unpacklo_epi16(prod_lo, prod_hi), 2);
+  const __m128i abs_dq_1 =
+      _mm_srli_epi32(_mm_unpackhi_epi16(prod_lo, prod_hi), 2);
+
+  // With qcoeff in the upper 16 bits of each 32-bit lane, the lane has the
+  // same sign (and zero-ness) as qcoeff, which is all _mm_sign_epi32 uses.
+  const __m128i sign_src_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_src_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  _mm_store_si128((__m128i *)(dqcoeff),
+                  _mm_sign_epi32(abs_dq_0, sign_src_0));
+  _mm_store_si128((__m128i *)(dqcoeff + 4),
+                  _mm_sign_epi32(abs_dq_1, sign_src_1));
+}
+
+// SSSE3 quantizer for the 64x64 path.
+//
+// Reads coefficients from coeff_ptr, writes quantized values to qcoeff_ptr,
+// dequantized values to dqcoeff_ptr, and the end-of-block count to *eob_ptr.
+//
+// Notes grounded in the code below:
+// - 'scan' and 'n_coeffs' are ignored; exactly 1024 coefficients
+//   (indices 0..1023) are always processed, 16 per step.
+// - zbin_ptr, round_ptr, quant_ptr, dequant_ptr and quant_shift_ptr are read
+//   with aligned 16-byte loads; qcoeff_ptr and dqcoeff_ptr are written with
+//   aligned 16-byte stores.
+// - zbin and round are scaled as (x + 2) >> 2 before use.
+// - The first 8 lanes use the loaded parameter vectors as-is; afterwards the
+//   upper 64 bits of each vector are broadcast and used for every remaining
+//   coefficient.
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i two = _mm_set1_epi16(2);
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, all_zero;
+ __m128i eob = zero, eob0;
+
+ // Unused: the block size is fixed (see the loop bound below) and only the
+ // inverse scan table is needed.
+ (void)scan;
+ (void)n_coeffs;
+
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, two);
+ round = _mm_add_epi16(round, two);
+ zbin = _mm_srli_epi16(zbin, 2);
+ round = _mm_srli_epi16(round, 2);
+ // Subtract one so the strict greater-than compares below act as ">= zbin".
+ zbin = _mm_sub_epi16(zbin, one);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ // Broadcast the upper 64 bits; from here on zbin holds only those values.
+ zbin = _mm_unpackhi_epi64(zbin, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ // No coefficient passed the zbin test: emit zeros and leave eob at zero.
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ // Still switch every parameter vector to its upper-half values for the
+ // loop that follows.
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ // The bound is hard-coded to 1024 coefficients, independent of n_coeffs.
+ for (index = 16; index < 1024; index += 16) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ // Fast path: all 16 coefficients are below the threshold.
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ continue;
+ }
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ // Reinsert signs, then zero the lanes that failed the zbin test.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ // Keep a per-lane running maximum of the eob candidates.
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ // Reduce the eight lanes to a single maximum.
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..fa616a6f1a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,302 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+;-----------------------------------------------------------------------
+; QUANTIZE_FN name, num_gprs
+;   %1 = function name suffix (b or b_32x32); emitted as quantize_%1
+;   %2 = number of general-purpose registers requested from cglobal
+;
+; Stack/register arguments, in order (as named in the cglobal line):
+;   coeff, ncoeff, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
+;   eob, scan, iscan
+; coeff/qcoeff/dqcoeff hold 32-bit values, iscan holds 16-bit values.
+; The C quantizer prototypes declare eob as uint16_t *, so exactly 16 bits
+; are written through it.
+;
+; Register roles after setup:
+;   m0 = zbin - 1, m1 = round, m2 = quant, m3 = dequant, m4 = shift
+;   (upper-half values are broadcast after the first 8 coefficients)
+;   m5 = zero / scratch, m8 = running per-lane eob maximum
+;   ncoeffq = negative count, stepped by mmsize (16 coefficients) up to zero
+; All vector loads/stores use mova, so every buffer must be 16-byte aligned.
+;-----------------------------------------------------------------------
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [dequantq] ; m3 = dequant
+ mov r2, shiftmp
+ psubw m0, [GLOBAL(pw_1)]
+ mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
+ ; point each array at its end and index it with a negative counter
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ ; coeff stored as 32bit numbers & require 16bit numbers
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
+ %endif
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m8, 1
+ psrlw m5, 15
+ por m8, m5
+ %endif
+ punpckhqdq m4, m4
+ %ifidn %1, b_32x32
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ ; skip the arithmetic when none of the 16 coefficients reaches zbin
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
+ %endif
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m14, 1
+ psrlw m5, 15
+ por m14, m5
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ ; all 16 coefficients quantize to zero: store zeros, eob is unaffected
+ mova [qcoeffq+ncoeffq*4+ 0], m5
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6w ; 16-bit store: a full-register store would
+ ; overwrite 6 bytes past the uint16_t eob
+ RET
+%endmacro
+
+; Instantiate the SSSE3 quantizers from QUANTIZE_FN: one for the "b" variant
+; and one for the "b_32x32" variant, each requesting 9 general-purpose
+; registers from cglobal.
+INIT_XMM ssse3
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 0000000000..5b040a278a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+// Loads the five quantizer parameter vectors with aligned 16-byte loads.
+// *zbin receives zbin_ptr[i] - 1 in every lane (presumably so that a strict
+// greater-than compare acts as ">= zbin" -- confirm against callers); the
+// other outputs receive their tables unmodified.
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+                                 const int16_t *round_ptr, __m128i *round,
+                                 const int16_t *quant_ptr, __m128i *quant,
+                                 const int16_t *dequant_ptr, __m128i *dequant,
+                                 const int16_t *shift_ptr, __m128i *shift) {
+  const __m128i zbin_raw = _mm_load_si128((const __m128i *)zbin_ptr);
+  const __m128i one = _mm_set1_epi16(1);
+
+  *round = _mm_load_si128((const __m128i *)round_ptr);
+  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *zbin = _mm_sub_epi16(zbin_raw, one);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// Conditionally negates each 16-bit lane of 'a': computes (a ^ sign) - sign,
+// which is -a where 'sign' is all ones and a where 'sign' is zero.
+// With ssse3 and later abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+  const __m128i flipped = _mm_xor_si128(a, sign);
+  return _mm_sub_epi16(flipped, sign);
+}
+
+// 32-bit-lane variant: computes (a ^ sign) - sign per lane, which negates a
+// lane when its 'sign' lane is all ones (0xFFFFFFFF) and keeps it when zero.
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  const __m128i flipped = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(flipped, sign);
+}
+
+// Quantizes eight 16-bit magnitudes in place:
+//   t      = saturating_add(*coeff, round)
+//   t      = t + ((t * quant) >> 16)
+//   *coeff = (t * shift) >> 16
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+                                    const __m128i quant, const __m128i shift) {
+  const __m128i rounded = _mm_adds_epi16(*coeff, round);
+  const __m128i scaled =
+      _mm_add_epi16(_mm_mulhi_epi16(rounded, quant), rounded);
+  *coeff = _mm_mulhi_epi16(scaled, shift);
+}
+
+// Quantizes eight 16-bit magnitudes in place with an extra left shift of
+// *log_scale applied to the final product:
+//   t      = saturating_add(*coeff, round)
+//   t      = t + ((t * quant) >> 16)
+//   *coeff = ((low16(t * *shift)) >> (16 - *log_scale)) |
+//            ((high16(t * *shift)) << *log_scale)
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+                                              const __m128i round,
+                                              const __m128i quant,
+                                              const __m128i *shift,
+                                              const int *log_scale) {
+  const __m128i rounded = _mm_adds_epi16(*coeff, round);
+  const __m128i scaled =
+      _mm_add_epi16(_mm_mulhi_epi16(rounded, quant), rounded);
+
+  // Top *log_scale bits of the low product half become the low result bits.
+  const __m128i from_low_half =
+      _mm_srli_epi16(_mm_mullo_epi16(scaled, *shift), (16 - *log_scale));
+  // The high product half fills the remaining result bits.
+  const __m128i from_high_half =
+      _mm_slli_epi16(_mm_mulhi_epi16(scaled, *shift), *log_scale);
+
+  *coeff = _mm_or_si128(from_low_half, from_high_half);
+}
+
+// Dequantizes eight 16-bit lanes: returns the low 16 bits of
+// qcoeff[i] * dequant[i] for each lane (any overflow beyond 16 bits is
+// discarded by _mm_mullo_epi16).
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+// Dequantizes eight signed 16-bit quantized values and stores them as eight
+// 32-bit tran_low_t values:
+//   dqcoeff[i] = sign(qcoeff[i]) * ((|qcoeff[i]| * dequant[i]) >> *log_scale)
+// Aligned 16-byte stores are used, so 'dqcoeff' must be 16-byte aligned.
+// 'zero' is kept for interface compatibility only and is no longer read.
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+                                                         __m128i dequant,
+                                                         const __m128i zero,
+                                                         tran_low_t *dqcoeff,
+                                                         const int *log_scale) {
+  // calculate abs
+  __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+  __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+  // Widen each 16-bit sign mask to a full 32-bit mask (0 or 0xFFFFFFFF) by
+  // interleaving it with itself. Interleaving with zero would produce
+  // 0x0000FFFF, for which the xor/sub negation below is only correct while
+  // the shifted product stays below 1 << 16.
+  const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, coeff_sign);
+  const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, coeff_sign);
+
+  // Full 32-bit products, rebuilt from their low and high 16-bit halves.
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  (void)zero;
+
+  // Scale the magnitudes, then reapply the original signs.
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+  dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+// Produces per-lane end-of-block candidates for 16 coefficients.
+// For each lane the value read from scan_ptr[index + i] is incremented by one
+// where the matching zbin mask lane is -1 (index -> count), and the lane is
+// forced to 0 where the coefficient equals zero. The two groups of eight are
+// combined with a per-lane signed maximum. scan_ptr + index must be 16-byte
+// aligned (aligned loads are used).
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+                                   const __m128i zbin_mask0,
+                                   const __m128i zbin_mask1,
+                                   const int16_t *scan_ptr, const int index,
+                                   const __m128i zero) {
+  const __m128i pos0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+  const __m128i pos1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+
+  // Subtracting a -1 mask adds one: convert from indices to counts.
+  const __m128i count0 = _mm_sub_epi16(pos0, zbin_mask0);
+  const __m128i count1 = _mm_sub_epi16(pos1, zbin_mask1);
+
+  // Drop the lanes whose coefficient is zero.
+  const __m128i is_zero0 = _mm_cmpeq_epi16(*coeff0, zero);
+  const __m128i is_zero1 = _mm_cmpeq_epi16(*coeff1, zero);
+  const __m128i cand0 = _mm_andnot_si128(is_zero0, count0);
+  const __m128i cand1 = _mm_andnot_si128(is_zero1, count1);
+
+  return _mm_max_epi16(cand0, cand1);
+}
+
+// Returns the signed maximum of the eight 16-bit lanes of 'eob'.
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  // Fold the upper 64 bits onto the lower 64 bits.
+  eob = _mm_max_epi16(eob, _mm_shuffle_epi32(eob, 0xe));
+  // Fold words 2..3 onto words 0..1.
+  eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0xe));
+  // Swap words 0 and 1 and fold; both then hold the overall maximum.
+  eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0x1));
+  return _mm_extract_epi16(eob, 1);
+}
+
+// Loads eight 32-bit coefficients with two aligned 16-byte loads and packs
+// them to eight 16-bit lanes with signed saturation.
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+  const __m128i first_four = _mm_load_si128((__m128i *)(coeff_ptr));
+  const __m128i next_four = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+  return _mm_packs_epi32(first_four, next_four);
+}
+
+// Sign-extends eight 16-bit lanes to 32 bits and writes them to coeff_ptr
+// with two aligned 16-byte stores.
+static INLINE void store_coefficients(__m128i coeff_vals,
+                                      tran_low_t *coeff_ptr) {
+  assert(sizeof(tran_low_t) == 4);
+
+  // An arithmetic shift by 15 gives 0xFFFF for negative lanes and 0 otherwise,
+  // i.e. the upper half of each sign-extended 32-bit value.
+  const __m128i upper_halves = _mm_srai_epi16(coeff_vals, 15);
+  const __m128i first_four = _mm_unpacklo_epi16(coeff_vals, upper_halves);
+  const __m128i next_four = _mm_unpackhi_epi16(coeff_vals, upper_halves);
+
+  _mm_store_si128((__m128i *)(coeff_ptr), first_four);
+  _mm_store_si128((__m128i *)(coeff_ptr + 4), next_four);
+}
+
+// Updates the running per-lane maximum in *mask from two compare masks.
+// If any bit of *cmp_mask0 or *cmp_mask1 is set, the 16 values at iscan_ptr
+// (aligned loads) are ANDed with the masks, their per-lane maximum is folded
+// into *mask, and *is_found is set to 1. Otherwise *is_found is left
+// untouched and *mask is only max'ed with zero.
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i all_zero;
+ __m128i temp_mask = _mm_setzero_si128();
+ // Despite its name, all_zero is non-zero when at least one lane is selected.
+ all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+ if (_mm_movemask_epi8(all_zero)) {
+ // Keep the iscan value only in the selected lanes.
+ __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+ __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+ temp_mask = _mm_max_epi16(mask0, mask1);
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+// Builds compare masks for 16 coefficients and forwards them to
+// update_mask1(). Each 16-bit lane of *qcoeff0 / *qcoeff1 is zero-extended to
+// 32 bits, shifted left by AOM_QM_BITS, and compared (signed greater-than)
+// against a threshold vector: threshold[0] for the first four lanes and
+// threshold[1] for the remaining twelve.
+// NOTE(review): the inputs are presumably non-negative magnitudes, since they
+// are zero-extended rather than sign-extended -- confirm against callers.
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold, const int16_t *iscan_ptr,
+ int *is_found, __m128i *mask) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+ // Zero-extend the sixteen 16-bit lanes into four vectors of 32-bit lanes.
+ coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+ coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+ coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+ coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+ coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+ cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+ cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+ coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+ cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+ coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+ cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+ // Narrow the 32-bit masks (0 / -1) back to 16-bit masks.
+ cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+// Returns (signed maximum of the eight 16-bit lanes of 'mask') + 1, read back
+// through _mm_extract_epi16 (which zero-extends the 16-bit lane).
+static INLINE int calculate_non_zero_count(__m128i mask) {
+ __m128i mask0, mask1;
+ int non_zero_count = 0;
+ // Fold the upper 64 bits onto the lower 64 bits.
+ mask0 = _mm_unpackhi_epi64(mask, mask);
+ mask1 = _mm_max_epi16(mask0, mask);
+ // Fold 32-bit element 1 onto element 0.
+ mask0 = _mm_shuffle_epi32(mask1, 1);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ // Fold 16-bit word 1 onto word 0; word 0 now holds the overall maximum.
+ mask1 = _mm_srli_epi32(mask0, 16);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+ return non_zero_count;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..0fea6ddfd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Reduces four 256-bit SAD accumulators to four 32-bit totals and writes
+// them to res[0..3] with one unaligned 16-byte store (res[i] is the total for
+// sum_ref-i). Only the even 32-bit elements of each accumulator are read; the
+// odd ones are expected to be zero, as stated below.
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+ const __m256i *sum_ref0,
+ const __m256i *sum_ref1,
+ const __m256i *sum_ref2,
+ const __m256i *sum_ref3) {
+ // In sum_ref-i the result is saved in the first 4 bytes and the other 4
+ // bytes are zeroed.
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ // 0, 0, 1, 1
+ __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ // 2, 2, 3, 3
+ __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sum adjacent 32 bit integers
+ // (per 128-bit half this yields the partial totals for ref 0, 1, 2, 3)
+ __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
+
+ // add the low 128 bit to the high 128 bit
+ __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
+ _mm256_extractf128_si256(sum_ref0123, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+}
+
+// Accumulates the sum of absolute differences between an M x N source block
+// and each of four reference blocks, writing the four totals to res[0..3].
+// Every row is consumed in 32-byte steps with unaligned loads; the wrappers
+// in this file use M = 32, 64 and 128.
+static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
+    int M, int N, const uint8_t *src, int src_stride,
+    const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+  const uint8_t *ref_row0 = ref[0];
+  const uint8_t *ref_row1 = ref[1];
+  const uint8_t *ref_row2 = ref[2];
+  const uint8_t *ref_row3 = ref[3];
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+
+  for (int row = 0; row < N; ++row) {
+    for (int col = 0; col < M; col += 32) {
+      // 32 source bytes and the co-located bytes of every reference.
+      const __m256i s = _mm256_loadu_si256((const __m256i *)(src + col));
+      const __m256i r0 = _mm256_loadu_si256((const __m256i *)(ref_row0 + col));
+      const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_row1 + col));
+      const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_row2 + col));
+      const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_row3 + col));
+
+      // Absolute-difference sums against the source, added to the totals.
+      acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0, s));
+      acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1, s));
+      acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2, s));
+      acc3 = _mm256_add_epi32(acc3, _mm256_sad_epu8(r3, s));
+    }
+    src += src_stride;
+    ref_row0 += ref_stride;
+    ref_row1 += ref_stride;
+    ref_row2 += ref_stride;
+    ref_row3 += ref_stride;
+  }
+
+  aggregate_and_store_sum(res, &acc0, &acc1, &acc2, &acc3);
+}
+
+// Three-reference variant of the M x N SAD kernel: only ref[0..2] are read.
+// res[0..2] receive the three totals and res[3] is written as 0, because a
+// zero accumulator is passed for the fourth slot.
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+    int M, int N, const uint8_t *src, int src_stride,
+    const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+  const __m256i zero = _mm256_setzero_si256();
+  const uint8_t *ref_row0 = ref[0];
+  const uint8_t *ref_row1 = ref[1];
+  const uint8_t *ref_row2 = ref[2];
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+
+  for (int row = 0; row < N; ++row) {
+    for (int col = 0; col < M; col += 32) {
+      // 32 source bytes and the co-located bytes of every reference.
+      const __m256i s = _mm256_loadu_si256((const __m256i *)(src + col));
+      const __m256i r0 = _mm256_loadu_si256((const __m256i *)(ref_row0 + col));
+      const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_row1 + col));
+      const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_row2 + col));
+
+      // Absolute-difference sums against the source, added to the totals.
+      acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0, s));
+      acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1, s));
+      acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2, s));
+    }
+    src += src_stride;
+    ref_row0 += ref_stride;
+    ref_row1 += ref_stride;
+    ref_row2 += ref_stride;
+  }
+  aggregate_and_store_sum(res, &acc0, &acc1, &acc2, &zero);
+}
+
+// Defines the public entry points aom_sad<m>x<n>x4d_avx2 and
+// aom_sad<m>x<n>x3d_avx2 as thin wrappers that forward to the generic
+// aom_sadMxNx4d_avx2 / aom_sadMxNx3d_avx2 kernels with M = m and N = n.
+#define SADMXN_AVX2(m, n) \
+ void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ } \
+ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ }
+
+// Block sizes instantiated for widths 32, 64 and 128.
+SADMXN_AVX2(32, 8)
+SADMXN_AVX2(32, 16)
+SADMXN_AVX2(32, 32)
+SADMXN_AVX2(32, 64)
+
+SADMXN_AVX2(64, 16)
+SADMXN_AVX2(64, 32)
+SADMXN_AVX2(64, 64)
+SADMXN_AVX2(64, 128)
+
+SADMXN_AVX2(128, 64)
+SADMXN_AVX2(128, 128)
+
+// Defines aom_sad_skip_<m>x<n>x4d_avx2: computes the 4-reference SAD over
+// every second row only (n / 2 rows, with both strides doubled) and then
+// doubles each of the four results.
+#define SAD_SKIP_MXN_AVX2(m, n) \
+ void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref, \
+ 2 * ref_stride, res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+// Block sizes instantiated for widths 32, 64 and 128.
+SAD_SKIP_MXN_AVX2(32, 8)
+SAD_SKIP_MXN_AVX2(32, 16)
+SAD_SKIP_MXN_AVX2(32, 32)
+SAD_SKIP_MXN_AVX2(32, 64)
+
+SAD_SKIP_MXN_AVX2(64, 16)
+SAD_SKIP_MXN_AVX2(64, 32)
+SAD_SKIP_MXN_AVX2(64, 64)
+SAD_SKIP_MXN_AVX2(64, 128)
+
+SAD_SKIP_MXN_AVX2(128, 64)
+SAD_SKIP_MXN_AVX2(128, 128)
+
+// SAD of a 16-wide, N-row source block against three references (ref[0..2]).
+// Two rows are combined into one 256-bit register per step, so N must be
+// even. res[0..2] receive the totals and res[3] is written as 0, because a
+// zero accumulator is passed for the fourth slot.
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *const ref[4],
+                                                 int ref_stride,
+                                                 uint32_t res[4]) {
+  const __m256i zero = _mm256_setzero_si256();
+  const uint8_t *ref_row0 = ref[0];
+  const uint8_t *ref_row1 = ref[1];
+  const uint8_t *ref_row2 = ref[2];
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  assert(N % 2 == 0);
+
+  for (int row = 0; row < N; row += 2) {
+    // Current row and the row below it, for the source and every reference.
+    const __m256i s = yy_loadu2_128(src + src_stride, src);
+    const __m256i r0 = yy_loadu2_128(ref_row0 + ref_stride, ref_row0);
+    const __m256i r1 = yy_loadu2_128(ref_row1 + ref_stride, ref_row1);
+    const __m256i r2 = yy_loadu2_128(ref_row2 + ref_stride, ref_row2);
+
+    // Absolute-difference sums against the source, added to the totals.
+    acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0, s));
+    acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1, s));
+    acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2, s));
+
+    src += 2 * src_stride;
+    ref_row0 += 2 * ref_stride;
+    ref_row1 += 2 * ref_stride;
+    ref_row2 += 2 * ref_stride;
+  }
+
+  aggregate_and_store_sum(res, &acc0, &acc1, &acc2, &zero);
+}
+
+// SAD of a 16-wide, N-row source block against four references. Two rows are
+// combined into one 256-bit register per step, so N must be even. The four
+// totals are written to res[0..3].
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *const ref[4],
+                                                 int ref_stride,
+                                                 uint32_t res[4]) {
+  const uint8_t *ref_row0 = ref[0];
+  const uint8_t *ref_row1 = ref[1];
+  const uint8_t *ref_row2 = ref[2];
+  const uint8_t *ref_row3 = ref[3];
+  __m256i acc0 = _mm256_setzero_si256();
+  __m256i acc1 = _mm256_setzero_si256();
+  __m256i acc2 = _mm256_setzero_si256();
+  __m256i acc3 = _mm256_setzero_si256();
+  assert(N % 2 == 0);
+
+  for (int row = 0; row < N; row += 2) {
+    // Current row and the row below it, for the source and every reference.
+    const __m256i s = yy_loadu2_128(src + src_stride, src);
+    const __m256i r0 = yy_loadu2_128(ref_row0 + ref_stride, ref_row0);
+    const __m256i r1 = yy_loadu2_128(ref_row1 + ref_stride, ref_row1);
+    const __m256i r2 = yy_loadu2_128(ref_row2 + ref_stride, ref_row2);
+    const __m256i r3 = yy_loadu2_128(ref_row3 + ref_stride, ref_row3);
+
+    // Absolute-difference sums against the source, added to the totals.
+    acc0 = _mm256_add_epi32(acc0, _mm256_sad_epu8(r0, s));
+    acc1 = _mm256_add_epi32(acc1, _mm256_sad_epu8(r1, s));
+    acc2 = _mm256_add_epi32(acc2, _mm256_sad_epu8(r2, s));
+    acc3 = _mm256_add_epi32(acc3, _mm256_sad_epu8(r3, s));
+
+    src += 2 * src_stride;
+    ref_row0 += 2 * ref_stride;
+    ref_row1 += 2 * ref_stride;
+    ref_row2 += 2 * ref_stride;
+    ref_row3 += 2 * ref_stride;
+  }
+
+  aggregate_and_store_sum(res, &acc0, &acc1, &acc2, &acc3);
+}
+
+// SAD16XNX3_AVX2(n) defines aom_sad16x<n>x3d_avx2 and SAD16XNX4_AVX2(n)
+// defines aom_sad16x<n>x4d_avx2; both are thin wrappers that forward to the
+// 16-wide kernels with N = n.
+#define SAD16XNX3_AVX2(n) \
+ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+#define SAD16XNX4_AVX2(n) \
+ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+// The 16x64 and 16x4 sizes are compiled out when CONFIG_REALTIME_ONLY is set.
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+// Defines aom_sad_skip_16x<n>x4d_avx2: computes the 4-reference SAD over
+// every second row only (n / 2 rows, with both strides doubled) and then
+// doubles each of the four results.
+#define SAD_SKIP_16XN_AVX2(n) \
+ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+// The 16x64 and 16x4 sizes are compiled out when CONFIG_REALTIME_ONLY is set.
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..6edad99516
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,437 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; 'spill_src_stride' greatly affects how the code works.
+;
+; When 'spill_src_stride' is false, the 'src_strideq' resides in
+; register, [srcq + src_strideq + offset] is allowed, so we can simply
+; use such form to access src memory and don't bother to update 'srcq'
+; at each line. We only update 'srcq' each two-lines using a compact
+; LEA instruction like [srcq+src_strideq*2].
+;
+; When 'spill_src_stride' is true, the 'src_strideq' resides in memory.
+; we cannot use above form to access memory, we have to update
+; 'srcq' at each line break. As we process two parts (first,second)
+; together in each macro function, the second part may also sit
+; in the next line, which means we also need to possibly add
+; one 'src_strideq' to 'srcq' before processing second part.
+
+; Emitted between loading the first and the second source row of a two-row
+; step. Defines 'second_offset', the displacement added to srcq to reach the
+; second row.
+%macro HANDLE_SECOND_OFFSET 0
+ %if spill_src_stride
+; Stride lives in memory: step srcq onto the second row and use offset 0.
+ %define second_offset 0
+ add srcq, src_strideq
+ %else
+; Stride lives in a register: leave srcq alone and index by the stride.
+ %define second_offset (src_strideq)
+ %endif
+%endmacro
+
+; This is specifically designed to handle the case where src_strideq is a
+; memory location. In that case we cannot perform the complex address
+; calculation with LEA, so we fall back to a simple ADD instruction at the
+; end of each line.
+; Moves srcq and ref1q..ref4q down by two rows after a two-row step.
+%macro ADVANCE_END_OF_TWO_LINES 0
+ %if spill_src_stride
+; Only one stride is added here: in the spilled case HANDLE_SECOND_OFFSET
+; has already moved srcq onto the second row.
+ add srcq, src_strideq
+ %else
+ lea srcq, [srcq+src_strideq*2]
+ %endif
+
+; note: ref_stride is never spilled when processing two lines
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endmacro
+
+; PROCESS_4x2x4 first
+; SAD of two 4-byte source rows against the same two rows of each of the four
+; references.
+;   %1 == 1: first step; the results initialise m6 and m7.
+;   %1 != 1: the results are added to m6 and m7.
+; m6 carries ref1 in its low qword and ref2 in its high qword; m7 carries
+; ref3 and ref4 the same way. m0 holds the two source rows duplicated in both
+; qwords so one psadbw covers two references.
+; Scratch: m0-m5.
+%macro PROCESS_4x2x4 1
+ movd m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movd m6, [ref1q]
+ movd m4, [ref2q]
+ movd m7, [ref3q]
+ movd m5, [ref4q]
+
+ movd m1, [srcq + second_offset]
+ movd m2, [ref1q+ref_strideq]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+ref_strideq]
+ movd m2, [ref3q+ref_strideq]
+ movd m3, [ref4q+ref_strideq]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+; Pair up: m0 = src|src, m6 = ref1|ref2, m7 = ref3|ref4.
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q]
+ movd m5, [ref1q+ref_strideq]
+ movd m2, [ref2q]
+ movd m4, [ref2q+ref_strideq]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q]
+ movd m5, [ref3q+ref_strideq]
+ punpckldq m3, m5
+ movd m4, [ref4q]
+ movd m5, [ref4q+ref_strideq]
+ punpckldq m4, m5
+ movd m5, [srcq + second_offset]
+ punpckldq m0, m5
+; Pair up: m0 = src|src, m1 = ref1|ref2, m3 = ref3|ref4.
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first
+; SAD of two 8-byte source rows against the same two rows of each of the four
+; references.
+;   %1 == 1: first step; the results initialise m4..m7.
+;   %1 != 1: the results are added to m4..m7.
+; m4..m7 correspond to ref1..ref4; within each register the low qword holds
+; the first row and the high qword the second row. m0 holds the two source
+; rows in the same layout.
+; Scratch: m0-m2.
+%macro PROCESS_8x2x4 1
+ movh m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movh m4, [ref1q]
+ movh m5, [ref2q]
+ movh m6, [ref3q]
+ movh m7, [ref4q]
+ movhps m0, [srcq + second_offset]
+ movhps m4, [ref1q+ref_strideq]
+ movhps m5, [ref2q+ref_strideq]
+ movhps m6, [ref3q+ref_strideq]
+ movhps m7, [ref4q+ref_strideq]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+; ref1 and ref2 first ...
+ movh m1, [ref1q]
+ movh m2, [ref2q]
+ movhps m0, [srcq + second_offset]
+ movhps m1, [ref1q+ref_strideq]
+ movhps m2, [ref2q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+; ... then ref3 and ref4, reusing m1/m2 as scratch.
+ movh m1, [ref3q]
+ movhps m1, [ref3q+ref_strideq]
+ movh m2, [ref4q]
+ movhps m2, [ref4q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endif
+%endmacro
+
+; PROCESS_FIRST_MMSIZE
+; Handles the first mmsize bytes of the first row and initialises the four
+; accumulators: m4..m7 = SAD against ref1..ref4.
+; The source is read with mova and the references with movu.
+; NOTE(review): mova is presumably the aligned-load form, which would require
+; srcq to be mmsize-aligned -- confirm against x86inc and the callers.
+%macro PROCESS_FIRST_MMSIZE 0
+ mova m0, [srcq]
+ movu m4, [ref1q]
+ movu m5, [ref2q]
+ movu m6, [ref3q]
+ movu m7, [ref4q]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%endmacro
+
+; PROCESS_16x1x4 offset
+; Adds the SAD of one mmsize-wide piece of the current row to m4..m7
+; (ref1..ref4).
+;   %1: byte offset of the piece within the row.
+; All four references are addressed through the shared ref_offsetq, so only
+; srcq and ref_offsetq need advancing between rows.
+; Scratch: m0-m2.
+%macro PROCESS_16x1x4 1
+ mova m0, [srcq + %1]
+ movu m1, [ref1q + ref_offsetq + %1]
+ movu m2, [ref2q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+ movu m1, [ref3q + ref_offsetq + %1]
+ movu m2, [ref4q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endmacro
+
+; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, else skip rows
+; Accumulator layout (set up by the PROCESS_* helpers):
+;   width >= 8: m4..m7 accumulate the SADs against ref1..ref4;
+;   width == 4: m6 accumulates ref1|ref2 and m7 accumulates ref3|ref4.
+; When %3 == 1 both strides are doubled on entry, half as many rows are
+; visited, and the four sums are doubled before they are stored.
+; On x86-32 only four arguments are loaded into registers; depending on the
+; path the strides stay in their stack slots (r1mp / r3mp) so that their
+; registers can be reused for cnt / ref_offset.
+%macro SADNXN4D 2-3 0
+
+%define spill_src_stride 0
+%define spill_ref_stride 0
+%define spill_cnt 0
+
+; Whether a shared offset should be used instead of adding strides to
+; each reference array. With this option, only one line will be processed
+; per loop iteration.
+%define use_ref_offset (%1 >= mmsize)
+
+; Remove loops in the 4x4 and 8x4 case
+%define use_loop (use_ref_offset || %2 > 4)
+
+%if %3 == 1 ; skip rows
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt
+%else
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
+ ref4
+%define spill_src_stride 1
+%define spill_ref_stride 1
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
+ ref3, ref4
+%define spill_src_stride 1
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
+ ref3, ref4
+%endif
+%endif
+%else ; normal sad
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt
+%else
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
+ %define spill_src_stride 1
+ %define spill_ref_stride 1
+%elif use_loop
+cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
+ %define spill_src_stride 1
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
+ ref4
+%endif
+%endif
+%endif
+
+; A spilled stride is accessed directly in its argument slot on the stack.
+%if spill_src_stride
+ %define src_strideq r1mp
+ %define src_strided r1mp
+%endif
+%if spill_ref_stride
+ %define ref_strideq r3mp
+ %define ref_strided r3mp
+%endif
+
+%if spill_cnt
+ SUB rsp, 4
+; The slot is 4 bytes wide and the counter is used as a dword everywhere.
+ %define cntd dword [rsp]
+%endif
+
+; Skip variant: double both strides so that every other row is visited.
+%if %3 == 1
+ sal src_strided, 1
+ sal ref_strided, 1
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+
+; ref1q arrives pointing at the array of four reference pointers; it is
+; overwritten last because it is the base of the other three loads.
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; Is the loop for this wxh in another function?
+; If so, we jump into that function for the loop and returning
+%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
+
+%if use_ref_offset
+ PROCESS_FIRST_MMSIZE
+%if %1 > mmsize
+ mov ref_offsetq, 0
+ mov cntd, %2 >> %3
+; Jump part way into the loop for the square version of this width
+%if %3 == 1
+ jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
+%else
+ jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
+%endif
+%else
+; Width == mmsize: the whole first row is done, so start on the second row.
+ mov ref_offsetq, ref_strideq
+ add srcq, src_strideq
+ mov cntd, (%2 >> %3) - 1
+%endif
+%if external_loop == 0
+.loop:
+; Unrolled horizontal loop
+%assign h_offset 0
+%rep %1/mmsize
+ PROCESS_16x1x4 h_offset
+%if h_offset == 0
+; The first row of the first column is done outside the loop and jumps here
+.midloop:
+%endif
+%assign h_offset h_offset+mmsize
+%endrep
+
+ add srcq, src_strideq
+ add ref_offsetq, ref_strideq
+ sub cntd, 1
+ jnz .loop
+%endif
+%else
+; Narrow widths: two rows per step, first step initialises the accumulators.
+ PROCESS_%1x2x4 1
+ ADVANCE_END_OF_TWO_LINES
+%if use_loop
+ mov cntd, (%2/2 >> %3) - 1
+.loop:
+%endif
+ PROCESS_%1x2x4 0
+%if use_loop
+ ADVANCE_END_OF_TWO_LINES
+ sub cntd, 1
+ jnz .loop
+%endif
+%endif
+
+%if spill_cnt
+; Undo stack allocation for cnt
+ ADD rsp, 4
+%endif
+
+%if external_loop == 0
+; The result pointer is argument 4. Define the aliases for every variant so
+; that a skip instantiation does not depend on an earlier normal one having
+; left them defined.
+ %define resultq r4
+ %define resultmp r4mp
+
+; Undo modifications on parameters on the stack. The doubling above is an
+; arithmetic left shift, so it is reversed with SAR; SHR would not restore a
+; negative stride.
+%if %3 == 1
+%if spill_src_stride
+ sar src_strided, 1
+%endif
+%if spill_ref_stride
+ sar ref_strided, 1
+%endif
+%endif
+
+%if %1 > 4
+; Each of m4..m7 holds two qword partial sums. Interleave them so that the
+; two halves of all four results line up, then add: m4 = {r1, r2, r3, r4}.
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ paddd m4, m5
+%if %3 == 1
+; Only half of the rows were visited, so double the sums.
+ pslld m4, 1
+%endif
+ movifnidn resultq, resultmp
+ movu [resultq], m4
+ RET
+%else
+; Width 4: compact dwords 0 and 2 of m6 (ref1, ref2) and of m7 (ref3, ref4)
+; into the low qword of each register.
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+%if %3 == 1
+; Only half of the rows were visited, so double the sums.
+ pslld m6, 1
+ pslld m7, 1
+%endif
+ movifnidn resultq, resultmp
+ movq [resultq+0], m6
+ movq [resultq+8], m7
+ RET
+%endif
+%endif ; external_loop == 0
+%endmacro
+
+; Normal (all rows) variants.
+; A non-square size wider than mmsize has no loop of its own: it jumps to the
+; .midloop label of the square function of the same width, so that square
+; size must be instantiated as well (128x128, 64x64 and 32x32 below).
+INIT_XMM sse2
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16
+SADNXN4D 16, 4
+SADNXN4D 8, 32
+SADNXN4D 32, 8
+SADNXN4D 16, 64
+SADNXN4D 64, 16
+%endif
+; Skip-row variants (third argument 1). The same rule applies: the wide
+; non-square sizes jump into the skip version of the square size.
+SADNXN4D 128, 128, 1
+SADNXN4D 128, 64, 1
+SADNXN4D 64, 128, 1
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16, 1
+SADNXN4D 8, 32, 1
+SADNXN4D 32, 8, 1
+SADNXN4D 16, 64, 1
+SADNXN4D 64, 16, 1
+%endif
+
+; Different assembly is needed when the height gets subsampled to 2
+; SADNXN4D 16, 4, 1
+; SADNXN4D 8, 4, 1
+; SADNXN4D 4, 4, 1
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..24cea76b37
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+// Sum of absolute differences between the 64-byte-wide, h-row block at
+// src_ptr and the block of the same size at ref_ptr. Each row is handled as
+// two unaligned 32-byte loads per buffer.
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  __m256i acc = _mm256_setzero_si256();
+  for (int row = 0; row < h; ++row) {
+    const __m256i ref_lo = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    const __m256i ref_hi = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    const __m256i src_lo = _mm256_loadu_si256((__m256i const *)src_ptr);
+    const __m256i src_hi = _mm256_loadu_si256((__m256i const *)(src_ptr + 32));
+    const __m256i row_sad = _mm256_add_epi32(_mm256_sad_epu8(ref_lo, src_lo),
+                                             _mm256_sad_epu8(ref_hi, src_hi));
+    acc = _mm256_add_epi32(acc, row_sad);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+  // Fold the four 64-bit partial sums into the lowest 32-bit lane.
+  acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));
+  const __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
+                                      _mm256_extracti128_si256(acc, 1));
+  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(total);
+  _mm256_zeroupper();
+  return res;
+}
+
+// Sum of absolute differences between the 32-byte-wide, h-row block at
+// src_ptr and the block of the same size at ref_ptr. Rows are consumed in
+// pairs, so h >> 1 iterations are run.
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  const int row_pairs = h >> 1;
+  const int src_step = src_stride << 1;
+  const int ref_step = ref_stride << 1;
+  __m256i acc = _mm256_setzero_si256();
+  for (int pair = 0; pair < row_pairs; ++pair) {
+    const __m256i ref_top = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    const __m256i ref_bot =
+        _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    const __m256i src_top = _mm256_loadu_si256((__m256i const *)src_ptr);
+    const __m256i src_bot =
+        _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride));
+    const __m256i pair_sad = _mm256_add_epi32(
+        _mm256_sad_epu8(ref_top, src_top), _mm256_sad_epu8(ref_bot, src_bot));
+    acc = _mm256_add_epi32(acc, pair_sad);
+    ref_ptr += ref_step;
+    src_ptr += src_step;
+  }
+  // Fold the four 64-bit partial sums into the lowest 32-bit lane.
+  acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));
+  const __m128i total = _mm_add_epi32(_mm256_castsi256_si128(acc),
+                                      _mm256_extracti128_si256(acc, 1));
+  const unsigned int res = (unsigned int)_mm_cvtsi128_si32(total);
+  _mm256_zeroupper();
+  return res;
+}
+
+// Emits aom_sad64x<h>_avx2(): sad64xh_avx2() with the height fixed to h.
+#define FSAD64_H(h) \
+ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+// Emits aom_sad_skip_64x<h>_avx2(): runs sad64xh_avx2() over h / 2 rows with
+// both strides doubled (i.e. every other row) and returns twice that sum.
+#define FSADS64_H(h) \
+ unsigned int aom_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+// Emits aom_sad32x<h>_avx2(): sad32xh_avx2() with the height fixed to h.
+#define FSAD32_H(h) \
+ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+// Emits aom_sad_skip_32x<h>_avx2(): runs sad32xh_avx2() over h / 2 rows with
+// both strides doubled (i.e. every other row) and returns twice that sum.
+#define FSADS32_H(h) \
+ unsigned int aom_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+// FSAD64 / FSAD32 list every 64-wide and 32-wide function that is generated
+// from the FSAD*_H (all rows) and FSADS*_H (skip rows) macros.
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+// Drop every generator macro once the functions have been emitted, including
+// the skip-row generators, so none of them leaks into the rest of the file.
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
+
+// Emits aom_sad64x<h>_avg_avx2(): the SAD between the source block and the
+// rounded byte-wise average (_mm256_avg_epu8) of the reference block and
+// second_pred. second_pred is read as a packed buffer: 64 bytes are consumed
+// per row and the pointer advances by 64 each iteration.
+#define FSADAVG64_H(h) \
+ unsigned int aom_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+// Emits aom_sad32x<h>_avg_avx2(): the SAD between the source block and the
+// rounded byte-wise average (_mm256_avg_epu8) of the reference block and
+// second_pred. Two rows are handled per iteration (h >> 1 iterations).
+// second_pred is read as a packed buffer: the two rows are taken from
+// second_pred and second_pred + 32, and the pointer advances by 64.
+#define FSADAVG32_H(h) \
+ unsigned int aom_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+// FSADAVG64 / FSADAVG32 list every 64-wide and 32-wide averaging SAD that is
+// generated from FSADAVG64_H and FSADAVG32_H.
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
+
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
+
+/* clang-format off */
+FSADAVG64
+FSADAVG32
+/* clang-format on */
+
+// The generator macros are not needed once the functions have been emitted.
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 0000000000..c5da6e9ab3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Sum of absolute differences over a 32x32 block: 16 iterations, each
+// covering two 32-byte rows of src_ptr and ref_ptr (unaligned loads).
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ __m256i s1, s2, r1, r2;
+ __m256i sum = _mm256_setzero_si256();
+ __m128i sum_i128;
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+ s2 = _mm256_sad_epu8(
+ r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+ ref_ptr += ref_stride << 1;
+ src_ptr += src_stride << 1;
+ }
+
+ // Fold the four 64-bit partial sums into the lowest 32-bit lane.
+ sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+ sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+ _mm256_castsi256_si128(sum));
+ return (unsigned int)_mm_cvtsi128_si32(sum_i128);
+}
+
+// SAD of a 64-wide block: the sum of sad32x32() over its left and right
+// 32-byte-wide halves.
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  const unsigned int right_half = 32;
+  const uint32_t left = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  const uint32_t right = sad32x32(src_ptr + right_half, src_stride,
+                                  ref_ptr + right_half, ref_stride);
+  return left + right;
+}
+
+// SAD of a 64x64 block: sad64x32() over the top half plus sad64x32() over
+// the half that starts 32 strides further down.
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  const uint8_t *const src_bottom = src_ptr + (src_stride << 5);
+  const uint8_t *const ref_bottom = ref_ptr + (ref_stride << 5);
+  const uint32_t top = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  const uint32_t bottom =
+      sad64x32(src_bottom, src_stride, ref_bottom, ref_stride);
+  return top + bottom;
+}
+
+// SAD of a 128x64 block: the sum of sad64x64() over its left and right
+// 64-byte-wide halves.
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  const unsigned int right_half = 64;
+  const uint32_t left = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  const uint32_t right = sad64x64(src_ptr + right_half, src_stride,
+                                  ref_ptr + right_half, ref_stride);
+  return left + right;
+}
+
+// SAD of a 64x128 block: sad64x64() over the top half plus sad64x64() over
+// the half that starts 64 strides further down.
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  const uint8_t *const src_bottom = src_ptr + (src_stride << 6);
+  const uint8_t *const ref_bottom = ref_ptr + (ref_stride << 6);
+  const uint32_t top = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  const uint32_t bottom =
+      sad64x64(src_bottom, src_stride, ref_bottom, ref_stride);
+  return top + bottom;
+}
+
+// SAD of a 128x128 block: aom_sad128x64_avx2() over the top half plus the
+// same over the half that starts 64 strides further down.
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *ref_ptr, int ref_stride) {
+  const uint8_t *const src_bottom = src_ptr + (src_stride << 6);
+  const uint8_t *const ref_bottom = ref_ptr + (ref_stride << 6);
+  const uint32_t top =
+      aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+  const uint32_t bottom =
+      aom_sad128x64_avx2(src_bottom, src_stride, ref_bottom, ref_stride);
+  return top + bottom;
+}
+
+// Skip-row SAD of a 128x64 block: sad64x32() with doubled strides over the
+// left and right 64-byte-wide halves, with the total doubled.
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t right_half = 64;
+  const uint32_t left =
+      sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  const uint32_t right = sad64x32(src_ptr + right_half, src_stride * 2,
+                                  ref_ptr + right_half, ref_stride * 2);
+  return 2 * (left + right);
+}
+
+// Skip-row SAD of a 64x128 block: sad64x64() with doubled strides, with the
+// result doubled.
+unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const int src_step = 2 * src_stride;
+  const int ref_step = 2 * ref_stride;
+  const uint32_t sampled = sad64x64(src_ptr, src_step, ref_ptr, ref_step);
+  return 2 * sampled;
+}
+
+// Skip-row SAD of a 128x128 block: aom_sad128x64_avx2() with doubled
+// strides, with the result doubled.
+unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride) {
+  const int src_step = 2 * src_stride;
+  const int ref_step = 2 * ref_stride;
+  const uint32_t sampled =
+      aom_sad128x64_avx2(src_ptr, src_step, ref_ptr, ref_step);
+  return 2 * sampled;
+}
+
+// SAD between a 64-byte-wide, h-row source block and the rounded byte-wise
+// average (_mm256_avg_epu8) of the reference block and second_pred.
+// second_pred advances by second_pred_stride per row, so the caller chooses
+// the layout of the prediction buffer.
+static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int h, const uint8_t *second_pred,
+ const int second_pred_stride) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ ref1_reg = _mm256_avg_epu8(
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
+ ref2_reg = _mm256_avg_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ second_pred += second_pred_stride;
+ }
+ // Fold the four 64-bit partial sums into the lowest 32-bit lane.
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+}
+
+// Averaging SAD of a 64x128 block, computed as two 64-row passes of
+// sad_w64_avg_avx2(). second_pred is walked with a stride of 64, so the
+// second pass starts 64 << 6 bytes into it.
+unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred) {
+  const uint32_t top = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr,
+                                        ref_stride, 64, second_pred, 64);
+  const uint32_t bottom = sad_w64_avg_avx2(
+      src_ptr + (src_stride << 6), src_stride, ref_ptr + (ref_stride << 6),
+      ref_stride, 64, second_pred + (64 << 6), 64);
+  return top + bottom;
+}
+
+// Averaging SAD of a 128x64 block, computed as sad_w64_avg_avx2() over the
+// left and right 64-byte-wide halves. second_pred is walked with a stride of
+// 128 and the right half starts 64 bytes into each of its rows.
+unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred) {
+  const unsigned int right_half = 64;
+  const uint32_t left = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr,
+                                         ref_stride, 64, second_pred, 128);
+  const uint32_t right = sad_w64_avg_avx2(
+      src_ptr + right_half, src_stride, ref_ptr + right_half, ref_stride, 64,
+      second_pred + right_half, 128);
+  return left + right;
+}
+
+// Averaging SAD of a 128x128 block, computed as aom_sad128x64_avg_avx2()
+// over the top half plus the same over the half that starts 64 strides
+// further down (128 << 6 bytes further into second_pred).
+unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     const uint8_t *second_pred) {
+  const uint32_t top = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
+                                              ref_stride, second_pred);
+  const uint32_t bottom = aom_sad128x64_avg_avx2(
+      src_ptr + (src_stride << 6), src_stride, ref_ptr + (ref_stride << 6),
+      ref_stride, second_pred + (128 << 6));
+  return top + bottom;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..dbe8ca3161
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5, or 7 when the src_stride3
+;        and ref_stride3 registers are needed as well
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+; Emits the prologue shared by every SAD function in this file: declares the
+; function with cglobal, doubles both strides for the skip variant,
+; sign-extends the strides and, when %3 == 7, precomputes
+; src_stride3q / ref_stride3q = 3 * stride.
+%macro SAD_FN 4
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
+; avg: one extra argument (second_pred) is loaded.
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+; No named register is left for the row counter here: x86-64 uses the extra
+; register r7, x86-32 keeps the counter in the stack slot r0m.
+%if AOM_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; One 128-byte row is processed per iteration, as two groups of four
+; 16-byte pieces; m0 accumulates the sum.
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+
+.loop:
+; Bytes 0..63 of the row.
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+; Bytes 64..127 of the row.
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+; second_pred is consumed contiguously, mmsize*8 bytes per row.
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 128, 2 ; sad_skip_128x128_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+SAD128XN 64, 2 ; sad_skip_128x64_sse2
+
+
+; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; One 64-byte row is processed per iteration; m0 accumulates the sum.
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+; second_pred is consumed contiguously, mmsize*4 bytes per row.
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 16 ; sad64x16_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 16, 1 ; sad64x16_avg_sse2
+SAD64XN 128, 2 ; sad_skip_64x128_sse2
+SAD64XN 64, 2 ; sad_skip_64x64_sse2
+SAD64XN 32, 2 ; sad_skip_64x32_sse2
+SAD64XN 16, 2 ; sad_skip_64x16_sse2
+
+; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; Two 32-byte rows are processed per iteration, hence the %1/2 (or %1/4 when
+; skipping rows) iteration count; m0 accumulates the sum.
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+; second_pred is consumed contiguously, mmsize*4 bytes per two rows.
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 8 ; sad32x8_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 8, 1 ; sad32x8_avg_sse2
+SAD32XN 64, 2 ; sad_skip_32x64_sse2
+SAD32XN 32, 2 ; sad_skip_32x32_sse2
+SAD32XN 16, 2 ; sad_skip_32x16_sse2
+SAD32XN 8, 2 ; sad_skip_32x8_sse2
+
+; unsigned int aom_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; Four 16-byte rows are processed per iteration (rows 0, 1, 2 and 3 strides
+; from the current pointer), hence the %1/4 (or %1/8 when skipping rows)
+; iteration count; m0 accumulates the sum.
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+; second_pred is consumed contiguously, mmsize*4 bytes per four rows.
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+; Note: there is no skip variant for 16x4.
+INIT_XMM sse2
+SAD16XN 64 ; sad16x64_sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 4 ; sad16x4_sse2
+SAD16XN 64, 1 ; sad16x64_avg_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 4, 1 ; sad16x4_avg_sse2
+SAD16XN 64, 2 ; sad_skip_16x64_sse2
+SAD16XN 32, 2 ; sad_skip_16x32_sse2
+SAD16XN 16, 2 ; sad_skip_16x16_sse2
+SAD16XN 8, 2 ; sad_skip_16x8_sse2
+
+; unsigned int aom_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; Four 8-byte rows are processed per iteration, packed two rows per register
+; (movh fills the low qword, movhps the high qword); m0 accumulates the sum.
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+; second_pred is consumed contiguously, mmsize*2 bytes per four rows.
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+; Note: there is no skip variant for 8x4.
+INIT_XMM sse2
+SAD8XN 32 ; sad8x32_sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 32, 1 ; sad8x32_avg_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 32, 2 ; sad_skip_8x32_sse2
+SAD8XN 16, 2 ; sad_skip_8x16_sse2
+SAD8XN 8, 2 ; sad_skip_8x8_sse2
+
+; unsigned int aom_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+; Macro arguments:
+;   %1: height
+;   %2: 0 = normal sad (default), 1 = avg with second_pred, 2 = skip rows
+; Four 4-byte rows are processed per iteration, packed into one register
+; (punpckldq + movlhps) so that a single psadbw covers all of them; m0
+; accumulates the sum.
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+; m1 = four reference rows.
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+; second_pred is consumed contiguously, mmsize bytes per four rows.
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+; m2 = four source rows, in the same layout.
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+; Add the high qword partial sum to the low one; the result is returned in eax.
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+; Note: there is no skip variant for 4x4.
+INIT_XMM sse2
+SAD4XN 16 ; sad4x16_sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 16, 2 ; sad_skip_4x16_sse2
+SAD4XN 8, 2 ; sad_skip_4x8_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 0000000000..c5a5f5c234
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Adds to the eight 32-bit lanes of *sum the squared differences between the
+// 32 8-bit pixels loaded from a and from b.
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i zero = _mm256_setzero_si256();
+ // Interleaving with zero widens every byte to an unsigned 16-bit value.
+ const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+ const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+ const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+ const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ // madd squares each 16-bit difference and sums adjacent pairs into 32 bits.
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+// Returns the total of the eight 32-bit lanes of *sum_all. The lanes are
+// zero-extended to 64 bits (interleaved with zero) before being added.
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ __m256i zero = _mm256_setzero_si256();
+ const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+ const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ // Fold 4 x 64-bit -> 2 x 64-bit -> 1 x 64-bit.
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Zero-extends the eight 32-bit lanes of *sum32 to 64 bits and adds them into
+// the four 64-bit lanes of *sum.
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+// Returns the total of the four 64-bit lanes of sum_4x64.
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+ int64_t sum;
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+#endif
+
+// Adds to the 32-bit lanes of *sum the squared differences of a 4x4 block of
+// 8-bit pixels: four rows of 4 bytes from a (stride a_stride) against four
+// rows from b (stride b_stride).
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ // Pack the four rows of each block into a single 16-byte vector.
+ const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+ _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+ _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+// Adds to the 32-bit lanes of *sum the squared differences of an 8x2 block of
+// 8-bit pixels: two rows of 8 bytes from a against two rows from b.
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ // Join both rows into 16 bytes, then widen to sixteen 16-bit values.
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+// Returns the sum of squared differences between two width x height blocks of
+// 8-bit pixels, a (row stride a_stride) and b (row stride b_stride).
+//
+// Widths 4, 8, 16, 32, 64 and 128 have dedicated loops; any other width goes
+// through the default case. All partial sums are kept in 32-bit lanes and
+// widened to 64 bits only once, by summary_all_avx2().
+// NOTE(review): every loop is do/while and advances y by a fixed step (4, 2
+// or 1 rows), so height is presumably a positive multiple of that step --
+// confirm against callers.
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ __m256i zero = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ // Two 16-byte rows are combined into one 256-bit vector per operand.
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_a1 = xx_loadu_128(a + a_stride);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_b1 = xx_loadu_128(b + b_stride);
+ const __m256i v_a =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+ const __m256i v_b =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+ const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+ const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+ const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+ const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+ const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+ const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+ const __m256i temp =
+ _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+ _mm256_madd_epi16(v_bsub, v_bsub));
+ sum = _mm256_add_epi32(sum, temp);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default:
+ if ((width & 0x07) == 0) {
+ // Width is a multiple of 8: walk each pair of rows in 8x2 tiles.
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ } else {
+ // Four rows at a time: 8-wide tiles (two 8x2 calls each) followed by
+ // one trailing 4x4 tile. This covers exactly `width` columns only when
+ // width % 8 == 4.
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ const uint8_t *a2 = a + i + (a_stride << 1);
+ const uint8_t *b2 = b + i + (b_stride << 1);
+ sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ }
+ sse = summary_all_avx2(&sum);
+ break;
+ }
+
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Adds to the eight 32-bit lanes of *sum the squared differences between the
+// 16 16-bit pixels loaded from a and from b.
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+// Adds to the 32-bit lanes of *sum the squared differences of a 4x4 block of
+// 16-bit pixels (four rows of 4 samples from a against four rows from b).
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ // Rows are paired into 128-bit halves; a and b use the same arrangement, so
+ // the lane order does not affect the accumulated total.
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+// Adds to the 32-bit lanes of *sum the squared differences of an 8x2 block of
+// 16-bit pixels (rows a and a + a_stride against b and b + b_stride).
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+// Returns the sum of squared differences between two width x height blocks of
+// 16-bit pixels. a8 and b8 are converted to uint16_t pointers with
+// CONVERT_TO_SHORTPTR; strides are applied to the converted pointers.
+//
+// Widths 4, 8 and 16 accumulate the whole block in 32-bit lanes. Widths 32,
+// 64, 128 and the default case accumulate a bounded batch of rows in a
+// temporary 32-bit sum32 and then widen it into the 64-bit lanes of sum with
+// summary_32_avx2() -- presumably to keep the 32-bit lanes from overflowing.
+// NOTE(review): loops are do/while with fixed row steps, so height is
+// presumably a positive multiple of the step used by each path -- confirm
+// against callers.
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ // Batch of at most 64 rows (or the rows remaining) per 32-bit sum.
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 64:
+ do {
+ // Batch of at most 32 rows per 32-bit sum.
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 128:
+ do {
+ // Batch of at most 16 rows per 32-bit sum.
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ default:
+ if (width & 0x7) {
+ // Four rows at a time: 8-wide tiles (two 8x2 calls each) followed by
+ // one trailing 4x4 tile. This covers exactly `width` columns only when
+ // width % 8 == 4. The 32-bit sum is widened after every 4-row strip.
+ do {
+ int i = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ const uint16_t *a2 = a + i + (a_stride << 1);
+ const uint16_t *b2 = b + i + (b_stride << 1);
+ highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ summary_32_avx2(&sum32, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ } else {
+ // Width is a multiple of 8: 8x2 tiles, widened after at most 8 rows.
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ l += 2;
+ } while (l < 8 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ sse = summary_4x64_avx2(sum);
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 0000000000..7e74554d75
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Returns the total of the four 32-bit lanes of *sum_all. The lanes are
+// zero-extended to 64 bits before being added.
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Zero-extends the four 32-bit lanes of *sum32 to 64 bits and adds them into
+// the two 64-bit lanes of *sum64.
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+ *sum64 = _mm_add_epi64(sum0, *sum64);
+ *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+// Adds to the four 32-bit lanes of *sum the squared differences between the
+// 16 8-bit pixels loaded from a and from b.
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ // Widen the low and the high 8 bytes of each vector to 16-bit values.
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+// Adds to the 32-bit lanes of *sum the squared differences of a 4x2 block of
+// 8-bit pixels: two rows of 4 bytes from a against two rows from b.
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+// Adds to the 32-bit lanes of *sum the squared differences between one row of
+// 8 8-bit pixels at a and one row at b.
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+ __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+// Returns the sum of squared differences between two width x height blocks of
+// 8-bit pixels, a (row stride a_stride) and b (row stride b_stride).
+//
+// Widths 4, 8, 16, 32, 64 and 128 have dedicated loops; any other width goes
+// through the default case. Partial sums stay in 32-bit lanes until
+// summary_all_sse4() widens and totals them.
+// NOTE(review): loops are do/while; the width-4 and odd-width paths step two
+// rows at a time, so height is presumably a positive multiple of the step --
+// confirm against callers.
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ sse8_sse4_1(a, b, &sum);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+ sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+ sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+ sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default:
+ if (width & 0x07) {
+ // Two rows at a time: 8-wide tiles followed by one trailing 4x2 tile.
+ // This covers exactly `width` columns only when width % 8 == 4.
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ } while (y < height);
+ } else {
+ // Width is a multiple of 8: one row at a time in 8-pixel steps.
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ }
+ sse = summary_all_sse4(&sum);
+ break;
+ }
+
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Adds to the 32-bit lanes of *sum the squared differences of a 4x2 block of
+// 16-bit pixels: two rows of 4 samples from a against two rows from b.
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+// Adds to the four 32-bit lanes of *sum the squared differences between the
+// 8 16-bit pixels loaded from a and from b.
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = xx_loadu_128(a);
+ const __m128i v_b_w = xx_loadu_128(b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+// Returns the sum of squared differences between two width x height blocks of
+// 16-bit pixels. a8 and b8 are converted to uint16_t pointers with
+// CONVERT_TO_SHORTPTR; strides are applied to the converted pointers.
+//
+// Widths 4 and 8 accumulate the whole block in 32-bit lanes. Widths 16 and up
+// (and the default case) accumulate a bounded batch of rows in a temporary
+// 32-bit sum32, then widen it into the two 64-bit lanes of sum with
+// summary_32_sse4() -- presumably to keep the 32-bit lanes from overflowing.
+// Those paths finish by adding the two 64-bit lanes of sum together.
+// NOTE(review): loops are do/while with fixed row steps, so height is
+// presumably a positive multiple of the step used by each path -- confirm
+// against callers.
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ // Batch of at most 64 rows (or the rows remaining) per 32-bit sum.
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 32:
+ do {
+ // Batch of at most 32 rows per 32-bit sum.
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 64:
+ do {
+ // Batch of at most 16 rows per 32-bit sum.
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 128:
+ do {
+ // Batch of at most 8 rows per 32-bit sum.
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ default:
+ if (width & 0x7) {
+ // Two rows at a time: 8-wide tiles followed by one trailing 4x2 tile.
+ // This covers exactly `width` columns only when width % 8 == 4. The
+ // 32-bit sum is widened after every 2-row strip.
+ do {
+ __m128i sum32 = _mm_setzero_si128();
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ summary_32_sse4(&sum32, &sum);
+ } while (y < height);
+ } else {
+ // Width is a multiple of 8: 8-pixel steps, widened after at most 8 rows.
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
new file mode 100644
index 0000000000..49bc655336
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+; In: xmm3 = eight source words (s), xmm4 = eight reference words (r).
+; Accumulators: xmm15 += s and xmm14 += r (saturating word adds),
+; xmm13 += s*s, xmm12 += r*r, xmm11 += s*r (dword adds of pmaddwd pairs).
+; Clobbers: xmm1, xmm2, xmm3.
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1: its four dwords are zero-extended to qwords and
+; added together; the total ends up in the low qword of %1.
+; Requires xmm0 == 0. Clobbers xmm2.
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words: its eight words are
+; zero-extended to dwords, added pairwise, then reduced by SUM_ACROSS_Q.
+; Requires xmm0 == 0. Clobbers xmm1 and xmm2.
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; Walks 16 rows of 16 pixels from s (stride sp) and r (stride rp) and stores
+; the low 32 bits of each of the five accumulated sums through the output
+; pointers. Arguments are read through the arg() macro from
+; x86_abi_support.asm.
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(aom_ssim_parms_16x16_sse2)
+sym(aom_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; sp and rp are C ints: the upper 32 bits of their argument slots are not
+ ; guaranteed by the ABI, so sign-extend the low dword before using them in
+ ; 64-bit pointer arithmetic.
+ mov rsi, arg(0) ;s
+ movsxd rcx, dword arg(1) ;sp
+ mov rdi, arg(2) ;r
+ movsxd rax, dword arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ ; reduce each accumulator to a single total in its low lane
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; Walks 8 rows of 8 pixels from s (stride sp) and r (stride rp) and stores
+; the low 32 bits of each of the five accumulated sums through the output
+; pointers. Arguments are read through the arg() macro from
+; x86_abi_support.asm.
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(aom_ssim_parms_8x8_sse2)
+sym(aom_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; sp and rp are C ints: the upper 32 bits of their argument slots are not
+ ; guaranteed by the ABI, so sign-extend the low dword before using them in
+ ; 64-bit pointer arithmetic.
+ mov rsi, arg(0) ;s
+ movsxd rcx, dword arg(1) ;sp
+ mov rdi, arg(2) ;r
+ movsxd rax, dword arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ ; reduce each accumulator to a single total in its low lane
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..d1d8373456
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1470 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8 ; rounding term (used as filter_rnd), added before the >>4
+bilin_filter_m_sse2: times 8 dw 16 ; word table: entry k holds 8x(16-2k) then 8x(2k), 32 bytes per entry
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8 ; entry 4: both taps are 8, so one "times 16" covers the pair
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0 ; byte-pair table used with pmaddubsw, 16 bytes per entry
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8 ; entry 4: both taps are 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the SE and stores SSE in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse (src1 and src2 are overwritten)
+ psubw %3, %4 ; %3 = src2 - dst2, per word
+ psubw %1, %2 ; %1 = src1 - dst1, per word
+ paddw %5, %3 ; sum += second set of differences (16-bit lanes)
+ pmaddwd %3, %3 ; square the differences and add adjacent pairs into dwords
+ paddw %5, %1 ; sum += first set of differences
+ pmaddwd %1, %1 ; same squaring step for the first set
+ paddd %6, %3 ; sse += squared differences (32-bit lanes)
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1 ; W: reduces m6 (sum) and m7 (sse), stores sse through the sse pointer, returns sum
+%if %1 > 4
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7 ; upper two sse dwords, to be folded onto the lower two
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1 ; bring dword 1 down to lane 0 for the final sse add
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe ; words 2-3 moved down to words 0-1
+ pshuflw m3, m7, 0xe ; dword 1 of sse moved down to dword 0
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0 ; advances srcq by one source row
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp ; memory form of the stride: its register may have been reused as a temporary
+%else
+ add srcq, src_strideq ; stride is still held in its register
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W, avg (optional, defaults to 0)
+%if cpuflag(ssse3) ; ssse3 table entries are 16 bytes, sse2 entries are 32 bytes
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else ; not x86-64
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space: the x_offset/y_offset slots hold the two table addresses
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else ; not x86-64, CONFIG_PIC != 1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, sec, sec_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1 ; the W<16 loops consume two rows per iteration
+%if %2 == 1 ; avg
+ shl sec_str, 1 ; so secq also advances two rows per iteration
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd ; dispatch on x_offset: 0, 4 (half) or other (bilinear)
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd ; then on y_offset in the same way
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0 (no filtering: src is compared to dst directly)
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq] ; rounded byte average with the sec block
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5 ; widen bytes to words using the zero register m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq] ; second row goes into the high half of m0
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1 ; two 4-pixel rows packed side by side
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0 ; second row's words moved into m2
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2] ; two rows were consumed
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop ; loop while block_height is still positive
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4 ; 4 selects the half-pel case
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4 ; vertical half-pel: rounded average of this row and the next
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [dstq]
+%if %1 > 4
+ movlhps m0, m2 ; m0 = rows 0,1 and m2 = rows 1,2, so one pavgb filters both output rows
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
+ pavgb m0, m2 ; output row 0 = avg(row 0, row 1)
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4 ; output row 1 = avg(row 1, row 2)
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift ; y_offset becomes a byte offset into the filter table
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4 ; interleave row and next row so pmaddubsw applies both taps at once
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4 ; >>4: each pair of filter taps sums to 16
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2 ; 4xh: gather both rows into m0 before packing to bytes
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4 ; 4 selects the half-pel case
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1] ; same row, one pixel to the right
+ mova m1, [dstq]
+ pavgb m0, m4 ; horizontal half-pel: rounded average of neighbouring pixels
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m4 ; both rows filtered by a single pavgb
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3 ; m0 = horizontally averaged first row, carried into the loop
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3 ; horizontal average of the new row
+ punpckhbw m3, m1, m5
+ pavgb m0, m4 ; vertical average of previous and new filtered rows
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; keep the new filtered row as "previous" for the next iteration
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2 ; m4 = last filtered row, saved for the next iteration
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; carry the last filtered row forward
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift ; y_offset becomes a byte offset into the filter table
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ;x86_32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3 ; m0 = horizontally averaged first row, carried into the loop
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2 ; horizontal half-pel average of the new row
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; keep the new averaged row (bytes) for the next iteration
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5 ; the sse2 path keeps the carried row as words
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; carry the last averaged row forward
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; x_offset becomes a byte offset into the filter table
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1] ; same row, one pixel to the right
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4 ; interleave pixel and right neighbour so pmaddubsw applies both taps
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4 ; >>4: each pair of filter taps sums to 16
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2] ; two rows were consumed
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4 ; 4 selects the half-pel case
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; x_offset becomes a byte offset into the filter table
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2 ; first filtered row kept as bytes, carried into the loop
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4 ; vertical half-pel: average previous and new filtered rows
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin
+ ; filtered second line as words as cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; keep the new filtered row for the next iteration
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4 ; first filtered row kept as words, carried into the loop
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+ paddw m4, m3
+ movx m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2 ; vertical half-pel on words: avg(previous row, row 1)
+ pavgw m2, m4 ; avg(row 1, row 2)
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; carry the last filtered row forward
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; both offsets become byte offsets into the filter table
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. Used src_stride register. Later,
+; src_stride has to be loaded from stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2 ; first horizontally filtered row kept as bytes, carried into the loop
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2 ; m4 = new horizontally filtered row as bytes
+ punpckhbw m2, m0, m4 ; interleave previous and new rows for the vertical pmaddubsw
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1 ; m4 = new filtered row as bytes; m3/m1 still hold it as words
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; keep the new filtered row for the next iteration
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0 ; the ssse3 path carries the row as bytes, for the punpcklbw below
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq] ; second row of this iteration
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movx m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4 ; carry the last horizontally filtered row forward
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4&&y=0,4) are identical
+; between the ssse3 and non-ssse3 version. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4 ; sub_pixel_variance4xh, sse2 word-multiply filter path
+SUBPEL_VARIANCE 8 ; sub_pixel_variance8xh
+SUBPEL_VARIANCE 16 ; sub_pixel_variance16xh
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4 ; same three widths, pmaddubsw filter path
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1 ; sub_pixel_avg_variance4xh (second argument 1 selects the avg variant)
+SUBPEL_VARIANCE 8, 1 ; sub_pixel_avg_variance8xh
+SUBPEL_VARIANCE 16, 1 ; sub_pixel_avg_variance16xh
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1 ; avg variants, pmaddubsw filter path
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..b4c5cc7c7b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// Writes 32 residuals diff[i] = src[i] - pred[i], widening the 8-bit inputs to
+// 16-bit outputs. The results are written with aligned 256-bit stores, so
+// diff_ptr must be 32-byte aligned; src_ptr and pred_ptr may be unaligned.
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  // Byte pattern {0x01, 0xff} repeated: maddubs multiplies each unsigned src
+  // byte by +1, each unsigned pred byte by -1, and adds the pair.
+  const __m256i plus_minus_one = _mm256_set1_epi32((int)0xff01ff01);
+  const __m256i src = _mm256_lddqu_si256((__m256i *)(src_ptr));
+  const __m256i pred = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+
+  // Interleave src/pred bytes so every 16-bit slot holds one (src, pred) pair.
+  const __m256i pairs_lo = _mm256_unpacklo_epi8(src, pred);
+  const __m256i pairs_hi = _mm256_unpackhi_epi8(src, pred);
+  const __m256i res_lo = _mm256_maddubs_epi16(pairs_lo, plus_minus_one);
+  const __m256i res_hi = _mm256_maddubs_epi16(pairs_hi, plus_minus_one);
+
+  // The unpacks work per 128-bit lane; the lane permutes put the residuals
+  // back into pixel order (0..15, then 16..31).
+  _mm256_store_si256((__m256i *)(diff_ptr),
+                     _mm256_permute2x128_si256(res_lo, res_hi, 0x20));
+  _mm256_store_si256((__m256i *)(diff_ptr + 16),
+                     _mm256_permute2x128_si256(res_lo, res_hi, 0x31));
+}
+
+// Subtracts a 16-pixel-wide block row by row: the 16 src and 16 pred bytes of
+// each row are zero-extended to 16 bits and subtracted. Every diff row is
+// written with an aligned 256-bit store, so it must be 32-byte aligned.
+static INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t left = rows; left > 0; --left) {
+    const __m256i src_w =
+        _mm256_cvtepu8_epi16(_mm_lddqu_si128((__m128i *)(src_ptr)));
+    const __m256i pred_w =
+        _mm256_cvtepu8_epi16(_mm_lddqu_si128((__m128i *)(pred_ptr)));
+    _mm256_store_si256((__m256i *)(diff_ptr),
+                       _mm256_sub_epi16(src_w, pred_w));
+    diff_ptr += diff_stride;
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+  }
+}
+
+// Subtracts a 32-pixel-wide block: one subtract32_avx2 call per row.
+static INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t left = rows; left > 0; --left) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    diff_ptr += diff_stride;
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+  }
+}
+
+// Subtracts a 64-pixel-wide block: each row is covered by two 32-pixel
+// subtract32_avx2 calls.
+static INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t left = rows; left > 0; --left) {
+    for (int x = 0; x < 64; x += 32) {
+      subtract32_avx2(diff_ptr + x, src_ptr + x, pred_ptr + x);
+    }
+    diff_ptr += diff_stride;
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+  }
+}
+
+// Subtracts a 128-pixel-wide block: each row is covered by four 32-pixel
+// subtract32_avx2 calls.
+static INLINE void subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t left = rows; left > 0; --left) {
+    for (int x = 0; x < 128; x += 32) {
+      subtract32_avx2(diff_ptr + x, src_ptr + x, pred_ptr + x);
+    }
+    diff_ptr += diff_stride;
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+  }
+}
+
+// AVX2 entry point for block subtraction (diff = src - pred). Widths of 16,
+// 32, 64 and 128 pixels are handled by the AVX2 row kernels in this file;
+// every other width is forwarded unchanged to aom_subtract_block_sse2.
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  if (cols == 16) {
+    subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                             pred_ptr, pred_stride);
+  } else if (cols == 32) {
+    subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                             pred_ptr, pred_stride);
+  } else if (cols == 64) {
+    subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                             pred_ptr, pred_stride);
+  } else if (cols == 128) {
+    subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
+  } else {
+    aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                            src_stride, pred_ptr, pred_stride);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..fd508c0916
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void aom_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+; SSE2 block subtraction: diff = src - pred, 8-bit inputs widened to 16 bits.
+; Per the x86inc cglobal convention the numbers after the name are
+; (arguments loaded into registers, general-purpose registers used, vector
+; registers used). Seven of the eight arguments get registers; pred_stride is
+; read later through pred_stridemp.
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+; cols is only consulted by the width dispatch below; every path then loads
+; pred_stride into that same register under the name pred_str.
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ ; Dispatch on block width. A width that matches none of the compares falls
+ ; through to the 128-wide loop that follows the loop16 macro definition.
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+ cmp colsd, 64
+ je .case_64
+
+; loop16 src_off0, src_off1, pred_off0, pred_off1, diff_off0, diff_off1
+; Subtracts two vectors of pixels: [srcq+%1]-[predq+%3] is stored as two
+; registers of words at [diffq+%5], and [srcq+%2]-[predq+%4] at [diffq+%6].
+; src is loaded with mova and diff stored with mova (both must be aligned);
+; pred is loaded with movu. Uses m0-m5 as scratch and m7 as the zero register.
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ movu m1, [predq+%3]
+ movu m5, [predq+%4]
+ ; Zero-extend bytes to words by interleaving with the zero register, then
+ ; subtract pred from src, high halves and low halves separately.
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ ; Fall-through path: 128 pixels per row, one row per iteration.
+ mov pred_str, pred_stridemp
+.loop_128:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ ; diff_stride is scaled by 2 in the address, i.e. it is counted in 16-bit
+ ; elements; the src and pred strides are added unscaled.
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ ; NOTE(review): this loop exits on rows == 0 (jnz) while the other paths
+ ; use a signed "jg" test; the two only differ for rows <= 0 on entry.
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+; 64 pixels per row, one row per iteration.
+.case_64:
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+; 32 pixels per row, one row per iteration.
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+; 16 pixels per row. The second loop16 operand set addresses the next row
+; (offsets src_strideq / pred_str / diff_strideq*2), so each iteration handles
+; two rows and the row counter drops by 2.
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+; loop_h: subtract half a register of pixels for two consecutive rows. movh
+; loads mmsize/2 bytes; the bytes are zero-extended against m7 and the word
+; differences are stored for row 0 at [diffq] and row 1 at
+; [diffq+diff_strideq*2]. Uses m0-m3 as scratch.
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+; 8 pixels per row, two rows per iteration.
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+; 4 pixels per row, two rows per iteration, using 64-bit MMX registers so that
+; loop_h's movh/mova move exactly 4 source bytes / 4 result words.
+INIT_MMX
+.case_4:
+  mov             pred_str, pred_stridemp
+  ; INIT_MMX remaps m0-m7 to mm0-mm7. The zero register prepared at function
+  ; entry was xmm7, so mm7 (the m7 that loop_h unpacks against from here on)
+  ; has never been written. Clear it before the loop.
+  pxor                  m7, m7
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  ; Leave MMX state so later x87 code sees an empty register stack.
+  emms
+  RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 0000000000..89b9b824bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Returns the sum of squares of a width x height block of int16 samples.
+// The block is consumed in tiles of 16 columns by 4 rows, so width advances in
+// steps of 16 and height in steps of 4. stride is counted in samples.
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  const __m256i low32_mask = yy_set1_64_from_32i(~0);
+  __m256i acc_q = _mm256_setzero_si256();
+
+  for (int y = 0; y < height; y += 4, src += 4 * stride) {
+    // 32-bit partial sums for the current band of four rows.
+    __m256i band_d = _mm256_setzero_si256();
+    for (int x = 0; x < width; x += 16) {
+      const int16_t *tile = src + x;
+      const __m256i r0 =
+          _mm256_loadu_si256((const __m256i *)(tile + 0 * stride));
+      const __m256i r1 =
+          _mm256_loadu_si256((const __m256i *)(tile + 1 * stride));
+      const __m256i r2 =
+          _mm256_loadu_si256((const __m256i *)(tile + 2 * stride));
+      const __m256i r3 =
+          _mm256_loadu_si256((const __m256i *)(tile + 3 * stride));
+
+      // madd squares each sample and adds horizontally adjacent pairs.
+      const __m256i sq01 = _mm256_add_epi32(_mm256_madd_epi16(r0, r0),
+                                            _mm256_madd_epi16(r1, r1));
+      const __m256i sq23 = _mm256_add_epi32(_mm256_madd_epi16(r2, r2),
+                                            _mm256_madd_epi16(r3, r3));
+      band_d = _mm256_add_epi32(band_d, _mm256_add_epi32(sq01, sq23));
+    }
+    // Widen the band to 64 bits: even 32-bit lanes via the mask, odd lanes via
+    // the shift, both zero-extended.
+    acc_q = _mm256_add_epi64(acc_q, _mm256_and_si256(band_d, low32_mask));
+    acc_q = _mm256_add_epi64(acc_q, _mm256_srli_epi64(band_d, 32));
+  }
+
+  // Fold the four 64-bit lanes into one.
+  __m128i total_q = _mm_add_epi64(_mm256_castsi256_si128(acc_q),
+                                  _mm256_extracti128_si256(acc_q, 1));
+  total_q = _mm_add_epi64(total_q, _mm_unpackhi_epi64(total_q, total_q));
+
+  uint64_t total;
+  xx_storel_64(&total, total_q);
+  return total;
+}
+
+// AVX2 dispatcher for the 2-D sum of squares. Width 4 and width 8 blocks go to
+// the SSE2 kernels, widths that are a multiple of 16 go to the AVX2 kernel
+// (all of these need a height that is a multiple of 4), and anything else is
+// handled by the C implementation.
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  }
+  if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  }
+  if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  }
+  if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+  }
+  return aom_sum_squares_2d_i16_c(src, stride, width, height);
+}
+
+// Returns the sum of squares of a width x height block of int16 samples and
+// adds the plain sum of the samples to *sum (it accumulates; it does not
+// overwrite). The block is consumed in tiles of 16 columns by 4 rows. stride
+// is counted in samples.
+static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                            int width, int height, int *sum) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i ones_w = _mm256_set1_epi16(1);
+  __m256i sse_q = zero;
+  __m256i sum_d = zero;
+
+  for (int y = 0; y < height; y += 4, src += 4 * stride) {
+    __m256i band_sse_d = zero;
+    for (int x = 0; x < width; x += 16) {
+      const int16_t *tile = src + x;
+      const __m256i r0 =
+          _mm256_loadu_si256((const __m256i *)(tile + 0 * stride));
+      const __m256i r1 =
+          _mm256_loadu_si256((const __m256i *)(tile + 1 * stride));
+      const __m256i r2 =
+          _mm256_loadu_si256((const __m256i *)(tile + 2 * stride));
+      const __m256i r3 =
+          _mm256_loadu_si256((const __m256i *)(tile + 3 * stride));
+
+      // Plain sum: add the four rows in 16 bits, then let madd against ones
+      // widen adjacent pairs to 32 bits.
+      const __m256i rows_w = _mm256_add_epi16(_mm256_add_epi16(r0, r1),
+                                              _mm256_add_epi16(r2, r3));
+      sum_d = _mm256_add_epi32(sum_d, _mm256_madd_epi16(rows_w, ones_w));
+
+      // Squares: madd of a row with itself yields 32-bit pair sums.
+      const __m256i sq01 = _mm256_add_epi32(_mm256_madd_epi16(r0, r0),
+                                            _mm256_madd_epi16(r1, r1));
+      const __m256i sq23 = _mm256_add_epi32(_mm256_madd_epi16(r2, r2),
+                                            _mm256_madd_epi16(r3, r3));
+      band_sse_d =
+          _mm256_add_epi32(band_sse_d, _mm256_add_epi32(sq01, sq23));
+    }
+    // Zero-extend the band's 32-bit lanes to 64 bits before accumulating.
+    const __m256i band_lo_q = _mm256_unpacklo_epi32(band_sse_d, zero);
+    const __m256i band_hi_q = _mm256_unpackhi_epi32(band_sse_d, zero);
+    sse_q = _mm256_add_epi64(sse_q, _mm256_add_epi64(band_lo_q, band_hi_q));
+  }
+
+  // Horizontal reduction of the eight 32-bit sum lanes.
+  __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum_d, 1),
+                                 _mm256_castsi256_si128(sum_d));
+  sum128 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 8));
+  sum128 = _mm_add_epi32(sum128, _mm_srli_si128(sum128, 4));
+  *sum += _mm_cvtsi128_si32(sum128);
+
+  // Horizontal reduction of the four 64-bit SSE lanes.
+  __m128i sse128 = _mm_add_epi64(_mm256_castsi256_si128(sse_q),
+                                 _mm256_extracti128_si256(sse_q, 1));
+  sse128 = _mm_add_epi64(sse128, _mm_unpackhi_epi64(sse128, sse128));
+
+  uint64_t sse;
+  xx_storel_64(&sse, sse128);
+  return sse;
+}
+
+// AVX2 dispatcher for the combined sum / sum-of-squares kernel. Width 4 and
+// width 8 blocks go to the SSE2 kernels, widths that are a multiple of 16 go
+// to the AVX2 kernel (all of these need a height that is a multiple of 4), and
+// anything else is handled by the C implementation.
+uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  }
+  if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  }
+  if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  }
+  if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
+  }
+  return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+}
+
+// Horizontally adds the sixteen 16-bit lanes of vec_a. All additions are done
+// in 16 bits, so the total wraps modulo 2^16; the low lane is then extracted.
+static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
+  // 16 lanes -> 8
+  __m128i acc = _mm_add_epi16(_mm256_extracti128_si256(vec_a, 1),
+                              _mm256_castsi256_si128(vec_a));
+  // 8 -> 4 -> 2 -> 1, each step adding the upper half onto the lower half.
+  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 8));
+  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 4));
+  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 2));
+  return _mm_extract_epi16(acc, 0);
+}
+
+// Horizontally adds the eight 32-bit lanes of vec_a and returns the low lane
+// of the result (additions wrap modulo 2^32).
+static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
+  // 8 lanes -> 4
+  __m128i acc = _mm_add_epi32(_mm256_extracti128_si256(vec_a, 1),
+                              _mm256_castsi256_si128(vec_a));
+  // 4 -> 2 -> 1
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
+  return _mm_cvtsi128_si32(acc);
+}
+
+// Returns sum(x^2) - sum(x)^2 / (width * height) over a width x height block
+// of 8-bit samples. Columns are processed in strips of 32 with AVX2; columns
+// left over after the last full strip are handled by scalar code.
+uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
+                            int height) {
+  const __m256i zero = _mm256_setzero_si256();
+  uint64_t sum = 0, sum_sq = 0;
+  int x;
+
+  // One strip of 32 columns per iteration.
+  for (x = 0; x < width - 31; x += 32) {
+    uint8_t *row = src + x;
+    __m256i sum_w = zero;  // 16-bit lane sums of the samples
+    __m256i sq_d = zero;   // 32-bit lane sums of the squared samples
+    int y;
+
+    // Groups of 8 rows; the vector accumulators are flushed to the 64-bit
+    // totals after every group.
+    for (y = 0; y < height - 7; y += 8) {
+      for (int k = 0; k < 8; k++, row += src_stride) {
+        const __m256i px = _mm256_loadu_si256((__m256i *)row);
+        const __m256i lo_w = _mm256_unpacklo_epi8(px, zero);
+        const __m256i hi_w = _mm256_unpackhi_epi8(px, zero);
+        sum_w = _mm256_add_epi16(_mm256_add_epi16(sum_w, lo_w), hi_w);
+        sq_d = _mm256_add_epi32(
+            _mm256_add_epi32(sq_d, _mm256_madd_epi16(lo_w, lo_w)),
+            _mm256_madd_epi16(hi_w, hi_w));
+      }
+      sum += mm256_accumulate_epi16(sum_w);
+      sum_sq += mm256_accumulate_epi32(sq_d);
+      sum_w = zero;
+      sq_d = zero;
+    }
+
+    // Rows left over when height is not a multiple of 8.
+    for (; y < height; y++, row += src_stride) {
+      const __m256i px = _mm256_loadu_si256((__m256i *)row);
+      const __m256i lo_w = _mm256_unpacklo_epi8(px, zero);
+      const __m256i hi_w = _mm256_unpackhi_epi8(px, zero);
+      sum_w = _mm256_add_epi16(_mm256_add_epi16(sum_w, lo_w), hi_w);
+      sq_d = _mm256_add_epi32(
+          _mm256_add_epi32(sq_d, _mm256_madd_epi16(lo_w, lo_w)),
+          _mm256_madd_epi16(hi_w, hi_w));
+    }
+    sum += mm256_accumulate_epi16(sum_w);
+    sum_sq += mm256_accumulate_epi32(sq_d);
+  }
+
+  // Scalar pass over the columns [x, width) not covered by a full strip.
+  const uint8_t *tail_row = src;
+  for (int r = 0; r < height; r++, tail_row += src_stride) {
+    for (int c = x; c < width; c++) {
+      const uint8_t val = tail_row[c];
+      sum += val;
+      sum_sq += val * val;
+    }
+  }
+  return (sum_sq - sum * sum / (width * height));
+}
+
+// Returns sum(x^2) - sum(x)^2 / (width * height) over a width x height block
+// of 16-bit samples. src is converted with CONVERT_TO_SHORTPTR and src_stride
+// is counted in 16-bit samples. Columns are processed in strips of 16 with
+// AVX2; columns left over after the last full strip are handled by scalar
+// code.
+// NOTE(review): the squares come from _mm256_madd_epi16, which treats lanes
+// as signed, so samples are presumably expected to stay below 2^15 -- confirm
+// against callers.
+uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
+                             int height) {
+  uint16_t *const base = CONVERT_TO_SHORTPTR(src);
+  const __m256i zero = _mm256_setzero_si256();
+  uint64_t sum = 0, sum_sq = 0;
+  int x;
+
+  // One strip of 16 columns per iteration.
+  for (x = 0; x < width - 15; x += 16) {
+    uint16_t *row = base + x;
+    __m256i sum_d = zero;  // 32-bit lane sums of the samples
+    __m256i sq_d = zero;   // 32-bit lane sums of the squared samples
+    int y;
+
+    // Groups of 8 rows; the vector accumulators are flushed to the 64-bit
+    // totals after every group. The bound is height - 7 (matching the 8-bit
+    // kernel) so that a final full group of 8 rows is taken here instead of
+    // falling through to the row-at-a-time loop below.
+    for (y = 0; y < height - 7; y += 8) {
+      for (int k = 0; k < 8; k++, row += src_stride) {
+        const __m256i px = _mm256_loadu_si256((__m256i *)row);
+        // Zero-extend to 32 bits for the plain sum.
+        const __m256i lo_d = _mm256_unpacklo_epi16(px, zero);
+        const __m256i hi_d = _mm256_unpackhi_epi16(px, zero);
+        sum_d = _mm256_add_epi32(lo_d, sum_d);
+        sum_d = _mm256_add_epi32(hi_d, sum_d);
+        sq_d = _mm256_add_epi32(sq_d, _mm256_madd_epi16(px, px));
+      }
+      sum += mm256_accumulate_epi32(sum_d);
+      sum_sq += mm256_accumulate_epi32(sq_d);
+      sum_d = zero;
+      sq_d = zero;
+    }
+
+    // Rows left over when height is not a multiple of 8.
+    for (; y < height; y++, row += src_stride) {
+      const __m256i px = _mm256_loadu_si256((__m256i *)row);
+      const __m256i lo_d = _mm256_unpacklo_epi16(px, zero);
+      const __m256i hi_d = _mm256_unpackhi_epi16(px, zero);
+      sum_d = _mm256_add_epi32(lo_d, sum_d);
+      sum_d = _mm256_add_epi32(hi_d, sum_d);
+      sq_d = _mm256_add_epi32(sq_d, _mm256_madd_epi16(px, px));
+    }
+    sum += mm256_accumulate_epi32(sum_d);
+    sum_sq += mm256_accumulate_epi32(sq_d);
+  }
+
+  // Scalar pass over the columns [x, width) not covered by a full strip.
+  const uint16_t *tail_row = base;
+  for (int r = 0; r < height; r++, tail_row += src_stride) {
+    for (int c = x; c < width; c++) {
+      const uint16_t val = tail_row[c];
+      sum += val;
+      // Square in 64 bits: val * val would be an int product, which
+      // overflows for samples above 46340.
+      sum_sq += (uint64_t)val * val;
+    }
+  }
+  return (sum_sq - sum * sum / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000000..cf3ed98974
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Returns a with its upper 64 bits replaced by the 64 bits at b; the lower
+// 64 bits of a are kept. Goes through the double-precision domain because
+// that is where the load-high intrinsic lives.
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d merged = _mm_loadh_pd(_mm_castsi128_pd(a), (double *)b);
+  return _mm_castpd_si128(merged);
+}
+
+// Returns the low 64 bits of a. Uses the direct register move when
+// AOM_ARCH_X86_64 is set and a store to memory otherwise.
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if AOM_ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  uint64_t low_q;
+  _mm_storel_epi64((__m128i *)&low_q, a);
+  return low_q;
+#endif
+}
+
+// Returns four 32-bit partial sums whose total is the sum of squares of the
+// 4x4 block of int16 samples at src. stride is counted in samples.
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+  // Pack two 4-sample rows per register: even row in the low half, the
+  // following row in the high half.
+  const __m128i rows01 =
+      xx_loadh_64(xx_loadl_64(src + 0 * stride), src + 1 * stride);
+  const __m128i rows23 =
+      xx_loadh_64(xx_loadl_64(src + 2 * stride), src + 3 * stride);
+
+  // madd squares each sample and adds horizontally adjacent pairs.
+  return _mm_add_epi32(_mm_madd_epi16(rows01, rows01),
+                       _mm_madd_epi16(rows23, rows23));
+}
+
+// Returns the sum of squares of the 4x4 block of int16 samples at src. The
+// horizontal reduction is done in 32 bits.
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i partial_d = sum_squares_i16_4x4_sse2(src, stride);
+  // Lane 0 += lane 1 and lane 2 += lane 3, then lane 0 += lane 2.
+  const __m128i pairs_d =
+      _mm_add_epi32(partial_d, _mm_srli_epi64(partial_d, 32));
+  const __m128i total_d = _mm_add_epi32(pairs_d, _mm_srli_si128(pairs_d, 8));
+  return (uint64_t)_mm_cvtsi128_si32(total_d);
+}
+
+// Returns the sum of squares of the 4x4 block of int16 samples at src and adds
+// the plain sum of the 16 samples to *sum.
+//
+// *sum is accumulated into rather than overwritten, which is what the nxn and
+// 4xn kernels in this file do. The dispatchers route 4x4 blocks straight here,
+// so overwriting would make their contract for sum depend on the block size.
+// Callers that start from *sum == 0 see no difference.
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+  const __m128i one_reg = _mm_set1_epi16(1);
+  // Pack two 4-sample rows per register (low half | high half).
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+  // Plain sum: add rows in 16 bits, widen adjacent pairs to 32 bits with madd
+  // against ones, then reduce the four lanes.
+  __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
+  v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+  *sum += _mm_cvtsi128_si32(v_sum_0123_d);
+
+  // Sum of squares, reduced the same way.
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+  __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+  return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
+// Returns the sum of squares of a 4-wide, height-tall block of int16 samples,
+// processed as stacked 4x4 blocks. At least one 4x4 block is always read. The
+// per-block partial sums are accumulated in 32-bit lanes and only widened to
+// 64 bits for the final reduction.
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height) {
+  __m128i acc_d = _mm_setzero_si128();
+  int rows_done = 0;
+  do {
+    acc_d = _mm_add_epi32(acc_d, sum_squares_i16_4x4_sse2(src, stride));
+    src += stride << 2;
+    rows_done += 4;
+  } while (rows_done < height);
+
+  // Zero-extend odd and even 32-bit lanes to 64 bits and add them.
+  const __m128i low32_mask = xx_set1_64_from_32i(~0);
+  const __m128i odd_q = _mm_srli_epi64(acc_d, 32);
+  const __m128i even_q = _mm_and_si128(acc_d, low32_mask);
+  __m128i total_q = _mm_add_epi64(odd_q, even_q);
+  total_q = _mm_add_epi64(total_q, _mm_srli_si128(total_q, 8));
+  return xx_cvtsi128_si64(total_q);
+}
+
+// Returns the sum of squares of a 4-wide, height-tall block of int16 samples
+// and adds the plain sum of the samples to *sum. The block is processed as
+// stacked 4x4 blocks; at least one 4x4 block is always read.
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum) {
+  uint64_t total_sse = 0;
+  int rows_done = 0;
+  do {
+    // Each 4x4 call reports into a zeroed scratch value that is then added to
+    // the caller's running sum.
+    int block_sum = 0;
+    total_sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &block_sum);
+    *sum += block_sum;
+    src += stride << 2;
+    rows_done += 4;
+  } while (rows_done < height);
+  return total_sse;
+}
+
+// Returns the sum of squares of a width x height block of int16 samples,
+// consumed in tiles of 8 columns by 4 rows (at least one tile is always
+// read). Rows are loaded with xx_load_128. stride is counted in samples.
+#ifdef __GNUC__
+// Kept out of line on GCC/Clang: inlining this into
+// aom_sum_squares_2d_i16_sse2 would add stack maintenance instructions to that
+// function's common 4x4 path.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                                int height) {
+  const __m128i low32_mask = xx_set1_64_from_32i(~0);
+  __m128i acc_q = _mm_setzero_si128();
+  int y = 0;
+
+  do {
+    // 32-bit partial sums for the current band of four rows.
+    __m128i band_d = _mm_setzero_si128();
+    int x = 0;
+    do {
+      const int16_t *tile = src + x;
+      const __m128i r0 = xx_load_128(tile + 0 * stride);
+      const __m128i r1 = xx_load_128(tile + 1 * stride);
+      const __m128i r2 = xx_load_128(tile + 2 * stride);
+      const __m128i r3 = xx_load_128(tile + 3 * stride);
+
+      const __m128i sq01 =
+          _mm_add_epi32(_mm_madd_epi16(r0, r0), _mm_madd_epi16(r1, r1));
+      const __m128i sq23 =
+          _mm_add_epi32(_mm_madd_epi16(r2, r2), _mm_madd_epi16(r3, r3));
+      band_d = _mm_add_epi32(band_d, _mm_add_epi32(sq01, sq23));
+      x += 8;
+    } while (x < width);
+
+    // Widen the band to 64 bits: even lanes via the mask, odd lanes via the
+    // shift.
+    acc_q = _mm_add_epi64(acc_q, _mm_and_si128(band_d, low32_mask));
+    acc_q = _mm_add_epi64(acc_q, _mm_srli_epi64(band_d, 32));
+
+    src += 4 * stride;
+    y += 4;
+  } while (y < height);
+
+  acc_q = _mm_add_epi64(acc_q, _mm_srli_si128(acc_q, 8));
+  return xx_cvtsi128_si64(acc_q);
+}
+
+// Returns the sum of squares of a width x height block of int16 samples and
+// adds the plain sum of the samples to *sum (it accumulates; it does not
+// overwrite). The block is consumed in tiles of 8 columns by 4 rows (at least
+// one tile is always read). Rows are loaded with xx_load_128.
+#ifdef __GNUC__
+// Kept out of line on GCC/Clang: inlining this into aom_sum_sse_2d_i16_sse2
+// would add stack maintenance instructions to that function's common 4x4
+// path.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+                            int height, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i ones_w = _mm_set1_epi16(1);
+  __m128i sse_q = zero;
+  __m128i sum_d = zero;
+  int y = 0;
+
+  do {
+    __m128i band_sse_d = zero;
+    int x = 0;
+    do {
+      const int16_t *tile = src + x;
+      const __m128i r0 = xx_load_128(tile + 0 * stride);
+      const __m128i r1 = xx_load_128(tile + 1 * stride);
+      const __m128i r2 = xx_load_128(tile + 2 * stride);
+      const __m128i r3 = xx_load_128(tile + 3 * stride);
+
+      // Squares: madd of a row with itself yields 32-bit pair sums.
+      const __m128i sq01 =
+          _mm_add_epi32(_mm_madd_epi16(r0, r0), _mm_madd_epi16(r1, r1));
+      const __m128i sq23 =
+          _mm_add_epi32(_mm_madd_epi16(r2, r2), _mm_madd_epi16(r3, r3));
+      band_sse_d = _mm_add_epi32(band_sse_d, _mm_add_epi32(sq01, sq23));
+
+      // Plain sum: add the four rows in 16 bits, then widen adjacent pairs to
+      // 32 bits with madd against ones.
+      const __m128i rows_w =
+          _mm_add_epi16(_mm_add_epi16(r0, r1), _mm_add_epi16(r2, r3));
+      sum_d = _mm_add_epi32(sum_d, _mm_madd_epi16(rows_w, ones_w));
+
+      x += 8;
+    } while (x < width);
+
+    // Zero-extend the band's 32-bit lanes to 64 bits before accumulating.
+    const __m128i band_lo_q = _mm_unpacklo_epi32(band_sse_d, zero);
+    const __m128i band_hi_q = _mm_unpackhi_epi32(band_sse_d, zero);
+    sse_q = _mm_add_epi64(sse_q, _mm_add_epi64(band_lo_q, band_hi_q));
+
+    src += 4 * stride;
+    y += 4;
+  } while (y < height);
+
+  sum_d = _mm_add_epi32(sum_d, _mm_srli_si128(sum_d, 8));
+  sum_d = _mm_add_epi32(sum_d, _mm_srli_si128(sum_d, 4));
+  *sum += _mm_cvtsi128_si32(sum_d);
+
+  sse_q = _mm_add_epi64(sse_q, _mm_srli_si128(sse_q, 8));
+
+  uint64_t sse;
+  xx_storel_64(&sse, sse_q);
+  return sse;
+}
+
+// SSE2 dispatcher for the 2-D sum of squares of int16 samples.
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
+                                     int height) {
+  // A 4-sample row fills only half an XMM register, so width 4 needs its own
+  // kernels. Over 75% of all calls are with size == 4, which makes this the
+  // common case as well.
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  }
+  if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  }
+  if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case: width a multiple of 8, height a multiple of 4.
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  }
+  // Any other shape is handled by the C implementation.
+  return aom_sum_squares_2d_i16_c(src, stride, width, height);
+}
+
+// SSE2 dispatcher for the combined sum / sum-of-squares kernel on int16
+// samples. Width 4 has dedicated kernels, widths that are a multiple of 8
+// (with a height that is a multiple of 4) use the generic SSE2 kernel, and any
+// other shape is handled by the C implementation.
+uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
+                                 int height, int *sum) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+  }
+  if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+  }
+  if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case
+    return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+  }
+  return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+// Returns the sum of squares of n int16 samples, where n must be a multiple
+// of 64 (asserted). Samples are read with xx_load_128 in chunks of 64.
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+  const __m128i low32_mask = xx_set1_64_from_32i(~0);
+  const int16_t *const end = src + n;
+  __m128i acc_even_q = _mm_setzero_si128();
+  __m128i acc_odd_q = _mm_setzero_si128();
+
+  assert(n % 64 == 0);
+
+  for (; src < end; src += 64) {
+    // 32-bit partial sums for this chunk: eight registers of eight samples,
+    // each squared and pair-added by madd.
+    __m128i chunk_d = _mm_setzero_si128();
+    for (int k = 0; k < 64; k += 8) {
+      const __m128i v = xx_load_128(src + k);
+      chunk_d = _mm_add_epi32(chunk_d, _mm_madd_epi16(v, v));
+    }
+
+    // Widen to 64 bits, keeping even and odd 32-bit lanes in separate
+    // accumulators.
+    acc_even_q =
+        _mm_add_epi64(acc_even_q, _mm_and_si128(chunk_d, low32_mask));
+    acc_odd_q = _mm_add_epi64(acc_odd_q, _mm_srli_epi64(chunk_d, 32));
+  }
+
+  __m128i total_q = _mm_add_epi64(acc_even_q, acc_odd_q);
+  total_q = _mm_add_epi64(total_q, _mm_srli_si128(total_q, 8));
+  return xx_cvtsi128_si64(total_q);
+}
+
+// Returns the sum of squares of n int16 samples. The largest multiple-of-64
+// prefix is handled by the SSE2 kernel and the remainder by the C
+// implementation; inputs shorter than 64 samples go entirely to C.
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+  const uint32_t simd_len = n & ~63u;
+
+  if (simd_len == n) {
+    // n is a multiple of 64 (including 0).
+    return aom_sum_squares_i16_64n_sse2(src, n);
+  }
+  if (n <= 64) {
+    return aom_sum_squares_i16_c(src, n);
+  }
+  return aom_sum_squares_i16_64n_sse2(src, simd_len) +
+         aom_sum_squares_i16_c(src + simd_len, n - simd_len);
+}
+
+// Horizontal add of the eight 16-bit lanes of vec_a. The additions wrap at
+// 16 bits; the low lane of the folded vector is returned zero-extended by
+// _mm_extract_epi16.
+static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) {
+  // Fold 8 lanes -> 4 -> 2 -> 1 by adding the upper half onto the lower half.
+  const __m128i fold4 = _mm_add_epi16(vec_a, _mm_srli_si128(vec_a, 8));
+  const __m128i fold2 = _mm_add_epi16(fold4, _mm_srli_si128(fold4, 4));
+  const __m128i fold1 = _mm_add_epi16(fold2, _mm_srli_si128(fold2, 2));
+  return _mm_extract_epi16(fold1, 0);
+}
+
+// Horizontal add of the four 32-bit lanes of vec_a (wrapping at 32 bits).
+static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) {
+  // Fold 4 lanes -> 2 -> 1 by adding the upper half onto the lower half.
+  const __m128i fold2 = _mm_add_epi32(vec_a, _mm_srli_si128(vec_a, 8));
+  const __m128i fold1 = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
+  return _mm_cvtsi128_si32(fold1);
+}
+
+// Returns sum(x^2) - (sum(x))^2 / (width * height) over a width x height
+// block of 8-bit samples; src_stride is the distance between rows in bytes.
+// Columns are handled 16 at a time with SSE2; columns that do not fill a
+// 16-wide strip are handled by the scalar loop at the end.
+uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width,
+                            int height) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_w = zero;  // eight 16-bit partial sums
+  __m128i sqs_d = zero;  // four 32-bit partial sums of squares
+  uint64_t s = 0, ss = 0;
+  uint8_t *row;
+  int col, r;
+
+  // Walk the block in vertical strips of 16 columns.
+  for (col = 0; col < width - 15; col += 16) {
+    row = src + col;
+
+    // Eight rows per batch. A 16-bit lane receives at most 16 samples
+    // (8 rows x 2 halves) before being flushed, i.e. at most 16 * 255.
+    for (r = 0; r < height - 7; r += 8) {
+      for (int k = 0; k < 8; ++k) {
+        const __m128i px = _mm_loadu_si128((__m128i *)row);
+        // Zero-extend the 16 bytes to two vectors of 16-bit values.
+        const __m128i lo_w = _mm_unpacklo_epi8(px, zero);
+        const __m128i hi_w = _mm_unpackhi_epi8(px, zero);
+        sum_w = _mm_add_epi16(_mm_add_epi16(sum_w, lo_w), hi_w);
+        sqs_d = _mm_add_epi32(sqs_d, _mm_madd_epi16(lo_w, lo_w));
+        sqs_d = _mm_add_epi32(sqs_d, _mm_madd_epi16(hi_w, hi_w));
+        row += src_stride;
+      }
+
+      // Flush the vector accumulators into the 64-bit totals.
+      s += mm_accumulate_epi16(sum_w);
+      ss += mm_accumulate_epi32(sqs_d);
+      sum_w = zero;
+      sqs_d = zero;
+    }
+
+    // Leftover rows (height not a multiple of 8), one row at a time.
+    for (; r < height; ++r) {
+      const __m128i px = _mm_loadu_si128((__m128i *)row);
+      const __m128i lo_w = _mm_unpacklo_epi8(px, zero);
+      const __m128i hi_w = _mm_unpackhi_epi8(px, zero);
+      sum_w = _mm_add_epi16(_mm_add_epi16(sum_w, lo_w), hi_w);
+      sqs_d = _mm_add_epi32(sqs_d, _mm_madd_epi16(lo_w, lo_w));
+      sqs_d = _mm_add_epi32(sqs_d, _mm_madd_epi16(hi_w, hi_w));
+      row += src_stride;
+    }
+
+    s += mm_accumulate_epi16(sum_w);
+    ss += mm_accumulate_epi32(sqs_d);
+    sum_w = zero;
+    sqs_d = zero;
+  }
+
+  // Scalar pass over the columns [col, width) not covered by a full strip.
+  row = src;
+  for (int y = 0; y < height; ++y, row += src_stride) {
+    for (int x = col; x < width; ++x) {
+      const uint8_t val = row[x];
+      s += val;
+      ss += val * val;
+    }
+  }
+  return ss - s * s / (width * height);
+}
+
+// Returns sum(x^2) - (sum(x))^2 / (width * height) over a width x height
+// block of 16-bit samples. src is converted with CONVERT_TO_SHORTPTR and
+// src_stride is applied to the resulting uint16_t pointer, i.e. it counts
+// 16-bit elements. Columns are handled 8 at a time with SSE2; columns that do
+// not fill an 8-wide strip are handled by the scalar loop at the end.
+//
+// NOTE(review): _mm_madd_epi16 interprets lanes as signed 16-bit and the
+// squares are accumulated in 32-bit lanes over up to 8 rows, so the vector
+// path presumably relies on samples being at most 12 bits wide -- confirm
+// against callers.
+uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width,
+                             int height) {
+  uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+  uint64_t s = 0, ss = 0;
+  __m128i vzero = _mm_setzero_si128();
+  __m128i v_acc_sum = vzero;
+  __m128i v_acc_sqs = vzero;
+  int i, j;
+
+  // Process 8 elements in a row. The bound is width - 7 (not width - 8) so
+  // that a final, exactly-full strip of 8 columns is still vectorized.
+  for (i = 0; i < width - 7; i += 8) {
+    srcp = srcp1 + i;
+    // Process 8 rows at a time. The bound is height - 7 so that a final,
+    // exactly-full batch of 8 rows is taken here, matching the 8-bit version.
+    for (j = 0; j < height - 7; j += 8) {
+      __m128i vsrc[8];
+      for (int k = 0; k < 8; k++) {
+        vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
+        srcp += src_stride;
+      }
+      for (int k = 0; k < 8; k++) {
+        // Zero-extend to 32 bits for the plain sum.
+        __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero);
+        __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero);
+        v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+        v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+        // madd squares each lane and adds adjacent pairs into 32-bit lanes.
+        __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]);
+        v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+      }
+
+      // Update total sum and clear the vectors
+      s += mm_accumulate_epi32(v_acc_sum);
+      ss += mm_accumulate_epi32(v_acc_sqs);
+      v_acc_sum = vzero;
+      v_acc_sqs = vzero;
+    }
+
+    // Process remaining rows (height not a multiple of 8)
+    for (; j < height; j++) {
+      __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
+      __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero);
+      __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero);
+      v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+      v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+      __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc);
+      v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+      srcp += src_stride;
+    }
+
+    // Update total sum and clear the vectors
+    s += mm_accumulate_epi32(v_acc_sum);
+    ss += mm_accumulate_epi32(v_acc_sqs);
+    v_acc_sum = vzero;
+    v_acc_sqs = vzero;
+  }
+
+  // Process the remaining area (columns [i, width)) using C
+  srcp = srcp1;
+  for (int k = 0; k < height; k++) {
+    for (int m = i; m < width; m++) {
+      uint16_t val = srcp[m];
+      s += val;
+      // Widen before multiplying: val * val is an int multiply after integer
+      // promotion and would overflow for large 16-bit values.
+      ss += (uint64_t)val * val;
+    }
+    srcp += src_stride;
+  }
+  return (ss - s * s / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 0000000000..5ed3f2c7bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+ int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum);
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+ int *sum);
+uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height, int *sum);
+
+#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
new file mode 100644
index 0000000000..6744ec51d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+// Reads sizeof(int) bytes from a (via memcpy, so no alignment is required)
+// and places them in the low 32 bits of the result; the upper bits are zero.
+static INLINE __m128i xx_loadl_32(const void *a) {
+  int word;
+  memcpy(&word, a, sizeof(word));
+  const __m128i v = _mm_cvtsi32_si128(word);
+  return v;
+}
+
+// Loads 64 bits from a into the low half of the result via _mm_loadl_epi64.
+static INLINE __m128i xx_loadl_64(const void *a) {
+  const __m128i *const p = (const __m128i *)a;
+  return _mm_loadl_epi64(p);
+}
+
+// 128-bit load through _mm_load_si128 (the aligned-load intrinsic).
+static INLINE __m128i xx_load_128(const void *a) {
+  const __m128i *const p = (const __m128i *)a;
+  return _mm_load_si128(p);
+}
+
+// 128-bit load through _mm_loadu_si128 (the unaligned-load intrinsic).
+static INLINE __m128i xx_loadu_128(const void *a) {
+  const __m128i *const p = (const __m128i *)a;
+  return _mm_loadu_si128(p);
+}
+
+// Writes the low 32 bits of v to a. The copy goes through memcpy, so a does
+// not need any particular alignment.
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+  const int low_word = _mm_cvtsi128_si32(v);
+  memcpy(a, &low_word, sizeof(low_word));
+}
+
+// Writes the low 64 bits of v to a via _mm_storel_epi64.
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+  __m128i *const dst = (__m128i *)a;
+  _mm_storel_epi64(dst, v);
+}
+
+// 128-bit store through _mm_store_si128 (the aligned-store intrinsic).
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+  __m128i *const dst = (__m128i *)a;
+  _mm_store_si128(dst, v);
+}
+
+// 128-bit store through _mm_storeu_si128 (the unaligned-store intrinsic).
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+  __m128i *const dst = (__m128i *)a;
+  _mm_storeu_si128(dst, v);
+}
+
+// Builds a vector of two 64-bit lanes {e0 (low), e1 (high)}, each holding the
+// corresponding 32-bit argument zero-extended to 64 bits.
+//
+// _mm_set_epi64x() is undefined for some Visual Studio compilers, so for
+// MSVC older than version 1900 the same bit pattern is assembled from 32-bit
+// pieces instead.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if !defined(_MSC_VER) || _MSC_VER >= 1900
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#else
+  return _mm_set_epi32(0, e1, 0, e0);
+#endif
+}
+
+// Broadcasts a, zero-extended to 64 bits, into both 64-bit lanes.
+//
+// _mm_set1_epi64x() is undefined for some Visual Studio compilers, so for
+// MSVC older than version 1900 the same bit pattern is assembled from 32-bit
+// pieces instead.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if !defined(_MSC_VER) || _MSC_VER >= 1900
+  return _mm_set1_epi64x((uint32_t)a);
+#else
+  return _mm_set_epi32(0, a, 0, a);
+#endif
+}
+
+// Fills the eight 16-bit channels with the alternating pattern
+// {a, b, a, b, a, b, a, b}, listed in memory order (a lands in channel 0).
+//
+// Handy for laying out filter taps the way _mm_madd_epi16 consumes them.
+static INLINE __m128i xx_set2_epi16(int16_t a, int16_t b) {
+  // _mm_set_epi16 takes its arguments from the highest channel down, so b
+  // comes first here to leave a in channel 0.
+  return _mm_set_epi16(b, a, b, a, b, a, b, a);
+}
+
+// Per unsigned 16-bit lane: (v + 1) >> 1, computed as the rounding average
+// of v and zero.
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+  const __m128i v_zero = _mm_setzero_si128();
+  return _mm_avg_epu16(v_val_w, v_zero);
+}
+
+// Per unsigned 16-bit lane: shifts right by (bits - 1), then takes the
+// rounding average with zero, i.e. ((v >> (bits - 1)) + 1) >> 1.
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_zero = _mm_setzero_si128();
+  const __m128i v_pre_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_pre_w, v_zero);
+}
+
+// Per 32-bit lane: adds the rounding bias (1 << bits) >> 1 and then performs
+// a logical right shift by bits.
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+  const __m128i v_half_d = _mm_set1_epi32((1 << bits) >> 1);
+  return _mm_srli_epi32(_mm_add_epi32(v_val_d, v_half_d), bits);
+}
+
+// Per 16-bit lane: adds the rounding bias (1 << bits) >> 1 and then performs
+// an arithmetic right shift by bits.
+static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_half_w = _mm_set1_epi16((1 << bits) >> 1);
+  return _mm_srai_epi16(_mm_add_epi16(v_val_d, v_half_w), bits);
+}
+
+// Per 32-bit lane: adds the rounding bias (1 << bits) >> 1 and then performs
+// an arithmetic right shift by bits.
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_half_d = _mm_set1_epi32((1 << bits) >> 1);
+  return _mm_srai_epi32(_mm_add_epi32(v_val_d, v_half_d), bits);
+}
+
+// Per signed 16-bit lane: adds the rounding bias (1 << bits) >> 1, subtracts
+// one when the input lane is negative, then shifts right arithmetically by
+// bits.
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  // All-ones (-1) in negative lanes, zero elsewhere.
+  const __m128i v_neg_w = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_half_w = _mm_set1_epi16((1 << bits) >> 1);
+  // 16-bit adds wrap, so the grouping of the three terms does not matter.
+  const __m128i v_adj_w = _mm_add_epi16(v_half_w, v_neg_w);
+  return _mm_srai_epi16(_mm_add_epi16(v_val_d, v_adj_w), bits);
+}
+
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 0000000000..b729e5f410
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+// 256-bit load through _mm256_load_si256 (the aligned-load intrinsic).
+static INLINE __m256i yy_load_256(const void *a) {
+  const __m256i *const p = (const __m256i *)a;
+  return _mm256_load_si256(p);
+}
+
+// 256-bit load through _mm256_loadu_si256 (the unaligned-load intrinsic).
+static INLINE __m256i yy_loadu_256(const void *a) {
+  const __m256i *const p = (const __m256i *)a;
+  return _mm256_loadu_si256(p);
+}
+
+// 256-bit store through _mm256_store_si256 (the aligned-store intrinsic).
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  __m256i *const dst = (__m256i *)a;
+  _mm256_store_si256(dst, v);
+}
+
+// 256-bit store through _mm256_storeu_si256 (the unaligned-store intrinsic).
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  __m256i *const dst = (__m256i *)a;
+  _mm256_storeu_si256(dst, v);
+}
+
+// Broadcasts a, zero-extended to 64 bits, into all four 64-bit lanes.
+//
+// _mm256_set1_epi64x() is undefined for some Visual Studio compilers, so for
+// 32-bit x86 MSVC older than version 1900 the same bit pattern is assembled
+// from 32-bit pieces instead.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if !defined(_MSC_VER) || !defined(_M_IX86) || _MSC_VER >= 1900
+  return _mm256_set1_epi64x((uint32_t)a);
+#else
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#endif
+}
+
+// Stand-in for _mm256_set_m128i, which some compilers do not declare in
+// immintrin.h. Combines two 128-bit halves into one 256-bit register:
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  // Start from lo in the low half, then insert hi as the upper half.
+  const __m256i low_half = _mm256_castsi128_si256(lo);
+  return _mm256_insertf128_si256(low_half, hi, 1);
+}
+
+// Performs two unaligned 128-bit loads and joins them into one 256-bit value:
+// the data at hi becomes the upper half, the data at lo the lower half.
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  const __m128i upper = _mm_loadu_si128((const __m128i *)hi);
+  const __m128i lower = _mm_loadu_si128((const __m128i *)lo);
+  return yy_set_m128i(upper, lower);
+}
+
+// Splits a into its two 128-bit halves and writes each with an unaligned
+// store: the upper half to hi first, then the lower half to lo.
+static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+  const __m128i upper = _mm256_extracti128_si256(a, 1);
+  const __m128i lower = _mm256_castsi256_si128(a);
+  _mm_storeu_si128((__m128i *)hi, upper);
+  _mm_storeu_si128((__m128i *)lo, lower);
+}
+
+// Per unsigned 16-bit lane: shifts right by (bits - 1), then takes the
+// rounding average with zero, i.e. ((v >> (bits - 1)) + 1) >> 1.
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_zero = _mm256_setzero_si256();
+  const __m256i v_pre_w = _mm256_srli_epi16(v_val_w, bits - 1);
+  return _mm256_avg_epu16(v_pre_w, v_zero);
+}
+#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..9dab750f44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+// Transposes a 4x4 block of bytes. Row r is read from the low 4 bytes of
+// in[r]; the 16 transposed bytes come back packed in a single register:
+//   00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Byte-interleave rows 0/1 and rows 2/3 (low 8 bytes shown):
+  //   rows01: 00 10 01 11 02 12 03 13
+  //   rows23: 20 30 21 31 22 32 23 33
+  const __m128i rows01 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i rows23 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Interleaving the 16-bit pairs of the two halves completes the transpose.
+  const __m128i transposed = _mm_unpacklo_epi16(rows01, rows23);
+  return transposed;
+}
+
+// Transposes an 8x8 block of bytes. Row r is read from the low 8 bytes of
+// in[r]. Column c of the input is written to out[c]; each out[c] is built by
+// unpacking a register with itself, so its upper 8 bytes repeat its lower 8
+// bytes.
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b2: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ // (b0/b1 are the low/high halves of rows 0-3, b2/b3 those of rows 4-7.)
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+// Transposes a 4x4 block of 16-bit values. Row r is read from the low 64 bits
+// of in[r]; column c is delivered in the low 64 bits of out[c].
+//
+// Callers should only use the low 64 bits of each output register. For
+// reference, the full registers are ("__" denotes zeros):
+//   out[0]: 00 10 20 30 01 11 21 31
+//   out[1]: 01 11 21 31 __ __ __ __
+//   out[2]: 02 12 22 32 03 13 23 33
+//   out[3]: 03 13 23 33 __ __ __ __
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Interleave 16-bit lanes of adjacent rows:
+  //   rows01: 00 10 01 11 02 12 03 13
+  //   rows23: 20 30 21 31 22 32 23 33
+  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  // Interleave 32-bit lanes; each result holds two finished columns.
+  const __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23);
+  const __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23);
+
+  // The odd columns are the upper halves shifted down by 8 bytes.
+  out[0] = cols01;
+  out[1] = _mm_srli_si128(cols01, 8);
+  out[2] = cols23;
+  out[3] = _mm_srli_si128(cols23, 8);
+}
+
+// Transposes eight rows of four 16-bit values (low 64 bits of in[0..7]) into
+// four rows of eight values:
+//   out[0]: 00 10 20 30 40 50 60 70
+//   out[1]: 01 11 21 31 41 51 61 71
+//   out[2]: 02 12 22 32 42 52 62 72
+//   out[3]: 03 13 23 33 43 53 63 73
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Stage 1: interleave 16-bit lanes of adjacent rows.
+  //   rows01: 00 10 01 11 02 12 03 13
+  //   rows23: 20 30 21 31 22 32 23 33
+  //   rows45: 40 50 41 51 42 52 43 53
+  //   rows67: 60 70 61 71 62 72 63 73
+  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i rows45 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i rows67 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Stage 2: interleave 32-bit lanes. "top" covers rows 0-3, "bot" rows 4-7.
+  //   cols01_top: 00 10 20 30 01 11 21 31
+  //   cols01_bot: 40 50 60 70 41 51 61 71
+  //   cols23_top: 02 12 22 32 03 13 23 33
+  //   cols23_bot: 42 52 62 72 43 53 63 73
+  const __m128i cols01_top = _mm_unpacklo_epi32(rows01, rows23);
+  const __m128i cols01_bot = _mm_unpacklo_epi32(rows45, rows67);
+  const __m128i cols23_top = _mm_unpackhi_epi32(rows01, rows23);
+  const __m128i cols23_bot = _mm_unpackhi_epi32(rows45, rows67);
+
+  // Stage 3: join the top and bottom halves of every column.
+  out[0] = _mm_unpacklo_epi64(cols01_top, cols01_bot);
+  out[1] = _mm_unpackhi_epi64(cols01_top, cols01_bot);
+  out[2] = _mm_unpacklo_epi64(cols23_top, cols23_bot);
+  out[3] = _mm_unpackhi_epi64(cols23_top, cols23_bot);
+}
+
+// Transposes four rows of eight 16-bit values (in[0..3]) into eight rows of
+// four values. Column c lands in the low 64 bits of out[c]; the upper 64 bits
+// of every output register are zero.
+//   out[0]: 00 10 20 30     out[4]: 04 14 24 34
+//   out[1]: 01 11 21 31     out[5]: 05 15 25 35
+//   out[2]: 02 12 22 32     out[6]: 06 16 26 36
+//   out[3]: 03 13 23 33     out[7]: 07 17 27 37
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Stage 1: interleave 16-bit lanes of adjacent rows.
+  //   lo01: 00 10 01 11 02 12 03 13
+  //   lo23: 20 30 21 31 22 32 23 33
+  //   hi01: 04 14 05 15 06 16 07 17
+  //   hi23: 24 34 25 35 26 36 27 37
+  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Stage 2: interleave 32-bit lanes; each register holds two columns.
+  //   cols01: 00 10 20 30 01 11 21 31
+  //   cols23: 02 12 22 32 03 13 23 33
+  //   cols45: 04 14 24 34 05 15 25 35
+  //   cols67: 06 16 26 36 07 17 27 37
+  const __m128i cols01 = _mm_unpacklo_epi32(lo01, lo23);
+  const __m128i cols23 = _mm_unpackhi_epi32(lo01, lo23);
+  const __m128i cols45 = _mm_unpacklo_epi32(hi01, hi23);
+  const __m128i cols67 = _mm_unpackhi_epi32(hi01, hi23);
+
+  // Stage 3: split each pair into single columns, padding with zeros.
+  const __m128i zero = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(cols01, zero);
+  out[1] = _mm_unpackhi_epi64(cols01, zero);
+  out[2] = _mm_unpacklo_epi64(cols23, zero);
+  out[3] = _mm_unpackhi_epi64(cols23, zero);
+  out[4] = _mm_unpacklo_epi64(cols45, zero);
+  out[5] = _mm_unpackhi_epi64(cols45, zero);
+  out[6] = _mm_unpacklo_epi64(cols67, zero);
+  out[7] = _mm_unpackhi_epi64(cols67, zero);
+}
+
+// Transposes an 8x8 block of 16-bit values: out[c] receives column c of the
+// block whose rows are in[0..7]. Every input register is read before the
+// first output register is written.
+//   out[0]: 00 10 20 30 40 50 60 70
+//   out[1]: 01 11 21 31 41 51 61 71
+//   ...
+//   out[7]: 07 17 27 37 47 57 67 77
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Stage 1: interleave 16-bit lanes of adjacent rows.
+  //   lo01: 00 10 01 11 02 12 03 13    hi01: 04 14 05 15 06 16 07 17
+  //   lo23: 20 30 21 31 22 32 23 33    hi23: 24 34 25 35 26 36 27 37
+  //   lo45: 40 50 41 51 42 52 43 53    hi45: 44 54 45 55 46 56 47 57
+  //   lo67: 60 70 61 71 62 72 63 73    hi67: 64 74 65 75 66 76 67 77
+  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i lo45 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i lo67 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i hi45 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i hi67 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Stage 2: interleave 32-bit lanes. "top" covers rows 0-3, "bot" rows 4-7.
+  //   cols01_top: 00 10 20 30 01 11 21 31
+  //   cols01_bot: 40 50 60 70 41 51 61 71
+  //   cols23_top: 02 12 22 32 03 13 23 33
+  //   cols23_bot: 42 52 62 72 43 53 63 73
+  //   cols45_top: 04 14 24 34 05 15 25 35
+  //   cols45_bot: 44 54 64 74 45 55 65 75
+  //   cols67_top: 06 16 26 36 07 17 27 37
+  //   cols67_bot: 46 56 66 76 47 57 67 77
+  const __m128i cols01_top = _mm_unpacklo_epi32(lo01, lo23);
+  const __m128i cols01_bot = _mm_unpacklo_epi32(lo45, lo67);
+  const __m128i cols23_top = _mm_unpackhi_epi32(lo01, lo23);
+  const __m128i cols23_bot = _mm_unpackhi_epi32(lo45, lo67);
+  const __m128i cols45_top = _mm_unpacklo_epi32(hi01, hi23);
+  const __m128i cols45_bot = _mm_unpacklo_epi32(hi45, hi67);
+  const __m128i cols67_top = _mm_unpackhi_epi32(hi01, hi23);
+  const __m128i cols67_bot = _mm_unpackhi_epi32(hi45, hi67);
+
+  // Stage 3: join the top and bottom halves of every column.
+  out[0] = _mm_unpacklo_epi64(cols01_top, cols01_bot);
+  out[1] = _mm_unpackhi_epi64(cols01_top, cols01_bot);
+  out[2] = _mm_unpacklo_epi64(cols23_top, cols23_bot);
+  out[3] = _mm_unpackhi_epi64(cols23_top, cols23_bot);
+  out[4] = _mm_unpacklo_epi64(cols45_top, cols45_bot);
+  out[5] = _mm_unpackhi_epi64(cols45_top, cols45_bot);
+  out[6] = _mm_unpacklo_epi64(cols67_top, cols67_bot);
+  out[7] = _mm_unpackhi_epi64(cols67_top, cols67_bot);
+}
+
+// In-place transpose built from four 8x8 transposes. left[0..15] and
+// right[0..15] are each treated as two stacked 8x8 tiles ([0..7] and [8..15]).
+// The tiles left[0..7] and right[8..15] are transposed where they are, while
+// the transposes of right[0..7] and left[8..15] swap places.
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i saved_tile[8];  // transpose of right[0..7], parked until the end
+
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, saved_tile);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  // left[8..15] has been consumed above, so it can now take the saved tile.
+  for (int r = 0; r < 8; ++r) {
+    left[8 + r] = saved_tile[r];
+  }
+}
+
+// Transposes a 4x4 block of 32-bit values: out[c] receives column c of the
+// block whose rows are in[0..3].
+//   out[0]: 00 10 20 30
+//   out[1]: 01 11 21 31
+//   out[2]: 02 12 22 32
+//   out[3]: 03 13 23 33
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Interleave 32-bit lanes of adjacent rows:
+  //   lo01: 00 10 01 11    hi01: 02 12 03 13
+  //   lo23: 20 30 21 31    hi23: 22 32 23 33
+  const __m128i lo01 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i lo23 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i hi01 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i hi23 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Join the 64-bit halves belonging to the same column.
+  out[0] = _mm_unpacklo_epi64(lo01, lo23);
+  out[1] = _mm_unpackhi_epi64(lo01, lo23);
+  out[2] = _mm_unpacklo_epi64(hi01, hi23);
+  out[3] = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+// Transposes two independent 4x4 blocks of 32-bit values: in[0..3] goes to
+// out[0..3] and in[4..7] goes to out[4..7]. With the second block labelled as
+// columns 4-7 of rows 0-3 (in[4]: 04 05 06 07, ..., in[7]: 34 35 36 37):
+//   out[0]: 00 10 20 30     out[4]: 04 14 24 34
+//   out[1]: 01 11 21 31     out[5]: 05 15 25 35
+//   out[2]: 02 12 22 32     out[6]: 06 16 26 36
+//   out[3]: 03 13 23 33     out[7]: 07 17 27 37
+// Every input register is read before the first output register is written.
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // First block: interleave 32-bit lanes of adjacent rows.
+  //   p_lo01: 00 10 01 11    p_hi01: 02 12 03 13
+  //   p_lo23: 20 30 21 31    p_hi23: 22 32 23 33
+  const __m128i p_lo01 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i p_lo23 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i p_hi01 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i p_hi23 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Second block, same pattern.
+  //   q_lo01: 04 14 05 15    q_hi01: 06 16 07 17
+  //   q_lo23: 24 34 25 35    q_hi23: 26 36 27 37
+  const __m128i q_lo01 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i q_lo23 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i q_hi01 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i q_hi23 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Join the 64-bit halves belonging to the same column.
+  out[0] = _mm_unpacklo_epi64(p_lo01, p_lo23);
+  out[1] = _mm_unpackhi_epi64(p_lo01, p_lo23);
+  out[2] = _mm_unpacklo_epi64(p_hi01, p_hi23);
+  out[3] = _mm_unpackhi_epi64(p_hi01, p_hi23);
+  out[4] = _mm_unpacklo_epi64(q_lo01, q_lo23);
+  out[5] = _mm_unpackhi_epi64(q_lo01, q_lo23);
+  out[6] = _mm_unpacklo_epi64(q_hi01, q_hi23);
+  out[7] = _mm_unpackhi_epi64(q_hi01, q_hi23);
+}
+
+// Transposes four rows of eight 32-bit values into eight rows of four. Each
+// input row occupies two consecutive registers: in[2r] holds columns 0-3 of
+// row r and in[2r + 1] holds columns 4-7.
+//   out[0]: 00 10 20 30     out[4]: 04 14 24 34
+//   out[1]: 01 11 21 31     out[5]: 05 15 25 35
+//   out[2]: 02 12 22 32     out[6]: 06 16 26 36
+//   out[3]: 03 13 23 33     out[7]: 07 17 27 37
+// Every input register is read before the first output register is written.
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Left halves of the rows (even registers), 32-bit interleave:
+  //   l_lo01: 00 10 01 11    l_hi01: 02 12 03 13
+  //   l_lo23: 20 30 21 31    l_hi23: 22 32 23 33
+  const __m128i l_lo01 = _mm_unpacklo_epi32(in[0], in[2]);
+  const __m128i l_lo23 = _mm_unpacklo_epi32(in[4], in[6]);
+  const __m128i l_hi01 = _mm_unpackhi_epi32(in[0], in[2]);
+  const __m128i l_hi23 = _mm_unpackhi_epi32(in[4], in[6]);
+
+  // Right halves of the rows (odd registers), same pattern:
+  //   r_lo01: 04 14 05 15    r_hi01: 06 16 07 17
+  //   r_lo23: 24 34 25 35    r_hi23: 26 36 27 37
+  const __m128i r_lo01 = _mm_unpacklo_epi32(in[1], in[3]);
+  const __m128i r_lo23 = _mm_unpacklo_epi32(in[5], in[7]);
+  const __m128i r_hi01 = _mm_unpackhi_epi32(in[1], in[3]);
+  const __m128i r_hi23 = _mm_unpackhi_epi32(in[5], in[7]);
+
+  // Join the 64-bit halves belonging to the same column.
+  out[0] = _mm_unpacklo_epi64(l_lo01, l_lo23);
+  out[1] = _mm_unpackhi_epi64(l_lo01, l_lo23);
+  out[2] = _mm_unpacklo_epi64(l_hi01, l_hi23);
+  out[3] = _mm_unpackhi_epi64(l_hi01, l_hi23);
+  out[4] = _mm_unpacklo_epi64(r_lo01, r_lo23);
+  out[5] = _mm_unpackhi_epi64(r_lo01, r_lo23);
+  out[6] = _mm_unpacklo_epi64(r_hi01, r_hi23);
+  out[7] = _mm_unpackhi_epi64(r_hi01, r_hi23);
+}
+
+#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 0000000000..4105250bc0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Broadcasts the 16-bit pair (a, b) into every 32-bit lane of a 256-bit
+// register: `a` fills the low 16 bits of each lane, `b` the high 16 bits.
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+  const uint32_t low_half = (uint16_t)a;
+  const uint32_t high_half = ((uint32_t)b) << 16;
+  return _mm256_set1_epi32((int32_t)(low_half | high_half));
+}
+
+// 16-bit butterfly on sixteen lanes, updated in place.
+// The 16-bit lanes of *in0 and *in1 are interleaved into pairs; each pair is
+// multiply-accumulated against the weight pairs in w0 (result goes to *in0)
+// and w1 (result goes to *in1). Every 32-bit product sum has `_r` added, is
+// arithmetically shifted right by `cos_bit`, and is packed back to 16 bits
+// with signed saturation.
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+                                   __m256i *in0, __m256i *in1, const __m256i _r,
+                                   const int32_t cos_bit) {
+  const __m256i pairs_lo = _mm256_unpacklo_epi16(*in0, *in1);
+  const __m256i pairs_hi = _mm256_unpackhi_epi16(*in0, *in1);
+
+  // Output 0: weighted by w0.
+  const __m256i x_lo = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(pairs_lo, w0), _r), cos_bit);
+  const __m256i x_hi = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(pairs_hi, w0), _r), cos_bit);
+
+  // Output 1: weighted by w1.
+  const __m256i y_lo = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(pairs_lo, w1), _r), cos_bit);
+  const __m256i y_hi = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(pairs_hi, w1), _r), cos_bit);
+
+  *in0 = _mm256_packs_epi32(x_lo, x_hi);
+  *in1 = _mm256_packs_epi32(y_lo, y_hi);
+}
+
+// In-place add/sub butterfly on 16-bit lanes with signed saturation:
+// *in0 becomes (*in0 + *in1) and *in1 becomes (*in0 - *in1), both computed
+// from the values held on entry.
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i sat_sum = _mm256_adds_epi16(*in0, *in1);
+  const __m256i sat_diff = _mm256_subs_epi16(*in0, *in1);
+  *in0 = sat_sum;
+  *in1 = sat_diff;
+}
+
+// In-place add/sub butterfly on 32-bit lanes (wrapping arithmetic):
+// *in0 becomes (*in0 + *in1) and *in1 becomes (*in0 - *in1), both computed
+// from the values held on entry.
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i sum = _mm256_add_epi32(*in0, *in1);
+  const __m256i diff = _mm256_sub_epi32(*in0, *in1);
+  *in0 = sum;
+  *in1 = diff;
+}
+
+// Out-of-place add/sub butterfly on 16-bit lanes with signed saturation:
+// *out0 = in0 + in1, *out1 = in0 - in1.
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+                                             __m256i in0, __m256i in1) {
+  // in0/in1 are passed by value, so writing *out0 cannot disturb them.
+  *out0 = _mm256_adds_epi16(in0, in1);
+  *out1 = _mm256_subs_epi16(in0, in1);
+}
+
+// Out-of-place add/sub butterfly on 32-bit lanes (wrapping arithmetic):
+// *out0 = in0 + in1, *out1 = in0 - in1.
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+                                           __m256i in0, __m256i in1) {
+  // in0/in1 are passed by value, so writing *out0 cannot disturb them.
+  *out0 = _mm256_add_epi32(in0, in1);
+  *out1 = _mm256_sub_epi32(in0, in1);
+}
+
+// Loads sixteen int16_t values starting at `a` with an aligned 256-bit load
+// (_mm256_load_si256), so `a` must be 32-byte aligned.
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+  const __m256i *const src = (const __m256i *)a;
+  return _mm256_load_si256(src);
+}
+
+// Loads `out_size` rows of sixteen int16_t values each into out[0..out_size).
+// Row i is read from in + i * stride via load_16bit_to_16bit_avx2().
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int16_t *const row_ptr = in + row * stride;
+    out[row] = load_16bit_to_16bit_avx2(row_ptr);
+    ++row;
+  }
+}
+
+// Same as load_buffer_16bit_to_16bit_avx2() but stores the rows in reverse
+// order: input row i lands in out[out_size - 1 - i].
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+                                                        int stride,
+                                                        __m256i *out,
+                                                        int out_size) {
+  for (int src_row = 0, dst_row = out_size - 1; src_row < out_size;
+       ++src_row, --dst_row) {
+    out[dst_row] = load_16bit_to_16bit_avx2(in + src_row * stride);
+  }
+}
+
+// Loads sixteen consecutive int32_t values starting at `a` and narrows them to
+// sixteen int16_t lanes with signed saturation, preserving element order.
+//
+// Both 256-bit halves are fetched with unaligned loads, so `a` needs no
+// particular alignment. (The second half used to be read by dereferencing a
+// __m256i pointer, which the compiler may implement as an aligned load.)
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_lddqu_si256((const __m256i *)(a + 8));
+  // packs works per 128-bit lane, giving the 64-bit quads
+  //   [a0..a3 | a8..a11 | a4..a7 | a12..a15].
+  const __m256i b = _mm256_packs_epi32(a_low, a_high);
+  // 0xD8 selects quads (0, 2, 1, 3), restoring a0..a15 in order.
+  return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+// Loads `out_size` rows of sixteen int32_t values, narrowing each row to
+// sixteen int16_t lanes via load_32bit_to_16bit_w16_avx2(). Row i is read
+// from in + i * stride and written to out[i].
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int32_t *const row_ptr = in + row * stride;
+    out[row] = load_32bit_to_16bit_w16_avx2(row_ptr);
+    ++row;
+  }
+}
+
+// Runs a three-stage unpack network (16-bit, 32-bit, then 64-bit interleaves)
+// over in[0..7] and writes the result to out[0..7]. The AVX2 unpack
+// instructions operate on each 128-bit lane independently, so the low lanes
+// and the high lanes of the eight registers are processed as two separate
+// 8x8 blocks of 16-bit values.
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  __m256i w16[8], w32[8];
+
+  // Stage 1: interleave 16-bit lanes of adjacent rows.
+  //   rows (p, p + 1) -> w16[p] (low halves), w16[p + 1] (high halves)
+  for (int p = 0; p < 8; p += 2) {
+    w16[p] = _mm256_unpacklo_epi16(in[p], in[p + 1]);
+    w16[p + 1] = _mm256_unpackhi_epi16(in[p], in[p + 1]);
+  }
+
+  // Stage 2: interleave 32-bit lanes, separately within each group of four.
+  //   (g + i, g + i + 2) -> w32[g + i] (low), w32[g + i + 2] (high)
+  for (int g = 0; g < 8; g += 4) {
+    for (int i = 0; i < 2; ++i) {
+      w32[g + i] = _mm256_unpacklo_epi32(w16[g + i], w16[g + i + 2]);
+      w32[g + i + 2] = _mm256_unpackhi_epi32(w16[g + i], w16[g + i + 2]);
+    }
+  }
+
+  // Stage 3: interleave 64-bit lanes of w32[k] and w32[k + 4].
+  //   k = 0 -> out[0], out[1]      k = 1 -> out[4], out[5]
+  //   k = 2 -> out[2], out[3]      k = 3 -> out[6], out[7]
+  for (int k = 0; k < 4; ++k) {
+    const int dst = (k & 1) ? k + 3 : k;
+    out[dst] = _mm256_unpacklo_epi64(w32[k], w32[k + 4]);
+    out[dst + 1] = _mm256_unpackhi_epi64(w32[k], w32[k + 4]);
+  }
+}
+
+// Transposes a 16x16 block of 16-bit values: in[r] holds row r, and the
+// result rows are written to out[0..15].
+//
+// The rows are first regrouped so that each temporary register pairs the same
+// 128-bit half of rows idx and idx + 8; two transpose2_8x8_avx2() passes then
+// produce output rows 0-7 and 8-15.
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  __m256i t[16];
+
+// t[idx] = { low 128 bits of in[idx], low 128 bits of in[idx + 8] }
+#define LOADL(idx) \
+  t[idx] = _mm256_inserti128_si256( \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])), \
+      _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+// t[8 + idx] = { high 128 bits of in[idx], high 128 bits of in[idx + 8] }
+#define LOADR(idx) \
+  t[8 + idx] = _mm256_inserti128_si256( \
+      _mm256_castsi128_si256( \
+          _mm_load_si128((__m128i const *)&in[idx] + 1)), \
+      _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+  // Left 8 columns of all 16 rows.
+  LOADL(0) LOADL(1) LOADL(2) LOADL(3)
+  LOADL(4) LOADL(5) LOADL(6) LOADL(7)
+
+  // Right 8 columns of all 16 rows.
+  LOADR(0) LOADR(1) LOADR(2) LOADR(3)
+  LOADR(4) LOADR(5) LOADR(6) LOADR(7)
+
+  // Output rows 0-7 come from the left columns...
+  transpose2_8x8_avx2(&t[0], &out[0]);
+  // ...and output rows 8-15 from the right columns.
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Runs a 16-bit / 32-bit / 64-bit unpack network over in[0..7] and writes
+// out[0..7]. Because the AVX2 unpacks work per 128-bit lane, each lane of the
+// eight inputs is transposed as an independent 8x8 block of 16-bit values.
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+                                             __m256i *const out) {
+  // 16-bit interleave of row pairs: low halves, then high halves.
+  const __m256i lo01 = _mm256_unpacklo_epi16(in[0], in[1]);
+  const __m256i lo23 = _mm256_unpacklo_epi16(in[2], in[3]);
+  const __m256i lo45 = _mm256_unpacklo_epi16(in[4], in[5]);
+  const __m256i lo67 = _mm256_unpacklo_epi16(in[6], in[7]);
+  const __m256i hi01 = _mm256_unpackhi_epi16(in[0], in[1]);
+  const __m256i hi23 = _mm256_unpackhi_epi16(in[2], in[3]);
+  const __m256i hi45 = _mm256_unpackhi_epi16(in[4], in[5]);
+  const __m256i hi67 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  // 32-bit interleave. Name = <16-bit half>_<32-bit half>_<rows covered>.
+  const __m256i lo_lo_0123 = _mm256_unpacklo_epi32(lo01, lo23);
+  const __m256i lo_lo_4567 = _mm256_unpacklo_epi32(lo45, lo67);
+  const __m256i lo_hi_0123 = _mm256_unpackhi_epi32(lo01, lo23);
+  const __m256i lo_hi_4567 = _mm256_unpackhi_epi32(lo45, lo67);
+  const __m256i hi_lo_0123 = _mm256_unpacklo_epi32(hi01, hi23);
+  const __m256i hi_lo_4567 = _mm256_unpacklo_epi32(hi45, hi67);
+  const __m256i hi_hi_0123 = _mm256_unpackhi_epi32(hi01, hi23);
+  const __m256i hi_hi_4567 = _mm256_unpackhi_epi32(hi45, hi67);
+
+  // 64-bit interleave joins rows 0-3 with rows 4-7 for every output.
+  out[0] = _mm256_unpacklo_epi64(lo_lo_0123, lo_lo_4567);
+  out[1] = _mm256_unpackhi_epi64(lo_lo_0123, lo_lo_4567);
+  out[2] = _mm256_unpacklo_epi64(lo_hi_0123, lo_hi_4567);
+  out[3] = _mm256_unpackhi_epi64(lo_hi_0123, lo_hi_4567);
+  out[4] = _mm256_unpacklo_epi64(hi_lo_0123, hi_lo_4567);
+  out[5] = _mm256_unpackhi_epi64(hi_lo_0123, hi_lo_4567);
+  out[6] = _mm256_unpacklo_epi64(hi_hi_0123, hi_hi_4567);
+  out[7] = _mm256_unpackhi_epi64(hi_hi_0123, hi_hi_4567);
+}
+
+// Copies `size` registers from `in` to `out` in reverse order:
+// out[size - 1 - i] = in[i], with i visited in ascending order.
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+  for (int src = 0, dst = size - 1; src < size; ++src, --dst) {
+    out[dst] = in[src];
+  }
+}
+
+// Shifts every 16-bit lane of in[0..size) in place.
+//   bit > 0: logical left shift by `bit`.
+//   bit < 0: rounding arithmetic right shift by -bit (the rounding constant
+//            is added with signed saturation before the shift).
+//   bit == 0: the buffer is left untouched.
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int k = 0; k < size; ++k) {
+      in[k] = _mm256_slli_epi16(in[k], bit);
+    }
+    return;
+  }
+
+  const int shift = -bit;
+  const __m256i rounding = _mm256_set1_epi16(1 << (shift - 1));
+  for (int k = 0; k < size; ++k) {
+    const __m256i biased = _mm256_adds_epi16(in[k], rounding);
+    in[k] = _mm256_srai_epi16(biased, shift);
+  }
+}
+
+// Rounding arithmetic right shift of every 32-bit lane:
+// (vec + (1 << (bit - 1))) >> bit. The rounding term is formed as
+// 1 << (bit - 1), so callers are expected to pass bit >= 1.
+static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) {
+  const __m256i half = _mm256_set1_epi32(1 << (bit - 1));
+  return _mm256_srai_epi32(_mm256_add_epi32(vec, half), bit);
+}
+
+// Applies a signed shift to `size` registers of 32-bit lanes.
+//   bit > 0:  output[i] = rounding right shift of input[i] by `bit`.
+//   bit <= 0: output[i] = input[i] shifted left by -bit.
+static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output,
+                                             const int size, const int bit) {
+  if (bit <= 0) {
+    const int left = -bit;
+    for (int k = 0; k < size; ++k) {
+      output[k] = _mm256_slli_epi32(input[k], left);
+    }
+    return;
+  }
+
+  for (int k = 0; k < size; ++k) {
+    output[k] = round_shift_32_avx2(input[k], bit);
+  }
+}
+
+// Shifts `size` registers of 32-bit lanes by `bit` (rounding right shift when
+// bit > 0, left shift by -bit otherwise), multiplies each lane by `val`
+// (low 32 bits of the product), and finally applies a rounding right shift by
+// NewSqrt2Bits.
+static INLINE void round_shift_rect_array_32_avx2(__m256i *input,
+                                                  __m256i *output,
+                                                  const int size, const int bit,
+                                                  const int val) {
+  const __m256i factor = _mm256_set1_epi32(val);
+  const int shift_right = bit > 0;
+
+  for (int k = 0; k < size; ++k) {
+    __m256i shifted;
+    if (shift_right) {
+      shifted = round_shift_32_avx2(input[k], bit);
+    } else {
+      shifted = _mm256_slli_epi32(input[k], -bit);
+    }
+    const __m256i scaled = _mm256_mullo_epi32(factor, shifted);
+    output[k] = round_shift_32_avx2(scaled, NewSqrt2Bits);
+  }
+}
+
+// Treats `a` as interleaved 16-bit pairs (x, y) and returns, per 32-bit lane,
+// (x * scale + y * (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits.
+// With y == 1 this is a rounded multiply of x by scale / 2^NewSqrt2Bits.
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const __m256i weights = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+  const __m256i products = _mm256_madd_epi16(a, weights);
+  return _mm256_srai_epi32(products, NewSqrt2Bits);
+}
+
+// Scales the sixteen 16-bit lanes of `a` by NewSqrt2 (with rounding, see
+// scale_round_avx2()) and stores them as 32-bit values using aligned stores:
+//   - the eight results from the low 128-bit lane go to b[0..7];
+//   - the eight results from the high 128-bit lane go to b[64..71].
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+                                                     int32_t *const b) {
+  // Pair every input value with 1 so the madd adds the rounding constant.
+  const __m256i ones = _mm256_set1_epi16(1);
+  const __m256i scaled_lo =
+      scale_round_avx2(_mm256_unpacklo_epi16(a, ones), NewSqrt2);
+  const __m256i scaled_hi =
+      scale_round_avx2(_mm256_unpackhi_epi16(a, ones), NewSqrt2);
+
+  // 0x31 gathers the high 128-bit lanes of scaled_lo and scaled_hi.
+  const __m256i upper = _mm256_permute2f128_si256(scaled_lo, scaled_hi, 0x31);
+
+  _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(scaled_lo));
+  _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(scaled_hi));
+  _mm256_store_si256((__m256i *)(b + 64), upper);
+}
+
+// Calls store_rect_16bit_to_32bit_w8_avx2() for in[0..out_size), writing
+// register i at out + i * stride.
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    int32_t *const dst = out + row * stride;
+    store_rect_16bit_to_32bit_w8_avx2(in[row], dst);
+    ++row;
+  }
+}
+
+// Combines eight pairs of 128-bit registers into eight 256-bit registers:
+// out[i] = { low lane: in1[i], high lane: in2[i] } for i in 0..7.
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+                            __m256i *out) {
+  for (int i = 0; i < 8; ++i) {
+    const __m256i low_only = _mm256_castsi128_si256(in1[i]);
+    out[i] = _mm256_insertf128_si256(low_only, in2[i], 0x1);
+  }
+}
+
+// Splits eight 256-bit registers into sixteen 128-bit registers:
+// out1[i] = low lane of in[i] and out1[8 + i] = high lane of in[i], i in 0..7.
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+  // All low lanes are written before any high lane.
+  for (int i = 0; i < 8; ++i) {
+    out1[i] = _mm256_castsi256_si128(in[i]);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    out1[8 + i] = _mm256_extracti128_si256(in[i], 0x01);
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..9c99eb93bd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Builds a __m128i in which every 32-bit lane holds `a` (truncated to 16 bits)
+// in its low half and `b` in its high half. Each argument is expanded exactly
+// once.
+#define pair_set_epi16(a, b) \
+ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+// Returns `x` with its eight 16-bit words in reverse order
+// (word 0 <-> word 7, word 1 <-> word 6, ...).
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  // 0x1b picks words 3, 2, 1, 0: reverse the four words inside each 64-bit
+  // half (low half first, then high half).
+  const __m128i halves_reversed =
+      _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0x1b), 0x1b);
+  // 0x4e picks dwords 2, 3, 0, 1: swap the two 64-bit halves.
+  return _mm_shuffle_epi32(halves_reversed, 0x4e);
+}
+
+// Builds a __m128i from eight values, each cast to int16_t. _mm_setr_epi16
+// takes its arguments in memory order, so `a` lands in the lowest 16-bit lane
+// and `h` in the highest.
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..046d6f10f8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Adds the high 128-bit lane of `val` to its low lane, as eight 16-bit
+// (wrapping) additions.
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+  const __m128i low_lane = _mm256_castsi256_si128(val);
+  const __m128i high_lane = _mm256_extractf128_si256(val, 1);
+  return _mm_add_epi16(low_lane, high_lane);
+}
+
+// Adds the high 128-bit lane of `val` to its low lane, as four 32-bit
+// (wrapping) additions.
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+  const __m128i low_lane = _mm256_castsi256_si128(val);
+  const __m128i high_lane = _mm256_extractf128_si256(val, 1);
+  return _mm_add_epi32(low_lane, high_lane);
+}
+
+// Accumulates one register of 32 source/reference byte pairs.
+//   *sum += per-lane 16-bit differences (src - ref)
+//   *sse += per-lane 32-bit sums of squared differences
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  // Byte pair (+1, -1): maddubs on (src, ref) pairs then yields src - ref.
+  const __m256i plus_minus_one = _mm256_set1_epi16((short)0xff01);
+
+  // Interleave src and ref bytes, then form the 16-bit differences.
+  const __m256i d_lo =
+      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(src, ref), plus_minus_one);
+  const __m256i d_hi =
+      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(src, ref), plus_minus_one);
+
+  // Squared differences, summed pairwise into 32-bit lanes.
+  const __m256i sq_lo = _mm256_madd_epi16(d_lo, d_lo);
+  const __m256i sq_hi = _mm256_madd_epi16(d_hi, d_hi);
+
+  // Fold into the caller's running totals.
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(d_lo, d_hi));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(sq_lo, sq_hi));
+}
+
+// Horizontally reduces the eight 32-bit SSE lanes of `vsse` and the four
+// 32-bit sum lanes of `vsum`. The SSE total is written to *sse and the sum
+// total is returned.
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+                                                     unsigned int *const sse) {
+  // Fold the two 128-bit lanes of the SSE accumulator together.
+  const __m128i sse4 = mm256_add_hi_lo_epi32(vsse);
+
+  // Interleave so SSE terms sit in even lanes and sum terms in odd lanes:
+  //   (s0 v0 s1 v1) + (s2 v2 s3 v3)
+  const __m128i mixed = _mm_add_epi32(_mm_unpacklo_epi32(sse4, vsum),
+                                      _mm_unpackhi_epi32(sse4, vsum));
+
+  // Add the upper 64 bits onto the lower 64 bits: lane 0 is now the SSE
+  // total and lane 1 the sum total.
+  const __m128i totals = _mm_add_epi32(mixed, _mm_srli_si128(mixed, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(totals);
+  return _mm_extract_epi32(totals, 1);
+}
+
+// Final reduction for blocks of at most 512 pixels.
+// The sixteen 16-bit partial sums are folded down to four lanes using 16-bit
+// additions, sign-extended to 32 bits, and passed on to
+// variance_final_from_32bit_sum_avx2(). Writes the SSE to *sse and returns
+// the sum of differences.
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+                                          unsigned int *const sse) {
+  // 16 lanes -> 8 lanes (high 128-bit lane added onto the low one).
+  const __m128i sum8 = mm256_add_hi_lo_epi16(vsum);
+  // 8 lanes -> 4 lanes (upper 64 bits added onto the lower 64 bits).
+  const __m128i sum4 = _mm_add_epi16(sum8, _mm_srli_si128(sum8, 8));
+  // Widen the four remaining 16-bit lanes to 32 bits.
+  const __m128i sum4_32 = _mm_cvtepi16_epi32(sum4);
+  return variance_final_from_32bit_sum_avx2(vsse, sum4_32, sse);
+}
+
+// Final reduction for 1024-pixel blocks (32x32, 16x64, 64x16).
+// The two 128-bit lanes of the 16-bit sums are combined with a 16-bit add;
+// the remaining eight lanes are widened to 32 bits before being added
+// together. Writes the SSE to *sse and returns the sum of differences.
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  const __m128i sum8 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i lower4_32 = _mm_cvtepi16_epi32(sum8);
+  const __m128i upper4_32 = _mm_cvtepi16_epi32(_mm_srli_si128(sum8, 8));
+  const __m128i sum4_32 = _mm_add_epi32(lower4_32, upper4_32);
+  return variance_final_from_32bit_sum_avx2(vsse, sum4_32, sse);
+}
+
+// Sign-extends the sixteen 16-bit lanes of `sum` to 32 bits and adds the
+// lanes of the high 128-bit half onto those of the low half, returning eight
+// 32-bit partial sums.
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+  const __m128i low_half = _mm256_castsi256_si128(sum);
+  const __m128i high_half = _mm256_extractf128_si256(sum, 1);
+  return _mm256_add_epi32(_mm256_cvtepi16_epi32(low_half),
+                          _mm256_cvtepi16_epi32(high_half));
+}
+
+// Final reduction for 2048-pixel blocks (32x64, 64x32).
+// The 16-bit partial sums are widened to 32 bits before any of them are
+// added together. Writes the SSE to *sse and returns the sum of differences.
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  const __m256i sum8_32 = sum_to_32bit_avx2(vsum);
+  const __m128i sum4_32 = mm256_add_hi_lo_epi32(sum8_32);
+  return variance_final_from_32bit_sum_avx2(vsse, sum4_32, sse);
+}
+
+// Accumulates two 16-pixel rows: row 0 of src/ref goes into the low 128-bit
+// lane, row 1 into the high lane, and the combined register is fed to
+// variance_kernel_avx2(). Loads are unaligned.
+static INLINE void variance16_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  const __m128i src_row0 = _mm_loadu_si128((__m128i const *)(src));
+  const __m128i src_row1 = _mm_loadu_si128((__m128i const *)(src + src_stride));
+  const __m128i ref_row0 = _mm_loadu_si128((__m128i const *)(ref));
+  const __m128i ref_row1 = _mm_loadu_si128((__m128i const *)(ref + ref_stride));
+
+  const __m256i src_rows =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(src_row0), src_row1, 1);
+  const __m256i ref_rows =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(ref_row0), ref_row1, 1);
+
+  variance_kernel_avx2(src_rows, ref_rows, sse, sum);
+}
+
+// Accumulates 32 consecutive pixels of src/ref (unaligned loads) through
+// variance_kernel_avx2().
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m256i *const sse,
+                                          __m256i *const sum) {
+  const __m256i src_px = _mm256_loadu_si256((__m256i const *)(src));
+  const __m256i ref_px = _mm256_loadu_si256((__m256i const *)(ref));
+  variance_kernel_avx2(src_px, ref_px, sse, sum);
+}
+
+// Accumulates a 16-wide, h-tall block, two rows per step.
+// *vsum is reset to zero here; *vsse is only added to, so the caller must
+// initialise it.
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int rows_left = h; rows_left > 0; rows_left -= 2) {
+    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
+}
+
+// Accumulates a 32-wide, h-tall block, one row per step.
+// *vsum is reset to zero here; *vsse is only added to, so the caller must
+// initialise it.
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    variance32_kernel_avx2(src, ref, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Accumulates a 64-wide, h-tall block: each row is handled as two 32-pixel
+// segments. *vsum is reset to zero here; *vsse is only added to, so the
+// caller must initialise it.
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < 64; col += 32) {
+      variance32_kernel_avx2(src + col, ref + col, vsse, vsum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Accumulates a 128-wide, h-tall block: each row is handled as four 32-pixel
+// segments. *vsum is reset to zero here; *vsse is only added to, so the
+// caller must initialise it.
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m256i *const vsse,
+                                    __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int rows_left = h; rows_left > 0; --rows_left) {
+    for (int col = 0; col < 128; col += 32) {
+      variance32_kernel_avx2(src + col, ref + col, vsse, vsum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Defines aom_variance<bw>x<bh>_avx2().
+// The generated function runs variance<bw>_avx2() once over all bh rows,
+// reduces the accumulators with variance_final_<max_pixel>_avx2() (which also
+// stores the SSE in *sse) and returns  *sse - (sum * sum >> bits).
+// `bits` is log2(bw * bh) for every instantiation below.
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
+  unsigned int aom_variance##bw##x##bh##_avx2( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) { \
+    __m256i sq_acc = _mm256_setzero_si256(); \
+    __m256i diff_acc; \
+    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &sq_acc, \
+                        &diff_acc); \
+    const int total = \
+        variance_final_##max_pixel##_avx2(sq_acc, diff_acc, sse); \
+    const int64_t total_sq = (int64_t)total * total; \
+    return *sse - (uint32_t)(total_sq >> bits); \
+  }
+
+// 16-wide blocks.
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512)
+
+// 32-wide blocks.
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048)
+
+// 64-wide blocks.
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048)
+
+// Block sizes compiled only when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512)
+#endif
+
+// Defines aom_variance<bw>x<bh>_avx2() for blocks processed in strips.
+// The block is split into bh / uh strips of uh rows. Each strip is accumulated
+// with variance<bw>_avx2() into a fresh 16-bit sum, which is widened to 32
+// bits (sum_to_32bit_avx2) before being added to the running total. The
+// function stores the SSE in *sse and returns  *sse - (sum * sum >> bits).
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
+  unsigned int aom_variance##bw##x##bh##_avx2( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) { \
+    __m256i sq_acc = _mm256_setzero_si256(); \
+    __m256i sum32_acc = _mm256_setzero_si256(); \
+    for (int strip = 0; strip < (bh / uh); ++strip) { \
+      __m256i strip_sum16; \
+      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &sq_acc, \
+                          &strip_sum16); \
+      sum32_acc = \
+          _mm256_add_epi32(sum32_acc, sum_to_32bit_avx2(strip_sum16)); \
+      src += uh * src_stride; \
+      ref += uh * ref_stride; \
+    } \
+    const __m128i sum32_128 = mm256_add_hi_lo_epi32(sum32_acc); \
+    const int total = \
+        variance_final_from_32bit_sum_avx2(sq_acc, sum32_128, sse); \
+    const int64_t total_sq = (int64_t)total * total; \
+    return *sse - (unsigned int)(total_sq >> bits); \
+  }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32)    // 64x32 strips  * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32)   // 64x32 strips  * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16)   // 128x16 strips * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16)  // 128x16 strips * (128/16)
+
+// Returns the sum of squared errors of a 16x16 block (also stored in *sse).
+// The value is obtained as the SSE side output of aom_variance16x16_avx2();
+// the variance it returns is discarded.
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  (void)aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+// Builds a 256-bit register from two unaligned 16-byte loads:
+// the bytes at p1 fill the low lane, the bytes at p0 fill the high lane.
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+  const __m128i low_lane = _mm_loadu_si128((const __m128i *)p1);
+  const __m128i high_lane = _mm_loadu_si128((const __m128i *)p0);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(low_lane), high_lane,
+                                 1);
+}
+
+// 16-bit-pointer variant of mm256_loadu2(): eight uint16_t values at p1 fill
+// the low lane and eight at p0 fill the high lane (both loads unaligned).
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+  const __m128i low_lane = _mm_loadu_si128((const __m128i *)p1);
+  const __m128i high_lane = _mm_loadu_si128((const __m128i *)p0);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(low_lane), high_lane,
+                                 1);
+}
+
+// Blends 32 pixels: for every byte lane computes
+//   (a * s0 + (AOM_BLEND_A64_MAX_ALPHA - a) * s1), rounded and shifted right
+//   by AOM_BLEND_A64_ROUND_BITS,
+// saturates to 8 bits and stores the 32 results at comp_pred (unaligned).
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+                                            const __m256i a,
+                                            uint8_t *comp_pred) {
+  // Weight applied to s1 is the complement of the mask value.
+  const __m256i inv_a =
+      _mm256_sub_epi8(_mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA), a);
+
+  // mulhrs computes (x * y + 2^14) >> 15, so multiplying by
+  // 2^(15 - ROUND_BITS) is a rounding right shift by ROUND_BITS.
+  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+  const __m256i rounder = _mm256_set1_epi16(1 << (round_bits));
+
+  // Low 8 bytes of each lane: interleave pixels and weights, multiply-add.
+  const __m256i px_lo = _mm256_unpacklo_epi8(s0, s1);
+  const __m256i wt_lo = _mm256_unpacklo_epi8(a, inv_a);
+  const __m256i out_lo =
+      _mm256_mulhrs_epi16(_mm256_maddubs_epi16(px_lo, wt_lo), rounder);
+
+  // High 8 bytes of each lane.
+  const __m256i px_hi = _mm256_unpackhi_epi8(s0, s1);
+  const __m256i wt_hi = _mm256_unpackhi_epi8(a, inv_a);
+  const __m256i out_hi =
+      _mm256_mulhrs_epi16(_mm256_maddubs_epi16(px_hi, wt_hi), rounder);
+
+  _mm256_storeu_si256((__m256i *)(comp_pred),
+                      _mm256_packus_epi16(out_lo, out_hi));
+}
+
+// Writes the rounded per-pixel average of `pred` and `ref` to `comp_pred`.
+// `pred` and `comp_pred` are read/written contiguously (row pitch == width);
+// `ref` rows are ref_stride apart. Widths 8, 16, 32 and multiples of 64 use
+// AVX2; every other width is forwarded to aom_comp_avg_pred_c().
+// The AVX2 paths handle 4 (width 8/16), 2 (width 32) or 1 (width % 64 == 0)
+// rows per iteration and always run at least one iteration.
+void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int row = 0;
+
+  if (width == 8) {
+    // Four 8-pixel rows per iteration.
+    do {
+      const __m256i p = _mm256_loadu_si256((const __m256i *)(pred));
+
+      // Gather four 8-byte ref rows into one 256-bit register.
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)(ref));
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+      const __m128i r2 =
+          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+      const __m128i r3 =
+          _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride));
+      const __m128i r01 = _mm_unpacklo_epi64(r0, r1);
+      const __m128i r23 = _mm_unpacklo_epi64(r2, r3);
+      const __m256i r =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(r01), r23, 1);
+
+      _mm256_storeu_si256((__m256i *)(comp_pred), _mm256_avg_epu8(p, r));
+
+      pred += 32;
+      comp_pred += 32;
+      ref += 4 * ref_stride;
+      row += 4;
+    } while (row < height);
+    return;
+  }
+
+  if (width == 16) {
+    // Four 16-pixel rows per iteration (two registers of two rows each).
+    do {
+      const __m256i p01 = _mm256_loadu_si256((const __m256i *)(pred));
+      const __m256i p23 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+
+      const __m256i r0 =
+          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+      const __m256i r01 = _mm256_inserti128_si256(
+          r0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+      const __m256i r2 = _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+      const __m256i r23 = _mm256_inserti128_si256(
+          r2, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+
+      const __m256i avg01 = _mm256_avg_epu8(p01, r01);
+      const __m256i avg23 = _mm256_avg_epu8(p23, r23);
+      _mm256_storeu_si256((__m256i *)(comp_pred), avg01);
+      _mm256_storeu_si256((__m256i *)(comp_pred + 32), avg23);
+
+      pred += 64;
+      comp_pred += 64;
+      ref += 4 * ref_stride;
+      row += 4;
+    } while (row < height);
+    return;
+  }
+
+  if (width == 32) {
+    // Two 32-pixel rows per iteration.
+    do {
+      const __m256i p0 = _mm256_loadu_si256((const __m256i *)(pred));
+      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+      const __m256i r0 = _mm256_loadu_si256((const __m256i *)(ref));
+      const __m256i r1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+
+      const __m256i avg0 = _mm256_avg_epu8(p0, r0);
+      const __m256i avg1 = _mm256_avg_epu8(p1, r1);
+      _mm256_storeu_si256((__m256i *)(comp_pred), avg0);
+      _mm256_storeu_si256((__m256i *)(comp_pred + 32), avg1);
+
+      pred += 64;
+      comp_pred += 64;
+      ref += 2 * ref_stride;
+      row += 2;
+    } while (row < height);
+    return;
+  }
+
+  if (width % 64 == 0) {
+    // One row per iteration, 64 pixels per inner step.
+    do {
+      for (int col = 0; col < width; col += 64) {
+        const __m256i p0 = _mm256_loadu_si256((const __m256i *)(pred + col));
+        const __m256i p1 =
+            _mm256_loadu_si256((const __m256i *)(pred + col + 32));
+        const __m256i r0 = _mm256_loadu_si256((const __m256i *)(ref + col));
+        const __m256i r1 =
+            _mm256_loadu_si256((const __m256i *)(ref + col + 32));
+
+        const __m256i avg0 = _mm256_avg_epu8(p0, r0);
+        const __m256i avg1 = _mm256_avg_epu8(p1, r1);
+        _mm256_storeu_si256((__m256i *)(comp_pred + col), avg0);
+        _mm256_storeu_si256((__m256i *)(comp_pred + col + 32), avg1);
+      }
+      pred += width;
+      comp_pred += width;
+      ref += ref_stride;
+      row++;
+    } while (row < height);
+    return;
+  }
+
+  // Any other width: use the C implementation.
+  aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride);
+}
+
+// Mask-weighted blend of `pred` and `ref` into `comp_pred`.
+// `pred` and `comp_pred` are contiguous (row pitch == width); `ref` and `mask`
+// use their own strides. The mask weights `ref` when invert_mask == 0 and
+// `pred` otherwise; the other source gets the complementary weight.
+// Width 8 is delegated to comp_mask_pred_8_ssse3(); width 16 handles four
+// rows per iteration; all other widths are processed in 32-pixel steps,
+// one row per iteration.
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask) {
+  // src0 is the source weighted directly by the mask.
+  const uint8_t *src0;
+  const uint8_t *src1;
+  int stride0;
+  int stride1;
+  if (invert_mask) {
+    src0 = pred;
+    stride0 = width;
+    src1 = ref;
+    stride1 = ref_stride;
+  } else {
+    src0 = ref;
+    stride0 = ref_stride;
+    src1 = pred;
+    stride1 = width;
+  }
+
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+    return;
+  }
+
+  int row = 0;
+  if (width == 16) {
+    do {
+      // Rows 0-1: the first row sits in the low lane, the second in the high.
+      const __m256i top_s0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i top_s1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i top_m = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+
+      // Rows 2-3.
+      const __m256i bot_s0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i bot_s1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i bot_m = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+
+      // comp_pred's stride equals width (16), so 4 rows occupy 64 bytes.
+      comp_mask_pred_line_avx2(top_s0, top_s1, top_m, comp_pred);
+      comp_mask_pred_line_avx2(bot_s0, bot_s1, bot_m, comp_pred + 32);
+      comp_pred += (16 << 2);
+      row += 4;
+    } while (row < height);
+    return;
+  }
+
+  do {
+    for (int col = 0; col < width; col += 32) {
+      const __m256i v0 = _mm256_lddqu_si256((const __m256i *)(src0 + col));
+      const __m256i v1 = _mm256_lddqu_si256((const __m256i *)(src1 + col));
+      const __m256i vm = _mm256_lddqu_si256((const __m256i *)(mask + col));
+
+      comp_mask_pred_line_avx2(v0, v1, vm, comp_pred);
+      comp_pred += 32;
+    }
+    src0 += stride0;
+    src1 += stride1;
+    mask += mask_stride;
+    row++;
+  } while (row < height);
+}
+
+// High bit-depth blend of sixteen 16-bit pixels. For every lane returns
+//   (a * s0 + ((1 << AOM_BLEND_A64_ROUND_BITS) - a) * s1 + rounding)
+//       >> AOM_BLEND_A64_ROUND_BITS,
+// packed back to 16 bits with signed saturation. `a` holds the mask values
+// already widened to 16 bits.
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+                                                      const __m256i s1,
+                                                      const __m256i a) {
+  const __m256i full_weight =
+      _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i rounding =
+      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  // Complementary weight applied to s1.
+  const __m256i inv_a = _mm256_sub_epi16(full_weight, a);
+
+  // Low four pixels of each 128-bit lane: pair (s0, s1) with (a, inv_a).
+  const __m256i px_lo = _mm256_unpacklo_epi16(s0, s1);
+  const __m256i wt_lo = _mm256_unpacklo_epi16(a, inv_a);
+  const __m256i blend_lo = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(px_lo, wt_lo), rounding),
+      AOM_BLEND_A64_ROUND_BITS);
+
+  // High four pixels of each 128-bit lane.
+  const __m256i px_hi = _mm256_unpackhi_epi16(s0, s1);
+  const __m256i wt_hi = _mm256_unpackhi_epi16(a, inv_a);
+  const __m256i blend_hi = _mm256_srai_epi32(
+      _mm256_add_epi32(_mm256_madd_epi16(px_hi, wt_hi), rounding),
+      AOM_BLEND_A64_ROUND_BITS);
+
+  return _mm256_packs_epi32(blend_lo, blend_hi);
+}
+
+// High bit-depth mask-weighted blend of `pred8` and `ref8` into `comp_pred8`.
+//
+// The three uint8_t pointers are converted with CONVERT_TO_SHORTPTR and then
+// accessed as uint16_t pixels. `pred` and `comp_pred` are contiguous (row
+// pitch == width); `ref` uses ref_stride and `mask` (8-bit weights) uses
+// mask_stride. The mask weights `ref` when invert_mask == 0 and `pred`
+// otherwise; the other source gets the complementary weight.
+//
+// width == 8  : two rows per iteration.
+// width == 16 : one row per iteration.
+// otherwise   : one row per iteration in 32-pixel steps
+//               (NOTE(review): presumably width is a multiple of 32 here,
+//               since each step reads and writes a full 32 pixels - confirm
+//               against callers).
+// Every path executes at least one iteration.
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask) {
+  int i = 0;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  // src0 is the source weighted directly by the mask.
+  const uint16_t *src0 = invert_mask ? pred : ref;
+  const uint16_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  const __m256i zero = _mm256_setzero_si256();
+
+  if (width == 8) {
+    do {
+      // Row 0 goes to the low 128-bit lane, row 1 to the high lane.
+      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+      // Load the matching two mask rows. The second row lives mask_stride
+      // bytes after the first (not a fixed 8 bytes), consistent with the
+      // pointer advance of 2 * mask_stride below.
+      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_h =
+          _mm_loadl_epi64((const __m128i *)(mask + mask_stride));
+
+      __m256i m = _mm256_castsi128_si256(m_l);
+      m = _mm256_insertf128_si256(m, m_h, 1);
+      // Zero-extend the 8 mask bytes of each lane to 16 bits.
+      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+      _mm_storeu_si128((__m128i *)(comp_pred + width),
+                       _mm256_extractf128_si256(comp, 1));
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      comp_pred += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+      // Sixteen mask bytes zero-extended to 16 bits.
+      const __m256i m_16 =
+          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+      _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      for (int x = 0; x < width; x += 32) {
+        // Pixels x..x+15 (s0/s1) and x+16..x+31 (s2/s3) of both sources.
+        const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
+        const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
+        const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
+        const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
+
+        const __m256i m01_16 =
+            _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
+        const __m256i m23_16 = _mm256_cvtepu8_epi16(
+            _mm_loadu_si128((const __m128i *)(mask + x + 16)));
+
+        const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+        const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+        _mm256_storeu_si256((__m256i *)comp_pred, comp);
+        _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+        // comp_pred is contiguous, so it simply advances with the columns.
+        comp_pred += 32;
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      i += 1;
+    } while (i < height);
+  }
+}
+
+// Returns the sum of squared differences between a 4-wide block of 8-bit
+// samples (dst, rows dstride apart) and the matching 16-bit samples (src,
+// rows sstride apart). The loop always reads four rows per iteration.
+uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc_32 = zero;  // eight 32-bit partial sums of squares
+
+  for (int row = 0; row < h; row += 4) {
+    // Gather 4 bytes from each of four dst rows into one 16-byte vector.
+    const __m128i d0 =
+        _mm_cvtsi32_si128(*(int const *)(&dst[(row + 0) * dstride]));
+    const __m128i d1 =
+        _mm_cvtsi32_si128(*(int const *)(&dst[(row + 1) * dstride]));
+    const __m128i d2 =
+        _mm_cvtsi32_si128(*(int const *)(&dst[(row + 2) * dstride]));
+    const __m128i d3 =
+        _mm_cvtsi32_si128(*(int const *)(&dst[(row + 3) * dstride]));
+    const __m128i d_8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(d0, d1),
+                                           _mm_unpacklo_epi32(d2, d3));
+    const __m256i d_16 = _mm256_cvtepu8_epi16(d_8);
+
+    // Gather 4 samples (64 bits) from each of four src rows.
+    const __m128i s0 =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 0) * sstride]));
+    const __m128i s1 =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 1) * sstride]));
+    const __m128i s2 =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 2) * sstride]));
+    const __m128i s3 =
+        _mm_loadl_epi64((__m128i const *)(&src[(row + 3) * sstride]));
+    const __m256i s01 = _mm256_castsi128_si256(_mm_unpacklo_epi64(s0, s1));
+    const __m256i s23 = _mm256_castsi128_si256(_mm_unpacklo_epi64(s2, s3));
+    // Rows 0-1 in the low lane, rows 2-3 in the high lane.
+    const __m256i s_16 = _mm256_permute2x128_si256(s01, s23, 0x20);
+
+    // |src - dst| per sample, then squared and pair-summed into 32 bits.
+    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s_16, d_16));
+    acc_32 = _mm256_add_epi32(acc_32, _mm256_madd_epi16(diff, diff));
+  }
+
+  // Zero-extend the 32-bit partial sums to 64 bits, then add all of them.
+  const __m256i even_64 = _mm256_unpacklo_epi32(acc_32, zero);
+  const __m256i odd_64 = _mm256_unpackhi_epi32(acc_32, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(even_64, odd_64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  uint64_t sum = 0;
+  xx_storel_64(&sum, _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)));
+  return sum;
+}
+
+// Sum of squared differences over four consecutive 4x4 blocks.
+// The src buffer stores each 4x4 block of a 32x32 filter block sequentially,
+// so one block is 16 contiguous 16-bit values and src_blk_stride is the
+// distance between consecutive blocks. dst is a frame buffer: dstride is a
+// frame level stride and each 16-byte dst row spans all four blocks.
+uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                     int src_blk_stride, int h) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc_32 = zero;
+  uint16_t *blk = src;
+
+  for (int row = 0; row < h; row += 4, blk += 16) {
+    // dst row k of all four blocks, widened to 16 bits: dk0 dk1 dk2 dk3.
+    const __m256i d_row0 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 0) * dstride])));
+    const __m256i d_row1 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 1) * dstride])));
+    const __m256i d_row2 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 2) * dstride])));
+    const __m256i d_row3 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 3) * dstride])));
+
+    // All four rows of block n: rn0 rn1 rn2 rn3.
+    const __m256i b0 = _mm256_loadu_si256((__m256i const *)(&blk[0]));
+    const __m256i b1 =
+        _mm256_loadu_si256((__m256i const *)(&blk[src_blk_stride]));
+    const __m256i b2 =
+        _mm256_loadu_si256((__m256i const *)(&blk[2 * src_blk_stride]));
+    const __m256i b3 =
+        _mm256_loadu_si256((__m256i const *)(&blk[3 * src_blk_stride]));
+
+    // Transpose from block-major to row-major so src lines up with dst.
+    const __m256i even01 = _mm256_unpacklo_epi64(b0, b1);  // r00 r10 r02 r12
+    const __m256i odd01 = _mm256_unpackhi_epi64(b0, b1);   // r01 r11 r03 r13
+    const __m256i even23 = _mm256_unpacklo_epi64(b2, b3);  // r20 r30 r22 r32
+    const __m256i odd23 = _mm256_unpackhi_epi64(b2, b3);   // r21 r31 r23 r33
+
+    // r00 r10 r20 r30
+    const __m256i s_row0 = _mm256_permute2f128_si256(even01, even23, 0x20);
+    // r01 r11 r21 r31
+    const __m256i s_row1 = _mm256_permute2f128_si256(odd01, odd23, 0x20);
+    // r02 r12 r22 r32
+    const __m256i s_row2 = _mm256_permute2f128_si256(even01, even23, 0x31);
+    // r03 r13 r23 r33
+    const __m256i s_row3 = _mm256_permute2f128_si256(odd01, odd23, 0x31);
+
+    // Absolute 16-bit differences per row.
+    const __m256i diff0 = _mm256_abs_epi16(_mm256_sub_epi16(s_row0, d_row0));
+    const __m256i diff1 = _mm256_abs_epi16(_mm256_sub_epi16(s_row1, d_row1));
+    const __m256i diff2 = _mm256_abs_epi16(_mm256_sub_epi16(s_row2, d_row2));
+    const __m256i diff3 = _mm256_abs_epi16(_mm256_sub_epi16(s_row3, d_row3));
+
+    // Squares, pair-summed into 32-bit lanes.
+    const __m256i sq0 = _mm256_madd_epi16(diff0, diff0);
+    const __m256i sq1 = _mm256_madd_epi16(diff1, diff1);
+    const __m256i sq2 = _mm256_madd_epi16(diff2, diff2);
+    const __m256i sq3 = _mm256_madd_epi16(diff3, diff3);
+
+    const __m256i sq01 = _mm256_add_epi32(sq0, sq1);
+    const __m256i sq23 = _mm256_add_epi32(sq2, sq3);
+    acc_32 = _mm256_add_epi32(acc_32, _mm256_add_epi32(sq01, sq23));
+  }
+
+  // Zero-extend the 32-bit partial sums to 64 bits, then add all of them.
+  const __m256i even_64 = _mm256_unpacklo_epi32(acc_32, zero);
+  const __m256i odd_64 = _mm256_unpackhi_epi32(acc_32, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(even_64, odd_64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  uint64_t sum = 0;
+  xx_storel_64(&sum, _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)));
+  return sum;
+}
+
+// Returns the sum of squared differences between an 8-wide block of 8-bit
+// samples (dst, rows dstride apart) and the matching 16-bit samples (src,
+// rows sstride apart). The loop always reads two rows per iteration.
+uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc_32 = zero;  // eight 32-bit partial sums of squares
+
+  for (int row = 0; row < h; row += 2) {
+    // Two 8-byte dst rows packed together and widened to 16 bits.
+    const __m128i d_top =
+        _mm_loadl_epi64((__m128i const *)(&dst[(row + 0) * dstride]));
+    const __m128i d_bot =
+        _mm_loadl_epi64((__m128i const *)(&dst[(row + 1) * dstride]));
+    const __m256i d_16 =
+        _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(d_top, d_bot));
+
+    // Two 8-sample src rows, one per 128-bit lane.
+    const __m256i s_top = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[row * sstride]));
+    const __m256i s_bot = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i *)&src[(row + 1) * sstride]));
+    const __m256i s_16 = _mm256_permute2x128_si256(s_top, s_bot, 0x20);
+
+    // |src - dst| per sample, then squared and pair-summed into 32 bits.
+    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s_16, d_16));
+    acc_32 = _mm256_add_epi32(acc_32, _mm256_madd_epi16(diff, diff));
+  }
+
+  // Zero-extend the 32-bit partial sums to 64 bits, then add all of them.
+  const __m256i even_64 = _mm256_unpacklo_epi32(acc_32, zero);
+  const __m256i odd_64 = _mm256_unpackhi_epi32(acc_32, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(even_64, odd_64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  uint64_t sum = 0;
+  xx_storel_64(&sum, _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)));
+  return sum;
+}
+
+// Sum of squared differences over two consecutive 8x8 blocks.
+// The src buffer stores each 8x8 block of a 64x64 filter block sequentially,
+// so one block row pair is 16 contiguous 16-bit values and src_blk_stride is
+// the distance between the two blocks. dst is a frame buffer: dstride is a
+// frame level stride and each 16-byte dst row spans both blocks.
+uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                     int src_blk_stride, int h) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i acc_32 = zero;
+  uint16_t *blk = src;
+
+  for (int row = 0; row < h; row += 2, blk += 16) {
+    // dst row k of both blocks, widened to 16 bits: dk0 dk1.
+    const __m256i d_row0 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 0) * dstride])));
+    const __m256i d_row1 = _mm256_cvtepu8_epi16(
+        _mm_loadu_si128((__m128i *)(&dst[(row + 1) * dstride])));
+
+    // Two rows of the first block (r00 r01) and of the second (r10 r11).
+    const __m256i b0 = _mm256_loadu_si256((__m256i const *)(&blk[0]));
+    const __m256i b1 =
+        _mm256_loadu_si256((__m256i const *)(&blk[src_blk_stride]));
+
+    // Regroup by row: r00 r10 and r01 r11.
+    const __m256i s_row0 = _mm256_permute2f128_si256(b0, b1, 0x20);
+    const __m256i s_row1 = _mm256_permute2f128_si256(b0, b1, 0x31);
+
+    const __m256i diff0 = _mm256_abs_epi16(_mm256_sub_epi16(s_row0, d_row0));
+    const __m256i diff1 = _mm256_abs_epi16(_mm256_sub_epi16(s_row1, d_row1));
+
+    // Squares, pair-summed into 32-bit lanes, then accumulated.
+    const __m256i sq01 = _mm256_add_epi32(_mm256_madd_epi16(diff0, diff0),
+                                          _mm256_madd_epi16(diff1, diff1));
+    acc_32 = _mm256_add_epi32(acc_32, sq01);
+  }
+
+  // Zero-extend the 32-bit partial sums to 64 bits, then add all of them.
+  const __m256i even_64 = _mm256_unpacklo_epi32(acc_32, zero);
+  const __m256i odd_64 = _mm256_unpackhi_epi32(acc_32, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(even_64, odd_64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  uint64_t sum = 0;
+  xx_storel_64(&sum, _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)));
+  return sum;
+}
+
+// Forwards to the 4-wide or 8-wide MSE kernel depending on w. Any other
+// width trips the assert and, in release builds, returns (uint64_t)-1.
+uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must be satisfied");
+  if (w == 4) return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+  if (w == 8) return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+  assert(0 && "unsupported width");
+  return -1;
+}
+
+// MSE over a 16-pixel-wide span made of two 8x8 or four 4x4 consecutive
+// blocks; luma uses 8x8 blocks and chroma uses 4x4 blocks. Each block of a
+// filter block is stored sequentially in src, so the distance between blocks
+// passed to the kernels is w * h. dst is a frame buffer and dstride is a
+// frame level stride.
+uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                 int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must be satisfied");
+  if (w == 4) return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+  if (w == 8) return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+  assert(0 && "unsupported width");
+  return -1;
+}
+
+// Accumulates one 32-pixel row of src/ref differences.
+// set_one_minusone multiplies the interleaved (src, ref) bytes in maddubs;
+// with the (+1, -1) byte pattern the callers in this file pass, each 16-bit
+// lane becomes src - ref. sum_8x16[] collects those 16-bit differences and
+// sse_8x16[] the 32-bit pair sums of their squares. Index 0 holds the
+// unpacklo half of the row, index 1 the unpackhi half.
+static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src,
+                                          const uint8_t *ref,
+                                          __m256i set_one_minusone,
+                                          __m256i sse_8x16[2],
+                                          __m256i sum_8x16[2]) {
+  const __m256i src_row = _mm256_loadu_si256((__m256i const *)(src));
+  const __m256i ref_row = _mm256_loadu_si256((__m256i const *)(ref));
+
+  const __m256i diff_lo = _mm256_maddubs_epi16(
+      _mm256_unpacklo_epi8(src_row, ref_row), set_one_minusone);
+  const __m256i diff_hi = _mm256_maddubs_epi16(
+      _mm256_unpackhi_epi8(src_row, ref_row), set_one_minusone);
+
+  const __m256i sq_lo = _mm256_madd_epi16(diff_lo, diff_lo);
+  const __m256i sq_hi = _mm256_madd_epi16(diff_hi, diff_hi);
+
+  sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], sq_lo);
+  sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], sq_hi);
+  sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff_lo);
+  sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff_hi);
+}
+
+// Reduces the two sse and two sum accumulators produced by
+// calc_sum_sse_wd32_avx2. Adds the grand totals to *tot_sse and *tot_sum and
+// returns a register laid out as s0 s1 s2 s3 | d0 d1 d2 d3, where sN / dN
+// are the 32-bit sse / sum of the N-th group of 8 columns.
+static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16,
+                                         unsigned int *tot_sse, int *tot_sum) {
+  // 32-bit sse partials: s00 s01 s10 s11 | s20 s21 s30 s31
+  const __m256i sse_pairs = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]);
+
+  // 16-bit sum partials: d00 d01 d02 d03 d10 d11 d12 d13 | d20 .. d33
+  const __m256i sum_pairs = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]);
+  // d00 d01 d10 d11 (repeated) | d20 d21 d30 d31 (repeated)
+  const __m256i sum_quads = _mm256_hadd_epi16(sum_pairs, sum_pairs);
+  // Bring both lanes' low 64 bits together: d00 d01 d10 d11 d20 d21 d30 d31
+  const __m256i sum_packed = _mm256_permute4x64_epi64(sum_quads, 0x08);
+  // Sign-extend those eight 16-bit sums to 32 bits.
+  const __m256i sum_32 =
+      _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_packed));
+
+  // s0 s1 d0 d1 | s2 s3 d2 d3
+  const __m256i mixed = _mm256_hadd_epi32(sse_pairs, sum_32);
+  // s0 s1 s2 s3 | d0 d1 d2 d3
+  const __m256i ordered = _mm256_permute4x64_epi64(mixed, 0xd8);
+
+  // Two more horizontal adds leave the sse total in element 0 and the sum
+  // total in element 4.
+  const __m256i halves = _mm256_hadd_epi32(ordered, ordered);
+  const __m256i totals = _mm256_hadd_epi32(halves, halves);
+
+  *tot_sse += (uint32_t)_mm256_extract_epi32(totals, 0);
+  *tot_sum += _mm256_extract_epi32(totals, 4);
+  return ordered;
+}
+
+// Computes sse, sum and variance for each of four horizontally adjacent
+// 8-column blocks (one 32-pixel-wide strip, h rows tall). sse8x8, sum8x8 and
+// var8x8 each receive four 32-bit values; the strip totals are added to
+// *tot_sse and *tot_sum. The variance uses sum * sum >> 6, i.e. a divisor of
+// 64 samples per block.
+static INLINE void get_var_sse_sum_8x8_quad_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref,
+    const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8,
+    unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) {
+  assert(h <= 128);  // May overflow for larger height.
+  // Bytes (0x01, 0xff) in every 16-bit lane: maddubs then yields src - ref.
+  const __m256i plus_minus_one = _mm256_set1_epi16((short)0xff01);
+  __m256i sse_acc[2], sum_acc[2];
+  sse_acc[0] = sse_acc[1] = _mm256_setzero_si256();
+  sum_acc[0] = sum_acc[1] = _mm256_setzero_si256();
+
+  // One 32-pixel row per iteration.
+  for (int row = 0; row < h; ++row, src += src_stride, ref += ref_stride) {
+    calc_sum_sse_wd32_avx2(src, ref, plus_minus_one, sse_acc, sum_acc);
+  }
+
+  // s0 s1 s2 s3 | d0 d1 d2 d3
+  const __m256i sse_then_sum =
+      calc_sum_sse_order(sse_acc, sum_acc, tot_sse, tot_sum);
+  const __m128i sse_4x32 = _mm256_castsi256_si128(sse_then_sum);
+  const __m128i sum_4x32 = _mm256_extractf128_si256(sse_then_sum, 1);
+
+  _mm_storeu_si128((__m128i *)sse8x8, sse_4x32);
+  _mm_storeu_si128((__m128i *)sum8x8, sum_4x32);
+
+  // variance = sse - ((sum * sum) >> 6), per block
+  const __m128i mean_sq =
+      _mm_srli_epi32(_mm_mullo_epi32(sum_4x32, sum_4x32), 6);
+  _mm_storeu_si128((__m128i *)var8x8, _mm_sub_epi32(sse_4x32, mean_sq));
+}
+
+// Computes sse and variance for each of two horizontally adjacent
+// 16-column blocks (one 32-pixel-wide strip, h rows tall). sse16x16 and
+// var16x16 each receive two 32-bit values; the strip totals are added to
+// *tot_sse and *tot_sum. The variance uses sum * sum >> 8, i.e. a divisor of
+// 256 samples per block.
+static INLINE void get_var_sse_sum_16x16_dual_avx2(
+    const uint8_t *src, int src_stride, const uint8_t *ref,
+    const int ref_stride, const int h, uint32_t *sse16x16,
+    unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) {
+  assert(h <= 128);  // May overflow for larger height.
+  // Bytes (0x01, 0xff) in every 16-bit lane: maddubs then yields src - ref.
+  const __m256i plus_minus_one = _mm256_set1_epi16((short)0xff01);
+  __m256i sse_acc[2], sum_acc[2];
+  sse_acc[0] = sse_acc[1] = _mm256_setzero_si256();
+  sum_acc[0] = sum_acc[1] = _mm256_setzero_si256();
+
+  // One 32-pixel row per iteration.
+  for (int row = 0; row < h; ++row, src += src_stride, ref += ref_stride) {
+    calc_sum_sse_wd32_avx2(src, ref, plus_minus_one, sse_acc, sum_acc);
+  }
+
+  // s0 s1 s2 s3 | d0 d1 d2 d3 (per group of 8 columns)
+  const __m256i sse_then_sum =
+      calc_sum_sse_order(sse_acc, sum_acc, tot_sse, tot_sum);
+
+  // Merge neighbouring 8-column groups into 16-column blocks:
+  // s0+s1 s2+s3 x x | d0+d1 d2+d3 x x
+  const __m256i per_block = _mm256_hadd_epi32(sse_then_sum, sse_then_sum);
+  const __m128i sse_2x32 = _mm256_castsi256_si128(per_block);
+  const __m128i sum_2x32 = _mm256_extractf128_si256(per_block, 1);
+
+  // Only the low two 32-bit values are stored.
+  _mm_storel_epi64((__m128i *)sse16x16, sse_2x32);
+
+  // variance = sse - ((sum * sum) >> 8), per block
+  const __m128i mean_sq =
+      _mm_srli_epi32(_mm_mullo_epi32(sum_2x32, sum_2x32), 8);
+  _mm_storel_epi64((__m128i *)var16x16, _mm_sub_epi32(sse_2x32, mean_sq));
+}
+
+// Public entry point: runs the 8x8-quad sse/sum/variance kernel on a strip
+// that is 8 rows tall.
+void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
+                                       int source_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       uint32_t *sse8x8, int *sum8x8,
+                                       unsigned int *tot_sse, int *tot_sum,
+                                       uint32_t *var8x8) {
+  const int strip_rows = 8;
+  get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
+                                strip_rows, sse8x8, sum8x8, tot_sse, tot_sum,
+                                var8x8);
+}
+
+// Public entry point: runs the 16x16-dual sse/variance kernel on a strip
+// that is 16 rows tall.
+void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
+                                         int source_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         uint32_t *sse16x16,
+                                         unsigned int *tot_sse, int *tot_sum,
+                                         uint32_t *var16x16) {
+  const int strip_rows = 16;
+  get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
+                                  strip_rows, sse16x16, tot_sse, tot_sum,
+                                  var16x16);
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 0000000000..9e9e70ea01
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { // 2-tap weights as repeated (tap0, tap1) byte pairs, 32 bytes per offset; read in this file at (offset << 5). Only 256 of the 512 declared bytes are initialised, the rest are implicitly zero.
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, // offset 0: taps (16, 0)
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, // offset 1: taps (14, 2)
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, // offset 2: taps (12, 4)
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, // offset 3: taps (10, 6)
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // offset 4: taps (8, 8)
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, // offset 5: taps (6, 10)
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, // offset 6: taps (4, 12)
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, // offset 7: taps (2, 14)
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+/* clang-format on */
+
+#define FILTER_SRC(filter) /* in-place 2-tap filter of exp_src_lo / exp_src_hi: (maddubs(x, filter) + pw8) >> 4; expects exp_src_lo, exp_src_hi and pw8 in the expanding scope */ \
+ /* multiply each interleaved byte pair by the filter taps and add */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add pw8 (rounding term for the shift below) */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide by 16 (arithmetic shift right by 4) */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) /* byte-interleave src_reg with reg (within each 128-bit lane) into exp_src_lo / exp_src_hi; with reg == zero this widens bytes to 16 bits */ \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* unaligned 32-byte loads from src and dst into src_reg / dst_reg */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) /* size_stride is 1 (next pixel) or src_stride (next row) at the call sites in this file; overwrites src_next_reg */ \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* rounding byte average of the current source and the one size_stride away */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) /* loads the source size_stride bytes further on into src_next_reg and interleaves it with src_reg, ready for FILTER_SRC */ \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each dst byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+ /* calculate sum (16-bit lanes) */ \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse (32-bit lanes) */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// Final reduction: sign-extends the 16-bit lanes of sum_reg to 32 bits (via the res_cmp sign mask), horizontally adds sum_reg and sse_reg, then writes the totals to `sum` and `*sse`.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+// Helper macros related to sub pixel variance width 16: each 256-bit register holds two 16-byte rows, the first row in the low lane and the following row in the high lane.
+#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ /* load 2 rows each of source and destination and combine them */ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+// Two-row counterpart of AVG_NEXT_SRC: src_reg holds the rows at src and
+// src + src_stride. Loads the sources size_stride bytes past each of those
+// rows into src_next_reg and replaces src_reg with the rounding byte average.
+// The high-lane address is src + (src_stride + size_stride), matching
+// MERGE_NEXT_SRC_INSERT. For size_stride == src_stride this equals
+// src + 2 * src_stride, and it is also correct for other offsets such as 1.
+// Expects src, src_stride and src_next_reg in the expanding scope.
+#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) /* two-row version of MERGE_NEXT_SRC: loads src + size_stride and src + src_stride + size_stride into src_next_reg, then interleaves with src_reg; uses src_stride from the expanding scope */ \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define LOAD_SRC_NEXT_BYTE_INSERT \
+ /* src_reg: 16 bytes of the current row (low lane) and of the next row (high lane) */ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ /* src_next_reg: the same two rows, starting one byte further on */ \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
+
+#define LOAD_DST_INSERT /* dst_reg: 16 bytes at dst (low lane) and at dst + dst_stride (high lane) */ \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define LOAD_SRC_MERGE_128BIT(filter) /* declares locals: src_lo / src_hi hold one 16-byte row at src interleaved with the row at src + 1; filter_128bit / pw8_128bit are the low 128 bits of filter / pw8 */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \
+ __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \
+ __m128i filter_128bit = _mm256_castsi256_si128(filter); \
+ __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
+
+#define FILTER_SRC_128BIT(filter) /* 128-bit version of FILTER_SRC, applied in place to src_lo / src_hi; expects src_lo, src_hi and pw8_128bit in the expanding scope */ \
+ /* multiply each interleaved byte pair by the filter taps and add */ \
+ src_lo = _mm_maddubs_epi16(src_lo, filter); \
+ src_hi = _mm_maddubs_epi16(src_hi, filter); \
+ \
+ /* add pw8_128bit (rounding term for the shift below) */ \
+ src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
+ src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
+ \
+ /* divide by 16 (arithmetic shift right by 4) */ \
+ src_lo = _mm_srai_epi16(src_lo, 4); \
+ src_hi = _mm_srai_epi16(src_hi, 4);
+
+// TODO(chiyotsai@google.com): These variance functions are macro-fied so we
+// don't have to manually optimize the individual for-loops. We could save some
+// binary size by optimizing the loops more carefully without duplicating the
+// codes with a macro.
+#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) /* defines aom_sub_pixel_variance32x<height>_imp_avx2 (returns the sum of differences and stores the sse in *sse) plus the public aom_sub_pixel_variance32x<height>_avx2 wrapper */ \
+ static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0: no interpolation */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = 4: average with the next row */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; /* 32 bytes per filter in bilinear_filters_avx2 */ \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0: average with the next pixel */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* average between previous average to current average */ \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ /* accumulate sum and sse for this row */ \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); /* sse - sum^2 / (32 * 2^log2height) */ \
+ }
+
+// Instantiate the 32-pixel-wide sub-pixel variance functions for heights 64,
+// 32 and 16. The second argument is log2(height); the generated wrapper uses
+// it in the shift (5 + log2height) applied to sum * sum.
+MAKE_SUB_PIXEL_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_VAR_32XH(16, 4)
+
+// Builds aom_sub_pixel_variance<w>x<h>_avx2() for blocks that are larger than
+// one kernel call. The w x h block is cut into wf x hf tiles, each tile is
+// measured by aom_sub_pixel_variance<wf>x<hf>_imp_avx2(), and the per-tile
+// sums and SSEs are accumulated before the variance is formed.
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+  unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+    unsigned int total_sse = 0; \
+    int total_sum = 0; \
+    /* Visit the tiles column by column, top to bottom within a column. */ \
+    for (int col = 0; col < (w / wf); ++col) { \
+      const uint8_t *const src_col = src + col * wf; \
+      const uint8_t *const dst_col = dst + col * wf; \
+      for (int row = 0; row < (h / hf); ++row) { \
+        unsigned int tile_sse; \
+        total_sum += aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \
+            src_col + row * hf * src_stride, src_stride, x_offset, y_offset, \
+            dst_col + row * hf * dst_stride, dst_stride, &tile_sse); \
+        total_sse += tile_sse; \
+      } \
+    } \
+    *sse_ptr = total_sse; \
+    const int64_t sum_sq = (int64_t)total_sum * total_sum; \
+    return total_sse - (unsigned int)(sum_sq >> (wlog2 + hlog2)); \
+  }
+
+// Note: hf = AOMMIN(h, 64), i.e. the tile height is capped at 64 to avoid
+// overflow in the helper.
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
+
+// MAKE_SUB_PIXEL_VAR_16XH(height, log2height) defines
+// aom_sub_pixel_variance16x<height>_avx2(). The generated function picks one
+// of nine code paths from x_offset and y_offset: 0, 4, or any other value
+// (which selects a filter from bilinear_filters_avx2 at index offset << 5).
+// Every loop advances two rows per iteration (i += 2). When both offsets are
+// non-zero the loop stops two rows early and the last two rows are handled by
+// the code that follows it. The return value is
+// *sse - ((sum * sum) >> (4 + log2height)).
+// NOTE(review): sum and *sse are presumably produced by CALC_SUM_AND_SSE,
+// which is defined outside this excerpt -- confirm there.
+#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \
+ unsigned int aom_sub_pixel_variance16x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ AVG_NEXT_SRC_INSERT(src_reg, src_stride) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ /* average src_reg with src_next_reg */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg, src_temp; \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ src_temp = _mm256_avg_epu8(src_avg, src_temp); \
+ LOAD_DST_INSERT \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_temp, zero_reg) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ __m256i filter, pw8, src_next_reg, src_avg, src_temp; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_temp) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_next_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(filter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+ /* merge previous pack with the current pack */ \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(xfilter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ FILTER_SRC(yfilter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \
+ }
+
+// 16-wide instantiations; heights 64 and 4 are only built when the encoder is
+// not configured as realtime-only.
+MAKE_SUB_PIXEL_VAR_16XH(32, 5)
+MAKE_SUB_PIXEL_VAR_16XH(16, 4)
+MAKE_SUB_PIXEL_VAR_16XH(8, 3)
+#if !CONFIG_REALTIME_ONLY
+MAKE_SUB_PIXEL_VAR_16XH(64, 6)
+MAKE_SUB_PIXEL_VAR_16XH(4, 2)
+#endif
+
+// MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) defines two functions:
+//  - aom_sub_pixel_avg_variance32x<height>_imp_avx2(), which walks `height`
+//    rows, averages each processed source row with a row of `sec`
+//    (_mm256_avg_epu8, sec advancing by sec_stride per row) and returns `sum`;
+//  - aom_sub_pixel_avg_variance32x<height>_avx2(), which calls it with a sec
+//    stride of 32 and returns *sse - ((sum * sum) >> (5 + log2height)).
+// x_offset / y_offset values of 0 and 4 have dedicated paths; any other value
+// loads a filter from bilinear_filters_avx2 at index offset << 5. In the
+// filtered paths the 16-bit filter output is packed back to bytes
+// (_mm256_packus_epi16) before it is averaged with sec.
+// NOTE(review): sum and *sse are presumably produced by CALC_SUM_AND_SSE,
+// which is defined outside this excerpt -- confirm there.
+#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
+ int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
+ unsigned int *sse) { \
+ __m256i sec_reg; \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* average between previous average to current average */ \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* average between previous pack to the current */ \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load source and another source starting from the next */ \
+ /* following byte */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+ /* convert each 16 bit to 8 bit to each low and high lane source */ \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse, \
+ const uint8_t *sec_ptr) { \
+ const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
+ sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
+
+// 32-wide averaging instantiations for heights 64, 32 and 16.
+MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
+
+// Builds aom_sub_pixel_avg_variance<w>x<h>_avx2() for blocks that are larger
+// than one kernel call. The w x h block is cut into wf x hf tiles; each tile
+// is measured by aom_sub_pixel_avg_variance<wf>x<hf>_imp_avx2() with `sec`
+// addressed using a row stride of w, and the per-tile sums and SSEs are
+// accumulated before the variance is formed.
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+      const uint8_t *sec) { \
+    unsigned int total_sse = 0; \
+    int total_sum = 0; \
+    /* Visit the tiles column by column, top to bottom within a column. */ \
+    for (int col = 0; col < (w / wf); ++col) { \
+      const uint8_t *const src_col = src + col * wf; \
+      const uint8_t *const dst_col = dst + col * wf; \
+      const uint8_t *const sec_col = sec + col * wf; \
+      for (int row = 0; row < (h / hf); ++row) { \
+        unsigned int tile_sse; \
+        total_sum += aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+            src_col + row * hf * src_stride, src_stride, x_offset, y_offset, \
+            dst_col + row * hf * dst_stride, dst_stride, \
+            sec_col + row * hf * w, w, &tile_sse); \
+        total_sse += tile_sse; \
+      } \
+    } \
+    *sse_ptr = total_sse; \
+    const int64_t sum_sq = (int64_t)total_sum * total_sum; \
+    return total_sse - (unsigned int)(sum_sq >> (wlog2 + hlog2)); \
+  }
+
+// Note: hf = AOMMIN(h, 64), i.e. the tile height is capped at 64 to avoid
+// overflow in the helper.
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 0000000000..699002195b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+// First pass of the two-tap bilinear filter used by sub-pixel variance:
+//   b[x] = round(a[x] * filter[0] + a[x + 1] * filter[1])
+// written as uint16_t. Widths of 8 or more are processed 8 outputs at a time;
+// narrower blocks use a single 64-bit load per row. pixel_step is unused.
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  (void)pixel_step;
+
+  // filter[0], filter[1] may be {128, 0}, and 128 would overflow in
+  // _mm_maddubs_epi16. Halve both taps ({128, 0} -> {64, 0}) and work with
+  // FILTER_BITS - 1 for the rounding constant and the shift instead.
+  const int8_t tap0 = (int8_t)(filter[0] >> 1);
+  const int8_t tap1 = (int8_t)(filter[1] >> 1);
+  const __m128i taps =
+      _mm_setr_epi8(tap0, tap1, tap0, tap1, tap0, tap1, tap0, tap1, tap0, tap1,
+                    tap0, tap1, tap0, tap1, tap0, tap1);
+  const int16_t half = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i rounding = _mm_set1_epi16(half);
+
+  if (output_width < 8) {
+    // Narrow path: one 64-bit load per row, then a byte shuffle builds the
+    // (a[x], a[x + 1]) pairs.
+    const __m128i pair_mask =
+        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+    for (unsigned int row = 0; row < output_height; ++row) {
+      // Only the first 5 loaded values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      const __m128i pixels = xx_loadl_64(a);
+
+      // After the shuffle, up to the first 8 bytes are useful:
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], ... }
+      const __m128i pairs = _mm_shuffle_epi8(pixels, pair_mask);
+
+      const __m128i filtered = _mm_maddubs_epi16(pairs, taps);
+      const __m128i rounded =
+          _mm_srai_epi16(_mm_add_epi16(filtered, rounding), FILTER_BITS - 1);
+
+      xx_storel_64(b, rounded);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+    return;
+  }
+
+  for (unsigned int row = 0; row < output_height; ++row) {
+    for (unsigned int col = 0; col < output_width; col += 8, a += 8, b += 8) {
+      // Two overlapping 8-byte loads, one byte apart.
+      const __m128i left = xx_loadl_64(a);
+      const __m128i right = xx_loadl_64(a + 1);
+
+      // Interleave to:
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      const __m128i pairs = _mm_unpacklo_epi8(left, right);
+
+      // b[i] = a[i] * filter[0] + a[i + 1] * filter[1], then round.
+      const __m128i filtered = _mm_maddubs_epi16(pairs, taps);
+      const __m128i rounded =
+          _mm_srai_epi16(_mm_add_epi16(filtered, rounding), FILTER_BITS - 1);
+
+      xx_storeu_128(b, rounded);
+    }
+
+    // Move from the end of this row's span to the start of the next row.
+    a += src_pixels_per_line - output_width;
+  }
+}
+
+// Second pass of the two-tap bilinear filter used by sub-pixel variance:
+//   b[x] = round(a[x] * filter[0] + a[x + pixel_step] * filter[1])
+// reading the uint16_t intermediate and writing bytes, four outputs per
+// iteration.
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t half = (1 << FILTER_BITS) >> 1;
+  const __m128i rounding = _mm_set1_epi32(half);
+  const __m128i taps =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  // Interleaves the 16-bit lanes of the low and high 64-bit halves.
+  const __m128i interleave_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  // Gathers the lowest byte of each 32-bit lane into bytes 0..3.
+  const __m128i narrow_mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+
+  for (unsigned int row = 0; row < output_height; ++row) {
+    for (unsigned int col = 0; col < output_width; col += 4, a += 4, b += 4) {
+      // Load as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      const __m128i cur = xx_loadl_64(a);
+      const __m128i next = xx_loadl_64(a + pixel_step);
+      const __m128i both = _mm_unpacklo_epi64(cur, next);
+
+      // Reorder to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      const __m128i pairs = _mm_shuffle_epi8(both, interleave_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1], then round.
+      const __m128i filtered = _mm_madd_epi16(pairs, taps);
+      const __m128i rounded =
+          _mm_srai_epi32(_mm_add_epi32(filtered, rounding), FILTER_BITS);
+
+      // Keep the low 8 bits of every 32-bit result.
+      xx_storel_32(b, _mm_shuffle_epi8(rounded, narrow_mask));
+    }
+
+    // Move from the end of this row's span to the start of the next row.
+    a += src_pixels_per_line - output_width;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..faec9cf73d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// Returns the sum of squares of the 256 int16_t values starting at src
+// (32 vectors of 8 values each).
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+  __m128i acc = _mm_setzero_si128();
+
+  for (int k = 0; k < 32 * 8; k += 8) {
+    const __m128i vals = xx_loadu_128(src + k);
+    // madd squares each value and adds adjacent pairs into 32-bit lanes.
+    acc = _mm_add_epi32(acc, _mm_madd_epi16(vals, vals));
+  }
+
+  // Horizontal add of the four 32-bit lanes into lane 0.
+  const __m128i pairs = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 4));
+  return (unsigned int)_mm_cvtsi128_si32(total);
+}
+
+// Loads 4 bytes from each of two consecutive rows (p and p + stride) and
+// widens the 8 bytes to 16-bit lanes: row 0 in lanes 0..3, row 1 in 4..7.
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i row0 = _mm_cvtsi32_si128(loadu_int32(p));
+  const __m128i row1 = _mm_cvtsi32_si128(loadu_int32(p + stride));
+  const __m128i rows = _mm_unpacklo_epi32(row0, row1);
+  return _mm_unpacklo_epi8(rows, _mm_setzero_si128());
+}
+
+// Loads 8 bytes from p and zero-extends them to eight 16-bit lanes.
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bytes = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(bytes, zero);
+}
+
+// Loads 16 bytes from p (unaligned) and zero-extends them to 16-bit lanes:
+// out[0] receives bytes 0..7, out[1] receives bytes 8..15.
+static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bytes = _mm_loadu_si128((const __m128i *)p);
+  out[0] = _mm_unpacklo_epi8(bytes, zero);
+  out[1] = _mm_unpackhi_epi8(bytes, zero);
+}
+
+// Horizontally adds the four 32-bit lanes of val and returns the total.
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  // Lanes {0,1} += lanes {2,3}, then lane 0 += lane 1.
+  const __m128i pairs = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 4));
+  return (unsigned int)_mm_cvtsi128_si32(total);
+}
+
+// Reduces eight signed 16-bit lanes to four 32-bit lanes: result lane k holds
+// sum[k] + sum[k + 4], each sign-extended before the add.
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  // Duplicating each 16-bit lane and shifting right arithmetically by 16
+  // sign-extends it to 32 bits.
+  const __m128i lo_dup = _mm_unpacklo_epi16(sum, sum);
+  const __m128i hi_dup = _mm_unpackhi_epi16(sum, sum);
+  const __m128i lo32 = _mm_srai_epi32(lo_dup, 16);
+  const __m128i hi32 = _mm_srai_epi32(hi_dup, 16);
+  return _mm_add_epi32(lo32, hi32);
+}
+
+// Accumulates one vector of 16-bit samples: adds (src - ref) to the 16-bit
+// lanes of *sum and the pairwise sums of (src - ref)^2 to the 32-bit lanes of
+// *sse.
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i delta = _mm_sub_epi16(src, ref);
+  const __m128i delta_sq = _mm_madd_epi16(delta, delta);
+  *sse = _mm_add_epi32(*sse, delta_sq);
+  *sum = _mm_add_epi16(*sum, delta);
+}
+
+// Final reduction for blocks of up to 128 pixels (such as 8x16 or 16x8).
+// The difference sum of 128 pixels still fits in a 16-bit integer, so the
+// whole fold stays in 16-bit lanes; slightly faster than
+// variance_final_256_pel_sse2().
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  // Fold the eight 16-bit lanes into lane 0: 8 -> 4 -> 2 -> 1.
+  const __m128i fold4 = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  const __m128i fold2 = _mm_add_epi16(fold4, _mm_srli_si128(fold4, 4));
+  const __m128i fold1 = _mm_add_epi16(fold2, _mm_srli_si128(fold2, 2));
+
+  *sse = add32x4_sse2(vsse);
+  *sum = (int16_t)_mm_extract_epi16(fold1, 0);
+}
+
+// Final reduction for blocks of up to 256 pixels (such as 16x16). The fold
+// stops at two 16-bit lanes, which are then added as ints.
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  // Fold the eight 16-bit lanes down to lanes 0 and 1: 8 -> 4 -> 2.
+  const __m128i fold4 = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  const __m128i fold2 = _mm_add_epi16(fold4, _mm_srli_si128(fold4, 4));
+  const int lane0 = (int16_t)_mm_extract_epi16(fold2, 0);
+  const int lane1 = (int16_t)_mm_extract_epi16(fold2, 1);
+
+  *sse = add32x4_sse2(vsse);
+  *sum = lane0 + lane1;
+}
+
+// Final reduction for blocks of up to 512 pixels (such as 16x32 or 32x16).
+// One 16-bit fold is done first; the remaining four lanes are sign-extended
+// to 32 bits before being added.
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  // 8 -> 4 lanes in 16 bits.
+  const __m128i fold4 = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  // Sign-extend the low four 16-bit lanes to 32 bits.
+  const __m128i dup = _mm_unpacklo_epi16(fold4, fold4);
+  const __m128i wide = _mm_srai_epi32(dup, 16);
+
+  *sse = add32x4_sse2(vsse);
+  *sum = (int)add32x4_sse2(wide);
+}
+
+// Final reduction for blocks of up to 1024 pixels (such as 32x32). All eight
+// 16-bit lanes are widened to 32 bits before any of them are added.
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+                                                unsigned int *const sse,
+                                                int *const sum) {
+  const __m128i wide = sum_to_32bit_sse2(vsum);
+
+  *sse = add32x4_sse2(vsse);
+  *sum = (int)add32x4_sse2(wide);
+}
+
+// Accumulates difference sums and squared differences for a 4-wide block,
+// two rows per iteration. *sum is reset here; *sse is only added to, so it
+// keeps whatever value it held on entry.
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 256);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int rows_left = h; rows_left > 0; rows_left -= 2) {
+    const __m128i src_rows = load4x2_sse2(src, src_stride);
+    const __m128i ref_rows = load4x2_sse2(ref, ref_stride);
+    variance_kernel_sse2(src_rows, ref_rows, sse, sum);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
+}
+
+// Accumulates difference sums and squared differences for an 8-wide block,
+// one row per iteration. Both *sum and *sse are reset on entry.
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 128);  // May overflow for larger height.
+  const __m128i zero = _mm_setzero_si128();
+  *sum = zero;
+  *sse = zero;
+
+  for (int row = 0; row < h; ++row) {
+    const __m128i src_row = load8_8to16_sse2(src);
+    const __m128i ref_row = load8_8to16_sse2(ref);
+    variance_kernel_sse2(src_row, ref_row, sse, sum);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Accumulates 16 pixels: loads 16 bytes each from src and ref, widens them to
+// 16 bits and feeds the low and then the high eight samples to
+// variance_kernel_sse2().
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m128i *const sse,
+                                          __m128i *const sum) {
+  __m128i src16[2];
+  __m128i ref16[2];
+  load16_8to16_sse2(src, src16);
+  load16_8to16_sse2(ref, ref16);
+
+  variance_kernel_sse2(src16[0], ref16[0], sse, sum);
+  variance_kernel_sse2(src16[1], ref16[1], sse, sum);
+}
+
+// Accumulates difference sums and squared differences for a 16-wide block.
+// *sum is reset here; *sse is only added to, so it keeps whatever value it
+// held on entry.
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 64);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  int rows_left = h;
+  while (rows_left-- > 0) {
+    variance16_kernel_sse2(src, ref, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Runs variance16_kernel_sse2() over h rows of a 32-pixel-wide column, as two
+// 16-pixel halves per row (left half first).
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 32);  // May overflow for larger height.
+  // Only the sum restarts from zero: sse keeps accumulating across calls, so
+  // it is deliberately left untouched here.
+  *sum = _mm_setzero_si128();
+
+  for (int row = 0; row < h; ++row) {
+    for (int x = 0; x < 32; x += 16) {
+      variance16_kernel_sse2(src + x, ref + x, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Runs variance16_kernel_sse2() over h rows of a 64-pixel-wide column, as four
+// 16-pixel segments per row, left to right.
+// *sum is cleared here; *sse is not reset by this function.
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 16);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int row = 0; row < h; ++row) {
+    for (int x = 0; x < 64; x += 16) {
+      variance16_kernel_sse2(src + x, ref + x, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Runs variance16_kernel_sse2() over h rows of a 128-pixel-wide column, as
+// eight 16-pixel segments per row, left to right.
+// *sum is cleared here; *sse is not reset by this function.
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m128i *const sse,
+                                    __m128i *const sum) {
+  assert(h <= 8);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int row = 0; row < h; ++row) {
+    // Offsets 0, 16, ..., 112: the same order as pairs at (32 * j, 32 * j + 16).
+    for (int x = 0; x < 128; x += 16) {
+      variance16_kernel_sse2(src + x, ref + x, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks
+// (one 8-row by 32-column strip).
+//   sse8x8[k], sum8x8[k], var8x8[k]: per-block results for k = 0..3.
+//   *tot_sse, *tot_sum: the four per-block values are ADDED to whatever the
+//   caller already stored there.
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       uint32_t *sse8x8, int *sum8x8,
+                                       unsigned int *tot_sse, int *tot_sum,
+                                       uint32_t *var8x8) {
+  for (int blk = 0; blk < 4; ++blk) {
+    const int col = blk * 8;
+    __m128i sum_acc = _mm_setzero_si128();
+    __m128i sse_acc = _mm_setzero_si128();
+
+    for (int row = 0; row < 8; ++row) {
+      const uint8_t *const src_row = src_ptr + row * src_stride;
+      const uint8_t *const ref_row = ref_ptr + row * ref_stride;
+      const __m128i s = load8_8to16_sse2(src_row + col);
+      const __m128i r = load8_8to16_sse2(ref_row + col);
+      const __m128i diff = _mm_sub_epi16(s, r);
+
+      // madd squares each 16-bit difference and adds neighbours into 32 bits.
+      sse_acc = _mm_add_epi32(sse_acc, _mm_madd_epi16(diff, diff));
+      sum_acc = _mm_add_epi16(sum_acc, diff);
+    }
+    variance_final_128_pel_sse2(sse_acc, sum_acc, &sse8x8[blk], &sum8x8[blk]);
+  }
+
+  // Strip totals first, then the per-block variance (64 pixels => >> 6).
+  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+  for (int blk = 0; blk < 4; ++blk) {
+    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
+    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
+  }
+}
+
+// Computes SSE and variance for two horizontally adjacent 16x16 blocks (one
+// 16-row by 32-column strip).
+//   sse16x16[k], var16x16[k]: per-block results for k = 0..1.
+//   *tot_sse, *tot_sum: the per-block values are ADDED to whatever the caller
+//   already stored there. Per-block sums are kept in a local array only.
+void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         uint32_t *sse16x16,
+                                         unsigned int *tot_sse, int *tot_sum,
+                                         uint32_t *var16x16) {
+  int sum16x16[2] = { 0 };
+
+  for (int blk = 0; blk < 2; ++blk) {
+    const int col = blk * 16;
+    __m128i sum_acc = _mm_setzero_si128();
+    __m128i sse_acc = _mm_setzero_si128();
+
+    for (int row = 0; row < 16; ++row) {
+      __m128i s[2];
+      __m128i r[2];
+      load16_8to16_sse2(src_ptr + row * src_stride + col, s);
+      load16_8to16_sse2(ref_ptr + row * ref_stride + col, r);
+
+      const __m128i diff_lo = _mm_sub_epi16(s[0], r[0]);
+      const __m128i diff_hi = _mm_sub_epi16(s[1], r[1]);
+      sse_acc = _mm_add_epi32(sse_acc, _mm_madd_epi16(diff_lo, diff_lo));
+      sse_acc = _mm_add_epi32(sse_acc, _mm_madd_epi16(diff_hi, diff_hi));
+      sum_acc = _mm_add_epi16(sum_acc, _mm_add_epi16(diff_lo, diff_hi));
+    }
+    variance_final_256_pel_sse2(sse_acc, sum_acc, &sse16x16[blk],
+                                &sum16x16[blk]);
+  }
+
+  // Strip totals first, then the per-block variance (256 pixels => >> 8).
+  *tot_sse += sse16x16[0] + sse16x16[1];
+  *tot_sum += sum16x16[0] + sum16x16[1];
+  for (int blk = 0; blk < 2; ++blk) {
+    const int64_t sum_sq = (int64_t)sum16x16[blk] * sum16x16[blk];
+    var16x16[blk] = sse16x16[blk] - (uint32_t)(sum_sq >> 8);
+  }
+}
+
+// Defines aom_variance<bw>x<bh>_sse2() for block sizes that are handled by a
+// single call to variance<bw>_sse2() over all bh rows.
+//   bw, bh     - block width and height in pixels.
+//   bits       - right shift applied to sum * sum; every instantiation below
+//                passes log2(bw * bh).
+//   max_pixels - selects the variance_final_<max_pixels>_pel_sse2() reduction.
+// The generated function stores the sum of squared differences in *sse and
+// returns *sse - ((sum * sum) >> bits).
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum; \
+ int sum = 0; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128)
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128)
+
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128)
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512)
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024)
+
+// Additional block sizes that are only built when the encoder is not
+// configured as realtime-only.
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024)
+#endif
+
+// Defines aom_variance<bw>x<bh>_sse2() for block sizes that are too tall for
+// one variance<bw>_sse2() call (see the height asserts in those helpers).
+//   bw, bh - block width and height in pixels.
+//   bits   - right shift applied to sum * sum; every instantiation below
+//            passes log2(bw * bh).
+//   uh     - unit height: rows handed to variance<bw>_sse2() per iteration.
+// The block is processed in bh / uh units. After each unit the helper's sum
+// vector is widened with sum_to_32bit_sse2() and added to a 32-bit running
+// total, while vsse is passed to every call and keeps accumulating.
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum = _mm_setzero_si128(); \
+ for (int i = 0; i < (bh / uh); ++i) { \
+ __m128i vsum16; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
+ src += (src_stride * uh); \
+ ref += (ref_stride * uh); \
+ } \
+ *sse = add32x4_sse2(vsse); \
+ int sum = (int)add32x4_sse2(vsum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32) // 32x32 * ( 64/32 )
+
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16) // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16) // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16) // 64x16 * ( 128/16 )
+
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8) // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8) // 128x8 * ( 128/8 )
+
+// 64x16 fits in a single variance64_sse2() call (h <= 16), so it uses the
+// non-looping generator.
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024)
+#endif
+
+// 8x8 MSE entry point: runs the 8x8 variance routine only for the *sse value
+// it writes, discards the variance it returns, and returns *sse.
+unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse) {
+  (void)aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  const unsigned int sum_sq_err = *sse;
+  return sum_sq_err;
+}
+
+// 8x16 MSE entry point: runs the 8x16 variance routine only for the *sse
+// value it writes, discards the variance it returns, and returns *sse.
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  (void)aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  const unsigned int sum_sq_err = *sse;
+  return sum_sq_err;
+}
+
+// 16x8 MSE entry point: runs the 16x8 variance routine only for the *sse
+// value it writes, discards the variance it returns, and returns *sse.
+unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  (void)aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  const unsigned int sum_sq_err = *sse;
+  return sum_sq_err;
+}
+
+// 16x16 MSE entry point: runs the 16x16 variance routine only for the *sse
+// value it writes, discards the variance it returns, and returns *sse.
+unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  (void)aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  const unsigned int sum_sq_err = *sse;
+  return sum_sq_err;
+}
+
+// Prototypes for the sub-pixel variance helpers implemented in
+// subpel_variance.asm, one per strip width (4, 8, 16) and instruction set
+// (sse2, ssse3). Each helper handles a strip of `height` rows, writes the sum
+// of squared errors to *sse and returns an int that the wrappers below
+// accumulate as the signed error sum.
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+// Defines aom_sub_pixel_variance<w>x<h>_<opt>() on top of the strip helper
+// aom_sub_pixel_variance<wf>xh_<opt>().
+//   w, h         - block width and height in pixels.
+//   wf           - width of the helper used; the block is covered by w / wf
+//                  vertical strips.
+//   wlog2, hlog2 - log2 of w and h; their sum is the shift applied to se * se.
+//   cast         - cast applied to se before the multiplication.
+//   cast_prod    - cast applied to the product before the shift.
+// Each strip is processed in h / hf chunks of hf = min(h, 64) rows. The
+// helpers' return values are summed into se and their sse outputs into sse.
+// The generated function stores sse in *sse_ptr and returns
+// sse - ((se * se) >> (wlog2 + hlog2)).
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+// Instantiation tables for the sub-pixel variance wrappers.
+// Columns: w, h, wf, wlog2, hlog2, opt, cast_prod, cast.
+// The non-realtime table additionally generates the 4x16, 16x4, 8x32, 32x8,
+// 16x64 and 64x16 sizes; all other rows are the same in both tables.
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+// Emit the wrappers for both instruction-set variants, then retire the
+// generator macros so the names can be reused.
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// Prototypes for the compound-average sub-pixel variance helpers, one per
+// strip width (4, 8, 16) and instruction set (sse2, ssse3). Compared with the
+// plain sub-pixel helpers they take an extra second-predictor buffer `sec`
+// with its own stride.
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
+// Defines aom_sub_pixel_avg_variance<w>x<h>_<opt>() on top of the strip
+// helper aom_sub_pixel_avg_variance<wf>xh_<opt>().
+//   w, h         - block width and height in pixels.
+//   wf           - width of the helper used; the block is covered by w / wf
+//                  vertical strips.
+//   wlog2, hlog2 - log2 of w and h; their sum is the shift applied to se * se.
+//   cast         - cast applied to se before the multiplication.
+//   cast_prod    - cast applied to the product before the shift.
+// Each strip is processed in h / hf chunks of hf = min(h, 64) rows. The
+// second predictor `sec` is addressed with a stride of w (it is passed to the
+// helper with stride w and advanced by hf * w per chunk).
+// The generated function stores sse in *sse_ptr and returns
+// sse - ((se * se) >> (wlog2 + hlog2)).
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+// Instantiation tables for the compound-average sub-pixel variance wrappers.
+// Columns: w, h, wf, wlog2, hlog2, opt, cast_prod, cast.
+// The non-realtime table additionally generates the 4x16, 16x4, 8x32, 32x8,
+// 16x64 and 64x16 sizes; all other rows are the same in both tables.
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+// Emit the wrappers for both instruction-set variants, then retire the
+// generator macros.
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// Blends eight 16-bit pixels from s0 and s1 using the per-pixel weights in a.
+// With R = AOM_BLEND_A64_ROUND_BITS, each output lane is
+//   (s0 * a + s1 * ((1 << R) - a) + ((1 << R) >> 1)) >> R
+// computed in 32 bits and packed back to 16 bits with signed saturation.
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+                                                      const __m128i s1,
+                                                      const __m128i a) {
+  const __m128i round_offset =
+      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+  const __m128i full_weight = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m128i a_inv = _mm_sub_epi16(full_weight, a);
+
+  // Interleaving (s0, s1) with (a, a_inv) lets one madd produce
+  // s0 * a + s1 * a_inv for each pixel. Low four pixels:
+  __m128i blend_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+                                    _mm_unpacklo_epi16(a, a_inv));
+  blend_lo = _mm_add_epi32(blend_lo, round_offset);
+  blend_lo = _mm_srai_epi32(blend_lo, AOM_BLEND_A64_ROUND_BITS);
+
+  // High four pixels:
+  __m128i blend_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+                                    _mm_unpackhi_epi16(a, a_inv));
+  blend_hi = _mm_add_epi32(blend_hi, round_offset);
+  blend_hi = _mm_srai_epi32(blend_hi, AOM_BLEND_A64_ROUND_BITS);
+
+  return _mm_packs_epi32(blend_lo, blend_hi);
+}
+
+// Builds a masked compound prediction for high-bitdepth (16-bit) pixels.
+//   comp_pred8  - output, written as a packed buffer with a stride of `width`.
+//   pred8       - first predictor, read with a stride of `width`.
+//   ref8        - second predictor, read with a stride of `ref_stride`.
+//   mask        - 8-bit blend weights, read with a stride of `mask_stride`.
+//   invert_mask - when zero the mask weight applies to ref and its complement
+//                 to pred; when non-zero the two roles are swapped.
+// The three 8-bit pointers are converted with CONVERT_TO_SHORTPTR before use.
+// There are dedicated paths for width 8 and width 16; every other width goes
+// through the generic path, which steps in units of 32 pixels.
+// NOTE(review): the generic path presumably requires width to be a multiple
+// of 32 - confirm that callers never pass other widths (e.g. 4).
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ // src0 is the source that receives the mask weight, src1 the complement.
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ // One 8-pixel vector per row; 8 mask bytes are widened to 16 bits.
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ // Two 8-pixel vectors per row; 16 mask bytes are split into two halves.
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ // Generic path: each row is handled in 32-pixel groups, each group as two
+ // 16-pixel halves (j = 0, 1).
+ do {
+ for (int x = 0; x < width; x += 32) {
+ for (int j = 0; j < 2; j++) {
+ const __m128i s0 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16));
+ const __m128i s1 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16));
+
+ const __m128i m_8 =
+ _mm_loadu_si128((const __m128i *)(mask + x + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ // The output pointer itself walks the row, so no x offset is applied to
+ // the stores above.
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i += 1;
+ } while (i < height);
+ }
+}
+
+// Returns the sum of squared differences between a 4-pixel-wide column of
+// 8-bit pixels (dst, stride dstride) and the matching 16-bit samples (src,
+// stride sstride) over h rows. Two rows are processed per iteration.
+//
+// The 4-byte row loads from dst are assembled byte by byte instead of being
+// read through an `int` pointer: dst is a uint8_t buffer with no alignment
+// guarantee, so dereferencing it as `int` is an aliasing violation and a
+// potentially misaligned access. The assembled value is the same one a
+// little-endian 32-bit load yields, which is what this x86-only code relied on.
+uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  uint64_t sum = 0;
+  __m128i dst0_8x8, dst1_8x8, dst_16x8;
+  __m128i src0_16x4, src1_16x4, src_16x8;
+  __m128i res0_32x4, res0_64x2, res1_64x2;
+  __m128i sub_result_16x8;
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i square_result = _mm_setzero_si128();
+  for (int i = 0; i < h; i += 2) {
+    const uint8_t *const row0 = &dst[(i + 0) * dstride];
+    const uint8_t *const row1 = &dst[(i + 1) * dstride];
+    const uint32_t row0_bits = (uint32_t)row0[0] | ((uint32_t)row0[1] << 8) |
+                               ((uint32_t)row0[2] << 16) |
+                               ((uint32_t)row0[3] << 24);
+    const uint32_t row1_bits = (uint32_t)row1[0] | ((uint32_t)row1[1] << 8) |
+                               ((uint32_t)row1[2] << 16) |
+                               ((uint32_t)row1[3] << 24);
+    dst0_8x8 = _mm_cvtsi32_si128((int)row0_bits);
+    dst1_8x8 = _mm_cvtsi32_si128((int)row1_bits);
+    // Pack both rows side by side and zero-extend the 8 bytes to 16 bits.
+    dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
+
+    src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+    src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+    src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
+
+    sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+    // Square and pair-wise add into four 32-bit partial sums.
+    res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
+
+    // Widen the partial sums to 64 bits before accumulating.
+    res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+    res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
+
+    square_result =
+        _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
+  }
+  // Fold the two 64-bit lanes into one and store the low lane.
+  const __m128i sum_64x1 =
+      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+  xx_storel_64(&sum, sum_64x1);
+  return sum;
+}
+
+// Returns the sum of squared differences between an 8-pixel-wide column of
+// 8-bit pixels (dst, stride dstride) and the matching 16-bit samples (src,
+// stride sstride) over h rows, accumulated in 64 bits.
+uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int h) {
+  const __m128i zeros = _mm_setzero_si128();
+  __m128i acc_64x2 = _mm_setzero_si128();
+
+  for (int row = 0; row < h; ++row) {
+    // Eight 8-bit pixels, zero-extended to 16 bits.
+    const __m128i dst_u8 =
+        _mm_loadl_epi64((__m128i const *)(&dst[(row + 0) * dstride]));
+    const __m128i dst_u16 = _mm_unpacklo_epi8(dst_u8, zeros);
+    const __m128i src_u16 = _mm_loadu_si128((__m128i *)&src[row * sstride]);
+
+    const __m128i diff = _mm_sub_epi16(src_u16, dst_u16);
+    const __m128i sq_32x4 = _mm_madd_epi16(diff, diff);
+
+    // Widen the four 32-bit partial sums to 64 bits and accumulate.
+    const __m128i sq_lo_64x2 = _mm_unpacklo_epi32(sq_32x4, zeros);
+    const __m128i sq_hi_64x2 = _mm_unpackhi_epi32(sq_32x4, zeros);
+    acc_64x2 = _mm_add_epi64(acc_64x2, _mm_add_epi64(sq_lo_64x2, sq_hi_64x2));
+  }
+
+  // Fold the two 64-bit lanes into one and store the low lane.
+  uint64_t sum = 0;
+  const __m128i folded = _mm_add_epi64(acc_64x2, _mm_srli_si128(acc_64x2, 8));
+  xx_storel_64(&sum, folded);
+  return sum;
+}
+
+// Dispatches on the block width to the 4- or 8-pixel-wide kernel. Any other
+// width trips an assert in debug builds and yields (uint64_t)-1 otherwise.
+uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w=8/4 and h=8/4 must satisfy");
+  if (w == 4) {
+    return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+  }
+  if (w == 8) {
+    return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+  }
+  assert(0 && "unsupported width");
+  return (uint64_t)-1;
+}
+
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ const int num_blks = 16 / w;
+ uint64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h);
+ dst += w;
+ src += (w * h);
+ }
+ return sum;
+}
diff --git a/third_party/aom/aom_mem/aom_mem.c b/third_party/aom/aom_mem/aom_mem.c
new file mode 100644
index 0000000000..807ddcf05e
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/aom_mem_intrnl.h"
+#include "aom/aom_integer.h"
+
+// Returns the number of extra bytes requested from malloc() on top of the
+// caller's size: up to (align - 1) bytes of slack plus ADDRESS_STORAGE_SIZE
+// bytes for the stored malloc address.
+static size_t GetAllocationPaddingSize(size_t align) {
+  assert(align > 0);
+  assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE);
+  const size_t alignment_slack = align - 1;
+  return alignment_slack + ADDRESS_STORAGE_SIZE;
+}
+
+// Checks that nmemb * size plus the alignment padding stays within the
+// allocation limit (AOM_MAX_ALLOCABLE_MEMORY when defined, SIZE_MAX otherwise).
+// Returns 1 when the request is acceptable and 0 when it would overflow.
+// A request for zero elements is always acceptable.
+static int check_size_argument_overflow(size_t nmemb, size_t size,
+                                        size_t align) {
+  if (nmemb == 0) return 1;
+  const size_t alloc_padding = GetAllocationPaddingSize(align);
+  // Dividing the remaining budget by nmemb avoids computing nmemb * size,
+  // which is the product that might overflow.
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+  assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding);
+  assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX);
+  return size <= (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb;
+#else
+  return size <= (SIZE_MAX - alloc_padding) / nmemb;
+#endif
+}
+
+// Returns the size_t-sized slot immediately before `mem`; this is where the
+// address obtained from malloc() is kept for an aligned block.
+static size_t *GetMallocAddressLocation(void *const mem) {
+  size_t *const as_words = (size_t *)mem;
+  return as_words - 1;
+}
+
+// Records `malloc_addr` (the pointer malloc() returned) in the slot just
+// before the aligned block `mem`.
+static void SetActualMallocAddress(void *const mem,
+                                   const void *const malloc_addr) {
+  *GetMallocAddressLocation(mem) = (size_t)malloc_addr;
+}
+
+// Reads back the malloc() pointer stored in the slot just before the aligned
+// block `mem`.
+static void *GetActualMallocAddress(void *const mem) {
+  const size_t stored_addr = *GetMallocAddressLocation(mem);
+  return (void *)stored_addr;
+}
+
+// Allocates `size` bytes at an address produced by aom_align_addr(.., align).
+// The block is over-allocated with malloc(); the malloc() pointer is stored
+// just before the returned address so that aom_free() can recover it.
+// Returns NULL if the padded size fails the overflow check or malloc() fails.
+void *aom_memalign(size_t align, size_t size) {
+  if (!check_size_argument_overflow(1, size, align)) return NULL;
+
+  const size_t padded_size = size + GetAllocationPaddingSize(align);
+  void *const raw = malloc(padded_size);
+  if (raw == NULL) return NULL;
+
+  // Leave room for the stored address, then round up to the alignment.
+  void *const aligned =
+      aom_align_addr((unsigned char *)raw + ADDRESS_STORAGE_SIZE, align);
+  SetActualMallocAddress(aligned, raw);
+  return aligned;
+}
+
+// Allocates `size` bytes through aom_memalign() with DEFAULT_ALIGNMENT.
+void *aom_malloc(size_t size) {
+  void *const block = aom_memalign(DEFAULT_ALIGNMENT, size);
+  return block;
+}
+
+// Allocates a zero-filled block of num * size bytes via aom_malloc().
+// Returns NULL if num * size (plus padding) fails the overflow check or the
+// allocation fails.
+void *aom_calloc(size_t num, size_t size) {
+  const int fits = check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT);
+  if (!fits) return NULL;
+
+  const size_t byte_count = num * size;
+  void *const block = aom_malloc(byte_count);
+  if (block == NULL) return NULL;
+
+  memset(block, 0, byte_count);
+  return block;
+}
+
+// Releases a block obtained from aom_memalign()/aom_malloc()/aom_calloc() by
+// freeing the malloc() pointer stored in front of it. NULL is ignored.
+void aom_free(void *memblk) {
+  if (memblk == NULL) return;
+  free(GetActualMallocAddress(memblk));
+}
diff --git a/third_party/aom/aom_mem/aom_mem.cmake b/third_party/aom/aom_mem/aom_mem.cmake
new file mode 100644
index 0000000000..346588d2db
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Include guard: skip the rest of this file if it has already been processed.
+if(AOM_AOM_MEM_AOM_MEM_CMAKE_)
+ return()
+endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_
+set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1)
+
+# Source and header files that make up the aom_mem object library.
+list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c"
+ "${AOM_ROOT}/aom_mem/aom_mem.h"
+ "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+# Creates the aom_mem build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+#
+# aom_mem is an OBJECT library, so its object files are added directly to the
+# sources of the aom target (and of aom_static when BUILD_SHARED_LIBS is on).
+# AOM_LIB_TARGETS is updated in the caller's scope to include aom_mem.
+function(setup_aom_mem_targets)
+ add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_mem>)
+ endif()
+endfunction()
diff --git a/third_party/aom/aom_mem/aom_mem.h b/third_party/aom/aom_mem/aom_mem.h
new file mode 100644
index 0000000000..ca4af7fc61
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_AOM_MEM_H_
+#define AOM_AOM_MEM_AOM_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if defined(__uClinux__)
+#include <lddk.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Upper bound, in bytes, on a single allocation request. It can be overridden
+// by defining AOM_MAX_ALLOCABLE_MEMORY before this header is included; the
+// default depends on whether size_t is wider than 32 bits.
+#ifndef AOM_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 32)
+#define AOM_MAX_ALLOCABLE_MEMORY 8589934592 // 8 GB
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif
+
+// Allocates `size` bytes aligned via aom_align_addr(.., align). Returns NULL
+// when the padded size is too large or the underlying malloc() fails. The
+// result must be released with aom_free().
+void *aom_memalign(size_t align, size_t size);
+// Same as aom_memalign() with the library's default alignment.
+void *aom_malloc(size_t size);
+// Allocates num * size bytes with the default alignment and zero-fills them.
+// Returns NULL when num * size is too large or the allocation fails.
+void *aom_calloc(size_t num, size_t size);
+// Releases a block returned by the functions above. Passing NULL is a no-op.
+void aom_free(void *memblk);
+
+// Fills `length` consecutive 16-bit elements starting at dest with `val`
+// (converted to uint16_t) and returns dest. `length` counts elements, not
+// bytes.
+static INLINE void *aom_memset16(void *dest, int val, size_t length) {
+  uint16_t *const out = (uint16_t *)dest;
+  const uint16_t fill = (uint16_t)val;
+  size_t idx;
+  for (idx = 0; idx < length; ++idx) out[idx] = fill;
+  return dest;
+}
+
+/* Rounds addr up to the next multiple of align and returns it as void *.
+ * The result is computed by adding (align - 1) and masking with ~(align - 1),
+ * which only yields a multiple of align when align is a power of two. */
+#define aom_align_addr(addr, align) \
+ (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
+
+#include <string.h>
+
+#ifdef AOM_MEM_PLTFRM
+#include AOM_MEM_PLTFRM
+#endif
+
+// AOM_CHECK_MEM_ERROR(error_info, lval, expr)
+// Assigns expr to lval and, if the result is null/zero, reports
+// AOM_CODEC_MEM_ERROR through aom_internal_error(). With CONFIG_DEBUG the
+// message also carries the source file and line of the failing allocation.
+// lval is expanded more than once, so it should be free of side effects.
+// NOTE(review): what happens after aom_internal_error() returns (if it does)
+// is not visible here - confirm before relying on lval being non-null.
+#if CONFIG_DEBUG
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
+ do { \
+ lval = (expr); \
+ if (!lval) \
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval " at %s:%d", __FILE__, \
+ __LINE__); \
+ } while (0)
+#else
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
+ do { \
+ lval = (expr); \
+ if (!lval) \
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval); \
+ } while (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // AOM_AOM_MEM_AOM_MEM_H_
diff --git a/third_party/aom/aom_mem/include/aom_mem_intrnl.h b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
new file mode 100644
index 0000000000..2c9819de92
--- /dev/null
+++ b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+
+#include "config/aom_config.h"
+
+/* Number of bytes reserved in front of every aligned block; the allocator in
+   aom_mem.c stores the address returned by malloc() there as a size_t. */
+#define ADDRESS_STORAGE_SIZE sizeof(size_t)
+
+/* DEFAULT_ALIGNMENT may be predefined by the build; otherwise it is 32 on
+   VxWorks and twice the pointer size elsewhere. */
+#ifndef DEFAULT_ALIGNMENT
+#if defined(VXWORKS)
+/* Default address alignment used by the aom_* allocation functions other
+   than aom_memalign. */
+#define DEFAULT_ALIGNMENT 32
+#else
+#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
+#endif
+#endif
+
+#endif // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
diff --git a/third_party/aom/aom_ports/aarch32_cpudetect.c b/third_party/aom/aom_ports/aarch32_cpudetect.c
new file mode 100644
index 0000000000..753f957112
--- /dev/null
+++ b/third_party/aom/aom_ports/aarch32_cpudetect.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+// Feature detection code for Armv7-A / AArch32.
+
+#include "arm_cpudetect.h"
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+// Build-time capability set, used when runtime CPU detection is disabled.
+static int arm_get_cpu_caps(void) {
+  // Ideally this would never matter: without runtime detection there are no
+  // RTCD tables to adjust, because every function is bound statically.
+#if HAVE_NEON
+  return HAS_NEON;
+#else
+  return 0;
+#endif  // HAVE_NEON
+}
+
+#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT
+
+// Runtime detection for MSVC builds: executes a Neon instruction inside a
+// structured-exception handler and reports HAS_NEON only if it does not raise
+// an illegal-instruction exception.
+static int arm_get_cpu_caps(void) {
+ int flags = 0;
+#if HAVE_NEON
+ // MSVC has no inline __asm support for Arm, but it does let you __emit
+ // instructions via their assembled hex code.
+ // All of these instructions should be essentially nops.
+ __try {
+ // VORR q0,q0,q0
+ __emit(0xF2200150);
+ // Only reached when the instruction above executed without faulting.
+ flags |= HAS_NEON;
+ } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+ // Ignore exception.
+ }
+#endif // HAVE_NEON
+ return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+// Runtime detection through the Android cpu-features library: HAS_NEON is
+// reported when android_getCpuFeatures() has the Neon bit set.
+static int arm_get_cpu_caps(void) {
+  int caps = 0;
+#if HAVE_NEON
+  const uint64_t features = android_getCpuFeatures();
+  if ((features & ANDROID_CPU_ARM_FEATURE_NEON) != 0) {
+    caps |= HAS_NEON;
+  }
+#endif  // HAVE_NEON
+  return caps;
+}
+
+#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH32_HWCAP_NEON (1 << 12)
+
+// Runtime detection on Linux: reads the AT_HWCAP auxiliary-vector entry and
+// reports HAS_NEON when the Neon hwcap bit is set.
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  // The hwcap word is fetched inside the feature guard so that a build with
+  // HAVE_NEON == 0 is not left with an unused variable.
+  const unsigned long hwcap = getauxval(AT_HWCAP);
+  if (hwcap & AOM_AARCH32_HWCAP_NEON) flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+#else // end __linux__
+#error \
+ "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+// Returns the CPU capability flags for this process. If arm_cpu_env_flags()
+// reports success, the flags it produced are returned as-is; otherwise the
+// detected capabilities are ANDed with arm_cpu_env_mask().
+int aom_arm_cpu_caps(void) {
+  int env_flags = 0;
+  if (!arm_cpu_env_flags(&env_flags)) {
+    return arm_get_cpu_caps() & arm_cpu_env_mask();
+  }
+  return env_flags;
+}
diff --git a/third_party/aom/aom_ports/aarch64_cpudetect.c b/third_party/aom/aom_ports/aarch64_cpudetect.c
new file mode 100644
index 0000000000..43d5a149c8
--- /dev/null
+++ b/third_party/aom/aom_ports/aarch64_cpudetect.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "arm_cpudetect.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+// Build-time-only variant, used when CONFIG_RUNTIME_CPU_DETECT is off.
+//
+// With runtime detection disabled there are no RTCD tables and every function
+// is bound statically, so the value returned here cannot change which code
+// runs. It merely mirrors the compile-time HAVE_NEON setting.
+static int arm_get_cpu_caps(void) {
+#if HAVE_NEON
+  const int caps = HAS_NEON;
+#else
+  const int caps = 0;
+#endif  // HAVE_NEON
+  return caps;
+}
+
+#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
+
+// sysctlbyname() parameter documentation for instruction set characteristics:
+// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+// Returns true when sysctlbyname() succeeds for |feature| and yields a
+// non-zero value; returns false when the query fails or the value is zero.
+static INLINE bool have_feature(const char *feature) {
+  int64_t value = 0;
+  size_t value_size = sizeof(value);
+  const int rc = sysctlbyname(feature, &value, &value_size, NULL, 0);
+  return rc == 0 && value != 0;
+}
+
+// Runtime detection on Apple platforms via have_feature() sysctl queries.
+// Returns a bitmask of HAS_* flags.
+static int arm_get_cpu_caps(void) {
+  int caps = 0;
+#if HAVE_NEON
+  // Neon is reported unconditionally; no sysctl query is made for it.
+  caps |= HAS_NEON;
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if (have_feature("hw.optional.armv8_crc32")) {
+    caps |= HAS_ARM_CRC32;
+  }
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if (have_feature("hw.optional.arm.FEAT_DotProd")) {
+    caps |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (have_feature("hw.optional.arm.FEAT_I8MM")) {
+    caps |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+  return caps;
+}
+
+#elif defined(_WIN32) // end __APPLE__
+
+// Runtime detection on Windows via IsProcessorFeaturePresent().
+// Parameter documentation:
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+// Returns a bitmask of HAS_* flags. I8MM and SVE are never reported here: no
+// detection for them was available on Windows when this was written.
+static int arm_get_cpu_caps(void) {
+  int caps = 0;
+#if HAVE_NEON
+  caps |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) {
+    caps |= HAS_ARM_CRC32;
+  }
+#endif  // HAVE_ARM_CRC32
+// PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE first appeared in Windows SDK 20348
+// (Windows 11 / Windows Server 2022); with older SDKs the dot-product query
+// is compiled out.
+#if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+    caps |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+  return caps;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+// Variant used when ANDROID_USE_CPU_FEATURES_LIB is defined.
+// Only the Neon baseline is reported; no optional feature is queried.
+static int arm_get_cpu_caps(void) {
+#if HAVE_NEON
+  const int caps = HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#else
+  const int caps = 0;
+#endif  // HAVE_NEON
+  return caps;
+}
+
+#elif defined(__linux__) // end defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define AOM_AARCH64_HWCAP_CRC32 (1 << 7)
+#define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define AOM_AARCH64_HWCAP_SVE (1 << 22)
+#define AOM_AARCH64_HWCAP2_I8MM (1 << 13)
+
+// Runtime detection from the Linux auxiliary vector (AT_HWCAP / AT_HWCAP2).
+// Returns a bitmask of HAS_* flags.
+static int arm_get_cpu_caps(void) {
+  // Both words are fetched up front, in this order, whatever HAVE_* says.
+  const unsigned long hwcap_bits = getauxval(AT_HWCAP);
+  const unsigned long hwcap2_bits = getauxval(AT_HWCAP2);
+  int caps = 0;
+#if HAVE_NEON
+  caps |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_ARM_CRC32
+  if ((hwcap_bits & AOM_AARCH64_HWCAP_CRC32) != 0) {
+    caps |= HAS_ARM_CRC32;
+  }
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if ((hwcap_bits & AOM_AARCH64_HWCAP_ASIMDDP) != 0) {
+    caps |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  // I8MM is the only flag taken from the second capability word.
+  if ((hwcap2_bits & AOM_AARCH64_HWCAP2_I8MM) != 0) {
+    caps |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if ((hwcap_bits & AOM_AARCH64_HWCAP_SVE) != 0) {
+    caps |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+  return caps;
+}
+
+#elif defined(__Fuchsia__) // end __linux__
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM
+#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
+#endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif
+
+// Runtime detection on Fuchsia via zx_system_get_features().
+// Returns a bitmask of HAS_* flags.
+static int arm_get_cpu_caps(void) {
+  int caps = 0;
+#if HAVE_NEON
+  caps |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  uint32_t cpu_features;
+  if (zx_system_get_features(ZX_FEATURE_KIND_CPU, &cpu_features) != ZX_OK) {
+    // The query failed: report only what was established above.
+    return caps;
+  }
+#if HAVE_ARM_CRC32
+  if ((cpu_features & ZX_ARM64_FEATURE_ISA_CRC32) != 0) {
+    caps |= HAS_ARM_CRC32;
+  }
+#endif  // HAVE_ARM_CRC32
+#if HAVE_NEON_DOTPROD
+  if ((cpu_features & ZX_ARM64_FEATURE_ISA_DP) != 0) {
+    caps |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if ((cpu_features & ZX_ARM64_FEATURE_ISA_I8MM) != 0) {
+    caps |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if ((cpu_features & ZX_ARM64_FEATURE_ISA_SVE) != 0) {
+    caps |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+  return caps;
+}
+
+#else // end __Fuchsia__
+#error \
+ "Runtime CPU detection selected, but no CPU detection method " \
+"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
+#endif
+
+// Public entry point: returns the HAS_* capability bitmask for this process.
+//
+// The starting value is either the override supplied through
+// arm_cpu_env_flags() or the detected capabilities ANDed with
+// arm_cpu_env_mask(). In both cases the dependency rules below are then
+// applied, so a flag is never reported without the features it relies on.
+int aom_arm_cpu_caps(void) {
+  int caps = 0;
+  const bool overridden = arm_cpu_env_flags(&caps);
+  if (!overridden) {
+    caps = arm_get_cpu_caps() & arm_cpu_env_mask();
+  }
+
+  // FEAT_I8MM code paths assume FEAT_DotProd is available.
+  if ((caps & HAS_NEON_DOTPROD) == 0) {
+    caps &= ~HAS_NEON_I8MM;
+  }
+
+  // SVE code paths assume both FEAT_DotProd and FEAT_I8MM are available.
+  if ((caps & HAS_NEON_DOTPROD) == 0 || (caps & HAS_NEON_I8MM) == 0) {
+    caps &= ~HAS_SVE;
+  }
+
+  return caps;
+}
diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h
new file mode 100644
index 0000000000..680120feea
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_once.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_ONCE_H_
+#define AOM_AOM_PORTS_AOM_ONCE_H_
+
+#include "config/aom_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: This function uses static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ * aom_once(foo);
+ * ...
+ * aom_once(foo);
+ *
+ * file2.c:
+ * aom_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ * aom_once(foo);
+ * aom_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+/* One-time-initialization state shared by every aom_once() call in this
+ * compilation unit. It lives at global scope because function-local
+ * initializers are not thread-safe in MSVC before Visual Studio 2015.
+ */
+static INIT_ONCE aom_init_once = INIT_ONCE_STATIC_INIT;
+
+/* Runs func() if InitOnceBeginInitialize() reports that initialization is
+ * still pending, then marks it complete. Does nothing otherwise.
+ */
+static void aom_once(void (*func)(void)) {
+  BOOL pending;
+  InitOnceBeginInitialize(&aom_init_once, 0, &pending, NULL);
+  if (pending) {
+    func();
+    InitOnceComplete(&aom_init_once, 0, NULL);
+  }
+}
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+/* pthread flavour: forwards func to pthread_once() using a single control
+ * object that is static to this function.
+ */
+static void aom_once(void (*func)(void)) {
+  static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+  pthread_once(&once_control, func);
+}
+
+#else
+/* Default version that performs no synchronization. */
+
+/* Calls func() unless the static |done| flag is already set, then sets the
+ * flag. No locking is performed.
+ */
+static void aom_once(void (*func)(void)) {
+  static volatile int done;
+
+  if (done) return;
+  func();
+  done = 1;
+}
+#endif
+
+#endif // AOM_AOM_PORTS_AOM_ONCE_H_
diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake
new file mode 100644
index 0000000000..8fd2ffd078
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_ports.cmake
@@ -0,0 +1,96 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+ return()
+endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
+set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
+
+list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
+ "${AOM_ROOT}/aom_ports/aom_timer.h" "${AOM_ROOT}/aom_ports/bitops.h"
+ "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+ "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
+ "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+ "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
+
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
+
+list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
+
+list(APPEND AOM_PORTS_SOURCES_AARCH32
+ "${AOM_ROOT}/aom_ports/aarch32_cpudetect.c")
+list(APPEND AOM_PORTS_SOURCES_AARCH64
+ "${AOM_ROOT}/aom_ports/aarch64_cpudetect.c")
+
+if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK)
+ include_directories(${ANDROID_NDK}/sources/android/cpufeatures)
+ list(APPEND AOM_PORTS_SOURCES_ARM
+ "${ANDROID_NDK}/sources/android/cpufeatures/cpu-features.c")
+endif()
+
+list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
+ "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
+
+# For arm, ppc, and (Xcode or Windows) x86_64 targets:
+#
+# * Creates the aom_ports build target, adds the includes in aom_ports to the
+# target, and makes libaom depend on it.
+#
+# Otherwise:
+#
+# * Adds the includes in aom_ports to the libaom target.
+#
+# For all target platforms:
+#
+# * The libaom target must exist before this function is called.
+function(setup_aom_ports_targets)
+ # Step 1: create the aom_ports target for the CPU/generator combinations
+ # that need one. aom_ports_has_symbols records that the target exists.
+ if(XCODE AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
+ # Only the Xcode branch sets aom_ports_is_embedded; when set, the
+ # target_sources(aom_ports ...) call further down is skipped.
+ set(aom_ports_is_embedded 1)
+ set(aom_ports_has_symbols 1)
+ elseif(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
+ set(aom_ports_has_symbols 1)
+ elseif("${AOM_TARGET_CPU}" STREQUAL "arm64")
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH64})
+ set(aom_ports_has_symbols 1)
+ elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+ # Reached only for arm CPUs other than arm64, which is handled above.
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH32})
+ set(aom_ports_has_symbols 1)
+ elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
+ set(aom_ports_has_symbols 1)
+ endif()
+
+ # Step 2: for arm (including arm64, which also matches "arm") and ppc, add
+ # the aom_ports object files to the aom target, and to aom_static when
+ # BUILD_SHARED_LIBS is set.
+ if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)
+ endif()
+ endif()
+
+ # Note AOM_PORTS_INCLUDES_X86 are not added to the aom_ports, aom or
+ # aom_static targets to avoid compilation issues in projects that enable ASM
+ # language support in project(). These sources were never included in
+ # libaom_srcs.*; if it becomes necessary for a particular generator another
+ # method should be used.
+ #
+ # Step 3: attach the portable headers to aom_ports when that target exists
+ # (and is not embedded), otherwise directly to the library targets.
+ if(aom_ports_has_symbols)
+ if(NOT aom_ports_is_embedded)
+ target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
+ endif()
+ # Re-exports AOM_LIB_TARGETS to the caller's scope with its current value.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+ else()
+ target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES})
+ endif()
+ endif()
+endfunction()
diff --git a/third_party/aom/aom_ports/aom_timer.h b/third_party/aom/aom_ports/aom_timer.h
new file mode 100644
index 0000000000..642c5a08ba
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_timer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_TIMER_H_
+#define AOM_AOM_PORTS_AOM_TIMER_H_
+
+#include "config/aom_config.h"
+
+#if CONFIG_OS_SUPPORT
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time. */
+/* Fallback definition: stores (a - b) in *result. When the microsecond
+ * difference is negative, one second is borrowed so that tv_usec ends up
+ * non-negative.
+ */
+#ifndef timersub
+#define timersub(a, b, result) \
+ do { \
+ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((result)->tv_usec < 0) { \
+ --(result)->tv_sec; \
+ (result)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif
+#endif
+
+/* Holds the two timestamps of one measured interval: |begin| is written by
+ * aom_usec_timer_start() and |end| by aom_usec_timer_mark(). The field type
+ * is the native timestamp type of the platform.
+ */
+struct aom_usec_timer {
+#if defined(_WIN32)
+ LARGE_INTEGER begin, end;
+#else
+ struct timeval begin, end;
+#endif
+};
+
+/* Records the current time in t->begin. */
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+ QueryPerformanceCounter(&t->begin);
+#else
+ gettimeofday(&t->begin, NULL);
+#endif
+}
+
+/* Records the current time in t->end. */
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+ QueryPerformanceCounter(&t->end);
+#else
+ gettimeofday(&t->end, NULL);
+#endif
+}
+
+/* Returns the number of microseconds between t->begin and t->end, i.e.
+ * between the last aom_usec_timer_start() and aom_usec_timer_mark() calls
+ * made on |t|.
+ */
+static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  LARGE_INTEGER freq;
+  QueryPerformanceFrequency(&freq);
+
+  const int64_t ticks = t->end.QuadPart - t->begin.QuadPart;
+  /* Scale the whole seconds and the sub-second remainder separately.
+   * Multiplying the raw tick count by 1000000 first overflows int64_t for
+   * long intervals at high counter frequencies. The result is identical
+   * whenever the one-step product would have fitted.
+   */
+  const int64_t whole_seconds = ticks / freq.QuadPart;
+  const int64_t leftover_ticks = ticks % freq.QuadPart;
+  return whole_seconds * 1000000 + leftover_ticks * 1000000 / freq.QuadPart;
+#else
+  struct timeval diff;
+
+  timersub(&t->end, &t->begin, &diff);
+  return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec;
+#endif
+}
+
+#else /* CONFIG_OS_SUPPORT = 0*/
+
+/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
+#ifndef timersub
+#define timersub(a, b, result)
+#endif
+
+/* Placeholder timer used when CONFIG_OS_SUPPORT is 0: it stores nothing
+ * useful and all operations on it are no-ops.
+ */
+struct aom_usec_timer {
+ void *dummy;
+};
+
+/* No-op; |t| is only referenced to silence unused-parameter warnings. */
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; }
+
+/* No-op; |t| is only referenced to silence unused-parameter warnings. */
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; }
+
+/* Always reports 0 elapsed microseconds.
+ * NOTE(review): this stub returns int, whereas the CONFIG_OS_SUPPORT variant
+ * of the same function returns int64_t.
+ */
+static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+ (void)t;
+ return 0;
+}
+
+#endif /* CONFIG_OS_SUPPORT */
+
+#endif // AOM_AOM_PORTS_AOM_TIMER_H_
diff --git a/third_party/aom/aom_ports/arm.h b/third_party/aom/aom_ports/arm.h
new file mode 100644
index 0000000000..853741d19a
--- /dev/null
+++ b/third_party/aom/aom_ports/arm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_ARM_H_
+#define AOM_AOM_PORTS_ARM_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Capability flags. Each HAS_* value occupies its own bit so that flags can
+// be combined into, and tested against, a single int bitmask.
+
+// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
+#define HAS_NEON (1 << 0)
+// Armv8.0-A optional CRC32 instructions, mandatory from Armv8.1-A.
+#define HAS_ARM_CRC32 (1 << 1)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 2)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 3)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 4)
+
+// Returns the bitwise OR of the HAS_* flags that apply to the current CPU.
+int aom_arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
+ __GNUC_MINOR__ <= 6
+#define AOM_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_ARM_H_
diff --git a/third_party/aom/aom_ports/arm_cpudetect.h b/third_party/aom/aom_ports/arm_cpudetect.h
new file mode 100644
index 0000000000..33c2d1bb6a
--- /dev/null
+++ b/third_party/aom/aom_ports/arm_cpudetect.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define ANDROID_USE_CPU_FEATURES_LIB 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18.
+// getauxval() is supported since Android API level 18 (Android 4.3.)
+// First Android version with 64-bit support was Android 5.x (API level 21).
+#include <cpu-features.h>
+#endif
+
+// Reads an explicit capability override from the AOM_SIMD_CAPS environment
+// variable.
+//
+// When the variable is set and non-empty, it is parsed with strtol() using
+// base auto-detection (base 0), the result is stored in *flags, and true is
+// returned. Otherwise *flags is left untouched and false is returned.
+static bool arm_cpu_env_flags(int *flags) {
+  const char *const caps_str = getenv("AOM_SIMD_CAPS");
+  if (caps_str == NULL || caps_str[0] == '\0') {
+    return false;
+  }
+  *flags = (int)strtol(caps_str, NULL, 0);
+  return true;
+}
+
+// Returns the capability mask taken from the AOM_SIMD_CAPS_MASK environment
+// variable, parsed with strtol() using base auto-detection (base 0). When the
+// variable is unset or empty, every bit is set (~0).
+static int arm_cpu_env_mask(void) {
+  const char *const mask_str = getenv("AOM_SIMD_CAPS_MASK");
+  if (mask_str != NULL && mask_str[0] != '\0') {
+    return (int)strtol(mask_str, NULL, 0);
+  }
+  return ~0;
+}
diff --git a/third_party/aom/aom_ports/bitops.h b/third_party/aom/aom_ports/bitops.h
new file mode 100644
index 0000000000..0795855083
--- /dev/null
+++ b/third_party/aom/aom_ports/bitops.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
+
+#ifdef _MSC_VER
+#if defined(_M_X64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM)
+#include <intrin.h>
+#define USE_MSC_INTRINSICS
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+
+// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+// MSVC: https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Builtin-based version: index of the highest set bit of |n| (n must be
+// non-zero, because __builtin_clz(0) is undefined).
+static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
+  // __builtin_clz() yields a value in [0, 31] here, for which 31 - x is the
+  // same as 31 ^ x.
+  const int leading_zeros = __builtin_clz(n);
+  return 31 - leading_zeros;
+}
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse)
+
+// MSVC intrinsic version: index of the highest set bit of |n| (n must be
+// non-zero).
+static INLINE int get_msb(unsigned int n) {
+  unsigned long msb_index;
+  assert(n != 0);
+  _BitScanReverse(&msb_index, n);
+  return (int)msb_index;
+}
+#else
+// Portable version: index of the highest set bit of |n| (n must be
+// non-zero), found by binary search over shift widths 16, 8, 4, 2, 1.
+static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
+
+  unsigned int remaining = n;
+  int msb_index = 0;
+  int step = 16;
+  while (step != 0) {
+    // If anything is left above bit |step|, the answer lies in that upper
+    // part: keep it and account for the bits shifted out.
+    const unsigned int upper = remaining >> step;
+    if (upper != 0) {
+      remaining = upper;
+      msb_index += step;
+    }
+    step >>= 1;
+  }
+  return msb_index;
+}
+#endif
+
+#if defined(__GNUC__) && \
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+// Returns the number of leading zero bits in |n|. n must be non-zero:
+// __builtin_clzll(0) is undefined, so the precondition is asserted here just
+// as in the other aom_clzll() implementations.
+static INLINE int aom_clzll(uint64_t n) {
+  assert(n != 0);
+  return __builtin_clzll(n);
+}
+#elif defined(USE_MSC_INTRINSICS)
+#if defined(_M_X64) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#endif
+
+// MSVC intrinsic version: returns the number of leading zero bits in |n|.
+// n must be non-zero.
+static INLINE int aom_clzll(uint64_t n) {
+ assert(n != 0);
+ unsigned long first_set_bit; // NOLINT(runtime/int)
+#if defined(_M_X64) || defined(_M_ARM64)
+ // 64-bit targets: a single 64-bit scan finds the highest set bit.
+ const unsigned char bit_set =
+ _BitScanReverse64(&first_set_bit, (unsigned __int64)n);
+#else // !(defined(_M_X64) || defined(_M_ARM64))
+ // 32-bit targets: scan the upper 32 bits first and fall back to the lower
+ // 32 bits only when the upper half is zero.
+ const unsigned long n_hi = (unsigned long)(n >> 32); // NOLINT(runtime/int)
+ if (n_hi != 0) {
+ const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+ assert(bit_set != 0);
+ (void)bit_set;
+ // first_set_bit is relative to the upper word here, hence 31 rather than
+ // 63.
+ return 31 ^ (int)first_set_bit;
+ }
+ const unsigned char bit_set =
+ _BitScanReverse(&first_set_bit, (unsigned long)n); // NOLINT(runtime/int)
+#endif
+ assert(bit_set != 0);
+ // Keeps bit_set referenced when assert() compiles to nothing.
+ (void)bit_set;
+ // first_set_bit is in [0, 63] on the 64-bit path and [0, 31] on the
+ // low-word path, so the XOR equals 63 - first_set_bit.
+ return 63 ^ (int)first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+// Portable version: returns the number of leading zero bits in |n| by
+// shifting left until bit 63 is set. n must be non-zero, otherwise the loop
+// would never terminate.
+static INLINE int aom_clzll(uint64_t n) {
+  assert(n != 0);
+
+  const uint64_t top_bit = 1ULL << 63;
+  int leading_zeros = 0;
+  for (; (n & top_bit) == 0; n <<= 1) {
+    ++leading_zeros;
+  }
+  return leading_zeros;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_BITOPS_H_
diff --git a/third_party/aom/aom_ports/emmintrin_compat.h b/third_party/aom/aom_ports/emmintrin_compat.h
new file mode 100644
index 0000000000..85d218a3d2
--- /dev/null
+++ b/third_party/aom/aom_ports/emmintrin_compat.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
+#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
+
+#if defined(__GNUC__) && __GNUC__ < 4
+/* From emmintrin.h (gcc 4.5.3) */
+/* Casts between various SP, DP, INT vector types. Note that these do no
+ conversion of values, they just change the type. */
+/* These definitions are compiled only when __GNUC__ < 4 (see the enclosing
+ #if). */
+
+/* __m128d -> __m128 */
+extern __inline __m128
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castpd_ps(__m128d __A) {
+ return (__m128)__A;
+}
+
+/* __m128d -> __m128i */
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castpd_si128(__m128d __A) {
+ return (__m128i)__A;
+}
+
+/* __m128 -> __m128d */
+extern __inline __m128d
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castps_pd(__m128 __A) {
+ return (__m128d)__A;
+}
+
+/* __m128 -> __m128i */
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castps_si128(__m128 __A) {
+ return (__m128i)__A;
+}
+
+/* __m128i -> __m128 */
+extern __inline __m128
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castsi128_ps(__m128i __A) {
+ return (__m128)__A;
+}
+
+/* __m128i -> __m128d */
+extern __inline __m128d
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castsi128_pd(__m128i __A) {
+ return (__m128d)__A;
+}
+#endif
+
+#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
diff --git a/third_party/aom/aom_ports/float.asm b/third_party/aom/aom_ports/float.asm
new file mode 100644
index 0000000000..abff60a7a4
--- /dev/null
+++ b/third_party/aom/aom_ports/float.asm
@@ -0,0 +1,33 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+section .text
+%if LIBAOM_YASM_WIN64
+;-----------------------------------------------------------------------
+; aom_winx64_fldcw: loads the x87 FPU control word from the first argument.
+; ABI:   Microsoft x64 (assembled only when LIBAOM_YASM_WIN64 is set)
+; In:    rcx = new control word; fldcw reads only the low 16 bits
+; Out:   none
+; Stack: 8 bytes of scratch, released before returning
+; NOTE(review): the C prototype is declared outside this file - confirm the
+; parameter type there.
+;-----------------------------------------------------------------------
+globalsym(aom_winx64_fldcw)
+sym(aom_winx64_fldcw):
+ sub rsp, 8
+ mov [rsp], rcx ; win x64 specific
+ fldcw [rsp]
+ add rsp, 8
+ ret
+
+
+;-----------------------------------------------------------------------
+; aom_winx64_fstcw: returns the current x87 FPU control word.
+; ABI:   Microsoft x64 (assembled only when LIBAOM_YASM_WIN64 is set)
+; In:    none
+; Out:   eax = control word, zero-extended (upper bits of rax are zero)
+; Stack: 8 bytes of scratch, released before returning
+; NOTE(review): the C prototype is declared outside this file - presumably a
+; 16-bit return type; confirm there.
+;-----------------------------------------------------------------------
+globalsym(aom_winx64_fstcw)
+sym(aom_winx64_fstcw):
+    sub         rsp, 8
+    fstcw       [rsp]                   ; stores exactly 16 bits
+    ; Load back only the two bytes fstcw wrote. A 64-bit load would pull six
+    ; bytes of stale stack contents into rax and cannot be store-forwarded
+    ; from the narrower store.
+    movzx       eax, word [rsp]
+    add         rsp, 8
+    ret
+%endif
diff --git a/third_party/aom/aom_ports/mem.h b/third_party/aom/aom_ports/mem.h
new file mode 100644
index 0000000000..a70ce825b1
--- /dev/null
+++ b/third_party/aom/aom_ports/mem.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_H_
+#define AOM_AOM_PORTS_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
+#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
+#else
+#warning No alignment directives known for this compiler.
+#define DECLARE_ALIGNED(n, typ, val) typ val
+#endif
+
+#if HAVE_NEON && defined(_MSC_VER)
+#define __builtin_prefetch(x)
+#endif
+
+/* Shift down with rounding for use when n >= 0. Usually value >= 0, but the
+ * macro can be used with a negative value if the direction of rounding is
+ * acceptable.
+ */
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
+
+/* Shift down with rounding for signed integers, for use when n >= 0 */
+#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
+ (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
+ : ROUND_POWER_OF_TWO((value), (n)))
+
+/* Shift down with rounding for use when n >= 0 (64-bit value). Usually
+ * value >= 0, but the macro can be used with a negative value if the direction
+ * of rounding is acceptable.
+ */
+#define ROUND_POWER_OF_TWO_64(value, n) \
+ (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
+/* Shift down with rounding for signed integers, for use when n >= 0 (64-bit
+ * value)
+ */
+#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \
+ (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
+ : ROUND_POWER_OF_TWO_64((value), (n)))
+
+/* Shift down with ceil() for use when n >= 0 and value >= 0.*/
+#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n))
+
+/* shift right or left depending on sign of n */
+#define RIGHT_SIGNED_SHIFT(value, n) \
+ ((n) < 0 ? ((value) << (-(n))) : ((value) >> (n)))
+
+/* Round value up to the next multiple of (1 << n); the mask arithmetic is
+ * done in signed int.
+ */
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+/* Same as ALIGN_POWER_OF_TWO, but the mask is built from an unsigned 1u. */
+#define ALIGN_POWER_OF_TWO_UNSIGNED(value, n) \
+ (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1))
+
+/* Integer division of x by y with half of y added first, i.e. rounded to
+ * nearest for non-negative operands.
+ */
+#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+
+/* CONVERT_TO_SHORTPTR shifts the pointer value left by one bit and casts it
+ * to uint16_t *; CONVERT_TO_BYTEPTR applies the inverse shift and casts to
+ * uint8_t *. The two macros undo each other.
+ */
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
+
+/*!\brief force enum to be unsigned 1 byte*/
+#define UENUM1BYTE(enumvar) \
+ ; \
+ typedef uint8_t enumvar
+
+/*!\brief force enum to be signed 1 byte*/
+#define SENUM1BYTE(enumvar) \
+ ; \
+ typedef int8_t enumvar
+
+/*!\brief force enum to be unsigned 2 byte*/
+#define UENUM2BYTE(enumvar) \
+ ; \
+ typedef uint16_t enumvar
+
+/*!\brief force enum to be signed 2 byte*/
+#define SENUM2BYTE(enumvar) \
+ ; \
+ typedef int16_t enumvar
+
+/*!\brief force enum to be unsigned 4 byte*/
+#define UENUM4BYTE(enumvar) \
+ ; \
+ typedef uint32_t enumvar
+
+/*!\brief force enum to be signed 4 byte*/
+#define SENUM4BYTE(enumvar) \
+ ; \
+ typedef int32_t enumvar
+
+#endif // AOM_AOM_PORTS_MEM_H_
diff --git a/third_party/aom/aom_ports/mem_ops.h b/third_party/aom/aom_ports/mem_ops.h
new file mode 100644
index 0000000000..2b5bc0f0fb
--- /dev/null
+++ b/third_party/aom/aom_ports/mem_ops.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_OPS_H_
+#define AOM_AOM_PORTS_MEM_OPS_H_
+
+/* \file
+ * \brief Provides portable memory access primitives
+ *
+ * This file provides portable primitives for getting and setting of
+ * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations
+ * can be performed on unaligned data regardless of hardware support for
+ * unaligned accesses.
+ *
+ * The type used to pass the integral values may be changed by defining
+ * MEM_VALUE_T with the appropriate type. The type given must be an integral
+ * numeric type.
+ *
+ * The actual functions instantiated have the MEM_VALUE_T type name pasted
+ * on to the symbol name. This allows the developer to instantiate these
+ * operations for multiple types within the same translation unit. This is
+ * of somewhat questionable utility, but the capability exists nonetheless.
+ * Users not making use of this functionality should call the functions
+ * without the type name appended, and the preprocessor will take care of
+ * it.
+ *
+ * NOTE: This code is not supported on platforms where char > 1 octet ATM.
+ */
+
+#ifndef MAU_T
+/* Minimum Access Unit for this target */
+#define MAU_T unsigned char
+#endif
+
+#ifndef MEM_VALUE_T
+#define MEM_VALUE_T int
+#endif
+
+#undef MEM_VALUE_T_SZ_BITS
+#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3)
+
+#undef mem_ops_wrap_symbol
+#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
+#undef mem_ops_wrap_symbol2
+#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
+#undef mem_ops_wrap_symbol3
+#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
+
+/*
+ * Include aligned access routines
+ */
+#define INCLUDED_BY_MEM_OPS_H
+#include "mem_ops_aligned.h"
+#undef INCLUDED_BY_MEM_OPS_H
+
+#undef mem_get_be16
+#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16)
+/* Reads two bytes at vmem as a 16-bit big-endian value (byte 0 is the most
+ * significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[1];
+
+  result |= bytes[0] << 8;
+  return result;
+}
+
+#undef mem_get_be24
+#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24)
+/* Reads three bytes at vmem as a 24-bit big-endian value (byte 0 is the most
+ * significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[2];
+
+  result |= bytes[1] << 8;
+  result |= bytes[0] << 16;
+  return result;
+}
+
+#undef mem_get_be32
+#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32)
+/* Reads four bytes at vmem as a 32-bit big-endian value (byte 0 is the most
+ * significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[3];
+
+  result |= bytes[2] << 8;
+  result |= bytes[1] << 16;
+  /* The top byte is widened to the unsigned value type before shifting. */
+  result |= ((unsigned MEM_VALUE_T)bytes[0]) << 24;
+  return result;
+}
+
+#undef mem_get_le16
+#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16)
+/* Reads two bytes at vmem as a 16-bit little-endian value (byte 0 is the
+ * least significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[0];
+
+  result |= bytes[1] << 8;
+  return result;
+}
+
+#undef mem_get_le24
+#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24)
+/* Reads three bytes at vmem as a 24-bit little-endian value (byte 0 is the
+ * least significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[0];
+
+  result |= bytes[1] << 8;
+  result |= bytes[2] << 16;
+  return result;
+}
+
+#undef mem_get_le32
+#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
+/* Reads four bytes at vmem as a 32-bit little-endian value (byte 0 is the
+ * least significant). Byte-wise access, so vmem need not be aligned.
+ */
+static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
+  const MAU_T *const bytes = (const MAU_T *)vmem;
+  unsigned MEM_VALUE_T result = bytes[0];
+
+  result |= bytes[1] << 8;
+  result |= bytes[2] << 16;
+  /* The top byte is widened to the unsigned value type before shifting. */
+  result |= ((unsigned MEM_VALUE_T)bytes[3]) << 24;
+  return result;
+}
+
+/* Instantiates mem_get_s<end><sz>(): reads an unsigned <sz>-bit value with
+ * mem_get_<end><sz>() and sign-extends it from bit (sz - 1) to the full width
+ * of MEM_VALUE_T.
+ *
+ * The left shift is performed on the unsigned value: shifting a signed value
+ * so that a data bit lands in the sign position is undefined behavior in C.
+ * The result is then converted to the signed type and shifted back
+ * arithmetically to replicate the sign bit.
+ */
+#define mem_get_s_generic(end, sz)                                            \
+  static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \
+    const MAU_T *mem = (const MAU_T *)vmem;                                   \
+    const unsigned MEM_VALUE_T val = mem_get_##end##sz(mem);                  \
+    const int shift = (int)(MEM_VALUE_T_SZ_BITS - sz);                        \
+    return ((signed MEM_VALUE_T)(val << shift)) >> shift;                     \
+  }
+
+/* clang-format off */
+#undef mem_get_sbe16
+#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16)
+mem_get_s_generic(be, 16)
+
+#undef mem_get_sbe24
+#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24)
+mem_get_s_generic(be, 24)
+
+#undef mem_get_sbe32
+#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32)
+mem_get_s_generic(be, 32)
+
+#undef mem_get_sle16
+#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16)
+mem_get_s_generic(le, 16)
+
+#undef mem_get_sle24
+#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24)
+mem_get_s_generic(le, 24)
+
+#undef mem_get_sle32
+#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32)
+mem_get_s_generic(le, 32)
+
+#undef mem_put_be16
+#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16)
+/* Stores the low 16 bits of val at vmem, most significant byte first. */
+static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[1] = (MAU_T)(val & 0xff);
+  out[0] = (MAU_T)((val >> 8) & 0xff);
+}
+
+#undef mem_put_be24
+#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24)
+/* Stores the low 24 bits of val at vmem, most significant byte first. */
+static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[2] = (MAU_T)(val & 0xff);
+  out[1] = (MAU_T)((val >> 8) & 0xff);
+  out[0] = (MAU_T)((val >> 16) & 0xff);
+}
+
+#undef mem_put_be32
+#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32)
+/* Stores the low 32 bits of val at vmem, most significant byte first. */
+static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[3] = (MAU_T)(val & 0xff);
+  out[2] = (MAU_T)((val >> 8) & 0xff);
+  out[1] = (MAU_T)((val >> 16) & 0xff);
+  out[0] = (MAU_T)((val >> 24) & 0xff);
+}
+
+#undef mem_put_le16
+#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16)
+/* Stores the low 16 bits of val at vmem, least significant byte first. */
+static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[1] = (MAU_T)((val >> 8) & 0xff);
+  out[0] = (MAU_T)(val & 0xff);
+}
+
+#undef mem_put_le24
+#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24)
+/* Stores the low 24 bits of val at vmem, least significant byte first. */
+static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[2] = (MAU_T)((val >> 16) & 0xff);
+  out[1] = (MAU_T)((val >> 8) & 0xff);
+  out[0] = (MAU_T)(val & 0xff);
+}
+
+#undef mem_put_le32
+#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32)
+/* Stores the low 32 bits of val at vmem, least significant byte first. */
+static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
+  MAU_T *const out = (MAU_T *)vmem;
+
+  out[3] = (MAU_T)((val >> 24) & 0xff);
+  out[2] = (MAU_T)((val >> 16) & 0xff);
+  out[1] = (MAU_T)((val >> 8) & 0xff);
+  out[0] = (MAU_T)(val & 0xff);
+}
+/* clang-format on */
+#endif // AOM_AOM_PORTS_MEM_OPS_H_
diff --git a/third_party/aom/aom_ports/mem_ops_aligned.h b/third_party/aom/aom_ports/mem_ops_aligned.h
new file mode 100644
index 0000000000..37c3675318
--- /dev/null
+++ b/third_party/aom/aom_ports/mem_ops_aligned.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
+#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
+
+#include "aom/aom_integer.h"
+
+/* \file
+ * \brief Provides portable memory access primitives for operating on aligned
+ * data
+ *
+ * This file is split from mem_ops.h for easier maintenance. See mem_ops.h
+ * for a more detailed description of these primitives.
+ */
+#ifndef INCLUDED_BY_MEM_OPS_H
+#error Include mem_ops.h, not mem_ops_aligned.h directly.
+#endif
+
+/* Architectures that provide instructions for doing this byte swapping
+ * could redefine these macros.
+ */
+/* val = the low 16 bits of raw with its two bytes exchanged. */
+#define swap_endian_16(val, raw)                                         \
+  do {                                                                   \
+    val = (uint16_t)((((raw) >> 8) & 0x00ff) | (((raw) << 8) & 0xff00)); \
+  } while (0)
+/* val = the low 32 bits of raw with its four bytes reversed. */
+#define swap_endian_32(val, raw)                                       \
+  do {                                                                 \
+    val = (((raw) >> 24) & 0x000000ff) | (((raw) >> 8) & 0x0000ff00) | \
+          (((raw) << 8) & 0x00ff0000) | (((raw) << 24) & 0xff000000);  \
+  } while (0)
+/* Byte-swapping variants that also sign-extend the swapped value.
+ *
+ * The destination passed by the signed getters in this file is declared
+ * 'unsigned MEM_VALUE_T', so a "(val << n) >> n" pair on it is a logical
+ * shift and would leave the value zero-extended. Narrowing through the
+ * fixed-width signed type instead makes the conversion back to the
+ * destination replicate the sign bit, whatever the width of MEM_VALUE_T.
+ */
+#define swap_endian_16_se(val, raw) \
+  do {                              \
+    swap_endian_16(val, raw);       \
+    val = (int16_t)(val);           \
+  } while (0)
+#define swap_endian_32_se(val, raw) \
+  do {                              \
+    swap_endian_32(val, raw);       \
+    val = (int32_t)(val);           \
+  } while (0)
+
+#define mem_get_ne_aligned_generic(end, sz) \
+ static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \
+ const void *vmem) { \
+ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \
+ return *mem; \
+ }
+
+#define mem_get_sne_aligned_generic(end, sz) \
+ static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \
+ const void *vmem) { \
+ const int##sz##_t *mem = (const int##sz##_t *)vmem; \
+ return *mem; \
+ }
+
+#define mem_get_se_aligned_generic(end, sz) \
+ static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \
+ const void *vmem) { \
+ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \
+ unsigned MEM_VALUE_T val, raw = *mem; \
+ swap_endian_##sz(val, raw); \
+ return val; \
+ }
+
+#define mem_get_sse_aligned_generic(end, sz) \
+ static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \
+ const void *vmem) { \
+ const int##sz##_t *mem = (const int##sz##_t *)vmem; \
+ unsigned MEM_VALUE_T val, raw = *mem; \
+ swap_endian_##sz##_se(val, raw); \
+ return val; \
+ }
+
+#define mem_put_ne_aligned_generic(end, sz) \
+ static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \
+ MEM_VALUE_T val) { \
+ uint##sz##_t *mem = (uint##sz##_t *)vmem; \
+ *mem = (uint##sz##_t)val; \
+ }
+
+#define mem_put_se_aligned_generic(end, sz) \
+ static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \
+ MEM_VALUE_T val) { \
+ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \
+ swap_endian_##sz(raw, val); \
+ *mem = (uint##sz##_t)raw; \
+ }
+
+#include "config/aom_config.h"
+
+#if CONFIG_BIG_ENDIAN
+#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz)
+#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz)
+#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz)
+#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz)
+#else
+#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz)
+#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz)
+#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz)
+#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz)
+#endif
+
+/* clang-format off */
+#undef mem_get_be16_aligned
+#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned)
+mem_get_be_aligned_generic(16)
+
+#undef mem_get_be32_aligned
+#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned)
+mem_get_be_aligned_generic(32)
+
+#undef mem_get_le16_aligned
+#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned)
+mem_get_le_aligned_generic(16)
+
+#undef mem_get_le32_aligned
+#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned)
+mem_get_le_aligned_generic(32)
+
+#undef mem_get_sbe16_aligned
+#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+/* clang-format on */
+
+#endif // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h
new file mode 100644
index 0000000000..e78e605f2f
--- /dev/null
+++ b/third_party/aom/aom_ports/msvc.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MSVC_H_
+#define AOM_AOM_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "config/aom_config.h"
+
+#if _MSC_VER < 1900 // VS2015 provides snprintf
+#define snprintf _snprintf
+#endif // _MSC_VER < 1900
+
+#if _MSC_VER < 1800 // VS2013 provides round
+#include <math.h>
+// Fallback round() for MSVC older than VS2013: nearest integral value, with
+// halfway cases rounded away from zero.
+static INLINE double round(double x) {
+  return (x < 0) ? ceil(x - 0.5) : floor(x + 0.5);
+}
+
+// Fallback roundf() for MSVC older than VS2013: nearest integral value, with
+// halfway cases rounded away from zero.
+static INLINE float roundf(float x) {
+  const double rounded = (x < 0) ? ceil(x - 0.5f) : floor(x + 0.5f);
+  return (float)rounded;
+}
+
+// Fallback lroundf() for MSVC older than VS2013: moves x half a unit away
+// from zero and lets the conversion to long truncate toward zero.
+static INLINE long lroundf(float x) {
+  return (x < 0) ? (long)(x - 0.5f) : (long)(x + 0.5f);
+}
+#endif // _MSC_VER < 1800
+
+#if HAVE_AVX
+#include <immintrin.h>
+// Note:
+// _mm256_insert_epi16 intrinsics is available from vs2017.
+// We define this macro for vs2015 and earlier. The
+// intrinsics used here are in vs2015 document:
+// https://msdn.microsoft.com/en-us/library/hh977022.aspx
+// Input parameters:
+// a: __m256i,
+// d: int16_t,
+// indx: imm8 (0 - 15)
+#if _MSC_VER <= 1900
+// Emulation of _mm256_insert_epi16: extracts the 128-bit lane selected by
+// (indx >> 3), replaces 16-bit element (indx % 8) of that lane with d, and
+// writes the lane back into a. Every argument is parenthesized so that an
+// expression argument (for example i + 1) expands with the intended
+// precedence.
+#define _mm256_insert_epi16(a, d, indx)                                 \
+  _mm256_insertf128_si256(                                              \
+      (a),                                                              \
+      _mm_insert_epi16(_mm256_extractf128_si256((a), (indx) >> 3), (d), \
+                       (indx) % 8),                                     \
+      (indx) >> 3)
+
+static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
+ return a.m256i_i32[i & 7];
+}
+static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
+ __m256i c = a;
+ c.m256i_i32[i & 7] = b;
+ return c;
+}
+#endif // _MSC_VER <= 1900
+#endif // HAVE_AVX
+#endif // _MSC_VER
+#endif // AOM_AOM_PORTS_MSVC_H_
diff --git a/third_party/aom/aom_ports/ppc.h b/third_party/aom/aom_ports/ppc.h
new file mode 100644
index 0000000000..3159bda682
--- /dev/null
+++ b/third_party/aom/aom_ports/ppc.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_PPC_H_
+#define AOM_AOM_PORTS_PPC_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_VSX 0x01
+
+int ppc_simd_caps(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_PPC_H_
diff --git a/third_party/aom/aom_ports/ppc_cpudetect.c b/third_party/aom/aom_ports/ppc_cpudetect.c
new file mode 100644
index 0000000000..ce4d5ae231
--- /dev/null
+++ b/third_party/aom/aom_ports/ppc_cpudetect.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <asm/cputable.h>
+#include <linux/auxvec.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/ppc.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+// Reads the AOM_SIMD_CAPS environment variable. When it is set to a non-empty
+// string, stores its strtol() value (base auto-detected) in *flags and returns
+// 0; otherwise stores 0 in *flags and returns -1.
+static int cpu_env_flags(int *flags) {
+  const char *const caps = getenv("AOM_SIMD_CAPS");
+  if (caps == NULL || *caps == '\0') {
+    *flags = 0;
+    return -1;
+  }
+  *flags = (int)strtol(caps, NULL, 0);
+  return 0;
+}
+
+// Returns the strtol() value (base auto-detected) of the AOM_SIMD_CAPS_MASK
+// environment variable, or ~0 (all bits set) when it is unset or empty.
+static int cpu_env_mask(void) {
+  const char *const mask_str = getenv("AOM_SIMD_CAPS_MASK");
+  if (mask_str != NULL && *mask_str != '\0') {
+    return (int)strtol(mask_str, NULL, 0);
+  }
+  return ~0;
+}
+
+// Returns the HAS_* SIMD capability flags for the running CPU.
+//
+// A non-empty AOM_SIMD_CAPS environment variable overrides detection and its
+// value is returned as-is. Otherwise the flags are derived from the AT_HWCAP
+// entry of /proc/self/auxv and ANDed with the AOM_SIMD_CAPS_MASK value (all
+// bits allowed when that variable is unset or empty). Returns 0 when
+// /proc/self/auxv cannot be opened.
+int ppc_simd_caps(void) {
+  int flags;
+  int mask;
+  int fd;
+  ssize_t count;
+  size_t i;
+  size_t num_words;
+  uint64_t buf[64];
+
+  // If AOM_SIMD_CAPS is set then report exactly those capabilities.
+  if (!cpu_env_flags(&flags)) {
+    return flags;
+  }
+
+  mask = cpu_env_mask();
+
+  fd = open("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  while ((count = read(fd, buf, sizeof(buf))) > 0) {
+    num_words = (size_t)count / sizeof(*buf);
+    // The buffer is walked as (type, value) pairs. Only visit pairs that this
+    // read() filled completely, so buf[i + 1] never refers to stale or
+    // uninitialized words when an odd number of words was returned.
+    for (i = 0; i + 1 < num_words; i += 2) {
+      if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+        if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+          flags |= HAS_VSX;
+        }
+#endif  // HAVE_VSX
+        goto out_close;
+      } else if (buf[i] == AT_NULL) {
+        goto out_close;
+      }
+    }
+  }
+out_close:
+  close(fd);
+  return flags & mask;
+}
+#else
+// If there is no RTCD the function pointers are not used and can not be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif // CONFIG_RUNTIME_CPU_DETECT
diff --git a/third_party/aom/aom_ports/sanitizer.h b/third_party/aom/aom_ports/sanitizer.h
new file mode 100644
index 0000000000..1dd8eb4cf4
--- /dev/null
+++ b/third_party/aom/aom_ports/sanitizer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SANITIZER_H_
+#define AOM_AOM_PORTS_SANITIZER_H_
+
+// AddressSanitizer support.
+
+// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used.
+// Clang.
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define AOM_ADDRESS_SANITIZER 1
+#endif
+#endif // defined(__has_feature)
+// GCC.
+#if defined(__SANITIZE_ADDRESS__)
+#define AOM_ADDRESS_SANITIZER 1
+#endif // defined(__SANITIZE_ADDRESS__)
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if defined(AOM_ADDRESS_SANITIZER)
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#endif // AOM_AOM_PORTS_SANITIZER_H_
diff --git a/third_party/aom/aom_ports/x86.h b/third_party/aom/aom_ports/x86.h
new file mode 100644
index 0000000000..c089984085
--- /dev/null
+++ b/third_party/aom/aom_ports/x86.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_X86_H_
+#define AOM_AOM_PORTS_X86_H_
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ AOM_CPU_UNKNOWN = -1,
+ AOM_CPU_AMD,
+ AOM_CPU_AMD_OLD,
+ AOM_CPU_CENTAUR,
+ AOM_CPU_CYRIX,
+ AOM_CPU_INTEL,
+ AOM_CPU_NEXGEN,
+ AOM_CPU_NSC,
+ AOM_CPU_RISE,
+ AOM_CPU_SIS,
+ AOM_CPU_TRANSMETA,
+ AOM_CPU_TRANSMETA_OLD,
+ AOM_CPU_UMC,
+ AOM_CPU_VIA,
+
+ AOM_CPU_LAST
+} aom_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if AOM_ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ __asm__ __volatile__("cpuid \n\t" \
+ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2))
+#else
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ __asm__ __volatile__( \
+ "mov %%ebx, %%edi \n\t" \
+ "cpuid \n\t" \
+ "xchg %%edi, %%ebx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2))
+#endif
+#elif defined(__SUNPRO_C) || \
+ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if AOM_ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ asm volatile( \
+ "xchg %rsi, %rbx \n\t" \
+ "cpuid \n\t" \
+ "movl %ebx, %edi \n\t" \
+ "xchg %rsi, %rbx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2))
+#else
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ asm volatile( \
+ "pushl %ebx \n\t" \
+ "cpuid \n\t" \
+ "movl %ebx, %edi \n\t" \
+ "popl %ebx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2))
+#endif
+#else /* end __SUNPRO__ */
+#if AOM_ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+#define cpuid(func, func2, a, b, c, d) \
+ do { \
+ int regs[4]; \
+ __cpuidex(regs, func, func2); \
+ a = regs[0]; \
+ b = regs[1]; \
+ c = regs[2]; \
+ d = regs[3]; \
+ } while (0)
+#else
+#define cpuid(func, func2, a, b, c, d) \
+ do { \
+ int regs[4]; \
+ __cpuid(regs, func); \
+ a = regs[0]; \
+ b = regs[1]; \
+ c = regs[2]; \
+ d = regs[3]; \
+ } while (0)
+#endif
+#else
+/* clang-format off */
+#define cpuid(func, func2, a, b, c, d) \
+ __asm mov eax, func \
+ __asm mov ecx, func2 \
+ __asm cpuid \
+ __asm mov a, eax \
+ __asm mov b, ebx \
+ __asm mov c, ecx \
+ __asm mov d, edx
+#endif
+/* clang-format on */
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx)
+ : "c"(ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
+ _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX 0x01
+#define HAS_SSE 0x02
+#define HAS_SSE2 0x04
+#define HAS_SSE3 0x08
+#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
+#define HAS_AVX 0x40
+#define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
+#ifndef BIT
+#define BIT(n) (1u << (n))
+#endif
+
+// Returns a bitmask of HAS_* flags describing the SIMD extensions usable on
+// this CPU.
+//
+// A non-empty AOM_SIMD_CAPS environment variable overrides detection: its
+// strtol() value is returned directly. Otherwise the flags detected through
+// CPUID are ANDed with the value of AOM_SIMD_CAPS_MASK when that variable is
+// non-empty (no masking otherwise).
+static INLINE int x86_simd_caps(void) {
+ unsigned int flags = 0;
+ unsigned int mask = ~0u;
+ unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+ char *env;
+
+ /* See if the CPU capabilities are being overridden by the environment */
+ env = getenv("AOM_SIMD_CAPS");
+
+ if (env && *env) return (int)strtol(env, NULL, 0);
+
+ env = getenv("AOM_SIMD_CAPS_MASK");
+
+ if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
+
+ /* Leaf 0 returns the highest supported standard CPUID leaf in EAX; leaf 1
+  * is required for the feature flags read below. */
+ cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+ if (max_cpuid_val < 1) return 0;
+
+ /* Get the standard feature flags */
+ cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+ if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+ if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+ if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+ if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+ if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+ if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+ if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+ // Check for OS-support of YMM state. Necessary for AVX and AVX2.
+ if ((xgetbv() & 0x6) == 0x6) {
+ flags |= HAS_AVX;
+
+ // AVX2 is only reported when AVX itself passed the checks above.
+ if (max_cpuid_val >= 7) {
+ /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+ if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ }
+ }
+ }
+
+ (void)reg_eax; // Avoid compiler warning on unused-but-set variable.
+
+ return flags & mask;
+}
+
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+// ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by having earlier/later instructions be evaluated
+// in the time interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
+//
+// If you are timing a large function (CPU time > a couple of seconds), use
+// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
+// out-of-order leakage that can occur is minimal compared to total runtime.
+static INLINE unsigned int x86_readtsc(void) {
+#if defined(__GNUC__) && __GNUC__
+  // RDTSC writes the counter to EDX:EAX. Only the low half is returned, but
+  // EDX must still be declared as written, otherwise the compiler is free to
+  // keep a live value in it across the asm statement.
+  unsigned int tsc, tsc_hi;
+  __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc), "=d"(tsc_hi));
+  (void)tsc_hi;
+  return tsc;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  // See the note in the GNU branch: EDX is written by RDTSC as well.
+  unsigned int tsc, tsc_hi;
+  asm volatile("rdtsc\n\t" : "=a"(tsc), "=d"(tsc_hi));
+  (void)tsc_hi;
+  return tsc;
+#else
+#if AOM_ARCH_X86_64
+  return (unsigned int)__rdtsc();
+#else
+  __asm rdtsc;
+#endif
+#endif
+}
+// 64-bit CPU cycle counter
+static INLINE uint64_t x86_readtsc64(void) {
+#if defined(__GNUC__) && __GNUC__
+ uint32_t hi, lo;
+ __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ uint_t hi, lo;
+ asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#else
+#if AOM_ARCH_X86_64
+ return (uint64_t)__rdtsc();
+#else
+ __asm rdtsc;
+#endif
+#endif
+}
+
+// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
+static INLINE unsigned int x86_readtscp(void) {
+#if defined(__GNUC__) && __GNUC__
+  // RDTSCP writes the counter to EDX:EAX and IA32_TSC_AUX to ECX. Only the low
+  // half of the counter is returned, but EDX and ECX must still be declared as
+  // written so the compiler does not keep live values in them across the asm.
+  unsigned int tscp, tsc_hi, tsc_aux;
+  __asm__ __volatile__("rdtscp\n\t"
+                       : "=a"(tscp), "=d"(tsc_hi), "=c"(tsc_aux));
+  (void)tsc_hi;
+  (void)tsc_aux;
+  return tscp;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  // See the note in the GNU branch: EDX and ECX are written by RDTSCP as well.
+  unsigned int tscp, tsc_hi, tsc_aux;
+  asm volatile("rdtscp\n\t" : "=a"(tscp), "=d"(tsc_hi), "=c"(tsc_aux));
+  (void)tsc_hi;
+  (void)tsc_aux;
+  return tscp;
+#elif defined(_MSC_VER)
+  unsigned int ui;
+  return (unsigned int)__rdtscp(&ui);
+#else
+#if AOM_ARCH_X86_64
+  return (unsigned int)__rdtscp();
+#else
+  __asm rdtscp;
+#endif
+#endif
+}
+
+static INLINE unsigned int x86_tsc_start(void) {
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ // This call should not be removed. See function notes above.
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ // Avoid compiler warnings on unused-but-set variables.
+ (void)reg_eax;
+ (void)reg_ebx;
+ (void)reg_ecx;
+ (void)reg_edx;
+ return x86_readtsc();
+}
+
+static INLINE unsigned int x86_tsc_end(void) {
+ uint32_t v = x86_readtscp();
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ // This call should not be removed. See function notes above.
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ // Avoid compiler warnings on unused-but-set variables.
+ (void)reg_eax;
+ (void)reg_ebx;
+ (void)reg_ecx;
+ (void)reg_edx;
+ return v;
+}
+
+#if defined(__GNUC__) && __GNUC__
+#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#define x86_pause_hint() asm volatile("pause \n\t")
+#else
+#if AOM_ARCH_X86_64
+#define x86_pause_hint() _mm_pause();
+#else
+#define x86_pause_hint() __asm pause
+#endif
+#endif
+
+#if defined(__GNUC__) && __GNUC__
+static void x87_set_control_word(unsigned short mode) {
+ __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
+ return mode;
+}
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+static void x87_set_control_word(unsigned short mode) {
+ asm volatile("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
+ return mode;
+}
+#elif AOM_ARCH_X86_64
+/* No fldcw intrinsics on Windows x64, punt to external asm */
+extern void aom_winx64_fldcw(unsigned short mode);
+extern unsigned short aom_winx64_fstcw(void);
+#define x87_set_control_word aom_winx64_fldcw
+#define x87_get_control_word aom_winx64_fstcw
+#else
+static void x87_set_control_word(unsigned short mode) {
+ __asm { fldcw mode }
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ __asm { fstcw mode }
+ return mode;
+}
+#endif
+
+static INLINE unsigned int x87_set_double_precision(void) {
+ unsigned int mode = x87_get_control_word();
+ // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
+ // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
+ // 8.1.5.2 Precision Control Field
+ // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
+ // determine the number of bits used in floating point calculations. To match
+ // later SSE instructions restrict x87 operations to Double Precision (0x200).
+ // Precision PC Field
+ // Single Precision (24-Bits) 00B
+ // Reserved 01B
+ // Double Precision (53-Bits) 10B
+ // Extended Precision (64-Bits) 11B
+ x87_set_control_word((mode & ~0x300u) | 0x200u);
+ return mode;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_X86_H_
diff --git a/third_party/aom/aom_ports/x86_abi_support.asm b/third_party/aom/aom_ports/x86_abi_support.asm
new file mode 100644
index 0000000000..f1a65f53e5
--- /dev/null
+++ b/third_party/aom/aom_ports/x86_abi_support.asm
@@ -0,0 +1,416 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "config/aom_config.asm"
+
+; 32/64 bit compatibility macros
+;
+; In general, we make the source use 64 bit syntax, then twiddle with it using
+; the preprocessor to get the 32 bit syntax on 32 bit platforms.
+;
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+; Alias the 64 bit general purpose register names to their 32 bit
+; counterparts so source written in 64 bit syntax assembles on 32 bit targets.
+%define rax eax
+%define rbx ebx
+%define rcx ecx
+%define rdx edx
+%define rsi esi
+%define rdi edi
+%define rsp esp
+%define rbp ebp
+; movsxd becomes a plain mov on 32 bit targets.
+%define movsxd mov
+; movq <dst>, <src>
+; Emits movd when either operand is one of the 32 bit general purpose
+; registers tested below (eax, ebx, ecx, edx, esi, edi, esp, ebp). Any other
+; operand pair falls through to an unmodified movq.
+%macro movq 2
+ %ifidn %1,eax
+ movd %1,%2
+ %elifidn %2,eax
+ movd %1,%2
+ %elifidn %1,ebx
+ movd %1,%2
+ %elifidn %2,ebx
+ movd %1,%2
+ %elifidn %1,ecx
+ movd %1,%2
+ %elifidn %2,ecx
+ movd %1,%2
+ %elifidn %1,edx
+ movd %1,%2
+ %elifidn %2,edx
+ movd %1,%2
+ %elifidn %1,esi
+ movd %1,%2
+ %elifidn %2,esi
+ movd %1,%2
+ %elifidn %1,edi
+ movd %1,%2
+ %elifidn %2,edi
+ movd %1,%2
+ %elifidn %1,esp
+ movd %1,%2
+ %elifidn %2,esp
+ movd %1,%2
+ %elifidn %1,ebp
+ movd %1,%2
+ %elifidn %2,ebp
+ movd %1,%2
+ %else
+ movq %1,%2
+ %endif
+%endmacro
+%endif
+
+
+; LIBAOM_YASM_WIN64
+; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64
+; or win64 is defined on the Yasm command line.
+%ifidn __OUTPUT_FORMAT__,win64
+%define LIBAOM_YASM_WIN64 1
+%elifidn __OUTPUT_FORMAT__,x64
+%define LIBAOM_YASM_WIN64 1
+%else
+%define LIBAOM_YASM_WIN64 0
+%endif
+
+; Declare groups of platforms
+%ifidn __OUTPUT_FORMAT__,elf32
+ %define LIBAOM_ELF 1
+%elifidn __OUTPUT_FORMAT__,elfx32
+ %define LIBAOM_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define LIBAOM_ELF 1
+%else
+ %define LIBAOM_ELF 0
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+ %define LIBAOM_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define LIBAOM_MACHO 1
+%else
+ %define LIBAOM_MACHO 0
+%endif
+
+; sym()
+; Return the proper symbol name for the target ABI.
+;
+; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
+; with C linkage be prefixed with an underscore.
+;
+%if LIBAOM_ELF || LIBAOM_YASM_WIN64
+ ; ELF and 64 bit Windows: the name is used unchanged, sym(foo) gives foo.
+ %define sym(x) x
+%else
+ ; Mach-O / COFF
+ ; Every other output format: one underscore is prepended, sym(foo) gives
+ ; _foo.
+ %define sym(x) _ %+ x
+%endif
+
+; globalsym()
+; Return a global declaration with the proper decoration for the target ABI.
+;
+; When CHROMIUM is defined, include attributes to hide the symbol from the
+; global namespace.
+;
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
+;
+; Requires Chromium's patched copy of yasm:
+; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+; http://www.tortall.net/projects/yasm/ticket/236
+; or nasm > 2.14.
+;
+%ifdef CHROMIUM
+ ; Only nasm defines __NASM_VER__, so this version gate is skipped for other
+ ; assemblers.
+ %ifdef __NASM_VER__
+ %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+ ; nasm < 2.14 does not support :private_extern directive
+ %fatal Must use nasm 2.14 or newer
+ %endif
+ %endif
+
+ %if LIBAOM_ELF
+ ; ELF: export as a function symbol with hidden visibility.
+ %define globalsym(x) global sym(x) %+ :function hidden
+ %elif LIBAOM_MACHO
+ ; Mach-O: export as private_extern.
+ %define globalsym(x) global sym(x) %+ :private_extern
+ %else
+ ; COFF / PE32+
+ ; No visibility attribute is added for these formats.
+ %define globalsym(x) global sym(x)
+ %endif
+%else
+ ; Without CHROMIUM the symbol is a plain global on every format.
+ %define globalsym(x) global sym(x)
+%endif
+
+; arg()
+; Return the address specification of the given argument
+;
+%if ABI_IS_32BIT
+ ; 32 bit: argument x is the x-th 4 byte slot starting 8 bytes above ebp.
+ ; NOTE(review): presumably the 8 bytes are the saved ebp and the return
+ ; address of a standard "push ebp / mov ebp, esp" frame -- confirm at the
+ ; call sites.
+ %define arg(x) [ebp+8+4*x]
+%else
+ ; 64 bit ABI passes arguments in registers. This is a workaround to get up
+ ; and running quickly. Relies on SHADOW_ARGS_TO_STACK
+ %if LIBAOM_YASM_WIN64
+ ; Win64: argument x is the x-th 8 byte slot starting 16 bytes above rbp;
+ ; SHADOW_ARGS_TO_STACK stores the register arguments into these slots.
+ %define arg(x) [rbp+16+8*x]
+ %else
+ ; Other 64 bit targets: argument x is the x-th 8 byte slot below rbp, which
+ ; is where the pushes made by SHADOW_ARGS_TO_STACK land.
+ ; NOTE(review): this assumes rsp equals rbp when SHADOW_ARGS_TO_STACK runs
+ ; -- confirm at the call sites.
+ %define arg(x) [rbp-8-8*x]
+ %endif
+%endif
+
+; REG_SZ_BYTES, REG_SZ_BITS
+; Size of a register
+%if ABI_IS_32BIT
+%define REG_SZ_BYTES 4
+%define REG_SZ_BITS 32
+%else
+%define REG_SZ_BYTES 8
+%define REG_SZ_BITS 64
+%endif
+
+
+; ALIGN_STACK <alignment> <register>
+; This macro aligns the stack to the given alignment (in bytes). The stack
+; is left such that the previous value of the stack pointer is the first
+; argument on the stack (ie, the inverse of this macro is 'pop rsp.')
+; This macro uses one temporary register, which is not preserved, and thus
+; must be specified as an argument.
+; Parameter 1 is the alignment in bytes, parameter 2 the scratch register.
+; NOTE(review): the "and" with the negated alignment only rounds down
+; correctly when the alignment is a power of two -- confirm callers comply.
+%macro ALIGN_STACK 2
+ ; keep the caller's stack pointer in the scratch register
+ mov %2, rsp
+ ; round rsp down to a multiple of the alignment
+ and rsp, -%1
+ ; reserve (alignment - REG_SZ_BYTES) so that the push below leaves rsp on a
+ ; multiple of the alignment again
+ lea rsp, [rsp - (%1 - REG_SZ_BYTES)]
+ ; the original stack pointer ends up at the top of the aligned stack
+ push %2
+%endmacro
+
+
+;
+; The Microsoft assembler tries to impose a certain amount of type safety in
+; its register usage. YASM doesn't recognize these directives, so we just
+; %define them away to maintain as much compatibility as possible with the
+; original inline assembler we're porting from.
+;
+%idefine PTR
+%idefine XMMWORD
+%idefine MMWORD
+
+; PIC macros
+;
+; GET_GOT <register>
+; Prepares <register> as the base that GLOBAL(x) uses to address data in
+; position independent 32 bit builds. The macro first pushes the previous
+; value of <register>, and RESTORE_GOT is (re)defined to pop it again. On
+; every other configuration GET_GOT and RESTORE_GOT expand to nothing.
+%if ABI_IS_32BIT
+ %if CONFIG_PIC=1
+ %ifidn __OUTPUT_FORMAT__,elf32
+ %define WRT_PLT wrt ..plt
+ %macro GET_GOT 1
+ extern _GLOBAL_OFFSET_TABLE_
+ push %1
+ ; the call pushes the address of the sub_offset label as return address
+ call %%get_got
+ %%sub_offset:
+ jmp %%exitGG
+ %%get_got:
+ ; read that return address without popping it, then turn it into the GOT
+ ; address
+ mov %1, [esp]
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+ ret
+ %%exitGG:
+ ; from here on GLOBAL(x) addresses x relative to the GOT register
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 wrt ..gotoff
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %elifidn __OUTPUT_FORMAT__,macho32
+ %macro GET_GOT 1
+ push %1
+ ; call to the next instruction, then pop the pushed return address: the
+ ; register now holds the run time address of the get_got label
+ call %%get_got
+ %%get_got:
+ pop %1
+ ; GLOBAL(x) addresses x relative to that label
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 - %%get_got
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %endif
+ %endif
+
+ ; HIDDEN_DATA(x): only CHROMIUM builds for macho32 decorate the symbol.
+ %ifdef CHROMIUM
+ %ifidn __OUTPUT_FORMAT__,macho32
+ %define HIDDEN_DATA(x) x:private_extern
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+%else
+ ; 64 bit: no GOT register is needed, GLOBAL(x) uses rel addressing.
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) rel x
+ %ifidn __OUTPUT_FORMAT__,elf64
+ %define WRT_PLT wrt ..plt
+ %define HIDDEN_DATA(x) x:data hidden
+ %elifidn __OUTPUT_FORMAT__,elfx32
+ %define WRT_PLT wrt ..plt
+ %define HIDDEN_DATA(x) x:data hidden
+ %elifidn __OUTPUT_FORMAT__,macho64
+ %ifdef CHROMIUM
+ %define HIDDEN_DATA(x) x:private_extern
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+%endif
+; Fallbacks for configurations that defined nothing above (for example 32 bit
+; builds without PIC): empty GET_GOT, GLOBAL(x) is plain x, and RESTORE_GOT /
+; WRT_PLT expand to nothing.
+%ifnmacro GET_GOT
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+%define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+%define WRT_PLT
+%endif
+
+; SHADOW_ARGS_TO_STACK <argc>
+; Copies the first <argc> arguments to the stack locations that arg(x)
+; reads. UNSHADOW_ARGS undoes the stack adjustment where one was made.
+%if ABI_IS_32BIT
+ ; 32 bit: arg(x) reads the stack directly, so both macros expand to nothing.
+ %macro SHADOW_ARGS_TO_STACK 1
+ %endm
+ %define UNSHADOW_ARGS
+%else
+%if LIBAOM_YASM_WIN64
+ ; Win64: store up to four register arguments (rcx, rdx, r8, r9) into
+ ; arg(0)..arg(3). rsp is not changed by this variant.
+ %macro SHADOW_ARGS_TO_STACK 1 ; argc
+ %if %1 > 0
+ mov arg(0),rcx
+ %endif
+ %if %1 > 1
+ mov arg(1),rdx
+ %endif
+ %if %1 > 2
+ mov arg(2),r8
+ %endif
+ %if %1 > 3
+ mov arg(3),r9
+ %endif
+ %endm
+%else
+ ; Other 64 bit targets: push up to six register arguments in the order
+ ; rdi, rsi, rdx, rcx, r8, r9. Any further arguments are copied from the
+ ; caller's stack, starting at rbp+16, through rax (which is overwritten).
+ %macro SHADOW_ARGS_TO_STACK 1 ; argc
+ %if %1 > 0
+ push rdi
+ %endif
+ %if %1 > 1
+ push rsi
+ %endif
+ %if %1 > 2
+ push rdx
+ %endif
+ %if %1 > 3
+ push rcx
+ %endif
+ %if %1 > 4
+ push r8
+ %endif
+ %if %1 > 5
+ push r9
+ %endif
+ %if %1 > 6
+ %assign i %1-6
+ %assign off 16
+ %rep i
+ mov rax,[rbp+off]
+ push rax
+ %assign off off+8
+ %endrep
+ %endif
+ %endm
+%endif
+ ; Discards everything pushed since rbp was set up by resetting rsp to rbp.
+ %define UNSHADOW_ARGS mov rsp, rbp
+%endif
+
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then the call pushes an
+; 8 byte return address. Typically we follow this up with 'push rbp' -
+; re-aligning the stack - but in some cases this is not done and unaligned
+; movs must be used.
+; RESTORE_XMM
+; reload the registers stored by the most recent SAVE_XMM and release the
+; stack space it reserved.
+; On every other target both macros expand to nothing.
+%if LIBAOM_YASM_WIN64
+%macro SAVE_XMM 1-2 a
+ %if %1 < 6
+ %error Only xmm registers 6-15 must be preserved
+ %else
+ ; remember the range and the mov flavour (movdqa by default, movdqu when
+ ; the second parameter is u) for the matching RESTORE_XMM
+ %assign last_xmm %1
+ %define movxmm movdq %+ %2
+ ; one 16 byte slot per saved register
+ %assign xmm_stack_space ((last_xmm - 5) * 16)
+ sub rsp, xmm_stack_space
+ %assign i 6
+ %rep (last_xmm - 5)
+ movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+ %assign i i+1
+ %endrep
+ %endif
+%endmacro
+%macro RESTORE_XMM 0
+ %ifndef last_xmm
+ %error RESTORE_XMM must be paired with SAVE_XMM n
+ %else
+ ; reload from the same slots, walking from the highest register down to 6
+ %assign i last_xmm
+ %rep (last_xmm - 5)
+ movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+ %assign i i-1
+ %endrep
+ add rsp, xmm_stack_space
+ ; there are a couple functions which return from multiple places.
+ ; otherwise, we could uncomment these:
+ ; %undef last_xmm
+ ; %undef xmm_stack_space
+ ; %undef movxmm
+ %endif
+%endmacro
+%else
+%macro SAVE_XMM 1-2
+%endmacro
+%macro RESTORE_XMM 0
+%endmacro
+%endif
+
+; Name of the rodata section
+;
+; .rodata seems to be an elf-ism, as it doesn't work on OSX.
+;
+; Both Mach-O formats place read-only data in the .text section.
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SECTION_RODATA section .text
+%elifidn __OUTPUT_FORMAT__,macho32
+; Same result as the macho64 branch, but spelled as a zero-argument macro.
+%macro SECTION_RODATA 0
+section .text
+%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+; a.out output uses the .data section instead.
+%define SECTION_RODATA section .data
+%else
+; Every other output format uses .rodata.
+%define SECTION_RODATA section .rodata
+%endif
+
+
+; Tell GNU ld that we don't require an executable stack.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elfx32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
diff --git a/third_party/aom/aom_scale/aom_scale.cmake b/third_party/aom/aom_scale/aom_scale.cmake
new file mode 100644
index 0000000000..ea94dbc063
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale.cmake
@@ -0,0 +1,37 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+ return()
+endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
+set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1)
+
+# Headers and sources that setup_aom_scale_targets() compiles into the
+# aom_scale object library.
+list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
+ "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+ "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+ "${AOM_ROOT}/aom_scale/yv12config.h")
+
+# Creates the aom_scale build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_scale_targets)
+  # Build the aom_scale sources as an object library so the object files can
+  # be folded into the libaom targets below.
+  add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
+
+  # Add the aom_scale objects to aom once (the original issued this call twice
+  # in a row), and to aom_static when shared libraries are being built.
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  endif()
+
+  # Pass the new lib targets up to the parent scope instance of
+  # $AOM_LIB_TARGETS.
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/aom_scale/aom_scale.h b/third_party/aom/aom_scale/aom_scale.h
new file mode 100644
index 0000000000..11812a1453
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_AOM_SCALE_H_
+#define AOM_AOM_SCALE_AOM_SCALE_H_
+
+#include "aom_scale/yv12config.h"
+
+/* Scales the first num_planes planes of src into dst.
+ * As implemented in aom_scale/generic/aom_scale.c, the scaled luma size is
+ * ceil(y_width * hratio / hscale) by ceil(y_height * vratio / vscale), and
+ * the planes after the first use that size shifted right by one.
+ * temp_area is scratch memory for intermediate rows and temp_height is its
+ * height; interlaced selects point sampling instead of interpolation for
+ * the 2:1 vertical case. */
+extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ unsigned char *temp_area, unsigned char temp_height,
+ unsigned int hscale, unsigned int hratio,
+ unsigned int vscale, unsigned int vratio,
+ unsigned int interlaced, const int num_planes);
+
+#endif // AOM_AOM_SCALE_AOM_SCALE_H_
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c
new file mode 100644
index 0000000000..93def357d8
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+/* Entry point for aom_scale run-time CPU dispatch setup. */
+void aom_scale_rtcd(void) {
+  /* Hand setup_rtcd_internal to aom_once() instead of calling it directly.
+   * NOTE(review): presumably aom_once() runs the callback at most once per
+   * process -- confirm in aom_ports/aom_once.h. */
+  aom_once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.pl b/third_party/aom/aom_scale/aom_scale_rtcd.pl
new file mode 100644
index 0000000000..ae0a85687f
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.pl
@@ -0,0 +1,55 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+# Prints the forward declaration of struct yv12_buffer_config, the struct
+# type that the prototypes registered in this file take by pointer.
+# NOTE(review): presumably the text ends up in the generated RTCD header via
+# forward_decls -- confirm in the rtcd generator.
+sub aom_scale_forward_decls() {
+print <<EOF
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/aom_scale_forward_decls/;
+
+# Scaler functions
+# The fixed-ratio line/band scalers are only registered when the build
+# configuration reports CONFIG_SPATIAL_RESAMPLING as "yes".
+if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
+ # 5 -> 4 scalers
+ add_proto qw/void aom_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_4_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ # 5 -> 3 scalers
+ add_proto qw/void aom_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_3_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ # 2 -> 1 scalers; the _i vertical variant is the one Scale2D selects for
+ # non-interlaced content.
+ add_proto qw/void aom_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+}
+
+# Buffer reallocation with a new border size.
+add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes";
+
+# Border extension for YV12 buffers.
+add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+
+# Whole-frame and whole-plane copies.
+add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes";
+
+add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
+add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
+
+add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
+
+# Partial copies. The plain variants take separate source (hstart1..vend1)
+# and destination (hstart2, vstart2) coordinates; the coloc variants take a
+# single rectangle.
+add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2";
+add_proto qw/void aom_yv12_partial_coloc_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend";
+add_proto qw/void aom_yv12_partial_copy_u/, "const struct yv12_buffer_config *src_bc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_bc, int hstart2, int vstart2";
+add_proto qw/void aom_yv12_partial_coloc_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_bc, int hstart2, int vstart2";
+add_proto qw/void aom_yv12_partial_coloc_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
+# Border extension entry points.
+add_proto qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end";
+
+add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+
+add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+
+add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
+# A Perl file pulled in by another script must end with a true value.
+1;
diff --git a/third_party/aom/aom_scale/generic/aom_scale.c b/third_party/aom/aom_scale/generic/aom_scale.c
new file mode 100644
index 0000000000..206c42c9f5
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/aom_scale.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/****************************************************************************
+ *
+ * Module Title : aom_scale.c
+ *
+ * Description : Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+ * Header Files
+ ****************************************************************************/
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+typedef struct {
+ int expanded_frame_width;
+ int expanded_frame_height;
+
+ int HScale;
+ int HRatio;
+ int VScale;
+ int VRatio;
+
+ YV12_BUFFER_CONFIG *src_yuv_config;
+ YV12_BUFFER_CONFIG *dst_yuv_config;
+
+} SCALE_VARS;
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_i
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination
+ * (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 interpolated scaling.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_i(const unsigned char *source, int source_step,
+                          unsigned int source_scale, unsigned int source_length,
+                          unsigned char *dest, int dest_step,
+                          unsigned int dest_scale, unsigned int dest_length) {
+  /* Consume every other source sample. */
+  const int stride = source_step * 2;
+  const unsigned char *const out_end = dest + dest_length * dest_step;
+  const unsigned char *tap = source + stride;
+  unsigned char *out = dest + dest_step;
+
+  (void)source_length;
+  (void)source_scale;
+  (void)dest_scale;
+
+  /* The first output has no left neighbour, so it is copied unfiltered. */
+  dest[0] = source[0];
+
+  /* Remaining outputs: 3/10/3 weighted sum of the previous, current and next
+   * retained samples, rounded and divided by 16. */
+  for (; out < out_end; out += dest_step, tap += stride) {
+    const unsigned int weighted =
+        3 * tap[-stride] + 10 * tap[0] + 3 * tap[stride];
+    *out = (unsigned char)((weighted + 8) >> 4);
+  }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_ps
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination
+ * (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 point subsampled scaling.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_ps(const unsigned char *source, int source_step,
+                           unsigned int source_scale,
+                           unsigned int source_length, unsigned char *dest,
+                           int dest_step, unsigned int dest_scale,
+                           unsigned int dest_length) {
+  /* Point sampling: keep every other source sample and drop the rest. */
+  const int stride = source_step * 2;
+  const unsigned char *const out_end = dest + dest_length * dest_step;
+  unsigned char *out;
+
+  (void)source_length;
+  (void)source_scale;
+  (void)dest_scale;
+
+  for (out = dest; out < out_end; out += dest_step, source += stride) {
+    *out = *source;
+  }
+}
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source.
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination.
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs linear interpolation in one dimension.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_c(const unsigned char *source, int source_step,
+                      unsigned int source_scale, unsigned int source_length,
+                      unsigned char *dest, int dest_step,
+                      unsigned int dest_scale, unsigned int dest_length) {
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
+  const unsigned int round_value = dest_scale / 2;
+  /* The two weights always sum to dest_scale; the first output sample is
+   * taken entirely from the left pixel. */
+  unsigned int left_modifier = dest_scale;
+  unsigned int right_modifier = 0;
+  unsigned char left_pixel = source[0];
+  unsigned char right_pixel = source[source_step];
+
+  (void)source_length;
+
+  /* These asserts are needed if there are boundary issues... */
+  /* assert ( dest_scale > source_scale );*/
+  /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) *
+   * source_scale);*/
+
+  while (dest < dest_end) {
+    /* Weighted average of the two neighbouring source pixels, rounded. */
+    *dest = (unsigned char)((left_modifier * left_pixel +
+                             right_modifier * right_pixel + round_value) /
+                            dest_scale);
+
+    /* Move on to the next output sample. The loop previously never advanced
+     * dest, so it could not terminate for a non-empty destination. */
+    dest += dest_step;
+
+    right_modifier += source_scale;
+
+    /* Step the source window forward whenever the accumulated position has
+     * moved past the current right pixel. Note that this reads one sample
+     * beyond the new left pixel. */
+    while (right_modifier > dest_scale) {
+      right_modifier -= dest_scale;
+      source += source_step;
+      left_pixel = source[0];
+      right_pixel = source[source_step];
+    }
+
+    left_modifier = dest_scale - right_modifier;
+  }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : Scale2D
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be
+ * scaled.
+ * int source_pitch : Stride of source image.
+ * unsigned int source_width : Width of input image.
+ * unsigned int source_height : Height of input image.
+ * unsigned char *dest : Pointer to output data
+ * array.
+ * int dest_pitch : Stride of destination
+ * image.
+ * unsigned int dest_width : Width of destination image.
+ * unsigned int dest_height : Height of destination
+ * image.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor
+ * numerator.
+ * unsigned int hratio : Horizontal scale factor
+ * denominator.
+ * unsigned int vscale : Vertical scale factor
+ * numerator.
+ * unsigned int vratio : Vertical scale factor
+ * denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-tap linear interpolation in two dimensions.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching.
+ *
+ ****************************************************************************/
+static void Scale2D(
+ /*const*/
+ unsigned char *source, int source_pitch, unsigned int source_width,
+ unsigned int source_height, unsigned char *dest, int dest_pitch,
+ unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area,
+ unsigned char temp_area_height, unsigned int hscale, unsigned int hratio,
+ unsigned int vscale, unsigned int vratio, unsigned int interlaced) {
+ unsigned int i, j, k;
+ unsigned int bands;
+ unsigned int dest_band_height;
+ unsigned int source_band_height;
+
+ typedef void (*Scale1D)(const unsigned char *source, int source_step,
+ unsigned int source_scale, unsigned int source_length,
+ unsigned char *dest, int dest_step,
+ unsigned int dest_scale, unsigned int dest_length);
+
+ /* Generic 1-D scalers, used only when no fixed-ratio scaler applies. */
+ Scale1D Scale1Dv = scale1d_c;
+ Scale1D Scale1Dh = scale1d_c;
+
+ /* Fixed-ratio scalers, selected by the two switch statements below. */
+ void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *,
+ unsigned int) = NULL;
+ void (*vert_band_scale)(unsigned char *, int, unsigned char *, int,
+ unsigned int) = NULL;
+
+ /* Cleared as soon as either direction has no fixed-ratio scaler. */
+ int ratio_scalable = 1;
+ /* Set when the interpolating 2:1 vertical scaler is chosen. */
+ int interpolation = 0;
+
+ unsigned char *source_base;
+ unsigned char *line_src;
+
+ source_base = (unsigned char *)source;
+
+ /* With a negative pitch the lowest address of the image is its last row. */
+ if (source_pitch < 0) {
+ int offset;
+
+ offset = (source_height - 1);
+ offset *= source_pitch;
+
+ source_base += offset;
+ }
+
+ /* find out the ratio for each direction */
+ /* The ratio is classified as the integer hratio * 10 / hscale:
+ * 8 -> 4/5, 6 -> 3/5, 5 -> 1/2.
+ * NOTE(review): hscale == 0 would divide by zero here -- confirm callers
+ * never pass it. */
+ switch (hratio * 10 / hscale) {
+ case 8:
+ /* 4-5 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_5_4_scale;
+ break;
+ case 6:
+ /* 3-5 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_5_3_scale;
+ break;
+ case 5:
+ /* 1-2 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_2_1_scale;
+ break;
+ default:
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
+ ratio_scalable = 0;
+ break;
+ }
+
+ /* Same classification vertically; it also fixes how many source rows form
+ * one band and how many destination rows that band produces. */
+ switch (vratio * 10 / vscale) {
+ case 8:
+ /* 4-5 Scale in vertical direction */
+ vert_band_scale = aom_vertical_band_5_4_scale;
+ source_band_height = 5;
+ dest_band_height = 4;
+ break;
+ case 6:
+ /* 3-5 Scale in vertical direction */
+ vert_band_scale = aom_vertical_band_5_3_scale;
+ source_band_height = 5;
+ dest_band_height = 3;
+ break;
+ case 5:
+ /* 1-2 Scale in vertical direction */
+
+ if (interlaced) {
+ /* if the content is interlaced, point sampling is used */
+ vert_band_scale = aom_vertical_band_2_1_scale;
+ } else {
+ interpolation = 1;
+ /* if the content is progressive, interpolate */
+ vert_band_scale = aom_vertical_band_2_1_scale_i;
+ }
+
+ source_band_height = 2;
+ dest_band_height = 1;
+ break;
+ default:
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
+ ratio_scalable = 0;
+ break;
+ }
+
+ /* Fast path: both directions have a fixed-ratio scaler. */
+ if (ratio_scalable) {
+ if (source_height == dest_height) {
+ /* Heights match, so only horizontal scaling is applied, row by row. */
+ /* for each band of the image */
+ for (k = 0; k < dest_height; ++k) {
+ horiz_line_scale(source, source_width, dest, dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ /* The interpolating vertical scaler gets one extra row in temp_area row 0,
+ * ahead of the band rows stored from row 1 onwards. */
+ if (interpolation) {
+ if (source < source_base) source = source_base;
+
+ horiz_line_scale(source, source_width, temp_area, dest_width);
+ }
+
+ for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height;
+ ++k) {
+ /* scale one band horizontally */
+ for (i = 0; i < source_band_height; ++i) {
+ /* Trap case where we could read off the base of the source buffer */
+
+ line_src = source + i * source_pitch;
+
+ if (line_src < source_base) line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ temp_area + (i + 1) * dest_pitch, dest_width);
+ }
+
+ /* Vertical scaling is in place */
+ vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch,
+ dest_width);
+
+ /* Carry the last band row over to temp_area row 0 for the next band. */
+ if (interpolation)
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch,
+ dest_width);
+
+ /* Next band... */
+ /* NOTE(review): the products below are computed in unsigned long; with a
+ * negative pitch this relies on wrap-around pointer arithmetic -- confirm
+ * on targets where unsigned long is narrower than a pointer. */
+ source += (unsigned long)source_band_height * source_pitch;
+ dest += (unsigned long)dest_band_height * dest_pitch;
+ }
+
+ return;
+ }
+
+ /* Generic path: arbitrary ratios through the 1-D scalers, with dedicated
+ * 2:1 variants where the ratio is exactly 2 to 1. */
+ if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps;
+
+ if (vscale == 2 && vratio == 1) {
+ if (interlaced)
+ Scale1Dv = scale1d_2t1_ps;
+ else
+ Scale1Dv = scale1d_2t1_i;
+ }
+
+ if (source_height == dest_height) {
+ /* Heights match, so only horizontal scaling is applied, row by row. */
+ /* for each band of the image */
+ for (k = 0; k < dest_height; ++k) {
+ Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
+ dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ /* Size the bands so one band plus the carried-over row fits in temp_area
+ * (temp_area_height rows). */
+ if (dest_height > source_height) {
+ dest_band_height = temp_area_height - 1;
+ source_band_height = dest_band_height * source_height / dest_height;
+ } else {
+ source_band_height = temp_area_height - 1;
+ dest_band_height = source_band_height * vratio / vscale;
+ }
+
+ /* first row needs to be done so that we can stay one row ahead for vertical
+ * zoom */
+ Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio,
+ dest_width);
+
+ /* for each band of the image */
+ bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+ for (k = 0; k < bands; ++k) {
+ /* scale one band horizontally */
+ for (i = 1; i < source_band_height + 1; ++i) {
+ if (k * source_band_height + i < source_height) {
+ Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+ temp_area + i * dest_pitch, 1, hratio, dest_width);
+ } else { /* Duplicate the last row */
+ /* past the end of the source: repeat the previous temp_area row */
+ memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch,
+ dest_pitch);
+ }
+ }
+
+ /* scale one band vertically */
+ /* one call per column: the step between samples is dest_pitch */
+ for (j = 0; j < dest_width; ++j) {
+ Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+ &dest[j], dest_pitch, vratio, dest_band_height);
+ }
+
+ /* copy temp_area row 0 over from last row in the past */
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+ /* move to the next band */
+ source += source_band_height * source_pitch;
+ dest += dest_band_height * dest_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : aom_scale_frame
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be
+ * scaled.
+ * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold
+ * scaled frame.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor
+ * numerator.
+ * unsigned int hratio : Horizontal scale factor
+ * denominator.
+ * unsigned int vscale : Vertical scale factor
+ * numerator.
+ * unsigned int vratio : Vertical scale factor
+ * denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-tap linear interpolation in two dimensions.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching.
+ *
+ ****************************************************************************/
+void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     unsigned char *temp_area, unsigned char temp_height,
+                     unsigned int hscale, unsigned int hratio,
+                     unsigned int vscale, unsigned int vratio,
+                     unsigned int interlaced, const int num_planes) {
+  /* Scaled luma size: source size times ratio / scale, rounded up. */
+  const int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+  const int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+  int plane = 0;
+
+  while (plane < num_planes) {
+    /* Planes after the first use the second entry of the per-plane arrays
+     * and half the scaled size. */
+    const int chroma = plane > 0;
+    const int scaled_w = dw >> chroma;
+    const int scaled_h = dh >> chroma;
+    int row;
+
+    Scale2D((unsigned char *)src->buffers[plane], src->strides[chroma],
+            src->widths[chroma], src->heights[chroma],
+            (unsigned char *)dst->buffers[plane], dst->strides[chroma],
+            scaled_w, scaled_h, temp_area, temp_height, hscale, hratio, vscale,
+            vratio, interlaced);
+
+    /* Destination wider than the scaled image: fill the rest of each scaled
+     * row, starting at the last scaled column, with the pixel before it. */
+    if (scaled_w < dst->widths[chroma]) {
+      for (row = 0; row < scaled_h; ++row) {
+        memset(dst->buffers[plane] + row * dst->strides[chroma] + scaled_w - 1,
+               dst->buffers[plane][row * dst->strides[chroma] + scaled_w - 2],
+               dst->widths[chroma] - scaled_w + 1);
+      }
+    }
+
+    /* Destination taller than the scaled image: overwrite every row from the
+     * last scaled row downwards with a copy of the row before it. */
+    if (scaled_h < dst->heights[chroma]) {
+      for (row = scaled_h - 1; row < dst->heights[chroma]; ++row) {
+        memcpy(dst->buffers[plane] + row * dst->strides[chroma],
+               dst->buffers[plane] + (scaled_h - 2) * dst->strides[chroma],
+               dst->widths[chroma] + 1);
+      }
+    }
+
+    ++plane;
+  }
+}
diff --git a/third_party/aom/aom_scale/generic/gen_scalers.c b/third_party/aom/aom_scale/generic/gen_scalers.c
new file mode 100644
index 0000000000..549e2aa690
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/gen_scalers.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_scale/aom_scale.h"
+#include "aom_mem/aom_mem.h"
+/****************************************************************************
+ * Imports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ * ROUTINE : aom_horizontal_line_5_4_scale_c
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling down by 5 to 4.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+
+  // Each group of 5 source pixels produces 4 destination pixels.
+  for (; source < source_end; source += 5, dest += 4) {
+    const unsigned int p0 = source[0];
+    const unsigned int p1 = source[1];
+    const unsigned int p2 = source[2];
+    const unsigned int p3 = source[3];
+    const unsigned int p4 = source[4];
+
+    // First pixel is copied; the others are /256 weighted blends of two
+    // neighbouring source pixels, rounded to nearest (+128).
+    dest[0] = (unsigned char)p0;
+    dest[1] = (unsigned char)((p1 * 192 + p2 * 64 + 128) >> 8);
+    dest[2] = (unsigned char)((p2 * 128 + p3 * 128 + 128) >> 8);
+    dest[3] = (unsigned char)((p3 * 64 + p4 * 192 + 128) >> 8);
+  }
+}
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  // Walk the band column by column: 5 source rows in, 4 destination rows out.
+  for (unsigned int col = 0; col < dest_width; ++col) {
+    const unsigned char *const in = source + col;
+    unsigned char *const out = dest + col;
+    const unsigned int r0 = in[0 * src_pitch];
+    const unsigned int r1 = in[1 * src_pitch];
+    const unsigned int r2 = in[2 * src_pitch];
+    const unsigned int r3 = in[3 * src_pitch];
+    const unsigned int r4 = in[4 * src_pitch];
+
+    // Same weights as the horizontal 5-to-4 scaler, applied down the column.
+    out[0 * dest_pitch] = (unsigned char)r0;
+    out[1 * dest_pitch] = (unsigned char)((r1 * 192 + r2 * 64 + 128) >> 8);
+    out[2 * dest_pitch] = (unsigned char)((r2 * 128 + r3 * 128 + 128) >> 8);
+    out[3 * dest_pitch] = (unsigned char)((r3 * 64 + r4 * 192 + 128) >> 8);
+  }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : aom_horizontal_line_5_3_scale_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling down by 5 to 3.
+ *
+ * SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+
+  // Each group of 5 source pixels produces 3 destination pixels.
+  for (; source < source_end; source += 5, dest += 3) {
+    const unsigned int p0 = source[0];
+    const unsigned int p1 = source[1];
+    const unsigned int p2 = source[2];
+    const unsigned int p3 = source[3];
+    const unsigned int p4 = source[4];
+
+    // First pixel is copied; the other two are 85/171 (out of 256) blends of
+    // adjacent source pixels, rounded to nearest (+128).
+    dest[0] = (unsigned char)p0;
+    dest[1] = (unsigned char)((p1 * 85 + p2 * 171 + 128) >> 8);
+    dest[2] = (unsigned char)((p3 * 171 + p4 * 85 + 128) >> 8);
+  }
+}
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  // Walk the band column by column: 5 source rows in, 3 destination rows out.
+  for (unsigned int col = 0; col < dest_width; ++col) {
+    const unsigned char *const in = source + col;
+    unsigned char *const out = dest + col;
+    const unsigned int r0 = in[0 * src_pitch];
+    const unsigned int r1 = in[1 * src_pitch];
+    const unsigned int r2 = in[2 * src_pitch];
+    const unsigned int r3 = in[3 * src_pitch];
+    const unsigned int r4 = in[4 * src_pitch];
+
+    // Same weights as the horizontal 5-to-3 scaler, applied down the column.
+    out[0 * dest_pitch] = (unsigned char)r0;
+    out[1 * dest_pitch] = (unsigned char)((r1 * 85 + r2 * 171 + 128) >> 8);
+    out[2 * dest_pitch] = (unsigned char)((r3 * 171 + r4 * 85 + 128) >> 8);
+  }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : aom_horizontal_line_2_1_scale_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Copies horizontal line of pixels from source to
+ * destination scaling down by 2 to 1.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+
+  // Keep every other source pixel (no filtering).
+  for (; source < source_end; source += 2) {
+    *dest++ = source[0];
+  }
+}
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  // Neither pitch is needed: this variant copies a single source row
+  // verbatim into the destination row.
+  (void)src_pitch;
+  (void)dest_pitch;
+  memcpy(dest, source, dest_width);
+}
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch,
+                                     unsigned char *dest, int dest_pitch,
+                                     unsigned int dest_width) {
+  // Rows directly above and below the current source row.
+  const unsigned char *const above = source - src_pitch;
+  const unsigned char *const below = source + src_pitch;
+  (void)dest_pitch;
+
+  // 3-tap vertical filter with weights 3/10/3 (sum 16), rounded to nearest.
+  for (unsigned int col = 0; col < dest_width; ++col) {
+    const unsigned int top = above[col] * 3;
+    const unsigned int mid = source[col] * 10;
+    const unsigned int bot = below[col] * 3;
+    dest[col] = (unsigned char)((8 + top + mid + bot) >> 4);
+  }
+}
diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c
new file mode 100644
index 0000000000..94b400b9e0
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/internal/aom_image_internal.h"
+#include "aom_dsp/pyramid.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+
+/****************************************************************************
+ * Exports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+
+// TODO(jkoleszar): Maybe replace this with struct aom_image
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (!ybf) return AOM_CODEC_MEM_ERROR;
+
+  // The pixel storage is released only when a non-zero allocation size is
+  // recorded.
+  if (ybf->buffer_alloc_sz > 0) aom_free(ybf->buffer_alloc);
+
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+  if (ybf->y_pyramid) aom_free_pyramid(ybf->y_pyramid);
+  if (ybf->corners) av1_free_corner_list(ybf->corners);
+#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+
+  aom_remove_metadata_from_frame_buffer(ybf);
+
+  /* y_buffer, u_buffer and v_buffer point into buffer_alloc and are what
+     most functions use. Zero the whole descriptor so that no stale pointer
+     into freed memory can be used by accident. */
+  memset(ybf, 0, sizeof(*ybf));
+  return 0;
+}
+
+static int realloc_frame_buffer_aligned(  // Allocates (or reuses) ybf's pixel storage and fills in its geometry and plane pointers; returns 0 or AOM_CODEC_MEM_ERROR.
+ YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
+ int use_highbitdepth, int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
+ void *cb_priv, const int y_stride, const uint64_t yplane_size,
+ const uint64_t uvplane_size, const int aligned_width,
+ const int aligned_height, const int uv_width, const int uv_height,
+ const int uv_stride, const int uv_border_w, const int uv_border_h,
+ int num_pyramid_levels, int alloc_y_plane_only) {
+ if (ybf) {  // A NULL ybf falls through to the AOM_CODEC_MEM_ERROR return at the bottom.
+ const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;  // 0 is mapped to an alignment of 1.
+ const uint64_t frame_size =
+ (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);  // Y plus two chroma planes, doubled when use_highbitdepth is 1.
+
+ uint8_t *buf = NULL;
+
+#if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
+ // We should only need an 8-bit version of the source frame if we are
+ // encoding in non-realtime mode
+ (void)num_pyramid_levels;
+ assert(num_pyramid_levels == 0);
+#endif // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+ // The size of ybf->buffer_alloc.
+ uint64_t alloc_size = frame_size;
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+ // The size of ybf->y_pyramid
+ if (num_pyramid_levels > 0) {
+ alloc_size += aom_get_pyramid_alloc_size(
+ width, height, num_pyramid_levels, use_highbitdepth);
+ alloc_size += av1_get_corner_list_size();
+ }
+#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+ // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+ // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+ // frame buffers were allocated in a single allocation.
+ if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
+ return AOM_CODEC_MEM_ERROR;
+#endif
+
+ if (cb != NULL) {  // Storage comes from the caller-supplied callback.
+ const int align_addr_extra_size = 31;  // Slack so fb->data can be rounded up to a 32-byte boundary below.
+ const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+ assert(fb != NULL);
+
+ if (external_frame_size != (size_t)external_frame_size)  // Reject sizes that do not survive the cast to size_t.
+ return AOM_CODEC_MEM_ERROR;
+
+ // Allocation to hold larger frame, or first allocation.
+ if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+ return AOM_CODEC_MEM_ERROR;
+
+ if (fb->data == NULL || fb->size < external_frame_size)  // The callback must supply at least the requested size.
+ return AOM_CODEC_MEM_ERROR;
+
+ ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+ // This memset is needed for fixing the issue of using uninitialized
+ // value in msan test. It will cause a perf loss, so only do this for
+ // msan test.
+ memset(ybf->buffer_alloc, 0, (size_t)frame_size);
+#endif
+#endif
+ } else if (frame_size > ybf->buffer_alloc_sz) {  // Internal storage is replaced only when it is too small; otherwise it is reused.
+ // Allocation to hold larger frame, or first allocation.
+ aom_free(ybf->buffer_alloc);
+ ybf->buffer_alloc = NULL;
+ ybf->buffer_alloc_sz = 0;
+
+ if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR;  // Same size_t round-trip check as the callback path.
+
+ ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
+ if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR;
+
+ ybf->buffer_alloc_sz = (size_t)frame_size;
+
+ // This memset is needed for fixing valgrind error from C loop filter
+ // due to access uninitialized memory in frame border. It could be
+ // removed if border is totally removed.
+ memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+ }
+
+ ybf->y_crop_width = width;
+ ybf->y_crop_height = height;
+ ybf->y_width = aligned_width;
+ ybf->y_height = aligned_height;
+ ybf->y_stride = y_stride;
+
+ ybf->uv_crop_width = (width + ss_x) >> ss_x;  // Rounds up when ss_x is 1.
+ ybf->uv_crop_height = (height + ss_y) >> ss_y;  // Rounds up when ss_y is 1.
+ ybf->uv_width = uv_width;
+ ybf->uv_height = uv_height;
+ ybf->uv_stride = uv_stride;
+
+ ybf->border = border;
+ ybf->frame_size = (size_t)frame_size;
+ ybf->subsampling_x = ss_x;
+ ybf->subsampling_y = ss_y;
+
+ buf = ybf->buffer_alloc;
+ if (use_highbitdepth) {
+ // Store uint16 addresses when using 16bit framebuffers
+ buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+ ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ ybf->flags = 0;
+ }
+
+ ybf->y_buffer = (uint8_t *)aom_align_addr(
+ buf + (border * y_stride) + border, aom_byte_align);  // Skip `border` rows and `border` columns from the start of the Y plane.
+ if (!alloc_y_plane_only) {
+ ybf->u_buffer = (uint8_t *)aom_align_addr(
+ buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,  // U plane storage follows the Y plane.
+ aom_byte_align);
+ ybf->v_buffer =
+ (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size +  // V plane storage follows the U plane.
+ (uv_border_h * uv_stride) + uv_border_w,
+ aom_byte_align);
+ } else {
+ ybf->u_buffer = NULL;
+ ybf->v_buffer = NULL;
+ }
+
+ ybf->use_external_reference_buffers = 0;
+
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+ if (ybf->y_pyramid) {  // Any previous pyramid / corner list is dropped before new ones are created.
+ aom_free_pyramid(ybf->y_pyramid);
+ ybf->y_pyramid = NULL;
+ }
+ if (ybf->corners) {
+ av1_free_corner_list(ybf->corners);
+ ybf->corners = NULL;
+ }
+ if (num_pyramid_levels > 0) {
+ ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels,
+ use_highbitdepth);
+ if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR;  // NOTE(review): the geometry fields above are already updated on this error path - confirm callers discard ybf on failure.
+ ybf->corners = av1_alloc_corner_list();
+ if (!ybf->corners) return AOM_CODEC_MEM_ERROR;
+ }
+#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+
+ ybf->corrupted = 0; /* assume not corrupted by errors */
+ return 0;
+ }
+ return AOM_CODEC_MEM_ERROR;
+}
+
+static int calc_stride_and_planesize(
+    const int ss_x, const int ss_y, const int aligned_width,
+    const int aligned_height, const int border, const int byte_alignment,
+    int alloc_y_plane_only, int *y_stride, int *uv_stride,
+    uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) {
+  /* Borders must be a multiple of 32. That restriction is what gives the
+   * chroma rows a 16-byte aligned start without inserting an arbitrary gap
+   * between planes, which would break the semantics of things like
+   * aom_img_set_rect(). */
+  if ((border & 0x1f) != 0) return AOM_CODEC_MEM_ERROR;
+
+  // Luma: stride from the shared helper; plane size covers the aligned
+  // height plus a border above and below, plus alignment slack.
+  const int luma_stride = aom_calc_y_stride(aligned_width, border);
+  *y_stride = luma_stride;
+  *yplane_size =
+      (aligned_height + 2 * border) * (uint64_t)luma_stride + byte_alignment;
+
+  if (alloc_y_plane_only) {
+    *uv_stride = 0;
+    *uvplane_size = 0;
+    return 0;
+  }
+
+  // Chroma: the luma stride and the border are shifted by the subsampling.
+  const int chroma_stride = luma_stride >> ss_x;
+  *uv_stride = chroma_stride;
+  *uvplane_size =
+      (uv_height + 2 * (border >> ss_y)) * (uint64_t)chroma_stride +
+      byte_alignment;
+  return 0;
+}
+
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                             int ss_x, int ss_y, int use_highbitdepth,
+                             int border, int byte_alignment,
+                             aom_codec_frame_buffer_t *fb,
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv,
+                             int num_pyramid_levels, int alloc_y_plane_only) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+    return AOM_CODEC_MEM_ERROR;
+#endif
+
+  if (!ybf) return AOM_CODEC_MEM_ERROR;
+
+  // Luma dimensions are rounded up to a multiple of 8; chroma dimensions and
+  // borders are derived from the luma ones by the subsampling shifts.
+  const int aligned_width = (width + 7) & ~7;
+  const int aligned_height = (height + 7) & ~7;
+  const int uv_width = aligned_width >> ss_x;
+  const int uv_height = aligned_height >> ss_y;
+  const int uv_border_w = border >> ss_x;
+  const int uv_border_h = border >> ss_y;
+
+  int y_stride = 0;
+  int uv_stride = 0;
+  uint64_t yplane_size = 0;
+  uint64_t uvplane_size = 0;
+  const int status = calc_stride_and_planesize(
+      ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
+      alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
+      uv_height);
+  if (status) return status;
+
+  return realloc_frame_buffer_aligned(
+      ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
+      byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
+      aligned_width, aligned_height, uv_width, uv_height, uv_stride,
+      uv_border_w, uv_border_h, num_pyramid_levels, alloc_y_plane_only);
+}
+
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                           int ss_x, int ss_y, int use_highbitdepth, int border,
+                           int byte_alignment, int num_pyramid_levels,
+                           int alloc_y_plane_only) {
+  if (!ybf) return AOM_CODEC_MEM_ERROR;
+
+  // Release whatever the descriptor currently owns, then allocate again
+  // without an external frame-buffer callback.
+  aom_free_frame_buffer(ybf);
+  return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+                                  use_highbitdepth, border, byte_alignment,
+                                  NULL, NULL, NULL, num_pyramid_levels,
+                                  alloc_y_plane_only);
+}
+
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  // Nothing to do for a NULL buffer or one without attached metadata.
+  if (!ybf || !ybf->metadata) return;
+
+  aom_img_metadata_array_free(ybf->metadata);
+  ybf->metadata = NULL;
+}
+
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                      const aom_metadata_array_t *arr) {
+  if (!ybf || !arr || !arr->metadata_array) return -1;
+  // The buffer already holds this exact array: nothing to copy.
+  if (ybf->metadata == arr) return 0;
+
+  // Drop any existing metadata, then build a copy of `arr`.
+  aom_remove_metadata_from_frame_buffer(ybf);
+  ybf->metadata = aom_img_metadata_array_alloc(arr->sz);
+  if (!ybf->metadata) return -1;
+
+  size_t i = 0;
+  while (i < ybf->metadata->sz) {
+    ybf->metadata->metadata_array[i] = aom_img_metadata_alloc(
+        arr->metadata_array[i]->type, arr->metadata_array[i]->payload,
+        arr->metadata_array[i]->sz, arr->metadata_array[i]->insert_flag);
+    if (ybf->metadata->metadata_array[i] == NULL) {
+      // Entry allocation failed: release the partial copy.
+      aom_img_metadata_array_free(ybf->metadata);
+      ybf->metadata = NULL;
+      return -1;
+    }
+    ++i;
+  }
+  ybf->metadata->sz = arr->sz;
+  return 0;
+}
diff --git a/third_party/aom/aom_scale/generic/yv12extend.c b/third_party/aom/aom_scale/generic/yv12extend.c
new file mode 100644
index 0000000000..5546112d40
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/yv12extend.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+                         int height, int extend_top, int extend_left,
+                         int extend_bottom, int extend_right, int v_start,
+                         int v_end) {
+  assert(src != NULL);
+  // Width of one fully extended row (left border + picture + right border).
+  const int linesize = extend_left + extend_right + width;
+  assert(linesize <= src_stride);
+
+  /* Replicate the first and last pixel of rows [v_start, v_end) sideways
+   * into the left and right borders. */
+  uint8_t *line = src + v_start * src_stride;
+  for (int row = v_start; row < v_end; ++row, line += src_stride) {
+    memset(line - extend_left, line[0], extend_left);
+    memset(line + width, line[width - 1], extend_right);
+  }
+
+  /* Replicate the (already widened) first row upwards into the top border,
+   * starting with the topmost border row. */
+  uint8_t *const first_row = src - extend_left;
+  uint8_t *above = first_row + src_stride * -extend_top;
+  for (int n = 0; n < extend_top; ++n, above += src_stride) {
+    memcpy(above, first_row, linesize);
+  }
+
+  /* Replicate the (already widened) last row downwards into the bottom
+   * border. */
+  uint8_t *const last_row = first_row + src_stride * (height - 1);
+  uint8_t *below = last_row;
+  for (int n = 0; n < extend_bottom; ++n) {
+    below += src_stride;
+    memcpy(below, last_row, linesize);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
+                              int height, int extend_top, int extend_left,
+                              int extend_bottom, int extend_right, int v_start,
+                              int v_end) {
+  // Width, in samples, of one fully extended row.
+  const int linesize = extend_left + extend_right + width;
+  assert(linesize <= src_stride);
+  uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+
+  /* Replicate the first and last sample of rows [v_start, v_end) sideways
+   * into the left and right borders. */
+  uint16_t *line = src + v_start * src_stride;
+  for (int row = v_start; row < v_end; ++row, line += src_stride) {
+    aom_memset16(line - extend_left, line[0], extend_left);
+    aom_memset16(line + width, line[width - 1], extend_right);
+  }
+
+  /* Replicate the (already widened) first row upwards into the top border,
+   * starting with the topmost border row. */
+  uint16_t *const first_row = src - extend_left;
+  uint16_t *above = first_row + src_stride * -extend_top;
+  for (int n = 0; n < extend_top; ++n, above += src_stride) {
+    memcpy(above, first_row, linesize * sizeof(uint16_t));
+  }
+
+  /* Replicate the (already widened) last row downwards into the bottom
+   * border. */
+  uint16_t *const last_row = first_row + src_stride * (height - 1);
+  uint16_t *below = last_row;
+  for (int n = 0; n < extend_bottom; ++n) {
+    below += src_stride;
+    memcpy(below, last_row, linesize * sizeof(uint16_t));
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_extend_frame_borders_plane_row_c(const YV12_BUFFER_CONFIG *ybf,
+                                          int plane, int v_start, int v_end) {
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  // Index 0 selects the luma geometry, index 1 the shared chroma geometry.
+  const int is_uv = plane > 0;
+  const int crop_w = ybf->crop_widths[is_uv];
+  const int crop_h = ybf->crop_heights[is_uv];
+
+  // Chroma borders are the luma border shifted by the subsampling; the
+  // bottom/right borders also cover the aligned-minus-cropped padding.
+  const int top = ybf->border >> (is_uv ? ybf->subsampling_y : 0);
+  const int left = ybf->border >> (is_uv ? ybf->subsampling_x : 0);
+  const int bottom = top + ybf->heights[is_uv] - crop_h;
+  const int right = left + ybf->widths[is_uv] - crop_w;
+
+  // The top border is written only by the call that starts at row 0, the
+  // bottom border only by the call that ends at the last cropped row.
+  const int ext_top = (v_start == 0) ? top : 0;
+  const int ext_bottom = (v_end == crop_h) ? bottom : 0;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], crop_w, crop_h,
+                      ext_top, left, ext_bottom, right, v_start, v_end);
+    return;
+  }
+#endif
+
+  extend_plane(ybf->buffers[plane], ybf->strides[is_uv], crop_w, crop_h,
+               ext_top, left, ext_bottom, right, v_start, v_end);
+}
+
+void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                     const int num_planes) {
+  assert(ybf->border % 2 == 0);
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_uv = plane > 0;
+    const int crop_w = ybf->crop_widths[is_uv];
+    const int crop_h = ybf->crop_heights[is_uv];
+    // Chroma planes use half of the luma border in both directions.
+    const int plane_border = ybf->border >> is_uv;
+    // Bottom/right also cover the aligned-minus-cropped padding.
+    const int ext_bottom = plane_border + ybf->heights[is_uv] - crop_h;
+    const int ext_right = plane_border + ybf->widths[is_uv] - crop_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], crop_w,
+                        crop_h, plane_border, plane_border, ext_bottom,
+                        ext_right, 0, crop_h);
+      continue;
+    }
+#endif
+
+    extend_plane(ybf->buffers[plane], ybf->strides[is_uv], crop_w, crop_h,
+                 plane_border, plane_border, ext_bottom, ext_right, 0, crop_h);
+  }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
+                         const int num_planes) {
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_uv = plane > 0;
+    const int crop_w = ybf->crop_widths[is_uv];
+    const int crop_h = ybf->crop_heights[is_uv];
+    // Chroma borders are ext_size shifted by the per-axis subsampling.
+    const int top = ext_size >> (is_uv ? ybf->subsampling_y : 0);
+    const int left = ext_size >> (is_uv ? ybf->subsampling_x : 0);
+    // Bottom/right also cover the aligned-minus-cropped padding.
+    const int bottom = top + ybf->heights[is_uv] - crop_h;
+    const int right = left + ybf->widths[is_uv] - crop_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], crop_w,
+                        crop_h, top, left, bottom, right, 0, crop_h);
+      continue;
+    }
+#endif
+
+    extend_plane(ybf->buffers[plane], ybf->strides[is_uv], crop_w, crop_h, top,
+                 left, bottom, right, 0, crop_h);
+  }
+}
+
+void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) {  // Extends the first num_planes planes by the buffer's full border.
+ extend_frame(ybf, ybf->border, num_planes);
+}
+
+void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                      const int num_planes) {
+  // Extend by the buffer border, capped at AOMINNERBORDERINPIXELS.
+  int inner_bw = ybf->border;
+  if (inner_bw > AOMINNERBORDERINPIXELS) inner_bw = AOMINNERBORDERINPIXELS;
+  extend_frame(ybf, inner_bw, num_planes);
+}
+
+void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  // Luma only: full border on every side; bottom/right also cover the
+  // aligned-minus-cropped padding.
+  const int ext_size = ybf->border;
+  const int crop_w = ybf->y_crop_width;
+  const int crop_h = ybf->y_crop_height;
+  const int ext_bottom = ext_size + ybf->y_height - crop_h;
+  const int ext_right = ext_size + ybf->y_width - crop_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride, crop_w, crop_h, ext_size,
+                      ext_size, ext_bottom, ext_right, 0, crop_h);
+    return;
+  }
+#endif
+  extend_plane(ybf->y_buffer, ybf->y_stride, crop_w, crop_h, ext_size,
+               ext_size, ext_bottom, ext_right, 0, crop_h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+  // Both arguments are high-bitdepth "byte pointer" handles; convert them
+  // back to 16-bit sample pointers and copy `num` samples.
+  memcpy(CONVERT_TO_SHORTPTR(dst8), CONVERT_TO_SHORTPTR(src8),
+         num * sizeof(uint16_t));
+}
+#endif
+
+// Copies the source image into the destination image and updates the
+// destination's UMV borders.
+// Note: The frames are assumed to be identical in size.
+void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
+                           YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
+  assert(src_bc->y_width == dst_bc->y_width);
+  assert(src_bc->y_height == dst_bc->y_height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
+         (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
+  const int is_highbd = (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+  // Copy the aligned area of each plane row by row, then rebuild the
+  // destination borders from the copied pixels.
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_uv = plane > 0;
+    const int rows = src_bc->heights[is_uv];
+    const int cols = src_bc->widths[is_uv];
+    const uint8_t *from = src_bc->buffers[plane];
+    uint8_t *to = dst_bc->buffers[plane];
+
+    for (int row = 0; row < rows; ++row) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (is_highbd) {
+        memcpy_short_addr(to, from, cols);
+      } else {
+        memcpy(to, from, cols);
+      }
+#else
+      memcpy(to, from, cols);
+#endif
+      from += src_bc->strides[is_uv];
+      to += dst_bc->strides[is_uv];
+    }
+  }
+  aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
+}
+
+void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+                       YV12_BUFFER_CONFIG *dst_ybc) {
+  // Copies the aligned luma area row by row; borders are not touched.
+  const int rows = src_ybc->y_height;
+  const int cols = src_ybc->y_width;
+  const int src_stride = src_ybc->y_stride;
+  const int dst_stride = dst_ybc->y_stride;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(src_ybc->y_buffer);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(dst_ybc->y_buffer);
+    for (int r = 0; r < rows; ++r, from16 += src_stride, to16 += dst_stride) {
+      memcpy(to16, from16, cols * sizeof(uint16_t));
+    }
+    return;
+  }
+#endif
+
+  const uint8_t *from = src_ybc->y_buffer;
+  uint8_t *to = dst_ybc->y_buffer;
+  for (int r = 0; r < rows; ++r, from += src_stride, to += dst_stride) {
+    memcpy(to, from, cols);
+  }
+}
+
+void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
+                       YV12_BUFFER_CONFIG *dst_bc) {
+  // Copies the aligned U plane area row by row; borders are not touched.
+  const int rows = src_bc->uv_height;
+  const int cols = src_bc->uv_width;
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(src_bc->u_buffer);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(dst_bc->u_buffer);
+    for (int r = 0; r < rows; ++r, from16 += src_stride, to16 += dst_stride) {
+      memcpy(to16, from16, cols * sizeof(uint16_t));
+    }
+    return;
+  }
+#endif
+
+  const uint8_t *from = src_bc->u_buffer;
+  uint8_t *to = dst_bc->u_buffer;
+  for (int r = 0; r < rows; ++r, from += src_stride, to += dst_stride) {
+    memcpy(to, from, cols);
+  }
+}
+
+void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
+                       YV12_BUFFER_CONFIG *dst_bc) {
+  // Copies the aligned V plane area row by row; borders are not touched.
+  const int rows = src_bc->uv_height;
+  const int cols = src_bc->uv_width;
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(src_bc->v_buffer);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(dst_bc->v_buffer);
+    for (int r = 0; r < rows; ++r, from16 += src_stride, to16 += dst_stride) {
+      memcpy(to16, from16, cols * sizeof(uint16_t));
+    }
+    return;
+  }
+#endif
+
+  const uint8_t *from = src_bc->v_buffer;
+  uint8_t *to = dst_bc->v_buffer;
+  for (int r = 0; r < rows; ++r, from += src_stride, to += dst_stride) {
+    memcpy(to, from, cols);
+  }
+}
+
+void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, int hstart1,
+                               int hend1, int vstart1, int vend1,
+                               YV12_BUFFER_CONFIG *dst_ybc, int hstart2,
+                               int vstart2) {
+  // Copies luma columns [hstart1, hend1) of rows [vstart1, vend1) from the
+  // source to the destination, placing them at (hstart2, vstart2).
+  const int span = hend1 - hstart1;
+  const int src_stride = src_ybc->y_stride;
+  const int dst_stride = dst_ybc->y_stride;
+  const uint8_t *from = src_ybc->y_buffer + vstart1 * src_stride + hstart1;
+  uint8_t *to = dst_ybc->y_buffer + vstart2 * dst_stride + hstart2;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(from);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(to);
+    for (int row = vstart1; row < vend1; ++row) {
+      memcpy(to16, from16, span * sizeof(uint16_t));
+      from16 += src_stride;
+      to16 += dst_stride;
+    }
+    return;
+  }
+#endif
+
+  for (int row = vstart1; row < vend1; ++row) {
+    memcpy(to, from, span);
+    from += src_stride;
+    to += dst_stride;
+  }
+}
+
+void aom_yv12_partial_coloc_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,  // Partial luma copy to the same (hstart, vstart) position in the destination.
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart,
+ int hend, int vstart, int vend) {
+ aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc,
+ hstart, vstart);  // Destination offsets reuse the source offsets.
+}
+
+void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1,
+                               int hend1, int vstart1, int vend1,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart2,
+                               int vstart2) {
+  // Copies U-plane columns [hstart1, hend1) of rows [vstart1, vend1) from the
+  // source to the destination, placing them at (hstart2, vstart2).
+  const int span = hend1 - hstart1;
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+  const uint8_t *from = src_bc->u_buffer + vstart1 * src_stride + hstart1;
+  uint8_t *to = dst_bc->u_buffer + vstart2 * dst_stride + hstart2;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(from);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(to);
+    for (int row = vstart1; row < vend1; ++row) {
+      memcpy(to16, from16, span * sizeof(uint16_t));
+      from16 += src_stride;
+      to16 += dst_stride;
+    }
+    return;
+  }
+#endif
+
+  for (int row = vstart1; row < vend1; ++row) {
+    memcpy(to, from, span);
+    from += src_stride;
+    to += dst_stride;
+  }
+}
+
+void aom_yv12_partial_coloc_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,  // Partial U-plane copy to the same (hstart, vstart) position in the destination.
+ YV12_BUFFER_CONFIG *dst_bc, int hstart,
+ int hend, int vstart, int vend) {
+ aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
+ vstart);  // Destination offsets reuse the source offsets.
+}
+
+void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1,
+                               int hend1, int vstart1, int vend1,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart2,
+                               int vstart2) {
+  // Copies V-plane columns [hstart1, hend1) of rows [vstart1, vend1) from the
+  // source to the destination, placing them at (hstart2, vstart2).
+  const int span = hend1 - hstart1;
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+  const uint8_t *from = src_bc->v_buffer + vstart1 * src_stride + hstart1;
+  uint8_t *to = dst_bc->v_buffer + vstart2 * dst_stride + hstart2;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *from16 = CONVERT_TO_SHORTPTR(from);
+    uint16_t *to16 = CONVERT_TO_SHORTPTR(to);
+    for (int row = vstart1; row < vend1; ++row) {
+      memcpy(to16, from16, span * sizeof(uint16_t));
+      from16 += src_stride;
+      to16 += dst_stride;
+    }
+    return;
+  }
+#endif
+
+  for (int row = vstart1; row < vend1; ++row) {
+    memcpy(to, from, span);
+    from += src_stride;
+    to += dst_stride;
+  }
+}
+
+void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,  // Partial V-plane copy to the same (hstart, vstart) position in the destination.
+ YV12_BUFFER_CONFIG *dst_bc, int hstart,
+ int hend, int vstart, int vend) {
+ aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
+ vstart);  // Destination offsets reuse the source offsets.
+}
+
+int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border,
+                                       int byte_alignment,
+                                       int num_pyramid_levels, int num_planes) {
+  if (!ybf) return -2;
+  // Already using the requested border: nothing to do.
+  if (ybf->border == new_border) return 0;
+
+  // Allocate a replacement with the same picture geometry and the new border.
+  YV12_BUFFER_CONFIG replacement;
+  memset(&replacement, 0, sizeof(replacement));
+  const int status = aom_alloc_frame_buffer(
+      &replacement, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x,
+      ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border,
+      byte_alignment, num_pyramid_levels, 0);
+  if (status) return status;
+
+  // Move the pixels across, then extend them out to the new border.
+  aom_yv12_copy_frame(ybf, &replacement, num_planes);
+  aom_extend_frame_borders(&replacement, num_planes);
+
+  // Release the old storage and take over the replacement descriptor.
+  aom_free_frame_buffer(ybf);
+  memcpy(ybf, &replacement, sizeof(replacement));
+  return 0;
+}
diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h
new file mode 100644
index 0000000000..f192a3032e
--- /dev/null
+++ b/third_party/aom/aom_scale/yv12config.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_YV12CONFIG_H_
+#define AOM_AOM_SCALE_YV12CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+
+/*!\cond */
+
+#define AOMINNERBORDERINPIXELS 160
+#define AOM_INTERP_EXTEND 4
+#define AOM_BORDER_IN_PIXELS 288
+#define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_ALLINTRA_BORDER 64
+#define AOM_DEC_BORDER_IN_PIXELS 64
+
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+struct image_pyramid;
+struct corner_list;
+#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+
+/*!\endcond */
+/*!
+ * \brief YV12 frame buffer data structure
+ */
+typedef struct yv12_buffer_config {
+ /*!\cond */
+ // Each of the following unions overlays a pair of named fields with a
+ // two-element array, so the same values can be reached by name or by index
+ // (index 0 is the y_* field, index 1 is the uv_* field).
+ union {
+ struct {
+ int y_width;
+ int uv_width;
+ };
+ int widths[2];
+ };
+ union {
+ struct {
+ int y_height;
+ int uv_height;
+ };
+ int heights[2];
+ };
+ union {
+ struct {
+ int y_crop_width;
+ int uv_crop_width;
+ };
+ int crop_widths[2];
+ };
+ union {
+ struct {
+ int y_crop_height;
+ int uv_crop_height;
+ };
+ int crop_heights[2];
+ };
+ union {
+ struct {
+ int y_stride;
+ int uv_stride;
+ };
+ int strides[2];
+ };
+ // Plane pointers; buffers[0], buffers[1] and buffers[2] overlay y_buffer,
+ // u_buffer and v_buffer respectively.
+ union {
+ struct {
+ uint8_t *y_buffer;
+ uint8_t *u_buffer;
+ uint8_t *v_buffer;
+ };
+ uint8_t *buffers[3];
+ };
+
+ // Indicate whether y_buffer, u_buffer, and v_buffer point to the internally
+ // allocated memory or external buffers.
+ int use_external_reference_buffers;
+ // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
+ // uses an external reference, and restore those buffer pointers after the
+ // external reference frame is no longer used.
+ uint8_t *store_buf_adr[3];
+
+ // Global motion search data
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+ // 8-bit downsampling pyramid for the Y plane
+ struct image_pyramid *y_pyramid;
+ struct corner_list *corners;
+#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+
+ uint8_t *buffer_alloc;
+ size_t buffer_alloc_sz;
+ int border;
+ size_t frame_size;
+ int subsampling_x;
+ int subsampling_y;
+ unsigned int bit_depth;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ uint8_t monochrome;
+ aom_chroma_sample_position_t chroma_sample_position;
+ aom_color_range_t color_range;
+ int render_width;
+ int render_height;
+
+ int corrupted;
+ // Bit flags; YV12_FLAG_HIGHBITDEPTH is tested against this field.
+ int flags;
+ aom_metadata_array_t *metadata;
+ /*!\endcond */
+} YV12_BUFFER_CONFIG;
+
+/*!\cond */
+
+#define YV12_FLAG_HIGHBITDEPTH 8
+
+// Allocate a frame buffer
+//
+// If ybf currently contains an image, all associated memory will be freed and
+// then reallocated. In contrast, aom_realloc_frame_buffer() will reuse any
+// existing allocations where possible. So, if ybf is likely to already be
+// set up, please consider aom_realloc_frame_buffer() instead.
+//
+// See aom_realloc_frame_buffer() for the meanings of the arguments, and
+// available return values.
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth, int border,
+ int byte_alignment, int num_pyramid_levels,
+ int alloc_y_plane_only);
+
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
+// NULL, then libaom is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libaom will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libaom will allocate memory
+// internally to decode the current frame.
+//
+// If num_pyramid_levels > 0, then an image pyramid will be allocated with
+// the specified number of levels.
+//
+// Any buffer which may become a source or ref frame buffer in the encoder
+// must have num_pyramid_levels = cpi->image_pyramid_levels. This will cause
+// an image pyramid to be allocated if one is needed.
+//
+// Any other buffers (in particular, any buffers inside the decoder)
+// must have num_pyramid_levels = 0, as a pyramid is unneeded there.
+//
+// Returns 0 on success. Returns < 0 on failure.
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb, void *cb_priv,
+ int num_pyramid_levels, int alloc_y_plane_only);
+
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\endcond */
+/*!\brief Removes metadata from YV12_BUFFER_CONFIG struct.
+ *
+ * Frees metadata in frame buffer.
+ * Frame buffer metadata pointer will be set to NULL.
+ *
+ * \param[in] ybf Frame buffer struct pointer
+ */
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Copy metadata to YV12_BUFFER_CONFIG struct.
+ *
+ * Copies metadata to frame buffer.
+ * Frame buffer will clear any previous metadata and will reallocate the
+ * metadata array to the new metadata size. Then, it will copy the new metadata
+ * array into it.
+ * If arr metadata pointer points to the same address as current metadata in the
+ * frame buffer, function will do nothing and return 0.
+ * Returns 0 on success or -1 on failure.
+ *
+ * \param[in] ybf Frame buffer struct pointer
+ * \param[in] arr Metadata array struct pointer
+ */
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+ const aom_metadata_array_t *arr);
+
+/*!\brief Calculate the stride required for the image.
+ *
+ * Calculates the stride value for an image from aligned width and border.
+ * Returns the y stride value.
+ *
+ * \param[in] aligned_width Aligned width of the image
+ * \param[in] border Border in pixels
+ */
+static AOM_INLINE int aom_calc_y_stride(int aligned_width, int border) {
+  // Row width with the border added on both the left and the right side.
+  const int padded_width = aligned_width + 2 * border;
+  // Round up to the next multiple of 32.
+  return (padded_width + 31) & ~31;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_SCALE_YV12CONFIG_H_
diff --git a/third_party/aom/aom_util/aom_thread.c b/third_party/aom/aom_util/aom_thread.c
new file mode 100644
index 0000000000..fa3b0a25e4
--- /dev/null
+++ b/third_party/aom/aom_util/aom_thread.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <assert.h>
+#include <string.h> // for memset()
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/sanitizer.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_MULTITHREAD
+
+// Threading primitives backing one AVxWorker in multithread builds.
+struct AVxWorkerImpl {
+ // Held by thread_loop() and change_state() while they read or write
+ // worker->status_.
+ pthread_mutex_t mutex_;
+ // Waited on and signaled by thread_loop() and change_state() around
+ // status_ changes.
+ pthread_cond_t condition_;
+ // Thread created in reset() and joined in end().
+ pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(AVxWorker *const worker); // Forward declaration.
+
+// Entry point of the worker thread; |ptr| is the AVxWorker it serves.
+// After optionally naming the thread, it loops with the worker mutex held:
+// it sleeps while status_ is OK, runs execute() when status_ is WORK, and
+// leaves the loop for any other status (asserted to be NOT_OK).
+static THREADFN thread_loop(void *ptr) {
+ AVxWorker *const worker = (AVxWorker *)ptr;
+#ifdef __APPLE__
+ if (worker->thread_name != NULL) {
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. The maximum size of the thread_name buffer was
+ // noted in the Chromium source code and was confirmed by experiments. If
+ // thread_name is too long, pthread_setname_np returns -1 with errno
+ // ENAMETOOLONG (63).
+ char thread_name[64];
+ // Truncate to the buffer size and always NUL-terminate.
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(thread_name);
+ }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+ if (worker->thread_name != NULL) {
+ // Linux and Android require names (with nul) fit in 16 chars, otherwise
+ // pthread_setname_np() returns ERANGE (34).
+ char thread_name[16];
+ // Truncate to the buffer size and always NUL-terminate.
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(pthread_self(), thread_name);
+ }
+#endif
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ for (;;) {
+ while (worker->status_ == OK) { // wait in idling mode
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ if (worker->status_ == WORK) {
+ // When worker->status_ is WORK, the main thread doesn't change
+ // worker->status_ and will wait until the worker changes worker->status_
+ // to OK. See change_state(). So the worker can safely call execute()
+ // without holding worker->impl_->mutex_. When the worker reacquires
+ // worker->impl_->mutex_, worker->status_ must still be WORK.
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ execute(worker);
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ assert(worker->status_ == WORK);
+ worker->status_ = OK;
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ } else {
+ assert(worker->status_ == NOT_OK); // finish the worker
+ break;
+ }
+ }
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ return THREAD_RETURN(NULL); // Thread is finished
+}
+
+// main thread state control
+//
+// Waits until the worker is idle (status_ == OK) and then, when |new_status|
+// is not OK, stores it in status_ and wakes the worker thread. Calling it with
+// OK therefore only waits. Does nothing when the worker has no impl_ or when
+// status_ is below OK.
+static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) {
+ // No-op when attempting to change state on a thread that didn't come up.
+ // Checking status_ without acquiring the lock first would result in a data
+ // race.
+ if (worker->impl_ == NULL) return;
+
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ if (worker->status_ >= OK) {
+ // wait for the worker to finish
+ while (worker->status_ != OK) {
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ // assign new status and release the working thread if needed
+ if (new_status != OK) {
+ worker->status_ = new_status;
+ pthread_cond_signal(&worker->impl_->condition_);
+ }
+ }
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+}
+
+#endif // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+// Puts |worker| into its initial state: every field zeroed and the status
+// marked NOT_OK.
+static void init(AVxWorker *const worker) {
+  memset(worker, 0, sizeof(AVxWorker));
+  worker->status_ = NOT_OK;  // reset() is what moves the worker to OK
+}
+
+// Waits for any in-flight work to finish. Returns nonzero when no hook call
+// has recorded an error in worker->had_error, 0 otherwise.
+static int sync(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  // Requesting OK makes change_state() wait until the worker is idle.
+  change_state(worker, OK);
+#endif
+  assert(worker->status_ <= OK);
+  const int succeeded = (worker->had_error == 0);
+  return succeeded;
+}
+
+// Brings |worker| to the OK state and clears had_error.
+//  - status_ < OK: the worker is not started. In multithread builds this
+//    allocates impl_, initializes its mutex and condition variable and creates
+//    the thread; otherwise it just sets status_ to OK.
+//  - status_ > OK: the worker is busy; waits for it through sync().
+// Returns nonzero on success. Returns 0 when thread setup fails (impl_ is then
+// left NULL) or when sync() reports an error.
+static int reset(AVxWorker *const worker) {
+ int ok = 1;
+ worker->had_error = 0;
+ if (worker->status_ < OK) {
+#if CONFIG_MULTITHREAD
+ worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_));
+ if (worker->impl_ == NULL) {
+ return 0;
+ }
+ if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+ goto Error;
+ }
+ if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ goto Error;
+ }
+ pthread_attr_t attr;
+ if (pthread_attr_init(&attr)) goto Error2;
+ // Debug ASan builds require at least ~1MiB of stack; prevents
+ // failures on macOS arm64 where the default is 512KiB.
+ // See: https://crbug.com/aomedia/3379
+#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \
+ !defined(NDEBUG)
+ size_t stacksize;
+ if (!pthread_attr_getstacksize(&attr, &stacksize)) {
+ const size_t kMinStackSize = 1 << 20; // 1 MiB
+ if (stacksize < kMinStackSize &&
+ pthread_attr_setstacksize(&attr, kMinStackSize)) {
+ pthread_attr_destroy(&attr);
+ goto Error2;
+ }
+ }
+#endif
+ // thread_loop() locks this same mutex before it reads status_, so holding
+ // it across pthread_create() means the new thread sees status_ only after
+ // it has been set below.
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker);
+ if (ok) worker->status_ = OK;
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ pthread_attr_destroy(&attr);
+ if (!ok) {
+ // Error2: both the mutex and the condition variable are initialized.
+ Error2:
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ // Error: only the impl_ allocation remains to be released.
+ Error:
+ aom_free(worker->impl_);
+ worker->impl_ = NULL;
+ return 0;
+ }
+#else
+ worker->status_ = OK;
+#endif
+ } else if (worker->status_ > OK) {
+ ok = sync(worker);
+ }
+ assert(!ok || (worker->status_ == OK));
+ return ok;
+}
+
+// Calls the worker's hook, if any, with data1 and data2. A zero return from
+// the hook is recorded in worker->had_error; an earlier error is never
+// cleared here.
+static void execute(AVxWorker *const worker) {
+  if (worker->hook == NULL) return;
+  const int hook_ok = worker->hook(worker->data1, worker->data2);
+  worker->had_error |= !hook_ok;
+}
+
+// Starts the worker's job: runs the hook inline when threading is compiled
+// out, otherwise hands it to the worker thread by switching the state to WORK.
+static void launch(AVxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  execute(worker);
+#else
+  change_state(worker, WORK);
+#endif
+}
+
+// Shuts the worker down. In multithread builds with a live impl_, it tells
+// the thread to exit (status NOT_OK), joins it, then destroys the mutex and
+// condition variable and frees impl_. Without threading it only marks the
+// worker NOT_OK.
+static void end(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ if (worker->impl_ != NULL) {
+ change_state(worker, NOT_OK);
+ // The thread must have exited before its mutex/condition are destroyed.
+ pthread_join(worker->impl_->thread_, NULL);
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ aom_free(worker->impl_);
+ worker->impl_ = NULL;
+ }
+#else
+ worker->status_ = NOT_OK;
+ assert(worker->impl_ == NULL);
+#endif
+ assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+// Active worker interface, initialized to the implementations in this file.
+// aom_set_worker_interface() overwrites it; aom_get_worker_interface()
+// returns its address.
+static AVxWorkerInterface g_worker_interface = { init, reset, sync,
+ launch, execute, end };
+
+// Replaces the active worker interface with a copy of |winterface|.
+// Returns 1 on success. Returns 0, leaving the current interface untouched,
+// when |winterface| is NULL or any of its function pointers is NULL.
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface) {
+  if (winterface == NULL) return 0;
+
+  // Every entry point must be provided; reject a partially filled table.
+  const int is_complete =
+      winterface->init != NULL && winterface->reset != NULL &&
+      winterface->sync != NULL && winterface->launch != NULL &&
+      winterface->execute != NULL && winterface->end != NULL;
+  if (!is_complete) return 0;
+
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+// Returns a pointer to the currently installed worker interface.
+const AVxWorkerInterface *aom_get_worker_interface(void) {
+  const AVxWorkerInterface *const current = &g_worker_interface;
+  return current;
+}
+
+//------------------------------------------------------------------------------
diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h
new file mode 100644
index 0000000000..ec2ea43491
--- /dev/null
+++ b/third_party/aom/aom_util/aom_thread.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+#ifndef AOM_AOM_UTIL_AOM_THREAD_H_
+#define AOM_AOM_UTIL_AOM_THREAD_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NUM_THREADS 64
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef int pthread_attr_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+// No-op stand-in: |attr| is not used by this emulation layer. Always returns 0.
+static INLINE int pthread_attr_init(pthread_attr_t *attr) {
+ (void)attr;
+ return 0;
+}
+
+// No-op stand-in: there is nothing to release for |attr|. Always returns 0.
+static INLINE int pthread_attr_destroy(pthread_attr_t *attr) {
+ (void)attr;
+ return 0;
+}
+
+// Starts a thread running |start| with argument |arg| and stores its handle in
+// |*thread|. |attr| is ignored. CreateThread() is used when USE_CREATE_THREAD
+// is defined and _beginthreadex() otherwise. On success the thread's priority
+// is set to THREAD_PRIORITY_ABOVE_NORMAL.
+// Returns 0 on success and 1 when no thread handle was obtained.
+static INLINE int pthread_create(pthread_t *const thread,
+ const pthread_attr_t *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+ start, arg, 0, /* dwCreationFlags */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+// Waits for |thread| to finish and closes its handle. |value_ptr| is ignored.
+// Returns 0 on success and 1 when the wait or the handle close fails; the
+// handle is not closed when the wait fails.
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  const DWORD wait_result =
+      WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/);
+  (void)value_ptr;
+  if (wait_result != WAIT_OBJECT_0) return 1;
+  return CloseHandle(thread) == 0;
+}
+
+// Mutex
+// Initializes |mutex| as a critical section. |mutexattr| is ignored.
+// Returns 0 on success and 1 when the critical section could not be
+// initialized, so callers that test the result can detect the failure.
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  // InitializeCriticalSectionEx() returns zero on failure; report it rather
+  // than handing back an unusable mutex as if it were valid.
+  if (!InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/)) {
+    return 1;
+  }
+  return 0;
+}
+
+// Tries to take |mutex| without blocking. Returns 0 when the critical section
+// was entered and EBUSY otherwise.
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  if (TryEnterCriticalSection(mutex)) return 0;
+  return EBUSY;
+}
+
+// Enters the critical section behind |mutex|. Always returns 0.
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+// Leaves the critical section behind |mutex|. Always returns 0.
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+// Deletes the critical section behind |mutex|. Always returns 0.
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+// No-op stand-in: nothing is released for |condition|. Always returns 0.
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ (void)condition;
+ return 0;
+}
+
+// Initializes |condition| as a Windows condition variable. |cond_attr| is
+// ignored. Always returns 0.
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+ InitializeConditionVariable(condition);
+ return 0;
+}
+
+// Wakes a single thread waiting on |condition|. Always returns 0.
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ WakeConditionVariable(condition);
+ return 0;
+}
+
+// Wakes every thread waiting on |condition|. Always returns 0.
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ WakeAllConditionVariable(condition);
+ return 0;
+}
+
+// Sleeps on |condition| with no timeout, passing |mutex| as the critical
+// section to SleepConditionVariableCS(). Returns 0 when the wait succeeds and
+// 1 when it fails.
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  return SleepConditionVariableCS(condition, mutex, INFINITE) ? 0 : 1;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_RETURN(val) val
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+//
+// The numeric order NOT_OK < OK < WORK is significant: the default worker
+// implementation in aom_thread.c compares status_ against OK with <, <=, >
+// and >=.
+typedef enum {
+ NOT_OK = 0, // object is unusable
+ OK, // ready to work
+ WORK // busy finishing the current task
+} AVxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
+typedef int (*AVxWorkerHook)(void *, void *);
+
+// Platform-dependent implementation details for the worker.
+typedef struct AVxWorkerImpl AVxWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+ // Platform-specific threading state. With the default implementation it is
+ // allocated by reset() and freed (and set back to NULL) by end().
+ AVxWorkerImpl *impl_;
+ // Current state of the worker; see AVxWorkerStatus.
+ AVxWorkerStatus status_;
+ // Thread name for the debugger. If not NULL, must point to a string that
+ // outlives the worker thread. For portability, use a name <= 15 characters
+ // long (not including the terminating NUL character).
+ const char *thread_name;
+ AVxWorkerHook hook; // hook to call
+ void *data1; // first argument passed to 'hook'
+ void *data2; // second argument passed to 'hook'
+ int had_error; // true if a call to 'hook' returned false
+} AVxWorker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+ // Must be called first, before any other method.
+ void (*init)(AVxWorker *const worker);
+ // Must be called to initialize the object and spawn the thread. Re-entrant.
+ // Will potentially launch the thread. Returns false in case of error.
+ int (*reset)(AVxWorker *const worker);
+ // Makes sure the previous work is finished. Returns true if worker->had_error
+ // was not set and no error condition was triggered by the working thread.
+ int (*sync)(AVxWorker *const worker);
+ // Triggers the thread to call hook() with data1 and data2 arguments. These
+ // hook/data1/data2 values can be changed at any time before calling this
+ // function, but not be changed afterward until the next call to sync().
+ void (*launch)(AVxWorker *const worker);
+ // This function is similar to launch() except that it calls the
+ // hook directly instead of using a thread. Convenient to bypass the thread
+ // mechanism while still using the AVxWorker structs. sync() must
+ // still be called afterward (for error reporting).
+ void (*execute)(AVxWorker *const worker);
+ // Kill the thread and terminate the object. To use the object again, one
+ // must call reset() again.
+ void (*end)(AVxWorker *const worker);
+} AVxWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const AVxWorkerInterface *aom_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_UTIL_AOM_THREAD_H_
diff --git a/third_party/aom/aom_util/aom_util.cmake b/third_party/aom/aom_util/aom_util.cmake
new file mode 100644
index 0000000000..6bf4fafc4c
--- /dev/null
+++ b/third_party/aom/aom_util/aom_util.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Include guard: return immediately if this file was already processed.
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+ return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
+
+# Sources that are always part of aom_util.
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+ "${AOM_ROOT}/aom_util/aom_thread.h"
+ "${AOM_ROOT}/aom_util/endian_inl.h")
+
+# The debug utilities are only added when CONFIG_BITSTREAM_DEBUG is enabled.
+if(CONFIG_BITSTREAM_DEBUG)
+ list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/debug_util.c"
+ "${AOM_ROOT}/aom_util/debug_util.h")
+endif()
+
+# Builds aom_util as an object library from AOM_UTIL_SOURCES and adds its
+# objects to libaom. The aom target (and aom_static, when BUILD_SHARED_LIBS is
+# set) must already exist when this function is called.
+function(setup_aom_util_targets)
+  add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
+
+  # Publish the new target through the caller's AOM_LIB_TARGETS list.
+  list(APPEND AOM_LIB_TARGETS aom_util)
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_util>)
+  endif()
+endfunction()
diff --git a/third_party/aom/aom_util/debug_util.c b/third_party/aom/aom_util/debug_util.c
new file mode 100644
index 0000000000..d0792e34a4
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include "aom_util/debug_util.h"
+
+// Write-side frame index, as last stored by
+// aom_bitstream_queue_set_frame_write().
+static int frame_idx_w = 0;
+
+// Read-side frame index, as last stored by
+// aom_bitstream_queue_set_frame_read().
+static int frame_idx_r = 0;
+
+// Stores |frame_idx| as the write-side frame index.
+void aom_bitstream_queue_set_frame_write(int frame_idx) {
+ frame_idx_w = frame_idx;
+}
+
+// Returns the write-side frame index.
+int aom_bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+
+// Stores |frame_idx| as the read-side frame index.
+void aom_bitstream_queue_set_frame_read(int frame_idx) {
+ frame_idx_r = frame_idx;
+}
+
+// Returns the read-side frame index.
+int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+
+#if CONFIG_BITSTREAM_DEBUG
+// Capacity of the circular queue; the read/write indices wrap at this value.
+#define QUEUE_MAX_SIZE 4000000
+// Parallel arrays: entry i holds the result, the symbol count and the CDF
+// values recorded by one bitstream_queue_push() call.
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0; // index of the next entry to pop
+static int queue_w = 0; // index of the next entry to push
+static int queue_prev_w = -1; // write index saved by bitstream_queue_record_write()
+static int skip_r = 0; // nonzero: bitstream_queue_pop() does nothing
+static int skip_w = 0; // nonzero: bitstream_queue_push() stores nothing
+
+// Enables (nonzero) or disables (zero) skipping of pushes.
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+// Enables (nonzero) or disables (zero) skipping of pops.
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+// Saves the current write index so it can be restored later.
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+// Restores the write index saved by bitstream_queue_record_write().
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+// Returns the current write index.
+int bitstream_queue_get_write(void) { return queue_w; }
+
+// Returns the current read index.
+int bitstream_queue_get_read(void) { return queue_r; }
+
+// Pops the oldest queue entry into |*result|, |*nsymbs| and |cdf| (which
+// receives |*nsymbs| values) and advances the read index. Does nothing while
+// read skipping is enabled. Reading from an empty queue prints a message and
+// asserts.
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
+  if (skip_r) return;
+
+  if (queue_w == queue_r) {
+    printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+    assert(0);
+  }
+  const int slot = queue_r;
+  *result = result_queue[slot];
+  *nsymbs = nsymbs_queue[slot];
+  memcpy(cdf, cdf_queue[slot], *nsymbs * sizeof(*cdf));
+  queue_r = (slot + 1) % QUEUE_MAX_SIZE;
+}
+
+// Appends |result|, |nsymbs| and the first |nsymbs| values of |cdf| to the
+// queue and advances the write index. Nothing is stored while write skipping
+// is enabled. If the write index catches up with the read index after the
+// push, a message is printed and an assert fires.
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  // If you observe a CDF error:
+  // - Set 'debug_cdf_mismatch' to true
+  // - Set target_frame_idx_r and target_queue_r to where CDF error was reported
+  // - Set a breakpoint in debugger at the 'fprintf' below.
+  const bool debug_cdf_mismatch = false;
+  if (debug_cdf_mismatch) {
+    const int target_frame_idx_r = 1;
+    const int target_queue_r = 18005;
+    const bool at_target =
+        frame_idx_w == target_frame_idx_r && queue_w == target_queue_r;
+    if (at_target) {
+      fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+              frame_idx_w, queue_w);
+    }
+  }
+
+  if (skip_w) return;
+
+  const int slot = queue_w;
+  result_queue[slot] = result;
+  nsymbs_queue[slot] = nsymbs;
+  memcpy(cdf_queue[slot], cdf, nsymbs * sizeof(*cdf));
+  queue_w = (slot + 1) % QUEUE_MAX_SIZE;
+  if (queue_w == queue_r) {
+    printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+    assert(0);
+  }
+}
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+// Read and write positions in the ring of recorded frames below.
+static int frame_buf_idx_r = 0;
+static int frame_buf_idx_w = 0;
+// Number of ring slots; matches the first dimension of frame_pre/frame_tx.
+static int max_frame_buf_num = 5;
+#define MAX_FRAME_STRIDE 1280
+#define MAX_FRAME_HEIGHT 720
+// Recorded samples, indexed as [ring slot][plane][row * frame_stride + column].
+static uint16_t
+ frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only
+static uint16_t
+ frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm
+// Geometry of each recorded plane, used for indexing and bounds checks.
+static int frame_stride = MAX_FRAME_STRIDE;
+static int frame_height = MAX_FRAME_HEIGHT;
+static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT;
+// Advances the write position of the frame ring by one slot. If it lands on
+// the read position, a message is printed and an assert fires.
+void mismatch_move_frame_idx_w(void) {
+  const int next_w = (frame_buf_idx_w + 1) % max_frame_buf_num;
+  frame_buf_idx_w = next_w;
+  if (next_w == frame_buf_idx_r) {
+    printf("frame_buf overflow\n");
+    assert(0);
+  }
+}
+
+// Zeroes the first |num_planes| planes of both recordings (prediction and
+// prediction + transform) in the slot at the current write position.
+void mismatch_reset_frame(int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    uint16_t *const pre_plane = frame_pre[frame_buf_idx_w][plane];
+    uint16_t *const tx_plane = frame_tx[frame_buf_idx_w][plane];
+    memset(pre_plane, 0, sizeof(*pre_plane) * frame_size);
+    memset(tx_plane, 0, sizeof(*tx_plane) * frame_size);
+  }
+}
+
+// Advances the read position of the frame ring by one slot. If the ring is
+// empty (read position equals write position) beforehand, a message is
+// printed and an assert fires.
+void mismatch_move_frame_idx_r(void) {
+  const int ring_empty = (frame_buf_idx_w == frame_buf_idx_r);
+  if (ring_empty) {
+    printf("frame_buf underflow\n");
+    assert(0);
+  }
+  frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num;
+}
+
+// Copies a blk_w x blk_h block of samples from |src| into the prediction
+// recording of |plane| at (pixel_c, pixel_r), in the slot at the current write
+// position. When |highbd| is nonzero, |src| is converted with
+// CONVERT_TO_SHORTPTR() and read as 16-bit samples. |frame_offset| is only
+// referenced by the disabled trace code at the end.
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+                               int frame_offset, int plane, int pixel_c,
+                               int pixel_r, int blk_w, int blk_h, int highbd) {
+  const int exceeds_width = pixel_c + blk_w >= frame_stride;
+  const int exceeds_height = pixel_r + blk_h >= frame_height;
+  if (exceeds_width || exceeds_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *const src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  uint16_t *const plane_buf = frame_pre[frame_buf_idx_w][plane];
+  for (int row = 0; row < blk_h; ++row) {
+    const int dst_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    const int src_row_start = row * src_stride;
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = src_row_start + col;
+      plane_buf[dst_row_start + col] = src16 ? src16[src_idx] : src[src_idx];
+    }
+  }
+#if 0
+  int ref_frame_idx = 3;
+  int ref_frame_offset = 4;
+  int ref_plane = 1;
+  int ref_pixel_c = 162;
+  int ref_pixel_r = 16;
+  if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+      frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c &&
+      ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r &&
+      ref_pixel_r < pixel_r + blk_h) {
+    printf(
+        "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+        "%d blk_h %d\n",
+        frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  }
+#endif
+}
+// Copies a blk_w x blk_h block of samples from |src| into the
+// prediction + transform recording of |plane| at (pixel_c, pixel_r), in the
+// slot at the current write position. When |highbd| is nonzero, |src| is
+// converted with CONVERT_TO_SHORTPTR() and read as 16-bit samples.
+// |frame_offset| is only referenced by the disabled trace code at the end.
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd) {
+  const int exceeds_width = pixel_c + blk_w >= frame_stride;
+  const int exceeds_height = pixel_r + blk_h >= frame_height;
+  if (exceeds_width || exceeds_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *const src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  uint16_t *const plane_buf = frame_tx[frame_buf_idx_w][plane];
+  for (int row = 0; row < blk_h; ++row) {
+    const int dst_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    const int src_row_start = row * src_stride;
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = src_row_start + col;
+      plane_buf[dst_row_start + col] = src16 ? src16[src_idx] : src[src_idx];
+    }
+  }
+#if 0
+  int ref_frame_idx = 3;
+  int ref_frame_offset = 4;
+  int ref_plane = 1;
+  int ref_pixel_c = 162;
+  int ref_pixel_r = 16;
+  if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset &&
+      ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+      ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+    printf(
+        "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+        "%d blk_h %d\n",
+        frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  }
+#endif
+}
+// Compares a blk_w x blk_h block of |src| against the prediction recording of
+// |plane| at (pixel_c, pixel_r), in the slot at the current read position.
+// When |highbd| is nonzero, |src| is converted with CONVERT_TO_SHORTPTR() and
+// read as 16-bit samples. On any difference, both blocks are printed
+// ("enc" = recorded, "dec" = |src|) and an assert fires.
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd) {
+  const int exceeds_width = pixel_c + blk_w >= frame_stride;
+  const int exceeds_height = pixel_r + blk_h >= frame_height;
+  if (exceeds_width || exceeds_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *const src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *const recorded = frame_pre[frame_buf_idx_r][plane];
+  int mismatch = 0;
+  for (int row = 0; row < blk_h; ++row) {
+    const int rec_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = row * src_stride + col;
+      const uint16_t decoded =
+          (uint16_t)(src16 ? src16[src_idx] : src[src_idx]);
+      if (recorded[rec_row_start + col] != decoded) mismatch = 1;
+    }
+  }
+  if (!mismatch) return;
+
+  printf(
+      "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d "
+      "pixel_c %d pixel_r "
+      "%d blk_w %d blk_h %d\n",
+      frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  printf("enc\n");
+  for (int row = 0; row < blk_h; ++row) {
+    const int rec_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    for (int col = 0; col < blk_w; ++col) {
+      printf("%d ", recorded[rec_row_start + col]);
+    }
+    printf("\n");
+  }
+
+  printf("dec\n");
+  for (int row = 0; row < blk_h; ++row) {
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = row * src_stride + col;
+      printf("%d ", src16 ? src16[src_idx] : src[src_idx]);
+    }
+    printf("\n");
+  }
+  assert(0);
+}
+// Compares a blk_w x blk_h block of |src| against the prediction + transform
+// recording of |plane| at (pixel_c, pixel_r), in the slot at the current read
+// position. When |highbd| is nonzero, |src| is converted with
+// CONVERT_TO_SHORTPTR() and read as 16-bit samples. On any difference, both
+// blocks are printed ("enc" = recorded, "dec" = |src|) and an assert fires.
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+                             int frame_offset, int plane, int pixel_c,
+                             int pixel_r, int blk_w, int blk_h, int highbd) {
+  const int exceeds_width = pixel_c + blk_w >= frame_stride;
+  const int exceeds_height = pixel_r + blk_h >= frame_height;
+  if (exceeds_width || exceeds_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  const uint16_t *const src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  const uint16_t *const recorded = frame_tx[frame_buf_idx_r][plane];
+  int mismatch = 0;
+  for (int row = 0; row < blk_h; ++row) {
+    const int rec_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = row * src_stride + col;
+      const uint16_t decoded =
+          (uint16_t)(src16 ? src16[src_idx] : src[src_idx]);
+      if (recorded[rec_row_start + col] != decoded) mismatch = 1;
+    }
+  }
+  if (!mismatch) return;
+
+  printf(
+      "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c "
+      "%d pixel_r "
+      "%d blk_w %d blk_h %d\n",
+      frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+  printf("enc\n");
+  for (int row = 0; row < blk_h; ++row) {
+    const int rec_row_start = (row + pixel_r) * frame_stride + pixel_c;
+    for (int col = 0; col < blk_w; ++col) {
+      printf("%d ", recorded[rec_row_start + col]);
+    }
+    printf("\n");
+  }
+
+  printf("dec\n");
+  for (int row = 0; row < blk_h; ++row) {
+    for (int col = 0; col < blk_w; ++col) {
+      const int src_idx = row * src_stride + col;
+      printf("%d ", src16 ? src16[src_idx] : src[src_idx]);
+    }
+    printf("\n");
+  }
+  assert(0);
+}
+#endif // CONFIG_MISMATCH_DEBUG
diff --git a/third_party/aom/aom_util/debug_util.h b/third_party/aom/aom_util/debug_util.h
new file mode 100644
index 0000000000..23cad2a5b9
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_
+#define AOM_AOM_UTIL_DEBUG_UTIL_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Setters/getters for the frame index tracked on the write side and on the
+// read side of the bitstream queue.
+// NOTE(review): semantics inferred from the names; the definitions are not in
+// this header - confirm against debug_util.c.
+void aom_bitstream_queue_set_frame_write(int frame_idx);
+int aom_bitstream_queue_get_frame_write(void);
+void aom_bitstream_queue_set_frame_read(int frame_idx);
+int aom_bitstream_queue_get_frame_read(void);
+
+#if CONFIG_BITSTREAM_DEBUG
+/* This is a debug tool used to detect bitstream error. On encoder side, it
+ * pushes each bit and probability into a queue before the bit is written into
+ * the Arithmetic coder. On decoder side, whenever a bit is read out from the
+ * Arithmetic coder, it pops out the reference bit and probability from the
+ * queue as well. If the two results do not match, this debug tool will report
+ * an error. This tool can be used to pin down the bitstream error precisely.
+ * By combining gdb's backtrace method, we can detect which module causes the
+ * bitstream error. */
+// NOTE(review): the get/record/reset/set_skip helpers below are declared here
+// only; their exact semantics (presumably queue positions and skip flags for
+// the write and read sides) must be confirmed against the definitions.
+int bitstream_queue_get_write(void);
+int bitstream_queue_get_read(void);
+void bitstream_queue_record_write(void);
+void bitstream_queue_reset_write(void);
+// Decoder side: fetches the reference entry (result, cdf, number of symbols).
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs);
+// Encoder side: stores one entry (result, cdf, number of symbols).
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs);
+void bitstream_queue_set_skip_write(int skip);
+void bitstream_queue_set_skip_read(int skip);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+// Helpers for tracking down encoder/decoder reconstruction mismatches.
+// The check functions compare a block against previously stored samples and
+// assert on the first frame that differs. The record functions share the same
+// parameter list.
+// NOTE(review): the record/move/reset helpers are only declared here;
+// presumably they store blocks and advance/reset the write and read frame
+// indices - confirm against debug_util.c.
+//
+// The two index helpers take no arguments; they are declared with (void) so
+// that C callers get a real prototype instead of an unchecked declaration.
+void mismatch_move_frame_idx_w(void);
+void mismatch_move_frame_idx_r(void);
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+                               int frame_offset, int plane, int pixel_c,
+                               int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+                             int frame_offset, int plane, int pixel_c,
+                             int pixel_r, int blk_w, int blk_h, int highbd);
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_UTIL_DEBUG_UTIL_H_
diff --git a/third_party/aom/aom_util/endian_inl.h b/third_party/aom/aom_util/endian_inl.h
new file mode 100644
index 0000000000..b69102a7f5
--- /dev/null
+++ b/third_party/aom/aom_util/endian_inl.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Endian related functions.
+
+#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_
+#define AOM_AOM_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Compiler version helpers: LOCAL_GCC_VERSION packs major and minor into one
+// integer (major << 8 | minor) and LOCAL_GCC_PREREQ(maj, min) tests whether
+// the compiler is at least that version. Without __GNUC__ both evaluate to 0.
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+// (compilers without __has_builtin report every builtin as unavailable)
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+// Host-to-little-endian / host-to-big-endian conversions. On a big-endian
+// host the little-endian forms byte-swap and the big-endian forms are the
+// identity; on a little-endian host it is the other way round.
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+// Record which byte-swap builtins may be used: either the GCC version is new
+// enough or the compiler reports the builtin through __has_builtin.
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+// Returns |x| with its two bytes exchanged. Uses the compiler builtin or the
+// MSVC intrinsic when available, otherwise plain shifts.
+static INLINE uint16_t BSwap16(uint16_t x) {
+#ifdef HAVE_BUILTIN_BSWAP16
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // Portable fallback: move each byte to the other position; the cast drops
+  // the bits that were shifted above bit 15.
+  return (uint16_t)((x << 8) | (x >> 8));
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+// Returns |x| with its four bytes in reverse order. Preference order: compiler
+// builtin, x86 bswap instruction via inline asm, MSVC intrinsic, portable
+// shift-and-mask code.
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ // The "0" constraint ties the input to the output register, so bswap
+ // operates in place on |x|.
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ // byte3 -> byte0, byte2 -> byte1, byte1 -> byte2, byte0 -> byte3
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+// Returns |x| with its eight bytes in reverse order. Preference order:
+// compiler builtin, x86-64 bswapq via inline asm, MSVC intrinsic, portable
+// code.
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ // Input and output share one register ("0" constraint); swapped in place.
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ // Three passes: swap the 32-bit halves, then the 16-bit quarters within
+ // each half, then the bytes within each 16-bit quarter.
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // AOM_AOM_UTIL_ENDIAN_INL_H_
diff --git a/third_party/aom/aomedia_logo_200.png b/third_party/aom/aomedia_logo_200.png
new file mode 100644
index 0000000000..4a3b9fcc0c
--- /dev/null
+++ b/third_party/aom/aomedia_logo_200.png
Binary files differ
diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c
new file mode 100644
index 0000000000..15734cb6a9
--- /dev/null
+++ b/third_party/aom/apps/aomdec.c
@@ -0,0 +1,1088 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_OS_SUPPORT
+#if HAVE_UNISTD_H
+#include <unistd.h> // NOLINT
+#elif !defined(STDOUT_FILENO)
+#define STDOUT_FILENO 1
+#endif
+#endif
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfdec.h"
+#include "common/md5_utils.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmdec.h"
+#endif
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+// Program name (argv[0]); set while parsing the command line and used in the
+// usage and error messages.
+static const char *exec_name;
+
+// Bundles the per-container input state handed to the frame readers. Which
+// member is used depends on the detected file type.
+struct AvxDecInputContext {
+ struct AvxInputContext *aom_input_ctx;
+ struct ObuDecInputContext *obu_ctx;
+ struct WebmInputContext *webm_ctx;
+};
+
+// Command line option definitions. Each ARG_DEF is given a short name (or
+// NULL), a long name, a 0/1 flag and the help text.
+// NOTE(review): the third argument looks like "takes a value" (it is 1 for
+// every option parsed with arg.val / arg_parse_*) - confirm in common/args.h.
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t looparg =
+ ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 =
+ ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 =
+ ARG_DEF(NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg =
+ ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo =
+ ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg =
+ ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg =
+ ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg =
+ ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg =
+ ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t summaryarg =
+ ARG_DEF(NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile =
+ ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg =
+ ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t rowmtarg =
+ ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading, default: 0");
+static const arg_def_t verbosearg =
+ ARG_DEF("v", "verbose", 0, "Show version string");
+static const arg_def_t scalearg =
+ ARG_DEF("S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg =
+ ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg =
+ ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg =
+ ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+static const arg_def_t framestatsarg =
+ ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
+static const arg_def_t outbitdeptharg =
+ ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+static const arg_def_t isannexb =
+ ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
+static const arg_def_t oppointarg = ARG_DEF(
+ NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
+static const arg_def_t outallarg = ARG_DEF(
+ NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+static const arg_def_t skipfilmgrain =
+ ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
+
+// NULL-terminated table of the options listed by show_help(). looparg is
+// deliberately absent: --loops is still accepted by the parser but treated
+// as a no-op, so it is not advertised.
+static const arg_def_t *all_args[] = {
+ &help, &codecarg, &use_yv12, &use_i420,
+ &flipuvarg, &rawvideo, &noblitarg, &progressarg,
+ &limitarg, &skiparg, &summaryarg, &outputfile,
+ &threadsarg, &rowmtarg, &verbosearg, &scalearg,
+ &fb_arg, &md5arg, &framestatsarg, &continuearg,
+ &outbitdeptharg, &isannexb, &oppointarg, &outallarg,
+ &skipfilmgrain, NULL
+};
+
+#if CONFIG_LIBYUV
+// Scales |src| into |dst| with libyuv using filter |mode|. Both images must
+// have the same format, and only I420 and I42016 are handled.
+// Returns 0 on success and returns -1 on failure.
+static INLINE int libyuv_scale(const aom_image_t *src, aom_image_t *dst,
+                               FilterModeEnum mode) {
+  if (dst->fmt != src->fmt) {
+    fprintf(stderr,
+            "%s failed to scale output frame because format changed from %s to "
+            "%s\n",
+            exec_name, image_format_to_string(dst->fmt),
+            image_format_to_string(src->fmt));
+    return -1;
+  }
+
+  const int is_i420_16 = src->fmt == AOM_IMG_FMT_I42016;
+  const int is_i420_8 = src->fmt == AOM_IMG_FMT_I420;
+  if (!is_i420_16 && !is_i420_8) {
+    fprintf(stderr, "%s cannot scale output frame of format %s\n", exec_name,
+            image_format_to_string(src->fmt));
+    return -1;
+  }
+
+  if (is_i420_16) {
+    // 16-bit path: strides are halved before being passed on together with
+    // the uint16_t plane pointers.
+    return I420Scale_16(
+        (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2,
+        src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y],
+        dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U],
+        dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
+        dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
+  }
+
+  return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+                   src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+                   src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
+                   src->d_w, src->d_h, dst->planes[AOM_PLANE_Y],
+                   dst->stride[AOM_PLANE_Y], dst->planes[AOM_PLANE_U],
+                   dst->stride[AOM_PLANE_U], dst->planes[AOM_PLANE_V],
+                   dst->stride[AOM_PLANE_V], dst->d_w, dst->d_h, mode);
+}
+#endif
+
+// Writes the usage text to |fout|. When |shorthelp| is non-zero only the usage
+// line and a pointer to --help are printed; otherwise the option table, the
+// output file pattern description and the available decoders follow.
+static void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+  if (!shorthelp) {
+    fprintf(fout, "Options:\n");
+    arg_show_usage(fout, all_args);
+    fprintf(fout,
+            "\nOutput File Patterns:\n\n"
+            " The -o argument specifies the name of the file(s) to "
+            "write to. If the\n argument does not include any escape "
+            "characters, the output will be\n written to a single file. "
+            "Otherwise, the filename will be calculated by\n expanding "
+            "the following escape characters:\n");
+    fprintf(fout,
+            "\n\t%%w - Frame width"
+            "\n\t%%h - Frame height"
+            "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+            "\n\n Pattern arguments are only supported in conjunction "
+            "with the --yv12 and\n --i420 options. If the -o option is "
+            "not specified, the output will be\n directed to stdout.\n");
+    fprintf(fout, "\nIncluded decoders:\n\n");
+
+    // One line per decoder: short name followed by the interface name.
+    int idx = 0;
+    while (idx < get_aom_decoder_count()) {
+      aom_codec_iface_t *iface = get_aom_decoder_by_index(idx);
+      fprintf(fout, " %-6s - %s\n", get_short_name_by_aom_decoder(iface),
+              aom_codec_iface_name(iface));
+      ++idx;
+    }
+  } else {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+  }
+}
+
+// Prints the short usage text to stderr and terminates with EXIT_FAILURE.
+void usage_exit(void) {
+  const int short_form = 1;
+  show_help(stderr, short_form);
+  exit(EXIT_FAILURE);
+}
+
+// Reads the next frame of a raw stream: a RAW_FRAME_HDR_SZ-byte header holding
+// the little-endian frame size, followed by that many payload bytes.
+//
+// |*buffer| / |*buffer_size| describe a reusable heap buffer that is grown
+// (to twice the frame size) when the frame does not fit. On success the
+// payload is stored in |*buffer| and its length in |*bytes_read|.
+//
+// Returns 0 when a frame was read and 1 when no frame is available, i.e. on a
+// short payload read or once the end of the input has been reached. The
+// end-of-input case must not report success: the caller treats 0 as "a new
+// frame is in the buffer" and would otherwise keep decoding stale data
+// forever.
+static int raw_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+                          size_t *bytes_read, size_t *buffer_size) {
+  unsigned char raw_hdr[RAW_FRAME_HDR_SZ];
+  size_t frame_size = 0;
+
+  if (read_from_input(input_ctx, RAW_FRAME_HDR_SZ, raw_hdr) !=
+      RAW_FRAME_HDR_SZ) {
+    // A short header read at end of input is the normal way to finish.
+    if (!input_eof(input_ctx))
+      aom_tools_warn("Failed to read RAW frame size\n");
+  } else {
+    const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
+    const size_t kFrameTooSmallThreshold = 256 * 1024;
+    frame_size = mem_get_le32(raw_hdr);
+
+    if (frame_size > kCorruptFrameThreshold) {
+      aom_tools_warn("Read invalid frame size (%u)\n",
+                     (unsigned int)frame_size);
+      frame_size = 0;
+    }
+
+    if (frame_size < kFrameTooSmallThreshold) {
+      aom_tools_warn(
+          "Warning: Read invalid frame size (%u) - not a raw file?\n",
+          (unsigned int)frame_size);
+    }
+
+    if (frame_size > *buffer_size) {
+      // Over-allocate so that slightly larger frames do not force a realloc.
+      uint8_t *new_buf = realloc(*buffer, 2 * frame_size);
+      if (new_buf) {
+        *buffer = new_buf;
+        *buffer_size = 2 * frame_size;
+      } else {
+        aom_tools_warn("Failed to allocate compressed data buffer\n");
+        frame_size = 0;
+      }
+    }
+  }
+
+  // End of input: there is no frame to hand back.
+  if (input_eof(input_ctx)) return 1;
+
+  if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) {
+    aom_tools_warn("Failed to read full frame\n");
+    return 1;
+  }
+  *bytes_read = frame_size;
+  return 0;
+}
+
+// Dispatches to the frame reader matching the detected container type and
+// forwards its return value. Unknown types (including WebM when the tool is
+// built without CONFIG_WEBM_IO) yield 1.
+static int read_frame(struct AvxDecInputContext *input, uint8_t **buf,
+                      size_t *bytes_in_buffer, size_t *buffer_size) {
+  struct AvxInputContext *const ctx = input->aom_input_ctx;
+
+#if CONFIG_WEBM_IO
+  if (ctx->file_type == FILE_TYPE_WEBM) {
+    return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, buffer_size);
+  }
+#endif
+  if (ctx->file_type == FILE_TYPE_RAW) {
+    return raw_read_frame(ctx, buf, bytes_in_buffer, buffer_size);
+  }
+  if (ctx->file_type == FILE_TYPE_IVF) {
+    return ivf_read_frame(ctx, buf, bytes_in_buffer, buffer_size, NULL);
+  }
+  if (ctx->file_type == FILE_TYPE_OBU) {
+    return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer,
+                                     buffer_size);
+  }
+  return 1;
+}
+
+// Probes the first 32 bytes of the input: a 4-byte little-endian frame size
+// followed by data that one of the built-in decoders can peek stream info
+// from. On a match the fourcc and frame dimensions are stored in |input| and
+// the frame rate is set to 30/1. The detect position is rewound before
+// returning in every case. Returns 1 for a raw stream, 0 otherwise.
+static int file_is_raw(struct AvxInputContext *input) {
+  uint8_t probe[32];
+  int found = 0;
+  aom_codec_stream_info_t info;
+  memset(&info, 0, sizeof(info));
+
+  if (buffer_input(input, 32, probe, /*buffered=*/true) == 32 &&
+      mem_get_le32(probe) < 256 * 1024 * 1024) {
+    int idx = 0;
+    while (!found && idx < get_aom_decoder_count()) {
+      aom_codec_iface_t *iface = get_aom_decoder_by_index(idx);
+      // aom_codec_peek_stream_info() returns 0 when the header parses.
+      if (!aom_codec_peek_stream_info(iface, probe + 4, 32 - 4, &info)) {
+        found = 1;
+        input->fourcc = get_fourcc_by_aom_decoder(iface);
+        input->width = info.w;
+        input->height = info.h;
+        input->framerate.numerator = 30;
+        input->framerate.denominator = 1;
+      }
+      ++idx;
+    }
+  }
+
+  rewind_detect(input);
+  return found;
+}
+
+// Prints a one-line progress report to stderr, ending in '\r' so that the
+// next report overwrites it. |dx_time| is the accumulated decode time in
+// microseconds; the frame rate is derived from |frame_out| and |dx_time|.
+static void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+  // |dx_time| is unsigned, so it is printed with PRIu64 (not PRId64).
+  fprintf(stderr,
+          "%d decoded frames/%d showed frames in %" PRIu64 " us (%.2f fps)\r",
+          frame_in, frame_out, dx_time,
+          (double)frame_out * 1000000.0 / (double)dx_time);
+}
+
+// One application-owned frame buffer lent to the decoder.
+struct ExternalFrameBuffer {
+ uint8_t *data; // heap storage, allocated on demand by the get callback
+ size_t size; // size of |data| in bytes
+ int in_use; // 1 while handed out to the decoder, 0 once released
+};
+
+// Pool of external frame buffers; passed to the frame buffer callbacks as
+// their private data.
+struct ExternalFrameBufferList {
+ int num_external_frame_buffers; // number of entries in |ext_fb|
+ struct ExternalFrameBuffer *ext_fb; // array of buffer slots
+};
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the application private data passed into the set function (a
+// struct ExternalFrameBufferList). |min_size| is the minimum size in bytes
+// needed to decode the next frame. |fb| receives the buffer pointer, its size
+// and a private pointer to the slot it came from.
+// Returns 0 on success and -1 when |cb_priv| is NULL, every slot is in use,
+// or the allocation fails.
+static int get_av1_frame_buffer(void *cb_priv, size_t min_size,
+                                aom_codec_frame_buffer_t *fb) {
+  struct ExternalFrameBufferList *const ext_fb_list =
+      (struct ExternalFrameBufferList *)cb_priv;
+  if (ext_fb_list == NULL) return -1;
+
+  // Find a free frame buffer.
+  int i;
+  for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) {
+    if (!ext_fb_list->ext_fb[i].in_use) break;
+  }
+  if (i == ext_fb_list->num_external_frame_buffers) return -1;
+
+  struct ExternalFrameBuffer *const slot = &ext_fb_list->ext_fb[i];
+  if (slot->size < min_size) {
+    // Too small: replace the storage with a zero-initialized larger one.
+    free(slot->data);
+    slot->data = (uint8_t *)calloc(min_size, sizeof(uint8_t));
+    if (!slot->data) {
+      // The old storage is already freed. Reset the recorded size so that a
+      // later, smaller request cannot pass the size check above and hand a
+      // NULL buffer to the decoder.
+      slot->size = 0;
+      return -1;
+    }
+    slot->size = min_size;
+  }
+
+  fb->data = slot->data;
+  fb->size = slot->size;
+  slot->in_use = 1;
+
+  // Set the frame buffer's private data to point at the external frame buffer.
+  fb->priv = slot;
+  return 0;
+}
+
+// Callback invoked by libaom once nothing references the frame buffer any
+// more. |cb_priv| is the user private data passed into the set function
+// (unused here). |fb| is the frame buffer being released; its private pointer
+// identifies the slot, which is marked free again. Always returns 0.
+static int release_av1_frame_buffer(void *cb_priv,
+                                    aom_codec_frame_buffer_t *fb) {
+  (void)cb_priv;
+  ((struct ExternalFrameBuffer *)fb->priv)->in_use = 0;
+  return 0;
+}
+
+// Expands |pattern| into |out| (capacity |q_len| bytes). Text outside escapes
+// is copied verbatim; %w and %h expand to |d_w| and |d_h|, and %1..%9 expand
+// to |frame_in| zero padded to that many digits. Any other escape, or a
+// result that does not fit, ends the program through die().
+static void generate_filename(const char *pattern, char *out, size_t q_len,
+                              unsigned int d_w, unsigned int d_h,
+                              unsigned int frame_in) {
+  const char *src = pattern;
+  char *dst = out;
+
+  do {
+    const char *const percent = strchr(src, '%');
+
+    if (percent != src) {
+      /* literal text: copy up to the next escape or the end of the pattern */
+      const size_t run_len = percent ? (size_t)(percent - src) : strlen(src);
+
+      if (run_len >= q_len - 1) die("Output filename too long.\n");
+
+      memcpy(dst, src, run_len);
+      dst[run_len] = '\0';
+      dst += run_len;
+      src += run_len;
+      q_len -= run_len;
+      continue;
+    }
+
+    /* escape sequence: '%' followed by one selector character */
+    const char selector = src[1];
+    dst[q_len - 1] = '\0';
+    if (selector == 'w') {
+      snprintf(dst, q_len - 1, "%d", d_w);
+    } else if (selector == 'h') {
+      snprintf(dst, q_len - 1, "%d", d_h);
+    } else if (selector >= '1' && selector <= '9') {
+      /* the digit is the zero-padded field width of the frame number */
+      snprintf(dst, q_len - 1, "%0*d", selector - '0', frame_in);
+    } else {
+      die("Unrecognized pattern %%%c\n", selector);
+    }
+
+    const size_t expanded_len = strlen(dst);
+    if (expanded_len >= q_len - 1) die("Output filename too long.\n");
+    dst += expanded_len;
+    src += 2;
+    q_len -= expanded_len;
+  } while (*src);
+}
+
+// Returns 0 when |outfile_pattern| contains a frame number escape (%1..%9),
+// which means every frame gets its own file, and 1 otherwise.
+static int is_single_file(const char *outfile_pattern) {
+  for (const char *percent = strchr(outfile_pattern, '%'); percent != NULL;
+       percent = strchr(percent + 1, '%')) {
+    // pattern contains sequence number, so it's not unique
+    if (percent[1] >= '1' && percent[1] <= '9') return 0;
+  }
+  return 1;
+}
+
+// Prints the 16-byte |digest| as lowercase hex followed by |filename| on one
+// line of stdout.
+static void print_md5(unsigned char digest[16], const char *filename) {
+  const unsigned char *const end = digest + 16;
+
+  for (const unsigned char *byte = digest; byte != end; ++byte) {
+    printf("%02x", *byte);
+  }
+  printf(" %s\n", filename);
+}
+
+// Opens the output destination. The name "-" selects stdout (switched to
+// binary mode); any other name is opened for binary writing, and a failure to
+// open it is reported through fatal().
+static FILE *open_outfile(const char *name) {
+  if (strcmp("-", name) != 0) {
+    FILE *file = fopen(name, "wb");
+    if (!file) fatal("Failed to open output file '%s'", name);
+    return file;
+  }
+
+  set_binary_mode(stdout);
+  return stdout;
+}
+
+static int main_loop(int argc, const char **argv_) {
+ aom_codec_ctx_t decoder;
+ char *fn = NULL;
+ int i;
+ int ret = EXIT_FAILURE;
+ uint8_t *buf = NULL;
+ size_t bytes_in_buffer = 0, buffer_size = 0;
+ FILE *infile;
+ int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
+ int do_md5 = 0, progress = 0;
+ int stop_after = 0, summary = 0, quiet = 1;
+ int arg_skip = 0;
+ int keep_going = 0;
+ uint64_t dx_time = 0;
+ struct arg arg;
+ char **argv, **argi, **argj;
+
+ int single_file;
+ int use_y4m = 1;
+ int opt_yv12 = 0;
+ int opt_i420 = 0;
+ int opt_raw = 0;
+ aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+ unsigned int fixed_output_bit_depth = 0;
+ unsigned int is_annexb = 0;
+ int frames_corrupted = 0;
+ int dec_flags = 0;
+ int do_scale = 0;
+ int operating_point = 0;
+ int output_all_layers = 0;
+ int skip_film_grain = 0;
+ int enable_row_mt = 0;
+ aom_image_t *scaled_img = NULL;
+ aom_image_t *img_shifted = NULL;
+ int frame_avail, got_data, flush_decoder = 0;
+ int num_external_frame_buffers = 0;
+ struct ExternalFrameBufferList ext_fb_list = { 0, NULL };
+
+ const char *outfile_pattern = NULL;
+ char outfile_name[PATH_MAX] = { 0 };
+ FILE *outfile = NULL;
+
+ FILE *framestats_file = NULL;
+
+ MD5Context md5_ctx;
+ unsigned char md5_digest[16];
+
+ struct AvxDecInputContext input = { NULL, NULL, NULL };
+ struct AvxInputContext aom_input_ctx;
+ memset(&aom_input_ctx, 0, sizeof(aom_input_ctx));
+#if CONFIG_WEBM_IO
+ struct WebmInputContext webm_ctx;
+ memset(&webm_ctx, 0, sizeof(webm_ctx));
+ input.webm_ctx = &webm_ctx;
+#endif
+ struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+ int is_ivf = 0;
+
+ obu_ctx.avx_ctx = &aom_input_ctx;
+ input.obu_ctx = &obu_ctx;
+ input.aom_input_ctx = &aom_input_ctx;
+
+ /* Parse command line */
+ exec_name = argv_[0];
+ argv = argv_dup(argc - 1, argv_ + 1);
+ if (!argv) {
+ fprintf(stderr, "Error allocating argument list\n");
+ return EXIT_FAILURE;
+ }
+
+ aom_codec_iface_t *interface = NULL;
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ memset(&arg, 0, sizeof(arg));
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &codecarg, argi)) {
+ interface = get_aom_decoder_by_short_name(arg.val);
+ if (!interface)
+ die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+ } else if (arg_match(&arg, &looparg, argi)) {
+ // no-op
+ } else if (arg_match(&arg, &outputfile, argi)) {
+ outfile_pattern = arg.val;
+ } else if (arg_match(&arg, &use_yv12, argi)) {
+ use_y4m = 0;
+ flipuv = 1;
+ opt_yv12 = 1;
+ opt_i420 = 0;
+ opt_raw = 0;
+ } else if (arg_match(&arg, &use_i420, argi)) {
+ use_y4m = 0;
+ flipuv = 0;
+ opt_yv12 = 0;
+ opt_i420 = 1;
+ opt_raw = 0;
+ } else if (arg_match(&arg, &rawvideo, argi)) {
+ use_y4m = 0;
+ opt_yv12 = 0;
+ opt_i420 = 0;
+ opt_raw = 1;
+ } else if (arg_match(&arg, &flipuvarg, argi)) {
+ flipuv = 1;
+ } else if (arg_match(&arg, &noblitarg, argi)) {
+ noblit = 1;
+ } else if (arg_match(&arg, &progressarg, argi)) {
+ progress = 1;
+ } else if (arg_match(&arg, &limitarg, argi)) {
+ stop_after = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &skiparg, argi)) {
+ arg_skip = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &md5arg, argi)) {
+ do_md5 = 1;
+ } else if (arg_match(&arg, &framestatsarg, argi)) {
+ framestats_file = fopen(arg.val, "w");
+ if (!framestats_file) {
+ die("Error: Could not open --framestats file (%s) for writing.\n",
+ arg.val);
+ }
+ } else if (arg_match(&arg, &summaryarg, argi)) {
+ summary = 1;
+ } else if (arg_match(&arg, &threadsarg, argi)) {
+ cfg.threads = arg_parse_uint(&arg);
+#if !CONFIG_MULTITHREAD
+ if (cfg.threads > 1) {
+ die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = "
+ "0.\n",
+ cfg.threads);
+ }
+#endif
+ } else if (arg_match(&arg, &rowmtarg, argi)) {
+ enable_row_mt = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &verbosearg, argi)) {
+ quiet = 0;
+ } else if (arg_match(&arg, &scalearg, argi)) {
+ do_scale = 1;
+ } else if (arg_match(&arg, &fb_arg, argi)) {
+ num_external_frame_buffers = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &continuearg, argi)) {
+ keep_going = 1;
+ } else if (arg_match(&arg, &outbitdeptharg, argi)) {
+ fixed_output_bit_depth = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &isannexb, argi)) {
+ is_annexb = 1;
+ input.obu_ctx->is_annexb = 1;
+ } else if (arg_match(&arg, &oppointarg, argi)) {
+ operating_point = arg_parse_int(&arg);
+ } else if (arg_match(&arg, &outallarg, argi)) {
+ output_all_layers = 1;
+ } else if (arg_match(&arg, &skipfilmgrain, argi)) {
+ skip_film_grain = 1;
+ } else {
+ argj++;
+ }
+ }
+
+ /* Check for unrecognized options */
+ for (argi = argv; *argi; argi++)
+ if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+ die("Error: Unrecognized option %s\n", *argi);
+
+ /* Handle non-option arguments */
+ fn = argv[0];
+
+ if (!fn) {
+ free(argv);
+ fprintf(stderr, "No input file specified!\n");
+ usage_exit();
+ }
+
+ const bool using_file = strcmp(fn, "-") != 0;
+ /* Open file */
+ infile = using_file ? fopen(fn, "rb") : set_binary_mode(stdin);
+
+ if (!infile) {
+ fatal("Failed to open input file '%s'", using_file ? fn : "stdin");
+ }
+#if CONFIG_OS_SUPPORT
+ /* Make sure we don't dump to the terminal, unless forced to with -o - */
+ if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) {
+ fprintf(stderr,
+ "Not dumping raw video to your terminal. Use '-o -' to "
+ "override.\n");
+ free(argv);
+ return EXIT_FAILURE;
+ }
+#endif
+ input.aom_input_ctx->filename = fn;
+ input.aom_input_ctx->file = infile;
+
+ // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+ // from stdin yet, and file_is_webm is not using the detect buffer when
+ // determining the type. Therefore it should only be checked when using a file
+ // and needs to be checked prior to other types.
+ if (false) {
+#if CONFIG_WEBM_IO
+ } else if (using_file && file_is_webm(input.webm_ctx, input.aom_input_ctx)) {
+ input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
+#endif
+ } else if (file_is_ivf(input.aom_input_ctx)) {
+ input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+ is_ivf = 1;
+ } else if (file_is_obu(&obu_ctx)) {
+ input.aom_input_ctx->file_type = FILE_TYPE_OBU;
+ } else if (file_is_raw(input.aom_input_ctx)) {
+ input.aom_input_ctx->file_type = FILE_TYPE_RAW;
+ } else {
+ fprintf(stderr, "Unrecognized input file type.\n");
+#if CONFIG_WEBM_IO
+ if (!using_file) {
+ fprintf(stderr, "aomdec does not support piped WebM input.\n");
+ }
+#else
+ fprintf(stderr, "aomdec was built without WebM container support.\n");
+#endif
+ free(argv);
+ return EXIT_FAILURE;
+ }
+
+ outfile_pattern = outfile_pattern ? outfile_pattern : "-";
+ single_file = is_single_file(outfile_pattern);
+
+ if (!noblit && single_file) {
+ generate_filename(outfile_pattern, outfile_name, PATH_MAX,
+ aom_input_ctx.width, aom_input_ctx.height, 0);
+ if (do_md5)
+ MD5Init(&md5_ctx);
+ else
+ outfile = open_outfile(outfile_name);
+ }
+
+ if (use_y4m && !noblit) {
+ if (!single_file) {
+ fprintf(stderr,
+ "YUV4MPEG2 not supported with output patterns,"
+ " try --i420 or --yv12 or --rawvideo.\n");
+ return EXIT_FAILURE;
+ }
+
+#if CONFIG_WEBM_IO
+ if (aom_input_ctx.file_type == FILE_TYPE_WEBM) {
+ if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) {
+ fprintf(stderr,
+ "Failed to guess framerate -- error parsing "
+ "webm file?\n");
+ return EXIT_FAILURE;
+ }
+ }
+#endif
+ }
+
+ aom_codec_iface_t *fourcc_interface =
+ get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+
+ if (is_ivf && !fourcc_interface)
+ fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
+
+ if (interface && fourcc_interface && interface != fourcc_interface)
+ aom_tools_warn("Header indicates codec: %s\n",
+ aom_codec_iface_name(fourcc_interface));
+ else
+ interface = fourcc_interface;
+
+ if (!interface) interface = get_aom_decoder_by_index(0);
+
+ dec_flags = 0;
+ if (aom_codec_dec_init(&decoder, interface, &cfg, dec_flags)) {
+ fprintf(stderr, "Failed to initialize decoder: %s\n",
+ aom_codec_error(&decoder));
+ goto fail2;
+ }
+
+ if (!quiet) fprintf(stderr, "%s\n", decoder.name);
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
+ fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT,
+ operating_point)) {
+ fprintf(stderr, "Failed to set operating_point: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS,
+ output_all_layers)) {
+ fprintf(stderr, "Failed to set output_all_layers: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN,
+ skip_film_grain)) {
+ fprintf(stderr, "Failed to set skip_film_grain: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_ROW_MT, enable_row_mt)) {
+ fprintf(stderr, "Failed to set row multithreading mode: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
+ while (arg_skip) {
+ if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
+ arg_skip--;
+ }
+
+ if (num_external_frame_buffers > 0) {
+ ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
+ ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
+ num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+ if (!ext_fb_list.ext_fb) {
+ fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n");
+ goto fail;
+ }
+ if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer,
+ release_av1_frame_buffer,
+ &ext_fb_list)) {
+ fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n");
+
+ /* Decode file */
+ while (frame_avail || got_data) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img;
+ struct aom_usec_timer timer;
+ int corrupted = 0;
+
+ frame_avail = 0;
+ if (!stop_after || frame_in < stop_after) {
+ if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
+ frame_avail = 1;
+ frame_in++;
+
+ aom_usec_timer_start(&timer);
+
+ if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) {
+ const char *detail = aom_codec_error_detail(&decoder);
+ aom_tools_warn("Failed to decode frame %d: %s", frame_in,
+ aom_codec_error(&decoder));
+
+ if (detail) aom_tools_warn("Additional information: %s", detail);
+ if (!keep_going) goto fail;
+ }
+
+ if (framestats_file) {
+ int qp;
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER,
+ &qp)) {
+ aom_tools_warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
+ aom_codec_error(&decoder));
+ if (!keep_going) goto fail;
+ }
+ fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp);
+ }
+
+ aom_usec_timer_mark(&timer);
+ dx_time += aom_usec_timer_elapsed(&timer);
+ } else {
+ flush_decoder = 1;
+ }
+ } else {
+ flush_decoder = 1;
+ }
+
+ aom_usec_timer_start(&timer);
+
+ if (flush_decoder) {
+ // Flush the decoder.
+ if (aom_codec_decode(&decoder, NULL, 0, NULL)) {
+ aom_tools_warn("Failed to flush decoder: %s",
+ aom_codec_error(&decoder));
+ }
+ }
+
+ aom_usec_timer_mark(&timer);
+ dx_time += aom_usec_timer_elapsed(&timer);
+
+ got_data = 0;
+ // TODO(aomedia:3519): Change the prototype of aom_codec_get_frame_fn_t to
+ // facilitate error handling.
+ while ((img = aom_codec_get_frame(&decoder, &iter))) {
+ ++frame_out;
+ got_data = 1;
+
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED,
+ &corrupted)) {
+ aom_tools_warn("Failed AOM_GET_FRAME_CORRUPTED: %s",
+ aom_codec_error(&decoder));
+ if (!keep_going) goto fail;
+ }
+ frames_corrupted += corrupted;
+
+ if (progress) show_progress(frame_in, frame_out, dx_time);
+
+ if (!noblit) {
+ const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+ const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U };
+ const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
+
+ if (do_scale) {
+ if (frame_out == 1) {
+ // If the output frames are to be scaled to a fixed display size
+ // then use the width and height specified in the container. If
+ // either of these is set to 0, use the display size set in the
+ // first frame header. If that is unavailable, use the raw decoded
+ // size of the first decoded frame.
+ int render_width = aom_input_ctx.width;
+ int render_height = aom_input_ctx.height;
+ if (!render_width || !render_height) {
+ int render_size[2];
+ if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE,
+ render_size)) {
+ // As last resort use size of first frame as display size.
+ render_width = img->d_w;
+ render_height = img->d_h;
+ } else {
+ render_width = render_size[0];
+ render_height = render_size[1];
+ }
+ }
+ scaled_img =
+ aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+ if (!scaled_img) {
+ fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n",
+ render_width, render_height);
+ goto fail;
+ }
+ scaled_img->bit_depth = img->bit_depth;
+ scaled_img->monochrome = img->monochrome;
+ scaled_img->csp = img->csp;
+ }
+
+ if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+#if CONFIG_LIBYUV
+ if (libyuv_scale(img, scaled_img, kFilterBox) != 0) goto fail;
+ img = scaled_img;
+#else
+ fprintf(
+ stderr,
+ "Failed to scale output frame: %s.\n"
+ "libyuv is required for scaling but is currently disabled.\n"
+ "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n",
+ aom_codec_error(&decoder));
+ goto fail;
+#endif
+ }
+ }
+ // Default to codec bit depth if output bit depth not set
+ unsigned int output_bit_depth;
+ if (!fixed_output_bit_depth && single_file) {
+ output_bit_depth = img->bit_depth;
+ } else {
+ output_bit_depth = fixed_output_bit_depth;
+ }
+ // Shift up or down if necessary
+ if (output_bit_depth != 0) {
+ if (!aom_shift_img(output_bit_depth, &img, &img_shifted)) {
+ fprintf(stderr, "Error allocating image\n");
+ goto fail;
+ }
+ }
+
+ aom_input_ctx.width = img->d_w;
+ aom_input_ctx.height = img->d_h;
+
+ int num_planes = (opt_raw && img->monochrome) ? 1 : 3;
+ if (single_file) {
+ if (use_y4m) {
+ char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
+ size_t len = 0;
+ if (frame_out == 1) {
+ // Y4M file header
+ len = y4m_write_file_header(
+ y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
+ aom_input_ctx.height, &aom_input_ctx.framerate,
+ img->monochrome, img->csp, img->fmt, img->bit_depth,
+ img->range);
+ if (img->csp == AOM_CSP_COLOCATED) {
+ fprintf(stderr,
+ "Warning: Y4M lacks a colorspace for colocated "
+ "chroma. Using a placeholder.\n");
+ }
+ if (do_md5) {
+ MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+ } else {
+ fputs(y4m_buf, outfile);
+ }
+ }
+
+ // Y4M frame header
+ len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
+ if (do_md5) {
+ MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+ y4m_update_image_md5(img, planes, &md5_ctx);
+ } else {
+ fputs(y4m_buf, outfile);
+ y4m_write_image_file(img, planes, outfile);
+ }
+ } else {
+ if (frame_out == 1) {
+ // Check if --yv12 or --i420 options are consistent with the
+ // bit-stream decoded
+ if (opt_i420) {
+ if (img->fmt != AOM_IMG_FMT_I420 &&
+ img->fmt != AOM_IMG_FMT_I42016) {
+ fprintf(stderr,
+ "Cannot produce i420 output for bit-stream.\n");
+ goto fail;
+ }
+ }
+ if (opt_yv12) {
+ if ((img->fmt != AOM_IMG_FMT_I420 &&
+ img->fmt != AOM_IMG_FMT_YV12) ||
+ img->bit_depth != 8) {
+ fprintf(stderr,
+ "Cannot produce yv12 output for bit-stream.\n");
+ goto fail;
+ }
+ }
+ }
+ if (do_md5) {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
+ }
+ } else {
+ generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
+ img->d_h, frame_in);
+ if (do_md5) {
+ MD5Init(&md5_ctx);
+ if (use_y4m) {
+ y4m_update_image_md5(img, planes, &md5_ctx);
+ } else {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ }
+ MD5Final(md5_digest, &md5_ctx);
+ print_md5(md5_digest, outfile_name);
+ } else {
+ outfile = open_outfile(outfile_name);
+ if (use_y4m) {
+ y4m_write_image_file(img, planes, outfile);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
+ fclose(outfile);
+ }
+ }
+ }
+ }
+ }
+
+ if (summary || progress) {
+ show_progress(frame_in, frame_out, dx_time);
+ fprintf(stderr, "\n");
+ }
+
+ if (frames_corrupted) {
+ fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted);
+ } else {
+ ret = EXIT_SUCCESS;
+ }
+
+fail:
+
+ if (aom_codec_destroy(&decoder)) {
+ fprintf(stderr, "Failed to destroy decoder: %s\n",
+ aom_codec_error(&decoder));
+ }
+
+fail2:
+
+ if (!noblit && single_file) {
+ if (do_md5) {
+ MD5Final(md5_digest, &md5_ctx);
+ print_md5(md5_digest, outfile_name);
+ } else {
+ fclose(outfile);
+ }
+ }
+
+#if CONFIG_WEBM_IO
+ if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM)
+ webm_free(input.webm_ctx);
+#endif
+ if (input.aom_input_ctx->file_type == FILE_TYPE_OBU)
+ obudec_free(input.obu_ctx);
+
+ if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf);
+
+ if (scaled_img) aom_img_free(scaled_img);
+ if (img_shifted) aom_img_free(img_shifted);
+
+ for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
+ free(ext_fb_list.ext_fb[i].data);
+ }
+ free(ext_fb_list.ext_fb);
+
+ fclose(infile);
+ if (framestats_file) fclose(framestats_file);
+
+ free(argv);
+
+ return ret;
+}
+
+/* Program entry point.
+ *
+ * Scans a private copy of the command line for the loop-count option and
+ * then calls main_loop() that many times (once when the option is absent),
+ * stopping early as soon as a run reports a non-zero status.
+ *
+ * Returns the status of the last main_loop() call, or EXIT_FAILURE when the
+ * argument copy cannot be allocated.
+ */
+int main(int argc, const char **argv_) {
+  unsigned int loop_count = 1;
+  unsigned int run;
+  int status = 0;
+  struct arg arg;
+  char **scan;
+  char **args = argv_dup(argc - 1, argv_ + 1);
+
+  if (!args) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
+
+  /* Every visited entry is also stored into args[0]; the scan ends at the
+   * NULL terminator or at the first loop-count option. */
+  scan = args;
+  while ((args[0] = *scan) != NULL) {
+    memset(&arg, 0, sizeof(arg));
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &looparg, scan)) {
+      loop_count = arg_parse_uint(&arg);
+      break;
+    }
+    scan += arg.argv_step;
+  }
+  free(args);
+
+  for (run = 0; run < loop_count && !status; ++run) {
+    status = main_loop(argc, argv_);
+  }
+  return status;
+}
diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c
new file mode 100644
index 0000000000..3c9c136eed
--- /dev/null
+++ b/third_party/aom/apps/aomenc.c
@@ -0,0 +1,2688 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "apps/aomenc.h"
+
+#include "config/aom_config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#endif
+
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomcx.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfenc.h"
+#include "common/tools_common.h"
+#include "common/warnings.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+
+#include "common/y4minput.h"
+#include "examples/encoder_util.h"
+#include "stats/aomstats.h"
+#include "stats/rate_hist.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+/* Swallow warnings about unused results of fread/fwrite.
+ *
+ * Plain pass-through to fread(): same arguments, same return value (the
+ * number of items read).
+ */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+  const size_t items_read = fread(ptr, size, nmemb, stream);
+  return items_read;
+}
+#define fread wrap_fread
+
+/* Plain pass-through to fwrite(): same arguments, same return value (the
+ * number of items written). */
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                          FILE *stream) {
+  const size_t items_written = fwrite(ptr, size, nmemb, stream);
+  return items_written;
+}
+#define fwrite wrap_fwrite
+
+static const char *exec_name;
+
+static AOM_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
+ aom_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) {
+ if (ctx->err) {
+ const char *detail = aom_codec_error_detail(ctx);
+
+ vfprintf(stderr, s, ap);
+ fprintf(stderr, ": %s\n", aom_codec_error(ctx));
+
+ if (detail) fprintf(stderr, " %s\n", detail);
+
+ if (fatal) {
+ aom_codec_destroy(ctx);
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+/* Variadic front end for warn_or_exit_on_errorv() with `fatal` fixed to 1:
+ * if the codec context holds an error, the formatted message is printed and
+ * the process exits. */
+static AOM_TOOLS_FORMAT_PRINTF(2, 3) void ctx_exit_on_error(
+    aom_codec_ctx_t *ctx, const char *s, ...) {
+  va_list args;
+
+  va_start(args, s);
+  warn_or_exit_on_errorv(ctx, /*fatal=*/1, s, args);
+  va_end(args);
+}
+
+/* Variadic front end for warn_or_exit_on_errorv(): forwards the context,
+ * the caller-chosen `fatal` flag and the printf-style message unchanged. */
+static AOM_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error(
+    aom_codec_ctx_t *ctx, int fatal, const char *s, ...) {
+  va_list args;
+
+  va_start(args, s);
+  warn_or_exit_on_errorv(ctx, fatal, s, args);
+  va_end(args);
+}
+
+/* Reads the next input frame into img.
+ *
+ * Y4M inputs go through y4m_input_fetch_frame(); every other input type goes
+ * through read_yuv_frame(). Returns 1 when a frame was obtained and 0
+ * otherwise (Y4M fetch result below 1, or a non-zero read_yuv_frame()
+ * result).
+ */
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    const int fetched =
+        y4m_input_fetch_frame(&input_ctx->y4m, input_ctx->file, img);
+    return fetched >= 1;
+  }
+
+  return !read_yuv_frame(input_ctx, img);
+}
+
+/* Returns 1 when the four bytes in detect are the Y4M magic "YUV4",
+ * 0 otherwise. */
+static int file_is_y4m(const char detect[4]) {
+  return memcmp(detect, "YUV4", 4) == 0;
+}
+
+/* Returns 1 when the four bytes in detect are the IVF signature "DKIF",
+ * 0 otherwise. */
+static int fourcc_is_ivf(const char detect[4]) {
+  return memcmp(detect, "DKIF", 4) == 0;
+}
+
+/* Encoder control IDs, one per entry of av1_ctrl_args, terminated by 0.
+ *
+ * NOTE(review): the two tables are laid out in parallel (same entries, same
+ * #if blocks), which indicates that entry i here is the control ID used for
+ * option av1_ctrl_args[i] - confirm against the code that walks them. On
+ * that basis the order of this table MUST mirror av1_ctrl_args exactly.
+ *
+ * The last seven entries previously disagreed with av1_ctrl_args: the
+ * rate-guide-deltaq / rate-distribution-info controls sat at the very end
+ * here but directly after the partition-info-path entry in the option table,
+ * so the directional-intra, tx-size-search, loopfilter-control and
+ * auto-intra-tools-off options lined up with the wrong control IDs. The tail
+ * below follows the option table's order.
+ */
+static const int av1_arg_ctrl_map[] = {
+  AOME_SET_CPUUSED,
+  AOME_SET_ENABLEAUTOALTREF,
+  AOME_SET_SHARPNESS,
+  AOME_SET_STATIC_THRESHOLD,
+  AV1E_SET_ROW_MT,
+  AV1E_SET_FP_MT,
+  AV1E_SET_TILE_COLUMNS,
+  AV1E_SET_TILE_ROWS,
+  AV1E_SET_ENABLE_TPL_MODEL,
+  AV1E_SET_ENABLE_KEYFRAME_FILTERING,
+  AOME_SET_ARNR_MAXFRAMES,
+  AOME_SET_ARNR_STRENGTH,
+  AOME_SET_TUNING,
+  AOME_SET_CQ_LEVEL,
+  AOME_SET_MAX_INTRA_BITRATE_PCT,
+  AV1E_SET_MAX_INTER_BITRATE_PCT,
+  AV1E_SET_GF_CBR_BOOST_PCT,
+  AV1E_SET_LOSSLESS,
+  AV1E_SET_ENABLE_CDEF,
+  AV1E_SET_ENABLE_RESTORATION,
+  AV1E_SET_ENABLE_RECT_PARTITIONS,
+  AV1E_SET_ENABLE_AB_PARTITIONS,
+  AV1E_SET_ENABLE_1TO4_PARTITIONS,
+  AV1E_SET_MIN_PARTITION_SIZE,
+  AV1E_SET_MAX_PARTITION_SIZE,
+  AV1E_SET_ENABLE_DUAL_FILTER,
+  AV1E_SET_ENABLE_CHROMA_DELTAQ,
+  AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
+  AV1E_SET_ENABLE_ORDER_HINT,
+  AV1E_SET_ENABLE_TX64,
+  AV1E_SET_ENABLE_FLIP_IDTX,
+  AV1E_SET_ENABLE_RECT_TX,
+  AV1E_SET_ENABLE_DIST_WTD_COMP,
+  AV1E_SET_ENABLE_MASKED_COMP,
+  AV1E_SET_ENABLE_ONESIDED_COMP,
+  AV1E_SET_ENABLE_INTERINTRA_COMP,
+  AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+  AV1E_SET_ENABLE_DIFF_WTD_COMP,
+  AV1E_SET_ENABLE_INTERINTER_WEDGE,
+  AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+  AV1E_SET_ENABLE_GLOBAL_MOTION,
+  AV1E_SET_ENABLE_WARPED_MOTION,
+  AV1E_SET_ENABLE_FILTER_INTRA,
+  AV1E_SET_ENABLE_SMOOTH_INTRA,
+  AV1E_SET_ENABLE_PAETH_INTRA,
+  AV1E_SET_ENABLE_CFL_INTRA,
+  AV1E_SET_ENABLE_DIAGONAL_INTRA,
+  AV1E_SET_FORCE_VIDEO_MODE,
+  AV1E_SET_ENABLE_OBMC,
+  AV1E_SET_ENABLE_OVERLAY,
+  AV1E_SET_ENABLE_PALETTE,
+  AV1E_SET_ENABLE_INTRABC,
+  AV1E_SET_ENABLE_ANGLE_DELTA,
+  AV1E_SET_DISABLE_TRELLIS_QUANT,
+  AV1E_SET_ENABLE_QM,
+  AV1E_SET_QM_MIN,
+  AV1E_SET_QM_MAX,
+  AV1E_SET_REDUCED_TX_TYPE_SET,
+  AV1E_SET_INTRA_DCT_ONLY,
+  AV1E_SET_INTER_DCT_ONLY,
+  AV1E_SET_INTRA_DEFAULT_TX_ONLY,
+  AV1E_SET_QUANT_B_ADAPT,
+  AV1E_SET_COEFF_COST_UPD_FREQ,
+  AV1E_SET_MODE_COST_UPD_FREQ,
+  AV1E_SET_MV_COST_UPD_FREQ,
+  AV1E_SET_FRAME_PARALLEL_DECODING,
+  AV1E_SET_ERROR_RESILIENT_MODE,
+  AV1E_SET_AQ_MODE,
+  AV1E_SET_DELTAQ_MODE,
+  AV1E_SET_DELTAQ_STRENGTH,
+  AV1E_SET_DELTALF_MODE,
+  AV1E_SET_FRAME_PERIODIC_BOOST,
+  AV1E_SET_NOISE_SENSITIVITY,
+  AV1E_SET_TUNE_CONTENT,
+  AV1E_SET_CDF_UPDATE_MODE,
+  AV1E_SET_COLOR_PRIMARIES,
+  AV1E_SET_TRANSFER_CHARACTERISTICS,
+  AV1E_SET_MATRIX_COEFFICIENTS,
+  AV1E_SET_CHROMA_SAMPLE_POSITION,
+  AV1E_SET_MIN_GF_INTERVAL,
+  AV1E_SET_MAX_GF_INTERVAL,
+  AV1E_SET_GF_MIN_PYRAMID_HEIGHT,
+  AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
+  AV1E_SET_SUPERBLOCK_SIZE,
+  AV1E_SET_NUM_TG,
+  AV1E_SET_MTU,
+  AV1E_SET_TIMING_INFO_TYPE,
+  AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+  AV1E_SET_FILM_GRAIN_TABLE,
+#if CONFIG_DENOISE
+  AV1E_SET_DENOISE_NOISE_LEVEL,
+  AV1E_SET_DENOISE_BLOCK_SIZE,
+  AV1E_SET_ENABLE_DNL_DENOISING,
+#endif  // CONFIG_DENOISE
+  AV1E_SET_MAX_REFERENCE_FRAMES,
+  AV1E_SET_REDUCED_REFERENCE_SET,
+  AV1E_SET_ENABLE_REF_FRAME_MVS,
+  AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+  AV1E_SET_TIER_MASK,
+  AV1E_SET_MIN_CR,
+  AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP,
+  AV1E_SET_CHROMA_SUBSAMPLING_X,
+  AV1E_SET_CHROMA_SUBSAMPLING_Y,
+#if CONFIG_TUNE_VMAF
+  AV1E_SET_VMAF_MODEL_PATH,
+#endif
+  AV1E_SET_DV_COST_UPD_FREQ,
+  AV1E_SET_PARTITION_INFO_PATH,
+  /* Tail order mirrors av1_ctrl_args: rate-guide options come directly
+   * after the partition-info path. */
+  AV1E_ENABLE_RATE_GUIDE_DELTAQ,
+  AV1E_SET_RATE_DISTRIBUTION_INFO,
+  AV1E_SET_ENABLE_DIRECTIONAL_INTRA,
+  AV1E_SET_ENABLE_TX_SIZE_SEARCH,
+  AV1E_SET_LOOPFILTER_CONTROL,
+  AV1E_SET_AUTO_INTRA_TOOLS_OFF,
+  0
+};
+
+/* General command-line options. show_help() prints these under the
+ * "Options:" heading. The list is NULL-terminated. */
+const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help,
+ &g_av1_codec_arg_defs.use_cfg,
+ &g_av1_codec_arg_defs.debugmode,
+ &g_av1_codec_arg_defs.outputfile,
+ &g_av1_codec_arg_defs.codecarg,
+ &g_av1_codec_arg_defs.passes,
+ &g_av1_codec_arg_defs.pass_arg,
+ &g_av1_codec_arg_defs.fpf_name,
+ &g_av1_codec_arg_defs.limit,
+ &g_av1_codec_arg_defs.skip,
+ &g_av1_codec_arg_defs.good_dl,
+ &g_av1_codec_arg_defs.rt_dl,
+ &g_av1_codec_arg_defs.ai_dl,
+ &g_av1_codec_arg_defs.quietarg,
+ &g_av1_codec_arg_defs.verbosearg,
+ &g_av1_codec_arg_defs.psnrarg,
+ &g_av1_codec_arg_defs.use_webm,
+ &g_av1_codec_arg_defs.use_ivf,
+ &g_av1_codec_arg_defs.use_obu,
+ &g_av1_codec_arg_defs.q_hist_n,
+ &g_av1_codec_arg_defs.rate_hist_n,
+ &g_av1_codec_arg_defs.disable_warnings,
+ &g_av1_codec_arg_defs.disable_warning_prompt,
+ &g_av1_codec_arg_defs.recontest,
+ NULL };
+
+/* Encoder-wide options. show_help() prints these under the
+ * "Encoder Global Options:" heading. The list is NULL-terminated; the
+ * stereo_mode entry exists only when CONFIG_WEBM_IO is enabled. */
+const arg_def_t *global_args[] = {
+ &g_av1_codec_arg_defs.use_nv12,
+ &g_av1_codec_arg_defs.use_yv12,
+ &g_av1_codec_arg_defs.use_i420,
+ &g_av1_codec_arg_defs.use_i422,
+ &g_av1_codec_arg_defs.use_i444,
+ &g_av1_codec_arg_defs.usage,
+ &g_av1_codec_arg_defs.threads,
+ &g_av1_codec_arg_defs.profile,
+ &g_av1_codec_arg_defs.width,
+ &g_av1_codec_arg_defs.height,
+ &g_av1_codec_arg_defs.forced_max_frame_width,
+ &g_av1_codec_arg_defs.forced_max_frame_height,
+#if CONFIG_WEBM_IO
+ &g_av1_codec_arg_defs.stereo_mode,
+#endif
+ &g_av1_codec_arg_defs.timebase,
+ &g_av1_codec_arg_defs.framerate,
+ &g_av1_codec_arg_defs.global_error_resilient,
+ &g_av1_codec_arg_defs.bitdeptharg,
+ &g_av1_codec_arg_defs.inbitdeptharg,
+ &g_av1_codec_arg_defs.lag_in_frames,
+ &g_av1_codec_arg_defs.large_scale_tile,
+ &g_av1_codec_arg_defs.monochrome,
+ &g_av1_codec_arg_defs.full_still_picture_hdr,
+ &g_av1_codec_arg_defs.use_16bit_internal,
+ &g_av1_codec_arg_defs.save_as_annexb,
+ NULL
+};
+
+/* Rate-control options. show_help() prints these under the
+ * "Rate Control Options:" heading. The list is NULL-terminated. */
+const arg_def_t *rc_args[] = { &g_av1_codec_arg_defs.dropframe_thresh,
+ &g_av1_codec_arg_defs.resize_mode,
+ &g_av1_codec_arg_defs.resize_denominator,
+ &g_av1_codec_arg_defs.resize_kf_denominator,
+ &g_av1_codec_arg_defs.superres_mode,
+ &g_av1_codec_arg_defs.superres_denominator,
+ &g_av1_codec_arg_defs.superres_kf_denominator,
+ &g_av1_codec_arg_defs.superres_qthresh,
+ &g_av1_codec_arg_defs.superres_kf_qthresh,
+ &g_av1_codec_arg_defs.end_usage,
+ &g_av1_codec_arg_defs.target_bitrate,
+ &g_av1_codec_arg_defs.min_quantizer,
+ &g_av1_codec_arg_defs.max_quantizer,
+ &g_av1_codec_arg_defs.undershoot_pct,
+ &g_av1_codec_arg_defs.overshoot_pct,
+ &g_av1_codec_arg_defs.buf_sz,
+ &g_av1_codec_arg_defs.buf_initial_sz,
+ &g_av1_codec_arg_defs.buf_optimal_sz,
+ &g_av1_codec_arg_defs.bias_pct,
+ &g_av1_codec_arg_defs.minsection_pct,
+ &g_av1_codec_arg_defs.maxsection_pct,
+ NULL };
+
+/* Keyframe options. show_help() prints these under the
+ * "Keyframe Placement Options:" heading. The list is NULL-terminated. */
+const arg_def_t *kf_args[] = { &g_av1_codec_arg_defs.fwd_kf_enabled,
+ &g_av1_codec_arg_defs.kf_min_dist,
+ &g_av1_codec_arg_defs.kf_max_dist,
+ &g_av1_codec_arg_defs.kf_disabled,
+ &g_av1_codec_arg_defs.sframe_dist,
+ &g_av1_codec_arg_defs.sframe_mode,
+ NULL };
+
+// TODO(bohanli): Currently all options are supported by the key & value API.
+// Consider removing the control ID usages?
+//
+// AV1 options handled through encoder control IDs. show_help() prints these
+// under the "AV1 Specific Options:" heading when CONFIG_AV1_ENCODER is set.
+// The list is NULL-terminated.
+//
+// NOTE(review): this table is laid out in parallel with av1_arg_ctrl_map
+// (same #if blocks in the same places), so the two are presumably consumed
+// by index. Keep the entry order of both tables identical - verify against
+// the code that walks them.
+const arg_def_t *av1_ctrl_args[] = {
+ &g_av1_codec_arg_defs.cpu_used_av1,
+ &g_av1_codec_arg_defs.auto_altref,
+ &g_av1_codec_arg_defs.sharpness,
+ &g_av1_codec_arg_defs.static_thresh,
+ &g_av1_codec_arg_defs.rowmtarg,
+ &g_av1_codec_arg_defs.fpmtarg,
+ &g_av1_codec_arg_defs.tile_cols,
+ &g_av1_codec_arg_defs.tile_rows,
+ &g_av1_codec_arg_defs.enable_tpl_model,
+ &g_av1_codec_arg_defs.enable_keyframe_filtering,
+ &g_av1_codec_arg_defs.arnr_maxframes,
+ &g_av1_codec_arg_defs.arnr_strength,
+ &g_av1_codec_arg_defs.tune_metric,
+ &g_av1_codec_arg_defs.cq_level,
+ &g_av1_codec_arg_defs.max_intra_rate_pct,
+ &g_av1_codec_arg_defs.max_inter_rate_pct,
+ &g_av1_codec_arg_defs.gf_cbr_boost_pct,
+ &g_av1_codec_arg_defs.lossless,
+ &g_av1_codec_arg_defs.enable_cdef,
+ &g_av1_codec_arg_defs.enable_restoration,
+ &g_av1_codec_arg_defs.enable_rect_partitions,
+ &g_av1_codec_arg_defs.enable_ab_partitions,
+ &g_av1_codec_arg_defs.enable_1to4_partitions,
+ &g_av1_codec_arg_defs.min_partition_size,
+ &g_av1_codec_arg_defs.max_partition_size,
+ &g_av1_codec_arg_defs.enable_dual_filter,
+ &g_av1_codec_arg_defs.enable_chroma_deltaq,
+ &g_av1_codec_arg_defs.enable_intra_edge_filter,
+ &g_av1_codec_arg_defs.enable_order_hint,
+ &g_av1_codec_arg_defs.enable_tx64,
+ &g_av1_codec_arg_defs.enable_flip_idtx,
+ &g_av1_codec_arg_defs.enable_rect_tx,
+ &g_av1_codec_arg_defs.enable_dist_wtd_comp,
+ &g_av1_codec_arg_defs.enable_masked_comp,
+ &g_av1_codec_arg_defs.enable_onesided_comp,
+ &g_av1_codec_arg_defs.enable_interintra_comp,
+ &g_av1_codec_arg_defs.enable_smooth_interintra,
+ &g_av1_codec_arg_defs.enable_diff_wtd_comp,
+ &g_av1_codec_arg_defs.enable_interinter_wedge,
+ &g_av1_codec_arg_defs.enable_interintra_wedge,
+ &g_av1_codec_arg_defs.enable_global_motion,
+ &g_av1_codec_arg_defs.enable_warped_motion,
+ &g_av1_codec_arg_defs.enable_filter_intra,
+ &g_av1_codec_arg_defs.enable_smooth_intra,
+ &g_av1_codec_arg_defs.enable_paeth_intra,
+ &g_av1_codec_arg_defs.enable_cfl_intra,
+ &g_av1_codec_arg_defs.enable_diagonal_intra,
+ &g_av1_codec_arg_defs.force_video_mode,
+ &g_av1_codec_arg_defs.enable_obmc,
+ &g_av1_codec_arg_defs.enable_overlay,
+ &g_av1_codec_arg_defs.enable_palette,
+ &g_av1_codec_arg_defs.enable_intrabc,
+ &g_av1_codec_arg_defs.enable_angle_delta,
+ &g_av1_codec_arg_defs.disable_trellis_quant,
+ &g_av1_codec_arg_defs.enable_qm,
+ &g_av1_codec_arg_defs.qm_min,
+ &g_av1_codec_arg_defs.qm_max,
+ &g_av1_codec_arg_defs.reduced_tx_type_set,
+ &g_av1_codec_arg_defs.use_intra_dct_only,
+ &g_av1_codec_arg_defs.use_inter_dct_only,
+ &g_av1_codec_arg_defs.use_intra_default_tx_only,
+ &g_av1_codec_arg_defs.quant_b_adapt,
+ &g_av1_codec_arg_defs.coeff_cost_upd_freq,
+ &g_av1_codec_arg_defs.mode_cost_upd_freq,
+ &g_av1_codec_arg_defs.mv_cost_upd_freq,
+ &g_av1_codec_arg_defs.frame_parallel_decoding,
+ &g_av1_codec_arg_defs.error_resilient_mode,
+ &g_av1_codec_arg_defs.aq_mode,
+ &g_av1_codec_arg_defs.deltaq_mode,
+ &g_av1_codec_arg_defs.deltaq_strength,
+ &g_av1_codec_arg_defs.deltalf_mode,
+ &g_av1_codec_arg_defs.frame_periodic_boost,
+ &g_av1_codec_arg_defs.noise_sens,
+ &g_av1_codec_arg_defs.tune_content,
+ &g_av1_codec_arg_defs.cdf_update_mode,
+ &g_av1_codec_arg_defs.input_color_primaries,
+ &g_av1_codec_arg_defs.input_transfer_characteristics,
+ &g_av1_codec_arg_defs.input_matrix_coefficients,
+ &g_av1_codec_arg_defs.input_chroma_sample_position,
+ &g_av1_codec_arg_defs.min_gf_interval,
+ &g_av1_codec_arg_defs.max_gf_interval,
+ &g_av1_codec_arg_defs.gf_min_pyr_height,
+ &g_av1_codec_arg_defs.gf_max_pyr_height,
+ &g_av1_codec_arg_defs.superblock_size,
+ &g_av1_codec_arg_defs.num_tg,
+ &g_av1_codec_arg_defs.mtu_size,
+ &g_av1_codec_arg_defs.timing_info,
+ &g_av1_codec_arg_defs.film_grain_test,
+ &g_av1_codec_arg_defs.film_grain_table,
+#if CONFIG_DENOISE
+ &g_av1_codec_arg_defs.denoise_noise_level,
+ &g_av1_codec_arg_defs.denoise_block_size,
+ &g_av1_codec_arg_defs.enable_dnl_denoising,
+#endif // CONFIG_DENOISE
+ &g_av1_codec_arg_defs.max_reference_frames,
+ &g_av1_codec_arg_defs.reduced_reference_set,
+ &g_av1_codec_arg_defs.enable_ref_frame_mvs,
+ &g_av1_codec_arg_defs.target_seq_level_idx,
+ &g_av1_codec_arg_defs.set_tier_mask,
+ &g_av1_codec_arg_defs.set_min_cr,
+ &g_av1_codec_arg_defs.vbr_corpus_complexity_lap,
+ &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+ &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+#if CONFIG_TUNE_VMAF
+ &g_av1_codec_arg_defs.vmaf_model_path,
+#endif
+ &g_av1_codec_arg_defs.dv_cost_upd_freq,
+ &g_av1_codec_arg_defs.partition_info_path,
+ &g_av1_codec_arg_defs.enable_rate_guide_deltaq,
+ &g_av1_codec_arg_defs.rate_distribution_info,
+ &g_av1_codec_arg_defs.enable_directional_intra,
+ &g_av1_codec_arg_defs.enable_tx_size_search,
+ &g_av1_codec_arg_defs.loopfilter_control,
+ &g_av1_codec_arg_defs.auto_intra_tools_off,
+ NULL,
+};
+
+/* Additional AV1 options; show_help() prints them right after av1_ctrl_args
+ * in the "AV1 Specific Options:" section. The list is NULL-terminated and
+ * its element count defines ARG_KEY_VAL_CNT_MAX.
+ * NOTE(review): judging by the name these are passed through the key & value
+ * API rather than control IDs - confirm against the option parser. */
+const arg_def_t *av1_key_val_args[] = {
+ &g_av1_codec_arg_defs.passes,
+ &g_av1_codec_arg_defs.two_pass_output,
+ &g_av1_codec_arg_defs.second_pass_log,
+ &g_av1_codec_arg_defs.fwd_kf_dist,
+ &g_av1_codec_arg_defs.strict_level_conformance,
+ &g_av1_codec_arg_defs.sb_qp_sweep,
+ &g_av1_codec_arg_defs.dist_metric,
+ &g_av1_codec_arg_defs.kf_max_pyr_height,
+ NULL,
+};
+
+/* Empty option list: contains only the NULL terminator. */
+static const arg_def_t *no_args[] = { NULL };
+
+/* Writes the usage text to fout.
+ *
+ * fout:      destination stream.
+ * shorthelp: when non-zero only the usage line and a pointer to --help are
+ *            printed; otherwise every option group, the timebase note and
+ *            the list of built-in encoders follow. The last encoder in the
+ *            list is tagged "(default)".
+ */
+static void show_help(FILE *fout, int shorthelp) {
+  int idx;
+  int encoder_total;
+
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename\n",
+          exec_name);
+
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  /* Option groups, in display order. */
+  fprintf(fout, "\nOptions:\n");
+  arg_show_usage(fout, main_args);
+
+  fprintf(fout, "\nEncoder Global Options:\n");
+  arg_show_usage(fout, global_args);
+
+  fprintf(fout, "\nRate Control Options:\n");
+  arg_show_usage(fout, rc_args);
+
+  fprintf(fout, "\nKeyframe Placement Options:\n");
+  arg_show_usage(fout, kf_args);
+
+#if CONFIG_AV1_ENCODER
+  fprintf(fout, "\nAV1 Specific Options:\n");
+  arg_show_usage(fout, av1_ctrl_args);
+  arg_show_usage(fout, av1_key_val_args);
+#endif
+
+  fprintf(fout,
+          "\nStream timebase (--timebase):\n"
+          " The desired precision of timestamps in the output, expressed\n"
+          " in fractional seconds. Default is 1/1000.\n");
+  fprintf(fout, "\nIncluded encoders:\n\n");
+
+  /* One line per available encoder. */
+  encoder_total = get_aom_encoder_count();
+  idx = 0;
+  while (idx < encoder_total) {
+    aom_codec_iface_t *iface = get_aom_encoder_by_index(idx);
+    const char *suffix = "";
+    if (idx == encoder_total - 1) suffix = "(default)";
+
+    fprintf(fout, " %-6s - %s %s\n", get_short_name_by_aom_encoder(iface),
+            aom_codec_iface_name(iface), suffix);
+    ++idx;
+  }
+
+  fprintf(fout, "\n ");
+  fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
+
+/* Prints the short usage text to stderr and terminates the process with
+ * EXIT_FAILURE. Does not return. */
+void usage_exit(void) {
+ show_help(stderr, 1);
+ exit(EXIT_FAILURE);
+}
+
+/* Capacities of the per-stream arg_ctrls / arg_key_vals arrays in
+ * struct stream_config, taken from the element counts of the corresponding
+ * tables (terminator entry included). */
+#if CONFIG_AV1_ENCODER
+#define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map)
+#define ARG_KEY_VAL_CNT_MAX NELEMENTS(av1_key_val_args)
+#endif
+
+/* Stand-in declarations used when WebM I/O is compiled out, so that
+ * struct stream_config / struct stream_state can still declare their
+ * stereo_fmt and webm_ctx members. */
+#if !CONFIG_WEBM_IO
+typedef int stereo_format_t;
+struct WebmOutputContext {
+ int debug;
+};
+#endif
+
+/* Per-stream configuration */
+struct stream_config {
+ /* Encoder configuration for this stream. */
+ struct aom_codec_enc_cfg cfg;
+ /* NOTE(review): presumably the output and first-pass stats file names -
+ * confirm against the option parser. */
+ const char *out_fn;
+ const char *stats_fn;
+ stereo_format_t stereo_fmt;
+ /* Queued control settings; arg_ctrl_cnt is the number of rows in use.
+ * NOTE(review): each row looks like a (control id, value) pair - confirm. */
+ int arg_ctrls[ARG_CTRL_CNT_MAX][2];
+ int arg_ctrl_cnt;
+ /* Queued string settings; arg_key_val_cnt is the number of rows in use.
+ * NOTE(review): each row looks like a (key, value) pair - confirm. */
+ const char *arg_key_vals[ARG_KEY_VAL_CNT_MAX][2];
+ int arg_key_val_cnt;
+ int write_webm;
+ const char *film_grain_filename;
+ int write_ivf;
+ // whether to use 16bit internal buffers
+ int use_16bit_internal;
+#if CONFIG_TUNE_VMAF
+ const char *vmaf_model_path;
+#endif
+ const char *partition_info_path;
+ unsigned int enable_rate_guide_deltaq;
+ const char *rate_distribution_info;
+ aom_color_range_t color_range;
+ /* NOTE(review): the two_pass_* members appear to describe the intermediate
+ * file and frame size of a multi-pass encode - confirm with their users. */
+ const char *two_pass_input;
+ const char *two_pass_output;
+ int two_pass_width;
+ int two_pass_height;
+};
+
+/* Run-time state of one output stream. Streams are chained through `next`;
+ * new_stream() starts a new entry as a copy of the previous one when there
+ * is one. */
+struct stream_state {
+ int index;
+ struct stream_state *next;
+ /* Settings for this stream (see struct stream_config). */
+ struct stream_config config;
+ FILE *file;
+ struct rate_hist *rate_hist;
+ struct WebmOutputContext webm_ctx;
+ /* PSNR accumulators.
+ * NOTE(review): the leading dimension of 2 is not explained in this part of
+ * the file - check the PSNR reporting code for what the two slots mean. */
+ uint64_t psnr_sse_total[2];
+ uint64_t psnr_samples_total[2];
+ double psnr_totals[2][4];
+ int psnr_count[2];
+ int counts[64];
+ aom_codec_ctx_t encoder;
+ unsigned int frames_out;
+ uint64_t cx_time;
+ size_t nbytes;
+ stats_io_t stats;
+ struct aom_image *img;
+ /* NOTE(review): presumably the decoder used for the reconstruction test
+ * (--test-decode), together with mismatch_seen - confirm. */
+ aom_codec_ctx_t decoder;
+ int mismatch_seen;
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+ /* NOTE(review): the orig_* members look like saved copies of the matching
+ * config values - confirm where they are written and restored. */
+ const char *orig_out_fn;
+ unsigned int orig_width;
+ unsigned int orig_height;
+ int orig_write_webm;
+ int orig_write_ivf;
+ char tmp_out_fn[1000];
+};
+
+/* Checks a rational option value and normalises its sign.
+ *
+ * msg: option name used in the error text.
+ * rat: value to check; when the denominator is negative both terms are
+ *      negated in place so the denominator ends up non-negative.
+ *
+ * Calls die() when the (normalised) numerator is negative or when the
+ * denominator is zero.
+ */
+static void validate_positive_rational(const char *msg,
+                                       struct aom_rational *rat) {
+  /* Move any sign from the denominator onto the numerator. */
+  if (rat->den < 0) {
+    rat->num = -rat->num;
+    rat->den = -rat->den;
+  }
+
+  if (rat->num < 0) {
+    die("Error: %s must be positive\n", msg);
+  }
+
+  if (rat->den == 0) {
+    die("Error: %s has zero denominator\n", msg);
+  }
+}
+
+/* Resets *config to the tool's defaults: everything zeroed, then the
+ * partition size limits and the trellis-quant setting are filled in. */
+static void init_config(cfg_options_t *config) {
+  memset(config, 0, sizeof(*config));
+
+  config->disable_trellis_quant = 3;
+  config->min_partition_size = 4;
+  config->max_partition_size = 128;
+  config->super_block_size = 0;  // Dynamic
+}
+
+/* Parses global config arguments into the AvxEncoderConfig. Note that
+ * argv is modified and overwrites all parsed arguments.
+ *
+ * global: output; zeroed first, then filled with defaults (last available
+ * encoder, I420 input, unknown chroma sample position, PSNR off) and
+ * with every recognised global option.
+ * argv: in/out pointer to the NULL-terminated argument vector. Arguments
+ * that are not global options are compacted to the front of the
+ * vector, in order, and the NULL terminator is moved up after them.
+ *
+ * Exits via die() when no encoder is available or when --codec, --passes or
+ * --pass carry an invalid value; --help prints the full help and exits with
+ * EXIT_SUCCESS.
+ */
+static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) {
+ char **argi, **argj;
+ struct arg arg;
+ const int num_encoder = get_aom_encoder_count();
+ char **argv_local = (char **)*argv;
+ if (num_encoder < 1) die("Error: no valid encoder available\n");
+
+ /* Initialize default parameters */
+ memset(global, 0, sizeof(*global));
+ global->codec = get_aom_encoder_by_index(num_encoder - 1);
+ global->passes = 0;
+ global->color_type = I420;
+ global->csp = AOM_CSP_UNKNOWN;
+ global->show_psnr = 0;
+
+ int cfg_included = 0;
+ init_config(&global->encoder_config);
+
+ /* argi walks the input; argj marks where the next kept argument goes.
+ * Each iteration first copies *argi to *argj; argj only advances for
+ * arguments that must survive, so consumed options get overwritten. */
+ for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &g_av1_codec_arg_defs.use_cfg, argi)) {
+ /* Only the first config-file option is honoured. */
+ if (!cfg_included) {
+ parse_cfg(arg.val, &global->encoder_config);
+ cfg_included = 1;
+ }
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.codecarg, argi)) {
+ global->codec = get_aom_encoder_by_short_name(arg.val);
+ if (!global->codec)
+ die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.passes, argi)) {
+ global->passes = arg_parse_uint(&arg);
+
+ if (global->passes < 1 || global->passes > 3)
+ die("Error: Invalid number of passes (%d)\n", global->passes);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.pass_arg, argi)) {
+ global->pass = arg_parse_uint(&arg);
+
+ if (global->pass < 1 || global->pass > 3)
+ die("Error: Invalid pass selected (%d)\n", global->pass);
+ } else if (arg_match(&arg,
+ &g_av1_codec_arg_defs.input_chroma_sample_position,
+ argi)) {
+ global->csp = arg_parse_enum(&arg);
+ /* Flag is used by later code as well, preserve it. */
+ argj++;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.usage, argi)) {
+ global->usage = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.good_dl, argi)) {
+ global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.rt_dl, argi)) {
+ global->usage = AOM_USAGE_REALTIME; // Real-time usage
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.ai_dl, argi)) {
+ global->usage = AOM_USAGE_ALL_INTRA; // All intra usage
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_nv12, argi)) {
+ global->color_type = NV12;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_yv12, argi)) {
+ global->color_type = YV12;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i420, argi)) {
+ global->color_type = I420;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i422, argi)) {
+ global->color_type = I422;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i444, argi)) {
+ global->color_type = I444;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.quietarg, argi)) {
+ global->quiet = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.verbosearg, argi)) {
+ global->verbose = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.limit, argi)) {
+ global->limit = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.skip, argi)) {
+ global->skip_frames = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.psnrarg, argi)) {
+ /* The PSNR option takes an optional value; without one it means 1. */
+ if (arg.val)
+ global->show_psnr = arg_parse_int(&arg);
+ else
+ global->show_psnr = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.recontest, argi)) {
+ global->test_decode = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.framerate, argi)) {
+ global->framerate = arg_parse_rational(&arg);
+ validate_positive_rational(arg.name, &global->framerate);
+ global->have_framerate = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.debugmode, argi)) {
+ global->debug = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.q_hist_n, argi)) {
+ global->show_q_hist_buckets = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_hist_n, argi)) {
+ global->show_rate_hist_buckets = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warnings, argi)) {
+ global->disable_warnings = 1;
+ } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warning_prompt,
+ argi)) {
+ global->disable_warning_prompt = 1;
+ } else {
+ /* Not a global option: keep it for later parsing stages. */
+ argj++;
+ }
+ }
+
+ if (global->pass) {
+ /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
+ if (global->pass > global->passes) {
+ aom_tools_warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
+ global->pass);
+ global->passes = global->pass;
+ }
+ }
+ /* Validate global config */
+ if (global->passes == 0) {
+ /* No pass count was given on the command line: pick a default. */
+#if CONFIG_AV1_ENCODER
+ // Make default AV1 passes = 2 until there is a better quality 1-pass
+ // encoder
+ if (global->codec != NULL)
+ global->passes =
+ (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0 &&
+ global->usage != AOM_USAGE_REALTIME)
+ ? 2
+ : 1;
+#else
+ global->passes = 1;
+#endif
+ }
+
+ /* Realtime and all-intra usage are forced to a single pass; in realtime
+ * mode an explicit --pass above 1 is rejected outright. */
+ if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
+ aom_tools_warn("Enforcing one-pass encoding in realtime mode\n");
+ if (global->pass > 1)
+ die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass);
+ global->passes = 1;
+ }
+
+ if (global->usage == AOM_USAGE_ALL_INTRA && global->passes > 1) {
+ aom_tools_warn("Enforcing one-pass encoding in all intra mode\n");
+ global->passes = 1;
+ }
+}
+
+/* Opens the input named by input->filename ("-" selects stdin in binary
+ * mode) and probes its type from the first four bytes.
+ *
+ * input: context to fill in. On return `file` is open, the pixel aspect
+ *        ratio defaults to 1:1, `detect` holds the probed bytes, and
+ *        `file_type` is FILE_TYPE_Y4M or FILE_TYPE_RAW. For Y4M the size,
+ *        aspect ratio, frame rate, format, bit depth and colour range are
+ *        copied from the parsed header. `length` is set only when the
+ *        stream is seekable.
+ * csp:   chroma sample position handed to the Y4M reader.
+ *
+ * Calls fatal() when the file cannot be opened, when a Y4M header is
+ * rejected, or when the input is an IVF file.
+ */
+static void open_input_file(struct AvxInputContext *input,
+                            aom_chroma_sample_position_t csp) {
+  int have_magic;
+
+  /* Parse certain options from the input file, if possible */
+  if (strcmp(input->filename, "-") == 0) {
+    input->file = set_binary_mode(stdin);
+  } else {
+    input->file = fopen(input->filename, "rb");
+  }
+
+  if (!input->file) fatal("Failed to open input file");
+
+  /* A successful seek to the end means the input is seekable; record its
+   * length so progress can be reported, then go back to the start. */
+  if (fseeko(input->file, 0, SEEK_END) == 0) {
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  /* Default to 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  /* For RAW input sources, these bytes will be applied on the first frame
+   * in read_frame().
+   */
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+
+  have_magic = (input->detect.buf_read == 4);
+
+  if (have_magic && file_is_y4m(input->detect.buf)) {
+    const int y4m_status = y4m_input_open(
+        &input->y4m, input->file, input->detect.buf, 4, csp, input->only_i420);
+    if (y4m_status < 0) {
+      fatal("Unsupported Y4M stream.");
+    } else {
+      input->file_type = FILE_TYPE_Y4M;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
+      input->fmt = input->y4m.aom_fmt;
+      input->bit_depth = input->y4m.bit_depth;
+      input->color_range = input->y4m.color_range;
+    }
+  } else if (have_magic && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+  } else {
+    input->file_type = FILE_TYPE_RAW;
+  }
+}
+
+/* Closes the input stream and, for Y4M input, calls y4m_input_close(). */
+static void close_input_file(struct AvxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type != FILE_TYPE_Y4M) return;
+  y4m_input_close(&input->y4m);
+}
+
+/* Allocates a new stream_state.
+ *
+ * With a predecessor (`prev`), the new stream starts as a byte copy of it,
+ * gets the next index, and is linked as prev->next. Without one, the encoder
+ * configuration is populated from the library defaults for global->codec and
+ * global->usage and then adjusted (timebase denominator, zeroed dimensions,
+ * container defaults).
+ *
+ * In both cases the per-stream output settings are reset so they must be
+ * given again for each stream. Calls fatal() on allocation or config failure.
+ */
+static struct stream_state *new_stream(struct AvxEncoderConfig *global,
+                                       struct stream_state *prev) {
+  struct stream_state *const stream = calloc(1, sizeof(*stream));
+  if (!stream) {
+    fatal("Failed to allocate new stream.");
+  }
+
+  if (!prev) {
+    /* First stream: start from the library's default encoder configuration. */
+    const aom_codec_err_t res = aom_codec_enc_config_default(
+        global->codec, &stream->config.cfg, global->usage);
+    if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res));
+
+    /* Use a timebase fine enough that the encoder always produces strictly
+     * increasing timestamps.
+     */
+    stream->config.cfg.g_timebase.den = 1000;
+
+    /* The library's default resolution is never used; it has to come from
+     * the input file or the command line.
+     */
+    stream->config.cfg.g_w = 0;
+    stream->config.cfg.g_h = 0;
+
+    /* Container defaults: WebM on, IVF off. */
+    stream->config.write_webm = 1;
+    stream->config.write_ivf = 0;
+
+#if CONFIG_WEBM_IO
+    stream->config.stereo_fmt = STEREO_FORMAT_MONO;
+    stream->webm_ctx.last_pts_ns = -1;
+    stream->webm_ctx.writer = NULL;
+    stream->webm_ctx.segment = NULL;
+#endif
+
+    /* Allows removal of the application version from the EBML tags */
+    stream->webm_ctx.debug = global->debug;
+    memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config,
+           sizeof(stream->config.cfg.encoder_cfg));
+  } else {
+    /* Later streams inherit everything from their predecessor. */
+    memcpy(stream, prev, sizeof(*prev));
+    stream->index++;
+    prev->next = stream;
+  }
+
+  /* Output-related settings are per stream and must be specified again. */
+  stream->config.out_fn = NULL;
+  stream->config.two_pass_input = NULL;
+  stream->config.two_pass_output = NULL;
+  stream->config.two_pass_width = 0;
+  stream->config.two_pass_height = 0;
+
+  stream->next = NULL;
+  return stream;
+}
+
+/* Records a codec control (`key`) and its parsed value in config->arg_ctrls.
+ *
+ * - AV1E_SET_FILM_GRAIN_TABLE only stores the filename and returns.
+ * - AV1E_SET_TARGET_SEQ_LEVEL_IDX is always appended, so repeated settings
+ *   accumulate instead of overwriting each other.
+ * - Any other control overwrites its first existing entry, or is appended
+ *   when not present yet.
+ *
+ * The table capacity is enforced at run time with die() rather than with an
+ * assert(), so builds with NDEBUG cannot write past the end of arg_ctrls.
+ */
+static void set_config_arg_ctrls(struct stream_config *config, int key,
+                                 const struct arg *arg) {
+  int j;
+  if (key == AV1E_SET_FILM_GRAIN_TABLE) {
+    config->film_grain_filename = arg->val;
+    return;
+  }
+
+  // For target level, the settings should accumulate rather than overwrite,
+  // so we simply append it.
+  if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) {
+    j = config->arg_ctrl_cnt;
+    if (j >= ARG_CTRL_CNT_MAX) {
+      die("Not enough buffer for the control ID API.");
+    }
+    config->arg_ctrls[j][0] = key;
+    config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+    ++config->arg_ctrl_cnt;
+    return;
+  }
+
+  /* Point either to the next free element or the first instance of this
+   * control.
+   */
+  for (j = 0; j < config->arg_ctrl_cnt; j++)
+    if (config->arg_ctrls[j][0] == key) break;
+
+  /* Update/insert; refuse to write beyond the table. */
+  if (j >= ARG_CTRL_CNT_MAX) {
+    die("Not enough buffer for the control ID API.");
+  }
+  config->arg_ctrls[j][0] = key;
+  config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+
+  if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) {
+    aom_tools_warn(
+        "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+    config->arg_ctrls[j][1] = 1;
+  }
+
+  /* A new entry was inserted rather than an existing one updated. */
+  if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
+}
+
+/* Records an option `name` with the raw string value of `arg` in
+ * config->arg_key_vals.
+ *
+ * "target-seq-level-idx" is always appended so repeated settings accumulate.
+ * Any other option overwrites its first existing entry, or is appended when
+ * not present yet. An auto-alt-ref value greater than 1 is replaced by "1"
+ * with a warning.
+ *
+ * The table capacity is enforced at run time with die() rather than with an
+ * assert(), so builds with NDEBUG cannot write past the end of arg_key_vals.
+ */
+static void set_config_arg_key_vals(struct stream_config *config,
+                                    const char *name, const struct arg *arg) {
+  int j;
+  const char *val = arg->val;
+  // For target level, the settings should accumulate rather than overwrite,
+  // so we simply append it.
+  if (strcmp(name, "target-seq-level-idx") == 0) {
+    j = config->arg_key_val_cnt;
+    if (j >= ARG_KEY_VAL_CNT_MAX) {
+      die("Not enough buffer for the key & value API.");
+    }
+    config->arg_key_vals[j][0] = name;
+    config->arg_key_vals[j][1] = val;
+    ++config->arg_key_val_cnt;
+    return;
+  }
+
+  /* Point either to the next free element or the first instance of this
+   * option.
+   */
+  for (j = 0; j < config->arg_key_val_cnt; j++)
+    if (strcmp(name, config->arg_key_vals[j][0]) == 0) break;
+
+  /* Update/insert; refuse to write beyond the table. */
+  if (j >= ARG_KEY_VAL_CNT_MAX) {
+    die("Not enough buffer for the key & value API.");
+  }
+  config->arg_key_vals[j][0] = name;
+  config->arg_key_vals[j][1] = val;
+
+  if (strcmp(name, g_av1_codec_arg_defs.auto_altref.long_name) == 0) {
+    int auto_altref = arg_parse_int(arg);
+    if (auto_altref > 1) {
+      aom_tools_warn(
+          "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+      config->arg_key_vals[j][1] = "1";
+    }
+  }
+
+  /* A new entry was inserted rather than an existing one updated. */
+  if (j == config->arg_key_val_cnt) config->arg_key_val_cnt++;
+}
+
+/* Parses the per-stream options found in `argv` into stream->config.
+ *
+ * Recognized arguments are consumed; unrecognized ones are kept by compacting
+ * them towards the front of `argv` (argj is the write cursor, argi the read
+ * cursor). Parsing of options stops at the first "--" marker, after which the
+ * remaining arguments are only shifted, not interpreted.
+ *
+ * After the scan, realtime / all-intra constraints are applied and the
+ * "passes" (and, when applicable, "two-pass-output") entries are appended to
+ * the key & value table. Both appends are bounds-checked against
+ * ARG_KEY_VAL_CNT_MAX.
+ *
+ * Returns non-zero if an end-of-stream marker ("--") was seen.
+ */
+static int parse_stream_params(struct AvxEncoderConfig *global,
+                               struct stream_state *stream, char **argv) {
+  char **argi, **argj;
+  struct arg arg;
+  static const arg_def_t **ctrl_args = no_args;
+  static const arg_def_t **key_val_args = no_args;
+  static const int *ctrl_args_map = NULL;
+  struct stream_config *config = &stream->config;
+  int eos_mark_found = 0;
+  int webm_forced = 0;
+
+  // Handle codec specific options
+  if (0) {
+#if CONFIG_AV1_ENCODER
+  } else if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
+    // TODO(jingning): Reuse AV1 specific encoder configuration parameters.
+    // Consider to expand this set for AV1 encoder control.
+#if __STDC_VERSION__ >= 201112L
+    _Static_assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map),
+                   "The av1_ctrl_args and av1_arg_ctrl_map arrays must be of "
+                   "the same size.");
+#else
+    assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map));
+#endif
+    ctrl_args = av1_ctrl_args;
+    ctrl_args_map = av1_arg_ctrl_map;
+    key_val_args = av1_key_val_args;
+#endif
+  }
+
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    /* Once we've found an end-of-stream marker (--) we want to continue
+     * shifting arguments but not consuming them.
+     */
+    if (eos_mark_found) {
+      argj++;
+      continue;
+    } else if (!strcmp(*argj, "--")) {
+      eos_mark_found = 1;
+      continue;
+    }
+
+    if (arg_match(&arg, &g_av1_codec_arg_defs.outputfile, argi)) {
+      config->out_fn = arg.val;
+      /* Pick the container from the file extension unless --webm forced it. */
+      if (!webm_forced) {
+        const size_t out_fn_len = strlen(config->out_fn);
+        if (out_fn_len >= 4 &&
+            !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) {
+          config->write_webm = 0;
+          config->write_ivf = 1;
+        } else if (out_fn_len >= 4 &&
+                   !strcmp(config->out_fn + out_fn_len - 4, ".obu")) {
+          config->write_webm = 0;
+          config->write_ivf = 0;
+        }
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fpf_name, argi)) {
+      config->stats_fn = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_webm, argi)) {
+#if CONFIG_WEBM_IO
+      config->write_webm = 1;
+      webm_forced = 1;
+#else
+      die("Error: --webm specified but webm is disabled.");
+#endif
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_ivf, argi)) {
+      config->write_webm = 0;
+      config->write_ivf = 1;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_obu, argi)) {
+      config->write_webm = 0;
+      config->write_ivf = 0;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.threads, argi)) {
+      config->cfg.g_threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.profile, argi)) {
+      config->cfg.g_profile = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.width, argi)) {
+      config->cfg.g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.height, argi)) {
+      config->cfg.g_h = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_width,
+                         argi)) {
+      config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_height,
+                         argi)) {
+      config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.bitdeptharg, argi)) {
+      config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.inbitdeptharg, argi)) {
+      config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+                         argi)) {
+      stream->chroma_subsampling_x = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+                         argi)) {
+      stream->chroma_subsampling_y = arg_parse_uint(&arg);
+#if CONFIG_WEBM_IO
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.stereo_mode, argi)) {
+      config->stereo_fmt = arg_parse_enum_or_int(&arg);
+#endif
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.timebase, argi)) {
+      config->cfg.g_timebase = arg_parse_rational(&arg);
+      validate_positive_rational(arg.name, &config->cfg.g_timebase);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.global_error_resilient,
+                         argi)) {
+      config->cfg.g_error_resilient = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.lag_in_frames, argi)) {
+      config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.large_scale_tile, argi)) {
+      config->cfg.large_scale_tile = arg_parse_uint(&arg);
+      if (config->cfg.large_scale_tile) {
+        global->codec = get_aom_encoder_by_short_name("av1");
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.monochrome, argi)) {
+      config->cfg.monochrome = 1;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.full_still_picture_hdr,
+                         argi)) {
+      config->cfg.full_still_picture_hdr = 1;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_16bit_internal,
+                         argi)) {
+      config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH;
+      if (!config->use_16bit_internal) {
+        aom_tools_warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n",
+                       arg.name);
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) {
+      config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_mode, argi)) {
+      config->cfg.rc_resize_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_denominator,
+                         argi)) {
+      config->cfg.rc_resize_denominator = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_kf_denominator,
+                         argi)) {
+      config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_mode, argi)) {
+      config->cfg.rc_superres_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_denominator,
+                         argi)) {
+      config->cfg.rc_superres_denominator = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_denominator,
+                         argi)) {
+      config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_qthresh, argi)) {
+      config->cfg.rc_superres_qthresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_qthresh,
+                         argi)) {
+      config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.end_usage, argi)) {
+      config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.target_bitrate, argi)) {
+      config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.min_quantizer, argi)) {
+      config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.max_quantizer, argi)) {
+      config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.undershoot_pct, argi)) {
+      config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.overshoot_pct, argi)) {
+      config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_sz, argi)) {
+      config->cfg.rc_buf_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_initial_sz, argi)) {
+      config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_optimal_sz, argi)) {
+      config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.bias_pct, argi)) {
+      config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
+      if (global->passes < 2)
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.minsection_pct, argi)) {
+      config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
+
+      if (global->passes < 2)
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.maxsection_pct, argi)) {
+      config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
+
+      if (global->passes < 2)
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fwd_kf_enabled, argi)) {
+      config->cfg.fwd_kf_enabled = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_min_dist, argi)) {
+      config->cfg.kf_min_dist = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_max_dist, argi)) {
+      config->cfg.kf_max_dist = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_disabled, argi)) {
+      config->cfg.kf_mode = AOM_KF_DISABLED;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_dist, argi)) {
+      config->cfg.sframe_dist = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_mode, argi)) {
+      config->cfg.sframe_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.save_as_annexb, argi)) {
+      config->cfg.save_as_annexb = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_width, argi)) {
+      config->cfg.tile_width_count =
+          arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_height, argi)) {
+      config->cfg.tile_height_count =
+          arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS);
+#if CONFIG_TUNE_VMAF
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) {
+      config->vmaf_model_path = arg.val;
+#endif
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                         argi)) {
+      config->partition_info_path = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.enable_rate_guide_deltaq,
+                         argi)) {
+      config->enable_rate_guide_deltaq = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_distribution_info,
+                         argi)) {
+      config->rate_distribution_info = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets,
+                         argi)) {
+      config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.fixed_qp_offsets, argi)) {
+      config->cfg.use_fixed_qp_offsets = 1;
+    } else if (global->usage == AOM_USAGE_REALTIME &&
+               arg_match(&arg, &g_av1_codec_arg_defs.enable_restoration,
+                         argi)) {
+      if (arg_parse_uint(&arg) == 1) {
+        aom_tools_warn("non-zero %s option ignored in realtime mode.\n",
+                       arg.name);
+      }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_input, argi)) {
+      config->two_pass_input = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_output, argi)) {
+      config->two_pass_output = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_width, argi)) {
+      config->two_pass_width = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_height, argi)) {
+      config->two_pass_height = arg_parse_int(&arg);
+    } else {
+      int i, match = 0;
+      // check if the control ID API supports this arg
+      if (ctrl_args_map) {
+        for (i = 0; ctrl_args[i]; i++) {
+          if (arg_match(&arg, ctrl_args[i], argi)) {
+            match = 1;
+            set_config_arg_ctrls(config, ctrl_args_map[i], &arg);
+            break;
+          }
+        }
+      }
+      if (!match) {
+        // check if the key & value API supports this arg
+        for (i = 0; key_val_args[i]; i++) {
+          if (arg_match(&arg, key_val_args[i], argi)) {
+            match = 1;
+            set_config_arg_key_vals(config, key_val_args[i]->long_name, &arg);
+            break;
+          }
+        }
+      }
+      /* Not a stream option: keep it in argv for a later parsing stage. */
+      if (!match) argj++;
+    }
+  }
+  config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8;
+
+  if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) {
+    aom_tools_warn("non-zero lag-in-frames option ignored in realtime mode.\n");
+    config->cfg.g_lag_in_frames = 0;
+  }
+
+  if (global->usage == AOM_USAGE_ALL_INTRA) {
+    if (config->cfg.g_lag_in_frames != 0) {
+      aom_tools_warn(
+          "non-zero lag-in-frames option ignored in all intra mode.\n");
+      config->cfg.g_lag_in_frames = 0;
+    }
+    if (config->cfg.kf_max_dist != 0) {
+      aom_tools_warn(
+          "non-zero max key frame distance option ignored in all intra "
+          "mode.\n");
+      config->cfg.kf_max_dist = 0;
+    }
+  }
+
+  // set the passes field using key & val API
+  if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) {
+    die("Not enough buffer for the key & value API.");
+  }
+  config->arg_key_vals[config->arg_key_val_cnt][0] = "passes";
+  switch (global->passes) {
+    case 0: config->arg_key_vals[config->arg_key_val_cnt][1] = "0"; break;
+    case 1: config->arg_key_vals[config->arg_key_val_cnt][1] = "1"; break;
+    case 2: config->arg_key_vals[config->arg_key_val_cnt][1] = "2"; break;
+    case 3: config->arg_key_vals[config->arg_key_val_cnt][1] = "3"; break;
+    default: die("Invalid value of --passes.");
+  }
+  config->arg_key_val_cnt++;
+
+  // set the two_pass_output field
+  if (!config->two_pass_output && global->passes == 3) {
+    // If not specified, set the name of two_pass_output file here.
+    // NOTE(review): out_fn may still be NULL here if no output file was given
+    // for this stream; confirm the caller validates that before this name is
+    // used.
+    snprintf(stream->tmp_out_fn, sizeof(stream->tmp_out_fn),
+             "%.980s_pass2_%d.ivf", stream->config.out_fn, stream->index);
+    stream->config.two_pass_output = stream->tmp_out_fn;
+  }
+  if (config->two_pass_output) {
+    /* The "passes" entry above consumed one slot, so re-check the capacity
+     * before appending a second entry.
+     */
+    if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) {
+      die("Not enough buffer for the key & value API.");
+    }
+    config->arg_key_vals[config->arg_key_val_cnt][0] = "two-pass-output";
+    config->arg_key_vals[config->arg_key_val_cnt][1] = config->two_pass_output;
+    config->arg_key_val_cnt++;
+  }
+
+  return eos_mark_found;
+}
+
+/* Expands to a for-loop header that declares `iterator` as a
+ * struct stream_state pointer and walks the singly linked list starting at
+ * `list`, following ->next until it is NULL.
+ */
+#define FOREACH_STREAM(iterator, list) \
+ for (struct stream_state *iterator = list; iterator; \
+ iterator = iterator->next)
+
+/* Sanity-checks `stream` and every stream after it in the list.
+ *
+ * - `stream` must have non-zero dimensions.
+ * - Every stream from `stream` onwards must have an output file.
+ * - No later stream may share `stream`'s output file (except "/dev/null" and
+ *   ":nul") or its stats file.
+ *
+ * Violations are reported through fatal(). `global` is unused.
+ */
+static void validate_stream_config(const struct stream_state *stream,
+                                   const struct AvxEncoderConfig *global) {
+  (void)global;
+
+  if (stream->config.cfg.g_w == 0 || stream->config.cfg.g_h == 0)
+    fatal(
+        "Stream %d: Specify stream dimensions with --width (-w) "
+        " and --height (-h)",
+        stream->index);
+
+  /* Even if bit depth is set on the command line flag to be lower,
+   * it is upgraded to at least match the input bit depth.
+   */
+  assert(stream->config.cfg.g_input_bit_depth <=
+         (unsigned int)stream->config.cfg.g_bit_depth);
+
+  for (const struct stream_state *other = stream; other != NULL;
+       other = other->next) {
+    /* All streams require output files */
+    if (other->config.out_fn == NULL)
+      fatal("Stream %d: Output file is required (specify with -o)",
+            other->index);
+
+    /* The remaining checks compare `stream` against the *other* streams. */
+    if (other == stream) continue;
+
+    /* Two streams must not write to the same file. */
+    const char *const out_a = stream->config.out_fn;
+    const char *const out_b = other->config.out_fn;
+    if (strcmp(out_a, out_b) == 0 && strcmp(out_a, "/dev/null") != 0 &&
+        strcmp(out_a, ":nul") != 0)
+      fatal("Stream %d: duplicate output file (from stream %d)", other->index,
+            stream->index);
+
+    /* Two streams must not share a stats file. */
+    const char *const stats_a = stream->config.stats_fn;
+    const char *const stats_b = other->config.stats_fn;
+    if (stats_a != NULL && stats_b != NULL && strcmp(stats_a, stats_b) == 0)
+      fatal("Stream %d: duplicate stats file (from stream %d)", other->index,
+            stream->index);
+  }
+}
+
+/* Fills in whichever of the stream's width/height is still zero, using the
+ * source dimensions `w` x `h`.
+ *
+ * - Neither set: the width becomes `w`, then the height is derived from it.
+ * - Only one set: the other is scaled to keep the source aspect ratio.
+ *
+ * The scaling is done with a 64-bit intermediate so the product cannot wrap,
+ * and a derivation whose divisor (`w` or `h`) is zero is skipped, leaving the
+ * dimension at zero instead of dividing by zero.
+ */
+static void set_stream_dimensions(struct stream_state *stream, unsigned int w,
+                                  unsigned int h) {
+  if (!stream->config.cfg.g_w) {
+    if (!stream->config.cfg.g_h) {
+      stream->config.cfg.g_w = w;
+    } else if (h != 0) {
+      stream->config.cfg.g_w =
+          (unsigned int)((uint64_t)w * stream->config.cfg.g_h / h);
+    }
+  }
+  if (!stream->config.cfg.g_h && w != 0) {
+    stream->config.cfg.g_h =
+        (unsigned int)((uint64_t)h * stream->config.cfg.g_w / w);
+  }
+}
+
+/* Returns a short display name for an input file type: "RAW", "Y4M", or
+ * "Other" for anything else.
+ */
+static const char *file_type_to_string(enum VideoFileType t) {
+  if (t == FILE_TYPE_RAW) return "RAW";
+  if (t == FILE_TYPE_Y4M) return "Y4M";
+  return "Other";
+}
+
+/* Prints a human-readable summary of one stream's configuration to stderr.
+ *
+ * The codec name and source-file line are printed only for the stream with
+ * index 0. The stream index is printed when the stream has a successor or a
+ * non-zero index. The encoder_cfg fields are listed only when
+ * global->encoder_config.init_by_cfg_file is set.
+ */
+static void show_stream_config(struct stream_state *stream,
+ struct AvxEncoderConfig *global,
+ struct AvxInputContext *input) {
+/* Prints one field of stream->config.cfg as "name = value" using %d.
+ * Note: this macro is not #undef'd in the visible code, so it stays defined
+ * after this function.
+ */
+#define SHOW(field) \
+ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field)
+
+ if (stream->index == 0) {
+ fprintf(stderr, "Codec: %s\n", aom_codec_iface_name(global->codec));
+ fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
+ input->filename, file_type_to_string(input->file_type),
+ image_format_to_string(input->fmt));
+ }
+ if (stream->next || stream->index)
+ fprintf(stderr, "\nStream Index: %d\n", stream->index);
+ fprintf(stderr, "Destination file: %s\n", stream->config.out_fn);
+ fprintf(stderr, "Coding path: %s\n",
+ stream->config.use_16bit_internal ? "HBD" : "LBD");
+ fprintf(stderr, "Encoder parameters:\n");
+
+ SHOW(g_usage);
+ SHOW(g_threads);
+ SHOW(g_profile);
+ SHOW(g_w);
+ SHOW(g_h);
+ SHOW(g_bit_depth);
+ SHOW(g_input_bit_depth);
+ SHOW(g_timebase.num);
+ SHOW(g_timebase.den);
+ SHOW(g_error_resilient);
+ SHOW(g_pass);
+ SHOW(g_lag_in_frames);
+ SHOW(large_scale_tile);
+ SHOW(rc_dropframe_thresh);
+ SHOW(rc_resize_mode);
+ SHOW(rc_resize_denominator);
+ SHOW(rc_resize_kf_denominator);
+ SHOW(rc_superres_mode);
+ SHOW(rc_superres_denominator);
+ SHOW(rc_superres_kf_denominator);
+ SHOW(rc_superres_qthresh);
+ SHOW(rc_superres_kf_qthresh);
+ SHOW(rc_end_usage);
+ SHOW(rc_target_bitrate);
+ SHOW(rc_min_quantizer);
+ SHOW(rc_max_quantizer);
+ SHOW(rc_undershoot_pct);
+ SHOW(rc_overshoot_pct);
+ SHOW(rc_buf_sz);
+ SHOW(rc_buf_initial_sz);
+ SHOW(rc_buf_optimal_sz);
+ SHOW(rc_2pass_vbr_bias_pct);
+ SHOW(rc_2pass_vbr_minsection_pct);
+ SHOW(rc_2pass_vbr_maxsection_pct);
+ SHOW(fwd_kf_enabled);
+ SHOW(kf_mode);
+ SHOW(kf_min_dist);
+ SHOW(kf_max_dist);
+
+/* Same as SHOW, but for fields of stream->config.cfg.encoder_cfg. */
+#define SHOW_PARAMS(field) \
+ fprintf(stderr, " %-28s = %d\n", #field, \
+ stream->config.cfg.encoder_cfg.field)
+ if (global->encoder_config.init_by_cfg_file) {
+ SHOW_PARAMS(super_block_size);
+ SHOW_PARAMS(max_partition_size);
+ SHOW_PARAMS(min_partition_size);
+ SHOW_PARAMS(disable_ab_partition_type);
+ SHOW_PARAMS(disable_rect_partition_type);
+ SHOW_PARAMS(disable_1to4_partition_type);
+ SHOW_PARAMS(disable_flip_idtx);
+ SHOW_PARAMS(disable_cdef);
+ SHOW_PARAMS(disable_lr);
+ SHOW_PARAMS(disable_obmc);
+ SHOW_PARAMS(disable_warp_motion);
+ SHOW_PARAMS(disable_global_motion);
+ SHOW_PARAMS(disable_dist_wtd_comp);
+ SHOW_PARAMS(disable_diff_wtd_comp);
+ SHOW_PARAMS(disable_inter_intra_comp);
+ SHOW_PARAMS(disable_masked_comp);
+ SHOW_PARAMS(disable_one_sided_comp);
+ SHOW_PARAMS(disable_palette);
+ SHOW_PARAMS(disable_intrabc);
+ SHOW_PARAMS(disable_cfl);
+ SHOW_PARAMS(disable_smooth_intra);
+ SHOW_PARAMS(disable_filter_intra);
+ SHOW_PARAMS(disable_dual_filter);
+ SHOW_PARAMS(disable_intra_angle_delta);
+ SHOW_PARAMS(disable_intra_edge_filter);
+ SHOW_PARAMS(disable_tx_64x64);
+ SHOW_PARAMS(disable_smooth_inter_intra);
+ SHOW_PARAMS(disable_inter_inter_wedge);
+ SHOW_PARAMS(disable_inter_intra_wedge);
+ SHOW_PARAMS(disable_paeth_intra);
+ SHOW_PARAMS(disable_trellis_quant);
+ SHOW_PARAMS(disable_ref_frame_mv);
+ SHOW_PARAMS(reduced_reference_set);
+ SHOW_PARAMS(reduced_tx_type_set);
+ }
+}
+
+/* Opens the stream's output ("-" selects stdout in binary mode) and writes
+ * the container header: a WebM header when write_webm is set (only in builds
+ * with CONFIG_WEBM_IO), otherwise an IVF header when write_ivf is set.
+ * Does nothing during the first pass. Calls fatal() when the file cannot be
+ * opened, when WebM output targets a non-seekable stream, or when the WebM
+ * writer fails to initialize.
+ */
+static void open_output_file(struct stream_state *stream,
+                             struct AvxEncoderConfig *global,
+                             const struct AvxRational *pixel_aspect_ratio,
+                             const char *encoder_settings) {
+  const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
+  const char *const path = stream->config.out_fn;
+
+  /* The first pass produces no output file. */
+  if (cfg->g_pass == AOM_RC_FIRST_PASS) return;
+
+  if (strcmp(path, "-") == 0) {
+    stream->file = set_binary_mode(stdout);
+  } else {
+    stream->file = fopen(path, "wb");
+  }
+
+  if (stream->file == NULL) fatal("Failed to open output file");
+
+  /* WebM output requires a seekable destination. */
+  if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR) != 0)
+    fatal("WebM output to pipes not supported.");
+
+#if CONFIG_WEBM_IO
+  if (stream->config.write_webm) {
+    stream->webm_ctx.stream = stream->file;
+    const int webm_status = write_webm_file_header(
+        &stream->webm_ctx, &stream->encoder, cfg, stream->config.stereo_fmt,
+        get_fourcc_by_aom_encoder(global->codec), pixel_aspect_ratio,
+        encoder_settings);
+    if (webm_status != 0) {
+      fatal("WebM writer initialization failed.");
+    }
+  }
+#else
+  (void)pixel_aspect_ratio;
+  (void)encoder_settings;
+#endif
+
+  if (!stream->config.write_webm && stream->config.write_ivf) {
+    /* The frame count is written as 0 here. */
+    ivf_write_file_header(stream->file, cfg,
+                          get_fourcc_by_aom_encoder(global->codec), 0);
+  }
+}
+
+/* Finalizes and closes the stream's output file. Does nothing during the
+ * first pass. For WebM (CONFIG_WEBM_IO builds) the footer is written; for IVF
+ * the header is rewritten at the start of the file with stream->frames_out,
+ * provided the seek back to the start succeeds.
+ */
+static void close_output_file(struct stream_state *stream,
+                              unsigned int fourcc) {
+  const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+  if (cfg->g_pass == AOM_RC_FIRST_PASS) return;
+
+#if CONFIG_WEBM_IO
+  if (stream->config.write_webm &&
+      write_webm_file_footer(&stream->webm_ctx) != 0) {
+    fatal("WebM writer finalization failed.");
+  }
+#endif
+
+  if (!stream->config.write_webm && stream->config.write_ivf &&
+      fseek(stream->file, 0, SEEK_SET) == 0) {
+    ivf_write_file_header(stream->file, cfg, fourcc, stream->frames_out);
+  }
+
+  fclose(stream->file);
+}
+
+/* Prepares `stream` for encoding pass number `pass` (0-based).
+ *
+ * Opens the statistics store (file-backed when stats_fn is set, otherwise in
+ * memory), selects the rate-control pass mode, feeds the stats into
+ * rc_twopass_stats_in for any pass after the first, and resets the per-pass
+ * counters. Calls fatal() when the stats store cannot be opened or `pass` is
+ * out of range for multi-pass encoding.
+ */
+static void setup_pass(struct stream_state *stream,
+                       struct AvxEncoderConfig *global, int pass) {
+  const int stats_opened =
+      stream->config.stats_fn
+          ? stats_open_file(&stream->stats, stream->config.stats_fn, pass)
+          : stats_open_mem(&stream->stats, pass);
+  if (!stats_opened) fatal("Failed to open statistics store");
+
+  if (global->passes == 1) {
+    stream->config.cfg.g_pass = AOM_RC_ONE_PASS;
+  } else if (pass == 0) {
+    stream->config.cfg.g_pass = AOM_RC_FIRST_PASS;
+  } else if (pass == 1) {
+    stream->config.cfg.g_pass = AOM_RC_SECOND_PASS;
+  } else if (pass == 2) {
+    stream->config.cfg.g_pass = AOM_RC_THIRD_PASS;
+  } else {
+    fatal("Failed to set pass");
+  }
+
+  /* Every pass after the first is given the stats gathered so far. */
+  if (pass != 0) {
+    stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
+  }
+
+  stream->cx_time = 0;
+  stream->nbytes = 0;
+  stream->frames_out = 0;
+}
+
+/* Creates the encoder context for `stream` and applies its settings.
+ *
+ * Order of operations: initialize the encoder with the PSNR / high-bit-depth
+ * flags, replay the recorded control-ID settings, replay the recorded
+ * key & value options, then apply the path-style and color-range controls.
+ * When test decoding is enabled (CONFIG_AV1_DECODER builds) a matching
+ * decoder is initialized as well. Each step is followed by
+ * ctx_exit_on_error().
+ */
+static void initialize_encoder(struct stream_state *stream,
+                               struct AvxEncoderConfig *global) {
+  const struct stream_config *const config = &stream->config;
+  int flags = 0;
+
+  if (global->show_psnr >= 1) flags |= AOM_CODEC_USE_PSNR;
+  if (config->use_16bit_internal) flags |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+  /* Construct Encoder Context */
+  aom_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg,
+                     flags);
+  ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
+
+  /* Replay the options recorded through the control ID API. */
+  for (int idx = 0; idx < config->arg_ctrl_cnt; ++idx) {
+    int ctrl = config->arg_ctrls[idx][0];
+    int value = config->arg_ctrls[idx][1];
+    if (aom_codec_control(&stream->encoder, ctrl, value))
+      fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value);
+
+    ctx_exit_on_error(&stream->encoder, "Failed to control codec");
+  }
+
+  /* Replay the options recorded through the key & value API. */
+  for (int idx = 0; idx < config->arg_key_val_cnt; ++idx) {
+    const char *name = config->arg_key_vals[idx][0];
+    const char *val = config->arg_key_vals[idx][1];
+    if (aom_codec_set_option(&stream->encoder, name, val))
+      fprintf(stderr, "Error: Tried to set option %s = %s\n", name, val);
+
+    ctx_exit_on_error(&stream->encoder, "Failed to set codec option");
+  }
+
+#if CONFIG_TUNE_VMAF
+  if (stream->config.vmaf_model_path) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH,
+                                  stream->config.vmaf_model_path);
+    ctx_exit_on_error(&stream->encoder, "Failed to set vmaf model path");
+  }
+#endif
+  if (stream->config.partition_info_path) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                  AV1E_SET_PARTITION_INFO_PATH,
+                                  stream->config.partition_info_path);
+    ctx_exit_on_error(&stream->encoder, "Failed to set partition info path");
+  }
+  if (stream->config.enable_rate_guide_deltaq) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                  AV1E_ENABLE_RATE_GUIDE_DELTAQ,
+                                  stream->config.enable_rate_guide_deltaq);
+    ctx_exit_on_error(&stream->encoder, "Failed to enable rate guide deltaq");
+  }
+  if (stream->config.rate_distribution_info) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                  AV1E_SET_RATE_DISTRIBUTION_INFO,
+                                  stream->config.rate_distribution_info);
+    ctx_exit_on_error(&stream->encoder, "Failed to set rate distribution info");
+  }
+
+  if (stream->config.film_grain_filename) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
+                                  stream->config.film_grain_filename);
+    ctx_exit_on_error(&stream->encoder, "Failed to set film grain table");
+  }
+  /* The color range is always applied. */
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE,
+                                stream->config.color_range);
+  ctx_exit_on_error(&stream->encoder, "Failed to set color range");
+
+#if CONFIG_AV1_DECODER
+  if (global->test_decode != TEST_DECODE_OFF) {
+    /* Look up the decoder that shares the encoder's short name. */
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(
+        get_short_name_by_aom_encoder(global->codec));
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal };
+    aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0);
+
+    if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) {
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE,
+                                    stream->config.cfg.large_scale_tile);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
+
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB,
+                                    stream->config.cfg.save_as_annexb);
+      ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb");
+
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW,
+                                    -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
+
+      AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL,
+                                    -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
+    }
+  }
+#endif
+}
+
+// Builds a monochrome view of 'img' in 'monochrome_img'. The output starts as
+// a struct copy of the input, so its Y plane points at the input's Y plane
+// (shallow copy): the input image must stay valid for as long as the output
+// image is used. The U and V planes are null with zero stride. The output
+// format is AOM_IMG_FMT_I420 (with the high-bit-depth flag carried over from
+// the input) because libaom does not have AOM_IMG_FMT_I400.
+static void convert_image_to_monochrome(const struct aom_image *img,
+                                        struct aom_image *monochrome_img) {
+  *monochrome_img = *img;
+
+  // Format: 4:2:0, keeping the input's high-bit-depth flag.
+  monochrome_img->fmt = AOM_IMG_FMT_I420;
+  if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) {
+    monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+  monochrome_img->monochrome = 1;
+  monochrome_img->csp = AOM_CSP_UNKNOWN;
+  monochrome_img->x_chroma_shift = 1;
+  monochrome_img->y_chroma_shift = 1;
+
+  // No chroma data.
+  monochrome_img->planes[AOM_PLANE_U] = NULL;
+  monochrome_img->stride[AOM_PLANE_U] = 0;
+  monochrome_img->planes[AOM_PLANE_V] = NULL;
+  monochrome_img->stride[AOM_PLANE_V] = 0;
+
+  monochrome_img->sz = 0;
+  if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) {
+    monochrome_img->bps = 16;
+  } else {
+    monochrome_img->bps = 8;
+  }
+
+  // The output does not own any pixel storage.
+  monochrome_img->img_data = NULL;
+  monochrome_img->img_data_owner = 0;
+  monochrome_img->self_allocd = 0;
+}
+
+// Encodes one frame on 'stream' and adds the time spent inside the encoder to
+// stream->cx_time.
+//
+// 'img' is the picture to encode, or NULL when the caller has no input frame
+// left (libaom treats a NULL image as a flush request). 'frames_in' is the
+// 1-based number of frames read so far; with the stream timebase and the
+// global frame rate it yields the presentation timestamp and the duration.
+//
+// If the picture size differs from the configured stream size, the picture is
+// rescaled with libyuv into stream->img, which is allocated on first use. The
+// process exits when the input is not 4:2:0, when libyuv support is compiled
+// out, or when the scaling buffer cannot be allocated. When cfg->monochrome is
+// set, only the Y plane is handed to the encoder.
+static void encode_frame(struct stream_state *stream,
+                         struct AvxEncoderConfig *global, struct aom_image *img,
+                         unsigned int frames_in) {
+  aom_codec_pts_t frame_start, next_frame_start;
+  struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+  struct aom_usec_timer timer;
+
+  frame_start =
+      (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) /
+      cfg->g_timebase.num / global->framerate.num;
+  next_frame_start =
+      (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) /
+      cfg->g_timebase.num / global->framerate.num;
+
+  /* Scale if necessary */
+  if (img) {
+    if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) &&
+        (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+      if (img->fmt != AOM_IMG_FMT_I42016) {
+        fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+        exit(EXIT_FAILURE);
+      }
+#if CONFIG_LIBYUV
+      if (!stream->img) {
+        stream->img =
+            aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16);
+        // aom_img_alloc() returns NULL on failure; scaling into it would
+        // dereference a null pointer.
+        if (!stream->img) fatal("Failed to allocate the scaled image buffer.");
+      }
+      // libyuv takes strides in samples, the aom_image strides are in bytes.
+      I420Scale_16(
+          (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2,
+          (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2,
+          (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2,
+          img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y],
+          stream->img->stride[AOM_PLANE_Y] / 2,
+          (uint16_t *)stream->img->planes[AOM_PLANE_U],
+          stream->img->stride[AOM_PLANE_U] / 2,
+          (uint16_t *)stream->img->planes[AOM_PLANE_V],
+          stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w,
+          stream->img->d_h, kFilterBox);
+      img = stream->img;
+#else
+      stream->encoder.err = 1;
+      ctx_exit_on_error(&stream->encoder,
+                        "Stream %d: Failed to encode frame.\n"
+                        "libyuv is required for scaling but is currently "
+                        "disabled.\n"
+                        "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                        "cmake.\n",
+                        stream->index);
+#endif
+    }
+  }
+  // 8-bit path. After a successful high-bitdepth rescale above, 'img' already
+  // has the stream dimensions, so this branch is not taken again.
+  if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+    if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) {
+      fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+      exit(EXIT_FAILURE);
+    }
+#if CONFIG_LIBYUV
+    if (!stream->img) {
+      stream->img =
+          aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16);
+      if (!stream->img) fatal("Failed to allocate the scaled image buffer.");
+    }
+    I420Scale(
+        img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y],
+        img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U],
+        img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h,
+        stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y],
+        stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U],
+        stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V],
+        stream->img->d_w, stream->img->d_h, kFilterBox);
+    img = stream->img;
+#else
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "Scaling disabled in this configuration. \n"
+                      "To enable, configure with --enable-libyuv\n",
+                      stream->index);
+#endif
+  }
+
+  // monochrome_img must stay in scope until aom_codec_encode() returns,
+  // because 'img' may point at it.
+  struct aom_image monochrome_img;
+  if (img && cfg->monochrome) {
+    convert_image_to_monochrome(img, &monochrome_img);
+    img = &monochrome_img;
+  }
+
+  aom_usec_timer_start(&timer);
+  aom_codec_encode(&stream->encoder, img, frame_start,
+                   (uint32_t)(next_frame_start - frame_start), 0);
+  aom_usec_timer_mark(&timer);
+  stream->cx_time += aom_usec_timer_elapsed(&timer);
+  ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+                    stream->index);
+}
+
+// Bumps the histogram bucket of the value reported by the encoder control
+// AOME_GET_LAST_QUANTIZER_64. Nothing is recorded during a first pass. The
+// process exits if the control call fails.
+static void update_quantizer_histogram(struct stream_state *stream) {
+  int last_q;
+
+  if (stream->config.cfg.g_pass == AOM_RC_FIRST_PASS) return;
+
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64,
+                                &last_q);
+  ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+  stream->counts[last_q] += 1;
+}
+
+static void get_cx_data(struct stream_state *stream,
+ struct AvxEncoderConfig *global, int *got_data) {
+ const aom_codec_cx_pkt_t *pkt;
+ const struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+ aom_codec_iter_t iter = NULL;
+ // Drains every packet currently queued in stream->encoder. *got_data ends up 1 only if at least one compressed-frame packet was seen.
+ *got_data = 0;
+ while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) {
+ static size_t fsize = 0;
+ static FileOffset ivf_header_pos = 0;
+ // NOTE(review): the two statics above are function-level, so they are shared by every stream and every call - confirm this is safe with multiple streams.
+ switch (pkt->kind) {
+ case AOM_CODEC_CX_FRAME_PKT:
+ ++stream->frames_out;
+ if (!global->quiet)
+ fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
+ // Compressed frame: feed the rate histogram, then write it to the WebM or IVF/raw output.
+ update_rate_histogram(stream->rate_hist, cfg, pkt);
+#if CONFIG_WEBM_IO
+ if (stream->config.write_webm) {
+ if (write_webm_block(&stream->webm_ctx, cfg, pkt) != 0) {
+ fatal("WebM writer failed.");
+ }
+ }
+#endif
+ if (!stream->config.write_webm) {
+ if (stream->config.write_ivf) {
+ if (pkt->data.frame.partition_id <= 0) {
+ ivf_header_pos = ftello(stream->file);  // remembered so a later partition can patch the size
+ fsize = pkt->data.frame.sz;
+ // partition_id <= 0: start of a frame, write a fresh IVF frame header.
+ ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+ } else {
+ fsize += pkt->data.frame.sz;
+ // Later partition: seek back, rewrite the accumulated frame size, then return to the write position.
+ const FileOffset currpos = ftello(stream->file);
+ fseeko(stream->file, ivf_header_pos, SEEK_SET);
+ ivf_write_frame_size(stream->file, fsize);
+ fseeko(stream->file, currpos, SEEK_SET);
+ }
+ }
+ // The fwrite() result is explicitly discarded; a short write is not detected here.
+ (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+ stream->file);
+ }
+ stream->nbytes += pkt->data.raw.sz;
+ // Only compressed-frame packets count as "data" for the caller.
+ *got_data = 1;
+#if CONFIG_AV1_DECODER
+ if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
+ aom_codec_decode(&stream->decoder, pkt->data.frame.buf,
+ pkt->data.frame.sz, NULL);
+ if (stream->decoder.err) {
+ warn_or_exit_on_error(&stream->decoder,
+ global->test_decode == TEST_DECODE_FATAL,
+ "Failed to decode frame %d in stream %d",
+ stream->frames_out + 1, stream->index);  // NOTE(review): frames_out was already incremented above - confirm the +1
+ stream->mismatch_seen = stream->frames_out + 1;  // non-zero value stops further test decoding of this stream
+ }
+ }
+#endif
+ break;
+ case AOM_CODEC_STATS_PKT:
+ stream->frames_out++;
+ stats_write(&stream->stats, pkt->data.twopass_stats.buf,
+ pkt->data.twopass_stats.sz);
+ stream->nbytes += pkt->data.raw.sz;
+ break;
+ case AOM_CODEC_PSNR_PKT:
+ // PSNR packet: accumulate totals for show_psnr(); slot 0 takes the psnr/sse/samples fields.
+ if (global->show_psnr >= 1) {
+ int i;
+ // Index 0 of sse/samples goes into the running totals; psnr[0..3] are summed individually.
+ stream->psnr_sse_total[0] += pkt->data.psnr.sse[0];
+ stream->psnr_samples_total[0] += pkt->data.psnr.samples[0];
+ for (i = 0; i < 4; i++) {
+ if (!global->quiet)
+ fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
+ stream->psnr_totals[0][i] += pkt->data.psnr.psnr[i];
+ }
+ stream->psnr_count[0]++;
+ // Slot 1 takes the *_hbd fields, only when the input bit depth is below the stream bit depth.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (stream->config.cfg.g_input_bit_depth <
+ (unsigned int)stream->config.cfg.g_bit_depth) {
+ stream->psnr_sse_total[1] += pkt->data.psnr.sse_hbd[0];
+ stream->psnr_samples_total[1] += pkt->data.psnr.samples_hbd[0];
+ for (i = 0; i < 4; i++) {
+ if (!global->quiet)
+ fprintf(stderr, "%.3f ", pkt->data.psnr.psnr_hbd[i]);
+ stream->psnr_totals[1][i] += pkt->data.psnr.psnr_hbd[i];
+ }
+ stream->psnr_count[1]++;
+ }
+#endif
+ }
+ // All other packet kinds are ignored.
+ break;
+ default: break;
+ }
+ }
+}
+
+// Prints the PSNR summary line of 'stream' to stderr, using the totals kept in
+// slot 0 of the stream's PSNR accumulators. 'peak' is the peak sample value
+// passed to sse_to_psnr(); 'bps' is printed only when it is positive. Nothing
+// is printed when no PSNR packet was accumulated.
+static void show_psnr(struct stream_state *stream, double peak, int64_t bps) {
+  if (stream->psnr_count[0] == 0) return;
+
+  fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+
+  // Overall PSNR from the summed sample count and squared error.
+  fprintf(stderr, " %.3f",
+          sse_to_psnr((double)stream->psnr_samples_total[0], peak,
+                      (double)stream->psnr_sse_total[0]));
+
+  // Mean of each per-frame PSNR component.
+  for (int component = 0; component < 4; ++component) {
+    fprintf(stderr, " %.3f",
+            stream->psnr_totals[0][component] / stream->psnr_count[0]);
+  }
+
+  if (bps > 0) fprintf(stderr, " %7" PRId64 " bps", bps);
+
+  fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
+  fprintf(stderr, "\n");
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Same report as show_psnr(), but built from slot 1 of the stream's PSNR
+// accumulators, i.e. the PSNR computed at the stream bit depth. Nothing is
+// printed when that slot holds no samples.
+static void show_psnr_hbd(struct stream_state *stream, double peak,
+                          int64_t bps) {
+  if (stream->psnr_count[1] == 0) return;
+
+  fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+
+  // Overall PSNR from the summed sample count and squared error.
+  fprintf(stderr, " %.3f",
+          sse_to_psnr((double)stream->psnr_samples_total[1], peak,
+                      (double)stream->psnr_sse_total[1]));
+
+  // Mean of each per-frame PSNR component.
+  for (int component = 0; component < 4; ++component) {
+    fprintf(stderr, " %.3f",
+            stream->psnr_totals[1][component] / stream->psnr_count[1]);
+  }
+
+  if (bps > 0) fprintf(stderr, " %7" PRId64 " bps", bps);
+
+  fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
+  fprintf(stderr, "\n");
+}
+#endif
+
+// Converts 'frames' processed in 'usec' microseconds into frames per second.
+// Returns 0 when no time has elapsed.
+static float usec_to_fps(uint64_t usec, unsigned int frames) {
+  if (usec == 0) return (float)0;
+  return (float)(frames * 1000000.0 / (float)usec);
+}
+
+// Fetches the newest frame from the encoder and from the decoder of 'stream'
+// (AV1_GET_NEW_FRAME_IMAGE) and compares them.
+//
+// If only one of the two images is high bit depth, that one is truncated to
+// 8 bits before the comparison. On a mismatch the first differing positions
+// are reported, stream->mismatch_seen is set to the current output frame
+// count, and the process exits when 'fatal' is TEST_DECODE_FATAL. The function
+// does nothing once a mismatch has been recorded for the stream.
+static void test_decode(struct stream_state *stream,
+                        enum TestDecodeFatality fatal) {
+  aom_image_t enc_img, dec_img;
+
+  if (stream->mismatch_seen) return;
+
+  /* Get the internal reference frame. Each control call is checked before its
+   * image is touched: on failure the image is left uninitialized.
+   */
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE,
+                                &enc_img);
+  ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
+  AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE,
+                                &dec_img);
+  ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
+
+  if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+      (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    // The local parameter 'fatal' hides the fatal() helper in this function,
+    // so allocation failures are reported with fprintf()/exit().
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t enc_8bit_img;
+      if (!aom_img_alloc(&enc_8bit_img,
+                         enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, enc_img.d_w,
+                         enc_img.d_h, 16)) {
+        fprintf(stderr, "%s: failed to allocate image for test decode\n",
+                exec_name);
+        exit(EXIT_FAILURE);
+      }
+      aom_img_truncate_16_to_8(&enc_8bit_img, &enc_img);
+      enc_img = enc_8bit_img;
+    }
+    if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_image_t dec_8bit_img;
+      if (!aom_img_alloc(&dec_8bit_img,
+                         dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, dec_img.d_w,
+                         dec_img.d_h, 16)) {
+        fprintf(stderr, "%s: failed to allocate image for test decode\n",
+                exec_name);
+        exit(EXIT_FAILURE);
+      }
+      aom_img_truncate_16_to_8(&dec_8bit_img, &dec_img);
+      dec_img = dec_8bit_img;
+    }
+  }
+
+  if (!aom_compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+    // Flag the decoder context so that warn_or_exit_on_error() reports.
+    stream->decoder.err = 1;
+    warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
+                          "Stream %d: Encode/decode mismatch on frame %d at"
+                          " Y[%d, %d] {%d/%d},"
+                          " U[%d, %d] {%d/%d},"
+                          " V[%d, %d] {%d/%d}",
+                          stream->index, stream->frames_out, y[0], y[1], y[2],
+                          y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]);
+    stream->mismatch_seen = stream->frames_out;
+  }
+
+  aom_img_free(&enc_img);
+  aom_img_free(&dec_img);
+}
+
+// Prints "[<label> h:mm:ss] " to stderr for a duration of 'etl' seconds, or
+// "[<label> unknown] " when 'etl' is negative.
+static void print_time(const char *label, int64_t etl) {
+  if (etl < 0) {
+    fprintf(stderr, "[%3s unknown] ", label);
+    return;
+  }
+
+  const int64_t hours = etl / 3600;
+  const int64_t mins = (etl % 3600) / 60;
+  const int64_t secs = etl % 60;
+
+  fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label,
+          hours, mins, secs);
+}
+
+// Resets the per-pass counters of 'stream': the quantizer histogram and both
+// sets of PSNR accumulators.
+static void clear_stream_count_state(struct stream_state *stream) {
+  // Quantizer histogram.
+  memset(stream->counts, 0, sizeof(stream->counts));
+
+  // PSNR accumulators, one set per slot.
+  for (int slot = 0; slot < 2; ++slot) {
+    stream->psnr_count[slot] = 0;
+    stream->psnr_samples_total[slot] = 0;
+    stream->psnr_sse_total[slot] = 0;
+    for (int component = 0; component < 4; ++component) {
+      stream->psnr_totals[slot][component] = 0;
+    }
+  }
+}
+
+// Tells whether pass number 'pass' (0-based) is the downscaled one. That is
+// the case only when all three conditions hold:
+// 1. no specific pass was requested on the command line (global_pass == 0,
+//    so aomenc runs every pass),
+// 2. the encode has more than 2 passes in total,
+// 3. 'pass' is the second pass (pass == 1).
+static int pass_need_downscale(int global_pass, int global_passes, int pass) {
+  if (global_pass) return 0;
+  if (global_passes <= 2) return 0;
+  return pass == 1;
+}
+
+// Entry point of the aomenc command-line encoder.
+//
+// Parses the global options and then the options of each output stream, and
+// runs the requested pass(es). Each pass (re)opens the input, completes the
+// stream configurations from the input properties, encodes the frames on
+// every stream, prints statistics and tears the codecs down again.
+//
+// Returns EXIT_FAILURE when the argument list cannot be duplicated, or when
+// --test-decode is fatal and a mismatch was recorded; EXIT_SUCCESS otherwise.
+// Many error paths terminate the process directly through die()/fatal().
+int main(int argc, const char **argv_) {
+  int pass;
+  aom_image_t raw;
+  aom_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int do_16bit_internal = 0;
+  int input_shift = 0;
+  int frame_avail, got_data;
+
+  struct AvxInputContext input;
+  struct AvxEncoderConfig global;
+  struct stream_state *streams = NULL;
+  char **argv, **argi;
+  uint64_t cx_time = 0;
+  int stream_cnt = 0;
+  int res = 0;
+  int profile_updated = 0;
+
+  memset(&input, 0, sizeof(input));
+  memset(&raw, 0, sizeof(raw));
+  exec_name = argv_[0];
+
+  /* Setup default input stream settings */
+  input.framerate.numerator = 30;
+  input.framerate.denominator = 1;
+  input.only_i420 = 1;
+  input.bit_depth = 0;
+
+  /* First parse the global configuration values, because we want to apply
+   * other parameters on top of the default configuration provided by the
+   * codec.
+   */
+  argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
+  parse_global_config(&global, &argv);
+
+  if (argc < 2) usage_exit();
+
+  switch (global.color_type) {
+    case I420: input.fmt = AOM_IMG_FMT_I420; break;
+    case I422: input.fmt = AOM_IMG_FMT_I422; break;
+    case I444: input.fmt = AOM_IMG_FMT_I444; break;
+    case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
+    case NV12: input.fmt = AOM_IMG_FMT_NV12; break;
+  }
+
+  {
+    /* Now parse each stream's parameters. Using a local scope here
+     * due to the use of 'stream' as loop variable in FOREACH_STREAM
+     * loops
+     */
+    struct stream_state *stream = NULL;
+
+    do {
+      stream = new_stream(&global, stream);
+      stream_cnt++;
+      if (!streams) streams = stream;
+    } while (parse_stream_params(&global, stream, argv));
+  }
+
+  /* Check for unrecognized options */
+  for (argi = argv; *argi; argi++)
+    if (argi[0][0] == '-' && argi[0][1])
+      die("Error: Unrecognized option %s\n", *argi);
+
+  FOREACH_STREAM(stream, streams) {
+    check_encoder_config(global.disable_warning_prompt, &global,
+                         &stream->config.cfg);
+
+    // If large_scale_tile = 1, only support to output to ivf format.
+    if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+      die("only support ivf output format while large-scale-tile=1\n");
+  }
+
+  /* Handle non-option arguments */
+  input.filename = argv[0];
+  const char *orig_input_filename = input.filename;
+  // Remember the settings that the per-pass setup below may override.
+  FOREACH_STREAM(stream, streams) {
+    stream->orig_out_fn = stream->config.out_fn;
+    stream->orig_width = stream->config.cfg.g_w;
+    stream->orig_height = stream->config.cfg.g_h;
+    stream->orig_write_ivf = stream->config.write_ivf;
+    stream->orig_write_webm = stream->config.write_webm;
+  }
+
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
+
+  /* Decide if other chroma subsamplings than 4:2:0 are supported */
+  if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC)
+    input.only_i420 = 0;
+
+  // global.pass is 1-based and 0 means "run every pass"; 'pass' is 0-based.
+  for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    if (pass > 1) {
+      FOREACH_STREAM(stream, streams) { clear_stream_count_state(stream); }
+    }
+
+    int frames_in = 0, seen_frames = 0;
+    int64_t estimated_time_left = -1;
+    int64_t average_rate = -1;
+    int64_t lagged_count = 0;
+    const int need_downscale =
+        pass_need_downscale(global.pass, global.passes, pass);
+
+    // Set the output to the specified two-pass output file, and
+    // restore the width and height to the original values.
+    FOREACH_STREAM(stream, streams) {
+      if (need_downscale) {
+        stream->config.out_fn = stream->config.two_pass_output;
+        // Libaom currently only supports the ivf format for the third pass.
+        stream->config.write_ivf = 1;
+        stream->config.write_webm = 0;
+      } else {
+        stream->config.out_fn = stream->orig_out_fn;
+        stream->config.write_ivf = stream->orig_write_ivf;
+        stream->config.write_webm = stream->orig_write_webm;
+      }
+      stream->config.cfg.g_w = stream->orig_width;
+      stream->config.cfg.g_h = stream->orig_height;
+    }
+
+    // For second pass in three-pass encoding, set the input to
+    // the given two-pass-input file if available. If the scaled input is not
+    // given, we will attempt to re-scale the original input.
+    input.filename = orig_input_filename;
+    const char *two_pass_input = NULL;
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_input) {
+          two_pass_input = stream->config.two_pass_input;
+          input.filename = two_pass_input;
+          break;
+        }
+      }
+    }
+
+    open_input_file(&input, global.csp);
+
+    /* If the input file doesn't specify its w/h (raw files), try to get
+     * the data from the first stream's configuration.
+     */
+    if (!input.width || !input.height) {
+      if (two_pass_input) {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.two_pass_width && stream->config.two_pass_height) {
+            input.width = stream->config.two_pass_width;
+            input.height = stream->config.two_pass_height;
+            break;
+          }
+        }
+      } else {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+            input.width = stream->config.cfg.g_w;
+            input.height = stream->config.cfg.g_h;
+            break;
+          }
+        }
+      }
+    }
+
+    /* Update stream configurations from the input file's parameters */
+    if (!input.width || !input.height) {
+      if (two_pass_input) {
+        fatal(
+            "Specify downscaled stream dimensions with --two-pass-width "
+            " and --two-pass-height");
+      } else {
+        fatal(
+            "Specify stream dimensions with --width (-w) "
+            " and --height (-h)");
+      }
+    }
+
+    // Pick the coded size of the downscaled pass: explicit two-pass size
+    // first, then the size of the two-pass input file, then half (rounded
+    // up) of the original or input size, unless a bitrate-accuracy
+    // configuration keeps the full size.
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_width && stream->config.two_pass_height) {
+          stream->config.cfg.g_w = stream->config.two_pass_width;
+          stream->config.cfg.g_h = stream->config.two_pass_height;
+        } else if (two_pass_input) {
+          stream->config.cfg.g_w = input.width;
+          stream->config.cfg.g_h = input.height;
+        } else if (stream->orig_width && stream->orig_height) {
+#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = stream->orig_width;
+          stream->config.cfg.g_h = stream->orig_height;
+#else   // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = (stream->orig_width + 1) / 2;
+          stream->config.cfg.g_h = (stream->orig_height + 1) / 2;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+        } else {
+#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = input.width;
+          stream->config.cfg.g_h = input.height;
+#else   // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+          stream->config.cfg.g_w = (input.width + 1) / 2;
+          stream->config.cfg.g_h = (input.height + 1) / 2;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL
+        }
+      }
+    }
+
+    /* If input file does not specify bit-depth but input-bit-depth parameter
+     * exists, assume that to be the input bit-depth. However, if the
+     * input-bit-depth parameter does not exist, assume the input bit-depth
+     * to be the same as the codec bit-depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      }
+      if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM(stream, streams) {
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016 &&
+          input.fmt != AOM_IMG_FMT_NV12) {
+        /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+           was selected. */
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              if (!stream->config.cfg.monochrome) {
+                stream->config.cfg.g_profile = 1;
+                profile_updated = 1;
+              }
+            } else if (input.bit_depth == 12 ||
+                       ((input.fmt == AOM_IMG_FMT_I422 ||
+                         input.fmt == AOM_IMG_FMT_I42216) &&
+                        !stream->config.cfg.monochrome)) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            }
+            break;
+          case 1:
+            if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            }
+            break;
+          case 2:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              stream->config.cfg.g_profile = 1;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_Y4M) {
+              // Note that here the input file values for chroma subsampling
+              // are used instead of those from the command line.
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            input.y4m.dst_c_dec_h >> 1);
+              ctx_exit_on_error(&stream->encoder,
+                                "Failed to set chroma subsampling x");
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            input.y4m.dst_c_dec_v >> 1);
+              ctx_exit_on_error(&stream->encoder,
+                                "Failed to set chroma subsampling y");
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_RAW) {
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            stream->chroma_subsampling_x);
+              ctx_exit_on_error(&stream->encoder,
+                                "Failed to set chroma subsampling x");
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            stream->chroma_subsampling_y);
+              ctx_exit_on_error(&stream->encoder,
+                                "Failed to set chroma subsampling y");
+            }
+            break;
+          default: break;
+        }
+      }
+      /* Automatically set the codec bit depth to match the input bit depth.
+       * Upgrade the profile if required. */
+      if (stream->config.cfg.g_input_bit_depth >
+          (unsigned int)stream->config.cfg.g_bit_depth) {
+        stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
+        if (!global.quiet) {
+          fprintf(stderr,
+                  "Warning: automatically updating bit depth to %d to "
+                  "match input format.\n",
+                  stream->config.cfg.g_input_bit_depth);
+        }
+      }
+#if !CONFIG_AV1_HIGHBITDEPTH
+      if (stream->config.cfg.g_bit_depth > 8) {
+        fatal("Unsupported bit-depth with CONFIG_AV1_HIGHBITDEPTH=0\n");
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+      if (stream->config.cfg.g_bit_depth > 10) {
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+          case 1:
+            stream->config.cfg.g_profile = 2;
+            profile_updated = 1;
+            break;
+          default: break;
+        }
+      }
+      if (stream->config.cfg.g_bit_depth > 8) {
+        stream->config.use_16bit_internal = 1;
+      }
+      if (profile_updated && !global.quiet) {
+        fprintf(stderr,
+                "Warning: automatically updating to profile %d to "
+                "match input format.\n",
+                stream->config.cfg.g_profile);
+      }
+      if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth ==
+                                      stream->config.cfg.g_bit_depth)) {
+        fprintf(stderr,
+                "Warning: --psnr==2 and --psnr==1 will provide same "
+                "results when input bit-depth == stream bit-depth, "
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
+      if (global.show_psnr < 0 || global.show_psnr > 2) {
+        fprintf(stderr,
+                "Warning: --psnr can take only 0,1,2 as values, "
+                "falling back to default psnr value\n");
+        global.show_psnr = 1;
+      }
+      /* Set limit */
+      stream->config.cfg.g_limit = global.limit;
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      set_stream_dimensions(stream, input.width, input.height);
+      stream->config.color_range = input.color_range;
+    }
+    FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
+
+    /* Ensure that --passes and --pass are consistent. If --pass is set and
+     * --passes >= 2, ensure --fpf was set.
+     */
+    if (global.pass > 0 && global.pass <= 3 && global.passes >= 2) {
+      FOREACH_STREAM(stream, streams) {
+        if (!stream->config.stats_fn)
+          die("Stream %d: Must specify --fpf when --pass=%d"
+              " and --passes=%d\n",
+              stream->index, global.pass, global.passes);
+      }
+    }
+
+#if !CONFIG_WEBM_IO
+    FOREACH_STREAM(stream, streams) {
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        stream->config.write_ivf = 0;
+        aom_tools_warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+      }
+    }
+#endif
+
+    /* Use the frame rate from the file only if none was specified
+     * on the command-line.
+     */
+    if (!global.have_framerate) {
+      global.framerate.num = input.framerate.numerator;
+      global.framerate.den = input.framerate.denominator;
+    }
+    // The stream timebase is the reciprocal of the frame rate.
+    FOREACH_STREAM(stream, streams) {
+      stream->config.cfg.g_timebase.den = global.framerate.num;
+      stream->config.cfg.g_timebase.num = global.framerate.den;
+    }
+    /* Show configuration */
+    if (global.verbose && pass == 0) {
+      FOREACH_STREAM(stream, streams) {
+        show_stream_config(stream, &global, &input);
+      }
+    }
+
+    // One-time setup, done on the first pass that is actually executed.
+    if (pass == (global.pass ? global.pass - 1 : 0)) {
+      // The Y4M reader does its own allocation.
+      if (input.file_type != FILE_TYPE_Y4M) {
+        // aom_img_alloc() returns NULL on failure; reading frames into an
+        // unallocated image must not be attempted.
+        if (!aom_img_alloc(&raw, input.fmt, input.width, input.height, 32)) {
+          fatal("Failed to allocate the input image buffer.");
+        }
+      }
+      FOREACH_STREAM(stream, streams) {
+        stream->rate_hist =
+            init_rate_histogram(&stream->config.cfg, &global.framerate);
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
+    FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
+    FOREACH_STREAM(stream, streams) {
+      char *encoder_settings = NULL;
+#if CONFIG_WEBM_IO
+      // Test frameworks may compare outputs from different versions, but only
+      // wish to check for bitstream changes. The encoder-settings tag, however,
+      // can vary if the version is updated, even if no encoder algorithm
+      // changes were made. To work around this issue, do not output
+      // the encoder-settings tag when --debug is enabled (which is the flag
+      // that test frameworks should use, when they want deterministic output
+      // from the container format).
+      if (stream->config.write_webm && !stream->webm_ctx.debug) {
+        encoder_settings = extract_encoder_settings(
+            aom_codec_version_str(), argv_, argc, input.filename);
+        if (encoder_settings == NULL) {
+          fprintf(
+              stderr,
+              "Warning: unable to extract encoder settings. Continuing...\n");
+        }
+      }
+#endif
+      open_output_file(stream, &global, &input.pixel_aspect_ratio,
+                       encoder_settings);
+      free(encoder_settings);
+    }
+
+    if (strcmp(get_short_name_by_aom_encoder(global.codec), "av1") == 0) {
+      // Check to see if at least one stream uses 16 bit internal.
+      // Currently assume that the bit_depths for all streams using
+      // highbitdepth are the same.
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.use_16bit_internal) {
+          do_16bit_internal = 1;
+        }
+        input_shift = (int)stream->config.cfg.g_bit_depth -
+                      stream->config.cfg.g_input_bit_depth;
+      }
+    }
+
+    frame_avail = 1;
+    got_data = 0;
+
+    // Keep going while there is input left or the encoders still return data.
+    while (frame_avail || got_data) {
+      struct aom_usec_timer timer;
+
+      if (!global.limit || frames_in < global.limit) {
+        frame_avail = read_frame(&input, &raw);
+
+        if (frame_avail) frames_in++;
+        seen_frames =
+            frames_in > global.skip_frames ? frames_in - global.skip_frames : 0;
+
+        if (!global.quiet) {
+          float fps = usec_to_fps(cx_time, seen_frames);
+          fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes);
+
+          if (stream_cnt == 1)
+            fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in,
+                    streams->frames_out, (int64_t)streams->nbytes);
+          else
+            fprintf(stderr, "frame %4d ", frames_in);
+
+          fprintf(stderr, "%7" PRId64 " %s %.2f %s ",
+                  cx_time > 9999999 ? cx_time / 1000 : cx_time,
+                  cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60,
+                  fps >= 1.0 ? "fps" : "fpm");
+          print_time("ETA", estimated_time_left);
+          // mingw-w64 gcc does not match msvc for stderr buffering behavior
+          // and uses line buffering, thus the progress output is not
+          // real-time. The fflush() is here to make sure the progress output
+          // is sent out while the clip is being processed.
+          fflush(stderr);
+        }
+
+      } else {
+        frame_avail = 0;
+      }
+
+      if (frames_in > global.skip_frames) {
+        aom_image_t *frame_to_encode;
+        if (input_shift || (do_16bit_internal && input.bit_depth == 8)) {
+          assert(do_16bit_internal);
+          // Input bit depth and stream bit depth do not match, so up
+          // shift frame to stream bit depth
+          if (!allocated_raw_shift) {
+            if (!aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                               input.width, input.height, 32)) {
+              fatal("Failed to allocate the bit-depth conversion buffer.");
+            }
+            allocated_raw_shift = 1;
+          }
+          aom_img_upshift(&raw_shift, &raw, input_shift);
+          frame_to_encode = &raw_shift;
+        } else {
+          frame_to_encode = &raw;
+        }
+        aom_usec_timer_start(&timer);
+        // A NULL image is passed once no more input frames are available.
+        if (do_16bit_internal) {
+          assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+          FOREACH_STREAM(stream, streams) {
+            if (stream->config.use_16bit_internal)
+              encode_frame(stream, &global,
+                           frame_avail ? frame_to_encode : NULL, frames_in);
+            else
+              assert(0);
+          }
+        } else {
+          assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0);
+          FOREACH_STREAM(stream, streams) {
+            encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL,
+                         frames_in);
+          }
+        }
+        aom_usec_timer_mark(&timer);
+        cx_time += aom_usec_timer_elapsed(&timer);
+
+        FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); }
+
+        got_data = 0;
+        FOREACH_STREAM(stream, streams) {
+          get_cx_data(stream, &global, &got_data);
+        }
+
+        // ETA estimate. Until the first stream produces output, only record
+        // how much input has been consumed (lagged_count); afterwards derive
+        // a smoothed processing rate from the progress made since then.
+        if (!got_data && input.length && streams != NULL &&
+            !streams->frames_out) {
+          lagged_count = global.limit ? seen_frames : ftello(input.file);
+        } else if (input.length) {
+          int64_t remaining;
+          int64_t rate;
+
+          if (global.limit) {
+            const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000;
+
+            rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0;
+            remaining = 1000 * (global.limit - global.skip_frames -
+                                seen_frames + lagged_count);
+          } else {
+            const int64_t input_pos = ftello(input.file);
+            const int64_t input_pos_lagged = input_pos - lagged_count;
+            const int64_t input_limit = input.length;
+
+            rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0;
+            remaining = input_limit - input_pos + lagged_count;
+          }
+
+          average_rate =
+              (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8;
+          estimated_time_left = average_rate ? remaining / average_rate : -1;
+        }
+
+        if (got_data && global.test_decode != TEST_DECODE_OFF) {
+          FOREACH_STREAM(stream, streams) {
+            test_decode(stream, global.test_decode);
+          }
+        }
+      }
+
+      fflush(stdout);
+      if (!global.quiet) fprintf(stderr, "\033[K");
+    }
+
+    if (stream_cnt > 1) fprintf(stderr, "\n");
+
+    if (!global.quiet) {
+      FOREACH_STREAM(stream, streams) {
+        const int64_t bpf =
+            seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0;
+        const int64_t bps = bpf * global.framerate.num / global.framerate.den;
+        fprintf(stderr,
+                "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64
+                "b/f %7" PRId64
+                "b/s"
+                " %7" PRId64 " %s (%.2f fps)\033[K\n",
+                pass + 1, global.passes, frames_in, stream->frames_out,
+                (int64_t)stream->nbytes, bpf, bps,
+                stream->cx_time > 9999999 ? stream->cx_time / 1000
+                                          : stream->cx_time,
+                stream->cx_time > 9999999 ? "ms" : "us",
+                usec_to_fps(stream->cx_time, seen_frames));
+        // This instance of cr does not need fflush as it is followed by a
+        // newline in the same string.
+      }
+    }
+
+    if (global.show_psnr >= 1) {
+      if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) {
+        FOREACH_STREAM(stream, streams) {
+          int64_t bps = 0;
+          if (global.show_psnr == 1) {
+            if (stream->psnr_count[0] && seen_frames && global.framerate.den) {
+              bps = (int64_t)stream->nbytes * 8 *
+                    (int64_t)global.framerate.num / global.framerate.den /
+                    seen_frames;
+            }
+            show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
+                      bps);
+          }
+          if (global.show_psnr == 2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+            if (stream->config.cfg.g_input_bit_depth <
+                (unsigned int)stream->config.cfg.g_bit_depth)
+              show_psnr_hbd(stream, (1 << stream->config.cfg.g_bit_depth) - 1,
+                            bps);
+#endif
+          }
+        }
+      } else {
+        FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); }
+      }
+    }
+
+    // After the last pass, warn when the produced level exceeds the target.
+    if (pass == global.passes - 1) {
+      FOREACH_STREAM(stream, streams) {
+        int num_operating_points;
+        int levels[32];
+        int target_levels[32];
+        aom_codec_control(&stream->encoder, AV1E_GET_NUM_OPERATING_POINTS,
+                          &num_operating_points);
+        aom_codec_control(&stream->encoder, AV1E_GET_SEQ_LEVEL_IDX, levels);
+        aom_codec_control(&stream->encoder, AV1E_GET_TARGET_SEQ_LEVEL_IDX,
+                          target_levels);
+
+        for (int i = 0; i < num_operating_points; i++) {
+          if (levels[i] > target_levels[i]) {
+            if (levels[i] == 31) {
+              aom_tools_warn(
+                  "Failed to encode to target level %d.%d for operating point "
+                  "%d. The output level is SEQ_LEVEL_MAX",
+                  2 + (target_levels[i] >> 2), target_levels[i] & 3, i);
+            } else {
+              aom_tools_warn(
+                  "Failed to encode to target level %d.%d for operating point "
+                  "%d. The output level is %d.%d",
+                  2 + (target_levels[i] >> 2), target_levels[i] & 3, i,
+                  2 + (levels[i] >> 2), levels[i] & 3);
+            }
+          }
+        }
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); }
+
+    if (global.test_decode != TEST_DECODE_OFF) {
+      FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); }
+    }
+
+    close_input_file(&input);
+
+    if (global.test_decode == TEST_DECODE_FATAL) {
+      FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; }
+    }
+    FOREACH_STREAM(stream, streams) {
+      close_output_file(stream, get_fourcc_by_aom_encoder(global.codec));
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      stats_close(&stream->stats, global.passes - 1);
+    }
+
+    // A specific pass was requested: run only that one.
+    if (global.pass) break;
+  }
+
+  if (global.show_q_hist_buckets) {
+    FOREACH_STREAM(stream, streams) {
+      show_q_histogram(stream->counts, global.show_q_hist_buckets);
+    }
+  }
+
+  if (global.show_rate_hist_buckets) {
+    FOREACH_STREAM(stream, streams) {
+      show_rate_histogram(stream->rate_hist, &stream->config.cfg,
+                          global.show_rate_hist_buckets);
+    }
+  }
+  FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); }
+
+#if CONFIG_INTERNAL_STATS
+  /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now,
+   * to match some existing utilities.
+   */
+  if (!(global.pass == 1 && global.passes == 2)) {
+    FOREACH_STREAM(stream, streams) {
+      FILE *f = fopen("opsnr.stt", "a");
+      // fopen() returns NULL on failure; writing to it would crash.
+      if (f == NULL) {
+        aom_tools_warn("Unable to open opsnr.stt for appending.");
+      } else {
+        if (stream->mismatch_seen) {
+          fprintf(f, "First mismatch occurred in frame %d\n",
+                  stream->mismatch_seen);
+        } else {
+          fprintf(f, "No mismatch detected in recon buffers\n");
+        }
+        fclose(f);
+      }
+    }
+  }
+#endif
+
+  if (allocated_raw_shift) aom_img_free(&raw_shift);
+  aom_img_free(&raw);
+  free(argv);
+  // NOTE(review): only the head of the stream list is freed here; any further
+  // streams appear to be left for process exit to reclaim - confirm against
+  // new_stream().
+  free(streams);
+  return res ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/third_party/aom/apps/aomenc.h b/third_party/aom/apps/aomenc.h
new file mode 100644
index 0000000000..935d5fcd16
--- /dev/null
+++ b/third_party/aom/apps/aomenc.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_APPS_AOMENC_H_
+#define AOM_APPS_AOMENC_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "av1/arg_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {  // Input color formats; this is the type of AvxEncoderConfig.color_type
+ I420, // 4:2:0 8+ bit-depth
+ I422, // 4:2:2 8+ bit-depth
+ I444, // 4:4:4 8+ bit-depth
+ YV12, // 4:2:0 with uv flipped, only 8-bit depth
+ NV12, // 4:2:0 with uv interleaved, only 8-bit depth
+} ColorInputType;
+
+/* Configuration elements common to all streams. */
+struct AvxEncoderConfig {
+ aom_codec_iface_t *codec;
+ int passes;  // NOTE(review): presumably the --passes value (total pass count) - confirm in aomenc.c
+ int pass;  // NOTE(review): presumably the --pass value (single pass to run) - confirm in aomenc.c
+ unsigned int usage;
+ ColorInputType color_type;  // one of the ColorInputType values declared in this header
+ int quiet;
+ int verbose;
+ int limit;
+ int skip_frames;
+ int show_psnr;
+ enum TestDecodeFatality test_decode;  // TEST_DECODE_OFF / _FATAL / _WARN, declared in av1/arg_defs.h
+ int have_framerate;
+ struct aom_rational framerate;
+ int debug;
+ int show_q_hist_buckets;  // NOTE(review): presumably the --q-hist bucket count, 0 meaning "not requested" - confirm
+ int show_rate_hist_buckets;  // NOTE(review): presumably the --rate-hist bucket count, 0 meaning "not requested" - confirm
+ int disable_warnings;
+ int disable_warning_prompt;
+ int experimental_bitstream;
+ aom_chroma_sample_position_t csp;
+ cfg_options_t encoder_config;
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_APPS_AOMENC_H_
diff --git a/third_party/aom/av1/arg_defs.c b/third_party/aom/av1/arg_defs.c
new file mode 100644
index 0000000000..057565411a
--- /dev/null
+++ b/third_party/aom/av1/arg_defs.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/arg_defs.h"
+
+static const struct arg_enum_list test_decode_enum[] = {  // names accepted by --test-decode
+ { "off", TEST_DECODE_OFF },
+ { "fatal", TEST_DECODE_FATAL },
+ { "warn", TEST_DECODE_WARN },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list bitdepth_enum[] = {  // names accepted by --bit-depth (-b)
+ { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
+};
+
+#if CONFIG_WEBM_IO
+static const struct arg_enum_list stereo_mode_enum[] = {  // names accepted by --stereo-mode (CONFIG_WEBM_IO builds only)
+ { "mono", STEREO_FORMAT_MONO },
+ { "left-right", STEREO_FORMAT_LEFT_RIGHT },
+ { "bottom-top", STEREO_FORMAT_BOTTOM_TOP },
+ { "top-bottom", STEREO_FORMAT_TOP_BOTTOM },
+ { "right-left", STEREO_FORMAT_RIGHT_LEFT },
+ { NULL, 0 }
+};
+#endif
+
+static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR },  // names accepted by --end-usage (rate control mode)
+ { "cbr", AOM_CBR },
+ { "cq", AOM_CQ },
+ { "q", AOM_Q },
+ { NULL, 0 } };
+
+static const struct arg_enum_list tuning_enum[] = {  // names accepted by --tune
+ { "psnr", AOM_TUNE_PSNR },
+ { "ssim", AOM_TUNE_SSIM },
+ { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING },
+ { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING },
+ { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
+ { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN },
+ { "butteraugli", AOM_TUNE_BUTTERAUGLI },
+ { "vmaf_saliency_map", AOM_TUNE_VMAF_SALIENCY_MAP },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list dist_metric_enum[] = {  // names accepted by --dist-metric
+ { "psnr", AOM_DIST_METRIC_PSNR },
+ { "qm-psnr", AOM_DIST_METRIC_QM_PSNR },
+ { NULL, 0 }
+};
+
+#if CONFIG_AV1_ENCODER
+static const struct arg_enum_list timing_info_enum[] = {  // names accepted by --timing-info
+ { "unspecified", AOM_TIMING_UNSPECIFIED },
+ { "constant", AOM_TIMING_EQUAL },
+ { "model", AOM_TIMING_DEC_MODEL },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list superblock_size_enum[] = {  // names accepted by --sb-size
+ { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
+ { "64", AOM_SUPERBLOCK_SIZE_64X64 },
+ { "128", AOM_SUPERBLOCK_SIZE_128X128 },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list matrix_coefficients_enum[] = {  // names accepted by --matrix-coefficients (CICP)
+ { "identity", AOM_CICP_MC_IDENTITY },
+ { "bt709", AOM_CICP_MC_BT_709 },
+ { "unspecified", AOM_CICP_MC_UNSPECIFIED },
+ { "fcc73", AOM_CICP_MC_FCC },
+ { "bt470bg", AOM_CICP_MC_BT_470_B_G },
+ { "bt601", AOM_CICP_MC_BT_601 },
+ { "smpte240", AOM_CICP_MC_SMPTE_240 },  // MC_ (matrix coefficients) constant, like every other entry here; not the CP_ color-primaries one
+ { "ycgco", AOM_CICP_MC_SMPTE_YCGCO },
+ { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL },
+ { "bt2020cl", AOM_CICP_MC_BT_2020_CL },
+ { "smpte2085", AOM_CICP_MC_SMPTE_2085 },
+ { "chromncl", AOM_CICP_MC_CHROMAT_NCL },
+ { "chromcl", AOM_CICP_MC_CHROMAT_CL },
+ { "ictcp", AOM_CICP_MC_ICTCP },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list chroma_sample_position_enum[] = {  // names accepted by --chroma-sample-position
+ { "unknown", AOM_CSP_UNKNOWN },
+ { "vertical", AOM_CSP_VERTICAL },
+ { "colocated", AOM_CSP_COLOCATED },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list tune_content_enum[] = {  // names accepted by --tune-content
+ { "default", AOM_CONTENT_DEFAULT },
+ { "screen", AOM_CONTENT_SCREEN },
+ { "film", AOM_CONTENT_FILM },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list transfer_characteristics_enum[] = {  // names accepted by --transfer-characteristics (CICP)
+ { "unspecified", AOM_CICP_TC_UNSPECIFIED },  // TC_ (transfer characteristics) constant, like every other entry here; not the CP_ color-primaries one
+ { "bt709", AOM_CICP_TC_BT_709 },
+ { "bt470m", AOM_CICP_TC_BT_470_M },
+ { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+ { "bt601", AOM_CICP_TC_BT_601 },
+ { "smpte240", AOM_CICP_TC_SMPTE_240 },
+ { "lin", AOM_CICP_TC_LINEAR },
+ { "log100", AOM_CICP_TC_LOG_100 },
+ { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 },
+ { "iec61966", AOM_CICP_TC_IEC_61966 },
+ { "bt1361", AOM_CICP_TC_BT_1361 },
+ { "srgb", AOM_CICP_TC_SRGB },
+ { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT },
+ { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT },
+ { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+ { "hlg", AOM_CICP_TC_HLG },
+ { "smpte428", AOM_CICP_TC_SMPTE_428 },
+ { NULL, 0 }
+};
+
+static const struct arg_enum_list color_primaries_enum[] = {  // names accepted by --color-primaries (CICP)
+ { "bt709", AOM_CICP_CP_BT_709 },
+ { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+ { "bt601", AOM_CICP_CP_BT_601 },
+ { "bt470m", AOM_CICP_CP_BT_470_M },
+ { "bt470bg", AOM_CICP_CP_BT_470_B_G },
+ { "smpte240", AOM_CICP_CP_SMPTE_240 },
+ { "film", AOM_CICP_CP_GENERIC_FILM },
+ { "bt2020", AOM_CICP_CP_BT_2020 },
+ { "xyz", AOM_CICP_CP_XYZ },
+ { "smpte431", AOM_CICP_CP_SMPTE_431 },
+ { "smpte432", AOM_CICP_CP_SMPTE_432 },
+ { "ebu3213", AOM_CICP_CP_EBU_3213 },
+ { NULL, 0 }
+};
+#endif // CONFIG_AV1_ENCODER
+
+const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
+ .help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"),
+ .debugmode =
+ ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"),
+ .outputfile = ARG_DEF("o", "output", 1, "Output filename"),
+ .use_nv12 = ARG_DEF(NULL, "nv12", 0, "Input file is NV12"),
+ .use_yv12 = ARG_DEF(NULL, "yv12", 0, "Input file is YV12"),
+ .use_i420 = ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"),
+ .use_i422 = ARG_DEF(NULL, "i422", 0, "Input file is I422"),
+ .use_i444 = ARG_DEF(NULL, "i444", 0, "Input file is I444"),
+ .codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"),
+ .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2/3)"),
+ .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2/3)"),
+ .fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"),
+ .limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"),
+ .skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"),
+ .good_dl = ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"),
+ .rt_dl = ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"),
+ .ai_dl = ARG_DEF(NULL, "allintra", 0, "Use all intra mode"),
+ .quietarg = ARG_DEF("q", "quiet", 0, "Do not print encode progress"),
+ .verbosearg = ARG_DEF("v", "verbose", 0, "Show encoder parameters"),
+ .psnrarg = ARG_DEF(
+ NULL, "psnr", -1,
+ "Show PSNR in status line "
+ "(0: Disable PSNR status line display, 1: PSNR calculated using input "
+ "bit-depth (default), 2: PSNR calculated using stream bit-depth); "
+ "takes default option when arguments are not specified"),
+ .use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"),
+ .recontest = ARG_DEF_ENUM(NULL, "test-decode", 1,
+ "Test encode/decode mismatch", test_decode_enum),
+ .framerate = ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"),
+ .use_webm =
+ ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"),
+ .use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"),
+ .use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"),
+ .q_hist_n =
+ ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"),
+ .rate_hist_n =
+ ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"),
+ .disable_warnings =
+ ARG_DEF(NULL, "disable-warnings", 0,
+ "Disable warnings about potentially incorrect encode settings"),
+ .disable_warning_prompt =
+ ARG_DEF("y", "disable-warning-prompt", 0,
+ "Display warnings, but do not prompt user to continue"),
+ .bitdeptharg =
+ ARG_DEF_ENUM("b", "bit-depth", 1, "Bit depth for codec", bitdepth_enum),
+ .inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"),
+
+ .input_chroma_subsampling_x = ARG_DEF(NULL, "input-chroma-subsampling-x", 1,
+ "Chroma subsampling x value"),
+ .input_chroma_subsampling_y = ARG_DEF(NULL, "input-chroma-subsampling-y", 1,
+ "Chroma subsampling y value"),
+
+ .usage = ARG_DEF("u", "usage", 1,
+ "Usage profile number to use (0: good, 1: rt, 2: allintra)"),
+ .threads = ARG_DEF("t", "threads", 1, "Max number of threads to use"),
+ .profile = ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"),
+ .width = ARG_DEF("w", "width", 1, "Frame width"),
+ .height = ARG_DEF("h", "height", 1, "Frame height"),
+ .forced_max_frame_width = ARG_DEF(NULL, "forced_max_frame_width", 1,
+ "Maximum frame width value to force"),
+ .forced_max_frame_height = ARG_DEF(NULL, "forced_max_frame_height", 1,
+ "Maximum frame height value to force"),
+#if CONFIG_WEBM_IO
+ .stereo_mode = ARG_DEF_ENUM(NULL, "stereo-mode", 1, "Stereo 3D video format",
+ stereo_mode_enum),
+#endif
+ .timebase = ARG_DEF(NULL, "timebase", 1,
+ "Output timestamp precision (fractional seconds)"),
+ .global_error_resilient = ARG_DEF(NULL, "global-error-resilient", 1,
+ "Enable global error resiliency features"),
+ .lag_in_frames =
+ ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"),
+ .large_scale_tile = ARG_DEF(
+ NULL, "large-scale-tile", 1,
+ "Large scale tile coding (0: off (default), 1: on (ivf output only))"),
+ .monochrome =
+ ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"),
+ .full_still_picture_hdr = ARG_DEF(NULL, "full-still-picture-hdr", 0,
+ "Use full header for still picture"),
+ .use_16bit_internal =
+ ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline"),
+ .dropframe_thresh =
+ ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"),
+ .resize_mode = ARG_DEF(
+ NULL, "resize-mode", 1,
+ "Frame resize mode (0: off (default), 1: fixed, 2: random, 3: dynamic)"),
+ .resize_denominator =
+ ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"),
+ .resize_kf_denominator = ARG_DEF(NULL, "resize-kf-denominator", 1,
+ "Frame resize keyframe denominator"),
+ .superres_mode =
+ ARG_DEF(NULL, "superres-mode", 1,
+ "Frame super-resolution mode (0: disabled (default), 1: fixed, "
+ "2: random, 3: qthresh, 4: auto)"),
+ .superres_denominator = ARG_DEF(NULL, "superres-denominator", 1,
+ "Frame super-resolution denominator"),
+ .superres_kf_denominator =
+ ARG_DEF(NULL, "superres-kf-denominator", 1,
+ "Frame super-resolution keyframe denominator"),
+ .superres_qthresh = ARG_DEF(NULL, "superres-qthresh", 1,
+ "Frame super-resolution qindex threshold"),
+ .superres_kf_qthresh =
+ ARG_DEF(NULL, "superres-kf-qthresh", 1,
+ "Frame super-resolution keyframe qindex threshold"),
+ .end_usage =
+ ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum),
+ .target_bitrate = ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"),
+ .min_quantizer = ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"),
+ .max_quantizer = ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"),
+ .undershoot_pct = ARG_DEF(NULL, "undershoot-pct", 1,
+ "Datarate undershoot (min) target (%)"),
+ .overshoot_pct =
+ ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"),
+ .buf_sz = ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"),
+ .buf_initial_sz =
+ ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"),
+ .buf_optimal_sz =
+ ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"),
+ .bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"),
+ .minsection_pct =
+ ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"),
+ .maxsection_pct =
+ ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"),
+ .fwd_kf_enabled =
+ ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes"),
+ .kf_min_dist =
+ ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"),
+ .kf_max_dist =
+ ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"),
+ .kf_disabled = ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"),
+ .sframe_dist = ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)"),
+ .sframe_mode =
+ ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"),
+ .save_as_annexb = ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"),
+ .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
+ "Noise sensitivity (frames to blur)"),
+ .sharpness = ARG_DEF(NULL, "sharpness", 1,
+ "Bias towards block sharpness in rate-distortion "
+ "optimization of transform coefficients "
+ "(0..7), default is 0"),
+ .static_thresh =
+ ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"),
+ .auto_altref =
+ ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"),
+ .arnr_maxframes =
+ ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"),
+ .arnr_strength =
+ ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"),
+ .tune_metric = ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with",
+ tuning_enum),
+ .dist_metric = ARG_DEF_ENUM(
+ NULL, "dist-metric", 1,
+ "Distortion metric to use for in-block optimization", dist_metric_enum),
+ .cq_level =
+ ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"),
+ .max_intra_rate_pct =
+ ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"),
+#if CONFIG_AV1_ENCODER
+ .cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1,
+ "Speed setting (0..6 in good mode, 5..11 in realtime "
+ "mode, 0..9 in all intra mode)"),
+ .rowmtarg =
+ ARG_DEF(NULL, "row-mt", 1,
+ "Enable row based multi-threading (0: off, 1: on (default))"),
+ .fpmtarg = ARG_DEF(
+ NULL, "fp-mt", 1,
+ "Enable frame parallel multi-threading (0: off (default), 1: on)"),
+ .tile_cols =
+ ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"),
+ .tile_rows =
+ ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"),
+ .enable_tpl_model = ARG_DEF(NULL, "enable-tpl-model", 1,
+ "RDO based on frame temporal dependency "
+ "(0: off, 1: backward source based); "
+ "required for deltaq mode"),
+ .enable_keyframe_filtering = ARG_DEF(
+ NULL, "enable-keyframe-filtering", 1,
+ "Apply temporal filtering on key frame "
+ "(0: no filter, 1: filter without overlay (default), "
+ "2: filter with overlay - experimental, may break random access in "
+ "players)"),
+ .tile_width = ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"),
+ .tile_height =
+ ARG_DEF(NULL, "tile-height", 1, "Tile heights (comma separated)"),
+ .lossless = ARG_DEF(NULL, "lossless", 1,
+ "Lossless mode (0: false (default), 1: true)"),
+ .enable_cdef = ARG_DEF(
+ NULL, "enable-cdef", 1,
+ "Enable the constrained directional enhancement filter (0: false, "
+ "1: true (default), 2: disable for non-reference frames)"),
+ .enable_restoration = ARG_DEF(NULL, "enable-restoration", 1,
+ "Enable the loop restoration filter (0: false "
+ "(default in realtime mode), "
+ "1: true (default in non-realtime mode))"),
+ .enable_rect_partitions = ARG_DEF(NULL, "enable-rect-partitions", 1,
+ "Enable rectangular partitions "
+ "(0: false, 1: true (default))"),
+ .enable_ab_partitions =
+ ARG_DEF(NULL, "enable-ab-partitions", 1,
+ "Enable ab partitions (0: false, 1: true (default))"),
+ .enable_1to4_partitions = ARG_DEF(NULL, "enable-1to4-partitions", 1,
+ "Enable 1:4 and 4:1 partitions "
+ "(0: false, 1: true (default))"),
+ .min_partition_size =
+ ARG_DEF(NULL, "min-partition-size", 1,
+ "Set min partition size "
+ "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128); "
+ "with 4k+ resolutions or higher speed settings, min "
+ "partition size will have a minimum of 8"),
+ .max_partition_size =
+ ARG_DEF(NULL, "max-partition-size", 1,
+ "Set max partition size "
+ "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"),
+ .enable_dual_filter = ARG_DEF(NULL, "enable-dual-filter", 1,
+ "Enable dual filter "
+ "(0: false, 1: true (default))"),
+ .enable_chroma_deltaq = ARG_DEF(NULL, "enable-chroma-deltaq", 1,
+ "Enable chroma delta quant "
+ "(0: false (default), 1: true)"),
+ .enable_intra_edge_filter = ARG_DEF(NULL, "enable-intra-edge-filter", 1,
+ "Enable intra edge filtering "
+ "(0: false, 1: true (default))"),
+ .enable_order_hint = ARG_DEF(NULL, "enable-order-hint", 1,
+ "Enable order hint "
+ "(0: false, 1: true (default))"),
+ .enable_tx64 =
+ ARG_DEF(NULL, "enable-tx64", 1,
+ "Enable 64-pt transform (0: false, 1: true (default))"),
+ .enable_flip_idtx =
+ ARG_DEF(NULL, "enable-flip-idtx", 1,
+ "Enable extended transform type (0: false, 1: true (default)) "
+ "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
+ "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
+ "H_ADST, V_FLIPADST, H_FLIPADST"),
+ .enable_rect_tx =
+ ARG_DEF(NULL, "enable-rect-tx", 1,
+ "Enable rectangular transform (0: false, 1: true (default))"),
+ .enable_dist_wtd_comp = ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
+ "Enable distance-weighted compound "
+ "(0: false, 1: true (default))"),
+ .enable_masked_comp = ARG_DEF(NULL, "enable-masked-comp", 1,
+ "Enable masked (wedge/diff-wtd) compound "
+ "(0: false, 1: true (default))"),
+ .enable_onesided_comp = ARG_DEF(NULL, "enable-onesided-comp", 1,
+ "Enable one sided compound "
+ "(0: false, 1: true (default))"),
+ .enable_interintra_comp = ARG_DEF(NULL, "enable-interintra-comp", 1,
+ "Enable interintra compound "
+ "(0: false, 1: true (default))"),
+ .enable_smooth_interintra = ARG_DEF(NULL, "enable-smooth-interintra", 1,
+ "Enable smooth interintra mode "
+ "(0: false, 1: true (default))"),
+ .enable_diff_wtd_comp = ARG_DEF(NULL, "enable-diff-wtd-comp", 1,
+ "Enable difference-weighted compound "
+ "(0: false, 1: true (default))"),
+ .enable_interinter_wedge = ARG_DEF(NULL, "enable-interinter-wedge", 1,
+ "Enable interinter wedge compound "
+ "(0: false, 1: true (default))"),
+ .enable_interintra_wedge = ARG_DEF(NULL, "enable-interintra-wedge", 1,
+ "Enable interintra wedge compound "
+ "(0: false, 1: true (default))"),
+ .enable_global_motion = ARG_DEF(NULL, "enable-global-motion", 1,
+ "Enable global motion "
+ "(0: false, 1: true (default))"),
+ .enable_warped_motion = ARG_DEF(NULL, "enable-warped-motion", 1,
+ "Enable local warped motion "
+ "(0: false, 1: true (default))"),
+ .enable_filter_intra = ARG_DEF(NULL, "enable-filter-intra", 1,
+ "Enable filter intra prediction mode "
+ "(0: false, 1: true (default))"),
+ .enable_smooth_intra = ARG_DEF(NULL, "enable-smooth-intra", 1,
+ "Enable smooth intra prediction modes "
+ "(0: false, 1: true (default))"),
+ .enable_paeth_intra = ARG_DEF(
+ NULL, "enable-paeth-intra", 1,
+ "Enable Paeth intra prediction mode (0: false, 1: true (default))"),
+ .enable_cfl_intra = ARG_DEF(NULL, "enable-cfl-intra", 1,
+ "Enable chroma from luma intra prediction mode "
+ "(0: false, 1: true (default))"),
+ .enable_directional_intra =
+ ARG_DEF(NULL, "enable-directional-intra", 1,
+ "Enable directional intra prediction modes "
+ "(0: false, 1: true (default))"),
+ .enable_diagonal_intra =
+ ARG_DEF(NULL, "enable-diagonal-intra", 1,
+ "Enable diagonal (D45 to D203) intra prediction modes, which are "
+ "a subset of directional modes; has no effect if "
+ "enable-directional-intra is 0 (0: false, 1: true (default))"),
+ .force_video_mode = ARG_DEF(
+ NULL, "force-video-mode", 1,
+ "Force video mode even for a single frame (0: false (default), 1: true)"),
+ .enable_obmc = ARG_DEF(NULL, "enable-obmc", 1,
+ "Enable OBMC (0: false, 1: true (default))"),
+ .enable_overlay =
+ ARG_DEF(NULL, "enable-overlay", 1,
+ "Enable coding overlay frames (0: false, 1: true (default))"),
+ .enable_palette =
+ ARG_DEF(NULL, "enable-palette", 1,
+ "Enable palette prediction mode (0: false, 1: true (default))"),
+ .enable_intrabc = ARG_DEF(NULL, "enable-intrabc", 1,
+ "Enable intra block copy prediction mode "
+ "(0: false, 1: true (default))"),
+ .enable_angle_delta =
+ ARG_DEF(NULL, "enable-angle-delta", 1,
+ "Enable intra angle delta (0: false, 1: true (default))"),
+ .disable_trellis_quant = ARG_DEF(
+ NULL, "disable-trellis-quant", 1,
+ "Disable trellis optimization of quantized coefficients (0: false "
+ "1: true 2: true for rd search 3: true for estimate yrd search "
+ "(default))"),
+ .enable_qm =
+ ARG_DEF(NULL, "enable-qm", 1,
+ "Enable quantisation matrices (0: false (default), 1: true)"),
+ .qm_min = ARG_DEF(NULL, "qm-min", 1,
+ "Min quant matrix flatness (0..15), default is 8"),
+ .qm_max = ARG_DEF(NULL, "qm-max", 1,
+ "Max quant matrix flatness (0..15), default is 15"),
+ .reduced_tx_type_set = ARG_DEF(NULL, "reduced-tx-type-set", 1,
+ "Use reduced set of transform types"),
+ .use_intra_dct_only =
+ ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"),
+ .use_inter_dct_only =
+ ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"),
+ .use_intra_default_tx_only =
+ ARG_DEF(NULL, "use-intra-default-tx-only", 1,
+ "Use Default-transform only for INTRA modes"),
+ .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"),
+ .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
+ "Update freq for coeff costs. "
+ "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+ .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1,
+ "Update freq for mode costs. "
+ "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+ .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1,
+ "Update freq for mv costs. "
+ "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+ .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1,
+ "Update freq for dv costs. "
+ "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+ .num_tg = ARG_DEF(NULL, "num-tile-groups", 1,
+ "Maximum number of tile groups, default is 1"),
+ .mtu_size =
+ ARG_DEF(NULL, "mtu-size", 1,
+ "MTU size for a tile group, default is 0 (no MTU targeting), "
+ "overrides maximum number of tile groups"),
+ .timing_info = ARG_DEF_ENUM(
+ NULL, "timing-info", 1,
+ "Signal timing info in the bitstream (model only works for no "
+ "hidden frames, no super-res yet):",
+ timing_info_enum),
+#if CONFIG_TUNE_VMAF
+ .vmaf_model_path =
+ ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"),
+#endif
+ .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1,
+ "Partition information read and write path"),
+ .enable_rate_guide_deltaq =
+ ARG_DEF(NULL, "enable-rate-guide-deltaq", 1,
+ "Enable rate guide deltaq (1), by default off (0). "
+ "It requires --deltaq-mode=3. "
+ "If turned on, it requires an input file specified "
+ "by --rate-distribution-info."),
+ .rate_distribution_info =
+ ARG_DEF(NULL, "rate-distribution-info", 1,
+ "Rate distribution information input. "
+ "It requires --enable-rate-guide-deltaq=1."),
+ .film_grain_test = ARG_DEF(
+ NULL, "film-grain-test", 1,
+ "Film grain test vectors (0: none (default), 1: test-1 2: test-2, "
+ "... 16: test-16)"),
+ .film_grain_table = ARG_DEF(NULL, "film-grain-table", 1,
+ "Path to file containing film grain parameters"),
+#if CONFIG_DENOISE
+ .denoise_noise_level =
+ ARG_DEF(NULL, "denoise-noise-level", 1,
+ "Amount of noise (from 0 = don't denoise, to 50)"),
+ .denoise_block_size = ARG_DEF(NULL, "denoise-block-size", 1,
+ "Denoise block size (default = 32)"),
+ .enable_dnl_denoising = ARG_DEF(NULL, "enable-dnl-denoising", 1,
+ "Apply denoising to the frame "
+ "being encoded when denoise-noise-level is "
+ "enabled (0: false, 1: true (default))"),
+#endif
+ .enable_ref_frame_mvs =
+ ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
+ "Enable temporal mv prediction (default is 1)"),
+ .frame_parallel_decoding =
+ ARG_DEF(NULL, "frame-parallel", 1,
+ "Enable frame parallel decodability features "
+ "(0: false (default), 1: true)"),
+ .error_resilient_mode = ARG_DEF(NULL, "error-resilient", 1,
+ "Enable error resilient features "
+ "(0: false (default), 1: true)"),
+ .aq_mode = ARG_DEF(NULL, "aq-mode", 1,
+ "Adaptive quantization mode (0: off (default), 1: "
+ "variance 2: complexity, "
+ "3: cyclic refresh)"),
+ .deltaq_mode =
+ ARG_DEF(NULL, "deltaq-mode", 1,
+ "Delta qindex mode (0: off, 1: deltaq objective (default), "
+ "2: deltaq placeholder, 3: key frame visual quality, 4: user "
+ "rating based visual quality optimization); "
+ "requires --enable-tpl-model=1"),
+ .deltaq_strength = ARG_DEF(NULL, "deltaq-strength", 1,
+ "Deltaq strength for"
+ " --deltaq-mode=4 (%)"),
+ .deltalf_mode = ARG_DEF(NULL, "delta-lf-mode", 1,
+ "Enable delta-lf-mode (0: off (default), 1: on)"),
+ .frame_periodic_boost =
+ ARG_DEF(NULL, "frame-boost", 1,
+ "Enable frame periodic boost (0: off (default), 1: on)"),
+ .gf_cbr_boost_pct = ARG_DEF(NULL, "gf-cbr-boost", 1,
+ "Boost for Golden Frame in CBR mode (pct)"),
+ .max_inter_rate_pct =
+ ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"),
+ .min_gf_interval = ARG_DEF(
+ NULL, "min-gf-interval", 1,
+ "Min gf/arf frame interval (default 0, indicating in-built behavior)"),
+ .max_gf_interval = ARG_DEF(
+ NULL, "max-gf-interval", 1,
+ "Max gf/arf frame interval (default 0, indicating in-built behavior)"),
+ .gf_min_pyr_height =
+ ARG_DEF(NULL, "gf-min-pyr-height", 1,
+ "Min height for GF group pyramid structure (0 (default) to 5)"),
+ .gf_max_pyr_height = ARG_DEF(
+ NULL, "gf-max-pyr-height", 1,
+ "Maximum height for GF group pyramid structure (0 to 5 (default))"),
+ .max_reference_frames = ARG_DEF(NULL, "max-reference-frames", 1,
+ "Maximum number of reference frames allowed "
+ "per frame (3 to 7 (default))"),
+ .reduced_reference_set =
+ ARG_DEF(NULL, "reduced-reference-set", 1,
+ "Use reduced set of single and compound references (0: off "
+ "(default), 1: on)"),
+ .target_seq_level_idx =
+ ARG_DEF(NULL, "target-seq-level-idx", 1,
+ "Target sequence level index. "
+ "Possible values are in the form of \"ABxy\". "
+ "AB: Operating point (OP) index, "
+ "xy: Target level index for the OP. "
+ "E.g. \"0\" means target level index 0 (2.0) for the 0th OP, "
+ "\"1019\" means target level index 19 (6.3) for the 10th OP."),
+ .set_min_cr = ARG_DEF(
+ NULL, "min-cr", 1,
+ "Set minimum compression ratio. Take integer values. Default is 0. "
+ "If non-zero, encoder will try to keep the compression ratio of "
+ "each frame to be higher than the given value divided by 100."),
+
+ .input_color_primaries = ARG_DEF_ENUM(
+ NULL, "color-primaries", 1,
+ "Color primaries (CICP) of input content:", color_primaries_enum),
+
+ .input_transfer_characteristics =
+ ARG_DEF_ENUM(NULL, "transfer-characteristics", 1,
+ "Transfer characteristics (CICP) of input content:",
+ transfer_characteristics_enum),
+
+ .input_matrix_coefficients = ARG_DEF_ENUM(
+ NULL, "matrix-coefficients", 1,
+ "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum),
+
+ .input_chroma_sample_position =
+ ARG_DEF_ENUM(NULL, "chroma-sample-position", 1,
+ "The chroma sample position when chroma 4:2:0 is signaled:",
+ chroma_sample_position_enum),
+
+ .tune_content = ARG_DEF_ENUM(NULL, "tune-content", 1, "Tune content type",
+ tune_content_enum),
+
+ .cdf_update_mode =
+ ARG_DEF(NULL, "cdf-update-mode", 1,
+ "CDF update mode for entropy coding "
+ "(0: no CDF update, 1: update CDF on all frames (default), "
+ "2: selectively update CDF on some frames)"),
+
+ .superblock_size = ARG_DEF_ENUM(NULL, "sb-size", 1, "Superblock size to use",
+ superblock_size_enum),
+
+ .set_tier_mask =
+ ARG_DEF(NULL, "set-tier-mask", 1,
+ "Set bit mask to specify which tier each of the 32 possible "
+ "operating points conforms to. "
+ "Bit value 0 (default): Main Tier, 1: High Tier."),
+
+ .use_fixed_qp_offsets =
+ ARG_DEF(NULL, "use-fixed-qp-offsets", 1,
+ "Enable fixed QP offsets for frames at different levels of the "
+ "pyramid. Selected automatically from --cq-level if "
+ "--fixed-qp-offsets is not provided. If this option is not "
+ "specified (default), offsets are adaptively chosen by the "
+ "encoder."),
+
+ .fixed_qp_offsets = ARG_DEF(
+ NULL, "fixed-qp-offsets", 1,
+ "Set fixed QP offsets for frames at different levels of the "
+ "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, "
+ "and 3 levels of internal alt-refs. If this option is not "
+ "specified (default), offsets are adaptively chosen by the "
+ "encoder."),
+
+ .vbr_corpus_complexity_lap = ARG_DEF(
+ NULL, "vbr-corpus-complexity-lap", 1,
+ "Set average corpus complexity per mb for single pass VBR using lap. "
+ "(0..10000), default is 0"),
+
+ .fwd_kf_dist = ARG_DEF(NULL, "fwd-kf-dist", -1,
+ "Set distance between forward keyframes. A value of "
+ "-1 (default) means no repetitive forward keyframes."),
+
+ .enable_tx_size_search = ARG_DEF(
+ NULL, "enable-tx-size-search", 1,
+ "Enable transform size search to find the best size for each block. "
+ "If false, transforms always have the largest possible size "
+ "(0: false, 1: true (default)). Ignored in non rd pick mode in "
+ "real-time coding."),
+
+ .loopfilter_control = ARG_DEF(
+ NULL, "loopfilter-control", 1,
+ "Control loop filtering "
+ "(0: Loopfilter disabled for all frames, 1: Enable loopfilter for all "
+ "frames (default), 2: Disable loopfilter for non-reference frames, 3: "
+ "Disable loopfilter for frames with low motion)"),
+
+ .auto_intra_tools_off = ARG_DEF(
+ NULL, "auto-intra-tools-off", 1,
+ "Automatically turn off several intra coding tools for allintra mode; "
+ "only in effect if --deltaq-mode=3"),
+
+ .two_pass_input =
+ ARG_DEF(NULL, "two-pass-input", 1,
+ "The input file for the second pass for three-pass encoding"),
+ .two_pass_output = ARG_DEF(
+ NULL, "two-pass-output", 1,
+ "The output file for the first two passes for three-pass encoding"),
+ .two_pass_width =
+ ARG_DEF(NULL, "two-pass-width", 1, "The width of two-pass-input"),
+ .two_pass_height =
+ ARG_DEF(NULL, "two-pass-height", 1, "The height of two-pass-input"),
+ .second_pass_log =
+ ARG_DEF("spf", "second-pass-log", 1, "Log file from second pass"),
+ .strict_level_conformance =
+ ARG_DEF(NULL, "strict-level-conformance", 1,
+ "When set to 1, exit the encoder when it fails to encode "
+ "to a given target level"),
+ .kf_max_pyr_height = ARG_DEF(
+ NULL, "kf-max-pyr-height", 1,
+ "Maximum height of pyramid structure used for the GOP starting with a "
+ "key frame (-1 to 5). When set to -1 (default), it does not have any "
+ "effect. The actual maximum pyramid height will be the minimum of this "
+ "value and the value of gf_max_pyr_height."),
+ .sb_qp_sweep =
+ ARG_DEF(NULL, "sb-qp-sweep", 1,
+ "When set to 1, enable the superblock level qp sweep for a "
+ "given lambda to minimize the rdcost."),
+#endif // CONFIG_AV1_ENCODER
+};
diff --git a/third_party/aom/av1/arg_defs.h b/third_party/aom/av1/arg_defs.h
new file mode 100644
index 0000000000..73c78caec8
--- /dev/null
+++ b/third_party/aom/av1/arg_defs.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ARG_DEFS_H_
+#define AOM_AV1_ARG_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+#include "common/args_helper.h"
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+#include "aom/aomcx.h"
+
+enum TestDecodeFatality {  // NOTE(review): presumably how a test-decode mismatch is treated; no user is visible in this header - confirm.
+ TEST_DECODE_OFF,  // first enumerator, value 0
+ TEST_DECODE_FATAL,  // value 1
+ TEST_DECODE_WARN,  // value 2
+};
+
+typedef struct av1_codec_arg_definitions {  // One arg_def_t descriptor per command-line option; the shared instance is g_av1_codec_arg_defs.
+ arg_def_t help;
+ arg_def_t debugmode;
+ arg_def_t outputfile;
+ arg_def_t use_nv12;
+ arg_def_t use_yv12;
+ arg_def_t use_i420;
+ arg_def_t use_i422;
+ arg_def_t use_i444;
+ arg_def_t codecarg;
+ arg_def_t passes;
+ arg_def_t pass_arg;
+ arg_def_t fpf_name;
+ arg_def_t limit;
+ arg_def_t skip;
+ arg_def_t good_dl;
+ arg_def_t rt_dl;
+ arg_def_t ai_dl;
+ arg_def_t quietarg;
+ arg_def_t verbosearg;
+ arg_def_t psnrarg;
+ arg_def_t use_cfg;
+ arg_def_t recontest;
+ arg_def_t framerate;
+ arg_def_t use_webm;
+ arg_def_t use_ivf;
+ arg_def_t use_obu;
+ arg_def_t q_hist_n;
+ arg_def_t rate_hist_n;
+ arg_def_t disable_warnings;
+ arg_def_t disable_warning_prompt;
+ arg_def_t bitdeptharg;
+ arg_def_t inbitdeptharg;
+ arg_def_t input_chroma_subsampling_x;
+ arg_def_t input_chroma_subsampling_y;
+ arg_def_t usage;
+ arg_def_t threads;
+ arg_def_t profile;
+ arg_def_t width;
+ arg_def_t height;
+ arg_def_t forced_max_frame_width;
+ arg_def_t forced_max_frame_height;
+#if CONFIG_WEBM_IO  // Struct layout depends on CONFIG_* macros, so every user must see the same config/aom_config.h.
+ arg_def_t stereo_mode;
+#endif
+ arg_def_t timebase;
+ arg_def_t global_error_resilient;
+ arg_def_t lag_in_frames;
+ arg_def_t large_scale_tile;
+ arg_def_t monochrome;
+ arg_def_t full_still_picture_hdr;
+ arg_def_t use_16bit_internal;
+ arg_def_t dropframe_thresh;
+ arg_def_t resize_mode;
+ arg_def_t resize_denominator;
+ arg_def_t resize_kf_denominator;
+ arg_def_t superres_mode;
+ arg_def_t superres_denominator;
+ arg_def_t superres_kf_denominator;
+ arg_def_t superres_qthresh;
+ arg_def_t superres_kf_qthresh;
+ arg_def_t end_usage;
+ arg_def_t target_bitrate;
+ arg_def_t min_quantizer;
+ arg_def_t max_quantizer;
+ arg_def_t undershoot_pct;
+ arg_def_t overshoot_pct;
+ arg_def_t buf_sz;
+ arg_def_t buf_initial_sz;
+ arg_def_t buf_optimal_sz;
+ arg_def_t bias_pct;
+ arg_def_t minsection_pct;
+ arg_def_t maxsection_pct;
+ arg_def_t fwd_kf_enabled;
+ arg_def_t kf_min_dist;
+ arg_def_t kf_max_dist;
+ arg_def_t kf_disabled;
+ arg_def_t sframe_dist;
+ arg_def_t sframe_mode;
+ arg_def_t save_as_annexb;
+ arg_def_t noise_sens;
+ arg_def_t sharpness;
+ arg_def_t static_thresh;
+ arg_def_t auto_altref;
+ arg_def_t arnr_maxframes;
+ arg_def_t arnr_strength;
+ arg_def_t tune_metric;
+ arg_def_t dist_metric;
+ arg_def_t cq_level;
+ arg_def_t max_intra_rate_pct;
+#if CONFIG_AV1_ENCODER  // Every field from here to the matching #endif exists only in encoder-enabled builds.
+ arg_def_t cpu_used_av1;
+ arg_def_t rowmtarg;
+ arg_def_t fpmtarg;
+ arg_def_t tile_cols;
+ arg_def_t tile_rows;
+ arg_def_t enable_tpl_model;
+ arg_def_t enable_keyframe_filtering;
+ arg_def_t tile_width;
+ arg_def_t tile_height;
+ arg_def_t lossless;
+ arg_def_t enable_cdef;
+ arg_def_t enable_restoration;
+ arg_def_t enable_rect_partitions;
+ arg_def_t enable_ab_partitions;
+ arg_def_t enable_1to4_partitions;
+ arg_def_t min_partition_size;
+ arg_def_t max_partition_size;
+ arg_def_t enable_dual_filter;
+ arg_def_t enable_chroma_deltaq;
+ arg_def_t enable_intra_edge_filter;
+ arg_def_t enable_order_hint;
+ arg_def_t enable_tx64;
+ arg_def_t enable_flip_idtx;
+ arg_def_t enable_rect_tx;
+ arg_def_t enable_dist_wtd_comp;
+ arg_def_t enable_masked_comp;
+ arg_def_t enable_onesided_comp;
+ arg_def_t enable_interintra_comp;
+ arg_def_t enable_smooth_interintra;
+ arg_def_t enable_diff_wtd_comp;
+ arg_def_t enable_interinter_wedge;
+ arg_def_t enable_interintra_wedge;
+ arg_def_t enable_global_motion;
+ arg_def_t enable_warped_motion;
+ arg_def_t enable_filter_intra;
+ arg_def_t enable_smooth_intra;
+ arg_def_t enable_paeth_intra;
+ arg_def_t enable_cfl_intra;
+ arg_def_t enable_directional_intra;
+ arg_def_t enable_diagonal_intra;
+ arg_def_t force_video_mode;
+ arg_def_t enable_obmc;
+ arg_def_t enable_overlay;
+ arg_def_t enable_palette;
+ arg_def_t enable_intrabc;
+ arg_def_t enable_angle_delta;
+ arg_def_t disable_trellis_quant;
+ arg_def_t enable_qm;
+ arg_def_t qm_min;
+ arg_def_t qm_max;
+ arg_def_t reduced_tx_type_set;
+ arg_def_t use_intra_dct_only;
+ arg_def_t use_inter_dct_only;
+ arg_def_t use_intra_default_tx_only;
+ arg_def_t quant_b_adapt;
+ arg_def_t coeff_cost_upd_freq;
+ arg_def_t mode_cost_upd_freq;
+ arg_def_t mv_cost_upd_freq;
+ arg_def_t dv_cost_upd_freq;
+ arg_def_t num_tg;
+ arg_def_t mtu_size;
+ arg_def_t timing_info;
+#if CONFIG_TUNE_VMAF  // present only when VMAF tuning is configured in
+ arg_def_t vmaf_model_path;
+#endif
+ arg_def_t partition_info_path;
+ arg_def_t enable_rate_guide_deltaq;
+ arg_def_t rate_distribution_info;
+ arg_def_t film_grain_test;
+ arg_def_t film_grain_table;
+#if CONFIG_DENOISE  // the three denoise options exist only when CONFIG_DENOISE is set
+ arg_def_t denoise_noise_level;
+ arg_def_t denoise_block_size;
+ arg_def_t enable_dnl_denoising;
+#endif
+ arg_def_t enable_ref_frame_mvs;
+ arg_def_t frame_parallel_decoding;
+ arg_def_t error_resilient_mode;
+ arg_def_t aq_mode;
+ arg_def_t deltaq_mode;
+ arg_def_t deltaq_strength;
+ arg_def_t deltalf_mode;
+ arg_def_t frame_periodic_boost;
+ arg_def_t gf_cbr_boost_pct;
+ arg_def_t max_inter_rate_pct;
+ arg_def_t min_gf_interval;
+ arg_def_t max_gf_interval;
+ arg_def_t gf_min_pyr_height;
+ arg_def_t gf_max_pyr_height;
+ arg_def_t max_reference_frames;
+ arg_def_t reduced_reference_set;
+ arg_def_t target_seq_level_idx;
+ arg_def_t set_min_cr;
+ arg_def_t input_color_primaries;
+ arg_def_t input_transfer_characteristics;
+ arg_def_t input_matrix_coefficients;
+ arg_def_t input_chroma_sample_position;
+ arg_def_t tune_content;
+ arg_def_t cdf_update_mode;
+ arg_def_t superblock_size;
+ arg_def_t set_tier_mask;
+ arg_def_t use_fixed_qp_offsets;
+ arg_def_t fixed_qp_offsets;
+ arg_def_t vbr_corpus_complexity_lap;
+ arg_def_t fwd_kf_dist;
+ arg_def_t enable_tx_size_search;
+ arg_def_t loopfilter_control;  // long option "loopfilter-control"
+ arg_def_t two_pass_input;  // long option "two-pass-input"
+ arg_def_t two_pass_output;  // long option "two-pass-output"
+ arg_def_t two_pass_width;  // long option "two-pass-width"
+ arg_def_t two_pass_height;  // long option "two-pass-height"
+ arg_def_t second_pass_log;  // short option "spf", long option "second-pass-log"
+ arg_def_t auto_intra_tools_off;  // long option "auto-intra-tools-off"
+ arg_def_t strict_level_conformance;  // long option "strict-level-conformance"
+ arg_def_t kf_max_pyr_height;  // long option "kf-max-pyr-height"
+ arg_def_t sb_qp_sweep;  // long option "sb-qp-sweep"
+#endif // CONFIG_AV1_ENCODER
+} av1_codec_arg_definitions_t;
+
+extern const av1_codec_arg_definitions_t g_av1_codec_arg_defs;  // Read-only option table; defined outside this header (presumably av1/arg_defs.c - confirm).
+
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AV1_ARG_DEFS_H_
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
new file mode 100644
index 0000000000..15577d0c0e
--- /dev/null
+++ b/third_party/aom/av1/av1.cmake
@@ -0,0 +1,715 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AV1_AV1_CMAKE_) # include guard: stop here if this file was already processed
+ return()
+endif() # AOM_AV1_AV1_CMAKE_
+set(AOM_AV1_AV1_CMAKE_ 1) # mark the file as processed for any later include
+
+list(APPEND AOM_AV1_COMMON_SOURCES # compiled into the aom_av1_common object library by setup_av1_targets()
+ "${AOM_ROOT}/common/args_helper.h"
+ "${AOM_ROOT}/common/args_helper.c"
+ "${AOM_ROOT}/av1/arg_defs.h"
+ "${AOM_ROOT}/av1/arg_defs.c"
+ "${AOM_ROOT}/av1/av1_iface_common.h"
+ "${AOM_ROOT}/av1/common/alloccommon.c"
+ "${AOM_ROOT}/av1/common/alloccommon.h"
+ "${AOM_ROOT}/av1/common/av1_common_int.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+ "${AOM_ROOT}/av1/common/av1_txfm.c"
+ "${AOM_ROOT}/av1/common/av1_txfm.h"
+ "${AOM_ROOT}/av1/common/blockd.c"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/cdef.c"
+ "${AOM_ROOT}/av1/common/cdef.h"
+ "${AOM_ROOT}/av1/common/cdef_block.c"
+ "${AOM_ROOT}/av1/common/cdef_block.h"
+ "${AOM_ROOT}/av1/common/cfl.c"
+ "${AOM_ROOT}/av1/common/cfl.h"
+ "${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.c"
+ "${AOM_ROOT}/av1/common/common_data.h"
+ "${AOM_ROOT}/av1/common/convolve.c"
+ "${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/debugmodes.c"
+ "${AOM_ROOT}/av1/common/entropy.c"
+ "${AOM_ROOT}/av1/common/entropy.h"
+ "${AOM_ROOT}/av1/common/entropymode.c"
+ "${AOM_ROOT}/av1/common/entropymode.h"
+ "${AOM_ROOT}/av1/common/entropymv.c"
+ "${AOM_ROOT}/av1/common/entropymv.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/filter.h"
+ "${AOM_ROOT}/av1/common/frame_buffers.c"
+ "${AOM_ROOT}/av1/common/frame_buffers.h"
+ "${AOM_ROOT}/av1/common/idct.c"
+ "${AOM_ROOT}/av1/common/idct.h"
+ "${AOM_ROOT}/av1/common/mv.h"
+ "${AOM_ROOT}/av1/common/mvref_common.c"
+ "${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/obu_util.c"
+ "${AOM_ROOT}/av1/common/obu_util.h"
+ "${AOM_ROOT}/av1/common/pred_common.c"
+ "${AOM_ROOT}/av1/common/pred_common.h"
+ "${AOM_ROOT}/av1/common/quant_common.c"
+ "${AOM_ROOT}/av1/common/quant_common.h"
+ "${AOM_ROOT}/av1/common/reconinter.c"
+ "${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconinter_template.inc"
+ "${AOM_ROOT}/av1/common/reconintra.c"
+ "${AOM_ROOT}/av1/common/reconintra.h"
+ "${AOM_ROOT}/av1/common/resize.c"
+ "${AOM_ROOT}/av1/common/resize.h"
+ "${AOM_ROOT}/av1/common/restoration.c"
+ "${AOM_ROOT}/av1/common/restoration.h"
+ "${AOM_ROOT}/av1/common/scale.c"
+ "${AOM_ROOT}/av1/common/scale.h"
+ "${AOM_ROOT}/av1/common/scan.c"
+ "${AOM_ROOT}/av1/common/scan.h"
+ "${AOM_ROOT}/av1/common/seg_common.c"
+ "${AOM_ROOT}/av1/common/seg_common.h"
+ "${AOM_ROOT}/av1/common/thread_common.c"
+ "${AOM_ROOT}/av1/common/thread_common.h"
+ "${AOM_ROOT}/av1/common/tile_common.c"
+ "${AOM_ROOT}/av1/common/tile_common.h"
+ "${AOM_ROOT}/av1/common/timing.c"
+ "${AOM_ROOT}/av1/common/timing.h"
+ "${AOM_ROOT}/av1/common/token_cdfs.h"
+ "${AOM_ROOT}/av1/common/txb_common.c"
+ "${AOM_ROOT}/av1/common/txb_common.h"
+ "${AOM_ROOT}/av1/common/warped_motion.c"
+ "${AOM_ROOT}/av1/common/warped_motion.h")
+
+list(APPEND AOM_AV1_DECODER_SOURCES # built into aom_av1_decoder only when CONFIG_AV1_DECODER is set
+ "${AOM_ROOT}/av1/av1_dx_iface.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.h"
+ "${AOM_ROOT}/av1/decoder/decodemv.c"
+ "${AOM_ROOT}/av1/decoder/decodemv.h"
+ "${AOM_ROOT}/av1/decoder/decoder.c"
+ "${AOM_ROOT}/av1/decoder/decoder.h"
+ "${AOM_ROOT}/av1/decoder/decodetxb.c"
+ "${AOM_ROOT}/av1/decoder/decodetxb.h"
+ "${AOM_ROOT}/av1/decoder/detokenize.c"
+ "${AOM_ROOT}/av1/decoder/detokenize.h"
+ "${AOM_ROOT}/av1/decoder/dthread.h"
+ "${AOM_ROOT}/av1/decoder/grain_synthesis.c"
+ "${AOM_ROOT}/av1/decoder/grain_synthesis.h"
+ "${AOM_ROOT}/av1/decoder/obu.h"
+ "${AOM_ROOT}/av1/decoder/obu.c")
+
+list(APPEND AOM_AV1_ENCODER_SOURCES # base encoder list; CONFIG_* blocks later in this file append to it or remove from it
+ "${AOM_ROOT}/av1/av1_cx_iface.c"
+ "${AOM_ROOT}/av1/av1_cx_iface.h"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+ "${AOM_ROOT}/av1/encoder/aq_variance.c"
+ "${AOM_ROOT}/av1/encoder/aq_variance.h"
+ "${AOM_ROOT}/av1/encoder/allintra_vis.c"
+ "${AOM_ROOT}/av1/encoder/allintra_vis.h"
+ "${AOM_ROOT}/av1/encoder/enc_enums.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.h"
+ "${AOM_ROOT}/av1/encoder/bitstream.c"
+ "${AOM_ROOT}/av1/encoder/bitstream.h"
+ "${AOM_ROOT}/av1/encoder/block.h"
+ "${AOM_ROOT}/av1/encoder/cnn.c"
+ "${AOM_ROOT}/av1/encoder/cnn.h"
+ "${AOM_ROOT}/av1/encoder/compound_type.c"
+ "${AOM_ROOT}/av1/encoder/compound_type.h"
+ "${AOM_ROOT}/av1/encoder/context_tree.c"
+ "${AOM_ROOT}/av1/encoder/context_tree.h"
+ "${AOM_ROOT}/av1/encoder/cost.c"
+ "${AOM_ROOT}/av1/encoder/cost.h"
+ "${AOM_ROOT}/av1/encoder/encodeframe.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.h"
+ "${AOM_ROOT}/av1/encoder/encodeframe_utils.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe_utils.h"
+ "${AOM_ROOT}/av1/encoder/encodemb.c"
+ "${AOM_ROOT}/av1/encoder/encodemb.h"
+ "${AOM_ROOT}/av1/encoder/encodemv.c"
+ "${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+ "${AOM_ROOT}/av1/encoder/encode_strategy.h"
+ "${AOM_ROOT}/av1/encoder/encoder.c"
+ "${AOM_ROOT}/av1/encoder/encoder.h"
+ "${AOM_ROOT}/av1/encoder/encoder_alloc.h"
+ "${AOM_ROOT}/av1/encoder/encoder_utils.c"
+ "${AOM_ROOT}/av1/encoder/encoder_utils.h"
+ "${AOM_ROOT}/av1/encoder/encodetxb.c"
+ "${AOM_ROOT}/av1/encoder/encodetxb.h"
+ "${AOM_ROOT}/av1/encoder/ethread.c"
+ "${AOM_ROOT}/av1/encoder/ethread.h"
+ "${AOM_ROOT}/av1/encoder/extend.c"
+ "${AOM_ROOT}/av1/encoder/extend.h"
+ "${AOM_ROOT}/av1/encoder/external_partition.c"
+ "${AOM_ROOT}/av1/encoder/external_partition.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/global_motion.c"
+ "${AOM_ROOT}/av1/encoder/global_motion.h"
+ "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+ "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
+ "${AOM_ROOT}/av1/encoder/gop_structure.c"
+ "${AOM_ROOT}/av1/encoder/gop_structure.h"
+ "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
+ "${AOM_ROOT}/av1/encoder/hash.c"
+ "${AOM_ROOT}/av1/encoder/hash.h"
+ "${AOM_ROOT}/av1/encoder/hash_motion.c"
+ "${AOM_ROOT}/av1/encoder/hash_motion.h"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/interp_search.c"
+ "${AOM_ROOT}/av1/encoder/interp_search.h"
+ "${AOM_ROOT}/av1/encoder/level.c"
+ "${AOM_ROOT}/av1/encoder/level.h"
+ "${AOM_ROOT}/av1/encoder/lookahead.c"
+ "${AOM_ROOT}/av1/encoder/lookahead.h"
+ "${AOM_ROOT}/av1/encoder/mcomp.c"
+ "${AOM_ROOT}/av1/encoder/mcomp.h"
+ "${AOM_ROOT}/av1/encoder/mcomp_structs.h"
+ "${AOM_ROOT}/av1/encoder/ml.c"
+ "${AOM_ROOT}/av1/encoder/ml.h"
+ "${AOM_ROOT}/av1/encoder/model_rd.h"
+ "${AOM_ROOT}/av1/encoder/motion_search_facade.c"
+ "${AOM_ROOT}/av1/encoder/motion_search_facade.h"
+ "${AOM_ROOT}/av1/encoder/mv_prec.c"
+ "${AOM_ROOT}/av1/encoder/mv_prec.h"
+ "${AOM_ROOT}/av1/encoder/palette.c"
+ "${AOM_ROOT}/av1/encoder/palette.h"
+ "${AOM_ROOT}/av1/encoder/partition_search.h"
+ "${AOM_ROOT}/av1/encoder/partition_search.c"
+ "${AOM_ROOT}/av1/encoder/partition_strategy.h"
+ "${AOM_ROOT}/av1/encoder/partition_strategy.c"
+ "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+ "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+ "${AOM_ROOT}/av1/encoder/pickcdef.c"
+ "${AOM_ROOT}/av1/encoder/pickcdef.h"
+ "${AOM_ROOT}/av1/encoder/picklpf.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/pickrst.c"
+ "${AOM_ROOT}/av1/encoder/pickrst.h"
+ "${AOM_ROOT}/av1/encoder/ratectrl.c"
+ "${AOM_ROOT}/av1/encoder/ratectrl.h"
+ "${AOM_ROOT}/av1/encoder/rc_utils.h"
+ "${AOM_ROOT}/av1/encoder/rd.c"
+ "${AOM_ROOT}/av1/encoder/rd.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.c"
+ "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c"
+ "${AOM_ROOT}/av1/encoder/nonrd_opt.c"
+ "${AOM_ROOT}/av1/encoder/nonrd_opt.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h"
+ "${AOM_ROOT}/av1/encoder/rdopt_utils.h"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
+ "${AOM_ROOT}/av1/encoder/segmentation.c"
+ "${AOM_ROOT}/av1/encoder/segmentation.h"
+ "${AOM_ROOT}/av1/encoder/sorting_network.h"
+ "${AOM_ROOT}/av1/encoder/speed_features.c"
+ "${AOM_ROOT}/av1/encoder/speed_features.h"
+ "${AOM_ROOT}/av1/encoder/superres_scale.c"
+ "${AOM_ROOT}/av1/encoder/superres_scale.h"
+ "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
+ "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/thirdpass.c"
+ "${AOM_ROOT}/av1/encoder/thirdpass.h"
+ "${AOM_ROOT}/av1/encoder/tokenize.c"
+ "${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/tpl_model.c"
+ "${AOM_ROOT}/av1/encoder/tpl_model.h"
+ "${AOM_ROOT}/av1/encoder/tx_search.c"
+ "${AOM_ROOT}/av1/encoder/tx_search.h"
+ "${AOM_ROOT}/av1/encoder/txb_rdopt.c"
+ "${AOM_ROOT}/av1/encoder/txb_rdopt.h"
+ "${AOM_ROOT}/av1/encoder/txb_rdopt_utils.h"
+ "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+ "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+ "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
+ "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+ "${AOM_ROOT}/av1/encoder/var_based_part.c"
+ "${AOM_ROOT}/av1/encoder/var_based_part.h"
+ "${AOM_ROOT}/av1/encoder/av1_noise_estimate.c"
+ "${AOM_ROOT}/av1/encoder/av1_noise_estimate.h"
+ "${AOM_ROOT}/third_party/fastfeat/fast.c"
+ "${AOM_ROOT}/third_party/fastfeat/fast.h"
+ "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
+ "${AOM_ROOT}/third_party/fastfeat/nonmax.c"
+ "${AOM_ROOT}/third_party/vector/vector.c"
+ "${AOM_ROOT}/third_party/vector/vector.h"
+ "${AOM_ROOT}/av1/encoder/dwt.c"
+ "${AOM_ROOT}/av1/encoder/dwt.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 # built with -msse2 when HAVE_SSE2 (see setup_av1_targets)
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h"
+ "${AOM_ROOT}/av1/common/x86/cdef_block_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 # built with -mssse3 when HAVE_SSSE3
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
+ "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/resize_ssse3.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 # built with -msse4.1 when HAVE_SSE4_1
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
+ "${AOM_ROOT}/av1/common/x86/cdef_block_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 # built with -mavx2 when HAVE_AVX2
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
+ "${AOM_ROOT}/av1/common/x86/cdef_block_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" # passed to add_asm_library() when HAVE_SSE2 and CONFIG_AV1_ENCODER
+ "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 # extended later under CONFIG_AV1_TEMPORAL_DENOISING and CONFIG_AV1_HIGHBITDEPTH
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c" # built with -msse3 when HAVE_SSE3
+ "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3 # built with -mssse3 when HAVE_SSSE3
+ "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 # only used when AOM_TARGET_CPU is x86_64 (see setup_av1_targets)
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 # pickrst_sse4.c is removed again under CONFIG_REALTIME_ONLY
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 # pickrst_avx2.c and cnn_avx2.c are removed again under CONFIG_REALTIME_ONLY
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON # cnn_neon.c is removed again under CONFIG_REALTIME_ONLY
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h"
+ "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD # built with AOM_NEON_DOTPROD_FLAG when HAVE_NEON_DOTPROD
+ "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SVE # built with AOM_SVE_FLAG when HAVE_SVE
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 # built with AOM_ARM_CRC32_FLAG when HAVE_ARM_CRC32
+ "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON # extended later under CONFIG_AV1_HIGHBITDEPTH
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+ "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/cdef_block_neon.c"
+ "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
+ "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
+ "${AOM_ROOT}/av1/common/arm/reconintra_neon.c"
+ "${AOM_ROOT}/av1/common/arm/resize_neon.c"
+ "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
+ "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD # built with AOM_NEON_DOTPROD_FLAG when HAVE_NEON_DOTPROD
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM # built with AOM_NEON_I8MM_FLAG when HAVE_NEON_I8MM
+ "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SVE # built with AOM_SVE_FLAG when HAVE_SVE
+ "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 # built with -msse4.2 when HAVE_SSE4_2 and CONFIG_AV1_ENCODER
+ "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") # built with "-mvsx -maltivec" when HAVE_VSX
+
+if(CONFIG_TUNE_VMAF) # each block below appends sources only when its CONFIG_* option is enabled
+ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c"
+ "${AOM_ROOT}/av1/encoder/tune_vmaf.h")
+endif()
+
+if(CONFIG_TUNE_BUTTERAUGLI)
+ list(APPEND AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/encoder/tune_butteraugli.c"
+ "${AOM_ROOT}/av1/encoder/tune_butteraugli.h")
+endif()
+
+if(CONFIG_SALIENCY_MAP)
+ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/saliency_map.c"
+ "${AOM_ROOT}/av1/encoder/saliency_map.h")
+endif()
+
+if(CONFIG_OPTICAL_FLOW_API)
+ list(APPEND AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/encoder/sparse_linear_solver.c"
+ "${AOM_ROOT}/av1/encoder/sparse_linear_solver.h"
+ "${AOM_ROOT}/av1/encoder/optical_flow.c"
+ "${AOM_ROOT}/av1/encoder/optical_flow.h")
+endif()
+
+if(CONFIG_AV1_TEMPORAL_DENOISING) # adds the generic denoiser plus its SSE2 and NEON variants
+ list(APPEND AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.c"
+ "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.h")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c")
+endif()
+
+if(CONFIG_AV1_HIGHBITDEPTH) # append the highbd_* kernels to the per-ISA lists defined earlier in this file
+ list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+
+ list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c")
+
+ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c")
+
+ list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_horiz_rs_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c"
+ "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c")
+
+ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c")
+endif()
+
+if(CONFIG_ACCOUNTING) # decoder-side addition
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c"
+ "${AOM_ROOT}/av1/decoder/accounting.h")
+endif()
+
+if(CONFIG_INSPECTION) # decoder-side addition
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c"
+ "${AOM_ROOT}/av1/decoder/inspection.h")
+endif()
+
+if(CONFIG_INTERNAL_STATS) # encoder-side addition
+ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c")
+endif()
+
+if(CONFIG_REALTIME_ONLY) # strip sources from the lists built above for realtime-only builds
+ list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+
+ list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c")
+
+ list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c")
+
+ list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES # NOTE(review): the three *_weights.h entries are never appended in this file, so removing them is a no-op here - confirm
+ "${AOM_ROOT}/av1/encoder/cnn.c"
+ "${AOM_ROOT}/av1/encoder/cnn.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/global_motion.c"
+ "${AOM_ROOT}/av1/encoder/global_motion.h"
+ "${AOM_ROOT}/av1/encoder/global_motion_facade.c"
+ "${AOM_ROOT}/av1/encoder/global_motion_facade.h"
+ "${AOM_ROOT}/av1/encoder/gop_structure.c"
+ "${AOM_ROOT}/av1/encoder/gop_structure.h"
+ "${AOM_ROOT}/av1/encoder/misc_model_weights.h"
+ "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h"
+ "${AOM_ROOT}/av1/encoder/partition_model_weights.h"
+ "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/pickrst.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/tpl_model.c"
+ "${AOM_ROOT}/av1/encoder/tpl_model.h")
+endif()
+
+# Setup AV1 common/decoder/encoder targets. The libaom target must exist before
+# this function is called.
+function(setup_av1_targets) # creates the aom_av1_* object libraries and attaches their objects to the aom target
+ add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES}) # common library is created unconditionally
+ list(APPEND AOM_LIB_TARGETS aom_av1_common)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+ if(BUILD_SHARED_LIBS) # shared builds also feed the same objects to aom_static
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+ endif()
+
+ if(CONFIG_AV1_DECODER)
+ add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+ endif()
+ endif()
+
+ if(HAVE_SSE2) # from here on: one block per SIMD capability flag
+ require_compiler_flag_nomsvc("-msse2" NO)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE2")
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_ASM_SSE2) # this list is not populated anywhere in this file; the guard skips it when empty
+ add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2")
+ endif()
+
+ if(AOM_AV1_DECODER_INTRIN_SSE2) # likewise not populated in this file
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSE2")
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE2")
+ endif()
+ endif()
+
+ if(HAVE_SSE3)
+ require_compiler_flag_nomsvc("-msse3" NO)
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE3")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3)
+ require_compiler_flag_nomsvc("-mssse3" NO)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSSE3")
+
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_INTRIN_SSSE3) # not populated in this file
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSSE3")
+ endif()
+ endif()
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_AV1_ENCODER_INTRIN_SSSE3)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSSE3")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1)
+ require_compiler_flag_nomsvc("-msse4.1" NO)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE4_1")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64") # NOTE(review): the SSSE3 asm library is added under the HAVE_SSE4_1 guard - confirm this is intended
+ add_asm_library("aom_av1_encoder_ssse3"
+ "AOM_AV1_ENCODER_ASM_SSSE3_X86_64")
+ endif()
+
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_1")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_2)
+ require_compiler_flag_nomsvc("-msse4.2" NO)
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+ add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_2")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_AVX2)
+ require_compiler_flag_nomsvc("-mavx2" NO)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_AVX2")
+
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_AVX2")
+ endif()
+ endif()
+
+ if(HAVE_NEON) # Arm blocks take their compiler flags from AOM_*_FLAG variables instead of literal -m options
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON")
+ endif()
+ endif()
+
+ if(HAVE_ARM_CRC32)
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "arm_crc32"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_ARM_CRC32")
+ endif()
+ endif()
+
+ if(HAVE_NEON_DOTPROD)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD")
+ endif()
+ endif()
+
+ if(HAVE_NEON_I8MM) # only a common list exists for I8MM in this file
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
+ if(HAVE_SVE)
+ add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SVE")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SVE")
+ endif()
+ endif()
+
+ if(HAVE_VSX)
+ if(AOM_AV1_COMMON_INTRIN_VSX)
+ add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_VSX")
+ endif()
+ endif()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
new file mode 100644
index 0000000000..9214feb4e6
--- /dev/null
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -0,0 +1,4712 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom_ports/mem_ops.h"
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+
+#include "av1/av1_cx_iface.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/external_partition.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/arg_defs.h"
+
+#include "common/args_helper.h"
+
+struct av1_extracfg { // Codec-specific encoder settings; the positional initializers of default_extra_cfg must follow this field order.
+ int cpu_used; // validate_config() accepts [0, 11] for AOM_USAGE_REALTIME and [0, 9] otherwise
+ unsigned int enable_auto_alt_ref;
+ unsigned int enable_auto_bwd_ref;
+ unsigned int noise_sensitivity;
+ unsigned int sharpness;
+ unsigned int static_thresh;
+ unsigned int row_mt;
+ unsigned int fp_mt;
+ unsigned int tile_columns; // log2 number of tile columns
+ unsigned int tile_rows; // log2 number of tile rows
+ unsigned int enable_tpl_model;
+ unsigned int enable_keyframe_filtering;
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int min_gf_interval; // 0 -> default decision
+ unsigned int max_gf_interval; // 0 -> default decision
+ unsigned int gf_min_pyr_height; // must not exceed gf_max_pyr_height (validate_config())
+ unsigned int gf_max_pyr_height;
+ aom_tune_metric tuning;
+ const char *vmaf_model_path;
+ const char *partition_info_path;
+ unsigned int enable_rate_guide_deltaq;
+ const char *rate_distribution_info;
+ aom_dist_metric dist_metric;
+ unsigned int cq_level; // constrained quality level
+ unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
+ unsigned int lossless; // must be 0 or 1 (RANGE_CHECK_BOOL in validate_config())
+ unsigned int enable_cdef; // validate_config() accepts values up to 2
+ unsigned int enable_restoration;
+ unsigned int force_video_mode;
+ unsigned int enable_obmc;
+ unsigned int disable_trellis_quant; // validate_config() accepts values up to 3
+ unsigned int enable_qm;
+ unsigned int qm_y;
+ unsigned int qm_u;
+ unsigned int qm_v;
+ unsigned int qm_min;
+ unsigned int qm_max;
+ unsigned int num_tg; // max number of tile groups
+ unsigned int mtu_size;
+
+ aom_timing_info_type_t timing_info_type;
+ unsigned int frame_parallel_decoding_mode;
+ int enable_dual_filter;
+ unsigned int enable_chroma_deltaq;
+ AQ_MODE aq_mode;
+ DELTAQ_MODE deltaq_mode;
+ int deltaq_strength; // validate_config() accepts [0, 1000]
+ int deltalf_mode;
+ unsigned int frame_periodic_boost;
+ aom_bit_depth_t bit_depth;
+ aom_tune_content content;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int color_range;
+ int render_width;
+ int render_height;
+ aom_superblock_size_t superblock_size;
+ unsigned int single_tile_decoding;
+ int error_resilient_mode;
+ int s_frame_mode;
+
+ int film_grain_test_vector; // validate_config() accepts [0, 16]
+ const char *film_grain_table_filename;
+ unsigned int motion_vector_unit_test;
+#if CONFIG_FPMT_TEST
+ unsigned int fpmt_unit_test;
+#endif
+ unsigned int cdf_update_mode;
+ int enable_rect_partitions; // enable rectangular partitions for sequence
+ int enable_ab_partitions; // enable AB partitions for sequence
+ int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence
+ int min_partition_size; // min partition size [4,8,16,32,64,128]
+ int max_partition_size; // max partition size [4,8,16,32,64,128]
+ int enable_intra_edge_filter; // enable intra-edge filter for sequence
+ int enable_order_hint; // enable order hint for sequence
+ int enable_tx64; // enable 64-pt transform usage for sequence
+ int enable_flip_idtx; // enable flip and identity transform types
+ int enable_rect_tx; // enable rectangular transform usage for sequence
+ int enable_dist_wtd_comp; // enable dist wtd compound for sequence
+ int max_reference_frames; // maximum number of references per frame
+ int enable_reduced_reference_set; // enable reduced set of references
+ int enable_ref_frame_mvs; // sequence level
+ int allow_ref_frame_mvs; // frame level
+ int enable_masked_comp; // enable masked compound for sequence
+ int enable_onesided_comp; // enable one sided compound for sequence
+ int enable_interintra_comp; // enable interintra compound for sequence
+ int enable_smooth_interintra; // enable smooth interintra mode usage
+ int enable_diff_wtd_comp; // enable diff-wtd compound usage
+ int enable_interinter_wedge; // enable interinter-wedge compound usage
+ int enable_interintra_wedge; // enable interintra-wedge compound usage
+ int enable_global_motion; // enable global motion usage for sequence
+ int enable_warped_motion; // sequence level
+ int allow_warped_motion; // frame level
+ int enable_filter_intra; // enable filter intra for sequence
+ int enable_smooth_intra; // enable smooth intra modes for sequence
+ int enable_paeth_intra; // enable Paeth intra mode for sequence
+ int enable_cfl_intra; // enable CFL uv intra mode for sequence
+ int enable_directional_intra; // enable directional modes for sequence
+ int enable_diagonal_intra; // enable D45 to D203 intra modes for sequence
+ int enable_superres;
+ int enable_overlay; // enable overlay for filtered arf frames
+ int enable_palette;
+ int enable_intrabc;
+ int enable_angle_delta;
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+ int enable_dnl_denoising;
+#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+ int reduced_tx_type_set;
+ int use_intra_dct_only;
+ int use_inter_dct_only;
+ int use_intra_default_tx_only;
+ int enable_tx_size_search;
+ int quant_b_adapt;
+ unsigned int vbr_corpus_complexity_lap;
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
+ // min_cr / 100 is the target minimum compression ratio for each frame.
+ unsigned int min_cr;
+ COST_UPDATE_TYPE coeff_cost_upd_freq;
+ COST_UPDATE_TYPE mode_cost_upd_freq;
+ COST_UPDATE_TYPE mv_cost_upd_freq;
+ COST_UPDATE_TYPE dv_cost_upd_freq;
+ unsigned int ext_tile_debug;
+ unsigned int sb_multipass_unit_test;
+ // Total number of passes. If this number is -1, then we assume passes = 1 or
+ // 2 (passes = 1 if pass == AOM_RC_ONE_PASS and passes = 2 otherwise).
+ int passes;
+ int fwd_kf_dist;
+
+ LOOPFILTER_CONTROL loopfilter_control;
+ // Indicates if the application of post-processing filters should be skipped
+ // on reconstructed frame.
+ unsigned int skip_postproc_filtering;
+ // the name of the second pass output file when passes > 2
+ const char *two_pass_output;
+ const char *second_pass_log;
+ // Automatically determine whether to disable several intra tools
+ // when "--deltaq-mode=3" is true.
+ // Default as 0.
+ // When set to 1, the encoder will analyze the reconstruction quality
+ // as compared to the source image in the preprocessing pass.
+ // If the reconstruction quality is considered high enough, we disable
+ // the following intra coding tools, for better encoding speed:
+ // "--enable_smooth_intra",
+ // "--enable_paeth_intra",
+ // "--enable_cfl_intra",
+ // "--enable_diagonal_intra".
+ int auto_intra_tools_off;
+ int strict_level_conformance;
+ int kf_max_pyr_height; // -1 or [0, 5]; if not -1 it must be >= gf_min_pyr_height (validate_config())
+ int sb_qp_sweep;
+};
+
+#if CONFIG_REALTIME_ONLY
+// Settings changed for realtime only build:
+// cpu_used: 7
+// enable_tpl_model: 0
+// enable_restoration: 0
+// enable_obmc: 0
+// deltaq_mode: NO_DELTA_Q
+// enable_global_motion usage: 0
+// enable_warped_motion at sequence level: 0
+// allow_warped_motion at frame level: 0
+// coeff_cost_upd_freq: COST_UPD_OFF
+// mode_cost_upd_freq: COST_UPD_OFF
+// mv_cost_upd_freq: COST_UPD_OFF
+// dv_cost_upd_freq: COST_UPD_OFF
+static const struct av1_extracfg default_extra_cfg = { // positional initializer: entries must stay in struct av1_extracfg field order
+ 7, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // enable_auto_bwd_ref
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+ 1, // row_mt
+ 0, // fp_mt
+ 0, // tile_columns
+ 0, // tile_rows
+ 0, // enable_tpl_model
+ 1, // enable_keyframe_filtering
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ 0, // gf_min_pyr_height
+ 5, // gf_max_pyr_height
+ AOM_TUNE_PSNR, // tuning
+ "/usr/local/share/model/vmaf_v0.6.1.json", // vmaf_model_path
+ ".", // partition_info_path
+ 0, // enable_rate_guide_deltaq
+ "./rate_map.txt", // rate_distribution_info: rate distribution input
+ AOM_DIST_METRIC_PSNR, // dist_metric
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+ 1, // enable_cdef
+ 0, // enable_restoration
+ 0, // force_video_mode
+ 0, // enable_obmc
+ 3, // disable_trellis_quant
+ 0, // enable_qm
+ DEFAULT_QM_Y, // qm_y
+ DEFAULT_QM_U, // qm_u
+ DEFAULT_QM_V, // qm_v
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
+ 1, // num_tg: max number of tile groups
+ 0, // mtu_size
+ AOM_TIMING_UNSPECIFIED, // timing_info_type: no picture timing signaling in bitstream
+ 0, // frame_parallel_decoding_mode
+ 1, // enable_dual_filter
+ 0, // enable_chroma_deltaq: delta quant in chroma planes
+ NO_AQ, // aq_mode
+ NO_DELTA_Q, // deltaq_mode
+ 100, // deltaq_strength
+ 0, // deltalf_mode
+ 0, // frame_periodic_boost
+ AOM_BITS_8, // bit_depth
+ AOM_CONTENT_DEFAULT, // content
+ AOM_CICP_CP_UNSPECIFIED, // CICP color primaries
+ AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics
+ AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients
+ AOM_CSP_UNKNOWN, // chroma sample position
+ 0, // color range
+ 0, // render width
+ 0, // render height
+ AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
+ 1, // single_tile_decoding; this depends on large_scale_tile.
+ 0, // error_resilient_mode off by default.
+ 0, // s_frame_mode off by default.
+ 0, // film_grain_test_vector
+ NULL, // film_grain_table_filename
+ 0, // motion_vector_unit_test
+#if CONFIG_FPMT_TEST
+ 0, // fpmt_unit_test
+#endif
+ 1, // cdf_update_mode
+ 1, // enable rectangular partitions
+ 1, // enable ab shape partitions
+ 1, // enable 1:4 and 4:1 partitions
+ 4, // min_partition_size
+ 128, // max_partition_size
+ 1, // enable intra edge filter
+ 1, // enable_order_hint: frame order hint
+ 1, // enable 64-pt transform usage
+ 1, // enable flip and identity transform
+ 1, // enable rectangular transform usage
+ 1, // dist-wtd compound
+ 7, // max_reference_frames
+ 0, // enable_reduced_reference_set
+ 1, // enable_ref_frame_mvs sequence level
+ 1, // allow ref_frame_mvs frame level
+ 1, // enable masked compound at sequence level
+ 1, // enable one sided compound at sequence level
+ 1, // enable interintra compound at sequence level
+ 1, // enable smooth interintra mode
+ 1, // enable difference-weighted compound
+ 1, // enable interinter wedge compound
+ 1, // enable interintra wedge compound
+ 0, // enable_global_motion usage
+ 0, // enable_warped_motion at sequence level
+ 0, // allow_warped_motion at frame level
+ 1, // enable filter intra at sequence level
+ 1, // enable smooth intra modes usage for sequence
+ 1, // enable Paeth intra mode usage for sequence
+ 1, // enable CFL uv intra mode usage for sequence
+ 1, // enable directional intra mode usage for sequence
+ 1, // enable D45 to D203 intra mode usage for sequence
+ 1, // enable_superres
+ 1, // enable overlay
+ 1, // enable palette
+ 1, // enable intrabc
+ 1, // enable angle delta
+#if CONFIG_DENOISE
+ 0, // noise_level
+ 32, // noise_block_size
+ 1, // enable_dnl_denoising
+#endif
+ 0, // chroma_subsampling_x
+ 0, // chroma_subsampling_y
+ 0, // reduced_tx_type_set
+ 0, // use_intra_dct_only
+ 0, // use_inter_dct_only
+ 0, // use_intra_default_tx_only
+ 1, // enable_tx_size_search
+ 0, // quant_b_adapt
+ 0, // vbr_corpus_complexity_lap
+ {
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ }, // target_seq_level_idx
+ 0, // tier_mask
+ 0, // min_cr
+ COST_UPD_OFF, // coeff_cost_upd_freq
+ COST_UPD_OFF, // mode_cost_upd_freq
+ COST_UPD_OFF, // mv_cost_upd_freq
+ COST_UPD_OFF, // dv_cost_upd_freq
+ 0, // ext_tile_debug
+ 0, // sb_multipass_unit_test
+ -1, // passes
+ -1, // fwd_kf_dist
+ LOOPFILTER_ALL, // loopfilter_control
+ 0, // skip_postproc_filtering
+ NULL, // two_pass_output
+ NULL, // second_pass_log
+ 0, // auto_intra_tools_off
+ 0, // strict_level_conformance
+ -1, // kf_max_pyr_height
+ 0, // sb_qp_sweep
+};
+#else
+static const struct av1_extracfg default_extra_cfg = { // defaults when CONFIG_REALTIME_ONLY is off; positional, so entries must stay in struct av1_extracfg field order
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // enable_auto_bwd_ref
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+ 1, // row_mt
+ 0, // fp_mt
+ 0, // tile_columns
+ 0, // tile_rows
+ 1, // enable_tpl_model
+ 1, // enable_keyframe_filtering
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ 0, // gf_min_pyr_height
+ 5, // gf_max_pyr_height
+ AOM_TUNE_PSNR, // tuning
+ "/usr/local/share/model/vmaf_v0.6.1.json", // vmaf_model_path
+ ".", // partition_info_path
+ 0, // enable_rate_guide_deltaq
+ "./rate_map.txt", // rate_distribution_info: rate distribution input
+ AOM_DIST_METRIC_PSNR, // dist_metric
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+ 1, // enable_cdef
+ 1, // enable_restoration
+ 0, // force_video_mode
+ 1, // enable_obmc
+ 3, // disable_trellis_quant
+ 0, // enable_qm
+ DEFAULT_QM_Y, // qm_y
+ DEFAULT_QM_U, // qm_u
+ DEFAULT_QM_V, // qm_v
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
+ 1, // num_tg: max number of tile groups
+ 0, // mtu_size
+ AOM_TIMING_UNSPECIFIED, // timing_info_type: no picture timing signaling in bitstream
+ 0, // frame_parallel_decoding_mode
+ 1, // enable_dual_filter
+ 0, // enable_chroma_deltaq: delta quant in chroma planes
+ NO_AQ, // aq_mode
+ DELTA_Q_OBJECTIVE, // deltaq_mode
+ 100, // deltaq_strength
+ 0, // deltalf_mode
+ 0, // frame_periodic_boost
+ AOM_BITS_8, // bit_depth
+ AOM_CONTENT_DEFAULT, // content
+ AOM_CICP_CP_UNSPECIFIED, // CICP color primaries
+ AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics
+ AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients
+ AOM_CSP_UNKNOWN, // chroma sample position
+ 0, // color range
+ 0, // render width
+ 0, // render height
+ AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
+ 1, // single_tile_decoding; this depends on large_scale_tile.
+ 0, // error_resilient_mode off by default.
+ 0, // s_frame_mode off by default.
+ 0, // film_grain_test_vector
+ NULL, // film_grain_table_filename
+ 0, // motion_vector_unit_test
+#if CONFIG_FPMT_TEST
+ 0, // fpmt_unit_test
+#endif
+ 1, // cdf_update_mode
+ 1, // enable rectangular partitions
+ 1, // enable ab shape partitions
+ 1, // enable 1:4 and 4:1 partitions
+ 4, // min_partition_size
+ 128, // max_partition_size
+ 1, // enable intra edge filter
+ 1, // enable_order_hint: frame order hint
+ 1, // enable 64-pt transform usage
+ 1, // enable flip and identity transform
+ 1, // enable rectangular transform usage
+ 1, // dist-wtd compound
+ 7, // max_reference_frames
+ 0, // enable_reduced_reference_set
+ 1, // enable_ref_frame_mvs sequence level
+ 1, // allow ref_frame_mvs frame level
+ 1, // enable masked compound at sequence level
+ 1, // enable one sided compound at sequence level
+ 1, // enable interintra compound at sequence level
+ 1, // enable smooth interintra mode
+ 1, // enable difference-weighted compound
+ 1, // enable interinter wedge compound
+ 1, // enable interintra wedge compound
+ 1, // enable_global_motion usage
+ 1, // enable_warped_motion at sequence level
+ 1, // allow_warped_motion at frame level
+ 1, // enable filter intra at sequence level
+ 1, // enable smooth intra modes usage for sequence
+ 1, // enable Paeth intra mode usage for sequence
+ 1, // enable CFL uv intra mode usage for sequence
+ 1, // enable directional intra mode usage for sequence
+ 1, // enable D45 to D203 intra mode usage for sequence
+ 1, // enable_superres
+ 1, // enable overlay
+ 1, // enable palette
+ 1, // enable intrabc
+ 1, // enable angle delta
+#if CONFIG_DENOISE
+ 0, // noise_level
+ 32, // noise_block_size
+ 1, // enable_dnl_denoising
+#endif
+ 0, // chroma_subsampling_x
+ 0, // chroma_subsampling_y
+ 0, // reduced_tx_type_set
+ 0, // use_intra_dct_only
+ 0, // use_inter_dct_only
+ 0, // use_intra_default_tx_only
+ 1, // enable_tx_size_search
+ 0, // quant_b_adapt
+ 0, // vbr_corpus_complexity_lap
+ {
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+ }, // target_seq_level_idx
+ 0, // tier_mask
+ 0, // min_cr
+ COST_UPD_SB, // coeff_cost_upd_freq
+ COST_UPD_SB, // mode_cost_upd_freq
+ COST_UPD_SB, // mv_cost_upd_freq
+ COST_UPD_SB, // dv_cost_upd_freq
+ 0, // ext_tile_debug
+ 0, // sb_multipass_unit_test
+ -1, // passes
+ -1, // fwd_kf_dist
+ LOOPFILTER_ALL, // loopfilter_control
+ 0, // skip_postproc_filtering
+ NULL, // two_pass_output
+ NULL, // second_pass_log
+ 0, // auto_intra_tools_off
+ 0, // strict_level_conformance
+ -1, // kf_max_pyr_height
+ 0, // sb_qp_sweep
+};
+#endif
+
+struct aom_codec_alg_priv { // State carried by the `ctx` pointer handed to the validation helpers in this file.
+ aom_codec_priv_t base; // base.err_detail receives the message set by ERROR() and update_error_state()
+ aom_codec_enc_cfg_t cfg; // validate_img() compares input images against cfg.g_w / cfg.g_h and cfg.g_profile
+ struct av1_extracfg extra_cfg; // validate_img() reads extra_cfg.tuning when CONFIG_TUNE_BUTTERAUGLI is set
+ aom_rational64_t timestamp_ratio;
+ aom_codec_pts_t pts_offset;
+ unsigned char pts_offset_initialized; // presumably a flag for whether pts_offset has been set; confirm at the use site
+ AV1EncoderConfig oxcf;
+ AV1_PRIMARY *ppi;
+ unsigned char *cx_data;
+ size_t cx_data_sz;
+ size_t pending_cx_data_sz;
+ aom_image_t preview_img;
+ aom_enc_frame_flags_t next_frame_flags;
+ aom_codec_pkt_list_decl(256) pkt_list;
+ unsigned int fixed_kf_cntr;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
+
+ // lookahead instance variables
+ BufferPool *buffer_pool_lap;
+ FIRSTPASS_STATS *frame_stats_buffer;
+ // Number of stats buffers required for look ahead
+ int num_lap_buffers;
+ STATS_BUFFER_CTX stats_buf_context;
+ bool monochrome_on_init;
+};
+
+// Euclidean algorithm on (a, b). Iterates while b is positive, then returns
+// the final value of a narrowed to int.
+static INLINE int gcd(int64_t a, int b) {
+  int rem;
+  for (; b > 0; a = b, b = rem) {
+    rem = (int)(a % b);
+  }
+  return (int)a;
+}
+
+// Reduces *ratio in place by dividing num and den by gcd(num, den).
+static void reduce_ratio(aom_rational64_t *ratio) {
+  const int common = gcd(ratio->num, ratio->den);
+  ratio->num = ratio->num / common;
+  ratio->den = ratio->den / common;
+}
+
+// Called by encoder_encode() only. Must not be called by encoder_init()
+// because the `error` paramerer will be destroyed by aom_codec_enc_init_ver()
+// after encoder_init() returns an error. See the "IMPORTANT" comment in
+// aom_codec_enc_init_ver().
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ const aom_codec_err_t res = error->error_code;
+
+ if (res != AOM_CODEC_OK)
+ ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+ return res;
+}
+
+// Deep-copies the string `src` into `*dst`.
+//
+// `default_src` is the built-in default for the parameter and is expected to
+// be a string literal (it may be NULL). When `src` compares equal to it,
+// `*dst` is pointed at `default_src` itself instead of a heap copy; otherwise
+// a copy is allocated with aom_malloc(). A previous value of `*dst` that is
+// not `default_src` is released with aom_free().
+//
+// Returns AOM_CODEC_OK on success (including when `src` already equals
+// `*dst`), AOM_CODEC_INVALID_PARAM if `src` is NULL, and AOM_CODEC_MEM_ERROR
+// if the copy cannot be allocated. On failure a message is written to
+// `err_detail` (at most ARG_ERR_MSG_MAX_LEN bytes) and `*dst` is left
+// untouched, so the caller never ends up holding a pointer to freed memory.
+static aom_codec_err_t allocate_and_set_string(const char *src,
+                                               const char *default_src,
+                                               const char **dst,
+                                               char *err_detail) {
+  if (!src) {
+    snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+             "Null pointer given to a string parameter.");
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (*dst && strcmp(src, *dst) == 0) return AOM_CODEC_OK;
+
+  // Build the replacement before touching the old value. Freeing first would
+  // leave *dst dangling if the allocation below fails.
+  const char *new_value;
+  if (default_src && strcmp(src, default_src) == 0) {
+    // default_src should be a string literal; share it rather than copy it.
+    new_value = default_src;
+  } else {
+    const size_t len = strlen(src) + 1;
+    char *tmp = aom_malloc(len * sizeof(*tmp));
+    if (!tmp) {
+      snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+               "Failed to allocate memory for copying parameters.");
+      return AOM_CODEC_MEM_ERROR;
+    }
+    memcpy(tmp, src, len);
+    new_value = tmp;
+  }
+
+  // If the old value is exactly the default, it is the string literal, so do
+  // not free it.
+  if (*dst != default_src) {
+    aom_free((void *)*dst);
+  }
+  *dst = new_value;
+  return AOM_CODEC_OK;
+}
+
+#undef ERROR // drop any earlier ERROR definition before redefining it for this file
+#define ERROR(str) \
+ do { \
+ ctx->base.err_detail = str; \
+ return AOM_CODEC_INVALID_PARAM; \
+ } while (0) // stores str in ctx->base.err_detail and returns AOM_CODEC_INVALID_PARAM from the enclosing function; needs a variable named `ctx` in scope
+
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+ } while (0) // fails via ERROR unless lo <= (p)->memb <= hi; the message embeds the argument text as written
+
+#define RANGE_CHECK_HI(p, memb, hi) \
+ do { \
+ if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \
+ } while (0) // fails via ERROR unless (p)->memb <= hi; no lower bound is checked
+
+#define RANGE_CHECK_BOOL(p, memb) \
+ do { \
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \
+ } while (0) // fails via ERROR unless (p)->memb is exactly 0 or 1
+
+static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, // ctx is only used by the ERROR-based macros to record the failure message
+ const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) { // returns AOM_CODEC_OK, or AOM_CODEC_INVALID_PARAM at the first failed check (order decides which message is reported)
+ RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available
+ RANGE_CHECK_HI(cfg, g_forced_max_frame_width, 65536); // 16 bits available
+ RANGE_CHECK_HI(cfg, g_forced_max_frame_height, 65536); // 16 bits available
+ if (cfg->g_forced_max_frame_width) { // 0 means no forced maximum, so the check is skipped
+ RANGE_CHECK_HI(cfg, g_w, cfg->g_forced_max_frame_width);
+ }
+ if (cfg->g_forced_max_frame_height) { // 0 means no forced maximum, so the check is skipped
+ RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height);
+ }
+ // To avoid integer overflows when multiplying width by height (or values
+ // derived from width and height) using the int type, impose a maximum frame
+ // area (width * height) of 2^30.
+ const unsigned int max_frame_width =
+ cfg->g_forced_max_frame_width ? cfg->g_forced_max_frame_width : cfg->g_w;
+ const unsigned int max_frame_height = cfg->g_forced_max_frame_height
+ ? cfg->g_forced_max_frame_height
+ : cfg->g_h;
+ const int64_t max_frame_area = (int64_t)max_frame_width * max_frame_height; // widened to 64 bits before multiplying
+ if (max_frame_area > (1 << 30)) {
+ ERROR("max_frame_area out of range [..2^30]");
+ }
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
+
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_BOOL(extra_cfg, lossless);
+ RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); // NOTE(review): deltalf_mode is a signed int, so negative values pass this upper-bound-only check
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME); // realtime-only builds accept no other usage
+#else
+ RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA);
+#endif
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
+ RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+ if (cfg->g_usage == AOM_USAGE_ALL_INTRA) { // all-intra usage requires zero lag and kf_max_dist of 0
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+ RANGE_CHECK_HI(cfg, kf_max_dist, 0);
+ }
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
+ if (extra_cfg->max_gf_interval > 0) { // an explicit maximum must be at least max(2, min_gf_interval)
+ RANGE_CHECK(extra_cfg, max_gf_interval,
+ AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1));
+ }
+ RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5);
+ RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5);
+ if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) {
+ ERROR(
+ "gf_min_pyr_height must be less than or equal to "
+ "gf_max_pyramid_height");
+ }
+
+ RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
+ RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK_HI(cfg, rc_superres_mode, AOM_SUPERRES_AUTO);
+ RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63);
+ RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
+ RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
+
+ RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+#if CONFIG_FPMT_TEST
+ RANGE_CHECK_HI(extra_cfg, fpmt_unit_test, 1);
+#endif
+ RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1);
+ RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
+ RANGE_CHECK(extra_cfg, cpu_used, 0,
+ (cfg->g_usage == AOM_USAGE_REALTIME) ? 11 : 9); // realtime usage allows two extra speed levels
+ RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+ RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_DYNAMIC);
+ RANGE_CHECK_HI(cfg, large_scale_tile, 1);
+ RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
+ RANGE_CHECK_HI(extra_cfg, enable_rate_guide_deltaq, 1);
+
+ RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+ RANGE_CHECK_HI(extra_cfg, fp_mt, 1);
+
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
+
+ RANGE_CHECK_HI(cfg, monochrome, 1);
+
+ if (cfg->large_scale_tile && extra_cfg->aq_mode)
+ ERROR(
+ "Adaptive quantization are not supported in large scale tile "
+ "coding.");
+
+ RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
+ RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+ RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
+
+ if (cfg->g_pass >= AOM_RC_SECOND_PASS) { // second and later passes must be given first-pass stats
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ const FIRSTPASS_STATS *stats;
+
+ if (cfg->rc_twopass_stats_in.buf == NULL)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats =
+ (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; // points at the last packet in the buffer
+
+ if ((int)(stats->count + 0.5) != n_packets - 1) // last packet's count (rounded) must equal the number of packets before it
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+
+ if (extra_cfg->passes != -1 && cfg->g_pass == AOM_RC_ONE_PASS &&
+ extra_cfg->passes != 1) { // passes == -1 means "not specified" and is always accepted
+ ERROR("One pass encoding but passes != 1.");
+ }
+
+ if (extra_cfg->passes != -1 && (int)cfg->g_pass > extra_cfg->passes) {
+ ERROR("Current pass is larger than total number of passes.");
+ }
+
+ if (cfg->g_profile == (unsigned int)PROFILE_1 && cfg->monochrome) {
+ ERROR("Monochrome is not supported in profile 1");
+ }
+
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_bit_depth > AOM_BITS_10) {
+ ERROR("Codec bit-depth 12 not supported in profile < 2");
+ }
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_input_bit_depth > 10) {
+ ERROR("Source bit-depth 12 not supported in profile < 2");
+ }
+
+ if (cfg->rc_end_usage == AOM_Q) {
+ RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1);
+ } else {
+ if (cfg->use_fixed_qp_offsets > 0) {
+ ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q");
+ }
+ }
+
+ RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
+ AOM_CICP_CP_EBU_3213); // Need to check range more precisely to
+ // check for reserved values?
+ RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709,
+ AOM_CICP_TC_HLG);
+ RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY,
+ AOM_CICP_MC_ICTCP);
+ RANGE_CHECK(extra_cfg, color_range, 0, 1);
+
+ /* Average corpus complexity is supported only in the case of single pass
+ * VBR. */
+ if (cfg->g_pass == AOM_RC_ONE_PASS && cfg->rc_end_usage == AOM_VBR)
+ RANGE_CHECK_HI(extra_cfg, vbr_corpus_complexity_lap,
+ MAX_VBR_CORPUS_COMPLEXITY);
+ else if (extra_cfg->vbr_corpus_complexity_lap != 0)
+ ERROR(
+ "VBR corpus complexity is supported only in the case of single pass "
+ "VBR mode.");
+
+#if !CONFIG_TUNE_BUTTERAUGLI
+ if (extra_cfg->tuning == AOM_TUNE_BUTTERAUGLI) { // this tuning mode is rejected when Butteraugli support is compiled out
+ ERROR(
+ "This error may be related to the wrong configuration options: try to "
+ "set -DCONFIG_TUNE_BUTTERAUGLI=1 at the time CMake is run.");
+ }
+#endif
+
+#if !CONFIG_TUNE_VMAF
+ if (extra_cfg->tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ extra_cfg->tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { // these tuning modes are rejected when VMAF support is compiled out
+ ERROR(
+ "This error may be related to the wrong configuration options: try to "
+ "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run.");
+ }
+#endif
+
+ RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_SALIENCY_MAP); // generic range check, run after the build-specific messages above
+
+ RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR,
+ AOM_DIST_METRIC_QM_PSNR);
+
+ RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_DEC_MODEL);
+
+ RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16);
+
+ if (extra_cfg->lossless) { // lossless excludes adaptive quantization and chroma delta-q
+ if (extra_cfg->aq_mode != 0)
+ ERROR("Only --aq_mode=0 can be used with --lossless=1.");
+ if (extra_cfg->enable_chroma_deltaq)
+ ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1.");
+ }
+
+ RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
+ RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
+ RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
+ RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3);
+ RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3);
+ RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3);
+ RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3);
+
+ RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
+ RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
+ RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size);
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { // every operating point needs a valid level index or SEQ_LEVEL_KEEP_STATS
+ const int level_idx = extra_cfg->target_seq_level_idx[i];
+ if (!is_valid_seq_level_idx(level_idx) &&
+ level_idx != SEQ_LEVEL_KEEP_STATS) {
+ ERROR("Target sequence level index is invalid");
+ }
+ }
+
+ RANGE_CHECK(extra_cfg, deltaq_strength, 0, 1000);
+ RANGE_CHECK_HI(extra_cfg, loopfilter_control, 3);
+ RANGE_CHECK_BOOL(extra_cfg, skip_postproc_filtering);
+ RANGE_CHECK_HI(extra_cfg, enable_cdef, 2);
+ RANGE_CHECK_BOOL(extra_cfg, auto_intra_tools_off);
+ RANGE_CHECK_BOOL(extra_cfg, strict_level_conformance);
+ RANGE_CHECK_BOOL(extra_cfg, sb_qp_sweep);
+
+ RANGE_CHECK(extra_cfg, kf_max_pyr_height, -1, 5);
+ if (extra_cfg->kf_max_pyr_height != -1 &&
+ extra_cfg->kf_max_pyr_height < (int)extra_cfg->gf_min_pyr_height) { // -1 skips the comparison against gf_min_pyr_height
+ ERROR(
+ "The value of kf-max-pyr-height should not be smaller than "
+ "gf-min-pyr-height");
+ }
+
+ return AOM_CODEC_OK;
+}
+
+// Checks that `img` can be encoded with the configuration stored in `ctx`.
+//
+// The pixel format must be one of the listed 4:2:0, 4:2:2 or 4:4:4 layouts and
+// must be allowed by ctx->cfg.g_profile, and the display size must equal
+// ctx->cfg.g_w x ctx->cfg.g_h. With CONFIG_TUNE_BUTTERAUGLI and
+// tune=butteraugli, bit depth and matrix coefficients are checked as well.
+//
+// Returns AOM_CODEC_OK, or AOM_CODEC_INVALID_PARAM (via ERROR, which also
+// sets ctx->base.err_detail) at the first failed check.
+static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
+                                    const aom_image_t *img) {
+  const int is_420 =
+      img->fmt == AOM_IMG_FMT_YV12 || img->fmt == AOM_IMG_FMT_NV12 ||
+      img->fmt == AOM_IMG_FMT_I420 || img->fmt == AOM_IMG_FMT_YV1216 ||
+      img->fmt == AOM_IMG_FMT_I42016;
+  const int is_444 =
+      img->fmt == AOM_IMG_FMT_I444 || img->fmt == AOM_IMG_FMT_I44416;
+  const int is_422 =
+      img->fmt == AOM_IMG_FMT_I422 || img->fmt == AOM_IMG_FMT_I42216;
+
+  if (is_444) {
+    // 4:4:4 input is accepted in profile 0 only for monochrome encodes.
+    if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 &&
+        !ctx->cfg.monochrome) {
+      ERROR("Invalid image format. I444 images not supported in profile.");
+    }
+  } else if (is_422) {
+    if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) {
+      ERROR("Invalid image format. I422 images not supported in profile.");
+    }
+  } else if (!is_420) {
+    ERROR(
+        "Invalid image format. Only YV12, NV12, I420, I422, I444 images are "
+        "supported.");
+  }
+
+  if (!(img->d_w == ctx->cfg.g_w && img->d_h == ctx->cfg.g_h)) {
+    ERROR("Image size must match encoder init configuration size");
+  }
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+    if (img->bit_depth > 8) {
+      ERROR("Only 8 bit depth images supported in tune=butteraugli mode.");
+    }
+    const int mc_supported =
+        img->mc == 0 || img->mc == AOM_CICP_MC_BT_709 ||
+        img->mc == AOM_CICP_MC_BT_601 || img->mc == AOM_CICP_MC_BT_470_B_G;
+    if (!mc_supported) {
+      ERROR(
+          "Only BT.709 and BT.601 matrix coefficients supported in "
+          "tune=butteraugli mode. Identity matrix is treated as BT.601.");
+    }
+  }
+#endif
+
+  return AOM_CODEC_OK;
+}
+
+// Maps the pixel format of |img| to the per-format constant used by this file
+// (12/16/24 for the 8-bit layouts, 24/32/48 for the 16-bit layouts). An
+// unknown format trips an assert and yields 0 when asserts are compiled out.
+static int get_image_bps(const aom_image_t *img) {
+  int bps = 0;
+  switch (img->fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_NV12:
+    case AOM_IMG_FMT_YV12: bps = 12; break;
+    case AOM_IMG_FMT_I422: bps = 16; break;
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_YV1216: bps = 24; break;
+    case AOM_IMG_FMT_I42216: bps = 32; break;
+    case AOM_IMG_FMT_I44416: bps = 48; break;
+    default: assert(0 && "Invalid image format"); break;
+  }
+  return bps;
+}
+
+// Resets |superres_cfg| to the "super-resolution off" state: mode NONE, both
+// scale denominators equal to SCALE_NUMERATOR, and both q thresholds at 255.
+static void disable_superres(SuperResCfg *const superres_cfg) {
+  superres_cfg->superres_mode = AOM_SUPERRES_NONE;
+
+  // Inter frames and key frames get the same scale denominator...
+  superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+
+  // ...and the same q threshold.
+  superres_cfg->superres_kf_qthresh = 255;
+  superres_cfg->superres_qthresh = 255;
+}
+
+// Overwrites the tool-selection fields of |extra_cfg| with the values carried
+// by |cfg|. Most |cfg| members are "disable_*" switches, so each one is
+// logically inverted into the matching "enable_*" member; the remaining
+// members are copied through unchanged.
+static void update_default_encoder_config(const cfg_options_t *cfg,
+                                          struct av1_extracfg *extra_cfg) {
+  // Superblock size: 64 and 128 select a fixed size, anything else is dynamic.
+  if (cfg->super_block_size == 64) {
+    extra_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+  } else if (cfg->super_block_size == 128) {
+    extra_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_128X128;
+  } else {
+    extra_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
+  }
+
+  // Values copied verbatim.
+  extra_cfg->min_partition_size = cfg->min_partition_size;
+  extra_cfg->max_partition_size = cfg->max_partition_size;
+  extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant;
+  extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set;
+  extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set;
+
+  // In-loop filters.
+  extra_cfg->enable_cdef = !cfg->disable_cdef;
+  extra_cfg->enable_restoration = !cfg->disable_lr;
+
+  // Partition types.
+  extra_cfg->enable_rect_partitions = !cfg->disable_rect_partition_type;
+  extra_cfg->enable_ab_partitions = !cfg->disable_ab_partition_type;
+  extra_cfg->enable_1to4_partitions = !cfg->disable_1to4_partition_type;
+
+  // Transform tools.
+  extra_cfg->enable_tx64 = !cfg->disable_tx_64x64;
+  extra_cfg->enable_flip_idtx = !cfg->disable_flip_idtx;
+
+  // Intra tools.
+  extra_cfg->enable_angle_delta = !cfg->disable_intra_angle_delta;
+  extra_cfg->enable_intra_edge_filter = !cfg->disable_intra_edge_filter;
+  extra_cfg->enable_filter_intra = !cfg->disable_filter_intra;
+  extra_cfg->enable_smooth_intra = !cfg->disable_smooth_intra;
+  extra_cfg->enable_paeth_intra = !cfg->disable_paeth_intra;
+  extra_cfg->enable_cfl_intra = !cfg->disable_cfl;
+  extra_cfg->enable_palette = !cfg->disable_palette;
+  extra_cfg->enable_intrabc = !cfg->disable_intrabc;
+
+  // Motion and inter-prediction tools.
+  extra_cfg->enable_warped_motion = !cfg->disable_warp_motion;
+  extra_cfg->enable_global_motion = !cfg->disable_global_motion;
+  extra_cfg->enable_obmc = !cfg->disable_obmc;
+  extra_cfg->enable_dual_filter = !cfg->disable_dual_filter;
+  extra_cfg->enable_onesided_comp = !cfg->disable_one_sided_comp;
+  // A single cfg switch drives both ref-frame-MV members.
+  extra_cfg->allow_ref_frame_mvs = !cfg->disable_ref_frame_mv;
+  extra_cfg->enable_ref_frame_mvs = !cfg->disable_ref_frame_mv;
+
+  // Compound prediction tools.
+  extra_cfg->enable_dist_wtd_comp = !cfg->disable_dist_wtd_comp;
+  extra_cfg->enable_diff_wtd_comp = !cfg->disable_diff_wtd_comp;
+  extra_cfg->enable_masked_comp = !cfg->disable_masked_comp;
+  extra_cfg->enable_interintra_comp = !cfg->disable_inter_intra_comp;
+  extra_cfg->enable_smooth_interintra = !cfg->disable_smooth_inter_intra;
+  extra_cfg->enable_interinter_wedge = !cfg->disable_inter_inter_wedge;
+  extra_cfg->enable_interintra_wedge = !cfg->disable_inter_intra_wedge;
+}
+
+// Fills the internal encoder configuration |oxcf| from the public encoder
+// configuration |cfg| and the codec-control settings in |extra_cfg|.
+//
+// |extra_cfg| is not const: when cfg->encoder_cfg.init_by_cfg_file is set, it
+// is first overwritten from cfg->encoder_cfg by
+// update_default_encoder_config(), and the updated values are then used below.
+//
+// The body is organized as one section per sub-configuration of |oxcf|. Some
+// later sections read values written by earlier ones (for example rc_cfg->mode,
+// resize_cfg->resize_mode, oxcf->speed and input_cfg->limit), so the order of
+// the sections matters.
+static void set_encoder_config(AV1EncoderConfig *oxcf,
+ const aom_codec_enc_cfg_t *cfg,
+ struct av1_extracfg *extra_cfg) {
+ if (cfg->encoder_cfg.init_by_cfg_file) {
+ update_default_encoder_config(&cfg->encoder_cfg, extra_cfg);
+ }
+
+ // Shorthand pointers to the sub-configurations of |oxcf| filled in below.
+ TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+
+ TileConfig *const tile_cfg = &oxcf->tile_cfg;
+
+ ResizeCfg *const resize_cfg = &oxcf->resize_cfg;
+
+ GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+ PartitionCfg *const part_cfg = &oxcf->part_cfg;
+
+ IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg;
+
+ TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg;
+
+ CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg;
+
+ SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+
+ KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+
+ DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+
+ RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+
+ ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ InputCfg *const input_cfg = &oxcf->input_cfg;
+
+ AlgoCfg *const algo_cfg = &oxcf->algo_cfg;
+
+ ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+ const int is_vbr = cfg->rc_end_usage == AOM_VBR;
+ oxcf->profile = cfg->g_profile;
+ oxcf->max_threads = (int)cfg->g_threads;
+
+ // Any usage other than realtime or all-intra maps to GOOD.
+ switch (cfg->g_usage) {
+ case AOM_USAGE_REALTIME: oxcf->mode = REALTIME; break;
+ case AOM_USAGE_ALL_INTRA: oxcf->mode = ALLINTRA; break;
+ default: oxcf->mode = GOOD; break;
+ }
+
+ // Set frame-dimension related configuration.
+ frm_dim_cfg->width = cfg->g_w;
+ frm_dim_cfg->height = cfg->g_h;
+ frm_dim_cfg->forced_max_frame_width = cfg->g_forced_max_frame_width;
+ frm_dim_cfg->forced_max_frame_height = cfg->g_forced_max_frame_height;
+ frm_dim_cfg->render_width = extra_cfg->render_width;
+ frm_dim_cfg->render_height = extra_cfg->render_height;
+
+ // Set input video related configuration.
+ input_cfg->input_bit_depth = cfg->g_input_bit_depth;
+ // guess a frame rate if out of whack, use 30
+ input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+ // From the second pass on, the frame limit is derived from the number of
+ // FIRSTPASS_STATS packets in the stats buffer (minus one) rather than taken
+ // from cfg->g_limit.
+ if (cfg->g_pass >= AOM_RC_SECOND_PASS) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ input_cfg->limit = n_packets - 1;
+ } else {
+ input_cfg->limit = cfg->g_limit;
+ }
+ input_cfg->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+ input_cfg->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+ if (input_cfg->init_framerate > 180) {
+ input_cfg->init_framerate = 30;
+ // NOTE(review): this store is overwritten by the timing_info_type branch
+ // immediately below, which assigns timing_info_present on both paths.
+ dec_model_cfg->timing_info_present = 0;
+ }
+
+ // Set Decoder model configuration.
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
+ extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ dec_model_cfg->timing_info_present = 1;
+ dec_model_cfg->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+ dec_model_cfg->timing_info.time_scale = cfg->g_timebase.den;
+ dec_model_cfg->timing_info.num_ticks_per_picture = 1;
+ } else {
+ dec_model_cfg->timing_info_present = 0;
+ }
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
+ dec_model_cfg->timing_info.equal_picture_interval = 1;
+ dec_model_cfg->decoder_model_info_present_flag = 0;
+ dec_model_cfg->display_model_info_present_flag = 1;
+ } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ dec_model_cfg->num_units_in_decoding_tick = cfg->g_timebase.num;
+ dec_model_cfg->timing_info.equal_picture_interval = 0;
+ dec_model_cfg->decoder_model_info_present_flag = 1;
+ dec_model_cfg->display_model_info_present_flag = 1;
+ }
+
+ oxcf->pass = cfg->g_pass;
+ // For backward compatibility, assume that if extra_cfg->passes==-1, then
+ // passes = 1 or 2.
+ if (extra_cfg->passes == -1) {
+ if (cfg->g_pass == AOM_RC_ONE_PASS) {
+ oxcf->passes = 1;
+ } else {
+ oxcf->passes = 2;
+ }
+ } else {
+ oxcf->passes = extra_cfg->passes;
+ }
+
+ // Set Rate Control configuration.
+ rc_cfg->max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ rc_cfg->max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ rc_cfg->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+ rc_cfg->mode = cfg->rc_end_usage;
+ rc_cfg->min_cr = extra_cfg->min_cr;
+ // Lossless pins both ends of the allowed q range to 0.
+ rc_cfg->best_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
+ rc_cfg->worst_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
+ rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+ rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct;
+ rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct;
+ // In VBR mode the buffer settings from |cfg| are ignored in favor of fixed
+ // values.
+ rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+ rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+ rc_cfg->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+ // Convert target bandwidth from Kbit/s to Bit/s
+ rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+ rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+ rc_cfg->vbr_corpus_complexity_lap = extra_cfg->vbr_corpus_complexity_lap;
+ rc_cfg->vbrbias = cfg->rc_2pass_vbr_bias_pct;
+ rc_cfg->vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+ rc_cfg->vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+
+ // Set Toolset related configuration.
+ tool_cfg->bit_depth = cfg->g_bit_depth;
+ tool_cfg->cdef_control = (CDEF_CONTROL)extra_cfg->enable_cdef;
+ // Restoration is always off for realtime usage.
+ tool_cfg->enable_restoration =
+ (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration;
+ tool_cfg->force_video_mode = extra_cfg->force_video_mode;
+ tool_cfg->enable_palette = extra_cfg->enable_palette;
+ // FIXME(debargha): Should this be:
+ // tool_cfg->enable_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
+ // extra_cfg->enable_order_hint ?
+ // Disallow using temporal MVs while large_scale_tile = 1.
+ tool_cfg->enable_ref_frame_mvs =
+ extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
+ tool_cfg->superblock_size = extra_cfg->superblock_size;
+ tool_cfg->enable_monochrome = cfg->monochrome;
+ tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr != 0;
+ tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter;
+ tool_cfg->enable_order_hint = extra_cfg->enable_order_hint;
+ tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp;
+ tool_cfg->ref_frame_mvs_present =
+ extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+
+ // Explicitly disable global motion in a few cases:
+ // * For realtime mode, we never search global motion, and disabling
+ // it here prevents later code from allocating buffers we don't need
+ // * For large scale tile mode, some of the intended use cases expect
+ // all frame headers to be identical. This breaks if global motion is
+ // used, since global motion data is stored in the frame header.
+ // eg, see test/lightfield_test.sh, which checks that all frame headers
+ // are the same.
+ tool_cfg->enable_global_motion = extra_cfg->enable_global_motion &&
+ cfg->g_usage != AOM_USAGE_REALTIME &&
+ !cfg->large_scale_tile;
+
+ // Error resilience is on if either the public config or the extra config
+ // requests it.
+ tool_cfg->error_resilient_mode =
+ cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+ tool_cfg->frame_parallel_decoding_mode =
+ extra_cfg->frame_parallel_decoding_mode;
+
+ // Set Quantization related configuration.
+ q_cfg->using_qm = extra_cfg->enable_qm;
+ q_cfg->qm_minlevel = extra_cfg->qm_min;
+ q_cfg->qm_maxlevel = extra_cfg->qm_max;
+ q_cfg->quant_b_adapt = extra_cfg->quant_b_adapt;
+ q_cfg->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq;
+ q_cfg->aq_mode = extra_cfg->aq_mode;
+ q_cfg->deltaq_mode = extra_cfg->deltaq_mode;
+ q_cfg->deltaq_strength = extra_cfg->deltaq_strength;
+ // Fixed QP offsets are only honored in AOM_Q rate-control mode.
+ q_cfg->use_fixed_qp_offsets =
+ cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q);
+ // HDR delta-q requires DELTA_Q_HDR mode, 10-bit depth and BT.2020 primaries.
+ q_cfg->enable_hdr_deltaq =
+ (q_cfg->deltaq_mode == DELTA_Q_HDR) &&
+ (cfg->g_bit_depth == AOM_BITS_10) &&
+ (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020);
+
+ tool_cfg->enable_deltalf_mode =
+ (q_cfg->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode;
+
+ // Set cost update frequency configuration.
+ oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
+ oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
+ // Avoid MV cost update for allintra encoding mode.
+ oxcf->cost_upd_freq.mv = (cfg->kf_max_dist != 0)
+ ? (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq
+ : COST_UPD_OFF;
+ oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq;
+
+ // Set frame resize mode configuration.
+ resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
+ resize_cfg->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
+ resize_cfg->resize_kf_scale_denominator =
+ (uint8_t)cfg->rc_resize_kf_denominator;
+ // Fixed resize with both denominators equal to SCALE_NUMERATOR is treated as
+ // no resize.
+ if (resize_cfg->resize_mode == RESIZE_FIXED &&
+ resize_cfg->resize_scale_denominator == SCALE_NUMERATOR &&
+ resize_cfg->resize_kf_scale_denominator == SCALE_NUMERATOR)
+ resize_cfg->resize_mode = RESIZE_NONE;
+
+ // Set encoder algorithm related configuration.
+ algo_cfg->enable_overlay = extra_cfg->enable_overlay;
+ algo_cfg->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+ algo_cfg->sharpness = extra_cfg->sharpness;
+ algo_cfg->arnr_max_frames = extra_cfg->arnr_max_frames;
+ algo_cfg->arnr_strength = extra_cfg->arnr_strength;
+ algo_cfg->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
+ // TODO(any): Fix and Enable TPL for resize-mode > 0
+ algo_cfg->enable_tpl_model =
+ resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model;
+ algo_cfg->loopfilter_control = extra_cfg->loopfilter_control;
+ algo_cfg->skip_postproc_filtering = extra_cfg->skip_postproc_filtering;
+
+ // Set two-pass stats configuration.
+ oxcf->twopass_stats_in = cfg->rc_twopass_stats_in;
+
+ // oxcf->two_pass_output is left untouched when extra_cfg->two_pass_output is
+ // not set.
+ if (extra_cfg->two_pass_output)
+ oxcf->two_pass_output = extra_cfg->two_pass_output;
+
+ oxcf->second_pass_log = extra_cfg->second_pass_log;
+
+ // Set Key frame configuration.
+ kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled;
+ // Automatic key frame placement needs AOM_KF_AUTO and a non-degenerate
+ // [kf_min_dist, kf_max_dist] interval.
+ kf_cfg->auto_key =
+ cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+ kf_cfg->key_freq_min = cfg->kf_min_dist;
+ kf_cfg->key_freq_max = cfg->kf_max_dist;
+ kf_cfg->sframe_dist = cfg->sframe_dist;
+ kf_cfg->sframe_mode = cfg->sframe_mode;
+ kf_cfg->enable_sframe = extra_cfg->s_frame_mode;
+ kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+ kf_cfg->fwd_kf_dist = extra_cfg->fwd_kf_dist;
+ // Disable key frame filtering in all intra mode.
+ if (cfg->kf_max_dist == 0) {
+ kf_cfg->enable_keyframe_filtering = 0;
+ }
+ kf_cfg->enable_intrabc = extra_cfg->enable_intrabc;
+
+ oxcf->speed = extra_cfg->cpu_used;
+ // TODO(yunqingwang, any) In REALTIME mode, 1080p performance at speed 5 & 6
+ // is quite bad. Force to use speed 7 for now. Will investigate it when we
+ // work on rd path optimization later.
+ if (oxcf->mode == REALTIME && AOMMIN(cfg->g_w, cfg->g_h) >= 1080 &&
+ oxcf->speed < 7)
+ oxcf->speed = 7;
+
+ // Set Color related configuration.
+ color_cfg->color_primaries = extra_cfg->color_primaries;
+ color_cfg->transfer_characteristics = extra_cfg->transfer_characteristics;
+ color_cfg->matrix_coefficients = extra_cfg->matrix_coefficients;
+ color_cfg->color_range = extra_cfg->color_range;
+ color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position;
+
+ // Set Group of frames configuration.
+ // Force lag_in_frames to 0 for REALTIME mode
+ gf_cfg->lag_in_frames = (oxcf->mode == REALTIME)
+ ? 0
+ : clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
+ gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+ gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+ gf_cfg->min_gf_interval = extra_cfg->min_gf_interval;
+ gf_cfg->max_gf_interval = extra_cfg->max_gf_interval;
+ gf_cfg->gf_min_pyr_height = extra_cfg->gf_min_pyr_height;
+ gf_cfg->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
+
+ // Set tune related configuration.
+ tune_cfg->tuning = extra_cfg->tuning;
+ tune_cfg->vmaf_model_path = extra_cfg->vmaf_model_path;
+ tune_cfg->content = extra_cfg->content;
+ // Film grain settings are cleared in large-scale tile mode.
+ if (cfg->large_scale_tile) {
+ tune_cfg->film_grain_test_vector = 0;
+ tune_cfg->film_grain_table_filename = NULL;
+ } else {
+ tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+ tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+ }
+ tune_cfg->dist_metric = extra_cfg->dist_metric;
+#if CONFIG_DENOISE
+ oxcf->noise_level = extra_cfg->noise_level;
+ oxcf->noise_block_size = extra_cfg->noise_block_size;
+ oxcf->enable_dnl_denoising = extra_cfg->enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Temporal denoiser is for nonrd pickmode so disable it for speed < 7.
+ // Also disable it for speed 7 for now since it needs to be modified for
+ // the check_partition_merge_mode feature.
+ if (cfg->g_bit_depth == AOM_BITS_8 && oxcf->speed > 7) {
+ oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+ } else {
+ oxcf->noise_sensitivity = 0;
+ }
+#endif
+ // Set Tile related configuration.
+ tile_cfg->num_tile_groups = extra_cfg->num_tg;
+ // In large-scale tile encoding mode, num_tile_groups is always 1.
+ if (cfg->large_scale_tile) tile_cfg->num_tile_groups = 1;
+ tile_cfg->mtu = extra_cfg->mtu_size;
+ tile_cfg->enable_large_scale_tile = cfg->large_scale_tile;
+ tile_cfg->enable_single_tile_decoding =
+ (tile_cfg->enable_large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
+ tile_cfg->tile_columns = extra_cfg->tile_columns;
+ tile_cfg->tile_rows = extra_cfg->tile_rows;
+ // The explicit tile size lists are truncated to MAX_TILE_COLS /
+ // MAX_TILE_ROWS entries before being copied.
+ tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
+ tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
+ for (int i = 0; i < tile_cfg->tile_width_count; i++) {
+ tile_cfg->tile_widths[i] = cfg->tile_widths[i];
+ }
+ for (int i = 0; i < tile_cfg->tile_height_count; i++) {
+ tile_cfg->tile_heights[i] = cfg->tile_heights[i];
+ }
+ tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug;
+
+ if (tile_cfg->enable_large_scale_tile) {
+ // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
+ // AOM_SUPERBLOCK_SIZE_128X128 while tile_cfg->enable_large_scale_tile = 1.
+ // If superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+ // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
+ extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
+ tool_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+ }
+
+ // Set reference frame related configuration.
+ oxcf->ref_frm_cfg.max_reference_frames = extra_cfg->max_reference_frames;
+ oxcf->ref_frm_cfg.enable_reduced_reference_set =
+ extra_cfg->enable_reduced_reference_set;
+ oxcf->ref_frm_cfg.enable_onesided_comp = extra_cfg->enable_onesided_comp;
+
+ oxcf->row_mt = extra_cfg->row_mt;
+ oxcf->fp_mt = extra_cfg->fp_mt;
+
+ // Set motion mode related configuration.
+ oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc;
+ oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion;
+#if !CONFIG_REALTIME_ONLY
+ if (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7 &&
+ oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // TODO(marpan): warped motion is causing a crash for RT mode with screen
+ // in nonrd (speed >= 7), for non-realtime build.
+ // Re-enable/allow when the issue is fixed.
+ oxcf->motion_mode_cfg.enable_warped_motion = 0;
+ oxcf->motion_mode_cfg.allow_warped_motion = 0;
+ } else {
+ oxcf->motion_mode_cfg.allow_warped_motion =
+ (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+ }
+#else
+ oxcf->motion_mode_cfg.allow_warped_motion =
+ (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7)
+ ? false
+ : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+#endif
+
+ // Set partition related configuration.
+ part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions;
+ part_cfg->enable_ab_partitions = extra_cfg->enable_ab_partitions;
+ part_cfg->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
+ part_cfg->min_partition_size = extra_cfg->min_partition_size;
+ part_cfg->max_partition_size = extra_cfg->max_partition_size;
+
+ // Set intra mode configuration.
+ intra_mode_cfg->enable_angle_delta = extra_cfg->enable_angle_delta;
+ intra_mode_cfg->enable_intra_edge_filter =
+ extra_cfg->enable_intra_edge_filter;
+ intra_mode_cfg->enable_filter_intra = extra_cfg->enable_filter_intra;
+ intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra;
+ intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra;
+ intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra;
+ intra_mode_cfg->enable_directional_intra =
+ extra_cfg->enable_directional_intra;
+ intra_mode_cfg->enable_diagonal_intra = extra_cfg->enable_diagonal_intra;
+ intra_mode_cfg->auto_intra_tools_off = extra_cfg->auto_intra_tools_off;
+
+ // Set transform size/type configuration.
+ txfm_cfg->enable_tx64 = extra_cfg->enable_tx64;
+ txfm_cfg->enable_flip_idtx = extra_cfg->enable_flip_idtx;
+ txfm_cfg->enable_rect_tx = extra_cfg->enable_rect_tx;
+ txfm_cfg->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+ txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+ txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+ txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+ txfm_cfg->enable_tx_size_search = extra_cfg->enable_tx_size_search;
+
+ // Set compound type configuration.
+ // Each dependent tool is gated on its parent tool (order hint, masked
+ // compound or inter-intra compound) also being enabled.
+ comp_type_cfg->enable_dist_wtd_comp =
+ extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
+ comp_type_cfg->enable_masked_comp = extra_cfg->enable_masked_comp;
+ comp_type_cfg->enable_diff_wtd_comp =
+ extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
+ comp_type_cfg->enable_interinter_wedge =
+ extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
+ comp_type_cfg->enable_smooth_interintra =
+ extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
+ comp_type_cfg->enable_interintra_wedge =
+ extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
+
+ // Set Super-resolution mode configuration.
+ if (extra_cfg->lossless || cfg->large_scale_tile) {
+ disable_superres(superres_cfg);
+ } else {
+ superres_cfg->superres_mode = cfg->rc_superres_mode;
+ superres_cfg->superres_scale_denominator =
+ (uint8_t)cfg->rc_superres_denominator;
+ superres_cfg->superres_kf_scale_denominator =
+ (uint8_t)cfg->rc_superres_kf_denominator;
+ superres_cfg->superres_qthresh =
+ av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+ superres_cfg->superres_kf_qthresh =
+ av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+ // Fixed mode with both denominators equal to SCALE_NUMERATOR, or qthresh
+ // mode with both thresholds at 255, is treated as super-resolution off.
+ if (superres_cfg->superres_mode == AOM_SUPERRES_FIXED &&
+ superres_cfg->superres_scale_denominator == SCALE_NUMERATOR &&
+ superres_cfg->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+ disable_superres(superres_cfg);
+ }
+ if (superres_cfg->superres_mode == AOM_SUPERRES_QTHRESH &&
+ superres_cfg->superres_qthresh == 255 &&
+ superres_cfg->superres_kf_qthresh == 255) {
+ disable_superres(superres_cfg);
+ }
+ }
+
+ superres_cfg->enable_superres =
+ (superres_cfg->superres_mode != AOM_SUPERRES_NONE) &&
+ extra_cfg->enable_superres;
+ if (!superres_cfg->enable_superres) {
+ disable_superres(superres_cfg);
+ }
+
+ if (input_cfg->limit == 1) {
+ // still picture mode, display model and timing is meaningless
+ dec_model_cfg->display_model_info_present_flag = 0;
+ dec_model_cfg->timing_info_present = 0;
+ }
+
+ oxcf->save_as_annexb = cfg->save_as_annexb;
+
+ // Set unit test related configuration.
+ oxcf->unit_test_cfg.motion_vector_unit_test =
+ extra_cfg->motion_vector_unit_test;
+ oxcf->unit_test_cfg.sb_multipass_unit_test =
+ extra_cfg->sb_multipass_unit_test;
+
+ // The border size depends on the resize/superres settings and on
+ // key_freq_max, so it is computed after those have been filled in above.
+ oxcf->border_in_pixels =
+ av1_get_enc_border_size(av1_is_resize_needed(oxcf),
+ (oxcf->kf_cfg.key_freq_max == 0), BLOCK_128X128);
+ memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
+ sizeof(oxcf->target_seq_level_idx));
+ oxcf->tier_mask = extra_cfg->tier_mask;
+
+ oxcf->partition_info_path = extra_cfg->partition_info_path;
+
+ oxcf->enable_rate_guide_deltaq = extra_cfg->enable_rate_guide_deltaq;
+ oxcf->rate_distribution_info = extra_cfg->rate_distribution_info;
+
+ oxcf->strict_level_conformance = extra_cfg->strict_level_conformance;
+
+ oxcf->kf_max_pyr_height = extra_cfg->kf_max_pyr_height;
+
+ oxcf->sb_qp_sweep = extra_cfg->sb_qp_sweep;
+}
+
+// Builds and returns an AV1EncoderConfig derived from the public encoder
+// configuration |cfg| combined with the default extra (codec-control)
+// settings.
+//
+// The result is zero-initialized before being filled in: set_encoder_config()
+// assigns some members only conditionally (for example two_pass_output is
+// written only when the extra config provides one, and the timing_info fields
+// only for some timing_info_type values), so without the memset the returned
+// structure could carry indeterminate values.
+AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg) {
+  AV1EncoderConfig oxcf;
+  memset(&oxcf, 0, sizeof(oxcf));
+  // Work on a private copy so the shared defaults are never modified.
+  struct av1_extracfg extra_cfg = default_extra_cfg;
+  set_encoder_config(&oxcf, cfg, &extra_cfg);
+  return oxcf;
+}
+
+// Applies a new public encoder configuration |cfg| to an already initialized
+// encoder |ctx|.
+//
+// Rejected up front (via ERROR(), which reports the reason and returns):
+//   * a frame-size change while lag_in_frames > 1 or outside one-pass mode,
+//   * switching to monochrome = 0 after initializing with monochrome,
+//   * increasing lag_in_frames, or changing it while LAP buffers are in use.
+//
+// If validate_config() accepts |cfg|, it is stored in the context, converted
+// with set_encoder_config() and pushed to every frame-parallel compressor and
+// to the look-ahead compressor when present. Each av1_change_config() call is
+// wrapped in the same setjmp guard that update_extra_cfg() uses, so an internal
+// error raised through longjmp is returned to the caller as an error code.
+//
+// A key frame is requested for the next frame when the profile changes, or
+// when the new frame size is not a valid reference size for, or is larger
+// than, the last coded frame.
+static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
+                                          const aom_codec_enc_cfg_t *cfg) {
+  aom_codec_err_t res;
+  int force_key = 0;
+
+  if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
+      ERROR("Cannot change width or height after initialization");
+    // Note: function encoder_set_config() is allowed to be called multiple
+    // times. However, when the original frame width or height is less than two
+    // times of the new frame width or height, a forced key frame should be
+    // used. To make sure the correct detection of a forced key frame, we need
+    // to update the frame width and height only when the actual encoding is
+    // performed. cpi->last_coded_width and cpi->last_coded_height are used to
+    // track the actual coded frame size.
+    if (ctx->ppi->cpi->last_coded_width && ctx->ppi->cpi->last_coded_height &&
+        (!valid_ref_frame_size(ctx->ppi->cpi->last_coded_width,
+                               ctx->ppi->cpi->last_coded_height, cfg->g_w,
+                               cfg->g_h) ||
+         ((int)cfg->g_w > ctx->ppi->cpi->last_coded_width) ||
+         ((int)cfg->g_h > ctx->ppi->cpi->last_coded_height))) {
+      force_key = 1;
+    }
+  }
+
+  if (ctx->monochrome_on_init && cfg->monochrome == 0) {
+    // TODO(aomedia:3465): Allow this case to work without requiring re-init
+    // of encoder.
+    ERROR("Cannot change to monochrome = 0 after init with monochrome");
+  }
+
+  // Prevent increasing lag_in_frames. This check is stricter than it needs
+  // to be -- the limit is not increasing past the first lag_in_frames
+  // value, but we don't track the initial config, only the last successful
+  // config.
+  if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
+    ERROR("Cannot increase lag_in_frames");
+  // Prevent changing lag_in_frames if Lookahead Processing is enabled
+  if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames &&
+      ctx->num_lap_buffers > 0)
+    ERROR("Cannot change lag_in_frames if LAP is enabled");
+
+  res = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+  if (res == AOM_CODEC_OK) {
+    ctx->cfg = *cfg;
+    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    // On profile change, request a key frame
+    force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile;
+    bool is_sb_size_changed = false;
+    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
+    for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) {
+      AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i];
+      struct aom_internal_error_info *const error = cpi->common.error;
+      // Catch internal errors raised via longjmp from av1_change_config().
+      if (setjmp(error->jmp)) {
+        error->setjmp = 0;
+        return error->error_code;
+      }
+      error->setjmp = 1;
+      av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed);
+      error->setjmp = 0;
+    }
+    if (ctx->ppi->cpi_lap != NULL) {
+      AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap;
+      struct aom_internal_error_info *const error = cpi_lap->common.error;
+      // Same guard for the look-ahead compressor.
+      if (setjmp(error->jmp)) {
+        error->setjmp = 0;
+        return error->error_code;
+      }
+      error->setjmp = 1;
+      av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed);
+      error->setjmp = 0;
+    }
+  }
+
+  if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
+
+  return res;
+}
+
+// Returns the global headers for this encoder instance by forwarding the
+// primary encoder state in |ctx| to av1_get_global_headers().
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+  aom_fixed_buf_t *const global_headers = av1_get_global_headers(ctx->ppi);
+  return global_headers;
+}
+
+// Control handler: stores the value of av1_get_quantizer() for the current
+// compressor into the int pointed to by the next variadic argument.
+// Returns AOM_CODEC_INVALID_PARAM when that pointer is NULL.
+static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  int *const out = va_arg(args, int *);
+  if (!out) return AOM_CODEC_INVALID_PARAM;
+
+  *out = av1_get_quantizer(ctx->ppi->cpi);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: like ctrl_get_quantizer(), but the q index is first mapped
+// through av1_qindex_to_quantizer() before being stored in the caller's int.
+// Returns AOM_CODEC_INVALID_PARAM when the output pointer is NULL.
+static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const out = va_arg(args, int *);
+  if (!out) return AOM_CODEC_INVALID_PARAM;
+
+  const int qindex = av1_get_quantizer(ctx->ppi->cpi);
+  *out = av1_qindex_to_quantizer(qindex);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the first loop-filter level (filter_level[0]) of the
+// current compressor into the int pointed to by the next variadic argument.
+// Returns AOM_CODEC_INVALID_PARAM when that pointer is NULL.
+static aom_codec_err_t ctrl_get_loopfilter_level(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const out = va_arg(args, int *);
+  if (!out) return AOM_CODEC_INVALID_PARAM;
+
+  *out = ctx->ppi->cpi->common.lf.filter_level[0];
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the primary rate-control baseline_gf_interval into
+// the int pointed to by the next variadic argument.
+// Returns AOM_CODEC_INVALID_PARAM when that pointer is NULL.
+static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const out = va_arg(args, int *);
+  if (!out) return AOM_CODEC_INVALID_PARAM;
+
+  *out = ctx->ppi->p_rc.baseline_gf_interval;
+  return AOM_CODEC_OK;
+}
+
+// Validates |extra_cfg| together with the current public configuration and,
+// when valid, commits it to |ctx|, regenerates the internal encoder
+// configuration and pushes it to every frame-parallel compressor and to the
+// look-ahead compressor if one exists.
+//
+// Returns the validation error when |extra_cfg| is rejected, the recorded
+// error code when av1_change_config() bails out through longjmp, and
+// AOM_CODEC_OK otherwise.
+static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
+                                        const struct av1_extracfg *extra_cfg) {
+  const aom_codec_err_t status = validate_config(ctx, &ctx->cfg, extra_cfg);
+  if (status != AOM_CODEC_OK) return status;
+
+  ctx->extra_cfg = *extra_cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  av1_check_fpmt_config(ctx->ppi, &ctx->oxcf);
+
+  bool sb_size_changed = false;
+  av1_change_config_seq(ctx->ppi, &ctx->oxcf, &sb_size_changed);
+
+  // Frame-parallel compressors, each under its own setjmp guard.
+  for (int idx = 0; idx < ctx->ppi->num_fp_contexts; ++idx) {
+    AV1_COMP *const fp_cpi = ctx->ppi->parallel_cpi[idx];
+    struct aom_internal_error_info *const err_info = fp_cpi->common.error;
+    if (setjmp(err_info->jmp)) {
+      err_info->setjmp = 0;
+      return err_info->error_code;
+    }
+    err_info->setjmp = 1;
+    av1_change_config(fp_cpi, &ctx->oxcf, sb_size_changed);
+    err_info->setjmp = 0;
+  }
+
+  // Look-ahead compressor, when present.
+  AV1_COMP *const lap_cpi = ctx->ppi->cpi_lap;
+  if (lap_cpi != NULL) {
+    struct aom_internal_error_info *const err_info = lap_cpi->common.error;
+    if (setjmp(err_info->jmp)) {
+      err_info->setjmp = 0;
+      return err_info->error_code;
+    }
+    err_info->setjmp = 1;
+    av1_change_config(lap_cpi, &ctx->oxcf, sb_size_changed);
+    err_info->setjmp = 0;
+  }
+
+  return status;
+}
+
+// Control handler for AOME_SET_CPUUSED: applies the requested cpu_used value
+// on a copy of the extra config and commits it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.cpu_used = CAST(AOME_SET_CPUUSED, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AOME_SET_ENABLEAUTOALTREF: applies the requested
+// enable_auto_alt_ref value on a copy of the extra config and commits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AOME_SET_ENABLEAUTOBWDREF: applies the requested
+// enable_auto_bwd_ref value on a copy of the extra config and commits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AV1E_SET_NOISE_SENSITIVITY: applies the requested
+// noise_sensitivity value on a copy of the extra config and commits it through
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AOME_SET_SHARPNESS: applies the requested sharpness
+// value on a copy of the extra config and commits it through
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AOME_SET_STATIC_THRESHOLD: applies the requested
+// static_thresh value on a copy of the extra config and commits it through
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct av1_extracfg updated_cfg = ctx->extra_cfg;
+  updated_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args);
+  const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+  return status;
+}
+
+// Control handler for AV1E_SET_ROW_MT. A request that matches the current
+// row_mt setting is a no-op returning AOM_CODEC_OK; otherwise the new value is
+// applied on a copy of the extra config and committed via update_extra_cfg().
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  const unsigned int requested = CAST(AV1E_SET_ROW_MT, args);
+  if (requested != ctx->extra_cfg.row_mt) {
+    struct av1_extracfg updated_cfg = ctx->extra_cfg;
+    updated_cfg.row_mt = requested;
+    return update_extra_cfg(ctx, &updated_cfg);
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1E_SET_TILE_COLUMNS. Returns AOM_CODEC_OK right away
+// when the requested value equals the stored tile_columns; otherwise stores it
+// in a copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_tile_columns(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  unsigned int requested = CAST(AV1E_SET_TILE_COLUMNS, args);
+  if (requested != ctx->extra_cfg.tile_columns) {
+    struct av1_extracfg new_cfg = ctx->extra_cfg;
+    new_cfg.tile_columns = requested;
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1E_SET_TILE_ROWS. Returns AOM_CODEC_OK right away
+// when the requested value equals the stored tile_rows; otherwise stores it in
+// a copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_tile_rows(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  unsigned int requested = CAST(AV1E_SET_TILE_ROWS, args);
+  if (requested != ctx->extra_cfg.tile_rows) {
+    struct av1_extracfg new_cfg = ctx->extra_cfg;
+    new_cfg.tile_rows = requested;
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1E_SET_ENABLE_TPL_MODEL. In CONFIG_REALTIME_ONLY
+// builds a non-zero request is reported through ERROR() (defined elsewhere;
+// presumably it bails out with an error code -- confirm). Otherwise the value
+// is stored in a copy of ctx->extra_cfg and update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_enable_tpl_model(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const unsigned int requested = CAST(AV1E_SET_ENABLE_TPL_MODEL, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested != 0) {
+    ERROR("TPL model can't be turned on in realtime only build.");
+  }
+#endif
+  new_cfg.enable_tpl_model = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_KEYFRAME_FILTERING: stores the CAST()
+// value in enable_keyframe_filtering on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_keyframe_filtering(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_keyframe_filtering =
+      CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_ARNR_MAXFRAMES: stores the CAST() value in
+// arnr_max_frames on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_arnr_max_frames(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_ARNR_STRENGTH: stores the CAST() value in
+// arnr_strength on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_arnr_strength(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_TUNING: stores the CAST() value in tuning on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_tuning(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.tuning = CAST(AOME_SET_TUNING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_CQ_LEVEL: stores the CAST() value in cq_level
+// on a copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_cq_level(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_MAX_INTRA_BITRATE_PCT: stores the CAST() value
+// in rc_max_intra_bitrate_pct on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.rc_max_intra_bitrate_pct =
+      CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AOME_SET_MAX_INTER_BITRATE_PCT: stores the CAST() value
+// in rc_max_inter_bitrate_pct on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.rc_max_inter_bitrate_pct =
+      CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_GF_CBR_BOOST_PCT: stores the CAST() value in
+// gf_cbr_boost_pct on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_LOSSLESS: stores the CAST() value in lossless
+// on a copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_lossless(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_CDEF: stores the CAST() value in
+// enable_cdef on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_cdef(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_RESTORATION. In CONFIG_REALTIME_ONLY
+// builds a non-zero request is reported through ERROR() (defined elsewhere;
+// presumably it bails out with an error code -- confirm). Otherwise the value
+// is stored in a copy of ctx->extra_cfg and update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_enable_restoration(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const unsigned int requested = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested != 0) {
+    ERROR("Restoration can't be turned on in realtime only build.");
+  }
+#endif
+  new_cfg.enable_restoration = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_FORCE_VIDEO_MODE: stores the CAST() value in
+// force_video_mode on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_force_video_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_OBMC. In CONFIG_REALTIME_ONLY builds a
+// non-zero request is reported through ERROR() (defined elsewhere; presumably
+// it bails out with an error code -- confirm). Otherwise the value is stored
+// in a copy of ctx->extra_cfg and update_extra_cfg()'s status is returned.
+static aom_codec_err_t ctrl_set_enable_obmc(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const unsigned int requested = CAST(AV1E_SET_ENABLE_OBMC, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested != 0) {
+    ERROR("OBMC can't be enabled in realtime only build.");
+  }
+#endif
+  new_cfg.enable_obmc = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_DISABLE_TRELLIS_QUANT: stores the CAST() value
+// in disable_trellis_quant on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_disable_trellis_quant(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_QM: stores the CAST() value in
+// enable_qm on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_qm(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_QM_Y: stores the CAST() value in qm_y on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_qm_y(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_y = CAST(AV1E_SET_QM_Y, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_QM_U: stores the CAST() value in qm_u on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_qm_u(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_u = CAST(AV1E_SET_QM_U, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_QM_V: stores the CAST() value in qm_v on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_qm_v(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_v = CAST(AV1E_SET_QM_V, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_QM_MIN: stores the CAST() value in qm_min on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_qm_min(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_QM_MAX: stores the CAST() value in qm_max on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_qm_max(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_NUM_TG: stores the CAST() value in num_tg on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_num_tg(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MTU: stores the CAST() value in mtu_size on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_mtu(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.mtu_size = CAST(AV1E_SET_MTU, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_TIMING_INFO_TYPE: stores the CAST() value in
+// timing_info_type on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_timing_info_type(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_DUAL_FILTER: stores the CAST() value in
+// enable_dual_filter on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_dual_filter(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_CHROMA_DELTAQ: stores the CAST() value
+// in enable_chroma_deltaq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_chroma_deltaq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_RECT_PARTITIONS: stores the CAST()
+// value in enable_rect_partitions on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_rect_partitions(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_rect_partitions =
+      CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_AB_PARTITIONS: stores the CAST() value
+// in enable_ab_partitions on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_ab_partitions(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_1TO4_PARTITIONS: stores the CAST()
+// value in enable_1to4_partitions on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_1to4_partitions(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_1to4_partitions =
+      CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MIN_PARTITION_SIZE: stores the CAST() value in
+// min_partition_size on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_min_partition_size(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MAX_PARTITION_SIZE: stores the CAST() value in
+// max_partition_size on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_max_partition_size(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_INTRA_EDGE_FILTER: stores the CAST()
+// value in enable_intra_edge_filter on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_intra_edge_filter(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_intra_edge_filter =
+      CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_ORDER_HINT: stores the CAST() value in
+// enable_order_hint on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_order_hint(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_TX64: stores the CAST() value in
+// enable_tx64 on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_tx64(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_FLIP_IDTX: stores the CAST() value in
+// enable_flip_idtx on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_flip_idtx(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_RECT_TX: stores the CAST() value in
+// enable_rect_tx on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_rect_tx(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_rect_tx = CAST(AV1E_SET_ENABLE_RECT_TX, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_DIST_WTD_COMP: stores the CAST() value
+// in enable_dist_wtd_comp on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MAX_REFERENCE_FRAMES: stores the CAST() value
+// in max_reference_frames on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_max_reference_frames(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_REDUCED_REFERENCE_SET: stores the CAST() value
+// in enable_reduced_reference_set on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_reduced_reference_set(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_reduced_reference_set =
+      CAST(AV1E_SET_REDUCED_REFERENCE_SET, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_REF_FRAME_MVS: stores the CAST() value
+// in enable_ref_frame_mvs on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ALLOW_REF_FRAME_MVS: stores the CAST() value
+// in allow_ref_frame_mvs on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_MASKED_COMP: stores the CAST() value in
+// enable_masked_comp on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_masked_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_ONESIDED_COMP: stores the CAST() value
+// in enable_onesided_comp on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_onesided_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_INTERINTRA_COMP: stores the CAST()
+// value in enable_interintra_comp on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_interintra_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_interintra_comp =
+      CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_SMOOTH_INTERINTRA: stores the CAST()
+// value in enable_smooth_interintra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_smooth_interintra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_smooth_interintra =
+      CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_DIFF_WTD_COMP: stores the CAST() value
+// in enable_diff_wtd_comp on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_INTERINTER_WEDGE: stores the CAST()
+// value in enable_interinter_wedge on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_interinter_wedge(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_interinter_wedge =
+      CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_INTERINTRA_WEDGE: stores the CAST()
+// value in enable_interintra_wedge on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_interintra_wedge(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_interintra_wedge =
+      CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_GLOBAL_MOTION. In CONFIG_REALTIME_ONLY
+// builds a non-zero request is reported through ERROR() (defined elsewhere;
+// presumably it bails out with an error code -- confirm). Otherwise the value
+// is stored in a copy of ctx->extra_cfg and update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_enable_global_motion(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const int requested = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested != 0) {
+    ERROR("Global motion can't be enabled in realtime only build.");
+  }
+#endif
+  new_cfg.enable_global_motion = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_WARPED_MOTION. In CONFIG_REALTIME_ONLY
+// builds a non-zero request is reported through ERROR() (defined elsewhere;
+// presumably it bails out with an error code -- confirm). Otherwise the value
+// is stored in a copy of ctx->extra_cfg and update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_enable_warped_motion(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const int requested = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested != 0) {
+    ERROR("Warped motion can't be enabled in realtime only build.");
+  }
+#endif
+  new_cfg.enable_warped_motion = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ALLOW_WARPED_MOTION: stores the CAST() value
+// in allow_warped_motion on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_allow_warped_motion(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_FILTER_INTRA: stores the CAST() value
+// in enable_filter_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_filter_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_SMOOTH_INTRA: stores the CAST() value
+// in enable_smooth_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_smooth_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_DIRECTIONAL_INTRA: stores the CAST()
+// value in enable_directional_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_directional_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_directional_intra =
+      CAST(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_DIAGONAL_INTRA: stores the CAST() value
+// in enable_diagonal_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_diagonal_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_diagonal_intra = CAST(AV1E_SET_ENABLE_DIAGONAL_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_PAETH_INTRA: stores the CAST() value in
+// enable_paeth_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_paeth_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_CFL_INTRA: stores the CAST() value in
+// enable_cfl_intra on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_cfl_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_SUPERRES: stores the CAST() value in
+// enable_superres on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_superres(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_OVERLAY: stores the CAST() value in
+// enable_overlay on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_overlay(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_PALETTE: stores the CAST() value in
+// enable_palette on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_palette(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_INTRABC: stores the CAST() value in
+// enable_intrabc on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_enable_intrabc(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_ANGLE_DELTA: stores the CAST() value in
+// enable_angle_delta on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_angle_delta(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ERROR_RESILIENT_MODE: stores the CAST() value
+// in error_resilient_mode on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_error_resilient_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_S_FRAME_MODE: stores the CAST() value in
+// s_frame_mode on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_s_frame_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_FRAME_PARALLEL_DECODING: stores the CAST()
+// value in frame_parallel_decoding_mode on a copy of ctx->extra_cfg and
+// returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.frame_parallel_decoding_mode =
+      CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_SINGLE_TILE_DECODING: stores the CAST() value
+// in single_tile_decoding on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_single_tile_decoding(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_AQ_MODE: stores the CAST() value in aq_mode on
+// a copy of ctx->extra_cfg, then forces it back to 0 when the encoder's
+// rc.use_external_qp_one_pass flag is set, and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_aq_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args);
+
+  if (ctx->ppi->cpi->rc.use_external_qp_one_pass) {
+    // A fixed QP is in use for the current frame, so AQ is skipped.
+    new_cfg.aq_mode = 0;
+  }
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_REDUCED_TX_TYPE_SET: stores the CAST() value
+// in reduced_tx_type_set on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_reduced_tx_type_set(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_INTRA_DCT_ONLY: stores the CAST() value in
+// use_intra_dct_only on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_intra_dct_only(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_INTER_DCT_ONLY: stores the CAST() value in
+// use_inter_dct_only on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_inter_dct_only(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_INTRA_DEFAULT_TX_ONLY: stores the CAST() value
+// in use_intra_default_tx_only on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_intra_default_tx_only(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.use_intra_default_tx_only =
+      CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_ENABLE_TX_SIZE_SEARCH: stores the CAST() value
+// in enable_tx_size_search on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_enable_tx_size_search(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_tx_size_search = CAST(AV1E_SET_ENABLE_TX_SIZE_SEARCH, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_QUANT_B_ADAPT. Outside CONFIG_REALTIME_ONLY
+// builds it stores the CAST() value in quant_b_adapt on a copy of
+// ctx->extra_cfg and returns update_extra_cfg()'s status; in
+// CONFIG_REALTIME_ONLY builds it ignores both arguments and returns
+// AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_quant_b_adapt(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+#if !CONFIG_REALTIME_ONLY
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+#else
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#endif
+}
+
+// Control handler for AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP: stores the CAST()
+// value in vbr_corpus_complexity_lap on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_vbr_corpus_complexity_lap(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.vbr_corpus_complexity_lap =
+      CAST(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+// Control handler for AV1E_SET_COEFF_COST_UPD_FREQ: stores the CAST() value
+// in coeff_cost_upd_freq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MODE_COST_UPD_FREQ: stores the CAST() value in
+// mode_cost_upd_freq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_mode_cost_upd_freq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MV_COST_UPD_FREQ: stores the CAST() value in
+// mv_cost_upd_freq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_mv_cost_upd_freq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_DV_COST_UPD_FREQ: stores the CAST() value in
+// dv_cost_upd_freq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_dv_cost_upd_freq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.dv_cost_upd_freq = CAST(AV1E_SET_DV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_VMAF_MODEL_PATH. Passes the string argument,
+// the default vmaf_model_path and the vmaf_model_path slot of a copy of
+// ctx->extra_cfg to allocate_and_set_string(). A non-OK status from that call
+// is returned as is; otherwise update_extra_cfg()'s status is returned.
+static aom_codec_err_t ctrl_set_vmaf_model_path(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const char *path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  const aom_codec_err_t status = allocate_and_set_string(
+      path, default_extra_cfg.vmaf_model_path, &new_cfg.vmaf_model_path,
+      ctx->ppi->error.detail);
+  if (status == AOM_CODEC_OK) {
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  return status;
+}
+
+// Control handler for AV1E_SET_PARTITION_INFO_PATH. Passes the string
+// argument, the default partition_info_path and the partition_info_path slot
+// of a copy of ctx->extra_cfg to allocate_and_set_string(). A non-OK status
+// from that call is returned as is; otherwise update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_partition_info_path(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const char *path = CAST(AV1E_SET_PARTITION_INFO_PATH, args);
+  const aom_codec_err_t status = allocate_and_set_string(
+      path, default_extra_cfg.partition_info_path,
+      &new_cfg.partition_info_path, ctx->ppi->error.detail);
+  if (status == AOM_CODEC_OK) {
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  return status;
+}
+
+// Control handler for AV1E_ENABLE_RATE_GUIDE_DELTAQ: stores the CAST() value
+// in enable_rate_guide_deltaq on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_enable_rate_guide_deltaq(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_rate_guide_deltaq =
+      CAST(AV1E_ENABLE_RATE_GUIDE_DELTAQ, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_RATE_DISTRIBUTION_INFO. Passes the string
+// argument, the default rate_distribution_info and the
+// rate_distribution_info slot of a copy of ctx->extra_cfg to
+// allocate_and_set_string(). A non-OK status from that call is returned as
+// is; otherwise update_extra_cfg()'s status is returned.
+static aom_codec_err_t ctrl_set_rate_distribution_info(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const char *info = CAST(AV1E_SET_RATE_DISTRIBUTION_INFO, args);
+  const aom_codec_err_t status = allocate_and_set_string(
+      info, default_extra_cfg.rate_distribution_info,
+      &new_cfg.rate_distribution_info, ctx->ppi->error.detail);
+  if (status == AOM_CODEC_OK) {
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  return status;
+}
+
+// Control handler for AV1E_SET_FILM_GRAIN_TEST_VECTOR: stores the CAST()
+// value in film_grain_test_vector on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.film_grain_test_vector =
+      CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_FILM_GRAIN_TABLE. A NULL argument is accepted
+// and stored directly in film_grain_table_filename on a copy of
+// ctx->extra_cfg. A non-NULL argument goes through allocate_and_set_string(),
+// whose non-OK status is returned as is. On success update_extra_cfg()'s
+// status is returned.
+static aom_codec_err_t ctrl_set_film_grain_table(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const char *filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  if (filename != NULL) {
+    const aom_codec_err_t status = allocate_and_set_string(
+        filename, default_extra_cfg.film_grain_table_filename,
+        &new_cfg.film_grain_table_filename, ctx->ppi->error.detail);
+    if (status != AOM_CODEC_OK) return status;
+  } else {
+    // NULL is a permitted value for this parameter.
+    new_cfg.film_grain_table_filename = filename;
+  }
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_DENOISE_NOISE_LEVEL. With CONFIG_DENOISE it
+// stores the CAST() value, converted to float and divided by 10, in
+// noise_level on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status; without CONFIG_DENOISE it ignores both arguments and returns
+// AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_denoise_noise_level(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+#if CONFIG_DENOISE
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const float raw_level = (float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args);
+  new_cfg.noise_level = raw_level / 10.0f;
+  return update_extra_cfg(ctx, &new_cfg);
+#else
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#endif
+}
+
+// Control handler for AV1E_SET_DENOISE_BLOCK_SIZE. With CONFIG_DENOISE it
+// stores the CAST() value in noise_block_size on a copy of ctx->extra_cfg and
+// returns update_extra_cfg()'s status; without CONFIG_DENOISE it ignores both
+// arguments and returns AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_denoise_block_size(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+#if CONFIG_DENOISE
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+#else
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#endif
+}
+
+// Control handler for AV1E_SET_ENABLE_DNL_DENOISING. With CONFIG_DENOISE it
+// stores the CAST() value in enable_dnl_denoising on a copy of
+// ctx->extra_cfg and returns update_extra_cfg()'s status; without
+// CONFIG_DENOISE it ignores both arguments and returns AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_enable_dnl_denoising(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+#if CONFIG_DENOISE
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_dnl_denoising = CAST(AV1E_SET_ENABLE_DNL_DENOISING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+#else
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#endif
+}
+
+// Control handler for AV1E_SET_DELTAQ_MODE. In CONFIG_REALTIME_ONLY builds a
+// request above NO_DELTA_Q is reported through ERROR() (defined elsewhere;
+// presumably it bails out with an error code -- confirm). Otherwise the value
+// is stored in a copy of ctx->extra_cfg and update_extra_cfg()'s status is
+// returned.
+static aom_codec_err_t ctrl_set_deltaq_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const DELTAQ_MODE requested = CAST(AV1E_SET_DELTAQ_MODE, args);
+#if CONFIG_REALTIME_ONLY
+  if (requested > NO_DELTA_Q) {
+    ERROR("Delta Q mode can't be enabled in realtime only build.");
+  }
+#endif
+  new_cfg.deltaq_mode = requested;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_DELTAQ_STRENGTH: stores the CAST() value in
+// deltaq_strength on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_deltaq_strength(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.deltaq_strength = CAST(AV1E_SET_DELTAQ_STRENGTH, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_DELTALF_MODE: stores the CAST() value in
+// deltalf_mode on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_deltalf_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MIN_GF_INTERVAL: stores the CAST() value in
+// min_gf_interval on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_min_gf_interval(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MAX_GF_INTERVAL: stores the CAST() value in
+// max_gf_interval on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_max_gf_interval(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_GF_MIN_PYRAMID_HEIGHT: stores the CAST() value
+// in gf_min_pyr_height on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_gf_min_pyr_height(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_GF_MAX_PYRAMID_HEIGHT: stores the CAST() value
+// in gf_max_pyr_height on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_gf_max_pyr_height(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_FRAME_PERIODIC_BOOST: stores the CAST() value
+// in frame_periodic_boost on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_frame_periodic_boost(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST: stores the CAST()
+// value in motion_vector_unit_test on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.motion_vector_unit_test =
+      CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_FP_MT_UNIT_TEST. With CONFIG_FPMT_TEST it
+// stores the CAST() value in fpmt_unit_test on a copy of ctx->extra_cfg,
+// sets ctx->ppi->fpmt_unit_test_cfg to PARALLEL_ENCODE when that value is 1
+// and to PARALLEL_SIMULATION_ENCODE otherwise, and returns
+// update_extra_cfg()'s status. Without CONFIG_FPMT_TEST it ignores both
+// arguments and returns AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_enable_fpmt_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+#if CONFIG_FPMT_TEST
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.fpmt_unit_test = CAST(AV1E_SET_FP_MT_UNIT_TEST, args);
+  if (new_cfg.fpmt_unit_test == 1) {
+    ctx->ppi->fpmt_unit_test_cfg = PARALLEL_ENCODE;
+  } else {
+    ctx->ppi->fpmt_unit_test_cfg = PARALLEL_SIMULATION_ENCODE;
+  }
+  return update_extra_cfg(ctx, &new_cfg);
+#else
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+#endif
+}
+
+// Control handler for AV1E_ENABLE_EXT_TILE_DEBUG: stores the CAST() value in
+// ext_tile_debug on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_enable_ext_tile_debug(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_TARGET_SEQ_LEVEL_IDX. The integer argument
+// packs two fields: value / 100 is the operating point index and value % 100
+// is the level. An index outside [0, MAX_NUM_OPERATING_POINTS) writes a
+// message into ctx->ppi->error.detail, points ctx->base.err_detail at it and
+// returns AOM_CODEC_INVALID_PARAM. Otherwise the level is stored for that
+// operating point on a copy of ctx->extra_cfg and update_extra_cfg()'s status
+// is returned.
+static aom_codec_err_t ctrl_set_target_seq_level_idx(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const int packed = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args);
+  const int operating_point_idx = packed / 100;
+  if (operating_point_idx >= 0 &&
+      operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+    new_cfg.target_seq_level_idx[operating_point_idx] =
+        (AV1_LEVEL)(packed % 100);
+    return update_extra_cfg(ctx, &new_cfg);
+  }
+  char *const detail = ctx->ppi->error.detail;
+  snprintf(detail, ARG_ERR_MSG_MAX_LEN,
+           "Invalid operating point index: %d", operating_point_idx);
+  ctx->base.err_detail = detail;
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler for AV1E_SET_TIER_MASK: stores the CAST() value in
+// tier_mask on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_set_tier_mask(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_MIN_CR: stores the CAST() value in min_cr on a
+// copy of ctx->extra_cfg and returns update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_min_cr(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST: stores the CAST()
+// value in sb_multipass_unit_test on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.sb_multipass_unit_test =
+      CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_ENABLE_SB_QP_SWEEP: stores the CAST() value in
+// sb_qp_sweep on a copy of ctx->extra_cfg and returns update_extra_cfg()'s
+// status.
+static aom_codec_err_t ctrl_enable_sb_qp_sweep(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.sb_qp_sweep = CAST(AV1E_ENABLE_SB_QP_SWEEP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Control handler for AV1E_SET_EXTERNAL_PARTITION. Copies the caller-supplied
+// function table, fills an aom_ext_part_config_t with the encoder's current
+// superblock size, and returns the status of av1_ext_part_create() for the
+// encoder's ext_part_controller.
+//
+// Returns AOM_CODEC_INVALID_PARAM when the caller passes a NULL function
+// table pointer, so the pointer is never dereferenced in that case.
+static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  const aom_ext_part_funcs_t *const funcs_ptr =
+      CAST(AV1E_SET_EXTERNAL_PARTITION, args);
+  if (funcs_ptr == NULL) return AOM_CODEC_INVALID_PARAM;
+  aom_ext_part_funcs_t funcs = *funcs_ptr;
+  aom_ext_part_config_t config;
+  // TODO(chengchen): verify the sb_size has been set at this point.
+  config.superblock_size = cpi->common.seq_params->sb_size;
+  const aom_codec_err_t status =
+      av1_ext_part_create(funcs, config, &cpi->ext_part_controller);
+  return status;
+}
+
+// Control handler for AV1E_SET_LOOPFILTER_CONTROL: stores the CAST() value in
+// loopfilter_control on a copy of ctx->extra_cfg and returns
+// update_extra_cfg()'s status.
+static aom_codec_err_t ctrl_set_loopfilter_control(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.loopfilter_control = CAST(AV1E_SET_LOOPFILTER_CONTROL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+ // Control handler for AV1E_SET_SKIP_POSTPROC_FILTERING.
+ //
+ // The setting is accepted only when the encoder was configured with
+ // g_usage == AOM_USAGE_ALL_INTRA; for any other usage the argument is not
+ // read and AOM_CODEC_INCAPABLE is returned. Otherwise the value is stored in
+ // a copy of the extra configuration and applied with update_extra_cfg().
+ static aom_codec_err_t ctrl_set_skip_postproc_filtering(
+     aom_codec_alg_priv_t *ctx, va_list args) {
+   if (ctx->cfg.g_usage == AOM_USAGE_ALL_INTRA) {
+     struct av1_extracfg updated_cfg = ctx->extra_cfg;
+     updated_cfg.skip_postproc_filtering =
+         CAST(AV1E_SET_SKIP_POSTPROC_FILTERING, args);
+     return update_extra_cfg(ctx, &updated_cfg);
+   }
+
+   // Skipping post-processing filters is only allowed in ALLINTRA mode.
+   return AOM_CODEC_INCAPABLE;
+ }
+
+ // Control handler for AV1E_SET_RTC_EXTERNAL_RC. Writes the requested value
+ // straight into the compressor's rate-control state (rc.rtc_external_ratectrl)
+ // and always reports AOM_CODEC_OK.
+ static aom_codec_err_t ctrl_set_rtc_external_rc(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+   AV1_COMP *const compressor = ctx->ppi->cpi;
+
+   compressor->rc.rtc_external_ratectrl = CAST(AV1E_SET_RTC_EXTERNAL_RC, args);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler for AV1E_SET_QUANTIZER_ONE_PASS.
+ //
+ // Rejects values outside [0, 63] with AOM_CODEC_INVALID_PARAM. For a valid
+ // value it pins both rc_min_quantizer and rc_max_quantizer to it, sets
+ // aq_mode to 0 in the extra configuration, flags the compressor's rate
+ // control as using an external one-pass QP, and applies the configuration
+ // with update_extra_cfg().
+ static aom_codec_err_t ctrl_set_quantizer_one_pass(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+   const int qp = CAST(AV1E_SET_QUANTIZER_ONE_PASS, args);
+   const int qp_in_range = (qp >= 0) && (qp <= 63);
+
+   if (!qp_in_range) return AOM_CODEC_INVALID_PARAM;
+
+   struct av1_extracfg updated_cfg = ctx->extra_cfg;
+   aom_codec_enc_cfg_t *const enc_cfg = &ctx->cfg;
+
+   enc_cfg->rc_max_quantizer = qp;
+   enc_cfg->rc_min_quantizer = enc_cfg->rc_max_quantizer;
+   updated_cfg.aq_mode = 0;
+   ctx->ppi->cpi->rc.use_external_qp_one_pass = 1;
+
+   return update_extra_cfg(ctx, &updated_cfg);
+ }
+
+ // Control handler for AV1E_SET_BITRATE_ONE_PASS_CBR.
+ //
+ // Changes the target bitrate on a running encoder. The request is honoured
+ // only when all of the following hold, otherwise AOM_CODEC_INVALID_PARAM is
+ // returned and nothing is modified:
+ //   - is_one_pass_rt_params(cpi) is true,
+ //   - the rate-control mode is AOM_CBR,
+ //   - SVC is not in use,
+ //   - exactly one frame-parallel context exists,
+ //   - no look-ahead compressor (cpi_lap) exists.
+ //
+ // The argument is stored as-is in cfg.rc_target_bitrate and, multiplied by
+ // 1000, in oxcf->rc_cfg.target_bandwidth. Afterwards the primary
+ // rate-control buffer sizes and the framerate-dependent state are refreshed
+ // and check_reset_rc_flag() is invoked.
+ static aom_codec_err_t ctrl_set_bitrate_one_pass_cbr(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+   AV1_PRIMARY *const ppi = ctx->ppi;
+   AV1_COMP *const cpi = ppi->cpi;
+   AV1EncoderConfig *oxcf = &cpi->oxcf;
+   if (!is_one_pass_rt_params(cpi) || oxcf->rc_cfg.mode != AOM_CBR ||
+       cpi->ppi->use_svc || ppi->num_fp_contexts != 1 || ppi->cpi_lap != NULL) {
+     return AOM_CODEC_INVALID_PARAM;
+   }
+   const int new_bitrate = CAST(AV1E_SET_BITRATE_ONE_PASS_CBR, args);
+   ctx->cfg.rc_target_bitrate = new_bitrate;
+   // Widen before scaling: multiplying in plain int overflows (undefined
+   // behaviour) once the requested bitrate exceeds INT_MAX / 1000.
+   oxcf->rc_cfg.target_bandwidth = (int64_t)new_bitrate * 1000;
+   set_primary_rc_buffer_sizes(oxcf, ppi);
+   av1_new_framerate(cpi, cpi->framerate);
+   check_reset_rc_flag(cpi);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler for AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR.
+ //
+ // A negative argument is rejected with AOM_CODEC_INVALID_PARAM. Otherwise the
+ // value becomes rc.max_consec_drop and the running counter
+ // rc.drop_count_consec is reset to zero.
+ static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr(
+     aom_codec_alg_priv_t *ctx, va_list args) {
+   AV1_COMP *const compressor = ctx->ppi->cpi;
+   const int drop_limit = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, args);
+
+   if (drop_limit >= 0) {
+     compressor->rc.max_consec_drop = drop_limit;
+     compressor->rc.drop_count_consec = 0;
+     return AOM_CODEC_OK;
+   }
+   return AOM_CODEC_INVALID_PARAM;
+ }
+
+ // Control handler for AV1E_SET_SVC_FRAME_DROP_MODE.
+ //
+ // Only AOM_LAYER_DROP and AOM_FULL_SUPERFRAME_DROP are accepted. The request
+ // is validated before it is stored, so a rejected value leaves the current
+ // cpi->svc.framedrop_mode untouched instead of leaving an unsupported mode in
+ // the encoder state.
+ //
+ // Returns AOM_CODEC_OK when the mode was applied and AOM_CODEC_INVALID_PARAM
+ // when the requested mode is not one of the two supported values.
+ static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+   AV1_PRIMARY *const ppi = ctx->ppi;
+   AV1_COMP *const cpi = ppi->cpi;
+   const unsigned int mode = CAST(AV1E_SET_SVC_FRAME_DROP_MODE, args);
+
+   if (mode != AOM_LAYER_DROP && mode != AOM_FULL_SUPERFRAME_DROP)
+     return AOM_CODEC_INVALID_PARAM;
+
+   cpi->svc.framedrop_mode = mode;
+   return AOM_CODEC_OK;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Allocates the first-pass statistics storage.
+ //
+ // *frame_stats_buffer receives a zeroed array whose length comes from
+ // get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS). stats_buf_context is
+ // pointed at that array (start == end, buf_end one past the last element) and
+ // gets two separately allocated accumulators, total_left_stats and
+ // total_stats, each reset with av1_twopass_zero_stats().
+ //
+ // Returns AOM_CODEC_MEM_ERROR as soon as any allocation fails; allocations
+ // made before the failure are left in place. Returns AOM_CODEC_OK otherwise.
+ static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
+                                            STATS_BUFFER_CTX *stats_buf_context,
+                                            int num_lap_buffers) {
+   const int num_entries = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+
+   *frame_stats_buffer =
+       (FIRSTPASS_STATS *)aom_calloc(num_entries, sizeof(FIRSTPASS_STATS));
+   if (!*frame_stats_buffer) return AOM_CODEC_MEM_ERROR;
+
+   stats_buf_context->stats_in_start = *frame_stats_buffer;
+   stats_buf_context->stats_in_end = stats_buf_context->stats_in_start;
+   stats_buf_context->stats_in_buf_end =
+       stats_buf_context->stats_in_start + num_entries;
+
+   stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+   if (!stats_buf_context->total_left_stats) return AOM_CODEC_MEM_ERROR;
+   av1_twopass_zero_stats(stats_buf_context->total_left_stats);
+
+   stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+   if (!stats_buf_context->total_stats) return AOM_CODEC_MEM_ERROR;
+   av1_twopass_zero_stats(stats_buf_context->total_stats);
+
+   return AOM_CODEC_OK;
+ }
+#endif
+
+ // Creates a compressor instance and, if *p_buffer_pool is still NULL, the
+ // frame buffer pool it uses.
+ //
+ // A new pool holds FRAME_BUFFERS_ALLINTRA entries when oxcf->mode is ALLINTRA
+ // and FRAME_BUFFERS entries otherwise; with CONFIG_MULTITHREAD its mutex is
+ // initialised as well. If any step of pool creation fails, everything
+ // allocated for the pool so far is released, *p_buffer_pool stays NULL and
+ // AOM_CODEC_MEM_ERROR is returned without creating a compressor.
+ //
+ // The compressor returned by av1_create_compressor() is stored in *p_cpi;
+ // a NULL result yields AOM_CODEC_MEM_ERROR, anything else AOM_CODEC_OK.
+ aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi,
+                                                   AV1_COMP **p_cpi,
+                                                   BufferPool **p_buffer_pool,
+                                                   const AV1EncoderConfig *oxcf,
+                                                   COMPRESSOR_STAGE stage,
+                                                   int lap_lag_in_frames) {
+   BufferPool *pool = *p_buffer_pool;
+
+   if (!pool) {
+     pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+     if (!pool) return AOM_CODEC_MEM_ERROR;
+
+     if (oxcf->mode == ALLINTRA) {
+       pool->num_frame_bufs = FRAME_BUFFERS_ALLINTRA;
+     } else {
+       pool->num_frame_bufs = FRAME_BUFFERS;
+     }
+     pool->frame_bufs = (RefCntBuffer *)aom_calloc(pool->num_frame_bufs,
+                                                   sizeof(*pool->frame_bufs));
+     if (!pool->frame_bufs) {
+       pool->num_frame_bufs = 0;
+       aom_free(pool);
+       return AOM_CODEC_MEM_ERROR;
+     }
+#if CONFIG_MULTITHREAD
+     if (pthread_mutex_init(&pool->pool_mutex, NULL) != 0) {
+       aom_free(pool->frame_bufs);
+       pool->frame_bufs = NULL;
+       pool->num_frame_bufs = 0;
+       aom_free(pool);
+       return AOM_CODEC_MEM_ERROR;
+     }
+#endif
+     // Publish the pool only once it is fully constructed.
+     *p_buffer_pool = pool;
+   }
+
+   *p_cpi = av1_create_compressor(ppi, oxcf, pool, stage, lap_lag_in_frames);
+   return (*p_cpi == NULL) ? AOM_CODEC_MEM_ERROR : AOM_CODEC_OK;
+ }
+
+ // Control handler for AV1E_SET_FP_MT.
+ //
+ // Applies the requested fp_mt value through update_extra_cfg(). If the
+ // encoder currently has exactly one frame-parallel context, the number of
+ // contexts is recomputed with av1_compute_num_fp_contexts() and a compressor
+ // is created for every additional context (indices 1 and up). A failure while
+ // creating one of them is returned immediately, before num_fp_contexts is
+ // updated. On the normal path ppi->num_fp_contexts is overwritten (with 1 if
+ // it was not 1 on entry) and the result of update_extra_cfg() is returned.
+ static aom_codec_err_t ctrl_set_fp_mt(aom_codec_alg_priv_t *ctx, va_list args) {
+   struct av1_extracfg updated_cfg = ctx->extra_cfg;
+   updated_cfg.fp_mt = CAST(AV1E_SET_FP_MT, args);
+   const aom_codec_err_t cfg_status = update_extra_cfg(ctx, &updated_cfg);
+
+   AV1_PRIMARY *const primary = ctx->ppi;
+   int context_count = 1;
+
+   if (primary->num_fp_contexts == 1) {
+     context_count =
+         av1_compute_num_fp_contexts(primary, &primary->parallel_cpi[0]->oxcf);
+     // Context 0 already exists; the loop body runs only when more than one
+     // context was computed.
+     for (int idx = 1; idx < context_count; ++idx) {
+       const aom_codec_err_t create_status = av1_create_context_and_bufferpool(
+           primary, &primary->parallel_cpi[idx], &ctx->buffer_pool, &ctx->oxcf,
+           ENCODE_STAGE, -1);
+       if (create_status != AOM_CODEC_OK) return create_status;
+#if !CONFIG_REALTIME_ONLY
+       primary->parallel_cpi[idx]->twopass_frame.stats_in =
+           primary->twopass.stats_buf_ctx->stats_in_start;
+#endif
+     }
+   }
+
+   primary->num_fp_contexts = context_count;
+   return cfg_status;
+ }
+
+ // Control handler for AV1E_SET_AUTO_INTRA_TOOLS_OFF. Records the requested
+ // auto_intra_tools_off value in a copy of the extra configuration and applies
+ // the copy with update_extra_cfg().
+ static aom_codec_err_t ctrl_set_auto_intra_tools_off(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+   struct av1_extracfg updated_cfg = ctx->extra_cfg;
+
+   updated_cfg.auto_intra_tools_off = CAST(AV1E_SET_AUTO_INTRA_TOOLS_OFF, args);
+   const aom_codec_err_t status = update_extra_cfg(ctx, &updated_cfg);
+   return status;
+ }
+
+ // Initializes the encoder's private state for |ctx|.
+ //
+ // Does nothing (and returns AOM_CODEC_OK) when ctx->priv is already set.
+ // Otherwise it allocates the private context, copies the caller's encoder
+ // configuration into it, validates the configuration and, if validation
+ // succeeds, creates the primary compressor, the first-pass stats buffers
+ // (unless CONFIG_REALTIME_ONLY), the first compressor context and, when
+ // look-ahead buffers are in use, a second compressor for the LAP stage.
+ //
+ // Returns the validate_config() result if validation fails,
+ // AOM_CODEC_MEM_ERROR if an allocation fails, or the status of the last
+ // compressor creation otherwise.
+ //
+ // NOTE(review): the early returns below leave ctx->priv and any objects
+ // created so far in place; presumably the caller invokes the destroy hook on
+ // failure to release them -- confirm against the caller.
+ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ if (ctx->priv == NULL) {
+ aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+
+ // Update the reference to the config structure to an internal copy.
+ assert(ctx->config.enc);
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
+
+ priv->extra_cfg = default_extra_cfg;
+ // Special handling:
+ // By default, if omitted, --enable-cdef = 1.
+ // Here we set its default value to 0 when --allintra is turned on.
+ // However, if users set --enable-cdef = 1 from command line,
+ // The encoder still respects it.
+ if (priv->cfg.g_usage == ALLINTRA) {
+ priv->extra_cfg.enable_cdef = 0;
+ }
+ av1_initialize_enc(priv->cfg.g_usage, priv->cfg.rc_end_usage);
+
+ res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+ if (res == AOM_CODEC_OK) {
+ int *num_lap_buffers = &priv->num_lap_buffers;
+ int lap_lag_in_frames = 0;
+ *num_lap_buffers = 0;
+ // timestamp_ratio = g_timebase * TICKS_PER_SEC, stored as a reduced
+ // fraction.
+ priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
+ priv->timestamp_ratio.num =
+ (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC;
+ reduce_ratio(&priv->timestamp_ratio);
+
+ set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+ if (priv->oxcf.rc_cfg.mode != AOM_CBR &&
+ priv->oxcf.pass == AOM_RC_ONE_PASS && priv->oxcf.mode == GOOD) {
+ // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR
+ *num_lap_buffers =
+ AOMMIN((int)priv->cfg.g_lag_in_frames,
+ AOMMIN(MAX_LAP_BUFFERS, priv->oxcf.kf_cfg.key_freq_max +
+ SCENE_CUT_KEY_TEST_INTERVAL));
+ // Request a non-zero LAP lag only when the configured lag leaves at
+ // least LAP_LAG_IN_FRAMES frames beyond the look-ahead buffers.
+ if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
+ LAP_LAG_IN_FRAMES) {
+ lap_lag_in_frames = LAP_LAG_IN_FRAMES;
+ }
+ }
+ priv->oxcf.use_highbitdepth =
+ (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+
+ priv->monochrome_on_init = priv->cfg.monochrome;
+
+ priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head,
+ *num_lap_buffers, &priv->oxcf);
+ if (!priv->ppi) return AOM_CODEC_MEM_ERROR;
+
+#if !CONFIG_REALTIME_ONLY
+ res = create_stats_buffer(&priv->frame_stats_buffer,
+ &priv->stats_buf_context, *num_lap_buffers);
+ if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+
+ assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+ // Point every frame_stats_arr slot at its entry in the stats buffer that
+ // was just allocated.
+ int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS);
+ for (int i = 0; i < size; i++)
+ priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i];
+
+ priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context;
+#endif
+
+ assert(priv->ppi->num_fp_contexts >= 1);
+ // Create the first compressor context (parallel_cpi[0]) and the shared
+ // buffer pool.
+ res = av1_create_context_and_bufferpool(
+ priv->ppi, &priv->ppi->parallel_cpi[0], &priv->buffer_pool,
+ &priv->oxcf, ENCODE_STAGE, -1);
+ if (res != AOM_CODEC_OK) {
+ return res;
+ }
+#if !CONFIG_REALTIME_ONLY
+ priv->ppi->parallel_cpi[0]->twopass_frame.stats_in =
+ priv->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+ priv->ppi->cpi = priv->ppi->parallel_cpi[0];
+
+ // Create another compressor if look ahead is enabled
+ if (res == AOM_CODEC_OK && *num_lap_buffers) {
+ res = av1_create_context_and_bufferpool(
+ priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf,
+ LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS));
+ }
+ }
+ }
+
+ return res;
+ }
+
+ // Tears down a compressor and, if one is attached, its buffer pool.
+ //
+ // |cpi| is always passed to av1_remove_compressor(). When *p_buffer_pool is
+ // non-NULL its reference frame buffers are freed, its mutex is destroyed
+ // (CONFIG_MULTITHREAD builds), the pool itself is freed and *p_buffer_pool is
+ // reset to NULL so that a later call does not release it again.
+ void av1_destroy_context_and_bufferpool(AV1_COMP *cpi,
+                                         BufferPool **p_buffer_pool) {
+   av1_remove_compressor(cpi);
+
+   BufferPool *const pool = *p_buffer_pool;
+   if (pool == NULL) return;
+
+   av1_free_ref_frame_buffers(pool);
+#if CONFIG_MULTITHREAD
+   pthread_mutex_destroy(&pool->pool_mutex);
+#endif
+   aom_free(pool);
+   *p_buffer_pool = NULL;
+ }
+
+ // Frees the three allocations that make up the first-pass statistics
+ // storage: the per-frame stats array and the two accumulators referenced by
+ // |stats_buf_context|. The pointers themselves are not reset.
+ static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
+                                  FIRSTPASS_STATS *frame_stats_buffer) {
+   aom_free(frame_stats_buffer);
+   aom_free(stats_buf_context->total_stats);
+   aom_free(stats_buf_context->total_left_stats);
+ }
+
+ // Frees the string *ptr and sets *ptr to NULL, unless *ptr is the very same
+ // pointer as |default_str|. The default is expected to be a literal and must
+ // never be freed, so in that case *ptr is left unchanged.
+ static void check_and_free_string(const char *default_str, const char **ptr) {
+   const int owns_string = (*ptr != default_str);
+
+   if (owns_string) {
+     aom_free((void *)*ptr);
+     *ptr = NULL;
+   }
+ }
+
+ // Releases every string owned by |extra_cfg|.
+ //
+ // Each string field is compared against the value the same field has in
+ // default_extra_cfg; check_and_free_string() frees the string and clears the
+ // field only when the two pointers differ, so default values are never freed.
+ // second_pass_log is checked against its own default rather than the default
+ // of two_pass_output, so the comparison stays correct even if the two
+ // defaults ever diverge.
+ static void destroy_extra_config(struct av1_extracfg *extra_cfg) {
+#if CONFIG_TUNE_VMAF
+   check_and_free_string(default_extra_cfg.vmaf_model_path,
+                         &extra_cfg->vmaf_model_path);
+#endif
+   check_and_free_string(default_extra_cfg.two_pass_output,
+                         &extra_cfg->two_pass_output);
+   check_and_free_string(default_extra_cfg.second_pass_log,
+                         &extra_cfg->second_pass_log);
+   check_and_free_string(default_extra_cfg.partition_info_path,
+                         &extra_cfg->partition_info_path);
+   check_and_free_string(default_extra_cfg.rate_distribution_info,
+                         &extra_cfg->rate_distribution_info);
+   check_and_free_string(default_extra_cfg.film_grain_table_filename,
+                         &extra_cfg->film_grain_table_filename);
+ }
+
+ // Releases everything owned by the encoder's private context, then the
+ // context itself. Always returns AOM_CODEC_OK.
+ //
+ // Order of teardown: the compressed-data buffer and the extra-config
+ // strings; then, if a primary compressor exists, the per-frame parallel
+ // output buffers, optional statistics printing, every parallel compressor
+ // context (sharing ctx->buffer_pool), the look-ahead compressor with its own
+ // pool, and the primary compressor; finally the first-pass stats buffers and
+ // |ctx|.
+ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+   free(ctx->cx_data);
+   destroy_extra_config(&ctx->extra_cfg);
+
+   AV1_PRIMARY *const primary = ctx->ppi;
+   if (primary != NULL) {
+     // free(NULL) is a no-op, so unallocated slots need no special casing.
+     for (int slot = 0; slot < MAX_PARALLEL_FRAMES - 1; ++slot) {
+       free(primary->parallel_frames_data[slot].cx_data);
+     }
+#if CONFIG_ENTROPY_STATS
+     print_entropy_stats(primary);
+#endif
+#if CONFIG_INTERNAL_STATS
+     print_internal_stats(primary);
+#endif
+
+     for (int slot = 0; slot < MAX_PARALLEL_FRAMES; ++slot) {
+       av1_destroy_context_and_bufferpool(primary->parallel_cpi[slot],
+                                          &ctx->buffer_pool);
+     }
+     primary->cpi = NULL;
+
+     if (primary->cpi_lap != NULL) {
+       av1_destroy_context_and_bufferpool(primary->cpi_lap,
+                                          &ctx->buffer_pool_lap);
+     }
+     av1_remove_primary_compressor(primary);
+   }
+
+   destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
+   aom_free(ctx);
+   return AOM_CODEC_OK;
+ }
+
+ // Builds the packet flags for an encoded frame.
+ //
+ // The library flags are placed in the result shifted left by 16 bits. On top
+ // of that the matching AOM_FRAME_IS_* bit is set for each of the key,
+ // intra-only, switch and error-resilient library flags, and
+ // AOM_FRAME_IS_DROPPABLE is set when cpi->droppable is non-zero.
+ static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+                                                    unsigned int lib_flags) {
+   aom_codec_frame_flags_t pkt_flags = lib_flags << 16;
+
+   if (lib_flags & FRAMEFLAGS_KEY) {
+     pkt_flags |= AOM_FRAME_IS_KEY;
+   }
+   if (lib_flags & FRAMEFLAGS_INTRAONLY) {
+     pkt_flags |= AOM_FRAME_IS_INTRAONLY;
+   }
+   if (lib_flags & FRAMEFLAGS_SWITCH) {
+     pkt_flags |= AOM_FRAME_IS_SWITCH;
+   }
+   if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT) {
+     pkt_flags |= AOM_FRAME_IS_ERROR_RESILIENT;
+   }
+   if (cpi->droppable) {
+     pkt_flags |= AOM_FRAME_IS_DROPPABLE;
+   }
+
+   return pkt_flags;
+ }
+
+ // Returns the border, in pixels, to use for source frame buffers.
+ //
+ // Outside REALTIME mode, or whenever av1_is_resize_needed() reports that
+ // resizing is needed, the configured oxcf.border_in_pixels is returned
+ // unchanged. Otherwise the border is derived from the padding needed to
+ // align the frame width and height to the superblock size: each padding is
+ // rounded up to a multiple of 32, and the larger of the two is returned,
+ // with a floor of 32.
+ static INLINE int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) {
+   const int is_realtime = (cpi->oxcf.mode == REALTIME);
+   if (!is_realtime || av1_is_resize_needed(&cpi->oxcf))
+     return cpi->oxcf.border_in_pixels;
+
+   const int frame_width = cpi->oxcf.frm_dim_cfg.width;
+   const int frame_height = cpi->oxcf.frm_dim_cfg.height;
+   const int sb_log2 = mi_size_wide_log2[sb_size] + MI_SIZE_LOG2;
+
+   const int width_padding =
+       ALIGN_POWER_OF_TWO(frame_width, sb_log2) - frame_width;
+   const int height_padding =
+       ALIGN_POWER_OF_TWO(frame_height, sb_log2) - frame_height;
+
+   // Align the border pixels to a multiple of 32.
+   const int border_w = ALIGN_POWER_OF_TWO(width_padding, 5);
+   const int border_h = ALIGN_POWER_OF_TWO(height_padding, 5);
+
+   const int larger_border = AOMMAX(border_w, border_h);
+   return AOMMAX(larger_border, 32);
+ }
+
+ // TODO(Mufaddal): Check feasibility of abstracting functions related to LAP
+ // into a separate function.
+ //
+ // Feeds one source image to the encoder (or flushes it when |img| is NULL)
+ // and collects the resulting compressed data.
+ //
+ //   ctx       - encoder private context.
+ //   img       - source frame, or NULL to flush.
+ //   pts       - presentation timestamp of |img| in timebase units. The pts
+ //               of the first image seen is remembered as an offset and
+ //               subtracted from every input pts.
+ //   duration  - frame duration in timebase units.
+ //   enc_flags - per-frame encoding flags.
+ //
+ // When a visible frame is produced, a single AOM_CODEC_CX_FRAME_PKT holding
+ // it and any preceding invisible frames is added to ctx->pkt_list.
+ //
+ // Returns AOM_CODEC_INVALID_PARAM if no compressor exists (or look-ahead is
+ // enabled without a LAP compressor in one-pass mode), AOM_CODEC_MEM_ERROR if
+ // an output buffer cannot be allocated, the validate_img() error for a bad
+ // image, or the error recorded via update_error_state() when the encoder
+ // reports a failure through longjmp().
+ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,
+ aom_codec_pts_t pts,
+ unsigned long duration,
+ aom_enc_frame_flags_t enc_flags) {
+ const size_t kMinCompressedSize = 8192;
+ // NOTE(review): res, ptsvol and flags are declared volatile, presumably so
+ // their values are well-defined after a longjmp() back to the setjmp() below
+ // -- confirm.
+ volatile aom_codec_err_t res = AOM_CODEC_OK;
+ AV1_PRIMARY *const ppi = ctx->ppi;
+ volatile aom_codec_pts_t ptsvol = pts;
+ AV1_COMP_DATA cpi_data = { 0 };
+
+ cpi_data.timestamp_ratio = &ctx->timestamp_ratio;
+ cpi_data.flush = !img;
+ // LAP context
+ AV1_COMP *cpi_lap = ppi->cpi_lap;
+ if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
+ ppi->cpi->last_coded_width = ppi->cpi->oxcf.frm_dim_cfg.width;
+ ppi->cpi->last_coded_height = ppi->cpi->oxcf.frm_dim_cfg.height;
+
+ if (ppi->lap_enabled && cpi_lap == NULL &&
+ ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS)
+ return AOM_CODEC_INVALID_PARAM;
+
+ if (img != NULL) {
+ res = validate_img(ctx, img);
+ if (res == AOM_CODEC_OK) {
+ // NOTE(review): this product is not checked for overflow; presumably the
+ // configured dimensions are bounded elsewhere -- confirm.
+ const size_t uncompressed_frame_sz =
+ ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8;
+
+ // Due to the presence of no-show frames, the ctx->cx_data buffer holds
+ // compressed data corresponding to multiple frames. As no-show frames are
+ // not possible for all intra frame encoding with no forward key frames,
+ // the buffer is allocated with a smaller size in this case.
+ //
+ // For pseudo random input, the compressed frame size is seen to exceed
+ // the uncompressed frame size, but is less than 2 times the uncompressed
+ // frame size. Hence the size of the buffer is chosen as 2 times the
+ // uncompressed frame size.
+ //
+ // NOTE(review): the "2 times" sizing described above applies only to the
+ // all-intra branch below; the default multiplier is 8.
+ int multiplier = 8;
+ if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 &&
+ !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled)
+ multiplier = 2;
+ size_t data_sz = uncompressed_frame_sz * multiplier;
+ if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+ // (Re)allocate the output buffer when it is missing or too small. The old
+ // contents are discarded, not copied.
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ ctx->cx_data_sz = 0;
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+ // Lazily allocate one output buffer for each additional frame-parallel
+ // context; buffers that already exist are kept as they are.
+ for (int i = 0; i < ppi->num_fp_contexts - 1; i++) {
+ if (ppi->parallel_frames_data[i].cx_data == NULL) {
+ ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz;
+ ppi->parallel_frames_data[i].frame_display_order_hint = -1;
+ ppi->parallel_frames_data[i].frame_size = 0;
+ ppi->parallel_frames_data[i].cx_data =
+ (unsigned char *)malloc(ppi->parallel_frames_data[i].cx_data_sz);
+ if (ppi->parallel_frames_data[i].cx_data == NULL) {
+ ppi->parallel_frames_data[i].cx_data_sz = 0;
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+ }
+ }
+ }
+
+ aom_codec_pkt_list_init(&ctx->pkt_list);
+
+ volatile aom_enc_frame_flags_t flags = enc_flags;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(ppi->error.jmp)) {
+ ppi->error.setjmp = 0;
+ res = update_error_state(ctx, &ppi->error);
+ return res;
+ }
+ ppi->error.setjmp = 1;
+
+ if (ppi->use_svc && ppi->cpi->svc.use_flexible_mode == 0 && flags == 0)
+ av1_set_svc_fixed_mode(ppi->cpi);
+
+ // Note(yunqing): While applying encoding flags, always start from enabling
+ // all, and then modifying according to the flags. Previous frame's flags are
+ // overwritten.
+ av1_apply_encoding_flags(ppi->cpi, flags);
+ if (cpi_lap != NULL) {
+ av1_apply_encoding_flags(cpi_lap, flags);
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ aom_init_vmaf_model(&ppi->cpi->vmaf_info.vmaf_model,
+ ppi->cpi->oxcf.tune_cfg.vmaf_model_path);
+ }
+#endif
+
+ // Handle fixed keyframe intervals
+ if (is_stat_generation_stage(ppi->cpi) || is_one_pass_rt_params(ppi->cpi)) {
+ if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+ ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+ // The counter advances only for spatial layer 0; once it exceeds the
+ // fixed distance a key frame is forced and the counter restarts at 1.
+ if (ppi->cpi->common.spatial_layer_id == 0 &&
+ ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+ flags |= AOM_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+ }
+
+ if (res == AOM_CODEC_OK) {
+ AV1_COMP *cpi = ppi->cpi;
+
+ // Set up internal flags
+ if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1;
+
+ if (img != NULL) {
+ // The first pts seen becomes the offset; all timestamps handed to the
+ // encoder are relative to it, and it is added back on output packets.
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = ptsvol;
+ ctx->pts_offset_initialized = 1;
+ }
+ ptsvol -= ctx->pts_offset;
+ int64_t src_time_stamp =
+ timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol);
+ int64_t src_end_time_stamp =
+ timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration);
+
+ YV12_BUFFER_CONFIG sd;
+ res = image2yuvconfig(img, &sd);
+ // When generating a monochrome stream, make |sd| a monochrome image.
+ if (ctx->cfg.monochrome) {
+ sd.u_buffer = sd.v_buffer = NULL;
+ sd.uv_stride = 0;
+ sd.monochrome = 1;
+ }
+ int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+ int subsampling_x = sd.subsampling_x;
+ int subsampling_y = sd.subsampling_y;
+
+ // Create the lookahead queue the first time an image is submitted.
+ if (!ppi->lookahead) {
+ int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.gf_cfg.lag_in_frames
+ : cpi->oxcf.gf_cfg.lag_in_frames;
+ AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const BLOCK_SIZE sb_size = av1_select_sb_size(
+ oxcf, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ ppi->number_spatial_layers);
+ oxcf->border_in_pixels =
+ av1_get_enc_border_size(av1_is_resize_needed(oxcf),
+ oxcf->kf_cfg.key_freq_max == 0, sb_size);
+ // Propagate the chosen border size to every frame-parallel context.
+ for (int i = 0; i < ppi->num_fp_contexts; i++) {
+ ppi->parallel_cpi[i]->oxcf.border_in_pixels = oxcf->border_in_pixels;
+ }
+
+ const int src_border_in_pixels = get_src_border_in_pixels(cpi, sb_size);
+ ppi->lookahead = av1_lookahead_init(
+ cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+ subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames,
+ src_border_in_pixels, cpi->common.features.byte_alignment,
+ ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0),
+ cpi->image_pyramid_levels);
+ }
+ if (!ppi->lookahead)
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+ for (int i = 0; i < ppi->num_fp_contexts; i++) {
+ aom_codec_err_t err =
+ av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth,
+ subsampling_x, subsampling_y);
+ if (err != AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, err,
+ "av1_check_initial_width() failed");
+ }
+ }
+ if (cpi_lap != NULL) {
+ aom_codec_err_t err = av1_check_initial_width(
+ cpi_lap, use_highbitdepth, subsampling_x, subsampling_y);
+ if (err != AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, err,
+ "av1_check_initial_width() failed");
+ }
+ }
+
+ // Store the original flags in to the frame buffer. Will extract the
+ // key frame flag when we actually encode this frame.
+ if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+ src_time_stamp, src_end_time_stamp)) {
+ res = update_error_state(ctx, cpi->common.error);
+ }
+ ctx->next_frame_flags = 0;
+ }
+
+ cpi_data.cx_data = ctx->cx_data;
+ cpi_data.cx_data_sz = ctx->cx_data_sz;
+
+ /* Any pending invisible frames? */
+ if (ctx->pending_cx_data_sz) {
+ cpi_data.cx_data += ctx->pending_cx_data_sz;
+ cpi_data.cx_data_sz -= ctx->pending_cx_data_sz;
+
+ /* TODO: this is a minimal check, the underlying codec doesn't respect
+ * the buffer size anyway.
+ */
+ if (cpi_data.cx_data_sz < ctx->cx_data_sz / 2) {
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
+ "Compressed data buffer too small");
+ }
+ }
+
+ int is_frame_visible = 0;
+ int has_no_show_keyframe = 0;
+ int num_workers = 0;
+
+ if (cpi->oxcf.pass == AOM_RC_FIRST_PASS) {
+#if !CONFIG_REALTIME_ONLY
+ num_workers = ppi->p_mt_info.num_mod_workers[MOD_FP] =
+ av1_fp_compute_num_enc_workers(cpi);
+#endif
+ } else {
+ av1_compute_num_workers_for_mt(cpi);
+ num_workers = av1_get_max_num_workers(cpi);
+ }
+ // Grow the worker pool when more workers are needed than currently exist:
+ // existing workers and their thread data are torn down and recreated.
+ if (num_workers > 1 && ppi->p_mt_info.num_workers < num_workers) {
+ // Obtain the maximum no. of frames that can be supported in a parallel
+ // encode set.
+ if (is_stat_consumption_stage(cpi)) {
+ ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
+ }
+ if (ppi->p_mt_info.num_workers > 0) {
+ av1_terminate_workers(ppi);
+ free_thread_data(ppi);
+ aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
+ aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ aom_free(ppi->parallel_cpi[j]->td.tctx);
+ ppi->parallel_cpi[j]->td.tctx = NULL;
+ }
+ }
+ av1_create_workers(ppi, num_workers);
+ av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS);
+ }
+
+ // Re-allocate thread data if workers for encoder multi-threading stage
+ // exceeds prev_num_enc_workers.
+ const int num_enc_workers =
+ av1_get_num_mod_workers_for_alloc(&ppi->p_mt_info, MOD_ENC);
+ if (ppi->p_mt_info.prev_num_enc_workers < num_enc_workers &&
+ num_enc_workers <= ppi->p_mt_info.num_workers) {
+ free_thread_data(ppi);
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ aom_free(ppi->parallel_cpi[j]->td.tctx);
+ ppi->parallel_cpi[j]->td.tctx = NULL;
+ }
+ av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS);
+ }
+
+ for (int i = 0; i < ppi->num_fp_contexts; i++) {
+ av1_init_frame_mt(ppi, ppi->parallel_cpi[i]);
+ }
+ if (cpi_lap != NULL) {
+ av1_init_frame_mt(ppi, cpi_lap);
+ }
+#if CONFIG_MULTITHREAD
+ if (ppi->p_mt_info.num_workers > 1) {
+ for (int i = 0; i < ppi->num_fp_contexts; i++) {
+ av1_init_mt_sync(ppi->parallel_cpi[i],
+ ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS);
+ }
+ if (cpi_lap != NULL) {
+ av1_init_mt_sync(cpi_lap, 1);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ // Call for LAP stage
+ if (cpi_lap != NULL) {
+ AV1_COMP_DATA cpi_lap_data = { 0 };
+ cpi_lap_data.flush = !img;
+ cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio;
+ const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data);
+ if (status > AOM_CODEC_OK) {
+ aom_internal_error_copy(&ppi->error, cpi_lap->common.error);
+ }
+ av1_post_encode_updates(cpi_lap, &cpi_lap_data);
+ }
+
+ // Recalculate the maximum number of frames that can be encoded in
+ // parallel at the beginning of sub gop.
+ if (is_stat_consumption_stage(cpi) && ppi->gf_group.size > 0 &&
+ cpi->gf_frame_index == ppi->gf_group.size) {
+ ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf);
+ }
+
+ // Get the next visible frame. Invisible frames get packed with the next
+ // visible frame.
+ // The loop also stops once less than half of the output buffer remains.
+ while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
+ int simulate_parallel_frame = 0;
+ // status stays -1 only if no encode path below assigns it; that case
+ // terminates the loop.
+ int status = -1;
+ cpi->do_frame_data_update = true;
+ cpi->ref_idx_to_skip = INVALID_IDX;
+ cpi->ref_refresh_index = INVALID_IDX;
+ cpi->refresh_idx_available = false;
+
+#if CONFIG_FPMT_TEST
+ simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+ if (simulate_parallel_frame) {
+ if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) {
+ if (cpi->gf_frame_index < ppi->gf_group.size) {
+ calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index,
+ &cpi->do_frame_data_update);
+ }
+ }
+ status = av1_get_compressed_data(cpi, &cpi_data);
+ }
+
+#endif // CONFIG_FPMT_TEST
+ if (!simulate_parallel_frame) {
+ // Dispatch on the frame's parallel level: 0 encodes the frame directly,
+ // 1 launches a parallel encode set, and any other value fetches the
+ // data of a frame from that set (which also replaces |cpi|).
+ if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ status = av1_get_compressed_data(cpi, &cpi_data);
+ } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 1) {
+ // In case of an error, longjmp() would be invoked and hence "status"
+ // is set to AOM_CODEC_OK here.
+ av1_compress_parallel_frames(ppi, &cpi_data);
+ status = AOM_CODEC_OK;
+ } else {
+ // No possibility of failures from this function and hence "status" is
+ // set to AOM_CODEC_OK here.
+ cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data);
+ status = AOM_CODEC_OK;
+ }
+ }
+ if (status == -1) break;
+ if (status != AOM_CODEC_OK) {
+ aom_internal_error_copy(&ppi->error, cpi->common.error);
+ }
+ if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) {
+ av1_init_sc_decisions(ppi);
+ }
+
+ ppi->seq_params_locked = 1;
+ av1_post_encode_updates(cpi, &cpi_data);
+
+#if CONFIG_ENTROPY_STATS
+ if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame)
+ av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts);
+#endif
+#if CONFIG_INTERNAL_STATS
+ if (ppi->cpi->oxcf.pass != 1) {
+ ppi->total_time_compress_data += cpi->time_compress_data;
+ ppi->total_recode_hits += cpi->frame_recode_hits;
+ ppi->total_bytes += cpi->bytes;
+ for (int i = 0; i < MAX_MODES; i++) {
+ ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
+ }
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+ // Nothing was produced in this iteration; try again.
+ if (!cpi_data.frame_size) continue;
+ assert(cpi_data.cx_data != NULL && cpi_data.cx_data_sz != 0);
+ // A temporal delimiter is written only for spatial layer 0 and only when
+ // no data is pending in the output buffer yet.
+ const int write_temporal_delimiter =
+ !cpi->common.spatial_layer_id && !ctx->pending_cx_data_sz;
+
+ if (write_temporal_delimiter) {
+ uint32_t obu_header_size = 1;
+ const uint32_t obu_payload_size = 0;
+ const size_t length_field_size =
+ aom_uleb_size_in_bytes(obu_payload_size);
+
+ // Shift the frame data forward to make room for the delimiter's header
+ // and size field, then write both at the start of the buffer.
+ const size_t move_offset = obu_header_size + length_field_size;
+ memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size);
+ obu_header_size =
+ av1_write_obu_header(&ppi->level_params, &cpi->frame_header_count,
+ OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data);
+
+ // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+ ctx->cx_data) != AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ }
+
+ cpi_data.frame_size +=
+ obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ if (ctx->oxcf.save_as_annexb) {
+ size_t curr_frame_size = cpi_data.frame_size;
+ if (av1_convert_sect5obus_to_annexb(cpi_data.cx_data,
+ &curr_frame_size) != AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ }
+ cpi_data.frame_size = curr_frame_size;
+
+ // B_PRIME (add frame size)
+ const size_t length_field_size =
+ aom_uleb_size_in_bytes(cpi_data.frame_size);
+ memmove(cpi_data.cx_data + length_field_size, cpi_data.cx_data,
+ cpi_data.frame_size);
+ if (av1_write_uleb_obu_size(0, (uint32_t)cpi_data.frame_size,
+ cpi_data.cx_data) != AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ }
+ cpi_data.frame_size += length_field_size;
+ }
+
+ // Account for this frame and advance the write position past it.
+ ctx->pending_cx_data_sz += cpi_data.frame_size;
+
+ cpi_data.cx_data += cpi_data.frame_size;
+ cpi_data.cx_data_sz -= cpi_data.frame_size;
+
+ is_frame_visible = cpi->common.show_frame;
+
+ has_no_show_keyframe |=
+ (!is_frame_visible &&
+ cpi->common.current_frame.frame_type == KEY_FRAME);
+ }
+ if (is_frame_visible) {
+ // Add the frame packet to the list of returned packets.
+ aom_codec_cx_pkt_t pkt;
+
+ // decrement frames_left counter
+ ppi->frames_left = AOMMAX(0, ppi->frames_left - 1);
+ if (ctx->oxcf.save_as_annexb) {
+ // B_PRIME (add TU size)
+ size_t tu_size = ctx->pending_cx_data_sz;
+ const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+ memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size);
+ if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
+ }
+ ctx->pending_cx_data_sz += length_field_size;
+ }
+
+ pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+
+ // The packet covers everything accumulated in the buffer, i.e. the
+ // visible frame plus any invisible frames packed before it.
+ pkt.data.frame.buf = ctx->cx_data;
+ pkt.data.frame.sz = ctx->pending_cx_data_sz;
+ pkt.data.frame.partition_id = -1;
+ pkt.data.frame.vis_frame_size = cpi_data.frame_size;
+
+ pkt.data.frame.pts = ticks_to_timebase_units(cpi_data.timestamp_ratio,
+ cpi_data.ts_frame_start) +
+ ctx->pts_offset;
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, cpi_data.lib_flags);
+ if (has_no_show_keyframe) {
+ // If one of the invisible frames in the packet is a keyframe, set
+ // the delayed random access point flag.
+ pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+ }
+ pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
+ cpi_data.timestamp_ratio,
+ cpi_data.ts_frame_end - cpi_data.ts_frame_start);
+
+ aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ ctx->pending_cx_data_sz = 0;
+ }
+ }
+
+ ppi->error.setjmp = 0;
+ return res;
+ }
+
+ // Returns the next packet from the encoder's output packet list, using
+ // |iter| as the iteration cursor. The lookup itself is delegated to
+ // aom_codec_pkt_list_get().
+ static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+                                                     aom_codec_iter_t *iter) {
+   const aom_codec_cx_pkt_t *const next_pkt =
+       aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+   return next_pkt;
+ }
+
+ // Control handler that replaces an encoder reference frame.
+ //
+ // The argument is an av1_ref_frame_t pointer. A NULL pointer yields
+ // AOM_CODEC_INVALID_PARAM. Otherwise the supplied image is converted with
+ // image2yuvconfig() and passed to av1_set_reference_enc() for index
+ // frame->idx; the results of both calls are ignored and AOM_CODEC_OK is
+ // returned.
+ static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+   av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+   if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+   YV12_BUFFER_CONFIG yv12;
+   image2yuvconfig(&ref->img, &yv12);
+   av1_set_reference_enc(ctx->ppi->cpi, ref->idx, &yv12);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler that copies an encoder reference frame into a
+ // caller-supplied image.
+ //
+ // Returns AOM_CODEC_INCAPABLE, without reading the argument, when
+ // skip_postproc_filtering is enabled. A NULL av1_ref_frame_t pointer yields
+ // AOM_CODEC_INVALID_PARAM. Otherwise the caller's image is converted with
+ // image2yuvconfig() and handed to av1_copy_reference_enc() for index
+ // frame->idx; the results of both calls are ignored and AOM_CODEC_OK is
+ // returned.
+ static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+   AV1_COMP *const compressor = ctx->ppi->cpi;
+   if (compressor->oxcf.algo_cfg.skip_postproc_filtering)
+     return AOM_CODEC_INCAPABLE;
+
+   av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+   if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+   YV12_BUFFER_CONFIG yv12;
+   image2yuvconfig(&ref->img, &yv12);
+   av1_copy_reference_enc(ctx->ppi->cpi, ref->idx, &yv12);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler that exposes an encoder reference frame as an image.
+ //
+ // Returns AOM_CODEC_INCAPABLE, without reading the argument, when
+ // skip_postproc_filtering is enabled; AOM_CODEC_INVALID_PARAM for a NULL
+ // av1_ref_frame_t pointer; AOM_CODEC_ERROR when get_ref_frame() has no buffer
+ // for frame->idx. On success frame->img is filled in from that buffer with
+ // yuvconfig2image() and AOM_CODEC_OK is returned.
+ static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+   if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering)
+     return AOM_CODEC_INCAPABLE;
+
+   av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+   if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+   YV12_BUFFER_CONFIG *const ref_buf =
+       get_ref_frame(&ctx->ppi->cpi->common, ref->idx);
+   if (ref_buf == NULL) return AOM_CODEC_ERROR;
+
+   yuvconfig2image(&ref->img, ref_buf, NULL);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler that describes the last shown frame in a caller-supplied
+ // aom_image_t.
+ //
+ // Returns AOM_CODEC_INVALID_PARAM for a NULL image pointer and
+ // AOM_CODEC_ERROR when av1_get_last_show_frame() returns non-zero. On success
+ // the image is filled in with yuvconfig2image() and AOM_CODEC_OK is returned.
+ static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+   aom_image_t *const out_img = va_arg(args, aom_image_t *);
+   if (out_img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+   YV12_BUFFER_CONFIG last_shown;
+   if (av1_get_last_show_frame(ctx->ppi->cpi, &last_shown) != 0)
+     return AOM_CODEC_ERROR;
+
+   yuvconfig2image(out_img, &last_shown, NULL);
+   return AOM_CODEC_OK;
+ }
+
+ // Control handler that copies the last shown frame into a caller-supplied
+ // aom_image_t.
+ //
+ // Returns AOM_CODEC_INVALID_PARAM for a NULL image pointer and
+ // AOM_CODEC_ERROR when av1_get_last_show_frame() returns non-zero. Otherwise
+ // the caller's image is converted with image2yuvconfig() and the result of
+ // av1_copy_new_frame_enc() is returned.
+ static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+   aom_image_t *const out_img = va_arg(args, aom_image_t *);
+   if (out_img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+   YV12_BUFFER_CONFIG last_shown;
+   if (av1_get_last_show_frame(ctx->ppi->cpi, &last_shown) != 0)
+     return AOM_CODEC_ERROR;
+
+   YV12_BUFFER_CONFIG destination;
+   image2yuvconfig(out_img, &destination);
+   return av1_copy_new_frame_enc(&ctx->ppi->cpi->common, &last_shown,
+                                 &destination);
+ }
+
+// Returns a pointer to ctx->preview_img after refreshing it from the frame
+// supplied by av1_get_preview_raw_frame(), or NULL when that call reports a
+// non-zero status.
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG raw;
+  if (av1_get_preview_raw_frame(ctx->ppi->cpi, &raw) != 0) return NULL;
+
+  yuvconfig2image(&ctx->preview_img, &raw, NULL);
+  return &ctx->preview_img;
+}
+
+// Control handler: forwards the integer flag argument to
+// av1_use_as_reference(), which operates on the external ref_frame_flags.
+// Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const int flags = va_arg(args, int);
+  av1_use_as_reference(&ctx->ppi->cpi->ext_flags.ref_frame_flags, flags);
+  return AOM_CODEC_OK;
+}
+
+// Control handler placeholder: the ROI map control is not implemented, so the
+// arguments are ignored and every call is rejected.
+static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  // TODO(yaowu): Need to re-implement and test for AV1.
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler: passes the caller's active map and its dimensions to
+// av1_set_active_map().
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument or when the helper
+// returns a non-zero status; AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const active = va_arg(args, aom_active_map_t *);
+  if (!active) return AOM_CODEC_INVALID_PARAM;
+
+  if (av1_set_active_map(ctx->ppi->cpi, active->active_map, (int)active->rows,
+                         (int)active->cols)) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler: asks av1_get_active_map() to fill the caller's map buffer
+// using the dimensions stored in the same structure.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument or when the helper
+// returns a non-zero status; AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const active = va_arg(args, aom_active_map_t *);
+  if (!active) return AOM_CODEC_INVALID_PARAM;
+
+  if (av1_get_active_map(ctx->ppi->cpi, active->active_map, (int)active->rows,
+                         (int)active->cols)) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler: applies the requested horizontal/vertical scaling modes via
+// av1_set_internal_size() and then re-runs av1_check_fpmt_config().
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument or when
+// av1_set_internal_size() returns a non-zero status.
+static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_scaling_mode_t *const scaling = va_arg(args, aom_scaling_mode_t *);
+  if (!scaling) return AOM_CODEC_INVALID_PARAM;
+
+  const int status = av1_set_internal_size(
+      &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params,
+      scaling->h_scaling_mode, scaling->v_scaling_mode);
+  // The frame-parallel check runs regardless of whether the resize succeeded.
+  av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
+  if (status != 0) return AOM_CODEC_INVALID_PARAM;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the requested spatial layer id in the common state.
+// Returns AOM_CODEC_INVALID_PARAM unless the id lies in
+// [0, MAX_NUM_SPATIAL_LAYERS).
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int spatial_layer_id = va_arg(args, int);
+  // The argument is a signed int, so a negative id must be rejected as well as
+  // one past the upper bound.
+  if (spatial_layer_id < 0 || spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->ppi->cpi->common.spatial_layer_id = spatial_layer_id;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the number of spatial layers in the primary encoder
+// state. Returns AOM_CODEC_INVALID_PARAM unless the count lies in
+// [1, MAX_NUM_SPATIAL_LAYERS].
+static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  const int number_spatial_layers = va_arg(args, int);
+  // A zero or negative count is never meaningful; reject it here so an invalid
+  // value is never stored in the layer count that other code iterates over.
+  if (number_spatial_layers < 1 ||
+      number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->ppi->number_spatial_layers = number_spatial_layers;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the spatial and temporal layer ids from the caller's
+// aom_svc_layer_id_t into both the common state and the SVC state.
+// Returns AOM_CODEC_INVALID_PARAM when the argument pointer is NULL.
+static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *);
+  // Reject a NULL argument instead of dereferencing it, matching the other
+  // pointer-taking control handlers in this file.
+  if (data == NULL) return AOM_CODEC_INVALID_PARAM;
+  ctx->ppi->cpi->common.spatial_layer_id = data->spatial_layer_id;
+  ctx->ppi->cpi->common.temporal_layer_id = data->temporal_layer_id;
+  ctx->ppi->cpi->svc.spatial_layer_id = data->spatial_layer_id;
+  ctx->ppi->cpi->svc.temporal_layer_id = data->temporal_layer_id;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: installs the caller's SVC (layered coding) parameters.
+//
+// The layer counts are stored in both the primary and per-encoder SVC state.
+// When more than one layer is requested, the per-layer quantizer limits,
+// scaling factors, target bitrates and framerate factors are copied into the
+// layer contexts, and the overall target bandwidth becomes the sum of the
+// target bitrates of the top temporal layer of every spatial layer.
+//
+// Returns:
+//   AOM_CODEC_INVALID_PARAM  NULL argument, layer counts out of range, or
+//                            invalid per-layer quantizer limits.
+//   AOM_CODEC_MEM_ERROR      av1_alloc_layer_context() failed.
+//   AOM_CODEC_OK             otherwise.
+static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  AV1_PRIMARY *const ppi = ctx->ppi;
+  AV1_COMP *const cpi = ppi->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncoderConfig *oxcf = &cpi->oxcf;
+  aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
+  int64_t target_bandwidth = 0;
+
+  if (params == NULL) return AOM_CODEC_INVALID_PARAM;
+  // Validate the layer counts before touching any encoder state: they bound
+  // every index into the fixed-size per-layer arrays of *params below.
+  // NOTE(review): assumes MAX_NUM_TEMPORAL_LAYERS is visible here alongside
+  // MAX_NUM_SPATIAL_LAYERS - confirm against the included headers.
+  if (params->number_spatial_layers < 1 ||
+      params->number_spatial_layers > MAX_NUM_SPATIAL_LAYERS ||
+      params->number_temporal_layers < 1 ||
+      params->number_temporal_layers > MAX_NUM_TEMPORAL_LAYERS) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  ppi->number_spatial_layers = params->number_spatial_layers;
+  ppi->number_temporal_layers = params->number_temporal_layers;
+  cpi->svc.number_spatial_layers = params->number_spatial_layers;
+  cpi->svc.number_temporal_layers = params->number_temporal_layers;
+  if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) {
+    unsigned int sl, tl;
+    ctx->ppi->use_svc = 1;
+    const int num_layers =
+        ppi->number_spatial_layers * ppi->number_temporal_layers;
+    // Each layer's quantizer range must satisfy 0 <= min <= max <= 63.
+    for (int layer = 0; layer < num_layers; ++layer) {
+      if (params->max_quantizers[layer] > 63 ||
+          params->min_quantizers[layer] < 0 ||
+          params->min_quantizers[layer] > params->max_quantizers[layer]) {
+        return AOM_CODEC_INVALID_PARAM;
+      }
+    }
+    if (!av1_alloc_layer_context(cpi, num_layers)) return AOM_CODEC_MEM_ERROR;
+
+    for (sl = 0; sl < ppi->number_spatial_layers; ++sl) {
+      for (tl = 0; tl < ppi->number_temporal_layers; ++tl) {
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers);
+        LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+        lc->max_q = params->max_quantizers[layer];
+        lc->min_q = params->min_quantizers[layer];
+        // Scaling factors are clamped to at least 1.
+        lc->scaling_factor_num = AOMMAX(1, params->scaling_factor_num[sl]);
+        lc->scaling_factor_den = AOMMAX(1, params->scaling_factor_den[sl]);
+        const int layer_target_bitrate = params->layer_target_bitrate[layer];
+        // The stored bitrate is the caller's value times 1000; saturate at
+        // INT_MAX instead of overflowing the multiplication.
+        if (layer_target_bitrate > INT_MAX / 1000) {
+          lc->layer_target_bitrate = INT_MAX;
+        } else {
+          lc->layer_target_bitrate = 1000 * layer_target_bitrate;
+        }
+        lc->framerate_factor = params->framerate_factor[tl];
+        // Only the top temporal layer of each spatial layer contributes to the
+        // overall target bandwidth.
+        if (tl == ppi->number_temporal_layers - 1)
+          target_bandwidth += lc->layer_target_bitrate;
+      }
+    }
+    if (cm->current_frame.frame_number == 0) {
+      if (!cpi->ppi->seq_params_locked) {
+        SequenceHeader *const seq_params = &ppi->seq_params;
+        seq_params->operating_points_cnt_minus_1 =
+            ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
+        av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
+      }
+      av1_init_layer_context(cpi);
+    }
+    oxcf->rc_cfg.target_bandwidth = target_bandwidth;
+    set_primary_rc_buffer_sizes(oxcf, cpi->ppi);
+    av1_update_layer_context_change_config(cpi, target_bandwidth);
+    check_reset_rc_flag(cpi);
+  }
+  av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the caller's per-frame reference configuration
+// (reference[], ref_idx[] and refresh[]) into the RTC reference state, then
+// sets use_flexible_mode and clears ksvc_fixed_mode in the SVC state.
+// Returns AOM_CODEC_INVALID_PARAM when the argument is NULL or any ref_idx
+// entry lies outside [0, REF_FRAMES); no state is modified in that case.
+static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_svc_ref_frame_config_t *const data =
+      va_arg(args, aom_svc_ref_frame_config_t *);
+  if (data == NULL) return AOM_CODEC_INVALID_PARAM;
+  // Validate everything before committing anything.
+  // NOTE(review): ref_idx is presumably a slot index into the REF_FRAMES-entry
+  // reference buffer map - confirm against the consumers of rtc_ref.ref_idx.
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    if (data->ref_idx[i] < 0 || data->ref_idx[i] >= REF_FRAMES)
+      return AOM_CODEC_INVALID_PARAM;
+  }
+  cpi->ppi->rtc_ref.set_ref_frame_config = 1;
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    cpi->ppi->rtc_ref.reference[i] = data->reference[i];
+    cpi->ppi->rtc_ref.ref_idx[i] = data->ref_idx[i];
+  }
+  for (unsigned int i = 0; i < REF_FRAMES; ++i)
+    cpi->ppi->rtc_ref.refresh[i] = data->refresh[i];
+  cpi->svc.use_flexible_mode = 1;
+  cpi->svc.ksvc_fixed_mode = 0;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the three use_comp_pred flags supplied by the caller
+// into rtc_ref.ref_frame_comp[0..2].
+// Returns AOM_CODEC_INVALID_PARAM when the argument pointer is NULL.
+static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_svc_ref_frame_comp_pred_t *const data =
+      va_arg(args, aom_svc_ref_frame_comp_pred_t *);
+  // Reject a NULL argument instead of dereferencing it.
+  if (data == NULL) return AOM_CODEC_INVALID_PARAM;
+  cpi->ppi->rtc_ref.ref_frame_comp[0] = data->use_comp_pred[0];
+  cpi->ppi->rtc_ref.ref_frame_comp[1] = data->use_comp_pred[1];
+  cpi->ppi->rtc_ref.ref_frame_comp[2] = data->use_comp_pred[2];
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1E_SET_TUNE_CONTENT: updates `content` on a copy of
+// the extra configuration and submits the copy through update_extra_cfg().
+static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_CDF_UPDATE_MODE: updates `cdf_update_mode` on a
+// copy of the extra configuration and submits it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_COLOR_PRIMARIES: updates `color_primaries` on a
+// copy of the extra configuration and submits it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_TRANSFER_CHARACTERISTICS: updates
+// `transfer_characteristics` on a copy of the extra configuration and submits
+// it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.transfer_characteristics =
+      CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_MATRIX_COEFFICIENTS: updates
+// `matrix_coefficients` on a copy of the extra configuration and submits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_CHROMA_SAMPLE_POSITION: updates
+// `chroma_sample_position` on a copy of the extra configuration and submits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_chroma_sample_position(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.chroma_sample_position =
+      CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_COLOR_RANGE: updates `color_range` on a copy of
+// the extra configuration and submits it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler: reads a two-element int array {width, height} from the
+// caller and stores it as the render size on a copy of the extra
+// configuration, which is then submitted through update_extra_cfg().
+// Returns AOM_CODEC_INVALID_PARAM when the array pointer is NULL.
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  int *const render_size = va_arg(args, int *);
+  // Reject a NULL argument instead of dereferencing it.
+  if (render_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  extra_cfg.render_width = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+// Control handler for AV1E_SET_SUPERBLOCK_SIZE: updates `superblock_size` on a
+// copy of the extra configuration and submits it through update_extra_cfg().
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_CHROMA_SUBSAMPLING_X: updates
+// `chroma_subsampling_x` on a copy of the extra configuration and submits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Control handler for AV1E_SET_CHROMA_SUBSAMPLING_Y: updates
+// `chroma_subsampling_y` on a copy of the extra configuration and submits it
+// through update_extra_cfg().
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg updated = ctx->extra_cfg;
+
+  updated.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+  return update_extra_cfg(ctx, &updated);
+}
+
+// Sets a single encoder option given as a textual name/value pair.
+//
+// The pair is formatted as the command-line style string "--{name}={value}"
+// and matched against the argument definitions in g_av1_codec_arg_defs. The
+// first definition that matches decides which field of a working copy of the
+// extra configuration is updated; the tile_width / tile_height options write
+// directly into ctx->cfg instead. On success the working copy is submitted
+// through update_extra_cfg().
+//
+// Returns:
+//   AOM_CODEC_INVALID_PARAM  ctx, name or value is NULL; no option matched; or
+//                            a helper left a message in the error buffer.
+//   AOM_CODEC_MEM_ERROR      the temporary argument string could not be
+//                            allocated.
+//   other                    the error reported by allocate_and_set_string(),
+//                            or the result of update_extra_cfg().
+// Whenever an error message is available, ctx->base.err_detail points at it;
+// otherwise it is reset to NULL.
+static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx,
+                                          const char *name, const char *value) {
+  if (ctx == NULL || name == NULL || value == NULL)
+    return AOM_CODEC_INVALID_PARAM;
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  // Used to mock the argv with just one string "--{name}={value}"
+  char *argv[2] = { NULL, "" };
+  // Room for "--", "=", and the terminating NUL in addition to both strings.
+  size_t len = strlen(name) + strlen(value) + 4;
+  char *const err_string = ctx->ppi->error.detail;
+
+#if __STDC_VERSION__ >= 201112L
+  // We use the keyword _Static_assert because clang-cl does not allow the
+  // convenience macro static_assert to be used in function scope. See
+  // https://bugs.llvm.org/show_bug.cgi?id=48904.
+  _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN,
+                 "The size of the err_msg buffer for arg_match_helper must be "
+                 "at least ARG_ERR_MSG_MAX_LEN");
+#else
+  assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN);
+#endif
+
+  argv[0] = aom_malloc(len * sizeof(argv[0][0]));
+  if (!argv[0]) return AOM_CODEC_MEM_ERROR;
+  snprintf(argv[0], len, "--%s=%s", name, value);
+  struct arg arg;
+  aom_codec_err_t err = AOM_CODEC_OK;
+
+  // Cleared only by the final else branch when no definition matches.
+  int match = 1;
+  if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_keyframe_filtering,
+                       argv, err_string)) {
+    extra_cfg.enable_keyframe_filtering =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_gf_interval, argv,
+                              err_string)) {
+    extra_cfg.min_gf_interval = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_gf_interval, argv,
+                              err_string)) {
+    extra_cfg.max_gf_interval = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_min_pyr_height,
+                              argv, err_string)) {
+    extra_cfg.gf_min_pyr_height = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_max_pyr_height,
+                              argv, err_string)) {
+    extra_cfg.gf_max_pyr_height = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cpu_used_av1, argv,
+                              err_string)) {
+    extra_cfg.cpu_used = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_altref, argv,
+                              err_string)) {
+    extra_cfg.enable_auto_alt_ref = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.noise_sens, argv,
+                              err_string)) {
+    extra_cfg.noise_sensitivity = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sharpness, argv,
+                              err_string)) {
+    extra_cfg.sharpness = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.static_thresh, argv,
+                              err_string)) {
+    extra_cfg.static_thresh = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.rowmtarg, argv,
+                              err_string)) {
+    extra_cfg.row_mt = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fpmtarg, argv,
+                              err_string)) {
+    extra_cfg.fp_mt = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_cols, argv,
+                              err_string)) {
+    extra_cfg.tile_columns = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_rows, argv,
+                              err_string)) {
+    extra_cfg.tile_rows = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tpl_model,
+                              argv, err_string)) {
+    extra_cfg.enable_tpl_model = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_maxframes, argv,
+                              err_string)) {
+    extra_cfg.arnr_max_frames = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_strength, argv,
+                              err_string)) {
+    extra_cfg.arnr_strength = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_metric, argv,
+                              err_string)) {
+    extra_cfg.tuning = arg_parse_enum_helper(&arg, err_string);
+  }
+#if CONFIG_TUNE_VMAF
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argv,
+                            err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.vmaf_model_path,
+                                  &extra_cfg.vmaf_model_path, err_string);
+  }
+#endif
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                            argv, err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.partition_info_path,
+                                  &extra_cfg.partition_info_path, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_rate_guide_deltaq,
+                              argv, err_string)) {
+    extra_cfg.enable_rate_guide_deltaq =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.rate_distribution_info,
+                              argv, err_string)) {
+    err =
+        allocate_and_set_string(value, default_extra_cfg.rate_distribution_info,
+                                &extra_cfg.rate_distribution_info, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dist_metric, argv,
+                              err_string)) {
+    extra_cfg.dist_metric = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv,
+                              err_string)) {
+    extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct,
+                              argv, err_string)) {
+    extra_cfg.rc_max_intra_bitrate_pct =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_inter_rate_pct,
+                              argv, err_string)) {
+    extra_cfg.rc_max_inter_bitrate_pct =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_cbr_boost_pct,
+                              argv, err_string)) {
+    extra_cfg.gf_cbr_boost_pct = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.lossless, argv,
+                              err_string)) {
+    extra_cfg.lossless = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cdef, argv,
+                              err_string)) {
+    extra_cfg.enable_cdef = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_restoration,
+                              argv, err_string)) {
+    extra_cfg.enable_restoration = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.force_video_mode,
+                              argv, err_string)) {
+    extra_cfg.force_video_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_obmc, argv,
+                              err_string)) {
+    extra_cfg.enable_obmc = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.disable_trellis_quant,
+                              argv, err_string)) {
+    extra_cfg.disable_trellis_quant = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_qm, argv,
+                              err_string)) {
+    extra_cfg.enable_qm = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_max, argv,
+                              err_string)) {
+    extra_cfg.qm_max = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_min, argv,
+                              err_string)) {
+    extra_cfg.qm_min = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.num_tg, argv,
+                              err_string)) {
+    extra_cfg.num_tg = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mtu_size, argv,
+                              err_string)) {
+    extra_cfg.mtu_size = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.timing_info, argv,
+                              err_string)) {
+    extra_cfg.timing_info_type = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.frame_parallel_decoding,
+                              argv, err_string)) {
+    extra_cfg.frame_parallel_decoding_mode =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dual_filter,
+                              argv, err_string)) {
+    extra_cfg.enable_dual_filter = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_chroma_deltaq,
+                              argv, err_string)) {
+    extra_cfg.enable_chroma_deltaq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.aq_mode, argv,
+                              err_string)) {
+    extra_cfg.aq_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_mode, argv,
+                              err_string)) {
+    extra_cfg.deltaq_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_strength, argv,
+                              err_string)) {
+    extra_cfg.deltaq_strength = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltalf_mode, argv,
+                              err_string)) {
+    extra_cfg.deltalf_mode = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.frame_periodic_boost,
+                              argv, err_string)) {
+    extra_cfg.frame_periodic_boost = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_content, argv,
+                              err_string)) {
+    extra_cfg.content = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_color_primaries,
+                              argv, err_string)) {
+    extra_cfg.color_primaries = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(
+                 &arg, &g_av1_codec_arg_defs.input_transfer_characteristics,
+                 argv, err_string)) {
+    extra_cfg.transfer_characteristics =
+        arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_matrix_coefficients,
+                              argv, err_string)) {
+    extra_cfg.matrix_coefficients = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(
+                 &arg, &g_av1_codec_arg_defs.input_chroma_sample_position, argv,
+                 err_string)) {
+    extra_cfg.chroma_sample_position = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.superblock_size, argv,
+                              err_string)) {
+    extra_cfg.superblock_size = arg_parse_enum_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.error_resilient_mode,
+                              argv, err_string)) {
+    extra_cfg.error_resilient_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sframe_mode, argv,
+                              err_string)) {
+    extra_cfg.s_frame_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_test, argv,
+                              err_string)) {
+    extra_cfg.film_grain_test_vector = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_table,
+                              argv, err_string)) {
+    // value is guaranteed non-NULL by the guard at the top of the function, so
+    // the table filename is always copied.
+    err = allocate_and_set_string(
+        value, default_extra_cfg.film_grain_table_filename,
+        &extra_cfg.film_grain_table_filename, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cdf_update_mode, argv,
+                              err_string)) {
+    extra_cfg.cdf_update_mode = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_rect_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_rect_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ab_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_ab_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_1to4_partitions,
+                              argv, err_string)) {
+    extra_cfg.enable_1to4_partitions = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_partition_size,
+                              argv, err_string)) {
+    extra_cfg.min_partition_size = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_partition_size,
+                              argv, err_string)) {
+    extra_cfg.max_partition_size = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_intra_edge_filter,
+                              argv, err_string)) {
+    extra_cfg.enable_intra_edge_filter =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_order_hint,
+                              argv, err_string)) {
+    extra_cfg.enable_order_hint = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tx64, argv,
+                              err_string)) {
+    extra_cfg.enable_tx64 = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_flip_idtx,
+                              argv, err_string)) {
+    extra_cfg.enable_flip_idtx = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rect_tx, argv,
+                              err_string)) {
+    extra_cfg.enable_rect_tx = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dist_wtd_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_dist_wtd_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_reference_frames,
+                              argv, err_string)) {
+    extra_cfg.max_reference_frames = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_reference_set,
+                              argv, err_string)) {
+    extra_cfg.enable_reduced_reference_set =
+        arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ref_frame_mvs,
+                              argv, err_string)) {
+    extra_cfg.enable_ref_frame_mvs = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_masked_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_masked_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_onesided_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_onesided_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interintra_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_interintra_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_smooth_interintra,
+                              argv, err_string)) {
+    extra_cfg.enable_smooth_interintra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diff_wtd_comp,
+                              argv, err_string)) {
+    extra_cfg.enable_diff_wtd_comp = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interinter_wedge,
+                              argv, err_string)) {
+    extra_cfg.enable_interinter_wedge = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_interintra_wedge,
+                              argv, err_string)) {
+    extra_cfg.enable_interintra_wedge = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_global_motion,
+                              argv, err_string)) {
+    extra_cfg.enable_global_motion = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_warped_motion,
+                              argv, err_string)) {
+    extra_cfg.enable_warped_motion = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_filter_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_filter_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_smooth_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_smooth_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_paeth_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_paeth_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cfl_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_cfl_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_directional_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_directional_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diagonal_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_diagonal_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_overlay, argv,
+                              err_string)) {
+    extra_cfg.enable_overlay = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_palette, argv,
+                              err_string)) {
+    extra_cfg.enable_palette = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_intrabc, argv,
+                              err_string)) {
+    extra_cfg.enable_intrabc = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_angle_delta,
+                              argv, err_string)) {
+    extra_cfg.enable_angle_delta = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_tx_type_set,
+                              argv, err_string)) {
+    extra_cfg.reduced_tx_type_set = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_intra_dct_only,
+                              argv, err_string)) {
+    extra_cfg.use_intra_dct_only = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_inter_dct_only,
+                              argv, err_string)) {
+    extra_cfg.use_inter_dct_only = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.use_intra_default_tx_only,
+                              argv, err_string)) {
+    extra_cfg.use_intra_default_tx_only =
+        arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.quant_b_adapt, argv,
+                              err_string)) {
+    extra_cfg.quant_b_adapt = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.vbr_corpus_complexity_lap,
+                              argv, err_string)) {
+    extra_cfg.vbr_corpus_complexity_lap =
+        arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_tier_mask, argv,
+                              err_string)) {
+    extra_cfg.tier_mask = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_min_cr, argv,
+                              err_string)) {
+    extra_cfg.min_cr = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.coeff_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.coeff_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mode_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.mode_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  }
+#if CONFIG_DENOISE
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level,
+                            argv, err_string)) {
+    // The option is given in tenths; the stored noise level is a float.
+    extra_cfg.noise_level =
+        (float)arg_parse_int_helper(&arg, err_string) / 10.0f;
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_block_size,
+                              argv, err_string)) {
+    extra_cfg.noise_block_size = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dnl_denoising,
+                              argv, err_string)) {
+    extra_cfg.enable_dnl_denoising = arg_parse_uint_helper(&arg, err_string);
+  }
+#endif
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.target_seq_level_idx,
+                            argv, err_string)) {
+    // The value packs two fields: the last two decimal digits are the level,
+    // the remaining leading digits are the operating point index.
+    const int val = arg_parse_int_helper(&arg, err_string);
+    const int level = val % 100;
+    const int operating_point_idx = val / 100;
+    if (operating_point_idx < 0 ||
+        operating_point_idx >= MAX_NUM_OPERATING_POINTS) {
+      snprintf(err_string, ARG_ERR_MSG_MAX_LEN,
+               "Invalid operating point index: %d", operating_point_idx);
+      err = AOM_CODEC_INVALID_PARAM;
+    } else {
+      extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+    }
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_chroma_subsampling_x,
+                              argv, err_string)) {
+    extra_cfg.chroma_subsampling_x = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.input_chroma_subsampling_y,
+                              argv, err_string)) {
+    extra_cfg.chroma_subsampling_y = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.passes, argv,
+                              err_string)) {
+    extra_cfg.passes = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fwd_kf_dist, argv,
+                              err_string)) {
+    extra_cfg.fwd_kf_dist = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.two_pass_output, argv,
+                              err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.two_pass_output,
+                                  &extra_cfg.two_pass_output, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.second_pass_log, argv,
+                              err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.second_pass_log,
+                                  &extra_cfg.second_pass_log, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.loopfilter_control,
+                              argv, err_string)) {
+    extra_cfg.loopfilter_control = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_intra_tools_off,
+                              argv, err_string)) {
+    extra_cfg.auto_intra_tools_off = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.strict_level_conformance,
+                              argv, err_string)) {
+    extra_cfg.strict_level_conformance = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sb_qp_sweep, argv,
+                              err_string)) {
+    extra_cfg.sb_qp_sweep = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.kf_max_pyr_height,
+                              argv, err_string)) {
+    extra_cfg.kf_max_pyr_height = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_width, argv,
+                              err_string)) {
+    // Tile sizes live in ctx->cfg rather than in the extra configuration, so
+    // they are written in place.
+    ctx->cfg.tile_width_count = arg_parse_list_helper(
+        &arg, ctx->cfg.tile_widths, MAX_TILE_WIDTHS, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_height, argv,
+                              err_string)) {
+    ctx->cfg.tile_height_count = arg_parse_list_helper(
+        &arg, ctx->cfg.tile_heights, MAX_TILE_HEIGHTS, err_string);
+  } else {
+    match = 0;
+    snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s",
+             name);
+  }
+  aom_free(argv[0]);
+
+  // An explicit error code from a string-copy branch takes precedence.
+  if (err != AOM_CODEC_OK) {
+    ctx->base.err_detail = err_string;
+    return err;
+  }
+
+  // Any message left in the shared error buffer is reported as an invalid
+  // parameter.
+  if (strlen(err_string) != 0) {
+    ctx->base.err_detail = err_string;
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  ctx->base.err_detail = NULL;
+
+  if (!match) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const level_out = va_arg(args, int *);  // int * supplied by the caller
+ if (!level_out) return AOM_CODEC_INVALID_PARAM;
+ return av1_get_seq_level_idx(  // the helper's status is returned unchanged
+ &ctx->ppi->seq_params, &ctx->ppi->level_params, level_out);
+}
+
+static aom_codec_err_t ctrl_get_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const target_out = va_arg(args, int *);  // int * supplied by the caller
+ if (!target_out) return AOM_CODEC_INVALID_PARAM;
+ return av1_get_target_seq_level_idx(  // helper's status is returned unchanged
+ &ctx->ppi->seq_params, &ctx->ppi->level_params, target_out);
+}
+
+static aom_codec_err_t ctrl_get_num_operating_points(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const count_out = va_arg(args, int *);  // int * supplied by the caller
+ if (!count_out) return AOM_CODEC_INVALID_PARAM;
+ *count_out = 1 + ctx->ppi->seq_params.operating_points_cnt_minus_1;
+ return AOM_CODEC_OK;  // *count_out holds the stored "minus 1" value plus one
+}
+
+static aom_codec_err_t ctrl_get_luma_cdef_strength(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *out = va_arg(args, int *);
+ const AV1_COMMON *cm = &ctx->ppi->cpi->common;
+ if (!out) return AOM_CODEC_INVALID_PARAM;
+ // Copies CDEF_MAX_STRENGTHS ints, so the caller's buffer must hold that many.
+ memcpy(out, cm->cdef_info.cdef_strengths, sizeof(*out) * CDEF_MAX_STRENGTHS);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {  // { control id, handler }
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+ { AOME_USE_REFERENCE, ctrl_use_reference },
+
+ // Setters
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AOME_SET_ROI_MAP, ctrl_set_roi_map },
+ { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+ { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+ { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
+ { AOME_SET_CPUUSED, ctrl_set_cpuused },
+ { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+ { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+ { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+ { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1E_SET_FP_MT, ctrl_set_fp_mt },
+ { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+ { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+ { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model },
+ { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering },
+ { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+ { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+ { AOME_SET_TUNING, ctrl_set_tuning },
+ { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+ { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+ { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
+ { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
+ { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+ { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+ { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+ { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+ { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode },
+ { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
+ { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
+ { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+ { AV1E_SET_QM_Y, ctrl_set_qm_y },
+ { AV1E_SET_QM_U, ctrl_set_qm_u },
+ { AV1E_SET_QM_V, ctrl_set_qm_v },
+ { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+ { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+ { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+ { AV1E_SET_MTU, ctrl_set_mtu },
+ { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
+ { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+ { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+ { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+ { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions },
+ { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions },
+ { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions },
+ { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
+ { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
+ { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+ { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq },
+ { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
+ { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+ { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
+ { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+ { AV1E_SET_ENABLE_RECT_TX, ctrl_set_enable_rect_tx },
+ { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+ { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
+ { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
+ { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+ { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+ { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
+ { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp },
+ { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp },
+ { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra },
+ { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp },
+ { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge },
+ { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge },
+ { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion },
+ { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+ { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+ { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra },
+ { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
+ { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
+ { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
+ { AV1E_SET_ENABLE_DIRECTIONAL_INTRA, ctrl_set_enable_directional_intra },
+ { AV1E_SET_ENABLE_DIAGONAL_INTRA, ctrl_set_enable_diagonal_intra },
+ { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+ { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
+ { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
+ { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
+ { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
+ { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+ { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+ { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+ { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+ { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only },
+ { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
+ { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
+ { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
+ { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
+ { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+ { AV1E_SET_DELTAQ_STRENGTH, ctrl_set_deltaq_strength },
+ { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
+ { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+ { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+ { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+ { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+ { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+ { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
+ { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
+ { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+ { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+ { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+ { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+ { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height },
+ { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
+ { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+ { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+ { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+ { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
+ { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path },
+ { AV1E_ENABLE_RATE_GUIDE_DELTAQ, ctrl_enable_rate_guide_deltaq },
+ { AV1E_SET_RATE_DISTRIBUTION_INFO, ctrl_set_rate_distribution_info },
+ { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+ { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+ { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+ { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+ { AV1E_SET_ENABLE_DNL_DENOISING, ctrl_set_enable_dnl_denoising },
+ { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+ { AV1E_SET_FP_MT_UNIT_TEST, ctrl_enable_fpmt_unit_test },
+ { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug },
+ { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
+ { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
+ { AV1E_SET_MIN_CR, ctrl_set_min_cr },
+ { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
+ { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
+ { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+ { AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred },
+ { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap },
+ { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+ { AV1E_ENABLE_SB_QP_SWEEP, ctrl_enable_sb_qp_sweep },
+ { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq },
+ { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition },
+ { AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search },
+ { AV1E_SET_LOOPFILTER_CONTROL, ctrl_set_loopfilter_control },
+ { AV1E_SET_SKIP_POSTPROC_FILTERING, ctrl_set_skip_postproc_filtering },
+ { AV1E_SET_AUTO_INTRA_TOOLS_OFF, ctrl_set_auto_intra_tools_off },
+ { AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc },
+ { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
+ { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr },
+ { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr },
+ { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode },
+
+ // Getters (the two AV1E_SET_CHROMA_SUBSAMPLING_* entries here are setters)
+ { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+ { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+ { AOME_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+ { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+ { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
+ { AV1E_GET_BASELINE_GF_INTERVAL, ctrl_get_baseline_gf_interval },
+ { AV1E_GET_TARGET_SEQ_LEVEL_IDX, ctrl_get_target_seq_level_idx },
+ { AV1E_GET_NUM_OPERATING_POINTS, ctrl_get_num_operating_points },
+ { AV1E_GET_LUMA_CDEF_STRENGTH, ctrl_get_luma_cdef_strength },
+
+ CTRL_MAP_END,
+};
+
+static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {  // one entry per g_usage
+#if !CONFIG_REALTIME_ONLY
+ {
+ // NOLINT
+ AOM_USAGE_GOOD_QUALITY, // g_usage - non-realtime usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_w
+ 240, // g_h
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 35, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ AOM_SUPERRES_NONE, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 32, // rc_superres_kf_qthresh
+
+ AOM_VBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bitrate
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_AUTO, // kf_mode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ 0, // use_fixed_qp_offsets
+ { -1, -1, -1, -1, -1 }, // fixed_qp_offsets
+ { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg
+ },
+#endif // !CONFIG_REALTIME_ONLY
+ {
+ // NOLINT
+ AOM_USAGE_REALTIME, // g_usage - real-time usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_w
+ 240, // g_h
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 0, // g_lag_in_frames (the good-quality preset uses 35)
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ AOM_SUPERRES_NONE, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 32, // rc_superres_kf_qthresh
+
+ AOM_CBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bitrate
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_AUTO, // kf_mode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ 0, // use_fixed_qp_offsets
+ { -1, -1, -1, -1, -1 }, // fixed_qp_offsets
+ { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg
+ },
+#if !CONFIG_REALTIME_ONLY
+ {
+ // NOLINT
+ AOM_USAGE_ALL_INTRA, // g_usage - all intra usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_w
+ 240, // g_h
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 0, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ AOM_SUPERRES_NONE, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 32, // rc_superres_kf_qthresh
+
+ AOM_Q, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bitrate
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_DISABLED, // kf_mode (the other presets use AOM_KF_AUTO)
+ 0, // kf_min_dist
+ 0, // kf_max_dist (the other presets use 9999)
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ 0, // use_fixed_qp_offsets
+ { -1, -1, -1, -1, -1 }, // fixed_qp_offsets
+ { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg
+ },
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+// This data structure and function are exported in aom/aomcx.h
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+aom_codec_iface_t aom_codec_av1_cx_algo = {
+ "AOMedia Project AV1 Encoder" VERSION_STRING,  // VERSION_STRING may be empty
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ (CONFIG_AV1_HIGHBITDEPTH ? AOM_CODEC_CAP_HIGHBITDEPTH : 0) |  // only if built in
+ AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR, // aom_codec_caps_t
+ encoder_init, // aom_codec_init_fn_t
+ encoder_destroy, // aom_codec_destroy_fn_t
+ encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ NULL, // aom_codec_peek_si_fn_t (all five decoder callbacks are NULL here)
+ NULL, // aom_codec_get_si_fn_t
+ NULL, // aom_codec_decode_fn_t
+ NULL, // aom_codec_get_frame_fn_t
+ NULL // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ NELEMENTS(encoder_usage_cfg), // cfg_count
+ encoder_usage_cfg, // aom_codec_enc_cfg_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ encoder_get_global_headers, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview // aom_codec_get_preview_frame_fn_t
+ },
+ encoder_set_option // aom_codec_set_option_fn_t
+};
+
+aom_codec_iface_t *aom_codec_av1_cx(void) { return &aom_codec_av1_cx_algo; }
diff --git a/third_party/aom/av1/av1_cx_iface.h b/third_party/aom/av1/av1_cx_iface.h
new file mode 100644
index 0000000000..b2a7005ea5
--- /dev/null
+++ b/third_party/aom/av1/av1_cx_iface.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_AV1_CX_IFACE_H_
+#define AOM_AV1_AV1_CX_IFACE_H_
+#include "av1/encoder/encoder.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg);
+
+aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi,
+ AV1_COMP **p_cpi,
+ BufferPool **p_buffer_pool,
+ const AV1EncoderConfig *oxcf,
+ COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames);
+
+void av1_destroy_context_and_bufferpool(AV1_COMP *cpi,
+ BufferPool **p_buffer_pool);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_AV1_CX_IFACE_H_
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
new file mode 100644
index 0000000000..3d7e132ab8
--- /dev/null
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -0,0 +1,1777 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/grain_synthesis.h"
+#include "av1/decoder/obu.h"
+
+#include "av1/av1_iface_common.h"
+
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;  // decoder_init() casts this struct to aom_codec_priv_t *
+ aom_codec_dec_cfg_t cfg;  // ctx->config.dec is repointed here when one is given
+ aom_codec_stream_info_t si;  // copied out to callers by decoder_get_si()
+ aom_image_t img;
+ int img_avail;
+ int flushed;
+ int invert_tile_order;
+ RefCntBuffer *last_show_frame; // Last output frame buffer
+ int byte_alignment;
+ int skip_loop_filter;
+ int skip_film_grain;
+ int decode_tile_row;  // -1 after decoder_init()
+ int decode_tile_col;  // -1 after decoder_init()
+ unsigned int tile_mode;  // 0: normal tile coding, 1: large scale tile coding
+ unsigned int ext_tile_debug;
+ unsigned int row_mt;  // set to 1 (on) by decoder_init()
+ EXTERNAL_REFERENCES ext_refs;
+ unsigned int is_annexb;
+ int operating_point;
+ int output_all_layers;
+
+ AVxWorker *frame_worker;  // decoder_destroy() reads a FrameWorkerData * from data1
+
+ aom_image_t image_with_grain;
+ aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS];
+ size_t num_grain_image_frame_buffers;  // entries released in decoder_destroy()
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
+
+ // External frame buffer info to save for AV1 common.
+ void *ext_priv; // Private data associated with the external frame buffers.
+ aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+};
+
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
+ // This function only allocates space for the aom_codec_alg_priv_t
+ // structure. More memory may be required at the time the stream
+ // information becomes known.
+ if (!ctx->priv) {  // a context that already has private data is left untouched
+ aom_codec_alg_priv_t *const priv =
+ (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;  // ctx now refers to the new struct
+ ctx->priv->init_flags = ctx->init_flags;
+ priv->flushed = 0;
+
+ // TODO(tdaede): this should not be exposed to the API
+ priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+ if (ctx->config.dec) {
+ priv->cfg = *ctx->config.dec;  // whole-struct copy replaces the default above
+ ctx->config.dec = &priv->cfg;  // ctx now points at the private copy
+ }
+ priv->num_grain_image_frame_buffers = 0;
+ // Turn row_mt on by default.
+ priv->row_mt = 1;
+
+ // Turn on normal tile coding mode by default.
+ // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+ // mode(refer to lightfield example).
+ priv->tile_mode = 0;
+ priv->decode_tile_row = -1;
+ priv->decode_tile_col = -1;
+ }
+
+ return AOM_CODEC_OK;  // also returned when ctx->priv was already set
+}
+
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+ if (ctx->frame_worker != NULL) {
+ AVxWorker *const worker = ctx->frame_worker;
+ aom_get_worker_interface()->end(worker);  // NOTE(review): presumably stops it
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) {
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ aom_free(pbi->common.tpl_mvs);
+ pbi->common.tpl_mvs = NULL;  // cleared so the pointer is not left dangling
+ av1_remove_common(&pbi->common);
+ av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync);
+ av1_free_cdef_sync(&pbi->cdef_sync);
+ av1_free_restoration_buffers(&pbi->common);
+ av1_decoder_remove(pbi);
+ }
+ aom_free(frame_worker_data);  // reached even when data1 or pbi was NULL
+ }
+
+ if (ctx->buffer_pool) {
+ for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
+ ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
+ &ctx->grain_image_frame_buffers[i]);
+ }
+ av1_free_ref_frame_buffers(ctx->buffer_pool);
+ av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ }
+
+ aom_free(ctx->frame_worker);  // called unconditionally, even if these are NULL
+ aom_free(ctx->buffer_pool);
+ assert(!ctx->img.self_allocd);
+ aom_img_free(&ctx->img);
+ aom_free(ctx);  // ctx itself goes last; it must not be used after this call
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
+ // Validates the timing fields read from rb without storing any of them.
+ // Both 32-bit fields are always read before either one is checked.
+ const uint32_t display_tick_units = aom_rb_read_unsigned_literal(rb, 32);
+ const uint32_t scale = aom_rb_read_unsigned_literal(rb, 32);
+ // A zero in either field is rejected as an unsupported bitstream.
+ if (!display_tick_units || !scale) return AOM_CODEC_UNSUP_BITSTREAM;
+
+ // The uvlc tick count follows only when the equal_picture_interval bit is set.
+ if (aom_rb_read_bit(rb)) {
+ // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
+ const uint32_t ticks_minus_1 = aom_rb_read_uvlc(rb);
+ if (ticks_minus_1 == UINT32_MAX) return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_decoder_model_info(
+ struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
+ // Only buffer_delay_length_minus_1 is handed back to the caller; the other
+ // three fields are read in order and then discarded. Always returns OK.
+ *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+ // num_units_in_decoding_tick
+ (void)aom_rb_read_unsigned_literal(rb, 32);
+ // buffer_removal_time_length_minus_1
+ (void)aom_rb_read_literal(rb, 5);
+ // frame_presentation_time_length_minus_1
+ (void)aom_rb_read_literal(rb, 5);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_op_parameters_info(
+ struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
+ // All three fields are read and discarded; this always returns OK.
+ const int delay_bits = buffer_delay_length_minus_1 + 1;
+ // decoder_buffer_delay
+ (void)aom_rb_read_unsigned_literal(rb, delay_bits);
+ // encoder_buffer_delay
+ (void)aom_rb_read_unsigned_literal(rb, delay_bits);
+ (void)aom_rb_read_bit(rb);  // low_delay_mode_flag
+ return AOM_CODEC_OK;
+}
+
+// Parses the operating points (including operating_point_idc, seq_level_idx,
+// and seq_tier) and then sets si->number_spatial_layers and
+// si->number_temporal_layers based on operating_point_idc[0].
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+ int is_reduced_header,
+ aom_codec_stream_info_t *si) {
+ int operating_point_idc0 = 0;  // stays 0 on the reduced-header path
+ if (is_reduced_header) {
+ aom_rb_read_literal(rb, LEVEL_BITS); // level
+ } else {
+ uint8_t decoder_model_info_present_flag = 0;
+ int buffer_delay_length_minus_1 = 0;
+ aom_codec_err_t status;
+ const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
+ if (timing_info_present_flag) {
+ if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
+ decoder_model_info_present_flag = aom_rb_read_bit(rb);  // read only with timing
+ if (decoder_model_info_present_flag) {
+ if ((status = parse_decoder_model_info(
+ rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+ return status;
+ }
+ }
+ const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
+ const uint8_t operating_points_cnt_minus_1 =
+ aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+ for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
+ int operating_point_idc;
+ operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+ if (i == 0) operating_point_idc0 = operating_point_idc;  // only the first is kept
+ int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level
+ if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier
+ if (decoder_model_info_present_flag) {
+ const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
+ if (decoder_model_present_for_this_op) {
+ if ((status = parse_op_parameters_info(
+ rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+ return status;
+ }
+ }
+ if (initial_display_delay_present_flag) {
+ const uint8_t initial_display_delay_present_for_this_op =
+ aom_rb_read_bit(rb);
+ if (initial_display_delay_present_for_this_op)
+ aom_rb_read_literal(rb, 4); // initial_display_delay_minus_1
+ }
+ }
+ }
+
+ if (aom_get_num_layers_from_operating_point_idc(
+ operating_point_idc0, &si->number_spatial_layers,
+ &si->number_temporal_layers) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;  // the helper's own code is replaced by the generic one
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+ size_t data_sz,
+ aom_codec_stream_info_t *si,
+ int *is_intra_only) {  // optional output; may be NULL
+ int intra_only_flag = 0;
+ int got_sequence_header = 0;
+ int found_keyframe = 0;
+
+ if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
+
+ si->w = 0;  // w and h stay 0 unless a sequence header is found below
+ si->h = 0;
+ si->is_kf = 0; // is_kf indicates whether the current packet contains a RAP
+
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ size_t payload_size = 0;
+ size_t bytes_read = 0;
+ uint8_t reduced_still_picture_hdr = 0;
+ aom_codec_err_t status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+
+ // If the first OBU is a temporal delimiter, skip over it and look at the next
+ // OBU in the bitstream
+ if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+ // Skip any associated payload (there shouldn't be one, but just in case)
+ if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ data += bytes_read + payload_size;
+ data_sz -= bytes_read + payload_size;
+
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ while (1) {  // one pass per OBU; obu_header/payload_size describe the current one
+ data += bytes_read;  // step over the bytes the header/size read consumed
+ data_sz -= bytes_read;
+ if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ // Check that the selected OBU is a sequence header
+ if (obu_header.type == OBU_SEQUENCE_HEADER) {
+ // Sanity check on sequence header size
+ if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+ // Read a few values from the sequence header payload
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+ av1_read_profile(&rb); // profile
+ const uint8_t still_picture = aom_rb_read_bit(&rb);
+ reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+ if (!still_picture && reduced_still_picture_hdr) {
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;  // the specific status from the parser is not kept
+ }
+
+ int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+ int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+ int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+ int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+ si->w = max_frame_width;
+ si->h = max_frame_height;
+ got_sequence_header = 1;
+ } else if (obu_header.type == OBU_FRAME_HEADER ||
+ obu_header.type == OBU_FRAME) {
+ if (got_sequence_header && reduced_still_picture_hdr) {
+ found_keyframe = 1;  // counted as a keyframe without reading the frame type
+ break;
+ } else {
+ // make sure we have enough bits to get the frame type out
+ if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+ const int show_existing_frame = aom_rb_read_bit(&rb);
+ if (!show_existing_frame) {
+ const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+ if (frame_type == KEY_FRAME) {
+ found_keyframe = 1;
+ break; // Stop here as no further OBUs will change the outcome.
+ } else if (frame_type == INTRA_ONLY_FRAME) {
+ intra_only_flag = 1;  // keep scanning; a later OBU may still be a keyframe
+ }
+ }
+ }
+ }
+ // skip past this OBU's payload so the next OBU header can be read
+ data += payload_size;
+ data_sz -= payload_size;
+ if (data_sz == 0) break; // exit if we're out of OBUs
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ if (got_sequence_header && found_keyframe) si->is_kf = 1;  // both are required
+ if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+ return AOM_CODEC_OK;
+}
+
+// Public peek entry point: forwards to decoder_peek_si_internal() with a NULL
+// is_intra_only pointer, so only the stream info in *si is filled in.
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
+                                       aom_codec_stream_info_t *si) {
+  const aom_codec_err_t status =
+      decoder_peek_si_internal(data, data_sz, si, NULL);
+  return status;
+}
+
+// Copies the stream info cached in the decoder context into *si.
+// Always returns AOM_CODEC_OK.
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_stream_info_t *si) {
+  const aom_codec_stream_info_t *const cached_si = &ctx->si;
+  memcpy(si, cached_si, sizeof(*si));
+  return AOM_CODEC_OK;
+}
+
+static void set_error_detail(aom_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ if (error->error_code)
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
+
+ return error->error_code;
+}
+
+// Copies per-context decoding options into the decoder instance and installs
+// the frame buffer callbacks on its buffer pool.
+//
+// If the application supplied both a get and a release callback, those are
+// used together with the application's private pointer. Otherwise the
+// built-in av1_get_frame_buffer / av1_release_frame_buffer pair is installed
+// and the pool's internal frame buffer list is allocated; an allocation
+// failure is reported through aom_internal_error() on pbi->error.
+static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_worker->data1;
+  AV1Decoder *const pbi = fwd->pbi;
+  AV1_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+
+  cm->cur_frame = NULL;
+  cm->features.byte_alignment = ctx->byte_alignment;
+  pbi->skip_loop_filter = ctx->skip_loop_filter;
+  pbi->skip_film_grain = ctx->skip_film_grain;
+
+  const int have_external_cbs =
+      ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL;
+  if (have_external_cbs) {
+    pool->get_fb_cb = ctx->get_ext_fb_cb;
+    pool->release_fb_cb = ctx->release_ext_fb_cb;
+    pool->cb_priv = ctx->ext_priv;
+    return;
+  }
+
+  // No (complete) external callback pair: fall back to internal buffers.
+  pool->get_fb_cb = av1_get_frame_buffer;
+  pool->release_fb_cb = av1_release_frame_buffer;
+
+  if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers)) {
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to initialize internal frame buffers");
+  }
+
+  pool->cb_priv = &pool->int_frame_buffers;
+}
+
+// Worker hook: feeds the buffer described by the FrameWorkerData in arg1 to
+// av1_receive_compressed_data() and records where decoding stopped in
+// data_end. arg2 is unused.
+// Returns 1 when the decoder reported success (0), and 0 otherwise; on failure
+// the decoder is also flagged as needing a resync.
+static int frame_worker_hook(void *arg1, void *arg2) {
+  (void)arg2;
+  FrameWorkerData *const fwd = (FrameWorkerData *)arg1;
+  const uint8_t *cursor = fwd->data;
+
+  const int decode_status =
+      av1_receive_compressed_data(fwd->pbi, fwd->data_size, &cursor);
+  fwd->data_end = cursor;
+
+  // Check decode result in serial decode.
+  if (decode_status != 0) fwd->pbi->need_resync = 1;
+
+  return decode_status == 0;
+}
+
+// Tears down the buffer pool created by init_decoder(). Only for use on
+// init_decoder() failure paths that run after the pool mutex has been
+// initialized successfully. Leaves ctx->buffer_pool NULL, the same state the
+// earlier failure paths of init_decoder() produce.
+static void discard_buffer_pool_after_failed_init(aom_codec_alg_priv_t *ctx) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  aom_free(ctx->buffer_pool->frame_bufs);
+  ctx->buffer_pool->frame_bufs = NULL;
+  ctx->buffer_pool->num_frame_bufs = 0;
+  aom_free(ctx->buffer_pool);
+  ctx->buffer_pool = NULL;
+}
+
+// Lazily creates everything needed to decode: the buffer pool, the frame
+// worker, its FrameWorkerData and the AV1Decoder instance, then copies the
+// context's configuration into the decoder and installs the frame buffer
+// callbacks.
+//
+// Returns AOM_CODEC_OK on success and AOM_CODEC_MEM_ERROR if any allocation
+// (or the pool mutex initialization) fails. On every failure path all
+// resources created by this call are released and both ctx->buffer_pool and
+// ctx->frame_worker are NULL, so a later retry (callers re-enter this function
+// whenever ctx->frame_worker is NULL) starts from a clean state instead of
+// overwriting, and thereby leaking, a previously allocated pool.
+static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  ctx->last_show_frame = NULL;
+  ctx->need_resync = 1;
+  ctx->flushed = 0;
+
+  ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+  if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+  ctx->buffer_pool->num_frame_bufs = FRAME_BUFFERS;
+  ctx->buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc(
+      ctx->buffer_pool->num_frame_bufs, sizeof(*ctx->buffer_pool->frame_bufs));
+  if (ctx->buffer_pool->frame_bufs == NULL) {
+    ctx->buffer_pool->num_frame_bufs = 0;
+    aom_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+#if CONFIG_MULTITHREAD
+  if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+    // The mutex was never created, so it must not be destroyed here.
+    aom_free(ctx->buffer_pool->frame_bufs);
+    ctx->buffer_pool->frame_bufs = NULL;
+    ctx->buffer_pool->num_frame_bufs = 0;
+    aom_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
+    set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+    return AOM_CODEC_MEM_ERROR;
+  }
+#endif
+
+  ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker));
+  if (ctx->frame_worker == NULL) {
+    discard_buffer_pool_after_failed_init(ctx);
+    set_error_detail(ctx, "Failed to allocate frame_worker");
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  AVxWorker *const worker = ctx->frame_worker;
+  winterface->init(worker);
+  worker->thread_name = "aom frameworker";
+  worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
+  if (worker->data1 == NULL) {
+    winterface->end(worker);
+    aom_free(worker);
+    ctx->frame_worker = NULL;
+    discard_buffer_pool_after_failed_init(ctx);
+    set_error_detail(ctx, "Failed to allocate frame_worker_data");
+    return AOM_CODEC_MEM_ERROR;
+  }
+  FrameWorkerData *frame_worker_data = (FrameWorkerData *)worker->data1;
+  frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
+  if (frame_worker_data->pbi == NULL) {
+    winterface->end(worker);
+    aom_free(frame_worker_data);
+    aom_free(worker);
+    ctx->frame_worker = NULL;
+    discard_buffer_pool_after_failed_init(ctx);
+    set_error_detail(ctx, "Failed to allocate frame_worker_data->pbi");
+    return AOM_CODEC_MEM_ERROR;
+  }
+  frame_worker_data->frame_context_ready = 0;
+  frame_worker_data->received_frame = 0;
+  frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
+
+  // If decoding in serial mode, FrameWorker thread could create tile worker
+  // thread or loopfilter thread.
+  frame_worker_data->pbi->max_threads = ctx->cfg.threads;
+  frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+  frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode;
+  frame_worker_data->pbi->is_annexb = ctx->is_annexb;
+  frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+  frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+  frame_worker_data->pbi->operating_point = ctx->operating_point;
+  frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
+  frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  frame_worker_data->pbi->row_mt = ctx->row_mt;
+  frame_worker_data->pbi->is_fwd_kf_present = 0;
+  frame_worker_data->pbi->is_arf_frame_present = 0;
+  worker->hook = frame_worker_hook;
+
+  init_buffer_callbacks(ctx);
+
+  return AOM_CODEC_OK;
+}
+
+// Clears the context's resync flag once the decoder itself no longer needs a
+// resync and the current frame is a key frame or intra-only frame.
+static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
+                                const AV1Decoder *const pbi) {
+  if (ctx->need_resync != 1) return;
+  if (pbi->need_resync != 0) return;
+  if (!frame_is_intra_only(&pbi->common)) return;
+  ctx->need_resync = 0;
+}
+
+// Decodes one frame unit of `data_sz` bytes starting at *data by running the
+// frame worker synchronously.
+//
+// While the stream height is still unknown (ctx->si.h == 0) the buffer is
+// first peeked; decoding is refused with AOM_CODEC_ERROR unless it contains a
+// key frame or an intra-only frame. On return *data points at the position
+// where the decoder stopped consuming input.
+// Returns the peek error, the decoder's error state, or AOM_CODEC_OK.
+static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
+                                  const uint8_t **data, size_t data_sz,
+                                  void *user_priv) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  // Determine the stream parameters. Note that we rely on peek_si to
+  // validate that we have a buffer that does not wrap around the top
+  // of the heap.
+  if (ctx->si.h == 0) {
+    int is_intra_only = 0;
+    ctx->si.is_annexb = ctx->is_annexb;
+    const aom_codec_err_t peek_status =
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only);
+    if (peek_status != AOM_CODEC_OK) return peek_status;
+
+    if (!(ctx->si.is_kf || is_intra_only)) return AOM_CODEC_ERROR;
+  }
+
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+  AV1Decoder *const pbi = fwd->pbi;
+
+  // Describe the input to the worker.
+  fwd->data = *data;
+  fwd->data_size = data_sz;
+  fwd->user_priv = user_priv;
+  fwd->received_frame = 1;
+
+  // Refresh the settings that may have been changed through controls.
+  pbi->common.tiles.large_scale = ctx->tile_mode;
+  pbi->dec_tile_row = ctx->decode_tile_row;
+  pbi->dec_tile_col = ctx->decode_tile_col;
+  pbi->ext_tile_debug = ctx->ext_tile_debug;
+  pbi->row_mt = ctx->row_mt;
+  pbi->ext_refs = ctx->ext_refs;
+
+  pbi->is_annexb = ctx->is_annexb;
+
+  worker->had_error = 0;
+  winterface->execute(worker);
+
+  // Update data pointer after decode.
+  *data = fwd->data_end;
+
+  if (worker->had_error) return update_error_state(ctx, &fwd->pbi->error);
+
+  check_resync(ctx, fwd->pbi);
+
+  return AOM_CODEC_OK;
+}
+
+// Drops everything that was handed out by the previous decoder_decode or
+// decoder_inspect call: the references held on the decoder's output frames
+// and the frame buffers backing film-grain images. This has to happen even
+// when the decoder is being flushed or the input arguments are invalid.
+// Does nothing if the frame worker has not been created yet.
+static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) {
+  if (ctx->frame_worker == NULL) return;
+
+  BufferPool *const pool = ctx->buffer_pool;
+
+  // Output frame reference counts are updated under the pool lock.
+  lock_buffer_pool(pool);
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_worker->data1;
+  struct AV1Decoder *const pbi = fwd->pbi;
+  for (size_t i = 0; i < pbi->num_output_frames; ++i) {
+    decrease_ref_count(pbi->output_frames[i], pool);
+  }
+  pbi->num_output_frames = 0;
+  unlock_buffer_pool(pool);
+
+  // Return the grain image buffers and clear their descriptors.
+  for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; ++i) {
+    aom_codec_frame_buffer_t *const fb = &ctx->grain_image_frame_buffers[i];
+    pool->release_fb_cb(pool->cb_priv, fb);
+    fb->data = NULL;
+    fb->size = 0;
+    fb->priv = NULL;
+  }
+  ctx->num_grain_image_frame_buffers = 0;
+}
+
+/* This function enables the inspector to inspect non visible frames.
+ *
+ * Decodes a single frame unit from `data` by calling
+ * av1_receive_compressed_data() directly (the frame worker is not executed)
+ * and reports the outcome through `user_priv`, which is treated as an
+ * Av1DecodeReturn: idx is the slot of cm->ref_frame_map holding the current
+ * frame (or -1), buf is the position after the frame and any trailing zero
+ * bytes, and show_existing mirrors cm->show_existing_frame.
+ *
+ * A NULL `data` with data_sz == 0 flushes and returns AOM_CODEC_OK; any other
+ * NULL/zero combination returns AOM_CODEC_INVALID_PARAM. Malformed Annex B
+ * size fields return AOM_CODEC_CORRUPT_FRAME.
+ *
+ * NOTE(review): user_priv is dereferenced without a NULL check; the caller
+ * visible here (decoder_decode) only forwards non-NULL values.
+ */
+static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data, size_t data_sz,
+ void *user_priv) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ release_pending_output_frames(ctx);
+
+ /* Sanity checks */
+ /* NULL data ptr allowed if data_sz is 0 too */
+ if (data == NULL && data_sz == 0) {
+ ctx->flushed = 1;
+ return AOM_CODEC_OK;
+ }
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
+
+ // Reset flushed when receiving a valid frame.
+ ctx->flushed = 0;
+
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
+ uint64_t frame_size;
+ if (ctx->is_annexb) {
+ // read the size of this temporal unit
+ size_t length_of_size;
+ uint64_t temporal_unit_size;
+ if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+ &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (temporal_unit_size > (size_t)(data_end - data_start))  // declared size must fit in what is left
+ return AOM_CODEC_CORRUPT_FRAME;
+ data_end = data_start + temporal_unit_size;  // restrict parsing to this temporal unit
+
+ // read the size of this frame unit
+ if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+ &frame_size, &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (frame_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ frame_size = (uint64_t)(data_end - data_start);  // without Annex B the whole buffer is one frame unit
+ }
+
+ if (ctx->frame_worker == NULL) {  // first call: create the decoder lazily
+ res = init_decoder(ctx);
+ if (res != AOM_CODEC_OK) return res;
+ }
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)ctx->frame_worker->data1;
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_INSPECTION
+ frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
+ frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
+#endif
+ res = av1_receive_compressed_data(frame_worker_data->pbi, (size_t)frame_size,
+ &data_start);  // advances data_start past the consumed bytes; result is returned at the end
+ check_resync(ctx, frame_worker_data->pbi);
+
+ if (ctx->frame_worker->had_error)  // NOTE(review): the worker is not executed in this function; confirm what sets had_error on this path
+ return update_error_state(ctx, &frame_worker_data->pbi->error);
+
+ // Allow extra zero bytes after the frame end
+ while (data_start < data_end) {
+ const uint8_t marker = data_start[0];
+ if (marker) break;
+ ++data_start;
+ }
+
+ Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv;
+ data2->idx = -1;  // stays -1 when the current frame is not in the reference map
+ if (cm->cur_frame) {
+ for (int i = 0; i < REF_FRAMES; ++i)
+ if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;  // no break: the last matching slot wins
+ }
+ data2->buf = data_start;
+ data2->show_existing = cm->show_existing_frame;
+ return res;
+}
+/* Decodes every frame unit contained in the buffer `data` of `data_sz` bytes.
+ *
+ * A NULL `data` with data_sz == 0 marks the decoder as flushed and returns
+ * AOM_CODEC_OK; any other NULL/zero combination returns
+ * AOM_CODEC_INVALID_PARAM. The decoder is created on the first valid call.
+ * In Annex B mode the buffer starts with a temporal unit size and each frame
+ * unit is preceded by its own size; otherwise the remaining buffer is passed
+ * to decode_one() as a single frame unit. Zero bytes following a frame unit
+ * are skipped. Returns the first error encountered, else AOM_CODEC_OK.
+ *
+ * When built with CONFIG_INSPECTION, a non-NULL user_priv diverts the whole
+ * call to decoder_inspect().
+ */
+static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data, size_t data_sz,
+ void *user_priv) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+#if CONFIG_INSPECTION
+ if (user_priv != 0) {
+ return decoder_inspect(ctx, data, data_sz, user_priv);
+ }
+#endif
+
+ release_pending_output_frames(ctx);  // done before argument checks so a flush also releases the outputs
+
+ /* Sanity checks */
+ /* NULL data ptr allowed if data_sz is 0 too */
+ if (data == NULL && data_sz == 0) {
+ ctx->flushed = 1;
+ return AOM_CODEC_OK;
+ }
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
+
+ // Reset flushed when receiving a valid frame.
+ ctx->flushed = 0;
+
+ // Initialize the decoder worker on the first frame.
+ if (ctx->frame_worker == NULL) {
+ res = init_decoder(ctx);
+ if (res != AOM_CODEC_OK) return res;
+ }
+
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
+ if (ctx->is_annexb) {
+ // read the size of this temporal unit
+ size_t length_of_size;
+ uint64_t temporal_unit_size;
+ if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+ &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (temporal_unit_size > (size_t)(data_end - data_start))  // declared size must fit in what is left
+ return AOM_CODEC_CORRUPT_FRAME;
+ data_end = data_start + temporal_unit_size;  // bytes beyond the temporal unit are ignored
+ }
+
+ // Decode in serial mode.
+ while (data_start < data_end) {
+ uint64_t frame_size;
+ if (ctx->is_annexb) {
+ // read the size of this frame unit
+ size_t length_of_size;
+ if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+ &frame_size, &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (frame_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ frame_size = (uint64_t)(data_end - data_start);
+ }
+
+ res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv);  // decode_one moves data_start to where decoding stopped
+ if (res != AOM_CODEC_OK) return res;
+
+ // Allow extra zero bytes after the frame end
+ while (data_start < data_end) {
+ const uint8_t marker = data_start[0];
+ if (marker) break;
+ ++data_start;
+ }
+ }
+
+ return res;
+}
+/* Argument bundle handed to AllocWithGetFrameBufferCb() through its void*
+ * private pointer.
+ */
+typedef struct {
+ BufferPool *pool;  // pool whose get_fb_cb and cb_priv are used to obtain the buffer
+ aom_codec_frame_buffer_t *fb;  // descriptor passed to get_fb_cb; its data/size are checked afterwards
+} AllocCbParam;
+
+// Allocation callback that obtains memory from the pool's get_fb_cb.
+// `priv` must point to an AllocCbParam. Returns the buffer's data pointer, or
+// NULL if the callback fails (negative return), yields no data, or yields a
+// buffer smaller than `size`.
+static void *AllocWithGetFrameBufferCb(void *priv, size_t size) {
+  AllocCbParam *const args = (AllocCbParam *)priv;
+  BufferPool *const pool = args->pool;
+  aom_codec_frame_buffer_t *const fb = args->fb;
+
+  if (pool->get_fb_cb(pool->cb_priv, size, fb) < 0) return NULL;
+
+  const int usable = fb->data != NULL && fb->size >= size;
+  if (!usable) return NULL;
+  return fb->data;
+}
+
+// Applies film grain to img when grain_params->apply_grain is set.
+// Without grain, img is returned unchanged. With grain, grain_img is allocated
+// (dimensions rounded up to even) from the next free slot of
+// ctx->grain_image_frame_buffers via the pool's frame buffer callback, the
+// grained picture is written into it and grain_img is returned.
+// Returns NULL if the allocation or the grain synthesis fails; after a grain
+// synthesis failure the frame buffer is handed back to the pool.
+static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx,
+                                        aom_image_t *img,
+                                        aom_image_t *grain_img,
+                                        aom_film_grain_t *grain_params) {
+  if (!grain_params->apply_grain) return img;
+
+  const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1);
+
+  BufferPool *const pool = ctx->buffer_pool;
+  aom_codec_frame_buffer_t *const grain_fb =
+      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
+
+  AllocCbParam alloc_args;
+  alloc_args.pool = pool;
+  alloc_args.fb = grain_fb;
+
+  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
+                             AllocWithGetFrameBufferCb, &alloc_args)) {
+    return NULL;
+  }
+
+  grain_img->user_priv = img->user_priv;
+  grain_img->fb_priv = grain_fb->priv;
+
+  if (av1_add_film_grain(grain_params, img, grain_img)) {
+    pool->release_fb_cb(pool->cb_priv, grain_fb);
+    return NULL;
+  }
+
+  // The slot is only consumed once the grained image is complete.
+  ctx->num_grain_image_frame_buffers++;
+  return grain_img;
+}
+
+// Transfers the decoder's pending metadata pointer to img and clears it on the
+// decoder. Does nothing when the decoder has no metadata or img is NULL.
+// img is expected to carry no metadata yet (asserted).
+static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
+  if (!pbi->metadata || !img) return;
+
+  assert(!img->metadata);
+  img->metadata = pbi->metadata;
+  pbi->metadata = NULL;
+}
+/* Returns the next decoded image, using *iter as an integer index into the
+ * decoder's output frames.
+ *
+ * Returns NULL when iter is NULL, the decoder has not been created, the worker
+ * sync fails, av1_get_raw_frame() reports no frame for the index, the context
+ * still needs a resync, or film grain synthesis fails (in which case
+ * pbi->error is filled in). The index is advanced only when an image is
+ * returned.
+ *
+ * The returned image is either ctx->img or, when film grain was applied,
+ * ctx->image_with_grain. With ext_tile_debug and single-tile decoding the
+ * plane pointers and display size of ctx->img are adjusted to the selected
+ * tile row/column.
+ */
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ aom_image_t *img = NULL;
+
+ if (!iter) {
+ return NULL;
+ }
+
+ // To avoid having to allocate any extra storage, treat 'iter' as
+ // simply a pointer to an integer index
+ uintptr_t *index = (uintptr_t *)iter;
+
+ if (ctx->frame_worker == NULL) {
+ return NULL;
+ }
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ AVxWorker *const worker = ctx->frame_worker;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ pbi->error.error_code = AOM_CODEC_OK;  // start from a clean error state for this call
+ pbi->error.has_detail = 0;
+ AV1_COMMON *const cm = &pbi->common;
+ CommonTileParams *const tiles = &cm->tiles;
+ // Wait for the frame from worker thread.
+ if (!winterface->sync(worker)) {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ctx->need_resync = 1;
+ // TODO(aomedia:3519): Set an error code. Check if a different error code
+ // should be used if ctx->flushed != 1.
+ return NULL;
+ }
+ // Check if worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ YV12_BUFFER_CONFIG *sd;
+ aom_film_grain_t *grain_params;
+ if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) !=
+ 0) {
+ return NULL;
+ }
+ RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+ ctx->last_show_frame = output_frame_buf;  // recorded even if the frame is withheld below
+ if (ctx->need_resync) return NULL;
+ aom_img_remove_metadata(&ctx->img);
+ yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+ move_decoder_metadata_to_img(pbi, &ctx->img);
+
+ if (!pbi->ext_tile_debug && tiles->large_scale) {  // large-scale tile mode: output comes from the tile list buffer
+ *index += 1; // Advance the iterator to point to the next image
+ aom_img_remove_metadata(&ctx->img);
+ yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
+ move_decoder_metadata_to_img(pbi, &ctx->img);
+ img = &ctx->img;
+ return img;
+ }
+
+ const int num_planes = av1_num_planes(cm);
+ if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+ pbi->dec_tile_row >= 0) {  // a specific tile row was requested
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);  // clamp to the last tile row
+ const int mi_row = tile_row * tile_height;
+ const int ssy = ctx->img.y_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];  // move the first plane down to the tile's first row
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];  // same offset, scaled by the vertical chroma shift
+ }
+ }
+ ctx->img.d_h =
+ AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;  // the last tile row may be shorter
+ }
+
+ if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+ pbi->dec_tile_col >= 0) {  // a specific tile column was requested
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);  // clamp to the last tile column
+ const int mi_col = tile_col * tile_width;
+ const int ssx = ctx->img.x_chroma_shift;
+ const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+ int plane;
+ ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);  // horizontal offset is doubled for high bit depth formats
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+ }
+ }
+ ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;  // the last tile column may be narrower
+ }
+
+ ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
+ img = &ctx->img;
+ img->temporal_id = output_frame_buf->temporal_id;
+ img->spatial_id = output_frame_buf->spatial_id;
+ if (pbi->skip_film_grain) grain_params->apply_grain = 0;  // overwrites the flag in the grain parameters themselves
+ aom_image_t *res =
+ add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+ if (!res) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ pbi->error.has_detail = 1;
+ snprintf(pbi->error.detail, sizeof(pbi->error.detail),
+ "Grain synthesis failed\n");
+ return res;  // NULL; the iterator index is left unchanged
+ }
+ *index += 1; // Advance the iterator to point to the next image
+ return res;
+}
+
+// Registers application-provided frame buffer callbacks.
+// Both callbacks are mandatory (AOM_CODEC_INVALID_PARAM otherwise), and they
+// can only be set before the frame worker exists (AOM_CODEC_ERROR otherwise).
+static aom_codec_err_t decoder_set_fb_fn(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // If the decoder has already been initialized, do not accept changes to
+  // the frame buffer functions.
+  const int already_initialized = ctx->frame_worker != NULL;
+  if (already_initialized) return AOM_CODEC_ERROR;
+
+  ctx->get_ext_fb_cb = cb_get;
+  ctx->release_ext_fb_cb = cb_release;
+  ctx->ext_priv = cb_priv;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: replaces a decoder reference frame with the image described
+// by the av1_ref_frame_t passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the decoder has not been created yet, otherwise the result of
+// av1_set_reference_dec().
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+  if (frame == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The frame worker is only created by the first decode call; without it
+  // there is no decoder to operate on (and worker->data1 would dereference
+  // NULL). Matches the behavior of the other control handlers.
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG sd;
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
+                               frame->use_external_ref, &sd);
+}
+
+// Control handler: copies the decoder reference frame selected by frame->idx
+// into the image described by the av1_ref_frame_t passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the decoder has not been created yet, otherwise the result of
+// av1_copy_reference_dec().
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+  if (frame == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The frame worker is only created by the first decode call; guard against
+  // dereferencing it before then, like the other control handlers do.
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG sd;
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
+}
+
+// Control handler: exposes the decoder reference frame selected by data->idx
+// through data->img (no pixel copy is made here; yuvconfig2image() only fills
+// in the image descriptor).
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the decoder has not been created yet or the reference does not exist.
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *);
+  if (data == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The frame worker is only created by the first decode call; guard against
+  // dereferencing it before then, like the other control handlers do.
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  YV12_BUFFER_CONFIG *const fb =
+      get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+  if (fb == NULL) return AOM_CODEC_ERROR;
+  yuvconfig2image(&data->img, fb, NULL);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: fills the aom_image_t passed as vararg with a descriptor of
+// the frame reported by av1_get_frame_to_show().
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the decoder has not been created yet or no frame is available.
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+  if (new_img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The frame worker is only created by the first decode call; guard against
+  // dereferencing it before then, like the other control handlers do.
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG new_frame;
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+  if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) != 0) {
+    return AOM_CODEC_ERROR;
+  }
+  yuvconfig2image(new_img, &new_frame, NULL);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the frame reported by av1_get_frame_to_show() into
+// the aom_image_t passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the decoder has not been created yet or no frame is available, otherwise
+// the result of av1_copy_new_frame_dec().
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *const img = va_arg(args, aom_image_t *);
+  if (img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The frame worker is only created by the first decode call; guard against
+  // dereferencing it before then, like the other control handlers do.
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG new_frame;
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+  if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) != 0) {
+    return AOM_CODEC_ERROR;
+  }
+
+  YV12_BUFFER_CONFIG sd;
+  image2yuvconfig(img, &sd);
+  return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame,
+                                &sd);
+}
+
+// Control handler: stores the current frame's refresh_frame_flags in the int
+// passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const update_info = va_arg(args, int *);
+  if (update_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  *update_info = fwd->pbi->common.current_frame.refresh_frame_flags;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the decoder's current base_qindex in the int passed
+// as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (!arg) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_worker) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1_COMMON *const cm = &fwd->pbi->common;
+  *arg = cm->quant_params.base_qindex;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the decoder's is_fwd_kf_present flag in the int
+// passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_fwd_kf_value(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (!arg) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_worker) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = fwd->pbi->is_fwd_kf_present;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the decoder's is_arf_frame_present flag in the int
+// passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_altref_present(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (!arg) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_worker) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = fwd->pbi->is_arf_frame_present;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores a bitmask of AOM_FRAME_IS_* flags describing the
+// current frame in the int passed as vararg.
+//   KEY_FRAME        -> KEY | INTRAONLY, plus DELAYED_RANDOM_ACCESS_POINT when
+//                       the frame is not shown
+//   INTRA_ONLY_FRAME -> INTRAONLY
+//   S_FRAME          -> SWITCH
+// ERROR_RESILIENT is added whenever error resilient mode is on.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_frame_flags(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (!arg) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_worker) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1_COMMON *const cm = &fwd->pbi->common;
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+  int flags = 0;
+  if (frame_type == KEY_FRAME) {
+    flags |= AOM_FRAME_IS_KEY | AOM_FRAME_IS_INTRAONLY;
+    if (!cm->show_frame) flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+  } else if (frame_type == INTRA_ONLY_FRAME) {
+    flags |= AOM_FRAME_IS_INTRAONLY;
+  } else if (frame_type == S_FRAME) {
+    flags |= AOM_FRAME_IS_SWITCH;
+  }
+  if (cm->features.error_resilient_mode) flags |= AOM_FRAME_IS_ERROR_RESILIENT;
+
+  *arg = flags;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: fills the aom_tile_info passed as vararg with the tile
+// layout of the current frame: tile row/column counts, the width of every
+// tile column and height of every tile row (differences of consecutive
+// col_start_sb / row_start_sb entries), and the number of tile groups.
+// With uniform spacing the reported counts are 1 << log2_rows / log2_cols.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_tile_info(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_info *const tile_info = va_arg(args, aom_tile_info *);
+  if (tile_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+  const CommonTileParams *const tiles = &pbi->common.tiles;
+  const int num_rows = tiles->rows;
+  const int num_cols = tiles->cols;
+
+  if (tiles->uniform_spacing) {
+    tile_info->tile_rows = 1 << tiles->log2_rows;
+    tile_info->tile_columns = 1 << tiles->log2_cols;
+  } else {
+    tile_info->tile_rows = num_rows;
+    tile_info->tile_columns = num_cols;
+  }
+
+  for (int c = 0; c < num_cols; ++c) {
+    tile_info->tile_widths[c] =
+        tiles->col_start_sb[c + 1] - tiles->col_start_sb[c];
+  }
+  for (int r = 0; r < num_rows; ++r) {
+    tile_info->tile_heights[r] =
+        tiles->row_start_sb[r + 1] - tiles->row_start_sb[r];
+  }
+
+  tile_info->num_tile_groups = pbi->num_tile_groups;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the current frame's screen content related feature
+// flags (allow_screen_content_tools, allow_intrabc, force_integer_mv) into the
+// aom_screen_content_tools_info passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_screen_content_tools_info(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_screen_content_tools_info *const sc_info =
+      va_arg(args, aom_screen_content_tools_info *);
+  if (sc_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  sc_info->allow_screen_content_tools =
+      pbi->common.features.allow_screen_content_tools;
+  sc_info->allow_intrabc = pbi->common.features.allow_intrabc;
+  sc_info->force_integer_mv =
+      (int)pbi->common.features.cur_frame_force_integer_mv;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the sequence header's still_picture and
+// reduced_still_picture_hdr flags into the aom_still_picture_info passed as
+// vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_still_picture(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  aom_still_picture_info *const still_picture_info =
+      va_arg(args, aom_still_picture_info *);
+  if (still_picture_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture;
+  still_picture_info->is_reduced_still_picture_hdr =
+      (int)(pbi->seq_params.reduced_still_picture_hdr);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: reports the sequence's superblock size through the
+// aom_superblock_size_t passed as vararg: 128x128 when the sequence header
+// says BLOCK_128X128, 64x64 in every other case.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  aom_superblock_size_t *const sb_size = va_arg(args, aom_superblock_size_t *);
+  if (sb_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  const int is_128 = pbi->seq_params.sb_size == BLOCK_128X128;
+  *sb_size = is_128 ? AOM_SUPERBLOCK_SIZE_128X128 : AOM_SUPERBLOCK_SIZE_64X64;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: stores the decoder's show_existing_frame flag in the int
+// passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_show_existing_frame_flag(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (!arg) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_worker) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1_COMMON *const cm = &fwd->pbi->common;
+  *arg = cm->show_existing_frame;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the decoder's S-frame information (is_s_frame and
+// is_s_frame_at_altref) into the aom_s_frame_info passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_s_frame_info(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  aom_s_frame_info *const s_frame_info = va_arg(args, aom_s_frame_info *);
+  if (s_frame_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  s_frame_info->is_s_frame = pbi->sframe_info.is_s_frame;
+  s_frame_info->is_s_frame_at_altref = pbi->sframe_info.is_s_frame_at_altref;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: reports whether the last frame handed out by
+// decoder_get_frame() is corrupted, through the int passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument. Returns AOM_CODEC_ERROR
+// when the frame worker does not exist yet, or when a frame header has been
+// seen but there are no output frames. If no frame has been shown so far the
+// int is left untouched and AOM_CODEC_OK is returned.
+static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *const corrupted = va_arg(args, int *);
+  if (corrupted == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  const int header_without_output =
+      pbi->seen_frame_header && pbi->num_output_frames == 0;
+  if (header_without_output) return AOM_CODEC_ERROR;
+
+  const RefCntBuffer *const shown = ctx->last_show_frame;
+  if (shown != NULL) *corrupted = shown->buf.corrupted;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: writes the decoder's frame width and height into
+// frame_size[0] and frame_size[1] of the int array passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const frame_size = va_arg(args, int *);
+  if (frame_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1_COMMON *const cm = &fwd->pbi->common;
+
+  frame_size[0] = cm->width;
+  frame_size[1] = cm->height;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: fills the aom_tile_data passed as vararg from the decoder's
+// obu_size_hdr (size and data pointer) and stores frame_header_size in
+// extra_size.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// the frame worker does not exist yet.
+static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *);
+  if (frame_header_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (const FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1Decoder *const pbi = fwd->pbi;
+
+  frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size;
+  frame_header_info->coded_tile_data = pbi->obu_size_hdr.data;
+  frame_header_info->extra_size = pbi->frame_header_size;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: reports the coded data (pointer and size) of the tile
+// selected by the decoder's dec_tile_row / dec_tile_col through the
+// aom_tile_data passed as vararg.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument. Returns AOM_CODEC_ERROR
+// when the frame worker does not exist yet or when no tile is selected
+// (negative row or column).
+static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_data *const tile_data = va_arg(args, aom_tile_data *);
+  if (tile_data == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  const AV1Decoder *pbi = frame_worker_data->pbi;
+
+  // Negative indices are a possible state (decoder_get_frame() explicitly
+  // tests dec_tile_row/dec_tile_col >= 0); using them here would read before
+  // the start of tile_buffers.
+  // NOTE(review): an upper bound check against the tile_buffers dimensions
+  // is not added because those dimensions are not visible here; confirm.
+  const int tile_row = pbi->dec_tile_row;
+  const int tile_col = pbi->dec_tile_col;
+  if (tile_row < 0 || tile_col < 0) return AOM_CODEC_ERROR;
+
+  tile_data->coded_tile_data_size = pbi->tile_buffers[tile_row][tile_col].size;
+  tile_data->coded_tile_data = pbi->tile_buffers[tile_row][tile_col].data;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_EXT_REF_PTR: converts the caller-supplied
+// external reference images into YV12_BUFFER_CONFIG entries stored in
+// ctx->ext_refs (no pixel data is copied; see image2yuvconfig()).
+//
+// Returns AOM_CODEC_INVALID_PARAM when the argument is NULL, when the image
+// count is negative or does not fit in ctx->ext_refs.refs, or when a non-zero
+// count comes with a NULL image pointer. Returns AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  av1_ext_ref_frame_t *const ext_frames = va_arg(args, av1_ext_ref_frame_t *);
+  if (ext_frames == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // The count comes straight from the application, so validate it before it
+  // is used as a loop bound for writes into ctx->ext_refs.refs[].
+  // NOTE(review): assumes ctx->ext_refs.refs is a fixed-size array member, so
+  // that sizeof yields its capacity -- confirm against its declaration.
+  const int max_refs =
+      (int)(sizeof(ctx->ext_refs.refs) / sizeof(ctx->ext_refs.refs[0]));
+  if (ext_frames->num < 0 || ext_frames->num > max_refs)
+    return AOM_CODEC_INVALID_PARAM;
+  if (ext_frames->num > 0 && ext_frames->img == NULL)
+    return AOM_CODEC_INVALID_PARAM;
+
+  ctx->ext_refs.num = ext_frames->num;
+  for (int i = 0; i < ctx->ext_refs.num; i++) {
+    // Unchanged behavior: the caller's img pointer is advanced past each
+    // image that is consumed.
+    image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]);
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler registered for AV1D_GET_DISPLAY_SIZE: copies render_width
+// and render_height from the decoder's AV1_COMMON into the caller's int array
+// (index 0 = width, index 1 = height).
+// Returns AOM_CODEC_INVALID_PARAM for a NULL array and AOM_CODEC_ERROR when no
+// frame worker exists.
+static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const dims = va_arg(args, int *);
+  if (dims == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  const AV1_COMMON *const common = &fwd->pbi->common;
+  dims[0] = common->render_width;
+  dims[1] = common->render_height;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_GET_BIT_DEPTH: writes seq_params->bit_depth of the
+// decoder's AV1_COMMON to the caller's unsigned int.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const out = va_arg(args, unsigned int *);
+  if (out == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+  *out = fwd->pbi->common.seq_params->bit_depth;
+  return AOM_CODEC_OK;
+}
+
+// Maps a chroma subsampling pair to an image format:
+//   (x=0, y=0) -> AOM_IMG_FMT_I444
+//   (x=1, y=0) -> AOM_IMG_FMT_I422
+//   (x=1, y=1) -> AOM_IMG_FMT_I420
+// Any other pair leaves the base format at 0. When use_highbitdepth is
+// non-zero, AOM_IMG_FMT_HIGHBITDEPTH is OR-ed into the result.
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+                                    int use_highbitdepth) {
+  aom_img_fmt_t layout = 0;
+
+  if (subsampling_x == 1) {
+    if (subsampling_y == 1) {
+      layout = AOM_IMG_FMT_I420;
+    } else if (subsampling_y == 0) {
+      layout = AOM_IMG_FMT_I422;
+    }
+  } else if (subsampling_x == 0 && subsampling_y == 0) {
+    layout = AOM_IMG_FMT_I444;
+  }
+
+  if (use_highbitdepth) {
+    layout |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+  return layout;
+}
+
+// Control handler for AV1D_GET_IMG_FORMAT: derives the image format from the
+// sequence parameters (subsampling_x, subsampling_y, use_highbitdepth) via
+// get_img_format() and writes it to the caller's aom_img_fmt_t.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_img_fmt_t *const out = va_arg(args, aom_img_fmt_t *);
+  if (out == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+  const AV1_COMMON *const common = &fwd->pbi->common;
+  *out = get_img_format(common->seq_params->subsampling_x,
+                        common->seq_params->subsampling_y,
+                        common->seq_params->use_highbitdepth);
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_GET_TILE_SIZE: queries the uniform tile size with
+// av1_get_uniform_tile_size() and packs it into one unsigned int, with
+// tile_width * MI_SIZE shifted left by 16 bits and tile_height * MI_SIZE added
+// below it.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const packed = va_arg(args, unsigned int *);
+  if (packed == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+  const AV1_COMMON *const common = &fwd->pbi->common;
+  int tile_width;
+  int tile_height;
+  av1_get_uniform_tile_size(common, &tile_width, &tile_height);
+  *packed = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_GET_TILE_COUNT: writes pbi->tile_count_minus_1 + 1
+// to the caller's unsigned int.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  unsigned int *const out = va_arg(args, unsigned int *);
+  if (out == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+  *out = fwd->pbi->tile_count_minus_1 + 1;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AOMD_GET_BASE_Q_IDX: writes
+// common.quant_params.base_qindex of the decoder to the caller's int.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_base_q_idx(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const out = va_arg(args, int *);
+  if (out != NULL) {
+    if (ctx->frame_worker != NULL) {
+      const FrameWorkerData *const fwd =
+          (FrameWorkerData *)ctx->frame_worker->data1;
+      *out = fwd->pbi->common.quant_params.base_qindex;
+      return AOM_CODEC_OK;
+    }
+    return AOM_CODEC_ERROR;
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler for AOMD_GET_SHOW_FRAME_FLAG: writes common.show_frame of
+// the decoder to the caller's int.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_show_frame_flag(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *const out = va_arg(args, int *);
+  if (out != NULL) {
+    if (ctx->frame_worker != NULL) {
+      const FrameWorkerData *const fwd =
+          (FrameWorkerData *)ctx->frame_worker->data1;
+      *out = fwd->pbi->common.show_frame;
+      return AOM_CODEC_OK;
+    }
+    return AOM_CODEC_ERROR;
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler for AOMD_GET_ORDER_HINT: writes
+// common.current_frame.order_hint of the decoder to the caller's unsigned int.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR when
+// no frame worker exists.
+static aom_codec_err_t ctrl_get_order_hint(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  unsigned int *const out = va_arg(args, unsigned int *);
+  if (out != NULL) {
+    if (ctx->frame_worker != NULL) {
+      const FrameWorkerData *const fwd =
+          (FrameWorkerData *)ctx->frame_worker->data1;
+      *out = fwd->pbi->common.current_frame.order_hint;
+      return AOM_CODEC_OK;
+    }
+    return AOM_CODEC_ERROR;
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler for AV1D_GET_MI_INFO: copies the MB_MODE_INFO stored at
+// (mi_row, mi_col) in the decoder's mode-info grid into the caller's buffer.
+//
+// Variadic arguments, in order: int mi_row, int mi_col, MB_MODE_INFO *mi.
+//
+// Returns AOM_CODEC_INVALID_PARAM when mi is NULL or the position lies outside
+// [0, mi_rows) x [0, mi_cols); AOM_CODEC_ERROR when there is no frame worker,
+// no worker data, or no mode info available at that position; AOM_CODEC_OK
+// after a successful copy.
+static aom_codec_err_t ctrl_get_mi_info(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  const int mi_row = va_arg(args, int);
+  const int mi_col = va_arg(args, int);
+  MB_MODE_INFO *const mi = va_arg(args, MB_MODE_INFO *);
+  if (mi == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  if (frame_worker_data == NULL) return AOM_CODEC_ERROR;
+
+  const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+  const int mi_rows = cm->mi_params.mi_rows;
+  const int mi_cols = cm->mi_params.mi_cols;
+
+  if (mi_row < 0 || mi_row >= mi_rows || mi_col < 0 || mi_col >= mi_cols) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  // Compute the grid offset only after the coordinates are known to be in
+  // range, so that caller-controlled values cannot overflow the multiply.
+  const int offset = mi_row * cm->mi_params.mi_stride + mi_col;
+
+  // Do not hand a NULL source to memcpy() if the grid or this entry has not
+  // been populated.
+  if (cm->mi_params.mi_grid_base == NULL) return AOM_CODEC_ERROR;
+  const MB_MODE_INFO *const src = cm->mi_params.mi_grid_base[offset];
+  if (src == NULL) return AOM_CODEC_ERROR;
+
+  memcpy(mi, src, sizeof(*mi));
+
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_INVERT_TILE_DECODE_ORDER: stores the int argument in
+// ctx->invert_tile_order. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  const int requested = va_arg(args, int);
+  ctx->invert_tile_order = requested;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_SET_BYTE_ALIGNMENT. Accepts 0 (the legacy value) or
+// a power of two in [32, 1024]; anything else yields AOM_CODEC_INVALID_PARAM.
+// The accepted value is stored in ctx->byte_alignment and, when a frame worker
+// exists, also in the decoder's common.features.byte_alignment.
+static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  const int legacy_byte_alignment = 0;
+  const int min_byte_alignment = 32;
+  const int max_byte_alignment = 1024;
+  const int requested = va_arg(args, int);
+
+  if (requested != legacy_byte_alignment) {
+    // Range check first: the power-of-two test is only evaluated for values
+    // already known to be within [min, max].
+    if (requested < min_byte_alignment || requested > max_byte_alignment)
+      return AOM_CODEC_INVALID_PARAM;
+    if ((requested & (requested - 1)) != 0) return AOM_CODEC_INVALID_PARAM;
+  }
+
+  ctx->byte_alignment = requested;
+  if (ctx->frame_worker != NULL) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_worker->data1;
+    fwd->pbi->common.features.byte_alignment = requested;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_SET_SKIP_LOOP_FILTER: stores the int argument in
+// ctx->skip_loop_filter and, when a frame worker exists, mirrors the stored
+// value into pbi->skip_loop_filter. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  ctx->skip_loop_filter = va_arg(args, int);
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker != NULL) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+    fwd->pbi->skip_loop_filter = ctx->skip_loop_filter;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_SKIP_FILM_GRAIN: stores the int argument in
+// ctx->skip_film_grain and, when a frame worker exists, mirrors the stored
+// value into pbi->skip_film_grain. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->skip_film_grain = va_arg(args, int);
+
+  AVxWorker *const worker = ctx->frame_worker;
+  if (worker != NULL) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+    fwd->pbi->skip_film_grain = ctx->skip_film_grain;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_GET_ACCOUNTING. With CONFIG_ACCOUNTING enabled it
+// stores the address of pbi->accounting in the caller's Accounting pointer;
+// otherwise it returns AOM_CODEC_INCAPABLE without reading the arguments.
+// In the enabled build, returns AOM_CODEC_INVALID_PARAM for a NULL argument
+// and AOM_CODEC_ERROR when no frame worker exists.
+static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+#if CONFIG_ACCOUNTING
+  Accounting **const out = va_arg(args, Accounting **);
+  if (out == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_worker->data1;
+  *out = &fwd->pbi->accounting;
+  return AOM_CODEC_OK;
+#else
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#endif  // CONFIG_ACCOUNTING
+}
+
+// Control handler for AV1_SET_DECODE_TILE_ROW: stores the int argument in
+// ctx->decode_tile_row. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int tile_row = va_arg(args, int);
+  ctx->decode_tile_row = tile_row;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_SET_DECODE_TILE_COL: stores the int argument in
+// ctx->decode_tile_col. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int tile_col = va_arg(args, int);
+  ctx->decode_tile_col = tile_col;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_SET_TILE_MODE: stores the unsigned int argument in
+// ctx->tile_mode. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const unsigned int mode = va_arg(args, unsigned int);
+  ctx->tile_mode = mode;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_IS_ANNEXB: stores the unsigned int argument in
+// ctx->is_annexb. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const unsigned int annexb = va_arg(args, unsigned int);
+  ctx->is_annexb = annexb;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_OPERATING_POINT: stores the int argument in
+// ctx->operating_point. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int op_point = va_arg(args, int);
+  ctx->operating_point = op_point;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_OUTPUT_ALL_LAYERS: stores the int argument in
+// ctx->output_all_layers. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  const int all_layers = va_arg(args, int);
+  ctx->output_all_layers = all_layers;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1_SET_INSPECTION_CALLBACK. With CONFIG_INSPECTION
+// enabled it copies inspect_cb and inspect_ctx from the caller's
+// aom_inspect_init into ctx; otherwise it returns AOM_CODEC_INCAPABLE without
+// reading the arguments.
+// In the enabled build, returns AOM_CODEC_INVALID_PARAM for a NULL argument,
+// consistent with the other pointer-taking control handlers.
+static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+#if !CONFIG_INSPECTION
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  aom_inspect_init *init = va_arg(args, aom_inspect_init *);
+  // Reject a NULL descriptor instead of dereferencing it.
+  if (init == NULL) return AOM_CODEC_INVALID_PARAM;
+  ctx->inspect_cb = init->inspect_cb;
+  ctx->inspect_ctx = init->inspect_ctx;
+  return AOM_CODEC_OK;
+#endif
+}
+
+// Control handler for AV1D_EXT_TILE_DEBUG: stores the int argument in
+// ctx->ext_tile_debug. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  const int debug_flag = va_arg(args, int);
+  ctx->ext_tile_debug = debug_flag;
+  return AOM_CODEC_OK;
+}
+
+// Control handler for AV1D_SET_ROW_MT: stores the unsigned int argument in
+// ctx->row_mt. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  const unsigned int row_mt = va_arg(args, unsigned int);
+  ctx->row_mt = row_mt;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {  /* Dispatch table:
+ * each entry pairs a decoder control ID with the handler that services it.
+ * Both interface tables below (aom_codec_av1_dx_algo and
+ * aom_codec_av1_inspect_algo) reference this array. CTRL_MAP_END is the
+ * final entry. */
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+
+ // Setters: handlers that store a caller-supplied value or pointer.
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
+ { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
+ { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
+ { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
+ { AV1_SET_TILE_MODE, ctrl_set_tile_mode },
+ { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb },
+ { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point },
+ { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
+ { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
+ { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+ { AV1D_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+ { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
+
+ // Getters: handlers that write decoder state through a caller pointer.
+ { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
+ { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
+ { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
+ { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+ { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+ { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
+ { AV1D_GET_TILE_COUNT, ctrl_get_tile_count },
+ { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },  // reports render size
+ { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
+ { AV1_GET_ACCOUNTING, ctrl_get_accounting },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
+ { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
+ { AOMD_GET_FWD_KF_PRESENT, ctrl_get_fwd_kf_value },
+ { AOMD_GET_ALTREF_PRESENT, ctrl_get_altref_present },
+ { AOMD_GET_FRAME_FLAGS, ctrl_get_frame_flags },
+ { AOMD_GET_TILE_INFO, ctrl_get_tile_info },
+ { AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, ctrl_get_screen_content_tools_info },
+ { AOMD_GET_STILL_PICTURE, ctrl_get_still_picture },
+ { AOMD_GET_SB_SIZE, ctrl_get_sb_size },
+ { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag },
+ { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info },
+ { AOMD_GET_SHOW_FRAME_FLAG, ctrl_get_show_frame_flag },
+ { AOMD_GET_BASE_Q_IDX, ctrl_get_base_q_idx },
+ { AOMD_GET_ORDER_HINT, ctrl_get_order_hint },
+ { AV1D_GET_MI_INFO, ctrl_get_mi_info },
+ CTRL_MAP_END,
+};
+
+// This data structure and function are exported in aom/aomdx.h
+#ifndef VERSION_STRING
+#define VERSION_STRING  // fallback: empty suffix when the build defines none
+#endif
+aom_codec_iface_t aom_codec_av1_dx_algo = {  /* AV1 decoder interface table.
+ * Matches aom_codec_av1_inspect_algo except for the name string and the
+ * decode entry, which here is decoder_decode. The encoder member group is
+ * left empty (zero and NULL entries). */
+ "AOMedia Project AV1 Decoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_DECODER |
+ AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t
+ decoder_init, // aom_codec_init_fn_t
+ decoder_destroy, // aom_codec_destroy_fn_t
+ decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ decoder_peek_si, // aom_codec_peek_si_fn_t
+ decoder_get_si, // aom_codec_get_si_fn_t
+ decoder_decode, // aom_codec_decode_fn_t
+ decoder_get_frame, // aom_codec_get_frame_fn_t
+ decoder_set_fb_fn, // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 0,  // presumably the encoder config count -- confirm in aom_codec_iface
+ NULL, // aom_codec_enc_cfg_t
+ NULL, // aom_codec_encode_fn_t
+ NULL, // aom_codec_get_cx_data_fn_t
+ NULL, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ NULL // aom_codec_get_preview_frame_fn_t
+ },
+ NULL // aom_codec_set_option_fn_t
+};
+
+// Decoder interface for inspecting frame data. It uses decoder_inspect instead
+// of decoder_decode so it only decodes one frame at a time, whether the frame
+// is shown or not.
+aom_codec_iface_t aom_codec_av1_inspect_algo = {  /* Shares decoder_init,
+ * decoder_destroy and decoder_ctrl_maps with aom_codec_av1_dx_algo; only the
+ * name string and the decode entry differ. */
+ "AOMedia Project AV1 Decoder Inspector" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_DECODER |
+ AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t
+ decoder_init, // aom_codec_init_fn_t
+ decoder_destroy, // aom_codec_destroy_fn_t
+ decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ decoder_peek_si, // aom_codec_peek_si_fn_t
+ decoder_get_si, // aom_codec_get_si_fn_t
+ decoder_inspect, // aom_codec_decode_fn_t (inspection variant)
+ decoder_get_frame, // aom_codec_get_frame_fn_t
+ decoder_set_fb_fn, // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 0,  // presumably the encoder config count -- confirm in aom_codec_iface
+ NULL, // aom_codec_enc_cfg_t
+ NULL, // aom_codec_encode_fn_t
+ NULL, // aom_codec_get_cx_data_fn_t
+ NULL, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ NULL // aom_codec_get_preview_frame_fn_t
+ },
+ NULL // aom_codec_set_option_fn_t
+};
+
+// Returns the address of the AV1 decoder interface table,
+// aom_codec_av1_dx_algo.
+aom_codec_iface_t *aom_codec_av1_dx(void) {
+  aom_codec_iface_t *const iface = &aom_codec_av1_dx_algo;
+  return iface;
+}
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
new file mode 100644
index 0000000000..b923c3dcff
--- /dev/null
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
+
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+extern aom_codec_iface_t aom_codec_av1_inspect_algo;
+
+static AOM_INLINE void yuvconfig2image(aom_image_t *img,
+ const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv) {
+ /* Fills every field of `img` so that it describes the frame held in `yv12`.
+ * No pixel data is copied: the plane pointers refer to the yv12 buffers, and
+ * img_data_owner / self_allocd are cleared. `user_priv` is stored unchanged
+ * in img->user_priv.
+ *
+ * aom_img_wrap() doesn't allow specifying independent strides for
+ * the Y, U, and V planes, nor other alignment adjustments that
+ * might be representable by a YV12_BUFFER_CONFIG, so we just
+ * initialize all the fields.
+ */
+ int bps;  // becomes img->bps: 24/16/12 per format, doubled for high bitdepth
+ if (!yv12->subsampling_y) {  // no vertical chroma subsampling
+ if (!yv12->subsampling_x) {
+ img->fmt = AOM_IMG_FMT_I444;
+ bps = 24;
+ } else {
+ img->fmt = AOM_IMG_FMT_I422;
+ bps = 16;
+ }
+ } else {  // any vertical subsampling is reported as I420
+ img->fmt = AOM_IMG_FMT_I420;
+ bps = 12;
+ }
+ img->cp = yv12->color_primaries;
+ img->tc = yv12->transfer_characteristics;
+ img->mc = yv12->matrix_coefficients;
+ img->monochrome = yv12->monochrome;
+ img->csp = yv12->chroma_sample_position;
+ img->range = yv12->color_range;
+ img->bit_depth = 8;  // default; replaced below for high-bitdepth buffers
+ img->w = yv12->y_width;
+ img->h = yv12->y_height;
+ img->d_w = yv12->y_crop_width;
+ img->d_h = yv12->y_crop_height;
+ img->r_w = yv12->render_width;
+ img->r_h = yv12->render_height;
+ img->x_chroma_shift = yv12->subsampling_x;
+ img->y_chroma_shift = yv12->subsampling_y;
+ img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+ img->planes[AOM_PLANE_U] = yv12->u_buffer;
+ img->planes[AOM_PLANE_V] = yv12->v_buffer;
+ img->stride[AOM_PLANE_Y] = yv12->y_stride;
+ img->stride[AOM_PLANE_U] = yv12->uv_stride;  // one stride for both chroma planes
+ img->stride[AOM_PLANE_V] = yv12->uv_stride;
+ if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+ bps *= 2;
+ // aom_image_t uses byte strides and a pointer to the first byte
+ // of the image.
+ img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+ img->bit_depth = yv12->bit_depth;
+ img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+ img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+ img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+ img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;  // stride doubled to bytes
+ img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+ img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+ }
+ img->bps = bps;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+ img->sz = yv12->frame_size;
+ assert(!yv12->metadata);  // metadata is not transferred; img gets NULL below
+ img->metadata = NULL;
+}
+
+static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {  /* Describes `img` as a YV12_BUFFER_CONFIG.
+ * Plane pointers are shared with `img` (nothing is copied); chroma
+ * dimensions, strides, border, flags and colour fields are derived from the
+ * image fields. Always returns AOM_CODEC_OK. */
+ yv12->y_buffer = img->planes[AOM_PLANE_Y];
+ yv12->u_buffer = img->planes[AOM_PLANE_U];
+ yv12->v_buffer = img->planes[AOM_PLANE_V];
+
+ yv12->y_crop_width = img->d_w;
+ yv12->y_crop_height = img->d_h;
+ yv12->render_width = img->r_w;
+ yv12->render_height = img->r_h;
+ yv12->y_width = img->w;
+ yv12->y_height = img->h;
+
+ // Chroma size = (luma size + shift) >> shift; rounds up for a shift of 1.
+ yv12->uv_width = (yv12->y_width + img->x_chroma_shift) >> img->x_chroma_shift;
+ yv12->uv_height =
+ (yv12->y_height + img->y_chroma_shift) >> img->y_chroma_shift;
+ yv12->uv_crop_width =
+ (yv12->y_crop_width + img->x_chroma_shift) >> img->x_chroma_shift;
+ yv12->uv_crop_height =
+ (yv12->y_crop_height + img->y_chroma_shift) >> img->y_chroma_shift;
+
+ yv12->y_stride = img->stride[AOM_PLANE_Y];
+ yv12->uv_stride = img->stride[AOM_PLANE_U];  // the V stride is not read
+ yv12->color_primaries = img->cp;
+ yv12->transfer_characteristics = img->tc;
+ yv12->matrix_coefficients = img->mc;
+ yv12->monochrome = img->monochrome;
+ yv12->chroma_sample_position = img->csp;
+ yv12->color_range = img->range;
+
+ if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ // In aom_image_t
+ // planes point to uint8 address of start of data
+ // stride counts uint8s to reach next row
+ // In YV12_BUFFER_CONFIG
+ // y_buffer, u_buffer, v_buffer point to uint16 address of data
+ // stride and border counts in uint16s
+ // This means that all the address calculations in the main body of code
+ // should work correctly.
+ // However, before we do any pixel operations we need to cast the address
+ // to a uint16 pointer and double its value.
+ yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+ yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+ yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+ yv12->y_stride >>= 1;  // byte stride -> stride in uint16 units
+ yv12->uv_stride >>= 1;
+ yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ yv12->flags = 0;
+ }
+
+ // Note(yunqing): if img is allocated the same as the frame buffer, y_stride
+ // is 32-byte aligned. Also, handle the cases while allocating img without a
+ // border or stride_align is less than 32.
+ int border = (yv12->y_stride - (int)((img->w + 31) & ~31u)) / 2;
+ yv12->border = (border < 0) ? 0 : border;  // never store a negative border
+ yv12->subsampling_x = img->x_chroma_shift;
+ yv12->subsampling_y = img->y_chroma_shift;
+ yv12->metadata = img->metadata;
+ return AOM_CODEC_OK;
+}
+
+#endif // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
new file mode 100644
index 0000000000..2a9a8beb40
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -0,0 +1,506 @@
+/*
+ *
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/thread_common.h"
+
+// Returns mb_rows * mb_cols for a frame of the given width and height. Each
+// dimension is passed through ALIGN_POWER_OF_TWO(x, 3), shifted right by
+// MI_SIZE_LOG2 to obtain a mode-info count, and then through
+// ROUND_POWER_OF_TWO(count, 2).
+int av1_get_MBs(int width, int height) {
+  const int cols_in_mi = (ALIGN_POWER_OF_TWO(width, 3)) >> MI_SIZE_LOG2;
+  const int rows_in_mi = (ALIGN_POWER_OF_TWO(height, 3)) >> MI_SIZE_LOG2;
+
+  return (ROUND_POWER_OF_TWO(rows_in_mi, 2)) *
+         (ROUND_POWER_OF_TWO(cols_in_mi, 2));
+}
+
+// Releases everything owned by the frame buffers in `pool` and then the
+// frame_bufs array itself. For each buffer that is still referenced and has
+// raw frame data, the data is handed back through pool->release_fb_cb and the
+// raw buffer fields and ref_count are reset. The mvs and seg_map arrays and
+// the frame buffer are freed for every entry. On return pool->frame_bufs is
+// NULL and pool->num_frame_bufs is 0.
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+  for (int idx = 0; idx < pool->num_frame_bufs; ++idx) {
+    const int still_referenced = pool->frame_bufs[idx].ref_count > 0;
+    if (still_referenced &&
+        pool->frame_bufs[idx].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv,
+                          &pool->frame_bufs[idx].raw_frame_buffer);
+      pool->frame_bufs[idx].raw_frame_buffer.data = NULL;
+      pool->frame_bufs[idx].raw_frame_buffer.size = 0;
+      pool->frame_bufs[idx].raw_frame_buffer.priv = NULL;
+      pool->frame_bufs[idx].ref_count = 0;
+    }
+
+    aom_free(pool->frame_bufs[idx].mvs);
+    pool->frame_bufs[idx].mvs = NULL;
+
+    aom_free(pool->frame_bufs[idx].seg_map);
+    pool->frame_bufs[idx].seg_map = NULL;
+
+    aom_free_frame_buffer(&pool->frame_bufs[idx].buf);
+  }
+
+  aom_free(pool->frame_bufs);
+  pool->frame_bufs = NULL;
+  pool->num_frame_bufs = 0;
+}
+
+// Frees the CDEF line buffer of every plane whose requested size differs from
+// the size recorded in cm->cdef_info.allocated_linebuf_size, and clears the
+// pointer. Planes whose size is unchanged are left alone.
+static INLINE void free_cdef_linebuf_conditional(
+    AV1_COMMON *const cm, const size_t *new_linebuf_size) {
+  CdefInfo *const info = &cm->cdef_info;
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    if (new_linebuf_size[p] == info->allocated_linebuf_size[p]) continue;
+    aom_free(info->linebuf[p]);
+    info->linebuf[p] = NULL;
+  }
+}
+
+// Frees the CDEF source buffer and the per-plane column buffers whose
+// requested sizes differ from the sizes recorded in cm->cdef_info, clearing
+// each freed pointer. Buffers whose size is unchanged are left alone.
+static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm,
+                                              uint16_t **colbuf,
+                                              uint16_t **srcbuf,
+                                              const size_t *new_colbuf_size,
+                                              const size_t new_srcbuf_size) {
+  const CdefInfo *const info = &cm->cdef_info;
+
+  if (info->allocated_srcbuf_size != new_srcbuf_size) {
+    aom_free(*srcbuf);
+    *srcbuf = NULL;
+  }
+
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    if (info->allocated_colbuf_size[p] == new_colbuf_size[p]) continue;
+    aom_free(colbuf[p]);
+    colbuf[p] = NULL;
+  }
+}
+
+// Unconditionally frees the CDEF source buffer and all MAX_MB_PLANE column
+// buffers, leaving every pointer NULL.
+static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) {
+  aom_free(*srcbuf);
+  *srcbuf = NULL;
+
+  int p = 0;
+  while (p < MAX_MB_PLANE) {
+    aom_free(colbuf[p]);
+    colbuf[p] = NULL;
+    ++p;
+  }
+}
+
+// Destroys and frees the per-row CDEF synchronization array. With
+// CONFIG_MULTITHREAD each row's mutex and condition variable (when present)
+// are destroyed and freed first. Does nothing when *cdef_row_mt is NULL;
+// otherwise *cdef_row_mt is NULL on return.
+static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt,
+                                      const int num_mi_rows) {
+  AV1CdefRowSync *const rows = *cdef_row_mt;
+  if (rows == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  for (int r = 0; r < num_mi_rows; ++r) {
+    AV1CdefRowSync *const sync = &rows[r];
+    if (sync->row_mutex_ != NULL) {
+      pthread_mutex_destroy(sync->row_mutex_);
+      aom_free(sync->row_mutex_);
+    }
+    if (sync->row_cond_ != NULL) {
+      pthread_cond_destroy(sync->row_cond_);
+      aom_free(sync->row_cond_);
+    }
+  }
+#else
+  (void)num_mi_rows;
+#endif  // CONFIG_MULTITHREAD
+
+  aom_free(rows);
+  *cdef_row_mt = NULL;
+}
+
+// Frees all CDEF buffers: the line, column and source buffers held in
+// cm->cdef_info (worker 0), the row synchronization array in cdef_sync, and,
+// when at least two workers were allocated and *cdef_worker is non-NULL, the
+// column/source buffers of workers 1..N-1 followed by the worker array
+// itself.
+void av1_free_cdef_buffers(AV1_COMMON *const cm,
+                           AV1CdefWorkerData **cdef_worker,
+                           AV1CdefSync *cdef_sync) {
+  CdefInfo *const info = &cm->cdef_info;
+  const int rows_allocated = info->allocated_mi_rows;
+
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    aom_free(info->linebuf[p]);
+    info->linebuf[p] = NULL;
+  }
+
+  // Worker 0 keeps its column and source buffers directly in cdef_info.
+  free_cdef_bufs(info->colbuf, &info->srcbuf);
+
+  free_cdef_row_sync(&cdef_sync->cdef_row_mt, rows_allocated);
+
+  const int worker_count = info->allocated_num_workers;
+  if (worker_count < 2 || *cdef_worker == NULL) return;
+
+  // Remaining workers own separate column and source buffers.
+  for (int w = worker_count - 1; w >= 1; --w) {
+    free_cdef_bufs((*cdef_worker)[w].colbuf, &(*cdef_worker)[w].srcbuf);
+  }
+  aom_free(*cdef_worker);
+  *cdef_worker = NULL;
+}
+
+// Allocates the CDEF line buffer of each of the first num_planes planes that
+// does not have one yet, using the sizes recorded in
+// cm->cdef_info.allocated_linebuf_size. Allocation goes through
+// CHECK_MEM_ERROR.
+static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf,
+                                      const int num_planes) {
+  const CdefInfo *const info = &cm->cdef_info;
+  int plane = 0;
+  while (plane < num_planes) {
+    if (linebuf[plane] == NULL) {
+      CHECK_MEM_ERROR(cm, linebuf[plane],
+                      aom_malloc(info->allocated_linebuf_size[plane]));
+    }
+    ++plane;
+  }
+}
+
+// Allocates the CDEF source buffer (16-byte aligned) and the column buffer of
+// each of the first num_planes planes, skipping any that already exist. Sizes
+// come from cm->cdef_info; allocation goes through CHECK_MEM_ERROR.
+static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf,
+                                   uint16_t **srcbuf, const int num_planes) {
+  const CdefInfo *const info = &cm->cdef_info;
+
+  if (*srcbuf == NULL) {
+    CHECK_MEM_ERROR(cm, *srcbuf,
+                    aom_memalign(16, info->allocated_srcbuf_size));
+  }
+
+  int plane = 0;
+  while (plane < num_planes) {
+    if (colbuf[plane] == NULL) {
+      CHECK_MEM_ERROR(cm, colbuf[plane],
+                      aom_malloc(info->allocated_colbuf_size[plane]));
+    }
+    ++plane;
+  }
+}
+
+// Allocates the zero-initialized per-row CDEF synchronization array with
+// num_mi_rows entries, unless *cdef_row_mt already exists. With
+// CONFIG_MULTITHREAD a mutex and a condition variable are additionally
+// allocated and initialized for every row. Allocation goes through
+// CHECK_MEM_ERROR.
+static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm,
+                                       AV1CdefRowSync **cdef_row_mt,
+                                       const int num_mi_rows) {
+  if (*cdef_row_mt == NULL) {
+    CHECK_MEM_ERROR(cm, *cdef_row_mt,
+                    aom_calloc(num_mi_rows, sizeof(**cdef_row_mt)));
+#if CONFIG_MULTITHREAD
+    for (int row_idx = 0; row_idx < num_mi_rows; ++row_idx) {
+      // Mutex guarding this row's state.
+      CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_,
+                      aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_)));
+      pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL);
+
+      // Condition variable paired with the mutex above.
+      CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_,
+                      aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_)));
+      pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL);
+    }
+#endif  // CONFIG_MULTITHREAD
+  }
+}
+
+void av1_alloc_cdef_buffers(AV1_COMMON *const cm,
+ AV1CdefWorkerData **cdef_worker,
+ AV1CdefSync *cdef_sync, int num_workers,
+ int init_worker) {  /* (Re)allocates the CDEF working buffers.
+ * Steps, in order: compute the source, line and column buffer sizes needed
+ * for the current configuration; free every existing buffer whose recorded
+ * size differs (and the worker array / row sync when the worker count / row
+ * count changed); record the new sizes and configuration in cm->cdef_info;
+ * then allocate whatever is missing. When CDEF is not enabled all computed
+ * sizes are zero and the function returns after the recording step without
+ * allocating. Allocation goes through CHECK_MEM_ERROR. */
+ const int num_planes = av1_num_planes(cm);
+ size_t new_linebuf_size[MAX_MB_PLANE] = { 0 };
+ size_t new_colbuf_size[MAX_MB_PLANE] = { 0 };
+ size_t new_srcbuf_size = 0;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ // Check for configuration change
+ const int num_mi_rows =
+ (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;  // rounds up
+ const int is_num_workers_changed =
+ cdef_info->allocated_num_workers != num_workers;
+ const int is_cdef_enabled =
+ cm->seq_params->enable_cdef && !cm->tiles.large_scale;
+
+ // num-bufs=3 represents ping-pong buffers for top linebuf,
+ // followed by bottom linebuf.
+ // ping-pong is to avoid top linebuf over-write by consecutive row.
+ int num_bufs = 3;
+ if (num_workers > 1)
+ num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+ if (is_cdef_enabled) {
+ // Calculate src buffer size
+ new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE;
+ for (int plane = 0; plane < num_planes; plane++) {
+ const int shift =
+ plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x;
+ /* Calculate top and bottom line buffer size.
+ * NOTE(review): sizeof(*cdef_info->linebuf) is the size of one linebuf[]
+ * element. linebuf[plane] is handed to aom_free() elsewhere in this file,
+ * so that element looks like a pointer, whereas the column buffer below is
+ * sized with sizeof(*cdef_info->colbuf[plane]). Confirm whether
+ * sizeof(*cdef_info->linebuf[plane]) was intended; as written the line
+ * buffer may simply be larger than required.
+ */
+ const int luma_stride =
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+ new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs *
+ (CDEF_VBORDER << 1) * (luma_stride >> shift);
+ // Calculate column buffer size
+ const int block_height =
+ (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
+ new_colbuf_size[plane] =
+ sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER;
+ }
+ }
+
+ // Free src, line and column buffers for worker 0 in case of reallocation
+ free_cdef_linebuf_conditional(cm, new_linebuf_size);
+ free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf,
+ new_colbuf_size, new_srcbuf_size);
+
+ // The flag init_worker indicates if cdef_worker has to be allocated for the
+ // frame. This is passed as 1 always from decoder. At encoder side, it is 0
+ // when called for parallel frames during FPMT (where cdef_worker is shared
+ // across parallel frames) and 1 otherwise.
+ if (*cdef_worker != NULL && init_worker) {
+ if (is_num_workers_changed) {
+ // Free src and column buffers for remaining workers in case of change in
+ // num_workers
+ for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--)
+ free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
+
+ aom_free(*cdef_worker);
+ *cdef_worker = NULL;
+ } else if (num_workers > 1) {
+ // Free src and column buffers for remaining workers in case of
+ // reallocation
+ for (int idx = num_workers - 1; idx >= 1; idx--)
+ free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf,
+ &(*cdef_worker)[idx].srcbuf, new_colbuf_size,
+ new_srcbuf_size);
+ }
+ }
+
+ if (cdef_info->allocated_mi_rows != num_mi_rows)
+ free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows);
+
+ // Store allocated sizes for reallocation
+ // (done only after the conditional frees above, which compare against the
+ // previously recorded values)
+ cdef_info->allocated_srcbuf_size = new_srcbuf_size;
+ av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size);
+ av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size);
+ // Store configuration to check change in configuration
+ cdef_info->allocated_mi_rows = num_mi_rows;
+ cdef_info->allocated_num_workers = num_workers;
+
+ if (!is_cdef_enabled) return;  // zero sizes were recorded; nothing to allocate
+
+ // Memory allocation of column buffer & source buffer (worker_0).
+ alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes);
+ alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes);
+
+ if (num_workers < 2) return;  // no per-worker buffers or row sync needed
+
+ if (init_worker) {
+ if (*cdef_worker == NULL)
+ CHECK_MEM_ERROR(cm, *cdef_worker,
+ aom_calloc(num_workers, sizeof(**cdef_worker)));
+
+ // Memory allocation of column buffer & source buffer for remaining workers.
+ for (int idx = num_workers - 1; idx >= 1; idx--)
+ alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf,
+ &(*cdef_worker)[idx].srcbuf, num_planes);
+ }
+
+ alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt,
+ cdef_info->allocated_mi_rows);
+}
+
+// Allocate buffers which are independent of restoration_unit_size
+//
+// Allocates, when missing: cm->rst_tmpbuf (only if is_sgr_enabled), cm->rlbs,
+// and for every plane the "above" and "below" stripe boundary buffers. The
+// stripe buffers are reallocated when the required size changed or either
+// pointer is NULL. Allocation goes through CHECK_MEM_ERROR.
+void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) {
+  const int num_planes = av1_num_planes(cm);
+
+  if (cm->rst_tmpbuf == NULL && is_sgr_enabled) {
+    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+  }
+
+  if (cm->rlbs == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+  }
+
+  // For striped loop restoration, we divide each plane into "stripes",
+  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+  // luma pixels to match the output from CDEF. We will need to store 2 *
+  // RESTORATION_CTX_VERT lines of data for each stripe.
+  int mi_h = cm->mi_params.mi_rows;
+  const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+  const int num_stripes = (ext_h + 63) / 64;
+
+  // Now we need to allocate enough space to store the line buffers for the
+  // stripes
+  const int frame_w = cm->superres_upscaled_width;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const int is_uv = p > 0;
+    const int ss_x = is_uv && cm->seq_params->subsampling_x;
+    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
+    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
+    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
+                         << use_highbd;
+    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+
+    if (buf_size != boundaries->stripe_boundary_size ||
+        boundaries->stripe_boundary_above == NULL ||
+        boundaries->stripe_boundary_below == NULL) {
+      // Clear each pointer right after freeing it. If one of the allocations
+      // below fails, CHECK_MEM_ERROR is expected to leave this function early
+      // (confirm against its definition); a later
+      // av1_free_restoration_buffers() must then not see stale pointers and
+      // free them a second time.
+      aom_free(boundaries->stripe_boundary_above);
+      boundaries->stripe_boundary_above = NULL;
+      aom_free(boundaries->stripe_boundary_below);
+      boundaries->stripe_boundary_below = NULL;
+      boundaries->stripe_boundary_size = 0;
+
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
+                      (uint8_t *)aom_memalign(32, buf_size));
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
+                      (uint8_t *)aom_memalign(32, buf_size));
+
+      boundaries->stripe_boundary_size = buf_size;
+    }
+    boundaries->stripe_boundary_stride = stride;
+  }
+}
+
+// Releases every loop-restoration allocation reachable from 'cm': the
+// per-plane restoration structs, the shared scratch and line buffers, the
+// per-plane stripe boundary buffers and finally the restoration frame buffer.
+// Each freed pointer is reset to NULL.
+void av1_free_restoration_buffers(AV1_COMMON *cm) {
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    av1_free_restoration_struct(&cm->rst_info[plane]);
+  }
+
+  aom_free(cm->rst_tmpbuf);
+  cm->rst_tmpbuf = NULL;
+
+  aom_free(cm->rlbs);
+  cm->rlbs = NULL;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    RestorationStripeBoundaries *const stripe =
+        &cm->rst_info[plane].boundaries;
+
+    aom_free(stripe->stripe_boundary_above);
+    stripe->stripe_boundary_above = NULL;
+
+    aom_free(stripe->stripe_boundary_below);
+    stripe->stripe_boundary_below = NULL;
+  }
+
+  aom_free_frame_buffer(&cm->rst_frame);
+}
+
+// Frees the per-tile-row above-context arrays (entropy per plane, partition,
+// txfm), then the row-pointer tables themselves, and resets the recorded
+// dimensions to zero. Row tables that are NULL are skipped.
+void av1_free_above_context_buffers(CommonContexts *above_contexts) {
+  const int plane_count = above_contexts->num_planes;
+  const int row_count = above_contexts->num_tile_rows;
+
+  // Pass 1: release the buffers hanging off each tile row.
+  for (int row = 0; row < row_count; ++row) {
+    for (int plane = 0; plane < plane_count; ++plane) {
+      // Stop at the first plane that has no row table.
+      if (above_contexts->entropy[plane] == NULL) break;
+      aom_free(above_contexts->entropy[plane][row]);
+      above_contexts->entropy[plane][row] = NULL;
+    }
+
+    if (above_contexts->partition != NULL) {
+      aom_free(above_contexts->partition[row]);
+      above_contexts->partition[row] = NULL;
+    }
+
+    if (above_contexts->txfm != NULL) {
+      aom_free(above_contexts->txfm[row]);
+      above_contexts->txfm[row] = NULL;
+    }
+  }
+
+  // Pass 2: release the row tables.
+  for (int plane = 0; plane < plane_count; ++plane) {
+    aom_free(above_contexts->entropy[plane]);
+    above_contexts->entropy[plane] = NULL;
+  }
+
+  aom_free(above_contexts->partition);
+  above_contexts->partition = NULL;
+
+  aom_free(above_contexts->txfm);
+  above_contexts->txfm = NULL;
+
+  // Nothing is allocated any more.
+  above_contexts->num_tile_rows = 0;
+  above_contexts->num_mi_cols = 0;
+  above_contexts->num_planes = 0;
+}
+
+// Frees the mode-info storage through the 'free_mi' callback (when one is
+// installed) and then the above-context buffers of 'cm'.
+void av1_free_context_buffers(AV1_COMMON *cm) {
+  if (cm->mi_params.free_mi != NULL) {
+    cm->mi_params.free_mi(&cm->mi_params);
+  }
+
+  av1_free_above_context_buffers(&cm->above_contexts);
+}
+
+// Allocates zero-initialised above-context storage: for every tile row, one
+// entropy array per plane plus one partition and one txfm array, each holding
+// 'num_mi_cols' rounded up to a multiple of (1 << MAX_MIB_SIZE_LOG2) entries.
+//
+// Returns 0 on success and 1 as soon as any allocation fails. On failure the
+// buffers allocated so far are left in place; the dimensions are recorded
+// before anything is allocated.
+// NOTE(review): callers presumably release a partially built state with
+// av1_free_above_context_buffers() -- confirm at the call sites.
+int av1_alloc_above_context_buffers(CommonContexts *above_contexts,
+                                    int num_tile_rows, int num_mi_cols,
+                                    int num_planes) {
+  const int aligned_mi_cols =
+      ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2);
+
+  // Allocate above context buffers
+  above_contexts->num_tile_rows = num_tile_rows;
+  above_contexts->num_mi_cols = aligned_mi_cols;
+  above_contexts->num_planes = num_planes;
+
+  // Row-pointer tables. Each sizeof names the element being allocated
+  // (*table), matching the per-row allocations below.
+  for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+    above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+        num_tile_rows, sizeof(*above_contexts->entropy[plane_idx]));
+    if (!above_contexts->entropy[plane_idx]) return 1;
+  }
+
+  above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc(
+      num_tile_rows, sizeof(*above_contexts->partition));
+  if (!above_contexts->partition) return 1;
+
+  above_contexts->txfm = (TXFM_CONTEXT **)aom_calloc(
+      num_tile_rows, sizeof(*above_contexts->txfm));
+  if (!above_contexts->txfm) return 1;
+
+  // Per-tile-row context arrays.
+  for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) {
+    for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+      above_contexts->entropy[plane_idx][tile_row] =
+          (ENTROPY_CONTEXT *)aom_calloc(
+              aligned_mi_cols,
+              sizeof(*above_contexts->entropy[plane_idx][tile_row]));
+      if (!above_contexts->entropy[plane_idx][tile_row]) return 1;
+    }
+
+    above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*above_contexts->partition[tile_row]));
+    if (!above_contexts->partition[tile_row]) return 1;
+
+    above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row]));
+    if (!above_contexts->txfm[tile_row]) return 1;
+  }
+
+  return 0;
+}
+
+// Allocate the dynamically allocated arrays in 'mi_params' assuming
+// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of
+// the struct members.
+//
+// Existing storage is kept when it is already large enough; otherwise it is
+// released through 'free_mi' and all three arrays are allocated afresh.
+// Returns 0 on success, 1 if an allocation failed.
+static int alloc_mi(CommonModeInfoParams *mi_params) {
+  const int rows_aligned = calc_mi_size(mi_params->mi_rows);
+  const int grid_entries = mi_params->mi_stride * rows_aligned;
+  const int alloc_unit = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int alloc_entries =
+      mi_params->mi_alloc_stride * (rows_aligned / alloc_unit);
+
+  // Nothing to do when both recorded capacities already cover the request.
+  if (mi_params->mi_alloc_size >= alloc_entries &&
+      mi_params->mi_grid_size >= grid_entries) {
+    return 0;
+  }
+
+  mi_params->free_mi(mi_params);
+
+  mi_params->mi_alloc = aom_calloc(alloc_entries, sizeof(*mi_params->mi_alloc));
+  if (mi_params->mi_alloc == NULL) return 1;
+  mi_params->mi_alloc_size = alloc_entries;
+
+  mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc(
+      grid_entries, sizeof(*mi_params->mi_grid_base));
+  if (mi_params->mi_grid_base == NULL) return 1;
+
+  mi_params->tx_type_map =
+      aom_calloc(grid_entries, sizeof(*mi_params->tx_type_map));
+  if (mi_params->tx_type_map == NULL) return 1;
+
+  // The grid size is recorded only after both grid-sized arrays exist.
+  mi_params->mi_grid_size = grid_entries;
+  return 0;
+}
+
+// Configures the mode-info parameters for a 'width' x 'height' frame via the
+// 'set_mb_mi' callback and makes sure the mode-info arrays are allocated.
+// Returns 0 on success. On allocation failure the parameters are reset,
+// the context buffers are freed and 1 is returned.
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height,
+                              BLOCK_SIZE min_partition_size) {
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  mi_params->set_mb_mi(mi_params, width, height, min_partition_size);
+  if (!alloc_mi(mi_params)) return 0;
+
+  // clear the mi_* values to force a realloc on resync
+  mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4);
+  av1_free_context_buffers(cm);
+  return 1;
+}
+
+// Tears down 'cm': frees the context buffers, then the current and the
+// default frame contexts, leaving both pointers NULL.
+void av1_remove_common(AV1_COMMON *cm) {
+  av1_free_context_buffers(cm);
+
+  aom_free(cm->fc);
+  aom_free(cm->default_frame_context);
+  cm->fc = NULL;
+  cm->default_frame_context = NULL;
+}
+
+// Runs the 'setup_mi' callback stored in 'mi_params' on the struct itself.
+// NOTE(review): the callback is called without a NULL check, so it is
+// presumably always installed before this function is used -- confirm.
+void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
+ mi_params->setup_mi(mi_params);
+}
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
new file mode 100644
index 0000000000..d31b4c56b6
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
+
+#define INVALID_IDX (-1)  // Invalid buffer index.
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct BufferPool;
+struct CommonContexts;
+struct CommonModeInfoParams;
+struct AV1CdefWorker;
+struct AV1CdefSyncData;
+
+// Frees the context buffers and both frame contexts of 'cm'.
+void av1_remove_common(struct AV1Common *cm);
+
+// Above-context storage. The allocator returns 0 on success and 1 when an
+// allocation fails.
+int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts,
+ int num_tile_rows, int num_mi_cols,
+ int num_planes);
+void av1_free_above_context_buffers(struct CommonContexts *above_contexts);
+// Mode-info storage. The allocator returns 0 on success and 1 on failure.
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height,
+ BLOCK_SIZE min_partition_size);
+void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params);
+void av1_free_context_buffers(struct AV1Common *cm);
+
+void av1_free_ref_frame_buffers(struct BufferPool *pool);
+// CDEF buffers.
+void av1_alloc_cdef_buffers(struct AV1Common *const cm,
+ struct AV1CdefWorker **cdef_worker,
+ struct AV1CdefSyncData *cdef_sync, int num_workers,
+ int init_worker);
+void av1_free_cdef_buffers(struct AV1Common *const cm,
+ struct AV1CdefWorker **cdef_worker,
+ struct AV1CdefSyncData *cdef_sync);
+// Loop-restoration buffers that do not depend on the restoration unit size.
+void av1_alloc_restoration_buffers(struct AV1Common *cm, bool is_sgr_enabled);
+void av1_free_restoration_buffers(struct AV1Common *cm);
+
+int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
+void av1_free_state_buffers(struct AV1Common *cm);
+
+int av1_get_MBs(int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 0000000000..09e5166b14
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,4217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+// 1D itx types
+// IFLIPADST_1D shares the value of IADST_1D, so only three distinct values
+// precede ITX_TYPES_1D (which therefore equals 3). ITX_TYPES_1D is used as
+// the second dimension of lowbd_txfm_all_1d_arr.
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+// Maps each of the TX_TYPES 2D transform types to a 1D transform kind.
+// NOTE(review): the 'v' prefix suggests this is the vertical (column) pass;
+// the entry order must follow the TX_TYPE enum -- confirm against enums.h.
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+// Maps each of the TX_TYPES 2D transform types to a 1D transform kind.
+// NOTE(review): the 'h' prefix suggests this is the horizontal (row) pass;
+// the entry order must follow the TX_TYPE enum -- confirm against enums.h.
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+// Indexed as [transform size][ITX_TYPE_1D]; columns are IDCT_1D, IADST_1D and
+// IIDENTITY_1D in that order. The last two rows provide only the IDCT entry;
+// their other slots are NULL.
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { av1_idct4, av1_iadst4, av1_iidentity4_c },
+ { av1_idct8, av1_iadst8, av1_iidentity8_c },
+ { av1_idct16, av1_iadst16, av1_iidentity16_c },
+ { av1_idct32, NULL, NULL },
+ { av1_idct64, NULL, NULL },
+};
+
+// Adds 'height' rows of 8 residuals from 'in' to the 8-bit pixels at 'output'
+// (rows 'stride' bytes apart), saturating each sum to [0, 255]. When 'flipud'
+// is non-zero the residual rows are consumed bottom-to-top.
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  const int src_step = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  uint8_t *dst = output;
+
+  for (int row = 0; row < height; ++row) {
+    // Widen the 8 predicted pixels to 16 bits before adding the residual.
+    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(dst)));
+    const int16x8_t recon = vaddq_s16(pred, in[src_row]);
+    vst1_u8(dst, vqmovun_s16(recon));
+
+    dst += stride;
+    src_row += src_step;
+  }
+}
+
+// Returns the 16 reconstructed pixels obtained by adding 'res0' to the low
+// eight and 'res1' to the high eight pixels of 'pred', each sum saturated to
+// the unsigned 8-bit range.
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  const int16x8_t pred_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  const int16x8_t pred_hi =
+      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+
+  const int16x8_t sum_lo = vaddq_s16(pred_lo, res0);
+  const int16x8_t sum_hi = vaddq_s16(pred_hi, res1);
+
+  return vcombine_u8(vqmovun_s16(sum_lo), vqmovun_s16(sum_hi));
+}
+
+// 16-pixel-wide counterpart of the 8xn add: for every output row, in[j]
+// supplies the residuals of the left eight pixels and in[j + height] those of
+// the right eight. When 'flipud' is non-zero, j runs from height - 1 down to 0.
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  const int src_step = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  uint8_t *dst = output;
+
+  for (int row = 0; row < height; ++row) {
+    const uint8x16_t pred = vld1q_u8(dst);
+    const uint8x16_t recon =
+        lowbd_get_recon_16x16_neon(pred, in[src_row], in[src_row + height]);
+    vst1q_u8(dst, recon);
+
+    dst += stride;
+    src_row += src_step;
+  }
+}
+
+// Fills 'size' vectors starting at 'a' with 'value' (truncated to int16_t)
+// replicated across all eight lanes.
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+                                                int value) {
+  const int16x8_t fill = vdupq_n_s16((int16_t)value);
+  for (int idx = 0; idx < size; ++idx) a[idx] = fill;
+}
+
+// Butterfly on eight 16-bit lanes using lanes 0 and 1 of 'c':
+//   *t0 = round_shift(in0 * c[0] + in1 * c[1], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[1] - in1 * c[0], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits.
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 0), b_lo, c, 1);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 0), b_hi, c, 1);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 1), b_lo, c, 0);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 1), b_hi, c, 0);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Butterfly on eight 16-bit lanes using lanes 1 and 0 of 'c':
+//   *t0 = round_shift(in0 * c[1] + in1 * c[0], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[0] - in1 * c[1], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits.
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 1), b_lo, c, 0);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 1), b_hi, c, 0);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 0), b_lo, c, 1);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 0), b_hi, c, 1);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Butterfly on eight 16-bit lanes using lanes 2 and 3 of 'c':
+//   *t0 = round_shift(in0 * c[2] + in1 * c[3], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[3] - in1 * c[2], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits.
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 2), b_lo, c, 3);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 2), b_hi, c, 3);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 3), b_lo, c, 2);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 3), b_hi, c, 2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Single-input butterfly: scales 'in0' by two scalar coefficients.
+//   *t0 = round_shift(in0 * coef1, INV_COS_BIT)
+//   *t1 = round_shift(in0 * coef2, INV_COS_BIT)
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t lo = vget_low_s16(in0);
+  const int16x4_t hi = vget_high_s16(in0);
+
+  const int32x4_t p1_lo = vmull_n_s16(lo, coef1);
+  const int32x4_t p1_hi = vmull_n_s16(hi, coef1);
+  const int32x4_t p2_lo = vmull_n_s16(lo, coef2);
+  const int32x4_t p2_hi = vmull_n_s16(hi, coef2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(p2_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p2_hi, INV_COS_BIT));
+}
+
+// Butterfly on eight 16-bit lanes using lanes 3 and 2 of 'c':
+//   *t0 = round_shift(in0 * c[3] + in1 * c[2], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[2] - in1 * c[3], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits.
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 3), b_lo, c, 2);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 3), b_hi, c, 2);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 2), b_lo, c, 3);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 2), b_hi, c, 3);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// In-place half butterfly on the pair x[0], x[1] using lane 0 of 'c':
+//   x[0] = round_shift(x[0] * c[0] + x[1] * c[0], INV_COS_BIT)
+//   x[1] = round_shift(x[0] * c[0] - x[1] * c[0], INV_COS_BIT)
+// (both right-hand sides use the values x[0], x[1] held on entry).
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t a_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+  const int32x4_t a_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+  const int32x4_t b_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+  const int32x4_t b_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+  const int32x4_t sum_lo = vaddq_s32(a_lo, b_lo);
+  const int32x4_t sum_hi = vaddq_s32(a_hi, b_hi);
+  const int32x4_t diff_lo = vsubq_s32(a_lo, b_lo);
+  const int32x4_t diff_hi = vsubq_s32(a_hi, b_hi);
+
+  x[0] = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                      vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  x[1] = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                      vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Builds a 4-lane vector whose lanes 0..3 hold c0, c1, c2 and c3.
+static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
+                                       const int16_t c2, const int16_t c3) {
+  const int16_t lanes[4] = { c0, c1, c2, c3 };
+  return vld1_s16(lanes);
+}
+
+// 8-point inverse ADST applied lane-wise to eight int16x8_t vectors.
+// Reads in[0..7] and writes out[0..7]; intermediate adds, subtracts and
+// negations saturate to 16 bits.
+// NOTE(review): 'cos_bit' only selects the cosine table; the butterfly
+// helpers always shift by INV_COS_BIT, so callers presumably pass
+// cos_bit == INV_COS_BIT -- confirm.
+static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ // Coefficient quadruples consumed lane-by-lane by the butterfly helpers.
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[20], (int16_t)cospi[44]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // Stage 1
+ // Input permutation.
+ x[0] = in[7];
+ x[1] = in[0];
+ x[2] = in[5];
+ x[3] = in[2];
+ x[4] = in[3];
+ x[5] = in[4];
+ x[6] = in[1];
+ x[7] = in[6];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s4);
+ x[1] = vqaddq_s16(s1, s5);
+ x[2] = vqaddq_s16(s2, s6);
+ x[3] = vqaddq_s16(s3, s7);
+ x[4] = vqsubq_s16(s0, s4);
+ x[5] = vqsubq_s16(s1, s5);
+ x[6] = vqsubq_s16(s2, s6);
+ x[7] = vqsubq_s16(s3, s7);
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ s2 = x[2];
+ s3 = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+ // Note the swapped operand and output order for the second pair.
+ btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+ // Stage 5
+ x[0] = vqaddq_s16(s0, s2);
+ x[1] = vqaddq_s16(s1, s3);
+ x[2] = vqsubq_s16(s0, s2);
+ x[3] = vqsubq_s16(s1, s3);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+
+ // stage 6
+ // In-place half butterflies on the pairs (x[2], x[3]) and (x[6], x[7]).
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ // Output permutation; odd-indexed outputs are negated with saturation.
+ out[0] = x[0];
+ out[1] = vqnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vqnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vqnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vqnegq_s16(x[1]);
+}
+
+// Reduced 8-point inverse ADST that reads only in[0] and writes out[0..7].
+// NOTE(review): this appears to be iadst8_neon specialised for the case where
+// in[1..7] are zero, so the stage 3 and stage 5 add/sub steps collapse into
+// plain copies -- confirm that callers guarantee this.
+static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s4, s5;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+
+ // The int32_t table entries are implicitly narrowed to the int16_t
+ // coefficient parameters of btf_16_neon.
+ btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[4] = s0;
+ x[5] = s1;
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+ // Stage 5
+ x[0] = s0;
+ x[1] = s1;
+ x[2] = s0;
+ x[3] = s1;
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ // Same output permutation and sign pattern as iadst8_neon.
+ out[0] = x[0];
+ out[1] = vqnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vqnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vqnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vqnegq_s16(x[1]);
+}
+
+// 8-point inverse DCT applied lane-wise to eight int16x8_t vectors.
+// Reads in[0..7] and writes out[0..7]; all adds and subtracts saturate to
+// 16 bits.
+// NOTE(review): 'cos_bit' only selects the cosine table; the butterfly
+// helpers always shift by INV_COS_BIT -- confirm callers pass a matching
+// value.
+static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[8], step2[8];
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ // stage 2
+ // Odd-indexed inputs feed step1[4..7].
+ btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+ // stage 3
+ // Even-indexed inputs feed step2[0..3].
+ btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+ // stage 4
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);
+
+ // stage 5
+ // Final mirror: out[i] and out[7 - i] are the sum and difference of the
+ // same pair.
+ out[0] = vqaddq_s16(step1[0], step2[7]);
+ out[1] = vqaddq_s16(step1[1], step1[6]);
+ out[2] = vqaddq_s16(step1[2], step1[5]);
+ out[3] = vqaddq_s16(step1[3], step2[4]);
+ out[4] = vqsubq_s16(step1[3], step2[4]);
+ out[5] = vqsubq_s16(step1[2], step1[5]);
+ out[6] = vqsubq_s16(step1[1], step1[6]);
+ out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+// Reduced 8-point inverse DCT that reads only in[0]: every one of the eight
+// outputs is round_shift(in[0] * cospi[32], INV_COS_BIT).
+static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit) {
+  const int16_t cospi_32 = (int16_t)cospi_arr(cos_bit)[32];
+
+  // Stages 1-3: only the scaling of the single input does any work.
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cospi_32);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cospi_32);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // Stages 4-5: replicate the value to all outputs.
+  for (int i = 0; i < 8; ++i) out[i] = dc;
+}
+
+// Applies a rounding shift of 'bit' positions to each of the 'size' vectors
+// in 'arr', in place. The shift is issued as vrshlq_s16 with a count of -bit,
+// so a positive 'bit' shifts right with rounding. A zero 'bit' leaves the
+// data untouched. 'size' must be a multiple of 4 (checked by assert only).
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+  assert((size % 4) == 0);
+  if (bit == 0) return;
+
+  const int16x8_t shift = vdupq_n_s16((int16_t)(-bit));
+  for (int idx = 0; idx < size; ++idx) {
+    arr[idx] = vrshlq_s16(arr[idx], shift);
+  }
+}
+
+// Reverses the order of the first 'size' vectors of 'input' in place.
+//
+// The reversal swaps mirrored elements directly, so no scratch array is
+// needed and any 'size' is handled (a fixed 8-entry temporary would be
+// overrun for size > 8). A 'size' of 0 or 1 leaves the buffer unchanged.
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
+    const int16x8_t tmp = input[lo];
+    input[lo] = input[hi];
+    input[hi] = tmp;
+  }
+}
+
+// Loads 'out_size' rows of eight 32-bit coefficients ('stride' elements
+// apart) and narrows each to 16 bits by keeping the low half of every value
+// (no saturation), storing one int16x8_t per row in 'a'.
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+                                                   int stride,
+                                                   int16x8_t *const a,
+                                                   int out_size) {
+  const int32_t *row = input;
+  for (int r = 0; r < out_size; ++r) {
+    const int16x4_t lo = vmovn_s32(vld1q_s32(row));
+    const int16x4_t hi = vmovn_s32(vld1q_s32(row + 4));
+    a[r] = vcombine_s16(lo, hi);
+    row += stride;
+  }
+}
+
+// Identity-transform scale factors, one per transform size index, in 12-bit
+// fixed point (4096 == 1.0; 5793 is sqrt(2) * 4096 rounded). The entries are
+// sqrt(2), 2, 2*sqrt(2), 4 and 4*sqrt(2).
+static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+// Identity transform with rounding for 'size' vectors:
+//   output[i] = sat16(round_shift(round_shift(input[i] * scale, 12), bit))
+// where scale = sqrt_2_list[txw_idx]. The second shift is issued as a
+// rounding shift by -bit.
+static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
+                                            int txw_idx, int8_t size, int bit) {
+  const int32x4_t shift = vdupq_n_s32((int32_t)(-bit));
+  const int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
+
+  for (int row = 0; row < size; ++row) {
+    const int32x4_t prod_lo = vmull_s16(vget_low_s16(input[row]), scale);
+    const int32x4_t prod_hi = vmull_s16(vget_high_s16(input[row]), scale);
+
+    const int32x4_t res_lo = vrshlq_s32(vrshrq_n_s32(prod_lo, 12), shift);
+    const int32x4_t res_hi = vrshlq_s32(vrshrq_n_s32(prod_hi, 12), shift);
+
+    output[row] = vcombine_s16(vqmovn_s32(res_lo), vqmovn_s32(res_hi));
+  }
+}
+
+// Scales 'size' vectors by NewInvSqrt2 / 2^NewSqrt2Bits:
+//   output[i] = sat16(round_shift(input[i] * NewInvSqrt2, NewSqrt2Bits))
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+                                        int size) {
+  for (int idx = 0; idx < size; ++idx) {
+    const int32x4_t prod_lo =
+        vmull_n_s16(vget_low_s16(input[idx]), (int16_t)NewInvSqrt2);
+    const int32x4_t prod_hi =
+        vmull_n_s16(vget_high_s16(input[idx]), (int16_t)NewInvSqrt2);
+
+    output[idx] = vcombine_s16(vqrshrn_n_s32(prod_lo, (int32_t)NewSqrt2Bits),
+                               vqrshrn_n_s32(prod_hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// Reduced 16-point inverse DCT that reads only in[0]: every one of the
+// sixteen outputs is round_shift(in[0] * cospi[32], INV_COS_BIT).
+static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit) {
+  // vmull_n_s16 takes an int16_t scalar, so the table entry is narrowed.
+  const int16_t cospi_32 = (int16_t)cospi_arr(cos_bit)[32];
+
+  // stage 4
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cospi_32);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cospi_32);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // stages 6-7: replicate the value to all outputs.
+  for (int i = 0; i < 16; ++i) out[i] = dc;
+}
+
+// 16-point inverse DCT applied lane-wise to sixteen int16x8_t vectors.
+// Reads in[0..15] and writes out[0..15]. Intermediate values alternate
+// between the step1 and step2 arrays from stage to stage; all adds and
+// subtracts saturate to 16 bits.
+// NOTE(review): 'cos_bit' only selects the cosine table; the butterfly
+// helpers always shift by INV_COS_BIT -- confirm callers pass a matching
+// value.
+static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ // c4 holds the negated entries of c3.
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+ // stage 2
+
+ // Odd-indexed inputs feed step2[8..15].
+ btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+ btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+ btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+ // Even-indexed inputs are copied in permuted order.
+ step2[0] = in[0];
+ step2[1] = in[8];
+ step2[2] = in[4];
+ step2[3] = in[12];
+ step2[4] = in[2];
+ step2[5] = in[10];
+ step2[6] = in[6];
+ step2[7] = in[14];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ // This butterfly uses the negated coefficients in c4.
+ btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ // Final mirror: out[i] and out[15 - i] are the sum and difference of
+ // step2[i] and step2[15 - i].
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+/* idct16_low8_neon: 16-output butterfly network (an inverse DCT, going by the
+ * name) specialised so that only in[0]..in[7] are ever read.
+ *
+ *   in      - source vectors; indices 0..7 are read, nothing is written.
+ *   out     - destination; out[0]..out[15] are all written in stage 7.
+ *   cos_bit - forwarded to cospi_arr() to obtain the constant table `cospi`.
+ *
+ * step1[] and step2[] alternate as source and destination from one stage to
+ * the next. Every add/subtract is the saturating vqaddq_s16/vqsubq_s16 form.
+ * NOTE(review): presumably callers guarantee in[8]..in[15] are zero, which
+ * would explain why those terms are absent - confirm at the call sites.
+ */
+static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c1 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));  // c1 = negated entries of c0
+
+ // stage 1 (no arithmetic is emitted for this stage)
+ // stage 2: even-index inputs are copied, odd-index inputs go through
+ // the single-input btf_16_neon helper
+
+ step2[0] = in[0];
+ step2[2] = in[4];
+ step2[4] = in[2];
+ step2[6] = in[6];
+
+ btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+ // stage 3: step2[4]/step2[6] each fan out to a pair; step2[8..15] are
+ // combined in neighbouring pairs
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4: step1[0]/step1[2] each fan out to a pair; (14,9) use c0 and
+ // (10,13) use the negated constants in c1
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5: indices 4, 7 pass through; 5/6 go through a c0 lane butterfly
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6: indices 8, 9, 14, 15 pass through; 10..13 use c0 lanes 0/1
+ btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: for i = 0..7, out[i] = step2[i] + step2[15 - i] and
+ // out[15 - i] = step2[i] - step2[15 - i] (both saturating)
+
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+/* iadst16_neon: nine-stage 16-input / 16-output butterfly network (an inverse
+ * ADST, going by the name) built from NEON saturating add/sub and the
+ * lane-indexed btf_16_lane_*_neon helpers defined elsewhere in this file.
+ *
+ *   in      - source vectors; in[0]..in[15] are all read in stage 1.
+ *   out     - destination; out[0]..out[15] are all written in stage 9.
+ *   cos_bit - forwarded to cospi_arr() to obtain the constant table `cospi`.
+ *
+ * c0..c5 each pack four cospi[] entries, narrowed to int16_t, into one
+ * int16x4_t that is handed to the lane helpers.
+ * NOTE(review): the _0_1 / _2_3 / _1_0 / _3_2 suffixes presumably name which
+ * lanes of that vector are used and in which order - confirm in the helpers.
+ */
+static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+ (int16_t)cospi[10], (int16_t)cospi[54]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+ (int16_t)cospi[26], (int16_t)cospi[38]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
+ (int16_t)cospi[42], (int16_t)cospi[22]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
+ (int16_t)cospi[58], (int16_t)cospi[6]);
+ const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ int16x8_t x[16];
+ int16x8_t t[14];  // only t[0]..t[11] are written or read below
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1: gather inputs as x[2k] = in[15 - 2k], x[2k + 1] = in[2k]
+ x[0] = in[15];
+ x[1] = in[0];
+ x[2] = in[13];
+ x[3] = in[2];
+ x[4] = in[11];
+ x[5] = in[4];
+ x[6] = in[9];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[9] = in[8];
+ x[10] = in[5];
+ x[11] = in[10];
+ x[12] = in[3];
+ x[13] = in[12];
+ x[14] = in[1];
+ x[15] = in[14];
+
+ // Stage 2: each adjacent pair (x[2k], x[2k + 1]) goes through one lane
+ // butterfly, using constants c0..c3
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+ btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+ btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+ btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+ // Stage 3: x[i] = s(i) + s(i + 8), x[i + 8] = s(i) - s(i + 8), i = 0..7
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4: x[0..7] are carried unchanged in t[0..7]; x[8..15] go through
+ // lane butterflies with c4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5: saturating add/sub between elements four apart, separately
+ // within t[0..7] and within s8..s15
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // stage 6: x[0..3] and x[8..11] are carried in t[]; x[4..7] and x[12..15]
+ // go through lane butterflies with c5
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);
+
+ // Stage 7: saturating add/sub between elements two apart in each group
+ // of four
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8: in-place helper applied at x + 2, x + 6, x + 10 and x + 14
+ // NOTE(review): presumably each call rewrites the pair starting at the
+ // given pointer - confirm in btf_16_half_neon
+ btf_16_half_neon(x + 2, c5);
+ btf_16_half_neon(x + 6, c5);
+ btf_16_half_neon(x + 10, c5);
+ btf_16_half_neon(x + 14, c5);
+
+ // Stage 9: permute into out[]; even-numbered outputs are plain copies,
+ // odd-numbered outputs are saturating negations (vqnegq_s16)
+ out[0] = x[0];
+ out[1] = vqnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vqnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vqnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vqnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vqnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vqnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vqnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vqnegq_s16(x[1]);
+}
+
+// Single-input form of the 16-output inverse-ADST flow: in[0] is the only
+// element of `in` that is read. With one live input the add/sub stages
+// (3, 5 and 7) reduce to duplicating values, so only the rotations of
+// stages 2, 4, 6 and 8 perform arithmetic.
+//   in      - source vectors; only in[0] is used.
+//   out     - receives out[0]..out[15]; odd-numbered entries are negated
+//             with the saturating vqnegq_s16.
+//   cos_bit - forwarded to cospi_arr() to obtain the constant table.
+static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
+                                     int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16x4_t k4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  const int16x4_t k6 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+
+  int16x8_t x[16];
+  int16x8_t a0, a1;  // stage-2 pair derived from in[0]
+  int16x8_t b0, b1;  // stage-4 butterfly of (a0, a1)
+  int16x8_t p0, p1;  // stage-6 butterfly of (a0, a1)
+  int16x8_t q0, q1;  // stage-6 butterfly of (b0, b1)
+
+  // Stages 1-2: the lone input fans out into a pair.
+  btf_16_neon(in[0], cospi[62], -cospi[2], &a0, &a1);
+
+  // Stages 3-4: the pair is duplicated; the copy is rotated with k4.
+  btf_16_lane_0_1_neon(a0, a1, k4, &b0, &b1);
+
+  // Stages 5-6: both pairs are duplicated; the copies are rotated with k6.
+  btf_16_lane_2_3_neon(a0, a1, k6, &p0, &p1);
+  btf_16_lane_2_3_neon(b0, b1, k6, &q0, &q1);
+
+  // Stage 7: every pair is duplicated once more to fill all sixteen slots.
+  x[0] = a0;
+  x[2] = a0;
+  x[1] = a1;
+  x[3] = a1;
+  x[4] = p0;
+  x[6] = p0;
+  x[5] = p1;
+  x[7] = p1;
+  x[8] = b0;
+  x[10] = b0;
+  x[9] = b1;
+  x[11] = b1;
+  x[12] = q0;
+  x[14] = q0;
+  x[13] = q1;
+  x[15] = q1;
+
+  // Stage 8: in-place helper on the second pair of each group of four.
+  btf_16_half_neon(x + 2, k6);
+  btf_16_half_neon(x + 6, k6);
+  btf_16_half_neon(x + 10, k6);
+  btf_16_half_neon(x + 14, k6);
+
+  // Stage 9, even outputs: plain copies.
+  out[0] = x[0];
+  out[2] = x[12];
+  out[4] = x[6];
+  out[6] = x[10];
+  out[8] = x[3];
+  out[10] = x[15];
+  out[12] = x[5];
+  out[14] = x[9];
+
+  // Stage 9, odd outputs: saturating negations.
+  out[1] = vqnegq_s16(x[8]);
+  out[3] = vqnegq_s16(x[4]);
+  out[5] = vqnegq_s16(x[14]);
+  out[7] = vqnegq_s16(x[2]);
+  out[9] = vqnegq_s16(x[11]);
+  out[11] = vqnegq_s16(x[7]);
+  out[13] = vqnegq_s16(x[13]);
+  out[15] = vqnegq_s16(x[1]);
+}
+/* iadst16_low8_neon: 16-output butterfly network (an inverse ADST, going by
+ * the name) specialised so that only in[0]..in[7] are ever read.
+ *
+ *   in      - source vectors; indices 0..7 are read in stage 1.
+ *   out     - destination; out[0]..out[15] are all written in stage 9.
+ *   cos_bit - forwarded to cospi_arr() to obtain the constant table `cospi`.
+ *
+ * Stage 2 uses the single-input btf_16_neon helper on each live input;
+ * stages 3..9 then work on all sixteen values with saturating add/sub and
+ * the lane helpers.
+ * NOTE(review): presumably callers guarantee in[8]..in[15] are zero, which
+ * would explain the single-input stage 2 - confirm at the call sites.
+ */
+static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+
+ int16x8_t x[16];
+ int16x8_t t[14];  // only t[0]..t[11] are written or read below
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1: only eight x[] slots are filled; the remaining slots are not
+ // read until stage 3 has overwritten all sixteen
+ x[1] = in[0];
+ x[3] = in[2];
+ x[5] = in[4];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[10] = in[5];
+ x[12] = in[3];
+ x[14] = in[1];
+
+ // Stage 2: each live input fans out to a pair via btf_16_neon; the first
+ // four calls pass a negated second constant
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+ btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+ btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+ btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+ btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+ btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+ btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+ btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+ // Stage 3: x[i] = s(i) + s(i + 8), x[i + 8] = s(i) - s(i + 8), i = 0..7
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4: x[0..7] are carried unchanged in t[0..7]; x[8..15] go through
+ // lane butterflies with c0
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);
+
+ // Stage 5: saturating add/sub between elements four apart, separately
+ // within t[0..7] and within s8..s15
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // stage 6: x[0..3] and x[8..11] are carried in t[]; x[4..7] and x[12..15]
+ // go through lane butterflies with c1
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);
+
+ // Stage 7: saturating add/sub between elements two apart in each group
+ // of four
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8: in-place helper applied at x + 2, x + 6, x + 10 and x + 14
+ // NOTE(review): presumably each call rewrites the pair starting at the
+ // given pointer - confirm in btf_16_half_neon
+ btf_16_half_neon(x + 2, c1);
+ btf_16_half_neon(x + 6, c1);
+ btf_16_half_neon(x + 10, c1);
+ btf_16_half_neon(x + 14, c1);
+
+ // Stage 9: permute into out[]; even-numbered outputs are plain copies,
+ // odd-numbered outputs are saturating negations (vqnegq_s16)
+ out[0] = x[0];
+ out[1] = vqnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vqnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vqnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vqnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vqnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vqnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vqnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vqnegq_s16(x[1]);
+}
+/* idct32_neon: 32-input / 32-output butterfly network (an inverse DCT, going
+ * by the name) built from NEON saturating add/sub and the lane-indexed
+ * btf_16_lane_*_neon helpers defined elsewhere in this file.
+ *
+ *   in      - source vectors; in[0]..in[31] are all read in stage 2.
+ *   out     - destination; out[0]..out[31] are all written in stage 9.
+ *   cos_bit - forwarded to cospi_arr() to obtain the constant table `cospi`.
+ *
+ * step1[] and step2[] alternate as source and destination from one stage to
+ * the next. c0..c7 pack cospi[] entries narrowed to int16_t; c8 and c9 hold
+ * the negated entries of c6 and c7 respectively.
+ */
+static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
+ (int16_t)cospi[34], (int16_t)cospi[30]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
+ (int16_t)cospi[50], (int16_t)cospi[14]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
+ (int16_t)cospi[42], (int16_t)cospi[22]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
+ (int16_t)cospi[58], (int16_t)cospi[6]);
+ const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c8 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));  // negated c6
+ const int16x4_t c9 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));  // negated c7
+
+ // stage 2: odd-index inputs are paired through lane butterflies into
+ // step2[16..31]; even-index inputs are copied into step2[0..15]
+
+ btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+ btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+ btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+ btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+ btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+ btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+ step2[0] = in[0];
+ step2[1] = in[16];
+ step2[2] = in[8];
+ step2[3] = in[24];
+ step2[4] = in[4];
+ step2[5] = in[20];
+ step2[6] = in[12];
+ step2[7] = in[28];
+ step2[8] = in[2];
+ step2[9] = in[18];
+ step2[10] = in[10];
+ step2[11] = in[26];
+ step2[12] = in[6];
+ step2[13] = in[22];
+ step2[14] = in[14];
+ step2[15] = in[30];
+
+ // stage 3: 8..15 go through lane butterflies (c4, c5); 0..7 pass through;
+ // 16..31 are combined in neighbouring pairs
+
+ btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+ btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+ btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+ btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[4] = step2[4];
+ step1[5] = step2[5];
+ step1[6] = step2[6];
+ step1[7] = step2[7];
+
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4: lane butterflies on 4..7 and on the (30,17), (18,29), (26,21),
+ // (22,25) pairs; c8 supplies the negated constants
+
+ btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+ btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5: lane butterflies on 0..3 and on the (14,9), (10,13) pairs;
+ // c9 supplies the negated constants
+
+ btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+ btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6: lane butterflies on (6,5), (29,18), (28,19), (20,27), (21,26)
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[1], step1[2]);
+ step2[2] = vqsubq_s16(step1[1], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7: lane butterflies on (13,10) and (12,11); 0..7 and 16..31 are
+ // combined by mirrored add/sub
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8: lane butterflies on (27,20), (26,21), (25,22), (24,23);
+ // 0..15 are combined by mirrored add/sub
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9: for i = 0..15, out[i] = step2[i] + step2[31 - i] and
+ // out[31 - i] = step2[i] - step2[31 - i] (both saturating)
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+// Single-input form of the 32-output inverse-DCT flow: in[0] is the only
+// element of `in` that is read. It is widened, multiplied by cospi[32],
+// narrowed back with a rounding shift of INV_COS_BIT, and the resulting
+// vector is stored to every one of out[0]..out[31].
+//   in      - source vectors; only in[0] is used.
+//   out     - receives 32 copies of the scaled vector.
+//   cos_bit - forwarded to cospi_arr() to obtain the constant table.
+static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int32x4_t wide_lo, wide_hi;
+  int16x8_t scaled;
+  int i;
+
+  // Stages 1-5 collapse to one multiply per half of the input vector.
+  wide_lo = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+  wide_hi = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+  scaled = vcombine_s16(vrshrn_n_s32(wide_lo, INV_COS_BIT),
+                        vrshrn_n_s32(wide_hi, INV_COS_BIT));
+
+  // Stages 6-9 collapse to broadcasting that value to all outputs.
+  for (i = 0; i < 32; ++i) {
+    out[i] = scaled;
+  }
+}
+
+// 32-output butterfly network (an inverse DCT, going by the name)
+// specialised so that only in[0]..in[7] are ever read.
+//   in      - source vectors; indices 0..7 are read in stage 2.
+//   out     - destination; out[0]..out[31] are all written in stage 9.
+//   cos_bit - forwarded to cospi_arr() to obtain the constant table `cospi`.
+// step1[] and step2[] alternate as source and destination between stages.
+// All add/subtract operations are the saturating vqaddq_s16/vqsubq_s16 forms.
+// c2 and c3 hold the negated entries of c0 and c1 respectively.
+// NOTE(review): presumably callers guarantee in[8]..in[31] are zero, which
+// would explain why several stages are plain copies - confirm at call sites.
+static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
+                                    int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[32], step2[32];
+  // Scratch for the widened low/high halves of the stage-5 multiply.
+  int32x4_t t32[2];
+  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+                                      (int16_t)cospi[40], (int16_t)cospi[24]);
+  // Every entry is narrowed explicitly, matching c0, c2 and c3.
+  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                                      (int16_t)cospi[16], (int16_t)cospi[48]);
+  const int16x4_t c2 =
+      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+  const int16x4_t c3 =
+      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+  // stage 1
+  // stage 2: even-index inputs are copied, odd-index inputs fan out to a
+  // pair through the single-input btf_16_neon helper
+
+  step2[0] = in[0];
+  step2[4] = in[4];
+  step2[8] = in[2];
+  step2[12] = in[6];
+
+  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+  // stage 3: the 16..31 range is filled by duplicating each stage-2 value
+  step1[0] = step2[0];
+  step1[4] = step2[4];
+
+  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[16];
+  step1[18] = step2[19];
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  step1[21] = step2[20];
+  step1[22] = step2[23];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[24];
+  step1[26] = step2[27];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+  step1[29] = step2[28];
+  step1[30] = step2[31];
+  step1[31] = step2[31];
+
+  // stage 4: the 8..15 range is filled by duplicating each stage-3 value
+
+  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
+  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
+
+  step2[0] = step1[0];
+  step2[8] = step1[8];
+  step2[9] = step1[8];
+  step2[10] = step1[11];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[13] = step1[12];
+  step2[14] = step1[15];
+  step2[15] = step1[15];
+  step2[16] = step1[16];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[31] = step1[31];
+
+  // stage 5: element 0 is scaled by cospi[32] with a rounding narrow of
+  // INV_COS_BIT; the constant is narrowed explicitly because vmull_n_s16
+  // takes an int16_t scalar
+
+  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), (int16_t)cospi[32]);
+  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), (int16_t)cospi[32]);
+  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+                          vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
+
+  step1[4] = step2[4];
+  step1[5] = step2[4];
+  step1[6] = step2[7];
+  step1[7] = step2[7];
+  step1[8] = step2[8];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[19]);
+  step1[17] = vqaddq_s16(step2[17], step2[18]);
+  step1[18] = vqsubq_s16(step2[17], step2[18]);
+  step1[19] = vqsubq_s16(step2[16], step2[19]);
+  step1[20] = vqsubq_s16(step2[23], step2[20]);
+  step1[21] = vqsubq_s16(step2[22], step2[21]);
+  step1[22] = vqaddq_s16(step2[22], step2[21]);
+  step1[23] = vqaddq_s16(step2[23], step2[20]);
+  step1[24] = vqaddq_s16(step2[24], step2[27]);
+  step1[25] = vqaddq_s16(step2[25], step2[26]);
+  step1[26] = vqsubq_s16(step2[25], step2[26]);
+  step1[27] = vqsubq_s16(step2[24], step2[27]);
+  step1[28] = vqsubq_s16(step2[31], step2[28]);
+  step1[29] = vqsubq_s16(step2[30], step2[29]);
+  step1[30] = vqaddq_s16(step2[30], step2[29]);
+  step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+  // stage 6: step1[0] is duplicated into step2[0..3]
+
+  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
+
+  step2[0] = step1[0];
+  step2[1] = step1[0];
+  step2[2] = step1[0];
+  step2[3] = step1[0];
+  step2[4] = step1[4];
+  step2[7] = step1[7];
+  step2[8] = vqaddq_s16(step1[8], step1[11]);
+  step2[9] = vqaddq_s16(step1[9], step1[10]);
+  step2[10] = vqsubq_s16(step1[9], step1[10]);
+  step2[11] = vqsubq_s16(step1[8], step1[11]);
+  step2[12] = vqsubq_s16(step1[15], step1[12]);
+  step2[13] = vqsubq_s16(step1[14], step1[13]);
+  step2[14] = vqaddq_s16(step1[14], step1[13]);
+  step2[15] = vqaddq_s16(step1[15], step1[12]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[22] = step1[22];
+  step2[23] = step1[23];
+  step2[24] = step1[24];
+  step2[25] = step1[25];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 7
+
+  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[7]);
+  step1[1] = vqaddq_s16(step2[1], step2[6]);
+  step1[2] = vqaddq_s16(step2[2], step2[5]);
+  step1[3] = vqaddq_s16(step2[3], step2[4]);
+  step1[4] = vqsubq_s16(step2[3], step2[4]);
+  step1[5] = vqsubq_s16(step2[2], step2[5]);
+  step1[6] = vqsubq_s16(step2[1], step2[6]);
+  step1[7] = vqsubq_s16(step2[0], step2[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[9];
+  step1[14] = step2[14];
+  step1[15] = step2[15];
+  step1[16] = vqaddq_s16(step2[16], step2[23]);
+  step1[17] = vqaddq_s16(step2[17], step2[22]);
+  step1[18] = vqaddq_s16(step2[18], step2[21]);
+  step1[19] = vqaddq_s16(step2[19], step2[20]);
+  step1[20] = vqsubq_s16(step2[19], step2[20]);
+  step1[21] = vqsubq_s16(step2[18], step2[21]);
+  step1[22] = vqsubq_s16(step2[17], step2[22]);
+  step1[23] = vqsubq_s16(step2[16], step2[23]);
+  step1[24] = vqsubq_s16(step2[31], step2[24]);
+  step1[25] = vqsubq_s16(step2[30], step2[25]);
+  step1[26] = vqsubq_s16(step2[29], step2[26]);
+  step1[27] = vqsubq_s16(step2[28], step2[27]);
+  step1[28] = vqaddq_s16(step2[27], step2[28]);
+  step1[29] = vqaddq_s16(step2[26], step2[29]);
+  step1[30] = vqaddq_s16(step2[25], step2[30]);
+  step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+  // stage 8
+
+  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[15]);
+  step2[1] = vqaddq_s16(step1[1], step1[14]);
+  step2[2] = vqaddq_s16(step1[2], step1[13]);
+  step2[3] = vqaddq_s16(step1[3], step1[12]);
+  step2[4] = vqaddq_s16(step1[4], step1[11]);
+  step2[5] = vqaddq_s16(step1[5], step1[10]);
+  step2[6] = vqaddq_s16(step1[6], step1[9]);
+  step2[7] = vqaddq_s16(step1[7], step1[8]);
+  step2[8] = vqsubq_s16(step1[7], step1[8]);
+  step2[9] = vqsubq_s16(step1[6], step1[9]);
+  step2[10] = vqsubq_s16(step1[5], step1[10]);
+  step2[11] = vqsubq_s16(step1[4], step1[11]);
+  step2[12] = vqsubq_s16(step1[3], step1[12]);
+  step2[13] = vqsubq_s16(step1[2], step1[13]);
+  step2[14] = vqsubq_s16(step1[1], step1[14]);
+  step2[15] = vqsubq_s16(step1[0], step1[15]);
+  step2[16] = step1[16];
+  step2[17] = step1[17];
+  step2[18] = step1[18];
+  step2[19] = step1[19];
+  step2[28] = step1[28];
+  step2[29] = step1[29];
+  step2[30] = step1[30];
+  step2[31] = step1[31];
+
+  // stage 9: for i = 0..15, out[i] = step2[i] + step2[31 - i] and
+  // out[31 - i] = step2[i] - step2[31 - i] (both saturating)
+
+  out[0] = vqaddq_s16(step2[0], step2[31]);
+  out[1] = vqaddq_s16(step2[1], step2[30]);
+  out[2] = vqaddq_s16(step2[2], step2[29]);
+  out[3] = vqaddq_s16(step2[3], step2[28]);
+  out[4] = vqaddq_s16(step2[4], step2[27]);
+  out[5] = vqaddq_s16(step2[5], step2[26]);
+  out[6] = vqaddq_s16(step2[6], step2[25]);
+  out[7] = vqaddq_s16(step2[7], step2[24]);
+  out[8] = vqaddq_s16(step2[8], step2[23]);
+  out[9] = vqaddq_s16(step2[9], step2[22]);
+  out[10] = vqaddq_s16(step2[10], step2[21]);
+  out[11] = vqaddq_s16(step2[11], step2[20]);
+  out[12] = vqaddq_s16(step2[12], step2[19]);
+  out[13] = vqaddq_s16(step2[13], step2[18]);
+  out[14] = vqaddq_s16(step2[14], step2[17]);
+  out[15] = vqaddq_s16(step2[15], step2[16]);
+  out[16] = vqsubq_s16(step2[15], step2[16]);
+  out[17] = vqsubq_s16(step2[14], step2[17]);
+  out[18] = vqsubq_s16(step2[13], step2[18]);
+  out[19] = vqsubq_s16(step2[12], step2[19]);
+  out[20] = vqsubq_s16(step2[11], step2[20]);
+  out[21] = vqsubq_s16(step2[10], step2[21]);
+  out[22] = vqsubq_s16(step2[9], step2[22]);
+  out[23] = vqsubq_s16(step2[8], step2[23]);
+  out[24] = vqsubq_s16(step2[7], step2[24]);
+  out[25] = vqsubq_s16(step2[6], step2[25]);
+  out[26] = vqsubq_s16(step2[5], step2[26]);
+  out[27] = vqsubq_s16(step2[4], step2[27]);
+  out[28] = vqsubq_s16(step2[3], step2[28]);
+  out[29] = vqsubq_s16(step2[2], step2[29]);
+  out[30] = vqsubq_s16(step2[1], step2[30]);
+  out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+// Inverse 32-point DCT butterfly network on int16x8_t vectors, specialised for
+// the case where only the first 16 input vectors are used: the function reads
+// in[0]..in[15] and writes out[0]..out[31].
+//
+// in:      input vectors; only indices 0..15 are read.
+// out:     receives all 32 result vectors.
+// cos_bit: selects the cosine table via cospi_arr(). Note that the explicit
+//          rounding shift in stage 5 uses INV_COS_BIT, not cos_bit.
+//
+// step1 and step2 act as ping-pong buffers: every stage reads one of them and
+// writes the other. All additions and subtractions use the saturating
+// vqaddq_s16 / vqsubq_s16 intrinsics.
+// NOTE(review): indices that are never written in the early stages presumably
+// correspond to terms that vanish because in[16..31] are not used - confirm
+// against the full-length idct32 implementation.
+static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ // NOTE(review): only t32[0] and t32[1] are used in this function.
+ int32x4_t t32[16];
+ // Lane constants handed to the btf_16_lane_*_neon helpers. c2 and c3 hold the
+ // negated values of c0 and c1 respectively.
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c2 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c3 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+ // Stages 1 and 2 are fused: the odd-indexed inputs go straight through
+ // btf_16_neon into step2[16..31]; the even-indexed inputs are copied into the
+ // even slots of step2[0..14].
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+ btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+ btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ step2[0] = in[0];
+ step2[2] = in[8];
+ step2[4] = in[4];
+ step2[6] = in[12];
+ step2[8] = in[2];
+ step2[10] = in[10];
+ step2[12] = in[6];
+ step2[14] = in[14];
+
+ // stage 3
+ // Expands step2[8,10,12,14] into step1[8..15] via btf_16_neon, carries the
+ // even entries 0..6 over unchanged, and combines adjacent pairs of
+ // step2[16..31] with saturating add/sub.
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+ btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+ // Expands step1[4] and step1[6] into step2[4..7], applies the lane helpers
+ // (constants c0 / c2) to the pairs (30,17), (18,29), (26,21), (22,25), and
+ // combines adjacent pairs of step1[8..15].
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+ // step1[0] is step2[0] scaled by cospi[32]: a widening 16x16->32 multiply on
+ // each half, followed by a rounding right shift by INV_COS_BIT that narrows
+ // back to 16 bits.
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+ // step1[1] is never written in this function; step1[0] is used where it
+ // would appear (step2[1] and step2[2] below).
+ // NOTE(review): presumably both outputs of the cospi[32] butterfly coincide
+ // when in[16] is not used - confirm against the full-length idct32.
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[0], step1[2]);
+ step2[2] = vqsubq_s16(step1[0], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+ // Lane-0/1 helper on the pairs (13,10) and (12,11); mirrored add/sub across
+ // indices 0..7, 16..23 and 24..31; entries 8, 9, 14, 15 pass through.
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+ // Lane-0/1 helper on the pairs (27,20), (26,21), (25,22), (24,23); mirrored
+ // add/sub across indices 0..15; entries 16..19 and 28..31 pass through.
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+ // Final mirrored add/sub across the whole 0..31 range:
+ // out[i] = step2[i] + step2[31 - i] and out[31 - i] = step2[i] - step2[31 - i]
+ // for i = 0..15 (saturating).
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+// One stage of the 64-point inverse DCT: consumes step2[0..63] and fills
+// step1[0..63].
+//
+// step2:   input vectors for this stage (read only).
+// step1:   output vectors for this stage (every index 0..63 is written).
+// cos_bit: selects the cosine table via cospi_arr().
+//
+// NOTE(review): the loops below assume step2 and step1 are distinct buffers,
+// as is the case for any stage that reads one array and writes another -
+// confirm at the call sites.
+static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
+ int8_t cos_bit) {
+ const int32_t *cos_tbl = cospi_arr(cos_bit);
+ const int16x4_t rot =
+ set_s16x4_neon((int16_t)cos_tbl[32], (int16_t)cos_tbl[32],
+ (int16_t)cos_tbl[16], (int16_t)cos_tbl[48]);
+
+ // Lane-0/1 helper on the pairs (27,20), (26,21), (25,22), (24,23).
+ for (int i = 0; i < 4; ++i) {
+   const int hi = 27 - i;
+   const int lo = 20 + i;
+   btf_16_lane_0_1_neon(step2[hi], step2[lo], rot, &step1[hi], &step1[lo]);
+ }
+
+ // Mirrored saturating add/sub over [0,15], [32,47] and [48,63].
+ for (int i = 0; i < 8; ++i) {
+   step1[i] = vqaddq_s16(step2[i], step2[15 - i]);
+   step1[15 - i] = vqsubq_s16(step2[i], step2[15 - i]);
+
+   step1[32 + i] = vqaddq_s16(step2[32 + i], step2[47 - i]);
+   step1[47 - i] = vqsubq_s16(step2[32 + i], step2[47 - i]);
+
+   // In the top quarter the upper index is the minuend.
+   step1[48 + i] = vqsubq_s16(step2[63 - i], step2[48 + i]);
+   step1[63 - i] = vqaddq_s16(step2[63 - i], step2[48 + i]);
+ }
+
+ // Entries 16..19 and 28..31 pass through unchanged.
+ for (int i = 0; i < 4; ++i) {
+   step1[16 + i] = step2[16 + i];
+   step1[28 + i] = step2[28 + i];
+ }
+}
+
+// One stage of the 64-point inverse DCT: consumes step1[0..63] and fills
+// step2[0..63].
+//
+// step1:   input vectors for this stage (read only).
+// step2:   output vectors for this stage (every index 0..63 is written).
+// cos_bit: selects the cosine table via cospi_arr().
+//
+// NOTE(review): the loops below assume step1 and step2 are distinct buffers -
+// confirm at the call sites.
+static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
+ int8_t cos_bit) {
+ const int32_t *cos_tbl = cospi_arr(cos_bit);
+ const int16x4_t rot =
+ set_s16x4_neon((int16_t)cos_tbl[32], (int16_t)cos_tbl[32],
+ (int16_t)cos_tbl[16], (int16_t)cos_tbl[48]);
+
+ // Lane-0/1 helper on the pairs (55,40), (54,41), ..., (48,47).
+ for (int i = 0; i < 8; ++i) {
+   const int hi = 55 - i;
+   const int lo = 40 + i;
+   btf_16_lane_0_1_neon(step1[hi], step1[lo], rot, &step2[hi], &step2[lo]);
+ }
+
+ // Mirrored saturating add/sub over [0,31].
+ for (int i = 0; i < 16; ++i) {
+   const int mirror = 31 - i;
+   step2[i] = vqaddq_s16(step1[i], step1[mirror]);
+   step2[mirror] = vqsubq_s16(step1[i], step1[mirror]);
+ }
+
+ // Entries 32..39 and 56..63 pass through unchanged.
+ for (int i = 0; i < 8; ++i) {
+   step2[32 + i] = step1[32 + i];
+   step2[56 + i] = step1[56 + i];
+ }
+}
+
+static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+ (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c7 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[2] = in[16];
+ step2[4] = in[8];
+ step2[6] = in[24];
+ step2[8] = in[4];
+ step2[10] = in[20];
+ step2[12] = in[12];
+ step2[14] = in[28];
+ step2[16] = in[2];
+ step2[18] = in[18];
+ step2[20] = in[10];
+ step2[22] = in[26];
+ step2[24] = in[6];
+ step2[26] = in[22];
+ step2[28] = in[14];
+ step2[30] = in[30];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
+ btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
+ btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
+ btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
+ btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
+ btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
+ btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
+ btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
+ btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
+ btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
+ btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[8] = step2[8];
+ step1[10] = step2[10];
+ step1[12] = step2[12];
+ step1[14] = step2[14];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
+ btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
+ btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
+ btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
+ btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
+ btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+
+ step1[32] = vqaddq_s16(step2[32], step2[33]);
+ step1[33] = vqsubq_s16(step2[32], step2[33]);
+ step1[34] = vqsubq_s16(step2[35], step2[34]);
+ step1[35] = vqaddq_s16(step2[35], step2[34]);
+ step1[36] = vqaddq_s16(step2[36], step2[37]);
+ step1[37] = vqsubq_s16(step2[36], step2[37]);
+ step1[38] = vqsubq_s16(step2[39], step2[38]);
+ step1[39] = vqaddq_s16(step2[39], step2[38]);
+ step1[40] = vqaddq_s16(step2[40], step2[41]);
+ step1[41] = vqsubq_s16(step2[40], step2[41]);
+ step1[42] = vqsubq_s16(step2[43], step2[42]);
+ step1[43] = vqaddq_s16(step2[43], step2[42]);
+ step1[44] = vqaddq_s16(step2[44], step2[45]);
+ step1[45] = vqsubq_s16(step2[44], step2[45]);
+ step1[46] = vqsubq_s16(step2[47], step2[46]);
+ step1[47] = vqaddq_s16(step2[47], step2[46]);
+ step1[48] = vqaddq_s16(step2[48], step2[49]);
+ step1[49] = vqsubq_s16(step2[48], step2[49]);
+ step1[50] = vqsubq_s16(step2[51], step2[50]);
+ step1[51] = vqaddq_s16(step2[51], step2[50]);
+ step1[52] = vqaddq_s16(step2[52], step2[53]);
+ step1[53] = vqsubq_s16(step2[52], step2[53]);
+ step1[54] = vqsubq_s16(step2[55], step2[54]);
+ step1[55] = vqaddq_s16(step2[55], step2[54]);
+ step1[56] = vqaddq_s16(step2[56], step2[57]);
+ step1[57] = vqsubq_s16(step2[56], step2[57]);
+ step1[58] = vqsubq_s16(step2[59], step2[58]);
+ step1[59] = vqaddq_s16(step2[59], step2[58]);
+ step1[60] = vqaddq_s16(step2[60], step2[61]);
+ step1[61] = vqsubq_s16(step2[60], step2[61]);
+ step1[62] = vqsubq_s16(step2[63], step2[62]);
+ step1[63] = vqaddq_s16(step2[63], step2[62]);
+
+ // stage 4
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
+ btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
+ btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
+
+ step2[16] = vqaddq_s16(step1[16], step1[17]);
+ step2[17] = vqsubq_s16(step1[16], step1[17]);
+ step2[18] = vqsubq_s16(step1[19], step1[18]);
+ step2[19] = vqaddq_s16(step1[19], step1[18]);
+ step2[20] = vqaddq_s16(step1[20], step1[21]);
+ step2[21] = vqsubq_s16(step1[20], step1[21]);
+ step2[22] = vqsubq_s16(step1[23], step1[22]);
+ step2[23] = vqaddq_s16(step1[23], step1[22]);
+ step2[24] = vqaddq_s16(step1[24], step1[25]);
+ step2[25] = vqsubq_s16(step1[24], step1[25]);
+ step2[26] = vqsubq_s16(step1[27], step1[26]);
+ step2[27] = vqaddq_s16(step1[27], step1[26]);
+ step2[28] = vqaddq_s16(step1[28], step1[29]);
+ step2[29] = vqsubq_s16(step1[28], step1[29]);
+ step2[30] = vqsubq_s16(step1[31], step1[30]);
+ step2[31] = vqaddq_s16(step1[31], step1[30]);
+ step2[32] = step1[32];
+ step2[35] = step1[35];
+ step2[36] = step1[36];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[43] = step1[43];
+ step2[44] = step1[44];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[51] = step1[51];
+ step2[52] = step1[52];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[59] = step1[59];
+ step2[60] = step1[60];
+ step2[63] = step1[63];
+
+ // stage 5
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
+ btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
+
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+ step1[16] = step2[16];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[35]);
+ step1[33] = vqaddq_s16(step2[33], step2[34]);
+ step1[34] = vqsubq_s16(step2[33], step2[34]);
+ step1[35] = vqsubq_s16(step2[32], step2[35]);
+ step1[36] = vqsubq_s16(step2[39], step2[36]);
+ step1[37] = vqsubq_s16(step2[38], step2[37]);
+ step1[38] = vqaddq_s16(step2[38], step2[37]);
+ step1[39] = vqaddq_s16(step2[39], step2[36]);
+ step1[40] = vqaddq_s16(step2[40], step2[43]);
+ step1[41] = vqaddq_s16(step2[41], step2[42]);
+ step1[42] = vqsubq_s16(step2[41], step2[42]);
+ step1[43] = vqsubq_s16(step2[40], step2[43]);
+ step1[44] = vqsubq_s16(step2[47], step2[44]);
+ step1[45] = vqsubq_s16(step2[46], step2[45]);
+ step1[46] = vqaddq_s16(step2[46], step2[45]);
+ step1[47] = vqaddq_s16(step2[47], step2[44]);
+ step1[48] = vqaddq_s16(step2[48], step2[51]);
+ step1[49] = vqaddq_s16(step2[49], step2[50]);
+ step1[50] = vqsubq_s16(step2[49], step2[50]);
+ step1[51] = vqsubq_s16(step2[48], step2[51]);
+ step1[52] = vqsubq_s16(step2[55], step2[52]);
+ step1[53] = vqsubq_s16(step2[54], step2[53]);
+ step1[54] = vqaddq_s16(step2[54], step2[53]);
+ step1[55] = vqaddq_s16(step2[55], step2[52]);
+ step1[56] = vqaddq_s16(step2[56], step2[59]);
+ step1[57] = vqaddq_s16(step2[57], step2[58]);
+ step1[58] = vqsubq_s16(step2[57], step2[58]);
+ step1[59] = vqsubq_s16(step2[56], step2[59]);
+ step1[60] = vqsubq_s16(step2[63], step2[60]);
+ step1[61] = vqsubq_s16(step2[62], step2[61]);
+ step1[62] = vqaddq_s16(step2[62], step2[61]);
+ step1[63] = vqaddq_s16(step2[63], step2[60]);
+
+ // stage 6
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[19]);
+ step2[17] = vqaddq_s16(step1[17], step1[18]);
+ step2[18] = vqsubq_s16(step1[17], step1[18]);
+ step2[19] = vqsubq_s16(step1[16], step1[19]);
+ step2[20] = vqsubq_s16(step1[23], step1[20]);
+ step2[21] = vqsubq_s16(step1[22], step1[21]);
+ step2[22] = vqaddq_s16(step1[22], step1[21]);
+ step2[23] = vqaddq_s16(step1[23], step1[20]);
+ step2[24] = vqaddq_s16(step1[24], step1[27]);
+ step2[25] = vqaddq_s16(step1[25], step1[26]);
+ step2[26] = vqsubq_s16(step1[25], step1[26]);
+ step2[27] = vqsubq_s16(step1[24], step1[27]);
+ step2[28] = vqsubq_s16(step1[31], step1[28]);
+ step2[29] = vqsubq_s16(step1[30], step1[29]);
+ step2[30] = vqaddq_s16(step1[30], step1[29]);
+ step2[31] = vqaddq_s16(step1[31], step1[28]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
+
+// DC-only variant of the 64-point inverse DCT: input[0] is the only
+// coefficient read, and the same scaled vector is stored to every output.
+static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x8_t dc = input[0];
+
+ // Stages 1-6 collapse to one widening multiply by cospi[32] per half,
+ // followed by a rounding narrowing shift by INV_COS_BIT.
+ const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(dc), cospi[32]);
+ const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(dc), cospi[32]);
+ const int16x8_t scaled = vcombine_s16(vrshrn_n_s32(wide_lo, INV_COS_BIT),
+ vrshrn_n_s32(wide_hi, INV_COS_BIT));
+
+ // Stages 7-11 only replicate that value across all 64 outputs.
+ for (int i = 0; i < 64; ++i) {
+   out[i] = scaled;
+ }
+}
+
+static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit) { /*
+ * 64-point inverse DCT for a reduced input: only in[0]..in[7] are read
+ * (presumably the caller guarantees the other coefficients are zero -- TODO
+ * confirm). All of out[0]..out[63] are written.
+ *
+ * in: source vectors; out: 64 result vectors; cos_bit: selects the cosine
+ * table through cospi_arr() and is forwarded to the stage 9/10 helpers.
+ *
+ * step1[]/step2[] alternate as source and destination from stage to stage.
+ * c0..c3 pack cospi[] entries for the btf_16_lane_*_neon helpers; c4 holds
+ * the negated cospi[36], [28], [52], [12], and c5/c6 are c2/c3 negated.
+ * Where one source is stored into two slots no add/sub is done, presumably
+ * because the butterfly partner is zero for this input (TODO confirm).
+ */
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];  // ping-pong stage buffers
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1 (no code of its own; the inputs are placed directly in stage 2)
+ // stage 2: load the even inputs; btf_16_neon turns each odd input into a pair
+
+ step2[0] = in[0];
+ step2[8] = in[4];
+ step2[16] = in[2];
+ step2[24] = in[6];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3: expand step2[16] and step2[24]; duplicate each stage-2 pair member
+
+ step1[0] = step2[0];
+ step1[8] = step2[8];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+
+ step1[32] = step2[32];
+ step1[33] = step2[32];
+ step1[38] = step2[39];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[40];
+ step1[46] = step2[47];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[48];
+ step1[54] = step2[55];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[56];
+ step1[62] = step2[63];
+ step1[63] = step2[63];
+
+ // stage 4: expand step1[8]; lane butterflies on (62,33) (38,57) (54,41) (46,49)
+
+ step2[0] = step1[0];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);
+
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[63] = step1[63];
+
+ // stage 5: lane butterflies on (30,17) (22,25); mirror 32..63 in groups of four
+
+ step1[0] = step2[0];
+
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);
+
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ step1[16] = step2[16];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[31] = step2[31];
+ step1[32] = step2[32];
+ step1[33] = step2[33];
+ step1[34] = step2[33];
+ step1[35] = step2[32];
+ step1[36] = step2[39];
+ step1[37] = step2[38];
+ step1[38] = step2[38];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[41];
+ step1[42] = step2[41];
+ step1[43] = step2[40];
+ step1[44] = step2[47];
+ step1[45] = step2[46];
+ step1[46] = step2[46];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[49];
+ step1[50] = step2[49];
+ step1[51] = step2[48];
+ step1[52] = step2[55];
+ step1[53] = step2[54];
+ step1[54] = step2[54];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[57];
+ step1[58] = step2[57];
+ step1[59] = step2[56];
+ step1[60] = step2[63];
+ step1[61] = step2[62];
+ step1[62] = step2[62];
+ step1[63] = step2[63];
+
+ // stage 6: step1[0] feeds step2[0] and step2[1]; mirror 16..31 in groups of four
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[17];
+ step2[19] = step1[16];
+ step2[20] = step1[23];
+ step2[21] = step1[22];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[26] = step1[25];
+ step2[27] = step1[24];
+ step2[28] = step1[31];
+ step2[29] = step1[30];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7: mirror 0..3 and 8..15; saturating add/sub over 32..63 in eights
+
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[10] = step2[9];
+ step1[11] = step2[8];
+ step1[12] = step2[15];
+ step1[13] = step2[14];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8: mirror 0..3 into 4..7; saturating add/sub over 16..31 in eights
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[3];
+ step2[5] = step1[2];
+ step2[6] = step1[1];
+ step2[7] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9: helper call; the data flow implies it reads step2 and fills step1
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10: helper call; the data flow implies it reads step1 and fills step2
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11: for i < 32, out[i] / out[63 - i] = step2[i] +/- step2[63 - i]
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
+
+static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step2[64], step1[64];
+
+ const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
+ (int16_t)cospi[36], (int16_t)cospi[28]);
+ const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
+ (int16_t)cospi[52], (int16_t)cospi[12]);
+ const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
+ (int16_t)cospi[40], (int16_t)cospi[24]);
+ const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+ (int16_t)cospi[16], (int16_t)cospi[48]);
+ const int16x4_t c4 =
+ set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
+ (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
+ const int16x4_t c5 =
+ set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
+ (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
+ const int16x4_t c6 =
+ set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
+ (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
+ const int16x4_t c7 =
+ set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
+ (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[4] = in[8];
+ step2[8] = in[4];
+ step2[12] = in[12];
+ step2[16] = in[2];
+ step2[20] = in[10];
+ step2[24] = in[6];
+ step2[28] = in[14];
+
+ btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
+ btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
+ btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
+ btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
+ btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
+ btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
+ btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
+ btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
+
+ // stage 3
+
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+ step1[8] = step2[8];
+ step1[12] = step2[12];
+
+ btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
+ btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
+ btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
+ btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
+
+ step1[32] = step2[32];
+ step1[33] = step2[32];
+ step1[34] = step2[35];
+ step1[35] = step2[35];
+ step1[36] = step2[36];
+ step1[37] = step2[36];
+ step1[38] = step2[39];
+ step1[39] = step2[39];
+ step1[40] = step2[40];
+ step1[41] = step2[40];
+ step1[42] = step2[43];
+ step1[43] = step2[43];
+ step1[44] = step2[44];
+ step1[45] = step2[44];
+ step1[46] = step2[47];
+ step1[47] = step2[47];
+ step1[48] = step2[48];
+ step1[49] = step2[48];
+ step1[50] = step2[51];
+ step1[51] = step2[51];
+ step1[52] = step2[52];
+ step1[53] = step2[52];
+ step1[54] = step2[55];
+ step1[55] = step2[55];
+ step1[56] = step2[56];
+ step1[57] = step2[56];
+ step1[58] = step2[59];
+ step1[59] = step2[59];
+ step1[60] = step2[60];
+ step1[61] = step2[60];
+ step1[62] = step2[63];
+ step1[63] = step2[63];
+
+ // stage 4
+
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+
+ btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
+ btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
+ btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
+ btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
+ btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
+ btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
+ btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
+ btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
+
+ step2[16] = step1[16];
+ step2[17] = step1[16];
+ step2[18] = step1[19];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[21] = step1[20];
+ step2[22] = step1[23];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[24];
+ step2[26] = step1[27];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[29] = step1[28];
+ step2[30] = step1[31];
+ step2[31] = step1[31];
+ step2[32] = step1[32];
+ step2[35] = step1[35];
+ step2[36] = step1[36];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[43] = step1[43];
+ step2[44] = step1[44];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[51] = step1[51];
+ step2[52] = step1[52];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[59] = step1[59];
+ step2[60] = step1[60];
+ step2[63] = step1[63];
+
+ // stage 5
+
+ step1[0] = step2[0];
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
+ btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
+ btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
+ btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
+
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+ step1[16] = step2[16];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[35]);
+ step1[33] = vqaddq_s16(step2[33], step2[34]);
+ step1[34] = vqsubq_s16(step2[33], step2[34]);
+ step1[35] = vqsubq_s16(step2[32], step2[35]);
+ step1[36] = vqsubq_s16(step2[39], step2[36]);
+ step1[37] = vqsubq_s16(step2[38], step2[37]);
+ step1[38] = vqaddq_s16(step2[38], step2[37]);
+ step1[39] = vqaddq_s16(step2[39], step2[36]);
+ step1[40] = vqaddq_s16(step2[40], step2[43]);
+ step1[41] = vqaddq_s16(step2[41], step2[42]);
+ step1[42] = vqsubq_s16(step2[41], step2[42]);
+ step1[43] = vqsubq_s16(step2[40], step2[43]);
+ step1[44] = vqsubq_s16(step2[47], step2[44]);
+ step1[45] = vqsubq_s16(step2[46], step2[45]);
+ step1[46] = vqaddq_s16(step2[46], step2[45]);
+ step1[47] = vqaddq_s16(step2[47], step2[44]);
+ step1[48] = vqaddq_s16(step2[48], step2[51]);
+ step1[49] = vqaddq_s16(step2[49], step2[50]);
+ step1[50] = vqsubq_s16(step2[49], step2[50]);
+ step1[51] = vqsubq_s16(step2[48], step2[51]);
+ step1[52] = vqsubq_s16(step2[55], step2[52]);
+ step1[53] = vqsubq_s16(step2[54], step2[53]);
+ step1[54] = vqaddq_s16(step2[54], step2[53]);
+ step1[55] = vqaddq_s16(step2[55], step2[52]);
+ step1[56] = vqaddq_s16(step2[56], step2[59]);
+ step1[57] = vqaddq_s16(step2[57], step2[58]);
+ step1[58] = vqsubq_s16(step2[57], step2[58]);
+ step1[59] = vqsubq_s16(step2[56], step2[59]);
+ step1[60] = vqsubq_s16(step2[63], step2[60]);
+ step1[61] = vqsubq_s16(step2[62], step2[61]);
+ step1[62] = vqaddq_s16(step2[62], step2[61]);
+ step1[63] = vqaddq_s16(step2[63], step2[60]);
+
+ // stage 6
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
+ btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
+ btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
+ btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
+ btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
+ btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
+ btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
+ btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
+ btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
+
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[19]);
+ step2[17] = vqaddq_s16(step1[17], step1[18]);
+ step2[18] = vqsubq_s16(step1[17], step1[18]);
+ step2[19] = vqsubq_s16(step1[16], step1[19]);
+ step2[20] = vqsubq_s16(step1[23], step1[20]);
+ step2[21] = vqsubq_s16(step1[22], step1[21]);
+ step2[22] = vqaddq_s16(step1[22], step1[21]);
+ step2[23] = vqaddq_s16(step1[23], step1[20]);
+ step2[24] = vqaddq_s16(step1[24], step1[27]);
+ step2[25] = vqaddq_s16(step1[25], step1[26]);
+ step2[26] = vqsubq_s16(step1[25], step1[26]);
+ step2[27] = vqsubq_s16(step1[24], step1[27]);
+ step2[28] = vqsubq_s16(step1[31], step1[28]);
+ step2[29] = vqsubq_s16(step1[30], step1[29]);
+ step2[30] = vqaddq_s16(step1[30], step1[29]);
+ step2[31] = vqaddq_s16(step1[31], step1[28]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[38] = step1[38];
+ step2[39] = step1[39];
+ step2[40] = step1[40];
+ step2[41] = step1[41];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[54] = step1[54];
+ step2[55] = step1[55];
+ step2[56] = step1[56];
+ step2[57] = step1[57];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+ btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
+ btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
+ btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
+ btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+ step1[32] = vqaddq_s16(step2[32], step2[39]);
+ step1[33] = vqaddq_s16(step2[33], step2[38]);
+ step1[34] = vqaddq_s16(step2[34], step2[37]);
+ step1[35] = vqaddq_s16(step2[35], step2[36]);
+ step1[36] = vqsubq_s16(step2[35], step2[36]);
+ step1[37] = vqsubq_s16(step2[34], step2[37]);
+ step1[38] = vqsubq_s16(step2[33], step2[38]);
+ step1[39] = vqsubq_s16(step2[32], step2[39]);
+ step1[40] = vqsubq_s16(step2[47], step2[40]);
+ step1[41] = vqsubq_s16(step2[46], step2[41]);
+ step1[42] = vqsubq_s16(step2[45], step2[42]);
+ step1[43] = vqsubq_s16(step2[44], step2[43]);
+ step1[44] = vqaddq_s16(step2[43], step2[44]);
+ step1[45] = vqaddq_s16(step2[42], step2[45]);
+ step1[46] = vqaddq_s16(step2[41], step2[46]);
+ step1[47] = vqaddq_s16(step2[40], step2[47]);
+ step1[48] = vqaddq_s16(step2[48], step2[55]);
+ step1[49] = vqaddq_s16(step2[49], step2[54]);
+ step1[50] = vqaddq_s16(step2[50], step2[53]);
+ step1[51] = vqaddq_s16(step2[51], step2[52]);
+ step1[52] = vqsubq_s16(step2[51], step2[52]);
+ step1[53] = vqsubq_s16(step2[50], step2[53]);
+ step1[54] = vqsubq_s16(step2[49], step2[54]);
+ step1[55] = vqsubq_s16(step2[48], step2[55]);
+ step1[56] = vqsubq_s16(step2[63], step2[56]);
+ step1[57] = vqsubq_s16(step2[62], step2[57]);
+ step1[58] = vqsubq_s16(step2[61], step2[58]);
+ step1[59] = vqsubq_s16(step2[60], step2[59]);
+ step1[60] = vqaddq_s16(step2[59], step2[60]);
+ step1[61] = vqaddq_s16(step2[58], step2[61]);
+ step1[62] = vqaddq_s16(step2[57], step2[62]);
+ step1[63] = vqaddq_s16(step2[56], step2[63]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+ btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
+ btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
+ btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
+ btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
+ btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
+ btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
+ btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
+ btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+ step2[16] = vqaddq_s16(step1[16], step1[23]);
+ step2[17] = vqaddq_s16(step1[17], step1[22]);
+ step2[18] = vqaddq_s16(step1[18], step1[21]);
+ step2[19] = vqaddq_s16(step1[19], step1[20]);
+ step2[20] = vqsubq_s16(step1[19], step1[20]);
+ step2[21] = vqsubq_s16(step1[18], step1[21]);
+ step2[22] = vqsubq_s16(step1[17], step1[22]);
+ step2[23] = vqsubq_s16(step1[16], step1[23]);
+ step2[24] = vqsubq_s16(step1[31], step1[24]);
+ step2[25] = vqsubq_s16(step1[30], step1[25]);
+ step2[26] = vqsubq_s16(step1[29], step1[26]);
+ step2[27] = vqsubq_s16(step1[28], step1[27]);
+ step2[28] = vqaddq_s16(step1[28], step1[27]);
+ step2[29] = vqaddq_s16(step1[29], step1[26]);
+ step2[30] = vqaddq_s16(step1[30], step1[25]);
+ step2[31] = vqaddq_s16(step1[31], step1[24]);
+ step2[32] = step1[32];
+ step2[33] = step1[33];
+ step2[34] = step1[34];
+ step2[35] = step1[35];
+ step2[44] = step1[44];
+ step2[45] = step1[45];
+ step2[46] = step1[46];
+ step2[47] = step1[47];
+ step2[48] = step1[48];
+ step2[49] = step1[49];
+ step2[50] = step1[50];
+ step2[51] = step1[51];
+ step2[60] = step1[60];
+ step2[61] = step1[61];
+ step2[62] = step1[62];
+ step2[63] = step1[63];
+
+ // stage 9
+ idct64_stage9_neon(step2, step1, cos_bit);
+
+ // stage 10
+ idct64_stage10_neon(step1, step2, cos_bit);
+
+ // stage 11
+
+ out[0] = vqaddq_s16(step2[0], step2[63]);
+ out[1] = vqaddq_s16(step2[1], step2[62]);
+ out[2] = vqaddq_s16(step2[2], step2[61]);
+ out[3] = vqaddq_s16(step2[3], step2[60]);
+ out[4] = vqaddq_s16(step2[4], step2[59]);
+ out[5] = vqaddq_s16(step2[5], step2[58]);
+ out[6] = vqaddq_s16(step2[6], step2[57]);
+ out[7] = vqaddq_s16(step2[7], step2[56]);
+ out[8] = vqaddq_s16(step2[8], step2[55]);
+ out[9] = vqaddq_s16(step2[9], step2[54]);
+ out[10] = vqaddq_s16(step2[10], step2[53]);
+ out[11] = vqaddq_s16(step2[11], step2[52]);
+ out[12] = vqaddq_s16(step2[12], step2[51]);
+ out[13] = vqaddq_s16(step2[13], step2[50]);
+ out[14] = vqaddq_s16(step2[14], step2[49]);
+ out[15] = vqaddq_s16(step2[15], step2[48]);
+ out[16] = vqaddq_s16(step2[16], step2[47]);
+ out[17] = vqaddq_s16(step2[17], step2[46]);
+ out[18] = vqaddq_s16(step2[18], step2[45]);
+ out[19] = vqaddq_s16(step2[19], step2[44]);
+ out[20] = vqaddq_s16(step2[20], step2[43]);
+ out[21] = vqaddq_s16(step2[21], step2[42]);
+ out[22] = vqaddq_s16(step2[22], step2[41]);
+ out[23] = vqaddq_s16(step2[23], step2[40]);
+ out[24] = vqaddq_s16(step2[24], step2[39]);
+ out[25] = vqaddq_s16(step2[25], step2[38]);
+ out[26] = vqaddq_s16(step2[26], step2[37]);
+ out[27] = vqaddq_s16(step2[27], step2[36]);
+ out[28] = vqaddq_s16(step2[28], step2[35]);
+ out[29] = vqaddq_s16(step2[29], step2[34]);
+ out[30] = vqaddq_s16(step2[30], step2[33]);
+ out[31] = vqaddq_s16(step2[31], step2[32]);
+ out[32] = vqsubq_s16(step2[31], step2[32]);
+ out[33] = vqsubq_s16(step2[30], step2[33]);
+ out[34] = vqsubq_s16(step2[29], step2[34]);
+ out[35] = vqsubq_s16(step2[28], step2[35]);
+ out[36] = vqsubq_s16(step2[27], step2[36]);
+ out[37] = vqsubq_s16(step2[26], step2[37]);
+ out[38] = vqsubq_s16(step2[25], step2[38]);
+ out[39] = vqsubq_s16(step2[24], step2[39]);
+ out[40] = vqsubq_s16(step2[23], step2[40]);
+ out[41] = vqsubq_s16(step2[22], step2[41]);
+ out[42] = vqsubq_s16(step2[21], step2[42]);
+ out[43] = vqsubq_s16(step2[20], step2[43]);
+ out[44] = vqsubq_s16(step2[19], step2[44]);
+ out[45] = vqsubq_s16(step2[18], step2[45]);
+ out[46] = vqsubq_s16(step2[17], step2[46]);
+ out[47] = vqsubq_s16(step2[16], step2[47]);
+ out[48] = vqsubq_s16(step2[15], step2[48]);
+ out[49] = vqsubq_s16(step2[14], step2[49]);
+ out[50] = vqsubq_s16(step2[13], step2[50]);
+ out[51] = vqsubq_s16(step2[12], step2[51]);
+ out[52] = vqsubq_s16(step2[11], step2[52]);
+ out[53] = vqsubq_s16(step2[10], step2[53]);
+ out[54] = vqsubq_s16(step2[9], step2[54]);
+ out[55] = vqsubq_s16(step2[8], step2[55]);
+ out[56] = vqsubq_s16(step2[7], step2[56]);
+ out[57] = vqsubq_s16(step2[6], step2[57]);
+ out[58] = vqsubq_s16(step2[5], step2[58]);
+ out[59] = vqsubq_s16(step2[4], step2[59]);
+ out[60] = vqsubq_s16(step2[3], step2[60]);
+ out[61] = vqsubq_s16(step2[2], step2[61]);
+ out[62] = vqsubq_s16(step2[1], step2[62]);
+ out[63] = vqsubq_s16(step2[0], step2[63]);
+}
+
+// 1-D NEON kernels for blocks whose eob lies at DC or within the top-left 8x8,
+// 16x16 or 32x32 corner; looked up as [size index][1-D type index][extent index].
+static const transform_neon
+ lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {  // first size index: no NEON kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8_low1_neon, idct8_neon, NULL, NULL },  // 8-point: low1, full
+ { iadst8_low1_neon, iadst8_neon, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },  // third 1-D type has no kernel in this table
+ {
+ { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },  // 16-point: low1, low8, full
+ { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },  // 32-point: DCT only
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,  // 64-point: DCT only
+ idct64_low32_neon },  // last slot is the low32 variant, not a full 64-input kernel
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+/* Inverse 2-D transform + add for the IDTX case: both passes call
+ * identity_txfm_round_neon(), with a per-8x8 transpose in between.
+ *
+ * input:   32-bit coefficients; the pointer advances by 8 elements per load
+ *          iteration and loads use tx_size_high[tx_size] as the stride.
+ * output:  8-bit destination that the residual is added into; `stride` is
+ *          forwarded to the lowbd_add_flip_buffer_* helpers.
+ * tx_type: ignored.
+ * eob:     fed to get_eobx_eoby_scan_default() to bound how much is loaded.
+ *
+ * Only widths of 8 and >= 16 have a store path here; other widths add nothing.
+ */
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)tx_type;
+ int16x8_t a[32 * 4];  // first-pass scratch, 128 vectors
+ int16x8_t b[32 * 4];  // transposed data for the second pass, 128 vectors
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),  // (cols * rows) / 8 vectors
+ 0);
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;  // number of 8-wide column strips
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // eoby / 8 + 1 load iterations
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;  // eobx + 1 rounded up to a multiple of 8
+ const int input_stride = txfm_size_row;
+ int temp_b = 0;  // offset inside each strip of b; grows by 8 per iteration
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ int16x8_t *cur_a = &a[i * txfm_size_col];
+ load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
+ buf_size_nonzero_w);
+ input += 8;
+ if (abs(rect_type) == 1) {  // NOTE(review): presumably the 2:1 / 1:2 rectangular sizes - confirm
+ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
+ }
+ identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);  // strip j of b starts at j * txfm_size_row
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],  // second pass, in place, one strip at a time
+ txh_idx, txfm_size_row, -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);  // two strips per 16 output columns; flip argument is 0
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+/* Inverse 2-D transform + add where the first pass runs a kernel picked from
+ * lowbd_txfm_all_1d_zeros_w_arr and the second pass is
+ * identity_txfm_round_neon(). The dispatcher in this file calls it for
+ * H_DCT, H_ADST and H_FLIPADST.
+ *
+ * input:   32-bit coefficients; advanced by 8 elements per load iteration.
+ * output:  8-bit destination that the residual is added into.
+ * tx_type: selects the first-pass kernel (via hitx_1d_tab) and the flip flags.
+ * eob:     fed to get_eobx_eoby_scan_v_identity() to bound the work done.
+ *
+ * lr_flip is applied while transposing; the add helpers are always called
+ * with a flip argument of 0. Only widths of 8 and >= 16 have a store path.
+ */
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];  // first-pass scratch; not cleared in this function
+ int16x8_t b[16 * 2];  // transposed data for the second pass
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),  // (cols * rows) / 8 vectors
+ 0);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;  // number of 8-wide column strips
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // eoby / 8 + 1 load iterations
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;  // eobx + 1 rounded up to a multiple of 8
+ const int input_stride = txfm_size_row;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];  // picks the kernel variant from eobx
+ int temp_b = 0;  // offset inside each strip of b; grows by 8 per iteration
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+ assert(row_txfm != NULL);  // the table holds NULL for combinations without a kernel
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ int16x8_t *cur_a = &a[i * txfm_size_col];
+ load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
+ buf_size_nonzero_w);
+ input += 8;
+ if (abs(rect_type) == 1) {  // NOTE(review): presumably the 2:1 / 1:2 rectangular sizes - confirm
+ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
+ }
+ row_txfm(cur_a, cur_a, INV_COS_BIT);
+ av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ flip_buf_ud_neon(&cur_a[j * 8], 8);  // reverse the 8 vectors of this strip before transposing
+ transpose_arrays_s16_8x8(
+ &cur_a[j * 8],
+ &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);  // strips are written in mirrored order
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],  // second pass, in place, one strip at a time
+ txh_idx, txfm_size_row, -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);  // ud_flip is not forwarded; literal 0 is passed
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+/* Inverse 2-D transform + add where the first pass is
+ * identity_txfm_round_neon() and the second pass runs a kernel picked from
+ * lowbd_txfm_all_1d_zeros_w_arr. The dispatcher in this file calls it for
+ * V_DCT, V_ADST and V_FLIPADST.
+ *
+ * input:   32-bit coefficients; advanced by 8 elements per load iteration.
+ * output:  8-bit destination that the residual is added into.
+ * tx_type: selects the second-pass kernel (via vitx_1d_tab) and the flip flags.
+ * eob:     fed to get_eobx_eoby_scan_h_identity() to bound the work done.
+ *
+ * ud_flip is forwarded to the add helpers; lr_flip is fetched but not used
+ * afterwards. Only widths of 8 and >= 16 have a store path.
+ */
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];  // first-pass scratch, cleared below
+ int16x8_t b[16 * 2];  // transposed data for the second pass; not cleared in this function
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),  // (cols * rows) / 8 vectors
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;  // number of 8-wide column strips
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // eoby / 8 + 1 load iterations
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;  // eobx + 1 rounded up to a multiple of 8
+ const int input_stride = txfm_size_row;
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];  // picks the kernel variant from eoby
+ int temp_b = 0;  // offset inside each strip of b; grows by 8 per iteration
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);  // the table holds NULL for combinations without a kernel
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ int16x8_t *cur_a = &a[i * txfm_size_col];
+ load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
+ buf_size_nonzero_w);
+ input += 8;
+ if (abs(rect_type) == 1) {  // NOTE(review): presumably the 2:1 / 1:2 rectangular sizes - confirm
+ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
+ }
+ identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);  // strip j of b starts at j * txfm_size_row
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);  // second pass, in place, one strip at a time
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],  // two strips per 16 output columns
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+// 4x4 inverse 2-D transform + add, done with the scalar 1-D kernels from
+// lowbd_txfm_all_1d_arr. `eob` is unused. The residual is clipped with
+// clip_pixel() and written back into `output` (row pitch `stride`).
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+                                                 uint8_t *output, int stride,
+                                                 TX_TYPE tx_type, int eob) {
+  (void)eob;
+  TX_SIZE size = TX_4X4;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
+
+  const int width = tx_size_wide[size];
+  const int height = tx_size_high[size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[size];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[get_txw_idx(size)][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[get_txh_idx(size)][vitx_1d_tab[tx_type]];
+
+  // Scratch layout inside txfm_buf: [1-D input | 1-D output | row-pass result].
+  const int line_len = AOMMAX(height, width);
+  int32_t *const line_in = txfm_buf;
+  int32_t *const line_out = line_in + line_len;
+  int32_t *const rows = line_out + line_len;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: gather one row of coefficients and transform it into `rows`.
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) line_in[x] = input[x * height + y];
+    row_txfm(line_in, rows + y * width, INV_COS_BIT, stage_range);
+  }
+
+  // Column pass: transform each column and accumulate into the destination.
+  for (int x = 0; x < width; ++x) {
+    // With a left/right flip, output column x is fed from the mirrored column.
+    const int src_x = (lr_flip == 0) ? x : (width - x - 1);
+    for (int y = 0; y < height; ++y) line_in[y] = rows[y * width + src_x];
+
+    clamp_buf(line_in, height, 16);
+    col_txfm(line_in, line_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(line_out, height, -shift[1]);
+
+    for (int y = 0; y < height; ++y) {
+      // With an up/down flip, output row y takes the mirrored residual sample.
+      const int src_y = (ud_flip == 0) ? y : (height - y - 1);
+      uint8_t *const px = &output[y * stride + x];
+      *px = clip_pixel(*px + line_out[src_y]);
+    }
+  }
+}
+
+// 4x8 inverse 2-D transform + add, done with the scalar 1-D kernels from
+// lowbd_txfm_all_1d_arr. Each coefficient is scaled by NewInvSqrt2 (rounded by
+// NewSqrt2Bits) before the row pass. `eob` is unused.
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, int eob) {
+  (void)eob;
+  TX_SIZE size = TX_4X8;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+
+  const int width = tx_size_wide[size];
+  const int height = tx_size_high[size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[size];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+                                                   16, 16, 16, 16 };
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[get_txw_idx(size)][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[get_txh_idx(size)][vitx_1d_tab[tx_type]];
+
+  // Scratch layout inside txfm_buf: [1-D input | 1-D output | row-pass result].
+  const int line_len = AOMMAX(height, width);
+  int32_t *const line_in = txfm_buf;
+  int32_t *const line_out = line_in + line_len;
+  int32_t *const rows = line_out + line_len;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: gather and pre-scale one row, then transform it into `rows`.
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      line_in[x] = round_shift((int64_t)input[x * height + y] * NewInvSqrt2,
+                               NewSqrt2Bits);
+    }
+    row_txfm(line_in, rows + y * width, INV_COS_BIT, stage_range);
+  }
+
+  // Column pass: transform each column and accumulate into the destination.
+  for (int x = 0; x < width; ++x) {
+    // With a left/right flip, output column x is fed from the mirrored column.
+    const int src_x = (lr_flip == 0) ? x : (width - x - 1);
+    for (int y = 0; y < height; ++y) line_in[y] = rows[y * width + src_x];
+
+    clamp_buf(line_in, height, 16);
+    col_txfm(line_in, line_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(line_out, height, -shift[1]);
+
+    for (int y = 0; y < height; ++y) {
+      // With an up/down flip, output row y takes the mirrored residual sample.
+      const int src_y = (ud_flip == 0) ? y : (height - y - 1);
+      uint8_t *const px = &output[y * stride + x];
+      *px = clip_pixel(*px + line_out[src_y]);
+    }
+  }
+}
+
+// 8x4 inverse 2-D transform + add, done with the scalar 1-D kernels from
+// lowbd_txfm_all_1d_arr. Each coefficient is scaled by NewInvSqrt2 (rounded by
+// NewSqrt2Bits) before the row pass. `eob` is unused.
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, int eob) {
+  (void)eob;
+  TX_SIZE size = TX_8X4;
+  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+
+  const int width = tx_size_wide[size];
+  const int height = tx_size_high[size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[size];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
+                                                   16, 16, 16, 16 };
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[get_txw_idx(size)][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[get_txh_idx(size)][vitx_1d_tab[tx_type]];
+
+  // Scratch layout inside txfm_buf: [1-D input | 1-D output | row-pass result].
+  const int line_len = AOMMAX(height, width);
+  int32_t *const line_in = txfm_buf;
+  int32_t *const line_out = line_in + line_len;
+  int32_t *const rows = line_out + line_len;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: gather and pre-scale one row, then transform it into `rows`.
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      line_in[x] = round_shift((int64_t)input[x * height + y] * NewInvSqrt2,
+                               NewSqrt2Bits);
+    }
+    row_txfm(line_in, rows + y * width, INV_COS_BIT, stage_range);
+  }
+
+  // Column pass: transform each column and accumulate into the destination.
+  for (int x = 0; x < width; ++x) {
+    // With a left/right flip, output column x is fed from the mirrored column.
+    const int src_x = (lr_flip == 0) ? x : (width - x - 1);
+    for (int y = 0; y < height; ++y) line_in[y] = rows[y * width + src_x];
+
+    clamp_buf(line_in, height, 16);
+    col_txfm(line_in, line_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(line_out, height, -shift[1]);
+
+    for (int y = 0; y < height; ++y) {
+      // With an up/down flip, output row y takes the mirrored residual sample.
+      const int src_y = (ud_flip == 0) ? y : (height - y - 1);
+      uint8_t *const px = &output[y * stride + x];
+      *px = clip_pixel(*px + line_out[src_y]);
+    }
+  }
+}
+
+// 4x16 inverse 2-D transform + add, done with the scalar 1-D kernels from
+// lowbd_txfm_all_1d_arr. Row results are rounded by -shift[0], column results
+// by -shift[1]. `eob` is unused.
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type, int eob) {
+  (void)eob;
+  TX_SIZE size = TX_4X16;
+  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+
+  const int width = tx_size_wide[size];
+  const int height = tx_size_high[size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[size];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+                                                   16, 16, 16, 16, 16 };
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[get_txw_idx(size)][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[get_txh_idx(size)][vitx_1d_tab[tx_type]];
+
+  // Scratch layout inside txfm_buf: [1-D input | 1-D output | row-pass result].
+  const int line_len = AOMMAX(height, width);
+  int32_t *const line_in = txfm_buf;
+  int32_t *const line_out = line_in + line_len;
+  int32_t *const rows = line_out + line_len;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: gather one row, transform it, then apply the first-stage shift.
+  for (int y = 0; y < height; ++y) {
+    int32_t *const row_out = rows + y * width;
+    for (int x = 0; x < width; ++x) line_in[x] = input[x * height + y];
+    row_txfm(line_in, row_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(row_out, width, -shift[0]);
+  }
+
+  // Column pass: transform each column and accumulate into the destination.
+  for (int x = 0; x < width; ++x) {
+    // With a left/right flip, output column x is fed from the mirrored column.
+    const int src_x = (lr_flip == 0) ? x : (width - x - 1);
+    for (int y = 0; y < height; ++y) line_in[y] = rows[y * width + src_x];
+
+    clamp_buf(line_in, height, 16);
+    col_txfm(line_in, line_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(line_out, height, -shift[1]);
+
+    for (int y = 0; y < height; ++y) {
+      // With an up/down flip, output row y takes the mirrored residual sample.
+      const int src_y = (ud_flip == 0) ? y : (height - y - 1);
+      uint8_t *const px = &output[y * stride + x];
+      *px = clip_pixel(*px + line_out[src_y]);
+    }
+  }
+}
+
+// 16x4 inverse 2-D transform + add, done with the scalar 1-D kernels from
+// lowbd_txfm_all_1d_arr. Row results are rounded by -shift[0], column results
+// by -shift[1]. `eob` is unused.
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type, int eob) {
+  (void)eob;
+  TX_SIZE size = TX_16X4;
+  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
+
+  const int width = tx_size_wide[size];
+  const int height = tx_size_high[size];
+  const int8_t *shift = av1_inv_txfm_shift_ls[size];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
+                                                   16, 16, 16, 16, 16 };
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_arr[get_txw_idx(size)][hitx_1d_tab[tx_type]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_arr[get_txh_idx(size)][vitx_1d_tab[tx_type]];
+
+  // Scratch layout inside txfm_buf: [1-D input | 1-D output | row-pass result].
+  const int line_len = AOMMAX(height, width);
+  int32_t *const line_in = txfm_buf;
+  int32_t *const line_out = line_in + line_len;
+  int32_t *const rows = line_out + line_len;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: gather one row, transform it, then apply the first-stage shift.
+  for (int y = 0; y < height; ++y) {
+    int32_t *const row_out = rows + y * width;
+    for (int x = 0; x < width; ++x) line_in[x] = input[x * height + y];
+    row_txfm(line_in, row_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(row_out, width, -shift[0]);
+  }
+
+  // Column pass: transform each column and accumulate into the destination.
+  for (int x = 0; x < width; ++x) {
+    // With a left/right flip, output column x is fed from the mirrored column.
+    const int src_x = (lr_flip == 0) ? x : (width - x - 1);
+    for (int y = 0; y < height; ++y) line_in[y] = rows[y * width + src_x];
+
+    clamp_buf(line_in, height, 16);
+    col_txfm(line_in, line_out, INV_COS_BIT, stage_range);
+    av1_round_shift_array(line_out, height, -shift[1]);
+
+    for (int y = 0; y < height; ++y) {
+      // With an up/down flip, output row y takes the mirrored residual sample.
+      const int src_y = (ud_flip == 0) ? y : (height - y - 1);
+      uint8_t *const px = &output[y * stride + x];
+      *px = clip_pixel(*px + line_out[src_y]);
+    }
+  }
+}
+/* Inverse 2-D transform + add where both passes run kernels picked from
+ * lowbd_txfm_all_1d_zeros_w_arr. The dispatcher in this file uses it for
+ * every tx_type that is not routed to one of the identity variants.
+ *
+ * input:   32-bit coefficients; advanced by 8 elements per load iteration and
+ *          loaded with a stride of min(32, tx_size_high[tx_size]).
+ * output:  8-bit destination that the residual is added into.
+ * tx_type: selects both kernels (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * eob:     fed to get_eobx_eoby_scan_default() to bound the work done.
+ *
+ * lr_flip is applied while transposing, ud_flip by the add helpers. Only
+ * widths of 8 and >= 16 have a store path here.
+ */
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[64 * 8];  // first-pass scratch; not cleared in this function
+ int16x8_t b[64 * 8];  // transposed data for the second pass; not cleared either
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;  // number of 8-wide column strips
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // eoby / 8 + 1 load iterations
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;  // eobx + 1 rounded up to a multiple of 8
+ const int input_stride = AOMMIN(32, txfm_size_row);  // capped at 32, unlike the identity variants
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];  // first-pass kernel variant
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];  // second-pass kernel variant
+ int temp_b = 0;  // offset inside each strip of b; grows by 8 per iteration
+
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);  // the table holds NULL for combinations without a kernel
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ int16x8_t *cur_a = &a[i * txfm_size_col];
+ load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
+ buf_size_nonzero_w);
+ input += 8;
+ if (abs(rect_type) == 1) {  // NOTE(review): presumably the 2:1 / 1:2 rectangular sizes - confirm
+ round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
+ }
+ row_txfm(cur_a, cur_a, INV_COS_BIT);
+ av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ flip_buf_ud_neon(&cur_a[j * 8], 8);  // reverse the 8 vectors of this strip before transposing
+ transpose_arrays_s16_8x8(
+ &cur_a[j * 8],
+ &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);  // strips are written in mirrored order
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);  // second pass, in place, one strip at a time
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],  // two strips per 16 output columns
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+// Picks the NEON inverse-transform path for `tx_type`, depending on which of
+// the two 1-D passes (if any) is an identity transform. All arguments are
+// forwarded unchanged to the selected implementation.
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Identity in both directions.
+  if (tx_type == IDTX) {
+    lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+                                   eob);
+    return;
+  }
+
+  // H_* types go to the variant whose second pass is the identity.
+  if (tx_type == H_DCT || tx_type == H_ADST || tx_type == H_FLIPADST) {
+    lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+    return;
+  }
+
+  // V_* types go to the variant whose first pass is the identity.
+  if (tx_type == V_DCT || tx_type == V_ADST || tx_type == V_FLIPADST) {
+    lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+    return;
+  }
+
+  // Everything else uses table-selected kernels for both passes.
+  lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+                                        tx_size, eob);
+}
+
+// Low-bitdepth inverse 2-D transform + add entry point. The five block sizes
+// with a 4-sample dimension have dedicated routines; every other size goes
+// through lowbd_inv_txfm2d_add_universe_neon().
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  if (tx_size == TX_4X4) {
+    lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
+  } else if (tx_size == TX_4X8) {
+    lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
+  } else if (tx_size == TX_8X4) {
+    lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
+  } else if (tx_size == TX_4X16) {
+    lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
+  } else if (tx_size == TX_16X4) {
+    lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
+  } else {
+    lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                       tx_size, eob);
+  }
+}
+// Inverse transform + add driven by a TxfmParam: lossless blocks are handed
+// to av1_inv_txfm_add_c(), everything else to the NEON low-bitdepth path.
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+  av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, txfm_param->tx_type,
+                                txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 0000000000..97099c2042
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+// Function-pointer type for a 1D transform that reads and writes int32_t
+// coefficient arrays.
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+ const int8_t cos_bit,
+ const int8_t *stage_ptr);
+// Function-pointer type for a 1D transform that reads and writes arrays of
+// int16x8_t NEON vectors.
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit);
+
+// Per-transform-size lookup tables consumed by get_eobx_eoby_scan_default().
+// A table is indexed by the row holding the last coefficient, i.e.
+// (eob - 1) >> log2(width). Each 16-bit entry packs two values: the low byte
+// is eobx and the high byte is eoby.
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+// Maps a TX_SIZE value to its eob -> (eobx, eoby) table. NULL slots have no
+// table, so get_eobx_eoby_scan_default() must not be used for those sizes
+// (except with eob == 1, which never reads the table).
+// NOTE(review): the slot order presumably follows the TX_SIZE enum in
+// av1/common/enums.h, with 64-sample dimensions reusing the 32-sample
+// tables -- confirm against the enum.
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+// Buckets a value in 0..31 into one of four classes:
+// 0 -> 0, 1..7 -> 1, 8..15 -> 2, 16..31 -> 3.
+// NOTE(review): presumably indexed by eobx/eoby to pick a 1D kernel variant
+// specialised for the number of non-zero inputs -- confirm at the use sites.
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// log2 of the transform block width, indexed by TX_SIZE, as used when
+// turning an eob into a row index. Widths of 64 are mapped to 32 (log2 = 5).
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+// Rounds an end-of-block coordinate in 0..31 up to the last index of the
+// group it belongs to: 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31.
+// The table is only ever read, so it is const like the other lookup tables
+// in this header; that also stops each including translation unit from
+// carrying a writable copy.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+// Converts an end-of-block count into the last column (*eobx) and last row
+// (*eoby) that must be processed, using the per-size lookup tables.
+//
+//   eobx, eoby: outputs.
+//   tx_size:    transform size; must have a non-NULL entry in
+//               av1_eob_to_eobxy_default unless eob == 1.
+//   eob:        number of coefficients up to and including the last
+//               non-zero one.
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  // A single (DC) coefficient: only position (0, 0) is populated.
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  // Some TX_SIZE slots of the table are NULL; catch misuse in debug builds
+  // instead of dereferencing a null pointer.
+  const int16_t *const eob_to_eobxy = av1_eob_to_eobxy_default[tx_size];
+  assert(eob_to_eobxy != NULL);
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  // Row that contains the last coefficient.
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  // Entry layout: low byte = eobx, high byte = eoby.
+  const int eobxy = eob_to_eobxy[eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+// eob -> (eobx, eoby) for the "v identity" scan case. The row count is
+// capped at 32; *eobx is the zero-based position divided by that row count
+// and *eoby is the position rounded up via eob_fill, saturated at the last
+// row.
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last_pos = eob - 1;
+  const int rows = AOMMIN(32, tx_size_high[tx_size]);
+  const int eoby_max = rows - 1;
+
+  *eobx = last_pos / rows;
+  if (last_pos >= eoby_max) {
+    *eoby = eoby_max;
+  } else {
+    *eoby = eob_fill[last_pos];
+  }
+}
+
+// eob -> (eobx, eoby) for the "h identity" scan case. The column count is
+// capped at 32; *eobx is the zero-based position rounded up via eob_fill
+// (saturated at the last column) and *eoby is the row index of that
+// position, also rounded up via eob_fill.
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last_pos = eob - 1;
+  const int cols = AOMMIN(32, tx_size_wide[tx_size]);
+  const int eobx_max = cols - 1;
+
+  if (last_pos >= eobx_max) {
+    *eobx = eobx_max;
+  } else {
+    *eobx = eob_fill[last_pos];
+  }
+
+  const int temp_eoby = last_pos / cols;
+  assert(temp_eoby < 32);
+  *eoby = eob_fill[temp_eoby];
+}
+
+#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c
new file mode 100644
index 0000000000..f955a379f7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c
@@ -0,0 +1,30 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+// Applies a rounding shift by `bit` to every element of arr, in place, four
+// elements at a time. size must be a multiple of 4; bit == 0 is a no-op.
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (bit == 0) return;
+
+  // vrshlq_s32 shifts left by its (signed) count, so a negated count gives
+  // a rounding right shift by `bit`.
+  const int32x4_t shift = vdupq_n_s32((int32_t)(-bit));
+  for (int i = 0; i < size; i += 4) {
+    int32_t *const p = arr + i;
+    vst1q_s32(p, vrshlq_s32(vld1q_s32(p), shift));
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..7afb1a909d
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -0,0 +1,102 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+// Blends src0 and src1 into dst using a horizontal mask: mask[] is indexed
+// by column and the same mask row is reused for every output row.
+//
+//   dst, dst_stride:    destination and its row stride.
+//   src0/src1, strides: the two sources and their row strides. A source may
+//                       alias dst only if its stride equals dst_stride.
+//   mask:               w mask values, one per column.
+//   w, h:               block size; both are powers of two and >= 2.
+//
+// Widths >= 4, and width 2 with h >= 16, are handled with NEON; the
+// remaining case (w == 2, h < 16) falls back to aom_blend_a64_hmask_c.
+// NOTE(review): the per-pixel formula lives in alpha_blend_a64_u8x8/u8x16
+// (blend_neon.h) and is presumably the usual 6-bit alpha blend -- confirm.
+void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
+                              const uint8_t *src0, uint32_t src0_stride,
+                              const uint8_t *src1, uint32_t src1_stride,
+                              const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 2);
+  assert(w >= 2);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (w > 8) {
+    // Wide blocks: 16 pixels per step; the mask is reloaded per column group.
+    do {
+      int i = 0;
+      do {
+        uint8x16_t m0 = vld1q_u8(mask + i);
+        uint8x16_t s0 = vld1q_u8(src0 + i);
+        uint8x16_t s1 = vld1q_u8(src1 + i);
+
+        uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+        vst1q_u8(dst + i, blend);
+
+        i += 16;
+      } while (i < w);
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 8) {
+    // The mask does not depend on the row, so load it once.
+    const uint8x8_t m0 = vld1_u8(mask);
+    do {
+      uint8x8_t s0 = vld1_u8(src0);
+      uint8x8_t s1 = vld1_u8(src1);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      vst1_u8(dst, blend);
+
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    // Two rows of four pixels share one 8-lane vector, so the mask is
+    // loaded once and (per the helper's name) duplicated for both rows.
+    const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask);
+    do {
+      uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      store_u8x4_strided_x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 16) {
+    // Broadcast the two mask bytes to every 16-bit lane. The load only
+    // reads, so keep the pointer const-qualified.
+    const uint8x8_t m0 =
+        vreinterpret_u8_u16(vld1_dup_u16((const uint16_t *)mask));
+    do {
+      uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      store_u8x2_strided_x2(dst, dst_stride, blend);
+
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, w, h);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
new file mode 100644
index 0000000000..9aea29992a
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -0,0 +1,112 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Blends src0 and src1 into dst using a vertical mask: mask[] is indexed by
+// row and each mask value applies to every pixel of its row.
+//
+//   dst, dst_stride:    destination and its row stride.
+//   src0/src1, strides: the two sources and their row strides. A source may
+//                       alias dst only if its stride equals dst_stride.
+//   mask:               h mask values, one per row.
+//   w, h:               block size; both are powers of two and >= 2.
+//
+// Widths >= 4, and width 2 with h >= 16, are handled with NEON; the
+// remaining case (w == 2, h < 16) falls back to aom_blend_a64_vmask_c.
+// NOTE(review): the per-pixel formula lives in alpha_blend_a64_u8x8/u8x16
+// (blend_neon.h) and is presumably the usual 6-bit alpha blend -- confirm.
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
+                              const uint8_t *src0, uint32_t src0_stride,
+                              const uint8_t *src1, uint32_t src1_stride,
+                              const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 2);
+  assert(w >= 2);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (w > 8) {
+    do {
+      // One mask value per row, broadcast across the 16 lanes.
+      uint8x16_t m0 = vdupq_n_u8(mask[0]);
+      int i = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + i);
+        uint8x16_t s1 = vld1q_u8(src1 + i);
+
+        uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+        vst1q_u8(dst + i, blend);
+
+        i += 16;
+      } while (i < w);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 8) {
+    do {
+      uint8x8_t m0 = vdup_n_u8(mask[0]);
+      uint8x8_t s0 = vld1_u8(src0);
+      uint8x8_t s1 = vld1_u8(src1);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      vst1_u8(dst, blend);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    do {
+      // Two rows per step: build { m[0] x4, m[1] x4 } by narrowing two
+      // broadcast 16-bit vectors into one 8-byte vector.
+      const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]);
+      const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]);
+      const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1));
+      uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_u8x4_strided_x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 16) {
+    do {
+      // Load two mask bytes into lane 0 (read-only, so the pointer stays
+      // const-qualified), then zip the vector with itself so the low lanes
+      // become { m[0], m[0], m[1], m[1] }.
+      uint16x4_t m0 = vdup_n_u16(0);
+      m0 = vld1_lane_u16((const uint16_t *)mask, m0, 0);
+      uint8x8_t m =
+          vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+      uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_u8x2_strided_x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, w, h);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/cdef_block_neon.c b/third_party/aom/av1/common/arm/cdef_block_neon.c
new file mode 100644
index 0000000000..53d3a9f1e0
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cdef_block_neon.c
@@ -0,0 +1,1355 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/cdef_block.h"
+
+// Copies a width x height rectangle of 8-bit samples into a 16-bit buffer,
+// widening each sample. Strides are in elements of the respective buffer.
+// Each row is handled in chunks of 16, then at most one chunk of 8, then a
+// trailing 4 if exactly 4 samples remain; any other remainder is not copied.
+void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ do {
+ const uint8_t *src_ptr = src;
+ uint16_t *dst_ptr = dst;
+
+ int w = 0;
+ while (width - w >= 16) {
+ uint8x16_t row = vld1q_u8(src_ptr + w);
+ // Interleaving the bytes with zeros through a 2-way store writes
+ // { b, 0 } pairs, i.e. 16 zero-extended samples (low byte first).
+ uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
+ vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);
+
+ w += 16;
+ }
+ if (width - w >= 8) {
+ uint8x8_t row = vld1_u8(src_ptr + w);
+ vst1q_u16(dst_ptr + w, vmovl_u8(row));
+ w += 8;
+ }
+ if (width - w == 4) {
+ // Scalar tail for the last four samples.
+ for (int i = w; i < w + 4; i++) {
+ dst_ptr[i] = src_ptr[i];
+ }
+ }
+
+ src += sstride;
+ dst += dstride;
+ } while (--height != 0);
+}
+
+// Copies a width x height rectangle of 16-bit samples. Strides are in
+// elements. Each row is copied eight samples at a time, followed by one
+// 4-sample copy if exactly four samples remain.
+void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
+                                         const uint16_t *src, int sstride,
+                                         int width, int height) {
+  do {
+    int col = 0;
+    for (; width - col >= 8; col += 8) {
+      vst1q_u16(dst + col, vld1q_u16(src + col));
+    }
+    if (width - col == 4) {
+      vst1_u16(dst + col, vld1_u16(src + col));
+    }
+
+    src += sstride;
+    dst += dstride;
+  } while (--height != 0);
+}
+
+// partial A is a 16-bit vector of the form:
+// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+// [0 y1 y2 y3 y4 y5 y6 y7].
+// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+// and const2.
+// The result is returned as four 32-bit partial sums; the caller is
+// responsible for the final horizontal reduction.
+static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
+ int16x8_t partialb,
+ uint32x4_t const1,
+ uint32x4_t const2) {
+ // Reverse partial B.
+ // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
+ uint8x16_t pattern = vreinterpretq_u8_u64(
+ vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
+ vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));
+
+#if AOM_ARCH_AARCH64
+ partialb =
+ vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
+#else
+ // No 128-bit table lookup here: shuffle each 64-bit half with a two-register
+ // table built from the low and high halves of partial B.
+ int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
+ vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
+ int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
+ int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
+ partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
+#endif
+
+ // Square and add the corresponding x and y values.
+ int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
+ cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
+ int32x4_t cost_hi =
+ vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
+ cost_hi =
+ vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));
+
+ // Multiply by constant.
+ uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
+ cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
+ return cost;
+}
+
+// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
+// down-right, 6 is vertical).
+//
+// For each direction the lines are shifted so that we can perform a
+// basic sum on each vector element. For example, direction 5 is "south by
+// southeast", so we need to add the pixels along each line i below:
+//
+//  0  1  2  3  4  5  6  7
+//  0  1  2  3  4  5  6  7
+//  8  0  1  2  3  4  5  6
+//  8  0  1  2  3  4  5  6
+//  9  8  0  1  2  3  4  5
+//  9  8  0  1  2  3  4  5
+// 10  9  8  0  1  2  3  4
+// 10  9  8  0  1  2  3  4
+//
+// For this to fit nicely in vectors, the lines need to be shifted like so:
+//           0  1  2  3  4  5  6  7
+//           0  1  2  3  4  5  6  7
+//        8  0  1  2  3  4  5  6
+//        8  0  1  2  3  4  5  6
+//     9  8  0  1  2  3  4  5
+//     9  8  0  1  2  3  4  5
+// 10  9  8  0  1  2  3  4
+// 10  9  8  0  1  2  3  4
+//
+// In this configuration we can now perform SIMD additions to get the cost
+// along direction 5. Since this won't fit into a single 128-bit vector, we use
+// two of them to compute each half of the new configuration, and pad the empty
+// spaces with zeros. Similar shifting is done for other directions, except
+// direction 6 which is straightforward as it's the vertical direction.
+//
+// The four costs are written to cost[0..3] (directions 4..7 in that order)
+// and also returned as a vector.
+static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
+ uint32_t cost[4]) {
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ // Partial sums for lines 0 and 1.
+ int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
+ int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
+ int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
+ int16x8_t partial5a = vextq_s16(zero, tmp, 3);
+ int16x8_t partial5b = vextq_s16(tmp, zero, 3);
+ int16x8_t partial7a = vextq_s16(zero, tmp, 6);
+ int16x8_t partial7b = vextq_s16(tmp, zero, 6);
+ int16x8_t partial6 = tmp;
+
+ // Partial sums for lines 2 and 3.
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
+ tmp = vaddq_s16(lines[2], lines[3]);
+ partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
+ partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
+ partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
+ partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ // Partial sums for lines 4 and 5.
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
+ tmp = vaddq_s16(lines[4], lines[5]);
+ partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
+ partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
+ partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
+ partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ // Partial sums for lines 6 and 7.
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
+ partial4a = vaddq_s16(partial4a, lines[7]);
+ partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
+ tmp = vaddq_s16(lines[6], lines[7]);
+ partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
+ partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
+ partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
+ partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
+ partial6 = vaddq_s16(partial6, tmp);
+
+ // Per-lane weights. Each vcreate_u64 packs two 32-bit lanes, low lane in
+ // the low 32 bits: const0 = { 840, 420, 280, 210 },
+ // const1 = { 168, 140, 120, 105 }, const2 = { 0, 0, 420, 210 },
+ // const3 = { 140, 105, 105, 105 }.
+ uint32x4_t const0 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
+ vcreate_u64((uint64_t)210 << 32 | 280)));
+ uint32x4_t const1 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
+ vcreate_u64((uint64_t)105 << 32 | 120)));
+ uint32x4_t const2 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
+ uint32x4_t const3 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
+ vcreate_u64((uint64_t)105 << 32 | 105)));
+
+ // Compute costs in terms of partial sums.
+ int32x4_t partial6_s32 =
+ vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
+ partial6_s32 =
+ vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));
+
+ uint32x4_t costs[4];
+ costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
+ costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
+ costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
+ costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);
+
+ // Reduce the four per-direction vectors to one vector of four totals.
+ costs[0] = horizontal_add_4d_u32x4(costs);
+ vst1q_u32(cost, costs[0]);
+ return costs[0];
+}
+
+// Cost helper for the directions whose shifted lines span three vectors
+// (partiala, partialb, partialc). Adjacent 16-bit lanes of each vector are
+// summed pairwise and squared. The squares of partial A and of the
+// byte-shuffled partial C are added together and weighted by const0, while
+// the squares of partial B are weighted by 105. Returns four 32-bit partial
+// sums; the caller performs the final horizontal reduction.
+static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
+ int16x8_t partialb,
+ int16x8_t partialc,
+ uint32x4_t const0) {
+ // Reverse partial c.
+ // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
+ uint8x16_t pattern = vreinterpretq_u8_u64(
+ vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
+ vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));
+
+#if AOM_ARCH_AARCH64
+ partialc =
+ vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
+#else
+ // No 128-bit table lookup here: shuffle each 64-bit half with a two-register
+ // table built from the low and high halves of partial C.
+ int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
+ vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
+ int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
+ int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
+ partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
+#endif
+
+ // Sum adjacent lane pairs, widening to 32 bits.
+ int32x4_t partiala_s32 = vpaddlq_s16(partiala);
+ int32x4_t partialb_s32 = vpaddlq_s16(partialb);
+ int32x4_t partialc_s32 = vpaddlq_s16(partialc);
+
+ // Square each pair sum.
+ partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
+ partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
+ partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);
+
+ partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);
+
+ // cost = 105 * b^2 + const0 * (a^2 + c^2), lane by lane.
+ uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
+ cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
+ return cost;
+}
+
+// This function computes the cost along directions 0, 1, 2, 3. (0 means
+// 45-degree up-right, 2 is horizontal).
+//
+// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
+// lines need three vectors instead of two. For direction 1 for example, we need
+// to compute the sums along the line i below:
+// 0  0  1  1  2  2  3  3
+// 1  1  2  2  3  3  4  4
+// 2  2  3  3  4  4  5  5
+// 3  3  4  4  5  5  6  6
+// 4  4  5  5  6  6  7  7
+// 5  5  6  6  7  7  8  8
+// 6  6  7  7  8  8  9  9
+// 7  7  8  8  9  9 10 10
+//
+// Which means we need the following configuration:
+// 0  0  1  1  2  2  3  3
+//       1  1  2  2  3  3  4  4
+//             2  2  3  3  4  4  5  5
+//                   3  3  4  4  5  5  6  6
+//                         4  4  5  5  6  6  7  7
+//                               5  5  6  6  7  7  8  8
+//                                     6  6  7  7  8  8  9  9
+//                                           7  7  8  8  9  9 10 10
+//
+// Three vectors are needed to compute this, as well as some extra pairwise
+// additions.
+//
+// The four costs are written to cost[0..3] (directions 0..3 in that order)
+// and also returned as a vector.
+static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
+ uint32_t cost[4]) {
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ // Compute diagonal directions (0, 1, 3).
+ // Partial sums for lines 0 and 1.
+ int16x8_t partial0a = lines[0];
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
+ int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
+ int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
+ int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
+ int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
+ partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
+ int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
+ partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));
+
+ // Partial sums for lines 2 and 3.
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
+ partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
+ partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
+ partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
+ partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
+ partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
+ partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
+ partial3b = vaddq_s16(partial3b, lines[3]);
+
+ // Partial sums for lines 4 and 5.
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
+ partial1b = vaddq_s16(partial1b, lines[4]);
+ partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
+ int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
+ partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
+ partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
+ int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
+ partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));
+
+ // Partial sums for lines 6 and 7.
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
+ partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
+ partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
+ partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
+ partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
+ partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
+ partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
+ partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
+ partial3c = vaddq_s16(partial3c, lines[7]);
+
+ // Special case for direction 2 as it's just a sum along each line.
+ int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
+ int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
+ int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
+ int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);
+
+ uint32x4_t partial2a_u32 =
+ vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
+ uint32x4_t partial2b_u32 =
+ vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));
+
+ // Per-lane weights. Each vcreate_u64 packs two 32-bit lanes, low lane in
+ // the low 32 bits: const0 = { 840, 420, 280, 210 },
+ // const1 = { 168, 140, 120, 105 }, const2 = { 420, 210, 140, 105 }.
+ uint32x4_t const0 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
+ vcreate_u64((uint64_t)210 << 32 | 280)));
+ uint32x4_t const1 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
+ vcreate_u64((uint64_t)105 << 32 | 120)));
+ uint32x4_t const2 = vreinterpretq_u32_u64(
+ vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
+ vcreate_u64((uint64_t)105 << 32 | 140)));
+
+ uint32x4_t costs[4];
+ costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
+ costs[1] =
+ fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
+ costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
+ costs[2] = vmulq_n_u32(costs[2], 105);
+ // Note the operand order for direction 3: c, b, a.
+ costs[3] =
+ fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);
+
+ // Reduce the four per-direction vectors to one vector of four totals.
+ costs[0] = horizontal_add_4d_u32x4(costs);
+ vst1q_u32(cost, costs[0]);
+ return costs[0];
+}
+
+// Finds the dominant direction (0..7) of an 8x8 block.
+//
+//   img, stride: top-left sample of the block and its row stride (elements).
+//   var:         output; (best cost - cost of the direction 4 steps away)
+//                >> 10.
+//   coeff_shift: right shift applied to every sample before analysis.
+//
+// Returns the index of the direction with the highest cost; on ties the
+// lowest-numbered direction wins.
+int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ uint32_t cost[8];
+ uint32_t best_cost = 0;
+ int best_dir = 0;
+ int16x8_t lines[8];
+ for (int i = 0; i < 8; i++) {
+ uint16x8_t s = vld1q_u16(&img[i * stride]);
+ // Shift by a negative count == right shift by coeff_shift, then remove
+ // the 128 offset.
+ lines[i] = vreinterpretq_s16_u16(
+ vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
+ }
+
+ // Compute "mostly vertical" directions.
+ uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);
+
+ // Compute "mostly horizontal" directions.
+ uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);
+
+ // Find max cost as well as its index to get best_dir.
+ // The max cost needs to be propagated in the whole vector to find its
+ // position in the original cost vectors cost03 and cost47.
+ uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
+#if AOM_ARCH_AARCH64
+ best_cost = vmaxvq_u32(cost07);
+ uint32x4_t max_cost = vdupq_n_u32(best_cost);
+ uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
+ vreinterpretq_u8_u32(
+ vceqq_u32(max_cost, cost47)) } };
+ // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
+ uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
+ // Get the lowest 8 bit of each 32-bit elements and reverse them.
+ uint8x8_t tbl = vqtbl2_u8(costs, idx);
+ uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
+ // After the reversal direction 0 sits in the most significant byte, so the
+ // number of leading zero bytes is the lowest matching direction.
+ best_dir = aom_clzll(a) >> 3;
+#else
+ uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
+ cost64 = vpmax_u32(cost64, cost64);
+ uint32x4_t max_cost = vcombine_u32(cost64, cost64);
+ best_cost = vget_lane_u32(cost64, 0);
+ uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
+ vmovn_u32(vceqq_u32(max_cost, cost47)));
+ // Keep bit d in lane d for every direction d that matched the maximum,
+ // then add the lanes to form an 8-bit match mask.
+ uint8x8_t idx =
+ vand_u8(vmovn_u16(costs),
+ vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
+ int sum = horizontal_add_u8x8(idx);
+ // sum ^ (sum - 1) sets all bits up to and including the lowest set bit of
+ // sum; get_msb() of that presumably yields the lowest matching direction.
+ best_dir = get_msb(sum ^ (sum - 1));
+#endif
+
+ // Difference between the optimal variance and the variance along the
+ // orthogonal direction. Again, the sum(x^2) terms cancel out.
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ // We'd normally divide by 840, but dividing by 1024 is close enough
+ // for what we're going to do with this.
+ *var >>= 10;
+ return best_dir;
+}
+
+// Runs the direction search on two 8x8 blocks that share the same stride and
+// coeff_shift, writing each block's direction and variance to its own
+// outputs. The two blocks are processed one after the other through
+// cdef_find_dir.
+// NOTE(review): cdef_find_dir is presumably the run-time dispatched entry
+// point from av1_rtcd.h rather than a direct call to cdef_find_dir_neon --
+// confirm.
+void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+// computed independently for each of the eight 16-bit lanes.
+static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
+ unsigned int threshold, int adjdamp) {
+ uint16x8_t diff = vabdq_u16(a, b);
+ const uint16x8_t a_gt_b = vcgtq_u16(a, b);
+ // The saturating subtract clamps at zero, which provides the max(0, ...);
+ // shifting by a negative count is a right shift by adjdamp.
+ const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
+ vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
+ const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
+ // Re-apply the sign of (a - b): +clip where a > b, -clip elsewhere.
+ return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
+}
+
+// Accumulates the primary-filter contribution for eight pixels into *sum.
+//   s:            the pixels being filtered.
+//   tap:          four tap vectors; tap[0..1] are weighted by pri_taps[0]
+//                 ("near") and tap[2..3] by pri_taps[1] ("far").
+//   pri_strength, pri_damping: threshold and shift passed to constrain16().
+static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4],
+ const int *pri_taps, int pri_strength,
+ int pri_damping, int16x8_t *sum) {
+ // Near taps
+ int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
+ int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
+ // sum += pri_taps[0] * (n0 + n1)
+ n0 = vaddq_s16(n0, n1);
+ *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);
+
+ // Far taps
+ int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
+ int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
+ // sum += pri_taps[1] * (f0 + f1)
+ f0 = vaddq_s16(f0, f1);
+ *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
+}
+
+// Accumulates the secondary-tap contribution for the vector s into *sum.
+// tap[0..3] are the near taps, weighted by sec_taps[0]; tap[4..7] are the far
+// taps, weighted by sec_taps[1]. Every tap is first passed through
+// constrain16() against s with sec_strength and sec_damping.
+static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
+                                    const int *sec_taps, int sec_strength,
+                                    int sec_damping, int16x8_t *sum) {
+  // Near taps: sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+  const int16x8_t near01 =
+      vaddq_s16(constrain16(tap[0], s, sec_strength, sec_damping),
+                constrain16(tap[1], s, sec_strength, sec_damping));
+  const int16x8_t near23 =
+      vaddq_s16(constrain16(tap[2], s, sec_strength, sec_damping),
+                constrain16(tap[3], s, sec_strength, sec_damping));
+  *sum = vmlaq_n_s16(*sum, vaddq_s16(near01, near23), sec_taps[0]);
+
+  // Far taps: sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+  const int16x8_t far45 =
+      vaddq_s16(constrain16(tap[4], s, sec_strength, sec_damping),
+                constrain16(tap[5], s, sec_strength, sec_damping));
+  const int16x8_t far67 =
+      vaddq_s16(constrain16(tap[6], s, sec_strength, sec_damping),
+                constrain16(tap[7], s, sec_strength, sec_damping));
+  *sum = vmlaq_n_s16(*sum, vaddq_s16(far45, far67), sec_taps[1]);
+}
+
+// Runs the combined primary + secondary CDEF filter on one vector of eight
+// 16-bit samples and returns the clamped result narrowed to eight bytes.
+//
+// s is the vector being filtered. pri_src holds the four primary taps and
+// sec_src the eight secondary taps, in the order expected by primary_filter()
+// and secondary_filter(). cdef_large_value_mask holds ~CDEF_VERY_LARGE in
+// every lane. The result is clamped to the [min, max] range of s and its
+// taps, where taps carrying the "large" flag are kept out of the max.
+static INLINE uint8x8_t cdef_filter_8_0_kernel(
+    uint16x8_t s, uint16x8_t pri_src[4], uint16x8_t sec_src[8],
+    const int *pri_taps, const int *sec_taps, int pri_strength,
+    int sec_strength, int pri_damping, int sec_damping,
+    uint16x8_t cdef_large_value_mask) {
+  int16x8_t sum = vdupq_n_s16(0);
+  uint16x8_t max = s;
+  uint16x8_t min = s;
+
+  primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
+
+  // The source is 16 bits, however, we only really care about the lower
+  // 8 bits. The upper 8 bits contain the "large" flag. After the final
+  // primary max has been calculated bytewise, clear the flag with
+  // cdef_large_value_mask. Use this to find the "16 bit" max.
+  uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
+                                 vreinterpretq_u8_u16(pri_src[1]));
+  uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
+                                 vreinterpretq_u8_u16(pri_src[3]));
+  pri_max0 = vmaxq_u8(pri_max0, pri_max1);
+  max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
+                                 cdef_large_value_mask));
+
+  uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
+  uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
+  pri_min0 = vminq_u16(pri_min0, pri_min1);
+  min = vminq_u16(min, pri_min0);
+
+  secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
+
+  // Same bytewise max trick as above, this time over the secondary taps:
+  // after the final secondary max has been calculated, clear the "large"
+  // flag with cdef_large_value_mask before folding it into the 16-bit max.
+  uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
+                                 vreinterpretq_u8_u16(sec_src[1]));
+  uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
+                                 vreinterpretq_u8_u16(sec_src[3]));
+  uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
+                                 vreinterpretq_u8_u16(sec_src[5]));
+  uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
+                                 vreinterpretq_u8_u16(sec_src[7]));
+  sec_max0 = vmaxq_u8(sec_max0, sec_max1);
+  sec_max2 = vmaxq_u8(sec_max2, sec_max3);
+  sec_max0 = vmaxq_u8(sec_max0, sec_max2);
+  max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
+                                 cdef_large_value_mask));
+
+  uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
+  uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
+  uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
+  uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
+  sec_min0 = vminq_u16(sec_min0, sec_min1);
+  sec_min2 = vminq_u16(sec_min2, sec_min3);
+  sec_min0 = vminq_u16(sec_min0, sec_min2);
+  min = vminq_u16(min, sec_min0);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
+                      vreinterpretq_s16_u16(max));
+
+  return vqmovun_s16(res_s16);
+}
+
+// CDEF filter with both primary and secondary taps, writing 8-bit output.
+//
+//  dest          - output, written as uint8_t with a row pitch of dstride.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[] for dir and
+//                  dir +/- 2.
+//  pri_strength, sec_strength - filter strengths; when non-zero, the matching
+//                  damping is reduced by get_msb(strength), floored at 0.
+//  coeff_shift   - used with pri_strength to choose the cdef_pri_taps[] row.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+  if (sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  uint8_t *dst8 = (uint8_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t pri_src[4];
+      uint16x8_t sec_src[8];
+
+      // Primary near taps
+      pri_src[0] = vld1q_u16(in + po1);
+      pri_src[1] = vld1q_u16(in - po1);
+
+      // Primary far taps
+      pri_src[2] = vld1q_u16(in + po2);
+      pri_src[3] = vld1q_u16(in - po2);
+
+      // Secondary near taps
+      sec_src[0] = vld1q_u16(in + s1o1);
+      sec_src[1] = vld1q_u16(in - s1o1);
+      sec_src[2] = vld1q_u16(in + s2o1);
+      sec_src[3] = vld1q_u16(in - s2o1);
+
+      // Secondary far taps
+      sec_src[4] = vld1q_u16(in + s1o2);
+      sec_src[5] = vld1q_u16(in - s1o2);
+      sec_src[6] = vld1q_u16(in + s2o2);
+      sec_src[7] = vld1q_u16(in - s2o2);
+
+      const uint8x8_t res_u8 = cdef_filter_8_0_kernel(
+          s, pri_src, sec_src, pri_taps, sec_taps, pri_strength, sec_strength,
+          pri_damping, sec_damping, cdef_large_value_mask);
+      vst1_u8(dst8, res_u8);
+
+      in += CDEF_BSTRIDE;
+      dst8 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: each vector is built by load_unaligned_u16_4x2() from the
+    // current row and the one CDEF_BSTRIDE below it (presumably four samples
+    // from each - confirm against the helper), so two rows go per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t pri_src[4];
+      uint16x8_t sec_src[8];
+
+      // Primary near taps
+      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+
+      // Primary far taps
+      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+
+      // Secondary near taps
+      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+
+      // Secondary far taps
+      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+
+      const uint8x8_t res_u8 = cdef_filter_8_0_kernel(
+          s, pri_src, sec_src, pri_taps, sec_taps, pri_strength, sec_strength,
+          pri_damping, sec_damping, cdef_large_value_mask);
+      store_u8x4_strided_x2(dst8, dstride, res_u8);
+
+      in += 2 * CDEF_BSTRIDE;
+      dst8 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Runs the primary-only CDEF filter on one vector of eight 16-bit samples and
+// returns the result narrowed (with unsigned saturation) to eight bytes.
+// tap holds the four primary taps in the order expected by primary_filter().
+static INLINE uint8x8_t cdef_filter_8_1_kernel(uint16x8_t s,
+                                               uint16x8_t tap[4],
+                                               const int *pri_taps,
+                                               int pri_strength,
+                                               int pri_damping) {
+  int16x8_t sum = vdupq_n_s16(0);
+
+  primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  return vqmovun_s16(res_s16);
+}
+
+// CDEF filter using only the primary taps, writing 8-bit output.
+//
+//  dest          - output, written as uint8_t with a row pitch of dstride.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[dir].
+//  pri_strength  - when non-zero, pri_damping is reduced by
+//                  get_msb(pri_strength), floored at 0.
+//  coeff_shift   - used with pri_strength to choose the cdef_pri_taps[] row.
+//  sec_strength, sec_damping - unused.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  (void)sec_strength;
+  (void)sec_damping;
+
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+
+  if (pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+
+  uint8_t *dst8 = (uint8_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = vld1q_u16(in + po1);
+      tap[1] = vld1q_u16(in - po1);
+
+      // Primary far taps
+      tap[2] = vld1q_u16(in + po2);
+      tap[3] = vld1q_u16(in - po2);
+
+      vst1_u8(dst8, cdef_filter_8_1_kernel(s, tap, pri_taps, pri_strength,
+                                           pri_damping));
+
+      in += CDEF_BSTRIDE;
+      dst8 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+      tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+
+      // Primary far taps
+      tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+      tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+
+      store_u8x4_strided_x2(
+          dst8, dstride,
+          cdef_filter_8_1_kernel(s, tap, pri_taps, pri_strength, pri_damping));
+
+      in += 2 * CDEF_BSTRIDE;
+      dst8 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Runs the secondary-only CDEF filter on one vector of eight 16-bit samples
+// and returns the result narrowed (with unsigned saturation) to eight bytes.
+// sec_src holds the eight secondary taps in the order expected by
+// secondary_filter().
+static INLINE uint8x8_t cdef_filter_8_2_kernel(uint16x8_t s,
+                                               uint16x8_t sec_src[8],
+                                               const int *sec_taps,
+                                               int sec_strength,
+                                               int sec_damping) {
+  int16x8_t sum = vdupq_n_s16(0);
+
+  secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  return vqmovun_s16(res_s16);
+}
+
+// CDEF filter using only the secondary taps, writing 8-bit output.
+//
+//  dest          - output, written as uint8_t with a row pitch of dstride.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[dir +/- 2].
+//  sec_strength  - when non-zero, sec_damping is reduced by
+//                  get_msb(sec_strength), floored at 0.
+//  pri_strength, pri_damping, coeff_shift - unused.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  (void)pri_strength;
+  (void)pri_damping;
+  (void)coeff_shift;
+
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  uint8_t *dst8 = (uint8_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t sec_src[8];
+
+      // Secondary near taps
+      sec_src[0] = vld1q_u16(in + s1o1);
+      sec_src[1] = vld1q_u16(in - s1o1);
+      sec_src[2] = vld1q_u16(in + s2o1);
+      sec_src[3] = vld1q_u16(in - s2o1);
+
+      // Secondary far taps
+      sec_src[4] = vld1q_u16(in + s1o2);
+      sec_src[5] = vld1q_u16(in - s1o2);
+      sec_src[6] = vld1q_u16(in + s2o2);
+      sec_src[7] = vld1q_u16(in - s2o2);
+
+      vst1_u8(dst8, cdef_filter_8_2_kernel(s, sec_src, sec_taps, sec_strength,
+                                           sec_damping));
+
+      in += CDEF_BSTRIDE;
+      dst8 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t sec_src[8];
+
+      // Secondary near taps
+      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+
+      // Secondary far taps
+      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+
+      store_u8x4_strided_x2(dst8, dstride,
+                            cdef_filter_8_2_kernel(s, sec_src, sec_taps,
+                                                   sec_strength, sec_damping));
+
+      in += 2 * CDEF_BSTRIDE;
+      dst8 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Pass-through CDEF variant for 8-bit output: no taps are applied. Each
+// 16-bit input sample is narrowed to 8 bits with unsigned saturation and
+// stored to dest.
+//
+//  dest          - output, written as uint8_t with a row pitch of dstride.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+// The strength, damping, direction and coeff_shift arguments are unused.
+void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
+                          int pri_strength, int sec_strength, int dir,
+                          int pri_damping, int sec_damping, int coeff_shift,
+                          int block_width, int block_height) {
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+
+  uint8_t *dst8 = (uint8_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      const uint8x8_t res = vqmovn_u16(s);
+      vst1_u8(dst8, res);
+
+      in += CDEF_BSTRIDE;
+      dst8 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      const uint8x8_t res = vqmovn_u16(s);
+      store_u8x4_strided_x2(dst8, dstride, res);
+
+      in += 2 * CDEF_BSTRIDE;
+      dst8 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Runs the combined primary + secondary CDEF filter on one vector of eight
+// 16-bit samples and returns the clamped 16-bit result.
+//
+// s is the vector being filtered. pri_src holds the four primary taps and
+// sec_src the eight secondary taps, in the order expected by primary_filter()
+// and secondary_filter(). cdef_large_value_mask holds ~CDEF_VERY_LARGE in
+// every lane. The result is clamped to the [min, max] range of s and its
+// taps. pri_src and sec_src are overwritten with their masked values.
+static INLINE uint16x8_t cdef_filter_16_0_kernel(
+    uint16x8_t s, uint16x8_t pri_src[4], uint16x8_t sec_src[8],
+    const int *pri_taps, const int *sec_taps, int pri_strength,
+    int sec_strength, int pri_damping, int sec_damping,
+    uint16x8_t cdef_large_value_mask) {
+  int16x8_t sum = vdupq_n_s16(0);
+  uint16x8_t max = s;
+  uint16x8_t min = s;
+
+  primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);
+
+  // The min is taken over the unmasked taps.
+  uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
+  uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
+  pri_min0 = vminq_u16(pri_min0, pri_min1);
+  min = vminq_u16(min, pri_min0);
+
+  /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+  pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
+  pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
+  pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
+  pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
+
+  uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
+  uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
+  pri_max0 = vmaxq_u16(pri_max0, pri_max1);
+  max = vmaxq_u16(max, pri_max0);
+
+  secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
+
+  uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
+  uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
+  uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
+  uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
+  sec_min0 = vminq_u16(sec_min0, sec_min1);
+  sec_min2 = vminq_u16(sec_min2, sec_min3);
+  sec_min0 = vminq_u16(sec_min0, sec_min2);
+  min = vminq_u16(min, sec_min0);
+
+  /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+  sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
+  sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
+  sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
+  sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
+  sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
+  sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
+  sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
+  sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);
+
+  uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
+  uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
+  uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
+  uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
+  sec_max0 = vmaxq_u16(sec_max0, sec_max1);
+  sec_max2 = vmaxq_u16(sec_max2, sec_max3);
+  sec_max0 = vmaxq_u16(sec_max0, sec_max2);
+  max = vmaxq_u16(max, sec_max0);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
+                  vreinterpretq_s16_u16(max));
+
+  return vreinterpretq_u16_s16(res);
+}
+
+// CDEF filter with both primary and secondary taps, writing 16-bit output.
+//
+//  dest          - output, written as uint16_t with a row pitch of dstride
+//                  elements.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[] for dir and
+//                  dir +/- 2.
+//  pri_strength, sec_strength - filter strengths; when non-zero, the matching
+//                  damping is reduced by get_msb(strength), floored at 0.
+//  coeff_shift   - used with pri_strength to choose the cdef_pri_taps[] row.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+  if (sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  uint16_t *dst16 = (uint16_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t pri_src[4];
+      uint16x8_t sec_src[8];
+
+      // Primary near taps
+      pri_src[0] = vld1q_u16(in + po1);
+      pri_src[1] = vld1q_u16(in - po1);
+
+      // Primary far taps
+      pri_src[2] = vld1q_u16(in + po2);
+      pri_src[3] = vld1q_u16(in - po2);
+
+      // Secondary near taps
+      sec_src[0] = vld1q_u16(in + s1o1);
+      sec_src[1] = vld1q_u16(in - s1o1);
+      sec_src[2] = vld1q_u16(in + s2o1);
+      sec_src[3] = vld1q_u16(in - s2o1);
+
+      // Secondary far taps
+      sec_src[4] = vld1q_u16(in + s1o2);
+      sec_src[5] = vld1q_u16(in - s1o2);
+      sec_src[6] = vld1q_u16(in + s2o2);
+      sec_src[7] = vld1q_u16(in - s2o2);
+
+      const uint16x8_t res = cdef_filter_16_0_kernel(
+          s, pri_src, sec_src, pri_taps, sec_taps, pri_strength, sec_strength,
+          pri_damping, sec_damping, cdef_large_value_mask);
+      vst1q_u16(dst16, res);
+
+      in += CDEF_BSTRIDE;
+      dst16 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t pri_src[4];
+      uint16x8_t sec_src[8];
+
+      // Primary near taps
+      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+
+      // Primary far taps
+      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+
+      // Secondary near taps
+      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+
+      // Secondary far taps
+      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+
+      const uint16x8_t res = cdef_filter_16_0_kernel(
+          s, pri_src, sec_src, pri_taps, sec_taps, pri_strength, sec_strength,
+          pri_damping, sec_damping, cdef_large_value_mask);
+      store_u16x4_strided_x2(dst16, dstride, res);
+
+      in += 2 * CDEF_BSTRIDE;
+      dst16 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Runs the primary-only CDEF filter on one vector of eight 16-bit samples and
+// returns the 16-bit result (no clamping is applied).
+// tap holds the four primary taps in the order expected by primary_filter().
+static INLINE uint16x8_t cdef_filter_16_1_kernel(uint16x8_t s,
+                                                 uint16x8_t tap[4],
+                                                 const int *pri_taps,
+                                                 int pri_strength,
+                                                 int pri_damping) {
+  int16x8_t sum = vdupq_n_s16(0);
+
+  primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  return vreinterpretq_u16_s16(res);
+}
+
+// CDEF filter using only the primary taps, writing 16-bit output.
+//
+//  dest          - output, written as uint16_t with a row pitch of dstride
+//                  elements.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[dir].
+//  pri_strength  - when non-zero, pri_damping is reduced by
+//                  get_msb(pri_strength), floored at 0.
+//  coeff_shift   - used with pri_strength to choose the cdef_pri_taps[] row.
+//  sec_strength, sec_damping - unused.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  (void)sec_strength;
+  (void)sec_damping;
+
+  const int po1 = cdef_directions[dir][0];
+  const int po2 = cdef_directions[dir][1];
+  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+
+  if (pri_strength) {
+    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+  }
+
+  uint16_t *dst16 = (uint16_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = vld1q_u16(in + po1);
+      tap[1] = vld1q_u16(in - po1);
+
+      // Primary far taps
+      tap[2] = vld1q_u16(in + po2);
+      tap[3] = vld1q_u16(in - po2);
+
+      vst1q_u16(dst16, cdef_filter_16_1_kernel(s, tap, pri_taps, pri_strength,
+                                               pri_damping));
+
+      in += CDEF_BSTRIDE;
+      dst16 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t tap[4];
+
+      // Primary near taps
+      tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
+      tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);
+
+      // Primary far taps
+      tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
+      tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);
+
+      store_u16x4_strided_x2(
+          dst16, dstride,
+          cdef_filter_16_1_kernel(s, tap, pri_taps, pri_strength, pri_damping));
+
+      in += 2 * CDEF_BSTRIDE;
+      dst16 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Runs the secondary-only CDEF filter on one vector of eight 16-bit samples
+// and returns the 16-bit result (no clamping is applied).
+// sec_src holds the eight secondary taps in the order expected by
+// secondary_filter().
+static INLINE uint16x8_t cdef_filter_16_2_kernel(uint16x8_t s,
+                                                 uint16x8_t sec_src[8],
+                                                 const int *sec_taps,
+                                                 int sec_strength,
+                                                 int sec_damping) {
+  int16x8_t sum = vdupq_n_s16(0);
+
+  secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);
+
+  // res = s + ((sum - (sum < 0) + 8) >> 4)
+  // The compare yields -1 in negative lanes, so adding it subtracts one.
+  sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
+  const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);
+
+  return vreinterpretq_u16_s16(res);
+}
+
+// CDEF filter using only the secondary taps, writing 16-bit output.
+//
+//  dest          - output, written as uint16_t with a row pitch of dstride
+//                  elements.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE; taps are
+//                  read at the offsets given by cdef_directions[dir +/- 2].
+//  sec_strength  - when non-zero, sec_damping is reduced by
+//                  get_msb(sec_strength), floored at 0.
+//  pri_strength, pri_damping, coeff_shift - unused.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  (void)pri_strength;
+  (void)pri_damping;
+  (void)coeff_shift;
+
+  const int s1o1 = cdef_directions[dir + 2][0];
+  const int s1o2 = cdef_directions[dir + 2][1];
+  const int s2o1 = cdef_directions[dir - 2][0];
+  const int s2o2 = cdef_directions[dir - 2][1];
+  const int *sec_taps = cdef_sec_taps;
+
+  if (sec_strength) {
+    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+  }
+
+  uint16_t *dst16 = (uint16_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      uint16x8_t sec_src[8];
+
+      // Secondary near taps
+      sec_src[0] = vld1q_u16(in + s1o1);
+      sec_src[1] = vld1q_u16(in - s1o1);
+      sec_src[2] = vld1q_u16(in + s2o1);
+      sec_src[3] = vld1q_u16(in - s2o1);
+
+      // Secondary far taps
+      sec_src[4] = vld1q_u16(in + s1o2);
+      sec_src[5] = vld1q_u16(in - s1o2);
+      sec_src[6] = vld1q_u16(in + s2o2);
+      sec_src[7] = vld1q_u16(in - s2o2);
+
+      vst1q_u16(dst16, cdef_filter_16_2_kernel(s, sec_src, sec_taps,
+                                               sec_strength, sec_damping));
+
+      in += CDEF_BSTRIDE;
+      dst16 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      uint16x8_t sec_src[8];
+
+      // Secondary near taps
+      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
+      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
+      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
+      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);
+
+      // Secondary far taps
+      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
+      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
+      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
+      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);
+
+      store_u16x4_strided_x2(dst16, dstride,
+                             cdef_filter_16_2_kernel(s, sec_src, sec_taps,
+                                                     sec_strength, sec_damping));
+
+      in += 2 * CDEF_BSTRIDE;
+      dst16 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
+
+// Pass-through CDEF variant for 16-bit output: no taps are applied, the
+// input samples are copied to dest unchanged.
+//
+//  dest          - output, written as uint16_t with a row pitch of dstride
+//                  elements.
+//  in            - 16-bit input with a row pitch of CDEF_BSTRIDE.
+//  block_width   - 8 selects the one-row-per-iteration path; any other value
+//                  takes the 4-wide path, which handles two rows at a time.
+//  block_height  - number of rows; must be at least 1, and a multiple of 2 on
+//                  the 4-wide path (the loops are do/while).
+// The strength, damping, direction and coeff_shift arguments are unused.
+void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
+                           int pri_strength, int sec_strength, int dir,
+                           int pri_damping, int sec_damping, int coeff_shift,
+                           int block_width, int block_height) {
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+
+  uint16_t *dst16 = (uint16_t *)dest;
+  int h = block_height;
+
+  if (block_width == 8) {
+    do {
+      const uint16x8_t s = vld1q_u16(in);
+      vst1q_u16(dst16, s);
+
+      in += CDEF_BSTRIDE;
+      dst16 += dstride;
+    } while (--h != 0);
+  } else {
+    // 4-wide path: two rows are packed into each vector per iteration.
+    do {
+      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
+      store_u16x4_strided_x2(dst16, dstride, s);
+
+      in += 2 * CDEF_BSTRIDE;
+      dst16 += 2 * dstride;
+      h -= 2;
+    } while (h != 0);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
new file mode 100644
index 0000000000..0871b4fe06
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+// Loads eight uint16_t values from src + offset, reinterprets them as int16_t,
+// subtracts `sub` lane-wise and stores the eight results to dst + offset.
+static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
+                                 int16x8_t sub) {
+  const int16x8_t loaded = vreinterpretq_s16_u16(vld1q_u16(src + offset));
+  const int16x8_t diff = vsubq_s16(loaded, sub);
+  vst1q_s16(dst + offset, diff);
+}
+
+// Returns the lane-wise sum of the eight uint16_t values at buf and the eight
+// uint16_t values at buf + offset.
+static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
+  const uint16x8_t first = vld1q_u16(buf);
+  const uint16x8_t second = vld1q_u16(buf + offset);
+  return vaddq_u16(first, second);
+}
+
+// Loads four bytes from ptr and duplicates them into both halves of the
+// returned 8-byte vector. The bytes are fetched with memcpy so that ptr needs
+// no particular alignment and is never dereferenced through a uint32_t
+// pointer.
+static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+  uint32_t word;
+  memcpy(&word, ptr, sizeof(word));
+  return vreinterpret_u8_u32(vdup_n_u32(word));
+}
+
+// Stores the low half of val (two uint16_t lanes, four bytes) to ptr. The
+// bytes are written with memcpy so that ptr only needs uint16_t alignment and
+// is never accessed through a uint32_t pointer.
+static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+  const uint32_t low = vget_lane_u32(vreinterpret_u32_u16(val), 0);
+  memcpy(ptr, &low, sizeof(low));
+}
+
+// Stores the low half of val (four uint8_t lanes) to ptr. The bytes are
+// written with memcpy so that ptr needs no particular alignment and is never
+// accessed through a uint32_t pointer.
+static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
+  const uint32_t low = vget_lane_u32(vreinterpret_u32_u8(val), 0);
+  memcpy(ptr, &low, sizeof(low));
+}
+
+// Subsamples 8-bit luma both horizontally and vertically: every output value
+// is the sum of a 2x2 block of input pixels, shifted left by 1.
+//
+//   input         First input row; rows are input_stride bytes apart.
+//   pred_buf_q3   Output buffer; rows are CFL_BUF_LINE elements apart.
+//   width         Input width in pixels. 4, 8 and 16 have dedicated paths; any
+//                 other value takes the 32-pixel path.
+//   height        Input height; (height >> 1) output rows are produced.
+//
+// The loop is a do-while, so one output row is always written.
+static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ // Each output row consumes two input rows.
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ // Only four input bytes are used; two output values are stored.
+ const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+ const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
+ vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+ } else if (width == 8) {
+ const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+ const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
+ vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+ } else if (width == 16) {
+ const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+ const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1));
+ } else {
+ // vld4_u8 de-interleaves 32 bytes with a stride of four, so val[0]/val[1]
+ // hold the two pixels of every even output column and val[2]/val[3] the
+ // two pixels of every odd output column.
+ const uint8x8x4_t top = vld4_u8(input);
+ const uint8x8x4_t bot = vld4_u8(input + input_stride);
+ // equivalent to a vpaddlq_u8 (because vld4 de-interleaves)
+ const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]);
+ // equivalent to a vpaddlq_u8 (because vld4 de-interleaves)
+ const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]);
+ // equivalent to a vpaddlq_u8 (because vld4 de-interleaves)
+ const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]);
+ // equivalent to a vpaddlq_u8 (because vld4 de-interleaves)
+ const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]);
+ uint16x8x2_t sum;
+ sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+ sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+ // vst2q_u16 re-interleaves the even and odd columns into raster order.
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+// Subsamples 8-bit luma horizontally only: every output value is the sum of
+// two horizontally adjacent input pixels, shifted left by 2.
+//
+//   input         First input row; rows are input_stride bytes apart.
+//   pred_buf_q3   Output buffer; rows are CFL_BUF_LINE elements apart.
+//   width         Input width in pixels. 4, 8 and 16 have dedicated paths; any
+//                 other value takes the 32-pixel path.
+//   height        Number of rows; one output row is produced per input row.
+//
+// The loop is a do-while, so one output row is always written.
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ // Only four input bytes are used; two output values are stored.
+ const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+ vsth_u16(pred_buf_q3, vshl_n_u16(top, 2));
+ } else if (width == 8) {
+ const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+ vst1_u16(pred_buf_q3, vshl_n_u16(top, 2));
+ } else if (width == 16) {
+ const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2));
+ } else {
+ const uint8x8x4_t top = vld4_u8(input);
+ uint16x8x2_t sum;
+ // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4 de-interleaves)
+ sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2);
+ sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2);
+ // vst2q_u16 re-interleaves the even and odd columns into raster order.
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+// Converts 8-bit luma to the prediction buffer without subsampling: every
+// input pixel is widened to 16 bits and shifted left by 3. Widths 4 and 8 have
+// dedicated paths; any other width converts 16 pixels, plus 16 more when
+// width is 32. One output row (CFL_BUF_LINE apart) is written per input row,
+// and at least one row is always processed.
+static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        const uint16x8_t px = vshll_n_u8(vldh_dup_u8(input), 3);
+        vst1_u16(pred_buf_q3, vget_low_u16(px));
+        break;
+      }
+      case 8: {
+        vst1q_u16(pred_buf_q3, vshll_n_u8(vld1_u8(input), 3));
+        break;
+      }
+      default: {
+        const uint8x16_t px0 = vld1q_u8(input);
+        vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(px0), 3));
+        vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(px0), 3));
+        if (width == 32) {
+          const uint8x16_t px1 = vld1q_u8(input + 16);
+          vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(px1), 3));
+          vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(px1), 3));
+        }
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !AOM_ARCH_AARCH64
+// Pairwise add for 128-bit vectors, compiled only when AOM_ARCH_AARCH64 is 0.
+// The low half of the result holds the pairwise sums of a and the high half
+// the pairwise sums of b. Kept file-local so that no global symbol with this
+// name is exported.
+static INLINE uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// 16-bit counterpart of the 2x2 luma subsampling: every output value is the
+// sum of a 2x2 block of input pixels, shifted left by 1.
+//
+//   input         First input row; rows are input_stride elements apart.
+//   pred_buf_q3   Output buffer; rows are CFL_BUF_LINE elements apart.
+//   width         Input width in pixels. 4 has its own path, other widths
+//                 below 32 share a path that distinguishes 8 from the rest,
+//                 and everything else takes the 32-pixel path.
+//   height        Input height; (height >> 1) output rows are produced.
+//
+// The loop is a do-while, so one output row is always written.
+static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ // Each output row consumes two input rows.
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vld1_u16(input);
+ const uint16x4_t bot = vld1_u16(input + input_stride);
+ // Add vertically first, then pairwise horizontally.
+ const uint16x4_t sum = vadd_u16(top, bot);
+ const uint16x4_t hsum = vpadd_u16(sum, sum);
+ vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+ } else if (width < 32) {
+ const uint16x8_t top = vld1q_u16(input);
+ const uint16x8_t bot = vld1q_u16(input + input_stride);
+ const uint16x8_t sum = vaddq_u16(top, bot);
+ if (width == 8) {
+ // Only the low four pairwise sums are needed for eight input pixels.
+ const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum));
+ vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+ } else {
+ const uint16x8_t top_1 = vld1q_u16(input + 8);
+ const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
+ const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
+ const uint16x8_t hsum = vpaddq_u16(sum, sum_1);
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1));
+ }
+ } else {
+ // vld4q_u16 de-interleaves 32 pixels with a stride of four, so
+ // val[0]/val[1] hold the two pixels of every even output column and
+ // val[2]/val[3] the two pixels of every odd output column.
+ const uint16x8x4_t top = vld4q_u16(input);
+ const uint16x8x4_t bot = vld4q_u16(input + input_stride);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
+ uint16x8x2_t sum;
+ sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+ sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+ // vst2q_u16 re-interleaves the even and odd columns into raster order.
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+// 16-bit horizontal-only luma subsampling: every output value is the sum of
+// two horizontally adjacent input pixels, shifted left by 2.
+//
+//   input         First input row; rows are input_stride elements apart.
+//   pred_buf_q3   Output buffer; rows are CFL_BUF_LINE elements apart.
+//   width         Input width in pixels. 4, 8 and 16 have dedicated paths; any
+//                 other value takes the 32-pixel path.
+//   height        Number of rows; one output row is produced per input row.
+//
+// The loop is a do-while, so one output row is always written.
+static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vld1_u16(input);
+ const uint16x4_t hsum = vpadd_u16(top, top);
+ // Only the two low lanes are distinct results; store just those.
+ vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+ } else if (width == 8) {
+ const uint16x4x2_t top = vld2_u16(input);
+ // equivalent to a vpadd_u16 (because vld2 de-interleaves)
+ const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]);
+ vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+ } else if (width == 16) {
+ const uint16x8x2_t top = vld2q_u16(input);
+ // equivalent to a vpaddq_u16 (because vld2q de-interleaves)
+ const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]);
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2));
+ } else {
+ const uint16x8x4_t top = vld4q_u16(input);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q de-interleaves)
+ const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]);
+ uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2),
+ vshlq_n_u16(hsum_1, 2) } };
+ // vst2q_u16 re-interleaves the even and odd columns into raster order.
+ vst2q_u16(pred_buf_q3, result);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+// Copies 16-bit luma into the prediction buffer without subsampling, shifting
+// every pixel left by 3. Widths 4, 8 and 16 have dedicated paths; any other
+// width processes 32 pixels. The 16- and 32-pixel paths load with a
+// de-interleaving load and store with the matching interleaving store, so
+// pixel order is preserved. One output row (CFL_BUF_LINE apart) is written per
+// input row, and at least one row is always processed.
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        vst1_u16(pred_buf_q3, vshl_n_u16(vld1_u16(input), 3));
+        break;
+      }
+      case 8: {
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(vld1q_u16(input), 3));
+        break;
+      }
+      case 16: {
+        uint16x8x2_t px = vld2q_u16(input);
+        px.val[0] = vshlq_n_u16(px.val[0], 3);
+        px.val[1] = vshlq_n_u16(px.val[1], 3);
+        vst2q_u16(pred_buf_q3, px);
+        break;
+      }
+      default: {
+        uint16x8x4_t px = vld4q_u16(input);
+        px.val[0] = vshlq_n_u16(px.val[0], 3);
+        px.val[1] = vshlq_n_u16(px.val[1], 3);
+        px.val[2] = vshlq_n_u16(px.val[2], 3);
+        px.val[3] = vshlq_n_u16(px.val[3], 3);
+        vst4q_u16(pred_buf_q3, px);
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+CFL_GET_SUBSAMPLE_FUNCTION(neon)
+
+// Computes the rounded average of a width x height block in src and writes
+// (src - average) for every element of the block to dst.
+//
+//   src, dst       Input and output buffers; rows are CFL_BUF_LINE elements
+//                  apart in both.
+//   width          4 and 8 have dedicated summing paths; any other value sums
+//                  16 columns, plus 16 more when width is 32.
+//   height         Number of rows. The summing loop, and the subtraction loop
+//                  for widths other than 4, step four rows at a time.
+//   round_offset   Unused; rounding is done by the narrowing shift.
+//   num_pel_log2   Right-shift applied to the block sum to obtain the average;
+//                  only values 4..10 are handled.
+static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
+ int width, int height,
+ int round_offset,
+ const int num_pel_log2) {
+ const uint16_t *const end = src + height * CFL_BUF_LINE;
+
+ // Round offset is not needed, because NEON will handle the rounding.
+ (void)round_offset;
+
+ // To optimize the use of the CPU pipeline, we process 4 rows per iteration
+ const int step = 4 * CFL_BUF_LINE;
+
+ // At this stage, the prediction buffer contains scaled reconstructed luma
+ // pixels, which are positive integers and only require 15 bits. By using
+ // unsigned integers for the sum, we can do one addition operation inside 16
+ // bits (8 lanes) before having to convert to 32 bits (4 lanes).
+ const uint16_t *sum_buf = src;
+ uint32x4_t sum_32x4 = vdupq_n_u32(0);
+ do {
+ // For all widths, we load, add and combine the data so it fits in 4 lanes.
+ if (width == 4) {
+ const uint16x4_t a0 =
+ vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
+ const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
+ vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
+ sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
+ } else if (width == 8) {
+ // Rows are added in pairs in 16 bits, then widened and accumulated.
+ const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
+ const uint16x8_t a1 =
+ vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
+ sum_32x4 = vpadalq_u16(sum_32x4, a0);
+ sum_32x4 = vpadalq_u16(sum_32x4, a1);
+ } else {
+ // Each rowN is the sum of columns 0..7 and 8..15 of that row.
+ const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
+ const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
+ const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
+ const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
+ sum_32x4 = vpadalq_u16(sum_32x4, row0);
+ sum_32x4 = vpadalq_u16(sum_32x4, row1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row2);
+ sum_32x4 = vpadalq_u16(sum_32x4, row3);
+
+ if (width == 32) {
+ // Same again for columns 16..23 and 24..31.
+ const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
+ const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
+ const uint16x8_t row2_1 =
+ vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
+ const uint16x8_t row3_1 =
+ vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);
+
+ sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
+ }
+ }
+ sum_buf += step;
+ } while (sum_buf < end);
+
+ // Permute and add in such a way that each lane contains the block sum.
+ // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
+#if AOM_ARCH_AARCH64
+ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+#else
+ uint32x4_t flip =
+ vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
+ sum_32x4 = vaddq_u32(sum_32x4, flip);
+ sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
+#endif
+
+ // Computing the average could be done using scalars, but getting off the NEON
+ // engine introduces latency, so we use vqrshrn.
+ int16x4_t avg_16x4;
+ // Constant propagation makes for some ugly code.
+ // (The shift count of vqrshrn_n_u32 must be a compile-time constant, hence
+ // one case per supported value.)
+ switch (num_pel_log2) {
+ case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
+ case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
+ case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
+ case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
+ case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
+ case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
+ case 10:
+ avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
+ break;
+ // NOTE(review): avg_16x4 is left unset on this path; if assert() compiles
+ // to nothing, an unsupported num_pel_log2 would read it uninitialized.
+ default: assert(0);
+ }
+
+ if (width == 4) {
+ // One row per iteration for the narrowest block.
+ do {
+ vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4));
+ src += CFL_BUF_LINE;
+ dst += CFL_BUF_LINE;
+ } while (src < end);
+ } else {
+ const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
+ // Four rows per iteration, eight columns per call.
+ do {
+ vldsubstq_s16(dst, src, 0, avg_16x8);
+ vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8);
+
+ if (width > 8) {
+ vldsubstq_s16(dst, src, 8, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8);
+ }
+ if (width == 32) {
+ vldsubstq_s16(dst, src, 16, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8);
+ }
+ src += step;
+ dst += step;
+ } while (src < end);
+ }
+}
+
+CFL_SUB_AVG_FN(neon)
+
+// Negates the 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative; all other lanes of a are returned unchanged.
+// Implemented as (a + mask) ^ mask, where mask is b >> 15 (all ones for a
+// negative b, zero otherwise). This is a wrapping two's-complement negate,
+// not a saturating one.
+// Notes:
+// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+// practice, as scaled_luma is the multiplication of two absolute values.
+// * In the Intel equivalent, elements in a are zeroed out when the
+// corresponding elements in b are zero. Because vsign is used twice in a
+// row, with b in the first call becoming a in the second call, there's no
+// impact from not zeroing out.
+static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
+ const int16x4_t mask = vshr_n_s16(b, 15);
+ return veor_s16(vadd_s16(a, mask), mask);
+}
+
+// Negates the 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative; all other lanes of a are returned unchanged.
+// Implemented as (a + mask) ^ mask, where mask is b >> 15 (all ones for a
+// negative b, zero otherwise). This is a wrapping two's-complement negate,
+// not a saturating one.
+// Notes:
+// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+// practice, as scaled_luma is the multiplication of two absolute values.
+// * In the Intel equivalent, elements in a are zeroed out when the
+// corresponding elements in b are zero. Because vsignq is used twice in a
+// row, with b in the first call becoming a in the second call, there's no
+// impact from not zeroing out.
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
+ const int16x8_t mask = vshrq_n_s16(b, 15);
+ return veorq_s16(vaddq_s16(a, mask), mask);
+}
+
+// Predicts four values: loads four entries from pred_buf_q3, scales their
+// magnitudes by abs_alpha_q12 with a rounding doubling high-half multiply,
+// negates each product where the sign bits of alpha_sign and the loaded value
+// differ, and adds dc.
+static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+                                   int16x4_t alpha_sign, int abs_alpha_q12,
+                                   int16x4_t dc) {
+  const int16x4_t ac = vld1_s16(pred_buf_q3);
+  // The sign bit of (alpha ^ ac) is the sign of their product.
+  const int16x4_t product_sign = veor_s16(alpha_sign, ac);
+  const int16x4_t magnitude = vqrdmulh_n_s16(vabs_s16(ac), abs_alpha_q12);
+  const int16x4_t signed_luma = vsign_s16(magnitude, product_sign);
+  return vadd_s16(signed_luma, dc);
+}
+
+// Predicts eight values: loads eight entries from pred_buf_q3, scales their
+// magnitudes by abs_alpha_q12 with a rounding doubling high-half multiply,
+// negates each product where the sign bits of alpha_sign and the loaded value
+// differ, and adds dc.
+static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+                                   int16x8_t alpha_sign, int abs_alpha_q12,
+                                   int16x8_t dc) {
+  const int16x8_t ac = vld1q_s16(pred_buf_q3);
+  // The sign bit of (alpha ^ ac) is the sign of their product.
+  const int16x8_t product_sign = veorq_s16(alpha_sign, ac);
+  const int16x8_t magnitude = vqrdmulhq_n_s16(vabsq_s16(ac), abs_alpha_q12);
+  const int16x8_t signed_luma = vsignq_s16(magnitude, product_sign);
+  return vaddq_s16(signed_luma, dc);
+}
+
+// Predicts sixteen values from pred_buf_q3 using the same per-lane arithmetic
+// as the narrower predictors: scale |ac| by abs_alpha_q12, apply the sign of
+// (alpha_sign ^ ac), add dc. The result is returned in the de-interleaved lane
+// order produced by vld2q_s16 (val[0] = even positions, val[1] = odd
+// positions), so it has to be written back with an interleaving store.
+static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+ int16x8_t alpha_sign, int abs_alpha_q12,
+ int16x8_t dc) {
+ // vld2q_s16 de-interleaves, which is not useful for prediction.
+ // vld1q_s16_x2 does not de-interleave, but is not currently available in the
+ // compiler used by the AOM build system.
+ const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+ const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+ const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+ const int16x8_t scaled_luma_0 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+ const int16x8_t scaled_luma_1 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+ int16x8x2_t result;
+ result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+ result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+ return result;
+}
+
+// Predicts thirty-two values from pred_buf_q3 using the same per-lane
+// arithmetic as the narrower predictors: scale |ac| by abs_alpha_q12, apply
+// the sign of (alpha_sign ^ ac), add dc. The result is returned in the
+// de-interleaved lane order produced by vld4q_s16 (val[k] holds positions
+// k, k + 4, k + 8, ...), so it has to be written back with an interleaving
+// store.
+static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+ int16x8_t alpha_sign, int abs_alpha_q12,
+ int16x8_t dc) {
+ // vld4q_s16 de-interleaves, which is not useful for prediction.
+ // vld1q_s16_x4 does not de-interleave, but is not currently available in the
+ // compiler used by the AOM build system.
+ const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+ const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+ const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+ const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
+ const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]);
+ const int16x8_t scaled_luma_0 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+ const int16x8_t scaled_luma_1 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+ const int16x8_t scaled_luma_2 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12);
+ const int16x8_t scaled_luma_3 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12);
+ int16x8x4_t result;
+ result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+ result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+ result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc);
+ result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc);
+ return result;
+}
+
+// Writes an 8-bit prediction block to dst. Each output pixel is the value
+// returned by the matching predict_wN helper, saturated to the uint8_t range.
+//
+//   pred_buf_q3   Input values; rows are CFL_BUF_LINE elements apart.
+//   dst           Output block; rows are dst_stride bytes apart. The first
+//                 pixel of dst is read before anything is written and used as
+//                 the DC value for the whole block.
+//   alpha_q3      Scaling factor. Its magnitude is shifted left by 9 before
+//                 use, and the value itself is broadcast for its sign bit.
+//   width         4, 8 and 16 have dedicated paths; any other value takes the
+//                 32-pixel path.
+//   height        Number of rows; at least one row is always written.
+static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+ const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+ if (width == 4) {
+ const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+ const int16x4_t dc = vdup_n_s16(*dst);
+ do {
+ const int16x4_t pred =
+ predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ // Duplicate to fill a full vector for narrowing; only 4 bytes are stored.
+ vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred)));
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ } else {
+ const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+ const int16x8_t dc = vdupq_n_s16(*dst);
+ do {
+ if (width == 8) {
+ vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign,
+ abs_alpha_q12, dc)));
+ } else if (width == 16) {
+ const int16x8x2_t pred =
+ predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
+ vqmovun_s16(pred.val[1]) } };
+ // vst2_u8 re-interleaves the lanes split apart inside predict_w16.
+ vst2_u8(dst, predun);
+ } else {
+ const int16x8x4_t pred =
+ predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ const uint8x8x4_t predun = {
+ { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
+ vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
+ };
+ // vst4_u8 re-interleaves the lanes split apart inside predict_w32.
+ vst4_u8(dst, predun);
+ }
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ }
+}
+
+CFL_PREDICT_FN(neon, lbd)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Clamps each lane of a to [0, max] and returns the result as unsigned. The
+// upper bound is applied first, then the lower bound.
+static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
+  const int16x4_t zero = vdup_n_s16(0);
+  const int16x4_t capped = vmin_s16(a, max);
+  return vreinterpret_u16_s16(vmax_s16(capped, zero));
+}
+
+// Clamps each lane of a to [0, max] and returns the result as unsigned. The
+// upper bound is applied first, then the lower bound.
+static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t capped = vminq_s16(a, max);
+  return vreinterpretq_u16_s16(vmaxq_s16(capped, zero));
+}
+
+// Clamps every lane of both vectors in a to [0, max].
+static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
+  const uint16x8x2_t result = { { clampq_s16(a.val[0], max),
+                                  clampq_s16(a.val[1], max) } };
+  return result;
+}
+
+// Clamps every lane of all four vectors in a to [0, max].
+static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
+  const uint16x8x4_t result = {
+    { clampq_s16(a.val[0], max), clampq_s16(a.val[1], max),
+      clampq_s16(a.val[2], max), clampq_s16(a.val[3], max) }
+  };
+  return result;
+}
+
+// Writes a 16-bit prediction block to dst. Each output pixel is the value
+// returned by the matching predict_wN helper, clamped to [0, (1 << bd) - 1].
+//
+//   pred_buf_q3   Input values; rows are CFL_BUF_LINE elements apart.
+//   dst           Output block; rows are dst_stride elements apart. The first
+//                 pixel of dst is read before anything is written and used as
+//                 the DC value for the whole block.
+//   alpha_q3      Scaling factor. Its magnitude is shifted left by 9 before
+//                 use, and the value itself is broadcast for its sign bit.
+//   bd            Bit depth; determines the upper clamp bound.
+//   width         4, 8 and 16 have dedicated paths; any other value takes the
+//                 32-pixel path.
+//   height        Number of rows; at least one row is always written.
+static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ const int max = (1 << bd) - 1;
+ const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+ const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+ if (width == 4) {
+ const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+ const int16x4_t dc = vdup_n_s16(*dst);
+ const int16x4_t max_16x4 = vdup_n_s16(max);
+ do {
+ const int16x4_t scaled_luma =
+ predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ } else {
+ const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+ const int16x8_t dc = vdupq_n_s16(*dst);
+ const int16x8_t max_16x8 = vdupq_n_s16(max);
+ do {
+ if (width == 8) {
+ const int16x8_t pred =
+ predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst1q_u16(dst, clampq_s16(pred, max_16x8));
+ } else if (width == 16) {
+ const int16x8x2_t pred =
+ predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ // vst2q_u16 re-interleaves the lanes split apart inside predict_w16.
+ vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+ } else {
+ const int16x8x4_t pred =
+ predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ // vst4q_u16 re-interleaves the lanes split apart inside predict_w32.
+ vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+ }
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ }
+}
+
+CFL_PREDICT_FN(neon, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.c b/third_party/aom/av1/common/arm/compound_convolve_neon.c
new file mode 100644
index 0000000000..6a596234dc
--- /dev/null
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon.c
@@ -0,0 +1,2719 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Applies a 4-tap filter to four output positions: starting from horiz_const,
+// accumulates s0..s3 multiplied by lanes 0..3 of x_filter, then shifts the
+// sum right by ROUND0_BITS - 1 without rounding.
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t s2, const int16x4_t s3,
+                                         const int16x4_t x_filter,
+                                         const int16x4_t horiz_const) {
+  int16x4_t acc = vmla_lane_s16(horiz_const, s0, x_filter, 0);
+  acc = vmla_lane_s16(acc, s1, x_filter, 1);
+  acc = vmla_lane_s16(acc, s2, x_filter, 2);
+  acc = vmla_lane_s16(acc, s3, x_filter, 3);
+
+  // The filter taps were halved by the caller, so shift by one bit less.
+  return vshr_n_s16(acc, ROUND0_BITS - 1);
+}
+
+// Applies an 8-tap filter to eight output positions: starting from
+// horiz_const, accumulates s0..s7 multiplied by lanes 0..7 of x_filter, then
+// shifts the sum right by ROUND0_BITS - 1 without rounding.
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         const int16x8_t x_filter,
+                                         const int16x8_t horiz_const) {
+  // The lane-indexed multiply-accumulate takes a 64-bit coefficient vector.
+  const int16x4_t taps_lo = vget_low_s16(x_filter);
+  const int16x4_t taps_hi = vget_high_s16(x_filter);
+
+  int16x8_t acc = vmlaq_lane_s16(horiz_const, s0, taps_lo, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmlaq_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
+
+  // The filter taps were halved by the caller, so shift by one bit less.
+  return vshrq_n_s16(acc, ROUND0_BITS - 1);
+}
+
+// Horizontal pass of the 2D convolution for 8-bit input (bd is fixed at 8).
+// Filters im_h rows of w pixels from src and writes the 16-bit intermediate
+// results to im_block.
+//
+//   src, src_stride       Input pixels and row stride in bytes.
+//   im_block, im_stride   Output buffer and its row stride in elements.
+//   x_filter_ptr          Eight filter taps. For w == 4 only taps 2..5 are
+//                         used.
+//   im_h                  Number of rows to filter.
+//   w                     Block width; 4 has its own path, other widths are
+//                         processed in groups of eight columns.
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16_t *x_filter_ptr, const int im_h, int w) {
+ const int bd = 8;
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w == 4) {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ // Skip the two leading source columns that pair with the unused taps 0-1.
+ src_ptr += 2;
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ // Process eight rows at a time: 8x8 tiles are transposed so that each
+ // vector holds one source column across eight rows, filtered, and
+ // transposed back before storing.
+ // NOTE(review): this do-while runs once unconditionally and the row loop
+ // below needs at least one row left over, so im_h must be greater than 8.
+ // The caller visible in this file passes h + clamped_y_taps - 1 with
+ // clamped_y_taps >= 6, which satisfies this for h >= 4 - confirm for any
+ // other caller.
+ do {
+ const uint8_t *s;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ // Only the first seven columns are kept from the initial tile; the inner
+ // loop reloads starting at column 7.
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s = src_ptr + 7;
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+ x_filter, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+ x_filter, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+ x_filter, horiz_const);
+ int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, horiz_const);
+ int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+
+ transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+ store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ // Slide the window: the last seven columns become the first seven of
+ // the next group.
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+ } while (height > 8);
+#endif // AOM_ARCH_AARCH64
+
+ // Remaining rows (all rows when AOM_ARCH_AARCH64 is 0) are filtered one at
+ // a time, eight output pixels per inner iteration.
+ do {
+ const uint8_t *s;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(src_ptr);
+ int16x8_t s0 =
+ vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ s = src_ptr + 8;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ vst1q_s16(d, d0);
+
+ // The block just loaded becomes the left half of the next window.
+ s0 = s8;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+// 2D convolution entry point: runs the horizontal pass into a stack-allocated
+// intermediate block, then dispatches to one of six vertical-pass kernels.
+//
+// The vertical tap count is clamped to a minimum of 6, which selects between
+// the 6-tap and 8-tap kernels. Within each, conv_params->do_average and
+// conv_params->use_dist_wtd_comp_avg choose between the plain, averaging and
+// distance-weighted averaging variants. w and h must be multiples of 4.
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+                                   uint8_t *dst8, int dst8_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  // Vertical filters shorter than six taps are run through the 6-tap kernel.
+  int y_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  if (y_taps < 6) y_taps = 6;
+
+  // The horizontal pass must also produce the extra rows above and below the
+  // block that the vertical filter reads.
+  const int im_stride = MAX_SB_SIZE;
+  const int im_h = h + y_taps - 1;
+  const int rows_above = y_taps / 2 - 1;
+  const int cols_left = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_start = src - rows_above * src_stride - cols_left;
+
+  const int16_t *x_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int16x8_t y_filter = vld1q_s16(y_kernel);
+
+  dist_wtd_convolve_2d_horiz_neon(src_start, src_stride, im_block, im_stride,
+                                  x_kernel, im_h, w);
+
+  const int use_6tap = (y_taps == 6);
+
+  if (!conv_params->do_average) {
+    if (use_6tap) {
+      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+  } else if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    if (use_6tap) {
+      dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+          im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w);
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+          im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w);
+    }
+  } else {
+    if (use_6tap) {
+      dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+                                              dst8_stride, conv_params,
+                                              y_filter, h, w);
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+                                              dst8_stride, conv_params,
+                                              y_filter, h, w);
+    }
+  }
+}
+/* Compound "copy" path (no filtering) with distance-weighted averaging.
+ *
+ * Each 8-bit source pixel is multiplied by (1 << (FILTER_BITS - ROUND0_BITS))
+ * and added to round_offset. That value, the value already held in
+ * conv_params->dst, and conv_params->fwd_offset / bck_offset are handed to
+ * compute_dist_wtd_avg_4x4() / compute_dist_wtd_avg_8x4(); the 8-bit result is
+ * stored to dst8. conv_params->dst is only read in this function.
+ *
+ * Rows are consumed four at a time, so h must be a multiple of 4 (asserted).
+ * The w != 4 branch steps 8 columns at a time and stops on width == 0, so it
+ * presumably relies on w being a multiple of 8 there -- TODO confirm against
+ * callers.
+ */
+static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ // round_offset + src * shift_by_bits; only the low 4 lanes are kept.
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ uint16x4_t dd0, dd1, dd2, dd3; // values already in conv_params->dst
+ load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23; // each holds two 4-pixel output rows
+ compute_dist_wtd_avg_4x4(
+ dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
+ vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
+
+ store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
+ store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ uint8_t *d_u8 = dst8;
+ int width = w;
+
+ do { // one 8-column by 4-row tile per iteration
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ uint16x8_t dd0, dd1, dd2, dd3; // values already in conv_params->dst
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset,
+ vreinterpretq_s16_u16(round_offset_vec),
+ &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+/* Compound "copy" path (no filtering) with basic averaging.
+ *
+ * Each 8-bit source pixel is multiplied by (1 << (FILTER_BITS - ROUND0_BITS))
+ * and added to round_offset. That value and the value already held in
+ * conv_params->dst are handed to compute_basic_avg_4x4() /
+ * compute_basic_avg_8x4(); the 8-bit result is stored to dst8.
+ * conv_params->dst is only read in this function.
+ *
+ * Rows are consumed four at a time, so h must be a multiple of 4 (asserted).
+ * The w != 4 branch steps 8 columns at a time and stops on width == 0, so it
+ * presumably relies on w being a multiple of 8 there -- TODO confirm against
+ * callers.
+ */
+static INLINE void dist_wtd_convolve_2d_copy_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ // round_offset + src * shift_by_bits; only the low 4 lanes are kept.
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ uint16x4_t dd0, dd1, dd2, dd3; // values already in conv_params->dst
+ load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01, d23; // each holds two 4-pixel output rows
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ vreinterpretq_s16_u16(round_offset_vec), &d01,
+ &d23);
+
+ store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
+ store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ uint8_t *d_u8 = dst8;
+ int width = w;
+
+ do { // one 8-column by 4-row tile per iteration
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ uint16x8_t dd0, dd1, dd2, dd3; // values already in conv_params->dst
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
+ &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ dst8 += 4 * dst8_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+/* Compound "copy" path (no filtering) without averaging.
+ *
+ * Each 8-bit source pixel is multiplied by (1 << (FILTER_BITS - ROUND0_BITS)),
+ * added to round_offset, and the 16-bit result is written to conv_params->dst.
+ * No 8-bit destination is involved.
+ *
+ * Rows are consumed four at a time, so h must be a multiple of 4 (asserted).
+ * The w != 4 branch steps 8 columns at a time and stops on width == 0, so it
+ * presumably relies on w being a multiple of 8 there -- TODO confirm against
+ * callers.
+ */
+static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
+ int src_stride, int w, int h,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
+ const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ do {
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ // round_offset + src * shift_by_bits; only the low 4 lanes are kept.
+ uint16x4_t d0 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
+ uint16x4_t d1 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
+ uint16x4_t d2 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
+ uint16x4_t d3 =
+ vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
+
+ store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ CONV_BUF_TYPE *d = dst;
+ int width = w;
+
+ do { // one 8-column by 4-row tile per iteration
+ uint8x8_t s0, s1, s2, s3;
+ load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
+ uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
+ uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
+ uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+// Entry point for the compound "copy" case: selects one of the three copy
+// kernels from conv_params->do_average and
+// conv_params->use_dist_wtd_comp_avg.
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                        uint8_t *dst8, int dst8_stride, int w,
+                                        int h, ConvolveParams *conv_params) {
+  // No averaging requested: dst8 is not passed on, the kernel gets only
+  // conv_params.
+  if (!conv_params->do_average) {
+    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
+    return;
+  }
+
+  // Distance-weighted average (flagged as the unlikely case).
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(src, src_stride, dst8,
+                                                dst8_stride, w, h, conv_params);
+    return;
+  }
+
+  // Basic average.
+  dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
+                                     conv_params);
+}
+
+// 4-tap filter over four lanes: multiplies s0..s3 by lanes 0..3 of x_filter,
+// sums the products in 16-bit precision, then returns
+// round_offset + rounding_shift_right(sum, ROUND0_BITS - 1) reinterpreted as
+// unsigned.
+static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t x_filter,
+                                       const int16x4_t round_offset) {
+  int16x4_t acc = vmul_lane_s16(s0, x_filter, 0);
+  acc = vmla_lane_s16(acc, s1, x_filter, 1);
+  acc = vmla_lane_s16(acc, s2, x_filter, 2);
+  acc = vmla_lane_s16(acc, s3, x_filter, 3);
+
+  // The filter taps were halved beforehand, hence one bit less of right
+  // shift.
+  return vreinterpret_u16_s16(
+      vrsra_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+
+// 8-tap filter over eight lanes: multiplies s0..s7 by taps 0..7 of x_filter,
+// sums the products in 16-bit precision, then returns
+// round_offset + rounding_shift_right(sum, ROUND0_BITS - 1) reinterpreted as
+// unsigned.
+static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t x_filter,
+                                       const int16x8_t round_offset) {
+  // The by-lane multiplies take a 64-bit vector, so split the taps in two.
+  const int16x4_t taps_lo = vget_low_s16(x_filter);
+  const int16x4_t taps_hi = vget_high_s16(x_filter);
+
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmlaq_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
+
+  // The filter taps were halved beforehand, hence one bit less of right
+  // shift.
+  return vreinterpretq_u16_s16(
+      vrsraq_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+/* Horizontal-only compound convolution with distance-weighted averaging.
+ *
+ * Filters each row of src with the kernel selected by subpel_x_qn and adds
+ * round_offset. The filtered value, the value already held in
+ * conv_params->dst, and conv_params->fwd_offset / bck_offset are handed to
+ * compute_dist_wtd_avg_*(); the 8-bit result is stored to dst8.
+ * conv_params->dst is only read in this function.
+ *
+ * w == 4 uses kernel taps 2..5 (x_filter_ptr + 2), one row per iteration.
+ * Wider blocks use all eight taps, 8 columns per step; on AArch64, groups of
+ * 8 rows go through a transposed 8x8 path first and any remaining rows fall
+ * through to the row-at-a-time loop. The inner loops stop on width == 0, so
+ * the wide branch presumably relies on w being a multiple of 8 -- TODO
+ * confirm against callers.
+ */
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ // Horizontal filter kernel for the phase (subpel_x_qn & SUBPEL_MASK).
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset; // back up to the first tap
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2; // matches the two kernel taps skipped above
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+ __builtin_prefetch(dst8_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr); // value already in conv_params->dst
+
+ uint8x8_t d01;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(dst8_ptr, d01);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) { // 8 rows per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ // Load an 8x8 tile and transpose it; presumably each tN then holds one
+ // source column across the 8 rows.
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7; // s0..s6 are kept from the first tile; the loop loads from here
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ // Each call consumes a window of 8 consecutive sN vectors, shifted by one.
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ // Transpose the results before they are combined and stored row by row.
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ uint16x8_t dd0, dd1, dd2, dd3; // rows 0..3 already in conv_params->dst
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ uint16x8_t dd4, dd5, dd6, dd7; // rows 4..7 already in conv_params->dst
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+ bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+ &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+ d7_u8);
+
+ s0 = s8; // the last 7 vectors become the head of the next window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ dst8_ptr += 8 * dst8_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) { // remaining rows, one per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s += 8;
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0 = vld1q_u16(d); // value already in conv_params->dst
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+
+ s0 = s8; // slide the 16-sample window right by 8
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ height--;
+ }
+ }
+}
+/* Horizontal-only compound convolution with basic averaging.
+ *
+ * Filters each row of src with the kernel selected by subpel_x_qn and adds
+ * round_offset. The filtered value and the value already held in
+ * conv_params->dst are handed to compute_basic_avg_*(); the 8-bit result is
+ * stored to dst8. conv_params->dst is only read in this function.
+ *
+ * w == 4 uses kernel taps 2..5 (x_filter_ptr + 2), one row per iteration.
+ * Wider blocks use all eight taps, 8 columns per step; on AArch64, groups of
+ * 8 rows go through a transposed 8x8 path first and any remaining rows fall
+ * through to the row-at-a-time loop. The inner loops stop on width == 0, so
+ * the wide branch presumably relies on w being a multiple of 8 -- TODO
+ * confirm against callers.
+ */
+static INLINE void dist_wtd_convolve_x_avg_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ // Horizontal filter kernel for the phase (subpel_x_qn & SUBPEL_MASK).
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset; // back up to the first tap
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ uint8_t *dst8_ptr = dst8;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2; // matches the two kernel taps skipped above
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+ __builtin_prefetch(dst8_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr); // value already in conv_params->dst
+
+ uint8x8_t d01;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(dst8_ptr, d01);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) { // 8 rows per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ // Load an 8x8 tile and transpose it; presumably each tN then holds one
+ // source column across the 8 rows.
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7; // s0..s6 are kept from the first tile; the loop loads from here
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ // Each call consumes a window of 8 consecutive sN vectors, shifted by one.
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ // Transpose the results before they are combined and stored row by row.
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ uint16x8_t dd0, dd1, dd2, dd3; // rows 0..3 already in conv_params->dst
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+
+ uint16x8_t dd4, dd5, dd6, dd7; // rows 4..7 already in conv_params->dst
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+ round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
+ d7_u8);
+
+ s0 = s8; // the last 7 vectors become the head of the next window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ dst8_ptr += 8 * dst8_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) { // remaining rows, one per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s += 8;
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0 = vld1q_u16(d); // value already in conv_params->dst
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+
+ s0 = s8; // slide the 16-sample window right by 8
+ s += 8;
+ d += 8;
+ d_u8 += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst8_ptr += dst8_stride;
+ height--;
+ }
+ }
+}
+/* Horizontal-only compound convolution without averaging.
+ *
+ * Filters each row of src with the kernel selected by subpel_x_qn, adds
+ * round_offset, and writes the 16-bit result to conv_params->dst. No 8-bit
+ * destination is involved.
+ *
+ * w == 4 uses kernel taps 2..5 (x_filter_ptr + 2), one row per iteration.
+ * Wider blocks use all eight taps, 8 columns per step; on AArch64, groups of
+ * 8 rows go through a transposed 8x8 path first and any remaining rows fall
+ * through to the row-at-a-time loop. The inner loops stop on width == 0, so
+ * the wide branch presumably relies on w being a multiple of 8 -- TODO
+ * confirm against callers.
+ */
+static INLINE void dist_wtd_convolve_x_neon(
+ const uint8_t *src, int src_stride, int w, int h,
+ const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ const int bd = 8; // presumably the bit depth; src pixels are uint8_t
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ // Horizontal filter kernel for the phase (subpel_x_qn & SUBPEL_MASK).
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - horiz_offset; // back up to the first tap
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int height = h;
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ src_ptr += 2; // matches the two kernel taps skipped above
+
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ __builtin_prefetch(dst_ptr);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
+ vget_low_s16(round_offset_vec));
+
+ vst1_u16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ while (height >= 8) { // 8 rows per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ // Load an 8x8 tile and transpose it; presumably each tN then holds one
+ // source column across the 8 rows.
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ s += 7; // s0..s6 are kept from the first tile; the loop loads from here
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ // Each call consumes a window of 8 consecutive sN vectors, shifted by one.
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+ uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ round_offset_vec);
+ uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ round_offset_vec);
+ uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ round_offset_vec);
+ uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, round_offset_vec);
+ uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, round_offset_vec);
+ uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, round_offset_vec);
+ uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, round_offset_vec);
+
+ // Transpose the results before they are stored row by row.
+ transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8; // the last 7 vectors become the head of the next window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ while (height > 0) { // remaining rows, one per pass
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ __builtin_prefetch(d);
+
+ s = src_ptr + 8; // same position as advancing s by 8
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ round_offset_vec);
+
+ vst1q_u16(d, d0);
+
+ s0 = s8; // slide the 16-sample window right by 8
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ }
+ }
+}
+
+// Entry point for the horizontal-only compound convolution: selects one of
+// the three horizontal kernels from conv_params->do_average and
+// conv_params->use_dist_wtd_comp_avg.
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const int subpel_x_qn,
+                                  ConvolveParams *conv_params) {
+  // No averaging requested: dst8 is not passed on, the kernel gets only
+  // conv_params.
+  if (!conv_params->do_average) {
+    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
+                             subpel_x_qn, conv_params);
+    return;
+  }
+
+  // Distance-weighted average (flagged as the unlikely case).
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
+                                          w, h, filter_params_x, subpel_x_qn,
+                                          conv_params);
+    return;
+  }
+
+  // Basic average.
+  dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params);
+}
+
+// 6-tap filter over four lanes: multiplies s0..s5 by taps 1..6 of the
+// 8-entry y_filter, sums the products in 16-bit precision, then returns
+// round_offset + rounding_shift_right(sum, ROUND0_BITS - 1) reinterpreted as
+// unsigned.
+static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x8_t y_filter,
+                                       const int16x4_t round_offset) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);
+  const int16x4_t taps_hi = vget_high_s16(y_filter);
+
+  // Taps 0 and 7 are zero for this filter, so only taps 1..6 take part.
+  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 1);
+  acc = vmla_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmla_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmla_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmla_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmla_lane_s16(acc, s5, taps_hi, 2);
+
+  // The filter taps were halved beforehand, hence one bit less of right
+  // shift.
+  return vreinterpret_u16_s16(
+      vrsra_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+
+// Six-tap vertical filter over eight 16-bit lanes.
+//
+// s0..s5 are multiplied by lanes 1..6 of y_filter and summed; lanes 0 and 7
+// of the filter are zero for this kernel and are skipped. The sum is then
+// rounding-shifted right and accumulated onto round_offset, and the lanes are
+// returned reinterpreted as unsigned.
+static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t y_filter,
+                                       const int16x8_t round_offset) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);   // filter lanes 0..3
+  const int16x4_t taps_hi = vget_high_s16(y_filter);  // filter lanes 4..7
+
+  // Lanes 0 and 7 hold zero, so accumulation runs from lane 1 to lane 6.
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlaq_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlaq_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 2);
+
+  // The filter values were halved beforehand, so shift by one bit fewer.
+  return vreinterpretq_u16_s16(
+      vrsraq_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) { /* Vertical 6-tap compound pass with the
+ * dist-weighted average: every convolve6_4_y() / convolve6_8_y() result is
+ * combined with the value already held in conv_params->dst by the
+ * compute_dist_wtd_avg helpers (fed with conv_params->fwd_offset and
+ * conv_params->bck_offset), and the 8-bit output is stored to dst8_ptr.
+ * conv_params->dst is only read here, never written.
+ *
+ * The block is walked in column strips, top to bottom. Five source rows
+ * are preloaded per strip, so h + 5 rows are read starting at src_ptr.
+ * Every loop is a do/while with a "!= 0" exit, so w must be a positive
+ * multiple of the strip width (4 or 8) and, on AArch64, h a positive
+ * multiple of the rows consumed per iteration (4 in the 4-wide path, 8 in
+ * the 8-wide path). */
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);  // accumulator seed for convolve6_*; also passed to the averaging helpers
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {  // 4-wide strips; also taken when h == 4 since the AArch64 8-wide loop consumes 8 rows per pass
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));  // widen u8 -> 16 bit, keep the low four lanes
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;  // rows 0..4 are loaded; continue from row 5
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);  // values already in conv_params->dst for these four rows
+
+ uint8x8_t d01, d23;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01, &d23);
+
+ store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
+ store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
+
+ s0 = s4;  // the last five rows become the head of the next 4-row window
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;  // advance to the next 4-column strip
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);  // rows 0..4 are fetched by load_u8_8x5 below
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);  // rows 0..3 of this 8-row group
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);  // rows 4..7 of this 8-row group
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+ bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+ &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;  // the last five rows become the head of the next 8-row window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;  // advance to the next 8-column strip
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_avg_neon(
+ const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+ const int dst8_stride, int w, int h, const int16x8_t y_filter,
+ ConvolveParams *conv_params) { /* Vertical 6-tap compound pass with the
+ * basic average: every convolve6_4_y() / convolve6_8_y() result is
+ * combined with the value already held in conv_params->dst by the
+ * compute_basic_avg helpers, and the 8-bit output is stored to dst8_ptr.
+ * conv_params->dst is only read here, never written.
+ *
+ * The block is walked in column strips, top to bottom. Five source rows
+ * are preloaded per strip, so h + 5 rows are read starting at src_ptr.
+ * Every loop is a do/while with a "!= 0" exit, so w must be a positive
+ * multiple of the strip width (4 or 8) and, on AArch64, h a positive
+ * multiple of the rows consumed per iteration (4 in the 4-wide path, 8 in
+ * the 8-wide path). */
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);  // accumulator seed for convolve6_*; also passed to the averaging helpers
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {  // 4-wide strips; also taken when h == 4 since the AArch64 8-wide loop consumes 8 rows per pass
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));  // widen u8 -> 16 bit, keep the low four lanes
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;  // rows 0..4 are loaded; continue from row 5
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);  // values already in conv_params->dst for these four rows
+
+ uint8x8_t d01, d23;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01, &d23);
+
+ store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
+ store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
+
+ s0 = s4;  // the last five rows become the head of the next 4-row window
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ d_u8 += 4 * dst8_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ uint16x4_t dd0 = vld1_u16(d);
+
+ uint8x8_t d01;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+ store_u8_4x1(d_u8, d01);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ d_u8 += dst8_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;  // advance to the next 4-column strip
+ dst_ptr += 4;
+ dst8_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);  // rows 0..4 are fetched by load_u8_8x5 below
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);  // rows 0..3 of this 8-row group
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+
+ uint16x8_t dd4, dd5, dd6, dd7;
+ load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);  // rows 4..7 of this 8-row group
+
+ uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+ compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+ round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+ d_u8 += 4 * dst8_stride;
+
+ s0 = s8;  // the last five rows become the head of the next 8-row window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;  // advance to the next 8-column strip
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
+ int src_stride, int w, int h,
+ const int16x8_t y_filter,
+ ConvolveParams *conv_params) { /* Vertical 6-tap compound pass without
+ * averaging: every convolve6_4_y() / convolve6_8_y() result is stored as
+ * 16-bit data to conv_params->dst (stride conv_params->dst_stride). No
+ * 8-bit output is produced.
+ *
+ * The block is walked in column strips, top to bottom. Five source rows
+ * are preloaded per strip, so h + 5 rows are read starting at src_ptr.
+ * Every loop is a do/while with a "!= 0" exit, so w must be a positive
+ * multiple of the strip width (4 or 8) and, on AArch64, h a positive
+ * multiple of the rows consumed per iteration (4 in the 4-wide path, 8 in
+ * the 8-wide path). */
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);  // accumulator seed handed to convolve6_*
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ int width = w;
+
+ if (w == 4 || h == 4) {  // 4-wide strips; also taken when h == 4 since the AArch64 8-wide loop consumes 8 rows per pass
+ do {
+ const uint8_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));  // widen u8 -> 16 bit, keep the low four lanes
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+
+ s += 5 * src_stride;  // rows 0..4 are loaded; continue from row 5
+
+ do {
+#if AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+ t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+ t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+ t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
+ vget_low_s16(round_offset_vec));
+ uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // the last five rows become the head of the next 4-row window
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ t0 = load_unaligned_u8_4x1(s);
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+ uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
+ vget_low_s16(round_offset_vec));
+
+ vst1_u16(d, d0);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 4;  // advance to the next 4-column strip
+ dst_ptr += 4;
+ width -= 4;
+ } while (width != 0);
+ } else {
+ do {
+ const uint8_t *s = src_ptr + (5 * src_stride);  // rows 0..4 are fetched by load_u8_8x5 below
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ do {
+#if AOM_ARCH_AARCH64
+ uint8x8_t t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+ uint16x8_t d1 =
+ convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
+ uint16x8_t d2 =
+ convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
+ uint16x8_t d3 =
+ convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
+ uint16x8_t d4 =
+ convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
+ uint16x8_t d5 =
+ convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
+ uint16x8_t d6 =
+ convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
+ uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
+ round_offset_vec);
+
+ store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;  // the last five rows become the head of the next 8-row window
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s += 8 * src_stride;
+ d += 8 * dst_stride;
+ height -= 8;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint16x8_t d0 =
+ convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
+
+ s0 = s1;  // slide the window down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+
+ vst1q_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;  // advance to the next 8-column strip
+ dst_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ }
+}
+
+// Eight-tap vertical filter over four 16-bit lanes.
+//
+// s0..s7 are multiplied by lanes 0..7 of y_filter and summed. The sum is
+// then rounding-shifted right and accumulated onto round_offset, and the
+// lanes are returned reinterpreted as unsigned.
+static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x8_t y_filter,
+                                       const int16x4_t round_offset) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);   // filter lanes 0..3
+  const int16x4_t taps_hi = vget_high_s16(y_filter);  // filter lanes 4..7
+
+  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 0);
+  acc = vmla_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmla_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmla_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmla_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmla_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmla_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmla_lane_s16(acc, s7, taps_hi, 3);
+
+  // The filter values were halved beforehand, so shift by one bit fewer.
+  return vreinterpret_u16_s16(
+      vrsra_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+
+// Eight-tap vertical filter over eight 16-bit lanes.
+//
+// s0..s7 are multiplied by lanes 0..7 of y_filter and summed. The sum is
+// then rounding-shifted right and accumulated onto round_offset, and the
+// lanes are returned reinterpreted as unsigned.
+static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t y_filter,
+                                       const int16x8_t round_offset) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);   // filter lanes 0..3
+  const int16x4_t taps_hi = vget_high_s16(y_filter);  // filter lanes 4..7
+
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmlaq_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
+
+  // The filter values were halved beforehand, so shift by one bit fewer.
+  return vreinterpretq_u16_s16(
+      vrsraq_n_s16(round_offset, acc, ROUND0_BITS - 1));
+}
+
+// Vertical 8-tap compound pass with the dist-weighted average.
+//
+// Every convolve8_4_y() / convolve8_8_y() result is combined with the value
+// already held in conv_params->dst by the compute_dist_wtd_avg helpers (fed
+// with conv_params->fwd_offset and conv_params->bck_offset), and the 8-bit
+// output is stored to dst8_ptr. conv_params->dst is only read here.
+//
+// The block is walked in column strips, top to bottom. Seven source rows are
+// preloaded per strip, so h + 7 rows are read starting at src_ptr. Every loop
+// is a do/while with a "!= 0" exit, so w must be a positive multiple of the
+// strip width (4 or 8) and, on AArch64, h a positive multiple of the rows
+// consumed per iteration (4 in the 4-wide path, 8 in the 8-wide path).
+//
+// The __builtin_prefetch() calls are hints only. In the 8-wide path they
+// target the rows about to be loaded from the 16-bit buffer (d, dst_stride)
+// and stored to the 8-bit buffer (d_u8, dst8_stride), matching the 4-wide
+// path.
+static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  // Accumulator seed for convolve8_*; also passed to the averaging helpers.
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  // 4-wide strips are also used when h == 4, because the AArch64 8-wide loop
+  // consumes 8 rows per pass.
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      // Widen u8 -> 16 bit and keep the low four lanes.
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      // Rows 0..6 are loaded; continue from row 7.
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        // Values already in conv_params->dst for these four rows.
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d01, &d23);
+
+        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
+        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
+
+        // The last seven rows become the head of the next 4-row window.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d);
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+                                 vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01);
+
+        // Slide the window down by one row.
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      // Advance to the next 4-column strip.
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      // Rows 0..6 are loaded; continue from row 7.
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        // Hint the 16-bit rows of the current group (d advances per pass;
+        // dst_ptr would always point at the top of the strip).
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        // Hint the 8-bit output rows; dst8_stride belongs to d_u8, not d.
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        // Rows 0..3 of this 8-row group.
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+                                 &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        // Rows 4..7 of this 8-row group.
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
+                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
+                                 &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        // The last seven rows become the head of the next 8-row window.
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        // Hint the 16-bit row about to be loaded.
+        __builtin_prefetch(d);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        // Slide the window down by one row.
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        // Hint the 8-bit row about to be stored.
+        __builtin_prefetch(d_u8);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+                                 round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      // Advance to the next 8-column strip.
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+// Vertical 8-tap compound convolution with basic (equal-weight) averaging.
+//
+// Each output row is the 8-tap vertical filter of the 8-bit source, averaged
+// with the 16-bit value already held in conv_params->dst (via
+// compute_basic_avg_*) and narrowed to 8 bits in dst8_ptr. conv_params->dst
+// is read but never written by this function.
+//
+//   src_ptr   first of the 8 source rows that feed output row 0
+//   dst8_ptr  8-bit destination, row pitch dst8_stride
+//   w, h      block size; columns are consumed in strips of 4 or 8
+//   y_filter  the eight vertical filter taps
+static INLINE void dist_wtd_convolve_y_8tap_avg_neon(
+    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
+    const int dst8_stride, int w, int h, const int16x8_t y_filter,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  // The 8-wide AArch64 loop below steps 8 rows per iteration, so h == 4 has
+  // to be handled by the 4-column path as well as w == 4.
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      // Prime the sliding window with the first seven source rows.
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        // Four output rows per iteration.
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        uint16x4_t dd0, dd1, dd2, dd3;
+        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d01, d23;
+        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d01, &d23);
+
+        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
+        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
+
+        // Slide the 7-row window down by four rows.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        d_u8 += 4 * dst8_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        // One output row per iteration.
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        __builtin_prefetch(d);
+
+        uint16x4_t dd0 = vld1_u16(d);
+
+        uint8x8_t d01;
+        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
+
+        store_u8_4x1(d_u8, d01);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        d_u8 += dst8_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      dst8_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      uint8_t *d_u8 = dst8_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      // Prime the sliding window with the first seven source rows.
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        // Eight output rows per iteration.
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        // Hint the compound-buffer rows that are about to be read. The
+        // running pointer d is used (not the per-strip base dst_ptr) so the
+        // hint follows the row loop.
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        // Hint the 8-bit destination rows that are about to be written;
+        // dst8_stride is the pitch of d_u8, not of the 16-bit buffer d.
+        __builtin_prefetch(d_u8 + 0 * dst8_stride);
+        __builtin_prefetch(d_u8 + 1 * dst8_stride);
+        __builtin_prefetch(d_u8 + 2 * dst8_stride);
+        __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+        uint16x8_t dd0, dd1, dd2, dd3;
+        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+        d_u8 += 4 * dst8_stride;
+
+        uint16x8_t dd4, dd5, dd6, dd7;
+        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
+
+        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
+        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
+                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
+
+        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
+        d_u8 += 4 * dst8_stride;
+
+        // Slide the 7-row window down by eight rows.
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        // One output row per iteration.
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        // Hint the 8-bit row written at the end of this iteration; the
+        // 16-bit row d gets its own hint just before it is loaded.
+        __builtin_prefetch(d_u8);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        __builtin_prefetch(d);
+
+        uint16x8_t dd0 = vld1q_u16(d);
+
+        uint8x8_t d0_u8;
+        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+        vst1_u8(d_u8, d0_u8);
+        d_u8 += dst8_stride;
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      dst8_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+// Vertical 8-tap compound convolution without averaging.
+//
+// Filters the 8-bit source vertically with y_filter and stores the 16-bit
+// results to conv_params->dst (row pitch conv_params->dst_stride). No 8-bit
+// output is produced by this variant.
+//
+//   src_ptr   first of the 8 source rows that feed output row 0
+//   w, h      block size; columns are consumed in strips of 4 or 8
+//   y_filter  the eight vertical filter taps
+static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
+                                                 int src_stride, int w, int h,
+                                                 const int16x8_t y_filter,
+                                                 ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  int width = w;
+
+  // The 8-wide AArch64 loop below steps 8 rows per iteration, so h == 4 has
+  // to be handled by the 4-column path as well as w == 4.
+  if (w == 4 || h == 4) {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      // Prime the sliding window with the first seven source rows.
+      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
+      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
+      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
+
+      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        // Four output rows per iteration.
+        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
+        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
+        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
+        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
+
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      vget_low_s16(round_offset_vec));
+        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Slide the 7-row window down by four rows.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+#else   // !AOM_ARCH_AARCH64
+        // One output row per iteration.
+        t0 = load_unaligned_u8_4x1(s);
+        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+
+        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      vget_low_s16(round_offset_vec));
+
+        vst1_u16(d, d0);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 4;
+      dst_ptr += 4;
+      width -= 4;
+    } while (width != 0);
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      CONV_BUF_TYPE *d = dst_ptr;
+      int height = h;
+
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+      __builtin_prefetch(s + 4 * src_stride);
+      __builtin_prefetch(s + 5 * src_stride);
+      __builtin_prefetch(s + 6 * src_stride);
+      __builtin_prefetch(s + 7 * src_stride);
+
+      // Prime the sliding window with the first seven source rows.
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s += 7 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        // Eight output rows per iteration.
+        uint8x8_t t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        // Hint the rows about to be stored. The running pointer d is used
+        // (not the per-strip base dst_ptr) so the hint follows the row loop.
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_offset_vec);
+        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
+                                      y_filter, round_offset_vec);
+        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
+                                      y_filter, round_offset_vec);
+
+        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+        // Slide the 7-row window down by eight rows.
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8 * src_stride;
+        d += 8 * dst_stride;
+        height -= 8;
+#else   // !AOM_ARCH_AARCH64
+        // One output row per iteration.
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+        // Hint the row stored below (running pointer, not the strip base).
+        __builtin_prefetch(d);
+
+        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_offset_vec);
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+
+        vst1q_u16(d, d0);
+
+        s += src_stride;
+        d += dst_stride;
+        height--;
+#endif  // AOM_ARCH_AARCH64
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+// Entry point for the vertical-only compound convolution.
+//
+// Chooses between the 6-tap and 8-tap kernels based on get_filter_tap(), and
+// between three output modes based on conv_params:
+//   - do_average == 0:             store 16-bit results to conv_params->dst
+//   - use_dist_wtd_comp_avg != 0:  distance-weighted average into dst8
+//   - otherwise:                   basic average into dst8
+// w and h must both be multiples of 4.
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int16_t *taps = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  // Every tap is even, so halving them up front is exact and keeps the
+  // intermediate sums narrower.
+  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(taps), 1);
+
+  // Step back to the first source row covered by the full-length filter.
+  const int rows_above = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - (rows_above * src_stride);
+
+  const int is_6tap = get_filter_tap(filter_params_y, subpel_y_qn) <= 6;
+  // The 6-tap helpers are handed a pointer one row further down.
+  const uint8_t *first_row = is_6tap ? src_ptr + src_stride : src_ptr;
+
+  if (!conv_params->do_average) {
+    if (is_6tap) {
+      dist_wtd_convolve_y_6tap_neon(first_row, src_stride, w, h, y_filter,
+                                    conv_params);
+    } else {
+      dist_wtd_convolve_y_8tap_neon(first_row, src_stride, w, h, y_filter,
+                                    conv_params);
+    }
+    return;
+  }
+
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    if (is_6tap) {
+      dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(first_row, src_stride, dst8,
+                                                 dst8_stride, w, h, y_filter,
+                                                 conv_params);
+    } else {
+      dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(first_row, src_stride, dst8,
+                                                 dst8_stride, w, h, y_filter,
+                                                 conv_params);
+    }
+    return;
+  }
+
+  if (is_6tap) {
+    dist_wtd_convolve_y_6tap_avg_neon(first_row, src_stride, dst8, dst8_stride,
+                                      w, h, y_filter, conv_params);
+  } else {
+    dist_wtd_convolve_y_8tap_avg_neon(first_row, src_stride, dst8, dst8_stride,
+                                      w, h, y_filter, conv_params);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.h b/third_party/aom/av1/common/arm/compound_convolve_neon.h
new file mode 100644
index 0000000000..d719680a32
--- /dev/null
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon.h
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
+
+// Distance-weighted blend of one 4-lane row.
+//
+// Computes (dd0 * fwd_offset + d0 * bck_offset) >> DIST_PRECISION_BITS per
+// lane, subtracts round_offset, then narrows to 8 bits with a saturating,
+// rounding right shift by FILTER_BITS - ROUND0_BITS. Only the low four bytes
+// of *d0_u8 carry results; the upper four come from zero padding.
+static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x4_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  const uint32x4_t weighted_sum =
+      vmlal_n_u16(vmull_n_u16(dd0, fwd_offset), d0, bck_offset);
+  const int16x4_t blended =
+      vreinterpret_s16_u16(vshrn_n_u32(weighted_sum, DIST_PRECISION_BITS));
+  const int16x4_t unbiased = vsub_s16(blended, round_offset);
+
+  // The narrowing shift needs a 128-bit input, so pad the top half with 0.
+  const int16x8_t padded = vcombine_s16(unbiased, vdup_n_s16(0));
+  *d0_u8 = vqrshrun_n_s16(padded, FILTER_BITS - ROUND0_BITS);
+}
+
+// Equal-weight blend of one 4-lane row.
+//
+// Takes the halving add of dd0 and d0, subtracts round_offset, then narrows
+// to 8 bits with a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS. Only the low four bytes of *d0_u8 carry results;
+// the upper four come from zero padding.
+static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                         const int16x4_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  const int16x4_t mean = vreinterpret_s16_u16(vhadd_u16(dd0, d0));
+  const int16x4_t unbiased = vsub_s16(mean, round_offset);
+
+  // The narrowing shift needs a 128-bit input, so pad the top half with 0.
+  const int16x8_t padded = vcombine_s16(unbiased, vdup_n_s16(0));
+  *d0_u8 = vqrshrun_n_s16(padded, FILTER_BITS - ROUND0_BITS);
+}
+
+// Distance-weighted blend of one 8-lane row.
+//
+// Per lane: (dd0 * fwd_offset + d0 * bck_offset) >> DIST_PRECISION_BITS,
+// minus round_offset, then a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS down to 8 bits. The multiply-accumulate is done
+// in 32 bits, separately for the low and high halves of the row.
+static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x8_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  const uint32x4_t wsum_lo = vmlal_n_u16(
+      vmull_n_u16(vget_low_u16(dd0), fwd_offset), vget_low_u16(d0), bck_offset);
+  const uint32x4_t wsum_hi =
+      vmlal_n_u16(vmull_n_u16(vget_high_u16(dd0), fwd_offset),
+                  vget_high_u16(d0), bck_offset);
+
+  const uint16x8_t blended =
+      vcombine_u16(vshrn_n_u32(wsum_lo, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum_hi, DIST_PRECISION_BITS));
+
+  const int16x8_t unbiased =
+      vsubq_s16(vreinterpretq_s16_u16(blended), round_offset);
+  *d0_u8 = vqrshrun_n_s16(unbiased, FILTER_BITS - ROUND0_BITS);
+}
+
+// Equal-weight blend of one 8-lane row: halving add of dd0 and d0, minus
+// round_offset, then a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS down to 8 bits.
+static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  const int16x8_t mean = vreinterpretq_s16_u16(vhaddq_u16(dd0, d0));
+  const int16x8_t unbiased = vsubq_s16(mean, round_offset);
+  *d0_u8 = vqrshrun_n_s16(unbiased, FILTER_BITS - ROUND0_BITS);
+}
+
+// Distance-weighted blend of four 4-lane rows.
+//
+// Per lane: (ddN * fwd_offset + dN * bck_offset) >> DIST_PRECISION_BITS.
+// Rows 0/1 and rows 2/3 are then packed into one 8-lane vector each, have
+// round_offset subtracted, and are narrowed to 8 bits with a saturating,
+// rounding right shift by FILTER_BITS - ROUND0_BITS. *d01_u8 holds rows 0
+// and 1 (low/high four bytes), *d23_u8 holds rows 2 and 3.
+static INLINE void compute_dist_wtd_avg_4x4(
+    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+  const uint32x4_t wsum0 =
+      vmlal_n_u16(vmull_n_u16(dd0, fwd_offset), d0, bck_offset);
+  const uint32x4_t wsum1 =
+      vmlal_n_u16(vmull_n_u16(dd1, fwd_offset), d1, bck_offset);
+  const uint32x4_t wsum2 =
+      vmlal_n_u16(vmull_n_u16(dd2, fwd_offset), d2, bck_offset);
+  const uint32x4_t wsum3 =
+      vmlal_n_u16(vmull_n_u16(dd3, fwd_offset), d3, bck_offset);
+
+  const uint16x8_t blended_01 =
+      vcombine_u16(vshrn_n_u32(wsum0, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum1, DIST_PRECISION_BITS));
+  const uint16x8_t blended_23 =
+      vcombine_u16(vshrn_n_u32(wsum2, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum3, DIST_PRECISION_BITS));
+
+  const int16x8_t unbiased_01 =
+      vsubq_s16(vreinterpretq_s16_u16(blended_01), round_offset);
+  const int16x8_t unbiased_23 =
+      vsubq_s16(vreinterpretq_s16_u16(blended_23), round_offset);
+
+  *d01_u8 = vqrshrun_n_s16(unbiased_01, FILTER_BITS - ROUND0_BITS);
+  *d23_u8 = vqrshrun_n_s16(unbiased_23, FILTER_BITS - ROUND0_BITS);
+}
+
+// Equal-weight blend of four 4-lane rows.
+//
+// Each row is the halving add of ddN and dN. Rows 0/1 and rows 2/3 are then
+// packed into one 8-lane vector each, have round_offset subtracted, and are
+// narrowed to 8 bits with a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS. *d01_u8 holds rows 0 and 1 (low/high four
+// bytes), *d23_u8 holds rows 2 and 3.
+static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
+                                         uint16x4_t dd2, uint16x4_t dd3,
+                                         uint16x4_t d0, uint16x4_t d1,
+                                         uint16x4_t d2, uint16x4_t d3,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+  const uint16x8_t mean_01 =
+      vcombine_u16(vhadd_u16(dd0, d0), vhadd_u16(dd1, d1));
+  const uint16x8_t mean_23 =
+      vcombine_u16(vhadd_u16(dd2, d2), vhadd_u16(dd3, d3));
+
+  const int16x8_t unbiased_01 =
+      vsubq_s16(vreinterpretq_s16_u16(mean_01), round_offset);
+  const int16x8_t unbiased_23 =
+      vsubq_s16(vreinterpretq_s16_u16(mean_23), round_offset);
+
+  *d01_u8 = vqrshrun_n_s16(unbiased_01, FILTER_BITS - ROUND0_BITS);
+  *d23_u8 = vqrshrun_n_s16(unbiased_23, FILTER_BITS - ROUND0_BITS);
+}
+
+// Distance-weighted blend of four 8-lane rows.
+//
+// Per lane: (ddN * fwd_offset + dN * bck_offset) >> DIST_PRECISION_BITS,
+// minus round_offset, then a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS down to 8 bits. Row N is written to *dN_u8.
+// The multiply-accumulate is done in 32 bits, separately for the low and
+// high halves of each row.
+static INLINE void compute_dist_wtd_avg_8x4(
+    uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
+    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+    uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+  // Row 0.
+  const uint32x4_t wsum0_lo = vmlal_n_u16(
+      vmull_n_u16(vget_low_u16(dd0), fwd_offset), vget_low_u16(d0), bck_offset);
+  const uint32x4_t wsum0_hi =
+      vmlal_n_u16(vmull_n_u16(vget_high_u16(dd0), fwd_offset),
+                  vget_high_u16(d0), bck_offset);
+  const uint16x8_t blended0 =
+      vcombine_u16(vshrn_n_u32(wsum0_lo, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum0_hi, DIST_PRECISION_BITS));
+
+  // Row 1.
+  const uint32x4_t wsum1_lo = vmlal_n_u16(
+      vmull_n_u16(vget_low_u16(dd1), fwd_offset), vget_low_u16(d1), bck_offset);
+  const uint32x4_t wsum1_hi =
+      vmlal_n_u16(vmull_n_u16(vget_high_u16(dd1), fwd_offset),
+                  vget_high_u16(d1), bck_offset);
+  const uint16x8_t blended1 =
+      vcombine_u16(vshrn_n_u32(wsum1_lo, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum1_hi, DIST_PRECISION_BITS));
+
+  // Row 2.
+  const uint32x4_t wsum2_lo = vmlal_n_u16(
+      vmull_n_u16(vget_low_u16(dd2), fwd_offset), vget_low_u16(d2), bck_offset);
+  const uint32x4_t wsum2_hi =
+      vmlal_n_u16(vmull_n_u16(vget_high_u16(dd2), fwd_offset),
+                  vget_high_u16(d2), bck_offset);
+  const uint16x8_t blended2 =
+      vcombine_u16(vshrn_n_u32(wsum2_lo, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum2_hi, DIST_PRECISION_BITS));
+
+  // Row 3.
+  const uint32x4_t wsum3_lo = vmlal_n_u16(
+      vmull_n_u16(vget_low_u16(dd3), fwd_offset), vget_low_u16(d3), bck_offset);
+  const uint32x4_t wsum3_hi =
+      vmlal_n_u16(vmull_n_u16(vget_high_u16(dd3), fwd_offset),
+                  vget_high_u16(d3), bck_offset);
+  const uint16x8_t blended3 =
+      vcombine_u16(vshrn_n_u32(wsum3_lo, DIST_PRECISION_BITS),
+                   vshrn_n_u32(wsum3_hi, DIST_PRECISION_BITS));
+
+  const int16x8_t unbiased0 =
+      vsubq_s16(vreinterpretq_s16_u16(blended0), round_offset);
+  const int16x8_t unbiased1 =
+      vsubq_s16(vreinterpretq_s16_u16(blended1), round_offset);
+  const int16x8_t unbiased2 =
+      vsubq_s16(vreinterpretq_s16_u16(blended2), round_offset);
+  const int16x8_t unbiased3 =
+      vsubq_s16(vreinterpretq_s16_u16(blended3), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(unbiased0, FILTER_BITS - ROUND0_BITS);
+  *d1_u8 = vqrshrun_n_s16(unbiased1, FILTER_BITS - ROUND0_BITS);
+  *d2_u8 = vqrshrun_n_s16(unbiased2, FILTER_BITS - ROUND0_BITS);
+  *d3_u8 = vqrshrun_n_s16(unbiased3, FILTER_BITS - ROUND0_BITS);
+}
+
+// Equal-weight blend of four 8-lane rows: halving add of ddN and dN, minus
+// round_offset, then a saturating, rounding right shift by
+// FILTER_BITS - ROUND0_BITS down to 8 bits. Row N is written to *dN_u8.
+static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
+                                         uint16x8_t dd2, uint16x8_t dd3,
+                                         uint16x8_t d0, uint16x8_t d1,
+                                         uint16x8_t d2, uint16x8_t d3,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d0_u8, uint8x8_t *d1_u8,
+                                         uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
+  const int16x8_t mean0 = vreinterpretq_s16_u16(vhaddq_u16(dd0, d0));
+  const int16x8_t mean1 = vreinterpretq_s16_u16(vhaddq_u16(dd1, d1));
+  const int16x8_t mean2 = vreinterpretq_s16_u16(vhaddq_u16(dd2, d2));
+  const int16x8_t mean3 = vreinterpretq_s16_u16(vhaddq_u16(dd3, d3));
+
+  const int16x8_t unbiased0 = vsubq_s16(mean0, round_offset);
+  const int16x8_t unbiased1 = vsubq_s16(mean1, round_offset);
+  const int16x8_t unbiased2 = vsubq_s16(mean2, round_offset);
+  const int16x8_t unbiased3 = vsubq_s16(mean3, round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(unbiased0, FILTER_BITS - ROUND0_BITS);
+  *d1_u8 = vqrshrun_n_s16(unbiased1, FILTER_BITS - ROUND0_BITS);
+  *d2_u8 = vqrshrun_n_s16(unbiased2, FILTER_BITS - ROUND0_BITS);
+  *d3_u8 = vqrshrun_n_s16(unbiased3, FILTER_BITS - ROUND0_BITS);
+}
+
+// 6-tap vertical filter for one 4-lane row of the 2D compound convolution.
+//
+// Accumulates s0..s5 against taps 1..6 of y_filter in 32 bits, starting from
+// offset_const, then narrows to unsigned 16 bits with a saturating, rounding
+// right shift by COMPOUND_ROUND1_BITS. Taps 0 and 7 are not applied.
+static INLINE uint16x4_t
+convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);   // taps 0..3
+  const int16x4_t taps_hi = vget_high_s16(y_filter);  // taps 4..7
+
+  // Taps 0 and 7 are zero for this kernel, so they are skipped.
+  int32x4_t acc = vmlal_lane_s16(offset_const, s0, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);
+
+  return vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS);
+}
+
+// 6-tap vertical filter for one 8-lane row of the 2D compound convolution.
+//
+// The low and high halves of s0..s5 are accumulated separately in 32 bits
+// against taps 1..6 of y_filter, each starting from offset_const. Both halves
+// are then narrowed to unsigned 16 bits with a saturating, rounding right
+// shift by COMPOUND_ROUND1_BITS and recombined. Taps 0 and 7 are not applied.
+static INLINE uint16x8_t
+convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                 const int16x8_t y_filter, const int32x4_t offset_const) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter);   // taps 0..3
+  const int16x4_t taps_hi = vget_high_s16(y_filter);  // taps 4..7
+
+  // Taps 0 and 7 are zero for this kernel, so they are skipped.
+  int32x4_t acc_lo = vmlal_lane_s16(offset_const, vget_low_s16(s0), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
+
+  int32x4_t acc_hi =
+      vmlal_lane_s16(offset_const, vget_high_s16(s0), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);
+
+  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS);
+  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Vertical 6-tap pass of the 2D compound convolution, distance-weighted
+// average variant.
+//
+// Reads 16-bit rows from src_ptr, filters them vertically with
+// convolve6_*_2d_v, blends each result with the value already held in
+// conv_params->dst using conv_params->fwd_offset / bck_offset, and writes the
+// resulting 8-bit pixels to dst8_ptr. conv_params->dst is read but never
+// written by this function.
+//
+// w == 4 is handled as a single 4-column strip; any other width is consumed
+// in 8-column strips. On AArch64 four rows are produced per iteration,
+// otherwise one.
+static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+    int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+    ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  // Added to every filter accumulator before the narrowing shift.
+  const int32x4_t conv_bias = vdupq_n_s32(1 << offset_bits);
+  // Removed again by the averaging helpers before the final 8-bit narrowing.
+  const int16_t avg_bias = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+  const int16x8_t avg_bias_vec = vdupq_n_s16(avg_bias);
+
+  const uint16_t wt_fwd = conv_params->fwd_offset;
+  const uint16_t wt_bck = conv_params->bck_offset;
+
+  CONV_BUF_TYPE *comp_ptr = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+
+  if (w == 4) {
+    // Prime the sliding window with the first five rows.
+    int16x4_t r0, r1, r2, r3, r4;
+    load_s16_4x5(src_ptr, src_stride, &r0, &r1, &r2, &r3, &r4);
+
+    int16_t *in = src_ptr + 5 * src_stride;
+    CONV_BUF_TYPE *comp = comp_ptr;
+    uint8_t *out = dst8_ptr;
+    int rows_left = h;
+
+    do {
+#if AOM_ARCH_AARCH64
+      int16x4_t r5, r6, r7, r8;
+      load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+      const uint16x4_t f0 =
+          convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter, conv_bias);
+      const uint16x4_t f1 =
+          convolve6_4_2d_v(r1, r2, r3, r4, r5, r6, y_filter, conv_bias);
+      const uint16x4_t f2 =
+          convolve6_4_2d_v(r2, r3, r4, r5, r6, r7, y_filter, conv_bias);
+      const uint16x4_t f3 =
+          convolve6_4_2d_v(r3, r4, r5, r6, r7, r8, y_filter, conv_bias);
+
+      uint16x4_t p0, p1, p2, p3;
+      load_u16_4x4(comp, comp_stride, &p0, &p1, &p2, &p3);
+
+      uint8x8_t px01, px23;
+      compute_dist_wtd_avg_4x4(p0, p1, p2, p3, f0, f1, f2, f3, wt_fwd, wt_bck,
+                               avg_bias_vec, &px01, &px23);
+
+      store_u8x4_strided_x2(out + 0 * dst8_stride, dst8_stride, px01);
+      store_u8x4_strided_x2(out + 2 * dst8_stride, dst8_stride, px23);
+      out += 4 * dst8_stride;
+
+      // Slide the 5-row window down by four rows.
+      r0 = r4;
+      r1 = r5;
+      r2 = r6;
+      r3 = r7;
+      r4 = r8;
+      in += 4 * src_stride;
+      comp += 4 * comp_stride;
+      rows_left -= 4;
+#else   // !AOM_ARCH_AARCH64
+      const int16x4_t r5 = vld1_s16(in);
+
+      const uint16x4_t f0 =
+          convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter, conv_bias);
+
+      const uint16x4_t p0 = vld1_u16(comp);
+
+      uint8x8_t px0;
+      compute_dist_wtd_avg_4x1(p0, f0, wt_fwd, wt_bck,
+                               vget_low_s16(avg_bias_vec), &px0);
+
+      store_u8_4x1(out, px0);
+      out += dst8_stride;
+
+      // Slide the 5-row window down by one row.
+      r0 = r1;
+      r1 = r2;
+      r2 = r3;
+      r3 = r4;
+      r4 = r5;
+      in += src_stride;
+      comp += comp_stride;
+      rows_left--;
+#endif  // AOM_ARCH_AARCH64
+    } while (rows_left != 0);
+  } else {
+    int16_t *strip_in = src_ptr;
+    CONV_BUF_TYPE *strip_comp = comp_ptr;
+    uint8_t *strip_out = dst8_ptr;
+    int cols_left = w;
+
+    do {
+      int16_t *in = strip_in;
+      CONV_BUF_TYPE *comp = strip_comp;
+      uint8_t *out = strip_out;
+      int rows_left = h;
+
+      // Prime the sliding window with the first five rows of this strip.
+      int16x8_t r0, r1, r2, r3, r4;
+      load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
+      in += 5 * src_stride;
+
+      do {
+#if AOM_ARCH_AARCH64
+        int16x8_t r5, r6, r7, r8;
+        load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+        const uint16x8_t f0 =
+            convolve6_8_2d_v(r0, r1, r2, r3, r4, r5, y_filter, conv_bias);
+        const uint16x8_t f1 =
+            convolve6_8_2d_v(r1, r2, r3, r4, r5, r6, y_filter, conv_bias);
+        const uint16x8_t f2 =
+            convolve6_8_2d_v(r2, r3, r4, r5, r6, r7, y_filter, conv_bias);
+        const uint16x8_t f3 =
+            convolve6_8_2d_v(r3, r4, r5, r6, r7, r8, y_filter, conv_bias);
+
+        uint16x8_t p0, p1, p2, p3;
+        load_u16_8x4(comp, comp_stride, &p0, &p1, &p2, &p3);
+
+        uint8x8_t px0, px1, px2, px3;
+        compute_dist_wtd_avg_8x4(p0, p1, p2, p3, f0, f1, f2, f3, wt_fwd,
+                                 wt_bck, avg_bias_vec, &px0, &px1, &px2, &px3);
+
+        store_u8_8x4(out, dst8_stride, px0, px1, px2, px3);
+        out += 4 * dst8_stride;
+
+        // Slide the 5-row window down by four rows.
+        r0 = r4;
+        r1 = r5;
+        r2 = r6;
+        r3 = r7;
+        r4 = r8;
+        in += 4 * src_stride;
+        comp += 4 * comp_stride;
+        rows_left -= 4;
+#else   // !AOM_ARCH_AARCH64
+        const int16x8_t r5 = vld1q_s16(in);
+
+        const uint16x8_t f0 =
+            convolve6_8_2d_v(r0, r1, r2, r3, r4, r5, y_filter, conv_bias);
+
+        const uint16x8_t p0 = vld1q_u16(comp);
+
+        uint8x8_t px0;
+        compute_dist_wtd_avg_8x1(p0, f0, wt_fwd, wt_bck, avg_bias_vec, &px0);
+
+        vst1_u8(out, px0);
+        out += dst8_stride;
+
+        // Slide the 5-row window down by one row.
+        r0 = r1;
+        r1 = r2;
+        r2 = r3;
+        r3 = r4;
+        r4 = r5;
+        in += src_stride;
+        comp += comp_stride;
+        rows_left--;
+#endif  // AOM_ARCH_AARCH64
+      } while (rows_left != 0);
+
+      strip_in += 8;
+      strip_comp += 8;
+      strip_out += 8;
+      cols_left -= 8;
+    } while (cols_left != 0);
+  }
+}
+/*
+ * Vertical 6-tap pass, plain-average variant. Filters the int16 rows at
+ * src_ptr, combines every result with the value already held in
+ * conv_params->dst through compute_basic_avg_*(), and stores the 8-bit output
+ * to dst8_ptr. conv_params->dst is only read here.
+ *
+ * w == 4 uses a dedicated 4-lane path; any other width is consumed in
+ * 8-column strips, so it must be a multiple of 8 for the `w != 0` loop to
+ * stop. AArch64 builds produce 4 rows per iteration (h must be a multiple of
+ * 4); other builds produce 1 row per iteration.
+ * NOTE(review): convolve6_*_2d_v() and compute_basic_avg_*() are defined
+ * outside this excerpt, so their rounding behaviour is not restated here.
+ */
+static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ // 4-wide block: prime a 5-row window; each output row then needs one new row.
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x4_t d1 =
+ convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x4_t d2 =
+ convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x4_t d3 =
+ convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
+ store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
+ dst8_ptr += 4 * dst8_stride;
+ // Slide the 5-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8);
+ dst8_ptr += dst8_stride;
+ // Slide the 5-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {  // Wider blocks: one 8-column strip per outer iteration.
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+ // Slide the 5-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+ // Slide the 5-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+/*
+ * Vertical 6-tap pass, no-average variant. Filters the int16 rows at src_ptr
+ * and stores the unsigned 16-bit results straight into conv_params->dst; no
+ * 8-bit output is written.
+ *
+ * w == 4 uses a dedicated 4-lane path; any other width is consumed in
+ * 8-column strips, so it must be a multiple of 8 for the `w != 0` loop to
+ * stop. AArch64 builds produce 4 rows per iteration (h must be a multiple of
+ * 4); other builds produce 1 row per iteration.
+ * NOTE(review): convolve6_*_2d_v() is defined outside this excerpt.
+ */
+static INLINE void dist_wtd_convolve_2d_vert_6tap_neon(
+ int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+ const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ // 4-wide block: prime a 5-row window; each output row then needs one new row.
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x4_t d1 =
+ convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x4_t d2 =
+ convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x4_t d3 =
+ convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+ // Slide the 5-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 =
+ convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ vst1_u16(dst_ptr, d0);
+ // Slide the 5-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {  // Wider blocks: one 8-column strip per outer iteration.
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+ uint16x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const);
+ uint16x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const);
+ uint16x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide the 5-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+
+ uint16x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const);
+
+ vst1q_u16(d, d0);
+ // Slide the 5-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+/*
+ * 8-tap vertical filter for 4 columns. s0..s7 are eight consecutive input
+ * rows; tap i (lane i of y_filter) multiplies row s<i>. The products are
+ * accumulated in 32 bits on top of offset_const, then narrowed to unsigned
+ * 16 bits with a saturating, rounding right shift by COMPOUND_ROUND1_BITS.
+ */
+static INLINE uint16x4_t
+convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // Seed the accumulator with the offset so it is included before narrowing.
+ int32x4_t sum = offset_const;
+ sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+/*
+ * 8-tap vertical filter for 8 columns. Same arithmetic as the 4-column
+ * helper, applied separately to the low and high halves of each input row
+ * because the widening multiply-accumulate works on 4 lanes at a time. Both
+ * 32-bit sums start from offset_const and are narrowed to unsigned 16 bits
+ * with a saturating, rounding right shift by COMPOUND_ROUND1_BITS.
+ */
+static INLINE uint16x8_t
+convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter, const int32x4_t offset_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // Columns 0-3.
+ int32x4_t sum0 = offset_const;
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ // Columns 4-7.
+ int32x4_t sum1 = offset_const;
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+/*
+ * Vertical 8-tap pass, distance-weighted-average variant. Filters the int16
+ * rows at src_ptr, combines every result with the value already held in
+ * conv_params->dst through compute_dist_wtd_avg_*() using
+ * conv_params->fwd_offset and conv_params->bck_offset, and stores the 8-bit
+ * output to dst8_ptr. conv_params->dst is only read here.
+ *
+ * w == 4 uses a dedicated 4-lane path; any other width is consumed in
+ * 8-column strips, so it must be a multiple of 8 for the `w != 0` loop to
+ * stop. AArch64 builds produce 4 rows per iteration (h must be a multiple of
+ * 4); other builds produce 1 row per iteration.
+ * NOTE(review): compute_dist_wtd_avg_*() is defined outside this excerpt, so
+ * which operand each weight applies to is not restated here.
+ */
+static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ // 4-wide block: prime a 7-row window; each output row then needs one new row.
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
+ store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
+ dst8_ptr += 4 * dst8_stride;
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
+ vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8);
+ dst8_ptr += dst8_stride;
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {  // Wider blocks: one 8-column strip per outer iteration.
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
+ bck_offset, round_offset_vec, &d0_u8, &d1_u8,
+ &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
+ round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+/*
+ * Vertical 8-tap pass, plain-average variant. Filters the int16 rows at
+ * src_ptr, combines every result with the value already held in
+ * conv_params->dst through compute_basic_avg_*(), and stores the 8-bit output
+ * to dst8_ptr. conv_params->dst is only read here.
+ *
+ * w == 4 uses a dedicated 4-lane path; any other width is consumed in
+ * 8-column strips, so it must be a multiple of 8 for the `w != 0` loop to
+ * stop. AArch64 builds produce 4 rows per iteration (h must be a multiple of
+ * 4); other builds produce 1 row per iteration.
+ * NOTE(review): compute_basic_avg_*() is defined outside this excerpt, so
+ * its rounding behaviour is not restated here.
+ */
+static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon(
+ int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride,
+ ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ // 4-wide block: prime a 7-row window; each output row then needs one new row.
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x4_t dd0, dd1, dd2, dd3;
+ load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d01_u8, d23_u8;
+ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d01_u8, &d23_u8);
+
+ store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
+ store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);
+ dst8_ptr += 4 * dst8_stride;
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ uint16x4_t dd0 = vld1_u16(dst_ptr);
+
+ uint8x8_t d01_u8;
+ compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8);
+
+ store_u8_4x1(dst8_ptr, d01_u8);
+ dst8_ptr += dst8_stride;
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {  // Wider blocks: one 8-column strip per outer iteration.
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ uint8_t *d_u8 = dst8_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ uint16x8_t dd0, dd1, dd2, dd3;
+ load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
+ compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
+ round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
+
+ store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
+ d_u8 += 4 * dst8_stride;
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ uint16x8_t dd0 = vld1q_u16(d);
+
+ uint8x8_t d0_u8;
+ compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
+
+ vst1_u8(d_u8, d0_u8);
+ d_u8 += dst8_stride;
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst8_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+/*
+ * Vertical 8-tap pass, no-average variant. Filters the int16 rows at src_ptr
+ * and stores the unsigned 16-bit results straight into conv_params->dst; no
+ * 8-bit output is written.
+ *
+ * w == 4 uses a dedicated 4-lane path; any other width is consumed in
+ * 8-column strips, so it must be a multiple of 8 for the `w != 0` loop to
+ * stop. AArch64 builds produce 4 rows per iteration (h must be a multiple of
+ * 4); other builds produce 1 row per iteration.
+ */
+static INLINE void dist_wtd_convolve_2d_vert_8tap_neon(
+ int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params,
+ const int16x8_t y_filter, int h, int w) {
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+
+ CONV_BUF_TYPE *dst_ptr = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ // 4-wide block: prime a 7-row window; each output row then needs one new row.
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+ uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ offset_const);
+ uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ offset_const);
+ uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+
+ uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ offset_const);
+
+ vst1_u16(dst_ptr, d0);
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {  // Wider blocks: one 8-column strip per outer iteration.
+ do {
+ int16_t *s = src_ptr;
+ CONV_BUF_TYPE *d = dst_ptr;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+ uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_const);
+ uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_const);
+ uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_const);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide the 7-row window down past the 4 rows just produced.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+
+ uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_const);
+
+ vst1q_u16(d, d0);
+ // Slide the 7-row window down by one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+#endif // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
new file mode 100644
index 0000000000..3aeffbb0e6
--- /dev/null
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+/*
+ * Byte-index tables for the table-lookup permutes in this file. Each 16-byte
+ * row holds four overlapping 4-byte windows (start offsets n, n+1, n+2, n+3)
+ * so that one 4-way dot product per 32-bit lane yields four adjacent filter
+ * outputs. Row 0 starts at sample 0, row 1 at sample 4, row 2 at sample 8.
+ * The 4-tap paths load only row 0; the 8-tap paths load all three rows.
+ */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+/*
+ * Horizontal 4-tap filter producing 4 signed 16-bit outputs from one row of
+ * 16 unsigned 8-bit samples. range_limit is subtracted from every sample
+ * before the signed dot product, and `correction` seeds the accumulator to
+ * compensate for that bias (the caller in this file also folds its offset
+ * and rounding shim into it). Only lane 0 of x_filter, i.e. its first four
+ * 8-bit taps, is used. The result is narrowed with a non-rounding right
+ * shift by ROUND0_BITS - 1.
+ */
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+/*
+ * Horizontal 8-tap filter producing 8 signed 16-bit outputs from one row of
+ * 16 unsigned 8-bit samples. range_limit is subtracted from every sample
+ * before the signed dot products, and `correction` seeds both accumulators
+ * to compensate for that bias. Taps 0-3 (lane 0 of x_filter) and taps 4-7
+ * (lane 1) are applied to sample windows 4 bytes apart, so the middle
+ * permuted vector is shared by both halves of the output. The sums are
+ * narrowed with a non-rounding right shift by ROUND0_BITS - 1.
+ */
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+/*
+ * Horizontal pass of the 2D compound convolution using dot-product
+ * instructions. Reads im_h rows of 8-bit samples from src and writes signed
+ * 16-bit intermediates to im_block (row pitch im_stride).
+ *
+ * w == 4 uses taps 2..5 of x_filter_ptr as a 4-tap filter and advances the
+ * source by 2 samples to match; other widths use all 8 taps and step 8
+ * columns at a time. Every row access loads 16 bytes, so reads extend past
+ * the columns actually filtered.
+ *
+ * Rows are handled 4 at a time while more than 4 remain, then singly. Both
+ * phases are do/while loops that run at least once, so im_h must be at least
+ * 5; the caller in this file passes h + clamped_y_taps - 1 with
+ * clamped_y_taps >= 6.
+ * NOTE(review): correction_s32 is presumably 128 * sum(halved taps), which
+ * holds if FILTER_BITS is 7 - confirm against the definition.
+ */
+static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ const int16_t *x_filter_ptr, const int im_h, int w) {
+ const int bd = 8;
+ const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
+ // Dot product constants and other shims.
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // Fold horiz_const into the dot-product filter correction constant. The
+ // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
+ // rounding shifts - which are generally faster than rounding shifts on
+ // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+ src_ptr += 2;
+ // Main loop: 4 rows per iteration while more than 4 rows remain.
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: the remaining rows, one per iteration.
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+ // Main loop: 4 rows per iteration while more than 4 rows remain.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: the remaining rows, one per iteration.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+/*
+ * Entry point for the 8-bit 2D compound convolution, dot-product build.
+ *
+ * Step 1: the horizontal pass filters h + clamped_y_taps - 1 source rows
+ * into the stack buffer im_block (row pitch MAX_SB_SIZE). The source pointer
+ * is moved up by clamped_y_taps / 2 - 1 rows and left by
+ * filter_params_x->taps / 2 - 1 columns first, so the filter support is
+ * read from around the block.
+ * Step 2: a vertical pass is selected from the tap count and conv_params:
+ * - the 6-tap routines when the vertical tap count, raised to at least 6,
+ *   equals 6, otherwise the 8-tap routines;
+ * - do_average with use_dist_wtd_comp_avg: weighted average into dst8;
+ * - do_average alone: plain average into dst8;
+ * - neither: 16-bit results stored to conv_params->dst, dst8 untouched.
+ *
+ * w and h are asserted to be multiples of 4.
+ */
+void av1_dist_wtd_convolve_2d_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ // Vertical filters reporting fewer than 6 taps are run through the 6-tap path.
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+ dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block,
+ im_stride, x_filter_ptr, im_h, w);
+
+ if (clamped_y_taps == 6) {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ if (conv_params->do_average) {
+ if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+ dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+ im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
+ w);
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+ dst8_stride, conv_params,
+ y_filter, h, w);
+ }
+ } else {
+ dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+ y_filter, h, w);
+ }
+ }
+}
+/*
+ * Horizontal 4-tap filter for the 1D (x-only) compound path. Same
+ * arithmetic as the 2D 4-tap helper in this file - bias by range_limit,
+ * permute, dot product accumulated on `correction`, non-rounding narrowing
+ * shift by ROUND0_BITS - 1 - but the 16-bit lanes are reinterpreted as
+ * unsigned on return. Only lane 0 of x_filter is used.
+ */
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
+}
+/*
+ * Horizontal 8-tap filter for the 1D (x-only) compound path. Same
+ * arithmetic as the 2D 8-tap helper in this file - bias by range_limit,
+ * three permutes, two dot products per output half accumulated on
+ * `correction`, non-rounding narrowing shift by ROUND0_BITS - 1 - but the
+ * 16-bit lanes are reinterpreted as unsigned on return.
+ */
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+ const int8x8_t x_filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+ return vreinterpretq_u16_s16(res);
+}
+
+// Horizontal compound convolution (dot-product path), weighted-average
+// variant: each 4-row group is filtered, handed to compute_dist_wtd_avg_*
+// together with the values already stored in conv_params->dst and the
+// fwd_offset/bck_offset weights, and the resulting 8-bit pixels are written
+// to dst8. Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Averaging weights supplied by the caller.
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t kernel_s16 = vld1q_s16(kernel);
+
+  // Constants for the signed dot product: samples are shifted down by 128,
+  // so the accumulator is pre-loaded with sum(taps) << (FILTER_BITS - 1) to
+  // compensate.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t clamp_fixup =
+      vaddlvq_s16(vshlq_n_s16(kernel_s16, FILTER_BITS - 1));
+  // round_offset is folded into the same constant, along with a shim of
+  // 1 << ((ROUND0_BITS - 1) - 1) so that a plain (non-rounding) shift can be
+  // used afterwards. The extra -1 accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init =
+      vdupq_n_s32(clamp_fixup + offset_term + rounding_shim);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *ref_row = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  uint8_t *out_row = dst8;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 =
+          convolve4_4_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f1 =
+          convolve4_4_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f2 =
+          convolve4_4_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f3 =
+          convolve4_4_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+      // Values previously stored in the compound buffer.
+      uint16x4_t p0, p1, p2, p3;
+      load_u16_4x4(ref_row, ref_stride, &p0, &p1, &p2, &p3);
+
+      uint8x8_t px01, px23;
+      compute_dist_wtd_avg_4x4(p0, p1, p2, p3, f0, f1, f2, f3, fwd_offset,
+                               bck_offset, round_offset_vec, &px01, &px23);
+
+      store_u8x4_strided_x2(out_row, dst8_stride, px01);
+      store_u8x4_strided_x2(out_row + 2 * dst8_stride, dst8_stride, px23);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(kernel_s16, 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *ref_col = ref_row;
+      uint8_t *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 =
+            convolve8_8_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f1 =
+            convolve8_8_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f2 =
+            convolve8_8_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f3 =
+            convolve8_8_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+        uint16x8_t p0, p1, p2, p3;
+        load_u16_8x4(ref_col, ref_stride, &p0, &p1, &p2, &p3);
+
+        uint8x8_t px0, px1, px2, px3;
+        compute_dist_wtd_avg_8x4(p0, p1, p2, p3, f0, f1, f2, f3, fwd_offset,
+                                 bck_offset, round_offset_vec, &px0, &px1,
+                                 &px2, &px3);
+
+        store_u8_8x4(out_col, dst8_stride, px0, px1, px2, px3);
+
+        src_col += 8;
+        ref_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Horizontal compound convolution (dot-product path), basic-average variant:
+// each 4-row group is filtered, handed to compute_basic_avg_* together with
+// the values already stored in conv_params->dst, and the resulting 8-bit
+// pixels are written to dst8. Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_avg_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t kernel_s16 = vld1q_s16(kernel);
+
+  // Constants for the signed dot product: samples are shifted down by 128,
+  // so the accumulator is pre-loaded with sum(taps) << (FILTER_BITS - 1) to
+  // compensate.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t clamp_fixup =
+      vaddlvq_s16(vshlq_n_s16(kernel_s16, FILTER_BITS - 1));
+  // round_offset is folded into the same constant, along with a shim of
+  // 1 << ((ROUND0_BITS - 1) - 1) so that a plain (non-rounding) shift can be
+  // used afterwards. The extra -1 accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init =
+      vdupq_n_s32(clamp_fixup + offset_term + rounding_shim);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *ref_row = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  uint8_t *out_row = dst8;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 =
+          convolve4_4_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f1 =
+          convolve4_4_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f2 =
+          convolve4_4_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f3 =
+          convolve4_4_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+      // Values previously stored in the compound buffer.
+      uint16x4_t p0, p1, p2, p3;
+      load_u16_4x4(ref_row, ref_stride, &p0, &p1, &p2, &p3);
+
+      uint8x8_t px01, px23;
+      compute_basic_avg_4x4(p0, p1, p2, p3, f0, f1, f2, f3, round_offset_vec,
+                            &px01, &px23);
+
+      store_u8x4_strided_x2(out_row, dst8_stride, px01);
+      store_u8x4_strided_x2(out_row + 2 * dst8_stride, dst8_stride, px23);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(kernel_s16, 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *ref_col = ref_row;
+      uint8_t *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 =
+            convolve8_8_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f1 =
+            convolve8_8_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f2 =
+            convolve8_8_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f3 =
+            convolve8_8_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+        uint16x8_t p0, p1, p2, p3;
+        load_u16_8x4(ref_col, ref_stride, &p0, &p1, &p2, &p3);
+
+        uint8x8_t px0, px1, px2, px3;
+        compute_basic_avg_8x4(p0, p1, p2, p3, f0, f1, f2, f3,
+                              round_offset_vec, &px0, &px1, &px2, &px3);
+
+        store_u8_8x4(out_col, dst8_stride, px0, px1, px2, px3);
+
+        src_col += 8;
+        ref_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Horizontal compound convolution (dot-product path), no averaging: the
+// filtered 16-bit results are stored directly into conv_params->dst.
+// Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_neon_dotprod(
+    const uint8_t *src, int src_stride, int w, int h,
+    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16x8_t kernel_s16 = vld1q_s16(kernel);
+
+  // Constants for the signed dot product: samples are shifted down by 128,
+  // so the accumulator is pre-loaded with sum(taps) << (FILTER_BITS - 1) to
+  // compensate.
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const int32_t clamp_fixup =
+      vaddlvq_s16(vshlq_n_s16(kernel_s16, FILTER_BITS - 1));
+  // round_offset is folded into the same constant, along with a shim of
+  // 1 << ((ROUND0_BITS - 1) - 1) so that a plain (non-rounding) shift can be
+  // used afterwards. The extra -1 accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init =
+      vdupq_n_s32(clamp_fixup + offset_term + rounding_shim);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *out_row = conv_params->dst;
+  const int out_stride = conv_params->dst_stride;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 =
+          convolve4_4_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f1 =
+          convolve4_4_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f2 =
+          convolve4_4_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+      uint16x4_t f3 =
+          convolve4_4_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+      store_u16_4x4(out_row, out_stride, f0, f1, f2, f3);
+
+      src_row += 4 * src_stride;
+      out_row += 4 * out_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(kernel_s16, 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 =
+            convolve8_8_x(r0, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f1 =
+            convolve8_8_x(r1, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f2 =
+            convolve8_8_x(r2, x_filter, acc_init, range_limit, permute_tbl);
+        uint16x8_t f3 =
+            convolve8_8_x(r3, x_filter, acc_init, range_limit, permute_tbl);
+
+        store_u16_8x4(out_col, out_stride, f0, f1, f2, f3);
+
+        src_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      out_row += 4 * out_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Entry point for the horizontal compound convolution (dot-product path).
+// Picks one of three kernels based on conv_params->do_average and
+// conv_params->use_dist_wtd_comp_avg.
+void av1_dist_wtd_convolve_x_neon_dotprod(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  // No averaging requested: only the compound buffer is written.
+  if (!conv_params->do_average) {
+    dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x,
+                                     subpel_x_qn, conv_params);
+    return;
+  }
+
+  // Weighted averaging is marked as the unlikely case.
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
+        src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
+        subpel_x_qn, conv_params);
+    return;
+  }
+
+  dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride, w,
+                                       h, filter_params_x, subpel_x_qn,
+                                       conv_params);
+}
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c
new file mode 100644
index 0000000000..a72af9e36a
--- /dev/null
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/arm/compound_convolve_neon.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {  // Byte indices for vqtbl1q_u8: each row gathers four overlapping 4-byte windows.
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,  // windows starting at bytes 0, 1, 2, 3
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,  // windows starting at bytes 4, 5, 6, 7
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14  // windows starting at bytes 8, 9, 10, 11
+};
+
+// Horizontal pass of the 2D convolution for 4-wide blocks: applies the
+// (halved) 4-tap kernel in lane 0 of x_filter to one row and returns four
+// signed 16-bit results. 'horiz_const' seeds the accumulator.
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const uint8x16_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Gather the four overlapping 4-sample windows, one per 32-bit lane.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t windows = vqtbl1q_u8(samples, permute_tbl);
+
+  // Mixed-sign dot product: unsigned samples against signed taps.
+  const int32x4_t acc = vusdotq_lane_s32(horiz_const, windows, x_filter, 0);
+
+  // The filter taps were halved, so shift by one bit less than ROUND0_BITS.
+  return vshrn_n_s32(acc, ROUND0_BITS - 1);
+}
+
+// Horizontal pass of the 2D convolution for blocks wider than 4: applies the
+// (halved) 8-tap kernel in x_filter to one 16-byte row and returns eight
+// signed 16-bit results. 'horiz_const' seeds both accumulators.
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t x_filter,
+                                         const uint8x16x3_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Gather the overlapping 4-sample windows consumed by each 32-bit lane.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Outputs 0..3: taps 0-3 against win_a, taps 4-7 against win_b.
+  int32x4_t acc_lo = vusdotq_lane_s32(horiz_const, win_a, x_filter, 0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_b, x_filter, 1);
+
+  // Outputs 4..7: taps 0-3 against win_b, taps 4-7 against win_c.
+  int32x4_t acc_hi = vusdotq_lane_s32(horiz_const, win_b, x_filter, 0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_c, x_filter, 1);
+
+  // The filter taps were halved, so shift by one bit less than ROUND0_BITS
+  // while narrowing to 16 bits.
+  const int16x4_t out_lo = vshrn_n_s32(acc_lo, ROUND0_BITS - 1);
+  const int16x4_t out_hi = vshrn_n_s32(acc_hi, ROUND0_BITS - 1);
+  return vcombine_s16(out_lo, out_hi);
+}
+
+// Horizontal pass of the 2D compound convolution (i8mm path). Filters im_h
+// rows of src into the 16-bit intermediate buffer im_block. Rows are handled
+// four at a time while more than four remain; the rest are done one by one.
+static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm(
+    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+    const int16_t *x_filter_ptr, const int im_h, int w) {
+  const int bd = 8;
+  // A shim of 1 << ((ROUND0_BITS - 1) - 1) lets the helpers use plain shifts
+  // instead of rounding shifts. The extra -1 accounts for the halved taps.
+  const int32_t offset_term = 1 << (bd + FILTER_BITS - 2);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t horiz_const = vdupq_n_s32(offset_term + rounding_shim);
+
+  const uint8_t *src_row = src;
+  int16_t *im_row = im_block;
+  const int row_pitch = im_stride;
+  int rows_left = im_h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    // Four rows per iteration.
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      int16x4_t f0 = convolve4_4_2d_h(r0, x_filter, permute_tbl, horiz_const);
+      int16x4_t f1 = convolve4_4_2d_h(r1, x_filter, permute_tbl, horiz_const);
+      int16x4_t f2 = convolve4_4_2d_h(r2, x_filter, permute_tbl, horiz_const);
+      int16x4_t f3 = convolve4_4_2d_h(r3, x_filter, permute_tbl, horiz_const);
+
+      store_s16_4x4(im_row, row_pitch, f0, f1, f2, f3);
+
+      src_row += 4 * src_stride;
+      im_row += 4 * row_pitch;
+      rows_left -= 4;
+    } while (rows_left > 4);
+
+    // Remaining rows, one at a time.
+    do {
+      const uint8x16_t r0 = vld1q_u8(src_row);
+      const int16x4_t f0 =
+          convolve4_4_2d_h(r0, x_filter, permute_tbl, horiz_const);
+
+      vst1_s16(im_row, f0);
+
+      src_row += src_stride;
+      im_row += row_pitch;
+      rows_left -= 1;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    // Four rows per iteration, in 8-pixel-wide tiles.
+    do {
+      const uint8_t *src_col = src_row;
+      int16_t *im_col = im_row;
+      int cols_left = w;
+
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        int16x8_t f0 =
+            convolve8_8_2d_h(r0, x_filter, permute_tbl, horiz_const);
+        int16x8_t f1 =
+            convolve8_8_2d_h(r1, x_filter, permute_tbl, horiz_const);
+        int16x8_t f2 =
+            convolve8_8_2d_h(r2, x_filter, permute_tbl, horiz_const);
+        int16x8_t f3 =
+            convolve8_8_2d_h(r3, x_filter, permute_tbl, horiz_const);
+
+        store_s16_8x4(im_col, row_pitch, f0, f1, f2, f3);
+
+        src_col += 8;
+        im_col += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+
+      src_row += 4 * src_stride;
+      im_row += 4 * row_pitch;
+      rows_left -= 4;
+    } while (rows_left > 4);
+
+    // Remaining rows, one at a time.
+    do {
+      const uint8_t *src_col = src_row;
+      int16_t *im_col = im_row;
+      int cols_left = w;
+
+      do {
+        const uint8x16_t r0 = vld1q_u8(src_col);
+        const int16x8_t f0 =
+            convolve8_8_2d_h(r0, x_filter, permute_tbl, horiz_const);
+
+        vst1q_s16(im_col, f0);
+
+        src_col += 8;
+        im_col += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+
+      src_row += src_stride;
+      im_row += row_pitch;
+      rows_left -= 1;
+    } while (rows_left != 0);
+  }
+}
+
+// Entry point for the 2D compound convolution (i8mm path). Runs the
+// horizontal pass into a stack-allocated intermediate block, then dispatches
+// to a 6-tap or 8-tap vertical kernel, in plain, basic-average or
+// weighted-average flavour depending on conv_params.
+// Requires w and h to be multiples of 4.
+void av1_dist_wtd_convolve_2d_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  // Vertical filters shorter than 6 taps are treated as 6-tap.
+  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+
+  // The intermediate block needs extra rows for the vertical filter.
+  const int im_stride = MAX_SB_SIZE;
+  const int im_h = h + clamped_y_taps - 1;
+
+  // Step back to the first sample touched by the two filters.
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const int vert_offset = clamped_y_taps / 2 - 1;
+  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+
+  dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride,
+                                       x_filter_ptr, im_h, w);
+
+  const int six_tap = (clamped_y_taps == 6);
+
+  // No averaging requested: only the compound buffer is written.
+  if (!conv_params->do_average) {
+    if (six_tap) {
+      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
+                                          y_filter, h, w);
+    }
+    return;
+  }
+
+  // Weighted averaging is marked as the unlikely case.
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    if (six_tap) {
+      dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
+          im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w);
+    } else {
+      dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
+          im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w);
+    }
+    return;
+  }
+
+  if (six_tap) {
+    dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
+                                            dst8_stride, conv_params, y_filter,
+                                            h, w);
+  } else {
+    dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
+                                            dst8_stride, conv_params, y_filter,
+                                            h, w);
+  }
+}
+
+// Applies the (halved) 4-tap kernel in lane 0 of x_filter to one row and
+// returns four 16-bit results. 'round_offset' seeds the accumulator.
+static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const uint8x16_t permute_tbl,
+                                       const int32x4_t round_offset) {
+  // Gather the four overlapping 4-sample windows, one per 32-bit lane.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t windows = vqtbl1q_u8(samples, permute_tbl);
+
+  // Mixed-sign dot product: unsigned samples against signed taps.
+  const int32x4_t acc = vusdotq_lane_s32(round_offset, windows, x_filter, 0);
+
+  // The filter taps were halved, so shift by one bit less than ROUND0_BITS.
+  const int16x4_t narrowed = vshrn_n_s32(acc, ROUND0_BITS - 1);
+  return vreinterpret_u16_s16(narrowed);
+}
+
+// Applies the (halved) 8-tap kernel in x_filter to one 16-byte row and
+// returns eight 16-bit results. 'round_offset' seeds both accumulators.
+static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
+                                       const int8x8_t x_filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t round_offset) {
+  // Gather the overlapping 4-sample windows consumed by each 32-bit lane.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Outputs 0..3: taps 0-3 against win_a, taps 4-7 against win_b.
+  int32x4_t acc_lo = vusdotq_lane_s32(round_offset, win_a, x_filter, 0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_b, x_filter, 1);
+
+  // Outputs 4..7: taps 0-3 against win_b, taps 4-7 against win_c.
+  int32x4_t acc_hi = vusdotq_lane_s32(round_offset, win_b, x_filter, 0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_c, x_filter, 1);
+
+  // The filter taps were halved, so shift by one bit less than ROUND0_BITS
+  // while narrowing to 16 bits.
+  const int16x4_t out_lo = vshrn_n_s32(acc_lo, ROUND0_BITS - 1);
+  const int16x4_t out_hi = vshrn_n_s32(acc_hi, ROUND0_BITS - 1);
+  return vreinterpretq_u16_s16(vcombine_s16(out_lo, out_hi));
+}
+
+// Horizontal compound convolution (i8mm path), weighted-average variant:
+// each 4-row group is filtered, handed to compute_dist_wtd_avg_* together
+// with the values already stored in conv_params->dst and the
+// fwd_offset/bck_offset weights, and the resulting 8-bit pixels are written
+// to dst8. Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Accumulator seed: round_offset plus a shim of 1 << ((ROUND0_BITS - 1) - 1)
+  // so that a plain (non-rounding) shift can be used afterwards. The extra -1
+  // accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init = vdupq_n_s32(offset_term + rounding_shim);
+
+  // Averaging weights supplied by the caller.
+  const uint16_t fwd_offset = conv_params->fwd_offset;
+  const uint16_t bck_offset = conv_params->bck_offset;
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *ref_row = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  uint8_t *out_row = dst8;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 = convolve4_4_x(r0, x_filter, permute_tbl, acc_init);
+      uint16x4_t f1 = convolve4_4_x(r1, x_filter, permute_tbl, acc_init);
+      uint16x4_t f2 = convolve4_4_x(r2, x_filter, permute_tbl, acc_init);
+      uint16x4_t f3 = convolve4_4_x(r3, x_filter, permute_tbl, acc_init);
+
+      // Values previously stored in the compound buffer.
+      uint16x4_t p0, p1, p2, p3;
+      load_u16_4x4(ref_row, ref_stride, &p0, &p1, &p2, &p3);
+
+      uint8x8_t px01, px23;
+      compute_dist_wtd_avg_4x4(p0, p1, p2, p3, f0, f1, f2, f3, fwd_offset,
+                               bck_offset, round_offset_vec, &px01, &px23);
+
+      store_u8x4_strided_x2(out_row, dst8_stride, px01);
+      store_u8x4_strided_x2(out_row + 2 * dst8_stride, dst8_stride, px23);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(kernel), 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *ref_col = ref_row;
+      uint8_t *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 = convolve8_8_x(r0, x_filter, permute_tbl, acc_init);
+        uint16x8_t f1 = convolve8_8_x(r1, x_filter, permute_tbl, acc_init);
+        uint16x8_t f2 = convolve8_8_x(r2, x_filter, permute_tbl, acc_init);
+        uint16x8_t f3 = convolve8_8_x(r3, x_filter, permute_tbl, acc_init);
+
+        uint16x8_t p0, p1, p2, p3;
+        load_u16_8x4(ref_col, ref_stride, &p0, &p1, &p2, &p3);
+
+        uint8x8_t px0, px1, px2, px3;
+        compute_dist_wtd_avg_8x4(p0, p1, p2, p3, f0, f1, f2, f3, fwd_offset,
+                                 bck_offset, round_offset_vec, &px0, &px1,
+                                 &px2, &px3);
+
+        store_u8_8x4(out_col, dst8_stride, px0, px1, px2, px3);
+
+        src_col += 8;
+        ref_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Horizontal compound convolution (i8mm path), basic-average variant: each
+// 4-row group is filtered, handed to compute_basic_avg_* together with the
+// values already stored in conv_params->dst, and the resulting 8-bit pixels
+// are written to dst8. Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_avg_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
+
+  // Accumulator seed: round_offset plus a shim of 1 << ((ROUND0_BITS - 1) - 1)
+  // so that a plain (non-rounding) shift can be used afterwards. The extra -1
+  // accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init = vdupq_n_s32(offset_term + rounding_shim);
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *ref_row = conv_params->dst;
+  const int ref_stride = conv_params->dst_stride;
+  uint8_t *out_row = dst8;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 = convolve4_4_x(r0, x_filter, permute_tbl, acc_init);
+      uint16x4_t f1 = convolve4_4_x(r1, x_filter, permute_tbl, acc_init);
+      uint16x4_t f2 = convolve4_4_x(r2, x_filter, permute_tbl, acc_init);
+      uint16x4_t f3 = convolve4_4_x(r3, x_filter, permute_tbl, acc_init);
+
+      // Values previously stored in the compound buffer.
+      uint16x4_t p0, p1, p2, p3;
+      load_u16_4x4(ref_row, ref_stride, &p0, &p1, &p2, &p3);
+
+      uint8x8_t px01, px23;
+      compute_basic_avg_4x4(p0, p1, p2, p3, f0, f1, f2, f3, round_offset_vec,
+                            &px01, &px23);
+
+      store_u8x4_strided_x2(out_row, dst8_stride, px01);
+      store_u8x4_strided_x2(out_row + 2 * dst8_stride, dst8_stride, px23);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(kernel), 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *ref_col = ref_row;
+      uint8_t *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 = convolve8_8_x(r0, x_filter, permute_tbl, acc_init);
+        uint16x8_t f1 = convolve8_8_x(r1, x_filter, permute_tbl, acc_init);
+        uint16x8_t f2 = convolve8_8_x(r2, x_filter, permute_tbl, acc_init);
+        uint16x8_t f3 = convolve8_8_x(r3, x_filter, permute_tbl, acc_init);
+
+        uint16x8_t p0, p1, p2, p3;
+        load_u16_8x4(ref_col, ref_stride, &p0, &p1, &p2, &p3);
+
+        uint8x8_t px0, px1, px2, px3;
+        compute_basic_avg_8x4(p0, p1, p2, p3, f0, f1, f2, f3,
+                              round_offset_vec, &px0, &px1, &px2, &px3);
+
+        store_u8_8x4(out_col, dst8_stride, px0, px1, px2, px3);
+
+        src_col += 8;
+        ref_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      ref_row += 4 * ref_stride;
+      out_row += 4 * dst8_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Horizontal compound convolution (i8mm path), no averaging: the filtered
+// 16-bit results are stored directly into conv_params->dst.
+// Requires w and h to be multiples of 4.
+static INLINE void dist_wtd_convolve_x_neon_i8mm(
+    const uint8_t *src, int src_stride, int w, int h,
+    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  // Compound rounding offset for 8-bit content.
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+  const int round1_bits = offset_bits - COMPOUND_ROUND1_BITS;
+  const int16_t round_offset = (1 << round1_bits) + (1 << (round1_bits - 1));
+
+  // Accumulator seed: round_offset plus a shim of 1 << ((ROUND0_BITS - 1) - 1)
+  // so that a plain (non-rounding) shift can be used afterwards. The extra -1
+  // accounts for the halved filter taps.
+  const int32_t offset_term = round_offset << (ROUND0_BITS - 1);
+  const int32_t rounding_shim = 1 << ((ROUND0_BITS - 1) - 1);
+  const int32x4_t acc_init = vdupq_n_s32(offset_term + rounding_shim);
+
+  // Select the horizontal kernel for this sub-pixel position.
+  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_row = src - horiz_offset;
+  CONV_BUF_TYPE *out_row = conv_params->dst;
+  const int out_stride = conv_params->dst_stride;
+  int rows_left = h;
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // Width-4 blocks use 4-tap filters: keep the middle four taps only.
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int16x8_t mid_taps =
+        vcombine_s16(vld1_s16(kernel + 2), vdup_n_s16(0));
+    const int8x8_t x_filter = vshrn_n_s16(mid_taps, 1);
+
+    // Skip the two leading samples that the dropped taps would have used.
+    src_row += 2;
+
+    do {
+      uint8x16_t r0, r1, r2, r3;
+      load_u8_16x4(src_row, src_stride, &r0, &r1, &r2, &r3);
+
+      uint16x4_t f0 = convolve4_4_x(r0, x_filter, permute_tbl, acc_init);
+      uint16x4_t f1 = convolve4_4_x(r1, x_filter, permute_tbl, acc_init);
+      uint16x4_t f2 = convolve4_4_x(r2, x_filter, permute_tbl, acc_init);
+      uint16x4_t f3 = convolve4_4_x(r3, x_filter, permute_tbl, acc_init);
+
+      store_u16_4x4(out_row, out_stride, f0, f1, f2, f3);
+
+      src_row += 4 * src_stride;
+      out_row += 4 * out_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Taps are even, so halve them to reduce intermediate precision.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(kernel), 1);
+
+    do {
+      const uint8_t *src_col = src_row;
+      CONV_BUF_TYPE *out_col = out_row;
+      int cols_left = w;
+
+      // Walk across the row group in 8-pixel-wide, 4-row-tall tiles.
+      do {
+        uint8x16_t r0, r1, r2, r3;
+        load_u8_16x4(src_col, src_stride, &r0, &r1, &r2, &r3);
+
+        uint16x8_t f0 = convolve8_8_x(r0, x_filter, permute_tbl, acc_init);
+        uint16x8_t f1 = convolve8_8_x(r1, x_filter, permute_tbl, acc_init);
+        uint16x8_t f2 = convolve8_8_x(r2, x_filter, permute_tbl, acc_init);
+        uint16x8_t f3 = convolve8_8_x(r3, x_filter, permute_tbl, acc_init);
+
+        store_u16_8x4(out_col, out_stride, f0, f1, f2, f3);
+
+        src_col += 8;
+        out_col += 8;
+        cols_left -= 8;
+      } while (cols_left != 0);
+
+      src_row += 4 * src_stride;
+      out_row += 4 * out_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+  }
+}
+
+// Entry point for the horizontal compound convolution (i8mm path). Picks one
+// of three kernels based on conv_params->do_average and
+// conv_params->use_dist_wtd_comp_avg.
+void av1_dist_wtd_convolve_x_neon_i8mm(
+    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  // No averaging requested: only the compound buffer is written.
+  if (!conv_params->do_average) {
+    dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x,
+                                  subpel_x_qn, conv_params);
+    return;
+  }
+
+  // Weighted averaging is marked as the unlikely case.
+  if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
+    dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
+        src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn,
+        conv_params);
+    return;
+  }
+
+  dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w, h,
+                                    filter_params_x, subpel_x_qn, conv_params);
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
new file mode 100644
index 0000000000..10442f9bf9
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -0,0 +1,1659 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+
+// 12-tap horizontal filter producing four results at once. s0..s11 carry the
+// twelve 16-bit input samples for each lane; taps 0-7 are read from
+// x_filter_0_7 and taps 8-11 from x_filter_8_11. Products are accumulated in
+// 32 bits on top of horiz_const, then narrowed to 16 bits with a saturating,
+// rounding right shift by FILTER_BITS.
+static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x4_t s8, const int16x4_t s9,
+                                       const int16x4_t s10, const int16x4_t s11,
+                                       const int16x8_t x_filter_0_7,
+                                       const int16x4_t x_filter_8_11,
+                                       const int32x4_t horiz_const) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);
+
+  // Widening multiply-accumulate, seeded with the rounding constant.
+  int32x4_t acc = vmlal_lane_s16(horiz_const, s0, taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
+
+  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
+
+  acc = vmlal_lane_s16(acc, s8, x_filter_8_11, 0);
+  acc = vmlal_lane_s16(acc, s9, x_filter_8_11, 1);
+  acc = vmlal_lane_s16(acc, s10, x_filter_8_11, 2);
+  acc = vmlal_lane_s16(acc, s11, x_filter_8_11, 3);
+
+  const int16x4_t narrowed = vqrshrn_n_s32(acc, FILTER_BITS);
+  return narrowed;
+}
+
+// Horizontal 12-tap convolution of a w x h block of 8-bit pixels, writing 8-bit
+// results to dst_ptr. x_filter_ptr must point to at least 12 taps: eight are
+// loaded as one vector and four more from x_filter_ptr + 8. The caller in this
+// file passes src_ptr already moved back by taps / 2 - 1 samples.
+//
+// On AArch64 the block is handled four rows at a time in 4x4 output tiles;
+// elsewhere one row at a time, four pixels per iteration. Every loop exits only
+// when its counter reaches exactly zero, so w must be a positive multiple of 4,
+// h must be positive, and on AArch64 h must also be a multiple of 4.
+static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ const int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr) {
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+ // shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+ // ROUND0_BITS.
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
+
+#if AOM_ARCH_AARCH64
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int width = w;
+
+ // Prime the sliding window with source columns 0..10 of the current four
+ // rows (s0..s10).
+ // NOTE(review): this relies on transpose_elems_inplace_u8_8x4 leaving
+ // columns 0-3 in the low halves and columns 4-7 in the high halves of
+ // t0..t3 - confirm against transpose_neon.h.
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ // Columns 0..10 are consumed; s now points at the first unread column.
+ s += 11;
+
+ do {
+ // Four fresh columns (s11..s14) complete the 12-sample windows needed for
+ // four adjacent outputs in each of the four rows.
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ // d0..d3 are four consecutive output positions; each call shifts the
+ // 12-sample window by one column.
+ int16x4_t d0 =
+ convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+
+ // Undo the input transpose before storing (presumably back to one row per
+ // vector - confirm against transpose_neon.h).
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+ // Saturating narrow from signed 16-bit to unsigned 8-bit.
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+ store_u8x4_strided_x2(d, dst_stride, d01);
+ store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);
+
+ // Slide the window four columns to the right; s11..s14 are reloaded on the
+ // next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+#else // !AOM_ARCH_AARCH64
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ // One 16-byte load supplies samples a0..a15; a0..a14 are enough for four
+ // adjacent 12-tap outputs.
+ uint8x16_t t0 = vld1q_u8(s);
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s8 = vget_low_s16(tt8);
+ int16x4_t s12 = vget_high_s16(tt8);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
+ int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
+ int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
+ int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
+ int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
+ int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
+
+ int16x4_t d0 =
+ convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+
+ // Only the low four lanes hold results; the upper half is zero padding.
+ uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));
+
+ store_u8_4x1(d, dd0);
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+#endif // AOM_ARCH_AARCH64
+}
+
+// 4-tap horizontal filter producing four 8-bit results in the low half of the
+// returned vector (the upper half comes from zero padding). Accumulation stays
+// in 16 bits on top of horiz_const. `filter` is expected to hold halved taps,
+// which is why the final rounding shift is FILTER_BITS - 1.
+static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t filter,
+                                      const int16x4_t horiz_const) {
+  int16x4_t acc = vmla_lane_s16(horiz_const, s0, filter, 0);
+  acc = vmla_lane_s16(acc, s1, filter, 1);
+  acc = vmla_lane_s16(acc, s2, filter, 2);
+  acc = vmla_lane_s16(acc, s3, filter, 3);
+
+  // Widen to a full 8-lane vector so the narrowing shift can be applied.
+  const int16x8_t padded = vcombine_s16(acc, vdup_n_s16(0));
+
+  // Taps were halved, so shift by one bit less than FILTER_BITS.
+  return vqrshrun_n_s16(padded, FILTER_BITS - 1);
+}
+
+// 8-tap horizontal filter producing eight 8-bit results. s0..s7 carry the
+// eight 16-bit input samples for each lane. Accumulation stays in 16 bits on
+// top of horiz_const. `filter` is expected to hold halved taps, which is why
+// the final saturating, rounding shift is FILTER_BITS - 1.
+static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t s6, const int16x8_t s7,
+                                      const int16x8_t filter,
+                                      const int16x8_t horiz_const) {
+  const int16x4_t taps_0_3 = vget_low_s16(filter);
+  const int16x4_t taps_4_7 = vget_high_s16(filter);
+
+  int16x8_t acc = vmlaq_lane_s16(horiz_const, s0, taps_0_3, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_0_3, 3);
+
+  acc = vmlaq_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlaq_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_4_7, 3);
+
+  // Taps were halved, so shift by one bit less than FILTER_BITS.
+  const uint8x8_t result = vqrshrun_n_s16(acc, FILTER_BITS - 1);
+  return result;
+}
+
+// Horizontal single-reference convolution of a w x h block of 8-bit pixels.
+//
+// Blocks with w == 2 or h == 2 are handed to av1_convolve_x_sr_c. Filters with
+// more than 8 taps go to convolve_x_sr_12tap_neon. Otherwise, blocks with
+// w <= 4 use the four taps starting at index 2 of the kernel, and wider blocks
+// use all eight taps. In those two paths the taps are halved up front and the
+// final rounding shift is reduced by one bit to compensate.
+//
+// conv_params is only forwarded to the C fallback; the NEON paths do not read
+// it. In the wide path the inner loops step by 8 and exit only at exactly
+// zero, so w must be a multiple of 8 there.
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+
+ // Move back to the first sample covered by the filter window.
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ src -= horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ if (filter_params_x->taps > 8) {
+ convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr);
+ return;
+ }
+
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+ // rounding right shift by FILTER_BITS - instead of a first rounding right
+ // shift by ROUND0_BITS, followed by second rounding right shift by
+ // FILTER_BITS - ROUND0_BITS.
+ // The outermost -1 is needed because we will halve the filter values.
+ const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));
+
+ if (w <= 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+
+ // Taps 0 and 1 are not used here, so skip the two samples they would cover.
+ src += 2;
+
+ // One row of four output pixels per iteration.
+ do {
+ uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ uint8x8_t d0 =
+ convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const));
+
+ store_u8_4x1(dst, d0);
+
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ // Filter values are even so halve to reduce precision requirements.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+#if AOM_ARCH_AARCH64
+ // AArch64 only: process eight rows at a time in 8x8 output tiles while at
+ // least eight rows remain.
+ while (h >= 8) {
+ // Prime the sliding window with the first seven columns (s0..s6).
+ // NOTE(review): assumes transpose_elems_inplace_u8_8x8 turns eight rows
+ // into eight column vectors - confirm against transpose_neon.h.
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ int width = w;
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(d + 4 * dst_stride);
+ __builtin_prefetch(d + 5 * dst_stride);
+ __builtin_prefetch(d + 6 * dst_stride);
+ __builtin_prefetch(d + 7 * dst_stride);
+
+ do {
+ // Eight new columns (s7..s14) complete the windows for eight adjacent
+ // outputs in each of the eight rows. t7 from the outer scope is reused
+ // as the first destination.
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+ load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+
+ transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
+ &t14);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+
+ // Each call shifts the 8-sample window by one column.
+ uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const);
+ uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ horiz_const);
+ uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ horiz_const);
+ uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ horiz_const);
+ uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ horiz_const);
+ uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+
+ // Undo the input transpose before storing.
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ // Slide the window eight columns to the right; s7..s14 are reloaded on
+ // the next iteration.
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Remaining rows (every row on non-AArch64 builds), one row at a time and
+ // eight output pixels per inner iteration.
+ while (h-- != 0) {
+ uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ int width = w;
+ const uint8_t *s = src + 8;
+ uint8_t *d = dst;
+
+ __builtin_prefetch(d);
+
+ do {
+ uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ horiz_const);
+
+ vst1_u8(d, d0);
+
+ // The upper eight samples become the base of the next window.
+ s0 = s8;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+// 6-tap vertical filter over four lanes. s0..s5 are six vertically adjacent
+// rows; they are weighted by lanes 1..6 of y_filter_0_7 (lanes 0 and 7 are
+// never read). Returns the raw 16-bit sum: rounding and narrowing are left to
+// the caller.
+static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x8_t y_filter_0_7) {
+  const int16x4_t taps_lo = vget_low_s16(y_filter_0_7);
+  const int16x4_t taps_hi = vget_high_s16(y_filter_0_7);
+
+  // Filter values at indices 0 and 7 are 0, so start from lane 1.
+  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 1);
+  acc = vmla_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmla_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmla_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmla_lane_s16(acc, s4, taps_hi, 1);
+
+  return vmla_lane_s16(acc, s5, taps_hi, 2);
+}
+
+// 6-tap vertical filter over eight lanes, returning eight 8-bit pixels.
+// s0..s5 are six vertically adjacent rows, weighted by lanes 1..6 of
+// y_filters (lanes 0 and 7 are never read). y_filters is expected to hold
+// halved taps, which is why the final saturating, rounding shift is
+// FILTER_BITS - 1.
+static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t y_filters) {
+  const int16x4_t taps_lo = vget_low_s16(y_filters);
+  const int16x4_t taps_hi = vget_high_s16(y_filters);
+
+  // Filter values at indices 0 and 7 are 0, so start from lane 1.
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlaq_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlaq_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 2);
+
+  // Taps were halved, so shift by one bit less than FILTER_BITS.
+  const uint8x8_t pixels = vqrshrun_n_s16(acc, FILTER_BITS - 1);
+  return pixels;
+}
+
+// Vertical 6-tap convolution of a w x h block of 8-bit pixels, writing 8-bit
+// results to dst_ptr. Only lanes 1..6 of y_filter are used (see convolve6_4_y
+// and convolve6_8_y). Results are shifted by FILTER_BITS - 1, so y_filter is
+// expected to hold halved taps; the halving is not done in this function.
+//
+// Blocks with w <= 4 are handled as a single 4-wide column; wider blocks are
+// handled in 8-wide columns, so w must then be a multiple of 8. On AArch64 four
+// output rows are produced per iteration (h must be a positive multiple of 4);
+// elsewhere one row per iteration.
+static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ const int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ if (w <= 4) {
+ // Prime the sliding window with the first five rows (s0..s4).
+ uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ // Four new rows (s5..s8) yield four output rows per iteration.
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+
+ int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+ int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
+ int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+ // Slide the window down four rows; s5..s8 are reloaded next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ // One new row (s5) yields one output row per iteration.
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+
+ int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 =
+ vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8_4x1(dst_ptr, d01);
+
+ // Slide the window down one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+
+ } else {
+ // Outer loop: one 8-pixel-wide column of the block per iteration.
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int height = h;
+
+ // Prime the sliding window with the first five rows (s0..s4).
+ uint8x8_t t0, t1, t2, t3, t4;
+ load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ // Four new rows (s5..s8) yield four output rows per iteration.
+ uint8x8_t t5, t6, t7, t8;
+ load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);
+
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+
+ uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
+ uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
+ uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down four rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ // One new row (s5) yields one output row per iteration.
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
+
+ vst1_u8(d, d0);
+
+ // Slide the window down one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+// 8-tap vertical filter over four lanes. s0..s7 are eight vertically adjacent
+// rows, weighted by lanes 0..7 of `filter`. Returns the raw 16-bit sum:
+// rounding and narrowing are left to the caller.
+static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x4_t s6, const int16x4_t s7,
+                                      const int16x8_t filter) {
+  const int16x4_t taps_0_3 = vget_low_s16(filter);
+  const int16x4_t taps_4_7 = vget_high_s16(filter);
+
+  int16x4_t acc = vmul_lane_s16(s0, taps_0_3, 0);
+  acc = vmla_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmla_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmla_lane_s16(acc, s3, taps_0_3, 3);
+
+  acc = vmla_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmla_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmla_lane_s16(acc, s6, taps_4_7, 2);
+
+  return vmla_lane_s16(acc, s7, taps_4_7, 3);
+}
+
+// 8-tap vertical filter over eight lanes, returning eight 8-bit pixels.
+// s0..s7 are eight vertically adjacent rows, weighted by lanes 0..7 of
+// `filter`. `filter` is expected to hold halved taps, which is why the final
+// saturating, rounding shift is FILTER_BITS - 1.
+static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x8_t s4, const int16x8_t s5,
+                                      const int16x8_t s6, const int16x8_t s7,
+                                      const int16x8_t filter) {
+  const int16x4_t taps_0_3 = vget_low_s16(filter);
+  const int16x4_t taps_4_7 = vget_high_s16(filter);
+
+  int16x8_t acc = vmulq_lane_s16(s0, taps_0_3, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_0_3, 3);
+
+  acc = vmlaq_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlaq_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_4_7, 3);
+
+  // Taps were halved, so shift by one bit less than FILTER_BITS.
+  const uint8x8_t pixels = vqrshrun_n_s16(acc, FILTER_BITS - 1);
+  return pixels;
+}
+
+// Vertical 8-tap convolution of a w x h block of 8-bit pixels, writing 8-bit
+// results to dst_ptr. All eight lanes of y_filter are used. Results are shifted
+// by FILTER_BITS - 1, so y_filter is expected to hold halved taps; the halving
+// is not done in this function.
+//
+// Blocks with w <= 4 are handled as a single 4-wide column; wider blocks are
+// handled in 8-wide columns, so w must then be a multiple of 8. On AArch64 four
+// output rows are produced per iteration (h must be a positive multiple of 4);
+// elsewhere one row per iteration.
+static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ const int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ if (w <= 4) {
+ // Prime the sliding window with the first seven rows (s0..s6).
+ uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+ uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
+ uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
+ uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);
+
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ // Four new rows (s7..s10) yield four output rows per iteration.
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
+ uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
+ uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
+ uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
+
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+ // Slide the window down four rows; s7..s10 are reloaded next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+#else // !AOM_ARCH_AARCH64
+ // One new row (s7) yields one output row per iteration.
+ uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+
+ int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 =
+ vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8_4x1(dst_ptr, d01);
+
+ // Slide the window down one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ // Outer loop: one 8-pixel-wide column of the block per iteration.
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int height = h;
+
+ // Prime the sliding window with the first seven rows (s0..s6).
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ // Four new rows (s7..s10) yield four output rows per iteration.
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down four rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ // One new row (s7) yields one output row per iteration.
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ vst1_u8(d, d0);
+
+ // Slide the window down one row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+// 12-tap vertical filter over four lanes. s0..s11 are twelve vertically
+// adjacent rows; taps 0-7 are read from y_filter_0_7 and taps 8-11 from
+// y_filter_8_11. Everything stays in 16-bit lanes: ten of the products are
+// folded in with plain multiply-accumulate, and the two middle products
+// (s5 * tap 5, s6 * tap 6) are added last with saturating adds. Returns the
+// raw sum: rounding and narrowing are left to the caller.
+static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x4_t s8, const int16x4_t s9,
+                                       const int16x4_t s10, const int16x4_t s11,
+                                       const int16x8_t y_filter_0_7,
+                                       const int16x4_t y_filter_8_11) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);
+
+  // Products for the two middle taps, kept apart so they can be added with
+  // saturation at the very end.
+  const int16x4_t mid5 = vmul_lane_s16(s5, taps_4_7, 1);
+  const int16x4_t mid6 = vmul_lane_s16(s6, taps_4_7, 2);
+
+  int16x4_t acc = vmul_lane_s16(s0, taps_0_3, 0);
+  acc = vmla_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmla_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmla_lane_s16(acc, s3, taps_0_3, 3);
+  acc = vmla_lane_s16(acc, s4, taps_4_7, 0);
+
+  acc = vmla_lane_s16(acc, s7, taps_4_7, 3);
+  acc = vmla_lane_s16(acc, s8, y_filter_8_11, 0);
+  acc = vmla_lane_s16(acc, s9, y_filter_8_11, 1);
+  acc = vmla_lane_s16(acc, s10, y_filter_8_11, 2);
+  acc = vmla_lane_s16(acc, s11, y_filter_8_11, 3);
+
+  // Saturating addition is required for the largest filter taps to avoid
+  // overflow while staying in 16-bit elements.
+  acc = vqadd_s16(acc, mid5);
+  return vqadd_s16(acc, mid6);
+}
+
+// 12-tap vertical filter over eight lanes, returning eight 8-bit pixels.
+// s0..s11 are twelve vertically adjacent rows; taps 0-7 are read from
+// y_filter_0_7 and taps 8-11 from y_filter_8_11. Everything stays in 16-bit
+// lanes: ten of the products are folded in with plain multiply-accumulate,
+// and the two middle products (s5 * tap 5, s6 * tap 6) are added last with
+// saturating adds. The sum is then narrowed with a saturating, rounding right
+// shift by FILTER_BITS.
+static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
+                                       const int16x8_t s2, const int16x8_t s3,
+                                       const int16x8_t s4, const int16x8_t s5,
+                                       const int16x8_t s6, const int16x8_t s7,
+                                       const int16x8_t s8, const int16x8_t s9,
+                                       const int16x8_t s10, const int16x8_t s11,
+                                       const int16x8_t y_filter_0_7,
+                                       const int16x4_t y_filter_8_11) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);
+
+  // Products for the two middle taps, kept apart so they can be added with
+  // saturation at the very end.
+  const int16x8_t mid5 = vmulq_lane_s16(s5, taps_4_7, 1);
+  const int16x8_t mid6 = vmulq_lane_s16(s6, taps_4_7, 2);
+
+  int16x8_t acc = vmulq_lane_s16(s0, taps_0_3, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlaq_lane_s16(acc, s3, taps_0_3, 3);
+  acc = vmlaq_lane_s16(acc, s4, taps_4_7, 0);
+
+  acc = vmlaq_lane_s16(acc, s7, taps_4_7, 3);
+  acc = vmlaq_lane_s16(acc, s8, y_filter_8_11, 0);
+  acc = vmlaq_lane_s16(acc, s9, y_filter_8_11, 1);
+  acc = vmlaq_lane_s16(acc, s10, y_filter_8_11, 2);
+  acc = vmlaq_lane_s16(acc, s11, y_filter_8_11, 3);
+
+  // Saturating addition is required for the largest filter taps to avoid
+  // overflow while staying in 16-bit elements.
+  acc = vqaddq_s16(acc, mid5);
+  acc = vqaddq_s16(acc, mid6);
+
+  return vqrshrun_n_s16(acc, FILTER_BITS);
+}
+
+// Vertical 12-tap convolution of a w x h block of 8-bit pixels, writing 8-bit
+// results to dst_ptr. y_filter_ptr must point to at least 12 taps: eight are
+// loaded as one vector and four more from y_filter_ptr + 8. The taps are used
+// as loaded (not halved) and results are shifted by the full FILTER_BITS.
+//
+// Blocks with w <= 4 are handled as a single 4-wide column; wider blocks are
+// handled in 8-wide columns, so w must then be a multiple of 8. Four output
+// rows are produced per iteration on every architecture and the loops exit only
+// at exactly zero, so h must be a positive multiple of 4.
+static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
+ int src_stride, uint8_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16_t *y_filter_ptr) {
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+ if (w <= 4) {
+ // Prime the sliding window with the first eleven rows (s0..s10).
+ // NOTE(review): load_u8_8x11 / load_u8_8x4 appear to read 8 bytes per row
+ // although only the low 4 lanes are used here - confirm the source buffer
+ // is always padded enough for that.
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
+ &t8, &t9, &t10);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ src_ptr += 11 * src_stride;
+
+ do {
+ // Four new rows (s11..s14) yield four output rows per iteration.
+ uint8x8_t t11, t12, t13, t14;
+ load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);
+
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));
+
+ int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
+
+ // Full FILTER_BITS shift: the 12-tap filter values are not halved.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+ // Slide the window down four rows; s11..s14 are reloaded next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ } else {
+ // Outer loop: one 8-pixel-wide column of the block per iteration.
+ do {
+ const uint8_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ int height = h;
+
+ // Prime the sliding window with the first eleven rows (s0..s10).
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
+ &t9, &t10);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ s += 11 * src_stride;
+
+ do {
+ // Four new rows (s11..s14) yield four output rows per iteration.
+ uint8x8_t t11, t12, t13, t14;
+ load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);
+
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+
+ uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+ s10, s11, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down four rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+// Y-direction convolve for 8-bit pixels; picks a 6-, 8- or 12-tap NEON kernel by tap count.
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn) {
+ if (w == 2 || h == 2) { // 2-wide or 2-high blocks use the C implementation.
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+ subpel_y_qn);
+ return;
+ }
+ // Tap counts below 6 are treated as 6 when computing the row offset.
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ // Step src back by vert_offset rows.
+ src -= vert_offset * src_stride;
+
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // More than 8 taps: the 12-tap kernel receives the filter pointer unhalved.
+ if (y_filter_taps > 8) {
+ convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
+ y_filter_ptr);
+ return;
+ }
+
+ // Filter values are even so halve to reduce precision requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
+ // Fewer than 8 taps: 6-tap kernel; exactly 8 taps: 8-tap kernel.
+ if (y_filter_taps < 8) {
+ convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
+ } else {
+ convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
+ }
+}
+// 4-lane, 12-tap multiply-accumulate in 32 bits seeded with horiz_const; returns int16 lanes.
+static INLINE int16x4_t
+convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
+ const int32x4_t horiz_const) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+ // sum = horiz_const + s0 * tap0 + s1 * tap1 + ... + s11 * tap11.
+ int32x4_t sum = horiz_const;
+ sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
+ // Non-rounding narrowing shift; any rounding term must already be in horiz_const.
+ return vshrn_n_s32(sum, ROUND0_BITS);
+}
+// Horizontal 12-tap pass: filters h rows of 8-bit src into the int16 buffer at dst_ptr.
+static INLINE void convolve_2d_sr_horiz_12tap_neon(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11) {
+ const int bd = 8;
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
+ // which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_const =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // AArch64 only: four rows per pass, repeated while more than four rows remain.
+#if AOM_ARCH_AARCH64
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+ // Load 4 rows of 8 pixels and transpose; s0..s7 are the low/high halves of t0..t3.
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ // Same again from s + 8; only the low halves of t0..t2 (s8..s10) are kept.
+ load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ s += 11;
+ // Each pass loads four more inputs (s11..s14) and produces four results d0..d3.
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ // Transpose the four result vectors before the 4x4 store.
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+ store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide the window by four: s4..s14 become s0..s10 for the next pass.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+#endif // AOM_ARCH_AARCH64
+ // Remaining rows (every row when not AArch64): one row at a time, 4 outputs per pass.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t t0 = vld1q_u8(s);
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ // Split the 16 widened pixels into four quarters; vext builds the offsets between.
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s8 = vget_low_s16(tt1);
+ int16x4_t s12 = vget_high_s16(tt1);
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8
+ int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9
+ int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10
+ int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12
+ int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13
+ int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_0_7, x_filter_8_11, horiz_const);
+ vst1_s16(d, d0);
+
+ s += 4;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+}
+// 4-lane, 4-tap multiply-accumulate kept in 16 bits, seeded with horiz_const, then shifted.
+static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t filter,
+ const int16x4_t horiz_const) {
+ int16x4_t sum = horiz_const;
+ sum = vmla_lane_s16(sum, s0, filter, 0);
+ sum = vmla_lane_s16(sum, s1, filter, 1);
+ sum = vmla_lane_s16(sum, s2, filter, 2);
+ sum = vmla_lane_s16(sum, s3, filter, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshr_n_s16(sum, ROUND0_BITS - 1);
+}
+// 8-lane, 8-tap multiply-accumulate kept in 16 bits, seeded with horiz_const, then shifted.
+static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter,
+ const int16x8_t horiz_const) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ // Taps 0..3 are the lanes of filter_lo, taps 4..7 the lanes of filter_hi.
+ int16x8_t sum = horiz_const;
+ sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
+ sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrq_n_s16(sum, ROUND0_BITS - 1);
+}
+// Horizontal pass writing im_h rows of int16 to im_block: 4 taps when w <= 4, else 8 taps.
+static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride,
+ int16_t *im_block, int im_stride,
+ int w, int im_h,
+ const int16_t *x_filter_ptr) {
+ const int bd = 8;
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w <= 4) {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
+ // The 4 taps start at x_filter_ptr + 2, so the source is advanced by 2 as well.
+ src_ptr += 2;
+ // One row per pass: a single 8-byte load yields the four shifted inputs.
+ do {
+ uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
+
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // (The extra -1 is needed because we halved the filter values.)
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
+ // AArch64 only: eight rows per pass via 8x8 transposes while height > 8.
+#if AOM_ARCH_AARCH64
+ while (height > 8) {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ // Only s0..s6 are kept from the first load; each pass below loads s7..s14.
+ s += 7;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
+ x_filter, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
+ x_filter, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
+ x_filter, horiz_const);
+ int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter, horiz_const);
+ int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter, horiz_const);
+ int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter, horiz_const);
+ int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, horiz_const);
+ // Transpose the eight result vectors before the 8x8 store.
+ transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+ // Slide the window by eight: s8..s14 become s0..s6 for the next pass.
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+ }
+#endif // AOM_ARCH_AARCH64
+ // Remaining rows (every row when not AArch64): one row at a time, 8 outputs per pass.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ // s0 holds the current 8 pixels; each pass loads the following 8 into s8.
+ do {
+ uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+
+ int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter, horiz_const);
+
+ vst1q_s16(d, d0);
+
+ s0 = s8;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+// 2D convolve for 8-bit pixels: horizontal pass into im_block, then vertical pass into dst.
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) { // 2-wide or 2-high blocks use the C implementation.
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ return;
+ }
+ // The intermediate block has h + clamped_y_taps - 1 rows; src_ptr is offset to match.
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // More than 8 x taps selects the 12-tap kernels for both passes.
+ if (filter_params_x->taps > 8) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+
+ convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
+ im_h, x_filter_0_7, x_filter_8_11);
+
+ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_0_7, y_filter_8_11);
+ } else {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+ convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h,
+ x_filter_ptr);
+
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ // Vertical pass: 6-tap kernel when clamped_y_taps <= 6, otherwise 8-tap.
+ if (clamped_y_taps <= 6) {
+ convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ } else {
+ convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter);
+ }
+ }
+}
+// IntraBC horizontal filter: each output is the rounding average of src[x] and src[x + 1].
+void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ assert(subpel_x_qn == 8);
+ assert(filter_params_x->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)conv_params;
+ // w <= 4 and w == 8 handle two rows per pass; wider blocks go 16 pixels at a time.
+ if (w <= 4) {
+ do {
+ uint8x8_t s0_0 = vld1_u8(src);
+ uint8x8_t s0_1 = vld1_u8(src + 1);
+ uint8x8_t s1_0 = vld1_u8(src + src_stride);
+ uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
+ // Full 8-byte vectors are averaged; only the first 2 or 4 bytes are stored below.
+ uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
+
+ if (w == 2) {
+ store_u8_2x1(dst + 0 * dst_stride, d0);
+ store_u8_2x1(dst + 1 * dst_stride, d1);
+ } else {
+ store_u8_4x1(dst + 0 * dst_stride, d0);
+ store_u8_4x1(dst + 1 * dst_stride, d1);
+ }
+
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t s0_0 = vld1_u8(src);
+ uint8x8_t s0_1 = vld1_u8(src + 1);
+ uint8x8_t s1_0 = vld1_u8(src + src_stride);
+ uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
+
+ uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
+ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
+
+ vst1_u8(dst, d0);
+ vst1_u8(dst + dst_stride, d1);
+
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
+ int width = w;
+ // One row per outer pass, walked in 16-pixel steps until width reaches 0.
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t s1 = vld1q_u8(src_ptr + 1);
+
+ uint8x16_t d0 = vrhaddq_u8(s0, s1);
+
+ vst1q_u8(dst_ptr, d0);
+
+ src_ptr += 16;
+ dst_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
+// IntraBC vertical filter: each output is the rounding average of a pixel and the one below.
+void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn) {
+ assert(subpel_y_qn == 8);
+ assert(filter_params_y->taps == 2);
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ // w <= 4 and w == 8 produce two output rows per pass from three source rows.
+ if (w <= 4) {
+ do {
+ uint8x8_t s0 = load_unaligned_u8_4x1(src);
+ uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
+ uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
+
+ uint8x8_t d0 = vrhadd_u8(s0, s1);
+ uint8x8_t d1 = vrhadd_u8(s1, s2);
+
+ if (w == 2) {
+ store_u8_2x1(dst + 0 * dst_stride, d0);
+ store_u8_2x1(dst + 1 * dst_stride, d1);
+ } else {
+ store_u8_4x1(dst + 0 * dst_stride, d0);
+ store_u8_4x1(dst + 1 * dst_stride, d1);
+ }
+
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + src_stride);
+ uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
+
+ uint8x8_t d0 = vrhadd_u8(s0, s1);
+ uint8x8_t d1 = vrhadd_u8(s1, s2);
+
+ vst1_u8(dst, d0);
+ vst1_u8(dst + dst_stride, d1);
+
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *src_ptr = src;
+ uint8_t *dst_ptr = dst;
+ int height = h;
+ // Walk down one 16-pixel-wide column, one row per pass, then move 16 pixels right.
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
+
+ uint8x16_t d0 = vrhaddq_u8(s0, s1);
+
+ vst1q_u8(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ } while (w != 0);
+ }
+}
+// IntraBC 2D filter: horizontal pair sums into im_block, then vertical sums rounded by >> 2.
+void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)conv_params;
+ // The horizontal pass writes h + 1 rows of 16-bit sums, im_stride (= w) apart.
+ uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + 1;
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+ uint16_t *im = im_block;
+
+ // Horizontal filter.
+ if (w <= 4) {
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ // Widening add of src[x] and src[x + 1]; only the low four lanes are kept.
+ uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
+
+ // Safe to store the whole vector, the im buffer is big enough.
+ vst1_u16(im, sum);
+
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ } else {
+ do {
+ const uint8_t *src_ptr = src;
+ uint16_t *im_ptr = im;
+ int width = w;
+
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + 1);
+
+ uint16x8_t sum = vaddl_u8(s0, s1);
+
+ vst1q_u16(im_ptr, sum);
+
+ src_ptr += 8;
+ im_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ }
+ // Rewind to the start of the intermediate block for the second pass.
+ im = im_block;
+
+ // Vertical filter.
+ if (w <= 4) {
+ do {
+ uint16x4_t s0 = vld1_u16(im);
+ uint16x4_t s1 = vld1_u16(im + im_stride);
+ uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
+
+ uint16x4_t sum0 = vadd_u16(s0, s1);
+ uint16x4_t sum1 = vadd_u16(s1, s2);
+ // Rounding narrow by 2 bits; the upper half of each combined vector is zero padding.
+ uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2);
+ uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2);
+
+ if (w == 2) {
+ store_u8_2x1(dst + 0 * dst_stride, d0);
+ store_u8_2x1(dst + 1 * dst_stride, d1);
+ } else {
+ store_u8_4x1(dst + 0 * dst_stride, d0);
+ store_u8_4x1(dst + 1 * dst_stride, d1);
+ }
+
+ im += 2 * im_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ uint16_t *im_ptr = im;
+ uint8_t *dst_ptr = dst;
+ int height = h;
+ // Walk down one 8-wide column, one output row per pass, then move 8 columns right.
+ do {
+ uint16x8_t s0 = vld1q_u16(im_ptr);
+ uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
+
+ uint16x8_t sum = vaddq_u16(s0, s1);
+ uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
+
+ vst1_u8(dst_ptr, d0);
+
+ im_ptr += im_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ im += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
new file mode 100644
index 0000000000..9fbf8aa12f
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+// 4-lane, 12-tap multiply-accumulate over s0..s11; returns the unshifted 32-bit sums.
+static INLINE int32x4_t
+convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+ // sum = s0 * tap0 + s1 * tap1 + ... + s11 * tap11, widened to 32 bits.
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+ // No shift or narrowing is applied here; the caller receives the raw sums.
+ return sum;
+}
+// 8-lane, 12-tap filter: rounding narrow to 16 bits, subtract sub_const, saturate to u8.
+static INLINE uint8x8_t
+convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
+ // Low four lanes of each input, accumulated in 32 bits.
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+ // High four lanes of each input, accumulated in 32 bits.
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+ // Rounding, saturating narrow by 2 * FILTER_BITS - ROUND0_BITS, then subtract sub_const.
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+ // Saturate the signed 16-bit lanes to unsigned 8 bits.
+ return vqmovun_s16(res);
+}
+// Vertical 12-tap pass: filters the int16 buffer at src_ptr into 8-bit dst, 4 rows per pass.
+static INLINE void convolve_2d_sr_vert_12tap_neon(
+ int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
+ int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));
+ // w <= 4: a single 4-lane column; otherwise 8-lane columns, each walked over full height.
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+ &s8, &s9, &s10);
+ src_ptr += 11 * src_stride;
+ // s0..s10 are preloaded; each pass loads s11..s14 and emits four output rows.
+ do {
+ int16x4_t s11, s12, s13, s14;
+ load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14);
+
+ int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
+ s10, s11, y_filter_0_7, y_filter_8_11);
+ int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, s12, y_filter_0_7, y_filter_8_11);
+ int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, s13, y_filter_0_7, y_filter_8_11);
+ int32x4_t d3 =
+ convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter_0_7, y_filter_8_11);
+ // Rounding narrow to 16 bits, pairing results (d0, d1) and (d2, d3).
+ int16x8_t dd01 =
+ vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS));
+ int16x8_t dd23 =
+ vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS));
+ // Subtract sub_const (1 << (bd - 1)), then saturate to unsigned 8 bits.
+ dd01 = vsubq_s16(dd01, sub_const);
+ dd23 = vsubq_s16(dd23, sub_const);
+
+ uint8x8_t d01 = vqmovun_s16(dd01);
+ uint8x8_t d23 = vqmovun_s16(dd23);
+
+ store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+ // Slide the window by four rows: s4..s14 become s0..s10.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0); // NOTE(review): terminates only if h is a multiple of 4.
+
+ } else {
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+ // Same sliding window as above, on 8 lanes; the helper returns finished u8 rows.
+ do {
+ int16x8_t s11, s12, s13, s14;
+ load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ uint8x8_t d0 =
+ convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d1 =
+ convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d2 =
+ convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11, sub_const);
+ uint8x8_t d3 =
+ convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, y_filter_0_7, y_filter_8_11, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+// 4-lane, 8-tap multiply-accumulate in 32 bits, then rounding saturating narrow to int16.
+static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ // Taps 0..3 are the lanes of y_filter_lo, taps 4..7 the lanes of y_filter_hi.
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+ // sub_const is not applied here; the result is returned as signed 16-bit lanes.
+ return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
+}
+// 8-lane, 8-tap filter: rounding narrow to 16 bits, subtract sub_const, saturate to u8.
+static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t y_filter,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ // Low four lanes of each input, accumulated in 32 bits.
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
+ // High four lanes of each input, accumulated in 32 bits.
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
+ // Rounding, saturating narrow by 2 * FILTER_BITS - ROUND0_BITS, then subtract sub_const.
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+ // Saturate the signed 16-bit lanes to unsigned 8 bits.
+ return vqmovun_s16(res);
+}
+// Vertical 8-tap pass of the 2D convolution: filters int16 rows from src_ptr and writes uint8 rows to dst_ptr.
+static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr,
+ int src_stride,
+ uint8_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));  // NOTE(review): presumably cancels an offset added by the horizontal pass - confirm
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);  // prime the sliding window with 7 rows
+ src_ptr += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+ // Four output rows per iteration; each call uses a window shifted down by one row.
+ int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ int16x4_t d3 =
+ convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+ // Pair rows so sub_const removal and the u8 saturating narrow work on full 8-lane vectors.
+ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+ uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+ store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+ // Slide the window down 4 rows: the last 7 loaded rows seed the next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;  // loop exits only at exactly 0, so h must be a positive multiple of 4 on this path
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s7 = vld1_s16(src_ptr);
+ int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint8x8_t d01 =
+ vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+ // One output row per iteration; the upper half of d01 is padding built from zeros.
+ store_u8_4x1(dst_ptr, d01);
+ // Slide the window down by a single row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4.
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ // Outer loop walks 8-column strips; the inner loop walks down the rows of one strip.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+ // Four output rows per iteration, each from a window shifted down by one row.
+ uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, sub_const);
+ uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, sub_const);
+ uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, sub_const);
+ uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide the window down 4 rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s7 = vld1q_s16(s);
+ uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, sub_const);
+ vst1_u8(d, d0);
+ // Slide the window down by a single row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+// 6-tap filter for 4 lanes using y_filter lanes 1..6 (lanes 0 and 7 are not read); result narrowed to int16.
+static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ // Widening multiply-accumulate: row s[k] is weighted by y_filter lane k + 1.
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2);
+ // Saturating, rounding right shift while narrowing back to 16 bits.
+ return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS);
+}
+// 8-lane 6-tap filter using y_filter lanes 1..6; subtracts sub_const and saturates the result to uint8.
+static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter,
+ const int16x8_t sub_const) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+ // Lanes 0..3: row s[k] is weighted by y_filter lane k + 1.
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2);
+ // Lanes 4..7: same taps applied to the high halves of each row.
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2);
+ // Saturating, rounding narrow of both halves back to one int16x8 vector.
+ int16x8_t res =
+ vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
+ vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
+ res = vsubq_s16(res, sub_const);
+ // Signed-to-unsigned saturating narrow: negatives become 0, values above 255 become 255.
+ return vqmovun_s16(res);
+}
+// Vertical 6-tap pass of the 2D convolution: filters int16 rows from src_ptr and writes uint8 rows to dst_ptr.
+static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr,
+ int src_stride,
+ uint8_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16x8_t y_filter) {
+ const int bd = 8;
+ const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));  // NOTE(review): presumably cancels an offset added by the horizontal pass - confirm
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4);  // prime the sliding window with 5 rows
+ src_ptr += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+ // Four output rows per iteration; each call uses a window shifted down by one row.
+ int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+ int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter);
+ int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter);
+ int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter);
+ // Pair rows so sub_const removal and the u8 saturating narrow work on full 8-lane vectors.
+ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const));
+ uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const));
+
+ store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+ // Slide the window down 4 rows: the last 5 loaded rows seed the next iteration.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;  // loop exits only at exactly 0, so h must be a positive multiple of 4 on this path
+#else // !AOM_ARCH_AARCH64
+ int16x4_t s5 = vld1_s16(src_ptr);
+ int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter);
+ uint8x8_t d01 =
+ vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const));
+ // One output row per iteration; the upper half of d01 is padding built from zeros.
+ store_u8_4x1(dst_ptr, d01);
+ // Slide the window down by a single row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ h--;
+#endif // AOM_ARCH_AARCH64
+ } while (h != 0);
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4.
+ do {
+ int height = h;
+ int16_t *s = src_ptr;
+ uint8_t *d = dst_ptr;
+ // Outer loop walks 8-column strips; the inner loop walks down the rows of one strip.
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+#if AOM_ARCH_AARCH64
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+ // Four output rows per iteration, each from a window shifted down by one row.
+ uint8x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+ uint8x8_t d1 =
+ convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const);
+ uint8x8_t d2 =
+ convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const);
+ uint8x8_t d3 =
+ convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide the window down 4 rows.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+#else // !AOM_ARCH_AARCH64
+ int16x8_t s5 = vld1q_s16(s);
+ uint8x8_t d0 =
+ convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const);
+ vst1_u8(d, d0);
+ // Slide the window down by a single row.
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s += src_stride;
+ d += dst_stride;
+ height--;
+#endif // AOM_ARCH_AARCH64
+ } while (height != 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c
new file mode 100644
index 0000000000..c29229eb09
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+// Byte indices for vqtbl1q_s8: each 16-byte row gathers four overlapping 4-byte windows for the dot product.
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,  // windows starting at bytes 0..3
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,  // windows starting at bytes 4..7
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14  // windows starting at bytes 8..11
+};
+// 12-tap horizontal filter producing 4 outputs from 16 source bytes; returns int16 (caller narrows to u8).
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+ const int8x16_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum;
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // Each dot product applies one group of 4 taps (filter lanes 0, 1, 2).
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1);
+ sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2);
+ // Saturating, rounding right shift by FILTER_BITS while narrowing to 16 bits.
+ return vqrshrn_n_s32(sum, FILTER_BITS);
+}
+// 12-tap horizontal filter producing 8 uint8 outputs; samples[1] holds the same row loaded 4 bytes further on.
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+ const int8x16_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples[2], permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+ clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+ // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);  // val[2] on the +4 vector reaches bytes 12..18
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2);
+ // Second 4 output values.
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2);
+
+ // Narrow and re-pack.
+ int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS),
+ vqrshrn_n_s32(sum[1], FILTER_BITS));
+ return vqmovun_s16(sum_s16);
+}
+// Horizontal-only 12-tap convolution of a w x h block of u8 pixels from src to dst using dot products.
+static INLINE void convolve_x_sr_12tap_neon_dotprod(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filter_ptr) {
+ const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));  // taps 8..11 padded with four zeros
+ const int8x16_t filter =
+ vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));  // vmovn keeps only the low 8 bits of each tap
+ // correction_s32 = (sum of all taps) << FILTER_BITS.
+ const int32_t correction_s32 =
+ vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)),
+ vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS))));
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+ // shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+ // ROUND0_BITS.
+ int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+ // Undo the horizontal offset in the calling function.
+ src += 5;
+ // The no-op filter reduces to a plain copy, one row per outer iteration.
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x8_t d0 = vld1_u8(s);
+ if (w == 4) {
+ store_u8_4x1(d, d0);
+ } else {
+ vst1_u8(d, d0);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);  // '> 0' rather than '!= 0' so that w == 4 stops after one pass
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ // Filter 4 rows per iteration; the loop exits only when h reaches exactly 0.
+ int16x4_t d0 =
+ convolve12_4_x(s0, filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve12_4_x(s1, filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve12_4_x(s2, filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve12_4_x(s3, filter, correction, range_limit, permute_tbl);
+ // Pair rows and saturate to u8.
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+ // Each inner iteration produces an 8-wide by 4-high tile.
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);  // second load, 4 bytes on, covers bytes 16..19
+
+ uint8x8_t d0 =
+ convolve12_8_x(s0, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d1 =
+ convolve12_8_x(s1, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d2 =
+ convolve12_8_x(s2, filter, correction, range_limit, permute_tbl);
+ uint8x8_t d3 =
+ convolve12_8_x(s3, filter, correction, range_limit, permute_tbl);
+
+ store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+ }
+}
+// 4-tap horizontal filter producing 4 outputs with one dot product; returns the unshifted sums as int16.
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0);  // only filter lane 0 (first 4 bytes) is used
+
+ // Packing is performed by the caller.
+ return vmovn_s32(sum);
+}
+// 8-tap horizontal filter producing 8 uint8 outputs; expects filter taps that were already halved by the caller.
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1]));
+ // We halved the convolution filter values so - 1 from the right shift.
+ return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
+}
+// Horizontal single-reference convolution (u8 in, u8 out) using dot products; w == 2 or h == 2 falls back to C.
+void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ if (w == 2 || h == 2) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ subpel_x_qn, conv_params);
+ return;
+ }
+ // Step src back so that it points at the first tap of the filter window.
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ src -= horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ // Filters with more than 8 taps take the dedicated 12-tap path.
+ if (filter_params_x->taps > 8) {
+ convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
+ x_filter_ptr);
+ return;
+ }
+
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+ // Dot product constants.
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
+ // This shim of (1 << ((ROUND0_BITS - 1) - 1)) enables us to use a single
+ // rounding right shift by FILTER_BITS - instead of a first rounding right
+ // shift by ROUND0_BITS, followed by second rounding right shift by
+ // FILTER_BITS - ROUND0_BITS.
+ // The outermost -1 is needed because we will halve the filter values.
+ const int32x4_t correction =
+ vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);  // taps 2..5 of the kernel, halved
+ // Skip the two leading taps that the 4-tap kernel does not use.
+ src += 2;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ // Filter 4 rows per iteration; the loop exits only when h reaches exactly 0.
+ int16x4_t d0 =
+ convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ // We halved the convolution filter values so - 1 from the right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ int width = w;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ // Each inner iteration produces an 8-wide by 4-high tile.
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 =
+ convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d1 =
+ convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d2 =
+ convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl);
+ uint8x8_t d3 =
+ convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+// 12-tap horizontal filter for the 2D path: 4 int16 outputs, truncating shift by ROUND0_BITS (rounding is in 'correction').
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+ const int8x16_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum;
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // Each dot product applies one group of 4 taps (filter lanes 0, 1, 2).
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1);
+ sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2);
+
+ // Narrow and re-pack.
+ return vshrn_n_s32(sum, ROUND0_BITS);
+}
+// 12-tap horizontal filter for the 2D path: 8 int16 outputs; samples[1] holds the same row loaded 4 bytes further on.
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+ const int8x16_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples[2], permuted_samples[4];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit));
+ clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]);
+ // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+ permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]);  // val[2] on the +4 vector reaches bytes 12..18
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1);
+ sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2);
+ // Second 4 output values.
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1);
+ sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2);
+
+ // Narrow and re-pack.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS),
+ vshrn_n_s32(sum[1], ROUND0_BITS));
+}
+// Horizontal 12-tap pass of the 2D convolution: filters h rows of u8 pixels into an int16 intermediate buffer.
+static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the 8-bit
+ // signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+ // Per pixel: (pixel + (1 << (bd - 1))) << (FILTER_BITS - ROUND0_BITS); no filtering needed.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x8_t s0 = vld1_u8(s);
+ uint16x8_t d0 = vaddw_u8(horiz_const, s0);
+ d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+ // Store 8 elements to avoid additional branches. This is safe if the
+ // actual block width is < 8 because the intermediate buffer is large
+ // enough to accommodate 128x128 blocks.
+ vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+ d += 8;
+ s += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // Dot product constants.
+ const int32x4_t correct_tmp =
+ vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),  // NOTE(review): literal 7 is presumably FILTER_BITS - confirm
+ vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
+ const int32x4_t correction =
+ vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+ // Main loop: 4 rows per iteration while more than 4 rows remain.
+ int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+ // Tail: remaining rows one at a time. NOTE(review): both loops are do-while, so this presumes h > 4 on entry - confirm.
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+ // Main loop: 8-wide by 4-high tiles while more than 4 rows remain.
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+ int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction,
+ range_limit, permute_tbl);
+ int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction,
+ range_limit, permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+ // Tail: remaining rows one at a time. NOTE(review): presumes h > 4 on entry, as in the narrow path - confirm.
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+ int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
+ range_limit, permute_tbl);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+ }
+}
+// 4-tap horizontal filter for the 2D path: 4 int16 outputs from one dot product with halved filter taps.
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16_t permute_tbl) {
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t clamped_samples =
+ vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);  // only filter lane 0 (first 4 bytes) is used
+
+ // We halved the convolution filter values so -1 from the right shift.
+ return vshrn_n_s32(sum, ROUND0_BITS - 1);
+}
+// 8-tap horizontal filter for the 2D path: 8 int16 outputs from 16 source bytes with halved filter taps.
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum[2];
+
+ // Clamp sample range to [-128, 127] for 8-bit signed dot product.
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ // Accumulate dot product into 'correction' to account for range clamp.
+ // First 4 output values.
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+ sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+ sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ // We halved the convolution filter values so -1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
+ vshrn_n_s32(sum[1], ROUND0_BITS - 1));
+}
+/* Horizontal pass of the 2D single-reference convolution (bd = 8), built on
+ * the signed dot-product helpers convolve4_4_2d_h() and convolve8_8_2d_h().
+ *
+ * Reads 'im_h' rows starting at 'src' (row pitch 'src_stride') and writes
+ * 16-bit intermediate values to 'im_block' (row pitch 'im_stride').
+ * 'x_filter_ptr' points at 8 int16 taps. For w <= 4 only taps 2..5 are used
+ * and the source pointer is advanced by 2 to match; wider blocks use all 8
+ * taps and are processed 8 columns at a time.
+ *
+ * Rows are handled four at a time while more than four remain, then one at a
+ * time. Both row loops are do-while loops, so 'im_h' must be at least 5. The
+ * column loops end on 'width != 0' after 'width -= 8', so for w > 4 'w' is
+ * presumably a multiple of 8 (NOTE(review): confirm against callers).
+ */
+static INLINE void convolve_2d_sr_horiz_neon_dotprod(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16_t *x_filter_ptr) {
+ const int bd = 8;
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
+ // Dot product constants.
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
+ const int32_t correction_s32 =
+ vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));  // sum over all 8 taps of (tap << (FILTER_BITS - 1))
+ const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const);  // accumulator seed shared by every output
+ const uint8x16_t range_limit = vdupq_n_u8(128);  // bias the helpers subtract from each sample
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);  // first 16 table bytes only
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);  // taps 2..5 halved, upper lanes zero
+
+ src_ptr += 2;  // skip the two leading taps that are not applied
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d1 =
+ convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d2 =
+ convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+ int16x4_t d3 =
+ convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 =
+ convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);  // all 48 table bytes
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit,
+ permute_tbl);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit,
+ permute_tbl);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit,
+ permute_tbl);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+// 2D (horizontal, then vertical) single-reference convolution entry point for
+// the Neon dot-product build.
+//
+// Blocks with w == 2 or h == 2 are forwarded to av1_convolve_2d_sr_c(). For
+// everything else the horizontal pass writes 16-bit rows into a stack buffer
+// with a row pitch of MAX_SB_SIZE, and the vertical pass reads that buffer
+// and writes the final pixels to 'dst'. Horizontal kernels with more than 8
+// taps take the 12-tap path; otherwise the vertical pass is the 6-tap or
+// 8-tap variant depending on the vertical tap count.
+void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     const InterpFilterParams *filter_params_x,
+                                     const InterpFilterParams *filter_params_y,
+                                     const int subpel_x_qn,
+                                     const int subpel_y_qn,
+                                     ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
+  // Vertical kernels shorter than 6 taps are processed as 6-tap kernels.
+  int y_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  if (y_taps < 6) y_taps = 6;
+
+  // Step back to the first sample touched by the two kernels.
+  const int rows_above = y_taps / 2 - 1;
+  const int cols_left = filter_params_x->taps / 2 - 1;
+  const uint8_t *const first_sample =
+      src - rows_above * src_stride - cols_left;
+
+  // The vertical pass needs y_taps - 1 rows beyond the block height.
+  const int tmp_rows = h + y_taps - 1;
+  const int tmp_stride = MAX_SB_SIZE;
+
+  const int16_t *const x_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *const y_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    // 12-tap path: the kernels are split into taps 0..7 and taps 8..11.
+    DECLARE_ALIGNED(16, int16_t,
+                    tmp12[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+    const int16x8_t x_taps_lo = vld1q_s16(x_kernel);
+    const int16x4_t x_taps_hi = vld1_s16(x_kernel + 8);
+    const int16x8_t y_taps_lo = vld1q_s16(y_kernel);
+    const int16x4_t y_taps_hi = vld1_s16(y_kernel + 8);
+
+    convolve_2d_sr_horiz_12tap_neon_dotprod(first_sample, src_stride, tmp12,
+                                            tmp_stride, w, tmp_rows, x_taps_lo,
+                                            x_taps_hi);
+    convolve_2d_sr_vert_12tap_neon(tmp12, tmp_stride, dst, dst_stride, w, h,
+                                   y_taps_lo, y_taps_hi);
+    return;
+  }
+
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp8[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  convolve_2d_sr_horiz_neon_dotprod(first_sample, src_stride, tmp8, tmp_stride,
+                                    w, tmp_rows, x_kernel);
+
+  const int16x8_t y_filter = vld1q_s16(y_kernel);
+
+  if (y_taps > 6) {
+    convolve_2d_sr_vert_8tap_neon(tmp8, tmp_stride, dst, dst_stride, w, h,
+                                  y_filter);
+  } else {
+    convolve_2d_sr_vert_6tap_neon(tmp8, tmp_stride, dst, dst_stride, w, h,
+                                  y_filter);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c
new file mode 100644
index 0000000000..bbcd6f201a
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+/* Byte indices used with vqtbl1q_u8 to rearrange one 16-byte row of samples
+ * for the dot-product helpers. Each 16-entry row holds four overlapping
+ * 4-sample windows whose start offsets step by one; the three rows begin at
+ * sample offsets 0, 4 and 8 respectively.
+ */
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+// 12-tap horizontal filter producing four outputs from one 16-byte row of
+// samples. 'filter' carries the taps in its first twelve 8-bit lanes and
+// 'horiz_const' seeds the accumulator. The sum is rounded, shifted right by
+// FILTER_BITS and saturated to 16 bits.
+static INLINE int16x4_t convolve12_4_x(uint8x16_t samples,
+                                       const int8x16_t filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t horiz_const) {
+  // Sliding 4-sample windows, one set per group of four taps.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Taps 0-3, 4-7 and 8-11 are accumulated in turn.
+  int32x4_t acc = vusdotq_laneq_s32(horiz_const, win_a, filter, 0);
+  acc = vusdotq_laneq_s32(acc, win_b, filter, 1);
+  acc = vusdotq_laneq_s32(acc, win_c, filter, 2);
+
+  return vqrshrn_n_s32(acc, FILTER_BITS);
+}
+
+// 12-tap horizontal filter producing eight 8-bit output pixels.
+// 'samples[0]' and 'samples[1]' are two 16-byte loads of the same row, the
+// second one positioned so that the last permute row yields samples 12..18.
+// 'filter' carries the taps in its first twelve 8-bit lanes and
+// 'horiz_const' seeds both accumulators.
+static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2],
+                                       const int8x16_t filter,
+                                       const uint8x16x3_t permute_tbl,
+                                       const int32x4_t horiz_const) {
+  // Sliding 4-sample windows for the dot products.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  const uint8x16_t win_d = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+  // Outputs 0..3.
+  int32x4_t acc_lo = vusdotq_laneq_s32(horiz_const, win_a, filter, 0);
+  acc_lo = vusdotq_laneq_s32(acc_lo, win_b, filter, 1);
+  acc_lo = vusdotq_laneq_s32(acc_lo, win_c, filter, 2);
+
+  // Outputs 4..7 use the same taps on windows shifted by four samples.
+  int32x4_t acc_hi = vusdotq_laneq_s32(horiz_const, win_b, filter, 0);
+  acc_hi = vusdotq_laneq_s32(acc_hi, win_c, filter, 1);
+  acc_hi = vusdotq_laneq_s32(acc_hi, win_d, filter, 2);
+
+  // Round and narrow to 16 bits, then saturate to unsigned 8 bits.
+  const int16x4_t res_lo = vqrshrn_n_s32(acc_lo, FILTER_BITS);
+  const int16x4_t res_hi = vqrshrn_n_s32(acc_hi, FILTER_BITS);
+  return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+/* 12-tap horizontal-only convolution for 8-bit pixels, built on the
+ * dot-product helpers convolve12_4_x() and convolve12_8_x().
+ *
+ * The 12 int16 taps at 'x_filter_ptr' are narrowed to 8 bits with vmovn_s16.
+ * The kernel whose tap 5 equals 128 cannot be narrowed that way, so it is
+ * handled separately as a row-by-row copy starting at 'src + 5'.
+ *
+ * In the filtering path rows are processed four at a time and the row loops
+ * end on 'h != 0' after 'h -= 4', so 'h' is presumably a multiple of 4
+ * (NOTE(review): confirm against callers). Blocks with w <= 4 store four
+ * pixels per row; wider blocks are handled in 8-column steps.
+ */
+static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
+ int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr) {
+ const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));  // taps 8..11 padded with zeros
+ const int8x16_t filter =
+ vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(filter_0_7, 5) == 128) {
+ // Undo the horizontal offset in the calling function.
+ src += 5;
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x8_t d0 = vld1_u8(s);  // always loads 8 bytes, also when w == 4
+ if (w == 4) {
+ store_u8_4x1(d, d0);
+ } else {
+ vst1_u8(d, d0);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+ // right shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+ // ROUND0_BITS.
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
+
+ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));  // rows 0 and 1 packed into one vector
+ uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));  // rows 2 and 3 packed into one vector
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);  // second load, 4 bytes further along each row
+
+ uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
+ uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
+ uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
+ uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
+
+ store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+ }
+}
+
+// 4-tap horizontal filter producing four outputs from one 16-byte row of
+// samples. 'filter' holds the taps in its first four 8-bit lanes and
+// 'horiz_const' seeds the accumulator. The 32-bit sums are narrowed to 16
+// bits without shifting; the caller performs the final shift and packing.
+static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter,
+                                      const uint8x16_t permute_tbl,
+                                      const int32x4_t horiz_const) {
+  // Arrange the samples as four overlapping 4-sample windows:
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t windows = vqtbl1q_u8(samples, permute_tbl);
+
+  // One dot product per window yields the four outputs.
+  const int32x4_t acc = vusdotq_lane_s32(horiz_const, windows, filter, 0);
+
+  return vmovn_s32(acc);
+}
+
+// 8-tap horizontal filter producing eight 8-bit output pixels from one
+// 16-byte row of samples. 'filter' holds the eight (halved) taps and
+// 'horiz_const' seeds both accumulators.
+static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
+                                      const uint8x16x3_t permute_tbl,
+                                      const int32x4_t horiz_const) {
+  // Sliding 4-sample windows for the dot products.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Outputs 0..3: taps 0-3 on window A, taps 4-7 on window B.
+  int32x4_t acc_lo = vusdotq_lane_s32(horiz_const, win_a, filter, 0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_b, filter, 1);
+
+  // Outputs 4..7: taps 0-3 on window B, taps 4-7 on window C.
+  int32x4_t acc_hi = vusdotq_lane_s32(horiz_const, win_b, filter, 0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_c, filter, 1);
+
+  const int16x8_t packed = vcombine_s16(vmovn_s32(acc_lo), vmovn_s32(acc_hi));
+
+  // The filter values were halved, so shift by one bit less than FILTER_BITS.
+  return vqrshrun_n_s16(packed, FILTER_BITS - 1);
+}
+
+// Horizontal-only single-reference convolution for 8-bit pixels, built on the
+// dot-product helpers in this file.
+//
+// Dispatch:
+//  - w == 2 or h == 2: forwarded to av1_convolve_x_sr_c().
+//  - kernels with more than 8 taps: convolve_x_sr_12tap_neon_i8mm().
+//  - w <= 4: only taps 2..5 of the 8-entry kernel are applied.
+//  - otherwise: all 8 taps, 8 columns at a time.
+// In the last two cases rows are processed four at a time and the row loops
+// end on 'h != 0' after 'h -= 4', so 'h' is presumably a multiple of 4
+// (NOTE(review): confirm against callers).
+void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const int subpel_x_qn,
+                                 ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        subpel_x_qn, conv_params);
+    return;
+  }
+
+  // Kept as int, matching the 2D entry points, so the result of the int
+  // expression is not implicitly narrowed.
+  const int horiz_offset = filter_params_x->taps / 2 - 1;
+  src -= horiz_offset;
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
+                                  x_filter_ptr);
+    return;
+  }
+
+  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
+  // rounding right shift by FILTER_BITS - instead of a first rounding right
+  // shift by ROUND0_BITS, followed by second rounding right shift by
+  // FILTER_BITS - ROUND0_BITS.
+  // The outermost -1 is needed because we will halve the filter values.
+  const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1));
+
+  if (w <= 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+    // 4-tap filters are used for blocks having width <= 4.
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter =
+        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);
+
+    // Skip the two leading taps that are not applied.
+    src += 2;
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const);
+
+      // We halved the convolution filter values so - 1 from the right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      // Each vector holds two rows of four pixels.
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Filter values are even, so halve to reduce intermediate precision reqs.
+    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const);
+        uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+// Horizontal 12-tap stage of the 2D convolution producing four 16-bit
+// intermediate values from one 16-byte row of samples. 'filters' carries the
+// taps in its first twelve 8-bit lanes and 'horiz_const' seeds the
+// accumulator.
+static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples,
+                                          const int8x16_t filters,
+                                          const uint8x16x3_t permute_tbl,
+                                          int32x4_t horiz_const) {
+  // Sliding 4-sample windows, one set per group of four taps.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Taps 0-3, 4-7 and 8-11 are accumulated in turn.
+  int32x4_t acc = vusdotq_laneq_s32(horiz_const, win_a, filters, 0);
+  acc = vusdotq_laneq_s32(acc, win_b, filters, 1);
+  acc = vusdotq_laneq_s32(acc, win_c, filters, 2);
+
+  // Non-rounding shift while narrowing to 16 bits.
+  return vshrn_n_s32(acc, ROUND0_BITS);
+}
+
+// Horizontal 12-tap stage of the 2D convolution producing eight 16-bit
+// intermediate values. 'samples[0]' and 'samples[1]' are two 16-byte loads of
+// the same row, the second one positioned so that the last permute row yields
+// samples 12..18. 'filters' carries the taps in its first twelve 8-bit lanes
+// and 'horiz_const' seeds both accumulators.
+static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
+                                          const int8x16_t filters,
+                                          const uint8x16x3_t permute_tbl,
+                                          const int32x4_t horiz_const) {
+  // Sliding 4-sample windows for the dot products.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples[0], permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples[0], permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples[0], permute_tbl.val[2]);
+  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  const uint8x16_t win_d = vqtbl1q_u8(samples[1], permute_tbl.val[2]);
+
+  // Outputs 0..3.
+  int32x4_t acc_lo = vusdotq_laneq_s32(horiz_const, win_a, filters, 0);
+  acc_lo = vusdotq_laneq_s32(acc_lo, win_b, filters, 1);
+  acc_lo = vusdotq_laneq_s32(acc_lo, win_c, filters, 2);
+
+  // Outputs 4..7 use the same taps on windows shifted by four samples.
+  int32x4_t acc_hi = vusdotq_laneq_s32(horiz_const, win_b, filters, 0);
+  acc_hi = vusdotq_laneq_s32(acc_hi, win_c, filters, 1);
+  acc_hi = vusdotq_laneq_s32(acc_hi, win_d, filters, 2);
+
+  // Non-rounding shift while narrowing, then join the two halves.
+  const int16x4_t res_lo = vshrn_n_s32(acc_lo, ROUND0_BITS);
+  const int16x4_t res_hi = vshrn_n_s32(acc_hi, ROUND0_BITS);
+  return vcombine_s16(res_lo, res_hi);
+}
+/* Horizontal 12-tap pass of the 2D single-reference convolution (bd = 8),
+ * built on convolve12_4_2d_h() and convolve12_8_2d_h().
+ *
+ * Reads 'h' rows from 'src_ptr' (row pitch 'src_stride') and writes 16-bit
+ * intermediate values to 'dst_ptr' (row pitch 'dst_stride'). The kernel is
+ * passed as taps 0..7 ('x_filter_0_7') and taps 8..11 ('x_filter_8_11').
+ *
+ * The kernel whose tap 5 equals 128 is handled without dot products: each
+ * sample is offset by 1 << (bd - 1) and shifted left by
+ * FILTER_BITS - ROUND0_BITS, one row at a time.
+ *
+ * Otherwise rows are processed four at a time while more than four remain,
+ * then one at a time. Those row loops are do-while loops, so 'h' must be at
+ * least 5 on that path. For w > 4 the column loops end on 'width != 0' after
+ * 'width -= 8', so 'w' is presumably a multiple of 8 there (NOTE(review):
+ * confirm against callers).
+ */
+static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm(
+ const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
+ const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
+ const int16x4_t x_filter_8_11) {
+ const int bd = 8;
+
+ // Special case the following no-op filter as 128 won't fit into the
+ // 8-bit signed dot-product instruction:
+ // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
+ if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
+ const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
+ // Undo the horizontal offset in the calling function.
+ src_ptr += 5;
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x8_t s0 = vld1_u8(s);
+ uint16x8_t d0 = vaddw_u8(horiz_const, s0);  // widen to 16 bits and add the offset
+ d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
+ // Store 8 elements to avoid additional branches. This is safe if the
+ // actual block width is < 8 because the intermediate buffer is large
+ // enough to accommodate 128x128 blocks.
+ vst1q_s16(d, vreinterpretq_s16_u16(d0));
+
+ d += 8;
+ s += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ // Narrow filter values to 8-bit.
+ const int16x8x2_t x_filter_s16 = {
+ { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+ };
+ const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+ vmovn_s16(x_filter_s16.val[1]));
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_const =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ if (w <= 4) {
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 =
+ convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 =
+ convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 =
+ convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 =
+ convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+
+ } else {
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2], s1[2], s2[2], s3[2];
+ load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+ load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);  // second load, 4 bytes further along each row
+
+ int16x8_t d0 =
+ convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x8_t d1 =
+ convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x8_t d2 =
+ convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x8_t d3 =
+ convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0[2];
+ s0[0] = vld1q_u8(s);
+ s0[1] = vld1q_u8(s + 4);
+ int16x8_t d0 =
+ convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+ }
+}
+
+// Horizontal 4-tap stage of the 2D convolution producing four 16-bit
+// intermediate values from one 16-byte row of samples. 'filters' holds the
+// (halved) taps in its first four 8-bit lanes and 'horiz_const' seeds the
+// accumulator.
+static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const uint8x16_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Arrange the samples as four overlapping 4-sample windows:
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t windows = vqtbl1q_u8(samples, permute_tbl);
+
+  // One dot product per window yields the four outputs.
+  const int32x4_t acc = vusdotq_lane_s32(horiz_const, windows, filters, 0);
+
+  // The filter values were halved, so shift by one bit less than ROUND0_BITS.
+  return vshrn_n_s32(acc, ROUND0_BITS - 1);
+}
+
+// Horizontal 8-tap stage of the 2D convolution producing eight 16-bit
+// intermediate values from one 16-byte row of samples. 'filters' holds the
+// eight (halved) taps and 'horiz_const' seeds both accumulators.
+static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const uint8x16x3_t permute_tbl,
+                                         const int32x4_t horiz_const) {
+  // Sliding 4-sample windows for the dot products.
+  // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+  const uint8x16_t win_a = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+  const uint8x16_t win_b = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  const uint8x16_t win_c = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  // Outputs 0..3: taps 0-3 on window A, taps 4-7 on window B.
+  int32x4_t acc_lo = vusdotq_lane_s32(horiz_const, win_a, filters, 0);
+  acc_lo = vusdotq_lane_s32(acc_lo, win_b, filters, 1);
+
+  // Outputs 4..7: taps 0-3 on window B, taps 4-7 on window C.
+  int32x4_t acc_hi = vusdotq_lane_s32(horiz_const, win_b, filters, 0);
+  acc_hi = vusdotq_lane_s32(acc_hi, win_c, filters, 1);
+
+  // The filter values were halved, so shift by one bit less than ROUND0_BITS
+  // while narrowing each half to 16 bits.
+  const int16x4_t res_lo = vshrn_n_s32(acc_lo, ROUND0_BITS - 1);
+  const int16x4_t res_hi = vshrn_n_s32(acc_hi, ROUND0_BITS - 1);
+  return vcombine_s16(res_lo, res_hi);
+}
+/* Horizontal pass of the 2D single-reference convolution (bd = 8), built on
+ * convolve4_4_2d_h() and convolve8_8_2d_h().
+ *
+ * Reads 'im_h' rows starting at 'src' (row pitch 'src_stride') and writes
+ * 16-bit intermediate values to 'im_block' (row pitch 'im_stride').
+ * 'x_filter_ptr' points at 8 int16 taps. For w <= 4 only taps 2..5 are used
+ * and the source pointer is advanced by 2 to match; wider blocks use all 8
+ * taps and are processed 8 columns at a time.
+ *
+ * Rows are handled four at a time while more than four remain, then one at a
+ * time. Both row loops are do-while loops, so 'im_h' must be at least 5. The
+ * column loops end on 'width != 0' after 'width -= 8', so for w > 4 'w' is
+ * presumably a multiple of 8 (NOTE(review): confirm against callers).
+ */
+static INLINE void convolve_2d_sr_horiz_neon_i8mm(
+ const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
+ int im_h, const int16_t *x_filter_ptr) {
+ const int bd = 8;
+ // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The outermost -1 is needed because we halved the filter values.
+ const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
+ (1 << ((ROUND0_BITS - 1) - 1)));
+
+ const uint8_t *src_ptr = src;
+ int16_t *dst_ptr = im_block;
+ int dst_stride = im_stride;
+ int height = im_h;
+
+ if (w <= 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);  // first 16 table bytes only
+ // 4-tap filters are used for blocks having width <= 4.
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter =
+ vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);  // taps 2..5 halved, upper lanes zero
+
+ src_ptr += 2;  // skip the two leading taps that are not applied
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1_s16(dst_ptr, d0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);  // all 48 table bytes
+ // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+ int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+ int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+ store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);  // leaves 1..4 rows for the single-row loop below
+
+ do {
+ const uint8_t *s = src_ptr;
+ int16_t *d = dst_ptr;
+ int width = w;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+ vst1q_s16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+// 2D (horizontal, then vertical) single-reference convolution entry point for
+// the Neon i8mm build.
+//
+// Blocks with w == 2 or h == 2 are forwarded to av1_convolve_2d_sr_c(). For
+// everything else the horizontal pass writes 16-bit rows into a stack buffer
+// with a row pitch of MAX_SB_SIZE, and the vertical pass reads that buffer
+// and writes the final pixels to 'dst'. Horizontal kernels with more than 8
+// taps take the 12-tap path; otherwise the vertical pass is the 6-tap or
+// 8-tap variant depending on the vertical tap count.
+void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
+  if (w == 2 || h == 2) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    return;
+  }
+
+  // Vertical kernels shorter than 6 taps are processed as 6-tap kernels.
+  int y_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  if (y_taps < 6) y_taps = 6;
+
+  // Step back to the first sample touched by the two kernels.
+  const int rows_above = y_taps / 2 - 1;
+  const int cols_left = filter_params_x->taps / 2 - 1;
+  const uint8_t *const first_sample =
+      src - rows_above * src_stride - cols_left;
+
+  // The vertical pass needs y_taps - 1 rows beyond the block height.
+  const int tmp_rows = h + y_taps - 1;
+  const int tmp_stride = MAX_SB_SIZE;
+
+  const int16_t *const x_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *const y_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  if (filter_params_x->taps > 8) {
+    // 12-tap path: the kernels are split into taps 0..7 and taps 8..11.
+    DECLARE_ALIGNED(16, int16_t,
+                    tmp12[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+
+    const int16x8_t x_taps_lo = vld1q_s16(x_kernel);
+    const int16x4_t x_taps_hi = vld1_s16(x_kernel + 8);
+    const int16x8_t y_taps_lo = vld1q_s16(y_kernel);
+    const int16x4_t y_taps_hi = vld1_s16(y_kernel + 8);
+
+    convolve_2d_sr_horiz_12tap_neon_i8mm(first_sample, src_stride, tmp12,
+                                         tmp_stride, w, tmp_rows, x_taps_lo,
+                                         x_taps_hi);
+    convolve_2d_sr_vert_12tap_neon(tmp12, tmp_stride, dst, dst_stride, w, h,
+                                   y_taps_lo, y_taps_hi);
+    return;
+  }
+
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp8[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+
+  convolve_2d_sr_horiz_neon_i8mm(first_sample, src_stride, tmp8, tmp_stride, w,
+                                 tmp_rows, x_kernel);
+
+  const int16x8_t y_filter = vld1q_s16(y_kernel);
+
+  if (y_taps > 6) {
+    convolve_2d_sr_vert_8tap_neon(tmp8, tmp_stride, dst, dst_stride, w, h,
+                                  y_filter);
+  } else {
+    convolve_2d_sr_vert_6tap_neon(tmp8, tmp_stride, dst, dst_stride, w, h,
+                                  y_filter);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c
new file mode 100644
index 0000000000..fc03a2ee04
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c
@@ -0,0 +1,2031 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+// Shift amount passed to the narrowing shifts in the compound-average helpers
+// of this file. The replacement list is parenthesized so the macro always
+// expands as a single operand, whatever operators surround it.
+#define ROUND_SHIFT (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS)
+
+// Compound average, 12-bit path. Each output pixel is the halving add (vhadd)
+// of the sample read from src_ptr and the co-located sample in
+// conv_params->dst, minus `offset`, narrowed with a saturating rounding shift
+// of ROUND_SHIFT - 2 and finally clamped to (1 << bd) - 1.
+// Blocks with w == 4 are handled four pixels per row; every other width is
+// walked in groups of eight pixels.
+static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr,
+                                           int src_stride, uint16_t *dst_ptr,
+                                           int dst_stride, int w, int h,
+                                           ConvolveParams *conv_params,
+                                           const int offset, const int bd) {
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  const uint16x4_t round_offset = vdup_n_u16(offset);
+  const int pred_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *pred_ptr = conv_params->dst;
+
+  if (w != 4) {
+    do {
+      int x = 0;
+      do {
+        const uint16x8_t half_sum =
+            vhaddq_u16(vld1q_u16(src_ptr + x), vld1q_u16(pred_ptr + x));
+
+        // Widen to 32 bits while removing the offset; the difference is
+        // reinterpreted as signed before the unsigned-saturating narrow.
+        const uint32x4_t diff_lo =
+            vsubl_u16(vget_low_u16(half_sum), round_offset);
+        const uint32x4_t diff_hi =
+            vsubl_u16(vget_high_u16(half_sum), round_offset);
+
+        const uint16x4_t out_lo =
+            vqrshrun_n_s32(vreinterpretq_s32_u32(diff_lo), ROUND_SHIFT - 2);
+        const uint16x4_t out_hi =
+            vqrshrun_n_s32(vreinterpretq_s32_u32(diff_hi), ROUND_SHIFT - 2);
+
+        vst1q_u16(dst_ptr + x,
+                  vminq_u16(vcombine_u16(out_lo, out_hi), pixel_max));
+
+        x += 8;
+      } while (x != w);
+
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  // w == 4: one 64-bit vector per row.
+  do {
+    const uint16x4_t half_sum =
+        vhadd_u16(vld1_u16(src_ptr), vld1_u16(pred_ptr));
+    const uint32x4_t diff = vsubl_u16(half_sum, round_offset);
+    const uint16x4_t out =
+        vqrshrun_n_s32(vreinterpretq_s32_u32(diff), ROUND_SHIFT - 2);
+
+    vst1_u16(dst_ptr, vmin_u16(out, vget_low_u16(pixel_max)));
+
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    dst_ptr += dst_stride;
+  } while (--h != 0);
+}
+
+// Compound average for bit depths other than 12. Same data flow as the
+// 12-bit variant: halving add of src_ptr and conv_params->dst, subtract
+// `offset`, saturating rounding narrow, clamp to (1 << bd) - 1. The only
+// difference is the narrowing shift, which is ROUND_SHIFT here.
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+                                        uint16_t *dst_ptr, int dst_stride,
+                                        int w, int h,
+                                        ConvolveParams *conv_params,
+                                        const int offset, const int bd) {
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  const uint16x4_t round_offset = vdup_n_u16(offset);
+  const int pred_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *pred_ptr = conv_params->dst;
+
+  if (w != 4) {
+    // Eight pixels per inner iteration.
+    do {
+      int x = 0;
+      do {
+        const uint16x8_t half_sum =
+            vhaddq_u16(vld1q_u16(src_ptr + x), vld1q_u16(pred_ptr + x));
+
+        const uint32x4_t diff_lo =
+            vsubl_u16(vget_low_u16(half_sum), round_offset);
+        const uint32x4_t diff_hi =
+            vsubl_u16(vget_high_u16(half_sum), round_offset);
+
+        const uint16x4_t out_lo =
+            vqrshrun_n_s32(vreinterpretq_s32_u32(diff_lo), ROUND_SHIFT);
+        const uint16x4_t out_hi =
+            vqrshrun_n_s32(vreinterpretq_s32_u32(diff_hi), ROUND_SHIFT);
+
+        vst1q_u16(dst_ptr + x,
+                  vminq_u16(vcombine_u16(out_lo, out_hi), pixel_max));
+
+        x += 8;
+      } while (x != w);
+
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  // w == 4: four pixels per row.
+  do {
+    const uint16x4_t half_sum =
+        vhadd_u16(vld1_u16(src_ptr), vld1_u16(pred_ptr));
+    const uint32x4_t diff = vsubl_u16(half_sum, round_offset);
+    const uint16x4_t out =
+        vqrshrun_n_s32(vreinterpretq_s32_u32(diff), ROUND_SHIFT);
+
+    vst1_u16(dst_ptr, vmin_u16(out, vget_low_u16(pixel_max)));
+
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    dst_ptr += dst_stride;
+  } while (--h != 0);
+}
+
+// Weighted compound average, 12-bit path. For each pixel:
+//   blend = (ref * conv_params->fwd_offset + src * conv_params->bck_offset)
+//               >> DIST_PRECISION_BITS
+// where `ref` is read from conv_params->dst and `src` from src_ptr. `offset`
+// is then subtracted, the result narrowed with a saturating rounding shift
+// of ROUND_SHIFT - 2, clamped to (1 << bd) - 1 and written to dst_ptr.
+static INLINE void highbd_12_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  const uint32x4_t round_offset = vdupq_n_u32(offset);
+  const uint16x4_t ref_weight = vdup_n_u16(conv_params->fwd_offset);
+  const uint16x4_t src_weight = vdup_n_u16(conv_params->bck_offset);
+  const int pred_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *pred_ptr = conv_params->dst;
+
+  if (w != 4) {
+    do {
+      int x = 0;
+      do {
+        const uint16x8_t cur = vld1q_u16(src_ptr + x);
+        const uint16x8_t prev = vld1q_u16(pred_ptr + x);
+
+        uint32x4_t blend_lo = vmull_u16(vget_low_u16(prev), ref_weight);
+        uint32x4_t blend_hi = vmull_u16(vget_high_u16(prev), ref_weight);
+        blend_lo = vmlal_u16(blend_lo, vget_low_u16(cur), src_weight);
+        blend_hi = vmlal_u16(blend_hi, vget_high_u16(cur), src_weight);
+        blend_lo = vshrq_n_u32(blend_lo, DIST_PRECISION_BITS);
+        blend_hi = vshrq_n_u32(blend_hi, DIST_PRECISION_BITS);
+
+        const int32x4_t diff_lo =
+            vreinterpretq_s32_u32(vsubq_u32(blend_lo, round_offset));
+        const int32x4_t diff_hi =
+            vreinterpretq_s32_u32(vsubq_u32(blend_hi, round_offset));
+
+        const uint16x8_t out =
+            vcombine_u16(vqrshrun_n_s32(diff_lo, ROUND_SHIFT - 2),
+                         vqrshrun_n_s32(diff_hi, ROUND_SHIFT - 2));
+        vst1q_u16(dst_ptr + x, vminq_u16(out, pixel_max));
+
+        x += 8;
+      } while (x != w);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      pred_ptr += pred_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  // w == 4: four pixels per row.
+  do {
+    const uint16x4_t cur = vld1_u16(src_ptr);
+    const uint16x4_t prev = vld1_u16(pred_ptr);
+
+    uint32x4_t blend = vmull_u16(prev, ref_weight);
+    blend = vmlal_u16(blend, cur, src_weight);
+    blend = vshrq_n_u32(blend, DIST_PRECISION_BITS);
+
+    const int32x4_t diff =
+        vreinterpretq_s32_u32(vsubq_u32(blend, round_offset));
+    const uint16x4_t out = vqrshrun_n_s32(diff, ROUND_SHIFT - 2);
+
+    vst1_u16(dst_ptr, vmin_u16(out, vget_low_u16(pixel_max)));
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    pred_ptr += pred_stride;
+  } while (--h != 0);
+}
+
+// Weighted compound average for bit depths other than 12. The reference
+// sample (conv_params->dst) is scaled by conv_params->fwd_offset, the new
+// sample (src_ptr) by conv_params->bck_offset; the sum is shifted right by
+// DIST_PRECISION_BITS, `offset` is subtracted, and the result is narrowed
+// with a saturating rounding shift of ROUND_SHIFT before being clamped to
+// (1 << bd) - 1.
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, ConvolveParams *conv_params, const int offset, const int bd) {
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  const uint32x4_t round_offset = vdupq_n_u32(offset);
+  const uint16x4_t ref_weight = vdup_n_u16(conv_params->fwd_offset);
+  const uint16x4_t src_weight = vdup_n_u16(conv_params->bck_offset);
+  const int pred_stride = conv_params->dst_stride;
+  CONV_BUF_TYPE *pred_ptr = conv_params->dst;
+
+  if (w != 4) {
+    do {
+      int x = 0;
+      do {
+        const uint16x8_t cur = vld1q_u16(src_ptr + x);
+        const uint16x8_t prev = vld1q_u16(pred_ptr + x);
+
+        uint32x4_t blend_lo = vmull_u16(vget_low_u16(prev), ref_weight);
+        uint32x4_t blend_hi = vmull_u16(vget_high_u16(prev), ref_weight);
+        blend_lo = vmlal_u16(blend_lo, vget_low_u16(cur), src_weight);
+        blend_hi = vmlal_u16(blend_hi, vget_high_u16(cur), src_weight);
+        blend_lo = vshrq_n_u32(blend_lo, DIST_PRECISION_BITS);
+        blend_hi = vshrq_n_u32(blend_hi, DIST_PRECISION_BITS);
+
+        const int32x4_t diff_lo =
+            vreinterpretq_s32_u32(vsubq_u32(blend_lo, round_offset));
+        const int32x4_t diff_hi =
+            vreinterpretq_s32_u32(vsubq_u32(blend_hi, round_offset));
+
+        const uint16x8_t out =
+            vcombine_u16(vqrshrun_n_s32(diff_lo, ROUND_SHIFT),
+                         vqrshrun_n_s32(diff_hi, ROUND_SHIFT));
+        vst1q_u16(dst_ptr + x, vminq_u16(out, pixel_max));
+
+        x += 8;
+      } while (x != w);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+      pred_ptr += pred_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  // w == 4: four pixels per row.
+  do {
+    const uint16x4_t cur = vld1_u16(src_ptr);
+    const uint16x4_t prev = vld1_u16(pred_ptr);
+
+    uint32x4_t blend = vmull_u16(prev, ref_weight);
+    blend = vmlal_u16(blend, cur, src_weight);
+    blend = vshrq_n_u32(blend, DIST_PRECISION_BITS);
+
+    const int32x4_t diff =
+        vreinterpretq_s32_u32(vsubq_u32(blend, round_offset));
+    const uint16x4_t out = vqrshrun_n_s32(diff, ROUND_SHIFT);
+
+    vst1_u16(dst_ptr, vmin_u16(out, vget_low_u16(pixel_max)));
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    pred_ptr += pred_stride;
+  } while (--h != 0);
+}
+
+// 6-tap multiply-accumulate over four 16-bit lanes, 12-bit path. The sum is
+// seeded with `offset`; taps come from lanes 1..6 of `filter` (lanes 0 and 7
+// are zero for these kernels, so they are never touched). The 32-bit result
+// is narrowed with an unsigned-saturating right shift of ROUND0_BITS + 2.
+static INLINE uint16x4_t highbd_12_convolve6_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);
+
+  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
+}
+
+// 6-tap multiply-accumulate over four 16-bit lanes for bit depths other than
+// 12. The sum is seeded with `offset`; taps come from lanes 1..6 of `filter`
+// (lanes 0 and 7 are zero for these kernels). The 32-bit result is narrowed
+// with an unsigned-saturating right shift of ROUND0_BITS.
+static INLINE uint16x4_t
+highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);
+
+  return vqshrun_n_s32(acc, ROUND0_BITS);
+}
+
+// 6-tap multiply-accumulate over eight 16-bit lanes, 12-bit path. The low and
+// high halves of every input are accumulated separately in 32 bits, each
+// seeded with `offset`. Taps come from lanes 1..6 of `filter` (lanes 0 and 7
+// are zero for these kernels). Both halves are narrowed with an
+// unsigned-saturating right shift of ROUND0_BITS + 2 and recombined.
+static INLINE uint16x8_t highbd_12_convolve6_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);
+
+  const uint16x4_t out_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS + 2);
+  const uint16x4_t out_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS + 2);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// 6-tap multiply-accumulate over eight 16-bit lanes for bit depths other than
+// 12. The low and high halves of every input are accumulated separately in
+// 32 bits, each seeded with `offset`, then narrowed with an
+// unsigned-saturating right shift of ROUND0_BITS and recombined.
+static INLINE uint16x8_t
+highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t filter, const int32x4_t offset) {
+  // Values at indices 0 and 7 of y_filter are zero.
+  const int16x4_t filter_0_3 = vget_low_s16(filter);
+  const int16x4_t filter_4_7 = vget_high_s16(filter);
+
+  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
+
+  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
+
+  // Both halves use the same symbolic shift: a bare literal on one half would
+  // silently diverge from the other if ROUND0_BITS were ever changed.
+  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
+                      vqshrun_n_s32(sum1, ROUND0_BITS));
+}
+
+// Horizontal 6-tap pass, 12-bit path. The block is processed in tiles of
+// 8 columns by 4 rows: for each of the four rows, six 8-lane vectors are
+// fetched via load_s16_8x6 with a stride of one element (presumably six
+// windows each shifted by one sample; confirm against mem_neon.h) and fed to
+// highbd_12_convolve6_8. The four results are stored with store_u16_8x4.
+// The loops only terminate when w is a multiple of 8 and h a multiple of 4.
+static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int16x8_t taps = vld1q_s16(x_filter_ptr);
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  int rows_left = h;
+
+  do {
+    const int16_t *in = (const int16_t *)src_ptr;
+    uint16_t *out = dst_ptr;
+    int cols_left = w;
+
+    do {
+      uint16x8_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x8_t win[6];
+        load_s16_8x6(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3], &win[4], &win[5]);
+        res[r] = highbd_12_convolve6_8(win[0], win[1], win[2], win[3], win[4],
+                                       win[5], taps, round_offset);
+      }
+
+      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 8;
+      out += 8;
+      cols_left -= 8;
+    } while (cols_left != 0);
+
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+// Horizontal 6-tap pass for bit depths other than 12. The block is processed
+// in tiles of 8 columns by 4 rows: for each row, six 8-lane vectors are
+// fetched via load_s16_8x6 with a stride of one element (presumably six
+// windows each shifted by one sample; confirm against mem_neon.h) and fed to
+// highbd_convolve6_8. The four results are stored with store_u16_8x4.
+// The loops only terminate when w is a multiple of 8 and h a multiple of 4.
+static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int16x8_t taps = vld1q_s16(x_filter_ptr);
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  int rows_left = h;
+
+  do {
+    const int16_t *in = (const int16_t *)src_ptr;
+    uint16_t *out = dst_ptr;
+    int cols_left = w;
+
+    do {
+      uint16x8_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x8_t win[6];
+        load_s16_8x6(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3], &win[4], &win[5]);
+        res[r] = highbd_convolve6_8(win[0], win[1], win[2], win[3], win[4],
+                                    win[5], taps, round_offset);
+      }
+
+      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 8;
+      out += 8;
+      cols_left -= 8;
+    } while (cols_left != 0);
+
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+// 8-tap multiply-accumulate over four 16-bit lanes, 12-bit path. The sum is
+// seeded with `offset`, all eight lanes of `filter` are used as taps, and the
+// 32-bit result is narrowed with an unsigned-saturating right shift of
+// ROUND0_BITS + 2.
+static INLINE uint16x4_t highbd_12_convolve8_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);
+
+  return vqshrun_n_s32(acc, ROUND0_BITS + 2);
+}
+
+// 8-tap multiply-accumulate over four 16-bit lanes for bit depths other than
+// 12. The sum is seeded with `offset`, all eight lanes of `filter` are used
+// as taps, and the 32-bit result is narrowed with an unsigned-saturating
+// right shift of ROUND0_BITS.
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x4_t s6, const int16x4_t s7,
+                   const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);
+
+  return vqshrun_n_s32(acc, ROUND0_BITS);
+}
+
+// 8-tap multiply-accumulate over eight 16-bit lanes, 12-bit path. The low and
+// high halves of every input are accumulated separately in 32 bits, each
+// seeded with `offset`, then narrowed with an unsigned-saturating right shift
+// of ROUND0_BITS + 2 and recombined into one 8-lane vector.
+static INLINE uint16x8_t highbd_12_convolve8_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
+    const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);
+
+  const uint16x4_t out_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS + 2);
+  const uint16x4_t out_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS + 2);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// 8-tap multiply-accumulate over eight 16-bit lanes for bit depths other than
+// 12. The low and high halves of every input are accumulated separately in
+// 32 bits, each seeded with `offset`, then narrowed with an
+// unsigned-saturating right shift of ROUND0_BITS and recombined.
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t s6, const int16x8_t s7,
+                   const int16x8_t filter, const int32x4_t offset) {
+  const int16x4_t taps_lo = vget_low_s16(filter);
+  const int16x4_t taps_hi = vget_high_s16(filter);
+
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);
+
+  const uint16x4_t out_lo = vqshrun_n_s32(acc_lo, ROUND0_BITS);
+  const uint16x4_t out_hi = vqshrun_n_s32(acc_hi, ROUND0_BITS);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// 4-tap multiply-accumulate over four 16-bit lanes, 12-bit path. The sum is
+// seeded with `offset`; s[0]..s[3] are multiplied by lanes 0..3 of x_filter.
+// The 32-bit result is narrowed with an unsigned-saturating right shift of
+// ROUND0_BITS + 2, the same amount the other highbd_12_* convolve helpers in
+// this file use.
+static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
+                                                 const int16x4_t x_filter,
+                                                 const int32x4_t offset) {
+  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
+  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+  // Spelled symbolically rather than as a bare literal so the shift tracks
+  // ROUND0_BITS.
+  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
+}
+
+// 4-tap multiply-accumulate over four 16-bit lanes for bit depths other than
+// 12. The sum is seeded with `offset`; s[0]..s[3] are multiplied by lanes
+// 0..3 of x_filter, and the 32-bit result is narrowed with an
+// unsigned-saturating right shift of ROUND0_BITS.
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+                                              const int16x4_t x_filter,
+                                              const int32x4_t offset) {
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s[0], x_filter, 0);
+  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
+  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
+  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);
+
+  return vqshrun_n_s32(acc, ROUND0_BITS);
+}
+
+// Horizontal pass, 12-bit path.
+//  * w == 4: a 4-tap kernel taken from x_filter_ptr[2..5] is applied, with
+//    the source pointer advanced by two samples to match; four rows are
+//    produced per iteration.
+//  * otherwise: the full 8-tap kernel is applied to tiles of 8 columns by
+//    4 rows.
+// In both cases the input windows are fetched with a stride of one element
+// (load_s16_4x4 / load_s16_8x8), and the loops only terminate when h is a
+// multiple of 4 (and w a multiple of 8 on the wide path).
+static INLINE void highbd_12_dist_wtd_convolve_x_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  int rows_left = h;
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t taps = vld1_s16(x_filter_ptr + 2);
+    const int16_t *in = (const int16_t *)(src_ptr + 2);
+    uint16_t *out = dst_ptr;
+
+    do {
+      uint16x4_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x4_t win[4];
+        load_s16_4x4(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3]);
+        res[r] = highbd_12_convolve4_4_x(win, taps, round_offset);
+      }
+
+      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 4 * src_stride;
+      out += 4 * dst_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+    return;
+  }
+
+  const int16x8_t taps = vld1q_s16(x_filter_ptr);
+
+  do {
+    const int16_t *in = (const int16_t *)src_ptr;
+    uint16_t *out = dst_ptr;
+    int cols_left = w;
+
+    do {
+      uint16x8_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x8_t win[8];
+        load_s16_8x8(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3], &win[4], &win[5], &win[6], &win[7]);
+        res[r] = highbd_12_convolve8_8(win[0], win[1], win[2], win[3], win[4],
+                                       win[5], win[6], win[7], taps,
+                                       round_offset);
+      }
+
+      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 8;
+      out += 8;
+      cols_left -= 8;
+    } while (cols_left != 0);
+
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+// Horizontal pass for bit depths other than 12.
+//  * w == 4: a 4-tap kernel taken from x_filter_ptr[2..5] is applied, with
+//    the source pointer advanced by two samples to match; four rows are
+//    produced per iteration.
+//  * otherwise: the full 8-tap kernel is applied to tiles of 8 columns by
+//    4 rows.
+// In both cases the input windows are fetched with a stride of one element
+// (load_s16_4x4 / load_s16_8x8), and the loops only terminate when h is a
+// multiple of 4 (and w a multiple of 8 on the wide path).
+static INLINE void highbd_dist_wtd_convolve_x_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *x_filter_ptr, const int offset) {
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  int rows_left = h;
+
+  if (w == 4) {
+    // 4-tap filters are used for blocks having width == 4.
+    const int16x4_t taps = vld1_s16(x_filter_ptr + 2);
+    const int16_t *in = (const int16_t *)(src_ptr + 2);
+    uint16_t *out = dst_ptr;
+
+    do {
+      uint16x4_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x4_t win[4];
+        load_s16_4x4(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3]);
+        res[r] = highbd_convolve4_4_x(win, taps, round_offset);
+      }
+
+      store_u16_4x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 4 * src_stride;
+      out += 4 * dst_stride;
+      rows_left -= 4;
+    } while (rows_left != 0);
+    return;
+  }
+
+  const int16x8_t taps = vld1q_s16(x_filter_ptr);
+
+  do {
+    const int16_t *in = (const int16_t *)src_ptr;
+    uint16_t *out = dst_ptr;
+    int cols_left = w;
+
+    do {
+      uint16x8_t res[4];
+
+      for (int r = 0; r < 4; ++r) {
+        int16x8_t win[8];
+        load_s16_8x8(in + r * src_stride, 1, &win[0], &win[1], &win[2],
+                     &win[3], &win[4], &win[5], &win[6], &win[7]);
+        res[r] = highbd_convolve8_8(win[0], win[1], win[2], win[3], win[4],
+                                    win[5], win[6], win[7], taps,
+                                    round_offset);
+      }
+
+      store_u16_8x4(out, dst_stride, res[0], res[1], res[2], res[3]);
+
+      in += 8;
+      out += 8;
+      cols_left -= 8;
+    } while (cols_left != 0);
+
+    src_ptr += 4 * src_stride;
+    dst_ptr += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+// Entry point for the high-bitdepth horizontal compound convolution.
+//
+// Step 1 runs the horizontal filter. Kernels reported by get_filter_tap as
+// having at most 6 taps use the 6-tap kernel (source advanced by one sample)
+// unless w == 4; everything else goes through the general routine. bd == 12
+// selects the highbd_12_* variants. The filter output goes to
+// conv_params->dst when conv_params->do_average is zero, otherwise to a
+// local scratch buffer.
+//
+// Step 2 runs only when do_average is set: the scratch buffer is combined
+// with conv_params->dst into `dst`, using the weighted average when
+// conv_params->use_dist_wtd_comp_avg is set and the plain average otherwise.
+void av1_highbd_dist_wtd_convolve_x_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+
+  const int use_6tap =
+      get_filter_tap(filter_params_x, subpel_x_qn) <= 6 && w != 4;
+  const int averaging = conv_params->do_average;
+
+  // Rounding/offset constants for the filter stage and the averaging stage.
+  const int avg_bits =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_avg = (1 << avg_bits) + (1 << (avg_bits - 1));
+  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
+                              (1 << (bd + FILTER_BITS)) +
+                              (1 << (bd + FILTER_BITS - 1));
+
+  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  // Destination of the filter stage: the compound buffer itself, or the
+  // scratch block when an averaging stage follows.
+  uint16_t *conv_out = conv_params->dst;
+  int conv_out_stride = conv_params->dst_stride;
+  if (averaging) {
+    conv_out = im_block;
+    conv_out_stride = MAX_SB_SIZE;
+  }
+
+  src -= filter_params_x->taps / 2 - 1;
+
+  if (bd == 12) {
+    // horizontal filter
+    if (use_6tap) {
+      highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, conv_out,
+                                              conv_out_stride, w, h,
+                                              x_filter_ptr, offset_convolve);
+    } else {
+      highbd_12_dist_wtd_convolve_x_neon(src, src_stride, conv_out,
+                                         conv_out_stride, w, h, x_filter_ptr,
+                                         offset_convolve);
+    }
+
+    if (!averaging) return;
+
+    if (conv_params->use_dist_wtd_comp_avg) {
+      highbd_12_dist_wtd_comp_avg_neon(im_block, MAX_SB_SIZE, dst, dst_stride,
+                                       w, h, conv_params, offset_avg, bd);
+    } else {
+      highbd_12_comp_avg_neon(im_block, MAX_SB_SIZE, dst, dst_stride, w, h,
+                              conv_params, offset_avg, bd);
+    }
+    return;
+  }
+
+  // horizontal filter
+  if (use_6tap) {
+    highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, conv_out,
+                                         conv_out_stride, w, h, x_filter_ptr,
+                                         offset_convolve);
+  } else {
+    highbd_dist_wtd_convolve_x_neon(src, src_stride, conv_out,
+                                    conv_out_stride, w, h, x_filter_ptr,
+                                    offset_convolve);
+  }
+
+  if (!averaging) return;
+
+  if (conv_params->use_dist_wtd_comp_avg) {
+    highbd_dist_wtd_comp_avg_neon(im_block, MAX_SB_SIZE, dst, dst_stride, w, h,
+                                  conv_params, offset_avg, bd);
+  } else {
+    highbd_comp_avg_neon(im_block, MAX_SB_SIZE, dst, dst_stride, w, h,
+                         conv_params, offset_avg, bd);
+  }
+}
+
+// Vertical 6-tap pass, 12-bit path. Five rows are loaded up front; every
+// iteration then loads four more rows, produces four output rows with the
+// 6-tap helper, and keeps the last five rows as the window for the next
+// iteration. w == 4 uses 4-lane vectors on a single column strip; other
+// widths are walked in 8-column strips. The loops only terminate when h is a
+// multiple of 4 (and w a multiple of 8 on the wide path).
+static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  const int16x8_t taps = vld1q_s16(y_filter_ptr);
+
+  if (w != 4) {
+    int col = 0;
+
+    do {
+      const int16_t *in = (const int16_t *)src_ptr + col;
+      uint16_t *out = dst_ptr + col;
+      int rows_left = h;
+
+      int16x8_t r0, r1, r2, r3, r4;
+      load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
+      in += 5 * src_stride;
+
+      do {
+        int16x8_t r5, r6, r7, r8;
+        load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+        const uint16x8_t o0 = highbd_12_convolve6_8(r0, r1, r2, r3, r4, r5,
+                                                    taps, round_offset);
+        const uint16x8_t o1 = highbd_12_convolve6_8(r1, r2, r3, r4, r5, r6,
+                                                    taps, round_offset);
+        const uint16x8_t o2 = highbd_12_convolve6_8(r2, r3, r4, r5, r6, r7,
+                                                    taps, round_offset);
+        const uint16x8_t o3 = highbd_12_convolve6_8(r3, r4, r5, r6, r7, r8,
+                                                    taps, round_offset);
+
+        store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
+
+        // Slide the window down by four rows.
+        r0 = r4;
+        r1 = r5;
+        r2 = r6;
+        r3 = r7;
+        r4 = r8;
+
+        in += 4 * src_stride;
+        out += 4 * dst_stride;
+        rows_left -= 4;
+      } while (rows_left != 0);
+
+      col += 8;
+    } while (col != w);
+    return;
+  }
+
+  // w == 4: single strip of four columns.
+  const int16_t *in = (const int16_t *)src_ptr;
+  uint16_t *out = dst_ptr;
+  int rows_left = h;
+
+  int16x4_t r0, r1, r2, r3, r4;
+  load_s16_4x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
+  in += 5 * src_stride;
+
+  do {
+    int16x4_t r5, r6, r7, r8;
+    load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+    const uint16x4_t o0 =
+        highbd_12_convolve6_4(r0, r1, r2, r3, r4, r5, taps, round_offset);
+    const uint16x4_t o1 =
+        highbd_12_convolve6_4(r1, r2, r3, r4, r5, r6, taps, round_offset);
+    const uint16x4_t o2 =
+        highbd_12_convolve6_4(r2, r3, r4, r5, r6, r7, taps, round_offset);
+    const uint16x4_t o3 =
+        highbd_12_convolve6_4(r3, r4, r5, r6, r7, r8, taps, round_offset);
+
+    store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
+
+    // Slide the window down by four rows.
+    r0 = r4;
+    r1 = r5;
+    r2 = r6;
+    r3 = r7;
+    r4 = r8;
+
+    in += 4 * src_stride;
+    out += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+// Vertical 6-tap pass for bit depths other than 12. Five rows are loaded up
+// front; every iteration then loads four more rows, produces four output
+// rows with the 6-tap helper, and keeps the last five rows as the window for
+// the next iteration. w == 4 uses 4-lane vectors on a single column strip;
+// other widths are walked in 8-column strips. The loops only terminate when
+// h is a multiple of 4 (and w a multiple of 8 on the wide path).
+static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
+    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+    int w, int h, const int16_t *y_filter_ptr, const int offset) {
+  const int32x4_t round_offset = vdupq_n_s32(offset);
+  const int16x8_t taps = vld1q_s16(y_filter_ptr);
+
+  if (w != 4) {
+    int col = 0;
+
+    do {
+      const int16_t *in = (const int16_t *)src_ptr + col;
+      uint16_t *out = dst_ptr + col;
+      int rows_left = h;
+
+      int16x8_t r0, r1, r2, r3, r4;
+      load_s16_8x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
+      in += 5 * src_stride;
+
+      do {
+        int16x8_t r5, r6, r7, r8;
+        load_s16_8x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+        const uint16x8_t o0 =
+            highbd_convolve6_8(r0, r1, r2, r3, r4, r5, taps, round_offset);
+        const uint16x8_t o1 =
+            highbd_convolve6_8(r1, r2, r3, r4, r5, r6, taps, round_offset);
+        const uint16x8_t o2 =
+            highbd_convolve6_8(r2, r3, r4, r5, r6, r7, taps, round_offset);
+        const uint16x8_t o3 =
+            highbd_convolve6_8(r3, r4, r5, r6, r7, r8, taps, round_offset);
+
+        store_u16_8x4(out, dst_stride, o0, o1, o2, o3);
+
+        // Slide the window down by four rows.
+        r0 = r4;
+        r1 = r5;
+        r2 = r6;
+        r3 = r7;
+        r4 = r8;
+
+        in += 4 * src_stride;
+        out += 4 * dst_stride;
+        rows_left -= 4;
+      } while (rows_left != 0);
+
+      col += 8;
+    } while (col != w);
+    return;
+  }
+
+  // w == 4: single strip of four columns.
+  const int16_t *in = (const int16_t *)src_ptr;
+  uint16_t *out = dst_ptr;
+  int rows_left = h;
+
+  int16x4_t r0, r1, r2, r3, r4;
+  load_s16_4x5(in, src_stride, &r0, &r1, &r2, &r3, &r4);
+  in += 5 * src_stride;
+
+  do {
+    int16x4_t r5, r6, r7, r8;
+    load_s16_4x4(in, src_stride, &r5, &r6, &r7, &r8);
+
+    const uint16x4_t o0 =
+        highbd_convolve6_4(r0, r1, r2, r3, r4, r5, taps, round_offset);
+    const uint16x4_t o1 =
+        highbd_convolve6_4(r1, r2, r3, r4, r5, r6, taps, round_offset);
+    const uint16x4_t o2 =
+        highbd_convolve6_4(r2, r3, r4, r5, r6, r7, taps, round_offset);
+    const uint16x4_t o3 =
+        highbd_convolve6_4(r3, r4, r5, r6, r7, r8, taps, round_offset);
+
+    store_u16_4x4(out, dst_stride, o0, o1, o2, o3);
+
+    // Slide the window down by four rows.
+    r0 = r4;
+    r1 = r5;
+    r2 = r6;
+    r3 = r7;
+    r4 = r8;
+
+    in += 4 * src_stride;
+    out += 4 * dst_stride;
+    rows_left -= 4;
+  } while (rows_left != 0);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+ // Vertical 8-tap pass: each step emits 4 output rows from an 11-row window.
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with 7 rows; every iteration then loads 4 more.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // exits only at exactly 0: h must be a multiple of 4
+ } else {
+ do {  // one 8-column strip per pass; w must be a multiple of 8
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Same 7-row priming as the 4-wide path, on 8 lanes.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);  // h must be a multiple of 4
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+ // Vertical 8-tap pass: each step emits 4 output rows from an 11-row window.
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with 7 rows; every iteration then loads 4 more.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // exits only at exactly 0: h must be a multiple of 4
+ } else {
+ do {  // one 8-column strip per pass; w must be a multiple of 8
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Same 7-row priming as the 4-wide path, on 8 lanes.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);  // h must be a multiple of 4
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_y_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {  // vertical-only highbd filter entry
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ int dst16_stride = conv_params->dst_stride;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
+ (1 << (bd + FILTER_BITS)) +
+ (1 << (bd + FILTER_BITS - 1));
+ // Kernel for the sub-pixel phase given by subpel_y_qn & SUBPEL_MASK.
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ // Step back to the first source row touched by the full-length filter.
+ src -= vert_offset * src_stride;
+ // bd == 12 picks the highbd_12_* kernels; 6-tap kernels start one row later.
+ if (bd == 12) {
+ if (conv_params->do_average) {  // stage in im_block, then comp-avg into dst
+ if (y_filter_taps <= 6) {
+ highbd_12_dist_wtd_convolve_y_6tap_neon(
+ src + src_stride, src_stride, im_block, im_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset_avg,
+ bd);
+ } else {
+ highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ } else {  // no averaging: filter straight into conv_params->dst
+ if (y_filter_taps <= 6) {
+ highbd_12_dist_wtd_convolve_y_6tap_neon(
+ src + src_stride, src_stride, dst16, dst16_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_12_dist_wtd_convolve_y_8tap_neon(
+ src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ }
+ } else {
+ if (conv_params->do_average) {  // stage in im_block, then comp-avg into dst
+ if (y_filter_taps <= 6) {
+ highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+ im_block, im_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ if (conv_params->use_dist_wtd_comp_avg) {
+ highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset_avg, bd);
+ } else {
+ highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ } else {  // no averaging: filter straight into conv_params->dst
+ if (y_filter_taps <= 6) {
+ highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
+ dst16, dst16_stride, w, h,
+ y_filter_ptr, round_offset_conv);
+ } else {
+ highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
+ dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv);
+ }
+ }
+ }
+}
+
+static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
+ uint16_t *dst_ptr, int dst_stride, int w,
+ int h, const int round_bits,
+ const int offset) {
+ if (w <= 4) {
+ const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
+ const uint16x4_t offset_u16 = vdup_n_u16(offset);
+ // Per row: d = (s << round_bits) + offset; loads 4 lanes even if w == 2.
+ for (int y = 0; y < h; ++y) {
+ const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
+ uint16x4_t d = vshl_u16(s, round_shift_s16);
+ d = vadd_u16(d, offset_u16);
+ if (w == 2) {  // NOTE(review): presumably stores only 2 pixels - confirm
+ store_u16_2x1(dst_ptr + y * dst_stride, d);
+ } else {
+ vst1_u16(dst_ptr + y * dst_stride, d);
+ }
+ }
+ } else {
+ const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
+ const uint16x8_t offset_u16 = vdupq_n_u16(offset);
+ // Wider blocks: same shift-and-add on 8 lanes, stepping x by 8 columns.
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; x += 8) {
+ const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
+ uint16x8_t d = vshlq_u16(s, round_shift_s16);
+ d = vaddq_u16(d, offset_u16);
+ vst1q_u16(dst_ptr + y * dst_stride + x, d);
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
+ int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ // No filter taps on this path: pixels are only shifted and offset.
+ const int im_stride = MAX_SB_SIZE;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ // When averaging, stage in im_block; otherwise write to conv_params->dst.
+ if (conv_params->do_average) {
+ highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
+ round_offset);
+ } else {
+ highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
+ round_offset);
+ }
+ // Averaging only: pass im_block and dst to the comp-avg helper for this bd.
+ if (conv_params->do_average) {
+ if (conv_params->use_dist_wtd_comp_avg) {
+ if (bd == 12) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset, bd);
+ } else {
+ highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset, bd);
+ }
+ } else {
+ if (bd == 12) {
+ highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset, bd);
+ } else {
+ highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset, bd);
+ }
+ }
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero, so only taps 1..6 are used.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // 32-bit accumulate per lane: offset + sum over i of s[i] * y_filter[i + 1].
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+ // Rounding right shift by COMPOUND_ROUND1_BITS, saturated into uint16 lanes.
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter, const int32x4_t offset) {
+ // Values at indices 0 and 7 of y_filter are zero, so only taps 1..6 are used.
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // Low 4 lanes: offset + sum over i of s[i] * y_filter[i + 1], in 32 bits.
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+ // High 4 lanes: same taps applied to the upper half of each input row.
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+ // Round-shift each half, saturate to uint16 and rejoin into 8 lanes.
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+ // Vertical 6-tap pass: each step emits 4 output rows from a 9-row window.
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with 5 rows; every iteration then loads 4 more.
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
+ uint16x4_t d1 =
+ highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
+ uint16x4_t d2 =
+ highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
+ uint16x4_t d3 =
+ highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 5 newest rows become s0..s4 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // exits only at exactly 0: h must be a multiple of 4
+ } else {
+ do {  // one 8-column strip per pass; w must be a multiple of 8
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Same 5-row priming as the 4-wide path, on 8 lanes.
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 5 newest rows become s0..s4 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);  // h must be a multiple of 4
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+ const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // 32-bit accumulate per lane: offset + sum over i of s[i] * y_filter[i].
+ int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ // Rounding right shift by COMPOUND_ROUND1_BITS, saturated into uint16 lanes.
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ const int32x4_t offset) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+ // Low 4 lanes: offset + sum over i of s[i] * y_filter[i], in 32 bits.
+ int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ // High 4 lanes: same taps applied to the upper half of each input row.
+ int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ // Round-shift each half, saturate to uint16 and rejoin into 8 lanes.
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int offset) {
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+ // Vertical 8-tap pass: each step emits 4 output rows from an 11-row window.
+ if (w <= 4) {  // 4-lane path (the 6-tap sibling tests w == 4 instead)
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with 7 rows; every iteration then loads 4 more.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // exits only at exactly 0: h must be a multiple of 4
+ } else {
+ do {  // one 8-column strip per pass; w must be a multiple of 8
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Same 7-row priming as the 4-wide path, on 8 lanes.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+ // Slide down 4 rows: the 7 newest rows become s0..s6 for the next step.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);  // h must be a multiple of 4
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ // The smallest block height is 4, and the horizontal convolution needs to
+ // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+ // Main loop: 4 rows per pass; it stops while 1..4 rows are still left.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Walk the rows in 8-pixel steps; w must be a multiple of 8.
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: one row per pass; both loops are do/while, hence h >= 5 above.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ // The smallest block height is 4, and the horizontal convolution needs to
+ // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ int height = h;
+ // Main loop: 4 rows per pass; it stops while 1..4 rows are still left.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Walk the rows in 8-pixel steps; w must be a multiple of 8.
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], x_filter, offset_vec);
+ uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], x_filter, offset_vec);
+ uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: one row per pass; both loops are do/while, hence h >= 5 above.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
+
+static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ // The smallest block height is 4, and the horizontal convolution needs to
+ // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
+ uint16_t *d = dst_ptr;
+ // Kernel entries 2..5 are loaded, and reading starts one pixel later.
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);  // leaves 1..4 rows for the tail loop
+ // Tail: one row per pass; both loops are do/while, hence h >= 5 above.
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+ uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+ // Main loop: 4 rows per pass; it stops while 1..4 rows are still left.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Walk the rows in 8-pixel steps; w must be a multiple of 8.
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x8_t d0 =
+ highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+ s0[6], s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
+ s1[6], s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
+ s2[6], s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
+ s3[6], s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: one row per pass, at least one row because this is a do/while.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+
+ uint16x8_t d0 =
+ highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
+ s0[6], s0[7], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, const int offset) {
+ // The smallest block height is 4, and the horizontal convolution needs to
+ // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
+ assert(h >= 5);
+ const int32x4_t offset_vec = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
+ uint16_t *d = dst_ptr;
+ // Kernel entries 2..5 are loaded, and reading starts one pixel later.
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+ uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
+ uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
+ uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);  // leaves 1..4 rows for the tail loop
+ // Tail: one row per pass; both loops are do/while, hence h >= 5 above.
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+ // Main loop: 4 rows per pass; it stops while 1..4 rows are still left.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Walk the rows in 8-pixel steps; w must be a multiple of 8.
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+ // Loads use stride 1, so sN[i] is row N shifted right by i pixels.
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+ s0[7], x_filter, offset_vec);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
+ s1[7], x_filter, offset_vec);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
+ s2[7], x_filter, offset_vec);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
+ s3[7], x_filter, offset_vec);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail: one row per pass, at least one row because this is a do/while.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
+ s0[7], x_filter, offset_vec);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ // im_block holds horizontal output; im_block2 vertical output if averaging.
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+ const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ // Tap counts below 6 are raised to 6; im_h adds the rows the V pass needs.
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = clamped_x_taps / 2 - 1;
+ // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
+ // faster non-rounding non-saturating left shift.
+ const int round_offset_conv_x =
+ (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
+ const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset_conv_y = (1 << y_offset_bits);
+ const int round_offset_avg =
+ ((1 << (y_offset_bits - conv_params->round_1)) +
+ (1 << (y_offset_bits - conv_params->round_1 - 1)));
+ // Start above and to the left of the block so both filter windows fit.
+ const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ // Horizontal pass into im_block; w == 4 always takes the generic kernel.
+ if (bd == 12) {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ } else {
+ highbd_12_dist_wtd_convolve_2d_horiz_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ }
+ } else {
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+ src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ } else {
+ highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ round_offset_conv_x);
+ }
+ }
+
+ // Vertical pass: into im_block2 when averaging, else into conv_params->dst.
+ if (y_filter_taps <= 6) {
+ if (conv_params->do_average) {
+ highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ } else {
+ highbd_dist_wtd_convolve_2d_vert_6tap_neon(
+ im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ }
+ } else {
+ if (conv_params->do_average) {
+ highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
+ im_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ } else {
+ highbd_dist_wtd_convolve_2d_vert_8tap_neon(
+ im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
+ round_offset_conv_y);
+ }
+ }
+
+ // Do the compound averaging outside the loop, avoids branching within the
+ // main loop
+ if (conv_params->do_average) {
+ if (conv_params->use_dist_wtd_comp_avg) {
+ if (bd == 12) {
+ highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
+ w, h, conv_params, round_offset_avg,
+ bd);
+ } else {
+ highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+ h, conv_params, round_offset_avg, bd);
+ }
+ } else {
+ if (bd == 12) {
+ highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ } else {
+ highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+ conv_params, round_offset_avg, bd);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c
new file mode 100644
index 0000000000..4f1c25d122
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+#define UPSCALE_NORMATIVE_TAPS 8  // int16 coefficients per kernel in x_filters (kernel pointer = x_filters + index * this)
+
+void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;  // the source pointer is moved this many samples to the left below
+
+ static const int32_t kIdx[4] = { 0, 1, 2, 3 };
+ const int32x4_t idx = vld1q_s32(kIdx);  // per-lane column number within a group of 4 outputs
+ const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
+ const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
+ const int32x4_t offset_s32 = vdupq_n_s32(0);
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);  // largest bd-bit value; outputs are clamped to it with vmin
+
+ const uint16_t *src_ptr = src - horiz_offset;
+ uint16_t *dst_ptr = dst;
+
+ if (w <= 4) {
+ int height = h;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int x_qn = x0_qn;  // never advanced in this branch: one group of 4 columns per row
+
+ // Load 4 src vectors at a time, they might be the same, but we have to
+ // calculate the indices anyway. Doing it in SIMD and then storing the
+ // indices is faster than having to calculate the expression
+ // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
+ // Ideally this should be a gather using the indices, but NEON does not
+ // have that, so we have to emulate it.
+ const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);  // x_qn + {0,1,2,3} * x_step_qn
+ // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
+ // 2
+ const int32x4_t src_idx =
+ vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+ // Similarly for the filter vector indices, we calculate the filter
+ // indices for 4 columns. First we calculate the indices:
+ // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+ // Then we calculate the actual pointers, multiplying with
+ // UPSCALE_NORMATIVE_TAPS and
+ // again shifting left by 1 (byte offsets, as sizeof(int16_t) = 2)
+ const int32x4_t x_filter4_idx = vshlq_n_s32(
+ vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
+ // Even though pointers are unsigned 32/64-bit ints we do signed
+ // addition. The reason for this is that x_qn can be negative, leading to
+ // negative offsets. Argon test
+ // profile0_core/streams/test10573_11003.obu was failing because of
+ // this.
+#if AOM_ARCH_AARCH64
+ uint64x2_t tmp4[2];
+ tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
+ vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
+ tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
+ vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;  // the 4 computed addresses are stored directly into the pointer array
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+ // filter vectors: x_filters + byte index * UPSCALE_NORMATIVE_TAPS, widened to 64 bits
+ tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+ tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint64_t *)&x_filter4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+ uint32x4_t tmp4;
+ tmp4 = vreinterpretq_u32_s32(
+ vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;  // pointers are 32 bits wide on this path, so one vector holds all 4
+ vst1q_u32(tmp_ptr, tmp4);
+
+ // filter vectors: x_filters + byte index * UPSCALE_NORMATIVE_TAPS
+ tmp4 = vreinterpretq_u32_s32(
+ vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+ vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint32_t *)&x_filter4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+#endif // AOM_ARCH_AARCH64
+ // Load source: 8 consecutive samples starting at each column's address
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+ // Run the 2D Scale convolution (helper is declared outside this file)
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, max);  // clamp to the bd-bit maximum
+
+ if (w == 2) {
+ store_u16_2x1(d, d0);
+ } else {
+ vst1_u16(d, d0);  // NOTE(review): stores 4 pixels for every w != 2 (including w == 1 or 3) - confirm dst allows this
+ }
+
+ src_ptr += src_stride;
+ d += dst_stride;
+ height--;
+ } while (height > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ int x_qn = x0_qn;
+ uint16_t *d = dst_ptr;
+ const uint16_t *s = src_ptr;  // row base; columns are reached through x_qn, so s is not advanced in the inner loop
+
+ do {
+ // Load 4 src vectors at a time, they might be the same, but we have to
+ // calculate the indices anyway. Doing it in SIMD and then storing the
+ // indices is faster than having to calculate the expression
+ // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times
+ // Ideally this should be a gather using the indices, but NEON does not
+ // have that, so we have to emulate it.
+ const int32x4_t xqn_idx =
+ vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+ // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
+ // = 2
+ const int32x4_t src_idx =
+ vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+
+ // Similarly for the filter vector indices, we calculate the filter
+ // indices for 4 columns. First we calculate the indices:
+ // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+ // Then we calculate the actual pointers, multiplying with
+ // UPSCALE_NORMATIVE_TAPS and
+ // again shifting left by 1 (byte offsets, as sizeof(int16_t) = 2)
+ const int32x4_t x_filter4_idx = vshlq_n_s32(
+ vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
+ 1);
+ // Even though pointers are unsigned 32/64-bit ints we do signed
+ // addition. The reason for this is that x_qn can be negative, leading to
+ // negative offsets. Argon test
+ // profile0_core/streams/test10573_11003.obu was failing because of
+ // this.
+#if AOM_ARCH_AARCH64
+ uint64x2_t tmp4[2];
+ tmp4[0] = vreinterpretq_u64_s64(
+ vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
+ tmp4[1] = vreinterpretq_u64_s64(
+ vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
+ int16_t *src4_ptr[4];
+ uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;  // the 4 computed addresses are stored directly into the pointer array
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+
+ // filter vectors: x_filters + byte index * UPSCALE_NORMATIVE_TAPS, widened to 64 bits
+ tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+ tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
+ vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
+ vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint64_t *)&x_filter4_ptr;
+ vst1q_u64(tmp_ptr, tmp4[0]);
+ vst1q_u64(tmp_ptr + 2, tmp4[1]);
+#else
+ uint32x4_t tmp4;
+ tmp4 = vreinterpretq_u32_s32(
+ vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
+ int16_t *src4_ptr[4];
+ uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;  // pointers are 32 bits wide on this path, so one vector holds all 4
+ vst1q_u32(tmp_ptr, tmp4);
+
+ // filter vectors: x_filters + byte index * UPSCALE_NORMATIVE_TAPS
+ tmp4 = vreinterpretq_u32_s32(
+ vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
+ vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));
+
+ const int16_t *x_filter4_ptr[4];
+ tmp_ptr = (uint32_t *)&x_filter4_ptr;
+ vst1q_u32(tmp_ptr, tmp4);
+#endif // AOM_ARCH_AARCH64
+
+ // Load source: 8 consecutive samples starting at each column's address
+ int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+ int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+ int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+ int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+ // Actually load the filters
+ const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+ const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+ const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+ const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+ // Group low and high parts and transpose
+ int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
+ vget_low_s16(x_filter1),
+ vget_low_s16(x_filter2),
+ vget_low_s16(x_filter3) };
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+ vget_high_s16(x_filter1),
+ vget_high_s16(x_filter2),
+ vget_high_s16(x_filter3) };
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+ // Run the 2D Scale X convolution (helper is declared outside this file)
+ uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+
+ d0 = vmin_u16(d0, max);  // clamp to the bd-bit maximum
+ vst1_u16(d, d0);  // NOTE(review): always stores 4 pixels; if w % 4 != 0 the last store passes column w - confirm callers allow it
+
+ x_qn += 4 * x_step_qn;  // advance the fixed-point source position by 4 output columns
+ d += 4;
+ width -= 4;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ height--;
+ } while (height > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_neon.c
new file mode 100644
index 0000000000..3a3e33fcba
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.c
@@ -0,0 +1,2120 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+static INLINE uint16x4_t
+highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x8_t y_filter) {
+ // Values at indices 0 and 7 of y_filter are zero, so only lanes 1..6 are applied (to s0..s5).
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1);  // widening multiply: products accumulate in 32 bits
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);  // rounding right shift, then saturating narrow to unsigned 16-bit
+}
+
+static INLINE uint16x8_t
+highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t y_filter) {
+ // Values at indices 0 and 7 of y_filter are zero, so only lanes 1..6 are applied (to s0..s5).
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1);  // sum0: lanes 0..3 of the 8-wide output
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1);  // sum1: lanes 4..7 of the 8-wide output
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),  // round, shift and saturate each half to u16
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_convolve_y_sr_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, const int bd) {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);  // largest bd-bit value; outputs are clamped to it
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);  // 8 coefficients are loaded; the 6-tap helpers use lanes 1..6
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)(src_ptr + src_stride);  // start one row down, matching the unused lane 0 of the kernel
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x4_t d0 =
+ highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+ uint16x4_t d1 =
+ highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+ uint16x4_t d2 =
+ highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+ uint16x4_t d3 =
+ highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // terminates only if h is a multiple of 4
+ } else {
+ // Width is a multiple of 8 and height is a multiple of 4. Columns are processed in strips of 8.
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)(src_ptr + src_stride);  // start one row down, matching the unused lane 0 of the kernel
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
+ uint16x8_t d1 =
+ highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
+ uint16x8_t d2 =
+ highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
+ uint16x8_t d3 =
+ highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+
+ src_ptr += 8;  // next strip of 8 columns
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_y(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);  // coefficients 0..3
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);  // coefficients 4..7
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);  // 8-tap sum over s0..s7, accumulated in 32 bits
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);  // rounding right shift, then saturating narrow to unsigned 16-bit
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_y(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);  // coefficients 0..3
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);  // coefficients 4..7
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);  // sum0: lanes 0..3 of the 8-wide output
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);  // sum1: lanes 4..7 of the 8-wide output
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),  // round, shift and saturate each half to u16
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_convolve_y_sr_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int bd) {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);  // largest bd-bit value; outputs are clamped to it
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);  // all 8 coefficients
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // terminates only if h is a multiple of 4
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x8_t d1 =
+ highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x8_t d2 =
+ highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x8_t d3 =
+ highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src_ptr += 8;  // next strip of 8 columns
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);  // terminates only if w is a multiple of 8
+ }
+}
+
+static INLINE uint16x4_t highbd_convolve12_4_y(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+ const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);  // coefficients 0..3
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);  // coefficients 4..7
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);  // 12-tap sum over s0..s11, accumulated in 32 bits
+ sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
+ sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
+ sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
+ sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
+ sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
+
+ return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);  // rounding right shift, then saturating narrow to unsigned 16-bit
+}
+
+static INLINE uint16x8_t highbd_convolve12_8_y(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+ const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+ const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);  // coefficients 0..3
+ const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);  // coefficients 4..7
+
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);  // sum0: lanes 0..3 of the 8-wide output
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);  // sum1: lanes 4..7 of the 8-wide output
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),  // round, shift and saturate each half to u16
+ vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
+}
+
+static INLINE void highbd_convolve_y_sr_12tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, int bd) {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);  // largest bd-bit value; outputs are clamped to it
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);  // coefficients 0..7
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);  // coefficients 8..11
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+
+ do {
+ int16x4_t s11, s12, s13, s14;
+ load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ uint16x4_t d0 =
+ highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d1 =
+ highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d2 =
+ highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11);
+ uint16x4_t d3 =
+ highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+ s14, y_filter_0_7, y_filter_8_11);
+
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);  // terminates only if h is a multiple of 4
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+
+ do {
+ int16x8_t s11, s12, s13, s14;
+ load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ uint16x8_t d0 =
+ highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
+ s11, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d1 =
+ highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
+ s12, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d2 =
+ highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, y_filter_0_7, y_filter_8_11);
+ uint16x8_t d3 =
+ highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+ s13, s14, y_filter_0_7, y_filter_8_11);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;  // slide the row window down by 4 for the next iteration
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+
+ src_ptr += 8;  // next strip of 8 columns
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);  // terminates only if w is a multiple of 8
+ }
+}
+
+void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ if (w == 2 || h == 2) {  // these sizes are delegated to the C implementation; the NEON paths below step 4 rows at a time
+ av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ src -= vert_offset * src_stride;  // move up vert_offset rows so the kernel window starts above the output row
+
+ if (y_filter_taps > 8) {  // more than 8 taps: 12-tap path
+ highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
+ y_filter_ptr, bd);
+ return;
+ }
+ if (y_filter_taps < 8) {  // every tap count below 8 takes the 6-tap path
+ highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h,
+ y_filter_ptr, bd);
+ return;
+ }
+
+ highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
+ y_filter_ptr, bd);
+}
+
+static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
+ const int16x8_t x_filter,
+ const int32x4_t offset) {
+ // Values at indices 0 and 7 of x_filter are zero, so only lanes 1..6 are applied (to s[0]..s[5]).
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
+
+ int32x4_t sum0 = offset;  // accumulator starts at the caller-supplied offset; sum0 covers output lanes 0..3
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
+
+ int32x4_t sum1 = offset;  // sum1 covers output lanes 4..7
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),  // round, shift by FILTER_BITS and saturate each half to u16
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE void highbd_convolve_x_sr_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+ int bd) {
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);  // largest bd-bit value; outputs are clamped to it
+ // This shim allows us to do only one rounding shift instead of two: the round_0 rounding term is pre-added to each sum.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
+
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6], s1[6], s2[6], s3[6];  // one array of 6 vectors (loaded with stride 1) per row; 4 rows per pass
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset);
+ uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset);
+ uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset);
+ uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;  // next 8 columns of the same 4 rows
+ d += 8;
+ width -= 8;
+ } while (width != 0);  // terminates only if w is a multiple of 8
+
+ src_ptr += 4 * src_stride;  // next group of 4 rows
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);  // terminates only if h is a multiple of 4
+}
+
+static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
+ const int16x4_t x_filter,
+ const int32x4_t offset) {
+ int32x4_t sum = offset;  // accumulator starts at the caller-supplied offset
+ sum = vmlal_lane_s16(sum, s[0], x_filter, 0);  // 4-tap sum: s[i] * x_filter lane i, widened to 32 bits
+ sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
+ sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
+ sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);  // rounding right shift by FILTER_BITS, then saturating narrow to unsigned 16-bit
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
+ const int16x8_t x_filter,
+ const int32x4_t offset) {
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);  // coefficients 0..3
+ const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);  // coefficients 4..7
+
+ int32x4_t sum0 = offset;  // accumulator starts at the caller-supplied offset; sum0 covers output lanes 0..3
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
+
+ int32x4_t sum1 = offset;  // sum1 covers output lanes 4..7
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),  // round, shift by FILTER_BITS and saturate each half to u16
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+// Horizontal single-reference convolution for high bit-depth pixels using a
+// 4-tap kernel when w == 4 and an 8-tap kernel for every other width.
+// Results are clamped to (1 << bd) - 1 before being stored to dst_ptr.
+// Both paths emit four rows per iteration and only stop when the row counter
+// reaches exactly zero, so h must be a positive multiple of 4; the wide path
+// likewise needs w to be a positive multiple of 8.
+static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
+ int src_stride, uint16_t *dst_ptr,
+ int dst_stride, int w, int h,
+ const int16_t *x_filter_ptr,
+ ConvolveParams *conv_params,
+ int bd) {
+ // Largest pixel value representable at this bit depth; used as the clamp.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ // This shim allows us to do only one rounding shift instead of two: the
+ // rounding term for round_0 is pre-added to the accumulator.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width == 4.
+ // The four taps are read from x_filter_ptr[2..5], and the source pointer is
+ // advanced by 2 samples to line up with them.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 2);
+ uint16_t *d = dst_ptr;
+
+ do {
+ // One array per row; load_s16_4x4 is called with a stride of 1, presumably
+ // yielding vectors that start at consecutive sample offsets (helper is
+ // defined outside this chunk).
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset);
+ uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset);
+ uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset);
+ uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Advance to the next group of four rows.
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ // 8-tap path: all eight coefficients are used, starting at src_ptr itself.
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Walk across the current four rows in 8-pixel wide tiles.
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset);
+ uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset);
+ uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset);
+ uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ // Step down to the next group of four rows.
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+// 12-tap horizontal filter producing four output pixels.
+// Taps 0-7 arrive in x_filter_0_7 and taps 8-11 in x_filter_8_11; s[k] is
+// multiplied by tap k. Accumulation is done in 32 bits on top of `offset`,
+// followed by a rounding, saturating, unsigned-narrowing right shift by
+// FILTER_BITS.
+static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
+                                               const int16x8_t x_filter_0_7,
+                                               const int16x4_t x_filter_8_11,
+                                               const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t acc = vmlal_lane_s16(offset, s[0], taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s[1], taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s[2], taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s[3], taps_0_3, 3);
+
+  acc = vmlal_lane_s16(acc, s[4], taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s[5], taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s[6], taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s[7], taps_4_7, 3);
+
+  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
+  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
+  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
+  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);
+
+  const uint16x4_t narrowed = vqrshrun_n_s32(acc, FILTER_BITS);
+  return narrowed;
+}
+
+// 12-tap horizontal filter producing eight output pixels.
+// Taps 0-7 arrive in x_filter_0_7 and taps 8-11 in x_filter_8_11; s[k] is
+// multiplied by tap k. The low and high halves of the source vectors are
+// accumulated independently in 32 bits starting from `offset`, then each half
+// gets a rounding, saturating, unsigned-narrowing right shift by FILTER_BITS.
+static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
+                                               const int16x8_t x_filter_0_7,
+                                               const int16x4_t x_filter_8_11,
+                                               const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = vmlal_lane_s16(offset, vget_low_s16(s[0]), taps_0_3, 0);
+  int32x4_t acc_hi = vmlal_lane_s16(offset, vget_high_s16(s[0]), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_0_3, 3);
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), taps_4_7, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), taps_4_7, 3);
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[8]), x_filter_8_11, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[8]), x_filter_8_11, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[9]), x_filter_8_11, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[9]), x_filter_8_11, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[10]), x_filter_8_11, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[10]), x_filter_8_11, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[11]), x_filter_8_11, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, FILTER_BITS);
+  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, FILTER_BITS);
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Horizontal single-reference convolution for high bit-depth pixels using a
+// 12-tap kernel. Results are clamped to (1 << bd) - 1 before being stored.
+// w == 4 is handled with 4-lane vectors; every other width is processed in
+// 8-pixel wide tiles. Four rows are produced per iteration and the loops only
+// stop when their counters reach exactly zero, so h must be a positive
+// multiple of 4 and (for w != 4) w a positive multiple of 8.
+static INLINE void highbd_convolve_x_sr_12tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+ int bd) {
+ // Largest pixel value representable at this bit depth; used as the clamp.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ // This shim allows us to do only one rounding shift instead of two: the
+ // rounding term for round_0 is pre-added to the accumulator.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
+ // The 12 coefficients are split into an 8-lane and a 4-lane vector.
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ // One array of 12 vectors per row; load_s16_4x12 is called with a stride
+ // of 1, presumably yielding vectors that start at consecutive sample
+ // offsets (helper is defined outside this chunk).
+ int16x4_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x4_t d0 =
+ highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d1 =
+ highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d2 =
+ highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset);
+ uint16x4_t d3 =
+ highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Advance to the next group of four rows.
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Walk across the current four rows in 8-pixel wide tiles.
+ do {
+ int16x8_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x8_t d0 =
+ highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d1 =
+ highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d2 =
+ highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset);
+ uint16x8_t d3 =
+ highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ // Step down to the next group of four rows.
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ }
+}
+
+// Entry point for the high bit-depth horizontal single-reference convolution.
+// Blocks with a width or height of 2 are forwarded to the C implementation.
+// Otherwise the source pointer is moved back by (taps / 2 - 1) samples and
+// the work is handed to the 12-tap, 6-tap or generic 4/8-tap NEON routine,
+// chosen from the tap count reported by get_filter_tap() and the block width.
+void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const int subpel_x_qn,
+                                   ConvolveParams *conv_params, int bd) {
+  const int needs_c_fallback = (w == 2) || (h == 2);
+  if (needs_c_fallback) {
+    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params, bd);
+    return;
+  }
+
+  const int num_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  // Rewind to the first sample covered by the filter window.
+  const uint16_t *const src_start = src - (filter_params_x->taps / 2 - 1);
+
+  if (num_taps > 8) {
+    highbd_convolve_x_sr_12tap_neon(src_start, src_stride, dst, dst_stride, w,
+                                    h, kernel, conv_params, bd);
+  } else if (num_taps <= 6 && w != 4) {
+    // The 6-tap routine is handed a source pointer one sample further on.
+    highbd_convolve_x_sr_6tap_neon(src_start + 1, src_stride, dst, dst_stride,
+                                   w, h, kernel, conv_params, bd);
+  } else {
+    highbd_convolve_x_sr_neon(src_start, src_stride, dst, dst_stride, w, h,
+                              kernel, conv_params, bd);
+  }
+}
+
+// 6-tap vertical filter for the second pass of the 2D convolution, producing
+// four output pixels. Rows s0..s5 are multiplied by lanes 1..6 of y_filter
+// and accumulated in 32 bits on top of `offset`. The total is shifted by the
+// per-lane counts in round_shift (vshlq_s32 shifts right for negative counts)
+// and narrowed to unsigned 16 bits with saturation.
+static INLINE uint16x4_t highbd_convolve6_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x8_t y_filter, const int32x4_t round_shift,
+    const int32x4_t offset) {
+  // Only lanes 1..6 are used: the outermost taps of y_filter are zero.
+  const int16x4_t taps_lo = vget_low_s16(y_filter);
+  const int16x4_t taps_hi = vget_high_s16(y_filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);
+
+  return vqmovun_s32(vshlq_s32(acc, round_shift));
+}
+
+// 6-tap vertical filter for the second pass of the 2D convolution, producing
+// eight output pixels. Rows s0..s5 are multiplied by lanes 1..6 of y_filter;
+// the low and high halves of each row are accumulated independently in 32
+// bits starting from `offset`. Each half is shifted by the per-lane counts in
+// round_shift (vshlq_s32 shifts right for negative counts) and narrowed to
+// unsigned 16 bits with saturation.
+static INLINE uint16x8_t highbd_convolve6_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t y_filter, const int32x4_t round_shift,
+    const int32x4_t offset) {
+  // Only lanes 1..6 are used: the outermost taps of y_filter are zero.
+  const int16x4_t taps_lo = vget_low_s16(y_filter);
+  const int16x4_t taps_hi = vget_high_s16(y_filter);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);
+
+  const uint16x4_t out_lo = vqmovun_s32(vshlq_s32(acc_lo, round_shift));
+  const uint16x4_t out_hi = vqmovun_s32(vshlq_s32(acc_hi, round_shift));
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Vertical pass of the 2D single-reference convolution with a 6-tap kernel.
+// Reads 16-bit samples from src_ptr (reinterpreted as signed), filters them
+// down the columns, clamps to (1 << bd) - 1 and stores to dst_ptr.
+// `offset` seeds every accumulator and conv_params->round_1 is applied as a
+// right shift (passed negated to the shift-left intrinsic in the helpers).
+// Rows are kept in a sliding window: five rows are preloaded, then each
+// iteration loads four more and emits four output rows. The loops only stop
+// when their counters hit exactly zero, so h must be a positive multiple of 4
+// and, for w != 4, w a positive multiple of 8.
+static INLINE void highbd_convolve_2d_sr_vert_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
+ int bd, const int offset) {
+ // Largest pixel value representable at this bit depth; used as the clamp.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+ const int round1_shift = conv_params->round_1;
+ // Negated so that the shift-left intrinsic performs a right shift.
+ const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with the first five rows.
+ int16x4_t s0, s1, s2, s3, s4;
+ load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x4_t s5, s6, s7, s8;
+ load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ // Each output row uses six consecutive input rows.
+ uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last five loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ // Process the block in 8-pixel wide column strips, each over its full height.
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ // Prime the window with the first five rows of this strip.
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+ s += 5 * src_stride;
+
+ do {
+ int16x8_t s5, s6, s7, s8;
+ load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
+
+ uint16x8_t d0 = highbd_convolve6_8_2d_v(
+ s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve6_8_2d_v(
+ s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve6_8_2d_v(
+ s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve6_8_2d_v(
+ s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last five loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ // Move right to the next 8-pixel wide strip.
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+// 8-tap vertical filter for the second pass of the 2D convolution, producing
+// four output pixels. Rows s0..s7 are multiplied by taps 0..7 of y_filter and
+// accumulated in 32 bits on top of `offset`. The total is shifted by the
+// per-lane counts in round_shift (vshlq_s32 shifts right for negative counts)
+// and narrowed to unsigned 16 bits with saturation.
+static INLINE uint16x4_t highbd_convolve8_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
+  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
+
+  return vqmovun_s32(vshlq_s32(acc, round_shift));
+}
+
+// 8-tap vertical filter for the second pass of the 2D convolution, producing
+// eight output pixels. Rows s0..s7 are multiplied by taps 0..7 of y_filter;
+// the low and high halves of each row are accumulated independently in 32
+// bits starting from `offset`. Each half is shifted by the per-lane counts in
+// round_shift (vshlq_s32 shifts right for negative counts) and narrowed to
+// unsigned 16 bits with saturation.
+static INLINE uint16x8_t highbd_convolve8_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);
+
+  const uint16x4_t out_lo = vqmovun_s32(vshlq_s32(acc_lo, round_shift));
+  const uint16x4_t out_hi = vqmovun_s32(vshlq_s32(acc_hi, round_shift));
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Vertical pass of the 2D single-reference convolution with an 8-tap kernel.
+// Reads 16-bit samples from src_ptr (reinterpreted as signed), filters them
+// down the columns, clamps to (1 << bd) - 1 and stores to dst_ptr.
+// `offset` seeds every accumulator and conv_params->round_1 is applied as a
+// right shift (passed negated to the shift-left intrinsic in the helpers).
+// Rows are kept in a sliding window: seven rows are preloaded, then each
+// iteration loads four more and emits four output rows. The loops only stop
+// when their counters hit exactly zero, so h must be a positive multiple of 4
+// and, for w != 4, w a positive multiple of 8.
+static INLINE void highbd_convolve_2d_sr_vert_8tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
+ int bd, const int offset) {
+ // Largest pixel value representable at this bit depth; used as the clamp.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+ const int round1_shift = conv_params->round_1;
+ // Negated so that the shift-left intrinsic performs a right shift.
+ const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first seven rows.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ // Each output row uses eight consecutive input rows.
+ uint16x4_t d0 =
+ highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d1 =
+ highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d2 =
+ highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x4_t d3 =
+ highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last seven loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ // Process the block in 8-pixel wide column strips, each over its full height.
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first seven rows of this strip.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round1_shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last seven loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ // Move right to the next 8-pixel wide strip.
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+// 12-tap vertical filter for the second pass of the 2D convolution, producing
+// four output pixels. Rows s0..s11 are multiplied by taps 0..11 (taps 0-7 in
+// y_filter_0_7, taps 8-11 in y_filter_8_11) and accumulated in 32 bits on top
+// of `offset`. The total is shifted by the per-lane counts in round_shift
+// (vshlq_s32 shifts right for negative counts) and narrowed to unsigned 16
+// bits with saturation.
+static INLINE uint16x4_t highbd_convolve12_4_2d_v(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
+    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
+
+  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
+
+  acc = vmlal_lane_s16(acc, s8, y_filter_8_11, 0);
+  acc = vmlal_lane_s16(acc, s9, y_filter_8_11, 1);
+  acc = vmlal_lane_s16(acc, s10, y_filter_8_11, 2);
+  acc = vmlal_lane_s16(acc, s11, y_filter_8_11, 3);
+
+  return vqmovun_s32(vshlq_s32(acc, round_shift));
+}
+
+// 12-tap vertical filter for the second pass of the 2D convolution, producing
+// eight output pixels. Rows s0..s11 are multiplied by taps 0..11 (taps 0-7 in
+// y_filter_0_7, taps 8-11 in y_filter_8_11); the low and high halves of each
+// row are accumulated independently in 32 bits starting from `offset`. Each
+// half is shifted by the per-lane counts in round_shift (vshlq_s32 shifts
+// right for negative counts) and narrowed to unsigned 16 bits with saturation.
+static INLINE uint16x8_t highbd_convolve12_8_2d_v(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
+    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
+    const int32x4_t round_shift, const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter_0_7);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);
+
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s8), y_filter_8_11, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s8), y_filter_8_11, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s9), y_filter_8_11, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s9), y_filter_8_11, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s10), y_filter_8_11, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s10), y_filter_8_11, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s11), y_filter_8_11, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s11), y_filter_8_11, 3);
+
+  const uint16x4_t out_lo = vqmovun_s32(vshlq_s32(acc_lo, round_shift));
+  const uint16x4_t out_hi = vqmovun_s32(vshlq_s32(acc_hi, round_shift));
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Vertical pass of the 2D single-reference convolution with a 12-tap kernel.
+// Reads 16-bit samples from src_ptr (reinterpreted as signed), filters them
+// down the columns, clamps to (1 << bd) - 1 and stores to dst_ptr.
+// `offset` seeds every accumulator and conv_params->round_1 is applied as a
+// right shift (passed negated to the shift-left intrinsic in the helpers).
+// Rows are kept in a sliding window: eleven rows are preloaded, then each
+// iteration loads four more and emits four output rows. The loops only stop
+// when their counters hit exactly zero, so h must be a positive multiple of 4
+// and, for w != 4, w a positive multiple of 8.
+static INLINE void highbd_convolve_2d_sr_vert_12tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
+ const int bd, const int offset) {
+ // Largest pixel value representable at this bit depth; used as the clamp.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ // The 12 coefficients are split into an 8-lane and a 4-lane vector.
+ const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
+ const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+ const int round1_shift = conv_params->round_1;
+ // Negated so that the shift-left intrinsic performs a right shift.
+ const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first eleven rows.
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+
+ do {
+ int16x4_t s11, s12, s13, s14;
+ load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ // Each output row uses twelve consecutive input rows.
+ uint16x4_t d0 = highbd_convolve12_4_2d_v(
+ s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve12_4_2d_v(
+ s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve12_4_2d_v(
+ s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve12_4_2d_v(
+ s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vmin_u16(d0, vget_low_u16(max));
+ d1 = vmin_u16(d1, vget_low_u16(max));
+ d2 = vmin_u16(d2, vget_low_u16(max));
+ d3 = vmin_u16(d3, vget_low_u16(max));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last eleven loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ // Process the block in 8-pixel wide column strips, each over its full height.
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Prime the window with the first eleven rows of this strip.
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9, &s10);
+ s += 11 * src_stride;
+
+ do {
+ int16x8_t s11, s12, s13, s14;
+ load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);
+
+ uint16x8_t d0 = highbd_convolve12_8_2d_v(
+ s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve12_8_2d_v(
+ s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve12_8_2d_v(
+ s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve12_8_2d_v(
+ s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
+ y_filter_8_11, round1_shift_s32, offset_s32);
+
+ // Clamp to the bit-depth maximum.
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ // Slide the window down by four rows, keeping the last eleven loaded.
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s7 = s11;
+ s8 = s12;
+ s9 = s13;
+ s10 = s14;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+
+ // Move right to the next 8-pixel wide strip.
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+// 6-tap horizontal filter for the first pass of the 2D convolution, producing
+// eight output pixels. s[0]..s[5] are multiplied by lanes 1..6 of x_filter;
+// the low and high halves are accumulated independently in 32 bits starting
+// from `offset`. Each half then gets a saturating rounding shift by the
+// per-lane counts in shift_s32 (negative counts shift right) and is narrowed
+// to unsigned 16 bits with saturation.
+static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
+                                                 const int16x8_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  // Only lanes 1..6 are used: values at indices 0 and 7 of x_filter are zero.
+  const int16x4_t taps_lo = vget_low_s16(x_filter);
+  const int16x4_t taps_hi = vget_high_s16(x_filter);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_hi, 2);
+
+  const uint16x4_t out_lo = vqmovun_s32(vqrshlq_s32(acc_lo, shift_s32));
+  const uint16x4_t out_hi = vqmovun_s32(vqrshlq_s32(acc_hi, shift_s32));
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Horizontal pass of the 2D single-reference convolution with a 6-tap kernel.
+// Filters h rows of w pixels from src_ptr into dst_ptr in 8-pixel wide tiles
+// (there is no 4-wide path here, and the width loop only stops at exactly
+// zero, so w must be a positive multiple of 8). `offset` seeds every
+// accumulator and conv_params->round_0 is applied as a rounding right shift.
+// Rows are processed four at a time while more than four remain; the
+// remaining one to four rows are then processed one at a time.
+static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+ const int offset) {
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
+ // Negated so that the rounding shift-left intrinsic performs a right shift.
+ const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ // Main loop: four rows per iteration. It exits with 1..4 rows still pending.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ // One array of six vectors per row; load_s16_8x6 is called with a stride
+ // of 1, presumably yielding vectors that start at consecutive sample
+ // offsets (helper is defined outside this chunk).
+ int16x8_t s0[6], s1[6], s2[6], s3[6];
+ load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5]);
+ load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5]);
+ load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5]);
+ load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5]);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+ // Tail loop: finish the remaining rows one at a time.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[6];
+ load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
+
+ uint16x8_t d0 =
+ highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+}
+
+// 4-tap horizontal filter for the first pass of the 2D convolution, producing
+// four output pixels. s[k] is multiplied by lane k of x_filter and
+// accumulated in 32 bits on top of `offset`. The total gets a saturating
+// rounding shift by the per-lane counts in shift_s32 (negative counts shift
+// right) and is narrowed to unsigned 16 bits with saturation.
+static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
+                                                 const int16x4_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s[0], x_filter, 0);
+  acc = vmlal_lane_s16(acc, s[1], x_filter, 1);
+  acc = vmlal_lane_s16(acc, s[2], x_filter, 2);
+  acc = vmlal_lane_s16(acc, s[3], x_filter, 3);
+
+  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
+}
+
+// 8-tap horizontal filter for the first pass of the 2D convolution, producing
+// eight output pixels. s[k] is multiplied by tap k of x_filter; the low and
+// high halves are accumulated independently in 32 bits starting from
+// `offset`. Each half gets a saturating rounding shift by the per-lane counts
+// in shift_s32 (negative counts shift right) and is narrowed to unsigned 16
+// bits with saturation.
+static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
+                                                 const int16x8_t x_filter,
+                                                 const int32x4_t shift_s32,
+                                                 const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter);
+
+  // acc_lo covers output lanes 0-3, acc_hi covers output lanes 4-7.
+  int32x4_t acc_lo = offset;
+  int32x4_t acc_hi = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_0_3, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), taps_4_7, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), taps_4_7, 3);
+
+  const uint16x4_t out_lo = vqmovun_s32(vqrshlq_s32(acc_lo, shift_s32));
+  const uint16x4_t out_hi = vqmovun_s32(vqrshlq_s32(acc_hi, shift_s32));
+  return vcombine_u16(out_lo, out_hi);
+}
+
+// Horizontal pass of the 2D single-reference convolution. Filters h rows of w
+// pixels starting at src_ptr and writes the results to dst_ptr. A width of 4
+// takes a 4-tap path; every other width takes an 8-tap path that produces
+// eight pixels per step. `offset` is the starting value of every accumulator.
+// In both paths rows are consumed four at a time while more than four remain,
+// and the last one to four rows are then handled individually.
+static INLINE void highbd_convolve_2d_sr_horiz_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+ const int offset) {
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
+ // Negated so that the vqrshlq_s32 in the helpers shifts right by round_0.
+ const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ // 4-tap filters are used for blocks having width <= 4.
+ // The four taps are read from kernel elements 2..5 and the source window
+ // starts one element after src_ptr.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
+ const int16_t *s = (const int16_t *)(src_ptr + 1);
+ uint16_t *d = dst_ptr;
+
+ // Four rows per iteration; exits with 1..4 rows still to do.
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ // presumably each call loads four 4-lane vectors at successive
+ // one-element offsets within a row — TODO confirm against mem_neon.h
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x4_t d1 =
+ highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x4_t d2 =
+ highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x4_t d3 =
+ highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ // Remaining rows, one per iteration.
+ do {
+ int16x4_t s0[4];
+ load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
+
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ // 8-tap path: all eight kernel elements are used and the source window
+ // starts at src_ptr.
+ // NOTE(review): the inner width loops only reach zero when w is a multiple
+ // of 8 — presumably guaranteed by the caller.
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+ int height = h;
+
+ // Four rows per iteration; exits with 1..4 rows still to do.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ uint16x8_t d1 =
+ highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
+ uint16x8_t d2 =
+ highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
+ uint16x8_t d3 =
+ highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ // Remaining rows, one per iteration.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+// 12-tap horizontal filter for four output pixels. Taps 0..7 arrive packed in
+// x_filter_0_7 and taps 8..11 in x_filter_8_11; s[k] is multiplied by tap k and
+// accumulated on top of `offset`. The sum is shifted by `shift_s32` with
+// rounding and saturation and narrowed to unsigned 16 bits with saturation.
+static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
+                                                  const int16x8_t x_filter_0_7,
+                                                  const int16x4_t x_filter_8_11,
+                                                  const int32x4_t shift_s32,
+                                                  const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s[0], taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s[1], taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s[2], taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s[3], taps_0_3, 3);
+  acc = vmlal_lane_s16(acc, s[4], taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s[5], taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s[6], taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s[7], taps_4_7, 3);
+  acc = vmlal_lane_s16(acc, s[8], x_filter_8_11, 0);
+  acc = vmlal_lane_s16(acc, s[9], x_filter_8_11, 1);
+  acc = vmlal_lane_s16(acc, s[10], x_filter_8_11, 2);
+  acc = vmlal_lane_s16(acc, s[11], x_filter_8_11, 3);
+
+  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
+}
+
+// 12-tap horizontal filter for eight output pixels. Taps 0..7 arrive packed in
+// x_filter_0_7 and taps 8..11 in x_filter_8_11; s[k] is multiplied by tap k and
+// accumulated on top of `offset`, with the low and high halves of the vectors
+// kept in two independent 4-lane accumulators. The sums are shifted by
+// `shift_s32` with rounding and saturation and narrowed to unsigned 16 bits
+// with saturation.
+static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
+                                                  const int16x8_t x_filter_0_7,
+                                                  const int16x4_t x_filter_8_11,
+                                                  const int32x4_t shift_s32,
+                                                  const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t taps_4_7 = vget_high_s16(x_filter_0_7);
+
+  // Output pixels 0..3.
+  int32x4_t acc_lo = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[0]), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[1]), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[2]), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[3]), taps_0_3, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[4]), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[5]), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[6]), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[7]), taps_4_7, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[8]), x_filter_8_11, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[9]), x_filter_8_11, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[10]), x_filter_8_11, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s[11]), x_filter_8_11, 3);
+
+  // Output pixels 4..7.
+  int32x4_t acc_hi = offset;
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[0]), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[1]), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[2]), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[3]), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[4]), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[5]), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[6]), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[7]), taps_4_7, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[8]), x_filter_8_11, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[9]), x_filter_8_11, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[10]), x_filter_8_11, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s[11]), x_filter_8_11, 3);
+
+  const uint16x4_t res_lo = vqmovun_s32(vqrshlq_s32(acc_lo, shift_s32));
+  const uint16x4_t res_hi = vqmovun_s32(vqrshlq_s32(acc_hi, shift_s32));
+
+  return vcombine_u16(res_lo, res_hi);
+}
+
+// Horizontal pass of the 2D single-reference convolution for 12-tap kernels.
+// Filters h rows of w pixels starting at src_ptr and writes the results to
+// dst_ptr. A width of 4 produces four pixels per row step; every other width
+// produces eight pixels per step. `offset` is the starting value of every
+// accumulator. Rows are consumed four at a time while more than four remain,
+// and the last one to four rows are then handled individually.
+static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
+ const int offset) {
+ // The smallest block height processed by the SIMD functions is 4, and the
+ // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
+ // for the vertical convolution.
+ assert(h >= 5);
+ // Negated so that the vqrshlq_s32 in the helpers shifts right by round_0.
+ const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+ // The 12 taps are split across an 8-lane and a 4-lane vector.
+ const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+ const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+ const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ // Four rows per iteration; exits with 1..4 rows still to do.
+ do {
+ int16x4_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+ uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 4);
+
+ // Remaining rows, one per iteration.
+ do {
+ int16x4_t s0[12];
+ load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
+ &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);
+
+ uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
+ shift_s32, offset_s32);
+
+ vst1_u16(d, d0);
+
+ s += src_stride;
+ d += dst_stride;
+ } while (--h != 0);
+ } else {
+ // NOTE(review): the first inner width loop below only reaches zero when w
+ // is a multiple of 8 — presumably guaranteed by the caller.
+ int height = h;
+
+ // Four rows per iteration; exits with 1..4 rows still to do.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[12], s1[12], s2[12], s3[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+ load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+ &s1[11]);
+ load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
+ &s2[11]);
+ load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
+ &s3[11]);
+
+ uint16x8_t d0 = highbd_convolve12_8_2d_h(
+ s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d1 = highbd_convolve12_8_2d_h(
+ s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d2 = highbd_convolve12_8_2d_h(
+ s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ uint16x8_t d3 = highbd_convolve12_8_2d_h(
+ s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 4);
+
+ // Remaining rows, one per iteration.
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0[12];
+ load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+ &s0[11]);
+
+ uint16x8_t d0 = highbd_convolve12_8_2d_h(
+ s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
+ vst1q_u16(d, d0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ // NOTE(review): this loop tests width > 0 while the four-row loop above
+ // tests width != 0; the two agree whenever w is a multiple of 8.
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ }
+}
+
+// 2D single-reference high-bitdepth convolution. Runs a horizontal pass from
+// src into an intermediate buffer and then a vertical pass from that buffer
+// into dst, choosing the 6-, 8- or 12-tap routines from the tap counts of the
+// two kernels. Blocks with w == 2 or h == 2 are delegated to the C
+// implementation.
+void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ if (w == 2 || h == 2) {
+ av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params, bd);
+ return;
+ }
+ // Intermediate buffer written by the horizontal pass and read by the
+ // vertical pass; rows are MAX_SB_SIZE elements apart (im_stride below).
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ // Tap counts below 6 are treated as 6 when sizing the intermediate block and
+ // computing the source offsets.
+ const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+ const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
+
+ const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+ const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+ // The horizontal pass produces h + taps - 1 rows for the vertical pass.
+ const int im_h = h + clamped_y_taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = clamped_y_taps / 2 - 1;
+ const int horiz_offset = clamped_x_taps / 2 - 1;
+ // Starting accumulator value for the horizontal pass.
+ const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
+ const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
+ // simple shift left instead of a rounding saturating shift left.
+ const int y_offset =
+ (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
+
+ // Step back vert_offset rows and horiz_offset columns from the block origin
+ // so the filter windows start before the first output pixel.
+ const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+ // A horizontal kernel with more than 8 taps selects the 12-tap routines for
+ // both passes.
+ if (x_filter_taps > 8) {
+ highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ conv_params, x_offset_initial);
+
+ highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
+ return;
+ }
+ // The 6-tap horizontal routine is only used for widths other than 4; width 4
+ // always goes through the general routine.
+ if (x_filter_taps <= 6 && w != 4) {
+ highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
+ im_stride, w, im_h, x_filter_ptr,
+ conv_params, x_offset_initial);
+ } else {
+ highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ w, im_h, x_filter_ptr, conv_params,
+ x_offset_initial);
+ }
+
+ if (y_filter_taps <= 6) {
+ highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
+ } else {
+ highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_ptr, conv_params, bd,
+ y_offset);
+ }
+}
+
+// Horizontal-only IntraBC filter. The 2-tap kernel is [64, 64], so each output
+// pixel is the rounded average of a source pixel and its right-hand neighbour.
+// The filter, sub-pixel position, rounding parameters and bit depth are only
+// checked by assertions and otherwise unused.
+void av1_highbd_convolve_x_sr_intrabc_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)conv_params;
+  (void)bd;
+
+  if (w > 4) {
+    // Wide blocks: eight pixels per step across each row.
+    do {
+      const uint16_t *row_in = src;
+      uint16_t *row_out = dst;
+      int remaining = w;
+
+      do {
+        const uint16x8_t left = vld1q_u16(row_in);
+        const uint16x8_t right = vld1q_u16(row_in + 1);
+
+        vst1q_u16(row_out, vrhaddq_u16(left, right));
+
+        row_in += 8;
+        row_out += 8;
+        remaining -= 8;
+      } while (remaining != 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  // Narrow blocks (w <= 4): one 4-lane vector per row, of which only two
+  // pixels are stored when w == 2.
+  do {
+    const uint16x4_t left = vld1_u16(src);
+    const uint16x4_t right = vld1_u16(src + 1);
+    const uint16x4_t avg = vrhadd_u16(left, right);
+
+    if (w == 2) {
+      store_u16_2x1(dst, avg);
+    } else {
+      vst1_u16(dst, avg);
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// Vertical-only IntraBC filter. The 2-tap kernel is [64, 64], so each output
+// pixel is the rounded average of a source pixel and the pixel one row below
+// it. The filter, sub-pixel position and bit depth are only checked by
+// assertions and otherwise unused.
+void av1_highbd_convolve_y_sr_intrabc_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)bd;
+
+  if (w > 4) {
+    // Wide blocks: walk down one 8-pixel-wide column strip at a time.
+    do {
+      const uint16_t *col_in = src;
+      uint16_t *col_out = dst;
+      int rows_left = h;
+
+      do {
+        const uint16x8_t upper = vld1q_u16(col_in);
+        const uint16x8_t lower = vld1q_u16(col_in + src_stride);
+
+        vst1q_u16(col_out, vrhaddq_u16(upper, lower));
+
+        col_in += src_stride;
+        col_out += dst_stride;
+      } while (--rows_left != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+    return;
+  }
+
+  // Narrow blocks (w <= 4): one 4-lane vector per row, of which only two
+  // pixels are stored when w == 2.
+  do {
+    const uint16x4_t upper = vld1_u16(src);
+    const uint16x4_t lower = vld1_u16(src + src_stride);
+    const uint16x4_t avg = vrhadd_u16(upper, lower);
+
+    if (w == 2) {
+      store_u16_2x1(dst, avg);
+    } else {
+      vst1_u16(dst, avg);
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
+// The horizontal pass stores the plain sum of each pixel and its right-hand
+// neighbour for h + 1 rows into an intermediate buffer; the vertical pass then
+// combines vertically adjacent sums with two halving adds and writes h rows to
+// dst. The filter parameters, sub-pixel positions, rounding parameters and bit
+// depth are only checked by assertions and otherwise unused.
+void av1_highbd_convolve_2d_sr_intrabc_neon(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)conv_params;
+ (void)bd;
+
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ // One extra row is needed because the vertical pass reads rows r and r + 1.
+ int im_h = h + 1;
+ int im_stride = MAX_SB_SIZE;
+
+ // Constant 1 consumed by the second halving add of the vertical pass.
+ uint16x8_t vert_offset = vdupq_n_u16(1);
+
+ uint16_t *im = im_block;
+
+ // Horizontal filter.
+ // im_h is used as the row counter and is zero once this pass completes.
+ if (w <= 4) {
+ do {
+ uint16x4_t s0 = vld1_u16(src);
+ uint16x4_t s1 = vld1_u16(src + 1);
+
+ // Unshifted sum; the division by four happens in the vertical pass.
+ uint16x4_t d0 = vadd_u16(s0, s1);
+
+ // Safe to store the whole vector, the im buffer is big enough.
+ vst1_u16(im, d0);
+
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ } else {
+ do {
+ const uint16_t *src_ptr = src;
+ uint16_t *im_ptr = im;
+ int width = w;
+
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 1);
+
+ uint16x8_t d0 = vaddq_u16(s0, s1);
+
+ vst1q_u16(im_ptr, d0);
+
+ src_ptr += 8;
+ im_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ src += src_stride;
+ im += im_stride;
+ } while (--im_h != 0);
+ }
+
+ // Rewind to the first intermediate row for the vertical pass.
+ im = im_block;
+
+ // Vertical filter.
+ if (w <= 4) {
+ do {
+ uint16x4_t s0 = vld1_u16(im);
+ uint16x4_t s1 = vld1_u16(im + im_stride);
+
+ // Two halving adds give (((s0 + s1) >> 1) + 1) >> 1.
+ uint16x4_t d0 = vhadd_u16(s0, s1);
+ d0 = vhadd_u16(d0, vget_low_u16(vert_offset));
+
+ if (w == 2) {
+ store_u16_2x1(dst, d0);
+ } else {
+ vst1_u16(dst, d0);
+ }
+
+ im += im_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ // Wide blocks are processed one 8-pixel-wide column strip at a time; w is
+ // used as the strip counter.
+ do {
+ uint16_t *im_ptr = im;
+ uint16_t *dst_ptr = dst;
+ int height = h;
+
+ do {
+ uint16x8_t s0 = vld1q_u16(im_ptr);
+ uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
+
+ // Two halving adds give (((s0 + s1) >> 1) + 1) >> 1.
+ uint16x8_t d0 = vhaddq_u16(s0, s1);
+ d0 = vhaddq_u16(d0, vert_offset);
+
+ vst1q_u16(dst_ptr, d0);
+
+ im_ptr += im_stride;
+ dst_ptr += dst_stride;
+ } while (--height != 0);
+ im += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.h b/third_party/aom/av1/common/arm/highbd_convolve_neon.h
new file mode 100644
index 0000000000..08b2bda4e5
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/convolve.h"
+
+// 8-tap filter for four pixels, returned as unshifted 32-bit accumulators:
+// offset + s0 * y_filter[0] + s1 * y_filter[1] + ... + s7 * y_filter[7].
+static INLINE int32x4_t highbd_convolve8_4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t offset) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t acc = offset;
+  acc = vmlal_lane_s16(acc, s0, taps_0_3, 0);
+  acc = vmlal_lane_s16(acc, s1, taps_0_3, 1);
+  acc = vmlal_lane_s16(acc, s2, taps_0_3, 2);
+  acc = vmlal_lane_s16(acc, s3, taps_0_3, 3);
+  acc = vmlal_lane_s16(acc, s4, taps_4_7, 0);
+  acc = vmlal_lane_s16(acc, s5, taps_4_7, 1);
+  acc = vmlal_lane_s16(acc, s6, taps_4_7, 2);
+  acc = vmlal_lane_s16(acc, s7, taps_4_7, 3);
+
+  return acc;
+}
+
+// 8-tap filter for four pixels followed by a saturating rounding shift by
+// `shift_s32` and a saturating narrow to unsigned 16 bits.
+static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t shift_s32, const int32x4_t offset) {
+  const int32x4_t acc =
+      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
+
+  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
+}
+
+// 8-tap filter for four pixels: the accumulated sum is shifted by
+// `round_shift` with rounding and saturation, `correction` is subtracted from
+// the shifted value, and the result is narrowed to unsigned 16 bits with
+// saturation.
+static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset,
+    const int32x4_t correction) {
+  const int32x4_t acc =
+      highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset);
+  const int32x4_t shifted = vqrshlq_s32(acc, round_shift);
+
+  return vqmovun_s32(vsubq_s32(shifted, correction));
+}
+
+// 8-tap filter for eight pixels, returned as unshifted 32-bit accumulators.
+// *sum0 receives the results for the low four lanes of the inputs and *sum1
+// the results for the high four lanes; each is
+// offset + s0 * y_filter[0] + ... + s7 * y_filter[7].
+static INLINE void highbd_convolve8_8_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) {
+  const int16x4_t taps_0_3 = vget_low_s16(y_filter);
+  const int16x4_t taps_4_7 = vget_high_s16(y_filter);
+
+  int32x4_t acc_lo = offset;
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_0_3, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_0_3, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_0_3, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_0_3, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_4_7, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_4_7, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_4_7, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_4_7, 3);
+
+  int32x4_t acc_hi = offset;
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_0_3, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_0_3, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_0_3, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_0_3, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_4_7, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_4_7, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_4_7, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_4_7, 3);
+
+  *sum0 = acc_lo;
+  *sum1 = acc_hi;
+}
+
+// 8-tap filter for eight pixels: each accumulated sum is shifted by
+// `round_shift` with rounding and saturation, `correction` is subtracted from
+// the shifted value, and the results are narrowed to unsigned 16 bits with
+// saturation.
+static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+    const int32x4_t round_shift, const int32x4_t offset,
+    const int32x4_t correction) {
+  int32x4_t acc_lo;
+  int32x4_t acc_hi;
+  highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset,
+                         &acc_lo, &acc_hi);
+
+  const int32x4_t out_lo =
+      vsubq_s32(vqrshlq_s32(acc_lo, round_shift), correction);
+  const int32x4_t out_hi =
+      vsubq_s32(vqrshlq_s32(acc_hi, round_shift), correction);
+
+  return vcombine_u16(vqmovun_s32(out_lo), vqmovun_s32(out_hi));
+}
+
+// Splits each of the four 8-lane inputs into its low and high halves,
+// transposes the two resulting groups of four vectors in place, and
+// accumulates the element-wise products with filters_lo[0..3] and
+// filters_hi[0..3] on top of `offset`. Returns the unshifted 32-bit sums.
+// NOTE(review): presumably each output lane is the 8-tap result for one of
+// s0..s3, with filters_lo/filters_hi supplied in the matching transposed
+// layout — confirm against the caller.
+static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x4_t *filters_lo,
+ const int16x4_t *filters_hi, const int32x4_t offset) {
+ int16x4_t s_lo[] = { vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2),
+ vget_low_s16(s3) };
+ int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2),
+ vget_high_s16(s3) };
+
+ // The transpose helper takes unsigned vectors, so the arrays are passed
+ // through a pointer cast.
+ transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo);
+ transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi);
+
+ // Element-wise multiply-accumulate (not by-lane): every lane uses its own
+ // filter element.
+ int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]);
+ sum = vmlal_s16(sum, s_lo[1], filters_lo[1]);
+ sum = vmlal_s16(sum, s_lo[2], filters_lo[2]);
+ sum = vmlal_s16(sum, s_lo[3], filters_lo[3]);
+ sum = vmlal_s16(sum, s_hi[0], filters_hi[0]);
+ sum = vmlal_s16(sum, s_hi[1], filters_hi[1]);
+ sum = vmlal_s16(sum, s_hi[2], filters_hi[2]);
+ sum = vmlal_s16(sum, s_hi[3], filters_hi[3]);
+
+ return sum;
+}
+
+// Runs highbd_convolve8_2d_scale_horiz4x8_s32 on the inputs, then shifts the
+// sums by `shift_s32` with rounding and saturation and narrows them to
+// unsigned 16 bits with saturation.
+static INLINE uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x4_t *filters_lo,
+    const int16x4_t *filters_hi, const int32x4_t shift_s32,
+    const int32x4_t offset) {
+  const int32x4_t acc = highbd_convolve8_2d_scale_horiz4x8_s32(
+      s0, s1, s2, s3, filters_lo, filters_hi, offset);
+
+  return vqmovun_s32(vqrshlq_s32(acc, shift_s32));
+}
+
+#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c
new file mode 100644
index 0000000000..702c651536
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+// Distance-weighted compound average. For every pixel of the w x h block it
+// computes (ref * fwd_offset + src * bck_offset) >> DIST_PRECISION_BITS, where
+// ref is read from conv_params->dst, subtracts `offset`, applies a rounding
+// right shift by round_bits, and clamps the result to [0, (1 << bd) - 1]
+// before storing it to dst_ptr.
+static INLINE void highbd_dist_wtd_comp_avg_neon(
+ const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+ int w, int h, ConvolveParams *conv_params, const int round_bits,
+ const int offset, const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ // Negated so that vqrshlq_s32 shifts right by round_bits.
+ const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+ const uint32x4_t offset_vec = vdupq_n_u32(offset);
+ // Largest pixel value representable at this bit depth.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
+ uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
+
+ // Weighted averaging
+ if (w <= 4) {
+ do {
+ // A full 4-lane vector is loaded even when w == 2.
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
+ wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
+ wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
+ // The difference is formed modulo 2^32 and then viewed as signed.
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ // Saturating narrow maps negative values to 0; vmin applies the upper
+ // bound.
+ uint16x4_t d0_u16 = vqmovun_s32(d0);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ if (w == 2) {
+ store_u16_2x1(dst_ptr, d0_u16);
+ } else {
+ vst1_u16(dst_ptr, d0_u16);
+ }
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ } else {
+ // Wide blocks: eight pixels per step, handled as two 4-lane halves.
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
+ wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
+ wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
+ wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
+ wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
+ int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
+ d1 = vqrshlq_s32(d1, round_shift);
+
+ uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
+ d01 = vminq_u16(d01, max);
+ vst1q_u16(dst, d01);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+ }
+}
+
+// Plain compound average. For every pixel of the w x h block it computes the
+// truncating average (src + ref) >> 1, where ref is read from
+// conv_params->dst, subtracts `offset`, applies a rounding right shift by
+// round_bits, and clamps the result to [0, (1 << bd) - 1] before storing it to
+// dst_ptr.
+static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
+ uint16_t *dst_ptr, int dst_stride,
+ int w, int h,
+ ConvolveParams *conv_params,
+ const int round_bits, const int offset,
+ const int bd) {
+ CONV_BUF_TYPE *ref_ptr = conv_params->dst;
+ const int ref_stride = conv_params->dst_stride;
+ // Negated so that vqrshlq_s32 shifts right by round_bits.
+ const int32x4_t round_shift = vdupq_n_s32(-round_bits);
+ // NOTE(review): offset is narrowed to 16 bits here; presumably callers pass
+ // a value that fits — confirm.
+ const uint16x4_t offset_vec = vdup_n_u16(offset);
+ // Largest pixel value representable at this bit depth.
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w <= 4) {
+ do {
+ // A full 4-lane vector is loaded even when w == 2.
+ const uint16x4_t src = vld1_u16(src_ptr);
+ const uint16x4_t ref = vld1_u16(ref_ptr);
+
+ uint16x4_t avg = vhadd_u16(src, ref);
+ // Widening subtract modulo 2^32, then viewed as signed.
+ int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
+ d0 = vqrshlq_s32(d0, round_shift);
+
+ // Saturating narrow maps negative values to 0; vmin applies the upper
+ // bound.
+ uint16x4_t d0_u16 = vqmovun_s32(d0);
+ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
+
+ if (w == 2) {
+ store_u16_2x1(dst_ptr, d0_u16);
+ } else {
+ vst1_u16(dst_ptr, d0_u16);
+ }
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ } else {
+ // Wide blocks: eight pixels per step, handled as two 4-lane halves.
+ do {
+ int width = w;
+ const uint16_t *src = src_ptr;
+ const uint16_t *ref = ref_ptr;
+ uint16_t *dst = dst_ptr;
+ do {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vhaddq_u16(s, r);
+ int32x4_t d0_lo =
+ vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
+ int32x4_t d0_hi =
+ vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
+ d0_lo = vqrshlq_s32(d0_lo, round_shift);
+ d0_hi = vqrshlq_s32(d0_hi, round_shift);
+
+ uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi));
+ d0 = vminq_u16(d0, max);
+ vst1q_u16(dst, d0);
+
+ src += 8;
+ ref += 8;
+ dst += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+ // Produces four horizontally filtered output pixels for one source row.
+ //
+ // Output pixel k (k = 0..3) sits at fixed-point position
+ // x_qn + k * x_step_qn. The bits above SCALE_SUBPEL_BITS select the first of
+ // the eight consecutive samples read from `s`; the bits kept by `subpel_mask`,
+ // shifted down by SCALE_EXTRA_BITS, select the filter kernel.
+ //
+ // `idx` must hold { 0, 1, 2, 3 } and `subpel_mask` must hold
+ // SCALE_SUBPEL_MASK in every lane; both are parameters so that the caller
+ // builds them once. `shift_s32` and `offset_s32` are forwarded unchanged to
+ // highbd_convolve8_2d_scale_horiz4x8_s32_s16().
+ static INLINE uint16x4_t highbd_convolve_2d_x_scale_8tap_4px_neon(
+     const uint16_t *s, const int x_qn, const int x_step_qn,
+     const InterpFilterParams *filter_params, const uint32x4_t idx,
+     const uint32x4_t subpel_mask, const int32x4_t shift_s32,
+     const int32x4_t offset_s32) {
+   // The four source vectors might be the same, but the indices have to be
+   // calculated anyway. Doing it in SIMD and then storing the indices is
+   // faster than evaluating
+   //   &s[((x_qn + k * x_step_qn) >> SCALE_SUBPEL_BITS)]
+   // four times. Ideally this would be a gather, but NEON does not have one,
+   // so it is emulated.
+   const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
+   // Multiply the sample index by 2 to get a byte offset, as
+   // sizeof(uint16_t) == 2.
+   const uint32x4_t src_idx_u32 =
+       vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
+   const int16_t *src4_ptr[4];
+#if AOM_ARCH_AARCH64
+   uint64x2_t src4[2];
+   src4[0] =
+       vaddw_u32(vdupq_n_u64((const uint64_t)s), vget_low_u32(src_idx_u32));
+   src4[1] =
+       vaddw_u32(vdupq_n_u64((const uint64_t)s), vget_high_u32(src_idx_u32));
+   uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
+   vst1q_u64(tmp_ptr, src4[0]);
+   vst1q_u64(tmp_ptr + 2, src4[1]);
+#else
+   uint32x4_t src4;
+   src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
+   uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
+   vst1q_u32(tmp_ptr, src4);
+#endif  // AOM_ARCH_AARCH64
+   // Same for the filter indices.
+   const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
+       vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
+   int32_t x_filter4_idx[4];
+   vst1q_s32(x_filter4_idx, filter_idx_s32);
+   const int16_t *x_filter4_ptr[4];
+
+   // Load source
+   int16x8_t s0 = vld1q_s16(src4_ptr[0]);
+   int16x8_t s1 = vld1q_s16(src4_ptr[1]);
+   int16x8_t s2 = vld1q_s16(src4_ptr[2]);
+   int16x8_t s3 = vld1q_s16(src4_ptr[3]);
+
+   // We could easily do this using SIMD as well instead of calling the
+   // inline function 4 times.
+   x_filter4_ptr[0] =
+       av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
+   x_filter4_ptr[1] =
+       av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
+   x_filter4_ptr[2] =
+       av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
+   x_filter4_ptr[3] =
+       av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
+
+   // Actually load the filters
+   const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
+   const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
+   const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
+   const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
+
+   // Group low and high parts and transpose
+   int16x4_t filters_lo[] = { vget_low_s16(x_filter0), vget_low_s16(x_filter1),
+                              vget_low_s16(x_filter2),
+                              vget_low_s16(x_filter3) };
+   int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
+                              vget_high_s16(x_filter1),
+                              vget_high_s16(x_filter2),
+                              vget_high_s16(x_filter3) };
+   transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
+   transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
+
+   // Run the 2D Scale X convolution
+   return highbd_convolve8_2d_scale_horiz4x8_s32_s16(
+       s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
+ }
+
+ // Horizontal pass of the scaled 2D convolution.
+ //
+ // For each of the `h` rows starting at `src_ptr` (row pitch `src_stride`),
+ // writes filtered pixels to `dst_ptr` (row pitch `dst_stride`). The sampling
+ // position starts at `subpel_x_qn` on every row and advances by `x_step_qn`
+ // per output pixel. `conv_params->round_0` is negated and used as the shift
+ // count, and `offset` is broadcast into the offset vector; both go to the
+ // per-4-pixel helper.
+ //
+ // When w <= 4 one group of four pixels is computed per row, and only two of
+ // them are stored if w == 2. Otherwise pixels are produced and stored four at
+ // a time until at least `w` columns have been written.
+ static INLINE void highbd_convolve_2d_x_scale_8tap_neon(
+     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+     int w, int h, const int subpel_x_qn, const int x_step_qn,
+     const InterpFilterParams *filter_params, ConvolveParams *conv_params,
+     const int offset) {
+   static const uint32_t kIdx[4] = { 0, 1, 2, 3 };
+   const uint32x4_t idx = vld1q_u32(kIdx);
+   const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
+   const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
+   const int32x4_t offset_s32 = vdupq_n_s32(offset);
+
+   if (w <= 4) {
+     int height = h;
+     uint16_t *d = dst_ptr;
+
+     do {
+       const uint16x4_t d0 = highbd_convolve_2d_x_scale_8tap_4px_neon(
+           src_ptr, subpel_x_qn, x_step_qn, filter_params, idx, subpel_mask,
+           shift_s32, offset_s32);
+
+       if (w == 2) {
+         store_u16_2x1(d, d0);
+       } else {
+         vst1_u16(d, d0);
+       }
+
+       src_ptr += src_stride;
+       d += dst_stride;
+       height--;
+     } while (height > 0);
+   } else {
+     int height = h;
+
+     do {
+       int width = w;
+       int x_qn = subpel_x_qn;
+       uint16_t *d = dst_ptr;
+       const uint16_t *s = src_ptr;
+
+       do {
+         const uint16x4_t d0 = highbd_convolve_2d_x_scale_8tap_4px_neon(
+             s, x_qn, x_step_qn, filter_params, idx, subpel_mask, shift_s32,
+             offset_s32);
+
+         vst1_u16(d, d0);
+
+         x_qn += 4 * x_step_qn;
+         d += 4;
+         width -= 4;
+       } while (width > 0);
+
+       src_ptr += src_stride;
+       dst_ptr += dst_stride;
+       height--;
+     } while (height > 0);
+   }
+ }
+
+ // Vertical pass of the scaled 2D convolution.
+ //
+ // For every output row the sampling position starts at `subpel_y_qn` and
+ // advances by `y_step_qn`. Its bits above SCALE_SUBPEL_BITS pick the first
+ // source row and its masked sub-pel bits pick the filter kernel.
+ // `round1_bits` is negated and passed as the shift vector, and `1 << offset`
+ // is passed as the offset vector to the highbd_convolve8_*_srsub_s32_s16
+ // helpers. Blocks with w <= 4 are handled as one 4-wide column (two pixels
+ // stored when w == 2); wider blocks are handled in 8-wide columns.
+ static INLINE void highbd_convolve_2d_y_scale_8tap_neon(
+     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+     int w, int h, const int subpel_y_qn, const int y_step_qn,
+     const InterpFilterParams *filter_params, const int round1_bits,
+     const int offset) {
+   const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
+   const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
+   const int32x4_t zero_s32 = vdupq_n_s32(0);
+
+   if (w > 4) {
+     int cols_left = w;
+
+     do {
+       uint16_t *out = dst_ptr;
+       int y_pos = subpel_y_qn;
+       int rows_left = h;
+
+       do {
+         const int16_t *rows =
+             (const int16_t *)&src_ptr[(y_pos >> SCALE_SUBPEL_BITS) *
+                                       src_stride];
+         int16x8_t r0, r1, r2, r3, r4, r5, r6, r7;
+         load_s16_8x8(rows, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+         const int kernel_idx = (y_pos & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+         const int16x8_t kernel = vld1q_s16(
+             av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx));
+
+         vst1q_u16(out, highbd_convolve8_8_srsub_s32_s16(
+                            r0, r1, r2, r3, r4, r5, r6, r7, kernel,
+                            round1_shift_s32, offset_s32, zero_s32));
+
+         y_pos += y_step_qn;
+         out += dst_stride;
+       } while (--rows_left > 0);
+
+       // Move on to the next 8-wide column.
+       src_ptr += 8;
+       dst_ptr += 8;
+       cols_left -= 8;
+     } while (cols_left > 0);
+     return;
+   }
+
+   // Narrow blocks: a single 4-wide column.
+   uint16_t *out = dst_ptr;
+   int y_pos = subpel_y_qn;
+   int rows_left = h;
+
+   do {
+     const int16_t *rows =
+         (const int16_t *)&src_ptr[(y_pos >> SCALE_SUBPEL_BITS) * src_stride];
+     int16x4_t r0, r1, r2, r3, r4, r5, r6, r7;
+     load_s16_4x8(rows, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+     const int kernel_idx = (y_pos & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+     const int16x8_t kernel = vld1q_s16(
+         av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx));
+
+     const uint16x4_t px = highbd_convolve8_4_srsub_s32_s16(
+         r0, r1, r2, r3, r4, r5, r6, r7, kernel, round1_shift_s32, offset_s32,
+         zero_s32);
+
+     if (w == 2) {
+       store_u16_2x1(out, px);
+     } else {
+       vst1_u16(out, px);
+     }
+
+     y_pos += y_step_qn;
+     out += dst_stride;
+   } while (--rows_left > 0);
+ }
+
+ // Removes the rounding offset from an intermediate block and writes pixels.
+ //
+ // Each source sample is read as int16, has `offset` subtracted with widening
+ // to 32 bits, is shifted by -round_bits with vqrshlq_s32, narrowed with
+ // unsigned saturation, and finally limited to (1 << bd) - 1. Blocks with
+ // w <= 4 process one 4-lane vector per row (two pixels stored when w == 2);
+ // wider blocks process 8 pixels per step.
+ static INLINE void highbd_convolve_correct_offset_neon(
+     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
+     int w, int h, const int round_bits, const int offset, const int bd) {
+   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+   const int16x4_t offset_s16 = vdup_n_s16(offset);
+   const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
+   const int16_t *const src_s16 = (const int16_t *)src_ptr;
+
+   if (w > 4) {
+     for (int row = 0; row < h; ++row) {
+       for (int col = 0; col < w; col += 8) {
+         const int16x8_t px = vld1q_s16(src_s16 + row * src_stride + col);
+         // Subtract round offset and convolve round
+         const int32x4_t lo = vsubl_s16(vget_low_s16(px), offset_s16);
+         const int32x4_t hi = vsubl_s16(vget_high_s16(px), offset_s16);
+         const uint16x4_t lo_u16 = vqmovun_s32(vqrshlq_s32(lo, round_shift_s32));
+         const uint16x4_t hi_u16 = vqmovun_s32(vqrshlq_s32(hi, round_shift_s32));
+         const uint16x8_t clipped = vminq_u16(vcombine_u16(lo_u16, hi_u16), max);
+         vst1q_u16(dst_ptr + row * dst_stride + col, clipped);
+       }
+     }
+     return;
+   }
+
+   const uint16x4_t max_lo = vget_low_u16(max);
+   for (int row = 0; row < h; ++row) {
+     uint16_t *out = dst_ptr + row * dst_stride;
+     const int16x4_t px = vld1_s16(src_s16 + row * src_stride);
+     const int32x4_t wide = vsubl_s16(px, offset_s16);
+     const uint16x4_t clipped =
+         vmin_u16(vqmovun_s32(vqrshlq_s32(wide, round_shift_s32)), max_lo);
+     if (w == 2) {
+       store_u16_2x1(out, clipped);
+     } else {
+       vst1_u16(out, clipped);
+     }
+   }
+ }
+
+ // High bit-depth scaled 2D convolution entry point.
+ //
+ // Runs the horizontal pass into a temporary block, then the vertical pass.
+ // For a compound prediction without averaging the vertical pass writes
+ // straight to conv_params->dst. Otherwise it writes to a second temporary
+ // block, which is then either averaged into `dst` (distance-weighted or
+ // plain, per conv_params->use_dist_wtd_comp_avg) or, for non-compound
+ // prediction, offset-corrected into `dst`.
+ //
+ // Returns without writing anything if either temporary block cannot be
+ // allocated.
+ void av1_highbd_convolve_2d_scale_neon(
+     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+     int h, const InterpFilterParams *filter_params_x,
+     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+     const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+     ConvolveParams *conv_params, int bd) {
+   uint16_t *im_block = (uint16_t *)aom_memalign(
+       16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+   if (im_block == NULL) return;
+
+   uint16_t *im_block2 = (uint16_t *)aom_memalign(
+       16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
+   if (im_block2 == NULL) {
+     // Do not leak the first block.
+     aom_free(im_block);
+     return;
+   }
+
+   const int im_stride = MAX_SB_SIZE;
+   // Number of intermediate rows the vertical pass will read.
+   const int last_row_qn = (h - 1) * y_step_qn + subpel_y_qn;
+   int im_h = (last_row_qn >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
+
+   const int round_0 = conv_params->round_0;
+   const int round_1 = conv_params->round_1;
+   const int bits = FILTER_BITS * 2 - round_0 - round_1;
+   assert(bits >= 0);
+
+   const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
+   const int y_offset_bits = bd + 2 * FILTER_BITS - round_0;
+   const int y_offset_correction =
+       (1 << (y_offset_bits - round_1)) + (1 << (y_offset_bits - round_1 - 1));
+
+   // Step back so the filter window is positioned around the block origin.
+   const int horiz_offset = filter_params_x->taps / 2 - 1;
+   const int vert_offset = filter_params_y->taps / 2 - 1;
+   const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+   CONV_BUF_TYPE *dst16 = conv_params->dst;
+   const int dst16_stride = conv_params->dst_stride;
+
+   highbd_convolve_2d_x_scale_8tap_neon(
+       src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
+       filter_params_x, conv_params, x_offset_bits);
+
+   if (conv_params->is_compound && !conv_params->do_average) {
+     highbd_convolve_2d_y_scale_8tap_neon(im_block, im_stride, dst16,
+                                          dst16_stride, w, h, subpel_y_qn,
+                                          y_step_qn, filter_params_y, round_1,
+                                          y_offset_bits);
+   } else {
+     highbd_convolve_2d_y_scale_8tap_neon(im_block, im_stride, im_block2,
+                                          im_stride, w, h, subpel_y_qn,
+                                          y_step_qn, filter_params_y, round_1,
+                                          y_offset_bits);
+   }
+
+   // The compound averaging is done here, outside the filtering loops, so
+   // that they stay free of branches.
+   if (!conv_params->is_compound) {
+     highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
+                                         w, h, bits, y_offset_correction, bd);
+   } else if (conv_params->do_average) {
+     if (conv_params->use_dist_wtd_comp_avg) {
+       highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+                                     h, conv_params, bits, y_offset_correction,
+                                     bd);
+     } else {
+       highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                            conv_params, bits, y_offset_correction, bd);
+     }
+   }
+
+   aom_free(im_block);
+   aom_free(im_block2);
+ }
diff --git a/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c
new file mode 100644
index 0000000000..84bc8fd963
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c
@@ -0,0 +1,5994 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#if AOM_ARCH_AARCH64  // 4x4 transpose of 32-bit lanes: vtrnq_s32 pairs, then 64-bit zip1/zip2 to assemble rows y0..y3
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
+ int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
+ y0 = vreinterpretq_s32_s64( \
+ vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
+ vreinterpretq_s64_s32(swap_high.val[0]))); \
+ y1 = vreinterpretq_s32_s64( \
+ vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
+ vreinterpretq_s64_s32(swap_high.val[1]))); \
+ y2 = vreinterpretq_s32_s64( \
+ vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
+ vreinterpretq_s64_s32(swap_high.val[0]))); \
+ y3 = vreinterpretq_s32_s64( \
+ vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
+ vreinterpretq_s64_s32(swap_high.val[1]))); \
+ } while (0)
+#else  // non-AArch64 variant: same vtrnq_s32 step, rows assembled with vextq_s32 instead of the 64-bit zips
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
+ int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
+ y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \
+ swap_high.val[0], 2); \
+ y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \
+ swap_high.val[1], 2); \
+ y2 = vextq_s32(swap_low.val[0], \
+ vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
+ y3 = vextq_s32(swap_low.val[1], \
+ vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
+ } while (0)
+#endif // AOM_ARCH_AARCH64
+
+ // Transposes the 4x4 block of 32-bit values held in in[0..3] into out[0..3].
+ static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
+   const int32x4_t row0 = in[0];
+   const int32x4_t row1 = in[1];
+   const int32x4_t row2 = in[2];
+   const int32x4_t row3 = in[3];
+   TRANSPOSE_4X4(row0, row1, row2, row3, out[0], out[1], out[2], out[3]);
+ }
+
+ // Transposes an 8x8 block of 32-bit values stored as sixteen vectors.
+ //
+ // The block is handled as four 4x4 quadrants. A quadrant's rows are the
+ // vectors at base, base + 2, base + 4 and base + 6. The quadrants based at 0
+ // and 9 keep their position; the ones based at 1 and 8 swap places.
+ static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
+   // { base index in `in`, base index in `out` }, in processing order.
+   static const int kQuadrant[4][2] = { { 0, 0 }, { 1, 8 }, { 8, 1 }, { 9, 9 } };
+   for (int q = 0; q < 4; ++q) {
+     const int32x4_t *src = in + kQuadrant[q][0];
+     int32x4_t *dst = out + kQuadrant[q][1];
+     TRANSPOSE_4X4(src[0], src[2], src[4], src[6], dst[0], dst[2], dst[4],
+                   dst[6]);
+   }
+ }
+
+ // Applies vrshlq_s32 with a shift count of -bit (a rounding right shift for
+ // positive `bit`) to each of the `size` vectors of `input`, storing the
+ // results in `output`.
+ static INLINE void round_shift_array_32_neon(int32x4_t *input,
+                                              int32x4_t *output, const int size,
+                                              const int bit) {
+   const int32x4_t neg_bit = vdupq_n_s32(-bit);
+   int i = 0;
+   while (i < size) {
+     output[i] = vrshlq_s32(input[i], neg_bit);
+     ++i;
+   }
+ }
+
+ // Multiplies each of the `size` vectors of `input` by NewInvSqrt2 and applies
+ // a rounding right shift by NewSqrt2Bits, storing the results in `output`.
+ static INLINE void round_shift_rect_array_32_neon(int32x4_t *input,
+                                                   int32x4_t *output,
+                                                   const int size) {
+   for (int i = 0; i < size; i++) {
+     output[i] = vrshrq_n_s32(vmulq_n_s32(input[i], NewInvSqrt2), NewSqrt2Bits);
+   }
+ }
+
+ // Returns (*w0 * *n0 + *w1 * *n1 + *rnding) shifted by *v_bit, per lane.
+ // The shift is a vshlq_s32, so a negative count in *v_bit shifts right.
+ static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
+                                         const int32_t *n1, const int32x4_t *w1,
+                                         const int32x4_t *v_bit,
+                                         const int32x4_t *rnding) {
+   const int32x4_t acc0 = vmlaq_n_s32(*rnding, *w0, *n0);
+   const int32x4_t acc1 = vmlaq_n_s32(acc0, *w1, *n1);
+   return vshlq_s32(acc1, *v_bit);
+ }
+
+ // Returns (-*w0 * *n0 - *w1 * *n1 + *rnding) shifted by *v_bit, per lane.
+ // Both scalar coefficients are negated before the multiply-accumulate.
+ static INLINE int32x4_t half_btf_neon_mode11_r(
+     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+   const int32x4_t acc0 = vmlaq_n_s32(*rnding, *w0, -*n0);
+   const int32x4_t acc1 = vmlaq_n_s32(acc0, *w1, -*n1);
+   return vshlq_s32(acc1, *v_bit);
+ }
+
+ // Returns (*w0 * *n0 - *w1 * *n1 + *rnding) shifted by *v_bit, per lane.
+ static INLINE int32x4_t half_btf_neon_mode01_r(
+     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+   const int32x4_t acc0 = vmlaq_n_s32(*rnding, *w0, *n0);
+   const int32x4_t acc1 = vmlsq_n_s32(acc0, *w1, *n1);
+   return vshlq_s32(acc1, *v_bit);
+ }
+
+ // Returns (*w1 * *n1 - *w0 * *n0 + *rnding) shifted by *v_bit, per lane.
+ // The second product is accumulated first, then the first is subtracted.
+ static INLINE int32x4_t half_btf_neon_mode10_r(
+     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
+     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
+   const int32x4_t acc1 = vmlaq_n_s32(*rnding, *w1, *n1);
+   const int32x4_t acc0 = vmlsq_n_s32(acc1, *w0, *n0);
+   return vshlq_s32(acc0, *v_bit);
+ }
+
+ // Single-term variant: returns (*w0 * *n0 + *rnding) shifted by *v_bit.
+ static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0,
+                                           const int32x4_t *w0,
+                                           const int32x4_t *v_bit,
+                                           const int32x4_t *rnding) {
+   return vshlq_s32(vmlaq_n_s32(*rnding, *w0, *n0), *v_bit);
+ }
+
+ // Single-term variant with a negated coefficient: returns
+ // (-*w0 * *n0 + *rnding) shifted by *v_bit.
+ static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
+                                             const int32x4_t *w0,
+                                             const int32x4_t *v_bit,
+                                             const int32x4_t *rnding) {
+   return vshlq_s32(vmlaq_n_s32(*rnding, *w0, -*n0), *v_bit);
+ }
+
+ // Copies the `size` vectors of `in` into `out` in reverse order.
+ static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
+   for (int src = 0, dst = size - 1; src < size; ++src, --dst) {
+     out[dst] = in[src];
+   }
+ }
+
+ typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
+ const int num_cols);  // function-pointer type; by its name, for forward 1-D transform routines
+
+ typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
+ int32_t do_cols, int32_t bd,
+ int32_t out_shift);  // function-pointer type; same parameter list as the 1-D kernels in this file (e.g. idct4x4_neon)
+
+ // Clamps the lanes of *u to [*min, *max]. The comparison is done on the
+ // lanes reinterpreted as signed 16-bit values, upper bound first.
+ static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
+                                           const uint16x8_t *max) {
+   const int16x8_t value = vreinterpretq_s16_u16(*u);
+   const int16x8_t upper = vreinterpretq_s16_u16(*max);
+   const int16x8_t lower = vreinterpretq_s16_u16(*min);
+   return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(value, upper), lower));
+ }
+
+ // Applies vrshlq_s32 with a count of -shift to in[0..3], in place.
+ // Does nothing when shift is 0.
+ static INLINE void round_shift_4x4(int32x4_t *in, int shift) {
+   if (shift == 0) return;
+
+   const int32x4_t neg_shift = vdupq_n_s32(-shift);
+   for (int i = 0; i < 4; ++i) {
+     in[i] = vrshlq_s32(in[i], neg_shift);
+   }
+ }
+
+ // Applies vrshlq_s32 with a count of -shift to in[0..15], in place.
+ // The caller must pass a non-zero shift (checked by assert).
+ static void round_shift_8x8(int32x4_t *in, int shift) {
+   assert(shift != 0);
+   const int32x4_t neg_shift = vdupq_n_s32(-shift);
+   for (int i = 0; i < 16; ++i) {
+     in[i] = vrshlq_s32(in[i], neg_shift);
+   }
+ }
+
+ // Clamps vectors of `in` to [*clamp_lo, *clamp_hi] and stores them in `out`.
+ // Vectors are processed in groups of four, so `size` is effectively rounded
+ // up to a multiple of 4.
+ static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
+                                   const int32x4_t *clamp_lo,
+                                   const int32x4_t *clamp_hi, int size) {
+   for (int base = 0; base < size; base += 4) {
+     for (int k = 0; k < 4; ++k) {
+       const int32x4_t floored = vmaxq_s32(in[base + k], *clamp_lo);
+       out[base + k] = vminq_s32(floored, *clamp_hi);
+     }
+   }
+ }
+
+ // Adds eight residuals (res0 for the low four lanes, res1 for the high four)
+ // to eight predicted pixels, clamps each sum to [0, (1 << bd) - 1] and
+ // returns the result narrowed to 16 bits.
+ static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
+                                                    int32x4_t res0,
+                                                    int32x4_t res1,
+                                                    const int bd) {
+   const int32x4_t lo_clip = vdupq_n_s32(0);
+   const int32x4_t hi_clip = vdupq_n_s32((1 << bd) - 1);
+
+   // The prediction lanes are widened as signed 16-bit values.
+   int32x4_t sum_lo =
+       vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred)));
+   int32x4_t sum_hi =
+       vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred)));
+
+   sum_lo = vminq_s32(vmaxq_s32(sum_lo, lo_clip), hi_clip);
+   sum_hi = vminq_s32(vmaxq_s32(sum_hi, lo_clip), hi_clip);
+
+   return vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum_lo)),
+                       vqmovn_u32(vreinterpretq_u32_s32(sum_hi)));
+ }
+
+ // Adds four residuals to four predicted pixels, narrows the sums to 16 bits
+ // (without saturation) and clamps them with highbd_clamp_u16 using bounds of
+ // 0 and (1 << bd) - 1.
+ static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
+                                                    int32x4_t res0,
+                                                    const int bd) {
+   const int32x4_t sum = vaddw_s16(res0, vreinterpret_s16_u16(pred));
+   const uint16x4_t narrow = vreinterpret_u16_s16(vmovn_s32(sum));
+   // The clamp helper works on 8 lanes, so duplicate the four values.
+   uint16x8_t doubled = vcombine_u16(narrow, narrow);
+   const uint16x8_t lower = vdupq_n_u16(0);
+   const uint16x8_t upper = vdupq_n_u16((1 << bd) - 1);
+   return vget_low_u16(highbd_clamp_u16(&doubled, &lower, &upper));
+ }
+
+ // Adds a 4-wide residual block to `output` in place.
+ // Row i of `output` (row pitch `stride`) is combined with in[i], or with
+ // in[height - 1 - i] when `flipud` is non-zero.
+ static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
+                                                 int stride, int flipud,
+                                                 int height, const int bd) {
+   for (int i = 0; i < height; ++i) {
+     const int src_row = flipud ? (height - 1 - i) : i;
+     uint16_t *row = output + i * stride;
+     const uint16x4_t pred = vld1_u16(row);
+     vst1_u16(row, highbd_get_recon_4xn_neon(pred, in[src_row], bd));
+   }
+ }
+
+ // Adds an 8-wide residual block to `output` in place.
+ // For source row j, the low four residuals come from in[j] and the high four
+ // from in[j + height]. Row i of `output` uses j = i, or j = height - 1 - i
+ // when `flipud` is non-zero.
+ static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
+                                                 int stride, int flipud,
+                                                 int height, const int bd) {
+   for (int i = 0; i < height; ++i) {
+     const int src_row = flipud ? (height - 1 - i) : i;
+     uint16_t *row = output + i * stride;
+     const uint16x8_t pred = vld1q_u16(row);
+     vst1q_u16(row, highbd_get_recon_8x8_neon(pred, in[src_row],
+                                              in[src_row + height], bd));
+   }
+ }
+
+ // Loads `out_size` vectors of four int32 values; vector i is read from
+ // in + i * stride.
+ static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                            int32x4_t *out, int out_size) {
+   int i = 0;
+   while (i < out_size) {
+     const int32_t *row = in + i * stride;
+     out[i] = vld1q_s32(row);
+     ++i;
+   }
+ }
+
+ // Loads sixteen consecutive int32 coefficients into in[0..3], four per
+ // vector.
+ static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
+   for (int row = 0; row < 4; ++row) {
+     in[row] = vld1q_s32(coeff + 4 * row);
+   }
+ }
+
+ // Butterfly add/subtract: *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1),
+ // with both results limited to [*clamp_lo, *clamp_hi].
+ static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
+                         int32x4_t *out0, int32x4_t *out1,
+                         const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
+   const int32x4_t sum = vaddq_s32(in0, in1);
+   const int32x4_t diff = vsubq_s32(in0, in1);
+
+   const int32x4_t sum_clamped = vminq_s32(vmaxq_s32(sum, *clamp_lo), *clamp_hi);
+   const int32x4_t diff_clamped =
+       vminq_s32(vmaxq_s32(diff, *clamp_lo), *clamp_hi);
+
+   *out0 = sum_clamped;
+   *out1 = diff_clamped;
+ }
+
+ // Applies vrshlq_s32 by *v_shift to *in0 and *in1, clamps both results to
+ // [*clamp_lo, *clamp_hi] and writes them back in place.
+ static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
+                                  const int32x4_t *clamp_lo,
+                                  const int32x4_t *clamp_hi,
+                                  const int32x4_t *v_shift) {
+   const int32x4_t shifted0 = vrshlq_s32(*in0, *v_shift);
+   const int32x4_t shifted1 = vrshlq_s32(*in1, *v_shift);
+
+   const int32x4_t clamped0 =
+       vminq_s32(vmaxq_s32(shifted0, *clamp_lo), *clamp_hi);
+   const int32x4_t clamped1 =
+       vminq_s32(vmaxq_s32(shifted1, *clamp_lo), *clamp_hi);
+
+   *in0 = clamped0;
+   *in1 = clamped1;
+ }
+
+ // Stage 4 of the 32-point inverse DCT: rotates the pairs (17, 30), (18, 29),
+ // (21, 26) and (22, 25) of bf1 in place.
+ //
+ // Both outputs of a rotation depend on both inputs, so the first result is
+ // held in a local until the second has been computed.
+ static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+   const int32x4_t rot17 = half_btf_neon_mode10_r(
+       &cospi[8], &bf1[17], &cospi[56], &bf1[30], v_bit, rnding);
+   bf1[30] =
+       half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
+   bf1[17] = rot17;
+
+   const int32x4_t rot18 = half_btf_neon_mode11_r(
+       &cospi[56], &bf1[18], &cospi[8], &bf1[29], v_bit, rnding);
+   bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+                                    v_bit, rnding);
+   bf1[18] = rot18;
+
+   const int32x4_t rot21 = half_btf_neon_mode10_r(
+       &cospi[40], &bf1[21], &cospi[24], &bf1[26], v_bit, rnding);
+   bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
+                             rnding);
+   bf1[21] = rot21;
+
+   const int32x4_t rot22 = half_btf_neon_mode11_r(
+       &cospi[24], &bf1[22], &cospi[40], &bf1[25], v_bit, rnding);
+   bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+                                    v_bit, rnding);
+   bf1[22] = rot22;
+ }
+
+ // Stage 5 of the 32-point inverse DCT: rotates the pairs (9, 14) and (10, 13)
+ // of bf1, then applies clamped add/sub butterflies within bf1[16..31].
+ static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+   const int32x4_t rot9 = half_btf_neon_mode10_r(
+       &cospi[16], &bf1[9], &cospi[48], &bf1[14], v_bit, rnding);
+   bf1[14] =
+       half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
+   bf1[9] = rot9;
+
+   const int32x4_t rot10 = half_btf_neon_mode11_r(
+       &cospi[48], &bf1[10], &cospi[16], &bf1[13], v_bit, rnding);
+   bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
+                                    v_bit, rnding);
+   bf1[10] = rot10;
+
+   // Eight butterflies on disjoint pairs: (16,19) (17,18) (23,20) (22,21)
+   // (24,27) (25,26) (31,28) (30,29). The first index of each pair receives
+   // the sum, the second the difference.
+   for (int i = 0; i < 2; ++i) {
+     addsub_neon(bf1[16 + i], bf1[19 - i], bf1 + 16 + i, bf1 + 19 - i, clamp_lo,
+                 clamp_hi);
+     addsub_neon(bf1[23 - i], bf1[20 + i], bf1 + 23 - i, bf1 + 20 + i, clamp_lo,
+                 clamp_hi);
+     addsub_neon(bf1[24 + i], bf1[27 - i], bf1 + 24 + i, bf1 + 27 - i, clamp_lo,
+                 clamp_hi);
+     addsub_neon(bf1[31 - i], bf1[28 + i], bf1 + 31 - i, bf1 + 28 + i, clamp_lo,
+                 clamp_hi);
+   }
+ }
+
+ // Stage 6 of the 32-point inverse DCT: rotates the pair (5, 6), applies
+ // clamped add/sub butterflies within bf1[8..15], then rotates the pairs
+ // (18, 29), (19, 28), (20, 27) and (21, 26).
+ static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+   const int32x4_t rot5 = half_btf_neon_mode10_r(
+       &cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
+   bf1[6] =
+       half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
+   bf1[5] = rot5;
+
+   // Butterflies on the disjoint pairs (8,11) (9,10) (15,12) (14,13).
+   for (int i = 0; i < 2; ++i) {
+     addsub_neon(bf1[8 + i], bf1[11 - i], bf1 + 8 + i, bf1 + 11 - i, clamp_lo,
+                 clamp_hi);
+     addsub_neon(bf1[15 - i], bf1[12 + i], bf1 + 15 - i, bf1 + 12 + i, clamp_lo,
+                 clamp_hi);
+   }
+
+   // Pairs (18,29) and (19,28) share one rotation.
+   for (int i = 0; i < 2; ++i) {
+     const int lo = 18 + i;
+     const int hi = 29 - i;
+     const int32x4_t rot_lo = half_btf_neon_mode10_r(
+         &cospi[16], &bf1[lo], &cospi[48], &bf1[hi], v_bit, rnding);
+     bf1[hi] = half_btf_neon_r(&cospi[48], &bf1[lo], &cospi[16], &bf1[hi], v_bit,
+                               rnding);
+     bf1[lo] = rot_lo;
+   }
+
+   // Pairs (20,27) and (21,26) share another.
+   for (int i = 0; i < 2; ++i) {
+     const int lo = 20 + i;
+     const int hi = 27 - i;
+     const int32x4_t rot_lo = half_btf_neon_mode11_r(
+         &cospi[48], &bf1[lo], &cospi[16], &bf1[hi], v_bit, rnding);
+     bf1[hi] = half_btf_neon_mode10_r(&cospi[16], &bf1[lo], &cospi[48], &bf1[hi],
+                                      v_bit, rnding);
+     bf1[lo] = rot_lo;
+   }
+ }
+
+ // Stage 7 of the 32-point inverse DCT: clamped add/sub butterflies on
+ // bf1[0..7], rotations of the pairs (10, 13) and (11, 12) by cospi[32], and
+ // clamped add/sub butterflies on bf1[16..31].
+ static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+   // (0,7) (1,6) (2,5) (3,4)
+   for (int i = 0; i < 4; ++i) {
+     addsub_neon(bf1[i], bf1[7 - i], bf1 + i, bf1 + 7 - i, clamp_lo, clamp_hi);
+   }
+
+   // (10,13) then (11,12)
+   for (int i = 0; i < 2; ++i) {
+     const int lo = 10 + i;
+     const int hi = 13 - i;
+     const int32x4_t rot_lo = half_btf_neon_mode10_r(
+         &cospi[32], &bf1[lo], &cospi[32], &bf1[hi], v_bit, rnding);
+     bf1[hi] = half_btf_neon_r(&cospi[32], &bf1[lo], &cospi[32], &bf1[hi], v_bit,
+                               rnding);
+     bf1[lo] = rot_lo;
+   }
+
+   // (16,23) (17,22) (18,21) (19,20)
+   for (int i = 0; i < 4; ++i) {
+     addsub_neon(bf1[16 + i], bf1[23 - i], bf1 + 16 + i, bf1 + 23 - i, clamp_lo,
+                 clamp_hi);
+   }
+   // (31,24) (30,25) (29,26) (28,27)
+   for (int i = 0; i < 4; ++i) {
+     addsub_neon(bf1[31 - i], bf1[24 + i], bf1 + 31 - i, bf1 + 24 + i, clamp_lo,
+                 clamp_hi);
+   }
+ }
+
+ // Stage 8 of the 32-point inverse DCT: clamped add/sub butterflies pairing
+ // bf1[i] with bf1[15 - i] for i = 0..7, then rotations of the pairs (20, 27),
+ // (21, 26), (22, 25) and (23, 24) by cospi[32].
+ static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+   for (int i = 0; i < 8; ++i) {
+     addsub_neon(bf1[i], bf1[15 - i], bf1 + i, bf1 + 15 - i, clamp_lo, clamp_hi);
+   }
+
+   for (int i = 0; i < 4; ++i) {
+     const int lo = 20 + i;
+     const int hi = 27 - i;
+     // Hold the first result until the second has read both inputs.
+     const int32x4_t rot_lo = half_btf_neon_mode10_r(
+         &cospi[32], &bf1[lo], &cospi[32], &bf1[hi], v_bit, rnding);
+     bf1[hi] = half_btf_neon_r(&cospi[32], &bf1[lo], &cospi[32], &bf1[hi], v_bit,
+                               rnding);
+     bf1[lo] = rot_lo;
+   }
+ }
+
+ // Stage 9 of the 32-point inverse DCT: clamped add/sub butterflies pairing
+ // bf1[i] with bf1[31 - i], written to out[i] (sum) and out[31 - i]
+ // (difference).
+ //
+ // When `do_cols` is zero, the 32 outputs are additionally round-shifted by
+ // `out_shift` and clamped to a signed range of AOMMAX(16, bd + 6) bits.
+ static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
+                                       const int do_cols, const int bd,
+                                       const int out_shift,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi) {
+   for (int i = 0; i < 16; ++i) {
+     addsub_neon(bf1[i], bf1[31 - i], out + i, out + 31 - i, clamp_lo, clamp_hi);
+   }
+
+   if (do_cols) return;
+
+   const int log_range_out = AOMMAX(16, bd + 6);
+   const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+   const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+   for (int i = 0; i < 32; i += 4) {
+     round_shift_4x4(out + i, out_shift);
+   }
+   highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+
+// Produces a rounded, shifted and clamped pair where the second output is
+// negated:
+//   *out0 = clamp((*offset + *in0) << *v_shift)
+//   *out1 = clamp((*offset - *in1) << *v_shift)
+// vshlq_s32 with a negative per-lane count shifts right, which is how the
+// callers in this file use it (they pass -out_shift). Both inputs and all
+// parameters are read before either output is stored.
+static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
+                           int32x4_t *out0, int32x4_t *out1,
+                           const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
+                           const int32x4_t *v_shift, int32x4_t *offset) {
+  const int32x4_t kept = vshlq_s32(vaddq_s32(*offset, *in0), *v_shift);
+  const int32x4_t negated = vshlq_s32(vsubq_s32(*offset, *in1), *v_shift);
+
+  const int32x4_t kept_clamped =
+      vminq_s32(vmaxq_s32(kept, *clamp_lo), *clamp_hi);
+  const int32x4_t negated_clamped =
+      vminq_s32(vmaxq_s32(negated, *clamp_lo), *clamp_hi);
+
+  *out0 = kept_clamped;
+  *out1 = negated_clamped;
+}
+
+// 4-point inverse DCT applied to four int32x4_t vectors (in[0..3]), writing
+// out[0..3]. `bit` selects the cosine table (cospi_arr) and the rounding
+// shift applied after every multiply stage. Intermediate butterflies are
+// clamped to max(16, bd + 8) bits when do_cols is zero and max(16, bd + 6)
+// bits otherwise. When do_cols is zero the outputs are finally passed through
+// shift_and_clamp_neon with -out_shift and a max(16, bd + 6)-bit range.
+static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int stage_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t stage_min = vdupq_n_s32(-(1 << (stage_bits - 1)));
+  const int32x4_t stage_max = vdupq_n_s32((1 << (stage_bits - 1)) - 1);
+  const int32x4_t half = vdupq_n_s32(1 << (bit - 1));
+  // Negative count: vshlq_s32 then shifts right by `bit`.
+  const int32x4_t neg_bit = vdupq_n_s32(-bit);
+
+  // Stage 0-1-2: capture all inputs first so in == out is safe.
+  const int32x4_t a0 = in[0];
+  const int32x4_t a1 = in[1];
+  const int32x4_t a2 = in[2];
+  const int32x4_t a3 = in[3];
+
+  // Even part: (a0 +/- a2) * cospi[32], rounding folded into the first term.
+  const int32x4_t even_lhs = vmlaq_n_s32(half, a0, cospi[32]);
+  const int32x4_t even_rhs = vmulq_n_s32(a2, cospi[32]);
+  const int32x4_t b0 = vshlq_s32(vaddq_s32(even_lhs, even_rhs), neg_bit);
+  const int32x4_t b1 = vshlq_s32(vsubq_s32(even_lhs, even_rhs), neg_bit);
+
+  // Odd part: rotation of (a1, a3) by the cospi[48] / cospi[16] pair.
+  const int32x4_t odd_diff =
+      vmlsq_n_s32(vmlaq_n_s32(half, a1, cospi[48]), a3, cospi[16]);
+  const int32x4_t odd_sum =
+      vmlaq_n_s32(vmlaq_n_s32(half, a1, cospi[16]), a3, cospi[48]);
+  const int32x4_t b2 = vshlq_s32(odd_diff, neg_bit);
+  const int32x4_t b3 = vshlq_s32(odd_sum, neg_bit);
+
+  // Stage 3
+  addsub_neon(b0, b3, out + 0, out + 3, &stage_min, &stage_max);
+  addsub_neon(b1, b2, out + 1, out + 2, &stage_min, &stage_max);
+
+  if (do_cols) return;
+
+  const int out_bits = AOMMAX(16, bd + 6);
+  int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1)));
+  int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1);
+  const int32x4_t neg_out_shift = vdupq_n_s32(-out_shift);
+  shift_and_clamp_neon(out + 0, out + 3, &out_min, &out_max, &neg_out_shift);
+  shift_and_clamp_neon(out + 1, out + 2, &out_min, &out_max, &neg_out_shift);
+}
+
+// Rounding step shared by the four outputs of iadst4x4_neon.
+//
+// For each 32-bit lane of `u` this computes, with a 64-bit intermediate,
+//   (u * 16 + rnding) >> 16
+// and returns the low 32 bits of each result in lane order. Lanes 0/2 and
+// lanes 1/3 are widened separately (vmull_s32 only takes two lanes), the
+// fixed 16-bit right shift is done by sliding the whole 128-bit register down
+// by one 16-bit element (vextq_s16 by 1), and the two halves are interleaved
+// back into lane order at the end.
+//
+// Because the shift distance is fixed at 16 and the multiplier at 16, this
+// equals a rounding right shift by 12; it matches `bit` only when the caller
+// built `rnding` with bit == 12.
+// NOTE(review): all callers in view pass INV_COS_BIT; confirm it is 12.
+static int32x4_t iadst4x4_round_shift_neon(int32x4_t u,
+                                           const int64x2_t rnding) {
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x2_t mul = vdup_n_s32(1 << 4);
+  int32x4x2_t ux;
+
+  // Lanes 0 and 2 (vmovn_s64 keeps the low 32 bits of each 64-bit half).
+  ux.val[0] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u)), mul));
+  ux.val[0] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(ux.val[0]), rnding));
+
+  // Lanes 1 and 3: rotate them into positions 0 and 2 first.
+  u = vextq_s32(u, zero, 1);
+  ux.val[1] = vreinterpretq_s32_s64(
+      vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u)), mul));
+  ux.val[1] = vreinterpretq_s32_s64(
+      vaddq_s64(vreinterpretq_s64_s32(ux.val[1]), rnding));
+
+  // Shift each 64-bit value right by 16 bits (only the low 32 bits of each
+  // 64-bit half are used afterwards).
+  ux.val[0] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(ux.val[0]), vreinterpretq_s16_s32(zero), 1));
+  ux.val[1] = vreinterpretq_s32_s16(vextq_s16(
+      vreinterpretq_s16_s32(ux.val[1]), vreinterpretq_s16_s32(zero), 1));
+
+  // Re-interleave so the result is in lane order 0, 1, 2, 3.
+  ux = vzipq_s32(ux.val[0], ux.val[1]);
+#if AOM_ARCH_AARCH64
+  return vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(ux.val[0]),
+                                          vreinterpretq_s64_s32(ux.val[1])));
+#else
+  return vcombine_s32(vget_low_s32(ux.val[0]), vget_low_s32(ux.val[1]));
+#endif  // AOM_ARCH_AARCH64
+}
+
+// 4-point inverse ADST applied to four int32x4_t vectors (in[0..3]), writing
+// out[0..3]. Products with the sinpi_arr(bit) table are accumulated in 32
+// bits; the final rounding shift is done by iadst4x4_round_shift_neon. When
+// do_cols is zero the outputs are additionally round-shifted by out_shift and
+// clamped to a signed range of max(16, bd + 6) bits. All inputs are read
+// before any output is written, so in and out may be the same array.
+static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  // Rounding constant pre-scaled by 16 to match the helper's multiplier.
+  const int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
+  int32x4_t t;
+  int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int32x4_t x0, x1, x2, x3;
+  int32x4_t u0, u1, u2, u3;
+
+  x0 = in[0];
+  x1 = in[1];
+  x2 = in[2];
+  x3 = in[3];
+
+  s0 = vmulq_n_s32(x0, sinpi[1]);
+  s1 = vmulq_n_s32(x0, sinpi[2]);
+  s2 = vmulq_n_s32(x1, sinpi[3]);
+  s3 = vmulq_n_s32(x2, sinpi[4]);
+  s4 = vmulq_n_s32(x2, sinpi[1]);
+  s5 = vmulq_n_s32(x3, sinpi[2]);
+  s6 = vmulq_n_s32(x3, sinpi[4]);
+  t = vsubq_s32(x0, x2);
+  s7 = vaddq_s32(t, x3);
+
+  t = vaddq_s32(s0, s3);
+  s0 = vaddq_s32(t, s5);
+  t = vsubq_s32(s1, s4);
+  s1 = vsubq_s32(t, s6);
+  s3 = s2;
+  s2 = vmulq_n_s32(s7, sinpi[3]);
+
+  u0 = vaddq_s32(s0, s3);
+  u1 = vaddq_s32(s1, s3);
+  u2 = s2;
+  t = vaddq_s32(s0, s1);
+  u3 = vsubq_s32(t, s3);
+
+  out[0] = iadst4x4_round_shift_neon(u0, rnding);
+  out[1] = iadst4x4_round_shift_neon(u1, rnding);
+  out[2] = iadst4x4_round_shift_neon(u2, rnding);
+  out[3] = iadst4x4_round_shift_neon(u3, rnding);
+
+  if (!do_cols) {
+    const int log_range = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+    const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+    round_shift_4x4(out, out_shift);
+    highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
+  }
+}
+
+// Round-shifts the 4x4 residual in `in` by `shift` (in place), optionally
+// mirrors it left-right and/or top-bottom, adds it to the four rows of 16-bit
+// samples already stored at `output` (row pitch `stride`, in elements),
+// narrows with unsigned saturation, applies highbd_clamp_u16 with the bounds
+// 0 and (1 << bd) - 1, and stores the rows back.
+static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint16x4_t pred[4];
+  uint32x4_t sum[4];
+
+  round_shift_4x4(in, shift);
+
+  for (int r = 0; r < 4; ++r) {
+    pred[r] = vld1_u16(output + r * stride);
+  }
+
+  if (fliplr) {
+    // Reverse the four lanes of every row: swap within each 64-bit half,
+    // then swap the two halves.
+    for (int r = 0; r < 4; ++r) {
+      const uint32x4_t swapped = vrev64q_u32(vreinterpretq_u32_s32(in[r]));
+      in[r] = vreinterpretq_s32_u32(vextq_u32(swapped, swapped, 2));
+    }
+  }
+
+  // With flipud the residual rows are consumed bottom-to-top.
+  for (int r = 0; r < 4; ++r) {
+    const int src_row = flipud ? 3 - r : r;
+    sum[r] = vaddw_u16(vreinterpretq_u32_s32(in[src_row]), pred[r]);
+  }
+
+  uint16x8_t rows01 = vcombine_u16(vqmovn_u32(sum[0]), vqmovn_u32(sum[1]));
+  uint16x8_t rows23 = vcombine_u16(vqmovn_u32(sum[2]), vqmovn_u32(sum[3]));
+  const uint16x8_t pixel_min = vdupq_n_u16(0);
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  rows01 = highbd_clamp_u16(&rows01, &pixel_min, &pixel_max);
+  rows23 = highbd_clamp_u16(&rows23, &pixel_min, &pixel_max);
+
+  vst1_u16(output + 0 * stride, vget_low_u16(rows01));
+  vst1_u16(output + 1 * stride, vget_high_u16(rows01));
+  vst1_u16(output + 2 * stride, vget_low_u16(rows23));
+  vst1_u16(output + 3 * stride, vget_high_u16(rows23));
+}
+
+// 4-point identity transform: every lane of in[0..3] is scaled by
+// NewSqrt2 / 2^NewSqrt2Bits with rounding, using 64-bit intermediates, and
+// written to out[0..3]. `bit` is unused. When do_cols is zero the result is
+// additionally round-shifted by out_shift and clamped to a signed range of
+// max(16, bd + 6) bits.
+static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+  const int32x4_t zeros = vdupq_n_s32(0);
+  const int32x2_t scale = vdup_n_s32(NewSqrt2);
+  const int64x2_t half = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+
+  for (int row = 0; row < 4; ++row) {
+    const int32x4_t src = in[row];
+
+    // vmovn_s64 keeps the low 32 bits of each 64-bit half, i.e. lanes 0 and
+    // 2; rotating by one lane first exposes lanes 1 and 3 the same way.
+    const int32x2_t lanes02 = vmovn_s64(vreinterpretq_s64_s32(src));
+    const int32x2_t lanes13 =
+        vmovn_s64(vreinterpretq_s64_s32(vextq_s32(src, zeros, 1)));
+
+    // Widening multiply-accumulate onto the rounding term, then shift.
+    const int64x2_t wide02 =
+        vshrq_n_s64(vmlal_s32(half, lanes02, scale), NewSqrt2Bits);
+    const int64x2_t wide13 =
+        vshrq_n_s64(vmlal_s32(half, lanes13, scale), NewSqrt2Bits);
+
+    // Interleave so that the low words end up in lane order 0, 1, 2, 3.
+    const int32x4x2_t mixed = vzipq_s32(vreinterpretq_s32_s64(wide02),
+                                        vreinterpretq_s32_s64(wide13));
+#if AOM_ARCH_AARCH64
+    out[row] = vreinterpretq_s32_s64(
+        vzip1q_s64(vreinterpretq_s64_s32(mixed.val[0]),
+                   vreinterpretq_s64_s32(mixed.val[1])));
+#else
+    out[row] =
+        vextq_s32(vextq_s32(mixed.val[0], mixed.val[0], 2), mixed.val[1], 2);
+#endif
+  }
+
+  if (do_cols) return;
+
+  const int out_bits = AOMMAX(16, bd + 6);
+  const int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1)));
+  const int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1);
+  round_shift_4x4(out, out_shift);
+  highbd_clamp_s32_neon(out, out, &out_min, &out_max, 4);
+}
+
+// 4x4 high-bitdepth inverse 2-D transform: loads 16 coefficients from
+// `input`, runs a first 1-D pass (do_cols = 0), transposes, runs a second 1-D
+// pass (do_cols = 1), and adds the result to the samples at `output`
+// (row pitch `stride`) via write_buffer_4x4 with a final shift of -shift[1].
+// The 1-D kernel used in each pass and the left-right / up-down flips are
+// selected by `tx_type`. An unsupported tx_type asserts and writes nothing.
+void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t buf[4];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
+
+  // Types whose output is mirrored left-right.
+  const int fliplr = tx_type == DCT_FLIPADST ||
+                     tx_type == FLIPADST_FLIPADST ||
+                     tx_type == ADST_FLIPADST || tx_type == H_FLIPADST;
+  // Types whose output is mirrored top-bottom.
+  const int flipud = tx_type == FLIPADST_DCT ||
+                     tx_type == FLIPADST_FLIPADST ||
+                     tx_type == FLIPADST_ADST || tx_type == V_FLIPADST;
+
+  load_buffer_4x4(input, buf);
+
+  // First pass.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT: idct4x4_neon(buf, buf, INV_COS_BIT, 0, bd, 0); break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case H_ADST:
+    case H_FLIPADST: iadst4x4_neon(buf, buf, INV_COS_BIT, 0, bd, 0); break;
+    case IDTX:
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST: iidentity4_neon(buf, buf, INV_COS_BIT, 0, bd, 0); break;
+    default: assert(0); return;
+  }
+
+  transpose_4x4(buf, buf);
+
+  // Second pass.
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT: idct4x4_neon(buf, buf, INV_COS_BIT, 1, bd, 0); break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_ADST:
+    case V_FLIPADST: iadst4x4_neon(buf, buf, INV_COS_BIT, 1, bd, 0); break;
+    case IDTX:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST: iidentity4_neon(buf, buf, INV_COS_BIT, 1, bd, 0); break;
+    default: break;  // Unreachable: rejected by the first switch.
+  }
+
+  write_buffer_4x4(buf, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// 8x8
+// Loads 64 consecutive int32 coefficients from `coeff` into sixteen
+// four-lane vectors: in[i] receives coeff[4 * i .. 4 * i + 3].
+static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
+  for (int i = 0; i < 16; ++i) {
+    in[i] = vld1q_s32(coeff + 4 * i);
+  }
+}
+
+// 8-point inverse DCT over an 8x8 block held in sixteen int32x4_t vectors.
+// Row r of the block occupies in[2 * r] and in[2 * r + 1]; the two halves are
+// processed independently, one per loop iteration. `bit` selects the cosine
+// table and the rounding shift after each multiply stage. Butterflies are
+// clamped to max(16, bd + 8) bits when do_cols is zero and max(16, bd + 6)
+// bits otherwise. When do_cols is zero the sixteen outputs are finally
+// round-shifted by out_shift and clamped to max(16, bd + 6) bits.
+static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int stage_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t stage_min = vdupq_n_s32(-(1 << (stage_bits - 1)));
+  const int32x4_t stage_max = vdupq_n_s32((1 << (stage_bits - 1)) - 1);
+  const int32x4_t half = vdupq_n_s32(1 << (bit - 1));
+  // Negative count: vshlq_s32 then shifts right by `bit`.
+  const int32x4_t neg_bit = vdupq_n_s32(-bit);
+
+  for (int side = 0; side < 2; ++side) {
+    const int32x4_t *src = in + side;  // src[2 * r] is row r of this half
+    int32x4_t *dst = out + side;
+    int32x4_t a[8], b[8];
+    int32x4_t lhs, rhs;
+
+    // stage 0-1-2: input permutation plus the odd-row rotations.
+    a[0] = src[0 * 2];
+    a[1] = src[4 * 2];
+    a[2] = src[2 * 2];
+    a[3] = src[6 * 2];
+
+    a[4] = vmlsq_n_s32(vmlaq_n_s32(half, src[1 * 2], cospi[56]), src[7 * 2],
+                       cospi[8]);
+    a[4] = vshlq_s32(a[4], neg_bit);
+
+    a[7] = vmlaq_n_s32(vmlaq_n_s32(half, src[1 * 2], cospi[8]), src[7 * 2],
+                       cospi[56]);
+    a[7] = vshlq_s32(a[7], neg_bit);
+
+    a[5] = vmlsq_n_s32(vmlaq_n_s32(half, src[5 * 2], cospi[24]), src[3 * 2],
+                       cospi[40]);
+    a[5] = vshlq_s32(a[5], neg_bit);
+
+    a[6] = vmlaq_n_s32(vmlaq_n_s32(half, src[5 * 2], cospi[40]), src[3 * 2],
+                       cospi[24]);
+    a[6] = vshlq_s32(a[6], neg_bit);
+
+    // stage 3
+    lhs = vmlaq_n_s32(half, a[0], cospi[32]);
+    rhs = vmulq_n_s32(a[1], cospi[32]);
+    b[0] = vshlq_s32(vaddq_s32(lhs, rhs), neg_bit);
+    b[1] = vshlq_s32(vsubq_s32(lhs, rhs), neg_bit);
+
+    b[2] = vmlsq_n_s32(vmlaq_n_s32(half, a[2], cospi[48]), a[3], cospi[16]);
+    b[2] = vshlq_s32(b[2], neg_bit);
+
+    b[3] = vmlaq_n_s32(vmlaq_n_s32(half, a[2], cospi[16]), a[3], cospi[48]);
+    b[3] = vshlq_s32(b[3], neg_bit);
+
+    addsub_neon(a[4], a[5], &b[4], &b[5], &stage_min, &stage_max);
+    addsub_neon(a[7], a[6], &b[7], &b[6], &stage_min, &stage_max);
+
+    // stage 4
+    addsub_neon(b[0], b[3], &a[0], &a[3], &stage_min, &stage_max);
+    addsub_neon(b[1], b[2], &a[1], &a[2], &stage_min, &stage_max);
+    a[4] = b[4];
+    a[7] = b[7];
+
+    lhs = vmlaq_n_s32(half, b[6], cospi[32]);
+    rhs = vmulq_n_s32(b[5], cospi[32]);
+    a[6] = vshlq_s32(vaddq_s32(lhs, rhs), neg_bit);
+    a[5] = vshlq_s32(vsubq_s32(lhs, rhs), neg_bit);
+
+    // stage 5: output k is paired with its mirror 7 - k.
+    for (int k = 0; k < 4; ++k) {
+      addsub_neon(a[k], a[7 - k], dst + k * 2, dst + (7 - k) * 2, &stage_min,
+                  &stage_max);
+    }
+  }
+
+  if (do_cols) return;
+
+  const int out_bits = AOMMAX(16, bd + 6);
+  const int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1)));
+  const int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1);
+  round_shift_8x8(out, out_shift);
+  highbd_clamp_s32_neon(out, out, &out_min, &out_max, 16);
+}
+
+// One four-lane-wide half of the 8-point inverse ADST.
+//
+// `in` and `out` point at the first vector of the half being processed and
+// are indexed with a stride of two vectors, i.e. row r is in[2 * r] and
+// out[2 * r]. Butterflies are clamped to max(16, bd + 8) bits when do_cols is
+// zero and max(16, bd + 6) bits otherwise. When do_cols is non-zero the final
+// stage only applies the sign pattern; otherwise neg_shift_neon also rounds,
+// shifts right by out_shift and clamps to max(16, bd + 6) bits.
+// All eight inputs are read before any output is written.
+static void iadst8x8_half_neon(const int32x4_t *in, int32x4_t *out,
+                               const int32_t *cospi, int bit, int do_cols,
+                               int bd, int out_shift) {
+  const int32x4_t kZero = vdupq_n_s32(0);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[8], v[8], x;
+  // Negative count: vshlq_s32 then shifts right by `bit`.
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1-2
+  // (1) rows 7 and 0
+  u[0] = vmlaq_n_s32(rnding, in[7 * 2], cospi[4]);
+  u[0] = vmlaq_n_s32(u[0], in[0 * 2], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlaq_n_s32(rnding, in[7 * 2], cospi[60]);
+  u[1] = vmlsq_n_s32(u[1], in[0 * 2], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // (2) rows 5 and 2
+  u[2] = vmlaq_n_s32(rnding, in[5 * 2], cospi[20]);
+  u[2] = vmlaq_n_s32(u[2], in[2 * 2], cospi[44]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlaq_n_s32(rnding, in[5 * 2], cospi[44]);
+  u[3] = vmlsq_n_s32(u[3], in[2 * 2], cospi[20]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  // (3) rows 3 and 4
+  u[4] = vmlaq_n_s32(rnding, in[3 * 2], cospi[36]);
+  u[4] = vmlaq_n_s32(u[4], in[4 * 2], cospi[28]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, in[3 * 2], cospi[28]);
+  u[5] = vmlsq_n_s32(u[5], in[4 * 2], cospi[36]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // (4) rows 1 and 6
+  u[6] = vmlaq_n_s32(rnding, in[1 * 2], cospi[52]);
+  u[6] = vmlaq_n_s32(u[6], in[6 * 2], cospi[12]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, in[1 * 2], cospi[12]);
+  u[7] = vmlsq_n_s32(u[7], in[6 * 2], cospi[52]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+  u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
+  u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
+  u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  // v[0] is reused as scratch here; its value was already copied to u[0].
+  v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+  x = vmulq_n_s32(v[3], cospi[32]);
+  u[2] = vaddq_s32(v[0], x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(v[0], x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+  x = vmulq_n_s32(v[7], cospi[32]);
+  u[6] = vaddq_s32(v[0], x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(v[0], x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7: output permutation with alternating sign.
+  if (do_cols) {
+    out[0 * 2] = u[0];
+    out[1 * 2] = vsubq_s32(kZero, u[4]);
+    out[2 * 2] = u[6];
+    out[3 * 2] = vsubq_s32(kZero, u[2]);
+    out[4 * 2] = u[3];
+    out[5 * 2] = vsubq_s32(kZero, u[7]);
+    out[6 * 2] = u[5];
+    out[7 * 2] = vsubq_s32(kZero, u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0 * 2, out + 1 * 2, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2 * 2, out + 3 * 2, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4 * 2, out + 5 * 2, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6 * 2, out + 7 * 2, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+// 8-point inverse ADST over an 8x8 block held in sixteen int32x4_t vectors.
+// Row r of the block occupies in[2 * r] (even vectors) and in[2 * r + 1]
+// (odd vectors); the two halves are independent and are handled by the same
+// helper, first the even vectors and then the odd ones.
+static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+
+  // Even 8 points: 0, 2, ..., 14
+  iadst8x8_half_neon(in + 0, out + 0, cospi, bit, do_cols, bd, out_shift);
+  // Odd 8 points: 1, 3, ..., 15
+  iadst8x8_half_neon(in + 1, out + 1, cospi, bit, do_cols, bd, out_shift);
+}
+
+// 8-point identity transform on eight int32x4_t vectors: every lane is
+// doubled (in[i] + in[i]). `bit` is unused. When do_cols is zero the eight
+// outputs are additionally round-shifted by out_shift and clamped to a signed
+// range of max(16, bd + 6) bits.
+static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  (void)bit;
+
+  for (int i = 0; i < 8; ++i) {
+    out[i] = vaddq_s32(in[i], in[i]);
+  }
+
+  if (do_cols) return;
+
+  const int out_bits = AOMMAX(16, bd + 6);
+  const int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1)));
+  const int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1);
+  round_shift_4x4(out, out_shift);
+  round_shift_4x4(out + 4, out_shift);
+  highbd_clamp_s32_neon(out, out, &out_min, &out_max, 8);
+}
+
+// Reconstructs one row of eight 16-bit samples: adds the 32-bit residuals
+// (res_lo = first four, res_hi = last four) to `pred`, narrows with unsigned
+// saturation, and applies highbd_clamp_u16 with the bounds 0 and
+// (1 << bd) - 1. With fliplr set the eight residuals are used in reverse
+// order.
+static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
+                                int32x4_t res_hi, int fliplr, int bd) {
+  if (fliplr) {
+    // Reverse each vector's four lanes, then exchange the two vectors.
+    const int32x4_t lo_pairs = vrev64q_s32(res_lo);
+    const int32x4_t hi_pairs = vrev64q_s32(res_hi);
+    res_lo = vextq_s32(hi_pairs, hi_pairs, 2);
+    res_hi = vextq_s32(lo_pairs, lo_pairs, 2);
+  }
+
+  const int16x4_t pred_lo = vreinterpret_s16_u16(vget_low_u16(pred));
+  const int16x4_t pred_hi = vreinterpret_s16_u16(vget_high_u16(pred));
+  const uint32x4_t sum_lo = vreinterpretq_u32_s32(vaddw_s16(res_lo, pred_lo));
+  const uint32x4_t sum_hi = vreinterpretq_u32_s32(vaddw_s16(res_hi, pred_hi));
+
+  uint16x8_t recon = vcombine_u16(vqmovn_u32(sum_lo), vqmovn_u32(sum_hi));
+  const uint16x8_t pixel_min = vdupq_n_u16(0);
+  const uint16x8_t pixel_max = vdupq_n_u16((1 << bd) - 1);
+  return highbd_clamp_u16(&recon, &pixel_min, &pixel_max);
+}
+
+// Round-shifts the 8x8 residual in `in` by `shift` (in place) and adds it to
+// the eight rows of 16-bit samples stored at `output` (row pitch `stride`, in
+// elements). Residual row r is the vector pair in[2 * r], in[2 * r + 1]; with
+// flipud the rows are consumed bottom-to-top, and fliplr is forwarded to
+// get_recon_8x8. Every row is loaded before any row is stored.
+static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  uint16x8_t pred[8];
+  uint16x8_t recon[8];
+
+  round_shift_8x8(in, shift);
+
+  for (int r = 0; r < 8; ++r) {
+    pred[r] = vld1q_u16(output + r * stride);
+  }
+
+  for (int r = 0; r < 8; ++r) {
+    const int src_row = flipud ? 7 - r : r;
+    recon[r] = get_recon_8x8(pred[r], in[2 * src_row], in[2 * src_row + 1],
+                             fliplr, bd);
+  }
+
+  for (int r = 0; r < 8; ++r) {
+    vst1q_u16(output + r * stride, recon[r]);
+  }
+}
+
+// 8x8 high-bitdepth inverse 2-D transform for the DCT / ADST / FLIPADST
+// combinations: loads 64 coefficients from `input`, runs a first 1-D pass
+// (do_cols = 0, output shift -shift[0]), transposes, runs a second 1-D pass
+// (do_cols = 1), and adds the result to the samples at `output`
+// (row pitch `stride`) via write_buffer_8x8 with a final shift of -shift[1].
+// Any other tx_type asserts and writes nothing.
+void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  int32x4_t in[16], out[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
+
+  // Types whose output is mirrored left-right.
+  const int fliplr = tx_type == DCT_FLIPADST || tx_type == ADST_FLIPADST ||
+                     tx_type == FLIPADST_FLIPADST;
+  // Types whose output is mirrored top-bottom.
+  const int flipud = tx_type == FLIPADST_DCT ||
+                     tx_type == FLIPADST_FLIPADST || tx_type == FLIPADST_ADST;
+
+  load_buffer_8x8(input, in);
+
+  // First pass.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+      idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+      break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_ADST:
+      iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+      break;
+    default: assert(0); return;
+  }
+
+  transpose_8x8(out, in);
+
+  // Second pass.
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST: idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_ADST: iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); break;
+    default: break;  // Unreachable: rejected by the first switch.
+  }
+
+  write_buffer_8x8(out, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// 8-point inverse DCT for the case where only in[0] is used: the single
+// rounded product in[0] * cospi[32] is replicated to out[0..7].
+//   bit       - cosine precision; products are rounded and shifted right by it
+//   do_cols   - when 0, the value is additionally rounded/shifted by out_shift
+//   bd        - bit depth, used to derive the clamp range
+//   out_shift - extra right shift applied only when do_cols is 0
+static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                              int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-1-2-3
+  // The rounding constant must be added BEFORE the right shift (vshlq_s32 with
+  // a negative count), exactly as idct16x16_low1_neon does; adding it after
+  // the shift would offset the result by 1 << (bit - 1).
+  x = vmlaq_n_s32(rnding, in[0], cospi[32]);
+  x = vshlq_s32(x, v_bit);
+
+  // stage 4-5
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+
+    const int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    x = vaddq_s32(x, offset);
+    x = vshlq_s32(x, vdupq_n_s32(-out_shift));
+  }
+
+  x = vmaxq_s32(x, clamp_lo);
+  x = vminq_s32(x, clamp_hi);
+
+  // Every output row carries the same value.
+  for (int i = 0; i < 8; ++i) {
+    out[i] = x;
+  }
+}
+
+// 8-point inverse DCT over eight int32x4_t vectors (in[0..7] -> out[0..7]).
+// Each cosine product is rounded with 1 << (bit - 1) and shifted right by
+// `bit` (vshlq_s32 with a negative count). Add/sub butterflies go through
+// addsub_neon(), which is handed the [clamp_lo, clamp_hi] bounds derived from
+// bd and do_cols. When do_cols is 0 the outputs are additionally passed
+// through round_shift_4x4() with out_shift and clamped to the
+// AOMMAX(16, bd + 6)-bit signed range.
+static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+ int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+ int32x4_t x, y;
+ // Negative shift count: vshlq_s32(v, v_bit) shifts right by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // Even-indexed inputs are only reordered here; odd-indexed inputs are
+ // rotated in pairs (1,7) and (5,3).
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = vmlaq_n_s32(rnding, in[1], cospi[56]);
+ u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
+ u4 = vshlq_s32(u4, v_bit);
+
+ x = vmlaq_n_s32(rnding, in[1], cospi[8]);
+ u7 = vmlaq_n_s32(x, in[7], cospi[56]);
+ u7 = vshlq_s32(u7, v_bit);
+
+ x = vmlaq_n_s32(rnding, in[5], cospi[24]);
+ u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
+ u5 = vshlq_s32(u5, v_bit);
+
+ x = vmlaq_n_s32(rnding, in[5], cospi[40]);
+ u6 = vmlaq_n_s32(x, in[3], cospi[24]);
+ u6 = vshlq_s32(u6, v_bit);
+
+ // stage 3
+ // x already contains the rounding term, so both x + y and x - y are rounded.
+ x = vmlaq_n_s32(rnding, u0, cospi[32]);
+ y = vmulq_n_s32(u1, cospi[32]);
+ v0 = vaddq_s32(x, y);
+ v0 = vshlq_s32(v0, v_bit);
+
+ v1 = vsubq_s32(x, y);
+ v1 = vshlq_s32(v1, v_bit);
+
+ x = vmlaq_n_s32(rnding, u2, cospi[48]);
+ v2 = vmlaq_n_s32(x, u3, -cospi[16]);
+ v2 = vshlq_s32(v2, v_bit);
+
+ x = vmlaq_n_s32(rnding, u2, cospi[16]);
+ v3 = vmlaq_n_s32(x, u3, cospi[48]);
+ v3 = vshlq_s32(v3, v_bit);
+
+ addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ // y carries the rounding term: u6 = (v6 + v5) * cospi[32],
+ // u5 = (v6 - v5) * cospi[32], both rounded and shifted.
+ x = vmulq_n_s32(v5, cospi[32]);
+ y = vmlaq_n_s32(rnding, v6, cospi[32]);
+ u6 = vaddq_s32(y, x);
+ u6 = vshlq_s32(u6, v_bit);
+
+ u5 = vsubq_s32(y, x);
+ u5 = vshlq_s32(u5, v_bit);
+
+ // stage 5
+ addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ // Two calls cover out[0..3] and out[4..7].
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+ }
+}
+
+// 8-point inverse ADST for the case where only in[0] is used. Mirrors the
+// butterfly structure of iadst8x8_new_neon with every other input treated as
+// zero, so the add/sub stages degenerate into copies.
+//   bit       - cosine precision; products are rounded and shifted right by it
+//   do_cols   - non-zero: outputs are written directly (with sign flips);
+//               zero: outputs go through neg_shift_neon() with out_shift and
+//               the AOMMAX(16, bd + 6)-bit clamp bounds
+//   bd        - bit depth, used only for the clamp bounds above
+//   out_shift - extra right shift used when do_cols is 0
+static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                               int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  int32x4_t u[8], x;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-2
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  // u[1] = round_shift(-in[0] * cospi[4]). The product is subtracted from the
+  // rounding constant (as in iadst8x8_new_neon / iadst16x16_low1_neon);
+  // negating after the rounding constant has been added would flip the sign
+  // of the rounding term and leave the result one below the rounded value.
+  u[1] = vmlsq_n_s32(rnding, in[0], cospi[4]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  // stage 3-4
+  int32x4_t temp1, temp2;
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
+  temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
+  temp1 = vshlq_s32(temp1, v_bit);
+  u[4] = temp1;
+
+  temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
+  u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  // stage 5-6
+  // temp1 carries the rounding term for both the sum and the difference.
+  temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
+  x = vmulq_n_s32(u[1], cospi[32]);
+  u[2] = vaddq_s32(temp1, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(temp1, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
+  x = vmulq_n_s32(u[5], cospi[32]);
+  u[6] = vaddq_s32(temp1, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(temp1, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // stage 7
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[4]);
+    out[2] = u[6];
+    out[3] = vnegq_s32(u[2]);
+    out[4] = u[3];
+    out[5] = vnegq_s32(u[7]);
+    out[6] = u[5];
+    out[7] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+  }
+}
+
+// 8-point inverse ADST over eight int32x4_t vectors (in[0..7] -> out[0..7]).
+// Each cosine product pair is rounded with 1 << (bit - 1) and shifted right by
+// `bit` (vshlq_s32 with a negative count). Add/sub butterflies go through
+// addsub_neon(), which is handed the [clamp_lo, clamp_hi] bounds derived from
+// bd and do_cols. With do_cols set the results are stored directly (odd
+// outputs negated); otherwise neg_shift_neon() is given out_shift and the
+// AOMMAX(16, bd + 6)-bit clamp bounds to produce each output pair.
+static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ int32x4_t u[8], v[8], x;
+ // Negative shift count: vshlq_s32(v, v_bit) shifts right by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+ // stage 0-2
+ // Inputs are consumed in the pairs (7,0), (5,2), (3,4), (1,6).
+
+ u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
+ u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
+ u[0] = vshlq_s32(u[0], v_bit);
+
+ u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
+ u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
+ u[1] = vshlq_s32(u[1], v_bit);
+
+ // (2)
+ u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
+ u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
+ u[2] = vshlq_s32(u[2], v_bit);
+
+ u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
+ u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
+ u[3] = vshlq_s32(u[3], v_bit);
+
+ // (3)
+ u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
+ u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
+ u[4] = vshlq_s32(u[4], v_bit);
+
+ u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
+ u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
+ u[5] = vshlq_s32(u[5], v_bit);
+
+ // (4)
+ // Same rounded butterfly as above; the rounding term is added last here.
+ u[6] = vmulq_n_s32(in[1], cospi[52]);
+ u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
+ u[6] = vaddq_s32(u[6], rnding);
+ u[6] = vshlq_s32(u[6], v_bit);
+
+ u[7] = vmulq_n_s32(in[1], cospi[12]);
+ u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
+ u[7] = vaddq_s32(u[7], rnding);
+ u[7] = vshlq_s32(u[7], v_bit);
+
+ // stage 3
+ addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
+ u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
+ u[4] = vshlq_s32(u[4], v_bit);
+
+ u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
+ u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
+ u[5] = vshlq_s32(u[5], v_bit);
+
+ u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
+ u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
+ u[6] = vshlq_s32(u[6], v_bit);
+
+ u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
+ u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
+ u[7] = vshlq_s32(u[7], v_bit);
+
+ // stage 5
+ addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ // v[0] has already been copied to u[0], so it is reused as a scratch
+ // register holding the rounded cospi[32] product.
+ v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
+ x = vmulq_n_s32(v[3], cospi[32]);
+ u[2] = vaddq_s32(v[0], x);
+ u[2] = vshlq_s32(u[2], v_bit);
+
+ u[3] = vsubq_s32(v[0], x);
+ u[3] = vshlq_s32(u[3], v_bit);
+
+ v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
+ x = vmulq_n_s32(v[7], cospi[32]);
+ u[6] = vaddq_s32(v[0], x);
+ u[6] = vshlq_s32(u[6], v_bit);
+
+ u[7] = vsubq_s32(v[0], x);
+ u[7] = vshlq_s32(u[7], v_bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = vnegq_s32(u[4]);
+ out[2] = u[6];
+ out[3] = vnegq_s32(u[2]);
+ out[4] = u[3];
+ out[5] = vnegq_s32(u[7]);
+ out[6] = u[5];
+ out[7] = vnegq_s32(u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+ int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+ // Same output ordering as the do_cols branch, two outputs per call.
+ neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ }
+}
+
+// 16-point inverse DCT for the case where only in[0] is used: the rounded
+// product in[0] * cospi[32] is (optionally) shifted by out_shift, clamped and
+// replicated to out[0..15]. in[0] is overwritten with the same final value.
+static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  // Whichever pass this is, the range that ends up being applied to the
+  // result is AOMMAX(16, bd + 6) bits.
+  const int final_range = AOMMAX(16, bd + 6);
+  const int32x4_t lower = vdupq_n_s32(-(1 << (final_range - 1)));
+  const int32x4_t upper = vdupq_n_s32((1 << (final_range - 1)) - 1);
+  const int32x4_t round_bias = vdupq_n_s32(1 << (bit - 1));
+  const int32x4_t cos_shift = vdupq_n_s32(-bit);  // negative count: shift right
+
+  // stage 0-4
+  int32x4_t dc =
+      vshlq_s32(vmlaq_n_s32(round_bias, in[0], cospi[32]), cos_shift);
+
+  // stage 5-7
+  if (!do_cols && out_shift != 0) {
+    const int32x4_t shift_bias = vdupq_n_s32((1 << out_shift) >> 1);
+    dc = vshlq_s32(vaddq_s32(dc, shift_bias), vdupq_n_s32(-out_shift));
+  }
+
+  dc = vminq_s32(vmaxq_s32(dc, lower), upper);
+
+  // The input slot is updated too, then the value fans out to every row.
+  in[0] = dc;
+  for (int row = 0; row < 16; ++row) {
+    out[row] = dc;
+  }
+}
+
+// 16-point inverse DCT that reads only in[0..7] and writes out[0..15].
+// The butterfly network is evaluated in place in u[]. Single-input rotations
+// go through half_btf_0_neon_r() / half_btf_0_m_neon_r() and two-input ones
+// through the half_btf_neon*_r() helpers (defined elsewhere; presumably the
+// "_m" / "modeXY" suffixes select which coefficients are negated - confirm
+// against their definitions). Add/sub butterflies go through addsub_neon()
+// with the [clamp_lo, clamp_hi] bounds derived from bd and do_cols.
+// When do_cols is 0 the outputs are passed through round_shift_8x8() with
+// out_shift and clamped to the AOMMAX(16, bd + 6)-bit signed range.
+static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ // Negative shift count: vshlq_s32(v, v_bit) shifts right by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+ int32x4_t u[16], x, y;
+ // stage 0-1
+ // Only the even u[] slots are loaded; the odd slots are first written by
+ // the single-input rotations below.
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ // In each pair the partner slot is computed first because the second call
+ // overwrites the shared source operand.
+ u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+ u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+
+ u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+ u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+
+ u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+ u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+
+ u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+ u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+ // stage 3
+ u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+ u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+ u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
+ u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);
+
+ addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ // u[1] has no input of its own here, so both outputs of the cospi[32]
+ // butterfly equal the rounded u[0] product.
+ x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+ u[0] = vshlq_s32(x, v_bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+ u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+
+ addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ // The new u[9] / u[10] are parked in x / y until their partners have been
+ // computed from the old values.
+ x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+ &rnding);
+ u[14] =
+ half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+ u[9] = x;
+ y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
+ &rnding);
+ u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
+ &rnding);
+ u[10] = y;
+
+ // stage 5
+ addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ // y carries the rounding term for both the difference and the sum.
+ x = vmulq_n_s32(u[5], cospi[32]);
+ y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+ u[5] = vsubq_s32(y, x);
+ u[5] = vshlq_s32(u[5], v_bit);
+
+ u[6] = vaddq_s32(y, x);
+ u[6] = vshlq_s32(u[6], v_bit);
+
+ addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = vmulq_n_s32(u[10], cospi[32]);
+ y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+ u[10] = vsubq_s32(y, x);
+ u[10] = vshlq_s32(u[10], v_bit);
+
+ u[13] = vaddq_s32(x, y);
+ u[13] = vshlq_s32(u[13], v_bit);
+
+ x = vmulq_n_s32(u[11], cospi[32]);
+ y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+ u[11] = vsubq_s32(y, x);
+ u[11] = vshlq_s32(u[11], v_bit);
+
+ u[12] = vaddq_s32(x, y);
+ u[12] = vshlq_s32(u[12], v_bit);
+ // stage 7
+ // Final butterflies pair u[i] with u[15 - i] and write out[i] / out[15 - i].
+ addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+}
+
+// 16-point inverse ADST for the case where only in[0] is used; writes
+// out[0..15]. Because every other input is treated as zero, the add/sub
+// stages (3, 5, 7) reduce to plain copies and only the rotation stages
+// (2, 4, 6, 8) do arithmetic. Each rotation is rounded with 1 << (bit - 1)
+// and shifted right by `bit` (vshlq_s32 with a negative count).
+// With do_cols set the results are stored directly (odd outputs negated);
+// otherwise neg_shift_neon() is given out_shift and the
+// AOMMAX(16, bd + 6)-bit clamp bounds to produce each output pair.
+static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ int32x4_t v[16], x, y, temp1, temp2;
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+ v[0] = vshlq_s32(v[0], v_bit);
+
+ // rnding - in[0] * cospi[2]: the product is negated before rounding.
+ v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+ v[1] = vshlq_s32(v[1], v_bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ // Both results are built in temporaries so the second rotation still reads
+ // the original v[8] / v[9].
+ temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
+ temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
+ temp1 = vshlq_s32(temp1, v_bit);
+
+ temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
+ temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
+ temp2 = vshlq_s32(temp2, v_bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
+ temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
+ temp1 = vshlq_s32(temp1, v_bit);
+
+ temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
+ temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
+ temp2 = vshlq_s32(temp2, v_bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
+ temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
+ temp1 = vshlq_s32(temp1, v_bit);
+
+ temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
+ temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
+ temp2 = vshlq_s32(temp2, v_bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ // y carries the rounding term for both the sum and the difference of each
+ // cospi[32] butterfly.
+ y = vmlaq_n_s32(rnding, v[2], cospi[32]);
+ x = vmulq_n_s32(v[3], cospi[32]);
+ v[2] = vaddq_s32(y, x);
+ v[2] = vshlq_s32(v[2], v_bit);
+
+ v[3] = vsubq_s32(y, x);
+ v[3] = vshlq_s32(v[3], v_bit);
+
+ y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+ x = vmulq_n_s32(v[7], cospi[32]);
+ v[6] = vaddq_s32(y, x);
+ v[6] = vshlq_s32(v[6], v_bit);
+
+ v[7] = vsubq_s32(y, x);
+ v[7] = vshlq_s32(v[7], v_bit);
+
+ y = vmlaq_n_s32(rnding, v[10], cospi[32]);
+ x = vmulq_n_s32(v[11], cospi[32]);
+ v[10] = vaddq_s32(y, x);
+ v[10] = vshlq_s32(v[10], v_bit);
+
+ v[11] = vsubq_s32(y, x);
+ v[11] = vshlq_s32(v[11], v_bit);
+
+ y = vmlaq_n_s32(rnding, v[14], cospi[32]);
+ x = vmulq_n_s32(v[15], cospi[32]);
+ v[14] = vaddq_s32(y, x);
+ v[14] = vshlq_s32(v[14], v_bit);
+
+ v[15] = vsubq_s32(y, x);
+ v[15] = vshlq_s32(v[15], v_bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = vnegq_s32(v[8]);
+ out[2] = v[12];
+ out[3] = vnegq_s32(v[4]);
+ out[4] = v[6];
+ out[5] = vnegq_s32(v[14]);
+ out[6] = v[10];
+ out[7] = vnegq_s32(v[2]);
+ out[8] = v[3];
+ out[9] = vnegq_s32(v[11]);
+ out[10] = v[15];
+ out[11] = vnegq_s32(v[7]);
+ out[12] = v[5];
+ out[13] = vnegq_s32(v[13]);
+ out[14] = v[9];
+ out[15] = vnegq_s32(v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+ int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+ // Same output ordering as the do_cols branch, two outputs per call.
+ neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ }
+}
+
+// 16-point inverse ADST that reads only in[0..7] and writes out[0..15].
+// The butterfly network is evaluated in place in u[]:
+//   - rotation stages (2, 4, 6, 8) multiply by cospi[] entries, add the
+//     rounding constant 1 << (bit - 1) exactly once per output and shift
+//     right by `bit` (vshlq_s32 with a negative count);
+//   - add/sub stages (3, 5, 7) go through addsub_neon() with the
+//     [clamp_lo, clamp_hi] bounds derived from bd and do_cols.
+// With do_cols set the results are stored directly (odd outputs negated);
+// otherwise neg_shift_neon() is given out_shift and the
+// AOMMAX(16, bd + 6)-bit clamp bounds to produce each output pair.
+static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                 int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  int32x4_t u[16], x, y;
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stage 0-2
+  // Each input feeds two single-term rotations; vmlsq_n_s32(rnding, ...)
+  // yields the rounded negative product.
+  u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
+  u[0] = vshlq_s32(u[0], v_bit);
+
+  u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
+  u[1] = vshlq_s32(u[1], v_bit);
+
+  u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 3
+  addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 4
+  // In every rotation below, y captures the partial product that needs the
+  // OLD value of the first operand before that operand is overwritten.
+  y = vmlaq_n_s32(rnding, u[8], cospi[56]);
+  u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+  u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
+  u[8] = vshlq_s32(u[8], v_bit);
+
+  u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
+  u[9] = vshlq_s32(u[9], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[24]);
+  u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+  u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[12], cospi[8]);
+  u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[40]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 5
+  addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+  addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+  addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 6
+  y = vmlaq_n_s32(rnding, u[4], cospi[48]);
+  u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+  u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
+  u[4] = vshlq_s32(u[4], v_bit);
+
+  u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
+  u[5] = vshlq_s32(u[5], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[16]);
+  u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
+  u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  // Same rotation as u[4]/u[5]; the u[12] sum must start from the rounding
+  // constant like every other butterfly output, otherwise the shift
+  // truncates instead of rounding.
+  y = vmlaq_n_s32(rnding, u[12], cospi[48]);
+  u[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
+  u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
+  u[12] = vshlq_s32(u[12], v_bit);
+
+  u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
+  u[13] = vshlq_s32(u[13], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[16]);
+  u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
+  u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 7
+  addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+  addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+  addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+  addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+  addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+  addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+  addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+  addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+  // stage 8
+  // y carries the rounding term for both the sum and the difference of each
+  // cospi[32] butterfly.
+  y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+  x = vmulq_n_s32(u[3], cospi[32]);
+  u[2] = vaddq_s32(y, x);
+  u[2] = vshlq_s32(u[2], v_bit);
+
+  u[3] = vsubq_s32(y, x);
+  u[3] = vshlq_s32(u[3], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+  x = vmulq_n_s32(u[7], cospi[32]);
+  u[6] = vaddq_s32(y, x);
+  u[6] = vshlq_s32(u[6], v_bit);
+
+  u[7] = vsubq_s32(y, x);
+  u[7] = vshlq_s32(u[7], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+  x = vmulq_n_s32(u[11], cospi[32]);
+  u[10] = vaddq_s32(y, x);
+  u[10] = vshlq_s32(u[10], v_bit);
+
+  u[11] = vsubq_s32(y, x);
+  u[11] = vshlq_s32(u[11], v_bit);
+
+  y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+  x = vmulq_n_s32(u[15], cospi[32]);
+  u[14] = vaddq_s32(y, x);
+  u[14] = vshlq_s32(u[14], v_bit);
+
+  u[15] = vsubq_s32(y, x);
+  u[15] = vshlq_s32(u[15], v_bit);
+
+  // stage 9
+  if (do_cols) {
+    out[0] = u[0];
+    out[1] = vnegq_s32(u[8]);
+    out[2] = u[12];
+    out[3] = vnegq_s32(u[4]);
+    out[4] = u[6];
+    out[5] = vnegq_s32(u[14]);
+    out[6] = u[10];
+    out[7] = vnegq_s32(u[2]);
+    out[8] = u[3];
+    out[9] = vnegq_s32(u[11]);
+    out[10] = u[15];
+    out[11] = vnegq_s32(u[7]);
+    out[12] = u[5];
+    out[13] = vnegq_s32(u[13]);
+    out[14] = u[9];
+    out[15] = vnegq_s32(u[1]);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+    const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+    const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+    int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
+    // Same output ordering as the do_cols branch, two outputs per call.
+    neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                   &v_shift, &offset);
+    neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+    neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
+                   &clamp_hi_out, &v_shift, &offset);
+  }
+}
+
+static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ int32x4_t u[16], v[16], x, y;
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ {
+ // stage 0-1
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
+ &rnding);
+ v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
+ &rnding);
+ v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
+ &v_bit, &rnding);
+ v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
+ &v_bit, &rnding);
+ v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
+ &rnding);
+ v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
+ &rnding);
+ v[14] =
+ half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
+ v[15] =
+ half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
+ &rnding);
+ u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
+ &rnding);
+ u[6] =
+ half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
+ u[7] =
+ half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
+ addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = vmlaq_n_s32(rnding, u[0], cospi[32]);
+ y = vmulq_n_s32(u[1], cospi[32]);
+ v[0] = vaddq_s32(x, y);
+ v[0] = vshlq_s32(v[0], v_bit);
+
+ v[1] = vsubq_s32(x, y);
+ v[1] = vshlq_s32(v[1], v_bit);
+
+ v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
+ &rnding);
+ v[3] =
+ half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
+ addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+ &rnding);
+ v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+ &v_bit, &rnding);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+ &v_bit, &rnding);
+ v[14] =
+ half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+ v[15] = u[15];
+
+ // stage 5
+ addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = vmulq_n_s32(v[5], cospi[32]);
+ y = vmlaq_n_s32(rnding, v[6], cospi[32]);
+ u[5] = vsubq_s32(y, x);
+ u[5] = vshlq_s32(u[5], v_bit);
+
+ u[6] = vaddq_s32(y, x);
+ u[6] = vshlq_s32(u[6], v_bit);
+
+ u[7] = v[7];
+ addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = vmulq_n_s32(u[10], cospi[32]);
+ y = vmlaq_n_s32(rnding, u[13], cospi[32]);
+ v[10] = vsubq_s32(y, x);
+ v[10] = vshlq_s32(v[10], v_bit);
+
+ v[13] = vaddq_s32(x, y);
+ v[13] = vshlq_s32(v[13], v_bit);
+
+ x = vmulq_n_s32(u[11], cospi[32]);
+ y = vmlaq_n_s32(rnding, u[12], cospi[32]);
+ v[11] = vsubq_s32(y, x);
+ v[11] = vshlq_s32(v[11], v_bit);
+
+ v[12] = vaddq_s32(x, y);
+ v[12] = vshlq_s32(v[12], v_bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out =
+ vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+
+static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ const int32x4_t zero = vdupq_n_s32(0);
+ const int32x4_t v_bit = vdupq_n_s32(-bit); // negative count: vshlq_s32 by v_bit is a right shift by `bit`
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // rounding bias added before each right shift by `bit`
+ int32x4_t u[16], v[16], x, y;
+ // 16-point inverse ADST (per the function name) over in[0..15]; every op is lane-wise, so columns 0, 1, 2, 3 are calculated together
+ // stage 0: no code is emitted for this stage
+ // stage 1: no code is emitted for this stage; stage 2 reads in[] directly
+ // stage 2: eight rotations; for inputs (a, b) and weights (c0, c1): v[2k] = (a*c0 + b*c1 + rnding) >> bit, v[2k+1] = (a*c1 - b*c0 + rnding) >> bit
+ v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
+ v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
+ v[0] = vshlq_s32(v[0], v_bit);
+
+ v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
+ v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
+ v[1] = vshlq_s32(v[1], v_bit);
+
+ v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
+ v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
+ v[2] = vshlq_s32(v[2], v_bit);
+
+ v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
+ v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
+ v[3] = vshlq_s32(v[3], v_bit);
+
+ v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
+ v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
+ v[4] = vshlq_s32(v[4], v_bit);
+
+ v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
+ v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
+ v[5] = vshlq_s32(v[5], v_bit);
+
+ v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
+ v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
+ v[6] = vshlq_s32(v[6], v_bit);
+
+ v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
+ v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
+ v[7] = vshlq_s32(v[7], v_bit);
+
+ v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
+ v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
+ v[8] = vshlq_s32(v[8], v_bit);
+
+ v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
+ v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
+ v[9] = vshlq_s32(v[9], v_bit);
+
+ v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
+ v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
+ v[10] = vshlq_s32(v[10], v_bit);
+
+ v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
+ v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
+ v[11] = vshlq_s32(v[11], v_bit);
+
+ v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
+ v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
+ v[12] = vshlq_s32(v[12], v_bit);
+
+ v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
+ v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
+ v[13] = vshlq_s32(v[13], v_bit);
+
+ v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
+ v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
+ v[14] = vshlq_s32(v[14], v_bit);
+
+ v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
+ v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
+ v[15] = vshlq_s32(v[15], v_bit);
+
+ // stage 3: pair v[i] with v[i + 8] through addsub_neon (defined outside this view; presumably clamped sum and difference - confirm)
+ addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4: v[0..7] pass through; v[8..15] are rotated in pairs with weights cospi[8]/cospi[56] and cospi[40]/cospi[24]
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
+ v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
+ v[8] = vshlq_s32(v[8], v_bit);
+
+ v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
+ v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
+ v[9] = vshlq_s32(v[9], v_bit);
+
+ v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
+ v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
+ v[10] = vshlq_s32(v[10], v_bit);
+
+ v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
+ v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
+ v[11] = vshlq_s32(v[11], v_bit);
+
+ v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
+ v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
+ v[12] = vshlq_s32(v[12], v_bit);
+
+ v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
+ v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]); // subtracting a negated weight: adds u[13] * cospi[56]
+ v[13] = vshlq_s32(v[13], v_bit);
+
+ v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
+ v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
+ v[14] = vshlq_s32(v[14], v_bit);
+
+ v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
+ v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]); // subtracting a negated weight: adds u[15] * cospi[24]
+ v[15] = vshlq_s32(v[15], v_bit);
+
+ // stage 5: pair v[i] with v[i + 4] inside each half (0..7 and 8..15) through addsub_neon
+ addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6: v[0..3] and v[8..11] pass through; pairs (4,5), (6,7), (12,13), (14,15) are rotated with weights cospi[16]/cospi[48]
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
+ v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
+ v[4] = vshlq_s32(v[4], v_bit);
+
+ v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
+ v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
+ v[5] = vshlq_s32(v[5], v_bit);
+
+ v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
+ v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
+ v[6] = vshlq_s32(v[6], v_bit);
+
+ v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
+ v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
+ v[7] = vshlq_s32(v[7], v_bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
+ v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
+ v[12] = vshlq_s32(v[12], v_bit);
+
+ v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
+ v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
+ v[13] = vshlq_s32(v[13], v_bit);
+
+ v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
+ v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
+ v[14] = vshlq_s32(v[14], v_bit);
+
+ v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
+ v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
+ v[15] = vshlq_s32(v[15], v_bit);
+
+ // stage 7: pair v[i] with v[i + 2] inside each group of four through addsub_neon
+ addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8: pairs (2,3), (6,7), (10,11), (14,15) become ((a + b) * cospi[32] + rnding) >> bit and ((a - b) * cospi[32] + rnding) >> bit; the rest pass through
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = vmlaq_n_s32(rnding, u[2], cospi[32]);
+ x = vmulq_n_s32(u[3], cospi[32]);
+ v[2] = vaddq_s32(y, x);
+ v[2] = vshlq_s32(v[2], v_bit);
+
+ v[3] = vsubq_s32(y, x);
+ v[3] = vshlq_s32(v[3], v_bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = vmlaq_n_s32(rnding, u[6], cospi[32]);
+ x = vmulq_n_s32(u[7], cospi[32]);
+ v[6] = vaddq_s32(y, x);
+ v[6] = vshlq_s32(v[6], v_bit);
+
+ v[7] = vsubq_s32(y, x);
+ v[7] = vshlq_s32(v[7], v_bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = vmlaq_n_s32(rnding, u[10], cospi[32]);
+ x = vmulq_n_s32(u[11], cospi[32]);
+ v[10] = vaddq_s32(y, x);
+ v[10] = vshlq_s32(v[10], v_bit);
+
+ v[11] = vsubq_s32(y, x);
+ v[11] = vshlq_s32(v[11], v_bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = vmlaq_n_s32(rnding, u[14], cospi[32]);
+ x = vmulq_n_s32(u[15], cospi[32]);
+ v[14] = vaddq_s32(y, x);
+ v[14] = vshlq_s32(v[14], v_bit);
+
+ v[15] = vsubq_s32(y, x);
+ v[15] = vshlq_s32(v[15], v_bit);
+
+ // stage 9: output permutation; in the column pass every odd-indexed output is the negation (0 - v) of its source
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = vsubq_s32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = vsubq_s32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = vsubq_s32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = vsubq_s32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = vsubq_s32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = vsubq_s32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = vsubq_s32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = vsubq_s32(zero, v[1]);
+ } else { // row pass: same source/destination order, routed through neg_shift_neon with the out_shift rounding offset and the bd + 6 clamp
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ const int32x4_t v_shift = vdupq_n_s32(-out_shift);
+ int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); // half of 1 << out_shift
+ neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ &v_shift, &offset);
+ neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, &v_shift, &offset);
+ }
+}
+
+// 16-point identity transform: every 32-bit lane of in[0..15] is multiplied by
+// 2 * NewSqrt2 in 64-bit precision, rounded, and shifted right by NewSqrt2Bits.
+// `bit` is unused. For the row pass (do_cols == 0) the result is additionally
+// passed through round_shift_8x8 with out_shift and clamped to the
+// AOMMAX(16, bd + 6)-bit signed range.
+static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  (void)bit;
+  const int32x2_t scale = vdup_n_s32(2 * NewSqrt2);
+  const int32x4_t zeros = vdupq_n_s32(0);
+  const int64x2_t round = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
+
+  for (int row = 0; row < 16; ++row) {
+    const int32x4_t src = in[row];
+
+    // Narrowing the 64-bit view keeps lanes 0 and 2; rotating the vector by
+    // one lane first exposes lanes 1 and 3 the same way.
+    const int32x2_t even_lanes = vmovn_s64(vreinterpretq_s64_s32(src));
+    const int32x2_t odd_lanes =
+        vmovn_s64(vreinterpretq_s64_s32(vextq_s32(src, zeros, 1)));
+
+    // Widening multiply-accumulate onto the rounding bias, then scale down.
+    const int64x2_t even_scaled =
+        vshrq_n_s64(vmlal_s32(round, even_lanes, scale), NewSqrt2Bits);
+    const int64x2_t odd_scaled =
+        vshrq_n_s64(vmlal_s32(round, odd_lanes, scale), NewSqrt2Bits);
+
+    // Interleave so the low 32 bits of the four products end up in the low
+    // halves of mixed.val[0] and mixed.val[1], then join those halves.
+    const int32x4x2_t mixed = vzipq_s32(vreinterpretq_s32_s64(even_scaled),
+                                        vreinterpretq_s32_s64(odd_scaled));
+#if AOM_ARCH_AARCH64
+    out[row] = vreinterpretq_s32_s64(
+        vzip1q_s64(vreinterpretq_s64_s32(mixed.val[0]),
+                   vreinterpretq_s64_s32(mixed.val[1])));
+#else
+    out[row] =
+        vextq_s32(vextq_s32(mixed.val[0], mixed.val[0], 2), mixed.val[1], 2);
+#endif
+  }
+
+  // The column pass stops here; only the row pass rounds and clamps.
+  if (do_cols) return;
+
+  const int range_bits = AOMMAX(16, bd + 6);
+  const int32x4_t lo_bound = vdupq_n_s32(-(1 << (range_bits - 1)));
+  const int32x4_t hi_bound = vdupq_n_s32((1 << (range_bits - 1)) - 1);
+  round_shift_8x8(out, out_shift);
+  highbd_clamp_s32_neon(out, out, &lo_bound, &hi_bound, 16);
+}
+
+// Stage 8 of the 64-point inverse DCT, operating in place on u[0..63].
+// Each rotated pair reads only its own two elements, so the pairs are handled
+// one at a time: both outputs are computed from the old values before either
+// element is overwritten.
+static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int k;
+
+  // Pairs (10, 13) and (11, 12), both weights cospi[32].
+  for (k = 0; k < 2; ++k) {
+    int32x4_t *const lo = &u[10 + k];
+    int32x4_t *const hi = &u[13 - k];
+    const int32x4_t lo_next =
+        half_btf_neon_mode10_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *hi = half_btf_neon_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *lo = lo_next;
+  }
+
+  // Add/sub butterflies: (16..19 with 23..20) and (31..28 with 24..27).
+  for (k = 0; k < 4; ++k) {
+    addsub_neon(u[16 + k], u[23 - k], &u[16 + k], &u[23 - k], clamp_lo,
+                clamp_hi);
+    addsub_neon(u[31 - k], u[24 + k], &u[31 - k], &u[24 + k], clamp_lo,
+                clamp_hi);
+  }
+
+  // Pairs (36, 59) .. (39, 56), weights cospi[16] / cospi[48].
+  for (k = 0; k < 4; ++k) {
+    int32x4_t *const lo = &u[36 + k];
+    int32x4_t *const hi = &u[59 - k];
+    const int32x4_t lo_next =
+        half_btf_neon_mode10_r(&cospi[16], lo, &cospi[48], hi, v_bit, rnding);
+    *hi = half_btf_neon_r(&cospi[48], lo, &cospi[16], hi, v_bit, rnding);
+    *lo = lo_next;
+  }
+
+  // Pairs (40, 55) .. (43, 52), weights cospi[48] / cospi[16].
+  for (k = 0; k < 4; ++k) {
+    int32x4_t *const lo = &u[40 + k];
+    int32x4_t *const hi = &u[55 - k];
+    const int32x4_t lo_next =
+        half_btf_neon_mode11_r(&cospi[48], lo, &cospi[16], hi, v_bit, rnding);
+    *hi = half_btf_neon_mode10_r(&cospi[16], lo, &cospi[48], hi, v_bit, rnding);
+    *lo = lo_next;
+  }
+}
+
+// Stage 9 of the 64-point inverse DCT, operating in place on u[0..63]:
+// add/sub butterflies over u[0..15], cospi[32] rotations of the pairs
+// (20, 27) .. (23, 24), then add/sub butterflies over u[32..47] and u[48..63].
+static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
+                                      const int32x4_t *clamp_lo,
+                                      const int32x4_t *clamp_hi,
+                                      const int32x4_t *v_bit,
+                                      const int32x4_t *rnding) {
+  int k;
+
+  // u[k] with its mirror u[15 - k].
+  for (k = 0; k < 8; ++k) {
+    addsub_neon(u[k], u[15 - k], &u[k], &u[15 - k], clamp_lo, clamp_hi);
+  }
+
+  // Each pair reads only its own two elements; compute both results from the
+  // old values before storing either.
+  for (k = 0; k < 4; ++k) {
+    int32x4_t *const lo = &u[20 + k];
+    int32x4_t *const hi = &u[27 - k];
+    const int32x4_t lo_next =
+        half_btf_neon_mode10_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *hi = half_btf_neon_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *lo = lo_next;
+  }
+
+  // u[32 + k] with u[47 - k] (sum stays in the lower index), and
+  // u[63 - k] with u[48 + k] (sum stays in the upper index).
+  for (k = 0; k < 8; ++k) {
+    addsub_neon(u[32 + k], u[47 - k], &u[32 + k], &u[47 - k], clamp_lo,
+                clamp_hi);
+  }
+  for (k = 0; k < 8; ++k) {
+    addsub_neon(u[63 - k], u[48 + k], &u[63 - k], &u[48 + k], clamp_lo,
+                clamp_hi);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT, operating in place on u[0..63]:
+// add/sub butterflies of u[k] with u[31 - k], followed by cospi[32] rotations
+// of the eight pairs (40, 55), (41, 54), ..., (47, 48).
+static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi,
+                                       const int32x4_t *v_bit,
+                                       const int32x4_t *rnding) {
+  for (int k = 0; k < 16; ++k) {
+    addsub_neon(u[k], u[31 - k], &u[k], &u[31 - k], clamp_lo, clamp_hi);
+  }
+
+  // Each pair reads only its own two elements; compute both results from the
+  // old values before storing either.
+  for (int k = 0; k < 8; ++k) {
+    int32x4_t *const lo = &u[40 + k];
+    int32x4_t *const hi = &u[55 - k];
+    const int32x4_t lo_next =
+        half_btf_neon_mode10_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *hi = half_btf_neon_r(&cospi[32], lo, &cospi[32], hi, v_bit, rnding);
+    *lo = lo_next;
+  }
+}
+
+// Final stage of the 64-point inverse DCT: combines u[k] with u[63 - k] into
+// out[k] and out[63 - k]. For the row pass (do_cols == 0) the 64 outputs are
+// then rounded by out_shift and clamped to the AOMMAX(16, bd + 6)-bit signed
+// range, four vectors at a time.
+static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
+                                       int do_cols, int bd, int out_shift,
+                                       const int32x4_t *clamp_lo,
+                                       const int32x4_t *clamp_hi) {
+  for (int k = 0; k < 32; ++k) {
+    addsub_neon(u[k], u[63 - k], &out[k], &out[63 - k], clamp_lo, clamp_hi);
+  }
+
+  // The column pass needs no output rounding.
+  if (do_cols) return;
+
+  const int range_bits = AOMMAX(16, bd + 6);
+  const int32x4_t lo_bound = vdupq_n_s32(-(1 << (range_bits - 1)));
+  const int32x4_t hi_bound = vdupq_n_s32((1 << (range_bits - 1)) - 1);
+  for (int group = 0; group < 16; ++group) {
+    int32x4_t *const quad = out + 4 * group;
+    round_shift_4x4(quad, out_shift);
+    highbd_clamp_s32_neon(quad, quad, &lo_bound, &hi_bound, 4);
+  }
+}
+
+// 64-point inverse DCT for the case where only in[0] is read: the single
+// value is scaled by cospi[32], optionally rounded by out_shift (row pass
+// only), clamped, and written to all 64 output vectors.
+static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+  // stages 1-6 reduce to a single scaling of in[0]
+  int32x4_t dc = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
+
+  // stages 8-11: pick the clamp range; the row pass uses the output range
+  // and applies the rounding shift first.
+  int range_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  if (!do_cols) {
+    range_bits = AOMMAX(16, bd + 6);
+    if (out_shift != 0) {
+      const int32x4_t half = vdupq_n_s32((1 << out_shift) >> 1);
+      dc = vshlq_s32(vaddq_s32(dc, half), vdupq_n_s32(-out_shift));
+    }
+  }
+  const int32x4_t lo_bound = vdupq_n_s32(-(1 << (range_bits - 1)));
+  const int32x4_t hi_bound = vdupq_n_s32((1 << (range_bits - 1)) - 1);
+  dc = vminq_s32(vmaxq_s32(dc, lo_bound), hi_bound);
+
+  for (int k = 0; k < 64; ++k) {
+    out[k] = dc;
+  }
+}
+
+// 64-point inverse DCT for the case where only in[0..7] are read.
+//
+//   in        - input vectors; only in[0..7] are accessed.
+//   out       - receives 64 output vectors (written by idct64_stage11_neon).
+//   bit       - selects the cosine table via cospi_arr() and the rounding
+//               shift applied after each multiply.
+//   do_cols   - non-zero for the column pass (intermediate clamp range
+//               bd + 6), zero for the row pass (bd + 8, plus output rounding
+//               by out_shift in stage 11).
+//   bd        - bit depth used to size the clamp ranges.
+//   out_shift - rounding shift forwarded to stage 11.
+//
+// Stages 1-7 are specialised here; stages 8-11 are delegated to the shared
+// idct64_stage{8,9,10,11}_neon helpers.
+static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  {
+    int32x4_t u[64];
+
+    // stage 1
+    u[0] = in[0];
+    u[8] = in[4];
+    u[16] = in[2];
+    u[24] = in[6];
+    u[32] = in[1];
+    u[40] = in[5];
+    u[48] = in[3];
+    u[56] = in[7];
+
+    // stage 2
+    // Each source slot is read for its partner before being overwritten.
+    u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+    u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+    u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+    u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+    u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+    u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+    u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+    u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+
+    // stage 3
+    u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+    u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+    u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+    u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+    u[33] = u[32];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[62] = u[63];
+
+    // stage 4
+    int32x4_t temp1, temp2;
+    u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+    u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+    u[17] = u[16];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[30] = u[31];
+
+    // For every rotated pair below, the first result is parked in a
+    // temporary so the second result is still computed from the old values.
+    temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+                                   &v_bit, &rnding);
+    u[62] =
+        half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+    u[33] = temp1;
+
+    temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+                                   &v_bit, &rnding);
+    u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+                                   &v_bit, &rnding);
+    u[57] = temp2;
+
+    temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+                                   &v_bit, &rnding);
+    u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+                            &rnding);
+    u[41] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+                                   &v_bit, &rnding);
+    u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+                                   &v_bit, &rnding);
+    u[46] = temp2;
+
+    // stage 5
+    u[9] = u[8];
+    u[14] = u[15];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
+                                   &v_bit, &rnding);
+    u[30] =
+        half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+    u[17] = temp1;
+
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+                                   &v_bit, &rnding);
+    u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+                                   &v_bit, &rnding);
+    u[22] = temp2;
+
+    u[35] = u[32];
+    u[34] = u[33];
+    u[36] = u[39];
+    u[37] = u[38];
+    u[43] = u[40];
+    u[42] = u[41];
+    u[44] = u[47];
+    u[45] = u[46];
+    u[51] = u[48];
+    u[50] = u[49];
+    u[52] = u[55];
+    u[53] = u[54];
+    u[59] = u[56];
+    u[58] = u[57];
+    u[60] = u[63];
+    u[61] = u[62];
+
+    // stage 6
+    // u[0] and u[1] are the same scaling of the old u[0]; compute it once
+    // instead of issuing the identical call twice.
+    u[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+    u[1] = u[0];
+
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
+                                   &v_bit, &rnding);
+    u[14] =
+        half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+    u[9] = temp2;
+    u[19] = u[16];
+    u[18] = u[17];
+    u[20] = u[23];
+    u[21] = u[22];
+    u[27] = u[24];
+    u[26] = u[25];
+    u[28] = u[31];
+    u[29] = u[30];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+                                   &v_bit, &rnding);
+    u[61] =
+        half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+    u[34] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+                                   &v_bit, &rnding);
+    u[60] =
+        half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+    u[35] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+                                   &v_bit, &rnding);
+    u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+                                   &v_bit, &rnding);
+    u[36] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+                                   &v_bit, &rnding);
+    u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+                                   &v_bit, &rnding);
+    u[37] = temp2;
+    temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+                                   &v_bit, &rnding);
+    u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+                            &rnding);
+    u[42] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+                                   &v_bit, &rnding);
+    u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+                            &rnding);
+    u[43] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+                                   &v_bit, &rnding);
+    u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+                                   &v_bit, &rnding);
+    u[44] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+                                   &v_bit, &rnding);
+    u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+                                   &v_bit, &rnding);
+    u[45] = temp2;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    u[11] = u[8];
+    u[10] = u[9];
+    u[12] = u[15];
+    u[13] = u[14];
+
+    temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+                                   &v_bit, &rnding);
+    u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+                            &rnding);
+    u[18] = temp1;
+    temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+                                   &v_bit, &rnding);
+    u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+                            &rnding);
+    u[19] = temp2;
+    temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+                                   &v_bit, &rnding);
+    u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+                                   &v_bit, &rnding);
+    u[20] = temp1;
+    temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+                                   &v_bit, &rnding);
+    u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+                                   &v_bit, &rnding);
+    u[21] = temp2;
+    // Add/sub butterflies within each 16-wide group of u[32..63]:
+    // j pairs with j ^ 7, and j ^ 15 pairs with j ^ 8.
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                    &clamp_hi);
+      }
+    }
+
+    // stage 8
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+
+    idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 9
+    idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 10
+    idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+    // stage 11
+    idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ {
+ int32x4_t u[64];
+ int32x4_t tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+ u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+ u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+ u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+ u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+ u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+ u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+ u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+ u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+ u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+ u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+ u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+ u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+ u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+ u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+ u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+
+ // stage 3
+ u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
+ u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
+ u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
+ u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
+ u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
+ u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
+ u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
+ u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+ u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+ u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+ u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
+ &rnding);
+ tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
+ &rnding);
+ tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+ &v_bit, &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+ &v_bit, &rnding);
+ u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+ &v_bit, &rnding);
+ u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+ &rnding);
+ u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+ &v_bit, &rnding);
+ u[62] =
+ half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+ &v_bit, &rnding);
+ tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+ &v_bit, &rnding);
+ tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit,
+ &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+ &v_bit, &rnding);
+ u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+ &v_bit, &rnding);
+ u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+ &rnding);
+ u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+ &v_bit, &rnding);
+ u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+ &rnding);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
+ u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
+ &rnding);
+ tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
+ &rnding);
+ tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
+ &v_bit, &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
+ &v_bit, &rnding);
+ u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
+ &v_bit, &rnding);
+ u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
+ &rnding);
+ u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
+ &v_bit, &rnding);
+ u[30] =
+ half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+ u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+ &rnding);
+ u[14] =
+ half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+ u[9] = tmp1;
+ tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13],
+ &v_bit, &rnding);
+ u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+ &v_bit, &rnding);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
+ &rnding);
+ tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
+ &rnding);
+ tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
+ &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
+ &rnding);
+ u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+ &v_bit, &rnding);
+ u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+ &v_bit, &rnding);
+ u[60] =
+ half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+ u[61] =
+ half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+ &v_bit, &rnding);
+ tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+ &v_bit, &rnding);
+ tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+ &v_bit, &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+ &v_bit, &rnding);
+ u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+ &v_bit, &rnding);
+ u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+ &v_bit, &rnding);
+ u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+ &rnding);
+ u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+ &rnding);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
+ &rnding);
+ u[6] =
+ half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
+ u[5] = tmp1;
+ addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
+ &v_bit, &rnding);
+ tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
+ &v_bit, &rnding);
+ tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
+ &v_bit, &rnding);
+ tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
+ &v_bit, &rnding);
+ u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
+ &v_bit, &rnding);
+ u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
+ &v_bit, &rnding);
+ u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
+ &rnding);
+ u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
+ &rnding);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 9
+ idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 10
+ idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 11
+ idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+/*
+ * idct64x64_neon: 11-stage 64-point inverse DCT butterfly network operating on
+ * int32x4_t vectors.
+ *
+ * in        - input coefficient vectors; only in[0]..in[31] are read.
+ * out       - receives all 64 results, out[0]..out[63] (written in stage 11).
+ * bit       - selects the cosine table through cospi_arr(bit); it also sets
+ *             the rounding constant 1 << (bit - 1) and the vector -bit that
+ *             are handed to the half_btf_* helpers (presumably the rounding
+ *             right shift those helpers apply - they are defined elsewhere).
+ * do_cols   - non-zero: intermediates are clamped to a signed
+ *             AOMMAX(16, bd + 6)-bit range and no final shift is applied.
+ *             zero: intermediates are clamped to AOMMAX(16, bd + 8) bits, and
+ *             out is then run through round_shift_4x4(out_shift) and clamped
+ *             to AOMMAX(16, bd + 6) bits.
+ * bd        - presumably the sample bit depth; used here only to derive the
+ *             clamp ranges.
+ * out_shift - passed to round_shift_4x4; only used when do_cols is zero.
+ *
+ * The stages ping-pong between the local arrays u and v: every stage reads one
+ * array and writes the other, so no value is updated in place.
+ */
+static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+
+ {
+ int32x4_t u[64], v[64];
+
+ // stage 1: scatter in[0..31] into even-indexed slots of u / v (in[32..63] are never read)
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2 (u -> v): fill v[32..63]; each of the 16 loaded u[32..62] slots feeds two outputs
+ v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
+ v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
+ v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
+ v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
+ v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
+ v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
+ v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
+ v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
+ v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
+ v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
+ v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
+ v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
+ v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
+ v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
+ v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
+ v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
+ v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
+ v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
+ v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
+ v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
+ v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
+ v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
+ v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
+ v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
+ v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
+ v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
+ v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
+ v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
+ v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
+ v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
+ v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
+ v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
+
+ // stage 3 (v -> u): fill u[16..31] from the 8 loaded v[16..30] slots; add/sub pairs in 32..63
+ u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
+ u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
+ u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
+ u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
+ u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
+ u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
+ u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
+ u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
+ u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
+ u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
+ u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
+ u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
+ u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
+ u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
+ u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
+ u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4 (u -> v): fill v[8..15] from the 4 loaded u[8..14] slots; add/sub pairs in 16..31
+ v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
+ v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
+ v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
+ v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
+ v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
+ v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
+ v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
+ v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {  // outer slots of each group of four pass through unchanged
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
+ &v_bit, &rnding);
+ v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
+ &v_bit, &rnding);
+ v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
+ &v_bit, &rnding);
+ v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
+ &v_bit, &rnding);
+ v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
+ &v_bit, &rnding);
+ v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
+ &v_bit, &rnding);
+ v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
+ &v_bit, &rnding);
+ v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
+ &v_bit, &rnding);
+ v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
+ &v_bit, &rnding);
+ v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
+ &rnding);
+ v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
+ &v_bit, &rnding);
+ v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
+ &rnding);
+ v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
+ &v_bit, &rnding);
+ v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
+ &rnding);
+ v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
+ &v_bit, &rnding);
+ v[62] =
+ half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
+
+ // stage 5 (v -> u): fill u[4..7] from v[4] and v[6] (loaded in stage 1); add/sub pairs in 8..15
+ u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
+ u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
+ u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
+ u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {  // outer slots of each group of four pass through unchanged
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
+ &v_bit, &rnding);
+ u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
+ &v_bit, &rnding);
+ u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
+ &v_bit, &rnding);
+ u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
+ &v_bit, &rnding);
+ u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
+ &v_bit, &rnding);
+ u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
+ &rnding);
+ u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
+ &v_bit, &rnding);
+ u[30] =
+ half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6 (u -> v): fill v[0..3] from u[0] and u[2] (loaded in stage 1); add/sub pairs in 4..7
+ v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
+ v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);  // same arguments as the v[0] line above
+ v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
+ v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
+
+ addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
+ &rnding);
+ v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
+ &v_bit, &rnding);
+ v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
+ &v_bit, &rnding);
+ v[14] =
+ half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {  // slots 0, 1, 6, 7 of each group of eight pass through unchanged
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
+ &v_bit, &rnding);
+ v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
+ &v_bit, &rnding);
+ v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
+ &v_bit, &rnding);
+ v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
+ &v_bit, &rnding);
+ v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
+ &v_bit, &rnding);
+ v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
+ &v_bit, &rnding);
+ v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
+ &v_bit, &rnding);
+ v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
+ &v_bit, &rnding);
+ v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
+ &v_bit, &rnding);
+ v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
+ &v_bit, &rnding);
+ v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
+ &rnding);
+ v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
+ &rnding);
+ v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
+ &v_bit, &rnding);
+ v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
+ &v_bit, &rnding);
+ v[60] =
+ half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
+ v[61] =
+ half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
+
+ // stage 7 (v -> u)
+ addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
+ &rnding);
+ u[6] =
+ half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);
+
+ addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
+ &v_bit, &rnding);
+ u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
+ &v_bit, &rnding);
+ u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
+ &v_bit, &rnding);
+ u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
+ &v_bit, &rnding);
+ u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
+ &v_bit, &rnding);
+ u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
+ &v_bit, &rnding);
+ u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
+ &rnding);
+ u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
+ &rnding);
+
+ for (i = 32; i < 64; i += 16) {  // j ^ 7 and j ^ 15 / j ^ 8 mirror j within its group of 8 / 16
+ for (j = i; j < i + 4; j++) {
+ addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8 (u -> v)
+ for (i = 0; i < 4; ++i) {
+ addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
+ &v_bit, &rnding);
+ v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
+ &v_bit, &rnding);
+ v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
+ &rnding);
+ v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
+ &rnding);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {  // 32..35, 44..51 and 60..63 pass through unchanged
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
+ &v_bit, &rnding);
+ v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
+ &v_bit, &rnding);
+ v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
+ &v_bit, &rnding);
+ v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
+ &v_bit, &rnding);
+ v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
+ &v_bit, &rnding);
+ v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
+ &v_bit, &rnding);
+ v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
+ &v_bit, &rnding);
+ v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
+ &v_bit, &rnding);
+ v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
+ &v_bit, &rnding);
+ v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
+ &v_bit, &rnding);
+ v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
+ &v_bit, &rnding);
+ v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
+ &v_bit, &rnding);
+ v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
+ &rnding);
+ v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
+ &rnding);
+ v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
+ &rnding);
+ v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
+ &rnding);
+
+ // stage 9 (v -> u)
+ for (i = 0; i < 8; ++i) {
+ addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {  // 16..19 and 28..31 pass through unchanged
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
+ &v_bit, &rnding);
+ u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
+ &v_bit, &rnding);
+ u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
+ &v_bit, &rnding);
+ u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
+ &v_bit, &rnding);
+ u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
+ &rnding);
+ u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
+ &rnding);
+ u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
+ &rnding);
+ u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
+ &rnding);
+
+ for (i = 32; i < 40; i++) {
+ addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10 (u -> v)
+ for (i = 0; i < 16; i++) {
+ addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
+ &v_bit, &rnding);
+ v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
+ &v_bit, &rnding);
+ v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
+ &v_bit, &rnding);
+ v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
+ &v_bit, &rnding);
+ v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
+ &v_bit, &rnding);
+ v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
+ &v_bit, &rnding);
+ v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
+ &v_bit, &rnding);
+ v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
+ &v_bit, &rnding);
+ v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
+ &rnding);
+ v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
+ &rnding);
+ v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
+ &rnding);
+ v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
+ &rnding);
+ v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
+ &rnding);
+ v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
+ &rnding);
+ v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
+ &rnding);
+ v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
+ &rnding);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11 (v -> out): final add/sub of v[i] with v[63 - i], written straight to out
+ for (i = 0; i < 32; i++) {
+ addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {  // non-column pass only: apply out_shift, then clamp to the narrower bd + 6 range
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out =
+ vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ for (i = 0; i < 64; i += 4) {
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
+ 4);
+ }
+ }
+ }
+}
+
+// DC-only variant of the 32-point inverse DCT: only in[0] is read, and the one
+// resulting vector is replicated into out[0]..out[31].
+//
+//   in        - input coefficient vectors; only in[0] is used.
+//   out       - receives 32 identical vectors.
+//   bit       - selects the cosine table via cospi_arr(bit) and sets the
+//               rounding constant / shift vector given to half_btf_0_neon_r.
+//   do_cols   - zero: additionally apply the rounding right shift by out_shift
+//               before the final clamp; non-zero: no shift.
+//   bd        - used to derive the clamp range (presumably the bit depth).
+//   out_shift - rounding right shift, only applied when do_cols is zero.
+//
+// Both passes end up clamping to the same signed AOMMAX(16, bd + 6)-bit range,
+// so the range is computed once and the clamp is applied exactly once. This
+// drops a second, idempotent clamp on the column path and a set of bounds that
+// was overwritten before use on the row path; results are unchanged.
+static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
+                                int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const int32x4_t v_bit = vdupq_n_s32(-bit);
+  const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
+  const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+  int32x4_t bf1;
+
+  // stage 0-1
+  bf1 = in[0];
+
+  // stage 2-5
+  bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
+
+  // stage 6-9
+  if (!do_cols && out_shift != 0) {
+    // vrshlq_s32 with a negative count is a rounding right shift.
+    bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
+  }
+
+  bf1 = vmaxq_s32(bf1, clamp_lo);
+  bf1 = vminq_s32(bf1, clamp_hi);
+
+  for (int i = 0; i < 32; i++) out[i] = bf1;
+}
+/*
+ * idct32x32_low8_neon: 32-point inverse DCT variant that reads only the first
+ * eight input vectors, in[0]..in[7].
+ *
+ * in        - input coefficient vectors; only in[0]..in[7] are read.
+ * out       - output vectors; handed to idct32_stage9_neon (defined elsewhere),
+ *             which presumably writes the final results.
+ * bit       - selects the cosine table via cospi_arr(bit) and sets the
+ *             rounding constant 1 << (bit - 1) and the vector -bit passed to
+ *             the helpers.
+ * do_cols   - selects the intermediate clamp range: AOMMAX(16, bd + 6) bits
+ *             when non-zero, AOMMAX(16, bd + 8) bits otherwise; also forwarded
+ *             to idct32_stage9_neon.
+ * bd        - used to derive the clamp range (presumably the bit depth).
+ * out_shift - forwarded to idct32_stage9_neon.
+ *
+ * All work is done in place in the local array bf1[32], so statement order
+ * matters: each source slot is read before the line that overwrites it.
+ * Where idct32x32_low16_neon uses addsub_neon on a pair of slots, this variant
+ * has only one of the two slots populated and uses a plain copy instead.
+ */
+static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ int32x4_t bf1[32];
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ // stage 0-1: place the eight inputs in[0..7] at every fourth slot of bf1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2: fill slots 19, 23, 27, 31 and rescale 16, 20, 24, 28 (source read before overwrite)
+ bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+ bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+ bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+ bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+ bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+ bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+ bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+ bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+ // stage 3: same pattern for slots 8..15, then copy each populated slot of 16..31 to its neighbour
+ bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+ bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+
+ bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+ bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4: derive slots 7 and 4 from bf1[4]; copy within 8..15; rest done by idct32_stage4_neon
+ bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+ bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+ // stage 5: scale the DC slot and duplicate it into bf1[1]; copy within 4..7
+ bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 6: slots 2 and 3 were never populated, so they are plain copies of 1 and 0
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 7
+ idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 8
+ idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 9: helper receives out, do_cols, bd and out_shift (presumably final add/sub, shift, clamp)
+ idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+/*
+ * idct32x32_low16_neon: 32-point inverse DCT variant that reads only the first
+ * sixteen input vectors, in[0]..in[15].
+ *
+ * in        - input coefficient vectors; only in[0]..in[15] are read.
+ * out       - output vectors; handed to idct32_stage9_neon (defined elsewhere),
+ *             which presumably writes the final results.
+ * bit       - selects the cosine table via cospi_arr(bit) and sets the
+ *             rounding constant 1 << (bit - 1) and the vector -bit passed to
+ *             the helpers.
+ * do_cols   - selects the intermediate clamp range: AOMMAX(16, bd + 6) bits
+ *             when non-zero, AOMMAX(16, bd + 8) bits otherwise; also forwarded
+ *             to idct32_stage9_neon.
+ * bd        - used to derive the clamp range (presumably the bit depth).
+ * out_shift - forwarded to idct32_stage9_neon.
+ *
+ * All work is done in place in the local array bf1[32], so statement order
+ * matters: each source slot is read before the line that overwrites it, and
+ * addsub_neon takes its inputs by value before writing through the pointers.
+ */
+static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
+ int32x4_t bf1[32];
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
+
+ // stage 0-1: place the sixteen inputs in[0..15] at the even slots of bf1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2: fill the odd slots 17..31 and rescale the even slots 16..30 (source read before overwrite)
+ bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
+ bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
+ bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
+ bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
+ bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
+ bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
+ bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
+ bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
+ bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
+ bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
+ bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
+ bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
+ bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
+ bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
+ bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
+ bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
+
+ // stage 3: same pattern for slots 8..15, then in-place add/sub of adjacent pairs in 16..31
+ bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
+ bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
+ bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
+ bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
+ bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
+ bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
+ bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
+ bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
+
+ addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4: same pattern for slots 4..7, add/sub of adjacent pairs in 8..15, then idct32_stage4_neon
+ bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
+ bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
+ bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
+ bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);
+
+ addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
+
+ // stage 5: scale slot 0 and duplicate it into slot 1; derive slots 3 and 2 from bf1[2]
+ bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
+ bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);
+
+ addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 6: in-place add/sub of slots (0, 3) and (1, 2), then idct32_stage6_neon
+ addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 7
+ idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+
+ // stage 8
+ idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
+ // stage 9: helper receives out, do_cols, bd and out_shift (presumably final add/sub, shift, clamp)
+ idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
+ int bd, int out_shift) {  // 32-point inverse DCT kernel: reads in[0..31], writes out[0..31] (int32x4_t vectors)
+ const int32_t *cospi = cospi_arr(bit);  // constant table for this `bit` precision (cospi_arr is defined elsewhere)
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // intermediate width: bd + 6 (do_cols) or bd + 8 (rows), never below 16
+ const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));  // lower bound handed to every addsub_neon
+ const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);  // upper bound handed to every addsub_neon
+ int32x4_t bf1[32], bf0[32];  // ping-pong buffers: each stage reads one and writes the other
+ const int32x4_t v_bit = vdupq_n_s32(-bit);  // -bit in every lane, passed to the half_btf helpers
+ const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));  // 1 << (bit - 1) in every lane, passed to the half_btf helpers
+ // stage 0
+ // stage 1: gather the inputs into bf1 in 5-bit bit-reversed index order
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2: bf0[0..15] are copied; bf0[16 + k] and bf0[31 - k] are half_btf combinations of bf1[16 + k] and bf1[31 - k]
+ for (int i = 0; i < 16; i++) bf0[i] = bf1[i];
+
+ bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
+ &v_bit, &rnding);
+ bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
+ &v_bit, &rnding);
+ bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
+ &v_bit, &rnding);
+ bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
+ &v_bit, &rnding);
+ bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
+ &v_bit, &rnding);
+ bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
+ &v_bit, &rnding);
+ bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
+ &v_bit, &rnding);
+ bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
+ &v_bit, &rnding);
+ bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
+ &rnding);
+ bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
+ &rnding);
+ bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
+ &rnding);
+ bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
+ &rnding);
+ bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
+ &rnding);
+ bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
+ &rnding);
+ bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
+ &rnding);
+ bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
+ &rnding);
+
+ // stage 3: bf1[0..7] are copied; bf1[8 + k] / bf1[15 - k] combine bf0[8 + k] and bf0[15 - k]; bf1[16..31] come from addsub_neon on adjacent pairs
+ for (int i = 0; i < 8; i++) bf1[i] = bf0[i];
+
+ bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
+ &v_bit, &rnding);
+ bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
+ &v_bit, &rnding);
+ bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
+ &v_bit, &rnding);
+ bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
+ &v_bit, &rnding);
+ bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
+ &rnding);
+ bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
+ &rnding);
+ bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
+ &rnding);
+ bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
+ &rnding);
+
+ addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4: bf0[0..3] copied, bf0[4..7] via half_btf, bf0[8..15] via addsub_neon, bf0[16..31] a mix of copies and half_btf
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
+ &v_bit, &rnding);
+ bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
+ &v_bit, &rnding);
+ bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
+ &rnding);
+ bf0[7] =
+ half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);
+
+ addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
+ &v_bit, &rnding);
+ bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
+ &v_bit, &rnding);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
+ &v_bit, &rnding);
+ bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
+ &v_bit, &rnding);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
+ &v_bit, &rnding);
+ bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
+ &rnding);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
+ &v_bit, &rnding);
+ bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
+ &rnding);
+ bf0[31] = bf1[31];
+
+ // stage 5: half_btf on 0..3, addsub_neon on 4..7 and 16..31, a mix of copies and half_btf on 8..15
+ bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
+ &rnding);
+ bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
+ &v_bit, &rnding);
+ bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
+ &v_bit, &rnding);
+ bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
+ &rnding);
+ addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
+ &v_bit, &rnding);
+ bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
+ &v_bit, &rnding);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
+ &v_bit, &rnding);
+ bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
+ &rnding);
+ bf1[15] = bf0[15];
+ addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6: addsub_neon on 0..3 and 8..15, half_btf with cospi[32] on 5/6, a mix of copies and half_btf on 16..31
+ addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
+ &v_bit, &rnding);
+ bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
+ &rnding);
+ bf0[7] = bf1[7];
+ addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
+ &v_bit, &rnding);
+ bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
+ &v_bit, &rnding);
+ bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
+ &v_bit, &rnding);
+ bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
+ &v_bit, &rnding);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
+ &v_bit, &rnding);
+ bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
+ &v_bit, &rnding);
+ bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
+ &rnding);
+ bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
+ &rnding);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7: addsub_neon on 0..7 and 16..31; bf1[10..13] via half_btf with cospi[32]; 8, 9, 14, 15 copied
+ addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
+ &v_bit, &rnding);
+ bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
+ &v_bit, &rnding);
+ bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
+ &rnding);
+ bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
+ &rnding);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8: addsub_neon pairs bf1[k] with bf1[15 - k]; bf0[20..27] via half_btf with cospi[32]; 16..19 and 28..31 copied
+ addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
+ &v_bit, &rnding);
+ bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
+ &v_bit, &rnding);
+ bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
+ &v_bit, &rnding);
+ bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
+ &v_bit, &rnding);
+ bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
+ &rnding);
+ bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
+ &rnding);
+ bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
+ &rnding);
+ bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
+ &rnding);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9: addsub_neon of bf0[k] and bf0[31 - k] writes out[k] and out[31 - k]
+ addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {  // row pass only: extra round-shift and clamp of the outputs
+ const int log_range_out = AOMMAX(16, bd + 6);  // output width: bd + 6, never below 16
+ const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
+ const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);  // helper defined elsewhere; applied at out and again at out + 16
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);  // in-place clamp, size argument 32
+ }
+}
+
+static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
+                             int do_cols, int bd, int out_shift) {
+  // 32-point identity kernel: every one of the 32 vectors is shifted left by
+  // 2. `bit` is not used by this kernel.
+  (void)bit;
+  for (int idx = 0; idx < 32; ++idx) {
+    out[idx] = vshlq_n_s32(in[idx], 2);
+  }
+
+  // Nothing more to do when do_cols is set.
+  if (do_cols) return;
+
+  // Otherwise round-shift both halves by out_shift and clamp the 32 outputs
+  // to a signed max(16, bd + 6)-bit range.
+  const int out_bits = AOMMAX(16, bd + 6);
+  const int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1)));
+  const int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1);
+  round_shift_8x8(out, out_shift);
+  round_shift_8x8(out + 16, out_shift);
+  highbd_clamp_s32_neon(out, out, &out_min, &out_max, 32);
+}
+
+// 1D itx types. Used as the middle index of highbd_txfm_all_1d_zeros_w8_arr.
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,  // alias: has the same value as IADST_1D, so both select the same table entry
+ IIDENTITY_1D,
+ ITX_TYPES_1D,  // count of distinct values above; sizes the kernel table dimension
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {  // indexed by TX_TYPE; the 1-D type used when looking up col_txfm
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {  // indexed by TX_TYPE; the 1-D type used when looking up row_txfm
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+static const transform_1d_neon
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {  // [size index][ITX_TYPE_1D][kernel variant]; NULL marks a slot with no kernel
+ {  // size index 0: 4-point kernels
+ { idct4x4_neon, NULL, NULL, NULL },  // IDCT_1D
+ { iadst4x4_neon, NULL, NULL, NULL },  // IADST_1D (and IFLIPADST_1D)
+ { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },  // IIDENTITY_1D
+ },
+ { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },  // size index 1: 8-point kernels
+ { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
+ { iidentity8_neon, iidentity8_neon, NULL, NULL } },
+ {  // size index 2: 16-point kernels
+ { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
+ { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
+ { iidentity16_neon, NULL, iidentity16_neon, NULL },
+ },
+ { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,  // size index 3: 32-point kernels (no ADST entries)
+ idct32x32_neon },
+ { NULL, NULL, NULL, NULL },
+ { iidentity32_neon, NULL, NULL, NULL } },
+ { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,  // size index 4: 64-point kernels (DCT only)
+ idct64x64_neon },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };  // NOTE(review): the low1/low8/low16 names suggest variants for inputs with few non-zero leading coefficients — confirm against the kernels
+
+void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, const int bd) {  // TX_4X8 inverse 2-D transform: row kernel, 4x4 transposes, column kernel, write-out
+ TX_SIZE tx_size = TX_4X8;
+ int32x4_t buf1[32] = { vdupq_n_s32(0) };  // only buf1[0..7] are written by the transposes below
+
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];  // row kernel: variant 0
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];  // column kernel: variant 1
+ const int input_stride = AOMMIN(32, txfm_size_row);  // loader stride is the block height (capped at 32)
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (kernels are called with do_cols == 0), one call per group of four vectors
+ int32x4_t buf0[8];
+ load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
+ load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
+ round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row);  // rectangular-block scaling helper (defined elsewhere), applied in place
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+ row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
+
+ if (lr_flip) {  // flipped case: each 4x4 tile is transposed with its source vectors in reverse order
+ TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ } else {
+ TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
+ buf1[3]);
+
+ TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
+ buf1[7]);
+ }
+
+ // 2nd stage: column transform (do_cols == 1, out_shift 0), in place on buf1
+ col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+ round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, const int bd) {  // TX_8X4 inverse 2-D transform: row kernel, optional flip, per-tile transpose + column kernel, write-out
+ TX_SIZE tx_size = TX_8X4;
+ int32x4_t buf1[8];  // only used as the destination of flip_buf_neon when lr_flip is set
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];  // row kernel: variant 1
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];  // column kernel: variant 0
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (the kernel is called with do_cols == 0)
+ int32x4_t buf0[8];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);  // stride 4, txfm_size_col vectors
+
+ round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col);  // rectangular-block scaling helper (defined elsewhere), applied in place
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *buf1_ptr;  // buffer the column stage works on: buf1 if flipped, otherwise buf0 itself
+ if (lr_flip) {
+ flip_buf_neon(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform; each group of txfm_size_row vectors is transposed in place first
+ for (int i = 0; i < 2; i++) {
+ int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
+ transpose_4x4(buf1_cur, buf1_cur);
+ col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
+ }
+ round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+ // write to buffer
+ highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, const int bd) {  // TX_4X16 inverse 2-D transform: per-tile load + row kernel, 4x4 transposes, column kernel, write-out
+ TX_SIZE tx_size = TX_4X16;
+ int32x4_t buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_h_div8 = txfm_size_row >> 2;  // NOTE: despite the name this is height / 4, the number of 4x4 tiles
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];  // row kernel: variant 0
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];  // column kernel: variant 2
+ const int input_stride = AOMMIN(32, txfm_size_row);  // loader stride is the block height (capped at 32)
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (do_cols == 0); each iteration loads and transforms one group of four vectors
+ int32x4_t buf0[16];
+ for (int i = 0; i < (txfm_size_row >> 2); i++) {
+ const int32_t *input_row = input + i * 4;
+ int32x4_t *buf0_cur = buf0 + i * 4;
+ load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
+ row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
+ }
+
+ if (lr_flip) {  // flipped case: each 4x4 tile is transposed with its source vectors in reverse order
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+ buf1[4 * j + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+ buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+ buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ }
+
+ // 2nd stage: column transform (do_cols == 1, out_shift 0), in place on buf1
+ col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+ round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, const int bd) {  // TX_16X4 inverse 2-D transform: row kernel, optional flip, per-tile transpose + column kernel, write-out in 8-wide halves
+ TX_SIZE tx_size = TX_16X4;
+ int32x4_t buf1[16];  // only used as the destination of flip_buf_neon when lr_flip is set
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;  // NOTE: despite the name this is width / 4, the number of 4x4 tiles
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];  // row kernel: variant 2
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];  // column kernel: variant 0
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (the kernel is called with do_cols == 0)
+ int32x4_t buf0[16];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);  // stride 4, txfm_size_col vectors
+
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *buf1_ptr;  // buffer the column stage works on: buf1 if flipped, otherwise buf0 itself
+ if (lr_flip) {
+ flip_buf_neon(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform; each group of txfm_size_row vectors is transposed in place first
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
+ transpose_4x4(buf1_cur, buf1_cur);
+ col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
+ }
+ round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+ // write to buffer: one call per 8-pixel-wide half, each consuming 2 * txfm_size_row vectors
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip, txfm_size_row,
+ bd);
+ }
+}
+
+static void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, int eob,
+ const int bd) {  // TX_4X16 inverse 2-D transform; unlike av1_inv_txfm2d_add_4x16_neon it loads all vectors in one call with stride = block width
+ (void)eob;  // eob is not used by this function
+ TX_SIZE tx_size = TX_4X16;
+ int32x4_t buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_h_div8 = txfm_size_row >> 2;  // NOTE: despite the name this is height / 4, the number of 4x4 tiles
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];  // row kernel: variant 0
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];  // column kernel: variant 2
+ const int input_stride = AOMMIN(32, txfm_size_col);  // loader stride is the block width (capped at 32)
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (do_cols == 0), one kernel call per group of four vectors
+ int32x4_t buf0[16];
+ const int32_t *input_row = input;
+ int32x4_t *buf0_cur = buf0;
+ load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
+ for (int i = 0; i < (txfm_size_row >> 2); i++) {
+ row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
+ }
+
+ if (lr_flip) {  // flipped case: each 4x4 tile is transposed with its source vectors in reverse order
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
+ buf1[4 * j + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_h_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
+ buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
+ buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ }
+
+ // 2nd stage: column transform (do_cols == 1, out_shift 0), in place on buf1
+ col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
+
+ round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
+
+ // write to buffer
+ highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+}
+
+static void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, int eob,
+ const int bd) {  // TX_16X4 inverse 2-D transform; unlike av1_inv_txfm2d_add_16x4_neon it transposes before the row kernel and not before the column kernel
+ (void)eob;  // eob is not used by this function
+ TX_SIZE tx_size = TX_16X4;
+ int32x4_t buf1[16];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // shift[0] is used after the row pass, shift[1] after the column pass
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;  // NOTE: despite the name this is width / 4, the number of 4x4 tiles
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];  // row kernel: variant 2
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];  // column kernel: variant 0
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform (do_cols == 0), fed from transposed tiles
+ int32x4_t buf0[16];
+ const int32_t *input_row = input;
+ load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);  // stride 4, txfm_size_col vectors
+
+ for (int j = 0; j < buf_size_w_div8; j++) {  // tile j gathers every fourth vector (j, j + 4, j + 8, j + 12) of buf0 into buf1[4j..4j + 3]
+ TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
+ buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
+ }
+ row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]);  // reads buf1, writes back into buf0
+
+ int32x4_t *buf1_ptr;  // buffer the column stage works on: buf1 if flipped, otherwise buf0 itself
+ if (lr_flip) {
+ flip_buf_neon(buf0, buf1, txfm_size_col);
+ buf1_ptr = buf1;
+ } else {
+ buf1_ptr = buf0;
+ }
+
+ // 2nd stage: column transform, in place on each group of txfm_size_row vectors
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
+ INV_COS_BIT, 1, bd, 0);
+ }
+ round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
+
+ // write to buffer: one call per 8-pixel-wide half, each consuming 2 * txfm_size_row vectors
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip, txfm_size_row,
+ bd);
+ }
+}
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {  // maps 0 -> 0, 1..7 -> 1, 8..15 -> 2, 16..31 -> 3
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};  // NOTE(review): presumably turns an eobx/eoby bound into a kernel-variant index — confirm at the use sites
+
+// log2 of the transform block width used for eob row lookups, indexed by TX_SIZE (a width of 64 maps to 32, i.e. 5)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {  // each entry packs (eoby << 8) | eobx; indexed by eob_row = (eob - 1) >> log2(width)
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+ // 16x16: rows 0..1 give (7, 7), later rows give (15, 15)
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+ // 32x32: bounds grow 7 -> 15 -> 31 in both directions
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+ // 8x16: eobx stays 7, eoby grows 7 -> 15
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+ // 16x8: eoby stays 7, eobx grows 7 -> 15
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+ // 16x32: eobx caps at 15, eoby grows 7 -> 15 -> 31
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+ // 32x16: eoby caps at 15, eobx grows 7 -> 15 -> 31
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+ // 8x32: eobx stays 7, eoby grows 7 -> 15 -> 31
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+ // 32x8: eoby stays 7, eobx grows 7 -> 15 -> 31
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {  // per-TX_SIZE packed-bounds table; NULL = no table for that size
+ NULL,  // 0
+ av1_eob_to_eobxy_8x8_default,  // 1
+ av1_eob_to_eobxy_16x16_default,  // 2
+ av1_eob_to_eobxy_32x32_default,  // 3
+ av1_eob_to_eobxy_32x32_default,  // 4
+ NULL,  // 5
+ NULL,  // 6
+ av1_eob_to_eobxy_8x16_default,  // 7
+ av1_eob_to_eobxy_16x8_default,  // 8
+ av1_eob_to_eobxy_16x32_default,  // 9
+ av1_eob_to_eobxy_32x16_default,  // 10
+ av1_eob_to_eobxy_32x32_default,  // 11
+ av1_eob_to_eobxy_32x32_default,  // 12
+ NULL,  // 13
+ NULL,  // 14
+ av1_eob_to_eobxy_8x32_default,  // 15
+ av1_eob_to_eobxy_32x8_default,  // 16
+ av1_eob_to_eobxy_16x32_default,  // 17
+ av1_eob_to_eobxy_32x16_default,  // 18
+};
+
+// Looks up the column/row bounds (*eobx, *eoby) for a block from its eob.
+//   eobx, eoby: outputs; both are set to 0 when eob == 1.
+//   tx_size:    selects the log2 width and the packed-bounds table.
+//   eob:        eob value supplied by the caller; must be >= 1.
+// tx_size must be one whose av1_eob_to_eobxy_default entry is non-NULL. The
+// asserts make a violation of either precondition fail loudly in debug
+// builds instead of reading through a NULL table or a negative index.
+static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                                     TX_SIZE tx_size, int eob) {
+  assert(eob >= 1);
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int16_t *const eobxy_tab = av1_eob_to_eobxy_default[tx_size];
+  assert(eobxy_tab != NULL);
+
+  // Table row: (eob - 1) divided by the (capped) block width.
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  // Each entry packs eoby in the high byte and eobx in the low byte.
+  const int eobxy = eobxy_tab[eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+// Returns fixed (eob-independent) bounds for the given transform size. Sizes
+// not listed below yield (0, 0). The case labels are raw TX_SIZE values,
+// exactly as in the comparisons this switch replaces.
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size) {
+  int last_row = 0;
+  int last_col = 0;
+
+  switch (tx_size) {
+    case 2:
+      last_row = 15;
+      last_col = 15;
+      break;
+    case 3:
+    case 4:
+    case 11:
+    case 12:
+      last_row = 31;
+      last_col = 31;
+      break;
+    case 7:
+      last_row = 15;
+      last_col = 7;
+      break;
+    case 8:
+      last_row = 7;
+      last_col = 15;
+      break;
+    case 9:
+    case 17:
+      last_row = 31;
+      last_col = 15;
+      break;
+    case 10:
+    case 18:
+      last_row = 15;
+      last_col = 31;
+      break;
+    case 15:
+      last_row = 31;
+      last_col = 7;
+      break;
+    case 16:
+      last_row = 7;
+      last_col = 31;
+      break;
+    default: break;
+  }
+
+  // Keep the store order of the original (eoby first, then eobx).
+  *eoby = last_row;
+  *eobx = last_col;
+}
+
+// Sets *eoby to the last row index of the block, with the row count capped at
+// 32, and *eobx to 0.
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int capped_rows = AOMMIN(32, tx_size_high[tx_size]);
+  *eoby = capped_rows - 1;
+  *eobx = 0;
+}
+
+// Sets *eobx to the last column index of the block, with the column count
+// capped at 32, and *eoby to 0.
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size) {
+  const int capped_cols = AOMMIN(32, tx_size_wide[tx_size]);
+  *eobx = capped_cols - 1;
+  *eoby = 0;
+}
+
+// Inverse 2-D transform plus reconstruction add for the transform types the
+// *_universe_neon dispatchers route here (V_DCT, V_ADST, V_FLIPADST).
+// The row pass always uses kernel slot 0 of highbd_txfm_all_1d_zeros_w8_arr;
+// the column-pass kernel slot is chosen from eoby.
+//
+// input:   32-bit coefficients, read with a stride of min(32, block height).
+// output:  16-bit destination, written by highbd_write_buffer_8xn_neon().
+// stride:  destination stride, forwarded to highbd_write_buffer_8xn_neon().
+// tx_type: selects the 1-D kernels (via hitx_1d_tab / vitx_1d_tab) and the
+//          flip configuration.
+// tx_size: transform block size.
+// bd:      bit depth, forwarded to the kernels and the write-out helper.
+//
+// NOTE(review): buf1 holds 64 vectors while the row pass stores about
+// (min(32, width) / 4) * height of them, so this appears to rely on
+// width * height <= 256 — confirm against the callers.
+static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ const int bd) {
+ int32x4_t buf1[64];
+ int eobx, eoby;
+ // Yields eoby = min(32, height) - 1 and eobx = 0; only eoby is used below.
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = buf_size_w >> 2;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ assert(row_txfm != NULL);
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+ assert(col_txfm != NULL);
+ // lr_flip is fetched but not used by this function; only ud_flip is passed
+ // on to the write-out helper.
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Row pass: iteration i reads from input + i * 4, runs the row kernel in
+ // place on buf0, then transposes buf0 in 4x4 tiles into buf1.
+ for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+ int32x4_t buf0[16];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
+ // Extra rounding step, applied only when the rect log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *_buf1 = buf1 + i * 4;
+
+ // Tile j of buf0 lands in the j-th txfm_size_row-long slice of buf1.
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ int32x4_t *buf0_cur = buf0 + j * 4;
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ _buf1[j * txfm_size_row + 0] = buf0_cur[0];
+ _buf1[j * txfm_size_row + 1] = buf0_cur[1];
+ _buf1[j * txfm_size_row + 2] = buf0_cur[2];
+ _buf1[j * txfm_size_row + 3] = buf0_cur[3];
+ }
+ }
+ // Column pass: in place on each txfm_size_row-long slice of buf1, followed
+ // by a rounding shift of -shift[1].
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_neon(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write to the destination: each call takes two consecutive slices of buf1
+ // and targets the output starting at column 8 * i.
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+}
+
+// Inverse 2-D transform plus reconstruction add for the transform types the
+// *_universe_neon dispatchers route here (H_DCT, H_ADST, H_FLIPADST).
+// The row-pass kernel slot is chosen from eobx; the column pass always uses
+// kernel slot 0 of highbd_txfm_all_1d_zeros_w8_arr.
+//
+// input:   32-bit coefficients, read with a stride of min(32, block height).
+// output:  16-bit destination, written by highbd_write_buffer_8xn_neon().
+// stride:  destination stride, forwarded to highbd_write_buffer_8xn_neon().
+// tx_type: selects the 1-D kernels (via hitx_1d_tab / vitx_1d_tab) and the
+//          flip configuration.
+// tx_size: transform block size.
+// bd:      bit depth, forwarded to the kernels and the write-out helper.
+//
+// NOTE(review): buf1 holds 64 vectors while the row pass stores about
+// (min(32, width) / 4) * min(32, height) of them — confirm that the callers
+// only pass sizes that fit.
+static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ const int bd) {
+ int32x4_t buf1[64];
+ int eobx, eoby;
+ // Yields eobx = min(32, width) - 1 and eoby = 0; only eobx is used below.
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ // eobx + 1 rounded up to a multiple of 8.
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ assert(row_txfm != NULL);
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ assert(col_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Row pass: iteration i reads from input + i * 4, runs the row kernel in
+ // place on buf0, then transposes buf0 in 4x4 tiles into buf1.
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ int32x4_t buf0[16];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0,
+ buf_size_nonzero_w);
+ // Extra rounding step, applied only when the rect log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ // Left/right flip: the vectors of each tile are fed to the transpose in
+ // reverse order and tile j is stored in slice (buf_size_w_div4 - 1 - j).
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
+ }
+ } else {
+ // No flip: tile j is transposed straight into slice j of buf1.
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // Column pass: in place on each txfm_size_row-long slice of buf1, followed
+ // by a rounding shift of -shift[1].
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_neon(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write to the destination: each call takes two consecutive slices of buf1
+ // and targets the output starting at column 8 * i.
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+ }
+}
+
+// Inverse 2-D transform plus reconstruction add for the IDTX case of the
+// *_universe_neon dispatchers. Both passes use kernel slot 0 of
+// highbd_txfm_all_1d_zeros_w8_arr, and no flip is applied (the write-out
+// helper is called with a flip argument of 0).
+//
+// input:   32-bit coefficients, read with a stride of min(32, block height).
+// output:  16-bit destination, written by highbd_write_buffer_8xn_neon().
+// stride:  destination stride, forwarded to highbd_write_buffer_8xn_neon().
+// tx_type: selects the 1-D kernels via hitx_1d_tab / vitx_1d_tab.
+// tx_size: transform block size.
+// bd:      bit depth, forwarded to the kernels and the write-out helper.
+//
+// NOTE(review): buf1 holds 256 vectors while the row pass stores about
+// (min(32, width) / 4) * min(32, height) of them, i.e. at most 256.
+static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, const int bd) {
+ int32x4_t buf1[64 * 4];
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int input_stride = row_max;
+ const int buf_size_w = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = buf_size_w >> 2;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ assert(row_txfm != NULL);
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ assert(col_txfm != NULL);
+ // Row pass: iteration i reads from input + i * 4, runs the row kernel in
+ // place on buf0, then transposes buf0 in 4x4 tiles into buf1.
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ int32x4_t buf0[32];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
+ // Extra rounding step, applied only when the rect log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *_buf1 = buf1 + i * 4;
+ // Tile j of buf0 lands in the j-th txfm_size_row-long slice of buf1.
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ int32x4_t *buf0_cur = buf0 + j * 4;
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ _buf1[j * txfm_size_row + 0] = buf0_cur[0];
+ _buf1[j * txfm_size_row + 1] = buf0_cur[1];
+ _buf1[j * txfm_size_row + 2] = buf0_cur[2];
+ _buf1[j * txfm_size_row + 3] = buf0_cur[3];
+ }
+ }
+ // Column pass: in place on each txfm_size_row-long slice of buf1, followed
+ // by a rounding shift of -shift[1].
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_neon(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write to the destination: each call takes two consecutive slices of buf1
+ // and targets the output starting at column 8 * i.
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, 0, txfm_size_row, bd);
+ }
+ }
+}
+
+// Inverse 2-D transform plus reconstruction add for the transform types that
+// inv_txfm2d_add_universe_neon() routes here (all DCT/ADST/FLIPADST
+// combinations). The nonzero bounds come from get_eobx_eoby_scan_default(),
+// which depends on tx_size only.
+//
+// input:   32-bit coefficients, read with a stride of min(32, block height).
+// output:  16-bit destination, written by highbd_write_buffer_8xn_neon().
+// stride:  destination stride, forwarded to highbd_write_buffer_8xn_neon().
+// tx_type: selects the 1-D kernels (via hitx_1d_tab / vitx_1d_tab) and the
+//          flip configuration.
+// tx_size: transform block size.
+// bd:      bit depth, forwarded to the kernels and the write-out helper.
+static void inv_txfm2d_add_no_identity_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ const int bd) {
+ int32x4_t buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div4 = txfm_size_col >> 2;
+ // eobx + 1 rounded up to a multiple of 8.
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ // Kernel slots are picked from the nonzero bounds in each direction.
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // 1st stage: row transform. Iteration i reads from input + i * 4, runs the
+ // row kernel in place on buf0, then transposes buf0 in 4x4 tiles into buf1.
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ int32x4_t buf0[64];
+ load_buffer_32bit_input(input + i * 4, input_stride, buf0,
+ buf_size_nonzero_w);
+ // Extra rounding step, applied only when the rect log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *_buf1 = &buf1[i * 4];
+
+ if (lr_flip) {
+ // Left/right flip: the vectors of each tile are fed to the transpose in
+ // reverse order and tile j is stored in slice (buf_size_w_div4 - 1 - j).
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
+ }
+ } else {
+ // No flip: tile j is transposed straight into slice j of buf1.
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform, in place on each txfm_size_row-long slice of
+ // buf1, followed by a rounding shift of -shift[1].
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_neon(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write to the destination: each call takes two consecutive slices of buf1
+ // and targets the output starting at column 8 * i.
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+ }
+}
+
+// Inverse 2-D transform plus reconstruction add for the transform types that
+// highbd_inv_txfm2d_add_universe_neon() routes here (all DCT/ADST/FLIPADST
+// combinations). The nonzero bounds come from
+// highbd_get_eobx_eoby_scan_default(), so they depend on eob as well as on
+// tx_size.
+//
+// input:   32-bit coefficients, read with a stride of min(32, block width).
+// output:  16-bit destination, written by highbd_write_buffer_8xn_neon().
+// stride:  destination stride, forwarded to highbd_write_buffer_8xn_neon().
+// tx_type: selects the 1-D kernels (via hitx_1d_tab / vitx_1d_tab) and the
+//          flip configuration.
+// tx_size: transform block size.
+// eob:     end-of-block position, used only to derive eobx / eoby.
+// bd:      bit depth, forwarded to the kernels and the write-out helper.
+//
+// NOTE(review): inv_txfm2d_add_no_identity_neon() loads with a stride of
+// min(32, height) and no per-tile transpose, whereas this function loads 4x4
+// tiles with a stride of min(32, width) and transposes each one. Confirm both
+// match the coefficient layout their callers provide.
+static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ int32x4_t buf1[64 * 16];
+ int eobx, eoby;
+ highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ // Despite its name this is the width divided by 4 (>> 2): it counts the 4x4
+ // tiles per row in the transpose loops below.
+ const int buf_size_w_div8 = txfm_size_col >> 2;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ // Kernel slots are picked from the nonzero bounds in each direction.
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // 1st stage: row transform. Iteration i starts at
+ // input + i * input_stride * 4.
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ int32x4_t buf0[64];
+ const int32_t *input_row = input + i * input_stride * 4;
+ // Load the nonzero part of the row group one 4x4 tile at a time and
+ // transpose each tile in place.
+ for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+ int32x4_t *buf0_cur = &buf0[j * 4];
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ // Extra rounding step, applied only when the rect log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3);
+ }
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ int32x4_t *_buf1 = &buf1[i * 4];
+
+ if (lr_flip) {
+ // Left/right flip: the vectors of each tile are fed to the transpose in
+ // reverse order and tile j is stored in slice (buf_size_w_div8 - 1 - j).
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ // No flip: tile j is transposed straight into slice j of buf1.
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform, in place on each txfm_size_row-long slice of
+ // buf1, followed by a rounding shift of -shift[1].
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_neon(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // Write to the destination: each call takes two consecutive slices of buf1
+ // and targets the output starting at column 8 * i.
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+ }
+}
+
+// Dispatches an inverse transform + add on tx_type. `output` is converted
+// with CONVERT_TO_SHORTPTR() before it is handed to the implementation.
+// Only the non-identity path consumes `eob`; the identity paths ignore it.
+// An unknown tx_type trips assert(0).
+static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
+                                                uint8_t *output, int stride,
+                                                TX_TYPE tx_type,
+                                                TX_SIZE tx_size, int eob,
+                                                const int bd) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(output);
+
+  switch (tx_type) {
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, dst16, stride, tx_type, tx_size, bd);
+      break;
+
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, dst16, stride, tx_type, tx_size,
+                                     bd);
+      break;
+
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, dst16, stride, tx_type, tx_size,
+                                     bd);
+      break;
+
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_neon(input, dst16, stride, tx_type,
+                                             tx_size, eob, bd);
+      break;
+
+    default: assert(0); break;
+  }
+}
+
+// Dispatches an inverse transform + add on tx_type, without an eob hint.
+// `output` is converted with CONVERT_TO_SHORTPTR() before it is handed to the
+// implementation. An unknown tx_type trips assert(0).
+static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
+                                         int stride, TX_TYPE tx_type,
+                                         TX_SIZE tx_size, const int bd) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(output);
+
+  switch (tx_type) {
+    case IDTX:
+      inv_txfm2d_add_idtx_neon(input, dst16, stride, tx_type, tx_size, bd);
+      break;
+
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      inv_txfm2d_add_v_identity_neon(input, dst16, stride, tx_type, tx_size,
+                                     bd);
+      break;
+
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      inv_txfm2d_add_h_identity_neon(input, dst16, stride, tx_type, tx_size,
+                                     bd);
+      break;
+
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      inv_txfm2d_add_no_identity_neon(input, dst16, stride, tx_type, tx_size,
+                                      bd);
+      break;
+
+    default: assert(0); break;
+  }
+}
+
+// 8x8 entry point. Transform types with an identity component (IDTX, H_*,
+// V_*) go through highbd_inv_txfm2d_add_universe_neon(); every other type is
+// handled by av1_inv_txfm2d_add_8x8_neon().
+void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  const int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *const coeffs = cast_to_int32(input);
+
+  switch (tx_type) {
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case IDTX:
+      highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
+                                          txfm_param->tx_size, txfm_param->eob,
+                                          bd);
+      return;
+    default: break;
+  }
+
+  av1_inv_txfm2d_add_8x8_neon(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
+}
+
+// 4x4 entry point. Lossless blocks use av1_highbd_iwht4x4_add() and must be
+// DCT_DCT (asserted); all other blocks use av1_inv_txfm2d_add_4x4_neon().
+void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  const int32_t *const coeffs = cast_to_int32(input);
+
+  if (txfm_param->lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, txfm_param->eob, bd);
+  } else {
+    av1_inv_txfm2d_add_4x4_neon(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
+  }
+}
+
+// 4x8 entry point: converts dest with CONVERT_TO_SHORTPTR() and forwards the
+// transform type and bit depth to av1_inv_txfm2d_add_4x8_neon().
+void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);
+  av1_inv_txfm2d_add_4x8_neon(input, dest16, stride, txfm_param->tx_type,
+                              txfm_param->bd);
+}
+
+// 8x4 entry point: converts dest with CONVERT_TO_SHORTPTR() and forwards the
+// transform type and bit depth to av1_inv_txfm2d_add_8x4_neon().
+void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest,
+                                      int stride, const TxfmParam *txfm_param) {
+  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);
+  av1_inv_txfm2d_add_8x4_neon(input, dest16, stride, txfm_param->tx_type,
+                              txfm_param->bd);
+}
+
+// 8x16 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_8X16, bd);
+}
+
+// 4x16 entry point: converts dest with CONVERT_TO_SHORTPTR() and forwards the
+// transform type, eob and bit depth to highbd_inv_txfm2d_add_4x16_neon().
+void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_4x16_neon(input, dest16, stride, txfm_param->tx_type,
+                                  txfm_param->eob, txfm_param->bd);
+}
+
+// 16x4 entry point: converts dest with CONVERT_TO_SHORTPTR() and forwards the
+// transform type, eob and bit depth to highbd_inv_txfm2d_add_16x4_neon().
+void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  uint16_t *const dest16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_16x4_neon(input, dest16, stride, txfm_param->tx_type,
+                                  txfm_param->eob, txfm_param->bd);
+}
+
+// 8x16 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_8X16,
+                                      eob, bd);
+}
+
+// 16x8 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_16X8,
+                                      eob, bd);
+}
+
+// 16x8 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_16X8, bd);
+}
+
+// 16x32 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_16X32,
+                                      eob, bd);
+}
+
+// 16x32 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_16X32, bd);
+}
+
+// 32x16 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_32X16,
+                                      eob, bd);
+}
+
+// 32x16 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_32X16, bd);
+}
+
+// 32x32 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_32X32,
+                                      eob, bd);
+}
+
+// 32x32 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_32X32, bd);
+}
+
+// 64x64 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_64X64,
+                                      eob, bd);
+}
+
+// 64x64 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_64X64, bd);
+}
+
+// 32x64 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_32X64,
+                                      eob, bd);
+}
+
+// 32x64 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_32X64, bd);
+}
+
+// 64x32 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_64X32,
+                                      eob, bd);
+}
+
+// 64x32 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_64X32, bd);
+}
+
+// 64x16 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_64X16,
+                                      eob, bd);
+}
+
+// 64x16 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_64X16, bd);
+}
+
+// 16x64 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_16X64,
+                                      eob, bd);
+}
+
+// 16x64 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_16X64, bd);
+}
+
+// 16x16 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_16X16,
+                                      eob, bd);
+}
+
+// 16x16 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_16X16, bd);
+}
+
+// 32x8 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_32X8,
+                                      eob, bd);
+}
+
+// 32x8 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_32X8, bd);
+}
+
+// 8x32 entry point: unpacks the transform type, eob and bit depth from
+// txfm_param and forwards to highbd_inv_txfm2d_add_universe_neon().
+void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest,
+                                       int stride,
+                                       const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int eob = txfm_param->eob;
+  const int bd = txfm_param->bd;
+  highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, TX_8X32,
+                                      eob, bd);
+}
+
+// 8x32 variant taking tx_type and bd directly. The dispatcher takes a
+// uint8_t * that it converts itself with CONVERT_TO_SHORTPTR(), hence the
+// cast of dest.
+void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
+                                  int stride, TX_TYPE tx_type, const int bd) {
+  uint8_t *const dest8 = (uint8_t *)dest;
+  inv_txfm2d_add_universe_neon(input, dest8, stride, tx_type, TX_8X32, bd);
+}
+
+// Top-level high-bitdepth inverse transform + add: dispatches on
+// txfm_param->tx_size to the size-specific Neon implementation.
+//
+//   input:      transform coefficients.
+//   dest:       destination pointer in the uint8_t * form used by the
+//               high-bitdepth entry points.
+//   stride:     destination stride, forwarded unchanged.
+//   txfm_param: supplies tx_size, tx_type and bd (the 8x8 and 4x4 handlers
+//               read further fields from it themselves).
+//
+// Two destination conventions are used below:
+//   - the 4x8, 8x4, 16x4 and 4x16 functions receive
+//     CONVERT_TO_SHORTPTR(dest);
+//   - the remaining av1_inv_txfm2d_add_*_neon wrappers receive a plain
+//     (uint16_t *) cast, because they cast back to uint8_t * and leave the
+//     CONVERT_TO_SHORTPTR() step to inv_txfm2d_add_universe_neon().
+//
+// An unexpected tx_size trips assert(0) in debug builds, as in the
+// *_universe_neon dispatchers; in release builds it remains a no-op.
+void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+
+  TX_TYPE tx_type = txfm_param->tx_type;
+  int bd = txfm_param->bd;
+  switch (tx_size) {
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_4X8:
+      av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X4:
+      av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_4X16:
+      av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                   txfm_param->tx_type, txfm_param->bd);
+      break;
+    case TX_8X16:
+      av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X8:
+      av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_16X32:
+      av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X16:
+      av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X16:
+      av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X32:
+      av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X64:
+      av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X64:
+      av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X32:
+      av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_16X64:
+      av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_64X16:
+      av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
+                                    bd);
+      break;
+    case TX_32X8:
+      av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    case TX_8X32:
+      av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
+                                   bd);
+      break;
+    // Not reachable for the sizes handled above; flag anything else instead of
+    // silently skipping the reconstruction.
+    default: assert(0); break;
+  }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_reconinter_neon.c b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c
new file mode 100644
index 0000000000..da7f6c57d0
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c
@@ -0,0 +1,327 @@
+/*
+ *
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse,  // Fills mask with 8-bit weights derived from |src0 - src1|; inverse selects the complemented formula.
+ const uint16_t *src0,  // First block of 16-bit samples.
+ int src0_stride,  // Row step of src0, counted in uint16_t elements.
+ const uint16_t *src1,  // Second block of 16-bit samples.
+ int src1_stride, int h, int w,  // Row step of src1; h rows of w pixels are processed.
+ const unsigned int bd) {  // Must be 8, 10 or 12; selects how far the difference is shifted down.
+ assert(DIFF_FACTOR > 0);
+ uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA);  // Upper clamp for the non-inverse weights.
+ uint8x16_t mask_base = vdupq_n_u8(38);  // Offset added to the scaled difference (non-inverse case).
+ uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38);  // Value the scaled difference is subtracted from (inverse case).
+
+ if (bd == 8) {  // bd == 8: the difference is shifted by DIFF_FACTOR_LOG2 only.
+ if (w >= 16) {  // 16 pixels per inner iteration; the loop only terminates if w is a multiple of 16.
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);  // Per-lane absolute difference.
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);  // Shift right, then narrow each lane to 8 bits.
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);  // Saturating subtract: floors at 0 instead of wrapping.
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);  // min(diff + 38, AOM_BLEND_A64_MAX_ALPHA).
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;  // Mask rows are packed back to back: the mask stride equals w.
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {  // One 8-pixel vector per row; the inner loop below runs exactly once.
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);  // Low half of the 16-lane constant is enough for 8 lanes.
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {  // Two rows per iteration (h drops by 2), so h must be even here.
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);  // NOTE(review): helper not defined in this file; presumably packs 4 pixels from each of two rows - confirm.
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8x4_strided_x2(mask, w, m);  // NOTE(review): presumably writes 4 bytes to each of two mask rows w apart - confirm.
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }  // Any other width below 16 falls through and writes nothing.
+ } else if (bd == 10) {  // bd == 10: same three width paths, with 2 extra bits of right shift.
+ if (w >= 16) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2);  // Only difference from the bd == 8 path: the shift amount.
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8x4_strided_x2(mask, w, m);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {  // Remaining case: same three width paths, with 4 extra bits of right shift.
+ assert(bd == 12);
+ if (w >= 16) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0_ptr);
+ uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8);
+ uint16x8_t s1_lo = vld1q_u16(src1_ptr);
+ uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8);
+
+ uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo);
+ uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2);  // Only difference from the bd == 8 path: the shift amount.
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(mask_diff, diff);
+ } else {
+ m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha);
+ }
+
+ vst1q_u8(mask_ptr, m);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ width -= 16;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8_t *mask_ptr = mask;
+ const uint16_t *src0_ptr = src0;
+ const uint16_t *src1_ptr = src1;
+ int width = w;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0_ptr);
+ uint16x8_t s1 = vld1q_u16(src1_ptr);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ vst1_u8(mask_ptr, m);
+
+ src0_ptr += 8;
+ src1_ptr += 8;
+ mask_ptr += 8;
+ width -= 8;
+ } while (width != 0);
+ mask += w;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t diff_u16 = vabdq_u16(s0, s1);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vget_low_u8(mask_diff), diff_u8);
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)),
+ vget_low_u8(max_alpha));
+ }
+
+ store_u8x4_strided_x2(mask, w, m);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ mask += 2 * w;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_neon(  // Entry point: validates the arguments, then runs diffwtd_mask_highbd_neon in normal or inverse mode.
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,  // src0/src1 arrive as uint8_t* and are converted with CONVERT_TO_SHORTPTR below.
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ assert(h % 4 == 0);  // Debug-only precondition: height is a multiple of 4.
+ assert(w % 4 == 0);  // Debug-only precondition: width is a multiple of 4.
+ assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+ if (mask_type == DIFFWTD_38) {  // Each branch passes a literal for inverse rather than a runtime value.
+ diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0),
+ src0_stride, CONVERT_TO_SHORTPTR(src1),
+ src1_stride, h, w, bd);
+ } else { // mask_type == DIFFWTD_38_INV
+ diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0),
+ src0_stride, CONVERT_TO_SHORTPTR(src1),
+ src1_stride, h, w, bd);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_reconintra_neon.c b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c
new file mode 100644
index 0000000000..170491b504
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {  // Smooths p[1..sz-1] in place with a 3- or 5-tap filter chosen by strength; p[0] is never rewritten.
+ if (!strength) return;  // Strength 0 leaves the edge untouched.
+ assert(sz >= 0 && sz <= 129);
+
+ DECLARE_ALIGNED(16, static const uint16_t,
+ idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 };  // Lane numbers, compared against sz to build the tail mask.
+ const uint16x8_t index = vld1q_u16(idx);
+
+ uint16_t edge[160]; // Max value of sz + enough padding for vector accesses.
+ memcpy(edge + 1, p, sz * sizeof(*p));  // edge[k + 1] = p[k]. NOTE(review): no <string.h> among the includes visible in this file; presumably pulled in transitively - confirm.
+
+ // Populate extra space appropriately.
+ edge[0] = edge[1];  // Replicate the first sample to the left.
+ edge[sz + 1] = edge[sz];  // Replicate the last sample twice to the right.
+ edge[sz + 2] = edge[sz];
+
+ // Don't overwrite first pixel.
+ uint16_t *dst = p + 1;
+ sz--;  // From here on sz counts the outputs still to be written.
+
+ if (strength == 1) { // Filter: {4, 8, 4}.
+ const uint16_t *src = edge + 1;  // Output i reads edge[i + 1..i + 3], i.e. p[i..i + 2], centred on p[i + 1].
+
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddq_u16(s0, s2);
+ uint16x8_t t1 = vaddq_u16(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 2);  // Rounding shift right by 2.
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddq_u16(s0, s2);
+ uint16x8_t t1 = vaddq_u16(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 2);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);  // Full 8-lane read; lanes at or beyond sz are stored back unchanged.
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);  // Lane i is all-ones when i < sz.
+ res = vbslq_u16(mask, res, current_dst);  // Take res where the mask is set, current_dst elsewhere.
+
+ vst1q_u16(dst, res);
+ }
+ } else if (strength == 2) { // Filter: {5, 6, 5}.
+ const uint16_t *src = edge + 1;  // Same 3-sample window as the strength 1 path.
+
+ const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6),
+ vdupq_n_u16(5) } };
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+ accum = vmlaq_u16(accum, s1, filter.val[1]);
+ accum = vmlaq_u16(accum, s2, filter.val[2]);
+ uint16x8_t res = vrshrq_n_u16(accum, 4);  // Taps sum to 16, hence the rounding shift by 4.
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+
+ uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
+ accum = vmlaq_u16(accum, s1, filter.val[1]);
+ accum = vmlaq_u16(accum, s2, filter.val[2]);
+ uint16x8_t res = vrshrq_n_u16(accum, 4);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+ res = vbslq_u16(mask, res, current_dst);
+
+ vst1q_u16(dst, res);
+ }
+ } else { // Filter {2, 4, 4, 4, 2}.
+ const uint16_t *src = edge;  // Five taps: output i reads edge[i..i + 4], one sample earlier than the 3-tap paths.
+
+ while (sz >= 8) {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+ uint16x8_t s4 = vld1q_u16(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddq_u16(s0, s4);
+ uint16x8_t t1 = vaddq_u16(s1, s2);
+ t1 = vaddq_u16(t1, s3);
+ t1 = vaddq_u16(t1, t1);  // Doubles (b + c + d).
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+ vst1q_u16(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+ uint16x8_t s4 = vld1q_u16(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddq_u16(s0, s4);
+ uint16x8_t t1 = vaddq_u16(s1, s2);
+ t1 = vaddq_u16(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint16x8_t res = vrshrq_n_u16(sum, 3);
+
+ // Mask off out-of-bounds indices.
+ uint16x8_t current_dst = vld1q_u16(dst);
+ uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
+ res = vbslq_u16(mask, res, current_dst);
+
+ vst1q_u16(dst, res);
+ }
+ }
+}
+
+void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {  // Upsamples the edge in place: from p[-2] on, the output alternates original samples with 4-tap interpolated ones.
+ if (!sz) return;  // Nothing to do for an empty edge.
+
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint16_t edge[MAX_UPSAMPLE_SZ + 3];  // Working copy: sz samples plus two on the left and one on the right.
+ const uint16_t *src = edge;
+
+ // Copy p[-1..(sz-1)] and pad out both ends.
+ edge[0] = p[-1];
+ edge[1] = p[-1];
+ memcpy(edge + 2, p, sz * 2);  // sz * 2 bytes == sz uint16_t samples, so edge[k + 2] = p[k].
+ edge[sz + 2] = p[sz - 1];
+ p[-2] = p[-1];  // First output sample is a plain copy of the old p[-1].
+
+ uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1);
+
+ uint16_t *dst = p - 1;  // Interleaved results are written starting here.
+
+ if (bd == 12) {  // Uses 32-bit accumulators; assuming samples < (1 << bd), 9 * (s1 + s2) would not fit in 16 bits.
+ do {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+
+ uint16x8_t t0 = vaddq_u16(s1, s2);  // Inner pair, weighted by 9 below.
+ uint16x8_t t1 = vaddq_u16(s0, s3);  // Outer pair, subtracted below.
+ uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9);
+ acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1)));  // Saturating subtract: a negative result becomes 0.
+ uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9);
+ acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1)));
+
+ uint16x8x2_t res;
+ res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4));  // Rounding shift right by 4, narrowed to 16 bits.
+ // Clamp pixel values at bitdepth maximum.
+ res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+ res.val[1] = s2;  // The original samples pass through unfiltered.
+
+ vst2q_u16(dst, res);  // Interleaving store: val[0][i] -> dst[2 * i], val[1][i] -> dst[2 * i + 1]; always writes 16 elements.
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);  // At most two iterations when sz <= MAX_UPSAMPLE_SZ.
+ } else { // Bit depth is 8 or 10.
+ do {
+ uint16x8_t s0 = vld1q_u16(src);
+ uint16x8_t s1 = vld1q_u16(src + 1);
+ uint16x8_t s2 = vld1q_u16(src + 2);
+ uint16x8_t s3 = vld1q_u16(src + 3);
+
+ uint16x8_t t0 = vaddq_u16(s0, s3);  // Outer pair (note: t0/t1 roles are swapped relative to the bd == 12 branch).
+ uint16x8_t t1 = vaddq_u16(s1, s2);  // Inner pair.
+ t1 = vmulq_n_u16(t1, 9);
+ t1 = vqsubq_u16(t1, t0);  // Saturating subtract: a negative result becomes 0.
+
+ uint16x8x2_t res;
+ res.val[0] = vrshrq_n_u16(t1, 4);
+ // Clamp pixel values at bitdepth maximum.
+ res.val[0] = vminq_u16(res.val[0], pixel_val_max);
+ res.val[1] = s2;
+
+ vst2q_u16(dst, res);
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c
new file mode 100644
index 0000000000..c6f1e3ad92
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+#include "highbd_warp_plane_neon.h"
+
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd,  // Filters one row into 4 outputs, each with its own filter f[k]; result sits in the low 4 lanes.
+ int sx, int alpha) {
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);  // NOTE(review): declared in highbd_warp_plane_neon.h; presumably one filter per output, stepping from sx by alpha - confirm.
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),  // rvN = 8 consecutive samples starting at lane N of in.val[0], continuing into in.val[1].
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));  // Each mN ends up holding 4 partial sums of an 8-element product.
+ m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;  // Two extra rounding bits when bd == 12.
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ int32x4_t res = horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane of res - confirm in sum_neon.h.
+ res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));  // Add the constant offset before rounding.
+ res = vrshlq_s32(res, vdupq_n_s32(-round0));  // Negative shift count: rounding shift right by round0.
+ return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // Narrow to 16 bits; the upper four lanes are zero.
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd,  // Filters one row into 8 outputs, each with its own filter f[k].
+ int sx, int alpha) {
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);  // NOTE(review): not defined in the visible code; presumably one filter per output, stepping from sx by alpha - confirm.
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),  // rvN = 8 consecutive samples starting at lane N of in.val[0], continuing into in.val[1].
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+ int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 4);
+ int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 5);
+ int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 6);
+ int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 7);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0));  // Each mN ends up holding 4 partial sums of an 8-element product.
+ m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3));
+ int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4));
+ m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4));
+ int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5));
+ m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5));
+ int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6));
+ m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6));
+ int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7));
+ m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;  // Two extra rounding bits when bd == 12.
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ int32x4_t res0 = horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane - confirm in sum_neon.h.
+ int32x4_t res1 = horizontal_add_4d_s32x4(m4567);
+ res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));  // Add the constant offset before rounding.
+ res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+ res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));  // Negative shift count: rounding shift right by round0.
+ res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+ return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));  // Narrow both halves to 16 bits and join them.
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd,  // Filters one row into 4 outputs that all share a single filter; result sits in the low 4 lanes.
+ int sx) {
+ int16x8_t f = load_filters_1(sx);  // One 8-tap filter, reused for every output position.
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),  // rvN = 8 consecutive samples starting at lane N of in.val[0], continuing into in.val[1].
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0));  // Each mN ends up holding 4 partial sums of an 8-element product.
+ m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;  // Two extra rounding bits when bd == 12.
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ int32x4_t res = horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane of res - confirm in sum_neon.h.
+ res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz));  // Add the constant offset before rounding.
+ res = vrshlq_s32(res, vdupq_n_s32(-round0));  // Negative shift count: rounding shift right by round0.
+ return vcombine_s16(vmovn_s32(res), vdup_n_s16(0));  // Narrow to 16 bits; the upper four lanes are zero.
+}
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd,  // Filters one row into 8 outputs that all share a single filter.
+ int sx) {
+ int16x8_t f = load_filters_1(sx);  // One 8-tap filter, reused for every output position.
+
+ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),  // rvN = 8 consecutive samples starting at lane N of in.val[0], continuing into in.val[1].
+ vreinterpretq_s16_u16(in.val[1]), 0);
+ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 1);
+ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 2);
+ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 3);
+ int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 4);
+ int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 5);
+ int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 6);
+ int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]),
+ vreinterpretq_s16_u16(in.val[1]), 7);
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0));  // Each mN ends up holding 4 partial sums of an 8-element product.
+ m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0));
+ int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1));
+ m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1));
+ int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2));
+ m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2));
+ int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3));
+ m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3));
+ int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4));
+ m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4));
+ int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5));
+ m5 = vmlal_s16(m5, vget_high_s16(f), vget_high_s16(rv5));
+ int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6));
+ m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6));
+ int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7));
+ m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;  // Two extra rounding bits when bd == 12.
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ int32x4_t res0 = horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane - confirm in sum_neon.h.
+ int32x4_t res1 = horizontal_add_4d_s32x4(m4567);
+ res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz));  // Add the constant offset before rounding.
+ res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz));
+ res0 = vrshlq_s32(res0, vdupq_n_s32(-round0));  // Negative shift count: rounding shift right by round0.
+ res1 = vrshlq_s32(res1, vdupq_n_s32(-round0));
+ return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));  // Narrow both halves to 16 bits and join them.
+}
+
+static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) {  // Returns, for lanes 0..3, the sum over k = 0..7 of tmp[k][lane] * f[k]; no rounding is applied here.
+ const int16x8_t f = load_filters_1(sy);  // One 8-tap filter shared by all four lanes.
+ const int16x4_t f0123 = vget_low_s16(f);
+ const int16x4_t f4567 = vget_high_s16(f);
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);  // Only the low four lanes of each tmp[k] are used.
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+ return m0123;
+}
+
+static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) {  // For all 8 lanes: sum over k = 0..7 of tmp[k][lane] * f[k]; val[0] holds lanes 0..3, val[1] lanes 4..7.
+ const int16x8_t f = load_filters_1(sy);  // One 8-tap filter shared by all eight lanes.
+ const int16x4_t f0123 = vget_low_s16(f);
+ const int16x4_t f4567 = vget_high_s16(f);
+
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0);  // Low halves of tmp[0..7] feed output lanes 0..3.
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3);
+
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0);  // High halves of tmp[0..7] feed output lanes 4..7.
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3);
+ return (int32x4x2_t){ { m0123, m4567 } };
+}
+
+static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy,  // Vertical pass for 4 outputs, each with its own filter f[k]; returns unrounded 32-bit sums.
+ int gamma) {
+ int16x8_t s0, s1, s2, s3;
+ transpose_elems_s16_4x8(  // NOTE(review): helper not defined in this file; presumably sN receives lane N of tmp[0..7] - confirm.
+ vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]),
+ vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]),
+ vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3);
+
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);  // NOTE(review): presumably one filter per output, stepping from sy by gamma - confirm.
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));  // Each mN ends up holding 4 partial sums of sN * f[N].
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ return horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane - confirm in sum_neon.h.
+}
+
+static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy,  // Vertical pass for 8 outputs, each with its own filter f[k]; returns unrounded 32-bit sums.
+ int gamma) {
+ int16x8_t s0 = tmp[0];  // Local copies, so the in-place transpose below does not touch the const input.
+ int16x8_t s1 = tmp[1];
+ int16x8_t s2 = tmp[2];
+ int16x8_t s3 = tmp[3];
+ int16x8_t s4 = tmp[4];
+ int16x8_t s5 = tmp[5];
+ int16x8_t s6 = tmp[6];
+ int16x8_t s7 = tmp[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);  // NOTE(review): presumably afterwards sN holds lane N of tmp[0..7] - confirm in transpose_neon.h.
+
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);  // NOTE(review): not defined in the visible code; presumably one filter per output, stepping from sy by gamma - confirm.
+
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));  // Each mN ends up holding 4 partial sums of sN * f[N].
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+ m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+ int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+ m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+ int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+ m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+ int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+ m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+
+ int32x4_t m0123[] = { m0, m1, m2, m3 };
+ int32x4_t m4567[] = { m4, m5, m6, m7 };
+
+ int32x4x2_t ret;
+ ret.val[0] = horizontal_add_4d_s32x4(m0123);  // NOTE(review): presumably reduces each of the four vectors to one lane - confirm in sum_neon.h.
+ ret.val[1] = horizontal_add_4d_s32x4(m4567);
+ return ret;
+}
+
+void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref,  // Public entry point; forwards every argument unchanged to highbd_warp_affine_common.
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,  // NOTE(review): not defined in the visible part of this file; presumably provided by highbd_warp_plane_neon.h - confirm.
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, bd, conv_params, alpha, beta, gamma,
+ delta);
+}
diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h
new file mode 100644
index 0000000000..3b8982898e
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_
+#define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/scale.h"
+#include "av1/common/warped_motion.h"
+#include "config/av1_rtcd.h"
+
+// Filter kernels used by the warp helpers in this header. They are only
+// declared here; no definition appears in this header, so the translation
+// unit that includes it has to provide one for each of them.
+//
+// Horizontal kernels receive one row of source pixels as a pair of uint16x8_t
+// vectors and return the filtered row as int16x8_t. warp_affine_horizontal()
+// picks the "_f1" variants when alpha == 0 and the "_f4"/"_f8" variants
+// otherwise, and the "4x1"/"8x1" variants according to the block width.
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd,
+ int sx, int alpha);
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd,
+ int sx, int alpha);
+
+static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd,
+ int sx);
+
+static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd,
+ int sx);
+
+// Vertical kernels receive a pointer into the array of intermediate rows and
+// return 32-bit sums (four lanes, or two vectors of four lanes for the 8-wide
+// variants). The "_f1" variants are selected when gamma == 0.
+static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy);
+
+static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy);
+
+static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy,
+ int gamma);
+
+static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy,
+ int gamma);
+
+// Loads the eight coefficients of one warp filter.
+//
+// ofs is rounded to a filter index with
+// ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); the index is applied relative
+// to the entry WARPEDPIXEL_PREC_SHIFTS filters into av1_warped_filter
+// (presumably so that negative indices still land inside the table -- confirm
+// against the table definition).
+static INLINE int16x8_t load_filters_1(int ofs) {
+  const int16_t *const centre =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+  const int filter_idx = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS);
+
+  // Each filter occupies 8 consecutive int16_t values.
+  return vld1q_s16(&centre[filter_idx * 8]);
+}
+
+// Loads four warp filters into out[0..3].
+//
+// Filter n is chosen from the offset ofs + stride * n, rounded with
+// ROUND_POWER_OF_TWO(..., WARPEDDIFF_PREC_BITS), and indexed relative to the
+// entry WARPEDPIXEL_PREC_SHIFTS filters into av1_warped_filter.
+static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) {
+  const int16_t *const centre =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+
+  for (int n = 0; n < 4; ++n) {
+    const int filter_idx =
+        ROUND_POWER_OF_TWO(ofs + stride * n, WARPEDDIFF_PREC_BITS);
+    // Each filter occupies 8 consecutive int16_t values.
+    out[n] = vld1q_s16(centre + filter_idx * 8);
+  }
+}
+
+// Loads eight warp filters into out[0..7].
+//
+// Filter n is chosen from the offset ofs + stride * n, rounded with
+// ROUND_POWER_OF_TWO(..., WARPEDDIFF_PREC_BITS), and indexed relative to the
+// entry WARPEDPIXEL_PREC_SHIFTS filters into av1_warped_filter.
+static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) {
+  const int16_t *const centre =
+      (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8;
+
+  for (int n = 0; n < 8; ++n) {
+    const int filter_idx =
+        ROUND_POWER_OF_TWO(ofs + stride * n, WARPEDDIFF_PREC_BITS);
+    // Each filter occupies 8 consecutive int16_t values.
+    out[n] = vld1q_s16(centre + filter_idx * 8);
+  }
+}
+
+// Clamps four 32-bit values to the pixel range [0, (1 << bd) - 1] and narrows
+// them to 16 bits.
+static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) {
+  const int32x4_t max_pixel = vdupq_n_s32((1 << bd) - 1);
+  const int32x4_t capped = vminq_s32(val, max_pixel);
+
+  // The saturating signed-to-unsigned narrow takes care of the lower bound:
+  // negative lanes become 0.
+  return vqmovun_s32(capped);
+}
+
+// Runs the horizontal pass of the warp filter for one block and writes the 15
+// intermediate rows to tmp[0..14].
+//
+// Row k is produced from source row iy4 + k - 7, clamped to [0, height - 1].
+// Each row is loaded as 16 pixels starting at column ix4 - 7.
+//
+//   ref, width, height, stride  source plane and its geometry
+//   p_width                     block width; 4 selects the 4x1 kernels,
+//                               anything else the 8x1 kernels
+//   alpha, beta                 horizontal shear terms; alpha == 0 selects the
+//                               "_f1" kernels, beta == 0 keeps the filter
+//                               offset constant over all rows
+//   iy4, ix4                    integer source position of the block
+//   sx4                         fractional horizontal filter offset
+//   tmp                         output, at least 15 vectors
+//   bd                          bit depth; 12-bit uses two extra bits of
+//                               first-stage rounding
+static INLINE void warp_affine_horizontal(const uint16_t *ref, int width,
+                                          int height, int stride, int p_width,
+                                          int16_t alpha, int16_t beta, int iy4,
+                                          int sx4, int ix4, int16x8_t tmp[],
+                                          int bd) {
+  const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+
+  if (ix4 <= -7) {
+    // Columns ix4 - 7 .. ix4 + 7 are all <= 0 here, so the row is built from
+    // the first pixel of the source row only: a constant offset plus that
+    // pixel scaled by 1 << (FILTER_BITS - round0).
+    for (int k = 0; k < 15; ++k) {
+      int iy = clamp(iy4 + k - 7, 0, height - 1);
+      int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) +
+                        ref[iy * stride] * (1 << (FILTER_BITS - round0));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  } else if (ix4 >= width + 6) {
+    // Mirror case on the right: columns ix4 - 7 .. ix4 + 7 are all
+    // >= width - 1, so only the last pixel of the source row is used.
+    for (int k = 0; k < 15; ++k) {
+      int iy = clamp(iy4 + k - 7, 0, height - 1);
+      int32_t dup_val =
+          (1 << (bd + FILTER_BITS - round0 - 1)) +
+          ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0));
+      tmp[k] = vdupq_n_s16(dup_val);
+    }
+    return;
+  }
+
+  // Lane indices 0..15 of the 16-pixel row load, used to build edge masks.
+  static const uint16_t kIotaArr[] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                                       8, 9, 10, 11, 12, 13, 14, 15 };
+  const uint16x8_t indx0 = vld1q_u16(kIotaArr);
+  const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8);
+
+  // Lane i of the load holds column ix4 - 7 + i. Lanes i <= out_of_boundary_left
+  // have a negative column; lanes i >= 15 - out_of_boundary_right have a
+  // column >= width. A negative value means that side needs no fix-up.
+  const int out_of_boundary_left = -(ix4 - 6);
+  const int out_of_boundary_right = (ix4 + 8) - width;
+
+// Applies kernel `fn` to all 15 rows. When the load window crosses an image
+// edge, the out-of-range lanes are replaced by the nearest edge pixel of that
+// row before filtering. The variadic arguments are re-evaluated for every row
+// inside the loop, so they may refer to the row counter `k`.
+// NOTE(review): the 16-pixel load itself is issued before the lanes are
+// patched, so it presumably relies on readable padding around `ref` -- confirm.
+#define APPLY_HORIZONTAL_SHIFT(fn, ...) \
+  do { \
+    if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \
+      for (int k = 0; k < 15; ++k) { \
+        const int iy = clamp(iy4 + k - 7, 0, height - 1); \
+        uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \
+ \
+        if (out_of_boundary_left >= 0) { \
+          uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); \
+          uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); \
+          uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); \
+          uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); \
+          src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \
+          src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \
+        } \
+        if (out_of_boundary_right >= 0) { \
+          uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); \
+          uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); \
+          uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); \
+          uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); \
+          src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \
+          src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \
+        } \
+        tmp[k] = (fn)(src_1, __VA_ARGS__); \
+      } \
+    } else { \
+      for (int k = 0; k < 15; ++k) { \
+        const int iy = clamp(iy4 + k - 7, 0, height - 1); \
+        uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \
+        tmp[k] = (fn)(src_1, __VA_ARGS__); \
+      } \
+    } \
+  } while (0)
+
+  if (p_width == 4) {
+    if (beta == 0) {
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, sx4);
+      } else {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha);
+      }
+    } else {
+      // `k` below is the row counter declared inside the macro's loop.
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd,
+                               (sx4 + beta * (k - 3)));
+      } else {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd,
+                               (sx4 + beta * (k - 3)), alpha);
+      }
+    }
+  } else {
+    if (beta == 0) {
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, sx4);
+      } else {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha);
+      }
+    } else {
+      // `k` below is the row counter declared inside the macro's loop.
+      if (alpha == 0) {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd,
+                               (sx4 + beta * (k - 3)));
+      } else {
+        APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd,
+                               (sx4 + beta * (k - 3)), alpha);
+      }
+    }
+  }
+
+// The macro depends on this function's locals, so it is meaningless anywhere
+// else. Drop it so that it does not leak into files including this header.
+#undef APPLY_HORIZONTAL_SHIFT
+}
+
+// Vertical pass of the warp filter for one output row of 4 pixels.
+//
+// The filtered sum comes from vertical_filter_4x1_f1() when gamma == 0 and
+// from vertical_filter_4x1_f4() otherwise. What happens next depends on the
+// compound flags:
+//  - !is_compound: round, remove the offsets, clip to bd bits and store to
+//    pred[i * p_stride + j].
+//  - is_compound && !do_average: round by COMPOUND_ROUND1_BITS and store the
+//    intermediate value to dst[i * dst_stride + j].
+//  - is_compound && do_average: blend with the value already in dst (weighted
+//    by fwd/bwd when use_dist_wtd_comp_avg is set, plain halving average
+//    otherwise), round, clip and store to pred[i * p_stride + j].
+static INLINE void highbd_vertical_filter_4x1_f4(
+ uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+ bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+ int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+ int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy)
+ : vertical_filter_4x1_f4(tmp, sy, gamma);
+
+ // 12-bit content uses two extra bits of first-stage rounding.
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+ sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+
+ uint16_t *dst16 = &pred[i * p_stride + j];
+
+ if (!is_compound) {
+ const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+ // vrshlq with a negative shift count is a rounding shift right.
+ sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+
+ const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+ sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+ uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+ vst1_u16(dst16, res0);
+ return;
+ }
+
+ sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+
+ uint16_t *p = &dst[i * dst_stride + j];
+
+ if (!do_average) {
+ // First prediction of a compound pair: keep the intermediate value in dst.
+ vst1_u16(p, vqmovun_s32(sum0));
+ return;
+ }
+
+ // Second prediction: combine with the intermediate value stored in dst.
+ uint16x4_t p0 = vld1_u16(p);
+ int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0));
+ if (use_dist_wtd_comp_avg) {
+ // (dst * fwd + sum * bwd) >> DIST_PRECISION_BITS
+ p_vec0 = vmulq_n_s32(p_vec0, fwd);
+ p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+ p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+ } else {
+ // (dst + sum) >> 1
+ p_vec0 = vhaddq_s32(p_vec0, sum0);
+ }
+
+ const int offset_bits = bd + 2 * FILTER_BITS - round0;
+ const int round1 = COMPOUND_ROUND1_BITS;
+ const int res_sub_const =
+ (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+ const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+ p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+ p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+ uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+ vst1_u16(dst16, res0);
+}
+
+// Vertical pass of the warp filter for one output row of 8 pixels. This is the
+// 8-wide counterpart of highbd_vertical_filter_4x1_f4(): the same steps are
+// applied to two vectors of four 32-bit sums.
+//
+// The filtered sums come from vertical_filter_8x1_f1() when gamma == 0 and
+// from vertical_filter_8x1_f8() otherwise. Then:
+//  - !is_compound: round, remove the offsets, clip to bd bits and store to
+//    pred[i * p_stride + j].
+//  - is_compound && !do_average: round by COMPOUND_ROUND1_BITS and store the
+//    intermediate values to dst[i * dst_stride + j].
+//  - is_compound && do_average: blend with the values already in dst (weighted
+//    by fwd/bwd when use_dist_wtd_comp_avg is set, plain halving average
+//    otherwise), round, clip and store to pred[i * p_stride + j].
+static INLINE void highbd_vertical_filter_8x1_f8(
+ uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride,
+ bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd,
+ int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) {
+ int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy)
+ : vertical_filter_8x1_f8(tmp, sy, gamma);
+ int32x4_t sum0 = sums.val[0];
+ int32x4_t sum1 = sums.val[1];
+
+ // 12-bit content uses two extra bits of first-stage rounding.
+ const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - round0;
+
+ sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert));
+ sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert));
+
+ uint16_t *dst16 = &pred[i * p_stride + j];
+
+ if (!is_compound) {
+ const int reduce_bits_vert = 2 * FILTER_BITS - round0;
+ // vrshlq with a negative shift count is a rounding shift right.
+ sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert));
+ sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert));
+
+ const int res_sub_const = (1 << (bd - 1)) + (1 << bd);
+ sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const));
+ sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const));
+ uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd);
+ uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd);
+ vst1_u16(dst16, res0);
+ vst1_u16(dst16 + 4, res1);
+ return;
+ }
+
+ sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS);
+ sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS);
+
+ uint16_t *p = &dst[i * dst_stride + j];
+
+ if (!do_average) {
+ // First prediction of a compound pair: keep the intermediate values in dst.
+ vst1_u16(p, vqmovun_s32(sum0));
+ vst1_u16(p + 4, vqmovun_s32(sum1));
+ return;
+ }
+
+ // Second prediction: combine with the intermediate values stored in dst.
+ uint16x8_t p0 = vld1q_u16(p);
+ int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0)));
+ int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0)));
+ if (use_dist_wtd_comp_avg) {
+ // (dst * fwd + sum * bwd) >> DIST_PRECISION_BITS
+ p_vec0 = vmulq_n_s32(p_vec0, fwd);
+ p_vec1 = vmulq_n_s32(p_vec1, fwd);
+ p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd);
+ p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd);
+ p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS);
+ p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS);
+ } else {
+ // (dst + sum) >> 1
+ p_vec0 = vhaddq_s32(p_vec0, sum0);
+ p_vec1 = vhaddq_s32(p_vec1, sum1);
+ }
+
+ const int offset_bits = bd + 2 * FILTER_BITS - round0;
+ const int round1 = COMPOUND_ROUND1_BITS;
+ const int res_sub_const =
+ (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1));
+ const int round_bits = 2 * FILTER_BITS - round0 - round1;
+
+ p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const));
+ p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const));
+
+ p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits));
+ p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits));
+ uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd);
+ uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd);
+ vst1_u16(dst16, res0);
+ vst1_u16(dst16 + 4, res1);
+}
+
+// Runs the vertical pass for one block: 8 output rows when p_height > 4,
+// otherwise 4.
+//
+// Output row r is written at row i + r, column j. It is filtered from the
+// intermediate rows starting at tmp + r with the vertical offset
+// sy4 + delta * r. Blocks wider than 4 use the 8-pixel row kernel, all others
+// the 4-pixel one.
+static INLINE void warp_affine_vertical(
+    uint16_t *pred, int p_width, int p_height, int p_stride, int bd,
+    uint16_t *dst, int dst_stride, bool is_compound, bool do_average,
+    bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta,
+    const int16x8_t *tmp, int i, int sy4, int j) {
+  const int num_rows = (p_height > 4) ? 8 : 4;
+  const bool eight_wide = p_width > 4;
+
+  for (int r = 0; r < num_rows; ++r) {
+    const int row_sy = sy4 + delta * r;
+    const int16x8_t *const row_tmp = tmp + r;
+    const int out_row = i + r;
+
+    if (eight_wide) {
+      highbd_vertical_filter_8x1_f8(pred, p_stride, bd, dst, dst_stride,
+                                    is_compound, do_average,
+                                    use_dist_wtd_comp_avg, fwd, bwd, gamma,
+                                    row_tmp, out_row, row_sy, j);
+    } else {
+      highbd_vertical_filter_4x1_f4(pred, p_stride, bd, dst, dst_stride,
+                                    is_compound, do_average,
+                                    use_dist_wtd_comp_avg, fwd, bwd, gamma,
+                                    row_tmp, out_row, row_sy, j);
+    }
+  }
+}
+
+// Shared driver for the high bit-depth affine warp.
+//
+// The prediction block is walked in steps of 8 in both directions. For each
+// step the centre of the tile is mapped through the affine model `mat`, the
+// result is split into an integer position (ix4, iy4) and a fractional offset
+// (sx4, sy4), and the two filter passes are run:
+// warp_affine_horizontal() fills 15 intermediate rows and
+// warp_affine_vertical() turns them into output pixels.
+//
+// Compound-prediction settings (dst buffer, averaging mode and weights) are
+// read from conv_params. dst must be non-NULL when is_compound is set.
+static INLINE void highbd_warp_affine_common(
+ const int32_t *mat, const uint16_t *ref, int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ uint16_t *const dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const bool is_compound = conv_params->is_compound;
+ const bool do_average = conv_params->do_average;
+ const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int fwd = conv_params->fwd_offset;
+ const int bwd = conv_params->bck_offset;
+
+ assert(IMPLIES(is_compound, dst != NULL));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4 + p_col) << subsampling_x;
+ const int32_t src_y = (i + 4 + p_row) << subsampling_y;
+ // The products are formed in 64 bits before being summed.
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ // Split into integer position and WARPEDMODEL_PREC_BITS-bit fraction.
+ const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Move the offsets from the tile centre back by 4 steps along each shear
+ // term.
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ // Clear the low WARP_PARAM_REDUCE_BITS bits of both offsets.
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Each horizontal filter result is formed by the sum of up to eight
+ // multiplications by filter values and then a shift. Although both the
+ // inputs and filters are loaded as int16, the input data is at most bd
+ // bits and the filters are at most 8 bits each. Additionally since we
+ // know all possible filter values we know that the sum of absolute
+ // filter values will fit in at most 9 bits. With this in mind we can
+ // conclude that the sum of each filter application will fit in bd + 9
+ // bits. The shift following the summation is ROUND0_BITS (which is 3),
+ // +2 for 12-bit, which gives us a final storage of:
+ // bd == 8: ( 8 + 9) - 3 => 14 bits
+ // bd == 10: (10 + 9) - 3 => 16 bits
+ // bd == 12: (12 + 9) - 5 => 16 bits
+ // So it is safe to use int16x8_t as the intermediate storage type here.
+ int16x8_t tmp[15];
+
+ warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta,
+ iy4, sx4, ix4, tmp, bd);
+ warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst,
+ dst_stride, is_compound, do_average,
+ use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp,
+ i, sy4, j);
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c
new file mode 100644
index 0000000000..a6bd6d38e4
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Generates the horizontal 5-tap Wiener pass for one rounding shift.
+//
+// Two functions are emitted per instantiation:
+//  - name##_wiener_convolve5_8_2d_h(): filters 8 pixels. The mirrored inputs
+//    (s0, s4) and (s1, s3) are added first, the three sums are multiplied by
+//    lanes 1..3 of x_filter and accumulated on top of round_vec, then the
+//    result is rounded and shifted right by `shift`, narrowed with unsigned
+//    saturation and capped at im_max_val.
+//  - name##_convolve_add_src_5tap_horiz(): applies that kernel to a w x h
+//    region, 8 pixels per step. Both loops are do/while, so w must be a
+//    non-zero multiple of 8 and h must be non-zero.
+//
+// Instantiated below as `highbd` (shift WIENER_ROUND0_BITS) and `highbd_12`
+// (two bits more).
+#define HBD_WIENER_5TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s04 = vaddq_s16(s0, s4); \
+ int16x8_t s13 = vaddq_s16(s1, s3); \
+ \
+ /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \
+ s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_5TAP_HORIZ
+
+// Generates the horizontal 7-tap Wiener pass for one rounding shift.
+//
+// Two functions are emitted per instantiation:
+//  - name##_wiener_convolve7_8_2d_h(): filters 8 pixels. The mirrored inputs
+//    (s0, s6), (s1, s5) and (s2, s4) are added first, the sums and the centre
+//    input s3 are multiplied by lanes 0..3 of x_filter and accumulated on top
+//    of round_vec, then the result is rounded and shifted right by `shift`,
+//    narrowed with unsigned saturation and capped at im_max_val.
+//  - name##_convolve_add_src_7tap_horiz(): applies that kernel to a w x h
+//    region, 8 pixels per step. Both loops are do/while, so w must be a
+//    non-zero multiple of 8 and h must be non-zero.
+//
+// Instantiated below as `highbd` (shift WIENER_ROUND0_BITS) and `highbd_12`
+// (two bits more).
+#define HBD_WIENER_7TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \
+ const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s06 = vaddq_s16(s0, s6); \
+ int16x8_t s15 = vaddq_s16(s1, s5); \
+ int16x8_t s24 = vaddq_s16(s2, s4); \
+ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \
+ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_7TAP_HORIZ
+
+// Generates the vertical 5-tap Wiener pass for one rounding shift.
+//
+// Two functions are emitted per instantiation:
+//  - name##_wiener_convolve5_8_2d_v(): filters 8 pixels from five input rows.
+//    The filter is widened to 32 bits, the mirrored rows (s0, s4) and (s1, s3)
+//    are added with widening, and the sums plus the centre row s2 are
+//    multiplied by taps 1..3 and accumulated on top of round_vec. The result
+//    is rounded and shifted right by `shift`, narrowed with unsigned
+//    saturation and capped at res_max_val.
+//  - name##_convolve_add_src_5tap_vert(): walks the region in 8-pixel-wide
+//    columns. Within a column, rows are produced four at a time from eight
+//    loaded rows while more than three remain, then one at a time. The column
+//    loop is do/while, so w must be a non-zero multiple of 8.
+//
+// Instantiated below as `highbd` (shift 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+// and `highbd_12` (two bits less).
+#define HBD_WIENER_5TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \
+ int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \
+ \
+ /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \
+ \
+ int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \
+ int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve5_8_2d_v( \
+ s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \
+ s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \
+ s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_5TAP_VERT
+
+// Generates the vertical 7-tap Wiener pass for one rounding shift.
+//
+// Two functions are emitted per instantiation:
+//  - name##_wiener_convolve7_8_2d_v(): filters 8 pixels from seven input rows.
+//    The filter is widened to 32 bits, the mirrored rows (s0, s6), (s1, s5)
+//    and (s2, s4) are added with widening, and the sums plus the centre row s3
+//    are multiplied by taps 0..3 and accumulated on top of round_vec. The
+//    result is rounded and shifted right by `shift`, narrowed with unsigned
+//    saturation and capped at res_max_val.
+//  - name##_convolve_add_src_7tap_vert(): walks the region in 8-pixel-wide
+//    columns. Within a column, rows are produced four at a time from ten
+//    loaded rows while more than three remain, then one at a time. The column
+//    loop is do/while, so w must be a non-zero multiple of 8.
+//
+// Instantiated below as `highbd` (shift 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+// and `highbd_12` (two bits less).
+#define HBD_WIENER_7TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \
+ const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \
+ int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \
+ int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \
+ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \
+ \
+ int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \
+ int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \
+ int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \
+ &s8, &s9); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \
+ s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve7_8_2d_v( \
+ s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \
+ s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_7TAP_VERT
+
+// Reports which Wiener kernel a filter needs: WIENER_WIN_REDUCED when both
+// outermost taps (filter[0] and filter[6]) are zero, WIENER_WIN otherwise.
+// filter[7] is expected to be zero.
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+  assert(filter[7] == 0);
+
+  const int has_outer_tap = (filter[0] != 0) || (filter[6] != 0);
+  return has_outer_tap ? WIENER_WIN : WIENER_WIN_REDUCED;
+}
+
+// High bit-depth separable Wiener filter, NEON implementation.
+//
+// The filter runs in two passes: a horizontal pass from the source into the
+// on-stack buffer im_block, then a vertical pass from im_block into the
+// destination. 5-tap or 7-tap kernels are chosen independently per direction
+// by get_wiener_filter_taps(), and the `highbd_12_` kernel variants are used
+// when bd == 12.
+//
+//   src8, dst8            pixel buffers, converted with CONVERT_TO_SHORTPTR()
+//   src_stride/dst_stride row strides in pixels of the converted buffers
+//   x_filter, y_filter    filter taps; element 7 must be zero
+//   x_step_q4, y_step_q4  must both be 16 (only checked by assert)
+//   w, h                  region size; w must be a multiple of 8 and neither
+//                         may exceed MAX_SB_SIZE
+//   conv_params           supplies round_0 and round_1
+//   bd                    bit depth
+void av1_highbd_wiener_convolve_add_src_neon(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ // Only referenced by the asserts below; the casts keep builds without
+ // asserts free of unused-parameter warnings.
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+
+ // Intermediate rows written by the horizontal pass: up to
+ // MAX_SB_SIZE + WIENER_WIN - 1 rows of MAX_SB_SIZE pixels.
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
+ const int y_filter_taps = get_wiener_filter_taps(y_filter);
+ // Only taps 0..3 are loaded; the kernels use the filter's symmetry for the
+ // remaining ones.
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+ // Add 128 to tap 3. (Needed for rounding.)
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
+
+ const int im_stride = MAX_SB_SIZE;
+ // The vertical pass needs y_filter_taps - 1 rows beyond the h output rows.
+ const int im_h = h + y_filter_taps - 1;
+ // The horizontal pass starts taps / 2 pixels to the left and taps / 2 rows
+ // above the first output pixel.
+ const int horiz_offset = x_filter_taps / 2;
+ const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
+
+ // Upper bound applied to the intermediate values.
+ const int extraprec_clamp_limit =
+ WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
+ const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+ // Upper bound applied to the final pixels, and the (negative) constant the
+ // vertical accumulators start from.
+ const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1)));
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ if (bd == 12) {
+ // 12-bit: kernels instantiated with the adjusted rounding shifts.
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+
+ } else {
+ // All other bit depths: kernels with the default rounding shifts.
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c
new file mode 100644
index 0000000000..2b0274cc64
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,217 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+/*
+ * Builds a difference-weighted blend mask from two intermediate prediction
+ * buffers whose elements are loaded as unsigned 16-bit lanes.
+ *
+ * Per pixel: d = (|src0 - src1| rounding-shifted right by `round`) >>
+ * DIFF_FACTOR_LOG2, narrowed to 8 bits; then, in 8-bit lane arithmetic,
+ *   inverse == false: mask = min(38 + d, 64)
+ *   inverse == true:  mask = 26 - d, saturating at 0
+ *
+ * `mask` is written contiguously (it has no stride of its own), while src0
+ * and src1 are stepped by their own strides. Only w >= 16, w == 8 and w == 4
+ * are handled; any other width falls through without writing anything.
+ */
+static AOM_INLINE void diffwtd_mask_d16_neon(
+ uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round));  // Negated so that vrshlq_u16 shifts right (with rounding).
+
+ if (w >= 16) {  // 16 pixels per inner step; assumes w is a multiple of 16 -- TODO confirm.
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0_lo = vld1q_u16(src0 + j);
+ uint16x8_t s1_lo = vld1q_u16(src1 + j);
+ uint16x8_t s0_hi = vld1q_u16(src0 + j + 8);
+ uint16x8_t s1_hi = vld1q_u16(src1 + j + 8);
+
+ uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec);
+ uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec);
+ uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+ uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+ uint8x16_t m;
+ if (inverse) {
+ m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0
+ } else {
+ m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+ }
+
+ vst1q_u8(mask, m);
+
+ mask += 16;
+ j += 16;
+ } while (j < w);
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (++i < h);
+ } else if (w == 8) {  // One 8-pixel row per iteration.
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0);
+ uint16x8_t s1 = vld1q_u16(src1);
+
+ uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+ }
+
+ vst1_u8(mask, m);
+
+ mask += 8;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ } while (++i < h);
+ } else if (w == 4) {  // Two 4-pixel rows are packed into one vector per iteration.
+ int i = 0;
+ do {
+ uint16x8_t s0 =
+ vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride));
+ uint16x8_t s1 =
+ vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride));
+
+ uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+ uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+ uint8x8_t m;
+ if (inverse) {
+ m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0
+ } else {
+ m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+ }
+
+ vst1_u8(mask, m);
+
+ mask += 8;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ i += 2;
+ } while (i < h);
+ }
+}
+
+// NEON entry point that builds a compound difference-weighted mask from two
+// CONV_BUF_TYPE prediction buffers. DIFFWTD_38 yields the regular mask; any
+// other mask_type (DIFFWTD_38_INV is the only other value the assertion
+// accepts) yields the inverted one.
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+
+  // The helper receives a literal `inverse` flag on each path.
+  if (mask_type != DIFFWTD_38) {
+    diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                          src1_stride, h, w, conv_params, bd);
+    return;
+  }
+
+  diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                        src1_stride, h, w, conv_params, bd);
+}
+
+// Builds an 8-bit difference-weighted blend mask from two 8-bit predictions.
+//
+// Per pixel: d = |src0 - src1| >> DIFF_FACTOR_LOG2, then, in 8-bit lanes,
+//   inverse == false: mask = min(38 + d, 64)
+//   inverse == true:  mask = 26 - d, saturating at 0
+//
+// `mask` is written contiguously, 16 bytes per step: one 16-pixel span of a
+// row for w >= 16, two rows for w == 8 and four rows for w == 4. src0 and src1
+// are each stepped by their own stride. Other widths are not handled and leave
+// `mask` untouched.
+static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
+                                         const uint8_t *src0, int src0_stride,
+                                         const uint8_t *src1, int src1_stride,
+                                         int h, int w) {
+  if (w >= 16) {
+    // Assumes w is a multiple of 16 -- TODO confirm against callers.
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + j);
+        uint8x16_t s1 = vld1q_u8(src1 + j);
+
+        uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+        uint8x16_t m;
+        if (inverse) {
+          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+        } else {
+          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+        }
+
+        vst1q_u8(mask, m);
+
+        mask += 16;
+        j += 16;
+      } while (j < w);
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 8) {
+    int i = 0;
+    do {
+      // Two 8-pixel rows per vector. Each source must be stepped by its own
+      // stride; the strides of src0 and src1 are not required to match.
+      uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
+      uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      i += 2;
+    } while (i < h);
+  } else if (w == 4) {
+    int i = 0;
+    do {
+      // load_unaligned_u8q() presumably gathers four 4-byte rows into one
+      // vector (the pointers advance by four rows below) -- verify in
+      // mem_neon.h.
+      uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
+      uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 4 * src0_stride;
+      src1 += 4 * src1_stride;
+      i += 4;
+    } while (i < h);
+  }
+}
+
+// NEON entry point that builds a compound difference-weighted mask from two
+// 8-bit predictions. DIFFWTD_38 yields the regular mask; any other mask_type
+// (DIFFWTD_38_INV is the only other value the assertion accepts) yields the
+// inverted one.
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
+                                          int h, int w) {
+  assert(h % 4 == 0);
+  assert(w % 4 == 0);
+  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+  // The helper receives a literal `inverse` flag on each path.
+  if (mask_type != DIFFWTD_38) {
+    diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                      src1_stride, h, w);
+    return;
+  }
+
+  diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                    src1_stride, h, w);
+}
diff --git a/third_party/aom/av1/common/arm/reconintra_neon.c b/third_party/aom/av1/common/arm/reconintra_neon.c
new file mode 100644
index 0000000000..3db39987a6
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconintra_neon.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+// These kernels are a transposed version of those defined in reconintra.c,
+// with the absolute value of the negatives taken in the top row.
+DECLARE_ALIGNED(16, const uint8_t,
+ av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = {  // [mode][neighbour][output lane]
+ // clang-format off
+ {  // Mode index 0. The row/lane layout noted here applies to every mode.
+ { 6, 5, 3, 3, 4, 3, 3, 3 },  // Row 0: top-left neighbour; magnitudes only, subtracted by the predictor.
+ { 10, 2, 1, 1, 6, 2, 2, 1 },  // Rows 1-4: the four neighbours above the 4x2 unit.
+ { 0, 10, 1, 1, 0, 6, 2, 2 },
+ { 0, 0, 10, 2, 0, 0, 6, 2 },
+ { 0, 0, 0, 10, 0, 0, 0, 6 },
+ { 12, 9, 7, 5, 2, 2, 2, 3 },  // Row 5: left neighbour of the unit's upper row.
+ { 0, 0, 0, 0, 12, 9, 7, 5 }  // Row 6: left neighbour of the unit's lower row.
+ },
+ {  // Mode index 1.
+ { 10, 6, 4, 2, 10, 6, 4, 2 },
+ { 16, 0, 0, 0, 16, 0, 0, 0 },
+ { 0, 16, 0, 0, 0, 16, 0, 0 },
+ { 0, 0, 16, 0, 0, 0, 16, 0 },
+ { 0, 0, 0, 16, 0, 0, 0, 16 },
+ { 10, 6, 4, 2, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 10, 6, 4, 2 }
+ },
+ {  // Mode index 2.
+ { 8, 8, 8, 8, 4, 4, 4, 4 },
+ { 8, 0, 0, 0, 4, 0, 0, 0 },
+ { 0, 8, 0, 0, 0, 4, 0, 0 },
+ { 0, 0, 8, 0, 0, 0, 4, 0 },
+ { 0, 0, 0, 8, 0, 0, 0, 4 },
+ { 16, 16, 16, 16, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 16, 16, 16, 16 }
+ },
+ {  // Mode index 3.
+ { 2, 1, 1, 0, 1, 1, 1, 1 },
+ { 8, 3, 2, 1, 4, 3, 2, 2 },
+ { 0, 8, 3, 2, 0, 4, 3, 2 },
+ { 0, 0, 8, 3, 0, 0, 4, 3 },
+ { 0, 0, 0, 8, 0, 0, 0, 4 },
+ { 10, 6, 4, 2, 3, 4, 4, 3 },
+ { 0, 0, 0, 0, 10, 6, 4, 3 }
+ },
+ {  // Mode index 4.
+ { 12, 10, 9, 8, 10, 9, 8, 7 },
+ { 14, 0, 0, 0, 12, 1, 0, 0 },
+ { 0, 14, 0, 0, 0, 12, 0, 0 },
+ { 0, 0, 14, 0, 0, 0, 12, 1 },
+ { 0, 0, 0, 14, 0, 0, 0, 12 },
+ { 14, 12, 11, 10, 0, 0, 1, 1 },
+ { 0, 0, 0, 0, 14, 12, 11, 9 }
+ }
+ // clang-format on
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+/*
+ * NEON filter-intra predictor: fills the width x height block at `dst`
+ * (row pitch `stride`).
+ *
+ * tx_size selects the block dimensions through tx_size_wide[] and
+ * tx_size_high[] (asserted to be at most 32x32). `above` is read from
+ * above[-1] (the top-left pixel) through above[width - 1], and `left` supplies
+ * `height` pixels. `mode` indexes av1_filter_intra_taps_neon.
+ *
+ * The block is produced in 4x2 units, left to right and top to bottom. Each
+ * unit is predicted from 7 neighbours (top-left, four above, two left) taken
+ * from a 33x33 scratch buffer. The buffer is seeded with the above row and
+ * the left column and receives every result, so later units use earlier
+ * outputs as their neighbours.
+ */
+void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ assert(width <= 32 && height <= 32);
+
+ const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]);
+ const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]);
+ const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]);
+ const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]);
+ const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]);
+ const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]);
+ const uint8x8_t f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]);
+
+ uint8_t buffer[33][33];  // Row 0 and column 0 hold the edge pixels; outputs start at [1][1].
+ // Populate the top row in the scratch buffer with data from above.
+ memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t));
+ // Populate the first column in the scratch buffer with data from the left.
+ int r = 0;
+ do {
+ buffer[r + 1][0] = left[r];
+ } while (++r < height);
+
+ // Computing 4 cols per iteration (instead of 8) for 8x<h> blocks is faster.
+ if (width <= 8) {
+ r = 1;
+ do {
+ int c = 1;
+ uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]);  // Top-left neighbour.
+ uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]);  // Left neighbour of row r.
+ uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]);  // Left neighbour of row r + 1.
+
+ do {
+ uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1);  // Four pixels above the unit.
+ uint8x8_t s1 = vdup_lane_u8(s1234, 0);
+ uint8x8_t s2 = vdup_lane_u8(s1234, 1);
+ uint8x8_t s3 = vdup_lane_u8(s1234, 2);
+ uint8x8_t s4 = vdup_lane_u8(s1234, 3);
+
+ uint16x8_t sum = vmull_u8(s1, f1);
+ // First row of each filter has all negative values so subtract.
+ sum = vmlsl_u8(sum, s0, f0);
+ sum = vmlal_u8(sum, s2, f2);
+ sum = vmlal_u8(sum, s3, f3);
+ sum = vmlal_u8(sum, s4, f4);
+ sum = vmlal_u8(sum, s5, f5);
+ sum = vmlal_u8(sum, s6, f6);
+
+ uint8x8_t res =
+ vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS);
+
+ // Store buffer[r + 0][c] and buffer[r + 1][c].
+ store_u8x4_strided_x2(&buffer[r][c], 33, res);
+
+ store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res);
+
+ s0 = s4;  // The next unit's top-left is this unit's last above pixel.
+ s5 = vdup_lane_u8(res, 3);  // Rightmost output of row r is the next left neighbour.
+ s6 = vdup_lane_u8(res, 7);  // Rightmost output of row r + 1.
+ c += 4;
+ } while (c < width + 1);
+
+ r += 2;
+ } while (r < height + 1);
+ } else {  // Wider blocks: two adjacent 4x2 units (lo, then hi) per iteration.
+ r = 1;
+ do {
+ int c = 1;
+ uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]);
+ uint8x8_t s5_lo = vld1_dup_u8(&buffer[r + 0][c - 1]);
+ uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]);
+
+ do {
+ uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1);  // Eight above pixels: lanes 0-3 for lo, 4-7 for hi.
+ uint8x8_t s1_lo = vdup_lane_u8(s1234, 0);
+ uint8x8_t s2_lo = vdup_lane_u8(s1234, 1);
+ uint8x8_t s3_lo = vdup_lane_u8(s1234, 2);
+ uint8x8_t s4_lo = vdup_lane_u8(s1234, 3);
+
+ uint16x8_t sum_lo = vmull_u8(s1_lo, f1);
+ // First row of each filter has all negative values so subtract.
+ sum_lo = vmlsl_u8(sum_lo, s0_lo, f0);
+ sum_lo = vmlal_u8(sum_lo, s2_lo, f2);
+ sum_lo = vmlal_u8(sum_lo, s3_lo, f3);
+ sum_lo = vmlal_u8(sum_lo, s4_lo, f4);
+ sum_lo = vmlal_u8(sum_lo, s5_lo, f5);
+ sum_lo = vmlal_u8(sum_lo, s6_lo, f6);
+
+ uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo),
+ FILTER_INTRA_SCALE_BITS);
+
+ uint8x8_t s0_hi = s4_lo;
+ uint8x8_t s1_hi = vdup_lane_u8(s1234, 4);
+ uint8x8_t s2_hi = vdup_lane_u8(s1234, 5);
+ uint8x8_t s3_hi = vdup_lane_u8(s1234, 6);
+ uint8x8_t s4_hi = vdup_lane_u8(s1234, 7);
+ uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3);  // The hi unit's left neighbours are the lo unit's rightmost outputs.
+ uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7);
+
+ uint16x8_t sum_hi = vmull_u8(s1_hi, f1);
+ // First row of each filter has all negative values so subtract.
+ sum_hi = vmlsl_u8(sum_hi, s0_hi, f0);
+ sum_hi = vmlal_u8(sum_hi, s2_hi, f2);
+ sum_hi = vmlal_u8(sum_hi, s3_hi, f3);
+ sum_hi = vmlal_u8(sum_hi, s4_hi, f4);
+ sum_hi = vmlal_u8(sum_hi, s5_hi, f5);
+ sum_hi = vmlal_u8(sum_hi, s6_hi, f6);
+
+ uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi),
+ FILTER_INTRA_SCALE_BITS);
+
+ uint32x2x2_t res =
+ vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi));  // val[0]: row r (lo|hi); val[1]: row r + 1 (lo|hi).
+
+ vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0]));
+ vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1]));
+
+ vst1_u8(dst + (r - 1) * stride + c - 1,
+ vreinterpret_u8_u32(res.val[0]));
+ vst1_u8(dst + (r + 0) * stride + c - 1,
+ vreinterpret_u8_u32(res.val[1]));
+
+ s0_lo = s4_hi;
+ s5_lo = vdup_lane_u8(res_hi, 3);
+ s6_lo = vdup_lane_u8(res_hi, 7);
+ c += 8;
+ } while (c < width + 1);
+
+ r += 2;
+ } while (r < height + 1);
+ }
+}
+/*
+ * Smooths the `sz` edge pixels at `p` in place with a low-pass filter chosen
+ * by `strength`:
+ *   0: no-op, 1: {4, 8, 4}, 2: {5, 6, 5}, anything else: {2, 4, 4, 4, 2}
+ * (all taps are out of 16 and the result is rounded).
+ *
+ * p[0] is never modified. The input is first copied into a local buffer with
+ * the first pixel replicated once in front and the last pixel replicated
+ * twice behind, so that every tap has a defined neighbour at both ends.
+ *
+ * Each filter handles 8 pixels per iteration. A final partial group is
+ * computed on a full vector and blended with the bytes already at `dst`
+ * through a lane mask, so only the remaining `sz` lanes change value; the
+ * load and store of `dst` still cover all 8 bytes.
+ */
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
+ if (!strength) return;
+ assert(sz >= 0 && sz <= 129);
+
+ uint8_t edge[160]; // Max value of sz + enough padding for vector accesses.
+ memcpy(edge + 1, p, sz * sizeof(*p));
+
+ // Populate extra space appropriately.
+ edge[0] = edge[1];
+ edge[sz + 1] = edge[sz];
+ edge[sz + 2] = edge[sz];
+
+ // Don't overwrite first pixel.
+ uint8_t *dst = p + 1;
+ sz--;  // From here on sz counts the pixels still to be written.
+
+ if (strength == 1) { // Filter: {4, 8, 4}.
+ const uint8_t *src = edge + 1;  // src[0..2] are p[0..2], centred on dst[0] == p[1].
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ // Make use of the identity:
+ // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+ uint16x8_t t0 = vaddl_u8(s0, s2);
+ uint16x8_t t1 = vaddl_u8(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t t0 = vaddl_u8(s0, s2);
+ uint16x8_t t1 = vaddl_u8(s1, s1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));  // Lane i is set when i < sz.
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ } else if (strength == 2) { // Filter: {5, 6, 5}.
+ const uint8_t *src = edge + 1;
+
+ const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } };
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+ accum = vmlal_u8(accum, s1, filter.val[1]);
+ accum = vmlal_u8(accum, s2, filter.val[2]);
+ uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+
+ uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+ accum = vmlal_u8(accum, s1, filter.val[1]);
+ accum = vmlal_u8(accum, s2, filter.val[2]);
+ uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ } else { // Filter {2, 4, 4, 4, 2}.
+ const uint8_t *src = edge;  // Five taps: the window starts one pixel earlier, at the replicated first pixel.
+
+ while (sz >= 8) {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+ uint8x8_t s4 = vld1_u8(src + 4);
+
+ // Make use of the identity:
+ // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+ uint16x8_t t0 = vaddl_u8(s0, s4);
+ uint16x8_t t1 = vaddl_u8(s1, s2);
+ t1 = vaddw_u8(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+ vst1_u8(dst, res);
+
+ src += 8;
+ dst += 8;
+ sz -= 8;
+ }
+
+ if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+ uint8x8_t s4 = vld1_u8(src + 4);
+
+ uint16x8_t t0 = vaddl_u8(s0, s4);
+ uint16x8_t t1 = vaddl_u8(s1, s2);
+ t1 = vaddw_u8(t1, s3);
+ t1 = vaddq_u16(t1, t1);
+ uint16x8_t sum = vaddq_u16(t0, t1);
+ uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+ // Mask off out-of-bounds indices.
+ uint8x8_t current_dst = vld1_u8(dst);
+ uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+ res = vbsl_u8(mask, res, current_dst);
+
+ vst1_u8(dst, res);
+ }
+ }
+}
+/*
+ * Doubles the resolution of an intra edge in place.
+ *
+ * Input is p[-1 .. sz - 1] (sz is asserted to be at most MAX_UPSAMPLE_SZ;
+ * sz == 0 is a no-op). On return p[-2] holds the old p[-1], and from p[-1]
+ * onwards the output alternates between an interpolated sample and an
+ * original sample. The interpolated sample over four consecutive padded
+ * inputs a, b, c, d is (9 * (b + c) - (a + d)) shifted right by 4 with
+ * rounding and saturated to 8 bits.
+ *
+ * Each loop iteration consumes 8 inputs and stores a full 16 bytes regardless
+ * of sz, so the caller's buffer presumably has room for that -- TODO confirm
+ * against the callers.
+ */
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
+ if (!sz) return;
+
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint8_t edge[MAX_UPSAMPLE_SZ + 3];
+ const uint8_t *src = edge;
+
+ // Copy p[-1..(sz-1)] and pad out both ends.
+ edge[0] = p[-1];
+ edge[1] = p[-1];
+ memcpy(edge + 2, p, sz);
+ edge[sz + 2] = p[sz - 1];
+ p[-2] = p[-1];
+
+ uint8_t *dst = p - 1;  // The interleaved output starts one pixel before p.
+
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t s1 = vld1_u8(src + 1);
+ uint8x8_t s2 = vld1_u8(src + 2);
+ uint8x8_t s3 = vld1_u8(src + 3);
+
+ int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3));  // Outer taps (weight -1 each).
+ int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2));  // Inner taps (weight 9 each).
+ t1 = vmulq_n_s16(t1, 9);
+ t1 = vsubq_s16(t1, t0);
+
+ uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } };  // { interpolated, original } pairs.
+
+ vst2_u8(dst, res);  // Interleaving store: interpolated, original, interpolated, ...
+
+ src += 8;
+ dst += 16;
+ sz -= 8;
+ } while (sz > 0);
+}
diff --git a/third_party/aom/av1/common/arm/resize_neon.c b/third_party/aom/av1/common/arm/resize_neon.c
new file mode 100644
index 0000000000..b00ebd1fc2
--- /dev/null
+++ b/third_party/aom/av1/common/arm/resize_neon.c
@@ -0,0 +1,1178 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/resize.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+// Applies an 8-tap filter to four 16-bit lanes and returns the raw 16-bit sum
+// (no rounding or narrowing). s0..s7 are the samples for taps 0..7 and
+// `filter` holds the eight coefficients.
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filter) {
+  const int16x4_t taps_0123 = vget_low_s16(filter);
+  const int16x4_t taps_4567 = vget_high_s16(filter);
+
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x4_t acc = vmul_lane_s16(s0, taps_0123, 0);
+  acc = vmla_lane_s16(acc, s1, taps_0123, 1);
+  acc = vmla_lane_s16(acc, s2, taps_0123, 2);
+  acc = vmla_lane_s16(acc, s5, taps_4567, 1);
+  acc = vmla_lane_s16(acc, s6, taps_4567, 2);
+  acc = vmla_lane_s16(acc, s7, taps_4567, 3);
+
+  // The two centre taps are added last, using saturating adds.
+  const int16x4_t centre3 = vmul_lane_s16(s3, taps_0123, 3);
+  const int16x4_t centre4 = vmul_lane_s16(s4, taps_4567, 0);
+  return vqadd_s16(vqadd_s16(acc, centre3), centre4);
+}
+
+// Applies an 8-tap filter to eight 16-bit lanes. The sum is shifted right by
+// 7 with rounding and saturated to unsigned 8 bits. s0..s7 are the samples for
+// taps 0..7 and `filter` holds the eight coefficients.
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filter) {
+  const int16x4_t taps_0123 = vget_low_s16(filter);
+  const int16x4_t taps_4567 = vget_high_s16(filter);
+
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x8_t acc = vmulq_lane_s16(s0, taps_0123, 0);
+  acc = vmlaq_lane_s16(acc, s1, taps_0123, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_0123, 2);
+  acc = vmlaq_lane_s16(acc, s5, taps_4567, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_4567, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_4567, 3);
+
+  // The two centre taps are added last, using saturating adds.
+  const int16x8_t centre3 = vmulq_lane_s16(s3, taps_0123, 3);
+  const int16x8_t centre4 = vmulq_lane_s16(s4, taps_4567, 0);
+  acc = vqaddq_s16(vqaddq_s16(acc, centre3), centre4);
+
+  return vqrshrun_n_s16(acc, 7);
+}
+
+// Widens the eight 8-bit vectors s[0..7] to signed 16 bits and runs them
+// through convolve8_8() with the given filter taps.
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+                                       const int16x8_t filter) {
+  int16x8_t wide[8];
+
+  for (int k = 0; k < 8; ++k) {
+    wide[k] = vreinterpretq_s16_u16(vmovl_u8(s[k]));
+  }
+
+  return convolve8_8(wide[0], wide[1], wide[2], wide[3], wide[4], wide[5],
+                     wide[6], wide[7], filter);
+}
+
+// 2:1 decimation without filtering: every output pixel is a copy of the
+// even-indexed source pixel, and every second source row is used. Columns are
+// handled 16 outputs at a time, so each row is written out to the width
+// rounded up to a multiple of 16.
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int padded_w = (w + 15) & ~15;
+  int rows_left = h;
+
+  assert(w && h);
+
+  do {
+    const uint8_t *in = src;
+    uint8_t *out = dst;
+    int cols_left = padded_w;
+
+    do {
+      // De-interleaving load: val[0] receives bytes 0, 2, 4, ... of the 32.
+      const uint8x16x2_t deint = vld2q_u8(in);
+      vst1q_u8(out, deint.val[0]);
+      in += 32;
+      out += 16;
+      cols_left -= 16;
+    } while (cols_left);
+
+    src += 2 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// 4:1 decimation without filtering: every output pixel is a copy of every
+// fourth source pixel, and every fourth source row is used. Columns are
+// handled 16 outputs at a time, so each row is written out to the width
+// rounded up to a multiple of 16.
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int padded_w = (w + 15) & ~15;
+  int rows_left = h;
+
+  assert(w && h);
+
+  do {
+    const uint8_t *in = src;
+    uint8_t *out = dst;
+    int cols_left = padded_w;
+
+    do {
+      // De-interleaving load: val[0] receives bytes 0, 4, 8, ... of the 64.
+      const uint8x16x4_t deint = vld4q_u8(in);
+      vst1q_u8(out, deint.val[0]);
+      in += 64;
+      out += 16;
+      cols_left -= 16;
+    } while (cols_left);
+
+    src += 4 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// Two-stage bilinear kernel producing 16 output pixels at `dst`.
+//   in0, in1: the two horizontally adjacent sample sets of the upper row.
+//   in2, in3: the same for the lower row.
+// Stage 1 blends in0 with in1 and in2 with in3 using (coef0, coef1); stage 2
+// blends the two stage-1 rows with the same coefficients. Each stage is
+// shifted right by 7 with rounding and narrowed to 8 bits.
+static INLINE void scale_plane_bilinear_kernel(
+    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+    uint8_t *const dst) {
+  // Horizontal stage, upper row (pixels 0-7 and 8-15).
+  const uint16x8_t top_lo =
+      vmlal_u8(vmull_u8(vget_low_u8(in0), coef0), vget_low_u8(in1), coef1);
+  const uint16x8_t top_hi =
+      vmlal_u8(vmull_u8(vget_high_u8(in0), coef0), vget_high_u8(in1), coef1);
+  // Horizontal stage, lower row.
+  const uint16x8_t bot_lo =
+      vmlal_u8(vmull_u8(vget_low_u8(in2), coef0), vget_low_u8(in3), coef1);
+  const uint16x8_t bot_hi =
+      vmlal_u8(vmull_u8(vget_high_u8(in2), coef0), vget_high_u8(in3), coef1);
+
+  const uint8x8_t top_lo_u8 = vrshrn_n_u16(top_lo, 7);
+  const uint8x8_t top_hi_u8 = vrshrn_n_u16(top_hi, 7);
+  const uint8x8_t bot_lo_u8 = vrshrn_n_u16(bot_lo, 7);
+  const uint8x8_t bot_hi_u8 = vrshrn_n_u16(bot_hi, 7);
+
+  // Vertical stage: blend the upper and lower intermediate rows.
+  const uint16x8_t out_lo =
+      vmlal_u8(vmull_u8(top_lo_u8, coef0), bot_lo_u8, coef1);
+  const uint16x8_t out_hi =
+      vmlal_u8(vmull_u8(top_hi_u8, coef0), bot_hi_u8, coef1);
+
+  vst1q_u8(dst, vcombine_u8(vrshrn_n_u16(out_lo, 7), vrshrn_n_u16(out_hi, 7)));
+}
+
+// 2:1 downscale with a 2x2 bilinear filter. Each output pixel blends the two
+// horizontally adjacent source pixels of two consecutive source rows, using
+// weights c0 and c1 in both directions. Columns are handled 16 outputs at a
+// time, so each row is written out to the width rounded up to a multiple
+// of 16.
+static INLINE void scale_plane_2_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int padded_w = (w + 15) & ~15;
+  const uint8x8_t tap0 = vdup_n_u8(c0);
+  const uint8x8_t tap1 = vdup_n_u8(c1);
+  const uint8_t *upper = src;
+  const uint8_t *lower = src + src_stride;
+  int rows_left = h;
+
+  assert(w && h);
+
+  do {
+    const uint8_t *in0 = upper;
+    const uint8_t *in1 = lower;
+    uint8_t *out = dst;
+    int cols_left = padded_w;
+
+    do {
+      // De-interleaving loads: val[0] holds the even-indexed pixels of the
+      // 32-byte span and val[1] the odd-indexed ones.
+      const uint8x16x2_t row0 = vld2q_u8(in0);
+      const uint8x16x2_t row1 = vld2q_u8(in1);
+      scale_plane_bilinear_kernel(row0.val[0], row0.val[1], row1.val[0],
+                                  row1.val[1], tap0, tap1, out);
+      in0 += 32;
+      in1 += 32;
+      out += 16;
+      cols_left -= 16;
+    } while (cols_left);
+
+    upper += 2 * src_stride;
+    lower += 2 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+
+// 4:1 downscale with a 2x2 bilinear filter. Each output pixel blends source
+// pixels 4k and 4k + 1 of two consecutive source rows, using weights c0 and c1
+// in both directions; pixels 4k + 2 and 4k + 3 are loaded but not used.
+// Columns are handled 16 outputs at a time, so each row is written out to the
+// width rounded up to a multiple of 16.
+static INLINE void scale_plane_4_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int padded_w = (w + 15) & ~15;
+  const uint8x8_t tap0 = vdup_n_u8(c0);
+  const uint8x8_t tap1 = vdup_n_u8(c1);
+  const uint8_t *upper = src;
+  const uint8_t *lower = src + src_stride;
+  int rows_left = h;
+
+  assert(w && h);
+
+  do {
+    const uint8_t *in0 = upper;
+    const uint8_t *in1 = lower;
+    uint8_t *out = dst;
+    int cols_left = padded_w;
+
+    do {
+      // De-interleaving loads of 64 bytes: val[k] holds pixels 4 * i + k.
+      // Only val[0] and val[1] feed the kernel.
+      const uint8x16x4_t row0 = vld4q_u8(in0);
+      const uint8x16x4_t row1 = vld4q_u8(in1);
+      scale_plane_bilinear_kernel(row0.val[0], row0.val[1], row1.val[0],
+                                  row1.val[1], tap0, tap1, out);
+      in0 += 64;
+      in1 += 64;
+      out += 16;
+      cols_left -= 16;
+    } while (cols_left);
+
+    upper += 4 * src_stride;
+    lower += 4 * src_stride;
+    dst += dst_stride;
+  } while (--rows_left);
+}
+/*
+ * 2:1 downscale of a plane with a separable 8-tap filter (`coef` points to
+ * the eight int16 taps), done in two passes through `temp_buffer`.
+ *
+ * Horizontal pass: 8 source rows are loaded and transposed at a time, the
+ * filter is applied at every second source column, and the results are
+ * transposed back so that 4 output pixels for each of the 8 rows are stored
+ * into temp_buffer (row pitch width_hor).
+ *
+ * Vertical pass: temp_buffer is walked in 8-column strips, the filter is
+ * applied at every second intermediate row, and 4 output rows of 8 pixels are
+ * written to dst per step.
+ *
+ * All extents are rounded up to the processing granularity (width_hor,
+ * width_ver, height_hor, height_ver below), so src, dst and temp_buffer are
+ * accessed beyond the nominal w x h area -- the caller presumably provides
+ * suitably padded buffers; TODO confirm the required temp_buffer size.
+ */
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;  // w rounded up to a multiple of 4.
+ const int width_ver = (w + 7) & ~7;  // w rounded up to a multiple of 8.
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;  // Intermediate rows, rounded up to a multiple of 8.
+ const int height_ver = (h + 3) & ~3;  // h rounded up to a multiple of 4.
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[14], d[4];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;  // Back up to cover the filter's leading taps in both directions.
+
+ // horizontal 4x8
+ // Note: processing 4x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ transpose_elems_inplace_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71
+ d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72
+ d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ transpose_elems_inplace_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+ vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+ 1);
+
+ s[0] = s[8];  // Slide the window: the last 6 columns seed the next step.
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ t += 4;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;  // Rewind the columns and move down 8 source rows.
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17
+ d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27
+ d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+
+ s[0] = s[8];  // Slide the window: the last 6 rows seed the next step.
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);  // Back to the top of temp_buffer ...
+ t += 8;  // ... and on to the next 8-column strip.
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+/*
+ * 4:1 downscale of a plane with a separable 8-tap filter (`coef` points to
+ * the eight int16 taps), done in two passes through `temp_buffer`.
+ *
+ * Horizontal pass: 8 source rows are loaded and transposed at a time, the
+ * filter is applied at every fourth source column, and 2 output pixels for
+ * each of the 8 rows are stored into temp_buffer (row pitch width_hor).
+ *
+ * Vertical pass: temp_buffer is walked in 8-column strips, the filter is
+ * applied at every fourth intermediate row, and 2 output rows of 8 pixels are
+ * written to dst per step.
+ *
+ * All extents are rounded up to the processing granularity (width_hor,
+ * width_ver, height_hor, height_ver below), so src, dst and temp_buffer are
+ * accessed beyond the nominal w x h area -- the caller presumably provides
+ * suitably padded buffers; TODO confirm the required temp_buffer size.
+ */
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;  // w rounded up to a multiple of 2.
+ const int width_ver = (w + 7) & ~7;  // w rounded up to a multiple of 8.
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;  // Intermediate rows, rounded up to a multiple of 8.
+ const int height_ver = (h + 1) & ~1;  // h rounded up to a multiple of 2.
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[12], d[2];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;  // Back up to cover the filter's leading taps in both directions.
+
+ // horizontal 2x8
+ // Note: processing 2x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_elems_u8_4x8(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
+ &s[0], &s[1], &s[2], &s[3]);
+ x = width_hor;
+
+ do {
+ uint8x8x2_t dd;
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ transpose_elems_inplace_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71
+ // dd.val[0]: 00 01 20 21 40 41 60 61
+ // dd.val[1]: 10 11 30 31 50 51 70 71
+ dd = vtrn_u8(d[0], d[1]);
+ vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 0);
+ vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 0);
+ vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 1);
+ vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 1);
+ vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 2);
+ vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 2);
+ vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 3);
+ vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 3);
+
+ s[0] = s[8];  // Slide the window: the last 4 columns seed the next step.
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ t += 2;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;  // Rewind the columns and move down 8 source rows.
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+
+ s[0] = s[8];  // Slide the window: the last 4 rows seed the next step.
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);  // Back to the top of temp_buffer ...
+ t += 8;  // ... and on to the next 8-column strip.
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Two-tap filter across eight lanes: forms s[0] * coef[0] + s[1] * coef[1] in
+// 16-bit precision, then narrows back to 8 bits with a rounding right shift
+// by 7.
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+                                              const uint8x8_t *const coef) {
+  uint16x8_t acc = vmull_u8(s[0], coef[0]);
+  acc = vmlal_u8(acc, s[1], coef[1]);
+  return vrshrn_n_u16(acc, 7);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+// multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+// multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+// vertical scaling, so height_hor must be multiple of 8, and no less than
+// 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+// be multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and is always less than 1 pixel below the last row
+// of the original image.
+//
+// Downscales one 8-bit plane from 4 to 3 in both directions using two filter
+// taps per output sample. A horizontal pass filters src into temp_buffer; a
+// vertical pass then filters temp_buffer into dst.
+//
+// src, src_stride: source plane and its row pitch.
+// dst, dst_stride: destination plane and its row pitch.
+// w, h:            destination width and height (asserted non-zero).
+// phase_scaler:    phase of the first output sample; selects rows of the
+//                  BILINEAR kernel table.
+// temp_buffer:     caller-provided scratch, written as height_hor rows with a
+//                  row pitch of stride_hor bytes.
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+ const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h, const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ // Integer division: 16 * 4 / 3 == 21. It is added to phase_scaler once per
+ // output sample; `>> 4` of the sum selects the source sample and
+ // `& SUBPEL_MASK` selects the kernel row.
+ static const int step_q4 = 16 * 4 / 3;
+ // w rounded up to a multiple of 6.
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ // w rounded up to a multiple of 8.
+ const int width_ver = (w + 7) & ~7;
+ // We only need 1 extra row below because there are only 2 bilinear
+ // coefficients.
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+ // h rounded up to a multiple of 6.
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[9], d[8], c[6];
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr;
+ assert(w && h);
+
+ // c[2k] and c[2k + 1] are taps 3 and 4 of the kernel for the k-th of the
+ // three phases (phase_scaler + k * step_q4), broadcast to all lanes.
+ c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]);
+ c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]);
+ c[2] = vdup_n_u8(
+ (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]);
+ c[3] = vdup_n_u8(
+ (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]);
+ c[4] = vdup_n_u8(
+ (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]);
+ c[5] = vdup_n_u8(
+ (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]);
+
+ // d[6] and d[7] are never assigned a filter result but are still passed to
+ // the 8x8 transpose below; give them a defined value before the first use.
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ // Each outer iteration handles a band of 8 source rows.
+ do {
+ load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ src += 1;
+ // Only s[0] survives from this first load: s[1]..s[8] are reloaded at the
+ // top of the inner loop.
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ x = width_hor;
+
+ // Each inner iteration advances src by 8 columns and t by 6 columns.
+ do {
+ load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ src += 8;
+ transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // d[0..2] apply the three phases starting at s[0]; d[3..5] repeat the same
+ // three phases starting four source columns later, at s[4].
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+ &d[6], &d[7]);
+ // store 2 extra pixels
+ // Each vst1_u8 writes 8 bytes but t only advances by 6, so the last two
+ // bytes are overwritten by the next iteration or land in the 2-byte row
+ // padding included in stride_hor.
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ // Carry the last loaded column over as the first column of the next group.
+ s[0] = s[8];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ // Undo the horizontal advance (1 + 8 per 6 outputs) and move down 8 rows.
+ src += 8 * src_stride - 4 * width_hor / 3 - 1;
+ // t has advanced width_hor within the row; move to the start of the row 8
+ // below the one this band started on.
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ // Each outer iteration handles a strip of 8 columns of temp_buffer/dst.
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // As in the horizontal pass, only s[0] is kept from this load.
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += stride_hor;
+ y = height_ver;
+
+ // Each inner iteration reads 8 further rows of t and writes 6 rows of dst.
+ do {
+ load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ // Rewind t and dst to the top row, then step 8 columns to the right.
+ t -= stride_hor * (4 * height_ver / 3 + 1);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Downscales one 8-bit plane from 4 to 3 in both directions using the kernels
+// in coef, applied through scale_filter_8(). A horizontal pass filters src
+// into temp_buffer; a vertical pass then filters temp_buffer into dst.
+//
+// src, src_stride: source plane and its row pitch.
+// dst, dst_stride: destination plane and its row pitch.
+// w, h:            destination width and height (asserted non-zero).
+// coef:            kernel table, indexed by phase.
+// phase_scaler:    phase of the first output sample.
+// temp_buffer:     caller-provided scratch, written as height_hor rows with a
+//                  row pitch of stride_hor bytes.
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ // Integer division: 16 * 4 / 3 == 21. It is added to phase_scaler once per
+ // output sample; `>> 4` of the sum selects the source sample and
+ // `& SUBPEL_MASK` selects the kernel.
+ static const int step_q4 = 16 * 4 / 3;
+ // w rounded up to a multiple of 6.
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ // w rounded up to a multiple of 8.
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // h rounded up to a multiple of 6.
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ // One kernel (eight int16 values) for each of the three phases.
+ const int16x8_t filters0 = vld1q_s16(
+ (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters1 = vld1q_s16(
+ (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters2 = vld1q_s16(
+ (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[15], d[8];
+
+ assert(w && h);
+
+ // Move up (SUBPEL_TAPS / 2 - 1) rows and left SUBPEL_TAPS / 2 columns. The
+ // first load below reads from src + 1, so the net horizontal offset of the
+ // first column loaded is SUBPEL_TAPS / 2 - 1.
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+ // d[6] and d[7] are never assigned a filter result but are still passed to
+ // the 8x8 transpose below; give them a defined value before the first use.
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ // Each outer iteration handles a band of 8 source rows.
+ do {
+ load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ x = width_hor;
+
+ // Each inner iteration advances src by 8 columns and t by 6 columns.
+ do {
+ src += 8;
+ // This load starts one column before the end of the previous one, so s[7]
+ // is reloaded along with the seven new columns s[8]..s[14].
+ load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13], &s[14]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // d[0..2] apply the three phases starting at s[0]; d[3..5] repeat the same
+ // three phases starting four source columns later, at s[4].
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5],
+ &d[6], &d[7]);
+ // store 2 extra pixels
+ // Each vst1_u8 writes 8 bytes but t only advances by 6, so the last two
+ // bytes are overwritten by the next iteration or land in the 2-byte row
+ // padding included in stride_hor.
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ // Slide the window: the last seven columns become the first seven of the
+ // next group.
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ // Undo the horizontal advance (8 per 6 outputs) and move down 8 rows.
+ src += 8 * src_stride - 4 * width_hor / 3;
+ // t has advanced width_hor within the row; move to the start of the row 8
+ // below the one this band started on.
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ // Each outer iteration handles a strip of 8 columns of temp_buffer/dst.
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ // Advance by 7 rows only: the inner load re-reads row 7 into s[7].
+ t += 7 * stride_hor;
+ y = height_ver;
+
+ // Each inner iteration reads 8 rows of t and writes 6 rows of dst.
+ do {
+ load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ // Rewind t and dst to the top row, then step 8 columns to the right.
+ t -= stride_hor * (4 * height_ver / 3 + 7);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Returns true when the destination is exactly 1/2, 1/4 or 3/4 of the source
+// in both width and height, which are the downscaling ratios that have SIMD
+// optimizations in NEON.
+static INLINE bool has_normative_scaler_neon(const int src_width,
+                                             const int src_height,
+                                             const int dst_width,
+                                             const int dst_height) {
+  if (2 * dst_width == src_width && 2 * dst_height == src_height) return true;
+  if (4 * dst_width == src_width && 4 * dst_height == src_height) return true;
+  return 4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height;
+}
+
+// Resizes the first num_planes planes of src into dst.
+//
+// NEON kernels are used only when the luma plane (and, if num_planes > 1, the
+// chroma planes) is being scaled by exactly 1/2, 1/4 or 3/4 in both
+// dimensions. For any other ratio, or if a scratch buffer cannot be
+// allocated, the whole frame is handed to av1_resize_and_extend_frame_c().
+// When every plane was scaled here, the borders of dst are extended with
+// aom_extend_frame_borders().
+//
+// filter selects BILINEAR or one of the kernels in
+// av1_interp_filter_params_list; phase selects the kernel row / filter phase.
+void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst,
+                                      const InterpFilter filter,
+                                      const int phase, const int num_planes) {
+  bool has_normative_scaler =
+      has_normative_scaler_neon(src->y_crop_width, src->y_crop_height,
+                                dst->y_crop_width, dst->y_crop_height);
+
+  if (num_planes > 1) {
+    has_normative_scaler =
+        has_normative_scaler &&
+        has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height,
+                                  dst->uv_crop_width, dst->uv_crop_height);
+  }
+
+  if (!has_normative_scaler) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+    return;
+  }
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  int malloc_failed = 0;
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    // Scratch buffers are sized from the luma dimensions (rounded up to even)
+    // for every plane.
+    const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+    const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+      if (phase == 0) {
+        scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 3) & ~3;
+        const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        // Multiply as size_t: the product of two ints can exceed INT_MAX for
+        // very large frames.
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc((size_t)buffer_stride * (size_t)buffer_height);
+        if (!temp_buffer) {
+          malloc_failed = 1;
+          break;
+        }
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h, interp_kernel[phase], temp_buffer);
+        free(temp_buffer);
+      }
+    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+      if (phase == 0) {
+        scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h);
+      } else if (filter == BILINEAR) {
+        const int16_t c0 = av1_bilinear_filters[phase][3];
+        const int16_t c1 = av1_bilinear_filters[phase][4];
+        scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, c0, c1);
+      } else {
+        const int buffer_stride = (dst_y_w + 1) & ~1;
+        const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+        uint8_t *const temp_buffer =
+            (uint8_t *)malloc((size_t)buffer_stride * (size_t)buffer_height);
+        if (!temp_buffer) {
+          malloc_failed = 1;
+          break;
+        }
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h, interp_kernel[phase], temp_buffer);
+        free(temp_buffer);
+      }
+    } else {
+      assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h);
+      // 4 to 3
+      // The stride is the luma width rounded up to a multiple of 6, plus the 2
+      // extra pixels the 4-to-3 kernels store per row.
+      const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+      const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc((size_t)buffer_stride * (size_t)buffer_height);
+      if (!temp_buffer) {
+        malloc_failed = 1;
+        break;
+      }
+      if (filter == BILINEAR) {
+        scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv],
+                                    dst->buffers[i], dst->strides[is_uv], dst_w,
+                                    dst_h, phase, temp_buffer);
+      } else {
+        const InterpKernel *interp_kernel =
+            (const InterpKernel *)av1_interp_filter_params_list[filter]
+                .filter_ptr;
+        scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+                                   dst->buffers[i], dst->strides[is_uv], dst_w,
+                                   dst_h, interp_kernel, phase, temp_buffer);
+      }
+      free(temp_buffer);
+    }
+  }
+
+  // A failed allocation may leave earlier planes already written; the C
+  // version is then run over all planes again.
+  if (malloc_failed) {
+    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+  } else {
+    aom_extend_frame_borders(dst, num_planes);
+  }
+}
+
+// Horizontal pass of the scaled 2-D convolution, producing 4x4 tiles.
+//
+// For each of the 4 output columns of a tile the source position advances by
+// x_step_q4. A column whose position has a non-zero sub-pixel part is
+// filtered with the matching kernel from x_filters; otherwise the source
+// samples at offset 3 of the window are copied. Columns are gathered in a
+// scratch array and de-interleaved into rows when written to dst.
+//
+// src, src_stride: source samples and row pitch.
+// dst, dst_stride: output and row pitch; rows are written four at a time, so
+//                  up to the next multiple of 4 at or above h may be written.
+// x0_q4, x_step_q4: start position and step; the low bits (SUBPEL_MASK)
+//                  select the kernel, the rest select the source column.
+// w, h:            output width and height.
+static INLINE void scaledconvolve_horiz_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  // Only the first 4 * 4 bytes are ever written, but vld4_u8() below loads
+  // four 8-byte vectors (32 bytes). Size the buffer for the full load and
+  // clear it once so the unused upper lanes come from defined, in-bounds
+  // memory.
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 8]);
+  int x, y, z;
+
+  memset(temp, 0, sizeof(temp));
+
+  // Step back to the first tap of the filter window.
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  y = h;
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8], d;
+          int16x8_t ss[4];
+          int16x4_t t[8], tt;
+
+          // Load 4 rows of 8 samples and transpose so that each tap position
+          // holds the 4 rows side by side.
+          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+          transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+          t[0] = vget_low_s16(ss[0]);
+          t[1] = vget_low_s16(ss[1]);
+          t[2] = vget_low_s16(ss[2]);
+          t[3] = vget_low_s16(ss[3]);
+          t[4] = vget_high_s16(ss[0]);
+          t[5] = vget_high_s16(ss[1]);
+          t[6] = vget_high_s16(ss[2]);
+          t[7] = vget_high_s16(ss[3]);
+
+          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+                           filters);
+          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+          store_u8_4x1(&temp[4 * z], d);
+        } else {
+          // Integer position: no filtering, copy the 4 rows directly.
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      {
+        // vld4_u8 de-interleaves with a stride of 4 bytes, so the low four
+        // lanes of val[r] are row r of the tile.
+        const uint8x8x4_t d4 = vld4_u8(temp);
+        store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]);
+        store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]);
+        store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]);
+        store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]);
+      }
+      x += 4;
+    } while (x < w);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    y -= 4;
+  } while (y > 0);
+}
+
+// Horizontal pass of the scaled 2-D convolution, producing 8x8 tiles.
+//
+// For each of the 8 output columns of a tile the source position advances by
+// x_step_q4. A column whose position has a non-zero sub-pixel part is
+// filtered with the matching kernel from x_filters; otherwise the source
+// samples at offset 3 of the window are copied. The tile is gathered
+// column-by-column in a scratch array and transposed on its way to dst.
+static INLINE void scaledconvolve_horiz_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  // Rows are consumed eight at a time; the intermediate height is not
+  // necessarily a multiple of 8, so round it up.
+  int rows_left = (h + 7) & ~7;
+
+  // Step back to the first tap of the filter window.
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  do {
+    int pos_q4 = x0_q4;
+    int col = 0;
+
+    do {
+      uint8x8_t tile[8];
+
+      // Gather 8 output columns, one per source step.
+      for (int k = 0; k < 8; ++k, pos_q4 += x_step_q4) {
+        const uint8_t *const src_x = &src[pos_q4 >> SUBPEL_BITS];
+        const int frac = pos_q4 & SUBPEL_MASK;
+
+        if (frac == 0) {
+          // Integer position: no filtering, copy the 8 rows directly.
+          for (int r = 0; r < 8; ++r) {
+            temp[k * 8 + r] = src_x[r * src_stride + 3];
+          }
+          continue;
+        }
+
+        const int16x8_t filters = vld1q_s16(x_filters[frac]);
+        uint8x8_t s[8];
+        load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+                    &s[5], &s[6], &s[7]);
+        transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4],
+                                       &s[5], &s[6], &s[7]);
+        vst1_u8(&temp[8 * k], scale_filter_8(s, filters));
+      }
+
+      // Turn the gathered columns back into rows and write the tile out.
+      load_u8_8x8(temp, 8, &tile[0], &tile[1], &tile[2], &tile[3], &tile[4],
+                  &tile[5], &tile[6], &tile[7]);
+      transpose_elems_inplace_u8_8x8(&tile[0], &tile[1], &tile[2], &tile[3],
+                                     &tile[4], &tile[5], &tile[6], &tile[7]);
+      store_u8_8x8(dst + col, dst_stride, tile[0], tile[1], tile[2], tile[3],
+                   tile[4], tile[5], tile[6], tile[7]);
+      col += 8;
+    } while (col < w);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+    rows_left -= 8;
+  } while (rows_left);
+}
+
+// Vertical pass of the scaled 2-D convolution for 4-wide output.
+//
+// One output row is produced per iteration; the source row position advances
+// by y_step_q4 each time. A position with a non-zero sub-pixel part is
+// filtered over 8 source rows with the matching kernel from y_filters,
+// otherwise w bytes are copied from the row at offset 3 of the window.
+static INLINE void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  // First row of the filter window for position 0.
+  const uint8_t *const window_top = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  int pos_q4 = y0_q4;
+  int rows_left = h;
+
+  do {
+    const unsigned char *src_y =
+        &window_top[(pos_q4 >> SUBPEL_BITS) * src_stride];
+    const int frac = pos_q4 & SUBPEL_MASK;
+
+    if (frac == 0) {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    } else {
+      const int16x8_t filters = vld1q_s16(y_filters[frac]);
+      uint8x8_t rows[8];
+      int16x4_t taps[8];
+
+      load_u8_8x8(src_y, src_stride, &rows[0], &rows[1], &rows[2], &rows[3],
+                  &rows[4], &rows[5], &rows[6], &rows[7]);
+      // Only the low four lanes of each row take part in the filter.
+      for (int i = 0; i < 8; ++i) {
+        taps[i] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(rows[i])));
+      }
+
+      const int16x4_t sum = convolve8_4(taps[0], taps[1], taps[2], taps[3],
+                                        taps[4], taps[5], taps[6], taps[7],
+                                        filters);
+      store_u8_4x1(dst, vqrshrun_n_s16(vcombine_s16(sum, sum), 7));
+    }
+
+    dst += dst_stride;
+    pos_q4 += y_step_q4;
+  } while (--rows_left);
+}
+
+// Vertical pass of the scaled 2-D convolution for 8-wide output.
+//
+// One output row is produced per iteration; the source row position advances
+// by y_step_q4 each time. A position with a non-zero sub-pixel part is
+// filtered over 8 source rows with the matching kernel from y_filters,
+// otherwise w bytes are copied from the row at offset 3 of the window.
+static INLINE void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  // First row of the filter window for position 0.
+  const uint8_t *const window_top = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  int pos_q4 = y0_q4;
+  int rows_left = h;
+
+  do {
+    const unsigned char *src_y =
+        &window_top[(pos_q4 >> SUBPEL_BITS) * src_stride];
+    const int frac = pos_q4 & SUBPEL_MASK;
+
+    if (frac == 0) {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    } else {
+      const int16x8_t filters = vld1q_s16(y_filters[frac]);
+      uint8x8_t rows[8];
+      load_u8_8x8(src_y, src_stride, &rows[0], &rows[1], &rows[2], &rows[3],
+                  &rows[4], &rows[5], &rows[6], &rows[7]);
+      vst1_u8(dst, scale_filter_8(rows, filters));
+    }
+
+    dst += dst_stride;
+    pos_q4 += y_step_q4;
+  } while (--rows_left);
+}
+
+// Vertical pass of the scaled 2-D convolution for widths handled 16 columns
+// at a time.
+//
+// One output row is produced per outer iteration; the source row position
+// advances by y_step_q4 each time. A position with a non-zero sub-pixel part
+// is filtered over 8 source rows, 16 columns per step, with the matching
+// kernel from y_filters; otherwise w bytes are copied from the row at offset
+// 3 of the window.
+static INLINE void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  // First row of the filter window for position 0.
+  const uint8_t *const window_top = src - src_stride * (SUBPEL_TAPS / 2 - 1);
+  int pos_q4 = y0_q4;
+  int rows_left = h;
+
+  do {
+    const unsigned char *src_y =
+        &window_top[(pos_q4 >> SUBPEL_BITS) * src_stride];
+    const int frac = pos_q4 & SUBPEL_MASK;
+
+    if (frac == 0) {
+      memcpy(dst, &src_y[3 * src_stride], w);
+    } else {
+      // The kernel is the same for every column of this output row.
+      const int16x8_t filters = vld1q_s16(y_filters[frac]);
+      int col = 0;
+
+      do {
+        uint8x16_t rows[8];
+        uint8x8_t lo[8], hi[8];
+
+        load_u8_16x8(src_y + col, src_stride, &rows[0], &rows[1], &rows[2],
+                     &rows[3], &rows[4], &rows[5], &rows[6], &rows[7]);
+        // Filter the two 8-column halves separately.
+        for (int i = 0; i < 8; ++i) {
+          lo[i] = vget_low_u8(rows[i]);
+          hi[i] = vget_high_u8(rows[i]);
+        }
+        const uint8x8_t out_lo = scale_filter_8(lo, filters);
+        const uint8x8_t out_hi = scale_filter_8(hi, filters);
+        vst1q_u8(&dst[col], vcombine_u8(out_lo, out_hi));
+
+        col += 16;
+      } while (col < w);
+    }
+
+    dst += dst_stride;
+    pos_q4 += y_step_q4;
+  } while (--rows_left);
+}
+
+// Scaled 2-D convolution of a w x h block.
+//
+// The work is split into two passes through a fixed-size intermediate buffer:
+//   (1) a horizontal pass interpolates src into temp;
+//   (2) a vertical pass interpolates temp into dst.
+//
+// The fixed size of temp limits the parameters. It holds (135 + 8) rows of 64
+// bytes, derived as follows:
+//   - The smallest normative scaling factor is x1/2, i.e. y_step_q4 = 32.
+//   - The largest block is 64x64 pixels.
+//   - 64 rows of the downscaled frame span (64 - 1) * 32 units (1/16 pixel)
+//     of the original frame, rounded up because the block may start at a
+//     sub-pixel position.
+//   - SUBPEL_TAPS further rows are needed for the 8-tap filter tails:
+//     (((64 - 1) * 32 + 15) >> 4) + 8 = 135.
+//   - 8 more rows cover the tail of the 8x8 transpose in the horizontal pass.
+// The frame scaling function can call this with a factor as small as x1/4
+// (y_step_q4 = 64); w and h are then at most 16, so temp is still big enough.
+void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int temp_stride = 64;
+  // Rows that sit above the output position inside the filter window.
+  const int rows_above = SUBPEL_TAPS / 2 - 1;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+  // The horizontal pass starts rows_above rows early, so the row aligned with
+  // the original src pointer lands rows_above rows into temp.
+  const uint8_t *const src_top = src - src_stride * rows_above;
+  const uint8_t *const temp_mid = temp + temp_stride * rows_above;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  // Pass 1: horizontal.
+  if (w < 8) {
+    scaledconvolve_horiz_w4(src_top, src_stride, temp, temp_stride, filter,
+                            x0_q4, x_step_q4, w, intermediate_height);
+  } else {
+    scaledconvolve_horiz_w8(src_top, src_stride, temp, temp_stride, filter,
+                            x0_q4, x_step_q4, w, intermediate_height);
+  }
+
+  // Pass 2: vertical. Any width that is neither 8 nor at least 16 takes the
+  // 4-wide path.
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp_mid, temp_stride, dst, dst_stride, filter,
+                            y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp_mid, temp_stride, dst, dst_stride, filter,
+                           y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp_mid, temp_stride, dst, dst_stride, filter,
+                           y0_q4, y_step_q4, w, h);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 0000000000..1d3a3cc038
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1595 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+// Processes a 4x4 tile (four vectors of four 32-bit lanes) and writes three
+// outputs, each addressed with a row pitch of buf_stride elements:
+//   src1    - receives the clamped intermediate value p for each element;
+//   dst_A16 - receives av1_x_by_xplus1[p] for each element;
+//   src2    - receives the final 32-bit result.
+//
+// Per element, with s = s0..s3, m = s4..s7 and r = sr4..sr7:
+//   p   = min(round_shift(max(s * const_n_val - m * m, 0) * s_vec,
+//                         SGRPROJ_MTABLE_BITS), const_val)
+//   a   = av1_x_by_xplus1[p]
+//   out = round_shift(r * one_by_n_minus_1_vec * (sgrproj_sgr - a),
+//                     SGRPROJ_RECIP_BITS)
+// All products are plain 32-bit lane multiplies (vmulq_u32).
+static INLINE void calc_ab_fast_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+ int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+ uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+ uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+ const int buf_stride) {
+ uint32x4_t q0, q1, q2, q3;
+ uint32x4_t p0, p1, p2, p3;
+ uint16x4_t d0, d1, d2, d3;
+
+ // s * const_n_val
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+
+ // m * m
+ q0 = vmulq_u32(s4, s4);
+ q1 = vmulq_u32(s5, s5);
+ q2 = vmulq_u32(s6, s6);
+ q3 = vmulq_u32(s7, s7);
+
+ // Lane mask: all ones where m * m <= s * const_n_val, else zero.
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+
+ // Masking the difference zeroes the lanes where the subtraction wrapped,
+ // giving max(s * const_n_val - m * m, 0).
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+ // Clamp to const_val; the result is used as an index into av1_x_by_xplus1
+ // below.
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+
+ {
+ // The table lookup is done in scalar code: spill the indices to src1, look
+ // each one up into dst_A16, then reload the 4x4 tile of results.
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+ // Here x is the row within the tile and y the column.
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 4; y++) {
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+ }
+ // sgrproj_sgr - a, widened to 32 bits.
+ p0 = vsubl_u16(sgrproj_sgr, d0);
+ p1 = vsubl_u16(sgrproj_sgr, d1);
+ p2 = vsubl_u16(sgrproj_sgr, d2);
+ p3 = vsubl_u16(sgrproj_sgr, d3);
+
+ // r * one_by_n_minus_1_vec, with r reinterpreted as unsigned.
+ s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+ s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+ s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+ s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+ s4 = vmulq_u32(s4, p0);
+ s5 = vmulq_u32(s5, p1);
+ s6 = vmulq_u32(s6, p2);
+ s7 = vmulq_u32(s7, p3);
+
+ p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+}
+// Processes a tile of 4 rows by 8 columns and writes three outputs, each
+// addressed with a row pitch of buf_stride elements:
+//   src1    - receives the clamped intermediate value p for each element;
+//   dst_A16 - receives av1_x_by_xplus1[p] for each element;
+//   dst2    - receives the final 32-bit result.
+//
+// s0..s3 pair with the low halves of the 16-bit row vectors (columns 0-3) and
+// s4..s7 with the high halves (columns 4-7). Per element, with s from s0..s7,
+// m from s16_4..s16_7 and r from s16_0..s16_3:
+//   p   = min(round_shift(max(s * const_n_val - m * m, 0) * s_vec,
+//                         SGRPROJ_MTABLE_BITS), const_val)
+//   a   = av1_x_by_xplus1[p]
+//   out = round_shift(r * one_by_n_minus_1_vec * (sgrproj_sgr - a),
+//                     SGRPROJ_RECIP_BITS)
+static INLINE void calc_ab_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
+ uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
+ uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
+ uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
+ uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
+ uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
+ uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+
+ // s * const_n_val
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+ s4 = vmulq_u32(s4, const_n_val);
+ s5 = vmulq_u32(s5, const_n_val);
+ s6 = vmulq_u32(s6, const_n_val);
+ s7 = vmulq_u32(s7, const_n_val);
+
+ // Split each 8-lane row of m into its low (d0..d3) and high (d4..d7) halves.
+ d0 = vget_low_u16(s16_4);
+ d1 = vget_low_u16(s16_5);
+ d2 = vget_low_u16(s16_6);
+ d3 = vget_low_u16(s16_7);
+ d4 = vget_high_u16(s16_4);
+ d5 = vget_high_u16(s16_5);
+ d6 = vget_high_u16(s16_6);
+ d7 = vget_high_u16(s16_7);
+
+ // m * m, widened to 32 bits.
+ q0 = vmull_u16(d0, d0);
+ q1 = vmull_u16(d1, d1);
+ q2 = vmull_u16(d2, d2);
+ q3 = vmull_u16(d3, d3);
+ q4 = vmull_u16(d4, d4);
+ q5 = vmull_u16(d5, d5);
+ q6 = vmull_u16(d6, d6);
+ q7 = vmull_u16(d7, d7);
+
+ // Lane mask: all ones where m * m <= s * const_n_val, else zero.
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+ p4 = vcleq_u32(q4, s4);
+ p5 = vcleq_u32(q5, s5);
+ p6 = vcleq_u32(q6, s6);
+ p7 = vcleq_u32(q7, s7);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+ q4 = vsubq_u32(s4, q4);
+ q5 = vsubq_u32(s5, q5);
+ q6 = vsubq_u32(s6, q6);
+ q7 = vsubq_u32(s7, q7);
+
+ // Masking the difference zeroes the lanes where the subtraction wrapped,
+ // giving max(s * const_n_val - m * m, 0).
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+ p4 = vandq_u32(p4, q4);
+ p5 = vandq_u32(p5, q5);
+ p6 = vandq_u32(p6, q6);
+ p7 = vandq_u32(p7, q7);
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+ p4 = vmulq_u32(p4, s_vec);
+ p5 = vmulq_u32(p5, s_vec);
+ p6 = vmulq_u32(p6, s_vec);
+ p7 = vmulq_u32(p7, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+ p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+ p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+ p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+ p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+
+ // Clamp to const_val; the result is used as an index into av1_x_by_xplus1
+ // below.
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+ p4 = vminq_u32(p4, const_val);
+ p5 = vminq_u32(p5, const_val);
+ p6 = vminq_u32(p6, const_val);
+ p7 = vminq_u32(p7, const_val);
+
+ {
+ // The table lookup is done in scalar code: spill the indices to src1
+ // (columns 0-3, then columns 4-7), look each one up into dst_A16, then
+ // reload the four 8-wide rows of results into s16_4..s16_7.
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+ store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
+
+ // Here x is the row within the tile and y the column.
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 8; y++) {
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
+ }
+
+ // sgrproj_sgr - a, still 16 bits wide.
+ s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
+ s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+ s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
+ s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
+
+ // r * one_by_n_minus_1_vec, widened to 32 bits; s0..s3 take the low halves
+ // and s4..s7 the high halves of each row.
+ s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
+ s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+ s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+ s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+ s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
+ s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+ s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+ s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+
+ s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
+ s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
+ s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
+ s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+ s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+ s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+ s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+ s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+
+ p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+ p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ // Columns 0-3 of the four rows, then columns 4-7.
+ store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+ store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
+ vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+ vreinterpretq_s32_u32(p7));
+}
+
+// Squares eleven consecutive rows (t1..t11, four 16-bit lanes each) and
+// returns four overlapping 5-row sums of those squares, two rows apart:
+//   *r0 = rows 1..5, *r1 = rows 3..7, *r2 = rows 5..9, *r3 = rows 7..11.
+// Partial sums are shared between the four windows to save additions.
+static INLINE void boxsum2_square_sum_calc(
+    int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
+    int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+    int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
+  // Widening squares: 16-bit lanes -> 32-bit lanes.
+  const int32x4_t sq1 = vmull_s16(t1, t1);
+  const int32x4_t sq2 = vmull_s16(t2, t2);
+  const int32x4_t sq3 = vmull_s16(t3, t3);
+  const int32x4_t sq4 = vmull_s16(t4, t4);
+  const int32x4_t sq5 = vmull_s16(t5, t5);
+  const int32x4_t sq6 = vmull_s16(t6, t6);
+  const int32x4_t sq7 = vmull_s16(t7, t7);
+  const int32x4_t sq8 = vmull_s16(t8, t8);
+  const int32x4_t sq9 = vmull_s16(t9, t9);
+  const int32x4_t sq10 = vmull_s16(t10, t10);
+  const int32x4_t sq11 = vmull_s16(t11, t11);
+
+  // Pairwise building blocks.
+  const int32x4_t sum_1_2 = vaddq_s32(sq1, sq2);
+  const int32x4_t sum_3_4 = vaddq_s32(sq3, sq4);
+  const int32x4_t sum_6_7 = vaddq_s32(sq6, sq7);
+  const int32x4_t sum_8_9 = vaddq_s32(sq8, sq9);
+  const int32x4_t sum_10_11 = vaddq_s32(sq10, sq11);
+
+  // Wider partial sums reused by more than one output.
+  const int32x4_t sum_3_to_5 = vaddq_s32(sum_3_4, sq5);
+  const int32x4_t sum_6_to_9 = vaddq_s32(sum_6_7, sum_8_9);
+  // Rows 7..9 are obtained by removing row 6 from rows 6..9.
+  const int32x4_t sum_7_to_9 = vsubq_s32(sum_6_to_9, sq6);
+
+  *r0 = vaddq_s32(sum_1_2, sum_3_to_5);
+  *r1 = vaddq_s32(sum_6_7, sum_3_to_5);
+  *r2 = vaddq_s32(sq5, sum_6_to_9);
+  *r3 = vaddq_s32(sum_7_to_9, sum_10_11);
+}
+
+// Two-stage 5x5 box sum over `src`, producing sums of pixels and sums of
+// squared pixels on every second output row.
+//   src / src_stride : 16-bit input rows.
+//   dst16            : scratch for the stage-1 (vertical) 16-bit sums.
+//   dst32            : final 32-bit sums (written by stage 2).
+//   dst2             : sums of squares; written by stage 1 and then
+//                      overwritten by stage 2 with the final values.
+//   dst_stride       : row stride shared by dst16, dst32 and dst2.
+//   width / height   : extent to process; both must exceed twice the
+//                      SGRPROJ border (see the asserts).
+// Stage 1 consumes the input in 8-column strips and 8-row steps; stage 2
+// consumes the stage-1 output in 4-column steps, four alternate rows at a
+// time. Both loops are do/while, so they may run past width/height up to the
+// next multiple of the step; NOTE(review): callers presumably size and pad
+// the buffers for this - confirm.
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
+ int32_t *dst32, int32_t *dst2, const int dst_stride,
+ const int width, const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *dst1_16_ptr, *src_ptr;
+ int32_t *dst2_ptr;
+ int h, w, count = 0;
+ // Outputs are produced on alternate rows, so rows are stepped two at a
+ // time (dst_stride_2) and four outputs span eight rows (dst_stride_8).
+ const int dst_stride_2 = (dst_stride << 1);
+ const int dst_stride_8 = (dst_stride << 3);
+
+ dst1_16_ptr = dst16;
+ dst2_ptr = dst2;
+ src_ptr = src;
+ w = width;
+ // Stage 1: vertical 5-row sums (and sums of squares), one 8-column strip
+ // per outer iteration.
+ {
+ int16x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t t8, t9, t10, t11, t12;
+
+ int16x8_t q12345, q56789, q34567, q7891011;
+ int16x8_t q12, q34, q67, q89, q1011;
+ int16x8_t q345, q6789, q789;
+
+ int32x4_t r12345, r56789, r34567, r7891011;
+
+ do {
+ h = height;
+ // `count` indexes the current 8-column strip.
+ dst1_16_ptr = dst16 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+
+ // The first output row is row 2: it is the centre of input rows 0..4.
+ dst1_16_ptr += dst_stride_2;
+ dst2_ptr += dst_stride_2;
+ do {
+ // Load 12 rows but advance src_ptr by only 8, so the last four rows
+ // are loaded again as t1..t4 on the next iteration. t12 is loaded but
+ // not used by the sums below.
+ load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);
+
+ // Four 5-row windows two rows apart (rows 1-5, 3-7, 5-9, 7-11), built
+ // from shared partial sums.
+ q12 = vaddq_s16(t1, t2);
+ q34 = vaddq_s16(t3, t4);
+ q67 = vaddq_s16(t6, t7);
+ q89 = vaddq_s16(t8, t9);
+ q1011 = vaddq_s16(t10, t11);
+ q345 = vaddq_s16(q34, t5);
+ q6789 = vaddq_s16(q67, q89);
+ q789 = vaddq_s16(q89, t7);
+ q12345 = vaddq_s16(q12, q345);
+ q34567 = vaddq_s16(q67, q345);
+ q56789 = vaddq_s16(t5, q6789);
+ q7891011 = vaddq_s16(q789, q1011);
+
+ store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
+ q7891011);
+ dst1_16_ptr += dst_stride_8;
+
+ // Same four windows for the squared values, low four lanes first.
+ boxsum2_square_sum_calc(
+ vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+ vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+ vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+ vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+ // ... then the high four lanes, stored four columns to the right.
+ boxsum2_square_sum_calc(
+ vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+ vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+ vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+ vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+ r7891011);
+ dst2_ptr += (dst_stride_8);
+ h -= 8;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+
+ // memset needed for row pixels as 2nd stage of boxsum filter uses
+ // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns as 2nd stage of boxsum filter uses
+ // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
+ // Note: three elements are cleared per row, starting at column width + 2.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
+ }
+
+ // Stage 2: horizontal 5-column sums of the stage-1 results. Each 4x4 tile
+ // (four alternate rows by four columns) is transposed so that columns become
+ // vectors, summed, and transposed back before storing.
+ {
+ int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int32x4_t q12345, q34567, q23456, q45678;
+ int32x4_t q23, q45, q67;
+ int32x4_t q2345, q4567;
+
+ int32x4_t r12345, r34567, r23456, r45678;
+ int32x4_t r23, r45, r67;
+ int32x4_t r2345, r4567;
+
+ int32_t *src2_ptr, *dst1_32_ptr;
+ int16_t *src1_ptr;
+ count = 0;
+ h = height;
+ do {
+ // `count` now indexes a band of four alternate rows (eight rows tall),
+ // starting at row 2. dst2 is both source (src2_ptr) and destination
+ // (dst2_ptr) for the sums of squares.
+ dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
+ dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+ src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ w = width;
+
+ // Outputs start at column 2, the centre of input columns 0..4.
+ dst1_32_ptr += 2;
+ dst2_ptr += 2;
+ // Prime the window with the first four columns.
+ load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+ transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+ transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4);
+ do {
+ // Fetch the next four columns before anything is stored, so the
+ // in-place update of dst2 never reads a value it has already
+ // overwritten.
+ src1_ptr += 4;
+ src2_ptr += 4;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
+ transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
+ transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8);
+ // Four adjacent 5-column windows (columns 1-5, 2-6, 3-7, 4-8), widened
+ // to 32 bits as they are accumulated.
+ q23 = vaddl_s16(s2, s3);
+ q45 = vaddl_s16(s4, s5);
+ q67 = vaddl_s16(s6, s7);
+ q2345 = vaddq_s32(q23, q45);
+ q4567 = vaddq_s32(q45, q67);
+ q12345 = vaddq_s32(vmovl_s16(s1), q2345);
+ q23456 = vaddq_s32(q2345, vmovl_s16(s6));
+ q34567 = vaddq_s32(q4567, vmovl_s16(s3));
+ q45678 = vaddq_s32(q4567, vmovl_s16(s8));
+
+ transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+ store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+ q45678);
+ dst1_32_ptr += 4;
+ // Slide the window: the newest four columns become the oldest four.
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ // Same four windows for the sums of squares.
+ r23 = vaddq_s32(d2, d3);
+ r45 = vaddq_s32(d4, d5);
+ r67 = vaddq_s32(d6, d7);
+ r2345 = vaddq_s32(r23, r45);
+ r4567 = vaddq_s32(r45, r67);
+ r12345 = vaddq_s32(d1, r2345);
+ r23456 = vaddq_s32(r2345, d6);
+ r34567 = vaddq_s32(r4567, d3);
+ r45678 = vaddq_s32(r4567, d8);
+
+ transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
+ dst2_ptr += 4;
+ d1 = d5;
+ d2 = d6;
+ d3 = d7;
+ d4 = d8;
+ w -= 4;
+ } while (w > 0);
+ h -= 8;
+ count++;
+ } while (h > 0);
+ }
+}
+
+// 8-bit driver for calc_ab_internal_common(). Walks the buffers in tiles of
+// 4 rows by 8 columns and hands each tile to the common helper.
+//   A   : 32-bit values loaded per tile (the caller in this file passes the
+//         sum-of-squares buffer); also forwarded to the helper as a pointer.
+//   A16 : 16-bit buffer forwarded to the helper.
+//   B16 : 16-bit values loaded per tile (the caller passes the box sums).
+//   B   : 32-bit buffer forwarded to the helper.
+//   r   : box radius; n = (2r + 1)^2 is the number of pixels in the box.
+//   s   : scalar broadcast to all lanes and forwarded to the helper.
+//   ht_inc : each pass over a 4-row band accounts for ht_inc * 4 rows of
+//         `height`.
+// No bit-depth scaling is needed here, so the 16-bit sums are passed to the
+// helper twice, unmodified.
+static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
+                                        uint16_t *B16, int32_t *B,
+                                        const int buf_stride, const int width,
+                                        const int height, const int r,
+                                        const int s, const int ht_inc) {
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);
+  const uint32x4_t n_vec = vdupq_n_u32(n);
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  const uint32x4_t max_vec = vdupq_n_u32(255);
+  const uint16x4_t one_by_n_vec = vdup_n_u16(av1_one_by_x[n - 1]);
+  const uint16x8_t sgr_vec = vdupq_n_u16(SGRPROJ_SGR);
+
+  int32_t band = 0;
+  int rows_left = height;
+
+  do {
+    // Start of the current 4-row band in each buffer.
+    const int band_offset = (band << 2) * buf_stride;
+    int32_t *a_ptr = A + band_offset;
+    uint16_t *a16_ptr = A16 + band_offset;
+    uint16_t *b16_ptr = B16 + band_offset;
+    int32_t *b_ptr = B + band_offset;
+    int cols_left = width;
+
+    do {
+      uint32x4_t sq0, sq1, sq2, sq3, sq4, sq5, sq6, sq7;
+      uint16x8_t sum0, sum1, sum2, sum3;
+
+      // Columns 0..3 then columns 4..7 of the four rows.
+      load_u32_4x4((uint32_t *)a_ptr, buf_stride, &sq0, &sq1, &sq2, &sq3);
+      load_u32_4x4((uint32_t *)a_ptr + 4, buf_stride, &sq4, &sq5, &sq6, &sq7);
+      load_u16_8x4(b16_ptr, buf_stride, &sum0, &sum1, &sum2, &sum3);
+
+      calc_ab_internal_common(sq0, sq1, sq2, sq3, sq4, sq5, sq6, sq7, sum0,
+                              sum1, sum2, sum3, sum0, sum1, sum2, sum3, n_vec,
+                              s_vec, max_vec, one_by_n_vec, sgr_vec, a_ptr,
+                              a16_ptr, b_ptr, buf_stride);
+
+      a_ptr += 8;
+      a16_ptr += 8;
+      b16_ptr += 8;
+      b_ptr += 8;
+      cols_left -= 8;
+    } while (cols_left > 0);
+
+    ++band;
+    rows_left -= ht_inc * 4;
+  } while (rows_left > 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bit-depth driver for calc_ab_internal_common(). Identical tiling to
+// the 8-bit variant (4 rows by 8 columns per helper call), but the loaded
+// values are first scaled with a rounding right shift:
+//   - 32-bit values from A by 2 * (bit_depth - 8) bits,
+//   - 16-bit values from B16 by (bit_depth - 8) bits.
+// The helper receives both the unscaled and the scaled 16-bit values.
+// The shifts are expressed as vrshl with a negative count.
+static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
+                                        uint16_t *B16, int32_t *B,
+                                        const int buf_stride, const int width,
+                                        const int height, const int bit_depth,
+                                        const int r, const int s,
+                                        const int ht_inc) {
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);
+  const int16x8_t sum_shift = vdupq_n_s16(-(bit_depth - 8));
+  const int32x4_t sq_shift = vdupq_n_s32(-((bit_depth - 8) << 1));
+  const uint32x4_t n_vec = vdupq_n_u32(n);
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  const uint32x4_t max_vec = vdupq_n_u32(255);
+  const uint16x4_t one_by_n_vec = vdup_n_u16(av1_one_by_x[n - 1]);
+  const uint16x8_t sgr_vec = vdupq_n_u16(SGRPROJ_SGR);
+
+  int32_t band = 0;
+  int rows_left = height;
+
+  do {
+    // Start of the current 4-row band in each buffer.
+    const int band_offset = (band << 2) * buf_stride;
+    int32_t *a_ptr = A + band_offset;
+    uint16_t *b16_ptr = B16 + band_offset;
+    int32_t *b_ptr = B + band_offset;
+    uint16_t *a16_ptr = A16 + band_offset;
+    int cols_left = width;
+
+    do {
+      int32x4_t raw[8];
+      uint32x4_t sq[8];
+      uint16x8_t sum[4], sum_scaled[4];
+
+      load_s32_4x4(a_ptr, buf_stride, &raw[0], &raw[1], &raw[2], &raw[3]);
+      load_s32_4x4(a_ptr + 4, buf_stride, &raw[4], &raw[5], &raw[6], &raw[7]);
+      load_u16_8x4(b16_ptr, buf_stride, &sum[0], &sum[1], &sum[2], &sum[3]);
+
+      for (int i = 0; i < 8; ++i) {
+        sq[i] = vrshlq_u32(vreinterpretq_u32_s32(raw[i]), sq_shift);
+      }
+      for (int i = 0; i < 4; ++i) {
+        sum_scaled[i] = vrshlq_u16(sum[i], sum_shift);
+      }
+
+      calc_ab_internal_common(sq[0], sq[1], sq[2], sq[3], sq[4], sq[5], sq[6],
+                              sq[7], sum[0], sum[1], sum[2], sum[3],
+                              sum_scaled[0], sum_scaled[1], sum_scaled[2],
+                              sum_scaled[3], n_vec, s_vec, max_vec,
+                              one_by_n_vec, sgr_vec, a_ptr, a16_ptr, b_ptr,
+                              buf_stride);
+
+      a_ptr += 8;
+      b16_ptr += 8;
+      b_ptr += 8;
+      a16_ptr += 8;
+      cols_left -= 8;
+    } while (cols_left > 0);
+
+    ++band;
+    rows_left -= ht_inc * 4;
+  } while (rows_left > 0);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// 8-bit driver for calc_ab_fast_internal_common(). Walks the buffers in tiles
+// of 4 rows by 4 columns; both A and B are 32-bit and are loaded per tile and
+// also forwarded to the helper as pointers, together with A16.
+//   r      : box radius; n = (2r + 1)^2.
+//   s      : scalar broadcast to all lanes and forwarded to the helper.
+//   ht_inc : each 4-row band accounts for ht_inc * 4 rows of `height`.
+// The values loaded from B are passed to the helper both reinterpreted as
+// unsigned and in their original signed form.
+static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
+                                             int32_t *B, const int buf_stride,
+                                             const int width, const int height,
+                                             const int r, const int s,
+                                             const int ht_inc) {
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);
+  const uint32x4_t n_vec = vdupq_n_u32(n);
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  const uint32x4_t max_vec = vdupq_n_u32(255);
+  const uint32x4_t one_by_n_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
+  const uint16x4_t sgr_vec = vdup_n_u16(SGRPROJ_SGR);
+
+  int32_t band = 0;
+  int rows_left = height;
+
+  do {
+    // Start of the current 4-row band in each buffer.
+    const int band_offset = (band << 2) * buf_stride;
+    int32_t *a_ptr = A + band_offset;
+    int32_t *b_ptr = B + band_offset;
+    uint16_t *a16_ptr = A16 + band_offset;
+    int cols_left = width;
+
+    do {
+      int32x4_t sq0, sq1, sq2, sq3;
+      int32x4_t sum0, sum1, sum2, sum3;
+
+      load_s32_4x4(a_ptr, buf_stride, &sq0, &sq1, &sq2, &sq3);
+      load_s32_4x4(b_ptr, buf_stride, &sum0, &sum1, &sum2, &sum3);
+
+      calc_ab_fast_internal_common(
+          vreinterpretq_u32_s32(sq0), vreinterpretq_u32_s32(sq1),
+          vreinterpretq_u32_s32(sq2), vreinterpretq_u32_s32(sq3),
+          vreinterpretq_u32_s32(sum0), vreinterpretq_u32_s32(sum1),
+          vreinterpretq_u32_s32(sum2), vreinterpretq_u32_s32(sum3), sum0, sum1,
+          sum2, sum3, n_vec, s_vec, max_vec, one_by_n_vec, sgr_vec, a_ptr,
+          a16_ptr, b_ptr, buf_stride);
+
+      a_ptr += 4;
+      b_ptr += 4;
+      a16_ptr += 4;
+      cols_left -= 4;
+    } while (cols_left > 0);
+
+    ++band;
+    rows_left -= ht_inc * 4;
+  } while (rows_left > 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bit-depth driver for calc_ab_fast_internal_common(). Same 4x4 tiling
+// as the 8-bit variant, but the loaded values are scaled with a rounding
+// right shift before use:
+//   - values from A by 2 * (bit_depth - 8) bits,
+//   - values from B by (bit_depth - 8) bits.
+// The helper additionally receives the unscaled, signed values from B.
+// The shifts are expressed as vrshl with a negative count.
+static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
+                                             int32_t *B, const int buf_stride,
+                                             const int width, const int height,
+                                             const int bit_depth, const int r,
+                                             const int s, const int ht_inc) {
+  const uint32_t n = (2 * r + 1) * (2 * r + 1);
+  const int32x4_t sum_shift = vdupq_n_s32(-(bit_depth - 8));
+  const int32x4_t sq_shift = vdupq_n_s32(-((bit_depth - 8) << 1));
+  const uint32x4_t n_vec = vdupq_n_u32(n);
+  const uint32x4_t s_vec = vdupq_n_u32(s);
+  const uint32x4_t max_vec = vdupq_n_u32(255);
+  const uint32x4_t one_by_n_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
+  const uint16x4_t sgr_vec = vdup_n_u16(SGRPROJ_SGR);
+
+  int32_t band = 0;
+  int rows_left = height;
+
+  do {
+    // Start of the current 4-row band in each buffer.
+    const int band_offset = (band << 2) * buf_stride;
+    int32_t *a_ptr = A + band_offset;
+    int32_t *b_ptr = B + band_offset;
+    uint16_t *a16_ptr = A16 + band_offset;
+    int cols_left = width;
+
+    do {
+      int32x4_t sq_raw[4], sum_raw[4];
+      uint32x4_t sq[4], sum[4];
+
+      load_s32_4x4(a_ptr, buf_stride, &sq_raw[0], &sq_raw[1], &sq_raw[2],
+                   &sq_raw[3]);
+      load_s32_4x4(b_ptr, buf_stride, &sum_raw[0], &sum_raw[1], &sum_raw[2],
+                   &sum_raw[3]);
+
+      for (int i = 0; i < 4; ++i) {
+        sq[i] = vrshlq_u32(vreinterpretq_u32_s32(sq_raw[i]), sq_shift);
+        sum[i] = vrshlq_u32(vreinterpretq_u32_s32(sum_raw[i]), sum_shift);
+      }
+
+      calc_ab_fast_internal_common(sq[0], sq[1], sq[2], sq[3], sum[0], sum[1],
+                                   sum[2], sum[3], sum_raw[0], sum_raw[1],
+                                   sum_raw[2], sum_raw[3], n_vec, s_vec,
+                                   max_vec, one_by_n_vec, sgr_vec, a_ptr,
+                                   a16_ptr, b_ptr, buf_stride);
+
+      a_ptr += 4;
+      b_ptr += 4;
+      a16_ptr += 4;
+      cols_left -= 4;
+    } while (cols_left > 0);
+
+    ++band;
+    rows_left -= ht_inc * 4;
+  } while (rows_left > 0);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Two-stage 3x3 box sum over `src`, producing sums of pixels (dst1, 16-bit)
+// and sums of squared pixels (dst2, 32-bit) for every row.
+//   src / src_stride : 16-bit input rows.
+//   dst1, dst2       : outputs; stage 1 writes the vertical sums into them
+//                      and stage 2 then updates them in place with the
+//                      horizontal sums.
+//   dst_stride       : row stride shared by dst1 and dst2.
+//   width / height   : extent to process; both must exceed twice the
+//                      SGRPROJ border (see the asserts).
+// Stage 1 works on 8-column strips, four rows per step; stage 2 works on
+// 4-row bands, four columns per step. Both loops are do/while, so they may
+// run past width/height up to the next multiple of the step;
+// NOTE(review): callers presumably size and pad the buffers for this -
+// confirm.
+static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
+ int32_t *dst2, const int dst_stride, const int width,
+ const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *src_ptr;
+ int32_t *dst2_ptr;
+ uint16_t *dst1_ptr;
+ int h, w, count = 0;
+
+ w = width;
+ // Stage 1: vertical 3-row sums (and sums of squares), one 8-column strip
+ // per outer iteration.
+ {
+ int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int16x8_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r345, r456, r567, r78, r678;
+ int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+ int32x4_t r2, r3, r5, r6, r7, r8;
+ int16x8_t q678, q78;
+
+ do {
+ // `count` indexes the current 8-column strip.
+ dst1_ptr = dst1 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+ h = height;
+
+ // Prime the window with the first four rows; s1 is loaded but only
+ // s2..s4 feed the sums below.
+ load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ src_ptr += 4 * src_stride;
+
+ q23 = vaddq_s16(s2, s3);
+ q234 = vaddq_s16(q23, s4);
+ q34 = vaddq_s16(s3, s4);
+ // The first output row written is row 2.
+ dst1_ptr += (dst_stride << 1);
+
+ // Squares are widened to 32 bits, so the low and high halves of each
+ // 8-lane row are tracked separately.
+ r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
+ r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+ r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_low = vaddq_s32(r23, r4_low);
+ r34_low = vaddq_s32(r3, r4_low);
+
+ r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
+ r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+ r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_high = vaddq_s32(r23, r4_high);
+ r34_high = vaddq_s32(r3, r4_high);
+
+ dst2_ptr += (dst_stride << 1);
+
+ do {
+ // Bring in the next four rows and emit four 3-row windows
+ // (rows 2-4, 3-5, 4-6, 5-7 of the current 8-row view).
+ load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+ src_ptr += 4 * src_stride;
+
+ q345 = vaddq_s16(s5, q34);
+ q56 = vaddq_s16(s5, s6);
+ q456 = vaddq_s16(s4, q56);
+ q567 = vaddq_s16(s7, q56);
+ q78 = vaddq_s16(s7, s8);
+ q678 = vaddq_s16(s6, q78);
+
+ store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += (dst_stride << 2);
+
+ // Slide the window: rows 6-8 take the place of rows 2-4 for the next
+ // iteration.
+ s4 = s8;
+ q34 = q78;
+ q234 = q678;
+
+ // Sums of squares, low four lanes.
+ r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
+ r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+ r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+ r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_low);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_low, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+ r4_low = r8;
+ r34_low = r78;
+ r234_low = r678;
+
+ // Sums of squares, high four lanes (stored four columns to the right).
+ r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
+ r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+ r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+ r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_high);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_high, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
+ dst2_ptr += (dst_stride << 2);
+
+ r4_high = r8;
+ r34_high = r78;
+ r234_high = r678;
+
+ h -= 4;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+
+ // memset needed for row pixels as 2nd stage of boxsum filter uses
+ // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns as 2nd stage of boxsum filter uses
+ // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
+ // Note: three elements are cleared per row, starting at column width + 2.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
+ }
+
+ // Stage 2: horizontal 3-column sums of the stage-1 results, updated in
+ // place. Each 4x4 tile is transposed so that columns become vectors, summed,
+ // and transposed back before storing.
+ {
+ int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int16x4_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+ int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+ int16x4_t q678, q78;
+
+ int32_t *src2_ptr;
+ uint16_t *src1_ptr;
+ count = 0;
+ h = height;
+ w = width;
+ do {
+ // `count` now indexes a band of four rows. Source and destination
+ // pointers refer to the same buffers.
+ dst1_ptr = dst1 + (count << 2) * dst_stride;
+ dst2_ptr = dst2 + (count << 2) * dst_stride;
+ src1_ptr = dst1 + (count << 2) * dst_stride;
+ src2_ptr = dst2 + (count << 2) * dst_stride;
+ w = width;
+
+ // Prime the window with the first four columns.
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
+ transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4);
+ load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+ transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q23 = vadd_s16(d2, d3);
+ q234 = vadd_s16(q23, d4);
+ q34 = vadd_s16(d3, d4);
+ // Outputs start at column 2.
+ dst1_ptr += 2;
+ r23 = vaddq_s32(r2, r3);
+ r234 = vaddq_s32(r23, r4);
+ r34 = vaddq_s32(r3, r4);
+ dst2_ptr += 2;
+
+ do {
+ // The next four columns are loaded before the stores below, so the
+ // in-place update never reads a value it has already overwritten.
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
+ transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8);
+ load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+ transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q345 = vadd_s16(d5, q34);
+ q56 = vadd_s16(d5, d6);
+ q456 = vadd_s16(d4, q56);
+ q567 = vadd_s16(d7, q56);
+ q78 = vadd_s16(d7, d8);
+ q678 = vadd_s16(d6, q78);
+ transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567);
+ store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += 4;
+
+ // Slide the window. q234 was transposed in place above, but it is
+ // replaced here before it is read again.
+ d4 = d8;
+ q34 = q78;
+ q234 = q678;
+
+ // Same windows for the sums of squares.
+ r345 = vaddq_s32(r5, r34);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567);
+ store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+ dst2_ptr += 4;
+
+ r4 = r8;
+ r34 = r78;
+ r234 = r678;
+ w -= 4;
+ } while (w > 0);
+ h -= 4;
+ count++;
+ } while (h > 0);
+ }
+}
+
+// Weighted 3x3 neighbourhood sum for four adjacent 32-bit elements starting
+// at `buf`: the centre and its four edge neighbours are weighted by 4, the
+// four corner neighbours by 3. Reads one row above and below `buf` and one
+// element to the left and right of the 4-element span.
+static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
+  const int32_t *const above = buf - buf_stride;
+  const int32_t *const below = buf + buf_stride;
+
+  const int32x4_t top_left = vld1q_s32(above - 1);
+  const int32x4_t top = vld1q_s32(above);
+  const int32x4_t top_right = vld1q_s32(above + 1);
+  const int32x4_t left = vld1q_s32(buf - 1);
+  const int32x4_t centre = vld1q_s32(buf);
+  const int32x4_t right = vld1q_s32(buf + 1);
+  const int32x4_t bottom_left = vld1q_s32(below - 1);
+  const int32x4_t bottom = vld1q_s32(below);
+  const int32x4_t bottom_right = vld1q_s32(below + 1);
+
+  // Weight-4 group: centre plus edge neighbours.
+  int32x4_t weight4 = vaddq_s32(bottom, centre);
+  weight4 = vaddq_s32(right, weight4);
+  weight4 = vaddq_s32(top, weight4);
+  weight4 = vaddq_s32(left, weight4);
+
+  // Weight-3 group: corner neighbours.
+  int32x4_t weight3 = vaddq_s32(bottom_right, bottom_left);
+  weight3 = vaddq_s32(top_right, weight3);
+  weight3 = vaddq_s32(top_left, weight3);
+
+  // 4 * (weight4 + weight3) - weight3 == 4 * weight4 + 3 * weight3.
+  const int32x4_t all_x4 = vshlq_n_s32(vaddq_s32(weight4, weight3), 2);
+  return vsubq_s32(all_x4, weight3);
+}
+
+// 16-bit-input counterpart of cross_sum_inp_s32 for eight adjacent elements:
+// centre and edge neighbours weighted by 4, corner neighbours by 3. The
+// weighting is done in 16-bit lanes; only the final addition of the two
+// weighted groups is widened to 32 bits. *a0 receives lanes 0..3 and *a1
+// lanes 4..7.
+static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
+                                     int32x4_t *a0, int32x4_t *a1) {
+  const uint16_t *const above = buf - buf_stride;
+  const uint16_t *const below = buf + buf_stride;
+
+  const uint16x8_t top_left = vld1q_u16(above - 1);
+  const uint16x8_t top = vld1q_u16(above);
+  const uint16x8_t top_right = vld1q_u16(above + 1);
+  const uint16x8_t left = vld1q_u16(buf - 1);
+  const uint16x8_t centre = vld1q_u16(buf);
+  const uint16x8_t right = vld1q_u16(buf + 1);
+  const uint16x8_t bottom_left = vld1q_u16(below - 1);
+  const uint16x8_t bottom = vld1q_u16(below);
+  const uint16x8_t bottom_right = vld1q_u16(below + 1);
+
+  // 4 * (centre + edge neighbours).
+  const uint16x8_t plus = vaddq_u16(vaddq_u16(left, vaddq_u16(bottom, centre)),
+                                    vaddq_u16(top, right));
+  const uint16x8_t plus_x4 = vshlq_n_u16(plus, 2);
+
+  // 3 * corners, formed as 4 * corners - corners.
+  const uint16x8_t corners = vaddq_u16(vaddq_u16(top_left, top_right),
+                                       vaddq_u16(bottom_left, bottom_right));
+  const uint16x8_t corners_x3 = vsubq_u16(vshlq_n_u16(corners, 2), corners);
+
+  // Widening add of the two groups, low half then high half.
+  *a0 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_low_u16(plus_x4), vget_low_u16(corners_x3)));
+  *a1 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_high_u16(plus_x4), vget_high_u16(corners_x3)));
+}
+
+// Weighted sum over the rows directly above and below `buf` for four adjacent
+// 32-bit elements: the vertical neighbours are weighted by 6 and the four
+// diagonal neighbours by 5. The row containing `buf` itself is not read.
+static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+  const int32_t *const above = buf - buf_stride;
+  const int32_t *const below = buf + buf_stride;
+
+  const int32x4_t top_left = vld1q_s32(above - 1);
+  const int32x4_t top = vld1q_s32(above);
+  const int32x4_t top_right = vld1q_s32(above + 1);
+  const int32x4_t bottom_left = vld1q_s32(below - 1);
+  const int32x4_t bottom = vld1q_s32(below);
+  const int32x4_t bottom_right = vld1q_s32(below + 1);
+
+  // Weight-5 group (diagonals).
+  int32x4_t weight5 = vaddq_s32(bottom_right, bottom_left);
+  weight5 = vaddq_s32(top_right, weight5);
+  weight5 = vaddq_s32(top_left, weight5);
+
+  // Weight-6 group (directly above and below).
+  const int32x4_t weight6 = vaddq_s32(top, bottom);
+
+  // 5 * (weight5 + weight6) + weight6 == 5 * weight5 + 6 * weight6.
+  const int32x4_t all = vaddq_s32(weight5, weight6);
+  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
+  return vaddq_s32(all_x5, weight6);
+}
+
+// 16-bit-input counterpart of cross_sum_fast_even_row for eight adjacent
+// elements: diagonal neighbours weighted by 5, vertical neighbours by 6.
+// Weighting is done in 16-bit lanes; the final addition of the two groups is
+// widened to 32 bits. *a0 receives lanes 0..3 and *a1 lanes 4..7.
+static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+                                                 int32x4_t *a0, int32x4_t *a1) {
+  const uint16_t *const above = buf - buf_stride;
+  const uint16_t *const below = buf + buf_stride;
+
+  const uint16x8_t top_left = vld1q_u16(above - 1);
+  const uint16x8_t top = vld1q_u16(above);
+  const uint16x8_t top_right = vld1q_u16(above + 1);
+  const uint16x8_t bottom_left = vld1q_u16(below - 1);
+  const uint16x8_t bottom = vld1q_u16(below);
+  const uint16x8_t bottom_right = vld1q_u16(below + 1);
+
+  // 5 * diagonals, formed as (diagonals << 2) + diagonals.
+  const uint16x8_t diag = vaddq_u16(vaddq_u16(bottom_right, bottom_left),
+                                    vaddq_u16(top_right, top_left));
+  const uint16x8_t diag_x5 = vaddq_u16(vshlq_n_u16(diag, 2), diag);
+
+  // 6 * verticals, formed as (verticals << 2) + (verticals << 1).
+  const uint16x8_t vert = vaddq_u16(bottom, top);
+  const uint16x8_t vert_x6 =
+      vaddq_u16(vshlq_n_u16(vert, 2), vshlq_n_u16(vert, 1));
+
+  // Widening add of the two groups, low half then high half.
+  *a0 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_low_u16(diag_x5), vget_low_u16(vert_x6)));
+  *a1 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_high_u16(diag_x5), vget_high_u16(vert_x6)));
+}
+
+// Weighted sum along a single row for four adjacent 32-bit elements: the
+// element itself is weighted by 6 and its left and right neighbours by 5.
+static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+  const int32x4_t left = vld1q_s32(buf - 1);
+  const int32x4_t centre = vld1q_s32(buf);
+  const int32x4_t right = vld1q_s32(buf + 1);
+
+  // 5 * (left + right + centre) + centre == 5 * (left + right) + 6 * centre.
+  const int32x4_t all = vaddq_s32(vaddq_s32(left, right), centre);
+  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
+  return vaddq_s32(all_x5, centre);
+}
+
+// 16-bit-input counterpart of cross_sum_fast_odd_row for eight adjacent
+// elements: left and right neighbours weighted by 5, the element itself by 6.
+// Weighting is done in 16-bit lanes; the final addition is widened to 32
+// bits. *a0 receives lanes 0..3 and *a1 lanes 4..7.
+static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+                                                int32x4_t *a1) {
+  const uint16x8_t left = vld1q_u16(buf - 1);
+  const uint16x8_t centre = vld1q_u16(buf);
+  const uint16x8_t right = vld1q_u16(buf + 1);
+
+  // 5 * (left + right), formed as sides + (sides << 2).
+  const uint16x8_t sides = vaddq_u16(left, right);
+  const uint16x8_t sides_x5 = vaddq_u16(sides, vshlq_n_u16(sides, 2));
+
+  // 6 * centre, formed as (centre << 2) + (centre << 1).
+  const uint16x8_t centre_x6 =
+      vaddq_u16(vshlq_n_u16(centre, 2), vshlq_n_u16(centre, 1));
+
+  // Widening add of the two groups, low half then high half.
+  *a0 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_low_u16(sides_x5), vget_low_u16(centre_x6)));
+  *a1 = vreinterpretq_s32_u32(
+      vaddl_u16(vget_high_u16(sides_x5), vget_high_u16(centre_x6)));
+}
+
+// Final stage of the "fast" filter: for every pixel computes
+//   dst = round_shift(cross_sum(A) * src + cross_sum(B), shift)
+// eight pixels at a time. Rows with an even index use the even-row cross
+// sums (which read the rows above and below) and shift by
+// SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS; rows with an odd index use
+// the odd-row cross sums (same row only) and NB_ODD in place of NB_EVEN.
+//   A, B, buf_stride : 16-bit and 32-bit coefficient planes sharing a stride.
+//   src, src_stride  : 16-bit source pixels.
+//   dst, dst_stride  : 32-bit output.
+// Columns are processed in groups of eight with a do/while loop, so a width
+// that is not a multiple of eight is rounded up.
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+                                       const int buf_stride, int16_t *src,
+                                       const int src_stride, int32_t *dst,
+                                       const int dst_stride, const int width,
+                                       const int height) {
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+
+  int row = 0;
+  do {
+    uint16_t *a_row = A + row * buf_stride;
+    int32_t *b_row = B + row * buf_stride;
+    int16_t *src_row = src + row * src_stride;
+    int32_t *dst_row = dst + row * dst_stride;
+    int cols_left = width;
+
+    if ((row & 1) == 0) {
+      do {
+        const int16x8_t pix = vld1q_s16(src_row);
+        int32x4_t a_lo, a_hi;
+        cross_sum_fast_even_row_inp16(a_row, buf_stride, &a_lo, &a_hi);
+        const int32x4_t b_lo = cross_sum_fast_even_row(b_row, buf_stride);
+        const int32x4_t b_hi = cross_sum_fast_even_row(b_row + 4, buf_stride);
+
+        const int32x4_t v_lo =
+            vaddq_s32(vmulq_s32(vmovl_s16(vget_low_s16(pix)), a_lo), b_lo);
+        const int32x4_t v_hi =
+            vaddq_s32(vmulq_s32(vmovl_s16(vget_high_s16(pix)), a_hi), b_hi);
+
+        vst1q_s32(dst_row, vrshrq_n_s32(v_lo, SGRPROJ_SGR_BITS + NB_EVEN -
+                                                  SGRPROJ_RST_BITS));
+        vst1q_s32(dst_row + 4, vrshrq_n_s32(v_hi, SGRPROJ_SGR_BITS + NB_EVEN -
+                                                      SGRPROJ_RST_BITS));
+
+        a_row += 8;
+        b_row += 8;
+        src_row += 8;
+        dst_row += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+    } else {
+      do {
+        const int16x8_t pix = vld1q_s16(src_row);
+        int32x4_t a_lo, a_hi;
+        cross_sum_fast_odd_row_inp16(a_row, &a_lo, &a_hi);
+        const int32x4_t b_lo = cross_sum_fast_odd_row(b_row);
+        const int32x4_t b_hi = cross_sum_fast_odd_row(b_row + 4);
+
+        const int32x4_t v_lo =
+            vaddq_s32(vmulq_s32(vmovl_s16(vget_low_s16(pix)), a_lo), b_lo);
+        const int32x4_t v_hi =
+            vaddq_s32(vmulq_s32(vmovl_s16(vget_high_s16(pix)), a_hi), b_hi);
+
+        vst1q_s32(dst_row, vrshrq_n_s32(v_lo, SGRPROJ_SGR_BITS + NB_ODD -
+                                                  SGRPROJ_RST_BITS));
+        vst1q_s32(dst_row + 4, vrshrq_n_s32(v_hi, SGRPROJ_SGR_BITS + NB_ODD -
+                                                      SGRPROJ_RST_BITS));
+
+        a_row += 8;
+        b_row += 8;
+        src_row += 8;
+        dst_row += 8;
+        cols_left -= 8;
+      } while (cols_left > 0);
+    }
+  } while (++row < height);
+}
+
+// Final stage of the full (non-fast) filter: for every pixel computes
+//   dst = round_shift(cross_sum(A) * src + cross_sum(B),
+//                     SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS)
+// eight pixels at a time, using the 3x3 cross sums for every row.
+//   A, B, buf_stride : 16-bit and 32-bit coefficient planes sharing a stride.
+//   src, src_stride  : 16-bit source pixels.
+//   dst, dst_stride  : 32-bit output.
+// Columns are processed in groups of eight with a do/while loop, so a width
+// that is not a multiple of eight is rounded up.
+// NOTE(review): unlike final_filter_fast_internal this function is not
+// declared static; confirm whether external linkage is intended.
+void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
+                           int16_t *src, const int src_stride, int32_t *dst,
+                           const int dst_stride, const int width,
+                           const int height) {
+  assert(SGRPROJ_SGR_BITS == 8);
+  assert(SGRPROJ_RST_BITS == 4);
+
+  int row = 0;
+  do {
+    uint16_t *a_row = A + row * buf_stride;
+    int32_t *b_row = B + row * buf_stride;
+    int16_t *src_row = src + row * src_stride;
+    int32_t *dst_row = dst + row * dst_stride;
+    int cols_left = width;
+
+    do {
+      const int16x8_t pix = vld1q_s16(src_row);
+      int32x4_t a_lo, a_hi;
+      cross_sum_inp_u16(a_row, buf_stride, &a_lo, &a_hi);
+      const int32x4_t b_lo = cross_sum_inp_s32(b_row, buf_stride);
+      const int32x4_t b_hi = cross_sum_inp_s32(b_row + 4, buf_stride);
+
+      const int32x4_t v_lo =
+          vaddq_s32(vmulq_s32(vmovl_s16(vget_low_s16(pix)), a_lo), b_lo);
+      const int32x4_t v_hi =
+          vaddq_s32(vmulq_s32(vmovl_s16(vget_high_s16(pix)), a_hi), b_hi);
+
+      vst1q_s32(dst_row, vrshrq_n_s32(v_lo, SGRPROJ_SGR_BITS + NB_EVEN -
+                                                SGRPROJ_RST_BITS));
+      vst1q_s32(dst_row + 4, vrshrq_n_s32(v_hi, SGRPROJ_SGR_BITS + NB_EVEN -
+                                                    SGRPROJ_RST_BITS));
+
+      a_row += 8;
+      b_row += 8;
+      src_row += 8;
+      dst_row += 8;
+      cols_left -= 8;
+    } while (cols_left > 0);
+  } while (++row < height);
+}
+
+// Runs the radius-2 ("fast") self-guided filter pass on a 16-bit image:
+//   1. boxsum2() builds sums and sums of squares over the image extended by
+//      the SGRPROJ border on every side;
+//   2. calc_ab_fast_internal_{lbd,hbd}() turns them into the a/b coefficient
+//      planes, working on every second row (doubled stride, ht_inc = 2) and
+//      starting one row and one column before the image origin;
+//   3. final_filter_fast_internal() writes the 32-bit result to `dst`.
+// Only radius_idx == 0 with r == 2 is supported (asserted below).
+static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+                                             int height, int dgd_stride,
+                                             int32_t *dst, int dst_stride,
+                                             int bit_depth, int sgr_params_idx,
+                                             int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Row stride of the scratch planes: extended width rounded up to a multiple
+  // of four, plus 16 elements of slack.
+  const int buf_stride = ((width_ext + 3) & ~3) + 16;
+  int32_t sq_sum_plane[RESTORATION_PROC_UNIT_PELS];
+  uint16_t a16_plane[RESTORATION_PROC_UNIT_PELS];
+  int32_t sum_plane[RESTORATION_PROC_UNIT_PELS];
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+
+  assert(radius_idx == 0);
+  assert(r == 2);
+
+  // The input (dgd16) is 16-bit. Stage-1 pixel sums are kept in 16 bits
+  // (a16_plane, reused later for the a coefficients); the final sums and the
+  // sums of squares are kept in the two 32-bit planes.
+  boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                      SGRPROJ_BORDER_HORZ),
+          dgd_stride, (int16_t *)a16_plane, sum_plane, sq_sum_plane,
+          buf_stride, width_ext, height_ext);
+
+  // Position of the image origin inside the bordered scratch planes.
+  const int origin = SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  int32_t *const sq_sums = sq_sum_plane + origin;
+  int32_t *const sums = sum_plane + origin;
+  uint16_t *const a16 = a16_plane + origin;
+
+  // Calculation of a, b. The original comment states that a is stored in the
+  // 16-bit plane in the range [1, 256] for all bit depths and that b is kept
+  // in a 32-bit plane.
+  int32_t *const sq_sums_start = sq_sums - buf_stride - 1;
+  uint16_t *const a16_start = a16 - buf_stride - 1;
+  int32_t *const sums_start = sums - buf_stride - 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (bit_depth > 8) {
+    calc_ab_fast_internal_hbd(sq_sums_start, a16_start, sums_start,
+                              buf_stride * 2, width + 2, height + 2, bit_depth,
+                              r, params->s[radius_idx], 2);
+  } else {
+    calc_ab_fast_internal_lbd(sq_sums_start, a16_start, sums_start,
+                              buf_stride * 2, width + 2, height + 2, r,
+                              params->s[radius_idx], 2);
+  }
+#else
+  (void)bit_depth;
+  calc_ab_fast_internal_lbd(sq_sums_start, a16_start, sums_start,
+                            buf_stride * 2, width + 2, height + 2, r,
+                            params->s[radius_idx], 2);
+#endif
+  final_filter_fast_internal(a16, sums, buf_stride, (int16_t *)dgd16,
+                             dgd_stride, dst, dst_stride, width, height);
+}
+
+// Runs the radius-1 self-guided filter pass on a 16-bit image:
+//   1. boxsum1() builds 16-bit sums and 32-bit sums of squares over the image
+//      extended by the SGRPROJ border on every side;
+//   2. calc_ab_internal_{lbd,hbd}() turns them into the a (16-bit) and
+//      b (32-bit) coefficient planes, starting one row and one column before
+//      the image origin;
+//   3. final_filter_internal() writes the 32-bit result to `dst`.
+// Only radius_idx == 1 with r == 1 is supported (asserted below).
+static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+                                        int dgd_stride, int32_t *dst,
+                                        int dst_stride, int bit_depth,
+                                        int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Row stride of the scratch planes: extended width rounded up to a multiple
+  // of four, plus 16 elements of slack.
+  const int buf_stride = ((width_ext + 3) & ~3) + 16;
+  int32_t sq_sum_plane[RESTORATION_PROC_UNIT_PELS];
+  uint16_t a16_plane[RESTORATION_PROC_UNIT_PELS];
+  uint16_t sum_plane[RESTORATION_PROC_UNIT_PELS];
+  int32_t b_plane[RESTORATION_PROC_UNIT_PELS];
+
+  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+         "Need SGRPROJ_BORDER_* >= r+1");
+
+  assert(radius_idx == 1);
+  assert(r == 1);
+
+  // The input (dgd16) is 16-bit. Pixel sums are produced in 16 bits
+  // (sum_plane); sums of squares are kept in 32 bits (sq_sum_plane).
+  boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                      SGRPROJ_BORDER_HORZ),
+          dgd_stride, sum_plane, sq_sum_plane, buf_stride, width_ext,
+          height_ext);
+
+  // Position of the image origin inside the bordered scratch planes.
+  const int origin = SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  int32_t *const sq_sums = sq_sum_plane + origin;
+  int32_t *const b_out = b_plane + origin;
+  uint16_t *const a16_out = a16_plane + origin;
+  uint16_t *const sums = sum_plane + origin;
+
+  // Calculation of a, b. The original comment states that a is stored in the
+  // 16-bit plane in the range [1, 256] for all bit depths and that b is kept
+  // in a 32-bit plane.
+  int32_t *const sq_sums_start = sq_sums - buf_stride - 1;
+  uint16_t *const a16_start = a16_out - buf_stride - 1;
+  uint16_t *const sums_start = sums - buf_stride - 1;
+  int32_t *const b_start = b_out - buf_stride - 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (bit_depth > 8) {
+    calc_ab_internal_hbd(sq_sums_start, a16_start, sums_start, b_start,
+                         buf_stride, width + 2, height + 2, bit_depth, r,
+                         params->s[radius_idx], 1);
+  } else {
+    calc_ab_internal_lbd(sq_sums_start, a16_start, sums_start, b_start,
+                         buf_stride, width + 2, height + 2, r,
+                         params->s[radius_idx], 1);
+  }
+#else
+  (void)bit_depth;
+  calc_ab_internal_lbd(sq_sums_start, a16_start, sums_start, b_start,
+                       buf_stride, width + 2, height + 2, r,
+                       params->s[radius_idx], 1);
+#endif
+  final_filter_internal(a16_out, b_out, buf_stride, (int16_t *)dgd16,
+                        dgd_stride, dst, dst_stride, width, height);
+}
+
+static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+                                         const int src_stride, uint16_t *dst,
+                                         const int dst_stride, const int width,
+                                         const int height) {
+  // Widens a width x height block of 8-bit samples at src into 16-bit samples
+  // at dst, then zeroes the five rows that follow the block in dst.
+  //
+  // src/dst       : top-left sample of the source / destination block.
+  // src_stride    : row pitch of src, in samples.
+  // dst_stride    : row pitch of dst, in samples.
+  // width, height : block size in samples.
+  //
+  // The NEON path handles 8 columns x 4 rows per step and is only entered when
+  // a full 8 x 4 tile remains, so no sample outside the width x height block
+  // is read from src or written to dst by the copy itself.
+  const uint8_t *src_ptr;
+  uint16_t *dst_ptr;
+  int h, w, count = 0;
+
+  uint8x8_t t1, t2, t3, t4;
+  uint16x8_t s1, s2, s3, s4;
+  h = height;
+  // Groups of four rows; `count` is the index of the current group.
+  while (h >= 4) {
+    src_ptr = src + (count << 2) * src_stride;
+    dst_ptr = dst + (count << 2) * dst_stride;
+    w = width;
+    // Full 8-column tiles.
+    while (w >= 8) {
+      load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+      s1 = vmovl_u8(t1);
+      s2 = vmovl_u8(t2);
+      s3 = vmovl_u8(t3);
+      s4 = vmovl_u8(t4);
+      store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    }
+
+    // Remaining (fewer than 8) columns of these four rows, one at a time.
+    for (int y = 0; y < w; y++) {
+      dst_ptr[y] = src_ptr[y];
+      dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+      dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+      dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+    }
+    count++;
+    h -= 4;
+  }
+
+  // Remaining (fewer than 4) rows.
+  src_ptr = src + (count << 2) * src_stride;
+  dst_ptr = dst + (count << 2) * dst_stride;
+  for (int x = 0; x < h; x++) {
+    for (int y = 0; y < width; y++) {
+      dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
+    }
+  }
+
+  // memset uninitialized rows of src buffer as they are needed for the
+  // boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+                                        uint16_t *dst, const int dst_stride,
+                                        int width, int height) {
+  // Copies a width x height block of 16-bit samples from src to dst, then
+  // zeroes the five rows that follow the block in dst.
+  //
+  // src/dst       : top-left sample of the source / destination block.
+  // src_stride    : row pitch of src, in samples.
+  // dst_stride    : row pitch of dst, in samples.
+  // width, height : block size in samples.
+  //
+  // The NEON path handles 8 columns x 4 rows per step and is only entered when
+  // a full 8 x 4 tile remains (same guard as src_convert_u8_to_u16), so no
+  // sample outside the width x height block is read from src or written to
+  // dst by the copy itself.
+  const uint16_t *src_ptr;
+  uint16_t *dst_ptr;
+  int h, w, count = 0;
+  uint16x8_t s1, s2, s3, s4;
+
+  h = height;
+  // Groups of four rows; `count` is the index of the current group.
+  while (h >= 4) {
+    src_ptr = src + (count << 2) * src_stride;
+    dst_ptr = dst + (count << 2) * dst_stride;
+    w = width;
+    // Full 8-column tiles.
+    while (w >= 8) {
+      load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+      store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    }
+
+    // Remaining (fewer than 8) columns of these four rows, one at a time.
+    for (int y = 0; y < w; y++) {
+      dst_ptr[y] = src_ptr[y];
+      dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+      dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+      dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+    }
+    count++;
+    h -= 4;
+  }
+
+  // Remaining (fewer than 4) rows, copied whole.
+  src_ptr = src + (count << 2) * src_stride;
+  dst_ptr = dst + (count << 2) * dst_stride;
+
+  for (int x = 0; x < h; x++) {
+    memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
+           sizeof(uint16_t) * width);
+  }
+  // memset uninitialized rows of src buffer as they are needed for the
+  // boxsum filter calculation.
+  for (int x = height; x < height + 5; x++)
+    memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                    int stride, int32_t *flt0, int32_t *flt1,
+                                    int flt_stride, int sgr_params_idx,
+                                    int bit_depth, int highbd) {
+  // Runs the self-guided filter passes enabled by av1_sgr_params[sgr_params_idx]
+  // on a width x height block: the r[0] pass writes flt0, the r[1] pass writes
+  // flt1, both with row pitch flt_stride. Always returns 0.
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+  // 16-bit working copy of the block including its SGRPROJ_BORDER_* border.
+  uint16_t padded[RESTORATION_PROC_UNIT_PELS];
+  const int padded_w = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int padded_h = height + 2 * SGRPROJ_BORDER_VERT;
+  const int padded_stride = padded_w;
+  // First non-border sample of the working copy.
+  uint16_t *const interior =
+      padded + padded_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+  // Distance from the block's first sample back to its bordered top-left
+  // corner in the source frame.
+  const int src_border = SGRPROJ_BORDER_VERT * stride + SGRPROJ_BORDER_HORZ;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
+    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(dat8);
+    src_convert_hbd_copy(src16 - src_border, stride, padded, padded_stride,
+                         padded_w, padded_h);
+  } else {
+    src_convert_u8_to_u16(dat8 - src_border, stride, padded, padded_stride,
+                          padded_w, padded_h);
+  }
+#else
+  (void)highbd;
+  src_convert_u8_to_u16(dat8 - src_border, stride, padded, padded_stride,
+                        padded_w, padded_h);
+#endif
+
+  if (params->r[0] > 0) {
+    restoration_fast_internal(interior, width, height, padded_stride, flt0,
+                              flt_stride, bit_depth, sgr_params_idx, 0);
+  }
+  if (params->r[1] > 0) {
+    restoration_internal(interior, width, height, padded_stride, flt1,
+                         flt_stride, bit_depth, sgr_params_idx, 1);
+  }
+  return 0;
+}
+
+// Applies the self-guided restoration filter to a width x height block and
+// writes the projected result to dst8.
+//
+// dat8/stride     : source block and its row pitch (a 16-bit buffer wrapped via
+//                   CONVERT_TO_SHORTPTR when highbd is set).
+// eps             : index into av1_sgr_params.
+// xqd             : coded projection coefficients, decoded by av1_decode_xq.
+// dst8/dst_stride : destination block and its row pitch.
+// tmpbuf          : scratch holding both filter outputs (flt0 and flt1).
+// Always returns 0.
+int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ // The two filter outputs live back to back in tmpbuf,
+ // RESTORATION_UNITPELS_MAX elements apart.
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ // 16-bit working copy of the block including its SGRPROJ_BORDER_* border;
+ // dgd16 points at the first non-border sample.
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+ const sgr_params_type *const params = &av1_sgr_params[eps];
+ int xq[2];
+
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ // Fill the working copy, starting from the bordered top-left corner of the
+ // source block.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+#else
+ (void)highbd;
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+#endif
+ // Run whichever passes the parameter set enables; both outputs use `width`
+ // as their row pitch.
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+ bit_depth, eps, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+ bit_depth, eps, 1);
+
+ av1_decode_xq(xqd, xq, params);
+
+ // Projection: for every pixel, with u = src << SGRPROJ_RST_BITS,
+ // v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+ // (each term only when its pass is enabled), then a rounding shift right by
+ // SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS and a clamp to the pixel range.
+ {
+ int16_t *src_ptr;
+ uint8_t *dst_ptr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *dst16_ptr;
+#endif
+ int16x4_t d0, d4;
+ int16x8_t r0, s0;
+ uint16x8_t r4;
+ int32x4_t u0, u4, v0, v4, f00, f10;
+ uint8x8_t t0;
+ int count = 0, w = width, h = height, rc = 0;
+
+ const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+ const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+ const int16x8_t zero = vdupq_n_s16(0);
+ const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+ src_ptr = (int16_t *)dgd16;
+ // Outer loop: one output row per iteration (rc is the row index).
+ do {
+ w = width;
+ count = 0;
+ dst_ptr = dst8 + rc * dst_stride;
+#if CONFIG_AV1_HIGHBITDEPTH
+ dst16_ptr = dst16 + rc * dst_stride;
+#endif
+ // Inner loop: 8 pixels per iteration; `count` is the column offset.
+ // NOTE(review): every iteration loads and stores a full 8 lanes, so when
+ // width is not a multiple of 8 the last iteration touches columns beyond
+ // `width` in src, flt0/flt1 and dst -- confirm callers guarantee room.
+ do {
+ s0 = vld1q_s16(src_ptr + count);
+
+ u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+ u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+ v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+ v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ f00 = vld1q_s32(flt0 + count);
+ f10 = vld1q_s32(flt0 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq0_vec, f00);
+ v4 = vmlaq_s32(v4, xq0_vec, f10);
+ }
+
+ if (params->r[1] > 0) {
+ f00 = vld1q_s32(flt1 + count);
+ f10 = vld1q_s32(flt1 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq1_vec, f00);
+ v4 = vmlaq_s32(v4, xq1_vec, f10);
+ }
+
+ // Rounding shift with saturating narrow to 16 bits.
+ d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ r0 = vcombine_s16(d0, d4);
+
+ // Clamp negatives to zero before treating the lanes as unsigned.
+ r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
+ // Upper clamp to (1 << bit_depth) - 1, stored as 16-bit pixels.
+ r4 = vminq_u16(r4, max);
+ vst1q_u16(dst16_ptr, r4);
+ dst16_ptr += 8;
+ } else {
+ // Saturating narrow to 8-bit pixels.
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+ dst_ptr += 8;
+ }
+#else
+ (void)max;
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+ dst_ptr += 8;
+#endif
+ w -= 8;
+ count += 8;
+ } while (w > 0);
+
+ // Advance all inputs to the next row.
+ src_ptr += dgd16_stride;
+ flt1 += width;
+ flt0 += width;
+ rc++;
+ h--;
+ } while (h > 0);
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 0000000000..4723154398
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  // Horizontal pass for 4 adjacent output pixels. Pixel n uses its own 8-tap
+  // kernel (picked by load_filters_4 from sx + n * alpha) applied to the 8
+  // input samples starting at lane n of `in`.
+  int16x8_t taps[4];
+  load_filters_4(taps, sx, alpha);
+
+  // Widen the 16 input bytes to two vectors of eight 16-bit samples.
+  const int16x8_t px_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  const int16x8_t px_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+  // Per pixel: taps times the shifted sample window, adjacent products added
+  // pairwise into 32-bit lanes.
+  int32x4_t pair_sums[4];
+  pair_sums[0] = vpaddlq_s16(vmulq_s16(taps[0], px_lo));
+  pair_sums[1] = vpaddlq_s16(vmulq_s16(taps[1], vextq_s16(px_lo, px_hi, 1)));
+  pair_sums[2] = vpaddlq_s16(vmulq_s16(taps[2], vextq_s16(px_lo, px_hi, 2)));
+  pair_sums[3] = vpaddlq_s16(vmulq_s16(taps[3], vextq_s16(px_lo, px_hi, 3)));
+
+  // horizontal_add_4d_s32x4 presumably collapses each of the four vectors to
+  // one lane (defined elsewhere); the constant bias is added afterwards.
+  const int32x4_t bias = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+  const int32x4_t sums = vaddq_s32(horizontal_add_4d_s32x4(pair_sums), bias);
+
+  // Rounding right shift with unsigned saturation; the upper four output
+  // lanes are zero.
+  const uint16x4_t narrowed = vqrshrun_n_s32(sums, ROUND0_BITS);
+  return vreinterpretq_s16_u16(vcombine_u16(narrowed, vdup_n_u16(0)));
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+                                                 int alpha) {
+  // Horizontal pass for 8 adjacent output pixels. Pixel n uses its own 8-tap
+  // kernel (picked by load_filters_8 from sx + n * alpha) applied to the 8
+  // input samples starting at lane n of `in`.
+  int16x8_t taps[8];
+  load_filters_8(taps, sx, alpha);
+
+  // Widen the 16 input bytes to two vectors of eight 16-bit samples.
+  const int16x8_t px_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  const int16x8_t px_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+  // Per pixel: taps times the shifted sample window, adjacent products added
+  // pairwise into 32-bit lanes. Pixels 0-3 and 4-7 are kept in separate
+  // groups because the reduction helper takes four vectors at a time.
+  int32x4_t pair_sums_lo[4];
+  pair_sums_lo[0] = vpaddlq_s16(vmulq_s16(taps[0], px_lo));
+  pair_sums_lo[1] =
+      vpaddlq_s16(vmulq_s16(taps[1], vextq_s16(px_lo, px_hi, 1)));
+  pair_sums_lo[2] =
+      vpaddlq_s16(vmulq_s16(taps[2], vextq_s16(px_lo, px_hi, 2)));
+  pair_sums_lo[3] =
+      vpaddlq_s16(vmulq_s16(taps[3], vextq_s16(px_lo, px_hi, 3)));
+
+  int32x4_t pair_sums_hi[4];
+  pair_sums_hi[0] =
+      vpaddlq_s16(vmulq_s16(taps[4], vextq_s16(px_lo, px_hi, 4)));
+  pair_sums_hi[1] =
+      vpaddlq_s16(vmulq_s16(taps[5], vextq_s16(px_lo, px_hi, 5)));
+  pair_sums_hi[2] =
+      vpaddlq_s16(vmulq_s16(taps[6], vextq_s16(px_lo, px_hi, 6)));
+  pair_sums_hi[3] =
+      vpaddlq_s16(vmulq_s16(taps[7], vextq_s16(px_lo, px_hi, 7)));
+
+  // Finish the per-pixel sums and add the constant bias.
+  const int32x4_t bias = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+  const int32x4_t sums_lo =
+      vaddq_s32(horizontal_add_4d_s32x4(pair_sums_lo), bias);
+  const int32x4_t sums_hi =
+      vaddq_s32(horizontal_add_4d_s32x4(pair_sums_hi), bias);
+
+  // Rounding right shift with unsigned saturation to 16 bits.
+  return vreinterpretq_s16_u16(
+      vcombine_u16(vqrshrun_n_s32(sums_lo, ROUND0_BITS),
+                   vqrshrun_n_s32(sums_hi, ROUND0_BITS)));
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {
+  // Horizontal pass for 4 adjacent output pixels that all share one 8-tap
+  // kernel, the av1_warped_filter row at index sx >> WARPEDDIFF_PREC_BITS.
+  const int16x8_t taps =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  // Widen the 16 input bytes to two vectors of eight 16-bit samples.
+  const int16x8_t px_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  const int16x8_t px_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+  // Per pixel: taps times the sample window starting at that pixel's lane,
+  // adjacent products added pairwise into 32-bit lanes.
+  int32x4_t pair_sums[4];
+  pair_sums[0] = vpaddlq_s16(vmulq_s16(taps, px_lo));
+  pair_sums[1] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 1)));
+  pair_sums[2] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 2)));
+  pair_sums[3] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 3)));
+
+  // Finish the per-pixel sums and add the constant bias.
+  const int32x4_t bias = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+  const int32x4_t sums = vaddq_s32(horizontal_add_4d_s32x4(pair_sums), bias);
+
+  // Rounding right shift with unsigned saturation; the upper four output
+  // lanes are zero.
+  const uint16x4_t narrowed = vqrshrun_n_s32(sums, ROUND0_BITS);
+  return vreinterpretq_s16_u16(vcombine_u16(narrowed, vdup_n_u16(0)));
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {
+  // Horizontal pass for 8 adjacent output pixels that all share one 8-tap
+  // kernel, the av1_warped_filter row at index sx >> WARPEDDIFF_PREC_BITS.
+  const int16x8_t taps =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  // Widen the 16 input bytes to two vectors of eight 16-bit samples.
+  const int16x8_t px_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in)));
+  const int16x8_t px_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in)));
+
+  // Per pixel: taps times the sample window starting at that pixel's lane,
+  // adjacent products added pairwise into 32-bit lanes. Pixels 0-3 and 4-7
+  // are kept in separate groups for the four-vector reduction helper.
+  int32x4_t pair_sums_lo[4];
+  pair_sums_lo[0] = vpaddlq_s16(vmulq_s16(taps, px_lo));
+  pair_sums_lo[1] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 1)));
+  pair_sums_lo[2] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 2)));
+  pair_sums_lo[3] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 3)));
+
+  int32x4_t pair_sums_hi[4];
+  pair_sums_hi[0] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 4)));
+  pair_sums_hi[1] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 5)));
+  pair_sums_hi[2] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 6)));
+  pair_sums_hi[3] = vpaddlq_s16(vmulq_s16(taps, vextq_s16(px_lo, px_hi, 7)));
+
+  // Finish the per-pixel sums and add the constant bias.
+  const int32x4_t bias = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+  const int32x4_t sums_lo =
+      vaddq_s32(horizontal_add_4d_s32x4(pair_sums_lo), bias);
+  const int32x4_t sums_hi =
+      vaddq_s32(horizontal_add_4d_s32x4(pair_sums_hi), bias);
+
+  // Rounding right shift with unsigned saturation to 16 bits.
+  return vreinterpretq_s16_u16(
+      vcombine_u16(vqrshrun_n_s32(sums_lo, ROUND0_BITS),
+                   vqrshrun_n_s32(sums_hi, ROUND0_BITS)));
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+                                          int sy) {
+  // Vertical pass for one row of 4 output pixels sharing a single 8-tap
+  // kernel: *res = sum over t of src[t] (low four lanes) * tap[t], where the
+  // kernel is the av1_warped_filter row at index sy >> WARPEDDIFF_PREC_BITS.
+  const int16x8_t taps =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  const int16x4_t taps_lo = vget_low_s16(taps);   // taps 0..3
+  const int16x4_t taps_hi = vget_high_s16(taps);  // taps 4..7
+
+  // Widening multiply-accumulate, one source row per tap.
+  int32x4_t acc = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[1]), taps_lo, 1);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[2]), taps_lo, 2);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[3]), taps_lo, 3);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[4]), taps_hi, 0);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[5]), taps_hi, 1);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[6]), taps_hi, 2);
+  acc = vmlal_lane_s16(acc, vget_low_s16(src[7]), taps_hi, 3);
+
+  *res = acc;
+}
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+                                          int sy, int gamma) {
+  // Vertical pass for one row of 4 output pixels where pixel n has its own
+  // 8-tap kernel (picked by load_filters_4 from sy + n * gamma).
+
+  // Transpose the low halves of the 8 source rows so that cols[n] holds the
+  // 8 vertical samples feeding output pixel n.
+  int16x8_t cols[4];
+  transpose_elems_s16_4x8(
+      vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+      vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+      vget_low_s16(src[6]), vget_low_s16(src[7]), &cols[0], &cols[1], &cols[2],
+      &cols[3]);
+
+  int16x8_t taps[4];
+  load_filters_4(taps, sy, gamma);
+
+  // Per pixel: widening multiply of samples by taps, leaving four partial
+  // sums per pixel.
+  int32x4_t partial[4];
+  for (int n = 0; n < 4; ++n) {
+    partial[n] = vmull_s16(vget_low_s16(cols[n]), vget_low_s16(taps[n]));
+    partial[n] =
+        vmlal_s16(partial[n], vget_high_s16(cols[n]), vget_high_s16(taps[n]));
+  }
+
+  // Reduce the four partial-sum vectors into the four-lane result.
+  *res = horizontal_add_4d_s32x4(partial);
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy) {
+  // Vertical pass for one row of 8 output pixels sharing a single 8-tap
+  // kernel (the av1_warped_filter row at index sy >> WARPEDDIFF_PREC_BITS).
+  // *res_low receives pixels 0..3 and *res_high pixels 4..7.
+  const int16x8_t taps =
+      vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  const int16x4_t taps_lo = vget_low_s16(taps);   // taps 0..3
+  const int16x4_t taps_hi = vget_high_s16(taps);  // taps 4..7
+
+  // Widening multiply-accumulate over the low halves of the 8 source rows.
+  int32x4_t acc_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[1]), taps_lo, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[2]), taps_lo, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[3]), taps_lo, 3);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[4]), taps_hi, 0);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[5]), taps_hi, 1);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[6]), taps_hi, 2);
+  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(src[7]), taps_hi, 3);
+
+  // Same accumulation over the high halves.
+  int32x4_t acc_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[1]), taps_lo, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[2]), taps_lo, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[3]), taps_lo, 3);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[4]), taps_hi, 0);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[5]), taps_hi, 1);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[6]), taps_hi, 2);
+  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(src[7]), taps_hi, 3);
+
+  *res_low = acc_lo;
+  *res_high = acc_hi;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+                                          int32x4_t *res_low,
+                                          int32x4_t *res_high, int sy,
+                                          int gamma) {
+  // Vertical pass for one row of 8 output pixels where pixel n has its own
+  // 8-tap kernel (picked by load_filters_8 from sy + n * gamma).
+  // *res_low receives pixels 0..3 and *res_high pixels 4..7.
+
+  // Work on a local copy so the caller's rows stay untouched, then transpose
+  // so that cols[n] holds the 8 vertical samples feeding output pixel n.
+  int16x8_t cols[8];
+  for (int n = 0; n < 8; ++n) cols[n] = src[n];
+  transpose_elems_inplace_s16_8x8(&cols[0], &cols[1], &cols[2], &cols[3],
+                                  &cols[4], &cols[5], &cols[6], &cols[7]);
+
+  int16x8_t taps[8];
+  load_filters_8(taps, sy, gamma);
+
+  // Per pixel: widening multiply of samples by taps, leaving four partial
+  // sums per pixel. Pixels 0-3 and 4-7 go to separate groups for the
+  // four-vector reduction helper.
+  int32x4_t partial_lo[4];
+  int32x4_t partial_hi[4];
+  for (int n = 0; n < 4; ++n) {
+    partial_lo[n] = vmull_s16(vget_low_s16(cols[n]), vget_low_s16(taps[n]));
+    partial_lo[n] = vmlal_s16(partial_lo[n], vget_high_s16(cols[n]),
+                              vget_high_s16(taps[n]));
+
+    partial_hi[n] =
+        vmull_s16(vget_low_s16(cols[n + 4]), vget_low_s16(taps[n + 4]));
+    partial_hi[n] = vmlal_s16(partial_hi[n], vget_high_s16(cols[n + 4]),
+                              vget_high_s16(taps[n + 4]));
+  }
+
+  *res_low = horizontal_add_4d_s32x4(partial_lo);
+  *res_high = horizontal_add_4d_s32x4(partial_hi);
+}
+
+// NEON entry point for the affine warp. Forwards every argument unchanged to
+// av1_warp_affine_common() from warp_plane_neon.h, which in turn uses the
+// horizontal_filter_* / vertical_filter_* helpers defined in this file.
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.h b/third_party/aom/av1/common/arm/warp_plane_neon.h
new file mode 100644
index 0000000000..5afd72f4ab
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+#define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha);
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha);
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx);
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx);
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy);
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma);
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy);
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma);
+
+static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) {
+  // Loads four 8-tap kernels from av1_warped_filter: out[n] is the table row
+  // at index (offset + n * stride) >> WARPEDDIFF_PREC_BITS.
+  for (int n = 0; n < 4; ++n) {
+    const int row = (offset + n * stride) >> WARPEDDIFF_PREC_BITS;
+    out[n] = vld1q_s16((int16_t *)(av1_warped_filter + row));
+  }
+}
+
+static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) {
+  // Loads eight 8-tap kernels from av1_warped_filter: out[n] is the table row
+  // at index (offset + n * stride) >> WARPEDDIFF_PREC_BITS.
+  for (int n = 0; n < 8; ++n) {
+    const int row = (offset + n * stride) >> WARPEDDIFF_PREC_BITS;
+    out[n] = vld1q_s16((int16_t *)(av1_warped_filter + row));
+  }
+}
+
+// Clamps the row index iy to the valid range [0, height - 1].
+static INLINE int clamp_iy(int iy, int height) {
+ return clamp(iy, 0, height - 1);
+}
+
+// Horizontal filtering stage of the warp for one output tile.
+//
+// Fills tmp[0 .. height_limit - 1] (height_limit = min(8, p_height - i) + 7,
+// so at most 15 rows) with horizontally filtered rows of `ref`. Row k is taken
+// from source row iy4 + k - 7, clamped to the image by clamp_iy().
+//
+// ref/width/height/stride : 8-bit reference image and its geometry.
+// p_width                 : 4 selects the 4-pixel filters, anything else the
+//                           8-pixel filters.
+// alpha, beta             : per-column / per-row filter-offset steps; a zero
+//                           alpha selects the single-kernel (_f1) filters.
+// x4, y4                  : source position of the tile in
+//                           WARPEDMODEL_PREC_BITS fixed point.
+// i                       : vertical offset of the tile inside the block.
+static INLINE void warp_affine_horizontal(const uint8_t *ref, int width,
+ int height, int stride, int p_width,
+ int p_height, int16_t alpha,
+ int16_t beta, const int64_t x4,
+ const int64_t y4, const int i,
+ int16x8_t tmp[]) {
+ const int bd = 8;
+ const int reduce_bits_horiz = ROUND0_BITS;
+ const int height_limit = AOMMIN(8, p_height - i) + 7;
+
+ // Integer part of the source position.
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+
+ // Fractional part, moved back by 4 columns and 4 rows, offset, and with the
+ // low WARP_PARAM_REDUCE_BITS bits cleared.
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ if (ix4 <= -7) {
+ // Far left of the image: every lane of a row is the same constant built
+ // from the row's first pixel, so no filtering is done.
+ for (int k = 0; k < height_limit; ++k) {
+ int iy = clamp_iy(iy4 + k - 7, height);
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ } else if (ix4 >= width + 6) {
+ // Far right of the image: same, using the row's last pixel.
+ for (int k = 0; k < height_limit; ++k) {
+ int iy = clamp_iy(iy4 + k - 7, height);
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k] = vdupq_n_s16(dup_val);
+ }
+ return;
+ }
+
+ // Lane indices 0..15, compared against the boundary positions below to
+ // build the lane-select masks.
+ static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+ const uint8x16_t indx = vld1q_u8(kIotaArr);
+
+ // Non-negative when the 16-byte load starting at column ix4 - 7 overlaps the
+ // left / right image edge.
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+// APPLY_HORIZONTAL_SHIFT(fn, args...): for every row k, loads 16 source bytes,
+// overwrites the lanes selected by the boundary masks with a duplicated pixel
+// read at offset `limit` from the load address, and stores fn(bytes, args...)
+// in tmp[k]. The arguments are expanded inside the row loop, so they may (and
+// some call sites below do) refer to the loop variable k.
+#define APPLY_HORIZONTAL_SHIFT(fn, ...) \
+ do { \
+ if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \
+ for (int k = 0; k < height_limit; ++k) { \
+ const int iy = clamp_iy(iy4 + k - 7, height); \
+ const uint8_t *src = ref + iy * stride + ix4 - 7; \
+ uint8x16_t src_1 = vld1q_u8(src); \
+ \
+ if (out_of_boundary_left >= 0) { \
+ int limit = out_of_boundary_left + 1; \
+ uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \
+ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \
+ uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1); \
+ } \
+ if (out_of_boundary_right >= 0) { \
+ int limit = 15 - (out_of_boundary_right + 1); \
+ uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \
+ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \
+ uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1); \
+ } \
+ tmp[k] = (fn)(src_1, __VA_ARGS__); \
+ } \
+ } else { \
+ for (int k = 0; k < height_limit; ++k) { \
+ const int iy = clamp_iy(iy4 + k - 7, height); \
+ const uint8_t *src = ref + iy * stride + ix4 - 7; \
+ uint8x16_t src_1 = vld1q_u8(src); \
+ tmp[k] = (fn)(src_1, __VA_ARGS__); \
+ } \
+ } \
+ } while (0)
+
+ // Pick the filter variant: 4- or 8-pixel wide, single kernel (alpha == 0) or
+ // per-pixel kernels, and a fixed row offset (beta == 0) or one that advances
+ // by beta per row.
+ if (p_width == 4) {
+ if (beta == 0) {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, sx4);
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
+ }
+ } else {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1,
+ (sx4 + beta * (k - 3)));
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)),
+ alpha);
+ }
+ }
+ } else {
+ if (beta == 0) {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, sx4);
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);
+ }
+ } else {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1,
+ (sx4 + beta * (k - 3)));
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)),
+ alpha);
+ }
+ }
+ }
+}
+
+// Vertical filtering stage of the warp for one output tile.
+//
+// Consumes the horizontally filtered rows in tmp[] and produces up to 8 output
+// rows starting at row i, column j. Output row (i + k + 4) is filtered from the
+// 8 rows beginning at tmp[k + 4].
+//
+// pred/p_stride         : 8-bit destination, written for non-compound
+//                         prediction and for the averaged compound result.
+// dst/dst_stride        : 16-bit compound buffer, written when is_compound is
+//                         set without do_average and read when averaging.
+// use_dist_wtd_comp_avg : selects the fwd/bwd weighted average over the plain
+//                         halving add.
+// gamma, delta          : per-column / per-row filter-offset steps; a zero
+//                         gamma selects the single-kernel (_f1) filters.
+static INLINE void warp_affine_vertical(
+ uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound,
+ uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg,
+ int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j,
+ int16x8_t tmp[], const int fwd, const int bwd) {
+ const int bd = 8;
+ const int reduce_bits_horiz = ROUND0_BITS;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ // Constant added to every filtered sum: an offset of 1 << offset_bits_vert
+ // plus half of the divisor of the shift applied afterwards, because the
+ // shifts below (vshrn) do not round by themselves.
+ int add_const_vert;
+ if (is_compound) {
+ add_const_vert =
+ (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1));
+ } else {
+ add_const_vert =
+ (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1));
+ }
+ // Subtracted from the shifted sums on the non-compound path, before the
+ // saturating narrow to 8 bits.
+ const int sub_constant = (1 << (bd - 1)) + (1 << bd);
+
+ // Added on the averaging compound path, before the final saturating shift
+ // to 8 bits.
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int res_sub_const =
+ (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) -
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS)) -
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+
+ // Fractional part of y4, moved back by 4 columns and 4 rows, offset, and
+ // with the low WARP_PARAM_REDUCE_BITS bits cleared.
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ if (p_width > 4) {
+ // 8-pixel-wide rows.
+ for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ int32x4_t res_lo, res_hi;
+ if (gamma == 0) {
+ vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy);
+ } else {
+ vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma);
+ }
+
+ res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+ res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert));
+
+ if (is_compound) {
+ uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+ int16x8_t res_s16 =
+ vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS),
+ vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS));
+ if (do_average) {
+ // Combine with the prediction already stored in dst and write the
+ // 8-bit result to pred.
+ int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p));
+ if (use_dist_wtd_comp_avg) {
+ int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd);
+ int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd);
+ tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd);
+ tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd);
+ tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS),
+ vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS));
+ } else {
+ tmp16 = vhaddq_s16(tmp16, res_s16);
+ }
+ int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const));
+ uint8x8_t res8 = vqshrun_n_s16(
+ res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+ vst1_u8(&pred[(i + k + 4) * p_stride + j], res8);
+ } else {
+ // First prediction of a compound pair: keep 16-bit precision in dst.
+ vst1q_u16(p, vreinterpretq_u16_s16(res_s16));
+ }
+ } else {
+ int16x8_t res16 =
+ vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS),
+ vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS));
+ res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant));
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ vst1_u8(p, vqmovun_s16(res16));
+ }
+ }
+ } else {
+ // p_width == 4
+ // Same flow as above on 4 lanes; 8-bit results are written with a single
+ // 32-bit lane store so only 4 bytes of pred are touched.
+ for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ int32x4_t res_lo;
+ if (gamma == 0) {
+ vertical_filter_4x1_f1(v_src, &res_lo, sy);
+ } else {
+ vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma);
+ }
+
+ res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert));
+
+ if (is_compound) {
+ uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j];
+
+ int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS);
+ if (do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p));
+ if (use_dist_wtd_comp_avg) {
+ int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd);
+ tmp32_lo = vmlal_n_s16(tmp32_lo, res_lo_s16, bwd);
+ tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16);
+ }
+ int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const));
+ uint8x8_t res8 = vqshrun_n_s16(
+ vcombine_s16(res, vdup_n_s16(0)),
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0);
+ } else {
+ uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16);
+ vst1_u16(p, res_u16_low);
+ }
+ } else {
+ int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS);
+ res16 = vsub_s16(res16, vdup_n_s16(sub_constant));
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0)));
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ }
+ }
+ }
+}
+
+static INLINE void av1_warp_affine_common(
+    const int32_t *mat, const uint8_t *ref, int width, int height, int stride,
+    uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
+    int p_stride, int subsampling_x, int subsampling_y,
+    ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
+    int16_t delta) {
+  // Affine warp of a p_width x p_height block, processed in tiles that advance
+  // by 8 in each direction. Each tile is projected through the model `mat`,
+  // filtered horizontally into a 15-row scratch array and then filtered
+  // vertically into pred or the compound buffer.
+  uint16_t *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int is_compound = conv_params->is_compound;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int fwd_weight = conv_params->fwd_offset;
+  const int bwd_weight = conv_params->bck_offset;
+
+  assert(IMPLIES(is_compound, comp_buf != NULL));
+  assert(IMPLIES(do_average, is_compound));
+
+  for (int i = 0; i < p_height; i += 8) {
+    // Source row of the tile centre; the same for every tile in this row.
+    const int32_t src_y = (p_row + i + 4) << subsampling_y;
+
+    for (int j = 0; j < p_width; j += 8) {
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+
+      // Project the tile centre through the affine model in 64-bit precision.
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      // Horizontally filtered rows: up to 8 output rows plus 7 rows of
+      // vertical filter support.
+      int16x8_t rows[15];
+      warp_affine_horizontal(ref, width, height, stride, p_width, p_height,
+                             alpha, beta, x4, y4, i, rows);
+      warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound,
+                           comp_buf, comp_stride, do_average,
+                           use_dist_wtd_comp_avg, gamma, delta, y4, i, j, rows,
+                           fwd_weight, bwd_weight);
+    }
+  }
+}
+
+#endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c
new file mode 100644
index 0000000000..39e3ad99f4
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "warp_plane_neon.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {  // vqtbl1q_u8 shuffle indices: each 16-byte row packs four overlapping 4-byte windows, one per 32-bit dot-product lane.
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha) {  // Horizontal pass for 4 output pixels, one filter per pixel (f[0..3]); results land in the low 4 lanes, the high 4 are zero.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+
+ // Load four filters of 8 taps each (presumably selected from sx with step alpha -- confirm in load_filters_4).
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+ // Narrow the 16-bit taps to 8 bits for the dot product; the vectors are signed despite the _u8 suffix.
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ // inK = the 8 input bytes starting at byte offset K, i.e. the sample window for output pixel K.
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ // Unsigned-by-signed dot product: each 32-bit lane accumulates four byte products, i.e. half of an 8-tap sum.
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ // Pairwise add joins the two halves, leaving one complete 8-tap sum per lane.
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =  // Saturating rounding narrow to unsigned 16 bits; the upper half is zero-filled.
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha) {  // Horizontal pass for 8 output pixels, one filter per pixel (f[0..7]).
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+
+ // Load eight filters of 8 taps each (presumably selected from sx with step alpha -- confirm in load_filters_8).
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+ // Narrow the 16-bit taps to 8 bits for the dot product; the vectors are signed despite the _u8 suffix.
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+ int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+ // inK = the 8 input bytes starting at byte offset K, i.e. the sample window for output pixel K.
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+ uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+ uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+ uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+ // Unsigned-by-signed dot product: each 32-bit lane accumulates four byte products, i.e. half of an 8-tap sum.
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+ int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+ // Pairwise adds join the halves: tmp_res_low holds pixels 0-3, tmp_res_high pixels 4-7.
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+ int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+ // Saturating rounding narrow to unsigned 16 bits, then reinterpret as signed for the caller.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {  // Horizontal pass for 4 output pixels that all share one 8-tap filter.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+ // A single filter row, indexed by sx >> WARPEDDIFF_PREC_BITS, serves every output pixel.
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+ // Narrowed to 8 bits and duplicated: 32-bit lane 0 holds taps 0-3, lane 1 holds taps 4-7.
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ // Output pixel k = in[k..k+3] . taps 0-3 + in[k+4..k+7] . taps 4-7.
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =  // Saturating rounding narrow to unsigned 16 bits; the upper half is zero-filled.
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {  // Horizontal pass for 8 output pixels that all share one 8-tap filter.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+ // A single filter row, indexed by sx >> WARPEDDIFF_PREC_BITS, serves every output pixel.
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+ // Narrowed to 8 bits and duplicated: 32-bit lane 0 holds taps 0-3, lane 1 holds taps 4-7.
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+ uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+ // Pixels 0-3: windows from in_0123 with taps 0-3, plus windows from in_4567 with taps 4-7.
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+ // Pixels 4-7: in_4567 is reused with taps 0-3, and in_89ab supplies the taps 4-7 windows.
+ int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+ m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+ int32x4_t tmp_res_high = m4567;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+ // Saturating rounding narrow to unsigned 16 bits, then reinterpret as signed for the caller.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy) {  // Vertical pass for 4 columns sharing one 8-tap filter; reads the low 4 lanes of src[0..7].
+ int16x4_t s0 = vget_low_s16(src[0]);
+ int16x4_t s1 = vget_low_s16(src[1]);
+ int16x4_t s2 = vget_low_s16(src[2]);
+ int16x4_t s3 = vget_low_s16(src[3]);
+ int16x4_t s4 = vget_low_s16(src[4]);
+ int16x4_t s5 = vget_low_s16(src[5]);
+ int16x4_t s6 = vget_low_s16(src[6]);
+ int16x4_t s7 = vget_low_s16(src[7]);
+ // A single filter row, indexed by sy >> WARPEDDIFF_PREC_BITS, serves every column.
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ // Widening multiply-accumulate: row r is scaled by tap r (taps 0-3 from the low half of f, 4-7 from the high half).
+ int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+ *res = m0123;
+}
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma) {  // Vertical pass for 4 columns, each with its own filter f[0..3].
+ int16x8_t s0, s1, s2, s3;  // After the transpose each sK presumably holds the 8 row samples of column K -- confirm in transpose_elems_s16_4x8.
+ transpose_elems_s16_4x8(
+ vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+ vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+ vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+ // Load four filters of 8 taps each (presumably selected from sy with step gamma -- confirm in load_filters_4).
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+ // Lane-wise products of each sK with f[K]; the low and high halves accumulate into the same four 32-bit lanes.
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ // Reduce the four partial-sum vectors to one result vector (presumably lane K = sum of mK's lanes -- confirm in horizontal_add_4d_s32x4).
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+
+ *res = horizontal_add_4d_s32x4(m0123_pairs);
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy) {  // Vertical pass for 8 columns sharing one 8-tap filter; reads src[0..7].
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ // A single filter row, indexed by sy >> WARPEDDIFF_PREC_BITS, serves every column.
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ // Columns 0-3: the low half of row r is scaled by tap r and accumulated in 32 bits.
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+ // Columns 4-7: the same taps applied to the high half of each row.
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+ *res_low = m0123;
+ *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma) {  // Vertical pass for 8 columns, each with its own filter f[0..7].
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);  // After this, sK presumably holds the 8 row samples of column K -- confirm in the helper.
+ // Load eight filters of 8 taps each (presumably selected from sy with step gamma -- confirm in load_filters_8).
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+ // Lane-wise products of each sK with f[K]; the low and high halves accumulate into the same four 32-bit lanes.
+ int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0]));
+ m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0]));
+ int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1]));
+ m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1]));
+ int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2]));
+ m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2]));
+ int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3]));
+ m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3]));
+ int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4]));
+ m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4]));
+ int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5]));
+ m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5]));
+ int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6]));
+ m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6]));
+ int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7]));
+ m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7]));
+ // Reduce each group of four partial-sum vectors to one result vector (presumably lane K = sum of mK's lanes -- confirm in horizontal_add_4d_s32x4).
+ int32x4_t m0123_pairs[] = { m0, m1, m2, m3 };
+ int32x4_t m4567_pairs[] = { m4, m5, m6, m7 };
+
+ *res_low = horizontal_add_4d_s32x4(m0123_pairs);
+ *res_high = horizontal_add_4d_s32x4(m4567_pairs);
+}
+
+void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref,
+ int width, int height, int stride, uint8_t *pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params,
+ int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {  // Non-static entry point: forwards every argument unchanged to av1_warp_affine_common().
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c
new file mode 100644
index 0000000000..8a4bf5747b
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_sve.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "warp_plane_neon.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = {  // vqtbl1q_u8 shuffle indices: each 16-byte row packs four overlapping 4-byte windows, one per 32-bit dot-product lane.
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx,
+ int alpha) {  // Horizontal pass for 4 output pixels, one filter per pixel (f[0..3]); results land in the low 4 lanes, the high 4 are zero.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+
+ // Load four filters of 8 taps each (presumably selected from sx with step alpha -- confirm in load_filters_4).
+ int16x8_t f[4];
+ load_filters_4(f, sx, alpha);
+ // Narrow the 16-bit taps to 8 bits for the dot product; the vectors are signed despite the _u8 suffix.
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ // inK = the 8 input bytes starting at byte offset K, i.e. the sample window for output pixel K.
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ // Unsigned-by-signed dot product: each 32-bit lane accumulates four byte products, i.e. half of an 8-tap sum.
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ // Pairwise add joins the two halves, leaving one complete 8-tap sum per lane.
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =  // Saturating rounding narrow to unsigned 16 bits; the upper half is zero-filled.
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx,
+ int alpha) {  // Horizontal pass for 8 output pixels, one filter per pixel (f[0..7]).
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+
+ // Load eight filters of 8 taps each (presumably selected from sx with step alpha -- confirm in load_filters_8).
+ int16x8_t f[8];
+ load_filters_8(f, sx, alpha);
+ // Narrow the 16-bit taps to 8 bits for the dot product; the vectors are signed despite the _u8 suffix.
+ int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1]));
+ int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3]));
+ int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5]));
+ int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7]));
+ // inK = the 8 input bytes starting at byte offset K, i.e. the sample window for output pixel K.
+ uint8x8_t in0 = vget_low_u8(in);
+ uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1));
+ uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2));
+ uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3));
+ uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4));
+ uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5));
+ uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6));
+ uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7));
+ // Unsigned-by-signed dot product: each 32-bit lane accumulates four byte products, i.e. half of an 8-tap sum.
+ int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8);
+ int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8);
+ int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8);
+ int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8);
+ // Pairwise adds join the halves: tmp_res_low holds pixels 0-3, tmp_res_high pixels 4-7.
+ int32x4_t tmp_res_low = vpaddq_s32(m01, m23);
+ int32x4_t tmp_res_high = vpaddq_s32(m45, m67);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+ // Saturating rounding narrow to unsigned 16 bits, then reinterpret as signed for the caller.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) {  // Horizontal pass for 4 output pixels that all share one 8-tap filter.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+ // A single filter row, indexed by sx >> WARPEDDIFF_PREC_BITS, serves every output pixel.
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+ // Narrowed to 8 bits and duplicated: 32-bit lane 0 holds taps 0-3, lane 1 holds taps 4-7.
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ // Output pixel k = in[k..k+3] . taps 0-3 + in[k+4..k+7] . taps 4-7.
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+
+ uint16x8_t res =  // Saturating rounding narrow to unsigned 16 bits; the upper half is zero-filled.
+ vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) {  // Horizontal pass for 8 output pixels that all share one 8-tap filter.
+ const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));  // Constant added to every sum before the rounding shift.
+ // A single filter row, indexed by sx >> WARPEDDIFF_PREC_BITS, serves every output pixel.
+ int16x8_t f_s16 =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+ // Narrowed to 8 bits and duplicated: 32-bit lane 0 holds taps 0-3, lane 1 holds taps 4-7.
+ int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
+
+ uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
+ uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]);
+ uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]);
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t in_0123 = vqtbl1q_u8(in, perm0);
+ uint8x16_t in_4567 = vqtbl1q_u8(in, perm1);
+ uint8x16_t in_89ab = vqtbl1q_u8(in, perm2);
+ // Pixels 0-3: windows from in_0123 with taps 0-3, plus windows from in_4567 with taps 4-7.
+ int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0);
+ m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1);
+ // Pixels 4-7: in_4567 is reused with taps 0-3, and in_89ab supplies the taps 4-7 windows.
+ int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0);
+ m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1);
+
+ int32x4_t tmp_res_low = m0123;
+ int32x4_t tmp_res_high = m4567;
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+ // Saturating rounding narrow to unsigned 16 bits, then reinterpret as signed for the caller.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS),
+ vqrshrun_n_s32(tmp_res_high, ROUND0_BITS));
+ return vreinterpretq_s16_u16(res);
+}
+
+static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res,
+ int sy) {  // Vertical pass for 4 columns sharing one 8-tap filter; reads the low 4 lanes of src[0..7].
+ int16x4_t s0 = vget_low_s16(src[0]);
+ int16x4_t s1 = vget_low_s16(src[1]);
+ int16x4_t s2 = vget_low_s16(src[2]);
+ int16x4_t s3 = vget_low_s16(src[3]);
+ int16x4_t s4 = vget_low_s16(src[4]);
+ int16x4_t s5 = vget_low_s16(src[5]);
+ int16x4_t s6 = vget_low_s16(src[6]);
+ int16x4_t s7 = vget_low_s16(src[7]);
+ // A single filter row, indexed by sy >> WARPEDDIFF_PREC_BITS, serves every column.
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ // Widening multiply-accumulate: row r is scaled by tap r (taps 0-3 from the low half of f, 4-7 from the high half).
+ int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3);
+
+ *res = m0123;
+}
+
+static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res,
+ int sy, int gamma) {  // Vertical pass for 4 columns, each with its own filter f[0..3].
+ int16x8_t s0, s1, s2, s3;  // After the transpose each sK presumably holds the 8 row samples of column K -- confirm in transpose_elems_s16_4x8.
+ transpose_elems_s16_4x8(
+ vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]),
+ vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]),
+ vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3);
+ // Load four filters of 8 taps each (presumably selected from sy with step gamma -- confirm in load_filters_4).
+ int16x8_t f[4];
+ load_filters_4(f, sy, gamma);
+ // Dot product of each sK with f[K] into two 64-bit lanes (presumably a signed 16-bit dot product -- confirm in aom_sdotq_s16).
+ int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+ int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+ int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+ int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+ // Pairwise adds fold each accumulator's two lanes into one total: m01 = { sum(m0), sum(m1) }, m23 = { sum(m2), sum(m3) }.
+ int64x2_t m01 = vpaddq_s64(m0, m1);
+ int64x2_t m23 = vpaddq_s64(m2, m3);
+ // Narrow the 64-bit totals to 32 bits (plain truncation, no saturation).
+ *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+}
+
+static INLINE void vertical_filter_8x1_f1(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy) {  // Vertical pass for 8 columns sharing one 8-tap filter; reads src[0..7].
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ // A single filter row, indexed by sy >> WARPEDDIFF_PREC_BITS, serves every column.
+ int16x8_t f =
+ vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+ // Columns 0-3: the low half of row r is scaled by tap r and accumulated in 32 bits.
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2);
+ m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3);
+ // Columns 4-7: the same taps applied to the high half of each row.
+ int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2);
+ m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3);
+
+ *res_low = m0123;
+ *res_high = m4567;
+}
+
+static INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
+ int32x4_t *res_low,
+ int32x4_t *res_high, int sy,
+ int gamma) {  // Vertical pass for 8 columns, each with its own filter f[0..7].
+ int16x8_t s0 = src[0];
+ int16x8_t s1 = src[1];
+ int16x8_t s2 = src[2];
+ int16x8_t s3 = src[3];
+ int16x8_t s4 = src[4];
+ int16x8_t s5 = src[5];
+ int16x8_t s6 = src[6];
+ int16x8_t s7 = src[7];
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);  // After this, sK presumably holds the 8 row samples of column K -- confirm in the helper.
+ // Load eight filters of 8 taps each (presumably selected from sy with step gamma -- confirm in load_filters_8).
+ int16x8_t f[8];
+ load_filters_8(f, sy, gamma);
+ // Dot product of each sK with f[K] into two 64-bit lanes (presumably a signed 16-bit dot product -- confirm in aom_sdotq_s16).
+ int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]);
+ int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]);
+ int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]);
+ int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]);
+ int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]);
+ int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]);
+ int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]);
+ int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]);
+ // Pairwise adds fold each accumulator's two lanes into one total per column.
+ int64x2_t m01 = vpaddq_s64(m0, m1);
+ int64x2_t m23 = vpaddq_s64(m2, m3);
+ int64x2_t m45 = vpaddq_s64(m4, m5);
+ int64x2_t m67 = vpaddq_s64(m6, m7);
+ // Narrow the 64-bit totals to 32 bits (plain truncation, no saturation): columns 0-3 in res_low, 4-7 in res_high.
+ *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23));
+ *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67));
+}
+
+void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {  // Non-static entry point: forwards every argument unchanged to av1_warp_affine_common().
+ av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, conv_params, alpha, beta, gamma, delta);
+}
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 0000000000..6440c16adb
--- /dev/null
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/restoration.h"
+
+static INLINE uint16x8_t wiener_convolve5_8_2d_h(
+ const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+ const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {  // Horizontal 5-tap Wiener filter for 8 pixels; t0..t4 are the five source vectors, one per tap.
+ // Since the Wiener filter is symmetric about the middle tap (tap 2) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4));
+ int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+ // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) Hence only lanes 1-3 are used; round_vec seeds the accumulators.
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3);
+ // Saturating rounding shift down by WIENER_ROUND0_BITS into unsigned 16 bits, then clamp to im_max_val.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+ vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+ return vminq_u16(res, im_max_val);
+}
+
+static INLINE void convolve_add_src_horiz_5tap_neon(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {  // Applies the 5-tap horizontal filter to h rows of w pixels, storing 16-bit results at dst_ptr.
+ do {
+ const uint8_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+ int width = w;
+ // do-while loops exit on exactly zero, so w must be a positive multiple of 8 and h must be positive.
+ do {
+ uint8x8_t s0, s1, s2, s3, s4;
+ load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4);  // Stride of 1: presumably five overlapping 8-byte loads at s+0..s+4 -- confirm in load_u8_8x5.
+
+ uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter,
+ round_vec, im_max_val);
+
+ vst1q_u16(d, d0);
+ // Advance 8 pixels along the row.
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+}
+
+static INLINE uint16x8_t wiener_convolve7_8_2d_h(
+ const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2,
+ const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5,
+ const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec,
+ const uint16x8_t im_max_val) {  // Horizontal 7-tap Wiener filter for 8 pixels; t0..t6 are the seven source vectors, one per tap.
+ // Since the Wiener filter is symmetric about the middle tap (tap 3) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+ int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ // Only four coefficient lanes are needed thanks to the symmetry; round_vec seeds the accumulators.
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3);
+ // Saturating rounding shift down by WIENER_ROUND0_BITS into unsigned 16 bits, then clamp to im_max_val.
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS),
+ vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS));
+
+ return vminq_u16(res, im_max_val);
+}
+
+static INLINE void convolve_add_src_horiz_7tap_neon(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int32x4_t round_vec, const uint16x8_t im_max_val) {  // Applies the 7-tap horizontal filter to h rows of w pixels, storing 16-bit results at dst_ptr.
+ do {
+ const uint8_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+ int width = w;
+ // do-while loops exit on exactly zero, so w must be a positive multiple of 8 and h must be positive.
+ do {
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6);  // Stride of 1: presumably seven overlapping 8-byte loads at s+0..s+6 -- confirm in load_u8_8x7.
+
+ uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6,
+ x_filter, round_vec, im_max_val);
+
+ vst1q_u16(d, d0);
+ // Advance 8 pixels along the row.
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ } while (--h != 0);
+}
+
+static INLINE uint8x8_t wiener_convolve5_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter,
+ const int32x4_t round_vec) {  // Vertical 5-tap Wiener filter for 8 pixels; s0..s4 are five consecutive intermediate rows.
+ // Since the Wiener filter is symmetric about the middle tap (tap 2) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s04 = vaddq_s16(s0, s4);
+ int16x8_t s13 = vaddq_s16(s1, s3);
+ // Only lanes 1-3 of y_filter are used; round_vec seeds the accumulators.
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3);
+ // Truncating narrowing shift: any rounding term has to arrive through round_vec.
+ int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ // Saturate the signed 16-bit results to unsigned 8 bits.
+ return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+
+static INLINE void convolve_add_src_vert_5tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+ const int32x4_t round_vec) {  // Vertical 5-tap pass over 16-bit intermediate rows, one 8-pixel-wide strip at a time, writing 8-bit output.
+ do {
+ const int16_t *s = (int16_t *)src;  // Intermediate samples are reinterpreted as signed 16-bit.
+ uint8_t *d = dst;
+ int height = h;
+ // Main loop: 8 input rows produce 4 output rows per iteration.
+ while (height > 3) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ uint8x8_t d0 =
+ wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+ uint8x8_t d1 =
+ wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec);
+ uint8x8_t d2 =
+ wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec);
+ uint8x8_t d3 =
+ wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ }
+ // Tail: the remaining 0-3 rows, one output row per iteration.
+ while (height-- != 0) {
+ int16x8_t s0, s1, s2, s3, s4;
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
+
+ uint8x8_t d0 =
+ wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec);
+
+ vst1_u8(d, d0);
+
+ d += dst_stride;
+ s += src_stride;
+ }
+ // Move to the next 8-pixel-wide strip; the exact-zero exit means w must be a positive multiple of 8.
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static INLINE uint8x8_t wiener_convolve7_8_2d_v(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) {  // Vertical 7-tap Wiener filter for 8 pixels; s0..s6 are seven consecutive intermediate rows.
+ // Since the Wiener filter is symmetric about the middle tap (tap 3) add
+ // mirrored source elements before multiplying by filter coefficients.
+ int16x8_t s06 = vaddq_s16(s0, s6);
+ int16x8_t s15 = vaddq_s16(s1, s5);
+ int16x8_t s24 = vaddq_s16(s2, s4);
+ // Only four coefficient lanes are needed thanks to the symmetry; round_vec seeds the accumulators.
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3);
+
+ int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3);
+ // Truncating narrowing shift: any rounding term has to arrive through round_vec.
+ int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS);
+ // Saturate the signed 16-bit results to unsigned 8 bits.
+ return vqmovun_s16(vcombine_s16(res_lo, res_hi));
+}
+
+static INLINE void convolve_add_src_vert_7tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter,
+ const int32x4_t round_vec) {  // Vertical 7-tap pass over 16-bit intermediate rows, one 8-pixel-wide strip at a time, writing 8-bit output.
+ do {
+ const int16_t *s = (int16_t *)src;  // Intermediate samples are reinterpreted as signed 16-bit.
+ uint8_t *d = dst;
+ int height = h;
+ // Main loop: 10 input rows produce 4 output rows per iteration.
+ while (height > 3) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+ &s9);
+
+ uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+ y_filter, round_vec);
+ uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_vec);
+ uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8,
+ y_filter, round_vec);
+ uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9,
+ y_filter, round_vec);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ }
+ // Tail: the remaining 0-3 rows, one output row per iteration.
+ while (height-- != 0) {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6,
+ y_filter, round_vec);
+
+ vst1_u8(d, d0);
+
+ d += dst_stride;
+ s += src_stride;
+ }
+ // Move to the next 8-pixel-wide strip; the exact-zero exit means w must be a positive multiple of 8.
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {  // Returns WIENER_WIN_REDUCED when both outer taps (filter[0] and filter[6]) are zero, otherwise WIENER_WIN.
+ assert(filter[7] == 0);  // The eighth entry is expected to be zero padding.
+ if (filter[0] == 0 && filter[6] == 0) {
+ return WIENER_WIN_REDUCED;
+ }
+ return WIENER_WIN;
+}
+
+// Wiener filter 2D
+// Apply horizontal filter and store in a temporary buffer. When applying
+// vertical filter, overwrite the original pixel values.
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4,
+ int w, int h,
+ const WienerConvolveParams *conv_params) {
+ (void)x_step_q4;  // These three parameters are referenced only by the asserts below.
+ (void)y_step_q4;
+ (void)conv_params;
+
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+ // For bd == 8, assert horizontal filtering output will not exceed 15-bit:
+ assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15);
+ // 16-bit scratch buffer for the horizontal pass: MAX_SB_SIZE + WIENER_WIN - 1 rows of MAX_SB_SIZE samples.
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
+ const int y_filter_taps = get_wiener_filter_taps(y_filter);
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);  // Only the first four coefficients are loaded; the kernels exploit the filter symmetry.
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+ // Add 128 to tap 3. (Needed for rounding.) vcreate_s16(128ULL << 48) places the 128 in lane 3.
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
+ // The horizontal pass must produce taps - 1 extra rows for the vertical pass; reads start taps / 2 samples up and to the left of src.
+ const int im_stride = MAX_SB_SIZE;
+ const int im_h = h + y_filter_taps - 1;
+ const int horiz_offset = x_filter_taps / 2;
+ const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
+
+ const int bd = 8;
+ const uint16x8_t im_max_val =
+ vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+ // Vertical constant: a half-unit rounding term for the final shift, minus a 1 << (bd + shift - 1) offset.
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) -
+ (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1)));
+ // Horizontal pass into im_block (im_h rows), using the 5-tap kernel when the outer taps are zero.
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter_s16, horiz_round_vec, im_max_val);
+ }
+ // Vertical pass from im_block into dst, again choosing between the 5- and 7-tap kernels.
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_s16, vert_round_vec);
+ } else {
+ convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_s16, vert_round_vec);
+ }
+}
diff --git a/third_party/aom/av1/common/av1_common_int.h b/third_party/aom/av1/common/av1_common_int.h
new file mode 100644
index 0000000000..4c0cb99d2b
--- /dev/null
+++ b/third_party/aom/av1/common/av1_common_int.h
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_
+#define AOM_AV1_COMMON_AV1_COMMON_INT_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_util/aom_thread.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/enums.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/timing.h"
+#include "aom_dsp/grain_params.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/odintrin.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) && defined(__has_warning)
+#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT
+#endif
+
+#ifndef AOM_FALLTHROUGH_INTENDED
+#define AOM_FALLTHROUGH_INTENDED \
+ do { \
+ } while (0)
+#endif
+
+#define CDEF_MAX_STRENGTHS 16
+
+/* Constant values while waiting for the sequence header */
+#define FRAME_ID_LENGTH 15
+#define DELTA_FRAME_ID_LENGTH 14
+
+#define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
+// Extra frame context which is always kept at default values
+#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
+#define PRIMARY_REF_BITS 3
+#define PRIMARY_REF_NONE 7
+
+#define NUM_PING_PONG_BUFFERS 2
+
+#define MAX_NUM_TEMPORAL_LAYERS 8
+#define MAX_NUM_SPATIAL_LAYERS 4
+/* clang-format off */
+// clang-format seems to think this is a pointer dereference and not a
+// multiplication.
+#define MAX_NUM_OPERATING_POINTS \
+ (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS)
+/* clang-format on */
+
+// TODO(jingning): Turning this on to set up transform coefficient
+// processing timer.
+#define TXCOEFF_TIMER 0
+#define TXCOEFF_COST_TIMER 0
+
+/*!\cond */
+
+enum {
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} UENUM1BYTE(REFERENCE_MODE);
+
+enum {
+ /**
+ * Frame context updates are disabled
+ */
+ REFRESH_FRAME_CONTEXT_DISABLED,
+ /**
+ * Update frame context to values resulting from backward probability
+ * updates based on entropy/counts in the decoded frame
+ */
+ REFRESH_FRAME_CONTEXT_BACKWARD,
+} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE);
+
+#define MFMV_STACK_SIZE 3
+typedef struct {
+ int_mv mfmv0;
+ uint8_t ref_frame_offset;
+} TPL_MV_REF;
+
+typedef struct {
+ int_mv mv;
+ MV_REFERENCE_FRAME ref_frame;
+} MV_REF;
+
+typedef struct RefCntBuffer {
+ // For a RefCntBuffer, the following are reference-holding variables:
+ // - cm->ref_frame_map[]
+ // - cm->cur_frame
+ // - cm->scaled_ref_buf[] (encoder only)
+ // - pbi->output_frame_index[] (decoder only)
+ // With that definition, 'ref_count' is the number of reference-holding
+ // variables that are currently referencing this buffer.
+ // For example:
+ // - suppose this buffer is at index 'k' in the buffer pool, and
+ // - Total 'n' of the variables / array elements above have value 'k' (that
+ // is, they are pointing to buffer at index 'k').
+ // Then, pool->frame_bufs[k].ref_count = n.
+ int ref_count;
+
+ unsigned int order_hint;
+ unsigned int ref_order_hints[INTER_REFS_PER_FRAME];
+
+ // These variables are used only in encoder and compare the absolute
+ // display order hint to compute the relative distance and overcome
+ // the limitation of get_relative_dist() which returns incorrect
+ // distance when a very old frame is used as a reference.
+ unsigned int display_order_hint;
+ unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
+ // Frame's level within the hierarchical structure.
+ unsigned int pyramid_level;
+ MV_REF *mvs;
+ uint8_t *seg_map;
+ struct segmentation seg;
+ int mi_rows;
+ int mi_cols;
+ // Width and height give the size of the buffer (before any upscaling, unlike
+ // the sizes that can be derived from the buf structure)
+ int width;
+ int height;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ int showable_frame; // frame can be used as show existing frame in future
+ uint8_t film_grain_params_present;
+ aom_film_grain_t film_grain_params;
+ aom_codec_frame_buffer_t raw_frame_buffer;
+ YV12_BUFFER_CONFIG buf;
+ int temporal_id; // Temporal layer ID of the frame
+ int spatial_id; // Spatial layer ID of the frame
+ FRAME_TYPE frame_type;
+
+ // This is only used in the encoder but needs to be indexed per ref frame
+ // so it's extremely convenient to keep it here.
+ int interp_filter_selected[SWITCHABLE];
+
+ // Inter frame reference frame delta for loop filter
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+ FRAME_CONTEXT frame_context;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+// TODO(wtc): Remove this. See
+// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer *frame_bufs;
+ uint8_t num_frame_bufs;
+
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+/*!\endcond */
+
+/*!\brief Parameters related to CDEF */
+typedef struct {
+ //! CDEF column line buffer
+ uint16_t *colbuf[MAX_MB_PLANE];
+ //! CDEF top & bottom line buffer
+ uint16_t *linebuf[MAX_MB_PLANE];
+ //! CDEF intermediate buffer
+ uint16_t *srcbuf;
+ //! CDEF column line buffer sizes
+ size_t allocated_colbuf_size[MAX_MB_PLANE];
+ //! CDEF top and bottom line buffer sizes
+ size_t allocated_linebuf_size[MAX_MB_PLANE];
+ //! CDEF intermediate buffer size
+ size_t allocated_srcbuf_size;
+ //! CDEF damping factor
+ int cdef_damping;
+ //! Number of CDEF strength values
+ int nb_cdef_strengths;
+ //! CDEF strength values for luma
+ int cdef_strengths[CDEF_MAX_STRENGTHS];
+ //! CDEF strength values for chroma
+ int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+ //! Number of CDEF strength values in bits
+ int cdef_bits;
+ //! Number of rows in the frame in 4 pixel units
+ int allocated_mi_rows;
+ //! Number of CDEF workers
+ int allocated_num_workers;
+} CdefInfo;
+
+/*!\cond */
+
+typedef struct {
+ int delta_q_present_flag;
+ // Resolution of delta quant
+ int delta_q_res;
+ int delta_lf_present_flag;
+ // Resolution of delta lf level
+ int delta_lf_res;
+ // This is a flag for number of deltas of loop filter level
+ // 0: use 1 delta, for y_vertical, y_horizontal, u, and v
+ // 1: use separate deltas for each filter level
+ int delta_lf_multi;
+} DeltaQInfo;
+
+typedef struct {
+ int enable_order_hint; // 0 - disable order hint and the related tools
+ // (dist_wtd_comp, ref_frame_mvs, frame_sign_bias);
+ // when 0, enable_dist_wtd_comp and enable_ref_frame_mvs must be set as 0.
+ int order_hint_bits_minus_1; // NOTE(review): undocumented here; presumably
+ // the order hint bit width minus 1 - confirm.
+ int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes
+ // 1 - enable it
+ int enable_ref_frame_mvs; // 0 - disable ref frame mvs
+ // 1 - enable it
+} OrderHintInfo;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by are_seq_headers_consistent() function.
+// One exception is the last member 'op_params' that is ignored by
+// are_seq_headers_consistent() function.
+typedef struct SequenceHeader {
+ int num_bits_width;
+ int num_bits_height;
+ int max_frame_width;
+ int max_frame_height;
+ // Whether current and reference frame IDs are signaled in the bitstream.
+ // Frame id numbers are additional information that do not affect the
+ // decoding process, but provide decoders with a way of detecting missing
+ // reference frames so that appropriate action can be taken.
+ uint8_t frame_id_numbers_present_flag;
+ int frame_id_length;
+ int delta_frame_id_length;
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
+
+ OrderHintInfo order_hint_info;
+
+ uint8_t force_screen_content_tools; // 0 - force off
+ // 1 - force on
+ // 2 - adaptive
+ uint8_t still_picture; // Video is a single frame still picture
+ uint8_t reduced_still_picture_hdr; // Use reduced header for still picture
+ uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel
+ // 1 - force to integer
+ // 2 - adaptive
+ uint8_t enable_filter_intra; // enables/disables filterintra
+ uint8_t enable_intra_edge_filter; // enables/disables edge upsampling
+ uint8_t enable_interintra_compound; // enables/disables interintra_compound
+ uint8_t enable_masked_compound; // enables/disables masked compound
+ uint8_t enable_dual_filter; // 0 - disable dual interpolation filter
+ // 1 - enable vert/horz filter selection
+ uint8_t enable_warped_motion; // 0 - disable warp for the sequence
+ // 1 - enable warp for the sequence
+ uint8_t enable_superres; // 0 - Disable superres for the sequence
+ // and no frame level superres flag
+ // 1 - Enable superres for the sequence
+ // enable per-frame superres flag
+ uint8_t enable_cdef; // To turn on/off CDEF
+ uint8_t enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers.
+ uint8_t monochrome; // Monochrome video
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ uint8_t separate_uv_delta_q;
+ uint8_t film_grain_params_present;
+
+ // Operating point info.
+ int operating_points_cnt_minus_1;
+ int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ uint8_t decoder_model_info_present_flag;
+ aom_dec_model_info_t decoder_model_info;
+ uint8_t display_model_info_present_flag;
+ AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1.
+
+ // IMPORTANT: the op_params member must be at the end of the struct so that
+ // are_seq_headers_consistent() can be implemented with a memcmp() call.
+ // TODO(urvang): We probably don't need the +1 here.
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+} SequenceHeader;
+
+typedef struct {
+ int skip_mode_allowed;
+ int skip_mode_flag;
+ int ref_frame_idx_0;
+ int ref_frame_idx_1;
+} SkipModeInfo;
+
+typedef struct {
+ FRAME_TYPE frame_type;
+ REFERENCE_MODE reference_mode;
+
+ unsigned int order_hint;
+ unsigned int display_order_hint;
+ // Frame's level within the hierarchical structure.
+ unsigned int pyramid_level;
+ unsigned int frame_number;
+ SkipModeInfo skip_mode_info;
+ int refresh_frame_flags; // Which ref frames are overwritten by this frame
+ int frame_refs_short_signaling;
+} CurrentFrame;
+
+/*!\endcond */
+
+/*!
+ * \brief Frame level features.
+ */
+typedef struct {
+ /*!
+ * If true, CDF update in the symbol encoding/decoding process is disabled.
+ */
+ bool disable_cdf_update;
+ /*!
+ * If true, motion vectors are specified to eighth pel precision; and
+ * if false, motion vectors are specified to quarter pel precision.
+ */
+ bool allow_high_precision_mv;
+ /*!
+ * If true, force integer motion vectors; if false, use the default.
+ */
+ bool cur_frame_force_integer_mv;
+ /*!
+ * If true, palette tool and/or intra block copy tools may be used.
+ */
+ bool allow_screen_content_tools;
+ bool allow_intrabc; /*!< If true, intra block copy tool may be used. */
+ bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */
+ /*!
+ * If true, using previous frames' motion vectors for prediction is allowed.
+ */
+ bool allow_ref_frame_mvs;
+ /*!
+ * If true, frame is fully lossless at coded resolution.
+ * */
+ bool coded_lossless;
+ /*!
+ * If true, frame is fully lossless at upscaled resolution.
+ */
+ bool all_lossless;
+ /*!
+ * If true, the frame is restricted to a reduced subset of the full set of
+ * transform types.
+ */
+ bool reduced_tx_set_used;
+ /*!
+ * If true, error resilient mode is enabled.
+ * Note: Error resilient mode allows the syntax of a frame to be parsed
+ * independently of previously decoded frames.
+ */
+ bool error_resilient_mode;
+ /*!
+ * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION;
+ * if true, all MOTION_MODES may be used.
+ */
+ bool switchable_motion_mode;
+ TX_MODE tx_mode; /*!< Transform mode at frame level. */
+ InterpFilter interp_filter; /*!< Interpolation filter at frame level. */
+ /*!
+ * The reference frame that contains the CDF values and other state that
+ * should be loaded at the start of the frame.
+ */
+ int primary_ref_frame;
+ /*!
+ * Byte alignment of the planes in the reference buffers.
+ */
+ int byte_alignment;
+ /*!
+ * Flag signaling how frame contexts should be updated at the end of
+ * a frame decode.
+ */
+ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+} FeatureFlags;
+
+/*!
+ * \brief Params related to tiles.
+ */
+typedef struct CommonTileParams {
+ int cols; /*!< number of tile columns that frame is divided into */
+ int rows; /*!< number of tile rows that frame is divided into */
+ int max_width_sb; /*!< maximum tile width in superblock units. */
+ int max_height_sb; /*!< maximum tile height in superblock units. */
+
+ /*!
+ * Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+ */
+ int min_inner_width;
+
+ /*!
+ * If true, tiles are uniformly spaced with power-of-two number of rows and
+ * columns.
+ * If false, tiles have explicitly configured widths and heights.
+ */
+ int uniform_spacing;
+
+ /**
+ * \name Members only valid when uniform_spacing == 1
+ */
+ /**@{*/
+ int log2_cols; /*!< log2 of 'cols'. */
+ int log2_rows; /*!< log2 of 'rows'. */
+ int width; /*!< tile width in MI units */
+ int height; /*!< tile height in MI units */
+ /**@}*/
+
+ /*!
+ * Min num of tile columns possible based on 'max_width_sb' and frame width.
+ */
+ int min_log2_cols;
+ /*!
+ * Min num of tile rows possible based on 'max_height_sb' and frame height.
+ */
+ int min_log2_rows;
+ /*!
+ * Max num of tile columns possible based on frame width.
+ */
+ int max_log2_cols;
+ /*!
+ * Max num of tile rows possible based on frame height.
+ */
+ int max_log2_rows;
+ /*!
+ * log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+ */
+ int min_log2;
+ /*!
+ * col_start_sb[i] is the start position of tile column i in superblock units.
+ * valid for 0 <= i <= cols
+ */
+ int col_start_sb[MAX_TILE_COLS + 1];
+ /*!
+ * row_start_sb[i] is the start position of tile row i in superblock units.
+ * valid for 0 <= i <= rows
+ */
+ int row_start_sb[MAX_TILE_ROWS + 1];
+ /*!
+ * If true, we are using large scale tile mode.
+ */
+ unsigned int large_scale;
+ /*!
+ * Only relevant when large_scale == 1.
+ * If true, the independent decoding of a single tile or a section of a frame
+ * is allowed.
+ */
+ unsigned int single_tile_decoding;
+} CommonTileParams;
+
+typedef struct CommonModeInfoParams CommonModeInfoParams;
+/*!
+ * \brief Params related to MB_MODE_INFO arrays and related info.
+ */
+struct CommonModeInfoParams {
+ /*!
+ * Number of rows in the frame in 16 pixel units.
+ * This is computed from frame height aligned to a multiple of 8.
+ */
+ int mb_rows;
+ /*!
+ * Number of cols in the frame in 16 pixel units.
+ * This is computed from frame width aligned to a multiple of 8.
+ */
+ int mb_cols;
+
+ /*!
+ * Total MBs = mb_rows * mb_cols.
+ */
+ int MBs;
+
+ /*!
+ * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units.
+ * This is computed from frame height aligned to a multiple of 8.
+ */
+ int mi_rows;
+ /*!
+ * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units.
+ * This is computed from frame width aligned to a multiple of 8.
+ */
+ int mi_cols;
+
+ /*!
+ * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+ * in the frame.
+ * Note: This array should be treated like a scratch memory, and should NOT be
+ * accessed directly, in most cases. Please use 'mi_grid_base' array instead.
+ */
+ MB_MODE_INFO *mi_alloc;
+ /*!
+ * Number of allocated elements in 'mi_alloc'.
+ */
+ int mi_alloc_size;
+ /*!
+ * Stride for 'mi_alloc' array.
+ */
+ int mi_alloc_stride;
+ /*!
+ * The minimum block size that each element in 'mi_alloc' can correspond to.
+ * For decoder, this is always BLOCK_4X4.
+ * For encoder, this is BLOCK_8X8 for resolution >= 4k case or REALTIME mode
+ * case. Otherwise, this is BLOCK_4X4.
+ */
+ BLOCK_SIZE mi_alloc_bsize;
+
+ /*!
+ * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+ * It's possible that:
+ * - Multiple pointers in the grid point to the same element in 'mi_alloc'
+ * (for example, for all 4x4 blocks that belong to the same partition block).
+ * - Some pointers can be NULL (for example, for blocks outside visible area).
+ */
+ MB_MODE_INFO **mi_grid_base;
+ /*!
+ * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+ */
+ int mi_grid_size;
+ /*!
+ * Stride for 'mi_grid_base' (and 'tx_type_map' also).
+ */
+ int mi_stride;
+
+ /*!
+ * An array of tx types for each 4x4 block in the frame.
+ * Number of allocated elements is same as 'mi_grid_size', and stride is
+ * same as 'mi_stride'. So, indexing into 'tx_type_map' is same as that of
+ * 'mi_grid_base'.
+ */
+ TX_TYPE *tx_type_map;
+
+ /**
+ * \name Function pointers to allow separate logic for encoder and decoder.
+ */
+ /**@{*/
+ /*!
+ * Free the memory allocated to arrays in 'mi_params'.
+ * \param[in,out] mi_params object containing common mode info parameters
+ */
+ void (*free_mi)(struct CommonModeInfoParams *mi_params);
+ /*!
+ * Initialize / reset appropriate arrays in 'mi_params'.
+ * \param[in,out] mi_params object containing common mode info parameters
+ */
+ void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+ /*!
+ * Allocate required memory for arrays in 'mi_params'.
+ * \param[in,out] mi_params object containing common mode info
+ * parameters
+ * \param width frame width
+ * \param height frame height
+ * \param min_partition_size minimum partition size allowed while
+ * encoding
+ */
+ void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
+ int height, BLOCK_SIZE min_partition_size);
+ /**@}*/
+};
+
+typedef struct CommonQuantParams CommonQuantParams;
+/*!
+ * \brief Parameters related to quantization at the frame level.
+ */
+struct CommonQuantParams {
+ /*!
+ * Base qindex of the frame in the range 0 to 255.
+ */
+ int base_qindex;
+
+ /*!
+ * Delta of qindex (from base_qindex) for Y plane DC coefficient.
+ * Note: y_ac_delta_q is implicitly 0.
+ */
+ int y_dc_delta_q;
+
+ /*!
+ * Delta of qindex (from base_qindex) for U plane DC coefficients.
+ */
+ int u_dc_delta_q;
+ /*!
+ * Delta of qindex (from base_qindex) for V plane DC coefficients.
+ * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
+ */
+ int v_dc_delta_q;
+
+ /*!
+ * Delta of qindex (from base_qindex) for U plane AC coefficients.
+ */
+ int u_ac_delta_q;
+ /*!
+ * Delta of qindex (from base_qindex) for V plane AC coefficients.
+ * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
+ */
+ int v_ac_delta_q;
+
+ /*
+ * Note: The qindex per superblock may have a delta from the qindex obtained
+ * at frame level from parameters above, based on 'cm->delta_q_info'.
+ */
+
+ /**
+ * \name True dequantizers.
+ * The dequantizers below are true dequantizers used only in the
+ * dequantization process. They have the same coefficient
+ * shift/scale as TX.
+ */
+ /**@{*/
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */
+ int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */
+ int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */
+ /**@}*/
+
+ /**
+ * \name Global quantization matrix tables.
+ */
+ /**@{*/
+ /*!
+ * Global dequantization matrix table.
+ */
+ const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+ /*!
+ * Global quantization matrix table.
+ */
+ const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+ /**@}*/
+
+ /**
+ * \name Local dequantization matrix tables for each frame.
+ */
+ /**@{*/
+ /*!
+ * Local dequant matrix for Y plane.
+ */
+ const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ /*!
+ * Local dequant matrix for U plane.
+ */
+ const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ /*!
+ * Local dequant matrix for V plane.
+ */
+ const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ /**@}*/
+
+ /*!
+ * Flag indicating whether quantization matrices are being used:
+ * - If true, qmatrix_level_y, qmatrix_level_u and qmatrix_level_v indicate
+ * the level indices used to access the global quant matrix tables.
+ * - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+ */
+ bool using_qmatrix;
+ /**
+ * \name Valid only when using_qmatrix == true
+ * Indicate the level indices to be used to access appropriate global quant
+ * matrix tables.
+ */
+ /**@{*/
+ int qmatrix_level_y; /*!< Level index for Y plane */
+ int qmatrix_level_u; /*!< Level index for U plane */
+ int qmatrix_level_v; /*!< Level index for V plane */
+ /**@}*/
+};
+
+typedef struct CommonContexts CommonContexts;
+/*!
+ * \brief Contexts used for transmitting various symbols in the bitstream.
+ */
+struct CommonContexts {
+ /*!
+ * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+ * partition[i][j] is the context for ith tile row, jth mi_col.
+ */
+ PARTITION_CONTEXT **partition;
+
+ /*!
+ * Context used to derive context for multiple symbols:
+ * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+ * skip_txfm flag.
+ * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+ * sign.
+ * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+ */
+ ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
+
+ /*!
+ * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+ * transmit 'is_split' flag to indicate if this transform block should be
+ * split into smaller sub-blocks.
+ * txfm[i][j] is the context for ith tile row, jth mi_col.
+ */
+ TXFM_CONTEXT **txfm;
+
+ /*!
+ * Dimensions that were used to allocate the arrays above.
+ * If these dimensions change, the arrays may have to be re-allocated.
+ */
+ int num_planes; /*!< Corresponds to av1_num_planes(cm) */
+ int num_tile_rows; /*!< Corresponds to cm->tiles.rows */
+ int num_mi_cols; /*!< Corresponds to cm->mi_params.mi_cols */
+};
+
+/*!
+ * \brief Top level common structure used by both encoder and decoder.
+ */
+typedef struct AV1Common {
+ /*!
+ * Information about the current frame that is being coded.
+ */
+ CurrentFrame current_frame;
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info *error;
+
+ /*!
+ * AV1 allows two types of frame scaling operations:
+ * 1. Frame super-resolution: that allows coding a frame at lower resolution
+ * and after decoding the frame, normatively scales and restores the frame --
+ * inside the coding loop.
+ * 2. Frame resize: that allows coding frame at lower/higher resolution, and
+ * then non-normatively upscale the frame at the time of rendering -- outside
+ * the coding loop.
+ * Hence, the need for 3 types of dimensions.
+ */
+
+ /**
+ * \name Coded frame dimensions.
+ */
+ /**@{*/
+ int width; /*!< Coded frame width */
+ int height; /*!< Coded frame height */
+ /**@}*/
+
+ /**
+ * \name Rendered frame dimensions.
+ * Dimensions after applying both super-resolution and resize to the coded
+ * frame. Different from coded dimensions if super-resolution and/or resize
+ * are being used for this frame.
+ */
+ /**@{*/
+ int render_width; /*!< Rendered frame width */
+ int render_height; /*!< Rendered frame height */
+ /**@}*/
+
+ /**
+ * \name Super-resolved frame dimensions.
+ * Frame dimensions after applying super-resolution to the coded frame (if
+ * present), but before applying resize.
+ * Larger than the coded dimensions if super-resolution is being used for
+ * this frame.
+ * Different from rendered dimensions if resize is being used for this frame.
+ */
+ /**@{*/
+ int superres_upscaled_width; /*!< Super-resolved frame width */
+ int superres_upscaled_height; /*!< Super-resolved frame height */
+ /**@}*/
+
+ /*!
+ * The denominator of the superres scale used by this frame.
+ * Note: The numerator is fixed to be SCALE_NUMERATOR.
+ */
+ uint8_t superres_scale_denominator;
+
+ /*!
+ * buffer_removal_times[op_num] specifies the frame removal time in units of
+ * DecCT clock ticks counted from the removal time of the last random access
+ * point for operating point op_num.
+ * TODO(urvang): We probably don't need the +1 here.
+ */
+ uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
+ /*!
+ * Presentation time of the frame in clock ticks DispCT counted from the
+ * removal time of the last random access point for the operating point that
+ * is being decoded.
+ */
+ uint32_t frame_presentation_time;
+
+ /*!
+ * Buffer where previous frame is stored.
+ */
+ RefCntBuffer *prev_frame;
+
+ /*!
+ * Buffer into which the current frame will be stored and other related info.
+ * TODO(hkuang): Combine this with cur_buf in macroblockd.
+ */
+ RefCntBuffer *cur_frame;
+
+ /*!
+ * For encoder, we have a two-level mapping from reference frame type to the
+ * corresponding buffer in the buffer pool:
+ * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+ * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+ * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+ * the reference counted buffer structure RefCntBuffer, taken from the buffer
+ * pool cm->buffer_pool->frame_bufs.
+ *
+ * LAST_FRAME, ..., EXTREF_FRAME
+ * | |
+ * v v
+ * remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
+ * | |
+ * v v
+ * ref_frame_map[], ..., ref_frame_map[]
+ *
+ * Note: INTRA_FRAME always refers to the current frame, so there's no need to
+ * have a remapped index for the same.
+ */
+ int remapped_ref_idx[REF_FRAMES];
+
+ /*!
+ * Scale of the current frame with respect to itself.
+ * This is currently used for intra block copy, which behaves like an inter
+ * prediction mode, where the reference frame is the current frame itself.
+ */
+ struct scale_factors sf_identity;
+
+ /*!
+ * Scale factors of the reference frame with respect to the current frame.
+ * This is required for generating inter prediction and will be non-identity
+ * for a reference frame, if it has different dimensions than the coded
+ * dimensions of the current frame.
+ */
+ struct scale_factors ref_scale_factors[REF_FRAMES];
+
+ /*!
+ * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+ * the buffer in the buffer pool 'cm->buffer_pool->frame_bufs'.
+ * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
+ * remapped reference index 'j' (that is, original reference type 'i') to
+ * a pointer to the buffer in the buffer pool 'cm->buffer_pool->frame_bufs'.
+ */
+ RefCntBuffer *ref_frame_map[REF_FRAMES];
+
+ /*!
+ * If true, this frame is actually shown after decoding.
+ * If false, this frame is coded in the bitstream, but not shown. It is only
+ * used as a reference for other frames coded later.
+ */
+ int show_frame;
+
+ /*!
+ * If true, this frame can be used as a show-existing frame for other frames
+ * coded later.
+ * When 'show_frame' is true, this is always true for all non-keyframes.
+ * When 'show_frame' is false, this value is transmitted in the bitstream.
+ */
+ int showable_frame;
+
+ /*!
+ * If true, show an existing frame coded before, instead of actually coding a
+ * frame. The existing frame comes from one of the existing reference buffers,
+ * as signaled in the bitstream.
+ */
+ int show_existing_frame;
+
+ /*!
+ * Whether some features are allowed or not.
+ */
+ FeatureFlags features;
+
+ /*!
+ * Params related to MB_MODE_INFO arrays and related info.
+ */
+ CommonModeInfoParams mi_params;
+
+#if CONFIG_ENTROPY_STATS
+ /*!
+ * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1).
+ */
+ int coef_cdf_category;
+#endif // CONFIG_ENTROPY_STATS
+
+ /*!
+ * Quantization params.
+ */
+ CommonQuantParams quant_params;
+
+ /*!
+ * Segmentation info for current frame.
+ */
+ struct segmentation seg;
+
+ /*!
+ * Segmentation map for previous frame.
+ */
+ uint8_t *last_frame_seg_map;
+
+ /**
+ * \name Deblocking filter parameters.
+ */
+ /**@{*/
+ loop_filter_info_n lf_info; /*!< Loop filter info */
+ struct loopfilter lf; /*!< Loop filter parameters */
+ /**@}*/
+
+ /**
+ * \name Loop Restoration filter parameters.
+ */
+ /**@{*/
+ RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */
+ int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */
+ RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */
+ YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */
+ /**@}*/
+
+ /*!
+ * CDEF (Constrained Directional Enhancement Filter) parameters.
+ */
+ CdefInfo cdef_info;
+
+ /*!
+ * Parameters for film grain synthesis.
+ */
+ aom_film_grain_t film_grain_params;
+
+ /*!
+ * Parameters for delta quantization and delta loop filter level.
+ */
+ DeltaQInfo delta_q_info;
+
+ /*!
+ * Global motion parameters for each reference frame.
+ */
+ WarpedMotionParams global_motion[REF_FRAMES];
+
+ /*!
+ * Elements part of the sequence header, that are applicable for all the
+ * frames in the video.
+ */
+ SequenceHeader *seq_params;
+
+ /*!
+ * Current CDFs of all the symbols for the current frame.
+ */
+ FRAME_CONTEXT *fc;
+ /*!
+ * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+ * (e.g. for a keyframe). These default CDFs are defined by the bitstream and
+ * copied from default CDF tables for each symbol.
+ */
+ FRAME_CONTEXT *default_frame_context;
+
+ /*!
+ * Parameters related to tiling.
+ */
+ CommonTileParams tiles;
+
+ /*!
+ * External BufferPool passed from outside.
+ */
+ BufferPool *buffer_pool;
+
+ /*!
+ * Above context buffers and their sizes.
+ * Note: above contexts are allocated in this struct, as their size is
+ * dependent on frame width, while left contexts are declared and allocated in
+ * MACROBLOCKD struct, as they have a fixed size.
+ */
+ CommonContexts above_contexts;
+
+ /**
+ * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1
+ */
+ /**@{*/
+ int current_frame_id; /*!< frame ID for the current frame. */
+ int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */
+ /**@}*/
+
+ /*!
+ * Motion vectors provided by motion field estimation.
+ * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+ * mi_row = 2 * row,
+ * mi_col = 2 * col, and
+ * stride = cm->mi_params.mi_stride / 2
+ */
+ TPL_MV_REF *tpl_mvs;
+ /*!
+ * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+ */
+ int tpl_mvs_mem_size;
+ /*!
+ * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and
+ * current frame is positive; and 0 otherwise.
+ */
+ int ref_frame_sign_bias[REF_FRAMES];
+ /*!
+ * ref_frame_side[k] is 1 if relative distance between reference 'k' and
+ * current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+ * TODO(jingning): This can be combined with sign_bias later.
+ */
+ int8_t ref_frame_side[REF_FRAMES];
+
+ /*!
+ * Temporal layer ID of this frame
+ * (in the range 0 ... (number_temporal_layers - 1)).
+ */
+ int temporal_layer_id;
+
+ /*!
+ * Spatial layer ID of this frame
+ * (in the range 0 ... (number_spatial_layers - 1)).
+ */
+ int spatial_layer_id;
+
+#if TXCOEFF_TIMER
+ int64_t cum_txcoeff_timer;
+ int64_t txcoeff_timer;
+ int txb_count;
+#endif // TXCOEFF_TIMER
+
+#if TXCOEFF_COST_TIMER
+ int64_t cum_txcoeff_cost_timer;
+ int64_t txcoeff_cost_timer;
+ int64_t txcoeff_cost_count;
+#endif // TXCOEFF_COST_TIMER
+} AV1_COMMON;
+
+/*!\cond */
+
+// Acquires 'pool->pool_mutex' when built with CONFIG_MULTITHREAD; in other
+// builds the function does nothing.
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if !CONFIG_MULTITHREAD
+  (void)pool;  // No mutex to take in single-threaded builds.
+#else
+  pthread_mutex_lock(&pool->pool_mutex);
+#endif
+}
+
+// Releases 'pool->pool_mutex' when built with CONFIG_MULTITHREAD; in other
+// builds the function does nothing.
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if !CONFIG_MULTITHREAD
+  (void)pool;  // No mutex to release in single-threaded builds.
+#else
+  pthread_mutex_unlock(&pool->pool_mutex);
+#endif
+}
+
+// Returns the YV12 buffer stored in slot 'index' of cm->ref_frame_map, or NULL
+// when 'index' lies outside [0, REF_FRAMES) or the slot holds no buffer.
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+  const int in_range = index >= 0 && index < REF_FRAMES;
+  // Short-circuit keeps the array access behind the range check.
+  if (!in_range || cm->ref_frame_map[index] == NULL) return NULL;
+  return &cm->ref_frame_map[index]->buf;
+}
+
+// Scans cm->buffer_pool for the first frame buffer whose ref_count is zero,
+// claims it by setting ref_count to 1 and returns its index. The scan and the
+// claim both happen between lock_buffer_pool() and unlock_buffer_pool().
+// Returns INVALID_IDX (after asserting) when no buffer is free.
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  int idx = 0;
+
+  lock_buffer_pool(pool);
+  const int num_frame_bufs = pool->num_frame_bufs;
+  while (idx < num_frame_bufs && frame_bufs[idx].ref_count != 0) ++idx;
+
+  if (idx == num_frame_bufs) {
+    // We should never run out of free buffers. If this assertion fails, there
+    // is a reference leak.
+    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
+    // Report "no free buffer" to the caller.
+    idx = INVALID_IDX;
+  } else {
+    YV12_BUFFER_CONFIG *const ybf = &frame_bufs[idx].buf;
+    if (ybf->use_external_reference_buffers) {
+      // The plane pointers currently refer to external reference buffers.
+      // Point them back at the internally allocated memory saved in
+      // store_buf_adr[].
+      ybf->y_buffer = ybf->store_buf_adr[0];
+      ybf->u_buffer = ybf->store_buf_adr[1];
+      ybf->v_buffer = ybf->store_buf_adr[2];
+      ybf->use_external_reference_buffers = 0;
+    }
+
+    frame_bufs[idx].ref_count = 1;
+  }
+
+  unlock_buffer_pool(pool);
+  return idx;
+}
+
+// Drops cm->cur_frame's reference (if any), then obtains a free buffer from
+// the pool via get_free_fb() and installs it as cm->cur_frame. Returns the new
+// current frame, or NULL when no free buffer is available (cm->cur_frame is
+// left NULL in that case).
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+  // Give up the reference held on the previous current frame.
+  if (cm->cur_frame != NULL) {
+    --cm->cur_frame->ref_count;
+    cm->cur_frame = NULL;
+  }
+
+  const int new_fb_idx = get_free_fb(cm);
+  if (new_fb_idx == INVALID_IDX) return NULL;
+
+  RefCntBuffer *const new_buf = &cm->buffer_pool->frame_bufs[new_fb_idx];
+  cm->cur_frame = new_buf;
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+  aom_invalidate_pyramid(new_buf->buf.y_pyramid);
+  av1_invalidate_corner_list(new_buf->buf.corners);
+#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
+  av1_zero(new_buf->interp_filter_selected);
+  return new_buf;
+}
+
+// Makes '*lhs_ptr' refer to the buffer at 'rhs_ptr' and keeps the reference
+// counts in step: the buffer previously referenced (if any) loses one
+// reference and the buffer at 'rhs_ptr' gains one. 'rhs_ptr' is dereferenced
+// unconditionally.
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+                                         RefCntBuffer *rhs_ptr) {
+  RefCntBuffer *const prev = *lhs_ptr;
+  if (prev != NULL) {
+    assert(prev->ref_count > 0);
+    prev->ref_count -= 1;
+  }
+
+  *lhs_ptr = rhs_ptr;
+  rhs_ptr->ref_count += 1;
+}
+
+// Returns 1 when the current frame's type is KEY_FRAME or INTRA_ONLY_FRAME,
+// and 0 otherwise.
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+  if (cm->current_frame.frame_type == KEY_FRAME) return 1;
+  return cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+}
+
+// Returns 1 when the current frame's type is S_FRAME, and 0 otherwise.
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+  const int is_s_frame = (cm->current_frame.frame_type == S_FRAME);
+  return is_s_frame;
+}
+
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive. Note that this is different to the indexing
+// previously used by the frame_refs[] array.
+//
+// Returns cm->remapped_ref_idx[] for the given label, or INVALID_IDX when the
+// label is outside [LAST_FRAME, EXTREF_FRAME].
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+                                        const MV_REFERENCE_FRAME ref_frame) {
+  if (ref_frame < LAST_FRAME || ref_frame > EXTREF_FRAME) return INVALID_IDX;
+  return cm->remapped_ref_idx[ref_frame - LAST_FRAME];
+}
+
+// Returns the entry of cm->ref_frame_map selected by 'ref_frame', or NULL when
+// get_ref_frame_map_idx() reports INVALID_IDX for that label.
+static INLINE RefCntBuffer *get_ref_frame_buf(
+    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int slot = get_ref_frame_map_idx(cm, ref_frame);
+  if (slot == INVALID_IDX) return NULL;
+  return cm->ref_frame_map[slot];
+}
+
+// Both const and non-const versions of this function are provided so that it
+// can be used with a const AV1_COMMON if needed.
+//
+// Returns the address of the cm->ref_scale_factors[] entry selected by
+// 'ref_frame', or NULL when the label maps to INVALID_IDX.
+static INLINE const struct scale_factors *get_ref_scale_factors_const(
+    const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int slot = get_ref_frame_map_idx(cm, ref_frame);
+  if (slot == INVALID_IDX) return NULL;
+  return &cm->ref_scale_factors[slot];
+}
+
+// Non-const counterpart of get_ref_scale_factors_const(): returns the address
+// of the cm->ref_scale_factors[] entry selected by 'ref_frame', or NULL when
+// the label maps to INVALID_IDX.
+static INLINE struct scale_factors *get_ref_scale_factors(
+    AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+  const int slot = get_ref_frame_map_idx(cm, ref_frame);
+  if (slot == INVALID_IDX) return NULL;
+  return &cm->ref_scale_factors[slot];
+}
+
+// Returns the reference buffer named by cm->features.primary_ref_frame, or
+// NULL when that field is PRIMARY_REF_NONE or the lookup yields INVALID_IDX.
+static INLINE RefCntBuffer *get_primary_ref_frame_buf(
+    const AV1_COMMON *const cm) {
+  const int primary = cm->features.primary_ref_frame;
+  if (primary == PRIMARY_REF_NONE) return NULL;
+
+  // The lookup is done with 'primary + 1' as the reference frame label.
+  const int slot = get_ref_frame_map_idx(cm, primary + 1);
+  if (slot == INVALID_IDX) return NULL;
+  return cm->ref_frame_map[slot];
+}
+
+// Returns 1 if this frame might allow mvs from some reference frame.
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
+ return !cm->features.error_resilient_mode &&
+ cm->seq_params->order_hint_info.enable_ref_frame_mvs &&
+ cm->seq_params->order_hint_info.enable_order_hint &&
+ !frame_is_intra_only(cm);
+}
+
+// Returns 1 if this frame might use warped_motion
+static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
+ return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
+ cm->seq_params->enable_warped_motion;
+}
+
+// Makes sure 'buf' owns motion-vector and segmentation-map storage matching
+// the current frame dimensions in cm->mi_params, and that cm->tpl_mvs is large
+// enough for the current frame.
+// NOTE(review): CHECK_MEM_ERROR is defined elsewhere; presumably it stores the
+// allocation result and reports failure through 'cm' -- confirm.
+static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
+ const int buf_rows = buf->mi_rows;
+ const int buf_cols = buf->mi_cols;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // (Re)allocate when nothing is allocated yet or the dimensions recorded in
+ // 'buf' differ from the current ones.
+ if (buf->mvs == NULL || buf_rows != mi_params->mi_rows ||
+ buf_cols != mi_params->mi_cols) {
+ aom_free(buf->mvs);
+ buf->mi_rows = mi_params->mi_rows;
+ buf->mi_cols = mi_params->mi_cols;
+ // One MV_REF per 2x2 group of mi units (each dimension is halved, rounding
+ // up).
+ CHECK_MEM_ERROR(cm, buf->mvs,
+ (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) *
+ ((mi_params->mi_cols + 1) >> 1),
+ sizeof(*buf->mvs)));
+ aom_free(buf->seg_map);
+ // One seg_map entry per mi unit.
+ CHECK_MEM_ERROR(
+ cm, buf->seg_map,
+ (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols,
+ sizeof(*buf->seg_map)));
+ }
+
+ // Number of TPL_MV_REF entries required for the current frame.
+ const int mem_size =
+ ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1);
+
+ // Only reallocate when the existing array is missing or too small; a larger
+ // existing allocation is kept.
+ if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) {
+ aom_free(cm->tpl_mvs);
+ CHECK_MEM_ERROR(cm, cm->tpl_mvs,
+ (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
+ cm->tpl_mvs_mem_size = mem_size;
+ }
+}
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
+
+// Returns 1 when the sequence header marks the stream as monochrome, and
+// MAX_MB_PLANE otherwise.
+static INLINE int av1_num_planes(const AV1_COMMON *cm) {
+  if (cm->seq_params->monochrome) return 1;
+  return MAX_MB_PLANE;
+}
+
+// Points xd's above entropy (one per plane), partition and txfm context
+// pointers at the buffers that 'above_contexts' holds for 'tile_row'.
+static INLINE void av1_init_above_context(CommonContexts *above_contexts,
+                                          int num_planes, int tile_row,
+                                          MACROBLOCKD *xd) {
+  int plane = 0;
+  while (plane < num_planes) {
+    xd->above_entropy_context[plane] = above_contexts->entropy[plane][tile_row];
+    ++plane;
+  }
+  xd->above_partition_context = above_contexts->partition[tile_row];
+  xd->above_txfm_context = above_contexts->txfm[tile_row];
+}
+
+// Initializes 'xd' from 'cm': copies the per-plane dequantizer and inverse
+// quant matrix tables from cm->quant_params, then sets mi_stride and
+// error_info and runs cfl_init() with the sequence parameters.
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) {
+  const CommonQuantParams *const qp = &cm->quant_params;
+  const int num_planes = av1_num_planes(cm);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    if (pd->plane_type == PLANE_TYPE_Y) {
+      // Luma-type plane: use the Y tables.
+      memcpy(pd->seg_dequant_QTX, qp->y_dequant_QTX,
+             sizeof(qp->y_dequant_QTX));
+      memcpy(pd->seg_iqmatrix, qp->y_iqmatrix, sizeof(qp->y_iqmatrix));
+    } else if (plane == AOM_PLANE_U) {
+      memcpy(pd->seg_dequant_QTX, qp->u_dequant_QTX,
+             sizeof(qp->u_dequant_QTX));
+      memcpy(pd->seg_iqmatrix, qp->u_iqmatrix, sizeof(qp->u_iqmatrix));
+    } else {
+      // Any remaining non-luma plane uses the V tables.
+      memcpy(pd->seg_dequant_QTX, qp->v_dequant_QTX,
+             sizeof(qp->v_dequant_QTX));
+      memcpy(pd->seg_iqmatrix, qp->v_iqmatrix, sizeof(qp->v_iqmatrix));
+    }
+  }
+
+  xd->mi_stride = cm->mi_params.mi_stride;
+  xd->error_info = cm->error;
+  cfl_init(&xd->cfl, cm->seq_params);
+}
+
+// For each of the first 'num_planes' planes, points the plane's above/left
+// entropy context pointers at the entries of xd->above_entropy_context and
+// xd->left_entropy_context that correspond to (mi_row, mi_col).
+static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ const int num_planes) {
+ int i;
+ // Note: these offsets are declared outside the loop, so an adjustment made
+ // for one plane carries over to the planes processed after it.
+ int row_offset = mi_row;
+ int col_offset = mi_col;
+ for (i = 0; i < num_planes; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ // Offset the buffer pointer
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ // For a subsampled plane, a block that is one mi unit tall (wide) at an odd
+ // row (column) is moved back by one mi unit.
+ if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+ row_offset = mi_row - 1;
+ if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+ col_offset = mi_col - 1;
+ int above_idx = col_offset;
+ // The left context index is taken modulo the superblock size (MAX_MIB_MASK).
+ int left_idx = row_offset & MAX_MIB_MASK;
+ pd->above_entropy_context =
+ &xd->above_entropy_context[i][above_idx >> pd->subsampling_x];
+ pd->left_entropy_context =
+ &xd->left_entropy_context[i][left_idx >> pd->subsampling_y];
+ }
+}
+
+// Rounds 'len' (in mi units) up to a multiple of 1 << MAX_MIB_SIZE_LOG2, i.e.
+// aligns it to a whole number of superblocks.
+static INLINE int calc_mi_size(int len) {
+  const int aligned_len = ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
+  return aligned_len;
+}
+
+// Sets width/height of each of the first 'num_planes' planes from a block of
+// 'bw' x 'bh' mi units, scaled down by the plane's subsampling and clamped to
+// a minimum of 4 in each dimension.
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
+                                const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+
+    pd->width = (bw * MI_SIZE) >> pd->subsampling_x;
+    pd->height = (bh * MI_SIZE) >> pd->subsampling_y;
+
+    // Enforce the 4-sample minimum on the stored values.
+    pd->width = AOMMAX(pd->width, 4);
+    pd->height = AOMMAX(pd->height, 4);
+  }
+}
+
+// Records the position and size of the current block in 'xd' and derives the
+// values that depend on them: distances to the frame edges, availability of
+// the above/left neighbours (luma and chroma), neighbour mode-info pointers,
+// whether the block is a chroma reference, and the rectangular-partition
+// position flags.
+// mi_row, mi_col: block position in mi units. bh, bw: block size in mi units.
+// mi_rows, mi_cols: frame size in mi units. 'tile' supplies the tile's
+// starting row/column. xd->mi and xd->mi_stride must already be set.
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh, int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ // Distances from the block to each frame edge, converted with
+ // GET_MV_SUBPEL. Top/left are stored negated.
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE));
+ xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE);
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+
+ // Are edges available for intra prediction?
+ xd->up_available = (mi_row > tile->mi_row_start);
+
+ // Chroma subsampling is read from plane 1.
+ const int ss_x = xd->plane[1].subsampling_x;
+ const int ss_y = xd->plane[1].subsampling_y;
+
+ xd->left_available = (mi_col > tile->mi_col_start);
+ xd->chroma_up_available = xd->up_available;
+ xd->chroma_left_available = xd->left_available;
+ // For a subsampled dimension where the block is narrower (shorter) than an
+ // 8x8 block, the chroma neighbour test is done one mi unit further back.
+ if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+ xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+ if (ss_y && bh < mi_size_high[BLOCK_8X8])
+ xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+ if (xd->up_available) {
+ xd->above_mbmi = xd->mi[-xd->mi_stride];
+ } else {
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mbmi = xd->mi[-1];
+ } else {
+ xd->left_mbmi = NULL;
+ }
+
+ // Same condition as is_chroma_reference(), expressed with bh/bw directly.
+ const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+ xd->is_chroma_ref = chroma_ref;
+ // Note: chroma_above_mbmi / chroma_left_mbmi are only written when the block
+ // is a chroma reference.
+ if (chroma_ref) {
+ // To help calculate the "above" and "left" chroma blocks, note that the
+ // current block may cover multiple luma blocks (e.g., if partitioned into
+ // 4x4 luma blocks).
+ // First, find the top-left-most luma block covered by this chroma block
+ MB_MODE_INFO **base_mi =
+ &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+ // Then, we consider the luma region covered by the left or above 4x4 chroma
+ // prediction. We want to point to the chroma reference block in that
+ // region, which is the bottom-right-most mi unit.
+ // This leads to the following offsets:
+ MB_MODE_INFO *chroma_above_mi =
+ xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+ xd->chroma_above_mbmi = chroma_above_mi;
+
+ MB_MODE_INFO *chroma_left_mi =
+ xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+ xd->chroma_left_mbmi = chroma_left_mi;
+ }
+
+ xd->height = bh;
+ xd->width = bw;
+
+ // Set for a tall block whose right edge (mi_col + width) is a multiple of
+ // its height.
+ xd->is_last_vertical_rect = 0;
+ if (xd->width < xd->height) {
+ if (!((mi_col + xd->width) & (xd->height - 1))) {
+ xd->is_last_vertical_rect = 1;
+ }
+ }
+
+ // Set for a wide block whose row is a multiple of its width.
+ xd->is_first_horizontal_rect = 0;
+ if (xd->width > xd->height)
+ if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1;
+}
+
+// Selects the key-frame luma mode CDF in 'tile_ctx' using contexts derived
+// from the prediction modes of the above and left blocks.
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+                                           const MB_MODE_INFO *above_mi,
+                                           const MB_MODE_INFO *left_mi) {
+  const PREDICTION_MODE above_mode = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE left_mode = av1_left_block_mode(left_mi);
+  return tile_ctx
+      ->kf_y_cdf[intra_mode_context[above_mode]][intra_mode_context[left_mode]];
+}
+
+// Fills the above and left partition contexts covered by a block of size
+// 'bsize' at (mi_row, mi_col) with the context values that
+// partition_context_lookup[] gives for 'subsize'.
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  // Above context is indexed by column.
+  memset(xd->above_partition_context + mi_col,
+         partition_context_lookup[subsize].above, mi_wide);
+  // Left context is indexed by the row within the superblock (MAX_MIB_MASK).
+  memset(xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+         partition_context_lookup[subsize].left, mi_high);
+}
+
+// Returns 1 when the block of size 'bsize' at (mi_row, mi_col) is a chroma
+// reference. In each dimension the test passes if the position is odd, the
+// block size in mi units is even, or that dimension is not subsampled.
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                      int subsampling_x, int subsampling_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int row_ok = (mi_row & 0x01) || !(mi_high & 0x01) || !subsampling_y;
+  const int col_ok = (mi_col & 0x01) || !(mi_wide & 0x01) || !subsampling_x;
+  return row_ok && col_ok;
+}
+
+// Returns the difference between the CDF entry preceding 'element' (or
+// CDF_PROB_TOP for element 0) and the entry at 'element'.
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+                                            size_t element) {
+  assert(cdf != NULL);
+  const int prev = (element > 0) ? cdf[element - 1] : CDF_PROB_TOP;
+  return (aom_cdf_prob)(prev - cdf[element]);
+}
+
+// Builds a two-entry CDF in 'out' from the partition CDF 'in' by subtracting
+// from CDF_PROB_TOP the probabilities of the listed partition types
+// (PARTITION_HORZ, PARTITION_SPLIT, PARTITION_HORZ_A, PARTITION_HORZ_B,
+// PARTITION_VERT_A, and PARTITION_HORZ_4 unless 'bsize' is BLOCK_128X128).
+// out[0] and out[1] are stored through AOM_ICDF().
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  // 'bsize' is used below, so no "(void)bsize" marker is needed.
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  // PARTITION_HORZ_4 is left out of the sum for BLOCK_128X128.
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+// Builds a two-entry CDF in 'out' from the partition CDF 'in' by subtracting
+// from CDF_PROB_TOP the probabilities of the listed partition types
+// (PARTITION_VERT, PARTITION_SPLIT, PARTITION_HORZ_A, PARTITION_VERT_A,
+// PARTITION_VERT_B, and PARTITION_VERT_4 unless 'bsize' is BLOCK_128X128).
+// out[0] and out[1] are stored through AOM_ICDF().
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  // 'bsize' is used below, so no "(void)bsize" marker is needed.
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_VERT);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
+  // PARTITION_VERT_4 is left out of the sum for BLOCK_128X128.
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+// Updates the above/left partition contexts for a block of size 'bsize' at
+// (mi_row, mi_col) that was coded with 'partition'. Blocks smaller than
+// BLOCK_8X8 are ignored. PARTITION_SPLIT only updates the context when 'bsize'
+// is BLOCK_8X8. The A/B extended types update the two halves separately.
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+                                                int mi_col, BLOCK_SIZE subsize,
+                                                BLOCK_SIZE bsize,
+                                                PARTITION_TYPE partition) {
+  if (bsize < BLOCK_8X8) return;
+
+  // Half the block width in mi units; also used as the row step.
+  const int half_mi = mi_size_wide[bsize] / 2;
+  const BLOCK_SIZE split_bsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  switch (partition) {
+    case PARTITION_SPLIT:
+      if (bsize != BLOCK_8X8) break;
+      AOM_FALLTHROUGH_INTENDED;
+    case PARTITION_NONE:
+    case PARTITION_HORZ:
+    case PARTITION_VERT:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+      break;
+    case PARTITION_HORZ_A:
+      update_partition_context(xd, mi_row, mi_col, split_bsize, subsize);
+      update_partition_context(xd, mi_row + half_mi, mi_col, subsize, subsize);
+      break;
+    case PARTITION_HORZ_B:
+      update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+      update_partition_context(xd, mi_row + half_mi, mi_col, split_bsize,
+                               subsize);
+      break;
+    case PARTITION_VERT_A:
+      update_partition_context(xd, mi_row, mi_col, split_bsize, subsize);
+      update_partition_context(xd, mi_row, mi_col + half_mi, subsize, subsize);
+      break;
+    case PARTITION_VERT_B:
+      update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+      update_partition_context(xd, mi_row, mi_col + half_mi, split_bsize,
+                               subsize);
+      break;
+    default: assert(0 && "Invalid partition type");
+  }
+}
+
+// Computes the partition context index for the square block 'bsize' at
+// (mi_row, mi_col) from one bit of the above context, one bit of the left
+// context and the block-size level.
+static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+                                          int mi_col, BLOCK_SIZE bsize) {
+  const PARTITION_CONTEXT above_byte = xd->above_partition_context[mi_col];
+  const PARTITION_CONTEXT left_byte =
+      xd->left_partition_context[mi_row & MAX_MIB_MASK];
+  // Minimum partition point is 8x8. Offset the bsl accordingly.
+  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+  const int above = (above_byte >> bsl) & 1;
+  const int left = (left_byte >> bsl) & 1;
+
+  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
+  assert(bsl >= 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+// Return the number of elements in the partition CDF when
+// partitioning the (square) block with luma block size of bsize:
+// PARTITION_TYPES up to BLOCK_8X8, EXT_PARTITION_TYPES - 2 for BLOCK_128X128,
+// and EXT_PARTITION_TYPES for everything else.
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+  if (bsize <= BLOCK_8X8) return PARTITION_TYPES;
+  if (bsize == BLOCK_128X128) return EXT_PARTITION_TYPES - 2;
+  return EXT_PARTITION_TYPES;
+}
+
+// Returns the width of block 'bsize' in units of 1 << MI_SIZE_LOG2, reduced
+// by the part that lies beyond the right frame edge (when
+// xd->mb_to_right_edge is negative), taking the subsampling of 'plane' into
+// account.
+static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                 int plane) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  int max_wide = block_size_wide[bsize];
+
+  if (xd->mb_to_right_edge < 0) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    max_wide += xd->mb_to_right_edge >> (3 + ss_x);
+  }
+
+  // Scale the width in the transform block unit.
+  return max_wide >> MI_SIZE_LOG2;
+}
+
+// Returns the height of block 'bsize' in units of 1 << MI_SIZE_LOG2, reduced
+// by the part that lies beyond the bottom frame edge (when
+// xd->mb_to_bottom_edge is negative), taking the subsampling of 'plane' into
+// account.
+static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                 int plane) {
+  // Guard the table lookup, matching max_block_wide().
+  assert(bsize < BLOCK_SIZES_ALL);
+  int max_blocks_high = block_size_high[bsize];
+
+  if (xd->mb_to_bottom_edge < 0) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+  }
+
+  // Scale the height in the transform block unit.
+  return max_blocks_high >> MI_SIZE_LOG2;
+}
+
+// Resets the above contexts of tile row 'tile_row' for the column range
+// [mi_col_start, mi_col_end), with the width rounded up to a multiple of the
+// superblock size. Entropy and partition contexts are zeroed; txfm contexts
+// are filled with tx_size_wide[TX_SIZES_LARGEST]. Raises
+// AOM_CODEC_CORRUPT_FRAME through xd->error_info if a chroma context buffer
+// is missing while more than one plane is in use.
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+ const MACROBLOCKD *xd,
+ int mi_col_start, int mi_col_end,
+ const int tile_row) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ const int width = mi_col_end - mi_col_start;
+ const int aligned_width =
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+ const int offset_y = mi_col_start;
+ const int width_y = aligned_width;
+ // Chroma offset/width are the luma ones scaled by the horizontal subsampling.
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
+ CommonContexts *const above_contexts = &cm->above_contexts;
+
+ av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
+ if (num_planes > 1) {
+ // Both chroma buffers must exist before they are cleared.
+ if (above_contexts->entropy[1][tile_row] &&
+ above_contexts->entropy[2][tile_row]) {
+ av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
+ width_uv);
+ av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
+ width_uv);
+ } else {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of planes");
+ }
+ }
+
+ av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
+ aligned_width);
+
+ // The txfm context is not zeroed but set to the largest transform width.
+ memset(above_contexts->txfm[tile_row] + mi_col_start,
+ tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+// Resets xd's left contexts: entropy and partition contexts are zeroed, and
+// every byte of the left txfm context buffer is set to
+// tx_size_high[TX_SIZES_LARGEST].
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+  const int largest_tx_high = tx_size_high[TX_SIZES_LARGEST];
+
+  av1_zero(xd->left_entropy_context);
+  av1_zero(xd->left_partition_context);
+  memset(xd->left_txfm_context_buffer, largest_tx_high,
+         sizeof(xd->left_txfm_context_buffer));
+}
+
+// Writes 'txs' into the first 'len' entries of 'txfm_ctx'. Nothing is written
+// when 'len' is zero or negative.
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+  int remaining = len;
+  while (remaining-- > 0) *txfm_ctx++ = txs;
+}
+
+// Fills 'n4_w' above and 'n4_h' left txfm context entries reachable from 'xd'.
+// The value written is the transform width/height of 'tx_size', or, when
+// 'skip' is set, the block dimensions n4_w * MI_SIZE and n4_h * MI_SIZE.
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+                                 const MACROBLOCKD *xd) {
+  const uint8_t above_val =
+      skip ? (uint8_t)(n4_w * MI_SIZE) : (uint8_t)tx_size_wide[tx_size];
+  const uint8_t left_val =
+      skip ? (uint8_t)(n4_h * MI_SIZE) : (uint8_t)tx_size_high[tx_size];
+
+  set_txfm_ctx(xd->above_txfm_context, above_val, n4_w);
+  set_txfm_ctx(xd->left_txfm_context, left_val, n4_h);
+}
+
+// Returns the linear index of (mi_row, mi_col) in a grid whose row pitch is
+// mi_params->mi_stride.
+static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params,
+                                  int mi_row, int mi_col) {
+  const int row_start = mi_row * mi_params->mi_stride;
+  return row_start + mi_col;
+}
+
+// Returns the linear index into the mode-info allocation for (mi_row, mi_col).
+// Both coordinates are first divided by the width, in mi units, of
+// mi_params->mi_alloc_bsize; the row pitch is mi_params->mi_alloc_stride.
+static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params,
+                                   int mi_row, int mi_col) {
+  const int unit = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int alloc_row = mi_row / unit;
+  const int alloc_col = mi_col / unit;
+  return alloc_row * mi_params->mi_alloc_stride + alloc_col;
+}
+
+// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi.
+// Also points xd->tx_type_map at the matching offset of
+// mi_params->tx_type_map and sets its stride to mi_params->mi_stride.
+static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params,
+                                  MACROBLOCKD *const xd, int mi_row,
+                                  int mi_col) {
+  const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+  const int alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+
+  // The grid entry for this position refers to its slot in 'mi_alloc'.
+  mi_params->mi_grid_base[grid_idx] = &mi_params->mi_alloc[alloc_idx];
+
+  // 'xd' views the grid and the tx type map starting at this position.
+  xd->mi = &mi_params->mi_grid_base[grid_idx];
+  xd->tx_type_map = &mi_params->tx_type_map[grid_idx];
+  xd->tx_type_map_stride = mi_params->mi_stride;
+}
+
+// Writes the width/height of 'tx_size' into the above/left txfm contexts over
+// the extent (in mi units) of the block size that corresponds to 'txb_size'.
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size, TX_SIZE txb_size) {
+  const BLOCK_SIZE covered_bsize = txsize_to_bsize[txb_size];
+  const int rows = mi_size_high[covered_bsize];
+  const int cols = mi_size_wide[covered_bsize];
+  const uint8_t tx_w = tx_size_wide[tx_size];
+  const uint8_t tx_h = tx_size_high[tx_size];
+
+  int r = 0;
+  while (r < rows) left_ctx[r++] = tx_h;
+  int c = 0;
+  while (c < cols) above_ctx[c++] = tx_w;
+}
+
+// Maps a square dimension to a square transform size: 128 and 64 give
+// TX_64X64, 32/16/8 give the matching size, and every other value gives
+// TX_4X4.
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+  if (tx_dim == 128 || tx_dim == 64) return TX_64X64;
+  if (tx_dim == 32) return TX_32X32;
+  if (tx_dim == 16) return TX_16X16;
+  if (tx_dim == 8) return TX_8X8;
+  return TX_4X4;
+}
+
+// Maps 'width' x 'height' to a transform size. Square inputs go through
+// get_sqr_tx_size(). For rectangular inputs the shorter side selects the size
+// from the 2:1 table when the longer side is exactly twice the shorter one,
+// and from the 4:1 table otherwise. Inputs matching no table entry hit
+// assert(0) and return TX_4X4.
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+  if (width == height) return get_sqr_tx_size(width);
+
+  if (width < height) {
+    // Tall transforms.
+    if (width + width == height) {
+      switch (width) {
+        case 4: return TX_4X8;
+        case 8: return TX_8X16;
+        case 16: return TX_16X32;
+        case 32: return TX_32X64;
+        default: break;
+      }
+    } else {
+      switch (width) {
+        case 4: return TX_4X16;
+        case 8: return TX_8X32;
+        case 16: return TX_16X64;
+        default: break;
+      }
+    }
+  } else {
+    // Wide transforms.
+    if (height + height == width) {
+      switch (height) {
+        case 4: return TX_8X4;
+        case 8: return TX_16X8;
+        case 16: return TX_32X16;
+        case 32: return TX_64X32;
+        default: break;
+      }
+    } else {
+      switch (height) {
+        case 4: return TX_16X4;
+        case 8: return TX_32X8;
+        case 16: return TX_64X16;
+        default: break;
+      }
+    }
+  }
+  assert(0);
+  return TX_4X4;
+}
+
+// Computes the context used for the transform partition decision from the
+// above/left txfm contexts, the block size and the transform size. Returns 0
+// for tx_size <= TX_4X4 (dummy value, not used by others).
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+                                         const TXFM_CONTEXT *const left_ctx,
+                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const uint8_t tx_w = tx_size_wide[tx_size];
+  const uint8_t tx_h = tx_size_high[tx_size];
+  // 1 when the neighbouring context is smaller than this transform.
+  const int above = *above_ctx < tx_w;
+  const int left = *left_ctx < tx_h;
+
+  // dummy return, not used by others.
+  if (tx_size <= TX_4X4) return 0;
+
+  const TX_SIZE max_tx_size =
+      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+  int category = TXFM_PARTITION_CONTEXTS;
+  if (max_tx_size >= TX_8X8) {
+    const int below_max =
+        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8);
+    category = below_max + (TX_SIZES - 1 - max_tx_size) * 2;
+  }
+  assert(category != TXFM_PARTITION_CONTEXTS);
+  return category * 3 + above + left;
+}
+
+// Compute the next partition in the direction of the sb_type stored in the mi
+// array, starting with bsize.
+//
+// Returns PARTITION_INVALID when (mi_row, mi_col) lies outside the frame,
+// PARTITION_NONE when the stored block size equals 'bsize', and otherwise the
+// partition type deduced from the stored block sizes at this position and,
+// for blocks larger than 8x8, at the positions half a block to the right and
+// half a block below.
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
+    return PARTITION_INVALID;
+
+  const int offset = mi_row * mi_params->mi_stride + mi_col;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
+  const BLOCK_SIZE subsize = mi[0]->bsize;
+
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  if (subsize == bsize) return PARTITION_NONE;
+
+  const int bhigh = mi_size_high[bsize];
+  const int bwide = mi_size_wide[bsize];
+  const int sshigh = mi_size_high[subsize];
+  const int sswide = mi_size_wide[subsize];
+
+  // The row bound is checked with the block height and the column bound with
+  // the block width, matching the mi[] offsets read below. (For the square
+  // block sizes this function is used with, both values are equal.)
+  if (bsize > BLOCK_8X8 && mi_row + bhigh / 2 < mi_params->mi_rows &&
+      mi_col + bwide / 2 < mi_params->mi_cols) {
+    // In this case, the block might be using an extended partition
+    // type.
+    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
+
+    if (sswide == bwide) {
+      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+      // half was split.
+      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+      assert(sshigh * 2 == bhigh);
+
+      if (mbmi_below->bsize == subsize)
+        return PARTITION_HORZ;
+      else
+        return PARTITION_HORZ_B;
+    } else if (sshigh == bhigh) {
+      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+      // PARTITION_VERT_B. To distinguish the latter two, check if the right
+      // half was split.
+      if (sswide * 4 == bwide) return PARTITION_VERT_4;
+      // Widths are compared against the block width.
+      assert(sswide * 2 == bwide);
+
+      if (mbmi_right->bsize == subsize)
+        return PARTITION_VERT;
+      else
+        return PARTITION_VERT_B;
+    } else {
+      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+      // dimensions, we immediately know this is a split (which will recurse to
+      // get to subsize). Otherwise look down and to the right. With
+      // PARTITION_VERT_A, the right block will have height bhigh; with
+      // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+      // it's PARTITION_SPLIT.
+      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+      if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A;
+
+      return PARTITION_SPLIT;
+    }
+  }
+
+  // Basic partition types only: bit 1 = narrower than the block, bit 0 =
+  // shorter than the block.
+  const int vert_split = sswide < bwide;
+  const int horz_split = sshigh < bhigh;
+  const int split_idx = (vert_split << 1) | horz_split;
+  assert(split_idx != 0);
+
+  static const PARTITION_TYPE base_partitions[4] = {
+    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+  };
+
+  return base_partitions[split_idx];
+}
+
+// Stores 'sb_size' in the sequence header together with the derived
+// superblock width in mi units (mib_size) and its log2 (mib_size_log2).
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+                               BLOCK_SIZE sb_size) {
+  seq_params->sb_size = sb_size;
+  // The derived fields are looked up from the value just stored.
+  const BLOCK_SIZE stored_sb_size = seq_params->sb_size;
+  seq_params->mib_size = mi_size_wide[stored_sb_size];
+  seq_params->mib_size_log2 = mi_size_wide_log2[stored_sb_size];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at
+// the upscaled resolution.
+// With segmentation enabled every segment must be lossless; without it only
+// xd->lossless[0] is consulted.
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  if (!cm->seg.enabled) return xd->lossless[0];
+
+  for (int seg = 0; seg < MAX_SEGMENTS; ++seg) {
+    if (!xd->lossless[seg]) return 0;
+  }
+  return 1;
+}
+
+// Returns nonzero when 'seq_level_idx' is SEQ_LEVEL_MAX, or is below
+// SEQ_LEVELS and is not one of the levels excluded below. Levels 7.x and 8.x
+// are excluded only when CONFIG_CWG_C013 is off.
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+ return seq_level_idx == SEQ_LEVEL_MAX ||
+ (seq_level_idx < SEQ_LEVELS &&
+ // The following levels are currently undefined.
+ seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
+ seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
+ seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3
+#if !CONFIG_CWG_C013
+ && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
+ seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 &&
+ seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 &&
+ seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3
+#endif
+ );
+}
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 0000000000..8d69efcd2d
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,1841 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // stage 1 scatters input into output: no aliasing
+ const int32_t size = 4;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;  // indexes stage_range[]; incremented before each stage
+ int32_t *bf0, *bf1;  // bf0: buffer a stage reads; bf1: buffer it writes
+ int32_t step[4];  // scratch; stages alternate between step[] and output[]
+
+ // stage 0: the input as given. 4-point inverse DCT (per the function name).
+
+ // stage 1: copy input into output in the permuted order stage 2 consumes.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[2];
+ bf1[2] = input[1];
+ bf1[3] = input[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: two-input weighted butterflies via half_btf() (defined elsewhere).
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: add/subtract butterflies through clamp_value(); result in output.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+}
+
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // stage 1 scatters input into output: no aliasing
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;  // indexes stage_range[]; incremented before each stage
+ int32_t *bf0, *bf1;  // bf0: buffer a stage reads; bf1: buffer it writes
+ int32_t step[8];  // scratch; stages alternate between step[] and output[]
+
+ // stage 0: the input as given. 8-point inverse DCT (per the function name).
+
+ // stage 1: copy input into output in the permuted order stage 2 consumes.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[4];
+ bf1[2] = input[2];
+ bf1[3] = input[6];
+ bf1[4] = input[1];
+ bf1[5] = input[5];
+ bf1[6] = input[3];
+ bf1[7] = input[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: entries 0..3 pass through; 4..7 go through half_btf() butterflies.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: half_btf() on entries 0..3; clamped add/subtract on entries 4..7.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: clamped add/subtract on 0..3; cospi[32] butterfly on 5 and 6.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: final mirrored add/subtract (i with 7 - i); result in output.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+}
+
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // stage 1 scatters input into output: no aliasing
+ const int32_t size = 16;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;  // indexes stage_range[]; incremented before each stage
+ int32_t *bf0, *bf1;  // bf0: buffer a stage reads; bf1: buffer it writes
+ int32_t step[16];  // scratch; stages alternate between step[] and output[]
+
+ // stage 0: the input as given. 16-point inverse DCT (per the function name).
+
+ // stage 1: copy input into output in the permuted order stage 2 consumes.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[8];
+ bf1[2] = input[4];
+ bf1[3] = input[12];
+ bf1[4] = input[2];
+ bf1[5] = input[10];
+ bf1[6] = input[6];
+ bf1[7] = input[14];
+ bf1[8] = input[1];
+ bf1[9] = input[9];
+ bf1[10] = input[5];
+ bf1[11] = input[13];
+ bf1[12] = input[3];
+ bf1[13] = input[11];
+ bf1[14] = input[7];
+ bf1[15] = input[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: entries 0..7 pass through; 8..15 go through half_btf() butterflies.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: final mirrored add/subtract (i with 15 - i); result in output.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+}
+
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // stage 1 scatters input into output: no aliasing
+ const int32_t size = 32;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;  // indexes stage_range[]; incremented before each stage
+ int32_t *bf0, *bf1;  // bf0: buffer a stage reads; bf1: buffer it writes
+ int32_t step[32];  // scratch; stages alternate between step[] and output[]
+
+ // stage 0: the input as given. 32-point inverse DCT (per the function name).
+
+ // stage 1: copy input into output in the permuted order stage 2 consumes.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[16];
+ bf1[2] = input[8];
+ bf1[3] = input[24];
+ bf1[4] = input[4];
+ bf1[5] = input[20];
+ bf1[6] = input[12];
+ bf1[7] = input[28];
+ bf1[8] = input[2];
+ bf1[9] = input[18];
+ bf1[10] = input[10];
+ bf1[11] = input[26];
+ bf1[12] = input[6];
+ bf1[13] = input[22];
+ bf1[14] = input[14];
+ bf1[15] = input[30];
+ bf1[16] = input[1];
+ bf1[17] = input[17];
+ bf1[18] = input[9];
+ bf1[19] = input[25];
+ bf1[20] = input[5];
+ bf1[21] = input[21];
+ bf1[22] = input[13];
+ bf1[23] = input[29];
+ bf1[24] = input[3];
+ bf1[25] = input[19];
+ bf1[26] = input[11];
+ bf1[27] = input[27];
+ bf1[28] = input[7];
+ bf1[29] = input[23];
+ bf1[30] = input[15];
+ bf1[31] = input[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: entries 0..15 pass through; 16..31 go through half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9: final mirrored add/subtract (i with 31 - i); result in output.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+}
+
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;  // 4-point inverse ADST (per the function name)
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ int32_t x0 = input[0];  // all four inputs are loaded before output is written
+ int32_t x1 = input[1];
+ int32_t x2 = input[2];
+ int32_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {  // all-zero input: emit zeros and return early
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ assert(sinpi[1] + sinpi[2] == sinpi[4]);  // sanity check on the sinpi table
+
+ // stage 1: sinpi products, not yet shifted down (see round_shift at the end)
+ s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
+ s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
+ s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
+ s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
+ s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
+ s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
+ s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
+
+ // stage 2
+ // NOTICE: (x0 - x2) here may use one extra bit compared to the
+ // opt_range_row/col specified in av1_gen_inv_stage_range()
+ s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
+
+ // stage 3: s3 takes over the old s2 before s2 is overwritten (order matters)
+ s0 = range_check_value(s0 + s3, stage_range[3] + bit);
+ s1 = range_check_value(s1 - s4, stage_range[3] + bit);
+ s3 = range_check_value(s2, stage_range[3] + bit);
+ s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
+
+ // stage 4
+ s0 = range_check_value(s0 + s5, stage_range[4] + bit);
+ s1 = range_check_value(s1 - s6, stage_range[4] + bit);
+
+ // stage 5: x0..x3 are reused to hold the still-unshifted results
+ x0 = range_check_value(s0 + s3, stage_range[5] + bit);
+ x1 = range_check_value(s1 + s3, stage_range[5] + bit);
+ x2 = range_check_value(s2, stage_range[5] + bit);
+ x3 = range_check_value(s0 + s1, stage_range[5] + bit);
+
+ // stage 6
+ x3 = range_check_value(x3 - s3, stage_range[6] + bit);
+
+ output[0] = round_shift(x0, bit);  // results go through round_shift by bit
+ output[1] = round_shift(x1, bit);
+ output[2] = round_shift(x2, bit);
+ output[3] = round_shift(x3, bit);
+}
+
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // stage 1 scatters input into output: no aliasing
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;  // indexes stage_range[]; incremented before each stage
+ int32_t *bf0, *bf1;  // bf0: buffer a stage reads; bf1: buffer it writes
+ int32_t step[8];  // scratch; stages alternate between step[] and output[]
+
+ // stage 0: the input as given. 8-point inverse ADST (per the function name).
+
+ // stage 1: copy input into output in the permuted order stage 2 consumes.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[7];
+ bf1[1] = input[0];
+ bf1[2] = input[5];
+ bf1[3] = input[2];
+ bf1[4] = input[3];
+ bf1[5] = input[4];
+ bf1[6] = input[1];
+ bf1[7] = input[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: half_btf() butterflies on adjacent pairs (0,1), (2,3), (4,5), (6,7).
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: clamped add/subtract pairing entry i with entry i + 4.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: entries 0..3 pass through; 4..7 go through half_btf() butterflies.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: clamped add/subtract pairing entry i with entry i + 2.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: cospi[32] butterflies on pairs (2,3) and (6,7); the rest pass.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: final reorder, odd outputs negated; stage is not used from here on.
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[4];
+ bf1[2] = bf0[6];
+ bf1[3] = -bf0[2];
+ bf1[4] = bf0[3];
+ bf1[5] = -bf0[7];
+ bf1[6] = bf0[5];
+ bf1[7] = -bf0[1];
+}
+
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);  // output is written while input is still being read, so the two must not alias
+ const int32_t size = 16;  // element count handed to av1_range_check_buf()
+ const int32_t *cospi = cospi_arr(cos_bit);  // weight table obtained from cos_bit
+
+ int32_t stage = 0;  // incremented before stages 1..8; indexes stage_range[]
+ int32_t *bf0, *bf1;  // bf0: buffer read by the current stage, bf1: buffer it writes
+ int32_t step[16];  // scratch buffer; stages alternate between step and output
+
+ /* Overview. The name suggests the 16-point inverse ADST (NOTE(review):
+  * confirm against the transform dispatch tables). The data flow below is:
+  *  - stage 1 copies input into output in a permuted order;
+  *  - even stages 2/4/6/8 read output and write step, combining element
+  *    pairs through half_btf() with cospi[] weights;
+  *  - odd stages 3/5/7 read step and write output, forming sums and
+  *    differences passed through clamp_value();
+  *  - stage 9 reads step and writes the final, permuted and partly negated,
+  *    result into output.
+  * stage_range[] is read at indices 1..8: by av1_range_check_buf() after
+  * each of those stages and by clamp_value() in stages 3, 5 and 7.
+  * NOTE(review): half_btf(), clamp_value() and av1_range_check_buf() are
+  * defined outside this block; half_btf(w0, x0, w1, x1, cos_bit) presumably
+  * yields the cos_bit-rounded value of w0 * x0 + w1 * x1 -- confirm.
+  *
+  * stage 0: nothing to compute. */
+
+ /* stage 1: load input into output. Even output slots take the odd-indexed
+  * inputs in descending order (15, 13, ..., 1); odd output slots take the
+  * even-indexed inputs in ascending order (0, 2, ..., 14). */
+ stage++;
+ bf1 = output;
+ bf1[0] = input[15];
+ bf1[1] = input[0];
+ bf1[2] = input[13];
+ bf1[3] = input[2];
+ bf1[4] = input[11];
+ bf1[5] = input[4];
+ bf1[6] = input[9];
+ bf1[7] = input[6];
+ bf1[8] = input[7];
+ bf1[9] = input[8];
+ bf1[10] = input[5];
+ bf1[11] = input[10];
+ bf1[12] = input[3];
+ bf1[13] = input[12];
+ bf1[14] = input[1];
+ bf1[15] = input[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 2 (output -> step): every adjacent pair (2k, 2k+1) is combined by
+  * two half_btf() calls. The two cospi[] indices used for a pair always sum
+  * to 64, and the second call swaps the weights and negates one of them. */
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 3 (step -> output): sums and differences of elements 8 apart.
+  * Slots 0..7 get bf0[i] + bf0[i + 8]; slots 8..15 get bf0[i] - bf0[i + 8].
+  * Each result goes through clamp_value() with this stage's range. */
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 4 (output -> step): slots 0..7 are copied unchanged; slots 8..15
+  * are combined pairwise by half_btf() using cospi[8]/cospi[56] and
+  * cospi[40]/cospi[24]. */
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 5 (step -> output): within each group of 8 slots, clamped sums and
+  * differences of elements 4 apart. */
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 6 (output -> step): within each group of 8 slots, the first four
+  * are copied unchanged and the last four are combined pairwise by
+  * half_btf() using cospi[16]/cospi[48]. */
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 7 (step -> output): within each group of 4 slots, clamped sums and
+  * differences of elements 2 apart. */
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 8 (output -> step): within each group of 4 slots, the first two
+  * are copied unchanged and the last two are combined by half_btf() with
+  * cospi[32] as the only weight (sum form, then difference form). */
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ /* stage 9 (step -> output): final reordering. Every odd output slot
+  * receives a negated value, every even slot a plain copy. Unlike the
+  * earlier stages, `stage` is not incremented and no range check is run. */
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[8];
+ bf1[2] = bf0[12];
+ bf1[3] = -bf0[4];
+ bf1[4] = bf0[6];
+ bf1[5] = -bf0[14];
+ bf1[6] = bf0[10];
+ bf1[7] = -bf0[2];
+ bf1[8] = bf0[3];
+ bf1[9] = -bf0[11];
+ bf1[10] = bf0[15];
+ bf1[11] = -bf0[7];
+ bf1[12] = bf0[5];
+ bf1[13] = -bf0[13];
+ bf1[14] = bf0[9];
+ bf1[15] = -bf0[1];
+}
+
+// 4-sample identity stage: each input is multiplied by NewSqrt2 in 64-bit
+// arithmetic and the product is handed to round_shift() with NewSqrt2Bits.
+// cos_bit is ignored; stage_range is only inspected by the trailing assert.
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  int idx = 0;
+  while (idx < 4) {
+    const int64_t scaled = (int64_t)NewSqrt2 * input[idx];
+    output[idx] = round_shift(scaled, NewSqrt2Bits);
+    ++idx;
+  }
+  // Debug-only check on the declared input range plus the scaling bits.
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+// 8-sample identity stage: writes twice each input sample to output. The
+// product is formed in 64 bits and then narrowed to int32_t. cos_bit and
+// stage_range are unused.
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  for (int idx = 0; idx != 8; idx++) {
+    const int64_t doubled = (int64_t)input[idx] * 2;
+    output[idx] = (int32_t)doubled;
+  }
+}
+
+// 16-sample identity stage: each input is multiplied by 2 * NewSqrt2 in
+// 64-bit arithmetic and the product is handed to round_shift() with
+// NewSqrt2Bits. cos_bit is ignored; stage_range is only inspected by the
+// trailing assert.
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  const int64_t gain = (int64_t)NewSqrt2 * 2;
+  for (int idx = 0; idx != 16; idx++) {
+    output[idx] = round_shift(gain * input[idx], NewSqrt2Bits);
+  }
+  // Debug-only check on the declared input range plus the scaling bits.
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+// 32-sample identity stage: writes four times each input sample to output.
+// The product is formed in 64 bits and then narrowed to int32_t. cos_bit and
+// stage_range are unused.
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  int idx = 0;
+  while (idx < 32) {
+    const int64_t quadrupled = (int64_t)input[idx] * 4;
+    output[idx] = (int32_t)quadrupled;
+    ++idx;
+  }
+}
+
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 64;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[32];
+ bf1[2] = input[16];
+ bf1[3] = input[48];
+ bf1[4] = input[8];
+ bf1[5] = input[40];
+ bf1[6] = input[24];
+ bf1[7] = input[56];
+ bf1[8] = input[4];
+ bf1[9] = input[36];
+ bf1[10] = input[20];
+ bf1[11] = input[52];
+ bf1[12] = input[12];
+ bf1[13] = input[44];
+ bf1[14] = input[28];
+ bf1[15] = input[60];
+ bf1[16] = input[2];
+ bf1[17] = input[34];
+ bf1[18] = input[18];
+ bf1[19] = input[50];
+ bf1[20] = input[10];
+ bf1[21] = input[42];
+ bf1[22] = input[26];
+ bf1[23] = input[58];
+ bf1[24] = input[6];
+ bf1[25] = input[38];
+ bf1[26] = input[22];
+ bf1[27] = input[54];
+ bf1[28] = input[14];
+ bf1[29] = input[46];
+ bf1[30] = input[30];
+ bf1[31] = input[62];
+ bf1[32] = input[1];
+ bf1[33] = input[33];
+ bf1[34] = input[17];
+ bf1[35] = input[49];
+ bf1[36] = input[9];
+ bf1[37] = input[41];
+ bf1[38] = input[25];
+ bf1[39] = input[57];
+ bf1[40] = input[5];
+ bf1[41] = input[37];
+ bf1[42] = input[21];
+ bf1[43] = input[53];
+ bf1[44] = input[13];
+ bf1[45] = input[45];
+ bf1[46] = input[29];
+ bf1[47] = input[61];
+ bf1[48] = input[3];
+ bf1[49] = input[35];
+ bf1[50] = input[19];
+ bf1[51] = input[51];
+ bf1[52] = input[11];
+ bf1[53] = input[43];
+ bf1[54] = input[27];
+ bf1[55] = input[59];
+ bf1[56] = input[7];
+ bf1[57] = input[39];
+ bf1[58] = input[23];
+ bf1[59] = input[55];
+ bf1[60] = input[15];
+ bf1[61] = input[47];
+ bf1[62] = input[31];
+ bf1[63] = input[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
+ bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
+ bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
+ bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
+ bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
+ bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
+ bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
+ bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
+ bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
+ bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
+ bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
+ bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
+ bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
+ bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
+ bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
+ bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
+ bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
+ bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
+}
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
new file mode 100644
index 0000000000..e1d5d98d10
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Saturates |value| to the range of a signed |bit|-bit integer, i.e.
+// [-2^(bit-1), 2^(bit-1) - 1]. A non-positive |bit| is treated as an invalid
+// clamp width and |value| is returned unchanged.
+static INLINE int32_t clamp_value(int32_t value, int8_t bit) {
+  if (bit > 0) {
+    // Bounds are built in 64 bits so the shift itself cannot overflow int.
+    const int64_t upper = (1LL << (bit - 1)) - 1;
+    const int64_t lower = -upper - 1;
+    return (int32_t)clamp64(value, lower, upper);
+  }
+  return value;
+}
+
+// Applies clamp_value() with width |bit| to the first |size| entries of |buf|,
+// in place. Nothing is touched when |size| is zero or negative.
+static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
+  int idx = 0;
+  while (idx < size) {
+    buf[idx] = clamp_value(buf[idx], bit);
+    ++idx;
+  }
+}
+
+// 1-D inverse transform kernels (DCT, ADST and identity variants, per their
+// names). Every kernel takes the same four arguments:
+//   input       - coefficients to transform (const; not written through)
+//   output      - buffer that receives the transformed values
+//   cos_bit     - presumably the fixed-point precision used by the butterfly
+//                 multiplies; confirm against the definitions
+//   stage_range - array with one entry per transform stage; presumably the
+//                 bit width each stage's intermediate values are clamped to
+//                 (confirm against the definitions)
+// NOTE(review): the number of elements read from |input| and written to
+// |output| is presumably the size in the kernel's name - confirm.
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
new file mode 100644
index 0000000000..b4f7801295
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#include "av1/common/av1_inv_txfm1d.h"
+
+// Starting bit-range offset for the inverse transform, one entry per TX_SIZE
+// (the sum of fwd_shift_## for that size). av1_gen_inv_stage_range() adds the
+// selected entry to bd + 1 when computing the range it asserts against.
+static const int8_t inv_start_range[TX_SIZES_ALL] = {
+ 5, // 4x4 transform
+ 6, // 8x8 transform
+ 7, // 16x16 transform
+ 7, // 32x32 transform
+ 7, // 64x64 transform
+ 5, // 4x8 transform
+ 5, // 8x4 transform
+ 6, // 8x16 transform
+ 6, // 16x8 transform
+ 6, // 16x32 transform
+ 6, // 32x16 transform
+ 6, // 32x64 transform
+ 6, // 64x32 transform
+ 6, // 4x16 transform
+ 6, // 16x4 transform
+ 7, // 8x32 transform
+ 7, // 32x8 transform
+ 7, // 16x64 transform
+ 7, // 64x16 transform
+};
+
+extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL];
+
+// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12
+// for each valid row and col combination
+#define INV_COS_BIT 12
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c
new file mode 100644
index 0000000000..ee67dffe23
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm2d.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+
+// Inverse 4x4 Walsh-Hadamard transform with all 16 coefficients, added to a
+// high-bitdepth destination.
+//
+// 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+// 0.5 shifts per pixel.
+//
+//   input  - 16 coefficients; pass 1 reads input[k], input[4 + k],
+//            input[8 + k] and input[12 + k] for k = 0..3
+//   dest8  - destination, converted with CONVERT_TO_SHORTPTR() to uint16_t
+//   stride - distance in uint16_t elements between destination rows
+//   bd     - bit depth handed to the range checks and pixel clipping
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t stage1[16];
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dest8);
+
+  // Pass 1: one lifting butterfly per k, taking inputs 4 apart and storing
+  // results 4 apart in the intermediate buffer.
+  for (int k = 0; k < 4; ++k) {
+    tran_low_t a = input[4 * 0 + k] >> UNIT_QUANT_SHIFT;
+    tran_low_t c = input[4 * 1 + k] >> UNIT_QUANT_SHIFT;
+    tran_low_t d = input[4 * 2 + k] >> UNIT_QUANT_SHIFT;
+    tran_low_t b = input[4 * 3 + k] >> UNIT_QUANT_SHIFT;
+
+    // The order of these lifting steps is significant.
+    a += c;
+    d -= b;
+    const tran_low_t e = (a - d) >> 1;
+    b = e - b;
+    c = e - c;
+    a -= b;
+    d += c;
+
+    stage1[4 * 0 + k] = a;
+    stage1[4 * 1 + k] = b;
+    stage1[4 * 2 + k] = c;
+    stage1[4 * 3 + k] = d;
+  }
+
+  // Pass 2: one butterfly per group of four consecutive intermediates; the
+  // four results go to destination rows 0..3 at horizontal offset k.
+  for (int k = 0; k < 4; ++k) {
+    const tran_low_t *const src = &stage1[4 * k];
+    uint16_t *const out = dst + k;
+    tran_low_t a = src[0];
+    tran_low_t c = src[1];
+    tran_low_t d = src[2];
+    tran_low_t b = src[3];
+
+    a += c;
+    d -= b;
+    const tran_low_t e = (a - d) >> 1;
+    b = e - b;
+    c = e - c;
+    a -= b;
+    d += c;
+
+    range_check_value(a, bd + 1);
+    range_check_value(b, bd + 1);
+    range_check_value(c, bd + 1);
+    range_check_value(d, bd + 1);
+
+    out[stride * 0] = highbd_clip_pixel_add(out[stride * 0], a, bd);
+    out[stride * 1] = highbd_clip_pixel_add(out[stride * 1], b, bd);
+    out[stride * 2] = highbd_clip_pixel_add(out[stride * 2], c, bd);
+    out[stride * 3] = highbd_clip_pixel_add(out[stride * 3], d, bd);
+  }
+}
+
+// Inverse 4x4 Walsh-Hadamard transform for a block where only in[0] is read,
+// added to a high-bitdepth destination.
+//
+//   in          - coefficient buffer; only in[0] is used
+//   dest8       - destination, converted with CONVERT_TO_SHORTPTR()
+//   dest_stride - distance in uint16_t elements between destination rows
+//   bd          - bit depth used when clipping the reconstructed pixels
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  int i;
+  tran_low_t a1, e1;
+  tran_low_t tmp[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First pass: split the scaled coefficient into (a1 - e1, e1, e1, e1).
+  a1 = in[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  tmp[0] = a1;
+  tmp[1] = tmp[2] = tmp[3] = e1;
+
+  // Second pass: split each intermediate the same way; row 0 receives the
+  // remainder term and rows 1..3 receive the halved term.
+  for (i = 0; i < 4; i++) {
+    e1 = tmp[i] >> 1;
+    a1 = tmp[i] - e1;
+    dest[dest_stride * 0] =
+        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] =
+        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] =
+        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] =
+        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+    dest++;
+  }
+}
+
+// Maps a 1-D transform type to the inverse kernel that implements it.
+// Types without an entry trip assert(0) and yield NULL.
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  TxfmFunc kernel = NULL;
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4: kernel = av1_idct4; break;
+    case TXFM_TYPE_DCT8: kernel = av1_idct8; break;
+    case TXFM_TYPE_DCT16: kernel = av1_idct16; break;
+    case TXFM_TYPE_DCT32: kernel = av1_idct32; break;
+    case TXFM_TYPE_DCT64: kernel = av1_idct64; break;
+    case TXFM_TYPE_ADST4: kernel = av1_iadst4; break;
+    case TXFM_TYPE_ADST8: kernel = av1_iadst8; break;
+    case TXFM_TYPE_ADST16: kernel = av1_iadst16; break;
+    case TXFM_TYPE_IDENTITY4: kernel = av1_iidentity4_c; break;
+    case TXFM_TYPE_IDENTITY8: kernel = av1_iidentity8_c; break;
+    case TXFM_TYPE_IDENTITY16: kernel = av1_iidentity16_c; break;
+    case TXFM_TYPE_IDENTITY32: kernel = av1_iidentity32_c; break;
+    default: assert(0); break;
+  }
+  return kernel;
+}
+
+// Inverse-transform shift pairs, one per transform size. In
+// inv_txfm2d_add_c() entry [0] is negated and applied with
+// av1_round_shift_array() after each row transform, and entry [1] is applied
+// the same way after each column transform.
+static const int8_t inv_shift_4x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x16[2] = { -2, -4 };
+static const int8_t inv_shift_32x32[2] = { -2, -4 };
+static const int8_t inv_shift_64x64[2] = { -2, -4 };
+static const int8_t inv_shift_4x8[2] = { 0, -4 };
+static const int8_t inv_shift_8x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x32[2] = { -1, -4 };
+static const int8_t inv_shift_32x16[2] = { -1, -4 };
+static const int8_t inv_shift_32x64[2] = { -1, -4 };
+static const int8_t inv_shift_64x32[2] = { -1, -4 };
+static const int8_t inv_shift_4x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x4[2] = { -1, -4 };
+static const int8_t inv_shift_8x32[2] = { -2, -4 };
+static const int8_t inv_shift_32x8[2] = { -2, -4 };
+static const int8_t inv_shift_16x64[2] = { -2, -4 };
+static const int8_t inv_shift_64x16[2] = { -2, -4 };
+
+// Shift pairs indexed by TX_SIZE; av1_get_inv_txfm_cfg() stores the selected
+// pair in cfg->shift.
+const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = {
+ inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
+ inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16,
+ inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
+ inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32,
+ inv_shift_32x8, inv_shift_16x64, inv_shift_64x16,
+};
+
+// Stage-range template copied into cfg->stage_range_col / stage_range_row by
+// av1_get_inv_txfm_cfg() when the 1-D type is TXFM_TYPE_ADST4. Only stage 1
+// is non-zero (the extra bit ADST4 may need at that stage).
+static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
+
+// Fills |cfg| with the inverse 2-D transform configuration for the given
+// |tx_type| / |tx_size| pair: flip flags, shift pair, cosine precision, and
+// the 1-D transform type, stage count and stage-range template for both the
+// column and the row direction.
+//
+// |cfg| must be non-NULL (asserted).
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+
+  cfg->tx_size = tx_size;
+  // Stage ranges default to zero; only ADST4 overrides them below.
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+  set_flip_cfg(tx_type, cfg);
+
+  cfg->shift = av1_inv_txfm_shift_ls[tx_size];
+  cfg->cos_bit_col = INV_COS_BIT;
+  cfg->cos_bit_row = INV_COS_BIT;
+
+  // Column direction: vertical 1-D type, looked up with the height index.
+  const TX_TYPE_1D vert_type = vtx_tab[tx_type];
+  const int height_idx = get_txh_idx(tx_size);
+  cfg->txfm_type_col = av1_txfm_type_ls[height_idx][vert_type];
+  if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
+  }
+
+  // Row direction: horizontal 1-D type, looked up with the width index.
+  const TX_TYPE_1D horz_type = htx_tab[tx_type];
+  const int width_idx = get_txw_idx(tx_size);
+  cfg->txfm_type_row = av1_txfm_type_ls[width_idx][horz_type];
+  if (cfg->txfm_type_row == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
+  }
+
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+}
+
+// Writes the per-stage clamp ranges for the row and column passes of an
+// inverse 2-D transform.
+//
+// Every stage gets a fixed range chosen from the bit depth |bd|
+// (8 -> row 16 / col 16, 10 -> row 18 / col 16, otherwise 12 is asserted and
+// row 20 / col 18 is used). In debug builds the routine also asserts that
+// this range covers the "real" range derived from cfg's stage ranges, the
+// forward shift for |tx_size|, and (for columns) shift[0].
+//
+// At most MAX_TXFM_STAGE_NUM entries of each output array are written.
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+                             const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+                             int bd) {
+  const int fwd_shift = inv_start_range[tx_size];
+  const int8_t *shift = cfg->shift;
+  int8_t opt_range_row, opt_range_col;
+
+  switch (bd) {
+    case 8:
+      opt_range_row = 16;
+      opt_range_col = 16;
+      break;
+    case 10:
+      opt_range_row = 18;
+      opt_range_col = 16;
+      break;
+    default:
+      assert(bd == 12);
+      opt_range_row = 20;
+      opt_range_col = 18;
+      break;
+  }
+
+  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+    const int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
+    // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+    // so opt_range_row >= real_range_row will not hold there
+    const int adst4_stage1 = cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1;
+    (void)real_range_row;
+    (void)adst4_stage1;
+    assert(adst4_stage1 || opt_range_row >= real_range_row);
+    stage_range_row[i] = opt_range_row;
+  }
+
+  // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+    const int real_range_col =
+        cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
+    // the adst4 may use 1 extra bit on top of opt_range_col at stage 1
+    // so opt_range_col >= real_range_col will not hold there
+    const int adst4_stage1 = cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1;
+    (void)real_range_col;
+    (void)adst4_stage1;
+    assert(adst4_stage1 || opt_range_col >= real_range_col);
+    stage_range_col[i] = opt_range_col;
+  }
+}
+
+// Generic C inverse 2-D transform: runs the row kernel over every row of the
+// coefficient block, then the column kernel over every column, and adds the
+// result to |output| with clipping to |bd| bits.
+//
+//   input    - coefficients, read as input[c * height + r]
+//   output   - destination pixels, updated in place
+//   stride   - distance in elements between rows of |output|
+//   cfg      - configuration from av1_get_inv_txfm_cfg()
+//   txfm_buf - scratch of height * width + 2 * AOMMAX(height, width) entries
+//   tx_size  - forwarded to av1_gen_inv_stage_range()
+//   bd       - bit depth
+static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
+                                    int stride, TXFM_2D_FLIP_CFG *cfg,
+                                    int32_t *txfm_buf, TX_SIZE tx_size,
+                                    int bd) {
+  const int width = tx_size_wide[cfg->tx_size];
+  const int height = tx_size_high[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+
+  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+  assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+  assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+  av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
+
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
+
+  // Scratch layout: [line in | line out | height * width intermediates],
+  // where each line buffer holds AOMMAX(height, width) entries.
+  const int line_len = AOMMAX(height, width);
+  int32_t *const temp_in = txfm_buf;
+  int32_t *const temp_out = temp_in + line_len;
+  int32_t *const buf = temp_out + line_len;
+
+  // Rows. Blocks whose log ratio has magnitude 1 are pre-scaled by
+  // NewInvSqrt2 (with NewSqrt2Bits of rounding shift).
+  const int prescale = abs(rect_type) == 1;
+  for (int r = 0; r < height; ++r) {
+    int32_t *const row_out = buf + r * width;
+    for (int c = 0; c < width; ++c) {
+      const int32_t coeff = input[c * height + r];
+      if (prescale) {
+        temp_in[c] = round_shift((int64_t)coeff * NewInvSqrt2, NewSqrt2Bits);
+      } else {
+        temp_in[c] = coeff;
+      }
+    }
+    clamp_buf(temp_in, width, bd + 8);
+    txfm_func_row(temp_in, row_out, cos_bit_row, stage_range_row);
+    av1_round_shift_array(row_out, width, -shift[0]);
+  }
+
+  // Columns. lr_flip selects the mirrored source column; ud_flip selects the
+  // mirrored entry of the transformed column when writing out.
+  for (int c = 0; c < width; ++c) {
+    const int src_col = (cfg->lr_flip == 0) ? c : (width - c - 1);
+    for (int r = 0; r < height; ++r) temp_in[r] = buf[r * width + src_col];
+
+    clamp_buf(temp_in, height, AOMMAX(bd + 6, 16));
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    av1_round_shift_array(temp_out, height, -shift[1]);
+
+    for (int r = 0; r < height; ++r) {
+      const int src_row = (cfg->ud_flip == 0) ? r : (height - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_row], bd);
+    }
+  }
+}
+
+// Builds the inverse-transform configuration for |tx_type| / |tx_size| and
+// runs inv_txfm2d_add_c() with it. |txfm_buf| is caller-provided scratch.
+static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
+                                         int stride, int32_t *txfm_buf,
+                                         TX_TYPE tx_type, TX_SIZE tx_size,
+                                         int bd) {
+  TXFM_2D_FLIP_CFG flip_cfg;
+  av1_get_inv_txfm_cfg(tx_type, tx_size, &flip_cfg);
+  inv_txfm2d_add_c(input, output, stride, &flip_cfg, txfm_buf, tx_size, bd);
+}
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);  // scratch: 4*8 intermediate block + two 8-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);  // TX_4X8 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);  // scratch: 8*4 intermediate block + two 8-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);  // TX_8X4 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]);  // scratch: 8*16 intermediate block + two 16-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);  // TX_8X16 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);  // scratch: 16*8 intermediate block + two 16-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);  // TX_16X8 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]);  // scratch: 16*32 intermediate block + two 32-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);  // TX_16X32 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);  // scratch: 32*16 intermediate block + two 32-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);  // TX_32X16 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]);  // scratch: 4*4 intermediate block + two 4-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);  // TX_4X4 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]);  // scratch: 8*8 intermediate block + two 8-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);  // TX_8X8 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]);  // scratch: 16*16 intermediate block + two 16-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);  // TX_16X16 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);  // scratch: 32*32 intermediate block + two 32-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);  // TX_32X32 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // TODO(urvang): Can the same array be reused, instead of using a new array?
+ // Remap 32x32 input into a modified 64x64 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 64];  // 16 KiB of automatic storage, in addition to txfm_buf below
+ for (int col = 0; col < 32; ++col) {
+ memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input));  // first 32 entries of each 64-entry line come from input
+ memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input));  // remaining 32 entries of that line are zero
+ }
+ memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));  // lines 32..63 are entirely zero
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);  // scratch: 64*64 intermediate block + two 64-entry line buffers
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 64x32 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[32 * 64];
+ memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));  // the 32*32 coded values occupy the first half contiguously
+ memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));  // second half is zero
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);  // scratch: 64*32 intermediate block + two 64-entry line buffers
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
+ bd);
+}
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 32x64 input by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 32];
+ for (int col = 0; col < 32; ++col) {
+ memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input));  // first 32 entries of each 64-entry line come from input
+ memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input));  // remaining 32 entries of that line are zero
+ }
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);  // scratch: 64*32 intermediate block + two 64-entry line buffers
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 16x32 input into a modified 16x64 input by:
+ // - Copying over these values in top-left 16x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 16];
+ for (int col = 0; col < 16; ++col) {
+ memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input));  // first 32 entries of each 64-entry line come from input
+ memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input));  // remaining 32 entries of that line are zero
+ }
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);  // scratch: 16*64 intermediate block + two 64-entry line buffers
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x16 input into a modified 64x16 by:
+ // - Copying over these values in top-left 32x16 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[16 * 64];
+ memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));  // the 16*32 coded values occupy the first half contiguously
+ memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));  // second half is zero
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);  // scratch: 16*64 intermediate block + two 64-entry line buffers
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
+ bd);
+}
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);  // scratch: 4*16 intermediate block + two 16-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd);  // TX_4X16 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);  // scratch: 4*16 intermediate block + two 16-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);  // TX_16X4 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);  // scratch: 8*32 intermediate block + two 32-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd);  // TX_8X32 instance of the shared facade
+}
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);  // scratch: 8*32 intermediate block + two 32-entry line buffers
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);  // TX_32X8 instance of the shared facade
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
new file mode 100644
index 0000000000..5af025c654
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -0,0 +1,2099 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+enum {  // NOTE(review): presumably selects how many adjacent edges one filter call covers — confirm against users
+ USE_SINGLE,
+ USE_DUAL,
+ USE_QUAD,
+} UENUM1BYTE(USE_FILTER_TYPE);
+
+static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {  // [plane][edge direction] -> segment feature id for the loop-filter level
+ { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
+ { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
+ { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
+};
+
+static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },  // [plane][edge direction] -> index into mbmi->delta_lf[]
+ { 2, 2 },
+ { 3, 3 } };
+
+static const int mode_lf_lut[] = {  // prediction mode -> index into lf.mode_deltas[] / last dimension of lfi->lvl
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
+ 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0)
+ 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
+};
+
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+ int lvl;
+
+ // For every filter level 0..MAX_LOOP_FILTER, fill lfthr[lvl].lim and lfthr[lvl].mblim
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
+ // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));  // shift is 0, 1 or 2
+
+ if (sharpness_lvl > 0) {
+ if (block_inside_limit > (9 - sharpness_lvl))  // cap at 9 - sharpness_lvl
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1) block_inside_limit = 1;  // lower bound of 1
+
+ memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);  // same byte replicated SIMD_WIDTH times
+ memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
+
+uint8_t av1_get_filter_level(const AV1_COMMON *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi) {
+ assert(plane >= 0 && plane <= 2);  // checked up front: plane indexes every table used below
+ const int segment_id = mbmi->segment_id;
+ if (cm->delta_q_info.delta_lf_present_flag) {  // per-block deltas: rebuild the level instead of using the LUT
+ int8_t delta_lf;
+ if (cm->delta_q_info.delta_lf_multi) {
+ const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
+ delta_lf = mbmi->delta_lf[delta_lf_idx];
+ } else {
+ delta_lf = mbmi->delta_lf_from_base;
+ }
+ int base_level;  // frame-level filter level for this plane (and direction, for luma)
+ if (plane == 0)
+ base_level = cm->lf.filter_level[dir_idx];
+ else if (plane == 1)
+ base_level = cm->lf.filter_level_u;
+ else
+ base_level = cm->lf.filter_level_v;
+ int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
+ const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
+ if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
+ const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
+ lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);  // segment data is added to the level, then clamped
+ }
+
+ if (cm->lf.mode_ref_delta_enabled) {
+ const int scale = 1 << (lvl_seg >> 5);  // deltas count double once the level reaches 32
+ lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
+ if (mbmi->ref_frame[0] > INTRA_FRAME)  // mode delta only applies to inter blocks
+ lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
+ lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
+ }
+ return lvl_seg;
+ } else {
+ return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]  // precomputed by av1_loop_filter_frame_init()
+ [mode_lf_lut[mbmi->mode]];
+ }
+}
+
+void av1_loop_filter_init(AV1_COMMON *cm) {
+ assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));  // mode_lf_lut must have exactly one entry per prediction mode
+ loop_filter_info_n *lfi = &cm->lf_info;
+ struct loopfilter *lf = &cm->lf;
+ int lvl;
+
+ // init limits for given sharpness
+ update_sharpness(lfi, lf->sharpness_level);
+
+ // init hev threshold const vectors
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);  // hev_thr = lvl / 16, replicated SIMD_WIDTH times
+}
+
+// Update the loop filter for the current frame.
+// This should be called before loop_filter_rows(),
+// av1_loop_filter_frame() calls this function directly.
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
+ int plane_end) {
+ int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
+ int plane;
+ int seg_id;
+ // Ref/mode deltas are multiplied by `scale` = 1 << (lvl_seg >> 5) below:
+ // the multiplier is 1 for when filter_lvl is between 0 and 31;
+ // 2 when filter_lvl is between 32 and 63
+ loop_filter_info_n *const lfi = &cm->lf_info;
+ struct loopfilter *const lf = &cm->lf;
+ const struct segmentation *const seg = &cm->seg;
+
+ // update sharpness limits
+ update_sharpness(lfi, lf->sharpness_level);
+
+ filt_lvl[0] = lf->filter_level[0];  // filt_lvl[] is used for dir 0
+ filt_lvl[1] = lf->filter_level_u;
+ filt_lvl[2] = lf->filter_level_v;
+
+ filt_lvl_r[0] = lf->filter_level[1];  // filt_lvl_r[] is used for dir 1; chroma shares one level
+ filt_lvl_r[1] = lf->filter_level_u;
+ filt_lvl_r[2] = lf->filter_level_v;
+
+ assert(plane_start >= AOM_PLANE_Y);
+ assert(plane_end <= MAX_MB_PLANE);
+
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])  // both luma levels zero: stop, remaining planes are not set up
+ break;
+ else if (plane == 1 && !filt_lvl[1])
+ continue;
+ else if (plane == 2 && !filt_lvl[2])
+ continue;
+
+ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+ for (int dir = 0; dir < 2; ++dir) {
+ int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
+ const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
+ if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
+ const int data = get_segdata(seg, seg_id, seg_lf_feature_id);
+ lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
+ }
+
+ if (!lf->mode_ref_delta_enabled) {
+ // we could get rid of this if we assume that deltas are set to
+ // zero when not in use; encoder always uses deltas
+ memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
+ sizeof(lfi->lvl[plane][seg_id][dir]));
+ } else {
+ int ref, mode;
+ const int scale = 1 << (lvl_seg >> 5);
+ const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+ lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =  // intra blocks get the ref delta only, stored under mode index 0
+ clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+ for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
+ lf->mode_deltas[mode] * scale;
+ lfi->lvl[plane][seg_id][dir][ref][mode] =
+ clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_FORCE_INLINE TX_SIZE
+get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi,
+ const int mi_row, const int mi_col, const int plane,
+ const int ss_x, const int ss_y) {
+ assert(mbmi != NULL);
+ if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;  // lossless segment: always TX_4X4 (xd may be NULL)
+
+ TX_SIZE tx_size = (plane == AOM_PLANE_Y)
+ ? mbmi->tx_size
+ : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y);  // chroma: derived from block size and subsampling
+ assert(tx_size < TX_SIZES_ALL);
+ if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) {  // luma inter block, not skipped: size is stored per position
+ const BLOCK_SIZE sb_type = mbmi->bsize;
+ const int blk_row = mi_row & (mi_size_high[sb_type] - 1);  // mi position inside the block (mask form assumes power-of-two size)
+ const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
+ const TX_SIZE mb_tx_size =
+ mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
+ assert(mb_tx_size < TX_SIZES_ALL);
+ tx_size = mb_tx_size;
+ }
+
+ return tx_size;
+}
+
+static const int tx_dim_to_filter_length[TX_SIZES] = { 4, 8, 14, 14, 14 };  // luma filter length, indexed by the smaller tx_size_{wide,high}_unit_log2 of the two TUs sharing the edge
+
+// Return TX_SIZE from get_transform_size(), so it is plane and direction
+// aware. Also fills params->filter_length (0 = do not filter) and, when non-zero, params->lfthr.
+static TX_SIZE set_lpf_parameters(
+ AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
+ const int plane, const struct macroblockd_plane *const plane_ptr) {
+ // reset to initial values
+ params->filter_length = 0;
+
+ // no deblocking is required
+ const uint32_t width = plane_ptr->dst.width;
+ const uint32_t height = plane_ptr->dst.height;
+ if ((width <= x) || (height <= y)) {  // (x, y) lies outside the plane
+ // just return the smallest transform unit size
+ return TX_4X4;
+ }
+
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
+ // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
+ // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
+ // and mi_col should be odd number for chroma plane.
+ const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);  // OR-ing the subsampling bit forces the odd index
+ const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
+ MB_MODE_INFO **mi =
+ cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
+ const MB_MODE_INFO *mbmi = mi[0];
+ // If current mbmi is not correctly setup, return an invalid value to stop
+ // filtering. One example is that if this tile is not coded, then its mbmi
+ // it not set up.
+ if (mbmi == NULL) return TX_INVALID;
+
+ const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
+ scale_horz, scale_vert);
+
+ {
+ const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);  // coordinate along the filtering direction
+ const uint32_t transform_masks =
+ edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+ const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);  // 1 when coord is a multiple of the TU dimension
+
+ if (!tu_edge) return ts;  // interior of a TU: filter_length stays 0
+
+ // prepare outer edge parameters. deblock the edge if it's an edge of a TU
+ {
+ const uint32_t curr_level =
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+ const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
+ uint32_t level = curr_level;
+ if (coord) {  // coord == 0 has no previous block, so that edge is never filtered
+ {
+ const MB_MODE_INFO *const mi_prev = *(mi - mode_step);  // neighbour on the other side of the edge
+ if (mi_prev == NULL) return TX_INVALID;
+ const int pv_row =
+ (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
+ const int pv_col =
+ (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
+ const TX_SIZE pv_ts = get_transform_size(
+ xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert);
+
+ const uint32_t pv_lvl =
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+
+ const int pv_skip_txfm =
+ mi_prev->skip_txfm && is_inter_block(mi_prev);
+ const BLOCK_SIZE bsize = get_plane_block_size(
+ mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int prediction_masks = edge_dir == VERT_EDGE
+ ? block_size_wide[bsize] - 1
+ : block_size_high[bsize] - 1;
+ const int32_t pu_edge = !(coord & prediction_masks);  // 1 when coord is a multiple of the prediction block dimension
+ // if the current and the previous blocks are skipped,
+ // deblock the edge if the edge belongs to a PU's edge only.
+ if ((curr_level || pv_lvl) &&
+ (!pv_skip_txfm || !curr_skipped || pu_edge)) {
+ const int dim = (VERT_EDGE == edge_dir)  // smaller of the two TU dimensions across the edge
+ ? AOMMIN(tx_size_wide_unit_log2[ts],
+ tx_size_wide_unit_log2[pv_ts])
+ : AOMMIN(tx_size_high_unit_log2[ts],
+ tx_size_high_unit_log2[pv_ts]);
+ if (plane) {
+ params->filter_length = (dim == 0) ? 4 : 6;  // chroma uses lengths 4 or 6 only
+ } else {
+ assert(dim < TX_SIZES);
+ assert(dim >= 0);
+ params->filter_length = tx_dim_to_filter_length[dim];
+ }
+
+ // use the current block's level when it is non-zero,
+ // otherwise fall back to the previous block's level
+ level = (curr_level) ? (curr_level) : (pv_lvl);
+ }
+ }
+ }
+ // prepare common parameters
+ if (params->filter_length) {
+ const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;  // thresholds for the chosen level
+ params->lfthr = limits;
+ }
+ }
+ }
+
+ return ts;
+}
+
+static const uint32_t vert_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = {  // [current TX size][previous TX size] -> luma filter length for vertical edges
+ // TX_4X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X8
+ {
+ 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+ },
+ // TX_16X16
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_32X32
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_64X64
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_4X8
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X4
+ {
+ 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+ },
+ // TX_8X16
+ {
+ 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+ },
+ // TX_16X8
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_16X32
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_32X16
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_32X64
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_64X32
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_4X16
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_16X4
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_8X32
+ {
+ 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8,
+ },
+ // TX_32X8
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_16X64
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+ // TX_64X16
+ {
+ 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14,
+ },
+};
+
+static const uint32_t horz_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = {  // [current TX size][previous TX size] -> luma filter length for horizontal edges
+ // TX_4X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X8
+ {
+ 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+ },
+ // TX_16X16
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_32X32
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_64X64
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_4X8
+ {
+ 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+ },
+ // TX_8X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X16
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_16X8
+ {
+ 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+ },
+ // TX_16X32
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_32X16
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_32X64
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_64X32
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_4X16
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_16X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X32
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_32X8
+ {
+ 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8,
+ },
+ // TX_16X64
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+ // TX_64X16
+ {
+ 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14,
+ },
+};
+
+static const uint32_t vert_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = {  // [current TX size][previous TX size] -> chroma filter length for vertical edges
+ // TX_4X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X8
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_16X16
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_32X32
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_64X64
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_4X8
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X4
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_8X16
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_16X8
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_16X32
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_32X16
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_32X64
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_64X32
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_4X16
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_16X4
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_8X32
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_32X8
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_16X64
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+ // TX_64X16
+ {
+ 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6,
+ },
+};
+
+static const uint32_t horz_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = {  // [current TX size][previous TX size] -> chroma filter length for horizontal edges
+ // TX_4X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X8
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_16X16
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_32X32
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_64X64
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_4X8
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_8X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X16
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_16X8
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_16X32
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_32X16
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_32X64
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_64X32
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_4X16
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_16X4
+ {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ },
+ // TX_8X32
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_32X8
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_16X64
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+ // TX_64X16
+ {
+ 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6,
+ },
+};
+
+static AOM_FORCE_INLINE void set_one_param_for_line_luma(
+ AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size,
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+ const struct macroblockd_plane *const plane_ptr, int coord,
+ bool is_first_block, TX_SIZE prev_tx_size, const ptrdiff_t mode_step,
+ int *min_dim) {
+ (void)plane_ptr;  // only referenced by the assert below
+ assert(mi_col << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.width &&
+ mi_row << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.height);
+ const int is_vert = edge_dir == VERT_EDGE;
+ // reset to initial values
+ params->filter_length = 0;
+
+ MB_MODE_INFO **mi =
+ cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
+ const MB_MODE_INFO *mbmi = mi[0];
+ assert(mbmi);
+
+ const TX_SIZE ts =
+ get_transform_size(xd, mi[0], mi_row, mi_col, AOM_PLANE_Y, 0, 0);
+
+#ifndef NDEBUG
+ const uint32_t transform_masks =
+ is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+ const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1);
+ assert(tu_edge);
+#endif // NDEBUG
+ // If we are not the first block, then coord is always true, so
+ // !is_first_block is technically redundant. But we are keeping it here so the
+ // compiler can compile away this conditional if we pass in is_first_block :=
+ // false
+ bool curr_skipped = false;
+ if (!is_first_block || coord) {
+ const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+ assert(mi_prev);  // must hold before mi_prev is dereferenced below
+ const int pv_row = is_vert ? mi_row : (mi_row - 1);
+ const int pv_col = is_vert ? (mi_col - 1) : mi_col;
+ const TX_SIZE pv_ts =
+ is_first_block
+ ? get_transform_size(xd, mi_prev, pv_row, pv_col, AOM_PLANE_Y, 0, 0)
+ : prev_tx_size;
+ if (is_first_block) {  // seed min_dim from the block before the first one on the line
+ *min_dim = is_vert ? block_size_high[mi_prev->bsize]
+ : block_size_wide[mi_prev->bsize];
+ }
+ uint8_t level =
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi);
+ if (!level) {  // fall back to the previous block's level when ours is zero
+ level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y,
+ mi_prev);
+ }
+
+ const int32_t pu_edge = mi_prev != mbmi;  // different mode-info pointers => prediction-block boundary
+
+ // The quad loop filter assumes that all the transform blocks within a
+ // 8x16/16x8/16x16 prediction block are of the same size.
+ assert(IMPLIES(
+ !pu_edge && (mbmi->bsize >= BLOCK_8X16 && mbmi->bsize <= BLOCK_16X16),
+ pv_ts == ts));
+
+ if (!pu_edge) {
+ curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
+ }
+ if ((pu_edge || !curr_skipped) && level) {  // interior TU edges of a skipped inter block are left unfiltered
+ params->filter_length = is_vert ? vert_filter_length_luma[ts][pv_ts]
+ : horz_filter_length_luma[ts][pv_ts];
+
+ // prepare common parameters
+ const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+ params->lfthr = limits;
+ }
+ }
+ const int block_dim =
+ is_vert ? block_size_high[mbmi->bsize] : block_size_wide[mbmi->bsize];
+ *min_dim = AOMMIN(*min_dim, block_dim);  // running minimum of the block extent across the line
+
+ *tx_size = ts;
+}
+
+// Similar to set_lpf_parameters, but does so one row/col at a time to reduce
+// calls to \ref get_transform_size and \ref av1_get_filter_level
+static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma(
+ AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf,
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+ const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range,
+ const ptrdiff_t mode_step, int *min_dim) {
+ const int is_vert = edge_dir == VERT_EDGE;
+
+ AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+ TX_SIZE *tx_size = tx_buf;
+ uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row;  // the local copy of the coordinate that advances along the line
+ TX_SIZE prev_tx_size = TX_INVALID;
+
+ // Unroll the first iteration of the loop (passes is_first_block = true)
+ set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row,
+ plane_ptr, *counter_ptr, true, prev_tx_size,
+ mode_step, min_dim);
+
+ // Advance by one transform unit; only the first slot of each TU in the buffers is written
+ int advance_units =
+ is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+ prev_tx_size = *tx_size;
+ *counter_ptr += advance_units;
+ params += advance_units;
+ tx_size += advance_units;
+
+ while (*counter_ptr < mi_range) {  // remaining TUs up to (not including) mi_range
+ set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col,
+ mi_row, plane_ptr, *counter_ptr, false,
+ prev_tx_size, mode_step, min_dim);
+
+ // Advance
+ advance_units =
+ is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size];
+ prev_tx_size = *tx_size;
+ *counter_ptr += advance_units;
+ params += advance_units;
+ tx_size += advance_units;
+ }
+}
+
+static AOM_FORCE_INLINE void set_one_param_for_line_chroma(
+ AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size,
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, int coord,
+ bool is_first_block, TX_SIZE prev_tx_size,
+ const struct macroblockd_plane *const plane_ptr, const ptrdiff_t mode_step,
+ const int scale_horz, const int scale_vert, int *min_dim, int plane,
+ int joint_filter_chroma) {
+ const int is_vert = edge_dir == VERT_EDGE;
+ (void)plane_ptr;  // only referenced by the assert below
+ assert((mi_col << MI_SIZE_LOG2) <
+ (uint32_t)(plane_ptr->dst.width << scale_horz) &&
+ (mi_row << MI_SIZE_LOG2) <
+ (uint32_t)(plane_ptr->dst.height << scale_vert));
+ // reset to initial values
+ params->filter_length = 0;
+
+ // for sub8x8 block, chroma prediction mode is obtained from the
+ // bottom/right mi structure of the co-located 8x8 luma block. so for chroma
+ // plane, mi_row and mi_col should map to the bottom/right mi structure,
+ // i.e, both mi_row and mi_col should be odd number for chroma plane.
+ mi_row |= scale_vert;
+ mi_col |= scale_horz;
+ MB_MODE_INFO **mi =
+ cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col;
+ const MB_MODE_INFO *mbmi = mi[0];
+ assert(mbmi);
+
+ const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane,
+ scale_horz, scale_vert);
+ *tx_size = ts;  // reported to the caller, which uses it to advance along the line
+
+#ifndef NDEBUG
+ const uint32_t transform_masks =
+ is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+ const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1);
+ assert(tu_edge);
+#endif // NDEBUG
+
+ // If we are not the first block, then coord is always true, so
+ // !is_first_block is technically redundant. But we are keeping it here so the
+ // compiler can compile away this conditional if we pass in is_first_block :=
+ // false
+ bool curr_skipped = false;
+ if (!is_first_block || coord) {
+ const MB_MODE_INFO *const mi_prev = *(mi - mode_step);  // neighbour on the other side of the edge
+ assert(mi_prev);
+ const int pv_row = is_vert ? (mi_row) : (mi_row - (1 << scale_vert));
+ const int pv_col = is_vert ? (mi_col - (1 << scale_horz)) : (mi_col);
+ const TX_SIZE pv_ts =
+ is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, plane,
+ scale_horz, scale_vert)
+ : prev_tx_size;
+ if (is_first_block) {  // seed min_dim from the TU before the first one on the line
+ *min_dim = is_vert ? tx_size_high[pv_ts] : tx_size_wide[pv_ts];
+ }
+
+ uint8_t level =
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+ if (!level) {  // fall back to the previous block's level when ours is zero
+ level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+ }
+#ifndef NDEBUG
+ if (joint_filter_chroma) {  // debug-only check: the V-plane level must equal the level computed for `plane`
+ uint8_t v_level =
+ av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi);
+ if (!v_level) {
+ v_level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V,
+ mi_prev);
+ }
+ assert(level == v_level);
+ }
+#else
+ (void)joint_filter_chroma;
+#endif // NDEBUG
+ const int32_t pu_edge = mi_prev != mbmi;  // different mode-info pointers => prediction-block boundary
+
+ if (!pu_edge) {
+ curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi);
+ }
+ // For realtime mode, u and v have the same level
+ if ((!curr_skipped || pu_edge) && level) {
+ params->filter_length = is_vert ? vert_filter_length_chroma[ts][pv_ts]
+ : horz_filter_length_chroma[ts][pv_ts];
+
+ const loop_filter_thresh *const limits = cm->lf_info.lfthr;
+ params->lfthr = limits + level;  // thresholds for the chosen level
+ }
+ }
+ const int tx_dim = is_vert ? tx_size_high[ts] : tx_size_wide[ts];
+ *min_dim = AOMMIN(*min_dim, tx_dim);  // running minimum of the TU extent across the line
+}
+
+// Computes chroma deblocking parameters for every transform block met while
+// walking one line of mode-info units, starting at (mi_row, mi_col) and
+// stopping once the walking coordinate reaches mi_range.
+//
+// For VERT_EDGE the walk advances mi_col, otherwise mi_row. One entry of
+// params_buf / tx_buf is written per transform block, at an index that grows
+// by the block's size in transform units; *min_dim is updated by each call to
+// set_one_param_for_line_chroma().
+static AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma(
+    AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf,
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row,
+    const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range,
+    const ptrdiff_t mode_step, const int scale_horz, const int scale_vert,
+    int *min_dim, int plane, int joint_filter_chroma) {
+  const int is_vert = (edge_dir == VERT_EDGE);
+
+  // Pick the coordinate that moves and the subsampling shift that goes with it.
+  uint32_t *pos = &mi_row;
+  uint32_t shift = scale_vert;
+  if (is_vert) {
+    pos = &mi_col;
+    shift = scale_horz;
+  }
+
+  AV1_DEBLOCKING_PARAMETERS *out_params = params_buf;
+  TX_SIZE *out_tx = tx_buf;
+
+  // The first block is handled outside the loop so that is_first_block is a
+  // literal at both call sites.
+  set_one_param_for_line_chroma(out_params, out_tx, cm, xd, edge_dir, mi_col,
+                                mi_row, *pos, true, TX_INVALID, plane_ptr,
+                                mode_step, scale_horz, scale_vert, min_dim,
+                                plane, joint_filter_chroma);
+
+  for (;;) {
+    // Step past the block that was just processed.
+    const TX_SIZE last_tx = *out_tx;
+    const int step =
+        is_vert ? tx_size_wide_unit[last_tx] : tx_size_high_unit[last_tx];
+    *pos += step << shift;
+    out_params += step;
+    out_tx += step;
+
+    if (*pos >= mi_range) break;
+
+    set_one_param_for_line_chroma(out_params, out_tx, cm, xd, edge_dir, mi_col,
+                                  mi_row, *pos, false, last_tx, plane_ptr,
+                                  mode_step, scale_horz, scale_vert, min_dim,
+                                  plane, joint_filter_chroma);
+  }
+}
+
+// Applies the vertical-edge loop filter at `dst` for a single plane.
+//
+// The kernel is chosen from two inputs:
+//  - use_filter_type: USE_SINGLE, USE_DUAL or USE_QUAD, selecting the plain,
+//    `_dual` or `_quad` kernel variants;
+//  - params->filter_length: 4, 6, 8 or 14 taps; any other value means no
+//    filtering is performed.
+// params->lfthr is only read inside the case arms, i.e. only when
+// filter_length is one of the values above.
+//
+// When CONFIG_AV1_HIGHBITDEPTH is enabled and seq_params->use_highbitdepth is
+// set, `dst` is converted with CONVERT_TO_SHORTPTR and the aom_highbd_*
+// kernels are used with seq_params->bit_depth.
+static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride,
+ const AV1_DEBLOCKING_PARAMETERS *params,
+ const SequenceHeader *seq_params,
+ USE_FILTER_TYPE use_filter_type) {
+ const loop_filter_thresh *limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int use_highbitdepth = seq_params->use_highbitdepth;
+ const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+ if (use_highbitdepth) {
+ uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
+ if (use_filter_type == USE_QUAD) {
+ // No highbd quad kernel is called here: each case issues two dual calls,
+ // the second one starting 2 * MI_SIZE * dst_stride samples further on.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_vertical_4_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_vertical_4_dual(
+ dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+ limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_vertical_6_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_vertical_6_dual(
+ dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+ limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_vertical_8_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_vertical_8_dual(
+ dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+ limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_vertical_14_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_vertical_14_dual(
+ dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+ limits->mblim, limits->lim, limits->hev_thr, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else if (use_filter_type == USE_DUAL) {
+ // The same thresholds are passed for both halves of the dual kernel.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_vertical_4_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_vertical_6_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_vertical_8_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_vertical_14_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else {
+ assert(use_filter_type == USE_SINGLE);
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ }
+ // High bit-depth handled; skip the 8-bit path below.
+ return;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ if (use_filter_type == USE_QUAD) {
+ // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+ // passed as argument to quad loop filter because quad loop filter is
+ // called for those cases where all the 4 set of loop filter parameters
+ // are equal.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else if (use_filter_type == USE_DUAL) {
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else {
+ assert(use_filter_type == USE_SINGLE);
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ }
+ // seq_params is only consulted by the high bit-depth path above.
+#if !CONFIG_AV1_HIGHBITDEPTH
+ (void)seq_params;
+#endif // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Applies the vertical-edge loop filter to the U and V planes with one shared
+// set of deblocking parameters.
+//
+// u_dst / v_dst point at the samples to filter in each plane and share
+// dst_stride. use_filter_type selects the plain, `_dual` or `_quad` kernel
+// variants. params->filter_length must be 4 or 6 for filtering to happen:
+// 8 and 14 trip an assert (and are ignored in NDEBUG builds), any other value
+// means "no filtering". params->lfthr is only read when a filter is applied.
+//
+// When CONFIG_AV1_HIGHBITDEPTH is enabled and seq_params->use_highbitdepth is
+// set, both pointers are converted with CONVERT_TO_SHORTPTR and the
+// aom_highbd_* kernels are used with seq_params->bit_depth.
+static AOM_INLINE void filter_vert_chroma(
+    uint8_t *u_dst, uint8_t *v_dst, int dst_stride,
+    const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params,
+    USE_FILTER_TYPE use_filter_type) {
+  // Both planes currently use the same thresholds; separate names keep every
+  // call site explicit about which plane it serves.
+  const loop_filter_thresh *u_limits = params->lfthr;
+  const loop_filter_thresh *v_limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst);
+    uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst);
+    if (use_filter_type == USE_QUAD) {
+      // Quad filtering is done as two dual calls per plane, the second one
+      // starting 2 * MI_SIZE * dst_stride samples further on.
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr,
+              u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr,
+              v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth);
+          break;
+        // 8- and 14-tap filters are never valid for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_vertical_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_vertical_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        // 8- and 14-tap filters are never valid for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_vertical_4(u_dst_shortptr, dst_stride, u_limits->mblim,
+                                    u_limits->lim, u_limits->hev_thr,
+                                    bit_depth);
+          aom_highbd_lpf_vertical_4(v_dst_shortptr, dst_stride, v_limits->mblim,
+                                    v_limits->lim, v_limits->hev_thr,
+                                    bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_vertical_6(u_dst_shortptr, dst_stride, u_limits->mblim,
+                                    u_limits->lim, u_limits->hev_thr,
+                                    bit_depth);
+          aom_highbd_lpf_vertical_6(v_dst_shortptr, dst_stride, v_limits->mblim,
+                                    v_limits->lim, v_limits->hev_thr,
+                                    bit_depth);
+          break;
+        // 8- and 14-tap filters are never valid for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    }
+    // High bit-depth handled; skip the 8-bit path below.
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_quad(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr);
+        aom_lpf_vertical_4_quad(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_quad(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr);
+        aom_lpf_vertical_6_quad(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are never valid for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4_dual(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr,
+                                u_limits->mblim, u_limits->lim,
+                                u_limits->hev_thr);
+        aom_lpf_vertical_4_dual(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr,
+                                v_limits->mblim, v_limits->lim,
+                                v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6_dual(u_dst, dst_stride, u_limits->mblim,
+                                u_limits->lim, u_limits->hev_thr,
+                                u_limits->mblim, u_limits->lim,
+                                u_limits->hev_thr);
+        aom_lpf_vertical_6_dual(v_dst, dst_stride, v_limits->mblim,
+                                v_limits->lim, v_limits->hev_thr,
+                                v_limits->mblim, v_limits->lim,
+                                v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are never valid for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_vertical_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                           u_limits->hev_thr);
+        // The V call takes all three thresholds from v_limits.
+        aom_lpf_vertical_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                           v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_vertical_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                           u_limits->hev_thr);
+        aom_lpf_vertical_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                           v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are never valid for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  }
+  // seq_params is only consulted by the high bit-depth path above.
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Filters the vertical edges of one plane inside the unit whose top-left
+// corner is (mi_row, mi_col), one transform block at a time.
+//
+// The unit spans at most MAX_MIB_SIZE mi units per direction (scaled by the
+// plane's subsampling) and is clipped to the plane size derived from
+// cm->mi_params. Each edge gets its parameters from set_lpf_parameters() and
+// is filtered with the USE_SINGLE kernels.
+void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
+  const uint32_t ss_x = plane_ptr->subsampling_x;
+  const uint32_t ss_y = plane_ptr->subsampling_y;
+  const int stride = plane_ptr->dst.stride;
+  uint8_t *const plane_buf = plane_ptr->dst.buf;
+
+  // Extent of the area to filter, in plane mi units.
+  const int rows_in_plane = ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, ss_y);
+  const int cols_in_plane = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, ss_x);
+  const int num_rows = AOMMIN((int)(rows_in_plane - (mi_row >> ss_y)),
+                              (MAX_MIB_SIZE >> ss_y));
+  const int num_cols = AOMMIN((int)(cols_in_plane - (mi_col >> ss_x)),
+                              (MAX_MIB_SIZE >> ss_x));
+
+  // Pixel position of the unit's origin within the plane.
+  const uint32_t origin_x = (mi_col * MI_SIZE) >> ss_x;
+  const uint32_t origin_y = (mi_row * MI_SIZE) >> ss_y;
+  const ptrdiff_t mode_step = (ptrdiff_t)1 << ss_x;
+
+  for (int row = 0; row < num_rows; ++row) {
+    const uint32_t pos_y = origin_y + row * MI_SIZE;
+    uint8_t *dst = plane_buf + row * MI_SIZE * stride;
+
+    // Each pass handles the vertical edge of one transform block and then
+    // jumps to the next block, so 4x4 transforms inside a larger MI block get
+    // their internal edges filtered as well.
+    int col = 0;
+    while (col < num_cols) {
+      const uint32_t pos_x = origin_x + col * MI_SIZE;
+
+      AV1_DEBLOCKING_PARAMETERS lf_params;
+      memset(&lf_params, 0, sizeof(lf_params));
+
+      TX_SIZE ts = set_lpf_parameters(&lf_params, mode_step, cm, xd, VERT_EDGE,
+                                      pos_x, pos_y, plane, plane_ptr);
+      if (ts == TX_INVALID) {
+        // Nothing usable here: skip filtering and step by one 4x4 unit.
+        lf_params.filter_length = 0;
+        ts = TX_4X4;
+      }
+
+      filter_vert(dst, stride, &lf_params, cm->seq_params, USE_SINGLE);
+
+      // Move on to the next transform block in this row.
+      const uint32_t step = tx_size_wide_unit[ts];
+      col += step;
+      dst += step * MI_SIZE;
+    }
+  }
+}
+
+// Filters the vertical edges of a plane for the unit starting at
+// (mi_row, mi_col), with no subsampling applied to the coordinates.
+//
+// The parameters of one mi row are computed up front into params_buf / tx_buf
+// by set_lpf_parameters_for_line_luma(); when the smallest block height
+// reported for that row is large enough, the same parameters are reused to
+// filter two or four rows in a single kernel call.
+void av1_filter_block_plane_vert_opt(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) {
+  const int stride = plane_ptr->dst.stride;
+  uint8_t *const plane_buf = plane_ptr->dst.buf;
+
+  // Plane size in mi units, from the frame dimensions rounded up to MI_SIZE.
+  const int plane_mi_rows =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
+  const int plane_mi_cols =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
+
+  // The height of the unit follows the superblock size in use (needed when
+  // 'pipeline_lpf_mt_with_enc' is enabled) ...
+  const int y_range = AOMMIN((int)(plane_mi_rows - mi_row),
+                             (1 << num_mis_in_lpf_unit_height_log2));
+  // ... whereas its width always follows the maximum superblock size, since
+  // this function is called for mi_col = 0, MAX_MIB_SIZE, 2 * MAX_MIB_SIZE etc.
+  const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
+
+  const ptrdiff_t mode_step = 1;
+  const uint32_t x_start = mi_col;
+  const uint32_t x_end = mi_col + x_range;
+
+  int y = 0;
+  while (y < y_range) {
+    const uint32_t curr_y = mi_row + y;
+    int min_block_height = block_size_high[BLOCK_128X128];
+    set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, VERT_EDGE,
+                                     x_start, curr_y, plane_ptr, x_end,
+                                     mode_step, &min_block_height);
+
+    // Decide how many mi rows a single kernel call may cover.
+    USE_FILTER_TYPE filter_type = USE_SINGLE;
+    int rows_done = 1;
+    if ((y & 3) == 0 && (y + 3) < y_range && min_block_height >= 16) {
+      // On a row that is a multiple of 4 with a minimum height of 16 pixels,
+      // this row and the 3 rows below it are treated as covered by the same
+      // blocks, because a dimension of 16 can only start every 4 mi units.
+      filter_type = USE_QUAD;
+      rows_done = 4;
+    } else if ((y + 1) < y_range && min_block_height >= 8) {
+      filter_type = USE_DUAL;
+      rows_done = 2;
+    }
+
+    AV1_DEBLOCKING_PARAMETERS *cur_params = params_buf;
+    TX_SIZE *cur_tx = tx_buf;
+    uint8_t *dst = plane_buf + y * MI_SIZE * stride;
+
+    int x = 0;
+    while (x < x_range) {
+      if (*cur_tx == TX_INVALID) {
+        // No valid entry: skip filtering and step by one 4x4 unit.
+        cur_params->filter_length = 0;
+        *cur_tx = TX_4X4;
+      }
+
+      filter_vert(dst, stride, cur_params, cm->seq_params, filter_type);
+
+      // Move on to the next transform block in this row.
+      const uint32_t step = tx_size_wide_unit[*cur_tx];
+      x += step;
+      dst += step * MI_SIZE;
+      cur_params += step;
+      cur_tx += step;
+    }
+
+    y += rows_done;
+  }
+}
+
+// Filters the vertical edges of a chroma plane (or of U and V together) for
+// the unit starting at luma position (mi_row, mi_col).
+//
+// The parameters of one chroma mi row are computed into params_buf / tx_buf
+// by set_lpf_parameters_for_line_chroma(); depending on the smallest
+// transform height found, they are reused to filter 1, 2 or 4 rows per kernel
+// call. When joint_filter_chroma is true, plane_ptr[0] and plane_ptr[1] are
+// both filtered with the same parameters and the stride of plane_ptr[0];
+// NOTE(review): presumably plane_ptr then points at the U plane with V
+// directly after it -- confirm against the caller.
+void av1_filter_block_plane_vert_opt_chroma(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+    int num_mis_in_lpf_unit_height_log2) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int mi_cols =
+      ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int mi_rows =
+      ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
+  const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
+  const int y_range =
+      AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+             ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  // Distance, in mode-info entries, to the left neighbour of a chroma block.
+  const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz;
+
+  for (int y = 0; y < y_range; y++) {
+    // Convert the chroma row/column range back to luma mi coordinates.
+    const uint32_t curr_y = mi_row + (y << scale_vert);
+    const uint32_t x_start = mi_col;
+    const uint32_t x_end = mi_col + (x_range << scale_horz);
+    int min_height = tx_size_high[TX_64X64];
+    set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, VERT_EDGE,
+                                       x_start, curr_y, plane_ptr, x_end,
+                                       mode_step, scale_horz, scale_vert,
+                                       &min_height, plane, joint_filter_chroma);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    // Typed as the enum (not int) to match filter_vert()/filter_vert_chroma()
+    // and the luma variant of this function.
+    USE_FILTER_TYPE use_filter_type = USE_SINGLE;
+    // Extra rows covered by this pass; applied to y only after the inner loop
+    // because the destination offset below is derived from y.
+    int y_inc = 0;
+
+    if ((y & 3) == 0 && (y + 3) < y_range && min_height >= 16) {
+      // If we are on a row which is a multiple of 4, and the minimum height is
+      // 16 pixels, then the current and below 3 rows must contain the same tx
+      // block. This is because dim 16 can only happen every unit of 4 mi's.
+      use_filter_type = USE_QUAD;
+      y_inc = 3;
+    } else if (y % 2 == 0 && (y + 1) < y_range && min_height >= 8) {
+      // If we are on an even row, and the minimum height is 8 pixels, then the
+      // current and below rows must contain the same tx block. This is because
+      // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1,
+      // etc.
+      use_filter_type = USE_DUAL;
+      y_inc = 1;
+    }
+
+    for (int x = 0; x < x_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
+      // If 4x4 transform is used, it will then filter the internal edge
+      // aligned with a 4x4 block
+      if (*tx_size == TX_INVALID) {
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
+      }
+
+      const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE;
+      if (joint_filter_chroma) {
+        uint8_t *u_dst = plane_ptr[0].dst.buf + offset;
+        uint8_t *v_dst = plane_ptr[1].dst.buf + offset;
+        filter_vert_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params,
+                           use_filter_type);
+      } else {
+        uint8_t *dst_ptr = plane_ptr->dst.buf + offset;
+        filter_vert(dst_ptr, dst_stride, params, cm->seq_params,
+                    use_filter_type);
+      }
+
+      // advance to the next transform block in this row
+      const uint32_t advance_units = tx_size_wide_unit[*tx_size];
+      x += advance_units;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+    y += y_inc;
+  }
+}
+
+// Applies the horizontal-edge loop filter at `dst` for a single plane.
+//
+// The kernel is chosen from two inputs:
+//  - use_filter_type: USE_SINGLE, USE_DUAL or USE_QUAD, selecting the plain,
+//    `_dual` or `_quad` kernel variants;
+//  - params->filter_length: 4, 6, 8 or 14 taps; any other value means no
+//    filtering is performed.
+// params->lfthr is only read inside the case arms, i.e. only when
+// filter_length is one of the values above.
+//
+// When CONFIG_AV1_HIGHBITDEPTH is enabled and seq_params->use_highbitdepth is
+// set, `dst` is converted with CONVERT_TO_SHORTPTR and the aom_highbd_*
+// kernels are used with seq_params->bit_depth.
+static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride,
+ const AV1_DEBLOCKING_PARAMETERS *params,
+ const SequenceHeader *seq_params,
+ USE_FILTER_TYPE use_filter_type) {
+ const loop_filter_thresh *limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const int use_highbitdepth = seq_params->use_highbitdepth;
+ const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+ if (use_highbitdepth) {
+ uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst);
+ if (use_filter_type == USE_QUAD) {
+ // No highbd quad kernel is called here: each case issues two dual calls,
+ // the second one starting 2 * MI_SIZE samples further along the row.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_horizontal_4_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_horizontal_4_dual(
+ dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr, bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_horizontal_6_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_horizontal_6_dual(
+ dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr, bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_horizontal_8_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_horizontal_8_dual(
+ dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr, bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_horizontal_14_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ aom_highbd_lpf_horizontal_14_dual(
+ dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr, bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else if (use_filter_type == USE_DUAL) {
+ // The same thresholds are passed for both halves of the dual kernel.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_horizontal_4_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_horizontal_6_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_horizontal_8_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_horizontal_14_dual(
+ dst_shortptr, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr,
+ bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else {
+ assert(use_filter_type == USE_SINGLE);
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim,
+ limits->lim, limits->hev_thr, bit_depth);
+ break;
+ // no filtering
+ default: break;
+ }
+ }
+ // High bit-depth handled; skip the 8-bit path below.
+ return;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ if (use_filter_type == USE_QUAD) {
+ // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+ // passed as argument to quad loop filter because quad loop filter is
+ // called for those cases where all the 4 set of loop filter parameters
+ // are equal.
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else if (use_filter_type == USE_DUAL) {
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ } else {
+ assert(use_filter_type == USE_SINGLE);
+ switch (params->filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim,
+ limits->hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ }
+ // seq_params is only consulted by the high bit-depth path above.
+#if !CONFIG_AV1_HIGHBITDEPTH
+ (void)seq_params;
+#endif // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Filters one horizontal edge in the U and V planes together.
+//
+// Both planes use the same threshold set (params->lfthr) and the same
+// dst_stride. params->filter_length picks the kernel: 4 and 6 taps are
+// filtered, 8 and 14 trip an assert because they are not expected for chroma,
+// and any other value leaves the samples untouched. use_filter_type picks the
+// width handled per call (USE_SINGLE, USE_DUAL or USE_QUAD); the dual and quad
+// variants reuse one threshold set for every unit they cover.
+//
+// When CONFIG_AV1_HIGHBITDEPTH is enabled and seq_params->use_highbitdepth is
+// set, the destination pointers are converted with CONVERT_TO_SHORTPTR and the
+// aom_highbd_* kernels are used; otherwise the 8-bit kernels run.
+static AOM_INLINE void filter_horz_chroma(
+    uint8_t *u_dst, uint8_t *v_dst, int dst_stride,
+    const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params,
+    USE_FILTER_TYPE use_filter_type) {
+  // U and V share the same thresholds; two names are kept so each call site
+  // reads as belonging to one plane.
+  const loop_filter_thresh *u_limits = params->lfthr;
+  const loop_filter_thresh *v_limits = params->lfthr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  const int use_highbitdepth = seq_params->use_highbitdepth;
+  const aom_bit_depth_t bit_depth = seq_params->bit_depth;
+  if (use_highbitdepth) {
+    uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst);
+    uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst);
+    if (use_filter_type == USE_QUAD) {
+      // There is no high-bitdepth quad kernel used here: each plane is covered
+      // by two dual calls, the second one 2 * MI_SIZE samples to the right.
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim,
+              u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim,
+              v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim,
+              u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim,
+              v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        // 8- and 14-tap filters are not expected for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    } else if (use_filter_type == USE_DUAL) {
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6_dual(
+              u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, u_limits->mblim, u_limits->lim,
+              u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6_dual(
+              v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, v_limits->mblim, v_limits->lim,
+              v_limits->hev_thr, bit_depth);
+          break;
+        // 8- and 14-tap filters are not expected for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    } else {
+      assert(use_filter_type == USE_SINGLE);
+      switch (params->filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_highbd_lpf_horizontal_4(u_dst_shortptr, dst_stride,
+                                      u_limits->mblim, u_limits->lim,
+                                      u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_4(v_dst_shortptr, dst_stride,
+                                      v_limits->mblim, v_limits->lim,
+                                      v_limits->hev_thr, bit_depth);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          aom_highbd_lpf_horizontal_6(u_dst_shortptr, dst_stride,
+                                      u_limits->mblim, u_limits->lim,
+                                      u_limits->hev_thr, bit_depth);
+          aom_highbd_lpf_horizontal_6(v_dst_shortptr, dst_stride,
+                                      v_limits->mblim, v_limits->lim,
+                                      v_limits->hev_thr, bit_depth);
+          break;
+        // 8- and 14-tap filters are not expected for chroma
+        case 8:
+        case 14: assert(0); break;
+        // no filtering
+        default: break;
+      }
+    }
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  if (use_filter_type == USE_QUAD) {
+    // Only one set of loop filter parameters (mblim, lim and hev_thr) is
+    // passed as argument to quad loop filter because quad loop filter is
+    // called for those cases where all the 4 set of loop filter parameters
+    // are equal.
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_quad(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr);
+        aom_lpf_horizontal_4_quad(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_quad(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr);
+        aom_lpf_horizontal_6_quad(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are not expected for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  } else if (use_filter_type == USE_DUAL) {
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4_dual(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr,
+                                  u_limits->mblim, u_limits->lim,
+                                  u_limits->hev_thr);
+        aom_lpf_horizontal_4_dual(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr,
+                                  v_limits->mblim, v_limits->lim,
+                                  v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6_dual(u_dst, dst_stride, u_limits->mblim,
+                                  u_limits->lim, u_limits->hev_thr,
+                                  u_limits->mblim, u_limits->lim,
+                                  u_limits->hev_thr);
+        aom_lpf_horizontal_6_dual(v_dst, dst_stride, v_limits->mblim,
+                                  v_limits->lim, v_limits->hev_thr,
+                                  v_limits->mblim, v_limits->lim,
+                                  v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are not expected for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  } else {
+    assert(use_filter_type == USE_SINGLE);
+    switch (params->filter_length) {
+      // apply 4-tap filtering
+      case 4:
+        aom_lpf_horizontal_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                             u_limits->hev_thr);
+        // The V call takes all three thresholds from v_limits, like every
+        // other V call in this function.
+        aom_lpf_horizontal_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                             v_limits->hev_thr);
+        break;
+      case 6:  // apply 6-tap filter for chroma plane only
+        aom_lpf_horizontal_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim,
+                             u_limits->hev_thr);
+        aom_lpf_horizontal_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim,
+                             v_limits->hev_thr);
+        break;
+      // 8- and 14-tap filters are not expected for chroma
+      case 8:
+      case 14: assert(0); break;
+      // no filtering
+      default: break;
+    }
+  }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  (void)seq_params;
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Deblocks the horizontal edges of one plane inside the region that starts at
+// (mi_row, mi_col), one MI-wide column at a time with the single-unit filter.
+//
+// The region is at most MAX_MIB_SIZE MI units in each direction (scaled by the
+// plane's subsampling) and is clipped to the plane size derived from
+// cm->mi_params. For every step down a column, set_lpf_parameters() supplies
+// the filter parameters and the transform size; the transform height decides
+// how far to advance to the next edge.
+void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, const int plane,
+                                 const MACROBLOCKD_PLANE *const plane_ptr,
+                                 const uint32_t mi_row, const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  const int dst_stride = plane_ptr->dst.stride;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+
+  // Number of MI columns / rows to visit, clipped to the plane.
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+
+  // Sample coordinates of the region origin within this plane.
+  const uint32_t base_x = (mi_col * MI_SIZE) >> scale_horz;
+  const uint32_t base_y = (mi_row * MI_SIZE) >> scale_vert;
+
+  for (int col = 0; col < x_range; ++col) {
+    const uint32_t curr_x = base_x + col * MI_SIZE;
+    uint8_t *edge_ptr = dst_ptr + col * MI_SIZE;
+    int row = 0;
+    while (row < y_range) {
+      const uint32_t curr_y = base_y + row * MI_SIZE;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      TX_SIZE tx_size = set_lpf_parameters(
+          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+          curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        // Nothing to filter here; step over one 4x4 unit.
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      filter_horz(edge_ptr, dst_stride, &params, cm->seq_params, USE_SINGLE);
+
+      // Move down by the height of the transform block just handled.
+      const uint32_t advance_units = tx_size_high_unit[tx_size];
+      row += advance_units;
+      edge_ptr += advance_units * dst_stride * MI_SIZE;
+    }
+  }
+}
+
+// Deblocks the horizontal edges of the luma plane inside the region that
+// starts at (mi_row, mi_col), widening the filter to two or four MI columns
+// per call where the block layout allows it.
+//
+// The region is MAX_MIB_SIZE MI columns wide and
+// (1 << num_mis_in_lpf_unit_height_log2) MI rows tall, clipped to the plane
+// dimensions. For each starting column, set_lpf_parameters_for_line_luma()
+// fills params_buf / tx_buf for the whole column and reports the minimum
+// block width seen; that width and the column alignment select USE_QUAD,
+// USE_DUAL or USE_SINGLE. params_buf and tx_buf are caller-provided scratch
+// buffers that are overwritten for every column.
+void av1_filter_block_plane_horz_opt(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) {
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned
+  // to MI_SIZE.
+  const int plane_mi_cols =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2);
+  const int plane_mi_rows =
+      CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2);
+  const int y_range = AOMMIN((int)(plane_mi_rows - mi_row),
+                             (1 << num_mis_in_lpf_unit_height_log2));
+  const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE);
+
+  const ptrdiff_t mode_step = cm->mi_params.mi_stride;
+  for (int x = 0; x < x_range; x++) {
+    const uint32_t curr_x = mi_col + x;
+    const uint32_t y_start = mi_row;
+    const uint32_t y_end = mi_row + y_range;
+    int min_block_width = block_size_high[BLOCK_128X128];
+    set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, HORZ_EDGE,
+                                     curr_x, y_start, plane_ptr, y_end,
+                                     mode_step, &min_block_width);
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    USE_FILTER_TYPE filter_type = USE_SINGLE;
+    // Extra columns covered by the widened filter; skipped after this column.
+    int x_inc = 0;
+
+    uint8_t *p = dst_ptr + x * MI_SIZE;
+
+    if ((x & 3) == 0 && (x + 3) < x_range && min_block_width >= 16) {
+      // If we are on a col which is a multiple of 4, and the minimum width is
+      // 16 pixels, then the current and right 3 cols must contain the same
+      // prediction block. This is because dim 16 can only happen every unit of
+      // 4 mi's.
+      filter_type = USE_QUAD;
+      x_inc = 3;
+    } else if ((x & 1) == 0 && (x + 1) < x_range && min_block_width >= 8) {
+      // If we are on an even col, and the minimum width is 8 pixels, then the
+      // current and right cols must contain the same prediction block. The
+      // even-column check matters: after a single-column step the loop can sit
+      // on an odd column, and pairing it with the next column would apply this
+      // column's parameters across two different 8-pixel-aligned units.
+      filter_type = USE_DUAL;
+      x_inc = 1;
+    }
+
+    for (int y = 0; y < y_range;) {
+      if (*tx_size == TX_INVALID) {
+        // Nothing to filter here; step over one 4x4 unit.
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
+      }
+
+      filter_horz(p, dst_stride, params, cm->seq_params, filter_type);
+
+      // advance the destination pointer
+      const uint32_t advance_units = tx_size_high_unit[*tx_size];
+      y += advance_units;
+      p += advance_units * dst_stride * MI_SIZE;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+    x += x_inc;
+  }
+}
+
+// Deblocks the horizontal edges of a chroma plane (or of both chroma planes
+// when joint_filter_chroma is set) inside the region that starts at
+// (mi_row, mi_col), widening the filter to two or four columns per call where
+// the transform layout allows it.
+//
+// For each starting column, set_lpf_parameters_for_line_chroma() fills
+// params_buf / tx_buf for the whole column and reports the minimum width seen.
+// With joint_filter_chroma, plane_ptr[0] and plane_ptr[1] are filtered
+// together with the same parameters; otherwise only plane_ptr is filtered.
+void av1_filter_block_plane_horz_opt_chroma(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+    const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+    TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+    int num_mis_in_lpf_unit_height_log2) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  const int dst_stride = plane_ptr->dst.stride;
+
+  // Frame size in MI units, derived from the plane dimensions rounded up to a
+  // multiple of MI_SIZE, then scaled back down to this plane.
+  const int mi_cols =
+      ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int mi_rows =
+      ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2;
+  const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz);
+  const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert);
+
+  // Extent of the region in plane MI units, clipped to the plane.
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  const int y_range =
+      AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+             ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert));
+
+  const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert;
+  const uint32_t y_start = mi_row;
+  const uint32_t y_end = mi_row + (y_range << scale_vert);
+
+  int x = 0;
+  while (x < x_range) {
+    const uint32_t curr_x = mi_col + (x << scale_horz);
+    int min_width = tx_size_wide[TX_64X64];
+    set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, HORZ_EDGE,
+                                       curr_x, y_start, plane_ptr, y_end,
+                                       mode_step, scale_horz, scale_vert,
+                                       &min_width, plane, joint_filter_chroma);
+
+    USE_FILTER_TYPE use_filter_type;
+    int cols_covered;
+    if ((x & 3) == 0 && (x + 3) < x_range && min_width >= 16) {
+      // On a column that is a multiple of 4 with a minimum width of 16
+      // pixels, this column and the 3 to its right contain the same tx block,
+      // because dim 16 can only happen every unit of 4 mi's.
+      use_filter_type = USE_QUAD;
+      cols_covered = 4;
+    } else if ((x & 1) == 0 && (x + 1) < x_range && min_width >= 8) {
+      // On an even column with a minimum width of 8 pixels, this column and
+      // the one to its right contain the same tx block, because dim 8 can
+      // only happen every unit of 2 mi's.
+      use_filter_type = USE_DUAL;
+      cols_covered = 2;
+    } else {
+      use_filter_type = USE_SINGLE;
+      cols_covered = 1;
+    }
+
+    AV1_DEBLOCKING_PARAMETERS *params = params_buf;
+    TX_SIZE *tx_size = tx_buf;
+    int y = 0;
+    while (y < y_range) {
+      // Each step filters the horizontal edge at row y of this column and
+      // then moves down by the height of the transform block found there.
+      if (*tx_size == TX_INVALID) {
+        // Nothing to filter here; step over one 4x4 unit.
+        params->filter_length = 0;
+        *tx_size = TX_4X4;
+      }
+
+      const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE;
+      if (!joint_filter_chroma) {
+        uint8_t *dst_ptr = plane_ptr->dst.buf + offset;
+        filter_horz(dst_ptr, dst_stride, params, cm->seq_params,
+                    use_filter_type);
+      } else {
+        uint8_t *u_dst = plane_ptr[0].dst.buf + offset;
+        uint8_t *v_dst = plane_ptr[1].dst.buf + offset;
+        filter_horz_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params,
+                           use_filter_type);
+      }
+
+      // Advance to the next edge in this column.
+      const int advance_units = tx_size_high_unit[*tx_size];
+      y += advance_units;
+      params += advance_units;
+      tx_size += advance_units;
+    }
+    x += cols_covered;
+  }
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
new file mode 100644
index 0000000000..c9880cf5da
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/internal/aom_codec_internal.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// MAX_LOOP_FILTER sizes the threshold table in loop_filter_info_n
+// (MAX_LOOP_FILTER + 1 entries).
+// NOTE(review): MAX_SHARPNESS is not referenced in this header; presumably the
+// largest allowed sharpness_level -- confirm against its users.
+#define MAX_LOOP_FILTER 63
+#define MAX_SHARPNESS 7
+
+// Alignment and length, in bytes, of each threshold array in
+// loop_filter_thresh.
+#define SIMD_WIDTH 16
+
+// NOTE(review): not referenced elsewhere in this header. The names suggest a
+// filtering path chosen by chroma format (4:2:0, 4:4:4, or a generic slow
+// path) -- confirm against its users.
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
+
+/*!\cond */
+// Direction of the edge being deblocked, declared as type EDGE_DIR through
+// UENUM1BYTE. NUM_EDGE_DIRS is the number of directions (2).
+enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
+// Four 64-bit words of mask bits.
+// NOTE(review): not used anywhere else in this header -- confirm it still has
+// users.
+typedef struct {
+ uint64_t bits[4];
+} FilterMask;
+
+// Loop filter settings.
+// NOTE(review): the per-field notes below are inferred from the field names;
+// confirm against the code that reads and writes this struct.
+struct loopfilter {
+ // Filter levels: two entries for filter_level (presumably one per edge
+ // direction of the luma plane -- TODO confirm), plus one per chroma plane.
+ int filter_level[2];
+ int filter_level_u;
+ int filter_level_v;
+
+ int sharpness_level;
+
+ // Flags for the reference/mode delta mechanism (presumably "deltas are in
+ // use" and "deltas are being updated").
+ uint8_t mode_ref_delta_enabled;
+ uint8_t mode_ref_delta_update;
+
+ // 0 = Intra, Last, Last2+Last3,
+ // GF, BRF, ARF2, ARF
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+};
+
+// Need to align this structure so when it is declared and
+// passed it can be loaded into vector registers.
+//
+// One set of deblocking thresholds. Each member is SIMD_WIDTH bytes long and
+// aligned to SIMD_WIDTH; the filter kernels receive the three arrays as
+// separate pointer arguments (mblim, lim, hev_thr).
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+// Threshold and level tables for the loop filter.
+typedef struct {
+ // One threshold set for each of the MAX_LOOP_FILTER + 1 entries (presumably
+ // indexed by filter level -- TODO confirm).
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
+ // Filter level table indexed by plane, segment, a two-entry dimension
+ // (presumably the edge direction, cf. dir_idx in av1_get_filter_level),
+ // reference frame and mode delta.
+ uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+// Parameters for filtering a single edge.
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+ // length of the filter applied to the outer edge
+ // (the filter dispatchers act on 4, 6, 8 and 14; 0 means "do not filter")
+ uint8_t filter_length;
+ // deblocking limits
+ // (points at one loop_filter_thresh; not owned by this struct)
+ const loop_filter_thresh *lfthr;
+} AV1_DEBLOCKING_PARAMETERS;
+
+// Per-worker state for loop filtering.
+// NOTE(review): field roles below are inferred from names and types; confirm
+// against the threading code that fills this struct.
+typedef struct LoopFilterWorkerData {
+ YV12_BUFFER_CONFIG *frame_buffer;
+ struct AV1Common *cm;
+ struct macroblockd_plane planes[MAX_MB_PLANE];
+ // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+ // add lossless as a member here.
+ MACROBLOCKD *xd;
+
+ // Scratch buffers of MAX_MIB_SIZE entries each; presumably the params_buf /
+ // tx_buf arguments of the av1_filter_block_plane_*_opt functions.
+ AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
+ TX_SIZE tx_buf[MAX_MIB_SIZE];
+ struct aom_internal_error_info error_info;
+} LFWorkerData;
+/*!\endcond */
+
+/* assorted loopfilter functions which get used elsewhere */
+struct AV1Common;
+struct macroblockd;
+struct AV1LfSyncData;
+
+// NOTE(review): the two init functions are only declared here; presumably
+// one-time and per-frame setup of the loop filter tables -- confirm in the
+// implementation.
+void av1_loop_filter_init(struct AV1Common *cm);
+
+void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
+ int plane_end);
+
+// Filters the edges of one plane in the region starting at (mi_row, mi_col),
+// one MI unit at a time. The _vert variant is presumably the vertical-edge
+// counterpart of the _horz variant.
+void av1_filter_block_plane_vert(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+void av1_filter_block_plane_horz(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+// The _opt variants take caller-provided scratch buffers (params_buf, tx_buf)
+// and the log2 height of the filter unit in MI rows. The plain variants
+// handle luma; the _chroma variants additionally take the plane index and
+// joint_filter_chroma, which makes them filter plane_ptr[0] and plane_ptr[1]
+// together.
+void av1_filter_block_plane_vert_opt(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2);
+
+void av1_filter_block_plane_vert_opt_chroma(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2);
+
+void av1_filter_block_plane_horz_opt(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2);
+
+void av1_filter_block_plane_horz_opt_chroma(
+ const struct AV1Common *const cm, const MACROBLOCKD *const xd,
+ const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row,
+ const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf,
+ TX_SIZE *tx_buf, int plane, bool joint_filter_chroma,
+ int num_mis_in_lpf_unit_height_log2);
+
+// Returns the filter level for a block (mbmi) in the given plane and
+// direction index, looked up through lfi_n.
+// NOTE(review): summary inferred from the signature -- confirm in the
+// implementation.
+uint8_t av1_get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
new file mode 100644
index 0000000000..8a35dca369
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+// Runs setup_rtcd_internal through aom_once(), so repeated calls to
+// av1_rtcd() are funnelled through a single setup entry point.
+void av1_rtcd(void) {
+  aom_once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
new file mode 100644
index 0000000000..c5fe389ba1
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -0,0 +1,655 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+# Prints the C preamble for the generated header: the includes, forward
+# declarations and CfL function-pointer typedefs that the prototypes declared
+# below refer to. Everything between "print <<EOF" and "EOF" is emitted
+# verbatim. The sub is registered with forward_decls right after its
+# definition.
+sub av1_common_forward_decls() {
+print <<EOF
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/common.h"
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+struct NN_CONFIG;
+typedef struct NN_CONFIG NN_CONFIG;
+
+enum { NONE, RELU, SOFTSIGN, SIGMOID } UENUM1BYTE(ACTIVATION);
+#if CONFIG_NN_V2
+enum { SOFTMAX_CROSS_ENTROPY } UENUM1BYTE(LOSS);
+struct NN_CONFIG_V2;
+typedef struct NN_CONFIG_V2 NN_CONFIG_V2;
+struct FC_LAYER;
+typedef struct FC_LAYER FC_LAYER;
+#endif // CONFIG_NN_V2
+
+struct CNN_CONFIG;
+typedef struct CNN_CONFIG CNN_CONFIG;
+struct CNN_LAYER_CONFIG;
+typedef struct CNN_LAYER_CONFIG CNN_LAYER_CONFIG;
+struct CNN_THREAD_DATA;
+typedef struct CNN_THREAD_DATA CNN_THREAD_DATA;
+struct CNN_BRANCH_CONFIG;
+typedef struct CNN_BRANCH_CONFIG CNN_BRANCH_CONFIG;
+struct CNN_MULTI_OUT;
+typedef struct CNN_MULTI_OUT CNN_MULTI_OUT;
+
+/* Function pointers return by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+#endif
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+EOF
+}
+forward_decls qw/av1_common_forward_decls/;
+
+# Extension names for functions that are specialized on 64-bit x86 only.
+# Each variable holds the extension name when the target arch is x86_64 and an
+# empty string otherwise, so it can be passed straight to specialize.
+($mmx_x86_64, $sse2_x86_64, $ssse3_x86_64, $avx_x86_64, $avx2_x86_64) =
+  ($opts{arch} eq "x86_64")
+    ? ('mmx', 'sse2', 'ssse3', 'avx', 'avx2')
+    : ('', '', '', '', '');
+
+# Horizontal "rs" convolve and Wiener convolve prototypes. The high-bitdepth
+# variants are only declared when CONFIG_AV1_HIGHBITDEPTH is "yes".
+add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
+specialize qw/av1_convolve_horiz_rs sse4_1/;
+
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+ specialize qw/av1_highbd_convolve_horiz_rs sse4_1 neon/;
+
+ add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd";
+ specialize qw/av1_highbd_wiener_convolve_add_src ssse3 avx2 neon/;
+}
+
+add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params";
+specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
+
+# directional intra predictor functions
+# (z1 takes only upsample_above, z3 only upsample_left, z2 takes both)
+add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+specialize qw/av1_dr_prediction_z1 sse4_1 avx2 neon/;
+add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z2 sse4_1 avx2 neon/;
+add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z3 sse4_1 avx2 neon/;
+
+# FILTER_INTRA predictor functions
+add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+specialize qw/av1_filter_intra_predictor sse4_1 neon/;
+
+# High bitdepth functions
+
+#
+# Sub Pixel Filters
+#
+# Only declared when CONFIG_AV1_HIGHBITDEPTH is "yes". The convolve8 variants
+# are specialized through $sse2_x86_64, which is 'sse2' on x86_64 and empty on
+# every other arch; copy and avg get no specialization.
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+ add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+ add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+}
+
+#inv txfm
+# Inverse transform + add prototypes: the generic entry points first, then one
+# high-bitdepth prototype per transform size.
+add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2 neon/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x4 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x16 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x4 sse4_1 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x32 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x16 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64 neon/;
+# NOTE(review): the four specialize lines below name 32x32 / 32x64 / 64x32 /
+# 64x64 (already specialized above) instead of the 8x32 / 32x8 / 16x64 / 64x16
+# prototypes they follow, so those four prototypes get no specialization here.
+# This looks like a copy-paste slip; confirm that the matching neon functions
+# exist before renaming.
+add_proto qw/void av1_highbd_inv_txfm_add_8x32/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x8/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x64 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x64/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x32 neon/;
+add_proto qw/void av1_highbd_inv_txfm_add_64x16/, "const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_64x64 neon/;
+
+# 2D inverse transform + add, one prototype per transform size, each with a
+# neon specialization.
+# NOTE(review): av1_inv_txfm2d_add_4x8, 8x4, 8x16, 16x8, 16x32, 32x16 and 4x4
+# are declared again further down in this file with a different signature
+# (const int32_t *input, uint16_t *output, ...). Confirm which prototype is
+# meant to take effect.
+add_proto qw/void av1_inv_txfm2d_add_4x4/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x8/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x4/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_4x16/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_4x16 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x4/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x4 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x16/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x16 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x8/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x32/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x32 neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x16/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x16 neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x32/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x32 neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x64/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x64 neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x32/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x32 neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x64/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x64 neon/;
+add_proto qw/void av1_inv_txfm2d_add_8x32/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_8x32 neon/;
+add_proto qw/void av1_inv_txfm2d_add_32x8/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_32x8 neon/;
+add_proto qw/void av1_inv_txfm2d_add_16x64/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_16x64 neon/;
+add_proto qw/void av1_inv_txfm2d_add_64x16/, "const tran_low_t *input, uint8_t *dest, int stride, TX_TYPE tx_type, const int bd";
+specialize qw/av1_inv_txfm2d_add_64x16 neon/;
+
+add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/av1_highbd_iwht4x4_16_add sse4_1/;
+
+add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ # High bit-depth directional intra predictors (z1, z2 and z3 variants); declared only when CONFIG_AV1_HIGHBITDEPTH is "yes".
+ add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/;
+ add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/;
+ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+ specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/;
+}
+
+# build compound seg mask functions
+add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
+specialize qw/av1_build_compound_diffwtd_mask neon sse4_1 avx2/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
+ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2 neon/;
+}
+
+add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
+
+# Helper functions (int32_t array round-shift), with sse4_1 and neon specializations.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize qw/av1_round_shift_array sse4_1 neon/;
+
+# Resize functions.
+add_proto qw/void av1_resize_and_extend_frame/, "const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes";
+specialize qw/av1_resize_and_extend_frame ssse3 neon/;
+
+#
+# Encoder functions below this point.
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ # ENCODEMB INVOKE
+ add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
+ specialize qw/aom_upsampled_pred neon sse2/;
+ #
+ #
+ #
+ add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search";
+ specialize qw/aom_comp_avg_upsampled_pred sse2 neon/;
+
+ add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+ specialize qw/aom_highbd_upsampled_pred sse2 neon/;
+
+ add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+ specialize qw/aom_highbd_comp_avg_upsampled_pred sse2 neon/;
+
+ add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2 neon/;
+ }
+
+ # the transform coefficients are held in 32-bit
+ # values, so the assembler code for av1_block_error can no longer be used.
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/av1_block_error sse2 avx2 neon sve/;
+
+ add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
+ specialize qw/av1_block_error_lp sse2 avx2 neon sve/;
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp sse2 avx2 neon/;
+
+ add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_lp sse2 avx2 neon/;
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_32x32 neon avx2/;
+
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_64x64 neon avx2/;
+
+ add_proto qw/void aom_quantize_b_helper/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale";
+ specialize qw/aom_quantize_b_helper neon/;
+
+ # fdct functions
+
+ add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/av1_fwht4x4 sse4_1 neon/;
+
+ #fwd txfm
+ add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+ specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/;
+
+ add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x4 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2 neon/;
+ add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2 neon/;
+ add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x32 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x16 sse4_1 neon/;
+
+ add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x4 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2 neon/;
+ add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2 neon/;
+ add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2 neon/;
+
+ add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2 neon/;
+ add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x64 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x32 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x16 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x32 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x8 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x64 sse4_1 neon/;
+ add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x16 sse4_1 neon/;
+ }
+ #
+ # Temporal filtering and noise estimation
+ #
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ specialize qw/av1_apply_temporal_filter sse2 avx2 neon neon_dotprod/;
+
+ add_proto qw/double av1_estimate_noise_from_single_plane/, "const uint8_t *src, int height, int width, int stride, int edge_thresh";
+ specialize qw/av1_estimate_noise_from_single_plane avx2 neon/;
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_apply_temporal_filter/, "const struct yv12_buffer_config *frame_to_filter, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+ specialize qw/av1_highbd_apply_temporal_filter sse2 avx2 neon/;
+
+ add_proto qw/double av1_highbd_estimate_noise_from_single_plane/, "const uint16_t *src, int height, int width, int stride, int bit_depth, int edge_thresh";
+ specialize qw/av1_highbd_estimate_noise_from_single_plane neon/;
+ }
+ }
+
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+ add_proto qw/void av1_calc_indices_dim1/, "const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k";
+ specialize qw/av1_calc_indices_dim1 sse2 avx2 neon/;
+
+ add_proto qw/void av1_calc_indices_dim2/, "const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k";
+ specialize qw/av1_calc_indices_dim2 sse2 avx2 neon/;
+
+ # ENCODEMB INVOKE
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/av1_highbd_block_error sse2 avx2 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1 avx2 neon/;
+ }
+
+ # End av1_high encoder functions
+
+ # txb
+ add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
+ specialize qw/av1_get_nz_map_contexts sse2 neon/;
+ add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
+ specialize qw/av1_txb_init_levels sse4_1 avx2 neon/;
+
+ add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/;
+ add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon/;
+ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2 neon/;
+
+ # hash
+ add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
+ specialize qw/av1_get_crc32c_value sse4_2 arm_crc32/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
+ specialize qw/av1_compute_stats sse4_1 avx2 neon/;
+ add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params sse4_1 avx2 neon/;
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_calc_proj_params_high_bd/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
+ specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/;
+ add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
+ add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+ specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
+ }
+ }
+
+ add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
+ specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
+
+ add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
+
+ add_proto qw/void av1_nn_fast_softmax_16/, "const float *input_nodes, float *output";
+ if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
+ specialize qw/av1_nn_predict sse3 avx2 neon/;
+ specialize qw/av1_nn_fast_softmax_16 sse3/;
+ }
+
+ # CNN functions
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void av1_cnn_activate/, "float **input, int channels, int width, int height, int stride, ACTIVATION layer_activation";
+ add_proto qw/void av1_cnn_add/, "float **input, int channels, int width, int height, int stride, const float **add";
+ add_proto qw/bool av1_cnn_predict/, "const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct";
+ add_proto qw/void av1_cnn_convolve_no_maxpool_padding_valid/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step";
+ if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
+ specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/;
+ }
+ specialize qw/av1_cnn_convolve_no_maxpool_padding_valid neon/;
+ add_proto qw/void av1_cnn_deconvolve/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride";
+ add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std";
+ }
+
+ # Temporal Denoiser
+ if (aom_config("CONFIG_AV1_TEMPORAL_DENOISING") eq "yes") {
+ add_proto qw/int av1_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
+ specialize qw/av1_denoiser_filter neon sse2/;
+ }
+}
+# end encoder functions
+
+
+# CDEF (deringing) functions
+
+add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
+add_proto qw/void cdef_find_dir_dual/, "const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2";
+
+# 8 bit dst
+add_proto qw/void cdef_filter_8_0/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_1/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_2/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_8_3/, "void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+# 16 bit dst
+add_proto qw/void cdef_filter_16_0/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_1/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_2/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+add_proto qw/void cdef_filter_16_3/, "void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height";
+
+add_proto qw/void cdef_copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height";
+add_proto qw/void cdef_copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height";
+
+# Visual Studio builds for 32-bit targets do not support vector types in
+# structs as arguments, which makes the v256 type of the intrinsics
+# hard to support, so optimizations for this target are disabled.
+if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+ specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_find_dir_dual sse2 ssse3 sse4_1 avx2 neon/;
+
+ specialize qw/cdef_filter_8_0 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_8_1 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_8_2 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_8_3 sse2 ssse3 sse4_1 avx2 neon/;
+
+ specialize qw/cdef_filter_16_0 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_16_1 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_16_2 sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_16_3 sse2 ssse3 sse4_1 avx2 neon/;
+
+ specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+}
+
+# WARPED_MOTION / GLOBAL_MOTION functions
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+ specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/;
+}
+
+add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
+
+# LOOP_RESTORATION functions
+add_proto qw/int av1_apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/av1_apply_selfguided_restoration sse4_1 avx2 neon/;
+
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd";
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
+
+# CONVOLVE_ROUND/COMPOUND_ROUND functions
+
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
+add_proto qw/void av1_convolve_y_sr_intrabc/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn";
+add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params";
+if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_2d_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_x_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
+ add_proto qw/void av1_highbd_convolve_y_sr_intrabc/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd";
+}
+
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params";
+
+ specialize qw/av1_convolve_2d_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_convolve_2d_sr_intrabc neon/;
+ specialize qw/av1_convolve_x_sr sse2 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_convolve_x_sr_intrabc neon/;
+ specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_y_sr_intrabc neon/;
+ specialize qw/av1_convolve_2d_scale sse4_1/;
+ specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/;
+ specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
+ if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/;
+ specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon/;
+ specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 neon/;
+ specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/;
+ specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/;
+ specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_x_sr_intrabc neon/;
+ specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon/;
+ specialize qw/av1_highbd_convolve_y_sr_intrabc neon/;
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1 neon/;
+ }
+
+# INTRA_EDGE functions
+add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge sse4_1 neon/;
+add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
+specialize qw/av1_upsample_intra_edge sse4_1 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_filter_intra_edge/, "uint16_t *p, int sz, int strength";
+ specialize qw/av1_highbd_filter_intra_edge sse4_1 neon/;
+ add_proto qw/void av1_highbd_upsample_intra_edge/, "uint16_t *p, int sz, int bd";
+ specialize qw/av1_highbd_upsample_intra_edge sse4_1 neon/;
+}
+
+# CFL
+add_proto qw/cfl_subtract_average_fn cfl_get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_subtract_average_fn sse2 avx2 neon vsx/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+
+ add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+
+ add_proto qw/cfl_predict_hbd_fn cfl_get_predict_hbd_fn/, "TX_SIZE tx_size";
+ specialize qw/cfl_get_predict_hbd_fn ssse3 avx2 neon/;
+}
+
+add_proto qw/cfl_predict_lbd_fn cfl_get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/cfl_get_predict_lbd_fn ssse3 avx2 neon/;
+
+1;
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
new file mode 100644
index 0000000000..011403b1fa
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+
+// av1_cospi_arr_data[i][j] = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)));
+const int32_t av1_cospi_arr_data[4][64] = {
+ { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+ 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
+ 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
+ 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
+ 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
+ { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+ 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+ 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+ 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+ 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
+ 449, 400, 350, 301, 251, 201, 151, 100, 50 },
+ { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101 },
+ { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+ 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+ 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+ 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+ 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 }
+};
+
+// av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
+// << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
+const int32_t av1_sinpi_arr_data[4][5] = { { 0, 330, 621, 836, 951 },
+ { 0, 660, 1241, 1672, 1901 },
+ { 0, 1321, 2482, 3344, 3803 },
+ { 0, 2642, 4964, 6689, 7606 } };
+
+// The reduced bit-width arrays are only used in the Arm Neon implementations
+// in av1_fwd_txfm2d_neon.c for now.
+#if HAVE_NEON
+// Constants are stored in groups of four, where symmetrical constants in the
+// cospi array are stored adjacent in memory, followed immediately by the same
+// constants but negated, i.e.:
+// f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))) << (3-i)
+// and then in memory we store 4-tuples of constants together as:
+// f4(i,j) = [ f(i,j), f(i,64-j), -f(i,j), -f(i,64-j) ]
+//
+// Constants are stored in Q2.13 format, see:
+// https://en.wikipedia.org/wiki/Q_(number_format)
+//
+// The order of the constants is such that increasing subdivisions of 64 store
+// f4 tuples contiguously:
+// av1_cospi_arr_q13_data[i] = {
+// f4(i,32), // f(i,32) twice
+// f4(i,16), // f(i,16) and f(i,48), f4(i,32) skipped since present above.
+// f4(i,8), f4(i,24), // f4(i,16) and f4(i,32) skipped since present above.
+// f4(i,4), f4(i,12), f4(i,20), f4(i,28),
+// f4(i,2), f4(i,6), f4(i,10), f4(i,14), f4(i,18), ...
+// f4(i,1), f4(i,3), f4(i,5), f4(i,7), f4(i,9), f4(i,11), ...
+// }
+const int16_t av1_cospi_arr_q13_data[4][128] = {
+ {
+ 5792, 5792, -5792, -5792, 7568, 3136, -7568, -3136, 8032, 1600,
+ -8032, -1600, 6808, 4552, -6808, -4552, 8152, 800, -8152, -800,
+ 7840, 2376, -7840, -2376, 7224, 3864, -7224, -3864, 6336, 5200,
+ -6336, -5200, 8184, 400, -8184, -400, 8104, 1200, -8104, -1200,
+ 7944, 1992, -7944, -1992, 7712, 2760, -7712, -2760, 7408, 3504,
+ -7408, -3504, 7024, 4208, -7024, -4208, 6576, 4880, -6576, -4880,
+ 6072, 5504, -6072, -5504, 8192, 200, -8192, -200, 8168, 600,
+ -8168, -600, 8128, 1000, -8128, -1000, 8072, 1400, -8072, -1400,
+ 7992, 1792, -7992, -1792, 7896, 2184, -7896, -2184, 7776, 2568,
+ -7776, -2568, 7640, 2952, -7640, -2952, 7488, 3320, -7488, -3320,
+ 7320, 3680, -7320, -3680, 7128, 4040, -7128, -4040, 6920, 4384,
+ -6920, -4384, 6696, 4720, -6696, -4720, 6456, 5040, -6456, -5040,
+ 6200, 5352, -6200, -5352, 5936, 5648, -5936, -5648,
+ },
+ {
+ 5792, 5792, -5792, -5792, 7568, 3136, -7568, -3136, 8036, 1600,
+ -8036, -1600, 6812, 4552, -6812, -4552, 8152, 804, -8152, -804,
+ 7840, 2380, -7840, -2380, 7224, 3860, -7224, -3860, 6332, 5196,
+ -6332, -5196, 8184, 400, -8184, -400, 8104, 1204, -8104, -1204,
+ 7948, 1992, -7948, -1992, 7712, 2760, -7712, -2760, 7404, 3504,
+ -7404, -3504, 7028, 4212, -7028, -4212, 6580, 4880, -6580, -4880,
+ 6068, 5500, -6068, -5500, 8188, 200, -8188, -200, 8168, 604,
+ -8168, -604, 8132, 1004, -8132, -1004, 8072, 1400, -8072, -1400,
+ 7992, 1796, -7992, -1796, 7896, 2184, -7896, -2184, 7780, 2568,
+ -7780, -2568, 7644, 2948, -7644, -2948, 7488, 3320, -7488, -3320,
+ 7316, 3684, -7316, -3684, 7128, 4036, -7128, -4036, 6920, 4384,
+ -6920, -4384, 6696, 4716, -6696, -4716, 6460, 5040, -6460, -5040,
+ 6204, 5352, -6204, -5352, 5932, 5648, -5932, -5648,
+ },
+ {
+ 5792, 5792, -5792, -5792, 7568, 3134, -7568, -3134, 8034, 1598,
+ -8034, -1598, 6812, 4552, -6812, -4552, 8152, 802, -8152, -802,
+ 7840, 2378, -7840, -2378, 7224, 3862, -7224, -3862, 6332, 5196,
+ -6332, -5196, 8182, 402, -8182, -402, 8104, 1202, -8104, -1202,
+ 7946, 1990, -7946, -1990, 7714, 2760, -7714, -2760, 7406, 3502,
+ -7406, -3502, 7026, 4212, -7026, -4212, 6580, 4880, -6580, -4880,
+ 6070, 5502, -6070, -5502, 8190, 202, -8190, -202, 8170, 602,
+ -8170, -602, 8130, 1002, -8130, -1002, 8072, 1400, -8072, -1400,
+ 7992, 1794, -7992, -1794, 7896, 2184, -7896, -2184, 7778, 2570,
+ -7778, -2570, 7644, 2948, -7644, -2948, 7490, 3320, -7490, -3320,
+ 7318, 3684, -7318, -3684, 7128, 4038, -7128, -4038, 6922, 4382,
+ -6922, -4382, 6698, 4718, -6698, -4718, 6458, 5040, -6458, -5040,
+ 6204, 5350, -6204, -5350, 5934, 5648, -5934, -5648,
+ },
+ {
+ 5793, 5793, -5793, -5793, 7568, 3135, -7568, -3135, 8035, 1598,
+ -8035, -1598, 6811, 4551, -6811, -4551, 8153, 803, -8153, -803,
+ 7839, 2378, -7839, -2378, 7225, 3862, -7225, -3862, 6333, 5197,
+ -6333, -5197, 8182, 402, -8182, -402, 8103, 1202, -8103, -1202,
+ 7946, 1990, -7946, -1990, 7713, 2760, -7713, -2760, 7405, 3503,
+ -7405, -3503, 7027, 4212, -7027, -4212, 6580, 4880, -6580, -4880,
+ 6070, 5501, -6070, -5501, 8190, 201, -8190, -201, 8170, 603,
+ -8170, -603, 8130, 1003, -8130, -1003, 8071, 1401, -8071, -1401,
+ 7993, 1795, -7993, -1795, 7895, 2185, -7895, -2185, 7779, 2570,
+ -7779, -2570, 7643, 2948, -7643, -2948, 7489, 3320, -7489, -3320,
+ 7317, 3683, -7317, -3683, 7128, 4038, -7128, -4038, 6921, 4383,
+ -6921, -4383, 6698, 4717, -6698, -4717, 6458, 5040, -6458, -5040,
+ 6203, 5351, -6203, -5351, 5933, 5649, -5933, -5649,
+ }
+};
+
+// av1_sinpi_arr_q13_data[i][j] =
+// round((sqrt2 * sin((j+1)*Pi/9) * 2/3) * (1 << (cos_bit_min + i))) << (3-i)
+// modified so that elements j=0,1 sum to element j=3.
+// See also: https://en.wikipedia.org/wiki/Q_(number_format)
+const int16_t av1_sinpi_arr_q13_data[4][4] = { { 2640, 4968, 6688, 7608 },
+ { 2640, 4964, 6688, 7604 },
+ { 2642, 4964, 6688, 7606 },
+ { 2642, 4964, 6689, 7606 } };
+
+// Constants are stored in pairs, where symmetrical constants in the
+// cospi array are stored adjacent in memory, i.e.:
+// f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i)))
+// and then in memory we store pairs of constants together as:
+// f2(i,j) = [ f(i,j), f(i,64-j) ]
+const int32_t av1_cospi_arr_s32_data[4][66] = {
+ {
+ 1024, 0, 1024, 25, 1023, 50, 1021, 75, 1019, 100, 1016,
+ 125, 1013, 150, 1009, 175, 1004, 200, 999, 224, 993, 249,
+ 987, 273, 980, 297, 972, 321, 964, 345, 955, 369, 946,
+ 392, 936, 415, 926, 438, 915, 460, 903, 483, 891, 505,
+ 878, 526, 865, 548, 851, 569, 837, 590, 822, 610, 807,
+ 630, 792, 650, 775, 669, 759, 688, 742, 706, 724, 724,
+ },
+ {
+ 2048, 0, 2047, 50, 2046, 100, 2042, 151, 2038, 201, 2033,
+ 251, 2026, 301, 2018, 350, 2009, 400, 1998, 449, 1987, 498,
+ 1974, 546, 1960, 595, 1945, 642, 1928, 690, 1911, 737, 1892,
+ 784, 1872, 830, 1851, 876, 1829, 921, 1806, 965, 1782, 1009,
+ 1757, 1053, 1730, 1096, 1703, 1138, 1674, 1179, 1645, 1220, 1615,
+ 1260, 1583, 1299, 1551, 1338, 1517, 1375, 1483, 1412, 1448, 1448,
+ },
+ {
+ 4096, 0, 4095, 101, 4091, 201, 4085, 301, 4076, 401, 4065,
+ 501, 4052, 601, 4036, 700, 4017, 799, 3996, 897, 3973, 995,
+ 3948, 1092, 3920, 1189, 3889, 1285, 3857, 1380, 3822, 1474, 3784,
+ 1567, 3745, 1660, 3703, 1751, 3659, 1842, 3612, 1931, 3564, 2019,
+ 3513, 2106, 3461, 2191, 3406, 2276, 3349, 2359, 3290, 2440, 3229,
+ 2520, 3166, 2598, 3102, 2675, 3035, 2751, 2967, 2824, 2896, 2896,
+ },
+ {
+ 8192, 0, 8190, 201, 8182, 402, 8170, 603, 8153, 803, 8130,
+ 1003, 8103, 1202, 8071, 1401, 8035, 1598, 7993, 1795, 7946, 1990,
+ 7895, 2185, 7839, 2378, 7779, 2570, 7713, 2760, 7643, 2948, 7568,
+ 3135, 7489, 3320, 7405, 3503, 7317, 3683, 7225, 3862, 7128, 4038,
+ 7027, 4212, 6921, 4383, 6811, 4551, 6698, 4717, 6580, 4880, 6458,
+ 5040, 6333, 5197, 6203, 5351, 6070, 5501, 5933, 5649, 5793, 5793,
+ }
+};
+
+#endif // HAVE_NEON
+
+// Shifts every element of arr in place. For bit > 0 each element is replaced
+// by round_shift(element, bit); for bit < 0 each element is multiplied by
+// 2^(-bit) in 64-bit arithmetic and clamped to the int32_t range; bit == 0
+// leaves the array untouched.
+void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int k = 0; k < size; ++k) {
+      arr[k] = round_shift(arr[k], bit);
+    }
+    return;
+  }
+
+  // Negative bit: scale up, saturating instead of overflowing.
+  const int64_t scale = (int64_t)1 << (-bit);
+  for (int k = 0; k < size; ++k) {
+    arr[k] = (int32_t)clamp64(scale * arr[k], INT32_MIN, INT32_MAX);
+  }
+}
+
+// Lookup of the 1D kernel identifier (TXFM_TYPE) by [row][1D transform type].
+// Judging by the entry names, rows 0..4 hold the kernels of length 4, 8, 16,
+// 32 and 64 respectively. TXFM_TYPE_INVALID fills the slots for which no
+// kernel is listed (ADST at 32 and 64, identity at 64).
+// NOTE(review): the row index is presumably the value returned by
+// get_txw_idx()/get_txh_idx() - confirm against the callers.
+const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = {
+ { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 },
+ { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 },
+ { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 },
+ { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID,
+ TXFM_TYPE_IDENTITY32 },
+ { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID }
+};
+
+// Stage count associated with each 1D kernel, one entry per TXFM_TYPE value
+// below TXFM_TYPES and listed in enum order (see the per-entry comments).
+// The largest entry (12, for DCT64) equals MAX_TXFM_STAGE_NUM.
+const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
+ 4, // TXFM_TYPE_DCT4
+ 6, // TXFM_TYPE_DCT8
+ 8, // TXFM_TYPE_DCT16
+ 10, // TXFM_TYPE_DCT32
+ 12, // TXFM_TYPE_DCT64
+ 7, // TXFM_TYPE_ADST4
+ 8, // TXFM_TYPE_ADST8
+ 10, // TXFM_TYPE_ADST16
+ 1, // TXFM_TYPE_IDENTITY4
+ 1, // TXFM_TYPE_IDENTITY8
+ 1, // TXFM_TYPE_IDENTITY16
+ 1, // TXFM_TYPE_IDENTITY32
+};
+
+// Diagnostic check on a transform stage buffer. When
+// CONFIG_COEFFICIENT_RANGE_CHECKING is enabled, verifies that each of the
+// size values in buf is representable as a signed integer of bit bits. On a
+// violation, the stage number, the allowed range and the full contents of
+// input and buf are written to stderr, and the function then asserts. With
+// the option disabled the function does nothing.
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t min_value = -(1LL << (bit - 1));
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+
+  // Only the overall verdict matters, so stop at the first violation.
+  int in_range = 1;
+  for (int idx = 0; in_range && idx < size; ++idx) {
+    in_range = buf[idx] >= min_value && buf[idx] <= max_value;
+  }
+
+  if (!in_range) {
+    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+    fprintf(stderr, "size: %d\n", size);
+    fprintf(stderr, "stage: %d\n", stage);
+    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+            max_value);
+
+    // Dump the stage input.
+    fprintf(stderr, "coeffs: ");
+    fprintf(stderr, "[");
+    for (int k = 0; k < size; ++k) {
+      if (k > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%d", input[k]);
+    }
+    fprintf(stderr, "]\n");
+
+    // Dump the buffer that failed the check.
+    fprintf(stderr, " buf: ");
+    fprintf(stderr, "[");
+    for (int k = 0; k < size; ++k) {
+      if (k > 0) fprintf(stderr, ", ");
+      fprintf(stderr, "%d", buf[k]);
+    }
+    fprintf(stderr, "]\n\n");
+  }
+
+  assert(in_range);
+#else
+  (void)stage;
+  (void)input;
+  (void)buf;
+  (void)size;
+  (void)bit;
+#endif
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
new file mode 100644
index 0000000000..7ad70af86a
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(DO_RANGE_CHECK_CLAMP)
+#define DO_RANGE_CHECK_CLAMP 0
+#endif
+
+extern const int32_t av1_cospi_arr_data[4][64];
+extern const int32_t av1_sinpi_arr_data[4][5];
+
+#define MAX_TXFM_STAGE_NUM 12
+
+static const int cos_bit_min = 10;
+
+#define NewSqrt2Bits ((int32_t)12)
+// 2^12 * sqrt(2)
+static const int32_t NewSqrt2 = 5793;
+// 2^12 / sqrt(2)
+static const int32_t NewInvSqrt2 = 2896;
+
+// Returns the row of av1_cospi_arr_data selected by n, i.e. row
+// (n - cos_bit_min). The table has 4 rows, so n is not range-checked here.
+static INLINE const int32_t *cospi_arr(int n) {
+  const int row = n - cos_bit_min;
+  return av1_cospi_arr_data[row];
+}
+
+// Returns the row of av1_sinpi_arr_data selected by n, i.e. row
+// (n - cos_bit_min). The table has 4 rows, so n is not range-checked here.
+static INLINE const int32_t *sinpi_arr(int n) {
+  const int row = n - cos_bit_min;
+  return av1_sinpi_arr_data[row];
+}
+
+// The reduced bit-width and permuted arrays are only used in the Arm Neon
+// implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now.
+#if HAVE_NEON
+// Store cospi/sinpi constants in Q2.13 format.
+// See: https://en.wikipedia.org/wiki/Q_(number_format)
+extern const int16_t av1_cospi_arr_q13_data[4][128];
+extern const int16_t av1_sinpi_arr_q13_data[4][4];
+
+extern const int32_t av1_cospi_arr_s32_data[4][66];
+
+// Returns the row of the 16-bit av1_cospi_arr_q13_data table selected by n,
+// i.e. row (n - cos_bit_min).
+static INLINE const int16_t *cospi_arr_q13(int n) {
+  const int row = n - cos_bit_min;
+  return av1_cospi_arr_q13_data[row];
+}
+
+// Returns the row of the 16-bit av1_sinpi_arr_q13_data table selected by n,
+// i.e. row (n - cos_bit_min).
+static INLINE const int16_t *sinpi_arr_q13(int n) {
+  const int row = n - cos_bit_min;
+  return av1_sinpi_arr_q13_data[row];
+}
+
+// Returns the row of the pair-ordered av1_cospi_arr_s32_data table selected
+// by n, i.e. row (n - cos_bit_min).
+static INLINE const int32_t *cospi_arr_s32(int n) {
+  const int row = n - cos_bit_min;
+  return av1_cospi_arr_s32_data[row];
+}
+#endif // HAVE_NEON
+
+// Optionally validates and/or clamps value against the range of a signed
+// integer of bit bits.
+//  - CONFIG_COEFFICIENT_RANGE_CHECKING: reports out-of-range values on stderr
+//    and asserts unless CONFIG_AV1_ENCODER is set.
+//  - DO_RANGE_CHECK_CLAMP: returns value clamped to the range, with bit
+//    capped at 31.
+// With neither option enabled, value is returned unchanged.
+static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t lower = -(1LL << (bit - 1));
+  const int64_t upper = (1LL << (bit - 1)) - 1;
+  if (value < lower || value > upper) {
+    fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+#if !CONFIG_AV1_ENCODER
+    assert(0);
+#endif
+  }
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+#if DO_RANGE_CHECK_CLAMP
+  bit = AOMMIN(bit, 31);
+  return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
+#else
+  (void)bit;
+  return value;
+#endif  // DO_RANGE_CHECK_CLAMP
+}
+
+// Right-shifts value by bit after adding half of the divisor (1 << (bit - 1))
+// so that the result is rounded, then narrows it to int32_t.
+// bit must be at least 1.
+static INLINE int32_t round_shift(int64_t value, int bit) {
+  assert(bit >= 1);
+  const int64_t rounding = 1ll << (bit - 1);
+  return (int32_t)((value + rounding) >> bit);
+}
+
+// Half butterfly: returns (w0 * in0 + w1 * in1), rounded by adding
+// 1 << (bit - 1) and shifted right by bit.
+// Each product is formed from the int32_t operands before the cast to
+// int64_t; only the sum and the rounding offset are accumulated in 64 bits.
+// bit must be at least 1 for the rounding offset shift to be valid.
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+ int bit) {
+ int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+ int64_t intermediate = result_64 + (1LL << (bit - 1));
+ // NOTE(rachelbarker): The value 'result_64' may not necessarily fit
+ // into 32 bits. However, the result of this function is nominally
+ // ROUND_POWER_OF_TWO_64(result_64, bit)
+ // and that is required to fit into stage_range[stage] many bits
+ // (checked by range_check_buf()).
+ //
+ // Here we've unpacked that rounding operation, and it can be shown
+ // that the value of 'intermediate' here *does* fit into 32 bits
+ // for any conformant bitstream.
+ // The upshot is that, if you do all this calculation using
+ // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+ // then you'll still get the correct result.
+ // To provide a check on this logic, we assert that 'intermediate'
+ // would fit into an int32 if range checking is enabled.
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
+#endif
+ return (int32_t)(intermediate >> bit);
+}
+
+// Adds the residual trans (narrowed to int) to the pixel dest and passes the
+// sum through clip_pixel_highbd() for bit depth bd.
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  const int sum = dest + (int)trans;
+  return clip_pixel_highbd(sum, bd);
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+// Identifiers for the 1D transform kernels. Each enumerator name spells the
+// kernel family (DCT, ADST or identity) and its length.
+enum {
+ TXFM_TYPE_DCT4,
+ TXFM_TYPE_DCT8,
+ TXFM_TYPE_DCT16,
+ TXFM_TYPE_DCT32,
+ TXFM_TYPE_DCT64,
+ TXFM_TYPE_ADST4,
+ TXFM_TYPE_ADST8,
+ TXFM_TYPE_ADST16,
+ TXFM_TYPE_IDENTITY4,
+ TXFM_TYPE_IDENTITY8,
+ TXFM_TYPE_IDENTITY16,
+ TXFM_TYPE_IDENTITY32,
+ TXFM_TYPES, // count of the kernels above; sizes av1_txfm_stage_num_list
+ TXFM_TYPE_INVALID, // filler for av1_txfm_type_ls slots with no kernel
+} UENUM1BYTE(TXFM_TYPE);
+
+// Configuration record for one 2D transform: size, flip flags, and separate
+// column/row settings (cosine bit, per-stage ranges, 1D kernel, stage count).
+// The stage_range arrays hold up to MAX_TXFM_STAGE_NUM entries each.
+typedef struct TXFM_2D_FLIP_CFG {
+ TX_SIZE tx_size;
+ int ud_flip; // flip upside down
+ int lr_flip; // flip left to right
+ const int8_t *shift; // NOTE(review): presumably the shift schedule - confirm
+ int8_t cos_bit_col;
+ int8_t cos_bit_row;
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ TXFM_TYPE txfm_type_col;
+ TXFM_TYPE txfm_type_row;
+ int stage_num_col;
+ int stage_num_row;
+} TXFM_2D_FLIP_CFG;
+
+// Derives the flip flags for tx_type and stores them through ud_flip
+// (upside-down flip) and lr_flip (left-right flip). Transform types without a
+// FLIPADST component yield 0/0. An unknown tx_type asserts and, when asserts
+// are compiled out, also yields 0/0.
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
+  int flip_ud = 0;
+  int flip_lr = 0;
+
+  switch (tx_type) {
+    // No flipping in either direction.
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST: break;
+    // Upside-down flip only.
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: flip_ud = 1; break;
+    // Left-right flip only.
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST: flip_lr = 1; break;
+    // Both directions.
+    case FLIPADST_FLIPADST:
+      flip_ud = 1;
+      flip_lr = 1;
+      break;
+    default: assert(0); break;
+  }
+
+  *ud_flip = flip_ud;
+  *lr_flip = flip_lr;
+}
+
+// Fills cfg->ud_flip and cfg->lr_flip from tx_type via get_flip_cfg().
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+  int *const ud = &cfg->ud_flip;
+  int *const lr = &cfg->lr_flip;
+  get_flip_cfg(tx_type, ud, lr);
+}
+
+// Returns log2(col / row) for the supported transform shapes: 0 when the
+// sides are equal, +1 / +2 when col is 2x / 4x row, and -1 / -2 when row is
+// 2x / 4x col. Any other ratio trips an assert and evaluates to 0.
+static INLINE int get_rect_tx_log_ratio(int col, int row) {
+  if (col == row) return 0;
+
+  const int col_is_longer = col > row;
+  const int longer = col_is_longer ? col : row;
+  const int shorter = col_is_longer ? row : col;
+
+  int magnitude = 0;  // Stays 0 for an invalid ratio.
+  if (longer == shorter * 2) {
+    magnitude = 1;
+  } else if (longer == shorter * 4) {
+    magnitude = 2;
+  } else {
+    assert(0 && "Unsupported transform size");
+  }
+  return col_is_longer ? magnitude : -magnitude;
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd);
+
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+ int bd);
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D];
+extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES];
+// Returns the log2 width of tx_size relative to the log2 width of the first
+// entry of tx_size_wide_log2.
+static INLINE int get_txw_idx(TX_SIZE tx_size) {
+  const int log2_w = tx_size_wide_log2[tx_size];
+  const int log2_base = tx_size_wide_log2[0];
+  return log2_w - log2_base;
+}
+// Returns the log2 height of tx_size relative to the log2 height of the first
+// entry of tx_size_high_log2.
+static INLINE int get_txh_idx(TX_SIZE tx_size) {
+  const int log2_h = tx_size_high_log2[tx_size];
+  const int log2_base = tx_size_high_log2[0];
+  return log2_h - log2_base;
+}
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit);
+#define MAX_TXWH_IDX 5
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
new file mode 100644
index 0000000000..1d597502ce
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+// Returns the prediction mode stored in the left neighbour's mode info, or
+// DC_PRED when there is no left neighbour (left_mi is null). The neighbour is
+// asserted to be either a non-inter block or an intrabc block.
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
+  if (left_mi) {
+    assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
+    return left_mi->mode;
+  }
+  return DC_PRED;
+}
+
+// Returns the prediction mode stored in the above neighbour's mode info, or
+// DC_PRED when there is no above neighbour (above_mi is null). The neighbour
+// is asserted to be either a non-inter block or an intrabc block.
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
+  if (above_mi) {
+    assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
+    return above_mi->mode;
+  }
+  return DC_PRED;
+}
+
+// Records has_eob in the above and left entropy contexts of plane pd for the
+// transform block of size tx_size located at offsets aoff (above) and loff
+// (left). The span written is tx_size_wide_unit[tx_size] entries above and
+// tx_size_high_unit[tx_size] entries to the left.
+// When has_eob is set and xd->mb_to_right_edge / xd->mb_to_bottom_edge is
+// negative, only the entries below the max_block_wide()/max_block_high()
+// limit receive has_eob and the rest of the span is zeroed.
+// NOTE(review): a negative mb_to_*_edge presumably means the block extends
+// past the frame boundary - confirm. The code also assumes aoff/loff do not
+// exceed blocks_wide/blocks_high; otherwise the memset sizes go negative.
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+ struct macroblockd_plane *pd, int plane,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff) {
+ ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
+ const int txs_wide = tx_size_wide_unit[tx_size];
+ const int txs_high = tx_size_high_unit[tx_size];
+
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ // Number of entries of this transform block that lie before the limit.
+ const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
+ memset(a, has_eob, sizeof(*a) * above_contexts);
+ memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
+ } else {
+ memset(a, has_eob, sizeof(*a) * txs_wide);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ const int blocks_high = max_block_high(xd, plane_bsize, plane);
+ // Number of entries of this transform block that lie before the limit.
+ const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
+ memset(l, has_eob, sizeof(*l) * left_contexts);
+ memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
+ } else {
+ memset(l, has_eob, sizeof(*l) * txs_high);
+ }
+}
+// Zeroes the above and left entropy contexts covered by a block of size
+// bsize. Plane 0 is always reset; the remaining planes are reset only when
+// xd->is_chroma_ref is non-zero. The number of entries cleared per plane is
+// taken from mi_size_wide / mi_size_high of the plane's block size.
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int planes_to_reset = 1 + (num_planes - 1) * xd->is_chroma_ref;
+  for (int plane = 0; plane < planes_to_reset; ++plane) {
+    struct macroblockd_plane *const pd = xd->plane + plane;
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int above_count = mi_size_wide[plane_bsize];
+    const int left_count = mi_size_high[plane_bsize];
+    memset(pd->above_entropy_context, 0,
+           sizeof(ENTROPY_CONTEXT) * above_count);
+    memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * left_count);
+  }
+}
+
+// Clears the loop filter deltas in xd: delta_lf_from_base and the leading
+// entries of delta_lf. FRAME_LF_COUNT entries are cleared when there is more
+// than one plane, two fewer otherwise.
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
+  xd->delta_lf_from_base = 0;
+
+  int lf_count = FRAME_LF_COUNT;
+  if (num_planes <= 1) lf_count -= 2;
+
+  for (int k = 0; k < lf_count; ++k) {
+    xd->delta_lf[k] = 0;
+  }
+}
+
+// Restores the default Wiener and self-guided (sgrproj) restoration
+// parameters in xd for each of the first num_planes planes.
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
+  int plane = 0;
+  while (plane < num_planes) {
+    set_default_wiener(&xd->wiener_info[plane]);
+    set_default_sgrproj(&xd->sgrproj_info[plane]);
+    ++plane;
+  }
+}
+
+// Initialises the per-plane type and subsampling fields of xd. For the first
+// num_planes planes the plane type comes from get_plane_type(); plane 0 gets
+// no subsampling and the others get ss_x / ss_y. Remaining planes up to
+// MAX_MB_PLANE only have both subsampling factors set to 1.
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+                            const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->plane_type = get_plane_type(plane);
+    pd->subsampling_x = plane ? ss_x : 0;
+    pd->subsampling_y = plane ? ss_y : 0;
+  }
+
+  // Planes beyond num_planes are unused; give them fixed subsampling.
+  for (int plane = num_planes; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->subsampling_x = 1;
+    pd->subsampling_y = 1;
+  }
+}
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
new file mode 100644
index 0000000000..0cfd1f3954
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.h
@@ -0,0 +1,1612 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_B_QUANT_NO_TRELLIS 1
+
+#define MAX_MB_PLANE 3
+
+#define MAX_DIFFWTD_MASK_BITS 1
+
+#define INTERINTRA_WEDGE_SIGN 0
+
+#define DEFAULT_INTER_TX_TYPE DCT_DCT
+
+#define MAX_PALETTE_BLOCK_WIDTH 64
+
+#define MAX_PALETTE_BLOCK_HEIGHT 64
+
+/*!\cond */
+
+// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
+enum {
+ DIFFWTD_38 = 0,
+ DIFFWTD_38_INV,
+ DIFFWTD_MASK_TYPES, // Count of the mask types above; not a mask type itself.
+} UENUM1BYTE(DIFFWTD_MASK_TYPE);
+
+enum {
+ KEY_FRAME = 0,
+ INTER_FRAME = 1,
+ INTRA_ONLY_FRAME = 2, // replaces intra-only
+ S_FRAME = 3,
+ FRAME_TYPES, // Count of the frame types above; not a frame type itself.
+} UENUM1BYTE(FRAME_TYPE);
+
+// Returns 1 when both dimensions of 'bsize' are at least 8 pixels, else 0.
+static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
+  const int wide_enough = block_size_wide[bsize] >= 8;
+  const int high_enough = block_size_high[bsize] >= 8;
+  return wide_enough && high_enough;
+}
+
+// Returns 1 when 'mode' lies in [INTER_MODE_START, INTER_MODE_END), else 0.
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+  if (mode < INTER_MODE_START) return 0;
+  return mode < INTER_MODE_END;
+}
+
+typedef struct { // A set of per-plane byte buffers with their strides.
+ uint8_t *plane[MAX_MB_PLANE]; // One buffer pointer per plane.
+ int stride[MAX_MB_PLANE]; // Stride of plane[i]; presumably in bytes per row - TODO confirm.
+} BUFFER_SET;
+
+// Returns 1 when 'mode' lies in
+// [SINGLE_INTER_MODE_START, SINGLE_INTER_MODE_END), else 0.
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+  if (mode < SINGLE_INTER_MODE_START) return 0;
+  return mode < SINGLE_INTER_MODE_END;
+}
+// Returns 1 when 'mode' lies in
+// [COMP_INTER_MODE_START, COMP_INTER_MODE_END), else 0.
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+  if (mode < COMP_INTER_MODE_START) return 0;
+  return mode < COMP_INTER_MODE_END;
+}
+
+// Maps an inter prediction mode to the single-reference mode that applies to
+// its first reference. Intra and single-reference entries map to themselves.
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+  // One entry per PREDICTION_MODE; each entry is labelled with its index.
+  static const PREDICTION_MODE ref0_mode_of[] = {
+    // Intra modes.
+    DC_PRED,        // DC_PRED
+    V_PRED,         // V_PRED
+    H_PRED,         // H_PRED
+    D45_PRED,       // D45_PRED
+    D135_PRED,      // D135_PRED
+    D113_PRED,      // D113_PRED
+    D157_PRED,      // D157_PRED
+    D203_PRED,      // D203_PRED
+    D67_PRED,       // D67_PRED
+    SMOOTH_PRED,    // SMOOTH_PRED
+    SMOOTH_V_PRED,  // SMOOTH_V_PRED
+    SMOOTH_H_PRED,  // SMOOTH_H_PRED
+    PAETH_PRED,     // PAETH_PRED
+    // Single-reference inter modes.
+    NEARESTMV,  // NEARESTMV
+    NEARMV,     // NEARMV
+    GLOBALMV,   // GLOBALMV
+    NEWMV,      // NEWMV
+    // Compound inter modes: mode of the first reference.
+    NEARESTMV,  // NEAREST_NEARESTMV
+    NEARMV,     // NEAR_NEARMV
+    NEARESTMV,  // NEAREST_NEWMV
+    NEWMV,      // NEW_NEARESTMV
+    NEARMV,     // NEAR_NEWMV
+    NEWMV,      // NEW_NEARMV
+    GLOBALMV,   // GLOBAL_GLOBALMV
+    NEWMV,      // NEW_NEWMV
+  };
+  assert(NELEMENTS(ref0_mode_of) == MB_MODE_COUNT);
+  assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode));
+  const PREDICTION_MODE ref0_mode = ref0_mode_of[mode];
+  return ref0_mode;
+}
+
+// Maps a compound inter prediction mode to the single-reference mode that
+// applies to its second reference. Non-compound entries hold MB_MODE_COUNT as
+// a placeholder; callers must pass a compound mode (asserted below).
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+  // One entry per PREDICTION_MODE; each entry is labelled with its index.
+  static const PREDICTION_MODE ref1_mode_of[] = {
+    // Intra modes: no second reference.
+    MB_MODE_COUNT,  // DC_PRED
+    MB_MODE_COUNT,  // V_PRED
+    MB_MODE_COUNT,  // H_PRED
+    MB_MODE_COUNT,  // D45_PRED
+    MB_MODE_COUNT,  // D135_PRED
+    MB_MODE_COUNT,  // D113_PRED
+    MB_MODE_COUNT,  // D157_PRED
+    MB_MODE_COUNT,  // D203_PRED
+    MB_MODE_COUNT,  // D67_PRED
+    MB_MODE_COUNT,  // SMOOTH_PRED
+    MB_MODE_COUNT,  // SMOOTH_V_PRED
+    MB_MODE_COUNT,  // SMOOTH_H_PRED
+    MB_MODE_COUNT,  // PAETH_PRED
+    // Single-reference inter modes: no second reference.
+    MB_MODE_COUNT,  // NEARESTMV
+    MB_MODE_COUNT,  // NEARMV
+    MB_MODE_COUNT,  // GLOBALMV
+    MB_MODE_COUNT,  // NEWMV
+    // Compound inter modes: mode of the second reference.
+    NEARESTMV,  // NEAREST_NEARESTMV
+    NEARMV,     // NEAR_NEARMV
+    NEWMV,      // NEAREST_NEWMV
+    NEARESTMV,  // NEW_NEARESTMV
+    NEWMV,      // NEAR_NEWMV
+    NEARMV,     // NEW_NEARMV
+    GLOBALMV,   // GLOBAL_GLOBALMV
+    NEWMV,      // NEW_NEWMV
+  };
+  assert(NELEMENTS(ref1_mode_of) == MB_MODE_COUNT);
+  assert(is_inter_compound_mode(mode));
+  const PREDICTION_MODE ref1_mode = ref1_mode_of[mode];
+  return ref1_mode;
+}
+
+// Returns 1 when 'mode' is one of the inter modes that use a NEAR motion
+// vector (NEARMV, NEAR_NEARMV, NEAR_NEWMV, NEW_NEARMV), else 0.
+static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
+  switch (mode) {
+    case NEARMV:
+    case NEAR_NEARMV:
+    case NEAR_NEWMV:
+    case NEW_NEARMV: return 1;
+    default: return 0;
+  }
+}
+
+// Returns 1 when 'mode' is one of the inter modes that use a NEW motion
+// vector (NEWMV, NEW_NEWMV, NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+// NEW_NEARMV), else 0.
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+  switch (mode) {
+    case NEWMV:
+    case NEW_NEWMV:
+    case NEAREST_NEWMV:
+    case NEW_NEARESTMV:
+    case NEAR_NEWMV:
+    case NEW_NEARMV: return 1;
+    default: return 0;
+  }
+}
+
+// Returns 1 for the compound types COMPOUND_WEDGE and COMPOUND_DIFFWTD,
+// else 0.
+static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
+  if (type == COMPOUND_WEDGE) return 1;
+  if (type == COMPOUND_DIFFWTD) return 1;
+  return 0;
+}
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+typedef struct { // Palette colors and sizes of a coding block (stored in MB_MODE_INFO).
+ // Value of base colors for Y, U, and V
+ uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; // presumably three consecutive PALETTE_MAX_SIZE runs (Y, U, V) - TODO confirm layout
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
+} PALETTE_MODE_INFO;
+
+typedef struct { // Filter-intra selection of a coding block (stored in MB_MODE_INFO).
+ FILTER_INTRA_MODE filter_intra_mode; // presumably meaningful only when use_filter_intra is non-zero - TODO confirm
+ uint8_t use_filter_intra;
+} FILTER_INTRA_MODE_INFO;
+
+static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { // Intra prediction mode associated with each filter-intra mode; indexed by FILTER_INTRA_MODE.
+ DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED
+};
+
+#if CONFIG_RD_DEBUG
+#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE)
+#endif
+
+typedef struct RD_STATS { // Rate/distortion statistics record.
+ int rate;
+ int zero_rate;
+ int64_t dist;
+ // Please be careful of using rdcost, it's not guaranteed to be set all the
+ // time.
+ // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In
+ // these functions, make sure rdcost is always up-to-date according to
+ // rate/dist.
+ int64_t rdcost;
+ int64_t sse;
+ uint8_t skip_txfm; // sse should equal dist when skip_txfm == 1
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost[MAX_MB_PLANE]; // One entry per plane; present only in CONFIG_RD_DEBUG builds.
+#endif // CONFIG_RD_DEBUG
+} RD_STATS;
+
+// This struct is used to group function args that are commonly
+// sent together in functions related to interinter compound modes
+typedef struct { // Arguments for inter-inter compound prediction (stored in MB_MODE_INFO).
+ uint8_t *seg_mask;
+ int8_t wedge_index; // presumably used when type == COMPOUND_WEDGE - TODO confirm
+ int8_t wedge_sign; // presumably used when type == COMPOUND_WEDGE - TODO confirm
+ DIFFWTD_MASK_TYPE mask_type; // presumably used when type == COMPOUND_DIFFWTD - TODO confirm
+ COMPOUND_TYPE type;
+} INTERINTER_COMPOUND_DATA;
+
+#define INTER_TX_SIZE_BUF_LEN 16
+#define TXK_TYPE_BUF_LEN 64
+/*!\endcond */
+
+/*! \brief Stores the prediction/txfm mode of the current coding block
+ */
+typedef struct MB_MODE_INFO {
+ /*****************************************************************************
+ * \name General Info of the Coding Block
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief The block size of the current coding block */
+ BLOCK_SIZE bsize;
+ /*! \brief The partition type of the current coding block. */
+ PARTITION_TYPE partition;
+ /*! \brief The prediction mode used */
+ PREDICTION_MODE mode;
+ /*! \brief The UV mode when intra is used */
+ UV_PREDICTION_MODE uv_mode;
+ /*! \brief The q index for the current coding block. */
+ int current_qindex;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Mode Info
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief The motion vectors used by the current inter mode */
+ int_mv mv[2];
+ /*! \brief The reference frames for the MV */
+ MV_REFERENCE_FRAME ref_frame[2];
+ /*! \brief Filter used in subpel interpolation. */
+ int_interpfilters interp_filters;
+ /*! \brief The motion mode used by the inter prediction. */
+ MOTION_MODE motion_mode;
+ /*! \brief Number of samples used by warp causal */
+ uint8_t num_proj_ref;
+ /*! \brief The number of overlapped neighbors above/left for obmc/warp motion
+ * mode. */
+ uint8_t overlappable_neighbors;
+ /*! \brief The parameters used in warp motion mode. */
+ WarpedMotionParams wm_params;
+ /*! \brief The type of intra mode used by inter-intra */
+ INTERINTRA_MODE interintra_mode;
+ /*! \brief The type of wedge used in interintra mode. */
+ int8_t interintra_wedge_index;
+ /*! \brief Struct that stores the data used in interinter compound mode. */
+ INTERINTER_COMPOUND_DATA interinter_comp;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Mode Info
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Directional mode delta: the angle is base angle + (angle_delta *
+ * step). */
+ int8_t angle_delta[PLANE_TYPES];
+ /*! \brief The type of filter intra mode used (if applicable). */
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */
+ int8_t cfl_alpha_signs;
+ /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */
+ uint8_t cfl_alpha_idx;
+ /*! \brief Stores the size and colors of palette mode */
+ PALETTE_MODE_INFO palette_mode_info;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Transform Info
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Whether to skip transforming and sending. */
+ uint8_t skip_txfm;
+ /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
+ TX_SIZE tx_size;
+ /*! \brief Transform size when recursive txfm tree is on. */
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Loop Filter Info
+ ****************************************************************************/
+ /**@{*/
+ /*! \copydoc MACROBLOCKD::delta_lf_from_base */
+ int8_t delta_lf_from_base;
+ /*! \copydoc MACROBLOCKD::delta_lf */
+ int8_t delta_lf[FRAME_LF_COUNT];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Bitfield for Memory Reduction
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief The segment id */
+ uint8_t segment_id : 3;
+ /*! \brief Only valid when temporal update is off. */
+ uint8_t seg_id_predicted : 1;
+ /*! \brief Which ref_mv to use */
+ uint8_t ref_mv_idx : 2;
+ /*! \brief Inter skip mode */
+ uint8_t skip_mode : 1;
+ /*! \brief Whether intrabc is used. */
+ uint8_t use_intrabc : 1;
+ /*! \brief Indicates if masked compound is used(1) or not (0). */
+ uint8_t comp_group_idx : 1;
+ /*! \brief Indicates whether dist_wtd_comp is used (0) or not (1). */
+ uint8_t compound_idx : 1;
+ /*! \brief Whether to use interintra wedge */
+ uint8_t use_wedge_interintra : 1;
+ /*! \brief CDEF strength per BLOCK_64X64 */
+ int8_t cdef_strength : 4;
+ /**@}*/
+
+#if CONFIG_RD_DEBUG
+ /*! \brief RD info used for debugging */
+ RD_STATS rd_stats;
+ /*! \brief The current row in unit of 4x4 blocks for debugging */
+ int mi_row;
+ /*! \brief The current col in unit of 4x4 blocks for debugging */
+ int mi_col;
+#endif
+#if CONFIG_INSPECTION
+ /*! \brief Whether we are skipping the current rows or columns. */
+ int16_t tx_skip[TXK_TYPE_BUF_LEN];
+#endif
+} MB_MODE_INFO;
+
+/*!\cond */
+
+// Returns the block's use_intrabc flag (1 if intra block copy is used).
+static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
+  const int uses_intrabc = mbmi->use_intrabc;
+  return uses_intrabc;
+}
+
+// Converts a chroma (UV) intra prediction mode into the equivalent
+// PREDICTION_MODE. UV_CFL_PRED maps to DC_PRED; the two trailing sentinel
+// entries map to INTRA_INVALID.
+static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
+  assert(mode < UV_INTRA_MODES);
+  // One entry per UV_PREDICTION_MODE; each entry is labelled with its index.
+  static const PREDICTION_MODE y_mode_of_uv[] = {
+    DC_PRED,        // UV_DC_PRED
+    V_PRED,         // UV_V_PRED
+    H_PRED,         // UV_H_PRED
+    D45_PRED,       // UV_D45_PRED
+    D135_PRED,      // UV_D135_PRED
+    D113_PRED,      // UV_D113_PRED
+    D157_PRED,      // UV_D157_PRED
+    D203_PRED,      // UV_D203_PRED
+    D67_PRED,       // UV_D67_PRED
+    SMOOTH_PRED,    // UV_SMOOTH_PRED
+    SMOOTH_V_PRED,  // UV_SMOOTH_V_PRED
+    SMOOTH_H_PRED,  // UV_SMOOTH_H_PRED
+    PAETH_PRED,     // UV_PAETH_PRED
+    DC_PRED,        // UV_CFL_PRED
+    INTRA_INVALID,  // UV_INTRA_MODES
+    INTRA_INVALID,  // UV_MODE_INVALID
+  };
+  const PREDICTION_MODE y_mode = y_mode_of_uv[mode];
+  return y_mode;
+}
+
+// Returns 1 when the block uses intra block copy or its first reference
+// frame is beyond INTRA_FRAME; otherwise 0.
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+  if (is_intrabc_block(mbmi)) return 1;
+  return mbmi->ref_frame[0] > INTRA_FRAME;
+}
+
+// Returns 1 when the block's second reference frame is beyond INTRA_FRAME.
+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+  const MV_REFERENCE_FRAME second_ref = mbmi->ref_frame[1];
+  return second_ref > INTRA_FRAME;
+}
+
+// Returns 1 when the block has a second reference and both references fall on
+// the same side of BWDREF_FRAME (both below it, or both at/above it).
+static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
+  if (!has_second_ref(mbmi)) return 0;
+  const int ref0_is_bwd = mbmi->ref_frame[0] >= BWDREF_FRAME;
+  const int ref1_is_bwd = mbmi->ref_frame[1] >= BWDREF_FRAME;
+  return ref0_is_bwd == ref1_is_bwd;
+}
+
+// Returns the first reference frame of the unidirectional compound reference
+// pair selected by 'ref_idx'. Each table entry is labelled with the pair it
+// belongs to. 'ref_idx' must lie in [0, TOTAL_UNIDIR_COMP_REFS).
+static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
+  static const MV_REFERENCE_FRAME lut[] = {
+    LAST_FRAME,     // LAST_LAST2_FRAMES,
+    LAST_FRAME,     // LAST_LAST3_FRAMES,
+    LAST_FRAME,     // LAST_GOLDEN_FRAMES,
+    BWDREF_FRAME,   // BWDREF_ALTREF_FRAMES,
+    LAST2_FRAME,    // LAST2_LAST3_FRAMES
+    LAST2_FRAME,    // LAST2_GOLDEN_FRAMES,
+    LAST3_FRAME,    // LAST3_GOLDEN_FRAMES,
+    BWDREF_FRAME,   // BWDREF_ALTREF2_FRAMES,
+    ALTREF2_FRAME,  // ALTREF2_ALTREF_FRAMES,
+  };
+  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+  // 'ref_idx' is a plain signed int: catch out-of-range lookups in debug
+  // builds instead of silently reading outside the table.
+  assert(ref_idx >= 0 && ref_idx < TOTAL_UNIDIR_COMP_REFS);
+  return lut[ref_idx];
+}
+
+// Returns the second reference frame of the unidirectional compound reference
+// pair selected by 'ref_idx'. Each table entry is labelled with the pair it
+// belongs to. 'ref_idx' must lie in [0, TOTAL_UNIDIR_COMP_REFS).
+static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
+  static const MV_REFERENCE_FRAME lut[] = {
+    LAST2_FRAME,    // LAST_LAST2_FRAMES,
+    LAST3_FRAME,    // LAST_LAST3_FRAMES,
+    GOLDEN_FRAME,   // LAST_GOLDEN_FRAMES,
+    ALTREF_FRAME,   // BWDREF_ALTREF_FRAMES,
+    LAST3_FRAME,    // LAST2_LAST3_FRAMES
+    GOLDEN_FRAME,   // LAST2_GOLDEN_FRAMES,
+    GOLDEN_FRAME,   // LAST3_GOLDEN_FRAMES,
+    ALTREF2_FRAME,  // BWDREF_ALTREF2_FRAMES,
+    ALTREF_FRAME,   // ALTREF2_ALTREF_FRAMES,
+  };
+  assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+  // 'ref_idx' is a plain signed int: catch out-of-range lookups in debug
+  // builds instead of silently reading outside the table.
+  assert(ref_idx >= 0 && ref_idx < TOTAL_UNIDIR_COMP_REFS);
+  return lut[ref_idx];
+}
+
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi);
+
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi);
+
+// Returns 1 when the block's mode is GLOBALMV or GLOBAL_GLOBALMV, 'type' is
+// beyond TRANSLATION, and both block dimensions are at least 8 pixels;
+// otherwise 0.
+static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
+                                     TransformationType type) {
+  const PREDICTION_MODE mode = mbmi->mode;
+  if (mode != GLOBALMV && mode != GLOBAL_GLOBALMV) return 0;
+  if (type <= TRANSLATION) return 0;
+
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int wide_enough = block_size_wide[bsize] >= 8;
+  const int high_enough = block_size_high[bsize] >= 8;
+  return wide_enough && high_enough;
+}
+
+#if CONFIG_MISMATCH_DEBUG
+// Computes the pixel column/row of a transform block inside a plane: the mi
+// position is shifted down by the plane subsampling, then both the mi
+// position and the transform-block offset are scaled by MI_SIZE_LOG2 and
+// added. Results are written to *pixel_c and *pixel_r.
+static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
+                                   int mi_row, int tx_blk_col, int tx_blk_row,
+                                   int subsampling_x, int subsampling_y) {
+  const int plane_mi_col = mi_col >> subsampling_x;
+  const int plane_mi_row = mi_row >> subsampling_y;
+  *pixel_c = (plane_mi_col << MI_SIZE_LOG2) + (tx_blk_col << MI_SIZE_LOG2);
+  *pixel_r = (plane_mi_row << MI_SIZE_LOG2) + (tx_blk_row << MI_SIZE_LOG2);
+}
+#endif
+
+enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); // presumably Q3 = 3 fractional bits, Q4 = 4 fractional bits - TODO confirm
+
+struct buf_2d { // A 2-D byte buffer: pointer(s) plus dimensions and stride.
+ uint8_t *buf;
+ uint8_t *buf0; // NOTE(review): relation to 'buf' is not visible here; presumably the un-offset base pointer - confirm.
+ int width;
+ int height;
+ int stride; // presumably the distance between consecutive rows - TODO confirm units
+};
+
+typedef struct eob_info { // Per-transform-block record; CB_BUFFER keeps an array of these per plane.
+ uint16_t eob; // presumably the end-of-block position of the coefficients - TODO confirm
+ uint16_t max_scan_line;
+} eob_info;
+
+typedef struct { // Coefficient, end-of-block and color-index storage sized for one superblock.
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); // 32-byte aligned; MAX_SB_SQUARE entries per plane.
+ eob_info eob_data[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; // One entry per minimum-size transform unit, per plane.
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); // 16-byte aligned; two maps, presumably Y and UV - TODO confirm.
+} CB_BUFFER;
+
+typedef struct macroblockd_plane { // Per-plane state; MACROBLOCKD holds MAX_MB_PLANE of these.
+ PLANE_TYPE plane_type;
+ int subsampling_x; // Used as a right-shift amount on mi columns (see mi_to_pixel_loc).
+ int subsampling_y; // Used as a right-shift amount on mi rows (see mi_to_pixel_loc).
+ struct buf_2d dst;
+ struct buf_2d pre[2]; // presumably one buffer per reference of the block - TODO confirm
+ ENTROPY_CONTEXT *above_entropy_context;
+ ENTROPY_CONTEXT *left_entropy_context;
+
+ // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; // Two values per segment; presumably DC and AC - TODO confirm.
+ // Pointer to color index map of:
+ // - Current coding block, on encoder side.
+ // - Current superblock, on decoder side.
+ uint8_t *color_index_map;
+
+ // block size in pixels
+ uint8_t width, height;
+
+ qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; // One pointer per (segment, transform size).
+ qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; // One pointer per (segment, transform size).
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(i) ((i) << 4)
+
+/*!\endcond */
+
+/*!\brief Parameters related to Wiener Filter */
+typedef struct { // Both kernels are InterpKernel values declared with 16-byte alignment.
+ /*!
+ * Vertical filter kernel.
+ */
+ DECLARE_ALIGNED(16, InterpKernel, vfilter);
+
+ /*!
+ * Horizontal filter kernel.
+ */
+ DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+/*!\brief Parameters related to Sgrproj Filter */
+typedef struct { // One parameter index plus a pair of weights.
+ /*!
+ * Parameter index.
+ */
+ int ep;
+
+ /*!
+ * Weights for linear combination of filtered versions
+ */
+ int xqd[2]; // Two weights; presumably one per filtered version - TODO confirm.
+} SgrprojInfo;
+
+/*!\cond */
+
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
+typedef struct cfl_ctx { // Chroma-from-luma (CfL) working state; MACROBLOCKD embeds one as 'cfl'.
+ // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid
+ // shifts)
+ uint16_t recon_buf_q3[CFL_BUF_SQUARE]; // CFL_BUF_LINE x CFL_BUF_LINE entries.
+ // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+ int16_t ac_buf_q3[CFL_BUF_SQUARE]; // CFL_BUF_LINE x CFL_BUF_LINE entries.
+
+ // Cache the DC_PRED when performing RDO, so it does not have to be recomputed
+ // for every scaling parameter
+ bool dc_pred_is_cached[CFL_PRED_PLANES];
+ // Whether the DC_PRED cache is enabled. The DC_PRED cache is disabled when
+ // decoding.
+ bool use_dc_pred_cache;
+ // Only cache the first row of the DC_PRED
+ int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
+
+ // Height and width currently used in the CfL prediction buffer.
+ int buf_height, buf_width;
+
+ int are_parameters_computed; // NOTE(review): which parameters this flag covers is not visible here - confirm.
+
+ // Chroma subsampling
+ int subsampling_x, subsampling_y;
+
+ // Whether the reconstructed luma pixels need to be stored
+ int store_y;
+} CFL_CTX;
+
+typedef struct dist_wtd_comp_params { // Parameters for distance-weighted compound averaging (per the struct name).
+ int use_dist_wtd_comp_avg;
+ int fwd_offset; // presumably the weight for the forward reference - TODO confirm
+ int bck_offset; // presumably the weight for the backward reference - TODO confirm
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
+
+/*!\endcond */
+
+/*! \brief Variables related to current coding block.
+ *
+ * This is a common set of variables used by both encoder and decoder.
+ * Most/all of the pointers are mere pointers to actual arrays that are
+ * allocated elsewhere. This is mostly for coding convenience.
+ */
+typedef struct macroblockd {
+ /**
+ * \name Position of current macroblock in mi units
+ */
+ /**@{*/
+ int mi_row; /*!< Row position in mi units. */
+ int mi_col; /*!< Column position in mi units. */
+ /**@}*/
+
+ /*!
+ * Same as cm->mi_params.mi_stride, copied here for convenience.
+ */
+ int mi_stride;
+
+ /*!
+ * True if current block transmits chroma information.
+ * More detail:
+ * Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+ * in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+ * blocks smaller than 8x8 may be combined into one chroma block.
+ * For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+ * luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+ * these four luma blocks. This is implemented in bitstream as follows:
+ * - There are four MB_MODE_INFO structs for the four luma blocks.
+ * - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit
+ * any information for chroma planes.
+ * - Last block will have is_chroma_ref = true and transmits chroma
+ * information for the 4x4 chroma block that covers whole 8x8 area covered by
+ * four luma blocks.
+ * Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+ */
+ bool is_chroma_ref;
+
+ /*!
+ * Info specific to each plane.
+ */
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+
+ /*!
+ * Tile related info.
+ */
+ TileInfo tile;
+
+ /*!
+ * Appropriate offset inside cm->mi_params.mi_grid_base based on current
+ * mi_row and mi_col.
+ */
+ MB_MODE_INFO **mi;
+
+ /*!
+ * True if 4x4 block above the current block is available.
+ */
+ bool up_available;
+ /*!
+ * True if 4x4 block to the left of the current block is available.
+ */
+ bool left_available;
+ /*!
+ * True if the above chroma reference block is available.
+ */
+ bool chroma_up_available;
+ /*!
+ * True if the left chroma reference block is available.
+ */
+ bool chroma_left_available;
+
+ /*!
+ * MB_MODE_INFO for 4x4 block to the left of the current block, if
+ * left_available == true; otherwise NULL.
+ */
+ MB_MODE_INFO *left_mbmi;
+ /*!
+ * MB_MODE_INFO for 4x4 block above the current block, if
+ * up_available == true; otherwise NULL.
+ */
+ MB_MODE_INFO *above_mbmi;
+ /*!
+ * Left chroma reference block if is_chroma_ref == true for the current block
+ * and chroma_left_available == true; otherwise NULL.
+ * See also: the special case logic when current chroma block covers more than
+ * one luma blocks in set_mi_row_col().
+ */
+ MB_MODE_INFO *chroma_left_mbmi;
+ /*!
+ * Above chroma reference block if is_chroma_ref == true for the current block
+ * and chroma_up_available == true; otherwise NULL.
+ * See also: the special case logic when current chroma block covers more than
+ * one luma blocks in set_mi_row_col().
+ */
+ MB_MODE_INFO *chroma_above_mbmi;
+
+ /*!
+ * Appropriate offset based on current 'mi_row' and 'mi_col', inside
+ * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+ * 'MACROBLOCK' structs.
+ */
+ uint8_t *tx_type_map;
+ /*!
+ * Stride for 'tx_type_map'. Note that this may / may not be same as
+ * 'mi_stride', depending on which actual array 'tx_type_map' points to.
+ */
+ int tx_type_map_stride;
+
+ /**
+ * \name Distance of this macroblock from frame edges in 1/8th pixel units.
+ */
+ /**@{*/
+ int mb_to_left_edge; /*!< Distance from left edge */
+ int mb_to_right_edge; /*!< Distance from right edge */
+ int mb_to_top_edge; /*!< Distance from top edge */
+ int mb_to_bottom_edge; /*!< Distance from bottom edge */
+ /**@}*/
+
+ /*!
+ * Scale factors for reference frames of the current block.
+ * These are pointers into 'cm->ref_scale_factors'.
+ */
+ const struct scale_factors *block_ref_scale_factors[2];
+
+ /*!
+ * - On encoder side: points to cpi->source, which is the buffer containing
+ * the current *source* frame (maybe filtered).
+ * - On decoder side: points to cm->cur_frame->buf, which is the buffer into
+ * which current frame is being *decoded*.
+ */
+ const YV12_BUFFER_CONFIG *cur_buf;
+
+ /*!
+ * Entropy contexts for the above blocks.
+ * above_entropy_context[i][j] corresponds to above entropy context for ith
+ * plane and jth mi column of this *frame*, wrt current 'mi_row'.
+ * These are pointers into 'cm->above_contexts.entropy'.
+ */
+ ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
+ /*!
+ * Entropy contexts for the left blocks.
+ * left_entropy_context[i][j] corresponds to left entropy context for ith
+ * plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+ * Note: These contain actual data, NOT pointers.
+ */
+ ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+ /*!
+ * Partition contexts for the above blocks.
+ * above_partition_context[i] corresponds to above partition context for ith
+ * mi column of this *frame*, wrt current 'mi_row'.
+ * This is a pointer into 'cm->above_contexts.partition'.
+ */
+ PARTITION_CONTEXT *above_partition_context;
+ /*!
+ * Partition contexts for the left blocks.
+ * left_partition_context[i] corresponds to left partition context for ith
+ * mi row of this *superblock*, wrt current 'mi_col'.
+ * Note: These contain actual data, NOT pointers.
+ */
+ PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
+
+ /*!
+ * Transform contexts for the above blocks.
+ * above_txfm_context[i] corresponds to above transform context for ith mi col
+ * from the current position (mi row and mi column) for this *frame*.
+ * This is a pointer into 'cm->above_contexts.txfm'.
+ */
+ TXFM_CONTEXT *above_txfm_context;
+ /*!
+ * Transform contexts for the left blocks.
+ * left_txfm_context[i] corresponds to left transform context for ith mi row
+ * from the current position (mi_row and mi_col) for this *superblock*.
+ * This is a pointer into 'left_txfm_context_buffer'.
+ */
+ TXFM_CONTEXT *left_txfm_context;
+ /*!
+ * left_txfm_context_buffer[i] is the left transform context for ith mi_row
+ * in this *superblock*.
+ * Behaves like an internal actual buffer which 'left_txfm_context' points to,
+ * and is never accessed directly except to fill in initial default values.
+ */
+ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+ /**
+ * \name Default values for the two restoration filters for each plane.
+ * Default values for the two restoration filters for each plane.
+ * These values are used as reference values when writing the bitstream. That
+ * is, we transmit the delta between the actual values in
+ * cm->rst_info[plane].unit_info[unit_idx] and these reference values.
+ */
+ /**@{*/
+ WienerInfo wiener_info[MAX_MB_PLANE]; /*!< Defaults for Wiener filter*/
+ SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */
+ /**@}*/
+
+ /**
+ * \name Block dimensions in MB_MODE_INFO units.
+ */
+ /**@{*/
+ uint8_t width; /*!< Block width in MB_MODE_INFO units */
+ uint8_t height; /*!< Block height in MB_MODE_INFO units */
+ /**@}*/
+
+ /*!
+ * Contains the motion vector candidates found during motion vector prediction
+ * process. ref_mv_stack[i] contains the candidates for ith type of
+ * reference frame (single/compound). The actual number of candidates found in
+ * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side)
+ * or mbmi_ext->ref_mv_count[i] (encoder side).
+ */
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ /*!
+ * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the
+ * DRL (dynamic reference list) mode contexts.
+ */
+ uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+
+ /*!
+ * True if this is the last vertical rectangular block in a VERTICAL or
+ * VERTICAL_4 partition.
+ */
+ bool is_last_vertical_rect;
+ /*!
+ * True if this is the 1st horizontal rectangular block in a HORIZONTAL or
+ * HORIZONTAL_4 partition.
+ */
+ bool is_first_horizontal_rect;
+
+ /*!
+ * Counts of each reference frame in the above and left neighboring blocks.
+ * NOTE: Take into account both single and comp references.
+ */
+ uint8_t neighbors_ref_counts[REF_FRAMES];
+
+ /*!
+ * Current CDFs of all the symbols for the current tile.
+ */
+ FRAME_CONTEXT *tile_ctx;
+
+ /*!
+ * Bit depth: copied from cm->seq_params->bit_depth for convenience.
+ */
+ int bd;
+
+ /*!
+ * Quantizer index for each segment (base qindex + delta for each segment).
+ */
+ int qindex[MAX_SEGMENTS];
+ /*!
+ * lossless[s] is true if segment 's' is coded losslessly.
+ */
+ int lossless[MAX_SEGMENTS];
+ /*!
+ * Q index for the coding blocks in this superblock will be stored in
+ * mbmi->current_qindex. Now, when cm->delta_q_info.delta_q_present_flag is
+ * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as
+ * the base, and adding any transmitted delta qindex on top of it.
+ * Precisely, this is the latest qindex used by the first coding block of a
+ * non-skip superblock in the current tile; OR
+ * same as cm->quant_params.base_qindex (if not explicitly set yet).
+ * Note: This is 'CurrentQIndex' in the AV1 spec.
+ */
+ int current_base_qindex;
+
+ /*!
+ * Same as cm->features.cur_frame_force_integer_mv.
+ */
+ int cur_frame_force_integer_mv;
+
+ /*!
+ * Pointer to cm->error.
+ */
+ struct aom_internal_error_info *error_info;
+
+ /*!
+ * Same as cm->global_motion.
+ */
+ const WarpedMotionParams *global_motion;
+
+ /*!
+ * Since actual frame level loop filtering level value is not available
+ * at the beginning of the tile (only available during actual filtering)
+ * at encoder side, we record the delta_lf (against the frame level loop
+ * filtering level) and code the delta between previous superblock's delta
+ * lf and current delta lf. It is equivalent to the delta between previous
+ * superblock's actual lf and current lf.
+ */
+ int8_t delta_lf_from_base;
+ /*!
+ * We have four frame filter levels for different plane and direction. So, to
+ * support the per superblock update, we need to add a few more params:
+ * 0. delta loop filter level for y plane vertical
+ * 1. delta loop filter level for y plane horizontal
+ * 2. delta loop filter level for u plane
+ * 3. delta loop filter level for v plane
+ * To make it consistent with the reference to each filter level in segment,
+ * we need to -1, since
+ * - SEG_LVL_ALT_LF_Y_V = 1;
+ * - SEG_LVL_ALT_LF_Y_H = 2;
+ * - SEG_LVL_ALT_LF_U = 3;
+ * - SEG_LVL_ALT_LF_V = 4;
+ */
+ int8_t delta_lf[FRAME_LF_COUNT];
+ /*!
+ * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+ * current superblock has already been read from (decoder) / written to
+ * (encoder) the bitstream; and false otherwise.
+ * More detail:
+ * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st
+ * non-skip coding block. So, we need this array to keep track of whether CDEF
+ * strengths for the given CDEF units have been transmitted yet or not.
+ * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+ * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+ * superblock size is 128x128). Hence the array size is 4.
+ * 3. In the current implementation, CDEF strength for this CDEF unit is
+ * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+ * cm->mi_params.mi_grid_base).
+ */
+ bool cdef_transmitted[4];
+
+ /*!
+ * Mask for this block used for compound prediction.
+ */
+ uint8_t *seg_mask;
+
+ /*!
+ * CFL (chroma from luma) related parameters.
+ */
+ CFL_CTX cfl;
+
+ /*!
+ * Offset to plane[p].color_index_map.
+ * Currently:
+ * - On encoder side, this is always 0 as 'color_index_map' is allocated per
+ * *coding block* there.
+ * - On decoder side, this may be non-zero, as 'color_index_map' is a (static)
+ * memory pointing to the base of a *superblock* there, and we need an offset
+ * to it to get the color index map for current coding block.
+ */
+ uint16_t color_index_map_offset[2];
+
+ /*!
+ * Temporary buffer used for convolution in case of compound reference only
+ * for (weighted or uniform) averaging operation.
+ * These are pointers to actual buffers allocated elsewhere: e.g.
+ * - In decoder, 'pbi->td.tmp_conv_dst' or
+ * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+ * - In encoder, 'x->tmp_conv_dst' or
+ * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'.
+ */
+ CONV_BUF_TYPE *tmp_conv_dst;
+ /*!
+ * Temporary buffers used to build OBMC prediction by above (index 0) and left
+ * (index 1) predictors respectively.
+ * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
+ * These are pointers to actual buffers allocated elsewhere: e.g.
+ * - In decoder, 'pbi->td.tmp_obmc_bufs' or
+ * 'pbi->thread_data[t].td->xd.tmp_obmc_bufs' and
+ * - In encoder, 'x->tmp_pred_bufs' or
+ * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
+ */
+ uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+/*!\cond */
+
+// Returns 1 when the current frame buffer carries YV12_FLAG_HIGHBITDEPTH,
+// else 0. Always 0 when CONFIG_AV1_HIGHBITDEPTH is disabled.
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#else
+  (void)xd;  // Unused in low-bit-depth-only builds.
+  return 0;
+#endif
+}
+
+// Returns 'buf16' passed through CONVERT_TO_BYTEPTR() when the current frame
+// buffer carries YV12_FLAG_HIGHBITDEPTH; otherwise returns 'buf16' unchanged.
+// Always returns 'buf16' unchanged when CONFIG_AV1_HIGHBITDEPTH is disabled.
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    return CONVERT_TO_BYTEPTR(buf16);
+  }
+  return buf16;
+#else
+  (void)xd;  // Unused in low-bit-depth-only builds.
+  return buf16;
+#endif
+}
+
+typedef struct BitDepthInfo { // Bit depth plus the buffer format flag; filled in by get_bit_depth_info().
+ int bit_depth; // get_bit_depth_info() copies this from xd->bd.
+ /*! Is the image buffer high bit depth?
+ * Low bit depth buffer uses uint8_t.
+ * High bit depth buffer uses uint16_t.
+ * Equivalent to cm->seq_params->use_highbitdepth
+ */
+ int use_highbitdepth_buf;
+} BitDepthInfo;
+
+// Builds a BitDepthInfo from 'xd': the bit depth comes from xd->bd and the
+// buffer flag from is_cur_buf_hbd(). A low-bit-depth buffer must have a bit
+// depth of exactly 8 (asserted).
+static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) {
+  BitDepthInfo info;
+  info.bit_depth = xd->bd;
+  info.use_highbitdepth_buf = is_cur_buf_hbd(xd);
+  assert(IMPLIES(!info.use_highbitdepth_buf, info.bit_depth == 8));
+  return info;
+}
+
+// Returns the index (0..5) of a square block size from BLOCK_4X4 up to
+// BLOCK_128X128, or SQR_BLOCK_SIZES for any other block size.
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+  int sqr_idx = SQR_BLOCK_SIZES;  // Value reported for non-listed sizes.
+  switch (bsize) {
+    case BLOCK_4X4: sqr_idx = 0; break;
+    case BLOCK_8X8: sqr_idx = 1; break;
+    case BLOCK_16X16: sqr_idx = 2; break;
+    case BLOCK_32X32: sqr_idx = 3; break;
+    case BLOCK_64X64: sqr_idx = 4; break;
+    case BLOCK_128X128: sqr_idx = 5; break;
+    default: break;
+  }
+  return sqr_idx;
+}
+
+// Returns the sub-block size that partition type 'partition' produces from
+// the square block size 'bsize'. When a partition yields sub-blocks of
+// different sizes, the largest one is returned.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Returns BLOCK_INVALID when 'partition' is PARTITION_INVALID or when 'bsize'
+// is not one of the square sizes known to get_sqr_bsize_idx().
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+                                               PARTITION_TYPE partition) {
+  if (partition == PARTITION_INVALID) return BLOCK_INVALID;
+
+  const int sqr_idx = get_sqr_bsize_idx(bsize);
+  if (sqr_idx >= SQR_BLOCK_SIZES) return BLOCK_INVALID;
+
+  return subsize_lookup[partition][sqr_idx];
+}
+
+// Maps the intra prediction mode of a block to its mode-dependent transform
+// type. For PLANE_TYPE_Y the luma mode (mbmi->mode) is used; for any other
+// plane type the chroma mode (get_uv_mode(mbmi->uv_mode)) is used. The
+// selected mode must be an intra mode (asserted).
+// Declared INLINE like the other helpers in this header, so that translation
+// units including the header without calling this function do not get an
+// unused-function warning.
+static INLINE TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+                                            PLANE_TYPE plane_type) {
+  static const TX_TYPE mode_tx_type[INTRA_MODES] = {
+    DCT_DCT,    // DC_PRED
+    ADST_DCT,   // V_PRED
+    DCT_ADST,   // H_PRED
+    DCT_DCT,    // D45_PRED
+    ADST_ADST,  // D135_PRED
+    ADST_DCT,   // D113_PRED
+    DCT_ADST,   // D157_PRED
+    DCT_ADST,   // D203_PRED
+    ADST_DCT,   // D67_PRED
+    ADST_ADST,  // SMOOTH_PRED
+    ADST_DCT,   // SMOOTH_V_PRED
+    DCT_ADST,   // SMOOTH_H_PRED
+    ADST_ADST,  // PAETH_PRED
+  };
+  const PREDICTION_MODE mode =
+      (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+  assert(mode < INTRA_MODES);
+  return mode_tx_type[mode];
+}
+
+// Returns nonzero when 'tx_size' is at or beyond TX_SIZES in the TX_SIZE
+// enumeration.
+static INLINE int is_rect_tx(TX_SIZE tx_size) {
+  return !(tx_size < TX_SIZES);
+}
+
+// Returns nonzero for every block size that orders after BLOCK_4X4.
+static INLINE int block_signals_txsize(BLOCK_SIZE bsize) {
+  return !(bsize <= BLOCK_4X4);
+}
+
+// Number of transform types in each set type.
+// Each entry equals the count of nonzero flags in the matching row of
+// av1_ext_tx_used.
+static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = {
+ 1, 2, 5, 7, 12, 16,
+};
+
+// av1_ext_tx_used[set_type][tx_type] is nonzero when transform type 'tx_type'
+// belongs to transform set 'set_type'. av1_get_tx_type() falls back to DCT_DCT
+// when the looked-up flag is zero.
+static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
+
+// The bitmask corresponds to the transform types as defined in
+// enums.h TX_TYPE enumeration type. Setting a bit to 0 disables
+// the use of the corresponding transform type in that table.
+// The av1_derived_intra_tx_used_flag table is used when
+// use_reduced_intra_txset is set to 2, where one only searches
+// the transform types derived from residual statistics.
+static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = {
+ 0x0209, // DC_PRED: 0000 0010 0000 1001
+ 0x0403, // V_PRED: 0000 0100 0000 0011
+ 0x0805, // H_PRED: 0000 1000 0000 0101
+ 0x020F, // D45_PRED: 0000 0010 0000 1111
+ 0x0009, // D135_PRED: 0000 0000 0000 1001
+ 0x0009, // D113_PRED: 0000 0000 0000 1001
+ 0x0009, // D157_PRED: 0000 0000 0000 1001
+ 0x0805, // D203_PRED: 0000 1000 0000 0101
+ 0x0403, // D67_PRED: 0000 0100 0000 0011
+ // NOTE(review): 0x0205 expands to 0000 0010 0000 0101, but this entry's
+ // comment used to read "0000 0010 0000 1001" (0x0209, the value used for
+ // DC_PRED and PAETH_PRED). Confirm whether the value or the old comment
+ // reflects the intended mask.
+ 0x0205, // SMOOTH_PRED: 0000 0010 0000 0101
+ 0x0403, // SMOOTH_V_PRED: 0000 0100 0000 0011
+ 0x0805, // SMOOTH_H_PRED: 0000 1000 0000 0101
+ 0x0209, // PAETH_PRED: 0000 0010 0000 1001
+};
+
+// Per-intra-mode transform type bitmask; each entry's comment spells out the
+// same value in binary. Presumably uses the same bit-per-TX_TYPE layout as
+// av1_derived_intra_tx_used_flag -- its users are not visible here, confirm
+// against the caller.
+static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
+ 0x080F, // DC_PRED: 0000 1000 0000 1111
+ 0x040F, // V_PRED: 0000 0100 0000 1111
+ 0x080F, // H_PRED: 0000 1000 0000 1111
+ 0x020F, // D45_PRED: 0000 0010 0000 1111
+ 0x080F, // D135_PRED: 0000 1000 0000 1111
+ 0x040F, // D113_PRED: 0000 0100 0000 1111
+ 0x080F, // D157_PRED: 0000 1000 0000 1111
+ 0x080F, // D203_PRED: 0000 1000 0000 1111
+ 0x040F, // D67_PRED: 0000 0100 0000 1111
+ 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111
+ 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111
+ 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111
+ 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110
+};
+
+// Bitmask form of av1_ext_tx_used: for each set type, bit i is set exactly
+// when av1_ext_tx_used[set_type][i] is 1.
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+ 0x0001, // 0000 0000 0000 0001
+ 0x0201, // 0000 0010 0000 0001
+ 0x020F, // 0000 0010 0000 1111
+ 0x0E0F, // 0000 1110 0000 1111
+ 0x0FFF, // 0000 1111 1111 1111
+ 0xFFFF, // 1111 1111 1111 1111
+};
+
+// Transform set type used by av1_get_ext_tx_set_type() once its early-return
+// cases are excluded. Indexed as [is_inter][txsize_sqr_map[tx_size] ==
+// TX_16X16].
+static const TxSetType av1_ext_tx_set_lookup[2][2] = {
+ { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX },
+ { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT },
+};
+
+// Selects the transform set type for a transform block.
+//   - txsize_sqr_up_map[tx_size] above TX_32X32: DCT only.
+//   - exactly TX_32X32: DCT+IDTX for inter blocks, DCT only for intra blocks.
+//   - reduced set requested: DCT+IDTX for inter, DTT4+IDTX for intra.
+//   - otherwise: av1_ext_tx_set_lookup, keyed on is_inter and on whether
+//     txsize_sqr_map[tx_size] is TX_16X16.
+// 'is_inter' is used directly as an array index and must be 0 or 1.
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+                                                int use_reduced_set) {
+  const TX_SIZE sqr_up = txsize_sqr_up_map[tx_size];
+  if (sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
+
+  if (sqr_up == TX_32X32) {
+    if (is_inter) return EXT_TX_SET_DCT_IDTX;
+    return EXT_TX_SET_DCTONLY;
+  }
+
+  if (use_reduced_set) {
+    if (is_inter) return EXT_TX_SET_DCT_IDTX;
+    return EXT_TX_SET_DTT4_IDTX;
+  }
+
+  const int is_16x16 = (txsize_sqr_map[tx_size] == TX_16X16);
+  return av1_ext_tx_set_lookup[is_inter][is_16x16];
+}
+
+// Maps tx set types to the indices.
+// Indexed as [is_inter][set_type] (see get_ext_tx_set()). Entries of -1
+// presumably mark set types that are never selected for that block kind --
+// confirm against av1_get_ext_tx_set_type().
+static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
+ { // Intra
+ 0, -1, 2, 1, -1, -1 },
+ { // Inter
+ 0, 3, -1, -1, 2, 1 },
+};
+
+// Returns the ext_tx_set_index entry for the transform set type chosen by
+// av1_get_ext_tx_set_type() for the same arguments.
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter,
+                                 int use_reduced_set) {
+  const TxSetType chosen_set =
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+  const int *const index_row = ext_tx_set_index[is_inter];
+  return index_row[chosen_set];
+}
+
+// Returns how many transform types belong to the set type chosen by
+// av1_get_ext_tx_set_type() for the same arguments.
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter,
+                                   int use_reduced_set) {
+  const int chosen_set =
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+  const int num_types = av1_num_ext_tx_set[chosen_set];
+  return num_types;
+}
+
+// Pick the transform size with the larger / smaller tx_size_2d[] entry. On a
+// tie the first argument is returned. Each argument is expanded more than
+// once, so do not pass expressions with side effects.
+#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2))
+#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2))
+
+// Returns the transform size implied by 'tx_mode' for a block of size 'bsize'.
+// BLOCK_4X4 takes the smaller of max_txsize_lookup[bsize] and the largest size
+// the mode permits. Other blocks take their maximum rectangular transform size
+// when its txsize_sqr_map entry does not exceed the mode's largest size, and
+// the mode's largest size otherwise.
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
+  const TX_SIZE mode_cap = tx_mode_to_biggest_tx_size[tx_mode];
+  if (bsize == BLOCK_4X4) return AOMMIN(max_txsize_lookup[bsize], mode_cap);
+
+  const TX_SIZE rect_max = max_txsize_rect_lookup[bsize];
+  return (txsize_sqr_map[rect_max] <= mode_cap) ? rect_max : mode_cap;
+}
+
+// One entry per intra mode. NOTE(review): the values look like prediction
+// angles in degrees for the directional modes (in the same mode order as the
+// table in intra_mode_to_tx_type()) and 0 for the remaining modes -- confirm
+// against the users of this table.
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
+};
+
+// Converts a block index for the given transform size into the index of the
+// block in raster order.
+static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size,
+                                                  int block_idx) {
+  // With a 4x8 transform the block index advances in steps of
+  // 'tx_width_unit x tx_height_unit', so only 0 and 2 occur. Inside an 8x8 MI
+  // block, index 2 is block number 1 in raster order.
+  if (tx_size == TX_4X8 && block_idx == 2) return 1;
+
+  // For every other case the two numberings coincide.
+  return block_idx;
+}
+
+// Inverse of av1_block_index_to_raster_order().
+// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now.
+static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
+                                                  int raster_order) {
+  assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
+  if (tx_size == TX_4X4) return raster_order;
+
+  // For 4x8 and 8x4 the only block indices are 0 and 2.
+  return (raster_order > 0) ? 2 : 0;
+}
+
+// Returns the default transform type for a transform block. The intra-mode
+// dependent type is used only for luma blocks that are intra coded, not
+// lossless, smaller than TX_32X32 and coded without screen content tools;
+// everything else gets DEFAULT_INTER_TX_TYPE.
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+                                          const MACROBLOCKD *xd,
+                                          TX_SIZE tx_size,
+                                          int use_screen_content_tools) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int mode_dependent =
+      !is_inter_block(mbmi) && plane_type == PLANE_TYPE_Y &&
+      !xd->lossless[mbmi->segment_id] && tx_size < TX_32X32 &&
+      !use_screen_content_tools;
+
+  if (mode_dependent) return intra_mode_to_tx_type(mbmi, plane_type);
+  return DEFAULT_INTER_TX_TYPE;
+}
+
+// Returns the block size of a plane with the given subsampling, for a block
+// whose luma size is 'bsize'.
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+                                              int subsampling_x,
+                                              int subsampling_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  assert(subsampling_x == 0 || subsampling_x == 1);
+  assert(subsampling_y == 0 || subsampling_y == 1);
+  const BLOCK_SIZE plane_bsize =
+      av1_ss_size_lookup[bsize][subsampling_x][subsampling_y];
+  return plane_bsize;
+}
+
+/*
+ * Returns the index of the transform block at (blk_row, blk_col) inside a
+ * block of size 'bsize'. The result is asserted to be below
+ * INTER_TX_SIZE_BUF_LEN.
+ *
+ * The three lookup tables were generated with:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+ *   txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
+static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
+                                         int blk_col) {
+  static const uint8_t tx_w_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3,
+  };
+  static const uint8_t tx_h_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2,
+  };
+  static const uint8_t stride_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1,
+  };
+  const int unit_row = blk_row >> tx_h_log2[bsize];
+  const int unit_col = blk_col >> tx_w_log2[bsize];
+  const int index = (unit_row << stride_log2[bsize]) + unit_col;
+  assert(index < INTER_TX_SIZE_BUF_LEN);
+  return index;
+}
+
+#if CONFIG_INSPECTION
+/*
+ * Returns the index of the transform-type entry for position
+ * (blk_row, blk_col) inside a block of size 'bsize'. The result is asserted to
+ * be below TXK_TYPE_BUF_LEN.
+ *
+ * The three lookup tables were generated with:
+ *
+ * TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ * for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+ *   txs = sub_tx_size_map[txs];
+ * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ * const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ * const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ */
+static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
+                                         int blk_col) {
+  static const uint8_t tx_w_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+  };
+  static const uint8_t tx_h_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2,
+  };
+  static const uint8_t stride_log2[BLOCK_SIZES_ALL] = {
+    0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2,
+  };
+  const int unit_row = blk_row >> tx_h_log2[bsize];
+  const int unit_col = blk_col >> tx_w_log2[bsize];
+  const int index = (unit_row << stride_log2[bsize]) + unit_col;
+  assert(index < TXK_TYPE_BUF_LEN);
+  return index;
+}
+#endif  // CONFIG_INSPECTION
+
+// Records 'tx_type' in xd->tx_type_map for the transform block whose top-left
+// corner is (blk_row, blk_col). When the transform is as wide or as tall as
+// TX_64X64, the type is additionally written at every 16x16-unit step inside
+// the transform.
+static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row,
+                                    int blk_col, TX_SIZE tx_size,
+                                    TX_TYPE tx_type) {
+  const int map_stride = xd->tx_type_map_stride;
+  const int w_units = tx_size_wide_unit[tx_size];
+  const int h_units = tx_size_high_unit[tx_size];
+
+  xd->tx_type_map[blk_row * map_stride + blk_col] = tx_type;
+
+  // The 16x16 unit is due to the constraint from tx_64x64 which sets the
+  // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
+  // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
+  // the intricacy, cover all the 16x16 units inside a 64 level transform.
+  const int is_64_level = (w_units == tx_size_wide_unit[TX_64X64]) ||
+                          (h_units == tx_size_high_unit[TX_64X64]);
+  if (!is_64_level) return;
+
+  const int step = tx_size_wide_unit[TX_16X16];
+  for (int dy = 0; dy < h_units; dy += step) {
+    const int row_base = (blk_row + dy) * map_stride + blk_col;
+    for (int dx = 0; dx < w_units; dx += step) {
+      xd->tx_type_map[row_base + dx] = tx_type;
+    }
+  }
+}
+
+// Returns the transform type of the transform block at (blk_row, blk_col).
+//   - Lossless segments and transforms whose txsize_sqr_up_map entry exceeds
+//     TX_32X32 always use DCT_DCT.
+//   - Luma reads the type stored in xd->tx_type_map.
+//   - Chroma of inter blocks reads the map at the co-located luma position;
+//     chroma of intra blocks derives the type from the chroma prediction mode.
+//     Either way the type is replaced by DCT_DCT when the applicable transform
+//     set does not contain it.
+static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd,
+                                      PLANE_TYPE plane_type, int blk_row,
+                                      int blk_col, TX_SIZE tx_size,
+                                      int reduced_tx_set) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  if (xd->lossless[mbmi->segment_id]) return DCT_DCT;
+  if (txsize_sqr_up_map[tx_size] > TX_32X32) return DCT_DCT;
+
+  const int map_stride = xd->tx_type_map_stride;
+  TX_TYPE tx_type;
+  if (plane_type == PLANE_TYPE_Y) {
+    tx_type = xd->tx_type_map[blk_row * map_stride + blk_col];
+  } else {
+    const int is_inter = is_inter_block(mbmi);
+    if (!is_inter) {
+      // In intra mode, uv planes don't share the same prediction mode as y
+      // plane, so the tx_type should not be shared
+      tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
+    } else {
+      // scale back to y plane's coordinate
+      const struct macroblockd_plane *const pd = &xd->plane[plane_type];
+      const int luma_row = blk_row << pd->subsampling_y;
+      const int luma_col = blk_col << pd->subsampling_x;
+      tx_type = xd->tx_type_map[luma_row * map_stride + luma_col];
+    }
+    const TxSetType tx_set_type =
+        av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set);
+    if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT;
+  }
+  assert(tx_type < TX_TYPES);
+  assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi),
+                                                 reduced_tx_set)][tx_type]);
+  return tx_type;
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+ const int num_planes);
+
+/*
+ * Returns the maximum transform split depth for a block of size 'bsize'.
+ * 'bsize' must be a valid block size (asserted, as in bsize_to_tx_size_cat()),
+ * because it indexes a table of BLOCK_SIZES_ALL entries.
+ *
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * int depth = 0;
+ * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+ *   depth++;
+ *   tx_size = sub_tx_size_map[tx_size];
+ * }
+ */
+static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
+  static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = {
+    0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  };
+  // Catch out-of-range block sizes in debug builds before they index the
+  // table.
+  assert(bsize < BLOCK_SIZES_ALL);
+  return bsize_to_max_depth_table[bsize];
+}
+
+/*
+ * Returns the table depth for 'bsize' minus one. The depth is asserted not to
+ * exceed MAX_TX_CATS. Note that the table entry for the first block size is 0,
+ * for which this function returns -1.
+ *
+ * Logic to generate the lookup table:
+ *
+ * TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ * assert(tx_size != TX_4X4);
+ * int depth = 0;
+ * while (tx_size != TX_4X4) {
+ *   depth++;
+ *   tx_size = sub_tx_size_map[tx_size];
+ * }
+ * assert(depth < 10);
+ */
+static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  static const uint8_t depth_by_bsize[BLOCK_SIZES_ALL] = {
+    0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4,
+  };
+  const int tx_depth = depth_by_bsize[bsize];
+  assert(tx_depth <= MAX_TX_CATS);
+  const int category = tx_depth - 1;
+  return category;
+}
+
+// Starts from the maximum rectangular transform size of 'bsize' and applies
+// sub_tx_size_map 'depth' times. A non-positive depth returns the maximum
+// size unchanged.
+static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
+  TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  int splits_left = depth;
+  while (splits_left > 0) {
+    tx_size = sub_tx_size_map[tx_size];
+    --splits_left;
+  }
+  return tx_size;
+}
+
+// Replaces every transform size that has a 64-sample dimension with the size
+// obtained by capping that dimension at 32; all other sizes are returned
+// unchanged.
+static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
+  if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
+    return TX_32X32;
+  }
+  if (tx_size == TX_64X16) return TX_32X16;
+  if (tx_size == TX_16X64) return TX_16X32;
+  return tx_size;
+}
+
+// Returns the largest transform size for a subsampled plane of a block with
+// luma size 'bsize': the maximum rectangular transform size of the plane
+// block, passed through av1_get_adjusted_tx_size().
+static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x,
+                                            int subsampling_y) {
+  const BLOCK_SIZE uv_bsize =
+      get_plane_block_size(bsize, subsampling_x, subsampling_y);
+  assert(uv_bsize < BLOCK_SIZES_ALL);
+  return av1_get_adjusted_tx_size(max_txsize_rect_lookup[uv_bsize]);
+}
+
+// Returns the transform size used for 'plane' of the current block: TX_4X4 in
+// lossless segments, mbmi->tx_size for plane 0, and the maximum uv transform
+// size derived from the plane's subsampling for the other planes.
+static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+
+  if (plane != 0) {
+    const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+    return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+                                 pd->subsampling_y);
+  }
+  return mbmi->tx_size;
+}
+
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+ struct macroblockd_plane *pd, int plane,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff);
+
+// Number of samples in a 32x32 block. The expansion is parenthesized so the
+// macro behaves as a single value inside larger expressions (for example
+// 'x / MAX_INTERINTRA_SB_SQUARE' or 'x % MAX_INTERINTRA_SB_SQUARE').
+#define MAX_INTERINTRA_SB_SQUARE (32 * 32)
+// Returns 1 when the first reference is an inter reference and the second
+// reference is INTRA_FRAME.
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+  const int first_is_inter = mbmi->ref_frame[0] > INTRA_FRAME;
+  return first_is_inter && mbmi->ref_frame[1] == INTRA_FRAME;
+}
+
+// Returns 1 when 'bsize' lies in the range [BLOCK_8X8, BLOCK_32X32] of the
+// BLOCK_SIZE enumeration.
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+  return !(bsize < BLOCK_8X8 || bsize > BLOCK_32X32);
+}
+
+// Returns 1 when 'mode' lies in [SINGLE_INTER_MODE_START,
+// SINGLE_INTER_MODE_END).
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+  return !(mode < SINGLE_INTER_MODE_START || mode >= SINGLE_INTER_MODE_END);
+}
+
+// Returns 1 when the first reference is above INTRA_FRAME and the second one
+// is not.
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+  return !(rf[0] <= INTRA_FRAME || rf[1] > INTRA_FRAME);
+}
+
+// Returns 1 when the block size, prediction mode and reference frames of
+// 'mbmi' all permit inter-intra prediction.
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+  if (!is_interintra_allowed_bsize(mbmi->bsize)) return 0;
+  if (!is_interintra_allowed_mode(mbmi->mode)) return 0;
+  return is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+// Returns 1 when at least one block size mapped to 'group' by
+// size_group_lookup passes is_interintra_allowed_bsize().
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+  for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
+    if (size_group_lookup[bs] != group) continue;
+    if (is_interintra_allowed_bsize((BLOCK_SIZE)bs)) return 1;
+  }
+  return 0;
+}
+
+// Returns nonzero when 'mbmi' has an inter first reference, INTRA_FRAME as
+// second reference, and also passes is_interintra_allowed().
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+  if (mbmi->ref_frame[0] <= INTRA_FRAME) return 0;
+  if (mbmi->ref_frame[1] != INTRA_FRAME) return 0;
+  return is_interintra_allowed(mbmi);
+}
+
+// Returns the largest transform size for 'plane' of a block of size 'bsize':
+// TX_4X4 in lossless segments, the maximum rectangular size for luma, and that
+// size passed through av1_get_adjusted_tx_size() for chroma.
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                       int plane) {
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+
+  const TX_SIZE rect_max = max_txsize_rect_lookup[bsize];
+  const int is_luma = (plane == 0);
+  return is_luma ? rect_max : av1_get_adjusted_tx_size(rect_max);
+}
+
+// Returns 1 when both dimensions of 'bsize' are at least 8 samples.
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  return block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8;
+}
+
+// Returns 1 when 'mbmi' has no second reference.
+static INLINE int is_motion_variation_allowed_compound(
+    const MB_MODE_INFO *mbmi) {
+  return has_second_ref(mbmi) ? 0 : 1;
+}
+
+// input: log2 of length, 0(4), 1(8), ...
+// i.e. index 0 stands for a length of 4, index 1 for 8, and so on.
+// NOTE(review): presumably the entry is the maximum number of neighbours
+// considered for OBMC along an edge of that length -- confirm with the users.
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
+
+// Returns 1 when 'mbmi' records a nonzero overlappable_neighbors value.
+static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
+  return mbmi->overlappable_neighbors ? 1 : 0;
+}
+
+// Returns the most general motion mode the block may use.
+// SIMPLE_TRANSLATION is returned when the block has no overlappable
+// neighbours, when it is a global-motion block (only checked if integer MVs
+// are not forced), or when the block size, mode or references rule out motion
+// variation. Otherwise WARPED_CAUSAL is returned if warped motion is enabled,
+// at least one projection sample exists, integer MVs are not forced and the
+// first reference is unscaled; OBMC_CAUSAL is returned in the remaining cases.
+static INLINE MOTION_MODE
+motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
+                    const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+  if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
+
+  if (xd->cur_frame_force_integer_mv == 0) {
+    const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
+    if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
+  }
+
+  const int variation_ok =
+      is_motion_variation_allowed_bsize(mbmi->bsize) &&
+      is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
+      is_motion_variation_allowed_compound(mbmi);
+  if (!variation_ok) return SIMPLE_TRANSLATION;
+
+  assert(!has_second_ref(mbmi));
+  const int warp_ok = mbmi->num_proj_ref >= 1 && allow_warped_motion &&
+                      !xd->cur_frame_force_integer_mv &&
+                      !av1_is_scaled(xd->block_ref_scale_factors[0]);
+  return warp_ok ? WARPED_CAUSAL : OBMC_CAUSAL;
+}
+
+// A neighbour is overlappable exactly when is_inter_block() reports it as an
+// inter block.
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+  const int neighbor_is_inter = is_inter_block(mbmi);
+  return neighbor_is_inter;
+}
+
+// Returns 1 when palette mode may be used: screen content tools are enabled,
+// the block is no larger than MAX_PALETTE_BLOCK_WIDTH x
+// MAX_PALETTE_BLOCK_HEIGHT, and the block size is at least BLOCK_8X8.
+static INLINE int av1_allow_palette(int allow_screen_content_tools,
+                                    BLOCK_SIZE sb_type) {
+  assert(sb_type < BLOCK_SIZES_ALL);
+  if (!allow_screen_content_tools) return 0;
+  if (block_size_wide[sb_type] > MAX_PALETTE_BLOCK_WIDTH) return 0;
+  if (block_size_high[sb_type] > MAX_PALETTE_BLOCK_HEIGHT) return 0;
+  return sb_type >= BLOCK_8X8;
+}
+
+// Returns sub-sampled dimensions of the given block.
+// The output values for 'rows_within_bounds' and 'cols_within_bounds' will
+// differ from 'height' and 'width' when part of the block is outside the right
+// and/or bottom image boundary.
+// Every output pointer is optional; NULL outputs are skipped.
+static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
+                                            const MACROBLOCKD *xd, int *width,
+                                            int *height,
+                                            int *rows_within_bounds,
+                                            int *cols_within_bounds) {
+  const int full_height = block_size_high[bsize];
+  const int full_width = block_size_wide[bsize];
+
+  // A negative edge distance means the block sticks out of the frame; the
+  // distance is converted with '>> 3' before it is added to the block
+  // dimension.
+  int visible_rows = full_height;
+  if (xd->mb_to_bottom_edge < 0) visible_rows += xd->mb_to_bottom_edge >> 3;
+  int visible_cols = full_width;
+  if (xd->mb_to_right_edge < 0) visible_cols += xd->mb_to_right_edge >> 3;
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
+  assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
+  assert(full_width >= visible_cols);
+  assert(full_height >= visible_rows);
+
+  const int plane_width = full_width >> ss_x;
+  const int plane_height = full_height >> ss_y;
+  // Special handling for chroma sub8x8: dimensions below 4 get 2 added.
+  const int extra_x = (plane > 0 && plane_width < 4) ? 2 : 0;
+  const int extra_y = (plane > 0 && plane_height < 4) ? 2 : 0;
+
+  if (width) {
+    *width = plane_width + extra_x;
+    assert(*width >= 0);
+  }
+  if (height) {
+    *height = plane_height + extra_y;
+    assert(*height >= 0);
+  }
+  if (rows_within_bounds) {
+    *rows_within_bounds = (visible_rows >> ss_y) + extra_y;
+    assert(*rows_within_bounds >= 0);
+  }
+  if (cols_within_bounds) {
+    *cols_within_bounds = (visible_cols >> ss_x) + extra_x;
+    assert(*cols_within_bounds >= 0);
+  }
+}
+
+/* clang-format off */
+// Pointer to a three-dimensional array whose first dimension is PALETTE_SIZES.
+// The remaining dimensions are PALETTE_COLOR_INDEX_CONTEXTS and
+// CDF_SIZE(PALETTE_COLORS); elements are aom_cdf_prob.
+typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+// Pointer to a const three-dimensional array whose first dimension is
+// PALETTE_SIZES. The remaining dimensions are PALETTE_COLOR_INDEX_CONTEXTS and
+// PALETTE_COLORS; elements are const int.
+typedef const int (*ColorCost)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS];
+/* clang-format on */
+
+// Bundle of parameters describing a color map.
+// NOTE(review): the field meanings below are inferred from the names only;
+// confirm them against the code that fills this struct.
+typedef struct {
+ int rows; // presumably the number of rows of the map
+ int cols; // presumably the number of columns of the map
+ int n_colors; // presumably the number of colors in the palette
+ int plane_width; // presumably the width of the plane block
+ int plane_height; // presumably the height of the plane block
+ uint8_t *color_map; // color index buffer
+ MapCdf map_cdf; // CDF tables, see the MapCdf typedef
+ ColorCost color_cost; // cost tables, see the ColorCost typedef
+} Av1ColorMapParam;
+
+// Returns 1 when the block uses a global-motion mode (GLOBALMV or
+// GLOBAL_GLOBALMV), spans at least 2 mode-info units in both directions, and
+// none of its references has a global motion model of type TRANSLATION.
+static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd,
+                                            const MB_MODE_INFO *mbmi) {
+  // First check if all modes are GLOBALMV
+  const int is_global_mode =
+      (mbmi->mode == GLOBALMV) || (mbmi->mode == GLOBAL_GLOBALMV);
+  if (!is_global_mode) return 0;
+
+  if (mi_size_wide[mbmi->bsize] < 2 || mi_size_high[mbmi->bsize] < 2) return 0;
+
+  // Now check if all global motion is non translational
+  const int num_refs = 1 + has_second_ref(mbmi);
+  for (int ref = 0; ref < num_refs; ++ref) {
+    const int wmtype = xd->global_motion[mbmi->ref_frame[ref]].wmtype;
+    if (wmtype == TRANSLATION) return 0;
+  }
+  return 1;
+}
+
+// Plane 0 maps to PLANE_TYPE_Y; every other plane maps to PLANE_TYPE_UV.
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+  if (plane == 0) return PLANE_TYPE_Y;
+  return PLANE_TYPE_UV;
+}
+
+// Returns the maximum end-of-block position for 'tx_size': 1024 for 64x64,
+// 64x32 and 32x64, 512 for 16x64 and 64x16, and tx_size_2d[tx_size] for all
+// other sizes.
+static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_64X64:
+    case TX_64X32:
+    case TX_32X64: return 1024;
+    case TX_16X64:
+    case TX_64X16: return 512;
+    default: return tx_size_2d[tx_size];
+  }
+}
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
new file mode 100644
index 0000000000..12e9545441
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/thread_common.h"
+
+// Returns 1 when every mode-info entry covered by the 8x8 block whose top-left
+// corner is (mi_row, mi_col) has skip_txfm set, and 0 as soon as one entry
+// does not.
+static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
+                             int mi_stride) {
+  const int num_rows = mi_size_high[BLOCK_8X8];
+  const int num_cols = mi_size_wide[BLOCK_8X8];
+  for (int r = 0; r < num_rows; ++r) {
+    const int row_base = (mi_row + r) * mi_stride + mi_col;
+    for (int c = 0; c < num_cols; ++c) {
+      if (!grid[row_base + c]->skip_txfm) return 0;
+    }
+  }
+  return 1;
+}
+
+// Fills 'dlist' with the 8x8-block coordinates (by, bx), relative to
+// (mi_row, mi_col), of every 8x8 block in the area that is not entirely
+// skipped. The scanned area is limited by the frame size and by 128 or 64
+// pixels per direction depending on 'bs'. Returns the number of entries
+// written.
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, cdef_list *dlist,
+                             BLOCK_SIZE bs) {
+  const int is_wide_128 = (bs == BLOCK_128X128) || (bs == BLOCK_128X64);
+  const int is_tall_128 = (bs == BLOCK_128X128) || (bs == BLOCK_64X128);
+  const int col_cap = is_wide_128 ? MI_SIZE_128X128 : MI_SIZE_64X64;
+  const int row_cap = is_tall_128 ? MI_SIZE_128X128 : MI_SIZE_64X64;
+  const int col_end = AOMMIN(mi_params->mi_cols - mi_col, col_cap);
+  const int row_end = AOMMIN(mi_params->mi_rows - mi_row, row_cap);
+
+  // An 8x8 block covers mi_size_high[BLOCK_8X8] == mi_size_wide[BLOCK_8X8] == 2
+  // mode-info units, so positions advance by 2 units and shift by 1 to become
+  // 8x8-block coordinates.
+  const int mi_step = 2;
+  const int mi_to_8x8_shift = 1;
+
+  MB_MODE_INFO **const grid = mi_params->mi_grid_base;
+  int num_listed = 0;
+  for (int r = 0; r < row_end; r += mi_step) {
+    for (int c = 0; c < col_end; c += mi_step) {
+      if (is_8x8_block_skip(grid, mi_row + r, mi_col + c,
+                            mi_params->mi_stride)) {
+        continue;
+      }
+      dlist[num_listed].by = r >> mi_to_8x8_shift;
+      dlist[num_listed].bx = c >> mi_to_8x8_shift;
+      ++num_listed;
+    }
+  }
+  return num_listed;
+}
+
+// Copies a 'width' x 'height' rectangle of 8-bit samples from 'src' into the
+// 16-bit buffer 'dst', widening each sample. Strides are in elements.
+void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride,
+                                     const uint8_t *src, int sstride, int width,
+                                     int height) {
+  for (int row = 0; row < height; ++row) {
+    const int dst_base = row * dstride;
+    const int src_base = row * sstride;
+    for (int col = 0; col < width; ++col) {
+      dst[dst_base + col] = src[src_base + col];
+    }
+  }
+}
+
+// Copies a 'width' x 'height' rectangle of 16-bit samples from 'src' to
+// 'dst'. Strides are in elements.
+void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+                                      const uint16_t *src, int sstride,
+                                      int width, int height) {
+  for (int row = 0; row < height; ++row) {
+    const int dst_base = row * dstride;
+    const int src_base = row * sstride;
+    for (int col = 0; col < width; ++col) {
+      dst[dst_base + col] = src[src_base + col];
+    }
+  }
+}
+
+// Copies an 'hsize' x 'vsize' region of the 8-bit frame buffer 'src', starting
+// at row 'src_voffset' and column 'src_hoffset', into the 16-bit buffer 'dst'.
+void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride,
+                                const uint8_t *src, int src_voffset,
+                                int src_hoffset, int sstride, int vsize,
+                                int hsize) {
+  const int first_sample = src_voffset * sstride + src_hoffset;
+  const uint8_t *region = &src[first_sample];
+  cdef_copy_rect8_8bit_to_16bit(dst, dstride, region, sstride, hsize, vsize);
+}
+
+// High bit depth counterpart of av1_cdef_copy_sb8_16_lowbd(): 'src' is first
+// converted with CONVERT_TO_SHORTPTR() and the region is then copied as 16-bit
+// samples.
+void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride,
+                                 const uint8_t *src, int src_voffset,
+                                 int src_hoffset, int sstride, int vsize,
+                                 int hsize) {
+  const int first_sample = src_voffset * sstride + src_hoffset;
+  const uint16_t *region = &CONVERT_TO_SHORTPTR(src)[first_sample];
+  cdef_copy_rect8_16bit_to_16bit(dst, dstride, region, sstride, hsize, vsize);
+}
+
+// Copies a region of the frame buffer into the 16-bit CDEF buffer, picking the
+// high or low bit depth copy routine from cm->seq_params->use_highbitdepth.
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize) {
+  if (!cm->seq_params->use_highbitdepth) {
+    av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset,
+                               sstride, vsize, hsize);
+    return;
+  }
+  av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset,
+                              sstride, vsize, hsize);
+}
+
+// Copies 'v' rows of 'h' 16-bit samples from 'src' to 'dst'. Strides are in
+// elements.
+static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
+                             int sstride, int v, int h) {
+  for (int row = 0; row < v; ++row) {
+    const int dst_base = row * dstride;
+    const int src_base = row * sstride;
+    for (int col = 0; col < h; ++col) {
+      dst[dst_base + col] = src[src_base + col];
+    }
+  }
+}
+
+// Prepares intermediate input buffer for CDEF.
+// The buffer (fb_info->src, row stride CDEF_BSTRIDE) holds the pixels of the
+// current 64x64 filter block surrounded by a border of CDEF_VBORDER rows and
+// CDEF_HBORDER columns. Border areas that have no source are filled with
+// CDEF_VERY_LARGE.
+// Inputs:
+// cm: Pointer to common structure.
+// fb_info: Pointer to the CDEF block-level parameter structure.
+// colbuf: Left column buffer for CDEF.
+// cdef_left: Left block is filtered or not.
+// fbc, fbr: col and row index of a block.
+// plane: plane index Y/CB/CR.
+// Returns:
+// Nothing will be returned.
+static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info,
+ uint16_t **const colbuf, const int cdef_left,
+ int fbc, int fbr, int plane) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ uint16_t *src = fb_info->src;
+ // Frame width in luma samples, rounded up by ALIGN_POWER_OF_TWO(..., 4).
+ const int luma_stride =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4);
+ // Number of 64x64 filter-block rows / columns in the frame (rounded up).
+ const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ // When the left block was not filtered, the left border columns are taken
+ // from the frame together with the block itself, so the copy starts
+ // CDEF_HBORDER columns earlier.
+ int cstart = 0;
+ if (!cdef_left) cstart = -CDEF_HBORDER;
+ int rend, cend;
+ // Size of this filter block in mode-info units, clipped at the frame edge.
+ const int nhb =
+ AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ const int nvb =
+ AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ // The same size in samples of the current plane.
+ const int hsize = nhb << fb_info->mi_wide_l2;
+ const int vsize = nvb << fb_info->mi_high_l2;
+ const uint16_t *top_linebuf = fb_info->top_linebuf[plane];
+ const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane];
+ // Offset in 'src' of the first row below the block.
+ const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
+ // Row stride used when reading the top/bottom line buffers.
+ const int stride =
+ luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x);
+
+ // The last filter-block column / row has no right / bottom border to copy.
+ if (fbc == nhfb - 1)
+ cend = hsize;
+ else
+ cend = hsize + CDEF_HBORDER;
+
+ if (fbr == nvfb - 1)
+ rend = vsize;
+ else
+ rend = vsize + CDEF_VBORDER;
+
+ /* Copy in the pixels we need from the current superblock for
+ deringing.*/
+ av1_cdef_copy_sb8_16(
+ cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
+ CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart,
+ fb_info->dst_stride, vsize, cend - cstart);
+
+ /* Copy in the pixels we need for the current superblock from bottom buffer.*/
+ /* Bottom border, columns directly below the block. */
+ if (fbr < nvfb - 1) {
+ copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
+ &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize);
+ } else {
+ fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+ hsize, CDEF_VERY_LARGE);
+ }
+ /* Bottom-left corner. */
+ if (fbr < nvfb - 1 && fbc > 0) {
+ copy_rect(&src[bot_offset], CDEF_BSTRIDE,
+ &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride,
+ CDEF_VBORDER, CDEF_HBORDER);
+ } else {
+ fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ /* Bottom-right corner. */
+ if (fbr < nvfb - 1 && fbc < nhfb - 1) {
+ copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
+ CDEF_HBORDER);
+ } else {
+ fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+
+ /* Copy in the pixels we need for the current superblock from top buffer.*/
+ /* Top border, columns directly above the block. */
+ if (fbr > 0) {
+ copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
+ stride, CDEF_VBORDER, hsize);
+ } else {
+ fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
+ CDEF_VERY_LARGE);
+ }
+ /* Top-left corner. */
+ if (fbr > 0 && fbc > 0) {
+ copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
+ stride, CDEF_VBORDER, CDEF_HBORDER);
+ } else {
+ fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ /* Top-right corner. */
+ if (fbr > 0 && fbc < nhfb - 1) {
+ copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
+ CDEF_HBORDER);
+ } else {
+ fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+ CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (cdef_left) {
+ /* If we deringed the superblock on the left then we need to copy in
+ saved pixels. */
+ copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
+ rend + CDEF_VBORDER, CDEF_HBORDER);
+ }
+ /* Saving pixels in case we need to dering the superblock on the
+ right. */
+ copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
+ rend + CDEF_VBORDER, CDEF_HBORDER);
+
+ /* At the left / right frame boundary the whole border column strip is
+ overwritten with CDEF_VERY_LARGE. */
+ if (fb_info->frame_boundary[LEFT]) {
+ fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (fb_info->frame_boundary[RIGHT]) {
+ fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+}
+
+// Runs av1_cdef_filter_fb() on the filter block described by 'fb_info'. The
+// destination is passed as the 16-bit pointer argument for high bit depth and
+// as the 8-bit pointer argument otherwise; the unused one is NULL.
+static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane,
+                                  uint8_t use_highbitdepth) {
+  const int dst_offset =
+      fb_info->dst_stride * fb_info->roffset + fb_info->coffset;
+  // First sample of the block inside the bordered input buffer.
+  uint16_t *const in =
+      &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER];
+
+  if (!use_highbitdepth) {
+    av1_cdef_filter_fb(fb_info->dst + dst_offset, NULL, fb_info->dst_stride,
+                       in, fb_info->xdec, fb_info->ydec, fb_info->dir, NULL,
+                       fb_info->var, plane, fb_info->dlist,
+                       fb_info->cdef_count, fb_info->level,
+                       fb_info->sec_strength, fb_info->damping,
+                       fb_info->coeff_shift);
+    return;
+  }
+
+  av1_cdef_filter_fb(NULL, CONVERT_TO_SHORTPTR(fb_info->dst + dst_offset),
+                     fb_info->dst_stride, in, fb_info->xdec, fb_info->ydec,
+                     fb_info->dir, NULL, fb_info->var, plane, fb_info->dlist,
+                     fb_info->cdef_count, fb_info->level,
+                     fb_info->sec_strength, fb_info->damping,
+                     fb_info->coeff_shift);
+}
+
+// Initializes the block-level CDEF parameters in 'fb_info' for the filter
+// block at column 'fbc', row 'fbr' of 'plane': strengths for the plane type,
+// destination buffer and stride, subsampling, and the sample offsets of the
+// block inside the plane.
+static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd,
+                                    CdefBlockInfo *const fb_info, int *level,
+                                    int *sec_strength, int fbc, int fbr,
+                                    int plane) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+
+  fb_info->level = level[plane_type];
+  fb_info->sec_strength = sec_strength[plane_type];
+
+  fb_info->dst = pd->dst.buf;
+  fb_info->dst_stride = pd->dst.stride;
+
+  fb_info->xdec = pd->subsampling_x;
+  fb_info->ydec = pd->subsampling_y;
+  // log2 of the size of one mode-info unit in samples of this plane.
+  fb_info->mi_wide_l2 = MI_SIZE_LOG2 - pd->subsampling_x;
+  fb_info->mi_high_l2 = MI_SIZE_LOG2 - pd->subsampling_y;
+
+  fb_info->roffset = (MI_SIZE_64X64 * fbr) << fb_info->mi_high_l2;
+  fb_info->coffset = (MI_SIZE_64X64 * fbc) << fb_info->mi_wide_l2;
+}
+
+// Applies CDEF to the filter block at column fbc of filter-block row fbr, for
+// every plane of the frame.
+//
+// cdef_left[plane] is both an input and an output: on entry it tells
+// cdef_prepare_fb() whether the block to the left was filtered for that
+// plane, and on exit it records whether this block was. All entries are
+// cleared when the whole block is skipped, i.e. when there is no mode info
+// for it, its cdef_strength is -1, both plane types have zero strengths, or
+// av1_cdef_compute_sb_list() reports nothing to filter.
+static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                        CdefBlockInfo *const fb_info, uint16_t **const colbuf,
+                        int *cdef_left, int fbc, int fbr) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  int is_zero_level[PLANE_TYPES] = { 1, 1 };
+  int level[PLANE_TYPES] = { 0 };
+  int sec_strength[PLANE_TYPES] = { 0 };
+  const CdefInfo *const cdef_info = &cm->cdef_info;
+  // Index of the top-left mode info of this filter block.
+  const int mi_idx =
+      MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
+
+  // The pointer must be tested before it is dereferenced: reading
+  // ->cdef_strength first would make the NULL test meaningless.
+  if (mi_params->mi_grid_base[mi_idx] == NULL) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  const int mbmi_cdef_strength = mi_params->mi_grid_base[mi_idx]->cdef_strength;
+  if (mbmi_cdef_strength == -1) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  // Compute level and secondary strength for planes
+  level[PLANE_TYPE_Y] =
+      cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+  sec_strength[PLANE_TYPE_Y] =
+      cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+  // A coded secondary strength of 3 stands for 4.
+  sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3;
+  is_zero_level[PLANE_TYPE_Y] =
+      (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 0);
+
+  if (num_planes > 1) {
+    level[PLANE_TYPE_UV] =
+        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+    sec_strength[PLANE_TYPE_UV] =
+        cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+    sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3;
+    is_zero_level[PLANE_TYPE_UV] =
+        (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0);
+  }
+
+  if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64,
+                                                 fbc * MI_SIZE_64X64,
+                                                 fb_info->dlist, BLOCK_64X64);
+  if (!fb_info->cdef_count) {
+    av1_zero_array(cdef_left, num_planes);
+    return;
+  }
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    // Do not skip cdef filtering for luma plane as filter direction is
+    // computed based on luma.
+    if (plane && is_zero_level[get_plane_type(plane)]) {
+      cdef_left[plane] = 0;
+      continue;
+    }
+    cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane);
+    cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane);
+    cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth);
+    cdef_left[plane] = 1;
+  }
+}
+
+// Initializes the row-level parameters of fb_info for filter-block row fbr:
+// top/bottom frame-boundary flags, the intermediate buffer pointer, damping,
+// coefficient shift, and the per-plane top and bottom line buffer pointers.
+// cdef_sync is accepted only to match cdef_init_fb_row_t and is not used here.
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ CdefBlockInfo *const fb_info,
+ uint16_t **const linebuf, uint16_t *const src,
+ struct AV1CdefSyncData *const cdef_sync, int fbr) {
+ (void)cdef_sync;
+ const int num_planes = av1_num_planes(cm);
+ const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int luma_stride =
+ ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+ const bool ping_pong = fbr & 1;
+ // For the current filter block, its top-left corner mi structure (mi_tl)
+ // is first accessed to check whether the top and left boundaries are
+ // frame boundaries. Then bottom-left and top-right mi structures are
+ // accessed to check whether the bottom and right boundaries
+ // (respectively) are frame boundaries.
+ //
+ // Note that we can't just check the bottom-right mi structure - eg. if
+ // we're at the right-hand edge of the frame but not the bottom, then
+ // the bottom-right mi is NULL but the bottom-left is not.
+ fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
+ if (fbr != nvfb - 1)
+ fb_info->frame_boundary[BOTTOM] =
+ (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
+ else
+ fb_info->frame_boundary[BOTTOM] = 1;
+
+ fb_info->src = src;
+ fb_info->damping = cm->cdef_info.cdef_damping;
+ fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+ av1_zero(fb_info->dir);
+ av1_zero(fb_info->var);
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+ // First plane row of the next filter-block row.
+ const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+ const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+ // Here ping-pong buffers are maintained for the top linebuf
+ // to avoid the linebuf being overwritten by the consecutive row:
+ // this call writes the half selected by the parity of fbr, while
+ // fb_info->top_linebuf below is pointed at the other half.
+ uint16_t *const top_linebuf =
+ &linebuf[plane][ping_pong * CDEF_VBORDER * stride];
+ fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride];
+
+ // NOTE(review): av1_cdef_copy_sb8_16() is defined elsewhere; from the
+ // arguments it presumably copies CDEF_VBORDER rows starting at row
+ // (offset - CDEF_VBORDER) of the plane - confirm against its definition.
+ if (fbr != nvfb - 1) // top line buffer copy
+ av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf,
+ offset - CDEF_VBORDER, 0,
+ xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+ fb_info->top_linebuf[plane] =
+ &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride];
+
+ if (fbr != nvfb - 1) // bottom line buffer copy
+ av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride,
+ xd->plane[plane].dst.buf, offset, 0,
+ xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+ }
+}
+
+// Applies CDEF to every filter block of filter-block row fbr.
+// The row is first initialized through cdef_init_fb_row_fn; when built with
+// CONFIG_MULTITHREAD and more than one worker is allocated, the row is then
+// abandoned if cdef_sync reports that a worker has hit an error.
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                     uint16_t **const linebuf, uint16_t **const colbuf,
+                     uint16_t *const src, int fbr,
+                     cdef_init_fb_row_t cdef_init_fb_row_fn,
+                     struct AV1CdefSyncData *const cdef_sync,
+                     struct aom_internal_error_info *error_info) {
+  // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+  // in future to handle error propagation.
+  (void)error_info;
+  const int mi_cols = cm->mi_params.mi_cols;
+  // Number of filter blocks across the frame, rounding up for a partial one.
+  const int nhfb = (mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  // The first block of a row always starts with cdef_left set.
+  int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 };
+  CdefBlockInfo fb_info;
+
+  cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
+#if CONFIG_MULTITHREAD
+  if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) {
+    bool cdef_mt_exit;
+    pthread_mutex_lock(cdef_sync->mutex_);
+    cdef_mt_exit = cdef_sync->cdef_mt_exit;
+    pthread_mutex_unlock(cdef_sync->mutex_);
+    // Exit in case any worker has encountered an error.
+    if (cdef_mt_exit) return;
+  }
+#endif
+  for (int col = 0; col < nhfb; ++col) {
+    const int is_last_col = (col == nhfb - 1);
+    fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * col == 0);
+    fb_info.frame_boundary[RIGHT] =
+        is_last_col || (MI_SIZE_64X64 * (col + 1) == mi_cols);
+    cdef_fb_col(cm, xd, &fb_info, colbuf, cdef_left, col, fbr);
+  }
+}
+
+// Perform CDEF on input frame, one filter-block row at a time.
+// Inputs:
+//   frame: Pointer to input frame buffer.
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+//   cdef_init_fb_row_fn: Row initializer handed on to av1_cdef_fb_row().
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  const int num_planes = av1_num_planes(cm);
+  const int mi_rows = cm->mi_params.mi_rows;
+  // Number of filter-block rows, rounding up for a partial last row.
+  const int nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+
+  int fbr = 0;
+  while (fbr < nvfb) {
+    // No sync data is passed on this path.
+    av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
+                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL,
+                    xd->error_info);
+    ++fbr;
+  }
+}
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
new file mode 100644
index 0000000000..a56cd9db4a
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
+
+#define CDEF_STRENGTH_BITS 6
+
+#define CDEF_PRI_STRENGTHS 16
+#define CDEF_SEC_STRENGTHS 4
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cdef_block.h"
+
+enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY);
+
+struct AV1CdefSyncData;
+
+/*!\brief Parameters related to CDEF Block */
+typedef struct {
+ uint16_t *src; /*!< CDEF intermediate buffer */
+ uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */
+ uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */
+ uint8_t *dst; /*!< CDEF destination buffer */
+ cdef_list
+ dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */
+
+ int xdec; /*!< Sub-sampling X */
+ int ydec; /*!< Sub-sampling Y */
+ int mi_wide_l2; /*!< log2 of pixels per mi unit in width */
+ int mi_high_l2; /*!< log2 of pixels per mi unit in height */
+ int frame_boundary[BOUNDARIES]; /*!< frame boundary flags, indexed by BOUNDARY */
+
+ int damping; /*!< CDEF damping factor */
+ int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */
+ int level; /*!< CDEF filtering level */
+ int sec_strength; /*!< CDEF secondary strength */
+ int cdef_count; /*!< Number of CDEF sub-blocks in superblock */
+ int dir[CDEF_NBLOCKS]
+ [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */
+
+ int dst_stride; /*!< CDEF destination buffer stride */
+ int coffset; /*!< current superblock offset in a row */
+ int roffset; /*!< current row offset */
+} CdefBlockInfo;
+
+/* Returns -1 for a negative argument and 1 otherwise (zero yields 1). */
+static INLINE int sign(int i) {
+  if (i < 0) return -1;
+  return 1;
+}
+
+/* Limits a pixel difference before it is fed to a filter tap.
+   Returns 0 when threshold is 0. Otherwise the magnitude of diff is capped at
+   max(0, threshold - (|diff| >> shift)), where
+   shift = max(0, damping - get_msb(threshold)), and the sign of diff is
+   restored on the result. */
+static INLINE int constrain(int diff, int threshold, int damping) {
+  if (threshold == 0) return 0;
+
+  const int magnitude = abs(diff);
+  const int shift = AOMMAX(0, damping - get_msb(threshold));
+  const int limit = AOMMAX(0, threshold - (magnitude >> shift));
+  const int capped = AOMMIN(magnitude, limit);
+  return diff < 0 ? -capped : capped;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col, cdef_list *dlist,
+ BLOCK_SIZE bsize);
+
+typedef void (*cdef_init_fb_row_t)(
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src,
+ struct AV1CdefSyncData *const cdef_sync, int fbr);
+
+/*!\brief Function for applying CDEF to a frame
+ *
+ * \ingroup in_loop_cdef
+ * This function applies CDEF to a frame.
+ *
+ * \param[in, out] frame Compressed frame buffer
+ * \param[in, out] cm Pointer to top level common structure
+ * \param[in] xd Pointer to common current coding block structure
+ * \param[in] cdef_init_fb_row_fn Function Pointer
+ *
+ * \remark Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+ MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn);
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ uint16_t **const linebuf, uint16_t **const colbuf,
+ uint16_t *const src, int fbr,
+ cdef_init_fb_row_t cdef_init_fb_row_fn,
+ struct AV1CdefSyncData *const cdef_sync,
+ struct aom_internal_error_info *error_info);
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ CdefBlockInfo *const fb_info,
+ uint16_t **const linebuf, uint16_t *const src,
+ struct AV1CdefSyncData *const cdef_sync, int fbr);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c
new file mode 100644
index 0000000000..ce7039f374
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef.h"
+/*
+This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+beginning and end of the table. The cdef direction range is [0, 7] and the
+first index is offset +/-2. This removes the need to constrain the first
+index to the same range using e.g., & 7.
+*/
+DECLARE_ALIGNED(16, const int, cdef_directions_padded[12][2]) = {
+ /* Padding: cdef_directions[6] */
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+ /* Padding: cdef_directions[7] */
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 },
+
+ /* Begin cdef_directions */
+ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 },
+ /* End cdef_directions */
+
+ /* Padding: cdef_directions[0] */
+ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+ /* Padding: cdef_directions[1] */
+ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+};
+
+/* View of the padded table that starts at its third row, so that
+ cdef_directions[-2] through cdef_directions[9] are all valid rows. */
+const int (*const cdef_directions)[2] = cdef_directions_padded + 2;
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+ The search minimizes the weighted variance along all the lines in a
+ particular direction, i.e. the squared error between the input and a
+ "predicted" block where each pixel is replaced by the average along a line
+ in a particular direction. Since each direction has the same sum(x^2) term,
+ that term is never computed. See Section 2, step 2, of:
+ http://jmvalin.ca/notes/intra_paint.pdf
+
+ img points at the top-left sample of an 8x8 block whose rows are `stride`
+ samples apart; every sample is shifted right by coeff_shift before use.
+ Returns the index (0..7) of the direction with the largest cost, and stores
+ in *var the gap between that cost and the cost of the direction four steps
+ away, shifted right by 10. */
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8] = { 0 };
+ /* partial[d][k]: sum of the samples lying on line k of direction d. */
+ int partial[8][15] = { { 0 } };
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+ The output is then 840 times larger, but we don't care for finding
+ the max. */
+ static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+ for (i = 0; i < 8; i++) {
+ int j;
+ for (j = 0; j < 8; j++) {
+ int x;
+ /* We subtract 128 here to reduce the maximum range of the squared
+ partial sums. */
+ x = (img[i * stride + j] >> coeff_shift) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ }
+ /* Directions 2 and 6: eight lines of eight samples each. */
+ for (i = 0; i < 8; i++) {
+ cost[2] += partial[2][i] * partial[2][i];
+ cost[6] += partial[6][i] * partial[6][i];
+ }
+ cost[2] *= div_table[8];
+ cost[6] *= div_table[8];
+ /* Directions 0 and 4: lines i and 14 - i are weighted by div_table[i + 1];
+ the middle line (index 7) is weighted by div_table[8]. */
+ for (i = 0; i < 7; i++) {
+ cost[0] += (partial[0][i] * partial[0][i] +
+ partial[0][14 - i] * partial[0][14 - i]) *
+ div_table[i + 1];
+ cost[4] += (partial[4][i] * partial[4][i] +
+ partial[4][14 - i] * partial[4][14 - i]) *
+ div_table[i + 1];
+ }
+ cost[0] += partial[0][7] * partial[0][7] * div_table[8];
+ cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+ /* Odd directions: lines 3..7 are weighted by div_table[8]; lines j and
+ 10 - j (j = 0..2) are weighted by div_table[2 * j + 2]. */
+ for (i = 1; i < 8; i += 2) {
+ int j;
+ for (j = 0; j < 4 + 1; j++) {
+ cost[i] += partial[i][3 + j] * partial[i][3 + j];
+ }
+ cost[i] *= div_table[8];
+ for (j = 0; j < 4 - 1; j++) {
+ cost[i] += (partial[i][j] * partial[i][j] +
+ partial[i][10 - j] * partial[i][10 - j]) *
+ div_table[2 * j + 2];
+ }
+ }
+ /* Strict comparison: on ties the lowest direction index wins, and direction
+ 0 is returned when no cost exceeds 0. */
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+/* Runs cdef_find_dir_c() on two blocks that share the same stride and
+ coeff_shift: img1 first, then img2. The directions are stored in *out1 and
+ *out2 and the variance measures in *var1 and *var2. */
+void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var1, int32_t *var2,
+ int coeff_shift, int *out1, int *out2) {
+ *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift);
+ *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift);
+}
+
+const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+const int cdef_sec_taps[2] = { 2, 1 };
+
+/* Smooth in the direction detected.
+
+ Filters one block_width x block_height block read from `in` (row stride
+ CDEF_BSTRIDE) and writes the result to dst8 when it is non-NULL, otherwise
+ to dst16; either way the output row stride is dstride.
+ enable_primary / enable_secondary select which tap sets contribute. The
+ result is clamped to the [min, max] of the centre pixel and the sampled
+ neighbours only when both are enabled; neighbours equal to CDEF_VERY_LARGE
+ are never allowed to raise max. */
+static void cdef_filter_block_internal(
+ uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir, int pri_damping,
+ int sec_damping, int coeff_shift, int block_width, int block_height,
+ int enable_primary, int enable_secondary) {
+ const int clipping_required = (enable_primary && enable_secondary);
+ int i, j, k;
+ const int s = CDEF_BSTRIDE;
+ /* The primary tap pair depends on the parity of the unshifted strength. */
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
+ for (i = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++) {
+ int16_t sum = 0;
+ int16_t y;
+ int16_t x = in[i * s + j];
+ int max = x;
+ int min = x;
+ /* k selects the nearer (0) or farther (1) tap along each direction. */
+ for (k = 0; k < 2; k++) {
+ if (enable_primary) {
+ /* Neighbours on both sides of x along direction dir. */
+ int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+ int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+ sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+ sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+ if (clipping_required) {
+ if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+ if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+ min = AOMMIN(p0, min);
+ min = AOMMIN(p1, min);
+ }
+ }
+ if (enable_secondary) {
+ /* Neighbours along directions dir + 2 and dir - 2; cdef_directions
+ is padded so these indices need no wrapping. */
+ int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]];
+ int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]];
+ int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]];
+ int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]];
+ if (clipping_required) {
+ if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+ if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+ if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+ if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+ min = AOMMIN(s0, min);
+ min = AOMMIN(s1, min);
+ min = AOMMIN(s2, min);
+ min = AOMMIN(s3, min);
+ }
+ sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+ }
+ }
+ /* Scale the tap sum down by 16, rounding to nearest with ties away from
+ zero (this relies on >> of a negative value being arithmetic). */
+ y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4));
+ if (clipping_required) {
+ y = clamp(y, min, max);
+ }
+
+ if (dst8)
+ dst8[i * dstride + j] = (uint8_t)y;
+ else
+ dst16[i * dstride + j] = (uint16_t)y;
+ }
+ }
+}
+
+/* 8-bit destination; primary and secondary filters both enabled. */
+void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/1, /*enable_secondary=*/1);
+}
+
+/* 8-bit destination; primary filter only. */
+void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/1, /*enable_secondary=*/0);
+}
+
+/* 8-bit destination; secondary filter only. */
+void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/0, /*enable_secondary=*/1);
+}
+
+/* 8-bit destination; both filters disabled, so the tap sum stays 0 and each
+ input sample is written out as is (narrowed to uint8_t). */
+void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/0, /*enable_secondary=*/0);
+}
+
+/* 16-bit destination; primary and secondary filters both enabled. */
+void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/1, /*enable_secondary=*/1);
+}
+
+/* 16-bit destination; primary filter only. */
+void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/1, /*enable_secondary=*/0);
+}
+
+/* 16-bit destination; secondary filter only. */
+void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/0, /*enable_secondary=*/1);
+}
+
+/* 16-bit destination; both filters disabled, so the tap sum stays 0 and each
+ input sample is written out as is. */
+void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength, int dir,
+ int pri_damping, int sec_damping, int coeff_shift,
+ int block_width, int block_height) {
+ cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping,
+ coeff_shift, block_width, block_height,
+ /*enable_primary=*/0, /*enable_secondary=*/0);
+}
+
+/* Derive the primary filter strength of an 8x8 block from the directional
+   variance difference `var`. A large difference indicates a strongly
+   directional pattern (e.g. a high contrast edge) that tolerates more
+   deringing; a small one indicates a low contrast edge or a non-directional
+   texture, where we must be careful not to blur.
+   Returns 0 when var is 0; otherwise (strength * (4 + i) + 8) >> 4, where i
+   is min(get_msb(var >> 6), 12), or 0 if var >> 6 is 0. */
+static INLINE int adjust_strength(int strength, int32_t var) {
+  if (!var) return 0;
+
+  /* We use the variance of 8x8 blocks to adjust the strength. */
+  int i = 0;
+  if (var >> 6) i = AOMMIN(get_msb(var >> 6), 12);
+  return (strength * (4 + i) + 8) >> 4;
+}
+
+/* Runs the direction search on every 8x8 block listed in dlist[0..cdef_count)
+   and stores the results in dir[by][bx] and var[by][bx]. Blocks are handled
+   two at a time through cdef_find_dir_dual(); when cdef_count is odd the
+   leftover block goes through cdef_find_dir(). */
+static AOM_INLINE void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist,
+                                         int var[CDEF_NBLOCKS][CDEF_NBLOCKS],
+                                         int cdef_count, int coeff_shift,
+                                         int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) {
+  int idx = 0;
+
+  // Paired pass over the list.
+  while (idx < cdef_count - 1) {
+    const int row_a = dlist[idx].by;
+    const int col_a = dlist[idx].bx;
+    const int row_b = dlist[idx + 1].by;
+    const int col_b = dlist[idx + 1].bx;
+    const uint16_t *const blk_a = &in[8 * row_a * CDEF_BSTRIDE + 8 * col_a];
+    const uint16_t *const blk_b = &in[8 * row_b * CDEF_BSTRIDE + 8 * col_b];
+
+    cdef_find_dir_dual(blk_a, blk_b, CDEF_BSTRIDE, &var[row_a][col_a],
+                       &var[row_b][col_b], coeff_shift, &dir[row_a][col_a],
+                       &dir[row_b][col_b]);
+    idx += 2;
+  }
+
+  // Odd count: one block is left over at position idx.
+  if (cdef_count % 2 != 0) {
+    const int row = dlist[idx].by;
+    const int col = dlist[idx].bx;
+    const uint16_t *const blk = &in[8 * row * CDEF_BSTRIDE + 8 * col];
+
+    dir[row][col] =
+        cdef_find_dir(blk, CDEF_BSTRIDE, &var[row][col], coeff_shift);
+  }
+}
+
+/* Filters the cdef_count 8x8 (luma-sized) blocks listed in dlist for plane
+ pli, reading from `in` (row stride CDEF_BSTRIDE).
+
+ Output goes to dst8 when it is non-NULL, otherwise to dst16. With a NULL
+ dirinit each block is written at its own position using row stride
+ dstride; with a non-NULL dirinit the 16-bit output is instead packed block
+ after block. For pli == 0 the per-block directions and variances are
+ computed into dir/var unless *dirinit says they already are.
+ level and sec_strength are scaled up by coeff_shift; damping is raised by
+ coeff_shift and lowered by one for planes other than AOM_PLANE_Y. */
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int xdec, int ydec,
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int damping, int coeff_shift) {
+ int bi;
+ int bx;
+ int by;
+ const int pri_strength = level << coeff_shift;
+ sec_strength <<= coeff_shift;
+ damping += coeff_shift - (pli != AOM_PLANE_Y);
+ /* log2 of the block width/height in this plane's samples. */
+ const int bw_log2 = 3 - xdec;
+ const int bh_log2 = 3 - ydec;
+ if (dirinit && pri_strength == 0 && sec_strength == 0) {
+ // If we're here, both primary and secondary strengths are 0, and
+ // we still haven't written anything to y[] yet, so we just copy
+ // the input to y[]. This is necessary only for av1_cdef_search()
+ // and only av1_cdef_search() sets dirinit.
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ // TODO(stemidts/jmvalin): SIMD optimisations
+ for (int iy = 0; iy < 1 << bh_log2; iy++) {
+ memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)],
+ &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)],
+ ((size_t)1 << bw_log2) * sizeof(*dst16));
+ }
+ }
+ return;
+ }
+
+ if (pli == 0) {
+ if (!dirinit || !*dirinit) {
+ aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir);
+ if (dirinit) *dirinit = 1;
+ }
+ }
+ /* When the two subsampling factors differ, the stored directions are
+ remapped in place on the pli == 1 call; nothing in this function repeats
+ or undoes that for later calls. */
+ if (pli == 1 && xdec != ydec) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
+ static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
+ }
+ }
+
+ if (dst8) {
+ const int block_width = 8 >> xdec;
+ const int block_height = 8 >> ydec;
+ /*
+ * strength_index == 0 : enable_primary = 1, enable_secondary = 1
+ * strength_index == 1 : enable_primary = 1, enable_secondary = 0
+ * strength_index == 2 : enable_primary = 0, enable_secondary = 1
+ * strength_index == 3 : enable_primary = 0, enable_secondary = 0
+ */
+ const cdef_filter_block_func cdef_filter_fn[4] = {
+ cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3
+ };
+
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ /* Only the luma primary strength is adapted to the block variance. */
+ const int t =
+ (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
+ /* Bit 0: secondary strength is zero. Bit 1: primary strength is zero. */
+ const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
+
+ cdef_filter_fn[strength_index](
+ &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride,
+ &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
+ sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
+ coeff_shift, block_width, block_height);
+ }
+ } else {
+ const int block_width = 8 >> xdec;
+ const int block_height = 8 >> ydec;
+ /*
+ * strength_index == 0 : enable_primary = 1, enable_secondary = 1
+ * strength_index == 1 : enable_primary = 1, enable_secondary = 0
+ * strength_index == 2 : enable_primary = 0, enable_secondary = 1
+ * strength_index == 3 : enable_primary = 0, enable_secondary = 0
+ */
+ const cdef_filter_block_func cdef_filter_fn[4] = {
+ cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3
+ };
+
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ const int t =
+ (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx]));
+ const int strength_index = (sec_strength == 0) | ((t == 0) << 1);
+
+ /* With dirinit set, block bi is written to the bi-th packed slot of
+ dst16, using the block width as the row stride. */
+ cdef_filter_fn[strength_index](
+ &dst16[dirinit ? bi << (bw_log2 + bh_log2)
+ : (by << bh_log2) * dstride + (bx << bw_log2)],
+ dirinit ? 1 << bw_log2 : dstride,
+ &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t,
+ sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping,
+ coeff_shift, block_width, block_height);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
new file mode 100644
index 0000000000..b5e4f124ae
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
+
+#include "aom_dsp/odintrin.h"
+
+#define CDEF_BLOCKSIZE 64
+#define CDEF_BLOCKSIZE_LOG2 6
+#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
+#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
+
+/* We need to buffer two vertical lines. */
+#define CDEF_VBORDER (2)
+/* We only need to buffer three horizontal pixels too, but let's align to
+ 16 bytes (8 x 16 bits) to make vectorization easier. */
+#define CDEF_HBORDER (8)
+#define CDEF_BSTRIDE \
+ ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
+
+#define CDEF_VERY_LARGE (0x4000)
+#define CDEF_INBUF_SIZE \
+ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
+
+extern const int cdef_pri_taps[2][2];
+extern const int cdef_sec_taps[2];
+extern const int (*const cdef_directions)[2];
+
+/* Position of one 8x8 block inside a filter block, in units of 8x8 blocks. */
+typedef struct {
+ uint8_t by; /* block row */
+ uint8_t bx; /* block column */
+} cdef_list;
+
+typedef void (*cdef_filter_block_func)(void *dest, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ int coeff_shift, int block_width,
+ int block_height);
+
+void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int xdec, int ydec,
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int damping, int coeff_shift);
+
+/* Sets every sample of a rectangle of v rows by h columns to x. dst points at
+   the top-left sample and consecutive rows are dstride samples apart. */
+static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
+                             uint16_t x) {
+  for (int row = 0; row < v; ++row) {
+    const int row_start = row * dstride;
+    for (int col = 0; col < h; ++col) dst[row_start + col] = x;
+  }
+}
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
new file mode 100644
index 0000000000..5c62201f1e
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef_block.h"
+
+/* partial A is a 16-bit vector of the form:
+ [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+ [0 y1 y2 y3 y4 y5 y6 y7].
+ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+ (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+ and const2.
+ The value is returned as four 32-bit partial sums, one per lane; the caller
+ is expected to add the lanes together (compute_directions does this with
+ hsum4). */
+static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
+ v128 const2) {
+ v128 tmp;
+ /* Reverse partial B. */
+ partialb = v128_shuffle_8(
+ partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ /* Keep a copy of partial A: the next statement overwrites partiala. */
+ tmp = partiala;
+ partiala = v128_ziplo_16(partialb, partiala);
+ partialb = v128_ziphi_16(partialb, tmp);
+ /* Square and add the corresponding x and y values. */
+ partiala = v128_madd_s16(partiala, partiala);
+ partialb = v128_madd_s16(partialb, partialb);
+ /* Multiply by constant. */
+ partiala = v128_mullo_s32(partiala, const1);
+ partialb = v128_mullo_s32(partialb, const2);
+ /* Sum all results. */
+ partiala = v128_add_32(partiala, partialb);
+ return partiala;
+}
+
+/* Horizontal sum of four vectors at once: the zip steps transpose the 4x4
+ matrix of 32-bit lanes formed by x0..x3, and the final adds sum the
+ transposed rows, so each lane of the result holds the sum of the four lanes
+ of one input vector.
+ NOTE(review): which output lane belongs to which input depends on the lane
+ ordering of the v128_zip* helpers -- confirm before relying on it. */
+static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
+ v128 t0, t1, t2, t3;
+ /* First transpose stage: interleave 32-bit lanes of neighbouring inputs. */
+ t0 = v128_ziplo_32(x1, x0);
+ t1 = v128_ziplo_32(x3, x2);
+ t2 = v128_ziphi_32(x1, x0);
+ t3 = v128_ziphi_32(x3, x2);
+ /* Second transpose stage: interleave 64-bit halves. */
+ x0 = v128_ziplo_64(t1, t0);
+ x1 = v128_ziphi_64(t1, t0);
+ x2 = v128_ziplo_64(t3, t2);
+ x3 = v128_ziphi_64(t3, t2);
+ return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
+}
+
+/* Computes the cost of four directions from the eight rows in lines[],
+ stores the four 32-bit costs to tmp_cost1[0..3] and also returns them as a
+ vector. cdef_find_dir() calls this once on the rows as loaded (results
+ stored as directions 4, 5, 6 and 7) and again on the rotated rows to obtain
+ the remaining directions. */
+static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
+ v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ v128 partial6;
+ v128 tmp;
+ /* For partial4/5/7 each row is added at a row-dependent lane offset into an
+ accumulator that spans two vectors: the "a" half receives the row shifted
+ one way, the "b" half receives the lanes that did not fit (the
+ complementary shift; the two byte counts always add up to 16). partial6
+ needs no shift and is a plain lane-wise sum of all rows. */
+ /* Partial sums for lines 0 and 1. */
+ partial4a = v128_shl_n_byte(lines[0], 14);
+ partial4b = v128_shr_n_byte(lines[0], 2);
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
+ /* partial5, partial6 and partial7 treat each pair of rows identically, so
+ the pair is summed once and reused. */
+ tmp = v128_add_16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte(tmp, 10);
+ partial5b = v128_shr_n_byte(tmp, 6);
+ partial7a = v128_shl_n_byte(tmp, 4);
+ partial7b = v128_shr_n_byte(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
+ tmp = v128_add_16(lines[2], lines[3]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
+ tmp = v128_add_16(lines[4], lines[5]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
+ /* Line 7 is added unshifted, so it contributes nothing to partial4b. */
+ partial4a = v128_add_16(partial4a, lines[7]);
+ tmp = v128_add_16(lines[6], lines[7]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
+ /* A constant of 0 below removes that lane from the weighted sum. */
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
+ v128_from_32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ /* partial6 has a single accumulator: square the sums and weight every
+ lane by 105. */
+ partial6 = v128_madd_s16(partial6, partial6);
+ partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
+
+ /* Collapse each of the four cost vectors to one 32-bit lane. */
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ v128_store_unaligned(tmp_cost1, partial4a);
+ return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels.
+ in and res each hold eight vectors of eight 16-bit values. They may be the
+ same array (cdef_find_dir passes the same pointer for both): every element
+ of in is read into temporaries before the first element of res is
+ written. */
+static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
+ /* Stage 1: interleave 16-bit lanes of neighbouring rows. */
+ const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
+ const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
+ const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
+ const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
+ const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
+ const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
+ const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
+ const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
+
+ /* Stage 2: interleave 32-bit lanes. */
+ const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
+ const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
+ const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
+ const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
+ const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
+ const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
+ const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
+ const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
+
+ /* Stage 3: interleave 64-bit halves; the results are stored from res[7]
+ down to res[0], which is what reverses the line order. */
+ res[7] = v128_ziplo_64(tr1_1, tr1_0);
+ res[6] = v128_ziphi_64(tr1_1, tr1_0);
+ res[5] = v128_ziplo_64(tr1_3, tr1_2);
+ res[4] = v128_ziphi_64(tr1_3, tr1_2);
+ res[3] = v128_ziplo_64(tr1_5, tr1_4);
+ res[2] = v128_ziphi_64(tr1_5, tr1_4);
+ res[1] = v128_ziplo_64(tr1_7, tr1_6);
+ res[0] = v128_ziphi_64(tr1_7, tr1_6);
+}
+
+/* Estimates the dominant direction of an 8x8 block.
+ img - top-left sample of the block; eight 16-bit samples are read
+ from each of eight rows.
+ stride - distance, in samples, between the starts of consecutive rows.
+ var - output: (best cost - cost of direction (best_dir + 4) & 7) >> 10.
+ coeff_shift - right shift applied to every sample before 128 is subtracted.
+ Returns the index (0..7) of the direction with the largest cost. */
+int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ v128 lines[8];
+ /* Load the eight rows, scale them down by coeff_shift and subtract 128. */
+ for (i = 0; i < 8; i++) {
+ lines[i] = v128_load_unaligned(&img[i * stride]);
+ lines[i] =
+ v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
+ }
+
+ /* Compute "mostly vertical" directions. */
+ v128 dir47 = compute_directions(lines, cost + 4);
+
+ /* Rotate in place; array_reverse_transpose_8x8 reads all of its input
+ before it writes any output. */
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ v128 dir03 = compute_directions(lines, cost);
+
+ /* Reduce the eight costs to their maximum; the two align/max steps
+ presumably leave that maximum in every 32-bit lane. */
+ v128 max = v128_max_s32(dir03, dir47);
+ max = v128_max_s32(max, v128_align(max, max, 8));
+ max = v128_max_s32(max, v128_align(max, max, 4));
+ best_cost = v128_low_u32(max);
+ /* Compare every cost with the maximum and narrow the result so that the
+ byte mask below has one bit per direction. */
+ v128 t =
+ v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
+ best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
+ /* x ^ (x - 1) keeps the lowest set bit and everything below it, so its
+ most significant bit is the index of the lowest set bit of x. */
+ best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+// Work around compiler out of memory issues with Win32 builds. This issue has
+// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4).
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940
+#define CDEF_INLINE static INLINE
+#else
+#define CDEF_INLINE SIMD_INLINE
+#endif
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+// Evaluated separately for each 16-bit lane of a and b.
+CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
+ unsigned int adjdamp) {
+ v256 diff = v256_sub_16(a, b);
+ // Arithmetic shift by 15: each lane becomes 0 (diff >= 0) or -1 (diff < 0).
+ const v256 sign = v256_shr_n_s16(diff, 15);
+ diff = v256_abs_s16(diff);
+ // Presumably a saturating unsigned subtract, which supplies the max(0, ...)
+ // of the formula above -- confirm against the v256_ssub_u16 definition.
+ const v256 s =
+ v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
+ // (sign + m) ^ sign leaves m unchanged when sign is 0 and yields -m when
+ // sign is -1, i.e. it re-applies the sign of a - b to the clamped magnitude.
+ return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
+}
+
+/* Folds the four primary taps tap[0..3] into the running maximum `max` and
+   returns the updated maximum. cdef_large_value_mask is ANDed onto the tap
+   data before the signed 16-bit maximum is taken. */
+SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max,
+                                 v256 cdef_large_value_mask) {
+  enum { kNumPrimaryTaps = 4 };
+
+  if (!is_lowbd) {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    for (int t = 0; t < kNumPrimaryTaps; ++t) {
+      max = v256_max_s16(max, v256_and(tap[t], cdef_large_value_mask));
+    }
+    return max;
+  }
+
+  /* The source is 16 bits, however, we only really care about the lower
+     8 bits. The upper 8 bits contain the "large" flag. Take the unsigned
+     8-bit maximum over all primary taps first, then zero out the flag bits
+     and use the result to update the "16 bit" max. */
+  v256 tap_max_u8 = tap[0];
+  for (int t = 1; t < kNumPrimaryTaps; ++t) {
+    tap_max_u8 = v256_max_u8(tap_max_u8, tap[t]);
+  }
+  return v256_max_s16(max, v256_and(tap_max_u8, cdef_large_value_mask));
+}
+
+/* Folds the eight secondary taps tap[0..7] into the running maximum `max`
+   and returns the updated maximum. cdef_large_value_mask is ANDed onto the
+   tap data before the signed 16-bit maximum is taken. */
+SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max,
+                                   v256 cdef_large_value_mask) {
+  enum { kNumSecondaryTaps = 8 };
+
+  if (!is_lowbd) {
+    /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
+    for (int t = 0; t < kNumSecondaryTaps; ++t) {
+      max = v256_max_s16(max, v256_and(tap[t], cdef_large_value_mask));
+    }
+    return max;
+  }
+
+  /* The source is 16 bits, however, we only really care about the lower
+     8 bits. The upper 8 bits contain the "large" flag. Take the unsigned
+     8-bit maximum over all secondary taps first, then zero out the flag bits
+     and use the result to update the "16 bit" max. */
+  v256 tap_max_u8 = tap[0];
+  for (int t = 1; t < kNumSecondaryTaps; ++t) {
+    tap_max_u8 = v256_max_u8(tap_max_u8, tap[t]);
+  }
+  return v256_max_s16(max, v256_and(tap_max_u8, cdef_large_value_mask));
+}
+
+// MSVC takes far too much time optimizing these.
+// https://bugs.chromium.org/p/aomedia/issues/detail?id=3395
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma optimize("", off)
+#endif
+
+/* Filters a 4-sample-wide block, four rows per loop iteration (each row
+ occupies one 64-bit quarter of a v256).
+ is_lowbd - nonzero: dest is uint8_t and results are packed to 8 bits;
+ zero: dest is uint16_t.
+ dest, dstride - destination and its row stride in destination elements.
+ in - 16-bit source, addressed with a row stride of CDEF_BSTRIDE.
+ pri_strength, sec_strength - thresholds handed to constrain16() for the
+ primary and the secondary taps.
+ dir - selects the tap offsets: cdef_directions[dir] for the primary taps,
+ cdef_directions[dir + 2] and [dir - 2] for the secondary taps.
+ pri_damping, sec_damping - reduced by get_msb(strength), floored at 0, when
+ the matching pass is enabled and its strength is nonzero.
+ coeff_shift - only used to choose the row of cdef_pri_taps.
+ height - number of rows to filter, consumed in groups of four.
+ enable_primary, enable_secondary - select which passes run; the result is
+ clamped to the min/max of the row and its taps only when both are set. */
+CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int coeff_shift, int height,
+ int enable_primary, int enable_secondary) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+ const int clipping_required = enable_primary && enable_secondary;
+ v256 p0, p1, p2, p3;
+ v256 sum, row, res;
+ v256 max, min;
+ const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
+ // po*: primary offsets, s1o*/s2o*: secondary offsets; suffix 1 is the near
+ // tap and suffix 2 the far tap.
+ // NOTE(review): for dir < 2 the [dir - 2] lookups index before
+ // cdef_directions; presumably the pointer targets the interior of a padded
+ // table -- confirm where cdef_directions is defined.
+ const int po1 = cdef_directions[dir][0];
+ const int po2 = cdef_directions[dir][1];
+ const int s1o1 = cdef_directions[dir + 2][0];
+ const int s1o2 = cdef_directions[dir + 2][1];
+ const int s2o1 = cdef_directions[dir - 2][0];
+ const int s2o2 = cdef_directions[dir - 2][1];
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
+ int i;
+
+ if (enable_primary && pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (enable_secondary && sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ for (i = 0; i < height; i += 4) {
+ sum = v256_zero();
+ row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
+ max = min = row;
+
+ if (enable_primary) {
+ v256 tap[4];
+ // Primary near taps
+ tap[0] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
+ p0 = constrain16(tap[0], row, pri_strength, pri_damping);
+ tap[1] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
+ p1 = constrain16(tap[1], row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ tap[2] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
+ p0 = constrain16(tap[2], row, pri_strength, pri_damping);
+ tap[3] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
+ p1 = constrain16(tap[3], row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+ if (clipping_required) {
+ max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = v256_min_s16(min, tap[0]);
+ min = v256_min_s16(min, tap[1]);
+ min = v256_min_s16(min, tap[2]);
+ min = v256_min_s16(min, tap[3]);
+ }
+ }
+
+ if (enable_secondary) {
+ v256 tap[8];
+ // Secondary near taps
+ tap[0] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
+ p0 = constrain16(tap[0], row, sec_strength, sec_damping);
+ tap[1] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
+ p1 = constrain16(tap[1], row, sec_strength, sec_damping);
+ tap[2] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
+ p2 = constrain16(tap[2], row, sec_strength, sec_damping);
+ tap[3] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
+ p3 = constrain16(tap[3], row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ tap[4] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
+ p0 = constrain16(tap[4], row, sec_strength, sec_damping);
+ tap[5] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
+ p1 = constrain16(tap[5], row, sec_strength, sec_damping);
+ tap[6] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
+ p2 = constrain16(tap[6], row, sec_strength, sec_damping);
+ tap[7] =
+ v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
+ p3 = constrain16(tap[7], row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ if (clipping_required) {
+ max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = v256_min_s16(min, tap[0]);
+ min = v256_min_s16(min, tap[1]);
+ min = v256_min_s16(min, tap[2]);
+ min = v256_min_s16(min, tap[3]);
+ min = v256_min_s16(min, tap[4]);
+ min = v256_min_s16(min, tap[5]);
+ min = v256_min_s16(min, tap[6]);
+ min = v256_min_s16(min, tap[7]);
+ }
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ // Adding the compare result implements the "- (sum < 0)" term; this relies
+ // on v256_cmplt_s16 producing -1 in lanes where the comparison holds.
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ if (clipping_required) {
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ }
+
+ // Row i + 0 was placed in the highest 64-bit quarter of the vector, so the
+ // rows are written back from the high end down to the low end.
+ if (is_lowbd) {
+ const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
+ u32_store_aligned(&dst8[(i + 0) * dstride],
+ v64_high_u32(v128_high_v64(res_128)));
+ u32_store_aligned(&dst8[(i + 1) * dstride],
+ v64_low_u32(v128_high_v64(res_128)));
+ u32_store_aligned(&dst8[(i + 2) * dstride],
+ v64_high_u32(v128_low_v64(res_128)));
+ u32_store_aligned(&dst8[(i + 3) * dstride],
+ v64_low_u32(v128_low_v64(res_128)));
+ } else {
+ v64_store_aligned(&dst16[(i + 0) * dstride],
+ v128_high_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst16[(i + 1) * dstride],
+ v128_low_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst16[(i + 2) * dstride],
+ v128_high_v64(v256_low_v128(res)));
+ v64_store_aligned(&dst16[(i + 3) * dstride],
+ v128_low_v64(v256_low_v128(res)));
+ }
+ }
+}
+
+/* Filters an 8-sample-wide block, two rows per loop iteration (each row
+ occupies one 128-bit half of a v256).
+ is_lowbd - nonzero: dest is uint8_t and results are packed to 8 bits;
+ zero: dest is uint16_t.
+ dest, dstride - destination and its row stride in destination elements.
+ in - 16-bit source, addressed with a row stride of CDEF_BSTRIDE.
+ pri_strength, sec_strength - thresholds handed to constrain16() for the
+ primary and the secondary taps.
+ dir - selects the tap offsets: cdef_directions[dir] for the primary taps,
+ cdef_directions[dir + 2] and [dir - 2] for the secondary taps.
+ pri_damping, sec_damping - reduced by get_msb(strength), floored at 0, when
+ the matching pass is enabled and its strength is nonzero.
+ coeff_shift - only used to choose the row of cdef_pri_taps.
+ height - number of rows to filter, consumed in groups of two.
+ enable_primary, enable_secondary - select which passes run; the result is
+ clamped to the min/max of the row and its taps only when both are set. */
+CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int coeff_shift, int height,
+ int enable_primary, int enable_secondary) {
+ uint8_t *dst8 = (uint8_t *)dest;
+ uint16_t *dst16 = (uint16_t *)dest;
+ const int clipping_required = enable_primary && enable_secondary;
+ int i;
+ v256 sum, p0, p1, p2, p3, row, res;
+ const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE);
+ v256 max, min;
+ // po*: primary offsets, s1o*/s2o*: secondary offsets; suffix 1 is the near
+ // tap and suffix 2 the far tap.
+ // NOTE(review): for dir < 2 the [dir - 2] lookups index before
+ // cdef_directions; presumably the pointer targets the interior of a padded
+ // table -- confirm where cdef_directions is defined.
+ const int po1 = cdef_directions[dir][0];
+ const int po2 = cdef_directions[dir][1];
+ const int s1o1 = cdef_directions[dir + 2][0];
+ const int s1o2 = cdef_directions[dir + 2][1];
+ const int s2o1 = cdef_directions[dir - 2][0];
+ const int s2o2 = cdef_directions[dir - 2][1];
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps;
+
+ if (enable_primary && pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (enable_secondary && sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ for (i = 0; i < height; i += 2) {
+ // One tap array serves both passes: the primary pass folds tap[0..3] into
+ // min/max before the secondary pass overwrites them.
+ v256 tap[8];
+ sum = v256_zero();
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+ min = max = row;
+ if (enable_primary) {
+ // Primary near taps
+ tap[0] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ tap[1] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+ p0 = constrain16(tap[0], row, pri_strength, pri_damping);
+ p1 = constrain16(tap[1], row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ tap[2] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ tap[3] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+ p0 = constrain16(tap[2], row, pri_strength, pri_damping);
+ p1 = constrain16(tap[3], row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+ if (clipping_required) {
+ max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = v256_min_s16(min, tap[0]);
+ min = v256_min_s16(min, tap[1]);
+ min = v256_min_s16(min, tap[2]);
+ min = v256_min_s16(min, tap[3]);
+ }
+ // End primary
+ }
+
+ if (enable_secondary) {
+ // Secondary near taps
+ tap[0] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ tap[1] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ tap[2] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ tap[3] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+ p0 = constrain16(tap[0], row, sec_strength, sec_damping);
+ p1 = constrain16(tap[1], row, sec_strength, sec_damping);
+ p2 = constrain16(tap[2], row, sec_strength, sec_damping);
+ p3 = constrain16(tap[3], row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ tap[4] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ tap[5] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ tap[6] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ tap[7] = v256_from_v128(
+ v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+ p0 = constrain16(tap[4], row, sec_strength, sec_damping);
+ p1 = constrain16(tap[5], row, sec_strength, sec_damping);
+ p2 = constrain16(tap[6], row, sec_strength, sec_damping);
+ p3 = constrain16(tap[7], row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ if (clipping_required) {
+ max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask);
+
+ min = v256_min_s16(min, tap[0]);
+ min = v256_min_s16(min, tap[1]);
+ min = v256_min_s16(min, tap[2]);
+ min = v256_min_s16(min, tap[3]);
+ min = v256_min_s16(min, tap[4]);
+ min = v256_min_s16(min, tap[5]);
+ min = v256_min_s16(min, tap[6]);
+ min = v256_min_s16(min, tap[7]);
+ }
+ // End secondary
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ // Adding the compare result implements the "- (sum < 0)" term; this relies
+ // on v256_cmplt_s16 producing -1 in lanes where the comparison holds.
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ if (clipping_required) {
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ }
+
+ // Row i was placed in the high 128-bit half of the vector, so the high
+ // half is written to row i and the low half to row i + 1.
+ if (is_lowbd) {
+ const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res));
+ v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128));
+ v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128));
+ } else {
+ v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res));
+ v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res));
+ }
+ }
+}
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma optimize("", on)
+#endif
+
+/* Copies a 4-sample-wide block of `height` rows from `in` (row stride
+   CDEF_BSTRIDE) to `dest` without filtering, four rows per iteration.
+   When is_lowbd is nonzero dest is treated as uint8_t and the samples are
+   packed to 8 bits; otherwise dest is treated as uint16_t. dstride is the
+   destination row stride in destination elements. */
+SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride,
+                                const uint16_t *in, int height) {
+  uint8_t *const out8 = (uint8_t *)dest;
+  uint16_t *const out16 = (uint16_t *)dest;
+
+  for (int y = 0; y < height; y += 4) {
+    /* rows01 holds rows y and y + 1, rows23 holds rows y + 2 and y + 3. */
+    const v128 rows01 =
+        v128_from_v64(v64_load_aligned(&in[(y + 0) * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[(y + 1) * CDEF_BSTRIDE]));
+    const v128 rows23 =
+        v128_from_v64(v64_load_aligned(&in[(y + 2) * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[(y + 3) * CDEF_BSTRIDE]));
+
+    if (!is_lowbd) {
+      v64_store_aligned(&out16[(y + 0) * dstride], v128_high_v64(rows01));
+      v64_store_aligned(&out16[(y + 1) * dstride], v128_low_v64(rows01));
+      v64_store_aligned(&out16[(y + 2) * dstride], v128_high_v64(rows23));
+      v64_store_aligned(&out16[(y + 3) * dstride], v128_low_v64(rows23));
+      continue;
+    }
+
+    /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
+    const v128 packed = v128_pack_s16_u8(rows23, rows01);
+    u32_store_aligned(&out8[(y + 0) * dstride],
+                      v64_high_u32(v128_low_v64(packed)));
+    u32_store_aligned(&out8[(y + 1) * dstride],
+                      v64_low_u32(v128_low_v64(packed)));
+    u32_store_aligned(&out8[(y + 2) * dstride],
+                      v64_high_u32(v128_high_v64(packed)));
+    u32_store_aligned(&out8[(y + 3) * dstride],
+                      v64_low_u32(v128_high_v64(packed)));
+  }
+}
+
+/* Copies an 8-sample-wide block of `height` rows from `in` (row stride
+   CDEF_BSTRIDE) to `dest` without filtering, two rows per iteration.
+   When is_lowbd is nonzero dest is treated as uint8_t and the samples are
+   packed to 8 bits; otherwise dest is treated as uint16_t. dstride is the
+   destination row stride in destination elements. */
+SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride,
+                                const uint16_t *in, int height) {
+  uint8_t *const out8 = (uint8_t *)dest;
+  uint16_t *const out16 = (uint16_t *)dest;
+
+  for (int y = 0; y < height; y += 2) {
+    const v128 upper = v128_load_aligned(&in[y * CDEF_BSTRIDE]);
+    const v128 lower = v128_load_aligned(&in[(y + 1) * CDEF_BSTRIDE]);
+
+    if (!is_lowbd) {
+      v128_store_unaligned(&out16[y * dstride], upper);
+      v128_store_unaligned(&out16[(y + 1) * dstride], lower);
+      continue;
+    }
+
+    /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */
+    const v128 packed = v128_pack_s16_u8(lower, upper);
+    v64_store_aligned(&out8[y * dstride], v128_low_v64(packed));
+    v64_store_aligned(&out8[(y + 1) * dstride], v128_high_v64(packed));
+  }
+}
+
+/* 8-bit destination, primary and secondary passes both enabled.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/1,
+                   /*enable_secondary=*/1);
+}
+
+/* 8-bit destination, primary pass only.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/1,
+                   /*enable_secondary=*/0);
+}
+/* 8-bit destination, secondary pass only.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/0,
+                   /*enable_secondary=*/1);
+}
+
+/* 8-bit destination, neither filter pass enabled: the block is copied from
+   `in` to `dest` unfiltered. block_width == 8 selects the 8-wide copy; any
+   other width is handled by the 4-wide copy. */
+void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in,
+                                int pri_strength, int sec_strength, int dir,
+                                int pri_damping, int sec_damping,
+                                int coeff_shift, int block_width,
+                                int block_height) {
+  /* The filter parameters are part of the common kernel signature but are
+     not needed for a plain copy. block_width is used by the dispatch below,
+     so it is deliberately not marked unused. */
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height);
+  }
+}
+
+/* 16-bit destination, primary and secondary passes both enabled.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/1);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/1,
+                   /*enable_secondary=*/1);
+}
+
+/* 16-bit destination, primary pass only.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/1,
+                     /*enable_secondary=*/0);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/1,
+                   /*enable_secondary=*/0);
+}
+/* 16-bit destination, secondary pass only.
+   block_width == 8 selects the 8-wide kernel; any other width is handled by
+   the 4-wide kernel. */
+void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  if (block_width != 8) {
+    filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                     sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                     block_height, /*enable_primary=*/0,
+                     /*enable_secondary=*/1);
+    return;
+  }
+  filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength,
+                   sec_strength, dir, pri_damping, sec_damping, coeff_shift,
+                   block_height, /*enable_primary=*/0,
+                   /*enable_secondary=*/1);
+}
+
+/* 16-bit destination, neither filter pass enabled: the block is copied from
+   `in` to `dest` unfiltered. block_width == 8 selects the 8-wide copy; any
+   other width is handled by the 4-wide copy. */
+void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in,
+                                 int pri_strength, int sec_strength, int dir,
+                                 int pri_damping, int sec_damping,
+                                 int coeff_shift, int block_width,
+                                 int block_height) {
+  /* The filter parameters are part of the common kernel signature but are
+     not needed for a plain copy. block_width is used by the dispatch below,
+     so it is deliberately not marked unused. */
+  (void)pri_strength;
+  (void)sec_strength;
+  (void)dir;
+  (void)pri_damping;
+  (void)sec_damping;
+  (void)coeff_shift;
+
+  if (block_width == 8) {
+    copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+  } else {
+    copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height);
+  }
+}
+
+/* Copies a width x height rectangle of 16-bit samples from src to dst.
+   dstride and sstride are the row strides of dst and src in samples. Each
+   row is copied eight samples at a time with unaligned vector accesses; the
+   remaining (width % 8) samples are copied one by one. */
+void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                               const uint16_t *src, int sstride,
+                                               int width, int height) {
+  /* Largest multiple of eight that does not exceed width. */
+  const int vector_width = width & ~0x7;
+
+  for (int y = 0; y < height; ++y) {
+    const int dst_row = y * dstride;
+    const int src_row = y * sstride;
+    int x = 0;
+    for (; x < vector_width; x += 8) {
+      v128_store_unaligned(&dst[dst_row + x],
+                           v128_load_unaligned(&src[src_row + x]));
+    }
+    for (; x < width; ++x) {
+      dst[dst_row + x] = src[src_row + x];
+    }
+  }
+}
+
+#undef CDEF_INLINE
+
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
new file mode 100644
index 0000000000..0e37d45980
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/common_data.h"
+
+#include "config/av1_rtcd.h"
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
+  // The CfL buffers are dimensioned for the largest CfL block: one buffer
+  // line must match that block's width and height.
+  assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+  assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+
+  // Take the chroma subsampling from the sequence header.
+  cfl->subsampling_x = seq_params->subsampling_x;
+  cfl->subsampling_y = seq_params->subsampling_y;
+
+  // Start from zeroed reconstruction / AC buffers and a clean state.
+  memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
+  memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
+  cfl->store_y = 0;
+  cfl->are_parameters_computed = 0;
+
+  // The DC_PRED cache is disabled by default and is only enabled in
+  // cfl_rd_pick_alpha
+  clear_cfl_dc_pred_cache_flags(cfl);
+}
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+                       CFL_PRED_TYPE pred_plane, int width) {
+  assert(pred_plane < CFL_PRED_PLANES);
+  assert(width <= CFL_BUF_LINE);
+
+  // Low bitdepth: one byte per sample, so `width` bytes are cached.
+  if (!is_cur_buf_hbd(xd)) {
+    memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width);
+    return;
+  }
+
+  // High bitdepth: `input` is converted with CONVERT_TO_SHORTPTR and
+  // two bytes per sample are cached.
+  uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
+  memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
+}
+
+static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,
+                                 int dst_stride, int width, int height) {
+  // Replicate the first `width` bytes of the cache into each of the `height`
+  // destination rows.
+  uint8_t *row = dst;
+  for (int y = 0; y < height; ++y, row += dst_stride) {
+    memcpy(row, dc_pred_cache, width);
+  }
+}
+
+static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
+                                 int dst_stride, int width, int height) {
+  // Replicate `width` 16-bit samples (two bytes each) from the cache into
+  // each of the `height` destination rows.
+  const size_t row_bytes = width << 1;
+  uint16_t *row = dst;
+  for (int y = 0; y < height; ++y, row += dst_stride) {
+    memcpy(row, dc_pred_cache, row_bytes);
+  }
+}
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) {
+  // Fills a tx_size block at dst from the cached DC prediction row of
+  // pred_plane, using the low- or high-bitdepth loader as appropriate.
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  assert(pred_plane < CFL_PRED_PLANES);
+  assert(width <= CFL_BUF_LINE);
+  assert(height <= CFL_BUF_LINE);
+
+  if (!is_cur_buf_hbd(xd)) {
+    cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
+                         width, height);
+    return;
+  }
+
+  uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+  cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
+                       width, height);
+}
+
+// Due to frame boundary issues, it is possible that the total area covered by
+// chroma exceeds that of luma. When this happens, we fill the missing pixels by
+// repeating the last columns and/or rows.
+static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
+  // Extent of the data currently held in the reconstruction buffer.
+  const int stored_width = cfl->buf_width;
+  const int stored_height = cfl->buf_height;
+  // Number of columns / rows that still have to be synthesized.
+  const int extra_cols = width - stored_width;
+  const int extra_rows = height - stored_height;
+
+  if (extra_cols > 0) {
+    // Extend every stored row to the right by repeating its last pixel.
+    uint16_t *row_tail = cfl->recon_buf_q3 + stored_width;
+    for (int r = 0; r < stored_height; r++, row_tail += CFL_BUF_LINE) {
+      const uint16_t edge_pixel = row_tail[-1];
+      assert(row_tail + extra_cols <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+      for (int c = 0; c < extra_cols; c++) {
+        row_tail[c] = edge_pixel;
+      }
+    }
+    cfl->buf_width = width;
+  }
+
+  if (extra_rows > 0) {
+    // Extend downwards by copying the row directly above, over the full
+    // (already padded) width.
+    uint16_t *new_row = cfl->recon_buf_q3 + (stored_height * CFL_BUF_LINE);
+    for (int r = 0; r < extra_rows; r++, new_row += CFL_BUF_LINE) {
+      const uint16_t *row_above = new_row - CFL_BUF_LINE;
+      assert(new_row + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+      for (int c = 0; c < width; c++) {
+        new_row[c] = row_above[c];
+      }
+    }
+    cfl->buf_height = height;
+  }
+}
+
+static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
+                               int height, int round_offset, int num_pel_log2) {
+  // Pass 1: accumulate all samples on top of the rounding offset. Rows are
+  // CFL_BUF_LINE samples apart in both buffers.
+  int total = round_offset;
+  for (int y = 0; y < height; y++) {
+    const uint16_t *const row = src + y * CFL_BUF_LINE;
+    for (int x = 0; x < width; x++) {
+      total += row[x];
+    }
+  }
+  const int mean = total >> num_pel_log2;
+
+  // Pass 2: write each sample minus the block average.
+  for (int y = 0; y < height; y++) {
+    const uint16_t *const in_row = src + y * CFL_BUF_LINE;
+    int16_t *const out_row = dst + y * CFL_BUF_LINE;
+    for (int x = 0; x < width; x++) {
+      out_row[x] = in_row[x] - mean;
+    }
+  }
+}
+
+CFL_SUB_AVG_FN(c)
+
+static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign,
+                                   CFL_PRED_TYPE pred_type) {
+  // Decode the per-plane sign from the joint sign, then the per-plane
+  // magnitude from the joint index. A zero sign yields alpha 0; otherwise the
+  // result is +/-(index + 1).
+  const int is_u = (pred_type == CFL_PRED_U);
+  const int alpha_sign =
+      is_u ? CFL_SIGN_U(joint_sign) : CFL_SIGN_V(joint_sign);
+  if (alpha_sign == CFL_SIGN_ZERO) return 0;
+
+  const int abs_alpha_q3 = is_u ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
+  const int magnitude = abs_alpha_q3 + 1;
+  if (alpha_sign == CFL_SIGN_POS) return magnitude;
+  return -magnitude;
+}
+
+static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
+                                     int dst_stride, int alpha_q3, int width,
+                                     int height) {
+  // dst already holds a prediction on entry; the scaled AC luma contribution
+  // is added to it and the sum is clipped with clip_pixel.
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      const int scaled_luma = get_scaled_luma_q0(alpha_q3, ac_buf_q3[x]);
+      dst[x] = clip_pixel(scaled_luma + dst[x]);
+    }
+    ac_buf_q3 += CFL_BUF_LINE;
+    dst += dst_stride;
+  }
+}
+
+CFL_PREDICT_FN(c, lbd)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
+                       int alpha_q3, int bit_depth, int width, int height) {
+  // Same as the low-bitdepth predictor, but on 16-bit samples and clipped
+  // with clip_pixel_highbd for the given bit depth.
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      const int scaled_luma = get_scaled_luma_q0(alpha_q3, ac_buf_q3[x]);
+      dst[x] = clip_pixel_highbd(scaled_luma + dst[x], bit_depth);
+    }
+    ac_buf_q3 += CFL_BUF_LINE;
+    dst += dst_stride;
+  }
+}
+
+CFL_PREDICT_FN(c, hbd)
+#endif
+
+static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = &xd->cfl;
+  // Do not call cfl_compute_parameters multiple times on the same values.
+  assert(cfl->are_parameters_computed == 0);
+
+  // Pad the stored luma out to the transform size first, then derive the AC
+  // buffer by subtracting the block average.
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  cfl_pad(cfl, width, height);
+
+  const cfl_subtract_average_fn subtract_average =
+      cfl_get_subtract_average_fn(tx_size);
+  subtract_average(cfl->recon_buf_q3, cfl->ac_buf_q3);
+
+  cfl->are_parameters_computed = 1;
+}
+
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                           TX_SIZE tx_size, int plane) {
+  CFL_CTX *const cfl = &xd->cfl;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(is_cfl_allowed(xd));
+
+  // The AC buffer is derived lazily, the first time a prediction is requested
+  // after luma was stored.
+  if (!cfl->are_parameters_computed) {
+    cfl_compute_parameters(xd, tx_size);
+  }
+
+  // plane - 1 selects the U or V entry of the joint alpha index/sign.
+  const int alpha_q3 =
+      cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
+  assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
+         CFL_BUF_SQUARE);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    const cfl_predict_hbd_fn predict_hbd = cfl_get_predict_hbd_fn(tx_size);
+    predict_hbd(cfl->ac_buf_q3, CONVERT_TO_SHORTPTR(dst), dst_stride, alpha_q3,
+                xd->bd);
+    return;
+  }
+#endif
+  const cfl_predict_lbd_fn predict_lbd = cfl_get_predict_lbd_fn(tx_size);
+  predict_lbd(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
+}
+
+static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  // Each output sample is the sum of a 2x2 luma quad shifted left by one,
+  // i.e. the quad average scaled by 8 (Q3).
+  for (int y = 0; y < height; y += 2) {
+    const uint8_t *const top = input;
+    const uint8_t *const bottom = input + input_stride;
+    for (int x = 0; x < width; x += 2) {
+      const int quad_sum = top[x] + top[x + 1] + bottom[x] + bottom[x + 1];
+      output_q3[x >> 1] = quad_sum << 1;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride << 1;
+  }
+}
+
+static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+  // Each output sample is the sum of a horizontal luma pair shifted left by
+  // two, i.e. the pair average scaled by 8 (Q3).
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x += 2) {
+      const int pair_sum = input[x] + input[x + 1];
+      output_q3[x >> 1] = pair_sum << 2;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride;
+  }
+}
+
+static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+  // No subsampling: every luma sample is just scaled by 8 (Q3).
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      const int luma = input[x];
+      output_q3[x] = luma << 3;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  // 16-bit input version: each output sample is the sum of a 2x2 luma quad
+  // shifted left by one (quad average scaled by 8, Q3).
+  for (int y = 0; y < height; y += 2) {
+    const uint16_t *const top = input;
+    const uint16_t *const bottom = input + input_stride;
+    for (int x = 0; x < width; x += 2) {
+      const int quad_sum = top[x] + top[x + 1] + bottom[x] + bottom[x + 1];
+      output_q3[x >> 1] = quad_sum << 1;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride << 1;
+  }
+}
+
+static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+  // 16-bit input version: each output sample is the sum of a horizontal luma
+  // pair shifted left by two (pair average scaled by 8, Q3).
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x += 2) {
+      const int pair_sum = input[x] + input[x + 1];
+      output_q3[x >> 1] = pair_sum << 2;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride;
+  }
+}
+
+static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,
+                                           int input_stride,
+                                           uint16_t *output_q3, int width,
+                                           int height) {
+  assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+  // 16-bit input version, no subsampling: every sample is scaled by 8 (Q3).
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      const int luma = input[x];
+      output_q3[x] = luma << 3;
+    }
+    output_q3 += CFL_BUF_LINE;
+    input += input_stride;
+  }
+}
+#endif
+
+CFL_GET_SUBSAMPLE_FUNCTION(c)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  // sub_x != 1            -> 4:4:4 kernel
+  // sub_x == 1, sub_y == 1 -> 4:2:0 kernel
+  // sub_x == 1, otherwise  -> 4:2:2 kernel
+  if (sub_x != 1) return cfl_get_luma_subsampling_444_hbd(tx_size);
+  if (sub_y != 1) return cfl_get_luma_subsampling_422_hbd(tx_size);
+  return cfl_get_luma_subsampling_420_hbd(tx_size);
+}
+#endif
+
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  // sub_x != 1            -> 4:4:4 kernel
+  // sub_x == 1, sub_y == 1 -> 4:2:0 kernel
+  // sub_x == 1, otherwise  -> 4:2:2 kernel
+  if (sub_x != 1) return cfl_get_luma_subsampling_444_lbd(tx_size);
+  if (sub_y != 1) return cfl_get_luma_subsampling_422_lbd(tx_size);
+  return cfl_get_luma_subsampling_420_lbd(tx_size);
+}
+
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
+                      int row, int col, TX_SIZE tx_size, int use_hbd) {
+  const int sub_x = cfl->subsampling_x;
+  const int sub_y = cfl->subsampling_y;
+  // Position and size of this transform block inside the CfL buffer, after
+  // applying the chroma subsampling shifts.
+  const int store_col = col << (MI_SIZE_LOG2 - sub_x);
+  const int store_row = row << (MI_SIZE_LOG2 - sub_y);
+  const int store_width = tx_size_wide[tx_size] >> sub_x;
+  const int store_height = tx_size_high[tx_size] >> sub_y;
+
+  // Invalidate current parameters
+  cfl->are_parameters_computed = 0;
+
+  // Store the surface of the pixel buffer that was written to, this way we
+  // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the
+  // frame boundary). The first block (row == 0 and col == 0) resets the
+  // extent; later blocks can only grow it.
+  if (row != 0 || col != 0) {
+    cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
+    cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
+  } else {
+    cfl->buf_width = store_width;
+    cfl->buf_height = store_height;
+  }
+
+  // Check that we will remain inside the pixel buffer.
+  assert(store_row + store_height <= CFL_BUF_LINE);
+  assert(store_col + store_width <= CFL_BUF_LINE);
+
+  // Store the input into the CfL pixel buffer
+  uint16_t *const out_q3 =
+      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+                                               input_stride, out_q3);
+    return;
+  }
+#else
+  (void)use_hbd;
+#endif
+  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, out_q3);
+}
+
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
+                                        int mi_col, int *row_out,
+                                        int *col_out) {
+  const int odd_mi_row = mi_row & 0x01;
+  const int odd_mi_col = mi_col & 0x01;
+
+  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+  if (odd_mi_row && cfl->subsampling_y) {
+    assert(*row_out == 0);
+    *row_out += 1;
+  }
+
+  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+  if (odd_mi_col && cfl->subsampling_x) {
+    assert(*col_out == 0);
+    *col_out += 1;
+  }
+}
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  // The source pointer is computed from the caller's row/col BEFORE any
+  // sub-8x8 adjustment; the adjustment only affects where the data lands in
+  // the CfL buffer.
+  uint8_t *const luma =
+      pd->dst.buf + ((row * pd->dst.stride + col) << MI_SIZE_LOG2);
+
+  const int has_4_wide_or_high =
+      block_size_high[bsize] == 4 || block_size_wide[bsize] == 4;
+  if (has_4_wide_or_high) {
+    // Only dimensions of size 4 can have an odd offset.
+    assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+    assert(!((row & 1) && tx_size_high[tx_size] != 4));
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+  cfl_store(cfl, luma, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
+}
+
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  // max_block_wide() scaled by MI_SIZE_LOG2, then passed through
+  // ALIGN_POWER_OF_TWO with the transform width's log2.
+  const int align_log2 = tx_size_wide_log2[tx_size];
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, align_log2);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  // max_block_high() scaled by MI_SIZE_LOG2, then passed through
+  // ALIGN_POWER_OF_TWO with the transform height's log2.
+  const int align_log2 = tx_size_high_log2[tx_size];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_high, align_log2);
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+
+  // The whole block is stored in one go, using the transform size that
+  // matches the block's aligned luma dimensions.
+  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+  const TX_SIZE store_tx_size = get_tx_size(width, height);
+
+  // Blocks with a 4-sample side may need a one-unit row/col offset inside the
+  // CfL buffer.
+  int row = 0;
+  int col = 0;
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, store_tx_size,
+            is_cur_buf_hbd(xd));
+}
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
new file mode 100644
index 0000000000..dcaa87bd48
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  if (!xd->lossless[mbmi->segment_id]) {
+    // Spec: CfL is available to luma partitions less than or equal to 32x32
+    const int fits_32x32 =
+        block_size_wide[bsize] <= 32 && block_size_high[bsize] <= 32;
+    return (CFL_ALLOWED_TYPE)fits_32x32;
+  }
+
+  // In lossless, CfL is available when the partition size is equal to the
+  // transform size.
+  const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+  const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+  const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+  return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
+                                                  const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  // Monochrome sequences never need the luma stored.
+  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
+
+  // For non-chroma-reference blocks, we should always store the luma pixels,
+  // in case the corresponding chroma-reference block uses CfL.
+  // Note that this can only happen for block sizes which are <8 on
+  // their shortest side, as otherwise they would be chroma reference
+  // blocks.
+  if (!xd->is_chroma_ref) return CFL_ALLOWED;
+
+  // If this block has chroma information, we know whether we're
+  // actually going to perform a CfL prediction
+  const int uses_cfl_pred =
+      !is_inter_block(mbmi) && mbmi->uv_mode == UV_CFL_PRED;
+  return (CFL_ALLOWED_TYPE)uses_cfl_pred;
+}
+
+static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
+  // Q3 * Q3 gives a Q6 product; ROUND_POWER_OF_TWO_SIGNED with 6 bits brings
+  // it back to Q0.
+  const int product_q6 = alpha_q3 * pred_buf_q3;
+  return ROUND_POWER_OF_TWO_SIGNED(product_q6, 6);
+}
+
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(int plane) {
+  // Only planes above 0 are accepted; the prediction type is plane - 1.
+  assert(plane > 0);
+  const int pred_index = plane - 1;
+  return (CFL_PRED_TYPE)pred_index;
+}
+
+static INLINE void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) {
+  // Mark both per-plane cache entries as empty and switch the cache off.
+  cfl->dc_pred_is_cached[CFL_PRED_U] = false;
+  cfl->dc_pred_is_cached[CFL_PRED_V] = false;
+  cfl->use_dc_pred_cache = false;
+}
+
+void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane);
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+ BLOCK_SIZE bsize);
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+ CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size generic function in here, the advantage is that the size
+// will be constant allowing for loop unrolling and other constant propagated
+// goodness.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
+ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \
+ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
+ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
+ output_q3, width, height); \
+ }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \
+  /* Getter returning the size-specific wrapper for tx_size. Entries for */ \
+  /* sizes that are not valid for CfL are NULL in the table. */ \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+      TX_SIZE tx_size) { \
+    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+    /* index the function pointer array out of bounds. This matches the */ \
+    /* subtract-average and predict getters. */ \
+    return subfn_##sub[tx_size % TX_SIZES_ALL]; \
+  }
+
+// Declare an architecture-specific array of function pointers for size-specific
+// wrappers.
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \
+ cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \
+ cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \
+ cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \
+ cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \
+ cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \
+ cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \
+ cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \
+ cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \
+ cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \
+ cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \
+ cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ };
+
+// The RTCD script does not support passing in an array, so we wrap it in this
+// function.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+#else
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd)
+#endif
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size generic function in here, the advantage is that the size
+// will be constant allowing for loop unrolling and other constant propagated
+// goodness.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
+ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+ int16_t *dst) { \
+ subtract_average_##arch(src, dst, width, height, round_offset, \
+ num_pel_log2); \
+ }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUB_AVG_FN(arch) \
+ CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \
+ CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
+ CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
+ CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
+ CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
+ CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
+ CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
+ CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
+ cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \
+ TX_SIZE tx_size) { \
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
+ cfl_subtract_average_4x4_##arch, /* 4x4 */ \
+ cfl_subtract_average_8x8_##arch, /* 8x8 */ \
+ cfl_subtract_average_16x16_##arch, /* 16x16 */ \
+ cfl_subtract_average_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_subtract_average_4x8_##arch, /* 4x8 */ \
+ cfl_subtract_average_8x4_##arch, /* 8x4 */ \
+ cfl_subtract_average_8x16_##arch, /* 8x16 */ \
+ cfl_subtract_average_16x8_##arch, /* 16x8 */ \
+ cfl_subtract_average_16x32_##arch, /* 16x32 */ \
+ cfl_subtract_average_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_subtract_average_4x16_##arch, /* 4x16 */ \
+ cfl_subtract_average_16x4_##arch, /* 16x4 */ \
+ cfl_subtract_average_8x32_##arch, /* 8x32 */ \
+ cfl_subtract_average_32x8_##arch, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return sub_avg[tx_size % TX_SIZES_ALL]; \
+ }
+
+// For VSX SIMD optimization, the C versions of width == 4 subtract are
+// faster than the VSX. As such, the VSX code calls the C versions.
+void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height) \
+ void cfl_predict_lbd_##width##x##height##_##arch( \
+ const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \
+ int alpha_q3) { \
+ cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+ height); \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_PREDICT_hbd(arch, width, height) \
+ void cfl_predict_hbd_##width##x##height##_##arch( \
+ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \
+ int bd) { \
+ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+ height); \
+ }
+#endif
+
+// This wrapper exists because clang format does not like calling macros with
+// lowercase letters.
+#define CFL_PREDICT_X(arch, width, height, bd) \
+ CFL_PREDICT_##bd(arch, width, height)
+
+#define CFL_PREDICT_FN(arch, bd) \
+ CFL_PREDICT_X(arch, 4, 4, bd) \
+ CFL_PREDICT_X(arch, 4, 8, bd) \
+ CFL_PREDICT_X(arch, 4, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 4, bd) \
+ CFL_PREDICT_X(arch, 8, 8, bd) \
+ CFL_PREDICT_X(arch, 8, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 32, bd) \
+ CFL_PREDICT_X(arch, 16, 4, bd) \
+ CFL_PREDICT_X(arch, 16, 8, bd) \
+ CFL_PREDICT_X(arch, 16, 16, bd) \
+ CFL_PREDICT_X(arch, 16, 32, bd) \
+ CFL_PREDICT_X(arch, 32, 8, bd) \
+ CFL_PREDICT_X(arch, 32, 16, bd) \
+ CFL_PREDICT_X(arch, 32, 32, bd) \
+ cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \
+ cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \
+ cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \
+ cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \
+ cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \
+ NULL, /* 64x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \
+ cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \
+ cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \
+ cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \
+ cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \
+ cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \
+ NULL, /* 32x64 (invalid CFL size) */ \
+ NULL, /* 64x32 (invalid CFL size) */ \
+ cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \
+ cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \
+ cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \
+ cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \
+ NULL, /* 16x64 (invalid CFL size) */ \
+ NULL, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return pred[tx_size % TX_SIZES_ALL]; \
+ }
+
+#endif // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
new file mode 100644
index 0000000000..ccb45b68ce
--- /dev/null
+++ b/third_party/aom/av1/common/common.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define av1_copy(dest, src) \
+ do { \
+ assert(sizeof(dest) == sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
+ } while (0)
+
+// Use this for variably-sized arrays.
+#define av1_copy_array(dest, src, n) \
+  do { \
+    assert(sizeof(*(dest)) == sizeof(*(src))); \
+    /* (n) is parenthesized so that an expression argument such as */ \
+    /* `count + 1` is multiplied as a whole by the element size. */ \
+    memcpy(dest, src, (n) * sizeof(*(src))); \
+  } while (0)
+
+#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
+/* Zeroes n elements starting at dest. (n) is parenthesized so that an */
+/* expression argument is multiplied as a whole by the element size. */
+#define av1_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest)))
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+  // Zero is special-cased and yields 0; any other value yields
+  // get_msb(num_values) + 1.
+  if (num_values == 0) return 0;
+  return get_msb(num_values) + 1;
+}
+
+#define CHECK_MEM_ERROR(cm, lval, expr) \
+ AOM_CHECK_MEM_ERROR((cm)->error, lval, expr)
+
+#define AOM_FRAME_MARKER 0x2
+
+#define AV1_MIN_TILE_SIZE_BYTES 1
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.c b/third_party/aom/av1/common/common_data.c
new file mode 100644
index 0000000000..482aecfcc0
--- /dev/null
+++ b/third_party/aom/av1/common/common_data.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
+/* clang-format off */
+const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+};
+/* clang-format on */
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
new file mode 100644
index 0000000000..dfe927c6ef
--- /dev/null
+++ b/third_party/aom/av1/common/common_data.h
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Log 2 conversion lookup tables in units of mode info (4x4), indexed by block size.
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables); entry == log2(mi_size_wide[bsize]).
+static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
+};
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables); entry == log2(mi_size_high[bsize]).
+static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
+ 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
+};
+
+// Width/height lookup tables in units of mode info (4x4), indexed by block size.
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables); entry == block_size_wide[bsize] / 4.
+static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
+ 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
+};
+
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables); entry == block_size_high[bsize] / 4.
+static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
+ 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
+};
+
+// Width/height lookup tables in units of samples, indexed by block size.
+// The Block_Width table in the spec (Section 9.3. Conversion tables). The last six entries are the 4X16, 16X4, 8X32, 32X8, 16X64 and 64X16 sizes.
+static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
+ 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32,
+ 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64
+};
+
+// The Block_Height table in the spec (Section 9.3. Conversion tables): block height in samples, indexed by block size.
+static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
+ 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64,
+ 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16
+};
+
+// Maps a block size to a context in the range 0..3.
+// The Size_Group table in the spec (Section 9.3. Conversion tables). Each entry equals:
+// AOMMIN(3, AOMMIN(mi_size_wide_log2[bsize], mi_size_high_log2[bsize]))
+static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
+};
+
+static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {  // log2(block width * block height in samples), e.g. 4 for 4X4, 14 for 128X128
+ 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
+};
+
+// A compressed version of the Partition_Subsize table in the spec (9.3.
+// Conversion tables), for square block sizes only. Indexed as [partition type][square size index, 4X4 up to 128X128]; BLOCK_INVALID fills the slots that have no sub-block size.
+/* clang-format off */
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
+ { // PARTITION_NONE
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16,
+ BLOCK_32X32, BLOCK_64X64, BLOCK_128X128
+ }, { // PARTITION_HORZ
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_SPLIT
+ BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
+ BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
+ }, { // PARTITION_HORZ_A
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_HORZ_B
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT_A
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_VERT_B
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_HORZ_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID
+ }, { // PARTITION_VERT_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID
+ }
+};
+
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {  // largest square transform that fits inside each block size, capped at TX_64X64
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X4, TX_4X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8, TX_8X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X16, TX_16X16, TX_32X32,
+ // 32X64, 64X32
+ TX_32X32, TX_32X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4, 8x32
+ TX_4X4, TX_4X4, TX_8X8,
+ // 32x8, 16x64, 64x16
+ TX_8X8, TX_16X16, TX_16X16
+};
+
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = {  // transform size with the same dimensions as the block; blocks with a 128 dimension map to TX_64X64
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X8, TX_8X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X16, TX_16X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X32, TX_32X16, TX_32X32,
+ // 32X64, 64X32
+ TX_32X64, TX_64X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4
+ TX_4X16, TX_16X4,
+ // 8x32, 32x8
+ TX_8X32, TX_32X8,
+ // 16x64, 64x16
+ TX_16X64, TX_64X16
+};
+
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {  // NOTE(review): presumably the vertical 1-D transform of each 2-D TX_TYPE; confirm against the TX_TYPE enum order
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+};
+
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {  // NOTE(review): presumably the horizontal 1-D transform of each 2-D TX_TYPE; confirm against the TX_TYPE enum order
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+};
+
+#define TXSIZE_CAT_INVALID (-1)
+
+/* clang-format on */
+
+static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {  // next smaller transform size: squares halve both sides, 2:1 shapes become the short-side square, 4:1 shapes halve the long side
+ TX_4X4, // TX_4X4 (already the smallest; maps to itself)
+ TX_4X4, // TX_8X8
+ TX_8X8, // TX_16X16
+ TX_16X16, // TX_32X32
+ TX_32X32, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X8, // TX_4X16
+ TX_8X4, // TX_16X4
+ TX_8X16, // TX_8X32
+ TX_16X8, // TX_32X8
+ TX_16X32, // TX_16X64
+ TX_32X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {  // square transform size whose side equals the width of the indexed transform size
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {  // square transform size whose side equals the height of the indexed transform size
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+#define TX_SIZE_W_MIN 4
+
+// Transform block width in pixels, indexed by transform size (TX_4X4 first, TX_64X16 last).
+static const int tx_size_wide[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
+};
+
+#define TX_SIZE_H_MIN 4
+
+// Transform block height in pixels, indexed by transform size (TX_4X4 first, TX_64X16 last).
+static const int tx_size_high[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
+};
+
+// Transform block width in units of 4 pixels, i.e. tx_size_wide[tx_size] / 4.
+static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16,
+};
+
+// Transform block height in units of 4 pixels, i.e. tx_size_high[tx_size] / 4.
+static const int tx_size_high_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4,
+};
+
+// log2 of the transform block width in pixels, i.e. log2(tx_size_wide[tx_size]).
+static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
+};
+
+// log2 of the transform block width in 4-pixel units, i.e. log2(tx_size_wide_unit[tx_size]).
+static const int tx_size_wide_unit_log2[TX_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4,
+};
+
+// log2 of the transform block height in pixels, i.e. log2(tx_size_high[tx_size]).
+static const int tx_size_high_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
+};
+
+// log2 of the transform block height in 4-pixel units, i.e. log2(tx_size_high_unit[tx_size]).
+static const int tx_size_high_unit_log2[TX_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2,
+};
+
+static const int tx_size_2d[TX_SIZES_ALL + 1] = {  // width * height in pixels per transform size. NOTE(review): 19 initializers for TX_SIZES_ALL + 1 slots, so any trailing slot is zero; confirm it is intended
+ 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512,
+ 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024,
+};
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {  // block size with the same width and height as the indexed transform size
+ BLOCK_4X4, // TX_4X4
+ BLOCK_8X8, // TX_8X8
+ BLOCK_16X16, // TX_16X16
+ BLOCK_32X32, // TX_32X32
+ BLOCK_64X64, // TX_64X64
+ BLOCK_4X8, // TX_4X8
+ BLOCK_8X4, // TX_8X4
+ BLOCK_8X16, // TX_8X16
+ BLOCK_16X8, // TX_16X8
+ BLOCK_16X32, // TX_16X32
+ BLOCK_32X16, // TX_32X16
+ BLOCK_32X64, // TX_32X64
+ BLOCK_64X32, // TX_64X32
+ BLOCK_4X16, // TX_4X16
+ BLOCK_16X4, // TX_16X4
+ BLOCK_8X32, // TX_8X32
+ BLOCK_32X8, // TX_32X8
+ BLOCK_16X64, // TX_16X64
+ BLOCK_64X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {  // square transform size whose side is the SHORTER dimension of the indexed transform size
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {  // square transform size whose side is the LONGER dimension of the indexed transform size
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = {  // log2(width * height) - 4, where the values match counting any 64-pixel dimension as 32
+ 0, // TX_4X4
+ 2, // TX_8X8
+ 4, // TX_16X16
+ 6, // TX_32X32
+ 6, // TX_64X64 (same value as TX_32X32)
+ 1, // TX_4X8
+ 1, // TX_8X4
+ 3, // TX_8X16
+ 3, // TX_16X8
+ 5, // TX_16X32
+ 5, // TX_32X16
+ 6, // TX_32X64 (same value as TX_32X32)
+ 6, // TX_64X32 (same value as TX_32X32)
+ 2, // TX_4X16
+ 2, // TX_16X4
+ 4, // TX_8X32
+ 4, // TX_32X8
+ 5, // TX_16X64 (same value as TX_16X32)
+ 5, // TX_64X16 (same value as TX_32X16)
+};
+
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {  // per-TX_MODE transform size: TX_4X4 for ONLY_4X4, TX_64X64 for the other two modes
+ TX_4X4, // ONLY_4X4
+ TX_64X64, // TX_MODE_LARGEST
+ TX_64X64, // TX_MODE_SELECT
+};
+
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
+extern const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2];
+
+// Each entry is a 5-bit field in which every bit set to 1 represents a
+// block-size split: 0b11111 means we split 128x128, 64x64, 32x32, 16x16
+// and 8x8; 0b10000 means we just split the 128x128 to 64x64.
+/* clang-format off */
+static const struct {
+ PARTITION_CONTEXT above;  // value follows the block width (31 for width 4 down to 0 for width 128)
+ PARTITION_CONTEXT left;  // value follows the block height (31 for height 4 down to 0 for height 128)
+} partition_context_lookup[BLOCK_SIZES_ALL] = {
+ { 31, 31 }, // 4X4 - {0b11111, 0b11111}
+ { 31, 30 }, // 4X8 - {0b11111, 0b11110}
+ { 30, 31 }, // 8X4 - {0b11110, 0b11111}
+ { 30, 30 }, // 8X8 - {0b11110, 0b11110}
+ { 30, 28 }, // 8X16 - {0b11110, 0b11100}
+ { 28, 30 }, // 16X8 - {0b11100, 0b11110}
+ { 28, 28 }, // 16X16 - {0b11100, 0b11100}
+ { 28, 24 }, // 16X32 - {0b11100, 0b11000}
+ { 24, 28 }, // 32X16 - {0b11000, 0b11100}
+ { 24, 24 }, // 32X32 - {0b11000, 0b11000}
+ { 24, 16 }, // 32X64 - {0b11000, 0b10000}
+ { 16, 24 }, // 64X32 - {0b10000, 0b11000}
+ { 16, 16 }, // 64X64 - {0b10000, 0b10000}
+ { 16, 0 }, // 64X128- {0b10000, 0b00000}
+ { 0, 16 }, // 128X64- {0b00000, 0b10000}
+ { 0, 0 }, // 128X128-{0b00000, 0b00000}
+ { 31, 28 }, // 4X16 - {0b11111, 0b11100}
+ { 28, 31 }, // 16X4 - {0b11100, 0b11111}
+ { 30, 24 }, // 8X32 - {0b11110, 0b11000}
+ { 24, 30 }, // 32X8 - {0b11000, 0b11110}
+ { 28, 16 }, // 16X64 - {0b11100, 0b10000}
+ { 16, 28 }, // 64X16 - {0b10000, 0b11100}
+};
+/* clang-format on */
+
+static const int intra_mode_context[INTRA_MODES] = {  // maps each intra mode index to a context value in 0..4. NOTE(review): mode order comes from an enum not visible here
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
+};
+
+// Note: this table is also used in unit tests, so whenever the table is
+// changed, the unit tests need to be changed accordingly.
+static const int quant_dist_weight[4][2] = {
+ { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
+};
+
+static const int quant_dist_lookup_table[4][2] = {  // four weight pairs, each summing to 16. NOTE(review): presumably the offsets for distance-weighted compound averaging; confirm at the call site
+ { 9, 7 },
+ { 11, 5 },
+ { 12, 4 },
+ { 13, 3 },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
new file mode 100644
index 0000000000..bb72e0cbd2
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+// Horizontal resampling filter for rows of 8-bit pixels (C version).
+//
+// The source position of every output pixel is tracked in fixed point: it
+// starts at x0_qn on each row and advances by x_step_qn per output pixel.
+// The bits above RS_SCALE_SUBPEL_BITS pick the source column; the fractional
+// bits pick one of the UPSCALE_NORMATIVE_TAPS-tap kernels stored back to back
+// in x_filters. Each weighted sum is rounded by FILTER_BITS and passed
+// through clip_pixel() before being stored.
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const int16_t *x_filters, int x0_qn,
+                             int x_step_qn) {
+  // Step back so that kernel tap 0 lines up with the left-most input sample.
+  const uint8_t *in_row = src - (UPSCALE_NORMATIVE_TAPS / 2 - 1);
+  uint8_t *out_row = dst;
+
+  for (int row = 0; row < h; ++row) {
+    int pos_qn = x0_qn;
+    for (int col = 0; col < w; ++col, pos_qn += x_step_qn) {
+      const int phase = (pos_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(phase <= RS_SUBPEL_MASK);
+      const int16_t *const kernel = x_filters + phase * UPSCALE_NORMATIVE_TAPS;
+      const uint8_t *const window = in_row + (pos_qn >> RS_SCALE_SUBPEL_BITS);
+
+      int acc = 0;
+      for (int tap = 0; tap < UPSCALE_NORMATIVE_TAPS; ++tap) {
+        acc += window[tap] * kernel[tap];
+      }
+      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+    in_row += src_stride;
+    out_row += dst_stride;
+  }
+}
+
+// High bit-depth counterpart of the horizontal resampling filter (C version).
+//
+// Works on rows of 16-bit samples. The source position starts at x0_qn on
+// every row and advances by x_step_qn per output sample; its integer part
+// selects the source column and its fractional part selects one of the
+// UPSCALE_NORMATIVE_TAPS-tap kernels in x_filters. Each weighted sum is
+// rounded by FILTER_BITS and passed through clip_pixel_highbd() with bd.
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const int16_t *x_filters, int x0_qn,
+                                    int x_step_qn, int bd) {
+  // Step back so that kernel tap 0 lines up with the left-most input sample.
+  const uint16_t *in_row = src - (UPSCALE_NORMATIVE_TAPS / 2 - 1);
+  uint16_t *out_row = dst;
+
+  for (int row = 0; row < h; ++row) {
+    int pos_qn = x0_qn;
+    for (int col = 0; col < w; ++col, pos_qn += x_step_qn) {
+      const int phase = (pos_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(phase <= RS_SUBPEL_MASK);
+      const int16_t *const kernel = x_filters + phase * UPSCALE_NORMATIVE_TAPS;
+      const uint16_t *const window = in_row + (pos_qn >> RS_SCALE_SUBPEL_BITS);
+
+      int acc = 0;
+      for (int tap = 0; tap < UPSCALE_NORMATIVE_TAPS; ++tap) {
+        acc += window[tap] * kernel[tap];
+      }
+      out_row[col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+    }
+    in_row += src_stride;
+    out_row += dst_stride;
+  }
+}
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {  // 8-bit 2-D convolution: a horizontal pass into im_block, then a vertical pass into dst
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];  // intermediate rows between the two passes
+ int im_h = h + filter_params_y->taps - 1;  // h output rows plus (taps - 1) rows of vertical filter support
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);  // im_block is sized for at most MAX_SB_SIZE x MAX_SB_SIZE outputs
+ const int fo_vert = filter_params_y->taps / 2 - 1;  // source rows above the output row that the kernel covers
+ const int fo_horiz = filter_params_x->taps / 2 - 1;  // source columns left of the output column that the kernel covers
+ const int bd = 8;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;  // shift still owed after the two per-pass roundings
+
+ // horizontal filter: fills im_h rows of im_block, starting fo_vert rows above src
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // constant offset; the assert below expects sum to stay non-negative
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+
+ // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
+ // be beyond the following range. For better prediction, a clamping can be
+ // added for 12 tap filter to ensure the horizontal filtering result is
+ // within 16 bit. The same applies to the vertical filtering.
+ assert(filter_params_x->taps > 8 ||
+ (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter: reads im_block, writes the final 8-bit pixels to dst
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // im_block row that corresponds to output row 0
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(filter_params_y->taps > 8 ||
+ (0 <= sum && sum < (1 << (offset_bits + 2))));
+ int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));  // subtracts a constant built from offset_bits and round_1 (the offsets added in both passes)
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+ }
+ }
+}
+
+// Vertical-only convolution for 8-bit pixels (C version).
+//
+// The kernel is selected by the SUBPEL_MASK bits of subpel_y_qn. Every output
+// pixel is the tap-weighted sum of one column of source pixels, rounded by
+// FILTER_BITS and passed through clip_pixel().
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         const InterpFilterParams *filter_params_y,
+                         const int subpel_y_qn) {
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  // Number of source rows above the output row that the kernel covers.
+  const int rows_above = filter_params_y->taps / 2 - 1;
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        const int src_row = row - rows_above + tap;
+        acc += kernel[tap] * src[src_row * src_stride + col];
+      }
+      out[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+// Horizontal-only convolution for 8-bit pixels (C version).
+//
+// The kernel is selected by the SUBPEL_MASK bits of subpel_x_qn. The weighted
+// sum is rounded in two steps, first by conv_params->round_0 and then by the
+// remaining FILTER_BITS - round_0 bits, before clip_pixel() is applied.
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                         int dst_stride, int w, int h,
+                         const InterpFilterParams *filter_params_x,
+                         const int subpel_x_qn, ConvolveParams *conv_params) {
+  const int round_0 = conv_params->round_0;
+  const int bits = FILTER_BITS - round_0;
+  // Number of source columns left of the output column that the kernel covers.
+  const int cols_left = filter_params_x->taps / 2 - 1;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        acc += kernel[tap] * src[row * src_stride + col - cols_left + tap];
+      }
+      const int32_t stage0 = ROUND_POWER_OF_TWO(acc, round_0);
+      out[col] = clip_pixel(ROUND_POWER_OF_TWO(stage0, bits));
+    }
+  }
+}
+
+// Intrabc-specialized variant of av1_convolve_2d_sr_c. It produces the same
+// result for the only case intrabc uses: subpel_x_qn == subpel_y_qn == 8 with
+// the following 2-tap filter:
+// DECLARE_ALIGNED(256, static const int16_t,
+//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+//   128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+//   64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// };
+// Both taps are equal, so each pass reduces to adding two neighbouring
+// samples; the filter and rounding parameters are only checked by assertions.
+void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
+  assert(subpel_x_qn == 8);
+  assert(subpel_y_qn == 8);
+  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  // The parameters below are referenced by the assertions only.
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)conv_params;
+
+  const int bd = 8;
+  const int im_stride = w;
+  const int im_rows = h + 1;  // one extra row of support for the 2-tap pass
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+  // Horizontal pass, written out explicitly for subpel_x_qn = 8: store the
+  // sum of each horizontal sample pair plus a (1 << bd) offset.
+  for (int row = 0; row < im_rows; ++row) {
+    const uint8_t *const in = src + row * src_stride;
+    int16_t *const im = im_block + row * im_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair = (1 << bd) + in[col] + in[col + 1];
+      assert(0 <= pair && pair < (1 << (bd + 2)));
+      im[col] = pair;
+    }
+  }
+
+  // Vertical pass, written out explicitly for subpel_y_qn = 8: add the two
+  // vertically adjacent intermediate values, round by 2 bits, then subtract
+  // the constant (1 << bd) + (1 << (bd - 1)).
+  for (int row = 0; row < h; ++row) {
+    const int16_t *const upper = im_block + row * im_stride;
+    const int16_t *const lower = upper + im_stride;
+    uint8_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t quad = (1 << (bd + 2)) + upper[col] + lower[col];
+      assert(0 <= quad && quad < (1 << (bd + 4)));
+      const int16_t res =
+          ROUND_POWER_OF_TWO(quad, 2) - ((1 << bd) + (1 << (bd - 1)));
+      out[col] = clip_pixel(res);
+    }
+  }
+}
+
+// Intrabc-specialized variant of av1_convolve_y_sr_c. For the only supported
+// case (subpel_y_qn == 8 with a 2-tap filter) each output pixel is
+// ROUND_POWER_OF_TWO(a + b, 1) of a pixel and the one directly below it.
+void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_y,
+                                 const int subpel_y_qn) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  // The parameters below are referenced by the assertions only.
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+
+  // Vertical pass, written out explicitly for subpel_y_qn = 8.
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const upper = src + row * src_stride;
+    const uint8_t *const lower = upper + src_stride;
+    uint8_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair = upper[col] + lower[col];
+      out[col] = clip_pixel(ROUND_POWER_OF_TWO(pair, 1));
+    }
+  }
+}
+
+// Intrabc-specialized variant of av1_convolve_x_sr_c. For the only supported
+// case (subpel_x_qn == 8 with a 2-tap filter) each output pixel is
+// ROUND_POWER_OF_TWO(a + b, 1) of a pixel and its right-hand neighbour.
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const int subpel_x_qn,
+                                 ConvolveParams *conv_params) {
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  // The parameters below are referenced by the assertions only.
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)conv_params;
+
+  // Horizontal pass, written out explicitly for subpel_x_qn = 8.
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const in = src + row * src_stride;
+    uint8_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair = in[col] + in[col + 1];
+      out[col] = clip_pixel(ROUND_POWER_OF_TWO(pair, 1));
+    }
+  }
+}
+
+// 2-D (horizontal then vertical) convolution for compound prediction, 8-bit
+// pixels, C version.
+//
+// The horizontal pass writes h + taps - 1 intermediate rows into the on-stack
+// im_block; the vertical pass filters those rows. The vertical result, which
+// still carries a constant rounding offset, is then either
+//   - stored into conv_params->dst when conv_params->do_average is 0, or
+//   - combined with the value already in conv_params->dst (a plain average,
+//     or a fwd_offset/bck_offset weighted sum when use_dist_wtd_comp_avg is
+//     set), stripped of the offset, rounded and written to dst as a pixel.
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_x_qn, const int subpel_y_qn,
+                                ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  int dst16_stride = conv_params->dst_stride;
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  // im_block holds at most MAX_SB_SIZE columns; a larger block would overrun
+  // the stack buffer. The sibling 2-D convolve functions check this too.
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      im_block[y * im_stride + x] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // vertical filter
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  // Constant offset carried by every vertical result; loop-invariant, so it
+  // is computed once here instead of once per averaged pixel.
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->do_average) {
+        int32_t tmp = dst16[y * dst16_stride + x];
+        if (conv_params->use_dist_wtd_comp_avg) {
+          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+          tmp = tmp >> DIST_PRECISION_BITS;
+        } else {
+          tmp += res;
+          tmp = tmp >> 1;
+        }
+        tmp -= round_offset;
+        dst[y * dst_stride + x] =
+            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+      } else {
+        dst16[y * dst16_stride + x] = res;
+      }
+    }
+  }
+}
+
+// Vertical-only convolution for compound prediction, 8-bit pixels (C version).
+//
+// The filtered value is scaled by 1 << (FILTER_BITS - round_0), rounded by
+// round_1 and biased by a constant round_offset. It is then either stored in
+// conv_params->dst (do_average == 0) or combined with the value already there
+// (plain average, or fwd_offset/bck_offset weighted sum when
+// use_dist_wtd_comp_avg is set), un-biased, rounded and written to dst.
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               const InterpFilterParams *filter_params_y,
+                               const int subpel_y_qn,
+                               ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  const int bits = FILTER_BITS - round_0;
+  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
+  const int round_offset =
+      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
+  // Number of source rows above the output row that the kernel covers.
+  const int rows_above = filter_params_y->taps / 2 - 1;
+  CONV_BUF_TYPE *const buf = conv_params->dst;
+  const int buf_stride = conv_params->dst_stride;
+
+  // vertical filter
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        acc += kernel[tap] * src[(row - rows_above + tap) * src_stride + col];
+      }
+      acc *= (1 << bits);
+      acc = ROUND_POWER_OF_TWO(acc, round_1) + round_offset;
+
+      CONV_BUF_TYPE *const slot = &buf[row * buf_stride + col];
+      if (!conv_params->do_average) {
+        // First prediction of the pair: keep the biased value for later.
+        *slot = acc;
+        continue;
+      }
+
+      int32_t blend = *slot;
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blend = (blend * conv_params->fwd_offset +
+                 acc * conv_params->bck_offset) >>
+                DIST_PRECISION_BITS;
+      } else {
+        blend = (blend + acc) >> 1;
+      }
+      blend -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
+    }
+  }
+}
+
+// Horizontal-only convolution for compound prediction, 8-bit pixels (C
+// version).
+//
+// The filtered value is rounded by round_0, scaled by
+// 1 << (FILTER_BITS - round_1) and biased by a constant round_offset. It is
+// then either stored in conv_params->dst (do_average == 0) or combined with
+// the value already there (plain average, or fwd_offset/bck_offset weighted
+// sum when use_dist_wtd_comp_avg is set), un-biased, rounded and written to
+// dst.
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               const InterpFilterParams *filter_params_x,
+                               const int subpel_x_qn,
+                               ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  const int bits = FILTER_BITS - round_1;
+  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
+  const int round_offset =
+      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
+  // Number of source columns left of the output column that the kernel covers.
+  const int cols_left = filter_params_x->taps / 2 - 1;
+  CONV_BUF_TYPE *const buf = conv_params->dst;
+  const int buf_stride = conv_params->dst_stride;
+
+  // horizontal filter
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        acc += kernel[tap] * src[row * src_stride + col - cols_left + tap];
+      }
+      acc = (1 << bits) * ROUND_POWER_OF_TWO(acc, round_0);
+      acc += round_offset;
+
+      CONV_BUF_TYPE *const slot = &buf[row * buf_stride + col];
+      if (!conv_params->do_average) {
+        // First prediction of the pair: keep the biased value for later.
+        *slot = acc;
+        continue;
+      }
+
+      int32_t blend = *slot;
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blend = (blend * conv_params->fwd_offset +
+                 acc * conv_params->bck_offset) >>
+                DIST_PRECISION_BITS;
+      } else {
+        blend = (blend + acc) >> 1;
+      }
+      blend -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
+    }
+  }
+}
+
+// Unfiltered (copy) path for compound prediction, 8-bit pixels (C version).
+//
+// Each source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1
+// and biased by the same round_offset the filtering paths use. The value is
+// either stored in conv_params->dst (do_average == 0) or combined with the
+// value already there (plain average, or fwd_offset/bck_offset weighted sum
+// when use_dist_wtd_comp_avg is set), un-biased, rounded and written to dst.
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  const int bits = FILTER_BITS * 2 - round_1 - round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
+  const int round_offset =
+      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
+  CONV_BUF_TYPE *const buf = conv_params->dst;
+  const int buf_stride = conv_params->dst_stride;
+
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const in = src + row * src_stride;
+    for (int col = 0; col < w; ++col) {
+      CONV_BUF_TYPE val = in[col] << bits;
+      val += round_offset;
+
+      CONV_BUF_TYPE *const slot = &buf[row * buf_stride + col];
+      if (!conv_params->do_average) {
+        // First prediction of the pair: keep the biased value for later.
+        *slot = val;
+        continue;
+      }
+
+      int32_t blend = *slot;
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blend = (blend * conv_params->fwd_offset +
+                 val * conv_params->bck_offset) >>
+                DIST_PRECISION_BITS;
+      } else {
+        blend = (blend + val) >> 1;
+      }
+      blend -= round_offset;
+      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(blend, bits));
+    }
+  }
+}
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {  // 8-bit 2-D convolution with per-pixel position steps (x_step_qn, y_step_qn); handles both compound and non-compound output
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];  // intermediate rows between the two passes
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;  // integer source row reached by the last output row, plus the filter length
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+
+ // horizontal filter: one pass per intermediate row, starting fo_vert rows above src
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;  // fixed-point source x position, restarted on every row
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];  // integer part selects the source column
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part selects the kernel
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(filter_params_x->taps > 8 ||
+ (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter: walks the output column by column (x outer, y inner)
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {
+ int y_qn = subpel_y_qn;  // fixed-point source y position, restarted on every column
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];  // integer part selects the intermediate row
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part selects the kernel
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(filter_params_y->taps > 8 ||
+ (0 <= sum && sum < (1 << (offset_bits + 2))));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];  // value stored earlier in conv_params->dst
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;  // compound without averaging: keep the offset-carrying value in conv_params->dst
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ src_vert++;  // move on to the next intermediate column
+ }
+}
+
+// Forwards to av1_convolve_2d_scale(). In debug builds it first checks that
+// a compound prediction comes with an intermediate buffer (conv_params->dst).
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  // Compound mode is only valid with a non-NULL intermediate buffer.
+  assert(!conv_params->is_compound || conv_params->dst != NULL);
+
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                        filter_params_x, filter_params_y, subpel_x_qn,
+                        x_step_qn, subpel_y_qn, y_step_qn, conv_params);
+}
+
+// Unscaled compound prediction: selects the dist_wtd kernel according to
+// which of the two subpel offsets are non-zero.
+//   x != 0, y != 0 -> 2-D kernel
+//   x != 0, y == 0 -> horizontal-only kernel
+//   x == 0, y != 0 -> vertical-only kernel
+//   x == 0, y == 0 -> copy kernel
+static void convolve_2d_facade_compound(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool filter_horiz = subpel_x_qn != 0;
+  const bool filter_vert = subpel_y_qn != 0;
+
+  if (filter_horiz && filter_vert) {
+    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, filter_params_y, subpel_x_qn,
+                             subpel_y_qn, conv_params);
+    return;
+  }
+  if (filter_horiz) {
+    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, subpel_x_qn, conv_params);
+    return;
+  }
+  if (filter_vert) {
+    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_y, subpel_y_qn, conv_params);
+    return;
+  }
+  // Full-pel position in both directions.
+  av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                conv_params);
+}
+
+// Unscaled single-reference prediction: selects the *_sr kernel (or a plain
+// copy) according to which of the two subpel offsets are non-zero.
+static void convolve_2d_facade_single(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool filter_horiz = subpel_x_qn != 0;
+  const bool filter_vert = subpel_y_qn != 0;
+
+  if (filter_horiz) {
+    if (filter_vert) {
+      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
+                        filter_params_x, subpel_x_qn, conv_params);
+    }
+  } else {
+    if (filter_vert) {
+      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
+                        filter_params_y, subpel_y_qn);
+    } else {
+      // Full-pel position in both directions: no filtering needed.
+      aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+    }
+  }
+}
+
+// Entry point for 8-bit prediction convolution. Chooses, in this order:
+//   1. the IntraBC 2-tap kernels (when a subpel offset is non-zero),
+//   2. the scaled-reference kernel,
+//   3. the compound kernels,
+//   4. the single-reference kernels.
+// interp_filters[0] is the horizontal filter, interp_filters[1] the vertical.
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params) {
+  const InterpFilterParams *const horiz_params = interp_filters[0];
+  const InterpFilterParams *const vert_params = interp_filters[1];
+
+  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+  // 2-tap filter indicates that it is for IntraBC.
+  if (horiz_params->taps == 2 || vert_params->taps == 2) {
+    assert(horiz_params->taps == 2 && vert_params->taps == 2);
+    assert(!scaled);
+    if (subpel_x_qn != 0) {
+      if (subpel_y_qn != 0) {
+        av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                     horiz_params, vert_params, subpel_x_qn,
+                                     subpel_y_qn, conv_params);
+      } else {
+        av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                    horiz_params, subpel_x_qn, conv_params);
+      }
+      return;
+    }
+    if (subpel_y_qn != 0) {
+      av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                  vert_params, subpel_y_qn);
+      return;
+    }
+    // Both offsets are zero: fall through to the generic paths below.
+  }
+
+  if (scaled) {
+    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
+                              horiz_params, vert_params, subpel_x_qn,
+                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
+    return;
+  }
+
+  if (conv_params->is_compound) {
+    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
+                                horiz_params, vert_params, subpel_x_qn,
+                                subpel_y_qn, conv_params);
+  } else {
+    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                              horiz_params, vert_params, subpel_x_qn,
+                              subpel_y_qn, conv_params);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Reference (C) high-bitdepth horizontal-only single-reference convolution.
+// Each output sample is the kernel applied to `taps` horizontally adjacent
+// source samples, rounded by round_0 and then by FILTER_BITS - round_0, and
+// passed through clip_pixel_highbd() with bit depth `bd`.
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const int subpel_x_qn,
+                                ConvolveParams *conv_params, int bd) {
+  const int left_taps = filter_params_x->taps / 2 - 1;
+  const int final_shift = FILTER_BITS - conv_params->round_0;
+
+  assert(final_shift >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      // Index of the leftmost source sample covered by the kernel.
+      const int first = row * src_stride + col - left_taps;
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        acc += kernel[tap] * src[first + tap];
+      }
+      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, final_shift), bd);
+    }
+  }
+}
+
+// Reference (C) high-bitdepth vertical-only single-reference convolution.
+// Each output sample is the kernel applied to `taps` vertically adjacent
+// source samples, rounded by FILTER_BITS and passed through
+// clip_pixel_highbd() with bit depth `bd`.
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_y_qn, int bd) {
+  const int top_taps = filter_params_y->taps / 2 - 1;
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  for (int row = 0; row < h; ++row) {
+    // Topmost source row covered by the kernel for this output row.
+    const int first_row = row - top_taps;
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        acc += kernel[tap] * src[(first_row + tap) * src_stride + col];
+      }
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+    }
+  }
+}
+
+// Reference (C) high-bitdepth 2-D single-reference convolution.
+// Pass 1 filters horizontally into the int16_t scratch buffer im_block
+// (h + taps - 1 rows, stride w); pass 2 filters that buffer vertically and
+// writes the result to dst through clip_pixel_highbd() with bit depth bd.
+// Requires w <= MAX_SB_SIZE and h <= MAX_SB_SIZE (asserted).
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ // The vertical filter needs taps - 1 extra rows around the block.
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ // Rounding still owed after round_0 and round_1 have been applied.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+
+ // horizontal filter
+ // Start fo_vert rows above the block so the vertical pass has its context.
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ // Offset the accumulator; the assert below checks the sum stays
+ // non-negative for filters of up to 8 taps.
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(filter_params_x->taps > 8 ||
+ (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+ im_block[y * im_stride + x] =
+ ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(filter_params_y->taps > 8 ||
+ (0 <= sum && sum < (1 << (offset_bits + 2))));
+ // Round by round_1, then subtract the two offset terms again.
+ int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ }
+}
+
+// This function follows the same two-pass structure as
+// av1_highbd_convolve_2d_sr_c, specialized for intrabc: both subpel offsets
+// must be 8 (asserted), so each pass reduces to 64 * (a + b) over two
+// neighbouring samples. It uses the following 2-tap filter:
+// DECLARE_ALIGNED(256, static const int16_t,
+// av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+// 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// };
+void av1_highbd_convolve_2d_sr_intrabc_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ assert(subpel_x_qn == 8);
+ assert(subpel_y_qn == 8);
+ assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+ assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+ // These parameters are otherwise only read by the asserts above.
+ (void)filter_params_x;
+ (void)subpel_x_qn;
+ (void)filter_params_y;
+ (void)subpel_y_qn;
+ (void)conv_params;
+
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ // A 2-tap vertical filter needs one extra intermediate row.
+ int im_h = h + 1;
+ int im_stride = w;
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+ // horizontal filter
+ // explicitly operate for subpel_x_qn = 8.
+ int16_t *im = im_block;
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ im[x] = sum;
+ }
+ src += src_stride;
+ im += im_stride;
+ }
+
+ // vertical filter
+ // explicitly operate for subpel_y_qn = 8.
+ int16_t *src_vert = im_block;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ const int32_t sum =
+ (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ // Round by round_1, then subtract the two offset terms again.
+ const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ src_vert += im_stride;
+ dst += dst_stride;
+ }
+}
+
+// IntraBC specialization of av1_highbd_convolve_y_sr_c for the 2-tap filter
+// at subpel_y_qn == 8: every output sample is the rounded average of a source
+// sample and the sample one row below it.
+void av1_highbd_convolve_y_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  // Only referenced by the asserts above.
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const upper = src + row * src_stride;
+    uint16_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair_sum = upper[col] + upper[src_stride + col];
+      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(pair_sum, 1), bd);
+    }
+  }
+}
+
+// IntraBC specialization of av1_highbd_convolve_x_sr_c for the 2-tap filter
+// at subpel_x_qn == 8: every output sample is 64 * (left + right), rounded by
+// round_0 and then by FILTER_BITS - round_0.
+void av1_highbd_convolve_x_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  const int final_shift = FILTER_BITS - conv_params->round_0;
+  assert(final_shift >= 0);
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  // Only referenced by the asserts above.
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const in = src + row * src_stride;
+    uint16_t *const out = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      int32_t acc = 64 * (in[col] + in[col + 1]);
+      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
+      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, final_shift), bd);
+    }
+  }
+}
+
+// Reference (C) high-bitdepth 2-D compound convolution.
+// Pass 1 filters horizontally into the int16_t scratch buffer im_block;
+// pass 2 filters vertically. The result is then handled according to
+// conv_params->do_average:
+//   zero     -> stored (still offset) in the intermediate buffer
+//               conv_params->dst;
+//   non-zero -> blended with the value already in conv_params->dst (weighted
+//               by fwd_offset/bck_offset if use_dist_wtd_comp_avg is set,
+//               plain average otherwise), un-offset, rounded and written to
+//               dst through clip_pixel_highbd().
+void av1_highbd_dist_wtd_convolve_2d_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ int x, y, k;
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ // The vertical filter needs taps - 1 extra rows around the block.
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ // Rounding still owed after round_0 and round_1 have been applied.
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(filter_params_x->taps > 8 ||
+ (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+ (void)bd;
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(filter_params_y->taps > 8 ||
+ (0 <= sum && sum < (1 << (offset_bits + 2))));
+ // res is narrowed to CONV_BUF_TYPE and still carries the offset terms.
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ // Subtract the offset terms before the final rounding.
+ tmp -= (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ }
+ }
+}
+
+// Reference (C) high-bitdepth horizontal-only compound convolution.
+// The filtered value is rounded by round_0, scaled up by
+// 1 << (FILTER_BITS - round_1) and offset by round_offset. Depending on
+// conv_params->do_average it is either stored in the intermediate buffer
+// conv_params->dst, or blended with the value already there, un-offset,
+// rounded and written to dst through clip_pixel_highbd().
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ // Same offset terms that the 2-D compound kernel subtracts at the end.
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ assert(bits >= 0);
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ res += round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ // Narrowed to CONV_BUF_TYPE on store.
+ dst16[y * dst16_stride + x] = res;
+ }
+ }
+ }
+}
+
+// Reference (C) high-bitdepth vertical-only compound convolution.
+// The filtered value is scaled up by 1 << (FILTER_BITS - round_0), rounded by
+// round_1 and offset by round_offset. Depending on conv_params->do_average it
+// is either stored in the intermediate buffer conv_params->dst, or blended
+// with the value already there, un-offset, rounded and written to dst
+// through clip_pixel_highbd().
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ int dst16_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ // Same offset terms that the 2-D compound kernel subtracts at the end.
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ assert(bits >= 0);
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ res *= (1 << bits);
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ // Narrowed to CONV_BUF_TYPE on store.
+ dst16[y * dst16_stride + x] = res;
+ }
+ }
+ }
+}
+
+// Reference (C) high-bitdepth compound "copy" (both subpel offsets zero).
+// Each source sample is shifted left by the total rounding amount and offset
+// by round_offset. Depending on conv_params->do_average the value is either
+// stored in the intermediate buffer conv_params->dst, or blended with the
+// value already there, un-offset, rounded and written to dst through
+// clip_pixel_highbd().
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
+                                            uint16_t *dst, int dst_stride,
+                                            int w, int h,
+                                            ConvolveParams *conv_params,
+                                            int bd) {
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int shift =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  assert(shift >= 0);
+
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const int comp_idx = row * comp_stride + col;
+      // Kept as two steps so each one narrows to CONV_BUF_TYPE.
+      CONV_BUF_TYPE scaled = src[row * src_stride + col] << shift;
+      scaled += round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[comp_idx] = scaled;
+        continue;
+      }
+
+      int32_t blended = comp_buf[comp_idx];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended = blended * conv_params->fwd_offset +
+                  scaled * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended += scaled;
+        blended = blended >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, shift), bd);
+    }
+  }
+}
+
+// Reference (C) high-bitdepth 2-D convolution for scaled references.
+// The source position advances by x_step_qn / y_step_qn per output sample
+// (in 1 / (1 << SCALE_SUBPEL_BITS) sample units), and the filter kernel is
+// re-selected for every sample from the fractional part of that position.
+// Pass 1 filters horizontally into the int16_t scratch buffer im_block;
+// pass 2 filters vertically and handles single-reference, compound-store and
+// compound-average output according to conv_params.
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ // Source rows spanned by h output rows at this step, plus the filter taps.
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ // Rounding still owed after round_0 and round_1 have been applied.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ // Integer part selects the source sample, fractional part the kernel.
+ const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(filter_params_x->taps > 8 ||
+ (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter
+ // Processed column by column; src_vert advances one column per iteration.
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(filter_params_y->taps > 8 ||
+ (0 <= sum && sum < (1 << (offset_bits + 2))));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ // Blend with the value already held in the intermediate buffer.
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ // Store the still-offset result in the intermediate buffer.
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ }
+ }
+ src_vert++;
+ }
+}
+
+// Unscaled high-bitdepth compound prediction: selects the dist_wtd kernel
+// according to which of the two subpel offsets are non-zero.
+static void highbd_convolve_2d_facade_compound(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool filter_horiz = subpel_x_qn != 0;
+  const bool filter_vert = subpel_y_qn != 0;
+
+  if (filter_horiz && filter_vert) {
+    av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
+    return;
+  }
+  if (filter_horiz) {
+    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params,
+                                   bd);
+    return;
+  }
+  if (filter_vert) {
+    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn, conv_params,
+                                   bd);
+    return;
+  }
+  // Full-pel position in both directions.
+  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                       conv_params, bd);
+}
+
+// Unscaled high-bitdepth single-reference prediction: selects the *_sr kernel
+// (or a plain copy) according to which of the two subpel offsets are
+// non-zero.
+static void highbd_convolve_2d_facade_single(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool filter_horiz = subpel_x_qn != 0;
+  const bool filter_vert = subpel_y_qn != 0;
+
+  if (filter_horiz) {
+    if (filter_vert) {
+      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params, bd);
+    } else {
+      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, subpel_x_qn, conv_params, bd);
+    }
+  } else {
+    if (filter_vert) {
+      av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_y, subpel_y_qn, bd);
+    } else {
+      // Full-pel position in both directions: no filtering needed.
+      aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+    }
+  }
+}
+
+// Entry point for high-bitdepth prediction convolution. src8/dst8 are
+// converted with CONVERT_TO_SHORTPTR() before use. Chooses, in this order:
+//   1. the IntraBC 2-tap kernels (when a subpel offset is non-zero),
+//   2. the scaled-reference kernel,
+//   3. the compound kernels,
+//   4. the single-reference kernels.
+// interp_filters[0] is the horizontal filter, interp_filters[1] the vertical.
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+                                   uint8_t *dst8, int dst_stride, int w, int h,
+                                   const InterpFilterParams *interp_filters[2],
+                                   const int subpel_x_qn, int x_step_q4,
+                                   const int subpel_y_qn, int y_step_q4,
+                                   int scaled, ConvolveParams *conv_params,
+                                   int bd) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  const InterpFilterParams *const horiz_params = interp_filters[0];
+  const InterpFilterParams *const vert_params = interp_filters[1];
+
+  // 2-tap filter indicates that it is for IntraBC.
+  if (horiz_params->taps == 2 || vert_params->taps == 2) {
+    assert(horiz_params->taps == 2 && vert_params->taps == 2);
+    assert(!scaled);
+    if (subpel_x_qn != 0) {
+      if (subpel_y_qn != 0) {
+        av1_highbd_convolve_2d_sr_intrabc_c(
+            src, src_stride, dst, dst_stride, w, h, horiz_params, vert_params,
+            subpel_x_qn, subpel_y_qn, conv_params, bd);
+      } else {
+        av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w,
+                                           h, horiz_params, subpel_x_qn,
+                                           conv_params, bd);
+      }
+      return;
+    }
+    if (subpel_y_qn != 0) {
+      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w,
+                                         h, vert_params, subpel_y_qn, bd);
+      return;
+    }
+    // Both offsets are zero: fall through to the generic paths below.
+  }
+
+  if (scaled) {
+    // Compound mode is only valid with a non-NULL intermediate buffer.
+    assert(!conv_params->is_compound || conv_params->dst != NULL);
+    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                                 horiz_params, vert_params, subpel_x_qn,
+                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
+                                 bd);
+    return;
+  }
+
+  if (conv_params->is_compound) {
+    highbd_convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
+                                       horiz_params, vert_params, subpel_x_qn,
+                                       subpel_y_qn, conv_params, bd);
+  } else {
+    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                                     horiz_params, vert_params, subpel_x_qn,
+                                     subpel_y_qn, conv_params, bd);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Note: Fixed-size intermediate buffers place limits on the parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (263):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
+// Dot product of SUBPEL_TAPS consecutive 8-bit samples with a filter kernel.
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int acc = 0;
+  int tap = 0;
+  while (tap < SUBPEL_TAPS) {
+    acc += a[tap] * b[tap];
+    ++tap;
+  }
+  return acc;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Dot product of SUBPEL_TAPS consecutive 16-bit samples with a filter kernel.
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int acc = 0;
+  int tap = 0;
+  while (tap < SUBPEL_TAPS) {
+    acc += a[tap] * b[tap];
+    ++tap;
+  }
+  return acc;
+}
+#endif
+
+// Dot product of a filter kernel with SUBPEL_TAPS 16-bit samples that are
+// a_stride elements apart (i.e. one column of a 2-D buffer).
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int acc = 0;
+  int tap = 0;
+  while (tap < SUBPEL_TAPS) {
+    acc += a[tap * a_stride] * b[tap];
+    ++tap;
+  }
+  return acc;
+}
+
+// Returns the start of the kernel table containing `filter`, found by
+// clearing the low 8 bits of its address.
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  const intptr_t low_bits = (intptr_t)0xFF;
+  const intptr_t addr = (intptr_t)filter;
+  return (const InterpKernel *)(addr & ~low_bits);
+}
+
+// Returns the index of kernel `f` within the kernel table starting at `base`.
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
+  return (int)(kernel - base);
+}
+
+// Horizontal pass of the 8-bit Wiener "add source" convolution.
+// For every output sample it applies the SUBPEL_TAPS-tap kernel selected by
+// the fractional position, adds the centre source sample scaled by
+// 1 << FILTER_BITS plus an offset of 1 << (bd + FILTER_BITS - 1), rounds by
+// round0_bits and clamps to [0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1].
+// Results go to the 16-bit buffer dst.
+static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h,
+ int round0_bits) {
+ const int bd = 8;
+ // Move to the leftmost sample covered by the kernel.
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ // Integer part selects the source sample, fractional part the kernel.
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+ WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// Vertical pass of the 8-bit Wiener "add source" convolution.
+// Reads the 16-bit intermediate buffer src column by column. For every output
+// sample it applies the SUBPEL_TAPS-tap kernel, adds the centre sample scaled
+// by 1 << FILTER_BITS, subtracts 1 << (bd + round1_bits - 1), rounds by
+// round1_bits and writes the result to dst through clip_pixel().
+static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h,
+ int round1_bits) {
+ const int bd = 8;
+ // Move to the topmost row covered by the kernel.
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ // Integer part selects the source row, fractional part the kernel.
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
+ y_q4 += y_step_q4;
+ }
+ // Advance to the next column.
+ ++src;
+ ++dst;
+ }
+}
+
+// Reference (C) 8-bit Wiener "add source" convolution.
+// Runs a horizontal pass from src into the 16-bit scratch buffer temp
+// (stride MAX_SB_SIZE), then a vertical pass from temp into dst.
+//   filter_x, filter_y: pointers to the selected kernels; the kernel table
+//     base and the starting subpel offset are recovered from their addresses
+//     (see get_filter_base / get_filter_offset).
+//   x_step_q4, y_step_q4: source step per output sample; must be <= 32.
+//   w, h: block size; each must be <= MAX_SB_SIZE.
+//   conv_params: supplies round_0 (horizontal) and round_1 (vertical).
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h,
+                                   const WienerConvolveParams *conv_params) {
+  // Validate the limits that bound temp before anything writes to it.
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+  // The horizontal pass fills rows [0, intermediate_height), but the vertical
+  // pass reads SUBPEL_TAPS rows per output sample and so touches row
+  // intermediate_height as well. Zero that whole row. memset() counts bytes,
+  // so the length must be scaled by the element size.
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0,
+         MAX_SB_SIZE * sizeof(*temp));
+
+  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
+                             x_step_q4, w, intermediate_height,
+                             conv_params->round_0);
+  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
+                            y_step_q4, w, h, conv_params->round_1);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Horizontal pass of the high-bitdepth Wiener "add source" convolution.
+// src8 is converted with CONVERT_TO_SHORTPTR(). For every output sample it
+// applies the SUBPEL_TAPS-tap kernel selected by the fractional position,
+// adds the centre source sample scaled by 1 << FILTER_BITS plus an offset of
+// 1 << (bd + FILTER_BITS - 1), rounds by round0_bits and clamps to
+// [0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1].
+static void highbd_convolve_add_src_horiz_hip(
+ const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int round0_bits, int bd) {
+ const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ // Move to the leftmost sample covered by the kernel.
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ // Integer part selects the source sample, fractional part the kernel.
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+ extraprec_clamp_limit - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+// Vertical pass of the high-bitdepth Wiener "add source" convolution.
+// Reads the 16-bit intermediate buffer src column by column. For every output
+// sample it applies the SUBPEL_TAPS-tap kernel, adds the centre sample scaled
+// by 1 << FILTER_BITS, subtracts 1 << (bd + round1_bits - 1), rounds by
+// round1_bits and writes the result through clip_pixel_highbd() to dst8
+// (converted with CONVERT_TO_SHORTPTR()).
+static void highbd_convolve_add_src_vert_hip(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int round1_bits, int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ // Move to the topmost row covered by the kernel.
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ // Integer part selects the source row, fractional part the kernel.
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
+ y_q4 += y_step_q4;
+ }
+ // Advance to the next column.
+ ++src;
+ ++dst;
+ }
+}
+
+// Reference (C) high-bitdepth Wiener "add source" convolution.
+// Runs a horizontal pass from src into the 16-bit scratch buffer temp
+// (stride MAX_SB_SIZE), then a vertical pass from temp into dst.
+// The kernel table base and the starting subpel offset are recovered from
+// the addresses of filter_x / filter_y. Requires w, h <= MAX_SB_SIZE and
+// x_step_q4, y_step_q4 <= 32 (asserted).
+void av1_highbd_wiener_convolve_add_src_c(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+ // Covers every row the vertical pass reads (SUBPEL_TAPS rows per output
+ // sample), so all of them are produced by the horizontal pass.
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+ // NOTE(review): presumably guarantees the horizontal-pass output fits the
+ // 16-bit temp buffer - confirm.
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+
+ highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, MAX_SB_SIZE, filters_x,
+ x0_q4, x_step_q4, w, intermediate_height,
+ conv_params->round_0, bd);
+ highbd_convolve_add_src_vert_hip(
+ temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
+ filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
new file mode 100644
index 0000000000..d6dd8763c3
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
+#include "av1/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Element type of the buffer referenced by ConvolveParams::dst.
+typedef uint16_t CONV_BUF_TYPE;
+// Parameter bundle passed to the convolve functions declared in this header.
+// get_conv_params_no_round() below fills every field except fwd_offset and
+// bck_offset, which it leaves unset.
+// NOTE(review): round_0 / round_1 look like the rounding shifts of the first
+// and second filter stage respectively -- confirm against the convolve
+// implementations.
+typedef struct ConvolveParams {
+ int do_average;
+ CONV_BUF_TYPE *dst;
+ int dst_stride;
+ int round_0;
+ int round_1;
+ int plane;
+ int is_compound;
+ int use_dist_wtd_comp_avg;
+ int fwd_offset;
+ int bck_offset;
+} ConvolveParams;
+
+// Rounding parameters for the Wiener convolve functions; produced by
+// get_conv_params_wiener() below, which keeps round_0 + round_1 equal to
+// 2 * FILTER_BITS.
+typedef struct WienerConvolveParams {
+ int round_0;
+ int round_1;
+} WienerConvolveParams;
+
+// Initial round_0 value used by get_conv_params_no_round().
+#define ROUND0_BITS 3
+// round_1 value used by get_conv_params_no_round() when is_compound is set.
+#define COMPOUND_ROUND1_BITS 7
+// Initial round_0 value used by get_conv_params_wiener().
+#define WIENER_ROUND0_BITS 3
+
+// Evaluates to 1 << (bd + 1 + FILTER_BITS - r0). Both arguments are
+// parenthesized so that expression arguments (e.g. `a - b` for r0) expand
+// with the intended precedence.
+#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - (r0)))
+
+// Signature of a convolve routine operating on 8-bit sample pointers: source
+// and destination planes with their strides, the block size (w x h), one
+// filter description per direction, the sub-pixel positions and the shared
+// ConvolveParams.
+typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params);
+
+// Same as aom_convolve_fn_t but with 16-bit sample pointers and an extra
+// bit-depth argument (bd).
+typedef void (*aom_highbd_convolve_fn_t)(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+struct AV1Common;
+struct scale_factors;
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4, int scaled,
+ ConvolveParams *conv_params);
+
+// Builds the ConvolveParams for one prediction.
+//   cmp_index   - stored as do_average; must be 0 unless is_compound is set.
+//   plane       - stored unchanged.
+//   dst, dst_stride - stored unchanged (see the TODO below).
+//   is_compound - selects COMPOUND_ROUND1_BITS for round_1.
+//   bd          - bit depth; only consulted when CONFIG_AV1_HIGHBITDEPTH is on.
+// fwd_offset and bck_offset are not written by this function.
+static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane,
+                                                      CONV_BUF_TYPE *dst,
+                                                      int dst_stride,
+                                                      int is_compound, int bd) {
+  assert(IMPLIES(cmp_index, is_compound));
+
+  int round_0 = ROUND0_BITS;
+  int round_1 =
+      is_compound ? COMPOUND_ROUND1_BITS : 2 * FILTER_BITS - round_0;
+#if CONFIG_AV1_HIGHBITDEPTH
+  // If the intermediate range would exceed 16 bits, move the excess from the
+  // second rounding stage to the first (non-compound) or just add it to the
+  // first stage (compound).
+  const int intbufrange = bd + FILTER_BITS - round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  const int excess_bits = intbufrange - 16;
+  if (excess_bits > 0) {
+    round_0 += excess_bits;
+    if (!is_compound) round_1 -= excess_bits;
+  }
+#else
+  (void)bd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+  ConvolveParams conv_params;
+  // By default, do_average is 1 if this is the second single prediction in a
+  // compound mode.
+  conv_params.do_average = cmp_index;
+  // TODO(yunqing): The following dst should only be valid while
+  // is_compound = 1;
+  conv_params.dst = dst;
+  conv_params.dst_stride = dst_stride;
+  conv_params.round_0 = round_0;
+  conv_params.round_1 = round_1;
+  conv_params.plane = plane;
+  conv_params.is_compound = is_compound;
+  conv_params.use_dist_wtd_comp_avg = 0;
+  return conv_params;
+}
+
+// Convenience wrapper around get_conv_params_no_round() for the non-compound
+// case: no destination buffer, zero stride, is_compound = 0.
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
+                                             int bd) {
+  CONV_BUF_TYPE *const no_dst = NULL;
+  const int no_dst_stride = 0;
+  const int not_compound = 0;
+  return get_conv_params_no_round(do_average, plane, no_dst, no_dst_stride,
+                                  not_compound, bd);
+}
+
+// Returns the Wiener rounding parameters for bit depth `bd`. Starts from
+// WIENER_ROUND0_BITS and, if the intermediate range would exceed 16 bits,
+// shifts the excess from round_1 to round_0 so their sum stays
+// 2 * FILTER_BITS.
+static INLINE WienerConvolveParams get_conv_params_wiener(int bd) {
+  int round_0 = WIENER_ROUND0_BITS;
+  int round_1 = 2 * FILTER_BITS - round_0;
+
+  const int intbufrange = bd + FILTER_BITS - round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  const int excess_bits = intbufrange - 16;
+  if (excess_bits > 0) {
+    round_0 += excess_bits;
+    round_1 -= excess_bits;
+  }
+
+  WienerConvolveParams conv_params;
+  conv_params.round_0 = round_0;
+  conv_params.round_1 = round_1;
+  return conv_params;
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *interp_filters[2],
+ const int subpel_x_qn, int x_step_q4,
+ const int subpel_y_qn, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ int bd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
new file mode 100644
index 0000000000..7e6160f9a5
--- /dev/null
+++ b/third_party/aom/av1/common/debugmodes.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+// Writes `str` followed by a one-line frame summary (frame number, show_frame
+// flag and base q index taken from `cm`) to stream `f`.
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+ cm->show_frame, cm->quant_params.base_qindex);
+}
+/* Prints one single-byte MB_MODE_INFO member for every entry of the mode-info
+ * grid, one text line per grid row. `member_offset` is the byte offset of the
+ * member inside MB_MODE_INFO; the byte found there is read as a char and
+ * printed as a decimal number. Each line is prefixed with the first character
+ * of `descriptor`.
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_rows = mi_params->mi_rows;
+  const int num_cols = mi_params->mi_cols;
+  // Entries to skip at the end of a row to reach the start of the next one.
+  const int row_gap = mi_params->mi_stride - num_cols;
+  const char row_tag = descriptor[0];
+  MB_MODE_INFO **cursor = mi_params->mi_grid_base;
+
+  log_frame_info(cm, descriptor, file);
+  for (int r = 0; r < num_rows; ++r) {
+    fprintf(file, "%c ", row_tag);
+    for (int c = 0; c < num_cols; ++c, ++cursor) {
+      const char *const member = (const char *)(*cursor) + member_offset;
+      fprintf(file, "%2d ", *member);
+    }
+    fprintf(file, "\n");
+    cursor += row_gap;
+  }
+  fprintf(file, "\n");
+}
+
+// Appends a textual dump of the current frame's mode info to the file named
+// `file`: block sizes, prediction modes, first reference frame, transform
+// sizes, UV modes, skip flags and the first motion vector of every grid
+// entry. Does nothing if the file cannot be opened for appending.
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  FILE *mvs = fopen(file, "a");
+  // fopen() returns NULL on failure; writing to or closing a NULL stream is
+  // undefined behaviour, so bail out early.
+  if (mvs == NULL) return;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  const int rows = mi_params->mi_rows;
+  const int cols = mi_params->mi_cols;
+
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->skip_txfm);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    // Step over the padding between the end of this row and the next one.
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  // The skip loop above advanced `mi`; rewind to the start of the grid.
+  mi = mi_params->mi_grid_base;
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
+
+// Writes the `size` bytes at `data` to the file named `filename` (truncating
+// any existing file) and then overwrites the second byte of the file with 0.
+// Does nothing if the file cannot be opened.
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                         const char *filename) {
+  FILE *hdrFile = fopen(filename, "w");
+  // fopen() returns NULL on failure; do not write through a NULL stream.
+  if (hdrFile == NULL) return;
+  // fwrite() takes (ptr, element size, element count, stream): write `size`
+  // elements of one byte each.
+  fwrite(data, sizeof(uint8_t), size, hdrFile);
+
+  // Reset order hints(7bit + a previous bit) to 0, so that all camera frame
+  // headers are identical in large scale coding.
+  uint8_t zero = 0;
+  fseek(hdrFile, 1, SEEK_SET);
+  // Reset second byte.
+  fwrite(&zero, sizeof(uint8_t), 1, hdrFile);
+  fclose(hdrFile);
+}
+
+// Dumps the FRAME_CONTEXT at `fc` to the file named `filename` as a sequence
+// of space-separated decimal numbers, reading the structure as an array of
+// uint16_t. Does nothing if the file cannot be opened.
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+  FILE *fcFile = fopen(filename, "w");
+  // fopen() returns NULL on failure; do not write through a NULL stream.
+  if (fcFile == NULL) return;
+  // Keep the const qualifier of `fc` when reinterpreting it.
+  const uint16_t *fcp = (const uint16_t *)fc;
+  const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+  unsigned int i;
+
+  for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+  fclose(fcFile);
+}
diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c
new file mode 100644
index 0000000000..97d95ea394
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/scan.h"
+#include "av1/common/token_cdfs.h"
+#include "av1/common/txb_common.h"
+
+// Maps q onto an index in 0..3: q <= 20 -> 0, q <= 60 -> 1, q <= 120 -> 2,
+// anything larger -> 3. The result indexes the default coefficient CDF tables
+// in av1_default_coef_probs().
+static int get_q_ctx(int q) {
+  int ctx = 0;
+  if (q > 20) ++ctx;
+  if (q > 60) ++ctx;
+  if (q > 120) ++ctx;
+  return ctx;
+}
+
+// Loads the default coefficient CDFs into cm->fc, choosing the table set
+// that matches the frame's base q index (see get_q_ctx()).
+void av1_default_coef_probs(AV1_COMMON *cm) {
+  const int q_ctx = get_q_ctx(cm->quant_params.base_qindex);
+  FRAME_CONTEXT *const fc = cm->fc;
+#if CONFIG_ENTROPY_STATS
+  cm->coef_cdf_category = q_ctx;
+#endif
+
+  av1_copy(fc->txb_skip_cdf, av1_default_txb_skip_cdfs[q_ctx]);
+  av1_copy(fc->eob_extra_cdf, av1_default_eob_extra_cdfs[q_ctx]);
+  av1_copy(fc->dc_sign_cdf, av1_default_dc_sign_cdfs[q_ctx]);
+  av1_copy(fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[q_ctx]);
+  av1_copy(fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[q_ctx]);
+  av1_copy(fc->coeff_base_eob_cdf,
+           av1_default_coeff_base_eob_multi_cdfs[q_ctx]);
+  // End-of-block flag CDFs, one table per supported symbol-count variant.
+  av1_copy(fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[q_ctx]);
+  av1_copy(fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[q_ctx]);
+}
+
+// Zeroes element `nsymbs` of each of the `num_cdfs` CDFs stored back to back
+// in `cdf_ptr`, where consecutive CDFs start `cdf_stride` elements apart.
+static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr,
+                                                int num_cdfs, int cdf_stride,
+                                                int nsymbs) {
+  int counter_pos = nsymbs;
+  for (int n = 0; n < num_cdfs; ++n) {
+    cdf_ptr[counter_pos] = 0;
+    counter_pos += cdf_stride;
+  }
+}
+
+// RESET_CDF_COUNTER(cname, nsymbs): zeroes element `nsymbs` of every CDF in
+// the array `cname`, assuming each CDF occupies CDF_SIZE(nsymbs) elements.
+// `cname` must be an actual array (not a pointer) because its total size is
+// taken with sizeof.
+#define RESET_CDF_COUNTER(cname, nsymbs) \
+  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
+
+// Same as RESET_CDF_COUNTER but with an explicit element stride between
+// consecutive CDFs, for arrays whose rows are sized for more symbols than
+// `nsymbs`. Arguments are parenthesized so expression arguments expand with
+// the intended precedence.
+#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride)              \
+  do {                                                                   \
+    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)(cname);                     \
+    int array_size = (int)(sizeof(cname) / sizeof(aom_cdf_prob));        \
+    int num_cdfs = array_size / (cdf_stride);                            \
+    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, (cdf_stride), (nsymbs)); \
+  } while (0)
+
+// Applies RESET_CDF_COUNTER to every CDF array of a motion-vector context:
+// the joint CDF and, for each of the two entries of comps[], the class,
+// fractional, sign, high-precision, class0 and bits CDFs.
+static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) {
+ RESET_CDF_COUNTER(nmv->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
+ }
+}
+
+// Zeroes the trailing counter element of every CDF stored in `fc`.
+// For each CDF array the second macro argument is the number of symbols that
+// CDF codes; the counter lives at index `nsymbs` of each CDF.
+// RESET_CDF_COUNTER_STRIDE is used where the array rows are sized for more
+// symbols than a given row actually uses, so the row stride has to be given
+// separately.
+void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
+ RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
+ RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
+ RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
+ RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
+ RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
+ RESET_CDF_COUNTER(fc->newmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
+ RESET_CDF_COUNTER(fc->refmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->drl_cdf, 2);
+ RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
+ RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
+ RESET_CDF_COUNTER(fc->interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
+ RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
+ RESET_CDF_COUNTER(fc->obmc_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
+ RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
+ // Palette colour-index CDFs: row j codes j + PALETTE_MIN_SIZE symbols but
+ // every row is stored with the stride of the largest palette.
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
+ RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
+ RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
+ RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
+ RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
+ RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
+ RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2);
+ RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
+ reset_nmv_counter(&fc->nmvc);
+ reset_nmv_counter(&fc->ndvc);
+ RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
+ RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
+ RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
+ RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
+ // uv_mode_cdf[0] codes one symbol fewer than uv_mode_cdf[1] while sharing
+ // its row stride.
+ RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
+ CDF_SIZE(UV_INTRA_MODES));
+ RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
+ // Partition CDFs: contexts 0-3 code 4 symbols, 4-15 code 10 and the
+ // remaining ones code 8; all rows are stored with the 10-symbol stride.
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
+ } else if (i < 16) {
+ RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
+ } else {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
+ }
+ }
+ RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
+ RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
+ RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
+ RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
+ RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
+ }
+ // Extended transform-type CDFs: each set codes a different number of
+ // symbols but every row is stored with the TX_TYPES stride. Set 0 of both
+ // arrays is not reset here.
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
+}
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
new file mode 100644
index 0000000000..53ef3b1c89
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TOKEN_CDF_Q_CTXS 4
+
+#define TXB_SKIP_CONTEXTS 13
+
+#define EOB_COEF_CONTEXTS 9
+
+#define SIG_COEF_CONTEXTS_2D 26
+#define SIG_COEF_CONTEXTS_1D 16
+#define SIG_COEF_CONTEXTS_EOB 4
+#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D)
+
+#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS)
+#define DC_SIGN_CONTEXTS 3
+
+#define BR_TMP_OFFSET 12
+#define BR_REF_CAT 4
+#define LEVEL_CONTEXTS 21
+
+#define NUM_BASE_LEVELS 2
+
+#define BR_CDF_SIZE (4)
+#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
+
+#define COEFF_CONTEXT_BITS 3
+#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
+#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
+
+#define BASE_CONTEXT_POSITION_NUM 12
+
+// Transform classes. TX_CLASSES is the number of classes, not a class itself.
+// NOTE(review): UENUM1BYTE presumably declares TX_CLASS as a one-byte
+// unsigned type -- confirm against its definition.
+enum {
+ TX_CLASS_2D = 0,
+ TX_CLASS_HORIZ = 1,
+ TX_CLASS_VERT = 2,
+ TX_CLASSES = 3,
+} UENUM1BYTE(TX_CLASS);
+
+#define DCT_MAX_VALUE 16384
+#define DCT_MAX_VALUE_HIGH10 65536
+#define DCT_MAX_VALUE_HIGH12 262144
+
+/* Coefficients are predicted via a 3-dimensional probability table indexed on
+ * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. */
+#define REF_TYPES 2 // intra=0, inter=1
+
+struct AV1Common;
+struct frame_contexts;
+void av1_reset_cdf_symbol_counters(struct frame_contexts *fc);
+void av1_default_coef_probs(struct AV1Common *cm);
+void av1_init_mode_probs(struct frame_contexts *fc);
+
+struct frame_contexts;
+
+typedef char ENTROPY_CONTEXT;
+
+// Returns how many of the two contexts are non-zero (0, 1 or 2).
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  const int a_is_set = (a != 0);
+  const int b_is_set = (b != 0);
+  return a_is_set + b_is_set;
+}
+
+// Returns the combined entropy context (0, 1 or 2) for a transform block of
+// size `tx_size`. `a` and `l` point at the above and left context arrays.
+// For each direction, the number of context bytes examined grows with the
+// transform dimension: 1 byte for 4, 2 for 8, 4 for 16, 8 for 32 and 16 for
+// 64. A direction counts as set if any of its examined bytes is non-zero.
+// Multi-byte spans are tested with a single wide load through a cast pointer.
+// NOTE(review): the wide loads assume `a` and `l` are suitably aligned and
+// that at least that many bytes are readable -- confirm against the callers.
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+
+ switch (tx_size) {
+ case TX_4X4:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
+ break;
+ case TX_4X8:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_8X4:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X16:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X8:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X32:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X16:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_8X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_32X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_64X64:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_32X64:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_64X32:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_4X16:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X4:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X32:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X8:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X64:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_64X16:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ // Unknown sizes trip the assert in debug builds and otherwise fall
+ // through with both contexts left at 0.
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return combine_entropy_contexts(above_ec, left_ec);
+}
+
+// Returns the rounded-up average of txsize_sqr_map[txsize] and
+// txsize_sqr_up_map[txsize], cast back to TX_SIZE.
+static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
+  const int sqr = txsize_sqr_map[txsize];
+  const int sqr_up = txsize_sqr_up_map[txsize];
+  const int rounded_mean = (sqr + sqr_up + 1) >> 1;
+  return (TX_SIZE)rounded_mean;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c
new file mode 100644
index 0000000000..8381c1fdd0
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.c
@@ -0,0 +1,1094 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+
+static const aom_cdf_prob
+ default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE(
+ INTRA_MODES)] = {
+ { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
+ 24189, 28165, 29093, 30466) },
+ { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032,
+ 24434, 28658, 30172, 31409) },
+ { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620,
+ 26160, 29336, 29929, 31567) },
+ { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096,
+ 24746, 29585, 30958, 32462) },
+ { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583,
+ 26437, 30261, 31073, 32475) } },
+ { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023,
+ 25381, 29014, 30482, 31436) },
+ { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423,
+ 27610, 29905, 31276, 31794) },
+ { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405,
+ 24469, 27915, 29090, 30492) },
+ { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825,
+ 24649, 29153, 31096, 32210) },
+ { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516,
+ 26001, 29675, 30981, 31994) } },
+ { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055,
+ 25729, 29538, 30305, 32077) },
+ { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062,
+ 23219, 27743, 29211, 30907) },
+ { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555,
+ 30467, 30794, 32086) },
+ { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523,
+ 23878, 28975, 30287, 32252) },
+ { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561,
+ 30072, 30737, 32463) } },
+ { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419,
+ 25060, 29696, 30917, 32409) },
+ { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468,
+ 25225, 29485, 31158, 32342) },
+ { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605,
+ 29118, 30078, 32018) },
+ { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743,
+ 30389, 31536, 32528) },
+ { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718,
+ 25769, 29953, 30983, 32485) } },
+ { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449,
+ 26219, 30214, 31150, 32477) },
+ { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236,
+ 25380, 29653, 31143, 32277) },
+ { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466,
+ 29900, 30523, 32261) },
+ { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753,
+ 24615, 29489, 30883, 32482) },
+ { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180,
+ 31355, 31802, 32593) } }
+ };
+
+// Default angle-delta CDFs: one row per directional mode, each row sized for
+// 2 * MAX_ANGLE_DELTA + 1 symbols and initialised with AOM_CDF7.
+static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE(
+ 2 * MAX_ANGLE_DELTA + 1)] = {
+ { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) },
+ { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) },
+ { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) },
+ { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) },
+ { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) },
+ { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) },
+ { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) },
+ { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) }
+};
+
+// Default luma intra-mode CDFs, one row per block-size group, each row sized
+// for INTRA_MODES symbols and initialised with AOM_CDF13.
+// NOTE(review): the "if" in the name presumably means inter frames (there is
+// a separate default_kf_y_mode_cdf table) -- confirm.
+static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) } };
+
+static const aom_cdf_prob
+ default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE(
+ UV_INTRA_MODES)] = {
+ { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923,
+ 28244, 30059, 30941, 31961) },
+ { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824,
+ 28359, 29505, 29800, 31796) },
+ { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854,
+ 30764, 31777, 32029) },
+ { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148,
+ 28577, 30612, 31355, 32493) },
+ { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243,
+ 31101, 31744, 32363) },
+ { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458,
+ 29711, 31161, 31441, 32550) },
+ { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200,
+ 30245, 31837, 32342, 32667) },
+ { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128,
+ 29267, 30643, 31961, 32461) },
+ { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273,
+ 28443, 30388, 30767, 32416) },
+ { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719,
+ 23174, 28861, 30379, 32175) },
+ { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119,
+ 23527, 27053, 31397, 32148) },
+ { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907,
+ 22482, 25896, 26541, 31819) },
+ { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166,
+ 15255, 15753, 16039, 16606) } },
+ { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656,
+ 15986, 20086, 20995, 22455, 24212) },
+ { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451,
+ 22099, 24228, 24693, 27032, 29472) },
+ { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774,
+ 23138, 24256, 24703, 26679) },
+ { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371,
+ 21520, 22206, 23389, 24182) },
+ { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411,
+ 24911, 25380, 26027, 26376) },
+ { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981,
+ 24780, 25386, 26517, 27176) },
+ { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803,
+ 23188, 23763, 24455, 24940) },
+ { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059,
+ 22336, 23204, 23964, 24793) },
+ { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898,
+ 22494, 23139, 24764, 25989) },
+ { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004,
+ 15534, 20714, 21789, 23443, 24861) },
+ { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235,
+ 15902, 20102, 22696, 23774, 25838) },
+ { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163,
+ 15636, 19676, 20474, 23519, 25208) },
+ { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248,
+ 9875, 10521, 29048) } }
+ };
+
+// Default partition CDFs, one row per partition context. Every row is sized
+// for EXT_PARTITION_TYPES symbols, but the rows use three alphabets: the
+// first 4 rows are 4-symbol CDFs, the next 12 rows are 10-symbol CDFs and the
+// last 4 rows are 8-symbol CDFs.
+static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(
+ EXT_PARTITION_TYPES)] = {
+ { AOM_CDF4(19132, 25510, 30392) },
+ { AOM_CDF4(13928, 19855, 28540) },
+ { AOM_CDF4(12522, 23679, 28629) },
+ { AOM_CDF4(9896, 18783, 25853) },
+ { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) },
+ { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) },
+ { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) },
+ { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) },
+ { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) },
+ { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) },
+ { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) },
+ { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) },
+ { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) },
+ { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) },
+ { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) },
+ { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) },
+ { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) },
+};
+
+// Default intra extended-transform-type CDFs, indexed by
+// [EXT_TX_SETS_INTRA set][EXT_TX_SIZES size class][INTRA_MODES mode].
+// av1_init_mode_probs() copies the whole table into fc->intra_ext_tx_cdf.
+// Set 0 is zero-filled, set 1 rows are built with AOM_CDF7 and set 2 rows
+// with AOM_CDF5. Size classes whose rows all repeat the same evenly spaced
+// values look like untrained placeholders (presumably set/size pairs that are
+// never coded) -- NOTE(review): confirm against the tx-set selection logic.
+static const aom_cdf_prob default_intra_ext_tx_cdf
+ [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
+ {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ },
+ {
+ {
+ { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) },
+ { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) },
+ { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) },
+ { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) },
+ { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) },
+ { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) },
+ { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) },
+ { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) },
+ { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) },
+ { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) },
+ { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) },
+ { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) },
+ { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) },
+ },
+ {
+ { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) },
+ { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) },
+ { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) },
+ { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) },
+ { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) },
+ { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) },
+ { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) },
+ { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) },
+ { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) },
+ { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) },
+ { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) },
+ { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) },
+ { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ },
+ {
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(1127, 12814, 22772, 27483) },
+ { AOM_CDF5(145, 6761, 11980, 26667) },
+ { AOM_CDF5(362, 5887, 11678, 16725) },
+ { AOM_CDF5(385, 15213, 18587, 30693) },
+ { AOM_CDF5(25, 2914, 23134, 27903) },
+ { AOM_CDF5(60, 4470, 11749, 23991) },
+ { AOM_CDF5(37, 3332, 14511, 21448) },
+ { AOM_CDF5(157, 6320, 13036, 17439) },
+ { AOM_CDF5(119, 6719, 12906, 29396) },
+ { AOM_CDF5(47, 5537, 12576, 21499) },
+ { AOM_CDF5(269, 6076, 11258, 23115) },
+ { AOM_CDF5(83, 5615, 12001, 17228) },
+ { AOM_CDF5(1968, 5556, 12023, 18547) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ },
+ };
+
+// Default inter extended-transform-type CDFs, indexed by
+// [EXT_TX_SETS_INTER set][EXT_TX_SIZES size class]; copied into
+// fc->inter_ext_tx_cdf by av1_init_mode_probs().
+// Set 0 is zero-filled; sets 1, 2 and 3 hold AOM_CDF16, AOM_CDF12 and
+// AOM_CDF2 rows respectively.
+static const aom_cdf_prob
+ default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
+ TX_TYPES)] = {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504,
+ 22848, 23934, 25474, 27727, 28915, 30631) },
+ { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674,
+ 20408, 22517, 25010, 27116, 28856, 30749) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ },
+ {
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595,
+ 28526, 30529) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(4167) },
+ { AOM_CDF2(1998) },
+ { AOM_CDF2(748) },
+ },
+ };
+
+// Default CDF over the CFL_JOINT_SIGNS joint-sign symbols (a single
+// context); copied into fc->cfl_sign_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
+ AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294)
+};
+
+// Default CFL alpha CDFs: one AOM_CDF16 row per CFL_ALPHA_CONTEXTS context;
+// copied into fc->cfl_alpha_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
+ { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700,
+ 32704, 32708, 32712, 32716, 32720, 32724) },
+ { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620,
+ 32647, 32668, 32672, 32676, 32680, 32684) },
+ { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673,
+ 32677, 32681, 32685, 32689, 32693, 32697) },
+ { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708,
+ 32712, 32716, 32720, 32724, 32728, 32732) },
+ { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394,
+ 32464, 32516, 32560, 32576, 32593, 32622) },
+ { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144,
+ 32413, 32520, 32594, 32622, 32656, 32660) }
+ };
+
+// Default switchable interpolation-filter CDFs: one AOM_CDF3 row per
+// SWITCHABLE_FILTER_CONTEXTS context; copied into fc->switchable_interp_cdf
+// by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+ SWITCHABLE_FILTERS)] = {
+ { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) },
+ { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) },
+ { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) },
+ { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) },
+ { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) },
+ { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) },
+ { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) },
+ { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) }
+ };
+
+// Default binary CDFs for the new-MV decision, one per NEWMV_MODE_CONTEXTS
+// context; copied into fc->newmv_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+ { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
+
+// Default binary CDFs for the zero/global-MV decision, one per
+// GLOBALMV_MODE_CONTEXTS context; copied into fc->zeromv_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
+
+// Default binary CDFs for the ref-MV decision, one per REFMV_MODE_CONTEXTS
+// context; copied into fc->refmv_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+ { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+
+// Default binary DRL CDFs, one per DRL_MODE_CONTEXTS context; copied into
+// fc->drl_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
+};
+
+// Default compound inter-mode CDFs: one AOM_CDF8 row per INTER_MODE_CONTEXTS
+// context; copied into fc->inter_compound_mode_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
+ INTER_COMPOUND_MODES)] = {
+ { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) }
+ };
+
+// Default binary inter-intra CDFs, one per BLOCK_SIZE_GROUPS group; copied
+// into fc->interintra_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(16384) },
+ { AOM_CDF2(26887) },
+ { AOM_CDF2(27597) },
+ { AOM_CDF2(30237) } };
+
+// Default inter-intra mode CDFs: one AOM_CDF4 row per BLOCK_SIZE_GROUPS
+// group; copied into fc->interintra_mode_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(1875, 11082, 27332) },
+ { AOM_CDF4(2473, 9996, 26388) },
+ { AOM_CDF4(4238, 11537, 25926) } };
+
+// Default binary wedge inter-intra CDFs, one per block size
+// (BLOCK_SIZES_ALL entries); copied into fc->wedge_interintra_cdf by
+// av1_init_mode_probs(). Entries left at 16384 look like untrained
+// placeholders -- NOTE(review): presumably block sizes where the symbol is
+// never coded; confirm.
+static const aom_cdf_prob
+ default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) },
+ { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) },
+ { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+ };
+
+// Default masked-compound-type CDFs, one AOM_CDF2 row per block size
+// (BLOCK_SIZES_ALL entries); copied into fc->compound_type_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MASKED_COMPOUND_TYPES)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+ { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
+ { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+};
+
+// Default 16-symbol wedge-index CDFs, one AOM_CDF16 row per block size
+// (BLOCK_SIZES_ALL entries); copied into fc->wedge_idx_cdf by
+// av1_init_mode_probs(). Rows stepping evenly by 2048 look like untrained
+// placeholders -- NOTE(review): presumably block sizes without wedge
+// support; confirm.
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+ 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+ { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323,
+ 17367, 18452, 19422, 22839, 26127, 29629) },
+ { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939,
+ 21332, 24520, 27470, 29456, 30529, 31656) },
+ { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+ 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+ { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369,
+ 16730, 18114, 19313, 22521, 26012, 29550) },
+ { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+ 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+ { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+ 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703,
+ 24284, 24985, 25684, 27259, 28883, 30911) },
+ { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935,
+ 25057, 27251, 29173, 30089, 30960, 31933) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384,
+ 18432, 20480, 22528, 24576, 26624, 28672, 30720) } };
+
+// Default motion-mode CDFs: one AOM_CDF3 row per block size
+// (BLOCK_SIZES_ALL entries); copied into fc->motion_mode_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) },
+ { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) },
+ { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) },
+ { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) },
+ { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) },
+ { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) },
+ { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) },
+ { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } };
+
+// Default binary OBMC CDFs, one per block size (BLOCK_SIZES_ALL entries);
+// copied into fc->obmc_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) },
+ { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) },
+ { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) },
+ { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) },
+ { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) },
+ { AOM_CDF2(26879) }
+};
+
+// Default binary intra/inter CDFs, one per INTRA_INTER_CONTEXTS context;
+// copied into fc->intra_inter_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(806) },
+ { AOM_CDF2(16662) },
+ { AOM_CDF2(20186) },
+ { AOM_CDF2(26538) }
+ };
+
+// Default binary compound-inter CDFs, one per COMP_INTER_CONTEXTS context;
+// copied into fc->comp_inter_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(26828) },
+ { AOM_CDF2(24035) },
+ { AOM_CDF2(12031) },
+ { AOM_CDF2(10640) },
+ { AOM_CDF2(2901) } };
+
+// Default binary compound-reference-type CDFs, one per
+// COMP_REF_TYPE_CONTEXTS context; copied into fc->comp_ref_type_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(1198) },
+ { AOM_CDF2(2070) },
+ { AOM_CDF2(9166) },
+ { AOM_CDF2(7499) },
+ { AOM_CDF2(22475) }
+ };
+
+// Default binary unidirectional-compound reference CDFs, indexed by
+// [UNI_COMP_REF_CONTEXTS context][UNIDIR_COMP_REFS - 1 decision]; copied into
+// fc->uni_comp_ref_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS -
+ 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } },
+ { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } },
+ { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } }
+ };
+
+// Default binary single-reference CDFs, indexed by
+// [REF_CONTEXTS context][SINGLE_REFS - 1 decision]; copied into
+// fc->single_ref_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1]
+ [CDF_SIZE(2)] = {
+ { { AOM_CDF2(4897) },
+ { AOM_CDF2(1555) },
+ { AOM_CDF2(4236) },
+ { AOM_CDF2(8650) },
+ { AOM_CDF2(904) },
+ { AOM_CDF2(1444) } },
+ { { AOM_CDF2(16973) },
+ { AOM_CDF2(16751) },
+ { AOM_CDF2(19647) },
+ { AOM_CDF2(24773) },
+ { AOM_CDF2(11014) },
+ { AOM_CDF2(15087) } },
+ { { AOM_CDF2(29744) },
+ { AOM_CDF2(30279) },
+ { AOM_CDF2(31194) },
+ { AOM_CDF2(31895) },
+ { AOM_CDF2(26875) },
+ { AOM_CDF2(30304) } }
+ };
+
+// Default binary compound forward-reference CDFs, indexed by
+// [REF_CONTEXTS context][FWD_REFS - 1 decision]; copied into
+// fc->comp_ref_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } },
+ { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } },
+ { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } }
+ };
+
+// Default binary compound backward-reference CDFs, indexed by
+// [REF_CONTEXTS context][BWD_REFS - 1 decision]; copied into
+// fc->comp_bwdref_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } },
+ { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } },
+ { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } }
+ };
+
+// Default luma palette-size CDFs: one AOM_CDF7 row per PALATTE_BSIZE_CTXS
+// block-size context; copied into fc->palette_y_size_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob
+ default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) },
+ { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) },
+ { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) },
+ { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) },
+ { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) },
+ { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) },
+ { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) }
+ };
+
+// Default chroma palette-size CDFs: one AOM_CDF7 row per PALATTE_BSIZE_CTXS
+// block-size context; copied into fc->palette_uv_size_cdf by
+// av1_init_mode_probs().
+static const aom_cdf_prob
+ default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) },
+ { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) },
+ { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) },
+ { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) },
+ { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) },
+ { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) },
+ { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) }
+ };
+
+// Default binary luma palette-mode CDFs, indexed by
+// [PALATTE_BSIZE_CTXS block-size context][PALETTE_Y_MODE_CONTEXTS context];
+// copied into fc->palette_y_mode_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_palette_y_mode_cdf
+ [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } },
+ { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } },
+ { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } },
+ { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } },
+ { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } },
+ { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } },
+ { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } }
+ };
+
+// Default binary chroma palette-mode CDFs, one per PALETTE_UV_MODE_CONTEXTS
+// context; copied into fc->palette_uv_mode_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(32461) }, { AOM_CDF2(21488) }
+ };
+
+// Default luma palette color-index CDFs, indexed by
+// [PALETTE_SIZES size index][PALETTE_COLOR_INDEX_CONTEXTS context]; copied
+// into fc->palette_y_color_index_cdf by av1_init_mode_probs().
+// Successive size groups use AOM_CDF2 through AOM_CDF8, i.e. one more symbol
+// per group.
+static const aom_cdf_prob default_palette_y_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(28710) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10553) },
+ { AOM_CDF2(27036) },
+ { AOM_CDF2(31603) },
+ },
+ {
+ { AOM_CDF3(27877, 30490) },
+ { AOM_CDF3(11532, 25697) },
+ { AOM_CDF3(6544, 30234) },
+ { AOM_CDF3(23018, 28072) },
+ { AOM_CDF3(31915, 32385) },
+ },
+ {
+ { AOM_CDF4(25572, 28046, 30045) },
+ { AOM_CDF4(9478, 21590, 27256) },
+ { AOM_CDF4(7248, 26837, 29824) },
+ { AOM_CDF4(19167, 24486, 28349) },
+ { AOM_CDF4(31400, 31825, 32250) },
+ },
+ {
+ { AOM_CDF5(24779, 26955, 28576, 30282) },
+ { AOM_CDF5(8669, 20364, 24073, 28093) },
+ { AOM_CDF5(4255, 27565, 29377, 31067) },
+ { AOM_CDF5(19864, 23674, 26716, 29530) },
+ { AOM_CDF5(31646, 31893, 32147, 32426) },
+ },
+ {
+ { AOM_CDF6(23132, 25407, 26970, 28435, 30073) },
+ { AOM_CDF6(7443, 17242, 20717, 24762, 27982) },
+ { AOM_CDF6(6300, 24862, 26944, 28784, 30671) },
+ { AOM_CDF6(18916, 22895, 25267, 27435, 29652) },
+ { AOM_CDF6(31270, 31550, 31808, 32059, 32353) },
+ },
+ {
+ { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) },
+ { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) },
+ { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) },
+ { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) },
+ { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) },
+ },
+ {
+ { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ };
+
+// Default chroma palette color-index CDFs, indexed by
+// [PALETTE_SIZES size index][PALETTE_COLOR_INDEX_CONTEXTS context]; copied
+// into fc->palette_uv_color_index_cdf by av1_init_mode_probs().
+// Successive size groups use AOM_CDF2 through AOM_CDF8, i.e. one more symbol
+// per group.
+static const aom_cdf_prob default_palette_uv_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(29089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(8713) },
+ { AOM_CDF2(29257) },
+ { AOM_CDF2(31610) },
+ },
+ {
+ { AOM_CDF3(25257, 29145) },
+ { AOM_CDF3(12287, 27293) },
+ { AOM_CDF3(7033, 27960) },
+ { AOM_CDF3(20145, 25405) },
+ { AOM_CDF3(30608, 31639) },
+ },
+ {
+ { AOM_CDF4(24210, 27175, 29903) },
+ { AOM_CDF4(9888, 22386, 27214) },
+ { AOM_CDF4(5901, 26053, 29293) },
+ { AOM_CDF4(18318, 22152, 28333) },
+ { AOM_CDF4(30459, 31136, 31926) },
+ },
+ {
+ { AOM_CDF5(22980, 25479, 27781, 29986) },
+ { AOM_CDF5(8413, 21408, 24859, 28874) },
+ { AOM_CDF5(2257, 29449, 30594, 31598) },
+ { AOM_CDF5(19189, 21202, 25915, 28620) },
+ { AOM_CDF5(31844, 32044, 32281, 32518) },
+ },
+ {
+ { AOM_CDF6(22217, 24567, 26637, 28683, 30548) },
+ { AOM_CDF6(7307, 16406, 19636, 24632, 28424) },
+ { AOM_CDF6(4441, 25064, 26879, 28942, 30919) },
+ { AOM_CDF6(17210, 20528, 23319, 26750, 29582) },
+ { AOM_CDF6(30674, 30953, 31396, 31735, 32207) },
+ },
+ {
+ { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) },
+ { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) },
+ { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) },
+ { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) },
+ { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) },
+ },
+ {
+ { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
+ };
+
+// Default binary transform-partition CDFs, one per TXFM_PARTITION_CONTEXTS
+// context; copied into fc->txfm_partition_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) },
+ { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) },
+ { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) },
+ { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) },
+ { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) },
+ { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) },
+ { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
+ };
+
+// Default binary skip-transform CDFs, one per SKIP_CONTEXTS context; copied
+// into fc->skip_txfm_cdfs by av1_init_mode_probs().
+static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
+};
+
+// Default binary skip-mode CDFs, one per SKIP_MODE_CONTEXTS context; copied
+// into fc->skip_mode_cdfs by av1_init_mode_probs().
+static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } };
+
+// Default binary compound-index CDFs, one per COMP_INDEX_CONTEXTS context;
+// copied into fc->compound_index_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) },
+ { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) }
+ };
+
+// Default binary compound-group-index CDFs, one per COMP_GROUP_IDX_CONTEXTS
+// context; copied into fc->comp_group_idx_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) },
+ { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) }
+ };
+
+// Default binary intra-block-copy CDF (a single context); copied into
+// fc->intrabc_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 30531) };
+
+// Default CDF over the FILTER_INTRA_MODES filter-intra modes (a single
+// context); copied into fc->filter_intra_mode_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE(
+ FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) };
+
+// Default binary filter-intra CDFs, one per block size (BLOCK_SIZES_ALL
+// entries); copied into fc->filter_intra_cdfs by av1_init_mode_probs().
+static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(
+ 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) },
+ { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) },
+ { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) },
+ { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) },
+ { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } };
+
+// Default CDF over the RESTORE_SWITCHABLE_TYPES restoration types (a single
+// context); copied into fc->switchable_restore_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE(
+ RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) };
+
+// Default binary Wiener-restoration CDF (a single context); copied into
+// fc->wiener_restore_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 11570) };
+
+// Default binary sgrproj-restoration CDF (a single context); copied into
+// fc->sgrproj_restore_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 16855) };
+
+// Default delta-q CDF over DELTA_Q_PROBS + 1 symbols (a single context);
+// copied into fc->delta_q_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+// Default delta-lf CDFs used when a separate delta is kept per loop-filter
+// slot: FRAME_LF_COUNT identical AOM_CDF4 rows; copied into
+// fc->delta_lf_multi_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(
+ DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) } };
+// Default delta-lf CDF over DELTA_LF_PROBS + 1 symbols (a single context);
+// copied into fc->delta_lf_cdf by av1_init_mode_probs().
+static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+// Default binary segment temporal-prediction CDFs, one per
+// SEG_TEMPORAL_PRED_CTXS context, all initialised to 128 * 128 (= 16384);
+// copied into fc->seg.pred_cdf by av1_init_mode_probs().
+static const aom_cdf_prob
+ default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = {
+ { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }
+ };
+
+// Default spatially predicted segment-id CDFs: one AOM_CDF8 row per
+// SPATIAL_PREDICTION_PROBS context. av1_init_mode_probs() copies the rows
+// one at a time into fc->seg.spatial_pred_seg_cdf[i].
+static const aom_cdf_prob
+ default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE(
+ MAX_SEGMENTS)] = {
+ {
+ AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533),
+ },
+ {
+ AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344),
+ },
+ {
+ AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679),
+ },
+ };
+
+// Default transform-size CDFs, indexed by
+// [MAX_TX_CATS category][TX_SIZE_CONTEXTS context]; copied into
+// fc->tx_size_cdf by av1_init_mode_probs().
+// Category 0 rows are binary (AOM_CDF2); the remaining categories use
+// three-symbol AOM_CDF3 rows.
+static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)] = {
+ { { AOM_CDF2(19968) },
+ { AOM_CDF2(19968) },
+ { AOM_CDF2(24320) } },
+ { { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(18677, 30848) } },
+ { { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(24302, 25602) } },
+ { { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(16803, 22759) } },
+ };
+
+// Maps the neighbor-score hash computed in
+// av1_get_palette_color_index_context() to a palette color-index context.
+// Only hash values 2, 5, 6, 7 and 8 correspond to a context (0, 4, 3, 2 and 1
+// respectively); every other slot holds -1.
+// Negative values are invalid
+const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1] = {
+ -1, -1, 0, -1, -1, 4, 3, 2, 1
+};
+
+// Derives the entropy-coding context for the palette color index at (r, c).
+//
+// The left, above-left and above neighbors in 'color_map' (row pitch
+// 'stride') vote for their colors with weights 2, 1 and 2. The colors are then
+// partially sorted by vote so that the first NUM_PALETTE_NEIGHBORS entries of
+// 'color_order' hold the most-voted colors, and the sorted votes are hashed
+// into an index of av1_palette_color_index_context_lookup.
+//
+//   color_map     palette index map; (r, c) and its available neighbors are
+//                 read.
+//   r, c          position of the current sample; must not both be 0.
+//   palette_size  number of colors in use, at most PALETTE_MAX_SIZE.
+//   color_order   output; PALETTE_MAX_SIZE entries are written, entry k being
+//                 the color placed at rank k.
+//   color_idx     optional output (may be NULL); receives the rank of the
+//                 color stored at (r, c).
+//
+// Returns the context, in [0, PALETTE_COLOR_INDEX_CONTEXTS).
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+                                        int r, int c, int palette_size,
+                                        uint8_t *color_order, int *color_idx) {
+  assert(palette_size <= PALETTE_MAX_SIZE);
+  assert(r > 0 || c > 0);
+
+  const int has_left = c > 0;
+  const int has_above = r > 0;
+
+  // Colors of the left, above-left and above neighbors; -1 marks a neighbor
+  // that lies outside the block.
+  int neighbor_color[NUM_PALETTE_NEIGHBORS];
+  neighbor_color[0] = -1;
+  neighbor_color[1] = -1;
+  neighbor_color[2] = -1;
+  if (has_left) {
+    neighbor_color[0] = color_map[r * stride + c - 1];
+  }
+  if (has_left && has_above) {
+    neighbor_color[1] = color_map[(r - 1) * stride + c - 1];
+  }
+  if (has_above) {
+    neighbor_color[2] = color_map[(r - 1) * stride + c];
+  }
+
+  // The +10 below should not be needed. But we get a warning "array subscript
+  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+  int scores[PALETTE_MAX_SIZE + 10] = { 0 };
+  static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
+  for (int n = 0; n < NUM_PALETTE_NEIGHBORS; ++n) {
+    const int color = neighbor_color[n];
+    if (color < 0) continue;
+    scores[color] += weights[n];
+  }
+
+  // Start from the identity ordering; inverse_color_order[color] tracks the
+  // rank currently held by 'color'.
+  int inverse_color_order[PALETTE_MAX_SIZE];
+  for (int color = 0; color < PALETTE_MAX_SIZE; ++color) {
+    inverse_color_order[color] = color;
+    color_order[color] = color;
+  }
+
+  // Partial selection sort: bring the NUM_PALETTE_NEIGHBORS highest scores to
+  // the front (largest first), keeping the relative order of everything else.
+  for (int rank = 0; rank < NUM_PALETTE_NEIGHBORS; ++rank) {
+    int best_pos = rank;
+    for (int cand = rank + 1; cand < palette_size; ++cand) {
+      if (scores[cand] > scores[best_pos]) best_pos = cand;
+    }
+    if (best_pos == rank) continue;
+
+    // Rotate the winner into 'rank', shifting entries rank..best_pos-1 down
+    // by one slot.
+    const int best_score = scores[best_pos];
+    const uint8_t best_color = color_order[best_pos];
+    for (int pos = best_pos; pos > rank; --pos) {
+      scores[pos] = scores[pos - 1];
+      color_order[pos] = color_order[pos - 1];
+      inverse_color_order[color_order[pos]] = pos;
+    }
+    scores[rank] = best_score;
+    color_order[rank] = best_color;
+    inverse_color_order[best_color] = rank;
+  }
+
+  if (color_idx != NULL) {
+    *color_idx = inverse_color_order[color_map[r * stride + c]];
+  }
+
+  // Fold the sorted top scores into a hash identifying the context.
+  static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+  int color_index_ctx_hash = 0;
+  for (int n = 0; n < NUM_PALETTE_NEIGHBORS; ++n) {
+    color_index_ctx_hash += scores[n] * hash_multipliers[n];
+  }
+  assert(color_index_ctx_hash > 0);
+  assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+  // Translate the hash into the context number.
+  const int color_index_ctx =
+      av1_palette_color_index_context_lookup[color_index_ctx_hash];
+  assert(color_index_ctx >= 0);
+  assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+  return color_index_ctx;
+}
+
+// Loads every mode-related CDF in 'fc' with the compile-time default tables
+// defined in this file. The copies are independent of one another; they are
+// grouped by coding tool for readability only.
+void av1_init_mode_probs(FRAME_CONTEXT *fc) {
+  // Block partitioning and block-level flags.
+  av1_copy(fc->partition_cdf, default_partition_cdf);
+  av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
+  av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs);
+  av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
+  av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
+
+  // Intra prediction modes.
+  av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
+  av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
+  av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
+  av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
+  av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
+  av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
+  av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
+  av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
+
+  // Palette.
+  av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
+  av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
+  av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
+  av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
+  av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
+  av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
+
+  // Inter prediction modes and interpolation filter.
+  av1_copy(fc->newmv_cdf, default_newmv_cdf);
+  av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
+  av1_copy(fc->refmv_cdf, default_refmv_cdf);
+  av1_copy(fc->drl_cdf, default_drl_cdf);
+  av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
+  av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
+
+  // Reference frame selection.
+  av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
+  av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
+  av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
+  av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
+  av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
+  av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
+
+  // Compound prediction, inter-intra and motion modes.
+  av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
+  av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
+  av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
+  av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
+  av1_copy(fc->interintra_cdf, default_interintra_cdf);
+  av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
+  av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
+  av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
+  av1_copy(fc->obmc_cdf, default_obmc_cdf);
+
+  // Transform size and type.
+  av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
+  av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
+  av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
+  av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
+
+  // Segmentation; the spatial-prediction table is copied one row at a time.
+  av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
+  for (int ctx = 0; ctx < SPATIAL_PREDICTION_PROBS; ctx++) {
+    av1_copy(fc->seg.spatial_pred_seg_cdf[ctx],
+             default_spatial_pred_seg_tree_cdf[ctx]);
+  }
+
+  // Loop restoration.
+  av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
+  av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
+  av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
+
+  // Delta quantizer and delta loop-filter levels.
+  av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
+  av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
+  av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
+}
+
+// Fills 'ref_deltas' (indexed by reference frame) with the default deltas:
+// +1 for INTRA_FRAME, 0 for LAST/LAST2/LAST3/BWDREF and -1 for
+// GOLDEN/ALTREF2/ALTREF. 'ref_deltas' must be non-NULL.
+void av1_set_default_ref_deltas(int8_t *ref_deltas) {
+  assert(ref_deltas != NULL);
+
+  const int8_t last_delta = 0;
+  const int8_t golden_alt_delta = -1;
+
+  ref_deltas[INTRA_FRAME] = 1;
+
+  // LAST2, LAST3 and BWDREF share LAST_FRAME's delta.
+  ref_deltas[LAST_FRAME] = last_delta;
+  ref_deltas[LAST2_FRAME] = last_delta;
+  ref_deltas[LAST3_FRAME] = last_delta;
+  ref_deltas[BWDREF_FRAME] = last_delta;
+
+  ref_deltas[GOLDEN_FRAME] = golden_alt_delta;
+  ref_deltas[ALTREF2_FRAME] = golden_alt_delta;
+  ref_deltas[ALTREF_FRAME] = golden_alt_delta;
+}
+
+// Resets both mode deltas to their default value of zero. 'mode_deltas' must
+// be non-NULL and have room for two entries.
+void av1_set_default_mode_deltas(int8_t *mode_deltas) {
+  assert(mode_deltas != NULL);
+
+  for (int idx = 0; idx < 2; ++idx) {
+    mode_deltas[idx] = 0;
+  }
+}
+
+// Puts the loop filter's delta state back to its defaults: default ref and
+// mode deltas are loaded, and both the "delta enabled" and "delta update"
+// flags are set.
+static void set_default_lf_deltas(struct loopfilter *lf) {
+  av1_set_default_ref_deltas(lf->ref_deltas);
+  av1_set_default_mode_deltas(lf->mode_deltas);
+
+  lf->mode_ref_delta_update = 1;
+  lf->mode_ref_delta_enabled = 1;
+}
+
+// Snapshots the current frame context (cm->fc) as the default frame context.
+//
+// The snapshot lives in a dedicated slot that is not tied to any reference
+// buffer, so that cm->pre_fc can be set up correctly later. Call this ONLY
+// once cm->fc holds the default probabilities, i.e. from
+// av1_setup_past_independence() or after initializing them by hand.
+//
+// In large-scale tile mode the same context is additionally written to every
+// available reference buffer and to every buffer in the frame buffer pool.
+void av1_setup_frame_contexts(AV1_COMMON *cm) {
+  *cm->default_frame_context = *cm->fc;
+
+  // TODO(jack.haughton@argondesign.com): don't think this should be necessary,
+  // but could do with fuller testing
+  if (!cm->tiles.large_scale) return;
+
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, ref);
+    if (ref_buf == NULL) continue;
+    ref_buf->frame_context = *cm->fc;
+  }
+
+  for (int idx = 0; idx < cm->buffer_pool->num_frame_bufs; ++idx) {
+    cm->buffer_pool->frame_bufs[idx].frame_context = *cm->fc;
+  }
+}
+
+// Resets all state in 'cm' that could carry over from previously coded
+// frames: segmentation features and the current frame's segment map, the
+// ref/mode deltas, and every default probability table in cm->fc. The
+// freshly initialized context is then published through
+// av1_setup_frame_contexts().
+void av1_setup_past_independence(AV1_COMMON *cm) {
+ // Reset the segment feature data to the default stats:
+ // Features disabled, 0, with delta coding (Default state).
+ av1_clearall_segfeatures(&cm->seg);
+
+ // Zero the current frame's segment map when one is allocated. The length
+ // passed is mi_rows * mi_cols bytes.
+ // NOTE(review): assumes one byte per map entry -- confirm against the
+ // seg_map declaration.
+ if (cm->cur_frame->seg_map) {
+ memset(cm->cur_frame->seg_map, 0,
+ (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+ }
+
+ // reset mode ref deltas
+ av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+ av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+ set_default_lf_deltas(&cm->lf);
+
+ // Load the default coefficient, mode and motion-vector probabilities, then
+ // mark the context as initialized.
+ av1_default_coef_probs(cm);
+ av1_init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ cm->fc->initialized = 1;
+ // Must come last: it copies *cm->fc, which has to hold the defaults by now.
+ av1_setup_frame_contexts(cm);
+}
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
new file mode 100644
index 0000000000..09cd6bd1e9
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/filter.h"
+#include "av1/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 3
+
+#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV)
+
+// Number of possible contexts for a color index.
+// As can be seen from av1_get_palette_color_index_context(), the possible
+// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to
+// a value from 0 to 4 using 'av1_palette_color_index_context_lookup' table.
+#define PALETTE_COLOR_INDEX_CONTEXTS 5
+
+// Palette Y mode context for a block is determined by number of neighboring
+// blocks (top and/or left) using a palette for Y plane. So, possible Y mode
+// context values are:
+// 0 if neither left nor top block uses palette for Y plane,
+// 1 if exactly one of left or top block uses palette for Y plane, and
+// 2 if both left and top blocks use palette for Y plane.
+#define PALETTE_Y_MODE_CONTEXTS 3
+
+// Palette UV mode context for a block is determined by whether this block uses
+// palette for the Y plane. So, possible values are:
+// 0 if this block doesn't use palette for Y plane.
+// 1 if this block uses palette for Y plane (i.e. Y palette size > 0).
+#define PALETTE_UV_MODE_CONTEXTS 2
+
+// Map the number of pixels in a block size to a context
+// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0
+// 128(BLOCK_8X16, BLOCK_16x8) -> 1
+// ...
+// 4096(BLOCK_64X64) -> 6
+#define PALATTE_BSIZE_CTXS 7
+
+#define MAX_COLOR_CONTEXT_HASH 8
+
+#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
+
+#define KF_MODE_CONTEXTS 5
+
+struct AV1Common;
+
+typedef struct {
+ const int16_t *scan;
+ const int16_t *iscan;
+} SCAN_ORDER;
+
+typedef struct frame_contexts {
+ aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)];
+ aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)];
+ aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)];
+ aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)];
+ aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)];
+ aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)];
+ aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)];
+ aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]
+ [CDF_SIZE(3)];
+ aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(4)];
+ aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)];
+
+ aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)];
+
+ aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
+ [CDF_SIZE(INTER_COMPOUND_MODES)];
+ aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL]
+ [CDF_SIZE(MASKED_COMPOUND_TYPES)];
+ aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
+ aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
+ aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS]
+ [CDF_SIZE(INTERINTRA_MODES)];
+ aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)];
+ aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
+ nmv_context nmvc;
+ nmv_context ndvc;
+ aom_cdf_prob intrabc_cdf[CDF_SIZE(2)];
+ struct segmentation_probs seg;
+ aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)];
+ aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)];
+ aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)];
+ aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]
+ [CDF_SIZE(UV_INTRA_MODES)];
+ aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
+ aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
+ [CDF_SIZE(SWITCHABLE_FILTERS)];
+ /* kf_y_cdf is discarded after use, so does not require persistent storage.
+ However, we keep it with the other CDFs in this struct since it needs to
+ be copied to each tile to support parallelism just like the others.
+ */
+ aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]
+ [CDF_SIZE(INTRA_MODES)];
+
+ aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES]
+ [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)];
+
+ aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)];
+ aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)];
+ aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)];
+ aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)];
+ int initialized;
+} FRAME_CONTEXT;
+
+static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 },
+ { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 },
+ { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 },
+};
+
+static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 },
+};
+
+void av1_set_default_ref_deltas(int8_t *ref_deltas);
+void av1_set_default_mode_deltas(int8_t *mode_deltas);
+void av1_setup_frame_contexts(struct AV1Common *cm);
+void av1_setup_past_independence(struct AV1Common *cm);
+
+// Computes (int)ceil(log2(n)); any n below 2 yields 0.
+static INLINE int av1_ceil_log2(int n) {
+  if (n < 2) return 0;
+  const unsigned int target = (unsigned int)n;
+  int bits = 1;
+  // Invariant: pow2 == 1u << bits. Stop at the first power of two >= target.
+  for (unsigned int pow2 = 2; pow2 < target; pow2 <<= 1) {
+    ++bits;
+  }
+  return bits;
+}
+
+// Returns the context for palette color index at row 'r' and column 'c',
+// along with the 'color_order' of neighbors and the 'color_idx'.
+// The 'color_map' is a 2D array with the given 'stride'.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx);
+
+extern const int
+ av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1];
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
new file mode 100644
index 0000000000..e1e42f2f18
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/entropymv.h"
+
+static const nmv_context default_nmv_context = {
+ { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf
+ { {
+ // Vertical component (comps[0])
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // classes_cdf
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ },
+ {
+ // Horizontal component (comps[1])
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // classes_cdf
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ } },
+};
+
+void av1_init_mv_probs(AV1_COMMON *cm) {
+  // Seed both MV contexts from the same default table (CDFs included).
+  cm->fc->ndvc = default_nmv_context;
+  cm->fc->nmvc = default_nmv_context;
+}
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
new file mode 100644
index 0000000000..cddc80768c
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+void av1_init_mv_probs(struct AV1Common *cm);
+
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS 4
+enum {
+ MV_JOINT_ZERO = 0, /* Zero vector */
+ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
+ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
+ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
+} UENUM1BYTE(MV_JOINT_TYPE);
+
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES 11
+enum {
+ MV_CLASS_0 = 0, /* (0, 2] integer pel */
+ MV_CLASS_1 = 1, /* (2, 4] integer pel */
+ MV_CLASS_2 = 2, /* (4, 8] integer pel */
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */
+ MV_CLASS_4 = 4, /* (16, 32] integer pel */
+ MV_CLASS_5 = 5, /* (32, 64] integer pel */
+ MV_CLASS_6 = 6, /* (64, 128] integer pel */
+ MV_CLASS_7 = 7, /* (128, 256] integer pel */
+ MV_CLASS_8 = 8, /* (256, 512] integer pel */
+ MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024,2048] integer pel */
+} UENUM1BYTE(MV_CLASS_TYPE);
+
+#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
+#define CLASS0_SIZE (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_BITS_CONTEXTS 6
+#define MV_FP_SIZE 4
+
+#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS ((MV_MAX << 1) + 1)
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP (1 << MV_IN_USE_BITS)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
+
+typedef struct {
+ aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)];
+ aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob sign_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)];
+ aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)];
+} nmv_component;
+
+typedef struct {
+ aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)];
+ nmv_component comps[2];
+} nmv_context;
+
+enum {
+ MV_SUBPEL_NONE = -1,
+ MV_SUBPEL_LOW_PRECISION = 0,
+ MV_SUBPEL_HIGH_PRECISION,
+} SENUM1BYTE(MvSubpelPrecision);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
new file mode 100644
index 0000000000..b99a138675
--- /dev/null
+++ b/third_party/aom/av1/common/enums.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @file */
+
+/*!\cond */
+
+// Max superblock size
+#define MAX_SB_SIZE_LOG2 7
+#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
+// Pixels per Mode Info (MI) unit
+#define MI_SIZE_LOG2 2
+#define MI_SIZE (1 << MI_SIZE_LOG2)
+
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
+
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
+
+// Maximum number of tile rows and tile columns
+#define MAX_TILE_ROWS 64
+#define MAX_TILE_COLS 64
+
+#define MAX_VARTX_DEPTH 2
+
+#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
+#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2)
+
+#define MAX_PALETTE_SQUARE (64 * 64)
+// Maximum number of colors in a palette.
+#define PALETTE_MAX_SIZE 8
+// Minimum number of colors in a palette.
+#define PALETTE_MIN_SIZE 2
+
+#define FRAME_OFFSET_BITS 5
+#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
+
+// 4 frame filter levels: y plane vertical, y plane horizontal,
+// u plane, and v plane
+#define FRAME_LF_COUNT 4
+#define DEFAULT_DELTA_LF_MULTI 0
+#define MAX_MODE_LF_DELTAS 2
+
+#define DIST_PRECISION_BITS 4
+#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16
+
+#define PROFILE_BITS 3
+// The following three profiles are currently defined.
+// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only.
+// Profile 1. 8-bit and 10-bit 4:4:4
+// Profile 2. 8-bit and 10-bit 4:2:2
+// 12-bit 4:0:0, 4:2:2 and 4:4:4
+// Since we have three bits for the profiles, it can be extended later.
+enum {
+ PROFILE_0,
+ PROFILE_1,
+ PROFILE_2,
+ MAX_PROFILES,
+} SENUM1BYTE(BITSTREAM_PROFILE);
+
+#define OP_POINTS_CNT_MINUS_1_BITS 5
+#define OP_POINTS_IDC_BITS 12
+
+// Note: Some enums use the attribute 'packed' to use smallest possible integer
+// type, so that we can save memory when they are used in structs/arrays.
+
+typedef enum ATTRIBUTE_PACKED {
+ BLOCK_4X4,
+ BLOCK_4X8,
+ BLOCK_8X4,
+ BLOCK_8X8,
+ BLOCK_8X16,
+ BLOCK_16X8,
+ BLOCK_16X16,
+ BLOCK_16X32,
+ BLOCK_32X16,
+ BLOCK_32X32,
+ BLOCK_32X64,
+ BLOCK_64X32,
+ BLOCK_64X64,
+ BLOCK_64X128,
+ BLOCK_128X64,
+ BLOCK_128X128,
+ BLOCK_4X16,
+ BLOCK_16X4,
+ BLOCK_8X32,
+ BLOCK_32X8,
+ BLOCK_16X64,
+ BLOCK_64X16,
+ BLOCK_SIZES_ALL,
+ BLOCK_SIZES = BLOCK_4X16,
+ BLOCK_INVALID = 255,
+ BLOCK_LARGEST = (BLOCK_SIZES - 1)
+} BLOCK_SIZE;
+
+// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+#define SQR_BLOCK_SIZES 6
+
+// Partition types. R: Recursive
+//
+//  NONE          HORZ          VERT          SPLIT
+//  +-------+     +-------+     +---+---+     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  |       |     +-------+     |   |   |     +---+---+
+//  |       |     |       |     |   |   |     | R | R |
+//  +-------+     +-------+     +---+---+     +---+---+
+//
+//  HORZ_A        HORZ_B        VERT_A        VERT_B
+//  +---+---+     +-------+     +---+---+     +---+---+
+//  |   |   |     |       |     |   |   |     |   |   |
+//  +---+---+     +---+---+     +---+   |     |   +---+
+//  |       |     |   |   |     |   |   |     |   |   |
+//  +-------+     +---+---+     +---+---+     +---+---+
+//
+//  HORZ_4        VERT_4
+//  +-----+       +-+-+-+
+//  +-----+       | | | |
+//  +-----+       | | | |
+//  +-----+       +-+-+-+
+enum {
+ PARTITION_NONE,
+ PARTITION_HORZ,
+ PARTITION_VERT,
+ PARTITION_SPLIT,
+ PARTITION_HORZ_A, // HORZ split and the top partition is split again
+ PARTITION_HORZ_B, // HORZ split and the bottom partition is split again
+ PARTITION_VERT_A, // VERT split and the left partition is split again
+ PARTITION_VERT_B, // VERT split and the right partition is split again
+ PARTITION_HORZ_4, // 4:1 horizontal partition
+ PARTITION_VERT_4, // 4:1 vertical partition
+ EXT_PARTITION_TYPES,
+ PARTITION_TYPES = PARTITION_SPLIT + 1,
+ PARTITION_INVALID = 255
+} UENUM1BYTE(PARTITION_TYPE);
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#define PARTITION_BLOCK_SIZES 5
+#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
+
+#define TX_SIZE_LUMA_MIN (TX_4X4)
+/* We don't need to code a transform size unless the allowed size is at least
+ one more than the minimum. */
+#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1)
+
+// Maximum tx_size categories
+#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN)
+#define MAX_TX_DEPTH 2
+
+#define MAX_TX_SIZE_LOG2 (6)
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2 2
+#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Pad 4 extra columns to remove horizontal availability check.
+#define TX_PAD_HOR_LOG2 2
+#define TX_PAD_HOR 4
+// Pad 4 extra rows (0 on top and 4 on bottom) to remove vertical availability
+// check.
+#define TX_PAD_TOP 0
+#define TX_PAD_BOTTOM 4
+#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
+// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
+#define TX_PAD_END 16
+#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+// frame transform mode
+enum {
+ ONLY_4X4, // use only 4x4 transform
+ TX_MODE_LARGEST, // transform size is the largest possible for pu size
+ TX_MODE_SELECT, // transform specified for each block
+ TX_MODES,
+} UENUM1BYTE(TX_MODE);
+
+// 1D tx types
+enum {
+ DCT_1D,
+ ADST_1D,
+ FLIPADST_1D,
+ IDTX_1D,
+ TX_TYPES_1D,
+} UENUM1BYTE(TX_TYPE_1D);
+
+enum {
+ REG_REG,
+ REG_SMOOTH,
+ REG_SHARP,
+ SMOOTH_REG,
+ SMOOTH_SMOOTH,
+ SMOOTH_SHARP,
+ SHARP_REG,
+ SHARP_SMOOTH,
+ SHARP_SHARP,
+} UENUM1BYTE(DUAL_FILTER_TYPE);
+
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
+
+enum {
+ AOM_LAST_FLAG = 1 << 0,
+ AOM_LAST2_FLAG = 1 << 1,
+ AOM_LAST3_FLAG = 1 << 2,
+ AOM_GOLD_FLAG = 1 << 3,
+ AOM_BWD_FLAG = 1 << 4,
+ AOM_ALT2_FLAG = 1 << 5,
+ AOM_ALT_FLAG = 1 << 6,
+ AOM_REFFRAME_ALL = (1 << 7) - 1
+} UENUM1BYTE(AOM_REFFRAME);
+
+enum {
+ UNIDIR_COMP_REFERENCE,
+ BIDIR_COMP_REFERENCE,
+ COMP_REFERENCE_TYPES,
+} UENUM1BYTE(COMP_REFERENCE_TYPE);
+
+enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
+
+#define CFL_ALPHABET_SIZE_LOG2 4
+#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
+#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
+#define CFL_INDEX_ZERO CFL_ALPHABET_SIZE
+#define CFL_IDX_U(idx) ((idx) >> CFL_ALPHABET_SIZE_LOG2)  // high bits of idx
+#define CFL_IDX_V(idx) ((idx) & (CFL_ALPHABET_SIZE - 1))  // low bits of idx
+
+enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE);
+
+enum {
+ CFL_SIGN_ZERO,
+ CFL_SIGN_NEG,
+ CFL_SIGN_POS,
+ CFL_SIGNS
+} UENUM1BYTE(CFL_SIGN_TYPE);
+
+enum {
+ CFL_DISALLOWED,
+ CFL_ALLOWED,
+ CFL_ALLOWED_TYPES
+} UENUM1BYTE(CFL_ALLOWED_TYPE);
+
+// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
+#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
+// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8
+#define CFL_SIGN_U(js) ((((js) + 1) * 11) >> 5)
+// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8
+#define CFL_SIGN_V(js) (((js) + 1) - CFL_SIGNS * CFL_SIGN_U(js))
+
+// There is no context when the alpha for a given plane is zero.
+// So there are 2 fewer contexts than joint signs.
+#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS)
+#define CFL_CONTEXT_U(js) ((js) + 1 - CFL_SIGNS)
+// Also, the contexts are symmetric under swapping the planes.
+#define CFL_CONTEXT_V(js) \
+  (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
+
+enum {
+ PALETTE_MAP,
+ COLOR_MAP_TYPES,
+} UENUM1BYTE(COLOR_MAP_TYPE);
+
+enum {
+ TWO_COLORS,
+ THREE_COLORS,
+ FOUR_COLORS,
+ FIVE_COLORS,
+ SIX_COLORS,
+ SEVEN_COLORS,
+ EIGHT_COLORS,
+ PALETTE_SIZES
+} UENUM1BYTE(PALETTE_SIZE);
+
+enum {
+ PALETTE_COLOR_ONE,
+ PALETTE_COLOR_TWO,
+ PALETTE_COLOR_THREE,
+ PALETTE_COLOR_FOUR,
+ PALETTE_COLOR_FIVE,
+ PALETTE_COLOR_SIX,
+ PALETTE_COLOR_SEVEN,
+ PALETTE_COLOR_EIGHT,
+ PALETTE_COLORS
+} UENUM1BYTE(PALETTE_COLOR);
+
+// Note: All directional predictors must be between V_PRED and D67_PRED (both
+// inclusive).
+enum {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 degree
+ D135_PRED, // Directional 135 degree
+ D113_PRED, // Directional 113 degree
+ D157_PRED, // Directional 157 degree
+ D203_PRED, // Directional 203 degree
+ D67_PRED, // Directional 67 degree
+ SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ SMOOTH_V_PRED, // Vertical interpolation
+ SMOOTH_H_PRED, // Horizontal interpolation
+ PAETH_PRED, // Predict from the direction of smallest gradient
+ NEARESTMV,
+ NEARMV,
+ GLOBALMV,
+ NEWMV,
+ // Compound ref compound modes
+ NEAREST_NEARESTMV,
+ NEAR_NEARMV,
+ NEAREST_NEWMV,
+ NEW_NEARESTMV,
+ NEAR_NEWMV,
+ NEW_NEARMV,
+ GLOBAL_GLOBALMV,
+ NEW_NEWMV,
+ MB_MODE_COUNT,
+ PRED_MODE_INVALID = MB_MODE_COUNT,
+ INTRA_MODE_START = DC_PRED,
+ INTRA_MODE_END = NEARESTMV,
+ DIR_MODE_START = V_PRED,
+ DIR_MODE_END = D67_PRED + 1,
+ INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
+ SINGLE_INTER_MODE_START = NEARESTMV,
+ SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
+ SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START,
+ COMP_INTER_MODE_START = NEAREST_NEARESTMV,
+ COMP_INTER_MODE_END = MB_MODE_COUNT,
+ COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+ INTER_MODE_START = NEARESTMV,
+ INTER_MODE_END = MB_MODE_COUNT,
+ INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
+ INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
+} UENUM1BYTE(PREDICTION_MODE);
+
+// TODO(ltrudeau) Do we really want to pack this?
+// TODO(ltrudeau) Do we match with PREDICTION_MODE?
+enum {
+ UV_DC_PRED, // Average of above and left pixels
+ UV_V_PRED, // Vertical
+ UV_H_PRED, // Horizontal
+ UV_D45_PRED, // Directional 45 degree
+ UV_D135_PRED, // Directional 135 degree
+ UV_D113_PRED, // Directional 113 degree
+ UV_D157_PRED, // Directional 157 degree
+ UV_D203_PRED, // Directional 203 degree
+ UV_D67_PRED, // Directional 67 degree
+ UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ UV_SMOOTH_V_PRED, // Vertical interpolation
+ UV_SMOOTH_H_PRED, // Horizontal interpolation
+ UV_PAETH_PRED, // Predict from the direction of smallest gradient
+ UV_CFL_PRED, // Chroma-from-Luma
+ UV_INTRA_MODES,
+ UV_MODE_INVALID, // For uv_mode in inter blocks
+} UENUM1BYTE(UV_PREDICTION_MODE);
+
+// Number of top model rd to store for pruning y modes in intra mode decision
+#define TOP_INTRA_MODEL_COUNT 4
+// Total number of luma intra prediction modes (include both directional and
+// non-directional modes)
+// Because there are 8 directional modes, each has additional 6 delta angles.
+#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8)
+
+enum {
+ SIMPLE_TRANSLATION,
+ OBMC_CAUSAL, // 2-sided OBMC
+ WARPED_CAUSAL, // 2-sided WARPED
+ MOTION_MODES
+} UENUM1BYTE(MOTION_MODE);
+
+enum {
+ II_DC_PRED,
+ II_V_PRED,
+ II_H_PRED,
+ II_SMOOTH_PRED,
+ INTERINTRA_MODES
+} UENUM1BYTE(INTERINTRA_MODE);
+
+enum {
+ COMPOUND_AVERAGE,
+ COMPOUND_DISTWTD,
+ COMPOUND_WEDGE,
+ COMPOUND_DIFFWTD,
+ COMPOUND_TYPES,
+ MASKED_COMPOUND_TYPES = 2,
+} UENUM1BYTE(COMPOUND_TYPE);
+
+enum {
+ FILTER_DC_PRED,
+ FILTER_V_PRED,
+ FILTER_H_PRED,
+ FILTER_D157_PRED,
+ FILTER_PAETH_PRED,
+ FILTER_INTRA_MODES,
+} UENUM1BYTE(FILTER_INTRA_MODE);
+
+enum {
+ SEQ_LEVEL_2_0,
+ SEQ_LEVEL_2_1,
+ SEQ_LEVEL_2_2,
+ SEQ_LEVEL_2_3,
+ SEQ_LEVEL_3_0,
+ SEQ_LEVEL_3_1,
+ SEQ_LEVEL_3_2,
+ SEQ_LEVEL_3_3,
+ SEQ_LEVEL_4_0,
+ SEQ_LEVEL_4_1,
+ SEQ_LEVEL_4_2,
+ SEQ_LEVEL_4_3,
+ SEQ_LEVEL_5_0,
+ SEQ_LEVEL_5_1,
+ SEQ_LEVEL_5_2,
+ SEQ_LEVEL_5_3,
+ SEQ_LEVEL_6_0,
+ SEQ_LEVEL_6_1,
+ SEQ_LEVEL_6_2,
+ SEQ_LEVEL_6_3,
+ SEQ_LEVEL_7_0,
+ SEQ_LEVEL_7_1,
+ SEQ_LEVEL_7_2,
+ SEQ_LEVEL_7_3,
+ SEQ_LEVEL_8_0,
+ SEQ_LEVEL_8_1,
+ SEQ_LEVEL_8_2,
+ SEQ_LEVEL_8_3,
+ SEQ_LEVELS,
+ SEQ_LEVEL_MAX = 31,
+ SEQ_LEVEL_KEEP_STATS = 32,
+} UENUM1BYTE(AV1_LEVEL);
+
+#define LEVEL_BITS 5
+
+#define DIRECTIONAL_MODES 8
+#define MAX_ANGLE_DELTA 3
+#define ANGLE_STEP 3
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define SKIP_MODE_CONTEXTS 3
+
+#define COMP_INDEX_CONTEXTS 6
+#define COMP_GROUP_IDX_CONTEXTS 6
+
+#define NMV_CONTEXTS 3
+
+#define NEWMV_MODE_CONTEXTS 6
+#define GLOBALMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS 6
+#define DRL_MODE_CONTEXTS 3
+
+#define GLOBALMV_OFFSET 3
+#define REFMV_OFFSET 4
+
+#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
+#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define COMP_NEWMV_CTXS 5
+#define INTER_MODE_CONTEXTS 8
+
+#define DELTA_Q_SMALL 3
+#define DELTA_Q_PROBS (DELTA_Q_SMALL)
+#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4
+#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4
+#define DEFAULT_DELTA_Q_RES_DUCKY_ENCODE 4
+
+#define DELTA_LF_SMALL 3
+#define DELTA_LF_PROBS (DELTA_LF_SMALL)
+#define DEFAULT_DELTA_LF_RES 2
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define MAX_REF_MV_STACK_SIZE 8
+#define USABLE_REF_MV_STACK_SIZE 4
+#define REF_CAT_LEVEL 640
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 3
+
+#define COMP_REF_TYPE_CONTEXTS 5
+#define UNI_COMP_REF_CONTEXTS 3
+
+#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
+typedef uint8_t TXFM_CONTEXT;
+
+// An enum for single reference types (and some derived values).
+enum {
+ NONE_FRAME = -1,
+ INTRA_FRAME,
+ LAST_FRAME,
+ LAST2_FRAME,
+ LAST3_FRAME,
+ GOLDEN_FRAME,
+ BWDREF_FRAME,
+ ALTREF2_FRAME,
+ ALTREF_FRAME,
+ REF_FRAMES,
+
+ // Extra/scratch reference frame. It may be:
+ // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or
+ // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()).
+ EXTREF_FRAME = REF_FRAMES,
+
+ // Number of inter (non-intra) reference types.
+ INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1,
+
+ // Number of forward (aka past) reference types.
+ FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1,
+
+ // Number of backward (aka future) reference types.
+ BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1,
+
+ SINGLE_REFS = FWD_REFS + BWD_REFS,
+};
+
+#define REF_FRAMES_LOG2 3
+
+// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
+// The encoder uses FRAME_BUFFERS only in GOOD and REALTIME encoding modes.
+// The decoder also uses FRAME_BUFFERS.
+#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
+
+// During allintra encoding, one reference frame buffer is free to be used again
+// only after another frame buffer is stored as the reference frame. Hence, it
+// is necessary and sufficient to maintain only two reference frame buffers in
+// this case.
+#define FRAME_BUFFERS_ALLINTRA 2
+
+#define FWD_RF_OFFSET(ref) ((ref)-LAST_FRAME)    // 0-based from LAST_FRAME
+#define BWD_RF_OFFSET(ref) ((ref)-BWDREF_FRAME)  // 0-based from BWDREF_FRAME
+
+// Select all the decoded frame buffer slots
+#define SELECT_ALL_BUF_SLOTS 0xFF
+
+enum {
+ LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME }
+ LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME }
+ LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME }
+ LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME }
+ LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME }
+ LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME }
+ ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME }
+ TOTAL_UNIDIR_COMP_REFS,
+ // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+ // that are explicitly signaled.
+ UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UENUM1BYTE(UNIDIR_COMP_REF);
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signalled for
+// compound prediction. The use of skip mode, on the other hand, makes it
+// possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
+
+// Note: It includes single and compound references. So, it can take values from
+// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
+typedef int8_t MV_REFERENCE_FRAME;
+
+/*!\endcond */
+
+/*!\enum RestorationType
+ * \brief This enumeration defines various restoration types supported
+ */
+typedef enum {
+ RESTORE_NONE, /**< No restoration */
+ RESTORE_WIENER, /**< Separable Wiener restoration */
+ RESTORE_SGRPROJ, /**< Selfguided restoration */
+ RESTORE_SWITCHABLE, /**< Switchable restoration */
+ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */
+ RESTORE_TYPES = 4, /**< Num Restore types */
+} RestorationType;
+
+/*!\cond */
+// Picture prediction structures (0-13 are predefined) in scalability metadata.
+enum {
+ SCALABILITY_L1T2 = 0,
+ SCALABILITY_L1T3 = 1,
+ SCALABILITY_L2T1 = 2,
+ SCALABILITY_L2T2 = 3,
+ SCALABILITY_L2T3 = 4,
+ SCALABILITY_S2T1 = 5,
+ SCALABILITY_S2T2 = 6,
+ SCALABILITY_S2T3 = 7,
+ SCALABILITY_L2T1h = 8,
+ SCALABILITY_L2T2h = 9,
+ SCALABILITY_L2T3h = 10,
+ SCALABILITY_S2T1h = 11,
+ SCALABILITY_S2T2h = 12,
+ SCALABILITY_S2T3h = 13,
+ SCALABILITY_SS = 14
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
+
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
+
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
new file mode 100644
index 0000000000..4344aea916
--- /dev/null
+++ b/third_party/aom/av1/common/filter.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 12
+
+typedef enum ATTRIBUTE_PACKED {
+ EIGHTTAP_REGULAR,
+ EIGHTTAP_SMOOTH,
+ MULTITAP_SHARP,
+ BILINEAR,
+ // Encoder side only filters
+ MULTITAP_SHARP2,
+
+ INTERP_FILTERS_ALL,
+ SWITCHABLE_FILTERS = BILINEAR,
+ SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+ INTERP_INVALID = 0xff,
+} InterpFilter;
+
+enum {
+ USE_2_TAPS_ORIG = 0, // This is used in temporal filtering.
+ USE_2_TAPS,
+ USE_4_TAPS,
+ USE_8_TAPS,
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
+
+enum {
+ INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+ INTERP_SKIP_LUMA_EVAL_CHROMA,
+ INTERP_EVAL_INVALID,
+ INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+ INTERP_HORZ_NEQ_VERT_NEQ = 0,
+ INTERP_HORZ_EQ_VERT_NEQ,
+ INTERP_HORZ_NEQ_VERT_EQ,
+ INTERP_HORZ_EQ_VERT_EQ,
+ INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
+// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
+// we can use 16 bits for each and have more than enough space. This reduces
+// argument passing and unifies the operation of setting a (pair of) filters.
+typedef struct InterpFilters {
+ uint16_t y_filter;
+ uint16_t x_filter;
+} InterpFilters;
+
+typedef union int_interpfilters {
+ uint32_t as_int;
+ InterpFilters as_filters;
+} int_interpfilters;
+
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+                                                     int dir) {
+  if (dir) return (InterpFilter)filters.as_filters.x_filter;  // dir != 0: x
+  return (InterpFilter)filters.as_filters.y_filter;           // dir == 0: y
+}
+
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+  // Use the same |filter| for both the x and y directions of the pair.
+  int_interpfilters both;
+  both.as_filters.y_filter = both.as_filters.x_filter = filter;
+  return both;
+}
+
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+  return filter != SWITCHABLE ? filter : EIGHTTAP_REGULAR;  // SWITCHABLE -> REG
+}
+
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
+
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
+
+typedef struct InterpFilterParams {
+ const int16_t *filter_ptr; // Kernel table; |taps| coefficients per subpel row.
+ uint16_t taps; // Number of coefficients stored for each subpel position.
+ InterpFilter interp_filter; // Filter type tag recorded for this entry.
+} InterpFilterParams;
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const int16_t,
+ av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
+ { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+ { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 },
+ { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 },
+ { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 },
+ { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 },
+ { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 },
+ { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 },
+ { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 },
+ { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 },
+ { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 },
+ { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 },
+ { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 },
+ { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 },
+ { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
+ { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }
+};
+
+static const InterpFilterParams
+ av1_interp_filter_params_list[INTERP_FILTERS_ALL] = {
+ { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS,
+ MULTITAP_SHARP },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+
+ // The following filters are encoder-only and are currently used in temporal
+ // filtering, where the predictor block size is >= 16.
+ { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 },
+ };
+
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
+// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV.
+DECLARE_ALIGNED(256, static const int16_t,
+ av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+ 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+ av1_intrabc_bilinear_filter, 2, BILINEAR
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
+ { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
+ { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+ { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+ { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+ { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+ { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+ { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+ { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+static const uint16_t
+ av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = {
+ { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG),
+ (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH),
+ (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) },
+ { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP),
+ (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP),
+ (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) }
+ };
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+};
+
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+                                             const int w) {
+  const int narrow = w <= 4 && interp_filter != MULTITAP_SHARP2;  // 4-tap case
+  return narrow ? &av1_interp_4tap[interp_filter]
+                : &av1_interp_filter_params_list[interp_filter];
+}
+
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+    const InterpFilter interp_filter, int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+  // 2-tap search always uses the bilinear kernels, whatever |interp_filter| is.
+  if (subpel_search == USE_2_TAPS) return av1_interp_4tap[BILINEAR].filter_ptr;
+  if (subpel_search == USE_4_TAPS)
+    return av1_interp_4tap[interp_filter].filter_ptr;
+  return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+    const InterpFilterParams *const filter_params, const int subpel) {
+  return &filter_params->filter_ptr[filter_params->taps * subpel];  // row start
+}
+
+static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+  // Each supported search precision maps to one fixed filter parameter set.
+  if (subpel_search == USE_2_TAPS) return &av1_interp_4tap[BILINEAR];
+  if (subpel_search == USE_4_TAPS) return &av1_interp_4tap[EIGHTTAP_REGULAR];
+  if (subpel_search == USE_8_TAPS)
+    return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
+  assert(0);  // Unsupported |subpel_search| value.
+  return NULL;
+}
+
+static INLINE void reset_interp_filter_allowed_mask(
+    uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  const uint16_t keep = (~(1 << filt_type)) & 0xffff;  // all bits but filt_type
+  *allow_interp_mask &= (keep & ALLOW_ALL_INTERP_FILT_MASK);
+}
+
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+                                                  DUAL_FILTER_TYPE filt_type) {
+  *allow_interp_mask = (uint16_t)(*allow_interp_mask | (1 << filt_type));
+}
+
+static INLINE uint8_t get_interp_filter_allowed_mask(
+    uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  return (uint8_t)((allow_interp_mask >> filt_type) & 1);  // 1 if bit is set
+}
+
+static AOM_INLINE int get_filter_tap(
+    const InterpFilterParams *const filter_params, int subpel_qn) {
+  // Reports how many taps the kernel selected by |subpel_qn| effectively has.
+  // A 12-tap filter always reports 12. Otherwise coefficients 0..7 of the
+  // kernel are examined from the outermost pair inwards, and the first pair
+  // with a nonzero coefficient decides between 8, 6 and 4. If all of those
+  // pairs are zero, the kernel is treated as a 2-tap filter.
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_qn & SUBPEL_MASK);
+
+  if (filter_params->taps == 12) return 12;
+
+  // Coefficient pairs (0,7), (1,6) and (2,5), from the outside in.
+  if (kernel[0] | kernel[7]) return 8;
+  if (kernel[1] | kernel[6]) return 6;
+  if (kernel[2] | kernel[5]) return 4;
+  return 2;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
new file mode 100644
index 0000000000..f10ccd5942
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+
+  assert(list != NULL);
+  // Release whatever |list| currently holds before allocating a fresh array.
+  av1_free_internal_frame_buffers(list);
+
+  // Allocate one InternalFrameBuffer entry per reference and work buffer.
+  // The buffer count is only recorded when the allocation succeeded.
+  list->int_fb =
+      (InternalFrameBuffer *)aom_calloc(num_buffers, sizeof(*list->int_fb));
+  list->num_internal_frame_buffers = (list->int_fb != NULL) ? num_buffers : 0;
+  return list->int_fb == NULL;  // 0 on success, 1 if the allocation failed.
+}
+
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  // Free the data of every entry first, then the array of entries itself.
+  for (int idx = 0; idx < list->num_internal_frame_buffers; ++idx) {
+    InternalFrameBuffer *const entry = &list->int_fb[idx];
+    aom_free(entry->data);
+    entry->data = NULL;
+  }
+  aom_free(list->int_fb);
+  // Leave |list| in the empty state: no array and a buffer count of zero.
+  list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
+}
+
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+
+  for (int idx = 0; idx < list->num_internal_frame_buffers; ++idx) {
+    InternalFrameBuffer *const entry = &list->int_fb[idx];
+    // Buffers that are in use, or that have no data allocated, are left alone.
+    if (entry->data == NULL || entry->in_use) continue;
+    memset(entry->data, 0, entry->size);
+  }
+}
+
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  InternalFrameBuffer *slot = NULL;
+  if (int_fb_list == NULL) return -1;
+
+  // Pick the first internal frame buffer that is not currently in use.
+  for (int i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use) {
+      slot = &int_fb_list->int_fb[i];
+      break;
+    }
+  }
+  if (slot == NULL) return -1;  // No free buffer.
+  if (slot->size < min_size) {
+    aom_free(slot->data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // caused by accessing uninitialized memory in the frame border. This
+    // could be skipped if the border were removed entirely.
+    slot->data = (uint8_t *)aom_calloc(1, min_size);
+    if (slot->data == NULL) {
+      slot->size = 0;
+      return -1;
+    }
+    slot->size = min_size;
+  }
+
+  // Hand the buffer out; |priv| lets the release callback find this entry.
+  fb->data = slot->data;
+  fb->size = slot->size;
+  fb->priv = slot;
+  slot->in_use = 1;
+  return 0;
+}
+
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+  InternalFrameBuffer *const entry = (InternalFrameBuffer *)fb->priv;
+  (void)cb_priv;  // Unused: the entry is reached through |fb->priv| instead.
+  if (entry != NULL) entry->in_use = 0;  // Make the buffer available again.
+  return 0;
+}
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
new file mode 100644
index 0000000000..16188e51c7
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+ uint8_t *data; // Allocated storage handed out through the callbacks.
+ size_t size; // Size of |data| in bytes; 0 when nothing is allocated.
+ int in_use; // Nonzero while the buffer is handed out and not yet released.
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+ int num_internal_frame_buffers; // Number of entries in |int_fb|.
+ InternalFrameBuffer *int_fb; // Array of entries; NULL when the list is empty.
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+// Callback used by libaom when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
new file mode 100644
index 0000000000..bff438f3c6
--- /dev/null
+++ b/third_party/aom/av1/common/idct.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+  const int num_pels = tx_size_2d[tx_size];  // At most 4096 (64x64).
+  if (num_pels > 1024) return 2;
+  return num_pels > 256 ? 1 : 0;
+}
+
+// NOTE: The implementations of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
+// idct
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob <= 1)  // eob <= 1 takes the "_1" variant, anything larger "_16".
+    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
+  else
+    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  if (txfm_param->lossless) {
+    // Lossless blocks go through av1_highbd_iwht4x4_add() and must be DCT_DCT.
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, txfm_param->eob, bd);
+    return;
+  }
+
+  av1_inv_txfm2d_add_4x4_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 4x8 C inverse 2D transform-and-add routine.
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  av1_inv_txfm2d_add_4x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 8x4 C inverse 2D transform-and-add routine.
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  av1_inv_txfm2d_add_8x4_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 16x32 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_16x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 32x16 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_32x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 16x4 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_16x4_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 4x16 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_4x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 32x8 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_32x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 8x32 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_8x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 32x64 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_32x64_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 64x32 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_64x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 16x64 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_16x64_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 64x16 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_64x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  // The transform type and bit depth both come from |txfm_param|.
+  const int32_t *const coeffs = cast_to_int32(input);
+
+  av1_inv_txfm2d_add_8x8_c(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 16x16 C inverse 2D transform-and-add routine.
+  // The transform type and bit depth both come from |txfm_param|.
+  const int32_t *const coeffs = cast_to_int32(input);
+
+  av1_inv_txfm2d_add_16x16_c(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 8x16 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_8x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 16x8 C inverse 2D transform-and-add routine.
+  av1_inv_txfm2d_add_16x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  // Thin wrapper around the 32x32 C inverse 2D transform-and-add routine.
+  // The transform type and bit depth both come from |txfm_param|.
+  const int32_t *const coeffs = cast_to_int32(input);
+
+  av1_inv_txfm2d_add_32x32_c(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *const coeffs = cast_to_int32(input);
+  // Only DCT_DCT is expected for 64x64 transforms.
+  assert(txfm_param->tx_type == DCT_DCT);
+
+  av1_inv_txfm2d_add_64x64_c(coeffs, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
+}
+
+static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                            TX_TYPE tx_type, int eob, int reduced_tx_set,
+                            TxfmParam *txfm_param) {
+  (void)plane;  // Unused; the caller's arguments are copied in first.
+  txfm_param->tx_size = tx_size;
+  txfm_param->tx_type = tx_type;
+  txfm_param->eob = eob;
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);  // The rest is derived from |xd|.
+  txfm_param->bd = xd->bd;
+  txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+}
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) { // Forward to the C routine for this transform size.
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X64:
+ av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param);
+ break;
+ case TX_64X16:
+ av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ // This is like av1_short_idct4x4 but has a special case around eob <= 1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                        const TxfmParam *txfm_param) {
+  // The low-bitdepth path reuses the high-bitdepth transform: the 8-bit block
+  // in |dst| is widened into a 16-bit scratch block, transformed there, and
+  // the result is narrowed back into |dst|.
+  DECLARE_ALIGNED(32, uint16_t, scratch[MAX_TX_SQUARE]);
+  const int scratch_stride = MAX_TX_SIZE;
+  const int num_cols = tx_size_wide[txfm_param->tx_size];
+  const int num_rows = tx_size_high[txfm_param->tx_size];
+  for (int r = 0; r < num_rows; ++r) {
+    uint16_t *const wide = scratch + r * scratch_stride;
+    for (int c = 0; c < num_cols; ++c) wide[c] = dst[r * stride + c];
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(scratch), scratch_stride,
+                          txfm_param);
+
+  for (int r = 0; r < num_rows; ++r) {
+    const uint16_t *const wide = scratch + r * scratch_stride;
+    for (int c = 0; c < num_cols; ++c) dst[r * stride + c] = (uint8_t)wide[c];
+  }
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set) {
+  if (eob == 0) return;  // No coefficients: |dst| is left untouched.
+
+  assert(eob <= av1_get_max_eob(tx_size));
+
+  TxfmParam params;
+  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, &params);
+  assert(av1_ext_tx_used[params.tx_set_type][params.tx_type]);
+
+  // Choose the low- or high-bitdepth inverse transform for the current buffer.
+  if (!params.is_hbd) {
+    av1_inv_txfm_add(dqcoeff, dst, stride, &params);
+  } else {
+    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &params);
+  }
+}
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
new file mode 100644
index 0000000000..004d25d49a
--- /dev/null
+++ b/third_party/aom/av1/common/idct.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/txfm_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+ transform_1d cols, rows; // vertical and horizontal
+} transform_2d;
+
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, int plane,
+ TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+ int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+  assert(sizeof(tran_low_t) == sizeof(int32_t));  // The cast needs equal sizes.
+  return (const int32_t *)input;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
new file mode 100644
index 0000000000..6828834e05
--- /dev/null
+++ b/third_party/aom/av1/common/mv.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
+
+#include <stdlib.h>
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_MV 0x80008000
+#define INVALID_MV_ROW_COL -32768
+#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3)
+#define GET_MV_SUBPEL(x) ((x)*8)
+
+#define MARK_MV_INVALID(mv) \
+ do { \
+ ((int_mv *)(mv))->as_int = INVALID_MV; \
+ } while (0)
+#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col))
+
+// The motion vector in units of full pixel
+typedef struct fullpel_mv {
+ int16_t row;
+ int16_t col;
+} FULLPEL_MV;
+
+// The motion vector in units of 1/8-pel
+typedef struct mv {
+ int16_t row;
+ int16_t col;
+} MV;
+
+static const MV kZeroMv = { 0, 0 };
+static const FULLPEL_MV kZeroFullMv = { 0, 0 };
+
+typedef union int_mv {
+ uint32_t as_int;
+ MV as_mv;
+ FULLPEL_MV as_fullmv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+typedef struct mv32 {
+ int32_t row;
+ int32_t col;
+} MV32;
+
+// The mv limit for fullpel mvs
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} FullMvLimits;
+
+// The mv limit for subpel mvs
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} SubpelMvLimits;
+
+// Converts a 1/8-pel motion vector to full-pel units by applying
+// GET_MV_RAWPEL to each component.
+static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) {
+  FULLPEL_MV result;
+  result.row = (int16_t)GET_MV_RAWPEL(subpel_mv->row);
+  result.col = (int16_t)GET_MV_RAWPEL(subpel_mv->col);
+  return result;
+}
+
+// Converts a full-pel motion vector to 1/8-pel units by applying
+// GET_MV_SUBPEL (multiply by 8) to each component.
+static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) {
+  MV result;
+  result.row = (int16_t)GET_MV_SUBPEL(full_mv->row);
+  result.col = (int16_t)GET_MV_SUBPEL(full_mv->col);
+  return result;
+}
+
+// Rewrites *mv in place: the value currently held as a full-pel vector is
+// replaced by its 1/8-pel equivalent.
+static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) {
+  const FULLPEL_MV full = mv->as_fullmv;
+  mv->as_mv = get_mv_from_fullmv(&full);
+}
+
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS 16
+
+#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
+#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS 6
+#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
+
+#define WARP_PARAM_REDUCE_BITS 6
+
+#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+typedef struct {
+ int global_warp_allowed;
+ int local_warp_allowed;
+} WarpTypesAllowed;
+
+// The order of values in the wmmat matrix below is best described
+// by the affine transformation:
+// [x' (m2 m3 m0 [x
+// z . y' = m4 m5 m1 * y
+// 1] 0 0 1) 1]
+typedef struct {
+ int32_t wmmat[MAX_PARAMDIM];
+ int16_t alpha, beta, gamma, delta;
+ TransformationType wmtype;
+ int8_t invalid;
+} WarpedMotionParams;
+
+/* clang-format off */
+static const WarpedMotionParams default_warp_params = {
+ { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS) },
+ 0, 0, 0, 0,
+ IDENTITY,
+ 0,
+};
+/* clang-format on */
+
+// The following constants describe the various precisions
+// of different parameters in the global motion experiment.
+//
+// Given the general homography:
+// [x' (a b c [x
+// z . y' = d e f * y
+// 1] g h i) 1]
+//
+// Constants using the name ALPHA here are related to parameters
+// a, b, d, e. Constants using the name TRANS are related
+// to parameters c and f.
+//
+// Anything ending in PREC_BITS is the number of bits of precision
+// to maintain when converting from double to integer.
+//
+// The ABS parameters are used to create an upper and lower bound
+// for each parameter. In other words, after a parameter is integerized
+// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS).
+//
+// XXX_PREC_DIFF and XXX_DECODE_FACTOR
+// are computed once here to prevent repetitive
+// computation on the decoder side. These are
+// to allow the global motion parameters to be encoded in a lower
+// precision than the warped model precision. This means that they
+// need to be changed to warped precision when they are decoded.
+//
+// XX_MIN, XX_MAX are also computed to avoid repeated computation
+
+#define SUBEXPFIN_K 3
+#define GM_TRANS_PREC_BITS 6
+#define GM_ABS_TRANS_BITS 12
+#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
+#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
+#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
+#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
+#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
+
+#define GM_ALPHA_PREC_BITS 15
+#define GM_ABS_ALPHA_BITS 12
+#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
+#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
+
+#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS)
+#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
+
+#define GM_TRANS_MIN -GM_TRANS_MAX
+#define GM_ALPHA_MIN -GM_ALPHA_MAX
+
+// Returns the horizontal position used as the centre of a block of size bs
+// whose left edge is at mode-info column mi_col: the left edge in MI_SIZE
+// units plus half the block width, minus one.
+static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
+  const int half_width = block_size_wide[bs] / 2;
+  return mi_col * MI_SIZE + half_width - 1;
+}
+
+// Returns the vertical position used as the centre of a block of size bs
+// whose top edge is at mode-info row mi_row: the top edge in MI_SIZE units
+// plus half the block height, minus one.
+static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
+  const int half_height = block_size_high[bs] / 2;
+  return mi_row * MI_SIZE + half_height - 1;
+}
+
+// Reduces a value with WARPEDMODEL_PREC_BITS fractional bits to three
+// fractional bits. Without allow_hp the value is rounded to two fractional
+// bits and then doubled, so the lowest bit of the result is zero.
+static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
+  if (!allow_hp) {
+    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
+  }
+  return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
+}
+// Rounds both components of *mv to the nearest multiple of 8. A remainder
+// of exactly +/-4 is rounded toward zero, since only |remainder| > 4 rounds
+// away from zero.
+static INLINE void integer_mv_precision(MV *mv) {
+  int16_t *const components[2] = { &mv->row, &mv->col };
+
+  for (int i = 0; i < 2; ++i) {
+    const int remainder = *components[i] % 8;
+    if (remainder == 0) continue;
+
+    // Drop the fractional part first (truncation toward zero) ...
+    int rounded = *components[i] - remainder;
+    // ... then step one unit away from zero if more than half was dropped.
+    if (abs(remainder) > 4) rounded += (remainder > 0) ? 8 : -8;
+    *components[i] = (int16_t)rounded;
+  }
+}
+// Convert a global motion vector into a motion vector at the centre of the
+// given block.
+//
+// The resulting motion vector will have three fractional bits of precision. If
+// allow_hp is zero, the bottom bit will always be zero. If is_integer is
+// nonzero, the result is additionally passed through integer_mv_precision(),
+// so the bottom three bits will be zero (the motion vector represents an
+// integer).
+//
+// For an IDENTITY model the zero vector is returned. For a TRANSLATION model
+// the block position (bsize, mi_col, mi_row) is not used.
+static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
+ int allow_hp, BLOCK_SIZE bsize,
+ int mi_col, int mi_row,
+ int is_integer) {
+ int_mv res;
+
+ if (gm->wmtype == IDENTITY) {
+ res.as_int = 0;
+ return res;
+ }
+
+ const int32_t *mat = gm->wmmat;
+ int x, y, tx, ty;
+
+ if (gm->wmtype == TRANSLATION) {
+ // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
+ // bits of fractional precision. The offset for a translation is stored in
+ // entries 0 and 1. For translations, all but the top three (two if
+ // cm->features.allow_high_precision_mv is false) fractional bits are always
+ // zero.
+ //
+ // After the right shifts, there are 3 fractional bits of precision. If
+ // allow_hp is false, the bottom bit is always zero (so we don't need a
+ // call to convert_to_trans_prec here)
+ //
+ // Note: There is an AV1 specification bug here:
+ //
+ // gm->wmmat[0] is supposed to be the horizontal translation, and so should
+ // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical
+ // translation and so should go into res.as_mv.row
+ //
+ // However, in the spec, these assignments are accidentally reversed, and so
+ // we must keep this incorrect logic to match the spec.
+ //
+ // See also: https://crbug.com/aomedia/3328
+ res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
+ res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
+ assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+ }
+
+ // Remaining model types: evaluate the model at the block centre.
+ x = block_center_x(mi_col, bsize);
+ y = block_center_y(mi_row, bsize);
+
+ if (gm->wmtype == ROTZOOM) {
+ assert(gm->wmmat[5] == gm->wmmat[2]);
+ assert(gm->wmmat[4] == -gm->wmmat[3]);
+ }
+
+ // Displacement of the centre under the model. The identity part
+ // (1 << WARPEDMODEL_PREC_BITS on the diagonal entries mat[2] and mat[5]) is
+ // subtracted so that only the motion relative to (x, y) remains.
+ const int xc =
+ (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
+ const int yc =
+ mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
+ tx = convert_to_trans_prec(allow_hp, xc);
+ ty = convert_to_trans_prec(allow_hp, yc);
+
+ res.as_mv.row = ty;
+ res.as_mv.col = tx;
+
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+}
+
+// Classifies a warp model from its matrix entries alone:
+//  - unit diagonal (wmmat[2], wmmat[5]) and zero off-diagonal (wmmat[3],
+//    wmmat[4]): IDENTITY if the offsets wmmat[0..1] are also zero, otherwise
+//    TRANSLATION;
+//  - wmmat[2] == wmmat[5] and wmmat[3] == -wmmat[4]: ROTZOOM;
+//  - anything else: AFFINE.
+static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) {
+  const int32_t *const m = gm->wmmat;
+  const int32_t unity = (1 << WARPEDMODEL_PREC_BITS);
+  const int no_linear_part =
+      m[5] == unity && !m[4] && m[2] == unity && !m[3];
+
+  if (no_linear_part) {
+    return (!m[1] && !m[0]) ? IDENTITY : TRANSLATION;
+  }
+  if (m[2] == m[5] && m[3] == -m[4]) return ROTZOOM;
+  return AFFINE;
+}
+
+typedef struct candidate_mv {
+ int_mv this_mv;
+ int_mv comp_mv;
+} CANDIDATE_MV;
+
+// Returns nonzero iff both components of *mv are zero.
+//
+// The fields are compared directly instead of reading the struct through a
+// uint32_t pointer: MV consists of two int16_t members, so such a read
+// violates the C effective-type (strict aliasing) rules and assumes a 4-byte
+// alignment that the struct's members do not require.
+static INLINE int is_zero_mv(const MV *mv) {
+  return mv->row == 0 && mv->col == 0;
+}
+
+// Returns nonzero iff *a and *b have identical row and column components.
+//
+// The fields are compared directly (as CHECK_MV_EQUAL does) instead of
+// reading both structs through uint32_t pointers: MV consists of two int16_t
+// members, so such reads violate the C effective-type (strict aliasing) rules
+// and assume a 4-byte alignment that the struct's members do not require.
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return a->row == b->row && a->col == b->col;
+}
+
+// Clamps each component of the 1/8-pel vector *mv, in place, to the
+// inclusive range given for it in *mv_limits.
+static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) {
+  const int16_t clamped_col =
+      (int16_t)clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+  const int16_t clamped_row =
+      (int16_t)clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
+  mv->col = clamped_col;
+  mv->row = clamped_row;
+}
+
+// Clamps each component of the full-pel vector *mv, in place, to the
+// inclusive range given for it in *mv_limits.
+static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) {
+  const int16_t clamped_col =
+      (int16_t)clamp(mv->col, mv_limits->col_min, mv_limits->col_max);
+  const int16_t clamped_row =
+      (int16_t)clamp(mv->row, mv_limits->row_min, mv_limits->row_max);
+  mv->col = clamped_col;
+  mv->row = clamped_row;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
new file mode 100644
index 0000000000..d8889f3eb3
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "av1/common/mvref_common.h"
+#include "av1/common/warped_motion.h"
+
+// Fixed-point reciprocals: div_mult[d] == (1 << 14) / d (integer division)
+// for d in [1, 31]; entry 0 is a placeholder. Although we assign 32 bit
+// integers, the largest value is 1 << 14 (16384), so every entry fits in 15
+// bits. The table is only ever read, hence const.
+static const int div_mult[32] = { 0,    16384, 8192, 5461, 4096, 3276, 2730,
+                                  2340, 2048,  1820, 1638, 1489, 1365, 1260,
+                                  1170, 1092,  1024, 963,  910,  862,  819,
+                                  780,  744,   712,  682,  655,  630,  606,
+                                  585,  564,   546,  528 };
+
+// Scales the motion vector ref by the ratio num / den and stores the result,
+// clamped to [MV_LOW + 1, MV_UPP - 1], in *output. The division is performed
+// as a multiplication by div_mult[den] followed by a rounded 14-bit shift.
+// Beforehand den is capped at MAX_FRAME_DISTANCE and num is limited to
+// [-MAX_FRAME_DISTANCE, MAX_FRAME_DISTANCE].
+// TODO(jingning): Consider the use of lookup table for (num / den)
+// altogether.
+static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) {
+  const int mv_lo = MV_LOW + 1;
+  const int mv_hi = MV_UPP - 1;
+
+  den = AOMMIN(den, MAX_FRAME_DISTANCE);
+  if (num > 0) {
+    num = AOMMIN(num, MAX_FRAME_DISTANCE);
+  } else {
+    num = AOMMAX(num, -MAX_FRAME_DISTANCE);
+  }
+
+  // Shared by both components: numerator times the reciprocal of den.
+  const int scale = num * div_mult[den];
+  const int projected_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * scale, 14);
+  const int projected_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * scale, 14);
+
+  output->row = (int16_t)clamp(projected_row, mv_lo, mv_hi);
+  output->col = (int16_t)clamp(projected_col, mv_lo, mv_hi);
+}
+
+// Stores the motion information of block mi into the current frame's MV
+// buffer (cm->cur_frame->mvs), which is addressed at half the mode-info
+// resolution in both directions.
+//
+// Every covered entry is first reset to NONE_FRAME with a zero MV. It is then
+// overwritten by each of the block's (up to two) references that
+//   - is an inter reference (> INTRA_FRAME),
+//   - has a zero cm->ref_frame_side[] entry, and
+//   - has both MV components within +/-REFMVS_LIMIT,
+// so the last qualifying reference wins.
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis) {
+  const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
+  const int num_cols = ROUND_POWER_OF_TWO(x_mis, 1);
+  const int num_rows = ROUND_POWER_OF_TWO(y_mis, 1);
+  MV_REF *row_start =
+      cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
+
+  for (int r = 0; r < num_rows; ++r, row_start += frame_mvs_stride) {
+    for (int c = 0; c < num_cols; ++c) {
+      MV_REF *const entry = &row_start[c];
+      entry->ref_frame = NONE_FRAME;
+      entry->mv.as_int = 0;
+
+      for (int idx = 0; idx < 2; ++idx) {
+        const MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
+        if (ref_frame <= INTRA_FRAME) continue;
+
+        const int8_t ref_side = cm->ref_frame_side[ref_frame];
+        if (ref_side) continue;
+
+        const int row_too_large = abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT;
+        const int col_too_large = abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT;
+        if (row_too_large || col_too_large) continue;
+
+        entry->ref_frame = ref_frame;
+        entry->mv.as_int = mi->mv[idx].as_int;
+      }
+    }
+  }
+}
+
+// Merges the motion vector(s) of one neighbouring block into the reference MV
+// stack for the reference frame pair rf.
+//
+// Nothing happens if the candidate is not an inter block. A candidate MV that
+// is already on the stack has `weight` added to its entry; otherwise it is
+// appended with that weight, provided the stack holds fewer than
+// MAX_REF_MV_STACK_SIZE entries. For every matching reference,
+// *ref_match_count is incremented, and *newmv_count is incremented too when
+// the candidate's mode satisfies have_newmv_in_inter_mode().
+//
+// When is_global_mv_block() reports the candidate as a global-motion block,
+// the corresponding entry of gm_mv_candidates is used instead of the
+// candidate's own MV.
+static AOM_INLINE void add_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
+ uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
+ CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+ int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params,
+ uint16_t weight) {
+ if (!is_inter_block(candidate)) return;
+ assert(weight % 2 == 0);
+ int index, ref;
+
+ if (rf[1] == NONE_FRAME) {
+ // single reference frame
+ // Either of the candidate's two references may match rf[0].
+ for (ref = 0; ref < 2; ++ref) {
+ if (candidate->ref_frame[ref] == rf[0]) {
+ const int is_gm_block =
+ is_global_mv_block(candidate, gm_params[rf[0]].wmtype);
+ const int_mv this_refmv =
+ is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref);
+ // Look for an identical MV already on the stack.
+ for (index = 0; index < *refmv_count; ++index) {
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) {
+ ref_mv_weight[index] += weight;
+ break;
+ }
+ }
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv;
+ ref_mv_weight[index] = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+ } else {
+ // compound reference frame
+ // Both references must match, in the same order.
+ if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
+ int_mv this_refmv[2];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
+ this_refmv[ref] = gm_mv_candidates[ref];
+ else
+ this_refmv[ref] = get_block_mv(candidate, ref);
+ }
+
+ // Look for an identical MV pair already on the stack.
+ for (index = 0; index < *refmv_count; ++index) {
+ if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
+ (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) {
+ ref_mv_weight[index] += weight;
+ break;
+ }
+ }
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv[0];
+ ref_mv_stack[index].comp_mv = this_refmv[1];
+ ref_mv_weight[index] = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+}
+
+// Walks along the row of mode-info entries located row_offset rows from the
+// current block (negative offsets are used by the visible callers) and passes
+// each visited neighbour to add_ref_mv_candidate().
+//
+// The walk covers at most xd->width columns, limited to the columns remaining
+// in the frame (cm->mi_params.mi_cols - mi_col) and to the width of a 64x64
+// block. Each step advances by `len` columns and the candidate is added with
+// weight len * weight. *processed_rows may be updated as a side effect (see
+// below).
+static AOM_INLINE void scan_row_mbmi(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack,
+ uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+ uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset,
+ int *processed_rows) {
+ int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+ end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
+ const int width_8x8 = mi_size_wide[BLOCK_8X8];
+ const int width_16x16 = mi_size_wide[BLOCK_16X16];
+ int col_offset = 0;
+ // TODO(jingning): Revisit this part after cb4x4 is stable.
+ // For rows further away than the adjacent one, sampling starts one column
+ // to the right, except for blocks narrower than 8x8 at an odd mi_col.
+ if (abs(row_offset) > 1) {
+ col_offset = 1;
+ if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset;
+ }
+ const int use_step_16 = (xd->width >= 16);
+ MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
+
+ // Note: i is advanced by len at the bottom of the body, not in the for
+ // header.
+ for (int i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
+ const int candidate_bsize = candidate->bsize;
+ const int n4_w = mi_size_wide[candidate_bsize];
+ // Step width: the overlap between the current block and the candidate,
+ // raised to a minimum of a 16x16 width (wide blocks) or an 8x8 width
+ // (non-adjacent rows).
+ int len = AOMMIN(xd->width, n4_w);
+ if (use_step_16)
+ len = AOMMAX(width_16x16, len);
+ else if (abs(row_offset) > 1)
+ len = AOMMAX(len, width_8x8);
+
+ uint16_t weight = 2;
+ // Only when the candidate is at least as wide as the current block (and
+ // the current block is at least 8x8 wide) is the weight raised according
+ // to the candidate's height.
+ if (xd->width >= width_8x8 && xd->width <= n4_w) {
+ uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1,
+ mi_size_high[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed rows.
+ *processed_rows = inc - row_offset - 1;
+ }
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, ref_mv_weight,
+ gm_mv_candidates, cm->global_motion, len * weight);
+
+ i += len;
+ }
+}
+
+// Walks down the column of mode-info entries located col_offset columns from
+// the current block (negative offsets are used by the visible callers) and
+// passes each visited neighbour to add_ref_mv_candidate().
+//
+// The walk covers at most xd->height rows, limited to the rows remaining in
+// the frame (cm->mi_params.mi_rows - mi_row) and to the height of a 64x64
+// block. Each step advances by `len` rows and the candidate is added with
+// weight len * weight. *processed_cols may be updated as a side effect (see
+// below). This is the column counterpart of scan_row_mbmi().
+static AOM_INLINE void scan_col_mbmi(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row,
+ const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack,
+ uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count,
+ uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset,
+ int *processed_cols) {
+ int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+ end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
+ const int n8_h_8 = mi_size_high[BLOCK_8X8];
+ const int n8_h_16 = mi_size_high[BLOCK_16X16];
+ int i;
+ int row_offset = 0;
+ // For columns further away than the adjacent one, sampling starts one row
+ // lower, except for blocks shorter than 8x8 at an odd mi_row.
+ if (abs(col_offset) > 1) {
+ row_offset = 1;
+ if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset;
+ }
+ const int use_step_16 = (xd->height >= 16);
+
+ // Note: i is advanced by len at the bottom of the body, not in the for
+ // header.
+ for (i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate =
+ xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
+ const int candidate_bsize = candidate->bsize;
+ const int n4_h = mi_size_high[candidate_bsize];
+ // Step height: the overlap between the current block and the candidate,
+ // raised to a minimum of a 16x16 height (tall blocks) or an 8x8 height
+ // (non-adjacent columns).
+ int len = AOMMIN(xd->height, n4_h);
+ if (use_step_16)
+ len = AOMMAX(n8_h_16, len);
+ else if (abs(col_offset) > 1)
+ len = AOMMAX(len, n8_h_8);
+
+ int weight = 2;
+ // Only when the candidate is at least as tall as the current block (and
+ // the current block is at least 8x8 tall) is the weight raised according
+ // to the candidate's width.
+ if (xd->height >= n8_h_8 && xd->height <= n4_h) {
+ int inc = AOMMIN(-max_col_offset + col_offset + 1,
+ mi_size_wide[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed cols.
+ *processed_cols = inc - col_offset - 1;
+ }
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, ref_mv_weight,
+ gm_mv_candidates, cm->global_motion, len * weight);
+
+ i += len;
+ }
+}
+
+// Analyzes the motion information of the single block found at
+// (row_offset, col_offset) relative to the current block. If that position
+// lies inside the current tile, the block is passed to
+// add_ref_mv_candidate() with a weight of twice the 8x8 block width;
+// otherwise nothing happens.
+static AOM_INLINE void scan_blk_mbmi(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row,
+    const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset,
+    int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight,
+    uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates,
+    uint8_t *refmv_count) {
+  POSITION mi_pos;
+  mi_pos.row = row_offset;
+  mi_pos.col = col_offset;
+
+  if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return;
+
+  const int len = mi_size_wide[BLOCK_8X8];
+  const MB_MODE_INFO *const candidate =
+      xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+
+  add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+                       newmv_count, ref_mv_stack, ref_mv_weight,
+                       gm_mv_candidates, cm->global_motion, 2 * len);
+}
+
+// Returns nonzero if the block to the top right of the current block is
+// considered available, i.e. (per the comments below) already decoded.
+//
+// bs is the block size measure supplied by the caller, in the same units as
+// mi_size_wide[]; it must be a positive power of two (asserted below). Blocks
+// larger than a 64x64 block never have a top right. The initial answer is
+// derived from the block's position inside its superblock and is then
+// overridden for rectangular partitions and for PARTITION_VERT_A.
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, int bs) {
+ const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
+ // Position of the block within its superblock.
+ const int mask_row = mi_row & (sb_mi_size - 1);
+ const int mask_col = mi_col & (sb_mi_size - 1);
+
+ if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
+ // In a split partition all apart from the bottom right has a top right
+ int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+ // bs > 0 and bs is a power of 2
+ assert(bs > 0 && !(bs & (bs - 1)));
+
+ // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+ // to the right have not been decoded therefore the bottom right does
+ // not have a top right
+ // Note: bs is doubled on each iteration, and the enlarged value is what the
+ // PARTITION_VERT_A check further down sees.
+ while (bs < sb_mi_size) {
+ if (mask_col & bs) {
+ if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+ has_tr = 0;
+ break;
+ }
+ } else {
+ break;
+ }
+ bs <<= 1;
+ }
+
+ // In a VERTICAL or VERTICAL_4 partition, all partition before the last one
+ // always have a top right (as the block above will have been decoded).
+ if (xd->width < xd->height) {
+ if (!xd->is_last_vertical_rect) has_tr = 1;
+ }
+
+ // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one
+ // never have a top right (as the block to the right won't have been decoded).
+ if (xd->width > xd->height) {
+ if (!xd->is_first_horizontal_rect) has_tr = 0;
+ }
+
+ // The bottom left square of a Vertical A (in the old format) does
+ // not have a top right as it is decoded before the right hand
+ // rectangle of the partition
+ if (xd->mi[0]->partition == PARTITION_VERT_A) {
+ if (xd->width == xd->height)
+ if (mask_row & bs) has_tr = 0;
+ }
+
+ return has_tr;
+}
+
+// Returns 1 if the position (mi_row + row_offset, mi_col + col_offset) lies
+// in the same 64x64-aligned region as (mi_row, mi_col), and 0 if either
+// offset leaves that region.
+static int check_sb_border(const int mi_row, const int mi_col,
+                           const int row_offset, const int col_offset) {
+  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
+  const int sb_mask = sb_mi_size - 1;
+  const int target_row = (mi_row & sb_mask) + row_offset;
+  const int target_col = (mi_col & sb_mask) + col_offset;
+
+  const int row_inside = target_row >= 0 && target_row < sb_mi_size;
+  const int col_inside = target_col >= 0 && target_col < sb_mi_size;
+  return row_inside && col_inside;
+}
+
+// Adds the temporal motion vector stored in cm->tpl_mvs for the sample at
+// (blk_row, blk_col), relative to the current block, to the reference MV
+// stack.
+//
+// The stored vector (mfmv0) is projected with get_mv_projection() onto the
+// distance between the current frame and each reference in ref_frame, then
+// reduced in precision with lower_mv_precision(). A projected MV (or MV pair
+// for compound references) that is already on the stack has its weight
+// increased; otherwise it is appended while the stack holds fewer than
+// MAX_REF_MV_STACK_SIZE entries.
+//
+// For the sample at (0, 0) only, the GLOBALMV_OFFSET bit of
+// mode_context[ref_frame] is set when a projected MV differs from the
+// corresponding global motion candidate by 16 or more in either component.
+//
+// Returns 0 if the sample position is outside the tile or holds INVALID_MV,
+// and 1 otherwise.
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+ int blk_row, int blk_col, int_mv *gm_mv_candidates,
+ uint8_t *const refmv_count,
+ CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+ int16_t *mode_context) {
+ POSITION mi_pos;
+ // The sample offset is bumped by one when the block starts at an even
+ // row/column.
+ mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+ mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
+
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
+
+ // cm->tpl_mvs is indexed at half the mode-info resolution in both
+ // directions (positions and stride are shifted right by one).
+ const TPL_MV_REF *prev_frame_mvs =
+ cm->tpl_mvs +
+ ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) +
+ ((mi_col + mi_pos.col) >> 1);
+ if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0;
+
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8];
+ const int cur_frame_index = cm->cur_frame->order_hint;
+ const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
+ const int frame0_index = buf_0->order_hint;
+ const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info,
+ cur_frame_index, frame0_index);
+ int idx;
+ const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
+ const int force_integer_mv = cm->features.cur_frame_force_integer_mv;
+
+ // Project the stored MV onto the distance to the first reference.
+ int_mv this_refmv;
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv,
+ force_integer_mv);
+
+ if (rf[1] == NONE_FRAME) {
+ // Single reference.
+ if (blk_row == 0 && blk_col == 0) {
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+ }
+
+ for (idx = 0; idx < *refmv_count; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+
+ if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
+
+ if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_weight[idx] = 2 * weight_unit;
+ ++(*refmv_count);
+ }
+ } else {
+ // Process compound inter mode
+ // The same stored MV is projected onto the distance to the second
+ // reference as well.
+ const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
+ const int frame1_index = buf_1->order_hint;
+ const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info,
+ cur_frame_index, frame1_index);
+ int_mv comp_refmv;
+ get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_1, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv,
+ force_integer_mv);
+
+ if (blk_row == 0 && blk_col == 0) {
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+ abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+ abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+ }
+
+ for (idx = 0; idx < *refmv_count; ++idx) {
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+ comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+ break;
+ }
+
+ if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit;
+
+ if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
+ ref_mv_weight[idx] = 2 * weight_unit;
+ ++(*refmv_count);
+ }
+ }
+
+ return 1;
+}
+
+// Collects the motion vectors of one neighbouring block into two lists per
+// target reference rf[cmp_idx]:
+//  - ref_id[cmp_idx]: MVs whose reference frame equals rf[cmp_idx];
+//  - ref_diff[cmp_idx]: MVs of any other inter reference (> INTRA_FRAME),
+//    negated when the sign bias of that reference differs from the sign bias
+//    of rf[cmp_idx].
+// Each list holds at most two entries; ref_id_count / ref_diff_count track how
+// many are in use and are updated here. Note that once ref_id[cmp_idx] is
+// full, a further MV with a matching reference falls through to the ref_diff
+// branch.
+static AOM_INLINE void process_compound_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+ int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+ // Outer loop: the candidate's own references; inner loop: the two target
+ // references.
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+ for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+ if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+ ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+ ++ref_id_count[cmp_idx];
+ } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[can_rf] !=
+ cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+ ++ref_diff_count[cmp_idx];
+ }
+ }
+ }
+}
+
+// Adds the motion vectors of one neighbouring block to the reference MV
+// stack of a single-reference block, regardless of which inter reference
+// the neighbour used.
+//
+// Each MV of the candidate that belongs to an inter reference (>
+// INTRA_FRAME) is negated when the sign bias of its reference differs from
+// that of ref_frame, and is then appended with weight 2 unless an identical
+// MV is already on the stack. Existing entries are left untouched.
+//
+// Like add_ref_mv_candidate() and add_tpl_ref_mv(), nothing is appended once
+// the stack holds MAX_REF_MV_STACK_SIZE entries, so the arrays can never be
+// written past their declared size.
+static AOM_INLINE void process_single_ref_mv_candidate(
+    const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+    MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count,
+    CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+    uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) {
+  for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+    if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+      int_mv this_mv = candidate->mv[rf_idx];
+      if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+          cm->ref_frame_sign_bias[ref_frame]) {
+        this_mv.as_mv.row = -this_mv.as_mv.row;
+        this_mv.as_mv.col = -this_mv.as_mv.col;
+      }
+      // Look for an identical MV already on the stack.
+      int stack_idx;
+      for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) {
+        const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv;
+        if (this_mv.as_int == stack_mv.as_int) break;
+      }
+
+      // Not found: append, but only while there is room left.
+      if (stack_idx == *refmv_count &&
+          *refmv_count < MAX_REF_MV_STACK_SIZE) {
+        ref_mv_stack[stack_idx].this_mv = this_mv;
+
+        // TODO(jingning): Set an arbitrary small number here. The weight
+        // doesn't matter as long as it is properly initialized.
+        ref_mv_weight[stack_idx] = 2;
+        ++(*refmv_count);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void setup_ref_mv_list(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
+ uint8_t *const refmv_count,
+ CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+ int mi_row, int mi_col, int16_t *mode_context) {
+ const int bs = AOMMAX(xd->width, xd->height);
+ const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
+ MV_REFERENCE_FRAME rf[2];
+
+ const TileInfo *const tile = &xd->tile;
+ int max_row_offset = 0, max_col_offset = 0;
+ const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ int processed_rows = 0;
+ int processed_cols = 0;
+
+ av1_set_ref_frame(rf, ref_frame);
+ mode_context[ref_frame] = 0;
+ *refmv_count = 0;
+
+ // Find valid maximum row/col offset.
+ if (xd->up_available) {
+ max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
+
+ if (xd->height < mi_size_high[BLOCK_8X8])
+ max_row_offset = -(2 << 1) + row_adj;
+
+ max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
+ }
+
+ if (xd->left_available) {
+ max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
+
+ if (xd->width < mi_size_wide[BLOCK_8X8])
+ max_col_offset = -(2 << 1) + col_adj;
+
+ max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
+ }
+
+ uint8_t col_match_count = 0;
+ uint8_t row_match_count = 0;
+ uint8_t newmv_count = 0;
+
+ // Scan the first above row mode info. row_offset = -1;
+ if (abs(max_row_offset) >= 1)
+ scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight,
+ refmv_count, &row_match_count, &newmv_count, gm_mv_candidates,
+ max_row_offset, &processed_rows);
+ // Scan the first left column mode info. col_offset = -1;
+ if (abs(max_col_offset) >= 1)
+ scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight,
+ refmv_count, &col_match_count, &newmv_count, gm_mv_candidates,
+ max_col_offset, &processed_cols);
+ // Check top-right boundary
+ if (has_tr)
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack,
+ ref_mv_weight, &row_match_count, &newmv_count,
+ gm_mv_candidates, refmv_count);
+
+ const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t nearest_refmv_count = *refmv_count;
+
+ // TODO(yunqing): for comp_search, do it for all 3 cases.
+ for (int idx = 0; idx < nearest_refmv_count; ++idx)
+ ref_mv_weight[idx] += REF_CAT_LEVEL;
+
+ if (cm->features.allow_ref_frame_mvs) {
+ int is_available = 0;
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width);
+ const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]);
+
+ const int tpl_sample_pos[3][2] = {
+ { voffset, -2 },
+ { voffset, hoffset },
+ { voffset - 2, hoffset },
+ };
+ const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) &&
+ (xd->height < mi_size_high[BLOCK_64X64]) &&
+ (xd->width >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->width < mi_size_wide[BLOCK_64X64]);
+
+ const int step_h = (xd->height >= mi_size_high[BLOCK_64X64])
+ ? mi_size_high[BLOCK_16X16]
+ : mi_size_high[BLOCK_8X8];
+ const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64])
+ ? mi_size_wide[BLOCK_16X16]
+ : mi_size_wide[BLOCK_8X8];
+
+ for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
+ for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
+ int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
+ blk_col, gm_mv_candidates, refmv_count,
+ ref_mv_stack, ref_mv_weight, mode_context);
+ if (blk_row == 0 && blk_col == 0) is_available = ret;
+ }
+ }
+
+ if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ for (int i = 0; i < 3 && allow_extension; ++i) {
+ const int blk_row = tpl_sample_pos[i][0];
+ const int blk_col = tpl_sample_pos[i][1];
+
+ if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
+ add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
+ gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight,
+ mode_context);
+ }
+ }
+
+ uint8_t dummy_newmv_count = 0;
+
+ // Scan the second outer area.
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight,
+ &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+ refmv_count);
+
+ for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
+ const int row_offset = -(idx << 1) + 1 + row_adj;
+ const int col_offset = -(idx << 1) + 1 + col_adj;
+
+ if (abs(row_offset) <= abs(max_row_offset) &&
+ abs(row_offset) > processed_rows)
+ scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight,
+ refmv_count, &row_match_count, &dummy_newmv_count,
+ gm_mv_candidates, max_row_offset, &processed_rows);
+
+ if (abs(col_offset) <= abs(max_col_offset) &&
+ abs(col_offset) > processed_cols)
+ scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight,
+ refmv_count, &col_match_count, &dummy_newmv_count,
+ gm_mv_candidates, max_col_offset, &processed_cols);
+ }
+
+ const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+
+ switch (nearest_match) {
+ case 0:
+ if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+ break;
+ case 1:
+ mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+ break;
+ case 2:
+ default:
+ if (newmv_count >= 1)
+ mode_context[ref_frame] |= 4;
+ else
+ mode_context[ref_frame] |= 5;
+
+ mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+ break;
+ }
+
+ // Rank the likelihood and assign nearest and near mvs.
+ int len = nearest_refmv_count;
+ while (len > 0) {
+ int nr_len = 0;
+ for (int idx = 1; idx < len; ++idx) {
+ if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+ const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+ const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+ ref_mv_weight[idx] = tmp_ref_mv_weight;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ len = *refmv_count;
+ while (len > nearest_refmv_count) {
+ int nr_len = nearest_refmv_count;
+ for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
+ if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) {
+ const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1];
+ const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1];
+ ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+ ref_mv_stack[idx] = tmp_mv;
+ ref_mv_weight[idx - 1] = ref_mv_weight[idx];
+ ref_mv_weight[idx] = tmp_ref_mv_weight;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width);
+ mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height);
+ mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row);
+ const int mi_size = AOMMIN(mi_width, mi_height);
+ if (rf[1] > NONE_FRAME) {
+ // TODO(jingning, yunqing): Refactor and consolidate the compound and
+ // single reference frame modes. Reduce unnecessary redundancy.
+ if (*refmv_count < MAX_MV_REF_CANDIDATES) {
+ int_mv ref_id[2][2], ref_diff[2][2];
+ int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
+
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_wide[candidate->bsize];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_high[candidate->bsize];
+ }
+
+ // Build up the compound mv predictor
+ int_mv comp_list[MAX_MV_REF_CANDIDATES][2];
+
+ for (int idx = 0; idx < 2; ++idx) {
+ int comp_idx = 0;
+ for (int list_idx = 0;
+ list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_id[idx][list_idx];
+ for (int list_idx = 0;
+ list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
+ for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx)
+ comp_list[comp_idx][idx] = gm_mv_candidates[idx];
+ }
+
+ if (*refmv_count) {
+ assert(*refmv_count == 1);
+ if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int &&
+ comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) {
+ ref_mv_stack[*refmv_count].this_mv = comp_list[1][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1];
+ } else {
+ ref_mv_stack[*refmv_count].this_mv = comp_list[0][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1];
+ }
+ ref_mv_weight[*refmv_count] = 2;
+ ++*refmv_count;
+ } else {
+ for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+ ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0];
+ ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1];
+ ref_mv_weight[*refmv_count] = 2;
+ ++*refmv_count;
+ }
+ }
+ }
+
+ assert(*refmv_count >= 2);
+
+ for (int idx = 0; idx < *refmv_count; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
+ clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
+ }
+ } else {
+ // Handle single reference frame extension
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
+ *refmv_count < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack, ref_mv_weight);
+ idx += mi_size_wide[candidate->bsize];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
+ *refmv_count < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack, ref_mv_weight);
+ idx += mi_size_high[candidate->bsize];
+ }
+
+ for (int idx = 0; idx < *refmv_count; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2,
+ xd->height << MI_SIZE_LOG2, xd);
+ }
+
+ if (mv_ref_list != NULL) {
+ for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx)
+ mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int;
+
+ for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count);
+ ++idx) {
+ mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+ }
+ }
+ }
+}
+
+// Builds the reference MV candidate list for `ref_frame`.
+//
+// The global-motion candidates gm_mv[0..1] are derived first:
+//  - INTRA_FRAME: both are zero, and global_mvs[ref_frame] is set to
+//    INVALID_MV when global_mvs is non-NULL.
+//  - ref_frame < REF_FRAMES: gm_mv[0] comes from gm_get_motion_vector(),
+//    gm_mv[1] is zero, and gm_mv[0] is also stored in global_mvs[ref_frame]
+//    when global_mvs is non-NULL.
+//  - otherwise: ref_frame is split with av1_set_ref_frame() and one global MV
+//    is derived per component; global_mvs is not written.
+// Everything is then forwarded to setup_ref_mv_list(), using the
+// ref_frame-th entry of each per-reference array. mv_ref_list may be NULL.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int16_t *mode_context) {
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int_mv gm_mv[2];
+
+  if (ref_frame == INTRA_FRAME) {
+    gm_mv[1].as_int = 0;
+    gm_mv[0].as_int = 0;
+    if (global_mvs != NULL) global_mvs[ref_frame].as_int = INVALID_MV;
+  } else if (ref_frame < REF_FRAMES) {
+    // `mi` is only dereferenced on the inter paths.
+    gm_mv[0] = gm_get_motion_vector(
+        &cm->global_motion[ref_frame], cm->features.allow_high_precision_mv,
+        mi->bsize, mi_col, mi_row, cm->features.cur_frame_force_integer_mv);
+    gm_mv[1].as_int = 0;
+    if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0];
+  } else {
+    MV_REFERENCE_FRAME rf[2];
+    av1_set_ref_frame(rf, ref_frame);
+    for (int i = 0; i < 2; ++i) {
+      gm_mv[i] = gm_get_motion_vector(
+          &cm->global_motion[rf[i]], cm->features.allow_high_precision_mv,
+          mi->bsize, mi_col, mi_row, cm->features.cur_frame_force_integer_mv);
+    }
+  }
+
+  int_mv *const this_mv_ref_list =
+      (mv_ref_list != NULL) ? mv_ref_list[ref_frame] : NULL;
+  setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame],
+                    ref_mv_stack[ref_frame], ref_mv_weight[ref_frame],
+                    this_mv_ref_list, gm_mv, mi_row, mi_col, mode_context);
+}
+
+// Lowers the precision of the first MAX_MV_REF_CANDIDATES entries of `mvlist`
+// in place (via lower_mv_precision() with `allow_hp` / `is_integer`), then
+// returns entry 0 through `nearest_mv` and entry 1 through `near_mv`.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv, int is_integer) {
+  int_mv *const list_end = mvlist + MAX_MV_REF_CANDIDATES;
+
+  for (int_mv *cand = mvlist; cand != list_end; ++cand)
+    lower_mv_precision(&cand->as_mv, allow_hp, is_integer);
+
+  *nearest_mv = mvlist[0];
+  *near_mv = mvlist[1];
+}
+
+// Copies the current frame's order hint, display order hint and pyramid
+// level into cm->cur_frame, then records the order hint and display order
+// hint of every available reference (LAST_FRAME..ALTREF_FRAME) in the
+// current frame buffer. Entries for references without a buffer are left
+// untouched.
+void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
+  cm->cur_frame->order_hint = cm->current_frame.order_hint;
+  cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
+  cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level;
+
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf == NULL) continue;
+
+    const int slot = ref_frame - LAST_FRAME;
+    cm->cur_frame->ref_order_hints[slot] = buf->order_hint;
+    cm->cur_frame->ref_display_order_hint[slot] = buf->display_order_hint;
+  }
+}
+
+// Sets cm->ref_frame_sign_bias[] for LAST_FRAME..ALTREF_FRAME.
+// The bias is 1 when order hints are enabled, the reference has a buffer and
+// its order hint is ahead of the current frame's (relative distance > 0);
+// it is 0 in every other case.
+void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    int sign_bias = 0;
+
+    if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) {
+      const int ref_order_hint = buf->order_hint;
+      const int dist =
+          get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint,
+                            (int)cm->current_frame.order_hint);
+      if (dist > 0) sign_bias = 1;
+    }
+
+    cm->ref_frame_sign_bias[ref_frame] = sign_bias;
+  }
+}
+
+#define MAX_OFFSET_WIDTH 64
+#define MAX_OFFSET_HEIGHT 0
+
+// Computes the position reached from (blk_row, blk_col) when displaced by
+// `mv` scaled down by (4 + MI_SIZE_LOG2) bits; sign_bias == 1 reverses the
+// displacement. On success the position is stored in *mi_r / *mi_c and 1 is
+// returned. Returns 0, leaving the outputs untouched, when the position is
+// outside half the frame's mi dimensions or outside the window around the
+// 8-aligned base block allowed by MAX_OFFSET_WIDTH / MAX_OFFSET_HEIGHT.
+static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
+                              int blk_col, MV mv, int sign_bias) {
+  const int shift = 4 + MI_SIZE_LOG2;
+
+  // Shift the magnitude and restore the sign afterwards, so that negative
+  // components are truncated toward zero as well.
+  int row_offset = ((mv.row >= 0) ? mv.row : -mv.row) >> shift;
+  if (mv.row < 0) row_offset = -row_offset;
+  int col_offset = ((mv.col >= 0) ? mv.col : -mv.col) >> shift;
+  if (mv.col < 0) col_offset = -col_offset;
+
+  if (sign_bias == 1) {
+    row_offset = -row_offset;
+    col_offset = -col_offset;
+  }
+  const int row = blk_row + row_offset;
+  const int col = blk_col + col_offset;
+
+  // Frame bounds check.
+  const int inside_frame = row >= 0 && row < (cm->mi_params.mi_rows >> 1) &&
+                           col >= 0 && col < (cm->mi_params.mi_cols >> 1);
+  if (!inside_frame) return 0;
+
+  // Window check around the 8-aligned block containing the start position.
+  const int base_blk_row = (blk_row >> 3) << 3;
+  const int base_blk_col = (blk_col >> 3) << 3;
+  const int row_margin = MAX_OFFSET_HEIGHT >> 3;
+  const int col_margin = MAX_OFFSET_WIDTH >> 3;
+  const int inside_window = row >= base_blk_row - row_margin &&
+                            row < base_blk_row + 8 + row_margin &&
+                            col >= base_blk_col - col_margin &&
+                            col < base_blk_col + 8 + col_margin;
+  if (!inside_window) return 0;
+
+  *mi_r = row;
+  *mi_c = col;
+  return 1;
+}
+
+// motion_field_projection takes the motion vectors stored with one of the
+// current frame's reference frames and projects them onto the current frame,
+// writing the result into cm->tpl_mvs.
+// Terminology used below:
+//  - "start frame": the current frame's reference frame `start_frame`.
+//  - "reference frames": the start frame's own reference frames.
+//  - ref_offset[]: frame distances between the start frame and each of its
+//    reference frames.
+// dir == 2 negates the start-to-current distance, and (dir >> 1) is passed to
+// get_block_position() as its sign_bias.
+// Returns 0 without projecting anything if the start frame has no buffer, is
+// a key or intra-only frame, or has mi dimensions that differ from the
+// current frame's; returns 1 otherwise.
+static int motion_field_projection(AV1_COMMON *cm,
+ MV_REFERENCE_FRAME start_frame, int dir) {
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int ref_offset[REF_FRAMES] = { 0 };
+
+ const RefCntBuffer *const start_frame_buf =
+ get_ref_frame_buf(cm, start_frame);
+ if (start_frame_buf == NULL) return 0;
+
+ if (start_frame_buf->frame_type == KEY_FRAME ||
+ start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+ return 0;
+
+ if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
+ start_frame_buf->mi_cols != cm->mi_params.mi_cols)
+ return 0;
+
+ const int start_frame_order_hint = start_frame_buf->order_hint;
+ const unsigned int *const ref_order_hints =
+ &start_frame_buf->ref_order_hints[0];
+ const int cur_order_hint = cm->cur_frame->order_hint;
+ int start_to_current_frame_offset = get_relative_dist(
+ &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint);
+
+ // Distance from the start frame to each of its own references.
+ for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+ ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info,
+ start_frame_order_hint,
+ ref_order_hints[rf - LAST_FRAME]);
+ }
+
+ if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
+
+ // The stored MV field is walked at half the mi resolution (rounded up).
+ MV_REF *mv_ref_base = start_frame_buf->mvs;
+ const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
+ const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
+
+ for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+ for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+ MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+ MV fwd_mv = mv_ref->mv.as_mv;
+
+ // Only entries that reference an inter frame are projected.
+ if (mv_ref->ref_frame > INTRA_FRAME) {
+ int_mv this_mv;
+ int mi_r, mi_c;
+ const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
+
+ // Both distances must be within MAX_FRAME_DISTANCE and the distance to
+ // the start frame's reference must be positive.
+ int pos_valid =
+ abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+ ref_frame_offset > 0 &&
+ abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
+
+ if (pos_valid) {
+ get_mv_projection(&this_mv.as_mv, fwd_mv,
+ start_to_current_frame_offset, ref_frame_offset);
+ pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+ this_mv.as_mv, dir >> 1);
+ }
+
+ if (pos_valid) {
+ const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
+
+ // The original (unprojected) MV and its frame offset are stored at the
+ // projected position.
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+ tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
+ }
+ }
+ }
+ }
+
+ return 1;
+}
+
+// Computes cm->ref_frame_side[], which av1_copy_frame_mvs() later uses to
+// decide how mvs are copied.
+// All entries are cleared first; nothing else happens when order hints are
+// disabled. Otherwise, for each of LAST_FRAME..ALTREF_FRAME the entry becomes
+// 1 if the reference's order hint is ahead of the current frame's, -1 if the
+// two order hints are equal, and stays 0 in the remaining case. A reference
+// without a buffer is treated as having order hint 0.
+void av1_calculate_ref_frame_side(AV1_COMMON *cm) {
+  const OrderHintInfo *const oh_info = &cm->seq_params->order_hint_info;
+
+  memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
+  if (!oh_info->enable_order_hint) return;
+
+  const int cur_order_hint = cm->cur_frame->order_hint;
+
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    int ref_hint = 0;
+    if (buf != NULL) ref_hint = buf->order_hint;
+
+    if (get_relative_dist(oh_info, ref_hint, cur_order_hint) > 0) {
+      cm->ref_frame_side[ref_frame] = 1;
+    } else if (ref_hint == cur_order_hint) {
+      cm->ref_frame_side[ref_frame] = -1;
+    }
+  }
+}
+
+// Resets cm->tpl_mvs and then projects the motion fields of selected
+// reference frames onto the current frame with motion_field_projection().
+// Does nothing when order hints are disabled.
+// ref_stamp acts as a remaining-budget counter that starts at
+// MFMV_STACK_SIZE - 1; the ALTREF_FRAME and LAST2_FRAME projections are only
+// attempted while it is still non-negative.
+void av1_setup_motion_field(AV1_COMMON *cm) {
+ const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
+
+ if (!order_hint_info->enable_order_hint) return;
+
+ // Mark every entry of the projected motion field as invalid.
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
+ (cm->mi_params.mi_stride >> 1);
+ for (int idx = 0; idx < size; ++idx) {
+ tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
+ tpl_mvs_base[idx].ref_frame_offset = 0;
+ }
+
+ const int cur_order_hint = cm->cur_frame->order_hint;
+ const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
+ int ref_order_hint[INTER_REFS_PER_FRAME];
+
+ // Collect the buffer and order hint of every inter reference; a missing
+ // buffer is recorded as NULL with order hint 0.
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ const int ref_idx = ref_frame - LAST_FRAME;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ int order_hint = 0;
+
+ if (buf != NULL) order_hint = buf->order_hint;
+
+ ref_buf[ref_idx] = buf;
+ ref_order_hint[ref_idx] = order_hint;
+ }
+
+ int ref_stamp = MFMV_STACK_SIZE - 1;
+
+ // LAST_FRAME: the projection is skipped when LAST's ALTREF order hint equals
+ // the current GOLDEN order hint, but the budget is decremented whenever a
+ // LAST buffer exists.
+ if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) {
+ const int alt_of_lst_order_hint =
+ ref_buf[LAST_FRAME - LAST_FRAME]
+ ->ref_order_hints[ALTREF_FRAME - LAST_FRAME];
+
+ const int is_lst_overlay =
+ (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
+ if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
+ --ref_stamp;
+ }
+
+ // BWDREF_FRAME, ALTREF2_FRAME and ALTREF_FRAME are each projected only when
+ // their order hint is ahead of the current frame (relative distance > 0);
+ // the budget is decremented only when the projection returns 1.
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[BWDREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(order_hint_info,
+ ref_order_hint[ALTREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0 &&
+ ref_stamp >= 0)
+ if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
+
+ // LAST2_FRAME is used only if budget remains.
+ if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
+}
+
+// Records one sample for the neighboring block `mbmi`.
+// The sample position is (col_offset, row_offset) scaled by MI_SIZE, plus
+// half of the neighbor's width / height signed by sign_c / sign_r, minus 1.
+// It is written to pts[0..1] as (x, y) after conversion with
+// GET_MV_SUBPEL(); pts_inref[0..1] receives the same point displaced by the
+// neighbor's mv[0].
+static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts,
+                                  int *pts_inref, int row_offset, int sign_r,
+                                  int col_offset, int sign_c) {
+  const int nb_width = block_size_wide[mbmi->bsize];
+  const int nb_height = block_size_high[mbmi->bsize];
+  const int half_w = sign_c * nb_width / 2;
+  const int half_h = sign_r * nb_height / 2;
+  const int x = col_offset * MI_SIZE + half_w - 1;
+  const int y = row_offset * MI_SIZE + half_h - 1;
+
+  pts[0] = GET_MV_SUBPEL(x);
+  pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col;
+
+  pts[1] = GET_MV_SUBPEL(y);
+  pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row;
+}
+
+// Filters the `len` samples in pts / pts_inref by motion vector difference.
+// A sample is kept when |dx - mv->col| + |dy - mv->row| does not exceed a
+// threshold, where (dx, dy) = pts_inref - pts for that sample and the
+// threshold is max(block width, block height) clamped to [16, 112].
+// Kept samples are compacted to the front of both arrays in their original
+// order. Returns the number of kept samples, but never less than 1.
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize) {
+  const int thresh =
+      clamp(AOMMAX(block_size_wide[bsize], block_size_high[bsize]), 16, 112);
+  uint8_t kept = 0;
+  assert(len <= LEAST_SQUARES_SAMPLES_MAX);
+
+  for (int i = 0; i < len; ++i) {
+    const int col_diff = pts_inref[2 * i] - pts[2 * i] - mv->col;
+    const int row_diff = pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row;
+
+    if (abs(col_diff) + abs(row_diff) <= thresh) {
+      // Move the sample down only if an earlier one was dropped.
+      if (kept != i) {
+        memcpy(pts + 2 * kept, pts + 2 * i, 2 * sizeof(pts[0]));
+        memcpy(pts_inref + 2 * kept, pts_inref + 2 * i,
+               2 * sizeof(pts_inref[0]));
+      }
+      ++kept;
+    }
+  }
+
+  // Keep at least 1 sample.
+  return AOMMAX(kept, 1);
+}
+
+// Collects samples from the neighbors of the current block: the nearest
+// above row, the nearest left column, the top-left block and the top-right
+// block, in that order. A neighbor contributes only if it uses the same
+// single reference frame as the current block (ref_frame[0] matches and
+// ref_frame[1] == NONE_FRAME).
+// Note: Samples returned are at 1/8-pel precision
+// Samples are the center points of the neighboring blocks, given as
+// coordinates relative to the top-left pixel of the current block.
+// Each sample occupies two consecutive ints in pts and in pts_inref.
+// Returns the number of samples found, at most LEAST_SQUARES_SAMPLES_MAX.
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+ int *pts_inref) {
+ const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+ const int ref_frame = mbmi0->ref_frame[0];
+ const int up_available = xd->up_available;
+ const int left_available = xd->left_available;
+ uint8_t np = 0;
+ // do_tl / do_tr are cleared below when the above / left scans show that the
+ // top-left / top-right block should not be sampled.
+ int do_tl = 1;
+ int do_tr = 1;
+ const int mi_stride = xd->mi_stride;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // scan the nearest above rows
+ if (up_available) {
+ const int mi_row_offset = -1;
+ const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+ // Despite its name, this holds the width of the above neighbor block.
+ uint8_t superblock_width = mi_size_wide[mbmi->bsize];
+
+ if (xd->width <= superblock_width) {
+ // Handle "current block width <= above block width" case.
+ // col_offset is in (-superblock_width, 0].
+ const int col_offset = -mi_col % superblock_width;
+
+ if (col_offset < 0) do_tl = 0;
+ if (col_offset + superblock_width > xd->width) do_tr = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+ pts += 2;
+ pts_inref += 2;
+ if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+ // Handle "current block width > above block width" case.
+ // Step across the above row one neighbor block at a time, stopping at
+ // the frame's right edge.
+ for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+ i += superblock_width) {
+ mbmi = xd->mi[i + mi_row_offset * mi_stride];
+ superblock_width = mi_size_wide[mbmi->bsize];
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+ pts += 2;
+ pts_inref += 2;
+ if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+ return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // scan the nearest left columns
+ if (left_available) {
+ const int mi_col_offset = -1;
+ const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+ // Despite its name, this holds the height of the left neighbor block.
+ uint8_t superblock_height = mi_size_high[mbmi->bsize];
+
+ if (xd->height <= superblock_height) {
+ // Handle "current block height <= left block height" case.
+ // row_offset is in (-superblock_height, 0].
+ const int row_offset = -mi_row % superblock_height;
+
+ if (row_offset < 0) do_tl = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+ // Handle "current block height > left block height" case.
+ // Step down the left column one neighbor block at a time, stopping at
+ // the frame's bottom edge.
+ for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+ i += superblock_height) {
+ mbmi = xd->mi[mi_col_offset + i * mi_stride];
+ superblock_height = mi_size_high[mbmi->bsize];
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+ return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-left block
+ if (do_tl && left_available && up_available) {
+ const int mi_row_offset = -1;
+ const int mi_col_offset = -1;
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-right block
+ if (do_tr &&
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
+ const POSITION trb_pos = { -1, xd->width };
+ const TileInfo *const tile = &xd->tile;
+ if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
+ const int mi_row_offset = -1;
+ const int mi_col_offset = xd->width;
+ const MB_MODE_INFO *mbmi =
+ xd->mi[mi_col_offset + mi_row_offset * mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ // Last possible sample, so pts / pts_inref are not advanced afterwards.
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
+ if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ return np;
+}
+
+// Decides whether skip mode is allowed for the current frame and which two
+// reference indices it uses; the result is written to
+// cm->current_frame.skip_mode_info.
+// Skip mode stays disallowed (with both indices INVALID_IDX) when order hints
+// are disabled, the frame is intra-only, or the reference mode is
+// SINGLE_REFERENCE. Otherwise it is allowed when either a forward and a
+// backward reference exist, or two distinct forward references exist.
+// The stored indices are offsets from LAST_FRAME, smaller one first.
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
+ const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
+ SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+
+ skip_mode_info->skip_mode_allowed = 0;
+ skip_mode_info->ref_frame_idx_0 = INVALID_IDX;
+ skip_mode_info->ref_frame_idx_1 = INVALID_IDX;
+
+ if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) ||
+ cm->current_frame.reference_mode == SINGLE_REFERENCE)
+ return;
+
+ const int cur_order_hint = cm->current_frame.order_hint;
+ // Slot 0 tracks the forward reference (-1 = none found yet), slot 1 the
+ // backward reference (INT_MAX = none found yet).
+ int ref_order_hints[2] = { -1, INT_MAX };
+ int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
+
+ // Identify the nearest forward and backward references.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+ if (buf == NULL) continue;
+
+ const int ref_order_hint = buf->order_hint;
+ if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) <
+ 0) {
+ // Forward reference
+ if (ref_order_hints[0] == -1 ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[0]) > 0) {
+ ref_order_hints[0] = ref_order_hint;
+ ref_idx[0] = i;
+ }
+ } else if (get_relative_dist(order_hint_info, ref_order_hint,
+ cur_order_hint) > 0) {
+ // Backward reference
+ if (ref_order_hints[1] == INT_MAX ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[1]) < 0) {
+ ref_order_hints[1] = ref_order_hint;
+ ref_idx[1] = i;
+ }
+ }
+ }
+
+ if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
+ // == Bi-directional prediction ==
+ skip_mode_info->skip_mode_allowed = 1;
+ skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
+ // == Forward prediction only ==
+ // Identify the second nearest forward reference.
+ // Slot 1 is reused for it, with -1 again meaning "none found yet".
+ ref_order_hints[1] = -1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+ if (buf == NULL) continue;
+
+ const int ref_order_hint = buf->order_hint;
+ // Candidate must be strictly behind the nearest forward reference and
+ // ahead of the best second candidate found so far.
+ if ((ref_order_hints[0] != -1 &&
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[0]) < 0) &&
+ (ref_order_hints[1] == -1 ||
+ get_relative_dist(order_hint_info, ref_order_hint,
+ ref_order_hints[1]) > 0)) {
+ // Second closest forward reference
+ ref_order_hints[1] = ref_order_hint;
+ ref_idx[1] = i;
+ }
+ }
+ if (ref_order_hints[1] != -1) {
+ skip_mode_info->skip_mode_allowed = 1;
+ skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ }
+ }
+}
+
+// One entry per slot of the reference frame map, used by
+// av1_set_frame_refs() to sort the slots. sort_idx is -1 for a slot that has
+// no buffer.
+typedef struct {
+ int map_idx; // frame map index
+ RefCntBuffer *buf; // frame buffer
+ int sort_idx; // index based on the offset to be used for sorting
+} REF_FRAME_INFO;
+
+// qsort() comparator for REF_FRAME_INFO entries.
+// Compares the sort_idx fields. If they are equal, then compares the map_idx
+// fields to break the tie. This ensures a stable sort.
+// Returns a negative, zero or positive value following the qsort()
+// comparator convention.
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
+  // Cast to a pointer-to-const so the qualifier of the incoming
+  // const void * is not discarded.
+  const REF_FRAME_INFO *const info_a = (const REF_FRAME_INFO *)arg_a;
+  const REF_FRAME_INFO *const info_b = (const REF_FRAME_INFO *)arg_b;
+
+  const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
+  if (sort_idx_diff != 0) return sort_idx_diff;
+  return info_a->map_idx - info_b->map_idx;
+}
+
+// Stores the frame map index of `ref_info` in remapped_ref_idx[frame_idx].
+// frame_idx must be in [0, INTER_REFS_PER_FRAME).
+static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
+                                          REF_FRAME_INFO *ref_info) {
+  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
+
+  const int map_idx = ref_info->map_idx;
+  remapped_ref_idx[frame_idx] = map_idx;
+}
+
+// Fills remapped_ref_idx[] (indexed by reference frame minus LAST_FRAME) with
+// frame map indices. LAST_FRAME and GOLDEN_FRAME use the given lst_map_idx
+// and gld_map_idx; the remaining references are chosen from the frame map
+// according to each buffer's order hint relative to the current frame.
+// Order hints must be enabled. aom_internal_error() is raised with
+// AOM_CODEC_CORRUPT_FRAME if LAST or GOLDEN is not a valid forward reference.
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx) {
+ int lst_frame_sort_idx = -1;
+ int gld_frame_sort_idx = -1;
+
+ assert(cm->seq_params->order_hint_info.enable_order_hint);
+ assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
+ const int cur_order_hint = (int)cm->current_frame.order_hint;
+ // Bias added to every relative distance to form sort_idx; the current frame
+ // itself would have exactly this sort_idx.
+ const int cur_frame_sort_idx =
+ 1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
+
+ REF_FRAME_INFO ref_frame_info[REF_FRAMES];
+ // ref_flag_list[i] becomes 1 once reference LAST_FRAME + i has been assigned.
+ int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
+
+ // Compute sort_idx for every frame map slot; slots without a buffer keep -1.
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ const int map_idx = i;
+
+ ref_frame_info[i].map_idx = map_idx;
+ ref_frame_info[i].sort_idx = -1;
+
+ RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
+ ref_frame_info[i].buf = buf;
+
+ if (buf == NULL) continue;
+ // If this assertion fails, there is a reference leak.
+ assert(buf->ref_count > 0);
+
+ const int offset = (int)buf->order_hint;
+ ref_frame_info[i].sort_idx =
+ (offset == -1) ? -1
+ : cur_frame_sort_idx +
+ get_relative_dist(&cm->seq_params->order_hint_info,
+ offset, cur_order_hint);
+ assert(ref_frame_info[i].sort_idx >= -1);
+
+ if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
+ if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
+ }
+
+ // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
+ // frames.
+ if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as LAST");
+ }
+ if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as GOLDEN");
+ }
+
+ // Sort ref frames based on their frame_offset values.
+ qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
+ compare_ref_frame_info);
+
+ // Identify forward and backward reference frames.
+ // Forward reference: offset < order_hint
+ // Backward reference: offset >= order_hint
+ int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
+
+ // After sorting, entries with sort_idx == -1 come first and are skipped by
+ // advancing fwd_start_idx; the first entry at or past the current frame
+ // ends the forward range.
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (ref_frame_info[i].sort_idx == -1) {
+ fwd_start_idx++;
+ continue;
+ }
+
+ if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
+ fwd_end_idx = i - 1;
+ break;
+ }
+ }
+
+ int bwd_start_idx = fwd_end_idx + 1;
+ int bwd_end_idx = REF_FRAMES - 1;
+
+ // === Backward Reference Frames ===
+
+ // == ALTREF_FRAME ==
+ // Takes the backward entry with the largest sort_idx.
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_end_idx]);
+ ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
+ bwd_end_idx--;
+ }
+
+ // == BWDREF_FRAME ==
+ // Takes the remaining backward entry with the smallest sort_idx.
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
+ bwd_start_idx++;
+ }
+
+ // == ALTREF2_FRAME ==
+ // Takes the next smallest remaining backward entry.
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
+ }
+
+ // === Forward Reference Frames ===
+
+ for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
+ // == LAST_FRAME ==
+ if (ref_frame_info[i].map_idx == lst_map_idx) {
+ set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
+ ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
+ }
+
+ // == GOLDEN_FRAME ==
+ if (ref_frame_info[i].map_idx == gld_map_idx) {
+ set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
+ }
+ }
+
+ assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
+
+ // == LAST2_FRAME ==
+ // == LAST3_FRAME ==
+ // == BWDREF_FRAME ==
+ // == ALTREF2_FRAME ==
+ // == ALTREF_FRAME ==
+
+ // Set up the reference frames in the anti-chronological order.
+ static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
+ LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
+ };
+
+ int ref_idx;
+ for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+
+ // Skip forward entries already used as LAST_FRAME or GOLDEN_FRAME.
+ while (fwd_start_idx <= fwd_end_idx &&
+ (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
+ ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
+ fwd_end_idx--;
+ }
+ // No unused forward entry left; the rest is handled by the loop below.
+ if (fwd_start_idx > fwd_end_idx) break;
+
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_end_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+
+ fwd_end_idx--;
+ }
+
+ // Assign all the remaining frame(s), if any, to the earliest reference
+ // frame.
+ for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_start_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ assert(ref_flag_list[i] == 1);
+ }
+}
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
new file mode 100644
index 0000000000..3ab784c1ed
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MVREF_ROW_COLS 3
+
+// Set the upper limit of the motion vector component magnitude.
+// This would make a motion vector fit in 26 bits. Plus 3 bits for the
+// reference frame index. A tuple of motion vector can hence be stored within
+// 32 bit range for efficient load/store operations.
+#define REFMVS_LIMIT ((1 << 12) - 1)
+
+// A (row, col) pair. In this header it is consumed by is_inside(), which adds
+// it to a block's (mi_row, mi_col) position before testing the tile bounds.
+typedef struct position {
+ int row;  // Row component of the offset.
+ int col;  // Column component of the offset.
+} POSITION;
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+
+// Returns the difference (a - b) between two order hints, wrapped into the
+// signed range representable with (order_hint_bits_minus_1 + 1) bits.
+// Returns 0 when order hints are disabled in 'oh'.
+static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
+  if (!oh->enable_order_hint) return 0;
+
+  const int num_bits = oh->order_hint_bits_minus_1 + 1;
+
+  assert(num_bits >= 1);
+  assert(a >= 0 && a < (1 << num_bits));
+  assert(b >= 0 && b < (1 << num_bits));
+
+  const int sign_bit = 1 << (num_bits - 1);
+  const int raw_diff = a - b;
+  // Keep the bits below the sign bit and subtract the weight of the sign bit,
+  // i.e. sign-extend the num_bits-wide difference.
+  const int low_bits = raw_diff & (sign_bit - 1);
+  const int high_bit = raw_diff & sign_bit;
+  return low_bits - high_bit;
+}
+
+// Clamps *mv to the limits derived from the mb_to_*_edge values stored in
+// 'xd', each widened by the block dimension (GET_MV_SUBPEL(bw) horizontally,
+// GET_MV_SUBPEL(bh) vertically) plus MV_BORDER.
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+  const int col_margin = GET_MV_SUBPEL(bw) + MV_BORDER;
+  const int row_margin = GET_MV_SUBPEL(bh) + MV_BORDER;
+  // Initializer order is kept as in the original: left, right, top, bottom.
+  const SubpelMvLimits limits = { xd->mb_to_left_edge - col_margin,
+                                  xd->mb_to_right_edge + col_margin,
+                                  xd->mb_to_top_edge - row_margin,
+                                  xd->mb_to_bottom_edge + row_margin };
+  clamp_mv(mv, &limits);
+}
+
+// Returns entry 'which_mv' of the candidate block's mv[] array.
+static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
+  const int_mv selected_mv = candidate->mv[which_mv];
+  return selected_mv;
+}
+
+// Returns 1 when the position (mi_row, mi_col) displaced by *mi_pos lies
+// within the tile, i.e. in [mi_row_start, mi_row_end) vertically and in
+// [mi_col_start, mi_col_end) horizontally; returns 0 otherwise.
+static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
+                            const POSITION *mi_pos) {
+  const int target_row = mi_row + mi_pos->row;
+  const int target_col = mi_col + mi_pos->col;
+  return target_row >= tile->mi_row_start &&
+         target_col >= tile->mi_col_start &&
+         target_row < tile->mi_row_end && target_col < tile->mi_col_end;
+}
+
+// Clamps 'row_offset' so that (mi_row + offset) stays within the tile rows
+// [mi_row_start, mi_row_end - 1].
+static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
+                                        int row_offset) {
+  const int min_offset = tile->mi_row_start - mi_row;
+  const int max_offset = tile->mi_row_end - mi_row - 1;
+  return clamp(row_offset, min_offset, max_offset);
+}
+
+// Clamps 'col_offset' so that (mi_col + offset) stays within the tile columns
+// [mi_col_start, mi_col_end - 1].
+static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
+                                        int col_offset) {
+  const int min_offset = tile->mi_col_start - mi_col;
+  const int max_offset = tile->mi_col_end - mi_col - 1;
+  return clamp(col_offset, min_offset, max_offset);
+}
+
+// Reduces the precision of *mv in place.
+//  - is_integer != 0: delegates to integer_mv_precision().
+//  - otherwise, when allow_hp == 0: every odd component is moved one unit
+//    towards zero so that it becomes even.
+//  - otherwise: *mv is left untouched.
+static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
+  if (is_integer) {
+    integer_mv_precision(mv);
+    return;
+  }
+  if (allow_hp) return;
+
+  if (mv->row & 1) {
+    const int row_step = (mv->row > 0) ? -1 : 1;
+    mv->row += row_step;
+  }
+  if (mv->col & 1) {
+    const int col_step = (mv->col > 0) ? -1 : 1;
+    mv->col += col_step;
+  }
+}
+
+// Returns the index i in [0, TOTAL_UNIDIR_COMP_REFS) for which
+// (comp_ref0(i), comp_ref1(i)) equals the pair (rf[0], rf[1]), or -1 when the
+// pair is a single reference, a bi-directional pair, or not in that list.
+static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
+  const MV_REFERENCE_FRAME first = rf[0];
+  const MV_REFERENCE_FRAME second = rf[1];
+
+  // Single ref pred
+  if (second <= INTRA_FRAME) return -1;
+
+  // Bi-directional comp ref pred
+  if (first < BWDREF_FRAME && second >= BWDREF_FRAME) return -1;
+
+  int8_t idx = 0;
+  while (idx < TOTAL_UNIDIR_COMP_REFS) {
+    if (first == comp_ref0(idx) && second == comp_ref1(idx)) return idx;
+    ++idx;
+  }
+  return -1;
+}
+
+// Maps the reference frame pair rf[0..1] to a single reference frame type:
+//  - single reference (rf[1] <= INTRA_FRAME): rf[0] itself;
+//  - pair found by get_uni_comp_ref_idx():
+//      REF_FRAMES + FWD_REFS * BWD_REFS + that index;
+//  - any other pair:
+//      REF_FRAMES + FWD_RF_OFFSET(rf[0]) + BWD_RF_OFFSET(rf[1]) * FWD_REFS.
+static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+  if (rf[1] <= INTRA_FRAME) return rf[0];
+
+  const int8_t uni_idx = get_uni_comp_ref_idx(rf);
+  if (uni_idx < 0) {
+    return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+           BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+  }
+
+  assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_idx) < MODE_CTX_REF_FRAMES);
+  return REF_FRAMES + FWD_REFS * BWD_REFS + uni_idx;
+}
+
+// Table of reference frame pairs. av1_set_ref_frame() indexes it with
+// (ref_frame_type - REF_FRAMES) and copies the two entries of the selected
+// row into rf[0] and rf[1].
+// clang-format off
+static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
+ { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME },
+ { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
+
+ { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME },
+ { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
+
+ { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME },
+ { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+
+ { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
+ { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+
+ // NOTE: Following reference frame pairs are not supported to be explicitly
+ // signalled, but they are possibly chosen by the use of skip_mode,
+ // which may use the most recent one-sided reference frame pair.
+ { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
+ { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME},
+ { ALTREF2_FRAME, ALTREF_FRAME }
+};
+// clang-format on
+
+// Expands a reference frame type into the reference frame pair rf[0..1].
+//  - ref_frame_type < REF_FRAMES: single reference; rf[0] receives the type
+//    itself and rf[1] is set to NONE_FRAME.
+//  - ref_frame_type >= REF_FRAMES: the pair is copied from row
+//    (ref_frame_type - REF_FRAMES) of ref_frame_map[].
+static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
+                                     MV_REFERENCE_FRAME ref_frame_type) {
+  if (ref_frame_type >= REF_FRAMES) {
+    const int comp_idx = ref_frame_type - REF_FRAMES;
+    // ref_frame_map[] has TOTAL_COMP_REFS rows; catch an out-of-range type in
+    // debug builds instead of silently reading past the end of the table.
+    assert(comp_idx < TOTAL_COMP_REFS);
+    rf[0] = ref_frame_map[comp_idx][0];
+    rf[1] = ref_frame_map[comp_idx][1];
+  } else {
+    assert(ref_frame_type > NONE_FRAME);
+    rf[0] = ref_frame_type;
+    rf[1] = NONE_FRAME;
+  }
+}
+
+// Lookup table read by av1_mode_context_analyzer(): the row is selected by
+// (refmv_ctx >> 1) and the column by min(newmv_ctx, COMP_NEWMV_CTXS - 1).
+static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
+ { 0, 1, 1, 1, 1 },
+ { 1, 2, 3, 4, 4 },
+ { 4, 4, 5, 6, 7 },
+};
+
+// Returns the mode context for the reference frame pair 'rf'.
+// For a single reference (rf[1] <= INTRA_FRAME) this is the stored
+// mode_context entry unchanged. Otherwise the newmv and refmv parts are
+// extracted from that entry and combined through compound_mode_ctx_map[].
+static INLINE int16_t av1_mode_context_analyzer(
+    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
+  const int8_t ref_frame = av1_ref_frame_type(rf);
+  const int16_t stored_ctx = mode_context[ref_frame];
+
+  if (rf[1] <= INTRA_FRAME) return stored_ctx;
+
+  const int16_t newmv_ctx = stored_ctx & NEWMV_CTX_MASK;
+  const int16_t refmv_ctx = (stored_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  const int map_row = refmv_ctx >> 1;
+  const int map_col = AOMMIN(newmv_ctx, COMP_NEWMV_CTXS - 1);
+  const int16_t comp_ctx = compound_mode_ctx_map[map_row][map_col];
+  return comp_ctx;
+}
+
+// Derives a context from the weights of entries ref_idx and ref_idx + 1,
+// each compared against REF_CAT_LEVEL:
+//   next entry >= level                      -> 0
+//   this entry >= level, next entry < level  -> 1
+//   both entries < level                     -> 2
+static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
+  const int cur_at_level = ref_mv_weight[ref_idx] >= REF_CAT_LEVEL;
+  const int next_at_level = ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL;
+
+  if (next_at_level) return 0;
+  return cur_at_level ? 1 : 2;
+}
+
+void av1_setup_frame_buf_refs(AV1_COMMON *cm);
+void av1_setup_frame_sign_bias(AV1_COMMON *cm);
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_calculate_ref_frame_side(AV1_COMMON *cm);
+void av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx);
+
+// Rebuilds xd->neighbors_ref_counts: the array is zeroed and then, for each
+// of the above and left neighbors that is available and is an inter block,
+// the counter of its first reference frame (and of its second one, if
+// has_second_ref() reports one) is incremented.
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const counts = xd->neighbors_ref_counts;
+
+  // Index 0: above neighbor, index 1: left neighbor.
+  const MB_MODE_INFO *const neighbors[2] = { xd->above_mbmi, xd->left_mbmi };
+  const int available[2] = { xd->up_available, xd->left_available };
+
+  for (int n = 0; n < 2; ++n) {
+    const MB_MODE_INFO *const nb = neighbors[n];
+    if (!available[n] || !is_inter_block(nb)) continue;
+
+    ++counts[nb->ref_frame[0]];
+    if (has_second_ref(nb)) ++counts[nb->ref_frame[1]];
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis);
+
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+ int_mv *global_mvs, int16_t *mode_context);
+
+// Check a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv, int is_integer);
+
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+ BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+ int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+// Fills *ref_dv with a default vector and converts it with
+// convert_fullmv_to_mv().
+//  - If the row mib_size above mi_row is still inside the tile, the vector
+//    points mib_size mi units straight up: (-MI_SIZE * mib_size, 0).
+//  - Otherwise it points left by (MI_SIZE * mib_size + INTRABC_DELAY_PIXELS)
+//    with a zero row component.
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row) {
+  const int row_above_in_tile = (mi_row - mib_size) >= tile->mi_row_start;
+
+  if (row_above_in_tile) {
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
+  } else {
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+  }
+  convert_fullmv_to_mv(ref_dv);
+}
+
+// Returns 1 when the vector 'dv' is accepted for the block of size 'bsize'
+// located at (mi_row, mi_col), and 0 otherwise.
+//
+// Edge positions are computed in pixels and multiplied by SCALE_PX_TO_MV (8)
+// before dv is added, so all edge comparisons are done in dv units.
+// The checks, in order:
+//  1. dv has no fractional part (both components are multiples of 8).
+//  2. The referenced block lies entirely inside the current tile (xd->tile).
+//  3. For sub-8x8 blocks with subsampled chroma, the referenced block keeps
+//     an extra 4-pixel distance from the tile's left/top edge.
+//  4. The referenced block ends at least INTRABC_DELAY_SB64 64-pixel units
+//     before the current position in the tile's raster order.
+//  5. The wavefront constraint on the superblock row/column of the reference.
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int mib_size_log2) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int SCALE_PX_TO_MV = 8;
+ // Disallow subpixel for now
+ // SUBPEL_MASK is not the correct scale
+ if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+ return 0;
+
+ const TileInfo *const tile = &xd->tile;
+ // Is the source top-left inside the current tile?
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
+ const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_top_edge < tile_top_edge) return 0;
+ const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
+ const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_left_edge < tile_left_edge) return 0;
+ // Is the bottom right inside the current tile?
+ const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
+ const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_bottom_edge > tile_bottom_edge) return 0;
+ const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
+ const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_right_edge > tile_right_edge) return 0;
+
+ // Special case for sub 8x8 chroma cases, to prevent referring to chroma
+ // pixels outside current tile.
+ if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
+ // Subsampling is taken from plane 1 only.
+ const struct macroblockd_plane *const pd = &xd->plane[1];
+ if (bw < 8 && pd->subsampling_x)
+ if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+ if (bh < 8 && pd->subsampling_y)
+ if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
+ }
+
+ // Is the bottom right within an already coded SB? Also consider additional
+ // constraints to facilitate HW decoder.
+ const int max_mib_size = 1 << mib_size_log2;
+ const int active_sb_row = mi_row >> mib_size_log2;
+ const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
+ const int sb_size = max_mib_size * MI_SIZE;
+ // (edge >> 3) converts back from dv units to pixels; "- 1" selects the last
+ // pixel covered by the referenced block.
+ const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
+ const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
+ // Tile width in 64-pixel columns, rounded up. The ">> 4" assumes 16 mi
+ // units per 64 pixels (MI_SIZE == 4) - TODO confirm against the MI_SIZE
+ // definition.
+ const int total_sb64_per_row =
+ ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
+ const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
+ const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
+ if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ // The permitted column bound grows by 'gradient' 64-pixel columns for each
+ // superblock row the reference lies above the current one.
+ const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
+ const int wf_offset = gradient * (active_sb_row - src_sb_row);
+ if (src_sb_row > active_sb_row ||
+ src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
+ return 0;
+
+ return 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
new file mode 100644
index 0000000000..b84034541e
--- /dev/null
+++ b/third_party/aom/av1/common/obmc.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
+
+// Callback type invoked by foreach_overlappable_nb_above() and
+// foreach_overlappable_nb_left() once per overlappable neighbor.
+// As passed by those two functions:
+//   rel_mi_row / rel_mi_col: neighbor position relative to the current
+//                            block, in mi units (one of them is always 0);
+//   op_mi_size:              min(current block width or height, step size);
+//   dir:                     0 for an above neighbor, 1 for a left neighbor;
+//   nb_mi:                   mode info of the neighbor;
+//   fun_ctxt:                caller-supplied pointer, forwarded unchanged;
+//   num_planes:              value of av1_num_planes(cm).
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi,
+ void *fun_ctxt, const int num_planes);
+
+// Walks the mode-info row directly above the current block, from column
+// xd->mi_col up to min(xd->mi_col + xd->width, cm->mi_params.mi_cols), and
+// calls 'fun' (with dir == 0) for every neighbor accepted by
+// is_neighbor_overlappable(). The walk stops once 'nb_max' neighbors have
+// been visited. Does nothing when xd->up_available is false.
+static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ if (!xd->up_available) return;
+
+ const int num_planes = av1_num_planes(cm);
+ int nb_count = 0;
+ const int mi_col = xd->mi_col;
+ // prev_row_mi points into the mi array, starting at the beginning of the
+ // previous row.
+ MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+ const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
+ // Assigned at the top of every iteration, before the loop increment uses it.
+ uint8_t mi_step;
+ for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
+ above_mi_col += mi_step) {
+ MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+ // Step by the neighbor's width, capped at the width of a 64x64 block.
+ mi_step =
+ AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
+ // If we're considering a block with width 4, it should be treated as
+ // half of a pair of blocks with chroma information in the second. Move
+ // above_mi_col back to the start of the pair if needed, set above_mbmi
+ // to point at the block with chroma information, and set mi_step to 2 to
+ // step over the entire pair at the end of the iteration.
+ if (mi_step == 1) {
+ above_mi_col &= ~1;
+ above_mi = prev_row_mi + above_mi_col + 1;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*above_mi)) {
+ ++nb_count;
+ fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
+ *above_mi, fun_ctxt, num_planes);
+ }
+ }
+}
+
+// Walks the mode-info column directly to the left of the current block, from
+// row xd->mi_row up to min(xd->mi_row + xd->height, cm->mi_params.mi_rows),
+// and calls 'fun' (with dir == 1) for every neighbor accepted by
+// is_neighbor_overlappable(). The walk stops once 'nb_max' neighbors have
+// been visited. Does nothing when xd->left_available is false.
+static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ if (!xd->left_available) return;
+
+ const int num_planes = av1_num_planes(cm);
+ int nb_count = 0;
+ // prev_col_mi points into the mi array, starting at the top of the
+ // previous column
+ const int mi_row = xd->mi_row;
+ MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+ const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
+ // Assigned at the top of every iteration, before the loop increment uses it.
+ uint8_t mi_step;
+ for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
+ left_mi_row += mi_step) {
+ MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+ // Step by the neighbor's height, capped at the height of a 64x64 block.
+ mi_step =
+ AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
+ // A neighbor one mi unit high is handled as half of a pair: align
+ // left_mi_row down to an even row, use the second entry of the pair, and
+ // step over both entries at the end of the iteration.
+ if (mi_step == 1) {
+ left_mi_row &= ~1;
+ left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*left_mi)) {
+ ++nb_count;
+ fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
+ fun_ctxt, num_planes);
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 0000000000..cfca03bb4d
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Decodes a size field from 'data' with aom_uleb_decode().
+// On success stores the decoded value in *obu_size, leaves the number of
+// bytes occupied by the field in *length_field_size (written by the decoder)
+// and returns AOM_CODEC_OK. Returns AOM_CODEC_CORRUPT_FRAME when decoding
+// fails or the decoded value exceeds UINT32_MAX.
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t decoded_size = 0;
+  const int decode_status =
+      aom_uleb_decode(data, bytes_available, &decoded_size, length_field_size);
+
+  if (decode_status != 0 || decoded_size > UINT32_MAX) {
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  *obu_size = (size_t)decoded_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses an OBU header from 'rb' and stores the values in 'header'.
+// Returns:
+//   AOM_CODEC_INVALID_PARAM   - rb or header is NULL;
+//   AOM_CODEC_CORRUPT_FRAME   - the buffer is empty, the forbidden bit is
+//                               set, or the extension byte is missing;
+//   AOM_CODEC_UNSUP_BITSTREAM - no size field outside of Annex B mode;
+//   AOM_CODEC_OK              - otherwise.
+// header->size is 1, or 2 when the extension header is present.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (rb == NULL || header == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bytes_in_buffer = rb->bit_buffer_end - rb->bit_buffer;
+  if (bytes_in_buffer < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size = 1;
+
+  // Forbidden bit. Must not be set.
+  const int forbidden_bit = aom_rb_read_bit(rb);
+  if (forbidden_bit) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  // section 5 obu streams must have obu_size field set.
+  if (!is_annexb && !header->has_size_field) return AOM_CODEC_UNSUP_BITSTREAM;
+
+  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
+  aom_rb_read_bit(rb);
+
+  if (!header->has_extension) {
+    header->temporal_layer_id = 0;
+    header->spatial_layer_id = 0;
+    return AOM_CODEC_OK;
+  }
+
+  // The extension header needs a second byte.
+  if (bytes_in_buffer == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size += 1;
+  header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+  header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+  // extension_header_reserved_3bits must be set to 0. The value is ignored by
+  // a decoder.
+  aom_rb_read_literal(rb, 3);
+
+  return AOM_CODEC_OK;
+}
+
+// Parses the OBU header at the start of 'buffer' into 'header'.
+// On success *consumed receives the header size in bytes (header->size) and
+// AOM_CODEC_OK is returned; on failure *consumed is left untouched and the
+// error from read_obu_header() is returned. Returns AOM_CODEC_INVALID_PARAM
+// when buffer_length is 0 or when consumed/header is NULL.
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length == 0 || consumed == NULL || header == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  uint8_t *const buffer_end = buffer + buffer_length;
+  struct aom_read_bit_buffer rb = { buffer, buffer_end, 0, NULL, NULL };
+
+  const aom_codec_err_t status = read_obu_header(&rb, is_annexb, header);
+  if (status != AOM_CODEC_OK) return status;
+
+  *consumed = header->size;
+  return AOM_CODEC_OK;
+}
+
+// Parses an OBU header together with its size information.
+//
+//  - Annex B (is_annexb != 0): a size field precedes the header and covers
+//    the header as well. If the header itself carries no size field, the
+//    payload size is derived as (obu_size - header size).
+//  - In both modes, when the header has has_size_field set, a size field
+//    following the header gives the payload size directly.
+//
+// On success *payload_size holds the payload size and *bytes_read the total
+// number of bytes consumed (leading size field + header + trailing size
+// field). Returns AOM_CODEC_INVALID_PARAM when payload_size or bytes_read is
+// NULL, otherwise AOM_CODEC_OK or the error reported by read_obu_size() /
+// read_obu_header().
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read) {
+  // Both outputs are written below; reject NULL up front, matching the
+  // parameter validation done by aom_read_obu_header().
+  if (!payload_size || !bytes_read) return AOM_CODEC_INVALID_PARAM;
+
+  size_t length_field_size_obu = 0;
+  size_t length_field_size_payload = 0;
+  size_t obu_size = 0;
+  aom_codec_err_t status;
+
+  if (is_annexb) {
+    // Size field comes before the OBU header, and includes the OBU header
+    status =
+        read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu);
+
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  struct aom_read_bit_buffer rb = { data + length_field_size_obu,
+                                    data + bytes_available, 0, NULL, NULL };
+
+  status = read_obu_header(&rb, is_annexb, obu_header);
+  if (status != AOM_CODEC_OK) return status;
+
+  if (!obu_header->has_size_field) {
+    // read_obu_header() rejects a missing size field outside of Annex B.
+    assert(is_annexb);
+    // Derive the payload size from the data we've already read
+    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+    *payload_size = obu_size - obu_header->size;
+  } else {
+    // Size field comes after the OBU header, and is just the payload size.
+    // read_obu_header() has verified that the header bytes are available, so
+    // the subtraction below cannot wrap.
+    status = read_obu_size(
+        data + length_field_size_obu + obu_header->size,
+        bytes_available - length_field_size_obu - obu_header->size,
+        payload_size, &length_field_size_payload);
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  *bytes_read =
+      length_field_size_obu + obu_header->size + length_field_size_payload;
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 0000000000..adf3568e15
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Parsed contents of an OBU header.
+typedef struct {
+ size_t size; // Size (1 or 2 bytes) of the OBU header (including the
+ // optional OBU extension header) in the bitstream.
+ OBU_TYPE type;  // Value of the 4-bit type field of the header.
+ int has_size_field;  // Whether a size field follows the OBU header.
+ int has_extension; // Whether the optional OBU extension header is present.
+ // The following fields come from the OBU extension header. They are set to 0
+ // if has_extension is false.
+ int temporal_layer_id;
+ int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 0000000000..6f88768f2f
--- /dev/null
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector signed char int8x16_t; // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
+typedef vector signed short int16x8_t; // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
+typedef vector signed int int32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
+
+// Loads one 16-byte vector of int16 samples at byte offset 'byte_off' from
+// 'src', subtracts 'avg' from every lane and stores the result at the same
+// byte offset from 'dst'.
+static INLINE void sub_avg_store_vsx(const int16_t *src, int16_t *dst,
+                                     int byte_off, int16x8_t avg) {
+  vec_vsx_st(vec_sub(vec_vsx_ld(byte_off, src), avg), byte_off, dst);
+}
+
+// Computes avg = (sum of the width x height samples read from src_ptr +
+// round_offset) >> num_pel_log2 and writes (sample - avg) for every sample
+// to 'dst'.
+//
+// Both buffers are walked with a stride of CFL_BUF_LINE int16 elements per
+// row. The byte offsets CFL_LINE_1..3 assume that one such row occupies
+// CFL_BUF_LINE_BYTES (64) bytes - TODO confirm against the CFL_BUF_LINE
+// definition. The summation handles two rows per iteration and the
+// subtraction four rows per iteration, 8 samples per vector.
+//
+// The samples to subtract from are loaded from the source buffer, i.e. the
+// same data the average was computed from. Loading them from 'dst' gives
+// the same result only when both pointers refer to the same buffer.
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
+                                        int num_pel_log2) {
+  const int16_t *src = (const int16_t *)src_ptr;
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = src;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  // Permute masks used to add up the four 32-bit lanes of the accumulator:
+  // mask_64 swaps the two 64-bit halves, mask_32 swaps the 32-bit lanes
+  // within each half.
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  // The rounding offset is seeded into one lane so that it is part of the
+  // total after the horizontal reduction.
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+  } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
+  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
+
+  // Horizontal reduction: afterwards every lane holds the full sum.
+  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
+  sum_32x4 = vec_add(sum_32x4, perm_64);
+  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
+  sum_32x4 = vec_add(sum_32x4, perm_32);
+  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
+  const int16x8_t vec_avg = vec_pack(avg, avg);
+
+  // Number of 16-byte vectors processed per row; mirrors the width checks
+  // used by the summation above (always 1, 2 for width >= 16, 4 for 32).
+  const int vecs_per_row = (width == 32) ? 4 : (width >= 16) ? 2 : 1;
+  do {
+    for (int line_off = 0; line_off <= CFL_LINE_3; line_off += CFL_LINE_1) {
+      for (int v = 0; v < vecs_per_row; ++v) {
+        sub_avg_store_vsx(src, dst, line_off + v * OFF_1, vec_avg);
+      }
+    }
+    src += CFL_BUF_LINE * 4;
+  } while ((dst += CFL_BUF_LINE * 4) < dst_end);
+}
+
+// Declare wrappers for VSX sizes
+// Arguments: (arch suffix, width, height, fourth, fifth). In every line
+// below the fourth argument equals width * height / 2 and the fifth equals
+// log2(width * height); presumably they are forwarded to
+// subtract_average_vsx() as round_offset and num_pel_log2 - confirm against
+// the CFL_SUB_AVG_X definition in av1/common/cfl.h.
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
+CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
+CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
+CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
+CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
+CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
+CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
+
+// Returns the subtract-average function for 'tx_size'.
+// Based on observation, VSX does not outperform C for small blocks (there
+// are no 64-bit load and store intrinsics), so the entries for blocks of
+// width 4 refer to the C implementations. Entries for transform sizes that
+// are not valid for CFL are NULL.
+cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn fn_by_tx_size[TX_SIZES_ALL] = {
+    cfl_subtract_average_4x4_c,     /* 4x4 */
+    cfl_subtract_average_8x8_vsx,   /* 8x8 */
+    cfl_subtract_average_16x16_vsx, /* 16x16 */
+    cfl_subtract_average_32x32_vsx, /* 32x32 */
+    NULL,                           /* 64x64 (invalid CFL size) */
+    cfl_subtract_average_4x8_c,     /* 4x8 */
+    cfl_subtract_average_8x4_vsx,   /* 8x4 */
+    cfl_subtract_average_8x16_vsx,  /* 8x16 */
+    cfl_subtract_average_16x8_vsx,  /* 16x8 */
+    cfl_subtract_average_16x32_vsx, /* 16x32 */
+    cfl_subtract_average_32x16_vsx, /* 32x16 */
+    NULL,                           /* 32x64 (invalid CFL size) */
+    NULL,                           /* 64x32 (invalid CFL size) */
+    cfl_subtract_average_4x16_c,    /* 4x16 */
+    cfl_subtract_average_16x4_vsx,  /* 16x4 */
+    cfl_subtract_average_8x32_vsx,  /* 8x32 */
+    cfl_subtract_average_32x8_vsx,  /* 32x8 */
+    NULL,                           /* 16x64 (invalid CFL size) */
+    NULL,                           /* 64x16 (invalid CFL size) */
+  };
+  // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+  // index the function pointer array out of bounds.
+  const int table_idx = tx_size % TX_SIZES_ALL;
+  return fn_by_tx_size[table_idx];
+}
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
new file mode 100644
index 0000000000..5952441d1f
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+// Returns the interpolation filter that neighbor ref_mbmi uses in direction
+// (dir & 1) when one of its two reference frames equals ref_frame; otherwise
+// returns SWITCHABLE_FILTERS. The xd argument is unused.
+static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi,
+                                        const MACROBLOCKD *xd, int dir,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  (void)xd;
+
+  const int uses_ref_frame = ref_mbmi->ref_frame[0] == ref_frame ||
+                             ref_mbmi->ref_frame[1] == ref_frame;
+  if (!uses_ref_frame) return SWITCHABLE_FILTERS;
+
+  return av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01);
+}
+
+// Returns the context for the switchable interpolation filter of the current
+// block in direction dir (0 or 1). The context combines a base offset (chosen
+// by compound prediction and by dir) with the filter types reported by the
+// left and above neighbors for the current block's first reference frame.
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+  assert(dir == 0 || dir == 1);
+  const MB_MODE_INFO *const cur = xd->mi[0];
+  const MV_REFERENCE_FRAME ref_frame = cur->ref_frame[0];
+
+  const int base =
+      (cur->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET +
+      (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
+
+  // SWITCHABLE_FILTERS stands for "this neighbor supplies no filter": it is
+  // either unavailable or does not reference ref_frame.
+  const int left_type =
+      xd->left_available
+          ? get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame)
+          : SWITCHABLE_FILTERS;
+  const int above_type =
+      xd->up_available
+          ? get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame)
+          : SWITCHABLE_FILTERS;
+
+  if (left_type == above_type) return base + left_type;
+
+  if (left_type == SWITCHABLE_FILTERS) {
+    assert(above_type != SWITCHABLE_FILTERS);
+    return base + above_type;
+  }
+
+  if (above_type == SWITCHABLE_FILTERS) {
+    assert(left_type != SWITCHABLE_FILTERS);
+    return base + left_type;
+  }
+
+  // Both neighbors supply a filter but the two disagree.
+  return base + SWITCHABLE_FILTERS;
+}
+
+// Appends val to cache and increments *n, unless val equals the entry that
+// was appended most recently (cache[*n - 1]).
+static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
+  const int count = *n;
+  const int repeats_last = count > 0 && cache[count - 1] == val;
+
+  if (!repeats_last) {
+    cache[count] = val;
+    *n = count + 1;
+  }
+}
+
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+ uint16_t *cache) {  // fills cache with the merged above/left colors; returns the count
+ const int row = -xd->mb_to_top_edge >> 3;  // presumably 1/8-pel units -> pixel row; TODO confirm
+ // Do not refer to above SB row when on SB boundary.
+ const MB_MODE_INFO *const above_mi =
+ (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ int above_n = 0, left_n = 0;
+ if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0];  // size slot 0 for plane 0, slot 1 otherwise
+ if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0];
+ if (above_n == 0 && left_n == 0) return 0;  // neither neighbor contributes colors
+ int above_idx = plane * PALETTE_MAX_SIZE;  // first color of this plane in palette_colors
+ int left_idx = plane * PALETTE_MAX_SIZE;
+ int n = 0;  // number of entries written to cache so far
+ const uint16_t *above_colors =
+ above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
+ const uint16_t *left_colors =
+ left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
+ // Merge the sorted lists of base colors from above and left into one
+ // combined sorted color cache, always taking the smaller head value first.
+ while (above_n > 0 && left_n > 0) {
+ uint16_t v_above = above_colors[above_idx];
+ uint16_t v_left = left_colors[left_idx];
+ if (v_left < v_above) {
+ palette_add_to_cache(cache, &n, v_left);
+ ++left_idx, --left_n;
+ } else {
+ palette_add_to_cache(cache, &n, v_above);
+ ++above_idx, --above_n;
+ if (v_left == v_above) ++left_idx, --left_n;  // equal heads: consume the value from both lists
+ }
+ }
+ while (above_n-- > 0) {  // drain what is left of the above list
+ uint16_t val = above_colors[above_idx++];
+ palette_add_to_cache(cache, &n, val);
+ }
+ while (left_n-- > 0) {  // drain what is left of the left list
+ uint16_t val = left_colors[left_idx++];
+ palette_add_to_cache(cache, &n, val);
+ }
+ assert(n <= 2 * PALETTE_MAX_SIZE);
+ return n;
+}
+
+// Returns the intra/inter context from the available above/left neighbors
+// ("--" marks an unavailable neighbor):
+//   0 - no intra neighbor: inter/inter, inter/--, --/inter, --/--
+//   1 - one intra and one inter neighbor
+//   2 - the only available neighbor is intra
+//   3 - both neighbors are intra
+int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  if (!has_above && !has_left) return 0;
+
+  if (has_above && has_left) {
+    const int intra_count =
+        !is_inter_block(xd->above_mbmi) + !is_inter_block(xd->left_mbmi);
+    return intra_count == 2 ? 3 : intra_count;
+  }
+
+  // Exactly one neighbor is available.
+  return 2 * !is_inter_block(has_above ? xd->above_mbmi : xd->left_mbmi);
+}
+
+#define CHECK_BACKWARD_REFS(ref_frame) \
+ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) /* 1 iff ref_frame is within [BWDREF_FRAME, ALTREF_FRAME] */
+#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) /* alias of CHECK_BACKWARD_REFS */
+
+int av1_get_reference_mode_context(const MACROBLOCKD *xd) {  // context that indexes comp_inter_cdf
+ int ctx;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+ // neither edge uses comp pred: 1 iff exactly one first ref is backward
+ ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
+ IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
+ else if (!has_second_ref(above_mbmi))
+ // only left uses comp pred: 3 if above is backward-ref or intra, else 2
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
+ !is_inter_block(above_mbmi));
+ else if (!has_second_ref(left_mbmi))
+ // only above uses comp pred: 3 if left is backward-ref or intra, else 2
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
+ !is_inter_block(left_mbmi));
+ else // both edges use comp pred (4)
+ ctx = 4;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!has_second_ref(edge_mbmi))
+ // edge does not use comp pred: 1 if its first ref is backward, else 0
+ ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
+ else
+ // edge uses comp pred (3)
+ ctx = 3;
+ } else { // no edges available (1)
+ ctx = 1;
+ }
+ assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+ return ctx;
+}
+
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {  // context that indexes comp_ref_type_cdf
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi;  // the one inter neighbor
+
+ if (!has_second_ref(inter_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi);  // 1 for bidir, 3 for unidir
+ } else { // inter/inter
+ const int a_sg = !has_second_ref(above_mbmi);  // above uses a single reference
+ const int l_sg = !has_second_ref(left_mbmi);  // left uses a single reference
+ const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0];  // first ref of above
+ const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];  // first ref of left
+
+ if (a_sg && l_sg) { // single/single
+ pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));  // 3 when both first refs lie on the same side, else 1
+ } else if (l_sg || a_sg) { // single/comp
+ const int uni_rfc =
+ a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);  // tested on the compound neighbor
+
+ if (!uni_rfc) // comp bidir
+ pred_context = 1;
+ else // comp unidir
+ pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));  // 4 when both first refs lie on the same side, else 3
+ } else { // comp/comp
+ const int a_uni_rfc = has_uni_comp_refs(above_mbmi);
+ const int l_uni_rfc = has_uni_comp_refs(left_mbmi);
+
+ if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir
+ pred_context = 0;
+ else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir
+ pred_context = 2;
+ else // unidir/unidir
+ pred_context =
+ 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME)));  // 4 when both or neither start at BWDREF
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) { // intra
+ pred_context = 2;
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 4 * has_uni_comp_refs(edge_mbmi);  // 0 for bidir, 4 for unidir
+ }
+ } else { // no edges available
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS);
+ return pred_context;
+}
+
+// Context for signaling whether a uni-directional compound reference pair is
+// (BWDREF, ALTREF) or one of (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN).
+//
+// The neighbors' reference counts vote: 0 when backward references (BWDREF,
+// ALTREF2, ALTREF) outnumber forward references (LAST, LAST2, LAST3, GOLDEN),
+// 1 on a tie, 2 when forward references outnumber backward ones.
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int forward = counts[LAST_FRAME] + counts[LAST2_FRAME] +
+                      counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  const int backward =
+      counts[BWDREF_FRAME] + counts[ALTREF2_FRAME] + counts[ALTREF_FRAME];
+  int ctx;
+
+  if (forward < backward)
+    ctx = 0;
+  else if (forward == backward)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < UNI_COMP_REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling whether a uni-directional compound reference pair is
+// (LAST, LAST2) or one of (LAST, LAST3) / (LAST, GOLDEN), given that it is
+// one of those three.
+//
+// The neighbors' reference counts vote: 0 when LAST3 + GOLDEN outnumber
+// LAST2, 1 on a tie, 2 when LAST2 outnumbers LAST3 + GOLDEN.
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_last2 = counts[LAST2_FRAME];
+  const int n_last3_gld = counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  int ctx;
+
+  if (n_last2 < n_last3_gld)
+    ctx = 0;
+  else if (n_last2 == n_last3_gld)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < UNI_COMP_REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling whether a uni-directional compound reference pair is
+// (LAST, LAST3) or (LAST, GOLDEN), given that it is one of those two.
+//
+// The neighbors' reference counts vote: 0 when GOLDEN outnumbers LAST3,
+// 1 on a tie, 2 when LAST3 outnumbers GOLDEN.
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_last3 = counts[LAST3_FRAME];
+  const int n_gld = counts[GOLDEN_FRAME];
+  int ctx;
+
+  if (n_last3 < n_gld)
+    ctx = 0;
+  else if (n_last3 == n_gld)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < UNI_COMP_REF_CONTEXTS);
+  return ctx;
+}
+
+// == Common context functions for both comp and single ref ==
+//
+// Context for signaling a reference frame as LAST/LAST2 versus LAST3/GOLDEN.
+// Neighbor counts vote: 0 when LAST3 + GOLDEN outnumber LAST + LAST2, 1 on a
+// tie, 2 when LAST + LAST2 outnumber LAST3 + GOLDEN.
+static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_last_last2 = counts[LAST_FRAME] + counts[LAST2_FRAME];
+  const int n_last3_gld = counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  int ctx;
+
+  if (n_last_last2 < n_last3_gld)
+    ctx = 0;
+  else if (n_last_last2 == n_last3_gld)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling a reference frame as LAST versus LAST2. Neighbor
+// counts vote: 0 when LAST2 outnumbers LAST, 1 on a tie, 2 when LAST
+// outnumbers LAST2.
+static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_last = counts[LAST_FRAME];
+  const int n_last2 = counts[LAST2_FRAME];
+  int ctx;
+
+  if (n_last < n_last2)
+    ctx = 0;
+  else if (n_last == n_last2)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling a reference frame as LAST3 versus GOLDEN. Neighbor
+// counts vote: 0 when GOLDEN outnumbers LAST3, 1 on a tie, 2 when LAST3
+// outnumbers GOLDEN.
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_last3 = counts[LAST3_FRAME];
+  const int n_gld = counts[GOLDEN_FRAME];
+  int ctx;
+
+  if (n_last3 < n_gld)
+    ctx = 0;
+  else if (n_last3 == n_gld)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling a reference frame as BWDREF/ALTREF2 versus ALTREF.
+// Neighbor counts vote: 0 when ALTREF outnumbers BWDREF + ALTREF2, 1 on a
+// tie, 2 when BWDREF + ALTREF2 outnumber ALTREF.
+static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_brf_arf2 = counts[BWDREF_FRAME] + counts[ALTREF2_FRAME];
+  const int n_arf = counts[ALTREF_FRAME];
+  int ctx;
+
+  if (n_brf_arf2 < n_arf)
+    ctx = 0;
+  else if (n_brf_arf2 == n_arf)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for signaling a reference frame as BWDREF versus ALTREF2. Neighbor
+// counts vote: 0 when ALTREF2 outnumbers BWDREF, 1 on a tie, 2 when BWDREF
+// outnumbers ALTREF2.
+static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int n_brf = counts[BWDREF_FRAME];
+  const int n_arf2 = counts[ALTREF2_FRAME];
+  int ctx;
+
+  if (n_brf < n_arf2)
+    ctx = 0;
+  else if (n_brf == n_arf2)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// == Context functions for comp ref ==
+//
+// Context for the compound-mode signal that tells whether the first reference
+// frame is GOLDEN/LAST3 or LAST/LAST2. Thin wrapper: the computation is the
+// shared neighbor-count vote in get_pred_context_ll2_or_l3gld().
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// Context for the compound-mode signal that tells whether the first reference
+// frame is LAST, given that it is known to be LAST or LAST2. Thin wrapper
+// around the shared neighbor-count vote in get_pred_context_last_or_last2().
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// Context for the compound-mode signal that tells whether the first reference
+// frame is GOLDEN, given that it is known to be GOLDEN or LAST3. Thin wrapper
+// around the shared neighbor-count vote in get_pred_context_last3_or_gld().
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// Context for signaling the 2nd reference frame of a compound mode as either
+// ALTREF or ALTREF2/BWDREF; wraps get_pred_context_brfarf2_or_arf().
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// Context for signaling the 2nd reference frame of a compound mode as either
+// ALTREF2 or BWDREF; wraps get_pred_context_brf_or_arf2().
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
+
+// == Context functions for single ref ==
+//
+// Context for the bit that signals whether the single reference is a forward
+// or a backward reference frame. Neighbor counts vote: 0 when backward
+// references (BWDREF, ALTREF2, ALTREF) outnumber forward ones (LAST, LAST2,
+// LAST3, GOLDEN), 1 on a tie, 2 when forward references are in the majority.
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+  const int forward = counts[LAST_FRAME] + counts[LAST2_FRAME] +
+                      counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  const int backward =
+      counts[BWDREF_FRAME] + counts[ALTREF2_FRAME] + counts[ALTREF_FRAME];
+  int ctx;
+
+  if (forward < backward)
+    ctx = 0;
+  else if (forward == backward)
+    ctx = 1;
+  else
+    ctx = 2;
+
+  assert(ctx >= 0 && ctx < REF_CONTEXTS);
+  return ctx;
+}
+
+// Context for the bit that signals whether the single reference is
+// ALTREF_FRAME or a non-ALTREF backward reference frame, given that it is one
+// of these two choices; wraps get_pred_context_brfarf2_or_arf().
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// Context for the bit that signals whether the single reference is LAST3/GOLDEN
+// or LAST2/LAST, given these two choices; wraps get_pred_context_ll2_or_l3gld().
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// Context for the bit that signals whether the single reference is LAST2_FRAME
+// or LAST_FRAME, given these two choices; wraps get_pred_context_last_or_last2().
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// Context for the bit that signals whether the single reference is GOLDEN_FRAME
+// or LAST3_FRAME, given these two choices; wraps get_pred_context_last3_or_gld().
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// Context for the bit that signals whether the single reference is ALTREF2_FRAME
+// or BWDREF_FRAME, given these two choices; wraps get_pred_context_brf_or_arf2().
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
new file mode 100644
index 0000000000..361a4078d4
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
+
+#include <stdint.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Returns the smallest value found in segment_ids over the bsize block whose
+// top-left unit is (mi_row, mi_col), with the block clipped to
+// mi_params->mi_rows x mi_params->mi_cols. segment_ids is addressed as a
+// row-major map with a row stride of mi_params->mi_cols.
+static INLINE uint8_t get_segment_id(
+    const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids,
+    BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int stride = mi_params->mi_cols;
+  const int first = mi_row * mi_params->mi_cols + mi_col;
+  // Clip the scanned area so that it does not run past the last row/column.
+  const int cols = AOMMIN(mi_params->mi_cols - mi_col, mi_size_wide[bsize]);
+  const int rows = AOMMIN(mi_params->mi_rows - mi_row, mi_size_high[bsize]);
+  uint8_t min_id = MAX_SEGMENTS;
+
+  for (int r = 0; r < rows; ++r) {
+    const int row_start = first + r * stride;
+    for (int c = 0; c < cols; ++c) {
+      const uint8_t id = segment_ids[row_start + c];
+      if (id < min_id) min_id = id;
+    }
+  }
+
+  assert(min_id < MAX_SEGMENTS);
+  return min_id;
+}
+
+static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ int *cdf_index,
+ int skip_over4x4) {  // returns the predicted segment id; writes the CDF choice to *cdf_index
+ const int step_size = skip_over4x4 ? 2 : 1;  // distance, in mi units, to the sampled neighbors
+ uint8_t prev_ul = UINT8_MAX; // top left segment_id
+ uint8_t prev_l = UINT8_MAX; // left segment_id
+ uint8_t prev_u = UINT8_MAX; // top segment_id
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const uint8_t *seg_map = cm->cur_frame->seg_map;
+ if ((xd->up_available) && (xd->left_available)) {
+ prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
+ mi_col - step_size);
+ }
+ if (xd->up_available) {
+ prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size,
+ mi_col - 0);
+ }
+ if (xd->left_available) {
+ prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0,
+ mi_col - step_size);
+ }
+ assert(IMPLIES(prev_ul != UINT8_MAX,
+ prev_u != UINT8_MAX && prev_l != UINT8_MAX));
+
+ // Pick CDF index: 2 if all three ids match, 1 if exactly two match, else 0.
+ if (prev_ul == UINT8_MAX) /* Edge cases */
+ *cdf_index = 0;
+ else if ((prev_ul == prev_u) && (prev_ul == prev_l))
+ *cdf_index = 2;
+ else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
+ *cdf_index = 1;
+ else
+ *cdf_index = 0;
+
+ // Predictor: prev_u when it matches prev_ul, otherwise prev_l (edge cases first).
+ if (prev_u == UINT8_MAX) // edge case
+ return prev_l == UINT8_MAX ? 0 : prev_l;
+ if (prev_l == UINT8_MAX) // edge case
+ return prev_u;
+ return (prev_ul == prev_u) ? prev_u : prev_l;
+}
+
+// Returns the sum of seg_id_predicted over the above and left neighbors that
+// exist; a missing neighbor contributes 0.
+static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  int ctx = 0;
+
+  if (xd->above_mbmi != NULL) ctx += xd->above_mbmi->seg_id_predicted;
+  if (xd->left_mbmi != NULL) ctx += xd->left_mbmi->seg_id_predicted;
+
+  return ctx;
+}
+
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {  // returns above_ctx + left_ctx + 3 * (equal reference distances)
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+ const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+ int bck_frame_index = 0, fwd_frame_index = 0;  // order hints; stay 0 when the buffer is missing
+ int cur_frame_index = cm->cur_frame->order_hint;
+
+ if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+ if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
+
+ int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info,
+ fwd_frame_index, cur_frame_index));  // |distance| between second reference and current frame
+ int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info,
+ cur_frame_index, bck_frame_index));  // |distance| between current frame and first reference
+
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+ int above_ctx = 0, left_ctx = 0;
+ const int offset = (fwd == bck);  // 1 when both references are equally distant
+
+ if (above_mi != NULL) {
+ if (has_second_ref(above_mi))
+ above_ctx = above_mi->compound_idx;  // compound neighbor: reuse its compound_idx
+ else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+ above_ctx = 1;  // single-reference neighbor that uses ALTREF
+ }
+
+ if (left_mi != NULL) {
+ if (has_second_ref(left_mi))
+ left_ctx = left_mi->compound_idx;
+ else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+ left_ctx = 1;
+ }
+
+ return above_ctx + left_ctx + 3 * offset;
+}
+
+// Returns the comp_group_idx context, capped at 5. Each existing neighbor
+// (above, left) contributes its comp_group_idx when it has a second
+// reference, 3 when it is single-reference with ALTREF_FRAME as its first
+// reference, and 0 otherwise.
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const neighbors[2] = { xd->above_mbmi, xd->left_mbmi };
+  int ctx_sum = 0;
+
+  for (int i = 0; i < 2; ++i) {
+    const MB_MODE_INFO *const nb = neighbors[i];
+    if (!nb) continue;
+
+    if (has_second_ref(nb))
+      ctx_sum += nb->comp_group_idx;
+    else if (nb->ref_frame[0] == ALTREF_FRAME)
+      ctx_sum += 3;
+  }
+
+  return AOMMIN(5, ctx_sum);
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
+ struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+ return segp->pred_cdf[av1_get_pred_context_seg_id(xd)];  // pred_cdf row picked by the seg-id prediction context
+}
+
+// Returns the sum of skip_mode over the above and left neighbors that exist;
+// a missing neighbor contributes 0.
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+  int ctx = 0;
+
+  if (xd->above_mbmi) ctx += xd->above_mbmi->skip_mode;
+  if (xd->left_mbmi) ctx += xd->left_mbmi->skip_mode;
+
+  return ctx;
+}
+
+// Returns the sum of skip_txfm over the above and left neighbors that exist;
+// a missing neighbor contributes 0.
+static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) {
+  int ctx = 0;
+
+  if (xd->above_mbmi) ctx += xd->above_mbmi->skip_txfm;
+  if (xd->left_mbmi) ctx += xd->left_mbmi->skip_txfm;
+
+  return ctx;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+
+// Get a list of palette base colors that are used in the above and left blocks,
+// referred to as "color cache". The return value is the number of colors in the
+// cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache"
+// in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+ uint16_t *cache);
+
+static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];  // log2 pel count relative to BLOCK_8X8 (0 for 8x8)
+}
+
+// Returns how many of the above/left neighbors exist and have a non-empty
+// palette in slot 0 of palette_size (0, 1 or 2).
+static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above = xd->above_mbmi;
+  const MB_MODE_INFO *const left = xd->left_mbmi;
+  const int above_has_palette =
+      above && above->palette_mode_info.palette_size[0] > 0;
+  const int left_has_palette =
+      left && left->palette_mode_info.palette_size[0] > 0;
+
+  return above_has_palette + left_has_palette;
+}
+
+int av1_get_intra_inter_context(const MACROBLOCKD *xd);
+
+int av1_get_reference_mode_context(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
+ return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];  // row chosen by the reference-mode context
+}
+
+static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) {
+ return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)];  // row chosen by the skip_txfm context
+}
+
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
+
+// == Uni-directional contexts ==
+
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_comp_reference_type_context(xd);
+ return xd->tile_ctx->comp_ref_type_cdf[pred_context];  // CDF row for the computed context
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0];  // second index 0 belongs to the "p" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1];  // second index 1 belongs to the "p1" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2];  // second index 2 belongs to the "p2" decision
+}
+
+// == Bi-directional contexts ==
+
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][0];  // second index 0 belongs to the "p" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p1(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][1];  // second index 1 belongs to the "p1" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p2(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][2];  // second index 2 belongs to the "p2" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_bwdref_p(xd);
+ return xd->tile_ctx->comp_bwdref_cdf[pred_context][0];  // second index 0 belongs to the "p" decision
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd);
+ return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];  // second index 1 belongs to the "p1" decision
+}
+
+// == Single contexts ==
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];  // decision p1 uses second index 0
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];  // decision p2 uses second index 1
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];  // decision p3 uses second index 2
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];  // decision p4 uses second index 3
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];  // decision p5 uses second index 4
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];  // decision p6 uses second index 5
+}
+
+// Returns the transform-size context (0, 1 or 2): the number of available
+// neighbors (above, left) that compare as at least as large as the current
+// block's maximum transform size. An inter neighbor is compared by its block
+// width (above) or height (left); any other available neighbor is compared
+// by the stored above/left txfm context entry.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[xd->mi[0]->bsize];
+  const int max_tx_wide = tx_size_wide[max_tx_size];
+  const int max_tx_high = tx_size_high[max_tx_size];
+  int ctx = 0;
+
+  if (xd->up_available) {
+    const MB_MODE_INFO *const above = xd->above_mbmi;
+    ctx += is_inter_block(above)
+               ? block_size_wide[above->bsize] >= max_tx_wide
+               : xd->above_txfm_context[0] >= max_tx_wide;
+  }
+
+  if (xd->left_available) {
+    const MB_MODE_INFO *const left = xd->left_mbmi;
+    ctx += is_inter_block(left)
+               ? block_size_high[left->bsize] >= max_tx_high
+               : xd->left_txfm_context[0] >= max_tx_high;
+  }
+
+  return ctx;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
new file mode 100644
index 0000000000..b0976287ef
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.c
@@ -0,0 +1,12876 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = {  // DC values returned by av1_dc_quant_QTX() for AOM_BITS_8; indexed by the clamped qindex.
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30,
+ 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42,
+ 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65,
+ 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88,
+ 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164,
+ 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202,
+ 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300,
+ 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364,
+ 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441,
+ 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736,
+ 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336,
+};
+
+static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = {  // DC values returned by av1_dc_quant_QTX() for AOM_BITS_10; indexed by the clamped qindex.
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37,
+ 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
+ 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132,
+ 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182,
+ 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230,
+ 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276,
+ 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321,
+ 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387,
+ 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466,
+ 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567,
+ 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687,
+ 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001,
+ 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
+ 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
+ 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
+ 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
+ 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
+ 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
+ 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+};
+
+static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = {  // DC values returned by av1_dc_quant_QTX() for AOM_BITS_12; indexed by the clamped qindex.
+ 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91,
+ 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237,
+ 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405,
+ 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580,
+ 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752,
+ 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919,
+ 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080,
+ 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419,
+ 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692,
+ 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957,
+ 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334,
+ 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746,
+ 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226,
+ 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788,
+ 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153,
+ 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984,
+ 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966,
+ 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214,
+ 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031,
+ 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
+ 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
+ 19718, 20521, 21387,
+};
+
+static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = {  // AC values returned by av1_ac_quant_QTX() for AOM_BITS_8; indexed by the clamped qindex.
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144,
+ 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179,
+ 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223,
+ 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353,
+ 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448,
+ 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571,
+ 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933,
+ 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
+ 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
+ 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = {  // AC values returned by av1_ac_quant_QTX() for AOM_BITS_10; indexed by the clamped qindex.
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40,
+ 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
+ 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149,
+ 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208,
+ 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267,
+ 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324,
+ 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379,
+ 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466,
+ 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571,
+ 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713,
+ 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889,
+ 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
+ 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
+ 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
+ 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
+ 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
+ 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
+ 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
+ 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+};
+
+static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = {  // AC values returned by av1_ac_quant_QTX() for AOM_BITS_12; indexed by the clamped qindex.
+ 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
+ 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
+ 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
+ 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
+ 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865,
+ 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067,
+ 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264,
+ 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693,
+ 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052,
+ 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411,
+ 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
+ 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555,
+ 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310,
+ 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256,
+ 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867,
+ 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660,
+ 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
+ 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
+ 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
+ 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
+ 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
+ 28143, 28687, 29247,
+};
+
+// Coefficient scaling and quantization with AV1 TX are tailored to
+// the AV1 TX transforms. Regardless of the bit-depth of the input,
+// the transform stages scale the coefficient values up by a factor of
+// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit
+// input, the coefficients have effectively 11 bits of scale depth
+// (8+3), 10-bit input pixels result in 13-bit coefficient depth
+// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth.
+// All quantizers are built using this invariant of x8, 3-bit scaling,
+// thus the Q3 suffix.
+
+// A partial exception to this rule is large transforms; to avoid
+// overflow, TX blocks with > 256 pels (>16x16) are scaled only
+// 4-times unity (2 bits) over the pixel depth, and TX blocks with
+// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit).
+// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16
+// and 32x32 transforms actually return Q2 coefficients, and 32x64,
+// 64x32 and 64x64 transforms return Q1 coefficients. However, the
+// quantizers are de-scaled down on-the-fly by the same amount
+// (av1_tx_get_scale()) during quantization, and as such the
+// dequantized/decoded coefficients, even for large TX blocks, are always
+// effectively Q3. Meanwhile, quantized/coded coefficients are Q0
+// because Qn quantizers are applied to Qn tx coefficients.
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input. In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+// Returns the DC quantizer value for the given bit depth.
+//
+// qindex + delta is clamped to [0, MAXQ] and used as the index into the DC
+// lookup table matching bit_depth (8, 10 or 12 bits). Any other bit depth
+// triggers an assertion and, when assertions are disabled, yields -1.
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int table_idx = clamp(qindex + delta, 0, MAXQ);
+  const int16_t *dc_table = NULL;
+
+  if (bit_depth == AOM_BITS_8) {
+    dc_table = dc_qlookup_QTX;
+  } else if (bit_depth == AOM_BITS_10) {
+    dc_table = dc_qlookup_10_QTX;
+  } else if (bit_depth == AOM_BITS_12) {
+    dc_table = dc_qlookup_12_QTX;
+  }
+
+  if (dc_table == NULL) {
+    assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+    return -1;
+  }
+  return dc_table[table_idx];
+}
+
+// Returns the AC quantizer value for the given bit depth.
+//
+// qindex + delta is clamped to [0, MAXQ] and used as the index into the AC
+// lookup table matching bit_depth (8, 10 or 12 bits). Any other bit depth
+// triggers an assertion and, when assertions are disabled, yields -1.
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int table_idx = clamp(qindex + delta, 0, MAXQ);
+  const int16_t *ac_table = NULL;
+
+  if (bit_depth == AOM_BITS_8) {
+    ac_table = ac_qlookup_QTX;
+  } else if (bit_depth == AOM_BITS_10) {
+    ac_table = ac_qlookup_10_QTX;
+  } else if (bit_depth == AOM_BITS_12) {
+    ac_table = ac_qlookup_12_QTX;
+  }
+
+  if (ac_table == NULL) {
+    assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+    return -1;
+  }
+  return ac_table[table_idx];
+}
+
+// Returns the qindex to use for a segment.
+//
+// When the SEG_LVL_ALT_Q feature is active for segment_id, the segment's data
+// value is added to base_qindex and the sum is clamped to [0, MAXQ].
+// Otherwise base_qindex is returned as-is (without clamping).
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  if (!segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    return base_qindex;
+  }
+  const int q_offset = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  return clamp(base_qindex + q_offset, 0, MAXQ);
+}
+
+// Reports whether quantization matrices apply to the given segment: true only
+// when quant_params->using_qmatrix is set and xd->lossless[segment_id] is
+// not.
+bool av1_use_qmatrix(const CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id) {
+  if (!quant_params->using_qmatrix) return false;
+  return !xd->lossless[segment_id];
+}
+
+// Returns the giqmatrix entry for (qmlevel, plane, tx_size). The entry is
+// asserted to be non-NULL unless qmlevel is the last level
+// (NUM_QM_LEVELS - 1).
+const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                             int plane, TX_SIZE tx_size) {
+  const qm_val_t *const iqm = quant_params->giqmatrix[qmlevel][plane][tx_size];
+  assert(iqm != NULL || qmlevel == NUM_QM_LEVELS - 1);
+  return iqm;
+}
+// Returns the gqmatrix entry for (qmlevel, plane, tx_size). The entry is
+// asserted to be non-NULL unless qmlevel is the last level
+// (NUM_QM_LEVELS - 1).
+const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                            int plane, TX_SIZE tx_size) {
+  const qm_val_t *const qm = quant_params->gqmatrix[qmlevel][plane][tx_size];
+  assert(qm != NULL || qmlevel == NUM_QM_LEVELS - 1);
+  return qm;
+}
+
+// Tells whether tx_type applies a non-identity transform both horizontally
+// and vertically. This is implemented as tx_type comparing below IDTX.
+static INLINE bool is_2d_transform(TX_TYPE tx_type) {
+  const bool below_idtx = tx_type < IDTX;
+  return below_idtx;
+}
+
+// Selects the inverse quantization matrix for a transform block.
+//
+// For 2D transform types the per-segment matrix of the plane is returned,
+// indexed by the current block's segment id and the adjusted transform size.
+// For all other transform types the entry of the last QM level
+// (NUM_QM_LEVELS - 1), plane 0, is returned, i.e. no weighting is applied.
+const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params,
+                                 const MACROBLOCKD *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const plane_data = &xd->plane[plane];
+  const MB_MODE_INFO *const cur_mbmi = xd->mi[0];
+  const int segment = cur_mbmi->segment_id;
+  const TX_SIZE adjusted_tx = av1_get_adjusted_tx_size(tx_size);
+
+  if (is_2d_transform(tx_type)) {
+    return plane_data->seg_iqmatrix[segment][adjusted_tx];
+  }
+  // 1D and identity transforms use the flat (unweighted) matrix.
+  return quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][adjusted_tx];
+}
+
+// Selects the forward quantization matrix for a transform block.
+//
+// For 2D transform types the per-segment matrix of the plane is returned,
+// indexed by the current block's segment id and the adjusted transform size.
+// For all other transform types the entry of the last QM level
+// (NUM_QM_LEVELS - 1), plane 0, is returned, i.e. no weighting is applied.
+const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params,
+                                const MACROBLOCKD *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const plane_data = &xd->plane[plane];
+  const MB_MODE_INFO *const cur_mbmi = xd->mi[0];
+  const int segment = cur_mbmi->segment_id;
+  const TX_SIZE adjusted_tx = av1_get_adjusted_tx_size(tx_size);
+
+  if (is_2d_transform(tx_type)) {
+    return plane_data->seg_qmatrix[segment][adjusted_tx];
+  }
+  // 1D and identity transforms use the flat (unweighted) matrix.
+  return quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][adjusted_tx];
+}
+
+#define QM_TOTAL_SIZE 3344  // Innermost dimension of the reference tables; av1_qm_init() asserts its running offset stays within it.
+// We only use wt_matrix_ref[q] and iwt_matrix_ref[q]
+// for q = 0, ..., NUM_QM_LEVELS - 2.
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];  // Forward declaration so av1_qm_init() can take addresses into it.
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE];  // Forward declaration; the initialized definition follows av1_qm_init().
+
+// Fills quant_params->gqmatrix and quant_params->giqmatrix for every QM
+// level, for planes 0..num_planes-1 and for every transform size.
+//
+//  - The last level (NUM_QM_LEVELS - 1) gets NULL for every entry.
+//  - A transform size whose adjusted size differs from itself shares the
+//    pointers already stored for the adjusted size.
+//  - Every other size points into wt_matrix_ref / iwt_matrix_ref at a
+//    running offset that advances by tx_size_2d[size] entries. Plane 0 uses
+//    reference row 0; all other planes use row 1.
+void av1_qm_init(CommonQuantParams *quant_params, int num_planes) {
+  const int flat_level = NUM_QM_LEVELS - 1;
+
+  for (int level = 0; level < NUM_QM_LEVELS; ++level) {
+    for (int plane = 0; plane < num_planes; ++plane) {
+      const int ref_row = (plane >= 1);
+      int offset = 0;  // Position within the reference row for this plane.
+
+      for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+        const int num_entries = tx_size_2d[tx];
+        const int base_tx = av1_get_adjusted_tx_size(tx);
+
+        if (level == flat_level) {
+          quant_params->gqmatrix[level][plane][tx] = NULL;
+          quant_params->giqmatrix[level][plane][tx] = NULL;
+          continue;
+        }
+
+        if (tx != base_tx) {
+          // Reuse the matrices of 'base_tx', which must have been set up in
+          // an earlier iteration.
+          assert(tx > base_tx);
+          quant_params->gqmatrix[level][plane][tx] =
+              quant_params->gqmatrix[level][plane][base_tx];
+          quant_params->giqmatrix[level][plane][tx] =
+              quant_params->giqmatrix[level][plane][base_tx];
+          continue;
+        }
+
+        assert(offset + num_entries <= QM_TOTAL_SIZE);
+        quant_params->gqmatrix[level][plane][tx] =
+            &wt_matrix_ref[level][ref_row][offset];
+        quant_params->giqmatrix[level][plane][tx] =
+            &iwt_matrix_ref[level][ref_row][offset];
+        offset += num_entries;
+      }
+    }
+  }
+}
+
+/* Provide 15 sets of quantization matrices for chroma and luma
+ and each TX size. Matrices for different TX sizes are in fact
+ sub-sampled from the 32x32 and 16x16 sizes, but explicitly
+ defined here for convenience. Intra and inter matrix sets are the
+ same but changing DEFAULT_QM_INTER_OFFSET from zero allows
+ for different matrices for inter and intra blocks in the same
+ frame.
+ Matrices for different QM levels have been rescaled in the
+ frequency domain according to different nominal viewing
+ distances. Matrices for QM level 15 are omitted because they are
+ not used.
+ */
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200,
+ /* Size 8x8 */
+ 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38,
+ 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63,
+ 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89,
+ 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220,
+ /* Size 16x16 */
+ 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31,
+ 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32,
+ 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35,
+ 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48,
+ 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63,
+ 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71,
+ 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92,
+ 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98,
+ 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110,
+ 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113,
+ 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107,
+ 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100,
+ 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96,
+ 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100,
+ 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119,
+ 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220,
+ 231,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71,
+ 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77,
+ 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32,
+ 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83,
+ 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35,
+ 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91,
+ 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42,
+ 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100,
+ 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49,
+ 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106,
+ 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63,
+ 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34,
+ 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77,
+ 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88,
+ 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47,
+ 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98,
+ 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58,
+ 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102,
+ 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67,
+ 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110,
+ 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71,
+ 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117,
+ 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120,
+ 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125,
+ 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89,
+ 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131,
+ 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133,
+ 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90,
+ 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137,
+ 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93,
+ 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142,
+ 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85,
+ 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148,
+ 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81,
+ 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156,
+ 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82,
+ 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154,
+ 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81,
+ 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154,
+ 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84,
+ 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146,
+ 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90,
+ 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143,
+ 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101,
+ 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130,
+ 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197,
+ 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122,
+ 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200,
+ 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110,
+ 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191,
+ 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102,
+ 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173,
+ 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104,
+ 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151,
+ 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119,
+ 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130,
+ 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220,
+ 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114,
+ 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191,
+ 204, 206, 222, 224, 230, 232, 242,
+ /* Size 4x8 */
+ 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75,
+ 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190,
+ /* Size 8x4 */
+ 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65,
+ 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190,
+ /* Size 8x16 */
+ 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32,
+ 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34,
+ 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50,
+ 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59,
+ 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77,
+ 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86,
+ 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99,
+ 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
+ /* Size 16x8 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34,
+ 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60,
+ 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102,
+ 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124,
+ 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121,
+ 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107,
+ 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101,
+ 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74,
+ 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34,
+ 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80,
+ 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41,
+ 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87,
+ 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53,
+ 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100,
+ 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67,
+ 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108,
+ 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117,
+ 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123,
+ 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136,
+ 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96,
+ 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144,
+ 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98,
+ 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152,
+ 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89,
+ 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160,
+ 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84,
+ 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162,
+ 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83,
+ 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152,
+ 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93,
+ 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142,
+ 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+ /* Size 32x16 */
+ 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32,
+ 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33,
+ 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41,
+ 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54,
+ 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69,
+ 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78,
+ 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90,
+ 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41,
+ 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49,
+ 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56,
+ 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60,
+ 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63,
+ 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66,
+ 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68,
+ 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73,
+ 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79,
+ 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72,
+ 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80,
+ 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91,
+ 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155,
+ 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166,
+ 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173,
+ 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173,
+ 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170,
+ 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155,
+ 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138,
+ 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117,
+ 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105,
+ 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111,
+ 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217,
+ /* Size 4x16 */
+ 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44,
+ 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72,
+ 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90,
+ 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
+ /* Size 16x4 */
+ 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54,
+ 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118,
+ 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100,
+ 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32,
+ 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71,
+ 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88,
+ 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57,
+ 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105,
+ 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65,
+ 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131,
+ 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82,
+ 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148,
+ 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80,
+ 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153,
+ 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+ /* Size 32x8 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33,
+ 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50,
+ 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79,
+ 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90,
+ 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44,
+ 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60,
+ 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66,
+ 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73,
+ 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80,
+ 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77,
+ 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94,
+ 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171,
+ 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174,
+ 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138,
+ 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105,
+ 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
+ /* Size 8x8 */
+ 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46,
+ 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72,
+ 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95,
+ 104, 107, 71, 67, 68, 75, 84, 95, 107, 113,
+ /* Size 16x16 */
+ 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32,
+ 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45,
+ 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49,
+ 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55,
+ 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67,
+ 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74,
+ 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78,
+ 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83,
+ 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58,
+ 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59,
+ 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69,
+ 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73,
+ 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78,
+ 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83,
+ 88, 93, 98, 104, 109, 112, 116,
+ /* Size 32x32 */
+ 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60,
+ 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33,
+ 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44,
+ 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66,
+ 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46,
+ 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50,
+ 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 36, 38,
+ 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55,
+ 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47,
+ 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59,
+ 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51,
+ 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63,
+ 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54,
+ 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66,
+ 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61,
+ 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45,
+ 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69,
+ 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51,
+ 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70,
+ 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60,
+ 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74,
+ 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69,
+ 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52,
+ 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78,
+ 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80,
+ 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60,
+ 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82,
+ 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72,
+ 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57,
+ 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92,
+ 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60,
+ 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94,
+ 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70,
+ 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95,
+ 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82,
+ 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65,
+ 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92,
+ 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63,
+ 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62,
+ 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100,
+ 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61,
+ 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102,
+ 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63,
+ 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104,
+ 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66,
+ 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106,
+ 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68,
+ 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72,
+ 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111,
+ 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74,
+ 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113,
+ 115, 115, 118,
+ /* Size 4x8 */
+ 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54,
+ 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105,
+ /* Size 8x4 */
+ 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65,
+ 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105,
+ /* Size 8x16 */
+ 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40,
+ 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50,
+ 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61,
+ 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73,
+ 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92,
+ 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98,
+ 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98,
+ 104, 106, 109,
+ /* Size 16x8 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43,
+ 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54,
+ 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73,
+ 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59,
+ 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75,
+ 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82,
+ 91, 101, 109,
+ /* Size 16x32 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34,
+ 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61,
+ 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47,
+ 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63,
+ 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49,
+ 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65,
+ 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56,
+ 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47,
+ 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68,
+ 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47,
+ 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72,
+ 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59,
+ 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76,
+ 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71,
+ 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59,
+ 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95,
+ 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61,
+ 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98,
+ 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68,
+ 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103,
+ 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73,
+ 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105,
+ 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75,
+ 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109,
+ 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77,
+ 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113,
+ /* Size 32x16 */
+ 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31,
+ 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42,
+ 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45,
+ 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49,
+ 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54,
+ 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58,
+ 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60,
+ 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45,
+ 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49,
+ 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59,
+ 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68,
+ 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78,
+ 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80,
+ 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80,
+ 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58,
+ 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58,
+ 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69,
+ 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78,
+ 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88,
+ 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97,
+ 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102,
+ 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101,
+ 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105,
+ 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106,
+ 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77,
+ 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113,
+ /* Size 4x16 */
+ 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45,
+ 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57,
+ 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66,
+ 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
+ /* Size 16x4 */
+ 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53,
+ 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80,
+ 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68,
+ 89, 103, 67, 70, 86, 105, 69, 72, 88, 107,
+ /* Size 8x32 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41,
+ 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58,
+ 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51,
+ 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67,
+ 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62,
+ 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74,
+ 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75,
+ 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63,
+ 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92,
+ 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60,
+ 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99,
+ 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61,
+ 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102,
+ 104, 106, 106, 109, 109, 108,
+ /* Size 32x8 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40,
+ 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47,
+ 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60,
+ 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62,
+ 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46,
+ 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66,
+ 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82,
+ 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56,
+ 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74,
+ 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96,
+ 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101,
+ 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106,
+ 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77,
+ 70, 67, 73, 81, 90, 99, 108 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184,
+ /* Size 8x8 */
+ 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39,
+ 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71,
+ 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93,
+ 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31,
+ 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33,
+ 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40,
+ 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56,
+ 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70,
+ 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89,
+ 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97,
+ 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106,
+ 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117,
+ 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125,
+ 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119,
+ 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110,
+ 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104,
+ 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102,
+ 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98,
+ 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65,
+ 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76,
+ 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82,
+ 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35,
+ 36, 38, 41, 44, 45, 49, 54, 56, 59, 65, 69, 72, 75, 78, 81, 84, 86, 89,
+ 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42,
+ 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97,
+ 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49,
+ 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32,
+ 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65,
+ 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35,
+ 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79,
+ 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48,
+ 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90,
+ 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58,
+ 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100,
+ 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65,
+ 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106,
+ 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79,
+ 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46,
+ 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98,
+ 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54,
+ 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110,
+ 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113,
+ 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118,
+ 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65,
+ 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122,
+ 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70,
+ 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122,
+ 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76,
+ 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125,
+ 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78,
+ 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121,
+ 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162,
+ 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119,
+ 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165,
+ 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114,
+ 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169,
+ 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110,
+ 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174,
+ 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106,
+ 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176,
+ 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102,
+ 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170,
+ 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95,
+ 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165,
+ 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90,
+ 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160,
+ 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96,
+ 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150,
+ 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105,
+ 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135,
+ 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211,
+ 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120,
+ 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203,
+ 204, 210, 211, 219,
+ /* Size 4x8 */
+ 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64,
+ 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177,
+ /* Size 8x4 */
+ 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79,
+ 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177,
+ /* Size 8x16 */
+ 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32,
+ 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36,
+ 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56,
+ 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73,
+ 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84,
+ 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91,
+ 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87,
+ 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
+ /* Size 16x8 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34,
+ 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56,
+ 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95,
+ 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113,
+ 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133,
+ 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121,
+ 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110,
+ 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32,
+ 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72,
+ 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35,
+ 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40,
+ 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85,
+ 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56,
+ 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98,
+ 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77,
+ 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118,
+ 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66,
+ 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123,
+ 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72,
+ 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125,
+ 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83,
+ 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126,
+ 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116,
+ 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169,
+ 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113,
+ 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177,
+ 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110,
+ 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181,
+ 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106,
+ 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185,
+ 186, 192, 193, 201,
+ /* Size 32x16 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32,
+ 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33,
+ 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41,
+ 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50,
+ 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64,
+ 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76,
+ 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87,
+ 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37,
+ 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45,
+ 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66,
+ 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71,
+ 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82,
+ 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87,
+ 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89,
+ 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97,
+ 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101,
+ 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104,
+ 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106,
+ 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100,
+ 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99,
+ 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95,
+ 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92,
+ 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91,
+ 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95,
+ 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107,
+ 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192,
+ 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188,
+ 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166,
+ 189, 190, 201,
+ /* Size 4x16 */
+ 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41,
+ 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65,
+ 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83,
+ 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
+ /* Size 16x4 */
+ 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54,
+ 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107,
+ 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132,
+ 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32,
+ 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70,
+ 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38,
+ 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87,
+ 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58,
+ 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108,
+ 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79,
+ 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124,
+ 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90,
+ 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141,
+ 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94,
+ 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163,
+ 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90,
+ 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161,
+ 171, 174, 179, 181, 188, 188, 190,
+ /* Size 32x8 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32,
+ 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45,
+ 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71,
+ 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87,
+ 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42,
+ 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57,
+ 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79,
+ 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92,
+ 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97,
+ 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99,
+ 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92,
+ 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91,
+ 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107,
+ 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188,
+ 114, 104, 100, 111, 127, 145, 166, 190 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
+ /* Size 8x8 */
+ 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47,
+ 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67,
+ 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92,
+ 101, 103, 69, 65, 66, 73, 82, 92, 103, 109,
+ /* Size 16x16 */
+ 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31,
+ 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44,
+ 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48,
+ 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54,
+ 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61,
+ 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72,
+ 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76,
+ 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 80, 80,
+ 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58,
+ 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58,
+ 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67,
+ 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71,
+ 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76,
+ 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80,
+ 85, 90, 95, 100, 105, 108, 111,
+ /* Size 32x32 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32,
+ 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61,
+ 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42,
+ 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64,
+ 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45,
+ 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66,
+ 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49,
+ 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53,
+ 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58,
+ 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50,
+ 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61,
+ 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54,
+ 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58,
+ 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45,
+ 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64,
+ 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49,
+ 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68,
+ 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72,
+ 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66,
+ 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50,
+ 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74,
+ 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48,
+ 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78,
+ 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58,
+ 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80,
+ 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67,
+ 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83,
+ 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79,
+ 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56,
+ 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88,
+ 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57,
+ 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91,
+ 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68,
+ 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92,
+ 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80,
+ 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64,
+ 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89,
+ 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59,
+ 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97,
+ 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62,
+ 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101,
+ 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65,
+ 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103,
+ 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104,
+ 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106,
+ 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108,
+ 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80,
+ 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75,
+ 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84,
+ 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113,
+ /* Size 4x8 */
+ 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52,
+ 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102,
+ /* Size 8x4 */
+ 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64,
+ 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102,
+ /* Size 8x16 */
+ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38,
+ 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48,
+ 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56,
+ 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71,
+ 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85,
+ 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95,
+ 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96,
+ 101, 103, 105,
+ /* Size 16x8 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41,
+ 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54,
+ 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70,
+ 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78,
+ 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58,
+ 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73,
+ 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79,
+ 89, 98, 105,
+ /* Size 16x32 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33,
+ 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46,
+ 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61,
+ 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63,
+ 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47,
+ 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71,
+ 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74,
+ 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68,
+ 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79,
+ 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57,
+ 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90,
+ 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60,
+ 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96,
+ 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67,
+ 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100,
+ 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73,
+ 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103,
+ 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78,
+ 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71,
+ 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80,
+ 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109,
+ /* Size 32x16 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31,
+ 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40,
+ 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45,
+ 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47,
+ 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52,
+ 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57,
+ 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59,
+ 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46,
+ 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47,
+ 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59,
+ 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64,
+ 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73,
+ 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79,
+ 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78,
+ 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56,
+ 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56,
+ 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68,
+ 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76,
+ 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86,
+ 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95,
+ 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99,
+ 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101,
+ 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73,
+ 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67,
+ 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68,
+ 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109,
+ /* Size 4x16 */
+ 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45,
+ 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54,
+ 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65,
+ 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
+ /* Size 16x4 */
+ 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53,
+ 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78,
+ 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66,
+ 87, 100, 65, 68, 83, 102, 67, 70, 86, 103,
+ /* Size 8x32 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56,
+ 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65,
+ 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72,
+ 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73,
+ 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60,
+ 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87,
+ 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59,
+ 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96,
+ 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61,
+ 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100,
+ 101, 103, 103, 105, 105, 105,
+ /* Size 32x8 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38,
+ 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46,
+ 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55,
+ 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61,
+ 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45,
+ 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61,
+ 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78,
+ 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55,
+ 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72,
+ 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94,
+ 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98,
+ 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103,
+ 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75,
+ 68, 65, 71, 78, 87, 96, 105 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
+ /* Size 8x8 */
+ 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37,
+ 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64,
+ 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87,
+ 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32,
+ 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34,
+ 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40,
+ 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53,
+ 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73,
+ 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91,
+ 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103,
+ 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114,
+ 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125,
+ 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133,
+ 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143,
+ 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150,
+ 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156,
+ 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148,
+ 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141,
+ 152, 163, 177, 184, 191,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59,
+ 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68,
+ 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81,
+ 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37,
+ 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89,
+ 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46,
+ 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32,
+ 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58,
+ 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34,
+ 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71,
+ 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41,
+ 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82,
+ 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50,
+ 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92,
+ 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63,
+ 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38,
+ 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78,
+ 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41,
+ 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91,
+ 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54,
+ 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99,
+ 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66,
+ 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105,
+ 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71,
+ 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112,
+ 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77,
+ 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113,
+ 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81,
+ 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121,
+ 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122,
+ 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87,
+ 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130,
+ 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90,
+ 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134,
+ 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136,
+ 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96,
+ 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 142, 138, 143, 140,
+ 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148,
+ 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92,
+ 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153,
+ 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91,
+ 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151,
+ 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87,
+ 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85,
+ 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153,
+ 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83,
+ 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148,
+ 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86,
+ 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144,
+ 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91,
+ 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139,
+ 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94,
+ 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135,
+ 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101,
+ 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129,
+ 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+ /* Size 4x8 */
+ 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58,
+ 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165,
+ /* Size 8x4 */
+ 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69,
+ 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165,
+ /* Size 8x16 */
+ 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32,
+ 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38,
+ 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58,
+ 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74,
+ 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90,
+ 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97,
+ 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97,
+ 105, 113, 122, 131, 141, 151, 163, 169, 175,
+ /* Size 16x8 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33,
+ 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50,
+ 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90,
+ 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111,
+ 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135,
+ 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148,
+ 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141,
+ 160, 169, 103, 94, 92, 103, 119, 137, 158, 175,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65,
+ 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77,
+ 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40,
+ 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85,
+ 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57,
+ 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73,
+ 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41,
+ 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90,
+ 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105,
+ 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52,
+ 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111,
+ 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63,
+ 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121,
+ 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73,
+ 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136,
+ 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75,
+ 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139,
+ 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79,
+ 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145,
+ 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78,
+ 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136,
+ 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84,
+ 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131,
+ 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87,
+ 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128,
+ 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+ /* Size 32x16 */
+ 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32,
+ 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32,
+ 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37,
+ 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50,
+ 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58,
+ 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70,
+ 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84,
+ 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37,
+ 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41,
+ 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58,
+ 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75,
+ 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81,
+ 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92,
+ 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98,
+ 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105,
+ 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109,
+ 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111,
+ 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118,
+ 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120,
+ 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125,
+ 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129,
+ 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117,
+ 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105,
+ 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103,
+ 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98,
+ 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92,
+ 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93,
+ 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187,
+ /* Size 4x16 */
+ 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38,
+ 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58,
+ 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78,
+ 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
+ /* Size 16x4 */
+ 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47,
+ 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97,
+ 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125,
+ 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64,
+ 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38,
+ 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85,
+ 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58,
+ 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103,
+ 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74,
+ 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120,
+ 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90,
+ 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137,
+ 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97,
+ 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153,
+ 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92,
+ 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163,
+ 168, 169, 175, 175, 176,
+ /* Size 32x8 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32,
+ 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42,
+ 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69,
+ 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84,
+ 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40,
+ 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66,
+ 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86,
+ 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91,
+ 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100,
+ 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109,
+ 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111,
+ 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113,
+ 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103,
+ 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90,
+ 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97,
+ 93, 104, 118, 135, 155, 176 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
+ /* Size 8x8 */
+ 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47, 47,
+ 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65,
+ 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89,
+ 97, 99, 67, 63, 64, 71, 79, 89, 99, 104,
+ /* Size 16x16 */
+ 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31,
+ 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43,
+ 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46,
+ 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51,
+ 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60,
+ 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68,
+ 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74,
+ 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77,
+ 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55,
+ 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55,
+ 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65,
+ 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74,
+ 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77,
+ 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82,
+ 87, 92, 96, 101, 104, 106,
+ /* Size 32x32 */
+ 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55,
+ 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31,
+ 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60,
+ 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42,
+ 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63,
+ 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45,
+ 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64,
+ 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47,
+ 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36,
+ 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45,
+ 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55,
+ 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50,
+ 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59,
+ 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51,
+ 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55,
+ 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45,
+ 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63,
+ 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49,
+ 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67,
+ 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70,
+ 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63,
+ 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48,
+ 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69,
+ 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47,
+ 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75,
+ 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55,
+ 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78,
+ 77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65,
+ 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73,
+ 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53,
+ 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84,
+ 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55,
+ 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88,
+ 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65,
+ 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89,
+ 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75,
+ 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61,
+ 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85,
+ 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58,
+ 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93,
+ 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63,
+ 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98,
+ 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71,
+ 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101,
+ 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78,
+ 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67,
+ 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85,
+ 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64,
+ 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89,
+ 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64,
+ 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93,
+ 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62,
+ 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99,
+ 99, 104, 104, 106, 106, 108,
+ /* Size 4x8 */
+ 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50,
+ 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99,
+ /* Size 8x4 */
+ 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59,
+ 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99,
+ /* Size 8x16 */
+ 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36,
+ 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47,
+ 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56,
+ 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65,
+ 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82,
+ 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93,
+ 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98,
+ 100, 102,
+ /* Size 16x8 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40,
+ 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51,
+ 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67,
+ 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76,
+ 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57,
+ 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71,
+ 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86,
+ 95, 102,
+ /* Size 16x32 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32,
+ 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57,
+ 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45,
+ 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60,
+ 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46,
+ 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61,
+ 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54,
+ 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47,
+ 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61,
+ 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45,
+ 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68,
+ 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54,
+ 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72,
+ 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65,
+ 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74,
+ 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57,
+ 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89,
+ 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58,
+ 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93,
+ 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66,
+ 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97,
+ 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75,
+ 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69,
+ 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81,
+ 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63,
+ 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86,
+ 87, 91, 91, 95, 96, 101, 101, 103, 103, 105,
+ /* Size 32x16 */
+ 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31,
+ 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39,
+ 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45,
+ 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47,
+ 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50,
+ 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55,
+ 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58,
+ 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46,
+ 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46,
+ 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56,
+ 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63,
+ 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68,
+ 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75,
+ 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77,
+ 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54,
+ 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52,
+ 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64,
+ 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75,
+ 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83,
+ 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93,
+ 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97,
+ 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70,
+ 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65,
+ 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65,
+ 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63,
+ 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105,
+ /* Size 4x16 */
+ 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46,
+ 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50,
+ 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64,
+ 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
+ /* Size 16x4 */
+ 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51,
+ 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77,
+ 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64,
+ 84, 97, 64, 66, 81, 99, 65, 68, 83, 100,
+ /* Size 8x32 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36,
+ 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56,
+ 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50,
+ 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63,
+ 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56,
+ 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70,
+ 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68,
+ 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60,
+ 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84,
+ 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58,
+ 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94,
+ 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62,
+ 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100,
+ 100, 102, 102, 101,
+ /* Size 32x8 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36,
+ 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46,
+ 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54,
+ 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61,
+ 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46,
+ 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59,
+ 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75,
+ 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53,
+ 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68,
+ 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91,
+ 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98,
+ 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71,
+ 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66,
+ 63, 69, 76, 84, 93, 101 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
+ /* Size 8x8 */
+ 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36,
+ 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51, 60, 73,
+ 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92,
+ 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32,
+ 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33,
+ 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38,
+ 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50,
+ 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64,
+ 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83,
+ 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100,
+ 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110,
+ 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117,
+ 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125,
+ 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135,
+ 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144,
+ 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150,
+ 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155,
+ 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162,
+ 168, 174,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56,
+ 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68,
+ 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77,
+ 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37,
+ 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86,
+ 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45,
+ 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32,
+ 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58,
+ 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34,
+ 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69,
+ 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39,
+ 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78,
+ 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47,
+ 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60,
+ 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73,
+ 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42,
+ 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86,
+ 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55,
+ 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97,
+ 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67,
+ 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47,
+ 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79,
+ 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44,
+ 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90,
+ 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50,
+ 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100,
+ 104, 106, 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51,
+ 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106,
+ 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55,
+ 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112,
+ 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62,
+ 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118,
+ 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63,
+ 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120,
+ 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68,
+ 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127,
+ 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70,
+ 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131,
+ 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74,
+ 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136,
+ 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76,
+ 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+ 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76,
+ 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80,
+ 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142,
+ 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78,
+ 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141,
+ 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80,
+ 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137,
+ 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83,
+ 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138,
+ 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86,
+ 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133,
+ 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90,
+ 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129,
+ 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+ /* Size 4x8 */
+ 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57,
+ 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154,
+ /* Size 8x4 */
+ 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61,
+ 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32,
+ 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37,
+ 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54,
+ 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75,
+ 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92,
+ 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106,
+ 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108,
+ 116, 124, 134, 142, 153, 157, 163,
+ /* Size 16x8 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33,
+ 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48,
+ 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76,
+ 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106,
+ 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120,
+ 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139,
+ 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150,
+ 157, 97, 88, 86, 97, 111, 128, 147, 163,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65,
+ 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74,
+ 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40,
+ 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82,
+ 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54,
+ 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67,
+ 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90,
+ 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99,
+ 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67,
+ 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114,
+ 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78,
+ 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116,
+ 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79,
+ 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132,
+ 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90,
+ 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135,
+ 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92,
+ 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150,
+ 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97,
+ 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153,
+ 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93,
+ 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153,
+ 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91,
+ 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150,
+ 161, 162, 166, 167, 173,
+ /* Size 32x16 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32,
+ 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32,
+ 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34,
+ 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44,
+ 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57,
+ 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69,
+ 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80,
+ 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34,
+ 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40,
+ 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51,
+ 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66,
+ 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82,
+ 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106,
+ 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108,
+ 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112,
+ 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118,
+ 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120,
+ 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127,
+ 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131,
+ 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136,
+ 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139,
+ 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139,
+ 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144,
+ 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145,
+ 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150,
+ 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151,
+ 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147,
+ 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144,
+ 163, 163, 173,
+ /* Size 4x16 */
+ 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35,
+ 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56,
+ 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76,
+ 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
+ /* Size 16x4 */
+ 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42,
+ 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107,
+ 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64,
+ 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79,
+ 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97,
+ 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76,
+ 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111,
+ 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92,
+ 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106,
+ 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147,
+ 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101,
+ 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163,
+ 163, 163,
+ /* Size 32x8 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32,
+ 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42,
+ 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58,
+ 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81,
+ 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39,
+ 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63,
+ 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108,
+ 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118,
+ 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127,
+ 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136,
+ 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139,
+ 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145,
+ 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151,
+ 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144,
+ 163 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
+ /* Size 8x8 */
+ 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47,
+ 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62,
+ 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86,
+ 91, 95, 65, 61, 62, 68, 76, 86, 95, 100,
+ /* Size 16x16 */
+ 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31,
+ 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42,
+ 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48,
+ 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50,
+ 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55,
+ 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64,
+ 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72,
+ 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75,
+ 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53,
+ 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53,
+ 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61,
+ 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72,
+ 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79,
+ 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88,
+ 93, 97, 100, 102,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31,
+ 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57,
+ 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40,
+ 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60,
+ 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45,
+ 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62,
+ 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46,
+ 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34,
+ 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51,
+ 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43,
+ 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54,
+ 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48,
+ 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58,
+ 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49,
+ 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54,
+ 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46,
+ 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58,
+ 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48,
+ 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64,
+ 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65, 66, 66, 67, 67, 67, 68,
+ 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60,
+ 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48,
+ 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66,
+ 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46,
+ 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71,
+ 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53,
+ 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75,
+ 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62,
+ 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78,
+ 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69,
+ 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51,
+ 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78,
+ 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51,
+ 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83,
+ 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61,
+ 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86,
+ 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69,
+ 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60,
+ 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81,
+ 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90,
+ 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61,
+ 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94,
+ 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69,
+ 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95,
+ 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78,
+ 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62,
+ 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87,
+ 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60,
+ 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92,
+ 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63,
+ 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98,
+ 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67,
+ 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101,
+ 101, 104,
+ /* Size 4x8 */
+ 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50,
+ 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95,
+ /* Size 8x4 */
+ 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55,
+ 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95,
+ /* Size 8x16 */
+ 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34,
+ 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47,
+ 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53,
+ 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63,
+ 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73,
+ 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87,
+ 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95,
+ 97, 98,
+ /* Size 16x8 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37,
+ 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49,
+ 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61,
+ 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74,
+ 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55,
+ 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69,
+ 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83,
+ 92, 98,
+ /* Size 16x32 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32,
+ 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54,
+ 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47,
+ 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59,
+ 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52,
+ 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47,
+ 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56,
+ 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67,
+ 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70,
+ 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71,
+ 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52,
+ 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79,
+ 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55,
+ 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90,
+ 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64,
+ 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94,
+ 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73,
+ 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64,
+ 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80,
+ 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60,
+ 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89,
+ 89, 93, 93, 97, 98, 99, 99, 102,
+ /* Size 32x16 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31,
+ 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39,
+ 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46,
+ 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46,
+ 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50,
+ 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54,
+ 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57,
+ 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59,
+ 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46,
+ 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47,
+ 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53,
+ 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59,
+ 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65,
+ 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71,
+ 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75,
+ 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76,
+ 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50,
+ 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58,
+ 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69,
+ 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80,
+ 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90,
+ 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94,
+ 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93,
+ 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62,
+ 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59,
+ 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66,
+ 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74,
+ 74, 82, 82, 90, 90, 98, 98, 102,
+ /* Size 4x16 */
+ 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46,
+ 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49,
+ 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63,
+ 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
+ /* Size 16x4 */
+ 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50,
+ 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75,
+ 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63,
+ 82, 94, 62, 64, 79, 96, 63, 66, 81, 97,
+ /* Size 8x32 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60,
+ 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68,
+ 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64,
+ 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54,
+ 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76,
+ 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90,
+ 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61,
+ 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97,
+ 97, 99, 98, 98,
+ /* Size 32x8 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34,
+ 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45,
+ 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50,
+ 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60,
+ 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46,
+ 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58,
+ 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67,
+ 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51,
+ 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66,
+ 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83,
+ 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95,
+ 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62,
+ 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67,
+ 74, 82, 90, 98 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
+ /* Size 8x8 */
+ 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35,
+ 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65,
+ 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86,
+ 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32,
+ 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33,
+ 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37,
+ 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46,
+ 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60,
+ 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74,
+ 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91,
+ 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101,
+ 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112,
+ 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66,
+ 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70,
+ 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77,
+ 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81,
+ 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86,
+ 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52,
+ 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62,
+ 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75,
+ 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80,
+ 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43,
+ 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53,
+ 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33,
+ 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63,
+ 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37,
+ 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72,
+ 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45,
+ 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81,
+ 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53,
+ 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34,
+ 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68,
+ 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38,
+ 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79,
+ 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51,
+ 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91,
+ 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 61,
+ 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42,
+ 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75,
+ 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87,
+ 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50,
+ 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98,
+ 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59,
+ 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106,
+ 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67,
+ 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112,
+ 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75,
+ 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115,
+ 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80,
+ 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65,
+ 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92,
+ 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63,
+ 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98,
+ 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67,
+ 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103,
+ 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70,
+ 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105,
+ 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75,
+ 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110,
+ 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77,
+ 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111,
+ 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78,
+ 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112,
+ 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81,
+ 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115,
+ 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84,
+ 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114,
+ 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86,
+ 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115,
+ 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88,
+ 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118,
+ 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+ /* Size 4x8 */
+ 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50,
+ 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144,
+ /* Size 8x4 */
+ 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59,
+ 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32,
+ 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36,
+ 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51,
+ 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69,
+ 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85,
+ 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96,
+ 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103,
+ 111, 118, 126, 134, 143, 147, 151,
+ /* Size 16x8 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32,
+ 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44,
+ 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73,
+ 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103,
+ 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66,
+ 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91,
+ 82, 80, 90, 103, 119, 137, 151,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60,
+ 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72,
+ 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38,
+ 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77,
+ 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48,
+ 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35,
+ 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63,
+ 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39,
+ 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79,
+ 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48,
+ 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92,
+ 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63,
+ 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105,
+ 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71,
+ 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114,
+ 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85,
+ 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91,
+ 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103,
+ 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100,
+ 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155,
+ 155, 160,
+ /* Size 32x16 */
+ 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32,
+ 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32,
+ 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34,
+ 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41,
+ 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50,
+ 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59,
+ 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78,
+ 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34,
+ 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38,
+ 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50,
+ 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59,
+ 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71,
+ 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98,
+ 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105,
+ 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107,
+ 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58,
+ 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60,
+ 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61,
+ 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65,
+ 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68,
+ 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72,
+ 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74,
+ 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78,
+ 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81,
+ 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83,
+ 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85,
+ 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35,
+ 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49,
+ 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74,
+ 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
+ /* Size 16x4 */
+ 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41,
+ 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98,
+ 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74,
+ 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59,
+ 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36,
+ 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73,
+ 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51,
+ 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90,
+ 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72,
+ 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90,
+ 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152,
+ /* Size 32x8 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32,
+ 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38,
+ 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58,
+ 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78,
+ 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35,
+ 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56,
+ 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101,
+ 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113,
+ 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66,
+ 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74,
+ 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81,
+ 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86,
+ 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91,
+ 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45,
+ 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58,
+ 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82,
+ 89, 92, 64, 59, 60, 66, 74, 83, 92, 96,
+ /* Size 16x16 */
+ 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31,
+ 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39,
+ 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47,
+ 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48,
+ 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54,
+ 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60,
+ 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68,
+ 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72,
+ 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51,
+ 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51,
+ 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59,
+ 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68,
+ 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76,
+ 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85,
+ 89, 94, 96, 98,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31,
+ 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54,
+ 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39,
+ 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46,
+ 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61,
+ 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46,
+ 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34,
+ 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49,
+ 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41,
+ 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52,
+ 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47,
+ 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55,
+ 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49,
+ 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59,
+ 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50,
+ 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46,
+ 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47,
+ 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61,
+ 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66,
+ 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57,
+ 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47,
+ 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62,
+ 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67,
+ 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50,
+ 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71,
+ 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75,
+ 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49,
+ 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73,
+ 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48,
+ 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78,
+ 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57,
+ 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83,
+ 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65,
+ 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57,
+ 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75,
+ 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83,
+ 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57,
+ 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90,
+ 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65,
+ 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91,
+ 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74,
+ 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60,
+ 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84,
+ 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57,
+ 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92,
+ 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64,
+ 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96,
+ 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71,
+ 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99,
+ /* Size 4x8 */
+ 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47,
+ 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93,
+ /* Size 8x4 */
+ 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54,
+ 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93,
+ /* Size 8x16 */
+ 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33,
+ 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46,
+ 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53,
+ 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61,
+ 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71,
+ 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82,
+ 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92,
+ 94, 95,
+ /* Size 16x8 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35,
+ 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49,
+ 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60,
+ 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52,
+ 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65,
+ 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81,
+ 89, 95,
+ /* Size 16x32 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32,
+ 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52,
+ 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41,
+ 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57,
+ 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48,
+ 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58,
+ 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49,
+ 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55,
+ 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62,
+ 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49,
+ 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68,
+ 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52,
+ 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76,
+ 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50,
+ 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82,
+ 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61,
+ 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91,
+ 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69,
+ 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63,
+ 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78,
+ 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60,
+ 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86,
+ 86, 90, 90, 95, 95, 96, 96, 98,
+ /* Size 32x16 */
+ 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38,
+ 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46,
+ 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45,
+ 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47,
+ 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51,
+ 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55,
+ 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58,
+ 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46,
+ 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48,
+ 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53,
+ 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56,
+ 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61,
+ 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66,
+ 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71,
+ 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74,
+ 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49,
+ 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56,
+ 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64,
+ 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72,
+ 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81,
+ 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90,
+ 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90,
+ 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60,
+ 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58,
+ 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64,
+ 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71,
+ 71, 79, 79, 87, 87, 95, 95, 98,
+ /* Size 4x16 */
+ 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47,
+ 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47,
+ 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61,
+ 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
+ /* Size 16x4 */
+ 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49,
+ 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71,
+ 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61,
+ 75, 90, 60, 62, 76, 92, 62, 64, 78, 94,
+ /* Size 8x32 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33,
+ 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47,
+ 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57,
+ 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66,
+ 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54,
+ 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73,
+ 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56,
+ 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85,
+ 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60,
+ 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94,
+ 94, 96, 95, 95,
+ /* Size 32x8 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33,
+ 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46,
+ 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50,
+ 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59,
+ 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47,
+ 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55,
+ 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65,
+ 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50,
+ 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61,
+ 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80,
+ 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92,
+ 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60,
+ 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65,
+ 71, 79, 87, 95 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
+ /* Size 8x8 */
+ 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35,
+ 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61,
+ 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90,
+ 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32,
+ 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32,
+ 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35,
+ 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40,
+ 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51,
+ 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64,
+ 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78,
+ 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92,
+ 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51,
+ 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54,
+ 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59,
+ 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63,
+ 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74,
+ 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76,
+ 81, 86, 92, 99, 106, 113, 121, 128, 137, 140,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56,
+ 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68,
+ 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75,
+ 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41,
+ 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50,
+ 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59,
+ 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37,
+ 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69,
+ 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51,
+ 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58,
+ 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38,
+ 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73,
+ 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58,
+ 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38,
+ 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65,
+ 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79,
+ 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47,
+ 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61,
+ 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
+ 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49,
+ 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87,
+ 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50,
+ 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92,
+ 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58,
+ 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103,
+ 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64,
+ 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110,
+ 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73,
+ 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121,
+ 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73,
+ 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121,
+ 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84,
+ 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90,
+ 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80,
+ 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96,
+ 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76,
+ 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104,
+ 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83,
+ 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109,
+ 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ /* Size 4x8 */
+ 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50,
+ 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136,
+ /* Size 8x4 */
+ 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56,
+ 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32,
+ 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34,
+ 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42,
+ 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58,
+ 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76,
+ 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92,
+ 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103,
+ 110, 118, 125, 133, 136,
+ /* Size 16x8 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38,
+ 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60,
+ 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54,
+ 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63,
+ 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81,
+ 92, 106, 121, 136,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65,
+ 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38,
+ 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72,
+ 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
+ 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35,
+ 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60,
+ 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68,
+ 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48,
+ 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
+ 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51,
+ 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82,
+ 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59,
+ 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105,
+ 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105,
+ 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75,
+ 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124,
+ 132, 132, 141, 141, 144, 144, 149,
+ /* Size 32x16 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32,
+ 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32,
+ 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50,
+ 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59,
+ 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75,
+ 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34,
+ 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58,
+ 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65,
+ 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90,
+ 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
+ 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49,
+ 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54,
+ 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54,
+ 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68,
+ 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68,
+ 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84,
+ 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84,
+ 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96,
+ 109, 109, 124, 124, 141, 141, 149,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35,
+ 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50,
+ 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69,
+ 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
+ /* Size 16x4 */
+ 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38,
+ 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90,
+ 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65,
+ 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63,
+ 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
+ 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51,
+ 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59,
+ 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98,
+ 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71,
+ 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
+ 118, 125, 125, 133, 133, 136, 136, 141,
+ /* Size 32x8 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34,
+ 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50,
+ 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69,
+ 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34,
+ 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50,
+ 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71,
+ 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49,
+ 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54,
+ 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68,
+ 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84,
+ 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90,
+ 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92,
+ 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45,
+ 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56,
+ 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75,
+ 82, 86, 61, 57, 58, 64, 71, 79, 86, 91,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31,
+ 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35,
+ 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45,
+ 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46,
+ 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50,
+ 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55,
+ 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61,
+ 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68,
+ 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50,
+ 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49,
+ 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53,
+ 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61,
+ 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71,
+ 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79,
+ 83, 86, 90, 91,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38,
+ 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57,
+ 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46,
+ 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58,
+ 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45,
+ 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47,
+ 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39,
+ 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51,
+ 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47,
+ 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54,
+ 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50,
+ 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52,
+ 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48,
+ 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58,
+ 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55,
+ 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47,
+ 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57,
+ 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45,
+ 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63,
+ 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49,
+ 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67,
+ 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56,
+ 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61,
+ 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47,
+ 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47,
+ 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72,
+ 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55,
+ 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79,
+ 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62,
+ 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54,
+ 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70,
+ 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55,
+ 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85,
+ 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61,
+ 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71,
+ 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57,
+ 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78,
+ 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55,
+ 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86,
+ 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61,
+ 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91,
+ 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69,
+ 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 4x8 */
+ 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47,
+ 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90,
+ /* Size 8x4 */
+ 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54,
+ 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90,
+ /* Size 8x16 */
+ 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50,
+ 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56,
+ 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64,
+ 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73,
+ 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85,
+ 89, 90,
+ /* Size 16x8 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32,
+ 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47,
+ 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54,
+ 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50,
+ 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58,
+ 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75,
+ 83, 90,
+ /* Size 16x32 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40,
+ 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54,
+ 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48,
+ 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56,
+ 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
+ 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47,
+ 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47,
+ 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49,
+ 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67,
+ 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64,
+ 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48,
+ 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50,
+ 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79,
+ 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83,
+ 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67,
+ 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75,
+ 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59,
+ 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84,
+ 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 32x16 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38,
+ 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46,
+ 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45,
+ 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47,
+ 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51,
+ 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54,
+ 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43,
+ 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48,
+ 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53,
+ 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56,
+ 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57,
+ 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64,
+ 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67,
+ 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47,
+ 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55,
+ 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62,
+ 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70,
+ 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76,
+ 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85,
+ 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88,
+ 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57,
+ 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56,
+ 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61,
+ 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69,
+ 69, 77, 77, 84, 84, 92, 92, 95,
+ /* Size 4x16 */
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47,
+ 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47,
+ 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57,
+ 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
+ /* Size 16x4 */
+ 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47,
+ 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58,
+ 72, 85, 57, 60, 75, 89, 59, 61, 75, 90,
+ /* Size 8x32 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52,
+ 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
+ 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50,
+ 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66,
+ 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
+ 89, 90, 90, 92,
+ /* Size 32x8 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31,
+ 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46,
+ 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47,
+ 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54,
+ 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46,
+ 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53,
+ 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61,
+ 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48,
+ 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55,
+ 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70,
+ 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59,
+ 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63,
+ 69, 77, 84, 92 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
+ /* Size 8x8 */
+ 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33,
+ 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54,
+ 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89,
+ 98, 108, 69, 65, 64, 73, 85, 97, 108, 119,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32,
+ 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32,
+ 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35,
+ 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39,
+ 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48,
+ 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59,
+ 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71,
+ 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86,
+ 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46,
+ 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50,
+ 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58,
+ 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67,
+ 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76,
+ 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86,
+ 91, 96, 104, 110, 118, 125, 134,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51,
+ 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59,
+ 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66,
+ 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37,
+ 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45,
+ 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52,
+ 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60,
+ 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38,
+ 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43,
+ 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54,
+ 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33, 34, 35, 35,
+ 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60,
+ 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42,
+ 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72,
+ 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50,
+ 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37,
+ 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59,
+ 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67,
+ 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41,
+ 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77,
+ 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64,
+ 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45,
+ 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75,
+ 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45,
+ 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83,
+ 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54,
+ 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97,
+ 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63,
+ 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56,
+ 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80,
+ 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56,
+ 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93,
+ 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59,
+ 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101,
+ 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60,
+ 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106,
+ 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68,
+ 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118,
+ 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79,
+ 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82,
+ 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134,
+ /* Size 4x8 */
+ 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42,
+ 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111,
+ /* Size 8x4 */
+ 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48,
+ 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32,
+ 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34,
+ 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42,
+ 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56,
+ 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72,
+ 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90,
+ 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105,
+ 112, 119, 127,
+ /* Size 16x8 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32,
+ 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38,
+ 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59,
+ 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49,
+ 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68,
+ 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90,
+ 104, 115, 127,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49,
+ 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57,
+ 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36,
+ 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64,
+ 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40,
+ 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34,
+ 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51,
+ 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35,
+ 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62,
+ 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40,
+ 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72,
+ 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64,
+ 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50,
+ 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81,
+ 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50,
+ 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87,
+ 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56,
+ 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102,
+ 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63,
+ 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111,
+ 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74,
+ 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119,
+ 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81,
+ 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125,
+ 133, 133,
+ /* Size 32x16 */
+ 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32,
+ 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32,
+ 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33,
+ 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36,
+ 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42,
+ 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50,
+ 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59,
+ 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34,
+ 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36,
+ 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42,
+ 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49,
+ 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57,
+ 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65,
+ 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76,
+ 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45,
+ 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46,
+ 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54,
+ 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63,
+ 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75,
+ 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87,
+ 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89,
+ 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102,
+ 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106,
+ 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119,
+ 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34,
+ 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43,
+ 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63,
+ 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
+ /* Size 16x4 */
+ 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37,
+ 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76,
+ 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63,
+ 80, 105, 66, 68, 85, 111, 73, 74, 91, 118,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49,
+ 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56,
+ 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42,
+ 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73,
+ 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58,
+ 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51,
+ 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76,
+ 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57,
+ 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90,
+ 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65,
+ 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103,
+ 105, 108, 112, 114, 119, 119, 127, 127,
+ /* Size 32x8 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32,
+ 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34,
+ 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50,
+ 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64,
+ 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34,
+ 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48,
+ 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65,
+ 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45,
+ 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60,
+ 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87,
+ 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102,
+ 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119,
+ 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79,
+ 72, 70, 79, 90, 104, 115, 127 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
+ /* Size 8x8 */
+ 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42,
+ 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53,
+ 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69,
+ 73, 77, 57, 54, 52, 58, 65, 72, 77, 82,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35,
+ 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45,
+ 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47,
+ 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49,
+ 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58,
+ 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65,
+ 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48,
+ 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47,
+ 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52,
+ 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58,
+ 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66,
+ 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75,
+ 78, 82, 85, 89,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49,
+ 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31,
+ 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50,
+ 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35,
+ 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53,
+ 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42,
+ 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55,
+ 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45,
+ 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32,
+ 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37,
+ 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49,
+ 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44,
+ 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51,
+ 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48,
+ 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54,
+ 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45,
+ 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42,
+ 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
+ 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45,
+ 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52,
+ 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58,
+ 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47,
+ 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55,
+ 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46,
+ 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58,
+ 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62,
+ 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53,
+ 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47,
+ 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63,
+ 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46,
+ 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66,
+ 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50,
+ 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72,
+ 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57,
+ 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51,
+ 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65,
+ 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70,
+ 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49,
+ 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75,
+ 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56,
+ 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82,
+ 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64,
+ 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55,
+ 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72,
+ 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53,
+ 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77,
+ 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57,
+ 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85,
+ 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63,
+ 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 4x8 */
+ 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45,
+ 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79,
+ /* Size 8x4 */
+ 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49,
+ 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79,
+ /* Size 8x16 */
+ 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50,
+ 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55,
+ 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62,
+ 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70,
+ 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79,
+ 82, 86,
+ /* Size 16x8 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32,
+ 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47,
+ 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53,
+ 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48,
+ 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56,
+ 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75,
+ 80, 86,
+ /* Size 16x32 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48,
+ 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37,
+ 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51,
+ 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46,
+ 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53,
+ 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42,
+ 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50,
+ 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
+ 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59,
+ 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53,
+ 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48,
+ 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66,
+ 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47,
+ 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69,
+ 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53,
+ 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78,
+ 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60,
+ 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58,
+ 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69,
+ 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57,
+ 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77,
+ 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 32x16 */
+ 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36,
+ 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42,
+ 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46,
+ 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45,
+ 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47,
+ 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51,
+ 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54,
+ 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43,
+ 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46,
+ 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50,
+ 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53,
+ 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55,
+ 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57,
+ 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61,
+ 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67,
+ 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46,
+ 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50,
+ 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56,
+ 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62,
+ 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68,
+ 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75,
+ 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82,
+ 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55,
+ 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53,
+ 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58,
+ 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62,
+ 67, 68, 75, 75, 80, 82, 86, 89,
+ /* Size 4x16 */
+ 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42,
+ 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46,
+ 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53,
+ 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
+ /* Size 16x4 */
+ 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47,
+ 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61,
+ 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53,
+ 64, 76, 55, 55, 66, 79, 58, 58, 68, 82,
+ /* Size 8x32 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48,
+ 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44,
+ 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49,
+ 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59,
+ 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50,
+ 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64,
+ 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51,
+ 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71,
+ 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53,
+ 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80,
+ 82, 83, 86, 86,
+ /* Size 32x8 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31,
+ 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46,
+ 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47,
+ 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52,
+ 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43,
+ 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53,
+ 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57,
+ 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46,
+ 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54,
+ 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68,
+ 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79,
+ 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55,
+ 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60,
+ 67, 75, 80, 86 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32,
+ 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47,
+ 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75,
+ 82, 92, 63, 59, 58, 65, 73, 84, 92, 105,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32,
+ 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32,
+ 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33,
+ 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37,
+ 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41,
+ 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51,
+ 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63,
+ 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72,
+ 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42,
+ 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45,
+ 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49,
+ 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63,
+ 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73,
+ 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85,
+ 92, 97, 101, 105,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46,
+ 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51,
+ 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61,
+ 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41,
+ 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47,
+ 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54,
+ 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36,
+ 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40,
+ 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45,
+ 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35,
+ 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53,
+ 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63,
+ 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45,
+ 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35,
+ 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54,
+ 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58,
+ 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40,
+ 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68,
+ 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47,
+ 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77,
+ 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55,
+ 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42,
+ 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42,
+ 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71,
+ 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49,
+ 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84,
+ 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57,
+ 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49,
+ 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68,
+ 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49,
+ 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87,
+ 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59,
+ 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101,
+ 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69,
+ 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57,
+ 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81,
+ 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59,
+ 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92,
+ 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67,
+ 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109,
+ 109, 114,
+ /* Size 4x8 */
+ 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41,
+ 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97,
+ /* Size 8x4 */
+ 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40,
+ 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32,
+ 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34,
+ 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37,
+ 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50,
+ 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60,
+ 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76,
+ 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97,
+ 100, 105,
+ /* Size 16x8 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32,
+ 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36,
+ 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48,
+ 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45,
+ 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60,
+ 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79,
+ 92, 105,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44,
+ 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49,
+ 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35,
+ 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59,
+ 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43,
+ 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34,
+ 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55,
+ 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63,
+ 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58,
+ 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48,
+ 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79,
+ 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53,
+ 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92,
+ 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63,
+ 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63,
+ 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79,
+ 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59,
+ 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85,
+ 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32,
+ 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32,
+ 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32,
+ 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34,
+ 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41,
+ 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48,
+ 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53,
+ 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32,
+ 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35,
+ 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37,
+ 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44,
+ 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54,
+ 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58,
+ 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67,
+ 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41,
+ 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42,
+ 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45,
+ 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56,
+ 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68,
+ 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79,
+ 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,
+ 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57,
+ 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60,
+ 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59,
+ 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62,
+ 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32,
+ 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42,
+ 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52,
+ 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
+ /* Size 16x4 */
+ 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34,
+ 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67,
+ 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53,
+ 74, 90, 57, 56, 77, 93, 61, 58, 79, 97,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44,
+ 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50,
+ 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66,
+ 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50,
+ 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42,
+ 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63,
+ 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 100, 105, 105, 109,
+ /* Size 32x8 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32,
+ 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34,
+ 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42,
+ 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33,
+ 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43,
+ 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54,
+ 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41,
+ 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54,
+ 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71,
+ 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60,
+ 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62,
+ 70, 76, 83, 96, 109 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
+ /* Size 8x8 */
+ 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40,
+ 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51,
+ 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63,
+ 66, 70, 55, 52, 50, 54, 60, 66, 70, 76,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31,
+ 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34,
+ 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42,
+ 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47,
+ 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46,
+ 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55,
+ 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59,
+ 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47,
+ 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46,
+ 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47,
+ 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55,
+ 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60,
+ 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67,
+ 70, 73, 74, 76,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48,
+ 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48,
+ 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34,
+ 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50,
+ 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41,
+ 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53,
+ 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46,
+ 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31,
+ 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45,
+ 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34,
+ 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47,
+ 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43,
+ 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49,
+ 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46,
+ 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39,
+ 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46,
+ 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44,
+ 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49,
+ 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48,
+ 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53,
+ 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48,
+ 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46,
+ 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47,
+ 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58,
+ 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51,
+ 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55,
+ 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46,
+ 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59,
+ 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45,
+ 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61,
+ 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49,
+ 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66,
+ 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54,
+ 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49,
+ 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60,
+ 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47,
+ 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68,
+ 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53,
+ 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74,
+ 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58,
+ 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52,
+ 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65,
+ 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51,
+ 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70,
+ 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52,
+ 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76,
+ 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58,
+ 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80,
+ /* Size 4x8 */
+ 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45,
+ 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73,
+ /* Size 8x4 */
+ 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47,
+ 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73,
+ /* Size 8x16 */
+ 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31,
+ 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42,
+ 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47,
+ 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53,
+ 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57,
+ 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64,
+ 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73,
+ 74, 76,
+ /* Size 16x8 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32,
+ 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46,
+ 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49,
+ 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46,
+ 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54,
+ 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64,
+ 70, 76,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47,
+ 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36,
+ 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48,
+ 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44,
+ 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51,
+ 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38,
+ 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46,
+ 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44,
+ 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52,
+ 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55,
+ 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51,
+ 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46,
+ 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59,
+ 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47,
+ 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65,
+ 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50,
+ 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70,
+ 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55,
+ 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55,
+ 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64,
+ 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52,
+ 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68,
+ 71, 71, 73, 73, 74, 76, 76, 78,
+ /* Size 32x16 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34,
+ 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39,
+ 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46,
+ 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45,
+ 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47,
+ 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49,
+ 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51,
+ 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39,
+ 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44,
+ 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47,
+ 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51,
+ 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53,
+ 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54,
+ 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58,
+ 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60,
+ 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45,
+ 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46,
+ 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54,
+ 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60,
+ 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65,
+ 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68,
+ 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73,
+ 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51,
+ 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51,
+ 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50,
+ 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58,
+ 61, 65, 65, 70, 72, 74, 78, 78,
+ /* Size 4x16 */
+ 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38,
+ 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45,
+ 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48,
+ 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
+ /* Size 16x4 */
+ 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44,
+ 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58,
+ 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48,
+ 62, 70, 51, 49, 63, 71, 53, 50, 64, 73,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46,
+ 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55,
+ 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
+ 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47,
+ 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58,
+ 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48,
+ 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50,
+ 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73,
+ 74, 76, 76, 78,
+ /* Size 32x8 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31,
+ 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44,
+ 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45,
+ 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51,
+ 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42,
+ 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50,
+ 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53,
+ 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45,
+ 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52,
+ 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61,
+ 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52,
+ 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57,
+ 61, 65, 72, 78 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32,
+ 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42,
+ 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65,
+ 71, 77, 53, 50, 51, 55, 61, 70, 77, 85,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32,
+ 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35,
+ 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44,
+ 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51,
+ 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64,
+ 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38,
+ 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51,
+ 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63,
+ 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75,
+ 79, 81, 87, 92,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41,
+ 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46,
+ 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52,
+ 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37,
+ 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41,
+ 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49,
+ 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42,
+ 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46,
+ 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52,
+ 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41,
+ 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34,
+ 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45,
+ 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34,
+ 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51,
+ 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59,
+ 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38,
+ 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56,
+ 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39,
+ 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61,
+ 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42,
+ 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70,
+ 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50,
+ 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43,
+ 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58,
+ 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75,
+ 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51,
+ 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81,
+ 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59,
+ 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51,
+ 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69,
+ 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51,
+ 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77,
+ 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53,
+ 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88,
+ 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61,
+ 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 4x8 */
+ 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36,
+ 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83,
+ /* Size 8x4 */
+ 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38,
+ 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33,
+ 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36,
+ 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42,
+ 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56,
+ 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66,
+ 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77,
+ 82, 87,
+ /* Size 16x8 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32,
+ 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34,
+ 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44,
+ 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41,
+ 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48,
+ 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75,
+ 79, 87,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40,
+ 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45,
+ 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51,
+ 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37,
+ 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40,
+ 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45,
+ 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58,
+ 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42,
+ 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61,
+ 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42,
+ 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67,
+ 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45,
+ 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76,
+ 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57,
+ 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52,
+ 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65,
+ 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54,
+ 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76,
+ 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32,
+ 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32,
+ 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36,
+ 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41,
+ 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49,
+ 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54,
+ 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32,
+ 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33,
+ 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36,
+ 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42,
+ 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44,
+ 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50,
+ 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60,
+ 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38,
+ 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42,
+ 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52,
+ 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56,
+ 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66,
+ 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76,
+ 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81,
+ 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51,
+ 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51,
+ 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54,
+ 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62,
+ 63, 67, 75, 75, 79, 87, 87, 92,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32,
+ 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36,
+ 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49,
+ 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
+ /* Size 16x4 */
+ 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34,
+ 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60,
+ 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47,
+ 60, 77, 51, 50, 63, 82, 55, 54, 67, 87,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40,
+ 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46,
+ 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53,
+ 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45,
+ 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58,
+ 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44,
+ 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49,
+ 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
+ 82, 83, 87, 87,
+ /* Size 32x8 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32,
+ 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33,
+ 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41,
+ 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50,
+ 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33,
+ 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38,
+ 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50,
+ 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37,
+ 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45,
+ 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66,
+ 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51,
+ 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55,
+ 63, 75, 79, 87 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
+ /* Size 8x8 */
+ 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36,
+ 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50,
+ 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59,
+ 61, 64, 51, 48, 48, 51, 54, 60, 64, 68,
+ /* Size 16x16 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31,
+ 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32,
+ 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40,
+ 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45,
+ 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47,
+ 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50,
+ 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47,
+ 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45,
+ 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50,
+ 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62,
+ 64, 66, 68, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49,
+ 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47,
+ 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34,
+ 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38,
+ 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50,
+ 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44,
+ 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31,
+ 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32,
+ 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40,
+ 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47,
+ 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45,
+ 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49,
+ 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47,
+ 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38,
+ 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
+ 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43,
+ 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48,
+ 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42,
+ 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44,
+ 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47,
+ 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47,
+ 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55,
+ 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46,
+ 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56,
+ 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60,
+ 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51,
+ 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47,
+ 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59,
+ 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63,
+ 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50,
+ 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66,
+ 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54,
+ 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49,
+ 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60,
+ 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48,
+ 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64,
+ 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48,
+ 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69,
+ 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53,
+ 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 4x8 */
+ 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46,
+ 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67,
+ /* Size 8x4 */
+ 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48,
+ 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67,
+ /* Size 8x16 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31,
+ 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55,
+ 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
+ 66, 68,
+ /* Size 16x8 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32,
+ 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44,
+ 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47,
+ 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45,
+ 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48,
+ 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62,
+ 65, 68,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35,
+ 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41,
+ 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48,
+ 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47,
+ 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41,
+ 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47,
+ 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47,
+ 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53,
+ 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47,
+ 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57,
+ 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45,
+ 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59,
+ 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46,
+ 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63,
+ 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52,
+ 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50,
+ 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57,
+ 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50,
+ 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63,
+ 65, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31,
+ 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32,
+ 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38,
+ 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45,
+ 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46,
+ 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45,
+ 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47,
+ 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49,
+ 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39,
+ 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41,
+ 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47,
+ 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50,
+ 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50,
+ 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51,
+ 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54,
+ 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46,
+ 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52,
+ 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55,
+ 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59,
+ 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64,
+ 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66,
+ 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49,
+ 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48,
+ 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49,
+ 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55,
+ 55, 57, 62, 62, 65, 68, 68, 71,
+ /* Size 4x16 */
+ 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46,
+ 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47,
+ 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
+ /* Size 16x4 */
+ 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43,
+ 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54,
+ 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47,
+ 55, 64, 49, 47, 56, 66, 51, 49, 57, 68,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41,
+ 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46,
+ 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49,
+ 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48,
+ 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60,
+ 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47,
+ 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66,
+ 66, 67, 68, 68,
+ /* Size 32x8 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31,
+ 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40,
+ 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45,
+ 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47,
+ 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40,
+ 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48,
+ 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51,
+ 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46,
+ 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47,
+ 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59,
+ 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48,
+ 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50,
+ 55, 62, 65, 68 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32, 32,
+ 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37,
+ 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56,
+ 63, 67, 47, 44, 45, 46, 52, 59, 67, 71,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36,
+ 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40,
+ 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44,
+ 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44,
+ 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61,
+ 63, 67, 70, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42,
+ 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46,
+ 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42,
+ 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
+ 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38,
+ 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42,
+ 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40,
+ 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45,
+ 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35,
+ 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48,
+ 50, 51, 51, 53, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54,
+ 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43,
+ 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50,
+ 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54,
+ 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39,
+ 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59,
+ 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45,
+ 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50,
+ 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55,
+ 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64,
+ 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49,
+ 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45,
+ 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58,
+ 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45,
+ 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67,
+ 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71,
+ 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53,
+ 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77,
+ /* Size 4x8 */
+ 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34,
+ 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67,
+ /* Size 8x4 */
+ 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37,
+ 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
+ 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
+ 69, 70,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32,
+ 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34,
+ 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38,
+ 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37,
+ 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43,
+ 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56,
+ 67, 70,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41,
+ 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45,
+ 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38,
+ 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40,
+ 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45,
+ 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37,
+ 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43,
+ 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49,
+ 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38,
+ 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59,
+ 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67,
+ 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48,
+ 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56,
+ 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49,
+ 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65,
+ 67, 71, 71, 72, 75, 76, 76, 79,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38,
+ 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41,
+ 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49,
+ 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32,
+ 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32,
+ 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35,
+ 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36,
+ 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40,
+ 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45,
+ 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48,
+ 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55,
+ 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34,
+ 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40,
+ 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42,
+ 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50,
+ 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56,
+ 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63,
+ 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71,
+ 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45,
+ 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45,
+ 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46,
+ 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50,
+ 56, 58, 58, 64, 69, 69, 73, 79,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
+ /* Size 16x4 */
+ 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34,
+ 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48,
+ 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43,
+ 53, 63, 45, 45, 56, 66, 46, 46, 56, 67,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42,
+ 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46,
+ 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41,
+ 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35,
+ 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48,
+ 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58,
+ 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45,
+ 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
+ 69, 70, 70, 73,
+ /* Size 32x8 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32,
+ 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32,
+ 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34,
+ 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32,
+ 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35,
+ 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42,
+ 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34,
+ 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40,
+ 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51,
+ 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45,
+ 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48,
+ 56, 58, 69, 73 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
+ /* Size 8x8 */
+ 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35,
+ 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47,
+ 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55,
+ 58, 60, 49, 46, 46, 46, 50, 55, 60, 61,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31,
+ 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31,
+ 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35,
+ 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47,
+ 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46,
+ 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47,
+ 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47,
+ 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45,
+ 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46,
+ 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47,
+ 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56,
+ 58, 60, 61, 61,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43,
+ 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48,
+ 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37,
+ 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42,
+ 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31,
+ 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32,
+ 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36,
+ 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42,
+ 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45,
+ 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34,
+ 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46,
+ 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38,
+ 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39,
+ 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45,
+ 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49,
+ 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47,
+ 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47,
+ 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55,
+ 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53,
+ 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55,
+ 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59,
+ 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50,
+ 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48,
+ 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55,
+ 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60,
+ 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61,
+ 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50,
+ 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64,
+ /* Size 4x8 */
+ 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46,
+ 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59,
+ /* Size 8x4 */
+ 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47,
+ 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59,
+ /* Size 8x16 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31,
+ 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52,
+ 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
+ 61, 61,
+ /* Size 16x8 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32,
+ 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42,
+ 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47,
+ 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46,
+ 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46,
+ 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54,
+ 59, 61,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47,
+ 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45,
+ 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38,
+ 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46,
+ 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44,
+ 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38,
+ 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47,
+ 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45,
+ 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56,
+ 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45,
+ 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59,
+ 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49,
+ 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54,
+ 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57,
+ 59, 61, 61, 62, 63, 64, 64, 65,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31,
+ 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31,
+ 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38,
+ 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40,
+ 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46,
+ 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45,
+ 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45,
+ 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47,
+ 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35,
+ 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38,
+ 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49,
+ 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49,
+ 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46,
+ 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47,
+ 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48,
+ 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53,
+ 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55,
+ 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58,
+ 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61,
+ 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48,
+ 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46,
+ 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46,
+ 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48,
+ 52, 54, 54, 58, 60, 60, 62, 65,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38,
+ 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46,
+ 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
+ /* Size 16x4 */
+ 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42,
+ 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49,
+ 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46,
+ 53, 58, 48, 46, 54, 59, 48, 46, 54, 59,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46,
+ 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46,
+ 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49,
+ 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47,
+ 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56,
+ 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60,
+ 61, 61, 61, 62,
+ /* Size 32x8 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31,
+ 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39,
+ 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46,
+ 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38,
+ 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47,
+ 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50,
+ 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46,
+ 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47,
+ 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53,
+ 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46,
+ 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47,
+ 52, 54, 60, 62 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32,
+ 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35,
+ 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46,
+ 51, 54, 41, 39, 40, 41, 44, 49, 54, 58,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36,
+ 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34,
+ 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40,
+ 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45,
+ 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54,
+ 54, 58, 58, 63,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36,
+ 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38,
+ 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40,
+ 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34,
+ 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41,
+ 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44,
+ 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49,
+ 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37,
+ 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52,
+ 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40,
+ 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42,
+ 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38,
+ 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47,
+ 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39,
+ 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54,
+ 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60,
+ 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 4x8 */
+ 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34,
+ 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56,
+ /* Size 8x4 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34,
+ 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34,
+ 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42,
+ 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33,
+ 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36,
+ 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34,
+ 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40,
+ 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53,
+ 53, 63,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39,
+ 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49,
+ 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
+ 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52,
+ 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40,
+ 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43,
+ 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48,
+ 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54,
+ 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34,
+ 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37,
+ 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41,
+ 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38,
+ 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45,
+ 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34,
+ 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36,
+ 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38,
+ 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43,
+ 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48,
+ 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52,
+ 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58,
+ 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38,
+ 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39,
+ 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42,
+ 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43,
+ 43, 48, 53, 53, 53, 58, 63, 63,
+ /* Size 4x16 */
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39,
+ 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
+ /* Size 16x4 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32,
+ 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40,
+ 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39,
+ 45, 54, 38, 39, 45, 54, 42, 42, 48, 58,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42,
+ 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41,
+ 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 63,
+ /* Size 32x8 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32,
+ 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34,
+ 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32,
+ 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34,
+ 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38,
+ 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34,
+ 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38,
+ 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48,
+ 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39,
+ 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43,
+ 43, 53, 53, 63 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33,
+ 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45,
+ 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51,
+ 53, 54, 48, 46, 45, 46, 47, 51, 54, 56,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31,
+ 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35,
+ 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40,
+ 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45,
+ 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47,
+ 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42,
+ 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46,
+ 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47,
+ 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49,
+ 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53,
+ 53, 55, 55, 58,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39,
+ 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34,
+ 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38,
+ 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42,
+ 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37,
+ 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46,
+ 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47,
+ 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39,
+ 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46,
+ 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45,
+ 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37,
+ 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46,
+ 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45,
+ 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50,
+ 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46,
+ 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48,
+ 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47,
+ 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51,
+ 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54,
+ 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57,
+ 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58,
+ /* Size 4x8 */
+ 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42,
+ 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55,
+ /* Size 8x4 */
+ 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44,
+ 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55,
+ /* Size 8x16 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31,
+ 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43,
+ 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50,
+ 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58,
+ /* Size 16x8 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31,
+ 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40,
+ 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47,
+ 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46,
+ 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47,
+ 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53,
+ 53, 58,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36,
+ 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45,
+ 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40,
+ 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46,
+ 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48,
+ 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49,
+ 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46,
+ 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 56, 56, 56, 57, 58, 58,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31,
+ 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34,
+ 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38,
+ 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42,
+ 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46,
+ 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45,
+ 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45,
+ 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34,
+ 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36,
+ 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45,
+ 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47,
+ 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47,
+ 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43,
+ 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46,
+ 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48,
+ 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50,
+ 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53,
+ 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54,
+ 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56,
+ 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47,
+ 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45,
+ 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45,
+ 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46,
+ 46, 49, 53, 53, 53, 56, 58, 58,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46,
+ 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
+ /* Size 16x4 */
+ 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35,
+ 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46,
+ 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46,
+ 50, 54, 47, 46, 50, 54, 47, 45, 49, 56,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44,
+ 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43,
+ 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45,
+ 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56,
+ 56, 57, 58, 58,
+ /* Size 32x8 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31,
+ 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38,
+ 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46,
+ 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36,
+ 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45,
+ 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47,
+ 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43,
+ 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48,
+ 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53,
+ 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45,
+ 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46,
+ 46, 53, 53, 58 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
+ /* Size 8x8 */
+ 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32,
+ 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34,
+ 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38,
+ 39, 42, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33,
+ 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33,
+ 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42,
+ 42, 45, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37,
+ 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40,
+ 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35,
+ 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34,
+ 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37,
+ 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39,
+ 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42,
+ 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37,
+ 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35,
+ 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40,
+ 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42,
+ 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48,
+ 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37,
+ 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50,
+ /* Size 4x8 */
+ 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32,
+ 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 8x4 */
+ 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33,
+ 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35,
+ 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44,
+ 48, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32,
+ 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34,
+ 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33,
+ 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35,
+ 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38,
+ 46, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39,
+ 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35,
+ 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46,
+ 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36,
+ 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42,
+ 42, 42, 44, 47, 48, 48, 48, 49,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35,
+ 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37,
+ 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38,
+ 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33,
+ 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33,
+ 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37,
+ 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42,
+ 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44,
+ 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35,
+ 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34,
+ 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34,
+ 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38,
+ 39, 39, 39, 42, 46, 49, 49, 49,
+ /* Size 4x16 */
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
+ /* Size 16x4 */
+ 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32,
+ 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37,
+ 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34,
+ 37, 44, 35, 34, 38, 48, 35, 34, 38, 48,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41,
+ 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
+ 48, 48, 48, 49,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32,
+ 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32,
+ 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33,
+ 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34,
+ 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32,
+ 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35,
+ 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37,
+ 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35,
+ 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37,
+ 39, 39, 46, 49 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
+ /* Size 8x8 */
+ 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31,
+ 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42,
+ 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48,
+ 48, 50, 48, 47, 46, 47, 47, 49, 50, 53,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31,
+ 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35,
+ 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40,
+ 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44,
+ 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46,
+ 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38,
+ 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41,
+ 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42,
+ 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48,
+ 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50,
+ 50, 51, 53, 53,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42,
+ 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34,
+ 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46,
+ 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35,
+ 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39,
+ 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41,
+ 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43,
+ 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45,
+ 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37,
+ 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42,
+ 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49,
+ 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44,
+ 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47,
+ 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48,
+ 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50,
+ 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51,
+ 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50,
+ 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53,
+ 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 4x8 */
+ 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38,
+ 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53,
+ /* Size 8x4 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39,
+ 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31,
+ 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35,
+ 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47,
+ 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51,
+ 53, 53,
+ /* Size 16x8 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35,
+ 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42,
+ 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42,
+ 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45,
+ 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48,
+ 52, 53,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39,
+ 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42,
+ 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46,
+ 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40,
+ 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38,
+ 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47,
+ 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48,
+ 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52,
+ 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48,
+ 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50,
+ 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37,
+ 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41,
+ 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46,
+ 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46,
+ 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31,
+ 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33,
+ 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35,
+ 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41,
+ 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43,
+ 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44,
+ 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47,
+ 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40,
+ 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41,
+ 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46,
+ 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48,
+ 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50,
+ 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51,
+ 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48,
+ 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46,
+ 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46,
+ 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47,
+ 47, 47, 47, 49, 52, 53, 53, 53,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31,
+ 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39,
+ 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
+ /* Size 16x4 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32,
+ 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47,
+ 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44,
+ 47, 51, 48, 46, 48, 53, 48, 46, 48, 53,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40,
+ 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45,
+ 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49,
+ 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
+ 53, 53, 53, 53,
+ /* Size 32x8 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34,
+ 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39,
+ 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33,
+ 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38,
+ 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43,
+ 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39,
+ 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44,
+ 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47,
+ 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47,
+ 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47,
+ 47, 47, 52, 53 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32,
+ 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34,
+ 35, 36, 33, 33, 33, 33, 35, 35, 36, 38,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 38, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 4x8 */
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36,
+ /* Size 8x4 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32,
+ 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 38,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33,
+ 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37,
+ 37, 38,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35,
+ 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35,
+ 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 37, 38, 39,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ /* Size 16x4 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33,
+ 34, 35, 33, 33, 35, 36, 34, 34, 36, 37,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 38,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32,
+ 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32,
+ 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33,
+ 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32,
+ 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35,
+ 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33,
+ 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34,
+ 35, 37, 37, 38 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
+ /* Size 8x8 */
+ 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31,
+ 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35,
+ 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44,
+ 47, 47, 40, 41, 41, 42, 44, 45, 47, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35,
+ 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38,
+ 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40,
+ 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34,
+ 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37,
+ 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40,
+ 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46,
+ 47, 47, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41,
+ 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
+ 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34,
+ 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38,
+ 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39,
+ 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35,
+ 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36,
+ 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40,
+ 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42,
+ 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41,
+ 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43,
+ 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39,
+ 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40,
+ 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43,
+ 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36,
+ 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45,
+ 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39,
+ 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43,
+ 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40,
+ 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47,
+ 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43,
+ 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36,
+ 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47,
+ /* Size 8x4 */
+ 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36,
+ 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32,
+ 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
+ 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31,
+ 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32,
+ 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40,
+ 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37,
+ 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40,
+ 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47,
+ 47, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39,
+ 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39,
+ 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41,
+ 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40,
+ 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44,
+ 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38,
+ 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40,
+ 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41,
+ 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44,
+ 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36,
+ 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38,
+ 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38,
+ 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42,
+ 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31,
+ 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32,
+ 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32,
+ 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33,
+ 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38,
+ 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41,
+ 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42,
+ 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35,
+ 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37,
+ 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39,
+ 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44,
+ 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47,
+ 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47,
+ 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39,
+ 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42,
+ 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 48, 48,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31,
+ 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36,
+ 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ /* Size 16x4 */
+ 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32,
+ 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42,
+ 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40,
+ 45, 47, 39, 41, 45, 47, 42, 43, 46, 47,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36,
+ 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40,
+ 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36,
+ 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37,
+ 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40,
+ 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31,
+ 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31,
+ 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38,
+ 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31,
+ 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32,
+ 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41,
+ 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35,
+ 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38,
+ 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41,
+ 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43,
+ 44, 47, 47, 48 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ /* Size 8x4 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32,
+ 33, 34,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ /* Size 16x4 */
+ 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32,
+ 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32,
+ 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32,
+ 32, 32, 33, 34 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31,
+ 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36,
+ 39, 39, 33, 34, 34, 35, 35, 36, 39, 39,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37,
+ 38, 39, 39, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31,
+ 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34,
+ 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33,
+ 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34,
+ 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35,
+ 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36,
+ 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
+ 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39,
+ 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31,
+ 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40,
+ /* Size 8x4 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32,
+ 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41,
+ 41, 41,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31,
+ 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31,
+ 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32,
+ 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32,
+ 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36,
+ 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36,
+ 38, 41,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36,
+ 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37,
+ 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38,
+ 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35,
+ 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38,
+ 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41,
+ 42, 43, 43, 43, 43, 43, 43, 44,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36,
+ 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40,
+ 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43,
+ 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35,
+ 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36,
+ 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 38, 39, 40, 42, 44,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
+ /* Size 16x4 */
+ 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31,
+ 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36,
+ 36, 40, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36,
+ 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37,
+ 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41,
+ 41, 41, 41, 42,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31,
+ 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31,
+ 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31,
+ 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31,
+ 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32,
+ 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32,
+ 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31,
+ 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33,
+ 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35,
+ 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34,
+ 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36,
+ 36, 36, 39, 42 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31,
+ 32, 32, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31,
+ 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31,
+ 32, 32, 32, 32 },
+ },
+};
+
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5,
+ /* Size 8x8 */
+ 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26,
+ 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9,
+ 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10,
+ 10, 8, 7, 6, 5, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32,
+ 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29,
+ 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22,
+ 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16,
+ 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11,
+ 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9,
+ 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16,
+ 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13,
+ 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7,
+ 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6,
+ 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11,
+ 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7,
+ 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28,
+ 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10,
+ 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19,
+ 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33,
+ 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19,
+ 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28,
+ 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22,
+ 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20,
+ 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13,
+ 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17,
+ 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15,
+ 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14,
+ 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12,
+ 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10,
+ 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
+ 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+ /* Size 4x8 */
+ 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15,
+ 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5,
+ /* Size 8x4 */
+ 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12,
+ 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5,
+ /* Size 8x16 */
+ 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31,
+ 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24,
+ 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14,
+ 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10,
+ 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6,
+ 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5,
+ /* Size 16x8 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30,
+ 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17,
+ 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10,
+ 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13,
+ 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8,
+ 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10,
+ 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27,
+ 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25,
+ 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18,
+ 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17,
+ 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8,
+ 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12,
+ 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5,
+ 5, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32,
+ 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31,
+ 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25,
+ 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19,
+ 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15,
+ 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13,
+ 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12,
+ 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11,
+ 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25,
+ 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21,
+ 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15,
+ 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12,
+ 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9,
+ 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15,
+ 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9,
+ 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13,
+ 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9,
+ 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6,
+ 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9,
+ 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5,
+ 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10,
+ 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25,
+ 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14,
+ 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8,
+ 8, 7, 6, 6, 6, 5, 5,
+ /* Size 16x4 */
+ 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19,
+ 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14,
+ 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9,
+ 7, 5, 9, 9, 7, 5,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21,
+ 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16,
+ 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7,
+ 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5,
+ /* Size 32x8 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31,
+ 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20,
+ 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13,
+ 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11,
+ 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23,
+ 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12,
+ 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9,
+ 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10,
+ 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12,
+ 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7,
+ 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10,
+ 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9,
+ 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6,
+ 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9,
+ /* Size 8x8 */
+ 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22,
+ 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14,
+ 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11,
+ 10, 10, 14, 15, 15, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32,
+ 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23,
+ 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21,
+ 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15,
+ 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14,
+ 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18,
+ 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15,
+ 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13,
+ 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 9, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31,
+ 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23,
+ 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22,
+ 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27,
+ 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22,
+ 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16,
+ 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23,
+ 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20,
+ 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20,
+ 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21,
+ 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13,
+ 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17,
+ 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16,
+ 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14,
+ 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14,
+ 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19,
+ 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10,
+ /* Size 8x4 */
+ 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16,
+ 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10,
+ /* Size 8x16 */
+ 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26,
+ 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20,
+ 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17,
+ 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14,
+ 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10,
+ 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9,
+ /* Size 16x8 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24,
+ 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14,
+ 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17,
+ 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14,
+ 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11,
+ 10, 9,
+ /* Size 16x32 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30,
+ 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22,
+ 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21,
+ 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22,
+ 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22,
+ 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17,
+ 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10,
+ 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 9, 9, 9,
+ /* Size 32x16 */
+ 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33,
+ 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24,
+ 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23,
+ 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21,
+ 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19,
+ 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18,
+ 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17,
+ 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21,
+ 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17,
+ 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15,
+ 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13,
+ 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13,
+ 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13,
+ 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13,
+ 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18,
+ 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18,
+ 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15,
+ 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13,
+ 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11,
+ 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10,
+ 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10,
+ 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15,
+ 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16,
+ 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14,
+ 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13,
+ 13, 11, 11, 10, 10, 9, 9, 9,
+ /* Size 4x16 */
+ 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18,
+ 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16,
+ 15, 14, 13, 12, 11, 11, 10, 10, 10, 10,
+ /* Size 16x4 */
+ 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19,
+ 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13,
+ 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15,
+ 12, 10, 15, 15, 12, 10, 15, 14, 12, 10,
+ /* Size 8x32 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25,
+ 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15,
+ 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16,
+ 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10,
+ 10, 9, 9, 9,
+ /* Size 32x8 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26,
+ 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22,
+ 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17,
+ 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22,
+ 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16,
+ 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12,
+ 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18,
+ 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14,
+ 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11,
+ 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15,
+ 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14,
+ 13, 11, 10, 9 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6,
+ /* Size 8x8 */
+ 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26,
+ 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12,
+ 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 6, 6, 10,
+ 11, 10, 9, 8, 7, 6, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32,
+ 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30,
+ 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24,
+ 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17,
+ 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13,
+ 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10,
+ 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8,
+ 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16,
+ 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7,
+ 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10,
+ 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11,
+ 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32,
+ 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30,
+ 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23,
+ 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18,
+ 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32,
+ 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23,
+ 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27,
+ 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26,
+ 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19,
+ 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9,
+ 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20,
+ 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15,
+ 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15,
+ 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13,
+ 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6,
+ 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10,
+ 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16,
+ 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6,
+ /* Size 8x4 */
+ 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13,
+ 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6,
+ /* Size 8x16 */
+ 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32,
+ 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26,
+ 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10,
+ 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7,
+ 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12,
+ 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 16x8 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30,
+ 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18,
+ 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11,
+ 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14,
+ 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8,
+ 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11,
+ 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28,
+ 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24,
+ 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16,
+ 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8,
+ 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11,
+ 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13,
+ 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7,
+ 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11,
+ 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32,
+ 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31,
+ 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25,
+ 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20,
+ 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16,
+ 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13,
+ 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13,
+ 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12,
+ 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28,
+ 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23,
+ 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16,
+ 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13,
+ 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10,
+ 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9,
+ 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17,
+ 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12,
+ 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8,
+ 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8,
+ 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13,
+ 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8,
+ 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6,
+ 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12,
+ 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9,
+ 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8,
+ 7, 7, 6, 6, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25,
+ 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16,
+ 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10,
+ 9, 8, 7, 7, 6, 6, 6, 6,
+ /* Size 16x4 */
+ 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19,
+ 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9,
+ 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6,
+ 11, 10, 8, 6, 10, 9, 7, 6,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15,
+ 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 5, 5, 5,
+ /* Size 32x8 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32,
+ 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23,
+ 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14,
+ 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12,
+ 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24,
+ 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14,
+ 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9,
+ 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14,
+ 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7,
+ 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10,
+ 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10,
+ 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7,
+ 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9,
+ 8, 7, 6, 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22,
+ 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15,
+ 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11,
+ 10, 10, 15, 16, 16, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33,
+ 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23,
+ 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21,
+ 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13,
+ 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13,
+ 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18,
+ 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18,
+ 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15,
+ 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13,
+ 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32,
+ 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17,
+ 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24,
+ 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23,
+ 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20,
+ 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21,
+ 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20,
+ 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14,
+ 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21,
+ 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18,
+ 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18,
+ 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18,
+ 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16,
+ 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16,
+ 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20,
+ 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10,
+ /* Size 8x4 */
+ 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16,
+ 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10,
+ /* Size 8x16 */
+ 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27,
+ 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21,
+ 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14,
+ 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11,
+ 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10,
+ 10, 10,
+ /* Size 16x8 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25,
+ 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15,
+ 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18,
+ 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31,
+ 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22,
+ 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16,
+ 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14,
+ 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13,
+ 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 32x16 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33,
+ 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26,
+ 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23,
+ 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22,
+ 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17,
+ 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16,
+ 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14,
+ 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13,
+ 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13,
+ 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13,
+ 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18,
+ 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18,
+ 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15,
+ 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13,
+ 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11,
+ 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10,
+ 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10,
+ 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16,
+ 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15,
+ 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9,
+ /* Size 4x16 */
+ 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23,
+ 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19,
+ 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 10,
+ /* Size 16x4 */
+ 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19,
+ 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13,
+ 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16,
+ 12, 10, 16, 15, 12, 10, 15, 15, 12, 10,
+ /* Size 8x32 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10,
+ /* Size 32x8 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27,
+ 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22,
+ 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19,
+ 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23,
+ 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17,
+ 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13,
+ 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19,
+ 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14,
+ 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11,
+ 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16,
+ 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14,
+ 13, 12, 11, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6,
+ /* Size 8x8 */
+ 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28,
+ 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13,
+ 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11,
+ 11, 11, 10, 8, 7, 6, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32,
+ 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30,
+ 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26,
+ 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19,
+ 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14,
+ 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11,
+ 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10,
+ 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17,
+ 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15,
+ 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9,
+ 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7,
+ 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11,
+ 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27,
+ 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20,
+ 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32,
+ 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30,
+ 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24,
+ 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11,
+ 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28,
+ 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22,
+ 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22,
+ 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21,
+ 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18,
+ 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16,
+ 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14,
+ 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12,
+ 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12,
+ 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11,
+ 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18,
+ 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6,
+ /* Size 8x4 */
+ 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15,
+ 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6,
+ /* Size 8x16 */
+ 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32,
+ 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27,
+ 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13,
+ 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8,
+ 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11,
+ 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 16x8 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31,
+ 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20,
+ 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11,
+ 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16,
+ 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9,
+ 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12,
+ 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14,
+ 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29,
+ 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25,
+ 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27,
+ 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24,
+ 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9,
+ 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17,
+ 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13,
+ 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12,
+ 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ /* Size 32x16 */
+ 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32,
+ 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32,
+ 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28,
+ 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20,
+ 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18,
+ 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15,
+ 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13,
+ 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12,
+ 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28,
+ 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25,
+ 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18,
+ 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14,
+ 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12,
+ 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10,
+ 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9,
+ 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13,
+ 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9,
+ 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8,
+ 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14,
+ 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11,
+ 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7,
+ 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12,
+ 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7,
+ 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6,
+ 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5,
+ /* Size 4x16 */
+ 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27,
+ 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18,
+ 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 16x4 */
+ 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22,
+ 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9,
+ 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6,
+ 12, 11, 8, 6, 11, 10, 8, 6,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32,
+ 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24,
+ 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14,
+ 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7,
+ 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12,
+ 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 6, 6, 6, 6, 6, 6, 6,
+ /* Size 32x8 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32,
+ 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24,
+ 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15,
+ 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12,
+ 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26,
+ 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16,
+ 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10,
+ 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17,
+ 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8,
+ 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14,
+ 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7,
+ 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12,
+ 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ 10, 11, 11, 10, 9, 8, 7, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22,
+ 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16,
+ 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12,
+ 11, 10, 15, 16, 16, 14, 13, 12, 10, 10,
+ /* Size 16x16 */
+ 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33,
+ 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24,
+ 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22,
+ 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20,
+ 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13,
+ 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19,
+ 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19,
+ 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12,
+ 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 10, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33,
+ 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24,
+ 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23,
+ 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28,
+ 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23,
+ 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17,
+ 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23,
+ 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21,
+ 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22,
+ 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19,
+ 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16,
+ 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 4x8 */
+ 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20,
+ 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10,
+ /* Size 8x4 */
+ 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17,
+ 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10,
+ /* Size 8x16 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28,
+ 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22,
+ 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16,
+ 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10,
+ 10, 10,
+ /* Size 16x8 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26,
+ 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20,
+ 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15,
+ 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18,
+ 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12,
+ 11, 10,
+ /* Size 16x32 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32,
+ 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23,
+ 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23,
+ 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19,
+ 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14,
+ 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33,
+ 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26,
+ 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23,
+ 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22,
+ 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20,
+ 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19,
+ 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18,
+ 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22,
+ 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22,
+ 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18,
+ 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16,
+ 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13,
+ 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13,
+ 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19,
+ 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20,
+ 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12,
+ 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11,
+ 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11,
+ 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11,
+ 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16,
+ 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17,
+ 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15,
+ 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22,
+ 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20,
+ 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 10, 10,
+ /* Size 16x4 */
+ 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20,
+ 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13,
+ 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16,
+ 12, 11, 16, 16, 13, 10, 16, 15, 12, 10,
+ /* Size 8x32 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28,
+ 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10,
+ /* Size 32x8 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28,
+ 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22,
+ 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19,
+ 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17,
+ 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22,
+ 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17,
+ 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14,
+ 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19,
+ 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15,
+ 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11,
+ 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10,
+ 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16,
+ 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15,
+ 13, 12, 11, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7,
+ /* Size 8x8 */
+ 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28,
+ 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14,
+ 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7,
+ 11, 12, 12, 10, 9, 8, 7, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32,
+ 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31,
+ 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27,
+ 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20,
+ 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16,
+ 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12,
+ 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10,
+ 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17,
+ 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18,
+ 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9,
+ 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18,
+ 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32,
+ 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26,
+ 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17,
+ 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24,
+ 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23,
+ 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23,
+ 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19,
+ 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16,
+ 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13,
+ 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13,
+ 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12,
+ 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18,
+ 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7,
+ /* Size 8x4 */
+ 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17,
+ 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32,
+ 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28,
+ 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19,
+ 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14,
+ 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10,
+ 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 16x8 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31,
+ 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21,
+ 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13,
+ 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9,
+ 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14,
+ 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7,
+ 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17,
+ 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14,
+ 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12,
+ 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32,
+ 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32,
+ 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30,
+ 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23,
+ 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18,
+ 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15,
+ 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13,
+ 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13,
+ 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30,
+ 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26,
+ 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20,
+ 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16,
+ 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12,
+ 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11,
+ 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10,
+ 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18,
+ 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18,
+ 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9,
+ 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8,
+ 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14,
+ 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7,
+ 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12,
+ 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8,
+ 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 4x16 */
+ 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29,
+ 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18,
+ 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11,
+ 10, 10, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 16x4 */
+ 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24,
+ 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10,
+ 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7,
+ 12, 12, 9, 7, 12, 11, 8, 6,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9,
+ 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6,
+ /* Size 32x8 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32,
+ 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24,
+ 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18,
+ 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13,
+ 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26,
+ 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16,
+ 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12,
+ 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18,
+ 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13,
+ 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8,
+ 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13,
+ 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7,
+ 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11,
+ 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22,
+ 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 19, 17,
+ 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12,
+ 11, 11, 16, 17, 17, 15, 13, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33,
+ 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24,
+ 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20,
+ 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14,
+ 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19,
+ 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33,
+ 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26,
+ 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30,
+ 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21,
+ 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18,
+ 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21,
+ 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14,
+ 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17,
+ 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17,
+ 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 4x8 */
+ 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20,
+ 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11,
+ /* Size 8x4 */
+ 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19,
+ 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11,
+ /* Size 8x16 */
+ 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30,
+ 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22,
+ 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28,
+ 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21,
+ 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17,
+ 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19,
+ 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15,
+ 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12,
+ 11, 10,
+ /* Size 16x32 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32,
+ 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17,
+ 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15,
+ 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14,
+ 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20,
+ 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16,
+ 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33,
+ 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26,
+ 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22,
+ 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22,
+ 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18,
+ 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22,
+ 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17,
+ 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16,
+ 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14,
+ 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13,
+ 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20,
+ 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20,
+ 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18,
+ 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15,
+ 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13,
+ 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11,
+ 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11,
+ 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11,
+ 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17,
+ 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14,
+ 14, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22,
+ 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11,
+ /* Size 16x4 */
+ 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20,
+ 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14,
+ 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16,
+ 12, 11, 17, 16, 13, 11, 16, 16, 13, 11,
+ /* Size 8x32 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10,
+ /* Size 32x8 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30,
+ 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23,
+ 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20,
+ 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17,
+ 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22,
+ 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18,
+ 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15,
+ 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20,
+ 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16,
+ 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12,
+ 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11,
+ 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17,
+ 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15,
+ 14, 12, 11, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7,
+ /* Size 8x8 */
+ 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29,
+ 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16,
+ 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31,
+ 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28,
+ 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22,
+ 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11,
+ 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10,
+ 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18,
+ 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8,
+ 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11,
+ 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20,
+ 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24,
+ 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32,
+ 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19,
+ 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31,
+ 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28,
+ 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23,
+ 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19,
+ 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30,
+ 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27,
+ 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20,
+ 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24,
+ 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23,
+ 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20,
+ 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17,
+ 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14,
+ 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12,
+ 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20,
+ 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7,
+ /* Size 8x4 */
+ 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17,
+ 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32,
+ 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28,
+ 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20,
+ 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15,
+ 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11,
+ 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8,
+ 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 16x8 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23,
+ 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14,
+ 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16,
+ 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27,
+ 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29,
+ 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26,
+ 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21,
+ 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32,
+ 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30,
+ 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25,
+ 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20,
+ 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17,
+ 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15,
+ 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13,
+ 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30,
+ 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27,
+ 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20,
+ 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14,
+ 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12,
+ 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10,
+ 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10,
+ 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19,
+ 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18,
+ 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10,
+ 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8,
+ 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10,
+ 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7,
+ 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 7, 6,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29,
+ 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21,
+ 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7,
+ /* Size 16x4 */
+ 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25,
+ 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10,
+ 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10,
+ 7, 13, 12, 9, 7, 12, 12, 9, 7,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28,
+ 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20,
+ 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17,
+ 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ /* Size 32x8 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27,
+ 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18,
+ 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13,
+ 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29,
+ 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18,
+ 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12,
+ 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18,
+ 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10,
+ 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14,
+ 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8,
+ 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12,
+ 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23,
+ 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18,
+ 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12,
+ 12, 11, 16, 17, 17, 16, 14, 12, 11, 11,
+ /* Size 16x16 */
+ 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33,
+ 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26,
+ 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22,
+ 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19,
+ 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20,
+ 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15,
+ 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13,
+ 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 11, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33,
+ 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26,
+ 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30,
+ 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25,
+ 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22,
+ 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21,
+ 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22,
+ 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22,
+ 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12,
+ 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18,
+ 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17,
+ 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 4x8 */
+ 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22,
+ 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11,
+ /* Size 8x4 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19,
+ 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11,
+ /* Size 8x16 */
+ 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31,
+ 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22,
+ 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14,
+ 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 11,
+ /* Size 16x8 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29,
+ 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21,
+ 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17,
+ 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20,
+ 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16,
+ 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13,
+ 12, 11,
+ /* Size 16x32 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32,
+ 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25,
+ 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 32x16 */
+ 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33,
+ 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27,
+ 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22,
+ 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23,
+ 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22,
+ 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19,
+ 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18,
+ 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21,
+ 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16,
+ 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14,
+ 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20,
+ 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21,
+ 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18,
+ 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13,
+ 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11,
+ 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11,
+ 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17,
+ 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18,
+ 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14,
+ 14, 13, 13, 12, 12, 11, 11, 10,
+ /* Size 4x16 */
+ 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22,
+ 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22,
+ 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ /* Size 16x4 */
+ 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21,
+ 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14,
+ 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17,
+ 14, 11, 17, 17, 13, 11, 17, 16, 13, 11,
+ /* Size 8x32 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31,
+ 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22,
+ 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11,
+ /* Size 32x8 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31,
+ 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22,
+ 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20,
+ 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17,
+ 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22,
+ 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19,
+ 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16,
+ 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20,
+ 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17,
+ 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13,
+ 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11,
+ 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17,
+ 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16,
+ 14, 13, 12, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8,
+ /* Size 8x8 */
+ 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29,
+ 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17,
+ 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10,
+ 9, 8, 13, 14, 13, 12, 10, 9, 8, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32,
+ 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29,
+ 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20,
+ 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13,
+ 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11,
+ 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20,
+ 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19,
+ 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12,
+ 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9,
+ 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18,
+ 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15,
+ 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14,
+ 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25,
+ 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20,
+ 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17,
+ 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28,
+ 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15,
+ 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20,
+ 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27,
+ 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27,
+ 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24,
+ 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22,
+ 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10,
+ 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17,
+ 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12,
+ 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16,
+ 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14,
+ 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 4x8 */
+ 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20,
+ 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8,
+ /* Size 8x4 */
+ 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18,
+ 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32,
+ 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24,
+ 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18,
+ 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13,
+ 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10,
+ 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 16x8 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27,
+ 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17,
+ 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19,
+ 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14,
+ 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8,
+ 8,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16,
+ 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27,
+ 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14,
+ 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29,
+ 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21,
+ 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11,
+ 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13,
+ 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 32x16 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32,
+ 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32,
+ 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30,
+ 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25,
+ 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20,
+ 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17,
+ 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15,
+ 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14,
+ 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30,
+ 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27,
+ 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21,
+ 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18,
+ 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16,
+ 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13,
+ 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11,
+ 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21,
+ 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20,
+ 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16,
+ 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14,
+ 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10,
+ 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9,
+ 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16,
+ 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15,
+ 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10,
+ 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8,
+ 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29,
+ 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20,
+ 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14,
+ 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 16x4 */
+ 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27,
+ 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11,
+ 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14,
+ 11, 8, 14, 13, 10, 8, 14, 13, 10, 8,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20,
+ 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17,
+ 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7,
+ /* Size 32x8 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30,
+ 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20,
+ 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15,
+ 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30,
+ 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20,
+ 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14,
+ 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16,
+ 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11,
+ 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13,
+ 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13,
+ 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8,
+ 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23,
+ 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18,
+ 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14,
+ 12, 12, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29,
+ 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22,
+ 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20,
+ 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19,
+ 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17,
+ 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15,
+ 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20,
+ 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19,
+ 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27,
+ 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22,
+ 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26,
+ 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22,
+ 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20,
+ 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23,
+ 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21,
+ 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+ 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22,
+ 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22,
+ 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18,
+ 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 4x8 */
+ 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22,
+ 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 8x4 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19,
+ 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11,
+ /* Size 8x16 */
+ 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20,
+ 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18,
+ 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12,
+ 12, 11,
+ /* Size 16x8 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32,
+ 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22,
+ 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19,
+ 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20,
+ 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18,
+ 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14,
+ 12, 11,
+ /* Size 16x32 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26,
+ 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21,
+ 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+ 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33,
+ 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27,
+ 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22,
+ 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23,
+ 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22,
+ 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19,
+ 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24,
+ 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18,
+ 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18,
+ 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21,
+ 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22,
+ 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19,
+ 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13,
+ 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18,
+ 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18,
+ 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17,
+ 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 11, 11, 11,
+ /* Size 4x16 */
+ 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22,
+ 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22,
+ 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11,
+ /* Size 16x4 */
+ 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22,
+ 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15,
+ 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18,
+ 14, 12, 18, 17, 14, 12, 17, 17, 14, 11,
+ /* Size 8x32 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20,
+ 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+ 12, 11, 11, 11,
+ /* Size 32x8 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33,
+ 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22,
+ 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19,
+ 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22,
+ 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19,
+ 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17,
+ 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21,
+ 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19,
+ 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15,
+ 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12,
+ 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17,
+ 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16,
+ 15, 13, 12, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9,
+ /* Size 8x8 */
+ 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31,
+ 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19,
+ 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12,
+ 10, 9, 15, 16, 16, 14, 12, 11, 9, 9,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29,
+ 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21,
+ 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17,
+ 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22,
+ 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20,
+ 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18,
+ 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14,
+ 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9,
+ 9, 8, 8,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20,
+ 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17,
+ 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28,
+ 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23,
+ 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20,
+ 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15,
+ 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24,
+ 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31,
+ 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29,
+ 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27,
+ 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25,
+ 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19,
+ 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9,
+ 9, 9, 9, 9, 8, 8, 8, 8,
+ /* Size 4x8 */
+ 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24,
+ 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9,
+ /* Size 8x4 */
+ 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21,
+ 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32,
+ 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18,
+ 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11,
+ 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ 8,
+ /* Size 16x8 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27,
+ 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17,
+ 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15,
+ 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9,
+ 8,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21,
+ 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18,
+ 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28,
+ 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16,
+ 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30,
+ 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29,
+ 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20,
+ 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21,
+ 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17,
+ 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15,
+ 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 8, 8,
+ /* Size 32x16 */
+ 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32,
+ 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31,
+ 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28,
+ 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24,
+ 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20,
+ 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17,
+ 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15,
+ 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30,
+ 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28,
+ 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24,
+ 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21,
+ 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18,
+ 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16,
+ 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13,
+ 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11,
+ 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23,
+ 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22,
+ 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19,
+ 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16,
+ 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14,
+ 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10,
+ 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15,
+ 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16,
+ 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15,
+ 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30,
+ 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24,
+ 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ /* Size 16x4 */
+ 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28,
+ 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13,
+ 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16,
+ 13, 10, 16, 15, 12, 9, 14, 14, 11, 9,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21,
+ 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18,
+ 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20,
+ 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18,
+ 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 8,
+ /* Size 32x8 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30,
+ 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20,
+ 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16,
+ 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30,
+ 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21,
+ 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16,
+ 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23,
+ 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17,
+ 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12,
+ 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9,
+ 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16,
+ 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10,
+ 9, 8 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13,
+ /* Size 8x8 */
+ 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24,
+ 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19,
+ 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15,
+ 14, 13, 18, 19, 20, 18, 16, 14, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29,
+ 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22,
+ 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21,
+ 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19,
+ 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16,
+ 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21,
+ 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22,
+ 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20,
+ 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18,
+ 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14,
+ 13, 12, 12, 12,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33,
+ 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29,
+ 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23,
+ 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32,
+ 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 29, 28,
+ 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21,
+ 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24,
+ 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23,
+ 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22,
+ 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20,
+ 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12,
+ 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19,
+ 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19,
+ 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 4x8 */
+ 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23,
+ 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13,
+ /* Size 8x4 */
+ 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21,
+ 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13,
+ /* Size 8x16 */
+ 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15,
+ 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13,
+ 12, 12,
+ /* Size 16x8 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32,
+ 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22,
+ 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21,
+ 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18,
+ 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14,
+ 13, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28,
+ 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20,
+ 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19,
+ 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24,
+ 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21,
+ 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22,
+ 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18,
+ 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 32x16 */
+ 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33,
+ 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28,
+ 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24,
+ 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22,
+ 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23,
+ 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22,
+ 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20,
+ 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19,
+ 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24,
+ 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22,
+ 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17,
+ 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15,
+ 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22,
+ 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22,
+ 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20,
+ 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17,
+ 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15,
+ 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12,
+ 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19,
+ 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19,
+ 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18,
+ 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17,
+ 15, 15, 14, 14, 13, 12, 12, 12,
+ /* Size 4x16 */
+ 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24,
+ 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22,
+ 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19,
+ 18, 18, 17, 16, 15, 14, 14, 13, 13, 12,
+ /* Size 16x4 */
+ 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22,
+ 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17,
+ 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19,
+ 16, 13, 19, 19, 16, 13, 18, 18, 15, 12,
+ /* Size 8x32 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23,
+ 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20,
+ 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20,
+ 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 12, 12, 12, 12,
+ /* Size 32x8 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33,
+ 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22,
+ 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20,
+ 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24,
+ 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19,
+ 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18,
+ 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22,
+ 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19,
+ 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15,
+ 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13,
+ 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19,
+ 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17,
+ 15, 14, 13, 12 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32,
+ 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22,
+ 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14,
+ 12, 11, 16, 17, 18, 16, 14, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32,
+ 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31,
+ 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28,
+ 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20,
+ 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16,
+ 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14,
+ 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24,
+ 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23,
+ 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21,
+ 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16,
+ 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22,
+ 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20,
+ 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25,
+ 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22,
+ 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19,
+ 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23,
+ 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29,
+ 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26,
+ 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29,
+ 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24,
+ 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24,
+ 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14,
+ 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21,
+ 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18,
+ 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21,
+ 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ /* Size 4x8 */
+ 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25,
+ 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11,
+ /* Size 8x4 */
+ 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26,
+ 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30,
+ 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28,
+ 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20,
+ 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13,
+ 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10,
+ /* Size 16x8 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32,
+ 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28,
+ 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21,
+ 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23,
+ 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17,
+ 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13,
+ 11, 10,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23,
+ 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21,
+ 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24,
+ 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30,
+ 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25,
+ 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21,
+ 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19,
+ 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16,
+ 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16,
+ 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32,
+ 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32,
+ 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32,
+ 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30,
+ 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25,
+ 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21,
+ 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19,
+ 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18,
+ 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32,
+ 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29,
+ 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28,
+ 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23,
+ 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19,
+ 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18,
+ 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15,
+ 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14,
+ 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25,
+ 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24,
+ 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23,
+ 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18,
+ 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15,
+ 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13,
+ 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11,
+ 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18,
+ 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17,
+ 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14,
+ 13, 12, 12, 11, 11, 10, 9, 9,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32,
+ 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24,
+ 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20,
+ 18, 16, 15, 15, 14, 13, 12, 11, 11, 11,
+ /* Size 16x4 */
+ 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30,
+ 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15,
+ 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19,
+ 14, 11, 18, 18, 13, 11, 17, 18, 13, 11,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23,
+ 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20,
+ 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24,
+ 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18,
+ 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 9,
+ /* Size 32x8 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32,
+ 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30,
+ 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24,
+ 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18,
+ 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31,
+ 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24,
+ 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19,
+ 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25,
+ 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19,
+ 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14,
+ 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11,
+ 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17,
+ 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15,
+ 13, 12, 11, 9 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14,
+ /* Size 8x8 */
+ 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26,
+ 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20,
+ 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16,
+ 16, 15, 19, 20, 20, 19, 17, 16, 15, 13,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30,
+ 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24,
+ 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22,
+ 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22,
+ 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22,
+ 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17,
+ 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30,
+ 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22,
+ 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21,
+ 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19,
+ 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20,
+ 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13,
+ /* Size 4x8 */
+ 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23,
+ 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14,
+ /* Size 8x4 */
+ 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22,
+ 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14,
+ /* Size 8x16 */
+ 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24,
+ 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22,
+ 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14,
+ 14, 13,
+ /* Size 16x8 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32,
+ 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22,
+ 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21,
+ 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22,
+ 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19,
+ 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16,
+ 15, 13,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28,
+ 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22,
+ 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33,
+ 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30,
+ 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26,
+ 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22,
+ 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23,
+ 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20,
+ 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26,
+ 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23,
+ 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17,
+ 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16,
+ 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15,
+ 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14,
+ 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20,
+ 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20,
+ 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20,
+ 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18,
+ 17, 16, 16, 15, 14, 14, 13, 13,
+ /* Size 4x16 */
+ 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27,
+ 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23,
+ 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21,
+ 20, 19, 18, 17, 17, 16, 15, 15, 14, 14,
+ /* Size 16x4 */
+ 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23,
+ 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18,
+ 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21,
+ 17, 15, 20, 21, 16, 14, 19, 20, 16, 14,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14,
+ 14, 13, 13, 13,
+ /* Size 32x8 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33,
+ 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23,
+ 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23,
+ 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20,
+ 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24,
+ 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20,
+ 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23,
+ 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20,
+ 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17,
+ 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14,
+ 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20,
+ 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18,
+ 17, 16, 14, 13 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32,
+ 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24,
+ 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16,
+ 14, 13, 19, 20, 20, 19, 17, 15, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29,
+ 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26,
+ 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20,
+ 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16,
+ 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25,
+ 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20,
+ 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16,
+ 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14,
+ 13, 13, 12, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25,
+ 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22,
+ 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20,
+ 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28,
+ 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25,
+ 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24,
+ 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30,
+ 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27,
+ 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26,
+ 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24,
+ 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14,
+ 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20,
+ 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 4x8 */
+ 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28,
+ 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12,
+ /* Size 8x4 */
+ 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27,
+ 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28,
+ 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18,
+ 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13,
+ 12, 12,
+ /* Size 16x8 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32,
+ 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30,
+ 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23,
+ 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25,
+ 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21,
+ 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14,
+ 13, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26,
+ 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23,
+ 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20,
+ 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24,
+ 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17,
+ 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20,
+ 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32,
+ 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25,
+ 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21,
+ 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19,
+ 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32,
+ 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31,
+ 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28,
+ 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24,
+ 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23,
+ 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20,
+ 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17,
+ 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16,
+ 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27,
+ 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24,
+ 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20,
+ 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18,
+ 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16,
+ 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13,
+ 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13,
+ 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20,
+ 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20,
+ 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19,
+ 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32,
+ 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28,
+ 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21,
+ 20, 19, 17, 16, 16, 14, 14, 13, 12, 12,
+ /* Size 16x4 */
+ 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30,
+ 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17,
+ 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22,
+ 17, 13, 20, 20, 16, 12, 19, 19, 15, 12,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26,
+ 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19,
+ 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12,
+ /* Size 32x8 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32,
+ 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31,
+ 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25,
+ 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20,
+ 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31,
+ 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27,
+ 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20,
+ 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28,
+ 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23,
+ 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16,
+ 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13,
+ 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20,
+ 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19,
+ 16, 14, 13, 12 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16,
+ /* Size 8x8 */
+ 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28,
+ 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20,
+ 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17,
+ 17, 16, 20, 21, 21, 20, 19, 17, 16, 15,
+ /* Size 16x16 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32,
+ 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26,
+ 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23,
+ 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20,
+ 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23,
+ 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20,
+ 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30,
+ 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27,
+ 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20,
+ 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23,
+ 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33,
+ 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32,
+ 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26,
+ 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24,
+ 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21,
+ 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
+ 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 4x8 */
+ 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22,
+ 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15,
+ /* Size 8x4 */
+ 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21,
+ 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15,
+ /* Size 8x16 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19,
+ 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16,
+ 16, 15,
+ /* Size 16x8 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32,
+ 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23,
+ 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22,
+ 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23,
+ 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21,
+ 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17,
+ 16, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29,
+ 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21,
+ 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22,
+ 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27,
+ 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25,
+ 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22,
+ 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23,
+ 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20,
+ 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32,
+ 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27,
+ 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23,
+ 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22,
+ 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23,
+ 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22,
+ 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26,
+ 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25,
+ 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22,
+ 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20,
+ 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20,
+ 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22,
+ 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20,
+ 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17,
+ 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16,
+ 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21,
+ 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21,
+ 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21,
+ 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 14,
+ /* Size 4x16 */
+ 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22,
+ 21, 20, 19, 18, 18, 17, 16, 16, 16, 15,
+ /* Size 16x4 */
+ 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24,
+ 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19,
+ 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22,
+ 19, 16, 21, 22, 18, 16, 20, 21, 18, 15,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25,
+ 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21,
+ 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15,
+ /* Size 32x8 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33,
+ 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26,
+ 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23,
+ 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22,
+ 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26,
+ 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21,
+ 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20,
+ 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22,
+ 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22,
+ 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17,
+ 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16,
+ 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21,
+ 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20,
+ 19, 17, 16, 15 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32,
+ 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28,
+ 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18,
+ 16, 15, 22, 23, 23, 22, 20, 17, 15, 14,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28,
+ 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23,
+ 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28,
+ 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26,
+ 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23,
+ 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17,
+ 16, 15, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24,
+ 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22,
+ 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24,
+ 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27,
+ 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23,
+ 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29,
+ 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21,
+ 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26,
+ 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17,
+ 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23,
+ 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26,
+ 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27,
+ 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19,
+ 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16,
+ 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15,
+ 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21,
+ 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15,
+ 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14,
+ 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ /* Size 4x8 */
+ 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30,
+ 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15,
+ /* Size 8x4 */
+ 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28,
+ 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22,
+ 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16,
+ 15, 15,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32,
+ 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30,
+ 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27,
+ 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28,
+ 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24,
+ 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18,
+ 15, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25,
+ 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23,
+ 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28,
+ 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18,
+ 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17,
+ 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24,
+ 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15,
+ 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21,
+ 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 15, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27,
+ 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25,
+ 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21,
+ 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32,
+ 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29,
+ 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28,
+ 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26,
+ 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23,
+ 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21,
+ 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19,
+ 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29,
+ 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30,
+ 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26,
+ 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24,
+ 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20,
+ 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18,
+ 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16,
+ 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14,
+ 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23,
+ 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23,
+ 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22,
+ 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20,
+ 18, 18, 18, 16, 15, 15, 14, 13,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32,
+ 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15,
+ /* Size 16x4 */
+ 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30,
+ 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21,
+ 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24,
+ 19, 16, 23, 23, 18, 16, 22, 22, 18, 15,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25,
+ 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29,
+ 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18,
+ 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 15, 14,
+ /* Size 32x8 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32,
+ 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32,
+ 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30,
+ 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23,
+ 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32,
+ 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29,
+ 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24,
+ 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30,
+ 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26,
+ 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20,
+ 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16,
+ 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23,
+ 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21,
+ 18, 18, 15, 14 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17,
+ /* Size 8x8 */
+ 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29,
+ 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22,
+ 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19,
+ 18, 17, 21, 22, 22, 22, 20, 19, 17, 17,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33,
+ 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29,
+ 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22,
+ 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22,
+ 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23,
+ 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22,
+ 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18,
+ 18, 17, 17, 17,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28,
+ 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24,
+ 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33,
+ 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32,
+ 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28,
+ 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24,
+ 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30,
+ 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27,
+ 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23,
+ 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ /* Size 4x8 */
+ 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17,
+ /* Size 8x4 */
+ 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22,
+ 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17,
+ /* Size 8x16 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 17,
+ /* Size 16x8 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32,
+ 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24,
+ 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22,
+ 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22,
+ 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19,
+ 17, 17,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27,
+ 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22,
+ 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27,
+ 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+ 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23,
+ 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33,
+ 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27,
+ 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26,
+ 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22,
+ 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23,
+ 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23,
+ 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22,
+ 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29,
+ 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27,
+ 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21,
+ 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22,
+ 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19,
+ 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19,
+ 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17,
+ 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21,
+ 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 18, 17, 17, 17, 16,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17,
+ /* Size 16x4 */
+ 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24,
+ 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21,
+ 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22,
+ 19, 18, 21, 22, 19, 17, 21, 22, 19, 17,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23,
+ 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17,
+ /* Size 32x8 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33,
+ 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26,
+ 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22,
+ 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22,
+ 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27,
+ 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22,
+ 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20,
+ 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22,
+ 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19,
+ 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17,
+ 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22,
+ 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22,
+ 20, 19, 17, 17 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32,
+ 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29,
+ 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22,
+ 20, 19, 25, 26, 26, 25, 23, 21, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28,
+ 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30,
+ 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30,
+ 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26,
+ 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23,
+ 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19,
+ 19, 18, 18, 16,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26,
+ 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30,
+ 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25,
+ 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20,
+ 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26,
+ 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24,
+ 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27,
+ 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22,
+ 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19,
+ 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 4x8 */
+ 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30,
+ 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18,
+ /* Size 8x4 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30,
+ 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30,
+ 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24,
+ 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31,
+ 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28,
+ 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30,
+ 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26,
+ 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19,
+ 19, 16,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26,
+ 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21,
+ 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20,
+ 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26,
+ 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24,
+ 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21,
+ 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25,
+ 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19,
+ 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30,
+ 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28,
+ 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25,
+ 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27,
+ 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23,
+ 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28,
+ 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27,
+ 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24,
+ 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21,
+ 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20,
+ 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18,
+ 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27,
+ 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26,
+ 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24,
+ 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24,
+ 24, 21, 19, 19, 19, 18, 16, 16,
+ /* Size 4x16 */
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26,
+ 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ /* Size 16x4 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32,
+ 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26,
+ 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26,
+ 23, 19, 27, 26, 23, 19, 24, 24, 21, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24,
+ 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 16, 16,
+ /* Size 32x8 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32,
+ 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30,
+ 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25,
+ 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32,
+ 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30,
+ 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27,
+ 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30,
+ 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27,
+ 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21,
+ 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18,
+ 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26,
+ 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24,
+ 24, 19, 19, 16 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31,
+ 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23,
+ 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20,
+ 19, 19, 21, 22, 23, 22, 22, 20, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29,
+ 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26,
+ 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23,
+ 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24,
+ 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19,
+ 19, 19, 19, 18,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30,
+ 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28,
+ 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ /* Size 4x8 */
+ 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24,
+ 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19,
+ /* Size 8x4 */
+ 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23,
+ 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33,
+ 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18,
+ /* Size 16x8 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33,
+ 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26,
+ 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22,
+ 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22,
+ 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22,
+ 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19,
+ 19, 18,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28,
+ 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21,
+ 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 18,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33,
+ 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30,
+ 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27,
+ 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24,
+ 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22,
+ 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23,
+ 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23,
+ 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30,
+ 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23,
+ 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23,
+ 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23,
+ 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22,
+ 22, 21, 19, 19, 19, 18, 18, 18,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 18,
+ /* Size 16x4 */
+ 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29,
+ 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22,
+ 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22,
+ 20, 19, 22, 22, 20, 19, 22, 23, 21, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+ 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18,
+ /* Size 32x8 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33,
+ 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27,
+ 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22,
+ 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23,
+ 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28,
+ 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23,
+ 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22,
+ 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24,
+ 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21,
+ 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19,
+ 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18,
+ 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23,
+ 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22,
+ 22, 19, 19, 18 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32,
+ 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30,
+ 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 30, 31, 29, 28, 27,
+ 26, 24, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31,
+ 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31,
+ 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24,
+ 24, 23, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26,
+ 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30,
+ 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26,
+ 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29,
+ 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24,
+ 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21,
+ 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20,
+ /* Size 4x8 */
+ 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32,
+ 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 8x4 */
+ 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31,
+ 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23,
+ 21, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32,
+ 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30,
+ 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31,
+ 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29,
+ 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27,
+ 22, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22,
+ 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24,
+ 24, 24, 23, 22, 21, 21, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29,
+ 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27,
+ 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32,
+ 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31,
+ 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31,
+ 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28,
+ 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24,
+ 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23,
+ 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29,
+ 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30,
+ 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30,
+ 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27,
+ 26, 26, 26, 24, 22, 21, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 25, 24, 23, 21, 21,
+ /* Size 16x4 */
+ 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32,
+ 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28,
+ 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30,
+ 28, 23, 29, 30, 27, 21, 29, 30, 27, 21,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29,
+ 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25,
+ 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22,
+ 21, 21, 21, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32,
+ 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30,
+ 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32,
+ 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31,
+ 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30,
+ 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32,
+ 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29,
+ 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28,
+ 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23,
+ 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29,
+ 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28,
+ 26, 26, 22, 21 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20,
+ /* Size 8x8 */
+ 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33,
+ 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24,
+ 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21,
+ 21, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29,
+ 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26,
+ 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 19, 19,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24,
+ 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30,
+ 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29,
+ 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24,
+ 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24,
+ 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 4x8 */
+ 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27,
+ 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 8x4 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26,
+ 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22,
+ 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19,
+ /* Size 16x8 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29,
+ 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24,
+ 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24,
+ 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23,
+ 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21,
+ 20, 19,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28,
+ 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23,
+ 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28,
+ 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27,
+ 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25,
+ 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22,
+ 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22,
+ 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33,
+ 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31,
+ 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27,
+ 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26,
+ 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25,
+ 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 19,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33,
+ 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ /* Size 16x4 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32,
+ 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22,
+ 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23,
+ 22, 20, 21, 22, 21, 19, 21, 22, 21, 19,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26,
+ 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19,
+ /* Size 32x8 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30,
+ 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26,
+ 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22,
+ 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31,
+ 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27,
+ 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24,
+ 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26,
+ 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23,
+ 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20,
+ 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22,
+ 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22,
+ 22, 22, 20, 19 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30,
+ 29, 28, 31, 31, 31, 31, 29, 29, 28, 27,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28,
+ 28, 28, 27, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27,
+ 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28,
+ /* Size 8x4 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
+ 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 27,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31,
+ 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31,
+ 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28,
+ 28, 27,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30,
+ 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29,
+ 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29,
+ 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28,
+ /* Size 16x4 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30,
+ 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31,
+ 30, 29, 31, 31, 29, 28, 30, 30, 28, 28,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 27, 27,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32,
+ 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31,
+ 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32,
+ 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32,
+ 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31,
+ 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32,
+ 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29,
+ 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28,
+ 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31,
+ 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30,
+ 29, 28, 28, 27 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33,
+ 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29,
+ 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23,
+ 22, 22, 26, 25, 25, 24, 23, 23, 22, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29,
+ 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27,
+ 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26,
+ 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30,
+ 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28,
+ 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26,
+ 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22,
+ 22, 22, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31,
+ 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30,
+ 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27,
+ 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26,
+ 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29,
+ 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25,
+ 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24,
+ 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26,
+ 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 26,
+ 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28,
+ 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22,
+ /* Size 8x4 */
+ 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28,
+ 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32,
+ 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24,
+ 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22,
+ 22, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33,
+ 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32,
+ 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26,
+ 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28,
+ 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26,
+ 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22,
+ 22, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25,
+ 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28,
+ 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27,
+ 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27,
+ 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24,
+ 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33,
+ 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32,
+ 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32,
+ 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31,
+ 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27,
+ 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25,
+ 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24,
+ 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23,
+ 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30,
+ 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29,
+ 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28,
+ 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26,
+ 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23,
+ 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22,
+ 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22,
+ 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26,
+ 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33,
+ 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ /* Size 16x4 */
+ 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32,
+ 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24,
+ 30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26,
+ 23, 22, 26, 25, 23, 22, 24, 24, 22, 22,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33,
+ 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33,
+ 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27,
+ 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26,
+ 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33,
+ 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32,
+ 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25,
+ 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29,
+ 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27,
+ 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25,
+ 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24,
+ 23, 22, 22, 21 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ /* Size 8x4 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
+ 31, 30,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ /* Size 16x4 */
+ 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32,
+ 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30,
+ 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32,
+ 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32,
+ 32, 32, 31, 30 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33,
+ 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28,
+ 26, 26, 31, 30, 30, 29, 29, 28, 26, 26,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33,
+ 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30,
+ 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31,
+ 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30,
+ 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29,
+ 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29,
+ 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26,
+ 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33,
+ 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26,
+ /* Size 8x4 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32,
+ 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25,
+ 25, 25,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32,
+ 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32,
+ 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28,
+ 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28,
+ 27, 25,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30,
+ 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28,
+ 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28,
+ 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28,
+ 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26,
+ 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31,
+ 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26,
+ 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24,
+ 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28,
+ 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 24, 23,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 26, 26,
+ /* Size 16x4 */
+ 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33,
+ 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28,
+ 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28,
+ 28, 26, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25,
+ 25, 25, 25, 24,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33,
+ 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33,
+ 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28,
+ 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32,
+ 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32,
+ 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33,
+ 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31,
+ 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29,
+ 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25,
+ 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30,
+ 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28,
+ 28, 28, 26, 24 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33,
+ 32, 32, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33,
+ 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33,
+ 32, 32, 32, 32 },
+ },
+};
\ No newline at end of file
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
new file mode 100644
index 0000000000..8f36eb105b
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include <stdbool.h>
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0  // lowest qindex counted by QINDEX_RANGE
+#define MAXQ 255  // highest qindex counted by QINDEX_RANGE
+#define QINDEX_RANGE (MAXQ - MINQ + 1)  // 256 distinct qindex values
+#define QINDEX_BITS 8
+// Total number of QM sets stored (NUM_QM_LEVELS = 1 << QM_LEVEL_BITS = 16)
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* The range of QMs is between the first and last values, with an offset
+ * applied to inter blocks. */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+#define LOSSLESS_Q_STEP 4 // this should be equal to dc/ac_qlookup_QTX[0]
+
+struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex);
+
+// Returns true if we are using quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int segment_id);
+
+// Buckets qindex (0..QINDEX_RANGE-1) into a QM level within [first, last].
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  const int num_levels = last + 1 - first;
+  return first + (qindex * num_levels) / QINDEX_RANGE;
+}
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+ int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+ int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either local / global dequant matrix as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either local / global quant matrix as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+ const struct macroblockd *xd, int plane,
+ TX_SIZE tx_size, TX_TYPE tx_type);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
new file mode 100644
index 0000000000..602fab7237
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.c
@@ -0,0 +1,1169 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// Decides whether a warped prediction should be built for this block.
+// Returns 1 when warping applies and 0 otherwise. When `final_warp_params`
+// is non-NULL it is always written: with default_warp_params first, then
+// with the selected local or global model if one is usable.
+static int allow_warp(const MB_MODE_INFO *const mbmi,
+                      const WarpTypesAllowed *const warp_types,
+                      const WarpedMotionParams *const gm_params,
+                      int build_for_obmc, const struct scale_factors *const sf,
+                      WarpedMotionParams *final_warp_params) {
+  // Note: As per the spec, we must test the fixed point scales here, which are
+  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
+  // have 1 << 10 precision).
+  if (av1_is_scaled(sf)) return 0;
+
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  if (build_for_obmc) return 0;
+
+  // The block's local warp model takes priority over the global one.
+  const WarpedMotionParams *chosen = NULL;
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+    chosen = &mbmi->wm_params;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    chosen = gm_params;
+  }
+  if (chosen == NULL) return 0;
+
+  if (final_warp_params != NULL)
+    memcpy(final_warp_params, chosen, sizeof(*final_warp_params));
+  return 1;
+}
+
+// Switches inter_pred_params->mode to WARP_PRED (and fills in
+// inter_pred_params->warp_params) when allow_warp() accepts the block.
+// Blocks narrower or shorter than 8 pixels, and frames that force integer
+// motion vectors, are left untouched.
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+  const int too_small = inter_pred_params->block_height < 8 ||
+                        inter_pred_params->block_width < 8;
+  if (too_small || xd->cur_frame_force_integer_mv) return;
+
+  const int use_warp = allow_warp(
+      mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+      inter_pred_params->scale_factors, &inter_pred_params->warp_params);
+  if (use_warp) inter_pred_params->mode = WARP_PRED;
+}
+
+// Builds one inter prediction into `dst`, dispatching on
+// inter_pred_params->mode: WARP_PRED goes through av1_warp_plane(),
+// TRANSLATION_PRED through the (high bit depth or regular) convolve-based
+// predictor. Any other mode trips an assertion.
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params) {
+  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+                 inter_pred_params->conv_params.dst != NULL));
+
+  // TODO(jingning): av1_warp_plane() can be further cleaned up.
+  if (inter_pred_params->mode == WARP_PRED) {
+    av1_warp_plane(
+        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
+        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+        inter_pred_params->ref_frame_buf.width,
+        inter_pred_params->ref_frame_buf.height,
+        inter_pred_params->ref_frame_buf.stride, dst,
+        inter_pred_params->pix_col, inter_pred_params->pix_row,
+        inter_pred_params->block_width, inter_pred_params->block_height,
+        dst_stride, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
+    return;
+  }
+
+  if (inter_pred_params->mode != TRANSLATION_PRED) {
+    assert(0 && "Unsupported inter_pred_params->mode");
+    return;
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (inter_pred_params->use_hbd_buf) {
+    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                           inter_pred_params->block_width,
+                           inter_pred_params->block_height,
+                           &inter_pred_params->conv_params,
+                           inter_pred_params->interp_filter_params,
+                           inter_pred_params->bit_depth);
+    return;
+  }
+#endif
+  inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                  inter_pred_params->block_width,
+                  inter_pred_params->block_height,
+                  &inter_pred_params->conv_params,
+                  inter_pred_params->interp_filter_params);
+}
+
+// One-dimensional 0..64 transition profiles from which
+// init_wedge_master_masks() builds the two-dimensional master wedge masks.
+// The "even" and "odd" oblique profiles fill the even and odd rows of the
+// WEDGE_OBLIQUE63 master (each shifted per row); the vertical profile is
+// copied unchanged into every row of the WEDGE_VERTICAL master.
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18,
+ 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+// Copies `width` bytes from `src` to `dst` displaced by `shift` positions.
+// A non-negative shift moves the data towards higher indices and fills the
+// vacated head with src[0]; a negative shift moves it towards lower indices
+// and fills the vacated tail with src[width - 1].
+static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift,
+                                  int width) {
+  if (shift < 0) {
+    const int pad = -shift;
+    const int kept = width - pad;
+    memcpy(dst, src + pad, kept);
+    memset(dst + kept, src[width - 1], pad);
+    return;
+  }
+
+  memcpy(dst + shift, src, width - shift);
+  memset(dst, src[0], shift);
+}
+
+/* clang-format off */
+// Per block size (row) and wedge index (column) sign-flip flag.
+// get_wedge_mask_inplace() XORs this flag with the requested sign to choose
+// between the master masks (wedge_mask_obl[0]) and their complements
+// (wedge_mask_obl[1]). Rows marked "not used" belong to block sizes whose
+// av1_wedge_params_lookup entry has zero wedge types.
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+};
+/* clang-format on */
+
+// Master wedge masks, indexed [negative][direction]. Index [0] holds the
+// masters and [1] their complements; filled by init_wedge_master_masks().
+DECLARE_ALIGNED(
+ 16, static uint8_t,
+ wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+// Backing store for the per-block-size masks cut out by init_wedge_masks().
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+
+// Smooth inter-intra masks, one per (inter-intra mode, block size) pair;
+// filled by init_smooth_interintra_masks().
+DECLARE_ALIGNED(16, static uint8_t,
+ smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL]
+ [MAX_WEDGE_SQUARE]);
+
+// Pointers into wedge_mask_buf, indexed [block size][sign]; populated by
+// init_wedge_masks() and exposed through av1_wedge_params_lookup.
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
+
+// Wedge codebooks with 16 entries each. Entries are presumably
+// { direction, x_offset, y_offset } (field order of wedge_code_type is not
+// visible here -- confirm). get_wedge_mask_inplace() scales the offsets by
+// the block width/height and shifts right by 3, i.e. they are expressed in
+// eighths of the block dimensions.
+// av1_wedge_params_lookup uses the "hgtw" book for blocks taller than wide
+// (e.g. 8x16), "hltw" for blocks wider than tall (e.g. 16x8) and "heqw" for
+// square blocks.
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+// Wedge parameters per block size (indexed by BLOCK_SIZE). Each entry is
+// presumably { wedge_types, codebook, signflip, masks } (field order of
+// wedge_params_type is not visible here -- confirm). Entries with zero
+// wedge types and NULL pointers are skipped by init_wedge_masks(); the
+// populated entries are the ones named in the initializers below (8x8
+// through 32x32, plus 8x32 and 32x8).
+const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = {
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
+ wedge_masks[BLOCK_8X8] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
+ wedge_masks[BLOCK_8X16] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
+ wedge_masks[BLOCK_16X8] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
+ wedge_masks[BLOCK_16X16] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
+ wedge_masks[BLOCK_16X32] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
+ wedge_masks[BLOCK_32X16] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
+ wedge_masks[BLOCK_32X32] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
+ wedge_masks[BLOCK_8X32] },
+ { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
+ wedge_masks[BLOCK_32X8] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+};
+
+// Returns a pointer into the master wedge mask (stride MASK_MASTER_STRIDE)
+// positioned so that the codebook's partition point, given in eighths of
+// the block size, lands on the centre of the master mask. The master or
+// its complement is selected by `neg` XOR the per-wedge sign-flip flag.
+static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
+                                             BLOCK_SIZE sb_type) {
+  const wedge_params_type *const params = &av1_wedge_params_lookup[sb_type];
+  const wedge_code_type *const code = &params->codebook[wedge_index];
+  const uint8_t flip = params->signflip[wedge_index];
+
+  assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type));
+
+  const int col_off = (code->x_offset * block_size_wide[sb_type]) >> 3;
+  const int row_off = (code->y_offset * block_size_high[sb_type]) >> 3;
+  const int first_row = MASK_MASTER_SIZE / 2 - row_off;
+  const int first_col = MASK_MASTER_SIZE / 2 - col_off;
+
+  return wedge_mask_obl[neg ^ flip][code->direction] +
+         MASK_MASTER_STRIDE * first_row + first_col;
+}
+
+// Returns the blending mask for a masked compound prediction: the
+// contiguous wedge mask for COMPOUND_WEDGE, and comp_data->seg_mask for
+// every other compound type.
+const uint8_t *av1_get_compound_type_mask(
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
+  if (comp_data->type == COMPOUND_WEDGE) {
+    return av1_get_contiguous_soft_mask(comp_data->wedge_index,
+                                        comp_data->wedge_sign, sb_type);
+  }
+  return comp_data->seg_mask;
+}
+
+// Builds a difference-weighted mask from two intermediate-precision
+// (CONV_BUF_TYPE) predictions. The absolute sample difference is rounded
+// down to pixel precision, divided by DIFF_FACTOR, added to `mask_base`
+// and clamped to [0, AOM_BLEND_A64_MAX_ALPHA]. With `which_inverse` set the
+// complement is stored. The mask is written densely with stride `w`.
+static AOM_INLINE void diffwtd_mask_d16(
+    uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+
+  for (int row = 0; row < h; ++row) {
+    const CONV_BUF_TYPE *const p0 = src0 + row * src0_stride;
+    const CONV_BUF_TYPE *const p1 = src1 + row * src1_stride;
+    uint8_t *const out = mask + row * w;
+    for (int col = 0; col < w; ++col) {
+      int delta = abs(p0[col] - p1[col]);
+      delta = ROUND_POWER_OF_TWO(delta, round_bits);
+      const int weight = clamp(mask_base + (delta / DIFF_FACTOR), 0,
+                               AOM_BLEND_A64_MAX_ALPHA);
+      out[col] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - weight : weight;
+    }
+  }
+}
+
+// C reference entry point for the intermediate-precision difference-weighted
+// mask. DIFFWTD_38 builds the mask with base 38; DIFFWTD_38_INV builds its
+// complement. Any other mask type trips an assertion.
+void av1_build_compound_diffwtd_mask_d16_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w,
+                     conv_params, bd);
+  } else if (mask_type == DIFFWTD_38_INV) {
+    diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w,
+                     conv_params, bd);
+  } else {
+    assert(0);
+  }
+}
+
+// Builds a difference-weighted mask from two 8-bit predictions. Each mask
+// value is mask_base + |src0 - src1| / DIFF_FACTOR clamped to
+// [0, AOM_BLEND_A64_MAX_ALPHA], or its complement when `which_inverse` is
+// set. The mask is written densely with stride `w`.
+static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse,
+                                    int mask_base, const uint8_t *src0,
+                                    int src0_stride, const uint8_t *src1,
+                                    int src1_stride, int h, int w) {
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const p0 = src0 + row * src0_stride;
+    const uint8_t *const p1 = src1 + row * src1_stride;
+    uint8_t *const out = mask + row * w;
+    for (int col = 0; col < w; ++col) {
+      const int delta = abs((int)p0[col] - (int)p1[col]);
+      const int weight = clamp(mask_base + (delta / DIFF_FACTOR), 0,
+                               AOM_BLEND_A64_MAX_ALPHA);
+      out[col] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - weight : weight;
+    }
+  }
+}
+
+// C reference entry point for the 8-bit difference-weighted mask.
+// DIFFWTD_38 builds the mask with base 38; DIFFWTD_38_INV builds its
+// complement. Any other mask type trips an assertion.
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
+                                       DIFFWTD_MASK_TYPE mask_type,
+                                       const uint8_t *src0, int src0_stride,
+                                       const uint8_t *src1, int src1_stride,
+                                       int h, int w) {
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
+  } else if (mask_type == DIFFWTD_38_INV) {
+    diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
+  } else {
+    assert(0);
+  }
+}
+
+// High bit depth variant of the difference-weighted mask builder.
+// For every sample the absolute difference of the two 16-bit predictions is
+// shifted down by (bd - 8) (skipped when bd == 8), divided by DIFF_FACTOR,
+// added to `mask_base`, passed through negative_to_zero() and capped at
+// AOM_BLEND_A64_MAX_ALPHA. With `which_inverse` set the complement is
+// stored. The mask is written densely with stride `w`.
+// The loop is spelled out four times (bd == 8 or not, inverse or not) so
+// that each variant keeps those tests out of the inner loop.
+static AOM_FORCE_INLINE void diffwtd_mask_highbd(
+ uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
+ int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
+ const unsigned int bd) {
+ assert(bd >= 8);
+ if (bd == 8) {
+ // 8-bit content stored in 16-bit buffers: no down-shift needed.
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ // Bring the difference back to 8-bit scale before weighting.
+ const unsigned int bd_shift = bd - 8;
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+}
+
+// C reference entry point for the high bit depth difference-weighted mask.
+// `src0` and `src1` are byte pointers that are converted with
+// CONVERT_TO_SHORTPTR before use. DIFFWTD_38 builds the mask with base 38;
+// DIFFWTD_38_INV builds its complement. Any other mask type trips an
+// assertion.
+void av1_build_compound_diffwtd_mask_highbd_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+                        CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+  } else if (mask_type == DIFFWTD_38_INV) {
+    diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+                        CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+  } else {
+    assert(0);
+  }
+}
+
+// Fills wedge_mask_obl[][][] with the master masks for every wedge
+// direction and their complements.
+// Step 1 builds the WEDGE_OBLIQUE63 and WEDGE_VERTICAL masters row by row
+// from the one-dimensional profiles. Step 2 derives every other direction
+// (and the complements in index [1]) by transposing and/or mirroring those
+// two masters.
+static AOM_INLINE void init_wedge_master_masks(void) {
+ int i, j;
+ const int w = MASK_MASTER_SIZE;
+ const int h = MASK_MASTER_SIZE;
+ const int stride = MASK_MASTER_STRIDE;
+ // Note: index [0] stores the masters, and [1] its complement.
+ // Generate prototype by shifting the masters
+ // Rows are processed in pairs. The shift starts at h / 4 and drops by one
+ // per pair: the even row uses the current shift, the odd row the
+ // decremented one, so the edge moves one column every two rows.
+ int shift = h / 4;
+ for (i = 0; i < h; i += 2) {
+ shift_copy(wedge_master_oblique_even,
+ &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
+ MASK_MASTER_SIZE);
+ shift--;
+ shift_copy(wedge_master_oblique_odd,
+ &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
+ MASK_MASTER_SIZE);
+ // The vertical master is the same profile in every row.
+ memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
+ wedge_master_vertical,
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+ memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
+ wedge_master_vertical,
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+ }
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
+ // OBLIQUE27 is the transpose of OBLIQUE63.
+ wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
+ // OBLIQUE117 (column-mirrored) and OBLIQUE153 (mirrored and transposed)
+ // masters hold the complemented weight.
+ wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ // Index [1] stores the complement of each index [0] master.
+ wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
+ wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
+ // HORIZONTAL is the transpose of VERTICAL; complements go to index [1].
+ const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
+ wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
+ wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
+ wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - mskx;
+ }
+ }
+}
+
+// Cuts the per-block-size wedge masks out of the master masks and stores
+// them contiguously in wedge_mask_buf. For every block size that has wedge
+// types, each wedge index gets two bw x bh masks (sign 0, then sign 1),
+// stored with stride bw; pointers to them are recorded in
+// wedge_params->masks[sign][wedge_index].
+static AOM_INLINE void init_wedge_masks(void) {
+  uint8_t *dst = wedge_mask_buf;
+  BLOCK_SIZE bsize;
+  memset(wedge_masks, 0, sizeof(wedge_masks));
+  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize];
+    const int wtypes = wedge_params->wedge_types;
+    if (wtypes == 0) continue;
+    const uint8_t *mask;
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
+    const size_t mask_size = (size_t)bw * bh;
+    // Check the remaining capacity *before* writing this block size's masks
+    // (two per wedge type), so a too-small buffer is reported before any
+    // out-of-bounds store happens rather than after.
+    assert(sizeof(wedge_mask_buf) - (size_t)(dst - wedge_mask_buf) >=
+           2 * (size_t)wtypes * mask_size);
+    int w;
+    for (w = 0; w < wtypes; ++w) {
+      mask = get_wedge_mask_inplace(w, 0, bsize);
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
+                        bh);
+      wedge_params->masks[0][w] = dst;
+      dst += mask_size;
+
+      mask = get_wedge_mask_inplace(w, 1, bsize);
+      aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw,
+                        bh);
+      wedge_params->masks[1][w] = dst;
+      dst += mask_size;
+    }
+  }
+}
+
+/* clang-format off */
+// One-dimensional weight profile for the smooth inter-intra masks. It is
+// sampled at (position * ii_size_scales[bsize]) by
+// build_smooth_interintra_mask().
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
+  60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
+  31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+  16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
+  8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
+  4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+// Per block size (indexed by BLOCK_SIZE) step used to sample ii_weights1d.
+// This is a read-only lookup table, so it is declared const.
+static const uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
+  32, 16, 16, 16, 8, 8, 8, 4,
+  4, 4, 2, 2, 2, 1, 1, 1,
+  8, 8, 4, 4, 2, 2
+};
+/* clang-format on */
+
+// Writes a bw x bh smooth inter-intra weight mask for `plane_bsize` into
+// `mask` (row pitch `stride`). The weight comes from ii_weights1d sampled
+// by row (II_V_PRED), by column (II_H_PRED) or by the smaller of the two
+// (II_SMOOTH_PRED); every other mode gets the constant weight 32.
+static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride,
+                                                    BLOCK_SIZE plane_bsize,
+                                                    INTERINTRA_MODE mode) {
+  const int width = block_size_wide[plane_bsize];
+  const int height = block_size_high[plane_bsize];
+  const int step = ii_size_scales[plane_bsize];
+
+  for (int row = 0; row < height; ++row, mask += stride) {
+    switch (mode) {
+      case II_V_PRED:
+        memset(mask, ii_weights1d[row * step], width * sizeof(mask[0]));
+        break;
+
+      case II_H_PRED:
+        for (int col = 0; col < width; ++col)
+          mask[col] = ii_weights1d[col * step];
+        break;
+
+      case II_SMOOTH_PRED:
+        for (int col = 0; col < width; ++col)
+          mask[col] = ii_weights1d[AOMMIN(row, col) * step];
+        break;
+
+      case II_DC_PRED:
+      default: memset(mask, 32, width * sizeof(mask[0])); break;
+    }
+  }
+}
+
+// Precomputes smooth_interintra_mask_buf for every inter-intra mode and
+// every block size whose width and height both fit within MAX_WEDGE_SIZE.
+// Each mask is stored densely, with the block width as its row pitch.
+static AOM_INLINE void init_smooth_interintra_masks(void) {
+  for (int mode = 0; mode < INTERINTRA_MODES; ++mode) {
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+      const int width = block_size_wide[bsize];
+      const int height = block_size_high[bsize];
+      const int fits = width <= MAX_WEDGE_SIZE && height <= MAX_WEDGE_SIZE;
+      if (fits) {
+        build_smooth_interintra_mask(smooth_interintra_mask_buf[mode][bsize],
+                                     width, bsize, mode);
+      }
+    }
+  }
+}
+
+// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
+// NOTE(review): the equation above appears to describe the wedge partition
+// line rather than this function -- confirm where it belongs.
+//
+// Builds every static mask table in this file, in dependency order: the
+// master wedge masks first, then the per-block-size wedge masks cut from
+// them, then the smooth inter-intra masks.
+static void init_all_wedge_masks(void) {
+ init_wedge_master_masks();
+ init_wedge_masks();
+ init_smooth_interintra_masks();
+}
+
+// Public entry point for mask table initialization. The work is routed
+// through aom_once(), presumably so it runs only once per process
+// (aom_once is defined elsewhere -- confirm).
+void av1_init_wedge_masks(void) {
+  aom_once(init_all_wedge_masks);
+}
+
+// Blends two intermediate-precision predictions into `dst` using the
+// compound mask for `comp_data`. The mask's row pitch is the width of
+// `sb_type`. The high bit depth blend is used when the prediction buffers
+// are high bit depth, the low bit depth blend otherwise.
+static AOM_INLINE void build_masked_compound_no_round(
+    uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w, InterPredParams *inter_pred_params) {
+  const int sub_y = inter_pred_params->subsampling_y;
+  const int sub_x = inter_pred_params->subsampling_x;
+  const uint8_t *const blend_mask =
+      av1_get_compound_type_mask(comp_data, sb_type);
+  const int blend_mask_stride = block_size_wide[sb_type];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (inter_pred_params->use_hbd_buf) {
+    aom_highbd_blend_a64_d16_mask(
+        dst, dst_stride, src0, src0_stride, src1, src1_stride, blend_mask,
+        blend_mask_stride, w, h, sub_x, sub_y, &inter_pred_params->conv_params,
+        inter_pred_params->bit_depth);
+    return;
+  }
+#endif
+  aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, blend_mask, blend_mask_stride, w,
+                               h, sub_x, sub_y,
+                               &inter_pred_params->conv_params);
+}
+
+// Builds the second prediction of a masked compound into a temporary buffer
+// and blends it with the first prediction (already held in
+// inter_pred_params->conv_params.dst on entry) into `dst`.
+// For COMPOUND_DIFFWTD on plane 0 the difference-weighted mask is computed
+// from the two predictions before blending.
+// Note: conv_params.dst / dst_stride are redirected to the temporary buffer
+// and are not restored before returning.
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp;
+ BLOCK_SIZE sb_type = inter_pred_params->sb_type;
+
+ // We're going to call av1_make_inter_predictor to generate a prediction into
+ // a temporary buffer, then will blend that temporary buffer with that from
+ // the other reference.
+ DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_dst =
+ inter_pred_params->use_hbd_buf ? CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf;
+
+ const int tmp_buf_stride = MAX_SB_SIZE;
+ // Remember where the first prediction lives, then point the convolve
+ // output at tmp_buf for the second one.
+ CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst;
+ int org_dst_stride = inter_pred_params->conv_params.dst_stride;
+ CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
+ inter_pred_params->conv_params.dst = tmp_buf16;
+ inter_pred_params->conv_params.dst_stride = tmp_buf_stride;
+ assert(inter_pred_params->conv_params.do_average == 0);
+
+ // This will generate a prediction in tmp_buf for the second reference
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
+ inter_pred_params, subpel_params);
+
+ // The difference-weighted mask is derived only for plane 0; it is written
+ // to comp_data->seg_mask, which the blend below reads back.
+ if (!inter_pred_params->conv_params.plane &&
+ comp_data->type == COMPOUND_DIFFWTD) {
+ av1_build_compound_diffwtd_mask_d16(
+ comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
+ tmp_buf16, tmp_buf_stride, inter_pred_params->block_height,
+ inter_pred_params->block_width, &inter_pred_params->conv_params,
+ inter_pred_params->bit_depth);
+ }
+ build_masked_compound_no_round(
+ dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride,
+ comp_data, sb_type, inter_pred_params->block_height,
+ inter_pred_params->block_width, inter_pred_params);
+}
+
+// Selects the forward/backward blending weights for a compound prediction.
+//
+// When the block is not compound, or mbmi->compound_idx is set, both
+// weights are 8 and *use_dist_wtd_comp_avg is cleared. Otherwise
+// *use_dist_wtd_comp_avg is set and the weights are looked up in
+// quant_dist_lookup_table from the (clamped) order-hint distances between
+// the current frame and its two references.
+//
+// All three output pointers must be non-NULL: each is written on every
+// path through the function.
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+                                     const MB_MODE_INFO *mbmi, int *fwd_offset,
+                                     int *bck_offset,
+                                     int *use_dist_wtd_comp_avg,
+                                     int is_compound) {
+  assert(fwd_offset != NULL && bck_offset != NULL);
+  // use_dist_wtd_comp_avg is dereferenced unconditionally below, so it is
+  // checked like the other two outputs.
+  assert(use_dist_wtd_comp_avg != NULL);
+  if (!is_compound || mbmi->compound_idx) {
+    *fwd_offset = 8;
+    *bck_offset = 8;
+    *use_dist_wtd_comp_avg = 0;
+    return;
+  }
+
+  *use_dist_wtd_comp_avg = 1;
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+  const int cur_frame_index = cm->cur_frame->order_hint;
+  int bck_frame_index = 0, fwd_frame_index = 0;
+
+  // A missing reference buffer leaves its order hint at 0.
+  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
+
+  int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
+                                       fwd_frame_index, cur_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+  int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
+                                       cur_frame_index, bck_frame_index)),
+                 0, MAX_FRAME_DISTANCE);
+
+  const int order = d0 <= d1;
+
+  // A zero distance short-circuits to the last row of the table.
+  if (d0 == 0 || d1 == 0) {
+    *fwd_offset = quant_dist_lookup_table[3][order];
+    *bck_offset = quant_dist_lookup_table[3][1 - order];
+    return;
+  }
+
+  // Find the first quantization bucket whose weight ratio crosses the
+  // distance ratio; i == 3 when none does.
+  int i;
+  for (i = 0; i < 3; ++i) {
+    int c0 = quant_dist_weight[i][order];
+    int c1 = quant_dist_weight[i][!order];
+    int d0_c0 = d0 * c0;
+    int d1_c1 = d1 * c1;
+    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+  }
+
+  *fwd_offset = quant_dist_lookup_table[i][order];
+  *bck_offset = quant_dist_lookup_table[i][1 - order];
+}
+
+// Points the destination buffer (pd->dst) of each plane in
+// [plane_start, plane_end) at the block located at (mi_row, mi_col) inside
+// the frame buffer `src`. Plane 0 uses the luma dimensions/stride entry
+// (index 0); planes 1 and 2 share the chroma entry (index 1).
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const int plane_start, const int plane_end) {
+ // We use AOMMIN(plane_end, MAX_MB_PLANE) instead of plane_end to quiet
+ // the static analysis warnings.
+ for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
+ struct macroblockd_plane *const pd = &planes[i];
+ const int is_uv = i > 0;
+ setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
+ src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+ mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+}
+
+// Points the reference ("pre") buffer `idx` of each plane at the block
+// located at (mi_row, mi_col) inside the reference frame `src`, using the
+// current block's size and the scale factors `sf`. Does nothing when `src`
+// is NULL.
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *sf,
+                          const int num_planes) {
+  if (src == NULL) return;
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int plane = 0; plane < AOMMIN(num_planes, MAX_MB_PLANE); ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    // Plane 0 uses the luma entry; planes 1 and 2 share the chroma entry.
+    const int uv = plane > 0;
+    setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[plane],
+                     src->crop_widths[uv], src->crop_heights[uv],
+                     src->strides[uv], mi_row, mi_col, sf, pd->subsampling_x,
+                     pd->subsampling_y);
+  }
+}
+
+// obmc_mask_N[overlap_position]
+// Blending weights for an overlap region that is N samples deep, returned
+// by av1_get_obmc_mask(N). Values rise towards 64 with increasing overlap
+// position.
+static const uint8_t obmc_mask_1[1] = { 64 };
+DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 };
+
+DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 };
+
+static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
+
+static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
+ 56, 58, 60, 61, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
+ 45, 47, 48, 50, 51, 52, 53, 55,
+ 56, 57, 58, 59, 60, 60, 61, 62,
+ 64, 64, 64, 64, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_64[64] = {
+ 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+ 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+// Returns the OBMC blending mask for an overlap of `length` samples.
+// Supported lengths are 1, 2, 4, 8, 16, 32 and 64; any other value trips an
+// assertion and yields NULL.
+const uint8_t *av1_get_obmc_mask(int length) {
+  const uint8_t *table = NULL;
+  switch (length) {
+    case 1: table = obmc_mask_1; break;
+    case 2: table = obmc_mask_2; break;
+    case 4: table = obmc_mask_4; break;
+    case 8: table = obmc_mask_8; break;
+    case 16: table = obmc_mask_16; break;
+    case 32: table = obmc_mask_32; break;
+    case 64: table = obmc_mask_64; break;
+    default: assert(0); break;
+  }
+  return table;
+}
+
+// Neighbor-visitor callback that only counts invocations: `fun_ctxt` is
+// treated as a pointer to a uint8_t counter, which is incremented once per
+// call. All other arguments are ignored.
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row,
+                                     int rel_mi_col, uint8_t op_mi_size,
+                                     int dir, MB_MODE_INFO *mi, void *fun_ctxt,
+                                     const int num_planes) {
+  uint8_t *const counter = (uint8_t *)fun_ctxt;
+
+  (void)xd;
+  (void)rel_mi_row;
+  (void)rel_mi_col;
+  (void)op_mi_size;
+  (void)dir;
+  (void)mi;
+  (void)num_planes;
+
+  ++*counter;
+}
+
+// Sets mbmi->overlappable_neighbors for the current block. The count stays
+// 0 when the block size does not allow motion variation. Otherwise the
+// above neighbors are counted first, and the left neighbors are counted
+// only if no above neighbor was found.
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) {
+  MB_MODE_INFO *const cur = xd->mi[0];
+
+  cur->overlappable_neighbors = 0;
+  if (!is_motion_variation_allowed_bsize(cur->bsize)) return;
+
+  foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr,
+                                &cur->overlappable_neighbors);
+  if (cur->overlappable_neighbors == 0) {
+    foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr,
+                                 &cur->overlappable_neighbors);
+  }
+}
+
+// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if
+// block-size of current plane is smaller than 8x8, always only blend with the
+// left neighbor(s) (skip blending with the above side).
+#define DISABLE_CHROMA_U8X8_OBMC 0  // 0: one-sided obmc; 1: disable
+
+// Returns nonzero when OBMC blending must be skipped for this plane and
+// direction. Only plane blocks of 4x4, 8x4 and 4x8 are affected: with
+// DISABLE_CHROMA_U8X8_OBMC set they are always skipped, otherwise they are
+// skipped for dir == 0 only.
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+                               const struct macroblockd_plane *pd, int dir) {
+  assert(is_motion_variation_allowed_bsize(bsize));
+
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const int below_8x8 = plane_bsize == BLOCK_4X4 || plane_bsize == BLOCK_8X4 ||
+                        plane_bsize == BLOCK_4X8;
+  if (!below_8x8) return 0;
+
+#if DISABLE_CHROMA_U8X8_OBMC
+  (void)dir;
+  return 1;
+#else
+  return dir == 0;
+#endif
+}
+
+// Rewrites a neighbor's mode info in place so that it describes a
+// single-reference prediction: the second reference is cleared and the
+// compound type is reset to COMPOUND_AVERAGE.
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+}
+
+// Context handed to the build_obmc_inter_pred_above/left callbacks through
+// their void *fun_ctxt argument.
+struct obmc_inter_pred_ctxt {
+ // Per-plane pointers to the neighbor-based prediction buffers.
+ uint8_t **adjacent;
+ // Per-plane row strides of the buffers in `adjacent`.
+ int *adjacent_stride;
+};
+
+// Neighbor-visitor callback for the above row: blends the prediction built
+// from one above neighbor into the current block's destination buffer with
+// a vertical OBMC mask. The overlap covers half the block height (capped at
+// half of 64), scaled by each plane's vertical subsampling.
+static INLINE void build_obmc_inter_pred_above(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) {
+  (void)above_mi;
+  (void)rel_mi_row;
+  (void)dir;
+  struct obmc_inter_pred_ctxt *const above_ctxt =
+      (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->bsize;
+  const int luma_overlap =
+      AOMMIN(block_size_high[cur_bsize], block_size_high[BLOCK_64X64]) >> 1;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(cur_bsize, pd, 0)) continue;
+
+    const int blend_w = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+    const int blend_h = luma_overlap >> pd->subsampling_y;
+    const int col_start = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
+
+    const int out_stride = pd->dst.stride;
+    uint8_t *const out = &pd->dst.buf[col_start];
+    const int nb_stride = above_ctxt->adjacent_stride[p];
+    const uint8_t *const nb_pred = &above_ctxt->adjacent[p][col_start];
+    const uint8_t *const weights = av1_get_obmc_mask(blend_h);
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_blend_a64_vmask(out, out_stride, out, out_stride, nb_pred,
+                                 nb_stride, weights, blend_w, blend_h, xd->bd);
+      continue;
+    }
+#endif
+    aom_blend_a64_vmask(out, out_stride, out, out_stride, nb_pred, nb_stride,
+                        weights, blend_w, blend_h);
+  }
+}
+
+// Neighbor-visitor callback for the left column: blends the prediction
+// built from one left neighbor into the current block's destination buffer
+// with a horizontal OBMC mask. The overlap covers half the block width
+// (capped at half of 64), scaled by each plane's horizontal subsampling.
+static INLINE void build_obmc_inter_pred_left(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) {
+  (void)left_mi;
+  (void)rel_mi_col;
+  (void)dir;
+  struct obmc_inter_pred_ctxt *const left_ctxt =
+      (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->bsize;
+  const int luma_overlap =
+      AOMMIN(block_size_wide[cur_bsize], block_size_wide[BLOCK_64X64]) >> 1;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(cur_bsize, pd, 1)) continue;
+
+    const int blend_w = luma_overlap >> pd->subsampling_x;
+    const int blend_h = (op_mi_size * MI_SIZE) >> pd->subsampling_y;
+    const int row_start = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
+
+    const int out_stride = pd->dst.stride;
+    uint8_t *const out = &pd->dst.buf[row_start * out_stride];
+    const int nb_stride = left_ctxt->adjacent_stride[p];
+    const uint8_t *const nb_pred =
+        &left_ctxt->adjacent[p][row_start * nb_stride];
+    const uint8_t *const weights = av1_get_obmc_mask(blend_w);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      aom_highbd_blend_a64_hmask(out, out_stride, out, out_stride, nb_pred,
+                                 nb_stride, weights, blend_w, blend_h, xd->bd);
+      continue;
+    }
+#endif
+    aom_blend_a64_hmask(out, out_stride, out, out_stride, nb_pred, nb_stride,
+                        weights, blend_w, blend_h);
+  }
+}
+
+ // Merges the motion compensated predictions obtained from the above and left
+ // neighbors' inter predictors into the regular inter prediction of the
+ // current block. The regular prediction (bmc) is expected to already be in
+ // xd->plane[].dst.buf; 'above'/'left' hold the neighbor predictions per plane
+ // and 'above_stride'/'left_stride' their strides.
+ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      uint8_t *above[MAX_MB_PLANE],
+                                      int above_stride[MAX_MB_PLANE],
+                                      uint8_t *left[MAX_MB_PLANE],
+                                      int left_stride[MAX_MB_PLANE]) {
+   const BLOCK_SIZE cur_bsize = xd->mi[0]->bsize;
+   struct obmc_inter_pred_ctxt above_ctxt = { .adjacent = above,
+                                              .adjacent_stride = above_stride };
+   struct obmc_inter_pred_ctxt left_ctxt = { .adjacent = left,
+                                             .adjacent_stride = left_stride };
+
+   // Above row first, then the left column; both blend into dst in place.
+   foreach_overlappable_nb_above(
+       cm, xd, max_neighbor_obmc[mi_size_wide_log2[cur_bsize]],
+       build_obmc_inter_pred_above, &above_ctxt);
+   foreach_overlappable_nb_left(
+       cm, xd, max_neighbor_obmc[mi_size_high_log2[cur_bsize]],
+       build_obmc_inter_pred_left, &left_ctxt);
+ }
+
+ // Points dst_buf1[0..2] at three consecutive regions of xd->tmp_obmc_bufs[0]
+ // and dst_buf2[0..2] at three consecutive regions of xd->tmp_obmc_bufs[1].
+ // Each region is MAX_SB_SQUARE bytes, or MAX_SB_SQUARE * sizeof(uint16_t)
+ // bytes for high-bitdepth buffers, in which case every pointer is also passed
+ // through CONVERT_TO_BYTEPTR().
+ void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+                              uint8_t **dst_buf2) {
+   uint8_t **const outputs[2] = { dst_buf1, dst_buf2 };
+   const int use_hbd = is_cur_buf_hbd(xd);
+   // Byte distance between two consecutive plane regions.
+   const int region_bytes =
+       use_hbd ? MAX_SB_SQUARE * (int)sizeof(uint16_t) : MAX_SB_SQUARE;
+
+   for (int set = 0; set < 2; ++set) {
+     for (int plane = 0; plane < 3; ++plane) {
+       uint8_t *const region = xd->tmp_obmc_bufs[set] + plane * region_bytes;
+       if (use_hbd) {
+         outputs[set][plane] = CONVERT_TO_BYTEPTR(region);
+       } else {
+         outputs[set][plane] = region;
+       }
+     }
+   }
+ }
+
+ // Prepares 'xd' for building the prediction that the above neighbor
+ // 'above_mbmi' would produce over the current block (used for OBMC):
+ //  - rewrites the neighbor's mode info via
+ //    av1_modify_neighbor_predictor_for_obmc(),
+ //  - redirects every plane's dst to the temporary buffers in 'ctxt',
+ //  - installs the scale factors and reference planes of each reference the
+ //    neighbor uses, raising AOM_CODEC_UNSUP_BITSTREAM on an invalid scale,
+ //  - updates xd->mb_to_left_edge / xd->mb_to_right_edge for the neighbor's
+ //    column range.
+ //   rel_mi_col     - column offset of the neighbor from xd->mi_col (mi units)
+ //   above_mi_width - width of the neighbor region (mi units)
+ void av1_setup_build_prediction_by_above_pred(
+     MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+     MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+     const int num_planes) {
+   const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize);
+   const int above_mi_col = xd->mi_col + rel_mi_col;
+
+   av1_modify_neighbor_predictor_for_obmc(above_mbmi);
+
+   for (int j = 0; j < num_planes; ++j) {
+     struct macroblockd_plane *const pd = &xd->plane[j];
+     setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+                      ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
+                      NULL, pd->subsampling_x, pd->subsampling_y);
+   }
+
+   // Must be read after the mode info has been modified above.
+   const int num_refs = 1 + has_second_ref(above_mbmi);
+
+   for (int ref = 0; ref < num_refs; ++ref) {
+     const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+
+     const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+     const struct scale_factors *const sf =
+         get_ref_scale_factors_const(ctxt->cm, frame);
+     xd->block_ref_scale_factors[ref] = sf;
+     if ((!av1_is_valid_scale(sf)))
+       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                          "Reference frame has invalid dimensions");
+     av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf,
+                          num_planes);
+   }
+
+   // Edge distances use GET_MV_SUBPEL(), matching
+   // av1_setup_build_prediction_by_left_pred(), instead of an open-coded
+   // multiplication by 8.
+   xd->mb_to_left_edge = GET_MV_SUBPEL(MI_SIZE * (-above_mi_col));
+   xd->mb_to_right_edge =
+       ctxt->mb_to_far_edge +
+       GET_MV_SUBPEL((xd->width - rel_mi_col - above_mi_width) * MI_SIZE);
+ }
+
+ // Prepares 'xd' for building the prediction that the left neighbor
+ // 'left_mbmi' would produce over the current block (used for OBMC):
+ //  - rewrites the neighbor's mode info via
+ //    av1_modify_neighbor_predictor_for_obmc(),
+ //  - redirects every plane's dst to the temporary buffers in 'ctxt',
+ //  - installs the scale factors and reference planes of each reference the
+ //    neighbor uses, raising AOM_CODEC_UNSUP_BITSTREAM on an invalid scale,
+ //  - updates xd->mb_to_top_edge / xd->mb_to_bottom_edge for the neighbor's
+ //    row range.
+ //   rel_mi_row     - row offset of the neighbor from xd->mi_row (mi units)
+ //   left_mi_height - height of the neighbor region (mi units)
+ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+                                              uint8_t left_mi_height,
+                                              MB_MODE_INFO *left_mbmi,
+                                              struct build_prediction_ctxt *ctxt,
+                                              const int num_planes) {
+   const BLOCK_SIZE nb_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize);
+   const int nb_mi_row = xd->mi_row + rel_mi_row;
+
+   av1_modify_neighbor_predictor_for_obmc(left_mbmi);
+
+   int plane;
+   for (plane = 0; plane < num_planes; ++plane) {
+     struct macroblockd_plane *const pd = &xd->plane[plane];
+     setup_pred_plane(&pd->dst, nb_bsize, ctxt->tmp_buf[plane],
+                      ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+                      ctxt->tmp_stride[plane], rel_mi_row, 0, NULL,
+                      pd->subsampling_x, pd->subsampling_y);
+   }
+
+   // Must be read after the mode info has been modified above.
+   const int ref_count = 1 + has_second_ref(left_mbmi);
+
+   int ref;
+   for (ref = 0; ref < ref_count; ++ref) {
+     const MV_REFERENCE_FRAME ref_frame = left_mbmi->ref_frame[ref];
+     const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, ref_frame);
+     const struct scale_factors *const sf =
+         get_ref_scale_factors_const(ctxt->cm, ref_frame);
+
+     xd->block_ref_scale_factors[ref] = sf;
+     if (!av1_is_valid_scale(sf)) {
+       aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                          "Reference frame has invalid dimensions");
+     }
+     av1_setup_pre_planes(xd, ref, &ref_buf->buf, nb_mi_row, xd->mi_col, sf,
+                          num_planes);
+   }
+
+   const int rows_below = xd->height - rel_mi_row - left_mi_height;
+   xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-nb_mi_row));
+   xd->mb_to_bottom_edge =
+       ctxt->mb_to_far_edge + GET_MV_SUBPEL(rows_below * MI_SIZE);
+ }
+
+ // Blends an intra prediction with an inter prediction into 'comppred'
+ // (8-bit path). With use_wedge_interintra set, a wedge mask selected by
+ // (wedge_index, wedge_sign, bsize) is used, and nothing is written when
+ // wedges are not available for 'bsize'. Otherwise the precomputed smooth mask
+ // for (mode, plane_bsize) is used.
+ static AOM_INLINE void combine_interintra(
+     INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+     int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+     uint8_t *comppred, int compstride, const uint8_t *interpred,
+     int interstride, const uint8_t *intrapred, int intrastride) {
+   const int plane_w = block_size_wide[plane_bsize];
+   const int plane_h = block_size_high[plane_bsize];
+
+   if (!use_wedge_interintra) {
+     const uint8_t *const smooth_mask =
+         smooth_interintra_mask_buf[mode][plane_bsize];
+     aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
+                        interstride, smooth_mask, plane_w, plane_w, plane_h, 0,
+                        0);
+     return;
+   }
+
+   if (!av1_is_wedge_used(bsize)) return;
+
+   // The wedge mask is stored at full block resolution; subw/subh tell the
+   // blend whether the plane is half that size in each direction.
+   const uint8_t *const wedge_mask =
+       av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+   const int subw = 2 * mi_size_wide[bsize] == plane_w;
+   const int subh = 2 * mi_size_high[bsize] == plane_h;
+   aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
+                      interstride, wedge_mask, block_size_wide[bsize], plane_w,
+                      plane_h, subw, subh);
+ }
+
+ #if CONFIG_AV1_HIGHBITDEPTH
+ // High-bitdepth counterpart of combine_interintra(). The *8 pointers are
+ // passed straight to aom_highbd_blend_a64_mask(). In the non-wedge case the
+ // smooth mask is built on the stack for every call.
+ static AOM_INLINE void combine_interintra_highbd(
+     INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index,
+     int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+     uint8_t *comppred8, int compstride, const uint8_t *interpred8,
+     int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
+   const int plane_w = block_size_wide[plane_bsize];
+   const int plane_h = block_size_high[plane_bsize];
+
+   if (!use_wedge_interintra) {
+     uint8_t smooth_mask[MAX_SB_SQUARE];
+     build_smooth_interintra_mask(smooth_mask, plane_w, plane_bsize, mode);
+     aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+                               interpred8, interstride, smooth_mask, plane_w,
+                               plane_w, plane_h, 0, 0, bd);
+     return;
+   }
+
+   if (!av1_is_wedge_used(bsize)) return;
+
+   const uint8_t *const wedge_mask =
+       av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+   const int subw = 2 * mi_size_wide[bsize] == plane_w;
+   const int subh = 2 * mi_size_high[bsize] == plane_h;
+   aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+                             interpred8, interstride, wedge_mask,
+                             block_size_wide[bsize], plane_w, plane_h, subw,
+                             subh, bd);
+ }
+ #endif
+
+ // Builds the intra half of an inter-intra prediction for one plane into
+ // 'dst'. The intra mode is derived from the current block's interintra_mode,
+ // and the edge samples are read from ctx->plane[plane] / ctx->stride[plane].
+ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+                                                MACROBLOCKD *xd,
+                                                BLOCK_SIZE bsize, int plane,
+                                                const BUFFER_SET *ctx,
+                                                uint8_t *dst, int dst_stride) {
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   struct macroblockd_plane *const pd = &xd->plane[plane];
+   const BLOCK_SIZE plane_bsize =
+       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+   const PREDICTION_MODE intra_mode =
+       interintra_to_intra_mode[mbmi->interintra_mode];
+   const SequenceHeader *const seq_params = cm->seq_params;
+
+   // Inter-intra blocks are expected to carry no angle delta, filter-intra or
+   // intrabc state.
+   assert(mbmi->angle_delta[PLANE_TYPE_Y] == 0);
+   assert(mbmi->angle_delta[PLANE_TYPE_UV] == 0);
+   assert(mbmi->filter_intra_mode_info.use_filter_intra == 0);
+   assert(mbmi->use_intrabc == 0);
+
+   av1_predict_intra_block(xd, seq_params->sb_size,
+                           seq_params->enable_intra_edge_filter, pd->width,
+                           pd->height, max_txsize_rect_lookup[plane_bsize],
+                           intra_mode, 0, 0, FILTER_INTRA_MODES,
+                           ctx->plane[plane], ctx->stride[plane], dst,
+                           dst_stride, 0, 0, plane);
+ }
+
+ // Blends 'inter_pred' and 'intra_pred' for one plane into that plane's
+ // destination buffer (xd->plane[plane].dst), using the inter-intra settings
+ // of the current block. Dispatches to the high-bitdepth blend when the
+ // current buffer is high-bitdepth and that support is compiled in.
+ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+                             const uint8_t *inter_pred, int inter_stride,
+                             const uint8_t *intra_pred, int intra_stride) {
+   struct macroblockd_plane *const pd = &xd->plane[plane];
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   const BLOCK_SIZE plane_bsize =
+       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ #if CONFIG_AV1_HIGHBITDEPTH
+   if (is_cur_buf_hbd(xd)) {
+     combine_interintra_highbd(
+         mbmi->interintra_mode, mbmi->use_wedge_interintra,
+         mbmi->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize,
+         plane_bsize, pd->dst.buf, pd->dst.stride, inter_pred, inter_stride,
+         intra_pred, intra_stride, xd->bd);
+     return;
+   }
+ #endif
+   combine_interintra(mbmi->interintra_mode, mbmi->use_wedge_interintra,
+                      mbmi->interintra_wedge_index, INTERINTRA_WEDGE_SIGN,
+                      bsize, plane_bsize, pd->dst.buf, pd->dst.stride,
+                      inter_pred, inter_stride, intra_pred, intra_stride);
+ }
+
+ // Builds the inter-intra prediction for one plane: the intra prediction is
+ // generated into a stack buffer with stride MAX_SB_SIZE and then blended with
+ // the inter prediction 'pred' into the plane's destination buffer.
+ void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     uint8_t *pred, int stride,
+                                     const BUFFER_SET *ctx, int plane,
+                                     BLOCK_SIZE bsize) {
+   assert(bsize < BLOCK_SIZES_ALL);
+
+   if (!is_cur_buf_hbd(xd)) {
+     DECLARE_ALIGNED(16, uint8_t, intra_buf[MAX_SB_SQUARE]);
+     av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
+                                               intra_buf, MAX_SB_SIZE);
+     av1_combine_interintra(xd, bsize, plane, pred, stride, intra_buf,
+                            MAX_SB_SIZE);
+     return;
+   }
+
+   // High-bitdepth: 16-bit scratch storage, handed on through
+   // CONVERT_TO_BYTEPTR().
+   DECLARE_ALIGNED(16, uint16_t, intra_buf16[MAX_SB_SQUARE]);
+   av1_build_intra_predictors_for_interintra(
+       cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intra_buf16), MAX_SB_SIZE);
+   av1_combine_interintra(xd, bsize, plane, pred, stride,
+                          CONVERT_TO_BYTEPTR(intra_buf16), MAX_SB_SIZE);
+ }
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
new file mode 100644
index 0000000000..c31f4531e2
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/warped_motion.h"
+#include "aom/aom_integer.h"
+
+// Work out how many pixels off the edge of a reference frame we're allowed
+// to go when forming an inter prediction.
+ // The outermost row/col of each reference frame is extended by
+// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep
+// at least AOM_INTERP_EXTEND pixels within that to account for filtering.
+//
+// We have to break this up into two macros to keep both clang-format and
+// tools/lint-hunks.py happy.
+#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \
+ ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
+#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \
+ (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Number of entries in a wedge_masks_type array.
+ #define MAX_WEDGE_TYPES 16
+
+ #define MAX_WEDGE_SIZE_LOG2 5 // 32x32
+ // Largest wedge dimension: 1 << MAX_WEDGE_SIZE_LOG2.
+ #define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
+ // Number of entries in a MAX_WEDGE_SIZE x MAX_WEDGE_SIZE array.
+ #define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
+
+ #define WEDGE_WEIGHT_BITS 6
+
+ #define WEDGE_NONE -1
+
+ // Wedge partition directions.
+ // Angles are with respect to horizontal anti-clockwise
+ enum {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ // Number of directions above (not a direction itself).
+ WEDGE_DIRECTIONS
+ } UENUM1BYTE(WedgeDirectionType);
+
+ // One wedge codebook entry.
+ // 3-tuple: {direction, x_offset, y_offset}
+ typedef struct {
+ WedgeDirectionType direction;
+ int x_offset;
+ int y_offset;
+ } wedge_code_type;
+
+ // Array of MAX_WEDGE_TYPES mask pointers, indexed by wedge index.
+ typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
+ // Wedge parameters for one block size (see av1_wedge_params_lookup).
+ typedef struct {
+ // Number of wedge types; a value > 0 means wedges are usable for the block
+ // size (see av1_is_wedge_used()).
+ int wedge_types;
+ // Codebook of wedge_code_type entries.
+ const wedge_code_type *codebook;
+ uint8_t *signflip;
+ // Masks, accessed as masks[wedge_sign][wedge_index].
+ wedge_masks_type *masks;
+ } wedge_params_type;
+
+extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL];
+
+ // Sub-pixel position and step of a prediction block, as filled in by
+ // init_subpel_params().
+ typedef struct SubpelParams {
+ // Horizontal / vertical step, taken from the scale factors' x_step_q4 /
+ // y_step_q4.
+ int xs;
+ int ys;
+ // Fractional parts of pos_x / pos_y (masked with SCALE_SUBPEL_MASK).
+ int subpel_x;
+ int subpel_y;
+ // Clamped position of the block in the reference.
+ int pos_x;
+ int pos_y;
+ } SubpelParams;
+
+ // Context used by av1_setup_build_prediction_by_above_pred() and
+ // av1_setup_build_prediction_by_left_pred().
+ struct build_prediction_ctxt {
+ const AV1_COMMON *cm;
+ // Temporary prediction buffers and their dimensions, indexed by plane.
+ uint8_t **tmp_buf;
+ int *tmp_width;
+ int *tmp_height;
+ int *tmp_stride;
+ // Added to the neighbor's extent to form xd->mb_to_right_edge (above
+ // neighbors) or xd->mb_to_bottom_edge (left neighbors).
+ int mb_to_far_edge;
+ void *dcb; // Decoder-only coding block.
+ };
+
+ // Kind of inter prediction; init_inter_block_params() selects
+ // TRANSLATION_PRED.
+ typedef enum InterPredMode {
+ TRANSLATION_PRED,
+ WARP_PRED,
+ } InterPredMode;
+
+ // Compound mode of a prediction. init_inter_block_params() selects
+ // UNIFORM_SINGLE and av1_init_comp_mode() switches to UNIFORM_COMP.
+ typedef enum InterCompMode {
+ UNIFORM_SINGLE,
+ UNIFORM_COMP,
+ MASK_COMP,
+ } InterCompMode;
+
+ // Parameters describing one inter prediction. Filled in by
+ // av1_init_inter_params() and the init_* helpers below.
+ typedef struct InterPredParams {
+ InterPredMode mode;
+ InterCompMode comp_mode;
+ WarpedMotionParams warp_params;
+ ConvolveParams conv_params;
+ // [0]: filter chosen from x_filter and the block width,
+ // [1]: filter chosen from y_filter and the block height.
+ const InterpFilterParams *interp_filter_params[2];
+ int block_width;
+ int block_height;
+ int pix_row;
+ int pix_col;
+ struct buf_2d ref_frame_buf;
+ int subsampling_x;
+ int subsampling_y;
+ const struct scale_factors *scale_factors;
+ int bit_depth;
+ int use_hbd_buf;
+ INTERINTER_COMPOUND_DATA mask_comp;
+ BLOCK_SIZE sb_type;
+ int is_intrabc;
+ // Lower clamp bounds for pos_y / pos_x in init_subpel_params(); set to
+ // -AOM_LEFT_TOP_MARGIN_SCALED(subsampling) by init_inter_block_params().
+ int top;
+ int left;
+ } InterPredParams;
+
+ // Fills 'subpel_params' for a prediction: combines the block position
+ // (pix_col/pix_row) with 'src_mv', applies the reference scaling from
+ // inter_pred_params->scale_factors, clamps the result against
+ // inter_pred_params->left/top and the given 'width'/'height', and stores the
+ // position, its fractional part and the per-sample step.
+ static AOM_INLINE void init_subpel_params(
+     const MV *const src_mv, InterPredParams *const inter_pred_params,
+     SubpelParams *subpel_params, int width, int height) {
+   const struct scale_factors *const scale = inter_pred_params->scale_factors;
+   const int ss_x = inter_pred_params->subsampling_x;
+   const int ss_y = inter_pred_params->subsampling_y;
+
+   // Block position shifted up by SUBPEL_BITS, plus the motion vector scaled
+   // by (1 << (1 - subsampling)).
+   const int base_y = (inter_pred_params->pix_row << SUBPEL_BITS) +
+                      src_mv->row * (1 << (1 - ss_y));
+   const int base_x = (inter_pred_params->pix_col << SUBPEL_BITS) +
+                      src_mv->col * (1 << (1 - ss_x));
+
+   int x, y;
+   if (UNLIKELY(av1_is_scaled(scale))) {
+     y = av1_scaled_y(base_y, scale);
+     x = av1_scaled_x(base_x, scale);
+   } else {
+     y = av1_unscaled_value(base_y, scale);
+     x = av1_unscaled_value(base_x, scale);
+   }
+   x += SCALE_EXTRA_OFF;
+   y += SCALE_EXTRA_OFF;
+
+   const int max_y = (height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+   const int max_x = (width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+   y = clamp(y, inter_pred_params->top, max_y);
+   x = clamp(x, inter_pred_params->left, max_x);
+
+   subpel_params->xs = scale->x_step_q4;
+   subpel_params->ys = scale->y_step_q4;
+   subpel_params->subpel_x = x & SCALE_SUBPEL_MASK;
+   subpel_params->subpel_y = y & SCALE_SUBPEL_MASK;
+   subpel_params->pos_x = x;
+   subpel_params->pos_y = y;
+ }
+
+ // Selects the horizontal ([0]) and vertical ([1]) interpolation filters.
+ // Intrabc blocks always get av1_intrabc_filter_params; otherwise the filters
+ // are looked up from 'filter' together with the block width / height.
+ static AOM_INLINE void init_interp_filter_params(
+     const InterpFilterParams *interp_filter_params[2],
+     const InterpFilters *filter, int block_width, int block_height,
+     int is_intrabc) {
+   if (UNLIKELY(is_intrabc)) {
+     interp_filter_params[0] = interp_filter_params[1] =
+         &av1_intrabc_filter_params;
+     return;
+   }
+
+   const InterpFilter horiz = (InterpFilter)filter->x_filter;
+   const InterpFilter vert = (InterpFilter)filter->y_filter;
+   interp_filter_params[0] =
+       av1_get_interp_filter_params_with_block_size(horiz, block_width);
+   interp_filter_params[1] =
+       av1_get_interp_filter_params_with_block_size(vert, block_height);
+ }
+
+ // Initialize parameters required for inter prediction at mode level.
+ // Stores 'sf' in 'inter_pred_params' and then derives 'subpel_params' from
+ // 'src_mv' via init_subpel_params(), which reads the scale factors just set.
+ static AOM_INLINE void init_inter_mode_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ SubpelParams *subpel_params, const struct scale_factors *sf, int width,
+ int height) {
+ inter_pred_params->scale_factors = sf;
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, width, height);
+ }
+
+ // Block-level initialization of 'inter_pred_params': copies the geometry and
+ // format arguments, defaults the prediction to a single-reference
+ // translation, and sets the top/left clamp bounds from the subsampling.
+ static AOM_INLINE void init_inter_block_params(
+     InterPredParams *inter_pred_params, int block_width, int block_height,
+     int pix_row, int pix_col, int subsampling_x, int subsampling_y,
+     int bit_depth, int use_hbd_buf, int is_intrabc) {
+   // Defaults; callers may override them afterwards.
+   inter_pred_params->mode = TRANSLATION_PRED;
+   inter_pred_params->comp_mode = UNIFORM_SINGLE;
+
+   // Block geometry.
+   inter_pred_params->block_width = block_width;
+   inter_pred_params->block_height = block_height;
+   inter_pred_params->pix_row = pix_row;
+   inter_pred_params->pix_col = pix_col;
+
+   // Sample format.
+   inter_pred_params->subsampling_x = subsampling_x;
+   inter_pred_params->subsampling_y = subsampling_y;
+   inter_pred_params->bit_depth = bit_depth;
+   inter_pred_params->use_hbd_buf = use_hbd_buf;
+   inter_pred_params->is_intrabc = is_intrabc;
+
+   // Lower clamp bounds used by init_subpel_params().
+   inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x);
+   inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y);
+ }
+
+ // Full initialization of 'inter_pred_params': records the scale factors and
+ // a copy of the reference buffer descriptor, then runs the block-level and
+ // interpolation-filter initializers.
+ static AOM_INLINE void av1_init_inter_params(
+     InterPredParams *inter_pred_params, int block_width, int block_height,
+     int pix_row, int pix_col, int subsampling_x, int subsampling_y,
+     int bit_depth, int use_hbd_buf, int is_intrabc,
+     const struct scale_factors *sf, const struct buf_2d *ref_buf,
+     int_interpfilters interp_filters) {
+   inter_pred_params->ref_frame_buf = *ref_buf;
+   inter_pred_params->scale_factors = sf;
+
+   init_inter_block_params(inter_pred_params, block_width, block_height,
+                           pix_row, pix_col, subsampling_x, subsampling_y,
+                           bit_depth, use_hbd_buf, is_intrabc);
+
+   const InterpFilters *const filters = &interp_filters.as_filters;
+   init_interp_filter_params(inter_pred_params->interp_filter_params, filters,
+                             block_width, block_height, is_intrabc);
+ }
+
+ // Marks the prediction as a uniform compound (comp_mode = UNIFORM_COMP).
+ static AOM_INLINE void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+ inter_pred_params->comp_mode = UNIFORM_COMP;
+ }
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+ const WarpTypesAllowed *warp_types, int ref,
+ const MACROBLOCKD *xd, const MB_MODE_INFO *mi);
+
+ // Returns 1 when either step differs from SCALE_SUBPEL_SHIFTS, and 0 when
+ // both steps equal it.
+ static INLINE int has_scale(int xs, int ys) {
+   return !(xs == SCALE_SUBPEL_SHIFTS && ys == SCALE_SUBPEL_SHIFTS);
+ }
+
+ // Drops the SCALE_EXTRA_BITS low bits from the sub-pixel offsets and steps in
+ // 'sp' (in place).
+ static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
+ sp->subpel_x >>= SCALE_EXTRA_BITS;
+ sp->subpel_y >>= SCALE_EXTRA_BITS;
+ sp->xs >>= SCALE_EXTRA_BITS;
+ sp->ys >>= SCALE_EXTRA_BITS;
+ // After the shift the offsets must be below SUBPEL_SHIFTS and the steps at
+ // most SUBPEL_SHIFTS.
+ assert(sp->subpel_x < SUBPEL_SHIFTS);
+ assert(sp->subpel_y < SUBPEL_SHIFTS);
+ assert(sp->xs <= SUBPEL_SHIFTS);
+ assert(sp->ys <= SUBPEL_SHIFTS);
+ }
+
+ // Runs the 2D convolve facade for one block (8-bit path). When the steps in
+ // 'subpel_params' indicate scaling, they are passed through unchanged with
+ // the scaled flag set; otherwise a copy with the extra precision bits
+ // removed is used and the flag is cleared.
+ static INLINE void inter_predictor(
+     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+     const SubpelParams *subpel_params, int w, int h,
+     ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) {
+   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+
+   if (has_scale(subpel_params->xs, subpel_params->ys)) {
+     av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                            interp_filters, subpel_params->subpel_x,
+                            subpel_params->xs, subpel_params->subpel_y,
+                            subpel_params->ys, 1, conv_params);
+     return;
+   }
+
+   SubpelParams unscaled = *subpel_params;
+   revert_scale_extra_bits(&unscaled);
+   av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                          interp_filters, unscaled.subpel_x, unscaled.xs,
+                          unscaled.subpel_y, unscaled.ys, 0, conv_params);
+ }
+
+ // High-bitdepth counterpart of inter_predictor(): same scaled / unscaled
+ // dispatch, with the bit depth 'bd' forwarded to the convolve facade.
+ static INLINE void highbd_inter_predictor(
+     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+     const SubpelParams *subpel_params, int w, int h,
+     ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2],
+     int bd) {
+   assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+
+   if (has_scale(subpel_params->xs, subpel_params->ys)) {
+     av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                   interp_filters, subpel_params->subpel_x,
+                                   subpel_params->xs, subpel_params->subpel_y,
+                                   subpel_params->ys, 1, conv_params, bd);
+     return;
+   }
+
+   SubpelParams unscaled = *subpel_params;
+   revert_scale_extra_bits(&unscaled);
+   av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+                                 interp_filters, unscaled.subpel_x, unscaled.xs,
+                                 unscaled.subpel_y, unscaled.ys, 0, conv_params,
+                                 bd);
+ }
+
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd, int dir);
+
+ // Returns nonzero when compound type 'type' can be used for block size
+ // 'sb_type'. All types require compound references to be allowed; wedge
+ // additionally requires a non-empty wedge set for the block size. Unknown
+ // types assert and yield 0.
+ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
+                                               BLOCK_SIZE sb_type) {
+   const int comp_allowed = is_comp_ref_allowed(sb_type);
+
+   if (type == COMPOUND_WEDGE) {
+     return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0;
+   }
+   if (type == COMPOUND_AVERAGE || type == COMPOUND_DISTWTD ||
+       type == COMPOUND_DIFFWTD) {
+     return comp_allowed;
+   }
+
+   assert(0);
+   return 0;
+ }
+
+ // Returns 1 if at least one masked compound type is usable for 'sb_type',
+ // 0 otherwise (including when compound references are not allowed at all).
+ static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
+   if (!is_comp_ref_allowed(sb_type)) return 0;
+
+   for (int idx = 0; idx < COMPOUND_TYPES; ++idx) {
+     const COMPOUND_TYPE candidate = (COMPOUND_TYPE)idx;
+     if (!is_masked_compound_type(candidate)) continue;
+     if (is_interinter_compound_used(candidate, sb_type)) return 1;
+   }
+   return 0;
+ }
+
+ // Returns the number of wedge types recorded for block size 'sb_type' in
+ // av1_wedge_params_lookup.
+ static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) {
+ return av1_wedge_params_lookup[sb_type].wedge_types;
+ }
+
+ // Returns 1 when block size 'sb_type' has at least one wedge type.
+ static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) {
+   return get_wedge_types_lookup(sb_type) > 0;
+ }
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+ // TODO(jkoleszar): yet another mv clamping function :-(
+ // Scales 'src_mv' by (1 << (1 - ss_x)) / (1 << (1 - ss_y)) and clamps it to
+ // limits derived from the block's distances to the frame edges
+ // (xd->mb_to_*_edge), the block size 'bw' x 'bh' and AOM_INTERP_EXTEND.
+ // Requires ss_x <= 1 and ss_y <= 1. Returns the clamped vector.
+ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
+                                            const MV *src_mv, int bw, int bh,
+                                            int ss_x, int ss_y) {
+   // Check the preconditions before any (1 << (1 - ss)) expression is
+   // evaluated: a larger subsampling value would make the shift count
+   // negative.
+   assert(ss_x <= 1);
+   assert(ss_y <= 1);
+
+   // If the MV points so far into the UMV border that no visible pixels
+   // are used for reconstruction, the subpel part of the MV can be
+   // discarded and the MV limited to 16 pixels with equivalent results.
+   const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
+   const int spel_right = spel_left - SUBPEL_SHIFTS;
+   const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
+   const int spel_bottom = spel_top - SUBPEL_SHIFTS;
+   MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+                     (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
+   const SubpelMvLimits mv_limits = {
+     xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+     xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+     xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+     xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom
+   };
+
+   clamp_mv(&clamped_mv, &mv_limits);
+
+   return clamped_mv;
+ }
+
+ // Returns the offset y * stride + x (as int64_t) for the given position.
+ // With 'sf' == NULL the offsets are used as-is; otherwise they are passed
+ // through the scaled or unscaled conversion and shifted down by
+ // SCALE_EXTRA_BITS first.
+ static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
+                                            int stride,
+                                            const struct scale_factors *sf) {
+   int col = x_offset;
+   int row = y_offset;
+
+   if (sf) {
+     if (av1_is_scaled(sf)) {
+       col = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS;
+       row = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS;
+     } else {
+       col = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS;
+       row = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS;
+     }
+   }
+   return (int64_t)row * stride + col;
+ }
+
+ // Fills 'dst' so that it describes plane buffer 'src' (width, height,
+ // stride) with dst->buf pointing at the sample for (mi_row, mi_col) and
+ // dst->buf0 at the start of the buffer. With subsampling, an odd mi position
+ // of a block that is one mi unit tall / wide is first moved back to the
+ // preceding even position.
+ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
+                                     uint8_t *src, int width, int height,
+                                     int stride, int mi_row, int mi_col,
+                                     const struct scale_factors *scale,
+                                     int subsampling_x, int subsampling_y) {
+   // 1 when the position has to be moved back by one mi unit, else 0.
+   const int row_adjust =
+       subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1);
+   const int col_adjust =
+       subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1);
+
+   const int px_x = (MI_SIZE * (mi_col - col_adjust)) >> subsampling_x;
+   const int px_y = (MI_SIZE * (mi_row - row_adjust)) >> subsampling_y;
+
+   dst->buf0 = src;
+   dst->buf = src + scaled_buffer_offset(px_x, px_y, stride, scale);
+   dst->width = width;
+   dst->height = height;
+   dst->stride = stride;
+ }
+
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const int plane_start, const int plane_end);
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf, const int num_planes);
+
+ // Sets mbmi->interp_filters by passing 'frame_interp_filter' through
+ // av1_unswitchable_filter() and broadcasting the result with
+ // av1_broadcast_interp_filter().
+ static INLINE void set_default_interp_filters(
+ MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter));
+ }
+
+ // Returns 0 when the current block uses skip mode, WARPED_CAUSAL motion, or
+ // non-translational global motion, and 1 otherwise.
+ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   return !(mbmi->skip_mode || mbmi->motion_mode == WARPED_CAUSAL ||
+            is_nontrans_global_motion(xd, mbmi));
+ }
+
+// Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for
+// subsequent use in OBMC prediction.
+void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
+ uint8_t **dst_buf2);
+
+ // Sets up 'xd' (plane dst buffers from 'ctxt', reference planes, scale
+ // factors and left/right edge distances) for building the prediction of the
+ // above neighbor 'above_mbmi' located 'rel_mi_col' mi units from xd->mi_col.
+ // Modifies 'above_mbmi' via av1_modify_neighbor_predictor_for_obmc().
+ void av1_setup_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+ // Sets up 'xd' (plane dst buffers from 'ctxt', reference planes, scale
+ // factors and top/bottom edge distances) for building the prediction of the
+ // left neighbor 'left_mbmi' located 'rel_mi_row' mi units from xd->mi_row.
+ // Modifies 'left_mbmi' via av1_modify_neighbor_predictor_for_obmc().
+ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+ // Blends the neighbor predictions in 'above' / 'left' (per plane, with the
+ // given strides) into the regular inter prediction already stored in
+ // xd->plane[].dst.buf.
+ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]);
+
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
+#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
+
+void av1_init_wedge_masks(void);
+
+ // Returns the wedge mask stored for block size 'sb_type' under the given
+ // sign and wedge index.
+ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index,
+                                                           int8_t wedge_sign,
+                                                           BLOCK_SIZE sb_type) {
+   const wedge_params_type *const params = &av1_wedge_params_lookup[sb_type];
+   return params->masks[wedge_sign][wedge_index];
+ }
+
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+ const MB_MODE_INFO *mbmi, int *fwd_offset,
+ int *bck_offset,
+ int *use_dist_wtd_comp_avg,
+ int is_compound);
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
+
+// build interintra_predictors for one plane
+void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ const BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize);
+
+ // Builds the intra part of an inter-intra prediction for one plane into
+ // 'dst', reading the edge samples from ctx->plane[plane].
+ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ const BUFFER_SET *ctx,
+ uint8_t *dst, int dst_stride);
+
+ // Blends 'inter_pred' and 'intra_pred' for one plane into
+ // xd->plane[plane].dst using the current block's inter-intra settings.
+ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconinter_template.inc b/third_party/aom/av1/common/reconinter_template.inc
new file mode 100644
index 0000000000..863c13c112
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter_template.inc
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef IS_DEC
+#error "IS_DEC must be defined for reconinter_template.inc."
+#endif
+
+#if IS_DEC  // decoder build: extra arguments are forwarded to the dec_ helper
+static AOM_INLINE void build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *src_mv,
+ InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y,
+ int ref, uint8_t **mc_buf) {
+#else  // encoder build: only dst, the MV and the prediction parameters
+static AOM_INLINE void build_one_inter_predictor(
+ uint8_t *dst, int dst_stride, const MV *src_mv,
+ InterPredParams *inter_pred_params) {
+#endif // IS_DEC
+ SubpelParams subpel_params;  // output of the helper call below
+ uint8_t *src;  // output of the helper call below
+ int src_stride;  // output of the helper call below
+#if IS_DEC
+ dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y,
+ ref, mc_buf, &src, &subpel_params,
+ &src_stride);
+#else
+ enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params,
+ &src_stride);
+#endif // IS_DEC
+ if (inter_pred_params->comp_mode == UNIFORM_SINGLE ||
+ inter_pred_params->comp_mode == UNIFORM_COMP) {  // unmasked prediction
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
+ } else {  // every other comp_mode goes through the masked predictor
+ av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride,
+ inter_pred_params, &subpel_params);
+ }
+}
+
+// True if the following hold:
+// 1. Not intrabc and not build_for_obmc
+// 2. At least one dimension is size 4 with subsampling
+// 3. Every mode-info entry scanned below (this block plus the previous
+// row/column where that dimension is sub-sampled) is inter and not intrabc
+static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
+ int is_intrabc, int build_for_obmc) {
+ if (is_intrabc || build_for_obmc) {
+ return false;
+ }
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
+ const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
+ if (!is_sub4_x && !is_sub4_y) {
+ return false;
+ }
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels, so the mode info of the neighbouring entries matters too.
+ // Scan offsets -1..0 in each dimension that is size 4 and sub-sampled, and
+ // offset 0 only in a dimension that is not.
+ const int row_start = is_sub4_y ? -1 : 0;
+ const int col_start = is_sub4_x ? -1 : 0;
+
+ for (int row = row_start; row <= 0; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) return false;
+ if (is_intrabc_block(this_mbmi)) return false;
+ }
+ }
+ return true;
+}
+
+#if IS_DEC
+static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int mi_x, int mi_y,
+ uint8_t **mc_buf) {
+#else
+static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int mi_x, int mi_y) {
+#endif // IS_DEC
+ const BLOCK_SIZE bsize = mi->bsize;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const bool ss_x = pd->subsampling_x;
+ const bool ss_y = pd->subsampling_y;
+ const int b4_w = block_size_wide[bsize] >> ss_x;  // width of one sub-block
+ const int b4_h = block_size_high[bsize] >> ss_y;  // height of one sub-block
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize];  // width tiled by the x loop
+ const int b8_h = block_size_high[plane_bsize];  // height tiled by the y loop
+ const int is_compound = has_second_ref(mi);
+ assert(!is_compound);  // this path predicts from a single reference only
+ assert(!is_intrabc_block(mi));
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0;
+ const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ int row = row_start;  // mode-info row offset paired with y
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;  // mode-info column offset paired with x
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+ int ref = 0;  // always the first reference (see the !is_compound assert)
+ const RefCntBuffer *ref_buf =
+ get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *const sf = ref_scale_factors;
+ const struct buf_2d pre_buf = {
+ NULL,
+ (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer,  // plane 1 -> U, any other plane -> V
+ ref_buf->buf.uv_crop_width,
+ ref_buf->buf.uv_crop_height,
+ ref_buf->buf.uv_stride,
+ };
+
+ const MV mv = this_mbmi->mv[ref].as_mv;  // each sub-block uses its own entry's MV
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y,
+ pre_x + x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf,
+ &pre_buf, this_mbmi->interp_filters);
+ inter_pred_params.conv_params =
+ get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd);
+
+#if IS_DEC
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params,
+ xd, mi_x + x, mi_y + y, ref, mc_buf);
+#else
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
+#endif // IS_DEC
+
+ ++col;
+ }
+ ++row;
+ }
+}
+
+#if IS_DEC
+static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
+#else
+static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
+#endif // IS_DEC
+ const int is_compound = has_second_ref(mi);
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+
+ int is_global[2] = { 0, 0 };  // one flag per reference; [1] stays 0 if single
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->bsize;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];  // intrabc reads from the destination buffer itself
+ const MV mv = mi->mv[ref].as_mv;
+ const WarpTypesAllowed warp_types = { is_global[ref],
+ mi->motion_mode == WARPED_CAUSAL };
+
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
+ mi->interp_filters);
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_dist_wtd_comp_weight_assign(
+ cm, mi, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
+
+ if (!build_for_obmc)  // warp parameters are not set up for OBMC builds
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ inter_pred_params.sb_type = mi->bsize;
+ inter_pred_params.mask_comp = mi->interinter_comp;
+ if (ref == 1) {  // only the second reference switches to MASK_COMP
+ inter_pred_params.conv_params.do_average = 0;
+ inter_pred_params.comp_mode = MASK_COMP;
+ }
+ // Point the copied mask_comp at xd's seg_mask buffer.
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
+ }
+
+#if IS_DEC
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd,
+ mi_x, mi_y, ref, mc_buf);
+#else
+ build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
+#endif // IS_DEC
+ }
+}
+
+#if IS_DEC  // decoder variant: identical dispatch, plus mc_buf passed through
+static AOM_INLINE void build_inter_predictors(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
+ if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
+ build_for_obmc)) {
+ assert(bw < 8 || bh < 8);  // the sub8x8 path expects a dimension below 8
+ build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf);
+ } else {  // everything else, including intrabc and OBMC builds
+ build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+ bh, mi_x, mi_y, mc_buf);
+ }
+}
+#else  // encoder variant
+static AOM_INLINE void build_inter_predictors(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw,
+ int bh, int mi_x, int mi_y) {
+ if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi),
+ build_for_obmc)) {
+ assert(bw < 8 || bh < 8);  // the sub8x8 path expects a dimension below 8
+ build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y);
+ } else {  // everything else, including intrabc and OBMC builds
+ build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw,
+ bh, mi_x, mi_y);
+ }
+}
+#endif // IS_DEC
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
new file mode 100644
index 0000000000..20a1e12476
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.c
@@ -0,0 +1,1798 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+ NEED_ABOVELEFT = 1 << 4,
+ NEED_BOTTOMLEFT = 1 << 5,
+};
+
+#define INTRA_EDGE_FILT 3
+#define INTRA_EDGE_TAPS 5
+#define MAX_UPSAMPLE_SZ 16
+#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32)
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVE | NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157
+ NEED_LEFT | NEED_BOTTOMLEFT, // D203
+ NEED_ABOVE | NEED_ABOVERIGHT, // D67
+ NEED_LEFT | NEED_ABOVE, // SMOOTH
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_V
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_H
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH
+};
+
+// Tables to store if the top-right reference pixels are available. The flags
+// are represented with bits, packed into 8-bit integers. E.g., for the 32x32
+// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster
+// order), so its flag is stored at the 3rd bit of the 2nd entry in the table,
+// i.e. (table[10 / 8] >> (10 % 8)) & 1.
+// . . . .
+// . . . .
+// . . o .
+// . . . .
+static uint8_t has_tr_4x4[128] = {
+ 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+};
+static uint8_t has_tr_4x8[64] = {
+ 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119,
+ 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127,
+ 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119,
+ 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127,
+ 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119,
+};
+static uint8_t has_tr_8x4[64] = {
+ 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+};
+static uint8_t has_tr_8x8[32] = {
+ 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+ 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+};
+static uint8_t has_tr_8x16[16] = {
+ 255, 255, 119, 119, 127, 127, 119, 119,
+ 255, 127, 119, 119, 127, 127, 119, 119,
+};
+static uint8_t has_tr_16x8[16] = {
+ 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0,
+};
+static uint8_t has_tr_16x16[8] = {
+ 255, 85, 119, 85, 127, 85, 119, 85,
+};
+static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 };
+static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 };
+static uint8_t has_tr_32x32[2] = { 95, 87 };
+static uint8_t has_tr_32x64[1] = { 127 };
+static uint8_t has_tr_64x32[1] = { 19 };
+static uint8_t has_tr_64x64[1] = { 7 };
+static uint8_t has_tr_64x128[1] = { 3 };
+static uint8_t has_tr_128x64[1] = { 1 };
+static uint8_t has_tr_128x128[1] = { 1 };
+static uint8_t has_tr_4x16[32] = {
+ 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255,
+ 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127,
+ 127, 127, 255, 127, 255, 127, 127, 127, 127, 127,
+};
+static uint8_t has_tr_16x4[32] = {
+ 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+ 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+};
+static uint8_t has_tr_8x32[8] = {
+ 255, 255, 127, 127, 255, 127, 127, 127,
+};
+static uint8_t has_tr_32x8[8] = {
+ 15, 0, 5, 0, 7, 0, 5, 0,
+};
+static uint8_t has_tr_16x64[2] = { 255, 127 };
+static uint8_t has_tr_64x16[2] = { 3, 1 };
+
+static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = {
+ // 4X4
+ has_tr_4x4,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, has_tr_8x4, has_tr_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, has_tr_16x8, has_tr_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, has_tr_32x16, has_tr_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, has_tr_64x32, has_tr_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, has_tr_128x64, has_tr_128x128,
+ // 4x16, 16x4, 8x32
+ has_tr_4x16, has_tr_16x4, has_tr_8x32,
+ // 32x8, 16x64, 64x16
+ has_tr_32x8, has_tr_16x64, has_tr_64x16
+};
+
+static uint8_t has_tr_vert_8x8[32] = {
+ 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+ 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+};
+static uint8_t has_tr_vert_16x16[8] = {
+ 255, 0, 119, 0, 127, 0, 119, 0,
+};
+static uint8_t has_tr_vert_32x32[2] = { 15, 7 };
+static uint8_t has_tr_vert_64x64[1] = { 3 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = {
+ // 4X4
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, NULL, has_tr_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, NULL, has_tr_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, NULL, has_tr_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, NULL, has_tr_vert_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, NULL, has_tr_128x128
+};
+
+static const uint8_t *get_has_tr_table(PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ const uint8_t *ret = NULL;
+ // Mixed vertical partitions (VERT_A / VERT_B) use has_tr_vert_tables.
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+ assert(bsize < BLOCK_SIZES);  // has_tr_vert_tables has BLOCK_SIZES entries
+ ret = has_tr_vert_tables[bsize];
+ } else {
+ ret = has_tr_tables[bsize];
+ }
+ assert(ret);  // has_tr_vert_tables holds NULL entries; checked in debug builds only
+ return ret;
+}
+
+static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int top_available, int right_available,
+ PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+ int col_off, int ss_x, int ss_y) {
+ if (!top_available || !right_available) return 0;
+
+ const int bw_unit = mi_size_wide[bsize];
+ const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);  // never below one unit
+ const int top_right_count_unit = tx_size_wide_unit[txsz];
+
+ if (row_off > 0) { // Just need to check if enough pixels on the right.
+ if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
+ // Special case: For 128x128 blocks, the transform unit whose
+ // top-right corner is at the center of the block does in fact have
+ // pixels available at its top-right corner.
+ if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
+ col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
+ return 1;
+ }
+ const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+ const int col_off_64 = col_off % plane_bw_unit_64;  // column within the 64-wide half
+ return col_off_64 + top_right_count_unit < plane_bw_unit_64;
+ }
+ return col_off + top_right_count_unit < plane_bw_unit;
+ } else {
+ // All top-right pixels are in the block above, which is already available.
+ if (col_off + top_right_count_unit < plane_bw_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+ const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+ const int sb_mi_size = mi_size_high[sb_size];
+ const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+ // Top row of superblock: so top-right pixels are in the top and/or
+ // top-right superblocks, both of which are already available.
+ if (blk_row_in_sb == 0) return 1;
+
+ // Rightmost column of superblock (and not the top row): so top-right pixels
+ // fall in the right superblock, which is not available yet.
+ if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) {
+ return 0;
+ }
+
+ // General case (neither top row nor rightmost column): check if the
+ // top-right block is coded before the current block.
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const int idx1 = this_blk_index / 8;  // byte of the table holding the flag
+ const int idx2 = this_blk_index % 8;  // bit position inside that byte
+ const uint8_t *has_tr_table = get_has_tr_table(partition, bsize);
+ return (has_tr_table[idx1] >> idx2) & 1;
+ }
+}
+
+// Similar to the has_tr_* tables, but store if the bottom-left reference
+// pixels are available.
+static uint8_t has_bl_4x4[128] = {
+ 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85,
+ 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17,
+ 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84,
+ 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85,
+ 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1,
+ 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85,
+ 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0,
+};
+static uint8_t has_bl_4x8[64] = {
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+};
+static uint8_t has_bl_8x4[64] = {
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+};
+static uint8_t has_bl_8x8[32] = {
+ 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+ 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+};
+static uint8_t has_bl_8x16[16] = {
+ 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0,
+};
+static uint8_t has_bl_16x8[16] = {
+ 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0,
+};
+static uint8_t has_bl_16x16[8] = {
+ 84, 16, 84, 0, 84, 16, 84, 0,
+};
+static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 };
+static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 };
+static uint8_t has_bl_32x32[2] = { 4, 4 };
+static uint8_t has_bl_32x64[1] = { 0 };
+static uint8_t has_bl_64x32[1] = { 34 };
+static uint8_t has_bl_64x64[1] = { 0 };
+static uint8_t has_bl_64x128[1] = { 0 };
+static uint8_t has_bl_128x64[1] = { 0 };
+static uint8_t has_bl_128x128[1] = { 0 };
+static uint8_t has_bl_4x16[32] = {
+ 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+ 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+};
+static uint8_t has_bl_16x4[32] = {
+ 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+ 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+};
+static uint8_t has_bl_8x32[8] = {
+ 0, 1, 0, 0, 0, 1, 0, 0,
+};
+static uint8_t has_bl_32x8[8] = {
+ 238, 78, 238, 14, 238, 78, 238, 14,
+};
+static uint8_t has_bl_16x64[2] = { 0, 0 };
+static uint8_t has_bl_64x16[2] = { 42, 42 };
+
+static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = {
+ // 4X4
+ has_bl_4x4,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, has_bl_8x4, has_bl_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, has_bl_16x8, has_bl_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, has_bl_32x16, has_bl_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, has_bl_64x32, has_bl_64x64,
+ // 64x128, 128x64, 128x128
+ has_bl_64x128, has_bl_128x64, has_bl_128x128,
+ // 4x16, 16x4, 8x32
+ has_bl_4x16, has_bl_16x4, has_bl_8x32,
+ // 32x8, 16x64, 64x16
+ has_bl_32x8, has_bl_16x64, has_bl_64x16
+};
+
+static uint8_t has_bl_vert_8x8[32] = {
+ 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+ 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+};
+static uint8_t has_bl_vert_16x16[8] = {
+ 254, 16, 254, 0, 254, 16, 254, 0,
+};
+static uint8_t has_bl_vert_32x32[2] = { 14, 14 };
+static uint8_t has_bl_vert_64x64[1] = { 2 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = {
+ // 4X4
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, NULL, has_bl_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, NULL, has_bl_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, NULL, has_bl_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, NULL, has_bl_vert_64x64,
+ // 64x128, 128x64, 128x128
+ has_bl_64x128, NULL, has_bl_128x128
+};
+
+static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ const uint8_t *ret = NULL;
+ // Mixed vertical partitions (VERT_A / VERT_B) use has_bl_vert_tables.
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+ assert(bsize < BLOCK_SIZES);  // has_bl_vert_tables has BLOCK_SIZES entries
+ ret = has_bl_vert_tables[bsize];
+ } else {
+ ret = has_bl_tables[bsize];
+ }
+ assert(ret);  // has_bl_vert_tables holds NULL entries; checked in debug builds only
+ return ret;
+}
+
+static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int bottom_available, int left_available,
+ PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+ int col_off, int ss_x, int ss_y) {
+ if (!bottom_available || !left_available) return 0;
+
+ // Special case for 128x* blocks, when col_off is half the block width.
+ // This is needed because 128x* superblocks are divided into 64x* blocks in
+ // raster order
+ if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) {
+ const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+ const int col_off_64 = col_off % plane_bw_unit_64;
+ if (col_off_64 == 0) {
+ // We are at the left edge of top-right or bottom-right 64x* block.
+ const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y;
+ const int row_off_64 = row_off % plane_bh_unit_64;
+ const int plane_bh_unit =
+ AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64);
+ // Check if all bottom-left pixels are in the left 64x* block (which is
+ // already coded).
+ return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit;
+ }
+ }
+
+ if (col_off > 0) {
+ // Bottom-left pixels are in the bottom-left block, which is not available.
+ return 0;
+ } else {
+ const int bh_unit = mi_size_high[bsize];
+ const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);  // never below one unit
+ const int bottom_left_count_unit = tx_size_high_unit[txsz];
+
+ // All bottom-left pixels are in the left block, which is already available.
+ if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+ const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+ const int sb_mi_size = mi_size_high[sb_size];
+ const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+ // Leftmost column of superblock: so bottom-left pixels may be in the left
+ // and/or bottom-left superblocks. But only the left superblock is
+ // available, so check if all required pixels fall in that superblock.
+ if (blk_col_in_sb == 0) {
+ const int blk_start_row_off =
+ blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>  // the two MI_SIZE_LOG2 terms cancel
+ ss_y;
+ const int row_off_in_sb = blk_start_row_off + row_off;
+ const int sb_height_unit = sb_mi_size >> ss_y;
+ return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+ }
+
+ // Bottom row of superblock (and not the leftmost column): so bottom-left
+ // pixels fall in the bottom superblock, which is not available yet.
+ if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
+
+ // General case (neither leftmost column nor bottom row): check if the
+ // bottom-left block is coded before the current block.
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const int idx1 = this_blk_index / 8;  // byte of the table holding the flag
+ const int idx2 = this_blk_index % 8;  // bit position inside that byte
+ const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
+ return (has_bl_table[idx1] >> idx2) & 1;
+ }
+}
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
+static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+#endif
+
+static void init_intra_predictors_internal(void) {  // fills the pred / dc_pred (and *_high) function-pointer tables
+ assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
+ // Each INIT_* helper stores aom_<type>_predictor_<W>x<H> into p[TX_<W>X<H>].
+#define INIT_RECTANGULAR(p, type) \
+ p[TX_4X8] = aom_##type##_predictor_4x8; \
+ p[TX_8X4] = aom_##type##_predictor_8x4; \
+ p[TX_8X16] = aom_##type##_predictor_8x16; \
+ p[TX_16X8] = aom_##type##_predictor_16x8; \
+ p[TX_16X32] = aom_##type##_predictor_16x32; \
+ p[TX_32X16] = aom_##type##_predictor_32x16; \
+ p[TX_32X64] = aom_##type##_predictor_32x64; \
+ p[TX_64X32] = aom_##type##_predictor_64x32; \
+ p[TX_4X16] = aom_##type##_predictor_4x16; \
+ p[TX_16X4] = aom_##type##_predictor_16x4; \
+ p[TX_8X32] = aom_##type##_predictor_8x32; \
+ p[TX_32X8] = aom_##type##_predictor_32x8; \
+ p[TX_16X64] = aom_##type##_predictor_16x64; \
+ p[TX_64X16] = aom_##type##_predictor_64x16;
+#define INIT_NO_4X4(p, type) \
+ p[TX_8X8] = aom_##type##_predictor_8x8; \
+ p[TX_16X16] = aom_##type##_predictor_16x16; \
+ p[TX_32X32] = aom_##type##_predictor_32x32; \
+ p[TX_64X64] = aom_##type##_predictor_64x64; \
+ INIT_RECTANGULAR(p, type)
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_4X4] = aom_##type##_predictor_4x4; \
+ INIT_NO_4X4(p, type)
+ // dc_pred slots: [0][0] dc_128, [0][1] dc_top, [1][0] dc_left, [1][1] dc.
+ INIT_ALL_SIZES(pred[V_PRED], v)
+ INIT_ALL_SIZES(pred[H_PRED], h)
+ INIT_ALL_SIZES(pred[PAETH_PRED], paeth)
+ INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth)
+ INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v)
+ INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h)
+ INIT_ALL_SIZES(dc_pred[0][0], dc_128)
+ INIT_ALL_SIZES(dc_pred[0][1], dc_top)
+ INIT_ALL_SIZES(dc_pred[1][0], dc_left)
+ INIT_ALL_SIZES(dc_pred[1][1], dc)
+#if CONFIG_AV1_HIGHBITDEPTH
+ INIT_ALL_SIZES(pred_high[V_PRED], highbd_v)
+ INIT_ALL_SIZES(pred_high[H_PRED], highbd_h)
+ INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth)
+ INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth)
+ INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v)
+ INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h)
+ INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128)
+ INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top)
+ INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left)
+ INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc)
+#endif
+#undef INIT_ALL_SIZES  // drop the helper macros so they do not leak past this function
+#undef INIT_NO_4X4
+#undef INIT_RECTANGULAR
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ int r, c, x, base, shift, val;
+
+ (void)left;  // zone 1 reads only the above edge
+ (void)dy;  // only referenced by the assert below
+ assert(dy == 1);
+ assert(dx > 0);
+
+ const int max_base_x = ((bw + bh) - 1) << upsample_above;  // last above[] index read
+ const int frac_bits = 6 - upsample_above;  // low bits of x dropped to get the integer index
+ const int base_inc = 1 << upsample_above;  // index step per output column
+ x = dx;
+ for (r = 0; r < bh; ++r, dst += stride, x += dx) {
+ base = x >> frac_bits;
+ shift = ((x << upsample_above) & 0x3F) >> 1;  // interpolation weight in 0..31
+
+ if (base >= max_base_x) {  // this row and all remaining rows replicate above[max_base_x]
+ for (int i = r; i < bh; ++i) {
+ memset(dst, above[max_base_x], bw * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bw; ++c, base += base_inc) {
+ if (base < max_base_x) {
+ val = above[base] * (32 - shift) + above[base + 1] * shift;  // the two weights sum to 32
+ dst[c] = ROUND_POWER_OF_TWO(val, 5);
+ } else {  // past the edge: replicate the last sample
+ dst[c] = above[max_base_x];
+ }
+ }
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int min_base_x = -(1 << upsample_above);  // lowest above[] index used
+ const int min_base_y = -(1 << upsample_left);
+ (void)min_base_y;  // only referenced by the assert below
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ for (int r = 0; r < bh; ++r) {
+ for (int c = 0; c < bw; ++c) {
+ int val;
+ int y = r + 1;
+ int x = (c << 6) - y * dx;  // c scaled by 64, minus (r + 1) * dx
+ const int base_x = x >> frac_bits_x;
+ if (base_x >= min_base_x) {  // interpolate from above[]
+ const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+ val = above[base_x] * (32 - shift) + above[base_x + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ } else {  // otherwise interpolate from left[]
+ x = c + 1;
+ y = (r << 6) - x * dy;
+ const int base_y = y >> frac_bits_y;
+ assert(base_y >= min_base_y);
+ const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+ val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ }
+ dst[c] = val;
+ }
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 3 (180 < angle < 270). Every output column is
+// interpolated from the `left` edge only; `above` and `dx` are unused.
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+
+  assert(dx == 1);
+  assert(dy > 0);
+
+  const int max_base_y = (bw + bh - 1) << upsample_left;
+  const int frac_bits = 6 - upsample_left;
+  const int base_inc = 1 << upsample_left;
+
+  int y = dy;
+  for (int c = 0; c < bw; ++c) {
+    int base = y >> frac_bits;
+    const int shift = ((y << upsample_left) & 0x3F) >> 1;
+    int r = 0;
+
+    // Rows whose source position is still inside the edge are interpolated.
+    while (r < bh && base < max_base_y) {
+      const int val = left[base] * (32 - shift) + left[base + 1] * shift;
+      dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5);
+      ++r;
+      base += base_inc;
+    }
+
+    // Whatever is left of the column replicates the last edge sample.
+    while (r < bh) {
+      dst[r * stride + c] = left[max_base_y];
+      ++r;
+    }
+
+    y += dy;
+  }
+}
+
+// Dispatches an 8-bit directional prediction by angle: zones 1/2/3 for the
+// open ranges (0,90), (90,180) and (180,270), and the plain V_PRED / H_PRED
+// predictors for exactly 90 and 180 degrees.
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                         const uint8_t *above, const uint8_t *left,
+                         int upsample_above, int upsample_left, int angle) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  assert(angle > 0 && angle < 270);
+
+  // Angles outside (0, 270) match none of the cases and produce no output.
+  if (angle <= 0 || angle >= 270) return;
+
+  if (angle < 90) {
+    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
+                         dy);
+  } else if (angle == 90) {
+    pred[V_PRED][tx_size](dst, stride, above, left);
+  } else if (angle < 180) {
+    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
+                         upsample_left, dx, dy);
+  } else if (angle == 180) {
+    pred[H_PRED][tx_size](dst, stride, above, left);
+  } else {
+    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
+                         dy);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth directional prediction, zone 1 (0 < angle < 90). Every
+// output row is interpolated from the `above` edge only; `left`, `dy` and
+// `bd` are unused.
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+  (void)bd;
+  assert(dy == 1);
+  assert(dx > 0);
+
+  const int max_base_x = ((bw + bh) - 1) << upsample_above;
+  const int frac_bits = 6 - upsample_above;
+  const int base_inc = 1 << upsample_above;
+
+  int x = dx;
+  for (int row = 0; row < bh; ++row) {
+    const int row_base = x >> frac_bits;
+    // 5-bit interpolation weight taken from the fractional part of x.
+    const int shift = ((x << upsample_above) & 0x3F) >> 1;
+
+    if (row_base >= max_base_x) {
+      // The row starts at or beyond the last edge sample: fill this row and
+      // every remaining one with above[max_base_x] and stop.
+      for (int rest = row; rest < bh; ++rest) {
+        aom_memset16(dst, above[max_base_x], bw);
+        dst += stride;
+      }
+      return;
+    }
+
+    int base = row_base;
+    for (int col = 0; col < bw; ++col) {
+      if (base < max_base_x) {
+        const int val = above[base] * (32 - shift) + above[base + 1] * shift;
+        dst[col] = ROUND_POWER_OF_TWO(val, 5);
+      } else {
+        dst[col] = above[max_base_x];
+      }
+      base += base_inc;
+    }
+
+    dst += stride;
+    x += dx;
+  }
+}
+
+// High bit-depth directional prediction, zone 2 (90 < angle < 180). Each
+// pixel is interpolated from the `above` edge when its projected position is
+// still on that edge, and from the `left` edge otherwise. `bd` is unused.
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  (void)min_base_y;
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  for (int r = 0; r < bh; ++r, dst += stride) {
+    for (int c = 0; c < bw; ++c) {
+      const uint16_t *edge;
+      int pos;
+      int scaled;
+
+      const int x = (c << 6) - (r + 1) * dx;
+      const int base_x = x >> frac_bits_x;
+      if (base_x >= min_base_x) {
+        // Projection lands on the above edge.
+        edge = above;
+        pos = base_x;
+        scaled = x * (1 << upsample_above);
+      } else {
+        // Projection falls off the above edge: use the left edge instead.
+        const int y = (r << 6) - (c + 1) * dy;
+        const int base_y = y >> frac_bits_y;
+        assert(base_y >= min_base_y);
+        edge = left;
+        pos = base_y;
+        scaled = y * (1 << upsample_left);
+      }
+
+      const int shift = (scaled & 0x3F) >> 1;
+      const int val = edge[pos] * (32 - shift) + edge[pos + 1] * shift;
+      dst[c] = ROUND_POWER_OF_TWO(val, 5);
+    }
+  }
+}
+
+// High bit-depth directional prediction, zone 3 (180 < angle < 270). Every
+// output column is interpolated from the `left` edge only; `above`, `dx` and
+// `bd` are unused.
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_left,
+                                   int dx, int dy, int bd) {
+  (void)above;
+  (void)dx;
+  (void)bd;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  const int max_base_y = (bw + bh - 1) << upsample_left;
+  const int frac_bits = 6 - upsample_left;
+  const int base_inc = 1 << upsample_left;
+
+  int y = dy;
+  for (int c = 0; c < bw; ++c) {
+    int base = y >> frac_bits;
+    const int shift = ((y << upsample_left) & 0x3F) >> 1;
+    int r = 0;
+
+    // Rows whose source position is still inside the edge are interpolated.
+    while (r < bh && base < max_base_y) {
+      const int val = left[base] * (32 - shift) + left[base + 1] * shift;
+      dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5);
+      ++r;
+      base += base_inc;
+    }
+
+    // Whatever is left of the column replicates the last edge sample.
+    while (r < bh) {
+      dst[r * stride + c] = left[max_base_y];
+      ++r;
+    }
+
+    y += dy;
+  }
+}
+
+// Dispatches a high bit-depth directional prediction by angle: zones 1/2/3
+// for the open ranges (0,90), (90,180) and (180,270), and the plain V_PRED /
+// H_PRED predictors for exactly 90 and 180 degrees.
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride,
+                                TX_SIZE tx_size, const uint16_t *above,
+                                const uint16_t *left, int upsample_above,
+                                int upsample_left, int angle, int bd) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  assert(angle > 0 && angle < 270);
+
+  // Angles outside (0, 270) match none of the cases and produce no output.
+  if (angle <= 0 || angle >= 270) return;
+
+  if (angle < 90) {
+    av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
+                                upsample_above, dx, dy, bd);
+  } else if (angle == 90) {
+    pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
+  } else if (angle < 180) {
+    av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
+                                upsample_above, upsample_left, dx, dy, bd);
+  } else if (angle == 180) {
+    pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
+  } else {
+    av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
+                                dx, dy, bd);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Tap weights for the filter-intra predictors, indexed as
+// [filter_intra_mode][k][tap]. In av1_filter_intra_predictor_c() row k (0..7)
+// produces the output pixel at offset (k >> 2, k & 3) of a 4x2 patch, and
+// taps 0..6 weight the neighbours p0..p6 (p0 = above-left, p1..p4 = the four
+// pixels above, p5 and p6 = the two pixels to the left). The eighth entry of
+// every row is 0 and is not read by the C predictors in this file.
+DECLARE_ALIGNED(16, const int8_t,
+ av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
+ {
+ { -6, 10, 0, 0, 0, 12, 0, 0 },
+ { -5, 2, 10, 0, 0, 9, 0, 0 },
+ { -3, 1, 1, 10, 0, 7, 0, 0 },
+ { -3, 1, 1, 2, 10, 5, 0, 0 },
+ { -4, 6, 0, 0, 0, 2, 12, 0 },
+ { -3, 2, 6, 0, 0, 2, 9, 0 },
+ { -3, 2, 2, 6, 0, 2, 7, 0 },
+ { -3, 1, 2, 2, 6, 3, 5, 0 },
+ },
+ {
+ { -10, 16, 0, 0, 0, 10, 0, 0 },
+ { -6, 0, 16, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 16, 0, 4, 0, 0 },
+ { -2, 0, 0, 0, 16, 2, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 10, 0 },
+ { -6, 0, 16, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 16, 0, 0, 4, 0 },
+ { -2, 0, 0, 0, 16, 0, 2, 0 },
+ },
+ {
+ { -8, 8, 0, 0, 0, 16, 0, 0 },
+ { -8, 0, 8, 0, 0, 16, 0, 0 },
+ { -8, 0, 0, 8, 0, 16, 0, 0 },
+ { -8, 0, 0, 0, 8, 16, 0, 0 },
+ { -4, 4, 0, 0, 0, 0, 16, 0 },
+ { -4, 0, 4, 0, 0, 0, 16, 0 },
+ { -4, 0, 0, 4, 0, 0, 16, 0 },
+ { -4, 0, 0, 0, 4, 0, 16, 0 },
+ },
+ {
+ { -2, 8, 0, 0, 0, 10, 0, 0 },
+ { -1, 3, 8, 0, 0, 6, 0, 0 },
+ { -1, 2, 3, 8, 0, 4, 0, 0 },
+ { 0, 1, 2, 3, 8, 2, 0, 0 },
+ { -1, 4, 0, 0, 0, 3, 10, 0 },
+ { -1, 3, 4, 0, 0, 4, 6, 0 },
+ { -1, 2, 3, 4, 0, 4, 4, 0 },
+ { -1, 2, 2, 3, 4, 3, 3, 0 },
+ },
+ {
+ { -12, 14, 0, 0, 0, 14, 0, 0 },
+ { -10, 0, 14, 0, 0, 12, 0, 0 },
+ { -9, 0, 0, 14, 0, 11, 0, 0 },
+ { -8, 0, 0, 0, 14, 10, 0, 0 },
+ { -10, 12, 0, 0, 0, 0, 14, 0 },
+ { -9, 1, 12, 0, 0, 0, 12, 0 },
+ { -8, 0, 0, 12, 0, 1, 11, 0 },
+ { -7, 0, 0, 1, 12, 1, 9, 0 },
+ },
+};
+
+// 8-bit filter-intra predictor. The block is produced in 4x2 patches; each
+// patch is a 7-tap weighted sum of its above-left, above and left neighbours,
+// which may themselves be previously predicted pixels. Work happens in a
+// local buffer whose row 0 / column 0 hold the above / left edges; the
+// interior is copied to `dst` at the end.
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+                                  TX_SIZE tx_size, const uint8_t *above,
+                                  const uint8_t *left, int mode) {
+  uint8_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  const int8_t(*const taps)[8] = av1_filter_intra_taps[mode];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // Seed column 0 with the left edge and row 0 with above[-1..bw-1].
+  for (int row = 0; row < bh; ++row) buffer[row + 1][0] = left[row];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+  for (int row = 1; row < bh + 1; row += 2) {
+    for (int col = 1; col < bw + 1; col += 4) {
+      // Neighbours of the 4x2 patch whose top-left pixel is (row, col).
+      const uint8_t p[7] = {
+        buffer[row - 1][col - 1], buffer[row - 1][col],
+        buffer[row - 1][col + 1], buffer[row - 1][col + 2],
+        buffer[row - 1][col + 3], buffer[row][col - 1],
+        buffer[row + 1][col - 1],
+      };
+      for (int k = 0; k < 8; ++k) {
+        int pr = 0;
+        for (int t = 0; t < 7; ++t) pr += taps[k][t] * p[t];
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
+        buffer[row + (k >> 2)][col + (k & 0x03)] =
+            clip_pixel(ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS));
+      }
+    }
+  }
+
+  for (int row = 0; row < bh; ++row, dst += stride) {
+    memcpy(dst, &buffer[row + 1][1], bw * sizeof(uint8_t));
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth filter-intra predictor. The block is produced in 4x2
+// patches; each patch is a 7-tap weighted sum of its above-left, above and
+// left neighbours, clipped to `bd` bits. Work happens in a local buffer whose
+// row 0 / column 0 hold the above / left edges; the interior is copied to
+// `dst` at the end.
+static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          TX_SIZE tx_size,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int mode,
+                                          int bd) {
+  uint16_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  const int8_t(*const taps)[8] = av1_filter_intra_taps[mode];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // Seed column 0 with the left edge and row 0 with above[-1..bw-1].
+  for (int row = 0; row < bh; ++row) buffer[row + 1][0] = left[row];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
+
+  for (int row = 1; row < bh + 1; row += 2) {
+    for (int col = 1; col < bw + 1; col += 4) {
+      // Neighbours of the 4x2 patch whose top-left pixel is (row, col).
+      const uint16_t p[7] = {
+        buffer[row - 1][col - 1], buffer[row - 1][col],
+        buffer[row - 1][col + 1], buffer[row - 1][col + 2],
+        buffer[row - 1][col + 3], buffer[row][col - 1],
+        buffer[row + 1][col - 1],
+      };
+      for (int k = 0; k < 8; ++k) {
+        int pr = 0;
+        for (int t = 0; t < 7; ++t) pr += taps[k][t] * p[t];
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
+        buffer[row + (k >> 2)][col + (k & 0x03)] = clip_pixel_highbd(
+            ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd);
+      }
+    }
+  }
+
+  for (int row = 0; row < bh; ++row, dst += stride) {
+    memcpy(dst, &buffer[row + 1][1], bw * sizeof(dst[0]));
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Returns 1 when the block's prediction mode for `plane` is one of the three
+// SMOOTH modes and 0 otherwise. Plane 0 checks mbmi->mode; the other planes
+// check mbmi->uv_mode.
+static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
+  if (plane != 0) {
+    // uv_mode is not set for inter blocks, so rule that case out explicitly
+    // before looking at it.
+    if (is_inter_block(mbmi)) return 0;
+
+    switch (mbmi->uv_mode) {
+      case UV_SMOOTH_PRED:
+      case UV_SMOOTH_V_PRED:
+      case UV_SMOOTH_H_PRED: return 1;
+      default: return 0;
+    }
+  }
+
+  switch (mbmi->mode) {
+    case SMOOTH_PRED:
+    case SMOOTH_V_PRED:
+    case SMOOTH_H_PRED: return 1;
+    default: return 0;
+  }
+}
+
+// Returns 1 if the above or the left neighbour (the chroma neighbours when
+// plane != 0) exists and uses a SMOOTH prediction mode, 0 otherwise.
+static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) {
+  const int is_luma = (plane == 0);
+  const MB_MODE_INFO *const above =
+      is_luma ? xd->above_mbmi : xd->chroma_above_mbmi;
+  const MB_MODE_INFO *const left =
+      is_luma ? xd->left_mbmi : xd->chroma_left_mbmi;
+
+  if (above && is_smooth(above, plane)) return 1;
+  if (left && is_smooth(left, plane)) return 1;
+  return 0;
+}
+
+// Picks the intra edge filter strength (0 = no filtering, up to 3) from the
+// summed block dimensions bs0 + bs1, the magnitude of the angle `delta`, and
+// the filter `type` (0 or non-zero). Larger blocks and larger deltas select
+// stronger filters.
+static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
+  const int d = abs(delta);
+  const int blk_wh = bs0 + bs1;
+
+  if (type == 0) {
+    if (blk_wh <= 8) return d >= 56 ? 1 : 0;
+    if (blk_wh <= 16) return d >= 40 ? 1 : 0;
+    if (blk_wh <= 24) {
+      if (d >= 32) return 3;
+      if (d >= 16) return 2;
+      return d >= 8 ? 1 : 0;
+    }
+    if (blk_wh <= 32) {
+      if (d >= 32) return 3;
+      if (d >= 4) return 2;
+      return d >= 1 ? 1 : 0;
+    }
+    return d >= 1 ? 3 : 0;
+  }
+
+  if (blk_wh <= 8) {
+    if (d >= 64) return 2;
+    return d >= 40 ? 1 : 0;
+  }
+  if (blk_wh <= 16) {
+    if (d >= 48) return 2;
+    return d >= 20 ? 1 : 0;
+  }
+  if (blk_wh <= 24) return d >= 4 ? 3 : 0;
+  return d >= 1 ? 3 : 0;
+}
+
+// Smooths an intra edge in place with one of three 5-tap kernels.
+//   p        - edge samples; p[0] is left untouched, p[1..sz-1] are filtered
+//   sz       - number of samples in p; must fit the local scratch copy
+//   strength - 0 disables filtering; otherwise kernel[strength - 1] is used
+// Taps that would fall outside [0, sz - 1] reuse the nearest end sample.
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+                                                         { 0, 5, 6, 5, 0 },
+                                                         { 2, 4, 4, 4, 2 } };
+  const int filt = strength - 1;
+  uint8_t edge[129];
+
+  // The unfiltered edge is snapshotted into a fixed-size stack buffer and the
+  // kernel is selected by strength; catch out-of-range arguments in debug
+  // builds instead of silently overrunning either array.
+  assert(sz >= 0 && sz <= (int)(sizeof(edge) / sizeof(edge[0])));
+  assert(strength > 0 && strength <= INTRA_EDGE_FILT);
+
+  memcpy(edge, p, sz * sizeof(*p));
+  for (int i = 1; i < sz; i++) {
+    int s = 0;
+    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+      // Clamp the tap position to the valid range of the snapshot.
+      int k = i - 2 + j;
+      k = (k < 0) ? 0 : k;
+      k = (k > sz - 1) ? sz - 1 : k;
+      s += edge[k] * kernel[filt][j];
+    }
+    // Each kernel sums to 16: round and normalise.
+    s = (s + 8) >> 4;
+    p[i] = s;
+  }
+}
+
+// Smooths the shared above-left corner sample with a [5, 6, 5] / 16 filter
+// across p_left[0], p_above[-1] and p_above[0], and stores the result in both
+// p_above[-1] and p_left[-1].
+static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
+  const int weighted = 5 * (p_left[0] + p_above[0]) + 6 * p_above[-1];
+  const uint8_t corner = (uint8_t)((weighted + 8) >> 4);
+
+  p_above[-1] = corner;
+  p_left[-1] = corner;
+}
+
+// Doubles the resolution of an intra edge in place. On return the original
+// samples p[0..sz-1] sit at the even positions p[0], p[2], ..., the
+// interpolated half-sample values at the odd positions p[-1], p[1], ..., and
+// p[-2] holds a copy of the original p[-1].
+void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  // Snapshot p[-1..sz-1] with the first and last samples repeated once, so
+  // the 4-tap filter below always has two neighbours on each side.
+  // src[i] mirrors the original p[i].
+  uint8_t in[MAX_UPSAMPLE_SZ + 3];
+  uint8_t *const src = in + 2;
+  src[-2] = p[-1];
+  src[-1] = p[-1];
+  for (int i = 0; i < sz; i++) src[i] = p[i];
+  src[sz] = p[sz - 1];
+
+  p[-2] = src[-2];
+  for (int i = 0; i < sz; i++) {
+    // Half-sample position between src[i - 1] and src[i]: (-1, 9, 9, -1) / 16.
+    const int sum = 9 * (src[i - 1] + src[i]) - (src[i - 2] + src[i + 1]);
+    p[2 * i - 1] = clip_pixel((sum + 8) >> 4);
+    p[2 * i] = src[i];
+  }
+}
+
+// Builds the 8-bit prediction for directional and filter-intra modes.
+// Copies the available neighbours of `ref` into local edge buffers
+// (above_row / left_col), pads whatever is missing, optionally applies the
+// intra edge filter and edge upsampling, and finally calls
+// av1_filter_intra_predictor() or dr_predictor().
+// n_top_px / n_left_px are the numbers of available pixels directly above /
+// to the left. n_topright_px / n_bottomleft_px may be -1, in which case the
+// top-right / bottom-left extension is not included in the edge at all.
+static void build_directional_and_filter_intra_predictors(
+ const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
+ PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
+ TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+ int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
+ int i;
+ const uint8_t *above_ref = ref - ref_stride;
+ const uint8_t *left_ref = ref - 1;
+ DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ // The edges start 16 entries into their buffers so that negative indices
+ // (e.g. above_row[-1], the above-left sample) stay inside the arrays.
+ uint8_t *const above_row = above_data + 16;
+ uint8_t *const left_col = left_data + 16;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ assert(use_filter_intra || is_dr_mode);
+ // The left_data, above_data buffers must be fully initialized (here to the
+ // default edge values 129 and 127) to fix some intermittent valgrind errors.
+ // Uninitialized reads in intra pred modules (e.g. width = 4 path in
+ // av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to be the
+ // potential reason for this issue.
+ memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
+ memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
+
+ // The default values if ref pixels are not available:
+ // 128 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
+
+ // For directional modes the required edges follow from the angle rather than
+ // from the extend_modes[] table.
+ if (is_dr_mode) {
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= -1);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= -1);
+
+ // Flat-fill shortcut: only one edge is needed and it has no available
+ // pixels. The block is filled with the first pixel of the other edge when
+ // that one exists, otherwise with the default value (129 or 127).
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : 129;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : 127;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ memset(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+ const int num_left_pixels_needed =
+ txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ if (n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ // Pad the remainder by repeating the last pixel that was copied.
+ if (i < num_left_pixels_needed)
+ memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else if (n_top_px > 0) {
+ // No left pixels at all: substitute the first above pixel.
+ memset(left_col, above_ref[0], num_left_pixels_needed);
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+ const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px);
+ i = n_top_px;
+ if (n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+ i += n_topright_px;
+ }
+ // Pad the remainder by repeating the last pixel that was copied.
+ if (i < num_top_pixels_needed)
+ memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+ } else if (n_left_px > 0) {
+ // No above pixels at all: substitute the first left pixel.
+ memset(above_row, left_ref[0], num_top_pixels_needed);
+ }
+ }
+
+ // Above-left corner sample, shared by both edges at index -1.
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = 128;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {
+ av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode);
+ return;
+ }
+
+ assert(is_dr_mode);
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ // Edge smoothing is skipped for the exactly vertical / horizontal angles.
+ if (p_angle != 90 && p_angle != 180) {
+ const int ab_le = need_above_left ? 1 : 0;
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {
+ filter_intra_edge_corner(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+ intra_edge_filter_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_upsample_intra_edge(above_row, n_px);
+ }
+ upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+ intra_edge_filter_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_upsample_intra_edge(left_col, n_px);
+ }
+ }
+ dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
+ upsample_left, p_angle);
+}
+
+// This function generates the pred data of a given block for non-directional
+// intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH).
+// n_top_px / n_left_px are the numbers of available pixels directly above /
+// to the left of `ref`; missing pixels are padded before the predictor from
+// dc_pred[][][] or pred[][] is called.
+static void build_non_directional_intra_predictors(
+ const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
+ PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) {
+ const uint8_t *above_ref = ref - ref_stride;
+ const uint8_t *left_ref = ref - 1;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ const int need_left = extend_modes[mode] & NEED_LEFT;
+ const int need_above = extend_modes[mode] & NEED_ABOVE;
+ const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ int i = 0;
+ assert(n_top_px >= 0);
+ assert(n_left_px >= 0);
+ assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
+ mode == SMOOTH_H_PRED || mode == PAETH_PRED);
+
+ // Flat-fill shortcut: one edge is not needed and the other has no available
+ // pixels. The block is filled with a single value and no predictor runs.
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val = 0;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : 129;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : 127;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ memset(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ // The edges start 16 entries into their buffers so that index -1 (the
+ // above-left sample) stays inside the arrays.
+ uint8_t *const above_row = above_data + 16;
+ uint8_t *const left_col = left_data + 16;
+
+ if (need_left) {
+ // Default left value is 129; real pixels overwrite it where available.
+ memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS);
+ if (n_left_px > 0) {
+ for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ // Pad the rest of the column by repeating the last available pixel.
+ if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i);
+ } else if (n_top_px > 0) {
+ memset(left_col, above_ref[0], txhpx);
+ }
+ }
+
+ if (need_above) {
+ // Default above value is 127; real pixels overwrite it where available.
+ memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px);
+ i = n_top_px;
+ // Pad the rest of the row by repeating the last available pixel.
+ if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i);
+ } else if (n_left_px > 0) {
+ memset(above_row, left_ref[0], txwpx);
+ }
+ }
+
+ // Above-left corner sample, shared by both edges at index -1.
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = 128;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ // DC_PRED picks its variant by which edges are actually available.
+ if (mode == DC_PRED) {
+ dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
+ left_col);
+ } else {
+ pred[mode][tx_size](dst, dst_stride, above_row, left_col);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Smooths a high bit-depth intra edge in place with one of three 5-tap
+// kernels.
+//   p        - edge samples; p[0] is left untouched, p[1..sz-1] are filtered
+//   sz       - number of samples in p; must fit the local scratch copy
+//   strength - 0 disables filtering; otherwise kernel[strength - 1] is used
+// Taps that would fall outside [0, sz - 1] reuse the nearest end sample.
+void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 },
+                                                         { 0, 5, 6, 5, 0 },
+                                                         { 2, 4, 4, 4, 2 } };
+  const int filt = strength - 1;
+  uint16_t edge[129];
+
+  // The unfiltered edge is snapshotted into a fixed-size stack buffer and the
+  // kernel is selected by strength; catch out-of-range arguments in debug
+  // builds instead of silently overrunning either array.
+  assert(sz >= 0 && sz <= (int)(sizeof(edge) / sizeof(edge[0])));
+  assert(strength > 0 && strength <= INTRA_EDGE_FILT);
+
+  memcpy(edge, p, sz * sizeof(*p));
+  for (int i = 1; i < sz; i++) {
+    int s = 0;
+    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+      // Clamp the tap position to the valid range of the snapshot.
+      int k = i - 2 + j;
+      k = (k < 0) ? 0 : k;
+      k = (k > sz - 1) ? sz - 1 : k;
+      s += edge[k] * kernel[filt][j];
+    }
+    // Each kernel sums to 16: round and normalise.
+    s = (s + 8) >> 4;
+    p[i] = s;
+  }
+}
+
+// Smooths the shared above-left corner sample with a [5, 6, 5] / 16 filter
+// across p_left[0], p_above[-1] and p_above[0], and stores the result in both
+// p_above[-1] and p_left[-1].
+static void highbd_filter_intra_edge_corner(uint16_t *p_above,
+                                            uint16_t *p_left) {
+  const int weighted = 5 * (p_left[0] + p_above[0]) + 6 * p_above[-1];
+  const uint16_t corner = (uint16_t)((weighted + 8) >> 4);
+
+  p_above[-1] = corner;
+  p_left[-1] = corner;
+}
+
+// Doubles the resolution of a high bit-depth intra edge in place. On return
+// the original samples p[0..sz-1] sit at the even positions p[0], p[2], ...,
+// the interpolated half-sample values (clipped to `bd` bits) at the odd
+// positions p[-1], p[1], ..., and p[-2] holds a copy of the original p[-1].
+void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) {
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  // Snapshot p[-1..sz-1] with the first and last samples repeated once, so
+  // the 4-tap filter below always has two neighbours on each side.
+  // src[i] mirrors the original p[i].
+  uint16_t in[MAX_UPSAMPLE_SZ + 3];
+  uint16_t *const src = in + 2;
+  src[-2] = p[-1];
+  src[-1] = p[-1];
+  for (int i = 0; i < sz; i++) src[i] = p[i];
+  src[sz] = p[sz - 1];
+
+  p[-2] = src[-2];
+  for (int i = 0; i < sz; i++) {
+    // Half-sample position between src[i - 1] and src[i]: (-1, 9, 9, -1) / 16.
+    const int sum = 9 * (src[i - 1] + src[i]) - (src[i - 2] + src[i + 1]);
+    p[2 * i - 1] = clip_pixel_highbd((sum + 8) >> 4, bd);
+    p[2 * i] = src[i];
+  }
+}
+
+// Builds the high bit-depth prediction for any intra mode (directional,
+// filter-intra, DC and the remaining pred_high[] modes). `ref8` / `dst8` are
+// converted to 16-bit sample pointers with CONVERT_TO_SHORTPTR(). The
+// available neighbours are copied into local edge buffers (above_row /
+// left_col), missing ones are padded, and the matching predictor is called.
+// n_top_px / n_left_px are the numbers of available pixels directly above /
+// to the left. n_topright_px / n_bottomleft_px may be -1, in which case the
+// top-right / bottom-left extension is not included in the edge at all.
+static void highbd_build_intra_predictors(
+ const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+ PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode,
+ TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+ int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
+ int bit_depth) {
+ int i;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]);
+ // The edges start 16 entries into their buffers so that negative indices
+ // (e.g. above_row[-1], the above-left sample) stay inside the arrays.
+ uint16_t *const above_row = above_data + 16;
+ uint16_t *const left_col = left_data + 16;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+ const uint16_t *left_ref = ref - 1;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ // 128 scaled to the current bit depth; defaults are base, base - 1, base + 1.
+ int base = 128 << (bit_depth - 8);
+ // The left_data, above_data buffers must be fully initialized (here to the
+ // default edge values base + 1 and base - 1) to fix some intermittent
+ // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
+ // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
+ // seen to be the potential reason for this issue.
+ aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+ aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS);
+
+ // The default values if ref pixels are not available:
+ // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base+1 A B .. Y Z
+ // base+1 C D .. W X
+ // base+1 E F .. U V
+ // base+1 G H .. S T T T T T
+
+ // For directional modes the required edges follow from the angle rather than
+ // from the extend_modes[] table.
+ if (is_dr_mode) {
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= -1);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= -1);
+
+ // Flat-fill shortcut: one edge is not needed and the other has no available
+ // pixels. The block is filled with a single value and no predictor runs.
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : base + 1;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : base - 1;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ aom_memset16(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+ const int num_left_pixels_needed =
+ txhpx + (n_bottomleft_px >= 0 ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ if (n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ // Pad the remainder by repeating the last pixel that was copied.
+ if (i < num_left_pixels_needed)
+ aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else if (n_top_px > 0) {
+ // No left pixels at all: substitute the first above pixel.
+ aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+ const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+ i = n_top_px;
+ if (n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx,
+ n_topright_px * sizeof(above_ref[0]));
+ i += n_topright_px;
+ }
+ // Pad the remainder by repeating the last pixel that was copied.
+ if (i < num_top_pixels_needed)
+ aom_memset16(&above_row[i], above_row[i - 1],
+ num_top_pixels_needed - i);
+ } else if (n_left_px > 0) {
+ // No above pixels at all: substitute the first left pixel.
+ aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
+ }
+ }
+
+ // Above-left corner sample, shared by both edges at index -1.
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = base;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {
+ highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode, bit_depth);
+ return;
+ }
+
+ if (is_dr_mode) {
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ // Edge smoothing is skipped for the exactly vertical / horizontal angles.
+ if (p_angle != 90 && p_angle != 180) {
+ const int ab_le = need_above_left ? 1 : 0;
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {
+ highbd_filter_intra_edge_corner(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+ intra_edge_filter_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth);
+ }
+ upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+ intra_edge_filter_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth);
+ }
+ }
+ highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ upsample_above, upsample_left, p_angle, bit_depth);
+ return;
+ }
+
+ // predict
+ // DC_PRED picks its variant by which edges are actually available.
+ if (mode == DC_PRED) {
+ dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+ dst, dst_stride, above_row, left_col, bit_depth);
+ } else {
+ pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Maps a block size that has a 4-sample dimension to a larger block size
+// according to the chroma subsampling flags: a dimension of 4 is raised to 8
+// when it is subsampled. BLOCK_4X4 is the one exception: with horizontal-only
+// subsampling it becomes BLOCK_8X4 and with vertical-only subsampling
+// BLOCK_4X8. All other block sizes are returned unchanged.
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+                                            int subsampling_y) {
+  assert(subsampling_x >= 0 && subsampling_x < 2);
+  assert(subsampling_y >= 0 && subsampling_y < 2);
+  const int ss_x = (subsampling_x == 1);
+  const int ss_y = (subsampling_y == 1);
+
+  switch (bsize) {
+    case BLOCK_4X4:
+      if (ss_x) return ss_y ? BLOCK_8X8 : BLOCK_8X4;
+      return ss_y ? BLOCK_4X8 : BLOCK_4X4;
+    case BLOCK_4X8: return ss_x ? BLOCK_8X8 : BLOCK_4X8;
+    case BLOCK_8X4: return ss_y ? BLOCK_8X8 : BLOCK_8X4;
+    case BLOCK_4X16: return ss_x ? BLOCK_8X16 : BLOCK_4X16;
+    case BLOCK_16X4: return ss_y ? BLOCK_16X8 : BLOCK_16X4;
+    default: return bsize;
+  }
+}
+
+// Predicts one transform block of the given plane into dst.
+//
+// - Palette blocks (use_palette != 0): each of the txwpx x txhpx output samples
+//   is looked up in the palette through the color index map, which is read at
+//   position (x + c, y + r) with a row stride of wpx. Samples are written as
+//   16-bit values when is_cur_buf_hbd(xd) is set, otherwise as 8-bit values.
+// - All other blocks: top/left (and, when the mode needs them, top-right /
+//   bottom-left) neighbor availability is derived and passed, together with
+//   ref/dst, to one of the build_*_intra_predictors() helpers.
+//
+// col_off and row_off are shifted left by MI_SIZE_LOG2 to obtain the sample
+// offsets x and y. filter_intra_mode == FILTER_INTRA_MODES means that filter
+// intra is not used.
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+ int enable_intra_edge_filter, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode,
+ int angle_delta, int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int col_off, int row_off,
+ int plane) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ const int x = col_off << MI_SIZE_LOG2;
+ const int y = row_off << MI_SIZE_LOG2;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ assert(mode < INTRA_MODES);
+
+ if (use_palette) {
+ int r, c;
+ // Luma uses index map 0; every chroma plane shares index map 1.
+ const uint8_t *const map = xd->plane[plane != 0].color_index_map +
+ xd->color_index_map_offset[plane != 0];
+ // The colors of each plane occupy PALETTE_MAX_SIZE consecutive entries.
+ const uint16_t *const palette =
+ mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
+ if (is_hbd) {
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
+ }
+ }
+ } else {
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst[r * dst_stride + c] =
+ (uint8_t)palette[map[(r + y) * wpx + c + x]];
+ }
+ }
+ }
+ return;
+ }
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ // A top (left) neighbor exists when row_off (col_off) is nonzero or when the
+ // availability flag in xd is set; subsampled planes use the chroma_* flags.
+ const int have_top =
+ row_off || (ss_y ? xd->chroma_up_available : xd->up_available);
+ const int have_left =
+ col_off || (ss_x ? xd->chroma_left_available : xd->left_available);
+
+ // Distance between the right edge of this prediction block to
+ // the frame right edge
+ const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx;
+ // Distance between the bottom edge of this prediction block to
+ // the frame bottom edge
+ const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx;
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+
+ // The computations in this function, as well as in build_intra_predictors(),
+ // are generalized for all intra modes. Some of these operations are not
+ // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H,
+ // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a
+ // separate function build_non_directional_intra_predictors() is introduced
+ // for these modes to avoid redundant computations while generating pred data.
+
+ // TODO(aomedia:3532): Enable this refactoring for high bd path as well.
+ if (!is_hbd && !use_filter_intra && !is_dr_mode) {
+ // The last two arguments are the top and left neighbor pixel counts, limited
+ // by the distance to the frame edge (xr, yd); 0 when no neighbor exists.
+ build_non_directional_intra_predictors(
+ ref, ref_stride, dst, dst_stride, mode, tx_size,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0);
+ return;
+ }
+
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ // The block to the right / below must lie inside the current tile; the bottom
+ // one additionally requires yd > 0.
+ const int right_available =
+ mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end;
+ const int bottom_available =
+ (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end);
+
+ const PARTITION_TYPE partition = mbmi->partition;
+
+ BLOCK_SIZE bsize = mbmi->bsize;
+ // force 4x4 chroma component block size.
+ if (ss_x || ss_y) {
+ bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ }
+
+ int p_angle = 0;
+ int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT;
+ int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT;
+
+ if (use_filter_intra) {
+ need_top_right = 0;
+ need_bottom_left = 0;
+ }
+ // For directional modes the final angle decides which extra edges are needed,
+ // overriding the values set above.
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] + angle_delta;
+ need_top_right = p_angle < 90;
+ need_bottom_left = p_angle > 180;
+ }
+
+ // Possible states for have_top_right(TR) and have_bottom_left(BL)
+ // -1 : TR and BL are not needed
+ // 0 : TR and BL are needed but not available
+ // > 0 : TR and BL are needed and pixels are available
+ const int have_top_right =
+ need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top,
+ right_available, partition, tx_size,
+ row_off, col_off, ss_x, ss_y)
+ : -1;
+ const int have_bottom_left =
+ need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col,
+ bottom_available, have_left, partition,
+ tx_size, row_off, col_off, ss_x, ss_y)
+ : -1;
+
+ const int disable_edge_filter = !enable_intra_edge_filter;
+ const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ // Same arguments as the 8-bit call below, plus the bit depth xd->bd.
+ highbd_build_intra_predictors(
+ ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
+ tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
+ intra_edge_filter_type, xd->bd);
+ return;
+ }
+#endif
+ // Available TR / BL pixel counts are limited to xr / yd; the 0 and -1 states
+ // described above are passed through unchanged.
+ build_directional_and_filter_intra_predictors(
+ ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode,
+ tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left,
+ intra_edge_filter_type);
+}
+
+// Predicts the transform block at (blk_col, blk_row) of |plane| in place in
+// the plane's destination buffer (the same pointer is passed as both ref and
+// dst). The mode, palette flag, filter-intra mode and angle delta are read from
+// xd->mi[0]. Chroma blocks coded with UV_CFL_PRED additionally go through
+// av1_cfl_predict_block(), optionally reusing a cached DC prediction.
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ // blk_row / blk_col are converted to sample offsets with MI_SIZE_LOG2.
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ const PREDICTION_MODE mode =
+ (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+ const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
+ // Filter intra is only considered for the luma plane; FILTER_INTRA_MODES
+ // marks it as unused.
+ const FILTER_INTRA_MODE filter_intra_mode =
+ (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
+ ? mbmi->filter_intra_mode_info.filter_intra_mode
+ : FILTER_INTRA_MODES;
+ const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+ const SequenceHeader *seq_params = cm->seq_params;
+
+ if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
+#if CONFIG_DEBUG
+ assert(is_cfl_allowed(xd));
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+ (void)plane_bsize;
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ // Unless the segment is lossless, the transform block is expected to cover
+ // the whole plane block.
+ if (!xd->lossless[mbmi->segment_id]) {
+ assert(blk_col == 0);
+ assert(blk_row == 0);
+ assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+ assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+ }
+#endif
+ CFL_CTX *const cfl = &xd->cfl;
+ CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
+ if (!cfl->dc_pred_is_cached[pred_plane]) {
+ av1_predict_intra_block(xd, seq_params->sb_size,
+ seq_params->enable_intra_edge_filter, pd->width,
+ pd->height, tx_size, mode, angle_delta,
+ use_palette, filter_intra_mode, dst, dst_stride,
+ dst, dst_stride, blk_col, blk_row, plane);
+ // When caching is enabled, store the fresh prediction and mark it cached
+ // so that later calls take the load path below.
+ if (cfl->use_dc_pred_cache) {
+ cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
+ cfl->dc_pred_is_cached[pred_plane] = true;
+ }
+ } else {
+ cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
+ }
+ av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
+ return;
+ }
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+ pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode,
+ dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane);
+}
+
+// Runs init_intra_predictors_internal() through aom_once().
+// NOTE(review): presumably aom_once() guarantees the initializer executes only
+// once even if this is called repeatedly -- confirm against aom_once().
+void av1_init_intra_predictors(void) {
+ aom_once(init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
new file mode 100644
index 0000000000..fa66ccd541
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
+
+#include <stdlib.h>
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size);
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+ int enable_intra_edge_filter, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode,
+ int angle_delta, int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int col_off, int row_off,
+ int plane);
+
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+ II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED,
+ II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+// Returns 1 when |mode| lies in the inclusive range [V_PRED, D67_PRED] and 0
+// otherwise.
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
+  if (mode < V_PRED) return 0;
+  return mode <= D67_PRED;
+}
+
+// Returns 1 when |mode| lies in the inclusive range [D45_PRED, D67_PRED] and 0
+// otherwise.
+static INLINE int av1_is_diagonal_mode(PREDICTION_MODE mode) {
+  const int out_of_range = (mode < D45_PRED || mode > D67_PRED);
+  return !out_of_range;
+}
+
+// Returns 1 when |bsize| is numerically greater than or equal to BLOCK_8X8 and
+// 0 otherwise.
+static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
+  const int at_least_8x8 = (bsize >= BLOCK_8X8);
+  return at_least_8x8;
+}
+
+// Returns 1 only when the frame is intra-only and both the
+// allow_screen_content_tools and allow_intrabc feature flags are set. The
+// conditions are checked in that order.
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
+  if (!frame_is_intra_only(cm)) return 0;
+  if (!cm->features.allow_screen_content_tools) return 0;
+  return cm->features.allow_intrabc != 0;
+}
+
+// Returns 1 when filter intra is enabled in the sequence parameters, |bs| is
+// not BLOCK_INVALID, and the block is at most 32 samples wide and 32 samples
+// high; returns 0 otherwise.
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+                                                 BLOCK_SIZE bs) {
+  if (!cm->seq_params->enable_filter_intra) return 0;
+  if (bs == BLOCK_INVALID) return 0;
+
+  const int width_ok = block_size_wide[bs] <= 32;
+  const int height_ok = block_size_high[bs] <= 32;
+  return width_ok && height_ok;
+}
+
+// Returns 1 when the block described by |mbmi| uses DC_PRED, has no luma
+// palette (palette_size[0] == 0), and its block size passes
+// av1_filter_intra_allowed_bsize(); returns 0 otherwise.
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+                                           const MB_MODE_INFO *mbmi) {
+  if (mbmi->mode != DC_PRED) return 0;
+  if (mbmi->palette_mode_info.palette_size[0] != 0) return 0;
+  return av1_filter_intra_allowed_bsize(cm, mbmi->bsize) != 0;
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+static const int16_t dr_intra_derivative[90] = {
+ // More evenly spread out angles and limited to 10-bit
+ // Values that are 0 will never be used
+ // Approx angle
+ 0, 0, 0, //
+ 1023, 0, 0, // 3, ...
+ 547, 0, 0, // 6, ...
+ 372, 0, 0, 0, 0, // 9, ...
+ 273, 0, 0, // 14, ...
+ 215, 0, 0, // 17, ...
+ 178, 0, 0, // 20, ...
+ 151, 0, 0, // 23, ... (113 & 203 are base angles)
+ 132, 0, 0, // 26, ...
+ 116, 0, 0, // 29, ...
+ 102, 0, 0, 0, // 32, ...
+ 90, 0, 0, // 36, ...
+ 80, 0, 0, // 39, ...
+ 71, 0, 0, // 42, ...
+ 64, 0, 0, // 45, ... (45 & 135 are base angles)
+ 57, 0, 0, // 48, ...
+ 51, 0, 0, // 51, ...
+ 45, 0, 0, 0, // 54, ...
+ 40, 0, 0, // 58, ...
+ 35, 0, 0, // 61, ...
+ 31, 0, 0, // 64, ...
+ 27, 0, 0, // 67, ... (67 & 157 are base angles)
+ 23, 0, 0, // 70, ...
+ 19, 0, 0, // 73, ...
+ 15, 0, 0, 0, 0, // 76, ...
+ 11, 0, 0, // 81, ...
+ 7, 0, 0, // 84, ...
+ 3, 0, 0, // 87, ...
+};
+
+// Returns the shift in X per unit change in Y for the prediction angle |angle|
+// (in degrees), looked up in dr_intra_derivative[]:
+//   0 < angle < 90:    dr_intra_derivative[angle]
+//   90 < angle < 180:  dr_intra_derivative[180 - angle]
+//   otherwise:         1 (dx is not used for these angles, so any value works)
+// NOTE(review): the table entries look scaled by 64 rather than 256 (the
+// 45-degree entry is 64) -- confirm against the directional predictors.
+static INLINE int av1_get_dx(int angle) {
+  if (angle > 90 && angle < 180) return dr_intra_derivative[180 - angle];
+  if (angle > 0 && angle < 90) return dr_intra_derivative[angle];
+  // dx is not really going to be used in this case; any value may be returned.
+  return 1;
+}
+
+// Returns the shift in Y per unit change in X for the prediction angle |angle|
+// (in degrees), looked up in dr_intra_derivative[]:
+//   90 < angle < 180:   dr_intra_derivative[angle - 90]
+//   180 < angle < 270:  dr_intra_derivative[270 - angle]
+//   otherwise:          1 (dy is not used for these angles, so any value works)
+// NOTE(review): the table entries look scaled by 64 rather than 256 (the
+// 45-degree entry is 64) -- confirm against the directional predictors.
+static INLINE int av1_get_dy(int angle) {
+  if (angle > 180 && angle < 270) return dr_intra_derivative[270 - angle];
+  if (angle > 90 && angle < 180) return dr_intra_derivative[angle - 90];
+  // dy is not really going to be used in this case; any value may be returned.
+  return 1;
+}
+
+// Decides whether the intra edge is upsampled. Returns 0 when |delta| is 0 or
+// when |delta| is 40 or more in magnitude. Otherwise returns 1 if bs0 + bs1
+// does not exceed the limit selected by |type| (8 when type is nonzero, 16 when
+// it is zero), and 0 if it does.
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+                                              int type) {
+  const int abs_delta = abs(delta);
+  if (abs_delta == 0 || abs_delta >= 40) return 0;
+  const int size_limit = type ? 8 : 16;
+  return (bs0 + bs1) <= size_limit;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
new file mode 100644
index 0000000000..1b348836a5
--- /dev/null
+++ b/third_party/aom/av1/common/resize.c
@@ -0,0 +1,1452 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = {
+ { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 },
+ { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 },
+ { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 },
+ { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 },
+ { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 },
+ { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 },
+ { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 },
+ { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 },
+ { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 },
+ { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 },
+ { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 },
+ { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 },
+ { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 },
+ { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 },
+ { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 },
+ { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 },
+ { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 },
+ { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 },
+ { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 },
+ { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 },
+ { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 },
+ { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 },
+ { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 },
+ { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 },
+ { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 },
+ { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 },
+ { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 },
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = {
+ { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 },
+ { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 },
+ { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 },
+ { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 },
+ { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 },
+ { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 },
+ { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 },
+ { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 },
+ { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 },
+ { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 },
+ { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 },
+ { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 },
+ { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 },
+ { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 },
+ { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 },
+ { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 },
+ { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 },
+ { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 },
+ { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 },
+ { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 },
+ { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 },
+ { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 },
+ { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 },
+ { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 },
+ { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 },
+ { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 },
+ { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 },
+ { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 },
+ { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 },
+ { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 },
+ { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 },
+ { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 },
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = {
+ { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 },
+ { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 },
+ { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 },
+ { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 },
+ { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 },
+ { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 },
+ { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 },
+ { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 },
+ { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 },
+ { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 },
+ { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 },
+ { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 },
+ { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 },
+ { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 },
+ { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 },
+ { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 },
+ { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 },
+ { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 },
+ { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 },
+ { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 },
+ { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 },
+ { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 },
+ { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 },
+ { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 },
+ { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 },
+ { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 },
+ { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 },
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
+ { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 },
+ { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 },
+ { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 },
+ { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 },
+ { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 },
+ { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 },
+ { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 },
+ { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 },
+ { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 },
+ { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 },
+ { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 },
+ { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 },
+ { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 },
+ { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 },
+ { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 },
+ { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 },
+ { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 },
+ { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 },
+ { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 },
+ { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 },
+ { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 },
+ { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 },
+ { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 },
+ { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 },
+ { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 },
+ { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 },
+ { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 },
+ { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 },
+ { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 },
+ { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 },
+ { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 },
+ { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 },
+};
+
+const int16_t av1_resize_filter_normative[(
+ 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
+#if UPSCALE_NORMATIVE_TAPS == 8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
+ { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
+ { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
+ { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
+ { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
+ { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
+ { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
+ { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
+ { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
+ { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
+ { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
+ { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
+ { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
+ { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
+ { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
+ { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
+ { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
+ { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
+ { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
+ { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
+ { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
+ { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
+ { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
+ { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
+ { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
+ { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
+ { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
+ { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
+ { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
+ { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
+#else
+#error "Invalid value of UPSCALE_NORMATIVE_TAPS"
+#endif // UPSCALE_NORMATIVE_TAPS == 8
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
+// Filters for factor of 2 downsampling.
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
+
+// Selects the interpolation filter bank from the ratio out_length / in_length,
+// compared in sixteenths: >= 16/16 picks the full-band filters, >= 13/16 the
+// 0.875-band, >= 11/16 the 0.75-band, >= 9/16 the 0.625-band, and anything
+// smaller the 0.5-band filters.
+static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
+  const int scaled_out = out_length * 16;
+  if (scaled_out >= in_length * 16) return filteredinterp_filters1000;
+  if (scaled_out >= in_length * 13) return filteredinterp_filters875;
+  if (scaled_out >= in_length * 11) return filteredinterp_filters750;
+  if (scaled_out >= in_length * 9) return filteredinterp_filters625;
+  return filteredinterp_filters500;
+}
+
+// Resamples the in_length samples of |input| into out_length samples of
+// |output| using the filter bank |interp_filters|, which is laid out as
+// consecutive kernels of interp_taps coefficients indexed by sub-pixel phase.
+//
+// delta is the rounded input step per output sample and offset the initial
+// position, both in units of 1 / (1 << RS_SCALE_SUBPEL_BITS) input samples.
+// Output samples [0, x1) need clamping at the start of the input, samples
+// (x2, out_length) need clamping at the end, and samples [x1, x2] read the
+// input without clamping; if x1 > x2 every sample is computed with both clamps.
+static void interpolate_core(const uint8_t *const input, int in_length,
+ uint8_t *output, int out_length,
+ const int16_t *interp_filters, int interp_taps) {
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ // Positive when downscaling, negative when upscaling.
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint8_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ // x1: first output index whose leftmost filter tap is inside the input.
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ // x2: last output index whose rightmost filter tap is inside the input.
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ // No output sample can be filtered without clamping: clamp on both sides.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // End part.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ }
+}
+
+// Resamples |input| (in_length samples) into |output| (out_length samples)
+// with the SUBPEL_TAPS-tap filter bank that choose_interp_filter() selects for
+// this pair of lengths.
+static void interpolate(const uint8_t *const input, int in_length,
+                        uint8_t *output, int out_length) {
+  const InterpKernel *const kernels =
+      choose_interp_filter(in_length, out_length);
+  // interpolate_core() takes the bank as a flat coefficient array.
+  const int16_t *const coeffs = &kernels[0][0];
+
+  interpolate_core(input, in_length, output, out_length, coeffs, SUBPEL_TAPS);
+}
+
+// Returns in_length / out_length as a fixed-point value with
+// RS_SCALE_SUBPEL_BITS fractional bits, rounded to nearest by adding
+// out_length / 2 before the division.
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
+  const int scaled_in = in_length << RS_SCALE_SUBPEL_BITS;
+  const int rounding = out_length / 2;
+  return (scaled_in + rounding) / out_length;
+}
+
+// Returns the initial sub-pixel position for scaling in_length samples to
+// out_length samples with step x_step_qn. err is the difference between the
+// total distance covered by out_length steps and the scaled input length; half
+// of it is subtracted from the start position. Only the bits selected by
+// RS_SCALE_SUBPEL_MASK are kept in the result.
+static int32_t get_upscale_convolve_x0(int in_length, int out_length,
+ int32_t x_step_qn) {
+ const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
+ const int32_t x0 =
+ (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length +
+ RS_SCALE_EXTRA_OFF - err / 2;
+ // The cast to uint32_t makes the masking well defined when x0 is negative.
+ return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
+}
+
+// Downsamples |input| (|length| samples) by a factor of 2 into |output| using
+// the symmetric even-length filter whose half is av1_down2_symeven_half_filter.
+// One output sample is produced for every second input position, i.e.
+// (length + 1) / 2 samples in total. Taps that would fall outside the input are
+// clamped to the first / last input sample.
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  // Actual filter len = 2 * filter_len_half.
+  const int16_t *filter = av1_down2_symeven_half_filter;
+  // Count elements instead of dividing the byte size by a hard-coded 2, so the
+  // tap count stays correct if the coefficient type ever changes.
+  const int filter_len_half = (int)(sizeof(av1_down2_symeven_half_filter) /
+                                    sizeof(av1_down2_symeven_half_filter[0]));
+  int i, j;
+  uint8_t *optr = output;
+  // [l1, l2) is the range of input positions whose taps all lie inside the
+  // input; both bounds are rounded up to even because i advances in steps of 2.
+  int l1 = filter_len_half;
+  int l2 = (length - filter_len_half);
+  l1 += (l1 & 1);
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: clamp on both sides for every output sample.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // rounding offset
+      for (j = 0; j < filter_len_half; ++j) {
+        sum +=
+            (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: only the left taps can fall before the input.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: no clamping needed.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: only the right taps can fall past the input.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum +=
+            (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+// Downsamples a row by 2 with the symmetric odd-length kernel whose centre and
+// one-sided coefficients are av1_down2_symodd_half_filter. One output pixel is
+// written for every even input position i, filtered around input[i]. Taps that
+// would fall outside [0, length) reuse the nearest edge sample.
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  // taps[0] is the centre tap; the full kernel has 2 * half_taps - 1
+  // coefficients.
+  const int16_t *const taps = av1_down2_symodd_half_filter;
+  const int half_taps = sizeof(av1_down2_symodd_half_filter) / 2;
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint8_t *dst = output;
+  // Positions below head_end are clamped on the left, positions from
+  // tail_start on are clamped on the right. Both bounds are rounded up to
+  // even because positions advance in steps of 2.
+  int head_end = half_taps - 1;
+  int tail_start = length - half_taps + 1;
+  head_end += head_end & 1;
+  tail_start += tail_start & 1;
+  int pos = 0;
+
+  if (head_end > tail_start) {
+    // Short row: clamp on both sides for every position.
+    for (; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (int t = 1; t < half_taps; ++t) {
+        const int lo = AOMMAX(pos - t, 0);
+        const int hi = AOMMIN(pos + t, last);
+        acc += (input[lo] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    return;
+  }
+
+  // Head: only the left side is clamped.
+  for (; pos < head_end; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[AOMMAX(pos - t, 0)] + input[pos + t]) * taps[t];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Interior: no clamping.
+  for (; pos < tail_start; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[pos + t]) * taps[t];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Tail: only the right side is clamped.
+  for (; pos < length; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[AOMMIN(pos + t, last)]) * taps[t];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+}
+
+// Returns the length left after halving `length` (rounding up) `steps` times.
+// A non-positive `steps` leaves the length unchanged.
+static int get_down2_length(int length, int steps) {
+  for (int left = steps; left > 0; --left) {
+    length = (length + 1) >> 1;
+  }
+  return length;
+}
+
+// Returns how many times in_length can be halved (rounding up) while the
+// halved length is still at least out_length.
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  for (;;) {
+    const int halved = get_down2_length(in_length, 1);
+    if (halved < out_length) break;
+    ++steps;
+    in_length = halved;
+    // Special case: once the length reaches 1, every further call to
+    // get_down2_length() will be with length == 1 and return 1 again, so
+    // stop here to avoid an infinite loop.
+    if (in_length == 1) break;
+  }
+  return steps;
+}
+
+// Resizes `length` samples from input into `olength` samples in output.
+// Equal lengths are copied. Otherwise the row is halved get_down2_steps()
+// times with the down2 filters and any remaining ratio is handled by
+// interpolate(). otmp is scratch for the intermediate halving stages, which
+// alternate between otmp and otmp + get_down2_length(length, 1).
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *otmp) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+
+  const int steps = get_down2_steps(length, olength);
+  if (steps <= 0) {
+    // No halving stage applies: resample directly.
+    interpolate(input, length, output, olength);
+    return;
+  }
+
+  assert(otmp != NULL);
+  uint8_t *const otmp2 = otmp + get_down2_length(length, 1);
+  const uint8_t *stage_in = input;
+  uint8_t *stage_out = NULL;
+  int cur_length = length;
+
+  for (int s = 0; s < steps; ++s) {
+    const int next_length = get_down2_length(cur_length, 1);
+    // The last stage writes straight to output when it lands on olength.
+    if (s == steps - 1 && next_length == olength) {
+      stage_out = output;
+    } else {
+      stage_out = (s & 1) ? otmp2 : otmp;
+    }
+    if (cur_length & 1) {
+      down2_symodd(stage_in, cur_length, stage_out);
+    } else {
+      down2_symeven(stage_in, cur_length, stage_out);
+    }
+    stage_in = stage_out;
+    cur_length = next_length;
+  }
+
+  if (cur_length != olength) {
+    interpolate(stage_out, cur_length, output, olength);
+  }
+}
+
+// Gathers `len` pixels spaced `stride` apart, starting at img, into the
+// contiguous array arr.
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+  uint8_t *src = img;
+  uint8_t *dst = arr;
+  for (int left = len; left > 0; --left) {
+    *dst++ = *src;
+    src += stride;
+  }
+}
+
+// Scatters the `len` contiguous pixels of arr to positions spaced `stride`
+// apart, starting at img.
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+  uint8_t *dst = img;
+  uint8_t *src = arr;
+  for (int left = len; left > 0; --left) {
+    *dst = *src++;
+    dst += stride;
+  }
+}
+
+// Resizes an 8-bit plane of height x width pixels (row pitch in_stride) into
+// height2 x width2 pixels (row pitch out_stride). Rows are resized first into
+// an intermediate buffer, then each column of that buffer is resized.
+// Returns false if any temporary buffer cannot be allocated, in which case
+// output is not written; returns true otherwise.
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride) {
+  bool ok = false;
+  // height rows of width2 horizontally resized pixels.
+  uint8_t *const rows_resized =
+      (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
+  // Scratch handed to resize_multistep() for both passes.
+  uint8_t *const scratch =
+      (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height));
+  // One column before and after vertical resizing.
+  uint8_t *const col_in = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
+  uint8_t *const col_out = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
+
+  if (rows_resized != NULL && scratch != NULL && col_in != NULL &&
+      col_out != NULL) {
+    assert(width > 0);
+    assert(height > 0);
+    assert(width2 > 0);
+    assert(height2 > 0);
+    // Horizontal pass.
+    for (int row = 0; row < height; ++row) {
+      resize_multistep(input + in_stride * row, width,
+                       rows_resized + width2 * row, width2, scratch);
+    }
+    // Vertical pass, one column at a time.
+    for (int col = 0; col < width2; ++col) {
+      fill_col_to_arr(rows_resized + col, width2, height, col_in);
+      resize_multistep(col_in, height, col_out, height2, scratch);
+      fill_arr_to_col(output + col, out_stride, height2, col_out);
+    }
+    ok = true;
+  }
+
+  aom_free(rows_resized);
+  aom_free(scratch);
+  aom_free(col_in);
+  aom_free(col_out);
+  return ok;
+}
+
+// Horizontally upscales a width x height rectangle of 8-bit pixels into a
+// width2 x height2 rectangle (height2 must equal height) with
+// av1_convolve_horiz_rs() and the av1_resize_filter_normative kernels.
+// x_step_qn and x0_qn are the fixed-point step and start position passed to
+// the convolution. When pad_left / pad_right is non-zero, the border columns
+// just outside that side of the rectangle are temporarily replaced with the
+// row's edge pixel and restored before returning.
+// Returns false, leaving the source untouched, if a save buffer cannot be
+// allocated; returns true otherwise.
+static bool upscale_normative_rect(const uint8_t *const input, int height,
+                                   int width, int in_stride, uint8_t *output,
+                                   int height2, int width2, int out_stride,
+                                   int x_step_qn, int x0_qn, int pad_left,
+                                   int pad_right) {
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  uint8_t *tmp_left = NULL;
+  uint8_t *tmp_right = NULL;
+  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
+  uint8_t *const in_tr = (uint8_t *)(input + width);
+
+  // Allocate every save buffer before modifying the source, so that an
+  // allocation failure cannot leave an overwritten border behind.
+  if (pad_left) {
+    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (!tmp_left) return false;
+  }
+  if (pad_right) {
+    tmp_right =
+        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (!tmp_right) {
+      aom_free(tmp_left);
+      return false;
+    }
+  }
+
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
+      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
+    }
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
+      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
+             border_cols);
+    }
+  }
+
+  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
+                        height2, &av1_resize_filter_normative[0][0], x0_qn,
+                        x_step_qn);
+
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
+    }
+    aom_free(tmp_right);
+  }
+  return true;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_interpolate_core(const uint16_t *const input, int in_length,
+ uint16_t *output, int out_length, int bd,
+ const int16_t *interp_filters,
+ int interp_taps) { /*
+ * Resamples in_length 16-bit samples from input into out_length samples in
+ * output. Every output sample is an interp_taps-tap filter applied around a
+ * fixed-point source position y; the filter row is selected from
+ * interp_filters by the sub-pixel phase of y, and the sum is passed through
+ * ROUND_POWER_OF_TWO(sum, FILTER_BITS) and clip_pixel_highbd(..., bd).
+ * Tap indices outside [0, in_length) are clamped to the nearest valid
+ * sample. The output range is split into three parts so that only the parts
+ * that can reach past an edge pay for clamping.
+ */
+ const int32_t delta =  // source step per output sample (fixed point)
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ const int32_t offset =  // source position of output sample 0, before RS_SCALE_EXTRA_OFF is added
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ x = 0;  // find x1: first output index whose leftmost tap index is >= 0
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = out_length - 1;  // find x2: last output index with int_pel + interp_taps / 2 < in_length
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {  // no clamp-free interior: clamp every tap on both sides
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {  // clamp low side only
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {  // no clamping
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ for (; x < out_length; ++x, y += delta) {  // clamp high side only
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
+// Resamples in_length 16-bit samples into out_length samples using the
+// SUBPEL_TAPS-tap kernel set that choose_interp_filter() returns for this
+// length pair.
+static void highbd_interpolate(const uint16_t *const input, int in_length,
+                               uint16_t *output, int out_length, int bd) {
+  const InterpKernel *const kernels =
+      choose_interp_filter(in_length, out_length);
+  const int16_t *const coeffs = &kernels[0][0];
+
+  highbd_interpolate_core(input, in_length, output, out_length, bd, coeffs,
+                          SUBPEL_TAPS);
+}
+
+// High-bitdepth counterpart of down2_symeven(): downsamples a row by 2 with
+// the symmetric even-length kernel whose one-sided coefficients are
+// av1_down2_symeven_half_filter. One output sample is written for every even
+// input position i, filtered around the pair (i, i + 1). Taps outside
+// [0, length) reuse the nearest edge sample; results are clipped with
+// clip_pixel_highbd(..., bd).
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // One side of the kernel; the full kernel has 2 * half_taps coefficients.
+  const int16_t *const taps = av1_down2_symeven_half_filter;
+  const int half_taps = sizeof(av1_down2_symeven_half_filter) / 2;
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint16_t *dst = output;
+  // Positions below head_end are clamped on the left, positions from
+  // tail_start on are clamped on the right. Both bounds are rounded up to
+  // even because positions advance in steps of 2.
+  int head_end = half_taps;
+  int tail_start = length - half_taps;
+  head_end += head_end & 1;
+  tail_start += tail_start & 1;
+  int pos = 0;
+
+  if (head_end > tail_start) {
+    // Short row: clamp on both sides for every position.
+    for (; pos < length; pos += 2) {
+      int acc = rounding;
+      for (int t = 0; t < half_taps; ++t) {
+        const int lo = AOMMAX(0, pos - t);
+        const int hi = AOMMIN(pos + 1 + t, last);
+        acc += (input[lo] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+    }
+    return;
+  }
+
+  // Head: only the left side is clamped.
+  for (; pos < head_end; pos += 2) {
+    int acc = rounding;
+    for (int t = 0; t < half_taps; ++t) {
+      acc += (input[AOMMAX(0, pos - t)] + input[pos + 1 + t]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Interior: no clamping.
+  for (; pos < tail_start; pos += 2) {
+    int acc = rounding;
+    for (int t = 0; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[pos + 1 + t]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Tail: only the right side is clamped.
+  for (; pos < length; pos += 2) {
+    int acc = rounding;
+    for (int t = 0; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[AOMMIN(pos + 1 + t, last)]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+}
+
+// High-bitdepth counterpart of down2_symodd(): downsamples a row by 2 with the
+// symmetric odd-length kernel whose centre and one-sided coefficients are
+// av1_down2_symodd_half_filter. One output sample is written for every even
+// input position i, filtered around input[i]. Taps outside [0, length) reuse
+// the nearest edge sample; results are clipped with
+// clip_pixel_highbd(..., bd).
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                                uint16_t *output, int bd) {
+  // taps[0] is the centre tap; the full kernel has 2 * half_taps - 1
+  // coefficients.
+  const int16_t *const taps = av1_down2_symodd_half_filter;
+  const int half_taps = sizeof(av1_down2_symodd_half_filter) / 2;
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint16_t *dst = output;
+  // Positions below head_end are clamped on the left, positions from
+  // tail_start on are clamped on the right. Both bounds are rounded up to
+  // even because positions advance in steps of 2.
+  int head_end = half_taps - 1;
+  int tail_start = length - half_taps + 1;
+  head_end += head_end & 1;
+  tail_start += tail_start & 1;
+  int pos = 0;
+
+  if (head_end > tail_start) {
+    // Short row: clamp on both sides for every position.
+    for (; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (int t = 1; t < half_taps; ++t) {
+        const int lo = AOMMAX(pos - t, 0);
+        const int hi = AOMMIN(pos + t, last);
+        acc += (input[lo] + input[hi]) * taps[t];
+      }
+      *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+    }
+    return;
+  }
+
+  // Head: only the left side is clamped.
+  for (; pos < head_end; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[AOMMAX(pos - t, 0)] + input[pos + t]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Interior: no clamping.
+  for (; pos < tail_start; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[pos + t]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Tail: only the right side is clamped.
+  for (; pos < length; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (int t = 1; t < half_taps; ++t) {
+      acc += (input[pos - t] + input[AOMMIN(pos + t, last)]) * taps[t];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+}
+
+// High-bitdepth counterpart of resize_multistep(): resizes `length` 16-bit
+// samples from input into `olength` samples in output. Equal lengths are
+// copied. Otherwise the row is halved get_down2_steps() times with the highbd
+// down2 filters and any remaining ratio is handled by highbd_interpolate().
+// otmp is scratch for the intermediate halving stages, which alternate
+// between otmp and otmp + get_down2_length(length, 1).
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+                                    uint16_t *output, int olength,
+                                    uint16_t *otmp, int bd) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+
+  const int steps = get_down2_steps(length, olength);
+  if (steps <= 0) {
+    // No halving stage applies: resample directly.
+    highbd_interpolate(input, length, output, olength, bd);
+    return;
+  }
+
+  assert(otmp != NULL);
+  uint16_t *const otmp2 = otmp + get_down2_length(length, 1);
+  const uint16_t *stage_in = input;
+  uint16_t *stage_out = NULL;
+  int cur_length = length;
+
+  for (int s = 0; s < steps; ++s) {
+    const int next_length = get_down2_length(cur_length, 1);
+    // The last stage writes straight to output when it lands on olength.
+    if (s == steps - 1 && next_length == olength) {
+      stage_out = output;
+    } else {
+      stage_out = (s & 1) ? otmp2 : otmp;
+    }
+    if (cur_length & 1) {
+      highbd_down2_symodd(stage_in, cur_length, stage_out, bd);
+    } else {
+      highbd_down2_symeven(stage_in, cur_length, stage_out, bd);
+    }
+    stage_in = stage_out;
+    cur_length = next_length;
+  }
+
+  if (cur_length != olength) {
+    highbd_interpolate(stage_out, cur_length, output, olength, bd);
+  }
+}
+
+// Gathers `len` 16-bit samples spaced `stride` elements apart, starting at
+// img, into the contiguous array arr.
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  uint16_t *src = img;
+  uint16_t *dst = arr;
+  for (int left = len; left > 0; --left) {
+    *dst++ = *src;
+    src += stride;
+  }
+}
+
+// Scatters the `len` contiguous 16-bit samples of arr to positions spaced
+// `stride` elements apart, starting at img.
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  uint16_t *dst = img;
+  uint16_t *src = arr;
+  for (int left = len; left > 0; --left) {
+    *dst = *src++;
+    dst += stride;
+  }
+}
+
+// High-bitdepth counterpart of av1_resize_plane(): resizes a height x width
+// plane into height2 x width2. input and output are converted with
+// CONVERT_TO_SHORTPTR() before use. Rows are resized first into an
+// intermediate buffer, then each column of that buffer is resized.
+// NOTE: if a temporary buffer cannot be allocated the function returns
+// without writing output and without reporting the failure.
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+                             int in_stride, uint8_t *output, int height2,
+                             int width2, int out_stride, int bd) {
+  // height rows of width2 horizontally resized samples.
+  uint16_t *const rows_resized =
+      (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height);
+  // Scratch handed to highbd_resize_multistep() for both passes.
+  uint16_t *const scratch =
+      (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height));
+  // One column before and after vertical resizing.
+  uint16_t *const col_in = (uint16_t *)aom_malloc(sizeof(uint16_t) * height);
+  uint16_t *const col_out = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2);
+
+  if (rows_resized != NULL && scratch != NULL && col_in != NULL &&
+      col_out != NULL) {
+    // Horizontal pass.
+    for (int row = 0; row < height; ++row) {
+      highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * row),
+                              width, rows_resized + width2 * row, width2,
+                              scratch, bd);
+    }
+    // Vertical pass, one column at a time.
+    for (int col = 0; col < width2; ++col) {
+      highbd_fill_col_to_arr(rows_resized + col, width2, height, col_in);
+      highbd_resize_multistep(col_in, height, col_out, height2, scratch, bd);
+      highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + col), out_stride,
+                             height2, col_out);
+    }
+  }
+
+  aom_free(rows_resized);
+  aom_free(scratch);
+  aom_free(col_in);
+  aom_free(col_out);
+}
+
+// High-bitdepth counterpart of upscale_normative_rect(): horizontally upscales
+// a width x height rectangle into a width2 x height2 rectangle (height2 must
+// equal height) with av1_highbd_convolve_horiz_rs() and the
+// av1_resize_filter_normative kernels. input and output are converted with
+// CONVERT_TO_SHORTPTR() before use. When pad_left / pad_right is non-zero,
+// the border columns just outside that side of the rectangle are temporarily
+// replaced with the row's edge sample and restored before returning.
+// Returns false, leaving the source untouched, if a save buffer cannot be
+// allocated; returns true otherwise.
+static bool highbd_upscale_normative_rect(const uint8_t *const input,
+                                          int height, int width, int in_stride,
+                                          uint8_t *output, int height2,
+                                          int width2, int out_stride,
+                                          int x_step_qn, int x0_qn,
+                                          int pad_left, int pad_right, int bd) {
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  const int border_size = border_cols * sizeof(uint16_t);
+  uint16_t *tmp_left = NULL;
+  uint16_t *tmp_right = NULL;
+  uint16_t *const input16 = CONVERT_TO_SHORTPTR(input);
+  uint16_t *const in_tl = input16 - border_cols;
+  uint16_t *const in_tr = input16 + width;
+
+  // Allocate every save buffer before modifying the source, so that an
+  // allocation failure cannot leave an overwritten border behind.
+  if (pad_left) {
+    tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (!tmp_left) return false;
+  }
+  if (pad_right) {
+    tmp_right =
+        (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (!tmp_right) {
+      aom_free(tmp_left);
+      return false;
+    }
+  }
+
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
+      aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
+    }
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
+      aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
+                   border_cols);
+    }
+  }
+
+  av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
+                               CONVERT_TO_SHORTPTR(output), out_stride, width2,
+                               height2, &av1_resize_filter_normative[0][0],
+                               x0_qn, x_step_qn, bd);
+
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size);
+    }
+    aom_free(tmp_right);
+  }
+  return true;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Resizes the Y, U and V planes of an 8-bit 4:2:0 frame, in that order. Both
+// chroma dimensions are half the luma dimensions (integer division). Calls
+// abort() as soon as av1_resize_plane() fails for a plane.
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  const int uv_height = height / 2;
+  const int uv_width = width / 2;
+  const int ouv_height = oheight / 2;
+  const int ouv_width = owidth / 2;
+
+  if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                        oy_stride)) {
+    abort();
+  }
+  if (!av1_resize_plane(u, uv_height, uv_width, uv_stride, ou, ouv_height,
+                        ouv_width, ouv_stride)) {
+    abort();
+  }
+  if (!av1_resize_plane(v, uv_height, uv_width, uv_stride, ov, ouv_height,
+                        ouv_width, ouv_stride)) {
+    abort();
+  }
+}
+
+// Resizes the Y, U and V planes of an 8-bit 4:2:2 frame, in that order. Chroma
+// has half the luma width (integer division) and the full luma height.
+// Stops at the first plane for which av1_resize_plane() fails and returns
+// false; returns true when all three planes were resized.
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  const int uv_width = width / 2;
+  const int ouv_width = owidth / 2;
+
+  // && short-circuits, so later planes are skipped after a failure.
+  return av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                          oy_stride) &&
+         av1_resize_plane(u, height, uv_width, uv_stride, ou, oheight,
+                          ouv_width, ouv_stride) &&
+         av1_resize_plane(v, height, uv_width, uv_stride, ov, oheight,
+                          ouv_width, ouv_stride);
+}
+
+// Resizes the Y, U and V planes of an 8-bit 4:4:4 frame, in that order. All
+// three planes share the same dimensions. Stops at the first plane for which
+// av1_resize_plane() fails and returns false; returns true when all three
+// planes were resized.
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  // && short-circuits, so later planes are skipped after a failure.
+  return av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                          oy_stride) &&
+         av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+                          ouv_stride) &&
+         av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+                          ouv_stride);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Resizes the Y, U and V planes of a high-bitdepth 4:2:0 frame, in that order,
+// with av1_highbd_resize_plane(). Both chroma dimensions are half the luma
+// dimensions (integer division).
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  const int uv_height = height / 2;
+  const int uv_width = width / 2;
+  const int ouv_height = oheight / 2;
+  const int ouv_width = owidth / 2;
+
+  av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                          oy_stride, bd);
+  av1_highbd_resize_plane(u, uv_height, uv_width, uv_stride, ou, ouv_height,
+                          ouv_width, ouv_stride, bd);
+  av1_highbd_resize_plane(v, uv_height, uv_width, uv_stride, ov, ouv_height,
+                          ouv_width, ouv_stride, bd);
+}
+
+// Resizes the Y, U and V planes of a high-bitdepth 4:2:2 frame, in that order,
+// with av1_highbd_resize_plane(). Chroma has half the luma width (integer
+// division) and the full luma height.
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  const int uv_width = width / 2;
+  const int ouv_width = owidth / 2;
+
+  av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                          oy_stride, bd);
+  av1_highbd_resize_plane(u, height, uv_width, uv_stride, ou, oheight,
+                          ouv_width, ouv_stride, bd);
+  av1_highbd_resize_plane(v, height, uv_width, uv_stride, ov, oheight,
+                          ouv_width, ouv_stride, bd);
+}
+
+// Resizes the Y, U and V planes of a high-bitdepth 4:4:4 frame, in that order,
+// with av1_highbd_resize_plane(). All three planes share the same dimensions.
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  // Luma.
+  av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                          oy_stride, bd);
+  // Chroma, same size as luma.
+  av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+                          ouv_stride, bd);
+  av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+                          ouv_stride, bd);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Scales up to MAX_MB_PLANE planes of src into dst in blocks of at most 16x16
+// destination pixels, using the kernel set of `filter` (BILINEAR,
+// EIGHTTAP_SMOOTH or EIGHTTAP_REGULAR), then extends the borders of dst.
+// phase_scaler is added to the 1/16-pel source position of a block along
+// every axis whose source and destination sizes differ.
+// Position arithmetic uses 64-bit intermediates: products such as
+// x * 16 * src_w do not fit in a 32-bit int for very large frames.
+void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst,
+                                   const InterpFilter filter,
+                                   const int phase_scaler,
+                                   const int num_planes) {
+  assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
+         filter == EIGHTTAP_REGULAR);
+  const InterpKernel *const kernel =
+      (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr;
+
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const int src_w = src->crop_widths[is_uv];
+    const int src_h = src->crop_heights[is_uv];
+    const uint8_t *src_buffer = src->buffers[i];
+    const int src_stride = src->strides[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_h = dst->crop_heights[is_uv];
+    uint8_t *dst_buffer = dst->buffers[i];
+    const int dst_stride = dst->strides[is_uv];
+    for (int y = 0; y < dst_h; y += 16) {
+      // Source row of this block, and its 1/16-pel vertical position.
+      const int src_y = (int)((int64_t)y * src_h / dst_h);
+      const int y_q4 =
+          src_h == dst_h
+              ? 0
+              : (int)((int64_t)y * 16 * src_h / dst_h) + phase_scaler;
+      const int y_step_q4 = 16 * src_h / dst_h;
+      for (int x = 0; x < dst_w; x += 16) {
+        // Source column of this block, and its 1/16-pel horizontal position.
+        const int src_x = (int)((int64_t)x * src_w / dst_w);
+        const int x_q4 =
+            src_w == dst_w
+                ? 0
+                : (int)((int64_t)x * 16 * src_w / dst_w) + phase_scaler;
+        const int x_step_q4 = 16 * src_w / dst_w;
+        const uint8_t *src_ptr =
+            src_buffer + (int64_t)src_y * src_stride + src_x;
+        uint8_t *dst_ptr = dst_buffer + (int64_t)y * dst_stride + x;
+
+        // Width and height of the actual working area.
+        const int work_w = AOMMIN(16, dst_w - x);
+        const int work_h = AOMMIN(16, dst_h - y);
+        // SIMD versions of aom_scaled_2d() have some trouble handling
+        // nonstandard sizes, so fall back on the C version to handle borders.
+        if (work_w != 16 || work_h != 16) {
+          aom_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                          x_q4 & 0xf, x_step_q4, y_q4 & 0xf, y_step_q4, work_w,
+                          work_h);
+        } else {
+          aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                        x_q4 & 0xf, x_step_q4, y_q4 & 0xf, y_step_q4, 16, 16);
+        }
+      }
+    }
+  }
+  aom_extend_frame_borders(dst, num_planes);
+}
+
+// Resizes up to MAX_MB_PLANE planes of src into dst, plane by plane, and then
+// extends the borders of dst. When CONFIG_AV1_HIGHBITDEPTH is set and src
+// carries YV12_FLAG_HIGHBITDEPTH, planes go through
+// av1_highbd_resize_plane() with bit depth bd; otherwise they go through
+// av1_resize_plane(). Returns false as soon as av1_resize_plane() fails
+// (borders are then not extended); returns true otherwise.
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes) {
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+    const int is_uv = i > 0;
+    const uint8_t *const src_buf = src->buffers[i];
+    const int src_h = src->crop_heights[is_uv];
+    const int src_w = src->crop_widths[is_uv];
+    const int src_stride = src->strides[is_uv];
+    uint8_t *const dst_buf = dst->buffers[i];
+    const int dst_h = dst->crop_heights[is_uv];
+    const int dst_w = dst->crop_widths[is_uv];
+    const int dst_stride = dst->strides[is_uv];
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      av1_highbd_resize_plane(src_buf, src_h, src_w, src_stride, dst_buf,
+                              dst_h, dst_w, dst_stride, bd);
+      continue;
+    }
+#else
+    (void)bd;
+#endif
+    if (!av1_resize_plane(src_buf, src_h, src_w, src_stride, dst_buf, dst_h,
+                          dst_w, dst_stride)) {
+      return false;
+    }
+  }
+  aom_extend_frame_borders(dst, num_planes);
+  return true;
+}
+
+// Horizontally upscales `rows` rows of one plane from the coded width
+// (cm->width) to cm->superres_upscaled_width, one tile column at a time.
+// The first tile column is padded on the left and the last one on the right.
+// Reports AOM_CODEC_MEM_ERROR through aom_internal_error() if a tile column
+// cannot be upscaled.
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows) {
+  const int is_uv = (plane > 0);
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
+  const int upscaled_plane_width =
+      ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  const int superres_denom = cm->superres_scale_denominator;
+  const int num_tile_cols = cm->tiles.cols;
+
+  const int32_t x_step_qn = av1_get_upscale_convolve_step(
+      downscaled_plane_width, upscaled_plane_width);
+  int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
+                                          upscaled_plane_width, x_step_qn);
+
+  for (int j = 0; j < num_tile_cols; j++) {
+    TileInfo tile_col;
+    av1_tile_set_col(&tile_col, cm, j);
+    const int is_first_col = (j == 0);
+    const int is_last_col = (j == num_tile_cols - 1);
+
+    // Determine the limits of this tile column in both the source
+    // and destination images.
+    // Note: The actual location which we start sampling from is
+    // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
+    // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
+    const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
+    const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
+    const int src_width = downscaled_x1 - downscaled_x0;
+
+    const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
+    // Note that we can't just use AOMMIN for the last column - due to
+    // rounding, (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less
+    // than upscaled_plane_width.
+    const int upscaled_x1 =
+        is_last_col ? upscaled_plane_width
+                    : (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
+    const int dst_width = upscaled_x1 - upscaled_x0;
+
+    const uint8_t *const src_ptr = src + downscaled_x0;
+    uint8_t *const dst_ptr = dst + upscaled_x0;
+
+    bool success;
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (cm->seq_params->use_highbitdepth) {
+      success = highbd_upscale_normative_rect(
+          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
+          dst_stride, x_step_qn, x0_qn, is_first_col, is_last_col,
+          cm->seq_params->bit_depth);
+    } else
+#endif
+    {
+      success = upscale_normative_rect(
+          src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width,
+          dst_stride, x_step_qn, x0_qn, is_first_col, is_last_col);
+    }
+    if (!success) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error upscaling frame");
+    }
+    // Update the fractional pixel offset to prepare for the next tile column.
+    x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
+  }
+}
+
+// Upscales every plane of src into dst with av1_upscale_normative_rows(),
+// processing all crop_heights rows of each plane, and then extends the
+// borders of dst.
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst) {
+  const int num_planes = av1_num_planes(cm);
+  int plane = 0;
+  while (plane < num_planes) {
+    // Index 0 holds the luma geometry, index 1 the chroma geometry.
+    const int is_uv = (plane > 0);
+    av1_upscale_normative_rows(cm, src->buffers[plane], src->strides[is_uv],
+                               dst->buffers[plane], dst->strides[is_uv], plane,
+                               src->crop_heights[is_uv]);
+    ++plane;
+  }
+
+  aom_extend_frame_borders(dst, num_planes);
+}
+
+// Returns a frame with the target dimensions. If `unscaled` already has them
+// it is returned as is. Otherwise `scaled` is reallocated to the target size,
+// filled with a resized copy of `unscaled`, and returned.
+// The target is the superres-upscaled size when for_psnr is set and the coded
+// size (cm->width x cm->height) otherwise. Allocation failures are reported
+// through aom_internal_error() with AOM_CODEC_MEM_ERROR.
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr, const int border_in_pixels,
+    const int num_pyramid_levels) {
+  // If scaling is performed for the sole purpose of calculating PSNR, then our
+  // target dimensions are superres upscaled width/height. Otherwise our target
+  // dimensions are coded width/height.
+  const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width;
+  const int scaled_height =
+      for_psnr ? cm->superres_upscaled_height : cm->height;
+
+  if (scaled_width == unscaled->y_crop_width &&
+      scaled_height == unscaled->y_crop_height) {
+    // Already at the target size.
+    return unscaled;
+  }
+
+  const int num_planes = av1_num_planes(cm);
+  const SequenceHeader *seq_params = cm->seq_params;
+
+  // Reallocate the frame buffer based on the target dimensions when scaling
+  // is required.
+  if (aom_realloc_frame_buffer(
+          scaled, scaled_width, scaled_height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL,
+          num_pyramid_levels, 0)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled buffer");
+  }
+
+  // The optimized scaler must support the luma ratio and, when chroma planes
+  // exist, the chroma ratio as well.
+  bool has_optimized_scaler = av1_has_optimized_scaler(
+      unscaled->y_crop_width, unscaled->y_crop_height, scaled_width,
+      scaled_height);
+  if (num_planes > 1) {
+    has_optimized_scaler = has_optimized_scaler &&
+                           av1_has_optimized_scaler(unscaled->uv_crop_width,
+                                                    unscaled->uv_crop_height,
+                                                    scaled->uv_crop_width,
+                                                    scaled->uv_crop_height);
+  }
+
+  bool take_optimized_path = use_optimized_scaler && has_optimized_scaler;
+#if CONFIG_AV1_HIGHBITDEPTH
+  // In high-bitdepth builds the optimized path is only taken for 8-bit content.
+  take_optimized_path =
+      take_optimized_path && cm->seq_params->bit_depth == AOM_BITS_8;
+#endif
+
+  if (take_optimized_path) {
+    av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
+  } else if (!av1_resize_and_extend_frame_nonnormative(
+                 unscaled, scaled, (int)cm->seq_params->bit_depth,
+                 num_planes)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate buffers during resize");
+  }
+  return scaled;
+}
+
+// Scales *dim in place by SCALE_NUMERATOR / denom, rounding to nearest. Does
+// nothing when denom equals SCALE_NUMERATOR.
+static void calculate_scaled_size_helper(int *dim, int denom) {
+  if (denom == SCALE_NUMERATOR) return;
+
+  // We need to ensure the constraint in "Appendix A" of the spec:
+  // * FrameWidth is greater than or equal to 16
+  // * FrameHeight is greater than or equal to 16
+  // For this, we clamp the downscaled dimension to at least 16. One
+  // exception: if original dimension itself was < 16, then we keep the
+  // downscaled dimension to be same as the original, to ensure that resizing
+  // is valid.
+  const int original = *dim;
+  const int lower_bound = AOMMIN(16, original);
+  // Use this version if we need *dim to be even
+  // *dim = (*dim * SCALE_NUMERATOR + denom) / (2 * denom);
+  // *dim <<= 1;
+  const int scaled = (original * SCALE_NUMERATOR + denom / 2) / (denom);
+  *dim = AOMMAX(scaled, lower_bound);
+}
+
+// Scales *width and then *height in place with
+// calculate_scaled_size_helper(), using resize_denom as the denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
+  int *const dims[2] = { width, height };
+  for (int d = 0; d < 2; ++d) {
+    calculate_scaled_size_helper(dims[d], resize_denom);
+  }
+}
+
+// Applies superres scaling in place. Only *width is scaled; *height is
+// accepted for interface symmetry and left untouched.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom) {
+  calculate_scaled_size_helper(width, superres_denom);
+  (void)height;  // Unused.
+}
+
+// Maps a superres-scaled width back to the unscaled width, in place. Only
+// *width is modified; *height is unused.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
+  (void)height;
+  if (denom == SCALE_NUMERATOR) return;
+  // av1_calculate_scaled_superres_size() rounds *up* after its division when
+  // the resulting dimensions are odd, so the inverse here rounds *down*.
+  const int scaled_width = *width;
+  *width = scaled_width * denom / SCALE_NUMERATOR;
+}
+
+// Transfers the format/colour description fields (and nothing else) from
+// 'src' to 'dst'. No other member of 'dst' is written.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+                               YV12_BUFFER_CONFIG *const dst) {
+  // Sample format.
+  dst->bit_depth = src->bit_depth;
+  dst->monochrome = src->monochrome;
+  dst->chroma_sample_position = src->chroma_sample_position;
+  // Colour description.
+  dst->color_primaries = src->color_primaries;
+  dst->transfer_characteristics = src->transfer_characteristics;
+  dst->matrix_coefficients = src->matrix_coefficients;
+  dst->color_range = src->color_range;
+}
+
+// TODO(afergs): Look for in-place upscaling
+// TODO(afergs): aom_ vs av1_ functions? Which can I use?
+// Upscale decoded image.
+//
+// Does nothing unless av1_superres_scaled(cm) is true. Otherwise the current
+// frame (cm->cur_frame->buf) is copied into a temporary buffer, the current
+// frame buffer is reallocated at cm->superres_upscaled_width x
+// cm->superres_upscaled_height, and the temporary copy is upscaled back into
+// it.
+//
+// pool: when non-NULL, the frame buffer is released and re-acquired through
+// the pool's frame buffer callbacks (decoder path); when NULL, it is
+// allocated directly and the config data is saved and restored around the
+// allocation (encoder path).
+// num_pyramid_levels: forwarded to the allocator of the upscaled frame.
+//
+// Failures are reported through aom_internal_error() with
+// AOM_CODEC_MEM_ERROR.
+// NOTE(review): none of the error paths below frees copy_buffer before
+// calling aom_internal_error(); if that call does not return, the temporary
+// buffer appears to leak -- confirm.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
+ int num_pyramid_levels) {
+ const int num_planes = av1_num_planes(cm);
+ if (!av1_superres_scaled(cm)) return;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int byte_alignment = cm->features.byte_alignment;
+
+ // Temporary buffer holding the not-yet-upscaled frame.
+ YV12_BUFFER_CONFIG copy_buffer;
+ memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+ YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
+
+ // Presumably rounds cm->width up to a multiple of 8 -- confirm against the
+ // definition of ALIGN_POWER_OF_TWO.
+ const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+ if (aom_alloc_frame_buffer(
+ &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, byte_alignment, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate copy buffer for superres upscaling");
+
+ // Copy function assumes the frames are the same size.
+ // Note that it does not copy YV12_BUFFER_CONFIG config data.
+ aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+ assert(copy_buffer.y_crop_width == aligned_width);
+ assert(copy_buffer.y_crop_height == cm->height);
+
+ // Realloc the current frame buffer at a higher resolution in place.
+ if (pool != NULL) {
+ // Use callbacks if on the decoder.
+ aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+ aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+ void *cb_priv = pool->cb_priv;
+
+ // The pool lock is held across release + realloc and dropped on every
+ // exit from this branch, including before each error report.
+ lock_buffer_pool(pool);
+ // Realloc with callback does not release the frame buffer - release first.
+ if (release_fb_cb(cb_priv, fb)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to free current frame buffer before superres upscaling");
+ }
+ // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+ if (aom_realloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv,
+ num_pyramid_levels, 0)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate current frame buffer for superres upscaling");
+ }
+ unlock_buffer_pool(pool);
+ } else {
+ // Make a copy of the config data for frame_to_show in copy_buffer
+ copy_buffer_config(frame_to_show, &copy_buffer);
+
+ // Don't use callbacks on the encoder.
+ // aom_alloc_frame_buffer() clears the config data for frame_to_show
+ if (aom_alloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, byte_alignment, num_pyramid_levels, 0))
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate current frame buffer for superres upscaling");
+
+ // Restore config data back to frame_to_show
+ copy_buffer_config(&copy_buffer, frame_to_show);
+ }
+ // TODO(afergs): verify frame_to_show is correct after realloc
+ // encoder:
+ // decoder:
+
+ assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+ assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+ // Scale up and back into frame_to_show.
+ assert(frame_to_show->y_crop_width != cm->width);
+ av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+ // Free the copy buffer
+ aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
new file mode 100644
index 0000000000..0ba3108f72
--- /dev/null
+++ b/third_party/aom/av1/common/resize.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride);
+// TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from
+// av1/exports_com and delete this function.
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
+ AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+ const bool for_psnr, const int border_in_pixels,
+ const int num_pyramid_levels);
+
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+ int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
+ int num_pyramid_levels);
+
+// Returns 1 when the frame width differs from its superres-upscaled width
+// (i.e. superres scaling is actually in effect) and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+  // Comparing the two widths is more accurate than testing
+  // cm->superres_scale_denominator != SCALE_NUMERATOR: in some corner cases
+  // (e.g. cm->width of 1) no scaling is required even though the denominator
+  // differs from SCALE_NUMERATOR.
+  const int frame_w = cm->width;
+  const int upscaled_w = cm->superres_upscaled_width;
+  return frame_w != upscaled_w;
+}
+
+// Tells whether the optimized scaler av1_resize_and_extend_frame() may be
+// used for a src -> dst resize; otherwise the non-normative scaler
+// av1_resize_and_extend_frame_nonnormative() should be used.
+//
+// The optimized scaler can only handle scaling ratios >= 1/4 and <= 16 (see
+// the comment in aom_convolve8_c() for detail). Visual assessment shows that
+// if the scaling ratio or its reciprocal is not a multiple of 1/16, its
+// output has some artifacts, especially on lines, due to non-exact ratio
+// representation. SSSE3 and NEON have a specialized 3/4 version of
+// av1_resize_and_extend_frame() that does not have this issue.
+static INLINE bool av1_has_optimized_scaler(const int src_width,
+                                            const int src_height,
+                                            const int dst_width,
+                                            const int dst_height) {
+  // Ratio limits, checked for both dimensions.
+  const bool ratio_in_range =
+      dst_width * 4 >= src_width && dst_height * 4 >= src_height &&
+      dst_width <= src_width * 16 && dst_height <= src_height * 16;
+  // The divisibility tests are only evaluated once the range test has passed,
+  // exactly as in one short-circuited && chain.
+  if (ratio_in_range && 16 * dst_width % src_width == 0 &&
+      16 * src_width % dst_width == 0 && 16 * dst_height % src_height == 0 &&
+      16 * src_height % dst_height == 0) {
+    return true;
+  }
+#if HAVE_SSSE3 || HAVE_NEON
+  // Exact 3/4 downscale handled by the specialized SIMD path.
+  return 4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height;
+#else
+  return false;
+#endif
+}
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+ [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
new file mode 100644
index 0000000000..0be126fa65
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.c
@@ -0,0 +1,1494 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/thread_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
+// Each entry is written as { { r values }, { s values } }, one value per
+// filter pass: wherever the first pair holds 0 the second pair holds -1 at
+// the same position.
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
+ { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+ { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
+ { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
+ { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
+ { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
+ { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
+ { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
+ { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
+};
+
+// Writes the dimensions of one plane to *plane_w / *plane_h. The width is
+// derived from cm->superres_upscaled_width and the height from cm->height;
+// for a chroma plane (is_uv != 0) each is shifted down, with rounding, by the
+// sequence's subsampling in that direction.
+void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
+                                  int *plane_h) {
+  // Subsampling shifts are 0 for luma, and 0 or 1 for chroma.
+  const int shift_x = (is_uv && cm->seq_params->subsampling_x) ? 1 : 0;
+  const int shift_y = (is_uv && cm->seq_params->subsampling_y) ? 1 : 0;
+  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, shift_x);
+  *plane_h = ROUND_POWER_OF_TWO(cm->height, shift_y);
+}
+
+// Returns how many restoration units span a plane in one direction (pass a
+// width or a height as plane_size). The plane size is divided by the unit
+// size and rounded to nearest rather than up, which models the way a right or
+// bottom restoration unit can extend to up to 150% of its normal width or
+// height.
+//
+// The result is never less than 1, so that small frames (possibly smaller
+// than half of an LR unit) still get one unit.
+int av1_lr_count_units(int unit_size, int plane_size) {
+  const int half_unit = unit_size >> 1;
+  const int nearest_count = (plane_size + half_unit) / unit_size;
+  return AOMMAX(nearest_count, 1);
+}
+
+// Sizes the restoration-unit grid of one plane from its upsampled dimensions
+// and rsi->restoration_unit_size, stores the counts in 'rsi', and replaces
+// rsi->unit_info with a fresh 16-byte-aligned array holding one entry per
+// unit. Allocation failure is handled by CHECK_MEM_ERROR.
+void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
+                                  int is_uv) {
+  int width, height;
+  av1_get_upsampled_plane_size(cm, is_uv, &width, &height);
+
+  const int lr_unit = rsi->restoration_unit_size;
+  const int units_across = av1_lr_count_units(lr_unit, width);
+  const int units_down = av1_lr_count_units(lr_unit, height);
+
+  rsi->horz_units = units_across;
+  rsi->vert_units = units_down;
+  rsi->num_rest_units = units_across * units_down;
+
+  // Drop any previous array before allocating the new one.
+  aom_free(rsi->unit_info);
+  CHECK_MEM_ERROR(cm, rsi->unit_info,
+                  (RestorationUnitInfo *)aom_memalign(
+                      16, sizeof(*rsi->unit_info) * rsi->num_rest_units));
+}
+
+// Releases the unit_info array of 'rst_info' and leaves the pointer NULL.
+void av1_free_restoration_struct(RestorationInfo *rst_info) {
+  RestorationUnitInfo *const units = rst_info->unit_info;
+  rst_info->unit_info = NULL;
+  aom_free(units);
+}
+
+#if 0
+// NOTE: this whole section is compiled out. It is the generator for the
+// table values and is kept for reference only.
+// Pair of values for each sgrproj parameter:
+// Index 0 corresponds to r[0], e[0]
+// Index 1 corresponds to r[1], e[1]
+int sgrproj_mtable[SGRPROJ_PARAMS][2];
+
+// Fills sgrproj_mtable from the r/e values of av1_sgr_params: for radius r
+// and parameter e, with n = (2r + 1)^2, the entry is
+// round((1 << SGRPROJ_MTABLE_BITS) / (n * n * e)); a disabled filter (r == 0)
+// gets -1.
+static void GenSgrprojVtable(void) {
+ for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
+ const sgr_params_type *const params = &av1_sgr_params[i];
+ for (int j = 0; j < 2; ++j) {
+ const int e = params->e[j];
+ const int r = params->r[j];
+ if (r == 0) { // filter is disabled
+ sgrproj_mtable[i][j] = -1; // mark invalid
+ } else { // filter is enabled
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const int n2e = n * n * e;
+ assert(n2e != 0);
+ sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+ }
+ }
+ }
+}
+#endif
+
+// Precalculation hook for loop restoration. Currently does nothing: its only
+// statement, the call to GenSgrprojVtable(), is compiled out with #if 0.
+void av1_loop_restoration_precal(void) {
+#if 0
+ GenSgrprojVtable();
+#endif
+}
+
+// Pads an 8-bit plane by replicating its edge pixels: border_horz columns on
+// the left and right of every row, then border_vert full (already widened)
+// rows above the first row and below the last row. 'data' points at the
+// top-left pixel of the width x height area; the border memory around it must
+// be writable.
+static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
+                               int border_horz, int border_vert) {
+  // Left/right borders: repeat the first/last pixel of each row.
+  for (int row = 0; row < height; ++row) {
+    uint8_t *const line = data + row * stride;
+    memset(line - border_horz, line[0], border_horz);
+    memset(line + width, line[width - 1], border_horz);
+  }
+
+  // Top/bottom borders: repeat the widened first/last row.
+  uint8_t *const origin = data - border_horz;
+  const int ext_width = width + 2 * border_horz;
+  for (int k = border_vert; k > 0; --k) {
+    memcpy(origin - k * stride, origin, ext_width);
+  }
+  for (int k = 0; k < border_vert; ++k) {
+    memcpy(origin + (height + k) * stride, origin + (height - 1) * stride,
+           ext_width);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// 16-bit counterpart of extend_frame_lowbd(): replicates edge pixels into
+// border_horz columns left/right of every row, then copies the widened first
+// and last rows into border_vert rows above and below. 'stride' is counted in
+// uint16_t elements.
+static void extend_frame_highbd(uint16_t *data, int width, int height,
+                                int stride, int border_horz, int border_vert) {
+  // Left/right borders, one sample at a time.
+  for (int row = 0; row < height; ++row) {
+    uint16_t *const line = data + row * stride;
+    for (int k = 1; k <= border_horz; ++k) line[-k] = line[0];
+    for (int k = 0; k < border_horz; ++k) line[width + k] = line[width - 1];
+  }
+
+  // Top/bottom borders: repeat the widened first/last row.
+  uint16_t *const origin = data - border_horz;
+  const size_t ext_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
+  for (int k = border_vert; k > 0; --k) {
+    memcpy(origin - k * stride, origin, ext_bytes);
+  }
+  for (int k = 0; k < border_vert; ++k) {
+    memcpy(origin + (height + k) * stride, origin + (height - 1) * stride,
+           ext_bytes);
+  }
+}
+
+// Copies a width x height block of 16-bit samples from 'src' to 'dst', row by
+// row. Strides are counted in uint16_t elements.
+static void copy_rest_unit_highbd(int width, int height, const uint16_t *src,
+                                  int src_stride, uint16_t *dst,
+                                  int dst_stride) {
+  const size_t row_bytes = width * sizeof(*dst);
+  for (int row = 0; row < height; ++row) {
+    const uint16_t *const from = &src[row * src_stride];
+    uint16_t *const to = &dst[row * dst_stride];
+    memcpy(to, from, row_bytes);
+  }
+}
+#endif
+
+// Extends the borders of a plane by edge replication, dispatching to the
+// 8-bit or 16-bit implementation. With high bitdepth support compiled in and
+// highbd != 0, 'data' is converted with CONVERT_TO_SHORTPTR() first;
+// otherwise the 8-bit path is always taken.
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+                      int border_horz, int border_vert, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (!highbd) {
+    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+  } else {
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
+                        border_horz, border_vert);
+  }
+#else
+  (void)highbd;
+  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+#endif
+}
+
+// Copies a width x height block of 8-bit samples from 'src' to 'dst', row by
+// row.
+static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride) {
+  for (int row = 0; row < height; ++row) {
+    const uint8_t *const from = &src[row * src_stride];
+    uint8_t *const to = &dst[row * dst_stride];
+    memcpy(to, from, width);
+  }
+}
+
+// Copies a width x height block from 'src' to 'dst', dispatching to the 8-bit
+// or 16-bit implementation. With high bitdepth support compiled in and
+// highbd != 0, both pointers are converted with CONVERT_TO_SHORTPTR() first;
+// otherwise the 8-bit path is always taken.
+static void copy_rest_unit(int width, int height, const uint8_t *src,
+                           int src_stride, uint8_t *dst, int dst_stride,
+                           int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (!highbd) {
+    copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
+  } else {
+    copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+                          CONVERT_TO_SHORTPTR(dst), dst_stride);
+  }
+#else
+  (void)highbd;
+  copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride);
+#endif
+}
+
+#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
+
+// With striped loop restoration, the filtering for each 64-pixel stripe gets
+// most of its input from the output of CDEF (stored in data8), but we need to
+// fill out a border of 3 pixels above/below the stripe according to the
+// following rules:
+//
+// * At the top and bottom of the frame, we copy the outermost row of CDEF
+// pixels three times. This extension is done by a call to av1_extend_frame()
+// at the start of the loop restoration process, so the value of
+// copy_above/copy_below doesn't strictly matter.
+//
+// * All other boundaries are stripe boundaries within the frame. In that case,
+// we take 2 rows of deblocked pixels and extend them to 3 rows of context.
+static void get_stripe_boundary_info(const RestorationTileLimits *limits,
+                                     int plane_w, int plane_h, int ss_y,
+                                     int *copy_above, int *copy_below) {
+  (void)plane_w;
+
+  const int stripe_top = limits->v_start;
+  const int is_first_stripe = (stripe_top == 0);
+
+  // A stripe is RESTORATION_PROC_UNIT_SIZE luma rows tall (scaled by ss_y),
+  // except the first one in the plane, which is shorter by
+  // RESTORATION_UNIT_OFFSET rows (also scaled by ss_y).
+  int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  if (is_first_stripe) stripe_height -= RESTORATION_UNIT_OFFSET >> ss_y;
+
+  const int is_last_stripe = (stripe_top + stripe_height >= plane_h);
+
+  // Boundary rows are copied on every edge except the top of the first
+  // stripe and the bottom of the last stripe in the plane.
+  *copy_above = is_first_stripe ? 0 : 1;
+  *copy_below = is_last_stripe ? 0 : 1;
+}
+
+// Overwrite the border pixels around a processing stripe so that the conditions
+// listed above get_stripe_boundary_info() are preserved.
+// We save the pixels which get overwritten into a temporary buffer, so that
+// they can be restored by restore_processing_stripe_boundary() after we've
+// processed the stripe.
+//
+// limits gives the rectangular limits of the remaining stripes for the current
+// restoration unit. rsb is the stored stripe boundaries (taken from either
+// deblock or CDEF output as necessary).
+//
+// rsb_row is the first row of this stripe's context inside the rsb buffers,
+// h is the stripe height in rows, and data8/data_stride describe the plane
+// being filtered. copy_above/copy_below select which edges are processed.
+// rlbs receives the saved rows.
+//
+// opt selects between two behaviours:
+// * opt == 0: all RESTORATION_BORDER rows on each selected edge are saved
+// and replaced with rows taken from rsb.
+// * opt != 0: only the outermost row on each selected edge is saved, and it
+// is replaced with a copy of the neighbouring row of the plane itself; rsb
+// is not read.
+static void setup_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
+ int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
+ RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
+ // Offsets within the line buffers. The buffer logically starts at column
+ // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
+ // has column x0 in the buffer.
+ const int buf_stride = rsb->stripe_boundary_stride;
+ const int buf_x0_off = limits->h_start;
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ // Bytes per line: doubled when use_highbd is 1.
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ // Replace RESTORATION_BORDER pixels above the top of the stripe
+ // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
+ // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
+ // duplicating the topmost of the 2 lines (see the AOMMAX call when
+ // calculating buf_row, which gets the offsets 0, 0, 1 for i = -3, -2, -1).
+ if (!opt) {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *buf =
+ rsb->stripe_boundary_above + (buf_off << use_highbd);
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_above
+ memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
+ REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+ }
+ }
+
+ // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
+ // The second buffer row is repeated, so buf_row gets the offsets 0, 1, 1
+ // for i = 0, 1, 2.
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *src =
+ rsb->stripe_boundary_below + (buf_off << use_highbd);
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_below
+ memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+ }
+ }
+ } else {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only save and overwrite i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ // Save old pixels, then replace them with a copy of the next row down
+ // (i = -RESTORATION_BORDER + 1) of the plane itself.
+ memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd,
+ data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+ line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ // Only save and overwrite i=2 line.
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ // Save old pixels, then replace them with a copy of the row above
+ // (i = 1) of the plane itself.
+ memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
+ }
+ }
+}
+
+// Once a processing stripe is finished, this function sets the boundary
+// pixels which were overwritten by setup_processing_stripe_boundary()
+// back to their original values
+//
+// The arguments mirror those of setup_processing_stripe_boundary(): the same
+// limits, h, data8/data_stride, copy_above/copy_below and opt select which
+// rows are written back from the save buffers in rlbs. Rows below the stripe
+// are only restored while they lie above limits->v_end + RESTORATION_BORDER.
+static void restore_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+ int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+ int copy_below, int opt) {
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ // Bytes per line: doubled when use_highbd is 1.
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ if (!opt) {
+ // All RESTORATION_BORDER rows on each selected edge were replaced.
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8),
+ rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+ }
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ // Stop at the first row at or beyond v_end + RESTORATION_BORDER.
+ if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+ }
+ }
+ } else {
+ // Only the outermost row on each selected edge was replaced.
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only restore i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ // Only restore i=2 line.
+ if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+ }
+ }
+ }
+}
+
+// Applies the Wiener filter described by rui->wiener_info to one 8-bit
+// stripe, walking across it in columns of procunit_width pixels. tmpbuf,
+// bit_depth and error_info are unused here (bit_depth is only asserted to
+// be 8).
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+                                 int stripe_width, int stripe_height,
+                                 int procunit_width, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride,
+                                 int32_t *tmpbuf, int bit_depth,
+                                 struct aom_internal_error_info *error_info) {
+  (void)tmpbuf;
+  (void)bit_depth;
+  (void)error_info;
+  assert(bit_depth == 8);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
+
+  int col = 0;
+  while (col < stripe_width) {
+    // Width still to be filtered, rounded up to a multiple of 16, capped at
+    // one processing unit.
+    const int remaining_w16 = (stripe_width - col + 15) & ~15;
+    const int unit_w = AOMMIN(procunit_width, remaining_w16);
+    av1_wiener_convolve_add_src(src + col, src_stride, dst + col, dst_stride,
+                                rui->wiener_info.hfilter, 16,
+                                rui->wiener_info.vfilter, 16, unit_w,
+                                stripe_height, &conv_params);
+    col += procunit_width;
+  }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+ over the input. The window is of size (2r + 1)x(2r + 1), and we
+ specialize to r = 1 (boxsum1) and r = 2 (boxsum2). Any other value of r
+ is rejected by the assert in boxsum().
+
+ Each loop follows the same format: We keep a window's worth of input
+ in individual variables and select data out of that as appropriate.
+*/
+// 3x3 box sum (r = 1) of 'src' into 'dst': a vertical 3-tap pass from src to
+// dst followed by an in-place horizontal 3-tap pass over dst. With sqr != 0
+// the input samples are squared first. At the edges of the width x height
+// area the window is truncated, so the first/last row and column sum only
+// two taps.
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 3-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+
+ // Top row: only two rows fall inside the window.
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 1) * src_stride + j]
+ // b = src[(i ) * src_stride + j]
+ // c = src[(i + 1) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j];
+ }
+ // Last full window, then the truncated bottom row.
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ } else {
+ // Same as above, on squared samples.
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ }
+
+ // Horizontal sum over 3-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+
+ dst[i * dst_stride] = a + b;
+ for (j = 1; j < width - 2; ++j) {
+ // Loop invariant: At the start of each iteration, a, b and c hold the
+ // vertical sums that the first pass stored at
+ // a: dst[i * dst_stride + (j - 1)]
+ // b: dst[i * dst_stride + (j )]
+ // c: dst[i * dst_stride + (j + 1)]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = dst[i * dst_stride + (j + 2)];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[i * dst_stride + (j + 1)] = b + c;
+ }
+}
+
+// 5x5 box sum (r = 2) of 'src' into 'dst': a vertical 5-tap pass from src to
+// dst followed by an in-place horizontal 5-tap pass over dst. With sqr != 0
+// the input samples are squared first. At the edges of the width x height
+// area the window is truncated, so the two outermost rows and columns sum
+// only three or four taps.
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c, d, e;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 5-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+ d = src[3 * src_stride + j];
+ e = src[4 * src_stride + j];
+
+ // Top two rows: truncated windows of three and four rows.
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 2) * src_stride + j]
+ // b = src[(i - 1) * src_stride + j]
+ // c = src[(i ) * src_stride + j]
+ // d = src[(i + 1) * src_stride + j]
+ // e = src[(i + 2) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j];
+ }
+ // Last full window, then the two truncated bottom rows.
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ } else {
+ // Same as above, on squared samples.
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+ d = src[3 * src_stride + j] * src[3 * src_stride + j];
+ e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ }
+
+ // Horizontal sum over 5-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+ d = dst[i * dst_stride + 3];
+ e = dst[i * dst_stride + 4];
+
+ dst[i * dst_stride] = a + b + c;
+ dst[i * dst_stride + 1] = a + b + c + d;
+ for (j = 2; j < width - 3; ++j) {
+ // Loop invariant: At the start of each iteration, a..e hold the
+ // vertical sums that the first pass stored at
+ // a: dst[i * dst_stride + (j - 2)]
+ // b: dst[i * dst_stride + (j - 1)]
+ // c: dst[i * dst_stride + (j )]
+ // d: dst[i * dst_stride + (j + 1)]
+ // e: dst[i * dst_stride + (j + 2)]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = dst[i * dst_stride + (j + 3)];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[i * dst_stride + (j + 1)] = b + c + d + e;
+ dst[i * dst_stride + (j + 2)] = c + d + e;
+ }
+}
+
+// Box sum over a (2r + 1) x (2r + 1) window, dispatching on the radius r.
+// Only r = 1 and r = 2 are implemented; any other value trips an assert.
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+                   int sqr, int32_t *dst, int dst_stride) {
+  switch (r) {
+    case 1:
+      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+      break;
+    case 2:
+      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+      break;
+    default:
+      assert(0 && "Invalid value of r in self-guided filter");
+      break;
+  }
+}
+
+// Converts the coded pair xqd[0..1] into the pair xq[0..1], taking into
+// account which of the two filter passes is disabled (radius 0) in 'params':
+//   r[0] == 0:  xq = { 0,      (1 << SGRPROJ_PRJ_BITS) - xqd[1] }
+//   r[1] == 0:  xq = { xqd[0], 0 }
+//   otherwise:  xq = { xqd[0], (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1] }
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+  const int unity = 1 << SGRPROJ_PRJ_BITS;
+  const int first_off = (params->r[0] == 0);
+  const int second_off = (params->r[1] == 0);
+
+  xq[0] = first_off ? 0 : xqd[0];
+  if (first_off) {
+    xq[1] = unity - xqd[1];
+  } else if (second_off) {
+    xq[1] = 0;
+  } else {
+    xq[1] = unity - xq[0] - xqd[1];
+  }
+}
+
+// Lookup table indexed by x in [0, 255]. On spot checks the entries for
+// x >= 1 match round(256 * x / (x + 1)), i.e. x / (x + 1) in units of 1/256;
+// the first entry is special-cased (see below) and the last entry is 256.
+const int32_t av1_x_by_xplus1[256] = {
+ // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
+ // instead of 0. See comments in selfguided_restoration_internal() for why
+ 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+ 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
+ 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
+ 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
+ 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 256,
+};
+/* Reciprocal table: av1_one_by_x[n - 1] holds round(2^12 / n). The
+ * initializer lists 25 entries (n = 1..25: 4096, 2048, ..., 164).
+ * calculate_intermediate_result() indexes it with n - 1, where
+ * n = (2 * r + 1) * (2 * r + 1).
+ */
+const int32_t av1_one_by_x[MAX_NELEM] = {
+ 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
+};
+/* Builds the per-pixel A[] / B[] coefficient planes of the self-guided filter.
+ *
+ * boxsum() is run twice over dgd, extended by SGRPROJ_BORDER_VERT rows and
+ * SGRPROJ_BORDER_HORZ columns on every side: with sqr == 0 into B and with
+ * sqr == 1 into A. Both planes are then overwritten in place for rows
+ * -1..height and columns -1..width of the unit. Rows advance by 1 when
+ * pass == 0 and by 2 otherwise.
+ *
+ * dgd             top-left pixel of the unit; the border around it is read
+ * dgd_stride      row stride of dgd, in elements
+ * bit_depth       used to scale the sums back to an 8-bit range
+ * sgr_params_idx  index into av1_sgr_params
+ * radius_idx      selects params->r[] and params->s[]
+ * A, B            start of the scratch planes; their row stride is the
+ *                 buf_stride computed below
+ */
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+ int dgd_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx,
+ int pass, int32_t *A, int32_t *B) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ const int step = pass == 0 ? 1 : 2;  // pass != 0: only every other row is computed
+ int i, j;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);  // sqr == 0
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);  // sqr == 1
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;  // A[0] now maps to unit pixel (0, 0)
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
+ // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
+ for (i = -1; i < height + 1; i += step) {
+ for (j = -1; j < width + 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int n = (2 * r + 1) * (2 * r + 1);
+
+ // a < 2^16 * n < 2^22 regardless of bit depth
+ uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+ // b < 2^8 * n < 2^14 regardless of bit depth
+ uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+ // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ //
+ // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+ // This is an artefact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+ const uint32_t s = params->s[radius_idx];
+
+ // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+ // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+ // Note: We have to be quite careful about the value of A[k].
+ // This is used as a blend factor between individual pixel values and the
+ // local mean. So it logically has a range of [0, 256], including both
+ // endpoints.
+ //
+ // This is a pain for hardware, as we'd like something which can be stored
+ // in exactly 8 bits.
+ // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+ // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+ // slightly above 2^(8 + bit depth), due to rounding in the value of
+ // av1_one_by_x[25-1].
+ //
+ // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+ // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+ // overflow), without significantly affecting the final result: z == 0
+ // implies that the image is essentially "flat", so the local mean and
+ // individual pixel values are very similar.
+ //
+ // Note that saturating on the other side, ie. requiring A[k] <= 255,
+ // would be a bad idea, as that corresponds to the case where the image
+ // is very variable, when we want to preserve the local pixel value as
+ // much as possible.
+ A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
+
+ // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+ // av1_one_by_x[n - 1] = round(2^12 / n)
+ // => the product here is < 2^(20 + bit_depth) <= 2^32,
+ // and B[k] is set to a value < 2^(8 + bit depth)
+ // This holds even with the rounding in av1_one_by_x and in the overall
+ // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+ (uint32_t)B[k] *
+ (uint32_t)av1_one_by_x[n - 1],
+ SGRPROJ_RECIP_BITS);
+ }
+ }
+}
+/* "Fast" self-guided filter pass for one unit.
+ *
+ * calculate_intermediate_result() is called with pass == 1, so A[] / B[] are
+ * only filled on every other row (rows -1, 1, 3, ...). Each output is
+ * ROUND_POWER_OF_TWO(a * dgd + b, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS),
+ * where a and b are weighted sums of neighbouring A[] / B[] entries:
+ *   - even rows use the rows directly above and below (weights sum to 32);
+ *   - odd rows use the current row only (weights sum to 16).
+ * The assert requires the selected radius to be 2.
+ *
+ * dgd / dgd_stride  widened input; dst / dst_stride  filtered output
+ */
+static void selfguided_restoration_fast_internal(
+ int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;  // same formula as in calculate_intermediate_result()
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 1, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;  // skip the border so A[0] is pixel (0, 0)
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ (void)r;  // r is only read by the assert below
+ assert(r == 2);
+ for (i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;  // 2 * 6 + 4 * 5 == 32 == 1 << 5
+ const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 5;
+ const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ } else { // odd row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 4;  // 6 + 2 * 5 == 16 == 1 << 4
+ const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
+ const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ }
+}
+/* Full self-guided filter pass for one unit.
+ *
+ * calculate_intermediate_result() is called with pass == 0, so A[] / B[] are
+ * filled on every row. Each output is
+ * ROUND_POWER_OF_TWO(a * dgd + b, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS),
+ * where a and b are 3x3 weighted sums of A[] / B[]: weight 4 for the centre
+ * and its four edge neighbours, weight 3 for the four diagonal neighbours.
+ *
+ * dgd / dgd_stride  widened input; dst / dst_stride  filtered output
+ */
+static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth,
+ int sgr_params_idx,
+ int radius_idx) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;  // same formula as in calculate_intermediate_result()
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 0, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;  // skip the border so A[0] is pixel (0, 0)
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;  // 5 * 4 + 4 * 3 == 32 == 1 << 5
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+/* C implementation of the self-guided restoration filters for one unit.
+ *
+ * The input is widened into a local int32_t buffer together with
+ * SGRPROJ_BORDER_VERT rows and SGRPROJ_BORDER_HORZ columns on every side, so
+ * those pixels around dgd8 are read. When highbd is set the samples are read
+ * as uint16_t through CONVERT_TO_SHORTPTR(dgd8).
+ *
+ * flt0 receives the output of selfguided_restoration_fast_internal() when
+ * params->r[0] > 0, and flt1 that of selfguided_restoration_internal() when
+ * params->r[1] > 0. A plane whose radius is 0 is not written.
+ * flt_stride is the row stride of both output planes.
+ *
+ * Always returns 0.
+ */
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ int32_t *dgd32 =
+ dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;  // points at unit pixel (0, 0)
+
+ if (highbd) {
+ const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
+ }
+ }
+ } else {
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
+ }
+ }
+ }
+
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (params->r[0] > 0)
+ selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
+ flt0, flt_stride, bit_depth,
+ sgr_params_idx, 0);
+ if (params->r[1] > 0)
+ selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
+ flt_stride, bit_depth, sgr_params_idx, 1);
+ return 0;
+}
+/* Runs the self-guided filters on one unit and blends their outputs with the
+ * source pixels, using the weights that av1_decode_xq() derives from xqd.
+ *
+ * eps     index into av1_sgr_params (forwarded as sgr_params_idx)
+ * tmpbuf  scratch space: flt0 starts at tmpbuf and flt1 at
+ *         tmpbuf + RESTORATION_UNITPELS_MAX; both use a row stride of width
+ * dat8 / dst8  source and destination; accessed as uint16_t through
+ *         CONVERT_TO_SHORTPTR() when highbd is set
+ *
+ * Returns 0, or the non-zero value returned by
+ * av1_selfguided_restoration_c().
+ */
+int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+
+ const int ret = av1_selfguided_restoration_c(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ if (ret != 0) return ret;
+ const sgr_params_type *const params = &av1_sgr_params[eps];
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int k = i * width + j;  // index into flt0 / flt1
+ uint8_t *dst8ij = dst8 + i * dst_stride + j;
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+
+ const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
+ const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;  // source pixel at filter-output precision
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ // If params->r == 0 then we skipped the filtering in
+ // av1_selfguided_restoration_c, i.e. flt[k] == u
+ if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
+ if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ const uint16_t out = clip_pixel_highbd(w, bit_depth);
+ if (highbd)
+ *CONVERT_TO_SHORTPTR(dst8ij) = out;
+ else
+ *dst8ij = (uint8_t)out;
+ }
+ }
+ return 0;
+}
+/* Stripe filter for 8-bit self-guided restoration.
+ *
+ * Walks the stripe in column chunks of at most procunit_width pixels and
+ * filters each chunk with av1_apply_selfguided_restoration() (highbd
+ * argument 0), using rui->sgrproj_info.ep and rui->sgrproj_info.xqd.
+ * A non-zero return is reported through aom_internal_error() as
+ * AOM_CODEC_MEM_ERROR. bit_depth is asserted to be 8.
+ */
+static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
+ (void)bit_depth;
+ assert(bit_depth == 8);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);  // last chunk may be narrower
+ if (av1_apply_selfguided_restoration(
+ src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth,
+ 0) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_apply_selfguided_restoration");
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void wiener_filter_stripe_highbd(  // high-bitdepth Wiener stripe filter; signature matches stripe_filter_fun
+ const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
+ (void)tmpbuf;  // unused here; present only because of the shared signature
+ (void)error_info;  // unused here as well
+ const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {  // one column chunk per iteration
+ int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);  // remaining width rounded up to a multiple of 16
+ const uint8_t *src8_p = src8 + j;
+ uint8_t *dst8_p = dst8 + j;
+ av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
+ rui->wiener_info.hfilter, 16,
+ rui->wiener_info.vfilter, 16, w,
+ stripe_height, &conv_params, bit_depth);
+ }
+}
+/* Stripe filter for high-bitdepth self-guided restoration.
+ *
+ * Same chunking as sgrproj_filter_stripe(): the stripe is processed in
+ * column chunks of at most procunit_width pixels, each passed to
+ * av1_apply_selfguided_restoration() with the highbd argument set to 1.
+ * A non-zero return is reported through aom_internal_error() as
+ * AOM_CODEC_MEM_ERROR.
+ */
+static void sgrproj_filter_stripe_highbd(
+ const RestorationUnitInfo *rui, int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info) {
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);  // last chunk may be narrower
+ if (av1_apply_selfguided_restoration(
+ src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep,
+ rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth,
+ 1) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_apply_selfguided_restoration");
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/* Common signature of the per-stripe filter functions that are stored in
+ * stripe_filters[] and called by av1_loop_restoration_filter_unit().
+ */
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth,
+ struct aom_internal_error_info *error_info);
+/* Stripe filter table. av1_loop_restoration_filter_unit() indexes it with
+ * 2 * highbd + (restoration type == RESTORE_SGRPROJ). The high-bitdepth
+ * entries exist only when CONFIG_AV1_HIGHBITDEPTH is enabled.
+ */
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_STRIPE_FILTERS 4
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+ wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+ sgrproj_filter_stripe_highbd
+};
+#else
+#define NUM_STRIPE_FILTERS 2
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+ wiener_filter_stripe, sgrproj_filter_stripe
+};
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/* Filter one restoration unit.
+ *
+ * limits gives the unit's rectangle within the plane. A unit of type
+ * RESTORE_NONE is handed to copy_rest_unit() and nothing else happens.
+ * Otherwise the unit is processed one stripe at a time: for each stripe,
+ * setup_processing_stripe_boundary() is called, the selected stripe filter
+ * runs from data8 into dst8, and restore_processing_stripe_boundary() is
+ * called afterwards.
+ *
+ * ss_x / ss_y shrink the processing-unit width and the stripe height for
+ * subsampled planes. tmpbuf and error_info are forwarded to the stripe filter.
+ */
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+ uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+ int optimized_lr, struct aom_internal_error_info *error_info) {
+ RestorationType unit_rtype = rui->restoration_type;
+
+ int unit_h = limits->v_end - limits->v_start;
+ int unit_w = limits->h_end - limits->h_start;
+ uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;  // top-left of the unit in the source
+ uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;  // top-left of the unit in the output
+
+ if (unit_rtype == RESTORE_NONE) {
+ copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride,
+ highbd);
+ return;
+ }
+
+ const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);  // see stripe_filters[]
+ assert(filter_idx < NUM_STRIPE_FILTERS);
+ const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+ // Filter the whole image one stripe at a time
+ RestorationTileLimits remaining_stripes = *limits;
+ int i = 0;  // rows of the unit already filtered
+ while (i < unit_h) {
+ int copy_above, copy_below;
+ remaining_stripes.v_start = limits->v_start + i;
+
+ get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y,
+ &copy_above, &copy_below);
+
+ const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ // Work out where this stripe's boundaries are within
+ // rsb->stripe_boundary_{above,below}
+ const int frame_stripe =
+ (remaining_stripes.v_start + runit_offset) / full_stripe_height;
+ const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+ // Calculate this stripe's height, based on two rules:
+ // * The topmost stripe in the frame is 8 luma pixels shorter than usual.
+ // * We can't extend past the end of the current restoration unit
+ const int nominal_stripe_height =
+ full_stripe_height - ((frame_stripe == 0) ? runit_offset : 0);
+ const int h = AOMMIN(nominal_stripe_height,
+ remaining_stripes.v_end - remaining_stripes.v_start);
+
+ setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
+ h, data8, stride, rlbs, copy_above,
+ copy_below, optimized_lr);
+
+ stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
+ dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth,
+ error_info);
+
+ restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
+ data8, stride, copy_above, copy_below,
+ optimized_lr);
+
+ i += h;
+ }
+}
+
+// Per-unit callback used for whole-frame filtering. `priv` is the plane's
+// FilterFrameCtxt; the unit info at rest_unit_idx and the plane's stripe
+// boundaries are looked up in its RestorationInfo, and everything is handed
+// to av1_loop_restoration_filter_unit().
+static void filter_frame_on_unit(const RestorationTileLimits *limits,
+                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
+                                 RestorationLineBuffers *rlbs,
+                                 struct aom_internal_error_info *error_info) {
+  FilterFrameCtxt *const plane_ctxt = (FilterFrameCtxt *)priv;
+  const RestorationInfo *const info = plane_ctxt->rsi;
+
+  av1_loop_restoration_filter_unit(
+      limits, &info->unit_info[rest_unit_idx], &info->boundaries, rlbs,
+      plane_ctxt->plane_w, plane_ctxt->plane_h, plane_ctxt->ss_x,
+      plane_ctxt->ss_y, plane_ctxt->highbd, plane_ctxt->bit_depth,
+      plane_ctxt->data8, plane_ctxt->data_stride, plane_ctxt->dst8,
+      plane_ctxt->dst_stride, tmpbuf, info->optimized_lr, error_info);
+}
+/* Prepares lr_ctxt for filtering `frame`.
+ *
+ * cm->rst_frame becomes the destination buffer and is (re)allocated with
+ * aom_realloc_frame_buffer(); failure is reported through
+ * aom_internal_error() as AOM_CODEC_MEM_ERROR. filter_frame_on_unit() is
+ * installed as the per-unit callback.
+ *
+ * For every plane, rsi->optimized_lr and the context's rsi pointer are set.
+ * For planes whose frame restoration type is not RESTORE_NONE, the plane is
+ * additionally extended with av1_extend_frame() and the remaining
+ * FilterFrameCtxt fields are filled in.
+ */
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ int num_planes) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int bit_depth = seq_params->bit_depth;
+ const int highbd = seq_params->use_highbitdepth;
+ lr_ctxt->dst = &cm->rst_frame;
+
+ const int frame_width = frame->crop_widths[0];
+ const int frame_height = frame->crop_heights[0];
+ if (aom_realloc_frame_buffer(
+ lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate restoration dst buffer");
+
+ lr_ctxt->on_rest_unit = filter_frame_on_unit;
+ lr_ctxt->frame = frame;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationType rtype = rsi->frame_restoration_type;
+ rsi->optimized_lr = optimized_lr;
+ lr_ctxt->ctxt[plane].rsi = rsi;
+
+ if (rtype == RESTORE_NONE) {
+ continue;  // nothing else to prepare for an unfiltered plane
+ }
+
+ const int is_uv = plane > 0;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ assert(plane_w == frame->crop_widths[is_uv]);
+ assert(plane_h == frame->crop_heights[is_uv]);
+
+ av1_extend_frame(frame->buffers[plane], plane_w, plane_h,
+ frame->strides[is_uv], RESTORATION_BORDER,
+ RESTORATION_BORDER, highbd);
+
+ FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+ lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;  // subsampling applies to chroma planes only
+ lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
+ lr_plane_ctxt->plane_w = plane_w;
+ lr_plane_ctxt->plane_h = plane_h;
+ lr_plane_ctxt->highbd = highbd;
+ lr_plane_ctxt->bit_depth = bit_depth;
+ lr_plane_ctxt->data8 = frame->buffers[plane];
+ lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
+ lr_plane_ctxt->data_stride = frame->strides[is_uv];
+ lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
+ }
+}
+/* Copies the filtered planes back into the frame.
+ *
+ * For each plane whose frame restoration type is not RESTORE_NONE, the
+ * matching aom_yv12_partial_coloc_copy_{y,u,v}() is called with
+ * loop_rest_ctxt->dst as source and loop_rest_ctxt->frame as destination,
+ * over columns [0, plane_w) and rows [0, plane_h). At most 3 planes are
+ * supported (asserted).
+ */
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ AV1_COMMON *cm, int num_planes) {
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y,
+ aom_yv12_partial_coloc_copy_u,
+ aom_yv12_partial_coloc_copy_v };  // indexed by plane
+ assert(num_planes <= 3);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane];
+ copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0,
+ lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h);
+ }
+}
+
+// Runs lr_ctxt->on_rest_unit over every restoration unit of each plane whose
+// frame restoration type is not RESTORE_NONE, using the plane's own
+// FilterFrameCtxt together with cm->rst_tmpbuf and cm->rlbs.
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+                                        int num_planes) {
+  FilterFrameCtxt *const plane_ctxts = lr_ctxt->ctxt;
+  int plane;
+
+  for (plane = 0; plane < num_planes; ++plane) {
+    const int is_filtered =
+        cm->rst_info[plane].frame_restoration_type != RESTORE_NONE;
+    if (is_filtered) {
+      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+                                     &plane_ctxts[plane], cm->rst_tmpbuf,
+                                     cm->rlbs);
+    }
+  }
+}
+/* Applies loop restoration to a whole frame in three steps: initialise the
+ * context (av1_loop_restoration_filter_frame_init()), filter every
+ * restoration unit of every plane, then copy the filtered planes back
+ * (av1_loop_restoration_copy_planes()).
+ *
+ * lr_ctxt must point to an AV1LrStruct. Must not be called when
+ * cm->features.all_lossless is set (asserted).
+ */
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ void *lr_ctxt) {
+ assert(!cm->features.all_lossless);
+ const int num_planes = av1_num_planes(cm);
+
+ AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+ av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+ optimized_lr, num_planes);
+
+ foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+ av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
+}
+/* Visits every restoration unit in one row of units, left to right.
+ *
+ * Units are unit_size columns wide, except that when fewer than
+ * 1.5 * unit_size columns remain the last unit takes all of them. For each
+ * unit, limits->h_start / h_end are set, the sync-read hook(s) are called,
+ * on_rest_unit() is invoked with unit index
+ * row_number * hnum_rest_units + j, and finally on_sync_write() is called.
+ *
+ * With CONFIG_MULTITHREAD and more than one worker, the function returns
+ * early (before filtering the current unit) once lr_sync->lr_mt_exit is set.
+ * limits->v_start / v_end are expected to be set by the caller.
+ */
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, int plane_w,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+ sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+ struct aom_internal_error_info *error_info) {
+ const int ext_size = unit_size * 3 / 2;
+ int x0 = 0, j = 0;  // x0: left column of the current unit, j: its index in the row
+ while (x0 < plane_w) {
+ int remaining_w = plane_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : unit_size;
+
+ limits->h_start = x0;
+ limits->h_end = x0 + w;
+ assert(limits->h_end <= plane_w);
+
+ const int unit_idx = row_number * hnum_rest_units + j;
+
+ // No sync for even numbered rows
+ // For odd numbered rows, Loop Restoration of current block requires the LR
+ // of top-right and bottom-right blocks to be completed
+
+ // top-right sync
+ on_sync_read(lr_sync, row_number, j, plane);
+ if ((row_number + 1) < vnum_rest_units)
+ // bottom-right sync
+ on_sync_read(lr_sync, row_number + 2, j, plane);
+
+#if CONFIG_MULTITHREAD
+ if (lr_sync && lr_sync->num_workers > 1) {
+ pthread_mutex_lock(lr_sync->job_mutex);
+ const bool lr_mt_exit = lr_sync->lr_mt_exit;  // read under job_mutex
+ pthread_mutex_unlock(lr_sync->job_mutex);
+ // Exit in case any worker has encountered an error.
+ if (lr_mt_exit) return;
+ }
+#endif
+
+ on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info);
+
+ on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane);
+
+ x0 += w;
+ ++j;
+ }
+}
+
+// No-op sync-read hook for callers that need no row synchronisation.
+// Every argument is ignored.
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+  (void)lr_sync, (void)r, (void)c, (void)plane;
+}
+
+// No-op sync-write hook for callers that need no row synchronisation.
+// Every argument is ignored.
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane) {
+  (void)lr_sync, (void)r, (void)c, (void)sb_cols, (void)plane;
+}
+/* Visits every restoration unit of one plane, row by row, without any
+ * inter-row synchronisation (the dummy sync hooks and a NULL lr_sync are
+ * passed to av1_foreach_rest_unit_in_row()).
+ *
+ * Rows of units are unit_size tall, except that when fewer than
+ * 1.5 * unit_size rows remain the last row of units takes all of them. The
+ * vertical limits handed to the row visitor are shifted up by
+ * RESTORATION_UNIT_OFFSET >> ss_y, clamped at 0 at the top and left
+ * unshifted at the bottom of the plane.
+ */
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ const int hnum_rest_units = rsi->horz_units;
+ const int vnum_rest_units = rsi->vert_units;
+ const int unit_size = rsi->restoration_unit_size;
+
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int ext_size = unit_size * 3 / 2;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ int y0 = 0, i = 0;  // y0: top row of the current row of units, i: its index
+ while (y0 < plane_h) {
+ int remaining_h = plane_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+ RestorationTileLimits limits;
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+ av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size,
+ hnum_rest_units, vnum_rest_units, plane, priv,
+ tmpbuf, rlbs, av1_lr_sync_read_dummy,
+ av1_lr_sync_write_dummy, NULL, cm->error);
+
+ y0 += h;
+ ++i;
+ }
+}
+/* Computes the half-open range of restoration units,
+ * [*rcol0, *rcol1) x [*rrow0, *rrow1), associated with the superblock at
+ * (mi_row, mi_col) on the given plane.
+ *
+ * Returns 0 without touching the outputs when bsize is not the sequence's
+ * superblock size. Otherwise writes all four outputs and returns non-zero
+ * iff the range is non-empty. When superres scaling is active, the column
+ * computation additionally scales by cm->superres_scale_denominator /
+ * SCALE_NUMERATOR.
+ */
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1) {
+ assert(rcol0 && rcol1 && rrow0 && rrow1);
+
+ if (bsize != cm->seq_params->sb_size) return 0;
+
+ assert(!cm->features.all_lossless);
+
+ const int is_uv = plane > 0;
+
+ // Compute the mi-unit corners of the superblock
+ const int mi_row0 = mi_row;
+ const int mi_col0 = mi_col;
+ const int mi_row1 = mi_row0 + mi_size_high[bsize];
+ const int mi_col1 = mi_col0 + mi_size_wide[bsize];
+
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ const int size = rsi->restoration_unit_size;
+ const int horz_units = rsi->horz_units;
+ const int vert_units = rsi->vert_units;
+
+ // The size of an MI-unit on this plane of the image
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int mi_size_x = MI_SIZE >> ss_x;
+ const int mi_size_y = MI_SIZE >> ss_y;
+
+ // Write m for the relative mi column or row, D for the superres denominator
+ // and N for the superres numerator. If u is the upscaled pixel offset then
+ // we can write the downscaled pixel offset in two ways as:
+ //
+ // MI_SIZE * m = N / D u
+ //
+ // from which we get u = D * MI_SIZE * m / N
+ const int mi_to_num_x = av1_superres_scaled(cm)
+ ? mi_size_x * cm->superres_scale_denominator
+ : mi_size_x;
+ const int mi_to_num_y = mi_size_y;
+ const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
+ const int denom_y = size;
+
+ const int rnd_x = denom_x - 1;  // adding denom - 1 before dividing rounds the quotient up
+ const int rnd_y = denom_y - 1;
+
+ // rcol0/rrow0 should be the first column/row of restoration units that
+ // doesn't start left/below of mi_col/mi_row. For this calculation, we need
+ // to round up the division (if the sb starts at runit column 10.1, the first
+ // matching runit has column index 11)
+ *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x;
+ *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+ // rcol1/rrow1 is the equivalent calculation, but for the superblock
+ // below-right. If we're at the bottom or right of the frame, this restoration
+ // unit might not exist, in which case we'll clamp accordingly.
+ *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+ *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+ return *rcol0 < *rcol1 && *rrow0 < *rrow1;
+}
+
+// Extend to left and right: for each of `height` rows, replicate the first
+// pixel into the `extend` pixels before the row and the last pixel into the
+// `extend` pixels after it. `width` and `extend` are counted in pixels;
+// `stride` is added to the byte pointer, so it is a byte stride even when
+// the rows hold 16-bit pixels.
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+                         int extend, int use_highbitdepth) {
+  uint8_t *row = buf;
+  for (int y = 0; y < height; ++y, row += stride) {
+    if (!use_highbitdepth) {
+      memset(row - extend, row[0], extend);
+      memset(row + width, row[width - 1], extend);
+      continue;
+    }
+    uint16_t *row16 = (uint16_t *)row;
+    aom_memset16(row16 - extend, row16[0], extend);
+    aom_memset16(row16 + width, row16[width - 1], extend);
+  }
+}
+/* Saves up to RESTORATION_CTX_VERT rows of `frame`, starting at `row`, into
+ * the boundary slot of `stripe`: the "above" buffer when is_above is
+ * non-zero, the "below" buffer otherwise.
+ *
+ * When superres scaling is active the rows are produced by
+ * av1_upscale_normative_rows(); otherwise they are copied with memcpy().
+ * If only one row is available before the crop height, it is duplicated
+ * into the second slot. Finally the saved rows are extended to the left and
+ * right by RESTORATION_EXTRA_HORZ pixels.
+ */
+static void save_deblock_boundary_lines(
+ const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+ int stripe, int use_highbd, int is_above,
+ RestorationStripeBoundaries *boundaries) {
+ const int is_uv = plane > 0;
+ const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+ const int src_stride = frame->strides[is_uv] << use_highbd;  // shifted by use_highbd: used for byte-pointer arithmetic
+ const uint8_t *src_rows = src_buf + row * src_stride;
+
+ uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+ : boundaries->stripe_boundary_below;
+ uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);  // skip the left extension area
+ const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+ uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+ // There is a rare case in which a processing stripe can end 1px above the
+ // crop border. In this case, we do want to use deblocked pixels from below
+ // the stripe (hence why we ended up in this function), but instead of
+ // fetching 2 "below" rows we need to fetch one and duplicate it.
+ // This is equivalent to clamping the sample locations against the crop border
+ const int lines_to_save =
+ AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
+ assert(lines_to_save == 1 || lines_to_save == 2);
+
+ int upscaled_width;
+ int line_bytes;
+ if (av1_superres_scaled(cm)) {
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
+ line_bytes = upscaled_width << use_highbd;
+ if (use_highbd)
+ av1_upscale_normative_rows(
+ cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
+ CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
+ plane, lines_to_save);
+ else
+ av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
+ boundaries->stripe_boundary_stride, plane,
+ lines_to_save);
+ } else {
+ upscaled_width = frame->crop_widths[is_uv];
+ line_bytes = upscaled_width << use_highbd;
+ for (int i = 0; i < lines_to_save; i++) {
+ memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
+ line_bytes);
+ }
+ }
+ // If we only saved one line, then copy it into the second line buffer
+ if (lines_to_save == 1)
+ memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
+
+ extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+ RESTORATION_EXTRA_HORZ, use_highbd);
+}
+/* Saves the single frame row `row` into all RESTORATION_CTX_VERT context
+ * rows of the boundary slot of `stripe`: the "above" buffer when is_above
+ * is non-zero, the "below" buffer otherwise. The saved rows are then
+ * extended to the left and right by RESTORATION_EXTRA_HORZ pixels.
+ *
+ * The copied width is the superres-upscaled plane width when superres
+ * scaling is active, and the plane's crop width otherwise.
+ */
+static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ const AV1_COMMON *cm, int plane, int row,
+ int stripe, int use_highbd, int is_above,
+ RestorationStripeBoundaries *boundaries) {
+ const int is_uv = plane > 0;
+ const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+ const int src_stride = frame->strides[is_uv] << use_highbd;  // shifted by use_highbd: used for byte-pointer arithmetic
+ const uint8_t *src_rows = src_buf + row * src_stride;
+
+ uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+ : boundaries->stripe_boundary_below;
+ uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);  // skip the left extension area
+ const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+ uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+ const int src_width = frame->crop_widths[is_uv];
+
+ // At the point where this function is called, we've already applied
+ // superres. So we don't need to extend the lines here, we can just
+ // pull directly from the topmost row of the upscaled frame.
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int upscaled_width = av1_superres_scaled(cm)
+ ? (cm->superres_upscaled_width + ss_x) >> ss_x
+ : src_width;
+ const int line_bytes = upscaled_width << use_highbd;
+ for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
+ // Copy the line at 'src_rows' into both context lines
+ memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
+ }
+ extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+ RESTORATION_EXTRA_HORZ, use_highbd);
+}
+/* Saves the stripe boundary rows of one plane into
+ * cm->rst_info[plane].boundaries.
+ *
+ * Stripes are stripe_height rows tall and shifted up by stripe_off rows, so
+ * the first stripe starts at row 0 and is shorter than the others.
+ *   - after_cdef == 0: rows of `frame` are saved with
+ *     save_deblock_boundary_lines() above every stripe except the first,
+ *     and below every stripe that ends before plane_height.
+ *   - after_cdef != 0: rows are saved with save_cdef_boundary_lines() in
+ *     exactly the complementary cases (above the first stripe, below
+ *     stripes that do not end before plane_height).
+ */
+static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd,
+ int plane, AV1_COMMON *cm, int after_cdef) {
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+ const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+ int stripe_idx;
+ for (stripe_idx = 0;; ++stripe_idx) {  // terminates via the break below
+ const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off);
+ const int y0 = rel_y0;
+ if (y0 >= plane_h) break;
+
+ const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off;
+ const int y1 = AOMMIN(rel_y1, plane_h);
+
+ // Extend using CDEF pixels at the top and bottom of the frame,
+ // and deblocked pixels at internal stripe boundaries
+ const int use_deblock_above = (stripe_idx > 0);
+ const int use_deblock_below = (y1 < plane_height);
+
+ if (!after_cdef) {
+ // Save deblocked context at internal stripe boundaries
+ if (use_deblock_above) {
+ save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+ stripe_idx, use_highbd, 1, boundaries);
+ }
+ if (use_deblock_below) {
+ save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx,
+ use_highbd, 0, boundaries);
+ }
+ } else {
+ // Save CDEF context at frame boundaries
+ if (!use_deblock_above) {
+ save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd,
+ 1, boundaries);
+ }
+ if (!use_deblock_below) {
+ save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx,
+ use_highbd, 0, boundaries);
+ }
+ }
+ }
+}
+
+// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe of every plane, save
+// the scan lines that loop restoration uses as context above and below the
+// stripe (RESTORATION_CTX_VERT lines on each side). The lines are stored in
+// cm->rst_info[plane].boundaries by save_boundary_lines(); after_cdef is
+// forwarded to it and selects which boundaries are saved on this call.
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                              AV1_COMMON *cm, int after_cdef) {
+  const int plane_count = av1_num_planes(cm);
+  const int use_highbd = cm->seq_params->use_highbitdepth;
+  int plane = 0;
+  while (plane < plane_count) {
+    save_boundary_lines(frame, use_highbd, plane, cm, after_cdef);
+    ++plane;
+  }
+}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
new file mode 100644
index 0000000000..644e06980f
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.h
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @file */
+
+/*!\cond */
+
+// Border for Loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_PROC_UNIT_SIZE 64
+
+// Filter stripe grid offset upwards compared to the superblock grid
+#define RESTORATION_UNIT_OFFSET 8
+
+#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 2 // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+
+// Pad up to 20 more (much less may actually be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS \
+ ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+ RESTORATION_PADDING) * \
+ (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+ RESTORATION_PADDING))
+
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX \
+ ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+ RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+ (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
+
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side.
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
+
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
+// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN_REDUCED (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+#define WIENER_STATS_DOWNSAMPLE_FACTOR 4
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+#define WIENER_FILT_TAP3_MIDV \
+ (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+ WIENER_FILT_TAP2_MIDV))
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+ (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+ (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+ (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+ (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+ (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+ (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE and WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+
+typedef struct {
+ int r[2]; // radii
+ int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
+} sgr_params_type;
+/*!\endcond */
+
+/*!\brief Parameters related to Restoration Unit Info */
+typedef struct {
+ /*!
+ * restoration type
+ */
+ RestorationType restoration_type;
+
+ /*!
+ * Wiener filter parameters if restoration_type indicates Wiener
+ */
+ WienerInfo wiener_info;
+
+ /*!
+ * Sgrproj filter parameters if restoration_type indicates Sgrproj
+ */
+ SgrprojInfo sgrproj_info;
+} RestorationUnitInfo;
+
+/*!\cond */
+
+// One row of a restoration line buffer holds RESTORATION_UNITSIZE_MAX * 3 / 2
+// entries plus a horizontal margin of RESTORATION_EXTRA_HORZ on each side.
+#define RESTORATION_LINEBUFFER_WIDTH \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
+
+typedef struct {
+ // Temporary buffers to save/restore 3 lines above/below the restoration
+ // stripe.
+ uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+ uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+} RestorationLineBuffers;
+/*!\endcond */
+
+/*!\brief Parameters related to Restoration Stripe boundaries */
+typedef struct {
+ /*!
+ * stripe boundary above
+ */
+ uint8_t *stripe_boundary_above;
+
+ /*!
+ * stripe boundary below
+ */
+ uint8_t *stripe_boundary_below;
+
+ /*!
+ * strides for stripe boundaries above and below
+ */
+ int stripe_boundary_stride;
+
+ /*!
+ * size of stripe boundaries above and below
+ */
+ int stripe_boundary_size;
+} RestorationStripeBoundaries;
+
+/*!\brief Parameters related to Restoration Info */
+typedef struct {
+ /*!
+ * Restoration type for frame
+ */
+ RestorationType frame_restoration_type;
+
+ /*!
+ * Restoration unit size
+ */
+ int restoration_unit_size;
+
+ /**
+ * \name Fields allocated and initialised by av1_alloc_restoration_struct.
+ */
+ /**@{*/
+ /*!
+ * Total number of restoration units in this plane
+ */
+ int num_rest_units;
+
+ /*!
+ * Number of vertical restoration units in this plane
+ */
+ int vert_units;
+
+ /*!
+ * Number of horizontal restoration units in this plane
+ */
+ int horz_units;
+ /**@}*/
+
+ /*!
+ * Parameters for each restoration unit in this plane
+ */
+ RestorationUnitInfo *unit_info;
+
+ /*!
+ * Restoration Stripe boundary info
+ */
+ RestorationStripeBoundaries boundaries;
+
+ /*!
+ * Whether optimized lr can be used for speed.
+ * That includes cases of no cdef and no superres, or if fast trial runs
+ * are used on the encoder side.
+ */
+ int optimized_lr;
+} RestorationInfo;
+
+/*!\cond */
+
+static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {  // resets both xqd entries to the middle of their ranges
+ sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;  // (-96 + 31) / 2 == -32 (C division truncates toward zero)
+ sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;  // (-32 + 95) / 2 == 31
+}
+
+static INLINE void set_default_wiener(WienerInfo *wiener_info) {  // writes the same 7 default taps to vfilter and hfilter
+ wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;  // 3
+ wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;  // -7
+ wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;  // 15
+ wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 *  // centre tap: -2 * (15 - 7 + 3) == -22, i.e. WIENER_FILT_TAP3_MIDV - WIENER_FILT_STEP, so the 7 stored taps sum to 0
+ (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
+ wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;  // taps 4..6 mirror taps 2..0
+ wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
+}
+
+typedef struct {  // passed as `limits` to av1_loop_restoration_filter_unit and to rest_unit_visitor_t callbacks
+ int h_start, h_end, v_start, v_end;  // NOTE(review): presumably horizontal / vertical start and end of a unit; whether the end is exclusive is not visible here — confirm
+} RestorationTileLimits;
+
+typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
+ int rest_unit_idx, void *priv,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info);
+
+typedef struct FilterFrameCtxt {  // AV1LrStruct stores MAX_MB_PLANE of these
+ const RestorationInfo *rsi;  // pointer-to-const: not modified through this field
+ int ss_x, ss_y;  // NOTE(review): presumably the plane's subsampling, matching the ss_x / ss_y parameters of av1_loop_restoration_filter_unit — confirm
+ int plane_w, plane_h;  // NOTE(review): presumably the plane's width / height — confirm
+ int highbd, bit_depth;
+ uint8_t *data8, *dst8;  // NOTE(review): presumably source and destination frame data — confirm
+ int data_stride, dst_stride;  // strides paired by name with data8 and dst8
+} FilterFrameCtxt;
+
+typedef struct AV1LrStruct {
+ rest_unit_visitor_t on_rest_unit;  // callback with the rest_unit_visitor_t signature
+ FilterFrameCtxt ctxt[MAX_MB_PLANE];  // one FilterFrameCtxt slot per plane index
+ YV12_BUFFER_CONFIG *frame;
+ YV12_BUFFER_CONFIG *dst;  // NOTE(review): presumably the buffer receiving filtered output — confirm
+} AV1LrStruct;
+
+extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
+extern const int32_t av1_x_by_xplus1[256];
+extern const int32_t av1_one_by_x[MAX_NELEM];
+
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+ int is_uv);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd);
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+/*!\endcond */
+
+/*!\brief Function for applying loop restoration filter to a single unit.
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a single
+ * loop restoration unit.
+ *
+ * \param[in] limits Limits of the unit
+ * \param[in] rui The parameters to use for this unit and its
+ * coefficients
+ * \param[in] rsb Deblocked pixels to use for stripe boundaries
+ * \param[in] rlbs Space to use as a scratch buffer
+ * \param[in] ss_x Horizontal subsampling for plane
+ * \param[in] ss_y Vertical subsampling for plane
+ * \param[in] plane_w Width of the current plane
+ * \param[in] plane_h Height of the current plane
+ * \param[in] highbd Whether high bitdepth pipeline is used
+ * \param[in] bit_depth Bit-depth of the video
+ * \param[in] data8 Frame data (pointing at the top-left corner of
+ * the frame, not the restoration unit).
+ * \param[in] stride Stride of \c data8
+ * \param[out] dst8 Buffer where the results will be written. Like
+ * \c data8, \c dst8 should point at the top-left
+ * corner of the frame
+ * \param[in] dst_stride Stride of \c dst8
+ * \param[in] tmpbuf Scratch buffer used by the sgrproj filter
+ * which should be at least SGRPROJ_TMPBUF_SIZE
+ * big.
+ * \param[in] optimized_lr Whether to use fast optimized Loop Restoration
+ * \param[in,out] error_info Error info for reporting errors
+ *
+ * \remark Nothing is returned. Instead, the filtered unit is output in
+ * \c dst8 at the proper restoration unit offset.
+ */
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth,
+ uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf,
+ int optimized_lr, struct aom_internal_error_info *error_info);
+
+/*!\brief Function for applying loop restoration filter to a frame
+ *
+ * \ingroup in_loop_restoration
+ * This function applies the loop restoration filter to a frame.
+ *
+ * \param[in,out] frame Compressed frame buffer
+ * \param[in,out] cm Pointer to top level common structure
+ * \param[in] optimized_lr Whether to use fast optimized Loop Restoration
+ * \param[in] lr_ctxt Loop restoration context
+ *
+ * \remark Nothing is returned. Instead, the filtered frame is output in
+ * \c frame.
+ */
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm, int optimized_lr,
+ void *lr_ctxt);
+/*!\cond */
+
+void av1_loop_restoration_precal(void);
+
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs);
+
+// Return 1 iff the block at mi_row, mi_col with size bsize is a
+// top-level superblock containing the top-left corner of at least one
+// loop restoration unit.
+//
+// If the block is a top-level superblock, the function writes to
+// *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all
+// restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1)
+// are signaled in this superblock.
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1);
+
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int after_cdef);
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, int num_planes);
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ struct AV1Common *cm, int num_planes);
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, int plane_w,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int hnum_rest_units, int vnum_rest_units, int plane, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
+ sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync,
+ struct aom_internal_error_info *error_info);
+
+void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv,
+ int *plane_w, int *plane_h);
+int av1_lr_count_units(int unit_size, int plane_size);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c
new file mode 100644
index 0000000000..d7c6a24378
--- /dev/null
+++ b/third_party/aom/av1/common/scale.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/scale.h"
+#include "aom_dsp/aom_filter.h"
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {  // other_size / this_size as a fixed-point ratio with REF_SCALE_SHIFT fractional bits
+ // Calculate scaling factor once for each reference frame
+ // and use fixed point scaling factors in decoding and encoding routines.
+ // Hardware implementations can calculate scale factor in device driver
+ // and use multiplication and shifting on hardware instead of division.
+ return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;  // adding this_size / 2 first rounds the quotient; the shift is done in int and this_size == 0 is not checked here
+}
+
+// Given the fixed point scale, calculate coarse point scale.
+static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
+ return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);  // reduces precision by (REF_SCALE_SHIFT - SCALE_SUBPEL_BITS) bits; rounding behaviour is whatever ROUND_POWER_OF_TWO (defined elsewhere) provides
+}
+
+// Note: x and y are integer precision, mvq4 is q4 precision.
+MV32 av1_scale_mv(const MV *mvq4, int x, int y,
+ const struct scale_factors *sf) {
+ const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf);  // scaled position of (x, y) with no motion vector applied
+ const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf);
+ const MV32 res = {  // each component is scaled(position + mv) - scaled(position)
+ av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,  // first member: built from y and mvq4->row
+ av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4  // second member: built from x and mvq4->col
+ };
+ return res;
+}
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h) {
+ if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {  // size ratio outside the allowed range
+ sf->x_scale_fp = REF_INVALID_SCALE;  // av1_is_valid_scale() tests for exactly this value
+ sf->y_scale_fp = REF_INVALID_SCALE;
+ return;  // x_step_q4 and y_step_q4 are not written on this path
+ }
+
+ sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);  // horizontal ratio other / this
+ sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);  // vertical ratio other / this
+
+ sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);  // lower-precision copies of the two ratios
+ sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
+}
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
new file mode 100644
index 0000000000..d8481bfc2c
--- /dev/null
+++ b/third_party/aom/av1/common/scale.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
+
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCALE_NUMERATOR 8
+
+#define REF_SCALE_SHIFT 14
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
+#define REF_INVALID_SCALE -1
+
+struct scale_factors {
+ int x_scale_fp; // horizontal fixed point scale factor (REF_INVALID_SCALE when unusable)
+ int y_scale_fp; // vertical fixed point scale factor (REF_INVALID_SCALE when unusable)
+ int x_step_q4;  // coarse-precision value derived from x_scale_fp by av1_setup_scale_factors_for_frame()
+ int y_step_q4;  // coarse-precision value derived from y_scale_fp by av1_setup_scale_factors_for_frame()
+};
+
+// Note: Expect val to be in q4 precision
+static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) {
+ const int off =  // evaluates to 0 when x_scale_fp == REF_NO_SCALE
+ (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->x_scale_fp + off;  // val is widened first, so the product is computed in 64 bits
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);  // assuming the macro is a rounding right shift, the result keeps SCALE_EXTRA_BITS more fractional bits than val
+}
+
+// Note: Expect val to be in q4 precision
+static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) {
+ const int off =  // evaluates to 0 when y_scale_fp == REF_NO_SCALE
+ (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->y_scale_fp + off;  // val is widened first, so the product is computed in 64 bits
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);  // assuming the macro is a rounding right shift, the result keeps SCALE_EXTRA_BITS more fractional bits than val
+}
+
+// Note: Expect val to be in q4 precision
+static INLINE int av1_unscaled_value(int val, const struct scale_factors *sf) {
+ (void)sf;  // sf is not used by this function
+ return val * (1 << SCALE_EXTRA_BITS);  // multiplies by 2^SCALE_EXTRA_BITS; unlike <<, multiplication is well defined for negative val
+}
+
+MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h);
+
+static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {  // nonzero iff neither scale factor is REF_INVALID_SCALE
+ assert(sf != NULL);
+ return sf->x_scale_fp != REF_INVALID_SCALE &&
+ sf->y_scale_fp != REF_INVALID_SCALE;
+}
+
+static INLINE int av1_is_scaled(const struct scale_factors *sf) {  // nonzero iff sf is valid and at least one factor differs from REF_NO_SCALE
+ assert(sf != NULL);
+ return av1_is_valid_scale(sf) &&
+ (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+// See AV1 spec, Section 6.8.6. Frame size with refs semantics.
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+ int this_width, int this_height) {
+ return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&  // reference is at most 2x this frame in each dimension
+ this_width <= 16 * ref_width && this_height <= 16 * ref_height;  // this frame is at most 16x the reference in each dimension
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c
new file mode 100644
index 0000000000..0943579db1
--- /dev/null
+++ b/third_party/aom/av1/common/scan.c
@@ -0,0 +1,2038 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {  // permutation of 0..15; runs alternate between steps of -3 and +3 (4,1 / 2,5,8 / 12,9,6,3 / 7,10,13 / 14,11)
+ 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {  // entry[i] == 4 * (i % 4) + i / 4
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {  // permutation of 0..31 made of runs whose consecutive entries step by -7 (e.g. 24, 17, 10, 3)
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {  // entry[i] == 8 * (i % 4) + i / 4
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {  // permutation of 0..31 made of runs whose consecutive entries step by +3 (e.g. 3, 6, 9, 12)
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {  // entry[i] == 4 * (i % 8) + i / 8
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {  // permutation of 0..63 made of runs whose consecutive entries step by -15 (e.g. 48, 33, 18, 3)
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {  // permutation of 0..63 made of runs whose consecutive entries step by +3 (e.g. 3, 6, 9, 12)
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {  // entry[i] == 16 * (i % 4) + i / 4
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {  // entry[i] == 4 * (i % 16) + i / 16
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {  // permutation of 0..255 made of runs whose consecutive entries step by -31 (e.g. 96, 65, 34, 3)
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = {  // permutation of 0..255 made of runs whose consecutive entries step by +7 (e.g. 3, 10, 17, 24)
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = {  // entry[i] == 32 * (i % 8) + i / 8
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = {  // entry[i] == 8 * (i % 32) + i / 32
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = {  // identity order: entry[i] == i
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+ 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64,
+ 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97,
+ 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56,
+ 41, 26, 11, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, 13,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, 30,
+ 15, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47,
+ 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 244, 229,
+ 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, 125, 140, 155,
+ 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, 141, 126,
+ 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, 188,
+ 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
+ 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+ 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66,
+ 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130,
+ 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 8, 39, 70,
+ 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102,
+ 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289,
+ 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11,
+ 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384,
+ 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44,
+ 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355,
+ 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201,
+ 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202,
+ 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482,
+ 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79,
+ 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328,
+ 359, 390, 421, 452, 483, 514, 545, 576, 608, 577, 546, 515, 484,
+ 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81,
+ 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330,
+ 361, 392, 423, 454, 485, 516, 547, 578, 609, 640, 672, 641, 610,
+ 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207,
+ 176, 145, 114, 83, 52, 21, 22, 53, 84, 115, 146, 177, 208,
+ 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611,
+ 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457,
+ 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54,
+ 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365,
+ 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768,
+ 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428,
+ 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25,
+ 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398,
+ 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801,
+ 832, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523,
+ 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120,
+ 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307,
+ 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710,
+ 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742,
+ 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339,
+ 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92,
+ 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495,
+ 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898,
+ 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+ 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279,
+ 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187,
+ 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590,
+ 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993,
+ 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622,
+ 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219,
+ 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375,
+ 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, 716, 747, 778,
+ 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841,
+ 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438,
+ 407, 376, 345, 314, 283, 252, 221, 190, 159, 191, 222, 253, 284,
+ 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687,
+ 718, 749, 780, 811, 842, 873, 904, 935, 966, 997, 998, 967, 936,
+ 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533,
+ 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317,
+ 348, 379, 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720,
+ 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907,
+ 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504,
+ 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474,
+ 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877,
+ 908, 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754,
+ 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351,
+ 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, 693, 724, 755,
+ 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880,
+ 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
+ 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757,
+ 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882,
+ 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479,
+ 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883,
+ 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791, 760,
+ 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730,
+ 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917,
+ 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 639, 670, 701,
+ 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950,
+ 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734, 765, 796,
+ 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859,
+ 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015,
+ 1016, 985, 954, 923, 892, 861, 830, 799, 831, 862, 893, 924, 955,
+ 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019,
+ 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
+ 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28,
+ 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57,
+ 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62,
+ 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34,
+ 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50,
+ 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91,
+ 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211,
+ 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73,
+ 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193,
+ 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64,
+ 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184,
+ 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55,
+ 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175,
+ 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46,
+ 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166,
+ 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28,
+ 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148,
+ 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29,
+ 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38,
+ 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47,
+ 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56,
+ 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65,
+ 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74,
+ 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83,
+ 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140,
+ 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149,
+ 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158,
+ 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167,
+ 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176,
+ 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185,
+ 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194,
+ 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203,
+ 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250,
+ 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99,
+ 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106,
+ 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112,
+ 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117,
+ 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121,
+ 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124,
+ 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126,
+ 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
+ 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
+ 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68,
+ 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84,
+ 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100,
+ 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113,
+ 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122,
+ 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119,
+ 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
+ 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103,
+ 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342,
+ 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88,
+ 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325,
+ 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74,
+ 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308,
+ 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61,
+ 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291,
+ 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49,
+ 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274,
+ 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38,
+ 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257,
+ 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28,
+ 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465,
+ 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223,
+ 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453,
+ 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206,
+ 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440,
+ 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189,
+ 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426,
+ 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172,
+ 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411,
+ 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155,
+ 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395,
+ 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138,
+ 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378,
+ 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121,
+ 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361,
+ 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510,
+ 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+ 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506,
+ 509, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106,
+ 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107,
+ 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108,
+ 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109,
+ 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110,
+ 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111,
+ 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112,
+ 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113,
+ 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114,
+ 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115,
+ 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116,
+ 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117,
+ 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118,
+ 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119,
+ 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344,
+ 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345,
+ 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346,
+ 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347,
+ 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348,
+ 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349,
+ 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350,
+ 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351,
+ 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352,
+ 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353,
+ 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354,
+ 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355,
+ 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356,
+ 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357,
+ 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358,
+ 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359,
+ 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506,
+ 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507,
+ 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, 104, 105,
+ 135, 1, 4, 8, 11, 19, 22, 34, 37, 53, 56, 76, 79, 103, 106,
+ 134, 136, 5, 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107,
+ 133, 137, 164, 6, 13, 17, 24, 32, 39, 51, 58, 74, 81, 101, 108,
+ 132, 138, 163, 165, 14, 16, 25, 31, 40, 50, 59, 73, 82, 100, 109,
+ 131, 139, 162, 166, 189, 15, 26, 30, 41, 49, 60, 72, 83, 99, 110,
+ 130, 140, 161, 167, 188, 190, 27, 29, 42, 48, 61, 71, 84, 98, 111,
+ 129, 141, 160, 168, 187, 191, 210, 28, 43, 47, 62, 70, 85, 97, 112,
+ 128, 142, 159, 169, 186, 192, 209, 211, 44, 46, 63, 69, 86, 96, 113,
+ 127, 143, 158, 170, 185, 193, 208, 212, 227, 45, 64, 68, 87, 95, 114,
+ 126, 144, 157, 171, 184, 194, 207, 213, 226, 228, 65, 67, 88, 94, 115,
+ 125, 145, 156, 172, 183, 195, 206, 214, 225, 229, 240, 66, 89, 93, 116,
+ 124, 146, 155, 173, 182, 196, 205, 215, 224, 230, 239, 241, 90, 92, 117,
+ 123, 147, 154, 174, 181, 197, 204, 216, 223, 231, 238, 242, 249, 91, 118,
+ 122, 148, 153, 175, 180, 198, 203, 217, 222, 232, 237, 243, 248, 250, 119,
+ 121, 149, 152, 176, 179, 199, 202, 218, 221, 233, 236, 244, 247, 251, 254,
+ 120, 150, 151, 177, 178, 200, 201, 219, 220, 234, 235, 245, 246, 252, 253,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78,
+ 104, 105, 135, 136, 170, 171, 209, 210, 252, 253, 299, 300, 350,
+ 351, 405, 406, 464, 465, 527, 1, 4, 8, 11, 19, 22, 34,
+ 37, 53, 56, 76, 79, 103, 106, 134, 137, 169, 172, 208, 211,
+ 251, 254, 298, 301, 349, 352, 404, 407, 463, 466, 526, 528, 5,
+ 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, 133,
+ 138, 168, 173, 207, 212, 250, 255, 297, 302, 348, 353, 403, 408,
+ 462, 467, 525, 529, 588, 6, 13, 17, 24, 32, 39, 51, 58,
+ 74, 81, 101, 108, 132, 139, 167, 174, 206, 213, 249, 256, 296,
+ 303, 347, 354, 402, 409, 461, 468, 524, 530, 587, 589, 14, 16,
+ 25, 31, 40, 50, 59, 73, 82, 100, 109, 131, 140, 166, 175,
+ 205, 214, 248, 257, 295, 304, 346, 355, 401, 410, 460, 469, 523,
+ 531, 586, 590, 645, 15, 26, 30, 41, 49, 60, 72, 83, 99,
+ 110, 130, 141, 165, 176, 204, 215, 247, 258, 294, 305, 345, 356,
+ 400, 411, 459, 470, 522, 532, 585, 591, 644, 646, 27, 29, 42,
+ 48, 61, 71, 84, 98, 111, 129, 142, 164, 177, 203, 216, 246,
+ 259, 293, 306, 344, 357, 399, 412, 458, 471, 521, 533, 584, 592,
+ 643, 647, 698, 28, 43, 47, 62, 70, 85, 97, 112, 128, 143,
+ 163, 178, 202, 217, 245, 260, 292, 307, 343, 358, 398, 413, 457,
+ 472, 520, 534, 583, 593, 642, 648, 697, 699, 44, 46, 63, 69,
+ 86, 96, 113, 127, 144, 162, 179, 201, 218, 244, 261, 291, 308,
+ 342, 359, 397, 414, 456, 473, 519, 535, 582, 594, 641, 649, 696,
+ 700, 747, 45, 64, 68, 87, 95, 114, 126, 145, 161, 180, 200,
+ 219, 243, 262, 290, 309, 341, 360, 396, 415, 455, 474, 518, 536,
+ 581, 595, 640, 650, 695, 701, 746, 748, 65, 67, 88, 94, 115,
+ 125, 146, 160, 181, 199, 220, 242, 263, 289, 310, 340, 361, 395,
+ 416, 454, 475, 517, 537, 580, 596, 639, 651, 694, 702, 745, 749,
+ 792, 66, 89, 93, 116, 124, 147, 159, 182, 198, 221, 241, 264,
+ 288, 311, 339, 362, 394, 417, 453, 476, 516, 538, 579, 597, 638,
+ 652, 693, 703, 744, 750, 791, 793, 90, 92, 117, 123, 148, 158,
+ 183, 197, 222, 240, 265, 287, 312, 338, 363, 393, 418, 452, 477,
+ 515, 539, 578, 598, 637, 653, 692, 704, 743, 751, 790, 794, 833,
+ 91, 118, 122, 149, 157, 184, 196, 223, 239, 266, 286, 313, 337,
+ 364, 392, 419, 451, 478, 514, 540, 577, 599, 636, 654, 691, 705,
+ 742, 752, 789, 795, 832, 834, 119, 121, 150, 156, 185, 195, 224,
+ 238, 267, 285, 314, 336, 365, 391, 420, 450, 479, 513, 541, 576,
+ 600, 635, 655, 690, 706, 741, 753, 788, 796, 831, 835, 870, 120,
+ 151, 155, 186, 194, 225, 237, 268, 284, 315, 335, 366, 390, 421,
+ 449, 480, 512, 542, 575, 601, 634, 656, 689, 707, 740, 754, 787,
+ 797, 830, 836, 869, 871, 152, 154, 187, 193, 226, 236, 269, 283,
+ 316, 334, 367, 389, 422, 448, 481, 511, 543, 574, 602, 633, 657,
+ 688, 708, 739, 755, 786, 798, 829, 837, 868, 872, 903, 153, 188,
+ 192, 227, 235, 270, 282, 317, 333, 368, 388, 423, 447, 482, 510,
+ 544, 573, 603, 632, 658, 687, 709, 738, 756, 785, 799, 828, 838,
+ 867, 873, 902, 904, 189, 191, 228, 234, 271, 281, 318, 332, 369,
+ 387, 424, 446, 483, 509, 545, 572, 604, 631, 659, 686, 710, 737,
+ 757, 784, 800, 827, 839, 866, 874, 901, 905, 932, 190, 229, 233,
+ 272, 280, 319, 331, 370, 386, 425, 445, 484, 508, 546, 571, 605,
+ 630, 660, 685, 711, 736, 758, 783, 801, 826, 840, 865, 875, 900,
+ 906, 931, 933, 230, 232, 273, 279, 320, 330, 371, 385, 426, 444,
+ 485, 507, 547, 570, 606, 629, 661, 684, 712, 735, 759, 782, 802,
+ 825, 841, 864, 876, 899, 907, 930, 934, 957, 231, 274, 278, 321,
+ 329, 372, 384, 427, 443, 486, 506, 548, 569, 607, 628, 662, 683,
+ 713, 734, 760, 781, 803, 824, 842, 863, 877, 898, 908, 929, 935,
+ 956, 958, 275, 277, 322, 328, 373, 383, 428, 442, 487, 505, 549,
+ 568, 608, 627, 663, 682, 714, 733, 761, 780, 804, 823, 843, 862,
+ 878, 897, 909, 928, 936, 955, 959, 978, 276, 323, 327, 374, 382,
+ 429, 441, 488, 504, 550, 567, 609, 626, 664, 681, 715, 732, 762,
+ 779, 805, 822, 844, 861, 879, 896, 910, 927, 937, 954, 960, 977,
+ 979, 324, 326, 375, 381, 430, 440, 489, 503, 551, 566, 610, 625,
+ 665, 680, 716, 731, 763, 778, 806, 821, 845, 860, 880, 895, 911,
+ 926, 938, 953, 961, 976, 980, 995, 325, 376, 380, 431, 439, 490,
+ 502, 552, 565, 611, 624, 666, 679, 717, 730, 764, 777, 807, 820,
+ 846, 859, 881, 894, 912, 925, 939, 952, 962, 975, 981, 994, 996,
+ 377, 379, 432, 438, 491, 501, 553, 564, 612, 623, 667, 678, 718,
+ 729, 765, 776, 808, 819, 847, 858, 882, 893, 913, 924, 940, 951,
+ 963, 974, 982, 993, 997, 1008, 378, 433, 437, 492, 500, 554, 563,
+ 613, 622, 668, 677, 719, 728, 766, 775, 809, 818, 848, 857, 883,
+ 892, 914, 923, 941, 950, 964, 973, 983, 992, 998, 1007, 1009, 434,
+ 436, 493, 499, 555, 562, 614, 621, 669, 676, 720, 727, 767, 774,
+ 810, 817, 849, 856, 884, 891, 915, 922, 942, 949, 965, 972, 984,
+ 991, 999, 1006, 1010, 1017, 435, 494, 498, 556, 561, 615, 620, 670,
+ 675, 721, 726, 768, 773, 811, 816, 850, 855, 885, 890, 916, 921,
+ 943, 948, 966, 971, 985, 990, 1000, 1005, 1011, 1016, 1018, 495, 497,
+ 557, 560, 616, 619, 671, 674, 722, 725, 769, 772, 812, 815, 851,
+ 854, 886, 889, 917, 920, 944, 947, 967, 970, 986, 989, 1001, 1004,
+ 1012, 1015, 1019, 1022, 496, 558, 559, 617, 618, 672, 673, 723, 724,
+ 770, 771, 813, 814, 852, 853, 887, 888, 918, 919, 945, 946, 968,
+ 969, 987, 988, 1002, 1003, 1013, 1014, 1020, 1021, 1023,
+};
+
+const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { default_scan_4x4, av1_default_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4 },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4 },
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { default_scan_8x8, av1_default_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8 },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8 },
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { default_scan_16x16, av1_default_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16 },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16 },
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ },
+ {
+ // TX_64X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ },
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { default_scan_4x8, av1_default_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8 },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8 },
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { default_scan_8x4, av1_default_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4 },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4 },
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { default_scan_8x16, av1_default_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16 },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16 },
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { default_scan_16x8, av1_default_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8 },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8 },
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ },
+ {
+ // TX_32X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ },
+ {
+ // TX_64X32
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { default_scan_32x32, av1_default_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32 },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32 },
+ },
+ {
+ // TX_4X16
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { default_scan_4x16, av1_default_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16 },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16 },
+ },
+ {
+ // TX_16X4
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { default_scan_16x4, av1_default_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4 },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4 },
+ },
+ {
+ // TX_8X32
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { default_scan_8x32, av1_default_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32 },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32 },
+ },
+ {
+ // TX_32X8
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { default_scan_32x8, av1_default_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8 },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8 },
+ },
+ {
+ // TX_16X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So the 16x32 scan order is used.
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { default_scan_16x32, av1_default_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32 },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32 },
+ },
+ {
+ // TX_64X16
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So the 32x16 scan order is used.
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { default_scan_32x16, av1_default_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16 },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16 },
+ },
+};
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
new file mode 100644
index 0000000000..4f369786f2
--- /dev/null
+++ b/third_party/aom/av1/common/scan.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+// Identifiers for coefficient scan modes. The enumerators take consecutive
+// values starting at 0, so the trailing SCAN_MODES entry equals the number of
+// modes listed before it.
+// NOTE(review): the traversal pattern behind each name is not visible in this
+// header -- confirm against the scan tables before relying on the names.
+enum {
+ SCAN_MODE_ZIG_ZAG,
+ SCAN_MODE_COL_DIAG,
+ SCAN_MODE_ROW_DIAG,
+ SCAN_MODE_COL_1D,
+ SCAN_MODE_ROW_1D,
+ SCAN_MODES
+} UENUM1BYTE(SCAN_MODE);
+
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// Looks up the scan-order entry for the given transform size and transform
+// type in av1_scan_orders and returns a pointer into that table. Neither
+// argument is range-checked here.
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+                                                 TX_TYPE tx_type) {
+  const SCAN_ORDER *const orders_for_size = av1_scan_orders[tx_size];
+  return orders_for_size + tx_type;
+}
+
+// Returns the scan order to use for a transform block of the given size and
+// type. This simply forwards to get_default_scan().
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  const SCAN_ORDER *const scan_order = get_default_scan(tx_size, tx_type);
+  return scan_order;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
new file mode 100644
index 0000000000..60b185161c
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+// Per-feature flag, indexed by SEG_LVL_FEATURES: nonzero when the feature's
+// data may be negative. av1_set_segdata() asserts this flag before accepting
+// a negative value, and av1_is_segfeature_signed() exposes it to callers.
+static const int seg_feature_data_signed[SEG_LVL_MAX] = {
+ 1, 1, 1, 1, 1, 0, 0, 0
+};
+
+// Per-feature magnitude limit, indexed by SEG_LVL_FEATURES. av1_set_segdata()
+// asserts that the magnitude of the supplied value does not exceed the entry
+// for the feature, and av1_seg_feature_data_max() exposes it to callers.
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ MAX_LOOP_FILTER,
+ 7,
+ 0,
+ 0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+// Zeroes every segment's feature bitmask and feature data, so that no feature
+// bit remains set. The other fields of *seg are left untouched.
+void av1_clearall_segfeatures(struct segmentation *seg) {
+  av1_zero(seg->feature_mask);
+  av1_zero(seg->feature_data);
+}
+
+// Derives segid_preskip and last_active_segid from the feature masks:
+//  - last_active_segid: the highest segment index that has at least one
+//    feature bit set (0 when no bit is set anywhere).
+//  - segid_preskip: 1 if any enabled feature is SEG_LVL_REF_FRAME or comes
+//    later in SEG_LVL_FEATURES, otherwise 0.
+void av1_calculate_segdata(struct segmentation *seg) {
+  seg->segid_preskip = 0;
+  seg->last_active_segid = 0;
+  for (int segment = 0; segment < MAX_SEGMENTS; ++segment) {
+    for (int feature = 0; feature < SEG_LVL_MAX; ++feature) {
+      const int feature_bit = 1 << feature;
+      // Skip features that are not enabled for this segment.
+      if (!(seg->feature_mask[segment] & feature_bit)) continue;
+      seg->segid_preskip |= (feature >= SEG_LVL_REF_FRAME);
+      seg->last_active_segid = segment;
+    }
+  }
+}
+
+// Marks `feature_id` as enabled for `segment_id` by setting the matching bit
+// in that segment's feature mask. The feature's data value is not modified.
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  unsigned int *const mask = &seg->feature_mask[segment_id];
+  *mask |= 1 << feature_id;
+}
+
+// Returns the largest magnitude allowed for the data of `feature_id`, as
+// listed in seg_feature_data_max[]. `feature_id` is not range-checked.
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  const int max_value = seg_feature_data_max[feature_id];
+  return max_value;
+}
+
+// Returns nonzero when the data of `feature_id` may be negative, as listed in
+// seg_feature_data_signed[]. `feature_id` is not range-checked.
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  const int is_signed = seg_feature_data_signed[feature_id];
+  return is_signed;
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid ranges for abs values are (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid ranges for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+
+// Stores `seg_data` as the data value of `feature_id` for `segment_id`.
+//
+// In builds with assertions enabled, the value is validated first: a
+// non-negative value must not exceed seg_feature_data_max[feature_id]; a
+// negative value requires the feature to be flagged as signed and its
+// magnitude must not exceed the same limit. The feature's enable bit is not
+// touched here.
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  if (seg_data >= 0) {
+    assert(seg_data <= seg_feature_data_max[feature_id]);
+  } else {
+    assert(seg_feature_data_signed[feature_id]);
+    assert(-seg_data <= seg_feature_data_max[feature_id]);
+  }
+
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
new file mode 100644
index 0000000000..44b508b146
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+// Segment-level feature identifiers. Each value is used both as a bit position
+// in segmentation::feature_mask[] (see segfeature_active()) and as the second
+// index into segmentation::feature_data[][] (see get_segdata()).
+// SEG_LVL_MAX is the last enumerator and therefore the number of features.
+enum {
+ SEG_LVL_ALT_Q, // Use alternate Quantizer ....
+ SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical
+ SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal
+ SEG_LVL_ALT_LF_U, // Use alternate loop filter value on u plane
+ SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane
+ SEG_LVL_REF_FRAME, // Optional Segment reference frame
+ SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode
+ SEG_LVL_GLOBALMV,
+ SEG_LVL_MAX
+} UENUM1BYTE(SEG_LVL_FEATURES);
+
+// Segmentation state: control flags plus the per-segment feature settings.
+struct segmentation {
+ // When zero, segfeature_active() reports every feature as inactive.
+ uint8_t enabled;
+ // NOTE(review): the three update flags below are not read or written by the
+ // helpers in this header; their exact semantics should be confirmed against
+ // the code that parses/writes the segmentation parameters.
+ uint8_t update_map;
+ uint8_t update_data;
+ uint8_t temporal_update;
+
+ // feature_data[i][j]: data value of feature j (a SEG_LVL_FEATURES value) for
+ // segment i.
+ int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+ // feature_mask[i]: bit j is set when feature j is enabled for segment i.
+ unsigned int feature_mask[MAX_SEGMENTS];
+ int last_active_segid; // The highest numbered segment id that has some
+ // enabled feature.
+ uint8_t segid_preskip; // Whether the segment id will be read before the
+ // skip syntax element.
+ // 1: the segment id will be read first.
+ // 0: the skip syntax element will be read first.
+};
+
+// CDF tables associated with segmentation: one two-symbol CDF per temporal
+// prediction context, and one MAX_SEGMENTS-symbol CDF per spatial prediction
+// context.
+// NOTE(review): presumably these drive the entropy coding of the segment id
+// -- confirm against the callers, which are not visible in this header.
+struct segmentation_probs {
+ aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
+ aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
+ [CDF_SIZE(MAX_SEGMENTS)];
+};
+
+// Returns 1 when segmentation is enabled and `feature_id` is enabled for
+// `segment_id`, otherwise 0. Neither index is range-checked.
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    uint8_t segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  if (!seg->enabled) return 0;
+  return (seg->feature_mask[segment_id] & (1 << feature_id)) != 0;
+}
+
+// Copies the per-segment feature masks and feature data, together with
+// segid_preskip and last_active_segid, from *src to *dst. The enabled and
+// update_* flags of *dst are left unchanged.
+static INLINE void segfeatures_copy(struct segmentation *dst,
+                                    const struct segmentation *src) {
+  for (int segment = 0; segment < MAX_SEGMENTS; ++segment) {
+    for (int feature = 0; feature < SEG_LVL_MAX; ++feature) {
+      dst->feature_data[segment][feature] = src->feature_data[segment][feature];
+    }
+    dst->feature_mask[segment] = src->feature_mask[segment];
+  }
+  dst->last_active_segid = src->last_active_segid;
+  dst->segid_preskip = src->segid_preskip;
+}
+
+void av1_clearall_segfeatures(struct segmentation *seg);
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void av1_calculate_segdata(struct segmentation *seg);
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data);
+
+// Returns the stored data value of `feature_id` for `segment_id`. It does not
+// check whether the feature is enabled; use segfeature_active() for that.
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  const int value = seg->feature_data[segment_id][feature_id];
+  return value;
+}
+
+// Fills a rectangle of the segment-id map with `segment_id`.
+//
+// The rectangle starts `mi_offset` entries into `segment_ids`, is `x_mis`
+// entries wide and `y_mis` rows tall; consecutive rows are `mi_stride`
+// entries apart. No bounds checking is performed.
+static AOM_INLINE void set_segment_id(uint8_t *segment_ids, int mi_offset,
+                                      int x_mis, int y_mis, int mi_stride,
+                                      uint8_t segment_id) {
+  uint8_t *const region = segment_ids + mi_offset;
+  for (int row = 0; row < y_mis; ++row) {
+    uint8_t *const row_start = &region[row * mi_stride];
+    memset(row_start, segment_id, x_mis * sizeof(region[0]));
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
new file mode 100644
index 0000000000..45695147ff
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.c
@@ -0,0 +1,1250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_image.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// Set up nsync by width.
+//
+// Maps the frame width to a sync interval:
+//   width <  640  -> 1
+//   width <= 1280 -> 2
+//   width <= 4096 -> 4
+//   otherwise     -> 8
+// nsync numbers are picked by testing. For example, for 4k video, using 4
+// gives best performance.
+static INLINE int get_sync_range(int width) {
+  if (width < 640) return 1;
+  if (width <= 1280) return 2;
+  if (width <= 4096) return 4;
+  return 8;
+}
+
+// Returns the sync interval for `width`. The width-based selection is
+// compiled out (#if 0), so this currently always returns 1 and ignores
+// `width`.
+static INLINE int get_lr_sync_range(int width) {
+#if 0
+  // nsync numbers are picked by testing. For example, for 4k
+  // video, using 4 gives best performance.
+  if (width < 640) return 1;
+  if (width <= 1280) return 2;
+  if (width <= 4096) return 4;
+  return 8;
+#else
+  (void)width;
+  return 1;
+#endif
+}
+
+// Allocate memory for lf row synchronization
+//
+// Records `rows` and `num_workers` in *lf_sync and allocates:
+//  - (CONFIG_MULTITHREAD only) per plane, `rows` mutexes and `rows` condition
+//    variables, each initialized right after its array is allocated, plus one
+//    job mutex;
+//  - `num_workers` lfdata entries;
+//  - per plane, `rows` cur_sb_col entries;
+//  - a job queue with rows * MAX_MB_PLANE * 2 entries.
+// Finally picks sync_range from `width` via get_sync_range().
+// NOTE(review): CHECK_MEM_ERROR is defined outside this file; presumably it
+// reports an allocation failure through cm's error handling -- confirm.
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+ int width, int num_workers) {
+ // Stored first; av1_loop_filter_dealloc() uses it to know how many
+ // mutexes/condition variables to destroy per plane.
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ // One mutex per row of this plane.
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
+ aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
+ if (lf_sync->mutex_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ // One condition variable per row of this plane.
+ CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
+ aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
+ if (lf_sync->cond_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ // Single mutex; av1_thread_loop_filter_rows() holds it while reading
+ // lf_mt_exit.
+ CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
+ aom_malloc(sizeof(*(lf_sync->job_mutex))));
+ if (lf_sync->job_mutex) {
+ pthread_mutex_init(lf_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ // One lfdata entry per worker.
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
+ lf_sync->num_workers = num_workers;
+
+ // Per-plane, per-row progress counters read by sync_read() and written by
+ // sync_write().
+ for (int j = 0; j < MAX_MB_PLANE; j++) {
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
+ }
+ CHECK_MEM_ERROR(
+ cm, lf_sync->job_queue,
+ aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data
+//
+// Safe to call with a NULL pointer. Under CONFIG_MULTITHREAD, every per-plane
+// mutex/condition-variable array that was allocated is destroyed element by
+// element (lf_sync->rows entries each) and freed, followed by the job mutex.
+// The lfdata array, the per-plane cur_sb_col arrays and the job queue are then
+// freed and the whole structure is zeroed.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
+  if (lf_sync == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (lf_sync->mutex_[plane] != NULL) {
+      for (int row = 0; row < lf_sync->rows; ++row) {
+        pthread_mutex_destroy(&lf_sync->mutex_[plane][row]);
+      }
+      aom_free(lf_sync->mutex_[plane]);
+    }
+    if (lf_sync->cond_[plane] != NULL) {
+      for (int row = 0; row < lf_sync->rows; ++row) {
+        pthread_cond_destroy(&lf_sync->cond_[plane][row]);
+      }
+      aom_free(lf_sync->cond_[plane]);
+    }
+  }
+  if (lf_sync->job_mutex != NULL) {
+    pthread_mutex_destroy(lf_sync->job_mutex);
+    aom_free(lf_sync->job_mutex);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  aom_free(lf_sync->lfdata);
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    aom_free(lf_sync->cur_sb_col[plane]);
+  }
+  aom_free(lf_sync->job_queue);
+
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*lf_sync);
+}
+
+// Allocates and initializes the CDEF sync mutex if it does not exist yet.
+//
+// Does nothing when `num_workers` is below 1, when the mutex is already
+// allocated, or when built without CONFIG_MULTITHREAD.
+// NOTE(review): CHECK_MEM_ERROR is defined outside this file; presumably it
+// reports an allocation failure through cm's error handling -- confirm.
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers) {
+  if (num_workers < 1) return;
+#if CONFIG_MULTITHREAD
+  // Already set up by an earlier call.
+  if (cdef_sync->mutex_ != NULL) return;
+
+  CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                  aom_malloc(sizeof(*(cdef_sync->mutex_))));
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_init(cdef_sync->mutex_, NULL);
+  }
+#else
+  (void)cm;
+  (void)cdef_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Destroys and frees the CDEF sync mutex, if one was allocated.
+//
+// Safe to call with a NULL pointer, and safe to call more than once on the
+// same structure: the pointer is reset after it is freed. Without
+// CONFIG_MULTITHREAD this only performs the NULL check.
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync) {
+  if (cdef_sync == NULL) return;
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(cdef_sync->mutex_);
+    aom_free(cdef_sync->mutex_);
+    // av1_alloc_cdef_sync() only allocates when mutex_ is NULL, so a stale
+    // pointer left here would make a later alloc skip allocation and reuse
+    // freed memory, and a second free would double-free.
+    cdef_sync->mutex_ = NULL;
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Blocks until the row above `row` has been marked done, then clears that
+// mark. Row 0 has no row above it and returns immediately. Without
+// CONFIG_MULTITHREAD no waiting is performed.
+static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync,
+ int row) {
+ if (!row) return;
+#if CONFIG_MULTITHREAD
+ AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+ pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_);
+ // Loop on the flag rather than waiting once, so a wakeup that arrives
+ // before the flag is set to 1 does not end the wait.
+ while (cdef_row_mt[row - 1].is_row_done != 1)
+ pthread_cond_wait(cdef_row_mt[row - 1].row_cond_,
+ cdef_row_mt[row - 1].row_mutex_);
+ // Consume the notification so the flag is 0 again for the next use.
+ cdef_row_mt[row - 1].is_row_done = 0;
+ pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_);
+#else
+ (void)cdef_sync;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Marks `row` as done and wakes a thread waiting on it in
+// cdef_row_mt_sync_read(). Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync,
+ int row) {
+#if CONFIG_MULTITHREAD
+ AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+ pthread_mutex_lock(cdef_row_mt[row].row_mutex_);
+ // The signal is issued before the flag is written, but both happen while
+ // the row mutex is held; the waiter must re-acquire that mutex to leave
+ // pthread_cond_wait(), so it sees is_row_done == 1 when it rechecks.
+ pthread_cond_signal(cdef_row_mt[row].row_cond_);
+ cdef_row_mt[row].is_row_done = 1;
+ pthread_mutex_unlock(cdef_row_mt[row].row_mutex_);
+#else
+ (void)cdef_sync;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Waits until row `r - 1` of `plane` has published progress far enough ahead
+// of column `c`, i.e. until c <= cur_sb_col[plane][r - 1] - nsync.
+// Nothing is done for r == 0, or for columns that are not a multiple of the
+// sync range. Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
+ int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ // The mask test selects multiples of nsync; get_sync_range() only returns
+ // powers of two (1, 2, 4 or 8).
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ // Re-check after every wakeup; sync_write() broadcasts on each update.
+ while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Publishes loop-filter progress for row `r` of `plane` after column `c` has
+// been processed, and wakes all threads waiting on that row in sync_read().
+// For columns before the last one, only columns that are a multiple of the
+// sync range are published. For the last column (c >= sb_cols - 1) the value
+// sb_cols + nsync is published, which satisfies the sync_read() condition for
+// every column up to sb_cols. Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+ // Only signal when there are enough filtered SB for next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
+
+ // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // cur_sb_col[plane][r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur);
+
+ pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// One job of row loopfiltering.
+//
+// Processes the row of filter units starting at `mi_row` for `plane`, stepping
+// MAX_MIB_SIZE mi columns at a time. When lpf_opt_level == 2 and `plane` is a
+// chroma plane, both chroma planes are set up together and `plane` must be
+// AOM_PLANE_U (asserted).
+//  dir == 0: run the vertical filter routines, then publish progress through
+//            sync_write() when lf_sync is non-NULL.
+//  dir == 1: wait through sync_read() (when lf_sync is non-NULL) for the
+//            vertical filtering this unit depends on, then run the horizontal
+//            filter routines.
+// Any other `dir` value does nothing.
+// A nonzero lpf_opt_level selects the *_opt / *_opt_chroma routines; zero
+// selects the generic per-plane routines. `error_info` is currently unused.
+void av1_thread_loop_filter_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+ int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+ struct aom_internal_error_info *error_info,
+ AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf,
+ int num_mis_in_lpf_unit_height_log2) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in future to handle error propagation.
+ (void)error_info;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2);
+ // Row index used for synchronization, in units of the filter unit height.
+ const int r = mi_row >> num_mis_in_lpf_unit_height_log2;
+ int mi_col, c;
+
+ const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y;
+ const int num_planes = joint_filter_chroma ? 2 : 1;
+ assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U));
+
+ if (dir == 0) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+ // Column index used for synchronization.
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + num_planes);
+ if (lpf_opt_level) {
+ if (plane == AOM_PLANE_Y) {
+ av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row,
+ mi_col, params_buf, tx_buf,
+ num_mis_in_lpf_unit_height_log2);
+ } else {
+ av1_filter_block_plane_vert_opt_chroma(
+ cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
+ joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
+ }
+ } else {
+ av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+ // Let horizontal-filter jobs waiting on this row make progress.
+ if (lf_sync != NULL) {
+ sync_write(lf_sync, r, c, sb_cols, plane);
+ }
+ }
+ } else if (dir == 1) {
+ for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ if (lf_sync != NULL) {
+ // Wait for vertical edge filtering of the top-right block to be
+ // completed
+ sync_read(lf_sync, r, c, plane);
+
+ // Wait for vertical edge filtering of the right block to be completed
+ sync_read(lf_sync, r + 1, c, plane);
+ }
+
+#if CONFIG_MULTITHREAD
+ if (lf_sync && lf_sync->num_workers > 1) {
+ // Read the shared exit flag under the job mutex.
+ pthread_mutex_lock(lf_sync->job_mutex);
+ const bool lf_mt_exit = lf_sync->lf_mt_exit;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+ // Exit in case any worker has encountered an error.
+ if (lf_mt_exit) return;
+ }
+#endif
+
+ av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + num_planes);
+ if (lpf_opt_level) {
+ if (plane == AOM_PLANE_Y) {
+ av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row,
+ mi_col, params_buf, tx_buf,
+ num_mis_in_lpf_unit_height_log2);
+ } else {
+ av1_filter_block_plane_horz_opt_chroma(
+ cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane,
+ joint_filter_chroma, num_mis_in_lpf_unit_height_log2);
+ }
+ } else {
+ av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+ }
+ }
+}
+
+// Marks the vertical loop filtering of every row of every plane as complete
+// by publishing the last column through sync_write().
+//
+// Row and column counts are derived from the frame's mi dimensions, both
+// rounded up in units of 2^num_mis_in_lpf_unit_height_log2.
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+                                   int num_mis_in_lpf_unit_height_log2) {
+  const int unit_log2 = num_mis_in_lpf_unit_height_log2;
+  const int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, unit_log2);
+  const int sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, unit_log2);
+  const int last_col = sb_cols - 1;
+
+  // In case of loopfilter row-multithreading, the worker on an SB row waits for
+  // the vertical edge filtering of the right and top-right SBs. Hence, in case
+  // a thread (main/worker) encounters an error, update that vertical
+  // loopfiltering of every SB row in the frame is complete in order to avoid
+  // dependent workers waiting indefinitely.
+  for (int row = 0; row < sb_rows; ++row) {
+    for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      sync_write(lf_sync, row, last_col, sb_cols, plane);
+    }
+  }
+}
+
+static AOM_INLINE void sync_lf_workers(AVxWorker *const workers,
+ AV1_COMMON *const cm, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int had_error = workers[0].had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ AVxWorker *const worker = &workers[0];
+ error_info = ((LFWorkerData *)worker->data2)->error_info;
+ }
+
+ // Wait till all rows are finished.
+ for (int i = num_workers - 1; i > 0; --i) {
+ AVxWorker *const worker = &workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((LFWorkerData *)worker->data2)->error_info;
+ }
+ }
+ if (had_error) aom_internal_error_copy(cm->error, &error_info);
+}
+
+/* Row-based multi-threaded loopfilter hook.
+ *
+ * arg1 is the shared AV1LfSync and arg2 is this worker's LFWorkerData. The
+ * worker keeps taking jobs from get_lf_job_info() and runs
+ * av1_thread_loop_filter_rows() on each one until no job is returned.
+ *
+ * Returns 1 when the job loop ran to completion and 0 when the setjmp() error
+ * branch was taken.
+ */
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
+ AV1LfMTInfo *cur_job_info;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = lf_sync->job_mutex;
+#endif
+
+ struct aom_internal_error_info *const error_info = &lf_data->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {  // non-zero: re-entered through the jmp_buf (presumably a longjmp from the error handler -- confirm)
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ lf_sync->lf_mt_exit = true;  // written under job_mutex; other workers read it under the same mutex
+ pthread_mutex_unlock(job_mutex_);
+#endif
+ av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2);  // release workers blocked in sync_read()
+ return 0;
+ }
+ error_info->setjmp = 1;  // the jmp_buf is armed from here until the function returns
+
+ while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+ const int lpf_opt_level = cur_job_info->lpf_opt_level;
+ av1_thread_loop_filter_rows(
+ lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+ cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+ lpf_opt_level, lf_sync, error_info, lf_data->params_buf,
+ lf_data->tx_buf, MAX_MIB_SIZE_LOG2);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Loop-filters mi rows [start, stop) with 'num_workers' workers.
+//
+// Sets up the shared job state through loop_filter_frame_mt_init(), binds
+// every worker to loop_filter_row_worker() with its own LFWorkerData, starts
+// them (worker 0 through execute(), all others through launch()) and finally
+// joins them with sync_lf_workers().
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                                MACROBLOCKD *xd, int start, int stop,
+                                const int planes_to_lf[MAX_MB_PLANE],
+                                AVxWorker *workers, int num_workers,
+                                AV1LfSync *lf_sync, int lpf_opt_level) {
+  const AVxWorkerInterface *const iface = aom_get_worker_interface();
+
+  loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync,
+                            lpf_opt_level, MAX_MIB_SIZE_LOG2);
+
+  // Visit the workers from the last one down to worker 0, so that worker 0 is
+  // started only after all the others have been launched.
+  for (int idx = num_workers; idx-- > 0;) {
+    AVxWorker *const w = &workers[idx];
+    LFWorkerData *const data = &lf_sync->lfdata[idx];
+
+    w->hook = loop_filter_row_worker;
+    w->data1 = lf_sync;
+    w->data2 = data;
+
+    // Per-worker loopfilter context.
+    loop_filter_data_reset(data, frame, cm, xd);
+
+    w->had_error = 0;
+    if (idx != 0) {
+      iface->launch(w);
+    } else {
+      iface->execute(w);
+    }
+  }
+
+  sync_lf_workers(workers, cm, num_workers);
+}
+
+// Single-threaded path: loop-filters mi rows [start, stop) on the calling
+// thread.
+//
+// Rows are visited one MAX_MIB_SIZE step at a time and every plane that is not
+// skipped is filtered for that row before moving on, so the top rows of all
+// planes are finished first in case the output can be partially reconstructed
+// row by row. For each plane, direction 0 is run before direction 1.
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                             MACROBLOCKD *xd, int start, int stop,
+                             const int planes_to_lf[MAX_MB_PLANE],
+                             int lpf_opt_level) {
+  AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE];
+  TX_SIZE tx_buf[MAX_MIB_SIZE];
+
+  for (int row = start; row < stop; row += MAX_MIB_SIZE) {
+    for (int p = 0; p < MAX_MB_PLANE; ++p) {
+      if (!skip_loop_filter_plane(planes_to_lf, p, lpf_opt_level)) {
+        for (int edge_dir = 0; edge_dir < 2; ++edge_dir) {
+          // No lf_sync: there is nobody to synchronise with on this path.
+          av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, row, p,
+                                      edge_dir, lpf_opt_level, /*lf_sync=*/NULL,
+                                      xd->error_info, params_buf, tx_buf,
+                                      MAX_MIB_SIZE_LOG2);
+        }
+      }
+    }
+  }
+}
+
+// Loop-filters the planes selected by plane_start/plane_end, using the worker
+// pool when num_workers > 1 and the calling thread otherwise.
+//
+// With 'partial_frame' set and more than 8 mi rows, only a band of
+// max(mi_rows / 8, 8) mi rows is filtered, starting at mi_rows / 2 rounded down
+// to a multiple of 8; otherwise all mi rows are filtered. Returns immediately
+// when check_planes_to_loop_filter() reports that there is nothing to filter.
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                              MACROBLOCKD *xd, int plane_start, int plane_end,
+                              int partial_frame, AVxWorker *workers,
+                              int num_workers, AV1LfSync *lf_sync,
+                              int lpf_opt_level) {
+  int planes_to_lf[MAX_MB_PLANE];
+  if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start,
+                                   plane_end)) {
+    return;
+  }
+
+  const int mi_rows = cm->mi_params.mi_rows;
+  int first_mi_row = 0;
+  int num_mi_rows = mi_rows;
+  if (partial_frame && mi_rows > 8) {
+    // Clearing the low three bits aligns the start row to a multiple of 8.
+    first_mi_row = (mi_rows >> 1) & 0xfffffff8;
+    num_mi_rows = AOMMAX(mi_rows / 8, 8);
+  }
+  const int last_mi_row = first_mi_row + num_mi_rows;
+
+  av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+  if (num_workers <= 1) {
+    // Directly filter in the main thread.
+    loop_filter_rows(frame, cm, xd, first_mi_row, last_mi_row, planes_to_lf,
+                     lpf_opt_level);
+  } else {
+    // Enqueue and execute loopfiltering jobs.
+    loop_filter_rows_mt(frame, cm, xd, first_mi_row, last_mi_row, planes_to_lf,
+                        workers, num_workers, lf_sync, lpf_opt_level);
+  }
+}
+/*
+ * Loop-restoration row synchronisation: reader side.
+ *
+ * lr_sync is an AV1LrSync passed as void *. With CONFIG_MULTITHREAD, when r
+ * is non-zero and (c & (sync_range - 1)) is zero, the caller blocks on the
+ * condition variable of row r - 1 of 'plane' until
+ * cur_sb_col[plane][r - 1] - sync_range is at least c. For r == 0 there is no
+ * row above and the call returns at once.
+ *
+ * Without CONFIG_MULTITHREAD this is a no-op.
+ */
+static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {  // mask test equals "c is a multiple of nsync" only if nsync is a power of two -- TODO confirm for AV1LrSync
+ pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {  // re-checked after every wake-up
+ pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+/*
+ * Loop-restoration row synchronisation: writer side.
+ *
+ * Records progress of row r of 'plane' in cur_sb_col[plane][r] and wakes all
+ * threads waiting on that row's condition variable. For columns before the
+ * last one (c < sb_cols - 1) the value c is published, and only when
+ * c % sync_range is zero. For the last column the value sb_cols + sync_range
+ * is published, which marks the whole row as done.
+ *
+ * Without CONFIG_MULTITHREAD this is a no-op.
+ */
+static INLINE void lr_sync_write(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+ int cur;
+ // Only signal when there are enough filtered SB for next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;  // not on a sync boundary: skip the update
+ } else {
+ cur = sb_cols + nsync;  // last column: value large enough to satisfy any reader of this row
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
+
+ // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // cur_sb_col[plane][r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ loop_res_sync->cur_sb_col[plane][r] =
+ AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur);
+
+ pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+/* Allocate memory for loop restoration row synchronization.
+ *
+ * Fills in 'lr_sync' for 'num_workers' workers, 'num_rows_lr' rows and
+ * 'num_planes' planes:
+ *   - with CONFIG_MULTITHREAD, one mutex and one condition variable per
+ *     (plane, row), plus the job-queue mutex;
+ *   - the per-worker LRWorkerData array, zero-initialised by aom_calloc();
+ *   - a private rst_tmpbuf / rlbs pair for every worker except the last one,
+ *     which reuses cm->rst_tmpbuf and cm->rlbs;
+ *   - cur_sb_col[plane] with num_rows_lr entries, and a job queue with
+ *     num_rows_lr * num_planes entries.
+ * sync_range is obtained from get_lr_sync_range(width).
+ *
+ * Every allocation goes through CHECK_MEM_ERROR(cm, ...).
+ * NOTE(review): that macro presumably reports a failed allocation through
+ * cm's error handler instead of returning -- confirm at its definition.
+ */
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+ int num_workers, int num_rows_lr,
+ int num_planes, int width) {
+ lr_sync->rows = num_rows_lr;
+ lr_sync->num_planes = num_planes;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
+ aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
+ if (lr_sync->mutex_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
+ aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
+ if (lr_sync->cond_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_cond_init(&lr_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
+ aom_malloc(sizeof(*(lr_sync->job_mutex))));
+ if (lr_sync->job_mutex) {
+ pthread_mutex_init(lr_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
+ aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata))));
+ lr_sync->num_workers = num_workers;  // av1_loop_restoration_dealloc() uses this to bound its per-worker free loop
+
+ for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+ if (worker_idx < num_workers - 1) {
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
+ (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
+ aom_malloc(sizeof(RestorationLineBuffers)));
+
+ } else {
+ lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;  // last worker borrows cm's buffers
+ lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
+ }
+ }
+
+ for (int j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(
+ cm, lr_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
+ }
+ CHECK_MEM_ERROR(
+ cm, lr_sync->job_queue,
+ aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
+ // Set up nsync.
+ lr_sync->sync_range = get_lr_sync_range(width);
+}
+
+// Deallocate loop restoration synchronization related mutex and data.
+//
+// Accepts NULL (no-op). Afterwards *lr_sync is zeroed, so the structure can be
+// handed to av1_loop_restoration_alloc() again.
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) {
+  if (lr_sync == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    if (lr_sync->mutex_[p] != NULL) {
+      for (int row = 0; row < lr_sync->rows; ++row) {
+        pthread_mutex_destroy(&lr_sync->mutex_[p][row]);
+      }
+      aom_free(lr_sync->mutex_[p]);
+    }
+    if (lr_sync->cond_[p] != NULL) {
+      for (int row = 0; row < lr_sync->rows; ++row) {
+        pthread_cond_destroy(&lr_sync->cond_[p][row]);
+      }
+      aom_free(lr_sync->cond_[p]);
+    }
+  }
+  if (lr_sync->job_mutex != NULL) {
+    pthread_mutex_destroy(lr_sync->job_mutex);
+    aom_free(lr_sync->job_mutex);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    aom_free(lr_sync->cur_sb_col[p]);
+  }
+
+  aom_free(lr_sync->job_queue);
+
+  LRWorkerData *const all_workers = lr_sync->lrworkerdata;
+  if (all_workers) {
+    // Only the first num_workers - 1 entries have their buffers freed here.
+    const int num_freed = lr_sync->num_workers - 1;
+    for (int w = 0; w < num_freed; ++w) {
+      aom_free(all_workers[w].rst_tmpbuf);
+      aom_free(all_workers[w].rlbs);
+    }
+    aom_free(all_workers);
+  }
+
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*lr_sync);
+}
+
+// Builds the loop-restoration job queue: one job per restoration-unit row of
+// every plane whose frame_restoration_type is not RESTORE_NONE.
+//
+// Jobs for even unit rows (sync_mode 0) fill the front of the queue; jobs for
+// odd unit rows (sync_mode 1) are stored after all the even ones.
+static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
+                            AV1_COMMON *cm) {
+  FilterFrameCtxt *const ctxt = lr_ctxt->ctxt;
+  const int num_planes = av1_num_planes(cm);
+  AV1LrMTInfo *const queue = lr_sync->job_queue;
+
+  lr_sync->jobs_enqueued = 0;
+  lr_sync->jobs_dequeued = 0;
+
+  // Count the even-row jobs: the odd-row jobs start right behind them.
+  int32_t even_job_total = 0;
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
+      even_job_total += (ctxt[plane].rsi->vert_units + 1) >> 1;
+    }
+  }
+
+  // next_slot[parity] is the queue index of the next job with that row parity.
+  int32_t next_slot[2];
+  next_slot[0] = 0;
+  next_slot[1] = even_job_total;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+    const int ss_y = (plane > 0) && cm->seq_params->subsampling_y;
+    const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+    const int plane_h = ctxt[plane].plane_h;
+    const int ext_size = unit_size * 3 / 2;
+    // Offset upwards to align with the restoration processing stripe
+    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+    const int last_unit_row = ctxt[plane].rsi->vert_units - 1;
+
+    int unit_row = 0;
+    for (int y0 = 0; y0 < plane_h; ++unit_row) {
+      const int remaining_h = plane_h - y0;
+      const int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+      const int parity = unit_row & 1;
+
+      RestorationTileLimits limits;
+      limits.v_start = y0;
+      limits.v_end = y0 + h;
+      assert(limits.v_end <= plane_h);
+      limits.v_start = AOMMAX(0, limits.v_start - voffset);
+      if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+      assert(next_slot[0] <= even_job_total);
+
+      AV1LrMTInfo *const job = &queue[next_slot[parity]];
+      job->lr_unit_row = unit_row;
+      job->plane = plane;
+      job->v_start = limits.v_start;
+      job->v_end = limits.v_end;
+      job->sync_mode = parity;
+      if (parity == 0) {
+        // Even rows copy a range shrunk by RESTORATION_BORDER on both sides,
+        // except at the first row's top and the last row's bottom.
+        job->v_copy_start = limits.v_start + RESTORATION_BORDER;
+        job->v_copy_end = limits.v_end - RESTORATION_BORDER;
+        if (unit_row == 0) {
+          assert(limits.v_start == 0);
+          job->v_copy_start = 0;
+        }
+        if (unit_row == last_unit_row) {
+          assert(limits.v_end == plane_h);
+          job->v_copy_end = plane_h;
+        }
+      } else {
+        // Odd rows copy a range grown by RESTORATION_BORDER, clamped to the
+        // plane.
+        job->v_copy_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
+        job->v_copy_end = AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h);
+      }
+      next_slot[parity]++;
+      lr_sync->jobs_enqueued++;
+
+      y0 += h;
+    }
+  }
+}
+
+// Takes the next loop-restoration job off the queue while holding job_mutex.
+//
+// Returns NULL when every enqueued job has been handed out or lr_mt_exit is
+// set. Built without CONFIG_MULTITHREAD it always returns NULL.
+static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
+  AV1LrMTInfo *job = NULL;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(lr_sync->job_mutex);
+
+  const int have_job =
+      !lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued;
+  if (have_job) {
+    job = &lr_sync->job_queue[lr_sync->jobs_dequeued];
+    lr_sync->jobs_dequeued += 1;
+  }
+
+  pthread_mutex_unlock(lr_sync->job_mutex);
+#else
+  (void)lr_sync;
+#endif
+
+  return job;
+}
+
+// Marks every restoration-unit row of every restored plane as complete by
+// calling lr_sync_write() with the last unit column of the row.
+static void set_loop_restoration_done(AV1LrSync *const lr_sync,
+                                      FilterFrameCtxt *const ctxt) {
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    FilterFrameCtxt *const plane_ctxt = &ctxt[plane];
+    if (plane_ctxt->rsi->frame_restoration_type != RESTORE_NONE) {
+      const int unit_size = plane_ctxt->rsi->restoration_unit_size;
+      const int plane_h = plane_ctxt->plane_h;
+      const int ext_size = unit_size * 3 / 2;
+      const int horz_units = plane_ctxt->rsi->horz_units;
+
+      // Walk the rows with the same height rule enqueue_lr_jobs() uses.
+      int row = 0;
+      for (int y0 = 0; y0 < plane_h; ++row) {
+        const int remaining_h = plane_h - y0;
+        lr_sync_write(lr_sync, row, horz_units - 1, horz_units, plane);
+        y0 += (remaining_h < ext_size) ? remaining_h : unit_size;
+      }
+    }
+  }
+}
+
+/* Implement row loop restoration for each thread.
+ *
+ * Worker hook: arg1 is the shared AV1LrSync, arg2 is this worker's
+ * LRWorkerData. The worker keeps taking jobs from get_lr_job_info(); for each
+ * job it runs av1_foreach_rest_unit_in_row() on one restoration-unit row,
+ * copies rows [v_copy_start, v_copy_end) of the plane from lr_ctxt->dst to
+ * lr_ctxt->frame and, if do_extend_border is set, extends the frame borders
+ * for those rows.
+ *
+ * Returns 1 when the job loop ran to completion and 0 when the setjmp() error
+ * branch was taken.
+ */
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+ AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+ LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
+ AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+ int lr_unit_row;
+ int plane;
+ int plane_w;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = lr_sync->job_mutex;
+#endif
+ struct aom_internal_error_info *const error_info = &lrworkerdata->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {  // non-zero: re-entered through the jmp_buf (presumably a longjmp from the error handler -- confirm)
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ lr_sync->lr_mt_exit = true;  // makes get_lr_job_info() return NULL from now on
+ pthread_mutex_unlock(job_mutex_);
+#endif
+ // In case of loop restoration multithreading, the worker on an even lr
+ // block row waits for the completion of the filtering of the top-right and
+ // bottom-right blocks. Hence, in case a thread (main/worker) encounters an
+ // error, update that filtering of every row in the frame is complete in
+ // order to avoid the dependent workers from waiting indefinitely.
+ set_loop_restoration_done(lr_sync, lr_ctxt->ctxt);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ static const copy_fun copy_funs[MAX_MB_PLANE] = {  // indexed by plane
+ aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u,
+ aom_yv12_partial_coloc_copy_v
+ };
+
+ while (1) {
+ AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
+ if (cur_job_info != NULL) {
+ RestorationTileLimits limits;
+ sync_read_fn_t on_sync_read;
+ sync_write_fn_t on_sync_write;
+ limits.v_start = cur_job_info->v_start;
+ limits.v_end = cur_job_info->v_end;
+ lr_unit_row = cur_job_info->lr_unit_row;
+ plane = cur_job_info->plane;
+ plane_w = ctxt[plane].plane_w;
+
+ // sync_mode == 1 implies only sync read is required in LR Multi-threading
+ // sync_mode == 0 implies only sync write is required.
+ on_sync_read =
+ cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+ on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+ : av1_lr_sync_write_dummy;
+
+ av1_foreach_rest_unit_in_row(
+ &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row,
+ ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units,
+ ctxt[plane].rsi->vert_units, plane, &ctxt[plane],
+ lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+ on_sync_write, lr_sync, error_info);
+
+ copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w,
+ cur_job_info->v_copy_start, cur_job_info->v_copy_end);
+
+ if (lrworkerdata->do_extend_border) {
+ aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane,
+ cur_job_info->v_copy_start,
+ cur_job_info->v_copy_end);
+ }
+ } else {
+ break;  // no job returned: queue drained or lr_mt_exit set
+ }
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Joins the loop-restoration workers and forwards a failure, if any, to
+// cm->error.
+//
+// workers[0] is the main-thread worker, so only its had_error flag is read.
+// Workers num_workers - 1 down to 1 are joined through the worker interface's
+// sync(). When several workers failed, the error_info that was copied last is
+// the one handed to aom_internal_error_copy().
+static AOM_INLINE void sync_lr_workers(AVxWorker *const workers,
+                                       AV1_COMMON *const cm, int num_workers) {
+  const AVxWorkerInterface *const iface = aom_get_worker_interface();
+  struct aom_internal_error_info reported_error;
+  int failed = workers[0].had_error;
+
+  // Start from the error recorded by the main thread, if there is one.
+  if (failed) {
+    reported_error = ((LRWorkerData *)workers[0].data2)->error_info;
+  }
+
+  // Wait for all remaining workers, highest index first.
+  int idx = num_workers;
+  while (--idx > 0) {
+    AVxWorker *const w = &workers[idx];
+    if (iface->sync(w)) continue;
+    failed = 1;
+    reported_error = ((LRWorkerData *)w->data2)->error_info;
+  }
+
+  if (failed) aom_internal_error_copy(cm->error, &reported_error);
+}
+
+// Runs loop restoration over all restored planes with 'num_workers' workers.
+//
+// (Re)allocates 'lr_sync' when it is not yet set up or too small for this
+// frame, resets the per-row progress markers, enqueues the jobs, starts the
+// workers (worker 0 through execute(), the rest through launch()) and joins
+// them with sync_lr_workers().
+static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
+                                           AVxWorker *workers, int num_workers,
+                                           AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                           int do_extend_border) {
+  FilterFrameCtxt *const ctxt = lr_ctxt->ctxt;
+  const int num_planes = av1_num_planes(cm);
+  const AVxWorkerInterface *const iface = aom_get_worker_interface();
+
+  // The sync arrays are sized for the plane with the most unit rows.
+  int max_unit_rows = 0;
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    const int unit_rows = av1_lr_count_units(
+        cm->rst_info[plane].restoration_unit_size, ctxt[plane].plane_h);
+    max_unit_rows = AOMMAX(max_unit_rows, unit_rows);
+  }
+
+  assert(MAX_MB_PLANE == 3);
+
+  const int needs_realloc = !lr_sync->sync_range ||
+                            max_unit_rows > lr_sync->rows ||
+                            num_workers > lr_sync->num_workers ||
+                            num_planes > lr_sync->num_planes;
+  if (needs_realloc) {
+    av1_loop_restoration_dealloc(lr_sync);
+    av1_loop_restoration_alloc(lr_sync, cm, num_workers, max_unit_rows,
+                               num_planes, cm->width);
+  }
+  lr_sync->lr_mt_exit = false;
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  for (int plane = 0; plane < num_planes; plane++) {
+    memset(lr_sync->cur_sb_col[plane], -1,
+           sizeof(*(lr_sync->cur_sb_col[plane])) * max_unit_rows);
+  }
+
+  enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
+
+  // Set up looprestoration thread data, last worker first, so that worker 0
+  // is started only after all the others have been launched.
+  for (int idx = num_workers; idx-- > 0;) {
+    AVxWorker *const w = &workers[idx];
+    LRWorkerData *const data = &lr_sync->lrworkerdata[idx];
+
+    data->lr_ctxt = (void *)lr_ctxt;
+    data->do_extend_border = do_extend_border;
+    w->hook = loop_restoration_row_worker;
+    w->data1 = lr_sync;
+    w->data2 = data;
+
+    // Start loop restoration
+    w->had_error = 0;
+    if (idx != 0) {
+      iface->launch(w);
+    } else {
+      iface->execute(w);
+    }
+  }
+
+  sync_lr_workers(workers, cm, num_workers);
+}
+
+// Multi-threaded loop restoration of 'frame'.
+//
+// 'lr_ctxt' is an AV1LrStruct passed as void *; it is initialised for this
+// frame and then processed by foreach_rest_unit_in_planes_mt(). Must not be
+// called when cm->features.all_lossless is set (asserted).
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                                          AV1_COMMON *cm, int optimized_lr,
+                                          AVxWorker *workers, int num_workers,
+                                          AV1LrSync *lr_sync, void *lr_ctxt,
+                                          int do_extend_border) {
+  assert(!cm->features.all_lossless);
+
+  AV1LrStruct *const ctxt = (AV1LrStruct *)lr_ctxt;
+  const int plane_count = av1_num_planes(cm);
+
+  av1_loop_restoration_filter_frame_init(ctxt, frame, cm, optimized_lr,
+                                         plane_count);
+  foreach_rest_unit_in_planes_mt(ctxt, workers, num_workers, lr_sync, cm,
+                                 do_extend_border);
+}
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
+ cdef_sync->end_of_frame = 0;
+ cdef_sync->fbr = 0;
+ cdef_sync->fbc = 0;
+ cdef_sync->cdef_mt_exit = false;
+}
+
+// Starts every CDEF worker after clearing its had_error flag. Workers
+// num_workers - 1 down to 1 are started through launch(); worker 0 comes last
+// and is started through execute().
+static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
+                                           int num_workers) {
+  const AVxWorkerInterface *const iface = aom_get_worker_interface();
+
+  int idx = num_workers;
+  while (idx-- > 0) {
+    AVxWorker *const w = &workers[idx];
+    w->had_error = 0;
+    if (idx != 0) {
+      iface->launch(w);
+    } else {
+      iface->execute(w);
+    }
+  }
+}
+
+static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers,
+ AV1_COMMON *const cm,
+ int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int had_error = workers[0].had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ AVxWorker *const worker = &workers[0];
+ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+ }
+
+ // Wait till all rows are finished.
+ for (int i = num_workers - 1; i > 0; --i) {
+ AVxWorker *const worker = &workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info;
+ }
+ }
+ if (had_error) aom_internal_error_copy(cm->error, &error_info);
+}
+
+// Advances the filter-block-row cursor to the next job.
+// Once the cursor reaches 'nvfb', the end_of_frame flag is raised.
+static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync,
+                                          const int nvfb) {
+  cdef_sync->fbr += 1;
+  if (nvfb == cdef_sync->fbr) cdef_sync->end_of_frame = 1;
+}
+
+// Hands out the next CDEF filter-block row, if there is one.
+//
+// While holding cdef_sync->mutex_ (CONFIG_MULTITHREAD builds), checks that
+// neither cdef_mt_exit nor end_of_frame is set. If so, stores the current row
+// in *cur_fbr, advances the cursor and returns 1; otherwise returns 0 and
+// leaves *cur_fbr untouched.
+static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
+                                            volatile int *cur_fbr,
+                                            const int nvfb) {
+  int got_job = 0;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+
+  const int work_left =
+      !cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0;
+  if (work_left) {
+    *cur_fbr = cdef_sync->fbr;
+    update_cdef_row_next_job_info(cdef_sync, nvfb);
+    got_job = 1;
+  }
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+
+  return got_job;
+}
+
+// Calls cdef_row_mt_sync_write() for each of the 'nvfb' filter-block rows, in
+// ascending order.
+static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) {
+  int row = 0;
+  while (row < nvfb) {
+    cdef_row_mt_sync_write(cdef_sync, row);
+    ++row;
+  }
+}
+
+/* Hook function for each thread in CDEF multi-threading.
+ *
+ * arg1 is the shared AV1CdefSync, arg2 is this worker's AV1CdefWorkerData.
+ * The worker keeps taking filter-block rows from get_cdef_row_next_job() and
+ * runs av1_cdef_fb_row() on each. When do_extend_border is set it also
+ * extends the frame borders of every plane for the rows just filtered.
+ *
+ * Returns 1 when the job loop ran to completion and 0 when the setjmp() error
+ * branch was taken.
+ */
+static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+ AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
+ AV1_COMMON *cm = cdef_worker->cm;
+ const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;  // mi_rows / MI_SIZE_64X64, rounded up
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {  // non-zero: re-entered through the jmp_buf (presumably a longjmp from the error handler -- confirm)
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(job_mutex_);
+ cdef_sync->cdef_mt_exit = true;  // makes get_cdef_row_next_job() return 0 from now on
+ pthread_mutex_unlock(job_mutex_);
+#endif
+ // In case of cdef row-multithreading, the worker on a filter block row
+ // (fbr) waits for the line buffers (top and bottom) copy of the above row.
+ // Hence, in case a thread (main/worker) encounters an error before copying
+ // of the line buffers, update that line buffer copy is complete in order to
+ // avoid dependent workers waiting indefinitely.
+ set_cdef_init_fb_row_done(cdef_sync, nvfb);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ volatile int cur_fbr;  // written through a pointer by get_cdef_row_next_job()
+ const int num_planes = av1_num_planes(cm);
+ while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
+ MACROBLOCKD *xd = cdef_worker->xd;
+ av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
+ cdef_worker->srcbuf, cur_fbr,
+ cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
+ if (cdef_worker->do_extend_border) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+ const int is_uv = plane > 0;
+ const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+ const int unit_height = MI_SIZE_64X64 << mi_high;  // height of one filter-block row in this plane
+ const int v_start = cur_fbr * unit_height;
+ const int v_end =
+ AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]);  // clamp the last row to the cropped plane height
+ aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
+ }
+ }
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns the CDEF hook function and per-thread data to each worker.
+//
+// Worker 0 is given the srcbuf and colbuf pointers stored in cm->cdef_info.
+// All workers are given the linebuf pointers from cm->cdef_info.
+static void prepare_cdef_frame_workers(
+    AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker,
+    AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+    int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+    int do_extend_border) {
+  const int num_planes = av1_num_planes(cm);
+
+  AV1CdefWorkerData *const main_data = &cdef_worker[0];
+  main_data->srcbuf = cm->cdef_info.srcbuf;
+  for (int plane = 0; plane < num_planes; plane++) {
+    main_data->colbuf[plane] = cm->cdef_info.colbuf[plane];
+  }
+
+  for (int idx = num_workers; idx-- > 0;) {
+    AV1CdefWorkerData *const data = &cdef_worker[idx];
+    data->cm = cm;
+    data->xd = xd;
+    data->cdef_init_fb_row_fn = cdef_init_fb_row_fn;
+    data->do_extend_border = do_extend_border;
+    for (int plane = 0; plane < num_planes; plane++) {
+      data->linebuf[plane] = cm->cdef_info.linebuf[plane];
+    }
+
+    AVxWorker *const w = &workers[idx];
+    w->hook = hook;
+    w->data1 = cdef_sync;
+    w->data2 = data;
+  }
+}
+
+// Initializes row-level parameters for CDEF frame (row-multithreaded variant).
+//
+// Fills 'fb_info' for filter-block row 'fbr' and, unless this is the last row,
+// copies the line-buffer data at the boundary with row fbr + 1. It then
+// signals this row through cdef_row_mt_sync_write() and calls
+// cdef_row_mt_sync_read() for it.
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+  const int is_last_row = (fbr == nvfb - 1);
+
+  // The top of the row is a frame boundary only for the first row. The bottom
+  // is one for the last row, or when the next row would start exactly at
+  // mi_rows.
+  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0);
+  fb_info->frame_boundary[BOTTOM] =
+      is_last_row || (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows);
+
+  fb_info->src = src;
+  fb_info->damping = cm->cdef_info.cdef_damping;
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    // linebuf[plane] holds nvfb top slots followed by nvfb bottom slots, each
+    // CDEF_VBORDER * stride samples long.
+    const int slot_size = CDEF_VBORDER * stride;
+    uint16_t *const top_slots = linebuf[plane];
+    uint16_t *const bot_slots = linebuf[plane] + nvfb * slot_size;
+
+    if (!is_last_row) {
+      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+      // Vertical offset at which row fbr + 1 starts in this plane.
+      const int next_row_offset = (MI_SIZE_64X64 * (fbr + 1)) << mi_high_l2;
+
+      // top line buffer copy: fills the top slot of row fbr + 1.
+      av1_cdef_copy_sb8_16(cm, &top_slots[(fbr + 1) * slot_size], stride,
+                           xd->plane[plane].dst.buf,
+                           next_row_offset - CDEF_VBORDER, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+      // bottom line buffer copy: fills the bottom slot of row fbr.
+      av1_cdef_copy_sb8_16(cm, &bot_slots[fbr * slot_size], stride,
+                           xd->plane[plane].dst.buf, next_row_offset, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    }
+
+    fb_info->top_linebuf[plane] = &top_slots[fbr * slot_size];
+    fb_info->bot_linebuf[plane] = &bot_slots[fbr * slot_size];
+  }
+
+  cdef_row_mt_sync_write(cdef_sync, fbr);
+  cdef_row_mt_sync_read(cdef_sync, fbr);
+}
+
+// Implements multi-threading for CDEF.
+// Performs CDEF on the current frame (cm->cur_frame->buf).
+// Inputs:
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+//   cdef_worker: Per-worker CDEF data, one entry per worker.
+//   workers: Worker array; worker 0 is started through execute().
+//   cdef_sync: Shared CDEF job and synchronization state.
+//   num_workers: Number of workers to use.
+//   cdef_init_fb_row_fn: Row initialization callback stored for each worker.
+//   do_extend_border: Stored for each worker; when non-zero the worker hook
+//                     also extends frame borders.
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+                       int do_extend_border) {
+  YV12_BUFFER_CONFIG *const cur_buf = &cm->cur_frame->buf;
+  const int plane_count = av1_num_planes(cm);
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, cur_buf, 0, 0, 0,
+                       plane_count);
+
+  // Reset the job state, wire up the workers, run them, then join.
+  reset_cdef_job_info(cdef_sync);
+  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
+                             workers, cdef_sync, num_workers,
+                             cdef_init_fb_row_fn, do_extend_border);
+  launch_cdef_workers(workers, num_workers);
+  sync_cdef_workers(workers, cm, num_workers);
+}
+
+// Returns the extra top-right superblock delay needed for row multithreading
+// when the intraBC tool is enabled: 2 for 128x128 superblocks, 4 otherwise,
+// and 0 when intraBC is not allowed.
+int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) {
+  if (av1_allow_intrabc(cm)) {
+    // Due to the hardware constraints on processing the intraBC tool with row
+    // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5
+    // superblocks of size 64x64 is mandated. However, a minimum top-right delay
+    // of 1 superblock is assured with 'sync_range'. Hence return only the
+    // additional superblock delay when the intraBC tool is enabled.
+    const int is_sb_128 = (cm->seq_params->sb_size == BLOCK_128X128);
+    return is_sb_128 ? 2 : 4;
+  }
+
+  // No additional top-right delay when intraBC tool is not enabled.
+  return 0;
+}
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
new file mode 100644
index 0000000000..675687dc98
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.h
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/cdef.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+// Description of one loop-filter job, as written by enqueue_lf_jobs().
+typedef struct AV1LfMTInfo {
+ // Starting mi row of the job.
+ int mi_row;
+ // Plane index of the job.
+ int plane;
+ // Filtering pass, 0 or 1; all dir == 0 jobs are queued before dir == 1 jobs.
+ int dir;
+ // Copy of the lpf_opt_level the jobs were queued with.
+ int lpf_opt_level;
+} AV1LfMTInfo;
+
+// Loopfilter row synchronization
+typedef struct AV1LfSyncData {
+#if CONFIG_MULTITHREAD
+ // Per-plane mutex / condition variable arrays.
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];
+ pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+ // Allocate memory to store the loop-filtered superblock index in each row.
+ int *cur_sb_col[MAX_MB_PLANE];
+ // The optimal sync_range for different resolution and platform should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;
+ // Number of rows this data was allocated for; loop_filter_frame_mt_init()
+ // reallocates when it differs from the current superblock row count.
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ // Number of workers this data was allocated for.
+ int num_workers;
+
+#if CONFIG_MULTITHREAD
+ // Held by get_lf_job_info() while a job is taken from the queue.
+ pthread_mutex_t *job_mutex;
+#endif
+ // Job queue filled by enqueue_lf_jobs(), with the number of jobs queued and
+ // the number already handed out.
+ AV1LfMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool lf_mt_exit;
+} AV1LfSync;
+
+// Description of one loop-restoration job.
+// NOTE(review): the code that fills these fields is not in this header; the
+// field notes below are read off the names only -- confirm against it.
+typedef struct AV1LrMTInfo {
+ // Presumably the vertical start / end of the rows handled by the job.
+ int v_start;
+ int v_end;
+ // Presumably the restoration unit row the job belongs to.
+ int lr_unit_row;
+ int plane;
+ int sync_mode;
+ // Presumably the vertical start / end of the rows to copy back.
+ int v_copy_start;
+ int v_copy_end;
+} AV1LrMTInfo;
+
+// Data owned by one loop-restoration worker.
+// NOTE(review): 'rlbs' and 'lr_ctxt' are opaque pointers here; their concrete
+// types are chosen by the code that sets them up -- confirm there.
+typedef struct LoopRestorationWorkerData {
+ int32_t *rst_tmpbuf;
+ void *rlbs;
+ void *lr_ctxt;
+ int do_extend_border;
+ // Error state of this worker.
+ struct aom_internal_error_info error_info;
+} LRWorkerData;
+
+// Looprestoration row synchronization
+typedef struct AV1LrSyncData {
+#if CONFIG_MULTITHREAD
+ // Per-plane mutex / condition variable arrays.
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];
+ pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+ // Allocate memory to store the loop-restoration block index in each row.
+ int *cur_sb_col[MAX_MB_PLANE];
+ // The optimal sync_range for different resolution and platform should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;
+ int rows;
+ int num_planes;
+
+ int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;
+#endif
+ // Row-based parallel loop-restoration data
+ LRWorkerData *lrworkerdata;
+
+ // Job queue with the number of jobs queued and the number already handed
+ // out.
+ AV1LrMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+ // Initialized to false, set to true by the worker thread that encounters
+ // an error in order to abort the processing of other worker threads.
+ bool lr_mt_exit;
+} AV1LrSync;
+
+// Data handed to one CDEF worker.
+// NOTE(review): buffer sizes and ownership are decided by the allocation code,
+// which is not part of this header -- confirm there.
+typedef struct AV1CdefWorker {
+ AV1_COMMON *cm;
+ MACROBLOCKD *xd;
+ // Per-plane column buffers, one source buffer and per-plane line buffers.
+ uint16_t *colbuf[MAX_MB_PLANE];
+ uint16_t *srcbuf;
+ uint16_t *linebuf[MAX_MB_PLANE];
+ // Same callback type and flag as the matching av1_cdef_frame_mt()
+ // parameters.
+ cdef_init_fb_row_t cdef_init_fb_row_fn;
+ int do_extend_border;
+ // Error state of this worker.
+ struct aom_internal_error_info error_info;
+} AV1CdefWorkerData;
+
+// Synchronization state of a single CDEF row.
+typedef struct AV1CdefRowSync {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *row_mutex_;
+ pthread_cond_t *row_cond_;
+#endif // CONFIG_MULTITHREAD
+ // Presumably non-zero once the row has been processed -- TODO confirm
+ // against the row sync read/write helpers.
+ int is_row_done;
+} AV1CdefRowSync;
+
+// Data related to CDEF search multi-thread synchronization.
+// A pointer to this structure is what av1_cdef_frame_mt() and
+// av1_cdef_init_fb_row_mt() take as their 'cdef_sync' argument.
+typedef struct AV1CdefSyncData {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif // CONFIG_MULTITHREAD
+ // Data related to CDEF row mt sync information
+ AV1CdefRowSync *cdef_row_mt;
+ // Flag to indicate all blocks are processed and end of frame is reached
+ int end_of_frame;
+ // Row index in units of 64x64 block
+ int fbr;
+ // Column index in units of 64x64 block
+ int fbc;
+ // Initialized to false, set to true by the worker thread that encounters
+ // an error in order to abort the processing of other worker threads.
+ bool cdef_mt_exit;
+} AV1CdefSync;
+
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ AV1CdefWorkerData *const cdef_worker,
+ AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+ int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+ int do_extend_border);
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ CdefBlockInfo *const fb_info,
+ uint16_t **const linebuf, uint16_t *const src,
+ struct AV1CdefSyncData *const cdef_sync, int fbr);
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+ int dstride, const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize, int hsize);
+void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize);
+void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride,
+ const uint8_t *src, int src_voffset,
+ int src_hoffset, int sstride, int vsize,
+ int hsize);
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+ int num_workers);
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+ int width, int num_workers);
+
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+ int num_mis_in_lpf_unit_height_log2);
+
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *xd, int plane_start,
+ int plane_end, int partial_frame,
+ AVxWorker *workers, int num_workers,
+ AV1LfSync *lf_sync, int lpf_opt_level);
+
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, AVxWorker *workers,
+ int num_workers, AV1LrSync *lr_sync,
+ void *lr_ctxt, int do_extend_border);
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync);
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+ int num_workers, int num_rows_lr,
+ int num_planes, int width);
+int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm);
+
+void av1_thread_loop_filter_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+ int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+ struct aom_internal_error_info *error_info,
+ AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2);
+
+// Returns true when no loop-filter job should be queued for 'plane'.
+//   planes_to_lf:  per-plane "filter this plane" flags.
+//   plane:         plane index being considered.
+//   lpf_opt_level: 2 selects the mode in which U and V are filtered together.
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(
+    const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
+  if (lpf_opt_level != 2) {
+    // Normal operation mode: each plane follows its own flag.
+    return !planes_to_lf[plane];
+  }
+
+  // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
+  // chroma planes together.
+  if (plane == AOM_PLANE_Y) return !planes_to_lf[plane];
+
+  if (plane == AOM_PLANE_U) {
+    // The U job also covers V, so it is needed when either chroma plane is
+    // enabled.
+    const int any_chroma_enabled = planes_to_lf[1] || planes_to_lf[2];
+    return !any_chroma_enabled;
+  }
+
+  assert(plane == AOM_PLANE_V);
+  // V is handled when U is filtered. Any other plane value (possible only
+  // when asserts are compiled out) falls back to its own flag.
+  return (plane == AOM_PLANE_V) ? true : !planes_to_lf[plane];
+}
+
+// Rebuilds the loop-filter job queue of 'lf_sync' for the mi rows in
+// [start, stop), stepping by 'num_mis_in_lpf_unit_height', and resets the
+// enqueued / dequeued counters.
+static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+                                       const int planes_to_lf[MAX_MB_PLANE],
+                                       int lpf_opt_level,
+                                       int num_mis_in_lpf_unit_height) {
+  lf_sync->jobs_enqueued = 0;
+  lf_sync->jobs_dequeued = 0;
+
+  // Launch all vertical jobs first, as they are blocking the horizontal ones.
+  // Launch top row jobs for all planes first, in case the output can be
+  // partially reconstructed row by row.
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mi_row = start; mi_row < stop;
+         mi_row += num_mis_in_lpf_unit_height) {
+      for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        // NOTE(review): the second test also drops the combined U/V job when
+        // only the V flag is set and lpf_opt_level == 2 -- confirm that this
+        // is intended.
+        if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level) ||
+            !planes_to_lf[plane]) {
+          continue;
+        }
+
+        AV1LfMTInfo *const job = &lf_sync->job_queue[lf_sync->jobs_enqueued];
+        job->mi_row = mi_row;
+        job->plane = plane;
+        job->dir = dir;
+        job->lpf_opt_level = lpf_opt_level;
+        lf_sync->jobs_enqueued++;
+      }
+    }
+  }
+}
+
+// Prepares 'lf_sync' for one multi-threaded loop-filter pass: (re)allocates
+// the sync data when it does not fit, clears the exit flag and the per-row
+// progress, and queues the jobs for mi rows [start_mi_row, end_mi_row).
+static AOM_INLINE void loop_filter_frame_mt_init(
+    AV1_COMMON *cm, int start_mi_row, int end_mi_row,
+    const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync,
+    int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
+  // Number of superblock rows
+  const int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
+  const int num_mis_in_lpf_unit_height = 1 << num_mis_in_lpf_unit_height_log2;
+
+  // The existing allocation is reused only if it was set up, covers the same
+  // number of rows and has room for all requested workers.
+  const int needs_realloc = !lf_sync->sync_range ||
+                            sb_rows != lf_sync->rows ||
+                            num_workers > lf_sync->num_workers;
+  if (needs_realloc) {
+    av1_loop_filter_dealloc(lf_sync);
+    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+  lf_sync->lf_mt_exit = false;
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  const size_t row_bytes = sizeof(*lf_sync->cur_sb_col[0]) * sb_rows;
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    memset(lf_sync->cur_sb_col[plane], -1, row_bytes);
+  }
+
+  enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf,
+                  lpf_opt_level, num_mis_in_lpf_unit_height);
+}
+
+// Hands out the next queued loop-filter job, or NULL when the queue is
+// exhausted or an abort was requested. Without CONFIG_MULTITHREAD this always
+// returns NULL.
+static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+  AV1LfMTInfo *job = NULL;
+
+#if CONFIG_MULTITHREAD
+  // The exit flag and the dequeue counter are read and updated under the job
+  // mutex.
+  pthread_mutex_lock(lf_sync->job_mutex);
+
+  const int jobs_left = lf_sync->jobs_dequeued < lf_sync->jobs_enqueued;
+  if (!lf_sync->lf_mt_exit && jobs_left) {
+    job = &lf_sync->job_queue[lf_sync->jobs_dequeued++];
+  }
+
+  pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+  (void)lf_sync;
+#endif
+
+  return job;
+}
+
+// Points 'lf_data' at the given frame buffer, common structure and block
+// structure, and copies the destination buffer description and subsampling of
+// every plane of 'xd' into it.
+static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data,
+                                              YV12_BUFFER_CONFIG *frame_buffer,
+                                              struct AV1Common *cm,
+                                              MACROBLOCKD *xd) {
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->xd = xd;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const src_plane = &xd->plane[plane];
+    memcpy(&lf_data->planes[plane].dst, &src_plane->dst,
+           sizeof(lf_data->planes[plane].dst));
+    lf_data->planes[plane].subsampling_x = src_plane->subsampling_x;
+    lf_data->planes[plane].subsampling_y = src_plane->subsampling_y;
+  }
+}
+
+// Fills planes_to_lf[0..2] with 1 for every plane that has a non-zero filter
+// level and lies in [plane_start, plane_end), and with 0 otherwise.
+static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf,
+                                                 int planes_to_lf[MAX_MB_PLANE],
+                                                 int plane_start,
+                                                 int plane_end) {
+  // Whether each of Y, U and V has a filter level set. Luma counts as set
+  // when either of its two levels is non-zero.
+  const int has_level[3] = { lf->filter_level[0] || lf->filter_level[1],
+                             lf->filter_level_u != 0,
+                             lf->filter_level_v != 0 };
+
+  // For each luma and chroma plane, whether to filter it or not.
+  for (int plane = 0; plane < 3; ++plane) {
+    const int in_range = plane_start <= plane && plane < plane_end;
+    planes_to_lf[plane] = has_level[plane] && in_range;
+  }
+}
+
+// Fills 'planes_to_lf' via set_planes_to_loop_filter() and returns 1 when
+// loop filtering should run at all, 0 otherwise.
+static AOM_INLINE int check_planes_to_loop_filter(
+    const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE],
+    int plane_start, int plane_end) {
+  set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
+
+  // If the luma plane is purposely not filtered, neither are the chroma
+  // planes.
+  const int luma_requested = plane_start <= 0 && 0 < plane_end;
+  const int luma_purposely_off = luma_requested && !planes_to_lf[0];
+
+  // Nothing to do either when no plane at all is selected.
+  const int any_plane_selected =
+      planes_to_lf[0] || planes_to_lf[1] || planes_to_lf[2];
+
+  return !luma_purposely_off && any_plane_selected;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
new file mode 100644
index 0000000000..b964f259b8
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Initializes 'tile' for the tile at (row, col): sets its row extent via
+// av1_tile_set_row() and its column extent via av1_tile_set_col().
+void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
+ av1_tile_set_row(tile, cm, row);
+ av1_tile_set_col(tile, cm, col);
+}
+
+// Returns the smallest k >= 0 for which (blk_size << k) >= target.
+static int tile_log2(int blk_size, int target) {
+  int k = 0;
+  while ((blk_size << k) < target) {
+    ++k;
+  }
+  return k;
+}
+
+// Derives the tile limits for the current frame size and stores them in
+// cm->tiles: max_width_sb, min_log2_cols, max_log2_cols, max_log2_rows and
+// min_log2.
+void av1_get_tile_limits(AV1_COMMON *const cm) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ // Frame size in superblocks, rounded up.
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+
+ // log2 of the superblock size in pixels; converts the pixel limits below
+ // into superblock units.
+ const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
+ tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+
+#if CONFIG_CWG_C013
+ bool use_level_7_above = false;
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 &&
+ seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) {
+ // Currently it is assumed that levels 7.x and 8.x are either used for all
+ // operating points, or none of them.
+ if (i != 0 && !use_level_7_above) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Either all the operating points are levels 7.x or "
+ "8.x, or none of them are.");
+ }
+ use_level_7_above = true;
+ }
+ }
+ // The larger area limit applies when the 7.x / 8.x levels are in use.
+ const int max_tile_area_sb =
+ (use_level_7_above ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >>
+ (2 * sb_size_log2);
+#else
+ const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+#endif
+
+ // Fewest column splits that keep every tile within the maximum width.
+ tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
+ // Most splits allowed: bounded by the superblock count and by
+ // MAX_TILE_COLS / MAX_TILE_ROWS.
+ tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+ tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+ // Fewest total splits that keep every tile within the maximum area, but
+ // never fewer than the column minimum.
+ tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
+ tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
+}
+
+// Completes the tile-column layout in 'tiles'.
+// With uniform spacing, the column starts, the column count, min_log2_rows,
+// max_height_sb and width are derived from tiles->log2_cols. Otherwise the
+// caller-provided col_start_sb[] / cols are used to derive log2_cols and
+// max_height_sb. min_inner_width is set in both cases and stays -1 when there
+// is a single tile column.
+void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
+ int cm_mi_rows, int cm_mi_cols,
+ CommonTileParams *const tiles) {
+ int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
+ int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+ int i;
+
+ // This will be overridden if there are at least two columns of tiles
+ // (otherwise there is no inner tile width)
+ tiles->min_inner_width = -1;
+
+ if (tiles->uniform_spacing) {
+ int start_sb;
+ // Width of every tile except possibly the last, in superblocks.
+ int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols);
+ assert(size_sb > 0);
+ for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
+ tiles->col_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ }
+ tiles->cols = i;
+ // Sentinel entry: end of the last tile column.
+ tiles->col_start_sb[i] = sb_cols;
+ tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
+ tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
+
+ // Tile width in mi units, clamped to the frame width.
+ tiles->width = size_sb << seq_params->mib_size_log2;
+ tiles->width = AOMMIN(tiles->width, cm_mi_cols);
+ if (tiles->cols > 1) {
+ tiles->min_inner_width = tiles->width;
+ }
+ } else {
+ int max_tile_area_sb = (sb_rows * sb_cols);
+ int widest_tile_sb = 1;
+ int narrowest_inner_tile_sb = 65536;
+ tiles->log2_cols = tile_log2(1, tiles->cols);
+ for (i = 0; i < tiles->cols; i++) {
+ int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+ widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+ // ignore the rightmost tile in frame for determining the narrowest
+ if (i < tiles->cols - 1)
+ narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
+ }
+ if (tiles->min_log2) {
+ max_tile_area_sb >>= (tiles->min_log2 + 1);
+ }
+ // Tallest tile allowed given the area budget and the widest column; at
+ // least one superblock.
+ tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+ if (tiles->cols > 1) {
+ tiles->min_inner_width = narrowest_inner_tile_sb
+ << seq_params->mib_size_log2;
+ }
+ }
+}
+
+// Completes the tile-row layout in 'tiles'.
+// With uniform spacing, row_start_sb[], rows and height are derived from
+// tiles->log2_rows. Otherwise only log2_rows is derived from the
+// caller-provided row count.
+void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
+                             int cm_mi_rows, CommonTileParams *const tiles) {
+  if (!tiles->uniform_spacing) {
+    tiles->log2_rows = tile_log2(1, tiles->rows);
+    return;
+  }
+
+  const int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+  // Height of every tile except possibly the last, in superblocks.
+  const int size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows);
+  assert(size_sb > 0);
+
+  int num_rows = 0;
+  for (int start_sb = 0; start_sb < sb_rows; start_sb += size_sb) {
+    tiles->row_start_sb[num_rows++] = start_sb;
+  }
+  tiles->rows = num_rows;
+  // Sentinel entry: end of the last tile row.
+  tiles->row_start_sb[num_rows] = sb_rows;
+
+  // Tile height in mi units, clamped to the frame height.
+  tiles->height = size_sb << seq_params->mib_size_log2;
+  tiles->height = AOMMIN(tiles->height, cm_mi_rows);
+}
+
+// Sets the row index and the mi row extent of 'tile' for tile row 'row'. The
+// end is clamped to the frame height in mi units.
+void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
+  assert(row < cm->tiles.rows);
+
+  // Tile boundaries are stored in superblocks; shift to get mi units.
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  const int first_mi_row = cm->tiles.row_start_sb[row] << mib_size_log2;
+  const int past_last_mi_row = cm->tiles.row_start_sb[row + 1]
+                               << mib_size_log2;
+
+  tile->tile_row = row;
+  tile->mi_row_start = first_mi_row;
+  tile->mi_row_end = AOMMIN(past_last_mi_row, cm->mi_params.mi_rows);
+  assert(tile->mi_row_end > tile->mi_row_start);
+}
+
+// Sets the column index and the mi column extent of 'tile' for tile column
+// 'col'. The end is clamped to the frame width in mi units.
+void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
+  assert(col < cm->tiles.cols);
+
+  // Tile boundaries are stored in superblocks; shift to get mi units.
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  const int first_mi_col = cm->tiles.col_start_sb[col] << mib_size_log2;
+  const int past_last_mi_col = cm->tiles.col_start_sb[col + 1]
+                               << mib_size_log2;
+
+  tile->tile_col = col;
+  tile->mi_col_start = first_mi_col;
+  tile->mi_col_end = AOMMIN(past_last_mi_col, cm->mi_params.mi_cols);
+  assert(tile->mi_col_end > tile->mi_col_start);
+}
+
+// Returns the number of superblock rows covered by 'tile', rounded up.
+int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
+  const int tile_mi_rows = tile->mi_row_end - tile->mi_row_start;
+  return CEIL_POWER_OF_TWO(tile_mi_rows, cm->seq_params->mib_size_log2);
+}
+
+// Returns the number of superblock columns covered by 'tile', rounded up.
+int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
+  const int tile_mi_cols = tile->mi_col_end - tile->mi_col_start;
+  return CEIL_POWER_OF_TWO(tile_mi_cols, cm->seq_params->mib_size_log2);
+}
+
+// Returns the pixel rectangle covered by 'tile_info'.
+// The rectangle is computed in luma pixels, scaled to the superres-upscaled
+// frame when superres is active, clamped to that frame, and finally converted
+// to the chroma plane when 'is_uv' is non-zero and the sequence is
+// subsampled.
+PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+ int is_uv) {
+ PixelRect r;
+
+ // Calculate position in the Y plane
+ r.left = tile_info->mi_col_start * MI_SIZE;
+ r.right = tile_info->mi_col_end * MI_SIZE;
+ r.top = tile_info->mi_row_start * MI_SIZE;
+ r.bottom = tile_info->mi_row_end * MI_SIZE;
+
+ // If upscaling is enabled, the tile limits need scaling to match the
+ // upscaled frame where the restoration units live. To do this, scale up the
+ // top-left and bottom-right of the tile.
+ if (av1_superres_scaled(cm)) {
+ av1_calculate_unscaled_superres_size(&r.left, &r.top,
+ cm->superres_scale_denominator);
+ av1_calculate_unscaled_superres_size(&r.right, &r.bottom,
+ cm->superres_scale_denominator);
+ }
+
+ const int frame_w = cm->superres_upscaled_width;
+ const int frame_h = cm->superres_upscaled_height;
+
+ // Make sure we don't fall off the bottom-right of the frame.
+ r.right = AOMMIN(r.right, frame_w);
+ r.bottom = AOMMIN(r.bottom, frame_h);
+
+ // Convert to coordinates in the appropriate plane
+ // ss_x / ss_y are 1 only for a subsampled chroma plane, 0 otherwise.
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+
+ r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
+ r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
+ r.top = ROUND_POWER_OF_TWO(r.top, ss_y);
+ r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y);
+
+ return r;
+}
+
+// Writes the tile width and height, in mi units, to '*w' and '*h'.
+// With uniform spacing the stored tiles->width / tiles->height are returned.
+// Otherwise the size is taken from the explicit tile boundaries, which are
+// asserted to describe equally sized tiles.
+void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
+  const CommonTileParams *const tiles = &cm->tiles;
+
+  if (tiles->uniform_spacing) {
+    *w = tiles->width;
+    *h = tiles->height;
+    return;
+  }
+
+  for (int col = 0; col < tiles->cols; ++col) {
+    const int width_sb = tiles->col_start_sb[col + 1] - tiles->col_start_sb[col];
+    const int width_mi = width_sb * cm->seq_params->mib_size;
+    // ensure all tiles have same dimension
+    assert(col == 0 || width_mi == *w);
+    *w = width_mi;
+  }
+
+  for (int row = 0; row < tiles->rows; ++row) {
+    const int height_sb =
+        tiles->row_start_sb[row + 1] - tiles->row_start_sb[row];
+    const int height_mi = height_sb * cm->seq_params->mib_size;
+    // ensure all tiles have same dimension
+    assert(row == 0 || height_mi == *h);
+    *h = height_mi;
+  }
+}
+
+// Returns 1 when the minimum tile width requirement holds, 0 otherwise.
+int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+  // Disable check if there is a single tile col in the frame
+  if (cm->tiles.cols == 1) return 1;
+
+  // Narrowest inner tile in pixels against the required minimum, which
+  // doubles from 64 to 128 when superres scaling is active.
+  const int min_inner_width_px = cm->tiles.min_inner_width << MI_SIZE_LOG2;
+  const int required_width_px = 64 << av1_superres_scaled(cm);
+  return min_inner_width_px >= required_width_px;
+}
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
new file mode 100644
index 0000000000..5383ae940b
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+#include "aom_dsp/rect.h"
+
+struct AV1Common;
+struct SequenceHeader;
+struct CommonTileParams;
+
+#define DEFAULT_MAX_NUM_TG 1
+
+// Position of one tile within the frame.
+typedef struct TileInfo {
+ // Tile extent in mi units; the end values are clamped to the frame size by
+ // av1_tile_set_row() / av1_tile_set_col().
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+ // Tile row / column index.
+ int tile_row;
+ int tile_col;
+} TileInfo;
+
+// Initializes 'tile->mi_(row|col)_(start|end)' for tile (row, col) based on
+// 'cm->tiles.(row|col)_start_sb' & 'cm->mi_params.mi_(rows|cols)'.
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+ int col);
+
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+
+int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile);
+int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile);
+
+// Return the pixel extents of the given tile
+PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+ const struct AV1Common *cm, int is_uv);
+
+// Define tile maximum width and area
+// There is no maximum height since height is limited by area and width limits
+// The minimum tile width or height is fixed at one superblock
+#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
+#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+#if CONFIG_CWG_C013
+#define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608)
+#endif
+
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
+void av1_get_tile_limits(struct AV1Common *const cm);
+void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params,
+ int cm_mi_rows, int cm_mi_cols,
+ struct CommonTileParams *const tiles);
+void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params,
+ int cm_mi_rows,
+ struct CommonTileParams *const tiles);
+
+// Checks if the minimum tile_width requirement is satisfied
+int av1_is_min_tile_width_satisfied(const struct AV1Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
new file mode 100644
index 0000000000..a959cdf768
--- /dev/null
+++ b/third_party/aom/av1/common/timing.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables of AV1 max bitrates for the levels of the main and high tiers.
+ * The values are stored in kbps, whereas the specification lists them in
+ * Mbps. Note that depending on the profile, a multiplier is needed.
+ */
+#define UNDEFINED_RATE \
+ (1 << 21) // Placeholder rate for levels with undefined rate
+#define INVALID_RATE \
+ (0) // For invalid profile-level configuration, set rate to 0
+
+/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */
+/* is a dummy value. The decoder model is not applicable for level 31. */
+/* Indexed by seq_level_idx. The table is only ever read, so it is const. */
+static const int32_t main_kbps[1 << LEVEL_BITS] = {
+  1500,           3000,           UNDEFINED_RATE, UNDEFINED_RATE,
+  6000,           10000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  12000,          20000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  30000,          40000,          60000,          60000,
+  60000,          100000,         160000,         160000,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
+};
+
+/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */
+/* is a dummy value. The decoder model is not applicable for level 31. */
+/* Indexed by seq_level_idx. The table is only ever read, so it is const. */
+static const int32_t high_kbps[1 << LEVEL_BITS] = {
+  INVALID_RATE,   INVALID_RATE,   INVALID_RATE,   INVALID_RATE,
+  INVALID_RATE,   INVALID_RATE,   INVALID_RATE,   INVALID_RATE,
+  30000,          50000,          UNDEFINED_RATE, UNDEFINED_RATE,
+  100000,         160000,         240000,         240000,
+  240000,         480000,         800000,         800000,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE,
+  UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE
+};
+
+/* BitrateProfileFactor */
+/* Indexed by seq_profile; a factor of 0 makes the resulting bitrate 0. */
+/* The table is only ever read, so it is const. */
+static const int bitrate_profile_factor[1 << PROFILE_BITS] = {
+  1, 2, 3, 0, 0, 0, 0, 0
+};
+
+// Returns the maximum bitrate in bits per second for the given profile, level
+// index and tier: the per-tier table value in kbps, scaled by the profile
+// factor and converted to bps.
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                              int seq_tier) {
+  // A non-zero tier selects the high tier table.
+  const int32_t *const tier_kbps = seq_tier ? high_kbps : main_kbps;
+  const int64_t kbps =
+      tier_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+  return kbps * 1000;
+}
+
+// Sets the three length fields of 'decoder_model' to fixed default values.
+// num_units_in_decoding_tick is left untouched.
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+ decoder_model->encoder_decoder_buffer_delay_length = 16;
+ decoder_model->buffer_removal_time_length = 10;
+ decoder_model->frame_presentation_time_length = 10;
+}
+
+// Fills 'op_params' with the defaults used when decoder model parameters are
+// present. 'bitrate' and 'buffer_size' are left untouched.
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+  const int half_second_delay = 90000 >> 1;  // 0.5 s
+
+  op_params->decoder_model_param_present_flag = 1;
+  op_params->decoder_buffer_delay = half_second_delay;
+  op_params->encoder_buffer_delay = half_second_delay;
+  op_params->low_delay_mode_flag = 0;
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
+
+// Fills 'op_params' with the resource availability mode defaults; decoder
+// model parameters are marked as not present. 'bitrate' and 'buffer_size' are
+// left untouched.
+void av1_set_resource_availability_parameters(
+    aom_dec_model_op_parameters_t *op_params) {
+  // Resource availability mode defaults.
+  const int default_decoder_buffer_delay = 70000;
+  const int default_encoder_buffer_delay = 20000;
+  const int default_low_delay_mode_flag = 0;
+
+  op_params->decoder_model_param_present_flag = 0;
+  op_params->decoder_buffer_delay = default_decoder_buffer_delay;
+  op_params->encoder_buffer_delay = default_encoder_buffer_delay;
+  op_params->low_delay_mode_flag = default_low_delay_mode_flag;
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
new file mode 100644
index 0000000000..9192124f72
--- /dev/null
+++ b/third_party/aom/av1/common/timing.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+#define MAX_NUM_OP_POINTS 32
+
+// Timing information.
+// NOTE(review): presumably mirrors the timing_info fields of the sequence
+// header -- confirm against the code that reads / writes it.
+typedef struct aom_timing {
+ uint32_t num_units_in_display_tick;
+ uint32_t time_scale;
+ int equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+} aom_timing_info_t;
+
+// Decoder model information.
+typedef struct aom_dec_model_info {
+ uint32_t num_units_in_decoding_tick;
+ // The three length fields below are the ones given default values by
+ // av1_set_aom_dec_model_info().
+ int encoder_decoder_buffer_delay_length;
+ int buffer_removal_time_length;
+ int frame_presentation_time_length;
+} aom_dec_model_info_t;
+
+// Decoder model parameters of one operating point.
+// Every field except 'bitrate' and 'buffer_size' is given a default by
+// av1_set_dec_model_op_parameters() or
+// av1_set_resource_availability_parameters().
+typedef struct aom_dec_model_op_parameters {
+ int decoder_model_param_present_flag;
+ int64_t bitrate;
+ int64_t buffer_size;
+ uint32_t decoder_buffer_delay;
+ uint32_t encoder_buffer_delay;
+ int low_delay_mode_flag;
+ int display_model_param_present_flag;
+ // Expressed in frames.
+ int initial_display_delay;
+} aom_dec_model_op_parameters_t;
+
+void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+
+void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
+
+void av1_set_resource_availability_parameters(
+ aom_dec_model_op_parameters_t *op_params);
+
+int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier);
+
+#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
new file mode 100644
index 0000000000..f1edda58d7
--- /dev/null
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -0,0 +1,3555 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/entropy.h"
+
+static const aom_cdf_prob
+ av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ };
+
+static const aom_cdf_prob
+ av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS]
+ [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) },
+ { AOM_CDF2(5892) },
+ { AOM_CDF2(12112) },
+ { AOM_CDF2(21935) },
+ { AOM_CDF2(20289) },
+ { AOM_CDF2(27473) },
+ { AOM_CDF2(32487) },
+ { AOM_CDF2(7654) },
+ { AOM_CDF2(19473) },
+ { AOM_CDF2(29984) },
+ { AOM_CDF2(9961) },
+ { AOM_CDF2(30242) },
+ { AOM_CDF2(32117) } },
+ { { AOM_CDF2(31548) },
+ { AOM_CDF2(1549) },
+ { AOM_CDF2(10130) },
+ { AOM_CDF2(16656) },
+ { AOM_CDF2(18591) },
+ { AOM_CDF2(26308) },
+ { AOM_CDF2(32537) },
+ { AOM_CDF2(5403) },
+ { AOM_CDF2(18096) },
+ { AOM_CDF2(30003) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(29957) },
+ { AOM_CDF2(5391) },
+ { AOM_CDF2(18039) },
+ { AOM_CDF2(23566) },
+ { AOM_CDF2(22431) },
+ { AOM_CDF2(25822) },
+ { AOM_CDF2(32197) },
+ { AOM_CDF2(3778) },
+ { AOM_CDF2(15336) },
+ { AOM_CDF2(28981) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(17920) },
+ { AOM_CDF2(1818) },
+ { AOM_CDF2(7282) },
+ { AOM_CDF2(25273) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(31554) },
+ { AOM_CDF2(32624) },
+ { AOM_CDF2(1366) },
+ { AOM_CDF2(15628) },
+ { AOM_CDF2(30462) },
+ { AOM_CDF2(146) },
+ { AOM_CDF2(5132) },
+ { AOM_CDF2(31657) } },
+ { { AOM_CDF2(6308) },
+ { AOM_CDF2(117) },
+ { AOM_CDF2(1638) },
+ { AOM_CDF2(2161) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(30247) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(30371) },
+ { AOM_CDF2(7570) },
+ { AOM_CDF2(13155) },
+ { AOM_CDF2(20751) },
+ { AOM_CDF2(20969) },
+ { AOM_CDF2(27067) },
+ { AOM_CDF2(32013) },
+ { AOM_CDF2(5495) },
+ { AOM_CDF2(17942) },
+ { AOM_CDF2(28280) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31782) },
+ { AOM_CDF2(1836) },
+ { AOM_CDF2(10689) },
+ { AOM_CDF2(17604) },
+ { AOM_CDF2(21622) },
+ { AOM_CDF2(27518) },
+ { AOM_CDF2(32399) },
+ { AOM_CDF2(4419) },
+ { AOM_CDF2(16294) },
+ { AOM_CDF2(28345) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31901) },
+ { AOM_CDF2(10311) },
+ { AOM_CDF2(18047) },
+ { AOM_CDF2(24806) },
+ { AOM_CDF2(23288) },
+ { AOM_CDF2(27914) },
+ { AOM_CDF2(32296) },
+ { AOM_CDF2(4215) },
+ { AOM_CDF2(15756) },
+ { AOM_CDF2(28341) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(26726) },
+ { AOM_CDF2(1045) },
+ { AOM_CDF2(11703) },
+ { AOM_CDF2(20590) },
+ { AOM_CDF2(18554) },
+ { AOM_CDF2(25970) },
+ { AOM_CDF2(31938) },
+ { AOM_CDF2(5583) },
+ { AOM_CDF2(21313) },
+ { AOM_CDF2(29390) },
+ { AOM_CDF2(641) },
+ { AOM_CDF2(22265) },
+ { AOM_CDF2(31452) } },
+ { { AOM_CDF2(26584) },
+ { AOM_CDF2(188) },
+ { AOM_CDF2(8847) },
+ { AOM_CDF2(24519) },
+ { AOM_CDF2(22938) },
+ { AOM_CDF2(30583) },
+ { AOM_CDF2(32608) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(29614) },
+ { AOM_CDF2(9068) },
+ { AOM_CDF2(12924) },
+ { AOM_CDF2(19538) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(24619) },
+ { AOM_CDF2(30642) },
+ { AOM_CDF2(4119) },
+ { AOM_CDF2(16026) },
+ { AOM_CDF2(25657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31957) },
+ { AOM_CDF2(3230) },
+ { AOM_CDF2(11153) },
+ { AOM_CDF2(18123) },
+ { AOM_CDF2(20143) },
+ { AOM_CDF2(26536) },
+ { AOM_CDF2(31986) },
+ { AOM_CDF2(3050) },
+ { AOM_CDF2(14603) },
+ { AOM_CDF2(25155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32363) },
+ { AOM_CDF2(10692) },
+ { AOM_CDF2(19090) },
+ { AOM_CDF2(24357) },
+ { AOM_CDF2(24442) },
+ { AOM_CDF2(28312) },
+ { AOM_CDF2(32169) },
+ { AOM_CDF2(3648) },
+ { AOM_CDF2(15690) },
+ { AOM_CDF2(26815) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(30669) },
+ { AOM_CDF2(3832) },
+ { AOM_CDF2(11663) },
+ { AOM_CDF2(18889) },
+ { AOM_CDF2(19782) },
+ { AOM_CDF2(23313) },
+ { AOM_CDF2(31330) },
+ { AOM_CDF2(5124) },
+ { AOM_CDF2(18719) },
+ { AOM_CDF2(28468) },
+ { AOM_CDF2(3082) },
+ { AOM_CDF2(20982) },
+ { AOM_CDF2(29443) } },
+ { { AOM_CDF2(28573) },
+ { AOM_CDF2(3183) },
+ { AOM_CDF2(17802) },
+ { AOM_CDF2(25977) },
+ { AOM_CDF2(26677) },
+ { AOM_CDF2(27832) },
+ { AOM_CDF2(32387) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(26887) },
+ { AOM_CDF2(6729) },
+ { AOM_CDF2(10361) },
+ { AOM_CDF2(17442) },
+ { AOM_CDF2(15045) },
+ { AOM_CDF2(22478) },
+ { AOM_CDF2(29072) },
+ { AOM_CDF2(2713) },
+ { AOM_CDF2(11861) },
+ { AOM_CDF2(20773) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31903) },
+ { AOM_CDF2(2044) },
+ { AOM_CDF2(7528) },
+ { AOM_CDF2(14618) },
+ { AOM_CDF2(16182) },
+ { AOM_CDF2(24168) },
+ { AOM_CDF2(31037) },
+ { AOM_CDF2(2786) },
+ { AOM_CDF2(11194) },
+ { AOM_CDF2(20155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32510) },
+ { AOM_CDF2(8430) },
+ { AOM_CDF2(17318) },
+ { AOM_CDF2(24154) },
+ { AOM_CDF2(23674) },
+ { AOM_CDF2(28789) },
+ { AOM_CDF2(32139) },
+ { AOM_CDF2(3440) },
+ { AOM_CDF2(13117) },
+ { AOM_CDF2(22702) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31671) },
+ { AOM_CDF2(2056) },
+ { AOM_CDF2(11746) },
+ { AOM_CDF2(16852) },
+ { AOM_CDF2(18635) },
+ { AOM_CDF2(24715) },
+ { AOM_CDF2(31484) },
+ { AOM_CDF2(4656) },
+ { AOM_CDF2(16074) },
+ { AOM_CDF2(24704) },
+ { AOM_CDF2(1806) },
+ { AOM_CDF2(14645) },
+ { AOM_CDF2(25336) } },
+ { { AOM_CDF2(31539) },
+ { AOM_CDF2(8433) },
+ { AOM_CDF2(20576) },
+ { AOM_CDF2(27904) },
+ { AOM_CDF2(27852) },
+ { AOM_CDF2(30026) },
+ { AOM_CDF2(32441) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = {
+ { { {
+ { AOM_CDF2(16961) },
+ { AOM_CDF2(17223) },
+ { AOM_CDF2(7621) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(19069) },
+ { AOM_CDF2(22525) },
+ { AOM_CDF2(13377) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20401) },
+ { AOM_CDF2(17025) },
+ { AOM_CDF2(12845) },
+ { AOM_CDF2(12873) },
+ { AOM_CDF2(14094) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20681) },
+ { AOM_CDF2(20701) },
+ { AOM_CDF2(15250) },
+ { AOM_CDF2(15017) },
+ { AOM_CDF2(14928) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(17194) },
+ { AOM_CDF2(16170) },
+ { AOM_CDF2(17695) },
+ { AOM_CDF2(13826) },
+ { AOM_CDF2(15810) },
+ { AOM_CDF2(12036) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(23959) },
+ { AOM_CDF2(20799) },
+ { AOM_CDF2(19021) },
+ { AOM_CDF2(16203) },
+ { AOM_CDF2(17886) },
+ { AOM_CDF2(14144) },
+ { AOM_CDF2(12010) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(27399) },
+ { AOM_CDF2(16327) },
+ { AOM_CDF2(18071) },
+ { AOM_CDF2(19584) },
+ { AOM_CDF2(20721) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(19560) },
+ { AOM_CDF2(10150) },
+ { AOM_CDF2(8805) },
+ },
+ {
+ { AOM_CDF2(24932) },
+ { AOM_CDF2(20833) },
+ { AOM_CDF2(12027) },
+ { AOM_CDF2(16670) },
+ { AOM_CDF2(19914) },
+ { AOM_CDF2(15106) },
+ { AOM_CDF2(17662) },
+ { AOM_CDF2(13783) },
+ { AOM_CDF2(28756) },
+ } },
+ { {
+ { AOM_CDF2(23406) },
+ { AOM_CDF2(21845) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(17096) },
+ { AOM_CDF2(12561) },
+ { AOM_CDF2(17320) },
+ { AOM_CDF2(22395) },
+ { AOM_CDF2(21370) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(17471) },
+ { AOM_CDF2(20223) },
+ { AOM_CDF2(11357) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20335) },
+ { AOM_CDF2(21667) },
+ { AOM_CDF2(14818) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20430) },
+ { AOM_CDF2(20662) },
+ { AOM_CDF2(15367) },
+ { AOM_CDF2(16970) },
+ { AOM_CDF2(14657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22117) },
+ { AOM_CDF2(22028) },
+ { AOM_CDF2(18650) },
+ { AOM_CDF2(16042) },
+ { AOM_CDF2(15885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(22409) },
+ { AOM_CDF2(21012) },
+ { AOM_CDF2(15650) },
+ { AOM_CDF2(17395) },
+ { AOM_CDF2(15469) },
+ { AOM_CDF2(20205) },
+ { AOM_CDF2(19511) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(24220) },
+ { AOM_CDF2(22480) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(18916) },
+ { AOM_CDF2(19268) },
+ { AOM_CDF2(18412) },
+ { AOM_CDF2(18844) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(25991) },
+ { AOM_CDF2(20314) },
+ { AOM_CDF2(17731) },
+ { AOM_CDF2(19678) },
+ { AOM_CDF2(18649) },
+ { AOM_CDF2(17307) },
+ { AOM_CDF2(21798) },
+ { AOM_CDF2(17549) },
+ { AOM_CDF2(15630) },
+ },
+ {
+ { AOM_CDF2(26585) },
+ { AOM_CDF2(21469) },
+ { AOM_CDF2(20432) },
+ { AOM_CDF2(17735) },
+ { AOM_CDF2(19280) },
+ { AOM_CDF2(15235) },
+ { AOM_CDF2(20297) },
+ { AOM_CDF2(22471) },
+ { AOM_CDF2(28997) },
+ } },
+ { {
+ { AOM_CDF2(26605) },
+ { AOM_CDF2(11304) },
+ { AOM_CDF2(16726) },
+ { AOM_CDF2(16560) },
+ { AOM_CDF2(20866) },
+ { AOM_CDF2(23524) },
+ { AOM_CDF2(19878) },
+ { AOM_CDF2(13469) },
+ { AOM_CDF2(23084) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(18983) },
+ { AOM_CDF2(20512) },
+ { AOM_CDF2(14885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20090) },
+ { AOM_CDF2(19444) },
+ { AOM_CDF2(17286) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19139) },
+ { AOM_CDF2(21487) },
+ { AOM_CDF2(18959) },
+ { AOM_CDF2(20910) },
+ { AOM_CDF2(19089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20536) },
+ { AOM_CDF2(20664) },
+ { AOM_CDF2(20625) },
+ { AOM_CDF2(19123) },
+ { AOM_CDF2(14862) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19833) },
+ { AOM_CDF2(21502) },
+ { AOM_CDF2(17485) },
+ { AOM_CDF2(20267) },
+ { AOM_CDF2(18353) },
+ { AOM_CDF2(23329) },
+ { AOM_CDF2(21478) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22041) },
+ { AOM_CDF2(23434) },
+ { AOM_CDF2(20001) },
+ { AOM_CDF2(20554) },
+ { AOM_CDF2(20951) },
+ { AOM_CDF2(20145) },
+ { AOM_CDF2(15562) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23312) },
+ { AOM_CDF2(21607) },
+ { AOM_CDF2(16526) },
+ { AOM_CDF2(18957) },
+ { AOM_CDF2(18034) },
+ { AOM_CDF2(18934) },
+ { AOM_CDF2(24247) },
+ { AOM_CDF2(16921) },
+ { AOM_CDF2(17080) },
+ },
+ {
+ { AOM_CDF2(26579) },
+ { AOM_CDF2(24910) },
+ { AOM_CDF2(18637) },
+ { AOM_CDF2(19800) },
+ { AOM_CDF2(20388) },
+ { AOM_CDF2(9887) },
+ { AOM_CDF2(15642) },
+ { AOM_CDF2(30198) },
+ { AOM_CDF2(24721) },
+ } },
+ { {
+ { AOM_CDF2(26998) },
+ { AOM_CDF2(16737) },
+ { AOM_CDF2(17838) },
+ { AOM_CDF2(18922) },
+ { AOM_CDF2(19515) },
+ { AOM_CDF2(18636) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(15776) },
+ { AOM_CDF2(22658) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(20177) },
+ { AOM_CDF2(20789) },
+ { AOM_CDF2(20262) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(21416) },
+ { AOM_CDF2(20855) },
+ { AOM_CDF2(23410) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20238) },
+ { AOM_CDF2(21057) },
+ { AOM_CDF2(19159) },
+ { AOM_CDF2(22337) },
+ { AOM_CDF2(20159) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20125) },
+ { AOM_CDF2(20559) },
+ { AOM_CDF2(21707) },
+ { AOM_CDF2(22296) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19941) },
+ { AOM_CDF2(20527) },
+ { AOM_CDF2(21470) },
+ { AOM_CDF2(22487) },
+ { AOM_CDF2(19558) },
+ { AOM_CDF2(22354) },
+ { AOM_CDF2(20331) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22752) },
+ { AOM_CDF2(25006) },
+ { AOM_CDF2(22075) },
+ { AOM_CDF2(21576) },
+ { AOM_CDF2(17740) },
+ { AOM_CDF2(21690) },
+ { AOM_CDF2(19211) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(21442) },
+ { AOM_CDF2(22358) },
+ { AOM_CDF2(18503) },
+ { AOM_CDF2(20291) },
+ { AOM_CDF2(19945) },
+ { AOM_CDF2(21294) },
+ { AOM_CDF2(21178) },
+ { AOM_CDF2(19400) },
+ { AOM_CDF2(10556) },
+ },
+ {
+ { AOM_CDF2(24648) },
+ { AOM_CDF2(24949) },
+ { AOM_CDF2(20708) },
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(20501) },
+ { AOM_CDF2(9558) },
+ { AOM_CDF2(9423) },
+ { AOM_CDF2(30365) },
+ { AOM_CDF2(19253) },
+ } },
+ { {
+ { AOM_CDF2(26064) },
+ { AOM_CDF2(22098) },
+ { AOM_CDF2(19613) },
+ { AOM_CDF2(20525) },
+ { AOM_CDF2(17595) },
+ { AOM_CDF2(16618) },
+ { AOM_CDF2(20497) },
+ { AOM_CDF2(18989) },
+ { AOM_CDF2(15513) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) },
+ { AOM_CDF5(370, 671, 1883, 4471) } },
+ { { AOM_CDF5(3247, 4950, 9688, 14563) },
+ { AOM_CDF5(1904, 3354, 7763, 14647) } } },
+ { { { AOM_CDF5(2125, 2551, 5165, 8946) },
+ { AOM_CDF5(513, 765, 1859, 6339) } },
+ { { AOM_CDF5(7637, 9498, 14259, 19108) },
+ { AOM_CDF5(2497, 4096, 8866, 16993) } } },
+ { { { AOM_CDF5(4016, 4897, 8881, 14968) },
+ { AOM_CDF5(716, 1105, 2646, 10056) } },
+ { { AOM_CDF5(11139, 13270, 18241, 23566) },
+ { AOM_CDF5(3192, 5032, 10297, 19755) } } },
+ { { { AOM_CDF5(6708, 8958, 14746, 22133) },
+ { AOM_CDF5(1222, 2074, 4783, 15410) } },
+ { { AOM_CDF5(19575, 21766, 26044, 29709) },
+ { AOM_CDF5(7297, 10767, 19273, 28194) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) },
+ { AOM_CDF6(210, 405, 1315, 3326, 7537) } },
+ { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) },
+ { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } },
+ { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) },
+ { AOM_CDF6(313, 441, 1099, 2917, 8562) } },
+ { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) },
+ { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } },
+ { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) },
+ { AOM_CDF6(574, 821, 1836, 5089, 13128) } },
+ { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) },
+ { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } },
+ { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) },
+ { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } },
+ { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) },
+ { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) },
+ { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } },
+ { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) },
+ { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } },
+ { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) },
+ { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } },
+ { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) },
+ { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } },
+ { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) },
+ { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } },
+ { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) },
+ { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } },
+ { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) },
+ { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } },
+ { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) },
+ { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 8)] = {
+ { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) },
+ { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } },
+ { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+ { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } },
+ { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) },
+ { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } },
+ { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+ { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } },
+ { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+ { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } },
+ { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+ { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } },
+ { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+ { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } },
+ { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+ { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 9)] = {
+ { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) },
+ { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } },
+ { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) },
+ { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } },
+ { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) },
+ { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } },
+ { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) },
+ { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } },
+ { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) },
+ { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } },
+ { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) },
+ { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842,
+ 32708) } } },
+ { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) },
+ { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } },
+ { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) },
+ { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403,
+ 32695) } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788,
+ 23412, 26061) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919,
+ 26129, 29140) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590,
+ 24584, 28749) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478,
+ 28396, 31811) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267,
+ 28410, 31078) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812,
+ 27300, 29219, 32114) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456,
+ 31142, 32060) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716,
+ 30073, 30820, 31956) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047,
+ 22571, 25830) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354,
+ 27255, 28546, 31784) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851,
+ 21856, 25692, 28034) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527,
+ 28027, 28377, 30876) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155,
+ 26682, 29229, 31045) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601,
+ 25483, 25843, 32056) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434,
+ 29326, 31082, 32050) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913,
+ 29486, 29724, 29807, 32570) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } } };
+
+static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)] = {
+ { { { { AOM_CDF4(14298, 20718, 24174) },
+ { AOM_CDF4(12536, 19601, 23789) },
+ { AOM_CDF4(8712, 15051, 19503) },
+ { AOM_CDF4(6170, 11327, 15434) },
+ { AOM_CDF4(4742, 8926, 12538) },
+ { AOM_CDF4(3803, 7317, 10546) },
+ { AOM_CDF4(1696, 3317, 4871) },
+ { AOM_CDF4(14392, 19951, 22756) },
+ { AOM_CDF4(15978, 23218, 26818) },
+ { AOM_CDF4(12187, 19474, 23889) },
+ { AOM_CDF4(9176, 15640, 20259) },
+ { AOM_CDF4(7068, 12655, 17028) },
+ { AOM_CDF4(5656, 10442, 14472) },
+ { AOM_CDF4(2580, 4992, 7244) },
+ { AOM_CDF4(12136, 18049, 21426) },
+ { AOM_CDF4(13784, 20721, 24481) },
+ { AOM_CDF4(10836, 17621, 21900) },
+ { AOM_CDF4(8372, 14444, 18847) },
+ { AOM_CDF4(6523, 11779, 16000) },
+ { AOM_CDF4(5337, 9898, 13760) },
+ { AOM_CDF4(3034, 5860, 8462) } },
+ { { AOM_CDF4(15967, 22905, 26286) },
+ { AOM_CDF4(13534, 20654, 24579) },
+ { AOM_CDF4(9504, 16092, 20535) },
+ { AOM_CDF4(6975, 12568, 16903) },
+ { AOM_CDF4(5364, 10091, 14020) },
+ { AOM_CDF4(4357, 8370, 11857) },
+ { AOM_CDF4(2506, 4934, 7218) },
+ { AOM_CDF4(23032, 28815, 30936) },
+ { AOM_CDF4(19540, 26704, 29719) },
+ { AOM_CDF4(15158, 22969, 27097) },
+ { AOM_CDF4(11408, 18865, 23650) },
+ { AOM_CDF4(8885, 15448, 20250) },
+ { AOM_CDF4(7108, 12853, 17416) },
+ { AOM_CDF4(4231, 8041, 11480) },
+ { AOM_CDF4(19823, 26490, 29156) },
+ { AOM_CDF4(18890, 25929, 28932) },
+ { AOM_CDF4(15660, 23491, 27433) },
+ { AOM_CDF4(12147, 19776, 24488) },
+ { AOM_CDF4(9728, 16774, 21649) },
+ { AOM_CDF4(7919, 14277, 19066) },
+ { AOM_CDF4(5440, 10170, 14185) } } },
+ { { { AOM_CDF4(14406, 20862, 24414) },
+ { AOM_CDF4(11824, 18907, 23109) },
+ { AOM_CDF4(8257, 14393, 18803) },
+ { AOM_CDF4(5860, 10747, 14778) },
+ { AOM_CDF4(4475, 8486, 11984) },
+ { AOM_CDF4(3606, 6954, 10043) },
+ { AOM_CDF4(1736, 3410, 5048) },
+ { AOM_CDF4(14430, 20046, 22882) },
+ { AOM_CDF4(15593, 22899, 26709) },
+ { AOM_CDF4(12102, 19368, 23811) },
+ { AOM_CDF4(9059, 15584, 20262) },
+ { AOM_CDF4(6999, 12603, 17048) },
+ { AOM_CDF4(5684, 10497, 14553) },
+ { AOM_CDF4(2822, 5438, 7862) },
+ { AOM_CDF4(15785, 21585, 24359) },
+ { AOM_CDF4(18347, 25229, 28266) },
+ { AOM_CDF4(14974, 22487, 26389) },
+ { AOM_CDF4(11423, 18681, 23271) },
+ { AOM_CDF4(8863, 15350, 20008) },
+ { AOM_CDF4(7153, 12852, 17278) },
+ { AOM_CDF4(3707, 7036, 9982) } },
+ { { AOM_CDF4(15460, 21696, 25469) },
+ { AOM_CDF4(12170, 19249, 23191) },
+ { AOM_CDF4(8723, 15027, 19332) },
+ { AOM_CDF4(6428, 11704, 15874) },
+ { AOM_CDF4(4922, 9292, 13052) },
+ { AOM_CDF4(4139, 7695, 11010) },
+ { AOM_CDF4(2291, 4508, 6598) },
+ { AOM_CDF4(19856, 26920, 29828) },
+ { AOM_CDF4(17923, 25289, 28792) },
+ { AOM_CDF4(14278, 21968, 26297) },
+ { AOM_CDF4(10910, 18136, 22950) },
+ { AOM_CDF4(8423, 14815, 19627) },
+ { AOM_CDF4(6771, 12283, 16774) },
+ { AOM_CDF4(4074, 7750, 11081) },
+ { AOM_CDF4(19852, 26074, 28672) },
+ { AOM_CDF4(19371, 26110, 28989) },
+ { AOM_CDF4(16265, 23873, 27663) },
+ { AOM_CDF4(12758, 20378, 24952) },
+ { AOM_CDF4(10095, 17098, 21961) },
+ { AOM_CDF4(8250, 14628, 19451) },
+ { AOM_CDF4(5205, 9745, 13622) } } },
+ { { { AOM_CDF4(10563, 16233, 19763) },
+ { AOM_CDF4(9794, 16022, 19804) },
+ { AOM_CDF4(6750, 11945, 15759) },
+ { AOM_CDF4(4963, 9186, 12752) },
+ { AOM_CDF4(3845, 7435, 10627) },
+ { AOM_CDF4(3051, 6085, 8834) },
+ { AOM_CDF4(1311, 2596, 3830) },
+ { AOM_CDF4(11246, 16404, 19689) },
+ { AOM_CDF4(12315, 18911, 22731) },
+ { AOM_CDF4(10557, 17095, 21289) },
+ { AOM_CDF4(8136, 14006, 18249) },
+ { AOM_CDF4(6348, 11474, 15565) },
+ { AOM_CDF4(5196, 9655, 13400) },
+ { AOM_CDF4(2349, 4526, 6587) },
+ { AOM_CDF4(13337, 18730, 21569) },
+ { AOM_CDF4(19306, 26071, 28882) },
+ { AOM_CDF4(15952, 23540, 27254) },
+ { AOM_CDF4(12409, 19934, 24430) },
+ { AOM_CDF4(9760, 16706, 21389) },
+ { AOM_CDF4(8004, 14220, 18818) },
+ { AOM_CDF4(4138, 7794, 10961) } },
+ { { AOM_CDF4(10870, 16684, 20949) },
+ { AOM_CDF4(9664, 15230, 18680) },
+ { AOM_CDF4(6886, 12109, 15408) },
+ { AOM_CDF4(4825, 8900, 12305) },
+ { AOM_CDF4(3630, 7162, 10314) },
+ { AOM_CDF4(3036, 6429, 9387) },
+ { AOM_CDF4(1671, 3296, 4940) },
+ { AOM_CDF4(13819, 19159, 23026) },
+ { AOM_CDF4(11984, 19108, 23120) },
+ { AOM_CDF4(10690, 17210, 21663) },
+ { AOM_CDF4(7984, 14154, 18333) },
+ { AOM_CDF4(6868, 12294, 16124) },
+ { AOM_CDF4(5274, 8994, 12868) },
+ { AOM_CDF4(2988, 5771, 8424) },
+ { AOM_CDF4(19736, 26647, 29141) },
+ { AOM_CDF4(18933, 26070, 28984) },
+ { AOM_CDF4(15779, 23048, 27200) },
+ { AOM_CDF4(12638, 20061, 24532) },
+ { AOM_CDF4(10692, 17545, 22220) },
+ { AOM_CDF4(9217, 15251, 20054) },
+ { AOM_CDF4(5078, 9284, 12594) } } },
+ { { { AOM_CDF4(2331, 3662, 5244) },
+ { AOM_CDF4(2891, 4771, 6145) },
+ { AOM_CDF4(4598, 7623, 9729) },
+ { AOM_CDF4(3520, 6845, 9199) },
+ { AOM_CDF4(3417, 6119, 9324) },
+ { AOM_CDF4(2601, 5412, 7385) },
+ { AOM_CDF4(600, 1173, 1744) },
+ { AOM_CDF4(7672, 13286, 17469) },
+ { AOM_CDF4(4232, 7792, 10793) },
+ { AOM_CDF4(2915, 5317, 7397) },
+ { AOM_CDF4(2318, 4356, 6152) },
+ { AOM_CDF4(2127, 4000, 5554) },
+ { AOM_CDF4(1850, 3478, 5275) },
+ { AOM_CDF4(977, 1933, 2843) },
+ { AOM_CDF4(18280, 24387, 27989) },
+ { AOM_CDF4(15852, 22671, 26185) },
+ { AOM_CDF4(13845, 20951, 24789) },
+ { AOM_CDF4(11055, 17966, 22129) },
+ { AOM_CDF4(9138, 15422, 19801) },
+ { AOM_CDF4(7454, 13145, 17456) },
+ { AOM_CDF4(3370, 6393, 9013) } },
+ { { AOM_CDF4(5842, 9229, 10838) },
+ { AOM_CDF4(2313, 3491, 4276) },
+ { AOM_CDF4(2998, 6104, 7496) },
+ { AOM_CDF4(2420, 7447, 9868) },
+ { AOM_CDF4(3034, 8495, 10923) },
+ { AOM_CDF4(4076, 8937, 10975) },
+ { AOM_CDF4(1086, 2370, 3299) },
+ { AOM_CDF4(9714, 17254, 20444) },
+ { AOM_CDF4(8543, 13698, 17123) },
+ { AOM_CDF4(4918, 9007, 11910) },
+ { AOM_CDF4(4129, 7532, 10553) },
+ { AOM_CDF4(2364, 5533, 8058) },
+ { AOM_CDF4(1834, 3546, 5563) },
+ { AOM_CDF4(1473, 2908, 4133) },
+ { AOM_CDF4(15405, 21193, 25619) },
+ { AOM_CDF4(15691, 21952, 26561) },
+ { AOM_CDF4(12962, 19194, 24165) },
+ { AOM_CDF4(10272, 17855, 22129) },
+ { AOM_CDF4(8588, 15270, 20718) },
+ { AOM_CDF4(8682, 14669, 19500) },
+ { AOM_CDF4(4870, 9636, 13205) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(14995, 21341, 24749) },
+ { AOM_CDF4(13158, 20289, 24601) },
+ { AOM_CDF4(8941, 15326, 19876) },
+ { AOM_CDF4(6297, 11541, 15807) },
+ { AOM_CDF4(4817, 9029, 12776) },
+ { AOM_CDF4(3731, 7273, 10627) },
+ { AOM_CDF4(1847, 3617, 5354) },
+ { AOM_CDF4(14472, 19659, 22343) },
+ { AOM_CDF4(16806, 24162, 27533) },
+ { AOM_CDF4(12900, 20404, 24713) },
+ { AOM_CDF4(9411, 16112, 20797) },
+ { AOM_CDF4(7056, 12697, 17148) },
+ { AOM_CDF4(5544, 10339, 14460) },
+ { AOM_CDF4(2954, 5704, 8319) },
+ { AOM_CDF4(12464, 18071, 21354) },
+ { AOM_CDF4(15482, 22528, 26034) },
+ { AOM_CDF4(12070, 19269, 23624) },
+ { AOM_CDF4(8953, 15406, 20106) },
+ { AOM_CDF4(7027, 12730, 17220) },
+ { AOM_CDF4(5887, 10913, 15140) },
+ { AOM_CDF4(3793, 7278, 10447) } },
+ { { AOM_CDF4(15571, 22232, 25749) },
+ { AOM_CDF4(14506, 21575, 25374) },
+ { AOM_CDF4(10189, 17089, 21569) },
+ { AOM_CDF4(7316, 13301, 17915) },
+ { AOM_CDF4(5783, 10912, 15190) },
+ { AOM_CDF4(4760, 9155, 13088) },
+ { AOM_CDF4(2993, 5966, 8774) },
+ { AOM_CDF4(23424, 28903, 30778) },
+ { AOM_CDF4(20775, 27666, 30290) },
+ { AOM_CDF4(16474, 24410, 28299) },
+ { AOM_CDF4(12471, 20180, 24987) },
+ { AOM_CDF4(9410, 16487, 21439) },
+ { AOM_CDF4(7536, 13614, 18529) },
+ { AOM_CDF4(5048, 9586, 13549) },
+ { AOM_CDF4(21090, 27290, 29756) },
+ { AOM_CDF4(20796, 27402, 30026) },
+ { AOM_CDF4(17819, 25485, 28969) },
+ { AOM_CDF4(13860, 21909, 26462) },
+ { AOM_CDF4(11002, 18494, 23529) },
+ { AOM_CDF4(8953, 15929, 20897) },
+ { AOM_CDF4(6448, 11918, 16454) } } },
+ { { { AOM_CDF4(15999, 22208, 25449) },
+ { AOM_CDF4(13050, 19988, 24122) },
+ { AOM_CDF4(8594, 14864, 19378) },
+ { AOM_CDF4(6033, 11079, 15238) },
+ { AOM_CDF4(4554, 8683, 12347) },
+ { AOM_CDF4(3672, 7139, 10337) },
+ { AOM_CDF4(1900, 3771, 5576) },
+ { AOM_CDF4(15788, 21340, 23949) },
+ { AOM_CDF4(16825, 24235, 27758) },
+ { AOM_CDF4(12873, 20402, 24810) },
+ { AOM_CDF4(9590, 16363, 21094) },
+ { AOM_CDF4(7352, 13209, 17733) },
+ { AOM_CDF4(5960, 10989, 15184) },
+ { AOM_CDF4(3232, 6234, 9007) },
+ { AOM_CDF4(15761, 20716, 23224) },
+ { AOM_CDF4(19318, 25989, 28759) },
+ { AOM_CDF4(15529, 23094, 26929) },
+ { AOM_CDF4(11662, 18989, 23641) },
+ { AOM_CDF4(8955, 15568, 20366) },
+ { AOM_CDF4(7281, 13106, 17708) },
+ { AOM_CDF4(4248, 8059, 11440) } },
+ { { AOM_CDF4(14899, 21217, 24503) },
+ { AOM_CDF4(13519, 20283, 24047) },
+ { AOM_CDF4(9429, 15966, 20365) },
+ { AOM_CDF4(6700, 12355, 16652) },
+ { AOM_CDF4(5088, 9704, 13716) },
+ { AOM_CDF4(4243, 8154, 11731) },
+ { AOM_CDF4(2702, 5364, 7861) },
+ { AOM_CDF4(22745, 28388, 30454) },
+ { AOM_CDF4(20235, 27146, 29922) },
+ { AOM_CDF4(15896, 23715, 27637) },
+ { AOM_CDF4(11840, 19350, 24131) },
+ { AOM_CDF4(9122, 15932, 20880) },
+ { AOM_CDF4(7488, 13581, 18362) },
+ { AOM_CDF4(5114, 9568, 13370) },
+ { AOM_CDF4(20845, 26553, 28932) },
+ { AOM_CDF4(20981, 27372, 29884) },
+ { AOM_CDF4(17781, 25335, 28785) },
+ { AOM_CDF4(13760, 21708, 26297) },
+ { AOM_CDF4(10975, 18415, 23365) },
+ { AOM_CDF4(9045, 15789, 20686) },
+ { AOM_CDF4(6130, 11199, 15423) } } },
+ { { { AOM_CDF4(13549, 19724, 23158) },
+ { AOM_CDF4(11844, 18382, 22246) },
+ { AOM_CDF4(7919, 13619, 17773) },
+ { AOM_CDF4(5486, 10143, 13946) },
+ { AOM_CDF4(4166, 7983, 11324) },
+ { AOM_CDF4(3364, 6506, 9427) },
+ { AOM_CDF4(1598, 3160, 4674) },
+ { AOM_CDF4(15281, 20979, 23781) },
+ { AOM_CDF4(14939, 22119, 25952) },
+ { AOM_CDF4(11363, 18407, 22812) },
+ { AOM_CDF4(8609, 14857, 19370) },
+ { AOM_CDF4(6737, 12184, 16480) },
+ { AOM_CDF4(5506, 10263, 14262) },
+ { AOM_CDF4(2990, 5786, 8380) },
+ { AOM_CDF4(20249, 25253, 27417) },
+ { AOM_CDF4(21070, 27518, 30001) },
+ { AOM_CDF4(16854, 24469, 28074) },
+ { AOM_CDF4(12864, 20486, 25000) },
+ { AOM_CDF4(9962, 16978, 21778) },
+ { AOM_CDF4(8074, 14338, 19048) },
+ { AOM_CDF4(4494, 8479, 11906) } },
+ { { AOM_CDF4(13960, 19617, 22829) },
+ { AOM_CDF4(11150, 17341, 21228) },
+ { AOM_CDF4(7150, 12964, 17190) },
+ { AOM_CDF4(5331, 10002, 13867) },
+ { AOM_CDF4(4167, 7744, 11057) },
+ { AOM_CDF4(3480, 6629, 9646) },
+ { AOM_CDF4(1883, 3784, 5686) },
+ { AOM_CDF4(18752, 25660, 28912) },
+ { AOM_CDF4(16968, 24586, 28030) },
+ { AOM_CDF4(13520, 21055, 25313) },
+ { AOM_CDF4(10453, 17626, 22280) },
+ { AOM_CDF4(8386, 14505, 19116) },
+ { AOM_CDF4(6742, 12595, 17008) },
+ { AOM_CDF4(4273, 8140, 11499) },
+ { AOM_CDF4(22120, 27827, 30233) },
+ { AOM_CDF4(20563, 27358, 29895) },
+ { AOM_CDF4(17076, 24644, 28153) },
+ { AOM_CDF4(13362, 20942, 25309) },
+ { AOM_CDF4(10794, 17965, 22695) },
+ { AOM_CDF4(9014, 15652, 20319) },
+ { AOM_CDF4(5708, 10512, 14497) } } },
+ { { { AOM_CDF4(5705, 10930, 15725) },
+ { AOM_CDF4(7946, 12765, 16115) },
+ { AOM_CDF4(6801, 12123, 16226) },
+ { AOM_CDF4(5462, 10135, 14200) },
+ { AOM_CDF4(4189, 8011, 11507) },
+ { AOM_CDF4(3191, 6229, 9408) },
+ { AOM_CDF4(1057, 2137, 3212) },
+ { AOM_CDF4(10018, 17067, 21491) },
+ { AOM_CDF4(7380, 12582, 16453) },
+ { AOM_CDF4(6068, 10845, 14339) },
+ { AOM_CDF4(5098, 9198, 12555) },
+ { AOM_CDF4(4312, 8010, 11119) },
+ { AOM_CDF4(3700, 6966, 9781) },
+ { AOM_CDF4(1693, 3326, 4887) },
+ { AOM_CDF4(18757, 24930, 27774) },
+ { AOM_CDF4(17648, 24596, 27817) },
+ { AOM_CDF4(14707, 22052, 26026) },
+ { AOM_CDF4(11720, 18852, 23292) },
+ { AOM_CDF4(9357, 15952, 20525) },
+ { AOM_CDF4(7810, 13753, 18210) },
+ { AOM_CDF4(3879, 7333, 10328) } },
+ { { AOM_CDF4(8278, 13242, 15922) },
+ { AOM_CDF4(10547, 15867, 18919) },
+ { AOM_CDF4(9106, 15842, 20609) },
+ { AOM_CDF4(6833, 13007, 17218) },
+ { AOM_CDF4(4811, 9712, 13923) },
+ { AOM_CDF4(3985, 7352, 11128) },
+ { AOM_CDF4(1688, 3458, 5262) },
+ { AOM_CDF4(12951, 21861, 26510) },
+ { AOM_CDF4(9788, 16044, 20276) },
+ { AOM_CDF4(6309, 11244, 14870) },
+ { AOM_CDF4(5183, 9349, 12566) },
+ { AOM_CDF4(4389, 8229, 11492) },
+ { AOM_CDF4(3633, 6945, 10620) },
+ { AOM_CDF4(3600, 6847, 9907) },
+ { AOM_CDF4(21748, 28137, 30255) },
+ { AOM_CDF4(19436, 26581, 29560) },
+ { AOM_CDF4(16359, 24201, 27953) },
+ { AOM_CDF4(13961, 21693, 25871) },
+ { AOM_CDF4(11544, 18686, 23322) },
+ { AOM_CDF4(9372, 16462, 20952) },
+ { AOM_CDF4(6138, 11210, 15390) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(16138, 22223, 25509) },
+ { AOM_CDF4(15347, 22430, 26332) },
+ { AOM_CDF4(9614, 16736, 21332) },
+ { AOM_CDF4(6600, 12275, 16907) },
+ { AOM_CDF4(4811, 9424, 13547) },
+ { AOM_CDF4(3748, 7809, 11420) },
+ { AOM_CDF4(2254, 4587, 6890) },
+ { AOM_CDF4(15196, 20284, 23177) },
+ { AOM_CDF4(18317, 25469, 28451) },
+ { AOM_CDF4(13918, 21651, 25842) },
+ { AOM_CDF4(10052, 17150, 21995) },
+ { AOM_CDF4(7499, 13630, 18587) },
+ { AOM_CDF4(6158, 11417, 16003) },
+ { AOM_CDF4(4014, 7785, 11252) },
+ { AOM_CDF4(15048, 21067, 24384) },
+ { AOM_CDF4(18202, 25346, 28553) },
+ { AOM_CDF4(14302, 22019, 26356) },
+ { AOM_CDF4(10839, 18139, 23166) },
+ { AOM_CDF4(8715, 15744, 20806) },
+ { AOM_CDF4(7536, 13576, 18544) },
+ { AOM_CDF4(5413, 10335, 14498) } },
+ { { AOM_CDF4(17394, 24501, 27895) },
+ { AOM_CDF4(15889, 23420, 27185) },
+ { AOM_CDF4(11561, 19133, 23870) },
+ { AOM_CDF4(8285, 14812, 19844) },
+ { AOM_CDF4(6496, 12043, 16550) },
+ { AOM_CDF4(4771, 9574, 13677) },
+ { AOM_CDF4(3603, 6830, 10144) },
+ { AOM_CDF4(21656, 27704, 30200) },
+ { AOM_CDF4(21324, 27915, 30511) },
+ { AOM_CDF4(17327, 25336, 28997) },
+ { AOM_CDF4(13417, 21381, 26033) },
+ { AOM_CDF4(10132, 17425, 22338) },
+ { AOM_CDF4(8580, 15016, 19633) },
+ { AOM_CDF4(5694, 11477, 16411) },
+ { AOM_CDF4(24116, 29780, 31450) },
+ { AOM_CDF4(23853, 29695, 31591) },
+ { AOM_CDF4(20085, 27614, 30428) },
+ { AOM_CDF4(15326, 24335, 28575) },
+ { AOM_CDF4(11814, 19472, 24810) },
+ { AOM_CDF4(10221, 18611, 24767) },
+ { AOM_CDF4(7689, 14558, 20321) } } },
+ { { { AOM_CDF4(16214, 22380, 25770) },
+ { AOM_CDF4(14213, 21304, 25295) },
+ { AOM_CDF4(9213, 15823, 20455) },
+ { AOM_CDF4(6395, 11758, 16139) },
+ { AOM_CDF4(4779, 9187, 13066) },
+ { AOM_CDF4(3821, 7501, 10953) },
+ { AOM_CDF4(2293, 4567, 6795) },
+ { AOM_CDF4(15859, 21283, 23820) },
+ { AOM_CDF4(18404, 25602, 28726) },
+ { AOM_CDF4(14325, 21980, 26206) },
+ { AOM_CDF4(10669, 17937, 22720) },
+ { AOM_CDF4(8297, 14642, 19447) },
+ { AOM_CDF4(6746, 12389, 16893) },
+ { AOM_CDF4(4324, 8251, 11770) },
+ { AOM_CDF4(16532, 21631, 24475) },
+ { AOM_CDF4(20667, 27150, 29668) },
+ { AOM_CDF4(16728, 24510, 28175) },
+ { AOM_CDF4(12861, 20645, 25332) },
+ { AOM_CDF4(10076, 17361, 22417) },
+ { AOM_CDF4(8395, 14940, 19963) },
+ { AOM_CDF4(5731, 10683, 14912) } },
+ { { AOM_CDF4(14433, 21155, 24938) },
+ { AOM_CDF4(14658, 21716, 25545) },
+ { AOM_CDF4(9923, 16824, 21557) },
+ { AOM_CDF4(6982, 13052, 17721) },
+ { AOM_CDF4(5419, 10503, 15050) },
+ { AOM_CDF4(4852, 9162, 13014) },
+ { AOM_CDF4(3271, 6395, 9630) },
+ { AOM_CDF4(22210, 27833, 30109) },
+ { AOM_CDF4(20750, 27368, 29821) },
+ { AOM_CDF4(16894, 24828, 28573) },
+ { AOM_CDF4(13247, 21276, 25757) },
+ { AOM_CDF4(10038, 17265, 22563) },
+ { AOM_CDF4(8587, 14947, 20327) },
+ { AOM_CDF4(5645, 11371, 15252) },
+ { AOM_CDF4(22027, 27526, 29714) },
+ { AOM_CDF4(23098, 29146, 31221) },
+ { AOM_CDF4(19886, 27341, 30272) },
+ { AOM_CDF4(15609, 23747, 28046) },
+ { AOM_CDF4(11993, 20065, 24939) },
+ { AOM_CDF4(9637, 18267, 23671) },
+ { AOM_CDF4(7625, 13801, 19144) } } },
+ { { { AOM_CDF4(14438, 20798, 24089) },
+ { AOM_CDF4(12621, 19203, 23097) },
+ { AOM_CDF4(8177, 14125, 18402) },
+ { AOM_CDF4(5674, 10501, 14456) },
+ { AOM_CDF4(4236, 8239, 11733) },
+ { AOM_CDF4(3447, 6750, 9806) },
+ { AOM_CDF4(1986, 3950, 5864) },
+ { AOM_CDF4(16208, 22099, 24930) },
+ { AOM_CDF4(16537, 24025, 27585) },
+ { AOM_CDF4(12780, 20381, 24867) },
+ { AOM_CDF4(9767, 16612, 21416) },
+ { AOM_CDF4(7686, 13738, 18398) },
+ { AOM_CDF4(6333, 11614, 15964) },
+ { AOM_CDF4(3941, 7571, 10836) },
+ { AOM_CDF4(22819, 27422, 29202) },
+ { AOM_CDF4(22224, 28514, 30721) },
+ { AOM_CDF4(17660, 25433, 28913) },
+ { AOM_CDF4(13574, 21482, 26002) },
+ { AOM_CDF4(10629, 17977, 22938) },
+ { AOM_CDF4(8612, 15298, 20265) },
+ { AOM_CDF4(5607, 10491, 14596) } },
+ { { AOM_CDF4(13569, 19800, 23206) },
+ { AOM_CDF4(13128, 19924, 23869) },
+ { AOM_CDF4(8329, 14841, 19403) },
+ { AOM_CDF4(6130, 10976, 15057) },
+ { AOM_CDF4(4682, 8839, 12518) },
+ { AOM_CDF4(3656, 7409, 10588) },
+ { AOM_CDF4(2577, 5099, 7412) },
+ { AOM_CDF4(22427, 28684, 30585) },
+ { AOM_CDF4(20913, 27750, 30139) },
+ { AOM_CDF4(15840, 24109, 27834) },
+ { AOM_CDF4(12308, 20029, 24569) },
+ { AOM_CDF4(10216, 16785, 21458) },
+ { AOM_CDF4(8309, 14203, 19113) },
+ { AOM_CDF4(6043, 11168, 15307) },
+ { AOM_CDF4(23166, 28901, 30998) },
+ { AOM_CDF4(21899, 28405, 30751) },
+ { AOM_CDF4(18413, 26091, 29443) },
+ { AOM_CDF4(15233, 23114, 27352) },
+ { AOM_CDF4(12683, 20472, 25288) },
+ { AOM_CDF4(10702, 18259, 23409) },
+ { AOM_CDF4(8125, 14464, 19226) } } },
+ { { { AOM_CDF4(9040, 14786, 18360) },
+ { AOM_CDF4(9979, 15718, 19415) },
+ { AOM_CDF4(7913, 13918, 18311) },
+ { AOM_CDF4(5859, 10889, 15184) },
+ { AOM_CDF4(4593, 8677, 12510) },
+ { AOM_CDF4(3820, 7396, 10791) },
+ { AOM_CDF4(1730, 3471, 5192) },
+ { AOM_CDF4(11803, 18365, 22709) },
+ { AOM_CDF4(11419, 18058, 22225) },
+ { AOM_CDF4(9418, 15774, 20243) },
+ { AOM_CDF4(7539, 13325, 17657) },
+ { AOM_CDF4(6233, 11317, 15384) },
+ { AOM_CDF4(5137, 9656, 13545) },
+ { AOM_CDF4(2977, 5774, 8349) },
+ { AOM_CDF4(21207, 27246, 29640) },
+ { AOM_CDF4(19547, 26578, 29497) },
+ { AOM_CDF4(16169, 23871, 27690) },
+ { AOM_CDF4(12820, 20458, 25018) },
+ { AOM_CDF4(10224, 17332, 22214) },
+ { AOM_CDF4(8526, 15048, 19884) },
+ { AOM_CDF4(5037, 9410, 13118) } },
+ { { AOM_CDF4(12339, 17329, 20140) },
+ { AOM_CDF4(13505, 19895, 23225) },
+ { AOM_CDF4(9847, 16944, 21564) },
+ { AOM_CDF4(7280, 13256, 18348) },
+ { AOM_CDF4(4712, 10009, 14454) },
+ { AOM_CDF4(4361, 7914, 12477) },
+ { AOM_CDF4(2870, 5628, 7995) },
+ { AOM_CDF4(20061, 25504, 28526) },
+ { AOM_CDF4(15235, 22878, 26145) },
+ { AOM_CDF4(12985, 19958, 24155) },
+ { AOM_CDF4(9782, 16641, 21403) },
+ { AOM_CDF4(9456, 16360, 20760) },
+ { AOM_CDF4(6855, 12940, 18557) },
+ { AOM_CDF4(5661, 10564, 15002) },
+ { AOM_CDF4(25656, 30602, 31894) },
+ { AOM_CDF4(22570, 29107, 31092) },
+ { AOM_CDF4(18917, 26423, 29541) },
+ { AOM_CDF4(15940, 23649, 27754) },
+ { AOM_CDF4(12803, 20581, 25219) },
+ { AOM_CDF4(11082, 18695, 23376) },
+ { AOM_CDF4(7939, 14373, 19005) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(18315, 24289, 27551) },
+ { AOM_CDF4(16854, 24068, 27835) },
+ { AOM_CDF4(10140, 17927, 23173) },
+ { AOM_CDF4(6722, 12982, 18267) },
+ { AOM_CDF4(4661, 9826, 14706) },
+ { AOM_CDF4(3832, 8165, 12294) },
+ { AOM_CDF4(2795, 6098, 9245) },
+ { AOM_CDF4(17145, 23326, 26672) },
+ { AOM_CDF4(20733, 27680, 30308) },
+ { AOM_CDF4(16032, 24461, 28546) },
+ { AOM_CDF4(11653, 20093, 25081) },
+ { AOM_CDF4(9290, 16429, 22086) },
+ { AOM_CDF4(7796, 14598, 19982) },
+ { AOM_CDF4(6502, 12378, 17441) },
+ { AOM_CDF4(21681, 27732, 30320) },
+ { AOM_CDF4(22389, 29044, 31261) },
+ { AOM_CDF4(19027, 26731, 30087) },
+ { AOM_CDF4(14739, 23755, 28624) },
+ { AOM_CDF4(11358, 20778, 25511) },
+ { AOM_CDF4(10995, 18073, 24190) },
+ { AOM_CDF4(9162, 14990, 20617) } },
+ { { AOM_CDF4(21425, 27952, 30388) },
+ { AOM_CDF4(18062, 25838, 29034) },
+ { AOM_CDF4(11956, 19881, 24808) },
+ { AOM_CDF4(7718, 15000, 20980) },
+ { AOM_CDF4(5702, 11254, 16143) },
+ { AOM_CDF4(4898, 9088, 16864) },
+ { AOM_CDF4(3679, 6776, 11907) },
+ { AOM_CDF4(23294, 30160, 31663) },
+ { AOM_CDF4(24397, 29896, 31836) },
+ { AOM_CDF4(19245, 27128, 30593) },
+ { AOM_CDF4(13202, 19825, 26404) },
+ { AOM_CDF4(11578, 19297, 23957) },
+ { AOM_CDF4(8073, 13297, 21370) },
+ { AOM_CDF4(5461, 10923, 19745) },
+ { AOM_CDF4(27367, 30521, 31934) },
+ { AOM_CDF4(24904, 30671, 31940) },
+ { AOM_CDF4(23075, 28460, 31299) },
+ { AOM_CDF4(14400, 23658, 30417) },
+ { AOM_CDF4(13885, 23882, 28325) },
+ { AOM_CDF4(14746, 22938, 27853) },
+ { AOM_CDF4(5461, 16384, 27307) } } },
+ { { { AOM_CDF4(18274, 24813, 27890) },
+ { AOM_CDF4(15537, 23149, 27003) },
+ { AOM_CDF4(9449, 16740, 21827) },
+ { AOM_CDF4(6700, 12498, 17261) },
+ { AOM_CDF4(4988, 9866, 14198) },
+ { AOM_CDF4(4236, 8147, 11902) },
+ { AOM_CDF4(2867, 5860, 8654) },
+ { AOM_CDF4(17124, 23171, 26101) },
+ { AOM_CDF4(20396, 27477, 30148) },
+ { AOM_CDF4(16573, 24629, 28492) },
+ { AOM_CDF4(12749, 20846, 25674) },
+ { AOM_CDF4(10233, 17878, 22818) },
+ { AOM_CDF4(8525, 15332, 20363) },
+ { AOM_CDF4(6283, 11632, 16255) },
+ { AOM_CDF4(20466, 26511, 29286) },
+ { AOM_CDF4(23059, 29174, 31191) },
+ { AOM_CDF4(19481, 27263, 30241) },
+ { AOM_CDF4(15458, 23631, 28137) },
+ { AOM_CDF4(12416, 20608, 25693) },
+ { AOM_CDF4(10261, 18011, 23261) },
+ { AOM_CDF4(8016, 14655, 19666) } },
+ { { AOM_CDF4(17616, 24586, 28112) },
+ { AOM_CDF4(15809, 23299, 27155) },
+ { AOM_CDF4(10767, 18890, 23793) },
+ { AOM_CDF4(7727, 14255, 18865) },
+ { AOM_CDF4(6129, 11926, 16882) },
+ { AOM_CDF4(4482, 9704, 14861) },
+ { AOM_CDF4(3277, 7452, 11522) },
+ { AOM_CDF4(22956, 28551, 30730) },
+ { AOM_CDF4(22724, 28937, 30961) },
+ { AOM_CDF4(18467, 26324, 29580) },
+ { AOM_CDF4(13234, 20713, 25649) },
+ { AOM_CDF4(11181, 17592, 22481) },
+ { AOM_CDF4(8291, 18358, 24576) },
+ { AOM_CDF4(7568, 11881, 14984) },
+ { AOM_CDF4(24948, 29001, 31147) },
+ { AOM_CDF4(25674, 30619, 32151) },
+ { AOM_CDF4(20841, 26793, 29603) },
+ { AOM_CDF4(14669, 24356, 28666) },
+ { AOM_CDF4(11334, 23593, 28219) },
+ { AOM_CDF4(8922, 14762, 22873) },
+ { AOM_CDF4(8301, 13544, 20535) } } },
+ { { { AOM_CDF4(17113, 23733, 27081) },
+ { AOM_CDF4(14139, 21406, 25452) },
+ { AOM_CDF4(8552, 15002, 19776) },
+ { AOM_CDF4(5871, 11120, 15378) },
+ { AOM_CDF4(4455, 8616, 12253) },
+ { AOM_CDF4(3469, 6910, 10386) },
+ { AOM_CDF4(2255, 4553, 6782) },
+ { AOM_CDF4(18224, 24376, 27053) },
+ { AOM_CDF4(19290, 26710, 29614) },
+ { AOM_CDF4(14936, 22991, 27184) },
+ { AOM_CDF4(11238, 18951, 23762) },
+ { AOM_CDF4(8786, 15617, 20588) },
+ { AOM_CDF4(7317, 13228, 18003) },
+ { AOM_CDF4(5101, 9512, 13493) },
+ { AOM_CDF4(22639, 28222, 30210) },
+ { AOM_CDF4(23216, 29331, 31307) },
+ { AOM_CDF4(19075, 26762, 29895) },
+ { AOM_CDF4(15014, 23113, 27457) },
+ { AOM_CDF4(11938, 19857, 24752) },
+ { AOM_CDF4(9942, 17280, 22282) },
+ { AOM_CDF4(7167, 13144, 17752) } },
+ { { AOM_CDF4(15820, 22738, 26488) },
+ { AOM_CDF4(13530, 20885, 25216) },
+ { AOM_CDF4(8395, 15530, 20452) },
+ { AOM_CDF4(6574, 12321, 16380) },
+ { AOM_CDF4(5353, 10419, 14568) },
+ { AOM_CDF4(4613, 8446, 12381) },
+ { AOM_CDF4(3440, 7158, 9903) },
+ { AOM_CDF4(24247, 29051, 31224) },
+ { AOM_CDF4(22118, 28058, 30369) },
+ { AOM_CDF4(16498, 24768, 28389) },
+ { AOM_CDF4(12920, 21175, 26137) },
+ { AOM_CDF4(10730, 18619, 25352) },
+ { AOM_CDF4(10187, 16279, 22791) },
+ { AOM_CDF4(9310, 14631, 22127) },
+ { AOM_CDF4(24970, 30558, 32057) },
+ { AOM_CDF4(24801, 29942, 31698) },
+ { AOM_CDF4(22432, 28453, 30855) },
+ { AOM_CDF4(19054, 25680, 29580) },
+ { AOM_CDF4(14392, 23036, 28109) },
+ { AOM_CDF4(12495, 20947, 26650) },
+ { AOM_CDF4(12442, 20326, 26214) } } },
+ { { { AOM_CDF4(12162, 18785, 22648) },
+ { AOM_CDF4(12749, 19697, 23806) },
+ { AOM_CDF4(8580, 15297, 20346) },
+ { AOM_CDF4(6169, 11749, 16543) },
+ { AOM_CDF4(4836, 9391, 13448) },
+ { AOM_CDF4(3821, 7711, 11613) },
+ { AOM_CDF4(2228, 4601, 7070) },
+ { AOM_CDF4(16319, 24725, 28280) },
+ { AOM_CDF4(15698, 23277, 27168) },
+ { AOM_CDF4(12726, 20368, 25047) },
+ { AOM_CDF4(9912, 17015, 21976) },
+ { AOM_CDF4(7888, 14220, 19179) },
+ { AOM_CDF4(6777, 12284, 17018) },
+ { AOM_CDF4(4492, 8590, 12252) },
+ { AOM_CDF4(23249, 28904, 30947) },
+ { AOM_CDF4(21050, 27908, 30512) },
+ { AOM_CDF4(17440, 25340, 28949) },
+ { AOM_CDF4(14059, 22018, 26541) },
+ { AOM_CDF4(11288, 18903, 23898) },
+ { AOM_CDF4(9411, 16342, 21428) },
+ { AOM_CDF4(6278, 11588, 15944) } },
+ { { AOM_CDF4(13981, 20067, 23226) },
+ { AOM_CDF4(16922, 23580, 26783) },
+ { AOM_CDF4(11005, 19039, 24487) },
+ { AOM_CDF4(7389, 14218, 19798) },
+ { AOM_CDF4(5598, 11505, 17206) },
+ { AOM_CDF4(6090, 11213, 15659) },
+ { AOM_CDF4(3820, 7371, 10119) },
+ { AOM_CDF4(21082, 26925, 29675) },
+ { AOM_CDF4(21262, 28627, 31128) },
+ { AOM_CDF4(18392, 26454, 30437) },
+ { AOM_CDF4(14870, 22910, 27096) },
+ { AOM_CDF4(12620, 19484, 24908) },
+ { AOM_CDF4(9290, 16553, 22802) },
+ { AOM_CDF4(6668, 14288, 20004) },
+ { AOM_CDF4(27704, 31055, 31949) },
+ { AOM_CDF4(24709, 29978, 31788) },
+ { AOM_CDF4(21668, 29264, 31657) },
+ { AOM_CDF4(18295, 26968, 30074) },
+ { AOM_CDF4(16399, 24422, 29313) },
+ { AOM_CDF4(14347, 23026, 28104) },
+ { AOM_CDF4(12370, 19806, 24477) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } }
+ };
+
+static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(NUM_BASE_LEVELS +
+ 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) },
+ { AOM_CDF4(18082, 29741, 31877) },
+ { AOM_CDF4(12596, 26124, 30493) },
+ { AOM_CDF4(9446, 21118, 27005) },
+ { AOM_CDF4(6308, 15141, 21279) },
+ { AOM_CDF4(2463, 6357, 9783) },
+ { AOM_CDF4(20667, 30546, 31929) },
+ { AOM_CDF4(13043, 26123, 30134) },
+ { AOM_CDF4(8151, 18757, 24778) },
+ { AOM_CDF4(5255, 12839, 18632) },
+ { AOM_CDF4(2820, 7206, 11161) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(15736, 27553, 30604) },
+ { AOM_CDF4(11210, 23794, 28787) },
+ { AOM_CDF4(5947, 13874, 19701) },
+ { AOM_CDF4(4215, 9323, 13891) },
+ { AOM_CDF4(2833, 6462, 10059) },
+ { AOM_CDF4(19605, 30393, 31582) },
+ { AOM_CDF4(13523, 26252, 30248) },
+ { AOM_CDF4(8446, 18622, 24512) },
+ { AOM_CDF4(3818, 10343, 15974) },
+ { AOM_CDF4(1481, 4117, 6796) },
+ { AOM_CDF4(22649, 31302, 32190) },
+ { AOM_CDF4(14829, 27127, 30449) },
+ { AOM_CDF4(8313, 17702, 23304) },
+ { AOM_CDF4(3022, 8301, 12786) },
+ { AOM_CDF4(1536, 4412, 7184) },
+ { AOM_CDF4(22354, 29774, 31372) },
+ { AOM_CDF4(14723, 25472, 29214) },
+ { AOM_CDF4(6673, 13745, 18662) },
+ { AOM_CDF4(2068, 5766, 9322) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6302, 16444, 21761) },
+ { AOM_CDF4(23040, 31538, 32475) },
+ { AOM_CDF4(15196, 28452, 31496) },
+ { AOM_CDF4(10020, 22946, 28514) },
+ { AOM_CDF4(6533, 16862, 23501) },
+ { AOM_CDF4(3538, 9816, 15076) },
+ { AOM_CDF4(24444, 31875, 32525) },
+ { AOM_CDF4(15881, 28924, 31635) },
+ { AOM_CDF4(9922, 22873, 28466) },
+ { AOM_CDF4(6527, 16966, 23691) },
+ { AOM_CDF4(4114, 11303, 17220) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20201, 30770, 32209) },
+ { AOM_CDF4(14754, 28071, 31258) },
+ { AOM_CDF4(8378, 20186, 26517) },
+ { AOM_CDF4(5916, 15299, 21978) },
+ { AOM_CDF4(4268, 11583, 17901) },
+ { AOM_CDF4(24361, 32025, 32581) },
+ { AOM_CDF4(18673, 30105, 31943) },
+ { AOM_CDF4(10196, 22244, 27576) },
+ { AOM_CDF4(5495, 14349, 20417) },
+ { AOM_CDF4(2676, 7415, 11498) },
+ { AOM_CDF4(24678, 31958, 32585) },
+ { AOM_CDF4(18629, 29906, 31831) },
+ { AOM_CDF4(9364, 20724, 26315) },
+ { AOM_CDF4(4641, 12318, 18094) },
+ { AOM_CDF4(2758, 7387, 11579) },
+ { AOM_CDF4(25433, 31842, 32469) },
+ { AOM_CDF4(18795, 29289, 31411) },
+ { AOM_CDF4(7644, 17584, 23592) },
+ { AOM_CDF4(3408, 9014, 15047) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4536, 10072, 14001) },
+ { AOM_CDF4(25459, 31416, 32206) },
+ { AOM_CDF4(16605, 28048, 30818) },
+ { AOM_CDF4(11008, 22857, 27719) },
+ { AOM_CDF4(6915, 16268, 22315) },
+ { AOM_CDF4(2625, 6812, 10537) },
+ { AOM_CDF4(24257, 31788, 32499) },
+ { AOM_CDF4(16880, 29454, 31879) },
+ { AOM_CDF4(11958, 25054, 29778) },
+ { AOM_CDF4(7916, 18718, 25084) },
+ { AOM_CDF4(3383, 8777, 13446) },
+ { AOM_CDF4(22720, 31603, 32393) },
+ { AOM_CDF4(14960, 28125, 31335) },
+ { AOM_CDF4(9731, 22210, 27928) },
+ { AOM_CDF4(6304, 15832, 22277) },
+ { AOM_CDF4(2910, 7818, 12166) },
+ { AOM_CDF4(20375, 30627, 32131) },
+ { AOM_CDF4(13904, 27284, 30887) },
+ { AOM_CDF4(9368, 21558, 27144) },
+ { AOM_CDF4(5937, 14966, 21119) },
+ { AOM_CDF4(2667, 7225, 11319) },
+ { AOM_CDF4(23970, 31470, 32378) },
+ { AOM_CDF4(17173, 29734, 32018) },
+ { AOM_CDF4(12795, 25441, 29965) },
+ { AOM_CDF4(8981, 19680, 25893) },
+ { AOM_CDF4(4728, 11372, 16902) },
+ { AOM_CDF4(24287, 31797, 32439) },
+ { AOM_CDF4(16703, 29145, 31696) },
+ { AOM_CDF4(10833, 23554, 28725) },
+ { AOM_CDF4(6468, 16566, 23057) },
+ { AOM_CDF4(2415, 6562, 10278) },
+ { AOM_CDF4(26610, 32395, 32659) },
+ { AOM_CDF4(18590, 30498, 32117) },
+ { AOM_CDF4(12420, 25756, 29950) },
+ { AOM_CDF4(7639, 18746, 24710) },
+ { AOM_CDF4(3001, 8086, 12347) },
+ { AOM_CDF4(25076, 32064, 32580) },
+ { AOM_CDF4(17946, 30128, 32028) },
+ { AOM_CDF4(12024, 24985, 29378) },
+ { AOM_CDF4(7517, 18390, 24304) },
+ { AOM_CDF4(3243, 8781, 13331) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6037, 16771, 21957) },
+ { AOM_CDF4(24774, 31704, 32426) },
+ { AOM_CDF4(16830, 28589, 31056) },
+ { AOM_CDF4(10602, 22828, 27760) },
+ { AOM_CDF4(6733, 16829, 23071) },
+ { AOM_CDF4(3250, 8914, 13556) },
+ { AOM_CDF4(25582, 32220, 32668) },
+ { AOM_CDF4(18659, 30342, 32223) },
+ { AOM_CDF4(12546, 26149, 30515) },
+ { AOM_CDF4(8420, 20451, 26801) },
+ { AOM_CDF4(4636, 12420, 18344) },
+ { AOM_CDF4(27581, 32362, 32639) },
+ { AOM_CDF4(18987, 30083, 31978) },
+ { AOM_CDF4(11327, 24248, 29084) },
+ { AOM_CDF4(7264, 17719, 24120) },
+ { AOM_CDF4(3995, 10768, 16169) },
+ { AOM_CDF4(25893, 31831, 32487) },
+ { AOM_CDF4(16577, 28587, 31379) },
+ { AOM_CDF4(10189, 22748, 28182) },
+ { AOM_CDF4(6832, 17094, 23556) },
+ { AOM_CDF4(3708, 10110, 15334) },
+ { AOM_CDF4(25904, 32282, 32656) },
+ { AOM_CDF4(19721, 30792, 32276) },
+ { AOM_CDF4(12819, 26243, 30411) },
+ { AOM_CDF4(8572, 20614, 26891) },
+ { AOM_CDF4(5364, 14059, 20467) },
+ { AOM_CDF4(26580, 32438, 32677) },
+ { AOM_CDF4(20852, 31225, 32340) },
+ { AOM_CDF4(12435, 25700, 29967) },
+ { AOM_CDF4(8691, 20825, 26976) },
+ { AOM_CDF4(4446, 12209, 17269) },
+ { AOM_CDF4(27350, 32429, 32696) },
+ { AOM_CDF4(21372, 30977, 32272) },
+ { AOM_CDF4(12673, 25270, 29853) },
+ { AOM_CDF4(9208, 20925, 26640) },
+ { AOM_CDF4(5018, 13351, 18732) },
+ { AOM_CDF4(27351, 32479, 32713) },
+ { AOM_CDF4(21398, 31209, 32387) },
+ { AOM_CDF4(12162, 25047, 29842) },
+ { AOM_CDF4(7896, 18691, 25319) },
+ { AOM_CDF4(4670, 12882, 18881) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5487, 10460, 13708) },
+ { AOM_CDF4(21597, 28303, 30674) },
+ { AOM_CDF4(11037, 21953, 26476) },
+ { AOM_CDF4(8147, 17962, 22952) },
+ { AOM_CDF4(5242, 13061, 18532) },
+ { AOM_CDF4(1889, 5208, 8182) },
+ { AOM_CDF4(26774, 32133, 32590) },
+ { AOM_CDF4(17844, 29564, 31767) },
+ { AOM_CDF4(11690, 24438, 29171) },
+ { AOM_CDF4(7542, 18215, 24459) },
+ { AOM_CDF4(2993, 8050, 12319) },
+ { AOM_CDF4(28023, 32328, 32591) },
+ { AOM_CDF4(18651, 30126, 31954) },
+ { AOM_CDF4(12164, 25146, 29589) },
+ { AOM_CDF4(7762, 18530, 24771) },
+ { AOM_CDF4(3492, 9183, 13920) },
+ { AOM_CDF4(27591, 32008, 32491) },
+ { AOM_CDF4(17149, 28853, 31510) },
+ { AOM_CDF4(11485, 24003, 28860) },
+ { AOM_CDF4(7697, 18086, 24210) },
+ { AOM_CDF4(3075, 7999, 12218) },
+ { AOM_CDF4(28268, 32482, 32654) },
+ { AOM_CDF4(19631, 31051, 32404) },
+ { AOM_CDF4(13860, 27260, 31020) },
+ { AOM_CDF4(9605, 21613, 27594) },
+ { AOM_CDF4(4876, 12162, 17908) },
+ { AOM_CDF4(27248, 32316, 32576) },
+ { AOM_CDF4(18955, 30457, 32075) },
+ { AOM_CDF4(11824, 23997, 28795) },
+ { AOM_CDF4(7346, 18196, 24647) },
+ { AOM_CDF4(3403, 9247, 14111) },
+ { AOM_CDF4(29711, 32655, 32735) },
+ { AOM_CDF4(21169, 31394, 32417) },
+ { AOM_CDF4(13487, 27198, 30957) },
+ { AOM_CDF4(8828, 21683, 27614) },
+ { AOM_CDF4(4270, 11451, 17038) },
+ { AOM_CDF4(28708, 32578, 32731) },
+ { AOM_CDF4(20120, 31241, 32482) },
+ { AOM_CDF4(13692, 27550, 31321) },
+ { AOM_CDF4(9418, 22514, 28439) },
+ { AOM_CDF4(4999, 13283, 19462) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5673, 14302, 19711) },
+ { AOM_CDF4(26251, 30701, 31834) },
+ { AOM_CDF4(12782, 23783, 27803) },
+ { AOM_CDF4(9127, 20657, 25808) },
+ { AOM_CDF4(6368, 16208, 21462) },
+ { AOM_CDF4(2465, 7177, 10822) },
+ { AOM_CDF4(29961, 32563, 32719) },
+ { AOM_CDF4(18318, 29891, 31949) },
+ { AOM_CDF4(11361, 24514, 29357) },
+ { AOM_CDF4(7900, 19603, 25607) },
+ { AOM_CDF4(4002, 10590, 15546) },
+ { AOM_CDF4(29637, 32310, 32595) },
+ { AOM_CDF4(18296, 29913, 31809) },
+ { AOM_CDF4(10144, 21515, 26871) },
+ { AOM_CDF4(5358, 14322, 20394) },
+ { AOM_CDF4(3067, 8362, 13346) },
+ { AOM_CDF4(28652, 32470, 32676) },
+ { AOM_CDF4(17538, 30771, 32209) },
+ { AOM_CDF4(13924, 26882, 30494) },
+ { AOM_CDF4(10496, 22837, 27869) },
+ { AOM_CDF4(7236, 16396, 21621) },
+ { AOM_CDF4(30743, 32687, 32746) },
+ { AOM_CDF4(23006, 31676, 32489) },
+ { AOM_CDF4(14494, 27828, 31120) },
+ { AOM_CDF4(10174, 22801, 28352) },
+ { AOM_CDF4(6242, 15281, 21043) },
+ { AOM_CDF4(25817, 32243, 32720) },
+ { AOM_CDF4(18618, 31367, 32325) },
+ { AOM_CDF4(13997, 28318, 31878) },
+ { AOM_CDF4(12255, 26534, 31383) },
+ { AOM_CDF4(9561, 21588, 28450) },
+ { AOM_CDF4(28188, 32635, 32724) },
+ { AOM_CDF4(22060, 32365, 32728) },
+ { AOM_CDF4(18102, 30690, 32528) },
+ { AOM_CDF4(14196, 28864, 31999) },
+ { AOM_CDF4(12262, 25792, 30865) },
+ { AOM_CDF4(24176, 32109, 32628) },
+ { AOM_CDF4(18280, 29681, 31963) },
+ { AOM_CDF4(10205, 23703, 29664) },
+ { AOM_CDF4(7889, 20025, 27676) },
+ { AOM_CDF4(6060, 16743, 23970) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5141, 7096, 8260) },
+ { AOM_CDF4(27186, 29022, 29789) },
+ { AOM_CDF4(6668, 12568, 15682) },
+ { AOM_CDF4(2172, 6181, 8638) },
+ { AOM_CDF4(1126, 3379, 4531) },
+ { AOM_CDF4(443, 1361, 2254) },
+ { AOM_CDF4(26083, 31153, 32436) },
+ { AOM_CDF4(13486, 24603, 28483) },
+ { AOM_CDF4(6508, 14840, 19910) },
+ { AOM_CDF4(3386, 8800, 13286) },
+ { AOM_CDF4(1530, 4322, 7054) },
+ { AOM_CDF4(29639, 32080, 32548) },
+ { AOM_CDF4(15897, 27552, 30290) },
+ { AOM_CDF4(8588, 20047, 25383) },
+ { AOM_CDF4(4889, 13339, 19269) },
+ { AOM_CDF4(2240, 6871, 10498) },
+ { AOM_CDF4(28165, 32197, 32517) },
+ { AOM_CDF4(20735, 30427, 31568) },
+ { AOM_CDF4(14325, 24671, 27692) },
+ { AOM_CDF4(5119, 12554, 17805) },
+ { AOM_CDF4(1810, 5441, 8261) },
+ { AOM_CDF4(31212, 32724, 32748) },
+ { AOM_CDF4(23352, 31766, 32545) },
+ { AOM_CDF4(14669, 27570, 31059) },
+ { AOM_CDF4(8492, 20894, 27272) },
+ { AOM_CDF4(3644, 10194, 15204) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(2461, 7013, 9371) },
+ { AOM_CDF4(24749, 29600, 30986) },
+ { AOM_CDF4(9466, 19037, 22417) },
+ { AOM_CDF4(3584, 9280, 14400) },
+ { AOM_CDF4(1505, 3929, 5433) },
+ { AOM_CDF4(677, 1500, 2736) },
+ { AOM_CDF4(23987, 30702, 32117) },
+ { AOM_CDF4(13554, 24571, 29263) },
+ { AOM_CDF4(6211, 14556, 21155) },
+ { AOM_CDF4(3135, 10972, 15625) },
+ { AOM_CDF4(2435, 7127, 11427) },
+ { AOM_CDF4(31300, 32532, 32550) },
+ { AOM_CDF4(14757, 30365, 31954) },
+ { AOM_CDF4(4405, 11612, 18553) },
+ { AOM_CDF4(580, 4132, 7322) },
+ { AOM_CDF4(1695, 10169, 14124) },
+ { AOM_CDF4(30008, 32282, 32591) },
+ { AOM_CDF4(19244, 30108, 31748) },
+ { AOM_CDF4(11180, 24158, 29555) },
+ { AOM_CDF4(5650, 14972, 19209) },
+ { AOM_CDF4(2114, 5109, 8456) },
+ { AOM_CDF4(31856, 32716, 32748) },
+ { AOM_CDF4(23012, 31664, 32572) },
+ { AOM_CDF4(13694, 26656, 30636) },
+ { AOM_CDF4(8142, 19508, 26093) },
+ { AOM_CDF4(4253, 10955, 16724) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(601, 983, 1311) },
+ { AOM_CDF4(18725, 23406, 28087) },
+ { AOM_CDF4(5461, 8192, 10923) },
+ { AOM_CDF4(3781, 15124, 21425) },
+ { AOM_CDF4(2587, 7761, 12072) },
+ { AOM_CDF4(106, 458, 810) },
+ { AOM_CDF4(22282, 29710, 31894) },
+ { AOM_CDF4(8508, 20926, 25984) },
+ { AOM_CDF4(3726, 12713, 18083) },
+ { AOM_CDF4(1620, 7112, 10893) },
+ { AOM_CDF4(729, 2236, 3495) },
+ { AOM_CDF4(30163, 32474, 32684) },
+ { AOM_CDF4(18304, 30464, 32000) },
+ { AOM_CDF4(11443, 26526, 29647) },
+ { AOM_CDF4(6007, 15292, 21299) },
+ { AOM_CDF4(2234, 6703, 8937) },
+ { AOM_CDF4(30954, 32177, 32571) },
+ { AOM_CDF4(17363, 29562, 31076) },
+ { AOM_CDF4(9686, 22464, 27410) },
+ { AOM_CDF4(8192, 16384, 21390) },
+ { AOM_CDF4(1755, 8046, 11264) },
+ { AOM_CDF4(31168, 32734, 32748) },
+ { AOM_CDF4(22486, 31441, 32471) },
+ { AOM_CDF4(12833, 25627, 29738) },
+ { AOM_CDF4(6980, 17379, 23122) },
+ { AOM_CDF4(3111, 8887, 13479) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(6041, 11854, 15927) },
+ { AOM_CDF4(20326, 30905, 32251) },
+ { AOM_CDF4(14164, 26831, 30725) },
+ { AOM_CDF4(9760, 20647, 26585) },
+ { AOM_CDF4(6416, 14953, 21219) },
+ { AOM_CDF4(2966, 7151, 10891) },
+ { AOM_CDF4(23567, 31374, 32254) },
+ { AOM_CDF4(14978, 27416, 30946) },
+ { AOM_CDF4(9434, 20225, 26254) },
+ { AOM_CDF4(6658, 14558, 20535) },
+ { AOM_CDF4(3916, 8677, 12989) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(18088, 29545, 31587) },
+ { AOM_CDF4(13062, 25843, 30073) },
+ { AOM_CDF4(8940, 16827, 22251) },
+ { AOM_CDF4(7654, 13220, 17973) },
+ { AOM_CDF4(5733, 10316, 14456) },
+ { AOM_CDF4(22879, 31388, 32114) },
+ { AOM_CDF4(15215, 27993, 30955) },
+ { AOM_CDF4(9397, 19445, 24978) },
+ { AOM_CDF4(3442, 9813, 15344) },
+ { AOM_CDF4(1368, 3936, 6532) },
+ { AOM_CDF4(25494, 32033, 32406) },
+ { AOM_CDF4(16772, 27963, 30718) },
+ { AOM_CDF4(9419, 18165, 23260) },
+ { AOM_CDF4(2677, 7501, 11797) },
+ { AOM_CDF4(1516, 4344, 7170) },
+ { AOM_CDF4(26556, 31454, 32101) },
+ { AOM_CDF4(17128, 27035, 30108) },
+ { AOM_CDF4(8324, 15344, 20249) },
+ { AOM_CDF4(1903, 5696, 9469) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8455, 19003, 24368) },
+ { AOM_CDF4(23563, 32021, 32604) },
+ { AOM_CDF4(16237, 29446, 31935) },
+ { AOM_CDF4(10724, 23999, 29358) },
+ { AOM_CDF4(6725, 17528, 24416) },
+ { AOM_CDF4(3927, 10927, 16825) },
+ { AOM_CDF4(26313, 32288, 32634) },
+ { AOM_CDF4(17430, 30095, 32095) },
+ { AOM_CDF4(11116, 24606, 29679) },
+ { AOM_CDF4(7195, 18384, 25269) },
+ { AOM_CDF4(4726, 12852, 19315) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(22822, 31648, 32483) },
+ { AOM_CDF4(16724, 29633, 31929) },
+ { AOM_CDF4(10261, 23033, 28725) },
+ { AOM_CDF4(7029, 17840, 24528) },
+ { AOM_CDF4(4867, 13886, 21502) },
+ { AOM_CDF4(25298, 31892, 32491) },
+ { AOM_CDF4(17809, 29330, 31512) },
+ { AOM_CDF4(9668, 21329, 26579) },
+ { AOM_CDF4(4774, 12956, 18976) },
+ { AOM_CDF4(2322, 7030, 11540) },
+ { AOM_CDF4(25472, 31920, 32543) },
+ { AOM_CDF4(17957, 29387, 31632) },
+ { AOM_CDF4(9196, 20593, 26400) },
+ { AOM_CDF4(4680, 12705, 19202) },
+ { AOM_CDF4(2917, 8456, 13436) },
+ { AOM_CDF4(26471, 32059, 32574) },
+ { AOM_CDF4(18458, 29783, 31909) },
+ { AOM_CDF4(8400, 19464, 25956) },
+ { AOM_CDF4(3812, 10973, 17206) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(6779, 13743, 17678) },
+ { AOM_CDF4(24806, 31797, 32457) },
+ { AOM_CDF4(17616, 29047, 31372) },
+ { AOM_CDF4(11063, 23175, 28003) },
+ { AOM_CDF4(6521, 16110, 22324) },
+ { AOM_CDF4(2764, 7504, 11654) },
+ { AOM_CDF4(25266, 32367, 32637) },
+ { AOM_CDF4(19054, 30553, 32175) },
+ { AOM_CDF4(12139, 25212, 29807) },
+ { AOM_CDF4(7311, 18162, 24704) },
+ { AOM_CDF4(3397, 9164, 14074) },
+ { AOM_CDF4(25988, 32208, 32522) },
+ { AOM_CDF4(16253, 28912, 31526) },
+ { AOM_CDF4(9151, 21387, 27372) },
+ { AOM_CDF4(5688, 14915, 21496) },
+ { AOM_CDF4(2717, 7627, 12004) },
+ { AOM_CDF4(23144, 31855, 32443) },
+ { AOM_CDF4(16070, 28491, 31325) },
+ { AOM_CDF4(8702, 20467, 26517) },
+ { AOM_CDF4(5243, 13956, 20367) },
+ { AOM_CDF4(2621, 7335, 11567) },
+ { AOM_CDF4(26636, 32340, 32630) },
+ { AOM_CDF4(19990, 31050, 32341) },
+ { AOM_CDF4(13243, 26105, 30315) },
+ { AOM_CDF4(8588, 19521, 25918) },
+ { AOM_CDF4(4717, 11585, 17304) },
+ { AOM_CDF4(25844, 32292, 32582) },
+ { AOM_CDF4(19090, 30635, 32097) },
+ { AOM_CDF4(11963, 24546, 28939) },
+ { AOM_CDF4(6218, 16087, 22354) },
+ { AOM_CDF4(2340, 6608, 10426) },
+ { AOM_CDF4(28046, 32576, 32694) },
+ { AOM_CDF4(21178, 31313, 32296) },
+ { AOM_CDF4(13486, 26184, 29870) },
+ { AOM_CDF4(7149, 17871, 23723) },
+ { AOM_CDF4(2833, 7958, 12259) },
+ { AOM_CDF4(27710, 32528, 32686) },
+ { AOM_CDF4(20674, 31076, 32268) },
+ { AOM_CDF4(12413, 24955, 29243) },
+ { AOM_CDF4(6676, 16927, 23097) },
+ { AOM_CDF4(2966, 8333, 12919) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8639, 19339, 24429) },
+ { AOM_CDF4(24404, 31837, 32525) },
+ { AOM_CDF4(16997, 29425, 31784) },
+ { AOM_CDF4(11253, 24234, 29149) },
+ { AOM_CDF4(6751, 17394, 24028) },
+ { AOM_CDF4(3490, 9830, 15191) },
+ { AOM_CDF4(26283, 32471, 32714) },
+ { AOM_CDF4(19599, 31168, 32442) },
+ { AOM_CDF4(13146, 26954, 30893) },
+ { AOM_CDF4(8214, 20588, 26890) },
+ { AOM_CDF4(4699, 13081, 19300) },
+ { AOM_CDF4(28212, 32458, 32669) },
+ { AOM_CDF4(18594, 30316, 32100) },
+ { AOM_CDF4(11219, 24408, 29234) },
+ { AOM_CDF4(6865, 17656, 24149) },
+ { AOM_CDF4(3678, 10362, 16006) },
+ { AOM_CDF4(25825, 32136, 32616) },
+ { AOM_CDF4(17313, 29853, 32021) },
+ { AOM_CDF4(11197, 24471, 29472) },
+ { AOM_CDF4(6947, 17781, 24405) },
+ { AOM_CDF4(3768, 10660, 16261) },
+ { AOM_CDF4(27352, 32500, 32706) },
+ { AOM_CDF4(20850, 31468, 32469) },
+ { AOM_CDF4(14021, 27707, 31133) },
+ { AOM_CDF4(8964, 21748, 27838) },
+ { AOM_CDF4(5437, 14665, 21187) },
+ { AOM_CDF4(26304, 32492, 32698) },
+ { AOM_CDF4(20409, 31380, 32385) },
+ { AOM_CDF4(13682, 27222, 30632) },
+ { AOM_CDF4(8974, 21236, 26685) },
+ { AOM_CDF4(4234, 11665, 16934) },
+ { AOM_CDF4(26273, 32357, 32711) },
+ { AOM_CDF4(20672, 31242, 32441) },
+ { AOM_CDF4(14172, 27254, 30902) },
+ { AOM_CDF4(9870, 21898, 27275) },
+ { AOM_CDF4(5164, 13506, 19270) },
+ { AOM_CDF4(26725, 32459, 32728) },
+ { AOM_CDF4(20991, 31442, 32527) },
+ { AOM_CDF4(13071, 26434, 30811) },
+ { AOM_CDF4(8184, 20090, 26742) },
+ { AOM_CDF4(4803, 13255, 19895) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7555, 14942, 18501) },
+ { AOM_CDF4(24410, 31178, 32287) },
+ { AOM_CDF4(14394, 26738, 30253) },
+ { AOM_CDF4(8413, 19554, 25195) },
+ { AOM_CDF4(4766, 12924, 18785) },
+ { AOM_CDF4(2029, 5806, 9207) },
+ { AOM_CDF4(26776, 32364, 32663) },
+ { AOM_CDF4(18732, 29967, 31931) },
+ { AOM_CDF4(11005, 23786, 28852) },
+ { AOM_CDF4(6466, 16909, 23510) },
+ { AOM_CDF4(3044, 8638, 13419) },
+ { AOM_CDF4(29208, 32582, 32704) },
+ { AOM_CDF4(20068, 30857, 32208) },
+ { AOM_CDF4(12003, 25085, 29595) },
+ { AOM_CDF4(6947, 17750, 24189) },
+ { AOM_CDF4(3245, 9103, 14007) },
+ { AOM_CDF4(27359, 32465, 32669) },
+ { AOM_CDF4(19421, 30614, 32174) },
+ { AOM_CDF4(11915, 25010, 29579) },
+ { AOM_CDF4(6950, 17676, 24074) },
+ { AOM_CDF4(3007, 8473, 13096) },
+ { AOM_CDF4(29002, 32676, 32735) },
+ { AOM_CDF4(22102, 31849, 32576) },
+ { AOM_CDF4(14408, 28009, 31405) },
+ { AOM_CDF4(9027, 21679, 27931) },
+ { AOM_CDF4(4694, 12678, 18748) },
+ { AOM_CDF4(28216, 32528, 32682) },
+ { AOM_CDF4(20849, 31264, 32318) },
+ { AOM_CDF4(12756, 25815, 29751) },
+ { AOM_CDF4(7565, 18801, 24923) },
+ { AOM_CDF4(3509, 9533, 14477) },
+ { AOM_CDF4(30133, 32687, 32739) },
+ { AOM_CDF4(23063, 31910, 32515) },
+ { AOM_CDF4(14588, 28051, 31132) },
+ { AOM_CDF4(9085, 21649, 27457) },
+ { AOM_CDF4(4261, 11654, 17264) },
+ { AOM_CDF4(29518, 32691, 32748) },
+ { AOM_CDF4(22451, 31959, 32613) },
+ { AOM_CDF4(14864, 28722, 31700) },
+ { AOM_CDF4(9695, 22964, 28716) },
+ { AOM_CDF4(4932, 13358, 19502) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6465, 16958, 21688) },
+ { AOM_CDF4(25199, 31514, 32360) },
+ { AOM_CDF4(14774, 27149, 30607) },
+ { AOM_CDF4(9257, 21438, 26972) },
+ { AOM_CDF4(5723, 15183, 21882) },
+ { AOM_CDF4(3150, 8879, 13731) },
+ { AOM_CDF4(26989, 32262, 32682) },
+ { AOM_CDF4(17396, 29937, 32085) },
+ { AOM_CDF4(11387, 24901, 29784) },
+ { AOM_CDF4(7289, 18821, 25548) },
+ { AOM_CDF4(3734, 10577, 16086) },
+ { AOM_CDF4(29728, 32501, 32695) },
+ { AOM_CDF4(17431, 29701, 31903) },
+ { AOM_CDF4(9921, 22826, 28300) },
+ { AOM_CDF4(5896, 15434, 22068) },
+ { AOM_CDF4(3430, 9646, 14757) },
+ { AOM_CDF4(28614, 32511, 32705) },
+ { AOM_CDF4(19364, 30638, 32263) },
+ { AOM_CDF4(13129, 26254, 30402) },
+ { AOM_CDF4(8754, 20484, 26440) },
+ { AOM_CDF4(4378, 11607, 17110) },
+ { AOM_CDF4(30292, 32671, 32744) },
+ { AOM_CDF4(21780, 31603, 32501) },
+ { AOM_CDF4(14314, 27829, 31291) },
+ { AOM_CDF4(9611, 22327, 28263) },
+ { AOM_CDF4(4890, 13087, 19065) },
+ { AOM_CDF4(25862, 32567, 32733) },
+ { AOM_CDF4(20794, 32050, 32567) },
+ { AOM_CDF4(17243, 30625, 32254) },
+ { AOM_CDF4(13283, 27628, 31474) },
+ { AOM_CDF4(9669, 22532, 28918) },
+ { AOM_CDF4(27435, 32697, 32748) },
+ { AOM_CDF4(24922, 32390, 32714) },
+ { AOM_CDF4(21449, 31504, 32536) },
+ { AOM_CDF4(16392, 29729, 31832) },
+ { AOM_CDF4(11692, 24884, 29076) },
+ { AOM_CDF4(24193, 32290, 32735) },
+ { AOM_CDF4(18909, 31104, 32563) },
+ { AOM_CDF4(12236, 26841, 31403) },
+ { AOM_CDF4(8171, 21840, 29082) },
+ { AOM_CDF4(7224, 17280, 25275) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(3078, 6839, 9890) },
+ { AOM_CDF4(13837, 20450, 24479) },
+ { AOM_CDF4(5914, 14222, 19328) },
+ { AOM_CDF4(3866, 10267, 14762) },
+ { AOM_CDF4(2612, 7208, 11042) },
+ { AOM_CDF4(1067, 2991, 4776) },
+ { AOM_CDF4(25817, 31646, 32529) },
+ { AOM_CDF4(13708, 26338, 30385) },
+ { AOM_CDF4(7328, 18585, 24870) },
+ { AOM_CDF4(4691, 13080, 19276) },
+ { AOM_CDF4(1825, 5253, 8352) },
+ { AOM_CDF4(29386, 32315, 32624) },
+ { AOM_CDF4(17160, 29001, 31360) },
+ { AOM_CDF4(9602, 21862, 27396) },
+ { AOM_CDF4(5915, 15772, 22148) },
+ { AOM_CDF4(2786, 7779, 12047) },
+ { AOM_CDF4(29246, 32450, 32663) },
+ { AOM_CDF4(18696, 29929, 31818) },
+ { AOM_CDF4(10510, 23369, 28560) },
+ { AOM_CDF4(6229, 16499, 23125) },
+ { AOM_CDF4(2608, 7448, 11705) },
+ { AOM_CDF4(30753, 32710, 32748) },
+ { AOM_CDF4(21638, 31487, 32503) },
+ { AOM_CDF4(12937, 26854, 30870) },
+ { AOM_CDF4(8182, 20596, 26970) },
+ { AOM_CDF4(3637, 10269, 15497) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5244, 12150, 16906) },
+ { AOM_CDF4(20486, 26858, 29701) },
+ { AOM_CDF4(7756, 18317, 23735) },
+ { AOM_CDF4(3452, 9256, 13146) },
+ { AOM_CDF4(2020, 5206, 8229) },
+ { AOM_CDF4(1801, 4993, 7903) },
+ { AOM_CDF4(27051, 31858, 32531) },
+ { AOM_CDF4(15988, 27531, 30619) },
+ { AOM_CDF4(9188, 21484, 26719) },
+ { AOM_CDF4(6273, 17186, 23800) },
+ { AOM_CDF4(3108, 9355, 14764) },
+ { AOM_CDF4(31076, 32520, 32680) },
+ { AOM_CDF4(18119, 30037, 31850) },
+ { AOM_CDF4(10244, 22969, 27472) },
+ { AOM_CDF4(4692, 14077, 19273) },
+ { AOM_CDF4(3694, 11677, 17556) },
+ { AOM_CDF4(30060, 32581, 32720) },
+ { AOM_CDF4(21011, 30775, 32120) },
+ { AOM_CDF4(11931, 24820, 29289) },
+ { AOM_CDF4(7119, 17662, 24356) },
+ { AOM_CDF4(3833, 10706, 16304) },
+ { AOM_CDF4(31954, 32731, 32748) },
+ { AOM_CDF4(23913, 31724, 32489) },
+ { AOM_CDF4(15520, 28060, 31286) },
+ { AOM_CDF4(11517, 23008, 28571) },
+ { AOM_CDF4(6193, 14508, 20629) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(1035, 2807, 4156) },
+ { AOM_CDF4(13162, 18138, 20939) },
+ { AOM_CDF4(2696, 6633, 8755) },
+ { AOM_CDF4(1373, 4161, 6853) },
+ { AOM_CDF4(1099, 2746, 4716) },
+ { AOM_CDF4(340, 1021, 1599) },
+ { AOM_CDF4(22826, 30419, 32135) },
+ { AOM_CDF4(10395, 21762, 26942) },
+ { AOM_CDF4(4726, 12407, 17361) },
+ { AOM_CDF4(2447, 7080, 10593) },
+ { AOM_CDF4(1227, 3717, 6011) },
+ { AOM_CDF4(28156, 31424, 31934) },
+ { AOM_CDF4(16915, 27754, 30373) },
+ { AOM_CDF4(9148, 20990, 26431) },
+ { AOM_CDF4(5950, 15515, 21148) },
+ { AOM_CDF4(2492, 7327, 11526) },
+ { AOM_CDF4(30602, 32477, 32670) },
+ { AOM_CDF4(20026, 29955, 31568) },
+ { AOM_CDF4(11220, 23628, 28105) },
+ { AOM_CDF4(6652, 17019, 22973) },
+ { AOM_CDF4(3064, 8536, 13043) },
+ { AOM_CDF4(31769, 32724, 32748) },
+ { AOM_CDF4(22230, 30887, 32373) },
+ { AOM_CDF4(12234, 25079, 29731) },
+ { AOM_CDF4(7326, 18816, 25353) },
+ { AOM_CDF4(3933, 10907, 16616) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(8896, 16227, 20630) },
+ { AOM_CDF4(23629, 31782, 32527) },
+ { AOM_CDF4(15173, 27755, 31321) },
+ { AOM_CDF4(10158, 21233, 27382) },
+ { AOM_CDF4(6420, 14857, 21558) },
+ { AOM_CDF4(3269, 8155, 12646) },
+ { AOM_CDF4(24835, 32009, 32496) },
+ { AOM_CDF4(16509, 28421, 31579) },
+ { AOM_CDF4(10957, 21514, 27418) },
+ { AOM_CDF4(7881, 15930, 22096) },
+ { AOM_CDF4(5388, 10960, 15918) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20745, 30773, 32093) },
+ { AOM_CDF4(15200, 27221, 30861) },
+ { AOM_CDF4(13032, 20873, 25667) },
+ { AOM_CDF4(12285, 18663, 23494) },
+ { AOM_CDF4(11563, 17481, 21489) },
+ { AOM_CDF4(26260, 31982, 32320) },
+ { AOM_CDF4(15397, 28083, 31100) },
+ { AOM_CDF4(9742, 19217, 24824) },
+ { AOM_CDF4(3261, 9629, 15362) },
+ { AOM_CDF4(1480, 4322, 7499) },
+ { AOM_CDF4(27599, 32256, 32460) },
+ { AOM_CDF4(16857, 27659, 30774) },
+ { AOM_CDF4(9551, 18290, 23748) },
+ { AOM_CDF4(3052, 8933, 14103) },
+ { AOM_CDF4(2021, 5910, 9787) },
+ { AOM_CDF4(29005, 32015, 32392) },
+ { AOM_CDF4(17677, 27694, 30863) },
+ { AOM_CDF4(9204, 17356, 23219) },
+ { AOM_CDF4(2403, 7516, 12814) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10808, 22056, 26896) },
+ { AOM_CDF4(25739, 32313, 32676) },
+ { AOM_CDF4(17288, 30203, 32221) },
+ { AOM_CDF4(11359, 24878, 29896) },
+ { AOM_CDF4(6949, 17767, 24893) },
+ { AOM_CDF4(4287, 11796, 18071) },
+ { AOM_CDF4(27880, 32521, 32705) },
+ { AOM_CDF4(19038, 31004, 32414) },
+ { AOM_CDF4(12564, 26345, 30768) },
+ { AOM_CDF4(8269, 19947, 26779) },
+ { AOM_CDF4(5674, 14657, 21674) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(25742, 32319, 32671) },
+ { AOM_CDF4(19557, 31164, 32454) },
+ { AOM_CDF4(13381, 26381, 30755) },
+ { AOM_CDF4(10101, 21466, 26722) },
+ { AOM_CDF4(9209, 19650, 26825) },
+ { AOM_CDF4(27107, 31917, 32432) },
+ { AOM_CDF4(18056, 28893, 31203) },
+ { AOM_CDF4(10200, 21434, 26764) },
+ { AOM_CDF4(4660, 12913, 19502) },
+ { AOM_CDF4(2368, 6930, 12504) },
+ { AOM_CDF4(26960, 32158, 32613) },
+ { AOM_CDF4(18628, 30005, 32031) },
+ { AOM_CDF4(10233, 22442, 28232) },
+ { AOM_CDF4(5471, 14630, 21516) },
+ { AOM_CDF4(3235, 10767, 17109) },
+ { AOM_CDF4(27696, 32440, 32692) },
+ { AOM_CDF4(20032, 31167, 32438) },
+ { AOM_CDF4(8700, 21341, 28442) },
+ { AOM_CDF4(5662, 14831, 21795) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9704, 17294, 21132) },
+ { AOM_CDF4(26762, 32278, 32633) },
+ { AOM_CDF4(18382, 29620, 31819) },
+ { AOM_CDF4(10891, 23475, 28723) },
+ { AOM_CDF4(6358, 16583, 23309) },
+ { AOM_CDF4(3248, 9118, 14141) },
+ { AOM_CDF4(27204, 32573, 32699) },
+ { AOM_CDF4(19818, 30824, 32329) },
+ { AOM_CDF4(11772, 25120, 30041) },
+ { AOM_CDF4(6995, 18033, 25039) },
+ { AOM_CDF4(3752, 10442, 16098) },
+ { AOM_CDF4(27222, 32256, 32559) },
+ { AOM_CDF4(15356, 28399, 31475) },
+ { AOM_CDF4(8821, 20635, 27057) },
+ { AOM_CDF4(5511, 14404, 21239) },
+ { AOM_CDF4(2935, 8222, 13051) },
+ { AOM_CDF4(24875, 32120, 32529) },
+ { AOM_CDF4(15233, 28265, 31445) },
+ { AOM_CDF4(8605, 20570, 26932) },
+ { AOM_CDF4(5431, 14413, 21196) },
+ { AOM_CDF4(2994, 8341, 13223) },
+ { AOM_CDF4(28201, 32604, 32700) },
+ { AOM_CDF4(21041, 31446, 32456) },
+ { AOM_CDF4(13221, 26213, 30475) },
+ { AOM_CDF4(8255, 19385, 26037) },
+ { AOM_CDF4(4930, 12585, 18830) },
+ { AOM_CDF4(28768, 32448, 32627) },
+ { AOM_CDF4(19705, 30561, 32021) },
+ { AOM_CDF4(11572, 23589, 28220) },
+ { AOM_CDF4(5532, 15034, 21446) },
+ { AOM_CDF4(2460, 7150, 11456) },
+ { AOM_CDF4(29874, 32619, 32699) },
+ { AOM_CDF4(21621, 31071, 32201) },
+ { AOM_CDF4(12511, 24747, 28992) },
+ { AOM_CDF4(6281, 16395, 22748) },
+ { AOM_CDF4(3246, 9278, 14497) },
+ { AOM_CDF4(29715, 32625, 32712) },
+ { AOM_CDF4(20958, 31011, 32283) },
+ { AOM_CDF4(11233, 23671, 28806) },
+ { AOM_CDF4(6012, 16128, 22868) },
+ { AOM_CDF4(3427, 9851, 15414) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11016, 22111, 26794) },
+ { AOM_CDF4(25946, 32357, 32677) },
+ { AOM_CDF4(17890, 30452, 32252) },
+ { AOM_CDF4(11678, 25142, 29816) },
+ { AOM_CDF4(6720, 17534, 24584) },
+ { AOM_CDF4(4230, 11665, 17820) },
+ { AOM_CDF4(28400, 32623, 32747) },
+ { AOM_CDF4(21164, 31668, 32575) },
+ { AOM_CDF4(13572, 27388, 31182) },
+ { AOM_CDF4(8234, 20750, 27358) },
+ { AOM_CDF4(5065, 14055, 20897) },
+ { AOM_CDF4(28981, 32547, 32705) },
+ { AOM_CDF4(18681, 30543, 32239) },
+ { AOM_CDF4(10919, 24075, 29286) },
+ { AOM_CDF4(6431, 17199, 24077) },
+ { AOM_CDF4(3819, 10464, 16618) },
+ { AOM_CDF4(26870, 32467, 32693) },
+ { AOM_CDF4(19041, 30831, 32347) },
+ { AOM_CDF4(11794, 25211, 30016) },
+ { AOM_CDF4(6888, 18019, 24970) },
+ { AOM_CDF4(4370, 12363, 18992) },
+ { AOM_CDF4(29578, 32670, 32744) },
+ { AOM_CDF4(23159, 32007, 32613) },
+ { AOM_CDF4(15315, 28669, 31676) },
+ { AOM_CDF4(9298, 22607, 28782) },
+ { AOM_CDF4(6144, 15913, 22968) },
+ { AOM_CDF4(28110, 32499, 32669) },
+ { AOM_CDF4(21574, 30937, 32015) },
+ { AOM_CDF4(12759, 24818, 28727) },
+ { AOM_CDF4(6545, 16761, 23042) },
+ { AOM_CDF4(3649, 10597, 16833) },
+ { AOM_CDF4(28163, 32552, 32728) },
+ { AOM_CDF4(22101, 31469, 32464) },
+ { AOM_CDF4(13160, 25472, 30143) },
+ { AOM_CDF4(7303, 18684, 25468) },
+ { AOM_CDF4(5241, 13975, 20955) },
+ { AOM_CDF4(28400, 32631, 32744) },
+ { AOM_CDF4(22104, 31793, 32603) },
+ { AOM_CDF4(13557, 26571, 30846) },
+ { AOM_CDF4(7749, 19861, 26675) },
+ { AOM_CDF4(4873, 14030, 21234) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9800, 17635, 21073) },
+ { AOM_CDF4(26153, 31885, 32527) },
+ { AOM_CDF4(15038, 27852, 31006) },
+ { AOM_CDF4(8718, 20564, 26486) },
+ { AOM_CDF4(5128, 14076, 20514) },
+ { AOM_CDF4(2636, 7566, 11925) },
+ { AOM_CDF4(27551, 32504, 32701) },
+ { AOM_CDF4(18310, 30054, 32100) },
+ { AOM_CDF4(10211, 23420, 29082) },
+ { AOM_CDF4(6222, 16876, 23916) },
+ { AOM_CDF4(3462, 9954, 15498) },
+ { AOM_CDF4(29991, 32633, 32721) },
+ { AOM_CDF4(19883, 30751, 32201) },
+ { AOM_CDF4(11141, 24184, 29285) },
+ { AOM_CDF4(6420, 16940, 23774) },
+ { AOM_CDF4(3392, 9753, 15118) },
+ { AOM_CDF4(28465, 32616, 32712) },
+ { AOM_CDF4(19850, 30702, 32244) },
+ { AOM_CDF4(10983, 24024, 29223) },
+ { AOM_CDF4(6294, 16770, 23582) },
+ { AOM_CDF4(3244, 9283, 14509) },
+ { AOM_CDF4(30023, 32717, 32748) },
+ { AOM_CDF4(22940, 32032, 32626) },
+ { AOM_CDF4(14282, 27928, 31473) },
+ { AOM_CDF4(8562, 21327, 27914) },
+ { AOM_CDF4(4846, 13393, 19919) },
+ { AOM_CDF4(29981, 32590, 32695) },
+ { AOM_CDF4(20465, 30963, 32166) },
+ { AOM_CDF4(11479, 23579, 28195) },
+ { AOM_CDF4(5916, 15648, 22073) },
+ { AOM_CDF4(3031, 8605, 13398) },
+ { AOM_CDF4(31146, 32691, 32739) },
+ { AOM_CDF4(23106, 31724, 32444) },
+ { AOM_CDF4(13783, 26738, 30439) },
+ { AOM_CDF4(7852, 19468, 25807) },
+ { AOM_CDF4(3860, 11124, 16853) },
+ { AOM_CDF4(31014, 32724, 32748) },
+ { AOM_CDF4(23629, 32109, 32628) },
+ { AOM_CDF4(14747, 28115, 31403) },
+ { AOM_CDF4(8545, 21242, 27478) },
+ { AOM_CDF4(4574, 12781, 19067) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9185, 19694, 24688) },
+ { AOM_CDF4(26081, 31985, 32621) },
+ { AOM_CDF4(16015, 29000, 31787) },
+ { AOM_CDF4(10542, 23690, 29206) },
+ { AOM_CDF4(6732, 17945, 24677) },
+ { AOM_CDF4(3916, 11039, 16722) },
+ { AOM_CDF4(28224, 32566, 32744) },
+ { AOM_CDF4(19100, 31138, 32485) },
+ { AOM_CDF4(12528, 26620, 30879) },
+ { AOM_CDF4(7741, 20277, 26885) },
+ { AOM_CDF4(4566, 12845, 18990) },
+ { AOM_CDF4(29933, 32593, 32718) },
+ { AOM_CDF4(17670, 30333, 32155) },
+ { AOM_CDF4(10385, 23600, 28909) },
+ { AOM_CDF4(6243, 16236, 22407) },
+ { AOM_CDF4(3976, 10389, 16017) },
+ { AOM_CDF4(28377, 32561, 32738) },
+ { AOM_CDF4(19366, 31175, 32482) },
+ { AOM_CDF4(13327, 27175, 31094) },
+ { AOM_CDF4(8258, 20769, 27143) },
+ { AOM_CDF4(4703, 13198, 19527) },
+ { AOM_CDF4(31086, 32706, 32748) },
+ { AOM_CDF4(22853, 31902, 32583) },
+ { AOM_CDF4(14759, 28186, 31419) },
+ { AOM_CDF4(9284, 22382, 28348) },
+ { AOM_CDF4(5585, 15192, 21868) },
+ { AOM_CDF4(28291, 32652, 32746) },
+ { AOM_CDF4(19849, 32107, 32571) },
+ { AOM_CDF4(14834, 26818, 29214) },
+ { AOM_CDF4(10306, 22594, 28672) },
+ { AOM_CDF4(6615, 17384, 23384) },
+ { AOM_CDF4(28947, 32604, 32745) },
+ { AOM_CDF4(25625, 32289, 32646) },
+ { AOM_CDF4(18758, 28672, 31403) },
+ { AOM_CDF4(10017, 23430, 28523) },
+ { AOM_CDF4(6862, 15269, 22131) },
+ { AOM_CDF4(23933, 32509, 32739) },
+ { AOM_CDF4(19927, 31495, 32631) },
+ { AOM_CDF4(11903, 26023, 30621) },
+ { AOM_CDF4(7026, 20094, 27252) },
+ { AOM_CDF4(5998, 18106, 24437) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4456, 11274, 15533) },
+ { AOM_CDF4(21219, 29079, 31616) },
+ { AOM_CDF4(11173, 23774, 28567) },
+ { AOM_CDF4(7282, 18293, 24263) },
+ { AOM_CDF4(4890, 13286, 19115) },
+ { AOM_CDF4(1890, 5508, 8659) },
+ { AOM_CDF4(26651, 32136, 32647) },
+ { AOM_CDF4(14630, 28254, 31455) },
+ { AOM_CDF4(8716, 21287, 27395) },
+ { AOM_CDF4(5615, 15331, 22008) },
+ { AOM_CDF4(2675, 7700, 12150) },
+ { AOM_CDF4(29954, 32526, 32690) },
+ { AOM_CDF4(16126, 28982, 31633) },
+ { AOM_CDF4(9030, 21361, 27352) },
+ { AOM_CDF4(5411, 14793, 21271) },
+ { AOM_CDF4(2943, 8422, 13163) },
+ { AOM_CDF4(29539, 32601, 32730) },
+ { AOM_CDF4(18125, 30385, 32201) },
+ { AOM_CDF4(10422, 24090, 29468) },
+ { AOM_CDF4(6468, 17487, 24438) },
+ { AOM_CDF4(2970, 8653, 13531) },
+ { AOM_CDF4(30912, 32715, 32748) },
+ { AOM_CDF4(20666, 31373, 32497) },
+ { AOM_CDF4(12509, 26640, 30917) },
+ { AOM_CDF4(8058, 20629, 27290) },
+ { AOM_CDF4(4231, 12006, 18052) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10202, 20633, 25484) },
+ { AOM_CDF4(27336, 31445, 32352) },
+ { AOM_CDF4(12420, 24384, 28552) },
+ { AOM_CDF4(7648, 18115, 23856) },
+ { AOM_CDF4(5662, 14341, 19902) },
+ { AOM_CDF4(3611, 10328, 15390) },
+ { AOM_CDF4(30945, 32616, 32736) },
+ { AOM_CDF4(18682, 30505, 32253) },
+ { AOM_CDF4(11513, 25336, 30203) },
+ { AOM_CDF4(7449, 19452, 26148) },
+ { AOM_CDF4(4482, 13051, 18886) },
+ { AOM_CDF4(32022, 32690, 32747) },
+ { AOM_CDF4(18578, 30501, 32146) },
+ { AOM_CDF4(11249, 23368, 28631) },
+ { AOM_CDF4(5645, 16958, 22158) },
+ { AOM_CDF4(5009, 11444, 16637) },
+ { AOM_CDF4(31357, 32710, 32748) },
+ { AOM_CDF4(21552, 31494, 32504) },
+ { AOM_CDF4(13891, 27677, 31340) },
+ { AOM_CDF4(9051, 22098, 28172) },
+ { AOM_CDF4(5190, 13377, 19486) },
+ { AOM_CDF4(32364, 32740, 32748) },
+ { AOM_CDF4(24839, 31907, 32551) },
+ { AOM_CDF4(17160, 28779, 31696) },
+ { AOM_CDF4(12452, 24137, 29602) },
+ { AOM_CDF4(6165, 15389, 22477) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(2575, 7281, 11077) },
+ { AOM_CDF4(14002, 20866, 25402) },
+ { AOM_CDF4(6343, 15056, 19658) },
+ { AOM_CDF4(4474, 11858, 17041) },
+ { AOM_CDF4(2865, 8299, 12534) },
+ { AOM_CDF4(1344, 3949, 6391) },
+ { AOM_CDF4(24720, 31239, 32459) },
+ { AOM_CDF4(12585, 25356, 29968) },
+ { AOM_CDF4(7181, 18246, 24444) },
+ { AOM_CDF4(5025, 13667, 19885) },
+ { AOM_CDF4(2521, 7304, 11605) },
+ { AOM_CDF4(29908, 32252, 32584) },
+ { AOM_CDF4(17421, 29156, 31575) },
+ { AOM_CDF4(9889, 22188, 27782) },
+ { AOM_CDF4(5878, 15647, 22123) },
+ { AOM_CDF4(2814, 8665, 13323) },
+ { AOM_CDF4(30183, 32568, 32713) },
+ { AOM_CDF4(18528, 30195, 32049) },
+ { AOM_CDF4(10982, 24606, 29657) },
+ { AOM_CDF4(6957, 18165, 25231) },
+ { AOM_CDF4(3508, 10118, 15468) },
+ { AOM_CDF4(31761, 32736, 32748) },
+ { AOM_CDF4(21041, 31328, 32546) },
+ { AOM_CDF4(12568, 26732, 31166) },
+ { AOM_CDF4(8052, 20720, 27733) },
+ { AOM_CDF4(4336, 12192, 18396) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(7062, 16472, 22319) },
+ { AOM_CDF4(24538, 32261, 32674) },
+ { AOM_CDF4(13675, 28041, 31779) },
+ { AOM_CDF4(8590, 20674, 27631) },
+ { AOM_CDF4(5685, 14675, 22013) },
+ { AOM_CDF4(3655, 9898, 15731) },
+ { AOM_CDF4(26493, 32418, 32658) },
+ { AOM_CDF4(16376, 29342, 32090) },
+ { AOM_CDF4(10594, 22649, 28970) },
+ { AOM_CDF4(8176, 17170, 24303) },
+ { AOM_CDF4(5605, 12694, 19139) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(23888, 31902, 32542) },
+ { AOM_CDF4(18612, 29687, 31987) },
+ { AOM_CDF4(16245, 24852, 29249) },
+ { AOM_CDF4(15765, 22608, 27559) },
+ { AOM_CDF4(19895, 24699, 27510) },
+ { AOM_CDF4(28401, 32212, 32457) },
+ { AOM_CDF4(15274, 27825, 30980) },
+ { AOM_CDF4(9364, 18128, 24332) },
+ { AOM_CDF4(2283, 8193, 15082) },
+ { AOM_CDF4(1228, 3972, 7881) },
+ { AOM_CDF4(29455, 32469, 32620) },
+ { AOM_CDF4(17981, 28245, 31388) },
+ { AOM_CDF4(10921, 20098, 26240) },
+ { AOM_CDF4(3743, 11829, 18657) },
+ { AOM_CDF4(2374, 9593, 15715) },
+ { AOM_CDF4(31068, 32466, 32635) },
+ { AOM_CDF4(20321, 29572, 31971) },
+ { AOM_CDF4(10771, 20255, 27119) },
+ { AOM_CDF4(2795, 10410, 17361) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9320, 22102, 27840) },
+ { AOM_CDF4(27057, 32464, 32724) },
+ { AOM_CDF4(16331, 30268, 32309) },
+ { AOM_CDF4(10319, 23935, 29720) },
+ { AOM_CDF4(6189, 16448, 24106) },
+ { AOM_CDF4(3589, 10884, 18808) },
+ { AOM_CDF4(29026, 32624, 32748) },
+ { AOM_CDF4(19226, 31507, 32587) },
+ { AOM_CDF4(12692, 26921, 31203) },
+ { AOM_CDF4(7049, 19532, 27635) },
+ { AOM_CDF4(7727, 15669, 23252) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(28056, 32625, 32748) },
+ { AOM_CDF4(22383, 32075, 32669) },
+ { AOM_CDF4(15417, 27098, 31749) },
+ { AOM_CDF4(18127, 26493, 27190) },
+ { AOM_CDF4(5461, 16384, 21845) },
+ { AOM_CDF4(27982, 32091, 32584) },
+ { AOM_CDF4(19045, 29868, 31972) },
+ { AOM_CDF4(10397, 22266, 27932) },
+ { AOM_CDF4(5990, 13697, 21500) },
+ { AOM_CDF4(1792, 6912, 15104) },
+ { AOM_CDF4(28198, 32501, 32718) },
+ { AOM_CDF4(21534, 31521, 32569) },
+ { AOM_CDF4(11109, 25217, 30017) },
+ { AOM_CDF4(5671, 15124, 26151) },
+ { AOM_CDF4(4681, 14043, 18725) },
+ { AOM_CDF4(28688, 32580, 32741) },
+ { AOM_CDF4(22576, 32079, 32661) },
+ { AOM_CDF4(10627, 22141, 28340) },
+ { AOM_CDF4(9362, 14043, 28087) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7754, 16948, 22142) },
+ { AOM_CDF4(25670, 32330, 32691) },
+ { AOM_CDF4(15663, 29225, 31994) },
+ { AOM_CDF4(9878, 23288, 29158) },
+ { AOM_CDF4(6419, 17088, 24336) },
+ { AOM_CDF4(3859, 11003, 17039) },
+ { AOM_CDF4(27562, 32595, 32725) },
+ { AOM_CDF4(17575, 30588, 32399) },
+ { AOM_CDF4(10819, 24838, 30309) },
+ { AOM_CDF4(7124, 18686, 25916) },
+ { AOM_CDF4(4479, 12688, 19340) },
+ { AOM_CDF4(28385, 32476, 32673) },
+ { AOM_CDF4(15306, 29005, 31938) },
+ { AOM_CDF4(8937, 21615, 28322) },
+ { AOM_CDF4(5982, 15603, 22786) },
+ { AOM_CDF4(3620, 10267, 16136) },
+ { AOM_CDF4(27280, 32464, 32667) },
+ { AOM_CDF4(15607, 29160, 32004) },
+ { AOM_CDF4(9091, 22135, 28740) },
+ { AOM_CDF4(6232, 16632, 24020) },
+ { AOM_CDF4(4047, 11377, 17672) },
+ { AOM_CDF4(29220, 32630, 32718) },
+ { AOM_CDF4(19650, 31220, 32462) },
+ { AOM_CDF4(13050, 26312, 30827) },
+ { AOM_CDF4(9228, 20870, 27468) },
+ { AOM_CDF4(6146, 15149, 21971) },
+ { AOM_CDF4(30169, 32481, 32623) },
+ { AOM_CDF4(17212, 29311, 31554) },
+ { AOM_CDF4(9911, 21311, 26882) },
+ { AOM_CDF4(4487, 13314, 20372) },
+ { AOM_CDF4(2570, 7772, 12889) },
+ { AOM_CDF4(30924, 32613, 32708) },
+ { AOM_CDF4(19490, 30206, 32107) },
+ { AOM_CDF4(11232, 23998, 29276) },
+ { AOM_CDF4(6769, 17955, 25035) },
+ { AOM_CDF4(4398, 12623, 19214) },
+ { AOM_CDF4(30609, 32627, 32722) },
+ { AOM_CDF4(19370, 30582, 32287) },
+ { AOM_CDF4(10457, 23619, 29409) },
+ { AOM_CDF4(6443, 17637, 24834) },
+ { AOM_CDF4(4645, 13236, 20106) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8626, 20271, 26216) },
+ { AOM_CDF4(26707, 32406, 32711) },
+ { AOM_CDF4(16999, 30329, 32286) },
+ { AOM_CDF4(11445, 25123, 30286) },
+ { AOM_CDF4(6411, 18828, 25601) },
+ { AOM_CDF4(6801, 12458, 20248) },
+ { AOM_CDF4(29918, 32682, 32748) },
+ { AOM_CDF4(20649, 31739, 32618) },
+ { AOM_CDF4(12879, 27773, 31581) },
+ { AOM_CDF4(7896, 21751, 28244) },
+ { AOM_CDF4(5260, 14870, 23698) },
+ { AOM_CDF4(29252, 32593, 32731) },
+ { AOM_CDF4(17072, 30460, 32294) },
+ { AOM_CDF4(10653, 24143, 29365) },
+ { AOM_CDF4(6536, 17490, 23983) },
+ { AOM_CDF4(4929, 13170, 20085) },
+ { AOM_CDF4(28137, 32518, 32715) },
+ { AOM_CDF4(18171, 30784, 32407) },
+ { AOM_CDF4(11437, 25436, 30459) },
+ { AOM_CDF4(7252, 18534, 26176) },
+ { AOM_CDF4(4126, 13353, 20978) },
+ { AOM_CDF4(31162, 32726, 32748) },
+ { AOM_CDF4(23017, 32222, 32701) },
+ { AOM_CDF4(15629, 29233, 32046) },
+ { AOM_CDF4(9387, 22621, 29480) },
+ { AOM_CDF4(6922, 17616, 25010) },
+ { AOM_CDF4(28838, 32265, 32614) },
+ { AOM_CDF4(19701, 30206, 31920) },
+ { AOM_CDF4(11214, 22410, 27933) },
+ { AOM_CDF4(5320, 14177, 23034) },
+ { AOM_CDF4(5049, 12881, 17827) },
+ { AOM_CDF4(27484, 32471, 32734) },
+ { AOM_CDF4(21076, 31526, 32561) },
+ { AOM_CDF4(12707, 26303, 31211) },
+ { AOM_CDF4(8169, 21722, 28219) },
+ { AOM_CDF4(6045, 19406, 27042) },
+ { AOM_CDF4(27753, 32572, 32745) },
+ { AOM_CDF4(20832, 31878, 32653) },
+ { AOM_CDF4(13250, 27356, 31674) },
+ { AOM_CDF4(7718, 21508, 29858) },
+ { AOM_CDF4(7209, 18350, 25559) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7876, 16901, 21741) },
+ { AOM_CDF4(24001, 31898, 32625) },
+ { AOM_CDF4(14529, 27959, 31451) },
+ { AOM_CDF4(8273, 20818, 27258) },
+ { AOM_CDF4(5278, 14673, 21510) },
+ { AOM_CDF4(2983, 8843, 14039) },
+ { AOM_CDF4(28016, 32574, 32732) },
+ { AOM_CDF4(17471, 30306, 32301) },
+ { AOM_CDF4(10224, 24063, 29728) },
+ { AOM_CDF4(6602, 17954, 25052) },
+ { AOM_CDF4(4002, 11585, 17759) },
+ { AOM_CDF4(30190, 32634, 32739) },
+ { AOM_CDF4(17497, 30282, 32270) },
+ { AOM_CDF4(10229, 23729, 29538) },
+ { AOM_CDF4(6344, 17211, 24440) },
+ { AOM_CDF4(3849, 11189, 17108) },
+ { AOM_CDF4(28570, 32583, 32726) },
+ { AOM_CDF4(17521, 30161, 32238) },
+ { AOM_CDF4(10153, 23565, 29378) },
+ { AOM_CDF4(6455, 17341, 24443) },
+ { AOM_CDF4(3907, 11042, 17024) },
+ { AOM_CDF4(30689, 32715, 32748) },
+ { AOM_CDF4(21546, 31840, 32610) },
+ { AOM_CDF4(13547, 27581, 31459) },
+ { AOM_CDF4(8912, 21757, 28309) },
+ { AOM_CDF4(5548, 15080, 22046) },
+ { AOM_CDF4(30783, 32540, 32685) },
+ { AOM_CDF4(17540, 29528, 31668) },
+ { AOM_CDF4(10160, 21468, 26783) },
+ { AOM_CDF4(4724, 13393, 20054) },
+ { AOM_CDF4(2702, 8174, 13102) },
+ { AOM_CDF4(31648, 32686, 32742) },
+ { AOM_CDF4(20954, 31094, 32337) },
+ { AOM_CDF4(12420, 25698, 30179) },
+ { AOM_CDF4(7304, 19320, 26248) },
+ { AOM_CDF4(4366, 12261, 18864) },
+ { AOM_CDF4(31581, 32723, 32748) },
+ { AOM_CDF4(21373, 31586, 32525) },
+ { AOM_CDF4(12744, 26625, 30885) },
+ { AOM_CDF4(7431, 20322, 26950) },
+ { AOM_CDF4(4692, 13323, 20111) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(7833, 18369, 24095) },
+ { AOM_CDF4(26650, 32273, 32702) },
+ { AOM_CDF4(16371, 29961, 32191) },
+ { AOM_CDF4(11055, 24082, 29629) },
+ { AOM_CDF4(6892, 18644, 25400) },
+ { AOM_CDF4(5006, 13057, 19240) },
+ { AOM_CDF4(29834, 32666, 32748) },
+ { AOM_CDF4(19577, 31335, 32570) },
+ { AOM_CDF4(12253, 26509, 31122) },
+ { AOM_CDF4(7991, 20772, 27711) },
+ { AOM_CDF4(5677, 15910, 23059) },
+ { AOM_CDF4(30109, 32532, 32720) },
+ { AOM_CDF4(16747, 30166, 32252) },
+ { AOM_CDF4(10134, 23542, 29184) },
+ { AOM_CDF4(5791, 16176, 23556) },
+ { AOM_CDF4(4362, 10414, 17284) },
+ { AOM_CDF4(29492, 32626, 32748) },
+ { AOM_CDF4(19894, 31402, 32525) },
+ { AOM_CDF4(12942, 27071, 30869) },
+ { AOM_CDF4(8346, 21216, 27405) },
+ { AOM_CDF4(6572, 17087, 23859) },
+ { AOM_CDF4(32035, 32735, 32748) },
+ { AOM_CDF4(22957, 31838, 32618) },
+ { AOM_CDF4(14724, 28572, 31772) },
+ { AOM_CDF4(10364, 23999, 29553) },
+ { AOM_CDF4(7004, 18433, 25655) },
+ { AOM_CDF4(27528, 32277, 32681) },
+ { AOM_CDF4(16959, 31171, 32096) },
+ { AOM_CDF4(10486, 23593, 27962) },
+ { AOM_CDF4(8192, 16384, 23211) },
+ { AOM_CDF4(8937, 17873, 20852) },
+ { AOM_CDF4(27715, 32002, 32615) },
+ { AOM_CDF4(15073, 29491, 31676) },
+ { AOM_CDF4(11264, 24576, 28672) },
+ { AOM_CDF4(2341, 18725, 23406) },
+ { AOM_CDF4(7282, 18204, 25486) },
+ { AOM_CDF4(28547, 32213, 32657) },
+ { AOM_CDF4(20788, 29773, 32239) },
+ { AOM_CDF4(6780, 21469, 30508) },
+ { AOM_CDF4(5958, 14895, 23831) },
+ { AOM_CDF4(16384, 21845, 27307) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5992, 14304, 19765) },
+ { AOM_CDF4(22612, 31238, 32456) },
+ { AOM_CDF4(13456, 27162, 31087) },
+ { AOM_CDF4(8001, 20062, 26504) },
+ { AOM_CDF4(5168, 14105, 20764) },
+ { AOM_CDF4(2632, 7771, 12385) },
+ { AOM_CDF4(27034, 32344, 32709) },
+ { AOM_CDF4(15850, 29415, 31997) },
+ { AOM_CDF4(9494, 22776, 28841) },
+ { AOM_CDF4(6151, 16830, 23969) },
+ { AOM_CDF4(3461, 10039, 15722) },
+ { AOM_CDF4(30134, 32569, 32731) },
+ { AOM_CDF4(15638, 29422, 31945) },
+ { AOM_CDF4(9150, 21865, 28218) },
+ { AOM_CDF4(5647, 15719, 22676) },
+ { AOM_CDF4(3402, 9772, 15477) },
+ { AOM_CDF4(28530, 32586, 32735) },
+ { AOM_CDF4(17139, 30298, 32292) },
+ { AOM_CDF4(10200, 24039, 29685) },
+ { AOM_CDF4(6419, 17674, 24786) },
+ { AOM_CDF4(3544, 10225, 15824) },
+ { AOM_CDF4(31333, 32726, 32748) },
+ { AOM_CDF4(20618, 31487, 32544) },
+ { AOM_CDF4(12901, 27217, 31232) },
+ { AOM_CDF4(8624, 21734, 28171) },
+ { AOM_CDF4(5104, 14191, 20748) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11206, 21090, 26561) },
+ { AOM_CDF4(28759, 32279, 32671) },
+ { AOM_CDF4(14171, 27952, 31569) },
+ { AOM_CDF4(9743, 22907, 29141) },
+ { AOM_CDF4(6871, 17886, 24868) },
+ { AOM_CDF4(4960, 13152, 19315) },
+ { AOM_CDF4(31077, 32661, 32748) },
+ { AOM_CDF4(19400, 31195, 32515) },
+ { AOM_CDF4(12752, 26858, 31040) },
+ { AOM_CDF4(8370, 22098, 28591) },
+ { AOM_CDF4(5457, 15373, 22298) },
+ { AOM_CDF4(31697, 32706, 32748) },
+ { AOM_CDF4(17860, 30657, 32333) },
+ { AOM_CDF4(12510, 24812, 29261) },
+ { AOM_CDF4(6180, 19124, 24722) },
+ { AOM_CDF4(5041, 13548, 17959) },
+ { AOM_CDF4(31552, 32716, 32748) },
+ { AOM_CDF4(21908, 31769, 32623) },
+ { AOM_CDF4(14470, 28201, 31565) },
+ { AOM_CDF4(9493, 22982, 28608) },
+ { AOM_CDF4(6858, 17240, 24137) },
+ { AOM_CDF4(32543, 32752, 32756) },
+ { AOM_CDF4(24286, 32097, 32666) },
+ { AOM_CDF4(15958, 29217, 32024) },
+ { AOM_CDF4(10207, 24234, 29958) },
+ { AOM_CDF4(6929, 18305, 25652) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4137, 10847, 15682) },
+ { AOM_CDF4(17824, 27001, 30058) },
+ { AOM_CDF4(10204, 22796, 28291) },
+ { AOM_CDF4(6076, 15935, 22125) },
+ { AOM_CDF4(3852, 10937, 16816) },
+ { AOM_CDF4(2252, 6324, 10131) },
+ { AOM_CDF4(25840, 32016, 32662) },
+ { AOM_CDF4(15109, 28268, 31531) },
+ { AOM_CDF4(9385, 22231, 28340) },
+ { AOM_CDF4(6082, 16672, 23479) },
+ { AOM_CDF4(3318, 9427, 14681) },
+ { AOM_CDF4(30594, 32574, 32718) },
+ { AOM_CDF4(16836, 29552, 31859) },
+ { AOM_CDF4(9556, 22542, 28356) },
+ { AOM_CDF4(6305, 16725, 23540) },
+ { AOM_CDF4(3376, 9895, 15184) },
+ { AOM_CDF4(29383, 32617, 32745) },
+ { AOM_CDF4(18891, 30809, 32401) },
+ { AOM_CDF4(11688, 25942, 30687) },
+ { AOM_CDF4(7468, 19469, 26651) },
+ { AOM_CDF4(3909, 11358, 17012) },
+ { AOM_CDF4(31564, 32736, 32748) },
+ { AOM_CDF4(20906, 31611, 32600) },
+ { AOM_CDF4(13191, 27621, 31537) },
+ { AOM_CDF4(8768, 22029, 28676) },
+ { AOM_CDF4(5079, 14109, 20906) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } } };
+
+static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE(
+ NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) },
+ { AOM_CDF3(29600, 31446) },
+ { AOM_CDF3(30844, 31878) },
+ { AOM_CDF3(24926, 28948) } },
+ { { AOM_CDF3(21365, 30026) },
+ { AOM_CDF3(30512, 32423) },
+ { AOM_CDF3(31658, 32621) },
+ { AOM_CDF3(29630, 31881) } } },
+ { { { AOM_CDF3(5717, 26477) },
+ { AOM_CDF3(30491, 31703) },
+ { AOM_CDF3(31550, 32158) },
+ { AOM_CDF3(29648, 31491) } },
+ { { AOM_CDF3(12608, 27820) },
+ { AOM_CDF3(30680, 32225) },
+ { AOM_CDF3(30809, 32335) },
+ { AOM_CDF3(31299, 32423) } } },
+ { { { AOM_CDF3(1786, 12612) },
+ { AOM_CDF3(30663, 31625) },
+ { AOM_CDF3(32339, 32468) },
+ { AOM_CDF3(31148, 31833) } },
+ { { AOM_CDF3(18857, 23865) },
+ { AOM_CDF3(31428, 32428) },
+ { AOM_CDF3(31744, 32373) },
+ { AOM_CDF3(31775, 32526) } } },
+ { { { AOM_CDF3(1787, 2532) },
+ { AOM_CDF3(30832, 31662) },
+ { AOM_CDF3(31824, 32682) },
+ { AOM_CDF3(32133, 32569) } },
+ { { AOM_CDF3(13751, 22235) },
+ { AOM_CDF3(32089, 32409) },
+ { AOM_CDF3(27084, 27920) },
+ { AOM_CDF3(29291, 32594) } } },
+ { { { AOM_CDF3(1725, 3449) },
+ { AOM_CDF3(31102, 31935) },
+ { AOM_CDF3(32457, 32613) },
+ { AOM_CDF3(32412, 32649) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(17560, 29888) },
+ { AOM_CDF3(29671, 31549) },
+ { AOM_CDF3(31007, 32056) },
+ { AOM_CDF3(27286, 30006) } },
+ { { AOM_CDF3(26594, 31212) },
+ { AOM_CDF3(31208, 32582) },
+ { AOM_CDF3(31835, 32637) },
+ { AOM_CDF3(30595, 32206) } } },
+ { { { AOM_CDF3(15239, 29932) },
+ { AOM_CDF3(31315, 32095) },
+ { AOM_CDF3(32130, 32434) },
+ { AOM_CDF3(30864, 31996) } },
+ { { AOM_CDF3(26279, 30968) },
+ { AOM_CDF3(31142, 32495) },
+ { AOM_CDF3(31713, 32540) },
+ { AOM_CDF3(31929, 32594) } } },
+ { { { AOM_CDF3(2644, 25198) },
+ { AOM_CDF3(32038, 32451) },
+ { AOM_CDF3(32639, 32695) },
+ { AOM_CDF3(32166, 32518) } },
+ { { AOM_CDF3(17187, 27668) },
+ { AOM_CDF3(31714, 32550) },
+ { AOM_CDF3(32283, 32678) },
+ { AOM_CDF3(31930, 32563) } } },
+ { { { AOM_CDF3(1044, 2257) },
+ { AOM_CDF3(30755, 31923) },
+ { AOM_CDF3(32208, 32693) },
+ { AOM_CDF3(32244, 32615) } },
+ { { AOM_CDF3(21317, 26207) },
+ { AOM_CDF3(29133, 30868) },
+ { AOM_CDF3(29311, 31231) },
+ { AOM_CDF3(29657, 31087) } } },
+ { { { AOM_CDF3(478, 1834) },
+ { AOM_CDF3(31005, 31987) },
+ { AOM_CDF3(32317, 32724) },
+ { AOM_CDF3(30865, 32648) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(20092, 30774) },
+ { AOM_CDF3(30695, 32020) },
+ { AOM_CDF3(31131, 32103) },
+ { AOM_CDF3(28666, 30870) } },
+ { { AOM_CDF3(27258, 31095) },
+ { AOM_CDF3(31804, 32623) },
+ { AOM_CDF3(31763, 32528) },
+ { AOM_CDF3(31438, 32506) } } },
+ { { { AOM_CDF3(18049, 30489) },
+ { AOM_CDF3(31706, 32286) },
+ { AOM_CDF3(32163, 32473) },
+ { AOM_CDF3(31550, 32184) } },
+ { { AOM_CDF3(27116, 30842) },
+ { AOM_CDF3(31971, 32598) },
+ { AOM_CDF3(32088, 32576) },
+ { AOM_CDF3(32067, 32664) } } },
+ { { { AOM_CDF3(12854, 29093) },
+ { AOM_CDF3(32272, 32558) },
+ { AOM_CDF3(32667, 32729) },
+ { AOM_CDF3(32306, 32585) } },
+ { { AOM_CDF3(25476, 30366) },
+ { AOM_CDF3(32169, 32687) },
+ { AOM_CDF3(32479, 32689) },
+ { AOM_CDF3(31673, 32634) } } },
+ { { { AOM_CDF3(2809, 19301) },
+ { AOM_CDF3(32205, 32622) },
+ { AOM_CDF3(32338, 32730) },
+ { AOM_CDF3(31786, 32616) } },
+ { { AOM_CDF3(22737, 29105) },
+ { AOM_CDF3(30810, 32362) },
+ { AOM_CDF3(30014, 32627) },
+ { AOM_CDF3(30528, 32574) } } },
+ { { { AOM_CDF3(935, 3382) },
+ { AOM_CDF3(30789, 31909) },
+ { AOM_CDF3(32466, 32756) },
+ { AOM_CDF3(30860, 32513) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(22497, 31198) },
+ { AOM_CDF3(31715, 32495) },
+ { AOM_CDF3(31606, 32337) },
+ { AOM_CDF3(30388, 31990) } },
+ { { AOM_CDF3(27877, 31584) },
+ { AOM_CDF3(32170, 32728) },
+ { AOM_CDF3(32155, 32688) },
+ { AOM_CDF3(32219, 32702) } } },
+ { { { AOM_CDF3(21457, 31043) },
+ { AOM_CDF3(31951, 32483) },
+ { AOM_CDF3(32153, 32562) },
+ { AOM_CDF3(31473, 32215) } },
+ { { AOM_CDF3(27558, 31151) },
+ { AOM_CDF3(32020, 32640) },
+ { AOM_CDF3(32097, 32575) },
+ { AOM_CDF3(32242, 32719) } } },
+ { { { AOM_CDF3(19980, 30591) },
+ { AOM_CDF3(32219, 32597) },
+ { AOM_CDF3(32581, 32706) },
+ { AOM_CDF3(31803, 32287) } },
+ { { AOM_CDF3(26473, 30507) },
+ { AOM_CDF3(32431, 32723) },
+ { AOM_CDF3(32196, 32611) },
+ { AOM_CDF3(31588, 32528) } } },
+ { { { AOM_CDF3(24647, 30463) },
+ { AOM_CDF3(32412, 32695) },
+ { AOM_CDF3(32468, 32720) },
+ { AOM_CDF3(31269, 32523) } },
+ { { AOM_CDF3(28482, 31505) },
+ { AOM_CDF3(32152, 32701) },
+ { AOM_CDF3(31732, 32598) },
+ { AOM_CDF3(31767, 32712) } } },
+ { { { AOM_CDF3(12358, 24977) },
+ { AOM_CDF3(31331, 32385) },
+ { AOM_CDF3(32634, 32756) },
+ { AOM_CDF3(30411, 32548) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } } };
+
+#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c
new file mode 100644
index 0000000000..bf2bc36b04
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+// The ctx offset tables used when the TX class is TX_CLASS_2D (one table per
+// block shape). TX col and row indices are clamped to 4.
+
+const int8_t av1_nz_map_ctx_offset_4x4[16] = {  // index = col * 4 + row
+ 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x8[64] = {  // index = col * 8 + row
+ 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21,
+ 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x16[256] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x32[1024] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_4x8[32] = {  // index = col * 8 + row
+ 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21,
+ 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x16[128] = {
+ 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x32[512] = {
+ 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x16[512] = {
+ 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x64[1024] = {
+ 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11,
+ 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_64x32[1024] = {
+ 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_4x16[64] = {  // index = col * 16 + row
+ 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x4[64] = {  // index = col * 4 + row
+ 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x32[256] = {
+ 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x8[256] = {
+ 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21,
+ 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t *av1_nz_map_ctx_offset[19] = {  // indexed by TX_SIZE; 19 must match TX_SIZES_ALL in txb_common.h
+ av1_nz_map_ctx_offset_4x4, // TX_4x4
+ av1_nz_map_ctx_offset_8x8, // TX_8x8
+ av1_nz_map_ctx_offset_16x16, // TX_16x16
+ av1_nz_map_ctx_offset_32x32, // TX_32x32
+ av1_nz_map_ctx_offset_32x32, // TX_64x64 (reuses the 32x32 table)
+ av1_nz_map_ctx_offset_4x8, // TX_4x8
+ av1_nz_map_ctx_offset_16x4, // TX_8x4 (reuses the 16x4 table)
+ av1_nz_map_ctx_offset_8x16, // TX_8x16
+ av1_nz_map_ctx_offset_32x8, // TX_16x8 (reuses the 32x8 table)
+ av1_nz_map_ctx_offset_16x32, // TX_16x32
+ av1_nz_map_ctx_offset_32x16, // TX_32x16
+ av1_nz_map_ctx_offset_32x64, // TX_32x64
+ av1_nz_map_ctx_offset_64x32, // TX_64x32
+ av1_nz_map_ctx_offset_4x16, // TX_4x16
+ av1_nz_map_ctx_offset_16x4, // TX_16x4
+ av1_nz_map_ctx_offset_8x32, // TX_8x32
+ av1_nz_map_ctx_offset_32x8, // TX_32x8
+ av1_nz_map_ctx_offset_32x64, // TX_16x64 (reuses the 32x64 table)
+ av1_nz_map_ctx_offset_32x16, // TX_64x16 (reuses the 32x16 table)
+};
+
+const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
+ 17, 33, 65, 129, 257, 513 };  // start[i + 1] - start[i] == 1 << av1_eob_offset_bits[i]
+const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
new file mode 100644
index 0000000000..9628090b63
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.h
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
+
+#include "av1/common/av1_common_int.h"
+
+extern const int16_t av1_eob_group_start[12];  // defined in txb_common.c
+extern const int16_t av1_eob_offset_bits[12];  // defined in txb_common.c
+
+extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL];  // per-TX_SIZE 2D offset tables, indexed [tx_size][coeff_idx]
+
+typedef struct txb_ctx {
+ int txb_skip_ctx;  // NOTE(review): presumably the context for the block's skip symbol - confirm at users
+ int dc_sign_ctx;  // NOTE(review): presumably the context for the DC sign symbol - confirm at users
+} TXB_CTX;
+
+static const int base_level_count_to_index[13] = {  // maps a count in 0..12 to an index in 0..3
+ 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+};
+
+static const TX_CLASS tx_type_to_class[TX_TYPES] = {  // TX_TYPE -> TX_CLASS; only the V_*/H_* types are non-2D
+ TX_CLASS_2D, // DCT_DCT
+ TX_CLASS_2D, // ADST_DCT
+ TX_CLASS_2D, // DCT_ADST
+ TX_CLASS_2D, // ADST_ADST
+ TX_CLASS_2D, // FLIPADST_DCT
+ TX_CLASS_2D, // DCT_FLIPADST
+ TX_CLASS_2D, // FLIPADST_FLIPADST
+ TX_CLASS_2D, // ADST_FLIPADST
+ TX_CLASS_2D, // FLIPADST_ADST
+ TX_CLASS_2D, // IDTX
+ TX_CLASS_VERT, // V_DCT
+ TX_CLASS_HORIZ, // H_DCT
+ TX_CLASS_VERT, // V_ADST
+ TX_CLASS_HORIZ, // H_ADST
+ TX_CLASS_VERT, // V_FLIPADST
+ TX_CLASS_HORIZ, // H_FLIPADST
+};
+
+static INLINE int get_txb_bhl(TX_SIZE tx_size) {  // log2 height, adjusted size
+  const TX_SIZE adjusted_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_high_log2[adjusted_size];
+}
+
+static INLINE int get_txb_wide(TX_SIZE tx_size) {  // width of the adjusted size
+  const TX_SIZE adjusted_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_wide[adjusted_size];
+}
+
+static INLINE int get_txb_high(TX_SIZE tx_size) {  // height of the adjusted size
+  const TX_SIZE adjusted_size = av1_get_adjusted_tx_size(tx_size);
+  return tx_size_high[adjusted_size];
+}
+
+static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int height) {
+  return &levels_buf[(height + TX_PAD_HOR) * TX_PAD_TOP];  // skip TX_PAD_TOP lines
+}
+
+static INLINE int get_padded_idx(const int idx, const int bhl) {
+  return ((idx >> bhl) << TX_PAD_HOR_LOG2) + idx;  // pad once per 2^bhl entries
+}
+
+// br context of a non-DC coefficient in a 2D-class block (three neighbours).
+static INLINE int get_br_ctx_2d(const uint8_t *const levels,
+                                const int c,  // raster order
+                                const int bhl) {
+  assert(c > 0);
+  const int col = c >> bhl;
+  const int row = c - (col << bhl);
+  const int stride = (1 << bhl) + TX_PAD_HOR;
+  const uint8_t *const cur = levels + col * stride + row;
+  const int next_row = AOMMIN(cur[1], MAX_BASE_BR_RANGE);
+  const int next_col = AOMMIN(cur[stride], MAX_BASE_BR_RANGE);
+  const int diagonal = AOMMIN(cur[stride + 1], MAX_BASE_BR_RANGE);
+  const int mag = AOMMIN((next_row + next_col + diagonal + 1) >> 1, 6);
+  // (row | col) < 2 holds exactly when both row < 2 and col < 2.
+  return mag + (((row | col) < 2) ? 7 : 14);
+}
+
+static AOM_FORCE_INLINE int get_br_ctx_eob(const int c,  // raster order
+                                           const int bhl,
+                                           const TX_CLASS tx_class) {
+  if (c == 0) return 0;  // position 0 always maps to context 0
+  const int col = c >> bhl;
+  const int row = c - (col << bhl);
+  int low_region = 0;  // set when c lies in the class-specific low region
+  if (tx_class == TX_CLASS_2D) low_region = (row < 2) && (col < 2);
+  if (tx_class == TX_CLASS_HORIZ) low_region = (col == 0);
+  if (tx_class == TX_CLASS_VERT) low_region = (row == 0);
+  return low_region ? 7 : 14;
+}
+
+// br context of coefficient c: two fixed neighbours plus one neighbour chosen
+// by tx_class are summed, halved (rounding up) and capped at 6.
+static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
+                                       const int c,  // raster order
+                                       const int bhl, const TX_CLASS tx_class) {
+  const int col = c >> bhl;
+  const int row = c - (col << bhl);
+  const int stride = (1 << bhl) + TX_PAD_HOR;
+  const uint8_t *const cur = levels + col * stride + row;
+  int third;       // offset of the class-specific third neighbour
+  int low_region;  // nonzero when c gets the +7 offset instead of +14
+  switch (tx_class) {
+    case TX_CLASS_2D:
+      third = stride + 1;
+      low_region = (row < 2) && (col < 2);
+      break;
+    case TX_CLASS_HORIZ:
+      third = stride << 1;
+      low_region = (col == 0);
+      break;
+    case TX_CLASS_VERT:
+      third = 2;
+      low_region = (row == 0);
+      break;
+    default:
+      // Any other class: the raw two-neighbour sum is used without capping.
+      return cur[1] + cur[stride] + 14;
+  }
+  const int sum = cur[1] + cur[stride] + cur[third];
+  const int mag = AOMMIN((sum + 1) >> 1, 6);
+  if (c == 0) return mag;  // position 0 gets no region offset
+  return mag + (low_region ? 7 : 14);
+}
+
+static const uint8_t clip_max3[256] = {  // clip_max3[v] == min(v, 3) for every v in 0..255
+ 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+// Returns the sum of five neighbouring levels, each clipped to 3; which
+// neighbours are read depends on tx_class. `levels` must already point at the
+// current coefficient inside the padded buffer.
+static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
+                                       const int bhl, const TX_CLASS tx_class) {
+  const int next_col = (1 << bhl) + TX_PAD_HOR;
+  const int next_col2 = (2 << bhl) + (2 << TX_PAD_HOR_LOG2);
+  // The { a, b } labels keep the original notation: levels[1] is { 1, 0 }.
+  int mag = clip_max3[levels[next_col]] + clip_max3[levels[1]];  // {0,1} {1,0}
+
+  if (tx_class == TX_CLASS_2D) {
+    mag += clip_max3[levels[next_col + 1]];  // { 1, 1 }
+    mag += clip_max3[levels[next_col2]];     // { 0, 2 }
+    mag += clip_max3[levels[2]];             // { 2, 0 }
+  } else if (tx_class == TX_CLASS_VERT) {
+    mag += clip_max3[levels[2]] + clip_max3[levels[3]] + clip_max3[levels[4]];
+  } else {
+    mag += clip_max3[levels[next_col2]];                            // { 0, 2 }
+    mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]];  // { 0, 3 }
+    mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]];  // { 0, 4 }
+  }
+
+  return mag;
+}
+
+#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D  // first offset used by nz_map_ctx_offset_1d
+#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
+#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
+
+static const int nz_map_ctx_offset_1d[32] = {  // [0] -> CTX_0, [1] -> CTX_5, [2..31] -> CTX_10
+ NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+};
+
+// Converts the neighbour magnitude sum `stats` into the significance-map
+// context of the coefficient at coeff_idx, using the offset tables of tx_class.
+static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
+    const int stats,
+    const int coeff_idx,  // raster order
+    const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
+  // tx_class == 0(TX_CLASS_2D): position 0 of a 2D block is always context 0.
+  if (tx_class == 0 && coeff_idx == 0) return 0;
+  const int ctx = AOMMIN((stats + 1) >> 1, 4);
+
+  if (tx_class == TX_CLASS_2D) {
+    // This is the algorithm to generate av1_nz_map_ctx_offset[][]
+    //   const int width = tx_size_wide[tx_size];
+    //   const int height = tx_size_high[tx_size];
+    //   if (width < height) {
+    //     if (row < 2) return 11 + ctx;
+    //   } else if (width > height) {
+    //     if (col < 2) return 16 + ctx;
+    //   }
+    //   if (row + col < 2) return ctx + 1;
+    //   if (row + col < 4) return 5 + ctx + 1;
+    //   return 21 + ctx;
+    return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+  }
+
+  // 1D classes index a shared table by the col (HORIZ) or row (VERT) alone.
+  const int col = coeff_idx >> bhl;
+  if (tx_class == TX_CLASS_HORIZ) {
+    return ctx + nz_map_ctx_offset_1d[col];
+  }
+  if (tx_class == TX_CLASS_VERT) {
+    const int row = coeff_idx - (col << bhl);
+    return ctx + nz_map_ctx_offset_1d[row];
+  }
+  return 0;  // any other class value
+}
+
+typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)];  // pointer to arrays of CDF_SIZE(4) aom_cdf_prob
+typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)];  // pointer to arrays of CDF_SIZE(BR_CDF_SIZE) aom_cdf_prob
+
+static INLINE int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) {
+  const int num_coeffs = width << bhl;  // width * (1 << bhl)
+  if (scan_idx == 0) return 0;
+  if (scan_idx <= num_coeffs / 8) return 1;
+  return (scan_idx <= num_coeffs / 4) ? 2 : 3;
+}
+
+// Context for a coefficient other than the first (coeff_idx > 0) in the 2D
+// case: sums five neighbouring levels (each capped at 3) from the padded
+// level buffer, halves the sum with rounding, caps it at 4 and adds the
+// per-position offset from av1_nz_map_ctx_offset.
+static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx,
+                                          int bhl, TX_SIZE tx_size) {
+  assert(coeff_idx > 0);
+  // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+  const uint8_t *const base = levels + get_padded_idx(coeff_idx, bhl);
+  const int off_0_1 = (1 << bhl) + TX_PAD_HOR;
+  const int off_0_2 = (2 << bhl) + (2 << TX_PAD_HOR_LOG2);
+
+  int sum = AOMMIN(base[off_0_1], 3);   // { 0, 1 }
+  sum += AOMMIN(base[1], 3);            // { 1, 0 }
+  sum += AOMMIN(base[off_0_1 + 1], 3);  // { 1, 1 }
+  sum += AOMMIN(base[off_0_2], 3);      // { 0, 2 }
+  sum += AOMMIN(base[2], 3);            // { 2, 0 }
+
+  const int halved = (sum + 1) >> 1;
+  return AOMMIN(halved, 4) + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+}
+// Computes the context of the coefficient at coeff_idx for any transform
+// class: gathers the neighbour statistic with get_nz_mag() at the padded
+// position, then maps it with get_nz_map_ctx_from_stats().
+static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels,
+                                                 int coeff_idx, int bhl,
+                                                 TX_SIZE tx_size,
+                                                 TX_CLASS tx_class) {
+  const uint8_t *const padded_pos = levels + get_padded_idx(coeff_idx, bhl);
+  const int stats = get_nz_mag(padded_pos, bhl, tx_class);
+  return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
+}
+
+// Context selection covering both cases: when is_last is set the context
+// depends only on the scan position (0..3), otherwise it is derived from the
+// neighbouring levels via get_lower_levels_ctx().
+static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx,
+                                               int bhl, int width,
+                                               const uint8_t *levels,
+                                               int coeff_idx, TX_SIZE tx_size,
+                                               TX_CLASS tx_class) {
+  if (!is_last) {
+    return get_lower_levels_ctx(levels, coeff_idx, bhl, tx_size, tx_class);
+  }
+
+  if (scan_idx == 0) return 0;
+
+  // Thresholds at one eighth and one quarter of (width << bhl).
+  const int span = width << bhl;
+  if (scan_idx <= (span >> 3)) return 1;
+  return (scan_idx <= (span >> 2)) ? 2 : 3;
+}
+
+// Records the sign of dc_val in the bits of *cul_level above
+// COEFF_CONTEXT_BITS: 1 for a negative value, 2 for a positive value, and
+// nothing when dc_val is zero.
+static INLINE void set_dc_sign(int *cul_level, int dc_val) {
+  if (dc_val == 0) return;
+
+  if (dc_val < 0) {
+    *cul_level |= 1 << COEFF_CONTEXT_BITS;
+  } else {
+    *cul_level += 2 << COEFF_CONTEXT_BITS;
+  }
+}
+
+// Fills txb_ctx for a transform block of any size.
+//  - dc_sign_ctx: derived from the sign fields stored above
+//    COEFF_CONTEXT_BITS in the `a` and `l` context arrays.
+//  - txb_skip_ctx: for plane 0 it comes from a 5x5 table indexed by the
+//    clamped top/left context values (or is 0 when the transform covers the
+//    whole plane block); for other planes it is get_entropy_context() plus
+//    an offset of 7 or 10.
+static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize,
+                                const TX_SIZE tx_size, const int plane,
+                                const ENTROPY_CONTEXT *const a,
+                                const ENTROPY_CONTEXT *const l,
+                                TXB_CTX *const txb_ctx) {
+#define MAX_TX_SIZE_UNIT 16
+  static const int8_t signs[3] = { 0, -1, 1 };
+  static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+  };
+  const int num_above = tx_size_wide_unit[tx_size];
+  const int num_left = tx_size_high_unit[tx_size];
+  int idx;
+
+  // Accumulate the signed votes (-1, 0 or +1) of every above and left unit.
+  int sign_balance = 0;
+
+  idx = 0;
+  do {
+    const unsigned int sign = ((uint8_t)a[idx]) >> COEFF_CONTEXT_BITS;
+    assert(sign <= 2);
+    sign_balance += signs[sign];
+    ++idx;
+  } while (idx < num_above);
+
+  idx = 0;
+  do {
+    const unsigned int sign = ((uint8_t)l[idx]) >> COEFF_CONTEXT_BITS;
+    assert(sign <= 2);
+    sign_balance += signs[sign];
+    ++idx;
+  } while (idx < num_left);
+
+  // The table is centred on index 2 * MAX_TX_SIZE_UNIT: negative balance -> 1,
+  // zero -> 0, positive -> 2.
+  txb_ctx->dc_sign_ctx = dc_sign_contexts[sign_balance + 2 * MAX_TX_SIZE_UNIT];
+
+  if (plane != 0) {
+    const int ctx_base = get_entropy_context(tx_size, a, l);
+    const int block_is_larger =
+        num_pels_log2_lookup[plane_bsize] >
+        num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+    const int ctx_offset = block_is_larger ? 10 : 7;
+    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
+    return;
+  }
+
+  if (plane_bsize == txsize_to_bsize[tx_size]) {
+    txb_ctx->txb_skip_ctx = 0;
+    return;
+  }
+
+  // This is the algorithm to generate table skip_contexts[top][left].
+  //    const int max = AOMMIN(top | left, 4);
+  //    const int min = AOMMIN(AOMMIN(top, left), 4);
+  //    if (!max)
+  //      txb_skip_ctx = 1;
+  //    else if (!min)
+  //      txb_skip_ctx = 2 + (max > 3);
+  //    else if (max <= 3)
+  //      txb_skip_ctx = 4;
+  //    else if (min <= 3)
+  //      txb_skip_ctx = 5;
+  //    else
+  //      txb_skip_ctx = 6;
+  static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
+                                               { 2, 4, 4, 4, 5 },
+                                               { 2, 4, 4, 4, 5 },
+                                               { 2, 4, 4, 4, 5 },
+                                               { 3, 5, 5, 5, 6 } };
+  // For top and left, we only care about which of the following three
+  // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The
+  // spec calculates top and left with the Max() function. We can calculate
+  // an approximate max with bitwise OR because the real max and the
+  // approximate max belong to the same category.
+  int top = 0;
+  idx = 0;
+  do {
+    top |= a[idx];
+    ++idx;
+  } while (idx < num_above);
+  top &= COEFF_CONTEXT_MASK;
+  top = AOMMIN(top, 4);
+
+  int left = 0;
+  idx = 0;
+  do {
+    left |= l[idx];
+    ++idx;
+  } while (idx < num_left);
+  left &= COEFF_CONTEXT_MASK;
+  left = AOMMIN(left, 4);
+
+  txb_ctx->txb_skip_ctx = skip_contexts[top][left];
+}
+
+// Generates get_txb_ctx_<w>x<h>(): the same computation as
+// get_txb_ctx_general(), but with tx_size fixed to the constant TX_<w>X<h>
+// instead of being a parameter. The macro body uses MAX_TX_SIZE_UNIT, which
+// is #defined inside get_txb_ctx_general() and is only #undef'd after
+// get_txb_ctx(), so the instantiations below must stay between those two
+// points.
+#define SPECIALIZE_GET_TXB_CTX(w, h) \
+ static void get_txb_ctx_##w##x##h( \
+ const BLOCK_SIZE plane_bsize, const int plane, \
+ const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \
+ TXB_CTX *const txb_ctx) { \
+ static const int8_t signs[3] = { 0, -1, 1 }; \
+ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \
+ }; \
+ const TX_SIZE tx_size = TX_##w##X##h; \
+ const int txb_w_unit = tx_size_wide_unit[tx_size]; \
+ const int txb_h_unit = tx_size_high_unit[tx_size]; \
+ int dc_sign = 0; \
+ int k = 0; \
+ \
+ do { \
+ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_w_unit); \
+ \
+ k = 0; \
+ do { \
+ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_h_unit); \
+ \
+ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \
+ \
+ if (plane == 0) { \
+ if (plane_bsize == txsize_to_bsize[tx_size]) { \
+ txb_ctx->txb_skip_ctx = 0; \
+ } else { \
+ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 3, 5, 5, 5, 6 } }; \
+ int top = 0; \
+ int left = 0; \
+ \
+ k = 0; \
+ do { \
+ top |= a[k]; \
+ } while (++k < txb_w_unit); \
+ top &= COEFF_CONTEXT_MASK; \
+ top = AOMMIN(top, 4); \
+ \
+ k = 0; \
+ do { \
+ left |= l[k]; \
+ } while (++k < txb_h_unit); \
+ left &= COEFF_CONTEXT_MASK; \
+ left = AOMMIN(left, 4); \
+ \
+ txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \
+ } \
+ } else { \
+ const int ctx_base = get_entropy_context(tx_size, a, l); \
+ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > \
+ num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
+ ? 10 \
+ : 7; \
+ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \
+ } \
+ }
+
+// Specializations for the four square sizes dispatched by get_txb_ctx().
+SPECIALIZE_GET_TXB_CTX(4, 4)
+SPECIALIZE_GET_TXB_CTX(8, 8)
+SPECIALIZE_GET_TXB_CTX(16, 16)
+SPECIALIZE_GET_TXB_CTX(32, 32)
+
+// Wrapper that dispatches to the specialized get_txb_ctx_*() version for the
+// sizes that have one (4x4, 8x8, 16x16, 32x32), so that the compiler can
+// compile away the while loops. Every other size goes through
+// get_txb_ctx_general().
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+                               const TX_SIZE tx_size, const int plane,
+                               const ENTROPY_CONTEXT *const a,
+                               const ENTROPY_CONTEXT *const l,
+                               TXB_CTX *const txb_ctx) {
+  if (tx_size == TX_4X4) {
+    get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx);
+  } else if (tx_size == TX_8X8) {
+    get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx);
+  } else if (tx_size == TX_16X16) {
+    get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx);
+  } else if (tx_size == TX_32X32) {
+    get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx);
+  } else {
+    get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx);
+  }
+}
+#undef MAX_TX_SIZE_UNIT
+
+#endif // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
new file mode 100644
index 0000000000..4282b92bfa
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
+//
+// Each row holds the 8 taps for one fractional position. The warp filters
+// select a row with
+//   offs = ROUND_POWER_OF_TWO(s, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS
+// and assert 0 <= offs <= WARPEDPIXEL_PREC_SHIFTS * 3; the final "dummy" row
+// makes that upper index valid. The three sections below cover the
+// sub-ranges [-1, 0), [0, 1) and [1, 2) in that order.
+/* clang-format off */
+const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
+ { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
+ { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
+ { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
+ { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
+ { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
+ { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},
+
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
+ { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
+ { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
+ { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
+ { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
+ { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
+ { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
+ { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
+ { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
+ { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
+ { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
+ { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
+ { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
+ // dummy (replicate row index 191)
+ { 0, 0, 0, 0, 2, 127, - 1, 0 },
+};
+
+/* clang-format on */
+
+// Parameters of the reciprocal lookup table below:
+//  DIV_LUT_PREC_BITS - precision of the stored multipliers
+//  DIV_LUT_BITS      - number of bits of the divisor used as table index
+//  DIV_LUT_NUM       - 1 << DIV_LUT_BITS
+#define DIV_LUT_PREC_BITS 14
+#define DIV_LUT_BITS 8
+#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
+
+// Multiplier table used by resolve_divisor_64() and resolve_divisor_32().
+// It has DIV_LUT_NUM + 1 entries because both callers allow the index to
+// equal DIV_LUT_NUM (they assert f <= DIV_LUT_NUM). The values decrease from
+// 16384 (1 << DIV_LUT_PREC_BITS) at index 0 to 8192 at index DIV_LUT_NUM.
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+// Decomposes a 64-bit divisor D such that 1/D = y/2^shift. The multiplier y
+// is returned at a precision of DIV_LUT_PREC_BITS and the matching shift is
+// written to *shift.
+static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
+  // Position of the most significant set bit, found one 32-bit half at a time.
+  const unsigned int upper = (unsigned int)(D >> 32);
+  const int msb = upper ? get_msb(upper) + 32 : get_msb((unsigned int)D);
+
+  // Clear that leading 1 bit; the remainder selects the table entry.
+  const int64_t rem = D - ((uint64_t)1 << msb);
+
+  // Keep only the most significant DIV_LUT_BITS (8) bits of the remainder.
+  int64_t idx;
+  if (msb > DIV_LUT_BITS) {
+    idx = ROUND_POWER_OF_TWO_64(rem, msb - DIV_LUT_BITS);
+  } else {
+    idx = rem << (DIV_LUT_BITS - msb);
+  }
+  assert(idx <= DIV_LUT_NUM);
+
+  *shift = (int16_t)(msb + DIV_LUT_PREC_BITS);
+  // Look up the precomputed multiplier.
+  return div_lut[idx];
+}
+
+// 32-bit counterpart of resolve_divisor_64(): returns the multiplier y (at
+// DIV_LUT_PREC_BITS precision) and stores the shift in *shift such that
+// 1/D = y/2^shift.
+static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
+  const int msb = get_msb(D);
+
+  // Clear the most significant 1 bit of D.
+  const int32_t rem = D - ((uint32_t)1 << msb);
+
+  // Keep only the most significant DIV_LUT_BITS (8) bits of the remainder.
+  int32_t idx;
+  if (msb > DIV_LUT_BITS) {
+    idx = ROUND_POWER_OF_TWO(rem, msb - DIV_LUT_BITS);
+  } else {
+    idx = rem << (DIV_LUT_BITS - msb);
+  }
+  assert(idx <= DIV_LUT_NUM);
+
+  *shift = (int16_t)(msb + DIV_LUT_PREC_BITS);
+  // Look up the precomputed multiplier.
+  return div_lut[idx];
+}
+
+// A model is accepted only when wmmat[2] is strictly positive.
+static int is_affine_valid(const WarpedMotionParams *const wm) {
+  return wm->wmmat[2] > 0;
+}
+
+// Returns 1 when both shear sums stay strictly below
+// 1 << WARPEDMODEL_PREC_BITS, and 0 otherwise:
+//   4 * |alpha| + 7 * |beta|   (horizontal)
+//   4 * |gamma| + 4 * |delta|  (vertical)
+static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
+                                   int16_t delta) {
+  const int limit = 1 << WARPEDMODEL_PREC_BITS;
+  const int horiz_sum = 4 * abs(alpha) + 7 * abs(beta);
+  const int vert_sum = 4 * abs(gamma) + 4 * abs(delta);
+  return (horiz_sum < limit && vert_sum < limit) ? 1 : 0;
+}
+
+#ifndef NDEBUG
+// Check that the given warp model satisfies the relevant constraints for
+// its stated model type.
+// The cases fall through on purpose: a model type is checked against its own
+// constraints and then against those of every case listed after it
+// (IDENTITY -> TRANSLATION -> ROTZOOM). AFFINE has no constraint of its own,
+// and any other wmtype trips the assert in the default case.
+static void check_model_consistency(WarpedMotionParams *wm) {
+ switch (wm->wmtype) {
+ case IDENTITY:
+ // The additive terms wmmat[0] and wmmat[1] must be zero.
+ assert(wm->wmmat[0] == 0);
+ assert(wm->wmmat[1] == 0);
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ // wmmat[2] must be exactly 1.0 in fixed point and wmmat[3] must be zero.
+ assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
+ assert(wm->wmmat[3] == 0);
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM:
+ // wmmat[4] and wmmat[5] are fully determined by wmmat[3] and wmmat[2].
+ assert(wm->wmmat[4] == -wm->wmmat[3]);
+ assert(wm->wmmat[5] == wm->wmmat[2]);
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: break;
+ default: assert(0 && "Bad wmtype");
+ }
+}
+#endif // NDEBUG
+
+// Derives the shear parameters alpha, beta, gamma and delta from wm->wmmat
+// and stores them in wm.
+// Returns 1 on success or 0 on an invalid affine set. When wmmat[2] is not
+// positive the function returns 0 without touching wm; when the shear range
+// check fails it returns 0 after the parameters have been written.
+int av1_get_shear_params(WarpedMotionParams *wm) {
+#ifndef NDEBUG
+  // Check that models have been constructed sensibly
+  // This is a good place to check, because this function does not need to
+  // be called until after model construction is complete, but must be called
+  // before the model can be used for prediction.
+  check_model_consistency(wm);
+#endif  // NDEBUG
+
+  if (!is_affine_valid(wm)) return 0;
+
+  const int32_t *const m = wm->wmmat;
+  const int unity = 1 << WARPEDMODEL_PREC_BITS;
+
+  // Signed fixed-point reciprocal of wmmat[2], as recip / 2^shift.
+  int16_t shift;
+  const int16_t recip =
+      resolve_divisor_32(abs(m[2]), &shift) * (m[2] < 0 ? -1 : 1);
+  const int64_t gamma_num = ((int64_t)m[4] * unity) * recip;
+  const int64_t delta_num = ((int64_t)m[3] * m[4]) * recip;
+
+  wm->alpha = clamp(m[2] - unity, INT16_MIN, INT16_MAX);
+  wm->beta = clamp(m[3], INT16_MIN, INT16_MAX);
+  wm->gamma = clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(gamma_num, shift),
+                    INT16_MIN, INT16_MAX);
+  wm->delta =
+      clamp(m[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(delta_num, shift) - unity,
+            INT16_MIN, INT16_MAX);
+
+  // Round every parameter to a multiple of 1 << WARP_PARAM_REDUCE_BITS.
+  const int reduce_step = 1 << WARP_PARAM_REDUCE_BITS;
+  wm->alpha =
+      ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * reduce_step;
+  wm->beta =
+      ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * reduce_step;
+  wm->gamma =
+      ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * reduce_step;
+  wm->delta =
+      ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * reduce_step;
+
+  return is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)
+             ? 1
+             : 0;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Note: For an explanation of the warp algorithm, and some notes on bit widths
+ for hardware implementations, see the comments above av1_warp_affine_c
+
+ High-bitdepth C implementation: reads uint16_t samples from `ref` and
+ writes the warped block to `pred` (or to conv_params->dst for the first
+ pass of a compound prediction). The destination area starts at
+ (p_col, p_row), is p_width x p_height samples and is processed in 8x8
+ blocks. `bd` is the bit depth used for the offsets and the final clipping.
+*/
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride, uint16_t *pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ // Output of the horizontal pass: 15 rows of 8 values each.
+ int32_t tmp[15 * 8];
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ // Only used inside an assert, hence the (void) cast below.
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ (void)max_bits_horiz;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ for (int i = p_row; i < p_row + p_height; i += 8) {
+ for (int j = p_col; j < p_col + p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4) << subsampling_x;
+ const int32_t src_y = (i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ // Split the projected position into integer (ix4, iy4) and fractional
+ // (sx4, sy4) parts at WARPEDMODEL_PREC_BITS precision.
+ const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Step back from the block centre (4, 4) to its first row and column.
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ // Clear the low WARP_PARAM_REDUCE_BITS bits of both fractions.
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ for (int k = -7; k < 8; ++k) {
+ // Clamp the source row to the frame.
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ int sx = sx4 + beta * (k + 4);
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = av1_warped_filter[offs];
+
+ // Start from an offset so that the rounded sum stays non-negative.
+ int32_t sum = 1 << offset_bits_horiz;
+ for (int m = 0; m < 8; ++m) {
+ // Clamp the source column to the frame.
+ const int sample_x = clamp(ix + m, 0, width - 1);
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ assert(0 <= sum && sum < (1 << max_bits_horiz));
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+ sx += alpha;
+ }
+ }
+
+ // Vertical filter
+ // The AOMMIN() bounds crop the output when fewer than 8 rows or columns
+ // of the destination remain.
+ for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = av1_warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_vert;
+ for (int m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+
+ if (conv_params->is_compound) {
+ // Compound prediction: either store the intermediate value in
+ // conv_params->dst, or (do_average) combine it with the value
+ // already stored there and write the final pixel to pred.
+ CONV_BUF_TYPE *p =
+ &conv_params
+ ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+ (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ if (conv_params->do_average) {
+ uint16_t *dst16 =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ int32_t tmp32 = *p;
+ if (conv_params->use_dist_wtd_comp_avg) {
+ // Weighted average using fwd_offset / bck_offset.
+ tmp32 = tmp32 * conv_params->fwd_offset +
+ sum * conv_params->bck_offset;
+ tmp32 = tmp32 >> DIST_PRECISION_BITS;
+ } else {
+ // Plain average of the two predictions.
+ tmp32 += sum;
+ tmp32 = tmp32 >> 1;
+ }
+ // Remove the offsets introduced by the two filter stages.
+ tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ *dst16 =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
+ } else {
+ *p = sum;
+ }
+ } else {
+ // Single prediction: round, remove the offsets and clip to bd bits.
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ assert(0 <= sum && sum < (1 << (bd + 2)));
+ *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
+ }
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+// Forwards the warp matrix and the shear parameters stored in wm, together
+// with all buffer and geometry arguments, to av1_highbd_warp_affine().
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+                       int width, int height, int stride, uint16_t *const pred,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       int bd, ConvolveParams *conv_params) {
+  const int16_t shear_alpha = wm->alpha;
+  const int16_t shear_beta = wm->beta;
+  const int16_t shear_gamma = wm->gamma;
+  const int16_t shear_delta = wm->delta;
+  const int32_t *const matrix = wm->wmmat;
+
+  av1_highbd_warp_affine(matrix, ref, width, height, stride, pred, p_col,
+                         p_row, p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, bd, conv_params, shear_alpha,
+                         shear_beta, shear_gamma, shear_delta);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+ / a b \ = / 1 0 \ * / 1+alpha beta \
+ \ c d / \ gamma 1+delta / \ 0 1 /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The horizontal shear (with alpha and beta) is applied first,
+ then the vertical shear (with gamma and delta) is applied second.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must be at most +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| < 1 and 4 * |gamma| + 4 * |delta| < 1
+ for this filter to be applicable.
+
+ Note: This function assumes that the caller has done all of the relevant
+ checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and
+ wmmat[5] are set appropriately (if using a ROTZOOM model), and that alpha,
+ beta, gamma, delta are all in range.
+
+ TODO(rachelbarker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+ The warp filter is intended to be implementable using the same hardware as
+ the high-precision convolve filters from the loop-restoration and
+ convolve-round experiments.
+
+ For a single filter stage, considering all of the coefficient sets for the
+ warp filter and the regular convolution filter, an input in the range
+ [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+ before rounding.
+
+ Allowing for some changes to the filter coefficient sets, call the range
+ [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+ we can replace this by the range [0, 256 * 2^k], which can be stored in an
+ unsigned value with 8 + k bits.
+
+ This allows the derivation of the appropriate bit widths and offsets for
+ the various intermediate values: If
+
+ F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+ So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+ intermediate value.
+ H := ROUND0_BITS
+ V := VERSHEAR_REDUCE_PREC_BITS
+ (and note that we must have H + V = 2*F for the output to have the same
+ scale as the input)
+
+ then we end up with the following offsets and ranges:
+ Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+ uint{bd + F + 1}
+ After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+ Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+ uint{bd + 2*F + 2 - H}
+ After rounding: The final value, before undoing the offset, fits into a
+ uint{bd + 2}.
+
+ Then we need to undo the offsets before clamping to a pixel. Note that,
+ if we do this at the end, the amount to subtract is actually independent
+ of H and V:
+
+ offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
+ (1 << ((bd + 2*F - H) - V))
+ == (1 << (bd - 1)) + (1 << bd)
+
+ This allows us to entirely avoid clamping in both the warp filter and
+ the convolve-round experiment. As of the time of writing, the Wiener filter
+ from loop-restoration can encode a central coefficient up to 216, which
+ leads to a maximum value of about 282 * 2^k after applying the offset.
+ So in that case we still need to clamp.
+*/
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta) { // 8-bit C warp: works through the prediction block in 8x8 tiles, horizontal pass into tmp[] then vertical pass
+ int32_t tmp[15 * 8]; // 15 rows (k = -7..7) x 8 columns (l = -4..3) of horizontally filtered samples for one tile
+ const int bd = 8; // this variant reads/writes uint8_t samples, so the bit depth is fixed
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz; // non-compound: both shifts together total 2 * FILTER_BITS
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // final shift applied on the compound-average path
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ (void)max_bits_horiz; // only referenced inside an assert below
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = p_row; i < p_row + p_height; i += 8) {
+ for (int j = p_col; j < p_col + p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4) << subsampling_x;
+ const int32_t src_y = (i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); // integer part of the projected center
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // fractional part of the projected center
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4); // move the fractional offset from the tile center to its (-4, -4) corner
+ sy4 += gamma * (-4) + delta * (-4);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // clear the low WARP_PARAM_REDUCE_BITS bits
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter: fills tmp[] with 15 rows of 8 offset-and-rounded sums
+ for (int k = -7; k < 8; ++k) {
+ // Clamp to top/bottom edge of the frame
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ int sx = sx4 + beta * (k + 4);
+
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3; // leftmost of the 8 taps; taps cover ix .. ix + 7
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = av1_warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_horiz; // start from the offset so the sum stays non-negative (checked by the assert below)
+ for (int m = 0; m < 8; ++m) {
+ // Clamp to left/right edge of the frame
+ const int sample_x = clamp(ix + m, 0, width - 1);
+
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ assert(0 <= sum && sum < (1 << max_bits_horiz));
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+ sx += alpha;
+ }
+ }
+
+ // Vertical filter: the AOMMIN bounds stop at the bottom/right edge of the prediction block
+ for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = av1_warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_vert;
+ for (int m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; // 8 vertically adjacent rows of tmp[], same column
+ }
+
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *p =
+ &conv_params
+ ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+ (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ if (conv_params->do_average) { // blend with the value already stored in conv_params->dst and emit a pixel
+ uint8_t *dst8 =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ int32_t tmp32 = *p;
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp32 = tmp32 * conv_params->fwd_offset +
+ sum * conv_params->bck_offset;
+ tmp32 = tmp32 >> DIST_PRECISION_BITS;
+ } else {
+ tmp32 += sum;
+ tmp32 = tmp32 >> 1;
+ }
+ tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)); // remove the two filter offsets, still scaled by round_bits
+ *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
+ } else {
+ *p = sum; // store the intermediate value; the offsets are not removed here
+ }
+ } else {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ assert(0 <= sum && sum < (1 << (bd + 2)));
+ *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); // undo both offsets, then clamp to the pixel range
+ }
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+// Low-bitdepth warp entry point: unpacks the model matrix and the four shear
+// parameters from `wm` and forwards everything to av1_warp_affine().
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+                int height, int stride, uint8_t *pred, int p_col, int p_row,
+                int p_width, int p_height, int p_stride, int subsampling_x,
+                int subsampling_y, ConvolveParams *conv_params) {
+  av1_warp_affine(wm->wmmat, ref, width, height, stride, pred, p_col, p_row,
+                  p_width, p_height, p_stride, subsampling_x, subsampling_y,
+                  conv_params, wm->alpha, wm->beta, wm->gamma, wm->delta);
+}
+
+// Dispatches a warp to the high-bitdepth or the 8-bit implementation.
+// When CONFIG_AV1_HIGHBITDEPTH is off, `use_hbd` and `bd` are ignored and the
+// 8-bit path is always taken.
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+                    const uint8_t *ref, int width, int height, int stride,
+                    uint8_t *pred, int p_col, int p_row, int p_width,
+                    int p_height, int p_stride, int subsampling_x,
+                    int subsampling_y, ConvolveParams *conv_params) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
+                      p_height, p_stride, subsampling_x, subsampling_y, bd,
+                      conv_params);
+    return;
+  }
+#else
+  (void)use_hbd;
+  (void)bd;
+#endif
+  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+}
+
+#define LS_MV_MAX 256 // max mv in 1/8-pel
+// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
+#define LS_STEP 8
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+// 1 [for sign] +
+// LEAST_SQUARES_SAMPLES_MAX_BITS
+// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// The value is 23
+#define LS_MAT_RANGE_BITS \
+ ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full-range
+#define LS_MAT_DOWN_BITS 2
+
+// bits range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+// By setting LS_STEP = 8, the least 2 bits of every element in A, Bx, By are
+// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
+#define LS_SQUARE(a) \
+ (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+// Reduces D to at most MUL_PREC_BITS significant bits.
+// Returns the (rounded) reduced value and stores in *shift the number of low
+// bits that were dropped; *shift is 0 when D is 0 or already fits.
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+  *shift = 0;
+  if (D == 0) return 0;
+
+  // Index of the most significant set bit of the 64-bit value.
+  const unsigned int upper = (unsigned int)(D >> 32);
+  const int top_bit =
+      (int16_t)(upper ? get_msb(upper) + 32 : get_msb((unsigned int)D));
+
+  if (top_bit < MUL_PREC_BITS) return (uint16_t)D;
+
+  *shift = top_bit + 1 - MUL_PREC_BITS;
+  return (uint16_t)ROUND_POWER_OF_TWO_64(D, top_bit + 1 - MUL_PREC_BITS);
+}
+
+// Limited-precision variant: computes (Px * iDet) scaled down by `shift` bits
+// using a reduced-width multiplier, then clamps the result to the range
+// allowed for a non-diagonal affine parameter.
+//   Px    - numerator of the least-squares solution (signed).
+//   iDet  - approximate reciprocal of the determinant.
+//   shift - right shift to apply to the product; may end up <= 0 after the
+//           multiplier's own shift is subtracted, in which case the product
+//           is scaled up instead.
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  // Part of the shift was already applied when |Px| was reduced to Mul.
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(v * (1 << (-shift)),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+// Limited-precision variant for the diagonal affine parameters: same scaling
+// as the non-diagonal helper, but the clamp window is centred on
+// (1 << WARPEDMODEL_PREC_BITS).
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int16_t reduced_by;
+  const uint16_t magnitude = resolve_multiplier_64(llabs(Px), &reduced_by);
+  const int32_t v = (int32_t)magnitude * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= reduced_by;
+
+  if (shift > 0) {
+    return (int32_t)clamp(
+        ROUND_POWER_OF_TWO_SIGNED(v, shift),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+  return (int32_t)clamp(
+      v * (1 << (-shift)),
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+#else
+
+// Full-precision variant: multiplies Px by iDet in 64 bits, rounds the product
+// down by `shift` bits and clamps it to the non-diagonal parameter range.
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  const int64_t product = Px * (int64_t)iDet;
+  const int64_t rounded = ROUND_POWER_OF_TWO_SIGNED_64(product, shift);
+  return (int32_t)clamp64(rounded, -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+// Full-precision variant for the diagonal parameters: same product and
+// rounding as the non-diagonal helper, clamped to a window centred on
+// (1 << WARPEDMODEL_PREC_BITS).
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  const int64_t product = Px * (int64_t)iDet;
+  const int64_t rounded = ROUND_POWER_OF_TWO_SIGNED_64(product, shift);
+  return (int32_t)clamp64(
+      rounded,
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif // USE_LIMITED_PREC_MULT
+
+// Integer least-squares fit of an affine warp model from point pairs.
+//   np          - number of sample pairs.
+//   pts1, pts2  - source / destination points, stored as interleaved (x, y)
+//                 pairs (element 2*i is x, 2*i + 1 is y).
+//   bsize       - block size; its half-width/height give the block center.
+//   mvy, mvx    - block motion vector (presumably in 1/8-pel units, matching
+//                 the "* 8" scaling of the center below - confirm with caller).
+//   wm          - output: wmmat[0..5] are written.
+//   mi_row/col  - block position in MI units.
+// Returns 1 if the normal-equation matrix is singular (Det == 0), else 0.
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+                           BLOCK_SIZE bsize, int mvy, int mvx,
+                           WarpedMotionParams *wm, int mi_row, int mi_col) {
+  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+  int32_t Bx[2] = { 0, 0 };
+  int32_t By[2] = { 0, 0 };
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int rsuy = bh / 2 - 1;
+  const int rsux = bw / 2 - 1;
+  const int suy = rsuy * 8;
+  const int sux = rsux * 8;
+  const int duy = suy + mvy;
+  const int dux = sux + mvx;
+
+  // Assume the center pixel of the block has exactly the same motion vector
+  // as transmitted for the block. First shift the origin of the source
+  // points to the block center, and the origin of the destination points to
+  // the block center added to the motion vector transmitted.
+  // Let (xi, yi) denote the source points and (xi', yi') denote destination
+  // points after origin shifting, for i = 0, 1, 2, .... n-1.
+  // Then if  P = [x0, y0,
+  //               x1, y1
+  //               x2, y2,
+  //                ....
+  //              ]
+  //          q = [x0', x1', x2', ... ]'
+  //          r = [y0', y1', y2', ... ]'
+  // the least squares problems that need to be solved are:
+  //          [h1, h2]' = inv(P'P)P'q and
+  //          [h3, h4]' = inv(P'P)P'r
+  // where the affine transformation is given by:
+  //          x' = h1.x + h2.y
+  //          y' = h3.x + h4.y
+  //
+  // The loop below computes: A = P'P, Bx = P'q, By = P'r
+  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
+  // Contribution from neighbor block
+  for (int i = 0; i < np; i++) {
+    const int dx = pts2[i * 2] - dux;
+    const int dy = pts2[i * 2 + 1] - duy;
+    const int sx = pts1[i * 2] - sux;
+    const int sy = pts1[i * 2 + 1] - suy;
+    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, global offset can be removed
+    // while collecting samples.
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[1][1] += LS_SQUARE(sy);
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+    }
+  }
+
+  // Just for debugging, and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+  // Compute Determinant of A
+  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+  if (Det == 0) return 1;
+
+  int16_t shift;
+  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+  shift -= WARPEDMODEL_PREC_BITS;
+  if (shift < 0) {
+    // Scale up by multiplication rather than '<<': iDet carries the sign of
+    // Det, and left-shifting a negative signed value is undefined in C.
+    iDet = (int16_t)(iDet * (1 << (-shift)));
+    shift = 0;
+  }
+
+  int64_t Px[2], Py[2];
+  // These divided by the Det, are the least squares solutions
+  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the overall sum in the worst case fits
+  // within 32 bits overall.
+  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                      isuy * wm->wmmat[3]);
+  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * wm->wmmat[4] +
+                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  wm->wmmat[0] =
+      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  wm->wmmat[1] =
+      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  return 0;
+}
+
+// Fits an affine model to the given samples and validates it.
+// Returns 0 when a model was found and av1_get_shear_params() accepts it,
+// 1 otherwise. `wm_params->wmtype` must already be AFFINE.
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+                        BLOCK_SIZE bsize, int mvy, int mvx,
+                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
+  assert(wm_params->wmtype == AFFINE);
+
+  // The shear check (compatibility with the fast warp filter) only runs when
+  // the least-squares fit itself succeeded.
+  return find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
+                         mi_col) ||
+         !av1_get_shear_params(wm_params);
+}
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
new file mode 100644
index 0000000000..d772df8873
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+#include "av1/common/convolve.h"
+
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
+#define WARPED_MOTION_DEBUG 0
+#define DEFAULT_WMTYPE AFFINE
+#define WARP_ERROR_BLOCK_LOG 5
+#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
+
+extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+DECLARE_ALIGNED(8, extern const int8_t,
+ av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+
+static const uint8_t warp_pad_left[14][16] = { // 14 rows of 16 indices; in row r (0-based) entries 0..r hold r + 1, entry j > r holds j
+ { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // row 0: only entry 0 is redirected (to index 1)
+ { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
+ { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
+ { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
+ { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
+ { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
+ { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, // row 13: entries 0..13 are redirected to index 14
+}; // NOTE(review): presumably byte-shuffle index vectors for left-edge replication in SIMD warp code - confirm against users
+
+static const uint8_t warp_pad_right[14][16] = { // 14 rows of 16 indices; in row r (0-based) entry j <= 14 - r holds j, later entries hold 14 - r
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, // row 0: only entry 15 is redirected (to index 14)
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
+ { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
+ { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
+ { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
+ { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } // row 13: entries 2..15 are redirected to index 1
+}; // NOTE(review): presumably byte-shuffle index vectors for right-edge replication in SIMD warp code - confirm against users
+
+void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
+ int width, int height, int stride, uint16_t *const pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y,
+ int bd, ConvolveParams *conv_params);
+
+void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
+ int height, int stride, uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params);
+
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params);
+
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+ BLOCK_SIZE bsize, int mvy, int mvx,
+ WarpedMotionParams *wm_params, int mi_row, int mi_col);
+
+int av1_get_shear_params(WarpedMotionParams *wm);
+#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 0000000000..8aa14696f6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+// SSE4.1 horizontal 8-tap resampling filter for 8-bit samples.
+// Produces four output columns per outer iteration; for each group the four
+// filter kernels are loaded once and then applied to every row.
+//   x_filters - table of 8-tap kernels, UPSCALE_NORMATIVE_TAPS entries each.
+//   x0_qn     - fixed-point source position of the first output column.
+//   x_step_qn - fixed-point source step per output column.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const int16_t *x_filters, int x0_qn,
+                                  int x_step_qn) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+  // Move back to the first tap of the filter window.
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i rounding = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zeros = _mm_setzero_si128();
+
+  int pos_qn = x0_qn;
+  for (int col = 0; col < w; col += 4, pos_qn += 4 * x_step_qn) {
+    // Fixed-point source positions of the four output pixels in this group.
+    const int pos0 = pos_qn + 0 * x_step_qn;
+    const int pos1 = pos_qn + 1 * x_step_qn;
+    const int pos2 = pos_qn + 2 * x_step_qn;
+    const int pos3 = pos_qn + 3 * x_step_qn;
+
+    // Sub-pixel phase selects the kernel for each pixel.
+    const int phase0 = (pos0 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase1 = (pos1 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase2 = (pos2 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase3 = (pos3 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(phase0 <= RS_SUBPEL_MASK);
+    assert(phase1 <= RS_SUBPEL_MASK);
+    assert(phase2 <= RS_SUBPEL_MASK);
+    assert(phase3 <= RS_SUBPEL_MASK);
+
+    const __m128i taps0 =
+        xx_loadu_128(&x_filters[phase0 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps1 =
+        xx_loadu_128(&x_filters[phase1 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps2 =
+        xx_loadu_128(&x_filters[phase2 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps3 =
+        xx_loadu_128(&x_filters[phase3 * UPSCALE_NORMATIVE_TAPS]);
+
+    // Integer source offsets; these do not depend on the row.
+    const int off0 = pos0 >> RS_SCALE_SUBPEL_BITS;
+    const int off1 = pos1 >> RS_SCALE_SUBPEL_BITS;
+    const int off2 = pos2 >> RS_SCALE_SUBPEL_BITS;
+    const int off3 = pos3 >> RS_SCALE_SUBPEL_BITS;
+
+    const uint8_t *src_row = src;
+    uint8_t *dst_row = dst;
+    for (int row = 0; row < h; ++row) {
+      // Load 8 input bytes per output pixel and widen them to 16 bits.
+      const __m128i px0 = _mm_cvtepu8_epi16(xx_loadl_64(&src_row[off0]));
+      const __m128i px1 = _mm_cvtepu8_epi16(xx_loadl_64(&src_row[off1]));
+      const __m128i px2 = _mm_cvtepu8_epi16(xx_loadl_64(&src_row[off2]));
+      const __m128i px3 = _mm_cvtepu8_epi16(xx_loadl_64(&src_row[off3]));
+
+      // Multiply by the taps; each 32-bit lane holds the sum of one tap pair.
+      const __m128i prod0 = _mm_madd_epi16(px0, taps0);
+      const __m128i prod1 = _mm_madd_epi16(px1, taps1);
+      const __m128i prod2 = _mm_madd_epi16(px2, taps2);
+      const __m128i prod3 = _mm_madd_epi16(px3, taps3);
+
+      // Two rounds of horizontal adds leave one full 8-tap sum per lane.
+      const __m128i sum01 = _mm_hadd_epi32(prod0, prod1);
+      const __m128i sum23 = _mm_hadd_epi32(prod2, prod3);
+      const __m128i sums = _mm_hadd_epi32(sum01, sum23);
+
+      // Divide by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i rounded =
+          _mm_srai_epi32(_mm_add_epi32(sums, rounding), FILTER_BITS);
+
+      // Saturating packs: 32 -> 16 -> 8 bits; the four results end up in the
+      // low 32 bits.
+      const __m128i packed16 = _mm_packus_epi32(rounded, zeros);
+      const __m128i packed8 = _mm_packus_epi16(packed16, zeros);
+
+      xx_storel_32(&dst_row[col], packed8);
+
+      src_row += src_stride;
+      dst_row += dst_stride;
+    }
+  }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+// SSE4.1 horizontal 8-tap resampling filter for 16-bit (high bit depth)
+// samples. Same structure as the 8-bit version: four output columns per outer
+// iteration, with results clipped to (1 << bd) - 1 before being stored.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+                                         uint16_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         int x0_qn, int x_step_qn, int bd) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  // Move back to the first tap of the filter window.
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i rounding = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
+
+  int pos_qn = x0_qn;
+  for (int col = 0; col < w; col += 4, pos_qn += 4 * x_step_qn) {
+    // Fixed-point source positions of the four output pixels in this group.
+    const int pos0 = pos_qn + 0 * x_step_qn;
+    const int pos1 = pos_qn + 1 * x_step_qn;
+    const int pos2 = pos_qn + 2 * x_step_qn;
+    const int pos3 = pos_qn + 3 * x_step_qn;
+
+    // Sub-pixel phase selects the kernel for each pixel.
+    const int phase0 = (pos0 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase1 = (pos1 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase2 = (pos2 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int phase3 = (pos3 & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(phase0 <= RS_SUBPEL_MASK);
+    assert(phase1 <= RS_SUBPEL_MASK);
+    assert(phase2 <= RS_SUBPEL_MASK);
+    assert(phase3 <= RS_SUBPEL_MASK);
+
+    const __m128i taps0 =
+        xx_loadu_128(&x_filters[phase0 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps1 =
+        xx_loadu_128(&x_filters[phase1 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps2 =
+        xx_loadu_128(&x_filters[phase2 * UPSCALE_NORMATIVE_TAPS]);
+    const __m128i taps3 =
+        xx_loadu_128(&x_filters[phase3 * UPSCALE_NORMATIVE_TAPS]);
+
+    // Integer source offsets; these do not depend on the row.
+    const int off0 = pos0 >> RS_SCALE_SUBPEL_BITS;
+    const int off1 = pos1 >> RS_SCALE_SUBPEL_BITS;
+    const int off2 = pos2 >> RS_SCALE_SUBPEL_BITS;
+    const int off3 = pos3 >> RS_SCALE_SUBPEL_BITS;
+
+    const uint16_t *src_row = src;
+    uint16_t *dst_row = dst;
+    for (int row = 0; row < h; ++row) {
+      // Each 128-bit load already holds the 8 16-bit input samples.
+      const __m128i px0 = xx_loadu_128(&src_row[off0]);
+      const __m128i px1 = xx_loadu_128(&src_row[off1]);
+      const __m128i px2 = xx_loadu_128(&src_row[off2]);
+      const __m128i px3 = xx_loadu_128(&src_row[off3]);
+
+      // Multiply by the taps; each 32-bit lane holds the sum of one tap pair.
+      const __m128i prod0 = _mm_madd_epi16(px0, taps0);
+      const __m128i prod1 = _mm_madd_epi16(px1, taps1);
+      const __m128i prod2 = _mm_madd_epi16(px2, taps2);
+      const __m128i prod3 = _mm_madd_epi16(px3, taps3);
+
+      // Two rounds of horizontal adds leave one full 8-tap sum per lane.
+      const __m128i sum01 = _mm_hadd_epi32(prod0, prod1);
+      const __m128i sum23 = _mm_hadd_epi32(prod2, prod3);
+      const __m128i sums = _mm_hadd_epi32(sum01, sum23);
+
+      // Divide by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i rounded =
+          _mm_srai_epi32(_mm_add_epi32(sums, rounding), FILTER_BITS);
+
+      // Saturating pack to 16 bits, then clip at the bit-depth maximum.
+      const __m128i packed16 = _mm_packus_epi32(rounded, zeros);
+      const __m128i clipped = _mm_min_epi16(packed16, pixel_max);
+
+      xx_storel_64(&dst_row[col], clipped);
+
+      src_row += src_stride;
+      dst_row += dst_stride;
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 0000000000..8e293b5bb1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// Horizontal stage of av1_convolve_2d_scale_sse4_1 (8-bit input), restricted
+// to 8-tap kernels.
+//
+// The source x position of every output column is tracked in fixed point: it
+// starts at subpel_x_qn and advances by x_step_qn per column. The integer
+// part selects the first source pixel, the fractional part selects the kernel
+// phase. Every filtered sample has 1 << (bd + FILTER_BITS - 1) added before
+// it is rounded down by `round` bits. The output is written transposed:
+// column x occupies dst[x * h] .. dst[x * h + h - 1].
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
+                     int h, int subpel_x_qn, int x_step_qn,
+                     const InterpFilterParams *filter_params, int round) {
+  const int bd = 8;
+  const int ntaps = 8;
+  const int32_t offset32 = 1 << (bd + FILTER_BITS - 1);
+
+  // Addend for the vector path: rounding term and offset combined.
+  const __m128i bias = _mm_set1_epi32((1 << round) / 2 + offset32);
+  const __m128i shift = _mm_cvtsi32_si128(round);
+  const __m128i zero = _mm_setzero_si128();
+
+  // Step back so that the 8 taps start ntaps / 2 - 1 pixels before the sample.
+  const uint8_t *const src_base = src - (ntaps / 2 - 1);
+
+  int x_qn = subpel_x_qn;
+  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+    const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(phase < SUBPEL_SHIFTS);
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter_params, phase);
+    const __m128i taps = _mm_loadu_si128((__m128i *)kernel);
+
+    const uint8_t *const col = src_base + (x_qn >> SCALE_SUBPEL_BITS);
+    int16_t *const out_col = dst + x * h;
+
+    // Vector path: four source rows per iteration.
+    int y = 0;
+    while (y + 4 <= h) {
+      const uint8_t *const row = col + y * src_stride;
+
+      // For each row: load 8 pixels into the low half of a register, widen
+      // them to 16 bits by interleaving with zero, multiply by the taps and
+      // add adjacent products.
+      const __m128i prod0 = _mm_madd_epi16(
+          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)row), zero), taps);
+      const __m128i prod1 = _mm_madd_epi16(
+          _mm_unpacklo_epi8(
+              _mm_loadl_epi64((__m128i *)(row + 1 * src_stride)), zero),
+          taps);
+      const __m128i prod2 = _mm_madd_epi16(
+          _mm_unpacklo_epi8(
+              _mm_loadl_epi64((__m128i *)(row + 2 * src_stride)), zero),
+          taps);
+      const __m128i prod3 = _mm_madd_epi16(
+          _mm_unpacklo_epi8(
+              _mm_loadl_epi64((__m128i *)(row + 3 * src_stride)), zero),
+          taps);
+
+      // Two levels of horizontal adds leave one 32-bit sum per row.
+      const __m128i sums = _mm_hadd_epi32(_mm_hadd_epi32(prod0, prod1),
+                                          _mm_hadd_epi32(prod2, prod3));
+
+      // Add offset and rounding term, then divide by (1 << round).
+      const __m128i rounded = _mm_sra_epi32(_mm_add_epi32(sums, bias), shift);
+
+      // Narrow to 16 bits and store the four results of this column.
+      _mm_storel_epi64((__m128i *)(out_col + y),
+                       _mm_packus_epi32(rounded, rounded));
+      y += 4;
+    }
+
+    // Scalar path for the rows left over when h is not a multiple of 4.
+    for (; y < h; ++y) {
+      const uint8_t *const row = col + y * src_stride;
+
+      int32_t acc = offset32;
+      for (int k = 0; k < ntaps; ++k) acc += kernel[k] * row[k];
+
+      out_col[y] = ROUND_POWER_OF_TWO(acc, round);
+    }
+  }
+}
+
+// Loads eight 16-bit samples from src (no alignment requirement), multiplies
+// them lane-wise with coeff and adds adjacent products, giving four 32-bit
+// partial sums.
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+  return _mm_madd_epi16(_mm_loadu_si128((__m128i *)src), coeff);
+}
+
+// A specialised version of vfilter, the vertical filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+//
+// src is read as a transposed buffer: the samples for output column x start
+// at src + x * src_stride. The source position and kernel phase of every
+// output row come from a fixed-point y coordinate that starts at subpel_y_qn
+// and advances by y_step_qn per row. Depending on conv_params the result is
+// stored to conv_params->dst (compound, no averaging), combined with
+// conv_params->dst and written to dst (compound, averaging), or written to
+// dst directly (non-compound).
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn, const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Offset removed again before the final rounding. The first term is the
+ // (1 << offset_bits) added below, after the round_1 shift.
+ // NOTE(review): the second term presumably is the horizontal-stage offset
+ // after passing through this filter -- confirm against hfilter8.
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi16(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ // Remaining precision that has to be rounded away to get to pixel values.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+ // Weights for the distance-weighted average, interleaved as (w0, w1) pairs
+ // so that one _mm_madd_epi16 computes p * w0 + shifted * w1 per lane pair.
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16((short)w0);
+ const __m128i wt1 = _mm_set1_epi16((short)w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ // Integer part of y_qn selects the first tap position, the fractional part
+ // selects the kernel phase.
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ // Vector path: four output pixels (columns x .. x + 3) per iteration.
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load the 8 taps' worth of source data for each of the four columns and
+ // multiply by the coefficients; each of conv0..conv3 holds four partial
+ // sums for one column.
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+
+ conv = _mm_add_epi32(conv, res_add_const);
+ // Divide down by (1 << round_1), rounding to nearest. The offset (sub) is
+ // subtracted further below, only on the paths that write final pixels.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint8_t *dst_x = dst + y * dst_stride + x;
+ __m128i result;
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ if (conv_params->do_average) {
+ // Combine with the four values already stored in the compound buffer.
+ const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+ }
+ // Remove the offset, round away the remaining bits, saturate to 8 bits
+ // and store four pixels with one 32-bit write.
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(int *)dst_x = _mm_cvtsi128_si32(result_8);
+ } else {
+ // No averaging: keep the offset intermediate in the compound buffer.
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ // Non-compound: same finalisation as the averaging path above.
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(int *)dst_x = _mm_cvtsi128_si32(result_8);
+ }
+ }
+ // Scalar path for the columns left over when w is not a multiple of 4.
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_dist_wtd_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - sub32;
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ }
+}
+// Scaled 2D convolution for 8-bit pixels. Runs the 8-tap horizontal filter
+// into a transposed intermediate buffer, then the 8-tap vertical filter from
+// that buffer into dst8 and/or conv_params->dst. Both kernels must have
+// exactly 8 taps (checked by assert only).
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int x_step_qn,
+                                  const int subpel_y_qn, const int y_step_qn,
+                                  ConvolveParams *conv_params) {
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // Intermediate rows needed: the integer source row reached by the last
+  // output row, plus one kernel length.
+  const int last_row_qn = (h - 1) * y_step_qn + subpel_y_qn;
+  const int im_h = (last_row_qn >> SCALE_SUBPEL_BITS) + ytaps;
+
+  // The vertical kernel reaches fo_vert rows above the nominal position, so
+  // the horizontal pass starts that many rows earlier.
+  const int fo_vert = ytaps / 2 - 1;
+  const uint8_t *const src_top = src - fo_vert * src_stride;
+
+  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+
+  // horizontal filter (writes tmp transposed, im_h entries per column)
+  hfilter8(src_top, src_stride, tmp, w, im_h, subpel_x_qn, x_step_qn,
+           filter_params_x, conv_params->round_0);
+
+  // vertical filter (input is transposed, so its stride is im_h)
+  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+           filter_params_y, conv_params, 8);
+}
+
+// Horizontal stage of av1_highbd_convolve_2d_scale_sse4_1 (16-bit input),
+// restricted to 8-tap kernels.
+//
+// The source x position of every output column is tracked in fixed point: it
+// starts at subpel_x_qn and advances by x_step_qn per column. The integer
+// part selects the first source pixel, the fractional part selects the kernel
+// phase. Every filtered sample has 1 << (bd + FILTER_BITS - 1) added before
+// it is rounded down by `round` bits. The output is written transposed:
+// column x occupies dst[x * h] .. dst[x * h + h - 1].
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+                            int w, int h, int subpel_x_qn, int x_step_qn,
+                            const InterpFilterParams *filter_params, int round,
+                            int bd) {
+  const int ntaps = 8;
+  const int32_t offset32 = 1 << (bd + FILTER_BITS - 1);
+
+  // Addend for the vector path: rounding term and offset combined.
+  const __m128i bias = _mm_set1_epi32((1 << round) / 2 + offset32);
+  const __m128i shift = _mm_cvtsi32_si128(round);
+
+  // Step back so that the 8 taps start ntaps / 2 - 1 pixels before the sample.
+  const uint16_t *const src_base = src - (ntaps / 2 - 1);
+
+  int x_qn = subpel_x_qn;
+  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+    const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(phase < SUBPEL_SHIFTS);
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter_params, phase);
+    const __m128i taps = _mm_loadu_si128((__m128i *)kernel);
+
+    const uint16_t *const col = src_base + (x_qn >> SCALE_SUBPEL_BITS);
+    int16_t *const out_col = dst + x * h;
+
+    // Vector path: four source rows per iteration.
+    int y = 0;
+    while (y + 4 <= h) {
+      const uint16_t *const row = col + y * src_stride;
+
+      // One unaligned load fetches the 8 pixels a row needs; multiply by the
+      // taps and add adjacent products.
+      const __m128i prod0 =
+          _mm_madd_epi16(_mm_loadu_si128((__m128i *)row), taps);
+      const __m128i prod1 = _mm_madd_epi16(
+          _mm_loadu_si128((__m128i *)(row + 1 * src_stride)), taps);
+      const __m128i prod2 = _mm_madd_epi16(
+          _mm_loadu_si128((__m128i *)(row + 2 * src_stride)), taps);
+      const __m128i prod3 = _mm_madd_epi16(
+          _mm_loadu_si128((__m128i *)(row + 3 * src_stride)), taps);
+
+      // Two levels of horizontal adds leave one 32-bit sum per row.
+      const __m128i sums = _mm_hadd_epi32(_mm_hadd_epi32(prod0, prod1),
+                                          _mm_hadd_epi32(prod2, prod3));
+
+      // Add offset and rounding term, then divide by (1 << round).
+      const __m128i rounded = _mm_sra_epi32(_mm_add_epi32(sums, bias), shift);
+
+      // Narrow to 16 bits and store the four results of this column.
+      _mm_storel_epi64((__m128i *)(out_col + y),
+                       _mm_packus_epi32(rounded, rounded));
+      y += 4;
+    }
+
+    // Scalar path for the rows left over when h is not a multiple of 4.
+    for (; y < h; ++y) {
+      const uint16_t *const row = col + y * src_stride;
+
+      int32_t acc = offset32;
+      for (int k = 0; k < ntaps; ++k) acc += kernel[k] * row[k];
+
+      out_col[y] = ROUND_POWER_OF_TWO(acc, round);
+    }
+  }
+}
+// A specialised version of vfilter, the vertical filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+//
+// src is read as a transposed buffer: the samples for output column x start
+// at src + x * src_stride. The source position and kernel phase of every
+// output row come from a fixed-point y coordinate that starts at subpel_y_qn
+// and advances by y_step_qn per row. Depending on conv_params the result is
+// stored to conv_params->dst (compound, no averaging), combined with
+// conv_params->dst and written to dst (compound, averaging), or written to
+// dst directly (non-compound). Pixels written to dst are clipped to the
+// bd-bit range.
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+                            int dst_stride, int w, int h, int subpel_y_qn,
+                            int y_step_qn,
+                            const InterpFilterParams *filter_params,
+                            const ConvolveParams *conv_params, int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int ntaps = 8;
+
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+  // Offset removed again before the final rounding. The first term is the
+  // (1 << offset_bits) added below, after the round_1 shift.
+  // NOTE(review): the second term presumably is the horizontal-stage offset
+  // after passing through this filter -- confirm against highbd_hfilter8.
+  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i sub = _mm_set1_epi32(sub32);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const __m128i clip_pixel_ =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  // Remaining precision that has to be rounded away to get to pixel values.
+  // Shared by the compound-average and non-compound paths.
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+  // Weights for the distance-weighted compound average.
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  int y_qn = subpel_y_qn;
+  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+    // Integer part of y_qn selects the first tap position, the fractional
+    // part selects the kernel phase.
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(filter_idx < SUBPEL_SHIFTS);
+    const int16_t *filter =
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+    int x;
+    // Vector path: four output pixels (columns x .. x + 3) per iteration.
+    for (x = 0; x <= w - 4; x += 4) {
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the 8 taps' worth of source data for each of the four columns
+      // and multiply by the coefficients; each of conv0..conv3 holds four
+      // partial sums for one column.
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+      // Now reduce horizontally to get one lane for each result
+      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
+      conv = _mm_add_epi32(conv, res_add_const);
+
+      // Divide down by (1 << round_1), rounding to nearest. The offset (sub)
+      // is subtracted further below, only on the paths that write pixels.
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint16_t *dst_x = dst + y * dst_stride + x;
+
+      __m128i result;
+      if (conv_params->is_compound) {
+        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+        if (conv_params->do_average) {
+          // Combine with the four values already stored in the compound
+          // buffer, widened to 32 bits.
+          __m128i p_32 =
+              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+          if (conv_params->use_dist_wtd_comp_avg) {
+            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+                                    _mm_mullo_epi32(shifted, wt1));
+            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+          } else {
+            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+          }
+          result = _mm_sub_epi32(shifted, sub);
+          result =
+              _mm_sra_epi32(_mm_add_epi32(result, bits_const), bits_shift);
+
+          result = _mm_packus_epi32(result, result);
+          result = _mm_min_epi16(result, clip_pixel_);
+          _mm_storel_epi64((__m128i *)dst_x, result);
+        } else {
+          // No averaging: keep the offset intermediate in the compound
+          // buffer.
+          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        result = _mm_sub_epi32(shifted, sub);
+        // The lanes are 32 bits wide here, so the arithmetic shift has to be
+        // the 32-bit one; a 16-bit shift would treat each lane as two
+        // independent halves whenever bits != 0. This matches the compound
+        // path above and the scalar path below.
+        result = _mm_sra_epi32(_mm_add_epi32(result, bits_const), bits_shift);
+        result = _mm_packus_epi32(result, result);
+        result = _mm_min_epi16(result, clip_pixel_);
+        _mm_storel_epi64((__m128i *)dst_x, result);
+      }
+    }
+
+    // Scalar path for the columns left over when w is not a multiple of 4.
+    for (; x < w; ++x) {
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_dist_wtd_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - sub32;
+          dst[y * dst_stride + x] =
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - sub32;
+        dst[y * dst_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+      }
+    }
+  }
+}
+
+// Scaled 2D convolution for high bit-depth (16-bit) pixels. Runs the 8-tap
+// horizontal filter into a zero-initialised, transposed intermediate buffer,
+// then the 8-tap vertical filter from that buffer into dst and/or
+// conv_params->dst. Both kernels must have exactly 8 taps (checked by assert
+// only).
+void av1_highbd_convolve_2d_scale_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params, int bd) {
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+
+  // Intermediate rows needed: the integer source row reached by the last
+  // output row, plus one kernel length.
+  const int last_row_qn = (h - 1) * y_step_qn + subpel_y_qn;
+  const int im_h = (last_row_qn >> SCALE_SUBPEL_BITS) + ytaps;
+
+  // The vertical kernel reaches fo_vert rows above the nominal position, so
+  // the horizontal pass starts that many rows earlier.
+  const int fo_vert = ytaps / 2 - 1;
+  const uint16_t *const src_top = src - fo_vert * src_stride;
+
+  // TODO(yaowu): Move this out of stack
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  memset(tmp, 0, sizeof(tmp));
+
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // horizontal filter (writes tmp transposed, im_h entries per column)
+  highbd_hfilter8(src_top, src_stride, tmp, w, im_h, subpel_x_qn, x_step_qn,
+                  filter_params_x, conv_params->round_0, bd);
+
+  // vertical filter (input is transposed, so its stride is im_h)
+  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+                  filter_params_y, conv_params, bd);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 0000000000..0afd42b170
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+// Stored in fixed point with 4096 representing 1.0, so 5793 ~= sqrt(2) * 4096
+// and the even powers are exact multiples of 4096. One entry per TX_SIZES
+// value.
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+// Stage 5 of the 16-point inverse DCT, applied in place to x1[0..15].
+// x1[4] and x1[7] are not modified by this stage.
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  // Add/sub butterflies on the mirrored pairs (0,3) and (1,2).
+  for (int i = 0; i < 2; ++i) btf_16_adds_subs_avx2(&x1[i], &x1[3 - i]);
+
+  // Weighted butterfly on the pair (5,6) using the cospi[32] weights.
+  btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+  // Add/sub butterflies on (8,11) and (9,10) ...
+  for (int i = 0; i < 2; ++i) btf_16_adds_subs_avx2(&x1[8 + i], &x1[11 - i]);
+
+  // ... and, with the operand order swapped, on (15,12) and (14,13).
+  for (int i = 0; i < 2; ++i) btf_16_adds_subs_avx2(&x1[15 - i], &x1[12 + i]);
+}
+
+// Stage 6 of the 16-point inverse DCT, applied in place to x[0..15].
+// x[8], x[9], x[14] and x[15] are not modified by this stage.
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  // Add/sub butterflies on the mirrored pairs (0,7), (1,6), (2,5), (3,4).
+  for (int i = 0; i < 4; ++i) btf_16_adds_subs_avx2(&x[i], &x[7 - i]);
+
+  // Weighted butterflies on (10,13) and (11,12) using the cospi[32] weights.
+  for (int i = 0; i < 2; ++i) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[10 + i], &x[13 - i], _r, cos_bit);
+  }
+}
+
+// Stage 7 (final stage) of the 16-point inverse DCT: combines the mirrored
+// pairs x1[i] / x1[15 - i] and writes the results to output[i] and
+// output[15 - i] for i = 0..7.
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+  for (int i = 0; i < 8; ++i) {
+    const int mirror = 15 - i;
+    btf_16_adds_subs_out_avx2(&output[i], &output[mirror], x1[i], x1[mirror]);
+  }
+}
+
+// Full 16-point inverse DCT: reads input[0..15] and writes output[0..15].
+// Stages 1-4 are done inline; stages 5-7 are shared with the reduced-input
+// variants through the idct16_stage*_avx2 helpers.
+static void idct16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ // Rounding term (half of 1 << INV_COS_BIT) passed to the weighted
+ // butterflies.
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // Weight pairs for the butterflies below; the variable name spells out the
+ // sign and cospi index of each of the two packed weights.
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ // Reorder the inputs: x1[i] takes input[j] where j is i with its four index
+ // bits reversed.
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = input[8];
+ x1[2] = input[4];
+ x1[3] = input[12];
+ x1[4] = input[2];
+ x1[5] = input[10];
+ x1[6] = input[6];
+ x1[7] = input[14];
+ x1[8] = input[1];
+ x1[9] = input[9];
+ x1[10] = input[5];
+ x1[11] = input[13];
+ x1[12] = input[3];
+ x1[13] = input[11];
+ x1[14] = input[7];
+ x1[15] = input[15];
+
+ // stage 2
+ // Weighted butterflies on the upper half, pairs (8,15) (9,14) (10,13)
+ // (11,12).
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+ INV_COS_BIT);
+
+ // stage 3
+ // Weighted butterflies on (4,7) and (5,6); add/sub butterflies on
+ // neighbouring pairs of the upper half.
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ // Weighted butterflies on (0,1), (2,3), (9,14), (10,13); add/sub
+ // butterflies on (4,5) and (7,6).
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+ INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+
+ // stages 5-7 (stage 7 writes the final values to output)
+ idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage7_avx2(output, x1);
+}
+
+// 16-point inverse DCT that reads only input[0..7] and writes
+// output[0..15].
+// NOTE(review): presumably used when input[8..15] are known to be zero --
+// confirm against the callers.
+static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ // Rounding term (half of 1 << INV_COS_BIT) passed to the weighted
+ // butterflies.
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ // Only the even slots of x1 are loaded; the odd slots are first written as
+ // outputs of the btf_16_w16_0_avx2 calls in stages 2-4.
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[2] = input[4];
+ x1[4] = input[2];
+ x1[6] = input[6];
+ x1[8] = input[1];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[7];
+
+ // stage 2
+ // Each call takes one loaded slot (third argument) and the last two
+ // arguments name the slots it assigns.
+ // NOTE(review): argument roles are inferred from the call pattern; the
+ // btf_16_w16_0_avx2 definition is not visible here.
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+ INV_COS_BIT);
+
+ // stages 5-7 (stage 7 writes the final values to output)
+ idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+ idct16_stage7_avx2(output, x1);
+}
+
+// 16-point inverse DCT that reads only input[0] and writes output[0..15].
+static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // Stage 1 only picks up input[0]; stages 2 and 3 have no work to do.
+  __m256i y0 = input[0];
+  __m256i y1;
+
+  // Stage 4: the single butterfly, yielding the two values used below.
+  btf_16_w16_0_avx2(cospi[32], cospi[32], y0, y0, y1);
+
+  // Stages 5 and 6 need no arithmetic: the outputs follow the repeating
+  // pattern y0, y1, y1, y0.
+  for (int i = 0; i < 16; i += 4) {
+    output[i + 0] = y0;
+    output[i + 1] = y1;
+    output[i + 2] = y1;
+    output[i + 3] = y0;
+  }
+}
+
+// Stage 3 of the 16-point inverse ADST: in-place add/sub butterflies between
+// the lower and upper half, pairs (i, i + 8) for i = 0..7.
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+  for (int i = 0; i < 8; ++i) btf_16_adds_subs_avx2(&x[i], &x[i + 8]);
+}
+
+// Stage 4 of the 16-point inverse ADST: in-place weighted butterflies on the
+// upper half, pairs (8,9), (10,11), (12,13), (14,15). x[0..7] are not
+// modified.
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const int32_t c8 = cospi[8];
+  const int32_t c24 = cospi[24];
+  const int32_t c40 = cospi[40];
+  const int32_t c56 = cospi[56];
+
+  const __m256i w_p08_p56 = pair_set_w16_epi16(c8, c56);
+  const __m256i w_p56_m08 = pair_set_w16_epi16(c56, -c8);
+  const __m256i w_p40_p24 = pair_set_w16_epi16(c40, c24);
+  const __m256i w_p24_m40 = pair_set_w16_epi16(c24, -c40);
+  const __m256i w_m56_p08 = pair_set_w16_epi16(-c56, c8);
+  const __m256i w_m24_p40 = pair_set_w16_epi16(-c24, c40);
+
+  btf_16_w16_avx2(w_p08_p56, w_p56_m08, &x[8], &x[9], _r, cos_bit);
+  btf_16_w16_avx2(w_p40_p24, w_p24_m40, &x[10], &x[11], _r, cos_bit);
+  btf_16_w16_avx2(w_m56_p08, w_p08_p56, &x[12], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(w_m24_p40, w_p40_p24, &x[14], &x[15], _r, cos_bit);
+}
+
+// Stage 5 of the 16-point inverse ADST: in-place add/sub butterflies on the
+// pairs (i, i + 4) inside each group of eight, i.e. (0,4) .. (3,7) and
+// (8,12) .. (11,15).
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+  for (int base = 0; base < 16; base += 8) {
+    for (int i = base; i < base + 4; ++i) {
+      btf_16_adds_subs_avx2(&x[i], &x[i + 4]);
+    }
+  }
+}
+
+// Stage 6 of the 16-point inverse ADST: in-place weighted butterflies on the
+// pairs (4,5), (6,7), (12,13), (14,15). The two groups of eight use the same
+// weights.
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const int32_t c16 = cospi[16];
+  const int32_t c48 = cospi[48];
+
+  const __m256i w_p16_p48 = pair_set_w16_epi16(c16, c48);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(c48, -c16);
+  const __m256i w_m48_p16 = pair_set_w16_epi16(-c48, c16);
+
+  for (int base = 4; base < 16; base += 8) {
+    btf_16_w16_avx2(w_p16_p48, w_p48_m16, &x[base], &x[base + 1], _r, cos_bit);
+    btf_16_w16_avx2(w_m48_p16, w_p16_p48, &x[base + 2], &x[base + 3], _r,
+                    cos_bit);
+  }
+}
+
+// Stage 7 of the 16-point inverse ADST: in-place add/sub butterflies on the
+// pairs (i, i + 2) inside each group of four, i.e. (0,2), (1,3), (4,6),
+// (5,7), ... (13,15).
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+  for (int base = 0; base < 16; base += 4) {
+    btf_16_adds_subs_avx2(&x[base], &x[base + 2]);
+    btf_16_adds_subs_avx2(&x[base + 1], &x[base + 3]);
+  }
+}
+
+// Stage 8 of the 16-point inverse ADST: in-place weighted butterflies with
+// the cospi[32] weights on the pairs (2,3), (6,7), (10,11), (14,15).
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const int32_t c32 = cospi[32];
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+  const __m256i w_p32_m32 = pair_set_w16_epi16(c32, -c32);
+
+  for (int i = 2; i < 16; i += 4) {
+    btf_16_w16_avx2(w_p32_p32, w_p32_m32, &x1[i], &x1[i + 1], _r, cos_bit);
+  }
+}
+
+// Stage 9 (final stage) of the 16-point inverse ADST: permutes x1 into
+// output. Even output indices are plain copies; odd output indices are
+// negated using a saturating 16-bit subtraction from zero.
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+  // output[k] is taken from x1[src_idx[k]].
+  static const int src_idx[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
+                                   3, 11, 15, 7, 5, 13, 9,  1 };
+  const __m256i zero = _mm256_setzero_si256();
+
+  for (int k = 0; k < 16; k += 2) {
+    output[k] = x1[src_idx[k]];
+    output[k + 1] = _mm256_subs_epi16(zero, x1[src_idx[k + 1]]);
+  }
+}
+
+// 16-point iadst over 16 __m256i vectors; all sixteen inputs are read.
+//   input   16 source vectors (read only)
+//   output  16 destination vectors, written by the final stage
+static void iadst16_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[16];
+
+  // stage 1: even slots take input[15], input[13], ..., input[1]; odd slots
+  // take input[0], input[2], ..., input[14].
+  for (int k = 0; k < 8; ++k) {
+    buf[2 * k] = input[15 - 2 * k];
+    buf[2 * k + 1] = input[2 * k];
+  }
+
+  // stage 2: weighted butterfly on each adjacent pair (2k, 2k + 1) with the
+  // weight pairs (cospi[8k + 2], cospi[62 - 8k]) and
+  // (cospi[62 - 8k], -cospi[8k + 2]).
+  for (int k = 0; k < 8; ++k) {
+    const int32_t c_lo = cospi[8 * k + 2];
+    const int32_t c_hi = cospi[62 - 8 * k];
+    btf_16_w16_avx2(pair_set_w16_epi16(c_lo, c_hi),
+                    pair_set_w16_epi16(c_hi, -c_lo), &buf[2 * k],
+                    &buf[2 * k + 1], _r, INV_COS_BIT);
+  }
+
+  // stages 3-9 are shared with the reduced-input variants.
+  iadst16_stage3_avx2(buf);
+  iadst16_stage4_avx2(buf, cospi, _r, INV_COS_BIT);
+  iadst16_stage5_avx2(buf);
+  iadst16_stage6_avx2(buf, cospi, _r, INV_COS_BIT);
+  iadst16_stage7_avx2(buf);
+  iadst16_stage8_avx2(buf, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, buf);
+}
+
+// 16-point iadst variant that reads only input[0] .. input[7].
+// NOTE(review): presumably callers guarantee input[8..15] are zero; confirm
+// against the call sites.
+//   input   source vectors; only indices 0..7 are accessed
+//   output  16 destination vectors, written by the final stage
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1: only eight of the sixteen slots are loaded; their partners are
+  // produced by stage 2 below.
+  __m256i x1[16];
+  x1[1] = input[0];
+  x1[3] = input[2];
+  x1[5] = input[4];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[1];
+
+  // stage 2: single-input butterflies. Each call reads one loaded slot and
+  // writes both slots of its pair (the input slot is also one of the outputs).
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+  // Index written as decimal 6 (previously the octal literal 06, same value).
+  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);
+
+  // stages 3-9 are shared with the full transform.
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// 16-point iadst variant that reads only input[0]. The add/sub stages
+// (3, 5, 7) are replaced by plain copies of already-computed slots.
+//   input   source vectors; only index 0 is accessed
+//   output  16 destination vectors, written by the final stage
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i w_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i w_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i w_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i x1[16];
+
+  // stages 1-2: load the single input and expand it into slots 0 and 1.
+  x1[1] = input[0];
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+  // stages 3-4: duplicate (0,1) into (8,9), then rotate the copy.
+  x1[8] = x1[0];
+  x1[9] = x1[1];
+  btf_16_w16_avx2(w_p08_p56, w_p56_m08, &x1[8], &x1[9], _r, INV_COS_BIT);
+
+  // stages 5-6: duplicate (0,1) into (4,5) and (8,9) into (12,13), rotating
+  // each copy with the cospi[16]/cospi[48] weights.
+  for (int src = 0; src <= 8; src += 8) {
+    x1[src + 4] = x1[src];
+    x1[src + 5] = x1[src + 1];
+    btf_16_w16_avx2(w_p16_p48, w_p48_m16, &x1[src + 4], &x1[src + 5], _r,
+                    INV_COS_BIT);
+  }
+
+  // stage 7: duplicate the first pair of each group of four into the second.
+  for (int src = 0; src < 16; src += 4) {
+    x1[src + 2] = x1[src];
+    x1[src + 3] = x1[src + 1];
+  }
+
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// idct32 stage 3, entries 16..31: within each group of four starting at
+// b = 16, 20, 24, 28, runs btf_16_adds_subs_avx2 on (b, b + 1) and then on
+// (b + 3, b + 2).
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+  for (int b = 16; b < 32; b += 4) {
+    btf_16_adds_subs_avx2(&x[b], &x[b + 1]);
+    btf_16_adds_subs_avx2(&x[b + 3], &x[b + 2]);
+  }
+}
+
+// idct32 stage 4, entries 16..31: weighted butterflies on the pairs (17,30),
+// (18,29) using angles 8/56 and on (21,26), (22,25) using angles 40/24.
+// For angle a (b = 64 - a) the two calls of a group use the weight pairs
+//   (-c[a],  c[b]) / ( c[b],  c[a])   and   (-c[b], -c[a]) / (-c[a],  c[b]).
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  static const int kAngle[2] = { 8, 40 };
+  for (int g = 0; g < 2; ++g) {
+    const int32_t ca = cospi[kAngle[g]];
+    const int32_t cb = cospi[64 - kAngle[g]];
+    const __m256i w_ma_pb = pair_set_w16_epi16(-ca, cb);
+    const int lo = 17 + 4 * g;  // partner index is 47 - lo
+    btf_16_w16_avx2(w_ma_pb, pair_set_w16_epi16(cb, ca), &x[lo], &x[47 - lo],
+                    _r, cos_bit);
+    btf_16_w16_avx2(pair_set_w16_epi16(-cb, -ca), w_ma_pb, &x[lo + 1],
+                    &x[46 - lo], _r, cos_bit);
+  }
+}
+
+// idct32 stage 5, entries 8..31: weighted butterflies on (9,14) and (10,13)
+// with the cospi[16]/cospi[48] weights, followed by add/sub butterflies on
+// entries 16..31 in two groups of eight.
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const int32_t c16 = cospi[16];
+  const int32_t c48 = cospi[48];
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-c16, c48);
+
+  btf_16_w16_avx2(w_m16_p48, pair_set_w16_epi16(c48, c16), &x[9], &x[14], _r,
+                  cos_bit);
+  btf_16_w16_avx2(pair_set_w16_epi16(-c48, -c16), w_m16_p48, &x[10], &x[13],
+                  _r, cos_bit);
+
+  for (int b = 16; b < 32; b += 8) {
+    btf_16_adds_subs_avx2(&x[b], &x[b + 3]);
+    btf_16_adds_subs_avx2(&x[b + 1], &x[b + 2]);
+    btf_16_adds_subs_avx2(&x[b + 7], &x[b + 4]);
+    btf_16_adds_subs_avx2(&x[b + 6], &x[b + 5]);
+  }
+}
+
+// idct32 stage 6, entries 4..31: one cospi[32] butterfly on (5,6), add/sub
+// butterflies on entries 8..15, and cospi[16]/cospi[48] butterflies on
+// (18,29), (19,28), (20,27), (21,26).
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const int32_t c16 = cospi[16];
+  const int32_t c32 = cospi[32];
+  const int32_t c48 = cospi[48];
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-c16, c48);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(c48, c16);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-c48, -c16);
+
+  btf_16_w16_avx2(pair_set_w16_epi16(-c32, c32), pair_set_w16_epi16(c32, c32),
+                  &x[5], &x[6], _r, cos_bit);
+
+  for (int j = 0; j < 2; ++j) btf_16_adds_subs_avx2(&x[8 + j], &x[11 - j]);
+  for (int j = 0; j < 2; ++j) btf_16_adds_subs_avx2(&x[15 - j], &x[12 + j]);
+
+  for (int j = 0; j < 2; ++j) {
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[18 + j], &x[29 - j], _r, cos_bit);
+  }
+  for (int j = 0; j < 2; ++j) {
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[20 + j], &x[27 - j], _r, cos_bit);
+  }
+}
+
+// idct32 stage 7: add/sub butterflies mirroring entries 0..7, cospi[32]
+// butterflies on (10,13) and (11,12), and add/sub butterflies on entries
+// 16..23 and 24..31.
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  for (int i = 0; i < 4; ++i) btf_16_adds_subs_avx2(&x[i], &x[7 - i]);
+
+  btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[11], &x[12], _r, cos_bit);
+
+  for (int i = 0; i < 4; ++i) btf_16_adds_subs_avx2(&x[16 + i], &x[23 - i]);
+  for (int i = 0; i < 4; ++i) btf_16_adds_subs_avx2(&x[31 - i], &x[24 + i]);
+}
+
+// idct32 stage 8: add/sub butterflies mirroring entries 0..15, then cospi[32]
+// butterflies on (20,27), (21,26), (22,25), (23,24).
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  for (int i = 0; i < 8; ++i) btf_16_adds_subs_avx2(&x[i], &x[15 - i]);
+
+  for (int i = 0; i < 4; ++i) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[20 + i], &x[27 - i], _r, cos_bit);
+  }
+}
+
+// idct32 stage 9: final add/sub butterflies. For i = 0..15 the pair
+// (x[i], x[31 - i]) is combined by btf_16_adds_subs_out_avx2 and the results
+// are stored to output[i] and output[31 - i].
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+  for (int i = 0; i < 16; ++i) {
+    btf_16_adds_subs_out_avx2(&output[i], &output[31 - i], x[i], x[31 - i]);
+  }
+}
+
+// 32-point idct variant that reads only input[0]. A single cospi[32]
+// butterfly produces two vectors, x[0] and x[1]; all 32 outputs are copies of
+// one or the other.
+//   input   source vectors; only index 0 is accessed
+//   output  32 destination vectors
+static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  __m256i x[2];
+
+  // stages 1-5: load the single input and expand it.
+  x[0] = input[0];
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stages 6-9: outputs i and 31 - i share a value; within each group of four
+  // the first and last positions take x[0], the middle two take x[1].
+  for (int i = 0; i < 16; ++i) {
+    const int pos = i & 3;
+    const __m256i v = (pos == 0 || pos == 3) ? x[0] : x[1];
+    output[i] = v;
+    output[31 - i] = v;
+  }
+}
+
+// 32-point idct variant that reads only input[0] .. input[7]. Butterflies
+// with a single loaded operand use the single-input form
+// btf_16_w16_0_avx2; add/sub steps with a single operand become plain copies.
+//   input   source vectors; only indices 0..7 are accessed
+//   output  32 destination vectors, written by the final stage
+static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
+  // x[kSlot[i]] receives input[i] in stage 1.
+  static const int kSlot[8] = { 0, 16, 8, 24, 4, 20, 12, 28 };
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i x[32];
+
+  // stage 1
+  for (int i = 0; i < 8; ++i) x[kSlot[i]] = input[i];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  for (int b = 16; b < 32; b += 4) {
+    x[b + 1] = x[b];
+    x[b + 2] = x[b + 3];
+  }
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  for (int b = 8; b < 16; b += 4) {
+    x[b + 1] = x[b];
+    x[b + 2] = x[b + 3];
+  }
+  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  x[5] = x[4];
+  x[6] = x[7];
+  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stage 6
+  x[3] = x[0];
+  x[2] = x[1];
+  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stages 7-9
+  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage9_avx2(output, x);
+}
+
+// 32-point idct variant that reads only input[0] .. input[15]. Only the
+// even slots of the working buffer are loaded; the odd slots are produced by
+// the single-input butterflies btf_16_w16_0_avx2 in stages 2-5.
+//   input   source vectors; only indices 0..15 are accessed
+//   output  32 destination vectors, written by the final stage
+static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
+  // x[2 * i] receives input[kSrc[i]] in stage 1.
+  static const int kSrc[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
+                                1, 9, 5, 13, 3, 11, 7, 15 };
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i x[32];
+
+  // stage 1
+  for (int i = 0; i < 16; ++i) x[2 * i] = input[kSrc[i]];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+  idct32_high16_stage3_avx2(x);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+  for (int b = 8; b < 16; b += 4) {
+    btf_16_adds_subs_avx2(&x[b], &x[b + 1]);
+    btf_16_adds_subs_avx2(&x[b + 3], &x[b + 2]);
+  }
+  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+  btf_16_adds_subs_avx2(&x[4], &x[5]);
+  btf_16_adds_subs_avx2(&x[7], &x[6]);
+  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stage 6
+  btf_16_adds_subs_avx2(&x[0], &x[3]);
+  btf_16_adds_subs_avx2(&x[1], &x[2]);
+  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
+
+  // stages 7-9
+  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
+  idct32_stage9_avx2(output, x);
+}
+
+// Full 32-point idct over 32 __m256i vectors; all inputs are read.
+// Stages 2-4 apply, for each listed angle a (b = 64 - a), a weighted
+// butterfly with the weight pairs (cospi[a], -cospi[b]) and
+// (cospi[b], cospi[a]) to a mirrored pair of working-buffer entries.
+//   input   32 source vectors (read only)
+//   output  32 destination vectors, written by the final stage
+static void idct32_avx2(const __m256i *input, __m256i *output) {
+  // Stage 1 load order: buf[i] = input[kInputOrder[i]].
+  static const int kInputOrder[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
+                                       2, 18, 10, 26, 6, 22, 14, 30,
+                                       1, 17, 9,  25, 5, 21, 13, 29,
+                                       3, 19, 11, 27, 7, 23, 15, 31 };
+  static const int kStage2Angle[8] = { 62, 30, 46, 14, 54, 22, 38, 6 };
+  static const int kStage3Angle[4] = { 60, 28, 44, 12 };
+  static const int kStage4Angle[2] = { 56, 24 };
+
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const int32_t c16 = cospi[16];
+  const int32_t c32 = cospi[32];
+  const int32_t c48 = cospi[48];
+  __m256i buf[32];
+
+  // stage 1
+  for (int i = 0; i < 32; ++i) buf[i] = input[kInputOrder[i]];
+
+  // stage 2: pairs (16 + k, 31 - k)
+  for (int k = 0; k < 8; ++k) {
+    const int32_t ca = cospi[kStage2Angle[k]];
+    const int32_t cb = cospi[64 - kStage2Angle[k]];
+    btf_16_w16_avx2(pair_set_w16_epi16(ca, -cb), pair_set_w16_epi16(cb, ca),
+                    &buf[16 + k], &buf[31 - k], _r, INV_COS_BIT);
+  }
+
+  // stage 3: pairs (8 + k, 15 - k), then add/sub on entries 16..31
+  for (int k = 0; k < 4; ++k) {
+    const int32_t ca = cospi[kStage3Angle[k]];
+    const int32_t cb = cospi[64 - kStage3Angle[k]];
+    btf_16_w16_avx2(pair_set_w16_epi16(ca, -cb), pair_set_w16_epi16(cb, ca),
+                    &buf[8 + k], &buf[15 - k], _r, INV_COS_BIT);
+  }
+  idct32_high16_stage3_avx2(buf);
+
+  // stage 4: pairs (4 + k, 7 - k), then add/sub on entries 8..15
+  for (int k = 0; k < 2; ++k) {
+    const int32_t ca = cospi[kStage4Angle[k]];
+    const int32_t cb = cospi[64 - kStage4Angle[k]];
+    btf_16_w16_avx2(pair_set_w16_epi16(ca, -cb), pair_set_w16_epi16(cb, ca),
+                    &buf[4 + k], &buf[7 - k], _r, INV_COS_BIT);
+  }
+  for (int b = 8; b < 16; b += 4) {
+    btf_16_adds_subs_avx2(&buf[b], &buf[b + 1]);
+    btf_16_adds_subs_avx2(&buf[b + 3], &buf[b + 2]);
+  }
+  idct32_high16_stage4_avx2(buf, cospi, _r, INV_COS_BIT);
+
+  // stage 5
+  btf_16_w16_avx2(pair_set_w16_epi16(c32, c32), pair_set_w16_epi16(c32, -c32),
+                  &buf[0], &buf[1], _r, INV_COS_BIT);
+  btf_16_w16_avx2(pair_set_w16_epi16(c48, -c16), pair_set_w16_epi16(c16, c48),
+                  &buf[2], &buf[3], _r, INV_COS_BIT);
+  btf_16_adds_subs_avx2(&buf[4], &buf[5]);
+  btf_16_adds_subs_avx2(&buf[7], &buf[6]);
+  idct32_high24_stage5_avx2(buf, cospi, _r, INV_COS_BIT);
+
+  // stage 6
+  btf_16_adds_subs_avx2(&buf[0], &buf[3]);
+  btf_16_adds_subs_avx2(&buf[1], &buf[2]);
+  idct32_high28_stage6_avx2(buf, cospi, _r, INV_COS_BIT);
+
+  // stages 7-9
+  idct32_stage7_avx2(buf, cospi, _r, INV_COS_BIT);
+  idct32_stage8_avx2(buf, cospi, _r, INV_COS_BIT);
+  idct32_stage9_avx2(output, buf);
+}
+
+// idct64 stage 4, entries 32..63: four groups of two weighted butterflies.
+// Group g uses angle a = kAngle[g] (b = 64 - a) on the pairs
+// (lo, 95 - lo) and (lo + 1, 94 - lo) with lo = 33 + 4g, and the weight pairs
+//   (-c[a],  c[b]) / ( c[b],  c[a])   and   (-c[b], -c[a]) / (-c[a],  c[b]).
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngle[4] = { 4, 36, 20, 52 };
+  for (int g = 0; g < 4; ++g) {
+    const int32_t ca = cospi[kAngle[g]];
+    const int32_t cb = cospi[64 - kAngle[g]];
+    const __m256i w_ma_pb = pair_set_w16_epi16(-ca, cb);
+    const int lo = 33 + 4 * g;
+    btf_16_w16_avx2(w_ma_pb, pair_set_w16_epi16(cb, ca), &x[lo], &x[95 - lo],
+                    _r, cos_bit);
+    btf_16_w16_avx2(pair_set_w16_epi16(-cb, -ca), w_ma_pb, &x[lo + 1],
+                    &x[94 - lo], _r, cos_bit);
+  }
+}
+
+// idct64 stage 5, entries 16..63: weighted butterflies on (17,30), (18,29)
+// with angles 8/56 and on (21,26), (22,25) with angles 40/24, followed by
+// add/sub butterflies on entries 32..63 in four groups of eight.
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngle[2] = { 8, 40 };
+  for (int g = 0; g < 2; ++g) {
+    const int32_t ca = cospi[kAngle[g]];
+    const int32_t cb = cospi[64 - kAngle[g]];
+    const __m256i w_ma_pb = pair_set_w16_epi16(-ca, cb);
+    const int lo = 17 + 4 * g;  // partner index is 47 - lo
+    btf_16_w16_avx2(w_ma_pb, pair_set_w16_epi16(cb, ca), &x[lo], &x[47 - lo],
+                    _r, cos_bit);
+    btf_16_w16_avx2(pair_set_w16_epi16(-cb, -ca), w_ma_pb, &x[lo + 1],
+                    &x[46 - lo], _r, cos_bit);
+  }
+
+  for (int b = 32; b < 64; b += 8) {
+    btf_16_adds_subs_avx2(&x[b], &x[b + 3]);
+    btf_16_adds_subs_avx2(&x[b + 1], &x[b + 2]);
+    btf_16_adds_subs_avx2(&x[b + 7], &x[b + 4]);
+    btf_16_adds_subs_avx2(&x[b + 6], &x[b + 5]);
+  }
+}
+
+// idct64 stage 6, entries 32..63: two groups of four weighted butterflies.
+// Group g uses angle a = kAngle[g] (b = 64 - a) and lo = 34 + 8g:
+//   (lo + j,     95 - lo - j), j = 0,1 : (-c[a],  c[b]) / ( c[b],  c[a])
+//   (lo + 2 + j, 93 - lo - j), j = 0,1 : (-c[b], -c[a]) / (-c[a],  c[b])
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngle[2] = { 8, 40 };
+  for (int g = 0; g < 2; ++g) {
+    const int32_t ca = cospi[kAngle[g]];
+    const int32_t cb = cospi[64 - kAngle[g]];
+    const __m256i w_ma_pb = pair_set_w16_epi16(-ca, cb);
+    const __m256i w_pb_pa = pair_set_w16_epi16(cb, ca);
+    const __m256i w_mb_ma = pair_set_w16_epi16(-cb, -ca);
+    const int lo = 34 + 8 * g;
+    for (int j = 0; j < 2; ++j) {
+      btf_16_w16_avx2(w_ma_pb, w_pb_pa, &x[lo + j], &x[95 - lo - j], _r,
+                      cos_bit);
+    }
+    for (int j = 0; j < 2; ++j) {
+      btf_16_w16_avx2(w_mb_ma, w_ma_pb, &x[lo + 2 + j], &x[93 - lo - j], _r,
+                      cos_bit);
+    }
+  }
+}
+
+// idct64 stage 6, entries 16..63: add/sub butterflies on entries 16..31 in
+// two groups of eight, then the entries 32..63 part via
+// idct64_stage6_high32_avx2.
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  for (int b = 16; b < 32; b += 8) {
+    btf_16_adds_subs_avx2(&x[b], &x[b + 3]);
+    btf_16_adds_subs_avx2(&x[b + 1], &x[b + 2]);
+    btf_16_adds_subs_avx2(&x[b + 7], &x[b + 4]);
+    btf_16_adds_subs_avx2(&x[b + 6], &x[b + 5]);
+  }
+  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+}
+
+// idct64 stage 7, entries 16..63: cospi[16]/cospi[48] butterflies on (18,29),
+// (19,28), (20,27), (21,26), then add/sub butterflies on entries 32..63 in
+// two groups of sixteen.
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t c16 = cospi[16];
+  const int32_t c48 = cospi[48];
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-c16, c48);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(c48, c16);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-c48, -c16);
+
+  for (int j = 0; j < 2; ++j) {
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[18 + j], &x[29 - j], _r, cos_bit);
+  }
+  for (int j = 0; j < 2; ++j) {
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[20 + j], &x[27 - j], _r, cos_bit);
+  }
+
+  for (int b = 32; b < 64; b += 16) {
+    for (int j = 0; j < 4; ++j) {
+      btf_16_adds_subs_avx2(&x[b + j], &x[b + 7 - j]);
+    }
+    for (int j = 0; j < 4; ++j) {
+      btf_16_adds_subs_avx2(&x[b + 15 - j], &x[b + 8 + j]);
+    }
+  }
+}
+
+// idct64 stage 8, entries 16..63: add/sub butterflies on entries 16..31, then
+// cospi[16]/cospi[48] butterflies on (36 + j, 59 - j) and (40 + j, 55 - j)
+// for j = 0..3.
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t c16 = cospi[16];
+  const int32_t c48 = cospi[48];
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-c16, c48);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(c48, c16);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-c48, -c16);
+
+  for (int j = 0; j < 4; ++j) btf_16_adds_subs_avx2(&x[16 + j], &x[23 - j]);
+  for (int j = 0; j < 4; ++j) btf_16_adds_subs_avx2(&x[31 - j], &x[24 + j]);
+
+  for (int j = 0; j < 4; ++j) {
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[36 + j], &x[59 - j], _r, cos_bit);
+  }
+  for (int j = 0; j < 4; ++j) {
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[40 + j], &x[55 - j], _r, cos_bit);
+  }
+}
+
+// idct64 stage 9: add/sub butterflies mirroring entries 0..15, cospi[32]
+// butterflies on (20 + j, 27 - j) for j = 0..3, and add/sub butterflies on
+// entries 32..47 and 48..63.
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  for (int j = 0; j < 8; ++j) btf_16_adds_subs_avx2(&x[j], &x[15 - j]);
+
+  for (int j = 0; j < 4; ++j) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[20 + j], &x[27 - j], _r, cos_bit);
+  }
+
+  for (int j = 0; j < 8; ++j) btf_16_adds_subs_avx2(&x[32 + j], &x[47 - j]);
+  for (int j = 0; j < 8; ++j) btf_16_adds_subs_avx2(&x[63 - j], &x[48 + j]);
+}
+
+// idct64 stage 10: add/sub butterflies mirroring entries 0..31, then cospi[32]
+// butterflies on (40 + j, 55 - j) for j = 0..7.
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t c32 = cospi[32];
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-c32, c32);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(c32, c32);
+
+  for (int j = 0; j < 16; ++j) btf_16_adds_subs_avx2(&x[j], &x[31 - j]);
+
+  for (int j = 0; j < 8; ++j) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[40 + j], &x[55 - j], _r, cos_bit);
+  }
+}
+
+// Stage 11 (final stage) of the 64-point inverse DCT.
+// For every k in 0..31, x[k] and its mirror x[63 - k] are combined by
+// btf_16_adds_subs_out_avx2, whose two results are stored to output[k] and
+// output[63 - k]. Pairs are processed in ascending k.
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+  for (int k = 0; k < 32; ++k) {
+    const int mirror = 63 - k;
+    btf_16_adds_subs_out_avx2(&output[k], &output[mirror], x[k], x[mirror]);
+  }
+}
+
+// 64-point inverse DCT for the case where only input[0] is read.
+// The single input vector is scaled by cospi[32] into two vectors (dc[0] and
+// dc[1]), which are then replicated over all 64 outputs in the repeating
+// pattern dc[0], dc[1], dc[1], dc[0] for output[0..31], mirrored onto
+// output[63..32].
+static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // Stage 1: pick up the only coefficient vector that is used.
+  __m256i dc[2];
+  dc[0] = input[0];
+
+  // Stages 2-5 have no work. Stage 6 derives both vectors from dc[0].
+  btf_16_w16_0_avx2(cospi[32], cospi[32], dc[0], dc[0], dc[1]);
+
+  // Stages 7-11 reduce to copies: index k uses dc[0] when k % 4 is 0 or 3
+  // and dc[1] when k % 4 is 1 or 2; output[63 - k] receives the same vector.
+  for (int k = 0; k < 32; ++k) {
+    const __m256i v = dc[((k + 1) >> 1) & 1];
+    output[k] = v;
+    output[63 - k] = v;
+  }
+}
+/* 64-point inverse DCT variant that reads only input[0..7] and writes all of
+ * output[0..63].
+ *
+ * Where idct64_low32_avx2 uses two-input butterflies, this variant uses
+ * single-input btf_16_w16_0_avx2 calls or plain copies for positions whose
+ * second operand is never loaded here (presumably because that operand is
+ * known to be zero -- confirm against the code that selects this kernel).
+ *
+ * input:  source vectors; only indices 0..7 are accessed.
+ * output: 64 result vectors, stored by idct64_stage11_avx2.
+ */
+static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // half of 1 << INV_COS_BIT; handed to btf_16_w16_avx2 and the stage helpers
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); // naming: cospi_<sign><a>_<sign><b>, m = negated, p = positive
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the eight inputs into every 8th slot of the work array
+ __m256i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2: single-input butterflies; each call produces two x[32..63] entries from one loaded value
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: single-input butterflies for x[16..31]; x[32..63] neighbours are filled by copies
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4: x[8]/x[15] from one input, copies in x[16..31], weighted butterflies in x[32..63]
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
+ INV_COS_BIT);
+
+ // stage 5: copies in x[8..15], weighted butterflies in x[16..31], copies fill the rest of x[32..63]
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
+ INV_COS_BIT);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6: x[0]/x[1] from input[0]; copies fill x[16..31]; upper 32 entries handled by the shared helper
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7: copies fill x[2..3] and x[10..13]; remaining entries handled by the shared helper
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8: copies fill x[4..7]; weighted butterflies on x[10..13]; remaining entries via the shared helper
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); // stages 9-11 are shared with the other idct64 variants
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x); // writes output[0..63]
+}
+/* 64-point inverse DCT variant that reads only input[0..15] and writes all of
+ * output[0..63].
+ *
+ * Stages 2-6 use single-input btf_16_w16_0_avx2 calls and plain copies at
+ * positions where idct64_low32_avx2 uses two-input butterflies (presumably
+ * because the missing operand is known to be zero -- confirm against the code
+ * that selects this kernel). From stage 7 on, the lower entries use the same
+ * add/sub butterflies as the low32 variant.
+ *
+ * input:  source vectors; only indices 0..15 are accessed.
+ * output: 64 result vectors, stored by idct64_stage11_avx2.
+ */
+static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // half of 1 << INV_COS_BIT; handed to btf_16_w16_avx2 and the stage helpers
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the sixteen inputs into every 4th slot of the work array
+ __m256i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2: single-input butterflies; each call produces two x[32..63] entries from one loaded value
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: single-input butterflies for x[16..31]; copies fill the remaining x[32..63] slots
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4: single-input butterflies for x[8..15]; copies fill x[16..31]; upper 32 entries via the shared helper
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5: x[4]/x[7] from one input; copies fill x[8..15]; remaining entries via the shared helper
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 6: x[0]/x[1] from input[0]; copies fill x[5..6]; weighted butterflies on x[9..14]
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7: copies fill x[2..3]; from here x[8..15] use regular add/sub butterflies
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8: mirrored add/sub butterflies over x[0..7], weighted butterflies on x[10..13]
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); // stages 9-11 are shared with the other idct64 variants
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x); // writes output[0..63]
+}
+/* 64-point inverse DCT variant that reads input[0..31] and writes all of
+ * output[0..63].
+ *
+ * Each stage first produces newly needed entries with single-input
+ * btf_16_w16_0_avx2 calls, then combines already populated entries with
+ * add/sub or weighted butterflies; the upper part of the work array is
+ * delegated to the idct64_stageN_high* helpers (defined outside this chunk).
+ * NOTE(review): input[32..63] are never read, presumably because they are
+ * known to be zero for this kernel -- confirm against the selecting code.
+ *
+ * input:  source vectors; only indices 0..31 are accessed.
+ * output: 64 result vectors, stored by idct64_stage11_avx2.
+ */
+static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // half of 1 << INV_COS_BIT; handed to btf_16_w16_avx2 and the stage helpers
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the 32 inputs into the even slots of the work array
+ __m256i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2: sixteen single-input butterflies populate all of x[32..63]
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: single-input butterflies populate x[16..31]; adjacent add/sub butterflies over x[32..63]
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
+
+ // stage 4: single-input butterflies populate x[8..15]; add/sub over x[16..31]; upper 32 entries via the shared helper
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 5: single-input butterflies populate x[4..7]; add/sub over x[8..15]; remaining entries via the shared helper
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 6: single-input butterflies populate x[0..3]; add/sub over x[4..7]; weighted butterflies on x[9..14]
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 7: add/sub over x[0..3] and x[8..15]; weighted butterfly on x[5]/x[6]
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 8: mirrored add/sub butterflies over x[0..7], weighted butterflies on x[10..13]
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
+ INV_COS_BIT);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
+ INV_COS_BIT);
+ idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
+
+ // stage 9~11: shared with the other idct64 variants; stage 11 writes output[0..63]
+ idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
+ idct64_stage11_avx2(output, x);
+}
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output); // common signature of every 1-D kernel stored in the table below
+
+/* 1D functions process 16 pixels at one time.
+ *
+ * Lookup table of 1-D inverse-transform kernels, indexed as
+ *   [size index][1-D transform type][zero-coefficient class].
+ * The 2-D drivers in this file index it with get_txw_idx()/get_txh_idx() for
+ * the first dimension, hitx_1d_tab[]/vitx_1d_tab[] for the second and
+ * lowbd_txfm_all_1d_zeros_idx[eobx or eoby] for the third.
+ * In the populated rows the four columns hold the low1, low8, low16-or-full
+ * and low32-or-full variants, in that order. NULL marks a combination with no
+ * kernel; the drivers assert that the entry they fetch is non-NULL.
+ */
+static const transform_1d_avx2
+ lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ { // size index 0: no kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL }, // size index 1: no kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { // size index 2: 16-point idct and iadst kernels
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, // size index 3: 32-point idct only
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, // size index 4: 64-point idct only
+ idct64_low32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 2-D inverse transform + reconstruction for transform types without an
+// identity pass. Only handles blocks with w >= 16 and h >= 16.
+//
+// Pass 1 runs the row kernel on each 16-line band of coefficients that can
+// hold non-zero data (derived from eobx/eoby), scales by shift[0] and stores
+// the transposed 16x16 tiles in buf1 (tile order is reversed when lr_flip is
+// set). Pass 2 runs the column kernel on each 16-wide strip of buf1 and
+// scales by shift[1]. Pass 3 hands every strip to
+// lowbd_write_buffer_16xn_avx2 together with ud_flip.
+//
+// NOTE(review): the scale constants are 1 << (15 + shift[k]) stored in 16-bit
+// lanes, so shift[k] is expected to be negative here -- confirm against
+// av1_inv_txfm_shift_ls.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m256i buf1[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int num_strips = width >> 4;
+  const int nonzero_cols = ((eobx + 16) >> 4) << 4;
+  const int nonzero_bands = (eoby + 16) >> 4;
+  const int input_stride = AOMMIN(32, height);
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+  const int needs_rect_scale = (rect_type == 1) || (rect_type == -1);
+
+  const int zeros_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int zeros_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 row_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]]
+                                     [zeros_idx_x];
+  const transform_1d_avx2 col_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]]
+                                     [zeros_idx_y];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Pass 1: row transform, one 16-line band at a time.
+  const __m256i row_scale = _mm256_set1_epi16(1 << (15 + shift[0]));
+  for (int band = 0; band < nonzero_bands; ++band) {
+    __m256i buf0[64];
+    load_buffer_32bit_to_16bit_w16_avx2(input + 16 * band, input_stride, buf0,
+                                        nonzero_cols);
+    if (needs_rect_scale) {
+      round_shift_avx2(buf0, buf0, nonzero_cols);  // rect special code
+    }
+    row_txfm(buf0, buf0);
+    for (int c = 0; c < width; ++c) {
+      buf0[c] = _mm256_mulhrs_epi16(buf0[c], row_scale);
+    }
+
+    // Transpose each 16x16 tile into this band's slot of the column buffer.
+    __m256i *const band_dst = buf1 + (band << 4);
+    for (int s = 0; s < num_strips; ++s) {
+      __m256i *const tile = buf0 + 16 * s;
+      if (lr_flip) {
+        __m256i flipped[16];
+        flip_buf_avx2(tile, flipped, 16);
+        const int offset = height * (num_strips - 1 - s);
+        transpose_16bit_16x16_avx2(flipped, band_dst + offset);
+      } else {
+        transpose_16bit_16x16_avx2(tile, band_dst + height * s);
+      }
+    }
+  }
+
+  // Pass 2: column transform on every 16-wide strip.
+  const __m256i col_scale = _mm256_set1_epi16(1 << (15 + shift[1]));
+  for (int s = 0; s < num_strips; ++s) {
+    __m256i *const strip = buf1 + s * height;
+    col_txfm(strip, strip);
+    for (int r = 0; r < height; ++r) {
+      strip[r] = _mm256_mulhrs_epi16(strip[r], col_scale);
+    }
+  }
+
+  // Pass 3: write every strip out.
+  for (int s = 0; s < num_strips; ++s) {
+    lowbd_write_buffer_16xn_avx2(buf1 + s * height, output + 16 * s, stride,
+                                 ud_flip, height);
+  }
+}
+
+// Row pass of the identity transform for a 16-wide strip.
+// For each of `height` input rows (rows are `stride` int32 elements apart),
+// loads 16 coefficients as 16-bit values, optionally pre-scales them when the
+// block is rectangular with rect_type == 1 or -1, then computes
+//   (src * NewSqrt2list[txw_idx] + rounding) >> (NewSqrt2Bits - shift)
+// in 32-bit lanes and packs the result with signed saturation into out[i].
+// The rounding term combines both halves in one constant:
+//   (1 << (NewSqrt2Bits - 1)) + (1 << (NewSqrt2Bits - shift - 1)).
+static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+                                           int stride, int shift, int height,
+                                           int txw_idx, int rect_type) {
+  const int needs_rect_scale = (rect_type == 1) || (rect_type == -1);
+  const int total_shift = NewSqrt2Bits - shift;
+
+  const __m256i ones = _mm256_set1_epi16(1);
+  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+  const __m256i rounding = _mm256_set1_epi16(
+      (1 << (NewSqrt2Bits - 1)) + (1 << (NewSqrt2Bits - shift - 1)));
+  // Interleaved (scale, rounding) so that madd against (src, 1) yields
+  // src * scale + rounding in every 32-bit lane.
+  const __m256i scale_and_rounding = _mm256_unpacklo_epi16(scale, rounding);
+  // Only applied on the rectangular path.
+  const __m256i rect_scale =
+      _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+
+  const int32_t *row = input;
+  for (int i = 0; i < height; ++i, row += stride) {
+    __m256i src = load_32bit_to_16bit_w16_avx2(row);
+    if (needs_rect_scale) {
+      src = _mm256_mulhrs_epi16(src, rect_scale);
+    }
+    const __m256i prod_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(src, ones),
+                                              scale_and_rounding);
+    const __m256i prod_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(src, ones),
+                                              scale_and_rounding);
+    out[i] = _mm256_packs_epi32(_mm256_srai_epi32(prod_lo, total_shift),
+                                _mm256_srai_epi32(prod_hi, total_shift));
+  }
+}
+
+// Column pass of the identity transform for a 16-wide strip.
+// For each of `height` vectors in buf:
+//   t   = (v * NewSqrt2list[txh_idx] + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits
+//   res = (t + (1 << (-shift - 1))) >> -shift
+// computed in 32-bit lanes, packed with signed saturation and handed to
+// write_recon_w16_avx2 for the current output row; output advances by
+// `stride` bytes per row.
+// NOTE(review): the second rounding step uses -shift as a shift count, so
+// `shift` is expected to be negative -- confirm against the callers' tables.
+static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+                                           __m256i *buf, int shift, int height,
+                                           int txh_idx) {
+  const int final_shift = -shift;
+  const __m256i ones = _mm256_set1_epi16(1);
+  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+  const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+  // Interleaved (scale, rounding) for use with madd against (v, 1).
+  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+  const __m256i final_rounding = _mm256_set1_epi32(1 << (final_shift - 1));
+
+  uint8_t *dst = output;
+  for (int h = 0; h < height; ++h, dst += stride) {
+    const __m256i v = buf[h];
+    __m256i lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(v, ones), scale_coeff);
+    __m256i hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(v, ones), scale_coeff);
+
+    // First rounding shift (the rounding term is already in the product).
+    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
+    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
+
+    // Second rounding shift by -shift.
+    lo = _mm256_srai_epi32(_mm256_add_epi32(lo, final_rounding), final_shift);
+    hi = _mm256_srai_epi32(_mm256_add_epi32(hi, final_rounding), final_shift);
+
+    const __m256i packed = _mm256_packs_epi32(lo, hi);
+    write_recon_w16_avx2(packed, dst);
+  }
+}
+
+// 2-D identity/identity inverse transform + reconstruction.
+// The block is processed as 16x16 tiles; at most the first 32 columns and
+// first 32 rows are visited (both dimensions are clamped to 32). Each tile
+// goes through the identity row pass, an in-place 16x16 transpose and the
+// identity column pass, which also writes to `output`.
+// `eob` is unused.
+static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+                                                  uint8_t *output, int stride,
+                                                  TX_SIZE tx_size,
+                                                  int32_t eob) {
+  (void)eob;
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int col_limit = AOMMIN(32, width);
+  const int row_limit = AOMMIN(32, height);
+  const int input_stride = row_limit;
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+  const int tiles_outer = col_limit >> 4;
+  const int tiles_inner = row_limit >> 4;
+  __m256i tile[32];
+
+  for (int i = 0; i < tiles_outer; ++i) {
+    const int32_t *const src_base = input + i * 16 * input_stride;
+    uint8_t *const dst_base = output + i * 16;
+    for (int j = 0; j < tiles_inner; ++j) {
+      iidentity_row_16xn_avx2(tile, src_base + j * 16, input_stride, shift[0],
+                              16, txw_idx, rect_type);
+      transpose_16bit_16x16_avx2(tile, tile);
+      iidentity_col_16xn_avx2(dst_base + j * 16 * stride, stride, tile,
+                              shift[1], 16, txh_idx);
+    }
+  }
+}
+
+// 2-D inverse transform + reconstruction where the row pass is the identity
+// transform and the column pass is a kernel taken from
+// lowbd_txfm_all_1d_zeros_w16_arr.
+// Only the 16-wide strips and 16-line bands that can hold non-zero
+// coefficients (derived from eobx/eoby) are processed. For each strip the
+// identity row pass and a 16x16 transpose are applied per band, then the
+// column kernel runs over the whole strip and every row is scaled by
+// shift[1] and written out, bottom-up when ud_flip is set.
+static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, height);
+  const int num_strips = (eobx + 16) >> 4;
+  const int num_bands = (eoby + 16) >> 4;
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+
+  const int zeros_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 col_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]]
+                                     [zeros_idx_y];
+
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  const __m256i out_scale = _mm256_set1_epi16(1 << (15 + shift[1]));
+  for (int i = 0; i < num_strips; ++i) {
+    __m256i buf0[64];
+
+    // Identity row pass + transpose, one 16x16 tile per band.
+    for (int j = 0; j < num_bands; ++j) {
+      __m256i *const tile = buf0 + j * 16;
+      const int32_t *const src = input + i * 16 * input_stride + j * 16;
+      iidentity_row_16xn_avx2(tile, src, input_stride, shift[0], 16, txw_idx,
+                              rect_type);
+      transpose_16bit_16x16_avx2(tile, tile);
+    }
+
+    col_txfm(buf0, buf0);
+
+    // Scale and write; the source row index runs backwards when ud_flip is set.
+    uint8_t *const dst = output + (i << 4);
+    for (int j = 0; j < height; ++j) {
+      const int src_row = ud_flip ? (height - 1 - j) : j;
+      const __m256i res = _mm256_mulhrs_epi16(buf0[src_row], out_scale);
+      write_recon_w16_avx2(res, dst + j * stride);
+    }
+  }
+}
+
+// 2-D inverse transform + reconstruction where the row pass is a kernel taken
+// from lowbd_txfm_all_1d_zeros_w16_arr and the column pass is the identity
+// transform.
+// Each 16-line band that can hold non-zero coefficients (derived from eoby)
+// is loaded, optionally pre-scaled for rect_type == 1 or -1, row-transformed
+// and rounded by shift[0]. Its 16x16 tiles are then transposed into buf1
+// (tile order reversed when lr_flip is set) and passed to the identity column
+// pass, which writes to `output`.
+static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m256i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int num_strips = width >> 4;
+  const int num_bands = (eoby + 16) >> 4;
+  const int nonzero_cols = ((eobx + 8) >> 3) << 3;
+  const int input_stride = AOMMIN(32, height);
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+  const int needs_rect_scale = (rect_type == 1) || (rect_type == -1);
+
+  const int zeros_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_avx2 row_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]]
+                                     [zeros_idx_x];
+
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int band = 0; band < num_bands; ++band) {
+    __m256i buf0[64];
+    load_buffer_32bit_to_16bit_w16_avx2(input + band * 16, input_stride, buf0,
+                                        nonzero_cols);
+    if (needs_rect_scale) {
+      round_shift_avx2(buf0, buf0, nonzero_cols);  // rect special code
+    }
+    row_txfm(buf0, buf0);
+    round_shift_16bit_w16_avx2(buf0, width, shift[0]);
+
+    // Transpose every 16x16 tile of the band into buf1.
+    for (int s = 0; s < num_strips; ++s) {
+      __m256i *const tile = buf0 + 16 * s;
+      if (lr_flip) {
+        __m256i flipped[16];
+        flip_buf_avx2(tile, flipped, 16);
+        transpose_16bit_16x16_avx2(flipped, buf1 + 16 * (num_strips - 1 - s));
+      } else {
+        transpose_16bit_16x16_avx2(tile, buf1 + 16 * s);
+      }
+    }
+
+    // Identity column pass writes this band's 16 output rows.
+    uint8_t *const band_dst = output + band * 16 * stride;
+    for (int s = 0; s < num_strips; ++s) {
+      iidentity_col_16xn_avx2(band_dst + s * 16, stride, buf1 + s * 16,
+                              shift[1], 16, txh_idx);
+    }
+  }
+}
+/* Table of 1-D inverse-transform kernels of type transform_1d_ssse3, laid out
+ * as [transform][variant]: row 0 holds the idct8 kernels and row 1 the iadst8
+ * kernels; column 0 is the *_low1 variant and column 1 the full variant.
+ * NOTE(review): the code that indexes this table lies outside this chunk;
+ * presumably it serves the 8x8 path -- confirm.
+ */
+static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
+ { av1_idct8_low1_ssse3, av1_idct8_sse2 },
+ { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
+};
+
+// Loads an 8x8 block of 32-bit coefficients and narrows it to 16 bits.
+// Row r is read with an aligned 256-bit load from in + stride * r, so each
+// row address must be 32-byte aligned. Rows are packed in pairs with signed
+// saturation; the 0xd8 permute undoes the per-lane interleave of the pack so
+// that out[r] ends up holding the eight 16-bit values of row r.
+static INLINE void load_buffer_avx2(const int32_t *in, int stride,
+                                    __m128i *out) {
+  // Fetch all eight rows before anything is stored.
+  __m256i rows[8];
+  for (int r = 0; r < 8; ++r) {
+    rows[r] = _mm256_load_si256((const __m256i *)(in + stride * r));
+  }
+
+  for (int p = 0; p < 4; ++p) {
+    // a0 a1 a2 a3 b0 b1 b2 b3 | a4 a5 a6 a7 b4 b5 b6 b7
+    const __m256i packed = _mm256_packs_epi32(rows[2 * p], rows[2 * p + 1]);
+    // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
+    const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xd8);
+
+    out[2 * p] = _mm256_castsi256_si128(ordered);
+    out[2 * p + 1] = _mm256_extractf128_si256(ordered, 1);
+  }
+}
+
+// Scales eight 8x16-bit rows by 2^bit (via mulhrs with 1 << (15 + bit), which
+// rounds) and writes their 8x8 transpose to out[0..7].
+// Rows are consumed in the order 0..7, or 7..0 when *lr_flip is non-zero, so
+// out[c] holds column c with its elements in that row order.
+// All of in[] is read before out[] is written.
+// NOTE(review): the scale constant lives in a 16-bit lane, so `bit` is
+// expected to be negative -- confirm against the callers.
+static INLINE void round_and_transpose_avx2(const __m128i *const in,
+                                            __m128i *const out, int bit,
+                                            int *lr_flip) {
+  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
+  const int flip = *lr_flip;
+  const int first = flip ? 7 : 0;
+  const int step = flip ? -1 : 1;
+
+  // pair[k]: k-th consumed row in the low lane, (k + 4)-th in the high lane,
+  // already scaled.
+  __m256i pair[4];
+  for (int k = 0; k < 4; ++k) {
+    const int idx = first + k * step;
+    const __m256i joined = _mm256_inserti128_si256(
+        _mm256_castsi128_si256(in[idx]), in[idx + 4 * step], 1);
+    pair[k] = _mm256_mulhrs_epi16(joined, scale);
+  }
+
+  // Interleave 16-bit elements of consecutive rows.
+  const __m256i ab_lo = _mm256_unpacklo_epi16(pair[0], pair[1]);
+  const __m256i ab_hi = _mm256_unpackhi_epi16(pair[0], pair[1]);
+  const __m256i cd_lo = _mm256_unpacklo_epi16(pair[2], pair[3]);
+  const __m256i cd_hi = _mm256_unpackhi_epi16(pair[2], pair[3]);
+
+  // Interleave 32-bit groups, then reorder 64-bit quarters (0xd8) so that
+  // cols[k] holds column 2k in its low lane and column 2k + 1 in its high
+  // lane.
+  const __m256i cols[4] = {
+    _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(ab_lo, cd_lo), 0xd8),
+    _mm256_permute4x64_epi64(_mm256_unpackhi_epi32(ab_lo, cd_lo), 0xd8),
+    _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(ab_hi, cd_hi), 0xd8),
+    _mm256_permute4x64_epi64(_mm256_unpackhi_epi32(ab_hi, cd_hi), 0xd8),
+  };
+
+  for (int k = 0; k < 4; ++k) {
+    out[2 * k] = _mm256_castsi256_si128(cols[k]);
+    out[2 * k + 1] = _mm256_extracti128_si256(cols[k], 1);
+  }
+}
+
+// Rounds eight rows of 16-bit residuals, adds them to an 8x8 block of 8-bit
+// pixels and stores the saturated result back to the same location.
+//
+// in     : 8 rows of 8 x int16 residuals.
+// bit    : shift amount; 1 << (15 + bit) is used as the _mm256_mulhrs_epi16
+//          multiplier, i.e. a rounding multiply by 2^bit.
+// output : top-left pixel of the destination block (read and written).
+// stride : distance in bytes between destination rows.
+// flipud : when nonzero the residual rows are consumed bottom-up (in[7] is
+//          added to destination row 0).
+static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
+                                                       uint8_t *output,
+                                                       int stride, int flipud) {
+  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
+  const __m256i zero = _mm256_setzero_si256();
+  const int step = flipud ? -1 : 1;
+  int src = flipud ? 7 : 0;
+  __m256i recon[4];
+
+  // Each iteration handles two destination rows: row 2k in the low lane and
+  // row 2k + 1 in the high lane. All loads finish before any store below.
+  for (int k = 0; k < 4; ++k, src += 2 * step) {
+    const __m256i residual = _mm256_mulhrs_epi16(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(in[src]), in[src + step],
+                                1),
+        scale);
+    const __m128i top =
+        _mm_loadl_epi64((__m128i const *)(output + (2 * k) * stride));
+    const __m128i bottom =
+        _mm_loadl_epi64((__m128i const *)(output + (2 * k + 1) * stride));
+    // Widen the 8-bit pixels to 16 bits by interleaving with zero.
+    const __m256i pixels = _mm256_unpacklo_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(top), bottom, 1), zero);
+    recon[k] = _mm256_adds_epi16(residual, pixels);
+  }
+
+  // packus saturates to unsigned 8 bits per 128-bit lane, so one packed
+  // register holds rows (r, r + 2) in the low lane and (r + 1, r + 3) in the
+  // high lane.
+  for (int half = 0; half < 2; ++half) {
+    const __m256i packed =
+        _mm256_packus_epi16(recon[2 * half], recon[2 * half + 1]);
+    const __m128i rows_02 = _mm256_castsi256_si128(packed);
+    const __m128i rows_13 = _mm256_extracti128_si256(packed, 1);
+    uint8_t *const dst = output + (4 * half) * stride;
+
+    _mm_storel_epi64((__m128i *)(dst), rows_02);
+    _mm_storel_epi64((__m128i *)(dst + stride), rows_13);
+    _mm_storel_epi64((__m128i *)(dst + 2 * stride),
+                     _mm_unpackhi_epi64(rows_02, rows_02));
+    _mm_storel_epi64((__m128i *)(dst + 3 * stride),
+                     _mm_unpackhi_epi64(rows_13, rows_13));
+  }
+}
+
+// 8x8 inverse transform for tx types that have no identity stage.
+// Sequence: load coefficients, 1-D row transform, round + transpose (with
+// optional left/right flip), 1-D column transform, then round, add to the
+// destination and store (with optional up/down flip).
+// The AVX2 version pays off because several of these operations are combined
+// in the AVX2 helpers.
+static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m128i col_buf[8];
+  const int input_stride = 8;
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  assert(hitx_1d_tab[tx_type] < 2);
+  assert(vitx_1d_tab[tx_type] < 2);
+
+  // Second table index: 0 when eob == 1, 1 otherwise.
+  const int kernel_idx = eob != 1;
+  const transform_1d_ssse3 row_txfm =
+      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][kernel_idx];
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][kernel_idx];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass, performed in place.
+  __m128i row_buf[8];
+  load_buffer_avx2(input, input_stride, row_buf);
+  row_txfm(row_buf, row_buf);
+
+  assert(shift[0] < 0);
+  round_and_transpose_avx2(row_buf, col_buf, shift[0], &lr_flip);
+
+  // Column pass, performed in place, followed by reconstruction.
+  assert(shift[1] < 0);
+  col_txfm(col_buf, col_buf);
+  round_shift_lowbd_write_buffer_avx2(col_buf, shift[1], output, stride,
+                                      ud_flip);
+}
+
+// 8x8 inverse transform dispatcher.
+// Only tx types without an identity stage use the AVX2 kernel. It was observed
+// that an AVX2 version brings no advantage when either direction is an
+// identity transform, so those types are forwarded to the SSSE3 routines.
+static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
+                                          int stride, TX_TYPE tx_type,
+                                          TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+                                                tx_size, eob);
+      return;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+                                                tx_size, eob);
+      return;
+    case IDTX:
+      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+      return;
+    default:
+      lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
+                                            tx_size, eob);
+      return;
+  }
+}
+
+// Dispatcher for 32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16 and 16x64
+// (the caller in this file also routes 16x16 here).
+// Selects the AVX2 kernel by transform class:
+//   - no identity stage            -> lowbd_inv_txfm2d_add_no_identity_avx2
+//   - identity in both directions  -> lowbd_inv_txfm2d_add_idtx_avx2
+//   - V_* types                    -> lowbd_inv_txfm2d_add_h_identity_avx2
+//   - H_* types                    -> lowbd_inv_txfm2d_add_v_identity_avx2
+// Any other value falls back to the SSSE3 implementation.
+// eob is forwarded unchanged to whichever kernel is selected.
+static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:   // ADST in vertical, DCT in horizontal
+    case DCT_ADST:   // DCT in vertical, ADST in horizontal
+    case ADST_ADST:  // ADST in both directions
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+                                            tx_size, eob);
+      break;
+    case IDTX:
+      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+      break;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+                                           tx_size, eob);
+      break;
+    default:
+      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+  }
+}
+
+// Low-bitdepth 2-D inverse transform + add, top-level AVX2 dispatcher.
+// Routing by transform size:
+//   - 8x8                                 -> dedicated 8x8 AVX2 dispatcher
+//   - sizes with a 4-wide/4-tall side,
+//     8x16, 16x8, 8x32, 32x8              -> SSSE3 implementation
+//   - everything else                     -> generic AVX2 dispatcher
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  switch (tx_size) {
+    case TX_8X8:
+      lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
+                                    eob);
+      return;
+    case TX_4X4:
+    case TX_4X8:
+    case TX_8X4:
+    case TX_8X16:
+    case TX_16X8:
+    case TX_4X16:
+    case TX_16X4:
+    case TX_8X32:
+    case TX_32X8:
+      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      return;
+    case TX_16X16:
+    case TX_32X32:
+    case TX_64X64:
+    case TX_16X32:
+    case TX_32X16:
+    case TX_32X64:
+    case TX_64X32:
+    case TX_16X64:
+    case TX_64X16:
+    default:
+      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
+                                         tx_size, eob);
+      return;
+  }
+}
+
+// Inverse transform + add entry point.
+// Lossless blocks go to the C implementation; all others use the AVX2
+// low-bitdepth dispatcher with the type, size and eob taken from txfm_param.
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+  av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, txfm_param->tx_type,
+                                txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 0000000000..a09dea389f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Butterfly for the case where half of the input is zero: one input vector
+// is scaled by two weights.
+//   out0 = mulhrs(in, w0 * 8), out1 = mulhrs(in, w1 * 8)
+// _mm256_mulhrs_epi16 computes round(a * b / 2^15), so each output is the
+// input multiplied by w / 2^12 with rounding.
+// w0, w1 : integer weights; (w * 8) must fit in int16.
+// in     : __m256i of 16 x int16; evaluated once, so `in` may alias out0/out1.
+// out0/1 : lvalues receiving the results.
+// The arguments are parenthesized so that expression arguments expand as
+// intended.
+#define btf_16_w16_0_avx2(w0, w1, in, out0, out1)    \
+  do {                                               \
+    const __m256i _w0 = _mm256_set1_epi16((w0) * 8); \
+    const __m256i _w1 = _mm256_set1_epi16((w1) * 8); \
+    const __m256i _in = (in);                        \
+    out0 = _mm256_mulhrs_epi16(_in, _w0);            \
+    out1 = _mm256_mulhrs_epi16(_in, _w1);            \
+  } while (0)
+
+// Multiplies each of the `size` vectors in `input` by (NewInvSqrt2 * 8) with
+// _mm256_mulhrs_epi16 (rounding high multiply) and writes the results to
+// `output`. `input` and `output` are indexed identically.
+static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+                                    int size) {
+  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
+  int idx = 0;
+  while (idx < size) {
+    output[idx] = _mm256_mulhrs_epi16(input[idx], scale);
+    ++idx;
+  }
+}
+
+// Adds 16 residuals (16-bit, in `res`) to the 16 pixels at `output` and
+// stores the result back, saturated to unsigned 8 bits.
+static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+  const __m128i pred_u8 = _mm_loadu_si128((__m128i const *)(output));
+  const __m256i pred_s16 = _mm256_cvtepu8_epi16(pred_u8);
+  const __m256i sum = _mm256_adds_epi16(pred_s16, res);
+  // packus works per 128-bit lane, so the 16 useful bytes end up in 64-bit
+  // quarters 0 and 2. Immediate 0xA8 (decimal 168) selects quarters 0,2,2,2,
+  // bringing both into the low 128 bits.
+  const __m256i packed = _mm256_packus_epi16(sum, sum);
+  const __m256i gathered = _mm256_permute4x64_epi64(packed, 0xA8);
+  _mm_storeu_si128((__m128i *)(output), _mm256_castsi256_si128(gathered));
+}
+
+// Reconstructs `height` rows of 16 pixels: row i of `output` receives
+// in[i], or in[height - 1 - i] when flipud is nonzero.
+static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+                                                int stride, int flipud,
+                                                int height) {
+  const int step = flipud ? -1 : 1;
+  int src = flipud ? (height - 1) : 0;
+  int i = 0;
+  while (i < height) {
+    write_recon_w16_avx2(in[src], output + i * stride);
+    src += step;
+    ++i;
+  }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 0000000000..79a6064c3e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2904 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5, as written with 4096 standing
+// for 1.0 and 5793 for sqrt(2) (5793 / 4096 ~= 1.4143). Indexed by a TX_SIZES
+// index. Declared const: this is a read-only lookup table.
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
+// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
+
+// 4-point inverse DCT (going by the function name) on vectors of 16-bit
+// lanes. Reads input[0..3] and writes output[0..3].
+// The arithmetic is done by the btf_16_* macros, which are defined outside
+// this chunk.
+// NOTE(review): cos_bit and __rounding are never referenced by name below;
+// presumably btf_16_sse2 expands to code that uses them implicitly - confirm
+// before renaming or removing either local.
+static void idct4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // Weight pairs; each name encodes the sign and cospi index of both halves.
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ // Inputs are gathered in the order 0, 2, 1, 3.
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ // Pairs (x[0], x[3]) and (x[1], x[2]) produce outputs (0, 3) and (1, 2).
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+// Same stage structure as idct4_sse2, but stage 2 uses btf_16_4p_sse2 instead
+// of btf_16_sse2.
+// NOTE(review): the "w4"/"4p" naming suggests only 4 lanes per vector are
+// processed - confirm against the macro definition.
+// NOTE(review): cos_bit and __rounding are never referenced by name below;
+// presumably the btf macro uses them implicitly - confirm before renaming.
+static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ // Inputs are gathered in the order 0, 2, 1, 3.
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+// 8-point variant that reads only input[0].
+// A single btf_16_ssse3 call with weights (cospi[32], cospi[32]) produces two
+// vectors, which are then copied to all eight outputs; stages 1-5 of the full
+// network collapse to this.
+void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // input[0] is captured before any output is written, so output may alias
+  // input.
+  __m128i x[2];
+  x[0] = input[0];
+
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // Outputs 0, 3, 4, 7 take x[0]; outputs 1, 2, 5, 6 take x[1].
+  output[0] = x[0];
+  output[1] = x[1];
+  output[2] = x[1];
+  output[3] = x[0];
+  output[4] = x[0];
+  output[5] = x[1];
+  output[6] = x[1];
+  output[7] = x[0];
+}
+
+// 8-point inverse DCT (going by the function name) on vectors of 16-bit
+// lanes, organised as five stages. Reads input[0..7], writes output[0..7].
+// All inputs are copied into x[] in stage 1 before any output is written.
+// NOTE(review): cos_bit and __rounding are never referenced by name below;
+// presumably btf_16_sse2 expands to code that uses them implicitly - confirm
+// before renaming or removing either local.
+void av1_idct8_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // Weight pairs; each name encodes the sign and cospi index of both halves.
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ // Inputs are gathered in the order 0, 4, 2, 6, 1, 5, 3, 7.
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ // x[i] and x[7 - i] are combined into output[i] and output[7 - i].
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+// Same five-stage structure as av1_idct8_sse2, but every weighted butterfly
+// uses btf_16_4p_sse2 instead of btf_16_sse2.
+// NOTE(review): the "w4"/"4p" naming suggests only 4 lanes per vector are
+// processed - confirm against the macro definition.
+// NOTE(review): cos_bit and __rounding are never referenced by name below;
+// presumably the btf macro uses them implicitly - confirm before renaming.
+static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ // Inputs are gathered in the order 0, 4, 2, 6, 1, 5, 3, 7.
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+// Stage 5 of the 16-point network, applied in place to x[0..15]
+// (x[4] and x[7] are not touched in this stage).
+// NOTE(review): the __rounding and cos_bit parameters are not referenced by
+// name in the body; presumably btf_16_sse2 picks them up implicitly, which
+// would make the parameter names part of the contract - confirm.
+static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+}
+
+// Stage 6 of the 16-point network, applied in place: add/sub butterflies on
+// x[0..7] (pairs i, 7 - i) and weighted butterflies on (x[10], x[13]) and
+// (x[11], x[12]).
+// NOTE(review): the __rounding and cos_bit parameters are not referenced by
+// name in the body; presumably btf_16_sse2 picks them up implicitly - confirm.
+static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+// Final stage of the 16-point network: for i = 0..7, x[i] and x[15 - i] are
+// combined by btf_16_adds_subs_out_sse2 into output[i] and output[15 - i].
+// x is only read here; results go to output.
+static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+ btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+ btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+ btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+ btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+ btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+ btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+ btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+}
+
+// 16-point variant that reads only input[0].
+// A single btf_16_ssse3 call with weights (cospi[32], cospi[32]) produces two
+// vectors, which are then copied to all sixteen outputs; stages 1-7 of the
+// full network collapse to this.
+static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // input[0] is captured before any output is written, so output may alias
+  // input.
+  __m128i x[2];
+  x[0] = input[0];
+
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // Outputs 0, 3, 4, 7, 8, 11, 12, 15 take x[0]; the others take x[1].
+  output[0] = x[0];
+  output[1] = x[1];
+  output[2] = x[1];
+  output[3] = x[0];
+  output[4] = x[0];
+  output[5] = x[1];
+  output[6] = x[1];
+  output[7] = x[0];
+  output[8] = x[0];
+  output[9] = x[1];
+  output[10] = x[1];
+  output[11] = x[0];
+  output[12] = x[0];
+  output[13] = x[1];
+  output[14] = x[1];
+  output[15] = x[0];
+}
+
+// 16-point variant that reads only input[0..7].
+// Stage 1 fills the even slots of x[]; the odd slots are first written as the
+// trailing arguments of the btf_16_ssse3 calls in stages 2-4 (presumably the
+// macro's outputs - confirm against its definition). Stages 5-7 are shared
+// with the full 16-point transform.
+// cos_bit and __rounding are passed explicitly to the stage 5/6 helpers.
+// NOTE(review): btf_16_sse2 in stage 4 presumably also uses them implicitly.
+static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ // Only even slots are loaded: x[0,2,4,...,14] = input[0,4,2,6,1,5,3,7].
+ __m128i x[16];
+ x[0] = input[0];
+ x[2] = input[4];
+ x[4] = input[2];
+ x[6] = input[6];
+ x[8] = input[1];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+// Full 16-point inverse DCT (going by the function name) on vectors of 16-bit
+// lanes. Reads input[0..15], writes output[0..15]. Stages 1-4 are inline;
+// stages 5-7 are delegated to the idct16_stage*_sse2 helpers.
+// All inputs are copied into x[] in stage 1 before any output is written.
+// NOTE(review): btf_16_sse2 presumably uses cos_bit and __rounding
+// implicitly - confirm before renaming either local.
+static void idct16_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // Weight pairs; each name encodes the sign and cospi index of both halves.
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ // Inputs are gathered in the order
+ // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+// Same seven-stage structure as idct16_sse2, but every weighted butterfly
+// uses btf_16_4p_sse2. Stages 5 and 6 are written out inline here instead of
+// calling idct16_stage5_sse2 / idct16_stage6_sse2, because those helpers use
+// btf_16_sse2. Stage 7 has no weighted butterfly and is shared.
+// NOTE(review): the "w4"/"4p" naming suggests only 4 lanes per vector are
+// processed - confirm against the macro definition.
+// NOTE(review): cos_bit and __rounding are never referenced by name below;
+// presumably the btf macro uses them implicitly - confirm before renaming.
+static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ // Inputs are gathered in the order
+ // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+ // stage 7
+ idct16_stage7_sse2(output, x);
+}
+
+// Stage 3 of the 32-point network for the upper 16 entries: in-place
+// add/sub butterflies on adjacent pairs of x[16..31], alternating between
+// btf_16_adds_subs_sse2(x[k], x[k + 1]) and
+// btf_16_subs_adds_sse2(x[k + 3], x[k + 2]).
+static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+}
+
+// Stage 4 of the 32-point network for the upper 16 entries: in-place weighted
+// butterflies on (x[17], x[30]), (x[18], x[29]), (x[21], x[26]) and
+// (x[22], x[25]).
+// NOTE(review): the __rounding and cos_bit parameters are not referenced by
+// name in the body; presumably btf_16_sse2 picks them up implicitly - confirm.
+static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+// Stage 5 of the 32-point network for entries 8..31 (entries 0..7 are not
+// touched): weighted butterflies on (x[9], x[14]) and (x[10], x[13]), then
+// add/sub butterflies within x[16..31]. All updates are in place.
+// NOTE(review): the __rounding and cos_bit parameters are not referenced by
+// name in the body; presumably btf_16_sse2 picks them up implicitly - confirm.
+static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+// Stage 6 of the 32-point inverse DCT for everything above x[0..3].
+// Touches x[5], x[6], x[8..15], x[18..21] and x[26..29]; all results are
+// written back in place.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i wt_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i wt_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i wt_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i wt_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i wt_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  int i;
+
+  btf_16_sse2(wt_m32_p32, wt_p32_p32, x[5], x[6], x[5], x[6]);
+
+  btf_16_adds_subs_sse2(x[8], x[11]);
+  btf_16_adds_subs_sse2(x[9], x[10]);
+  btf_16_subs_adds_sse2(x[15], x[12]);
+  btf_16_subs_adds_sse2(x[14], x[13]);
+
+  // Pairs (18,29) and (19,28) share one weight set ...
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m16_p48, wt_p48_p16, x[18 + i], x[29 - i], x[18 + i],
+                x[29 - i]);
+  }
+  // ... and pairs (20,27) and (21,26) share the other.
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m48_m16, wt_m16_p48, x[20 + i], x[27 - i], x[20 + i],
+                x[27 - i]);
+  }
+}
+
+// Stage 7 of the 32-point inverse DCT over the full work array x[0..31].
+// x[0..7] and x[16..31] go through the add/sub macros with mirrored index
+// pairs; (10,13) and (11,12) go through btf_16_sse2 with cospi[32] weights.
+// x[8], x[9], x[14] and x[15] are left as they are.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i wt_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i wt_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  int i;
+
+  // (0,7), (1,6), (2,5), (3,4)
+  for (i = 0; i < 4; ++i) {
+    btf_16_adds_subs_sse2(x[i], x[7 - i]);
+  }
+  // (10,13), (11,12)
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m32_p32, wt_p32_p32, x[10 + i], x[13 - i], x[10 + i],
+                x[13 - i]);
+  }
+  // (16,23), (17,22), (18,21), (19,20)
+  for (i = 0; i < 4; ++i) {
+    btf_16_adds_subs_sse2(x[16 + i], x[23 - i]);
+  }
+  // (31,24), (30,25), (29,26), (28,27)
+  for (i = 0; i < 4; ++i) {
+    btf_16_subs_adds_sse2(x[31 - i], x[24 + i]);
+  }
+}
+
+// Stage 8 of the 32-point inverse DCT. x[0..15] is folded with mirrored
+// add/sub pairs (i, 15 - i); the pairs (20,27) .. (23,24) go through
+// btf_16_sse2 with cospi[32] weights. Everything is written back in place.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i wt_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i wt_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    btf_16_adds_subs_sse2(x[i], x[15 - i]);
+  }
+  for (i = 0; i < 4; ++i) {
+    btf_16_sse2(wt_m32_p32, wt_p32_p32, x[20 + i], x[27 - i], x[20 + i],
+                x[27 - i]);
+  }
+}
+
+// Final stage of the 32-point inverse DCT: for i = 0..15 combines x[i] with
+// its mirror x[31 - i] via btf_16_adds_subs_out_sse2 and stores the two
+// results in output[i] and output[31 - i].
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    btf_16_adds_subs_out_sse2(output[i], output[31 - i], x[i], x[31 - i]);
+  }
+}
+
+// 32-point inverse DCT variant that reads only input[0].
+// A single btf_16_ssse3 call with cospi[32] for both weights turns input[0]
+// into x[0] and x[1]; stages 2-4 and 6-9 need no arithmetic here, so the two
+// values are simply fanned out to all 32 outputs.
+static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  __m128i x[2];
+  int k;
+
+  // stage 1
+  x[0] = input[0];
+
+  // stage 5 (the only stage that computes anything)
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 9: output[k] and its mirror output[31 - k] receive the same value.
+  // Over k = 0, 1, 2, 3, ... the source index follows 0, 1, 1, 0, 0, 1, 1, 0.
+  for (k = 0; k < 16; ++k) {
+    const int src = ((k + 1) >> 1) & 1;
+    output[k] = x[src];
+    output[31 - k] = x[src];
+  }
+}
+
+static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {  // 32-point inverse DCT variant that reads only input[0..7] and writes output[0..31].
+ const int8_t cos_bit = INV_COS_BIT;  // forwarded to the shared stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);  // weight table indexed by the stages below
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // 1 << (INV_COS_BIT - 1) replicated in every 32-bit lane
+
+ // stage 1: place the eight inputs; every other slot of x is written by a later stage before it is read
+ __m128i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2: each call takes one populated slot (3rd argument) and writes two slots (4th and 5th arguments)
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3: two single-input rotations, then plain copies into the neighbouring slots of x[16..31]
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];  // presumably the add/sub partner is zero for 8 inputs, so a copy suffices -- confirm
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4: one rotation, copies that fill x[9], x[10], x[13], x[14], then the shared x[16..31] helper
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: produces x[0], x[1] from x[0]; copies fill x[5], x[6]
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+ // stage 6: copies fill x[2], x[3]; from here on all 32 slots are populated
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);  // stage 7
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);  // stage 8
+ idct32_stage9_sse2(output, x);  // stage 9: writes output[0..31]
+}
+
+static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {  // 32-point inverse DCT variant that reads only input[0..15] and writes output[0..31].
+ const int8_t cos_bit = INV_COS_BIT;  // forwarded to the shared stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);  // weight table indexed by the stages below
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // 1 << (INV_COS_BIT - 1) replicated in every 32-bit lane
+
+ // stage 1: the sixteen inputs land in the even-numbered slots; odd-numbered slots are produced by stages 2-5
+ __m128i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2: each call takes one populated slot (3rd argument) and writes two slots; fills all of x[16..31]
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3: single-input rotations fill x[8..15]; x[16..31] goes through the shared helper
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4: single-input rotations fill x[4..7]; add/sub on x[8..15]; shared helper for x[16..31]
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: single-input rotations fill x[0..3]; add/sub on x[4..7]; shared helper for the rest
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ btf_16_adds_subs_sse2(x[0], x[3]);  // stage 6 starts here (same three statements as stage 6 of idct32_sse2)
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);  // stage 7
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);  // stage 8
+ idct32_stage9_sse2(output, x);  // stage 9: writes output[0..31]
+}
+
+static void idct32_sse2(const __m128i *input, __m128i *output) {  // Full 32-point inverse DCT: reads input[0..31], writes output[0..31].
+ const int8_t cos_bit = INV_COS_BIT;  // forwarded to the shared stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);  // weight table used to build the pairs below
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // 1 << (INV_COS_BIT - 1) replicated in every 32-bit lane
+
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);  // weight pairs for stage 2 (through cospi_p58_p06)
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);  // weight pairs for stage 3 (through cospi_p52_p12)
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);  // weight pairs for stage 4 (through cospi_p40_p24)
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);  // weight pairs for stage 5 (through cospi_p16_p48)
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1: load the inputs in 5-bit bit-reversed index order (x[1] = input[16], x[2] = input[8], ...)
+ __m128i x[32];
+ x[0] = input[0];
+ x[1] = input[16];
+ x[2] = input[8];
+ x[3] = input[24];
+ x[4] = input[4];
+ x[5] = input[20];
+ x[6] = input[12];
+ x[7] = input[28];
+ x[8] = input[2];
+ x[9] = input[18];
+ x[10] = input[10];
+ x[11] = input[26];
+ x[12] = input[6];
+ x[13] = input[22];
+ x[14] = input[14];
+ x[15] = input[30];
+ x[16] = input[1];
+ x[17] = input[17];
+ x[18] = input[9];
+ x[19] = input[25];
+ x[20] = input[5];
+ x[21] = input[21];
+ x[22] = input[13];
+ x[23] = input[29];
+ x[24] = input[3];
+ x[25] = input[19];
+ x[26] = input[11];
+ x[27] = input[27];
+ x[28] = input[7];
+ x[29] = input[23];
+ x[30] = input[15];
+ x[31] = input[31];
+
+ // stage 2: in-place rotations on the mirrored pairs (16,31) .. (23,24)
+ btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+ btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+ // stage 3: in-place rotations on the mirrored pairs (8,15) .. (11,12); shared helper for x[16..31]
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4: rotations on (4,7) and (5,6); add/sub on x[8..15]; shared helper for x[16..31]
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: rotations on (0,1) and (2,3); add/sub on x[4..7]; shared helper for the rest
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_adds_subs_sse2(x[7], x[6]);  // NOTE(review): idct32_low16_ssse3 uses btf_16_subs_adds_sse2 for this same (x[7], x[6]) pair -- confirm the two macros are interchangeable here
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6: add/sub on x[0..3]; shared helper for the rest
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7~9: shared helpers; stage 9 writes output[0..31]
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+// Stage 4 of the 64-point inverse DCT for work-array entries x[32..63].
+// Eight in-place btf_16_sse2 calls, two per weight family; only the pairs
+// (33,62), (34,61), (37,58), (38,57), (41,54), (42,53), (45,50) and (46,49)
+// are touched.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  {
+    // cospi[4] / cospi[60] family.
+    const __m128i wt_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+    const __m128i wt_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+    const __m128i wt_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+    btf_16_sse2(wt_m04_p60, wt_p60_p04, x[33], x[62], x[33], x[62]);
+    btf_16_sse2(wt_m60_m04, wt_m04_p60, x[34], x[61], x[34], x[61]);
+  }
+  {
+    // cospi[28] / cospi[36] family.
+    const __m128i wt_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+    const __m128i wt_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+    const __m128i wt_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+    btf_16_sse2(wt_m36_p28, wt_p28_p36, x[37], x[58], x[37], x[58]);
+    btf_16_sse2(wt_m28_m36, wt_m36_p28, x[38], x[57], x[38], x[57]);
+  }
+  {
+    // cospi[20] / cospi[44] family.
+    const __m128i wt_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+    const __m128i wt_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+    const __m128i wt_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+    btf_16_sse2(wt_m20_p44, wt_p44_p20, x[41], x[54], x[41], x[54]);
+    btf_16_sse2(wt_m44_m20, wt_m20_p44, x[42], x[53], x[42], x[53]);
+  }
+  {
+    // cospi[12] / cospi[52] family.
+    const __m128i wt_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+    const __m128i wt_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+    const __m128i wt_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+    btf_16_sse2(wt_m52_p12, wt_p12_p52, x[45], x[50], x[45], x[50]);
+    btf_16_sse2(wt_m12_m52, wt_m52_p12, x[46], x[49], x[46], x[49]);
+  }
+}
+
+// Stage 5 of the 64-point inverse DCT for the upper part of the work array.
+// Applies btf_16_sse2 in place to the pairs (17,30), (18,29), (21,26) and
+// (22,25), then runs the add/sub macros over x[32..63] in four groups of
+// eight.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i wt_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i wt_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i wt_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  const __m128i wt_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i wt_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  const __m128i wt_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  int base;
+
+  btf_16_sse2(wt_m08_p56, wt_p56_p08, x[17], x[30], x[17], x[30]);
+  btf_16_sse2(wt_m56_m08, wt_m08_p56, x[18], x[29], x[18], x[29]);
+  btf_16_sse2(wt_m40_p24, wt_p24_p40, x[21], x[26], x[21], x[26]);
+  btf_16_sse2(wt_m24_m40, wt_m40_p24, x[22], x[25], x[22], x[25]);
+
+  // Groups x[32..39], x[40..47], x[48..55], x[56..63].
+  for (base = 32; base < 64; base += 8) {
+    btf_16_adds_subs_sse2(x[base], x[base + 3]);
+    btf_16_adds_subs_sse2(x[base + 1], x[base + 2]);
+    btf_16_subs_adds_sse2(x[base + 7], x[base + 4]);
+    btf_16_subs_adds_sse2(x[base + 6], x[base + 5]);
+  }
+}
+
+// Stage 6 of the 64-point inverse DCT for work-array entries x[32..63].
+// Eight in-place btf_16_sse2 calls; consecutive pairs share a weight set:
+// (34,61)/(35,60), (36,59)/(37,58), (42,53)/(43,52), (44,51)/(45,50).
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i wt_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  const __m128i wt_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  const __m128i wt_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+  const __m128i wt_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+  const __m128i wt_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  const __m128i wt_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m08_p56, wt_p56_p08, x[34 + i], x[61 - i], x[34 + i],
+                x[61 - i]);
+  }
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m56_m08, wt_m08_p56, x[36 + i], x[59 - i], x[36 + i],
+                x[59 - i]);
+  }
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m40_p24, wt_p24_p40, x[42 + i], x[53 - i], x[42 + i],
+                x[53 - i]);
+  }
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m24_m40, wt_m40_p24, x[44 + i], x[51 - i], x[44 + i],
+                x[51 - i]);
+  }
+}
+
+// Stage 6 of the 64-point inverse DCT for work-array entries x[16..63]:
+// add/sub macros over x[16..31] in two groups of eight, followed by
+// idct64_stage6_high32_sse2 for x[32..63].
+static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  int base;
+  // Groups x[16..23] and x[24..31].
+  for (base = 16; base < 32; base += 8) {
+    btf_16_adds_subs_sse2(x[base], x[base + 3]);
+    btf_16_adds_subs_sse2(x[base + 1], x[base + 2]);
+    btf_16_subs_adds_sse2(x[base + 7], x[base + 4]);
+    btf_16_subs_adds_sse2(x[base + 6], x[base + 5]);
+  }
+  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+// Stage 7 of the 64-point inverse DCT for work-array entries x[16..63].
+// In-place btf_16_sse2 on the pairs (18,29), (19,28), (20,27), (21,26), then
+// add/sub macros over x[32..63] in two groups of sixteen with mirrored
+// index pairs.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i wt_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i wt_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i wt_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  int base;
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m16_p48, wt_p48_p16, x[18 + i], x[29 - i], x[18 + i],
+                x[29 - i]);
+  }
+  for (i = 0; i < 2; ++i) {
+    btf_16_sse2(wt_m48_m16, wt_m16_p48, x[20 + i], x[27 - i], x[20 + i],
+                x[27 - i]);
+  }
+
+  // Groups x[32..47] and x[48..63]: the lower eight as (base + i,
+  // base + 7 - i), then the upper eight as (base + 15 - i, base + 8 + i).
+  for (base = 32; base < 64; base += 16) {
+    for (i = 0; i < 4; ++i) {
+      btf_16_adds_subs_sse2(x[base + i], x[base + 7 - i]);
+    }
+    for (i = 0; i < 4; ++i) {
+      btf_16_subs_adds_sse2(x[base + 15 - i], x[base + 8 + i]);
+    }
+  }
+}
+
+// Stage 8 of the 64-point inverse DCT for work-array entries x[16..63].
+// Add/sub macros over x[16..31] with mirrored index pairs, then in-place
+// btf_16_sse2 on the pairs (36,59) .. (39,56) and (40,55) .. (43,52).
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+                                             const __m128i __rounding,
+                                             int8_t cos_bit) {
+  const __m128i wt_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  const __m128i wt_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  const __m128i wt_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+  int i;
+
+  // (16,23), (17,22), (18,21), (19,20)
+  for (i = 0; i < 4; ++i) {
+    btf_16_adds_subs_sse2(x[16 + i], x[23 - i]);
+  }
+  // (31,24), (30,25), (29,26), (28,27)
+  for (i = 0; i < 4; ++i) {
+    btf_16_subs_adds_sse2(x[31 - i], x[24 + i]);
+  }
+  // (36,59), (37,58), (38,57), (39,56)
+  for (i = 0; i < 4; ++i) {
+    btf_16_sse2(wt_m16_p48, wt_p48_p16, x[36 + i], x[59 - i], x[36 + i],
+                x[59 - i]);
+  }
+  // (40,55), (41,54), (42,53), (43,52)
+  for (i = 0; i < 4; ++i) {
+    btf_16_sse2(wt_m48_m16, wt_m16_p48, x[40 + i], x[55 - i], x[40 + i],
+                x[55 - i]);
+  }
+}
+
+// Stage 9 of the 64-point inverse DCT over the work array x[0..63].
+// x[0..15] and x[32..63] go through the add/sub macros with mirrored index
+// pairs; the pairs (20,27) .. (23,24) go through btf_16_sse2 with cospi[32]
+// weights. x[16..19] and x[28..31] are left as they are.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+                                      const __m128i __rounding,
+                                      int8_t cos_bit) {
+  const __m128i wt_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i wt_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  int i;
+
+  // (0,15) .. (7,8)
+  for (i = 0; i < 8; ++i) {
+    btf_16_adds_subs_sse2(x[i], x[15 - i]);
+  }
+  // (20,27) .. (23,24)
+  for (i = 0; i < 4; ++i) {
+    btf_16_sse2(wt_m32_p32, wt_p32_p32, x[20 + i], x[27 - i], x[20 + i],
+                x[27 - i]);
+  }
+  // (32,47) .. (39,40)
+  for (i = 0; i < 8; ++i) {
+    btf_16_adds_subs_sse2(x[32 + i], x[47 - i]);
+  }
+  // (63,48) .. (56,55)
+  for (i = 0; i < 8; ++i) {
+    btf_16_subs_adds_sse2(x[63 - i], x[48 + i]);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT. x[0..31] is folded with mirrored
+// add/sub pairs (i, 31 - i); the pairs (40,55) .. (47,48) go through
+// btf_16_sse2 with cospi[32] weights. Everything is written back in place.
+// __rounding and cos_bit are not named in this body; presumably the
+// btf_16_sse2 macro (defined outside this chunk) picks them up -- confirm.
+static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+                                       const __m128i __rounding,
+                                       int8_t cos_bit) {
+  const __m128i wt_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  const __m128i wt_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    btf_16_adds_subs_sse2(x[i], x[31 - i]);
+  }
+  for (i = 0; i < 8; ++i) {
+    btf_16_sse2(wt_m32_p32, wt_p32_p32, x[40 + i], x[55 - i], x[40 + i],
+                x[55 - i]);
+  }
+}
+
+// Final stage of the 64-point inverse DCT: for i = 0..31 combines x[i] with
+// its mirror x[63 - i] via btf_16_adds_subs_out_sse2 and stores the two
+// results in output[i] and output[63 - i].
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+  int i;
+  for (i = 0; i < 32; ++i) {
+    btf_16_adds_subs_out_sse2(output[i], output[63 - i], x[i], x[63 - i]);
+  }
+}
+
+// 64-point inverse DCT variant that reads only input[0].
+// A single btf_16_ssse3 call with cospi[32] for both weights turns input[0]
+// into x[0] and x[1]; no other stage computes anything here, so the two
+// values are fanned out to all 64 outputs.
+//
+// Only two scratch vectors are ever used, so the scratch array holds two
+// entries (matching idct32_low1_ssse3) instead of 32.
+static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  __m128i x[2];
+  int k;
+
+  // stage 1
+  x[0] = input[0];
+
+  // stage 2-5: nothing to compute
+  // stage 6
+  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+  // stage 7-11: output[k] and its mirror output[63 - k] receive the same
+  // value. Over k = 0, 1, 2, 3, ... the source index follows
+  // 0, 1, 1, 0, 0, 1, 1, 0, ...
+  for (k = 0; k < 32; ++k) {
+    const int src = ((k + 1) >> 1) & 1;
+    output[k] = x[src];
+    output[63 - k] = x[src];
+  }
+}
+
+static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {  // 64-point inverse DCT variant that reads only input[0..7] and writes output[0..63].
+ const int8_t cos_bit = INV_COS_BIT;  // forwarded to the shared stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);  // weight table indexed by the stages below
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // 1 << (INV_COS_BIT - 1) replicated in every 32-bit lane
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);  // weight pairs for stage 4 (through cospi_m12_m52)
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);  // weight pairs for stage 5 (through cospi_m24_m40)
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);  // used with cospi_m32_p32 in stage 8
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);  // used with cospi_p48_p16 in stage 6
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: place the eight inputs in every eighth slot; the remaining slots are written by stages 2-8 below
+ __m128i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2: each call takes one populated slot (3rd argument) and writes two slots (4th and 5th arguments)
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: two single-input rotations, then plain copies into the neighbouring slots of x[32..63]
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];  // presumably the add/sub partner is zero for 8 inputs, so a copy suffices -- confirm
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4: one single-input rotation, copies within x[16..31], then in-place rotations on the slots copied in stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5: copies fill x[9], x[14]; rotations on (17,30) and (22,25); copies fill the rest of x[32..63]
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6: x[0], x[1] from x[0]; rotation on (9,14); copies fill the rest of x[16..31]; shared helper for x[32..63]
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7: copies fill x[2], x[3] and x[10..13]; shared helper for x[16..63]
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8: copies fill x[4..7]; rotations on (10,13) and (11,12); shared helper for x[16..63]
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);  // stage 9
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);  // stage 10
+ idct64_stage11_sse2(output, x);  // stage 11: writes output[0..63]
+}
+
+static void idct64_low16_ssse3(const __m128i *input, __m128i *output) { /*
+ * 64-point inverse DCT that reads only input[0]..input[15].
+ * The remaining x[] slots are never loaded from input; each of them is
+ * produced later by a single-source btf_16_ssse3 call or by a plain copy.
+ * Presumably this is the fast path for blocks whose coefficients 16..63 are
+ * all zero -- TODO confirm against the code that selects this function.
+ * Parts of stages 4-8 and all of stages 9-11 are delegated to the
+ * idct64_stage* helpers; idct64_stage11_sse2() writes the result to output.
+ * NOTE(review): the btf_16_sse2 calls do not receive cos_bit/__rounding as
+ * arguments; presumably the macro picks them up by name, so do not rename
+ * these locals without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: the 16 inputs are scattered into every 4th slot of x[]
+ __m128i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2: each call turns one loaded slot of x[32..63] into two slots
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same expansion for x[16..31]; x[32..63] gaps are filled by copies
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4: expand x[8..15], fill x[16..31] gaps by copy, helper handles x[32..63]
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: expand x[4] into x[4]/x[7], fill x[8..15] gaps by copy
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6: expand x[0] into x[0]/x[1], copy into x[5]/x[6], rotate x[9..14] pairs
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7: x[2]/x[3] are copies of x[1]/x[0]; from here on all of x[0..15] is populated
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);  // stage 9~11 are fully delegated
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_ssse3(const __m128i *input, __m128i *output) { /*
+ * 64-point inverse DCT that reads only input[0]..input[31].
+ * Stage 1 stores the inputs in the even-indexed slots of x[]; the odd slots
+ * are filled in by the single-source btf_16_ssse3 calls of stages 2-6.
+ * Presumably this is the fast path for blocks whose coefficients 32..63 are
+ * all zero -- TODO confirm against the code that selects this function.
+ * Parts of stages 4-8 and all of stages 9-11 are delegated to the
+ * idct64_stage* helpers; idct64_stage11_sse2() writes the result to output.
+ * NOTE(review): the btf_16_sse2 calls do not receive cos_bit/__rounding as
+ * arguments; presumably the macro picks them up by name, so do not rename
+ * these locals without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: the 32 inputs are scattered into the even slots of x[]
+ __m128i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2: each call turns one even slot of x[32..63] into an (even, odd) pair
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same expansion for x[16..31], then add/sub butterflies on x[32..63]
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[33]);
+ btf_16_subs_adds_sse2(x[35], x[34]);
+ btf_16_adds_subs_sse2(x[36], x[37]);
+ btf_16_subs_adds_sse2(x[39], x[38]);
+ btf_16_adds_subs_sse2(x[40], x[41]);
+ btf_16_subs_adds_sse2(x[43], x[42]);
+ btf_16_adds_subs_sse2(x[44], x[45]);
+ btf_16_subs_adds_sse2(x[47], x[46]);
+ btf_16_adds_subs_sse2(x[48], x[49]);
+ btf_16_subs_adds_sse2(x[51], x[50]);
+ btf_16_adds_subs_sse2(x[52], x[53]);
+ btf_16_subs_adds_sse2(x[55], x[54]);
+ btf_16_adds_subs_sse2(x[56], x[57]);
+ btf_16_subs_adds_sse2(x[59], x[58]);
+ btf_16_adds_subs_sse2(x[60], x[61]);
+ btf_16_subs_adds_sse2(x[63], x[62]);
+
+ // stage 4: expand x[8..15], add/sub butterflies on x[16..31], helper handles x[32..63]
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: expand x[4..7], add/sub butterflies on x[8..15]
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6: expand x[0] and x[2] (last unset slots x[1], x[3] are written here)
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11: fully delegated; stage 11 writes output[]
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+// 4-point inverse ADST over four rows of eight 16-bit lanes.
+//
+// With x0..x3 = input[0..3] and sinN = sinpi_arr(INV_COS_BIT)[N], each
+// output row is computed per lane as
+//   output[0] = x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+//   output[1] = x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+//   output[2] = x0*sin3 - x2*sin3 + x3*sin3
+//   output[3] = x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+// The 32-bit sums get a rounding bias of 1 << (INV_COS_BIT - 1), are shifted
+// right by INV_COS_BIT and are packed back to 16 bits with signed saturation.
+// All four input rows are read before the first output row is written.
+static void iadst4_sse2(const __m128i *input, __m128i *output) {
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+
+  // Interleave rows (0, 2) and rows (1, 3) so that a single _mm_madd_epi16
+  // produces a two-term dot product in every 32-bit lane.
+  const __m128i rows02_lo = _mm_unpacklo_epi16(input[0], input[2]);
+  const __m128i rows02_hi = _mm_unpackhi_epi16(input[0], input[2]);
+  const __m128i rows13_lo = _mm_unpacklo_epi16(input[1], input[3]);
+  const __m128i rows13_hi = _mm_unpackhi_epi16(input[1], input[3]);
+
+  // Weights for the (x0, x2) interleave, one entry per output row.
+  const __m128i weights02[4] = {
+    pair_set_epi16(sinpi[1], sinpi[4]),   //  x0*sin1 + x2*sin4
+    pair_set_epi16(sinpi[2], -sinpi[1]),  //  x0*sin2 - x2*sin1
+    pair_set_epi16(sinpi[3], -sinpi[3]),  //  x0*sin3 - x2*sin3
+    pair_set_epi16(sinpi[4], sinpi[2]),   //  x0*sin4 + x2*sin2
+  };
+  // Weights for the (x1, x3) interleave, one entry per output row.
+  const __m128i weights13[4] = {
+    pair_set_epi16(sinpi[3], sinpi[2]),    //  x1*sin3 + x3*sin2
+    pair_set_epi16(sinpi[3], -sinpi[4]),   //  x1*sin3 - x3*sin4
+    pair_set_epi16(0, sinpi[3]),           //            x3*sin3
+    pair_set_epi16(-sinpi[3], -sinpi[1]),  // -x1*sin3 - x3*sin1
+  };
+
+  const __m128i half = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  for (int row = 0; row < 4; ++row) {
+    // Low and high four lanes are accumulated separately in 32 bits.
+    __m128i sum_lo = _mm_add_epi32(_mm_madd_epi16(rows02_lo, weights02[row]),
+                                   _mm_madd_epi16(rows13_lo, weights13[row]));
+    __m128i sum_hi = _mm_add_epi32(_mm_madd_epi16(rows02_hi, weights02[row]),
+                                   _mm_madd_epi16(rows13_hi, weights13[row]));
+    sum_lo = _mm_srai_epi32(_mm_add_epi32(sum_lo, half), INV_COS_BIT);
+    sum_hi = _mm_srai_epi32(_mm_add_epi32(sum_hi, half), INV_COS_BIT);
+    output[row] = _mm_packs_epi32(sum_lo, sum_hi);
+  }
+}
+
+// 4-point inverse ADST that only processes the low four 16-bit lanes of
+// each input row.
+//
+// With x0..x3 = input[0..3] and sinN = sinpi_arr(INV_COS_BIT)[N]:
+//   output[0] = x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+//   output[1] = x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+//   output[2] = x0*sin3 - x2*sin3 + x3*sin3
+//   output[3] = x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+// Each 32-bit sum is rounded, shifted right by INV_COS_BIT and packed with
+// signed saturation; the same four results are packed into both halves of
+// the output register. All inputs are read before any output is written.
+static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+
+  // Low halves of rows (0, 2) and rows (1, 3), interleaved for _mm_madd_epi16.
+  const __m128i rows02 = _mm_unpacklo_epi16(input[0], input[2]);
+  const __m128i rows13 = _mm_unpacklo_epi16(input[1], input[3]);
+
+  // Weights for the (x0, x2) interleave, one entry per output row.
+  const __m128i weights02[4] = {
+    pair_set_epi16(sinpi[1], sinpi[4]),   //  x0*sin1 + x2*sin4
+    pair_set_epi16(sinpi[2], -sinpi[1]),  //  x0*sin2 - x2*sin1
+    pair_set_epi16(sinpi[3], -sinpi[3]),  //  x0*sin3 - x2*sin3
+    pair_set_epi16(sinpi[4], sinpi[2]),   //  x0*sin4 + x2*sin2
+  };
+  // Weights for the (x1, x3) interleave, one entry per output row.
+  const __m128i weights13[4] = {
+    pair_set_epi16(sinpi[3], sinpi[2]),    //  x1*sin3 + x3*sin2
+    pair_set_epi16(sinpi[3], -sinpi[4]),   //  x1*sin3 - x3*sin4
+    pair_set_epi16(0, sinpi[3]),           //            x3*sin3
+    pair_set_epi16(-sinpi[3], -sinpi[1]),  // -x1*sin3 - x3*sin1
+  };
+
+  const __m128i half = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+  for (int row = 0; row < 4; ++row) {
+    __m128i sum = _mm_add_epi32(_mm_madd_epi16(rows02, weights02[row]),
+                                _mm_madd_epi16(rows13, weights13[row]));
+    sum = _mm_srai_epi32(_mm_add_epi32(sum, half), INV_COS_BIT);
+    output[row] = _mm_packs_epi32(sum, sum);
+  }
+}
+
+void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) { /*
+ * 8-point inverse ADST that reads only input[0] and writes output[0..7].
+ * Stages 3 and 5, which are add/sub butterflies in av1_iadst8_sse2, are plain
+ * copies here. Presumably intended for blocks where input[1..7] are zero --
+ * TODO confirm against the code that selects this function.
+ * NOTE(review): cos_bit and __rounding are never referenced explicitly
+ * below; presumably the btf_16_sse2 macro picks them up by name, so do not
+ * rename or remove them without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: the single input goes to x[1]; no other slot is loaded
+ __m128i x[8];
+ x[1] = input[0];
+
+ // stage 2: single-source call produces both x[0] and x[1] from x[1]
+ btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+ // stage 3: copies only
+ x[4] = x[0];
+ x[5] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+ // stage 5: copies only
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7: permute into output order; odd outputs are negated (saturating 0 - v)
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void av1_iadst8_sse2(const __m128i *input, __m128i *output) { /*
+ * 8-point inverse ADST: reads input[0..7] and writes output[0..7].
+ * The transform alternates rotation stages (btf_16_sse2, stages 2/4/6) with
+ * saturating add/sub stages (btf_16_adds_subs_sse2, stages 3/5) and ends
+ * with a permutation that negates the odd-indexed outputs.
+ * NOTE(review): cos_bit and __rounding are never referenced explicitly
+ * below; presumably the btf_16_sse2 macro picks them up by name, so do not
+ * rename or remove them without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: input permutation (7, 0, 5, 2, 3, 4, 1, 6)
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2: rotate each adjacent pair in place
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3: add/sub between x[i] and x[i + 4]
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4: only the upper half x[4..7] is rotated
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5: add/sub between x[i] and x[i + 2] inside each group of four
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7: permute into output order; odd outputs are negated (saturating 0 - v)
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { /*
+ * 8-point inverse ADST with the same stage structure, weights and output
+ * permutation as av1_iadst8_sse2, except that every rotation uses
+ * btf_16_4p_sse2 instead of btf_16_sse2. Presumably the "4p"/"w4" variants
+ * only compute the low four 16-bit lanes of each register -- TODO confirm
+ * against the macro definition.
+ * NOTE(review): cos_bit and __rounding are never referenced explicitly
+ * below; presumably the btf_16_4p_sse2 macro picks them up by name, so do
+ * not rename or remove them without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: input permutation (7, 0, 5, 2, 3, 4, 1, 6)
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2: rotate each adjacent pair in place
+ btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3: add/sub between x[i] and x[i + 4]
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4: only the upper half x[4..7] is rotated
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5: add/sub between x[i] and x[i + 2] inside each group of four
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7: permute into output order; odd outputs are negated (saturating 0 - v)
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+// Stage 3 of the 16-point inverse ADST: add/sub butterfly between the lower
+// half x[0..7] and the upper half x[8..15], i.e. x[i] is paired with
+// x[i + 8] for i = 0..7. Operates on x[] in place.
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+  for (int idx = 0; idx < 8; ++idx) {
+    btf_16_adds_subs_sse2(x[idx], x[idx + 8]);
+  }
+}
+
+// Stage 4 of the 16-point inverse ADST: rotates the four adjacent pairs in
+// x[8..15] in place; x[0..7] are not touched.
+// NOTE(review): __rounding and cos_bit are not referenced explicitly here;
+// presumably btf_16_sse2 picks them up by name -- keep the parameter names.
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  // Weights built from cospi[8] / cospi[56]: used for (8, 9) and (12, 13).
+  const __m128i w_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+  const __m128i w_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+  const __m128i w_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+  // Weights built from cospi[24] / cospi[40]: used for (10, 11) and (14, 15).
+  const __m128i w_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+  const __m128i w_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+  const __m128i w_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+
+  btf_16_sse2(w_p08_p56, w_p56_m08, x[8], x[9], x[8], x[9]);
+  btf_16_sse2(w_p40_p24, w_p24_m40, x[10], x[11], x[10], x[11]);
+  btf_16_sse2(w_m56_p08, w_p08_p56, x[12], x[13], x[12], x[13]);
+  btf_16_sse2(w_m24_p40, w_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+// Stage 5 of the 16-point inverse ADST: inside each group of eight
+// (x[0..7] and x[8..15]) the element at offset k is add/sub-combined with
+// the element at offset k + 4, for k = 0..3. Operates on x[] in place.
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+  for (int group = 0; group < 16; group += 8) {
+    for (int k = 0; k < 4; ++k) {
+      btf_16_adds_subs_sse2(x[group + k], x[group + k + 4]);
+    }
+  }
+}
+
+// Stage 6 of the 16-point inverse ADST: in each group of eight, the pairs at
+// offsets (4, 5) and (6, 7) are rotated in place with cospi[16]/cospi[48]
+// weights; offsets 0..3 of each group are not touched.
+// NOTE(review): __rounding and cos_bit are not referenced explicitly here;
+// presumably btf_16_sse2 picks them up by name -- keep the parameter names.
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i w_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i w_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i w_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+
+  for (int group = 0; group < 16; group += 8) {
+    btf_16_sse2(w_p16_p48, w_p48_m16, x[group + 4], x[group + 5], x[group + 4],
+                x[group + 5]);
+    btf_16_sse2(w_m48_p16, w_p16_p48, x[group + 6], x[group + 7], x[group + 6],
+                x[group + 7]);
+  }
+}
+
+// Stage 7 of the 16-point inverse ADST: inside each group of four
+// consecutive elements, element 0 is add/sub-combined with element 2 and
+// element 1 with element 3. Operates on x[] in place.
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+  for (int quad = 0; quad < 16; quad += 4) {
+    btf_16_adds_subs_sse2(x[quad], x[quad + 2]);
+    btf_16_adds_subs_sse2(x[quad + 1], x[quad + 3]);
+  }
+}
+
+// Stage 8 of the 16-point inverse ADST: rotates the pairs (2, 3), (6, 7),
+// (10, 11) and (14, 15) in place with +/-cospi[32] weights; the other
+// elements of x[] are not touched.
+// NOTE(review): __rounding and cos_bit are not referenced explicitly here;
+// presumably btf_16_sse2 picks them up by name -- keep the parameter names.
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i w_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i w_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  for (int idx = 2; idx < 16; idx += 4) {
+    btf_16_sse2(w_p32_p32, w_p32_m32, x[idx], x[idx + 1], x[idx], x[idx + 1]);
+  }
+}
+
+// Stage 9 (final) of the 16-point inverse ADST: permutes x[] into output[]
+// and negates every odd-indexed output with a saturating (0 - v).
+// output[i] takes x[kSrc[i]]; entries are written in ascending order of i.
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+  static const int kSrc[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
+                                3, 11, 15, 7, 5, 13, 9,  1 };
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < 16; i += 2) {
+    output[i] = x[kSrc[i]];
+    output[i + 1] = _mm_subs_epi16(zero, x[kSrc[i + 1]]);
+  }
+}
+
+static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) { /*
+ * 16-point inverse ADST that reads only input[0] and writes output[0..15].
+ * Stages 3, 5 and 7, which are add/sub butterflies in iadst16_sse2, are
+ * plain copies here. Presumably intended for blocks where input[1..15] are
+ * zero -- TODO confirm against the code that selects this function.
+ * Stages 8 and 9 reuse the shared iadst16_stage8/9_ssse3 helpers.
+ * NOTE(review): the btf_16_sse2 calls do not receive cos_bit/__rounding as
+ * arguments; presumably the macro picks them up by name, so do not rename
+ * these locals without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+ // stage 1: the single input goes to x[1]; no other slot is loaded
+ __m128i x[16];
+ x[1] = input[0];
+
+ // stage 2: single-source call produces both x[0] and x[1] from x[1]
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+ // stage 3: copies only
+ x[8] = x[0];
+ x[9] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+ // stage 5: copies only
+ x[4] = x[0];
+ x[5] = x[1];
+ x[12] = x[8];
+ x[13] = x[9];
+
+ // stage 6
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+ // stage 7: copies only; after this all 16 slots of x[] are populated
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+ x[10] = x[8];
+ x[11] = x[9];
+ x[14] = x[12];
+ x[15] = x[13];
+
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);  // stage 8~9 use the shared helpers
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) { /*
+ * 16-point inverse ADST that reads only input[0]..input[7] and writes
+ * output[0..15]. Stage 2 uses the single-source btf_16_ssse3 form to derive
+ * each pair of x[] slots from the one slot that was loaded. Presumably
+ * intended for blocks where input[8..15] are zero -- TODO confirm against
+ * the code that selects this function.
+ * Stages 3-9 reuse the shared iadst16_stage*_ssse3 helpers.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ // stage 1: one slot of every adjacent pair (2k, 2k + 1) is loaded
+ __m128i x[16];
+ x[1] = input[0];
+ x[3] = input[2];
+ x[5] = input[4];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[1];
+
+ // stage 2: each call fills both slots of a pair from the loaded one
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+ btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+ btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+ btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+ btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+ btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+ btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+ btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+ // stage 3~9: shared helpers, identical call sequence to iadst16_sse2
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+static void iadst16_sse2(const __m128i *input, __m128i *output) { /*
+ * Full 16-point inverse ADST: reads input[0..15] and writes output[0..15].
+ * Stage 1 permutes the inputs, stage 2 rotates each adjacent pair with
+ * btf_16_sse2, and stages 3-9 are delegated to the shared
+ * iadst16_stage*_ssse3 helpers.
+ * NOTE(review): the btf_16_sse2 calls do not receive cos_bit/__rounding as
+ * arguments; presumably the macro picks them up by name, so do not rename
+ * these locals without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1: even slots take input[15], [13], ..., [1]; odd slots take input[0], [2], ..., [14]
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2: rotate each adjacent pair in place
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3~9: shared helpers; stage 9 writes output[]
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_w4_sse2(const __m128i *input, __m128i *output) { /*
+ * 16-point inverse ADST with the same input permutation, weights and stage
+ * order as iadst16_sse2, except that every rotation stage (2, 4, 6, 8) is
+ * written out inline with btf_16_4p_sse2 instead of btf_16_sse2. The
+ * add/sub stages (3, 5, 7) and the output stage (9) reuse the shared
+ * iadst16_stage*_ssse3 helpers. Presumably the "4p"/"w4" variants only
+ * compute the low four 16-bit lanes of each register -- TODO confirm
+ * against the macro definition.
+ * NOTE(review): cos_bit and __rounding are never referenced explicitly
+ * below; presumably the btf_16_4p_sse2 macro picks them up by name, so do
+ * not rename or remove them without checking its definition.
+ */
+ const int8_t cos_bit = INV_COS_BIT;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // half of (1 << INV_COS_BIT)
+
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: even slots take input[15], [13], ..., [1]; odd slots take input[0], [2], ..., [14]
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2: rotate each adjacent pair in place
+ btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+
+ // stage 4: same pairs and weights as iadst16_stage4_ssse3, using the 4p macro
+ btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+ // stage 5
+ iadst16_stage5_ssse3(x);
+
+ // stage 6: same pairs and weights as iadst16_stage6_ssse3, using the 4p macro
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+ // stage 7
+ iadst16_stage7_ssse3(x);
+
+ // stage 8: same pairs and weights as iadst16_stage8_ssse3, using the 4p macro
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+ // stage 9: permutation and sign flips into output[]
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
+  // Each of the 4 output vectors is input + mulhrs(input, k), where k is the
+  // part of NewSqrt2 above (1 << NewSqrt2Bits), shifted up to the Q15 scale
+  // used by _mm_mulhrs_epi16. The final add saturates.
+  const int16_t frac = (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i frac_q15 = _mm_set1_epi16(frac << (15 - NewSqrt2Bits));
+  for (int row = 0; row != 4; ++row) {
+    const __m128i src = input[row];
+    output[row] = _mm_adds_epi16(_mm_mulhrs_epi16(src, frac_q15), src);
+  }
+}
+
+static void iidentity8_sse2(const __m128i *input, __m128i *output) {
+  // Doubles every 16-bit lane of the 8 input vectors with signed saturation.
+  for (int row = 0; row != 8; ++row) {
+    const __m128i src = input[row];
+    output[row] = _mm_adds_epi16(src, src);
+  }
+}
+
+static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
+  // Each of the 16 output vectors is 2 * input + mulhrs(input, k), where k is
+  // twice the part of NewSqrt2 above (1 << NewSqrt2Bits), shifted up to the
+  // Q15 scale used by _mm_mulhrs_epi16. Both adds saturate.
+  const int16_t frac = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i frac_q15 = _mm_set1_epi16(frac << (15 - NewSqrt2Bits));
+  for (int row = 0; row != 16; ++row) {
+    const __m128i src = input[row];
+    const __m128i frac_part = _mm_mulhrs_epi16(src, frac_q15);
+    const __m128i doubled = _mm_adds_epi16(src, src);
+    output[row] = _mm_adds_epi16(frac_part, doubled);
+  }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+                                               __m128i res) {
+  // Widen the low 8 prediction bytes to 16 bits, add the residual with signed
+  // saturation, then clamp to unsigned 8 bits. Both 64-bit halves of the
+  // returned vector hold the same 8 reconstructed pixels.
+  const __m128i pred16 = _mm_unpacklo_epi8(pred, _mm_setzero_si128());
+  const __m128i sum = _mm_adds_epi16(res, pred16);
+  return _mm_packus_epi16(sum, sum);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  // Adds `height` rows of 4 residuals to the prediction in `output`.
+  // When flipud is set, residual rows are consumed from last to first.
+  const int dir = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  const __m128i zero = _mm_setzero_si128();
+  for (int dst_row = 0; dst_row < height; ++dst_row) {
+    int *const px = (int *)(output + dst_row * stride);
+    // Load 4 prediction bytes, widen, add the residual, clamp to 8 bits.
+    const __m128i pred16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*px), zero);
+    const __m128i sum = _mm_adds_epi16(in[src_row], pred16);
+    *px = _mm_cvtsi128_si32(_mm_packus_epi16(sum, zero));
+    src_row += dir;
+  }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  // Adds `height` rows of 8 residuals to the prediction in `output`.
+  // When flipud is set, residual rows are consumed from last to first.
+  const int dir = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  for (int dst_row = 0; dst_row < height; ++dst_row) {
+    uint8_t *const dst = output + dst_row * stride;
+    const __m128i pred = _mm_loadl_epi64((__m128i const *)dst);
+    const __m128i recon = lowbd_get_recon_8x8_sse2(pred, in[src_row]);
+    _mm_storel_epi64((__m128i *)dst, recon);
+    src_row += dir;
+  }
+}
+
+// 1D functions process 8 pixels at one time.
+// Indexed as [transform size index][ITX_TYPE_1D]; callers in this file index
+// it with a txw/txh index and an entry of hitx_1d_tab / vitx_1d_tab.
+// NULL marks combinations that have no function in this table.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+ { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 },
+ { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+ { idct32_sse2, NULL, NULL },
+ { idct64_low32_ssse3, NULL, NULL },
+ };
+
+// functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+// Indexed as [transform size index][ITX_TYPE_1D][zeros index], where the last
+// index comes from lowbd_txfm_all_1d_zeros_idx[eobx or eoby] (values 0..3).
+// NULL marks combinations that have no function in this table; callers assert
+// that the selected entry is non-NULL.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4_sse2, idct4_sse2, NULL, NULL },
+ { iadst4_sse2, iadst4_sse2, NULL, NULL },
+ { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
+ },
+ { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL },
+ { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL },
+ { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
+ {
+ { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+ { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+ idct32_sse2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+ idct64_low32_ssse3 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 1D functions process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4
+// Indexed as [transform size index][ITX_TYPE_1D]; the last two size rows have
+// no 4-pixel-wide functions and are all NULL.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
+ { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
+ { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
+ { NULL, NULL, NULL },
+ { NULL, NULL, NULL },
+ };
+
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+                                           int stride, int shift, int height,
+                                           int txw_idx, int rect_type) {
+  // Row pass of the identity transform for `height` rows of 8 coefficients.
+  // Every coefficient c becomes
+  //   (c * NewSqrt2list[txw_idx] + rounding) >> (NewSqrt2Bits - shift)
+  // and the results are packed to 16 bits with signed saturation. When
+  // rect_type is +1 or -1 the coefficients are first multiplied by
+  // NewInvSqrt2 using a rounding Q15 multiply.
+  const int is_rect = (rect_type == 1 || rect_type == -1);
+  const int out_shift = NewSqrt2Bits - shift;
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+                                          (1 << (NewSqrt2Bits - shift - 1)));
+  // Interleaved (scale, rounding) pairs: a madd against (coeff, 1) pairs
+  // yields coeff * scale + rounding in each 32-bit lane.
+  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+  const __m128i rect_scale =
+      _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+
+  const int32_t *row_ptr = input;
+  for (int r = 0; r < height; ++r, row_ptr += stride) {
+    __m128i src = load_32bit_to_16bit(row_ptr);
+    if (is_rect) {
+      src = _mm_mulhrs_epi16(src, rect_scale);
+    }
+    const __m128i prod_lo =
+        _mm_madd_epi16(_mm_unpacklo_epi16(src, one), scale_rounding);
+    const __m128i prod_hi =
+        _mm_madd_epi16(_mm_unpackhi_epi16(src, one), scale_rounding);
+    out[r] = _mm_packs_epi32(_mm_srai_epi32(prod_lo, out_shift),
+                             _mm_srai_epi32(prod_hi, out_shift));
+  }
+}
+
+// Column pass of the identity transform for `height` rows of 8 residuals,
+// followed by reconstruction into `output`.
+// For every 16-bit lane x of buf[h]:
+//   y = (x * NewSqrt2list[txh_idx] + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits
+//   r = (y + (1 << (-shift - 1))) >> -shift
+// r is packed to 16 bits, added to the 8 prediction bytes of the current
+// output row, clamped to 8 bits and stored back.
+// NOTE(review): `shift` is used negated, so it is expected to be negative.
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+ __m128i *buf, int shift, int height,
+ int txh_idx) {
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+ const __m128i one = _mm_set1_epi16(1);
+ // Interleaved (scale, rounding) pairs for the madd below.
+ const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+ const __m128i zero = _mm_setzero_si128();
+ for (int h = 0; h < height; ++h) {
+ // Pair every residual with 1 so madd computes x * scale + rounding.
+ __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+ __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+ lo = _mm_madd_epi16(lo, scale_coeff);
+ hi = _mm_madd_epi16(hi, scale_coeff);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+ // Second, separately rounded shift by -shift.
+ lo = _mm_add_epi32(lo, shift_rounding);
+ hi = _mm_add_epi32(hi, shift_rounding);
+ lo = _mm_srai_epi32(lo, -shift);
+ hi = _mm_srai_epi32(hi, -shift);
+ __m128i x = _mm_packs_epi32(lo, hi);
+
+ // Add to the prediction and clamp to unsigned 8 bits.
+ const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+ x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
+ const __m128i u = _mm_packus_epi16(x, x);
+ _mm_storel_epi64((__m128i *)(output), u);
+ output += stride;
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
+                                         int stride, TX_SIZE tx_size) {
+  // Identity transform in both directions. The block is processed as
+  // independent 8x8 tiles covering at most the first 32 columns and 32 rows:
+  // identity row pass, transpose, then identity column pass with
+  // reconstruction into `output`.
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int tiles_w = AOMMIN(32, width) >> 3;
+  const int input_stride = AOMMIN(32, height);
+  const int tiles_h = input_stride >> 3;
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+
+  for (int tx = 0; tx < tiles_w; ++tx) {
+    for (int ty = 0; ty < tiles_h; ++ty) {
+      __m128i tile[8];
+      const int32_t *src = input + ty * 8 + tx * 8 * input_stride;
+      uint8_t *dst = output + tx * 8 + ty * 8 * stride;
+      iidentity_row_8xn_ssse3(tile, src, input_stride, shift[0], 8, txw_idx,
+                              rect_type);
+      transpose_16bit_8x8(tile, tile);
+      iidentity_col_8xn_ssse3(dst, stride, tile, shift[1], 8, txh_idx);
+    }
+  }
+}
+
+// Inverse 2D transform plus reconstruction for TX_4X4.
+// tx_size_ and eob are accepted for signature compatibility and ignored; the
+// size is fixed to TX_4X4. Steps: load, row transform, optional left/right
+// flip, transpose, column transform, rounding shift by shift[1], then add to
+// `output` (optionally flipped vertically).
+static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ // Both passes use the 4-pixel-wide 1D kernels.
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ row_txfm(buf, buf);
+ if (lr_flip) {
+ // Reverse the vector order before transposing.
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+                                                 __m128i res0, __m128i res1) {
+  // Reconstructs 16 pixels: the low 8 prediction bytes are combined with res0
+  // and the high 8 with res1 (saturating 16-bit add), then both halves are
+  // clamped to unsigned 8 bits and packed into one vector.
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sum_lo = _mm_adds_epi16(res0, _mm_unpacklo_epi8(pred, zero));
+  const __m128i sum_hi = _mm_adds_epi16(res1, _mm_unpackhi_epi8(pred, zero));
+  return _mm_packus_epi16(sum_lo, sum_hi);
+}
+
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+                                                int stride, int flipud,
+                                                int height) {
+  // Adds `height` rows of 16 residuals to the prediction in `output`.
+  // Row r takes its left 8 residuals from in[r] and its right 8 from
+  // in[r + height]. When flipud is set, rows are consumed last to first.
+  const int dir = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  for (int dst_row = 0; dst_row < height; ++dst_row) {
+    uint8_t *const dst = output + dst_row * stride;
+    const __m128i pred = _mm_loadu_si128((__m128i const *)dst);
+    const __m128i recon =
+        lowbd_get_recon_16x16_sse2(pred, in[src_row], in[src_row + height]);
+    _mm_storeu_si128((__m128i *)dst, recon);
+    src_row += dir;
+  }
+}
+
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+                                     int size) {
+  // Multiplies every 16-bit lane of `size` vectors by (NewInvSqrt2 * 8) with
+  // the rounding Q15 multiply _mm_mulhrs_epi16. Callers in this file use it
+  // as the extra scaling step for rectangular transforms.
+  const __m128i inv_sqrt2 = _mm_set1_epi16(NewInvSqrt2 * 8);
+  for (int idx = 0; idx != size; ++idx) {
+    const __m128i src = input[idx];
+    output[idx] = _mm_mulhrs_epi16(src, inv_sqrt2);
+  }
+}
+
+// Inverse 2D transform plus reconstruction for transform types where neither
+// direction is handled by the dedicated identity paths.
+// The eob-derived extents (eobx, eoby) limit how much input is loaded and
+// select reduced 1D kernels from lowbd_txfm_all_1d_zeros_w8_arr.
+// buf1 holds the transposed row-pass output as consecutive column strips of
+// 8 pixels, each strip txfm_size_row vectors long.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ // eobx / eoby rounded up to the next multiple of 8 (as a count / as a count
+ // of 8-row groups).
+ const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // Row pass: one iteration per group of 8 rows that may hold coefficients.
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m128i buf0[64];
+ load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ // Destination of this row group inside every column strip of buf1.
+ __m128i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ // Mirror horizontally: reverse each 8-vector group and write the
+ // transposed tile into the mirrored column strip.
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp,
+ _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+ }
+ }
+ }
+ // Column pass: transform and round each 8-pixel-wide column strip in place.
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ }
+
+ // Reconstruction: 16 pixels per store for wide blocks, 8 otherwise.
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+// Inverse 2D transform plus reconstruction for transform types whose row
+// (horizontal) pass is the identity; lowbd_inv_txfm2d_add_universe_ssse3
+// routes V_DCT, V_ADST and V_FLIPADST here.
+// The block is processed in column strips of 8 pixels: identity row pass and
+// transpose per 8x8 tile, then the column transform, a rounding shift by
+// shift[1] and the add to `output`.
+void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob) {
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  // Number of 8-wide strips / 8-row groups that may hold coefficients.
+  const int buf_size_w_div8 = (eobx + 8) >> 3;
+  const int buf_size_h_div8 = (eoby + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_row);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  // The last dimension of lowbd_txfm_all_1d_zeros_w8_arr has 4 entries, so 3
+  // is the largest valid index.
+  assert(fun_idx < 4);
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    __m128i buf0[64];
+    // Identity row pass on each 8x8 tile of this strip, transposed in place.
+    for (int j = 0; j < buf_size_h_div8; j++) {
+      __m128i *buf0_cur = buf0 + j * 8;
+      const int32_t *input_cur = input + i * 8 * input_stride + j * 8;
+      iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8,
+                              txw_idx, rect_type);
+      transpose_16bit_8x8(buf0_cur, buf0_cur);
+    }
+    col_txfm(buf0, buf0);
+    // mulhrs by 1 << (15 + shift[1]) is a rounding right shift by -shift[1].
+    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+    // With ud_flip set, residual rows are consumed from last to first.
+    int k = ud_flip ? (txfm_size_row - 1) : 0;
+    const int step = ud_flip ? -1 : 1;
+    uint8_t *out = output + 8 * i;
+    for (int j = 0; j < txfm_size_row; ++j, k += step) {
+      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+      _mm_storel_epi64((__m128i *)(out), u);
+      out += stride;
+    }
+  }
+}
+
+// Inverse 2D transform plus reconstruction for transform types whose column
+// (vertical) pass is the identity; lowbd_inv_txfm2d_add_universe_ssse3 routes
+// H_DCT, H_ADST and H_FLIPADST here.
+// The block is processed in groups of 8 rows: row transform, rounding shift,
+// optional left/right flip, transpose, then the identity column pass which
+// also adds the result to `output`.
+void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ // eobx rounded up to the next multiple of 8; eoby as a count of 8-row groups.
+ const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+ assert(row_txfm != NULL);
+ // ud_flip is fetched but not used in this function.
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div8; i++) {
+ __m128i buf0[64];
+ load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0,
+ buf_size_nonzero_w);
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code
+ }
+ row_txfm(buf0, buf0);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ // buf1 is reused for every row group: one transposed 8x8 tile per strip.
+ __m128i *_buf1 = buf1;
+ if (lr_flip) {
+ // Mirror horizontally: reverse each group and write to the mirrored tile.
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+ }
+ }
+
+ // Identity column pass and reconstruction, one 8x8 tile at a time.
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+ buf1 + j * 8, shift[1], 8, txh_idx);
+ }
+ }
+}
+
+// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
+// Picks the implementation by transform type: both directions identity,
+// identity rows, identity columns, or the general path for everything else
+// (DCT_DCT included).
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case IDTX:
+      av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+      return;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+                                                tx_size, eob);
+      return;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+                                                tx_size, eob);
+      return;
+    default:
+      lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+                                             tx_size, eob);
+      return;
+  }
+}
+
+// Inverse 2D transform plus reconstruction for TX_4X8.
+// tx_size_ and eob are accepted for signature compatibility and ignored.
+// The row pass uses the 8-pixel-wide kernels, the column pass the
+// 4-pixel-wide ones.
+static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
+ if (lr_flip) {
+ // Reverse the vector order before transposing.
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+// Inverse 2D transform plus reconstruction for TX_8X4.
+// tx_size_ and eob are accepted for signature compatibility and ignored.
+// The row pass uses the 4-pixel-wide kernels, the column pass the
+// 8-pixel-wide ones.
+static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ // Reverse the vector order before transposing.
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+// Inverse 2D transform plus reconstruction for TX_4X16.
+// tx_size_ and eob are accepted for signature compatibility and ignored.
+// The row pass runs twice, on 8 rows each time, then a single column pass
+// covers all 16 rows.
+static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ const int row_one_loop = 8;
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur,
+ txfm_size_col);
+ if (row_txfm == iidentity4_ssse3) {
+ // Identity rows are computed at 32-bit precision with the scaling and
+ // the rounding shift fused:
+ //   (x * NewSqrt2 + (3 << (NewSqrt2Bits - 1))) >> (NewSqrt2Bits + 1)
+ // NOTE(review): assumes pair_set_epi16(a, b) interleaves a and b so the
+ // madd against (x, 1) pairs adds the rounding term - confirm.
+ const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 4; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf_cur, buf_cur);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ }
+ if (lr_flip) {
+ // Reverse the vector order before transposing.
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+// Inverse 2D transform plus reconstruction for TX_16X4.
+// tx_size_ and eob are accepted for signature compatibility and ignored.
+// One row pass covers all 16 vectors; the column pass and the output write
+// are done separately for the left and right 8-pixel halves.
+static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size_,
+ int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col);
+ if (row_txfm == iidentity16_ssse3) {
+ // Identity rows are computed at 32-bit precision with the scaling and the
+ // rounding shift fused:
+ //   (x * 2 * NewSqrt2 + (3 << (NewSqrt2Bits - 1))) >> (NewSqrt2Bits + 1)
+ // NOTE(review): assumes pair_set_epi16(a, b) interleaves a and b so the
+ // madd against (x, 1) pairs adds the rounding term - confirm.
+ const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
+ const __m128i ones = _mm_set1_epi16(1);
+ for (int j = 0; j < 16; ++j) {
+ const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
+ const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
+ const __m128i buf_32_lo =
+ _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
+ const __m128i buf_32_hi =
+ _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
+ buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
+ }
+ } else {
+ row_txfm(buf, buf);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ }
+ if (lr_flip) {
+ // Reverse all 16 vectors, then transpose each half.
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ // Column pass on each 8-pixel-wide half.
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ // Write the left half, then the right half, 4 rows each.
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
+// Entry point for the low-bitdepth inverse 2D transform with reconstruction.
+// The five sizes that have a 4-pixel dimension handled here (4x4, 4x8, 8x4,
+// 4x16, 16x4) use dedicated functions; every other size goes through
+// lowbd_inv_txfm2d_add_universe_ssse3.
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                            const TxfmParam *txfm_param) {
+  // Lossless blocks are delegated to the C implementation; everything else
+  // uses the SSSE3 low-bitdepth path.
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+                                 txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 0000000000..1873d01bc0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h> // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Single-input butterfly: out0 = mulhrs(in, w0 * 8), out1 = mulhrs(in, w1 * 8).
+// `in` is evaluated once, so it may alias out0 or out1. The weight arguments
+// are parenthesized so that expressions such as `a + b` scale as a whole.
+#define btf_16_ssse3(w0, w1, in, out0, out1) \
+  do { \
+    const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
+    const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
+    const __m128i _in = (in); \
+    (out0) = _mm_mulhrs_epi16(_in, _w0); \
+    (out1) = _mm_mulhrs_epi16(_in, _w1); \
+  } while (0)
+
+// In-place saturating butterfly: in0 <- in0 + in1, in1 <- in0 - in1, both
+// computed from the values the arguments had on entry.
+#define btf_16_adds_subs_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+// In-place saturating butterfly with the difference written first:
+// in1 <- in0 - in1, then in0 <- in0 + in1, both computed from the values the
+// arguments had on entry.
+#define btf_16_subs_adds_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ } while (0)
+
+// Saturating butterfly with separate outputs: out0 = in0 + in1,
+// out1 = in0 - in1. The inputs are copied first, so outputs may alias them.
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ out0 = _mm_adds_epi16(_in0, _in1); \
+ out1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  // bit == 0: nothing to do.
+  if (bit == 0) return;
+
+  // bit > 0: plain left shift of every 16-bit lane.
+  if (bit > 0) {
+    for (int idx = 0; idx != size; ++idx) {
+      in[idx] = _mm_slli_epi16(in[idx], bit);
+    }
+    return;
+  }
+
+  // bit < 0: rounding right shift by -bit, implemented as a Q15 rounding
+  // multiply by 1 << (15 + bit).
+  const __m128i factor = _mm_set1_epi16(1 << (15 + bit));
+  for (int idx = 0; idx != size; ++idx) {
+    in[idx] = _mm_mulhrs_epi16(in[idx], factor);
+  }
+}
+
+// 1D itx types
+// IFLIPADST_1D is an alias of IADST_1D, so both select the same column of
+// the 1D function tables. ITX_TYPES_1D is the number of distinct values.
+enum {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} UENUM1BYTE(ITX_TYPE_1D);
+
+// Maps a TX_TYPE to the 1D type used when selecting a column transform
+// function (one entry per TX_TYPE value).
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+// Maps a TX_TYPE to the 1D type used when selecting a row transform
+// function (one entry per TX_TYPE value).
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// Lookup tables used by get_eobx_eoby_scan_default(). Each table is indexed
+// by eob_row = (eob - 1) >> tx_size_wide_log2_eob[tx_size]. Every entry packs
+// two extents: eoby in the high byte and eobx in the low byte (values 0x07,
+// 0x0f or 0x1f).
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+// Per-TX_SIZE selector for the tables above. Entries are NULL for sizes that
+// have no table here; get_eobx_eoby_scan_default() dereferences the entry
+// without a NULL check, so it must not be called for those sizes.
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+// Maps an eobx/eoby extent (0..31) to the last index of
+// lowbd_txfm_all_1d_zeros_w8_arr: 0 -> 0, 1..7 -> 1, 8..15 -> 2, 16..31 -> 3.
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+// Indexed by TX_SIZE; get_eobx_eoby_scan_default() uses it as the shift that
+// converts (eob - 1) into a row index.
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  // eob == 1 gives (0, 0) directly. Otherwise (eob - 1), shifted right by
+  // tx_size_wide_log2_eob[tx_size], indexes the per-size table whose entries
+  // pack eoby in the high byte and eobx in the low byte.
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int row = (eob - 1) >> tx_size_wide_log2_eob[tx_size];
+  const int packed = av1_eob_to_eobxy_default[tx_size][row];
+  *eobx = packed & 0xFF;
+  *eoby = packed >> 8;
+}
+
+// Rounds an extent index up to the last position of its bucket:
+// 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31. Read-only lookup table, so it
+// is declared const like lowbd_txfm_all_1d_zeros_idx.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  // Works on the zero-based position (eob - 1). eobx is capped at the block
+  // width (at most 32) minus one; eoby comes from the row that position falls
+  // in. Both are rounded up through eob_fill.
+  const int last = eob - 1;
+  const int eobx_max = AOMMIN(32, tx_size_wide[tx_size]) - 1;
+  *eobx = (last < eobx_max) ? eob_fill[last] : eobx_max;
+  const int row = last / (eobx_max + 1);
+  assert(row < 32);
+  *eoby = eob_fill[row];
+}
+
+// Counterpart of get_eobx_eoby_scan_h_identity() with the roles of the two
+// directions swapped: eoby is capped at the block height (at most 32) minus
+// one, and eobx is derived from (eob - 1) / (eoby_max + 1). Both are rounded
+// up through eob_fill.
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+  const int temp_eobx = eob / (eoby_max + 1);
+  // Same bound check as the h_identity variant: eob_fill has 32 entries.
+  assert(temp_eobx < 32);
+  *eobx = eob_fill[temp_eobx];
+  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+// Signature shared by all 1D inverse transform kernels stored in the function
+// tables: reads vectors from `input` and writes vectors to `output`. Callers
+// in av1_inv_txfm_ssse3.c pass the same buffer for both.
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
+
+// Low-bitdepth inverse 2D transform with reconstruction into `output`;
+// dispatches on tx_size.
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob);
+
+// Variant for the identity transform in both directions.
+void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_SIZE tx_size);
+
+// Variants with an identity row pass (h_identity) or an identity column pass
+// (v_identity).
+void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+
+// 8-point 1D kernels stored in slot 0 of the size-8 rows of
+// lowbd_txfm_all_1d_zeros_w8_arr.
+void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output);
+
+void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 0000000000..129721cf05
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+    __m128i *const out0, __m128i *const out1) {
+  // Interleave the low four int16 lanes of *in0 and *in1; madd lane i is then
+  // in0[i] * w[2i] + in1[i] * w[2i + 1], rounded and shifted right by cos_bit.
+  const __m128i pairs = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i acc0 = _mm_add_epi32(_mm_madd_epi16(pairs, *w0), __rounding);
+  const __m128i acc1 = _mm_add_epi32(_mm_madd_epi16(pairs, *w1), __rounding);
+  const __m128i res0 = _mm_srai_epi32(acc0, cos_bit);
+  const __m128i res1 = _mm_srai_epi32(acc1, cos_bit);
+  // NOTE(review): the high half of *out1 repeats res0, not res1 (unchanged
+  // from the original); presumably only the low four lanes are consumed.
+  *out0 = _mm_packs_epi32(res0, res0);
+  *out1 = _mm_packs_epi32(res1, res0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+ do { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ /* __rounding and cos_bit are not parameters: the expansion site must define them. */ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ /* Arithmetic right shift by cos_bit after the rounding add. */ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ /* Saturate to int16; both 64-bit halves of each output hold the same four values. */ \
+ out0 = _mm_packs_epi32(c0, c0); \
+ out1 = _mm_packs_epi32(d0, d0); \
+ } while (0)
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+ do { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i u1 = _mm_madd_epi16(t1, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ __m128i v1 = _mm_madd_epi16(t1, w1); \
+ /* __rounding and cos_bit are not parameters: the expansion site must define them. */ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i a1 = _mm_add_epi32(u1, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ __m128i b1 = _mm_add_epi32(v1, __rounding); \
+ /* Arithmetic right shift by cos_bit after the rounding add. */ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \
+ /* Saturate to int16: low-half results fill lanes 0..3, high-half results lanes 4..7. */ \
+ out0 = _mm_packs_epi32(c0, c1); \
+ out1 = _mm_packs_epi32(d0, d1); \
+ } while (0)
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+ return _mm_load_si128((const __m128i *)a);  // aligned load of 8 x int16: a must be 16-byte aligned
+}
+
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+  const __m128i *const src = (const __m128i *)a;  // a[0..7], 16-byte aligned
+  return _mm_packs_epi32(_mm_load_si128(src), src[1]);  // saturate to 8 x int16
+}
+
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+  const __m128i quad = _mm_load_si128((const __m128i *)a);  // a[0..3], aligned
+  return _mm_packs_epi32(quad, quad);  // saturated int16 copy in both halves
+}
+
+// Sign-extend the low four 16-bit lanes of `a` and store them as int32 at `b`.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+  // unpack(a, a) duplicates each value into a 32-bit lane; >> 16 sign-extends.
+  const __m128i widened = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+  _mm_store_si128((__m128i *)b, widened);  // aligned store of b[0..3]
+}
+
+// Sign-extend all eight 16-bit lanes of `a` and store them as int32 at `b`.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+  // unpack(a, a) puts each value in both halves of a 32-bit lane; the
+  // arithmetic shift by 16 then leaves it sign-extended.
+  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
+  _mm_store_si128((__m128i *)b, lo32);        // b[0..3], aligned store
+  _mm_store_si128((__m128i *)(b + 4), hi32);  // b[4..7]
+}
+
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+  const int half = 1 << (NewSqrt2Bits - 1);  // rounding term for the shift below
+  const __m128i prod = _mm_madd_epi16(a, pair_set_epi16(scale, half));
+  return _mm_srai_epi32(prod, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+                                                int32_t *const b) {
+  // The low four lanes are interleaved with the constant 1 and each
+  // (value, 1) pair goes through scale_round_sse2 with scale = NewSqrt2.
+  const __m128i paired = _mm_unpacklo_epi16(a, _mm_set1_epi16(1));
+  _mm_store_si128((__m128i *)b, scale_round_sse2(paired, NewSqrt2));
+}
+
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+                                             int32_t *const b) {
+  // All eight lanes are interleaved with the constant 1 and each (value, 1)
+  // pair goes through scale_round_sse2 with scale = NewSqrt2.
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i lo = scale_round_sse2(_mm_unpacklo_epi16(a, ones), NewSqrt2);
+  const __m128i hi = scale_round_sse2(_mm_unpackhi_epi16(a, ones), NewSqrt2);
+  _mm_store_si128((__m128i *)b, lo);        // b[0..3], aligned store
+  _mm_store_si128((__m128i *)(b + 4), hi);  // b[4..7]
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+                                                 const int stride,
+                                                 __m128i *const out,
+                                                 const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 64-bit load: 4 x int16 per row
+    out[r] = _mm_loadl_epi64((const __m128i *)&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+                                                      const int stride,
+                                                      __m128i *const out,
+                                                      const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // input row 0 lands in the last slot
+    out[out_size - 1 - r] = _mm_loadl_epi64((const __m128i *)&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // one 8 x int16 vector per input row
+    out[r] = load_16bit_to_16bit(&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // input row 0 lands in the last slot
+    out[out_size - 1 - r] = load_16bit_to_16bit(&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 8 x int32 per row packed to int16
+    out[r] = load_32bit_to_16bit(&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+                                                 __m128i *out, int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 4 x int32 per row packed to int16
+    out[r] = load_32bit_to_16bit_w4(&in[r * stride]);
+  }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // input row 0 lands in the last slot
+    out[out_size - 1 - r] = load_32bit_to_16bit(&in[r * stride]);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 4 sign-extended values per row
+    store_16bit_to_32bit_w4(in[r], &out[r * stride]);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 8 sign-extended values per row
+    store_16bit_to_32bit(in[r], &out[r * stride]);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 4 NewSqrt2-scaled values per row
+    store_rect_16bit_to_32bit_w4(in[r], &out[r * stride]);
+  }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  for (int r = 0; r < out_size; ++r) {  // 8 NewSqrt2-scaled values per row
+    store_rect_16bit_to_32bit(in[r], &out[r * stride]);
+  }
+}
+
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+                                                   uint16_t *out,
+                                                   const int stride) {
+  for (int r = 0; r < 8; ++r) {  // aligned store: each row start must be 16-byte aligned
+    _mm_store_si128((__m128i *)&out[r * stride], in[r]);
+  }
+}
+
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+  if (bit > 0) {  // positive: plain left shift, nothing to round
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+  } else if (bit < 0) {  // negative: rounded arithmetic right shift by -bit
+    const int shift = -bit;
+    const __m128i half = _mm_set1_epi16(1 << (shift - 1));
+    for (int i = 0; i < size; ++i) {
+      // The rounding term is added with signed saturation before the shift.
+      in[i] = _mm_srai_epi16(_mm_adds_epi16(in[i], half), shift);
+    }
+  }
+}
+
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+  for (int src = 0; src < size; ++src) {  // in[k] is copied to out[size - 1 - k]
+    out[size - 1 - src] = in[src];
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+void av1_iadst8_sse2(const __m128i *input, __m128i *output);
+
+void av1_idct8_sse2(const __m128i *input, __m128i *output);
+
+typedef struct {
+ transform_1d_sse2 col, row; // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 0000000000..1894efdc10
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+// This function assumes `arr` is 16-byte aligned.
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+  // In place over size >> 2 vectors; a tail of size % 4 values is not touched.
+  __m128i *const lanes = (__m128i *)arr;
+  av1_round_shift_array_32_sse4_1(lanes, lanes, size >> 2, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 0000000000..387dfd6bb3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+  // Round-to-nearest right shift: add half of 2^bit, then shift arithmetically.
+  // The rounding term is 1 << (bit - 1), so bit is expected to be at least 1.
+  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
+  return _mm_srai_epi32(_mm_add_epi32(vec, half), bit);
+}
+
+static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input,
+                                                   __m128i *output,
+                                                   const int size,
+                                                   const int bit) {
+  // Element-wise, each index is read before it is written (in-place is fine).
+  int k;
+  if (bit <= 0) {  // non-positive: left shift by -bit (a plain copy for 0)
+    for (k = 0; k < size; k++) {
+      output[k] = _mm_slli_epi32(input[k], -bit);
+    }
+  } else {  // positive: rounded right shift by bit
+    for (k = 0; k < size; k++) {
+      output[k] = av1_round_shift_32_sse4_1(input[k], bit);
+    }
+  }
+}
+
+static INLINE void av1_round_shift_rect_array_32_sse4_1(const __m128i *input,
+                                                        __m128i *output,
+                                                        const int size,
+                                                        const int bit,
+                                                        const int val) {
+  // out = round_shift(val * shift(in, bit), NewSqrt2Bits), element by element.
+  const __m128i factor = _mm_set1_epi32(val);
+  int k;
+  if (bit <= 0) {  // non-positive: first stage is a plain left shift by -bit
+    for (k = 0; k < size; k++) {
+      const __m128i scaled =
+          _mm_mullo_epi32(factor, _mm_slli_epi32(input[k], -bit));
+      output[k] = av1_round_shift_32_sse4_1(scaled, NewSqrt2Bits);
+    }
+  } else {  // positive: first stage is a rounded right shift by bit
+    for (k = 0; k < size; k++) {
+      const __m128i scaled =
+          _mm_mullo_epi32(factor, av1_round_shift_32_sse4_1(input[k], bit));
+      output[k] = av1_round_shift_32_sse4_1(scaled, NewSqrt2Bits);
+    }
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cdef_block_avx2.c b/third_party/aom/av1/common/x86/cdef_block_avx2.c
new file mode 100644
index 0000000000..1ec4b6c332
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_avx2.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "av1/common/cdef_block_simd.h"
+
+// Byte-shuffle mask: per 128-bit lane, reverse 16-bit words 0..6; word 7 stays.
+const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
+ 0x0f0e0100, 0x0b0a0d0c, 0x07060908,
+ 0x03020504, 0x0f0e0100 };
+
+/* partial A is a 16-bit vector of the form:
+[x8 - - x1 | x16 - - x9] and partial B has the form:
+[0 y1 - y7 | 0 y9 - y15].
+This function computes the terms of (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+(x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on each 128-bit lane, with C1..C8 taken from
+const1 and const2; the four 32-bit lanes of the result are not summed here. */
+static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
+ __m256i *partialb,
+ const __m256i *const1,
+ const __m256i *const2) {
+ __m256i tmp;  // copy of *partiala, needed after *partiala is overwritten
+ /* Reverse partial B. */
+ *partialb = _mm256_shuffle_epi8(
+ *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
+
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = *partiala;
+ *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
+ *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
+
+ /* Square and add the corresponding x and y values. */
+ *partiala = _mm256_madd_epi16(*partiala, *partiala);
+ *partialb = _mm256_madd_epi16(*partialb, *partialb);
+ /* Multiply by constant. */
+ *partiala = _mm256_mullo_epi32(*partiala, *const1);
+ *partialb = _mm256_mullo_epi32(*partialb, *const2);
+ /* Sum all results. */
+ *partiala = _mm256_add_epi32(*partiala, *partialb);
+ return *partiala;  // both *partiala and *partialb have been overwritten
+}
+
+static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
+                                 __m256i *x3) {
+  const __m256i lo01 = _mm256_unpacklo_epi32(*x0, *x1);
+  const __m256i hi01 = _mm256_unpackhi_epi32(*x0, *x1);
+  const __m256i lo23 = _mm256_unpacklo_epi32(*x2, *x3);
+  const __m256i hi23 = _mm256_unpackhi_epi32(*x2, *x3);
+  // The four inputs are overwritten with their per-lane 4x4 transpose.
+  *x0 = _mm256_unpacklo_epi64(lo01, lo23);
+  *x1 = _mm256_unpackhi_epi64(lo01, lo23);
+  *x2 = _mm256_unpacklo_epi64(hi01, hi23);
+  *x3 = _mm256_unpackhi_epi64(hi01, hi23);
+  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),   // element k = sum of the
+                          _mm256_add_epi32(*x2, *x3));  // four values of old xk
+}
+
+/* Computes the four costs built from partial4..partial7 for two 8x8 blocks at
+once (one per 128-bit lane); a call on the rotated lines gives the rest. */
+static INLINE __m256i compute_directions_avx2(__m256i *lines,
+                                              int32_t cost_first_8x8[4],
+                                              int32_t cost_second_8x8[4]) {
+  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+  __m256i partial6;
+  __m256i tmp;
+  /* Partial sums for lines 0 and 1. */
+  partial4a = _mm256_slli_si256(lines[0], 14);
+  partial4b = _mm256_srli_si256(lines[0], 2);
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
+  tmp = _mm256_add_epi16(lines[0], lines[1]);
+  partial5a = _mm256_slli_si256(tmp, 10);
+  partial5b = _mm256_srli_si256(tmp, 6);
+  partial7a = _mm256_slli_si256(tmp, 4);
+  partial7b = _mm256_srli_si256(tmp, 12);
+  partial6 = tmp;
+
+  /* Partial sums for lines 2 and 3. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
+  tmp = _mm256_add_epi16(lines[2], lines[3]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 4 and 5. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
+  tmp = _mm256_add_epi16(lines[4], lines[5]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 6 and 7. */
+  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
+  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
+  partial4a = _mm256_add_epi16(partial4a, lines[7]);
+  tmp = _mm256_add_epi16(lines[6], lines[7]);
+  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
+  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
+  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
+  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
+  partial6 = _mm256_add_epi16(partial6, tmp);
+
+  const __m256i const_reg_1 =
+      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
+  const __m256i const_reg_2 =
+      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
+  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
+  const __m256i const_reg_4 =
+      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
+
+  /* Compute costs in terms of partial sums. */
+  partial4a =
+      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
+  partial7a =
+      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
+  partial5a =
+      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
+  partial6 = _mm256_madd_epi16(partial6, partial6);
+  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
+  /* Element k of each 128-bit half becomes the total for partial(4 + k). */
+  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
+  _mm_storeu_si128((__m128i *)cost_first_8x8,
+                   _mm256_castsi256_si128(partial4a));
+  _mm_storeu_si128((__m128i *)cost_second_8x8,
+                   _mm256_extractf128_si256(partial4a, 1));
+
+  return partial4a;
+}
+
+/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
+counter-clockwise rotation of the pixels. Safe to call with res == in. */
+static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+ /* Second stage: interleave at 32-bit granularity. */
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ /* Final 64-bit stage; rows are written from res[7] down to res[0] (the reversal). */
+ res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
+ res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
+ res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
+ res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
+ res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
+ res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
+ res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
+ res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ int32_t cost_first_8x8[8];
+ int32_t cost_second_8x8[8];
+ // Used to store the best cost for 2 8x8's.
+ int32_t best_cost[2] = { 0 };
+ // Best direction for 2 8x8's.
+ int best_dir[2] = { 0 };
+ // Row i of img1 goes in the low 128 bits of lines[i], row i of img2 in the high.
+ const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
+ const __m256i const_128_reg = _mm256_set1_epi16(128);
+ __m256i lines[8];
+ for (int i = 0; i < 8; i++) {
+ const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
+ const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
+ // Combine both rows, then compute (pixel >> coeff_shift) - 128 per 16-bit lane.
+ lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
+ lines[i] = _mm256_sub_epi16(
+ _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
+ }
+
+ /* Compute "mostly vertical" directions. */
+ const __m256i dir47 =
+ compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
+
+ /* Transpose and reverse the order of the lines. */
+ array_reverse_transpose_8x8_avx2(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ const __m256i dir03 =
+ compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
+ // Reduce to the maximum of all eight costs, broadcast within each 128-bit half.
+ __m256i max = _mm256_max_epi32(dir03, dir47);
+ max =
+ _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
+ _mm256_slli_si256(max, 16 - (8))));  // rotate each half by 8 bytes
+ max =
+ _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
+ _mm256_slli_si256(max, 16 - (4))));  // rotate each half by 4 bytes
+ // Compare every cost with the maximum; matching costs become all-ones elements.
+ const __m128i first_8x8_output = _mm256_castsi256_si128(max);
+ const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
+ const __m128i cmpeg_res_00 =
+ _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
+ const __m128i cmpeg_res_01 =
+ _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
+ const __m128i cmpeg_res_10 =
+ _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
+ const __m128i cmpeg_res_11 =
+ _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
+ const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);  // words 0..3: dir03, 4..7: dir47
+ const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
+ // After movemask, bit k is set when cost[k] equals the maximum; the lowest set bit wins.
+ best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
+ best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
+ best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
+ best_dir[0] =
+ get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros
+ best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
+ best_dir[1] =
+ get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
+ *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
+
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var_out_1st >>= 10;
+ *var_out_2nd >>= 10;
+ *out_dir_1st_8x8 = best_dir[0];
+ *out_dir_2nd_8x8 = best_dir[1];
+}
+
+void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride,
+ int width, int height) {
+ int j = 0;  // column cursor shared by every stage below
+ int remaining_width = width;
+ assert(height % 2 == 0);  // every SIMD stage copies two rows per iteration
+ assert(height > 0);
+ assert(width > 0);
+ // Stages run widest first; each one leaves j just past the columns it copied.
+ // Process multiple 32 pixels at a time.
+ if (remaining_width > 31) {
+ int i = 0;
+ do {
+ j = 0;
+ do {
+ __m128i row00 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
+ __m128i row01 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
+ __m128i row10 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
+ __m128i row11 = _mm_loadu_si128(
+ (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row00));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row01));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
+ _mm256_cvtepu8_epi16(row10));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
+ _mm256_cvtepu8_epi16(row11));
+ j += 32;
+ } while (j <= width - 32);
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 31;
+ }
+
+ // Process 16 pixels at a time.
+ if (remaining_width > 15) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
+ _mm256_cvtepu8_epi16(row0));
+ _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
+ _mm256_cvtepu8_epi16(row1));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 15;
+ j += 16;
+ }
+
+ // Process 8 pixels at a time.
+ if (remaining_width > 7) {
+ int i = 0;
+ do {
+ __m128i row0 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
+ __m128i row1 =
+ _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
+ _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 7;
+ j += 8;
+ }
+
+ // Process 4 pixels at a time.
+ if (remaining_width > 3) {
+ int i = 0;
+ do {  // NOTE(review): the int32_t casts below read src through a type-punned, possibly unaligned pointer -- confirm this is acceptable
+ __m128i row0 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
+ __m128i row1 =
+ _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
+ _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
+ _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
+ _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
+ _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
+ i += 2;
+ } while (i < height);
+ remaining_width = width & 3;
+ j += 4;
+ }
+
+ // Process the remaining pixels.
+ if (remaining_width) {
+ for (int i = 0; i < height; i++) {
+ for (int k = j; k < width; k++) {  // scalar copy of the last 1..3 columns
+ dst[i * dstride + k] = src[i * sstride + k];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_sse2.c b/third_party/aom/av1/common/x86/cdef_block_sse2.c
new file mode 100644
index 0000000000..5ab7ffa2ff
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_sse2.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8: delegate to cdef_find_dir on img1.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8: same call on img2, with the same stride and shift.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride,
+                                        const uint8_t *src, int sstride,
+                                        int width, int height) {
+  const int simd_end = width & ~0x7;  // columns handled 8 at a time
+  for (int y = 0; y < height; y++) {
+    int x = 0;
+    for (; x < simd_end; x += 8) {
+      const v64 px = v64_load_unaligned(&src[y * sstride + x]);
+      v128_store_unaligned(&dst[y * dstride + x], v128_unpack_u8_s16(px));
+    }
+    // Scalar tail for the columns left over after the 8-wide steps.
+    for (; x < width; x++) dst[y * dstride + x] = src[y * sstride + x];
+  }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_sse4.c b/third_party/aom/av1/common/x86/cdef_block_sse4.c
new file mode 100644
index 0000000000..344c1e47c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_sse4.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var_out_1st,
+ int32_t *var_out_2nd, int coeff_shift,
+ int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+ // Process first 8x8: delegate to cdef_find_dir on img1.
+ *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+
+ // Process second 8x8: same call on img2, with the same stride and shift.
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride,
+                                          const uint8_t *src, int sstride,
+                                          int width, int height) {
+  const int simd_end = width & ~0x7;  // columns handled 8 at a time
+  for (int y = 0; y < height; y++) {
+    int x = 0;
+    for (; x < simd_end; x += 8) {
+      const v64 px = v64_load_unaligned(&src[y * sstride + x]);
+      v128_store_unaligned(&dst[y * dstride + x], v128_unpack_u8_s16(px));
+    }
+    // Scalar tail for the columns left over after the 8-wide steps.
+    for (; x < width; x++) dst[y * dstride + x] = src[y * sstride + x];
+  }
+}
diff --git a/third_party/aom/av1/common/x86/cdef_block_ssse3.c b/third_party/aom/av1/common/x86/cdef_block_ssse3.c
new file mode 100644
index 0000000000..0fb36eb6e0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cdef_block_ssse3.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "av1/common/cdef_block_simd.h"
+
+void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2,
+                              int stride, int32_t *var_out_1st,
+                              int32_t *var_out_2nd, int coeff_shift,
+                              int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
+  // Each 8x8 block gets its own cdef_find_dir() call, first block first;
+  // the returned direction is written through the matching out_dir_* pointer.
+  const int first_dir = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);
+  *out_dir_1st_8x8 = first_dir;
+  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
+}
+
+void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride,
+                                         const uint8_t *src, int sstride,
+                                         int width, int height) {
+  const int simd_end = width & ~0x7;  // First column left to the scalar tail.
+  for (int y = 0; y < height; y++) {
+    const int in_base = y * sstride, out_base = y * dstride;
+    int x = 0;
+    for (; x < simd_end; x += 8) {
+      const v64 bytes = v64_load_unaligned(&src[in_base + x]);
+      v128_store_unaligned(&dst[out_base + x], v128_unpack_u8_s16(bytes));
+    }
+    for (; x < width; x++) dst[out_base + x] = src[in_base + x];  // tail
+  }
+}
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 0000000000..e1e187c4a6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Expands CFL_SUBSAMPLE for the 32-wide AVX2 kernels of (sub, bd) and defines
+// the getter mapping a TX_SIZE to its subsampler (SSSE3 for widths below 32).
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+  CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+      TX_SIZE tx_size) { \
+    static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+      cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+      cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+      cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+      cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+      NULL, /* 64x64 (invalid CFL size) */ \
+      cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+      cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+      cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+      cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+      cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+      cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+      NULL, NULL, /* 32x64, 64x32 (invalid CFL sizes) */ \
+      cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+      cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+      cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+      cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+      NULL, NULL, /* 16x64, 64x16 (invalid CFL sizes) */ \
+    }; \
+    return subfn_##sub[tx_size % TX_SIZES_ALL]; /* index stays in bounds */ \
+  }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies the sum by 2, which gives a
+ * more precise version of box-filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: 32 input pixels per row give at most 16 subsampled outputs per row.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos
+ const int luma_stride = input_stride << 1; // Two input rows per output row
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; // height / 2 output rows
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); // 2 * (a + b) per horizontal pixel pair
+ __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+ __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); // 2 * (sum of each 2x2 block)
+
+ _mm256_storeu_si256(row, sum_16x16);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) // defines cfl_get_luma_subsampling_420_lbd_avx2()
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies the sum by 4, which gives a
+ * more precise version of box-filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256; // One output row per input row
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input); // 32 input pixels
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); // 4 * (a + b) per horizontal pixel pair
+ _mm256_storeu_si256(row, top_16x16); // 16 outputs
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) // defines cfl_get_luma_subsampling_422_lbd_avx2()
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on blocks of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256; // One output row per input row
+ const __m256i zeros = _mm256_setzero_si256();
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input); // 32 input pixels
+ top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); // Swap the middle 64-bit quarters: the unpacks below work per 128-bit half
+
+ __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); // Pixels 0..15 widened to 16 bits
+ row_lo = _mm256_slli_epi16(row_lo, 3); // * 8
+ __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); // Pixels 16..31 widened to 16 bits
+ row_hi = _mm256_slli_epi16(row_hi, 3); // * 8
+
+ _mm256_storeu_si256(row, row_lo);
+ _mm256_storeu_si256(row + 1, row_hi);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) // defines cfl_get_luma_subsampling_444_lbd_avx2()
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies the sum by 2, which gives a
+ * more precise version of box-filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: 32 input pixels per row give at most 16 subsampled outputs per row.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const int luma_stride = input_stride << 1; // Two input rows per output row
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; // height / 2 output rows
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input); // Pixels 0..15 of the top row
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+ __m256i sum = _mm256_add_epi16(top, bot); // Vertical sums, pixels 0..15
+
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); // Pixels 16..31 of the top row
+ __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+ __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); // Vertical sums, pixels 16..31
+
+ __m256i hsum = _mm256_hadd_epi16(sum, sum_1); // Horizontal pair sums, interleaved per 128-bit half
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); // Put the 16 sums back in pixel order
+ hsum = _mm256_add_epi16(hsum, hsum); // * 2
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) // defines cfl_get_luma_subsampling_420_hbd_avx2()
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies the sum by 4, which gives a
+ * more precise version of box-filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256; // One output row per input row
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input); // Pixels 0..15
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); // Pixels 16..31
+ __m256i hsum = _mm256_hadd_epi16(top, top_1); // Horizontal pair sums, interleaved per 128-bit half
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); // Put the 16 sums back in pixel order
+ hsum = _mm256_slli_epi16(hsum, 2); // * 4
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) // defines cfl_get_luma_subsampling_422_hbd_avx2()
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) { // 4:4:4, 16-bit input: multiplies each pixel by 8 (Q3 scaling), no averaging
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256; // One output row per input row
+ do { // The body always runs at least once
+ __m256i top = _mm256_loadu_si256((__m256i *)input); // Pixels 0..15
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); // Pixels 16..31
+ _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); // * 8
+ _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); // * 8
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) // defines cfl_get_luma_subsampling_444_hbd_avx2()
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+ __m256i alpha_sign, __m256i dc_q0) { // Returns dc_q0 + scaled AC for 16 values, without clamping
+ __m256i ac_q3 = _mm256_loadu_si256(input); // 16 signed 16-bit AC values
+ __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); // alpha_sign negated where ac_q3 < 0, zeroed where ac_q3 == 0
+ __m256i scaled_luma_q0 =
+ _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); // Rounded (|ac_q3| * alpha_q12) >> 15
+ scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); // Re-apply the combined sign of alpha and ac
+ return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) { // 8-bit CfL prediction for 32-wide blocks, written in place over dst
+ (void)width; // Unused: every row is processed as 32 pixels
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); // |alpha_q3| << 9 (Q3 -> Q12)
+ const __m256i dc_q0 = _mm256_set1_epi16(*dst); // Broadcast of the first dst pixel, read before any store; presumably the DC prediction - TODO confirm
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+ do { // The body always runs at least once
+ __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); // Pixels 0..15
+ __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); // Pixels 16..31
+ res = _mm256_packus_epi16(res, next); // Saturate to 0..255; packs per 128-bit half
+ res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); // Restore pixel order after the per-half pack
+ _mm256_storeu_si256((__m256i *)dst, res);
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd) // CFL_PREDICT_X is defined outside this file; presumably emits cfl_predict_lbd_32x8_avx2
+CFL_PREDICT_X(avx2, 32, 16, lbd)
+CFL_PREDICT_X(avx2, 32, 32, lbd)
+
+cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { // Returns the 8-bit predictor for tx_size; NULL for sizes marked invalid below
+ static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { // AVX2 for 32-wide blocks, SSSE3 otherwise
+ cfl_predict_lbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_lbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_lbd_16x16_ssse3, /* 16x16 */
+ cfl_predict_lbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_lbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_lbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_lbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_lbd_16x8_ssse3, /* 16x8 */
+ cfl_predict_lbd_16x32_ssse3, /* 16x32 */
+ cfl_predict_lbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_lbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_lbd_16x4_ssse3, /* 16x4 */
+ cfl_predict_lbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_lbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static __m256i highbd_max_epi16(int bd) {
+  const __m256i all_ones = _mm256_set1_epi16(-1);
+  const __m256i high_bits = _mm256_slli_epi16(all_ones, bd);  // -1 << bd
+  return _mm256_xor_si256(high_bits, all_ones);  // ~(-1 << bd) == (1 << bd) - 1
+}
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { // Per signed 16-bit element: clamp u to at most max, then to at least zero
+ return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); // Upper bound first, then lower bound
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) { // 16-bit CfL prediction for 16- or 32-wide blocks, clamped to [0, (1 << bd) - 1], written over dst
+ // Use SSSE3 version for smaller widths
+ assert(width == 16 || width == 32);
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); // |alpha_q3| << 9 (Q3 -> Q12)
+ const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); // First 16 dst pixels, read before any store; presumably the DC prediction - TODO confirm
+ const __m256i max = highbd_max_epi16(bd);
+
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do { // The body always runs at least once
+ const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); // Pixels 0..15
+ _mm256_storeu_si256((__m256i *)dst,
+ highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+ if (width == 32) {
+ const __m256i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); // Pixels 16..31 reuse the same dc_q0 vector
+ _mm256_storeu_si256(
+ (__m256i *)(dst + 16),
+ highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd) // CFL_PREDICT_X is defined outside this file; presumably emits cfl_predict_hbd_16x4_avx2
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { // Returns the high bit-depth predictor for tx_size; NULL for sizes marked invalid below
+ static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { // AVX2 for 16- and 32-wide blocks, SSSE3 otherwise
+ cfl_predict_hbd_4x4_ssse3, /* 4x4 */
+ cfl_predict_hbd_8x8_ssse3, /* 8x8 */
+ cfl_predict_hbd_16x16_avx2, /* 16x16 */
+ cfl_predict_hbd_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_predict_hbd_4x8_ssse3, /* 4x8 */
+ cfl_predict_hbd_8x4_ssse3, /* 8x4 */
+ cfl_predict_hbd_8x16_ssse3, /* 8x16 */
+ cfl_predict_hbd_16x8_avx2, /* 16x8 */
+ cfl_predict_hbd_16x32_avx2, /* 16x32 */
+ cfl_predict_hbd_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_predict_hbd_4x16_ssse3, /* 4x16 */
+ cfl_predict_hbd_16x4_avx2, /* 16x4 */
+ cfl_predict_hbd_8x32_ssse3, /* 8x32 */
+ cfl_predict_hbd_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Horizontal add: every 32-bit element of the returned vector holds the sum
+// of the eight 32-bit elements of a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+  // Name the input elements a == [A, B, C, D | E, F, G, H]; '|' separates the
+  // two 128-bit halves, which _mm256_hadd_epi32 processes independently.
+  const __m256i pairs = _mm256_hadd_epi32(a, a);
+  // pairs == [A+B, C+D, A+B, C+D | E+F, G+H, E+F, G+H]
+  // Swap the middle 64-bit quarters so each half sees all four pair sums.
+  const __m256i mixed =
+      _mm256_permute4x64_epi64(pairs, _MM_SHUFFLE(3, 1, 2, 0));
+  // mixed == [A+B, C+D, E+F, G+H | A+B, C+D, E+F, G+H]
+  const __m256i quads = _mm256_hadd_epi32(mixed, mixed);
+  // quads == [A+B+C+D, E+F+G+H] repeated in all four 64-bit quarters, so one
+  // more horizontal add leaves the full sum in every element.
+  return _mm256_hadd_epi32(quads, quads);
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) { // Widening add: zero-extends the 16-bit elements of a to 32 bits and sums them pairwise into 8 partial sums
+ return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), // Low four 16-bit elements of each 128-bit half
+ _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); // High four 16-bit elements of each 128-bit half
+}
+
+// Subtracts the rounded block average from every value of a 16- or 32-wide
+// block: dst = src - ((sum(src) + round_offset) >> num_pel_log2).
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  // Use SSE2 version for smaller widths
+  assert(width == 16 || width == 32);
+
+  const __m256i *src = (__m256i *)src_ptr;
+  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+  // Sum two rows per iteration to make better use of the AVX2 registers.
+  const int step = 2 * CFL_BUF_LINE_I256;
+
+  __m256i sum = _mm256_setzero_si256();
+  // Second accumulator, used only for width 32 to shorten the dependency
+  // chain. Zeroed unconditionally so it can never be read uninitialized.
+  __m256i sum2 = _mm256_setzero_si256();
+
+  do {
+    // Add top row to the bottom row
+    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+    if (width == 32) { /* Expected to fold away when width is a constant. */
+      // Add the second part of the top row to the second part of the bottom row
+      __m256i l1 =
+          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+    }
+    src += step;
+  } while (src < end);
+  // Combine both sum accumulators
+  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+  __m256i fill = fill_sum_epi32(sum);
+
+  __m256i avg_epi16 = _mm256_srli_epi32(
+      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+  // Store and subtract loop
+  src = (__m256i *)src_ptr;
+  __m256i *dst = (__m256i *)dst_ptr;
+  do {
+    _mm256_storeu_si256(dst,
+                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+    if (width == 32) {
+      _mm256_storeu_si256(
+          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+    }
+    src += CFL_BUF_LINE_I256;
+    dst += CFL_BUF_LINE_I256;
+  } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { // Returns the subtract-average function for tx_size; NULL for sizes marked invalid below
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { // AVX2 for 16- and 32-wide blocks, SSE2 otherwise
+ cfl_subtract_average_4x4_sse2, /* 4x4 */
+ cfl_subtract_average_8x8_sse2, /* 8x8 */
+ cfl_subtract_average_16x16_avx2, /* 16x16 */
+ cfl_subtract_average_32x32_avx2, /* 32x32 */
+ NULL, /* 64x64 (invalid CFL size) */
+ cfl_subtract_average_4x8_sse2, /* 4x8 */
+ cfl_subtract_average_8x4_sse2, /* 8x4 */
+ cfl_subtract_average_8x16_sse2, /* 8x16 */
+ cfl_subtract_average_16x8_avx2, /* 16x8 */
+ cfl_subtract_average_16x32_avx2, /* 16x32 */
+ cfl_subtract_average_32x16_avx2, /* 32x16 */
+ NULL, /* 32x64 (invalid CFL size) */
+ NULL, /* 64x32 (invalid CFL size) */
+ cfl_subtract_average_4x16_sse2, /* 4x16 */
+ cfl_subtract_average_16x4_avx2, /* 16x4 */
+ cfl_subtract_average_8x32_sse2, /* 8x32 */
+ cfl_subtract_average_32x8_avx2, /* 32x8 */
+ NULL, /* 16x64 (invalid CFL size) */
+ NULL, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+ // index the function pointer array out of bounds.
+ return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 0000000000..03ae02a922
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride,
+ uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type,
+ int input_stride, uint16_t *output_q3);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// SSE2 version is optimal for width == 4, we reuse them in AVX2
+void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for width == 8, we reuse them in AVX2
+void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 0000000000..4783fe098c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+// Horizontal add of the four 32-bit lanes of l0. Every lane of the returned
+// vector holds the same total.
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+  // Exchange the two 64-bit halves so that lane i is paired with lane i ^ 2.
+  const __m128i halves_swapped =
+      _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2));
+  const __m128i pair_sums = _mm_add_epi32(l0, halves_swapped);
+  // Exchange neighbouring lanes so that lane i is paired with lane i ^ 1.
+  const __m128i neighbours_swapped =
+      _mm_shuffle_epi32(pair_sums, _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_add_epi32(pair_sums, neighbours_swapped);
+}
+
+// Subtracts the rounded average of a width x height block from each of its
+// samples.
+//
+// src_ptr: input samples; consecutive rows are CFL_BUF_LINE_I128 __m128i
+// apart.
+// dst_ptr: receives (sample - average), using the same row layout as src_ptr.
+// width: 4 and 8 have dedicated paths; any other value is processed as 16
+// samples per row, plus a further 16 when width == 32.
+// height: number of rows in the block.
+// round_offset, num_pel_log2: the average is computed as
+// (sum + round_offset) >> num_pel_log2.
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+ const __m128i *src = (__m128i *)src_ptr;
+ const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+ // Rows consumed per iteration of the summing loop: 4 when width == 4,
+ // 2 when width == 8, otherwise 1.
+ const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
+
+ // Pass 1: accumulate every sample of the block into four 32-bit lanes.
+ // Vectors are first added pairwise in 16 bits and then widened to 32 bits by
+ // interleaving with zeros.
+ // NOTE(review): assumes the sum of two samples fits in 16 bits, and that
+ // height is a multiple of the rows consumed per step -- confirm against the
+ // callers.
+ __m128i sum = zeros;
+ do {
+ __m128i l0;
+ if (width == 4) {
+ // Four rows of four samples each (low 64 bits of each row).
+ l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+ _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+ __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+ _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpacklo_epi16(l1, zeros)));
+ } else {
+ if (width == 8) {
+ // Two rows of eight samples each.
+ l0 = _mm_add_epi16(_mm_loadu_si128(src),
+ _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+ } else {
+ // One row: samples 0-7 plus samples 8-15.
+ l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+ }
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ if (width == 32) {
+ // Samples 16-31 of the same row.
+ l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ }
+ }
+ src += step;
+ } while (src < end);
+
+ // Reduce the four partial sums; afterwards every 32-bit lane holds the total.
+ sum = fill_sum_epi32(sum);
+
+ // Rounded average, then narrowed to 16 bits. Since all 32-bit lanes are
+ // equal, all eight 16-bit lanes of avg_epi16 hold the average.
+ __m128i avg_epi16 =
+ _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+ avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+ // Pass 2: walk the block again, one row per iteration, writing
+ // sample - average to the destination.
+ src = (__m128i *)src_ptr;
+ __m128i *dst = (__m128i *)dst_ptr;
+ do {
+ if (width == 4) {
+ _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+ } else {
+ _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+ if (width > 8) {
+ _mm_storeu_si128(dst + 1,
+ _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+ if (width == 32) {
+ _mm_storeu_si128(dst + 2,
+ _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+ _mm_storeu_si128(dst + 3,
+ _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+ }
+ }
+ }
+ src += CFL_BUF_LINE_I128;
+ dst += CFL_BUF_LINE_I128;
+ } while (src < end);
+}
+
+// NOTE(review): CFL_SUB_AVG_FN is not defined in this file (presumably it
+// comes from av1/common/cfl.h) and is expected to generate the exported
+// subtract-average entry points built on subtract_average_sse2 -- confirm
+// against the macro definition.
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 0000000000..476b6609a9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst. The other
+// elements of the result are zero.
+//
+// The four bytes are copied with memcpy rather than read through an int
+// pointer: callers pass pixel buffers (uint8_t / uint16_t) that are not
+// guaranteed to be int-aligned, and accessing them as int would also break
+// the aliasing rules.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+  int value;
+  memcpy(&value, mem_addr, sizeof(value));
+  return _mm_cvtsi32_si128(value);
+}
+
+// Store 32-bit integer from the first element of a into memory.
+//
+// The four bytes are written with memcpy rather than through an int pointer:
+// callers pass pixel buffers (uint8_t / uint16_t) that are not guaranteed to
+// be int-aligned, and accessing them as int would also break the aliasing
+// rules.
+// NOTE(review): the pointee is written although the parameter is
+// const-qualified; the qualifier is kept only so the signature stays
+// unchanged.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+  const int value = _mm_cvtsi128_si32(a);
+  memcpy((void *)mem_addr, &value, sizeof(value));
+}
+
+/**
+ * 4:2:0 luma subsampling for 8-bit input. Each output sample is the sum of a
+ * 2x2 group of luma pixels multiplied by 2, which is the box-filter average
+ * of the group expressed in Q3.
+ *
+ * width and height are given in luma pixels; every pair of input rows
+ * produces one row of the prediction buffer, and buffer rows are
+ * CFL_BUF_LINE_I128 vectors apart.
+ *
+ * Note: going over the active area is harmless as long as the accesses stay
+ * inside the CfL prediction buffer (always CFL_BUF_SQUARE in size).
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i weight2 = _mm_set1_epi8(2);
+  const int two_rows = input_stride << 1;
+  __m128i *out = (__m128i *)pred_buf_q3;
+  const __m128i *const out_end = out + (height >> 1) * CFL_BUF_LINE_I128;
+  do {
+    const __m128i *const upper = (const __m128i *)input;
+    const __m128i *const lower = (const __m128i *)(input + input_stride);
+    // maddubs multiplies each byte by 2 and adds horizontal neighbours; the
+    // vertical add then completes the 2x2 sum.
+    if (width == 4) {
+      const __m128i hi = _mm_maddubs_epi16(_mm_loadh_epi32(upper), weight2);
+      const __m128i lo = _mm_maddubs_epi16(_mm_loadh_epi32(lower), weight2);
+      _mm_storeh_epi32(out, _mm_add_epi16(hi, lo));
+    } else if (width == 8) {
+      const __m128i hi = _mm_maddubs_epi16(_mm_loadl_epi64(upper), weight2);
+      const __m128i lo = _mm_maddubs_epi16(_mm_loadl_epi64(lower), weight2);
+      _mm_storel_epi64(out, _mm_add_epi16(hi, lo));
+    } else {
+      const __m128i hi = _mm_maddubs_epi16(_mm_loadu_si128(upper), weight2);
+      const __m128i lo = _mm_maddubs_epi16(_mm_loadu_si128(lower), weight2);
+      _mm_storeu_si128(out, _mm_add_epi16(hi, lo));
+      if (width == 32) {
+        const __m128i hi_1 =
+            _mm_maddubs_epi16(_mm_loadu_si128(upper + 1), weight2);
+        const __m128i lo_1 =
+            _mm_maddubs_epi16(_mm_loadu_si128(lower + 1), weight2);
+        _mm_storeu_si128(out + 1, _mm_add_epi16(hi_1, lo_1));
+      }
+    }
+    input += two_rows;
+    out += CFL_BUF_LINE_I128;
+  } while (out < out_end);
+}
+
+/**
+ * 4:2:2 luma subsampling for 8-bit input. Each output sample is the sum of
+ * two horizontally adjacent luma pixels multiplied by 4, which is the
+ * box-filter average of the pair expressed in Q3.
+ *
+ * width and height are given in luma pixels; every input row produces one
+ * row of the prediction buffer, and buffer rows are CFL_BUF_LINE_I128
+ * vectors apart.
+ *
+ * Note: going over the active area is harmless as long as the accesses stay
+ * inside the CfL prediction buffer (always CFL_BUF_SQUARE in size).
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i weight4 = _mm_set1_epi8(4);
+  __m128i *out = (__m128i *)pred_buf_q3;
+  const __m128i *const out_end = out + height * CFL_BUF_LINE_I128;
+  do {
+    const __m128i *const in = (const __m128i *)input;
+    // maddubs multiplies each byte by 4 and adds horizontal neighbours.
+    if (width == 4) {
+      _mm_storeh_epi32(out, _mm_maddubs_epi16(_mm_loadh_epi32(in), weight4));
+    } else if (width == 8) {
+      _mm_storel_epi64(out, _mm_maddubs_epi16(_mm_loadl_epi64(in), weight4));
+    } else {
+      _mm_storeu_si128(out, _mm_maddubs_epi16(_mm_loadu_si128(in), weight4));
+      if (width == 32) {
+        _mm_storeu_si128(out + 1,
+                         _mm_maddubs_epi16(_mm_loadu_si128(in + 1), weight4));
+      }
+    }
+    input += input_stride;
+    out += CFL_BUF_LINE_I128;
+  } while (out < out_end);
+}
+
+/**
+ * 4:4:4 luma "subsampling" for 8-bit input: no pixels are merged, each one is
+ * widened to 16 bits and multiplied by 8 (scaling to Q3).
+ *
+ * width and height are given in luma pixels; every input row produces one
+ * row of the prediction buffer, and buffer rows are CFL_BUF_LINE_I128
+ * vectors apart.
+ *
+ * Note: going over the active area is harmless as long as the accesses stay
+ * inside the CfL prediction buffer (always CFL_BUF_SQUARE in size).
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i *out = (__m128i *)pred_buf_q3;
+  const __m128i *const out_end = out + height * CFL_BUF_LINE_I128;
+  do {
+    const __m128i *const in = (const __m128i *)input;
+    if (width == 4) {
+      const __m128i px = _mm_unpacklo_epi8(_mm_loadh_epi32(in), zero);
+      _mm_storel_epi64(out, _mm_slli_epi16(px, 3));
+    } else if (width == 8) {
+      const __m128i px = _mm_unpacklo_epi8(_mm_loadl_epi64(in), zero);
+      _mm_storeu_si128(out, _mm_slli_epi16(px, 3));
+    } else {
+      // Sixteen pixels: widen the low and high eight bytes separately.
+      const __m128i bytes = _mm_loadu_si128(in);
+      _mm_storeu_si128(out,
+                       _mm_slli_epi16(_mm_unpacklo_epi8(bytes, zero), 3));
+      _mm_storeu_si128(out + 1,
+                       _mm_slli_epi16(_mm_unpackhi_epi8(bytes, zero), 3));
+      if (width == 32) {
+        const __m128i bytes_1 = _mm_loadu_si128(in + 1);
+        _mm_storeu_si128(out + 2,
+                         _mm_slli_epi16(_mm_unpacklo_epi8(bytes_1, zero), 3));
+        _mm_storeu_si128(out + 3,
+                         _mm_slli_epi16(_mm_unpackhi_epi8(bytes_1, zero), 3));
+      }
+    }
+    input += input_stride;
+    out += CFL_BUF_LINE_I128;
+  } while (out < out_end);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/**
+ * 4:2:0 luma subsampling for high-bitdepth input. Adds 4 samples (in a 2x2
+ * grid) and multiplies them by 2, resulting in a more precise version of a
+ * box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * width and height are given in luma samples; every pair of input rows
+ * produces one row of pred_buf_q3, and buffer rows are CFL_BUF_LINE samples
+ * apart.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const __m128i top = _mm_loadl_epi64((__m128i *)input);
+      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+      __m128i sum = _mm_add_epi16(top, bot);
+      sum = _mm_hadd_epi16(sum, sum);
+      // Two output samples (32 bits): go through the file's shared 32-bit
+      // store helper, as the 4:2:2 variant does, instead of an open-coded
+      // int-pointer cast.
+      _mm_storeh_epi32((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+    } else {
+      const __m128i top = _mm_loadu_si128((__m128i *)input);
+      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      __m128i sum = _mm_add_epi16(top, bot);
+      if (width == 8) {
+        sum = _mm_hadd_epi16(sum, sum);
+        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+      } else {
+        // Sixteen luma samples: the horizontal add merges the vertical sums
+        // of samples 0-7 and 8-15 into eight outputs.
+        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i bot_1 =
+            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+        if (width == 32) {
+          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+          const __m128i bot_2 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+          const __m128i bot_3 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+                           _mm_add_epi16(next_sum, next_sum));
+        }
+      }
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * 4:2:2 luma subsampling for high-bitdepth input. Each output sample is the
+ * sum of two horizontally adjacent luma samples multiplied by 4, which is the
+ * box-filter average of the pair expressed in Q3.
+ *
+ * width and height are given in luma samples; every input row produces one
+ * row of the prediction buffer, and buffer rows are CFL_BUF_LINE_I128
+ * vectors apart.
+ *
+ * Note: going over the active area is harmless as long as the accesses stay
+ * inside the CfL prediction buffer (always CFL_BUF_SQUARE in size).
+ */
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  __m128i *out = (__m128i *)pred_buf_q3;
+  const __m128i *const out_end = out + height * CFL_BUF_LINE_I128;
+  do {
+    const __m128i *const in = (const __m128i *)input;
+    if (width == 4) {
+      const __m128i px = _mm_loadl_epi64(in);
+      _mm_storeh_epi32(out, _mm_slli_epi16(_mm_hadd_epi16(px, px), 2));
+    } else {
+      const __m128i px = _mm_loadu_si128(in);
+      if (width == 8) {
+        _mm_storel_epi64(out, _mm_slli_epi16(_mm_hadd_epi16(px, px), 2));
+      } else {
+        // Sixteen samples in, eight pair sums out.
+        const __m128i px_1 = _mm_loadu_si128(in + 1);
+        _mm_storeu_si128(out, _mm_slli_epi16(_mm_hadd_epi16(px, px_1), 2));
+        if (width == 32) {
+          const __m128i px_2 = _mm_loadu_si128(in + 2);
+          const __m128i px_3 = _mm_loadu_si128(in + 3);
+          _mm_storeu_si128(out + 1,
+                           _mm_slli_epi16(_mm_hadd_epi16(px_2, px_3), 2));
+        }
+      }
+    }
+    out += CFL_BUF_LINE_I128;
+    input += input_stride;
+  } while (out < out_end);
+}
+
+/**
+ * 4:4:4 luma "subsampling" for high-bitdepth input: no samples are merged,
+ * each one is multiplied by 8 (scaling to Q3).
+ *
+ * width and height are given in luma samples; every input row produces one
+ * row of pred_buf_q3, and buffer rows are CFL_BUF_LINE samples apart.
+ */
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *const buf_end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    const __m128i *const in = (const __m128i *)input;
+    __m128i *const out = (__m128i *)pred_buf_q3;
+    if (width == 4) {
+      _mm_storel_epi64(out, _mm_slli_epi16(_mm_loadl_epi64(in), 3));
+    } else {
+      _mm_storeu_si128(out, _mm_slli_epi16(_mm_loadu_si128(in), 3));
+      if (width >= 16) {
+        _mm_storeu_si128(out + 1, _mm_slli_epi16(_mm_loadu_si128(in + 1), 3));
+        if (width == 32) {
+          _mm_storeu_si128(out + 2,
+                           _mm_slli_epi16(_mm_loadu_si128(in + 2), 3));
+          _mm_storeu_si128(out + 3,
+                           _mm_slli_epi16(_mm_loadu_si128(in + 3), 3));
+        }
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < buf_end);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// NOTE(review): CFL_GET_SUBSAMPLE_FUNCTION is not defined in this file
+// (presumably it comes from av1/common/cfl.h) and is expected to generate the
+// exported getters/wrappers around the cfl_luma_subsampling_*_ssse3 functions
+// above -- confirm against the macro definition.
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
+
+// Loads eight 16-bit AC values from input, scales them by alpha and adds the
+// DC value. No clamping is applied.
+//
+// alpha_q12 holds |alpha| in Q12 and alpha_sign carries the sign of alpha in
+// every lane. The multiplication is done on magnitudes and the sign of the
+// product is applied afterwards.
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+                                        __m128i alpha_sign, __m128i dc_q0) {
+  const __m128i ac = _mm_loadu_si128(input);
+  // Sign of alpha * ac per lane (zero wherever ac is zero).
+  const __m128i product_sign = _mm_sign_epi16(alpha_sign, ac);
+  // mulhrs keeps the rounded product shifted right by 15 (Q3 * Q12 -> Q0).
+  const __m128i magnitude = _mm_mulhrs_epi16(_mm_abs_epi16(ac), alpha_q12);
+  const __m128i scaled = _mm_sign_epi16(magnitude, product_sign);
+  return _mm_add_epi16(scaled, dc_q0);
+}
+
+// CfL prediction for 8-bit output: for each of the width x height samples,
+// writes clip_to_u8(dc + alpha * ac) to dst.
+//
+// pred_buf_q3: AC values, rows CFL_BUF_LINE_I128 vectors apart.
+// dst: output block; its first pixel is read up front and used as the DC
+// value for the whole block.
+// alpha_q3: scaling factor; shifted left by 9 to obtain the Q12 magnitude.
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+                                         uint8_t *dst, int dst_stride,
+                                         int alpha_q3, int width, int height) {
+  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  const __m128i dc_q0 = _mm_set1_epi16(*dst);
+  const __m128i *ac = (const __m128i *)pred_buf_q3;
+  const __m128i *const ac_end = ac + height * CFL_BUF_LINE_I128;
+  do {
+    __m128i *const out = (__m128i *)dst;
+    const __m128i px_0 = predict_unclipped(ac, alpha_q12, alpha_sign, dc_q0);
+    if (width >= 16) {
+      // packus saturates to [0, 255] while narrowing to bytes.
+      const __m128i px_1 =
+          predict_unclipped(ac + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128(out, _mm_packus_epi16(px_0, px_1));
+      if (width == 32) {
+        const __m128i px_2 =
+            predict_unclipped(ac + 2, alpha_q12, alpha_sign, dc_q0);
+        const __m128i px_3 =
+            predict_unclipped(ac + 3, alpha_q12, alpha_sign, dc_q0);
+        _mm_storeu_si128(out + 1, _mm_packus_epi16(px_2, px_3));
+      }
+    } else {
+      const __m128i packed = _mm_packus_epi16(px_0, px_0);
+      if (width == 4) {
+        _mm_storeh_epi32(out, packed);
+      } else {
+        _mm_storel_epi64(out, packed);
+      }
+    }
+    dst += dst_stride;
+    ac += CFL_BUF_LINE_I128;
+  } while (ac < ac_end);
+}
+
+// NOTE(review): CFL_PREDICT_FN is not defined in this file (presumably it
+// comes from av1/common/cfl.h) and is expected to generate the per-size
+// cfl_predict_lbd_*_ssse3 entry points built on cfl_predict_lbd_ssse3 --
+// confirm against the macro definition.
+CFL_PREDICT_FN(ssse3, lbd)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Returns a vector whose eight 16-bit lanes all hold (1 << bd) - 1, the
+// largest sample value at bit depth bd.
+static INLINE __m128i highbd_max_epi16(int bd) {
+  const __m128i all_ones = _mm_set1_epi16(-1);
+  // (-1 << bd) has every bit set except the low bd bits; flipping all bits
+  // leaves exactly the low bd bits set, i.e. (1 << bd) - 1.
+  const __m128i high_bits = _mm_slli_epi16(all_ones, bd);
+  return _mm_xor_si128(high_bits, all_ones);
+}
+
+// Clamps each signed 16-bit lane of u to the range [zero, max]: the upper
+// bound is applied first, then the lower bound.
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+  const __m128i capped = _mm_min_epi16(u, max);
+  return _mm_max_epi16(capped, zero);
+}
+
+// CfL prediction for high-bitdepth output: for each of the width x height
+// samples, writes clamp(dc + alpha * ac, 0, (1 << bd) - 1) to dst.
+//
+// pred_buf_q3: AC values, rows CFL_BUF_LINE_I128 vectors apart.
+// dst: output block; its first sample is read up front and used as the DC
+// value for the whole block.
+// alpha_q3: scaling factor; shifted left by 9 to obtain the Q12 magnitude.
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+                                         uint16_t *dst, int dst_stride,
+                                         int alpha_q3, int bd, int width,
+                                         int height) {
+  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  const __m128i dc_q0 = _mm_set1_epi16(*dst);
+  const __m128i pixel_max = highbd_max_epi16(bd);
+  const __m128i pixel_min = _mm_setzero_si128();
+  const __m128i *ac = (const __m128i *)pred_buf_q3;
+  const __m128i *const ac_end = ac + height * CFL_BUF_LINE_I128;
+  do {
+    // Eight 16-bit samples per vector, so out + k addresses dst + 8 * k.
+    __m128i *const out = (__m128i *)dst;
+    const __m128i px_0 = highbd_clamp_epi16(
+        predict_unclipped(ac, alpha_q12, alpha_sign, dc_q0), pixel_min,
+        pixel_max);
+    if (width == 4) {
+      _mm_storel_epi64(out, px_0);
+    } else {
+      _mm_storeu_si128(out, px_0);
+    }
+    if (width >= 16) {
+      const __m128i px_1 =
+          predict_unclipped(ac + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128(out + 1,
+                       highbd_clamp_epi16(px_1, pixel_min, pixel_max));
+    }
+    if (width == 32) {
+      const __m128i px_2 =
+          predict_unclipped(ac + 2, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128(out + 2,
+                       highbd_clamp_epi16(px_2, pixel_min, pixel_max));
+      const __m128i px_3 =
+          predict_unclipped(ac + 3, alpha_q12, alpha_sign, dc_q0);
+      _mm_storeu_si128(out + 3,
+                       highbd_clamp_epi16(px_3, pixel_min, pixel_max));
+    }
+    dst += dst_stride;
+    ac += CFL_BUF_LINE_I128;
+  } while (ac < ac_end);
+}
+
+// NOTE(review): CFL_PREDICT_FN is not defined in this file (presumably it
+// comes from av1/common/cfl.h) and is expected to generate the per-size
+// cfl_predict_hbd_*_ssse3 entry points built on cfl_predict_hbd_ssse3 --
+// confirm against the macro definition.
+CFL_PREDICT_FN(ssse3, hbd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 0000000000..1b39a0a8d5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/SVT-AV1/convolve_2d_avx2.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/convolve.h"
+
+// General 2D (horizontal then vertical) single-reference convolution for
+// 8-bit pixels (bd is fixed to 8). The block is processed in column strips of
+// 8 pixels; each strip is filtered horizontally into im_block and then
+// vertically into dst.
+//
+// Two paths are selected on filter_params_x->taps: more than 8 taps uses the
+// 12-tap macros for both directions, otherwise the 4/6/8-tap macro is chosen
+// per direction from get_filter_tap().
+//
+// NOTE(review): the CONVOLVE_SR_* macros are not defined in this file
+// (presumably they come from the included convolve_2d_avx2.h /
+// convolve_avx2.h headers). Locals that look unused here (round constants,
+// shifts, coeffs_*, filt, im_block, im_stride, im_h, src_ptr, i, j) are
+// presumably consumed by those macros by name -- do not rename them without
+// checking the macro bodies.
+void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ // 12-tap path. Only the horizontal filter's tap count is tested here; the
+ // vertical filter is treated as 12-tap as well.
+ const int bd = 8;
+ int im_stride = 8, i;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ // Horizontal stage: rounding term for the round_0 shift plus an offset of
+ // 1 << (bd + FILTER_BITS - 1).
+ const __m256i round_const_h12 = _mm256_set1_epi32(
+ ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
+
+ // Vertical stage, first shift (round_1) with the 1 << offset_bits offset.
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Vertical stage, final shift by `bits`; the constant also subtracts the
+ // offsets of the form 1 << (offset_bits - round_1) and half of it.
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
+
+ int horiz_tap = 12;
+ int vert_tap = 12;
+
+ prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
+ prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
+
+ // The intermediate block needs vert_tap - 1 extra rows; src_ptr is moved
+ // up and left so that it addresses the first tap of the first output.
+ int im_h = h + vert_tap - 1;
+ const int fo_vert = vert_tap / 2 - 1;
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ for (int j = 0; j < w; j += 8) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
+ CONVOLVE_SR_VERTICAL_FILTER_12TAP
+ }
+ } else {
+ // Up-to-8-tap path.
+ const int bd = 8;
+ int im_stride = 8, i;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ // Horizontal stage works on 16-bit lanes with a shift of round_0 - 1 and a
+ // correspondingly halved rounding term and offset.
+ const __m256i round_const_h =
+ _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
+ (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ // NOTE(review): these two calls are repeated (or replaced by the 6-tap
+ // variants) a few lines below. They look redundant, unless the 6-tap
+ // variants leave part of the arrays unwritten -- confirm before removing.
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
+ int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
+
+ if (horiz_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ else
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+ else
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ // The intermediate block needs vert_tap - 1 extra rows; src_ptr is moved
+ // up and left so that it addresses the first tap of the first output.
+ int im_h = h + vert_tap - 1;
+ const int fo_vert = vert_tap / 2 - 1;
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Aligned loads of the global tables filt1..filt4_global_avx2 (declared
+ // elsewhere).
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ for (int j = 0; j < w; j += 8) {
+ // Any horizontal tap count other than 4 or 6 uses the 8-tap macro.
+ if (horiz_tap == 4) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+ } else if (horiz_tap == 6) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
+ }
+
+ // Likewise for the vertical direction.
+ if (vert_tap == 4) {
+ CONVOLVE_SR_VERTICAL_FILTER_4TAP
+ } else if (vert_tap == 6) {
+ CONVOLVE_SR_VERTICAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP
+ }
+ }
+ }
+}
+
+// Entry point for the AVX2 2D single-reference convolution. Queries the
+// effective tap count of both filters and forwards all arguments unchanged:
+// to the specialized implementation when neither filter is 12-tap, to the
+// general implementation otherwise.
+void av1_convolve_2d_sr_avx2(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
+    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
+  const int32_t taps_horiz = get_filter_tap(filter_params_x, subpel_x_q4);
+  const int32_t taps_vert = get_filter_tap(filter_params_y, subpel_y_q4);
+
+  if (taps_horiz != 12 && taps_vert != 12) {
+    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+                                        filter_params_x, filter_params_y,
+                                        subpel_x_q4, subpel_y_q4, conv_params);
+    return;
+  }
+
+  av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                  filter_params_x, filter_params_y,
+                                  subpel_x_q4, subpel_y_q4, conv_params);
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
new file mode 100644
index 0000000000..1b85f37294
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+// 2D single-reference convolution with 12-tap filters for 8-bit pixels
+// (bd is fixed to 8), SSE2 version.
+//
+// Stage 1 filters im_h = h + taps_y - 1 source rows horizontally into the
+// 16-bit intermediate buffer im_block (row stride w). Stage 2 filters
+// im_block vertically, rounds, and stores 8-bit results to dst.
+//
+// Both stages handle 8 columns per iteration.
+// NOTE(review): this presumably requires w to be a multiple of 8 (stores of
+// 8 values per iteration, and stage 2 dereferences im_block rows as aligned
+// __m128i) -- confirm against the callers.
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ int i, j;
+ // Move the source pointer up/left so it addresses the first tap of the
+ // first output sample.
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+ __m128i coeffs[6];
+
+ /* Horizontal filter */
+ {
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Two overlapping 16-byte loads (offsets j and j + 4) cover the 19
+ // source bytes needed for 8 outputs of a 12-tap filter.
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data_2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
+
+ // Filter even-index pixels
+ // src_N holds the pixels starting at byte offset N, widened to 16 bits;
+ // each madd applies one pair of taps to output pixels 0, 2, 4, 6.
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i src_8 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i src_10 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ // Same scheme shifted by one byte: output pixels 1, 3, 5, 7.
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i src_9 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i src_11 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
+
+ // First rounding: shift by round_1 with the 1 << offset_bits offset added.
+ const __m128i sum_round =
+ _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Second rounding: shift by `bits`; the constant also subtracts the
+ // offsets of the form 1 << (offset_bits - round_1) and half of it.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ // Because stage 1 stored columns as 0, 2, 4, 6, 1, 3, 5, 7, the low
+ // halves of the rows hold the even columns. Each unpack interleaves
+ // two consecutive rows so that one madd applies a pair of taps.
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+ const __m128i src_8 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_10 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
+
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
+
+ // Filter odd-index pixels
+ // The high halves of the rows hold the odd columns.
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+ const __m128i src_9 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_11 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
+
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits with
+ // unsigned saturation, which clamps the result to [0, 255].
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ // Store the 8 output pixels to the destination buffer (overwrite, no
+ // accumulation with the previous contents).
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {  // 2-D filter writing 8-bit pixels to dst: >8-tap x filters are dispatched, otherwise a horizontal pass into im_block is followed by a vertical pass
+ if (filter_params_x->taps > 8) {  // only the x filter's tap count selects this branch
+ if (w < 8) {  // narrow blocks are handled by the C implementation
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ } else {
+ av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params);
+ }
+ } else {
+ const int bd = 8;  // bit depth used by the offset and rounding constants below
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);  // 16-bit rows produced by the horizontal pass
+ int im_h = h + filter_params_y->taps - 1;  // output rows plus the extra rows the vertical filter reads
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // back up by fo_vert rows and fo_horiz columns
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);  // loads eight 16-bit coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));  // offset 1 << (bd + FILTER_BITS - 1) plus the round_0 rounding term
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {  // 8 output columns per iteration, computed from one 16-byte load
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);  // loads eight 16-bit coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i sum_round = _mm_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));  // offset 1 << offset_bits plus the round_1 rounding term
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m128i round_const = _mm_set1_epi32(  // NOTE(review): the subtracted terms presumably cancel the offsets added by the horizontal round_const and by sum_round - confirm
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {  // 8 columns per iteration, read from 8 consecutive im_block rows
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);  // saturating 32 -> 16 bit pack
+ const __m128i res = _mm_packus_epi16(res16, res16);  // saturating 16 -> unsigned 8 bit pack
+
+ // Store the 8-bit results into the destination buffer (overwrites dst)
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {  // write only the low 2 bytes
+ *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+ } else if (w == 4) {  // write only the low 4 bytes
+ *(int *)p = _mm_cvtsi128_si32(res);
+ } else {  // write the low 8 bytes
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w,
+ int h, ConvolveParams *conv_params) {  // Unfiltered copy: shifts 8-bit src up and either stores it (plus offset) in conv_params->dst or averages it with that buffer into dst0
+ const int bd = 8;  // bit depth used to build the offset below
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer, accessed 8 elements per 128-bit load/store
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ int i, j;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);  // interleaved 16-bit pairs w0, w1, w0, w1, ...
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);  // added to every shifted pixel before it is stored or averaged
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert((w % 4) == 0);
+
+ if (!(w % 16)) {  // width is a multiple of 16: process 16 pixels per iteration
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+ const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);  // zero-extend the low 8 bytes to 16 bits
+ const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);  // zero-extend the high 8 bytes to 16 bits
+
+ const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+ const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+ const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+ const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
+ if (do_average) {  // combine with the values already in dst and write 8-bit pixels to dst0
+ const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+ const __m128i data_ref_0_hi =
+ _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+ const __m128i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 =
+ _mm_packus_epi16(round_result_lo, round_result_hi);
+
+ _mm_store_si128((__m128i *)(&dst0[j]), res_8);  // NOTE(review): aligned store - assumes &dst0[j] is 16-byte aligned; confirm with callers
+ } else {  // no averaging: write the offset 16-bit values to dst
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);  // NOTE(review): aligned stores - assume dst rows are 16-byte aligned; confirm
+ _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ } else {  // other widths: load 8 source bytes per iteration
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+ const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+ const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+ if (do_average) {  // combine with the values already in dst and write 8-bit pixels to dst0
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)  // write 8 output bytes
+ _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+ else  // w <= 4: write only the low 4 output bytes
+ *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
+ } else {  // no averaging: write all eight offset 16-bit values to dst, whatever w is
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);  // NOTE(review): aligned store - assumes &dst[j] is 16-byte aligned; confirm
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 0000000000..3862bbeac1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {  // Vertical-only filter writing 8-bit pixels to dst; separate paths for 4, 6, 12 and the remaining tap counts
+ // right shift is F-1 because we are already dividing
+ // filter coefficients by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+
+ __m256i coeffs[6], s[12];  // s[] holds the interleaved row pairs that form the sliding filter window
+ __m128i d[10];
+
+ int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ else if (vert_tap == 12) {
+ prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
+ } else {
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+ }
+
+ // vert_filt as 4 tap
+ if (vert_tap == 4) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (int j = 0; j < w; j += 16) {  // up to 16 columns per pass
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {  // two output rows (i and i + 1) per iteration
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));  // d[4] is reused: it feeds src_45a in the next iteration
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);  // coefficients are passed starting at index 1
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {  // more than 8 columns remain: also filter the high halves and store 16 bytes per row
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {  // store 8 bytes per row
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {  // store 2 bytes per row
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];  // slide the window: drop the oldest row pair in each half
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else if (vert_tap == 6) {
+ const int fo_vert = vert_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (int j = 0; j < w; j += 16) {  // up to 16 columns per pass
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;  // most recently loaded row, carried across loop iterations
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+ const __m256i src_34a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+ for (i = 0; i < h; i += 2) {  // two output rows (i and i + 1) per iteration
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),  // same row as loaded for src_45a above
+ src6, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {  // more than 8 columns remain: also filter the high halves and store 16 bytes per row
+ const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {  // store 8 bytes per row
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {  // store 2 bytes per row
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];  // slide the window: drop the oldest row pair in each half
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else if (vert_tap == 12) { // 12-tap path: 32-bit accumulation, up to 8 columns per pass
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ const __m256i v_zero = _mm256_setzero_si256();
+ right_shift = _mm_cvtsi32_si128(FILTER_BITS);  // this path shifts by the full FILTER_BITS, not FILTER_BITS - 1
+ right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);  // 32-bit lanes here, unlike the 16-bit constant set at the top
+
+ for (int j = 0; j < w; j += 8) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src10;  // most recently loaded row, carried across loop iterations
+
+ d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
+ d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
+ d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
+
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
+
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
+
+ const __m256i src_89a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
+
+ src10 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
+ const __m256i src_910a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
+
+ const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);  // zero-extend bytes to 16 bits
+ const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
+ const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
+ const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
+ const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
+ const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
+ const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
+ const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
+ const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
+ const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
+
+ s[0] = _mm256_unpacklo_epi16(src_01, src_12);
+ s[1] = _mm256_unpacklo_epi16(src_23, src_34);
+ s[2] = _mm256_unpacklo_epi16(src_45, src_56);
+ s[3] = _mm256_unpacklo_epi16(src_67, src_78);
+ s[4] = _mm256_unpacklo_epi16(src_89, src_910);
+
+ s[6] = _mm256_unpackhi_epi16(src_01, src_12);
+ s[7] = _mm256_unpackhi_epi16(src_23, src_34);
+ s[8] = _mm256_unpackhi_epi16(src_45, src_56);
+ s[9] = _mm256_unpackhi_epi16(src_67, src_78);
+ s[10] = _mm256_unpackhi_epi16(src_89, src_910);
+
+ for (i = 0; i < h; i += 2) {  // two output rows (i and i + 1) per iteration
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_1011a = _mm256_permute2x128_si256(
+ src10,
+ _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+ 0x20);
+
+ src10 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
+
+ const __m256i src_1112a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),  // same row as loaded for src_1011a above
+ src10, 0x20);
+
+ const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
+ const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
+
+ s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
+ s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
+
+ const __m256i res_lo = convolve_12taps(s, coeffs);
+
+ const __m256i res_32b_lo = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 4) {  // more than 4 columns remain: also filter the high halves and store 8 bytes per row
+ const __m256i res_hi = convolve_12taps(s + 6, coeffs);
+
+ const __m256i res_32b_hi = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi, right_shift_const), right_shift);
+ __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 2) {  // store 4 bytes per row
+ *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
+ *(int *)&dst[i * dst_stride + j + dst_stride] =
+ _mm_cvtsi128_si32(res_1);
+ } else {  // store 2 bytes per row
+ *(uint16_t *)&dst[i * dst_stride + j] =
+ (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
+ (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];  // slide the window: drop the oldest row pair in each half
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+ }
+ }
+ } else {  // remaining tap counts: four row pairs per window (presumably the 8-tap case - confirm against convolve_lowbd)
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (int j = 0; j < w; j += 16) {  // up to 16 columns per pass
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;  // most recently loaded row, carried across loop iterations
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {  // two output rows (i and i + 1) per iteration
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),  // same row as loaded for src_67a above
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {  // more than 8 columns remain: also filter the high halves and store 16 bytes per row
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {  // store 8 bytes per row
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {  // store 2 bytes per row
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];  // slide the window: drop the oldest row pair in each half
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+// AVX2 entry point for the vertical (y) low-bitdepth convolution.
+// Looks up the tap count for |filter_params_y| / |subpel_y_q4| via
+// get_filter_tap() and forwards all arguments unchanged: 12-tap filters go to
+// the general implementation, every other tap count to the specialized one.
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t w,
+                            int32_t h,
+                            const InterpFilterParams *filter_params_y,
+                            const int32_t subpel_y_q4) {
+  if (get_filter_tap(filter_params_y, subpel_y_q4) == 12) {
+    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_q4);
+    return;
+  }
+
+  av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_y, subpel_y_q4);
+}
+
+// Loads 16 bytes from |p| into the low 128-bit lane and 16 bytes from
+// |p + hi_offset| into the high 128-bit lane. Neither load needs alignment.
+static AOM_INLINE __m256i x_sr_general_load_2x128_avx2(const uint8_t *p,
+                                                       int hi_offset) {
+  return _mm256_permute2x128_si256(
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p)),
+      _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(p + hi_offset))),
+      0x20);
+}
+
+// Loads the source window for 16 output pixels: bytes [0, 16) of |p| end up in
+// the low lane and bytes [8, 24) in the high lane. Note that the initial
+// 256-bit load reads 32 bytes starting at |p|.
+static AOM_INLINE __m256i x_sr_general_load_16px_avx2(const uint8_t *p) {
+  return _mm256_inserti128_si256(_mm256_loadu_si256((const __m256i *)p),
+                                 _mm_loadu_si128((const __m128i *)(p + 8)), 1);
+}
+
+// Applies the two rounding stages to 16-bit filter sums and packs the result
+// to unsigned 8 bit with saturation. Each lane's 8 results are duplicated in
+// both halves of that lane (the pack uses the same register for both inputs).
+static AOM_INLINE __m256i x_sr_general_round_pack_avx2(
+    __m256i sum_16b, const __m256i round_0_const, const __m128i round_0_shift,
+    const __m256i round_const, const __m128i round_shift) {
+  sum_16b = _mm256_sra_epi16(_mm256_add_epi16(sum_16b, round_0_const),
+                             round_0_shift);
+  sum_16b =
+      _mm256_sra_epi16(_mm256_add_epi16(sum_16b, round_const), round_shift);
+  return _mm256_packus_epi16(sum_16b, sum_16b);
+}
+
+// Writes the first |w| pixels of the low lane to |dst| and of the high lane to
+// |dst + dst_stride|. Stores 8 bytes when w > 4, 4 bytes when w > 2 and
+// 2 bytes otherwise.
+static AOM_INLINE void x_sr_general_store_2rows_avx2(uint8_t *dst,
+                                                     int dst_stride, int w,
+                                                     const __m256i res_8b) {
+  const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+  const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+  if (w > 4) {
+    _mm_storel_epi64((__m128i *)dst, res_0);
+    _mm_storel_epi64((__m128i *)(dst + dst_stride), res_1);
+  } else if (w > 2) {
+    xx_storel_32(dst, res_0);
+    xx_storel_32(dst + dst_stride, res_1);
+  } else {
+    // NOTE(review): 16-bit store through a cast byte pointer, kept as in the
+    // original code; no 16-bit store helper is visible from this function.
+    *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(res_0);
+    *(uint16_t *)(dst + dst_stride) = (uint16_t)_mm_cvtsi128_si32(res_1);
+  }
+}
+
+// Stores 16 packed pixels to |dst|: the low 8 bytes of each lane of |res_8b|,
+// low lane first.
+static AOM_INLINE void x_sr_general_store_16px_avx2(uint8_t *dst,
+                                                    __m256i res_8b) {
+  // 216 == 0xD8 reorders the 64-bit blocks as 0, 2, 1, 3 so that the low
+  // 128 bits hold pixels 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15.
+  res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(res_8b));
+}
+
+// 12-tap horizontal filter for four output pixels per 128-bit lane.
+// |data| holds 16 source bytes per lane. The bytes are widened to 16 bit,
+// arranged into (n, n + 1) sample pairs, filtered with convolve_12taps() into
+// 32-bit sums, rounded in two stages and packed to unsigned 8 bit. On return
+// every 4-byte group of a lane holds that lane's four output pixels.
+static AOM_INLINE __m256i x_sr_general_12tap_avx2(
+    const __m256i data, const __m256i *const coeffs,
+    const __m256i round_0_const, const __m128i round_0_shift,
+    const __m256i round_const, const __m128i round_shift) {
+  const __m256i v_zero = _mm256_setzero_si256();
+  __m256i s[6];
+
+  // Per lane: samples 0..7 and 8..F zero-extended to 16 bit.
+  const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
+  const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
+
+  // Per lane: 00 00 01 01 .. 03 03
+  const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
+  // Per lane: 04 04 .. 07 07
+  const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
+  // Per lane: 08 08 09 09 .. 0B 0B
+  const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
+  // Per lane: 0C 0C .. 0F 0F
+  const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
+
+  // Per lane: 00 01 01 02 02 03 03 04
+  s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
+  // Per lane: 02 03 03 04 04 05 05 06
+  s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
+  // Per lane: 04 05 05 06 06 07 07 08
+  s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
+  // Per lane: 06 07 07 08 08 09 09 0A
+  s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
+  // Per lane: 08 09 09 0A 0A 0B 0B 0C
+  s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
+  // Per lane: 0A 0B 0B 0C 0C 0D 0D 0E
+  s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
+
+  __m256i res_32b = convolve_12taps(s, coeffs);
+  res_32b = _mm256_sra_epi32(_mm256_add_epi32(res_32b, round_0_const),
+                             round_0_shift);
+  res_32b =
+      _mm256_sra_epi32(_mm256_add_epi32(res_32b, round_const), round_shift);
+
+  // 32 -> 16 (signed saturation) -> 8 bit (unsigned saturation).
+  const __m256i res_16b = _mm256_packs_epi32(res_32b, res_32b);
+  return _mm256_packus_epi16(res_16b, res_16b);
+}
+
+// Horizontal (x) convolution of a w x h block of 8-bit pixels from |src| into
+// |dst|, with separate code paths for 4-, 6-, 12- and (otherwise) 8-tap
+// filters as reported by get_filter_tap().
+//
+// src, src_stride   source block and its row pitch in bytes; the filter reads
+//                   fo_horiz pixels to the left of |src| and past the right
+//                   edge of the block.
+// dst, dst_stride   destination block and its row pitch in bytes.
+// w, h              block size. The paths that handle two rows per iteration
+//                   (w <= 8, or w <= 4 for 12 taps) always write both rows,
+//                   so they assume an even |h|. The wide paths write whole
+//                   groups of 16 pixels (8 for 12 taps) per step.
+// filter_params_x, subpel_x_qn
+//                   select the kernel.
+// conv_params       supplies round_0; the result is rounded first by round_0
+//                   and then by FILTER_BITS - round_0.
+//
+// The 4/6/8-tap paths accumulate in 16 bit and shift by round_0 - 1 in the
+// first stage (presumably because the lowbd coefficient helpers pre-halve the
+// taps; confirm against prepare_coeffs_lowbd). The 12-tap path accumulates in
+// 32 bit and shifts by the full round_0.
+static AOM_INLINE void av1_convolve_x_sr_general_avx2(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params) {
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
+  int i;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+  assert(conv_params->round_0 > 0);
+
+  // Rounding terms for the 16-bit paths. Computed after the asserts so that
+  // round_0 - 1 is known to be a valid shift count in checked builds.
+  const __m256i round_0_const =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+  __m256i coeffs[6], filt[4];
+  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+  if (horiz_tap == 6) {
+    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  } else if (horiz_tap == 12) {
+    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
+  } else {
+    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+  }
+
+  if (horiz_tap == 4) {
+    // 4-tap: one pixel of left context, coefficient pairs start at coeffs[1].
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+
+    if (w <= 8) {
+      // Two rows per iteration, one row per 128-bit lane.
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = x_sr_general_load_2x128_avx2(
+            &src_ptr[i * src_stride], src_stride);
+        const __m256i res_8b = x_sr_general_round_pack_avx2(
+            convolve_lowbd_x_4tap(data, coeffs + 1, filt), round_0_const,
+            round_0_shift, round_const, round_shift);
+        x_sr_general_store_2rows_avx2(&dst[i * dst_stride], dst_stride, w,
+                                      res_8b);
+      }
+    } else {
+      // One row at a time, 16 output pixels per step.
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          const __m256i data =
+              x_sr_general_load_16px_avx2(&src_ptr[(i * src_stride) + j]);
+          const __m256i res_8b = x_sr_general_round_pack_avx2(
+              convolve_lowbd_x_4tap(data, coeffs + 1, filt), round_0_const,
+              round_0_shift, round_const, round_shift);
+          x_sr_general_store_16px_avx2(&dst[i * dst_stride + j], res_8b);
+        }
+      }
+    }
+  } else if (horiz_tap == 6) {
+    const int fo_horiz = horiz_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = x_sr_general_load_2x128_avx2(
+            &src_ptr[i * src_stride], src_stride);
+        const __m256i res_8b = x_sr_general_round_pack_avx2(
+            convolve_lowbd_x_6tap(data, coeffs, filt), round_0_const,
+            round_0_shift, round_const, round_shift);
+        x_sr_general_store_2rows_avx2(&dst[i * dst_stride], dst_stride, w,
+                                      res_8b);
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          const __m256i data =
+              x_sr_general_load_16px_avx2(&src_ptr[(i * src_stride) + j]);
+          const __m256i res_8b = x_sr_general_round_pack_avx2(
+              convolve_lowbd_x_6tap(data, coeffs, filt), round_0_const,
+              round_0_shift, round_const, round_shift);
+          x_sr_general_store_16px_avx2(&dst[i * dst_stride + j], res_8b);
+        }
+      }
+    }
+  } else if (horiz_tap == 12) {
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    // 32-bit accumulators: both rounding terms are 32 bit wide and the first
+    // stage shifts by the full round_0.
+    const __m256i round_0_const_32b =
+        _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
+    const __m256i round_const_32b = _mm256_set1_epi32((1 << bits) >> 1);
+    const __m128i round_0_shift_32b = _mm_cvtsi32_si128(conv_params->round_0);
+
+    if (w <= 4) {
+      // Two rows per iteration: row i in the low lane, row i + 1 in the high.
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = x_sr_general_load_2x128_avx2(
+            &src_ptr[i * src_stride], src_stride);
+        // Low lane: 00 01 02 03 (x4), high lane: 10 11 12 13 (x4).
+        const __m256i res_8b = x_sr_general_12tap_avx2(
+            data, coeffs, round_0_const_32b, round_0_shift_32b,
+            round_const_32b, round_shift);
+        // w <= 4 here, so this stores either 4 or 2 bytes per row.
+        x_sr_general_store_2rows_avx2(&dst[i * dst_stride], dst_stride, w,
+                                      res_8b);
+      }
+    } else {
+      // Eight pixels per step: pixels j..j+3 come from the low lane, pixels
+      // j+4..j+7 from the high lane, which is loaded 4 bytes further right.
+      for (i = 0; i < h; i++) {
+        for (int j = 0; j < w; j += 8) {
+          const __m256i data = x_sr_general_load_2x128_avx2(
+              &src_ptr[i * src_stride + j], 4);
+          const __m256i res_8b = x_sr_general_12tap_avx2(
+              data, coeffs, round_0_const_32b, round_0_shift_32b,
+              round_const_32b, round_shift);
+          // xx_storel_32 instead of *(int *)&dst[...]: |dst| is a byte
+          // buffer with no 4-byte alignment guarantee.
+          xx_storel_32(&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_8b));
+          xx_storel_32(&dst[i * dst_stride + j + 4],
+                       _mm256_extracti128_si256(res_8b, 1));
+        }
+      }
+    }
+  } else {
+    // Remaining tap counts use the full 8-tap kernel.
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_horiz;
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    if (w <= 8) {
+      for (i = 0; i < h; i += 2) {
+        const __m256i data = x_sr_general_load_2x128_avx2(
+            &src_ptr[i * src_stride], src_stride);
+        const __m256i res_8b = x_sr_general_round_pack_avx2(
+            convolve_lowbd_x(data, coeffs, filt), round_0_const,
+            round_0_shift, round_const, round_shift);
+        x_sr_general_store_2rows_avx2(&dst[i * dst_stride], dst_stride, w,
+                                      res_8b);
+      }
+    } else {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 16) {
+          const __m256i data =
+              x_sr_general_load_16px_avx2(&src_ptr[(i * src_stride) + j]);
+          const __m256i res_8b = x_sr_general_round_pack_avx2(
+              convolve_lowbd_x(data, coeffs, filt), round_0_const,
+              round_0_shift, round_const, round_shift);
+          x_sr_general_store_16px_avx2(&dst[i * dst_stride + j], res_8b);
+        }
+      }
+    }
+  }
+}
+
+// AVX2 entry point for the horizontal (x) low-bitdepth convolution.
+// Looks up the tap count for |filter_params_x| / |subpel_x_q4| via
+// get_filter_tap() and forwards all arguments unchanged: 12-tap filters go to
+// the general implementation, every other tap count to the specialized one.
+void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t w,
+                            int32_t h,
+                            const InterpFilterParams *filter_params_x,
+                            const int32_t subpel_x_q4,
+                            ConvolveParams *conv_params) {
+  if (get_filter_tap(filter_params_x, subpel_x_q4) == 12) {
+    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_q4, conv_params);
+    return;
+  }
+
+  av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
+                                     filter_params_x, subpel_x_q4,
+                                     conv_params);
+}
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 0000000000..012e75c1ae
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+// Fetches the kernel selected by |subpel_q4| (masked with SUBPEL_MASK), loads
+// its first eight 16-bit taps and broadcasts each adjacent tap pair across a
+// whole register:
+//   coeffs[k] = { tap[2k], tap[2k + 1] } repeated four times, for k = 0..3.
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i taps = _mm_loadu_si128((const __m128i *)kernel);
+
+  // Every 32-bit element of |taps| is one tap pair; replicate element k into
+  // all four 32-bit positions.
+  coeffs[0] = _mm_shuffle_epi32(taps, 0x00);  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[1] = _mm_shuffle_epi32(taps, 0x55);  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[2] = _mm_shuffle_epi32(taps, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[3] = _mm_shuffle_epi32(taps, 0xff);  // coeffs 6 7 6 7 6 7 6 7
+}
+
+// Multiplies the 16-bit samples in s[0..3] by the 16-bit tap pairs in
+// coeffs[0..3], adding adjacent products (_mm_madd_epi16), and returns the
+// element-wise 32-bit sum of the four partial results.
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  __m128i sum_01 = _mm_madd_epi16(s[0], coeffs[0]);
+  __m128i sum_23 = _mm_madd_epi16(s[2], coeffs[2]);
+
+  sum_01 = _mm_add_epi32(sum_01, _mm_madd_epi16(s[1], coeffs[1]));
+  sum_23 = _mm_add_epi32(sum_23, _mm_madd_epi16(s[3], coeffs[3]));
+
+  return _mm_add_epi32(sum_01, sum_23);
+}
+
+// Zero-extends the low 8 bytes of each of s[0], s[1], s[2], s[3] to 16 bit
+// and runs convolve() on the widened vectors.
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Zero-extends the low 8 bytes of the even-indexed inputs s[0], s[2], s[4],
+// s[6] to 16 bit and runs convolve() on the widened vectors.
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Zero-extends the high 8 bytes of the even-indexed inputs s[0], s[2], s[4],
+// s[6] to 16 bit and runs convolve() on the widened vectors.
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpackhi_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+// Vertical 12-tap convolution of 8-bit pixels, SSE2.
+//
+// The block is processed in columns of 8 pixels (8 bytes are stored per step
+// regardless of |w|) and two output rows per inner iteration (both rows are
+// always written, so |h| is assumed to be even). s[k] holds source rows k and
+// k + 1 of the current window interleaved byte by byte; after each pair of
+// output rows the window slides down by two rows.
+// Results are rounded by FILTER_BITS and saturated to 8 bit.
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  int subpel_y_qn) {
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_vert * src_stride;
+  const __m128i rounding = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i shift = _mm_cvtsi32_si128(FILTER_BITS);
+  __m128i coeffs[6];
+
+  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
+
+  int j = 0;
+  do {
+    const uint8_t *const column = &src_ptr[j];
+    __m128i s[12];
+
+    // Prime the window: s[k] = rows k and k + 1 interleaved, for k = 0..9.
+    __m128i newest_row = _mm_loadl_epi64((__m128i *)column);
+    for (int k = 0; k < 10; ++k) {
+      const __m128i below =
+          _mm_loadl_epi64((__m128i *)(column + (k + 1) * src_stride));
+      s[k] = _mm_unpacklo_epi8(newest_row, below);
+      newest_row = below;
+    }
+    // |newest_row| now holds row 10 of the window.
+
+    int i = 0;
+    do {
+      const uint8_t *const data = &src_ptr[i * src_stride + j];
+      const __m128i row11 =
+          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride));
+
+      s[10] = _mm_unpacklo_epi8(newest_row, row11);
+      newest_row = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
+      s[11] = _mm_unpacklo_epi8(row11, newest_row);
+
+      // Output row i uses the window at s + 0, output row i + 1 at s + 1.
+      for (int k = 0; k < 2; ++k) {
+        // Low four and high four pixels of the column are filtered separately.
+        __m128i sum_lo = convolve_lo_y_12tap(s + k, coeffs);
+        __m128i sum_hi = convolve_hi_y_12tap(s + k, coeffs);
+
+        sum_lo = _mm_sra_epi32(_mm_add_epi32(sum_lo, rounding), shift);
+        sum_hi = _mm_sra_epi32(_mm_add_epi32(sum_hi, rounding), shift);
+
+        const __m128i packed16 = _mm_packs_epi32(sum_lo, sum_hi);
+        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j),
+                         _mm_packus_epi16(packed16, packed16));
+        i++;
+      }
+
+      // Slide the window down by two rows.
+      for (int k = 0; k < 10; ++k) {
+        s[k] = s[k + 2];
+      }
+    } while (i < h);
+    j += 8;
+  } while (j < w);
+}
+
+// Loads 4 bytes from |p| into the low 32 bits of a vector (upper bits zero).
+// Uses memcpy so that |p| needs no particular alignment.
+static INLINE __m128i y_sr_load_4x8_sse2(const uint8_t *p) {
+  int v;
+  memcpy(&v, p, sizeof(v));
+  return _mm_cvtsi32_si128(v);
+}
+
+// Vertical convolution of a w x h block of 8-bit pixels, SSE2.
+//
+// Filters with more than 8 taps are delegated: to the C implementation when
+// w < 8, otherwise to av1_convolve_y_sr_12tap_sse2(). All other filters are
+// handled here with the 8-tap kernel from prepare_coeffs():
+//   - w <= 4: one column of up to 4 pixels; 2 bytes are stored per row when
+//     w == 2 and 4 bytes otherwise. Four source bytes are read per row in
+//     either case.
+//   - w > 4: w must be a multiple of 8 (asserted); columns of 8 pixels.
+// Both paths produce two output rows per iteration without an intermediate
+// check, so |h| is assumed to be even and non-zero.
+// Results are rounded by FILTER_BITS and saturated to 8 bit.
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_y,
+                            const int subpel_y_qn) {
+  if (filter_params_y->taps > 8) {
+    if (w < 8) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn);
+    }
+    return;
+  }
+
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_vert * src_stride;
+  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+  __m128i coeffs[4];
+
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+  if (w <= 4) {
+    // The original code stored through *(uint16_t *)dst / *(int *)dst and
+    // loaded through *(int *)src; neither buffer is guaranteed to be aligned
+    // for those types, so go through memcpy instead. On x86 the first two
+    // bytes of the int are its low 16 bits, matching the former
+    // (uint16_t) truncation.
+    const size_t store_bytes = (w == 2) ? sizeof(uint16_t) : sizeof(int);
+    __m128i s[8];
+
+    // Prime the window: s[k] = rows k and k + 1 interleaved, for k = 0..5.
+    __m128i src6 = y_sr_load_4x8_sse2(src_ptr);
+    for (int k = 0; k < 6; ++k) {
+      const __m128i below = y_sr_load_4x8_sse2(src_ptr + (k + 1) * src_stride);
+      s[k] = _mm_unpacklo_epi8(src6, below);
+      src6 = below;
+    }
+    // |src6| now holds row 6 of the window.
+
+    do {
+      const __m128i src7 = y_sr_load_4x8_sse2(src_ptr + 7 * src_stride);
+      s[6] = _mm_unpacklo_epi8(src6, src7);
+      src6 = y_sr_load_4x8_sse2(src_ptr + 8 * src_stride);
+      s[7] = _mm_unpacklo_epi8(src7, src6);
+
+      // First output row uses the window at s + 0, the second at s + 1.
+      for (int k = 0; k < 2; ++k) {
+        const __m128i res = convolve_lo_y(s + k, coeffs);
+        const __m128i res_round =
+            _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+        const __m128i res16 = _mm_packs_epi32(res_round, res_round);
+        const int res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+        memcpy(dst, &res_int, store_bytes);
+
+        src_ptr += src_stride;
+        dst += dst_stride;
+      }
+
+      // Slide the window down by two rows.
+      for (int k = 0; k < 6; ++k) {
+        s[k] = s[k + 2];
+      }
+      h -= 2;
+    } while (h);
+  } else {
+    assert(!(w % 8));
+    int j = 0;
+    do {
+      const uint8_t *const column = &src_ptr[j];
+      __m128i s[8];
+
+      // Prime the window: s[k] = rows k and k + 1 interleaved, for k = 0..5.
+      __m128i src6 = _mm_loadl_epi64((__m128i *)column);
+      for (int k = 0; k < 6; ++k) {
+        const __m128i below =
+            _mm_loadl_epi64((__m128i *)(column + (k + 1) * src_stride));
+        s[k] = _mm_unpacklo_epi8(src6, below);
+        src6 = below;
+      }
+      // |src6| now holds row 6 of the window.
+
+      int i = 0;
+      do {
+        const uint8_t *const data = &src_ptr[i * src_stride + j];
+        const __m128i src7 =
+            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
+
+        s[6] = _mm_unpacklo_epi8(src6, src7);
+        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+        s[7] = _mm_unpacklo_epi8(src7, src6);
+
+        for (int k = 0; k < 2; ++k) {
+          // Low four and high four pixels are filtered separately.
+          const __m128i res_lo = convolve_lo_y(s + k, coeffs);
+          const __m128i res_hi = convolve_hi_y(s + k, coeffs);
+
+          const __m128i res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+          const __m128i res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          const __m128i res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          i++;
+        }
+
+        // Slide the window down by two rows.
+        for (int k = 0; k < 6; ++k) {
+          s[k] = s[k + 2];
+        }
+      } while (i < h);
+      j += 8;
+    } while (j < w);
+  }
+}
+
+// Horizontal 12-tap convolution of 8-bit pixels, SSE2.
+//
+// Every step loads 16 source bytes, produces four output pixels and stores
+// exactly four bytes, so each row is written in groups of four up to the
+// first multiple of four that is >= |w|. Results are rounded first by
+// conv_params->round_0 and then by FILTER_BITS - round_0, and saturated to
+// 8 bit. Both loops are do/while, i.e. |w| and |h| are assumed positive.
+void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  int subpel_x_qn,
+                                  ConvolveParams *conv_params) {
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i zero = _mm_setzero_si128();
+  // First rounding stage (by round_0).
+  const __m128i stage0_offset =
+      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+  const __m128i stage0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+  // Second rounding stage (by the remaining |bits|).
+  const __m128i stage1_offset = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i stage1_shift = _mm_cvtsi32_si128(bits);
+  __m128i coeffs[6];
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
+
+  int i = 0;
+  do {
+    const uint8_t *const src_row = src_ptr + i * src_stride;
+    uint8_t *const dst_row = dst + i * dst_stride;
+    int j = 0;
+    do {
+      const __m128i data = _mm_loadu_si128((__m128i *)(src_row + j));
+      __m128i s[4];
+
+      // s[k] interleaves, 16 bits at a time, |data| shifted right by 2k and
+      // by 2k + 1 bytes.
+      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
+      s[1] = _mm_unpacklo_epi16(_mm_srli_si128(data, 2),
+                                _mm_srli_si128(data, 3));
+      s[2] = _mm_unpacklo_epi16(_mm_srli_si128(data, 4),
+                                _mm_srli_si128(data, 5));
+      s[3] = _mm_unpacklo_epi16(_mm_srli_si128(data, 6),
+                                _mm_srli_si128(data, 7));
+
+      __m128i sum = convolve_lo_x_12tap(s, coeffs, zero);
+      sum = _mm_sra_epi32(_mm_add_epi32(sum, stage0_offset), stage0_shift);
+      sum = _mm_sra_epi32(_mm_add_epi32(sum, stage1_offset), stage1_shift);
+
+      // 32 -> 16 -> 8 bit with saturation; the four pixels end up in the low
+      // 32 bits.
+      const __m128i packed =
+          _mm_packus_epi16(_mm_packs_epi32(sum, zero), zero);
+
+      const int val = _mm_cvtsi128_si32(packed);
+      memcpy(dst_row + j, &val, sizeof(val));
+      j += 4;
+    } while (j < w);
+  } while (++i < h);
+}
+
+// Horizontal convolution of a w x h block of 8-bit pixels, SSE2.
+//
+// Filters with more than 8 taps are delegated: to the C implementation when
+// w < 4, otherwise to av1_convolve_x_sr_12tap_sse2(). All other filters are
+// handled here with the 8-tap kernel from prepare_coeffs():
+//   - w <= 4: one row per iteration; 2 bytes are stored when w == 2 and
+//     4 bytes otherwise. The loop is do { } while (--h), so |h| must be
+//     positive.
+//   - w > 4: w must be a multiple of 8 (asserted); 8 pixels per step, with
+//     even- and odd-indexed pixels filtered separately and re-interleaved.
+// Every step loads 16 source bytes. Results are rounded first by
+// conv_params->round_0 and then by FILTER_BITS - round_0, and saturated to
+// 8 bit.
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_x,
+                            const int subpel_x_qn,
+                            ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    if (w < 4) {
+      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_x, subpel_x_qn, conv_params);
+    } else {
+      av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn, conv_params);
+    }
+    return;
+  }
+
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i round_0_const =
+      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i round_shift = _mm_cvtsi32_si128(bits);
+  __m128i coeffs[4];
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+  if (w <= 4) {
+    // The original code stored through *(uint16_t *)dst / *(int *)dst, but
+    // |dst| is a byte buffer with no alignment guarantee for those types.
+    // Store through memcpy instead, as av1_convolve_x_sr_12tap_sse2() does.
+    // On x86 the first two bytes of the int are its low 16 bits, matching
+    // the former (uint16_t) truncation.
+    const size_t store_bytes = (w == 2) ? sizeof(uint16_t) : sizeof(int);
+
+    do {
+      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+      __m128i s[4];
+
+      // s[k] interleaves |data| shifted right by 2k and by 2k + 1 bytes, so
+      // that each pixel's two neighbouring samples sit next to each other.
+      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+      s[1] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+      s[2] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+      s[3] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+
+      const __m128i res_lo = convolve_lo_x(s, coeffs);
+      __m128i res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+      res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                   round_shift);
+
+      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+      const __m128i res = _mm_packus_epi16(res16, res16);
+
+      const int r = _mm_cvtsi128_si32(res);
+      memcpy(dst, &r, store_bytes);
+
+      src_ptr += src_stride;
+      dst += dst_stride;
+    } while (--h);
+  } else {
+    assert(!(w % 8));
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        __m128i s[4];
+
+        // Filter even-index pixels
+        s[0] = data;
+        s[1] = _mm_srli_si128(data, 2);
+        s[2] = _mm_srli_si128(data, 4);
+        s[3] = _mm_srli_si128(data, 6);
+        const __m128i res_even = convolve_lo_x(s, coeffs);
+
+        // Filter odd-index pixels
+        s[0] = _mm_srli_si128(data, 1);
+        s[1] = _mm_srli_si128(data, 3);
+        s[2] = _mm_srli_si128(data, 5);
+        s[3] = _mm_srli_si128(data, 7);
+        const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        __m128i res_lo_round = _mm_sra_epi32(
+            _mm_add_epi32(res_lo, round_0_const), round_0_shift);
+        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                     round_shift);
+        __m128i res_hi_round = _mm_sra_epi32(
+            _mm_add_epi32(res_hi, round_0_const), round_0_shift);
+        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                     round_shift);
+
+        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res = _mm_packus_epi16(res16, res16);
+
+        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+        j += 8;
+      } while (j < w);
+    } while (++i < h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000000..d05bb0e15f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+//------------------------------------------------------------------------------
+// filter_intra_predictor_sse4_1
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+#define DUPLICATE_FIRST_HALF 0x44
+
+// Applies the filter taps to 7 packed 8-bit pixels (the 8th tap is a padded 0,
+// so the 8th byte does not affect the sum) and stores a 4x2 block at |dst|.
+static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride,
+ const __m128i *pixels,
+ const __m128i *taps_0_1,
+ const __m128i *taps_2_3,
+ const __m128i *taps_4_5,
+ const __m128i *taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3);
+ // |output_half| contains 8 partial sums; the second hadd reduces them to 4.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+ /* arbitrary pack arg */ output);
+ xx_storel_32(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4),
+ /* arbitrary pack arg */ output);
+ xx_storel_32(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because xx_loadl_64 goes out
+// of bounds and every block involves the left column. The first block takes
+// its top-left (TL) from top_ptr[-1]; later blocks take it from |left|.
+static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_ptr,
+ const uint8_t *const left_ptr, int mode,
+ const int height) {
+ const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+ __m128i top = xx_loadl_32(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4);
+ __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ pixels = _mm_or_si128(left, pixels);
+
+ // Duplicate first 8 bytes.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows.
+ // The common code below assumes that |left| has the next TL at position 8.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ pixels = _mm_or_si128(left, pixels);
+ left = xx_loadl_64(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = xx_loadl_32(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case, we assume that the left vector has the next TL
+ // at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = xx_loadl_32(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = xx_loadl_32(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ }
+}
+
+static INLINE void filter_intra_predictor_sse4_1(void *const dest,
+ ptrdiff_t stride,
+ const void *const top_row,
+ const void *const left_column,
+ int mode, const int width,
+ const int height) {
+ const uint8_t *const top_ptr = (const uint8_t *)top_row;
+ const uint8_t *const left_ptr = (const uint8_t *)left_column;
+ uint8_t *dst = (uint8_t *)dest;
+ if (width == 4) {
+ filter_4xh(dst, stride, top_ptr, left_ptr, mode, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ const int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ const int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = xx_loadl_64(top_ptr - 1);
+ __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load row 1 of the 4x2 just written; it is the top row for the next 4x2.
+ pixels = xx_loadl_32(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final of the first 8
+ // bytes of |pixels|. This is acceptable because the 8th filter tap is always
+ // a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = xx_loadl_32(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ pixels = xx_loadl_32(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle heights that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // TL is not available by offset in these leftmost blocks: use left_ptr[y - 1].
+ pixels = xx_loadl_32(dst - stride);
+ left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = xx_loadl_32(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = xx_loadl_32(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5,
+ &taps_6_7);
+ pixels = xx_loadl_32(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF);
+ filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3,
+ &taps_4_5, &taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+// SSE4.1 entry point: maps tx_size to its width/height and forwards the call.
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ filter_intra_predictor_sse4_1(dst, stride, above, left, mode,
+ tx_size_wide[tx_size], tx_size_high[tx_size]);
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 0000000000..d65318ccfa
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ if (filter_params_x->taps == 12) { // 12-tap x filter: defer to SSSE3 version
+ av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params, bd);
+ return;
+ }
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter: two source rows per pass, stored to im_block */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_setzero_si256();
+ if (i + 1 < im_h) // the last row has no partner when im_h is odd
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter: reads im_block, produces two dst rows per pass */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) { // 8 output pixels per row
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+ res_b_round =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) { // 4 output pixels (64 bits) per row
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else { // remaining widths: 32 bits (two uint16_t pixels) per row
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32(&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1]; // slide the interleaved row window down by two rows
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 0000000000..89d7199f48
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
+ int src_stride, uint16_t *dst0,
+ int dst_stride0, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const __m128i offset_const_16b = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 8)) { // one row, 8 pixels per iteration
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_16bit =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+ if (do_average) { // blend with conv_params->dst, clip, write to dst0
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_32b_lo, offset_const);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else { // no averaging: store the offset result in conv_params->dst
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) { // two rows, 4 pixels each, per iteration
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 4) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+ const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+ if (do_average) { // blend with conv_params->dst, clip, write to dst0
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else { // no averaging: store the offset result in conv_params->dst
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_lo_round, offset_const);
+
+ if (w < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+ const __m128i comp_avg_res =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ } else {
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_hi_round, offset_const);
+
+ if (do_average) {
+ const __m128i data_lo =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_hi =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+ const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+ const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 0000000000..88974ba260
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+/* High bit-depth separable 2-D subpel convolution, SSSE3 version. The rounded
+ * and clipped result is written directly to dst.
+ *
+ * The block is processed in strips of 8 columns. For each strip a horizontal
+ * pass filters im_h = h + filter_params_y->taps - 1 source rows into the
+ * 16-bit scratch buffer im_block (8 values per row); a vertical pass then
+ * filters that buffer and writes two rows of dst per loop iteration.
+ *
+ * src, src_stride: source pixels and row stride in uint16_t elements. Reads
+ *   start fo_vert rows above and fo_horiz columns left of src (see src_ptr),
+ *   and each horizontal step loads 16 (or 24 in the 12-tap branch) pixels
+ *   starting at column j.
+ * dst, dst_stride: destination pixels and row stride in uint16_t elements.
+ * w, h: block width and height. NOTE(review): the vertical loops step i by 2
+ *   and the store paths write 8, 4 or 2 pixels per row, so h is presumably
+ *   even and w one of 2, 4 or a multiple of 8 -- confirm against callers.
+ * filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn: forwarded to
+ *   the prepare_coeffs helpers that build the coefficient registers.
+ * conv_params: supplies the round_0 and round_1 shift amounts.
+ * bd: bit depth; selects the upper clip value (1023 for 10, 4095 for 12,
+ *   255 otherwise).
+ */
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;  // rows the horizontal pass emits
+ int im_stride = 8;  // one 8-pixel strip per scratch row
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+ // Horizontal rounding term; includes an offset of 1 << (bd + FILTER_BITS - 1).
+ const __m128i round_const_x = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ /* Vertical rounding term. NOTE(review): the subtracted power of two
+  * presumably cancels the offset added in the horizontal pass -- confirm. */
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ // Shift left over from 2 * FILTER_BITS once round_0 and round_1 are applied.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ // 12-tap branch: chosen by the horizontal tap count only; y uses it as well.
+ if (filter_params_x->taps == 12) {
+ __m128i coeffs_x[6], coeffs_y[6], s[24];
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+ // Walk the block in strips of 8 columns.
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+ const __m128i row02 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+ // even pixels: s[k] starts 4 * k bytes (2 * k pixels) into the row
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+ s[4] = _mm_alignr_epi8(row02, row01, 0);
+ s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+ __m128i res_even = convolve_12tap(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels: same windows, shifted by one pixel (2 bytes)
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+ s[4] = _mm_alignr_epi8(row02, row01, 2);
+ s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+ __m128i res_odd = convolve_12tap(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ // Saturate to 16 bits, then interleave the even and odd results.
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+ }
+ }
+ // The vertical pass below consumes the strip just written to im_block.
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+ __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride));
+ __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride));
+ __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride));
+ /* s[0..5] / s[6..11]: row pairs (0,1)..(10,11), columns 0-3 / 4-7, used
+  * for output row i. s[12..17] / s[18..23]: row pairs (1,2)..(11,12), used
+  * for output row i + 1. The last entry of each group is set in the loop. */
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[4] = _mm_unpacklo_epi16(s8, s9);
+
+ s[6] = _mm_unpackhi_epi16(s0, s1);
+ s[7] = _mm_unpackhi_epi16(s2, s3);
+ s[8] = _mm_unpackhi_epi16(s4, s5);
+ s[9] = _mm_unpackhi_epi16(s6, s7);
+ s[10] = _mm_unpackhi_epi16(s8, s9);
+
+ s[12] = _mm_unpacklo_epi16(s1, s2);
+ s[13] = _mm_unpacklo_epi16(s3, s4);
+ s[14] = _mm_unpacklo_epi16(s5, s6);
+ s[15] = _mm_unpacklo_epi16(s7, s8);
+ s[16] = _mm_unpacklo_epi16(s9, s10);
+
+ s[18] = _mm_unpackhi_epi16(s1, s2);
+ s[19] = _mm_unpackhi_epi16(s3, s4);
+ s[20] = _mm_unpackhi_epi16(s5, s6);
+ s[21] = _mm_unpackhi_epi16(s7, s8);
+ s[22] = _mm_unpackhi_epi16(s9, s10);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+ // Load the two newest intermediate rows (i + 11 and i + 12).
+ __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride));
+ __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride));
+
+ s[5] = _mm_unpacklo_epi16(s10, s11);
+ s[11] = _mm_unpackhi_epi16(s10, s11);
+
+ s[17] = _mm_unpacklo_epi16(s11, s12);
+ s[23] = _mm_unpackhi_epi16(s11, s12);
+ // Columns 0-3 of output rows i (res_a0) and i + 1 (res_a1).
+ const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {  // 8 pixels per row: also filter columns 4-7
+ const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+ round_shift_bits);
+
+ const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+ round_shift_bits);
+ // Pack to 16 bits and clip to [0, clip_pixel].
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {  // 4 pixels per row (64-bit store)
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {  // 2 pixels per row, written as one 32-bit store
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+ // NOTE(review): stores through an int * into the uint16_t buffer.
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+ s[0] = s[1];  // slide every window down by one row pair (two rows)
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+
+ s[12] = s[13];
+ s[13] = s[14];
+ s[14] = s[15];
+ s[15] = s[16];
+ s[16] = s[17];
+
+ s[18] = s[19];
+ s[19] = s[20];
+ s[20] = s[21];
+ s[21] = s[22];
+ s[22] = s[23];
+
+ s10 = s12;  // row i + 12 pairs with row i + 13 on the next iteration
+ }
+ }
+ }
+ } else {  // every other tap count: four coefficient registers per filter
+ __m128i coeffs_x[4], coeffs_y[4], s[16];
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+ // Walk the block in strips of 8 columns.
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels: s[k] starts 4 * k bytes (2 * k pixels) into the row
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels: same windows, shifted by one pixel (2 bytes)
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ // Saturate to 16 bits, then interleave the even and odd results.
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+ }
+ }
+ // The vertical pass below consumes the strip just written to im_block.
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+ /* s[0..3] / s[4..7]: row pairs (0,1)..(6,7), columns 0-3 / 4-7, used for
+  * output row i. s[8..11] / s[12..15]: row pairs (1,2)..(7,8), used for
+  * output row i + 1. The last entry of each group is set in the loop. */
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+ // Load the two newest intermediate rows (i + 7 and i + 8).
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+ // Columns 0-3 of output rows i (res_a0) and i + 1 (res_a1).
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {  // 8 pixels per row: also filter columns 4-7
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits),
+ round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits),
+ round_shift_bits);
+ // Pack to 16 bits and clip to [0, clip_pixel].
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {  // 4 pixels per row (64-bit store)
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {  // 2 pixels per row, written as one 32-bit store
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+ // NOTE(review): stores through an int * into the uint16_t buffer.
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+ s[0] = s[1];  // slide every window down by one row pair (two rows)
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;  // row i + 8 pairs with row i + 9 on the next iteration
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 0000000000..cbfe5614c3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,4239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+// Note:
+// A total of 32x4 registers represents the coefficients of a 32x32 block.
+// For high bit depth, each coefficient is 4 bytes wide.
+// Each __m256i register holds 8 coefficients,
+// so each "row" needs 4 registers, and there are 32 rows in total.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
+
+// Clamps every signed 16-bit lane of `u` to the range [0, (1 << bd) - 1].
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  const __m256i pixel_max =
+      _mm256_sub_epi16(_mm256_slli_epi16(ones, bd), ones);
+
+  // Lanes greater than pixel_max (signed compare) are replaced by pixel_max.
+  const __m256i too_big = _mm256_cmpgt_epi16(u, pixel_max);
+  const __m256i kept = _mm256_andnot_si256(too_big, u);
+  const __m256i capped =
+      _mm256_or_si256(_mm256_and_si256(too_big, pixel_max), kept);
+
+  // Lanes that are not strictly positive are forced to zero.
+  const __m256i positive =
+      _mm256_cmpgt_epi16(capped, _mm256_setzero_si256());
+  return _mm256_and_si256(capped, positive);
+}
+
+// Rounds and arithmetically right-shifts the 32-bit lanes of in[0..3] by
+// `shift` bits, in place. A shift of 0 leaves the data untouched.
+static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) {
+  if (shift == 0) return;
+
+  const __m256i half = _mm256_set1_epi32(1 << (shift - 1));
+  for (int k = 0; k < 4; ++k) {
+    in[k] = _mm256_add_epi32(in[k], half);
+  }
+  for (int k = 0; k < 4; ++k) {
+    in[k] = _mm256_srai_epi32(in[k], shift);
+  }
+}
+
+// Applies round_shift_4x4_avx2() to the 16 registers in[0..15], one group of
+// four registers at a time.
+static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) {
+  for (int base = 0; base < 16; base += 4) {
+    round_shift_4x4_avx2(in + base, shift);
+  }
+}
+
+// Clamps the signed 32-bit lanes of in[] to [*clamp_lo, *clamp_hi] and writes
+// the result to out[]. Registers are handled in groups of four: for every
+// index i < size that is a multiple of 4, entries i .. i + 3 are processed.
+static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
+                                    const __m256i *clamp_lo,
+                                    const __m256i *clamp_hi, int size) {
+  for (int base = 0; base < size; base += 4) {
+    for (int k = base; k < base + 4; ++k) {
+      const __m256i floored = _mm256_max_epi32(in[k], *clamp_lo);
+      out[k] = _mm256_min_epi32(floored, *clamp_hi);
+    }
+  }
+}
+
+// Adds 32-bit residuals to 16 predictor values held as 16-bit lanes in `pred`:
+// res0 goes with the low eight lanes and res1 with the high eight. The sums
+// are packed back to 16 bits and clamped with highbd_clamp_epi16_avx2().
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+                                                 __m256i res0, __m256i res1,
+                                                 const int bd) {
+  const __m256i pred_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+  const __m256i pred_hi =
+      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+  const __m256i sum_lo = _mm256_add_epi32(res0, pred_lo);
+  const __m256i sum_hi = _mm256_add_epi32(res1, pred_hi);
+
+  // packus works per 128-bit lane; the 0xd8 permute swaps the two middle
+  // 64-bit quarters so the 16 values come out in their original order.
+  const __m256i packed =
+      _mm256_permute4x64_epi64(_mm256_packus_epi32(sum_lo, sum_hi), 0xd8);
+  return highbd_clamp_epi16_avx2(packed, bd);
+}
+
+// Reconstructs `height` rows of 16 pixels in place: row i of `output` is
+// loaded, combined with the residual registers in[j] and in[j + height] by
+// highbd_get_recon_16x8_avx2(), and stored back. With flipud set, j counts
+// down from height - 1; otherwise j equals i.
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+                                                 int stride, int flipud,
+                                                 int height, const int bd) {
+  for (int row = 0; row < height; ++row) {
+    const int res_row = flipud ? (height - 1 - row) : row;
+    uint16_t *const line = output + row * stride;
+    const __m256i pred = _mm256_loadu_si256((__m256i const *)line);
+    const __m256i recon = highbd_get_recon_16x8_avx2(
+        pred, in[res_row], in[res_row + height], bd);
+
+    _mm256_storeu_si256((__m256i *)line, recon);
+  }
+}
+// Adds the eight 32-bit residuals in `res` to the eight 32-bit predictor
+// values in `pred`, packs the sums to 16 bits and clamps them with
+// highbd_clamp_epi16_avx2(). After the 0xd8 permute the low 128 bits of the
+// return value hold the eight results in order.
+static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
+                                                const int bd) {
+  const __m256i sum = _mm256_add_epi32(res, pred);
+  const __m256i packed =
+      _mm256_permute4x64_epi64(_mm256_packus_epi32(sum, sum), 0xd8);
+  return highbd_clamp_epi16_avx2(packed, bd);
+}
+
+// Reconstructs `height` rows of 8 pixels in place: row i of `output` is
+// loaded and widened to 32 bits, combined with residual register in[j] by
+// highbd_get_recon_8x8_avx2(), and the low 128 bits of the result are stored
+// back. With flipud set, j counts down from height - 1; otherwise j equals i.
+static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
+                                                int stride, int flipud,
+                                                int height, const int bd) {
+  for (int row = 0; row < height; ++row) {
+    const int res_row = flipud ? (height - 1 - row) : row;
+    uint16_t *const line = output + row * stride;
+    const __m128i pixels = _mm_loadu_si128((__m128i const *)line);
+    const __m256i pred = _mm256_cvtepi16_epi32(pixels);
+    const __m256i recon = highbd_get_recon_8x8_avx2(pred, in[res_row], bd);
+
+    _mm_storeu_si128((__m128i *)line, _mm256_castsi256_si128(recon));
+  }
+}
+// Per 32-bit lane, with offset = (1 << shift) >> 1:
+//   *out0 = clamp((offset + in0) >> shift)
+//   *out1 = clamp((offset - in1) >> shift)
+// where >> is an arithmetic shift and clamp limits the value to
+// [*clamp_lo, *clamp_hi].
+static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+                           __m256i *out1, const __m256i *clamp_lo,
+                           const __m256i *clamp_hi, int shift) {
+  const __m256i half = _mm256_set1_epi32((1 << shift) >> 1);
+
+  __m256i plus = _mm256_add_epi32(half, in0);
+  __m256i minus = _mm256_sub_epi32(half, in1);
+  plus = _mm256_sra_epi32(plus, _mm_cvtsi32_si128(shift));
+  minus = _mm256_sra_epi32(minus, _mm_cvtsi32_si128(shift));
+
+  plus = _mm256_min_epi32(_mm256_max_epi32(plus, *clamp_lo), *clamp_hi);
+  minus = _mm256_min_epi32(_mm256_max_epi32(minus, *clamp_lo), *clamp_hi);
+
+  // Both results are complete before either output is written.
+  *out0 = plus;
+  *out1 = minus;
+}
+
+// Transposes an 8x8 block of 32-bit values: in[r] holds row r and, on return,
+// out[c] holds column c. Every input register is read before any output
+// register is written.
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
+  // Interleave 32-bit lanes of neighbouring rows (0,1), (2,3), (4,5), (6,7).
+  __m256i pair_lo[4], pair_hi[4];
+  for (int k = 0; k < 4; ++k) {
+    pair_lo[k] = _mm256_unpacklo_epi32(in[2 * k], in[2 * k + 1]);
+    pair_hi[k] = _mm256_unpackhi_epi32(in[2 * k], in[2 * k + 1]);
+  }
+
+  // Combine the pairs on 64-bit boundaries, then pick matching 128-bit halves
+  // (0x20: both low halves, 0x31: both high halves).
+  __m256i top, bottom;
+
+  top = _mm256_unpacklo_epi64(pair_lo[0], pair_lo[1]);
+  bottom = _mm256_unpacklo_epi64(pair_lo[2], pair_lo[3]);
+  out[0] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[4] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpackhi_epi64(pair_lo[0], pair_lo[1]);
+  bottom = _mm256_unpackhi_epi64(pair_lo[2], pair_lo[3]);
+  out[1] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[5] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpacklo_epi64(pair_hi[0], pair_hi[1]);
+  bottom = _mm256_unpacklo_epi64(pair_hi[2], pair_hi[3]);
+  out[2] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[6] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpackhi_epi64(pair_hi[0], pair_hi[1]);
+  bottom = _mm256_unpackhi_epi64(pair_hi[2], pair_hi[3]);
+  out[3] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[7] = _mm256_permute2f128_si256(top, bottom, 0x31);
+}
+
+// Transposes an 8x8 block of 32-bit values while reversing the row order:
+// lane k of out[c] receives lane c of in[7 - k]. Every input register is read
+// before any output register is written.
+static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
+  // Interleave 32-bit lanes of the row pairs (7,6), (5,4), (3,2), (1,0).
+  __m256i pair_lo[4], pair_hi[4];
+  for (int k = 0; k < 4; ++k) {
+    pair_lo[k] = _mm256_unpacklo_epi32(in[7 - 2 * k], in[6 - 2 * k]);
+    pair_hi[k] = _mm256_unpackhi_epi32(in[7 - 2 * k], in[6 - 2 * k]);
+  }
+
+  // Combine the pairs on 64-bit boundaries, then pick matching 128-bit halves
+  // (0x20: both low halves, 0x31: both high halves).
+  __m256i top, bottom;
+
+  top = _mm256_unpacklo_epi64(pair_lo[0], pair_lo[1]);
+  bottom = _mm256_unpacklo_epi64(pair_lo[2], pair_lo[3]);
+  out[0] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[4] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpackhi_epi64(pair_lo[0], pair_lo[1]);
+  bottom = _mm256_unpackhi_epi64(pair_lo[2], pair_lo[3]);
+  out[1] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[5] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpacklo_epi64(pair_hi[0], pair_hi[1]);
+  bottom = _mm256_unpacklo_epi64(pair_hi[2], pair_hi[3]);
+  out[2] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[6] = _mm256_permute2f128_si256(top, bottom, 0x31);
+
+  top = _mm256_unpackhi_epi64(pair_hi[0], pair_hi[1]);
+  bottom = _mm256_unpackhi_epi64(pair_hi[2], pair_hi[3]);
+  out[3] = _mm256_permute2f128_si256(top, bottom, 0x20);
+  out[7] = _mm256_permute2f128_si256(top, bottom, 0x31);
+}
+
+// Loads out_size registers of eight int32_t values each (unaligned loads):
+// out[r] is read from in + r * stride.
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           __m256i *out, int out_size) {
+  int r = 0;
+  while (r < out_size) {
+    const int32_t *const row = in + r * stride;
+    out[r] = _mm256_loadu_si256((const __m256i *)row);
+    ++r;
+  }
+}
+
+// Returns (w0 * n0 + rounding) >> bit per 32-bit lane. The product keeps the
+// low 32 bits only and the shift is arithmetic.
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+                                      const __m256i *rounding, int bit) {
+  const __m256i product = _mm256_mullo_epi32(*w0, *n0);
+  const __m256i biased = _mm256_add_epi32(product, *rounding);
+  return _mm256_srai_epi32(biased, bit);
+}
+
+// Returns (w0 * n0 + w1 * n1 + rounding) >> bit per 32-bit lane. The products
+// keep the low 32 bits only and the shift is arithmetic.
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
+                                    const __m256i *w1, const __m256i *n1,
+                                    const __m256i *rounding, int bit) {
+  const __m256i first = _mm256_mullo_epi32(*w0, *n0);
+  const __m256i second = _mm256_mullo_epi32(*w1, *n1);
+  const __m256i biased =
+      _mm256_add_epi32(_mm256_add_epi32(first, second), *rounding);
+  return _mm256_srai_epi32(biased, bit);
+}
+
+// Per 32-bit lane: *out0 = clamp(in0 + in1) and *out1 = clamp(in0 - in1),
+// where clamp limits the value to [*clamp_lo, *clamp_hi]. The inputs are
+// passed by value, so the outputs may point at the caller's source registers.
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+                        __m256i *out1, const __m256i *clamp_lo,
+                        const __m256i *clamp_hi) {
+  __m256i sum = _mm256_add_epi32(in0, in1);
+  __m256i diff = _mm256_sub_epi32(in0, in1);
+
+  sum = _mm256_min_epi32(_mm256_max_epi32(sum, *clamp_lo), *clamp_hi);
+  diff = _mm256_min_epi32(_mm256_max_epi32(diff, *clamp_lo), *clamp_hi);
+
+  // Both results are complete before either output is written.
+  *out0 = sum;
+  *out1 = diff;
+}
+
+// Updates four pairs of bf1[] in place, (17,30), (18,29), (21,26) and (22,25).
+// Each new value is a half_btf_avx2() weighted sum of the pair's two old
+// values, so both sums of a pair are formed before either entry is stored.
+static INLINE void idct32_stage4_avx2(
+    __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+    const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+    const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+    const __m256i *rounding, int bit) {
+  const __m256i new17 =
+      half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+  const __m256i new30 =
+      half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+  bf1[17] = new17;
+  bf1[30] = new30;
+
+  const __m256i new18 =
+      half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+  const __m256i new29 =
+      half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+  bf1[18] = new18;
+  bf1[29] = new29;
+
+  const __m256i new21 =
+      half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+  const __m256i new26 =
+      half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+  bf1[21] = new21;
+  bf1[26] = new26;
+
+  const __m256i new22 =
+      half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+  const __m256i new25 =
+      half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+  bf1[22] = new22;
+  bf1[25] = new25;
+}
+
+// Updates bf1[] in place. Pairs (9,14) and (10,13) are replaced by
+// half_btf_avx2() weighted sums of their old values; entries 16..31 then go
+// through clamped add/subtract steps (addsub_avx2) on eight pairs.
+static INLINE void idct32_stage5_avx2(
+    __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+    const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+    const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+  const __m256i new9 =
+      half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+  const __m256i new14 =
+      half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+  bf1[9] = new9;
+  bf1[14] = new14;
+
+  const __m256i new10 =
+      half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+  const __m256i new13 =
+      half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+  bf1[10] = new10;
+  bf1[13] = new13;
+
+  // First index of each call receives the sum, second the difference.
+  addsub_avx2(bf1[16], bf1[19], &bf1[16], &bf1[19], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[18], &bf1[17], &bf1[18], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[23], bf1[20], &bf1[23], &bf1[20], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[22], bf1[21], &bf1[22], &bf1[21], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[24], bf1[27], &bf1[24], &bf1[27], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[25], bf1[26], &bf1[25], &bf1[26], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[28], &bf1[31], &bf1[28], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[29], &bf1[30], &bf1[29], clamp_lo, clamp_hi);
+}
+
+// Updates bf1[] in place. Pair (5,6) is replaced by half_btf_avx2() weighted
+// sums of its old values, entries 8..15 go through clamped add/subtract steps
+// (addsub_avx2), and pairs (18,29), (19,28), (20,27) and (21,26) are replaced
+// by half_btf_avx2() weighted sums of their old values.
+static INLINE void idct32_stage6_avx2(
+    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+    const __m256i *rounding, int bit) {
+  const __m256i new5 =
+      half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  const __m256i new6 =
+      half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = new5;
+  bf1[6] = new6;
+
+  // First index of each call receives the sum, second the difference.
+  addsub_avx2(bf1[8], bf1[11], &bf1[8], &bf1[11], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[9], bf1[10], &bf1[9], &bf1[10], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[15], bf1[12], &bf1[15], &bf1[12], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[14], bf1[13], &bf1[14], &bf1[13], clamp_lo, clamp_hi);
+
+  const __m256i new18 =
+      half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  const __m256i new29 =
+      half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = new18;
+  bf1[29] = new29;
+
+  const __m256i new19 =
+      half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  const __m256i new28 =
+      half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = new19;
+  bf1[28] = new28;
+
+  const __m256i new20 =
+      half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  const __m256i new27 =
+      half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = new20;
+  bf1[27] = new27;
+
+  const __m256i new21 =
+      half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  const __m256i new26 =
+      half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = new21;
+  bf1[26] = new26;
+}
+
+// Stage 7 of the 32-point inverse DCT, shared by the reduced-input idct32
+// variants. Works in place on bf1:
+//  - indices 0..7 are combined as mirrored pairs (i, 7 - i);
+//  - the (10, 13) and (11, 12) pairs are rotated with +/-cospi32;
+//  - indices 16..31 are combined as pairs (16 + i, 23 - i) and
+//    (31 - i, 24 + i).
+// Indices 8, 9, 14 and 15 are not touched here.
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  // The rotations only read slots 10..13, which none of the butterflies
+  // write, so they can be evaluated first from the incoming values.
+  const __m256i rot10 =
+      half_btf_avx2(cospim32, bf1 + 10, cospi32, bf1 + 13, rounding, bit);
+  const __m256i rot13 =
+      half_btf_avx2(cospi32, bf1 + 10, cospi32, bf1 + 13, rounding, bit);
+  const __m256i rot11 =
+      half_btf_avx2(cospim32, bf1 + 11, cospi32, bf1 + 12, rounding, bit);
+  const __m256i rot12 =
+      half_btf_avx2(cospi32, bf1 + 11, cospi32, bf1 + 12, rounding, bit);
+
+  addsub_avx2(bf1[0], bf1[7], &bf1[0], &bf1[7], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[6], &bf1[1], &bf1[6], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[5], &bf1[2], &bf1[5], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[4], &bf1[3], &bf1[4], clamp_lo, clamp_hi);
+
+  bf1[10] = rot10;
+  bf1[11] = rot11;
+  bf1[12] = rot12;
+  bf1[13] = rot13;
+
+  addsub_avx2(bf1[16], bf1[23], &bf1[16], &bf1[23], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[22], &bf1[17], &bf1[22], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[18], bf1[21], &bf1[18], &bf1[21], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[19], bf1[20], &bf1[19], &bf1[20], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[24], &bf1[31], &bf1[24], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[25], &bf1[30], &bf1[25], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[29], bf1[26], &bf1[29], &bf1[26], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[28], bf1[27], &bf1[28], &bf1[27], clamp_lo, clamp_hi);
+}
+
+// Stage 8 of the 32-point inverse DCT, shared by the reduced-input idct32
+// variants. Works in place on bf1:
+//  - indices 0..15 are combined as mirrored pairs (i, 15 - i);
+//  - the (20, 27), (21, 26), (22, 25) and (23, 24) pairs are rotated with
+//    +/-cospi32.
+// Indices 16..19 and 28..31 are not touched here.
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  // The rotations only read slots 20..27, which the butterflies below never
+  // write, so they are evaluated first from the incoming values.
+  const __m256i rot20 =
+      half_btf_avx2(cospim32, bf1 + 20, cospi32, bf1 + 27, rounding, bit);
+  const __m256i rot27 =
+      half_btf_avx2(cospi32, bf1 + 20, cospi32, bf1 + 27, rounding, bit);
+  const __m256i rot21 =
+      half_btf_avx2(cospim32, bf1 + 21, cospi32, bf1 + 26, rounding, bit);
+  const __m256i rot26 =
+      half_btf_avx2(cospi32, bf1 + 21, cospi32, bf1 + 26, rounding, bit);
+  const __m256i rot22 =
+      half_btf_avx2(cospim32, bf1 + 22, cospi32, bf1 + 25, rounding, bit);
+  const __m256i rot25 =
+      half_btf_avx2(cospi32, bf1 + 22, cospi32, bf1 + 25, rounding, bit);
+  const __m256i rot23 =
+      half_btf_avx2(cospim32, bf1 + 23, cospi32, bf1 + 24, rounding, bit);
+  const __m256i rot24 =
+      half_btf_avx2(cospi32, bf1 + 23, cospi32, bf1 + 24, rounding, bit);
+
+  addsub_avx2(bf1[0], bf1[15], &bf1[0], &bf1[15], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[14], &bf1[1], &bf1[14], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[13], &bf1[2], &bf1[13], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[12], &bf1[3], &bf1[12], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[4], bf1[11], &bf1[4], &bf1[11], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[5], bf1[10], &bf1[5], &bf1[10], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[6], bf1[9], &bf1[6], &bf1[9], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[7], bf1[8], &bf1[7], &bf1[8], clamp_lo, clamp_hi);
+
+  bf1[20] = rot20;
+  bf1[21] = rot21;
+  bf1[22] = rot22;
+  bf1[23] = rot23;
+  bf1[24] = rot24;
+  bf1[25] = rot25;
+  bf1[26] = rot26;
+  bf1[27] = rot27;
+}
+
+// Final stage (9) of the 32-point inverse DCT, shared by the reduced-input
+// idct32 variants. Combines bf1 as mirrored pairs (i, 31 - i) and writes the
+// results to out[i] and out[31 - i].
+//
+// When do_cols is zero, the 32 outputs are additionally round-shifted by
+// out_shift and clamped to the range derived from AOMMAX(16, bd + 6).
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi) {
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    addsub_avx2(bf1[i], bf1[31 - i], &out[i], &out[31 - i], clamp_lo,
+                clamp_hi);
+  }
+
+  // Column pass: nothing further to do.
+  if (do_cols) return;
+
+  {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int half_range = 1 << (log_range_out - 1);
+    const __m256i clamp_lo_out = _mm256_set1_epi32(-half_range);
+    const __m256i clamp_hi_out = _mm256_set1_epi32(half_range - 1);
+
+    // NOTE(review): two calls at out and out + 16 are used for 32 vectors, so
+    // each call presumably covers 16 of them - confirm in round_shift_8x8_avx2.
+    round_shift_8x8_avx2(&out[0], out_shift);
+    round_shift_8x8_avx2(&out[16], out_shift);
+    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+  }
+}
+
+// 32-point inverse DCT variant that reads only in[0].
+//
+// in[0] is multiplied by cospi[32], rounded and shifted right by `bit`. On the
+// row pass (do_cols == 0) the value is additionally round-shifted by
+// out_shift. The result is clamped and stored to all 32 entries of out.
+//
+// Both passes clamp to the range derived from AOMMAX(16, bd + 6): the column
+// pass uses it directly, and the row pass applies it after the output shift.
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *const cos_table = cospi_arr(bit);
+  const __m256i scale = _mm256_set1_epi32(cos_table[32]);
+  const __m256i round_bias = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + 6);
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i dc;
+  int i;
+
+  // Stages 0-5 reduce to a single scaled term.
+  dc = _mm256_mullo_epi32(in[0], scale);
+  dc = _mm256_srai_epi32(_mm256_add_epi32(dc, round_bias), bit);
+
+  // Stages 6-9: only the row-pass output shift remains.
+  if (!do_cols) {
+    const __m256i half = _mm256_set1_epi32((1 << out_shift) >> 1);
+    dc = _mm256_sra_epi32(_mm256_add_epi32(half, dc),
+                          _mm_cvtsi32_si128(out_shift));
+  }
+
+  dc = _mm256_min_epi32(_mm256_max_epi32(dc, clamp_lo), clamp_hi);
+
+  for (i = 0; i < 32; ++i) {
+    out[i] = dc;
+  }
+}
+
+// 32-point inverse DCT variant that reads only in[0]..in[7].
+//
+// in        input vectors; only indices 0..7 are read.
+// out       receives the 32 output vectors (written by idct32_stage9_avx2).
+// bit       selects the cosine table via cospi_arr(bit) and is the rounding
+//           shift used by the half_btf helpers (rounding = 1 << (bit - 1)).
+// do_cols   non-zero selects the bd + 6 intermediate clamp range, zero the
+//           bd + 8 range; it is also forwarded to stage 9.
+// bd        used to derive the clamp range; forwarded to stage 9.
+// out_shift forwarded to stage 9.
+//
+// NOTE(review): the single-input half_btf_0_avx2 calls and the plain copies
+// that stand in for add/sub butterflies suggest in[8]..in[31] are assumed to
+// be zero - confirm against the caller.
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ // Broadcast cosine constants; a "cospimNN" name holds -cospi[NN].
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ // Intermediate values are clamped to a signed log_range-bit range.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ // Single working buffer, updated in place by every stage. Slots are filled
+ // progressively: all 32 are written by the end of the stage 6 setup below.
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ // Scatter the eight inputs to every fourth slot of the working buffer.
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ // In each pair the not-yet-populated slot is written first, because the
+ // second line overwrites the source slot both lines read from.
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ // Same write-new-slot-first ordering as stage 2.
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ // Plain copies where idct32_low16_avx2 runs addsub_avx2 on the same index
+ // pairs; presumably the partner operand is zero here - confirm.
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ // Copies in place of the stage-4 add/sub butterflies on slots 8..15.
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ // Shared stage-4 helper for the upper-half slots.
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ // Slots 0 and 1 both take the cospi32-scaled value of slot 0.
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ // Slots 2 and 3 are first written here, as copies of slots 1 and 0.
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ // Writes all 32 outputs; applies the output shift and clamp when
+ // do_cols is zero.
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+// 32-point inverse DCT variant that reads only in[0]..in[15].
+//
+// in        input vectors; only indices 0..15 are read.
+// out       receives the 32 output vectors (written by idct32_stage9_avx2).
+// bit       selects the cosine table via cospi_arr(bit) and is the rounding
+//           shift used by the half_btf helpers (rounding = 1 << (bit - 1)).
+// do_cols   non-zero selects the bd + 6 intermediate clamp range, zero the
+//           bd + 8 range; it is also forwarded to stage 9.
+// bd        used to derive the clamp range; forwarded to stage 9.
+// out_shift forwarded to stage 9.
+//
+// NOTE(review): the single-input half_btf_0_avx2 calls suggest in[16]..in[31]
+// are assumed to be zero - confirm against the caller.
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ // Broadcast cosine constants; a "cospimNN" name holds -cospi[NN].
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ // Intermediate values are clamped to a signed log_range-bit range.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ // Single working buffer, updated in place by every stage. Stage 1 fills the
+ // even slots; the odd slots are first written in stages 2-5 below.
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ // Scatter the sixteen inputs to the even slots of the working buffer.
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ // In each pair the not-yet-populated odd slot is written first, because the
+ // second line overwrites the even source slot both lines read from.
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ // Same write-new-slot-first ordering as stage 2, now for slots 8..15.
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ // Add/sub butterflies on adjacent pairs of slots 16..31, in place.
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ // Add/sub butterflies on adjacent pairs of slots 8..15, in place.
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ // Shared stage-4 helper for the upper-half slots.
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ // Slots 0 and 1 both take the cospi32-scaled value of slot 0; slots 2 and
+ // 3 are both derived from slot 2 (slot 3 first, before slot 2 is replaced).
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ // Writes all 32 outputs; applies the output shift and clamp when
+ // do_cols is zero.
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32], bf0[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+ }
+}
+static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {  // idct16 variant that reads only in[0]; writes one shared result to out[0..15] and overwrites in[0]
+ const int32_t *cospi = cospi_arr(bit);  // table lookup keyed by 'bit' (cospi_arr is defined outside this block)
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));  // 2^(bit-1): rounding term for the >> bit in stage 4
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // bd + 6 (column pass) or bd + 8 (row pass); AOMMAX presumably keeps it >= 16
+ __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));  // not const: the row path below swaps in the output range
+ __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0 (stages 0-3 contain no code in this variant)
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4: in[0] = (in[0] * cospi[32] + rnding) >> bit, per 32-bit lane
+ in[0] = _mm256_mullo_epi32(in[0], cospi32);
+ in[0] = _mm256_add_epi32(in[0], rnding);
+ in[0] = _mm256_srai_epi32(in[0], bit);
+
+ // stage 5 (stages 5-7 contain no butterfly code; only the final shift/clamp follows)
+ // stage 6
+ // stage 7
+ if (!do_cols) {  // row pass: round-shift by out_shift and clamp to the output range instead
+ const int log_range_out = AOMMAX(16, bd + 6);
+ clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);  // half of 2^out_shift; 0 when out_shift is 0
+ in[0] = _mm256_add_epi32(in[0], offset);
+ in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));  // arithmetic shift by a run-time count
+ }
+ in[0] = _mm256_max_epi32(in[0], clamp_lo);  // clamp to [clamp_lo, clamp_hi] on both passes
+ in[0] = _mm256_min_epi32(in[0], clamp_hi);
+ out[0] = in[0];  // every one of the 16 outputs receives the same vector
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+ }
+}
+
+static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {  // idct16 variant that reads only in[0..7] and writes out[0..15]; NOTE(review): presumably in[8..15] are zero for callers of this path - confirm
+ const int32_t *cospi = cospi_arr(bit);  // table lookup keyed by 'bit' (cospi_arr is defined outside this block)
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);  // "cospim" names hold negated table entries
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));  // 2^(bit-1): rounding term for every >> bit below
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // bd + 6 (column pass) or bd + 8 (row pass); AOMMAX presumably keeps it >= 16
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));  // intermediate clamp bounds handed to addsub_avx2
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], x, y;  // u: workspace updated in place stage by stage; x, y: temporaries
+
+ {
+ // stage 0
+ // stage 1: place the eight inputs in the even slots of u[]; the odd slots are produced in stages 2-4
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2: fill the pairs (8,15), (9,14), (10,13), (11,12) from u[8], u[14], u[10], u[12]; half_btf_0_avx2 is defined elsewhere, presumably (w * x + rnding) >> bit - confirm
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);  // reads the old u[8], so it must precede the next line
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);  // reads the old u[14] before it is overwritten
+ u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);  // reads the old u[10] before it is overwritten
+ u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);  // reads the old u[12] before it is overwritten
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3: fill (4,7) from u[4] and (5,6) from u[6], then add/sub within u[8..15]
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
+
+ addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);  // addsub_avx2 is defined elsewhere; presumably writes clamped (a + b) and (a - b) to the two pointers - confirm
+ addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4: u[0] = u[1] = (u[0] * cospi[32] + rnding) >> bit; u[2]/u[3] from u[2]; rotations on (9,14) and (10,13)
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);  // reads the old u[2] before it is overwritten
+ u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);  // new u[9] parked in x so the next line still sees the old u[9]
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);  // new u[10] parked in y so the next line still sees the old u[10]
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5: add/sub on (0,3), (1,2), (8,11), (9,10), (15,12), (14,13); cospi[32] rotation on (5,6)
+ addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[5], cospi32);  // x = u[5] * cospi[32], y = u[6] * cospi[32]
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);  // u[5] = (y - x + rnding) >> bit
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);  // u[6] = (y + x + rnding) >> bit
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6: add/sub on (0,7), (1,6), (2,5), (3,4); cospi[32] rotations on (10,13) and (11,12)
+ addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm256_mullo_epi32(u[10], cospi32);  // x = u[10] * cospi[32], y = u[13] * cospi[32]
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ u[10] = _mm256_sub_epi32(y, x);  // u[10] = (y - x + rnding) >> bit
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[13] = _mm256_add_epi32(x, y);  // u[13] = (x + y + rnding) >> bit
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);  // same rotation for the (11,12) pair
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_add_epi32(x, y);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+ // stage 7: final add/sub of u[i] with u[15 - i], written to out[i] and out[15 - i]
+ addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {  // row pass only: shift by out_shift, then clamp to the output range
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);  // defined elsewhere; presumably round-shifts all 16 output vectors - TODO confirm the count it covers
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);  // in-place clamp; the trailing 16 is presumably the vector count
+ }
+ }
+}
+
+static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {  // idct16 path that reads all of in[0..15] and writes out[0..15]; in[] is not modified here
+ const int32_t *cospi = cospi_arr(bit);  // table lookup keyed by 'bit' (cospi_arr is defined outside this block)
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);  // "cospim" names hold negated table entries
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));  // 2^(bit-1): rounding term for every >> bit below
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // bd + 6 (column pass) or bd + 8 (row pass); AOMMAX presumably keeps it >= 16
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));  // intermediate clamp bounds handed to addsub_avx2
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], v[16], x, y;  // u and v alternate as source and destination from stage to stage; x, y: temporaries
+
+ {
+ // stage 0
+ // stage 1: load the inputs in 4-bit bit-reversed index order (u[1] = in[8], u[2] = in[4], ...)
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2 (u -> v): slots 0-7 pass through; slots 8-15 are combined in pairs (8,15), (9,14), (10,13), (11,12)
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);  // half_btf_avx2 is defined elsewhere; presumably (w0 * n0 + w1 * n1 + rnding) >> bit - confirm
+ v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3 (v -> u): slots 0-3 pass through; pairs (4,7) and (5,6) are combined; add/sub within slots 8-15
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);  // addsub_avx2 is defined elsewhere; presumably writes clamped (a + b) and (a - b) to the two pointers - confirm
+ addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4 (u -> v): v[0], v[1] = ((u[0] +/- u[1]) * cospi[32] + rnding) >> bit, computed on the products x and y
+ x = _mm256_mullo_epi32(u[0], cospi32);
+ y = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(x, y);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(x, y);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];  // slots 8, 11, 12, 15 pass through; (9,14) and (10,13) are combined
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5 (v -> u): add/sub on (0,3), (1,2), (8,11), (9,10), (15,12), (14,13); cospi[32] rotation on (5,6); slots 4 and 7 pass through
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm256_mullo_epi32(v[5], cospi32);  // x = v[5] * cospi[32], y = v[6] * cospi[32]
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_sub_epi32(y, x);  // u[5] = (y - x + rnding) >> bit
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_add_epi32(y, x);  // u[6] = (y + x + rnding) >> bit
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6 (u -> v): add/sub on (0,7), (1,6), (2,5), (3,4); cospi[32] rotations on (10,13) and (11,12); slots 8, 9, 14, 15 pass through
+ addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);  // x = u[10] * cospi[32], y = u[13] * cospi[32]
+ y = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_sub_epi32(y, x);  // v[10] = (y - x + rnding) >> bit
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_add_epi32(x, y);  // v[13] = (x + y + rnding) >> bit
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi32);  // same rotation for the (11,12) pair
+ y = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_add_epi32(x, y);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7: final add/sub of v[i] with v[15 - i], written to out[i] and out[15 - i]
+ addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {  // row pass only: shift by out_shift, then clamp to the output range
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8_avx2(out, out_shift);  // defined elsewhere; presumably round-shifts all 16 output vectors - TODO confirm the count it covers
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);  // in-place clamp; the trailing 16 is presumably the vector count
+ }
+ }
+}
+
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {  // iadst16 variant that reads only in[0] and writes out[0..15]; bd and out_shift are used only on the row path
+ const int32_t *cospi = cospi_arr(bit);  // table lookup keyed by 'bit' (cospi_arr is defined outside this block)
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));  // 2^(bit-1): rounding term for every >> bit below
+ const __m256i zero = _mm256_setzero_si256();  // used to negate the cospi[2] product in stage 2
+ __m256i v[16], x, y, temp1, temp2;  // v: workspace; temp1/temp2 hold a rotated pair until both old inputs have been read
+
+ // Calculate the column 0, 1, 2, 3 (NOTE(review): wording looks inherited from a 4-lane version; each __m256i here holds eight 32-bit lanes)
+ {
+ // stage 0
+ // stage 1
+ // stage 2: v[0] = (in[0] * cospi[62] + rnding) >> bit; v[1] = (-(in[0] * cospi[2]) + rnding) >> bit
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ v[0] = _mm256_add_epi32(x, rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ v[1] = _mm256_sub_epi32(zero, x);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ // stage 3: plain copies, no add/sub (presumably because the partner terms are zero in this variant - confirm against the full iadst16)
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4: rotate (v[8], v[9]) with cospi[8] / cospi[56]
+ temp1 = _mm256_mullo_epi32(v[8], cospi8);  // temp1 = (v[8] * cospi[8] + v[9] * cospi[56] + rnding) >> bit
+ x = _mm256_mullo_epi32(v[9], cospi56);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[8], cospi56);  // temp2 = (v[8] * cospi[56] - v[9] * cospi[8] + rnding) >> bit
+ x = _mm256_mullo_epi32(v[9], cospi8);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[8] = temp1;  // written back only after both results were computed from the old v[8], v[9]
+ v[9] = temp2;
+
+ // stage 5: plain copies again (0,1 -> 4,5 and 8,9 -> 12,13)
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6: rotate (v[4], v[5]) and (v[12], v[13]) with cospi[16] / cospi[48]
+ temp1 = _mm256_mullo_epi32(v[4], cospi16);  // temp1 = (v[4] * cospi[16] + v[5] * cospi[48] + rnding) >> bit
+ x = _mm256_mullo_epi32(v[5], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[4], cospi48);  // temp2 = (v[4] * cospi[48] - v[5] * cospi[16] + rnding) >> bit
+ x = _mm256_mullo_epi32(v[5], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm256_mullo_epi32(v[12], cospi16);  // same rotation applied to (v[12], v[13])
+ x = _mm256_mullo_epi32(v[13], cospi48);
+ temp1 = _mm256_add_epi32(temp1, x);
+ temp1 = _mm256_add_epi32(temp1, rnding);
+ temp1 = _mm256_srai_epi32(temp1, bit);
+
+ temp2 = _mm256_mullo_epi32(v[12], cospi48);
+ x = _mm256_mullo_epi32(v[13], cospi16);
+ temp2 = _mm256_sub_epi32(temp2, x);
+ temp2 = _mm256_add_epi32(temp2, rnding);
+ temp2 = _mm256_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7: plain copies (0,1 -> 2,3; 4,5 -> 6,7; 8,9 -> 10,11; 12,13 -> 14,15)
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8: for (a, b) in (2,3), (6,7), (10,11), (14,15): v[a] = ((v[a] + v[b]) * cospi[32] + rnding) >> bit and v[b] likewise with the difference, computed on the products y and x
+ y = _mm256_mullo_epi32(v[2], cospi32);
+ x = _mm256_mullo_epi32(v[3], cospi32);
+ v[2] = _mm256_add_epi32(y, x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(y, x);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ y = _mm256_mullo_epi32(v[6], cospi32);
+ x = _mm256_mullo_epi32(v[7], cospi32);
+ v[6] = _mm256_add_epi32(y, x);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(y, x);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ y = _mm256_mullo_epi32(v[10], cospi32);
+ x = _mm256_mullo_epi32(v[11], cospi32);
+ v[10] = _mm256_add_epi32(y, x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(y, x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ y = _mm256_mullo_epi32(v[14], cospi32);
+ x = _mm256_mullo_epi32(v[15], cospi32);
+ v[14] = _mm256_add_epi32(y, x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(y, x);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 9: permute v[] into out[]; odd-indexed outputs are negated
+ if (do_cols) {  // column pass: permutation and negation only, no shift or clamp
+ out[0] = v[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+ } else {  // row pass: same v[] -> out[] pairing, handed to neg_shift_avx2 with the output clamp and out_shift
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,  // neg_shift_avx2 is defined elsewhere; presumably stores the shifted, clamped first value and the negated second value - confirm
+ out_shift);
+ neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m256i zero = _mm256_setzero_si256();
+ x = _mm256_mullo_epi32(in[0], cospi62);
+ u[0] = _mm256_add_epi32(x, rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ x = _mm256_mullo_epi32(in[0], cospi2);
+ u[1] = _mm256_sub_epi32(zero, x);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi54);
+ u[2] = _mm256_add_epi32(x, rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ x = _mm256_mullo_epi32(in[2], cospi10);
+ u[3] = _mm256_sub_epi32(zero, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi46);
+ u[4] = _mm256_add_epi32(x, rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(in[4], cospi18);
+ u[5] = _mm256_sub_epi32(zero, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi38);
+ u[6] = _mm256_add_epi32(x, rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(in[6], cospi26);
+ u[7] = _mm256_sub_epi32(zero, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[8] = _mm256_mullo_epi32(in[7], cospi34);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ u[9] = _mm256_mullo_epi32(in[7], cospi30);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ u[10] = _mm256_mullo_epi32(in[5], cospi42);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_mullo_epi32(in[5], cospi22);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ u[12] = _mm256_mullo_epi32(in[3], cospi50);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ u[13] = _mm256_mullo_epi32(in[3], cospi14);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ u[14] = _mm256_mullo_epi32(in[1], cospi58);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_mullo_epi32(in[1], cospi6);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm256_mullo_epi32(u[8], cospi56);
+ x = _mm256_mullo_epi32(u[9], cospi56);
+ u[8] = _mm256_mullo_epi32(u[8], cospi8);
+ u[8] = _mm256_add_epi32(u[8], x);
+ u[8] = _mm256_add_epi32(u[8], rnding);
+ u[8] = _mm256_srai_epi32(u[8], bit);
+
+ x = _mm256_mullo_epi32(u[9], cospi8);
+ u[9] = _mm256_sub_epi32(y, x);
+ u[9] = _mm256_add_epi32(u[9], rnding);
+ u[9] = _mm256_srai_epi32(u[9], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi24);
+ y = _mm256_mullo_epi32(u[10], cospi24);
+ u[10] = _mm256_mullo_epi32(u[10], cospi40);
+ u[10] = _mm256_add_epi32(u[10], x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ x = _mm256_mullo_epi32(u[11], cospi40);
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi8);
+ y = _mm256_mullo_epi32(u[12], cospi8);
+ u[12] = _mm256_mullo_epi32(u[12], cospim56);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospim56);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi40);
+ y = _mm256_mullo_epi32(u[14], cospi40);
+ u[14] = _mm256_mullo_epi32(u[14], cospim24);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim24);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u[5], cospi48);
+ y = _mm256_mullo_epi32(u[4], cospi48);
+ u[4] = _mm256_mullo_epi32(u[4], cospi16);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ x = _mm256_mullo_epi32(u[5], cospi16);
+ u[5] = _mm256_sub_epi32(y, x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospi16);
+ y = _mm256_mullo_epi32(u[6], cospi16);
+ u[6] = _mm256_mullo_epi32(u[6], cospim48);
+ u[6] = _mm256_add_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ x = _mm256_mullo_epi32(u[7], cospim48);
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi48);
+ y = _mm256_mullo_epi32(u[12], cospi48);
+ u[12] = _mm256_mullo_epi32(u[12], cospi16);
+ u[12] = _mm256_add_epi32(u[12], x);
+ u[12] = _mm256_add_epi32(u[12], rnding);
+ u[12] = _mm256_srai_epi32(u[12], bit);
+
+ x = _mm256_mullo_epi32(u[13], cospi16);
+ u[13] = _mm256_sub_epi32(y, x);
+ u[13] = _mm256_add_epi32(u[13], rnding);
+ u[13] = _mm256_srai_epi32(u[13], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospi16);
+ y = _mm256_mullo_epi32(u[14], cospi16);
+ u[14] = _mm256_mullo_epi32(u[14], cospim48);
+ u[14] = _mm256_add_epi32(u[14], x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ x = _mm256_mullo_epi32(u[15], cospim48);
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm256_mullo_epi32(u[2], cospi32);
+ x = _mm256_mullo_epi32(u[3], cospi32);
+ u[2] = _mm256_add_epi32(y, x);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ u[3] = _mm256_sub_epi32(y, x);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+ y = _mm256_mullo_epi32(u[6], cospi32);
+ x = _mm256_mullo_epi32(u[7], cospi32);
+ u[6] = _mm256_add_epi32(y, x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = _mm256_sub_epi32(y, x);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ y = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[11], cospi32);
+ u[10] = _mm256_add_epi32(y, x);
+ u[10] = _mm256_add_epi32(u[10], rnding);
+ u[10] = _mm256_srai_epi32(u[10], bit);
+
+ u[11] = _mm256_sub_epi32(y, x);
+ u[11] = _mm256_add_epi32(u[11], rnding);
+ u[11] = _mm256_srai_epi32(u[11], bit);
+
+ y = _mm256_mullo_epi32(u[14], cospi32);
+ x = _mm256_mullo_epi32(u[15], cospi32);
+ u[14] = _mm256_add_epi32(y, x);
+ u[14] = _mm256_add_epi32(u[14], rnding);
+ u[14] = _mm256_srai_epi32(u[14], bit);
+
+ u[15] = _mm256_sub_epi32(y, x);
+ u[15] = _mm256_add_epi32(u[15], rnding);
+ u[15] = _mm256_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+// 16-point inverse ADST kernel (going by the name) working on 32-bit lanes.
+//
+// in:        16 input vectors; every one is read before out is written.
+// out:       16 output vectors.
+// bit:       selects the cosine table through cospi_arr() and is the right
+//            shift applied, after adding 1 << (bit - 1), to each weighted sum.
+// do_cols:   non-zero: addsub_avx2() is given an AOMMAX(16, bd + 6)-bit
+//            signed range and stage 9 only permutes and negates.
+//            zero: addsub_avx2() is given an AOMMAX(16, bd + 8)-bit range and
+//            stage 9 goes through neg_shift_avx2() with out_shift and an
+//            AOMMAX(16, bd + 6)-bit range.
+// bd:        bit depth the ranges above are derived from.
+// out_shift: only used when do_cols is zero.
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  // out[i] comes from final-stage slot kOutOrder[i]; odd i are negated.
+  static const int kOutOrder[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
+                                     3, 11, 15, 7, 5, 13, 9,  1 };
+  const int32_t *const cospi = cospi_arr(bit);
+  // (w_a, w_b) weight pairs for the stage 4 and stage 6 butterflies.
+  const int32_t stage4_w[4][2] = { { cospi[8], cospi[56] },
+                                   { cospi[40], cospi[24] },
+                                   { -cospi[56], cospi[8] },
+                                   { -cospi[24], cospi[40] } };
+  const int32_t stage6_w[2][2] = { { cospi[16], cospi[48] },
+                                   { -cospi[48], cospi[16] } };
+  const __m256i w32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int range_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i range_min = _mm256_set1_epi32(-(1 << (range_bits - 1)));
+  const __m256i range_max = _mm256_set1_epi32((1 << (range_bits - 1)) - 1);
+  __m256i a[16], b[16];
+  int base, i;
+
+  // stage 2 (stages 0 and 1 do no arithmetic). For k = 0..7, with
+  // p = in[15 - 2k], q = in[2k], w_a = cospi[8k + 2], w_b = cospi[62 - 8k]:
+  //   a[2k]     = (p * w_a + q * w_b + rounding) >> bit
+  //   a[2k + 1] = (p * w_b - q * w_a + rounding) >> bit
+  for (i = 0; i < 8; ++i) {
+    const __m256i w_a = _mm256_set1_epi32(cospi[8 * i + 2]);
+    const __m256i w_b = _mm256_set1_epi32(cospi[62 - 8 * i]);
+    const __m256i p = in[15 - 2 * i];
+    const __m256i q = in[2 * i];
+    const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(p, w_a),
+                                         _mm256_mullo_epi32(q, w_b));
+    const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(p, w_b),
+                                          _mm256_mullo_epi32(q, w_a));
+
+    a[2 * i] = _mm256_srai_epi32(_mm256_add_epi32(sum, rounding), bit);
+    a[2 * i + 1] = _mm256_srai_epi32(_mm256_add_epi32(diff, rounding), bit);
+  }
+
+  // stage 3: pair slot i with slot i + 8.
+  for (i = 0; i < 8; ++i) {
+    addsub_avx2(a[i], a[i + 8], &b[i], &b[i + 8], &range_min, &range_max);
+  }
+
+  // stage 4: slots 0..7 pass through; slots 8..15 are combined pairwise
+  // with the same sum/difference form as stage 2.
+  for (i = 0; i < 8; ++i) {
+    a[i] = b[i];
+  }
+  for (i = 0; i < 4; ++i) {
+    const __m256i w_a = _mm256_set1_epi32(stage4_w[i][0]);
+    const __m256i w_b = _mm256_set1_epi32(stage4_w[i][1]);
+    const __m256i p = b[8 + 2 * i];
+    const __m256i q = b[9 + 2 * i];
+    const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(p, w_a),
+                                         _mm256_mullo_epi32(q, w_b));
+    const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(p, w_b),
+                                          _mm256_mullo_epi32(q, w_a));
+
+    a[8 + 2 * i] = _mm256_srai_epi32(_mm256_add_epi32(sum, rounding), bit);
+    a[9 + 2 * i] = _mm256_srai_epi32(_mm256_add_epi32(diff, rounding), bit);
+  }
+
+  // stage 5: within each half of 8, pair slot i with slot i + 4.
+  for (base = 0; base < 16; base += 8) {
+    for (i = base; i < base + 4; ++i) {
+      addsub_avx2(a[i], a[i + 4], &b[i], &b[i + 4], &range_min, &range_max);
+    }
+  }
+
+  // stage 6: in each half of 8 the first four slots pass through and the
+  // last four are combined pairwise.
+  for (base = 0; base < 16; base += 8) {
+    for (i = 0; i < 4; ++i) {
+      a[base + i] = b[base + i];
+    }
+    for (i = 0; i < 2; ++i) {
+      const __m256i w_a = _mm256_set1_epi32(stage6_w[i][0]);
+      const __m256i w_b = _mm256_set1_epi32(stage6_w[i][1]);
+      const __m256i p = b[base + 4 + 2 * i];
+      const __m256i q = b[base + 5 + 2 * i];
+      const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(p, w_a),
+                                           _mm256_mullo_epi32(q, w_b));
+      const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(p, w_b),
+                                            _mm256_mullo_epi32(q, w_a));
+
+      a[base + 4 + 2 * i] =
+          _mm256_srai_epi32(_mm256_add_epi32(sum, rounding), bit);
+      a[base + 5 + 2 * i] =
+          _mm256_srai_epi32(_mm256_add_epi32(diff, rounding), bit);
+    }
+  }
+
+  // stage 7: within each group of 4, pair slot i with slot i + 2.
+  for (base = 0; base < 16; base += 4) {
+    for (i = base; i < base + 2; ++i) {
+      addsub_avx2(a[i], a[i + 2], &b[i], &b[i + 2], &range_min, &range_max);
+    }
+  }
+
+  // stage 8: in each group of 4 the first two slots pass through; the last
+  // two become the rounded, shifted sum and difference of their cospi[32]
+  // products.
+  for (base = 0; base < 16; base += 4) {
+    const __m256i p = _mm256_mullo_epi32(b[base + 2], w32);
+    const __m256i q = _mm256_mullo_epi32(b[base + 3], w32);
+
+    a[base] = b[base];
+    a[base + 1] = b[base + 1];
+    a[base + 2] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(p, q), rounding), bit);
+    a[base + 3] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(p, q), rounding), bit);
+  }
+
+  // stage 9: permute, negating every odd output slot.
+  if (do_cols) {
+    const __m256i zero = _mm256_setzero_si256();
+
+    for (i = 0; i < 16; i += 2) {
+      out[i] = a[kOutOrder[i]];
+      out[i + 1] = _mm256_sub_epi32(zero, a[kOutOrder[i + 1]]);
+    }
+  } else {
+    const int out_bits = AOMMAX(16, bd + 6);
+    const __m256i out_min = _mm256_set1_epi32(-(1 << (out_bits - 1)));
+    const __m256i out_max = _mm256_set1_epi32((1 << (out_bits - 1)) - 1);
+
+    for (i = 0; i < 16; i += 2) {
+      neg_shift_avx2(a[kOutOrder[i]], a[kOutOrder[i + 1]], out + i,
+                     out + i + 1, &out_min, &out_max, out_shift);
+    }
+  }
+}
+// Variant of the 8-point inverse DCT kernel that reads only in[0] and stores
+// one and the same vector in out[0..7].
+//
+// bit:       selects the cosine table through cospi_arr() and is the right
+//            shift applied after adding 1 << (bit - 1).
+// do_cols:   zero: the value is additionally rounded and shifted right by
+//            out_shift before clamping. Non-zero: no extra shift.
+// bd:        bit depth the clamp range is derived from.
+// out_shift: only used when do_cols is zero.
+static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *const cospi = cospi_arr(bit);
+  // With do_cols set the clamp uses the intermediate range, otherwise the
+  // output range; both evaluate to AOMMAX(16, bd + 6) bits.
+  const int clamp_bits = AOMMAX(16, bd + 6);
+  const __m256i lower = _mm256_set1_epi32(-(1 << (clamp_bits - 1)));
+  const __m256i upper = _mm256_set1_epi32((1 << (clamp_bits - 1)) - 1);
+  __m256i dc;
+  int i;
+
+  // stages 0-3: (in[0] * cospi[32] + rounding) >> bit
+  dc = _mm256_mullo_epi32(in[0], _mm256_set1_epi32(cospi[32]));
+  dc = _mm256_add_epi32(dc, _mm256_set1_epi32(1 << (bit - 1)));
+  dc = _mm256_srai_epi32(dc, bit);
+
+  // stages 4-5
+  if (!do_cols) {
+    const __m256i half = _mm256_set1_epi32((1 << out_shift) >> 1);
+
+    dc = _mm256_sra_epi32(_mm256_add_epi32(dc, half),
+                          _mm_cvtsi32_si128(out_shift));
+  }
+  dc = _mm256_min_epi32(_mm256_max_epi32(dc, lower), upper);
+
+  for (i = 0; i < 8; ++i) {
+    out[i] = dc;
+  }
+}
+// 8-point inverse DCT kernel (going by the name) working on 32-bit lanes.
+//
+// in:        8 input vectors; every one is read before out is written.
+// out:       8 output vectors.
+// bit:       selects the cosine table through cospi_arr() and is the right
+//            shift applied, after adding 1 << (bit - 1), to each weighted sum.
+// do_cols:   non-zero: addsub_avx2() is given an AOMMAX(16, bd + 6)-bit
+//            signed range and the function ends after stage 4.
+//            zero: the range is AOMMAX(16, bd + 8) bits and stage 5 rounds
+//            and shifts out by out_shift, then clamps it to an
+//            AOMMAX(16, bd + 6)-bit range.
+// bd:        bit depth the ranges above are derived from.
+// out_shift: only used when do_cols is zero.
+static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *const cospi = cospi_arr(bit);
+  const __m256i w8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i w16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i w24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i w32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i w40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i w48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i w56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i neg_w8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i neg_w16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i neg_w40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int range_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i range_min = _mm256_set1_epi32(-(1 << (range_bits - 1)));
+  const __m256i range_max = _mm256_set1_epi32((1 << (range_bits - 1)) - 1);
+  __m256i s[8], t[8];
+  __m256i lhs, rhs;
+  int i;
+
+  // stage 2 (stages 0 and 1 do no arithmetic): weighted sums of the
+  // odd-indexed inputs. The even-indexed inputs are used directly in stage 3.
+  lhs = _mm256_mullo_epi32(in[1], w56);
+  rhs = _mm256_mullo_epi32(in[7], neg_w8);
+  s[4] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  s[4] = _mm256_srai_epi32(s[4], bit);
+
+  lhs = _mm256_mullo_epi32(in[1], w8);
+  rhs = _mm256_mullo_epi32(in[7], w56);
+  s[7] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  s[7] = _mm256_srai_epi32(s[7], bit);
+
+  lhs = _mm256_mullo_epi32(in[5], w24);
+  rhs = _mm256_mullo_epi32(in[3], neg_w40);
+  s[5] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  s[5] = _mm256_srai_epi32(s[5], bit);
+
+  lhs = _mm256_mullo_epi32(in[5], w40);
+  rhs = _mm256_mullo_epi32(in[3], w24);
+  s[6] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  s[6] = _mm256_srai_epi32(s[6], bit);
+
+  // stage 3
+  lhs = _mm256_mullo_epi32(in[0], w32);
+  rhs = _mm256_mullo_epi32(in[4], w32);
+  t[0] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  t[0] = _mm256_srai_epi32(t[0], bit);
+  t[1] = _mm256_add_epi32(_mm256_sub_epi32(lhs, rhs), rounding);
+  t[1] = _mm256_srai_epi32(t[1], bit);
+
+  lhs = _mm256_mullo_epi32(in[2], w48);
+  rhs = _mm256_mullo_epi32(in[6], neg_w16);
+  t[2] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  t[2] = _mm256_srai_epi32(t[2], bit);
+
+  lhs = _mm256_mullo_epi32(in[2], w16);
+  rhs = _mm256_mullo_epi32(in[6], w48);
+  t[3] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  t[3] = _mm256_srai_epi32(t[3], bit);
+
+  addsub_avx2(s[4], s[5], &t[4], &t[5], &range_min, &range_max);
+  addsub_avx2(s[7], s[6], &t[7], &t[6], &range_min, &range_max);
+
+  // stage 4
+  addsub_avx2(t[0], t[3], &s[0], &s[3], &range_min, &range_max);
+  addsub_avx2(t[1], t[2], &s[1], &s[2], &range_min, &range_max);
+  s[4] = t[4];
+  s[7] = t[7];
+
+  lhs = _mm256_mullo_epi32(t[6], w32);
+  rhs = _mm256_mullo_epi32(t[5], w32);
+  s[6] = _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding);
+  s[6] = _mm256_srai_epi32(s[6], bit);
+  s[5] = _mm256_add_epi32(_mm256_sub_epi32(lhs, rhs), rounding);
+  s[5] = _mm256_srai_epi32(s[5], bit);
+
+  // Slot i is paired with slot 7 - i; results land in out[i] and out[7 - i].
+  for (i = 0; i < 4; ++i) {
+    addsub_avx2(s[i], s[7 - i], out + i, out + 7 - i, &range_min, &range_max);
+  }
+
+  // stage 5: only when do_cols is zero.
+  if (do_cols) return;
+
+  {
+    const int out_bits = AOMMAX(16, bd + 6);
+    const __m256i out_min = _mm256_set1_epi32(-(1 << (out_bits - 1)));
+    const __m256i out_max = _mm256_set1_epi32((1 << (out_bits - 1)) - 1);
+
+    round_shift_4x4_avx2(out, out_shift);
+    round_shift_4x4_avx2(out + 4, out_shift);
+    highbd_clamp_epi32_avx2(out, out, &out_min, &out_max, 8);
+  }
+}
+// Variant of the 8-point inverse ADST kernel that reads only in[0].
+//
+// in:        only in[0] is read, and it is read before out is written.
+// out:       8 output vectors.
+// bit:       selects the cosine table through cospi_arr() and is the right
+//            shift applied, after adding 1 << (bit - 1), to each weighted sum.
+// do_cols:   non-zero: stage 7 only permutes and negates.
+//            zero: stage 7 goes through neg_shift_avx2() with out_shift and
+//            an AOMMAX(16, bd + 6)-bit signed range.
+// bd:        bit depth that range is derived from.
+// out_shift: only used when do_cols is zero.
+static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+  // out[i] comes from slot kOutOrder[i]; odd i are negated.
+  static const int kOutOrder[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };
+  const int32_t *const cospi = cospi_arr(bit);
+  const __m256i w16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i w32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i w48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i s[8];
+  __m256i lhs, rhs;
+  int base, i;
+
+  // stage 2 (stages 0 and 1 do no arithmetic)
+  lhs = _mm256_mullo_epi32(in[0], _mm256_set1_epi32(cospi[60]));
+  s[0] = _mm256_srai_epi32(_mm256_add_epi32(lhs, rounding), bit);
+
+  lhs = _mm256_mullo_epi32(in[0], _mm256_set1_epi32(cospi[4]));
+  s[1] = _mm256_add_epi32(_mm256_sub_epi32(zero, lhs), rounding);
+  s[1] = _mm256_srai_epi32(s[1], bit);
+
+  // stage 4 (stage 3 does no arithmetic)
+  s[4] = _mm256_add_epi32(_mm256_mullo_epi32(s[0], w16),
+                          _mm256_mullo_epi32(s[1], w48));
+  s[4] = _mm256_srai_epi32(_mm256_add_epi32(s[4], rounding), bit);
+
+  s[5] = _mm256_sub_epi32(_mm256_mullo_epi32(s[0], w48),
+                          _mm256_mullo_epi32(s[1], w16));
+  s[5] = _mm256_srai_epi32(_mm256_add_epi32(s[5], rounding), bit);
+
+  // stage 6 (stage 5 does no arithmetic): slots base + 2 and base + 3 are the
+  // rounded, shifted sum and difference of the cospi[32] products of slots
+  // base and base + 1.
+  for (base = 0; base < 8; base += 4) {
+    lhs = _mm256_mullo_epi32(s[base], w32);
+    rhs = _mm256_mullo_epi32(s[base + 1], w32);
+    s[base + 2] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(lhs, rhs), rounding), bit);
+    s[base + 3] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(lhs, rhs), rounding), bit);
+  }
+
+  // stage 7: permute, negating every odd output slot.
+  if (do_cols) {
+    for (i = 0; i < 8; i += 2) {
+      out[i] = s[kOutOrder[i]];
+      out[i + 1] = _mm256_sub_epi32(zero, s[kOutOrder[i + 1]]);
+    }
+  } else {
+    const int out_bits = AOMMAX(16, bd + 6);
+    const __m256i out_min = _mm256_set1_epi32(-(1 << (out_bits - 1)));
+    const __m256i out_max = _mm256_set1_epi32((1 << (out_bits - 1)) - 1);
+
+    for (i = 0; i < 8; i += 2) {
+      neg_shift_avx2(s[kOutOrder[i]], s[kOutOrder[i + 1]], out + i,
+                     out + i + 1, &out_min, &out_max, out_shift);
+    }
+  }
+}
+
+// 8-point inverse ADST kernel (going by the name) working on 32-bit lanes.
+//
+// in:        8 input vectors; every one is read before out is written.
+// out:       8 output vectors.
+// bit:       selects the cosine table through cospi_arr() and is the right
+//            shift applied, after adding 1 << (bit - 1), to each weighted sum.
+// do_cols:   non-zero: addsub_avx2() is given an AOMMAX(16, bd + 6)-bit
+//            signed range and stage 7 only permutes and negates.
+//            zero: addsub_avx2() is given an AOMMAX(16, bd + 8)-bit range and
+//            stage 7 goes through neg_shift_avx2() with out_shift and an
+//            AOMMAX(16, bd + 6)-bit range.
+// bd:        bit depth the ranges above are derived from.
+// out_shift: only used when do_cols is zero.
+static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                          int bd, int out_shift) {
+  // out[i] comes from final-stage slot kOutOrder[i]; odd i are negated.
+  static const int kOutOrder[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };
+  const int32_t *const cospi = cospi_arr(bit);
+  // (w_a, w_b) weight pairs for the two stage 4 butterflies.
+  const int32_t stage4_w[2][2] = { { cospi[16], cospi[48] },
+                                   { -cospi[48], cospi[16] } };
+  const __m256i w32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  const int range_bits = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i range_min = _mm256_set1_epi32(-(1 << (range_bits - 1)));
+  const __m256i range_max = _mm256_set1_epi32((1 << (range_bits - 1)) - 1);
+  __m256i a[8], b[8];
+  int base, i;
+
+  // stage 2 (stages 0 and 1 do no arithmetic). For k = 0..3, with
+  // p = in[7 - 2k], q = in[2k], w_a = cospi[16k + 4], w_b = cospi[60 - 16k]:
+  //   a[2k]     = (p * w_a + q * w_b + rounding) >> bit
+  //   a[2k + 1] = (p * w_b - q * w_a + rounding) >> bit
+  for (i = 0; i < 4; ++i) {
+    const __m256i w_a = _mm256_set1_epi32(cospi[16 * i + 4]);
+    const __m256i w_b = _mm256_set1_epi32(cospi[60 - 16 * i]);
+    const __m256i p = in[7 - 2 * i];
+    const __m256i q = in[2 * i];
+    const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(p, w_a),
+                                         _mm256_mullo_epi32(q, w_b));
+    const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(p, w_b),
+                                          _mm256_mullo_epi32(q, w_a));
+
+    a[2 * i] = _mm256_srai_epi32(_mm256_add_epi32(sum, rounding), bit);
+    a[2 * i + 1] = _mm256_srai_epi32(_mm256_add_epi32(diff, rounding), bit);
+  }
+
+  // stage 3: pair slot i with slot i + 4.
+  for (i = 0; i < 4; ++i) {
+    addsub_avx2(a[i], a[i + 4], &b[i], &b[i + 4], &range_min, &range_max);
+  }
+
+  // stage 4: slots 0..3 pass through; slots 4..7 are combined pairwise with
+  // the same sum/difference form as stage 2.
+  for (i = 0; i < 4; ++i) {
+    a[i] = b[i];
+  }
+  for (i = 0; i < 2; ++i) {
+    const __m256i w_a = _mm256_set1_epi32(stage4_w[i][0]);
+    const __m256i w_b = _mm256_set1_epi32(stage4_w[i][1]);
+    const __m256i p = b[4 + 2 * i];
+    const __m256i q = b[5 + 2 * i];
+    const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(p, w_a),
+                                         _mm256_mullo_epi32(q, w_b));
+    const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(p, w_b),
+                                          _mm256_mullo_epi32(q, w_a));
+
+    a[4 + 2 * i] = _mm256_srai_epi32(_mm256_add_epi32(sum, rounding), bit);
+    a[5 + 2 * i] = _mm256_srai_epi32(_mm256_add_epi32(diff, rounding), bit);
+  }
+
+  // stage 5: within each group of 4, pair slot i with slot i + 2.
+  for (base = 0; base < 8; base += 4) {
+    for (i = base; i < base + 2; ++i) {
+      addsub_avx2(a[i], a[i + 2], &b[i], &b[i + 2], &range_min, &range_max);
+    }
+  }
+
+  // stage 6: in each group of 4 the first two slots pass through; the last
+  // two become the rounded, shifted sum and difference of their cospi[32]
+  // products.
+  for (base = 0; base < 8; base += 4) {
+    const __m256i p = _mm256_mullo_epi32(b[base + 2], w32);
+    const __m256i q = _mm256_mullo_epi32(b[base + 3], w32);
+
+    a[base] = b[base];
+    a[base + 1] = b[base + 1];
+    a[base + 2] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(p, q), rounding), bit);
+    a[base + 3] = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(p, q), rounding), bit);
+  }
+
+  // stage 7: permute, negating every odd output slot.
+  if (do_cols) {
+    for (i = 0; i < 8; i += 2) {
+      out[i] = a[kOutOrder[i]];
+      out[i + 1] = _mm256_sub_epi32(zero, a[kOutOrder[i + 1]]);
+    }
+  } else {
+    const int out_bits = AOMMAX(16, bd + 6);
+    const __m256i out_min = _mm256_set1_epi32(-(1 << (out_bits - 1)));
+    const __m256i out_max = _mm256_set1_epi32((1 << (out_bits - 1)) - 1);
+
+    for (i = 0; i < 8; i += 2) {
+      neg_shift_avx2(a[kOutOrder[i]], a[kOutOrder[i + 1]], out + i,
+                     out + i + 1, &out_min, &out_max, out_shift);
+    }
+  }
+}
+// Stage 8 of the 64-point inverse DCT (going by the name), applied in place
+// to the working array u.
+//
+// Slots touched: 10..13 (half_btf_avx2 pairs), 16..31 (addsub_avx2 pairs)
+// and 36..43 with 52..59 (half_btf_avx2 pairs). The weight vectors, clamp
+// bounds, rounding vector and bit are forwarded unchanged to the helpers.
+// NOTE(review): half_btf_avx2() is defined elsewhere; presumably it returns
+// (w0 * a + w1 * b + rnding) >> bit and reads nothing but its arguments.
+static INLINE void idct64_stage8_avx2(
+    __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
+    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+    const __m256i *rnding, int bit) {
+  __m256i new_lo;
+  int k;
+
+  // Pairs (10, 13) and (11, 12). The new low slot is held back until the
+  // high slot has been computed from the old values.
+  for (k = 0; k < 2; ++k) {
+    const int lo = 10 + k;
+    const int hi = 13 - k;
+
+    new_lo = half_btf_avx2(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_avx2(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Pairs (16 + k, 23 - k) and (31 - k, 24 + k).
+  for (k = 0; k < 4; ++k) {
+    addsub_avx2(u[16 + k], u[23 - k], &u[16 + k], &u[23 - k], clamp_lo,
+                clamp_hi);
+    addsub_avx2(u[31 - k], u[24 + k], &u[31 - k], &u[24 + k], clamp_lo,
+                clamp_hi);
+  }
+
+  // Pairs (36 + k, 59 - k).
+  for (k = 0; k < 4; ++k) {
+    const int lo = 36 + k;
+    const int hi = 59 - k;
+
+    new_lo = half_btf_avx2(cospim16, &u[lo], cospi48, &u[hi], rnding, bit);
+    u[hi] = half_btf_avx2(cospi48, &u[lo], cospi16, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Pairs (40 + k, 55 - k).
+  for (k = 0; k < 4; ++k) {
+    const int lo = 40 + k;
+    const int hi = 55 - k;
+
+    new_lo = half_btf_avx2(cospim48, &u[lo], cospim16, &u[hi], rnding, bit);
+    u[hi] = half_btf_avx2(cospim16, &u[lo], cospi48, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+}
+
+// Stage 9 of the 64-point inverse DCT (going by the name), applied in place
+// to the working array u.
+//
+// Slots touched: 0..15 (addsub_avx2 pairs), 20..27 (half_btf_avx2 pairs),
+// 32..47 and 48..63 (addsub_avx2 pairs). The weight vectors, clamp bounds,
+// rounding vector and bit are forwarded unchanged to the helpers.
+// NOTE(review): half_btf_avx2() is defined elsewhere; presumably it returns
+// (w0 * a + w1 * b + rnding) >> bit and reads nothing but its arguments.
+static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rnding, int bit) {
+  __m256i new_lo;
+  int k;
+
+  // Pairs (k, 15 - k).
+  for (k = 0; k < 8; ++k) {
+    addsub_avx2(u[k], u[15 - k], &u[k], &u[15 - k], clamp_lo, clamp_hi);
+  }
+
+  // Pairs (20 + k, 27 - k). The new low slot is held back until the high
+  // slot has been computed from the old values.
+  for (k = 0; k < 4; ++k) {
+    const int lo = 20 + k;
+    const int hi = 27 - k;
+
+    new_lo = half_btf_avx2(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_avx2(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Pairs (32 + k, 47 - k): the lower slot is the first operand.
+  for (k = 0; k < 8; ++k) {
+    addsub_avx2(u[32 + k], u[47 - k], &u[32 + k], &u[47 - k], clamp_lo,
+                clamp_hi);
+  }
+
+  // Pairs (63 - k, 48 + k): the upper slot is the first operand.
+  for (k = 0; k < 8; ++k) {
+    addsub_avx2(u[63 - k], u[48 + k], &u[63 - k], &u[48 + k], clamp_lo,
+                clamp_hi);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT, applied in place to the work array u.
+//   u                  - 64-entry work array; entries 0..31 and 40..55 are
+//                        updated.
+//   cospim32, cospi32  - broadcast weights passed to half_btf_avx2.
+//   clamp_lo, clamp_hi - clamp bounds passed to addsub_avx2.
+//   rnding, bit        - rounding constant and shift passed to half_btf_avx2.
+static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
+                                       const __m256i *cospi32,
+                                       const __m256i *clamp_lo,
+                                       const __m256i *clamp_hi,
+                                       const __m256i *rnding, int bit) {
+  // Pair u[0..15] with the mirrored entries u[31..16].
+  for (int lo = 0, hi = 31; lo < hi; ++lo, --hi) {
+    addsub_avx2(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // Rotate the eight mirrored pairs (40,55) ... (47,48). The pairs are
+  // independent of each other, so each one is finished before the next; the
+  // low result waits in a local until the high entry has been recomputed.
+  for (int lo = 40, hi = 55; lo < hi; ++lo, --hi) {
+    const __m256i new_lo =
+        half_btf_avx2(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_avx2(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+}
+
+// Final stage of the 64-point inverse DCT: combines the work array u into
+// the 64 output vectors and, when do_cols is zero, rounds and clamps them.
+//   u                  - 64-entry work array (read only here).
+//   out                - receives the 64 output vectors.
+//   do_cols            - nonzero: stop after the add/sub pass; zero: also
+//                        apply the round shift and the output clamp.
+//   bd                 - used to derive the output clamp range,
+//                        AOMMAX(16, bd + 6) bits.
+//   out_shift          - shift forwarded to round_shift_8x8_avx2.
+//   clamp_lo, clamp_hi - clamp bounds passed to addsub_avx2.
+static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
+                                       int bd, int out_shift,
+                                       const __m256i *clamp_lo,
+                                       const __m256i *clamp_hi) {
+  // out[k] and out[63 - k] are produced from u[k] and u[63 - k].
+  for (int lo = 0, hi = 63; lo < hi; ++lo, --hi) {
+    addsub_avx2(u[lo], u[hi], &out[lo], &out[hi], clamp_lo, clamp_hi);
+  }
+
+  if (do_cols) return;
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const int out_limit = 1 << (log_range_out - 1);
+  const __m256i clamp_lo_out = _mm256_set1_epi32(-out_limit);
+  const __m256i clamp_hi_out = _mm256_set1_epi32(out_limit - 1);
+
+  // Round-shift the output in four chunks of 16 vectors, then clamp all 64.
+  for (int offset = 0; offset < 64; offset += 16) {
+    round_shift_8x8_avx2(out + offset, out_shift);
+  }
+  highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+}
+
+// 64-point inverse DCT for the case where only in[0] is read: that single
+// vector is scaled once and the result is replicated into all 64 outputs.
+//   in        - input vectors; only in[0] is accessed.
+//   out       - receives 64 identical result vectors.
+//   bit       - selects the cosine table via cospi_arr() and sets the
+//               rounding constant 1 << (bit - 1).
+//   do_cols   - nonzero: clamp only; zero: apply out_shift rounding first and
+//               clamp to the AOMMAX(16, bd + 6)-bit output range.
+//   bd        - used to derive the clamp range.
+//   out_shift - right shift (with rounding) applied when do_cols is zero and
+//               the shift is nonzero.
+static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+
+  // The only arithmetic in the transform stages is one weighting of in[0].
+  __m256i dc = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
+
+  if (!do_cols) {
+    // The row pass clamps to the output range instead of the intermediate one.
+    log_range = AOMMAX(16, bd + 6);
+    if (out_shift != 0) {
+      const __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+      dc = _mm256_sra_epi32(_mm256_add_epi32(dc, offset),
+                            _mm_cvtsi32_si128(out_shift));
+    }
+  }
+
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  dc = _mm256_min_epi32(_mm256_max_epi32(dc, clamp_lo), clamp_hi);
+
+  for (int k = 0; k < 64; ++k) {
+    out[k] = dc;
+  }
+}
+// 64-point inverse DCT kernel that reads only in[0]..in[7] (stage 1 loads
+// exactly those eight vectors); the name suggests the remaining coefficients
+// are assumed to be zero -- confirm against the caller.
+//   in        - input vectors; only in[0..7] are accessed.
+//   out       - receives the 64 result vectors written by
+//               idct64_stage11_avx2.
+//   bit       - selects the cosine table via cospi_arr() and sets the
+//               rounding constant 1 << (bit - 1).
+//   do_cols   - nonzero selects a bd + 6 bit intermediate clamp range, zero
+//               selects bd + 8 (both with a 16-bit minimum); also forwarded
+//               to stage 11.
+//   bd        - presumably the sample bit depth; used for the clamp range.
+//   out_shift - forwarded to idct64_stage11_avx2.
+static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ // Broadcast weights: cospiN holds cospi[N], cospimN holds -cospi[N].
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+
+ {
+ // Work array; entries are filled in progressively by the stages below.
+ __m256i u[64];
+
+ // stage 1
+ // Scatter the eight inputs to every eighth slot of the work array.
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ // Each loaded slot produces two entries from single-input calls. The
+ // second call of each pair overwrites its own source slot, so the order
+ // within a pair must not be swapped.
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ // Same source-overwrite ordering as stage 2, now for u[16] and u[24].
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ // Plain copies of the stage-2 results into the neighbouring slots.
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m256i temp1, temp2;
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ // Two-input rotations: the first result is held in a temporary so that the
+ // second call still reads both original entries of the pair.
+ temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ // Plain copies that fill the remaining slots of u[32..63].
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ // Both calls receive identical arguments; u[0] and u[1] get the same
+ // weighting of the original u[0].
+ temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ // Plain copies that fill the remaining slots of u[16..31].
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ // For i = 32 and i = 48, j walks the first four slots of the 16-entry
+ // group. j ^ 7 is the mirrored slot in the first eight entries, and
+ // (j ^ 15, j ^ 8) is the corresponding pair in the second eight.
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ // u[4..7] are not computed separately here; they mirror u[3..0].
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ // Writes all 64 entries of out from u.
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+// 64-point inverse DCT kernel that reads only in[0]..in[15] (stage 1 loads
+// exactly those sixteen vectors); the name suggests the remaining
+// coefficients are assumed to be zero -- confirm against the caller.
+//   in        - input vectors; only in[0..15] are accessed.
+//   out       - receives the 64 result vectors written by
+//               idct64_stage11_avx2.
+//   bit       - selects the cosine table via cospi_arr() and sets the
+//               rounding constant 1 << (bit - 1).
+//   do_cols   - nonzero selects a bd + 6 bit intermediate clamp range, zero
+//               selects bd + 8 (both with a 16-bit minimum); also forwarded
+//               to stage 11.
+//   bd        - presumably the sample bit depth; used for the clamp range.
+//   out_shift - forwarded to idct64_stage11_avx2.
+static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ // Broadcast weights: cospiN holds cospi[N].
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ // Negated weights: cospimN holds -cospi[N].
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ // Work array; entries are filled in progressively by the stages below.
+ __m256i u[64];
+ __m256i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ // Scatter the sixteen inputs to every fourth slot of the work array.
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ // Each loaded slot produces two entries from single-input calls. The
+ // second call of each pair overwrites its own source slot, so the order
+ // within a pair must not be swapped.
+ u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ // Same source-overwrite ordering as stage 2, now for u[16..28].
+ u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
+ // Plain copies of the stage-2 results into the neighbouring slots.
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ // Two-input rotations on four pairs at a time: the low-index results are
+ // held in tmp1..tmp4 until the high-index entries have been recomputed
+ // from the still-unmodified inputs.
+ tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ // For each 8-entry group of u[32..63]: pair (0,3) and (1,2) in the first
+ // half, and (7,4) and (6,5) in the second half with the higher index as
+ // the first operand.
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ // Both calls receive identical arguments; u[0] and u[1] get the same
+ // weighting of the original u[0].
+ tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ // Same 8-entry pairing pattern as in stage 5, applied to u[16..31].
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ // For i = 32 and i = 48, j walks the first four slots of the 16-entry
+ // group. j ^ 7 is the mirrored slot in the first eight entries, and
+ // (j ^ 15, j ^ 8) is the corresponding pair in the second eight.
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ // Pair u[0..3] with the mirrored entries u[7..4].
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ // Writes all 64 entries of out from u.
+ idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
+
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
+ const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
+ const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
+
+ {
+ __m256i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+
+ addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ for (i = 0; i < 32; i++) {
+ addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out =
+ _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m256i clamp_hi_out =
+ _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_8x8_avx2(out, out_shift);
+ round_shift_8x8_avx2(out + 16, out_shift);
+ round_shift_8x8_avx2(out + 32, out_shift);
+ round_shift_8x8_avx2(out + 48, out_shift);
+ highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
+ }
+ }
+}
+// Common signature of the 1-D inverse transform kernels stored in the table
+// below: `in`/`out` are arrays of __m256i vectors, `bit` is the cosine
+// precision, `do_cols` distinguishes the column pass from the row pass, and
+// `out_shift` is the rounding shift a kernel applies when !do_cols.
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+// Kernel lookup table, indexed as [size index][1-D transform type][variant].
+// highbd_inv_txfm2d_add_no_identity_avx2() indexes it with
+// get_txw_idx()/get_txh_idx(), hitx_1d_tab[]/vitx_1d_tab[] and
+// lowbd_txfm_all_1d_zeros_idx[eobx or eoby] respectively.
+// Judging by the kernel names, the variants are ordered from "fewest non-zero
+// inputs" (low1) to the full transform -- TODO confirm against the kernels.
+// NULL marks a combination with no AVX2 kernel; the caller asserts that the
+// entry it selects is non-NULL.
+static const transform_1d_avx2
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ {
+ { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
+ { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ {
+ { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
+ { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+
+ { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 2-D inverse transform for the transform types that have no identity
+// component. Runs the row kernel on the non-zero part of the coefficients,
+// transposes the result into buf1, runs the column kernel on every 8-wide
+// strip of buf1, and hands the strips to the highbd_write_buffer_*_avx2
+// helpers together with `output`, `stride` and `bd`.
+//
+// input   : coefficient buffer, read with a row stride of
+//           AOMMIN(32, txfm_size_row).
+// output  : destination pixels (already converted to uint16_t *).
+// eob     : end-of-block position; used to derive eobx/eoby, which select
+//           the kernel variants and limit how much input is processed.
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ // (eobx + 1) rounded up to a multiple of 8.
+ const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
+ // Number of 8-row groups needed to cover rows 0..eoby.
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ // The table holds NULL for combinations without an AVX2 kernel.
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[64];
+ load_buffer_32bit_input(input + i * 8, input_stride, buf0,
+ buf_size_nonzero_w);
+ // Extra NewInvSqrt2 scaling when the width/height log ratio is +/-1.
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
+ NewInvSqrt2);
+ }
+ // In place, with do_cols == 0 and the first-stage shift.
+ row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
+
+ // Transpose each 8x8 tile into buf1; with lr_flip the tiles are written
+ // in mirrored order using the flipping transpose.
+ __m256i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_flip_avx2(
+ &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ // In place on one 8-wide strip, with do_cols == 1 and no kernel shift;
+ // the second-stage shift is applied separately right after.
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
+ bd, 0);
+
+ round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ // Widths >= 16 are written 16 pixels (two strips) at a time, width 8 as a
+ // single strip. Widths below 8 match neither branch, so nothing is written
+ // for them here.
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ } else if (txfm_size_col == 8) {
+ highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
+ bd);
+ }
+}
+
+// Dispatches a high-bitdepth 2-D inverse transform by transform type.
+// Types without an identity component run the AVX2 path in this file; the
+// identity and 1-D-only types are forwarded to the SSE4.1 implementation.
+// Any other tx_type value trips assert(0) and is otherwise ignored.
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
+  switch (tx_type) {
+    // Identity in at least one direction: no AVX2 kernel, use SSE4.1.
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+                                                tx_size, eob, bd);
+      return;
+
+    // DCT/ADST/FLIPADST in both directions: AVX2 path.
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST: {
+      uint16_t *const output16 = CONVERT_TO_SHORTPTR(output);
+      highbd_inv_txfm2d_add_no_identity_avx2(input, output16, stride, tx_type,
+                                             tx_size, eob, bd);
+      return;
+    }
+
+    default: assert(0); return;
+  }
+}
+// Entry point for the high-bitdepth inverse transform + add.
+// The five block sizes with a 4-pixel dimension are handed to their
+// dedicated SSE4.1 routines; every other size goes through
+// av1_highbd_inv_txfm2d_add_universe_avx2().
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+
+  switch (txfm_param->tx_size) {
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_avx2(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
+      return;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000000..4ff6a90f95
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,5830 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+// Clamps eight 16-bit lanes of `u` to [0, (1 << bd) - 1].
+// Both compares are signed, so a lane whose top bit is set counts as
+// negative and comes out as 0.
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+  const __m128i all_zero = _mm_setzero_si128();
+  const __m128i all_one = _mm_set1_epi16(1);
+  const __m128i pixel_max =
+      _mm_sub_epi16(_mm_slli_epi16(all_one, bd), all_one);
+
+  // Upper bound: lanes above pixel_max take pixel_max, the rest keep u.
+  const __m128i too_big = _mm_cmpgt_epi16(u, pixel_max);
+  const __m128i kept = _mm_andnot_si128(too_big, u);
+  const __m128i replaced = _mm_and_si128(too_big, pixel_max);
+  const __m128i capped = _mm_or_si128(replaced, kept);
+
+  // Lower bound: keep only strictly positive lanes, everything else is 0.
+  const __m128i is_positive = _mm_cmpgt_epi16(capped, all_zero);
+  return _mm_and_si128(capped, is_positive);
+}
+
+// Rounding arithmetic right shift of four vectors (in[0..3]) in place:
+// each 32-bit lane becomes (x + (1 << (shift - 1))) >> shift.
+// A shift of 0 leaves the data untouched.
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+  if (shift == 0) return;
+
+  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
+  for (int row = 0; row < 4; ++row) {
+    const __m128i biased = _mm_add_epi32(in[row], half);
+    in[row] = _mm_srai_epi32(biased, shift);
+  }
+}
+
+// Applies round_shift_4x4() to the sixteen vectors in[0..15], four at a time.
+static void round_shift_8x8(__m128i *in, int shift) {
+  for (int offset = 0; offset < 16; offset += 4) {
+    round_shift_4x4(in + offset, shift);
+  }
+}
+
+// Clamps every signed 32-bit lane of in[] to [*clamp_lo, *clamp_hi] and
+// stores the result in out[]. Vectors are handled in groups of four, so a
+// `size` that is not a multiple of 4 is effectively rounded up to one.
+static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
+                                      const __m128i *clamp_lo,
+                                      const __m128i *clamp_hi, int size) {
+  for (int base = 0; base < size; base += 4) {
+    for (int k = 0; k < 4; ++k) {
+      const __m128i floored = _mm_max_epi32(in[base + k], *clamp_lo);
+      out[base + k] = _mm_min_epi32(floored, *clamp_hi);
+    }
+  }
+}
+
+// Reconstructs eight pixels: widens the eight 16-bit samples of `pred` to
+// 32 bits, adds res0 to the low four and res1 to the high four, clamps each
+// sum to [0, (1 << bd) - 1] and packs the result back to eight 16-bit lanes.
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+                                                  __m128i res0, __m128i res1,
+                                                  const int bd) {
+  const __m128i lower_limit = _mm_setzero_si128();
+  const __m128i upper_limit = _mm_set1_epi32((1 << bd) - 1);
+
+  const __m128i pred_lo = _mm_cvtepi16_epi32(pred);
+  const __m128i pred_hi = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+
+  __m128i sum_lo = _mm_add_epi32(res0, pred_lo);
+  __m128i sum_hi = _mm_add_epi32(res1, pred_hi);
+
+  sum_lo = _mm_min_epi32(_mm_max_epi32(sum_lo, lower_limit), upper_limit);
+  sum_hi = _mm_min_epi32(_mm_max_epi32(sum_hi, lower_limit), upper_limit);
+
+  return _mm_packus_epi32(sum_lo, sum_hi);
+}
+
+// Reconstructs four pixels: widens the four 16-bit samples in the low half
+// of `pred` to 32 bits, adds the residual `res0`, clamps each sum to
+// [0, (1 << bd) - 1] and packs to 16 bits. The four results appear in both
+// halves of the returned vector; callers store only the low 64 bits.
+//
+// The clamp is done on the 32-bit sums, before packing, the same way
+// highbd_get_recon_8x8_sse4_1() does it. Packing first and then clamping with
+// signed 16-bit compares would see any sum above 32767 as a negative lane
+// and turn it into 0 instead of the maximum pixel value.
+static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
+                                                  __m128i res0, const int bd) {
+  const __m128i min_clip_val = _mm_setzero_si128();
+  const __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
+  __m128i x0 = _mm_cvtepi16_epi32(pred);
+
+  x0 = _mm_add_epi32(res0, x0);
+  x0 = _mm_max_epi32(x0, min_clip_val);
+  x0 = _mm_min_epi32(x0, max_clip_val);
+  // Lanes are already within [0, 65535], so the pack cannot saturate.
+  x0 = _mm_packus_epi32(x0, x0);
+  return x0;
+}
+
+// Adds a 4-wide residual block to `output`, one row of four pixels at a
+// time. Row `r` of the destination uses in[r], or in[height - 1 - r] when
+// `flipud` is set (vertical flip of the residual).
+static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
+                                                  int stride, int flipud,
+                                                  int height, const int bd) {
+  for (int row = 0; row < height; ++row) {
+    const int src_row = flipud ? (height - 1 - row) : row;
+    uint16_t *const dst = output + row * stride;
+
+    const __m128i pred = _mm_loadl_epi64((__m128i const *)dst);
+    const __m128i recon = highbd_get_recon_4xn_sse4_1(pred, in[src_row], bd);
+    _mm_storel_epi64((__m128i *)dst, recon);
+  }
+}
+
+// Adds an 8-wide residual block to `output`, one row of eight pixels at a
+// time. For source row s the left four residuals are in[s] and the right
+// four are in[s + height]; s runs bottom-up when `flipud` is set.
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+                                                  int stride, int flipud,
+                                                  int height, const int bd) {
+  for (int row = 0; row < height; ++row) {
+    const int src_row = flipud ? (height - 1 - row) : row;
+    uint16_t *const dst = output + row * stride;
+
+    const __m128i pred = _mm_loadu_si128((__m128i const *)dst);
+    const __m128i recon = highbd_get_recon_8x8_sse4_1(
+        pred, in[src_row], in[src_row + height], bd);
+    _mm_storeu_si128((__m128i *)dst, recon);
+  }
+}
+
+// Loads `out_size` vectors of four int32 values each; vector i is read
+// (unaligned) from in + i * stride.
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           __m128i *out, int out_size) {
+  int idx = 0;
+  while (idx < out_size) {
+    const int32_t *const src = &in[idx * stride];
+    out[idx] = _mm_loadu_si128((const __m128i *)src);
+    ++idx;
+  }
+}
+
+// Loads sixteen consecutive int32 coefficients into in[0..3], four per
+// vector. Uses aligned loads, so `coeff` must be 16-byte aligned.
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+  for (int row = 0; row < 4; ++row) {
+    in[row] = _mm_load_si128((const __m128i *)(coeff + 4 * row));
+  }
+}
+
+// Inverse 4x4 Walsh-Hadamard transform of `input` (16 coefficients), added
+// to the 4x4 block of 16-bit pixels at `dest8` and clamped with
+// highbd_clamp_epi16() for bit depth `bd`.
+//   input  : 16 coefficients; read with aligned loads via load_buffer_4x4(),
+//            so it must be 16-byte aligned.
+//   dest8  : destination, converted with CONVERT_TO_SHORTPTR().
+//   stride : distance between destination rows, in uint16_t elements.
+void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ __m128i op[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ load_buffer_4x4(input, op);
+
+ // Shift before-hand.
+ op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
+ op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
+ op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
+ op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
+
+ // Two identical butterfly passes; the transpose after the first pass makes
+ // the second pass work along the other dimension.
+ for (int i = 0; i < 2; ++i) {
+ // Note the operand order: c1 comes from op[1], d1 from op[2], b1 from
+ // op[3], while the results are written back as a1, b1, c1, d1.
+ __m128i a1 = op[0];
+ __m128i c1 = op[1];
+ __m128i d1 = op[2];
+ __m128i b1 = op[3];
+ a1 = _mm_add_epi32(a1, c1); // a1 += c1
+ d1 = _mm_sub_epi32(d1, b1); // d1 -= b1
+ __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
+ e1 = _mm_srai_epi32(e1, 1);
+ b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
+ c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
+ a1 = _mm_sub_epi32(a1, b1); // a1 -= b1
+ d1 = _mm_add_epi32(d1, c1); // d1 += c1
+
+ op[0] = a1;
+ op[1] = b1;
+ op[2] = c1;
+ op[3] = d1;
+ if (i == 0) {
+ transpose_32bit_4x4(op, op);
+ }
+ }
+
+ // Convert to int16_t. The C code checks that we are in range.
+ // (_mm_packs_epi32 saturates to the signed 16-bit range.)
+ op[0] = _mm_packs_epi32(op[0], op[1]);
+ op[1] = _mm_packs_epi32(op[2], op[3]);
+
+ // Load uint16_t.
+ // Each row is four pixels (64 bits); two rows are combined per vector so
+ // the layout matches the packed residuals above.
+ __m128i dst[2];
+ __m128i tmp[4];
+ tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+ tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+ dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
+ tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+ tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+ dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
+
+ // Add to the previous results.
+ dst[0] = _mm_add_epi16(dst[0], op[0]);
+ dst[1] = _mm_add_epi16(dst[1], op[1]);
+
+ // Clamp.
+ dst[0] = highbd_clamp_epi16(dst[0], bd);
+ dst[1] = highbd_clamp_epi16(dst[1], bd);
+
+ // Store.
+ // Low 64 bits hold the even row; shifting right by 8 bytes brings the odd
+ // row into the low half for the next store.
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
+ dst[0] = _mm_srli_si128(dst[0], 8);
+ _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
+ _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
+ dst[1] = _mm_srli_si128(dst[1], 8);
+ _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
+}
+
+// Butterfly add/sub with clamping: *out0 = clamp(in0 + in1) and
+// *out1 = clamp(in0 - in1), per 32-bit lane, with the clamp range
+// [*clamp_lo, *clamp_hi]. The inputs are taken by value, so the outputs may
+// point at the variables the inputs were read from.
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+                          __m128i *out1, const __m128i *clamp_lo,
+                          const __m128i *clamp_hi) {
+  const __m128i sum = _mm_add_epi32(in0, in1);
+  const __m128i diff = _mm_sub_epi32(in0, in1);
+
+  const __m128i sum_clamped =
+      _mm_min_epi32(_mm_max_epi32(sum, *clamp_lo), *clamp_hi);
+  const __m128i diff_clamped =
+      _mm_min_epi32(_mm_max_epi32(diff, *clamp_lo), *clamp_hi);
+
+  *out0 = sum_clamped;
+  *out1 = diff_clamped;
+}
+
+// Rounding arithmetic right shift by `shift`, then clamp to
+// [*clamp_lo, *clamp_hi], applied in place to *in0 and *in1. The rounding
+// offset is (1 << shift) >> 1, which is 0 when shift is 0.
+static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
+                                   const __m128i *clamp_lo,
+                                   const __m128i *clamp_hi, int shift) {
+  const __m128i rounding = _mm_set1_epi32((1 << shift) >> 1);
+  const __m128i count = _mm_cvtsi32_si128(shift);
+
+  __m128i first = _mm_sra_epi32(_mm_add_epi32(*in0, rounding), count);
+  __m128i second = _mm_sra_epi32(_mm_add_epi32(*in1, rounding), count);
+
+  first = _mm_min_epi32(_mm_max_epi32(first, *clamp_lo), *clamp_hi);
+  second = _mm_min_epi32(_mm_max_epi32(second, *clamp_lo), *clamp_hi);
+
+  *in0 = first;
+  *in1 = second;
+}
+
+// Updates four element pairs of bf1 in place using half_btf_sse4_1():
+// (17, 30), (18, 29), (21, 26) and (22, 25). All other elements are left
+// untouched.
+// For each pair the first result is parked in a temporary so that the second
+// half_btf call still reads both original values; only then is the first
+// element overwritten.
+// NOTE(review): half_btf_sse4_1 is defined elsewhere; presumably it returns
+// (w0 * n0 + w1 * n1 + rounding) >> bit per lane -- confirm.
+static INLINE void idct32_stage4_sse4_1(
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ // Pair (17, 30).
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ // Pair (18, 29).
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ // Pair (21, 26).
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ // Pair (22, 25).
+ temp2 =
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+// Two steps on bf1, both in place:
+//  1. half_btf_sse4_1() updates of the pairs (9, 14) and (10, 13); a
+//     temporary holds the first result so the second call still reads both
+//     original values.
+//  2. Clamped add/sub butterflies over elements 16..31. addsub_sse4_1()
+//     receives its inputs by value, so writing the results back to the same
+//     elements is safe.
+static INLINE void idct32_stage5_sse4_1(
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ // Pair (9, 14).
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ // Pair (10, 13).
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ // First argument receives the sum, second the difference (first - second).
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+// Three steps on bf1, all in place:
+//  1. half_btf_sse4_1() update of the pair (5, 6).
+//  2. Clamped add/sub butterflies over elements 8..15.
+//  3. half_btf_sse4_1() updates of the pairs (18, 29), (19, 28), (20, 27)
+//     and (21, 26).
+// In every half_btf pair a temporary holds the first result so that the
+// second call still reads both original values.
+static INLINE void idct32_stage6_sse4_1(
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ // Pair (5, 6).
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ // addsub_sse4_1() takes its inputs by value, so in-place output is safe.
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ // Pair (18, 29).
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ // Pair (19, 28).
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ // Pair (20, 27).
+ temp1 =
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ // Pair (21, 26).
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+// Stage 7 helper of the 32-point inverse DCT (per the function name).
+// Updates bf1[] in place: butterflies over 0..7 and 16..31, weighted
+// combinations of the pairs (10, 13) and (11, 12).
+//   cospim32/cospi32   weight vectors forwarded to half_btf_sse4_1()
+//   clamp_lo/clamp_hi  bounds forwarded to addsub_sse4_1()
+//   rounding, bit      rounding term and shift forwarded to half_btf_sse4_1()
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {
+  // (0, 7), (1, 6), (2, 5), (3, 4)
+  for (int lo = 0; lo < 4; ++lo) {
+    const int hi = 7 - lo;
+    addsub_sse4_1(bf1[lo], bf1[hi], bf1 + lo, bf1 + hi, clamp_lo, clamp_hi);
+  }
+
+  // (10, 13) and (11, 12): both results come from the old pair values.
+  for (int lo = 10; lo <= 11; ++lo) {
+    const int hi = 23 - lo;
+    const __m128i new_lo = half_btf_sse4_1(cospim32, &bf1[lo], cospi32,
+                                           &bf1[hi], rounding, bit);
+    const __m128i new_hi = half_btf_sse4_1(cospi32, &bf1[lo], cospi32,
+                                           &bf1[hi], rounding, bit);
+    bf1[lo] = new_lo;
+    bf1[hi] = new_hi;
+  }
+
+  // (16, 23), (17, 22), (18, 21), (19, 20)
+  for (int lo = 16; lo < 20; ++lo) {
+    const int hi = 39 - lo;
+    addsub_sse4_1(bf1[lo], bf1[hi], bf1 + lo, bf1 + hi, clamp_lo, clamp_hi);
+  }
+
+  // (31, 24), (30, 25), (29, 26), (28, 27): the higher index is the first
+  // operand and receives the first output.
+  for (int hi = 31; hi >= 28; --hi) {
+    const int lo = 55 - hi;
+    addsub_sse4_1(bf1[hi], bf1[lo], bf1 + hi, bf1 + lo, clamp_lo, clamp_hi);
+  }
+}
+
+// Stage 8 helper of the 32-point inverse DCT (per the function name).
+// Updates bf1[] in place: butterflies pairing i with 15 - i for i = 0..7,
+// then weighted combinations pairing k with 47 - k for k = 20..23.
+//   cospim32/cospi32   weight vectors forwarded to half_btf_sse4_1()
+//   clamp_lo/clamp_hi  bounds forwarded to addsub_sse4_1()
+//   rounding, bit      rounding term and shift forwarded to half_btf_sse4_1()
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rounding, int bit) {
+  for (int lo = 0; lo < 8; ++lo) {
+    const int hi = 15 - lo;
+    addsub_sse4_1(bf1[lo], bf1[hi], bf1 + lo, bf1 + hi, clamp_lo, clamp_hi);
+  }
+
+  for (int lo = 20; lo < 24; ++lo) {
+    const int hi = 47 - lo;
+    // Evaluate both outputs before overwriting either input slot.
+    const __m128i new_lo = half_btf_sse4_1(cospim32, &bf1[lo], cospi32,
+                                           &bf1[hi], rounding, bit);
+    const __m128i new_hi = half_btf_sse4_1(cospi32, &bf1[lo], cospi32,
+                                           &bf1[hi], rounding, bit);
+    bf1[lo] = new_lo;
+    bf1[hi] = new_hi;
+  }
+}
+
+// Final (stage 9) helper of the 32-point inverse DCT (per the function name).
+// Runs the butterflies pairing bf1[i] with bf1[31 - i] and writes the results
+// to out[i] and out[31 - i]. When do_cols is zero, the 32 outputs are then
+// round-shifted by out_shift and clamped to a signed range of
+// max(16, bd + 6) bits.
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+                                        const int do_cols, const int bd,
+                                        const int out_shift,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi) {
+  for (int i = 0; i < 16; ++i) {
+    const int mirror = 31 - i;
+    addsub_sse4_1(bf1[i], bf1[mirror], out + i, out + mirror, clamp_lo,
+                  clamp_hi);
+  }
+
+  if (do_cols) return;
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+  const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+  // round_shift_4x4() is applied to each consecutive group of four vectors.
+  for (int i = 0; i < 32; i += 4) {
+    round_shift_4x4(out + i, out_shift);
+  }
+  highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+}
+
+// Rounds, shifts and clamps a pair of vectors, negating the second one:
+//   *out0 = clamp(( in0 + half) >> shift)
+//   *out1 = clamp((-in1 + half) >> shift)
+// where half = (1 << shift) >> 1, the shift is arithmetic, and clamp limits
+// each 32-bit lane to [*clamp_lo, *clamp_hi].
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+                             __m128i *out0, __m128i *out1,
+                             const __m128i *clamp_lo, const __m128i *clamp_hi,
+                             int shift) {
+  const __m128i half = _mm_set1_epi32((1 << shift) >> 1);
+  const __m128i count = _mm_cvtsi32_si128(shift);
+
+  const __m128i kept = _mm_sra_epi32(_mm_add_epi32(half, in0), count);
+  // half - in1 folds the negation into the rounding add.
+  const __m128i negated = _mm_sra_epi32(_mm_sub_epi32(half, in1), count);
+
+  const __m128i kept_clamped =
+      _mm_min_epi32(_mm_max_epi32(kept, *clamp_lo), *clamp_hi);
+  const __m128i negated_clamped =
+      _mm_min_epi32(_mm_max_epi32(negated, *clamp_lo), *clamp_hi);
+
+  *out0 = kept_clamped;
+  *out1 = negated_clamped;
+}
+
+// 4-point inverse DCT over four vectors of 32-bit lanes.
+//   in, out    input / output arrays of 4 vectors; callers in this file pass
+//              the same array for both, so all inputs are read up front
+//   bit        selects the cosine table (cospi_arr) and the rounding shift
+//   do_cols    non-zero: intermediate clamp uses bd + 6 bits and no output
+//              shift is applied; zero: clamp uses bd + 8 bits and the result
+//              is passed through shift_and_clamp_sse4_1() with out_shift
+//   bd         bit depth used to size the clamp ranges (at least 16 bits)
+static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int stage_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i stage_lo = _mm_set1_epi32(-(1 << (stage_range - 1)));
+  const __m128i stage_hi = _mm_set1_epi32((1 << (stage_range - 1)) - 1);
+
+  const __m128i src0 = in[0];
+  const __m128i src1 = in[1];
+  const __m128i src2 = in[2];
+  const __m128i src3 = in[3];
+
+  // Stages 0-2: even half from (src0, src2), odd half from (src1, src3).
+  const __m128i p0 = _mm_mullo_epi32(src0, cospi32);
+  const __m128i p2 = _mm_mullo_epi32(src2, cospi32);
+  const __m128i even_sum =
+      _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p0, p2), rnding), bit);
+  const __m128i even_diff =
+      _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p0, p2), rnding), bit);
+
+  const __m128i odd_a = _mm_add_epi32(_mm_mullo_epi32(src1, cospi48),
+                                      _mm_mullo_epi32(src3, cospim16));
+  const __m128i odd_lo = _mm_srai_epi32(_mm_add_epi32(odd_a, rnding), bit);
+
+  const __m128i odd_b = _mm_add_epi32(_mm_mullo_epi32(src1, cospi16),
+                                      _mm_mullo_epi32(src3, cospi48));
+  const __m128i odd_hi = _mm_srai_epi32(_mm_add_epi32(odd_b, rnding), bit);
+
+  // Stage 3: final butterflies into out[0]/out[3] and out[1]/out[2].
+  addsub_sse4_1(even_sum, odd_hi, out + 0, out + 3, &stage_lo, &stage_hi);
+  addsub_sse4_1(even_diff, odd_lo, out + 1, out + 2, &stage_lo, &stage_hi);
+
+  if (do_cols) return;
+
+  // First pass only: apply the output shift with a tighter clamp.
+  const int out_range = AOMMAX(16, bd + 6);
+  __m128i out_lo = _mm_set1_epi32(-(1 << (out_range - 1)));
+  __m128i out_hi = _mm_set1_epi32((1 << (out_range - 1)) - 1);
+  shift_and_clamp_sse4_1(out + 0, out + 3, &out_lo, &out_hi, out_shift);
+  shift_and_clamp_sse4_1(out + 1, out + 2, &out_lo, &out_hi, out_shift);
+}
+
+// 4-point inverse ADST over four vectors of 32-bit lanes.
+//   in, out    input / output arrays of 4 vectors; callers in this file pass
+//              the same array for both, so all inputs are read up front
+//   bit        selects the sine table (sinpi_arr) and sizes the rounding term
+//   do_cols    zero: outputs are additionally round-shifted by out_shift and
+//              clamped to max(16, bd + 6) bits; non-zero: left as computed
+// NOTE(review): the final scaling multiplies by 1 << 4, adds
+// 1 << (bit + 4 - 1) and then shifts the register right by a fixed 2 bytes
+// (16 bits). That equals a rounding shift by `bit` only when bit == 12;
+// confirm that every caller passes bit == 12.
+static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const __m128i zero = _mm_setzero_si128();
+  // Rounding term in the low 32 bits of each 64-bit lane, zero above it.
+  const __m128i rnding =
+      _mm_unpacklo_epi32(_mm_set1_epi32(1 << (bit + 4 - 1)), zero);
+  const __m128i mul = _mm_set1_epi32(1 << 4);
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+
+  const __m128i x0 = in[0];
+  const __m128i x1 = in[1];
+  const __m128i x2 = in[2];
+  const __m128i x3 = in[3];
+
+  // acc_a = x0*s1 + x2*s4 + x3*s2, acc_b = x0*s2 - x2*s1 - x3*s4
+  const __m128i acc_a = _mm_add_epi32(
+      _mm_add_epi32(_mm_mullo_epi32(x0, sinpi1), _mm_mullo_epi32(x2, sinpi4)),
+      _mm_mullo_epi32(x3, sinpi2));
+  const __m128i acc_b = _mm_sub_epi32(
+      _mm_sub_epi32(_mm_mullo_epi32(x0, sinpi2), _mm_mullo_epi32(x2, sinpi1)),
+      _mm_mullo_epi32(x3, sinpi4));
+  // mid = x1*s3, cross = (x0 - x2 + x3)*s3
+  const __m128i mid = _mm_mullo_epi32(x1, sinpi3);
+  const __m128i cross =
+      _mm_mullo_epi32(_mm_add_epi32(_mm_sub_epi32(x0, x2), x3), sinpi3);
+
+  __m128i u[4];
+  u[0] = _mm_add_epi32(acc_a, mid);
+  u[1] = _mm_add_epi32(acc_b, mid);
+  u[2] = cross;
+  u[3] = _mm_sub_epi32(_mm_add_epi32(acc_a, acc_b), mid);
+
+  // Round each result in 64-bit precision. _mm_mul_epi32 only widens lanes 0
+  // and 2, so lanes 1 and 3 are handled on a copy shifted down by 4 bytes,
+  // and the two halves are interleaved back into lane order.
+  for (int i = 0; i < 4; ++i) {
+    __m128i even = _mm_add_epi64(_mm_mul_epi32(u[i], mul), rnding);
+    __m128i odd =
+        _mm_add_epi64(_mm_mul_epi32(_mm_srli_si128(u[i], 4), mul), rnding);
+
+    even = _mm_srli_si128(even, 2);
+    odd = _mm_srli_si128(odd, 2);
+
+    const __m128i lanes01 = _mm_unpacklo_epi32(even, odd);
+    const __m128i lanes23 = _mm_unpackhi_epi32(even, odd);
+    out[i] = _mm_unpacklo_epi64(lanes01, lanes23);
+  }
+
+  if (do_cols) return;
+
+  const int log_range = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  round_shift_4x4(out, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+}
+
+// Adds a 4x4 block of 32-bit residuals to the 16-bit pixels at `output` and
+// stores the result back.
+//   in       4 vectors, one per row; modified in place (round-shifted by
+//            `shift`, and lane-reversed when fliplr is set)
+//   output   top-left destination pixel; rows are `stride` elements apart
+//   fliplr   non-zero: reverse the four lanes of every row
+//   flipud   non-zero: add the rows in reverse order (row 3 to line 0, ...)
+//   bd       bit depth forwarded to highbd_clamp_epi16()
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i pred[4];
+  __m128i sum[4];
+
+  round_shift_4x4(in, shift);
+
+  // Load four pixels per row and widen them to 32 bits.
+  for (int r = 0; r < 4; ++r) {
+    const __m128i px = _mm_loadl_epi64((__m128i const *)(output + r * stride));
+    pred[r] = _mm_unpacklo_epi16(px, zero);
+  }
+
+  if (fliplr) {
+    // 0x1B selects lanes 3, 2, 1, 0: a full lane reversal.
+    for (int r = 0; r < 4; ++r) {
+      in[r] = _mm_shuffle_epi32(in[r], 0x1B);
+    }
+  }
+
+  for (int r = 0; r < 4; ++r) {
+    const int src_row = flipud ? 3 - r : r;
+    sum[r] = _mm_add_epi32(in[src_row], pred[r]);
+  }
+
+  // Pack two rows per register (unsigned saturation to 16 bits) and clamp.
+  const __m128i rows01 =
+      highbd_clamp_epi16(_mm_packus_epi32(sum[0], sum[1]), bd);
+  const __m128i rows23 =
+      highbd_clamp_epi16(_mm_packus_epi32(sum[2], sum[3]), bd);
+
+  // Each store writes the low 64 bits; unpackhi moves the upper row down.
+  const __m128i row0 = _mm_unpacklo_epi64(rows01, rows01);
+  const __m128i row1 = _mm_unpackhi_epi64(rows01, rows01);
+  const __m128i row2 = _mm_unpacklo_epi64(rows23, rows23);
+  const __m128i row3 = _mm_unpackhi_epi64(rows23, rows23);
+
+  _mm_storel_epi64((__m128i *)(output + 0 * stride), row0);
+  _mm_storel_epi64((__m128i *)(output + 1 * stride), row1);
+  _mm_storel_epi64((__m128i *)(output + 2 * stride), row2);
+  _mm_storel_epi64((__m128i *)(output + 3 * stride), row3);
+}
+
+// 4-point identity transform: scales every 32-bit lane of in[0..3] by
+// NewSqrt2 with a rounding offset of 1 << (NewSqrt2Bits - 1) and a right
+// shift of NewSqrt2Bits, using 64-bit products.
+//   bit        unused
+//   do_cols    zero: outputs are additionally round-shifted by out_shift and
+//              clamped to max(16, bd + 6) bits; non-zero: left as computed
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  (void)bit;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i fact = _mm_set1_epi32(NewSqrt2);
+  // Rounding offset in the low 32 bits of each 64-bit lane.
+  const __m128i offset =
+      _mm_unpacklo_epi32(_mm_set1_epi32(1 << (NewSqrt2Bits - 1)), zero);
+
+  for (int i = 0; i < 4; i++) {
+    const __m128i src = in[i];
+
+    // Lanes 0 and 2 are widened directly by _mm_mul_epi32.
+    __m128i even = _mm_mul_epi32(src, fact);
+    even = _mm_add_epi32(even, offset);
+    even = _mm_srli_epi64(even, NewSqrt2Bits);
+
+    // Lanes 1 and 3 are shifted down into the even positions first.
+    __m128i odd = _mm_mul_epi32(_mm_srli_si128(src, 4), fact);
+    odd = _mm_add_epi32(odd, offset);
+    odd = _mm_srli_epi64(odd, NewSqrt2Bits);
+
+    // Interleave the low 32 bits of each result back into lane order.
+    const __m128i lanes01 = _mm_unpacklo_epi32(even, odd);
+    const __m128i lanes23 = _mm_unpackhi_epi32(even, odd);
+    out[i] = _mm_unpacklo_epi64(lanes01, lanes23);
+  }
+
+  if (do_cols) return;
+
+  const int log_range = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  round_shift_4x4(out, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+}
+// 4x4 high-bitdepth inverse 2-D transform, added into `output`.
+// Every transform type runs the same pipeline: load the coefficients, run the
+// first 1-D pass (do_cols = 0), transpose, run the second 1-D pass
+// (do_cols = 1), then round, add to the destination and store. tx_type only
+// selects the two 1-D kernels and the left/right and up/down flips applied
+// when writing. An unknown tx_type asserts and writes nothing.
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  typedef void (*txfm4x4_fn)(__m128i *, __m128i *, int, int, int, int);
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
+  txfm4x4_fn first_pass;
+  txfm4x4_fn second_pass;
+  int fliplr = 0;
+  int flipud = 0;
+  __m128i in[4];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      first_pass = idct4x4_sse4_1;
+      second_pass = idct4x4_sse4_1;
+      break;
+    case ADST_DCT:
+      first_pass = idct4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      break;
+    case DCT_ADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = idct4x4_sse4_1;
+      break;
+    case ADST_ADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      break;
+    case FLIPADST_DCT:
+      first_pass = idct4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      flipud = 1;
+      break;
+    case DCT_FLIPADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = idct4x4_sse4_1;
+      fliplr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      fliplr = 1;
+      flipud = 1;
+      break;
+    case ADST_FLIPADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      fliplr = 1;
+      break;
+    case FLIPADST_ADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      flipud = 1;
+      break;
+    case IDTX:
+      first_pass = iidentity4_sse4_1;
+      second_pass = iidentity4_sse4_1;
+      break;
+    case V_DCT:
+      first_pass = iidentity4_sse4_1;
+      second_pass = idct4x4_sse4_1;
+      break;
+    case H_DCT:
+      first_pass = idct4x4_sse4_1;
+      second_pass = iidentity4_sse4_1;
+      break;
+    case V_ADST:
+      first_pass = iidentity4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      break;
+    case H_ADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iidentity4_sse4_1;
+      break;
+    case V_FLIPADST:
+      first_pass = iidentity4_sse4_1;
+      second_pass = iadst4x4_sse4_1;
+      flipud = 1;
+      break;
+    case H_FLIPADST:
+      first_pass = iadst4x4_sse4_1;
+      second_pass = iidentity4_sse4_1;
+      fliplr = 1;
+      break;
+    default: assert(0); return;
+  }
+
+  load_buffer_4x4(input, in);
+  first_pass(in, in, INV_COS_BIT, 0, bd, 0);
+  transpose_32bit_4x4(in, in);
+  second_pass(in, in, INV_COS_BIT, 1, bd, 0);
+  write_buffer_4x4(in, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// 8x8
+// Loads 64 consecutive 32-bit coefficients into in[0..15], four per vector.
+// Uses aligned loads, so `coeff` must be 16-byte aligned.
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+  for (int i = 0; i < 16; ++i) {
+    in[i] = _mm_load_si128((const __m128i *)(coeff + 4 * i));
+  }
+}
+
+// 8-point inverse DCT applied to an 8x8 block held in 16 vectors.
+// Layout: row r occupies in[2 * r] and in[2 * r + 1], so even-indexed vectors
+// form one half of the block and odd-indexed vectors the other. Each half is
+// transformed independently by one iteration of the loop below.
+//   bit        selects the cosine table (cospi_arr) and the rounding shift
+//   do_cols    non-zero: intermediate clamp uses bd + 6 bits and no output
+//              shift is applied; zero: clamp uses bd + 8 bits and the
+//              outputs are round-shifted by out_shift and clamped to
+//              max(16, bd + 6) bits
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                           int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  for (int col = 0; col < 2; ++col) {
+    // Read all eight rows of this half before any output is stored.
+    const __m128i r0 = in[0 * 2 + col];
+    const __m128i r1 = in[1 * 2 + col];
+    const __m128i r2 = in[2 * 2 + col];
+    const __m128i r3 = in[3 * 2 + col];
+    const __m128i r4 = in[4 * 2 + col];
+    const __m128i r5 = in[5 * 2 + col];
+    const __m128i r6 = in[6 * 2 + col];
+    const __m128i r7 = in[7 * 2 + col];
+    __m128i acc, lhs, rhs;
+    __m128i a4, a5, a6, a7;
+    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+    __m128i c0, c1, c2, c3, c5, c6;
+
+    // stages 0-2: weighted combinations of the odd rows (1, 7) and (5, 3)
+    acc = _mm_add_epi32(_mm_mullo_epi32(r1, cospi56),
+                        _mm_mullo_epi32(r7, cospim8));
+    a4 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    acc = _mm_add_epi32(_mm_mullo_epi32(r1, cospi8),
+                        _mm_mullo_epi32(r7, cospi56));
+    a7 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    acc = _mm_add_epi32(_mm_mullo_epi32(r5, cospi24),
+                        _mm_mullo_epi32(r3, cospim40));
+    a5 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    acc = _mm_add_epi32(_mm_mullo_epi32(r5, cospi40),
+                        _mm_mullo_epi32(r3, cospi24));
+    a6 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    // stage 3: even rows (0, 4) and (2, 6), butterflies on the odd part
+    lhs = _mm_mullo_epi32(r0, cospi32);
+    rhs = _mm_mullo_epi32(r4, cospi32);
+    b0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(lhs, rhs), rnding), bit);
+    b1 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(lhs, rhs), rnding), bit);
+
+    acc = _mm_add_epi32(_mm_mullo_epi32(r2, cospi48),
+                        _mm_mullo_epi32(r6, cospim16));
+    b2 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    acc = _mm_add_epi32(_mm_mullo_epi32(r2, cospi16),
+                        _mm_mullo_epi32(r6, cospi48));
+    b3 = _mm_srai_epi32(_mm_add_epi32(acc, rnding), bit);
+
+    addsub_sse4_1(a4, a5, &b4, &b5, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(a7, a6, &b7, &b6, &clamp_lo, &clamp_hi);
+
+    // stage 4: b4 and b7 pass through unchanged
+    addsub_sse4_1(b0, b3, &c0, &c3, &clamp_lo, &clamp_hi);
+    addsub_sse4_1(b1, b2, &c1, &c2, &clamp_lo, &clamp_hi);
+
+    lhs = _mm_mullo_epi32(b5, cospi32);
+    rhs = _mm_mullo_epi32(b6, cospi32);
+    c6 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(rhs, lhs), rnding), bit);
+    c5 = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(rhs, lhs), rnding), bit);
+
+    // stage 5: final butterflies pairing output row k with row 7 - k
+    addsub_sse4_1(c0, b7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(c1, c6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(c2, c5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(c3, b4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
+                  &clamp_hi);
+  }
+
+  if (do_cols) return;
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+  const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+  round_shift_8x8(out, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+// Inverse 8-point identity transform on eight vectors.
+//
+// Every input vector is doubled (x + x) into the matching output slot.
+// On the row pass (do_cols == 0) the doubled vectors are then handed to
+// round_shift_4x4() with out_shift, four vectors at a time, followed by
+// highbd_clamp_epi32_sse4_1() with bounds of a signed max(16, bd + 6)-bit
+// range over all eight vectors. `bit` is unused.
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  (void)bit;
+
+  for (int i = 0; i < 8; ++i) {
+    out[i] = _mm_add_epi32(in[i], in[i]);
+  }
+
+  // The column pass stops here: no shift, no clamp.
+  if (do_cols) return;
+
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lower_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i upper_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+
+  round_shift_4x4(out, out_shift);
+  round_shift_4x4(out + 4, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &lower_bound, &upper_bound, 8);
+}
+
+// Reconstructs one row of eight 16-bit pixels.
+//
+// `pred` holds eight 16-bit prediction samples; they are zero-extended to
+// two vectors of four 32-bit lanes. `res_lo` / `res_hi` are the residuals for
+// the low and high four pixels. When `fliplr` is set the residual row is
+// mirrored: each half has its four lanes reversed (shuffle 0x1B) and the two
+// halves trade places. The sums are packed back to 16 bits with unsigned
+// saturation and passed through highbd_clamp_epi16() together with `bd`.
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+                             int fliplr, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pred_lo = _mm_unpacklo_epi16(pred, zero);
+  const __m128i pred_hi = _mm_unpackhi_epi16(pred, zero);
+  __m128i sum_lo, sum_hi;
+
+  if (!fliplr) {
+    sum_lo = _mm_add_epi32(res_lo, pred_lo);
+    sum_hi = _mm_add_epi32(res_hi, pred_hi);
+  } else {
+    // 0x1B selects lanes 3,2,1,0, i.e. reverses the four 32-bit lanes.
+    sum_lo = _mm_add_epi32(_mm_shuffle_epi32(res_hi, 0x1B), pred_lo);
+    sum_hi = _mm_add_epi32(_mm_shuffle_epi32(res_lo, 0x1B), pred_hi);
+  }
+
+  return highbd_clamp_epi16(_mm_packus_epi32(sum_lo, sum_hi), bd);
+}
+
+// Adds an 8x8 block of residuals to the prediction already in `output`.
+//
+// `in` holds 16 vectors, two per row (low four / high four residuals). They
+// are first passed through round_shift_8x8() with `shift` (this modifies `in`
+// in place). Each of the eight destination rows is read with an aligned
+// 128-bit load, combined with its residual row by get_recon_8x8(), and
+// written back with an aligned 128-bit store. `flipud` reverses which
+// residual row feeds which destination row; `fliplr` is forwarded to
+// get_recon_8x8().
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  __m128i pred[8];
+  __m128i recon[8];
+
+  round_shift_8x8(in, shift);
+
+  // Read every prediction row before anything is written back.
+  for (int row = 0; row < 8; ++row) {
+    pred[row] = _mm_load_si128((__m128i const *)(output + row * stride));
+  }
+
+  for (int row = 0; row < 8; ++row) {
+    // With flipud, destination row r takes residual row 7 - r.
+    const int src_row = flipud ? 7 - row : row;
+    recon[row] = get_recon_8x8(pred[row], in[2 * src_row], in[2 * src_row + 1],
+                               fliplr, bd);
+  }
+
+  for (int row = 0; row < 8; ++row) {
+    _mm_store_si128((__m128i *)(output + row * stride), recon[row]);
+  }
+}
+
+// 8x8 high-bitdepth inverse 2-D transform, added onto `output`.
+//
+// Pipeline for every supported tx_type:
+//   load -> first 1-D pass (do_cols = 0, shifted by -shift[0])
+//        -> transpose -> second 1-D pass (do_cols = 1, no shift)
+//        -> write_buffer_8x8() with -shift[1] and the flip flags.
+// The switch only decides which 1-D kernel (idct8x8 / iadst8x8) each pass
+// uses and whether the result is mirrored left-right and/or up-down.
+// Any other tx_type trips assert(0); with asserts compiled out the function
+// returns without touching `output`.
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[16], out[16];
+  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
+  int first_pass_adst = 0;   // kernel for the do_cols = 0 pass
+  int second_pass_adst = 0;  // kernel for the do_cols = 1 pass
+  int fliplr = 0;
+  int flipud = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case DCT_ADST: first_pass_adst = 1; break;
+    case ADST_DCT: second_pass_adst = 1; break;
+    case ADST_ADST:
+      first_pass_adst = 1;
+      second_pass_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      second_pass_adst = 1;
+      flipud = 1;
+      break;
+    case DCT_FLIPADST:
+      first_pass_adst = 1;
+      fliplr = 1;
+      break;
+    case ADST_FLIPADST:
+      first_pass_adst = 1;
+      second_pass_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      first_pass_adst = 1;
+      second_pass_adst = 1;
+      fliplr = 1;
+      flipud = 1;
+      break;
+    case FLIPADST_ADST:
+      first_pass_adst = 1;
+      second_pass_adst = 1;
+      flipud = 1;
+      break;
+    default: assert(0); return;
+  }
+
+  load_buffer_8x8(input, in);
+
+  if (first_pass_adst) {
+    iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+  } else {
+    idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
+  }
+
+  transpose_8x8(out, in);
+
+  if (second_pass_adst) {
+    iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+  } else {
+    idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
+  }
+
+  write_buffer_8x8(out, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// 8-point inverse DCT shortcut that reads only in[0].
+//
+// The single input vector is scaled by cospi[32] with rounding
+// ((x * c + (1 << (bit - 1))) >> bit). On the row pass (do_cols == 0) it is
+// additionally round-shifted right by out_shift. The result is clamped to a
+// signed max(16, bd + 6)-bit range and replicated into out[0..7].
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  // The final clamp uses bd + 6 on both passes: the column pass starts with
+  // that range, and the row pass switches to it before clamping.
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lower_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i upper_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+
+  // Stages 0-3 collapse to a single multiply by cos(pi/4).
+  __m128i dc = _mm_mullo_epi32(in[0], cospi32);
+  dc = _mm_srai_epi32(_mm_add_epi32(dc, rnding), bit);
+
+  // Stages 4-5: only the row pass applies the output shift.
+  if (!do_cols) {
+    const __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, offset), _mm_cvtsi32_si128(out_shift));
+  }
+
+  dc = _mm_min_epi32(_mm_max_epi32(dc, lower_bound), upper_bound);
+
+  for (int i = 0; i < 8; ++i) {
+    out[i] = dc;
+  }
+}
+
+// Butterfly network for the 8-point inverse DCT (per the function name).
+//
+// Reads in[0..7] and writes out[0..7]; every __m128i is treated as four
+// independent 32-bit lanes by the intrinsics used here.
+//   bit       - fixed-point precision: selects the cosine table through
+//               cospi_arr(bit) and is the right shift applied after every
+//               rounded multiply-accumulate.
+//   do_cols   - nonzero: intermediate clamp bounds span max(16, bd + 6) bits
+//               and no output shift is applied. Zero: intermediate bounds
+//               span max(16, bd + 8) bits, and the outputs are afterwards
+//               passed to round_shift_4x4() with out_shift and clamped with
+//               bounds spanning max(16, bd + 6) bits.
+//   bd        - only used to size the clamp bounds.
+//   out_shift - only used when do_cols is zero.
+// NOTE(review): addsub_sse4_1(a, b, &s, &d, lo, hi) is defined outside this
+// chunk; presumably s = clamp(a + b) and d = clamp(a - b) — confirm.
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ // Rounding offset added before every ">> bit".
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ // Intermediate range: bd + 6 bits on the column pass, bd + 8 on the row
+ // pass, never fewer than 16 bits.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // Even-indexed inputs are only permuted here (0, 4, 2, 6).
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ // u4 = (in[1] * c56 - in[7] * c8 + rnding) >> bit
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ // u7 = (in[1] * c8 + in[7] * c56 + rnding) >> bit
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ // u5 = (in[5] * c24 - in[3] * c40 + rnding) >> bit
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // u6 = (in[5] * c40 + in[3] * c24 + rnding) >> bit
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ // v0 / v1 share the products x = u0 * c32 and y = u1 * c32:
+ // v0 = (x + y + rnding) >> bit, v1 = (x - y + rnding) >> bit.
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ // v2 = (u2 * c48 - u3 * c16 + rnding) >> bit
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ // v3 = (u2 * c16 + u3 * c48 + rnding) >> bit
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ // Note the swapped output order in the second call: (u7, u6) -> (v7, v6).
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ // u6 = ((v6 + v5) * c32 + rnding) >> bit and
+ // u5 = ((v6 - v5) * c32 + rnding) >> bit, from shared products x and y.
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ // Final butterflies pair u[k] with u[7 - k] and write out[k] / out[7 - k].
+ addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
+
+ // Row pass only: apply out_shift to all eight outputs (two groups of four
+ // vectors) and clamp with the narrower bd + 6 output bounds.
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ round_shift_4x4(out, out_shift);
+ round_shift_4x4(out + 4, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
+ }
+}
+
+// 8-point inverse ADST shortcut (per the function name) that reads only
+// in[0] and writes out[0..7].
+//
+// No intermediate clamping is done in this variant. On the column pass
+// (do_cols != 0) the results are written directly, with every odd output
+// negated. On the row pass the same pairs are handed to neg_shift_sse4_1()
+// together with out_shift and bounds spanning max(16, bd + 6) bits.
+// NOTE(review): "low1" presumably means callers guarantee in[1..7] are zero;
+// neg_shift_sse4_1() is defined outside this chunk and presumably shifts,
+// negates its second operand and clamps — confirm both.
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ // Rounding offset added before every ">> bit".
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ // u[0] = (in[0] * c60 + rnding) >> bit
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ // u[1] = (-(in[0] * c4) + rnding) >> bit
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ // u[4] / u[5] are rotations of the pair (u[0], u[1]) by c16 / c48.
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ // u[2] / u[3]: scaled sum and difference of (u[0], u[1]) using c32.
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // u[6] / u[7]: scaled sum and difference of (u[4], u[5]) using c32.
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ // Output permutation: u[0], -u[4], u[6], -u[2], u[3], -u[7], u[5], -u[1].
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ // Row pass: same (even, odd) pairs, routed through neg_shift_sse4_1()
+ // with the bd + 6 output bounds and out_shift.
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
+// Butterfly network for the full 8-point inverse ADST (per the function
+// name). Reads in[0..7], writes out[0..7].
+//   bit       - selects the cosine table via cospi_arr(bit) and is the right
+//               shift applied after every rounded multiply-accumulate.
+//   do_cols   - nonzero: intermediate clamp bounds span max(16, bd + 6) bits
+//               and outputs are written directly (odd outputs negated).
+//               Zero: intermediate bounds span max(16, bd + 8) bits and the
+//               outputs go through neg_shift_sse4_1() with out_shift and
+//               bounds spanning max(16, bd + 6) bits.
+//   bd        - only used to size the clamp bounds.
+//   out_shift - only used when do_cols is zero.
+// NOTE(review): addsub_sse4_1() and neg_shift_sse4_1() are defined outside
+// this chunk; presumably clamped add/sub and shift+negate+clamp — confirm.
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ // Rounding offset added before every ">> bit".
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ // Intermediate range: bd + 6 bits on the column pass, bd + 8 on the row
+ // pass, never fewer than 16 bits.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // Four rotations, each pairing in[7 - 2k] with in[2k]:
+ // (in[7], in[0]), (in[5], in[2]), (in[3], in[4]), (in[1], in[6]).
+
+ // u[0] = (in[7] * c4 + in[0] * c60 + rnding) >> bit
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ // u[1] = (in[7] * c60 - in[0] * c4 + rnding) >> bit
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ // Butterflies between u[k] and u[k + 4].
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ // Lower half passes through; upper half is rotated by c16 / c48.
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // u[6] = (-v[6] * c48 + v[7] * c16 + rnding) >> bit
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ // u[7] = (v[6] * c16 + v[7] * c48 + rnding) >> bit
+ // (subtracting the product with -c48 adds v[7] * c48).
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ // Butterflies between u[k] and u[k + 2] within each half.
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ // v[0], v[1], v[4], v[5] are saved into u[] first because v[0] is reused
+ // below as a scratch register for the c32 products.
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ // Output permutation: u[0], -u[4], u[6], -u[2], u[3], -u[7], u[5], -u[1].
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ // Row pass: same (even, odd) pairs, routed through neg_shift_sse4_1()
+ // with the bd + 6 output bounds and out_shift.
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
+// 16-point inverse DCT shortcut that reads only in[0].
+//
+// in[0] is scaled by cospi[32] with rounding
+// ((x * c + (1 << (bit - 1))) >> bit). On the row pass (do_cols == 0) with a
+// nonzero out_shift it is additionally round-shifted right by out_shift.
+// The result is clamped to a signed max(16, bd + 6)-bit range, written back
+// to in[0] (the input is modified), and replicated into out[0..15].
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  // The final clamp uses bd + 6 on both passes: the column pass starts with
+  // that range, and the row pass switches to it before clamping.
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lower_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i upper_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+
+  // Stages 0-4 collapse to a single multiply by cos(pi/4).
+  __m128i dc = _mm_mullo_epi32(in[0], cospi32);
+  dc = _mm_srai_epi32(_mm_add_epi32(dc, rnding), bit);
+
+  // Stages 5-7: only the row pass applies the (nonzero) output shift.
+  if (!do_cols && out_shift != 0) {
+    const __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, offset), _mm_cvtsi32_si128(out_shift));
+  }
+
+  dc = _mm_min_epi32(_mm_max_epi32(dc, lower_bound), upper_bound);
+
+  // The original code works in place on in[0]; keep that side effect.
+  in[0] = dc;
+  for (int i = 0; i < 16; ++i) {
+    out[i] = dc;
+  }
+}
+
+// 16-point inverse DCT variant (per the function name) that reads only
+// in[0..7] and writes out[0..15].
+//   bit       - selects the cosine table via cospi_arr(bit) and is the right
+//               shift applied after every rounded multiply.
+//   do_cols   - nonzero: intermediate clamp bounds span max(16, bd + 6) bits
+//               and no output shift is applied. Zero: intermediate bounds
+//               span max(16, bd + 8) bits; afterwards out[] is passed to
+//               round_shift_8x8() with out_shift and clamped (16 vectors)
+//               with bounds spanning max(16, bd + 6) bits.
+//   bd        - only used to size the clamp bounds.
+//   out_shift - only used when do_cols is zero.
+// All butterflies update u[] in place, so statement order matters.
+// NOTE(review): "low8" presumably means callers guarantee in[8..15] are
+// zero. half_btf_0_sse4_1(&w, &x, &rnding, bit) and half_btf_sse4_1(&w0, &x0,
+// &w1, &x1, &rnding, bit) are defined outside this chunk; presumably
+// (w * x + rnding) >> bit and (w0 * x0 + w1 * x1 + rnding) >> bit — confirm.
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ // Rounding offset added before every ">> bit".
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ // Intermediate range: bd + 6 bits on the column pass, bd + 8 on the row
+ // pass, never fewer than 16 bits.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+ // stage 0
+ // stage 1
+ // The eight inputs land in the even slots of u[]; the odd slots are filled
+ // in by stages 2-4 below.
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ // Each pair is derived from a single source slot. The odd-numbered target
+ // is computed first because the second call overwrites the source.
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ // Same single-source pattern for slots 4..7.
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ // In-place butterflies on slots 8..15; inputs are passed by value, so
+ // overwriting them through the output pointers is safe.
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ // u[0] = u[1] = (u[0] * c32 + rnding) >> bit.
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ // x / y hold the new u[9] / u[10] until u[14] / u[13] have been computed
+ // from the old values.
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ // u[5] = ((u[6] - u[5]) * c32 + rnding) >> bit,
+ // u[6] = ((u[6] + u[5]) * c32 + rnding) >> bit, from shared products.
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ // Butterflies pairing u[k] with u[7 - k].
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ // (u[10], u[13]) and (u[11], u[12]) get the same c32 difference / sum
+ // treatment as (u[5], u[6]) in stage 5.
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ // Final butterflies pair u[k] with u[15 - k] and write out[k] / out[15 - k].
+ addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ // Row pass only: apply out_shift and clamp all 16 outputs with the
+ // narrower bd + 6 output bounds.
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+}
+
+// 16-point inverse ADST shortcut (per the function name) that reads only
+// in[0] and writes out[0..15].
+//
+// No intermediate clamping is done in this variant. Stages alternate between
+// copying already-computed values into new slots (3, 5, 7) and rotating
+// pairs of slots (2, 4, 6, 8). On the column pass (do_cols != 0) results are
+// written directly with every odd output negated; on the row pass the same
+// pairs go through neg_shift_sse4_1() with out_shift and bounds spanning
+// max(16, bd + 6) bits.
+// NOTE(review): "low1" presumably means callers guarantee in[1..15] are
+// zero; neg_shift_sse4_1() is defined outside this chunk and presumably
+// shifts, negates its second operand and clamps — confirm both.
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ // Rounding offset added before every ">> bit".
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+ // stage 0
+ // stage 1
+ // stage 2
+ // v[0] = (in[0] * c62 + rnding) >> bit
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ // v[1] = (-(in[0] * c2) + rnding) >> bit
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ // Rotate (v[8], v[9]) by c8 / c56. Both results are built in temporaries
+ // so the second product still sees the old v[8].
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ // Rotate (v[4], v[5]) and (v[12], v[13]) by c16 / c48, again via
+ // temporaries so both outputs use the pre-rotation inputs.
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ // For each pair (v[2k], v[2k + 1]) with k odd: scaled sum and difference
+ // using c32, computed from shared products y and x.
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ // Output permutation; every odd-indexed output is negated.
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ // Row pass: same (even, odd) pairs, routed through neg_shift_sse4_1()
+ // with the bd + 6 output bounds and out_shift.
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+/* 16-point inverse ADST over the four 32-bit lanes of each register, for the
+ * case where only in[0]..in[7] are used: in[8]..in[15] are never read.
+ * Stage 2 keeps just the in[0..7] terms of the stage-2 rotations found in
+ * iadst16x16_sse4_1, so the two routines agree when in[8..15] are zero.
+ * Stages 3-9 apply the same butterfly network, updating u[] in place.
+ *
+ * in         input registers; only in[0]..in[7] are read.
+ * out        receives the 16 result registers out[0]..out[15].
+ * bit        selects the cospi table and is the right shift applied (after
+ *            adding 1 << (bit - 1)) at the end of every multiply step.
+ * do_cols    nonzero: intermediate clamp width is max(16, bd + 6) bits and
+ *            the results are stored directly with odd outputs negated.
+ *            zero: intermediate clamp width is max(16, bd + 8) bits and the
+ *            results are produced by neg_shift_sse4_1 using out_shift and a
+ *            max(16, bd + 6)-bit clamp range.
+ * bd         only used to size the clamp ranges (presumably the sample bit
+ *            depth -- confirm with callers).
+ * out_shift  forwarded to neg_shift_sse4_1; unused when do_cols is nonzero.
+ *
+ * NOTE(review): addsub_sse4_1 and neg_shift_sse4_1 are defined outside this
+ * view. Presumably addsub writes the clamped sum and difference of its first
+ * two arguments, and neg_shift negates its second input, round-shifts both by
+ * out_shift and clamps -- confirm against their definitions.
+ */
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);  // presumably the cosine table for this precision
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);  // "m" prefix: negated constant
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // added before every >> bit
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // clamp width for the addsub stages
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i zero = _mm_setzero_si128();
+ __m128i u[16], x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2: each pair (u[2k], u[2k+1]) is built from a single input register
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);  // u[1] = -(in[0] * cospi2), then rounded and shifted
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);  // u[8]..u[15] use the odd inputs in[7], in[5], in[3], in[1]
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3: addsub between u[k] and u[k + 8], results written back in place
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4: rotate pairs (8,9) (10,11) (12,13) (14,15); u[0..7] are untouched
+ y = _mm_mullo_epi32(u[8], cospi56);  // products of the old u[8]/u[9] are taken before either is overwritten
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);  // u[9] still holds its pre-stage value here
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5: addsub between u[k] and u[k + 4] inside each half of u[]
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6: rotate pairs (4,5) (6,7) (12,13) (14,15) with cospi16 / cospi48
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7: addsub between u[k] and u[k + 2] inside each group of four
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8: pairs (2,3) (6,7) (10,11) (14,15) become (sum, difference) * cospi32
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9: permute u[] into out[]; every odd-indexed output is negated
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(zero, u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(zero, u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(zero, u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(zero, u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(zero, u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(zero, u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(zero, u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(zero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);  // output clamp width for this path
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ // Same source/destination pairing as the do_cols branch, two outputs per call.
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+/* 16-point inverse DCT over the four 32-bit lanes of each register.
+ * Reads in[0]..in[15] and writes out[0]..out[15]; intermediate values
+ * alternate between the local arrays u[] and v[] from one stage to the next.
+ *
+ * in         16 input registers.
+ * out        16 output registers.
+ * bit        selects the cospi table and is the right shift applied (after
+ *            adding 1 << (bit - 1)) in every multiply step.
+ * do_cols    nonzero: intermediate clamp width is max(16, bd + 6) bits and
+ *            the stage-7 results are left as they are.
+ *            zero: intermediate clamp width is max(16, bd + 8) bits, and the
+ *            results are additionally passed to round_shift_8x8 with
+ *            out_shift and clamped to max(16, bd + 6) bits.
+ * bd         only used to size the clamp ranges (presumably the sample bit
+ *            depth -- confirm with callers).
+ * out_shift  forwarded to round_shift_8x8; unused when do_cols is nonzero.
+ *
+ * NOTE(review): half_btf_sse4_1, addsub_sse4_1, round_shift_8x8 and
+ * highbd_clamp_epi32_sse4_1 are defined outside this view. Presumably
+ * half_btf(w0, a, w1, b, rnd, bit) yields (w0 * a + w1 * b + rnd) >> bit and
+ * addsub writes the clamped sum and difference of its first two arguments --
+ * confirm against their definitions.
+ */
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);  // presumably the cosine table for this precision
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);  // "m" prefix: negated constant
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // added before every >> bit
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // clamp width for the addsub stages
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1: load the inputs in bit-reversed index order (u[i] = in[bitrev4(i)])
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2: u[0..7] pass through; pairs (8,15) (9,14) (10,13) (11,12) are rotated
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3: v[0..3] pass through; rotate (4,7) (5,6); addsub on (8,9) (11,10) (12,13) (15,14)
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4: (0,1) -> (sum, difference) * cospi32; rotate (2,3) (9,14) (10,13); addsub on (4,5) (7,6)
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5: addsub on (0,3) (1,2) (8,11) (9,10) (15,12) (14,13); (5,6) scaled by cospi32
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);  // u[5] = (v[6] - v[5]) * cospi32, then rounded and shifted
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);  // u[6] = (v[6] + v[5]) * cospi32, then rounded and shifted
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6: addsub on (k, 7 - k) for k = 0..3; (10,13) and (11,12) scaled by cospi32
+ addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);  // (u[13] - u[10]) * cospi32
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);  // (u[10] + u[13]) * cospi32
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);  // (u[12] - u[11]) * cospi32
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);  // (u[11] + u[12]) * cospi32
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7: addsub on (k, 15 - k) for k = 0..7, written straight into out[]
+ addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);  // output clamp width for this path
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);  // NOTE(review): presumably covers all 16 registers despite the name -- confirm
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+ }
+ }
+}
+/* 16-point inverse ADST over the four 32-bit lanes of each register.
+ * Reads in[0]..in[15] and writes out[0]..out[15]. Multiply stages write v[]
+ * from u[] (or from in[] in stage 2); addsub stages write u[] from v[].
+ *
+ * in         16 input registers.
+ * out        16 output registers.
+ * bit        selects the cospi table and is the right shift applied (after
+ *            adding 1 << (bit - 1)) at the end of every multiply step.
+ * do_cols    nonzero: intermediate clamp width is max(16, bd + 6) bits and
+ *            the results are stored directly with odd outputs negated.
+ *            zero: intermediate clamp width is max(16, bd + 8) bits and the
+ *            results are produced by neg_shift_sse4_1 using out_shift and a
+ *            max(16, bd + 6)-bit clamp range.
+ * bd         only used to size the clamp ranges (presumably the sample bit
+ *            depth -- confirm with callers).
+ * out_shift  forwarded to neg_shift_sse4_1; unused when do_cols is nonzero.
+ *
+ * NOTE(review): addsub_sse4_1 and neg_shift_sse4_1 are defined outside this
+ * view. Presumably addsub writes the clamped sum and difference of its first
+ * two arguments, and neg_shift negates its second input, round-shifts both by
+ * out_shift and clamps -- confirm against their definitions.
+ */
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);  // presumably the cosine table for this precision
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);  // "m" prefix: negated constant
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // added before every >> bit
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // clamp width for the addsub stages
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u[16], v[16], x, y;
+ // Calculate the column 0, 1, 2, 3
+ // stage 0
+ // stage 1
+ // stage 2: pair (v[2k], v[2k+1]) rotates in[15 - 2k] with in[2k] for k = 0..3,
+ // and in[15 - 2k] with in[2k] again for k = 4..7 (i.e. in[7]/in[8] ... in[1]/in[14])
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3: addsub between v[k] and v[k + 8]
+ addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4: u[0..7] pass through; rotate pairs (8,9) (10,11) (12,13) (14,15)
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5: addsub between v[k] and v[k + 4] inside each half of v[]
+ addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6: u[0..3] and u[8..11] pass through; rotate (4,5) (6,7) (12,13) (14,15)
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7: addsub between v[k] and v[k + 2] inside each group of four
+ addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8: pairs (2,3) (6,7) (10,11) (14,15) become (sum, difference) * cospi32; the rest pass through
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9: permute v[] into out[]; every odd-indexed output is negated
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(zero, v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(zero, v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(zero, v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(zero, v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(zero, v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(zero, v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(zero, v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(zero, v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);  // output clamp width for this path
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ // Same source/destination pairing as the do_cols branch, two outputs per call.
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+// 16-point identity transform over 16 vectors of four 32-bit lanes.
+// Each lane is multiplied by 2 * NewSqrt2, a rounding term of
+// 1 << (NewSqrt2Bits - 1) is added and the product is shifted right by
+// NewSqrt2Bits. `bit` is not used. When do_cols is zero the output is also
+// passed through round_shift_8x8(out, out_shift) and clamped to the signed
+// AOMMAX(16, bd + 6)-bit range.
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+  (void)bit;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale = _mm_set1_epi32(2 * NewSqrt2);
+  // Rounding term in the low dword of each 64-bit lane, zero in the high one.
+  const __m128i round64 =
+      _mm_unpacklo_epi32(_mm_set1_epi32(1 << (NewSqrt2Bits - 1)), zero);
+
+  for (int row = 0; row < 16; ++row) {
+    const __m128i src = in[row];
+
+    // _mm_mul_epi32 multiplies lanes 0 and 2, giving two 64-bit products.
+    __m128i even = _mm_mul_epi32(src, scale);
+    even = _mm_srli_epi64(_mm_add_epi32(even, round64), NewSqrt2Bits);
+
+    // Move lanes 1 and 3 down into the even positions and repeat.
+    __m128i odd = _mm_mul_epi32(_mm_srli_si128(src, 4), scale);
+    odd = _mm_srli_epi64(_mm_add_epi32(odd, round64), NewSqrt2Bits);
+
+    // Gather the low dword of each of the four results back into lane order.
+    const __m128i pair01 = _mm_unpacklo_epi32(even, odd);
+    const __m128i pair23 = _mm_unpackhi_epi32(even, odd);
+    out[row] = _mm_unpacklo_epi64(pair01, pair23);
+  }
+
+  if (do_cols) return;
+
+  // Row pass only: apply the output shift, then clamp to the output range.
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lo_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i hi_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+  round_shift_8x8(out, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &lo_bound, &hi_bound, 16);
+}
+// Stage 8 of the 64-point inverse DCT, applied in place to u[].
+// Writes u[10..13], u[16..31], u[36..43] and u[52..59]; no other entry of u[]
+// is modified. Cosine constants, clamp bounds and the rounding term are all
+// passed by pointer; `bit` is forwarded to half_btf_sse4_1.
+static INLINE void idct64_stage8_sse4_1(
+ __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ // Pairs (u[10], u[13]) and (u[11], u[12]). The first result of each pair is
+ // parked in a temporary because the second call still reads the old value.
+ temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ // Add/sub on index pairs (16,23) (17,22) (18,21) (19,20) and
+ // (31,24) (30,25) (29,26) (28,27).
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+ clamp_hi);
+ }
+
+ // u[36..39] paired with u[59..56]. All four low-index results are computed
+ // into temporaries before any input is overwritten.
+ temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ // u[40..43] paired with u[55..52], same temporary-first ordering.
+ temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+}
+
+// Stage 9 of the 64-point inverse DCT, applied in place to u[].
+// Writes u[0..15], u[20..27] and u[32..63]; u[16..19] and u[28..31] are not
+// touched here.
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ // Add/sub of u[i] with its mirror u[15 - i] for the first 16 entries.
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
+
+ // u[20..23] paired with u[27..24]; the low-index results go through
+ // temporaries so the second group of calls still sees the original inputs.
+ temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ // i ^ 15 maps 32..39 onto 47..40, i.e. pairs (32,47) ... (39,40).
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
+
+ // i ^ 15 maps 48..55 onto 63..56; note the high index is the first operand
+ // here, the reverse of the loop above.
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
+// Stage 10 of the 64-point inverse DCT, applied in place to u[].
+// Writes u[0..31] and u[40..55]; u[32..39] and u[56..63] are not touched here.
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ __m128i temp1, temp2, temp3, temp4;
+ // Add/sub of u[i] with its mirror u[31 - i] for the first 32 entries.
+ for (int i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+ }
+
+ // u[40..43] paired with u[55..52]; low-index results are buffered in
+ // temporaries until the high-index results have been computed.
+ temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ // u[44..47] paired with u[51..48], same ordering as the group above.
+ temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
+}
+
+// Stage 11 (last stage) of the 64-point inverse DCT.
+// Combines u[k] with u[63 - k] through addsub_sse4_1 and stores the two
+// results in out[k] and out[63 - k]. When do_cols is zero, every group of
+// four output vectors is then round-shifted by out_shift and clamped to the
+// signed AOMMAX(16, bd + 6)-bit range.
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+                                         int bd, int out_shift,
+                                         const __m128i *clamp_lo,
+                                         const __m128i *clamp_hi) {
+  // Walk inwards from both ends: (0,63), (1,62), ..., (31,32).
+  for (int front = 0, back = 63; front < back; ++front, --back) {
+    addsub_sse4_1(u[front], u[back], &out[front], &out[back], clamp_lo,
+                  clamp_hi);
+  }
+
+  if (do_cols) return;
+
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lo_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i hi_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+
+  for (int base = 0; base < 64; base += 4) {
+    __m128i *group = out + base;
+    round_shift_4x4(group, out_shift);
+    highbd_clamp_epi32_sse4_1(group, group, &lo_bound, &hi_bound, 4);
+  }
+}
+
+// 64-point inverse DCT variant that reads only in[0].
+// The single value half_btf_0_sse4_1(cospi32, in[0]) is computed, optionally
+// round-shifted by out_shift (only when do_cols is zero and out_shift is
+// non-zero), clamped, and then written to all 64 output vectors.
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+  // Stages 1-6 reduce to one scaling of the first input vector.
+  __m128i dc = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+  // Stages 8-11: output rounding applies to the row pass only.
+  if (!do_cols && out_shift != 0) {
+    const __m128i half = _mm_set1_epi32((1 << out_shift) >> 1);
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, half), _mm_cvtsi32_si128(out_shift));
+  }
+
+  // The clamp that is finally applied is AOMMAX(16, bd + 6) bits wide on both
+  // paths: the column pass starts with that width, and the row pass replaces
+  // its wider intermediate bounds with it before clamping.
+  const int range_bits = AOMMAX(16, bd + 6);
+  const __m128i lo_bound = _mm_set1_epi32(-(1 << (range_bits - 1)));
+  const __m128i hi_bound = _mm_set1_epi32((1 << (range_bits - 1)) - 1);
+  dc = _mm_min_epi32(_mm_max_epi32(dc, lo_bound), hi_bound);
+
+  // Every output vector receives the same value.
+  for (int k = 0; k < 64; ++k) {
+    out[k] = dc;
+  }
+}
+
+// 64-point inverse DCT variant that reads only in[0]..in[7]; no other input
+// vector is referenced. NOTE(review): presumably chosen by the caller when the
+// remaining coefficients are zero - confirm at the call site.
+// Stages 1-7 and the copies that open stage 8 are written out inline; the rest
+// of stage 8 and stages 9-11 go through the idct64_stage{8,9,10,11}_sse4_1
+// helpers. `bit` selects the cosine table via cospi_arr(bit) and sets the
+// rounding term 1 << (bit - 1). The clamp bounds handed to the add/sub helpers
+// are +/- 2^(log_range - 1) with log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)).
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ // Broadcast cosine constants; a "cospimN" name holds -cospi[N].
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+ {
+ // Working buffer; entries are filled in progressively by the stages below.
+ __m128i u[64];
+
+ // stage 1
+ // Scatter the eight inputs to the slots the later stages read from.
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ // Each pair of lines derives two outputs from one input; the input slot is
+ // overwritten last because the first line still reads it.
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ // Two-input rotations: the first result is held in a temporary until the
+ // second result, which reads the same two inputs, has been computed.
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ // u[0] and u[1] are both produced from identical arguments.
+ temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ // Add/sub inside u[32..47] (i = 32) and u[48..63] (i = 48): for each j the
+ // pairs are (j, j ^ 7) and (j ^ 15, j ^ 8), e.g. (32,39) and (47,40).
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ // u[4..7] are filled by mirroring u[3..0] before the shared helper runs.
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ // Writes all 64 entries of out[].
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+// 64-point inverse DCT variant that reads only in[0]..in[15]; no other input
+// vector is referenced. NOTE(review): presumably chosen by the caller when the
+// remaining coefficients are zero - confirm at the call site.
+// Stages 1-7 and the first add/sub loop of stage 8 are written out inline; the
+// rest of stage 8 and stages 9-11 go through the
+// idct64_stage{8,9,10,11}_sse4_1 helpers. `bit` selects the cosine table via
+// cospi_arr(bit) and sets the rounding term 1 << (bit - 1). The clamp bounds
+// handed to the add/sub helpers are +/- 2^(log_range - 1) with
+// log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)).
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ // Broadcast cosine constants taken directly from the table.
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ // Negated constants: a "cospimN" name holds -cospi[N].
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ // Working buffer; entries are filled in progressively by the stages below.
+ __m128i u[64];
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ // Scatter the sixteen inputs to the slots the later stages read from.
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ // Each pair of lines derives two outputs from one input; the input slot is
+ // overwritten last because the first line still reads it.
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ // Two-input rotations in groups of four pairs: the low-index results are
+ // held in tmp1..tmp4 until the high-index results, which read the same
+ // inputs, have been computed.
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ // Add/sub within each group of eight in u[32..63]: (0,3) (1,2) and
+ // (7,4) (6,5) relative to the group base i.
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ // u[0] and u[1] are both produced from identical arguments.
+ tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ // Same add/sub pattern as in stage 5, now over the two groups in u[16..31].
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ // Add/sub inside u[32..47] (i = 32) and u[48..63] (i = 48): for each j the
+ // pairs are (j, j ^ 7) and (j ^ 15, j ^ 8), e.g. (32,39) and (47,40).
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ // Add/sub of u[i] with its mirror u[7 - i] before the shared helper runs.
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ // Writes all 64 entries of out[].
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {  // idct64 pass on vectors of four 32-bit lanes: reads in[0..31], writes out[0..63]
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);  // weight table selected by `bit`
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // 2^(bit-1), handed to every half_btf call
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // intermediate width: bd+6 (do_cols) or bd+8, never below 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));  // -2^(log_range-1) in every lane
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);  // 2^(log_range-1) - 1 in every lane
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);  // cospiN: table entry N broadcast to all four lanes
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);  // cospimN: negated table entry N broadcast to all four lanes
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64], v[64];  // ping-pong buffers: each stage reads one array and writes the other
+
+ // stage 1: place each input in the even slot where a later stage first
+ // reads it; u[32..62] feed stage 2
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];  // v[16..30] are first read in stage 3
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];  // u[8..14] are first read in stage 4
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];  // v[4] and v[6] are first read in stage 5
+ v[6] = in[24];
+
+ u[0] = in[0];  // u[0] and u[2] are first read in stage 6
+ u[2] = in[16];
+
+ // stage 2: v[32..63], each from one weight and one loaded input
+ v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3: u[16..31] from the stage-1 loads in v[16..30]; u[32..63] from
+ // addsub on adjacent pairs of v[32..63]
+ u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {  // addsub_sse4_1 is defined outside this chunk; presumably clamped sum/difference - confirm
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4: v[8..15] from the stage-1 loads in u[8..14]; addsub on
+ // u[16..31]; two-operand half_btf on the inner pairs of u[32..63]
+ v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {  // outer elements of each group of four are copied; i+1 and i+2 are computed below
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5: u[4..7] from the stage-1 loads in v[4] and v[6]; addsub on
+ // v[8..15]; half_btf on inner pairs of v[16..31]; addsub on v[32..63]
+ u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {  // outer elements copied; i+1 and i+2 are computed below
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {  // within each group of eight: pairs (0,3), (1,2), (7,4), (6,5)
+ addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6: v[0..3] from the stage-1 loads in u[0] and u[2]; v[0] and v[1]
+ // are the same expression, so they hold the same value
+ v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {  // 8, 11, 12, 15 copied; 9, 10, 13, 14 computed below
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {  // offsets 0, 1, 6, 7 of each group of eight copied; 2..5 computed below
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7: v -> u
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {  // offsets 0, 1, 6, 7 copied; 18..21 and 26..29 computed below
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {  // per group of 16: j pairs with j ^ 7, and j ^ 15 pairs with j ^ 8
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8: u -> v
+ for (i = 0; i < 4; ++i) {  // u[i] pairs with u[7 - i]
+ addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {  // copies 32..35, 44..47, 48..51, 60..63; 36..43 and 52..59 computed below
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9: v -> u
+ for (i = 0; i < 8; ++i) {  // v[i] pairs with v[15 - i]
+ addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {  // copies 16..19 and 28..31; 20..27 computed below
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {  // pairs (32,47), (33,46), ..., (39,40)
+ addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {  // pairs (63,48), (62,49), ..., (56,55); the upper index is the first operand
+ addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10: u -> v
+ for (i = 0; i < 16; i++) {  // u[i] pairs with u[31 - i]
+ addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];  // 32..39 copied; 40..55 computed below
+
+ v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];  // 56..63 copied
+
+ // stage 11: v[i] pairs with v[63 - i]; results go straight to out[]
+ for (i = 0; i < 32; i++) {
+ addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+ &clamp_hi);
+ }
+
+ if (!do_cols) {  // only when do_cols == 0: post-process out[] with out_shift and a second clamp
+ const int log_range_out = AOMMAX(16, bd + 6);  // output width: bd+6 bits, never below 16
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ for (i = 0; i < 64; i += 4) {  // four vectors per iteration
+ round_shift_4x4(out + i, out_shift);
+ highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
+ &clamp_hi_out, 4);
+ }
+ }
+ }
+}
+
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  // Single-coefficient path: only in[0] is read, and all 32 output vectors
+  // receive the same value.
+  const int32_t *const weights = cospi_arr(bit);
+  const __m128i w32 = _mm_set1_epi32(weights[32]);
+  const __m128i round_bias = _mm_set1_epi32(1 << (bit - 1));
+  // Both passes finish by clamping to this width: with do_cols set the
+  // working range is already bd + 6, and otherwise the bounds are switched
+  // to bd + 6 before the final clamp.
+  const int final_bits = AOMMAX(16, bd + 6);
+  const __m128i lower = _mm_set1_epi32(-(1 << (final_bits - 1)));
+  const __m128i upper = _mm_set1_epi32((1 << (final_bits - 1)) - 1);
+  __m128i dc = in[0];
+  int k;
+
+  // stages 1-5: the only arithmetic is scaling in[0] by cospi[32].
+  dc = half_btf_0_sse4_1(&w32, &dc, &round_bias, bit);
+
+  // stages 6-9: with do_cols == 0, apply the rounded arithmetic right shift.
+  if (!do_cols && out_shift != 0) {
+    const __m128i half = _mm_set1_epi32((1 << out_shift) >> 1);
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, half), _mm_cvtsi32_si128(out_shift));
+  }
+
+  // Clamping twice with the same bounds equals clamping once, so a single
+  // clamp covers both passes.
+  dc = _mm_min_epi32(_mm_max_epi32(dc, lower), upper);
+
+  for (k = 0; k < 32; ++k) out[k] = dc;
+}
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {  // idct32 variant that reads only in[0..7]; results are written through `out` by the stage-9 helper
+ const int32_t *cospi = cospi_arr(bit);  // weight table selected by `bit`
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);  // cospiN / cospimN: table entry N (or its negation) in all four lanes
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));  // 2^(bit-1), handed to every half_btf call
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // intermediate width: bd+6 (do_cols) or bd+8, never below 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];  // working buffer, updated in place from stage to stage
+
+ // stage 0
+ // stage 1: place the eight inputs in every fourth slot of bf1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2: each loaded slot in 16..28 yields two outputs; the one stored
+ // elsewhere is computed first because the next line overwrites the source
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3: same two-outputs-per-source pattern for slots 8 and 12
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];  // plain copies here; idct32x32_low16_sse4_1 uses addsub at this point (presumably the partner term is zero in this variant - confirm)
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4: bf1[7] is computed before bf1[4] is overwritten
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];  // copies again take the place of the addsub calls used in idct32x32_low16_sse4_1
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5: bf1[0] scaled by cospi32 and duplicated into bf1[1]
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6: bf1[2] and bf1[3] are copies of bf1[1] and bf1[0]
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7: shared helper, defined outside this chunk
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8: shared helper, defined outside this chunk
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9: the helper receives `out` together with do_cols, bd and out_shift
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {  // idct32 variant that reads only in[0..15]; results are written through `out` by the stage-9 helper
+ const int32_t *cospi = cospi_arr(bit);  // weight table selected by `bit`
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);  // cospiN / cospimN: table entry N (or its negation) in all four lanes
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));  // 2^(bit-1), handed to every half_btf call
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // intermediate width: bd+6 (do_cols) or bd+8, never below 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];  // working buffer, updated in place from stage to stage
+
+ // stage 0
+ // stage 1: place the sixteen inputs in the even slots of bf1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2: each even slot in 16..30 yields an odd-slot output and then its
+ // own replacement; the odd slot is written first because the second line
+ // of each pair overwrites the source
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3: same two-outputs-per-source pattern for the even slots 8..14,
+ // then in-place addsub on adjacent pairs of bf1[16..31]
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);  // operands are passed by value, so writing back to the same slots is safe
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4: slots 4 and 6 each yield two outputs, then in-place addsub on
+ // adjacent pairs of bf1[8..15]
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5: bf1[0] scaled and duplicated into bf1[1]; bf1[2] yields bf1[3]
+ // before it is overwritten
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6: in-place addsub on (0,3) and (1,2), then the shared helper
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7: shared helper, defined outside this chunk
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8: shared helper, defined outside this chunk
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+ // stage 9: the helper receives `out` together with do_cols, bd and out_shift
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+}
+
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {  // 32-point inverse DCT kernel: reads in[0..31], writes out[0..31]
+ const int32_t *cospi = cospi_arr(bit);  // cosine table looked up by 'bit'
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);  // cospiN = cospi[N] broadcast to all four lanes
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);  // cospimN = -cospi[N] broadcast
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));  // half of 1 << bit, handed to every half_btf call
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));  // intermediate width: bd + 6 for columns, bd + 8 for rows, at least 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));  // lowest signed value of log_range bits
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);  // highest signed value of log_range bits
+ __m128i bf1[32], bf0[32];  // two work buffers; successive stages alternate which one is read and which is written
+
+ // stage 0
+ // stage 1: copy the inputs into bf1 in 5-bit bit-reversed index order
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2: 0..15 pass through; 16..31 are half_btf results on mirrored pairs (16,31) .. (23,24)
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);  // NOTE(review): presumably (w0 * a + w1 * b + rounding) >> bit; helper is defined outside this block
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3: 0..7 pass through; half_btf on pairs (8,15) .. (11,12); addsub on neighbouring pairs of 16..31
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);  // NOTE(review): presumably stores clamped (a + b) and (a - b) through the two pointers; helper is defined outside this block
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4: 0..3 pass through; half_btf on (4,7),(5,6); addsub on neighbouring pairs of 8..15; half_btf on (17,30),(18,29),(21,26),(22,25)
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5: half_btf on (0,1),(2,3),(9,14),(10,13); addsub on 4..7 and on groups of four within 16..31
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6: addsub on (0,3),(1,2) and on groups of four within 8..15; half_btf on (5,6),(18,29),(19,28),(20,27),(21,26)
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7: addsub on mirrored pairs of 0..7, 16..23 and 24..31; half_btf on (10,13),(11,12)
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8: addsub on mirrored pairs (k, 15 - k) for k = 0..7; half_btf on (20,27),(21,26),(22,25),(23,24)
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9: final addsub on mirrored pairs (k, 31 - k), written directly to out
+ addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+
+ if (!do_cols) {  // row pass only: apply out_shift, then clamp all 32 outputs to the narrower max(16, bd + 6)-bit range
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+ round_shift_8x8(out, out_shift);  // called twice, at out and out + 16; NOTE(review): presumably 16 vectors per call - confirm against the helper
+ round_shift_8x8(out + 16, out_shift);
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+ }
+}
+
+// High-bitdepth 8x8 inverse transform + add entry point.
+//
+// IDTX and the six H_* / V_* transform types are forwarded to the generic
+// av1_highbd_inv_txfm2d_add_universe_sse4_1() path; every other type is
+// handled by the dedicated av1_inv_txfm2d_add_8x8_sse4_1() kernel.
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const int bd = txfm_param->bd;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int32_t *src = cast_to_int32(input);
+
+  const int use_universe_path =
+      tx_type == IDTX || tx_type == H_DCT || tx_type == H_ADST ||
+      tx_type == H_FLIPADST || tx_type == V_DCT || tx_type == V_ADST ||
+      tx_type == V_FLIPADST;
+
+  if (use_universe_path) {
+    av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+                                              txfm_param->tx_size,
+                                              txfm_param->eob, bd);
+    return;
+  }
+
+  av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                                bd);
+}
+// High-bitdepth 4x4 inverse transform + add entry point.
+//
+// Lossless blocks (which must be DCT_DCT) are handed to
+// av1_highbd_iwht4x4_add(); all other blocks go through the SSE4.1 4x4
+// kernel av1_inv_txfm2d_add_4x4_sse4_1().
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  const int32_t *src = cast_to_int32(input);
+
+  if (!txfm_param->lossless) {
+    av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, bd);
+  } else {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, txfm_param->eob, bd);
+  }
+}
+// 32-point identity kernel: every one of the 32 input vectors is shifted
+// left by 2 (i.e. multiplied by 4) and stored to the same index of out.
+// 'bit' is unused. On the row pass (!do_cols) the result is additionally
+// round-shifted by out_shift and clamped to a signed max(16, bd + 6)-bit
+// range.
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                               int bd, int out_shift) {
+  (void)bit;
+
+  for (int k = 0; k < 32; ++k) {
+    out[k] = _mm_slli_epi32(in[k], 2);
+  }
+
+  if (do_cols) return;
+
+  // Row pass: bring the values back into the output range.
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const int half_range = 1 << (log_range_out - 1);
+  const __m128i clamp_lo_out = _mm_set1_epi32(-half_range);
+  const __m128i clamp_hi_out = _mm_set1_epi32(half_range - 1);
+  round_shift_8x8(out, out_shift);
+  round_shift_8x8(out + 16, out_shift);
+  highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+}
+static const transform_1d_sse4_1
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {  // 1-D kernel lookup: [size index][1-D transform type][variant]; NULL = no kernel registered
+ {  // size index 0; within each size the rows hold the dct, adst and identity kernels, in that order
+ { idct4x4_sse4_1, NULL, NULL, NULL },
+ { iadst4x4_sse4_1, NULL, NULL, NULL },
+ { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },  // size index 1
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
+ {  // size index 2; NOTE(review): the _lowN variants presumably assume only the first N inputs are nonzero - confirm
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,  // size index 3: no adst kernels
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { iidentity32_sse4_1, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,  // size index 4: dct kernels only
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+// 2-D inverse transform + add in which the row kernel is always variant 0 of
+// the lookup table and the column kernel variant is chosen from eoby.
+//
+// Flow: for each group of four rows (up to the eoby-derived limit) load the
+// coefficients, optionally apply the 2:1 rectangular scaling, run the row
+// kernel, transpose 4x4 tiles into the column buffer; then run the column
+// kernel plus the final round shift on each 4-wide column strip and write
+// the result out in 8-wide strips (honouring ud_flip).
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  int eobx, eoby;
+  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int row_len = AOMMIN(32, tx_w);
+  const int col_strips = row_len >> 2;
+  const int row_groups = ((eoby + 8) >> 3) << 1;
+  const int input_stride = AOMMIN(32, tx_h);
+  const int rect_type = get_rect_tx_log_ratio(tx_w, tx_h);
+  const int scale_rect = (rect_type == 1) || (rect_type == -1);
+
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]]
+                                     [lowbd_txfm_all_1d_zeros_idx[eoby]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  __m128i col_buf[64];
+
+  // Row pass, four rows at a time.
+  for (int r = 0; r < row_groups; ++r) {
+    __m128i row_buf[16];
+    load_buffer_32bit_input(input + r * 4, input_stride, row_buf, row_len);
+    if (scale_rect) {
+      av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, row_len, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -shift[0]);
+
+    // Transpose each 4x4 tile in place, then scatter it to the column buffer.
+    for (int c = 0; c < col_strips; ++c) {
+      __m128i *tile = row_buf + c * 4;
+      __m128i *dst = col_buf + r * 4 + c * tx_h;
+      TRANSPOSE_4X4(tile[0], tile[1], tile[2], tile[3], tile[0], tile[1],
+                    tile[2], tile[3]);
+      dst[0] = tile[0];
+      dst[1] = tile[1];
+      dst[2] = tile[2];
+      dst[3] = tile[3];
+    }
+  }
+
+  // Column pass followed by the second-stage round shift.
+  for (int c = 0; c < col_strips; ++c) {
+    __m128i *strip = col_buf + c * tx_h;
+    col_txfm(strip, strip, INV_COS_BIT, 1, bd, 0);
+    av1_round_shift_array_32_sse4_1(strip, strip, tx_h, -shift[1]);
+  }
+
+  // Write the result out in 8-pixel-wide strips.
+  for (int s = 0; s < (tx_w >> 3); ++s) {
+    highbd_write_buffer_8xn_sse4_1(col_buf + s * tx_h * 2, output + 8 * s,
+                                   stride, ud_flip, tx_h, bd);
+  }
+}
+// 2-D inverse transform + add in which the column kernel is always variant 0
+// of the lookup table and the row kernel variant is chosen from eobx.
+//
+// Flow: for every group of four rows load only the leading coefficients
+// ((eobx + 8) rounded down to a multiple of 8), optionally apply the 2:1
+// rectangular scaling, run the row kernel and transpose 4x4 tiles into the
+// column buffer (mirrored left-to-right when lr_flip is set); then run the
+// column kernel plus round shift per strip and write out 8-wide strips.
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int col_strips = AOMMIN(32, tx_w) >> 2;
+  const int row_max = AOMMIN(32, tx_h);
+  const int input_stride = row_max;
+  const int nonzero_w = ((eobx + 8) >> 3) << 3;
+  const int rect_type = get_rect_tx_log_ratio(tx_w, tx_h);
+  const int scale_rect = (rect_type == 1) || (rect_type == -1);
+
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]]
+                                     [lowbd_txfm_all_1d_zeros_idx[eobx]];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][0];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  __m128i col_buf[64];
+
+  // Row pass, four rows at a time.
+  for (int r = 0; r < (row_max >> 2); ++r) {
+    __m128i row_buf[16];
+    load_buffer_32bit_input(input + r * 4, input_stride, row_buf, nonzero_w);
+    if (scale_rect) {
+      av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, nonzero_w, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -shift[0]);
+
+    __m128i *dst_base = col_buf + r * 4;
+    for (int c = 0; c < col_strips; ++c) {
+      const __m128i *tile = row_buf + 4 * c;
+      if (lr_flip) {
+        // Mirror: reverse the vectors inside the tile and the strip order.
+        __m128i *dst = dst_base + tx_h * (col_strips - 1 - c);
+        TRANSPOSE_4X4(tile[3], tile[2], tile[1], tile[0], dst[0], dst[1],
+                      dst[2], dst[3]);
+      } else {
+        __m128i *dst = dst_base + c * tx_h;
+        TRANSPOSE_4X4(tile[0], tile[1], tile[2], tile[3], dst[0], dst[1],
+                      dst[2], dst[3]);
+      }
+    }
+  }
+
+  // Column pass followed by the second-stage round shift.
+  for (int c = 0; c < col_strips; ++c) {
+    __m128i *strip = col_buf + c * tx_h;
+    col_txfm(strip, strip, INV_COS_BIT, 1, bd, 0);
+    av1_round_shift_array_32_sse4_1(strip, strip, tx_h, -shift[1]);
+  }
+
+  // Write the result out in 8-pixel-wide strips.
+  for (int s = 0; s < (tx_w >> 3); ++s) {
+    highbd_write_buffer_8xn_sse4_1(col_buf + s * tx_h * 2, output + 8 * s,
+                                   stride, ud_flip, tx_h, bd);
+  }
+}
+// 2-D inverse transform + add that uses variant 0 of the lookup table for
+// both the row and the column kernel. 'eob' is ignored: every group of four
+// rows (up to 32 rows) is processed, and the output is never flipped.
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+                                              uint16_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob, const int bd) {
+  (void)eob;
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int row_max = AOMMIN(32, tx_h);
+  const int input_stride = row_max;
+  const int row_len = AOMMIN(32, tx_w);
+  const int col_strips = row_len >> 2;
+  const int rect_type = get_rect_tx_log_ratio(tx_w, tx_h);
+  const int scale_rect = (rect_type == 1) || (rect_type == -1);
+
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][0];
+
+  __m128i col_buf[64 * 4];
+
+  // Row pass, four rows at a time.
+  for (int r = 0; r < (row_max >> 2); ++r) {
+    __m128i row_buf[32];
+    load_buffer_32bit_input(input + r * 4, input_stride, row_buf, row_len);
+    if (scale_rect) {
+      av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, row_len, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -shift[0]);
+
+    // Transpose each 4x4 tile in place, then scatter it to the column buffer.
+    for (int c = 0; c < col_strips; ++c) {
+      __m128i *tile = row_buf + c * 4;
+      __m128i *dst = col_buf + r * 4 + c * tx_h;
+      TRANSPOSE_4X4(tile[0], tile[1], tile[2], tile[3], tile[0], tile[1],
+                    tile[2], tile[3]);
+      dst[0] = tile[0];
+      dst[1] = tile[1];
+      dst[2] = tile[2];
+      dst[3] = tile[3];
+    }
+  }
+
+  // Column pass followed by the second-stage round shift.
+  for (int c = 0; c < col_strips; ++c) {
+    __m128i *strip = col_buf + c * tx_h;
+    col_txfm(strip, strip, INV_COS_BIT, 1, bd, 0);
+    av1_round_shift_array_32_sse4_1(strip, strip, tx_h, -shift[1]);
+  }
+
+  // Write the result out in 8-pixel-wide strips (no vertical flip).
+  for (int s = 0; s < (tx_w >> 3); ++s) {
+    highbd_write_buffer_8xn_sse4_1(col_buf + s * tx_h * 2, output + 8 * s,
+                                   stride, 0, tx_h, bd);
+  }
+}
+// General 2-D inverse transform + add: both the row and the column kernel
+// variants are selected from the end-of-block position (eobx / eoby), and
+// both must be present in the lookup table (asserted).
+//
+// Flow: row pass over the eoby-derived number of four-row groups, loading
+// only the eobx-derived leading coefficients; 4x4 tile transposes into the
+// column buffer (mirrored when lr_flip is set); column pass plus round
+// shift per 4-wide strip; write-out in 8-wide strips honouring ud_flip.
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int col_strips = tx_w >> 2;
+  const int nonzero_w = ((eobx + 8) >> 3) << 3;
+  const int row_groups = ((eoby + 8) >> 3) << 1;
+  const int input_stride = AOMMIN(32, tx_h);
+  const int rect_type = get_rect_tx_log_ratio(tx_w, tx_h);
+  const int scale_rect = (rect_type == 1) || (rect_type == -1);
+
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]]
+                                     [lowbd_txfm_all_1d_zeros_idx[eobx]];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]]
+                                     [lowbd_txfm_all_1d_zeros_idx[eoby]];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  __m128i col_buf[64 * 16];
+
+  // 1st stage: row transform, four rows at a time.
+  for (int r = 0; r < row_groups; ++r) {
+    __m128i row_buf[64];
+    load_buffer_32bit_input(input + r * 4, input_stride, row_buf, nonzero_w);
+    if (scale_rect) {
+      av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, nonzero_w, 0,
+                                           NewInvSqrt2);
+    }
+    row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -shift[0]);
+
+    __m128i *dst_base = col_buf + r * 4;
+    for (int c = 0; c < col_strips; ++c) {
+      const __m128i *tile = row_buf + 4 * c;
+      if (lr_flip) {
+        // Mirror: reverse the vectors inside the tile and the strip order.
+        __m128i *dst = dst_base + tx_h * (col_strips - 1 - c);
+        TRANSPOSE_4X4(tile[3], tile[2], tile[1], tile[0], dst[0], dst[1],
+                      dst[2], dst[3]);
+      } else {
+        __m128i *dst = dst_base + c * tx_h;
+        TRANSPOSE_4X4(tile[0], tile[1], tile[2], tile[3], dst[0], dst[1],
+                      dst[2], dst[3]);
+      }
+    }
+  }
+
+  // 2nd stage: column transform followed by the final round shift.
+  for (int c = 0; c < col_strips; ++c) {
+    __m128i *strip = col_buf + c * tx_h;
+    col_txfm(strip, strip, INV_COS_BIT, 1, bd, 0);
+    av1_round_shift_array_32_sse4_1(strip, strip, tx_h, -shift[1]);
+  }
+
+  // Write the result out in 8-pixel-wide strips.
+  for (int s = 0; s < (tx_w >> 3); ++s) {
+    highbd_write_buffer_8xn_sse4_1(col_buf + s * tx_h * 2, output + 8 * s,
+                                   stride, ud_flip, tx_h, bd);
+  }
+}
+
+// 2-D inverse transform + add specialised for the size handled by the
+// 4-wide write-out path. 'eob' is ignored. The row kernel is variant 0 and
+// the column kernel is variant 1 of the lookup table; both must be non-NULL.
+// The rectangular NewInvSqrt2 scaling is applied unconditionally.
+static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
+                                            uint16_t *output, int stride,
+                                            TX_TYPE tx_type, TX_SIZE tx_size,
+                                            int eob, const int bd) {
+  (void)eob;
+
+  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, tx_h);
+
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][1];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: row transform on two groups of four rows.
+  __m128i row_buf[8];
+  __m128i col_buf[8];
+  load_buffer_32bit_input(input, input_stride, row_buf, tx_w);
+  load_buffer_32bit_input(input + 4, input_stride, row_buf + 4, tx_w);
+  av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, tx_h, 0, NewInvSqrt2);
+  row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -shift[0]);
+  row_txfm(row_buf + 4, row_buf + 4, INV_COS_BIT, 0, bd, -shift[0]);
+
+  // Transpose each 4x4 half; lr_flip reverses the vector order inside it.
+  for (int half = 0; half < 2; ++half) {
+    const __m128i *tile = row_buf + 4 * half;
+    __m128i *dst = col_buf + 4 * half;
+    if (lr_flip) {
+      TRANSPOSE_4X4(tile[3], tile[2], tile[1], tile[0], dst[0], dst[1], dst[2],
+                    dst[3]);
+    } else {
+      TRANSPOSE_4X4(tile[0], tile[1], tile[2], tile[3], dst[0], dst[1], dst[2],
+                    dst[3]);
+    }
+  }
+
+  // 2nd stage: column transform followed by the final round shift.
+  col_txfm(col_buf, col_buf, INV_COS_BIT, 1, bd, 0);
+  av1_round_shift_array_32_sse4_1(col_buf, col_buf, tx_h, -shift[1]);
+
+  // Write the 4-pixel-wide result.
+  highbd_write_buffer_4xn_sse4_1(col_buf, output, stride, ud_flip, tx_h, bd);
+}
+
+// Inverse 2-D transform and reconstruction for the block size dispatched as
+// TX_8X4 by av1_highbd_inv_txfm_add_sse4_1().
+//
+// The coefficients are loaded with a stride of 4, scaled with NewInvSqrt2 and
+// run through the row kernel. The result is optionally mirrored with
+// flip_buf_sse2(), then two sub-blocks are transposed in place and run through
+// the column kernel before the final rounding shift and the write through
+// highbd_write_buffer_8xn_sse4_1(). `eob` is unused.
+static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
+                                            uint16_t *output, int stride,
+                                            TX_TYPE tx_type, TX_SIZE tx_size,
+                                            int eob, const int bd) {
+  (void)eob;
+
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int8_t *const stage_shift = av1_inv_txfm_shift_ls[tx_size];
+
+  // 1-D kernels selected by transform size and type.
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][1];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][0];
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // 1st stage: row transform.
+  __m128i row_buf[8];
+  load_buffer_32bit_input(input, 4, row_buf, width);
+  av1_round_shift_rect_array_32_sse4_1(row_buf, row_buf, width, 0,
+                                       NewInvSqrt2);
+  row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -stage_shift[0]);
+
+  // Continue in place unless a left/right flip needs the mirrored copy.
+  __m128i mirrored[8];
+  __m128i *work = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, mirrored, width);
+    work = mirrored;
+  }
+
+  // 2nd stage: column transform on each transposed sub-block.
+  for (int blk = 0; blk < 2; ++blk) {
+    __m128i *const cur = work + blk * height;
+    transpose_32bit_4x4(cur, cur);
+    col_txfm(cur, cur, INV_COS_BIT, 1, bd, 0);
+  }
+  av1_round_shift_array_32_sse4_1(work, work, width, -stage_shift[1]);
+
+  // Write the result to the destination buffer.
+  highbd_write_buffer_8xn_sse4_1(work, output, stride, flip_ud, height, bd);
+}
+
+// Inverse 2-D transform and reconstruction for the block size dispatched as
+// TX_4X16 by av1_highbd_inv_txfm_add_sse4_1().
+//
+// The input is processed in groups of four registers (rows / 4 groups): each
+// group is loaded and run through the row kernel, then transposed (with the
+// registers fed in reverse order for a left/right flip). The whole buffer then
+// goes through the column kernel, the final rounding shift and
+// highbd_write_buffer_4xn_sse4_1(). `eob` is unused.
+static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
+                                              uint16_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob, const int bd) {
+  (void)eob;
+
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int num_groups = height >> 2;  // groups of four registers
+  const int coeff_stride = AOMMIN(32, height);
+  const int8_t *const stage_shift = av1_inv_txfm_shift_ls[tx_size];
+
+  // 1-D kernels selected by transform size and type.
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][0];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][2];
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // 1st stage: row transform, one group of four registers at a time.
+  __m128i row_buf[16];
+  for (int g = 0; g < num_groups; ++g) {
+    __m128i *const group = row_buf + 4 * g;
+    load_buffer_32bit_input(input + 4 * g, coeff_stride, group, width);
+    row_txfm(group, group, INV_COS_BIT, 0, bd, -stage_shift[0]);
+  }
+
+  // Transpose each 4x4 group; a left/right flip reverses the register order
+  // going into the transpose.
+  __m128i col_buf[16];
+  for (int g = 0; g < num_groups; ++g) {
+    const int base = 4 * g;
+    if (flip_lr) {
+      TRANSPOSE_4X4(row_buf[base + 3], row_buf[base + 2], row_buf[base + 1],
+                    row_buf[base], col_buf[base], col_buf[base + 1],
+                    col_buf[base + 2], col_buf[base + 3]);
+    } else {
+      TRANSPOSE_4X4(row_buf[base], row_buf[base + 1], row_buf[base + 2],
+                    row_buf[base + 3], col_buf[base], col_buf[base + 1],
+                    col_buf[base + 2], col_buf[base + 3]);
+    }
+  }
+
+  // 2nd stage: column transform, then the final rounding shift.
+  col_txfm(col_buf, col_buf, INV_COS_BIT, 1, bd, 0);
+  av1_round_shift_array_32_sse4_1(col_buf, col_buf, height, -stage_shift[1]);
+
+  // Write the result to the destination buffer.
+  highbd_write_buffer_4xn_sse4_1(col_buf, output, stride, flip_ud, height, bd);
+}
+
+// Inverse 2-D transform and reconstruction for the block size dispatched as
+// TX_16X4 by av1_highbd_inv_txfm_add_sse4_1().
+//
+// The coefficients are loaded with a stride of 4 and run through the row
+// kernel. The result is optionally mirrored with flip_buf_sse2(), then
+// (cols / 4) sub-blocks are transposed in place and run through the column
+// kernel. After the final rounding shift the output is written in 8-wide
+// strips by highbd_write_buffer_8xn_sse4_1(). `eob` is unused.
+static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
+                                              uint16_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob, const int bd) {
+  (void)eob;
+
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int num_blocks = width >> 2;  // 4-column sub-blocks
+  const int8_t *const stage_shift = av1_inv_txfm_shift_ls[tx_size];
+
+  // 1-D kernels selected by transform size and type.
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
+                                     [hitx_1d_tab[tx_type]][2];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
+                                     [vitx_1d_tab[tx_type]][0];
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // 1st stage: row transform.
+  __m128i row_buf[16];
+  load_buffer_32bit_input(input, 4, row_buf, width);
+  row_txfm(row_buf, row_buf, INV_COS_BIT, 0, bd, -stage_shift[0]);
+
+  // Continue in place unless a left/right flip needs the mirrored copy.
+  __m128i mirrored[16];
+  __m128i *work = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, mirrored, width);
+    work = mirrored;
+  }
+
+  // 2nd stage: column transform on each transposed sub-block.
+  for (int blk = 0; blk < num_blocks; ++blk) {
+    __m128i *const cur = work + blk * height;
+    transpose_32bit_4x4(cur, cur);
+    col_txfm(cur, cur, INV_COS_BIT, 1, bd, 0);
+  }
+  av1_round_shift_array_32_sse4_1(work, work, width, -stage_shift[1]);
+
+  // Write the result, one 8-column strip at a time.
+  const int num_strips = width >> 3;
+  for (int strip = 0; strip < num_strips; ++strip) {
+    highbd_write_buffer_8xn_sse4_1(work + strip * height * 2,
+                                   output + 8 * strip, stride, flip_ud, height,
+                                   bd);
+  }
+}
+
+// Routes a high-bitdepth inverse 2-D transform to the kernel family selected
+// by tx_type:
+//   - IDTX                        -> highbd_inv_txfm2d_add_idtx_ssse41
+//   - V_DCT / V_ADST / V_FLIPADST -> highbd_inv_txfm2d_add_h_identity_ssse41
+//   - H_DCT / H_ADST / H_FLIPADST -> highbd_inv_txfm2d_add_v_identity_ssse41
+//   - every DCT/ADST/FLIPADST pair -> highbd_inv_txfm2d_add_no_identity_sse41
+// The 8-bit typed `output` pointer is converted with CONVERT_TO_SHORTPTR()
+// before it is passed on. Any other tx_type trips assert(0).
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(output);
+
+  switch (tx_type) {
+    case IDTX:
+      highbd_inv_txfm2d_add_idtx_ssse41(input, dst16, stride, tx_type, tx_size,
+                                        eob, bd);
+      return;
+
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      highbd_inv_txfm2d_add_h_identity_ssse41(input, dst16, stride, tx_type,
+                                              tx_size, eob, bd);
+      return;
+
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      highbd_inv_txfm2d_add_v_identity_ssse41(input, dst16, stride, tx_type,
+                                              tx_size, eob, bd);
+      return;
+
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      highbd_inv_txfm2d_add_no_identity_sse41(input, dst16, stride, tx_type,
+                                              tx_size, eob, bd);
+      return;
+
+    default: assert(0); return;
+  }
+}
+
+// Public entry point for the 4x8 case: unpacks tx_type, tx_size, eob and bd
+// from txfm_param, converts `dest` with CONVERT_TO_SHORTPTR() and forwards
+// everything to highbd_inv_txfm2d_add_4x8_sse41().
+void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_4x8_sse41(input, dst16, stride, txfm_param->tx_type,
+                                  txfm_param->tx_size, txfm_param->eob,
+                                  txfm_param->bd);
+}
+
+// Public entry point for the 8x4 case: unpacks tx_type, tx_size, eob and bd
+// from txfm_param, converts `dest` with CONVERT_TO_SHORTPTR() and forwards
+// everything to highbd_inv_txfm2d_add_8x4_sse41().
+void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_8x4_sse41(input, dst16, stride, txfm_param->tx_type,
+                                  txfm_param->tx_size, txfm_param->eob,
+                                  txfm_param->bd);
+}
+
+// Public entry point for the 4x16 case: unpacks tx_type, tx_size, eob and bd
+// from txfm_param, converts `dest` with CONVERT_TO_SHORTPTR() and forwards
+// everything to highbd_inv_txfm2d_add_4x16_sse4_1().
+void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_4x16_sse4_1(input, dst16, stride, txfm_param->tx_type,
+                                    txfm_param->tx_size, txfm_param->eob,
+                                    txfm_param->bd);
+}
+
+// Public entry point for the 16x4 case: unpacks tx_type, tx_size, eob and bd
+// from txfm_param, converts `dest` with CONVERT_TO_SHORTPTR() and forwards
+// everything to highbd_inv_txfm2d_add_16x4_sse4_1().
+void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                         int stride,
+                                         const TxfmParam *txfm_param) {
+  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dest);
+  highbd_inv_txfm2d_add_16x4_sse4_1(input, dst16, stride, txfm_param->tx_type,
+                                    txfm_param->tx_size, txfm_param->eob,
+                                    txfm_param->bd);
+}
+
+// Top-level SSE4.1 dispatcher for high-bitdepth inverse transforms.
+//
+// TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_4X16 and TX_16X4 each have a dedicated
+// entry point; every other transform size is forwarded to
+// av1_highbd_inv_txfm2d_add_universe_sse4_1(). The assert checks that the
+// requested transform type is marked as used for the given transform set.
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+
+  switch (txfm_param->tx_size) {
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+      return;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+      return;
+    default:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
+      return;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 0000000000..6dcac10e45
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,849 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+/*
+ * AVX2 "copy" case of the high-bitdepth dist_wtd convolve: no filter taps are
+ * applied, each source pixel is only shifted left by
+ * bits = 2 * FILTER_BITS - round_0 - round_1.
+ *
+ * When conv_params->do_average is 0, the shifted value plus a fixed offset is
+ * stored into the intermediate buffer conv_params->dst. When it is non-zero,
+ * the value already held in conv_params->dst is combined with the new one by
+ * highbd_comp_avg(), rounded by highbd_convolve_rounding(), clamped to the
+ * bit-depth maximum and stored into dst0.
+ *
+ * Parameters:
+ *   src, src_stride    source pixels and row stride (in uint16_t elements)
+ *   dst0, dst_stride0  final output, written only when do_average is set
+ *   w, h               block width and height
+ *   conv_params        rounding amounts, averaging flags/weights and the
+ *                      intermediate buffer (dst, dst_stride)
+ *   bd                 bit depth; selects the clamp value and the offset
+ *
+ * Only widths that are a multiple of 4 are handled; any other w matches
+ * neither branch and nothing is written.
+ */
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src,
+ int src_stride, uint16_t *dst0,
+ int dst_stride0, int w, int h,
+ ConvolveParams *conv_params,
+ int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer supplied by the caller
+ int dst_stride = conv_params->dst_stride;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);  // shift count operand for _mm256_sll_epi16
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // 1.5 * 2^offset_0
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const __m256i offset_const_16b = _mm256_set1_epi16(offset);  // same offset in 16-bit lanes, used by the non-averaging path
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // clamp: 1023 for bd 10, 4095 for bd 12, otherwise 255
+
+ assert(bits <= 4);
+
+ if (!(w % 16)) {  // w is a multiple of 16: one row, 16 pixels (one 256-bit vector) per step
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit =
+ _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ if (do_average) {
+ const __m256i data_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);  // widen 16-bit lanes to 32 bits by interleaving with zero
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);  // lo/hi halves came from unpacklo/unpackhi, so packing restores pixel order
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);  // NOTE(review): aligned store; assumes this dst0 address is 32-byte aligned - confirm against callers
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);  // unsigned saturating add
+
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {  // w is a multiple of 4 but not of 16: two rows per step, 8 columns per inner step
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);  // row i in the low 128-bit lane, row i + 1 in the high lane
+
+ const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+ if (w - j < 8) {  // fewer than 8 columns remain: only 64 bits per row are read from dst and written out
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);  // low lane = data_0, high lane = data_1
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b, offset_const);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);  // same input twice: only the low 64 bits of each lane are stored below
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // row i + 1
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // a full 8 columns remain: 128 bits per row
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // NOTE(review): aligned 16-byte store - confirm dst0 alignment
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+ }
+}
+/*
+ * AVX2 high-bitdepth dist_wtd convolve with both a horizontal and a vertical
+ * filter.
+ *
+ * The block is processed in strips of 8 columns. For each strip:
+ *   1. Horizontal pass: im_h = h + taps_y - 1 source rows are filtered with
+ *      coeffs_x, rounded by round_0 and stored as int16 into im_block
+ *      (8 samples per row).
+ *   2. Vertical pass: im_block is filtered with coeffs_y two output rows at a
+ *      time, rounded by round_1 and offset by offset_const.
+ *
+ * When conv_params->do_average is 0 the result is packed to 16 bits and stored
+ * into conv_params->dst. Otherwise it is combined with the value already in
+ * conv_params->dst by highbd_comp_avg(), rounded by
+ * highbd_convolve_rounding(), clamped to the bit-depth maximum and stored into
+ * dst0.
+ *
+ * Parameters:
+ *   src, src_stride                   source pixels and row stride
+ *   dst0, dst_stride0                 final output (do_average only)
+ *   w, h                              block width and height
+ *   filter_params_x, filter_params_y  filter descriptions (tap counts, kernels)
+ *   subpel_x_qn, subpel_y_qn          passed to prepare_coeffs() to pick taps
+ *   conv_params                       rounding, averaging flags/weights, dst
+ *   bd                                bit depth
+ */
+void av1_highbd_dist_wtd_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);  // horizontal-pass output for one 8-column strip
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;  // rows the vertical filter needs
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // back up by fo_vert rows and fo_horiz columns
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));  // rounding term plus an offset of 2^(bd + FILTER_BITS - 1)
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));  // rounding term minus 2^(bd + 2 * FILTER_BITS - round_0 - 1)
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // 1.5 * 2^offset_0
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // clamp: 1023 for bd 10, 4095 for bd 12, otherwise 255
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {  // two source rows per iteration
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_setzero_si256();  // stays zero when im_h is odd and row i + 1 does not exist
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);  // low 128 bits of row0 | low 128 bits of row1
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);  // high 128 bits of row0 | high 128 bits of row1
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);  // signed saturating narrow to int16
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);  // interleave even/odd results back into pixel order
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);  // writes rows i and i + 1 (2 x 8 int16)
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  // each 256-bit load spans two consecutive 8-sample rows
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);  // s[0..3]: columns 0-3 of adjacent row pairs
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);  // s[4..7]: columns 4-7 of adjacent row pairs
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration: low lane = row i, high lane = row i + 1
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);  // columns 0-3
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns remain: only columns 0-3 are computed and 64 bits per row stored
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);  // low lane = data_0, high lane = data_1
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);  // columns 4-7
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // NOTE(review): aligned 16-byte store - confirm dst0 alignment
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];  // slide the tap window down by two rows for the next iteration
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+/*
+ * AVX2 high-bitdepth dist_wtd convolve with a horizontal filter only.
+ *
+ * The block is processed in strips of 8 columns, two rows at a time. Each pair
+ * of rows is filtered with coeffs_x, rounded by round_0, shifted left by
+ * bits = FILTER_BITS - round_1 and offset by offset_const.
+ *
+ * When conv_params->do_average is 0 the result is packed to 16 bits and stored
+ * into conv_params->dst. Otherwise it is combined with the value already in
+ * conv_params->dst by highbd_comp_avg(), rounded by
+ * highbd_convolve_rounding(), clamped to the bit-depth maximum and stored into
+ * dst0.
+ *
+ * Parameters:
+ *   src, src_stride     source pixels and row stride
+ *   dst0, dst_stride0   final output (do_average only)
+ *   w, h                block width and height
+ *   filter_params_x     horizontal filter description
+ *   subpel_x_qn         passed to prepare_coeffs() to pick the taps
+ *   conv_params         rounding, averaging flags/weights, dst
+ *   bd                  bit depth
+ *
+ * NOTE(review): row i + 1 is always read and written, so h is presumably
+ * expected to be even - confirm against callers.
+ */
+void av1_highbd_dist_wtd_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;  // back up by fo_horiz columns
+ const int bits = FILTER_BITS - conv_params->round_1;  // left shift applied after the round_0 rounding
+
+ int i, j;
+ __m256i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // 1.5 * 2^offset_0
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // clamp: 1023 for bd 10, 4095 for bd 12, otherwise 255
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {  // two rows per iteration
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);  // low 128 bits of row0 | low 128 bits of row1
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);  // high 128 bits of row0 | high 128 bits of row1
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+ __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);  // interleave even/odd results: columns 0-3 of each row
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns remain: only 64 bits per row are read from dst and written out
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);  // low lane = data_0, high lane = data_1
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // row i + 1
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);  // columns 4-7 of each row
+ __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // NOTE(review): aligned 16-byte store - confirm dst0 alignment
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+// AVX2 vertical (y-direction) pass of the high-bitdepth dist-weighted
+// compound convolution.
+//
+// Filters `src` vertically with the coefficients prepared from
+// filter_params_y / subpel_y_qn, producing two output rows for up to eight
+// columns per inner-loop iteration.
+//  - conv_params->do_average == 0: the offset 16-bit result is written to the
+//    intermediate buffer conv_params->dst (stride conv_params->dst_stride).
+//  - conv_params->do_average != 0: the result is combined with the values
+//    already stored in conv_params->dst through highbd_comp_avg(), rounded
+//    with highbd_convolve_rounding(), clamped to the maximum pixel value for
+//    `bd` and written to dst0 (stride dst_stride0).
+// NOTE(review): the inner loop advances i by 2 and always stores rows i and
+// i + 1, so h is presumably expected to be even -- confirm against callers.
+void av1_highbd_dist_wtd_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ // Back up fo_vert rows so that the filter window of output row 0 begins at
+ // src_ptr.
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ // Left shift applied to the raw filter sum before the round_1 rounding.
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ int i, j;
+ __m256i s[8], coeffs_y[4];
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ // Weights handed to highbd_comp_avg() (broadcast to every 32-bit lane).
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ // Rounding term (half of 1 << round_1) and shift count for the round_1
+ // right shift.
+ const __m256i round_const_y =
+ _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ // offset = 1.5 * 2^offset_0, added to every filtered value before packing
+ // (presumably so the value is non-negative for the unsigned pack -- confirm).
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ // Shift and rounding term passed to highbd_convolve_rounding() on the
+ // averaging path.
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ // Upper clamp for the final pixels: 1023 for bd 10, 4095 for bd 12,
+ // otherwise 255.
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ // Process the block in strips of 8 columns.
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ // Prime the window with source rows 0..6. Each sXY register holds row X in
+ // its low 128-bit lane and row Y in its high lane (permute selector 0x20
+ // takes the low lane of each operand).
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ // Interleave vertically adjacent rows as 16-bit pairs. In every s[k] the
+ // low lane feeds output row i and the high lane feeds output row i + 1.
+ // s[0..2] cover the low four columns of the strip, s[4..6] the high four.
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ // Two output rows per iteration.
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ // Load the two new source rows (7 and 8 relative to `data`); src6 carries
+ // the most recently loaded row over to the next iteration.
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ // Filter the low four columns of both rows.
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ // Scale up by `bits`, then round and shift right by round_1.
+ __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+ // Fewer than 8 columns remain in this strip: only the low four results
+ // are used and all accesses are 64-bit (four 16-bit values).
+ if (w - j < 8) {
+ if (do_average) {
+ // Fetch the values already in the intermediate buffer for rows i and
+ // i + 1, one row per 128-bit lane, and widen them to 32 bits.
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ // Pack to 16 bits with unsigned saturation, then clamp to the bit depth.
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ // Low lane is row i, high lane is row i + 1.
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ // No averaging: store the offset result in the intermediate buffer.
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ // Full 8-column strip: also filter the high four columns.
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+ res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ // NOTE(review): _mm_store_si128 is an aligned store, so these dst0
+ // addresses must be 16-byte aligned -- confirm callers guarantee it.
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ // NOTE(review): aligned stores into the intermediate buffer; the same
+ // 16-byte alignment assumption applies -- confirm.
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ // Slide the row window down by two rows for the next iteration.
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 0000000000..5a7fc536a2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+// SSE4.1 vertical (y-direction) pass of the high-bitdepth dist-weighted
+// compound convolution.
+//
+// Filters `src` vertically with the coefficients prepared from
+// filter_params_y / subpel_y_qn, producing two output rows for up to eight
+// columns per inner-loop iteration.
+//  - conv_params->do_average == 0: the offset 16-bit result is written to the
+//    intermediate buffer conv_params->dst (stride conv_params->dst_stride).
+//  - conv_params->do_average != 0: the result is combined with the values
+//    already stored in conv_params->dst through highbd_comp_avg_sse4_1(),
+//    rounded with highbd_convolve_rounding_sse2(), clamped to the maximum
+//    pixel value for `bd` and written to dst0 (stride dst_stride0).
+// NOTE(review): the inner loop advances i by 2 and always stores rows i and
+// i + 1, so h is presumably expected to be even -- confirm against callers.
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ // Back up fo_vert rows so that the filter window of output row 0 begins at
+ // src_ptr.
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ // Left shift applied to the raw filter sum before the round_1 rounding.
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ assert(bits >= 0);
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ // Weights handed to highbd_comp_avg_sse4_1() (broadcast to every 32-bit
+ // lane).
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ // Rounding term (half of 1 << round_1) and shift count for the round_1
+ // right shift.
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ // offset = 1.5 * 2^offset_0, added to every filtered value before packing
+ // (presumably so the value is non-negative for the unsigned pack -- confirm).
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ // Shift and rounding term passed to highbd_convolve_rounding_sse2() on the
+ // averaging path.
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ // Upper clamp for the final pixels: 1023 for bd 10, 4095 for bd 12,
+ // otherwise 255.
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ // s[0..7] is the row window for output row i, s[8..15] the window for
+ // output row i + 1. Within each half, indices 0..3 cover the low four
+ // columns of the strip and indices 4..7 the high four.
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ // Process the block in strips of 8 columns.
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ // Prime the window with source rows 0..6 of this strip.
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ // Interleave vertically adjacent rows as 16-bit pairs: rows (0,1), (2,3),
+ // (4,5) for output row i ...
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ // ... and rows (1,2), (3,4), (5,6) for output row i + 1.
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ // Two output rows per iteration.
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ // Load the two new source rows (7 and 8 relative to `data`).
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ // Low four columns of row i: filter, scale up by `bits`, then round and
+ // shift right by round_1.
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+ res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+ round_shift_y);
+
+ // Low four columns of row i + 1.
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+ res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+ round_shift_y);
+
+ __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+ __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+ // Fewer than 8 columns remain in this strip: only the low four results
+ // are used and all accesses are 64-bit (four 16-bit values).
+ if (w - j < 8) {
+ if (do_average) {
+ // Fetch the values already in the intermediate buffer for rows i and
+ // i + 1 and widen them to 32 bits.
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_1 =
+ highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ // Pack to 16 bits with unsigned saturation, then clamp to the bit depth.
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_0, round_result_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_1, round_result_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+
+ } else {
+ // No averaging: store the offset result in the intermediate buffer.
+ __m128i res_16b_0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+ __m128i res_16b_1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16b_1);
+ }
+ } else {
+ // Full 8-column strip: also filter the high four columns of both rows.
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+ __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+ __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+ const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+ const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_lo_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_lo_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+ &wt0, &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_lo_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ // NOTE(review): _mm_store_si128 is an aligned store, so these dst0
+ // addresses must be 16-byte aligned -- confirm callers guarantee it.
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+ } else {
+ __m128i res_16bit0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+ __m128i res_16bit1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+ // NOTE(review): aligned stores into the intermediate buffer; the same
+ // 16-byte alignment assumption applies -- confirm.
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_16bit1);
+ }
+ }
+ // Slide both row windows down by two rows for the next iteration.
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ // The newest loaded row becomes the "row 6" partner of the next pair.
+ s6 = s8;
+ }
+ }
+ }
+}
+
+// SSE4.1 horizontal (x-direction) pass of the high-bitdepth dist-weighted
+// compound convolution.
+//
+// Filters each row of `src` horizontally with the coefficients prepared from
+// filter_params_x / subpel_x_qn, producing up to eight output pixels per
+// inner-loop iteration.
+//  - conv_params->do_average == 0: the offset 16-bit result is written to the
+//    intermediate buffer conv_params->dst (stride conv_params->dst_stride).
+//  - conv_params->do_average != 0: the result is combined with the values
+//    already stored in conv_params->dst through highbd_comp_avg_sse4_1(),
+//    rounded with highbd_convolve_rounding_sse2(), clamped to the maximum
+//    pixel value for `bd` and written to dst0 (stride dst_stride0).
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ // Back up fo_horiz pixels so that the filter window of output column 0
+ // begins at src_ptr.
+ const uint16_t *const src_ptr = src - fo_horiz;
+ // Left shift applied after the round_0 rounding.
+ const int bits = FILTER_BITS - conv_params->round_1;
+
+ int i, j;
+ __m128i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ // Weights handed to highbd_comp_avg_sse4_1() (broadcast to every 32-bit
+ // lane).
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+
+ // Rounding term (half of 1 << round_0) and shift count for the round_0
+ // right shift.
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ // offset = 1.5 * 2^offset_0, added to every filtered value before packing
+ // (presumably so the value is non-negative for the unsigned pack -- confirm).
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ // Shift and rounding term passed to highbd_convolve_rounding_sse2() on the
+ // averaging path.
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ // Upper clamp for the final pixels: 1023 for bd 10, 4095 for bd 12,
+ // otherwise 255.
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ // Process the block in strips of 8 columns, one row at a time.
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 1) {
+ // 16 consecutive source pixels starting at src_ptr[i * src_stride + j].
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ // Each s[k] is the 16-pixel window shifted by 2 * k pixels (4 * k bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+ // odd pixels
+ // Same windows shifted by one more pixel (2 bytes).
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+ // Re-interleave even/odd results into pixel order; the low halves give
+ // output pixels 0..3 of the strip.
+ __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+ // Fewer than 8 columns remain in this strip: only the low four results
+ // are used and all accesses are 64-bit (four 16-bit values).
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ // Pack to 16 bits with unsigned saturation, then clamp to the bit depth.
+ const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ // No averaging: store the offset result in the intermediate buffer.
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+ }
+ } else {
+ // Full 8-column strip: the high halves give output pixels 4..7.
+ __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ // NOTE(review): _mm_store_si128 is an aligned store, so this dst0
+ // address must be 16-byte aligned -- confirm callers guarantee it.
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ // NOTE(review): aligned store into the intermediate buffer; the same
+ // 16-byte alignment assumption applies -- confirm.
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000000..5734810f52
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+// Transposes a 4x4 matrix of 32-bit elements. x0..x3 are the input rows (one
+// __m128i each) and y0..y3 receive the transposed rows.
+// All four inputs are read into temporaries before any output is assigned.
+// Each input argument is expanded twice, so arguments should be free of side
+// effects.
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
+
+// Transposes an 8x8 matrix of 32-bit elements stored as 16 registers, two
+// registers per row (in[2 * row] holds columns 0-3, in[2 * row + 1] holds
+// columns 4-7). The matrix is handled as four 4x4 quadrants: the quadrant at
+// (quad_row, quad_col) of the input lands at (quad_col, quad_row) of the
+// output. Registers belonging to one quadrant are two entries apart.
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  for (int quad_row = 0; quad_row < 2; ++quad_row) {
+    for (int quad_col = 0; quad_col < 2; ++quad_col) {
+      const __m128i *const src = in + 8 * quad_row + quad_col;
+      __m128i *const dst = out + 8 * quad_col + quad_row;
+      TRANSPOSE_4X4(src[0], src[2], src[4], src[6], dst[0], dst[2], dst[4],
+                    dst[6]);
+    }
+  }
+}
+
+// Transposes a 16x16 matrix of 32-bit elements stored as 64 registers, four
+// registers per row. The matrix is walked as 4x4 tiles: the tile at
+// (tile_row, tile_col) starts at in[16 * tile_row + tile_col] and is written
+// transposed to out[16 * tile_col + tile_row]; registers of one tile are four
+// entries apart. Tiles are visited one 8x8 quadrant at a time (upper left,
+// upper right, lower left, lower right), row-major inside each quadrant.
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+  for (int quad_row = 0; quad_row < 4; quad_row += 2) {
+    for (int quad_col = 0; quad_col < 4; quad_col += 2) {
+      for (int tile_row = quad_row; tile_row < quad_row + 2; ++tile_row) {
+        for (int tile_col = quad_col; tile_col < quad_col + 2; ++tile_col) {
+          const __m128i *const src = in + 16 * tile_row + tile_col;
+          __m128i *const dst = out + 16 * tile_col + tile_row;
+          TRANSPOSE_4X4(src[0], src[4], src[8], src[12], dst[0], dst[4],
+                        dst[8], dst[12]);
+        }
+      }
+    }
+  }
+}
+
+// Transposes a matrix of 32-bit elements one 4x4 tile at a time using
+// TRANSPOSE_4X4. Input registers of a tile are (width / 4) entries apart and
+// the tile for (tile_row, tile_col) starts at input[tile_row * width +
+// tile_col]; output registers of a tile are (height / 4) entries apart and
+// the tile is written starting at output[tile_col * height + tile_row].
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+                                   const int width, const int height) {
+  const int in_step = width >> 2;
+  const int out_step = height >> 2;
+  for (int tile_col = 0; tile_col < in_step; tile_col++) {
+    for (int tile_row = 0; tile_row < out_step; tile_row++) {
+      const __m128i *const src = input + (tile_row * width + tile_col);
+      __m128i *const dst = output + (tile_col * height + tile_row);
+      TRANSPOSE_4X4(src[in_step * 0], src[in_step * 1], src[in_step * 2],
+                    src[in_step * 3], dst[out_step * 0], dst[out_step * 1],
+                    dst[out_step * 2], dst[out_step * 3]);
+    }
+  }
+}
+
+// Per 32-bit lane: returns ((*w0) * (*n0) + (*w1) * (*n1) + *rounding) >> bit
+// using an arithmetic right shift; the products keep only their low 32 bits.
+// Note: callers are expected to pass rounding = 1 << (bit - 1).
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+                                      const __m128i *w1, const __m128i *n1,
+                                      const __m128i *rounding, int bit) {
+  const __m128i prod0 = _mm_mullo_epi32(*w0, *n0);
+  const __m128i prod1 = _mm_mullo_epi32(*w1, *n1);
+  const __m128i sum = _mm_add_epi32(prod0, prod1);
+  return _mm_srai_epi32(_mm_add_epi32(sum, *rounding), bit);
+}
+
+// Single-product variant of the half butterfly. Per 32-bit lane it returns
+// ((*w0) * (*n0) + *rounding) >> bit using an arithmetic right shift; the
+// product keeps only its low 32 bits.
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+                                        const __m128i *rounding, int bit) {
+  const __m128i prod = _mm_mullo_epi32(*w0, *n0);
+  return _mm_srai_epi32(_mm_add_epi32(prod, *rounding), bit);
+}
+
+// Function-pointer type with the (in, out, bit, do_cols, bd, out_shift)
+// signature. Presumably used for 1-D inverse-transform kernels -- confirm
+// against the users of this type.
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+// Function-pointer type with the (in, out, bit, num_cols) signature.
+// Presumably used for 1-D forward-transform kernels -- confirm against the
+// users of this type.
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+// Declared here, defined elsewhere. NOTE(review): `output` is typed uint8_t *
+// although the function takes a bit depth `bd`; it is presumably a
+// reinterpreted high-bitdepth buffer -- confirm at the definition.
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
new file mode 100644
index 0000000000..75108b49da
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+// Loads two 8-tap warp filters into one 256-bit register: the filter in table
+// row (pos_lo >> WARPEDDIFF_PREC_BITS) goes into the low 128-bit lane and the
+// one in row (pos_hi >> WARPEDDIFF_PREC_BITS) into the high lane.
+static INLINE __m256i highbd_warp_avx2_load_filter_pair(int pos_lo,
+                                                        int pos_hi) {
+  const __m128i lo = _mm_loadu_si128(
+      (const __m128i *)av1_warped_filter[pos_lo >> WARPEDDIFF_PREC_BITS]);
+  const __m128i hi = _mm_loadu_si128(
+      (const __m128i *)av1_warped_filter[pos_hi >> WARPEDDIFF_PREC_BITS]);
+  return _mm256_inserti128_si256(
+      _mm256_inserti128_si256(_mm256_setzero_si256(), lo, 0), hi, 1);
+}
+
+// Builds the coefficient registers for eight outputs whose filters are
+// selected at pos, pos + step, ..., pos + 7 * step (call them A..H).
+// On return coeffs[n] holds taps (2n, 2n + 1) of every filter, with the
+// filters of outputs 0, 2, 4, 6 in the low lane and 1, 3, 5, 7 in the high
+// lane, i.e. the order produced by the horizontal pass below.
+static INLINE void highbd_warp_avx2_prepare_coeffs(int pos, int step,
+                                                   __m256i *coeffs) {
+  const __m256i v_coeff01 = highbd_warp_avx2_load_filter_pair(
+      pos, pos + step);  // B7B6..B1B0A7A6..A1A0
+  const __m256i v_coeff23 = highbd_warp_avx2_load_filter_pair(
+      pos + 2 * step, pos + 3 * step);  // D7D6..D1D0C7C6..C1C0
+  const __m256i v_coeff45 = highbd_warp_avx2_load_filter_pair(
+      pos + 4 * step, pos + 5 * step);  // F7F6..F1F0E7E6..E1E0
+  const __m256i v_coeff67 = highbd_warp_avx2_load_filter_pair(
+      pos + 6 * step, pos + 7 * step);  // H7H6..H1H0G7G6..G1G0
+
+  const __m256i v_c0123 = _mm256_unpacklo_epi32(
+      v_coeff01, v_coeff23);  // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0
+  const __m256i v_c0123u = _mm256_unpackhi_epi32(
+      v_coeff01, v_coeff23);  // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4
+  const __m256i v_c4567 = _mm256_unpacklo_epi32(
+      v_coeff45, v_coeff67);  // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0
+  const __m256i v_c4567u = _mm256_unpackhi_epi32(
+      v_coeff45, v_coeff67);  // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4
+
+  coeffs[0] = _mm256_unpacklo_epi64(
+      v_c0123, v_c4567);  // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0
+  coeffs[1] = _mm256_unpackhi_epi64(v_c0123, v_c4567);    // H3H2 ... A3A2
+  coeffs[2] = _mm256_unpacklo_epi64(v_c0123u, v_c4567u);  // H5H4 ... A5A4
+  coeffs[3] = _mm256_unpackhi_epi64(v_c0123u, v_c4567u);  // H7H6 ... A7A6
+}
+
+// Same layout as highbd_warp_avx2_prepare_coeffs() for the case where all
+// eight outputs share one filter: each tap pair is broadcast to all lanes.
+static INLINE void highbd_warp_avx2_broadcast_coeffs(int pos,
+                                                     __m256i *coeffs) {
+  const __m128i v_01 = _mm_loadu_si128(
+      (const __m128i *)
+          av1_warped_filter[pos >> WARPEDDIFF_PREC_BITS]);  // A7A6A5A4A3A2A1A0
+  coeffs[0] = _mm256_broadcastd_epi32(v_01);  // A1A0A1A0A1A0A1A0
+  coeffs[1] =
+      _mm256_broadcastd_epi32(_mm_shuffle_epi32(v_01, 1));  // A3A2A3A2A3A2A3A2
+  coeffs[2] =
+      _mm256_broadcastd_epi32(_mm_shuffle_epi32(v_01, 2));  // A5A4A5A4A5A4A5A4
+  coeffs[3] =
+      _mm256_broadcastd_epi32(_mm_shuffle_epi32(v_01, 3));  // A7A6A7A6A7A6A7A6
+}
+
+// Selects the cheapest way to build the horizontal coefficients for one row:
+// a broadcast when the filter does not change across the row (alpha == 0),
+// the full eight-filter interleave otherwise.
+static INLINE void highbd_warp_avx2_horiz_coeffs(int sx, int alpha,
+                                                 __m256i *coeffs) {
+  if (alpha == 0) {
+    highbd_warp_avx2_broadcast_coeffs(sx, coeffs);
+  } else {
+    highbd_warp_avx2_prepare_coeffs(sx, alpha, coeffs);
+  }
+}
+
+// Horizontally filters one row. `src` points at the first of the 16 source
+// pixels the eight 8-tap windows cover (the caller passes column ix4 - 7).
+// Returns eight 32-bit results, even outputs in the low lane and odd outputs
+// in the high lane, computed as (offset + sum(taps * pixels) + round) >>
+// reduce_bits_horiz.
+static INLINE __m256i highbd_warp_avx2_filter_row(const uint16_t *src,
+                                                  const __m256i *coeffs,
+                                                  int offset, __m256i round,
+                                                  int reduce_bits_horiz) {
+  __m256i v_refl = _mm256_inserti128_si256(
+      _mm256_setzero_si256(), _mm_loadu_si128((const __m128i *)src), 0);
+  v_refl = _mm256_inserti128_si256(
+      v_refl, _mm_loadu_si128((const __m128i *)(src + 8)), 1);  // R15 .. R0
+
+  const __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE);
+
+  __m256i v_refu = _mm256_alignr_epi8(v_ref, v_refl, 2);  // R8R15R14...R2R1
+  v_refl =
+      _mm256_inserti128_si256(v_refl, _mm256_extracti128_si256(v_refu, 0), 1);
+  v_refu =
+      _mm256_inserti128_si256(v_refu, _mm256_extracti128_si256(v_ref, 0), 0);
+
+  __m256i v_sum = _mm256_set1_epi32(offset);
+  v_sum = _mm256_add_epi32(
+      v_sum,
+      _mm256_madd_epi16(coeffs[0], _mm256_alignr_epi8(v_refu, v_refl,
+                                                      0)));  // R8..R1 R7..R0
+  v_sum = _mm256_add_epi32(
+      v_sum,
+      _mm256_madd_epi16(coeffs[1], _mm256_alignr_epi8(v_refu, v_refl,
+                                                      4)));  // R10..R3 R9..R2
+  v_sum = _mm256_add_epi32(
+      v_sum,
+      _mm256_madd_epi16(coeffs[2], _mm256_alignr_epi8(v_refu, v_refl,
+                                                      8)));  // R12..R5 R11..R4
+  v_sum = _mm256_add_epi32(
+      v_sum,
+      _mm256_madd_epi16(coeffs[3], _mm256_alignr_epi8(v_refu, v_refl,
+                                                      12)));  // R14..R7 R13..R6
+
+  return _mm256_srai_epi32(_mm256_add_epi32(v_sum, round), reduce_bits_horiz);
+}
+
+// AVX2 high-bitdepth affine warp.
+//
+// Processes the prediction block in 8x8 tiles. For each tile the centre is
+// mapped through the affine model `mat` (mat[0..5]), 15 source rows are
+// filtered horizontally into `tmp`, and an 8-tap vertical filter then
+// produces up to 8 output rows.
+//
+//   mat                    affine model; mat[0], mat[1] are the offsets and
+//                          mat[2..5] the 2x2 matrix applied to (x, y)
+//   ref, width, height,    source plane, its dimensions and its stride (in
+//   stride                 uint16_t elements)
+//   pred, p_stride         destination plane and stride
+//   p_col, p_row,          position and size of the block being predicted
+//   p_width, p_height
+//   subsampling_x/y        chroma subsampling shifts of this plane
+//   bd                     bit depth
+//   conv_params            rounding and compound-prediction settings; when
+//                          is_compound is set conv_params->dst must be
+//                          non-NULL
+//   alpha, beta            horizontal filter position step per output column
+//                          and per row
+//   gamma, delta           vertical filter position step per output column
+//                          and per row
+void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref,
+                                 int width, int height, int stride,
+                                 uint16_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y, int bd,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta) {
+  __m256i tmp[15];
+  const int reduce_bits_horiz = conv_params->round_0;
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+  const __m256i reduce_bits_vert_const =
+      _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+  const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+  const __m256i res_sub_const =
+      _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+                        (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+  const __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1));
+
+  const __m256i wt0 = _mm256_set1_epi32(conv_params->fwd_offset);
+  const __m256i wt1 = _mm256_set1_epi32(conv_params->bck_offset);
+
+  const __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1));
+  const int ohoriz = 1 << offset_bits_horiz;
+
+  for (int i = 0; i < p_height; i += 8) {
+    // The horizontal pass produces rows k = -7 .. k_end - 1 of this tile.
+    const int k_end = AOMMIN(8, p_height - i);
+
+    for (int j = 0; j < p_width; j += 8) {
+      // Calculate the center of this 8x8 block,
+      // project to luma coordinates (if in a subsampled chroma plane),
+      // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary)
+      const int32_t src_x = (p_col + j + 4) << subsampling_x;
+      const int32_t src_y = (p_row + i + 4) << subsampling_y;
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      // Keep the integer positions in 32 bits: a 16-bit type would wrap for
+      // positions beyond +/-32767 and select the wrong border branch or
+      // source column.
+      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter
+      if (ix4 <= -7 || ix4 >= width + 6) {
+        // Every filter window lies entirely left (or right) of the frame, so
+        // each row's result is computed directly from its edge pixel.
+        const int edge_col = (ix4 <= -7) ? 0 : width - 1;
+        for (int k = -7; k < k_end; ++k) {
+          const int iy = clamp(iy4 + k, 0, height - 1);
+          tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16(
+              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+              ref[iy * stride + edge_col] *
+                  (1 << (FILTER_BITS - reduce_bits_horiz))));
+        }
+      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+        // The 16-pixel window straddles a frame edge: filter in scalar code
+        // with per-sample column clamping.
+        int32_t tmp1[8];
+        for (int k = -7; k < k_end; ++k) {
+          const int iy = clamp(iy4 + k, 0, height - 1);
+
+          int sx = sx4 + beta * (k + 4);
+          for (int l = -4; l < 4; ++l) {
+            int ix = ix4 + l - 3;
+            const int offs = sx >> WARPEDDIFF_PREC_BITS;
+            const int16_t *coeffs = av1_warped_filter[offs];
+
+            int32_t sum = 1 << offset_bits_horiz;
+            for (int m = 0; m < 8; ++m) {
+              const int sample_x = clamp(ix + m, 0, width - 1);
+              sum += ref[iy * stride + sample_x] * coeffs[m];
+            }
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+            // Store even outputs in slots 0..3 and odd outputs in 4..7 to
+            // match the lane layout of the SIMD path.
+            tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum;
+            sx += alpha;
+          }
+          tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1);
+        }
+      } else {
+        // Fully inside the frame: SIMD path. When beta == 0 every row uses
+        // the same filter positions, so the coefficients are built once.
+        __m256i h_coeffs[4];
+        if (beta == 0) {
+          highbd_warp_avx2_horiz_coeffs(sx4, alpha, h_coeffs);
+        }
+        for (int k = -7; k < k_end; ++k) {
+          const int iy = clamp(iy4 + k, 0, height - 1) * stride;
+          if (beta != 0) {
+            highbd_warp_avx2_horiz_coeffs(sx4 + beta * (k + 4), alpha,
+                                          h_coeffs);
+          }
+          tmp[k + 7] = highbd_warp_avx2_filter_row(
+              &ref[iy + ix4 - 7], h_coeffs, ohoriz, v_rbhoriz,
+              reduce_bits_horiz);
+        }
+      }
+
+      // Vertical filter
+      for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        const int sy = sy4 + delta * (k + 4);
+        const __m256i *src = tmp + (k + 4);
+
+        __m256i v_coeffs[4];
+        highbd_warp_avx2_prepare_coeffs(sy, gamma, v_coeffs);
+
+        __m256i v_src01l =
+            _mm256_unpacklo_epi32(src[0], src[1]);  // T13T03T11T01T12T02T10T00
+        __m256i v_src01u =
+            _mm256_unpackhi_epi32(src[0], src[1]);  // T17T07T15T05T16T06T14T04
+        __m256i v_sum =
+            _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u),
+                              v_coeffs[0]);  // S7S5S3S1S6S4S2S0
+
+        __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]);
+        __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]);
+        v_sum = _mm256_add_epi32(
+            v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u),
+                                     v_coeffs[1]));
+
+        __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]);
+        __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]);
+        v_sum = _mm256_add_epi32(
+            v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u),
+                                     v_coeffs[2]));
+
+        __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]);
+        __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]);
+        v_sum = _mm256_add_epi32(
+            v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u),
+                                     v_coeffs[3]));
+
+        // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0
+
+        __m256i v_suml =
+            _mm256_permute4x64_epi64(v_sum, 0xD8);  // S7S5S6S4S3S1S2S0
+        __m256i v_sumh =
+            _mm256_permute4x64_epi64(v_sum, 0x32);      // S2S0S7S5S2S0S3S1
+        v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh);  // S7S6S5S4S3S2S1S0
+
+        if (conv_params->is_compound) {
+          __m128i *const p =
+              (__m128i *)&conv_params
+                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+          v_sum = _mm256_add_epi32(v_sum, res_add_const);
+          v_sum =
+              _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const),
+                               reduce_bits_vert_shift);
+          if (conv_params->do_average) {
+            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+            __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p));
+
+            if (conv_params->use_dist_wtd_comp_avg) {
+              v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0),
+                                       _mm256_mullo_epi32(v_sum, wt1));
+              v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS);
+            } else {
+              v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1);
+            }
+
+            __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const);
+            v_sum1 = _mm256_sra_epi32(
+                _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift);
+
+            __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1);
+            v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8);
+            v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel);
+            _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0));
+          } else {
+            v_sum = _mm256_packus_epi32(v_sum, v_sum);
+            __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8);
+            _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+          }
+        } else {
+          // Round and pack into 16 bits
+          const __m256i round_const =
+              _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+                                ((1 << reduce_bits_vert) >> 1));
+
+          __m256i v_sum1 = _mm256_srai_epi32(
+              _mm256_add_epi32(v_sum, round_const), reduce_bits_vert);
+
+          v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1);
+          __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8);
+          // Clamp res_16bit to the range [0, 2^bd - 1]
+          const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1);
+          const __m256i zero = _mm256_setzero_si256();
+          v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero);
+
+          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+          _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0));
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 0000000000..96fb4cf632
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+// Byte-index table listing the even byte positions (0, 2, ..., 14) followed
+// by the odd ones (1, 3, ..., 15). Its user is not visible in this excerpt.
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10,
+ 12, 14, 1, 3, 5, 7,
+ 9, 11, 13, 15 };
+
+// Byte-shuffle masks used with _mm_shuffle_epi8 by
+// highbd_prepare_horizontal_filter_coeff_alpha0(): mask N repeats bytes
+// 4N .. 4N+3 of the source four times, i.e. it broadcasts the N-th 32-bit
+// element (one pair of 16-bit filter taps) to every 32-bit lane.
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
+ 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+ 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15 };
+
+// Loads the eight 8-tap filters selected at sx, sx + alpha, ..., sx + 7 * alpha
+// and regroups them by tap pair. For output parity p (0 = pixels 0, 2, 4, 6;
+// 1 = pixels 1, 3, 5, 7):
+//   coeff[p + 0] holds taps 0 1 of each of the four pixels,
+//   coeff[p + 2] holds taps 2 3,
+//   coeff[p + 4] holds taps 4 5,
+//   coeff[p + 6] holds taps 6 7.
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                          __m128i *coeff) {
+  for (int parity = 0; parity < 2; ++parity) {
+    // Filters of the four pixels with this parity.
+    const __m128i f_a = _mm_loadu_si128(
+        (__m128i *)(av1_warped_filter +
+                    ((sx + (parity + 0) * alpha) >> WARPEDDIFF_PREC_BITS)));
+    const __m128i f_b = _mm_loadu_si128(
+        (__m128i *)(av1_warped_filter +
+                    ((sx + (parity + 2) * alpha) >> WARPEDDIFF_PREC_BITS)));
+    const __m128i f_c = _mm_loadu_si128(
+        (__m128i *)(av1_warped_filter +
+                    ((sx + (parity + 4) * alpha) >> WARPEDDIFF_PREC_BITS)));
+    const __m128i f_d = _mm_loadu_si128(
+        (__m128i *)(av1_warped_filter +
+                    ((sx + (parity + 6) * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+    // taps 0 1 0 1 2 3 2 3 for the first / second pair of pixels
+    const __m128i lo_ab = _mm_unpacklo_epi32(f_a, f_b);
+    const __m128i lo_cd = _mm_unpacklo_epi32(f_c, f_d);
+    // taps 4 5 4 5 6 7 6 7 for the first / second pair of pixels
+    const __m128i hi_ab = _mm_unpackhi_epi32(f_a, f_b);
+    const __m128i hi_cd = _mm_unpackhi_epi32(f_c, f_d);
+
+    coeff[parity + 0] = _mm_unpacklo_epi64(lo_ab, lo_cd);  // taps 0 1
+    coeff[parity + 2] = _mm_unpackhi_epi64(lo_ab, lo_cd);  // taps 2 3
+    coeff[parity + 4] = _mm_unpacklo_epi64(hi_ab, hi_cd);  // taps 4 5
+    coeff[parity + 6] = _mm_unpackhi_epi64(hi_ab, hi_cd);  // taps 6 7
+  }
+}
+
+// alpha == 0 variant: a single filter (selected by sx) serves all eight
+// output pixels, so each tap pair is broadcast to every 32-bit lane and the
+// even and odd coefficient slots receive the same value.
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+    int sx, __m128i *coeff) {
+  const __m128i taps = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i taps01 = _mm_shuffle_epi8(
+      taps, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+  const __m128i taps23 = _mm_shuffle_epi8(
+      taps, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+  const __m128i taps45 = _mm_shuffle_epi8(
+      taps, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+  const __m128i taps67 = _mm_shuffle_epi8(
+      taps, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+  coeff[0] = coeff[1] = taps01;
+  coeff[2] = coeff[3] = taps23;
+  coeff[4] = coeff[5] = taps45;
+  coeff[6] = coeff[7] = taps67;
+}
+
+// Horizontally filters one row of 16 source pixels (*src = pixels 0..7,
+// *src2 = pixels 8..15) with the prepared coefficients and stores the eight
+// 16-bit results in tmp[k + 7].
+static INLINE void highbd_filter_src_pixels(
+    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+  const __m128i px_lo = *src;
+  const __m128i px_hi = *src2;
+
+  // Offset and rounding term added before the shift; shared by both halves.
+  const __m128i bias = _mm_set1_epi32((1 << offset_bits_horiz) +
+                                      ((1 << reduce_bits_horiz) >> 1));
+  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+
+  // Even-index outputs: windows start at pixel offsets 0, 2, 4, 6.
+  const __m128i even_04 = _mm_add_epi32(
+      _mm_madd_epi16(px_lo, coeff[0]),
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 8), coeff[4]));
+  const __m128i even_26 = _mm_add_epi32(
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 4), coeff[2]),
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 12), coeff[6]));
+  const __m128i out_even = _mm_sra_epi32(
+      _mm_add_epi32(_mm_add_epi32(even_04, even_26), bias), shift);
+
+  // Odd-index outputs: windows start at pixel offsets 1, 3, 5, 7.
+  const __m128i odd_15 = _mm_add_epi32(
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 2), coeff[1]),
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 10), coeff[5]));
+  const __m128i odd_37 = _mm_add_epi32(
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 6), coeff[3]),
+      _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 14), coeff[7]));
+  const __m128i out_odd = _mm_sra_epi32(
+      _mm_add_epi32(_mm_add_epi32(odd_15, odd_37), bias), shift);
+
+  // Pack both halves into one register. Columns end up in the order
+  // 0, 2, 4, 6, 1, 3, 5, 7, which is the order the vertical filter wants.
+  tmp[k + 7] = _mm_packs_epi32(out_even, out_odd);
+}
+
+// Filters one source row in the general case: builds the per-pixel
+// coefficients for (sx, alpha) and applies them, writing tmp[k + 7].
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+                                       __m128i *tmp, int sx, int alpha, int k,
+                                       const int offset_bits_horiz,
+                                       const int reduce_bits_horiz) {
+  __m128i row_coeffs[8];
+
+  highbd_prepare_horizontal_filter_coeff(alpha, sx, row_coeffs);
+  highbd_filter_src_pixels(src, src2, tmp, row_coeffs, offset_bits_horiz,
+                           reduce_bits_horiz, k);
+}
+
+// Horizontal pass for alpha == 0 and beta == 0: one filter, selected by sx4,
+// is used for every pixel of every row, so the coefficients are prepared once.
+// Rows iy4 - 7 .. iy4 + min(8, p_height - i) - 1 (clamped to the frame) are
+// filtered into tmp[0 ..].
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+  (void)beta;
+
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to [0, height - 1].
+    const int y = iy4 + k;
+    const int iy = y < 0 ? 0 : (y > height - 1 ? height - 1 : y);
+
+    // Load the 16 source pixels starting at column ix4 - 7.
+    const uint16_t *const centre = ref + iy * stride + ix4;
+    const __m128i src = _mm_loadu_si128((__m128i *)(centre - 7));
+    const __m128i src2 = _mm_loadu_si128((__m128i *)(centre + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+// Horizontal pass for alpha == 0, beta != 0: within a row all pixels share
+// one filter, but the filter position advances by beta from row to row, so
+// the coefficients are rebuilt per row.
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to [0, height - 1].
+    const int y = iy4 + k;
+    const int iy = y < 0 ? 0 : (y > height - 1 ? height - 1 : y);
+    const int sx = sx4 + beta * (k + 4);
+
+    // Load the 16 source pixels starting at column ix4 - 7.
+    const uint16_t *const centre = ref + iy * stride + ix4;
+    const __m128i src = _mm_loadu_si128((__m128i *)(centre - 7));
+    const __m128i src2 = _mm_loadu_si128((__m128i *)(centre + 1));
+
+    __m128i coeff[8];
+    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+// Horizontal pass for beta == 0, alpha != 0: the filter position varies along
+// a row but is the same for every row, so the per-pixel coefficients are
+// prepared once from sx4 and reused.
+static INLINE void highbd_warp_horizontal_filter_beta0(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+
+  __m128i coeff[8];
+  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to [0, height - 1].
+    const int y = iy4 + k;
+    const int iy = y < 0 ? 0 : (y > height - 1 ? height - 1 : y);
+
+    // Load the 16 source pixels starting at column ix4 - 7.
+    const uint16_t *const centre = ref + iy * stride + ix4;
+    const __m128i src = _mm_loadu_si128((__m128i *)(centre - 7));
+    const __m128i src2 = _mm_loadu_si128((__m128i *)(centre + 1));
+    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+                             reduce_bits_horiz, k);
+  }
+}
+
+// General horizontal pass (alpha != 0 and beta != 0): the filter position
+// changes per pixel and per row, so coefficients are rebuilt for each row by
+// highbd_horiz_filter().
+static INLINE void highbd_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to [0, height - 1].
+    const int y = iy4 + k;
+    const int iy = y < 0 ? 0 : (y > height - 1 ? height - 1 : y);
+    const int sx = sx4 + beta * (k + 4);
+
+    // Load the 16 source pixels starting at column ix4 - 7.
+    const uint16_t *const centre = ref + iy * stride + ix4;
+    const __m128i src = _mm_loadu_si128((__m128i *)(centre - 7));
+    const __m128i src2 = _mm_loadu_si128((__m128i *)(centre + 1));
+
+    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+                        reduce_bits_horiz);
+  }
+}
+
+// Dispatches the horizontal warp pass to one of four variants, chosen by
+// whether alpha and/or beta are zero. Every variant receives the same
+// argument list, unchanged.
+static INLINE void highbd_prepare_warp_horizontal_filter(
+    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0) {
+    if (beta == 0) {
+      highbd_warp_horizontal_filter_alpha0_beta0(
+          ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+          offset_bits_horiz, reduce_bits_horiz);
+    } else {
+      highbd_warp_horizontal_filter_alpha0(
+          ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+          offset_bits_horiz, reduce_bits_horiz);
+    }
+  } else if (beta == 0) {
+    highbd_warp_horizontal_filter_beta0(
+        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+        offset_bits_horiz, reduce_bits_horiz);
+  } else {
+    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                  p_height, height, i, offset_bits_horiz,
+                                  reduce_bits_horiz);
+  }
+}
+// High-bitdepth affine warp (SSE4.1): for each 8x8 output block a horizontal filter fills tmp[], then a vertical filter writes pred (or conv_params->dst for compound prediction).
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15]; // Horizontal-pass output: row k (k in [-7, 7]) is stored at tmp[k + 7].
+ int i, j, k;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(!(bd == 12 && reduce_bits_horiz < 5));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); // Upper clamp applied to the averaged compound output.
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); // Rounding term for the shift by reduce_bits_vert.
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); // Offset added to compound results before the vertical rounding shift.
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1))); // Negative offset added after compound averaging, before the final shift.
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); // Rounding term for the shift by round_bits.
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0); // Weight applied to the value already stored in conv_params->dst.
+ const __m128i wt1 = _mm_set1_epi32(w1); // Weight applied to the newly computed value.
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) { // Walk the prediction block in 8x8 steps.
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x; // Block position offset by 4, shifted left by the subsampling factor.
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; // Apply the model in mat[] using 64-bit arithmetic.
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); // Integer part of the warped position.
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Fractional part (low WARPEDMODEL_PREC_BITS bits).
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Clear the low WARP_PARAM_REDUCE_BITS bits.
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) { // Fast path: every row is built from ref[iy * stride] (column 0) alone.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) { // Fast path: every row is built from column width - 1 alone.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { // The 16-pixel window starting at column ix4 - 7 crosses a frame edge.
+ const int out_of_boundary_left = -(ix4 - 6); // Row index into warp_pad_left; only used when >= 0.
+ const int out_of_boundary_right = (ix4 + 8) - width; // Row index into warp_pad_right; only used when >= 0.
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ const __m128i src_01 = _mm_shuffle_epi8(
+ src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); // NOTE(review): presumably splits each 16-bit pixel into low/high byte groups; table is defined elsewhere - confirm.
+ const __m128i src2_01 = _mm_shuffle_epi8(
+ src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+ __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
+ __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); // Same byte shuffle applied to both halves; presumably replicates the left edge pixel - confirm against the table.
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+ }
+
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+ }
+
+ const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); // Re-interleave the byte halves back into 16-bit lanes.
+ const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
+
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else { // Window lies entirely inside the frame: no padding needed.
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { // One output row (i + k + 4) per iteration.
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4); // Eight consecutive intermediate rows, src[0] .. src[7].
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); // Filter row for column 0; gamma steps the filter index per column.
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // Transpose the four filters so each coeff_N holds one tap pair for all four columns.
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(av1_warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ if (conv_params->is_compound) { // Compound: store 16-bit intermediates in conv_params->dst, or average with them into pred.
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+
+ if (conv_params->do_average) { // Combine with the value already in conv_params->dst and write the final pixels to pred.
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(res_lo, wt1));
+ res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+ } else {
+ res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+ }
+
+ __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+ res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); // Unsigned saturation clamps negatives to 0; the min below applies the upper bound.
+ res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+ _mm_storel_epi64(dst16, res16_lo);
+ } else {
+ res_lo = _mm_packus_epi32(res_lo, res_lo);
+ _mm_storel_epi64(p, res_lo);
+ }
+ if (p_width > 4) { // Columns 4..7 are only produced for blocks wider than 4.
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
+ res_hi =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+ if (conv_params->use_dist_wtd_comp_avg) {
+ res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+ _mm_mullo_epi32(res_hi, wt1));
+ res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+ } else {
+ res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+ }
+
+ __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+ res32_hi = _mm_sra_epi32(
+ _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+ __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+ res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+ _mm_storel_epi64(dst16_4, res16_hi);
+ } else {
+ res_hi = _mm_packus_epi32(res_hi, res_hi);
+ _mm_storel_epi64(p4, res_hi);
+ }
+ }
+ } else {
+ // Round and pack into 16 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1)); // Rounding term combined with a negative offset of 1 << (bd + reduce_bits_vert - 1).
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 0000000000..562c623fa9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// High-bitdepth Wiener "add_src" convolution (AVX2): an 8-tap horizontal pass into temp[], then an 8-tap vertical pass into dst.
+// Notation used in the comments below: 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16); // Step parameters are only checked here and otherwise unused.
+ assert(!(w & 7)); // NOTE(review): the loops below advance 16 columns per iteration while only a multiple of 8 is asserted - confirm callers provide enough padding.
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); // Horizontal-pass output, MAX_SB_SIZE entries per row.
+ int intermediate_height = h + SUBPEL_TAPS - 1; // The vertical pass reads 8 consecutive temp rows per output row.
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; // Start center_tap rows above and center_tap columns left of src.
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); // 1 << FILTER_BITS in 16-bit lane 3, zero elsewhere.
+
+ const __m256i clamp_low = zero_256;
+
+ /* Horizontal filter */
+ {
+ const __m256i clamp_high_ep =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); // Upper clamp for the intermediate values.
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); // Rounding term for the round_0 shift plus a positive offset.
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+ // Load 16-bit src data
+ const __m256i src_0 = yy_loadu_256(src_ij + 0); // src_N starts N pixels to the right, so each madd lane sees the pixel pair for its tap pair.
+ const __m256i src_1 = yy_loadu_256(src_ij + 1);
+ const __m256i src_2 = yy_loadu_256(src_ij + 2);
+ const __m256i src_3 = yy_loadu_256(src_ij + 3);
+ const __m256i src_4 = yy_loadu_256(src_ij + 4);
+ const __m256i src_5 = yy_loadu_256(src_ij + 5);
+ const __m256i src_6 = yy_loadu_256(src_ij + 6);
+ const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); // Stored in the permuted order above; the vertical pass restores natural order.
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); // Largest valid pixel value for this bit depth.
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1))); // Rounding term for the round_1 shift minus an offset of 1 << (bd + round_1 - 1).
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); // data_N is intermediate row i + N.
+ const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); // Interleave two rows so each madd lane pairs them with one tap pair.
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_16bit_clamped = _mm256_min_epi16(
+ _mm256_max_epi16(res_16bit, clamp_low), clamp_high); // Clamp to [0, (1 << bd) - 1].
+
+ // Store in the dst array
+ yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
new file mode 100644
index 0000000000..cab37fa910
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+// High-bitdepth Wiener "add_src" convolution, SSSE3 version.
+//
+// Runs an 8-tap horizontal pass over h + SUBPEL_TAPS - 1 source rows into a
+// 16-bit scratch buffer, then an 8-tap vertical pass from that buffer into
+// dst, clamping the final pixels to [0, (1 << bd) - 1]. 1 << FILTER_BITS is
+// added to coefficient 3 of both filters (the "add_src" part). Each loop
+// iteration handles 8 columns; w is asserted to be a multiple of 8 and both
+// step parameters are asserted to be 16.
+void av1_highbd_wiener_convolve_add_src_ssse3(
+    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const WienerConvolveParams *conv_params, int bd) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Scratch buffer for the horizontally filtered rows, MAX_SB_SIZE per row.
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  const int rows_needed = h + SUBPEL_TAPS - 1;
+  const int taps_before = ((SUBPEL_TAPS - 1) / 2);
+  // First sample read: taps_before rows above and columns left of src.
+  const uint16_t *const first_in =
+      src - taps_before * src_stride - taps_before;
+
+  const __m128i zero = _mm_setzero_si128();
+  // 1 << FILTER_BITS in 16-bit lane 3, zero elsewhere.
+  const __m128i add_src_tap = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+  /* Horizontal pass: source rows -> temp */
+  {
+    const __m128i fx =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), add_src_tap);
+
+    // Broadcast each pair of taps across a whole register.
+    const __m128i fx_0123 = _mm_unpacklo_epi32(fx, fx);  // 0 1 0 1 2 3 2 3
+    const __m128i fx_4567 = _mm_unpackhi_epi32(fx, fx);  // 4 5 4 5 6 7 6 7
+    const __m128i fx_01 = _mm_unpacklo_epi64(fx_0123, fx_0123);
+    const __m128i fx_23 = _mm_unpackhi_epi64(fx_0123, fx_0123);
+    const __m128i fx_45 = _mm_unpacklo_epi64(fx_4567, fx_4567);
+    const __m128i fx_67 = _mm_unpackhi_epi64(fx_4567, fx_4567);
+
+    const __m128i rnd = _mm_set1_epi32((1 << (conv_params->round_0 - 1)) +
+                                       (1 << (bd + FILTER_BITS - 1)));
+    // Upper clamp for the intermediate values.
+    const __m128i max_mid =
+        _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
+
+    for (int row = 0; row < rows_needed; ++row) {
+      const uint16_t *const in_row = first_in + row * src_stride;
+      uint16_t *const out_row = temp + row * MAX_SB_SIZE;
+
+      for (int col = 0; col < w; col += 8) {
+        // 16 consecutive input pixels; alignr slides an 8-pixel window.
+        const __m128i px_lo = _mm_loadu_si128((__m128i *)(in_row + col));
+        const __m128i px_hi = _mm_loadu_si128((__m128i *)(in_row + col + 8));
+
+        // Outputs 0, 2, 4, 6
+        const __m128i e01 = _mm_madd_epi16(px_lo, fx_01);
+        const __m128i e23 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 4), fx_23);
+        const __m128i e45 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 8), fx_45);
+        const __m128i e67 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 12), fx_67);
+        const __m128i even_sum =
+            _mm_add_epi32(_mm_add_epi32(e01, e45), _mm_add_epi32(e23, e67));
+        const __m128i even = _mm_srai_epi32(_mm_add_epi32(even_sum, rnd),
+                                            conv_params->round_0);
+
+        // Outputs 1, 3, 5, 7
+        const __m128i o01 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 2), fx_01);
+        const __m128i o23 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 6), fx_23);
+        const __m128i o45 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 10), fx_45);
+        const __m128i o67 =
+            _mm_madd_epi16(_mm_alignr_epi8(px_hi, px_lo, 14), fx_67);
+        const __m128i odd_sum =
+            _mm_add_epi32(_mm_add_epi32(o01, o45), _mm_add_epi32(o23, o67));
+        const __m128i odd = _mm_srai_epi32(_mm_add_epi32(odd_sum, rnd),
+                                           conv_params->round_0);
+
+        // Packed column order is 0, 2, 4, 6, 1, 3, 5, 7; the vertical pass
+        // relies on this layout.
+        const __m128i packed = _mm_packs_epi32(even, odd);
+        const __m128i clamped =
+            _mm_min_epi16(_mm_max_epi16(packed, zero), max_mid);
+        _mm_storeu_si128((__m128i *)(out_row + col), clamped);
+      }
+    }
+  }
+
+  /* Vertical pass: temp -> dst */
+  {
+    const __m128i fy =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), add_src_tap);
+
+    // Broadcast each pair of taps across a whole register.
+    const __m128i fy_0123 = _mm_unpacklo_epi32(fy, fy);  // 0 1 0 1 2 3 2 3
+    const __m128i fy_4567 = _mm_unpackhi_epi32(fy, fy);  // 4 5 4 5 6 7 6 7
+    const __m128i fy_01 = _mm_unpacklo_epi64(fy_0123, fy_0123);
+    const __m128i fy_23 = _mm_unpackhi_epi64(fy_0123, fy_0123);
+    const __m128i fy_45 = _mm_unpacklo_epi64(fy_4567, fy_4567);
+    const __m128i fy_67 = _mm_unpackhi_epi64(fy_4567, fy_4567);
+
+    const __m128i rnd = _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                                       (1 << (bd + conv_params->round_1 - 1)));
+    // Largest valid pixel value for this bit depth.
+    const __m128i max_pixel = _mm_set1_epi16((1 << bd) - 1);
+
+    for (int row = 0; row < h; ++row) {
+      for (int col = 0; col < w; col += 8) {
+        // Eight consecutive intermediate rows feeding this output row.
+        const uint16_t *const win = temp + row * MAX_SB_SIZE + col;
+        const __m128i r0 = *(__m128i *)(win + 0 * MAX_SB_SIZE);
+        const __m128i r1 = *(__m128i *)(win + 1 * MAX_SB_SIZE);
+        const __m128i r2 = *(__m128i *)(win + 2 * MAX_SB_SIZE);
+        const __m128i r3 = *(__m128i *)(win + 3 * MAX_SB_SIZE);
+        const __m128i r4 = *(__m128i *)(win + 4 * MAX_SB_SIZE);
+        const __m128i r5 = *(__m128i *)(win + 5 * MAX_SB_SIZE);
+        const __m128i r6 = *(__m128i *)(win + 6 * MAX_SB_SIZE);
+        const __m128i r7 = *(__m128i *)(win + 7 * MAX_SB_SIZE);
+
+        // Even-index columns live in the low half of each stored row.
+        const __m128i e01 =
+            _mm_madd_epi16(_mm_unpacklo_epi16(r0, r1), fy_01);
+        const __m128i e23 =
+            _mm_madd_epi16(_mm_unpacklo_epi16(r2, r3), fy_23);
+        const __m128i e45 =
+            _mm_madd_epi16(_mm_unpacklo_epi16(r4, r5), fy_45);
+        const __m128i e67 =
+            _mm_madd_epi16(_mm_unpacklo_epi16(r6, r7), fy_67);
+        const __m128i even =
+            _mm_add_epi32(_mm_add_epi32(e01, e23), _mm_add_epi32(e45, e67));
+
+        // Odd-index columns live in the high half.
+        const __m128i o01 =
+            _mm_madd_epi16(_mm_unpackhi_epi16(r0, r1), fy_01);
+        const __m128i o23 =
+            _mm_madd_epi16(_mm_unpackhi_epi16(r2, r3), fy_23);
+        const __m128i o45 =
+            _mm_madd_epi16(_mm_unpackhi_epi16(r4, r5), fy_45);
+        const __m128i o67 =
+            _mm_madd_epi16(_mm_unpackhi_epi16(r6, r7), fy_67);
+        const __m128i odd =
+            _mm_add_epi32(_mm_add_epi32(o01, o23), _mm_add_epi32(o45, o67));
+
+        // Interleave even/odd results back into column order 0 ... 7.
+        const __m128i cols_0123 = _mm_unpacklo_epi32(even, odd);
+        const __m128i cols_4567 = _mm_unpackhi_epi32(even, odd);
+
+        const __m128i lo_rounded = _mm_srai_epi32(
+            _mm_add_epi32(cols_0123, rnd), conv_params->round_1);
+        const __m128i hi_rounded = _mm_srai_epi32(
+            _mm_add_epi32(cols_4567, rnd), conv_params->round_1);
+
+        // Narrow to 16 bits and clamp to [0, (1 << bd) - 1].
+        const __m128i packed = _mm_packs_epi32(lo_rounded, hi_rounded);
+        const __m128i out =
+            _mm_min_epi16(_mm_max_epi16(packed, zero), max_pixel);
+
+        _mm_storeu_si128((__m128i *)(dst + row * dst_stride + col), out);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
new file mode 100644
index 0000000000..3eee46faeb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+// Filters edge samples p[1] .. p[sz - 1] in place (3 taps for strength 1-2, 5 for 3).
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
+ if (!strength) return;  // strength 0: nothing to do
+
+ DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
+ { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
+ { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
+ { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },  // 4-byte windows at 0..3
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },  // 4-byte windows at 4..7
+ { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },  // 8-byte windows at 0, 1
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },  // lane indices
+ };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi8((char)p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);  // overwrites p[sz] .. p[sz + 15]
+
+ // Adjust input pointer for filter support area
+ uint8_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint8_t *out = p + 1;
+ int len = sz - 1;  // number of samples still to be written
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;  // at most 8 outputs per pass
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);  // windows for outputs 0..3
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);  // windows for outputs 4..7
+ d0 = _mm_maddubs_epi16(d0, coef0);  // two 16-bit partial sums per window
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // add the partial sums: 8 filtered values
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4; each kernel row sums to 16
+ d0 = _mm_packus_epi16(d0, d0);  // saturate to unsigned 8-bit
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);  // all-ones in lanes 0 .. n_out - 1
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // keep existing bytes past n_out
+ _mm_storel_epi64((__m128i *)out, out0);  // writes the low 8 bytes only
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);  // in0 = the 16 bytes starting at in + 8
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i two = _mm_set1_epi8(2);
+ __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);  // windows 0, 1
+ __m128i shuf_b = _mm_add_epi8(shuf_a, two);  // windows 2, 3
+ __m128i shuf_c = _mm_add_epi8(shuf_b, two);  // windows 4, 5
+ __m128i shuf_d = _mm_add_epi8(shuf_c, two);  // windows 6, 7
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;  // at most 8 outputs per pass
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
+ __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
+ __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
+ d0 = _mm_maddubs_epi16(d0, coef0);  // four 16-bit partial sums per window
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // two partial sums per window, windows 0..3
+ d2 = _mm_hadd_epi16(d2, d3);  // two partial sums per window, windows 4..7
+ d0 = _mm_hadd_epi16(d0, d2);  // one sum per window: 8 filtered values
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4; the kernel row sums to 16
+ d0 = _mm_packus_epi16(d0, d0);  // saturate to unsigned 8-bit
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);  // all-ones in lanes 0 .. n_out - 1
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // keep existing bytes past n_out
+ _mm_storel_epi64((__m128i *)out, out0);  // writes the low 8 bytes only
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);  // in0 = the 16 bytes starting at in + 8
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+// Upsamples the edge 2x in place from p[-2]: originals interleaved with 4-tap results.
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+ { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+ };
+
+ DECLARE_ALIGNED(
+ 16, static const int8_t,
+ v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint8_t *in = &p[-2];
+ uint8_t *out = &p[-2];  // same address as in: the result overwrites the input
+
+ int n = sz + 1; // Input length including upper-left sample
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);  // all input is read up front
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+ while (n > 0) {
+ __m128i in8 = _mm_alignr_epi8(in16, in0, 8);  // input bytes 8..23
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);  // 4-byte windows at 0..3
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);  // 4-byte windows at 4..7
+ __m128i d2 = _mm_shuffle_epi8(in8, shuf0);  // 4-byte windows at 8..11
+ __m128i d3 = _mm_shuffle_epi8(in8, shuf1);  // 4-byte windows at 12..15
+ d0 = _mm_maddubs_epi16(d0, coef0);  // unsigned samples times signed taps
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // one 16-bit sum per window, windows 0..7
+ d2 = _mm_hadd_epi16(d2, d3);  // one 16-bit sum per window, windows 8..15
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d2 = _mm_add_epi16(d2, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4; the taps sum to 16
+ d2 = _mm_srai_epi16(d2, 4);
+ d0 = _mm_packus_epi16(d0, d2);  // 16 interpolated bytes, saturated to 0..255
+ __m128i in1 = _mm_alignr_epi8(in16, in0, 1);  // input bytes 1..16
+ __m128i out0 = _mm_unpacklo_epi8(in1, d0);  // input byte, interpolated byte, ...
+ __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[16], out1);  // 32 bytes written per pass
+ in0 = in16;
+ in16 = _mm_setzero_si128();  // nothing further is loaded after the first pass
+ out += 32;
+ n -= 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Filters 16-bit edge samples p[1] .. p[sz - 1] in place (3 taps for strength 1-2, 5 for 3).
+void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
+ if (!strength) return;  // strength 0: nothing to do
+
+ DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
+ { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
+ { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
+ { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };  // lane indices
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi16(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);  // overwrites p[sz] .. p[sz + 7]
+
+ // Adjust input pointer for filter support area
+ uint16_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint16_t *out = p + 1;
+ int len = sz - 1;  // number of samples still to be written
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;  // at most 8 outputs per pass
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);  // input advanced by 1 sample
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);  // input advanced by 2 samples
+ __m128i in02 = _mm_add_epi16(in0, in2);  // the outer taps share one weight
+ __m128i d0 = _mm_unpacklo_epi16(in02, in1);  // (outer sum, centre) pairs
+ __m128i d1 = _mm_unpackhi_epi16(in02, in1);
+ d0 = _mm_mullo_epi16(d0, coef0);  // coef0 holds (outer, centre) weight pairs
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // sum each pair: 8 filtered values
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);  // (sum + 8) >> 4, logical shift
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);  // all-ones in lanes 0 .. n_out - 1
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // keep existing samples past n_out
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;  // reuse the block that was loaded before this pass's store
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;  // at most 8 outputs per pass
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);  // input advanced by 1 sample
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);  // input advanced by 2 samples
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);  // input advanced by 3 samples
+ __m128i in4 = _mm_alignr_epi8(in8, in0, 8);  // input advanced by 4 samples
+ __m128i in04 = _mm_add_epi16(in0, in4);  // the two outer taps (weight 2)
+ __m128i in123 = _mm_add_epi16(in1, in2);
+ in123 = _mm_add_epi16(in123, in3);  // the three inner taps (weight 4)
+ __m128i d0 = _mm_unpacklo_epi16(in04, in123);  // (outer sum, inner sum) pairs
+ __m128i d1 = _mm_unpackhi_epi16(in04, in123);
+ d0 = _mm_mullo_epi16(d0, coef0);  // coef0 holds (2, 4) weight pairs
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // sum each pair: 8 filtered values
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);  // (sum + 8) >> 4, logical shift
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);  // all-ones in lanes 0 .. n_out - 1
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // keep existing samples past n_out
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;  // reuse the block that was loaded before this pass's store
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+// Upsamples a 16-bit edge 2x in place from p[-2]; results are capped at (1 << bd) - 1.
+void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };  // (outer, inner) weights
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint16_t *in = &p[-2];
+ uint16_t *out = in;  // the result overwrites the input
+ int n = sz + 1;  // input length including p[-1]
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);  // 32 samples are read up front
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+ __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
+
+ while (n > 0) {
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);  // input advanced by 1 sample
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);  // input advanced by 2 samples
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);  // input advanced by 3 samples
+ __m128i sum0 = _mm_add_epi16(in0, in3);  // outer taps, weight -1
+ __m128i sum1 = _mm_add_epi16(in1, in2);  // inner taps, weight 9
+ __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);  // (outer sum, inner sum) pairs
+ __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ d0 = _mm_madd_epi16(d0, coef0);  // 32-bit result: 9 * inner - outer
+ d1 = _mm_madd_epi16(d1, coef0);
+ __m128i eight = _mm_set1_epi32(8);
+ d0 = _mm_add_epi32(d0, eight);
+ d1 = _mm_add_epi32(d1, eight);
+ d0 = _mm_srai_epi32(d0, 4);  // (sum + 8) >> 4, arithmetic shift
+ d1 = _mm_srai_epi32(d1, 4);
+ d0 = _mm_packus_epi32(d0, d1);  // unsigned saturation maps negatives to 0
+ __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
+ d0 = _mm_min_epi16(d0, max0);  // cap at the largest value for bit depth bd
+ __m128i out0 = _mm_unpacklo_epi16(in1, d0);  // input sample, interpolated, ...
+ __m128i out1 = _mm_unpackhi_epi16(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[8], out1);  // 16 samples written per pass
+ in0 = in8;  // slide the preloaded blocks down by 8 samples
+ in8 = in16;
+ in16 = in24;
+ in24 = _mm_setzero_si128();  // nothing further is loaded from memory
+ out += 16;
+ n -= 8;
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 0000000000..9f82ed2300
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "av1/common/convolve.h"
+
+// Returns a vector of interleaved 16-bit (fwd_offset, bck_offset) pairs.
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+ // Narrow each weight to int16_t and broadcast it across all sixteen lanes.
+ const __m256i fwd = _mm256_set1_epi16((int16_t)conv_params->fwd_offset);
+ const __m256i bck = _mm256_set1_epi16((int16_t)conv_params->bck_offset);
+ // Interleaving the two broadcasts gives fwd, bck, fwd, bck, ... in every lane.
+ return _mm256_unpacklo_epi16(fwd, bck);
+}
+
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+ // Unaligned 16-byte loads: a lands in the low 128 bits, b in the high 128 bits.
+ const __m256i row_a = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a));
+ return _mm256_inserti128_si256(row_a, _mm_loadu_si128((__m128i *)b), 1);
+}
+// Horizontal convolution of 8-bit src; output goes to conv_params->dst, or averaged into dst0.
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // 16-bit intermediate buffer
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;  // bit depth of the uint8_t input
+ int i, j, is_horiz_4tap = 0;
+ const int bits = FILTER_BITS - conv_params->round_1;  // left shift after rounding
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);  // added before storing
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert(bits >= 0);
+ assert(conv_params->round_0 > 0);
+
+ // NOTE(review): shifting by round_0 - 1 suggests the lowbd taps are pre-halved; confirm.
+ const __m256i round_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ __m256i filt[4], coeffs[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
+
+ // Take the 4-tap path when the low 32 bits of coeffs[0] | coeffs[3] are zero
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
+
+ // horz_filt as 4 tap
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;  // filter window starts one sample left of src
+ const uint8_t *const src_ptr = src - fo_horiz;
+ for (i = 0; i < h; i += 2) {  // two rows per iteration
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {  // eight columns per step
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ if (w > 4) {  // 8 output bytes per row
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {  // narrow block: 4 output bytes per row
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {  // no averaging: keep the 16-bit results in dst (aligned stores)
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ for (i = 0; i < h; i += 2) {  // two rows per iteration
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {  // eight columns per step
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ if (w > 4) {  // 8 output bytes per row
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {  // narrow block: 4 output bytes per row
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {  // no averaging: keep the 16-bit results in dst (aligned stores)
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+// Vertical convolution of 8-bit src; output goes to conv_params->dst, or averaged into dst0.
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // 16-bit intermediate buffer
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;  // bit depth of the uint8_t input
+ int i, j, is_vert_4tap = 0;
+ // +1 to compensate for dividing the filter coeffs by 2
+ const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int offset_1 = (1 << (bd + FILTER_BITS - 2));
+ const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);  // added before widening
+ const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));  // added after packing
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i coeffs[4], s[8];
+
+ assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+
+ // Take the 4-tap path when the low 32 bits of coeffs[0] | coeffs[3] are zero
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_vert_4tap) {
+ const int fo_vert = 1;  // filter window starts one row above src
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {  // 16-column strips
+ const uint8_t *data = &src_ptr[j];
+ __m256i src4;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[4];
+ __m256i src_a[5];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 4; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src4 = src_a[4];  // last preloaded row, paired with new rows in the loop
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);  // s[0..2]: columns 0..7
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+ s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);  // s[3..5]: columns 8..15
+ s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ }
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration
+ data = &src_ptr[(i + 5) * src_stride + j];
+ const __m256i src5 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
+
+ src4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);  // zero-extend
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);  // zero-extend
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {  // narrow strip: only the low 8 columns are stored
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ if (w - j > 4) {  // 8 output bytes per row
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {  // 4 output bytes per row
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {  // no averaging: keep the 16-bit results in dst
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full strip: also filter columns 8..15
+ __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i, 16 bytes
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // aligned store
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {  // no averaging: keep the 16-bit results in dst
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];  // slide the row window down by two rows
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {  // 16-column strips
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];  // last preloaded row, paired with new rows in the loop
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);  // s[0..3]: columns 0..7
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);  // s[4..7]: columns 8..15
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);  // zero-extend
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);  // zero-extend
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {  // narrow strip: only the low 8 columns are stored
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ if (w - j > 4) {  // 8 output bytes per row
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {  // 4 output bytes per row
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {  // no averaging: keep the 16-bit results in dst
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full strip: also filter columns 8..15
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // row i, 16 bytes
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // row i + 1
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // aligned store
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {  // no averaging: keep the 16-bit results in dst
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];  // slide the row window down by two rows
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+/*
+ * 2D (horizontal, then vertical) convolution for the dist-wtd compound
+ * path, AVX2, with bd fixed at 8.
+ *
+ * Work is done in column strips of 8 (j += 8). For each strip the
+ * horizontal pass leaves 16-bit intermediates in im_block (im_stride == 8)
+ * and the vertical pass consumes them. When conv_params->do_average is 0
+ * the offset 16-bit result is stored to conv_params->dst; otherwise it is
+ * combined with what is already in conv_params->dst through comp_avg(),
+ * rounded by convolve_rounding(), packed to 8 bits and written to dst0.
+ *
+ * One of three paths is chosen from the prepared coefficients:
+ *   is_horiz_4tap - inline 4-tap horizontal pass + shared vertical macro
+ *   is_vert_4tap  - shared horizontal macro + inline 4-tap vertical pass
+ *   otherwise     - both shared 8-tap macros
+ *
+ * NOTE(review): DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP and
+ * DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP are defined outside this section
+ * and presumably capture the locals declared here by name; confirm before
+ * renaming any of them.
+ */
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;  // bit depth used in the offset/rounding math below
+ // 16-bit intermediate rows for one 8-column strip (im_stride == 8).
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+
+ int im_stride = 8;
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ // The horizontal pass shifts by round_0 - 1, so round_0 must be >= 1.
+ assert(conv_params->round_0 > 0);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ __m256i filt[4], coeffs_x[4], coeffs_y[4];
+ // filt[0..1] are always loaded; filt[2..3] only on the 8-tap horizontal paths.
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ /* Horizontal taps: when the first dword of coeffs_x[0] | coeffs_x[3] is
+  * zero, only coeffs_x[1..2] are needed and the 4-tap path can be used. */
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+ is_horiz_4tap = 1;
+
+ /* Vertical taps: same test on coeffs_y[0] | coeffs_y[3]. Note that the
+  * horizontal test takes priority in the if/else chain below. */
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;  // start one column to the left of src
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter: two source rows per iteration, row i in the low
+  * 128-bit lane and row i + 1 (when it exists) in the high lane. */
+ const uint8_t *src_h = src_ptr + j;
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;  // h output rows plus 3 extra rows for the 4 taps
+ const int fo_vert = 1;  // start one row above src
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter (shared macro; presumably fills im_block using
+  * src_h, im_h and the rounding constants above - defined elsewhere). */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ /* Vertical filter (4-tap): s[0..2] hold row pairs interleaved from the
+  * low halves (unpacklo), s[3..5] the matching high halves (unpackhi).
+  * Each loop iteration produces two output rows. */
+ __m256i s[6];
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+ const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+ if (w - j > 4) {  // more than 4 columns remain in this strip
+ const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ // 8 output bytes per row: row i from the low lane, row i + 1 from the high.
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // at most 4 columns remain: only res_a is computed
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ // 4 output bytes per row, written through an int-typed lvalue.
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ } else {  // NOTE(review): still a full 128-bit store per row here
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];  // slide the row-pair window down by two rows
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter, then vertical filter, both via the shared macros. */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
+ }
+}
+/*
+ * Converts four 16-pixel runs of src to the offset 16-bit format and stores
+ * them to dst. Run k starts at row rK, column cK relative to the current
+ * src / dst pointers. Per run: zero-extend 16 bytes to 16 bits, shift left
+ * by LEFT_SHIFT, add offset_const, then an aligned 256-bit store.
+ *
+ * Expects src, src_stride, dst, dst_stride, offset_const and the __m256i
+ * temporaries src_0..src_3 to be in scope where the macro is expanded;
+ * LEFT_SHIFT must be defined by the time of expansion.
+ */
+#define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
+ _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
+ _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
+ _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
+ } while (0)
+/*
+ * LEFT_SHIFT is the left shift applied to source pixels by the copy
+ * kernels below; the literals 3 and 7 presumably stand for the round_0 and
+ * round_1 values that av1_dist_wtd_convolve_2d_copy_avx2() asserts.
+ *
+ * av1_dist_wtd_convolve_2d_no_avg_copy_avx2: copy case without averaging.
+ * Writes (src << LEFT_SHIFT) + offset_const as 16-bit values to dst.
+ *   src, src_stride - 8-bit source block and its row stride
+ *   dst, dst_stride - CONV_BUF_TYPE destination and its row stride
+ *   w, h            - block width and height
+ *   offset_const    - value added to every shifted sample
+ *
+ * Widths 128 / 64 / 32 / 16 are handled 1 / 1 / 2 / 4 rows per iteration;
+ * any other width >= 16 writes nothing. Widths below 16 are handled 4 rows
+ * per iteration with one aligned 128-bit store per row. The do/while loops
+ * count i down to exactly 0, so h must be a multiple of the rows consumed
+ * per iteration (the public caller asserts h % 4 == 0).
+ */
+#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
+static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
+ const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
+ int w, int h, const __m256i offset_const) {
+ int i = h;  // rows still to be processed
+ if (w >= 16) {
+ __m256i src_0, src_1, src_2, src_3;  // scratch used by the macro
+ if (w == 128) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 64) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+ src += 1 * src_stride;
+ dst += 1 * dst_stride;
+ i -= 1;
+ } while (i);
+ } else if (w == 32) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
+ src += 2 * src_stride;
+ dst += 2 * dst_stride;
+ i -= 2;
+ } while (i);
+ } else if (w == 16) {
+ do {
+ DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+ } else {  // w < 16: 8 source bytes are loaded per row
+ const __m256i zero = _mm256_setzero_si256();
+ do {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
+ const __m128i src_row_2 =
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
+ const __m128i src_row_3 =
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
+ // Pair rows (0, 1) and (2, 3) into the low/high lanes of one register.
+ __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+ __m256i src_32 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_2), src_row_3, 1);
+ // Zero-extend the 8 bytes of each lane to 16 bits.
+ src_10 = _mm256_unpacklo_epi8(src_10, zero);
+ src_32 = _mm256_unpacklo_epi8(src_32, zero);
+
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
+
+ src_10 = _mm256_add_epi16(src_10, offset_const);
+ src_32 = _mm256_add_epi16(src_32, offset_const);
+
+ // Store the four rows to the destination buffer (nothing is averaged here)
+ _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
+ _mm256_castsi256_si128(src_10));
+ _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
+ _mm256_extracti128_si256(src_10, 1));
+ _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
+ _mm256_castsi256_si128(src_32));
+ _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
+ _mm256_extracti128_si256(src_32, 1));
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ i -= 4;
+ } while (i);
+ }
+}
+/*
+ * Averaging counterpart of the 4x16 copy step. For each of four 16-pixel
+ * runs (run k at row rK, column cK):
+ *   1. load 16 src bytes, zero-extend, shift left by LEFT_SHIFT and add
+ *      offset_const;
+ *   2. load the 16 values already stored at the same position in dst;
+ *   3. combine both with comp_avg(..., USE_DIST_WEIGHTED) and round with
+ *      convolve_rounding().
+ * Runs 0/1 and 2/3 are then packed to 8 bits. _mm256_packus_epi16 packs
+ * within each 128-bit lane, so the 0xD8 permute (quadword order 0, 2, 1, 3)
+ * regroups the bytes: the low 128 bits hold the first run of the pair and
+ * the high 128 bits the second. Each run is written to dst0 with an aligned
+ * 128-bit store.
+ *
+ * Expects src, src_stride, dst, dst_stride, dst0, dst_stride0, wt,
+ * offset_const, rounding_const, rounding_shift and the __m256i temporaries
+ * src_0..3, ref_0..3, res_0..3, res_10, res_32 at the expansion site.
+ */
+#define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \
+ do { \
+ src_0 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+ src_1 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+ src_2 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+ src_3 = _mm256_cvtepu8_epi16( \
+ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ src_0 = _mm256_add_epi16(src_0, offset_const); \
+ src_1 = _mm256_add_epi16(src_1, offset_const); \
+ src_2 = _mm256_add_epi16(src_2, offset_const); \
+ src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+ ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \
+ ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \
+ ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \
+ ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \
+ \
+ res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \
+ res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \
+ res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \
+ res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_10 = _mm256_packus_epi16(res_0, res_1); \
+ res_32 = _mm256_packus_epi16(res_2, res_3); \
+ res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \
+ res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \
+ \
+ _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \
+ _mm256_castsi256_si128(res_10)); \
+ _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \
+ _mm256_extracti128_si256(res_10, 1)); \
+ _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \
+ _mm256_castsi256_si128(res_32)); \
+ _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \
+ _mm256_extracti128_si256(res_32, 1)); \
+ } while (0)
+/*
+ * Whole-block averaging copy, parameterised on the USE_DIST_WEIGHTED value
+ * that is forwarded to comp_avg().
+ *
+ * This is a bare statement sequence (it declares `int i` and is not wrapped
+ * in do { } while (0)), so it has to be expanded inside a scope of its own.
+ * It reads and advances src, dst and dst0, and expects w, h, src_stride,
+ * dst_stride, dst_stride0, wt, zero, offset_const, rounding_const and
+ * rounding_shift to be in scope.
+ *
+ * Shapes handled:
+ *   w >= 16 - DO_AVG_2D_COPY_4X16 with 1 (w 128/64), 2 (w 32) or 4 (w 16,
+ *             asserted) rows per iteration;
+ *   w == 8  - 4 rows per iteration; after packing, the low 128 bits hold
+ *             rows 0 and 2 and the high 128 bits rows 1 and 3 (hence the
+ *             res_20 / res_31 names and the storel / storeh pairs);
+ *   w == 4  - asserted; 4 rows per iteration, each written as one 32-bit
+ *             value through an int-typed lvalue.
+ * Every loop counts i down to exactly 0, so h must be a multiple of the
+ * rows consumed per iteration.
+ */
+#define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \
+ int i = h; \
+ if (w >= 16) { \
+ __m256i src_0, src_1, src_2, src_3; \
+ __m256i ref_0, ref_1, ref_2, ref_3; \
+ __m256i res_0, res_1, res_2, res_3; \
+ __m256i res_10, res_32; \
+ if (w == 128) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 64) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \
+ \
+ i -= 1; \
+ src += 1 * src_stride; \
+ dst += 1 * dst_stride; \
+ dst0 += 1 * dst_stride0; \
+ } while (i); \
+ } else if (w == 32) { \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \
+ \
+ i -= 2; \
+ src += 2 * src_stride; \
+ dst += 2 * dst_stride; \
+ dst0 += 2 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 16); \
+ do { \
+ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \
+ \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } \
+ } else if (w == 8) { \
+ do { \
+ const __m128i src_0 = \
+ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \
+ const __m128i src_1 = \
+ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \
+ const __m128i src_2 = \
+ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \
+ const __m128i src_3 = \
+ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \
+ __m256i src_10 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \
+ __m256i src_32 = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \
+ \
+ src_10 = _mm256_unpacklo_epi8(src_10, zero); \
+ src_32 = _mm256_unpacklo_epi8(src_32, zero); \
+ \
+ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \
+ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \
+ \
+ src_10 = _mm256_add_epi16(src_10, offset_const); \
+ src_32 = _mm256_add_epi16(src_32, offset_const); \
+ \
+ const __m256i ref_10 = \
+ load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \
+ const __m256i ref_32 = \
+ load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \
+ __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \
+ __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \
+ rounding_shift); \
+ res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ __m256i res = _mm256_packus_epi16(res_10, res_32); \
+ const __m128i res_20 = _mm256_castsi256_si128(res); \
+ const __m128i res_31 = _mm256_extracti128_si256(res, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \
+ _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \
+ _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \
+ _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ } else { \
+ assert(w == 4); \
+ do { \
+ __m256i src_3210_8bit = \
+ _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \
+ loadu_int32(src + 1 * src_stride), 0, 0, \
+ loadu_int32(src + 2 * src_stride), \
+ loadu_int32(src + 3 * src_stride), 0, 0); \
+ \
+ __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \
+ src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \
+ src_3210 = _mm256_add_epi16(src_3210, offset_const); \
+ \
+ __m256i ref_3210 = \
+ _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \
+ *(int64_t *)(dst + 1 * dst_stride), \
+ *(int64_t *)(dst + 2 * dst_stride), \
+ *(int64_t *)(dst + 3 * dst_stride)); \
+ __m256i res_3210 = \
+ comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \
+ \
+ res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \
+ rounding_shift); \
+ \
+ res_3210 = _mm256_packus_epi16(res_3210, res_3210); \
+ const __m128i res_10 = _mm256_castsi256_si128(res_3210); \
+ const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \
+ \
+ *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \
+ *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \
+ *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \
+ *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \
+ i -= 4; \
+ src += 4 * src_stride; \
+ dst += 4 * dst_stride; \
+ dst0 += 4 * dst_stride0; \
+ } while (i); \
+ }
+/*
+ * Copy case of the dist-wtd compound convolve, AVX2, bd fixed at 8: no
+ * filter parameters are taken and source pixels are only shifted and offset.
+ *
+ *   src, src_stride   - 8-bit source block
+ *   dst0, dst_stride0 - 8-bit output, written only when do_average is set
+ *   w, h              - block size, both asserted to be multiples of 4
+ *   conv_params       - supplies the CONV_BUF_TYPE buffer (dst, dst_stride),
+ *                       do_average, use_dist_wtd_comp_avg, round_0 (must be
+ *                       3) and round_1 (must be 7)
+ *
+ * do_average == 0: the shifted/offset source is stored to conv_params->dst.
+ * do_average != 0: it is combined with conv_params->dst and the 8-bit
+ * result goes to dst0. DO_AVG_2D_COPY is expanded twice with a literal
+ * flag, presumably so comp_avg() can be specialised at compile time.
+ */
+void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w,
+ int h, ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ assert(conv_params->round_0 == 3);  // LEFT_SHIFT hard-codes 3 and 7
+ assert(conv_params->round_1 == 7);
+ assert(w % 4 == 0);
+ assert(h % 4 == 0);  // the copy loops step up to 4 rows at a time
+
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const __m256i zero = _mm256_setzero_si256();  // read by DO_AVG_2D_COPY
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+ if (do_average) {
+ if (use_dist_wtd_comp_avg) {
+ DO_AVG_2D_COPY(1)
+ } else {
+ DO_AVG_2D_COPY(0)
+ }
+ } else {
+ av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
+ w, h, offset_const);
+ }
+}
+#undef LEFT_SHIFT
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 0000000000..8c5d9918fb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,606 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+/*
+ * Horizontal-only dist-wtd compound convolve, SSE2, bd fixed at 8.
+ *
+ * Per output sample: filter horizontally, round and shift right by
+ * round_0, shift left by FILTER_BITS - round_1, saturate to 16 bits and
+ * add offset_const. With do_average == 0 that value is stored to
+ * conv_params->dst; otherwise it is combined with the value already in
+ * conv_params->dst via comp_avg(), rounded by convolve_rounding(), packed
+ * to 8 bits and written to dst0.
+ *
+ * Two shapes are handled: w == 4 (one row per iteration, loop ends when h
+ * reaches 0) and w a multiple of 8 (asserted; 8 columns per inner step).
+ * Reads start filter_params_x->taps / 2 - 1 columns to the left of src and
+ * always fetch 16 bytes.
+ */
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);  // w0, w1 interleaved
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ __m128i coeffs[4];
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+ if (w == 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+ // s[k] interleaves the bytes at shifts 2k and 2k + 1 of the source row.
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);  // 4 output bytes
+ } else {
+ _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+ }
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;  // row index; j below is the column index
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+ const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
+/*
+ * Vertical-only dist-wtd compound convolve, SSE2, bd fixed at 8.
+ *
+ * Per output sample: filter vertically, shift left by
+ * FILTER_BITS - round_0, round and shift right by round_1, saturate to 16
+ * bits and add offset_const. With do_average == 0 that value is stored to
+ * conv_params->dst; otherwise it is combined with the value already in
+ * conv_params->dst via comp_avg(), rounded by convolve_rounding(), packed
+ * to 8 bits and written to dst0.
+ *
+ * s[k] holds source rows k and k + 1 byte-interleaved. Rows 0..6 are loaded
+ * up front; each loop iteration loads rows 7 and 8, produces two output
+ * rows (from s + 0 and s + 1) and then slides the window down by two rows.
+ * The w == 4 loop runs until h reaches exactly 0 in steps of 2, so h must
+ * be even; the wide path (w a multiple of 8, asserted) also emits rows in
+ * pairs. Reads start filter_params_y->taps / 2 - 1 rows above src.
+ */
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+ const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);  // wt0, wt1 interleaved
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ __m128i coeffs[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+ if (w == 4) {
+ __m128i s[8], src6, res, res_shift;
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
+ // First output row of the pair: window starting at s[0].
+ res = convolve_lo_y(s + 0, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ // Second output row of the pair: window starting at s[1].
+ res = convolve_lo_y(s + 1, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ res_16b = _mm_packs_epi32(res_shift, res_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ // Slide the row-pair window down by two source rows.
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;  // column of the current 8-wide strip
+ do {
+ __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;  // output row; advanced twice per iteration
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+ // Second output row of the pair: window starting at s[1].
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+ res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Average with the values already in dst (-> dst0), or just store to dst
+ if (do_average) {
+ __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+ // Slide the row-pair window down by two source rows.
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
+
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // read when averaging, written otherwise
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ // Two passes: a horizontal filter fills im_block, then a vertical filter reads it.
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;  // rows produced by the horizontal pass
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ // wt interleaves fwd_offset and bck_offset as 16-bit pairs; it is passed to comp_avg().
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ // offset_const is added to every vertical-pass result and is also passed to convolve_rounding().
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter: writes im_h rows of 16-bit results into im_block */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+ // Replicate each adjacent tap pair across a whole register for _mm_madd_epi16.
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // Rounding term for the round_0 shift plus a (1 << (bd + FILTER_BITS - 1)) bias.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i temp_lo, temp_hi;
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ // Zero-extend the 16 loaded bytes into two vectors of eight 16-bit pixels.
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels; src_N is the 8-pixel window starting at pixel N
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 4);
+ temp_hi = _mm_slli_si128(src_hi, 12);
+ const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 8);
+ temp_hi = _mm_slli_si128(src_hi, 8);
+ const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 12);
+ temp_hi = _mm_slli_si128(src_hi, 4);
+ const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels (same scheme, windows starting at odd pixels)
+ temp_lo = _mm_srli_si128(src_lo, 2);
+ temp_hi = _mm_slli_si128(src_hi, 14);
+ const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ temp_lo = _mm_srli_si128(src_lo, 6);
+ temp_hi = _mm_slli_si128(src_hi, 10);
+ const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ temp_lo = _mm_srli_si128(src_lo, 10);
+ temp_hi = _mm_slli_si128(src_hi, 6);
+ const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ temp_lo = _mm_srli_si128(src_lo, 14);
+ temp_hi = _mm_slli_si128(src_hi, 2);
+ const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter: combines 8 consecutive im_block rows per output row */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // NOTE(review): the subtracted term presumably cancels the horizontal-pass bias (if taps sum to 1 << FILTER_BITS) - confirm.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels (low half of each stored row holds columns 0, 2, 4, 6)
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels (high half of each stored row holds columns 1, 3, 5, 7)
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Averaging path: combine with dst and write 8-bit pixels to dst0; otherwise store the 16-bit result to dst
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ // Write the low 64 bits of res_8 when w > 4, otherwise only the low 32 bits.
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);  // aligned 16-byte store
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 0000000000..f6bf67815d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_dist_wtd_convolve_2d_ssse3(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int subpel_y_qn, ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // read when averaging, written otherwise
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ // Two passes: a horizontal filter fills im_block, then a vertical filter reads it.
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;  // rows produced by the horizontal pass
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ // wt interleaves fwd_offset and bck_offset as 16-bit pairs; it is passed to comp_avg().
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ // offset_const is added to every vertical-pass result and is also passed to convolve_rounding().
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter: writes im_h rows of 16-bit results into im_block */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+ // Replicate each adjacent tap pair across a whole register for _mm_madd_epi16.
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // Rounding term for the round_0 shift plus a (1 << (bd + FILTER_BITS - 1)) bias.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ // Zero-extend the 16 loaded bytes into two vectors of eight 16-bit pixels.
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels; src_N is the 8-pixel window starting at pixel N
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);  // byte shift 4 = 2 pixels
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels (same scheme, windows starting at odd pixels)
+ const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter: combines 8 consecutive im_block rows per output row */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // NOTE(review): the subtracted term presumably cancels the horizontal-pass bias (if taps sum to 1 << FILTER_BITS) - confirm.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels (low half of each stored row holds columns 0, 2, 4, 6)
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels (high half of each stored row holds columns 1, 3, 5, 7)
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Averaging path: combine with dst and write 8-bit pixels to dst0; otherwise store the 16-bit result to dst
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ // Write the low 64 bits of res_8 when w > 4, otherwise only the low 32 bits.
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);  // aligned 16-byte store
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 0000000000..71fab7a577
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+ const __m256i s1) {
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));  // |s0 - s1| per 16-bit lane
+ return _mm256_abs_epi16(
+ _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));  // |mask_base + (diff >> 4)|
+ // clamp(mask, 0, 64) can be skipped: for zero-extended 8-bit inputs diff >> 4 is at most 15, so a base of 38 yields a value in [38, 53]
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);  // 38, or 38 - AOM_BLEND_A64_MAX_ALPHA for the inverse type
+ int i = 0;  // row counter shared by every width branch
+ if (4 == w) {  // 4 rows (16 mask bytes) per iteration
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+ // Gather and widen the same four rows of the second source.
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));  // 0xd8 brings the two packed 64-bit halves together
+ xx_storeu_128(mask, x_m8);
+ src0 += (src0_stride << 2);
+ src1 += (src1_stride << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {  // 4 rows (32 mask bytes) per iteration
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));  // holds rows A and C despite the name
+ const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));  // holds rows B and D despite the name
+ const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
+ const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);  // per-lane pack yields rows A, B, C, D in order
+ yy_storeu_256(mask, m8);
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {  // 2 rows (32 mask bytes) per iteration
+ do {
+ const __m128i s0A = xx_load_128(src0);
+ const __m128i s0B = xx_load_128(src0 + src0_stride);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + src1_stride);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {  // other widths: one row per outer step, 32 columns per inner step; assumes w is a multiple of 32 - TODO confirm
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;  // the mask is stored contiguously, w bytes per row
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff, int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);  // saturates to 0 where src1 > src0
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);  // saturates to 0 where src0 > src1
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);  // unsigned |src0 - src1| per 16-bit lane
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);  // add round_const, then shift right by round
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);  // signed saturating add of the mask base
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);  // upper bound at clip_diff
+ return diff_clamp;  // per-lane mask value, still 16 bits wide
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff,
+ int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);  // saturates to 0 where src1 > src0
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);  // saturates to 0 where src0 > src1
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);  // unsigned |src0 - src1| per 16-bit lane
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);  // add round_const, then shift right by round
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);  // signed saturating add of the mask base
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);  // upper bound at clip_diff
+ const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);  // inverse mask: clip_diff - clamped value
+ return diff_const_16;
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // rounding term for the right shift by `shift`
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  // upper clamp handed to calc_mask_d16_avx2
+ int i = 0;  // row counter shared by every width branch
+ if (w == 4) {  // 4 rows (16 mask bytes) per iteration
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));  // 0xd8 brings the two packed 64-bit halves together
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {  // 4 rows (32 mask bytes) per iteration
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {  // 2 rows (32 mask bytes) per iteration
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {  // 1 row per iteration, loaded as two vectors 16 elements apart
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {  // 1 row per iteration, loaded as four vectors 16 elements apart
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {  // reads 128 elements per row regardless of w - presumably only reached for w == 128; confirm against callers
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+// Inverse-mask variant of the d16 mask builder: per width case, loads rows of src0/src1, runs calc_mask_d16_inv_avx2 on them, packs the 16-bit results to bytes and stores them to a densely packed mask (w bytes per row).
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,  // strides are in CONV_BUF_TYPE elements (added to the typed pointers)
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {  // shift is forwarded to calc_mask_d16_inv_avx2 together with the constants below
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // half of (1 << shift) in every 16-bit lane
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;  // first row handled by the current loop iteration
+ if (w == 4) {  // four rows per iteration -> 16 mask bytes
+ do {
+ const __m128i s0A = xx_loadl_64(src0);  // presumably a 64-bit load, i.e. one 4-sample row — confirm in synonyms.h
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),  // rows C|D and A|B fill the two 128-bit halves (presumably hi, lo order — confirm)
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());  // per 128-bit lane: 8 mask bytes followed by 8 zero bytes
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));  // 0xd8 = qword order 0,2,1,3: gathers the 16 mask bytes into the low 128 bits
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {  // four rows per iteration -> 32 mask bytes
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);  // two 8-sample rows in one 256-bit register
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);  // packus works per 128-bit lane, so the qwords come out interleaved
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));  // 0xd8 swaps the two middle qwords to undo that interleave
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {  // two rows per iteration -> 32 mask bytes
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {  // one row per iteration, two 16-sample halves
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {  // one row per iteration, four 16-sample quarters
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {  // remaining width: 128 samples are read and 128 mask bytes written per row
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+// AVX2 entry point for the diff-weighted compound mask built from the 16-bit
+// (CONV_BUF_TYPE) intermediate predictions. Derives the rounding shift from
+// the convolve parameters and the bit depth, then hands off to the regular or
+// the inverted mask builder depending on mask_type.
+void av1_build_compound_diffwtd_mask_d16_avx2(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int total_round = conv_params->round_0 + conv_params->round_1;
+  const int shift = 2 * FILTER_BITS - total_round + (bd - 8);
+
+  // Adding the rounding constant may overflow, but that much precision is not
+  // required. The code should also work for other values of DIFF_FACTOR_LOG2
+  // and AOM_BLEND_A64_MAX_ALPHA, although corner-case bugs are possible there.
+  assert(DIFF_FACTOR_LOG2 == 4);
+  assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+  if (mask_type != DIFFWTD_38) {
+    build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+                                             src1_stride, h, w, shift);
+    return;
+  }
+  build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+                                       src1_stride, h, w, shift);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth diff-weighted mask (AVX2): w < 16 is delegated to the SSSE3 version; otherwise 16 samples per step give m = min(max(0, 38 + (|s0 - s1| >> shift)), AOM_BLEND_A64_MAX_ALPHA), stored as m or, for DIFFWTD_38_INV, AOM_BLEND_A64_MAX_ALPHA - m.
+void av1_build_compound_diffwtd_mask_highbd_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 16) {  // too narrow for a 16-sample step: use the SSSE3 implementation
+ av1_build_compound_diffwtd_mask_highbd_ssse3(
+ mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+ } else {
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ assert(bd >= 8);
+ assert((w % 16) == 0);
+ const __m256i y0 = _mm256_setzero_si256();
+ const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+ _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);  // samples are read as uint16_t through these pointers
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {  // shift count is the compile-time constant DIFF_FACTOR_LOG2
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);  // inverted mask: AOM_BLEND_A64_MAX_ALPHA - m
+ m = _mm256_packus_epi16(m, m);  // 16-bit -> 8-bit; each 128-bit lane now holds its 8 bytes twice
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));  // qwords 0 and 2 -> low 128 bits = 16 mask bytes in order
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;  // mask rows are stored back to back
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {  // bd > 8: shift count depends on bd, so it is passed in a register
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);  // run-time shift count consumed by _mm256_sra_epi16
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 0000000000..eb4a4d1da3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+// Computes |mask_base + (|s0 - s1| >> 4)| on eight 16-bit lanes.
+// An explicit clamp to [0, 64] can be skipped: with 8-bit samples the shifted
+// difference is at most 15, so for the bases used by the caller in this file
+// (38, or 38 - 64 for the inverted mask) the result already lies in range.
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
+                                const __m128i s1) {
+  const __m128i abs_diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+  const __m128i scaled = _mm_srli_epi16(abs_diff, 4);
+  const __m128i biased = _mm_add_epi16(mask_base, scaled);
+  return _mm_abs_epi16(biased);
+}
+// 8-bit diff-weighted mask (SSE4.1): writes h rows of w mask bytes, stored back to back, each computed by calc_mask from the matching src0/src1 pixels; DIFFWTD_38_INV is obtained by biasing mask_base.
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m128i mask_base = _mm_set1_epi16(38 - mb);  // negative base for the inverted mask; calc_mask takes the absolute value of base + diff
+ int i = 0;
+ if (4 == w) {  // two 4-pixel rows per iteration
+ do {
+ const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0);  // 4 pixels read through an int-typed access
+ const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0));
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0 = _mm_cvtepu8_epi16(s0AB);  // zero-extend the 8 pixels to 16-bit lanes
+
+ const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1));
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+ *(int *)mask = _mm_cvtsi128_si32(m8);  // first row's 4 mask bytes
+ *(int *)(mask + w) = _mm_extract_epi32(m8, 1);  // second row's 4 mask bytes
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += 8;
+ i += 2;
+ } while (i < h);
+ } else if (8 == w) {  // one 8-pixel row per iteration
+ do {
+ __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+ __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+ s0 = _mm_cvtepu8_epi16(s0);
+ s1 = _mm_cvtepu8_epi16(s1);
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+ _mm_storel_epi64((__m128i *)mask, m8);
+ src0 += stride0;
+ src1 += stride1;
+ mask += 8;
+ i += 1;
+ } while (i < h);
+ } else {  // wider blocks: 16 pixels per inner step
+ const __m128i zero = _mm_setzero_si128();
+ do {
+ int j = 0;
+ do {
+ const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));  // aligned load: src0 + j must be 16-byte aligned
+ const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));  // aligned load: src1 + j must be 16-byte aligned
+ const __m128i s0L = _mm_cvtepu8_epi16(s0);  // low 8 pixels, zero-extended
+ const __m128i s1L = _mm_cvtepu8_epi16(s1);
+ const __m128i s0H = _mm_unpackhi_epi8(s0, zero);  // high 8 pixels, zero-extended
+ const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+ const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+ const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+ const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+ _mm_store_si128((__m128i *)(mask + j), m8);  // aligned store: mask + j must be 16-byte aligned
+ j += 16;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+// Diff-weighted mask from 16-bit CONV_BUF_TYPE predictions (SSE4.1): per sample, m = min(38 + ((|s0 - s1| + rounding) >> round >> DIFF_FACTOR_LOG2), AOM_BLEND_A64_MAX_ALPHA), stored as m for DIFFWTD_38 and AOM_BLEND_A64_MAX_ALPHA - m otherwise.
+void av1_build_compound_diffwtd_mask_d16_sse4_1(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
+ const int mask_base = 38;
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);  // half of (1 << round): rounding offset for the shift below
+ const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+ const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i add_const =
+ _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+ const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));  // -1 makes _mm_sign_epi16 negate the clamped value
+
+ int i, j;
+ // When rounding constant is added, there is a possibility of overflow.
+ // However that much precision is not required. Code should very well work for
+ // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
+ // there is a possibility of corner case bugs.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {  // 8 samples are loaded per step, even when w == 4
+ const __m128i data_src0 =
+ _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+ const __m128i data_src1 =
+ _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+ const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);  // saturates to 0 where src1 > src0
+ const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);  // saturates to 0 where src0 > src1
+ const __m128i diff = _mm_max_epu16(diffa, diffb);  // unsigned |src0 - src1|
+ const __m128i diff_round =
+ _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+ const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+ __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
+ // clamp to 0 can be skipped since we are using add and saturate
+ // instruction
+
+ const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
+ const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);  // m for DIFFWTD_38, AOM_BLEND_A64_MAX_ALPHA - m for the inverse
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+ // Store values into the destination buffer
+ __m128i *const dst = (__m128i *)&mask[i * w + j];  // mask rows are w bytes apart
+
+ if ((w - j) > 4) {  // at least 8 columns remain: store 8 mask bytes
+ _mm_storel_epi64(dst, res_8);
+ } else { // w==4
+ *(int *)dst = _mm_cvtsi128_si32(res_8);  // store only the 4 valid mask bytes
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 0000000000..c9a3709a62
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#include <tmmintrin.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+// High-bitdepth diff-weighted mask (SSSE3): w < 8 is delegated to the C version; otherwise 8 samples per step give m = min(max(0, 38 + (|s0 - s1| >> shift)), AOM_BLEND_A64_MAX_ALPHA), stored as m or, for DIFFWTD_38_INV, AOM_BLEND_A64_MAX_ALPHA - m.
+void av1_build_compound_diffwtd_mask_highbd_ssse3(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 8) {  // too narrow for an 8-sample step: use the C implementation
+ av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+ src1, src1_stride, h, w, bd);
+ } else {
+ assert(bd >= 8);
+ assert((w % 8) == 0);
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ const __m128i x0 = _mm_setzero_si128();
+ const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+ _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m128i xmask_base = _mm_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);  // samples are read as uint16_t through these pointers
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {  // shift count is the compile-time constant DIFF_FACTOR_LOG2
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);  // inverted mask: AOM_BLEND_A64_MAX_ALPHA - m
+ m = _mm_packus_epi16(m, m);  // 16-bit -> 8-bit; the low 8 bytes hold the 8 mask values
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;  // mask rows are stored back to back
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {  // bd > 8: shift count depends on bd, so it is passed in a register
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);  // run-time shift count consumed by _mm_sra_epi16
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/resize_ssse3.c b/third_party/aom/av1/common/x86/resize_ssse3.c
new file mode 100644
index 0000000000..a7fdb5a9a4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/resize_ssse3.c
@@ -0,0 +1,974 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h> // SSSE3
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "av1/common/resize.h"
+
+// Loads 32 consecutive bytes from src, ANDs both 16-byte halves with *mask
+// and packs the resulting 16-bit lanes (unsigned saturation) into 16 bytes.
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i *const in = (const __m128i *)src;
+  const __m128i lo = _mm_and_si128(_mm_loadu_si128(in), *mask);
+  const __m128i hi = _mm_and_si128(_mm_loadu_si128(in + 1), *mask);
+  return _mm_packus_epi16(lo, hi);
+}
+// Builds five registers of byte-pair coefficients from the 8-tap filter (low bytes only): f[0] = {0, c0}, f[1] = {c1, c2}, f[2] = {c3, c4}, f[3] = {c5, c6}, f[4] = {c7, 0}, each pair replicated across all 16-bit lanes.
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);  // aligned load: filter must be 16-byte aligned
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+ // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));  // bytes 7, 0: high byte of filter[3] (zero), low byte of filter[0]
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));  // bytes 2, 4: low bytes of filter[1], filter[2]
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));  // bytes 6, 8: low bytes of filter[3], filter[4]
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));  // bytes 10, 12: low bytes of filter[5], filter[6]
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));  // bytes 14, 7: low byte of filter[7], high byte of filter[3] (zero)
+}
+// 8-tap filter for 8 outputs: sums the pairwise products of s[0..3] with f[0..3] plus a 64 * s[1] correction using saturating 16-bit adds, then returns (sum + 64) >> 7 per lane.
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);  // s bytes are treated as unsigned, f bytes as signed
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ // compensate the subtracted 64 in f[1]. x4 is always non negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);  // saturating adds are order-sensitive: keep this summation order
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+// Odd-offset 8-tap filter for 8 outputs: sums the pairwise products of s[0..4] with f[0..4] plus a 64 * s[2] correction using saturating 16-bit adds, then returns (sum + 64) >> 7 per lane.
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);  // s bytes are treated as unsigned, f bytes as signed
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+ // compensate the subtracted 64 in f[2]. x5 is always non negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);  // saturating adds are order-sensitive: keep this summation order
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+// 2:1 phase-0 downscale of a plane: keeps the even-indexed byte of every
+// source byte pair (mask 0x00FF per 16-bit lane) on every second source row.
+// Works in 16-pixel output chunks, so each output row is written up to dst_w
+// rounded up to a multiple of 16, and twice that many source bytes are read.
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int padded_w = (dst_w + 15) & ~15;
+  // Pointer adjustments applied after a full row of chunks has been emitted.
+  const ptrdiff_t src_row_skip = 2 * (src_stride - padded_w);
+  const ptrdiff_t dst_row_skip = dst_stride - padded_w;
+  const __m128i even_bytes = _mm_set1_epi16(0x00FF);
+  int rows_left = dst_h;
+
+  do {
+    int cols_left = padded_w;
+    do {
+      _mm_storeu_si128((__m128i *)dst,
+                       scale_plane_2_to_1_phase_0_kernel(src, &even_bytes));
+      src += 32;
+      dst += 16;
+      cols_left -= 16;
+    } while (cols_left != 0);
+    src += src_row_skip;
+    dst += dst_row_skip;
+    rows_left--;
+  } while (rows_left != 0);
+}
+
+// 4:1 phase-0 downscale of a plane: keeps the first byte of every group of
+// four source bytes (mask 0x000000FF per 32-bit lane) on every fourth source
+// row. Works in 16-pixel output chunks, so each output row is written up to
+// dst_w rounded up to a multiple of 16, and four times as many source bytes
+// are read.
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int padded_w = (dst_w + 15) & ~15;
+  // Pointer adjustments applied after a full row of chunks has been emitted.
+  const ptrdiff_t src_row_skip = 4 * (src_stride - padded_w);
+  const ptrdiff_t dst_row_skip = dst_stride - padded_w;
+  const __m128i first_of_four = _mm_set1_epi32(0x000000FF);
+  int rows_left = dst_h;
+
+  do {
+    int cols_left = padded_w;
+    do {
+      // Two kernel calls reduce 64 source bytes to 2 x 16 bytes; the final
+      // pack narrows the surviving 16-bit lanes to the 16 output bytes.
+      const __m128i left =
+          scale_plane_2_to_1_phase_0_kernel(&src[0], &first_of_four);
+      const __m128i right =
+          scale_plane_2_to_1_phase_0_kernel(&src[32], &first_of_four);
+      _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(left, right));
+      src += 64;
+      dst += 16;
+      cols_left -= 16;
+    } while (cols_left != 0);
+    src += src_row_skip;
+    dst += dst_row_skip;
+    rows_left--;
+  } while (rows_left != 0);
+}
+
+// Two-tap filter step: multiplies each adjacent byte pair of s[0] and s[1]
+// by the coefficient pair replicated in c0c1, adds the two products, rounds
+// with +64, shifts right by 7 and packs both results into 16 unsigned bytes.
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i rounding = _mm_set1_epi16(1 << 6);
+  const __m128i sum_lo = _mm_maddubs_epi16(s[0], c0c1);
+  const __m128i sum_hi = _mm_maddubs_epi16(s[1], c0c1);
+  // round and shift each 16-bit lane by 7 bits
+  const __m128i out_lo = _mm_srai_epi16(_mm_adds_epi16(sum_lo, rounding), 7);
+  const __m128i out_hi = _mm_srai_epi16(_mm_adds_epi16(sum_hi, rounding), 7);
+  return _mm_packus_epi16(out_lo, out_hi);
+}
+// 2:1 bilinear downscale: each step filters 32 source pixels from two adjacent rows horizontally with scale_plane_bilinear_kernel, then filters the two results vertically, producing 16 output pixels.
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to a multiple of 16: whole 16-pixel chunks are always written
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[2], d[2];
+
+ // Horizontal
+ // Even rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // odd rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // Vertical
+ s[0] = _mm_unpacklo_epi8(d[0], d[1]);  // interleave the two filtered rows so vertical neighbours form byte pairs
+ s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);  // src already advanced 2 * max_width, so the net step is two source rows
+ dst += dst_stride - max_width;  // net step is one destination row
+ } while (--y);
+}
+// 4:1 bilinear downscale: each step reads 64 bytes from two adjacent source rows, gathers the byte pairs shown in the lane diagrams below, and applies scale_plane_bilinear_kernel horizontally then vertically to produce 16 output pixels.
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;  // dst_w rounded up to a multiple of 16: whole 16-pixel chunks are always written
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[8], d[8];
+
+ // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+ // Here we tried to not use shuffle instructions which would be slow
+ // on some x86 CPUs.
+
+ // Horizontal
+ // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx
+ // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx
+ // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx
+ // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx
+ // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx
+ // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx
+ // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx
+ // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx
+ s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));  // s[0..3]: 64 bytes of the first row
+ s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+ s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+ s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+ s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));  // s[4..7]: 64 bytes of the next row
+ s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+ s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+ // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx
+ // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx
+ // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx
+ // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx
+ // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx
+ // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx
+ // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx
+ // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx
+ d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+ d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+ d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+ d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+ d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+ d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+ d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+ d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+ // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx
+ // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx
+ // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx
+ // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx
+ // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx
+ // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx
+ // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx
+ // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx
+ s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+ s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+ s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+ s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+ s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+ s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+ // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D
+ // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D
+ // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D
+ // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D
+ d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+ d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+ d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+ d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);  // horizontal pass over d[0], d[1]
+ d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);  // horizontal pass over d[2], d[3]
+
+ // Vertical
+ d[0] = scale_plane_bilinear_kernel(d, c0c1);  // the two rows are already interleaved pairwise, so the same kernel combines them
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);  // src already advanced 4 * max_width, so the net step is four source rows
+ dst += dst_stride - max_width;  // net step is one destination row
+ } while (--y);
+}
+
+// Downscales one 8-bit plane by 4 in each direction using the 8-tap filter in
+// coef (prepared by shuffle_filter_ssse3() and applied by convolve8_8_ssse3()).
+//
+// w and h are the destination dimensions and must be non-zero. The work is
+// done in two passes:
+//   1. Horizontal: 8 source rows at a time are filtered into temp_buffer, with
+//      two rows interleaved per 2 * width_hor bytes.
+//   2. Vertical: temp_buffer is filtered in 8-column strips, producing two dst
+//      rows per iteration.
+// Both passes iterate over rounded-up dimensions (see the constants below), so
+// src reads, temp_buffer accesses and dst writes can extend past w x h.
+// NOTE(review): presumably the callers provide enough border/padding for
+// that; the caller in this file adds 16 bytes of padding to temp_buffer.
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ // w rounded up to a multiple of 2 (the horizontal loop emits 2 columns per
+ // iteration) and to a multiple of 8 (the vertical loop consumes 8 columns).
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ // Source rows needed by the vertical filter, rounded up to a multiple of 8,
+ // and h rounded up to a multiple of 2 (2 dst rows per vertical iteration).
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ // Step back (SUBPEL_TAPS / 2 - 1) rows and (SUBPEL_TAPS / 2 + 3) columns so
+ // that the loads below start at the first filter tap.
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ do {
+ load_8bit_8x8(src + 4, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped)
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[2]);
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ transpose_16bit_4x8(&s[2], &s[2]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71
+
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ d[0] = _mm_packus_epi16(d[0], d[0]);
+ d[1] = _mm_packus_epi16(d[1], d[1]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+ store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+ // Carry the last two registers over as the first two of the next
+ // iteration (the window slides by 4 register pairs = 8 source pixels).
+ s[0] = s[4];
+ s[1] = s[5];
+
+ t += 4;
+ x -= 2;
+ } while (x);
+ // The inner loop advanced src by 4 * width_hor and t by 2 * width_hor; move
+ // src down 8 rows and t forward to a total of 8 * width_hor per 8 rows.
+ src += 8 * src_stride - 4 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ // t advanced by width_hor * (4 * height_ver + 4) in this strip; rewind to
+ // the top and step 16 bytes (8 columns x 2 interleaved rows) to the right.
+ t -= width_hor * (4 * height_ver + 4);
+ t += 16;
+ // Likewise rewind dst to the top row and move 8 columns right.
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Downscales one 8-bit plane by 2 in each direction using the 8-tap filter in
+// coef (prepared by shuffle_filter_ssse3() and applied by convolve8_8_ssse3()).
+//
+// w and h are the destination dimensions and must be non-zero. The work is
+// done in two passes:
+//   1. Horizontal: 8 source rows at a time are filtered into temp_buffer, with
+//      two rows interleaved per 2 * width_hor bytes.
+//   2. Vertical: temp_buffer is filtered in 8-column strips, producing four
+//      dst rows per iteration.
+// Both passes iterate over rounded-up dimensions (see the constants below), so
+// src reads, temp_buffer accesses and dst writes can extend past w x h.
+// NOTE(review): presumably the callers provide enough border/padding for
+// that - confirm against the temp_buffer allocation in the caller.
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ // w rounded up to a multiple of 4 (the horizontal loop emits 4 columns per
+ // iteration) and to a multiple of 8 (the vertical loop consumes 8 columns).
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ // Source rows needed by the vertical filter, rounded up to a multiple of 8,
+ // and h rounded up to a multiple of 4 (4 dst rows per vertical iteration).
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ // Step back (SUBPEL_TAPS / 2 - 1) rows and (SUBPEL_TAPS / 2 + 1) columns so
+ // that the loads below start at the first filter tap.
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ do {
+ load_8bit_8x8(src + 2, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[3]);
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ transpose_16bit_4x8(&s[3], &s[3]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71
+ d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72
+ d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ d[0] = _mm_packus_epi16(d[0], d[2]);
+ d[1] = _mm_packus_epi16(d[1], d[3]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+ d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+ d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+ // Carry the last three registers over as the first three of the next
+ // iteration (the window slides by 4 register pairs = 8 source pixels).
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ t += 8;
+ x -= 4;
+ } while (x);
+ // The inner loop advanced src and t by 2 * width_hor each; move src down 8
+ // rows and t forward to a total of 8 * width_hor per 8 rows.
+ src += 8 * src_stride - 2 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17
+ d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27
+ d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[1] = _mm_packus_epi16(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ // t advanced by width_hor * (2 * height_ver + 6) in this strip; rewind to
+ // the top and step 16 bytes (8 columns x 2 interleaved rows) to the right.
+ t -= width_hor * (2 * height_ver + 6);
+ t += 16;
+ // Likewise rewind dst to the top row and move 8 columns right.
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Signature shared by shuffle_filter_ssse3() and shuffle_filter_odd_ssse3():
+// takes the 16-bit filter taps and fills the register array f. Used below to
+// select the variant by index.
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+ __m128i *const f);
+
+// Signature shared by convolve8_8_even_offset_ssse3() and
+// convolve8_8_odd_offset_ssse3(): applies the shuffled filter f to the source
+// registers s and returns the result. Used below to select the variant by
+// index.
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+ const __m128i *const f);
+
+// Downscales one 8-bit plane from 4 source pixels to 3 destination pixels in
+// each direction.
+//
+// Every group of 3 output samples uses 3 filter phases: phase,
+// phase + step_q4 and phase + 2 * step_q4 (step_q4 = 16 * 4 / 3, in 1/16
+// pixel units). coef is the whole kernel table, indexed by the low
+// SUBPEL_MASK bits of each offset. w and h are the destination dimensions and
+// must be non-zero.
+//
+// The work is done in two passes:
+//   1. Horizontal: 8 source rows at a time are filtered into temp_buffer, 6
+//      output columns per iteration, two rows interleaved per stride_hor bytes.
+//   2. Vertical: temp_buffer is filtered in 8-column strips, producing 6 dst
+//      rows per iteration.
+// Both passes iterate over rounded-up dimensions (see the constants below), so
+// src reads, temp_buffer accesses and dst writes can extend past w x h.
+// NOTE(review): presumably the callers provide enough border/padding for
+// that - confirm against the temp_buffer allocation in the caller.
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ // w rounded up to a multiple of 6 (6 columns per horizontal iteration).
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // h rounded up to a multiple of 6 (6 dst rows per vertical iteration).
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[12], d[6], dd[4];
+ __m128i f0[4], f1[5], f2[5];
+ // The offset of the first row is always less than 1 pixel.
+ const int offset1_q4 = phase + 1 * step_q4;
+ const int offset2_q4 = phase + 2 * step_q4;
+ // offset_idxx indicates the pixel offset is even (0) or odd (1).
+ // It's used to choose the src offset and filter coefficient offset.
+ const int offset_idx1 = (offset1_q4 >> 4) & 1;
+ const int offset_idx2 = (offset2_q4 >> 4) & 1;
+ static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
+ shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+ };
+ static const convolve8_funcs convolve8_func_list[2] = {
+ convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+ };
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
+ shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+ // Sub 64 to avoid overflow.
+ // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+ // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+ // When filter phase idx is 1, the two biggest coefficients are shuffled
+ // together, and the sum of them are always no less than 128. Sub 64 here.
+ // After the subtraction, when the sum of all positive coefficients are no
+ // larger than 128, and the sum of all negative coefficients are no
+ // less than -128, there will be no overflow in the convolve8 functions.
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+ f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+ f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+ // Step back (SUBPEL_TAPS / 2 - 1) rows and columns so that the loads below
+ // start at the first filter tap.
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+ // horizontal 6x8
+ do {
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[4]);
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
+ transpose_16bit_4x8(&s[4], &s[4]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // Each s[] register holds a pair of source columns (see the layouts
+ // above), so offsetN_q4 >> 5 is the whole-pixel offset divided by 2, i.e.
+ // the register to start from; the odd/even remainder selects the
+ // convolve variant. Outputs 3..5 repeat the pattern two registers (4
+ // source pixels) further on.
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ dd[0] = _mm_packus_epi16(d[0], d[2]);
+ dd[1] = _mm_packus_epi16(d[1], d[3]);
+ dd[2] = _mm_packus_epi16(d[4], d[4]);
+ dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
+ d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+ d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+ d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
+ // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
+ dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+ dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+ // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+ // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
+ // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
+ d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+ d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+ d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+ d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+ // store 4 extra pixels
+ storeu_8bit_16x4(d, t, stride_hor);
+
+ // Slide the window: the four registers just loaded become the first four
+ // of the next iteration.
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ t += 12;
+ x -= 6;
+ } while (x);
+ // The inner loop advanced src by 4 * width_hor / 3 and t by 2 * width_hor;
+ // move src down 8 rows and t forward to a total of 4 * stride_hor.
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 3 * stride_hor + 4;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ loadu_8bit_16x4(t, stride_hor, s);
+ y = height_ver;
+
+ do {
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
+ t += 4 * stride_hor;
+ loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+ // Same phase/register selection as in the horizontal pass, applied to
+ // interleaved row pairs instead of column pairs.
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[2] = _mm_packus_epi16(d[2], d[3]);
+ d[4] = _mm_packus_epi16(d[4], d[5]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+ _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+ _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ // t advanced by 4 * stride_hor per iteration, height_ver / 6 iterations;
+ // rewind to the top and step 16 bytes (8 columns x 2 interleaved rows)
+ // to the right.
+ t -= stride_hor * 2 * height_ver / 3;
+ t += 16;
+ // Likewise rewind dst to the top row and move 8 columns right.
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Applies the shuffled 8-tap filter f to eight input registers s[0..7], one
+// register per tap. Consecutive taps are byte-interleaved (low halves only)
+// before being handed to convolve8_8_ssse3(); the 16-bit result is packed to
+// unsigned 8-bit with saturation and duplicated in both halves of the return
+// value.
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+                                                  const __m128i *const f) {
+  __m128i pairs[4];
+
+  for (int i = 0; i < 4; ++i) {
+    pairs[i] = _mm_unpacklo_epi8(s[2 * i], s[2 * i + 1]);
+  }
+
+  const __m128i sum = convolve8_8_ssse3(pairs, f);
+  return _mm_packus_epi16(sum, sum);
+}
+
+// Horizontal half of the 2x upscaler for one row. Only the odd output columns
+// are computed here because the even ones are plain copies of the source
+// pixels. Processes 8 outputs per iteration; w is consumed in steps of 8 and
+// the loop always runs at least once.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+                                     const int w, const __m128i *const f) {
+  int remaining = w;
+
+  do {
+    __m128i taps[8];
+
+    // taps[k] holds the 8 source bytes starting at column k.
+    for (int k = 0; k < 8; ++k) {
+      taps[k] = _mm_loadl_epi64((const __m128i *)(src + k));
+    }
+
+    _mm_storel_epi64((__m128i *)dst, scale_1_to_2_phase_0_kernel(taps, f));
+
+    src += 8;
+    dst += 8;
+    remaining -= 8;
+  } while (remaining);
+}
+
+// Upscales one 8-bit plane by 2 in each direction with the 8-tap filter coef.
+//
+// For every source row two destination rows are written:
+//   - even row: source pixels interleaved with their horizontally filtered
+//     neighbours;
+//   - odd row: vertically filtered source pixels interleaved with vertically
+//     filtered horizontal results.
+// temp_buffer holds a rolling window of 8 horizontally filtered rows, each
+// max_width bytes (src_w rounded up to a multiple of 8).
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int src_w, const int src_h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int max_width = (src_w + 7) & ~7;
+  uint8_t *tmp[9];  // tmp[8] is scratch space for rotating the window.
+  __m128i f[4];
+  int rows_left = src_h;
+
+  for (int i = 0; i < 8; ++i) {
+    tmp[i] = temp_buffer + i * max_width;
+  }
+
+  shuffle_filter_ssse3(coef, f);
+
+  // Prime the window with the rows -3 .. +3 around the first source row.
+  for (int r = -3; r <= 3; ++r) {
+    scale_1_to_2_phase_0_row(src + r * src_stride - 3, tmp[r + 3], max_width,
+                             f);
+  }
+
+  do {
+    // Horizontally filter the newest row (+4) into the last window slot.
+    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+
+    for (int col = 0; col < max_width; col += 8) {
+      __m128i taps[8];
+
+      // Even output row: source pixel, horizontally filtered pixel, ...
+      const __m128i src_px = _mm_loadl_epi64((const __m128i *)(src + col));
+      const __m128i hor_px =
+          _mm_loadl_epi64((const __m128i *)(tmp[3] + col));
+      _mm_storeu_si128((__m128i *)(dst + 2 * col),
+                       _mm_unpacklo_epi8(src_px, hor_px));
+
+      // Odd output row, even columns: vertical filter over source pixels.
+      load_8bit_8x8(src + col - 3 * src_stride, src_stride, taps);
+      const __m128i ver_px = scale_1_to_2_phase_0_kernel(taps, f);
+
+      // Odd output row, odd columns: vertical filter over the window.
+      for (int k = 0; k < 8; ++k) {
+        taps[k] = _mm_loadl_epi64((const __m128i *)(tmp[k] + col));
+      }
+      const __m128i diag_px = scale_1_to_2_phase_0_kernel(taps, f);
+
+      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * col),
+                       _mm_unpacklo_epi8(ver_px, diag_px));
+    }
+
+    src += src_stride;
+    dst += 2 * dst_stride;
+
+    // Rotate the window up by one row; the oldest buffer is recycled as the
+    // slot for the next new row.
+    tmp[8] = tmp[0];
+    for (int k = 0; k < 8; ++k) {
+      tmp[k] = tmp[k + 1];
+    }
+  } while (--rows_left);
+}
+
+// Reports whether the SSSE3 code has a dedicated kernel for scaling
+// src_width x src_height to dst_width x dst_height. Supported ratios are
+// exact 1/2, 1/4 and 3/4 downscaling and exact 2x upscaling, with the same
+// ratio in both dimensions.
+static INLINE bool has_normative_scaler_ssse3(const int src_width,
+                                              const int src_height,
+                                              const int dst_width,
+                                              const int dst_height) {
+  // 2 to 1
+  if (2 * dst_width == src_width && 2 * dst_height == src_height) return true;
+  // 4 to 1
+  if (4 * dst_width == src_width && 4 * dst_height == src_height) return true;
+  // 4 to 3
+  if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
+    return true;
+  }
+  // 1 to 2
+  return dst_width == src_width * 2 && dst_height == src_height * 2;
+}
+
+// SSSE3 entry point for resizing src into dst.
+//
+// If the luma ratio (and, when num_planes > 1, the chroma ratio) is not one of
+// those accepted by has_normative_scaler_ssse3(), the whole job is handed to
+// av1_resize_and_extend_frame_c(). Otherwise each plane is scaled with the
+// kernel matching its ratio, filter and phase, and the borders of dst are then
+// extended with aom_extend_frame_borders().
+//
+// Several kernels need a heap-allocated scratch buffer. If any allocation
+// fails, processing stops and the frame is redone from scratch by
+// av1_resize_and_extend_frame_c().
+void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ const InterpFilter filter,
+ const int phase, const int num_planes) {
+ bool has_normative_scaler =
+ has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height,
+ dst->y_crop_width, dst->y_crop_height);
+
+ if (num_planes > 1) {
+ has_normative_scaler =
+ has_normative_scaler &&
+ has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height,
+ dst->uv_crop_width, dst->uv_crop_height);
+ }
+
+ if (!has_normative_scaler) {
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ return;
+ }
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ int malloc_failed = 0;
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+ // Index 0 of the per-plane arrays is luma, index 1 is shared by both
+ // chroma planes.
+ const int is_uv = i > 0;
+ const int src_w = src->crop_widths[is_uv];
+ const int src_h = src->crop_heights[is_uv];
+ // Luma dimensions rounded up to even. Scratch buffers below are sized from
+ // these for every plane, including chroma.
+ const int src_y_w = (src->crop_widths[0] + 1) & ~1;
+ const int dst_w = dst->crop_widths[is_uv];
+ const int dst_h = dst->crop_heights[is_uv];
+ const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
+ const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
+
+ if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+ // 2 to 1
+ if (phase == 0) {
+ scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h);
+ } else if (filter == BILINEAR) {
+ // Pack the two taps at indices 3 and 4 of the bilinear kernel into one
+ // 16-bit lane: c0 in the low byte, c1 in the high byte.
+ const int16_t c0 = av1_bilinear_filters[phase][3];
+ const int16_t c1 = av1_bilinear_filters[phase][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, c0c1);
+ } else {
+ // Mirrors the width_hor / height_hor rounding used inside
+ // scale_plane_2_to_1_general().
+ const int buffer_stride = (dst_y_w + 3) & ~3;
+ const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ if (phase == 0) {
+ scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h);
+ } else if (filter == BILINEAR) {
+ // Same tap packing as in the 2 to 1 bilinear case.
+ const int16_t c0 = av1_bilinear_filters[phase][3];
+ const int16_t c1 = av1_bilinear_filters[phase][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, c0c1);
+ } else {
+ // Mirrors the width_hor / height_hor rounding used inside
+ // scale_plane_4_to_1_general().
+ const int buffer_stride = (dst_y_w + 1) & ~1;
+ const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ // When dst_w is 1 or 2, we need extra padding to avoid heap read
+ // overflow
+ const int extra_padding = 16;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel[phase], temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ // Width rounded up to a multiple of 6 plus 2; doubled inside the kernel
+ // this gives its stride_hor (2 * width_hor + 4).
+ const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
+ const int buffer_stride_ver = (dst_y_w + 7) & ~7;
+ const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // When the vertical filter reads more pixels than the horizontal filter
+ // generated in each row, we need extra padding to avoid heap read
+ // overflow. For example, the horizontal filter generates 18 pixels but
+ // the vertical filter reads 24 pixels in a row. The difference is
+ // multiplied by 2 since two rows are interlaced together in the
+ // optimization.
+ const int extra_padding =
+ (buffer_stride_ver > buffer_stride_hor)
+ ? 2 * (buffer_stride_ver - buffer_stride_hor)
+ : 0;
+ const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+ uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ // Unlike the other kernels, this one receives the whole kernel table and
+ // the phase, because it uses three different phases per output group.
+ scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], dst_w,
+ dst_h, interp_kernel, phase, temp_buffer);
+ free(temp_buffer);
+ } else {
+ assert(dst_w == src_w * 2 && dst_h == src_h * 2);
+ // 1 to 2
+ // Eight rows of the rounded-up luma width: the rolling window used by
+ // scale_plane_1_to_2_phase_0().
+ uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
+ if (!temp_buffer) {
+ malloc_failed = 1;
+ break;
+ }
+ const InterpKernel *interp_kernel =
+ (const InterpKernel *)av1_interp_filter_params_list[filter]
+ .filter_ptr;
+ // The phase argument is ignored here; kernel index 8 is always used.
+ // NOTE(review): presumably index 8 is the half-sample phase - confirm.
+ scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], src_w,
+ src_h, interp_kernel[8], temp_buffer);
+ free(temp_buffer);
+ }
+ }
+
+ if (malloc_failed) {
+ // Redo the whole frame with the C implementation, including any planes
+ // already written above.
+ av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
+ } else {
+ aom_extend_frame_borders(dst, num_planes);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 0000000000..5ab6c46f8a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Reads 8 bytes from p (which need not be aligned) and widens each one to an
+// unsigned 32-bit lane of the returned AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+  const __m128i bytes = xx_loadl_64(p);
+  return _mm256_cvtepu8_epi32(bytes);
+}
+
+// Reads 8 16-bit values from p (which need not be aligned) and widens each one
+// to an unsigned 32-bit lane of the returned AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+  const __m128i halfwords = xx_loadu_128(p);
+  return _mm256_cvtepu16_epi32(halfwords);
+}
+
+// Inclusive prefix sum ("scan") over the eight 32-bit lanes of x: lane i of
+// the result is x0 + x1 + ... + xi. The lanes are assumed small enough that
+// none of the sums overflow.
+//
+// Writing each 128-bit half as [...] with the highest lane first, and h->a as
+// shorthand for h + g + f + e + d + c + b + a:
+//
+//   x         = [h     g     f     e   ][d     c     b     a]
+//   x << 1    = [g     f     e     0   ][c     b     a     0]
+//   pair_sum  = [g+h   f+g   e+f   e   ][c+d   b+c   a+b   a]
+//   ... << 2  = [e+f   e     0     0   ][a+b   a     0     0]
+//   half_scan = [e->h  e->g  e->f  e   ][a->d  a->c  a->b  a]
+//   low_total = a->d
+//   carry     = [a->d  a->d  a->d  a->d][0     0     0     0]
+//   result    = [a->h  a->g  a->f  a->e][a->d  a->c  a->b  a]
+static __m256i scan_32(__m256i x) {
+  // Scan each 128-bit half independently: add the vector shifted up by one
+  // element, then add that result shifted up by two elements.
+  const __m256i pair_sum = _mm256_add_epi32(x, _mm256_slli_si256(x, 4));
+  const __m256i half_scan =
+      _mm256_add_epi32(pair_sum, _mm256_slli_si256(pair_sum, 8));
+
+  // Add the total of the low half (lane 3) to every lane of the high half.
+  const int32_t low_total = _mm256_extract_epi32(half_scan, 3);
+  const __m256i carry = _mm256_insertf128_si256(_mm256_setzero_si256(),
+                                                _mm_set1_epi32(low_total), 1);
+  return _mm256_add_epi32(half_scan, carry);
+}
+
+// integral_images() (defined below, after the memset_zero_avx() helper)
+// computes two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so they have width and height
+// equal to width + 1, height + 1, and the first row and column are zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
+
+// Fills the first count int32_t elements of dest with the contents of *zero
+// (the callers pass an all-zero vector) and returns dest.
+//
+// The bulk is written with unaligned 256-bit stores, 32 elements and then 8
+// elements at a time; the remaining (count % 8) elements are set to 0 with
+// scalar stores.
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
+  // Keep the bounds and the index in size_t so that a count above UINT_MAX is
+  // neither truncated by a 32-bit mask nor makes the index wrap around.
+  const size_t count32 = count & ~(size_t)31;
+  const size_t count8 = count & ~(size_t)7;
+  size_t i = 0;
+
+  for (; i < count32; i += 32) {
+    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+  }
+  for (; i < count8; i += 8) {
+    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+  }
+  for (; i < count; i++) {
+    dest[i] = 0;
+  }
+  return dest;
+}
+
+// Builds two integral images of the 8-bit width x height block at src:
+// B accumulates the pixel values and A accumulates their squares. Output row
+// i + 1 / column j + 1 corresponds to source row i / column j, so row 0 and
+// column 0 of A and B are written as zeros. Columns are processed 8 at a time,
+// so when width is not a multiple of 8 the last iteration reads and writes
+// past width.
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ // Output column index: shifted right by one for the zero left column.
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ // x1 = 8 pixels widened to 32 bits; x2 = their squares (the upper 16 bits
+ // of each lane are zero, so the 16-bit madd yields x1 * x1 per lane).
+ const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ // Running sums of this row's 8 values and squares.
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ // output = row prefix sum + value above + (left - above-left).
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+//
+// This is the 16-bit-sample counterpart of integral_images(): the loop body is
+// the same except that the source is loaded with yy256_load_extend_16_32().
+// Output row i + 1 / column j + 1 corresponds to source row i / column j, and
+// row 0 and column 0 of A and B are written as zeros.
+// NOTE(review): the loads and stores below address A + 1 + j and B + 1 + j,
+// as in integral_images(), whose comment asks for A+1 and B+1 to be aligned.
+// Confirm which alignment requirement is the intended one.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ // Output column index: shifted right by one for the zero left column.
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ // x1 = 8 samples widened to 32 bits; x2 = their squares via 16-bit madd.
+ // NOTE(review): madd treats the low 16 bits as signed, so this assumes
+ // samples below 2^15 - confirm against the supported bit depths.
+ const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ // Running sums of this row's 8 values and squares.
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ // output = row prefix sum + value above + (left - above-left).
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Returns 8 adjacent box sums read from the integral image ii, where ii
+// points at the centre of the first box, stride is the row pitch of the
+// integral image in elements and r is the box radius. Each sum is formed from
+// the four corner samples as (br - bl) - (tr - tl).
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+  const int left = r + 1;   // columns to step back for the left edge
+  const int right = r + 0;  // columns to step forward for the right edge
+  const int up = (r + 1) * stride;
+  const int down = r * stride;
+
+  const __m256i tl = yy_loadu_256(ii - left - up);
+  const __m256i tr = yy_loadu_256(ii + right - up);
+  const __m256i bl = yy_loadu_256(ii - left + down);
+  const __m256i br = yy_loadu_256(ii + right + down);
+
+  const __m256i top = _mm256_sub_epi32(tr, tl);
+  const __m256i bottom = _mm256_sub_epi32(br, bl);
+  return _mm256_sub_epi32(bottom, top);
+}
+
+static __m256i round_for_shift(unsigned shift) {  // (2^shift) / 2 in all lanes
+  return _mm256_set1_epi32((1 << shift) / 2);
+}
+
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
+  // p = n * sum2 - sum1^2 per lane (sum1: box sum, sum2: box sum of squares).
+  const __m256i count = _mm256_set1_epi32(n);
+  if (bit_depth <= 8) {
+    const __m256i sq = _mm256_madd_epi16(sum1, sum1);
+    return _mm256_sub_epi32(_mm256_mullo_epi32(sum2, count), sq);
+  }
+  // High bit depth: rescale both sums to an 8-bit sample range first.
+  const int extra_bits = bit_depth - 8;
+  const __m256i scaled_sq_sum = _mm256_srl_epi32(
+      _mm256_add_epi32(sum2, round_for_shift(2 * extra_bits)),
+      _mm_cvtsi32_si128(2 * extra_bits));
+  const __m256i scaled_sum =
+      _mm256_srl_epi32(_mm256_add_epi32(sum1, round_for_shift(extra_bits)),
+                       _mm_cvtsi32_si128(extra_bits));
+  // scaled_sum < 2^14, so a 16-bit madd can square it instead of a mullo.
+  const __m256i sq = _mm256_madd_epi16(scaled_sum, scaled_sum);
+  // Raise n * a to at least b^2 so that the difference is never negative.
+  return _mm256_sub_epi32(
+      _mm256_max_epi32(_mm256_mullo_epi32(scaled_sq_sum, count), sq), sq);
+}
+
+// Fill A and B over rows -1..height and columns -1..width (in vectors of 8)
+// from the integral images C (squares) and D (sums) of the source, which has
+// been padded by SGRPROJ_BORDER_VERT/HORZ. A, B, C, D point at position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // av1_one_by_x[n - 1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[k] keeps the k lowest 32-bit lanes and clears the rest.
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper lanes. We use a mask to
+ // ensure that those lanes are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);  // lanes still inside the column range
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+ // z = min(255, (p * s + rounding) >> SGRPROJ_MTABLE_BITS): the table index.
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement is less than 256
+ // and one_over_n fits in an int16, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Weighted 3x3 sum for 8 consecutive positions, the first centred on buf.
+// The four corner taps carry weight 3; the centre and the four edge taps
+// carry weight 4.
+//
+// Tap names:
+//   xtl xt xtr
+//   xl  x  xr
+//   xbl xb xbr
+//
+// With
+//   corners = xtl + xtr + xbl + xbr
+//   all     = corners + x + xt + xb + xl + xr
+// the result is
+//   4 * (all - corners) + 3 * corners = (all << 2) - corners
+// so a single shift and subtract replace the two multiplications.
+// 32-bit lane arithmetic wraps, so the summation order does not matter.
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+  const int32_t *const above = buf - stride;
+  const int32_t *const below = buf + stride;
+
+  // Corner taps (weight 3).
+  __m256i corners =
+      _mm256_add_epi32(yy_loadu_256(above - 1), yy_loadu_256(above + 1));
+  corners = _mm256_add_epi32(corners, yy_loadu_256(below - 1));
+  corners = _mm256_add_epi32(corners, yy_loadu_256(below + 1));
+
+  // Centre plus edge taps (weight 4), accumulated on top of the corners.
+  __m256i all = _mm256_add_epi32(corners, yy_loadu_256(buf));
+  all = _mm256_add_epi32(all, yy_loadu_256(above));
+  all = _mm256_add_epi32(all, yy_loadu_256(below));
+  all = _mm256_add_epi32(all, yy_loadu_256(buf - 1));
+  all = _mm256_add_epi32(all, yy_loadu_256(buf + 1));
+
+  return _mm256_sub_epi32(_mm256_slli_epi32(all, 2), corners);
+}
+
+// Last stage of the self-guided filter: dst = (a * src + b + rounding) >> shift
+// where a and b are the cross_sum() weighted neighbourhoods of A and B.
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+                         const int32_t *B, int buf_stride, const void *dgd8,
+                         int dgd_stride, int width, int height, int highbd) {
+  const int nb = 5;
+  const int shift = SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS;
+  const __m256i half = round_for_shift(shift);
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int row = 0; row < height; ++row) {
+    const int32_t *const a_row = A + row * buf_stride;
+    const int32_t *const b_row = B + row * buf_stride;
+    int32_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < width; col += 8) {
+      // Eight source samples, zero-extended to one per 32-bit lane.
+      const __m128i packed =
+          xx_loadu_128(dgd_real + ((row * dgd_stride + col) << highbd));
+      const __m256i pix = highbd ? _mm256_cvtepu16_epi32(packed)
+                                 : _mm256_cvtepu8_epi32(packed);
+      const __m256i wa = cross_sum(a_row + col, buf_stride);
+      const __m256i wb = cross_sum(b_row + col, buf_stride);
+      const __m256i acc = _mm256_add_epi32(_mm256_madd_epi16(wa, pix), wb);
+      yy_storeu_256(out_row + col,
+                    _mm256_srai_epi32(_mm256_add_epi32(acc, half), shift));
+    }
+  }
+}
+
+// As calc_ab, but only every second row (i = -1, 1, 3, ...) of A and B is
+// written. C (squares) and D (sums) are integral images of the source padded
+// by SGRPROJ_BORDER_VERT/HORZ. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // av1_one_by_x[n - 1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[k] keeps the k lowest 32-bit lanes and clears the rest.
+ const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {  // odd rows only
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper lanes. We use a mask to
+ // ensure that those lanes are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);  // lanes still inside the column range
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+ // z = min(255, (p * s + rounding) >> SGRPROJ_MTABLE_BITS): the table index.
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement is less than 256
+ // and one_over_n fits in an int16, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf (used on even rows).
+//
+// Pixels are indexed like this (buf points at the centre, which is unused):
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) + sixes
+// = ((fives + sixes) << 2) + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fives =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+ const __m256i sixes = _mm256_add_epi32(xt, xb);
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+ // 4 * (fives + sixes) + (fives + sixes) + sixes, with the 4x done as a shift.
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// One-row "cross sum" for 8 consecutive positions; buf addresses the centre
+// sample x of the first position.
+//
+// Taps used:
+//   xl x xr
+//
+// Tap weights:
+//   5  6  5
+//
+// Derivation:
+//   total  = xl + x + xr
+//   result = 5 * (xl + xr) + 6 * x
+//          = 5 * total + x
+//          = (total << 2) + total + x
+// so no multiplication is needed.
+static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+  const __m256i centre = yy_loadu_256(buf);
+
+  // total = xl + x + xr
+  __m256i total = _mm256_add_epi32(yy_loadu_256(buf - 1), centre);
+  total = _mm256_add_epi32(total, yy_loadu_256(buf + 1));
+
+  // 5 * total, built as (total << 2) + total.
+  const __m256i five_total =
+      _mm256_add_epi32(_mm256_slli_epi32(total, 2), total);
+
+  // One extra centre brings its weight from 5 up to 6.
+  return _mm256_add_epi32(five_total, centre);
+}
+
+// Last stage of the "fast" self-guided filter. Each output is
+//   dst = (a * src + b + rounding) >> shift
+// where a and b are cross sums of A and B around the output position. Even
+// and odd rows use different cross-sum kernels (cross_sum_fast_even_row and
+// cross_sum_fast_odd_row) and correspondingly different shifts.
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
+  const int nb0 = 5;  // used for even rows
+  const int nb1 = 4;  // used for odd rows
+
+  // Right-shift applied to each row type, and the matching rounding offset.
+  const int shift_even = SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS;
+  const int shift_odd = SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS;
+  const __m256i half_even = round_for_shift(shift_even);
+  const __m256i half_odd = round_for_shift(shift_odd);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int row = 0; row < height; ++row) {
+    // Rows alternate between the two kernels.
+    const int odd = row & 1;
+    const int32_t *const a_row = A + row * buf_stride;
+    const int32_t *const b_row = B + row * buf_stride;
+    int32_t *const out_row = dst + row * dst_stride;
+
+    for (int col = 0; col < width; col += 8) {
+      // Neighbourhood-weighted A and B for these eight outputs.
+      __m256i wa, wb;
+      if (odd) {
+        wa = cross_sum_fast_odd_row(a_row + col);
+        wb = cross_sum_fast_odd_row(b_row + col);
+      } else {
+        wa = cross_sum_fast_even_row(a_row + col, buf_stride);
+        wb = cross_sum_fast_even_row(b_row + col, buf_stride);
+      }
+
+      // Eight source samples, zero-extended to one per 32-bit lane.
+      const __m128i packed =
+          xx_loadu_128(dgd_real + ((row * dgd_stride + col) << highbd));
+      const __m256i pix = highbd ? _mm256_cvtepu16_epi32(packed)
+                                 : _mm256_cvtepu8_epi32(packed);
+
+      // a * src + b. The upper half of every src lane is zero, so madd
+      // reduces to a product of the low 16 bits of each lane.
+      const __m256i acc = _mm256_add_epi32(_mm256_madd_epi16(wa, pix), wb);
+
+      // Round, then arithmetic-shift by the amount for this row type.
+      const __m256i out =
+          odd ? _mm256_srai_epi32(_mm256_add_epi32(acc, half_odd), shift_odd)
+              : _mm256_srai_epi32(_mm256_add_epi32(acc, half_even),
+                                  shift_even);
+      yy_storeu_256(out_row + col, out);
+    }
+  }
+}
+
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {  // returns 0 on success, -1 if the scratch allocation fails
+ // The ALIGN_POWER_OF_TWO macro here, together with the +7 offsets below,
+ // ensures that column 1 of Atl, Btl, Ctl and Dtl is 32-byte aligned.
+ const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+ // One allocation holds the four planes A, B, C, D back to back.
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+ if (!buf) return -1;
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 32 bytes for efficiency.
+ int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array.
+ int32_t *Atl = buf + 0 * buf_elts + 7;
+ int32_t *Btl = buf + 1 * buf_elts + 7;
+ int32_t *Ctl = buf + 2 * buf_elts + 7;
+ int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column at the top-left of the integral images (C
+ // and D here), so we move down and right one; A and B share that layout.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r[k] == 0 we skip the corresponding filter. We only allow one
+ // of the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {  // pass 0: "fast" variant, A/B on every other row
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {  // pass 1: full variant; reuses (overwrites) A and B
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;
+}
+
+int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {  // returns 0, or the filter's error code
+ int32_t *flt0 = tmpbuf;  // flt0/flt1: two scratch planes carved out of tmpbuf
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ if (ret != 0) return ret;
+ const sgr_params_type *const params = &av1_sgr_params[eps];
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+ // xq[0] / xq[1] weight the (flt0 - u) and (flt1 - u) corrections below.
+ __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 16 pixels. NOTE(review): runs past `width` unless it is a multiple of 16 -- confirm padding.
+ for (int j = 0; j < width; j += 16) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+ // k indexes flt0/flt1 (row stride = width); m indexes the destination.
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m256i ep_0, ep_1;
+ __m128i src_0, src_1;
+ if (highbd) {
+ src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+ ep_0 = _mm256_cvtepu16_epi32(src_0);
+ ep_1 = _mm256_cvtepu16_epi32(src_1);
+ } else {
+ src_0 = xx_loadu_128(dat8ij);
+ ep_0 = _mm256_cvtepu8_epi32(src_0);
+ ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+ }
+ // u = source << SGRPROJ_RST_BITS; v starts out as u << SGRPROJ_PRJ_BITS.
+ const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+ const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+ __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+ // Add xq0 * (flt0 - u) when the first filter pass was run.
+ if (params->r[0] > 0) {
+ const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+ const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+ }
+ // Likewise xq1 * (flt1 - u) for the second pass.
+ if (params->r[1] > 0) {
+ const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+ const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+ }
+ // Round and shift back down to pixel precision.
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_0 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ // Note that packing works per 128-bit half and so interleaves w_0/w_1;
+ // the 64-bit permute (0xd8) puts the pixels back in order.
+ const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+ const __m256i res = _mm256_min_epi16(tmp2, max);  // NOTE(review): signed min; assumes packed values < 2^15 -- confirm
+ yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ // Note that each pack works per 128-bit half and so scrambles the pixel
+ // order; a 64-bit permute (0xd8) after each pack corrects this.
+ const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i res =
+ _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+ const __m128i res2 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+ xx_storeu_128(dst8 + m, res2);
+ }
+ }
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 0000000000..ac850f5691
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Widen 4 bytes at p (any alignment) to unsigned 32-bit SSE lanes.
+static __m128i xx_load_extend_8_32(const void *p) {
+  const __m128i packed = xx_loadl_32(p);
+  return _mm_cvtepu8_epi32(packed);
+}
+
+// Widen 4 halfwords at p (any alignment) to unsigned 32-bit SSE lanes.
+static __m128i xx_load_extend_16_32(const void *p) {
+  const __m128i packed = xx_loadl_64(p);
+  return _mm_cvtepu16_epi32(packed);
+}
+
+// Inclusive prefix sum over the four 32-bit lanes of x: for input x0..x3 the
+// result is x0, x0+x1, x0+x1+x2, x0+x1+x2+x3.
+static __m128i scan_32(__m128i x) {
+  __m128i acc = _mm_add_epi32(x, _mm_slli_si128(x, 4));  // adjacent-pair sums
+  acc = _mm_add_epi32(acc, _mm_slli_si128(acc, 8));      // fold in lower pair
+  return acc;
+}
+
+// Compute two integral images from 8-bit src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4. Columns are produced four at a time (see the j loop below).
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes. They restart at zero on every row.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+ // Output column ABj lies one to the right of source column j.
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+ // x1: four bytes zero-extended to 32 bits; x2 = madd(x1, x1) = squares.
+ const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+ // sc1/sc2: inclusive prefix sums of the four lanes.
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+ // Row below = scan + row above + carry (ldiff) from the previous vector.
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D: lane 3 of (row - above), broadcast to all lanes.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Compute two integral images from 16-bit src. B sums elements; A sums their
+// squares. The first row and column of both outputs are written as zero.
+// A+1 and B+1 (not A, B) should be 16-byte aligned; buf_stride a multiple of 4.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes. They restart at zero on every row.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+ // Output column ABj lies one to the right of source column j.
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+ // x2 = madd(x1, x1): the squares, assuming samples below 2^15 -- confirm.
+ const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+ // sc1/sc2: inclusive prefix sums of the four lanes.
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+ // Row below = scan + row above + carry (ldiff) from the previous vector.
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D: lane 3 of (row - above), broadcast to all lanes.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Evaluate four horizontally adjacent box sums of radius r from the integral
+// image ii, where ii addresses the centre of the first (leftmost) box.
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+  const int32_t *const top = ii - (r + 1) * stride;  // row just above the box
+  const int32_t *const bottom = ii + r * stride;     // bottom row of the box
+  const __m128i top_span =
+      _mm_sub_epi32(xx_loadu_128(top + r), xx_loadu_128(top - (r + 1)));
+  const __m128i bottom_span =
+      _mm_sub_epi32(xx_loadu_128(bottom + r), xx_loadu_128(bottom - (r + 1)));
+  return _mm_sub_epi32(bottom_span, top_span);
+}
+
+static __m128i round_for_shift(unsigned shift) {  // (2^shift) / 2 in all lanes
+  return _mm_set1_epi32((1 << shift) / 2);
+}
+
+static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+  // p = n * sum2 - sum1^2 per lane (sum1: box sum, sum2: box sum of squares).
+  const __m128i count = _mm_set1_epi32(n);
+  if (bit_depth <= 8) {
+    const __m128i sq = _mm_madd_epi16(sum1, sum1);
+    return _mm_sub_epi32(_mm_mullo_epi32(sum2, count), sq);
+  }
+  const int extra_bits = bit_depth - 8;  // rescale sums to an 8-bit range
+  const __m128i scaled_sq_sum =
+      _mm_srl_epi32(_mm_add_epi32(sum2, round_for_shift(2 * extra_bits)),
+                    _mm_cvtsi32_si128(2 * extra_bits));
+  const __m128i scaled_sum =
+      _mm_srl_epi32(_mm_add_epi32(sum1, round_for_shift(extra_bits)),
+                    _mm_cvtsi32_si128(extra_bits));
+  // scaled_sum < 2^14, so madd can square it; the max() keeps p >= 0.
+  const __m128i sq = _mm_madd_epi16(scaled_sum, scaled_sum);
+  return _mm_sub_epi32(
+      _mm_max_epi32(_mm_mullo_epi32(scaled_sq_sum, count), sq), sq);
+}
+
+// Fill A and B over rows -1..height and columns -1..width (in vectors of 4)
+// from the integral images C (squares) and D (sums) of the source, which has
+// been padded by SGRPROJ_BORDER_VERT/HORZ. A, B, C, D point at position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+                    int width, int height, int buf_stride, int bit_depth,
+                    int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+  // av1_one_by_x[n - 1] is 2^12/n, so easily fits in an int16
+  const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
+
+  const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+  // Set up masks: mask[k] keeps the k lowest 32-bit lanes and clears the rest.
+  // Seed with exactly 32 one-bits; with 64, every mask[k] would be all-ones.
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0, ~0);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
+  for (int i = -1; i < height + 1; ++i) {
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
+
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper lanes. We use a mask to
+      // ensure that those lanes are set to 0.
+      int idx = AOMMIN(4, width + 1 - j);  // lanes inside the column range
+      assert(idx >= 1);
+
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
+
+      const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+      // z = min(255, (p * s + rounding) >> SGRPROJ_MTABLE_BITS): table index.
+      const __m128i z = _mm_min_epi32(
+          _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+                         SGRPROJ_MTABLE_BITS),
+          _mm_set1_epi32(255));
+
+      // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+      // gather using scalar loads.
+      const __m128i a_res =
+          _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+      xx_storeu_128(A + i * buf_stride + j, a_res);
+
+      const __m128i a_complement =
+          _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement is less than 256
+      // and one_over_n fits in an int16, so we can multiply them first.
+      const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+      const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+      const __m128i b_res =
+          _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+      xx_storeu_128(B + i * buf_stride + j, b_res);
+    }
+  }
+}
+
+// Weighted 3x3 sum for 4 consecutive positions, the first centred on buf.
+// The four corner taps carry weight 3; the centre and the four edge taps
+// carry weight 4.
+//
+// Tap names:
+//   xtl xt xtr
+//   xl  x  xr
+//   xbl xb xbr
+//
+// With
+//   corners = xtl + xtr + xbl + xbr
+//   all     = corners + x + xt + xb + xl + xr
+// the result is
+//   4 * (all - corners) + 3 * corners = (all << 2) - corners
+// so a single shift and subtract replace the two multiplications.
+// 32-bit lane arithmetic wraps, so the summation order does not matter.
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+  const int32_t *const above = buf - stride;
+  const int32_t *const below = buf + stride;
+  // Corner taps (weight 3).
+  __m128i corners = xx_loadu_128(above - 1);
+  corners = _mm_add_epi32(corners, xx_loadu_128(above + 1));
+  corners = _mm_add_epi32(corners, xx_loadu_128(below - 1));
+  corners = _mm_add_epi32(corners, xx_loadu_128(below + 1));
+
+  // Centre plus edge taps (weight 4), accumulated on top of the corners.
+  __m128i all = _mm_add_epi32(corners, xx_loadu_128(buf));
+  all = _mm_add_epi32(all, xx_loadu_128(above));
+  all = _mm_add_epi32(all, xx_loadu_128(below));
+  all = _mm_add_epi32(all, xx_loadu_128(buf - 1));
+  all = _mm_add_epi32(all, xx_loadu_128(buf + 1));
+
+  return _mm_sub_epi32(_mm_slli_epi32(all, 2), corners);
+}
+
+// Last stage of the self-guided filter: dst = (a * src + b + rounding) >> shift
+// where a and b are the cross_sum() weighted neighbourhoods of A and B.
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+                         const int32_t *B, int buf_stride, const void *dgd8,
+                         int dgd_stride, int width, int height, int highbd) {
+  const int nb = 5;
+  const int shift = SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS;
+  const __m128i half = round_for_shift(shift);
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int row = 0; row < height; ++row) {
+    const int32_t *const a_row = A + row * buf_stride;
+    const int32_t *const b_row = B + row * buf_stride;
+    int32_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < width; col += 4) {
+      const __m128i packed =
+          xx_loadl_64(dgd_real + ((row * dgd_stride + col) << highbd));
+      const __m128i pix =
+          highbd ? _mm_cvtepu16_epi32(packed) : _mm_cvtepu8_epi32(packed);
+      const __m128i wa = cross_sum(a_row + col, buf_stride);
+      const __m128i wb = cross_sum(b_row + col, buf_stride);
+      const __m128i acc = _mm_add_epi32(_mm_madd_epi16(wa, pix), wb);
+      xx_storeu_128(out_row + col,
+                    _mm_srai_epi32(_mm_add_epi32(acc, half), shift));
+    }
+  }
+}
+
+// Computes the A and B coefficient planes for the "fast" self-guided filter.
+// Only every second row is evaluated (i = -1, 1, 3, ...), four columns at a
+// time starting from column -1.
+//
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+//
+//   A, B           output planes, written at rows/columns -1 .. height/width
+//   C, D           input integral images (box sums are read via
+//                  boxsum_from_ii)
+//   buf_stride     row stride, in elements, shared by all four planes
+//   sgr_params_idx index into av1_sgr_params
+//   radius_idx     which of the two radii/strengths of that entry to use
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+                         const int32_t *D, int width, int height,
+                         int buf_stride, int bit_depth, int sgr_params_idx,
+                         int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+  const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
+
+  const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks. mask[k] keeps the low k 32-bit lanes and zeroes the others:
+  // four 0xff bytes in the low dword are shifted right by (4 - k) bytes and
+  // every surviving byte is sign-extended to a full 32-bit lane.
+  //
+  // Only the low dword may be set here. If the whole low qword were ones, the
+  // shifts used below (8..32 bits) would leave the low four bytes untouched
+  // and every mask would be all ones, i.e. the masking would be a no-op.
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0, ~0);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
+  for (int i = -1; i < height + 1; i += 2) {
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
+
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      // idx is the number of lanes that still lie inside columns -1 .. width.
+      int idx = AOMMIN(4, width + 1 - j);
+      assert(idx >= 1);
+
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
+
+      const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+      // z = min(round(p * s >> SGRPROJ_MTABLE_BITS), 255). The clamp keeps z a
+      // valid index for the 256-entry lookup below.
+      const __m128i z = _mm_min_epi32(
+          _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+                         SGRPROJ_MTABLE_BITS),
+          _mm_set1_epi32(255));
+
+      // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+      // gather using scalar loads.
+      const __m128i a_res =
+          _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+      xx_storeu_128(A + i * buf_stride + j, a_res);
+
+      const __m128i a_complement =
+          _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both small enough for 16-bit lanes, so we can multiply them first
+      // with madd and only then do the full 32-bit multiply by sum1.
+      const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+      const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+      const __m128i b_res =
+          _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+      xx_storeu_128(B + i * buf_stride + j, b_res);
+    }
+  }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+  // Only the rows directly above and below buf contribute on even rows.
+  const int32_t *above = buf - stride;
+  const int32_t *below = buf + stride;
+
+  // Weight 6: the two vertical neighbours.
+  const __m128i sixes =
+      _mm_add_epi32(xx_loadu_128(above), xx_loadu_128(below));
+
+  // Weight 5: the four diagonal neighbours.
+  const __m128i top_corners =
+      _mm_add_epi32(xx_loadu_128(above - 1), xx_loadu_128(above + 1));
+  const __m128i bottom_corners =
+      _mm_add_epi32(xx_loadu_128(below - 1), xx_loadu_128(below + 1));
+  const __m128i fives = _mm_add_epi32(top_corners, bottom_corners);
+
+  // 5 * fives + 6 * sixes == 4 * total + total + sixes
+  const __m128i total = _mm_add_epi32(fives, sixes);
+  const __m128i total_x5 = _mm_add_epi32(_mm_slli_epi32(total, 2), total);
+  return _mm_add_epi32(total_x5, sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+  // Weight 6 for the centre, weight 5 for the left and right neighbours.
+  const __m128i centre = xx_loadu_128(buf);
+  const __m128i sides =
+      _mm_add_epi32(xx_loadu_128(buf - 1), xx_loadu_128(buf + 1));
+
+  // 5 * sides + 6 * centre == 4 * total + total + centre
+  const __m128i total = _mm_add_epi32(sides, centre);
+  const __m128i total_x5 = _mm_add_epi32(_mm_slli_epi32(total, 2), total);
+  return _mm_add_epi32(total_x5, centre);
+}
+
+// Final stage of the "fast" self-guided restoration filter.
+//
+// Computes a weighted average across A, B using a row-parity dependent
+// "cross sum":
+//   even rows: cross_sum_fast_even_row (rows above and below, weights sum to
+//              32, so nb = 5)
+//   odd rows:  cross_sum_fast_odd_row (the row itself, weights sum to 16, so
+//              nb = 4)
+// The result (a * src + b) is rounded and shifted right by
+// SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS. Four outputs are written per inner
+// iteration.
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
+  const int nb_even = 5;
+  const int nb_odd = 4;
+  const int shift_even = SGRPROJ_SGR_BITS + nb_even - SGRPROJ_RST_BITS;
+  const int shift_odd = SGRPROJ_SGR_BITS + nb_odd - SGRPROJ_RST_BITS;
+  const __m128i round_even = round_for_shift(shift_even);
+  const __m128i round_odd = round_for_shift(shift_odd);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int row = 0; row < height; ++row) {
+    const int is_odd_row = row & 1;
+    const int32_t *A_row = A + row * buf_stride;
+    const int32_t *B_row = B + row * buf_stride;
+    int32_t *dst_row = dst + row * dst_stride;
+
+    for (int col = 0; col < width; col += 4) {
+      // Load 8 bytes and widen the first four pixels to 32-bit lanes.
+      const __m128i raw =
+          xx_loadl_64(dgd_real + ((row * dgd_stride + col) << highbd));
+      const __m128i src =
+          highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+      __m128i w;
+      if (is_odd_row) {
+        const __m128i a = cross_sum_fast_odd_row(A_row + col);
+        const __m128i b = cross_sum_fast_odd_row(B_row + col);
+        const __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        w = _mm_srai_epi32(_mm_add_epi32(v, round_odd), shift_odd);
+      } else {
+        const __m128i a = cross_sum_fast_even_row(A_row + col, buf_stride);
+        const __m128i b = cross_sum_fast_even_row(B_row + col, buf_stride);
+        const __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        w = _mm_srai_epi32(_mm_add_epi32(v, round_even), shift_even);
+      }
+
+      xx_storeu_128(dst_row + col, w);
+    }
+  }
+}
+
+// Runs the self-guided filter(s) selected by sgr_params_idx over a
+// width x height block of dgd8 and writes the unprojected results to flt0
+// (radius 0 of the parameter set, fast variant) and/or flt1 (radius 1, full
+// variant). A radius of 0 disables the corresponding pass.
+//
+// Returns 0 on success, or -1 if the scratch buffer cannot be allocated.
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                      int height, int dgd_stride, int32_t *flt0,
+                                      int32_t *flt1, int flt_stride,
+                                      int sgr_params_idx, int bit_depth,
+                                      int highbd) {
+  // A single zero-initialised, 16-byte aligned allocation provides four
+  // planes of RESTORATION_PROC_UNIT_PELS elements each: A, B, C and D.
+  const size_t buf_bytes = 4 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
+  int32_t *buf = (int32_t *)aom_memalign(16, buf_bytes);
+  if (!buf) return -1;
+  memset(buf, 0, buf_bytes);
+
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes for efficiency.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+  // The "tl" pointers point at the top-left of the initialised data for each
+  // plane. Adding 3 here ensures that column 1 is 16-byte aligned.
+  int32_t *const Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+  // Offset from a "tl" pointer to logical position (0, 0):
+  //  - one row and one column to step past the leading zero row/column that
+  //    the integral images (C and D here) are stated to carry; the same step
+  //    is applied to A and B so all four planes share one coordinate system;
+  //  - then the (SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ) border.
+  const int buf_diag_border =
+      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+  const int origin_offset = 1 + buf_stride + buf_diag_border;
+
+  // A, B, C, D point at position (0, 0).
+  int32_t *A = Atl + origin_offset;
+  int32_t *B = Btl + origin_offset;
+  int32_t *C = Ctl + origin_offset;
+  int32_t *D = Dtl + origin_offset;
+
+  // Top-left corner of the bordered input region.
+  const int dgd_diag_border =
+      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+  // Generate integral images from the input. C will contain sums of squares; D
+  // will contain just sums
+  if (highbd) {
+    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                           height_ext, Ctl, Dtl, buf_stride);
+  } else {
+    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+                    buf_stride);
+  }
+
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  // Write to flt0 and flt1
+  // If params->r == 0 we skip the corresponding filter. We only allow one of
+  // the radii to be 0, as having both equal to 0 would be equivalent to
+  // skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  // First pass: fast variant, which fills A and B on every second row only.
+  if (params->r[0] > 0) {
+    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+                 sgr_params_idx, 0);
+    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                      width, height, highbd);
+  }
+
+  // Second pass: full variant. A and B are overwritten and reused.
+  if (params->r[1] > 0) {
+    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+            1);
+    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+                 height, highbd);
+  }
+
+  aom_free(buf);
+  return 0;
+}
+
+// Applies self-guided restoration to a width x height block: runs the
+// self-guided filter(s) into tmpbuf, then blends the filter outputs with the
+// source using the weights decoded from xqd and writes clamped pixels to dst8.
+//
+// tmpbuf must provide room for two planes of RESTORATION_UNITPELS_MAX
+// elements. Pixels are processed eight at a time, so rows are read and written
+// in multiples of eight columns.
+//
+// Returns 0 on success, or the non-zero error code from
+// av1_selfguided_restoration_sse4_1.
+int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                            int height, int stride, int eps,
+                                            const int *xqd, uint8_t *dst8,
+                                            int dst_stride, int32_t *tmpbuf,
+                                            int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+
+  // The filter planes are written with a row stride of `width`.
+  const int ret = av1_selfguided_restoration_sse4_1(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  if (ret != 0) return ret;
+
+  const sgr_params_type *const params = &av1_sgr_params[eps];
+  int xq[2];
+  av1_decode_xq(xqd, xq, params);
+
+  const __m128i weight0 = _mm_set1_epi32(xq[0]);
+  const __m128i weight1 = _mm_set1_epi32(xq[1]);
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  const int out_shift = SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS;
+  const __m128i rounding = round_for_shift(out_shift);
+
+  for (int row = 0; row < height; ++row) {
+    // Calculate output in batches of 8 pixels
+    for (int col = 0; col < width; col += 8) {
+      const int flt_idx = row * width + col;
+      const int dst_idx = row * dst_stride + col;
+      const uint8_t *src_ptr = dat8 + row * stride + col;
+
+      // Eight source pixels as 16-bit lanes.
+      __m128i src;
+      if (highbd) {
+        src = xx_loadu_128(CONVERT_TO_SHORTPTR(src_ptr));
+      } else {
+        src = _mm_cvtepu8_epi16(xx_loadl_64(src_ptr));
+      }
+
+      // u = src << SGRPROJ_RST_BITS, split into two groups of four 32-bit
+      // lanes (low half / high half).
+      const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+      const __m128i u_lo = _mm_cvtepu16_epi32(u);
+      const __m128i u_hi = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+      // v = (u << SGRPROJ_PRJ_BITS) + sum over enabled filters of
+      //     xq[k] * (flt_k - u)
+      __m128i v_lo = _mm_slli_epi32(u_lo, SGRPROJ_PRJ_BITS);
+      __m128i v_hi = _mm_slli_epi32(u_hi, SGRPROJ_PRJ_BITS);
+
+      if (use_flt0) {
+        const __m128i d_lo =
+            _mm_sub_epi32(xx_loadu_128(&flt0[flt_idx]), u_lo);
+        const __m128i d_hi =
+            _mm_sub_epi32(xx_loadu_128(&flt0[flt_idx + 4]), u_hi);
+        v_lo = _mm_add_epi32(v_lo, _mm_mullo_epi32(weight0, d_lo));
+        v_hi = _mm_add_epi32(v_hi, _mm_mullo_epi32(weight0, d_hi));
+      }
+
+      if (use_flt1) {
+        const __m128i d_lo =
+            _mm_sub_epi32(xx_loadu_128(&flt1[flt_idx]), u_lo);
+        const __m128i d_hi =
+            _mm_sub_epi32(xx_loadu_128(&flt1[flt_idx + 4]), u_hi);
+        v_lo = _mm_add_epi32(v_lo, _mm_mullo_epi32(weight1, d_lo));
+        v_hi = _mm_add_epi32(v_hi, _mm_mullo_epi32(weight1, d_hi));
+      }
+
+      const __m128i w_lo =
+          _mm_srai_epi32(_mm_add_epi32(v_lo, rounding), out_shift);
+      const __m128i w_hi =
+          _mm_srai_epi32(_mm_add_epi32(v_hi, rounding), out_shift);
+
+      if (highbd) {
+        // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        const __m128i packed = _mm_packus_epi32(w_lo, w_hi);
+        const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+        const __m128i res = _mm_min_epi16(packed, max);
+        xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + dst_idx), res);
+      } else {
+        // Pack into 8 bits and clamp to [0, 256)
+        const __m128i packed = _mm_packs_epi32(w_lo, w_hi);
+        const __m128i res =
+            _mm_packus_epi16(packed, packed /* "don't care" value */);
+        xx_storel_64(dst8 + dst_idx, res);
+      }
+    }
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_avx2.c b/third_party/aom/av1/common/x86/warp_plane_avx2.c
new file mode 100644
index 0000000000..663b8cde93
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_avx2.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Byte-shuffle masks for _mm256_shuffle_epi8.
+//
+// shuffle_alpha0_maskXY_avx2: every 16-bit slot selects bytes X and Y of its
+// 128-bit lane, i.e. one byte pair is replicated across the whole lane. Used
+// by prepare_horizontal_filter_coeff_alpha0_avx2 to spread one pair of filter
+// coefficients over all positions.
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
+};
+
+// shuffle_gamma0_maskN_avx2: every 32-bit slot selects bytes 4N .. 4N + 3 of
+// its 128-bit lane, i.e. one 4-byte group is replicated across the lane.
+// NOTE(review): presumably the vertical-filter counterpart of the alpha0 masks
+// for the gamma == 0 case; their users are not part of this section - confirm.
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+// shuffle_src0 .. shuffle_src3: source-pixel gather patterns. Each table pairs
+// byte b with byte b + 2, starting at offsets 0, 4, 1 and 5 respectively, and
+// repeats the same pattern in both 128-bit lanes.
+// NOTE(review): presumably these are what callers load into the four-entry
+// shuffle_src array consumed by filter_src_pixels_avx2 - confirm at the call
+// site, which is outside this section.
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+ 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+ 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7,
+ 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10,
+ 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
+ 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+ 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8,
+ 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11,
+ 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 };
+
+// Horizontal filter kernel shared by all warp_horizontal_filter_* variants.
+//
+// src is rearranged with the four shuffle_src masks, each arrangement is
+// multiplied against the matching coeff[] register with
+// _mm256_maddubs_epi16 (unsigned source bytes times signed coefficient bytes,
+// adjacent products summed), and the four partial sums are accumulated.
+// After adding *round_const the sum is logically shifted right by *shift and
+// stored in horz_out[row].
+static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
+                                          __m256i *coeff,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift, int row) {
+  // Partial sums from coeff[0] and coeff[1].
+  const __m256i sum_01 = _mm256_add_epi16(
+      _mm256_maddubs_epi16(_mm256_shuffle_epi8(src, shuffle_src[0]), coeff[0]),
+      _mm256_maddubs_epi16(_mm256_shuffle_epi8(src, shuffle_src[1]),
+                           coeff[1]));
+
+  // Partial sums from coeff[2] and coeff[3].
+  const __m256i sum_23 = _mm256_add_epi16(
+      _mm256_maddubs_epi16(_mm256_shuffle_epi8(src, shuffle_src[2]), coeff[2]),
+      _mm256_maddubs_epi16(_mm256_shuffle_epi8(src, shuffle_src[3]),
+                           coeff[3]));
+
+  const __m256i biased =
+      _mm256_add_epi16(_mm256_add_epi16(sum_01, sum_23), *round_const);
+  horz_out[row] = _mm256_srl_epi16(biased, *shift);
+}
+
+// Builds the four coefficient registers for filtering two rows at once in the
+// general case (alpha != 0, beta != 0).
+//
+// For each of the eight positions t = 0..7 the 8-byte filter selected by
+// (sx + t * alpha) goes into the lower 128-bit lane and the one selected by
+// (sx + beta + t * alpha), i.e. the following row, into the upper lane. The
+// eight registers are then transposed with 16/32/64-bit unpacks into coeff[0]
+// .. coeff[3].
+static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
+                                                        int sx,
+                                                        __m256i *coeff) {
+  __m256i taps[8];
+  for (int t = 0; t < 8; ++t) {
+    const __m128i row0 = _mm_loadl_epi64(
+        (__m128i *)&av1_filter_8bit[((unsigned)(sx + t * alpha)) >>
+                                    WARPEDDIFF_PREC_BITS]);
+    const __m128i row1 = _mm_loadl_epi64(
+        (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + t * alpha) >>
+                                    WARPEDDIFF_PREC_BITS]);
+    taps[t] = _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1);
+  }
+
+  // Interleave 16-bit coefficient pairs of positions (0,2), (1,3), (4,6), (5,7).
+  const __m256i pair_02 = _mm256_unpacklo_epi16(taps[0], taps[2]);
+  const __m256i pair_13 = _mm256_unpacklo_epi16(taps[1], taps[3]);
+  const __m256i pair_46 = _mm256_unpacklo_epi16(taps[4], taps[6]);
+  const __m256i pair_57 = _mm256_unpacklo_epi16(taps[5], taps[7]);
+
+  // Merge the even positions (0,2,4,6) and the odd positions (1,3,5,7).
+  const __m256i even_lo = _mm256_unpacklo_epi32(pair_02, pair_46);
+  const __m256i even_hi = _mm256_unpackhi_epi32(pair_02, pair_46);
+  const __m256i odd_lo = _mm256_unpacklo_epi32(pair_13, pair_57);
+  const __m256i odd_hi = _mm256_unpackhi_epi32(pair_13, pair_57);
+
+  coeff[0] = _mm256_unpacklo_epi64(even_lo, odd_lo);
+  coeff[1] = _mm256_unpackhi_epi64(even_lo, odd_lo);
+  coeff[2] = _mm256_unpacklo_epi64(even_hi, odd_hi);
+  coeff[3] = _mm256_unpackhi_epi64(even_hi, odd_hi);
+}
+
+// Coefficient setup for the beta == 0 case: both rows of a pair use the same
+// filters, so the eight filters selected by (sx + t * alpha), t = 0..7, are
+// interleaved once in 128 bits and broadcast to both lanes before the final
+// transpose into coeff[0] .. coeff[3].
+static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
+                                                              __m256i *coeff) {
+  __m128i taps[8];
+  for (int t = 0; t < 8; ++t) {
+    taps[t] = _mm_loadl_epi64(
+        (__m128i *)&av1_filter_8bit[(sx + t * alpha) >> WARPEDDIFF_PREC_BITS]);
+  }
+
+  // Interleave 16-bit coefficient pairs, then copy each result to both lanes.
+  const __m256i pair_02 =
+      _mm256_broadcastsi128_si256(_mm_unpacklo_epi16(taps[0], taps[2]));
+  const __m256i pair_13 =
+      _mm256_broadcastsi128_si256(_mm_unpacklo_epi16(taps[1], taps[3]));
+  const __m256i pair_46 =
+      _mm256_broadcastsi128_si256(_mm_unpacklo_epi16(taps[4], taps[6]));
+  const __m256i pair_57 =
+      _mm256_broadcastsi128_si256(_mm_unpacklo_epi16(taps[5], taps[7]));
+
+  const __m256i even_lo = _mm256_unpacklo_epi32(pair_02, pair_46);
+  const __m256i even_hi = _mm256_unpackhi_epi32(pair_02, pair_46);
+  const __m256i odd_lo = _mm256_unpacklo_epi32(pair_13, pair_57);
+  const __m256i odd_hi = _mm256_unpackhi_epi32(pair_13, pair_57);
+
+  coeff[0] = _mm256_unpacklo_epi64(even_lo, odd_lo);
+  coeff[1] = _mm256_unpackhi_epi64(even_lo, odd_lo);
+  coeff[2] = _mm256_unpacklo_epi64(even_hi, odd_hi);
+  coeff[3] = _mm256_unpackhi_epi64(even_hi, odd_hi);
+}
+
+// Coefficient setup for the alpha == 0 case: every position in a row uses the
+// same filter, so only two filters are loaded - the one selected by sx (lower
+// lane) and the one selected by sx + beta (upper lane). Each coeff[n] is that
+// register with byte pair (2n, 2n + 1) replicated across the lane.
+static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
+                                                               __m256i *coeff) {
+  const __m128i row0 =
+      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+  const __m128i row1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
+  const __m256i both_rows =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 0x1);
+
+  const __m256i pick_01 =
+      _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2);
+  const __m256i pick_23 =
+      _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2);
+  const __m256i pick_45 =
+      _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2);
+  const __m256i pick_67 =
+      _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2);
+
+  coeff[0] = _mm256_shuffle_epi8(both_rows, pick_01);
+  coeff[1] = _mm256_shuffle_epi8(both_rows, pick_23);
+  coeff[2] = _mm256_shuffle_epi8(both_rows, pick_45);
+  coeff[3] = _mm256_shuffle_epi8(both_rows, pick_67);
+}
+
+// Filters one pair of rows in the general (alpha != 0, beta != 0) case: builds
+// the per-position coefficients for sx, then runs the shared filter kernel and
+// stores the result in horz_out[row].
+static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
+                                          int sx, int alpha, int beta, int row,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift) {
+  __m256i taps[4];
+
+  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, taps);
+  filter_src_pixels_avx2(src, horz_out, taps, shuffle_src, round_const, shift,
+                         row);
+}
+// Single-row coefficient setup: loads the eight filters selected by
+// (sx + t * alpha), t = 0..7, and transposes them into the lower 128 bits of
+// coeff[0] .. coeff[3]. The upper 128 bits are left unspecified
+// (_mm256_castsi128_si256).
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m256i *coeff) {
+  __m128i taps[8];
+  for (int t = 0; t < 8; ++t) {
+    taps[t] = _mm_loadl_epi64(
+        (__m128i *)&av1_filter_8bit[(sx + t * alpha) >> WARPEDDIFF_PREC_BITS]);
+  }
+
+  // Interleave 16-bit coefficient pairs of positions (0,2), (1,3), (4,6), (5,7).
+  const __m128i pair_02 = _mm_unpacklo_epi16(taps[0], taps[2]);
+  const __m128i pair_13 = _mm_unpacklo_epi16(taps[1], taps[3]);
+  const __m128i pair_46 = _mm_unpacklo_epi16(taps[4], taps[6]);
+  const __m128i pair_57 = _mm_unpacklo_epi16(taps[5], taps[7]);
+
+  const __m128i even_lo = _mm_unpacklo_epi32(pair_02, pair_46);
+  const __m128i even_hi = _mm_unpackhi_epi32(pair_02, pair_46);
+  const __m128i odd_lo = _mm_unpacklo_epi32(pair_13, pair_57);
+  const __m128i odd_hi = _mm_unpackhi_epi32(pair_13, pair_57);
+
+  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(even_lo, odd_lo));
+  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(even_lo, odd_lo));
+  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(even_hi, odd_hi));
+  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(even_hi, odd_hi));
+}
+
+// Horizontal warp filter, general case (alpha != 0, beta != 0).
+//
+// Source rows iy4 + k are visited from k = -7 upwards, with the row index
+// clamped to [0, height - 1]. Rows are filtered two at a time (row k in the
+// lower 128-bit lane, row k + 1 in the upper lane) while
+// k <= AOMMIN(8, p_height - i) - 2; the one row left over is then filtered on
+// its own. Results go to consecutive entries of horz_out.
+static INLINE void warp_horizontal_filter_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  const int last_pair_k = AOMMIN(8, p_height - i) - 2;
+  const int max_y = height - 1;
+  int out_idx = 0;
+  int k = -7;
+
+  while (k <= last_pair_k) {
+    const int y0 = clamp(iy4 + k, 0, max_y);
+    const int y1 = clamp(iy4 + k + 1, 0, max_y);
+    const __m128i row0 =
+        _mm_loadu_si128((__m128i *)(ref + y0 * stride + ix4 - 7));
+    const __m128i row1 =
+        _mm_loadu_si128((__m128i *)(ref + y1 * stride + ix4 - 7));
+    const __m256i row_pair =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 0x1);
+
+    horizontal_filter_avx2(row_pair, horz_out, sx4 + beta * (k + 4), alpha,
+                           beta, out_idx, shuffle_src, round_const, shift);
+    ++out_idx;
+    k += 2;
+  }
+
+  // Left-over row: only the lower 128-bit lane carries data.
+  const int y_last = clamp(iy4 + k, 0, max_y);
+  const __m256i last_row = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + y_last * stride + ix4 - 7)));
+  __m256i taps[4];
+  prepare_horizontal_filter_coeff(alpha, sx4 + beta * (k + 4), taps);
+  filter_src_pixels_avx2(last_row, horz_out, taps, shuffle_src, round_const,
+                         shift, out_idx);
+}
+
+// Horizontal warp filter specialised for alpha == 0 (alpha is ignored): one
+// filter per row, rebuilt for every row pair because it still depends on beta.
+//
+// Row traversal matches the general variant: rows iy4 + k from k = -7, clamped
+// to [0, height - 1], filtered in pairs while
+// k <= AOMMIN(8, p_height - i) - 2, then one left-over row on its own.
+static INLINE void warp_horizontal_filter_alpha0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  const int last_pair_k = AOMMIN(8, p_height - i) - 2;
+  const int max_y = height - 1;
+  __m256i taps[4];
+  int out_idx = 0;
+  int k = -7;
+
+  while (k <= last_pair_k) {
+    const int y0 = clamp(iy4 + k, 0, max_y);
+    const int y1 = clamp(iy4 + k + 1, 0, max_y);
+    const __m128i row0 =
+        _mm_loadu_si128((__m128i *)(ref + y0 * stride + ix4 - 7));
+    const __m128i row1 =
+        _mm_loadu_si128((__m128i *)(ref + y1 * stride + ix4 - 7));
+    const __m256i row_pair =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 0x1);
+
+    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4 + beta * (k + 4),
+                                                taps);
+    filter_src_pixels_avx2(row_pair, horz_out, taps, shuffle_src, round_const,
+                           shift, out_idx);
+    ++out_idx;
+    k += 2;
+  }
+
+  // Left-over row: only the lower 128-bit lane carries data.
+  const int y_last = clamp(iy4 + k, 0, max_y);
+  const __m256i last_row = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + y_last * stride + ix4 - 7)));
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4 + beta * (k + 4), taps);
+  filter_src_pixels_avx2(last_row, horz_out, taps, shuffle_src, round_const,
+                         shift, out_idx);
+}
+
+// Horizontal warp filter specialised for beta == 0 (beta is ignored): the
+// coefficients do not change from row to row, so they are built once from sx4
+// and reused for every row.
+//
+// Row traversal matches the general variant: rows iy4 + k from k = -7, clamped
+// to [0, height - 1], filtered in pairs while
+// k <= AOMMIN(8, p_height - i) - 2, then one left-over row on its own.
+static INLINE void warp_horizontal_filter_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)beta;
+  const int last_pair_k = AOMMIN(8, p_height - i) - 2;
+  const int max_y = height - 1;
+  int out_idx = 0;
+  int k = -7;
+
+  __m256i taps[4];
+  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, taps);
+
+  while (k <= last_pair_k) {
+    const int y0 = clamp(iy4 + k, 0, max_y);
+    const int y1 = clamp(iy4 + k + 1, 0, max_y);
+    const __m128i row0 =
+        _mm_loadu_si128((__m128i *)(ref + y0 * stride + ix4 - 7));
+    const __m128i row1 =
+        _mm_loadu_si128((__m128i *)(ref + y1 * stride + ix4 - 7));
+    const __m256i row_pair =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 0x1);
+
+    filter_src_pixels_avx2(row_pair, horz_out, taps, shuffle_src, round_const,
+                           shift, out_idx);
+    ++out_idx;
+    k += 2;
+  }
+
+  // Left-over row: only the lower 128-bit lane carries data.
+  const int y_last = clamp(iy4 + k, 0, max_y);
+  const __m256i last_row = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + y_last * stride + ix4 - 7)));
+  filter_src_pixels_avx2(last_row, horz_out, taps, shuffle_src, round_const,
+                         shift, out_idx);
+}
+
+// Horizontal warp filter specialised for alpha == 0 and beta == 0: a single
+// filter applies to every position of every row, so the coefficients are
+// built once from sx4. alpha is ignored; beta is only forwarded to the
+// coefficient helper.
+//
+// Row traversal matches the general variant: rows iy4 + k from k = -7, clamped
+// to [0, height - 1], filtered in pairs while
+// k <= AOMMIN(8, p_height - i) - 2, then one left-over row on its own.
+static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  const int last_pair_k = AOMMIN(8, p_height - i) - 2;
+  const int max_y = height - 1;
+  int out_idx = 0;
+  int k = -7;
+
+  __m256i taps[4];
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, taps);
+
+  while (k <= last_pair_k) {
+    const int y0 = clamp(iy4 + k, 0, max_y);
+    const int y1 = clamp(iy4 + k + 1, 0, max_y);
+    const __m128i row0 =
+        _mm_loadu_si128((__m128i *)(ref + y0 * stride + ix4 - 7));
+    const __m128i row1 =
+        _mm_loadu_si128((__m128i *)(ref + y1 * stride + ix4 - 7));
+    const __m256i row_pair =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 0x1);
+
+    filter_src_pixels_avx2(row_pair, horz_out, taps, shuffle_src, round_const,
+                           shift, out_idx);
+    ++out_idx;
+    k += 2;
+  }
+
+  // Left-over row: only the lower 128-bit lane carries data.
+  const int y_last = clamp(iy4 + k, 0, max_y);
+  const __m256i last_row = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + y_last * stride + ix4 - 7)));
+  filter_src_pixels_avx2(last_row, horz_out, taps, shuffle_src, round_const,
+                         shift, out_idx);
+}
+
+// Prepares the constants for the compound-prediction output stage:
+//   *res_sub_const    = -(1 << s) - (1 << (s - 1)) in every 16-bit lane, with
+//                       s = offset_bits - conv_params->round_1
+//   *round_bits_const = (1 << round_bits) >> 1 in every 16-bit lane
+//   *wt               = 16-bit lanes alternating fwd_offset, bck_offset
+static INLINE void unpack_weights_and_set_round_const_avx2(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
+  const int offset_shift = offset_bits - conv_params->round_1;
+  *res_sub_const = _mm256_set1_epi16(-(1 << offset_shift) -
+                                     (1 << (offset_shift - 1)));
+  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
+
+  // Interleave the forward and backward weights: fwd, bck, fwd, bck, ...
+  const __m256i fwd = _mm256_set1_epi16((short)conv_params->fwd_offset);
+  const __m256i bck = _mm256_set1_epi16((short)conv_params->bck_offset);
+  *wt = _mm256_unpacklo_epi16(fwd, bck);
+}
+
+// Loads and transposes the vertical filter coefficients for two output rows.
+// For every gamma multiple m in 0..7 a filter is fetched from
+// av1_warped_filter at (sy + m * gamma) for the low 128-bit lane and at
+// (sy + delta + m * gamma) for the high lane. Even multiples end up in
+// coeffs[0..3], odd multiples in coeffs[4..7].
+static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
+                                                       int sy,
+                                                       __m256i *coeffs) {
+  // pass 0: multiples 0, 2, 4, 6; pass 1: multiples 1, 3, 5, 7.
+  for (int pass = 0; pass < 2; ++pass) {
+    __m256i filt[4];
+    for (int n = 0; n < 4; ++n) {
+      const int step = (2 * n + pass) * gamma;
+      const __m128i lane_lo = _mm_loadu_si128(
+          (__m128i *)(av1_warped_filter +
+                      ((sy + step) >> WARPEDDIFF_PREC_BITS)));
+      const __m128i lane_hi = _mm_loadu_si128(
+          (__m128i *)(av1_warped_filter +
+                      (((sy + delta) + step) >> WARPEDDIFF_PREC_BITS)));
+      filt[n] = _mm256_inserti128_si256(_mm256_castsi128_si256(lane_lo),
+                                        lane_hi, 0x1);
+    }
+
+    // 32-bit then 64-bit interleaves gather the same tap pair of all four
+    // filters into one register.
+    const __m256i lo_01 = _mm256_unpacklo_epi32(filt[0], filt[1]);
+    const __m256i lo_23 = _mm256_unpacklo_epi32(filt[2], filt[3]);
+    const __m256i hi_01 = _mm256_unpackhi_epi32(filt[0], filt[1]);
+    const __m256i hi_23 = _mm256_unpackhi_epi32(filt[2], filt[3]);
+
+    __m256i *const out = coeffs + 4 * pass;
+    out[0] = _mm256_unpacklo_epi64(lo_01, lo_23);
+    out[1] = _mm256_unpackhi_epi64(lo_01, lo_23);
+    out[2] = _mm256_unpacklo_epi64(hi_01, hi_23);
+    out[3] = _mm256_unpackhi_epi64(hi_01, hi_23);
+  }
+}
+
+// Variant of the vertical coefficient setup that takes no delta: each filter
+// fetched from av1_warped_filter at (sy + m * gamma), m = 0..7, is broadcast
+// to both 128-bit lanes. Even multiples end up in coeffs[0..3], odd multiples
+// in coeffs[4..7].
+static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
+                                                              __m256i *coeffs) {
+  // pass 0: multiples 0, 2, 4, 6; pass 1: multiples 1, 3, 5, 7.
+  for (int pass = 0; pass < 2; ++pass) {
+    __m256i filt[4];
+    for (int n = 0; n < 4; ++n) {
+      const int step = (2 * n + pass) * gamma;
+      const __m128i taps = _mm_loadu_si128(
+          (__m128i *)(av1_warped_filter +
+                      ((sy + step) >> WARPEDDIFF_PREC_BITS)));
+      filt[n] = _mm256_broadcastsi128_si256(taps);
+    }
+
+    // 32-bit then 64-bit interleaves gather the same tap pair of all four
+    // filters into one register.
+    const __m256i lo_01 = _mm256_unpacklo_epi32(filt[0], filt[1]);
+    const __m256i lo_23 = _mm256_unpacklo_epi32(filt[2], filt[3]);
+    const __m256i hi_01 = _mm256_unpackhi_epi32(filt[0], filt[1]);
+    const __m256i hi_23 = _mm256_unpackhi_epi32(filt[2], filt[3]);
+
+    __m256i *const out = coeffs + 4 * pass;
+    out[0] = _mm256_unpacklo_epi64(lo_01, lo_23);
+    out[1] = _mm256_unpackhi_epi64(lo_01, lo_23);
+    out[2] = _mm256_unpacklo_epi64(hi_01, hi_23);
+    out[3] = _mm256_unpackhi_epi64(hi_01, hi_23);
+  }
+}
+
+// Variant of the vertical coefficient setup that takes no gamma: one filter
+// (selected by sy) fills the low 128-bit lane and one (selected by sy + delta)
+// fills the high lane. The four shuffle_gamma0_mask*_avx2 byte shuffles
+// rearrange it into coeffs[0..3]; coeffs[4..7] are copies of coeffs[0..3].
+static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
+                                                              __m256i *coeffs) {
+  const __m128i first_row_taps = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  const __m128i second_row_taps = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
+  const __m256i taps = _mm256_inserti128_si256(
+      _mm256_castsi128_si256(first_row_taps), second_row_taps, 0x1);
+
+  const __m256i mask0 =
+      _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2);
+  const __m256i mask1 =
+      _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2);
+  const __m256i mask2 =
+      _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2);
+  const __m256i mask3 =
+      _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2);
+
+  coeffs[0] = _mm256_shuffle_epi8(taps, mask0);
+  coeffs[1] = _mm256_shuffle_epi8(taps, mask1);
+  coeffs[2] = _mm256_shuffle_epi8(taps, mask2);
+  coeffs[3] = _mm256_shuffle_epi8(taps, mask3);
+
+  for (int n = 0; n < 4; ++n) {
+    coeffs[n + 4] = coeffs[n];
+  }
+}
+
+// Runs the vertical multiply-accumulate for one step of the sliding window.
+// src[0..5] must already be filled by the caller; src[6] and src[7] are built
+// here from horz_out[row + 3] and horz_out[row + 4] and left in 'src' for the
+// caller. Even src entries are multiplied by coeffs[0..3], odd entries by
+// coeffs[4..7]; the two 32-bit sums are interleaved into *res_lo / *res_hi.
+static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
+                                                   __m256i *src,
+                                                   __m256i *coeffs,
+                                                   __m256i *res_lo,
+                                                   __m256i *res_hi, int row) {
+  const __m256i newest = horz_out[row + 3];
+  // High lane of horz_out[row + 3] followed by low lane of horz_out[row + 4].
+  const __m256i newest_next =
+      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
+
+  src[6] = _mm256_unpacklo_epi16(newest, newest_next);
+  src[7] = _mm256_unpackhi_epi16(newest, newest_next);
+
+  __m256i prod[8];
+  for (int n = 0; n < 4; ++n) {
+    prod[2 * n] = _mm256_madd_epi16(src[2 * n], coeffs[n]);
+    prod[2 * n + 1] = _mm256_madd_epi16(src[2 * n + 1], coeffs[4 + n]);
+  }
+
+  const __m256i sum_even = _mm256_add_epi32(
+      _mm256_add_epi32(prod[0], prod[2]), _mm256_add_epi32(prod[4], prod[6]));
+  const __m256i sum_odd = _mm256_add_epi32(
+      _mm256_add_epi32(prod[1], prod[3]), _mm256_add_epi32(prod[5], prod[7]));
+
+  // Rearrange pixels back into the order 0 ... 7
+  *res_lo = _mm256_unpacklo_epi32(sum_even, sum_odd);
+  *res_hi = _mm256_unpackhi_epi32(sum_even, sum_odd);
+}
+
+// Rounds the 32-bit vertical filter sums in *res_lo / *res_hi and writes two
+// output rows, (i + k + 4) and (i + (k + 1) + 4), starting at column j. In
+// every store below the low 128-bit lane goes to the first row and the high
+// lane to the second row.
+//  - Compound prediction: the rounded value is packed to 16 bits and either
+//    stored to conv_params->dst, or (do_average) combined with the value
+//    already in conv_params->dst and written to 'pred' as 8-bit pixels.
+//  - Otherwise: the rounded value is packed to 8 bits and written to 'pred'.
+// *res_lo covers columns j .. j + 3 and *res_hi covers columns j + 4 .. j + 7.
+static INLINE void store_vertical_filter_output_avx2(
+ const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const,
+ const __m256i *wt, const __m256i *res_sub_const,
+ const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params,
+ int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m256i res_lo_1 = *res_lo;
+ __m256i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ // Addresses of the two rows in the compound buffer.
+ __m128i *const p_0 =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j];
+
+ // Add the caller-supplied constant, then shift down by reduce_bits_vert.
+ res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+
+ // Saturating pack to unsigned 16 bits; both operands are the same vector,
+ // so each lane holds its four values twice.
+ const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1);
+ __m256i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const dst8_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+ // Load 64 bits per row from the compound buffer (four values, assuming
+ // dst holds 16-bit elements -- TODO confirm against ConvolveParams).
+ const __m128i p_16_0 = _mm_loadl_epi64(p_0);
+ const __m128i p_16_1 = _mm_loadl_epi64(p_1);
+ const __m256i p_16 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ // Weighted average: interleave (stored, new) pairs and multiply-add
+ // them with the interleaved weights in *wt.
+ const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16);
+ const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ // Plain average: (stored + new) >> 1.
+ res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1);
+ }
+ // Add *res_sub_const, then round and shift by round_bits.
+ res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const);
+ res_lo_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits);
+ const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16);
+ const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo);
+ const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1);
+ // Write the low 32 bits of each lane (four 8-bit pixels) to 'pred'.
+ *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0);
+ *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1);
+ } else {
+ // No averaging: store the low 64 bits of each lane to the compound
+ // buffer.
+ const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16);
+ const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1);
+ _mm_storel_epi64(p_0, temp_lo_16_0);
+ _mm_storel_epi64(p_1, temp_lo_16_1);
+ }
+ if (p_width > 4) {
+ // Same processing for columns j + 4 .. j + 7, taken from *res_hi.
+ __m128i *const p4_0 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ __m128i *const p4_1 =
+ (__m128i *)&conv_params
+ ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1);
+ __m256i res_hi_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8_4_0 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i *const dst8_4_1 =
+ (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4];
+ const __m128i p4_16_0 = _mm_loadl_epi64(p4_0);
+ const __m128i p4_16_1 = _mm_loadl_epi64(p4_1);
+ const __m256i p4_16 = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(p4_16_0), p4_16_1, 1);
+ if (conv_params->use_dist_wtd_comp_avg) {
+ const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt);
+ const __m256i shifted_32 =
+ _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const);
+ res_hi_16 = _mm256_srai_epi16(
+ _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits);
+ __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16);
+ const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi);
+ const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1);
+ *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0);
+ *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1);
+ } else {
+ const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16);
+ const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1);
+ _mm_storel_epi64(p4_0, temp_hi_16_0);
+ _mm_storel_epi64(p4_1, temp_hi_16_1);
+ }
+ }
+ } else {
+ // Non-compound: round both halves, then narrow 32 -> 16 (signed
+ // saturation) -> 8 bits (unsigned saturation).
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit);
+ const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit);
+ const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j];
+
+ if (p_width == 4) {
+ // Only four pixels per row are written: store the low 32 bits.
+ *(int *)p = _mm_cvtsi128_si32(res_8bit0);
+ *(int *)p1 = _mm_cvtsi128_si32(res_8bit1);
+ } else {
+ // Eight pixels per row: store the low 64 bits.
+ _mm_storel_epi64(p, res_8bit0);
+ _mm_storel_epi64(p1, res_8bit1);
+ }
+ }
+}
+
+// Vertical filter pass for one block, general case: the filter coefficients
+// are rebuilt from gamma and delta for every pair of output rows.
+// 'src' is a sliding window of interleaved horz_out rows: entries 0..5 are
+// primed from horz_out[0..3]; each iteration filter_src_pixels_vertical_avx2
+// fills entries 6 and 7, two rows are stored, and the window moves down by
+// two entries.
+static INLINE void warp_vertical_filter_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  __m256i src[8];
+  for (int n = 0; n < 3; ++n) {
+    const __m256i cur = horz_out[n];
+    const __m256i nxt =
+        _mm256_permute2x128_si256(horz_out[n], horz_out[n + 1], 0x21);
+    src[2 * n] = _mm256_unpacklo_epi16(cur, nxt);
+    src[2 * n + 1] = _mm256_unpackhi_epi16(cur, nxt);
+  }
+
+  const int k_limit = AOMMIN(4, p_height - i - 4);
+  int row = 0;
+  for (int k = -4; k < k_limit; k += 2, ++row) {
+    const int sy = sy4 + delta * (k + 4);
+    __m256i coeffs[8];
+    __m256i res_lo, res_hi;
+
+    prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs);
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+
+    // Slide the window: drop the two oldest entries.
+    for (int n = 0; n < 6; ++n) {
+      src[n] = src[n + 2];
+    }
+  }
+}
+
+// Vertical filter pass for one block, gamma-free variant: 'gamma' is ignored
+// and the coefficients for each pair of output rows come from
+// prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, ...).
+// 'src' is a sliding window of interleaved horz_out rows: entries 0..5 are
+// primed from horz_out[0..3]; each iteration filter_src_pixels_vertical_avx2
+// fills entries 6 and 7, two rows are stored, and the window moves down by
+// two entries.
+static INLINE void warp_vertical_filter_gamma0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)gamma;
+
+  __m256i src[8];
+  for (int n = 0; n < 3; ++n) {
+    const __m256i cur = horz_out[n];
+    const __m256i nxt =
+        _mm256_permute2x128_si256(horz_out[n], horz_out[n + 1], 0x21);
+    src[2 * n] = _mm256_unpacklo_epi16(cur, nxt);
+    src[2 * n + 1] = _mm256_unpackhi_epi16(cur, nxt);
+  }
+
+  const int k_limit = AOMMIN(4, p_height - i - 4);
+  int row = 0;
+  for (int k = -4; k < k_limit; k += 2, ++row) {
+    const int sy = sy4 + delta * (k + 4);
+    __m256i coeffs[8];
+    __m256i res_lo, res_hi;
+
+    prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs);
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+
+    // Slide the window: drop the two oldest entries.
+    for (int n = 0; n < 6; ++n) {
+      src[n] = src[n + 2];
+    }
+  }
+}
+
+// Vertical filter pass for one block, delta-free variant: 'delta' is ignored,
+// so the coefficients are built once from (gamma, sy4) and reused for every
+// pair of output rows.
+// 'src' is a sliding window of interleaved horz_out rows: entries 0..5 are
+// primed from horz_out[0..3]; each iteration filter_src_pixels_vertical_avx2
+// fills entries 6 and 7, two rows are stored, and the window moves down by
+// two entries.
+static INLINE void warp_vertical_filter_delta0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)delta;
+
+  __m256i src[8];
+  for (int n = 0; n < 3; ++n) {
+    const __m256i cur = horz_out[n];
+    const __m256i nxt =
+        _mm256_permute2x128_si256(horz_out[n], horz_out[n + 1], 0x21);
+    src[2 * n] = _mm256_unpacklo_epi16(cur, nxt);
+    src[2 * n + 1] = _mm256_unpackhi_epi16(cur, nxt);
+  }
+
+  __m256i coeffs[8];
+  prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs);
+
+  const int k_limit = AOMMIN(4, p_height - i - 4);
+  int row = 0;
+  for (int k = -4; k < k_limit; k += 2, ++row) {
+    __m256i res_lo, res_hi;
+
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+
+    // Slide the window: drop the two oldest entries.
+    for (int n = 0; n < 6; ++n) {
+      src[n] = src[n + 2];
+    }
+  }
+}
+
+// Vertical filter pass for one block when neither gamma nor delta varies the
+// filter: 'gamma' is ignored and the coefficients are built once with
+// prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, ...) and reused for
+// every pair of output rows.
+// 'src' is a sliding window of interleaved horz_out rows: entries 0..5 are
+// primed from horz_out[0..3]; each iteration filter_src_pixels_vertical_avx2
+// fills entries 6 and 7, two rows are stored, and the window moves down by
+// two entries.
+static INLINE void warp_vertical_filter_gamma0_delta0_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  (void)gamma;
+
+  __m256i src[8];
+  for (int n = 0; n < 3; ++n) {
+    const __m256i cur = horz_out[n];
+    const __m256i nxt =
+        _mm256_permute2x128_si256(horz_out[n], horz_out[n + 1], 0x21);
+    src[2 * n] = _mm256_unpacklo_epi16(cur, nxt);
+    src[2 * n + 1] = _mm256_unpackhi_epi16(cur, nxt);
+  }
+
+  __m256i coeffs[8];
+  prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs);
+
+  const int k_limit = AOMMIN(4, p_height - i - 4);
+  int row = 0;
+  for (int k = -4; k < k_limit; k += 2, ++row) {
+    __m256i res_lo, res_hi;
+
+    filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi,
+                                    row);
+    store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt,
+                                      res_sub_const, round_bits_const, pred,
+                                      conv_params, i, j, k, reduce_bits_vert,
+                                      p_stride, p_width, round_bits);
+
+    // Slide the window: drop the two oldest entries.
+    for (int n = 0; n < 6; ++n) {
+      src[n] = src[n + 2];
+    }
+  }
+}
+
+// Picks the vertical filter specialisation that matches which of gamma and
+// delta are zero, and forwards every argument to it unchanged.
+static INLINE void prepare_warp_vertical_filter_avx2(
+    uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params,
+    int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width,
+    int i, int j, int sy4, const int reduce_bits_vert,
+    const __m256i *res_add_const, const int round_bits,
+    const __m256i *res_sub_const, const __m256i *round_bits_const,
+    const __m256i *wt) {
+  if (gamma == 0) {
+    if (delta == 0) {
+      warp_vertical_filter_gamma0_delta0_avx2(
+          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+          p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits,
+          res_sub_const, round_bits_const, wt);
+    } else {
+      warp_vertical_filter_gamma0_avx2(
+          pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+          p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits,
+          res_sub_const, round_bits_const, wt);
+    }
+  } else if (delta == 0) {
+    warp_vertical_filter_delta0_avx2(
+        pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width,
+        i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const,
+        round_bits_const, wt);
+  } else {
+    warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta,
+                              p_height, p_stride, p_width, i, j, sy4,
+                              reduce_bits_vert, res_add_const, round_bits,
+                              res_sub_const, round_bits_const, wt);
+  }
+}
+
+// Picks the horizontal filter specialisation that matches which of alpha and
+// beta are zero, and forwards every argument to it unchanged.
+static INLINE void prepare_warp_horizontal_filter_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  if (alpha == 0) {
+    if (beta == 0) {
+      warp_horizontal_filter_alpha0_beta0_avx2(
+          ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+          i, round_const, shift, shuffle_src);
+    } else {
+      warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+                                         alpha, beta, p_height, height, i,
+                                         round_const, shift, shuffle_src);
+    }
+  } else if (beta == 0) {
+    warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4,
+                                      alpha, beta, p_height, height, i,
+                                      round_const, shift, shuffle_src);
+  } else {
+    warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha,
+                                beta, p_height, height, i, round_const, shift,
+                                shuffle_src);
+  }
+}
+
+// AVX2 affine warp. The p_width x p_height prediction is processed in 8x8
+// steps (i, j). For each step the centre sample (offset +4, +4) is mapped
+// through 'mat' to a source position, a horizontal filter pass fills
+// horz_out[] from 'ref', and a vertical filter pass writes the result to
+// 'pred' and/or conv_params->dst.
+// This function hard-codes a bit depth of 8 (see 'bd' below).
+void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ // Each horz_out entry holds two filtered rows (low and high 128-bit lane).
+ __m256i horz_out[8];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m256i reduce_bits_vert_const =
+ _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ // Offset plus rounding term, and shift count, for the horizontal pass.
+ const __m256i round_const = _mm256_set1_epi16(
+ (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
+ const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+
+ __m256i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const,
+ &wt);
+
+ // Constant added to the vertical sums before the final shift: compound
+ // keeps the offset (offset + rounding); non-compound uses
+ // -(1 << (bd + reduce_bits_vert - 1)) + rounding.
+ __m256i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+ // const1 / const2: constant terms added to sx4 / sy4 below.
+ const int32_t const1 = alpha * (-4) + beta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ const int32_t const2 = gamma * (-4) + delta * (-4) +
+ (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ // const3: mask of the low WARP_PARAM_REDUCE_BITS bits, cleared from sx4/sy4.
+ const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ // const4 / const5: used as const4 + pixel * const5 to build horz_out
+ // directly from one clamped pixel when the block lies fully outside the
+ // frame horizontally.
+ const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1));
+ const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ __m256i shuffle_src[4];
+ shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0);
+ shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1);
+ shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2);
+ shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3);
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // Map the block centre through the affine model using 64-bit
+ // intermediates.
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ // Split into integer (ix4, iy4) and fractional (sx4, sy4) parts.
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += const1;
+ sy4 += const2;
+
+ sx4 &= ~const3;
+ sy4 &= ~const3;
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+
+ if (ix4 <= -7) {
+ // Entirely left of the frame: every row uses the pixel in column 0.
+ // Two rows are produced per iteration (low lane = row k, high lane =
+ // row k + 1); the last row is handled after the loop.
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 =
+ _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5);
+ } else if (ix4 >= width + 6) {
+ // Entirely right of the frame: every row uses the pixel in column
+ // width - 1. Same row pairing as the branch above.
+ int iy, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_0 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ const __m256i temp_1 = _mm256_set1_epi16(
+ const4 + ref[iy * stride + (width - 1)] * const5);
+ horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0);
+ row += 1;
+ }
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ horz_out[row] =
+ _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5);
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ // The 16-byte load window starting at column ix4 - 7 crosses the left
+ // and/or right frame edge: load it anyway, then rearrange the bytes
+ // with the warp_pad_left / warp_pad_right shuffle tables, indexed by
+ // how far the window extends past each edge.
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ int iy, sx, row = 0;
+ for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src0 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ iy = iy4 + k + 1;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src1 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_left);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src0 = _mm_shuffle_epi8(src0, shuffle_reg_right);
+ src1 = _mm_shuffle_epi8(src1, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ // Low lane = row k, high lane = row k + 1.
+ const __m256i src_01 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+ horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row,
+ shuffle_src, &round_const, &shift);
+ row += 1;
+ }
+ // Last row: only one source row is loaded, placed in the low lane.
+ iy = iy4 + k;
+ iy = clamp(iy, 0, height - 1);
+ __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right =
+ _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ sx = sx4 + beta * (k + 4);
+ const __m256i src_01 = _mm256_castsi128_si256(src);
+ __m256i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src,
+ &round_const, &shift, row);
+ } else {
+ // Load window lies fully inside the frame horizontally: use the
+ // alpha/beta-specialised horizontal filters.
+ prepare_warp_horizontal_filter_avx2(
+ ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height,
+ i, &round_const, &shift, shuffle_src);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter_avx2(
+ pred, horz_out, conv_params, gamma, delta, p_height, p_stride,
+ p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
new file mode 100644
index 0000000000..4c05555ff7
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+// Layout notes:
+//  - WARPEDPIXEL_PREC_SHIFTS * 3 rows cover the three unit intervals marked
+//    below ([-1, 0), [0, 1), [1, 2)), followed by one duplicate of the final
+//    row.
+//  - Each row is 8 signed bytes and is fetched with a single 64-bit load
+//    (_mm_loadl_epi64) by the horizontal coefficient helpers in this file,
+//    indexed by a filter position shifted right by WARPEDDIFF_PREC_BITS.
+DECLARE_ALIGNED(8, const int8_t,
+ av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ // NOTE(review): presumably keeps an index one past the last real row in
+ // bounds - confirm against the callers' index range.
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+};
+/* clang-format on */
+
+// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
+// in an SSE register into two sequences:
+// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
+// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
+// even_mask / odd_mask: _mm_shuffle_epi8 controls used by filter_src_pixels().
+// The final byte of each mask (index 0) fills the "don't care" slot.
+DECLARE_ALIGNED(16, static const uint8_t,
+ even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 0 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
+ 9, 11, 11, 13, 13, 15, 15, 0 };
+
+// shuffle_alpha0_maskXY: replicates source bytes X and Y into every 16-bit
+// lane. prepare_horizontal_filter_coeff_alpha0() applies these to a single
+// av1_filter_8bit row to form coeff[0..3].
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+// shuffle_gamma0_maskN: replicates the N-th group of four source bytes into
+// every 32-bit lane. prepare_vertical_filter_coeffs_gamma0() applies these to
+// a single av1_warped_filter row to form coeffs[0..3].
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
+ 12, 13, 14, 15, 12, 13, 14, 15 };
+
+// Horizontal 8-tap filter for one row of eight output pixels.
+//
+// 'src' holds 16 consecutive source bytes and 'coeff' the four coefficient
+// registers produced by the prepare_horizontal_filter_coeff*() helpers. The
+// eight rounded, right-shifted 16-bit results are written to tmp[k + 7] in
+// the lane order 0 2 4 6 1 3 5 7.
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz, int k) {
+  // Expand the source bytes into overlapping (n, n + 2) byte pairs, one
+  // register for even n and one for odd n.
+  const __m128i mask_even = _mm_load_si128((__m128i *)even_mask);
+  const __m128i mask_odd = _mm_load_si128((__m128i *)odd_mask);
+  const __m128i pairs_even = _mm_shuffle_epi8(src, mask_even);
+  const __m128i pairs_odd = _mm_shuffle_epi8(src, mask_odd);
+
+  // Inputs for taps (0, 2): pixels 0 2 2 4 4 6 6 8 | 1 3 3 5 5 7 7 9
+  const __m128i in_02 = _mm_unpacklo_epi64(pairs_even, pairs_odd);
+  // Inputs for taps (4, 6): pixels 4 6 6 8 8 10 10 12 | 5 7 7 9 9 11 11 13
+  const __m128i in_46 = _mm_unpacklo_epi64(_mm_srli_si128(pairs_even, 4),
+                                           _mm_srli_si128(pairs_odd, 4));
+  // Inputs for taps (1, 3): pixels 1 3 3 5 5 7 7 9 | 2 4 4 6 6 8 8 10
+  const __m128i in_13 =
+      _mm_unpacklo_epi64(pairs_odd, _mm_srli_si128(pairs_even, 2));
+  // Inputs for taps (5, 7): pixels 5 7 7 9 9 11 11 13 | 6 8 8 10 10 12 12 14
+  const __m128i in_57 = _mm_unpacklo_epi64(_mm_srli_si128(pairs_odd, 4),
+                                           _mm_srli_si128(pairs_even, 6));
+
+  // Unsigned pixel x signed coefficient products, summed pairwise to 16 bits.
+  const __m128i acc_02 = _mm_maddubs_epi16(in_02, coeff[0]);
+  const __m128i acc_46 = _mm_maddubs_epi16(in_46, coeff[1]);
+  const __m128i acc_13 = _mm_maddubs_epi16(in_13, coeff[2]);
+  const __m128i acc_57 = _mm_maddubs_epi16(in_57, coeff[3]);
+
+  const int rounding =
+      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1);
+  const __m128i round_const = _mm_set1_epi16(rounding);
+
+  // acc_02 + acc_46 and acc_13 + acc_57 each fit in an int16, but their sum
+  // may not. With round_const added the total fits in a uint16, so the
+  // wrap-around of _mm_add_epi16 yields the right bits and a logical shift
+  // finishes the job.
+  const __m128i acc_even_taps = _mm_add_epi16(acc_02, acc_46);
+  const __m128i acc_odd_taps = _mm_add_epi16(acc_13, acc_57);
+  const __m128i total =
+      _mm_add_epi16(_mm_add_epi16(acc_even_taps, acc_odd_taps), round_const);
+  const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz);
+  tmp[k + 7] = _mm_srl_epi16(total, shift);
+}
+
+// Builds the four coefficient registers consumed by filter_src_pixels() when
+// the filter position advances by 'alpha' from one output pixel to the next.
+//
+// Output pixel n uses row (sx + n * alpha) >> WARPEDDIFF_PREC_BITS of
+// av1_filter_8bit. The eight rows are transposed so that every output
+// register carries one pair of taps for the pixels in lane order
+// 0 2 4 6 1 3 5 7:
+//   coeff[0]: taps 0 2    coeff[1]: taps 4 6
+//   coeff[2]: taps 1 3    coeff[3]: taps 5 7
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m128i *coeff) {
+  // One 8-byte filter row per output pixel.
+  __m128i row[8];
+  for (int n = 0; n < 8; ++n) {
+    row[n] = _mm_loadl_epi64(
+        (__m128i *)&av1_filter_8bit[(sx + n * alpha) >> WARPEDDIFF_PREC_BITS]);
+  }
+
+  // Interleave the 16-bit tap pairs of two rows. Each result holds
+  // taps 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7, alternating between the two pixels.
+  const __m128i px_02 = _mm_unpacklo_epi16(row[0], row[2]);
+  const __m128i px_13 = _mm_unpacklo_epi16(row[1], row[3]);
+  const __m128i px_46 = _mm_unpacklo_epi16(row[4], row[6]);
+  const __m128i px_57 = _mm_unpacklo_epi16(row[5], row[7]);
+
+  // Pixels 0 2 4 6: taps 0 2 (x4) then taps 4 6 (x4).
+  const __m128i even_px_taps_0246 = _mm_unpacklo_epi32(px_02, px_46);
+  // Pixels 0 2 4 6: taps 1 3 (x4) then taps 5 7 (x4).
+  const __m128i even_px_taps_1357 = _mm_unpackhi_epi32(px_02, px_46);
+  // Pixels 1 3 5 7: taps 0 2 (x4) then taps 4 6 (x4).
+  const __m128i odd_px_taps_0246 = _mm_unpacklo_epi32(px_13, px_57);
+  // Pixels 1 3 5 7: taps 1 3 (x4) then taps 5 7 (x4).
+  const __m128i odd_px_taps_1357 = _mm_unpackhi_epi32(px_13, px_57);
+
+  // Final 64-bit merge: even pixels in the low half, odd pixels in the high.
+  coeff[0] = _mm_unpacklo_epi64(even_px_taps_0246, odd_px_taps_0246);
+  coeff[1] = _mm_unpackhi_epi64(even_px_taps_0246, odd_px_taps_0246);
+  coeff[2] = _mm_unpacklo_epi64(even_px_taps_1357, odd_px_taps_1357);
+  coeff[3] = _mm_unpackhi_epi64(even_px_taps_1357, odd_px_taps_1357);
+}
+
+// Variant of prepare_horizontal_filter_coeff() that reads a single filter
+// row, av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS], and broadcasts one pair
+// of its taps into every 16-bit lane of each output register:
+//   coeff[0]: taps 0 2    coeff[1]: taps 4 6
+//   coeff[2]: taps 1 3    coeff[3]: taps 5 7
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+                                                          __m128i *coeff) {
+  const __m128i filter_row =
+      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+
+  const __m128i bcast_01 = _mm_load_si128((__m128i *)shuffle_alpha0_mask01);
+  const __m128i bcast_23 = _mm_load_si128((__m128i *)shuffle_alpha0_mask23);
+  const __m128i bcast_45 = _mm_load_si128((__m128i *)shuffle_alpha0_mask45);
+  const __m128i bcast_67 = _mm_load_si128((__m128i *)shuffle_alpha0_mask67);
+
+  // Table bytes are stored in tap order 0 2 4 6 1 3 5 7, so byte pair (0, 1)
+  // is taps 0 2, pair (2, 3) is taps 4 6, and so on.
+  coeff[0] = _mm_shuffle_epi8(filter_row, bcast_01);
+  coeff[1] = _mm_shuffle_epi8(filter_row, bcast_23);
+  coeff[2] = _mm_shuffle_epi8(filter_row, bcast_45);
+  coeff[3] = _mm_shuffle_epi8(filter_row, bcast_67);
+}
+
+// Filters one source row: derives the per-pixel coefficients for filter
+// position 'sx' and step 'alpha', then runs the 8-tap horizontal filter on
+// 'src', storing the result in tmp[k + 7].
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+                                     int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  __m128i taps[4];
+
+  prepare_horizontal_filter_coeff(alpha, sx, taps);
+  filter_src_pixels(src, tmp, taps, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+// Horizontal pass over the rows k = -7 .. min(8, p_height - i) - 1 of one
+// block. Each row is read from 'ref' at vertical position iy4 + k (clamped to
+// [0, height - 1]) starting 7 pixels left of ix4, filtered with coefficients
+// for position sx4 + beta * (k + 4) and step 'alpha', and stored in
+// tmp[k + 7].
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+                                          int stride, int32_t ix4, int32_t iy4,
+                                          int32_t sx4, int alpha, int beta,
+                                          int p_height, int height, int i,
+                                          const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  const int k_end = AOMMIN(8, p_height - i);
+
+  for (int k = -7; k < k_end; ++k) {
+    const int iy_raw = iy4 + k;
+    const int iy =
+        iy_raw < 0 ? 0 : (iy_raw > height - 1 ? height - 1 : iy_raw);
+    const int sx = sx4 + beta * (k + 4);
+
+    // 16 source bytes starting 7 pixels to the left of ix4.
+    const __m128i pixels =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    horizontal_filter(pixels, tmp, sx, alpha, k, offset_bits_horiz,
+                      reduce_bits_horiz);
+  }
+}
+
+// Horizontal pass for the case selected by prepare_warp_horizontal_filter()
+// when alpha == 0 and beta != 0: every pixel of a row shares one filter, so
+// the coefficients are broadcast from a single table row, recomputed for
+// each row position sx4 + beta * (k + 4). 'alpha' is unused.
+static INLINE void warp_horizontal_filter_alpha0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+  const int k_end = AOMMIN(8, p_height - i);
+
+  for (int k = -7; k < k_end; ++k) {
+    // Source row, clamped to [0, height - 1].
+    const int iy_raw = iy4 + k;
+    const int iy =
+        iy_raw < 0 ? 0 : (iy_raw > height - 1 ? height - 1 : iy_raw);
+    const int sx = sx4 + beta * (k + 4);
+
+    // 16 source bytes starting 7 pixels to the left of ix4.
+    const __m128i pixels =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+    __m128i taps[4];
+    prepare_horizontal_filter_coeff_alpha0(sx, taps);
+    filter_src_pixels(pixels, tmp, taps, offset_bits_horiz, reduce_bits_horiz,
+                      k);
+  }
+}
+
+// Horizontal pass for the case selected by prepare_warp_horizontal_filter()
+// when alpha != 0 and beta == 0: the filter position does not change from
+// row to row, so the per-pixel coefficients are built once from sx4 and
+// reused for every row. 'beta' is unused.
+static INLINE void warp_horizontal_filter_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+
+  __m128i taps[4];
+  prepare_horizontal_filter_coeff(alpha, sx4, taps);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Source row, clamped to [0, height - 1].
+    const int iy_raw = iy4 + k;
+    const int iy =
+        iy_raw < 0 ? 0 : (iy_raw > height - 1 ? height - 1 : iy_raw);
+
+    // 16 source bytes starting 7 pixels to the left of ix4.
+    const __m128i pixels =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(pixels, tmp, taps, offset_bits_horiz, reduce_bits_horiz,
+                      k);
+  }
+}
+
+// Horizontal pass for the case selected by prepare_warp_horizontal_filter()
+// when alpha == 0 and beta == 0: one filter, taken from sx4, is broadcast
+// once and applied to every pixel of every row. 'alpha' and 'beta' are
+// unused.
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  (void)alpha;
+
+  __m128i taps[4];
+  prepare_horizontal_filter_coeff_alpha0(sx4, taps);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Source row, clamped to [0, height - 1].
+    const int iy_raw = iy4 + k;
+    const int iy =
+        iy_raw < 0 ? 0 : (iy_raw > height - 1 ? height - 1 : iy_raw);
+
+    // 16 source bytes starting 7 pixels to the left of ix4.
+    const __m128i pixels =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(pixels, tmp, taps, offset_bits_horiz, reduce_bits_horiz,
+                      k);
+  }
+}
+
+// Prepares the constants used when storing compound predictions:
+//   *res_sub_const    - 16-bit broadcast of
+//                       -(1 << s) - (1 << (s - 1)),
+//                       with s = offset_bits - conv_params->round_1
+//   *round_bits_const - 16-bit broadcast of (1 << round_bits) >> 1
+//   *wt               - conv_params->fwd_offset and conv_params->bck_offset
+//                       in alternating 16-bit lanes (fwd first)
+static INLINE void unpack_weights_and_set_round_const(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+  const int offset_sum = -(1 << (offset_bits - conv_params->round_1)) -
+                         (1 << (offset_bits - conv_params->round_1 - 1));
+  const int half_round = (1 << round_bits) >> 1;
+
+  *res_sub_const = _mm_set1_epi16(offset_sum);
+  *round_bits_const = _mm_set1_epi16(half_round);
+
+  const __m128i fwd = _mm_set1_epi16((int16_t)conv_params->fwd_offset);
+  const __m128i bck = _mm_set1_epi16((int16_t)conv_params->bck_offset);
+  *wt = _mm_unpacklo_epi16(fwd, bck);
+}
+
+// Builds the eight coefficient registers consumed by
+// filter_src_pixels_vertical().
+//
+// Column n uses row (sy + n * gamma) >> WARPEDDIFF_PREC_BITS of
+// av1_warped_filter (eight 16-bit taps per row). The rows of the even
+// columns 0 2 4 6 are transposed into coeffs[0..3] and those of the odd
+// columns 1 3 5 7 into coeffs[4..7]; within each group the registers hold
+// tap pairs (0,1), (2,3), (4,5), (6,7) in that order.
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+                                                  __m128i *coeffs) {
+  // One filter row per output column.
+  __m128i row[8];
+  for (int n = 0; n < 8; ++n) {
+    row[n] = _mm_loadu_si128(
+        (__m128i *)(av1_warped_filter +
+                    ((sy + n * gamma) >> WARPEDDIFF_PREC_BITS)));
+  }
+
+  // parity 0 handles columns 0 2 4 6, parity 1 handles columns 1 3 5 7.
+  for (int parity = 0; parity < 2; ++parity) {
+    const __m128i a = row[parity + 0];
+    const __m128i b = row[parity + 2];
+    const __m128i c = row[parity + 4];
+    const __m128i d = row[parity + 6];
+
+    const __m128i ab_lo = _mm_unpacklo_epi32(a, b);
+    const __m128i cd_lo = _mm_unpacklo_epi32(c, d);
+    const __m128i ab_hi = _mm_unpackhi_epi32(a, b);
+    const __m128i cd_hi = _mm_unpackhi_epi32(c, d);
+
+    __m128i *const out = coeffs + 4 * parity;
+    out[0] = _mm_unpacklo_epi64(ab_lo, cd_lo);
+    out[1] = _mm_unpackhi_epi64(ab_lo, cd_lo);
+    out[2] = _mm_unpacklo_epi64(ab_hi, cd_hi);
+    out[3] = _mm_unpackhi_epi64(ab_hi, cd_hi);
+  }
+}
+
+// Variant of prepare_vertical_filter_coeffs() that reads a single filter
+// row, av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS), and broadcasts each
+// of its four 32-bit groups across a register. The odd-column registers
+// coeffs[4..7] are copies of the even-column registers coeffs[0..3].
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+                                                         __m128i *coeffs) {
+  const __m128i filter_row = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i bcast_0 = _mm_load_si128((__m128i *)shuffle_gamma0_mask0);
+  const __m128i bcast_1 = _mm_load_si128((__m128i *)shuffle_gamma0_mask1);
+  const __m128i bcast_2 = _mm_load_si128((__m128i *)shuffle_gamma0_mask2);
+  const __m128i bcast_3 = _mm_load_si128((__m128i *)shuffle_gamma0_mask3);
+
+  // Even columns.
+  coeffs[0] = _mm_shuffle_epi8(filter_row, bcast_0);
+  coeffs[1] = _mm_shuffle_epi8(filter_row, bcast_1);
+  coeffs[2] = _mm_shuffle_epi8(filter_row, bcast_2);
+  coeffs[3] = _mm_shuffle_epi8(filter_row, bcast_3);
+
+  // Odd columns share the same coefficients.
+  for (int n = 0; n < 4; ++n) {
+    coeffs[n + 4] = coeffs[n];
+  }
+}
+
+// Vertical 8-tap filter for one output row of eight pixels.
+//
+// Reads the eight consecutive intermediate rows tmp[k + 4] .. tmp[k + 11]
+// and multiplies them with 'coeffs' (coeffs[0..3] for the low four lanes of
+// each row, coeffs[4..7] for the high four lanes). The unrounded 32-bit sums
+// are returned with pixels 0..3 in *res_lo and pixels 4..7 in *res_hi.
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+                                              __m128i *res_lo, __m128i *res_hi,
+                                              int k) {
+  const __m128i *const rows = tmp + (k + 4);
+
+  // Interleave consecutive row pairs. The low halves give the column order
+  // 0 0 2 2 4 4 6 6, the high halves 1 1 3 3 5 5 7 7.
+  const __m128i lo_01 = _mm_unpacklo_epi16(rows[0], rows[1]);
+  const __m128i lo_23 = _mm_unpacklo_epi16(rows[2], rows[3]);
+  const __m128i lo_45 = _mm_unpacklo_epi16(rows[4], rows[5]);
+  const __m128i lo_67 = _mm_unpacklo_epi16(rows[6], rows[7]);
+
+  const __m128i hi_01 = _mm_unpackhi_epi16(rows[0], rows[1]);
+  const __m128i hi_23 = _mm_unpackhi_epi16(rows[2], rows[3]);
+  const __m128i hi_45 = _mm_unpackhi_epi16(rows[4], rows[5]);
+  const __m128i hi_67 = _mm_unpackhi_epi16(rows[6], rows[7]);
+
+  // Even-index pixels.
+  const __m128i even_0123 = _mm_add_epi32(_mm_madd_epi16(lo_01, coeffs[0]),
+                                          _mm_madd_epi16(lo_23, coeffs[1]));
+  const __m128i even_4567 = _mm_add_epi32(_mm_madd_epi16(lo_45, coeffs[2]),
+                                          _mm_madd_epi16(lo_67, coeffs[3]));
+  const __m128i sum_even = _mm_add_epi32(even_0123, even_4567);
+
+  // Odd-index pixels.
+  const __m128i odd_0123 = _mm_add_epi32(_mm_madd_epi16(hi_01, coeffs[4]),
+                                         _mm_madd_epi16(hi_23, coeffs[5]));
+  const __m128i odd_4567 = _mm_add_epi32(_mm_madd_epi16(hi_45, coeffs[6]),
+                                         _mm_madd_epi16(hi_67, coeffs[7]));
+  const __m128i sum_odd = _mm_add_epi32(odd_0123, odd_4567);
+
+  // Interleave even and odd results back into pixel order 0 ... 7.
+  *res_lo = _mm_unpacklo_epi32(sum_even, sum_odd);
+  *res_hi = _mm_unpackhi_epi32(sum_even, sum_odd);
+}
+
+// Rounds the four 32-bit vertical-filter sums in 'res' and stores them as one
+// group of four compound-prediction pixels.
+//
+// The sums are offset by *res_add_const, shifted right by reduce_bits_vert
+// and packed to unsigned 16 bits. Without averaging they are written to the
+// 16-bit buffer at 'dst16'. With averaging they are combined with the four
+// values already stored at 'dst16' (weighted by *wt when
+// use_dist_wtd_comp_avg is set, otherwise a plain average), offset by
+// *res_sub_const, rounded by round_bits, saturated to 8 bits and written to
+// pred[pred_idx .. pred_idx + 3].
+static INLINE void warp_store_compound_4px(
+    __m128i res, const __m128i *res_add_const, const __m128i *wt,
+    const __m128i *res_sub_const, const __m128i *round_bits_const,
+    __m128i *dst16, uint8_t *pred, const int pred_idx,
+    const ConvolveParams *conv_params, const int reduce_bits_vert,
+    const int round_bits) {
+  res = _mm_srai_epi32(_mm_add_epi32(res, *res_add_const), reduce_bits_vert);
+  const __m128i cur_16 = _mm_packus_epi32(res, res);
+
+  if (!conv_params->do_average) {
+    _mm_storel_epi64(dst16, cur_16);
+    return;
+  }
+
+  const __m128i prev_16 = _mm_loadl_epi64(dst16);
+  __m128i avg_16;
+  if (conv_params->use_dist_wtd_comp_avg) {
+    const __m128i pairs = _mm_unpacklo_epi16(prev_16, cur_16);
+    const __m128i weighted = _mm_madd_epi16(pairs, *wt);
+    const __m128i shifted_32 = _mm_srai_epi32(weighted, DIST_PRECISION_BITS);
+    avg_16 = _mm_packus_epi32(shifted_32, shifted_32);
+  } else {
+    avg_16 = _mm_srai_epi16(_mm_add_epi16(prev_16, cur_16), 1);
+  }
+
+  avg_16 = _mm_add_epi16(avg_16, *res_sub_const);
+  avg_16 =
+      _mm_srai_epi16(_mm_add_epi16(avg_16, *round_bits_const), round_bits);
+
+  // Write exactly four bytes. memcpy is used instead of a store through an
+  // 'int *' because the destination is a byte buffer with no 4-byte
+  // alignment guarantee.
+  const int px4 = _mm_cvtsi128_si32(_mm_packus_epi16(avg_16, avg_16));
+  memcpy(&pred[pred_idx], &px4, sizeof(px4));
+}
+
+// Rounds and stores one output row produced by filter_src_pixels_vertical().
+//
+// '*res_lo' / '*res_hi' hold the 32-bit sums for pixels 0..3 / 4..7 of output
+// row (i + k + 4), starting at column j.
+//  - Compound prediction (conv_params->is_compound): each group of four
+//    pixels goes either to the 16-bit buffer conv_params->dst or, when
+//    do_average is set, is blended with the values already there and written
+//    to 'pred'. The second group is processed only when p_width > 4.
+//  - Otherwise: both groups are rounded, saturated to 8 bits and written to
+//    'pred'; only four bytes are written when p_width == 4.
+static INLINE void store_vertical_filter_output(
+    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+    const int reduce_bits_vert, int p_stride, int p_width,
+    const int round_bits) {
+  const int row = i + k + 4;
+
+  if (conv_params->is_compound) {
+    const int dst_idx = row * conv_params->dst_stride + j;
+    const int pred_idx = row * p_stride + j;
+
+    warp_store_compound_4px(*res_lo, res_add_const, wt, res_sub_const,
+                            round_bits_const,
+                            (__m128i *)&conv_params->dst[dst_idx], pred,
+                            pred_idx, conv_params, reduce_bits_vert,
+                            round_bits);
+    if (p_width > 4) {
+      warp_store_compound_4px(*res_hi, res_add_const, wt, res_sub_const,
+                              round_bits_const,
+                              (__m128i *)&conv_params->dst[dst_idx + 4], pred,
+                              pred_idx + 4, conv_params, reduce_bits_vert,
+                              round_bits);
+    }
+    return;
+  }
+
+  const __m128i res_lo_round = _mm_srai_epi32(
+      _mm_add_epi32(*res_lo, *res_add_const), reduce_bits_vert);
+  const __m128i res_hi_round = _mm_srai_epi32(
+      _mm_add_epi32(*res_hi, *res_add_const), reduce_bits_vert);
+
+  const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+  const __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+  uint8_t *const out = &pred[row * p_stride + j];
+
+  // Note: If we're outputting a 4x4 block, we need to be very careful
+  // to only output 4 pixels at this point, to avoid encode/decode
+  // mismatches when encoding with multiple threads.
+  if (p_width == 4) {
+    // memcpy rather than '*(int *)out = ...': 'out' is a byte pointer that
+    // need not be 4-byte aligned.
+    const int px4 = _mm_cvtsi128_si32(res_8bit);
+    memcpy(out, &px4, sizeof(px4));
+  } else {
+    _mm_storel_epi64((__m128i *)out, res_8bit);
+  }
+}
+
+// Vertical pass used by prepare_warp_vertical_filter() when both gamma and
+// delta are non-zero. For each output row k = -4 .. min(4, p_height - i - 4)
+// - 1 the per-column coefficients are rebuilt for position
+// sy4 + delta * (k + 4), the eight intermediate rows are filtered, and the
+// result is rounded and stored.
+static INLINE void warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  __m128i sub_const, rounding, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rounding, &weights);
+
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    const int sy = sy4 + delta * (k + 4);
+
+    __m128i taps[8];
+    prepare_vertical_filter_coeffs(gamma, sy, taps);
+
+    __m128i sum_lo, sum_hi;
+    filter_src_pixels_vertical(tmp, taps, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rounding, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass used by prepare_warp_vertical_filter() when gamma == 0 and
+// delta != 0: all columns of a row share one filter, broadcast from a single
+// table row and recomputed for each row position sy4 + delta * (k + 4).
+// 'gamma' is unused.
+static INLINE void warp_vertical_filter_gamma0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)gamma;
+
+  __m128i sub_const, rounding, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rounding, &weights);
+
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    const int sy = sy4 + delta * (k + 4);
+
+    __m128i taps[8];
+    prepare_vertical_filter_coeffs_gamma0(sy, taps);
+
+    __m128i sum_lo, sum_hi;
+    filter_src_pixels_vertical(tmp, taps, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rounding, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass used by prepare_warp_vertical_filter() when gamma != 0 and
+// delta == 0: the filter position does not change from row to row, so the
+// per-column coefficients are built once from sy4 and reused for every
+// output row. 'delta' is unused.
+static INLINE void warp_vertical_filter_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;
+
+  __m128i sub_const, rounding, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rounding, &weights);
+
+  __m128i taps[8];
+  prepare_vertical_filter_coeffs(gamma, sy4, taps);
+
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    __m128i sum_lo, sum_hi;
+    filter_src_pixels_vertical(tmp, taps, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rounding, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass used by prepare_warp_vertical_filter() when gamma == 0 and
+// delta == 0: one filter, taken from sy4, is broadcast once and applied to
+// every column of every output row. 'gamma' and 'delta' are unused.
+static INLINE void warp_vertical_filter_gamma0_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;
+  (void)gamma;
+
+  __m128i sub_const, rounding, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rounding, &weights);
+
+  __m128i taps[8];
+  prepare_vertical_filter_coeffs_gamma0(sy4, taps);
+
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    __m128i sum_lo, sum_hi;
+    filter_src_pixels_vertical(tmp, taps, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rounding, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Dispatches the vertical pass to the variant matching which of gamma and
+// delta are zero. All variants receive the same argument list.
+static INLINE void prepare_warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  if (gamma == 0) {
+    if (delta == 0) {
+      warp_vertical_filter_gamma0_delta0(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+    } else {
+      warp_vertical_filter_gamma0(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+    }
+  } else {
+    if (delta == 0) {
+      warp_vertical_filter_delta0(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+    } else {
+      warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+                           p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                           res_add_const, round_bits, offset_bits);
+    }
+  }
+}
+
+// Dispatches the horizontal pass to the variant matching which of alpha and
+// beta are zero. All variants receive the same argument list.
+static INLINE void prepare_warp_horizontal_filter(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0) {
+    if (beta == 0) {
+      warp_horizontal_filter_alpha0_beta0(
+          ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+          offset_bits_horiz, reduce_bits_horiz);
+    } else {
+      warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                    beta, p_height, height, i,
+                                    offset_bits_horiz, reduce_bits_horiz);
+    }
+  } else {
+    if (beta == 0) {
+      warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                   beta, p_height, height, i,
+                                   offset_bits_horiz, reduce_bits_horiz);
+    } else {
+      warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                             p_height, height, i, offset_bits_horiz,
+                             reduce_bits_horiz);
+    }
+  }
+}
+
+// SSE4.1 affine warp for 8-bit (bd == 8) prediction.
+//
+// The p_width x p_height prediction block is processed in 8x8 tiles. For each
+// tile the centre sample (offset +4, +4) is projected through the model in
+// `mat`, the projected position is split into an integer part (ix4, iy4) and
+// a fractional part (sx4, sy4), a horizontal filter pass fills up to 15
+// intermediate rows in `tmp`, and a vertical filter pass writes the result
+// via prepare_warp_vertical_filter().
+//
+// mat             - warp model; entries 0..5 are read here.
+// ref/width/height/stride - reference plane and its geometry.
+// pred/p_stride   - destination passed on to the vertical filter stage.
+// p_col/p_row     - position of the prediction block used when projecting.
+// subsampling_x/y - shifts applied to source and projected coordinates.
+// conv_params     - rounding (round_0/round_1) and compound settings.
+// alpha/beta      - per-column / per-row increments of the horizontal
+//                   filter phase; gamma/delta are forwarded to the vertical
+//                   filter stage.
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ // Horizontal filter output for rows k = -7..7 of the tile; row k is stored
+ // at tmp[k + 7].
+ __m128i tmp[15];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ // Compound prediction rounds by round_1; otherwise the vertical pass removes
+ // whatever remains of the 2 * FILTER_BITS of filter gain.
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ // Rounding term (half of 1 << reduce_bits_vert) for the vertical shift.
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+ // Constant handed to the vertical filter stage: rounding plus offset for
+ // the compound path, rounding minus an offset-dependent term otherwise.
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // Project the centre of this 8x8 tile through the warp model using
+ // 64-bit arithmetic.
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ // Integer sample position (ix4, iy4) and fractional phase (sx4, sy4) in
+ // WARPEDMODEL_PREC_BITS precision.
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Clear the low WARP_PARAM_REDUCE_BITS bits of both phases.
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ // Entirely left of the frame: each row is a constant derived from the
+ // first pixel of the (vertically clamped) source row.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ // Entirely right of the frame: same as above using the last pixel of
+ // the source row.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ // The 16-byte load window straddles a frame edge: load it, then
+ // rearrange bytes with the warp_pad_left / warp_pad_right shuffle
+ // tables before filtering.
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ // Horizontal phase for this row: beta is applied per row offset.
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+ } else {
+ // Load window lies fully inside the frame: use the alpha/beta
+ // dispatcher.
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 0000000000..3de630f203
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+
+// Exploiting the range of wiener filter coefficients,
+// horizontal filtering can be done in 16 bit intermediate precision.
+// The details are as follows :
+// Consider the horizontal wiener filter coefficients of the following form :
+// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
+// Subtracting 2^(FILTER_BITS) from the centre tap we get the following :
+// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
+// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
+// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
+// precision. Finally, after rounding the above result by round_0, we multiply
+// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
+// horizontal filter output.
+
+// AVX2 separable Wiener convolution with the source added back ("add_src"),
+// for 8-bit pixels (bd == 8).
+//
+// The block is processed in 8-column strips. For each strip a horizontal pass
+// writes h + SUBPEL_TAPS - 2 rows of 16-bit intermediates to im_block, then a
+// vertical pass consumes them two output rows at a time and stores 8-bit
+// results to dst.
+//
+// src/src_stride - source pixels; rows/columns before `src` are read
+//                  (center_tap rows above and columns to the left).
+// dst/dst_stride - destination pixels.
+// filter_x/y     - 8 int16 coefficients each (loaded as one 128-bit vector).
+// x_step_q4/y_step_q4 - must both be 16 (asserted); otherwise unused.
+// w, h           - block size; w must be a multiple of 8 (asserted).
+// conv_params    - supplies round_0 (> 0, asserted) and round_1.
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const WienerConvolveParams *conv_params) {
+  const int bd = 8;
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
+  int im_h = h + SUBPEL_TAPS - 2;
+  int im_stride = 8;
+  // The vertical pass reads one row past the last row the horizontal pass is
+  // guaranteed to write (row im_h). Zero exactly that row. memset() takes a
+  // byte count, so the size is one row of int16_t samples; zeroing a larger
+  // fixed amount could run past the end of im_block for tall blocks.
+  memset(im_block + (im_h * im_stride), 0, im_stride * sizeof(im_block[0]));
+  int i, j;
+  const int center_tap = (SUBPEL_TAPS - 1) / 2;
+  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
+
+  assert(conv_params->round_0 > 0);
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
+
+  const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
+  const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
+
+  // The byte shuffles below pick the low byte of each of two adjacent 16-bit
+  // coefficients, producing 8-bit coefficient pairs for convolve_lowbd_x().
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs_h[0] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs_h[1] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs_h[2] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs_h[3] =
+      _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
+
+  const __m256i round_const_h =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
+  const __m256i round_const_horz =
+      _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1)));
+  const __m256i clamp_low = _mm256_setzero_si256();
+  const __m256i clamp_high =
+      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
+
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+  const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
+
+  const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
+
+  // Vertical rounding term; also subtracts the offset carried through the
+  // intermediate values by round_const_horz.
+  const __m256i round_const_v =
+      _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                        (1 << (bd + conv_params->round_1 - 1)));
+  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter: two source rows per iteration, one per 128-bit lane */
+    for (i = 0; i < im_h; i += 2) {
+      __m256i data = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+      // Load the next line
+      if (i + 1 < im_h)
+        data = _mm256_inserti128_si256(
+            data,
+            _mm_loadu_si128(
+                (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+            1);
+
+      __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+      res =
+          _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+      __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
+
+      // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
+      // the result
+      data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
+      res = _mm256_add_epi16(res, data_0);
+      res = _mm256_add_epi16(res, round_const_horz);
+      const __m256i res_clamped =
+          _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped);
+    }
+
+    /* Vertical filter */
+    {
+      // Each 256-bit load spans two consecutive intermediate rows, so the low
+      // lane works on output row i and the high lane on output row i + 1.
+      __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      __m256i s[8];
+      s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+      s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+      s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+      s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+      s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+      s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+      for (i = 0; i < h - 1; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        __m256i res_a = convolve(s, coeffs_v);
+        __m256i res_b = convolve(s + 4, coeffs_v);
+
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+        const __m256i res_b_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+        /* rounding code */
+        // 16 bit conversion
+        const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+        // 8 bit conversion and saturation to uint8
+        const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+        // Store values into the destination buffer
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+
+        _mm_storel_epi64(p_0, res_0);
+        _mm_storel_epi64(p_1, res_1);
+
+        // Slide the window of interleaved row pairs down by two rows.
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+      // Odd h: one output row remains. Combine the low lanes of the lo/hi
+      // halves so a single convolve() call yields all 8 columns of that row.
+      if (h - i) {
+        s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
+        s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
+        s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
+
+        const int16_t *data = &im_block[i * im_stride];
+        const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
+        const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+
+        __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
+        __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
+
+        s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
+        __m256i convolveres = convolve(s, coeffs_v);
+
+        const __m256i res_round = _mm256_sra_epi32(
+            _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
+
+        /* rounding code */
+        // 16 bit conversion
+        __m128i reslo = _mm256_castsi256_si128(res_round);
+        __m128i reshi = _mm256_extracti128_si256(res_round, 1);
+        const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
+
+        // 8 bit conversion and saturation to uint8
+        const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+        _mm_storel_epi64(p_0, res_8b);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
new file mode 100644
index 0000000000..1c039e80c6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+// SSE2 separable Wiener convolution with the source added back ("add_src"),
+// for 8-bit pixels (bd == 8).
+//
+// A horizontal pass writes h + SUBPEL_TAPS - 2 rows of 16-bit intermediates
+// to `temp` (row stride MAX_SB_SIZE), then a vertical pass filters them and
+// stores 8-bit results to dst.
+//
+// src/src_stride - source pixels; rows/columns before `src` are read
+//                  (center_tap rows above and columns to the left).
+// dst/dst_stride - destination pixels.
+// filter_x/y     - 8 int16 coefficients each (loaded as one 128-bit vector).
+// x_step_q4/y_step_q4 - must both be 16 (asserted); otherwise unused.
+// w, h           - block size; w must be a multiple of 8 (asserted).
+// conv_params    - supplies round_0 and round_1.
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const WienerConvolveParams *conv_params) {
+  const int bd = 8;
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  // The vertical pass reads row `intermediate_height`, which the horizontal
+  // pass never writes. Zero the whole row: memset() takes a byte count, so
+  // the length must be scaled by the element size to cover all MAX_SB_SIZE
+  // columns rather than only the first half of them.
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0,
+         MAX_SB_SIZE * sizeof(temp[0]));
+  int i, j;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero = _mm_setzero_si128();
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+  /* Horizontal filter */
+  {
+    const __m128i coeffs_x =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+    // Upper clamp for the intermediate values; loop-invariant, so built once.
+    const __m128i clamp_high =
+        _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+        // Filter even-index pixels
+        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+                                  conv_params->round_0);
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+                                 conv_params->round_0);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        res = _mm_min_epi16(_mm_max_epi16(res, zero), clamp_high);
+        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const __m128i coeffs_y =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    // Vertical rounding term; also subtracts the offset that the horizontal
+    // pass added through its round_const.
+    const __m128i round_const =
+        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                       (1 << (bd + conv_params->round_1 - 1)));
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round = _mm_srai_epi32(
+            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m128i res_hi_round = _mm_srai_epi32(
+            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Saturating packs: 32 -> 16 bit signed, then 16 -> 8 bit unsigned.
+        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+        _mm_storel_epi64(p, res_8bit);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c
new file mode 100644
index 0000000000..1ded380ec3
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "av1/decoder/accounting.h"
+
+/* Maps a NUL-terminated string to a slot in [0, AOM_ACCOUNTING_HASH_SIZE) by
+   summing its bytes (as unsigned values) and reducing modulo the table size.
+   Deliberately simplistic; collisions are resolved by the caller. */
+static int accounting_hash(const char *str) {
+  const unsigned char *cursor = (const unsigned char *)str;
+  uint32_t sum = 0;
+  for (; *cursor; ++cursor) {
+    sum += *cursor;
+  }
+  return sum % AOM_ACCOUNTING_HASH_SIZE;
+}
+
+/* Dictionary lookup based on an open-addressing hash table. */
+/* Returns the dictionary id of `str`, inserting a private copy of the string
+   if it is not present yet. Lookup uses the open-addressing table
+   accounting->hash_dictionary, whose slots hold dictionary indices (-1 marks
+   an empty slot); collisions are resolved by linear probing with wrap-around.
+   Aborts if memory cannot be allocated or the dictionary is full. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
+  int hash;
+  size_t len;
+  AccountingDictionary *dictionary;
+  dictionary = &accounting->syms.dictionary;
+  hash = accounting_hash(str);
+  while (accounting->hash_dictionary[hash] != -1) {
+    if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
+      return accounting->hash_dictionary[hash];
+    }
+    hash++;
+    if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0;
+  }
+  /* No match found. */
+  assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES);
+  /* assert() is compiled out with NDEBUG, so also enforce the capacity of
+     dictionary->strs[] at run time rather than writing past its end. */
+  if (dictionary->num_strs >= MAX_SYMBOL_TYPES) abort();
+  accounting->hash_dictionary[hash] = dictionary->num_strs;
+  len = strlen(str);
+  dictionary->strs[dictionary->num_strs] = malloc(len + 1);
+  if (!dictionary->strs[dictionary->num_strs]) abort();
+  snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str);
+  dictionary->num_strs++;
+  return dictionary->num_strs - 1;
+}
+
+/* Prepares `accounting` for use: allocates room for 1000 symbols (aborting on
+   allocation failure), empties the string dictionary, marks every hash slot
+   as unused (-1) and resets the per-frame counters. */
+void aom_accounting_init(Accounting *accounting) {
+  int slot = 0;
+  accounting->num_syms_allocated = 1000;
+  accounting->syms.syms =
+      malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+  if (accounting->syms.syms == NULL) abort();
+  accounting->syms.dictionary.num_strs = 0;
+  assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
+  while (slot < AOM_ACCOUNTING_HASH_SIZE) {
+    accounting->hash_dictionary[slot] = -1;
+    ++slot;
+  }
+  aom_accounting_reset(accounting);
+}
+
+/* Clears the recorded-symbol counters, sets the current context to (-1, -1)
+   and zeroes last_tell_frac. The symbol buffer and the dictionary are left
+   untouched. */
+void aom_accounting_reset(Accounting *accounting) {
+  AccountingSymbols *const recorded = &accounting->syms;
+  AccountingSymbolContext *const ctx = &accounting->context;
+
+  recorded->num_syms = 0;
+  recorded->num_binary_syms = 0;
+  recorded->num_multi_syms = 0;
+
+  ctx->x = -1;
+  ctx->y = -1;
+
+  accounting->last_tell_frac = 0;
+}
+
+/* Releases the symbol buffer and every string owned by the dictionary. The
+   pointers and counters inside `accounting` are not reset. */
+void aom_accounting_clear(Accounting *accounting) {
+  AccountingDictionary *const dict = &accounting->syms.dictionary;
+  int idx = 0;
+
+  free(accounting->syms.syms);
+  while (idx < dict->num_strs) {
+    free(dict->strs[idx]);
+    ++idx;
+  }
+}
+
+/* Sets the (x, y) context that subsequently recorded symbols are tagged
+   with. */
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) {
+  AccountingSymbolContext *const ctx = &accounting->context;
+  ctx->x = x;
+  ctx->y = y;
+}
+
+/* Records `bits` (in 1/8-bit units, see AccountingSymbol) spent on the syntax
+   element named `str` under the current context. If the most recently
+   recorded symbol has the same context and the same id, the bits are merged
+   into it and its sample count is incremented; otherwise a new symbol is
+   appended, doubling the buffer when it is full. Aborts if the buffer cannot
+   be grown. */
+void aom_accounting_record(Accounting *accounting, const char *str,
+                           uint32_t bits) {
+  AccountingSymbol sym;
+  /* Every path needs the id, so resolve it once up front instead of looking
+     the string up a second time when the merge test fails. */
+  const uint32_t id = aom_accounting_dictionary_lookup(accounting, str);
+  // Reuse previous symbol if it has the same context and symbol id.
+  if (accounting->syms.num_syms) {
+    AccountingSymbol *last_sym;
+    last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1];
+    if (memcmp(&last_sym->context, &accounting->context,
+               sizeof(AccountingSymbolContext)) == 0 &&
+        id == last_sym->id) {
+      last_sym->bits += bits;
+      last_sym->samples++;
+      return;
+    }
+  }
+  sym.context = accounting->context;
+  sym.samples = 1;
+  sym.bits = bits;
+  sym.id = id;
+  assert(sym.id <= 255);
+  if (accounting->syms.num_syms == accounting->num_syms_allocated) {
+    accounting->num_syms_allocated *= 2;
+    accounting->syms.syms =
+        realloc(accounting->syms.syms,
+                sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+    if (!accounting->syms.syms) abort();
+  }
+  accounting->syms.syms[accounting->syms.num_syms++] = sym;
+}
+
+/* Prints a summary of the recorded symbols to stdout: the totals first, then
+   one line per symbol with its name, context, size in bits (stored value
+   divided by 8) and sample count. */
+void aom_accounting_dump(Accounting *accounting) {
+  int i;
+  AccountingSymbol *sym;
+  printf("\n----- Number of recorded syntax elements = %d -----\n",
+         accounting->syms.num_syms);
+  printf("----- Total number of symbol calls = %d (%d binary) -----\n",
+         accounting->syms.num_multi_syms + accounting->syms.num_binary_syms,
+         accounting->syms.num_binary_syms);
+  for (i = 0; i < accounting->syms.num_syms; i++) {
+    sym = &accounting->syms.syms[i];
+    /* samples is a uint32_t: print it with an unsigned conversion so large
+       counts are not shown as negative numbers. */
+    printf("%s x: %d, y: %d bits: %f samples: %u\n",
+           accounting->syms.dictionary.strs[sym->id], sym->context.x,
+           sym->context.y, (float)sym->bits / 8.0,
+           (unsigned int)sym->samples);
+  }
+}
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
new file mode 100644
index 0000000000..ad2e8b6cfe
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
+#include <stdlib.h>
+#include "aom/aomdx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define AOM_ACCOUNTING_HASH_SIZE (1021)
+
+/* Max number of entries for symbol types in the dictionary (increase as
+ necessary). */
+#define MAX_SYMBOL_TYPES (256)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define AOM_ACCT_BITRES (3)
+
+/** Context attached to each recorded symbol. Both fields are set by
+    aom_accounting_set_context() and are -1 after aom_accounting_reset().
+    NOTE(review): presumably a block position; units are not stated here. */
+typedef struct {
+ int16_t x;
+ int16_t y;
+} AccountingSymbolContext;
+
+/** One recorded syntax element (possibly several merged samples). */
+typedef struct {
+ /** Context that was current when the symbol was recorded. */
+ AccountingSymbolContext context;
+ /** Index of the symbol name in AccountingDictionary::strs. */
+ uint32_t id;
+ /** Number of bits in units of 1/8 bit. */
+ uint32_t bits;
+ /** Number of record calls merged into this entry (starts at 1). */
+ uint32_t samples;
+} AccountingSymbol;
+
+/** Dictionary for translating strings into ids. The id of a string is its
+    index in strs. */
+typedef struct {
+ /** Heap-allocated copies of the symbol names; entries [0, num_strs) are
+     valid. */
+ char *strs[MAX_SYMBOL_TYPES];
+ /** Number of strings currently stored. */
+ int num_strs;
+} AccountingDictionary;
+
+/** The recorded symbols together with their counters and name dictionary. */
+typedef struct {
+ /** All recorded symbols decoded. */
+ AccountingSymbol *syms;
+ /** Number of syntax elements actually recorded (valid entries in syms). */
+ int num_syms;
+ /** Raw symbol decoding calls for non-binary values. */
+ int num_multi_syms;
+ /** Raw binary symbol decoding calls. */
+ int num_binary_syms;
+ /** Dictionary for translating strings into id. */
+ AccountingDictionary dictionary;
+} AccountingSymbols;
+
+/** Top-level accounting state.
+    NOTE(review): the `Accounting` typedef name is presumably declared in
+    aom/aomdx.h — confirm. */
+struct Accounting {
+ AccountingSymbols syms;
+ /** Size allocated for symbols (not all may be used). */
+ int num_syms_allocated;
+ /** Open-addressing hash table: each slot holds an index into
+     syms.dictionary.strs, or -1 when the slot is empty. */
+ int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
+ /** Context applied to symbols recorded from now on. */
+ AccountingSymbolContext context;
+ /** Zeroed by aom_accounting_reset(); not otherwise used in accounting.c.
+     NOTE(review): presumably the previous bit-position reading kept by the
+     caller — confirm. */
+ uint32_t last_tell_frac;
+};
+
+/** Allocates the symbol buffer, empties the dictionary and hash table, and
+    resets the counters. Aborts on allocation failure. */
+void aom_accounting_init(Accounting *accounting);
+/** Clears the recorded-symbol counters and the current context; keeps the
+    symbol buffer and dictionary. */
+void aom_accounting_reset(Accounting *accounting);
+/** Frees the symbol buffer and all dictionary strings. */
+void aom_accounting_clear(Accounting *accounting);
+/** Sets the context (x, y) attached to subsequently recorded symbols. */
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y);
+/** Returns the dictionary id of str, adding it to the dictionary if it is
+    not present yet. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str);
+/** Records `bits` (1/8-bit units) for the symbol named str, merging with the
+    previous entry when its context and id match. */
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits);
+/** Prints all recorded symbols and totals to stdout. */
+void aom_accounting_dump(Accounting *accounting);
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
new file mode 100644
index 0000000000..bb09347e1c
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -0,0 +1,5369 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/obmc.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodetxb.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+#define AOM_MIN_THREADS_PER_TILE 1
+#define AOM_MAX_THREADS_PER_TILE 2
+
+// This is needed by ext_tile related unit tests.
+#define EXT_TILE_DEBUG 1
+#define MC_TEMP_BUF_PELS \
+ (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
+ ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
+
+// Validates the trailing-bits pattern: a single 1 bit followed by 0 bits up to
+// the next byte boundary. When the reader is already byte aligned a whole
+// extra byte is consumed. Returns 0 on success; on mismatch records
+// AOM_CODEC_CORRUPT_FRAME in pbi->error and returns -1.
+int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+  // bit_offset % 8 == 0 means byte aligned, which yields 8 bits to read.
+  const int pad_bits = 8 - rb->bit_offset % 8;
+  const int expected_pattern = 1 << (pad_bits - 1);
+  if (aom_rb_read_literal(rb, pad_bits) == expected_pattern) return 0;
+
+  pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+  return -1;
+}
+
+// Fills the cropped area of planes [only_chroma, MAX_MB_PLANE) with mid-grey,
+// so only_chroma = 1 leaves the luma plane untouched.
+static AOM_INLINE void set_planes_to_neutral_grey(
+    const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf,
+    int only_chroma) {
+  if (!seq_params->use_highbitdepth) {
+    // 8-bit samples: write 1 << 7 across every cropped row.
+    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+      const int uv = plane > 0;
+      for (int r = 0; r < buf->crop_heights[uv]; r++) {
+        memset(&buf->buffers[plane][r * buf->strides[uv]], 1 << 7,
+               buf->crop_widths[uv]);
+      }
+    }
+    return;
+  }
+
+  // 16-bit samples: grey is half of the bit-depth range.
+  const int grey = 1 << (seq_params->bit_depth - 1);
+  for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+    const int uv = plane > 0;
+    uint16_t *const first_row = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+    if (buf->crop_heights[uv] <= 0) continue;
+
+    // Paint the first row once, then replicate it into each following row.
+    aom_memset16(first_row, grey, buf->crop_widths[uv]);
+    for (int r = 1; r < buf->crop_heights[uv]; r++) {
+      memcpy(&first_row[r * buf->strides[uv]], first_row,
+             sizeof(*first_row) * buf->crop_widths[uv]);
+    }
+  }
+}
+
+static AOM_INLINE void loop_restoration_read_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
+ int runit_idx);
+
+// Returns nonzero when len is non-zero and no larger than the number of bytes
+// between start and end.
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+  if (len == 0) return 0;
+  const size_t available = (size_t)(end - start);
+  return len <= available;
+}
+
+// Returns ONLY_4X4 without reading anything when coded_lossless is set;
+// otherwise reads one bit choosing TX_MODE_SELECT (1) or TX_MODE_LARGEST (0).
+static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb,
+                            int coded_lossless) {
+  if (coded_lossless) return ONLY_4X4;
+  if (aom_rb_read_bit(rb)) return TX_MODE_SELECT;
+  return TX_MODE_LARGEST;
+}
+
+// Intra-only frames always use SINGLE_REFERENCE and consume no bits. For other
+// frames one bit selects REFERENCE_MODE_SELECT (1) or SINGLE_REFERENCE (0).
+static REFERENCE_MODE read_frame_reference_mode(
+    const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  if (frame_is_intra_only(cm)) return SINGLE_REFERENCE;
+  if (aom_rb_read_bit(rb)) return REFERENCE_MODE_SELECT;
+  return SINGLE_REFERENCE;
+}
+
+// Runs the inverse transform for the current transform block of `plane` into
+// dst, then zeroes the dequantized coefficients up to and including
+// max_scan_line.
+static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb,
+                                               int plane, const TX_TYPE tx_type,
+                                               const TX_SIZE tx_size,
+                                               uint8_t *dst, int stride,
+                                               int reduced_tx_set) {
+  tran_low_t *const coeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
+  const eob_info *const txb_eob = dcb->eob_data[plane] + dcb->txb_offset[plane];
+  // Both values are read before the transform call.
+  const uint16_t last_scan_line = txb_eob->max_scan_line;
+  const uint16_t end_of_block = txb_eob->eob;
+
+  av1_inverse_transform_block(&dcb->xd, coeffs, plane, tx_type, tx_size, dst,
+                              stride, end_of_block, reduced_tx_set);
+
+  memset(coeffs, 0, (last_scan_line + 1) * sizeof(coeffs[0]));
+}
+
+// Reads the coefficients of one intra transform block through
+// av1_read_coeffs_txb_facade(). Nothing is read when the block has skip_txfm
+// set. With TXCOEFF_TIMER enabled the call is timed and counted on cm.
+static AOM_INLINE void read_coeffs_tx_intra_block(
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
+    const int plane, const int row, const int col, const TX_SIZE tx_size) {
+  const MB_MODE_INFO *const mode_info = dcb->xd.mi[0];
+  if (mode_info->skip_txfm) return;
+
+#if TXCOEFF_TIMER
+  struct aom_usec_timer txb_timer;
+  aom_usec_timer_start(&txb_timer);
+#endif
+  av1_read_coeffs_txb_facade(cm, dcb, r, plane, row, col, tx_size);
+#if TXCOEFF_TIMER
+  aom_usec_timer_mark(&txb_timer);
+  const int64_t elapsed_usec = aom_usec_timer_elapsed(&txb_timer);
+  cm->txcoeff_timer += elapsed_usec;
+  ++cm->txb_count;
+#endif
+}
+
+// Does nothing; every argument is intentionally unused.
+static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm,
+                                         DecoderCodingBlock *dcb,
+                                         aom_reader *const r, const int plane,
+                                         const int row, const int col,
+                                         const TX_SIZE tx_size) {
+  (void)cm, (void)dcb, (void)r;
+  (void)plane, (void)row, (void)col;
+  (void)tx_size;
+}
+
+// Does nothing; every argument is intentionally unused.
+static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm,
+                                                DecoderCodingBlock *dcb,
+                                                BLOCK_SIZE bsize) {
+  (void)cm, (void)dcb, (void)bsize;
+}
+
+// Does nothing; every argument is intentionally unused.
+static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm,
+                                                  MACROBLOCKD *const xd) {
+  (void)cm, (void)xd;
+}
+
+// Predicts one intra transform block and, unless the block skips its
+// transform or has eob == 0, adds the inverse-transformed residual on top of
+// the prediction. For luma blocks that need it, the reconstruction is then
+// stored for CfL. The reader argument is unused.
+static AOM_INLINE void predict_and_reconstruct_intra_block(
+    const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
+    const int plane, const int row, const int col, const TX_SIZE tx_size) {
+  (void)r;
+  MACROBLOCKD *const xd = &dcb->xd;
+  MB_MODE_INFO *const mode_info = xd->mi[0];
+
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+  if (!mode_info->skip_txfm &&
+      dcb->eob_data[plane][dcb->txb_offset[plane]].eob) {
+    const bool reduced_set = cm->features.reduced_tx_set_used;
+    // tx_type was read out in av1_read_coeffs_txb.
+    const TX_TYPE tx_type = av1_get_tx_type(xd, get_plane_type(plane), row,
+                                            col, tx_size, reduced_set);
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int dst_stride = pd->dst.stride;
+    uint8_t *const dst = &pd->dst.buf[(row * dst_stride + col) << MI_SIZE_LOG2];
+    inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
+                            reduced_set);
+  }
+
+  const int is_luma = plane == AOM_PLANE_Y;
+  if (is_luma && store_cfl_required(cm, xd)) {
+    cfl_store_tx(xd, row, col, tx_size, mode_info->bsize);
+  }
+}
+
+// Inverse-transforms one inter transform block at (blk_row, blk_col) of
+// `plane` into the plane's destination buffer. The reader argument is unused.
+// With CONFIG_MISMATCH_DEBUG the reconstructed block is additionally passed to
+// mismatch_check_block_tx().
+static AOM_INLINE void inverse_transform_inter_block(
+ const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r,
+ const int plane, const int blk_row, const int blk_col,
+ const TX_SIZE tx_size) {
+ (void)r;
+ MACROBLOCKD *const xd = &dcb->xd;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const bool reduced_tx_set_used = cm->features.reduced_tx_set_used;
+ // tx_type was read out in av1_read_coeffs_txb.
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+ tx_size, reduced_tx_set_used);
+
+ // blk_row/blk_col are scaled to a buffer offset by << MI_SIZE_LOG2.
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+ inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride,
+ reduced_tx_set_used);
+#if CONFIG_MISMATCH_DEBUG
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ // Recover the block's mi position from the edge distances stored in xd.
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+ plane, pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
+}
+
+// Advances the per-plane coefficient offset by the area of a tx_size block and
+// recomputes the transform-block offset as that coefficient offset divided by
+// the minimum transform block area.
+static AOM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb,
+                                             TX_SIZE tx_size, int plane) {
+  const int txb_area = tx_size_wide[tx_size] * tx_size_high[tx_size];
+  dcb->cb_offset[plane] += txb_area;
+
+  const int min_txb_area = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
+  dcb->txb_offset[plane] = dcb->cb_offset[plane] / min_txb_area;
+}
+
+// Recursively walks the transform partition of an inter block for one plane.
+// At a leaf (tx_size equals the plane's transform size, or the plane is
+// chroma) it invokes the thread's read-coefficients and inverse-transform
+// visitors, adds the block's eob to *eob_total and advances the coefficient
+// buffer offsets. Otherwise it splits into sub_tx_size_map[tx_size] blocks and
+// recurses. Blocks starting outside the visible area are skipped.
+// `block` is only incremented and forwarded to the recursive calls here.
+static AOM_INLINE void decode_reconstruct_tx(
+ AV1_COMMON *cm, ThreadData *const td, aom_reader *r,
+ MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int *eob_total) {
+ DecoderCodingBlock *const dcb = &td->dcb;
+ MACROBLOCKD *const xd = &dcb->xd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Chroma uses the largest uv transform size; luma looks up the coded size.
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ // Scale to match transform block unit.
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size || plane) {
+ td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
+ tx_size);
+
+ td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col,
+ tx_size);
+ // Read the eob before the offsets move on to the next transform block.
+ eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
+ *eob_total += eob_data->eob;
+ set_cb_buffer_offsets(dcb, tx_size, plane);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int sub_step = bsw * bsh;
+ // Clip the iteration range to the visible part of the block.
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
+ offsetc, block, sub_txs, eob_total);
+ block += sub_step;
+ }
+ }
+ }
+}
+
+// Points xd at the block located at (mi_row, mi_col): sets the mode-info
+// pointers, records bsize, replicates the mode-info pointer over the
+// x_mis by y_mis visible mi units, and sets up plane sizes, entropy contexts,
+// edge distances and destination plane buffers.
+// bw/bh are the block size in mi units; x_mis/y_mis must both be non-zero.
+static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int bw, int bh, int x_mis, int y_mis) {
+ const int num_planes = av1_num_planes(cm);
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const TileInfo *const tile = &xd->tile;
+
+ set_mi_offsets(mi_params, xd, mi_row, mi_col);
+ xd->mi[0]->bsize = bsize;
+#if CONFIG_RD_DEBUG
+ xd->mi[0]->mi_row = mi_row;
+ xd->mi[0]->mi_col = mi_col;
+#endif
+
+ assert(x_mis && y_mis);
+ // First row: every covered entry points at the same mode info as mi[0].
+ for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
+ // Remaining rows: copy the first row of pointers, one mi_stride apart.
+ int idx = mi_params->mi_stride;
+ for (int y = 1; y < y_mis; ++y) {
+ memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
+ idx += mi_params->mi_stride;
+ }
+
+ set_plane_n4(xd, bw, bh, num_planes);
+ set_entropy_context(xd, mi_row, mi_col, num_planes);
+
+ // Distance of Mb to the various image edges. These are specified to 8th pel
+ // as they are always compared to values that are in 1/8th pel units
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+ mi_params->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+}
+
+// Sets up xd for the block at (mi_row, mi_col), records its partition type and
+// reads its mode info. For blocks of at least BLOCK_8X8 in a subsampled
+// format, reports AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when
+// the block size has no valid chroma counterpart.
+static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi,
+ DecoderCodingBlock *dcb, int mi_row,
+ int mi_col, aom_reader *r,
+ PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ // Clip the block extent to the frame's mi grid.
+ const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row);
+ MACROBLOCKD *const xd = &dcb->xd;
+
+#if CONFIG_ACCOUNTING
+ aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
+ set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ xd->mi[0]->partition = partition;
+ av1_read_mode_info(pbi, dcb, r, x_mis, y_mis);
+ if (bsize >= BLOCK_8X8 &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
+ const BLOCK_SIZE uv_subsize =
+ av1_ss_size_lookup[bsize][seq_params->subsampling_x]
+ [seq_params->subsampling_y];
+ if (uv_subsize == BLOCK_INVALID)
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid block size.");
+ }
+}
+
+// Rectangle of a reference block in integer sample coordinates. (x0, y0) is
+// the top-left corner and (x1, y1) the bottom-right coordinate; the code in
+// this file uses x1 - x0 and y1 - y0 as the block's width and height.
+typedef struct PadBlock {
+ int x0;
+ int x1;
+ int y0;
+ int y1;
+} PadBlock;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// 16-bit sample variant of build_mc_border(). Copies a b_w x b_h block whose
+// top-left is at (x, y) of a w x h reference frame into dst8, replicating the
+// nearest frame edge sample for every position outside the frame.
+// src8 points at position (x, y); src8/dst8 are converted with
+// CONVERT_TO_SHORTPTR. Strides are in samples. b_h must be at least 1 because
+// the row loop is a do/while.
+static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8,
+ int src_stride, uint8_t *dst8,
+ int dst_stride, int x, int y,
+ int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ // Rewind from (x, y) to the frame origin; the row is then clamped to
+ // [0, h - 1] below.
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ // Number of columns that fall left of the frame.
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w) left = b_w;
+
+ // Number of columns that fall right of the frame.
+ if (x + b_w > w) right = x + b_w - w;
+
+ if (right > b_w) right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left) aom_memset16(dst, ref_row[0], left);
+
+ if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ // Only advance inside the frame so the first/last row is repeated above
+ // and below it.
+ if (y > 0 && y < h) ref_row += src_stride;
+ } while (--b_h);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Copies a b_w x b_h block whose top-left is at (x, y) of a w x h reference
+// frame into dst, replicating the nearest frame edge sample for every position
+// outside the frame. src points at position (x, y). b_h must be at least 1
+// because the row loop is a do/while.
+static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int x,
+ int y, int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ // Rewind from (x, y) to the frame origin; the row is then clamped to
+ // [0, h - 1] below.
+ const uint8_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ // Number of columns that fall left of the frame.
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w) left = b_w;
+
+ // Number of columns that fall right of the frame.
+ if (x + b_w > w) right = x + b_w - w;
+
+ if (right > b_w) right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left) memset(dst, ref_row[0], left);
+
+ if (copy) memcpy(dst + left, ref_row + x + left, copy);
+
+ if (right) memset(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ // Only advance inside the frame so the first/last row is repeated above
+ // and below it.
+ if (y > 0 && y < h) ref_row += src_stride;
+ } while (--b_h);
+}
+
+// Decides whether the reference block needs border extension.
+// When the prediction is neither intrabc nor warped, and is scaled, has a
+// non-zero MV or uses a frame whose size is not a multiple of 8, *block is
+// widened by the interpolation margin in each direction that needs filtering,
+// and *x_pad / *y_pad are set to 1 for those directions.
+// Returns 1 when the (possibly widened) block reaches outside the frame,
+// 0 otherwise. *x_pad and *y_pad are never reset to 0 here.
+static INLINE int update_extend_mc_border_params(
+ const struct scale_factors *const sf, struct buf_2d *const pre_buf,
+ MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv,
+ int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
+ const int is_scaled = av1_is_scaled(sf);
+ // Get reference width and height.
+ int frame_width = pre_buf->width;
+ int frame_height = pre_buf->height;
+
+ // Do border extension if there is motion or
+ // width/height is not a multiple of 8 pixels.
+ if ((!is_intrabc) && (!do_warp) &&
+ (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) ||
+ (frame_height & 0x7))) {
+ if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ block->x0 -= AOM_INTERP_EXTEND - 1;
+ block->x1 += AOM_INTERP_EXTEND;
+ *x_pad = 1;
+ }
+
+ if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ block->y0 -= AOM_INTERP_EXTEND - 1;
+ block->y1 += AOM_INTERP_EXTEND;
+ *y_pad = 1;
+ }
+
+ // Border extension is only needed when the block is not entirely inside
+ // the frame.
+ if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 ||
+ block->y1 > frame_height - 1) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// When border extension is required, builds the edge-replicated reference
+// block in mc_buf and redirects *pre / *src_stride to it; otherwise leaves
+// both untouched. `block` is passed by value, so the widening applied by
+// update_extend_mc_border_params() stays local to this call.
+static INLINE void extend_mc_border(const struct scale_factors *const sf,
+ struct buf_2d *const pre_buf,
+ MV32 scaled_mv, PadBlock block,
+ int subpel_x_mv, int subpel_y_mv,
+ int do_warp, int is_intrabc, int highbd,
+ uint8_t *mc_buf, uint8_t **pre,
+ int *src_stride) {
+ int x_pad = 0, y_pad = 0;
+ if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
+ subpel_x_mv, subpel_y_mv, do_warp,
+ is_intrabc, &x_pad, &y_pad)) {
+ // Get reference block pointer.
+ const uint8_t *const buf_ptr =
+ pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ int buf_stride = pre_buf->stride;
+ const int b_w = block.x1 - block.x0;
+ const int b_h = block.y1 - block.y0;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Extend the border.
+ if (highbd) {
+ highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
+ block.y0, b_w, b_h, pre_buf->width,
+ pre_buf->height);
+ } else {
+ build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
+ b_h, pre_buf->width, pre_buf->height);
+ }
+#else
+ (void)highbd;
+ build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
+ b_h, pre_buf->width, pre_buf->height);
+#endif
+ // The temporary block is stored densely, so its stride is its width.
+ *src_stride = b_w;
+ // Skip the rows/columns that were added in front of the block so *pre
+ // points at the block's original top-left sample.
+ *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
+ x_pad * (AOM_INTERP_EXTEND - 1);
+ }
+}
+
+// Computes, for one inter prediction, the sub-pixel filter parameters and the
+// rectangle of the reference block.
+// Outputs:
+// - subpel_params: sub-pixel phase and step in each direction.
+// - block: reference block rectangle in integer sample coordinates.
+// - scaled_mv, subpel_x_mv, subpel_y_mv: the (scaled) MV and its fractional
+// parts, used by the caller to decide on border extension.
+// - pre, src_stride: pointer to the block's top-left sample in the reference
+// buffer and that buffer's stride.
+// The first branch handles scaled references, the second unscaled ones.
+static AOM_INLINE void dec_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ const MACROBLOCKD *const xd, int mi_x, int mi_y, uint8_t **pre,
+ SubpelParams *subpel_params, int *src_stride, PadBlock *block,
+ MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) {
+ const struct scale_factors *sf = inter_pred_params->scale_factors;
+ struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+ const int bw = inter_pred_params->block_width;
+ const int bh = inter_pred_params->block_height;
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = inter_pred_params->subsampling_x;
+ int ssy = inter_pred_params->subsampling_y;
+ // Block position plus MV in the current frame, before scaling.
+ int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+ orig_pos_y += src_mv->row * (1 << (1 - ssy));
+ int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+ orig_pos_x += src_mv->col * (1 << (1 - ssx));
+ int pos_y = av1_scaled_y(orig_pos_y, sf);
+ int pos_x = av1_scaled_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ // Clamp the scaled position to the reference frame plus margins.
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+
+ // Get reference block top left coordinate.
+ block->x0 = pos_x >> SCALE_SUBPEL_BITS;
+ block->y0 = pos_y >> SCALE_SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 =
+ ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
+ block->y1 =
+ ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
+
+ MV temp_mv;
+ temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh,
+ inter_pred_params->subsampling_x,
+ inter_pred_params->subsampling_y);
+ *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf);
+ scaled_mv->row += SCALE_EXTRA_OFF;
+ scaled_mv->col += SCALE_EXTRA_OFF;
+
+ *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
+ } else {
+ // Get block position in current frame.
+ int pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+ int pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, src_mv, bw, bh, inter_pred_params->subsampling_x,
+ inter_pred_params->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+
+ // Get reference block top left coordinate.
+ pos_x += mv_q4.col;
+ pos_y += mv_q4.row;
+ block->x0 = pos_x >> SUBPEL_BITS;
+ block->y0 = pos_y >> SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
+ block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
+
+ scaled_mv->row = mv_q4.row;
+ scaled_mv->col = mv_q4.col;
+ *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
+ }
+ // Both branches: point at the block's top-left sample in the reference.
+ *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0;
+ *src_stride = pre_buf->stride;
+}
+
+// Computes the sub-pixel parameters and reference block for one prediction,
+// then lets extend_mc_border() redirect *pre / *src_stride to mc_buf[ref] when
+// the reference block needs border extension.
+static AOM_INLINE void dec_calc_subpel_params_and_extend(
+    const MV *const src_mv, InterPredParams *const inter_pred_params,
+    MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf,
+    uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
+  PadBlock ref_block;
+  MV32 mv_scaled;
+  int frac_x_mv, frac_y_mv;
+
+  dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre,
+                         subpel_params, src_stride, &ref_block, &mv_scaled,
+                         &frac_x_mv, &frac_y_mv);
+
+  const int is_warp = inter_pred_params->mode == WARP_PRED;
+  uint8_t *const scratch = mc_buf[ref];
+  extend_mc_border(inter_pred_params->scale_factors,
+                   &inter_pred_params->ref_frame_buf, mv_scaled, ref_block,
+                   frac_x_mv, frac_y_mv, is_warp,
+                   inter_pred_params->is_intrabc,
+                   inter_pred_params->use_hbd_buf, scratch, pre, src_stride);
+}
+
+#define IS_DEC 1
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
+// Decoder entry point to build_inter_predictors(), which comes from the
+// reconinter_template.inc include above (compiled with IS_DEC defined).
+// Forwards all arguments and supplies the coding block's mc_buf.
+static void dec_build_inter_predictors(const AV1_COMMON *cm,
+ DecoderCodingBlock *dcb, int plane,
+ const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh,
+ int mi_x, int mi_y) {
+ build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x,
+ mi_y, dcb->mc_buf);
+}
+
+// Builds the inter prediction of the current block for every plane, stopping
+// at the first chroma plane when the block is not a chroma reference. When the
+// block uses inter-intra prediction, the intra part is blended in per plane.
+static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm,
+                                                 DecoderCodingBlock *dcb,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &dcb->xd;
+  const int plane_count = av1_num_planes(cm);
+  // Block position in luma samples; identical for every plane.
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  for (int plane = 0; plane < plane_count; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0, pd->width,
+                               pd->height, mi_x, mi_y);
+
+    if (!is_interintra_pred(xd->mi[0])) continue;
+
+    BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
+                         xd->plane[2].dst.buf },
+                       { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+                         xd->plane[2].dst.stride } };
+    av1_build_interintra_predictor(cm, xd, pd->dst.buf, pd->dst.stride, &ctx,
+                                   plane, bsize);
+  }
+}
+
+// Per-neighbor callback passed to foreach_overlappable_nb_above(). Builds,
+// for each plane, the prediction of the current block's top region using a
+// copy of the above neighbor's mode info. rel_mi_row and dir are unused.
+// fun_ctxt must point to a struct build_prediction_ctxt.
+static INLINE void dec_build_prediction_by_above_pred(
+ MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int above_mi_col = xd->mi_col + rel_mi_col;
+ int mi_x, mi_y;
+ // Work on a local copy; the setup call below receives the copy, not the
+ // neighbor's own mode info.
+ MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+ (void)rel_mi_row;
+ (void)dir;
+
+ av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size,
+ &backup_mbmi, ctxt, num_planes);
+ mi_x = above_mi_col << MI_SIZE_LOG2;
+ mi_y = xd->mi_row << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ // Width covers the overlapped neighbor; height is half the current block's
+ // height, clamped to [4, half of a 64-high block], in plane samples.
+ int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+ int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+ &backup_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+}
+
+// Builds the OBMC predictions derived from the above neighbors into tmp_buf.
+// Does nothing when no above neighbor is available. Temporarily adjusts
+// xd->mb_to_bottom_edge for the shorter prediction block and restores the
+// edge fields afterwards.
+static AOM_INLINE void dec_build_prediction_by_above_preds(
+ const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+ uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ if (!xd->up_available) return;
+
+ // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+ // prediction block. This is half the height of the original block,
+ // except for 128-wide blocks, where we only use a height of 32.
+ const int this_height = xd->height * MI_SIZE;
+ const int pred_height = AOMMIN(this_height / 2, 32);
+ xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height);
+ // The sixth initializer saves mb_to_right_edge; it is read back from
+ // ctxt.mb_to_far_edge after the neighbor loop.
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb
+ };
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_above(cm, xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ dec_build_prediction_by_above_pred, &ctxt);
+
+ // Restore the edge distances for the current block.
+ xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE);
+ xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+ xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height);
+}
+
+// Per-neighbor callback passed to foreach_overlappable_nb_left(). Builds, for
+// each plane, the prediction of the current block's left region using a copy
+// of the left neighbor's mode info. rel_mi_col and dir are unused.
+// fun_ctxt must point to a struct build_prediction_ctxt.
+static INLINE void dec_build_prediction_by_left_pred(
+ MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int left_mi_row = xd->mi_row + rel_mi_row;
+ int mi_x, mi_y;
+ // Work on a local copy; the setup call below receives the copy, not the
+ // neighbor's own mode info.
+ MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+ (void)rel_mi_col;
+ (void)dir;
+
+ av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size,
+ &backup_mbmi, ctxt, num_planes);
+ mi_x = xd->mi_col << MI_SIZE_LOG2;
+ mi_y = left_mi_row << MI_SIZE_LOG2;
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ // Width is half the current block's width, clamped to [4, half of a
+ // 64-wide block]; height covers the overlapped neighbor, in plane samples.
+ int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j,
+ &backup_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+}
+
+// Builds the OBMC predictions derived from the left neighbors into tmp_buf.
+// Does nothing when no left neighbor is available. Temporarily adjusts
+// xd->mb_to_right_edge for the narrower prediction block and restores the
+// edge fields afterwards.
+static AOM_INLINE void dec_build_prediction_by_left_preds(
+ const AV1_COMMON *cm, DecoderCodingBlock *dcb,
+ uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ if (!xd->left_available) return;
+
+ // Adjust mb_to_right_edge to have the correct value for the OBMC
+ // prediction block. This is half the width of the original block,
+ // except for 128-wide blocks, where we only use a width of 32.
+ const int this_width = xd->width * MI_SIZE;
+ const int pred_width = AOMMIN(this_width / 2, 32);
+ xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width);
+
+ // The sixth initializer saves mb_to_bottom_edge; it is read back from
+ // ctxt.mb_to_far_edge after the neighbor loop.
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb
+ };
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_left(cm, xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ dec_build_prediction_by_left_pred, &ctxt);
+
+ // Restore the edge distances for the current block.
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE);
+ xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width);
+ xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+// Builds the OBMC prediction for the current block: predictions from the above
+// neighbors go to dst_buf1, predictions from the left neighbors to dst_buf2,
+// and both are then combined with the block's prediction by
+// av1_build_obmc_inter_prediction().
+static AOM_INLINE void dec_build_obmc_inter_predictors_sb(
+ const AV1_COMMON *cm, DecoderCodingBlock *dcb) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ // All temporary planes use MAX_SB_SIZE for stride, width and height.
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ MACROBLOCKD *const xd = &dcb->xd;
+ av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
+
+ dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1,
+ dst_height1, dst_stride1);
+ dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2,
+ dst_stride2);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // Set the destination planes up again for the current block before blending.
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+ dst_stride2);
+}
+
+// Stores the current block for CfL via cfl_store_block(), but only when
+// store_cfl_required() says so.
+static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm,
+                                             MACROBLOCKD *const xd) {
+  MB_MODE_INFO *const mode_info = xd->mi[0];
+  if (!store_cfl_required(cm, xd)) return;
+  cfl_store_block(xd, mode_info->bsize, mode_info->tx_size);
+}
+
+// Builds the inter prediction for the current block: sets up the reference
+// planes and scale factors for each reference in use, builds the prediction
+// and, for OBMC_CAUSAL blocks, applies the OBMC blend. With
+// CONFIG_MISMATCH_DEBUG each predicted plane is also passed to
+// mismatch_check_block_pre().
+static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm,
+ DecoderCodingBlock *dcb,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ if (frame < LAST_FRAME) {
+ // Only intrabc blocks may get here with a non-inter reference; nothing
+ // is set up for them in this loop.
+ assert(is_intrabc_block(mbmi));
+ assert(frame == INTRA_FRAME);
+ assert(ref == 0);
+ } else {
+ const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, frame);
+
+ xd->block_ref_scale_factors[ref] = ref_scale_factors;
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
+ ref_scale_factors, num_planes);
+ }
+ }
+
+ dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ dec_build_obmc_inter_predictors_sb(cm, dcb);
+ }
+#if CONFIG_MISMATCH_DEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
+ pd->subsampling_y);
+ // Skip planes for which this block is not a chroma reference.
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_check_block_pre(pd->dst.buf, pd->dst.stride,
+ cm->current_frame.order_hint, plane, pixel_c,
+ pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+// Advances xd->color_index_map_offset[plane] by the area (plane width times
+// plane height) that av1_get_block_dimensions() reports for the current
+// block. The reader argument is unused.
+static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
+                                                  int plane, aom_reader *r) {
+  (void)r;
+  const MB_MODE_INFO *const mode_info = xd->mi[0];
+  Av1ColorMapParam dims;
+  av1_get_block_dimensions(mode_info->bsize, plane, xd, &dims.plane_width,
+                           &dims.plane_height, NULL, NULL);
+  xd->color_index_map_offset[plane] += dims.plane_width * dims.plane_height;
+}
+
+/* Reconstructs the current block (xd->mi[0]) of size bsize.
+ *
+ * Intra blocks: the block is walked in units of at most BLOCK_64X64; for each
+ * unit and plane, every transform block is visited with
+ * td->read_coeffs_tx_intra_block_visit followed by
+ * td->predict_and_recon_intra_block_visit.
+ * Inter blocks: td->predict_inter_block_visit runs first; unless
+ * mbmi->skip_txfm is set, decode_reconstruct_tx() is then called for each
+ * maximum-size transform block of each plane; td->cfl_store_inter_block_visit
+ * runs last.
+ * In both cases av1_visit_palette() is finally invoked with
+ * set_color_index_map_offset as the visitor.
+ */ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
+ ThreadData *const td,
+ aom_reader *r,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ DecoderCodingBlock *const dcb = &td->dcb;
+ MACROBLOCKD *const xd = &dcb->xd;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+
+ if (!is_inter_block(mbmi)) {  // intra path
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;  // processing unit size
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);  // unit never exceeds the block limits
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;  // only plane 0 when xd->is_chroma_ref is unset
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+
+ const int unit_height = ROUND_POWER_OF_TWO(  // exclusive end row of this unit, scaled by the plane's subsampling
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(  // exclusive end column, likewise
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,  // visitor 1 of 2 for this transform block
+ blk_col, tx_size);
+ td->predict_and_recon_intra_block_visit(  // visitor 2 of 2, always after the coefficient visitor
+ cm, dcb, r, plane, blk_row, blk_col, tx_size);
+ set_cb_buffer_offsets(dcb, tx_size, plane);  // NOTE(review): presumably advances the coefficient-buffer offsets - confirm
+ }
+ }
+ }
+ }
+ }
+ } else {  // inter path
+ td->predict_inter_block_visit(cm, dcb, bsize);
+ // Reconstruction
+ if (!mbmi->skip_txfm) {
+ int eobtotal = 0;  // handed to decode_reconstruct_tx(); not read again in this function
+
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ int row, col;
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize ==
+ get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;  // only plane 0 when xd->is_chroma_ref is unset
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, ss_x, ss_y);
+ const TX_SIZE max_tx_size =
+ get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int bh_var_tx = tx_size_high_unit[max_tx_size];
+ const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+ int block = 0;  // running index passed to decode_reconstruct_tx(); restarts at 0 for every unit and plane
+ int step =  // area of max_tx_size in transform units
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ int blk_row, blk_col;
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x);
+
+ for (blk_row = row >> ss_y; blk_row < unit_height;
+ blk_row += bh_var_tx) {
+ for (blk_col = col >> ss_x; blk_col < unit_width;
+ blk_col += bw_var_tx) {
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
+ blk_row, blk_col, block, max_tx_size,
+ &eobtotal);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ }
+ td->cfl_store_inter_block_visit(cm, xd);  // runs for every inter block, including skip_txfm ones
+ }
+
+ av1_visit_palette(pbi, xd, r, set_color_index_map_offset);  // runs for intra and inter blocks alike
+}
+
+// Writes txs into every mbmi->inter_tx_size[] entry covered by a transform
+// block of size split_size whose top-left corner is (blk_row, blk_col).
+// The area is stepped in units of min_txs. Each position is mapped to an
+// array index by shifting the row by tx_h_log2 and the column by tx_w_log2,
+// with rows laid out 1 << stride_log2 entries apart.
+static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2,
+                                         int tx_w_log2, int tx_h_log2,
+                                         int min_txs, int split_size, int txs,
+                                         int blk_row, int blk_col) {
+  const int span_h = tx_size_high_unit[split_size];
+  const int span_w = tx_size_wide_unit[split_size];
+  const int step_h = tx_size_high_unit[min_txs];
+  const int step_w = tx_size_wide_unit[min_txs];
+
+  for (int dy = 0; dy < span_h; dy += step_h) {
+    // Start of this row inside inter_tx_size[].
+    const int row_base = ((blk_row + dy) >> tx_h_log2) << stride_log2;
+    for (int dx = 0; dx < span_w; dx += step_w) {
+      const int col_pos = (blk_col + dx) >> tx_w_log2;
+      mbmi->inter_tx_size[row_base + col_pos] = txs;
+    }
+  }
+}
+
+/* Recursively reads the transform partitioning of a block, starting from a
+ * transform block of size tx_size located at (blk_row, blk_col).
+ *
+ * A split flag is read from r (unless depth == MAX_VARTX_DEPTH). When set, the
+ * block is divided according to sub_tx_size_map[] and each part is processed
+ * with depth + 1; a split that reaches TX_4X4 is recorded without recursing.
+ * Final sizes are written to mbmi->inter_tx_size[] and mbmi->tx_size, and the
+ * above/left transform partition contexts in xd are updated.
+ * Nothing is read for positions at or beyond max_block_high() /
+ * max_block_wide() of the block.
+ */ static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ int is_split = 0;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;  // outside the block limits: no symbol is read
+ assert(tx_size > TX_4X4);
+ TX_SIZE txs = max_txsize_rect_lookup[bsize];  // start from the largest rectangular transform for bsize
+ for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)  // reduce it MAX_VARTX_DEPTH - 1 times
+ txs = sub_tx_size_map[txs];
+ const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;  // txs sets the granularity of inter_tx_size[] indexing
+ const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ const int bw_log2 = mi_size_wide_log2[bsize];
+ const int stride_log2 = bw_log2 - tx_w_log2;
+
+ if (depth == MAX_VARTX_DEPTH) {  // deepest level: tx_size is final, no split flag is read
+ set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
+ tx_size, blk_row, blk_col);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->bsize, tx_size);
+ is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);  // binary split flag
+
+ if (is_split) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ if (sub_txs == TX_4X4) {  // split reached TX_4X4: record it over the whole tx_size area and stop
+ set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
+ sub_txs, blk_row, blk_col);
+ mbmi->tx_size = sub_txs;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsw > 0 && bsh > 0);  // guards the loop steps below against zero
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = blk_row + row;
+ int offsetc = blk_col + col;
+ read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);  // recurse into each sub-block
+ }
+ }
+ } else {  // not split: tx_size is used for the whole area
+ set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size,
+ tx_size, blk_row, blk_col);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ }
+}
+
+// Reads an explicitly coded transform-size depth for the current block
+// (xd->mi[0]) from r and converts it to a TX_SIZE with depth_to_tx_size().
+// The symbol alphabet has bsize_to_max_depth(bsize) + 1 entries, and the CDF is
+// selected by the block's tx-size category and get_tx_size_context().
+static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd,
+                                     aom_reader *r) {
+  // TODO(debargha): Clean up the logic here. This function should only
+  // be called for intra.
+  const BLOCK_SIZE blk = xd->mi[0]->bsize;
+  const int32_t size_cat = bsize_to_tx_size_cat(blk);
+  const int depth_limit = bsize_to_max_depth(blk);
+  const int cdf_ctx = get_tx_size_context(xd);
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+
+  const int coded_depth = aom_read_symbol(
+      r, fc->tx_size_cdf[size_cat][cdf_ctx], depth_limit + 1, ACCT_STR);
+  assert(coded_depth >= 0 && coded_depth <= depth_limit);
+
+  return depth_to_tx_size(coded_depth, blk);
+}
+
+// Determines the transform size of the current block (xd->mi[0]):
+//   - TX_4X4 when the block's segment is lossless;
+//   - max_txsize_rect_lookup[bsize] when the block size does not signal a
+//     transform size;
+//   - a size read from r when tx_mode is TX_MODE_SELECT and the block is
+//     either not inter or allow_select_inter is set;
+//   - otherwise the size implied by tx_mode.
+static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode,
+                            int is_inter, int allow_select_inter,
+                            aom_reader *r) {
+  const MB_MODE_INFO *const cur_mi = xd->mi[0];
+  const BLOCK_SIZE blk = cur_mi->bsize;
+
+  if (xd->lossless[cur_mi->segment_id]) return TX_4X4;
+
+  if (!block_signals_txsize(blk)) {
+    assert(IMPLIES(tx_mode == ONLY_4X4, blk == BLOCK_4X4));
+    return max_txsize_rect_lookup[blk];
+  }
+
+  const int selection_allowed = !is_inter || allow_select_inter;
+  if (selection_allowed && tx_mode == TX_MODE_SELECT) {
+    return read_selected_tx_size(xd, r);
+  }
+  return tx_size_from_tx_mode(blk, tx_mode);
+}
+
+/* Parses one block at (mi_row, mi_col) and then reconstructs it.
+ *
+ * Steps, in order: decode_mbmi_block(); palette tokens via av1_visit_palette()
+ * with av1_decode_palette_tokens; transform sizes (read_tx_size_vartx() per
+ * maximum-size transform block, or a single read_tx_size()); per-segment
+ * dequantizer refresh when delta-q is present; entropy-context reset for
+ * skip_txfm blocks; decode_token_recon_block().
+ */ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, aom_reader *r,
+ PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ DecoderCodingBlock *const dcb = &td->dcb;
+ MACROBLOCKD *const xd = &dcb->xd;
+ decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize);
+
+ av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens);
+
+ AV1_COMMON *cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);  // intrabc blocks take the inter transform-size path
+ if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&  // variable transform partitioning is read only if all five conditions hold
+ !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = mi_size_wide[bsize];
+ const int height = mi_size_high[bsize];
+
+ for (int idy = 0; idy < height; idy += bh)
+ for (int idx = 0; idx < width; idx += bw)
+ read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);  // one partition tree per max-size transform block, starting at depth 0
+ } else {
+ mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
+ !mbmi->skip_txfm, r);
+ if (inter_block_tx)
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));  // uniform transform size for the whole block
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+ mbmi->skip_txfm && is_inter_block(mbmi), xd);
+ }
+
+ if (cm->delta_q_info.delta_q_present_flag) {  // rebuild dequant values for every segment from xd->current_base_qindex
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ const int current_qindex =
+ av1_get_qindex(&cm->seg, i, xd->current_base_qindex);
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ for (int j = 0; j < num_planes; ++j) {
+ const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q
+ : (j == 1 ? quant_params->u_dc_delta_q
+ : quant_params->v_dc_delta_q);
+ const int ac_delta_q = j == 0 ? 0  // plane 0 uses no AC delta
+ : (j == 1 ? quant_params->u_ac_delta_q
+ : quant_params->v_ac_delta_q);
+ xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(  // index 0 holds the DC value
+ current_qindex, dc_delta_q, cm->seq_params->bit_depth);
+ xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(  // index 1 holds the AC value
+ current_qindex, ac_delta_q, cm->seq_params->bit_depth);
+ }
+ }
+ }
+ if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes);
+
+ decode_token_recon_block(pbi, td, r, bsize);
+}
+
+// Points xd at the block located at (mi_row, mi_col) ahead of prediction and
+// reconstruction: selects the mode-info and tx-type-map entries for that
+// position, sets the per-plane block dimensions, records the block position
+// relative to the tile and frame, and sets up the destination planes from
+// cm->cur_frame->buf.
+static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+                                                      ThreadData *const td,
+                                                      int mi_row, int mi_col,
+                                                      BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &td->dcb.xd;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int planes = av1_num_planes(cm);
+  const int blk_w_mi = mi_size_wide[bsize];
+  const int blk_h_mi = mi_size_high[bsize];
+
+  // The mode-info grid and the tx-type map are both addressed with
+  // mi_stride, so one linear offset serves both.
+  const int grid_pos = mi_row * mi_params->mi_stride + mi_col;
+  xd->mi = mi_params->mi_grid_base + grid_pos;
+  xd->tx_type_map = mi_params->tx_type_map + grid_pos;
+  xd->tx_type_map_stride = mi_params->mi_stride;
+
+  set_plane_n4(xd, blk_w_mi, blk_h_mi, planes);
+
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(xd, &xd->tile, mi_row, blk_h_mi, mi_col, blk_w_mi,
+                 mi_params->mi_rows, mi_params->mi_cols);
+
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       planes);
+}
+
+// Block visitor that only reconstructs: xd is pointed at the block at
+// (mi_row, mi_col) with set_offsets_for_pred_and_recon() and the block is then
+// rebuilt by decode_token_recon_block(). The partition argument is unused; it
+// is present so the function fits the block_visitor_fn_t table used by
+// decode_partition().
+static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                                    int mi_row, int mi_col, aom_reader *r,
+                                    PARTITION_TYPE partition,
+                                    BLOCK_SIZE bsize) {
+  set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+  decode_token_recon_block(pbi, td, r, bsize);
+  (void)partition;
+}
+
+// Reads the partition type of the bsize block at (mi_row, mi_col).
+//   has_rows / has_cols: non-zero when the caller found the lower / right half
+//   of the block to start inside the frame.
+// - Neither set: PARTITION_SPLIT is returned without reading anything.
+// - Both set: the full partition symbol is read.
+// - Only has_cols set: a binary symbol chooses PARTITION_SPLIT or
+//   PARTITION_HORZ, using a CDF built by partition_gather_vert_alike().
+// - Only has_rows set: a binary symbol chooses PARTITION_SPLIT or
+//   PARTITION_VERT, using a CDF built by partition_gather_horz_alike().
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     aom_reader *r, int has_rows, int has_cols,
+                                     BLOCK_SIZE bsize) {
+  const int cdf_ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+  if (!(has_rows || has_cols)) return PARTITION_SPLIT;
+
+  assert(cdf_ctx >= 0);
+  aom_cdf_prob *const full_cdf = xd->tile_ctx->partition_cdf[cdf_ctx];
+
+  if (has_rows && has_cols) {
+    return (PARTITION_TYPE)aom_read_symbol(
+        r, full_cdf, partition_cdf_length(bsize), ACCT_STR);
+  }
+
+  // Exactly one of has_rows / has_cols is set from here on.
+  aom_cdf_prob binary_cdf[2];
+  if (has_cols) {
+    assert(bsize > BLOCK_8X8);
+    partition_gather_vert_alike(binary_cdf, full_cdf, bsize);
+  } else {
+    assert(has_rows && !has_cols);
+    assert(bsize > BLOCK_8X8);
+    partition_gather_horz_alike(binary_cdf, full_cdf, bsize);
+  }
+  assert(binary_cdf[1] == AOM_ICDF(CDF_PROB_TOP));
+
+  if (aom_read_cdf(r, binary_cdf, 2, ACCT_STR)) return PARTITION_SPLIT;
+  return has_cols ? PARTITION_HORZ : PARTITION_VERT;
+}
+
+// TODO(slavarnway): eliminate bsize and subsize in future commits
+/* Recursively processes the partition tree rooted at the bsize block located
+ * at (mi_row, mi_col).
+ *
+ * parse_decode_flag selects the per-block visitor from block_visit[]. When its
+ * bit 0 is set, loop-restoration coefficients and the partition type are read
+ * from reader and the partition context is updated at the end; otherwise the
+ * partition type comes from get_partition().
+ * Returns immediately when the block starts outside the frame.
+ * Reports AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when the
+ * partition yields an invalid sub-block size, or one that is invalid under the
+ * chroma subsampling of plane 1.
+ */ static AOM_INLINE void decode_partition(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, aom_reader *reader,
+ BLOCK_SIZE bsize,
+ int parse_decode_flag) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ AV1_COMMON *const cm = &pbi->common;
+ DecoderCodingBlock *const dcb = &td->dcb;
+ MACROBLOCKD *const xd = &dcb->xd;
+ const int bw = mi_size_wide[bsize];
+ const int hbs = bw >> 1;  // half the block width
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ const int quarter_step = bw / 4;  // step between the four strips of the *_4 partitions
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);  // quadrant size, used by the _A/_B partitions
+ const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;  // lower half starts inside the frame
+ const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;  // right half starts inside the frame
+
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ // parse_decode_flag takes the following values :
+ // 01 - do parse only
+ // 10 - do decode only
+ // 11 - do parse and decode
+ static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block,
+ decode_block,
+ parse_decode_block };
+
+ if (parse_decode_flag & 1) {
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;  // row-major index of the restoration unit
+ loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx);
+ }
+ }
+ }
+ }
+
+ partition = (bsize < BLOCK_8X8) ? PARTITION_NONE  // nothing is read for blocks below BLOCK_8X8
+ : read_partition(xd, mi_row, mi_col, reader,
+ has_rows, has_cols, bsize);
+ } else {
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ }
+ subsize = get_partition_subsize(bsize, partition);
+ if (subsize == BLOCK_INVALID) {
+ // When an internal error occurs ensure that xd->mi_row is set appropriately
+ // w.r.t. current tile, which is used to signal processing of current row is
+ // done.
+ xd->mi_row = mi_row;
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Partition is invalid for block size %dx%d",
+ block_size_wide[bsize], block_size_high[bsize]);
+ }
+ // Check the bitstream is conformant: if there is subsampling on the
+ // chroma planes, subsize must subsample to a valid block size.
+ const struct macroblockd_plane *const pd_u = &xd->plane[1];
+ if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
+ BLOCK_INVALID) {
+ // When an internal error occurs ensure that xd->mi_row is set appropriately
+ // w.r.t. current tile, which is used to signal processing of current row is
+ // done.
+ xd->mi_row = mi_row;
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Block size %dx%d invalid with this subsampling mode",
+ block_size_wide[subsize], block_size_high[subsize]);
+ }
+
+#define DEC_BLOCK_STX_ARG
+#define DEC_BLOCK_EPT_ARG partition,
+#define DEC_BLOCK(db_r, db_c, db_subsize) \
+ block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \
+ reader, DEC_BLOCK_EPT_ARG(db_subsize))
+#define DEC_PARTITION(db_r, db_c, db_subsize) \
+ decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \
+ (db_subsize), parse_decode_flag)
+
+ switch (partition) {  // DEC_BLOCK visits one leaf block; DEC_PARTITION recurses
+ case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
+ case PARTITION_HORZ:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);  // lower block only when it starts inside the frame
+ break;
+ case PARTITION_VERT:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);  // right block only when it starts inside the frame
+ break;
+ case PARTITION_SPLIT:
+ DEC_PARTITION(mi_row, mi_col, subsize);
+ DEC_PARTITION(mi_row, mi_col + hbs, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
+ break;
+ case PARTITION_HORZ_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_VERT_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+ break;
+ case PARTITION_VERT_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_HORZ_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break;  // the first strip is always visited
+ DEC_BLOCK(this_mi_row, mi_col, subsize);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break;  // the first strip is always visited
+ DEC_BLOCK(mi_row, this_mi_col, subsize);
+ }
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+
+#undef DEC_PARTITION
+#undef DEC_BLOCK
+#undef DEC_BLOCK_EPT_ARG
+#undef DEC_BLOCK_STX_ARG
+
+ if (parse_decode_flag & 1)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Prepares the symbol reader r for the read_size bytes starting at data.
+// data_end bounds the readable buffer. An AOM_CODEC_CORRUPT_FRAME error is
+// raised when read_is_valid() rejects the range, and an AOM_CODEC_MEM_ERROR
+// when aom_reader_init() fails. On success r->allow_update_cdf is set from
+// allow_update_cdf.
+static AOM_INLINE void setup_bool_decoder(
+    MACROBLOCKD *const xd, const uint8_t *data, const uint8_t *data_end,
+    const size_t read_size, struct aom_internal_error_info *error_info,
+    aom_reader *r, uint8_t allow_update_cdf) {
+  // Validate the calculated partition length. If the buffer
+  // described by the partition can't be fully read, then restrict
+  // it to the portion that can be (for EC mode) or throw an error.
+  const int range_ok = read_is_valid(data, read_size, data_end);
+  if (!range_ok) {
+    // Before reporting, park xd->mi_row at the first row of the current tile;
+    // row-mt decoding uses it to signal that the current row is done.
+    xd->mi_row = xd->tile.mi_row_start;
+    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+  }
+
+  const int init_failed = aom_reader_init(r, data, read_size);
+  if (init_failed) {
+    // Same row-mt bookkeeping as above.
+    xd->mi_row = xd->tile.mi_row_start;
+    aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder %d", 1);
+  }
+
+  r->allow_update_cdf = allow_update_cdf;
+}
+
+// Reads the segmentation parameters of the frame header from rb into cm->seg
+// and copies the resulting segment features into cm->cur_frame->seg.
+//
+// When segmentation is disabled, the current frame's segment map (if any) and
+// the whole segmentation struct are cleared. Otherwise the update flags are
+// read (or forced on when there is no primary reference frame) and the
+// per-segment feature data is either read from the bitstream or, when a
+// previous frame exists, copied from it.
+static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm,
+                                          struct aom_read_bit_buffer *rb) {
+  struct segmentation *const seg = &cm->seg;
+
+  seg->update_map = 0;
+  seg->update_data = 0;
+  seg->temporal_update = 0;
+  seg->enabled = aom_rb_read_bit(rb);
+
+  if (!seg->enabled) {
+    if (cm->cur_frame->seg_map) {
+      memset(cm->cur_frame->seg_map, 0,
+             (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols));
+    }
+    memset(seg, 0, sizeof(*seg));
+    segfeatures_copy(&cm->cur_frame->seg, seg);
+    return;
+  }
+
+  // The previous frame's segment map is only usable when that frame exists
+  // and has the same mode-info dimensions as the current one.
+  const int prev_map_usable =
+      seg->enabled && cm->prev_frame &&
+      (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
+      (cm->mi_params.mi_cols == cm->prev_frame->mi_cols);
+  cm->last_frame_seg_map = prev_map_usable ? cm->prev_frame->seg_map : NULL;
+
+  // Read update flags
+  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
+    seg->update_map = aom_rb_read_bit(rb);
+    seg->temporal_update = seg->update_map ? aom_rb_read_bit(rb) : 0;
+    seg->update_data = aom_rb_read_bit(rb);
+  } else {
+    // These frames can't use previous frames, so must signal map + features
+    seg->update_map = 1;
+    seg->temporal_update = 0;
+    seg->update_data = 1;
+  }
+
+  // Segmentation data update
+  if (seg->update_data) {
+    av1_clearall_segfeatures(seg);
+
+    for (int seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+      for (int feat = 0; feat < SEG_LVL_MAX; feat++) {
+        int value = 0;
+        if (aom_rb_read_bit(rb)) {
+          av1_enable_segfeature(seg, seg_id, feat);
+
+          const int limit = av1_seg_feature_data_max(feat);
+          const int nbits = get_unsigned_bits(limit);
+          const int raw = av1_is_segfeature_signed(feat)
+                              ? aom_rb_read_inv_signed_literal(rb, nbits)
+                              : aom_rb_read_literal(rb, nbits);
+
+          // Keep the coded value inside [-limit, limit].
+          value = clamp(raw, -limit, limit);
+        }
+        av1_set_segdata(seg, seg_id, feat, value);
+      }
+    }
+    av1_calculate_segdata(seg);
+  } else if (cm->prev_frame) {
+    segfeatures_copy(seg, &cm->prev_frame->seg);
+  }
+
+  segfeatures_copy(&cm->cur_frame->seg, seg);
+}
+
+// Reads the frame-level loop-restoration parameters from rb into
+// cm->rst_info[]: a two-bit restoration type per plane, followed by the
+// restoration unit sizes. Nothing is read when intra block copy is allowed.
+//
+// Unit sizes start at the superblock size (64 or 128) when any plane uses
+// restoration, and may be doubled by up to two further bits read for plane 0;
+// otherwise every plane gets RESTORATION_UNITSIZE_MAX. With more than one
+// plane, planes 1 and 2 share one size, which may be plane 0's size shifted
+// down when the chroma planes are subsampled and use restoration.
+static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm,
+                                               struct aom_read_bit_buffer *rb) {
+  assert(!cm->features.all_lossless);
+  const int num_planes = av1_num_planes(cm);
+  if (cm->features.allow_intrabc) return;
+
+  int any_restored = 0;
+  int chroma_restored = 0;
+  for (int p = 0; p < num_planes; ++p) {
+    // Two bits select the type: 00 none, 01 switchable, 10 Wiener,
+    // 11 self-guided.
+    const int first_bit = aom_rb_read_bit(rb);
+    const int second_bit = aom_rb_read_bit(rb);
+    RestorationInfo *const info = &cm->rst_info[p];
+    if (first_bit) {
+      info->frame_restoration_type =
+          second_bit ? RESTORE_SGRPROJ : RESTORE_WIENER;
+    } else {
+      info->frame_restoration_type =
+          second_bit ? RESTORE_SWITCHABLE : RESTORE_NONE;
+    }
+    if (info->frame_restoration_type != RESTORE_NONE) {
+      any_restored = 1;
+      if (p > 0) chroma_restored = 1;
+    }
+  }
+
+  if (any_restored) {
+    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+           cm->seq_params->sb_size == BLOCK_128X128);
+    const int sb_pixels = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
+
+    for (int p = 0; p < num_planes; ++p)
+      cm->rst_info[p].restoration_unit_size = sb_pixels;
+
+    RestorationInfo *const luma = &cm->rst_info[0];
+    // With 64x64 superblocks one bit may double the size to 128 ...
+    if (sb_pixels == 64) {
+      luma->restoration_unit_size <<= aom_rb_read_bit(rb);
+    }
+    // ... and any size above 64 may be doubled once more.
+    if (luma->restoration_unit_size > 64) {
+      luma->restoration_unit_size <<= aom_rb_read_bit(rb);
+    }
+  } else {
+    for (int p = 0; p < num_planes; ++p)
+      cm->rst_info[p].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
+  }
+
+  if (num_planes > 1) {
+    const int min_ss =
+        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
+    // A shift bit is read only for subsampled chroma that uses restoration.
+    int chroma_shift = 0;
+    if (min_ss && chroma_restored) chroma_shift = aom_rb_read_bit(rb) * min_ss;
+
+    cm->rst_info[1].restoration_unit_size =
+        cm->rst_info[0].restoration_unit_size >> chroma_shift;
+    cm->rst_info[2].restoration_unit_size =
+        cm->rst_info[1].restoration_unit_size;
+  }
+}
+
+/* Reads the Wiener filter taps of one restoration unit into wiener_info.
+ *
+ * Each tap is coded with aom_read_primitive_refsubexpfin() relative to the
+ * corresponding tap of ref_wiener_info; on return ref_wiener_info is
+ * overwritten with the newly read filter.
+ * The vertical taps are read first, then the horizontal ones. Both filters are
+ * symmetric: tap i is mirrored to tap WIENER_WIN - 1 - i.
+ * wiener_win: when equal to WIENER_WIN the outermost tap (index 0) is read;
+ * otherwise it is set to 0 and nothing is read for it.
+ */ static AOM_INLINE void read_wiener_filter(int wiener_win,
+ WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info,
+ aom_reader *rb) {
+ memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
+ memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
+
+ if (wiener_win == WIENER_WIN)  // outer vertical tap is coded only for the full window
+ wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;  // the coded value is an offset from the tap minimum
+ else
+ wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0;
+ wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->vfilter[WIENER_HALFWIN] =  // centre tap makes the stored taps sum to zero
+ -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
+ wiener_info->vfilter[2]);
+
+ if (wiener_win == WIENER_WIN)  // same scheme for the horizontal filter
+ wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ else
+ wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0;
+ wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
+ wiener_info->hfilter[2]);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));  // the new filter becomes the reference for the next read
+}
+
+// Reads the self-guided projection filter parameters of one restoration unit
+// into sgrproj_info: the parameter-set index ep, then the weights xqd[0] and
+// xqd[1]. Weights are coded relative to ref_sgrproj_info, which is overwritten
+// with the newly read values before returning.
+//
+// Depending on the radii of the selected av1_sgr_params[] entry:
+//   r[0] == 0: xqd[0] is 0 and only xqd[1] is read;
+//   r[1] == 0: only xqd[0] is read and xqd[1] is derived from it;
+//   otherwise: both weights are read, xqd[0] first.
+static AOM_INLINE void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+                                           SgrprojInfo *ref_sgrproj_info,
+                                           aom_reader *rb) {
+  sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
+  const sgr_params_type *const sgr = &av1_sgr_params[sgrproj_info->ep];
+  const int skip_first = sgr->r[0] == 0;
+  const int skip_second = !skip_first && sgr->r[1] == 0;
+
+  if (skip_first) {
+    sgrproj_info->xqd[0] = 0;
+  } else {
+    sgrproj_info->xqd[0] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+        SGRPROJ_PRJ_MIN0;
+  }
+
+  if (skip_second) {
+    // Not coded: derived from xqd[0] and clamped to the valid range.
+    sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
+                                 SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+  } else {
+    sgrproj_info->xqd[1] =
+        aom_read_primitive_refsubexpfin(
+            rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+            ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+        SGRPROJ_PRJ_MIN1;
+  }
+
+  memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
+// Reads the restoration type and filter coefficients of restoration unit
+// runit_idx in the given plane and stores them in
+// cm->rst_info[plane].unit_info[runit_idx].
+//
+// With a switchable frame type the unit type is read as a symbol; with a
+// Wiener or self-guided frame type one binary symbol says whether the unit
+// uses that filter or none. Filter parameters are coded relative to the
+// per-plane references in xd->wiener_info / xd->sgrproj_info.
+static AOM_INLINE void loop_restoration_read_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane,
+    int runit_idx) {
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  RestorationUnitInfo *unit = &rsi->unit_info[runit_idx];
+  assert(rsi->frame_restoration_type != RESTORE_NONE);
+
+  assert(!cm->features.all_lossless);
+
+  const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+  WienerInfo *ref_wiener = &xd->wiener_info[plane];
+  SgrprojInfo *ref_sgrproj = &xd->sgrproj_info[plane];
+
+  switch (rsi->frame_restoration_type) {
+    case RESTORE_SWITCHABLE:
+      unit->restoration_type =
+          aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
+                          RESTORE_SWITCHABLE_TYPES, ACCT_STR);
+      if (unit->restoration_type == RESTORE_WIENER) {
+        read_wiener_filter(wiener_win, &unit->wiener_info, ref_wiener, r);
+      } else if (unit->restoration_type == RESTORE_SGRPROJ) {
+        read_sgrproj_filter(&unit->sgrproj_info, ref_sgrproj, r);
+      } else {
+        assert(unit->restoration_type == RESTORE_NONE);
+      }
+      break;
+    case RESTORE_WIENER: {
+      const int use_wiener =
+          aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR);
+      unit->restoration_type = use_wiener ? RESTORE_WIENER : RESTORE_NONE;
+      if (use_wiener) {
+        read_wiener_filter(wiener_win, &unit->wiener_info, ref_wiener, r);
+      }
+      break;
+    }
+    case RESTORE_SGRPROJ: {
+      const int use_sgrproj =
+          aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR);
+      unit->restoration_type = use_sgrproj ? RESTORE_SGRPROJ : RESTORE_NONE;
+      if (use_sgrproj) {
+        read_sgrproj_filter(&unit->sgrproj_info, ref_sgrproj, r);
+      }
+      break;
+    }
+    default: break;
+  }
+}
+
+// Reads the frame-level loop-filter parameters from rb into cm->lf and stores
+// the resulting reference/mode deltas in cm->cur_frame.
+//
+// When intra block copy is allowed or the frame is coded lossless, nothing is
+// read and the current frame receives the default deltas. Otherwise the deltas
+// start from the previous frame's values (or the defaults if there is no
+// previous frame) and may be updated individually from the bitstream.
+static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
+                                        struct aom_read_bit_buffer *rb) {
+  struct loopfilter *const lf = &cm->lf;
+  const int num_planes = av1_num_planes(cm);
+
+  if (cm->features.allow_intrabc || cm->features.coded_lossless) {
+    // write default deltas to frame buffer
+    av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+    av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+    return;
+  }
+  assert(!cm->features.coded_lossless);
+
+  // Starting point for the deltas.
+  if (!cm->prev_frame) {
+    av1_set_default_ref_deltas(lf->ref_deltas);
+    av1_set_default_mode_deltas(lf->mode_deltas);
+  } else {
+    memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+    memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+  }
+
+  lf->filter_level[0] = aom_rb_read_literal(rb, 6);
+  lf->filter_level[1] = aom_rb_read_literal(rb, 6);
+  // The u/v levels are only coded when one of the first two levels is
+  // non-zero.
+  const int first_levels_set = lf->filter_level[0] || lf->filter_level[1];
+  if (num_planes > 1 && first_levels_set) {
+    lf->filter_level_u = aom_rb_read_literal(rb, 6);
+    lf->filter_level_v = aom_rb_read_literal(rb, 6);
+  }
+  lf->sharpness_level = aom_rb_read_literal(rb, 3);
+
+  // Read in loop filter deltas applied at the MB level based on mode or ref
+  // frame.
+  lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
+  lf->mode_ref_delta_update =
+      lf->mode_ref_delta_enabled ? aom_rb_read_bit(rb) : 0;
+  if (lf->mode_ref_delta_update) {
+    // Each delta has its own "update" bit followed by a signed 6-bit value.
+    for (int ref = 0; ref < REF_FRAMES; ref++) {
+      if (aom_rb_read_bit(rb))
+        lf->ref_deltas[ref] = aom_rb_read_inv_signed_literal(rb, 6);
+    }
+    for (int mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
+      if (aom_rb_read_bit(rb))
+        lf->mode_deltas[mode] = aom_rb_read_inv_signed_literal(rb, 6);
+    }
+  }
+
+  // write deltas to frame buffer
+  memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
+  memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
+}
+
+// Reads the frame-level CDEF parameters from rb into cm->cdef_info: damping
+// (2 bits, stored plus 3), the number of strength bits (2 bits), and then
+// 1 << cdef_bits strength entries. The second strength of each entry is read
+// only when the frame has more than one plane and is 0 otherwise. Nothing is
+// read when intra block copy is allowed.
+static AOM_INLINE void setup_cdef(AV1_COMMON *cm,
+                                  struct aom_read_bit_buffer *rb) {
+  const int has_chroma = av1_num_planes(cm) > 1;
+  CdefInfo *const cdef = &cm->cdef_info;
+
+  if (cm->features.allow_intrabc) return;
+
+  cdef->cdef_damping = aom_rb_read_literal(rb, 2) + 3;
+  cdef->cdef_bits = aom_rb_read_literal(rb, 2);
+  cdef->nb_cdef_strengths = 1 << cdef->cdef_bits;
+
+  for (int idx = 0; idx < cdef->nb_cdef_strengths; idx++) {
+    cdef->cdef_strengths[idx] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+    if (has_chroma) {
+      cdef->cdef_uv_strengths[idx] =
+          aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+    } else {
+      cdef->cdef_uv_strengths[idx] = 0;
+    }
+  }
+}
+
+// Reads an optional quantizer delta from rb: one presence bit, followed by a
+// signed 6-bit literal when that bit is set. Returns 0 when the delta is
+// absent.
+static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
+  if (!aom_rb_read_bit(rb)) return 0;
+  return aom_rb_read_inv_signed_literal(rb, 6);
+}
+
+// Reads the frame-level quantization parameters from rb into quant_params:
+// the base q index, the DC/AC deltas for each plane, and the quantizer-matrix
+// levels.
+//   num_planes: with a single plane, all u/v deltas are set to 0 and not read.
+//   separate_uv_delta_q: allows the bitstream to carry distinct v deltas and a
+//   distinct v quantizer-matrix level; otherwise the v values mirror u.
+static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params,
+                                          int num_planes,
+                                          bool separate_uv_delta_q,
+                                          struct aom_read_bit_buffer *rb) {
+  quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
+  quant_params->y_dc_delta_q = read_delta_q(rb);
+
+  if (num_planes <= 1) {
+    quant_params->u_dc_delta_q = 0;
+    quant_params->u_ac_delta_q = 0;
+    quant_params->v_dc_delta_q = 0;
+    quant_params->v_ac_delta_q = 0;
+  } else {
+    // One bit says whether v carries its own deltas; it is only present when
+    // separate deltas are permitted.
+    const int v_has_own_deltas =
+        separate_uv_delta_q ? aom_rb_read_bit(rb) : 0;
+    quant_params->u_dc_delta_q = read_delta_q(rb);
+    quant_params->u_ac_delta_q = read_delta_q(rb);
+    if (v_has_own_deltas) {
+      quant_params->v_dc_delta_q = read_delta_q(rb);
+      quant_params->v_ac_delta_q = read_delta_q(rb);
+    } else {
+      quant_params->v_dc_delta_q = quant_params->u_dc_delta_q;
+      quant_params->v_ac_delta_q = quant_params->u_ac_delta_q;
+    }
+  }
+
+  quant_params->using_qmatrix = aom_rb_read_bit(rb);
+  if (!quant_params->using_qmatrix) {
+    quant_params->qmatrix_level_y = 0;
+    quant_params->qmatrix_level_u = 0;
+    quant_params->qmatrix_level_v = 0;
+    return;
+  }
+
+  quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+  quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+  if (separate_uv_delta_q) {
+    quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+  } else {
+    quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
+  }
+}
+
+ // Fills the per-segment dequantizer entries (index 0 = DC, index 1 = AC) and
+ // the per-transform-size inverse quantization matrices for Y, U and V.
+ static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm,
+                                                   MACROBLOCKD *const xd) {
+   CommonQuantParams *const qp = &cm->quant_params;
+   const int bit_depth = cm->seq_params->bit_depth;
+   // With segmentation disabled only entry 0 is used; the rest are
+   // don't-cares and are left untouched.
+   const int num_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
+
+   for (int seg = 0; seg < num_segments; ++seg) {
+     const int qindex = xd->qindex[seg];
+
+     qp->y_dequant_QTX[seg][0] =
+         av1_dc_quant_QTX(qindex, qp->y_dc_delta_q, bit_depth);
+     qp->y_dequant_QTX[seg][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
+     qp->u_dequant_QTX[seg][0] =
+         av1_dc_quant_QTX(qindex, qp->u_dc_delta_q, bit_depth);
+     qp->u_dequant_QTX[seg][1] =
+         av1_ac_quant_QTX(qindex, qp->u_ac_delta_q, bit_depth);
+     qp->v_dequant_QTX[seg][0] =
+         av1_dc_quant_QTX(qindex, qp->v_dc_delta_q, bit_depth);
+     qp->v_dequant_QTX[seg][1] =
+         av1_ac_quant_QTX(qindex, qp->v_ac_delta_q, bit_depth);
+
+     // NB: depends on base index so there is only 1 set per frame.
+     // When quant weighting is off (lossless, or signalled as not using QM)
+     // the last level, NUM_QM_LEVELS - 1, is selected for every plane.
+     const int use_qmatrix = av1_use_qmatrix(qp, xd, seg);
+
+     int level_y = NUM_QM_LEVELS - 1;
+     if (use_qmatrix) level_y = qp->qmatrix_level_y;
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       qp->y_iqmatrix[seg][tx] = av1_iqmatrix(qp, level_y, AOM_PLANE_Y, tx);
+     }
+
+     int level_u = NUM_QM_LEVELS - 1;
+     if (use_qmatrix) level_u = qp->qmatrix_level_u;
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       qp->u_iqmatrix[seg][tx] = av1_iqmatrix(qp, level_u, AOM_PLANE_U, tx);
+     }
+
+     int level_v = NUM_QM_LEVELS - 1;
+     if (use_qmatrix) level_v = qp->qmatrix_level_v;
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       qp->v_iqmatrix[seg][tx] = av1_iqmatrix(qp, level_v, AOM_PLANE_V, tx);
+     }
+   }
+ }
+
+ // Reads the frame-level interpolation filter: one bit selects SWITCHABLE,
+ // otherwise the filter is read as a LOG_SWITCHABLE_FILTERS-bit literal.
+ static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
+   const int is_switchable = aom_rb_read_bit(rb);
+   if (is_switchable) return SWITCHABLE;
+   return aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
+ }
+
+ // Sets the render size. It defaults to the superres-upscaled frame size and
+ // is replaced by an explicitly coded size when the presence bit is set.
+ static AOM_INLINE void setup_render_size(AV1_COMMON *cm,
+                                          struct aom_read_bit_buffer *rb) {
+   cm->render_width = cm->superres_upscaled_width;
+   cm->render_height = cm->superres_upscaled_height;
+
+   const int render_size_is_coded = aom_rb_read_bit(rb);
+   if (!render_size_is_coded) return;
+
+   av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
+ }
+
+ // Records the upscaled frame size and, when super-resolution is enabled for
+ // the sequence, parses the scale denominator and shrinks *width / *height to
+ // the coded (downscaled) size.
+ // TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
+ static AOM_INLINE void setup_superres(AV1_COMMON *const cm,
+                                       struct aom_read_bit_buffer *rb,
+                                       int *width, int *height) {
+   cm->superres_upscaled_width = *width;
+   cm->superres_upscaled_height = *height;
+
+   if (!cm->seq_params->enable_superres) return;
+
+   const int use_superres = aom_rb_read_bit(rb);
+   if (!use_superres) {
+     // 1:1 scaling - ie. no scaling, scale not provided
+     cm->superres_scale_denominator = SCALE_NUMERATOR;
+     return;
+   }
+
+   cm->superres_scale_denominator =
+       (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
+   cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN;
+   // Only the caller's width/height are adjusted here. Editing cm->width or
+   // cm->height directly would stop the buffers from being resized correctly.
+   av1_calculate_scaled_superres_size(width, height,
+                                      cm->superres_scale_denominator);
+ }
+
+ // Applies the coded frame size to the decoder context.
+ //
+ // When CONFIG_SIZE_LIMIT is set, sizes above DECODE_WIDTH_LIMIT x
+ // DECODE_HEIGHT_LIMIT are reported as AOM_CODEC_CORRUPT_FRAME. If the size
+ // differs from cm->width / cm->height, the context buffers are reallocated
+ // when the mode-info grid grows in either dimension, or only re-dimensioned
+ // through set_mb_mi() otherwise. The current frame's MV buffer is ensured and
+ // its width/height are updated on every call.
+ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width,
+ int height) {
+ #if CONFIG_SIZE_LIMIT
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Dimensions of %dx%d beyond allowed size of %dx%d.",
+ width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+ #endif
+ if (cm->width != width || cm->height != height) {
+ // Frame size expressed in mode-info units, rounded up.
+ const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2);
+ const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2);
+
+ // Allocations in av1_alloc_context_buffers() depend on individual
+ // dimensions as well as the overall size.
+ if (new_mi_cols > cm->mi_params.mi_cols ||
+ new_mi_rows > cm->mi_params.mi_rows) {
+ if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) {
+ // The cm->mi_* values have been cleared and any existing context
+ // buffers have been freed. Clear cm->width and cm->height to be
+ // consistent and to force a realloc next time.
+ cm->width = 0;
+ cm->height = 0;
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+ } else {
+ // Existing buffers are large enough: only update the mi dimensions.
+ cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4);
+ }
+ av1_init_mi_buffers(&cm->mi_params);
+ cm->width = width;
+ cm->height = height;
+ }
+
+ // Done even when the size is unchanged.
+ ensure_mv_buffer(cm->cur_frame, cm);
+ cm->cur_frame->width = cm->width;
+ cm->cur_frame->height = cm->height;
+ }
+
+ // (Re)allocates the current frame's pixel buffer for cm->width x cm->height
+ // and copies the sequence-level colour description and the render size into
+ // it. The buffer pool is locked for the duration of the reallocation; an
+ // allocation failure is reported as AOM_CODEC_MEM_ERROR.
+ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
+ BufferPool *const pool = cm->buffer_pool;
+ const SequenceHeader *const seq_params = cm->seq_params;
+
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment,
+ &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0,
+ 0)) {
+ // Release the pool lock before reporting the error.
+ // NOTE(review): presumably aom_internal_error() does not return here;
+ // if it did, the pool would be unlocked a second time below - confirm.
+ unlock_buffer_pool(pool);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+
+ // Propagate sequence-level format/colour metadata to the frame buffer.
+ cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth;
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
+ }
+
+ // Determines the frame size, either explicitly coded (when
+ // frame_size_override_flag is set) or taken from the sequence maximum, then
+ // runs the superres, context-buffer, render-size and frame-buffer setup in
+ // that order.
+ static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
+                                         int frame_size_override_flag,
+                                         struct aom_read_bit_buffer *rb) {
+   const SequenceHeader *const seq = cm->seq_params;
+   int width, height;
+
+   if (!frame_size_override_flag) {
+     width = seq->max_frame_width;
+     height = seq->max_frame_height;
+   } else {
+     av1_read_frame_size(rb, seq->num_bits_width, seq->num_bits_height, &width,
+                         &height);
+     const int exceeds_max =
+         width > seq->max_frame_width || height > seq->max_frame_height;
+     if (exceeds_max) {
+       aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+                          "Frame dimensions are larger than the maximum values");
+     }
+   }
+
+   setup_superres(cm, rb, &width, &height);
+   resize_context_buffers(cm, width, height);
+   setup_render_size(cm, rb);
+   setup_buffer_pool(cm);
+ }
+
+ // Reads the one-bit superblock size selector: set means 128x128, clear means
+ // 64x64.
+ static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params,
+                                      struct aom_read_bit_buffer *rb) {
+   const int use_128x128 = aom_rb_read_bit(rb);
+   if (use_128x128) {
+     set_sb_size(seq_params, BLOCK_128X128);
+   } else {
+     set_sb_size(seq_params, BLOCK_64X64);
+   }
+ }
+
+ // Returns non-zero when a reference frame and the current frame agree on bit
+ // depth and on both chroma subsampling factors.
+ static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
+                                           int ref_xss, int ref_yss,
+                                           aom_bit_depth_t this_bit_depth,
+                                           int this_xss, int this_yss) {
+   if (ref_bit_depth != this_bit_depth) return 0;
+   if (ref_xss != this_xss) return 0;
+   return ref_yss == this_yss;
+ }
+
+ // Determines the frame size for a frame that may inherit it from one of its
+ // references.
+ //
+ // One bit is read per reference (LAST_FRAME..ALTREF_FRAME); the first set bit
+ // selects the reference whose cropped luma size and render size are copied.
+ // If no bit is set the size is coded explicitly. The result is then checked:
+ // it must be positive, at least one reference must pass
+ // valid_ref_frame_size(), and every reference must match the sequence's bit
+ // depth and chroma subsampling. Finally the frame buffer is set up.
+ static AOM_INLINE void setup_frame_size_with_refs(
+ AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ int width, height;
+ int found = 0;
+ int has_valid_ref_frame = 0;
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if (aom_rb_read_bit(rb)) {
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+ // This will never be NULL in a normal stream, as streams are required to
+ // have a shown keyframe before any inter frames, which would refresh all
+ // the reference buffers. However, it might be null if we're starting in
+ // the middle of a stream, and static analysis will error if we don't do
+ // a null check here.
+ if (ref_buf == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid condition: invalid reference buffer");
+ } else {
+ // Inherit both the coded size and the render size from this reference.
+ const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
+ width = buf->y_crop_width;
+ height = buf->y_crop_height;
+ cm->render_width = buf->render_width;
+ cm->render_height = buf->render_height;
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (!found) {
+ // No reference selected: the size is coded explicitly.
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+
+ av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ setup_render_size(cm, rb);
+ }
+
+ if (width <= 0 || height <= 0)
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid frame size");
+
+ // Check to make sure at least one of frames that this frame references
+ // has valid dimensions.
+ // NOTE(review): ref_frame is dereferenced without a NULL check in the two
+ // loops below; presumably the caller has validated every reference slot
+ // before this point - confirm.
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
+ has_valid_ref_frame |=
+ valid_ref_frame_size(ref_frame->buf.y_crop_width,
+ ref_frame->buf.y_crop_height, width, height);
+ }
+ if (!has_valid_ref_frame)
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has invalid size");
+ // Every reference must share the sequence's bit depth and subsampling.
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
+ if (!valid_ref_frame_img_fmt(
+ ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
+ ref_frame->buf.subsampling_y, seq_params->bit_depth,
+ seq_params->subsampling_x, seq_params->subsampling_y))
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has incompatible color format");
+ }
+ setup_buffer_pool(cm);
+ }
+
+ // Same function as av1_read_uniform, but reading from the uncompressed
+ // header bit buffer rb.
+ //
+ // With l = get_unsigned_bits(n) and m = (1 << l) - n, an (l - 1)-bit literal
+ // v is read first. Values v < m are returned as-is; otherwise one more bit
+ // is read and the result is (v << 1) - m + bit.
+ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
+   const int l = get_unsigned_bits(n);
+   // Checked before any read: l - 1 is used as the literal's bit count, so
+   // l == 0 must be caught ahead of the call rather than after it.
+   assert(l != 0);
+   const int m = (1 << l) - n;
+   const int v = aom_rb_read_literal(rb, l - 1);
+   if (v < m)
+     return v;
+   else
+     return (v << 1) - m + aom_rb_read_bit(rb);
+ }
+
+ // Parses the tile layout from the uncompressed header.
+ //
+ // With uniform spacing, log2_cols / log2_rows start at their minimum and are
+ // incremented once per set bit, up to their maximum. Otherwise each tile's
+ // size in superblocks is read with rb_read_uniform() until the frame width /
+ // height in superblocks is used up or MAX_TILE_COLS / MAX_TILE_ROWS tiles
+ // have been read. av1_calculate_tile_cols() / av1_calculate_tile_rows() are
+ // called after the respective dimension has been parsed.
+ static AOM_INLINE void read_tile_info_max_tile(
+ AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ // Frame dimensions in superblocks, rounded up.
+ int width_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+ int height_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+
+ av1_get_tile_limits(cm);
+ tiles->uniform_spacing = aom_rb_read_bit(rb);
+
+ // Read tile columns
+ if (tiles->uniform_spacing) {
+ tiles->log2_cols = tiles->min_log2_cols;
+ while (tiles->log2_cols < tiles->max_log2_cols) {
+ if (!aom_rb_read_bit(rb)) {
+ break;
+ }
+ tiles->log2_cols++;
+ }
+ } else {
+ int i;
+ int start_sb;
+ // width_sb counts the superblock columns not yet assigned to a tile.
+ for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
+ const int size_sb =
+ 1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb));
+ tiles->col_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ width_sb -= size_sb;
+ }
+ tiles->cols = i;
+ // Terminating entry: one past the last tile column.
+ tiles->col_start_sb[i] = start_sb + width_sb;
+ }
+ av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols, tiles);
+
+ // Read tile rows
+ if (tiles->uniform_spacing) {
+ tiles->log2_rows = tiles->min_log2_rows;
+ while (tiles->log2_rows < tiles->max_log2_rows) {
+ if (!aom_rb_read_bit(rb)) {
+ break;
+ }
+ tiles->log2_rows++;
+ }
+ } else {
+ int i;
+ int start_sb;
+ // height_sb counts the superblock rows not yet assigned to a tile.
+ for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
+ const int size_sb =
+ 1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb));
+ tiles->row_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ height_sb -= size_sb;
+ }
+ tiles->rows = i;
+ // Terminating entry: one past the last tile row.
+ tiles->row_start_sb[i] = start_sb + height_sb;
+ }
+ av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles);
+ }
+
+ // Decides whether single-tile decoding can be used. It is enabled only for
+ // large-scale-tile streams in which the loop filter, CDEF and loop
+ // restoration are all inactive for the frame.
+ void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
+   cm->tiles.single_tile_decoding = 0;
+   if (!cm->tiles.large_scale) return;
+
+   struct loopfilter *lf = &cm->lf;
+   RestorationInfo *const rst_info = cm->rst_info;
+   const CdefInfo *const cdef_info = &cm->cdef_info;
+
+   // Loop filter is off when both filter_level entries are zero.
+   const int no_loopfilter = lf->filter_level[0] == 0 && lf->filter_level[1] == 0;
+   // CDEF is off when there is a single strength pair and both are zero.
+   const int no_cdef = cdef_info->cdef_bits == 0 &&
+                       cdef_info->cdef_strengths[0] == 0 &&
+                       cdef_info->cdef_uv_strengths[0] == 0;
+   // Restoration is off when no plane selects a restoration type.
+   int no_restoration = 1;
+   for (int plane = 0; plane < 3; ++plane) {
+     if (rst_info[plane].frame_restoration_type != RESTORE_NONE) {
+       no_restoration = 0;
+       break;
+     }
+   }
+
+   assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef));
+   assert(IMPLIES(cm->features.all_lossless, no_restoration));
+   cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
+ }
+
+ // Parses the tile layout and, when the frame has more than one tile, the id
+ // of the tile used for the CDF update and the number of bytes used to code
+ // each tile size.
+ static AOM_INLINE void read_tile_info(AV1Decoder *const pbi,
+                                       struct aom_read_bit_buffer *const rb) {
+   AV1_COMMON *const cm = &pbi->common;
+
+   read_tile_info_max_tile(cm, rb);
+
+   pbi->context_update_tile_id = 0;
+   if (cm->tiles.rows * cm->tiles.cols <= 1) return;
+
+   // tile to use for cdf update
+   const int id_bits = cm->tiles.log2_rows + cm->tiles.log2_cols;
+   pbi->context_update_tile_id = aom_rb_read_literal(rb, id_bits);
+   if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
+     aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                        "Invalid context_update_tile_id");
+   }
+
+   // tile size magnitude
+   pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+
+#if EXT_TILE_DEBUG
+ // Reads the large-scale-tile size-field widths. The reader is first advanced
+ // to the next byte boundary; when the frame has more than one tile, the byte
+ // counts for tile-column sizes and tile sizes are then read (2 bits each,
+ // plus one).
+ static AOM_INLINE void read_ext_tile_info(
+     AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) {
+   AV1_COMMON *const cm = &pbi->common;
+
+   // This information is stored as a separate byte.
+   const int bits_into_byte = rb->bit_offset % CHAR_BIT;
+   if (bits_into_byte > 0) {
+     aom_rb_read_literal(rb, CHAR_BIT - bits_into_byte);
+   }
+   assert(rb->bit_offset % CHAR_BIT == 0);
+
+   const int num_tiles = cm->tiles.cols * cm->tiles.rows;
+   if (num_tiles <= 1) return;
+
+   // Read the number of bytes used to store tile size
+   pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+   pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+#endif // EXT_TILE_DEBUG
+
+ // Reads a size field of sz bytes (1 to 4) from src, using the little-endian
+ // helpers for the multi-byte cases. Any other sz trips an assert and yields
+ // (size_t)-1.
+ static size_t mem_get_varsize(const uint8_t *src, int sz) {
+   if (sz == 1) return src[0];
+   if (sz == 2) return mem_get_le16(src);
+   if (sz == 3) return mem_get_le24(src);
+   if (sz == 4) return mem_get_le32(src);
+
+   assert(0 && "Invalid size");
+   return -1;
+ }
+
+#if EXT_TILE_DEBUG
+ // Locates the next tile of a large-scale-tile column and records it in
+ // tile_buffers[row][col].
+ //
+ // A tile_size_bytes-wide header is read at *data. When tile_copy_mode is set
+ // and the header's top bit is 1, the tile carries no payload: the low 7 bits
+ // of the top header byte give a row offset and the entry is copied from
+ // tile_buffers[row - offset][col]. Otherwise the header value plus
+ // AV1_MIN_TILE_SIZE_BYTES is the payload size. On return, '*data' points to
+ // the end of the raw tile buffer in the bit stream.
+ //
+ // Errors are reported through error_info as AOM_CODEC_CORRUPT_FRAME.
+ static AOM_INLINE void get_ls_tile_buffer(
+     const uint8_t *const data_end, struct aom_internal_error_info *error_info,
+     const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+     int tile_size_bytes, int col, int row, int tile_copy_mode) {
+   size_t size;
+
+   size_t copy_size = 0;
+   const uint8_t *copy_data = NULL;
+
+   if (!read_is_valid(*data, tile_size_bytes, data_end))
+     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                        "Truncated packet or corrupt tile length");
+   size = mem_get_varsize(*data, tile_size_bytes);
+
+   // If tile_copy_mode = 1, then the top bit of the tile header indicates copy
+   // mode.
+   if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
+     // The remaining bits in the top byte signal the row offset
+     const int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
+
+     // The offset comes straight from the bitstream. Reject values that
+     // would index before the first tile row.
+     if (offset > row) {
+       aom_internal_error(
+           error_info, AOM_CODEC_CORRUPT_FRAME,
+           "Invalid row offset in tile copy mode: row=%d offset=%d", row,
+           offset);
+       // Defensive: never fall through to the out-of-range index should the
+       // error handler return.
+       return;
+     }
+
+     // Currently, only use tiles in same column as reference tiles.
+     copy_data = tile_buffers[row - offset][col].data;
+     copy_size = tile_buffers[row - offset][col].size;
+     size = 0;
+   } else {
+     size += AV1_MIN_TILE_SIZE_BYTES;
+   }
+
+   *data += tile_size_bytes;
+
+   if (size > (size_t)(data_end - *data))
+     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                        "Truncated packet or corrupt tile size");
+
+   if (size > 0) {
+     tile_buffers[row][col].data = *data;
+     tile_buffers[row][col].size = size;
+   } else {
+     // Copy mode: alias the referenced tile's buffer.
+     tile_buffers[row][col].data = copy_data;
+     tile_buffers[row][col].size = copy_size;
+   }
+
+   *data += size;
+ }
+
+ // Locates the tile buffers needed for large-scale-tile decoding and returns
+ // the end of the last tile buffer
+ // (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]), or NULL when the
+ // frame consists of a single tile.
+ //
+ // Every tile column except the last is preceded by a
+ // pbi->tile_col_size_bytes-wide size field. These fields come from the
+ // bitstream, so each one is checked against data_end before it is read and
+ // before its value is used to derive a column end pointer.
+ static const uint8_t *get_ls_tile_buffers(
+     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+   AV1_COMMON *const cm = &pbi->common;
+   const int tile_cols = cm->tiles.cols;
+   const int tile_rows = cm->tiles.rows;
+   const int have_tiles = tile_cols * tile_rows > 1;
+   const uint8_t *raw_data_end;  // The end of the last tile buffer
+
+   if (!have_tiles) {
+     const size_t tile_size = data_end - data;
+     tile_buffers[0][0].data = data;
+     tile_buffers[0][0].size = tile_size;
+     raw_data_end = NULL;
+   } else {
+     // We locate only the tile buffers that are required, which are the ones
+     // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+     // need the last (bottom right) tile buffer, as we need to know where the
+     // end of the compressed frame buffer is for proper superframe decoding.
+
+     const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL };
+     const uint8_t *const data_start = data;
+
+     const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+     const int single_row = pbi->dec_tile_row >= 0;
+     const int tile_rows_start = single_row ? dec_tile_row : 0;
+     const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+     const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+     const int single_col = pbi->dec_tile_col >= 0;
+     const int tile_cols_start = single_col ? dec_tile_col : 0;
+     const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+     const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+     const int tile_size_bytes = pbi->tile_size_bytes;
+     int tile_width, tile_height;
+     av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+     // Copy mode is only used when the larger tile dimension is at most 256
+     // pixels.
+     const int tile_copy_mode =
+         ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0;
+     // Read tile column sizes for all columns (we need the last tile buffer)
+     for (int c = 0; c < tile_cols; ++c) {
+       const int is_last = c == tile_cols - 1;
+       size_t tile_col_size;
+
+       if (!is_last) {
+         // The size field itself must lie inside the input.
+         if (!read_is_valid(data, tile_col_size_bytes, data_end))
+           aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                              "Not enough data to read tile_col_size");
+         tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+         data += tile_col_size_bytes;
+         // The column it describes must lie inside the input as well.
+         if (tile_col_size > (size_t)(data_end - data))
+           aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                              "tile_col_data_end[%d] is out of bound", c);
+         tile_col_data_end[c] = data + tile_col_size;
+       } else {
+         // The last column has no size field; it extends to data_end.
+         tile_col_size = data_end - data;
+         tile_col_data_end[c] = data_end;
+       }
+       data += tile_col_size;
+     }
+
+     data = data_start;
+
+     // Read the required tile sizes.
+     for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+       const int is_last = c == tile_cols - 1;
+
+       if (c > 0) data = tile_col_data_end[c - 1];
+
+       // Skip this column's size field (validated in the loop above).
+       if (!is_last) data += tile_col_size_bytes;
+
+       // Get the whole of the last column, otherwise stop at the required tile.
+       for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+         get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
+                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+       }
+     }
+
+     // If we have not read the last column, then read it to get the last tile.
+     if (tile_cols_end != tile_cols) {
+       const int c = tile_cols - 1;
+
+       data = tile_col_data_end[c - 1];
+
+       for (int r = 0; r < tile_rows; ++r) {
+         get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
+                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+       }
+     }
+     raw_data_end = data;
+   }
+   return raw_data_end;
+ }
+#endif // EXT_TILE_DEBUG
+
+ // Records 'data' as the buffer of the single tile selected by
+ // pbi->dec_tile_row / pbi->dec_tile_col, sized by pbi->coded_tile_data_size,
+ // and returns the address just past that tile's data.
+ static const uint8_t *get_ls_single_tile_buffer(
+     AV1Decoder *pbi, const uint8_t *data,
+     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+   assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0);
+
+   TileBufferDec *const target =
+       &tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col];
+   target->data = data;
+   target->size = (size_t)pbi->coded_tile_data_size;
+
+   return data + pbi->coded_tile_data_size;
+ }
+
+ // Locates the next tile in the stream, stores its start and size in 'buf'
+ // and advances '*data' past it. The last tile has no size field and takes
+ // all remaining bytes; every other tile is preceded by a
+ // tile_size_bytes-wide size field, to which AV1_MIN_TILE_SIZE_BYTES is added.
+ static AOM_INLINE void get_tile_buffer(
+     const uint8_t *const data_end, const int tile_size_bytes, int is_last,
+     struct aom_internal_error_info *error_info, const uint8_t **data,
+     TileBufferDec *const buf) {
+   size_t tile_bytes;
+
+   if (is_last) {
+     tile_bytes = data_end - *data;
+   } else {
+     if (!read_is_valid(*data, tile_size_bytes, data_end)) {
+       aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                          "Not enough data to read tile size");
+     }
+
+     tile_bytes =
+         mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
+     *data += tile_size_bytes;
+
+     if (tile_bytes > (size_t)(data_end - *data)) {
+       aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                          "Truncated packet or corrupt tile size");
+     }
+   }
+
+   buf->data = *data;
+   buf->size = tile_bytes;
+
+   *data += tile_bytes;
+ }
+
+ // Walks the tiles in raster order and fills tile_buffers[][] for the tiles
+ // whose raster index lies in [start_tile, end_tile]. The tile at end_tile is
+ // treated as the last one and takes all remaining data.
+ static AOM_INLINE void get_tile_buffers(
+     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile,
+     int end_tile) {
+   AV1_COMMON *const cm = &pbi->common;
+   const int num_cols = cm->tiles.cols;
+   const int num_rows = cm->tiles.rows;
+
+   for (int row = 0; row < num_rows; ++row) {
+     for (int col = 0; col < num_cols; ++col) {
+       // Raster-order index of this tile.
+       const int tile_idx = row * num_cols + col;
+       if (tile_idx < start_tile || tile_idx > end_tile) continue;
+
+       if (data >= data_end) {
+         aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                            "Data ended before all tiles were read.");
+       }
+       get_tile_buffer(data_end, pbi->tile_size_bytes, tile_idx == end_tile,
+                       &pbi->error, &data, &tile_buffers[row][col]);
+     }
+   }
+ }
+
+ // Points the coding-block state at the CB_BUFFER belonging to the superblock
+ // containing (mi_row, mi_col) and resets the per-plane write offsets.
+ static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb,
+                                      CB_BUFFER *cb_buffer_base,
+                                      const int num_planes, int mi_row,
+                                      int mi_col) {
+   AV1_COMMON *const cm = &pbi->common;
+   const int sb_log2 = cm->seq_params->mib_size_log2;
+   // Buffers are laid out row-major, one per superblock.
+   const int sb_stride = (cm->mi_params.mi_cols >> sb_log2) + 1;
+   const int sb_row = mi_row >> sb_log2;
+   const int sb_col = mi_col >> sb_log2;
+   CB_BUFFER *const sb_buf = cb_buffer_base + (sb_row * sb_stride + sb_col);
+
+   for (int p = 0; p < num_planes; ++p) {
+     dcb->dqcoeff_block[p] = sb_buf->dqcoeff[p];
+     dcb->eob_data[p] = sb_buf->eob_data[p];
+     dcb->cb_offset[p] = 0;
+     dcb->txb_offset[p] = 0;
+   }
+
+   // Only two colour-index maps are set up, for plane[0] and plane[1].
+   MACROBLOCKD *const xd = &dcb->xd;
+   for (int p = 0; p < 2; ++p) {
+     xd->plane[p].color_index_map = sb_buf->color_index_map[p];
+   }
+   xd->color_index_map_offset[0] = 0;
+   xd->color_index_map_offset[1] = 0;
+ }
+
+ // Replaces pbi->tile_data with a fresh 32-byte-aligned array of n_tiles
+ // entries and zeroes each entry's row-MT sync state. allocated_tiles is
+ // reset to 0 before allocating and set to n_tiles only afterwards.
+ static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi,
+                                                const int n_tiles) {
+   AV1_COMMON *const cm = &pbi->common;
+
+   aom_free(pbi->tile_data);
+   pbi->allocated_tiles = 0;
+   CHECK_MEM_ERROR(cm, pbi->tile_data,
+                   aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
+   pbi->allocated_tiles = n_tiles;
+
+   for (int t = 0; t < n_tiles; ++t) {
+     av1_zero(pbi->tile_data[t].dec_row_mt_sync);
+   }
+   pbi->allocated_row_mt_sync_rows = 0;
+ }
+
+ // Returns nsync, the row-MT synchronisation range. The width-dependent
+ // selection is compiled out, so the result is currently always 1.
+ static INLINE int get_sync_range(int width) {
+ #if 0
+   // nsync numbers are picked by testing.
+   if (width < 640) return 1;
+   if (width <= 1280) return 2;
+   if (width <= 4096) return 4;
+   return 8;
+ #else
+   (void)width;
+   return 1;
+ #endif
+ }
+
+ // Allocate memory for decoder row synchronization.
+ //
+ // Allocates 'rows' entries of cur_sb_col and, when CONFIG_MULTITHREAD is set,
+ // 'rows' mutexes and condition variables, each initialised with default
+ // attributes. Allocation failures are routed through CHECK_MEM_ERROR on cm.
+ static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync,
+ AV1_COMMON *cm, int rows) {
+ // Recorded up front; av1_dec_row_mt_dealloc() uses this count when it
+ // destroys the mutexes and condition variables.
+ dec_row_mt_sync->allocated_sb_rows = rows;
+ #if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
+ // NOTE(review): this NULL check looks redundant if CHECK_MEM_ERROR does
+ // not return on failure - confirm against the macro definition.
+ if (dec_row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
+ if (dec_row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
+ }
+ }
+ }
+ #endif // CONFIG_MULTITHREAD
+
+ // One progress counter per superblock row.
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
+
+ // Set up nsync.
+ dec_row_mt_sync->sync_range = get_sync_range(cm->width);
+ }
+
+ // Releases everything owned by a row-MT sync object (mutexes, condition
+ // variables and the per-row progress array) and zeroes the structure.
+ // A NULL argument is ignored.
+ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
+   if (dec_row_mt_sync == NULL) return;
+
+ #if CONFIG_MULTITHREAD
+   const int num_rows = dec_row_mt_sync->allocated_sb_rows;
+
+   if (dec_row_mt_sync->mutex_ != NULL) {
+     for (int row = 0; row < num_rows; ++row) {
+       pthread_mutex_destroy(&dec_row_mt_sync->mutex_[row]);
+     }
+     aom_free(dec_row_mt_sync->mutex_);
+   }
+
+   if (dec_row_mt_sync->cond_ != NULL) {
+     for (int row = 0; row < num_rows; ++row) {
+       pthread_cond_destroy(&dec_row_mt_sync->cond_[row]);
+     }
+     aom_free(dec_row_mt_sync->cond_);
+   }
+ #endif  // CONFIG_MULTITHREAD
+
+   aom_free(dec_row_mt_sync->cur_sb_col);
+
+   // Zero the structure: this call may come from a resize, in which case it
+   // is followed by an _alloc() that may fail.
+   av1_zero(*dec_row_mt_sync);
+ }
+
+ // Blocks until superblock row r - 1 has progressed far enough for column c
+ // of row r to be processed. Does nothing for r == 0, for columns where
+ // (c & (nsync - 1)) is non-zero, or when CONFIG_MULTITHREAD is off.
+ static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c) {
+ #if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+
+ // NOTE(review): the mask test presumably relies on nsync being a power of
+ // two - confirm.
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ // Wait while the row above has not yet published a column beyond
+ // c + nsync + intrabc_extra_top_right_sb_delay.
+ while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync -
+ dec_row_mt_sync->intrabc_extra_top_right_sb_delay) {
+ pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+ #else
+ (void)dec_row_mt_sync;
+ (void)r;
+ (void)c;
+ #endif // CONFIG_MULTITHREAD
+ }
+
+ // Publishes the progress of superblock row r after column c is finished and
+ // signals the row's condition variable. Before the last column, progress is
+ // published only when c is a multiple of nsync. For the last column a value
+ // past the end of the row is published. No-op without CONFIG_MULTITHREAD.
+ static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c, const int sb_cols) {
+ #if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+ int cur;
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ // Last column: publish sb_cols + nsync + delay, which satisfies the wait
+ // condition in sync_read() for every c < sb_cols.
+ cur = sb_cols + nsync + dec_row_mt_sync->intrabc_extra_top_right_sb_delay;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
+
+ dec_row_mt_sync->cur_sb_col[r] = cur;
+
+ pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
+ }
+ #else
+ (void)dec_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ #endif // CONFIG_MULTITHREAD
+ }
+
+ // Marks the superblock row containing xd->mi_row as fully decoded by
+ // publishing its last column through sync_write().
+ static INLINE void signal_decoding_done_for_erroneous_row(
+     AV1Decoder *const pbi, const MACROBLOCKD *const xd) {
+   AV1_COMMON *const cm = &pbi->common;
+   const TileInfo *const tile = &xd->tile;
+
+   const int mi_rows_into_tile = xd->mi_row - tile->mi_row_start;
+   const int sb_row = mi_rows_into_tile >> cm->seq_params->mib_size_log2;
+   const int sb_cols = av1_get_sb_cols_in_tile(cm, tile);
+
+   const int tile_idx = tile->tile_row * cm->tiles.cols + tile->tile_col;
+   AV1DecRowMTSync *const row_sync = &pbi->tile_data[tile_idx].dec_row_mt_sync;
+
+   // Reporting the last column publishes the end-of-row progress value.
+   sync_write(row_sync, sb_row, sb_cols - 1, sb_cols);
+ }
+
+ // Reconstructs one superblock row of a tile for row-based multi-threading.
+ //
+ // For each superblock in the row: select its CB_BUFFER, wait via sync_read()
+ // for the row above, sample the shared row_mt_exit flag, run
+ // decode_partition() with flag 0x2 unless an exit was requested, then publish
+ // progress with sync_write().
+ static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
+ const TileInfo *tile_info,
+ const int mi_row) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileDataDec *const tile_data = pbi->tile_data +
+ tile_info->tile_row * cm->tiles.cols +
+ tile_info->tile_col;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+ int sb_col_in_tile = 0;
+ int row_mt_exit = 0;
+
+ for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ mi_col += cm->seq_params->mib_size, sb_col_in_tile++) {
+ set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);
+
+ sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+
+ // Read the shared exit flag under the row-MT mutex.
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+ #endif
+ row_mt_exit = pbi->frame_row_mt_info.row_mt_exit;
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+ #endif
+
+ if (!row_mt_exit) {
+ // Decoding of the super-block
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params->sb_size, 0x2);
+ }
+
+ // Progress is published even when decoding was skipped.
+ sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
+ sb_cols_in_tile);
+ }
+ }
+
+ // Validates what follows the symbol-coded data of a tile: the byte holding
+ // the final consumed bit must end in a single 1 bit followed by zeros, and
+ // every byte after it must be zero. Returns 0 when valid and -1 otherwise,
+ // including when the reader has overflowed.
+ static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+   if (aom_reader_has_overflowed(r)) return -1;
+
+   const uint32_t bits_used = aom_reader_tell(r);
+   const uint32_t bytes_used = (bits_used + 7) >> 3;
+   const uint8_t *cursor = aom_reader_find_begin(r) + bytes_used;
+
+   // aom_reader_tell() returns 1 for a newly initialized decoder, and the
+   // return value only increases as values are decoded. So bits_used > 0 and
+   // cursor lies past the start of the buffer, which makes cursor[-1] safe.
+   const uint8_t final_byte = cursor[-1];
+   const uint8_t marker = 128 >> ((bits_used - 1) & 7);
+   const int tail_bits = final_byte & (2 * marker - 1);
+   if (tail_bits != marker) return -1;
+
+   // Make sure that all padding bytes are zero as required by the spec.
+   const uint8_t *const stop = aom_reader_find_end(r);
+   for (; cursor < stop; ++cursor) {
+     if (*cursor != 0) return -1;
+   }
+   return 0;
+ }
+
+ // Installs the per-block visitor callbacks on td. Bit 0x1 of
+ // parse_decode_flag enables the coefficient-reading visitors and bit 0x2
+ // enables the prediction/reconstruction visitors; visitors for a cleared bit
+ // are set to the *_void stand-ins.
+ static AOM_INLINE void set_decode_func_pointers(ThreadData *td,
+                                                 int parse_decode_flag) {
+   if (parse_decode_flag & 0x1) {
+     td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
+     td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade;
+   } else {
+     td->read_coeffs_tx_intra_block_visit = decode_block_void;
+     td->read_coeffs_tx_inter_block_visit = decode_block_void;
+   }
+
+   if (parse_decode_flag & 0x2) {
+     td->predict_and_recon_intra_block_visit =
+         predict_and_reconstruct_intra_block;
+     td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
+     td->predict_inter_block_visit = predict_inter_block;
+     td->cfl_store_inter_block_visit = cfl_store_inter_block;
+   } else {
+     td->predict_and_recon_intra_block_visit = decode_block_void;
+     td->inverse_tx_inter_block_visit = decode_block_void;
+     td->predict_inter_block_visit = predict_inter_block_void;
+     td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
+   }
+ }
+
+ // Parses and reconstructs every superblock of tile (tile_row, tile_col) in
+ // raster order. The tile is flagged as corrupted, through dcb->corrupted,
+ // when the bit reader overflows or when the bits trailing the symbol-coded
+ // data are invalid.
+ static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td,
+                                    int tile_row, int tile_col) {
+   TileInfo tile;
+
+   AV1_COMMON *const cm = &pbi->common;
+   const int num_planes = av1_num_planes(cm);
+
+   av1_tile_set_row(&tile, cm, tile_row);
+   av1_tile_set_col(&tile, cm, tile_col);
+   DecoderCodingBlock *const dcb = &td->dcb;
+   MACROBLOCKD *const xd = &dcb->xd;
+
+   // Per-tile state reset.
+   av1_zero_above_context(cm, xd, tile.mi_col_start, tile.mi_col_end, tile_row);
+   av1_reset_loop_filter_delta(xd, num_planes);
+   av1_reset_loop_restoration(xd, num_planes);
+
+   int sb_mi_row = tile.mi_row_start;
+   while (sb_mi_row < tile.mi_row_end) {
+     av1_zero_left_context(xd);
+
+     int sb_mi_col = tile.mi_col_start;
+     while (sb_mi_col < tile.mi_col_end) {
+       // Every superblock uses the thread's own coefficient buffer.
+       set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
+
+       // Bit-stream parsing and decoding of the superblock
+       decode_partition(pbi, td, sb_mi_row, sb_mi_col, td->bit_reader,
+                        cm->seq_params->sb_size, 0x3);
+
+       if (aom_reader_has_overflowed(td->bit_reader)) {
+         aom_merge_corrupted_flag(&dcb->corrupted, 1);
+         return;
+       }
+       sb_mi_col += cm->seq_params->mib_size;
+     }
+     sb_mi_row += cm->seq_params->mib_size;
+   }
+
+   const int bad_trailing_bits =
+       check_trailing_bits_after_symbol_coder(td->bit_reader) != 0;
+   aom_merge_corrupted_flag(&dcb->corrupted, bad_trailing_bits);
+ }
+/*
+ * Single-threaded tile decoder: decodes the tiles of the current tile group
+ * (raster tile indices start_tile..end_tile, inclusive) one after another on
+ * the calling thread, using pbi->td as the work area.
+ *
+ * Returns 'data' unchanged when the selected tile rectangle is empty or does
+ * not intersect [start_tile, end_tile]. Otherwise returns the end of the
+ * consumed tile data: for large-scale tile streams with more than one tile
+ * this is raw_data_end (only assigned under EXT_TILE_DEBUG); in the other
+ * cases it is what aom_reader_find_end() reports for the relevant tile's bit
+ * reader. Corrupted tile data is reported through aom_internal_error().
+ */
+static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, int start_tile,
+ int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ ThreadData *const td = &pbi->td;
+ CommonTileParams *const tiles = &cm->tiles;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;  // non-negative: decode one tile row only
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;  // non-negative: decode one tile column only
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int inv_col_order;
+ int inv_row_order;
+ int tile_row, tile_col;
+ uint8_t allow_update_cdf;
+ const uint8_t *raw_data_end = NULL;
+ // Work out the half-open tile rectangle [start, end) and the traversal order.
+ if (tiles->large_scale) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ inv_col_order = pbi->inv_tile_order && !single_col;
+ inv_row_order = pbi->inv_tile_order && !single_row;
+ allow_update_cdf = 0;  // CDF updates are disabled for large-scale tiles
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ inv_col_order = pbi->inv_tile_order;
+ inv_row_order = pbi->inv_tile_order;
+ allow_update_cdf = 1;
+ }
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tiles->cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ // Locate each tile's data inside [data, data_end) and record it in tile_buffers.
+#if EXT_TILE_DEBUG
+ if (tiles->large_scale && !pbi->ext_tile_debug)
+ raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
+ else if (tiles->large_scale && pbi->ext_tile_debug)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+ // (Re)allocate per-tile decoder state when the tile count has changed.
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+ if (pbi->dcb.xd.seg_mask == NULL)
+ CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+ (uint8_t *)aom_memalign(
+ 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ aom_accounting_reset(&pbi->accounting);
+ }
+#endif
+
+ set_decode_func_pointers(&pbi->td, 0x3);  // 0x3: both parse and reconstruct visitors
+
+ // Load all tile information into thread_data.
+ td->dcb = pbi->dcb;
+ // The struct copy above is followed by re-pointing the scratch buffers at td's own.
+ td->dcb.corrupted = 0;
+ td->dcb.mc_buf[0] = td->mc_buf[0];
+ td->dcb.mc_buf[1] = td->mc_buf[1];
+ td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+ }
+
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;  // mirrored when inverse order is on
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;  // mirrored when inverse order is on
+ TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col;
+ const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
+
+ if (row * tiles->cols + col < start_tile ||
+ row * tiles->cols + col > end_tile)
+ continue;  // tile is outside the requested tile group
+
+ td->bit_reader = &tile_data->bit_reader;
+ av1_zero(td->cb_buffer_base.dqcoeff);
+ av1_tile_init(&td->dcb.xd.tile, cm, row, col);
+ td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
+ setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end,
+ tile_bs_buf->size, &pbi->error, td->bit_reader,
+ allow_update_cdf);
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader->accounting = &pbi->accounting;
+ td->bit_reader->accounting->last_tell_frac =
+ aom_reader_tell_frac(td->bit_reader);
+ } else {
+ td->bit_reader->accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, &td->dcb.xd);
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row,
+ &td->dcb.xd);
+
+ // Initialise the tile context from the frame context
+ tile_data->tctx = *cm->fc;
+ td->dcb.xd.tile_ctx = &tile_data->tctx;
+
+ // decode tile
+ decode_tile(pbi, td, row, col);
+ aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
+ if (pbi->dcb.corrupted)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+ }
+ }
+
+ if (tiles->large_scale) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;  // last tile of this tile group
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+// Hands out the next not-yet-dequeued tile job of 'tile_mt_info', guarded by
+// tile_mt_info->job_mutex. Returns NULL once every enqueued job has been
+// handed out. When CONFIG_MULTITHREAD is off the queue is never consulted and
+// NULL is always returned.
+static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
+#if CONFIG_MULTITHREAD
+  TileJobsDec *next_job = NULL;
+
+  pthread_mutex_lock(tile_mt_info->job_mutex);
+  if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) {
+    // Claim the job at the dequeue cursor and advance the cursor.
+    next_job = &tile_mt_info->job_queue[tile_mt_info->jobs_dequeued++];
+  }
+  pthread_mutex_unlock(tile_mt_info->job_mutex);
+
+  return next_job;
+#else
+  (void)tile_mt_info;
+  return NULL;
+#endif
+}
+/*
+ * Prepares a worker's ThreadData / MACROBLOCKD for one tile: points the bit
+ * reader at 'tile_buffer', initialises the tile and above context for the
+ * tile's row/column, and copies the frame context into the tile's own
+ * context (tile_data->tctx). setup_bool_decoder() and xd->error_info are
+ * both given thread_data->error_info, the worker's own error context.
+ */
+static AOM_INLINE void tile_worker_hook_init(
+ AV1Decoder *const pbi, DecWorkerData *const thread_data,
+ const TileBufferDec *const tile_buffer, TileDataDec *const tile_data,
+ uint8_t allow_update_cdf) {
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+
+ td->bit_reader = &tile_data->bit_reader;  // each tile has its own reader
+ av1_zero(td->cb_buffer_base.dqcoeff);
+
+ MACROBLOCKD *const xd = &td->dcb.xd;
+ av1_tile_init(&xd->tile, cm, tile_row, tile_col);
+ xd->current_base_qindex = cm->quant_params.base_qindex;
+
+ setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end,
+ tile_buffer->size, &thread_data->error_info,
+ td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader->accounting = &pbi->accounting;
+ td->bit_reader->accounting->last_tell_frac =
+ aom_reader_tell_frac(td->bit_reader);
+ } else {
+ td->bit_reader->accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, xd);
+ xd->error_info = &thread_data->error_info;  // set after av1_init_macroblockd(); presumably that call assigns a default — confirm
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd);
+
+ // Initialise the tile context from the frame context
+ tile_data->tctx = *cm->fc;
+ xd->tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {  // NOTE(review): repeats the last_tell_frac update made above; looks redundant — confirm
+ tile_data->bit_reader.accounting->last_tell_frac =
+ aom_reader_tell_frac(&tile_data->bit_reader);
+ }
+#endif
+}
+/*
+ * Worker entry point for tile-level multithreading. arg1 is the worker's
+ * DecWorkerData and arg2 is the AV1Decoder. The worker keeps taking tile jobs
+ * from pbi->tile_mt_info and fully decodes each tile (parse + reconstruct)
+ * until the queue is empty or its own dcb is flagged corrupted.
+ * Returns 1 when nothing was flagged corrupted, 0 on corruption or when an
+ * error longjmp'ed back to the setjmp() below.
+ */
+static int tile_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {  // non-zero: control came back through longjmp()
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->dcb.corrupted = 1;
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
+ allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
+
+ set_decode_func_pointers(td, 0x3);  // 0x3: both parse and reconstruct visitors
+
+ assert(cm->tiles.cols > 0);
+ while (!td->dcb.corrupted) {
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL) {
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+ // decode tile
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+ decode_tile(pbi, td, tile_row, tile_col);
+ } else {
+ break;  // job queue exhausted
+ }
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->dcb.corrupted;
+}
+
+// Upper bound on the number of row-MT workers that may work on 'tile' at once.
+static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
+                                                  const TileInfo *tile) {
+  // NOTE: Currently value of max workers is calculated based
+  // on the parse and decode time. As per the theoretical estimate
+  // when percentage of parse time is equal to percentage of decode
+  // time, number of workers needed to parse + decode a tile can not
+  // exceed more than 2.
+  // TODO(any): Modify this value if parsing is optimized in future.
+  const int tile_sb_rows = av1_get_sb_rows_in_tile(cm, tile);
+
+  // A tile that is a single superblock row high gets the minimum.
+  if (tile_sb_rows == 1) return AOM_MIN_THREADS_PER_TILE;
+  return AOM_MAX_THREADS_PER_TILE;
+}
+
+// Selects the next superblock-row decode job for a row-MT worker. The caller must hold pbi->row_mt_mutex_ when calling this function.
+// Returns 1 if either the next job is stored in *next_job_info or 1 is stored
+// in *end_of_frame. Returns 0 when no job can be handed out at the moment.
+// NOTE: The caller waits on pbi->row_mt_cond_ if this function returns 0.
+// The return value of this function depends on the following variables:
+// - frame_row_mt_info->mi_rows_parse_done
+// - frame_row_mt_info->mi_rows_decode_started
+// - frame_row_mt_info->row_mt_exit
+// Therefore we may need to signal or broadcast pbi->row_mt_cond_ if any of
+// these variables is modified.
+static int get_next_job_info(AV1Decoder *const pbi,
+ AV1DecRowMTJobInfo *next_job_info,
+ int *end_of_frame) {
+ AV1_COMMON *cm = &pbi->common;
+ TileDataDec *tile_data;
+ AV1DecRowMTSync *dec_row_mt_sync;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ const int tile_rows_start = frame_row_mt_info->tile_rows_start;
+ const int tile_rows_end = frame_row_mt_info->tile_rows_end;
+ const int tile_cols_start = frame_row_mt_info->tile_cols_start;
+ const int tile_cols_end = frame_row_mt_info->tile_cols_end;
+ const int start_tile = frame_row_mt_info->start_tile;
+ const int end_tile = frame_row_mt_info->end_tile;
+ const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
+ int num_mis_to_decode, num_threads_working;
+ int num_mis_waiting_for_decode;
+ int min_threads_working = INT_MAX;
+ int max_mis_to_decode = 0;
+ int tile_row_idx, tile_col_idx;
+ int tile_row = -1;  // -1: no tile selected yet
+ int tile_col = -1;
+
+ memset(next_job_info, 0, sizeof(*next_job_info));
+
+ // Frame decode is completed or error is encountered.
+ *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
+ frame_row_mt_info->mi_rows_to_decode) ||
+ (frame_row_mt_info->row_mt_exit == 1);
+ if (*end_of_frame) {
+ return 1;
+ }
+
+ // Decoding cannot start as bit-stream parsing is not complete.
+ assert(frame_row_mt_info->mi_rows_parse_done >=
+ frame_row_mt_info->mi_rows_decode_started);
+ if (frame_row_mt_info->mi_rows_parse_done ==
+ frame_row_mt_info->mi_rows_decode_started)
+ return 0;
+
+ // Choose the tile to decode.
+ for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
+ ++tile_row_idx) {
+ for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
+ ++tile_col_idx) {
+ if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile ||
+ tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile)
+ continue;  // tile is outside the current tile group
+
+ tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ num_threads_working = dec_row_mt_sync->num_threads_working;
+ num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
+ dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;  // parsed but decode not yet started
+ num_mis_to_decode =
+ (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;  // everything in the tile whose decode has not started
+
+ assert(num_mis_to_decode >= num_mis_waiting_for_decode);
+
+ // Pick the tile which has minimum number of threads working on it.
+ if (num_mis_waiting_for_decode > 0) {
+ if (num_threads_working < min_threads_working) {
+ min_threads_working = num_threads_working;
+ max_mis_to_decode = 0;  // new minimum: restart the tie-break below
+ }
+ if (num_threads_working == min_threads_working &&
+ num_mis_to_decode > max_mis_to_decode &&
+ num_threads_working <
+ get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) {
+ max_mis_to_decode = num_mis_to_decode;  // tie-break: prefer the tile with more work left
+ tile_row = tile_row_idx;
+ tile_col = tile_col_idx;
+ }
+ }
+ }
+ }
+ // No job found to process
+ if (tile_row == -1 || tile_col == -1) return 0;
+
+ tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ next_job_info->tile_row = tile_row;
+ next_job_info->tile_col = tile_col;
+ next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started +
+ tile_data->tile_info.mi_row_start;  // tile-relative progress converted to a frame mi row
+ // Book the job: one more worker on the tile, one more SB row started.
+ dec_row_mt_sync->num_threads_working++;
+ dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
+ frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
+ assert(frame_row_mt_info->mi_rows_parse_done >=
+ frame_row_mt_info->mi_rows_decode_started);
+#if CONFIG_MULTITHREAD
+ if (frame_row_mt_info->mi_rows_decode_started ==
+ frame_row_mt_info->mi_rows_to_decode) {
+ pthread_cond_broadcast(pbi->row_mt_cond_);  // last row handed out: waiters can now observe end_of_frame
+ }
+#endif
+
+ return 1;
+}
+/*
+ * Records that one more superblock row of 'tile_data' has been parsed:
+ * advances both the tile's and the frame's mi_rows_parse_done by sb_mi_size.
+ * With CONFIG_MULTITHREAD the update is made under pbi->row_mt_mutex_ and
+ * followed by a signal on pbi->row_mt_cond_.
+ */
+static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
+ TileDataDec *const tile_data,
+ const int sb_mi_size) {
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ assert(frame_row_mt_info->mi_rows_parse_done >=
+ frame_row_mt_info->mi_rows_decode_started);
+ tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;  // per-tile progress
+ frame_row_mt_info->mi_rows_parse_done += sb_mi_size;  // frame-wide progress
+#if CONFIG_MULTITHREAD
+ // A new decode job is available. Wake up one worker thread to handle the
+ // new decode job.
+ // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started
+ // by the same increment (sb_mi_size).
+ pthread_cond_signal(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+}
+
+// This function is very similar to decode_tile(). It would be good to figure
+// out how to share code.
+//
+// Parse-only pass (flag 0x1) over every superblock of 'tile_data' for
+// row-based multithreading. After each completed superblock row,
+// signal_parse_sb_row_done() is called for the tile. If the bit reader
+// overflows, td->dcb.corrupted is flagged and the function returns at once,
+// without signalling the row in progress.
+static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
+                                         TileDataDec *const tile_data) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
+  const int num_planes = av1_num_planes(cm);
+  const TileInfo *const tile = &tile_data->tile_info;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+
+  // Reset the per-tile contexts before visiting the first superblock.
+  av1_zero_above_context(cm, xd, tile->mi_col_start, tile->mi_col_end,
+                         tile->tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
+
+  int mi_row = tile->mi_row_start;
+  while (mi_row < tile->mi_row_end) {
+    av1_zero_left_context(xd);
+
+    int mi_col = tile->mi_col_start;
+    while (mi_col < tile->mi_col_end) {
+      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
+
+      // Bit-stream parsing of the superblock
+      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                       cm->seq_params->sb_size, 0x1);
+
+      if (aom_reader_has_overflowed(td->bit_reader)) {
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
+        return;
+      }
+
+      mi_col += cm->seq_params->mib_size;
+    }
+
+    // The whole superblock row has been parsed.
+    signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
+    mi_row += cm->seq_params->mib_size;
+  }
+
+  // A non-zero result from the trailing-bits check marks the tile corrupted.
+  const int bad_trailing_bits =
+      check_trailing_bits_after_symbol_coder(td->bit_reader) != 0;
+  aom_merge_corrupted_flag(&dcb->corrupted, bad_trailing_bits);
+}
+/*
+ * Worker entry point for row-based multithreading. arg1 is the worker's
+ * DecWorkerData and arg2 is the AV1Decoder. The worker runs two phases:
+ *  1. Parse: take tile jobs from pbi->tile_mt_info and parse each tile with
+ *     parse_tile_row_mt() (visitors selected with flag 0x1).
+ *  2. Decode: repeatedly obtain a superblock-row job from get_next_job_info()
+ *     and process it with decode_tile_sb_row() (visitors selected with flag
+ *     0x2) until end_of_frame is reported.
+ * On corruption or an internal error, row_mt_exit is set and row_mt_cond_ is
+ * broadcast, which makes get_next_job_info() report end_of_frame to the other
+ * workers. Returns 1 when nothing was flagged corrupted, 0 otherwise.
+ */
+static int row_mt_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ td->dcb.corrupted = 0;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {  // non-zero: control came back through longjmp()
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->dcb.corrupted = 1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ // If any SB row (erroneous row) processed by a thread encounters an
+ // internal error, there is a need to indicate other threads that decoding
+ // of the erroneous row is complete. This ensures that other threads which
+ // wait upon the completion of SB's present in erroneous row are not waiting
+ // indefinitely.
+ signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd);
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ AV1_COMMON *cm = &pbi->common;
+ allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
+ allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
+
+ set_decode_func_pointers(td, 0x1);  // phase 1: parse-only visitors
+
+ assert(cm->tiles.cols > 0);
+ while (!td->dcb.corrupted) {
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL) {
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ tile_data->dec_row_mt_sync.num_threads_working++;  // the parsing thread counts as working on the tile
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ // decode tile
+ parse_tile_row_mt(pbi, td, tile_data);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ tile_data->dec_row_mt_sync.num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ } else {
+ break;  // no tile left to parse
+ }
+ }
+
+ if (td->dcb.corrupted) {  // parsing flagged corruption: tell every worker to stop
+ thread_data->error_info.setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ return 0;
+ }
+
+ set_decode_func_pointers(td, 0x2);  // phase 2: reconstruction-only visitors
+
+ while (1) {
+ AV1DecRowMTJobInfo next_job_info;
+ int end_of_frame = 0;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {  // 0: no job available yet
+#if CONFIG_MULTITHREAD
+ pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
+#endif
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+
+ if (end_of_frame) break;
+
+ int tile_row = next_job_info.tile_row;
+ int tile_col = next_job_info.tile_col;
+ int mi_row = next_job_info.mi_row;
+
+ TileDataDec *tile_data =
+ pbi->tile_data + tile_row * cm->tiles.cols + tile_col;
+ AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col);
+ av1_init_macroblockd(cm, &td->dcb.xd);
+ td->dcb.xd.error_info = &thread_data->error_info;
+
+ decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ dec_row_mt_sync->num_threads_working--;  // pairs with the increment made in get_next_job_info()
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->dcb.corrupted;
+}
+
+// qsort() comparator over TileJobsDec entries: sorts in descending order of
+// tile buffer size.
+//
+// The sizes are compared directly rather than being cast to int and
+// subtracted: a narrowing cast can change the sign of the result for very
+// large sizes, which would hand qsort() an inconsistent ordering.
+// Returns a negative value if 'a' sorts before 'b' (a is larger), a positive
+// value if it sorts after, and 0 when both sizes are equal.
+static int compare_tile_buffers(const void *a, const void *b) {
+  const TileJobsDec *const buf1 = (const TileJobsDec *)a;
+  const TileJobsDec *const buf2 = (const TileJobsDec *)b;
+  return (buf2->tile_buffer->size > buf1->tile_buffer->size) -
+         (buf2->tile_buffer->size < buf1->tile_buffer->size);
+}
+
+// Rebuilds the tile job queue of pbi->tile_mt_info: one job per tile inside
+// the rectangle [tile_rows_start, tile_rows_end) x
+// [tile_cols_start, tile_cols_end) whose raster index lies in
+// [start_tile, end_tile]. Both queue counters are reset, so previously
+// enqueued jobs are discarded.
+static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+                                         int tile_rows_start, int tile_rows_end,
+                                         int tile_cols_start, int tile_cols_end,
+                                         int start_tile, int end_tile) {
+  AV1DecTileMT *const tile_mt_info = &pbi->tile_mt_info;
+  TileJobsDec *const queue = tile_mt_info->job_queue;
+  int num_jobs = 0;
+
+  tile_mt_info->jobs_dequeued = 0;
+
+  for (int row = tile_rows_start; row < tile_rows_end; ++row) {
+    for (int col = tile_cols_start; col < tile_cols_end; ++col) {
+      const int tile_idx = row * cm->tiles.cols + col;
+
+      // Only tiles that belong to the current tile group become jobs.
+      if (tile_idx >= start_tile && tile_idx <= end_tile) {
+        TileJobsDec *const job = &queue[num_jobs++];
+        job->tile_buffer = &pbi->tile_buffers[row][col];
+        job->tile_data = pbi->tile_data + tile_idx;
+      }
+    }
+  }
+
+  tile_mt_info->jobs_enqueued = num_jobs;
+}
+
+// Allocates the tile job queue and, with CONFIG_MULTITHREAD, the job mutexes
+// for a layout of tile_rows x tile_cols tiles, then records that layout in
+// tile_mt_info->alloc_tile_rows / alloc_tile_cols.
+//
+// The recorded layout is what tile_mt_queue() compares against to decide
+// whether a reallocation is needed. It is therefore cleared up front and only
+// written once every allocation has succeeded: if CHECK_MEM_ERROR bails out
+// part-way (presumably through cm's error handler — confirm), the next call
+// to tile_mt_queue() sees a mismatch and allocates again instead of using a
+// missing queue.
+static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info,
+                                      AV1_COMMON *cm, int tile_rows,
+                                      int tile_cols) {
+  const int num_tiles = tile_rows * tile_cols;
+
+  // Nothing is allocated for any layout until proven otherwise.
+  tile_mt_info->alloc_tile_rows = 0;
+  tile_mt_info->alloc_tile_cols = 0;
+#if CONFIG_MULTITHREAD
+  {
+    CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
+                    aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
+
+    for (int i = 0; i < num_tiles; i++) {
+      pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
+    }
+  }
+#endif
+  CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
+                  aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
+
+  // Both allocations succeeded: the layout is now valid.
+  tile_mt_info->alloc_tile_rows = tile_rows;
+  tile_mt_info->alloc_tile_cols = tile_cols;
+}
+
+// Releases the scratch buffers owned by 'thread_data' (both mc_buf entries,
+// tmp_conv_dst, seg_mask and both tmp_obmc_bufs entries), NULLs the pointers
+// and resets mc_buf_size / mc_buf_use_highbd to 0.
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
+  for (int i = 0; i < 2; ++i) {
+    // High-bitdepth buffers are stored as CONVERT_TO_BYTEPTR() aliases, so
+    // they are converted back before being freed.
+    void *const raw = thread_data->mc_buf_use_highbd
+                          ? (void *)CONVERT_TO_SHORTPTR(thread_data->mc_buf[i])
+                          : (void *)thread_data->mc_buf[i];
+    aom_free(raw);
+    thread_data->mc_buf[i] = NULL;
+  }
+  thread_data->mc_buf_use_highbd = 0;
+  thread_data->mc_buf_size = 0;
+
+  aom_free(thread_data->tmp_conv_dst);
+  thread_data->tmp_conv_dst = NULL;
+
+  aom_free(thread_data->seg_mask);
+  thread_data->seg_mask = NULL;
+
+  for (int i = 0; i < 2; ++i) {
+    aom_free(thread_data->tmp_obmc_bufs[i]);
+    thread_data->tmp_obmc_bufs[i] = NULL;
+  }
+}
+
+// Allocates the scratch buffers of 'thread_data': two zero-filled
+// motion-compensation buffers of 'buf_size' bytes (stored through
+// CONVERT_TO_BYTEPTR() when 'use_highbd' is set), tmp_conv_dst, seg_mask and
+// the two tmp_obmc_bufs. Allocation failures are reported via CHECK_MEM_ERROR.
+//
+// mc_buf_use_highbd is recorded BEFORE the first allocation. The matching
+// release function, av1_free_mc_tmp_buf(), uses that flag to decide whether
+// mc_buf[] must be converted back with CONVERT_TO_SHORTPTR() before freeing.
+// If a later allocation fails after a high-bitdepth buffer was stored, the
+// flag must already be set, or the alias would be passed to aom_free()
+// instead of the real allocation. mc_buf_size is still only set at the end of
+// the buffer loop, so a partially allocated set is not mistaken for a
+// complete one.
+static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
+                                           ThreadData *thread_data,
+                                           int buf_size, int use_highbd) {
+  thread_data->mc_buf_use_highbd = use_highbd;
+
+  for (int ref = 0; ref < 2; ref++) {
+    // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error
+    // 'Conditional jump or move depends on uninitialised value' from the loop
+    // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in
+    // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the
+    // potential reason for this issue.
+    if (use_highbd) {
+      uint16_t *hbd_mc_buf;
+      CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+      memset(hbd_mc_buf, 0, buf_size);
+      thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
+    } else {
+      CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
+                      (uint8_t *)aom_memalign(16, buf_size));
+      memset(thread_data->mc_buf[ref], 0, buf_size);
+    }
+  }
+  // Both motion-compensation buffers exist now.
+  thread_data->mc_buf_size = buf_size;
+
+  CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+                  aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                                       sizeof(*thread_data->tmp_conv_dst)));
+  CHECK_MEM_ERROR(cm, thread_data->seg_mask,
+                  (uint8_t *)aom_memalign(
+                      16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask)));
+
+  for (int i = 0; i < 2; ++i) {
+    CHECK_MEM_ERROR(
+        cm, thread_data->tmp_obmc_bufs[i],
+        aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                             sizeof(*thread_data->tmp_obmc_bufs[i])));
+  }
+}
+
+// Re-arms the first 'num_workers' tile workers for a new decoding pass: each
+// worker's dcb is refreshed from pbi->dcb with its own scratch buffers
+// patched in, the worker is synced, and its hook and arguments are set to
+// (worker_hook, thread_data, pbi). With CONFIG_ACCOUNTING the accounting
+// state is reset when accounting is enabled.
+static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi,
+                                         AVxWorkerHook worker_hook,
+                                         int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  // Reset tile decoding hook
+  for (int i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &pbi->tile_workers[i];
+    DecWorkerData *const thread_data = &pbi->thread_data[i];
+    ThreadData *const td = thread_data->td;
+
+    // Start from the decoder-level block state ...
+    td->dcb = pbi->dcb;
+    td->dcb.corrupted = 0;
+    // ... then point it at this worker's private buffers.
+    td->dcb.mc_buf[0] = td->mc_buf[0];
+    td->dcb.mc_buf[1] = td->mc_buf[1];
+    td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
+    // Worker 0 keeps the seg_mask copied from pbi->dcb.
+    if (i != 0) td->dcb.xd.seg_mask = td->seg_mask;
+    td->dcb.xd.tmp_obmc_bufs[0] = td->tmp_obmc_bufs[0];
+    td->dcb.xd.tmp_obmc_bufs[1] = td->tmp_obmc_bufs[1];
+
+    winterface->sync(worker);
+
+    worker->hook = worker_hook;
+    worker->data1 = thread_data;
+    worker->data2 = pbi;
+  }
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    aom_accounting_reset(&pbi->accounting);
+  }
+#endif
+}
+
+// Starts the first 'num_workers' tile workers. Workers num_workers-1 .. 1 are
+// started with launch() in descending order; worker 0 is started last with
+// execute(). Each worker gets 'data_end' and a cleared had_error flag just
+// before it is started.
+static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi,
+                                          const uint8_t *data_end,
+                                          int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  // Workers other than worker 0, highest index first.
+  for (int i = num_workers - 1; i > 0; --i) {
+    AVxWorker *const worker = &pbi->tile_workers[i];
+    DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+    thread_data->data_end = data_end;
+    worker->had_error = 0;
+    winterface->launch(worker);
+  }
+
+  // Worker 0 goes last, through execute() instead of launch().
+  if (num_workers > 0) {
+    AVxWorker *const worker = &pbi->tile_workers[0];
+    DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+    thread_data->data_end = data_end;
+    worker->had_error = 0;
+    winterface->execute(worker);
+  }
+}
+
+// Syncs the first 'num_workers' tile workers, highest index first, and
+// stores in pbi->dcb.corrupted whether any sync() call returned zero.
+static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int any_failed = 0;
+
+  for (int i = num_workers - 1; i >= 0; --i) {
+    AVxWorker *const worker = &pbi->tile_workers[i];
+    aom_merge_corrupted_flag(&any_failed, !winterface->sync(worker));
+  }
+
+  pbi->dcb.corrupted = any_failed;
+}
+/*
+ * Prepares the decoder for multithreaded tile decoding. On first use (when
+ * pbi->num_workers is 0) it creates pbi->max_threads workers together with
+ * their DecWorkerData; worker 0 stands for the main thread and reuses
+ * pbi->td. On every call it makes sure each non-main worker's scratch
+ * buffers match the size required by the current use_highbitdepth setting,
+ * reallocating them when the size differs.
+ */
+static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int worker_idx;
+
+ // Create workers and thread_data
+ if (pbi->num_workers == 0) {
+ const int num_threads = pbi->max_threads;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ CHECK_MEM_ERROR(cm, pbi->thread_data,
+ aom_calloc(num_threads, sizeof(*pbi->thread_data)));
+
+ for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+
+ winterface->init(worker);
+ worker->thread_name = "aom tile worker";
+ if (worker_idx != 0 && !winterface->reset(worker)) {  // worker 0 is skipped: it is the main thread
+ aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ ++pbi->num_workers;
+
+ if (worker_idx != 0) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+ } else {
+ // Main thread acts as a worker and uses the thread data in pbi
+ thread_data->td = &pbi->td;
+ }
+ thread_data->error_info.error_code = AOM_CODEC_OK;
+ thread_data->error_info.setjmp = 0;
+ }
+ }
+ const int use_highbd = cm->seq_params->use_highbitdepth;
+ const int buf_size = MC_TEMP_BUF_PELS << use_highbd;  // size doubles when use_highbd is 1
+ for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) {  // NOTE(review): bounded by max_threads, not num_workers; relies on every worker's td having been created above — confirm
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ if (thread_data->td->mc_buf_size != buf_size) {
+ av1_free_mc_tmp_buf(thread_data->td);
+ allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+ }
+ }
+}
+
+// Builds the tile job queue for the current tile group: reallocates the job
+// storage when the tile layout differs from the one it was allocated for,
+// enqueues one job per selected tile, then sorts the jobs with
+// compare_tile_buffers().
+static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols,
+                                     int tile_rows, int tile_rows_start,
+                                     int tile_rows_end, int tile_cols_start,
+                                     int tile_cols_end, int start_tile,
+                                     int end_tile) {
+  AV1_COMMON *const cm = &pbi->common;
+  AV1DecTileMT *const tile_mt_info = &pbi->tile_mt_info;
+  const int layout_changed = tile_mt_info->alloc_tile_cols != tile_cols ||
+                             tile_mt_info->alloc_tile_rows != tile_rows;
+
+  if (layout_changed) {
+    av1_dealloc_dec_jobs(tile_mt_info);
+    alloc_dec_jobs(tile_mt_info, cm, tile_rows, tile_cols);
+  }
+
+  enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+                    tile_cols_end, start_tile, end_tile);
+
+  qsort(tile_mt_info->job_queue, tile_mt_info->jobs_enqueued,
+        sizeof(tile_mt_info->job_queue[0]), compare_tile_buffers);
+}
+/*
+ * Tile-parallel counterpart of decode_tiles(): decodes the tiles of the
+ * current tile group (raster tile indices start_tile..end_tile, inclusive)
+ * with up to pbi->max_threads workers, each running tile_worker_hook() on
+ * whole tiles taken from a shared job queue.
+ *
+ * Returns 'data' unchanged when the selected tile rectangle is empty or does
+ * not intersect [start_tile, end_tile]. Otherwise returns the end of the
+ * consumed tile data: for large-scale tile streams with more than one tile
+ * this is raw_data_end (only assigned under EXT_TILE_DEBUG); in the other
+ * cases it is what aom_reader_find_end() reports for the relevant tile's bit
+ * reader. Corruption found by any worker is reported through
+ * aom_internal_error().
+ */
+static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, int start_tile,
+ int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ CommonTileParams *const tiles = &cm->tiles;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;  // non-negative: decode one tile row only
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;  // non-negative: decode one tile column only
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;
+ // Work out the half-open tile rectangle [start, end) to decode.
+ if (tiles->large_scale) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;  // number of tiles in this tile group
+ num_workers = AOMMIN(pbi->max_threads, tile_count_tg);  // never more workers than tiles
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ decode_mt_init(pbi);
+
+ // get tile size in tile group
+#if EXT_TILE_DEBUG
+ if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
+ if (tiles->large_scale)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+ // (Re)allocate per-tile decoder state when the tile count has changed.
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+ if (pbi->dcb.xd.seg_mask == NULL)
+ CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+ (uint8_t *)aom_memalign(
+ 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
+ // Initialise tile_info for every tile of the frame, not only the selected ones.
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+ }
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+ // Arm the workers, start them, then wait for all of them.
+ reset_dec_workers(pbi, tile_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
+
+ if (pbi->dcb.corrupted)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (tiles->large_scale) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;  // last tile of this tile group
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+// Makes sure pbi->cb_buffer_base holds at least one zero-initialised entry
+// per superblock position of the frame, counting one extra row and one extra
+// column. The buffer only ever grows: nothing is done when the current
+// allocation is already large enough.
+static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
+
+  // Current allocation already covers the frame.
+  if (pbi->cb_buffer_alloc_size >= size) return;
+
+  av1_dec_free_cb_buf(pbi);
+  CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
+                  aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
+  memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size);
+  pbi->cb_buffer_alloc_size = size;
+}
+
+// Prepares the row-MT state for one decode pass.
+//
+// Stores the tile window and the [start_tile, end_tile] range in
+// pbi->frame_row_mt_info, clears the frame-level progress counters, and resets
+// the per-tile dec_row_mt_sync state of every tile inside both the window and
+// the range. |max_sb_rows| is the number of cur_sb_col entries to reset per
+// tile. With CONFIG_MULTITHREAD, the shared mutex and condition variable are
+// created the first time they are needed.
+static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+                                         int tile_rows_end, int tile_cols_start,
+                                         int tile_cols_end, int start_tile,
+                                         int end_tile, int max_sb_rows) {
+  AV1_COMMON *const cm = &pbi->common;
+  AV1DecRowMTInfo *const mt_info = &pbi->frame_row_mt_info;
+
+  // Tile window and tile-group range handled by this pass.
+  mt_info->tile_rows_start = tile_rows_start;
+  mt_info->tile_rows_end = tile_rows_end;
+  mt_info->tile_cols_start = tile_cols_start;
+  mt_info->tile_cols_end = tile_cols_end;
+  mt_info->start_tile = start_tile;
+  mt_info->end_tile = end_tile;
+
+  // Frame-level progress counters start from scratch.
+  mt_info->mi_rows_to_decode = 0;
+  mt_info->mi_rows_parse_done = 0;
+  mt_info->mi_rows_decode_started = 0;
+  mt_info->row_mt_exit = 0;
+
+  for (int r = tile_rows_start; r < tile_rows_end; ++r) {
+    for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+      const int tile_idx = r * cm->tiles.cols + c;
+      // Tiles outside [start_tile, end_tile] are left untouched.
+      if (tile_idx < start_tile || tile_idx > end_tile) continue;
+
+      TileDataDec *const tile_data = pbi->tile_data + tile_idx;
+      const TileInfo *const tile_info = &tile_data->tile_info;
+      const int tile_mi_rows = tile_info->mi_row_end - tile_info->mi_row_start;
+      const int tile_mi_cols = tile_info->mi_col_end - tile_info->mi_col_start;
+
+      tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
+      tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
+      tile_data->dec_row_mt_sync.num_threads_working = 0;
+      // Tile extents are rounded up with ALIGN_POWER_OF_TWO(..., mib_size_log2).
+      tile_data->dec_row_mt_sync.mi_rows =
+          ALIGN_POWER_OF_TWO(tile_mi_rows, cm->seq_params->mib_size_log2);
+      tile_data->dec_row_mt_sync.mi_cols =
+          ALIGN_POWER_OF_TWO(tile_mi_cols, cm->seq_params->mib_size_log2);
+      tile_data->dec_row_mt_sync.intrabc_extra_top_right_sb_delay =
+          av1_get_intrabc_extra_top_right_sb_delay(cm);
+
+      mt_info->mi_rows_to_decode += tile_data->dec_row_mt_sync.mi_rows;
+
+      // Every SB row starts with cur_sb_col == -1 (memset with all-ones bytes).
+      memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
+             sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
+    }
+  }
+
+#if CONFIG_MULTITHREAD
+  // Create the shared mutex on first use.
+  if (!pbi->row_mt_mutex_) {
+    CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
+                    aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
+    if (pbi->row_mt_mutex_ != NULL) {
+      pthread_mutex_init(pbi->row_mt_mutex_, NULL);
+    }
+  }
+
+  // Create the shared condition variable on first use.
+  if (!pbi->row_mt_cond_) {
+    CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
+                    aom_malloc(sizeof(*(pbi->row_mt_cond_))));
+    if (pbi->row_mt_cond_ != NULL) {
+      pthread_cond_init(pbi->row_mt_cond_, NULL);
+    }
+  }
+#endif
+}
+
+// Decodes the tiles of the tile group [start_tile, end_tile] using the row-MT
+// worker hook (row_mt_worker_hook).
+//
+// |data| / |data_end| delimit the tile group payload. Returns |data| unchanged
+// when the selected tile window does not overlap [start_tile, end_tile].
+// Otherwise returns the end of the consumed data: for large-scale tile mode
+// with more than one tile this is raw_data_end, in every other case it is the
+// position reported by aom_reader_find_end() for the relevant tile's bit
+// reader. If pbi->dcb.corrupted is set after the workers finish,
+// AOM_CODEC_CORRUPT_FRAME is raised through aom_internal_error().
+static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ CommonTileParams *const tiles = &cm->tiles;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ // A non-negative pbi->dec_tile_row / dec_tile_col requests a single tile
+ // row / column; this is only honoured in large-scale tile mode below.
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers = 0;
+ int max_threads;
+ // NOTE(review): raw_data_end is only assigned inside the EXT_TILE_DEBUG
+ // block; with EXT_TILE_DEBUG == 0 it stays NULL and is what the large-scale,
+ // multi-tile path returns. Confirm EXT_TILE_DEBUG is always enabled.
+ const uint8_t *raw_data_end = NULL;
+ int max_sb_rows = 0;
+
+ // Select the half-open tile window [start, end) in rows and columns.
+ if (tiles->large_scale) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;
+ max_threads = pbi->max_threads;
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(max_threads > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ // tile_count_tg is only used by the assert above; silence unused warnings
+ // in builds where assert() compiles away.
+ (void)tile_count_tg;
+
+ decode_mt_init(pbi);
+
+ // get tile size in tile group
+#if EXT_TILE_DEBUG
+ if (tiles->large_scale) assert(pbi->ext_tile_debug == 1);
+ if (tiles->large_scale)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+
+ // (Re)allocate the per-tile data when it is missing or the tile count
+ // changed. The row-MT sync state of the old tiles is released first, before
+ // the tile data array itself is replaced.
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ if (pbi->tile_data != NULL) {
+ for (int i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ }
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+ // Lazily allocate the 16-byte aligned seg_mask scratch buffer.
+ if (pbi->dcb.xd.seg_mask == NULL)
+ CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask,
+ (uint8_t *)aom_memalign(
+ 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask)));
+
+ // Initialise every tile's geometry and, in the same pass, find the largest
+ // SB-row count of any tile and the total number of useful workers.
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+
+ max_sb_rows = AOMMAX(max_sb_rows,
+ av1_get_sb_rows_in_tile(cm, &tile_data->tile_info));
+ num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info);
+ }
+ }
+ // Never use more workers than the configured thread limit.
+ num_workers = AOMMIN(num_workers, max_threads);
+
+ // The per-tile sync arrays are sized by max_sb_rows; rebuild them for all
+ // tiles whenever that size differs from what was allocated last time.
+ if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
+ for (int i = 0; i < n_tiles; ++i) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
+ }
+ pbi->allocated_row_mt_sync_rows = max_sb_rows;
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ dec_alloc_cb_buf(pbi);
+
+ row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile, max_sb_rows);
+
+ // Run the workers and wait for all of them to finish.
+ reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
+
+ if (pbi->dcb.corrupted)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (tiles->large_scale) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ // Regular tiles: the data ends where the last tile of the group ends.
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+static AOM_INLINE void error_handler(void *data) {
+ AV1_COMMON *const cm = (AV1_COMMON *)data;
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+// Parses high_bitdepth and, when present, twelve_bit from color_config() and
+// derives seq_params->bit_depth from those fields together with
+// seq_params->profile. Errors are reported by calling rb->error_handler() or
+// aom_internal_error().
+static AOM_INLINE void read_bitdepth(
+    struct aom_read_bit_buffer *rb, SequenceHeader *seq_params,
+    struct aom_internal_error_info *error_info) {
+  const int high_bitdepth = aom_rb_read_bit(rb);
+
+  if (seq_params->profile > PROFILE_2) {
+    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported profile/bit-depth combination");
+  } else if (seq_params->profile == PROFILE_2 && high_bitdepth) {
+    // Only PROFILE_2 with high_bitdepth set carries the twelve_bit flag.
+    const int twelve_bit = aom_rb_read_bit(rb);
+    if (twelve_bit) {
+      seq_params->bit_depth = AOM_BITS_12;
+    } else {
+      seq_params->bit_depth = AOM_BITS_10;
+    }
+  } else {
+    // Every remaining case chooses between 8 and 10 bits.
+    if (high_bitdepth) {
+      seq_params->bit_depth = AOM_BITS_10;
+    } else {
+      seq_params->bit_depth = AOM_BITS_8;
+    }
+  }
+#if !CONFIG_AV1_HIGHBITDEPTH
+  // Builds without high bit-depth support reject anything above 8 bits.
+  if (seq_params->bit_depth > AOM_BITS_8) {
+    aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                       "Bit-depth %d not supported", seq_params->bit_depth);
+  }
+#endif
+}
+
+// Parses the film grain parameters from |rb| into cm->film_grain_params.
+//
+// - If apply_grain is 0 the whole structure is zeroed and parsing stops.
+// - If update_parameters is 0 the parameters are copied from the reference
+//   buffer selected by film_grain_params_ref_idx, keeping only the random
+//   seed that was just read.
+// - Otherwise the scaling points, AR coefficients, shifts, chroma multipliers
+//   and flags are read in bitstream order.
+//
+// Bitstream constraint violations are reported through aom_internal_error()
+// on cm->error with AOM_CODEC_UNSUP_BITSTREAM.
+void av1_read_film_grain_params(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ aom_film_grain_t *pars = &cm->film_grain_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+
+ pars->apply_grain = aom_rb_read_bit(rb);
+ if (!pars->apply_grain) {
+ memset(pars, 0, sizeof(*pars));
+ return;
+ }
+
+ pars->random_seed = aom_rb_read_literal(rb, 16);
+ // update_parameters is only coded for INTER_FRAME; every other frame type
+ // always carries a fresh parameter set.
+ if (cm->current_frame.frame_type == INTER_FRAME)
+ pars->update_parameters = aom_rb_read_bit(rb);
+ else
+ pars->update_parameters = 1;
+
+ pars->bit_depth = seq_params->bit_depth;
+
+ if (!pars->update_parameters) {
+ // inherit parameters from a previous reference frame
+ int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
+ // Section 6.8.20: It is a requirement of bitstream conformance that
+ // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value
+ // of j in the range 0 to REFS_PER_FRAME - 1.
+ int found = 0;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid film grain reference idx %d. ref_frame_idx = "
+ "{%d, %d, %d, %d, %d, %d, %d}",
+ film_grain_params_ref_idx, cm->remapped_ref_idx[0],
+ cm->remapped_ref_idx[1], cm->remapped_ref_idx[2],
+ cm->remapped_ref_idx[3], cm->remapped_ref_idx[4],
+ cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]);
+ }
+ // NOTE(review): the checks below dereference |buf| right after reporting
+ // the error, so they rely on aom_internal_error() not returning here.
+ RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
+ if (buf == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid Film grain reference idx");
+ }
+ if (!buf->film_grain_params_present) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Film grain reference parameters not available");
+ }
+ uint16_t random_seed = pars->random_seed;
+ *pars = buf->film_grain_params; // inherit parameters
+ pars->random_seed = random_seed; // with new random seed
+ return;
+ }
+
+ // Scaling functions parameters
+ pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14
+ if (pars->num_y_points > 14)
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain luma scaling function "
+ "exceeds the maximum value.");
+ // Each point is an (x, y) pair of 8-bit values; x must strictly increase.
+ for (int i = 0; i < pars->num_y_points; i++) {
+ pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
+ if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if (!seq_params->monochrome)
+ pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+ else
+ pars->chroma_scaling_from_luma = 0;
+
+ // Chroma scaling points are not coded for monochrome, when chroma scaling
+ // is derived from luma, or for 4:2:0 content without luma points.
+ if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (pars->num_y_points == 0))) {
+ pars->num_cb_points = 0;
+ pars->num_cr_points = 0;
+ } else {
+ pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cb_points > 10)
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cb scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cr_points > 10)
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cr scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
+ ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "In YCbCr 4:2:0, film grain shall be applied "
+ "to both chroma components or neither.");
+ }
+
+ pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value
+
+ // AR coefficients
+ // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
+
+ // Luma uses 2 * lag * (lag + 1) coefficients; chroma uses one more when
+ // luma scaling points are present.
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;
+
+ // Coefficients are coded as 8-bit values with an offset of 128.
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value
+
+ pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
+
+ // Chroma multipliers and offsets are only coded for planes that have
+ // their own scaling points.
+ if (pars->num_cb_points) {
+ pars->cb_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ if (pars->num_cr_points) {
+ pars->cr_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ pars->overlap_flag = aom_rb_read_bit(rb);
+
+ pars->clip_to_restricted_range = aom_rb_read_bit(rb);
+}
+
+// Fills cm->film_grain_params for the current frame and copies the result into
+// cm->cur_frame. Parameters are parsed from |rb| only when the sequence header
+// signals film grain and the frame is shown or showable; otherwise they are
+// zeroed. The bit depth is always taken from the sequence header.
+static AOM_INLINE void read_film_grain(AV1_COMMON *cm,
+                                       struct aom_read_bit_buffer *rb) {
+  aom_film_grain_t *const grain = &cm->film_grain_params;
+  const int frame_may_be_shown = cm->show_frame || cm->showable_frame;
+
+  if (!cm->seq_params->film_grain_params_present || !frame_may_be_shown) {
+    memset(grain, 0, sizeof(*grain));
+  } else {
+    av1_read_film_grain_params(cm, rb);
+  }
+
+  grain->bit_depth = cm->seq_params->bit_depth;
+  memcpy(&cm->cur_frame->film_grain_params, grain, sizeof(aom_film_grain_t));
+}
+
+// Parses the color configuration from |rb| into |seq_params|: bit depth,
+// monochrome flag, optional color description (primaries / transfer /
+// matrix), color range, chroma subsampling, chroma sample position and
+// separate_uv_delta_q.
+//
+// |allow_lowbitdepth| == 0 forces seq_params->use_highbitdepth on even for
+// 8-bit streams. Unsupported combinations are reported through
+// aom_internal_error() on |error_info| with AOM_CODEC_UNSUP_BITSTREAM.
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
+ read_bitdepth(rb, seq_params, error_info);
+
+ seq_params->use_highbitdepth =
+ seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+ // monochrome bit (not needed for PROFILE_1)
+ const int is_monochrome =
+ seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+ seq_params->monochrome = is_monochrome;
+ // Without an explicit color description all three CICP fields are set to
+ // their UNSPECIFIED values.
+ int color_description_present_flag = aom_rb_read_bit(rb);
+ if (color_description_present_flag) {
+ seq_params->color_primaries = aom_rb_read_literal(rb, 8);
+ seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
+ seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
+ } else {
+ seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+ }
+ // Monochrome streams only code the color range; the chroma-related fields
+ // get fixed values and parsing stops here.
+ if (is_monochrome) {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ seq_params->subsampling_y = seq_params->subsampling_x = 1;
+ seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
+ seq_params->separate_uv_delta_q = 0;
+ return;
+ }
+ // BT.709 primaries + sRGB transfer + identity matrix: no color range or
+ // subsampling bits are read; 4:4:4 full range is implied.
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ seq_params->subsampling_y = seq_params->subsampling_x = 0;
+ seq_params->color_range = 1; // assume full color-range
+ if (!(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12))) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "sRGB colorspace not compatible with specified profile");
+ }
+ } else {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 0;
+ } else {
+ assert(seq_params->profile == PROFILE_2);
+ // Only 12-bit PROFILE_2 codes the subsampling explicitly.
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ seq_params->subsampling_x = aom_rb_read_bit(rb);
+ if (seq_params->subsampling_x)
+ seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420
+ else
+ seq_params->subsampling_y = 0; // 444
+ } else {
+ // 422
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
+ }
+ // chroma_sample_position is only coded for 4:2:0; in the other
+ // non-monochrome cases this function leaves the field untouched.
+ if (seq_params->subsampling_x && seq_params->subsampling_y) {
+ seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
+ }
+ }
+ seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
+}
+
+// Parses the timing info header from |rb| into |timing_info|.
+//
+// Reads num_units_in_display_tick and time_scale (32 bits each, both must be
+// non-zero), then equal_picture_interval. When that flag is set,
+// num_ticks_per_picture is read as a uvlc "minus 1" value. Violations are
+// reported through aom_internal_error() on |error| with
+// AOM_CODEC_UNSUP_BITSTREAM.
+void av1_read_timing_info_header(aom_timing_info_t *timing_info,
+                                 struct aom_internal_error_info *error,
+                                 struct aom_read_bit_buffer *rb) {
+  // Number of units in a display tick.
+  timing_info->num_units_in_display_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  // Time scale.
+  timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32);
+  if (!timing_info->num_units_in_display_tick || !timing_info->time_scale) {
+    aom_internal_error(
+        error, AOM_CODEC_UNSUP_BITSTREAM,
+        "num_units_in_display_tick and time_scale must be greater than 0.");
+  }
+
+  // Equal picture interval bit.
+  timing_info->equal_picture_interval = aom_rb_read_bit(rb);
+  if (!timing_info->equal_picture_interval) return;
+
+  const uint32_t ticks_minus_1 = aom_rb_read_uvlc(rb);
+  // UINT32_MAX is rejected because adding 1 would wrap around to 0.
+  if (ticks_minus_1 == UINT32_MAX) {
+    aom_internal_error(
+        error, AOM_CODEC_UNSUP_BITSTREAM,
+        "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.");
+  }
+  timing_info->num_ticks_per_picture = ticks_minus_1 + 1;
+}
+
+// Parses the decoder model info from |rb| into |decoder_model_info|.
+// The three *_length fields are coded as 5-bit "minus 1" values, so 1 is added
+// to each on read; num_units_in_decoding_tick is a plain 32-bit value.
+// Each field is stored immediately after it is read.
+void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info,
+ struct aom_read_bit_buffer *rb) {
+ decoder_model_info->encoder_decoder_buffer_delay_length =
+ aom_rb_read_literal(rb, 5) + 1;
+ decoder_model_info->num_units_in_decoding_tick =
+ aom_rb_read_unsigned_literal(rb,
+ 32); // Number of units in a decoding tick
+ decoder_model_info->buffer_removal_time_length =
+ aom_rb_read_literal(rb, 5) + 1;
+ decoder_model_info->frame_presentation_time_length =
+ aom_rb_read_literal(rb, 5) + 1;
+}
+
+// Parses the per-operating-point decoder model parameters from |rb| into
+// |op_params|. Both buffer delays are |buffer_delay_length| bits wide and are
+// read in the order decoder delay, encoder delay; low_delay_mode_flag follows
+// as a single bit.
+void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
+ int buffer_delay_length,
+ struct aom_read_bit_buffer *rb) {
+ op_params->decoder_buffer_delay =
+ aom_rb_read_unsigned_literal(rb, buffer_delay_length);
+ op_params->encoder_buffer_delay =
+ aom_rb_read_unsigned_literal(rb, buffer_delay_length);
+ op_params->low_delay_mode_flag = aom_rb_read_bit(rb);
+}
+
+// Reads cm->frame_presentation_time from |rb|. The field width is
+// frame_presentation_time_length from the sequence header's decoder model
+// info.
+static AOM_INLINE void read_temporal_point_info(
+    AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) {
+  const int num_bits =
+      cm->seq_params->decoder_model_info.frame_presentation_time_length;
+
+  cm->frame_presentation_time = aom_rb_read_unsigned_literal(rb, num_bits);
+}
+
+// Parses the part of the sequence header that runs from the maximum frame
+// dimensions through enable_restoration, storing the results in |seq_params|.
+//
+// seq_params->reduced_still_picture_hdr is consulted but not parsed here, so
+// it is expected to have been filled in before this call. An invalid
+// frame_id_length is reported through aom_internal_error() on cm->error with
+// AOM_CODEC_CORRUPT_FRAME.
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params) {
+ // Bit widths are coded "minus 1" in 4 bits; the dimensions themselves are
+ // coded "minus 1" using those widths.
+ const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+ const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+ const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
+ const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
+
+ seq_params->num_bits_width = num_bits_width;
+ seq_params->num_bits_height = num_bits_height;
+ seq_params->max_frame_width = max_frame_width;
+ seq_params->max_frame_height = max_frame_height;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ seq_params->frame_id_numbers_present_flag = 0;
+ } else {
+ seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2;
+ seq_params->frame_id_length =
+ aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
+ if (seq_params->frame_id_length > 16)
+ aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid frame_id_length");
+ }
+
+ setup_sb_size(seq_params, rb);
+
+ seq_params->enable_filter_intra = aom_rb_read_bit(rb);
+ seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
+
+ // With the reduced still-picture header none of the inter tools are coded;
+ // they get the fixed values below.
+ if (seq_params->reduced_still_picture_hdr) {
+ seq_params->enable_interintra_compound = 0;
+ seq_params->enable_masked_compound = 0;
+ seq_params->enable_warped_motion = 0;
+ seq_params->enable_dual_filter = 0;
+ seq_params->order_hint_info.enable_order_hint = 0;
+ seq_params->order_hint_info.enable_dist_wtd_comp = 0;
+ seq_params->order_hint_info.enable_ref_frame_mvs = 0;
+ seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ seq_params->order_hint_info.order_hint_bits_minus_1 = -1;
+ } else {
+ seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
+ seq_params->enable_masked_compound = aom_rb_read_bit(rb);
+ seq_params->enable_warped_motion = aom_rb_read_bit(rb);
+ seq_params->enable_dual_filter = aom_rb_read_bit(rb);
+
+ // The two order-hint dependent tools are only coded when order hints are
+ // enabled; otherwise they are off.
+ seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb);
+ seq_params->order_hint_info.enable_dist_wtd_comp =
+ seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
+ seq_params->order_hint_info.enable_ref_frame_mvs =
+ seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
+
+ // A leading 1 bit means "select per frame" (value 2); otherwise the next
+ // bit is the forced value.
+ if (aom_rb_read_bit(rb)) {
+ seq_params->force_screen_content_tools =
+ 2; // SELECT_SCREEN_CONTENT_TOOLS
+ } else {
+ seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
+ }
+
+ // force_integer_mv is only coded when screen content tools are not
+ // forced off.
+ if (seq_params->force_screen_content_tools > 0) {
+ if (aom_rb_read_bit(rb)) {
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ } else {
+ seq_params->force_integer_mv = aom_rb_read_bit(rb);
+ }
+ } else {
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ }
+ seq_params->order_hint_info.order_hint_bits_minus_1 =
+ seq_params->order_hint_info.enable_order_hint
+ ? aom_rb_read_literal(rb, 3)
+ : -1;
+ }
+
+ seq_params->enable_superres = aom_rb_read_bit(rb);
+ seq_params->enable_cdef = aom_rb_read_bit(rb);
+ seq_params->enable_restoration = aom_rb_read_bit(rb);
+}
+
+// Reads one set of global motion parameters from |rb| into |params|.
+//
+// The matrix entries are coded relative to |ref_params| with
+// aom_rb_read_signed_primitive_refsubexpfin(). |allow_hp| only affects the
+// precision of the translation terms of a TRANSLATION-type model.
+// Returns 1 when av1_get_shear_params() accepts the resulting model and 0
+// otherwise.
+static int read_global_motion_params(WarpedMotionParams *params,
+ const WarpedMotionParams *ref_params,
+ struct aom_read_bit_buffer *rb,
+ int allow_hp) {
+ // Type coding: a first bit equal to IDENTITY ends the type; otherwise a set
+ // second bit selects ROTZOOM, and failing that a third bit chooses between
+ // TRANSLATION (1) and AFFINE (0).
+ TransformationType type = aom_rb_read_bit(rb);
+ if (type != IDENTITY) {
+ if (aom_rb_read_bit(rb))
+ type = ROTZOOM;
+ else
+ type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE;
+ }
+
+ // Start from the default model so entries that are not coded keep their
+ // default values.
+ *params = default_warp_params;
+ params->wmtype = type;
+
+ // NOTE(review): the >= comparisons below rely on the numeric ordering of
+ // the TransformationType enumerators - confirm against its definition.
+ if (type >= ROTZOOM) {
+ // wmmat[2] is coded as an offset from (1 << GM_ALPHA_PREC_BITS); the
+ // matching (1 << WARPEDMODEL_PREC_BITS) is added back after scaling.
+ params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ }
+
+ if (type >= AFFINE) {
+ params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else {
+ // Not coded: derive the second matrix row from the first one.
+ params->wmmat[4] = -params->wmmat[3];
+ params->wmmat[5] = params->wmmat[2];
+ }
+
+ if (type >= TRANSLATION) {
+ // For a pure TRANSLATION model, !allow_hp removes one bit from the coded
+ // range and adds it to the decode factor and precision shift.
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_dec_factor =
+ (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
+ : GM_TRANS_DECODE_FACTOR;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff)) *
+ trans_dec_factor;
+ params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff)) *
+ trans_dec_factor;
+ }
+
+ // av1_get_shear_params() is called for every model type, including
+ // IDENTITY; its verdict is this function's return value.
+ int good_shear_params = av1_get_shear_params(params);
+ if (!good_shear_params) return 0;
+
+ return 1;
+}
+
+// Reads the global motion parameters of every reference from LAST_FRAME to
+// ALTREF_FRAME into cm->global_motion. Each set is coded relative to the same
+// reference's parameters in cm->prev_frame, or to default_warp_params when
+// there is no previous frame. A set rejected by read_global_motion_params() is
+// flagged as invalid. The whole table is finally copied into cm->cur_frame.
+static AOM_INLINE void read_global_motion(AV1_COMMON *cm,
+                                          struct aom_read_bit_buffer *rb) {
+  for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+    WarpedMotionParams *const gm_params = &cm->global_motion[frame];
+    const WarpedMotionParams *ref_params = &default_warp_params;
+    if (cm->prev_frame) ref_params = &cm->prev_frame->global_motion[frame];
+
+    if (!read_global_motion_params(gm_params, ref_params, rb,
+                                   cm->features.allow_high_precision_mv)) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected global motion shear params from aomenc\n");
+#endif
+      gm_params->invalid = 1;
+    }
+
+    // TODO(sarahparker, debargha): The logic in the commented out code below
+    // does not work currently and causes mismatches when resize is on. Fix it
+    // before turning the optimization back on.
+    /*
+    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame);
+    if (cm->width == ref_buf->y_crop_width &&
+        cm->height == ref_buf->y_crop_height) {
+      read_global_motion_params(&cm->global_motion[frame],
+                                &cm->prev_frame->global_motion[frame], rb,
+                                cm->features.allow_high_precision_mv);
+    } else {
+      cm->global_motion[frame] = default_warp_params;
+    }
+    */
+    /*
+    printf("Dec Ref %d [%d/%d]: %d %d %d %d\n",
+           frame, cm->current_frame.frame_number, cm->show_frame,
+           cm->global_motion[frame].wmmat[0],
+           cm->global_motion[frame].wmmat[1],
+           cm->global_motion[frame].wmmat[2],
+           cm->global_motion[frame].wmmat[3]);
+    */
+  }
+
+  memcpy(cm->cur_frame->global_motion, cm->global_motion,
+         REF_FRAMES * sizeof(WarpedMotionParams));
+}
+
+// Drops the reference held by each cm->ref_frame_map entry (through
+// decrease_ref_count() on the common buffer pool) and leaves every entry set
+// to NULL.
+static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) {
+  BufferPool *const pool = cm->buffer_pool;
+
+  for (int idx = 0; idx < REF_FRAMES; ++idx) {
+    RefCntBuffer **const slot = &cm->ref_frame_map[idx];
+    decrease_ref_count(*slot, pool);
+    *slot = NULL;
+  }
+}
+
+// For every reference slot whose bit is set in refresh_frame_flags, records
+// the current frame id as that slot's frame id and marks the slot as valid for
+// referencing. Slots whose bit is clear are left untouched.
+static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int refresh_mask = cm->current_frame.refresh_frame_flags;
+
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    if (((refresh_mask >> slot) & 1) == 0) continue;
+
+    cm->ref_frame_id[slot] = cm->current_frame_id;
+    pbi->valid_for_referencing[slot] = 1;
+  }
+}
+
+// Resets the decoder's frame state for a show_existing_frame that triggers a
+// decoder reset. The current frame is treated as a KEY_FRAME that refreshes
+// every reference slot, all remapped reference indices are invalidated, a
+// pending resync is resolved by clearing the reference map, and the frame id
+// of slot |existing_frame_idx| becomes the current frame id for all refreshed
+// slots. Frame context refresh is disabled.
+static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi,
+                                                 int existing_frame_idx) {
+  AV1_COMMON *const cm = &pbi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  assert(cm->show_existing_frame);
+
+  current_frame->frame_type = KEY_FRAME;
+  // All REF_FRAMES bits set: every slot is refreshed.
+  current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+  for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+    cm->remapped_ref_idx[ref] = INVALID_IDX;
+  }
+
+  if (pbi->need_resync) {
+    reset_ref_frame_map(cm);
+    pbi->need_resync = 0;
+  }
+
+  // Note that the displayed frame must be valid for referencing in order to
+  // have been selected.
+  cm->current_frame_id = cm->ref_frame_id[existing_frame_idx];
+  update_ref_frame_id(pbi);
+
+  cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+}
+
+// Clears the reference map and the order-hint state of the frame buffer pool
+// while holding the pool lock. order_hint and ref_order_hints are zeroed for
+// every buffer that is unreferenced, and also for cm->cur_frame; unused
+// internal frame buffers are zeroed as well.
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+
+  lock_buffer_pool(pool);
+  reset_ref_frame_map(cm);
+  assert(cm->cur_frame->ref_count == 1);
+
+  for (int idx = 0; idx < pool->num_frame_bufs; ++idx) {
+    RefCntBuffer *const buf = &pool->frame_bufs[idx];
+    // Reset all unreferenced frame buffers. cm->cur_frame can be reset too
+    // because we are its sole owner.
+    if (buf->ref_count <= 0 || buf == cm->cur_frame) {
+      buf->order_hint = 0;
+      av1_zero(buf->ref_order_hints);
+    }
+  }
+
+  av1_zero_unused_internal_frame_buffers(&pool->int_frame_buffers);
+  unlock_buffer_pool(pool);
+}
+
+// On success, returns 0. On failure, calls aom_internal_error and does not
+// return.
+static int read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &pbi->dcb.xd;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+ aom_s_frame_info *sframe_info = &pbi->sframe_info;
+ sframe_info->is_s_frame = 0;
+ sframe_info->is_s_frame_at_altref = 0;
+
+ if (!pbi->sequence_header_ready) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "No sequence header");
+ }
+
+ if (seq_params->reduced_still_picture_hdr) {
+ cm->show_existing_frame = 0;
+ cm->show_frame = 1;
+ current_frame->frame_type = KEY_FRAME;
+ if (pbi->sequence_header_changed) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(cm);
+ }
+ features->error_resilient_mode = 1;
+ } else {
+ cm->show_existing_frame = aom_rb_read_bit(rb);
+ pbi->reset_decoder_state = 0;
+
+ if (cm->show_existing_frame) {
+ if (pbi->sequence_header_changed) {
+ aom_internal_error(
+ &pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "New sequence header starts with a show_existing_frame.");
+ }
+ // Show an existing frame directly.
+ const int existing_frame_idx = aom_rb_read_literal(rb, 3);
+ RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
+ if (frame_to_show == NULL) {
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a decoded frame");
+ }
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0) {
+ read_temporal_point_info(cm, rb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
+ /* Compare display_frame_id with ref_frame_id and check valid for
+ * referencing */
+ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
+ pbi->valid_for_referencing[existing_frame_idx] == 0)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ lock_buffer_pool(pool);
+ assert(frame_to_show->ref_count > 0);
+ // cm->cur_frame should be the buffer referenced by the return value
+ // of the get_free_fb() call in assign_cur_frame_new_fb() (called by
+ // av1_receive_compressed_data()), so the ref_count should be 1.
+ assert(cm->cur_frame->ref_count == 1);
+ // assign_frame_buffer_p() decrements ref_count directly rather than
+ // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
+ // already been allocated, it will not be released by
+ // assign_frame_buffer_p()!
+ assert(!cm->cur_frame->raw_frame_buffer.data);
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
+ unlock_buffer_pool(pool);
+
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ cm->show_frame = 1;
+ current_frame->order_hint = frame_to_show->order_hint;
+
+ // Section 6.8.2: It is a requirement of bitstream conformance that when
+ // show_existing_frame is used to show a previous frame, that the value
+ // of showable_frame for the previous frame was equal to 1.
+ if (!frame_to_show->showable_frame) {
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a showable frame");
+ }
+ // Section 6.8.2: It is a requirement of bitstream conformance that when
+ // show_existing_frame is used to show a previous frame with
+ // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the
+ // frame is output via the show_existing_frame mechanism at most once.
+ if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
+
+ cm->film_grain_params = frame_to_show->film_grain_params;
+
+ if (pbi->reset_decoder_state) {
+ show_existing_frame_reset(pbi, existing_frame_idx);
+ } else {
+ current_frame->refresh_frame_flags = 0;
+ }
+
+ return 0;
+ }
+
+ current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2);
+ if (pbi->sequence_header_changed) {
+ if (current_frame->frame_type == KEY_FRAME) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(cm);
+ } else {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Sequence header has changed without a keyframe.");
+ }
+ }
+
+ cm->show_frame = aom_rb_read_bit(rb);
+ if (cm->show_frame == 0) pbi->is_arf_frame_present = 1;
+ if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME)
+ pbi->is_fwd_kf_present = 1;
+ if (cm->current_frame.frame_type == S_FRAME) {
+ sframe_info->is_s_frame = 1;
+ sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1;
+ }
+ if (seq_params->still_picture &&
+ (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Still pictures must be coded as shown keyframes");
+ }
+ cm->showable_frame = current_frame->frame_type != KEY_FRAME;
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0)
+ read_temporal_point_info(cm, rb);
+ } else {
+ // See if this frame can be used as show_existing_frame in future
+ cm->showable_frame = aom_rb_read_bit(rb);
+ }
+ cm->cur_frame->showable_frame = cm->showable_frame;
+ features->error_resilient_mode =
+ frame_is_sframe(cm) ||
+ (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+ ? 1
+ : aom_rb_read_bit(rb);
+ }
+
+ if (current_frame->frame_type == KEY_FRAME && cm->show_frame) {
+ /* All frames need to be marked as not valid for referencing */
+ for (int i = 0; i < REF_FRAMES; i++) {
+ pbi->valid_for_referencing[i] = 0;
+ }
+ }
+ features->disable_cdf_update = aom_rb_read_bit(rb);
+ if (seq_params->force_screen_content_tools == 2) {
+ features->allow_screen_content_tools = aom_rb_read_bit(rb);
+ } else {
+ features->allow_screen_content_tools =
+ seq_params->force_screen_content_tools;
+ }
+
+ if (features->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ features->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
+ } else {
+ features->cur_frame_force_integer_mv = seq_params->force_integer_mv;
+ }
+ } else {
+ features->cur_frame_force_integer_mv = 0;
+ }
+
+ int frame_size_override_flag = 0;
+ features->allow_intrabc = 0;
+ features->primary_ref_frame = PRIMARY_REF_NONE;
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int prev_frame_id = 0;
+ int have_prev_frame_id =
+ !pbi->decoding_first_frame &&
+ !(current_frame->frame_type == KEY_FRAME && cm->show_frame);
+ if (have_prev_frame_id) {
+ prev_frame_id = cm->current_frame_id;
+ }
+ cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+
+ if (have_prev_frame_id) {
+ int diff_frame_id;
+ if (cm->current_frame_id > prev_frame_id) {
+ diff_frame_id = cm->current_frame_id - prev_frame_id;
+ } else {
+ diff_frame_id =
+ (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+ }
+ /* Check current_frame_id for conformance */
+ if (prev_frame_id == cm->current_frame_id ||
+ diff_frame_id >= (1 << (frame_id_length - 1))) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of current_frame_id");
+ }
+ }
+ /* Check if some frames need to be marked as not valid for referencing */
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame_id - (1 << diff_len) > 0) {
+ if (cm->ref_frame_id[i] > cm->current_frame_id ||
+ cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
+ pbi->valid_for_referencing[i] = 0;
+ } else {
+ if (cm->ref_frame_id[i] > cm->current_frame_id &&
+ cm->ref_frame_id[i] < (1 << frame_id_length) +
+ cm->current_frame_id - (1 << diff_len))
+ pbi->valid_for_referencing[i] = 0;
+ }
+ }
+ }
+
+ frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
+
+ current_frame->order_hint = aom_rb_read_literal(
+ rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ current_frame->frame_number = current_frame->order_hint;
+
+ if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+ features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ pbi->buffer_removal_time_present = aom_rb_read_bit(rb);
+ if (pbi->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
+ if (seq_params->operating_point_idc[op_num] == 0 ||
+ (((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1) &&
+ ((seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1))) {
+ cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
+ rb, seq_params->decoder_model_info.buffer_removal_time_length);
+ } else {
+ cm->buffer_removal_times[op_num] = 0;
+ }
+ } else {
+ cm->buffer_removal_times[op_num] = 0;
+ }
+ }
+ }
+ }
+ if (current_frame->frame_type == KEY_FRAME) {
+ if (!cm->show_frame) { // unshown keyframe (forward keyframe)
+ current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ } else { // shown keyframe
+ current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ cm->remapped_ref_idx[i] = INVALID_IDX;
+ }
+ if (pbi->need_resync) {
+ reset_ref_frame_map(cm);
+ pbi->need_resync = 0;
+ }
+ } else {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ if (current_frame->refresh_frame_flags == 0xFF) {
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Intra only frames cannot have refresh flags 0xFF");
+ }
+ if (pbi->need_resync) {
+ reset_ref_frame_map(cm);
+ pbi->need_resync = 0;
+ }
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+ current_frame->refresh_frame_flags =
+ frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
+ }
+ }
+
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
+ // Read all ref frame order hints if error_resilient_mode == 1
+ if (features->error_resilient_mode &&
+ seq_params->order_hint_info.enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ // Read order hint from bit stream
+ unsigned int order_hint = aom_rb_read_literal(
+ rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+ // Get buffer
+ RefCntBuffer *buf = cm->ref_frame_map[ref_idx];
+ if (buf == NULL || order_hint != buf->order_hint) {
+ if (buf != NULL) {
+ lock_buffer_pool(pool);
+ decrease_ref_count(buf, pool);
+ unlock_buffer_pool(pool);
+ cm->ref_frame_map[ref_idx] = NULL;
+ }
+ // If no corresponding buffer exists, allocate a new buffer with all
+ // pixels set to neutral grey.
+ int buf_idx = get_free_fb(cm);
+ if (buf_idx == INVALID_IDX) {
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ buf = &frame_bufs[buf_idx];
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ &buf->buf, seq_params->max_frame_width,
+ seq_params->max_frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, features->byte_alignment,
+ &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0,
+ 0)) {
+ decrease_ref_count(buf, pool);
+ unlock_buffer_pool(pool);
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+ // According to the specification, valid bitstreams are required to
+ // never use missing reference frames so the filling process for
+ // missing frames is not normatively defined and RefValid for missing
+ // frames is set to 0.
+
+ // To make libaom more robust when the bitstream has been corrupted
+ // by the loss of some frames of data, this code adds a neutral grey
+ // buffer in place of missing frames, i.e.
+ //
+ set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
+ //
+ // and allows the frames to be used for referencing, i.e.
+ //
+ pbi->valid_for_referencing[ref_idx] = 1;
+ //
+ // Please note such behavior is not normative and other decoders may
+ // use a different approach.
+ cm->ref_frame_map[ref_idx] = buf;
+ buf->order_hint = order_hint;
+ }
+ }
+ }
+ }
+
+ if (current_frame->frame_type == KEY_FRAME) {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ features->allow_intrabc = aom_rb_read_bit(rb);
+ features->allow_ref_frame_mvs = 0;
+ cm->prev_frame = NULL;
+ } else {
+ features->allow_ref_frame_mvs = 0;
+
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ features->allow_intrabc = aom_rb_read_bit(rb);
+
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+ int frame_refs_short_signaling = 0;
+ // Frame refs short signaling is off when error resilient mode is on.
+ if (seq_params->order_hint_info.enable_order_hint)
+ frame_refs_short_signaling = aom_rb_read_bit(rb);
+
+ if (frame_refs_short_signaling) {
+ // == LAST_FRAME ==
+ const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref];
+
+ // == GOLDEN_FRAME ==
+ const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref];
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any NULLs. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (lst_buf == NULL)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+ if (gld_buf == NULL)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+
+ av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ int ref = 0;
+ if (!frame_refs_short_signaling) {
+ ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any NULLs. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (cm->ref_frame_map[ref] == NULL)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+ cm->remapped_ref_idx[i] = ref;
+ } else {
+ ref = cm->remapped_ref_idx[i];
+ }
+ // Check valid for referencing
+ if (pbi->valid_for_referencing[ref] == 0)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference frame not valid for referencing");
+
+ cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
+ int ref_frame_id =
+ ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
+ (1 << frame_id_length)) %
+ (1 << frame_id_length));
+ // Compare values derived from delta_frame_id_minus_1 and
+ // refresh_frame_flags.
+ if (ref_frame_id != cm->ref_frame_id[ref])
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ }
+
+ if (!features->error_resilient_mode && frame_size_override_flag) {
+ setup_frame_size_with_refs(cm, rb);
+ } else {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ }
+
+ if (features->cur_frame_force_integer_mv) {
+ features->allow_high_precision_mv = 0;
+ } else {
+ features->allow_high_precision_mv = aom_rb_read_bit(rb);
+ }
+ features->interp_filter = read_frame_interp_filter(rb);
+ features->switchable_motion_mode = aom_rb_read_bit(rb);
+ }
+
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ if (features->primary_ref_frame != PRIMARY_REF_NONE &&
+ get_primary_ref_frame_buf(cm) == NULL) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference frame containing this frame's initial "
+ "frame context is unavailable.");
+ }
+
+ if (!(current_frame->frame_type == INTRA_ONLY_FRAME) &&
+ pbi->need_resync != 1) {
+ if (frame_might_allow_ref_frame_mvs(cm))
+ features->allow_ref_frame_mvs = aom_rb_read_bit(rb);
+ else
+ features->allow_ref_frame_mvs = 0;
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+ struct scale_factors *const ref_scale_factors =
+ get_ref_scale_factors(cm, i);
+ av1_setup_scale_factors_for_frame(
+ ref_scale_factors, ref_buf->buf.y_crop_width,
+ ref_buf->buf.y_crop_height, cm->width, cm->height);
+ if ((!av1_is_valid_scale(ref_scale_factors)))
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ }
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+
+ av1_setup_frame_sign_bias(cm);
+
+ cm->cur_frame->frame_type = current_frame->frame_type;
+
+ update_ref_frame_id(pbi);
+
+ const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+ !(features->disable_cdf_update);
+ if (might_bwd_adapt) {
+ features->refresh_frame_context = aom_rb_read_bit(rb)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ } else {
+ features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+ }
+
+ cm->cur_frame->buf.bit_depth = seq_params->bit_depth;
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
+
+ if (pbi->need_resync) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
+ if (features->allow_intrabc) {
+ // Set parameters corresponding to no filtering.
+ struct loopfilter *lf = &cm->lf;
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cm->cdef_info.cdef_bits = 0;
+ cm->cdef_info.cdef_strengths[0] = 0;
+ cm->cdef_info.nb_cdef_strengths = 1;
+ cm->cdef_info.cdef_uv_strengths[0] = 0;
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+
+ read_tile_info(pbi, rb);
+ if (!av1_is_min_tile_width_satisfied(cm)) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Minimum tile width requirement not satisfied");
+ }
+
+ CommonQuantParams *const quant_params = &cm->quant_params;
+ setup_quantization(quant_params, av1_num_planes(cm),
+ cm->seq_params->separate_uv_delta_q, rb);
+ xd->bd = (int)seq_params->bit_depth;
+
+ CommonContexts *const above_contexts = &cm->above_contexts;
+ if (above_contexts->num_planes < av1_num_planes(cm) ||
+ above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+ above_contexts->num_tile_rows < cm->tiles.rows) {
+ av1_free_above_context_buffers(above_contexts);
+ if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+ cm->mi_params.mi_cols,
+ av1_num_planes(cm))) {
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+ }
+
+ if (features->primary_ref_frame == PRIMARY_REF_NONE) {
+ av1_setup_past_independence(cm);
+ }
+
+ setup_segmentation(cm, rb);
+
+ cm->delta_q_info.delta_q_res = 1;
+ cm->delta_q_info.delta_lf_res = 1;
+ cm->delta_q_info.delta_lf_present_flag = 0;
+ cm->delta_q_info.delta_lf_multi = 0;
+ cm->delta_q_info.delta_q_present_flag =
+ quant_params->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
+ if (cm->delta_q_info.delta_q_present_flag) {
+ xd->current_base_qindex = quant_params->base_qindex;
+ cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2);
+ if (!features->allow_intrabc)
+ cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb);
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
+ cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv;
+
+ for (int i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex);
+ xd->lossless[i] =
+ qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+ quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+ quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+ features->coded_lossless = is_coded_lossless(cm, xd);
+ features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
+ setup_segmentation_dequant(cm, xd);
+ if (features->coded_lossless) {
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ }
+ if (features->coded_lossless || !seq_params->enable_cdef) {
+ cm->cdef_info.cdef_bits = 0;
+ cm->cdef_info.cdef_strengths[0] = 0;
+ cm->cdef_info.cdef_uv_strengths[0] = 0;
+ }
+ if (features->all_lossless || !seq_params->enable_restoration) {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+ setup_loopfilter(cm, rb);
+
+ if (!features->coded_lossless && seq_params->enable_cdef) {
+ setup_cdef(cm, rb);
+ }
+ if (!features->all_lossless && seq_params->enable_restoration) {
+ decode_restoration_mode(cm, rb);
+ }
+
+ features->tx_mode = read_tx_mode(rb, features->coded_lossless);
+ current_frame->reference_mode = read_frame_reference_mode(cm, rb);
+
+ av1_setup_skip_mode_allowed(cm);
+ current_frame->skip_mode_info.skip_mode_flag =
+ current_frame->skip_mode_info.skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
+
+ if (frame_might_allow_warped_motion(cm))
+ features->allow_warped_motion = aom_rb_read_bit(rb);
+ else
+ features->allow_warped_motion = 0;
+
+ features->reduced_tx_set_used = aom_rb_read_bit(rb);
+
+ if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame wrongly requests reference frame MVs");
+ }
+
+ if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
+
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ read_film_grain(cm, rb);
+
+#if EXT_TILE_DEBUG
+ if (pbi->ext_tile_debug && cm->tiles.large_scale) {
+ read_ext_tile_info(pbi, rb);
+ av1_set_single_tile_decoding_mode(cm);
+ }
+#endif // EXT_TILE_DEBUG
+ return 0;
+}
+
+// Points the bit reader `rb` at the byte buffer delimited by `data` and
+// `data_end`, resets its bit offset to 0, and installs `error_handler` as the
+// reader's error callback with &pbi->common as the callback data.
+// Returns `rb` so the call can be used inline.
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end) {
+  // Input window and read position.
+  rb->bit_buffer = data;
+  rb->bit_buffer_end = data_end;
+  rb->bit_offset = 0;
+
+  // Error callback and the data handed to it.
+  rb->error_handler_data = &pbi->common;
+  rb->error_handler = error_handler;
+
+  return rb;
+}
+
+// Reads two literals from `rb`, num_bits_width bits and then num_bits_height
+// bits wide, and stores each value plus one in *width and *height.
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+                         int num_bits_height, int *width, int *height) {
+  // The width literal precedes the height literal, and *width is written
+  // before the height literal is read.
+  const int width_minus_1 = aom_rb_read_literal(rb, num_bits_width);
+  *width = width_minus_1 + 1;
+
+  const int height_minus_1 = aom_rb_read_literal(rb, num_bits_height);
+  *height = height_minus_1 + 1;
+}
+
+// Reads a PROFILE_BITS-wide literal from `rb` and returns it converted to
+// BITSTREAM_PROFILE. The value is not range-checked here.
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
+  return (BITSTREAM_PROFILE)aom_rb_read_literal(rb, PROFILE_BITS);
+}
+
+// Calls av1_superres_upscale() on the common state when
+// av1_superres_scaled() reports the frame as scaled; otherwise a no-op.
+static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  if (av1_superres_scaled(cm)) {
+    // A scaled frame is asserted not to be all-lossless.
+    assert(!cm->features.all_lossless);
+    av1_superres_upscale(cm, cm->buffer_pool, 0);
+  }
+}
+
+// Parses the uncompressed frame header from `rb` and sets up the per-frame
+// decoder state that follows from it. When trailing_bits_present is nonzero,
+// av1_check_trailing_bits() is run right after the header. Returns the
+// number of bytes read from `rb` at that point (aom_rb_bytes_read()), both
+// for regular frames and for show_existing_frame headers.
+uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+                                            struct aom_read_bit_buffer *rb,
+                                            int trailing_bits_present) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
+  // The plane count is sampled before the frame header is parsed.
+  const int num_planes = av1_num_planes(cm);
+
+#if CONFIG_BITSTREAM_DEBUG
+  if (!cm->seq_params->order_hint_info.enable_order_hint) {
+    // This is currently used in RTC encoding. cm->show_frame is always 1.
+    assert(cm->show_frame);
+    aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number);
+  } else {
+    aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 +
+                                       cm->show_frame);
+  }
+#endif
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_r();
+#endif
+
+  // Start every reference's global motion from the default warp parameters,
+  // both in the common state and in the current frame buffer.
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    cm->global_motion[ref] = default_warp_params;
+    cm->cur_frame->global_motion[ref] = default_warp_params;
+  }
+  xd->global_motion = cm->global_motion;
+
+  read_uncompressed_header(pbi, rb);
+
+  if (trailing_bits_present) {
+    av1_check_trailing_bits(pbi, rb);
+  }
+
+  // Outside single-tile decoding, drop any non-negative tile selection.
+  if (!cm->tiles.single_tile_decoding) {
+    if (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0) {
+      pbi->dec_tile_row = -1;
+      pbi->dec_tile_col = -1;
+    }
+  }
+
+  // Size of the uncompressed header
+  const uint32_t header_bytes = (uint32_t)aom_rb_bytes_read(rb);
+
+  YV12_BUFFER_CONFIG *const cur_buf = &cm->cur_frame->buf;
+  xd->cur_buf = cur_buf;
+  if (av1_allow_intrabc(cm)) {
+    // Source and destination dimensions are both the current frame's crop
+    // size.
+    av1_setup_scale_factors_for_frame(&cm->sf_identity, cur_buf->y_crop_width,
+                                      cur_buf->y_crop_height,
+                                      cur_buf->y_crop_width,
+                                      cur_buf->y_crop_height);
+  }
+
+  // Showing a frame directly: nothing further to set up.
+  if (cm->show_existing_frame) {
+    if (pbi->reset_decoder_state) {
+      // Use the default frame context values.
+      *cm->fc = *cm->default_frame_context;
+      if (!cm->fc->initialized) {
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Uninitialized entropy context.");
+      }
+    }
+    return header_bytes;
+  }
+
+  cm->mi_params.setup_mi(&cm->mi_params);
+
+  av1_calculate_ref_frame_side(cm);
+  if (cm->features.allow_ref_frame_mvs) {
+    av1_setup_motion_field(cm);
+  }
+
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
+
+  // Seed the frame context from the primary reference frame if one is
+  // signalled, otherwise from the defaults.
+  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
+    *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
+  } else {
+    *cm->fc = *cm->default_frame_context;
+  }
+  if (!cm->fc->initialized) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+  }
+
+  pbi->dcb.corrupted = 0;
+  return header_bytes;
+}
+
+// Once-per-frame initialization: allocates the loop-restoration buffers and
+// per-plane restoration structs when any rst_info entry requests restoration,
+// and reallocates the motion-compensation temporary buffer when the size
+// needed for the current bit depth differs from the one already held.
+static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  // All three rst_info entries are examined regardless of the plane count.
+  const int restoration_off =
+      cm->rst_info[0].frame_restoration_type == RESTORE_NONE &&
+      cm->rst_info[1].frame_restoration_type == RESTORE_NONE &&
+      cm->rst_info[2].frame_restoration_type == RESTORE_NONE;
+  if (!restoration_off) {
+    av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true);
+    for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+      // The last argument is nonzero for every plane except plane 0.
+      const int not_first_plane = plane > 0;
+      av1_alloc_restoration_struct(cm, &cm->rst_info[plane], not_first_plane);
+    }
+  }
+
+  // The required size doubles when high bit depth is in use.
+  const int use_highbd = cm->seq_params->use_highbitdepth;
+  const int wanted_size = MC_TEMP_BUF_PELS << use_highbd;
+  if (pbi->td.mc_buf_size == wanted_size) return;
+
+  av1_free_mc_tmp_buf(&pbi->td);
+  allocate_mc_tmp_buf(cm, &pbi->td, wanted_size, use_highbd);
+}
+
+// Decodes the tiles start_tile..end_tile of a tile group from the bytes
+// between `data` and `data_end`, storing the decode routine's returned
+// pointer in *p_data_end. When initialize_flag is nonzero,
+// setup_frame_info() runs first. If end_tile is the last tile of the frame,
+// the frame is then wrapped up: loop filter, CDEF, superres and loop
+// restoration as enabled, followed by the frame-context and frame-number
+// updates.
+void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
+                                    const uint8_t *data_end,
+                                    const uint8_t **p_data_end, int start_tile,
+                                    int end_tile, int initialize_flag) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->dcb.xd;
+  CommonTileParams *const tiles = &cm->tiles;
+
+  xd->error_info = cm->error;
+  if (initialize_flag) setup_frame_info(pbi);
+  const int num_planes = av1_num_planes(cm);
+
+  // Threaded decoding needs more than one thread and is not used for
+  // large-scale tiles unless ext_tile_debug is set.
+  const int tiles_in_group = end_tile - start_tile + 1;
+  const int threading_allowed =
+      pbi->max_threads > 1 && (!tiles->large_scale || pbi->ext_tile_debug);
+
+  if (threading_allowed && pbi->row_mt) {
+    *p_data_end =
+        decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
+  } else if (threading_allowed && tiles_in_group > 1) {
+    *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
+  } else {
+    *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
+  }
+
+  // If the bit stream is monochrome, set the U and V buffers to a constant.
+  if (num_planes < 3) {
+    set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1);
+  }
+
+  // Everything below runs only after the final tile of the frame.
+  const int last_tile_idx = tiles->rows * tiles->cols - 1;
+  if (end_tile != last_tile_idx) return;
+
+  av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync,
+                         pbi->num_workers, 1);
+  av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers);
+
+  const int run_post_filters =
+      !(cm->features.allow_intrabc || tiles->single_tile_decoding);
+  if (run_post_filters) {
+    if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+                               pbi->tile_workers, pbi->num_workers,
+                               &pbi->lf_row_sync, 0);
+    }
+
+    const int use_cdef =
+        !pbi->skip_loop_filter && !cm->features.coded_lossless &&
+        (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] ||
+         cm->cdef_info.cdef_uv_strengths[0]);
+    const int use_superres = av1_superres_scaled(cm);
+    const int use_loop_restoration =
+        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
+    // With neither CDEF nor superres, loop restoration takes its optimized
+    // path and the boundary-line saves are skipped.
+    const int optimized_loop_restoration = !use_cdef && !use_superres;
+    // Frame border extension is not required in the decoder
+    // as it happens in extend_mc_border().
+    const int do_extend_border_mt = 0;
+
+    if (!optimized_loop_restoration) {
+      if (use_loop_restoration) {
+        av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+      }
+
+      if (use_cdef) {
+        if (pbi->num_workers > 1) {
+          av1_cdef_frame_mt(cm, xd, pbi->cdef_worker, pbi->tile_workers,
+                            &pbi->cdef_sync, pbi->num_workers,
+                            av1_cdef_init_fb_row_mt, do_extend_border_mt);
+        } else {
+          av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+        }
+      }
+
+      superres_post_decode(pbi);
+
+      if (use_loop_restoration) {
+        av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
+      }
+    }
+
+    // The restoration filter call is the same on both paths; only the
+    // optimized_loop_restoration argument differs.
+    if (use_loop_restoration) {
+      if (pbi->num_workers > 1) {
+        av1_loop_restoration_filter_frame_mt(
+            (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
+            pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
+            &pbi->lr_ctxt, do_extend_border_mt);
+      } else {
+        av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
+                                          cm, optimized_loop_restoration,
+                                          &pbi->lr_ctxt);
+      }
+    }
+  }
+
+  // The backward context update is tied to the error branch by else-if, so
+  // it is skipped for a corrupted frame even if aom_internal_error()
+  // returns.
+  if (pbi->dcb.corrupted) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data is corrupted.");
+  } else if (cm->features.refresh_frame_context ==
+             REFRESH_FRAME_CONTEXT_BACKWARD) {
+    assert(pbi->context_update_tile_id < pbi->allocated_tiles);
+    *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx;
+    av1_reset_cdf_symbol_counters(cm->fc);
+  }
+
+#if CONFIG_INSPECTION
+  if (pbi->inspect_cb != NULL) {
+    (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
+  }
+#endif
+
+  // Non frame parallel update frame context here.
+  if (!tiles->large_scale) {
+    cm->cur_frame->frame_context = *cm->fc;
+  }
+
+  // Without order hints, frame_number advances once per shown frame.
+  if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) {
+    ++cm->current_frame.frame_number;
+  }
+}
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
new file mode 100644
index 0000000000..46ae475ff5
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations: the prototypes below only use these types through
+// pointers.
+struct AV1Decoder;
+struct aom_read_bit_buffer;
+struct ThreadData;
+
+// Reads the middle part of the sequence header OBU (from
+// frame_width_bits_minus_1 to enable_restoration) into seq_params.
+// Reports errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params);
+
+// NOTE(review): presumably reads the frame dimensions from rb using
+// num_bits_width / num_bits_height bits and stores them in *width / *height;
+// confirm against the definition.
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+ int num_bits_height, int *width, int *height);
+// NOTE(review): presumably reads the bitstream profile from rb; confirm
+// against the definition.
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
+
+// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on
+// failure.
+// NOTE(review): confirm the exact error field against the definition.
+int av1_check_trailing_bits(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb);
+
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ int trailing_bits_present);
+
+// NOTE(review): undocumented here. Judging by the parameters, it decodes the
+// tiles start_tile..end_tile from [data, data_end) and reports the end of the
+// consumed data through *p_data_end; confirm against the definition.
+void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end, int start_tile,
+ int end_tile, int initialize_flag);
+
+// Implements the color_config() function in the spec. Reports errors by
+// calling rb->error_handler() or aom_internal_error().
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info);
+
+// Implements the timing_info() function in the spec. Reports errors by calling
+// rb->error_handler() or aom_internal_error().
+void av1_read_timing_info_header(aom_timing_info_t *timing_info,
+ struct aom_internal_error_info *error,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the decoder_model_info() function in the spec. Reports errors by
+// calling rb->error_handler().
+void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the operating_parameters_info() function in the spec. Reports
+// errors by calling rb->error_handler().
+void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
+ int buffer_delay_length,
+ struct aom_read_bit_buffer *rb);
+
+// NOTE(review): presumably sets up rb to read from [data, data_end) and
+// returns rb; confirm against the definition.
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+ struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end);
+
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
new file mode 100644
index 0000000000..bb0ccf5fd8
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -0,0 +1,1586 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define ACCT_STR __func__
+
+#define DEC_MISMATCH_DEBUG 0
+
+// Decodes one intra prediction mode symbol (alphabet size INTRA_MODES) from
+// the reader using the supplied CDF.
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
+  const int symbol = aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
+  return (PREDICTION_MODE)symbol;
+}
+
+// Reads the CDEF strength for the CDEF unit that contains the current block,
+// unless it was already read for that unit or the current block is skipped.
+// Nothing is read for coded-lossless frames or when intrabc is allowed.
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
+  const int skip_txfm = xd->mi[0]->skip_txfm;
+  if (cm->features.coded_lossless) return;
+  if (cm->features.allow_intrabc) {
+    assert(cm->cdef_info.cdef_bits == 0);
+    return;
+  }
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // On entering a new superblock, forget which of its CDEF units already had
+  // their strength read.
+  const int sb_mask = cm->seq_params->mib_size - 1;
+  if ((mi_row & sb_mask) == 0 && (mi_col & sb_mask) == 0) {
+    for (int i = 0; i < 4; ++i) xd->cdef_transmitted[i] = false;
+  }
+
+  // CDEF unit size is 64x64 irrespective of the superblock size.
+  const int cdef_unit_mis = 1 << (6 - MI_SIZE_LOG2);
+
+  // Index of this CDEF unit inside the superblock. Only 128x128 superblocks
+  // use an index other than 0.
+  int unit_idx = 0;
+  if (cm->seq_params->sb_size == BLOCK_128X128) {
+    const int unit_row = (mi_row & cdef_unit_mis) != 0;
+    const int unit_col = (mi_col & cdef_unit_mis) != 0;
+    unit_idx = unit_col + 2 * unit_row;
+  }
+
+  // The strength is read at the first non-skip coding block of the unit.
+  if (xd->cdef_transmitted[unit_idx] || skip_txfm) return;
+
+  // The value is stored in the MB_MODE_INFO of the first block of the unit.
+  const int align_mask = ~(cdef_unit_mis - 1);
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int grid_idx =
+      get_mi_grid_idx(mi_params, mi_row & align_mask, mi_col & align_mask);
+  MB_MODE_INFO *const first_mbmi = mi_params->mi_grid_base[grid_idx];
+  first_mbmi->cdef_strength =
+      aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR);
+  xd->cdef_transmitted[unit_idx] = true;
+}
+
+// Reads the signed qindex delta (before scaling by the delta-q resolution).
+// A value is only coded for the block at the top-left of a superblock, and
+// not when that block spans the whole superblock with skip_txfm set; in all
+// other cases 0 is returned without reading anything.
+static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
+                             aom_reader *r, MB_MODE_INFO *const mbmi) {
+  const int sb_mask = cm->seq_params->mib_size - 1;
+  const int at_sb_origin =
+      (xd->mi_col & sb_mask) == 0 && (xd->mi_row & sb_mask) == 0;
+  const int is_skipped_full_sb =
+      mbmi->bsize == cm->seq_params->sb_size && mbmi->skip_txfm != 0;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  int delta = 0;
+
+  if (!is_skipped_full_sb && at_sb_origin) {
+    int magnitude =
+        aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
+
+    // Large magnitudes escape to an explicit (length, value) pair.
+    if (magnitude >= DELTA_Q_SMALL) {
+      const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+      const int thr = (1 << rem_bits) + 1;
+      magnitude = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+    }
+
+    // The sign bit is only present for a nonzero magnitude.
+    int negative = 1;
+    if (magnitude) negative = aom_read_bit(r, ACCT_STR);
+
+    delta = negative ? -magnitude : magnitude;
+  }
+  return delta;
+}
+// Reads a signed loop-filter level delta (before scaling by the delta-lf
+// resolution) using `cdf`. Returns 0 without reading anything unless the block
+// sits at the top-left of a superblock and is not a skipped block that spans
+// the whole superblock.
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+                              aom_cdf_prob *const cdf,
+                              const MB_MODE_INFO *const mbmi, int mi_col,
+                              int mi_row) {
+  const int sb_mask = cm->seq_params->mib_size - 1;
+
+  if (mbmi->bsize == cm->seq_params->sb_size && mbmi->skip_txfm != 0) return 0;
+  if ((mi_col & sb_mask) != 0 || (mi_row & sb_mask) != 0) return 0;
+
+  int magnitude = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
+  if (magnitude >= DELTA_LF_SMALL) {
+    // Escape: a 3-bit length followed by that many bits of value.
+    const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+    const int thr = (1 << rem_bits) + 1;
+    magnitude = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+  }
+
+  // No sign bit is coded for a zero delta.
+  if (magnitude == 0) return 0;
+  return aom_read_bit(r, ACCT_STR) ? -magnitude : magnitude;
+}
+
+// Reads the chroma intra mode, conditioned on the luma mode `y_mode`. When CfL
+// is not allowed the symbol alphabet is one entry shorter.
+static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx,
+                                             aom_reader *r,
+                                             CFL_ALLOWED_TYPE cfl_allowed,
+                                             PREDICTION_MODE y_mode) {
+  aom_cdf_prob *const cdf = ec_ctx->uv_mode_cdf[cfl_allowed][y_mode];
+  const int num_modes = UV_INTRA_MODES - !cfl_allowed;
+  return (UV_PREDICTION_MODE)aom_read_symbol(r, cdf, num_modes, ACCT_STR);
+}
+
+// Reads the CfL joint sign and the U/V alpha magnitudes. The joint sign is
+// written to *signs_out; the return value packs the U magnitude in the bits
+// above CFL_ALPHABET_SIZE_LOG2 and the V magnitude in the low bits.
+static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
+                               int8_t *signs_out) {
+  const int8_t joint_sign =
+      aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
+  uint8_t alpha_u = 0;
+  uint8_t alpha_v = 0;
+
+  // A magnitude is only present in the bitstream when its sign is nonzero.
+  if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+    alpha_u =
+        (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u");
+  }
+  if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+    alpha_v =
+        (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
+  }
+
+  *signs_out = joint_sign;
+  return (uint8_t)((alpha_u << CFL_ALPHABET_SIZE_LOG2) + alpha_v);
+}
+
+// Reads the inter-intra mode symbol using the CDF selected by `size_group`.
+static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r,
+                                            int size_group) {
+  aom_cdf_prob *const cdf = xd->tile_ctx->interintra_mode_cdf[size_group];
+  return (INTERINTRA_MODE)aom_read_symbol(r, cdf, INTERINTRA_MODES, ACCT_STR);
+}
+
+// Reads a single-reference inter mode as a cascade of up to three binary
+// symbols. At each stage a decoded 0 selects that stage's mode (NEWMV, then
+// GLOBALMV, then NEARESTMV); if all three decode as 1 the mode is NEARMV.
+// Each stage takes its context from a different bit field of `ctx`.
+static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r,
+                                       int16_t ctx) {
+  const int16_t newmv_ctx = ctx & NEWMV_CTX_MASK;
+  if (aom_read_symbol(r, ec_ctx->newmv_cdf[newmv_ctx], 2, ACCT_STR) == 0) {
+    return NEWMV;
+  }
+
+  const int16_t globalmv_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (aom_read_symbol(r, ec_ctx->zeromv_cdf[globalmv_ctx], 2, ACCT_STR) == 0) {
+    return GLOBALMV;
+  }
+
+  const int16_t refmv_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  const int refmv_bit =
+      aom_read_symbol(r, ec_ctx->refmv_cdf[refmv_ctx], 2, ACCT_STR);
+  return (refmv_bit == 0) ? NEARESTMV : NEARMV;
+}
+
+// Reads mbmi->ref_mv_idx. One binary symbol is read per candidate position
+// that exists in the reference MV list; a 0 stops the scan. NEWMV/NEW_NEWMV
+// scan positions 0..1, modes containing NEARMV scan positions 1..2 and store
+// the index offset by one.
+static void read_drl_idx(FRAME_CONTEXT *ec_ctx, DecoderCodingBlock *dcb,
+                         MB_MODE_INFO *mbmi, aom_reader *r) {
+  MACROBLOCKD *const xd = &dcb->xd;
+  const uint8_t ref_type = av1_ref_frame_type(mbmi->ref_frame);
+  mbmi->ref_mv_idx = 0;
+
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+    for (int idx = 0; idx < 2; ++idx) {
+      if (dcb->ref_mv_count[ref_type] <= idx + 1) continue;
+      const uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_type], idx);
+      const int more =
+          aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+      mbmi->ref_mv_idx = idx + more;
+      if (!more) return;
+    }
+  }
+
+  if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    // Offset the NEARESTMV mode.
+    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+    // mode is factored in.
+    for (int idx = 1; idx < 3; ++idx) {
+      if (dcb->ref_mv_count[ref_type] <= idx + 1) continue;
+      const uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_type], idx);
+      const int more =
+          aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+      mbmi->ref_mv_idx = idx + more - 1;
+      if (!more) return;
+    }
+  }
+}
+
+// Reads the motion mode of an inter block. Returns SIMPLE_TRANSLATION without
+// reading when motion modes are not switchable, the block uses skip mode, or
+// no other mode is allowed. If OBMC_CAUSAL is the last allowed mode a binary
+// symbol is read; otherwise a symbol over all MOTION_MODES is read.
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    MB_MODE_INFO *mbmi, aom_reader *r) {
+  if (cm->features.switchable_motion_mode == 0 || mbmi->skip_mode) {
+    return SIMPLE_TRANSLATION;
+  }
+
+  const MOTION_MODE last_allowed = motion_mode_allowed(
+      xd->global_motion, xd, mbmi, cm->features.allow_warped_motion);
+  if (last_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
+
+  int symbol;
+  if (last_allowed == OBMC_CAUSAL) {
+    symbol =
+        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR);
+  } else {
+    symbol = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
+                             MOTION_MODES, ACCT_STR);
+  }
+  return (MOTION_MODE)(SIMPLE_TRANSLATION + symbol);
+}
+
+// Reads a compound inter mode. The decoded symbol is an offset from
+// NEAREST_NEARESTMV.
+static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r,
+                                                int16_t ctx) {
+  aom_cdf_prob *const cdf = xd->tile_ctx->inter_compound_mode_cdf[ctx];
+  const int offset = aom_read_symbol(r, cdf, INTER_COMPOUND_MODES, ACCT_STR);
+  const int mode = NEAREST_NEARESTMV + offset;
+  assert(is_inter_compound_mode(mode));
+  return mode;
+}
+
+// Maps a coded value `diff` back to a value in [0, max) given the prediction
+// `ref`. Small coded values alternate above and below `ref` (odd -> above,
+// even -> below) for as long as both sides stay in range; larger coded values
+// map directly onto the remaining values on the longer side.
+int av1_neg_deinterleave(int diff, int ref, int max) {
+  if (ref == 0) return diff;
+  if (ref >= max - 1) return max - diff - 1;
+
+  const int ref_in_lower_half = 2 * ref < max;
+  // Largest coded value that still falls in the alternating region.
+  const int interleaved_limit =
+      ref_in_lower_half ? 2 * ref : 2 * (max - ref - 1);
+
+  if (diff <= interleaved_limit) {
+    return (diff & 1) ? ref + ((diff + 1) >> 1) : ref - (diff >> 1);
+  }
+  return ref_in_lower_half ? diff : max - (diff + 1);
+}
+
+// Returns the segment id of the current block. With `skip` set, the spatial
+// prediction is returned and nothing is read. Otherwise a symbol is read and
+// mapped around the prediction by av1_neg_deinterleave(); an id outside
+// [0, last_active_segid] is reported as a corrupt frame.
+static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                           aom_reader *r, int skip) {
+  int cdf_num;
+  const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0);
+  if (skip) return pred;
+
+  struct segmentation *const seg = &cm->seg;
+  aom_cdf_prob *const pred_cdf =
+      xd->tile_ctx->seg.spatial_pred_seg_cdf[cdf_num];
+  const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
+  const int last_active = seg->last_active_segid;
+  const int segment_id = av1_neg_deinterleave(coded_id, pred, last_active + 1);
+
+  const int in_range = segment_id >= 0 && segment_id <= last_active;
+  if (!in_range) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Corrupted segment_ids");
+  }
+  return segment_id;
+}
+
+// Returns the smallest segment id found in the x_mis by y_mis region of
+// `segment_ids` that starts at `mi_offset`, using cm->mi_params.mi_cols as
+// the row stride.
+static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int min_id = INT_MAX;
+
+  for (int y = 0; y < y_mis; y++) {
+    for (int x = 0; x < x_mis; x++) {
+      const int id = segment_ids[mi_offset + y * cm->mi_params.mi_cols + x];
+      if (!(min_id < id)) min_id = id;
+    }
+  }
+
+  assert(min_id >= 0 && min_id < MAX_SEGMENTS);
+  return min_id;
+}
+
+// Reads the segment id of a block in an intra frame and writes it to the
+// current frame's segmentation map for the part of the block that lies inside
+// the frame. Returns 0 when segmentation is disabled.
+static int read_intra_segment_id(AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+                                 aom_reader *r, int skip) {
+  struct segmentation *const seg = &cm->seg;
+  if (!seg->enabled) return 0;  // Default for disabled segmentation
+  assert(seg->update_map && !seg->temporal_update);
+
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_stride = cm->mi_params.mi_cols;
+  const int mi_offset = xd->mi_row * mi_stride + xd->mi_col;
+
+  // Clip the block extent (in MI units) to the frame boundary.
+  const int x_mis =
+      AOMMIN(mi_params->mi_cols - xd->mi_col, mi_size_wide[bsize]);
+  const int y_mis =
+      AOMMIN(mi_params->mi_rows - xd->mi_row, mi_size_high[bsize]);
+
+  const int segment_id = read_segment_id(cm, xd, r, skip);
+  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+                 segment_id);
+  return segment_id;
+}
+
+// Fills an x_mis by y_mis region of `current_segment_ids` starting at
+// `mi_offset`: copied from `last_segment_ids` when that map exists, zeroed
+// otherwise. The row stride is mi_params->mi_cols.
+static void copy_segment_id(const CommonModeInfoParams *const mi_params,
+                            const uint8_t *last_segment_ids,
+                            uint8_t *current_segment_ids, int mi_offset,
+                            int x_mis, int y_mis) {
+  const int stride = mi_params->mi_cols;
+  assert(!last_segment_ids || last_segment_ids != current_segment_ids);
+
+  for (int y = 0; y < y_mis; y++) {
+    const int row_offset = mi_offset + y * stride;
+    if (last_segment_ids) {
+      memcpy(&current_segment_ids[row_offset], &last_segment_ids[row_offset],
+             sizeof(current_segment_ids[0]) * x_mis);
+    } else {
+      memset(&current_segment_ids[row_offset], 0,
+             sizeof(current_segment_ids[0]) * x_mis);
+    }
+  }
+}
+
+// Returns the segment id predicted from the previous frame's segmentation map
+// for the given region, or 0 when no such map is available.
+static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
+                                    int x_mis, int y_mis) {
+  if (!cm->last_frame_seg_map) return 0;
+  return dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis,
+                            y_mis);
+}
+
+// Determines the segment id of the current block in an inter frame and
+// records it in the current frame's segmentation map.
+//
+// Returns 0 when segmentation is disabled. When the map is not updated for
+// this frame, the previous frame's ids are copied into the current map and
+// the predicted id is returned. Otherwise the id is either spatially
+// predicted (skipped block, post-skip call), temporally predicted (when
+// temporal_update is on and the flag read from the bitstream says so), or
+// read explicitly via read_segment_id().
+//
+// preskip: nonzero for the call made before the skip flag is read; in that
+// case 0 is returned immediately unless seg->segid_preskip is set.
+static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int preskip, aom_reader *r) {
+ struct segmentation *const seg = &cm->seg;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+ const int bw = mi_size_wide[mbmi->bsize];
+ const int bh = mi_size_high[mbmi->bsize];
+
+ // Block extent in MI units, clipped to the frame boundary.
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+
+ // Map not updated in this frame: carry over the previous frame's ids.
+ if (!seg->update_map) {
+ copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map,
+ mi_offset, x_mis, y_mis);
+ return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ }
+
+ uint8_t segment_id;
+ const int mi_stride = cm->mi_params.mi_cols;
+ if (preskip) {
+ // Pre-skip call, but the id is not coded before the skip flag.
+ if (!seg->segid_preskip) return 0;
+ } else {
+ if (mbmi->skip_txfm) {
+ if (seg->temporal_update) {
+ mbmi->seg_id_predicted = 0;
+ }
+ // skip == 1: read_segment_id() returns the spatial prediction without
+ // reading anything from the bitstream.
+ segment_id = read_segment_id(cm, xd, r, 1);
+ set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+ segment_id);
+ return segment_id;
+ }
+ }
+
+ if (seg->temporal_update) {
+ // A binary flag selects between the temporally predicted id and an
+ // explicitly coded one.
+ const uint8_t ctx = av1_get_pred_context_seg_id(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
+ mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
+ if (mbmi->seg_id_predicted) {
+ segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ } else {
+ segment_id = read_segment_id(cm, xd, r, 0);
+ }
+ } else {
+ segment_id = read_segment_id(cm, xd, r, 0);
+ }
+ set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+ segment_id);
+ return segment_id;
+}
+
+// Reads the skip_mode flag. Returns 0 without reading when skip mode is off
+// for the frame, the segment has SEG_LVL_SKIP active, the block size does not
+// allow compound references, or a segment feature forces a single reference.
+static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
+  struct segmentation *const seg = &cm->seg;
+
+  if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0;
+
+  // These features imply single-reference mode, while skip mode implies
+  // compound reference. Hence, the two are mutually exclusive.
+  // In other words, skip_mode is implicitly 0 here.
+  const int single_ref_forced =
+      segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) ||
+      segfeature_active(seg, segment_id, SEG_LVL_GLOBALMV);
+  if (single_ref_forced) return 0;
+
+  const int ctx = av1_get_skip_mode_context(xd);
+  aom_cdf_prob *const cdf = xd->tile_ctx->skip_mode_cdfs[ctx];
+  return aom_read_symbol(r, cdf, 2, ACCT_STR);
+}
+
+// Reads the skip_txfm flag. When the segment has SEG_LVL_SKIP active the flag
+// is implied to be 1 and nothing is read.
+static int read_skip_txfm(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  const int ctx = av1_get_skip_txfm_context(xd);
+  aom_cdf_prob *const cdf = xd->tile_ctx->skip_txfm_cdfs[ctx];
+  return aom_read_symbol(r, cdf, 2, ACCT_STR);
+}
+
+// Merges two sorted lists in place into colors[0...n_colors-1]: the cached
+// colors (cached_colors[0...n_cached_colors-1]) and the transmitted colors
+// (colors[n_cached_colors...n_colors-1]). On ties the cached color is taken
+// first. Does nothing when there are no cached colors.
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors,
+                         int n_colors, int n_cached_colors) {
+  if (n_cached_colors == 0) return;
+
+  int next_cached = 0;
+  int next_sent = n_cached_colors;
+  for (int out = 0; out < n_colors; ++out) {
+    int take_cached = 0;
+    if (next_cached < n_cached_colors) {
+      take_cached = next_sent >= n_colors ||
+                    cached_colors[next_cached] <= colors[next_sent];
+    }
+
+    if (take_cached) {
+      colors[out] = cached_colors[next_cached++];
+    } else {
+      assert(next_sent < n_colors);
+      colors[out] = colors[next_sent++];
+    }
+  }
+}
+
+// Decodes the plane-0 (Y) palette into pmi->palette_colors[0..n-1], where
+// n = pmi->palette_size[0]. Colors come from two sources: entries of the
+// palette cache returned by av1_get_palette_cache(), each selected by one
+// flag bit, and colors coded explicitly in the bitstream. The two sorted
+// lists are then combined by merge_colors().
+static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi, aom_reader *r) {
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ const int n = pmi->palette_size[0];
+ int idx = 0;
+ // One flag bit per cache entry; a set bit reuses that entry. Stops as soon
+ // as n colors have been collected.
+ for (int i = 0; i < n_cache && idx < n; ++i)
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) {
+ // Remaining colors are coded explicitly and stored after the slots
+ // reserved for the cached ones.
+ const int n_cached_colors = idx;
+ // First explicit color: a raw bit_depth-bit literal.
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
+ if (idx < n) {
+ // Later colors are coded as (delta - 1) in `bits` bits relative to the
+ // previous color; `bits` shrinks as the remaining value range shrinks.
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1;
+ for (; idx < n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR) + 1;
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+ }
+ merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors);
+ } else {
+ // Every color came from the cache.
+ memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0]));
+ }
+}
+
+// Decodes the chroma palettes, each with n = pmi->palette_size[1] colors.
+// U colors are stored at pmi->palette_colors[PALETTE_MAX_SIZE...] and are
+// built from cache entries plus explicitly coded colors, as for Y, except
+// that the coded deltas are used as-is (no +1). V colors are stored at
+// pmi->palette_colors[2 * PALETTE_MAX_SIZE...] and are either delta coded
+// with a sign bit and wrap-around, or sent as raw bit_depth-bit literals.
+static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi,
+ aom_reader *r) {
+ const int n = pmi->palette_size[1];
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int idx = 0;
+ // One flag bit per cache entry; a set bit reuses that entry.
+ for (int i = 0; i < n_cache && idx < n; ++i)
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) {
+ const int n_cached_colors = idx;
+ // From here on idx indexes directly into the U section of palette_colors.
+ idx += PALETTE_MAX_SIZE;
+ // First explicit color: a raw bit_depth-bit literal.
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
+ if (idx < PALETTE_MAX_SIZE + n) {
+ // Later colors are deltas from the previous color, coded in `bits` bits;
+ // `bits` shrinks as the remaining value range shrinks.
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1];
+ for (; idx < PALETTE_MAX_SIZE + n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR);
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+ }
+ merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n,
+ n_cached_colors);
+ } else {
+ // Every U color came from the cache.
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors,
+ n * sizeof(cached_colors[0]));
+ }
+
+ // V channel colors.
+ if (aom_read_bit(r, ACCT_STR)) { // Delta encoding.
+ const int min_bits_v = bit_depth - 4;
+ const int max_val = 1 << bit_depth;
+ int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ for (int i = 1; i < n; ++i) {
+ int delta = aom_read_literal(r, bits, ACCT_STR);
+ // A sign bit follows only for a nonzero delta.
+ if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta;
+ int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
+ // Wrap the result into [0, max_val).
+ if (val < 0) val += max_val;
+ if (val >= max_val) val -= max_val;
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
+ }
+ } else {
+ // Raw encoding: every V color is a bit_depth-bit literal.
+ for (int i = 0; i < n; ++i) {
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ }
+ }
+}
+
+// Reads the palette flags, sizes and colors of the current block. A luma
+// palette is only signalled for DC_PRED blocks; a chroma palette only for
+// UV_DC_PRED blocks that are chroma references in a frame with more than one
+// plane. Palette sizes are coded as (size - 2).
+static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                   aom_reader *r) {
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+  if (mbmi->mode == DC_PRED) {
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    const int use_y_palette = aom_read_symbol(
+        r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][mode_ctx], 2, ACCT_STR);
+    if (use_y_palette) {
+      const int size_sym =
+          aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+                          PALETTE_SIZES, ACCT_STR);
+      pmi->palette_size[0] = size_sym + 2;
+      read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r);
+    }
+  }
+
+  const int chroma_eligible =
+      num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref;
+  if (!chroma_eligible) return;
+
+  // The chroma flag is conditioned on whether a luma palette is in use.
+  const int uv_mode_ctx = (pmi->palette_size[0] > 0);
+  const int use_uv_palette = aom_read_symbol(
+      r, xd->tile_ctx->palette_uv_mode_cdf[uv_mode_ctx], 2, ACCT_STR);
+  if (use_uv_palette) {
+    const int size_sym =
+        aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+                        PALETTE_SIZES, ACCT_STR);
+    pmi->palette_size[1] = size_sym + 2;
+    read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r);
+  }
+}
+
+// Reads an angle delta in [-MAX_ANGLE_DELTA, MAX_ANGLE_DELTA]; the coded
+// symbol is the delta offset by MAX_ANGLE_DELTA.
+static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
+  return aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR) -
+         MAX_ANGLE_DELTA;
+}
+
+// Reads whether the current block uses filter intra and, if so, which filter
+// intra mode. When filter intra is not allowed for the block the flag is
+// cleared and nothing is read.
+static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
+                                        MACROBLOCKD *const xd, aom_reader *r) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  FILTER_INTRA_MODE_INFO *const fi_info = &mbmi->filter_intra_mode_info;
+
+  if (!av1_filter_intra_allowed(cm, mbmi)) {
+    fi_info->use_filter_intra = 0;
+    return;
+  }
+
+  fi_info->use_filter_intra = aom_read_symbol(
+      r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR);
+  if (!fi_info->use_filter_intra) return;
+
+  fi_info->filter_intra_mode = aom_read_symbol(
+      r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
+}
+
+// Determines the transform type for the transform block at (blk_row, blk_col)
+// and stores it in xd->tx_type_map. The entry defaults to DCT_DCT and is only
+// overwritten by a value read from the bitstream when the block is not
+// skipped, the segment's qindex is nonzero, and the transform set selected
+// for (tx_size, inter/intra, reduced_tx_set_used) has more than one type.
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+ int blk_col, TX_SIZE tx_size, aom_reader *r) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ uint8_t *tx_type =
+ &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
+ *tx_type = DCT_DCT;
+
+ // No need to read transform type if block is skipped.
+ if (mbmi->skip_txfm ||
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ return;
+
+ // No need to read transform type for lossless mode(qindex==0).
+ const int qindex = xd->qindex[mbmi->segment_id];
+ if (qindex == 0) return;
+
+ const int inter_block = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) >
+ 1) {
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, inter_block, cm->features.reduced_tx_set_used);
+ const int eset =
+ get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used);
+ // eset == 0 should correspond to a set with only DCT_DCT and
+ // there is no need to read the tx_type
+ assert(eset != 0);
+
+ // CDFs are indexed by the square transform size mapped from tx_size.
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (inter_block) {
+ // The decoded symbol is mapped to a transform type through
+ // av1_ext_tx_inv for this set.
+ *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+ r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
+ } else {
+ // Intra CDFs are additionally conditioned on the prediction mode; for
+ // filter intra blocks the filter mode is first mapped to an intra
+ // direction.
+ const PREDICTION_MODE intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode]
+ : mbmi->mode;
+ *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+ r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
+ av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
+ }
+ }
+}
+
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, MvSubpelPrecision precision);
+
+static INLINE int is_mv_valid(const MV *mv);
+
+// Reads an intrabc displacement vector into *mv, predicted from ref_mv, and
+// forces it to whole-pixel precision. Returns 1 when the vector passes both
+// is_mv_valid() and av1_is_dv_valid(), 0 otherwise.
+static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
+                            const int_mv *ref_mv, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, aom_reader *r) {
+  MV *const dv = &mv->as_mv;
+  read_mv(r, dv, &ref_mv->as_mv, &xd->tile_ctx->ndvc, MV_SUBPEL_NONE);
+
+  // DV should not have sub-pel.
+  assert((dv->col & 7) == 0);
+  assert((dv->row & 7) == 0);
+  dv->col = (dv->col >> 3) * 8;
+  dv->row = (dv->row >> 3) * 8;
+
+  if (!is_mv_valid(dv)) return 0;
+  return av1_is_dv_valid(*dv, cm, xd, mi_row, mi_col, bsize,
+                         cm->seq_params->mib_size_log2) != 0;
+}
+
+// Reads the use_intrabc flag for the current block. For an intrabc block this
+// also fixes the mode fields (DC_PRED / UV_DC_PRED, BILINEAR filters,
+// SIMPLE_TRANSLATION), derives a reference displacement vector and reads the
+// block's displacement vector into mbmi->mv[0]. An invalid reference or
+// decoded vector is reported as AOM_CODEC_CORRUPT_FRAME.
+static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb,
+ aom_reader *r) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
+ if (mbmi->use_intrabc) {
+ BLOCK_SIZE bsize = mbmi->bsize;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+ int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
+
+ // Build the candidate list with INTRA_FRAME as the reference.
+ av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL,
+ inter_mode_ctx);
+
+ int_mv nearestmv, nearmv;
+
+ av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
+ // Prefer the nearest candidate, then the near one; if both are zero fall
+ // back to av1_find_ref_dv().
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0)
+ av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row);
+ // Ref DV should not have sub-pel.
+ int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
+ dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
+ dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
+ // Because of the short-circuit, the DV is only read when the reference
+ // was whole-pel.
+ valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row,
+ xd->mi_col, bsize, r);
+ if (!valid_dv) {
+ // Intra bc motion vectors are not valid - signal corrupt frame
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid intrabc dv");
+ }
+ }
+}
+
+// If delta q is present, reads delta_q index.
+// Also reads delta_q loop filter levels, if present.
+//
+// The decoded deltas are scaled by delta_q_res / delta_lf_res and accumulated
+// into the running values kept in xd (current_base_qindex, delta_lf[],
+// delta_lf_from_base); the loop filter values are also copied into the
+// current block's MB_MODE_INFO. Does nothing when delta_q_present_flag is 0.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r) {
+ DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+
+ if (delta_q_info->delta_q_present_flag) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ xd->current_base_qindex +=
+ read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ);
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ if (delta_q_info->delta_lf_present_flag) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (delta_q_info->delta_lf_multi) {
+ // One delta per loop filter id, each with its own CDF; two fewer ids
+ // are coded when the frame has a single plane.
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int tmp_lvl =
+ xd->delta_lf[lf_id] +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
+ mi_col, mi_row) *
+ delta_q_info->delta_lf_res;
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ } else {
+ // A single delta applied from the base level.
+ const int tmp_lvl = xd->delta_lf_from_base +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
+ mbmi, mi_col, mi_row) *
+ delta_q_info->delta_lf_res;
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+}
+
+// Reads the mode info of one block in an intra frame into xd->mi[0]. Elements
+// are read in this order: segment id (before or after the skip flag,
+// depending on seg->segid_preskip), skip flag, CDEF strength, delta q / delta
+// lf, intrabc info (when allowed), luma mode and angle delta, chroma mode
+// with CfL alphas and angle delta, palette info, and filter intra info. For
+// an intrabc block the function returns right after the intrabc info.
+static void read_intra_frame_mode_info(AV1_COMMON *const cm,
+ DecoderCodingBlock *dcb, aom_reader *r) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ struct segmentation *const seg = &cm->seg;
+
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ // The segment id is coded either before or after the skip flag.
+ if (seg->segid_preskip)
+ mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0);
+
+ mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r);
+
+ if (!seg->segid_preskip)
+ mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm);
+
+ read_cdef(cm, r, xd);
+
+ read_delta_q_params(cm, xd, r);
+
+ mbmi->current_qindex = xd->current_base_qindex;
+
+ // Defaults for an intra block; the palette and filter intra fields may be
+ // overwritten by the reads further down.
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ if (av1_allow_intrabc(cm)) {
+ read_intrabc_info(cm, dcb, r);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ // The luma mode CDF is selected from the above and left neighbors.
+ mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
+
+ // An angle delta is only coded for directional modes, and only when
+ // av1_use_angle_delta() allows it for this block size.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (use_angle_delta && av1_is_directional_mode(mbmi->mode))
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+ : 0;
+
+ if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
+ mbmi->uv_mode =
+ read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+ if (mbmi->uv_mode == UV_CFL_PRED) {
+ mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
+ }
+ const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode);
+ mbmi->angle_delta[PLANE_TYPE_UV] =
+ (use_angle_delta && av1_is_directional_mode(intra_mode))
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED])
+ : 0;
+ } else {
+ // Avoid decoding angle_info if there is no chroma prediction
+ mbmi->uv_mode = UV_DC_PRED;
+ }
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize))
+ read_palette_mode_info(cm, xd, r);
+
+ read_filter_intra_mode_info(cm, xd, r);
+}
+
+// Decodes one component (row or column) of a motion vector difference.
+//
+// Reads the sign and the magnitude class, then the integer bits, and -- when
+// use_subpel is set -- the fractional bits and (when usehp is set) the
+// high-precision bit. Without sub-pel coding the fractional part defaults to 3
+// and the high-precision bit to 1. Returns the signed difference.
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp,
+                             int use_subpel, int usehp) {
+  const int is_negative = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
+  const int mv_class =
+      aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
+  const int is_class0 = (mv_class == MV_CLASS_0);
+
+  // Integer part: class 0 codes it as one symbol, other classes bit by bit.
+  int base = 0;
+  int int_bits = 0;
+  if (is_class0) {
+    int_bits = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
+  } else {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < num_bits; ++bit) {
+      int_bits |= aom_read_symbol(r, mvcomp->bits_cdf[bit], 2, ACCT_STR) << bit;
+    }
+    base = CLASS0_SIZE << (mv_class + 2);
+  }
+
+  // Fractional and high-precision parts, with their defaults when not coded.
+  int frac = 3;
+  int hp_bit = 1;
+  if (use_subpel) {
+    frac = aom_read_symbol(
+        r, is_class0 ? mvcomp->class0_fp_cdf[int_bits] : mvcomp->fp_cdf,
+        MV_FP_SIZE, ACCT_STR);
+    if (usehp) {
+      hp_bit = aom_read_symbol(
+          r, is_class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, ACCT_STR);
+    }
+  }
+
+  // Assemble the magnitude and apply the sign.
+  const int magnitude = base + ((int_bits << 3) | (frac << 1) | hp_bit) + 1;
+  return is_negative ? -magnitude : magnitude;
+}
+
+// Reads a motion vector difference and adds it to *ref, storing the sum in
+// *mv. The joint symbol tells which of the two components carry a non-zero
+// difference; a component that is not coded keeps a difference of zero.
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+                           nmv_context *ctx, MvSubpelPrecision precision) {
+  const int use_subpel = precision > MV_SUBPEL_NONE;
+  const int use_hp = precision > MV_SUBPEL_LOW_PRECISION;
+  MV delta = kZeroMv;
+
+  const MV_JOINT_TYPE joint =
+      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
+
+  // Row (vertical) component first, then column (horizontal).
+  if (mv_joint_vertical(joint)) {
+    delta.row = read_mv_component(r, &ctx->comps[0], use_subpel, use_hp);
+  }
+  if (mv_joint_horizontal(joint)) {
+    delta.col = read_mv_component(r, &ctx->comps[1], use_subpel, use_hp);
+  }
+
+  mv->row = ref->row + delta.row;
+  mv->col = ref->col + delta.col;
+}
+
+// Decides whether the current block uses single or compound prediction.
+// A symbol is read only when the block size allows compound references and
+// the frame-level reference mode is REFERENCE_MODE_SELECT; otherwise the
+// result is SINGLE_REFERENCE or the frame-level mode respectively.
+static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                aom_reader *r) {
+  if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE;
+
+  if (cm->current_frame.reference_mode != REFERENCE_MODE_SELECT) {
+    assert(cm->current_frame.reference_mode == SINGLE_REFERENCE);
+    return cm->current_frame.reference_mode;
+  }
+
+  // SINGLE_REFERENCE or COMPOUND_REFERENCE, coded per block.
+  const int ref_mode_ctx = av1_get_reference_mode_context(xd);
+  return (REFERENCE_MODE)aom_read_symbol(
+      r, xd->tile_ctx->comp_inter_cdf[ref_mode_ctx], 2, ACCT_STR);
+}
+
+// Reads one binary symbol using the CDF returned by
+// av1_get_pred_cdf_<pname>(xd). The expansion refers to `r` and `xd`, so both
+// names must be in scope wherever the macro is used.
+#define READ_REF_BIT(pname) \
+ aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
+
+// Reads the compound reference type of the current block:
+// UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE.
+static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd,
+                                                    aom_reader *r) {
+  const int type_ctx = av1_get_comp_reference_type_context(xd);
+  return (COMP_REFERENCE_TYPE)aom_read_symbol(
+      r, xd->tile_ctx->comp_ref_type_cdf[type_ctx], 2, ACCT_STR);
+}
+
+// Fills ref_frame[] for a skip-mode block: each entry is LAST_FRAME offset by
+// the corresponding index stored in the frame's skip_mode_info.
+static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm,
+                                         MV_REFERENCE_FRAME ref_frame[2]) {
+  ref_frame[0] = (MV_REFERENCE_FRAME)(
+      LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0);
+  ref_frame[1] = (MV_REFERENCE_FRAME)(
+      LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1);
+}
+
+// Read the reference frame(s) of the current inter block into ref_frame[].
+//
+// No symbols are read when the block is in skip mode or when a segment
+// feature (REF_FRAME, SKIP or GLOBALMV) fixes the reference. Otherwise the
+// single/compound decision is read first, followed by the binary decisions
+// that select the individual reference frames.
+static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                            aom_reader *r, int segment_id,
+                            MV_REFERENCE_FRAME ref_frame[2]) {
+  if (xd->mi[0]->skip_mode) {
+    set_ref_frames_for_skip_mode(cm, ref_frame);
+    return;
+  }
+
+  // Segment-level overrides.
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
+    ref_frame[1] = NONE_FRAME;
+    return;
+  }
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    ref_frame[0] = LAST_FRAME;
+    ref_frame[1] = NONE_FRAME;
+    return;
+  }
+
+  const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+
+  if (mode == SINGLE_REFERENCE) {
+    const int is_backward = READ_REF_BIT(single_ref_p1);
+    if (is_backward) {
+      const int is_altref = READ_REF_BIT(single_ref_p2);
+      if (is_altref) {
+        ref_frame[0] = ALTREF_FRAME;
+      } else {
+        const int is_altref2 = READ_REF_BIT(single_ref_p6);
+        ref_frame[0] = is_altref2 ? ALTREF2_FRAME : BWDREF_FRAME;
+      }
+    } else {
+      const int is_last3_or_golden = READ_REF_BIT(single_ref_p3);
+      if (is_last3_or_golden) {
+        const int is_golden = READ_REF_BIT(single_ref_p5);
+        ref_frame[0] = is_golden ? GOLDEN_FRAME : LAST3_FRAME;
+      } else {
+        const int is_last2 = READ_REF_BIT(single_ref_p4);
+        ref_frame[0] = is_last2 ? LAST2_FRAME : LAST_FRAME;
+      }
+    }
+
+    ref_frame[1] = NONE_FRAME;
+    return;
+  }
+
+  if (mode != COMPOUND_REFERENCE) {
+    assert(0 && "Invalid prediction mode.");
+    return;
+  }
+
+  const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
+
+  if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+    // Both references point in the same direction.
+    const int is_bwd_pair = READ_REF_BIT(uni_comp_ref_p);
+    if (is_bwd_pair) {
+      ref_frame[0] = BWDREF_FRAME;
+      ref_frame[1] = ALTREF_FRAME;
+      return;
+    }
+
+    const int not_last2 = READ_REF_BIT(uni_comp_ref_p1);
+    if (!not_last2) {
+      ref_frame[0] = LAST_FRAME;
+      ref_frame[1] = LAST2_FRAME;
+      return;
+    }
+
+    const int is_golden = READ_REF_BIT(uni_comp_ref_p2);
+    ref_frame[0] = LAST_FRAME;
+    ref_frame[1] = is_golden ? GOLDEN_FRAME : LAST3_FRAME;
+    return;
+  }
+
+  assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+  // Decode forward references.
+  const int fwd_is_late = READ_REF_BIT(comp_ref_p);
+  if (fwd_is_late) {
+    const int fwd_is_golden = READ_REF_BIT(comp_ref_p2);
+    ref_frame[0] = fwd_is_golden ? GOLDEN_FRAME : LAST3_FRAME;
+  } else {
+    const int fwd_is_last2 = READ_REF_BIT(comp_ref_p1);
+    ref_frame[0] = fwd_is_last2 ? LAST2_FRAME : LAST_FRAME;
+  }
+
+  // Decode backward references.
+  const int bwd_is_altref = READ_REF_BIT(comp_bwdref_p);
+  if (bwd_is_altref) {
+    ref_frame[1] = ALTREF_FRAME;
+  } else {
+    const int bwd_is_altref2 = READ_REF_BIT(comp_bwdref_p1);
+    ref_frame[1] = bwd_is_altref2 ? ALTREF2_FRAME : BWDREF_FRAME;
+  }
+}
+
+// Sets mbmi->interp_filters for the current block.
+//
+// Nothing is read when interpolation is not needed (defaults are installed)
+// or when the frame-level filter is not SWITCHABLE (that filter is used for
+// both directions). Otherwise one filter symbol is read, and a second one
+// only when enable_dual_filter is set; without dual filters the single value
+// is used for both directions.
+static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd,
+                                         InterpFilter interp_filter,
+                                         bool enable_dual_filter,
+                                         MB_MODE_INFO *const mbmi,
+                                         aom_reader *r) {
+  if (!av1_is_interp_needed(xd)) {
+    set_default_interp_filters(mbmi, interp_filter);
+    return;
+  }
+
+  if (interp_filter != SWITCHABLE) {
+    mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter);
+    return;
+  }
+
+  FRAME_CONTEXT *const tile_ctx = xd->tile_ctx;
+
+  // Direction 0 is the vertical (y) filter and is always coded.
+  const int ctx_y = av1_get_pred_context_switchable_interp(xd, 0);
+  const InterpFilter filter_y = (InterpFilter)aom_read_symbol(
+      r, tile_ctx->switchable_interp_cdf[ctx_y], SWITCHABLE_FILTERS, ACCT_STR);
+
+  // Direction 1 is the horizontal (x) filter; it mirrors the vertical one
+  // unless dual filters are enabled.
+  InterpFilter filter_x = filter_y;
+  if (enable_dual_filter) {
+    const int ctx_x = av1_get_pred_context_switchable_interp(xd, 1);
+    filter_x = (InterpFilter)aom_read_symbol(
+        r, tile_ctx->switchable_interp_cdf[ctx_x], SWITCHABLE_FILTERS,
+        ACCT_STR);
+  }
+
+  mbmi->interp_filters.as_filters.x_filter = filter_x;
+  mbmi->interp_filters.as_filters.y_filter = filter_y;
+}
+
+// Parses the intra mode info of an intra-coded block inside an inter frame:
+// luma mode and angle delta, chroma mode (with CfL alphas and angle delta),
+// palette info and filter-intra info. The luma mode CDF is selected by the
+// block-size group rather than by the neighbouring blocks.
+static void read_intra_block_mode_info(AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
+                                       MB_MODE_INFO *const mbmi,
+                                       aom_reader *r) {
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int angle_delta_allowed = av1_use_angle_delta(bsize);
+  FRAME_CONTEXT *const tile_ctx = xd->tile_ctx;
+
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+
+  mbmi->mode =
+      read_intra_mode(r, tile_ctx->y_mode_cdf[size_group_lookup[bsize]]);
+
+  if (angle_delta_allowed && av1_is_directional_mode(mbmi->mode)) {
+    mbmi->angle_delta[PLANE_TYPE_Y] =
+        read_angle_delta(r, tile_ctx->angle_delta_cdf[mbmi->mode - V_PRED]);
+  } else {
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  }
+
+  if (cm->seq_params->monochrome || !xd->is_chroma_ref) {
+    // Avoid decoding angle_info if there is no chroma prediction
+    mbmi->uv_mode = UV_DC_PRED;
+  } else {
+    mbmi->uv_mode =
+        read_intra_mode_uv(tile_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+    if (mbmi->uv_mode == UV_CFL_PRED) {
+      mbmi->cfl_alpha_idx =
+          read_cfl_alphas(tile_ctx, r, &mbmi->cfl_alpha_signs);
+    }
+    const PREDICTION_MODE chroma_mode = get_uv_mode(mbmi->uv_mode);
+    if (angle_delta_allowed && av1_is_directional_mode(chroma_mode)) {
+      mbmi->angle_delta[PLANE_TYPE_UV] =
+          read_angle_delta(r, tile_ctx->angle_delta_cdf[chroma_mode - V_PRED]);
+    } else {
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+    }
+  }
+  xd->cfl.store_y = store_cfl_required(cm, xd);
+
+  // Palette sizes are cleared before the optional palette parse.
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    read_palette_mode_info(cm, xd, r);
+  }
+
+  read_filter_intra_mode_info(cm, xd, r);
+}
+
+// Returns 1 when both components of *mv lie strictly between MV_LOW and
+// MV_UPP, and 0 otherwise.
+static INLINE int is_mv_valid(const MV *mv) {
+  const int row_in_range = mv->row > MV_LOW && mv->row < MV_UPP;
+  const int col_in_range = mv->col > MV_LOW && mv->col < MV_UPP;
+  return row_in_range && col_in_range;
+}
+
+// Fills mv[] for the given inter prediction mode.
+//
+// NEW* components are read from the bitstream relative to ref_mv[]; NEAREST*
+// and NEAR* components are copied from nearest_mv[] / near_mv[]; GLOBAL*
+// components come from the global motion model of the matching reference
+// frame. When the frame forces integer MVs, new MVs are read with
+// MV_SUBPEL_NONE precision instead of allow_hp.
+//
+// Returns 0 for an unhandled mode or when a resulting MV fails is_mv_valid()
+// (the second MV is only checked for compound blocks), and 1 otherwise.
+static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
+                            PREDICTION_MODE mode,
+                            MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2],
+                            int_mv ref_mv[2], int_mv nearest_mv[2],
+                            int_mv near_mv[2], int is_compound, int allow_hp,
+                            aom_reader *r) {
+  FeatureFlags *const features = &cm->features;
+  nmv_context *const mv_ctx = &xd->tile_ctx->nmvc;
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+  const int precision =
+      features->cur_frame_force_integer_mv ? MV_SUBPEL_NONE : allow_hp;
+
+  switch (mode) {
+    // Single-reference modes.
+    case NEWMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, mv_ctx, precision);
+      break;
+    case NEARESTMV: mv[0].as_int = nearest_mv[0].as_int; break;
+    case NEARMV: mv[0].as_int = near_mv[0].as_int; break;
+    case GLOBALMV:
+      mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+                                          features->allow_high_precision_mv,
+                                          bsize, xd->mi_col, xd->mi_row,
+                                          features->cur_frame_force_integer_mv)
+                         .as_int;
+      break;
+
+    // Compound modes.
+    case NEW_NEWMV:
+      assert(is_compound);
+      for (int ref = 0; ref < 2; ++ref) {
+        read_mv(r, &mv[ref].as_mv, &ref_mv[ref].as_mv, mv_ctx, precision);
+      }
+      break;
+    case NEAREST_NEARESTMV:
+      assert(is_compound);
+      for (int ref = 0; ref < 2; ++ref) {
+        mv[ref].as_int = nearest_mv[ref].as_int;
+      }
+      break;
+    case NEAR_NEARMV:
+      assert(is_compound);
+      for (int ref = 0; ref < 2; ++ref) {
+        mv[ref].as_int = near_mv[ref].as_int;
+      }
+      break;
+    case NEW_NEARESTMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, mv_ctx, precision);
+      assert(is_compound);
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    case NEAREST_NEWMV:
+      mv[0].as_int = nearest_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, mv_ctx, precision);
+      assert(is_compound);
+      break;
+    case NEAR_NEWMV:
+      mv[0].as_int = near_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, mv_ctx, precision);
+      assert(is_compound);
+      break;
+    case NEW_NEARMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, mv_ctx, precision);
+      assert(is_compound);
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    case GLOBAL_GLOBALMV:
+      assert(is_compound);
+      for (int ref = 0; ref < 2; ++ref) {
+        mv[ref].as_int =
+            gm_get_motion_vector(&cm->global_motion[ref_frame[ref]],
+                                 features->allow_high_precision_mv, bsize,
+                                 xd->mi_col, xd->mi_row,
+                                 features->cur_frame_force_integer_mv)
+                .as_int;
+      }
+      break;
+
+    default: return 0;
+  }
+
+  // Validate the first MV always, the second only for compound prediction.
+  if (!is_mv_valid(&mv[0].as_mv)) return 0;
+  return is_compound ? is_mv_valid(&mv[1].as_mv) : 1;
+}
+
+// Returns non-zero when the current block is inter coded.
+//
+// The answer is derived from the segment features when SEG_LVL_REF_FRAME or
+// SEG_LVL_GLOBALMV is active for the segment; only otherwise is the
+// intra/inter symbol read from the bitstream.
+static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                               int segment_id, aom_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    const int seg_ref = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+    return (seg_ref < LAST_FRAME) ? 0 : (seg_ref != INTRA_FRAME);
+  }
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) return 1;
+
+  const int intra_inter_ctx = av1_get_intra_inter_context(xd);
+  FRAME_CONTEXT *const tile_ctx = xd->tile_ctx;
+  return aom_read_symbol(r, tile_ctx->intra_inter_cdf[intra_inter_ctx], 2,
+                         ACCT_STR);
+}
+
+#if DEC_MISMATCH_DEBUG
+// Debug helper, compiled only when DEC_MISMATCH_DEBUG is set: prints the
+// decoded mode info of one block when the frame number equals FRAME_TO_CHECK
+// and the frame is shown.
+//
+// mode_ctx is split into its newmv / zeromv / refmv parts for printing; the
+// zeromv and refmv parts are reported as -1 when they do not apply to the
+// block's mode (NEWMV, and NEWMV or GLOBALMV, respectively).
+static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row,
+                          int mi_col, int16_t mode_ctx) {
+  int_mv mv[2] = { { 0 } };
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
+    mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+  int16_t zeromv_ctx = -1;
+  int16_t refmv_ctx = -1;
+  if (mbmi->mode != NEWMV) {
+    zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    if (mbmi->mode != GLOBALMV)
+      refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  }
+
+#define FRAME_TO_CHECK 11
+  if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) {
+    // The block size is read from mbmi->bsize, the field used everywhere else
+    // in this file.
+    printf(
+        "=== DECODER ===: "
+        "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+        "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+        "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+        "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+        cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode,
+        mbmi->mode, mbmi->bsize, cm->show_frame, mv[0].as_mv.row,
+        mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+        mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx,
+        refmv_ctx, mbmi->tx_size);
+  }
+}
+#endif  // DEC_MISMATCH_DEBUG
+
+// Parses the mode info of an inter-coded block.
+//
+// In order: reference frames, MV candidate list construction, prediction mode
+// and DRL index, motion vectors, inter-intra info, motion mode, compound type
+// info and interpolation filters. For WARPED_CAUSAL blocks the local warp
+// model is then derived from the collected samples. A prediction mode that
+// disagrees with the single/compound reference choice raises
+// AOM_CODEC_CORRUPT_FRAME; an invalid motion vector is merged into
+// dcb->corrupted.
+static void read_inter_block_mode_info(AV1Decoder *const pbi,
+                                       DecoderCodingBlock *dcb,
+                                       MB_MODE_INFO *const mbmi,
+                                       aom_reader *r) {
+  AV1_COMMON *const cm = &pbi->common;
+  FeatureFlags *const features = &cm->features;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int allow_hp = features->allow_high_precision_mv;
+  int_mv nearestmv[2], nearmv[2];
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
+  int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  MACROBLOCKD *const xd = &dcb->xd;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if DEC_MISMATCH_DEBUG
+  // Debug-only copy of the inter mode context for dec_dump_logs(); stays -1
+  // on the paths that never derive one (skip mode, segment-forced GLOBALMV).
+  int16_t dbg_mode_ctx = -1;
+#endif  // DEC_MISMATCH_DEBUG
+
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+  const int is_compound = has_second_ref(mbmi);
+
+  const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack,
+                   xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx);
+
+  mbmi->ref_mv_idx = 0;
+
+  // Prediction mode: implied by skip mode or segment features, else coded.
+  if (mbmi->skip_mode) {
+    assert(is_compound);
+    mbmi->mode = NEAREST_NEARESTMV;
+  } else {
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
+      mbmi->mode = GLOBALMV;
+    } else {
+      const int mode_ctx =
+          av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
+#if DEC_MISMATCH_DEBUG
+      dbg_mode_ctx = (int16_t)mode_ctx;
+#endif  // DEC_MISMATCH_DEBUG
+      if (is_compound)
+        mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
+      else
+        mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
+      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
+          have_nearmv_in_inter_mode(mbmi->mode))
+        read_drl_idx(ec_ctx, dcb, mbmi, r);
+    }
+  }
+
+  if (is_compound != is_inter_compound_mode(mbmi->mode)) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Prediction mode %d invalid with ref frame %d %d",
+                       mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  }
+
+  if (!is_compound && mbmi->mode != GLOBALMV) {
+    av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
+                          &nearmv[0], features->cur_frame_force_integer_mv);
+  }
+
+  if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
+    // Compound candidates come straight from the reference MV stack.
+    const int ref_mv_idx = mbmi->ref_mv_idx + 1;
+    nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
+    nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
+    nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+    nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+    lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
+                       features->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
+                       features->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[0].as_mv, allow_hp,
+                       features->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[1].as_mv, allow_hp,
+                       features->cur_frame_force_integer_mv);
+  } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
+    nearmv[0] =
+        xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
+  }
+
+  int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] };
+
+  if (is_compound) {
+    int ref_mv_idx = mbmi->ref_mv_idx;
+    // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+    // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+    // mbmi->ref_mv_idx (like NEWMV)
+    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+      ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+    // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
+    if (compound_ref0_mode(mbmi->mode) == NEWMV)
+      ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+
+    if (compound_ref1_mode(mbmi->mode) == NEWMV)
+      ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+  } else {
+    if (mbmi->mode == NEWMV) {
+      if (dcb->ref_mv_count[ref_frame] > 1)
+        ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
+    }
+  }
+
+  if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV);
+
+  // assign_mv() returns 0 for an invalid MV or an unhandled mode.
+  const int mv_corrupted_flag =
+      !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
+                 nearestmv, nearmv, is_compound, allow_hp, r);
+  aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
+
+  // Inter-intra info.
+  mbmi->use_wedge_interintra = 0;
+  if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode &&
+      is_interintra_allowed(mbmi)) {
+    const int bsize_group = size_group_lookup[bsize];
+    const int interintra =
+        aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
+    assert(mbmi->ref_frame[1] == NONE_FRAME);
+    if (interintra) {
+      const INTERINTRA_MODE interintra_mode =
+          read_interintra_mode(xd, r, bsize_group);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      mbmi->interintra_mode = interintra_mode;
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      if (av1_is_wedge_used(bsize)) {
+        mbmi->use_wedge_interintra = aom_read_symbol(
+            r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
+        if (mbmi->use_wedge_interintra) {
+          mbmi->interintra_wedge_index = (int8_t)aom_read_symbol(
+              r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
+        }
+      }
+    }
+  }
+
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame);
+  }
+
+  // Motion mode.
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode &&
+      !has_second_ref(mbmi)) {
+    mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
+  }
+  av1_count_overlappable_neighbors(cm, xd);
+
+  if (mbmi->ref_frame[1] != INTRA_FRAME)
+    mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
+
+  // init
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+
+  if (has_second_ref(mbmi) && !mbmi->skip_mode) {
+    // Read idx to indicate current compound inter prediction mode group
+    const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                     cm->seq_params->enable_masked_compound;
+
+    if (masked_compound_used) {
+      const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+      mbmi->comp_group_idx = (uint8_t)aom_read_symbol(
+          r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
+    }
+
+    if (mbmi->comp_group_idx == 0) {
+      if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
+        const int comp_index_ctx = get_comp_index_context(cm, xd);
+        mbmi->compound_idx = (uint8_t)aom_read_symbol(
+            r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+        mbmi->interinter_comp.type =
+            mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
+      } else {
+        // Distance-weighted compound is disabled, so always use average
+        mbmi->compound_idx = 1;
+        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+      }
+    } else {
+      assert(cm->current_frame.reference_mode != SINGLE_REFERENCE &&
+             is_inter_compound_mode(mbmi->mode) &&
+             mbmi->motion_mode == SIMPLE_TRANSLATION);
+      assert(masked_compound_used);
+
+      // compound_diffwtd, wedge
+      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+        mbmi->interinter_comp.type =
+            COMPOUND_WEDGE + aom_read_symbol(r,
+                                             ec_ctx->compound_type_cdf[bsize],
+                                             MASKED_COMPOUND_TYPES, ACCT_STR);
+      } else {
+        mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
+      }
+
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+        mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol(
+            r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR);
+        mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR);
+      } else {
+        assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+        mbmi->interinter_comp.mask_type =
+            aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
+      }
+    }
+  }
+
+  read_mb_interp_filter(xd, features->interp_filter,
+                        cm->seq_params->enable_dual_filter, mbmi, r);
+
+  if (mbmi->motion_mode == WARPED_CAUSAL) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+    mbmi->wm_params.invalid = 0;
+
+    if (mbmi->num_proj_ref > 1) {
+      mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                             mbmi->num_proj_ref, bsize);
+    }
+
+    // A non-zero return marks the derived warp model as invalid.
+    if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                            mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+                            &mbmi->wm_params, mi_row, mi_col)) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected warped model from aomenc\n");
+#endif
+      mbmi->wm_params.invalid = 1;
+    }
+  }
+
+  xd->cfl.store_y = store_cfl_required(cm, xd);
+
+#if DEC_MISMATCH_DEBUG
+  // Uses only names that are in scope here: the block's mode info, its
+  // position from xd, and the debug copy of the mode context.
+  dec_dump_logs(cm, mbmi, xd->mi_row, xd->mi_col, dbg_mode_ctx);
+#endif  // DEC_MISMATCH_DEBUG
+}
+
+// Parses the mode info of one block in an inter frame.
+//
+// Reads the segment id, skip mode, skip flag, CDEF and delta-q/lf side info
+// and the intra/inter decision, then hands over to the inter or intra block
+// parser. Skip-mode blocks are always treated as inter blocks with
+// skip_txfm = 1, without reading either symbol.
+static void read_inter_frame_mode_info(AV1Decoder *const pbi,
+                                       DecoderCodingBlock *dcb, aom_reader *r) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  mbmi->mv[0].as_int = 0;
+  mbmi->mv[1].as_int = 0;
+
+  // First segment id pass (pre-skip).
+  mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r);
+
+  mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
+  mbmi->skip_txfm =
+      mbmi->skip_mode ? 1 : read_skip_txfm(cm, xd, mbmi->segment_id, r);
+
+  // Second segment id pass, only when the id is not coded before the skip
+  // flag.
+  if (!cm->seg.segid_preskip) {
+    mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r);
+  }
+
+  read_cdef(cm, r, xd);
+  read_delta_q_params(cm, xd, r);
+
+  const int is_inter =
+      mbmi->skip_mode ? 1 : read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+  mbmi->current_qindex = xd->current_base_qindex;
+
+  // Point the transform-size contexts at this block's position.
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  if (is_inter) {
+    read_inter_block_mode_info(pbi, dcb, mbmi, r);
+  } else {
+    read_intra_block_mode_info(cm, xd, mbmi, r);
+  }
+}
+
+// Marks the current frame's stored motion-vector entries covered by an intra
+// block as having no reference (NONE_FRAME). The MV buffer is indexed at half
+// the mode-info resolution, so positions are shifted right by one and the
+// extents are halved with rounding.
+static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                 int x_mis, int y_mis) {
+  const int stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1);
+  const int num_cols = ROUND_POWER_OF_TWO(x_mis, 1);
+  const int num_rows = ROUND_POWER_OF_TWO(y_mis, 1);
+  MV_REF *row_start =
+      cm->cur_frame->mvs + (mi_row >> 1) * stride + (mi_col >> 1);
+
+  for (int row = 0; row < num_rows; ++row, row_start += stride) {
+    for (int col = 0; col < num_cols; ++col) {
+      row_start[col].ref_frame = NONE_FRAME;
+    }
+  }
+}
+
+// Entry point for parsing the mode info of the current block.
+//
+// Dispatches to the intra-frame or inter-frame parser and, when the sequence
+// enables reference-frame MVs, updates the current frame's MV buffer for the
+// area covered by the block (x_mis by y_mis mode-info units).
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &dcb->xd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int intra_only = frame_is_intra_only(cm);
+
+  mi->use_intrabc = 0;
+
+  if (intra_only) {
+    read_intra_frame_mode_info(cm, dcb, r);
+  } else {
+    read_inter_frame_mode_info(pbi, dcb, r);
+  }
+
+  if (!cm->seq_params->order_hint_info.enable_ref_frame_mvs) return;
+
+  if (intra_only) {
+    intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
+  } else {
+    av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
+  }
+}
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
new file mode 100644
index 0000000000..3d8629c9a5
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
+
+#include "aom_dsp/bitreader.h"
+
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb,
+                        aom_reader *r, int x_mis, int y_mis);
+// Declared inside the extern "C" block so C++ includers get C linkage too.
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+                      int blk_col, TX_SIZE tx_size, aom_reader *r);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
new file mode 100644
index 0000000000..32e94840be
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/detokenize.h"
+#include "av1/decoder/obu.h"
+
+static void initialize_dec(void) {  // Setup run by every av1_decoder_create().
+ av1_rtcd();  // The three *_rtcd() calls are declared in config/*_rtcd.h.
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();  // NOTE(review): call order kept as-is; these
+ av1_init_wedge_masks();  // two may depend on the rtcd setup — confirm.
+}
+
+static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+                          int height, BLOCK_SIZE min_partition_size) {
+  // The decoder implementation of this callback ignores min_partition_size.
+  (void)min_partition_size;
+
+  // Round the frame size up to a multiple of 8 luma pixels (with chroma
+  // subsampling this may only be a multiple of 4 chroma pixels). Tools such
+  // as cdef operate on units of 8x8 luma pixels, so this keeps them simple.
+  const int width_8 = ALIGN_POWER_OF_TWO(width, 3);
+  const int height_8 = ALIGN_POWER_OF_TWO(height, 3);
+
+  // Frame size in mode-info units; mode info is allocated per 4x4 block.
+  mi_params->mi_rows = height_8 >> MI_SIZE_LOG2;
+  mi_params->mi_cols = width_8 >> MI_SIZE_LOG2;
+  mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+  mi_params->mi_alloc_stride = mi_params->mi_stride;
+  mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+  mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
+  mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+  mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+  assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+         mi_size_high[mi_params->mi_alloc_bsize]);
+}
+
+static void dec_setup_mi(CommonModeInfoParams *mi_params) {
+  // Zero-fill mi_grid_base. The entry count is mi_stride multiplied by
+  // calc_mi_size(mi_rows).
+  const int n = mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+  memset(mi_params->mi_grid_base, 0, n * sizeof(*mi_params->mi_grid_base));
+}
+
+static void dec_free_mi(CommonModeInfoParams *mi_params) {
+  aom_free(mi_params->tx_type_map);
+  aom_free(mi_params->mi_grid_base);
+  aom_free(mi_params->mi_alloc);
+  mi_params->tx_type_map = NULL;   // Clear the dangling pointers and the
+  mi_params->mi_grid_base = NULL;  // recorded sizes of the freed buffers.
+  mi_params->mi_alloc = NULL;
+  mi_params->mi_grid_size = 0;
+  mi_params->mi_alloc_size = 0;
+}
+
+AV1Decoder *av1_decoder_create(BufferPool *const pool) {  // NULL on failure.
+ AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
+ if (!pbi) return NULL;
+ av1_zero(*pbi);
+ // 'cm' points at the sequence header and error info that live in 'pbi'.
+ AV1_COMMON *volatile const cm = &pbi->common;
+ cm->seq_params = &pbi->seq_params;
+ cm->error = &pbi->error;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(pbi->error.jmp)) {
+ pbi->error.setjmp = 0;
+ av1_decoder_remove(pbi);  // Frees the partially constructed decoder.
+ return NULL;
+ }
+
+ pbi->error.setjmp = 1;
+ // NOTE(review): CHECK_MEM_ERROR presumably reports failure via cm->error.
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ pbi->need_resync = 1;
+ initialize_dec();
+
+ // Initialize the references to not point to any frame buffers.
+ for (int i = 0; i < REF_FRAMES; i++) {
+ cm->ref_frame_map[i] = NULL;
+ }
+
+ cm->current_frame.frame_number = 0;
+ pbi->decoding_first_frame = 1;
+ pbi->common.buffer_pool = pool;  // Same field as cm->buffer_pool.
+
+ cm->seq_params->bit_depth = AOM_BITS_8;
+ // Install the decoder-side mode-info callbacks defined earlier in this file.
+ cm->mi_params.free_mi = dec_free_mi;
+ cm->mi_params.setup_mi = dec_setup_mi;
+ cm->mi_params.set_mb_mi = dec_set_mb_mi;
+
+ av1_loop_filter_init(cm);
+
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+ av1_loop_restoration_precal();
+
+#if CONFIG_ACCOUNTING
+ pbi->acct_enabled = 1;
+ aom_accounting_init(&pbi->accounting);
+#endif
+
+ pbi->error.setjmp = 0;
+ // The loop-filter worker is set up after the 'setjmp' flag has been cleared.
+ aom_get_worker_interface()->init(&pbi->lf_worker);
+ pbi->lf_worker.thread_name = "aom lf worker";
+
+ return pbi;
+}
+
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
+  // A NULL argument is accepted and ignored.
+  if (tile_mt_info == NULL) return;
+#if CONFIG_MULTITHREAD
+  if (tile_mt_info->job_mutex != NULL) {
+    pthread_mutex_destroy(tile_mt_info->job_mutex);
+    aom_free(tile_mt_info->job_mutex);
+  }
+#endif
+  aom_free(tile_mt_info->job_queue);
+  // Zero the whole structure: the caller may be performing a resize, in
+  // which case an _alloc() call that may fail follows this one.
+  av1_zero(*tile_mt_info);
+}
+
+void av1_dec_free_cb_buf(AV1Decoder *pbi) {
+  aom_free(pbi->cb_buffer_base);  // Release the coding block buffer array.
+  pbi->cb_buffer_alloc_size = 0;
+  pbi->cb_buffer_base = NULL;
+}
+
+void av1_decoder_remove(AV1Decoder *pbi) {
+ int i;
+ // Releases everything reachable from 'pbi', then 'pbi' itself. NULL is OK.
+ if (!pbi) return;
+
+ // Free the tile list output buffer.
+ aom_free_frame_buffer(&pbi->tile_list_outbuf);
+ // End the loop-filter worker before freeing its data1 payload.
+ aom_get_worker_interface()->end(&pbi->lf_worker);
+ aom_free(pbi->lf_worker.data1);
+ // NOTE(review): index 0 is skipped below; presumably it uses pbi->td.
+ if (pbi->thread_data) {
+ for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) {
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ if (thread_data->td != NULL) {
+ av1_free_mc_tmp_buf(thread_data->td);
+ aom_free(thread_data->td);
+ }
+ }
+ aom_free(pbi->thread_data);
+ }
+ aom_free(pbi->dcb.xd.seg_mask);
+ // End every tile worker before the tile_workers array is freed below.
+ for (i = 0; i < pbi->num_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ aom_get_worker_interface()->end(worker);
+ }
+#if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pbi->row_mt_mutex_);
+ aom_free(pbi->row_mt_mutex_);
+ }
+ if (pbi->row_mt_cond_ != NULL) {
+ pthread_cond_destroy(pbi->row_mt_cond_);
+ aom_free(pbi->row_mt_cond_);
+ }
+#endif
+ for (i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ aom_free(pbi->tile_data);
+ aom_free(pbi->tile_workers);
+ // Multi-thread sync state is only released when num_workers > 0.
+ if (pbi->num_workers > 0) {
+ av1_loop_filter_dealloc(&pbi->lf_row_sync);
+ av1_loop_restoration_dealloc(&pbi->lr_row_sync);
+ av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+ }
+
+ av1_dec_free_cb_buf(pbi);
+#if CONFIG_ACCOUNTING
+ aom_accounting_clear(&pbi->accounting);
+#endif
+ av1_free_mc_tmp_buf(&pbi->td);
+ aom_img_metadata_array_free(pbi->metadata);
+ av1_remove_common(&pbi->common);
+ aom_free(pbi);
+}
+
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+                       aom_reader *r, palette_visitor_fn_t visit) {
+  // Inter blocks are skipped; at most the first two planes are examined.
+  if (is_inter_block(xd->mi[0])) return;
+  const int plane_limit = AOMMIN(2, av1_num_planes(&pbi->common));
+  for (int plane = 0; plane < plane_limit; ++plane) {
+    const int palette_size = xd->mi[0]->palette_mode_info.palette_size[plane];
+    if (plane == 0 || xd->is_chroma_ref) {
+      if (palette_size) visit(xd, plane, r);
+    } else {
+      assert(palette_size == 0);
+    }
+  }
+}
+
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+                            const YV12_BUFFER_CONFIG *b) {
+  const int luma_ok = a->y_width == b->y_width && a->y_height == b->y_height;
+  return luma_ok && a->uv_width == b->uv_width && a->uv_height == b->uv_height;
+}
+
+aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
+                                       YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const common = &pbi->common;
+  const YV12_BUFFER_CONFIG *const ref = get_ref_frame(common, idx);
+
+  if (ref == NULL) {
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame");
+    return AOM_CODEC_ERROR;
+  }
+  if (equal_dimensions(ref, sd)) {
+    aom_yv12_copy_frame(ref, sd, av1_num_planes(common));
+  } else {
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+  // NOTE(review): a successful copy leaves pbi->error.error_code untouched.
+  return pbi->error.error_code;
+}
+
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                       const YV12_BUFFER_CONFIG *b) {
+  // Plane sizes, strides, border and the high-bitdepth flag must all agree.
+  const int a_highbd = (a->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int b_highbd = (b->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  return equal_dimensions(a, b) && a->y_stride == b->y_stride &&
+         a->uv_stride == b->uv_stride && a->border == b->border &&
+         a_highbd == b_highbd;
+}
+
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+                                      int use_external_ref,
+                                      YV12_BUFFER_CONFIG *sd) {
+  // Replaces reference 'idx' with the frame described by 'sd', either by
+  // copying its pixels (use_external_ref == 0) or by borrowing its plane
+  // pointers (use_external_ref != 0).
+  // Destination: the reference frame currently stored at 'idx'.
+  YV12_BUFFER_CONFIG *const dst = get_ref_frame(cm, idx);
+  if (dst == NULL) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame");
+    return AOM_CODEC_ERROR;
+  }
+
+  // Copy mode only needs matching plane sizes; pointer mode also needs
+  // matching strides, border and high-bitdepth flag.
+  const int compatible = use_external_ref
+                             ? equal_dimensions_and_border(dst, sd)
+                             : equal_dimensions(dst, sd);
+  if (!compatible) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+    return cm->error->error_code;
+  }
+
+  if (use_external_ref) {
+    // Overwrite the reference frame buffer pointers. The current pointers
+    // are saved in store_buf_adr so that they can be restored once the
+    // external reference buffer is no longer needed.
+    dst->store_buf_adr[0] = dst->y_buffer;
+    dst->store_buf_adr[1] = dst->u_buffer;
+    dst->store_buf_adr[2] = dst->v_buffer;
+    dst->y_buffer = sd->y_buffer;
+    dst->u_buffer = sd->u_buffer;
+    dst->v_buffer = sd->v_buffer;
+    dst->use_external_reference_buffers = 1;
+  } else {
+    // Overwrite the contents of the reference frame buffer.
+    aom_yv12_copy_frame(sd, dst, av1_num_planes(cm));
+  }
+
+  return cm->error->error_code;
+}
+
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  // Copies 'new_frame' into 'sd'. Both must agree in plane sizes, strides,
+  // border and high-bitdepth flag, otherwise an error is reported instead.
+  if (equal_dimensions_and_border(new_frame, sd)) {
+    aom_yv12_copy_frame(new_frame, sd, av1_num_planes(cm));
+  } else {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+  return cm->error->error_code;
+}
+
+static void release_current_frame(AV1Decoder *pbi) {
+  AV1_COMMON *const common = &pbi->common;
+  BufferPool *const buffer_pool = common->buffer_pool;
+  // Flag the frame as corrupted, then drop one reference to it under the lock.
+  common->cur_frame->buf.corrupted = 1;
+  lock_buffer_pool(buffer_pool);
+  decrease_ref_count(common->cur_frame, buffer_pool);
+  unlock_buffer_pool(buffer_pool);
+  common->cur_frame = NULL;
+}
+
+// If any buffer updating is signaled it should be done here.
+// Consumes a reference to cm->cur_frame.
+//
+// This function returns void. It reports failure by setting
+// pbi->error.error_code.
+static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
+ int ref_index = 0, mask;
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+
+ if (frame_decoded) {
+ lock_buffer_pool(pool);
+
+ // In ext-tile decoding, the camera frame header is only decoded once. So,
+ // we don't update the references here.
+ if (!pbi->camera_frame_header_ready) {
+ // The following for loop needs to release the reference stored in
+ // cm->ref_frame_map[ref_index] before storing a reference to
+ // cm->cur_frame in cm->ref_frame_map[ref_index].
+ for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ decrease_ref_count(cm->ref_frame_map[ref_index], pool);
+ cm->ref_frame_map[ref_index] = cm->cur_frame;
+ ++cm->cur_frame->ref_count;
+ }
+ ++ref_index;
+ }
+ }
+
+ if (cm->show_existing_frame || cm->show_frame) {
+ if (pbi->output_all_layers) {
+ // Append this frame to the output queue
+ if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
+ // We can't store the new frame anywhere, so drop it and return an
+ // error
+ cm->cur_frame->buf.corrupted = 1;
+ decrease_ref_count(cm->cur_frame, pool);
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ } else {
+ pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
+ pbi->num_output_frames++;
+ }
+ } else {
+ // Replace any existing output frame
+ assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
+ if (pbi->num_output_frames > 0) {
+ decrease_ref_count(pbi->output_frames[0], pool);
+ }
+ pbi->output_frames[0] = cm->cur_frame;
+ pbi->num_output_frames = 1;
+ }
+ } else {
+ decrease_ref_count(cm->cur_frame, pool);  // Frame is not shown: drop it.
+ }
+
+ unlock_buffer_pool(pool);
+ } else {
+ // Nothing was decoded, so just drop this frame buffer
+ lock_buffer_pool(pool);
+ decrease_ref_count(cm->cur_frame, pool);
+ unlock_buffer_pool(pool);
+ }
+ cm->cur_frame = NULL;
+
+ if (!pbi->camera_frame_header_ready) {
+ // Invalidate these references until the next frame starts.
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+ cm->remapped_ref_idx[ref_index] = INVALID_IDX;
+ }
+ }
+}
+
+int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
+ const uint8_t **psource) {
+ AV1_COMMON *volatile const cm = &pbi->common;
+ const uint8_t *source = *psource;
+ pbi->error.error_code = AOM_CODEC_OK;
+ pbi->error.has_detail = 0;
+ // Decodes 'size' bytes at *psource. Returns 0 on success, nonzero otherwise.
+ if (size == 0) {
+ // This is used to signal that we are missing frames.
+ // We do not know if the missing frame(s) was supposed to update
+ // any of the reference buffers, but we act conservatively and
+ // mark only the last buffer as corrupted.
+ //
+ // TODO(jkoleszar): Error concealment is undefined and non-normative
+ // at this point, but if it becomes so, [0] may not always be the correct
+ // thing to do here.
+ RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME);
+ if (ref_buf != NULL) ref_buf->buf.corrupted = 1;
+ }
+ // NOTE(review): presumably this call is what sets cm->cur_frame — confirm.
+ if (assign_cur_frame_new_fb(cm) == NULL) {
+ pbi->error.error_code = AOM_CODEC_MEM_ERROR;
+ return 1;
+ }
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(pbi->error.jmp)) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int i;
+
+ pbi->error.setjmp = 0;
+
+ // Synchronize all threads immediately as a subsequent decode call may
+ // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker);
+ for (i = 0; i < pbi->num_workers; ++i) {
+ winterface->sync(&pbi->tile_workers[i]);
+ }
+
+ release_current_frame(pbi);
+ return -1;  // NOTE(review): the other failure paths return 1.
+ }
+
+ pbi->error.setjmp = 1;
+
+ int frame_decoded =
+ aom_decode_frame_from_obus(pbi, source, source + size, psource);
+
+ if (frame_decoded < 0) {
+ assert(pbi->error.error_code != AOM_CODEC_OK);
+ release_current_frame(pbi);
+ pbi->error.setjmp = 0;
+ return 1;
+ }
+
+#if TXCOEFF_TIMER
+ cm->cum_txcoeff_timer += cm->txcoeff_timer;
+ fprintf(stderr,
+ "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
+ cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
+ cm->txcoeff_timer = 0;
+ cm->txb_count = 0;
+#endif
+
+ // Note: At this point, this function holds a reference to cm->cur_frame
+ // in the buffer pool. This reference is consumed by update_frame_buffers().
+ update_frame_buffers(pbi, frame_decoded);
+
+ if (frame_decoded) {
+ pbi->decoding_first_frame = 0;
+ }
+
+ if (pbi->error.error_code != AOM_CODEC_OK) {
+ pbi->error.setjmp = 0;
+ return 1;
+ }
+
+ if (!cm->show_existing_frame) {
+ if (cm->seg.enabled) {
+ if (cm->prev_frame &&
+ (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) &&
+ (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) {
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ } else {
+ cm->last_frame_seg_map = NULL;
+ }
+ }
+ }
+
+ // Clear the 'setjmp' flag before returning (see the setjmp() note above).
+ pbi->error.setjmp = 0;
+
+ return 0;
+}
+
+// Get the frame at position 'index' of the output queue; -1 if out of range.
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+                      aom_film_grain_t **grain_params) {
+  if (index >= pbi->num_output_frames) return -1;
+  *grain_params = &pbi->output_frames[index]->film_grain_params;
+  *sd = &pbi->output_frames[index]->buf;
+  return 0;
+}
+
+// Get the highest-spatial-layer output (the last entry of the output queue).
+// TODO(rachelbarker): What should this do?
+int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
+  const size_t num_queued = pbi->num_output_frames;
+  if (num_queued == 0) return -1;
+  *frame = pbi->output_frames[num_queued - 1]->buf;
+  return 0;
+}
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
new file mode 100644
index 0000000000..560b1d9f24
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/thread_common.h"
+#include "av1/decoder/dthread.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief Contains coding block data required by the decoder.
+ *
+ * This includes:
+ * - Coding block info that is common between encoder and decoder.
+ * - Other coding block info only needed by the decoder.
+ * Contrast this with a similar struct MACROBLOCK on encoder side.
+ * This data is also common between ThreadData and AV1Decoder structs.
+ */
+typedef struct DecoderCodingBlock {
+ /*!
+ * Coding block info that is common between encoder and decoder.
+ */
+ DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+ /*!
+ * True if at least one of the coding blocks decoded was corrupted.
+ */
+ int corrupted;
+ /*!
+ * Pointer to 'mc_buf' inside 'pbi->td' (single-threaded decoding) or
+ * 'pbi->thread_data[i].td' (multi-threaded decoding).
+ */
+ uint8_t *mc_buf[2];
+ /*!
+ * Pointer to 'dqcoeff' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+ * with appropriate offset for the current superblock, for each plane.
+ */
+ tran_low_t *dqcoeff_block[MAX_MB_PLANE];
+ /*!
+ * cb_offset[p] is the offset into the dqcoeff_block[p] for the current coding
+ * block, for each plane 'p'.
+ */
+ uint16_t cb_offset[MAX_MB_PLANE];
+ /*!
+ * Pointer to 'eob_data' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base'
+ * with appropriate offset for the current superblock, for each plane.
+ */
+ eob_info *eob_data[MAX_MB_PLANE];
+ /*!
+ * txb_offset[p] is the offset into the eob_data[p] for the current coding
+ * block, for each plane 'p'.
+ */
+ uint16_t txb_offset[MAX_MB_PLANE];
+ /*!
+ * ref_mv_count[i] specifies the number of motion vector candidates
+ * in xd->ref_mv_stack[i].
+ */
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+} DecoderCodingBlock;
+
+/*!\cond */
+
+typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
+ DecoderCodingBlock *dcb,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size);  // Type of four ThreadData *_block_visit hooks.
+// Type of ThreadData.predict_inter_block_visit.
+typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ DecoderCodingBlock *dcb,
+ BLOCK_SIZE bsize);
+// Type of ThreadData.cfl_store_inter_block_visit.
+typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd);
+
+typedef struct ThreadData {
+ DecoderCodingBlock dcb;
+
+ // Coding block buffer for the current superblock.
+ // Used only for single-threaded decoding and multi-threaded decoding with
+ // row_mt == 1 cases.
+ // See also: similar buffer in 'AV1Decoder'.
+ CB_BUFFER cb_buffer_base;
+
+ aom_reader *bit_reader;
+
+ // Motion compensation buffer used to get a prediction buffer with extended
+ // borders. One buffer for each of the two possible references.
+ uint8_t *mc_buf[2];
+ // Mask for this block used for compound prediction.
+ uint8_t *seg_mask;
+ // Allocated size of 'mc_buf'.
+ int32_t mc_buf_size;
+ // Boolean: if true, the byte pointers stored in 'mc_buf' were converted
+ // from highbd pointers.
+ int mc_buf_use_highbd;
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+ // Per-stage callbacks; see the *_visitor_fn_t typedefs for their signatures.
+ decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
+ decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
+ decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit;
+ decode_block_visitor_fn_t inverse_tx_inter_block_visit;
+ predict_inter_block_visitor_fn_t predict_inter_block_visit;
+ cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit;
+} ThreadData;
+
+typedef struct AV1DecRowMTJobInfo {  // NOTE(review): presumably one row-MT job.
+ int tile_row;
+ int tile_col;
+ int mi_row;  // NOTE(review): units assumed to be mode-info rows — confirm.
+} AV1DecRowMTJobInfo;
+
+typedef struct AV1DecRowMTSyncData {  // Used via the AV1DecRowMTSync typedef.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ int allocated_sb_rows;
+ int *cur_sb_col;  // NOTE(review): presumably one entry per superblock row.
+ // Denotes the superblock interval at which conditional signalling should
+ // happen. Also denotes the minimum number of extra superblocks of the top row
+ // to be complete to start decoding the current superblock. A value of 1
+ // indicates top-right dependency.
+ int sync_range;
+ // Denotes the additional number of superblocks in the previous row to be
+ // complete to start decoding the current superblock when intraBC tool is
+ // enabled. This additional top-right delay is required to satisfy the
+ // hardware constraints for intraBC tool when row multithreading is enabled.
+ int intrabc_extra_top_right_sb_delay;
+ int mi_rows;
+ int mi_cols;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int num_threads_working;
+} AV1DecRowMTSync;
+
+typedef struct AV1DecRowMTInfo {  // Stored as AV1Decoder.frame_row_mt_info.
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int start_tile;
+ int end_tile;
+ int mi_rows_to_decode;
+
+ // Invariant:
+ // mi_rows_parse_done >= mi_rows_decode_started.
+ // mi_rows_parse_done and mi_rows_decode_started are both initialized to 0.
+ // mi_rows_parse_done is incremented freely. mi_rows_decode_started may only
+ // be incremented to catch up with mi_rows_parse_done but is not allowed to
+ // surpass mi_rows_parse_done.
+ //
+ // When mi_rows_decode_started reaches mi_rows_to_decode, there are no more
+ // decode jobs.
+
+ // Indicates the progress of the bit-stream parsing of superblocks.
+ // Initialized to 0. Incremented by sb_mi_size when parse sb row is done.
+ int mi_rows_parse_done;
+ // Indicates the progress of the decoding of superblocks.
+ // Initialized to 0. Incremented by sb_mi_size when decode sb row is started.
+ int mi_rows_decode_started;
+ // Boolean: Initialized to 0 (false). Set to 1 (true) on error to abort
+ // decoding.
+ int row_mt_exit;
+} AV1DecRowMTInfo;
+
+typedef struct TileDataDec {  // Element type of AV1Decoder.tile_data.
+ TileInfo tile_info;
+ aom_reader bit_reader;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ AV1DecRowMTSync dec_row_mt_sync;  // Released by av1_dec_row_mt_dealloc().
+} TileDataDec;
+
+typedef struct TileBufferDec {  // Element type of AV1Decoder.tile_buffers.
+ const uint8_t *data;
+ size_t size;  // NOTE(review): presumably the byte length of 'data'.
+} TileBufferDec;
+
+typedef struct DataBuffer {  // Type of AV1Decoder.obu_size_hdr.
+ const uint8_t *data;
+ size_t size;  // NOTE(review): presumably the byte length of 'data'.
+} DataBuffer;
+
+typedef struct EXTERNAL_REFERENCES {  // Type of AV1Decoder.ext_refs.
+ YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES];
+ int num;  // NOTE(review): presumably the number of valid 'refs' entries.
+} EXTERNAL_REFERENCES;
+
+typedef struct TileJobsDec {  // Element type of AV1DecTileMTData.job_queue.
+ TileBufferDec *tile_buffer;
+ TileDataDec *tile_data;
+} TileJobsDec;
+
+typedef struct AV1DecTileMTData {  // Used via the AV1DecTileMT typedef.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;  // Destroyed and freed by av1_dealloc_dec_jobs().
+#endif
+ TileJobsDec *job_queue;  // Freed by av1_dealloc_dec_jobs().
+ int jobs_enqueued;
+ int jobs_dequeued;
+ int alloc_tile_rows;
+ int alloc_tile_cols;
+} AV1DecTileMT;
+
+typedef struct AV1Decoder {
+ DecoderCodingBlock dcb;
+
+ DECLARE_ALIGNED(32, AV1_COMMON, common);
+
+ AVxWorker lf_worker;
+ AV1LfSync lf_row_sync;
+ AV1LrSync lr_row_sync;
+ AV1LrStruct lr_ctxt;
+ AV1CdefSync cdef_sync;
+ AV1CdefWorkerData *cdef_worker;
+ AVxWorker *tile_workers;
+ int num_workers;
+ DecWorkerData *thread_data;
+ ThreadData td;
+ TileDataDec *tile_data;
+ int allocated_tiles;
+
+ TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ AV1DecTileMT tile_mt_info;
+
+ // Each time the decoder is called, we expect to receive a full temporal unit.
+ // This can contain up to one shown frame per spatial layer in the current
+ // operating point (note that some layers may be entirely omitted).
+ // If the 'output_all_layers' option is true, we save all of these shown
+ // frames so that they can be returned to the application. If the
+ // 'output_all_layers' option is false, then we only output one image per
+ // temporal unit.
+ //
+ // Note: The saved buffers are released at the start of the next time the
+ // application calls aom_codec_decode().
+ int output_all_layers;
+ RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS];
+ size_t num_output_frames; // How many frames are queued up so far?
+
+ // In order to properly support random-access decoding, we need
+ // to behave slightly differently for the very first frame we decode.
+ // So we track whether this is the first frame or not.
+ int decoding_first_frame;
+
+ int allow_lowbitdepth;
+ int max_threads;
+ int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame.
+ int reset_decoder_state;
+
+ int tile_size_bytes;
+ int tile_col_size_bytes;
+ int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding
+#if CONFIG_ACCOUNTING
+ int acct_enabled;
+ Accounting accounting;
+#endif
+ int sequence_header_ready;
+ int sequence_header_changed;
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+ int operating_point;
+ int current_operating_point;
+ int seen_frame_header;
+ // The expected start_tile (tg_start syntax element) of the next tile group.
+ int next_start_tile;
+
+ // State if the camera frame header is already decoded while
+ // large_scale_tile = 1.
+ int camera_frame_header_ready;
+ size_t frame_header_size;
+ DataBuffer obu_size_hdr;
+ int output_frame_width_in_tiles_minus_1;
+ int output_frame_height_in_tiles_minus_1;
+ int tile_count_minus_1;
+ uint32_t coded_tile_data_size;
+ unsigned int ext_tile_debug; // for ext-tile software debug & testing
+
+ // Decoder has 3 modes of operation:
+ // (1) Single-threaded decoding.
+ // (2) Multi-threaded decoding with each tile decoded in parallel.
+ // (3) In addition to (2), each thread decodes 1 superblock row in parallel.
+ // row_mt = 1 triggers mode (3) above, while row_mt = 0, will trigger mode (1)
+ // or (2) depending on 'max_threads'.
+ unsigned int row_mt;
+
+ EXTERNAL_REFERENCES ext_refs;
+ YV12_BUFFER_CONFIG tile_list_outbuf;
+
+ // Coding block buffer for the current frame.
+ // Allocated and used only for multi-threaded decoding with 'row_mt == 0'.
+ // See also: similar buffer in 'ThreadData' struct.
+ CB_BUFFER *cb_buffer_base;
+ // Allocated size of 'cb_buffer_base'. Currently same as the number of
+ // superblocks in the coded frame.
+ int cb_buffer_alloc_size;
+
+ int allocated_row_mt_sync_rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *row_mt_mutex_;
+ pthread_cond_t *row_mt_cond_;
+#endif
+
+ AV1DecRowMTInfo frame_row_mt_info;
+ aom_metadata_array_t *metadata;
+
+ int context_update_tile_id;
+ int skip_loop_filter;
+ int skip_film_grain;
+ int is_annexb;
+ int valid_for_referencing[REF_FRAMES];
+ int is_fwd_kf_present;
+ int is_arf_frame_present;
+ int num_tile_groups;
+ aom_s_frame_info sframe_info;
+
+ /*!
+ * Elements part of the sequence header, that are applicable for all the
+ * frames in the video.
+ */
+ SequenceHeader seq_params;
+
+ /*!
+ * If true, buffer removal times are present.
+ */
+ bool buffer_removal_time_present;
+
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info error;
+
+ /*!
+ * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_temporal_layers;
+
+ /*!
+ * Number of spatial layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_spatial_layers;
+} AV1Decoder;
+
+// Returns 0 on success. Sets pbi->error.error_code (also reachable through
+// pbi->common.error) to a nonzero error code and returns nonzero on failure.
+int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
+ const uint8_t **psource);
+
+// Get the frame at a particular index in the output queue
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+ aom_film_grain_t **grain_params);
+// Copies the descriptor of the last queued output frame; -1 if none is queued.
+int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+// Copies reference frame 'idx' into 'sd'; the plane dimensions must match.
+aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx,
+ YV12_BUFFER_CONFIG *sd);
+// Replaces reference 'idx' with 'sd': by copy, or by pointer if external.
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+ int use_external_ref,
+ YV12_BUFFER_CONFIG *sd);
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+// Returns NULL on failure. Release the result with av1_decoder_remove().
+struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
+
+void av1_decoder_remove(struct AV1Decoder *pbi);
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info);
+
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
+// Frees pbi->cb_buffer_base and resets its recorded allocation size.
+void av1_dec_free_cb_buf(AV1Decoder *pbi);
+
+static INLINE void decrease_ref_count(RefCntBuffer *const buf,
+                                      BufferPool *const pool) {
+  // Drops one reference to 'buf' (NULL is ignored) and hands the raw frame
+  // buffer back through the pool callback when the count reaches zero.
+  if (buf == NULL) return;
+
+  buf->ref_count -= 1;
+  // A negative count means there is a bug in the reference count management.
+  assert(buf->ref_count >= 0);
+  if (buf->ref_count != 0) return;
+
+  // The raw frame buffer is not set up until the header has been decoded, so
+  // a buffer dropped because of a header error may not have one.
+  if (buf->raw_frame_buffer.data == NULL) return;
+  pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+  buf->raw_frame_buffer.data = NULL;
+  buf->raw_frame_buffer.size = 0;
+  buf->raw_frame_buffer.priv = NULL;
+}
+
+#define ACCT_STR __func__
+// Reads a value below 'n' using l - 1 or l bits, l = get_unsigned_bits(n).
+static INLINE int av1_read_uniform(aom_reader *r, int n) {
+  const int l = get_unsigned_bits(n);
+  assert(l != 0);  // Checked before 'l - 1' is used as a bit count below.
+  const int m = (1 << l) - n;
+  const int v = aom_read_literal(r, l - 1, ACCT_STR);
+  // Values below m are complete after l - 1 bits; others read one more bit.
+  if (v < m) return v;
+  return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
+}
+
+typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane,
+ aom_reader *r);  // Callback type taken by av1_visit_palette().
+// Calls visit(xd, plane, r) for planes of a non-inter block that use a palette.
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ aom_reader *r, palette_visitor_fn_t visit);
+
+typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize);
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
new file mode 100644
index 0000000000..dd5aa62001
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/decoder/decodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "av1/decoder/decodemv.h"
+
+#define ACCT_STR __func__
+
+// Reads a Golomb-coded value: bits are consumed until a 1 is seen (counting
+// every bit read, the terminating 1 included), then count - 1 further bits are
+// appended below an implicit leading 1, and that number minus one is returned.
+// More than 20 prefix bits is reported as AOM_CODEC_CORRUPT_FRAME.
+static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
+  int prefix_len = 0;
+  int bit;
+
+  do {
+    bit = aom_read_bit(r, ACCT_STR);
+    if (++prefix_len > 20) {
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid length in read_golomb");
+      break;
+    }
+  } while (!bit);
+
+  int value = 1;
+  for (int k = 1; k < prefix_len; ++k) {
+    value = (value << 1) + aom_read_bit(r, ACCT_STR);
+  }
+
+  return value - 1;
+}
+
+// Reconstructs the end-of-block position from an eob group token and its extra
+// offset. |extra| is only added when the group's start position is above 2.
+static INLINE int rec_eob_pos(const int eob_token, const int extra) {
+  const int group_start = av1_eob_group_start[eob_token];
+  return (group_start > 2) ? group_start + extra : group_start;
+}
+
+// Picks dequant[0] for position 0 and dequant[1] otherwise, then applies the
+// rounded inverse quant-matrix weight for that position when one is supplied.
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  const int step = dequant[coeff_idx != 0];
+  if (iqmatrix == NULL) return step;
+  return (iqmatrix[coeff_idx] * step + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+}
+
+// Reads coefficient levels for scan indices end_si down to start_si using the
+// 2D context helpers, storing each level in the padded |levels| buffer.
+static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size,
+                                          int start_si, int end_si,
+                                          const int16_t *scan, int bhl,
+                                          uint8_t *levels,
+                                          base_cdf_arr base_cdf,
+                                          br_cdf_arr br_cdf) {
+  for (int si = end_si; si >= start_si; --si) {
+    const int pos = scan[si];
+    const int base_ctx = get_lower_levels_ctx_2d(levels, pos, bhl, tx_size);
+    int level = aom_read_symbol(r, base_cdf[base_ctx], 4, ACCT_STR);
+    if (level > NUM_BASE_LEVELS) {
+      aom_cdf_prob *range_cdf = br_cdf[get_br_ctx_2d(levels, pos, bhl)];
+      // Each symbol adds up to BR_CDF_SIZE - 1; a smaller value ends the run.
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int step = aom_read_symbol(r, range_cdf, BR_CDF_SIZE, ACCT_STR);
+        level += step;
+        if (step < BR_CDF_SIZE - 1) break;
+      }
+    }
+    levels[get_padded_idx(pos, bhl)] = level;
+  }
+}
+
+// Reads coefficient levels for scan indices end_si down to start_si using the
+// |tx_class|-aware context helpers, storing each level in the padded |levels|
+// buffer.
+static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size,
+                                       TX_CLASS tx_class, int start_si,
+                                       int end_si, const int16_t *scan, int bhl,
+                                       uint8_t *levels, base_cdf_arr base_cdf,
+                                       br_cdf_arr br_cdf) {
+  for (int si = end_si; si >= start_si; --si) {
+    const int pos = scan[si];
+    const int base_ctx =
+        get_lower_levels_ctx(levels, pos, bhl, tx_size, tx_class);
+    int level = aom_read_symbol(r, base_cdf[base_ctx], 4, ACCT_STR);
+    if (level > NUM_BASE_LEVELS) {
+      aom_cdf_prob *range_cdf = br_cdf[get_br_ctx(levels, pos, bhl, tx_class)];
+      // Each symbol adds up to BR_CDF_SIZE - 1; a smaller value ends the run.
+      for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+        const int step = aom_read_symbol(r, range_cdf, BR_CDF_SIZE, ACCT_STR);
+        level += step;
+        if (step < BR_CDF_SIZE - 1) break;
+      }
+    }
+    levels[get_padded_idx(pos, bhl)] = level;
+  }
+}
+
+// Decodes one transform block's coefficients for |plane| at (blk_row, blk_col).
+// The all-zero flag is read first; when it is set, eob stays 0, the luma
+// tx_type_map entry is set to DCT_DCT and 0 is returned. Otherwise this reads
+// the luma tx_type, the end-of-block position, the coefficient levels (in
+// reverse scan order) and then the signs plus Golomb-coded remainders (in
+// forward scan order), writing dequantized values into the dqcoeff buffer.
+// Returns the cumulative level, capped at COEFF_CONTEXT_MASK and then passed
+// through set_dc_sign() together with the signed DC value.
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb,
+ aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const TXB_CTX *const txb_ctx,
+ const TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &dcb->xd;
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int32_t max_value = (1 << (7 + xd->bd)) - 1;
+ const int32_t min_value = -(1 << (7 + xd->bd));
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
+ tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane];
+ const int shift = av1_get_tx_scale(tx_size);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cul_level = 0;
+ int dc_val = 0;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ const int all_zero = aom_read_symbol(
+ r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
+ eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane];
+ uint16_t *const eob = &(eob_data->eob);
+ uint16_t *const max_scan_line = &(eob_data->max_scan_line);
+ *max_scan_line = 0;
+ *eob = 0;
+
+#if CONFIG_INSPECTION
+ if (plane == 0) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col);
+ mbmi->tx_skip[txk_type_idx] = all_zero;
+ }
+#endif
+
+ if (all_zero) {
+ *max_scan_line = 0;
+ if (plane == 0) {
+ xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT;
+ }
+ return 0;
+ }
+
+ if (plane == AOM_PLANE_Y) {
+ // only y plane's tx_type is transmitted
+ av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
+ }
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const qm_val_t *iqmatrix =
+ av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ int eob_extra = 0;
+ int eob_pt = 1;
+
+ // Read the eob class (eob_pt). Both the CDF table and the alphabet size are
+ // selected by eob_multi_size; the symbol value is offset by 1.
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
+ 5, ACCT_STR) +
+ 1;
+ break;
+ case 1:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
+ 6, ACCT_STR) +
+ 1;
+ break;
+ case 2:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
+ 7, ACCT_STR) +
+ 1;
+ break;
+ case 3:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
+ 8, ACCT_STR) +
+ 1;
+ break;
+ case 4:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
+ 9, ACCT_STR) +
+ 1;
+ break;
+ case 5:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
+ 10, ACCT_STR) +
+ 1;
+ break;
+ case 6:
+ default:
+ eob_pt = aom_read_symbol(
+ r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
+ ACCT_STR) +
+ 1;
+ break;
+ }
+
+ // Read the offset within the eob class, most significant bit first. Only the
+ // top bit is coded with an adaptive CDF; the rest are raw bits.
+ const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int bit = aom_read_symbol(
+ r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (eob_offset_bits - 1));
+ }
+
+ for (int i = 1; i < eob_offset_bits; i++) {
+ bit = aom_read_bit(r, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (eob_offset_bits - 1 - i));
+ }
+ }
+ }
+ *eob = rec_eob_pos(eob_pt, eob_extra);
+
+ // With a single coefficient, only its own levels[] entry is written and read
+ // back below, so the buffer is cleared only when eob > 1.
+ if (*eob > 1) {
+ memset(levels_buf, 0,
+ sizeof(*levels_buf) *
+ ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END));
+ }
+
+ {
+ // Read the non-zero coefficient with scan index eob-1
+ // TODO(angiebird): Put this into a function
+ const int c = *eob - 1;
+ const int pos = scan[c];
+ const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, c);
+ const int nsymbs = 3;
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
+ int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
+ if (level > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx_eob(pos, bhl, tx_class);
+ cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+ level += k;
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ levels[get_padded_idx(pos, bhl)] = level;
+ }
+ // Read the remaining levels from scan index eob-2 down to 0. For the 2D
+ // class, index 0 is read through the generic helper rather than the 2D one.
+ if (*eob > 1) {
+ base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
+ br_cdf_arr br_cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
+ if (tx_class == TX_CLASS_2D) {
+ read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bhl, levels,
+ base_cdf, br_cdf);
+ read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bhl, levels,
+ base_cdf, br_cdf);
+ } else {
+ read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bhl,
+ levels, base_cdf, br_cdf);
+ }
+ }
+
+ // Second pass, forward scan order: read the sign and any Golomb remainder for
+ // each non-zero level, then dequantize and store the coefficient.
+ for (int c = 0; c < *eob; ++c) {
+ const int pos = scan[c];
+ uint8_t sign;
+ tran_low_t level = levels[get_padded_idx(pos, bhl)];
+ if (level) {
+ *max_scan_line = AOMMAX(*max_scan_line, pos);
+ if (c == 0) {
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2, ACCT_STR);
+ } else {
+ sign = aom_read_bit(r, ACCT_STR);
+ }
+ if (level >= MAX_BASE_BR_RANGE) {
+ level += read_golomb(xd, r);
+ }
+
+ if (c == 0) dc_val = sign ? -level : level;
+
+ // Bitmasking to clamp level to valid range:
+ // The valid range for 8/10/12 bit video is at most 14/16/18 bit
+ // NOTE(review): the masks used here and below keep 20 and 24 bits, which
+ // is wider than the bit counts quoted in these comments - confirm intent.
+ level &= 0xfffff;
+ cul_level += level;
+ tran_low_t dq_coeff;
+ // Bitmasking to clamp dq_coeff to valid range:
+ // The valid range for 8/10/12 bit video is at most 17/19/21 bit
+ dq_coeff = (tran_low_t)(
+ (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff);
+ dq_coeff = dq_coeff >> shift;
+ if (sign) {
+ dq_coeff = -dq_coeff;
+ }
+ tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
+ }
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+
+ // DC value
+ set_dc_sign(&cul_level, dc_val);
+
+ return cul_level;
+}
+
+// Wrapper around av1_read_coeffs_txb(): derives the entropy context for the
+// transform block at (row, col) in |plane|, decodes the block, and stores the
+// returned cul_level back into the entropy contexts. For inter luma blocks
+// whose transform width or height equals that of TX_64X64, the tx_type is also
+// written to tx_type_map at every 16x16-unit step inside the block.
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ DecoderCodingBlock *dcb, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size) {
+#if TXCOEFF_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ MACROBLOCKD *const xd = &dcb->xd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col,
+ pd->left_entropy_context + row, &txb_ctx);
+ const uint8_t cul_level =
+ av1_read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col,
+ row);
+
+ if (is_inter_block(mbmi)) {
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ // For luma, the tx_type was already parsed (or set to DCT_DCT for an
+ // all-zero block) by av1_read_coeffs_txb() above; fetch it here.
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size,
+ cm->features.reduced_tx_set_used);
+
+ if (plane == 0) {
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ // The 16x16 unit is due to the constraint from tx_64x64 which sets the
+ // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
+ // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
+ // the intricacy, cover all the 16x16 units inside a 64 level transform.
+ if (txw == tx_size_wide_unit[TX_64X64] ||
+ txh == tx_size_high_unit[TX_64X64]) {
+ const int tx_unit = tx_size_wide_unit[TX_16X16];
+ const int stride = xd->tx_type_map_stride;
+ for (int idy = 0; idy < txh; idy += tx_unit) {
+ for (int idx = 0; idx < txw; idx += tx_unit) {
+ xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type;
+ }
+ }
+ }
+ }
+ }
+
+#if TXCOEFF_TIMER
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ // NOTE(review): |cm| is a pointer-to-const in this signature, so the two
+ // writes below look like they would be rejected by the compiler when
+ // TXCOEFF_TIMER is enabled - confirm before turning the timer on.
+ cm->txcoeff_timer += elapsed_time;
+ ++cm->txb_count;
+#endif
+}
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
new file mode 100644
index 0000000000..fd34d40341
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
+
+#include "av1/common/enums.h"
+
+struct aom_reader;
+struct AV1Common;
+struct DecoderCodingBlock;
+struct txb_ctx;
+
+uint8_t av1_read_coeffs_txb(const struct AV1Common *const cm,
+ struct DecoderCodingBlock *dcb,
+ struct aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const struct txb_ctx *const txb_ctx,
+ const TX_SIZE tx_size);
+
+void av1_read_coeffs_txb_facade(const struct AV1Common *const cm,
+ struct DecoderCodingBlock *dcb,
+ struct aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size);
+#endif // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c
new file mode 100644
index 0000000000..3c6a006eaf
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/idct.h"
+
+// Decodes the palette color-index map described by |param| from |r|. The first
+// index is read with av1_read_uniform(); the remaining rows x cols indices are
+// read in wavefront (anti-diagonal) order. The map is then extended to the
+// full plane_width x plane_height area by repeating the last decoded column
+// and the last decoded row.
+static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
+  uint8_t color_order[PALETTE_MAX_SIZE];
+  const int n = param->n_colors;
+  uint8_t *const color_map = param->color_map;
+  MapCdf color_map_cdf = param->map_cdf;
+  const int stride = param->plane_width;
+  const int padded_rows = param->plane_height;
+  const int rows = param->rows;
+  const int cols = param->cols;
+
+  // Top-left entry.
+  color_map[0] = av1_read_uniform(r, n);
+  assert(color_map[0] < n);
+
+  // Diagonal |diag| covers every (row, col) with row + col == diag, visited
+  // from the largest column down to the smallest.
+  for (int diag = 1; diag < rows + cols - 1; ++diag) {
+    const int first_col = AOMMIN(diag, cols - 1);
+    const int last_col = AOMMAX(0, diag - rows + 1);
+    for (int col = first_col; col >= last_col; --col) {
+      const int row = diag - col;
+      const int color_ctx = av1_get_palette_color_index_context(
+          color_map, stride, row, col, n, color_order, NULL);
+      const int color_idx = aom_read_symbol(
+          r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
+      assert(color_idx >= 0 && color_idx < n);
+      color_map[row * stride + col] = color_order[color_idx];
+    }
+  }
+
+  // Fill the columns beyond |cols| in each decoded row with that row's last
+  // decoded value.
+  if (cols < stride) {
+    for (int row = 0; row < rows; ++row) {
+      uint8_t *const line = color_map + row * stride;
+      memset(line + cols, line[cols - 1], (stride - cols));
+    }
+  }
+
+  // Fill the rows beyond |rows| with copies of the last decoded row.
+  for (int row = rows; row < padded_rows; ++row) {
+    memcpy(color_map + row * stride, color_map + (rows - 1) * stride, stride);
+  }
+}
+
+// Decodes the palette color-index map of the current block for |plane|
+// (0 or 1): fills an Av1ColorMapParam from |xd| and the block's mode info, then
+// passes it to decode_color_map_tokens().
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                               aom_reader *r) {
+  assert(plane == 0 || plane == 1);
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  Av1ColorMapParam params;
+  params.n_colors = mbmi->palette_mode_info.palette_size[plane];
+  if (plane) {
+    params.map_cdf = xd->tile_ctx->palette_uv_color_index_cdf;
+  } else {
+    params.map_cdf = xd->tile_ctx->palette_y_color_index_cdf;
+  }
+  params.color_map =
+      xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
+                           &params.plane_height, &params.rows, &params.cols);
+
+  decode_color_map_tokens(&params, r);
+}
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
new file mode 100644
index 0000000000..173b437a94
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/scan.h"
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
new file mode 100644
index 0000000000..f82b9d8ccf
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_util/aom_thread.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct AV1Decoder;
+struct ThreadData;
+
+// Data bundle for a decoder worker.
+// NOTE(review): the field notes below are inferred from names and types only;
+// confirm them against the code that fills this struct in.
+typedef struct DecWorkerData {
+ struct ThreadData *td; // presumably the ThreadData this worker operates on
+ const uint8_t *data_end; // presumably the end of the input data - confirm
+ struct aom_internal_error_info error_info; // error info held in this struct
+} DecWorkerData;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+// NOTE(review): the notes on data/data_end/data_size/user_priv/received_frame
+// are inferred from names and types only; confirm against the users.
+typedef struct FrameWorkerData {
+ struct AV1Decoder *pbi; // decoder instance used by this worker
+ const uint8_t *data; // presumably the start of the input data
+ const uint8_t *data_end; // presumably the end of the input data
+ size_t data_size; // presumably the size of the input data in bytes
+ void *user_priv; // opaque pointer; presumably owned by the caller
+ int received_frame; // presumably a flag set once a frame was received
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/grain_synthesis.c b/third_party/aom/av1/decoder/grain_synthesis.c
new file mode 100644
index 0000000000..d276f6f90e
--- /dev/null
+++ b/third_party/aom/av1/decoder/grain_synthesis.c
@@ -0,0 +1,1461 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/decoder/grain_synthesis.h"
+
+// Samples with a Gaussian distribution in the range [-2048, 2047] (12 bits),
+// with zero mean and a standard deviation of about 512. The values should be
+// divided by 4 for the 10-bit range and by 16 for the 8-bit range.
+static const int gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
+
+// Number of random bits drawn per grain sample; the draw is used as an index
+// into gaussian_sequence[] by the grain block generators.
+static const int gauss_bits = 11;
+
+// Luma sub-block dimensions. These are re-assigned to 32 at the start of
+// av1_add_film_grain_run().
+static int luma_subblock_size_y = 32;
+static int luma_subblock_size_x = 32;
+
+// Chroma sub-block dimensions. av1_add_film_grain_run() recomputes them as the
+// luma sub-block size shifted right by the chroma subsampling.
+static int chroma_subblock_size_y = 16;
+static int chroma_subblock_size_x = 16;
+
+// 8-bit luma clipping limits applied when params->clip_to_restricted_range is
+// set (the high bit depth path shifts them left by bit_depth - 8).
+static const int min_luma_legal_range = 16;
+static const int max_luma_legal_range = 235;
+
+// 8-bit chroma clipping limits for the same mode; when mc_identity is set the
+// luma limits are used for chroma instead.
+static const int min_chroma_legal_range = 16;
+static const int max_chroma_legal_range = 240;
+
+// 256-entry scaling look-up tables, one per plane. Zeroed by init_arrays() and
+// filled by init_scaling_function().
+static int scaling_lut_y[256];
+static int scaling_lut_cb[256];
+static int scaling_lut_cr[256];
+
+// Bounds used to clamp grain values; set per call in av1_add_film_grain_run()
+// from the bit depth.
+// NOTE(review): all of this is mutable file-scope state, so concurrent calls
+// would presumably interfere with each other - confirm with callers.
+static int grain_min;
+static int grain_max;
+
+static uint16_t random_register = 0; // random number generator register
+
+// Releases every buffer handed out by init_arrays() and resets each of the
+// caller's pointers to NULL.
+//
+// The two prediction-position tables are arrays of row pointers; their row
+// counts are recomputed here from params (ar_coeff_lag and num_y_points), so
+// params must be the same structure that was used for the allocation.
+static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
+                           int ***pred_pos_chroma, int **luma_grain_block,
+                           int **cb_grain_block, int **cr_grain_block,
+                           int **y_line_buf, int **cb_line_buf,
+                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
+                           int **cr_col_buf) {
+  const int luma_rows = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  // Chroma has one extra row when luma grain is present.
+  const int chroma_rows = luma_rows + (params->num_y_points > 0 ? 1 : 0);
+
+  int **luma_table = *pred_pos_luma;
+  if (luma_table != NULL) {
+    for (int r = 0; r < luma_rows; ++r) aom_free(luma_table[r]);
+    aom_free(luma_table);
+    *pred_pos_luma = NULL;
+  }
+
+  int **chroma_table = *pred_pos_chroma;
+  if (chroma_table != NULL) {
+    for (int r = 0; r < chroma_rows; ++r) aom_free(chroma_table[r]);
+    aom_free(chroma_table);
+    *pred_pos_chroma = NULL;
+  }
+
+  // The remaining buffers are flat allocations: free each and clear the slot.
+  int **const flat_buffers[] = { y_line_buf,       cb_line_buf,    cr_line_buf,
+                                 y_col_buf,        cb_col_buf,     cr_col_buf,
+                                 luma_grain_block, cb_grain_block, cr_grain_block };
+  const int flat_count = (int)(sizeof(flat_buffers) / sizeof(flat_buffers[0]));
+  for (int k = 0; k < flat_count; ++k) {
+    aom_free(*flat_buffers[k]);
+    *flat_buffers[k] = NULL;
+  }
+}
+
+// Allocates and initializes all working buffers for one grain synthesis run.
+//
+// Outputs (all written through the pointer arguments):
+//   pred_pos_luma_p / pred_pos_chroma_p - tables of AR prediction positions;
+//       each row holds { row offset, column offset, flag }, where flag 1 marks
+//       the extra chroma entry that is predicted from luma.
+//   *_line_buf, *_col_buf               - line and column overlap buffers.
+//   *_grain_block                       - grain templates of
+//       luma_grain_samples / chroma_grain_samples entries.
+// The three scaling look-up tables are also reset to zero.
+//
+// Returns true on success. On any allocation failure everything allocated so
+// far is released, all output pointers are left NULL, and false is returned.
+static bool init_arrays(const aom_film_grain_t *params, int luma_stride,
+                        int chroma_stride, int ***pred_pos_luma_p,
+                        int ***pred_pos_chroma_p, int **luma_grain_block,
+                        int **cb_grain_block, int **cr_grain_block,
+                        int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
+                        int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
+                        int luma_grain_samples, int chroma_grain_samples,
+                        int chroma_subsamp_y, int chroma_subsamp_x) {
+  *pred_pos_luma_p = NULL;
+  *pred_pos_chroma_p = NULL;
+  *luma_grain_block = NULL;
+  *cb_grain_block = NULL;
+  *cr_grain_block = NULL;
+  *y_line_buf = NULL;
+  *cb_line_buf = NULL;
+  *cr_line_buf = NULL;
+  *y_col_buf = NULL;
+  *cb_col_buf = NULL;
+  *cr_col_buf = NULL;
+
+  memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
+  memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
+  memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
+
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (params->num_y_points > 0) ++num_pos_chroma;
+
+  int **pred_pos_luma;
+  int **pred_pos_chroma;
+
+  pred_pos_luma = (int **)aom_calloc(num_pos_luma, sizeof(*pred_pos_luma));
+  if (!pred_pos_luma) return false;
+  // Publish the table immediately so that dealloc_arrays() can release it (and
+  // any rows allocated so far) on the failure paths below. aom_calloc leaves
+  // the not-yet-allocated row pointers NULL, which is safe to free.
+  *pred_pos_luma_p = pred_pos_luma;
+
+  for (int row = 0; row < num_pos_luma; row++) {
+    pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
+    if (!pred_pos_luma[row]) {
+      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
+                     luma_grain_block, cb_grain_block, cr_grain_block,
+                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
+                     cb_col_buf, cr_col_buf);
+      return false;
+    }
+  }
+
+  pred_pos_chroma =
+      (int **)aom_calloc(num_pos_chroma, sizeof(*pred_pos_chroma));
+  if (!pred_pos_chroma) {
+    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
+                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
+                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
+    return false;
+  }
+  // Same reasoning as for the luma table: make it visible to dealloc_arrays().
+  *pred_pos_chroma_p = pred_pos_chroma;
+
+  for (int row = 0; row < num_pos_chroma; row++) {
+    pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
+    if (!pred_pos_chroma[row]) {
+      dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p,
+                     luma_grain_block, cb_grain_block, cr_grain_block,
+                     y_line_buf, cb_line_buf, cr_line_buf, y_col_buf,
+                     cb_col_buf, cr_col_buf);
+      return false;
+    }
+  }
+
+  int pos_ar_index = 0;
+
+  // Positions in the rows above the current sample: every column in
+  // [-lag, lag] for each row in [-lag, -1].
+  for (int row = -params->ar_coeff_lag; row < 0; row++) {
+    for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
+         col++) {
+      pred_pos_luma[pos_ar_index][0] = row;
+      pred_pos_luma[pos_ar_index][1] = col;
+      pred_pos_luma[pos_ar_index][2] = 0;
+
+      pred_pos_chroma[pos_ar_index][0] = row;
+      pred_pos_chroma[pos_ar_index][1] = col;
+      pred_pos_chroma[pos_ar_index][2] = 0;
+      ++pos_ar_index;
+    }
+  }
+
+  // Positions to the left of the current sample in the same row.
+  for (int col = -params->ar_coeff_lag; col < 0; col++) {
+    pred_pos_luma[pos_ar_index][0] = 0;
+    pred_pos_luma[pos_ar_index][1] = col;
+    pred_pos_luma[pos_ar_index][2] = 0;
+
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = col;
+    pred_pos_chroma[pos_ar_index][2] = 0;
+
+    ++pos_ar_index;
+  }
+
+  // Extra chroma entry (flag 1): prediction from the co-located luma grain.
+  if (params->num_y_points > 0) {
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = 0;
+    pred_pos_chroma[pos_ar_index][2] = 1;
+  }
+
+  *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
+  *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+  *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+
+  *y_col_buf =
+      (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
+  *cb_col_buf =
+      (int *)aom_malloc(sizeof(**cb_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+  *cr_col_buf =
+      (int *)aom_malloc(sizeof(**cr_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+
+  *luma_grain_block =
+      (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
+  *cb_grain_block =
+      (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
+  *cr_grain_block =
+      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
+  // The flat buffers are checked together; one failure releases everything.
+  if (!(*pred_pos_luma_p && *pred_pos_chroma_p && *y_line_buf && *cb_line_buf &&
+        *cr_line_buf && *y_col_buf && *cb_col_buf && *cr_col_buf &&
+        *luma_grain_block && *cb_grain_block && *cr_grain_block)) {
+    dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block,
+                   cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf,
+                   cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf);
+    return false;
+  }
+  return true;
+}
+
+// Advances the 16-bit shift register by one step and returns its top `bits`
+// bits, i.e. a value between 0 and 2^bits - 1.
+static INLINE int get_random_number(int bits) {
+  const unsigned state = random_register;
+  // Feedback bit: XOR of register bits 0, 1, 3 and 12.
+  const unsigned feedback =
+      (state ^ (state >> 1) ^ (state >> 3) ^ (state >> 12)) & 1u;
+  // Shift right by one and insert the feedback bit at the top.
+  random_register = (uint16_t)((state >> 1) | (feedback << 15));
+  return (random_register >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+// Seeds the random register for a given luma line.
+//
+// The seed itself is shared by the whole picture; it is then perturbed with a
+// value derived from luma_line >> 5, so every band of 32 luma lines starts
+// from a different register state.
+static void init_random_generator(int luma_line, uint16_t seed) {
+  const uint16_t seed_hi = (seed >> 8) & 255;
+  const uint16_t seed_lo = seed & 255;
+  const int band = luma_line >> 5;
+
+  uint16_t state = (seed_hi << 8) + seed_lo;
+  // Mix the band number into the high byte, then into the low byte.
+  state ^= ((band * 37 + 178) & 255) << 8;
+  state ^= ((band * 173 + 105) & 255);
+
+  random_register = state;
+}
+
+// Builds the luma grain template in luma_grain_block.
+//
+// If there are no luma scaling points the template is simply zeroed.
+// Otherwise every sample is first filled with a scaled draw from
+// gaussian_sequence[], and then the area inside the padding is filtered in
+// place with the auto-regressive coefficients params->ar_coeffs_y, using the
+// neighbour offsets listed in pred_pos_luma. Results are clamped to
+// [grain_min, grain_max].
+static void generate_luma_grain_block(
+    const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+    int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
+    int left_pad, int top_pad, int right_pad, int bottom_pad) {
+  if (params->num_y_points == 0) {
+    memset(luma_grain_block, 0,
+           sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
+    return;
+  }
+
+  const int noise_shift = 12 - params->bit_depth + params->grain_scale_shift;
+  const int tap_count = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  const int ar_round = 1 << (params->ar_coeff_shift - 1);
+
+  // Pass 1: white Gaussian noise over the whole template, padding included.
+  for (int row = 0; row < luma_block_size_y; ++row) {
+    for (int col = 0; col < luma_block_size_x; ++col) {
+      const int draw = gaussian_sequence[get_random_number(gauss_bits)];
+      luma_grain_block[row * luma_grain_stride + col] =
+          (draw + ((1 << noise_shift) >> 1)) >> noise_shift;
+    }
+  }
+
+  // Pass 2: auto-regressive filtering of the interior, in raster order so
+  // that already-filtered neighbours feed later samples.
+  const int row_end = luma_block_size_y - bottom_pad;
+  const int col_end = luma_block_size_x - right_pad;
+  for (int row = top_pad; row < row_end; ++row) {
+    for (int col = left_pad; col < col_end; ++col) {
+      int acc = 0;
+      for (int tap = 0; tap < tap_count; ++tap) {
+        const int *offset = pred_pos_luma[tap];
+        acc += params->ar_coeffs_y[tap] *
+               luma_grain_block[(row + offset[0]) * luma_grain_stride + col +
+                                offset[1]];
+      }
+      const int at = row * luma_grain_stride + col;
+      luma_grain_block[at] =
+          clamp(luma_grain_block[at] +
+                    ((acc + ar_round) >> params->ar_coeff_shift),
+                grain_min, grain_max);
+    }
+  }
+}
+
+// Builds the Cb and Cr grain templates.
+//
+// A plane is filled with scaled Gaussian noise when it has scaling points or
+// when chroma scaling is derived from luma; otherwise it is zeroed. The random
+// generator is re-seeded separately for Cb and for Cr. The interior of each
+// active plane is then filtered in place with its auto-regressive
+// coefficients. A position flagged 1 in pred_pos_chroma takes its input from
+// the luma template, averaged over the luma samples that cover the chroma
+// sample.
+//
+// Returns false (after printing a message to stderr) if pred_pos_chroma holds
+// a flag other than 0 or 1; returns true otherwise.
+static bool generate_chroma_grain_blocks(
+    const aom_film_grain_t *params, int **pred_pos_chroma,
+    int *luma_grain_block, int *cb_grain_block, int *cr_grain_block,
+    int luma_grain_stride, int chroma_block_size_y, int chroma_block_size_x,
+    int chroma_grain_stride, int left_pad, int top_pad, int right_pad,
+    int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
+  const int noise_shift = 12 - params->bit_depth + params->grain_scale_shift;
+  const int ar_round = 1 << (params->ar_coeff_shift - 1);
+  const int plane_samples = chroma_block_size_y * chroma_grain_stride;
+  const int luma_avg_shift = chroma_subsamp_y + chroma_subsamp_x;
+
+  int tap_count = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  if (params->num_y_points > 0) tap_count += 1;
+
+  const int cb_active =
+      params->num_cb_points || params->chroma_scaling_from_luma;
+  const int cr_active =
+      params->num_cr_points || params->chroma_scaling_from_luma;
+
+  // Noise fill for Cb (or zero fill when the plane gets no grain).
+  if (!cb_active) {
+    memset(cb_grain_block, 0, sizeof(*cb_grain_block) * plane_samples);
+  } else {
+    init_random_generator(7 << 5, params->random_seed);
+    for (int row = 0; row < chroma_block_size_y; ++row) {
+      for (int col = 0; col < chroma_block_size_x; ++col) {
+        const int draw = gaussian_sequence[get_random_number(gauss_bits)];
+        cb_grain_block[row * chroma_grain_stride + col] =
+            (draw + ((1 << noise_shift) >> 1)) >> noise_shift;
+      }
+    }
+  }
+
+  // Same for Cr, with its own seed offset.
+  if (!cr_active) {
+    memset(cr_grain_block, 0, sizeof(*cr_grain_block) * plane_samples);
+  } else {
+    init_random_generator(11 << 5, params->random_seed);
+    for (int row = 0; row < chroma_block_size_y; ++row) {
+      for (int col = 0; col < chroma_block_size_x; ++col) {
+        const int draw = gaussian_sequence[get_random_number(gauss_bits)];
+        cr_grain_block[row * chroma_grain_stride + col] =
+            (draw + ((1 << noise_shift) >> 1)) >> noise_shift;
+      }
+    }
+  }
+
+  // Auto-regressive filtering of the interior of both planes.
+  const int row_end = chroma_block_size_y - bottom_pad;
+  const int col_end = chroma_block_size_x - right_pad;
+  for (int row = top_pad; row < row_end; ++row) {
+    for (int col = left_pad; col < col_end; ++col) {
+      int acc_cb = 0;
+      int acc_cr = 0;
+
+      for (int tap = 0; tap < tap_count; ++tap) {
+        const int *entry = pred_pos_chroma[tap];
+        const int source = entry[2];
+
+        if (source == 0) {
+          // Neighbour inside the same chroma plane.
+          const int at =
+              (row + entry[0]) * chroma_grain_stride + col + entry[1];
+          acc_cb += params->ar_coeffs_cb[tap] * cb_grain_block[at];
+          acc_cr += params->ar_coeffs_cr[tap] * cr_grain_block[at];
+        } else if (source == 1) {
+          // Co-located luma grain, averaged over the covering luma samples.
+          const int luma_y0 = ((row - top_pad) << chroma_subsamp_y) + top_pad;
+          const int luma_x0 = ((col - left_pad) << chroma_subsamp_x) + left_pad;
+          int luma_sum = 0;
+
+          for (int ly = luma_y0; ly < luma_y0 + chroma_subsamp_y + 1; ++ly) {
+            for (int lx = luma_x0; lx < luma_x0 + chroma_subsamp_x + 1; ++lx) {
+              luma_sum += luma_grain_block[ly * luma_grain_stride + lx];
+            }
+          }
+
+          const int luma_avg =
+              (luma_sum + ((1 << luma_avg_shift) >> 1)) >> luma_avg_shift;
+
+          acc_cb += params->ar_coeffs_cb[tap] * luma_avg;
+          acc_cr += params->ar_coeffs_cr[tap] * luma_avg;
+        } else {
+          fprintf(
+              stderr,
+              "Grain synthesis: prediction between two chroma components is "
+              "not supported!");
+          return false;
+        }
+      }
+
+      const int at = row * chroma_grain_stride + col;
+      if (cb_active) {
+        cb_grain_block[at] =
+            clamp(cb_grain_block[at] +
+                      ((acc_cb + ar_round) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      }
+      if (cr_active) {
+        cr_grain_block[at] =
+            clamp(cr_grain_block[at] +
+                      ((acc_cr + ar_round) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      }
+    }
+  }
+  return true;
+}
+
+// Fills a 256-entry scaling look-up table from a piecewise-linear function.
+//
+//   scaling_points - num_points pairs { x, y }; x is the table index and y
+//                    the scaling value at that index. The x coordinates are
+//                    expected to be increasing.
+//   num_points     - number of pairs; if it is zero or negative the table is
+//                    left untouched.
+//   scaling_lut    - output table with at least 256 entries.
+//
+// Entries before the first point take the first point's value, entries from
+// the last point onwards take the last point's value, and entries in between
+// are linearly interpolated in 16.16 fixed point.
+static void init_scaling_function(const int scaling_points[][2], int num_points,
+                                  int scaling_lut[]) {
+  // A non-positive count would index before the start of the array below.
+  if (num_points <= 0) return;
+
+  for (int i = 0; i < scaling_points[0][0]; i++)
+    scaling_lut[i] = scaling_points[0][1];
+
+  for (int point = 0; point < num_points - 1; point++) {
+    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+    // A segment with no width has nothing to interpolate. Skipping it also
+    // avoids a division by zero when two points share an x coordinate.
+    if (delta_x <= 0) continue;
+
+    // Slope in 16.16 fixed point, with the reciprocal of delta_x rounded.
+    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+    for (int x = 0; x < delta_x; x++) {
+      scaling_lut[scaling_points[point][0] + x] =
+          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+    }
+  }
+
+  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+    scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// Reads a scaling value from a 256-entry look-up table.
+//
+// For 8-bit input the index addresses the table directly. For higher bit
+// depths the upper 8 bits select the entry and the remaining low bits are
+// used to interpolate linearly (with rounding) towards the next entry. The
+// last entry (255) is never interpolated.
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+  const int extra_bits = bit_depth - 8;
+  const int entry = index >> extra_bits;
+
+  if (extra_bits == 0 || entry == 255) return scaling_lut[entry];
+
+  const int fraction = index & ((1 << extra_bits) - 1);
+  const int base = scaling_lut[entry];
+  const int step = scaling_lut[entry + 1] - base;
+  return base + ((step * fraction + (1 << (extra_bits - 1))) >> extra_bits);
+}
+
+// Adds film grain in place to one block of an 8-bit image.
+//
+// luma / cb / cr point at the top-left sample of the block in each plane and
+// luma_grain / cb_grain / cr_grain at the matching grain samples, each with
+// its own stride. The block covers 2 * half_luma_height luma rows and
+// 2 * half_luma_width luma columns; the chroma extent is derived from those
+// with the subsampling shifts.
+//
+// For each sample the grain value is multiplied by a factor read from the
+// plane's scaling table, rounded and shifted by params->scaling_shift, added
+// to the sample and clamped to the selected output range. The chroma table
+// index is a weighted mix of the co-located luma and the chroma sample itself
+// plus an offset.
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int luma_stride,
+ int chroma_stride, int *luma_grain,
+ int *cb_grain, int *cr_grain,
+ int luma_grain_stride, int chroma_grain_stride,
+ int half_luma_height, int half_luma_width,
+ int bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity) {
+ int cb_mult = params->cb_mult - 128; // fixed scale
+ int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
+ int cb_offset = params->cb_offset - 256;
+
+ int cr_mult = params->cr_mult - 128; // fixed scale
+ int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
+ int cr_offset = params->cr_offset - 256;
+
+ int rounding_offset = (1 << (params->scaling_shift - 1));
+
+ // A plane receives grain only if it has scaling points; chroma also
+ // receives grain when its scaling is derived from luma.
+ int apply_y = params->num_y_points > 0 ? 1 : 0;
+ int apply_cb =
+ (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+ int apply_cr =
+ (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+
+ // With these overrides the chroma table index below reduces to the
+ // averaged luma value: (average_luma * 64) >> 6, with no chroma term and no
+ // offset.
+ if (params->chroma_scaling_from_luma) {
+ cb_mult = 0; // fixed scale
+ cb_luma_mult = 64; // fixed scale
+ cb_offset = 0;
+
+ cr_mult = 0; // fixed scale
+ cr_luma_mult = 64; // fixed scale
+ cr_offset = 0;
+ }
+
+ int min_luma, max_luma, min_chroma, max_chroma;
+
+ // Output clamping range: restricted ("legal") range or full 8-bit range.
+ if (params->clip_to_restricted_range) {
+ min_luma = min_luma_legal_range;
+ max_luma = max_luma_legal_range;
+
+ // With identity matrix coefficients chroma uses the luma limits.
+ if (mc_identity) {
+ min_chroma = min_luma_legal_range;
+ max_chroma = max_luma_legal_range;
+ } else {
+ min_chroma = min_chroma_legal_range;
+ max_chroma = max_chroma_legal_range;
+ }
+ } else {
+ min_luma = min_chroma = 0;
+ max_luma = max_chroma = 255;
+ }
+
+ // Chroma is processed before luma, so the luma samples read here do not yet
+ // contain grain.
+ for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+ for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+ int average_luma = 0;
+ // With horizontal subsampling, average the two luma samples of the pair
+ // (rounded); otherwise take the single co-located luma sample.
+ if (chroma_subsamp_x) {
+ average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x)] +
+ luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x) + 1] +
+ 1) >>
+ 1;
+ } else {
+ average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+ }
+
+ if (apply_cb) {
+ cb[i * chroma_stride + j] = clamp(
+ cb[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cb,
+ clamp(((average_luma * cb_luma_mult +
+ cb_mult * cb[i * chroma_stride + j]) >>
+ 6) +
+ cb_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ 8) *
+ cb_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+
+ if (apply_cr) {
+ cr[i * chroma_stride + j] = clamp(
+ cr[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cr,
+ clamp(((average_luma * cr_luma_mult +
+ cr_mult * cr[i * chroma_stride + j]) >>
+ 6) +
+ cr_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ 8) *
+ cr_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ }
+ }
+
+ // Luma: the scaling factor is looked up from the luma sample itself.
+ if (apply_y) {
+ for (int i = 0; i < (half_luma_height << 1); i++) {
+ for (int j = 0; j < (half_luma_width << 1); j++) {
+ luma[i * luma_stride + j] =
+ clamp(luma[i * luma_stride + j] +
+ ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
+ luma_grain[i * luma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_luma, max_luma);
+ }
+ }
+ }
+}
+
+// High bit depth (16-bit sample storage) counterpart of add_noise_to_block():
+// adds film grain in place to one block of the image.
+//
+// The arguments have the same meaning as in the 8-bit version, except that
+// the planes hold uint16_t samples. Chroma offsets, the restricted-range
+// limits and the full-range maximum are all scaled by bit_depth, and
+// scale_LUT() is called with bit_depth so that it interpolates between table
+// entries.
+static void add_noise_to_block_hbd(
+ const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+ int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
+ int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
+ int half_luma_height, int half_luma_width, int bit_depth,
+ int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
+ int cb_mult = params->cb_mult - 128; // fixed scale
+ int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
+ // offset value depends on the bit depth
+ int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+ int cr_mult = params->cr_mult - 128; // fixed scale
+ int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
+ // offset value depends on the bit depth
+ int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+ int rounding_offset = (1 << (params->scaling_shift - 1));
+
+ // A plane receives grain only if it has scaling points; chroma also
+ // receives grain when its scaling is derived from luma. (The extra "> 0" on
+ // the chroma conditions compares a 0/1 value and does not change the
+ // result.)
+ int apply_y = params->num_y_points > 0 ? 1 : 0;
+ int apply_cb =
+ (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+ : 0;
+ int apply_cr =
+ (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+ : 0;
+
+ // With these overrides the chroma table index below reduces to the
+ // averaged luma value: (average_luma * 64) >> 6, with no chroma term and no
+ // offset.
+ if (params->chroma_scaling_from_luma) {
+ cb_mult = 0; // fixed scale
+ cb_luma_mult = 64; // fixed scale
+ cb_offset = 0;
+
+ cr_mult = 0; // fixed scale
+ cr_luma_mult = 64; // fixed scale
+ cr_offset = 0;
+ }
+
+ int min_luma, max_luma, min_chroma, max_chroma;
+
+ // Output clamping range: the 8-bit restricted-range limits scaled up to
+ // bit_depth, or the full range of bit_depth.
+ if (params->clip_to_restricted_range) {
+ min_luma = min_luma_legal_range << (bit_depth - 8);
+ max_luma = max_luma_legal_range << (bit_depth - 8);
+
+ // With identity matrix coefficients chroma uses the luma limits.
+ if (mc_identity) {
+ min_chroma = min_luma_legal_range << (bit_depth - 8);
+ max_chroma = max_luma_legal_range << (bit_depth - 8);
+ } else {
+ min_chroma = min_chroma_legal_range << (bit_depth - 8);
+ max_chroma = max_chroma_legal_range << (bit_depth - 8);
+ }
+ } else {
+ min_luma = min_chroma = 0;
+ max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
+ }
+
+ // Chroma is processed before luma, so the luma samples read here do not yet
+ // contain grain.
+ for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+ for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+ int average_luma = 0;
+ // With horizontal subsampling, average the two luma samples of the pair
+ // (rounded); otherwise take the single co-located luma sample.
+ if (chroma_subsamp_x) {
+ average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x)] +
+ luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x) + 1] +
+ 1) >>
+ 1;
+ } else {
+ average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+ }
+
+ if (apply_cb) {
+ cb[i * chroma_stride + j] = clamp(
+ cb[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cb,
+ clamp(((average_luma * cb_luma_mult +
+ cb_mult * cb[i * chroma_stride + j]) >>
+ 6) +
+ cb_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ bit_depth) *
+ cb_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ if (apply_cr) {
+ cr[i * chroma_stride + j] = clamp(
+ cr[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cr,
+ clamp(((average_luma * cr_luma_mult +
+ cr_mult * cr[i * chroma_stride + j]) >>
+ 6) +
+ cr_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ bit_depth) *
+ cr_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ }
+ }
+
+ // Luma: the scaling factor is looked up from the luma sample itself.
+ if (apply_y) {
+ for (int i = 0; i < (half_luma_height << 1); i++) {
+ for (int j = 0; j < (half_luma_width << 1); j++) {
+ luma[i * luma_stride + j] =
+ clamp(luma[i * luma_stride + j] +
+ ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
+ bit_depth) *
+ luma_grain[i * luma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_luma, max_luma);
+ }
+ }
+ }
+}
+
+// Copies a rectangle of samples row by row.
+//
+// src_stride and dst_stride are in bytes; width is in samples. Each sample is
+// two bytes when use_high_bit_depth is non-zero and one byte otherwise.
+static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
+                      int dst_stride, int width, int height,
+                      int use_high_bit_depth) {
+  const int bytes_per_sample = use_high_bit_depth ? 2 : 1;
+  for (; height != 0; --height) {
+    memcpy(dst, src, width * sizeof(uint8_t) * bytes_per_sample);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Copies a width x height rectangle of int values row by row. Both strides
+// are counted in ints.
+static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
+                      int width, int height) {
+  for (; height != 0; --height) {
+    memcpy(dst, src, width * sizeof(*src));
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Pads a plane to even dimensions by replicating its edge samples.
+//
+// If width is odd, the last column of each of the `height` rows is duplicated
+// into column `width`. If height is odd, the last row is then duplicated into
+// row `height`, over the width rounded up to even. The buffer must have room
+// for the extra column and row. dst_stride is in bytes; samples are uint16_t
+// when use_high_bit_depth is non-zero.
+static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
+                        int use_high_bit_depth) {
+  const int odd_width = width & 1;
+  const int odd_height = height & 1;
+  if (!odd_width && !odd_height) return;
+
+  const int even_width = (width + 1) & (~1);
+
+  if (!use_high_bit_depth) {
+    if (odd_width) {
+      for (int row = 0; row < height; ++row) {
+        uint8_t *line = &dst[row * dst_stride];
+        line[width] = line[width - 1];
+      }
+    }
+    if (odd_height) {
+      memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
+             sizeof(*dst) * even_width);
+    }
+    return;
+  }
+
+  uint16_t *pix = (uint16_t *)dst;
+  const int pix_stride = dst_stride / 2;  // bytes -> 16-bit samples
+  if (odd_width) {
+    for (int row = 0; row < height; ++row) {
+      uint16_t *line = &pix[row * pix_stride];
+      line[width] = line[width - 1];
+    }
+  }
+  if (odd_height) {
+    memcpy(&pix[height * pix_stride], &pix[(height - 1) * pix_stride],
+           sizeof(*pix) * even_width);
+  }
+}
+
+// Blends the grain of two horizontally adjacent blocks across their shared
+// vertical boundary and writes the result to dst_block.
+//
+// Only overlap widths of 1 and 2 columns are handled; any other width leaves
+// dst_block untouched. The weights are 23/22 for a single column and 27/17
+// then 17/27 for two columns, each with rounding and a shift by 5. Results
+// are clamped to [grain_min, grain_max].
+static void ver_boundary_overlap(int *left_block, int left_stride,
+                                 int *right_block, int right_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  if (width != 1 && width != 2) return;
+
+  for (; height != 0; --height) {
+    if (width == 1) {
+      dst_block[0] = clamp((left_block[0] * 23 + right_block[0] * 22 + 16) >> 5,
+                           grain_min, grain_max);
+    } else {
+      const int near_left = 27 * left_block[0] + 17 * right_block[0] + 16;
+      dst_block[0] = clamp(near_left >> 5, grain_min, grain_max);
+      const int near_right = 17 * left_block[1] + 27 * right_block[1] + 16;
+      dst_block[1] = clamp(near_right >> 5, grain_min, grain_max);
+    }
+    left_block += left_stride;
+    right_block += right_stride;
+    dst_block += dst_stride;
+  }
+}
+
+// Blends the grain of two vertically adjacent blocks across their shared
+// horizontal boundary and writes the result to dst_block.
+//
+// Only overlap heights of 1 and 2 rows are handled; any other height leaves
+// dst_block untouched. The weights are 23/22 for a single row and 27/17 then
+// 17/27 for two rows, each with rounding and a shift by 5. Results are
+// clamped to [grain_min, grain_max].
+static void hor_boundary_overlap(int *top_block, int top_stride,
+                                 int *bottom_block, int bottom_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  if (height != 1 && height != 2) return;
+
+  for (; width != 0; --width) {
+    if (height == 1) {
+      dst_block[0] = clamp((top_block[0] * 23 + bottom_block[0] * 22 + 16) >> 5,
+                           grain_min, grain_max);
+    } else {
+      const int near_top = 27 * top_block[0] + 17 * bottom_block[0] + 16;
+      dst_block[0] = clamp(near_top >> 5, grain_min, grain_max);
+      // Second row of the overlap, addressed through each buffer's stride.
+      const int near_bottom =
+          17 * top_block[top_stride] + 27 * bottom_block[bottom_stride] + 16;
+      dst_block[dst_stride] = clamp(near_bottom >> 5, grain_min, grain_max);
+    }
+    ++top_block;
+    ++bottom_block;
+    ++dst_block;
+  }
+}
+
+// Copies src into dst and applies film grain to dst.
+//
+// The image format selects the chroma subsampling and whether samples are
+// stored as 16-bit values. Image metadata is copied from src to dst, the luma
+// plane is copied and padded to even dimensions, the chroma planes are copied
+// unless the image is monochrome, and the grain is then added in place by
+// av1_add_film_grain_run().
+//
+// Returns -1 (after printing a message to stderr) for an unsupported format;
+// otherwise returns the result of av1_add_film_grain_run().
+int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
+                       aom_image_t *dst) {
+  int chroma_subsamp_x;
+  int chroma_subsamp_y;
+
+  // Subsampling per format family; unsupported formats are rejected here.
+  switch (src->fmt) {
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I42016:
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 1;
+      break;
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I44416:
+      chroma_subsamp_x = 0;
+      chroma_subsamp_y = 0;
+      break;
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42216:
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 0;
+      break;
+    default:  // unknown input format
+      fprintf(stderr, "Film grain error: input format is not supported!");
+      return -1;
+  }
+
+  // The "16" variants store each sample in two bytes.
+  const int use_high_bit_depth =
+      (src->fmt == AOM_IMG_FMT_I42016 || src->fmt == AOM_IMG_FMT_I44416 ||
+       src->fmt == AOM_IMG_FMT_I42216)
+          ? 1
+          : 0;
+  const int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
+
+  assert(params->bit_depth == src->bit_depth);
+
+  // Carry the image description over to the destination.
+  dst->fmt = src->fmt;
+  dst->bit_depth = src->bit_depth;
+
+  dst->r_w = src->r_w;
+  dst->r_h = src->r_h;
+  dst->d_w = src->d_w;
+  dst->d_h = src->d_h;
+
+  dst->cp = src->cp;
+  dst->tc = src->tc;
+  dst->mc = src->mc;
+
+  dst->monochrome = src->monochrome;
+  dst->csp = src->csp;
+  dst->range = src->range;
+
+  dst->x_chroma_shift = src->x_chroma_shift;
+  dst->y_chroma_shift = src->y_chroma_shift;
+
+  dst->temporal_id = src->temporal_id;
+  dst->spatial_id = src->spatial_id;
+
+  // Grain is applied on dimensions rounded up to even.
+  const int width = src->d_w + (src->d_w & 1);
+  const int height = src->d_h + (src->d_h & 1);
+
+  uint8_t *const luma = dst->planes[AOM_PLANE_Y];
+  uint8_t *const cb = dst->planes[AOM_PLANE_U];
+  uint8_t *const cr = dst->planes[AOM_PLANE_V];
+
+  copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], luma,
+            dst->stride[AOM_PLANE_Y], src->d_w, src->d_h, use_high_bit_depth);
+  // Note that dst is already assumed to be aligned to even.
+  extend_even(luma, dst->stride[AOM_PLANE_Y], src->d_w, src->d_h,
+              use_high_bit_depth);
+
+  if (!src->monochrome) {
+    const int chroma_width = width >> chroma_subsamp_x;
+    const int chroma_height = height >> chroma_subsamp_y;
+
+    copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], cb,
+              dst->stride[AOM_PLANE_U], chroma_width, chroma_height,
+              use_high_bit_depth);
+
+    copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], cr,
+              dst->stride[AOM_PLANE_V], chroma_width, chroma_height,
+              use_high_bit_depth);
+  }
+
+  // luma and chroma strides in samples
+  const int luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
+  const int chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
+
+  return av1_add_film_grain_run(
+      params, luma, cb, cr, height, width, luma_stride, chroma_stride,
+      use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+}
+
+int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity) {
+ int **pred_pos_luma;
+ int **pred_pos_chroma;
+ int *luma_grain_block;
+ int *cb_grain_block;
+ int *cr_grain_block;
+
+ int *y_line_buf;
+ int *cb_line_buf;
+ int *cr_line_buf;
+
+ int *y_col_buf;
+ int *cb_col_buf;
+ int *cr_col_buf;
+
+ random_register = params->random_seed;
+
+ int left_pad = 3;
+ int right_pad = 3; // padding to offset for AR coefficients
+ int top_pad = 3;
+ int bottom_pad = 0;
+
+ int ar_padding = 3; // maximum lag used for stabilization of AR coefficients
+
+ luma_subblock_size_y = 32;
+ luma_subblock_size_x = 32;
+
+ chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
+ chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
+
+ // Initial padding is only needed for generation of
+ // film grain templates (to stabilize the AR process)
+ // Only a 64x64 luma and 32x32 chroma part of a template
+ // is used later for adding grain, padding can be discarded
+
+ int luma_block_size_y =
+ top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
+ int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
+ 2 * ar_padding + right_pad;
+
+ int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+ chroma_subblock_size_y * 2 + bottom_pad;
+ int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+ chroma_subblock_size_x * 2 +
+ (2 >> chroma_subsamp_x) * ar_padding + right_pad;
+
+ int luma_grain_stride = luma_block_size_x;
+ int chroma_grain_stride = chroma_block_size_x;
+
+ int overlap = params->overlap_flag;
+ int bit_depth = params->bit_depth;
+
+ const int grain_center = 128 << (bit_depth - 8);
+ grain_min = 0 - grain_center;
+ grain_max = grain_center - 1;
+
+ if (!init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
+ &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
+ &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
+ &y_col_buf, &cb_col_buf, &cr_col_buf,
+ luma_block_size_y * luma_block_size_x,
+ chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
+ chroma_subsamp_x))
+ return -1;
+
+ generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+ luma_block_size_y, luma_block_size_x,
+ luma_grain_stride, left_pad, top_pad, right_pad,
+ bottom_pad);
+
+ if (!generate_chroma_grain_blocks(
+ params, pred_pos_chroma, luma_grain_block, cb_grain_block,
+ cr_grain_block, luma_grain_stride, chroma_block_size_y,
+ chroma_block_size_x, chroma_grain_stride, left_pad, top_pad,
+ right_pad, bottom_pad, chroma_subsamp_y, chroma_subsamp_x))
+ return -1;
+
+ init_scaling_function(params->scaling_points_y, params->num_y_points,
+ scaling_lut_y);
+
+ if (params->chroma_scaling_from_luma) {
+ memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+ memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+ } else {
+ init_scaling_function(params->scaling_points_cb, params->num_cb_points,
+ scaling_lut_cb);
+ init_scaling_function(params->scaling_points_cr, params->num_cr_points,
+ scaling_lut_cr);
+ }
+ for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
+ init_random_generator(y * 2, params->random_seed);
+
+ for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
+ int offset_y = get_random_number(8);
+ int offset_x = (offset_y >> 4) & 15;
+ offset_y &= 15;
+
+ int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
+ int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
+
+ int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+ offset_y * (2 >> chroma_subsamp_y);
+ int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+ offset_x * (2 >> chroma_subsamp_x);
+
+ if (overlap && x) {
+ ver_boundary_overlap(
+ y_col_buf, 2,
+ luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x,
+ luma_grain_stride, y_col_buf, 2, 2,
+ AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+ ver_boundary_overlap(
+ cb_col_buf, 2 >> chroma_subsamp_x,
+ cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x,
+ chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ ver_boundary_overlap(
+ cr_col_buf, 2 >> chroma_subsamp_x,
+ cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x,
+ chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ int i = y ? 1 : 0;
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params,
+ (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
+ (uint16_t *)cb +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ (uint16_t *)cr +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride, y_col_buf + i * 4,
+ cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ 2, (2 - chroma_subsamp_x),
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+ bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + ((y + i) << 1) * luma_stride + (x << 1),
+ cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride, y_col_buf + i * 4,
+ cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ 2, (2 - chroma_subsamp_x),
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+ bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+ }
+
+ if (overlap && y) {
+ if (x) {
+ hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
+ y_line_buf + (x << 1), luma_stride, 2, 2);
+
+ hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ cb_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_y);
+
+ hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ cr_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_y);
+ }
+
+ hor_boundary_overlap(
+ y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x + (x ? 2 : 0),
+ luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
+ width - ((x ? x + 1 : 0) << 1)),
+ 2);
+
+ hor_boundary_overlap(
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_grain_stride,
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x -
+ ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+ 2 >> chroma_subsamp_y);
+
+ hor_boundary_overlap(
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_grain_stride,
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x -
+ ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+ 2 >> chroma_subsamp_y);
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+ (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ luma_stride, chroma_stride, y_line_buf + (x << 1),
+ cb_line_buf + (x << (1 - chroma_subsamp_x)),
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+ chroma_stride, 1,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + (y << 1) * luma_stride + (x << 1),
+ cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ luma_stride, chroma_stride, y_line_buf + (x << 1),
+ cb_line_buf + (x << (1 - chroma_subsamp_x)),
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+ chroma_stride, 1,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+ }
+
+ int i = overlap && y ? 1 : 0;
+ int j = overlap && x ? 1 : 0;
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params,
+ (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+ (uint16_t *)cb +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ (uint16_t *)cr +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride,
+ luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+ luma_offset_x + (j << 1),
+ cb_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ cr_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ luma_grain_stride, chroma_grain_stride,
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+ cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride,
+ luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+ luma_offset_x + (j << 1),
+ cb_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ cr_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ luma_grain_stride, chroma_grain_stride,
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+
+ if (overlap) {
+ if (x) {
+ // Copy overlapped column bufer to line buffer
+ copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
+ y_line_buf + (x << 1), luma_stride, 2, 2);
+
+ copy_area(
+ cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+ 2 >> chroma_subsamp_x,
+ cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+ 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+
+ copy_area(
+ cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+ 2 >> chroma_subsamp_x,
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+ 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+ }
+
+ // Copy grain to the line buffer for overlap with a bottom block
+ copy_area(
+ luma_grain_block +
+ (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
+ luma_offset_x + ((x ? 2 : 0)),
+ luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
+
+ copy_area(cb_grain_block +
+ (chroma_offset_y + chroma_subblock_size_y) *
+ chroma_grain_stride +
+ chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+ chroma_grain_stride,
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x,
+ ((width - (x << 1)) >> chroma_subsamp_x)) -
+ (x ? 2 >> chroma_subsamp_x : 0),
+ 2 >> chroma_subsamp_y);
+
+ copy_area(cr_grain_block +
+ (chroma_offset_y + chroma_subblock_size_y) *
+ chroma_grain_stride +
+ chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+ chroma_grain_stride,
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x,
+ ((width - (x << 1)) >> chroma_subsamp_x)) -
+ (x ? 2 >> chroma_subsamp_x : 0),
+ 2 >> chroma_subsamp_y);
+
+ // Copy grain to the column buffer for overlap with the next block to
+ // the right
+
+ copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x + luma_subblock_size_x,
+ luma_grain_stride, y_col_buf, 2, 2,
+ AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+ copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + chroma_subblock_size_x,
+ chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + chroma_subblock_size_x,
+ chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+ }
+ }
+ }
+
+ dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
+ &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
+ &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
+ return 0;
+}
diff --git a/third_party/aom/av1/decoder/grain_synthesis.h b/third_party/aom/av1/decoder/grain_synthesis.h
new file mode 100644
index 0000000000..9858ce0013
--- /dev/null
+++ b/third_party/aom/av1/decoder/grain_synthesis.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain synthesis
+ *
+ */
+#ifndef AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
+#define AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom_dsp/grain_params.h"
+#include "aom/aom_image.h"
+
+/*!\brief Add film grain to raw planes
+ *
+ * Add film grain to an image given as separate luma/cb/cr plane pointers
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in] grain_params Grain parameters
+ * \param[in] luma luma plane
+ * \param[in] cb cb plane
+ * \param[in] cr cr plane
+ * \param[in] height luma plane height
+ * \param[in] width luma plane width
+ * \param[in] luma_stride luma plane stride
+ * \param[in] chroma_stride chroma plane stride
+ * \param[in] use_high_bit_depth nonzero when the planes are accessed
+ * through uint16_t pointers instead of uint8_t
+ * \param[in] chroma_subsamp_y vertical chroma subsampling, used as a
+ * shift amount (presumably 0 or 1, confirm)
+ * \param[in] chroma_subsamp_x horizontal chroma subsampling, used as a
+ * shift amount (presumably 0 or 1, confirm)
+ * \param[in] mc_identity forwarded unchanged to the noise-adding
+ * routines (presumably flags identity matrix
+ * coefficients, confirm)
+ *
+ * NOTE(review): luma/cb/cr are tagged [in] but are non-const and are handed
+ * to the noise-adding routines, so they look like in/out buffers; confirm.
+ */
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain to an aom_image_t
+ *
+ * Add film grain to an image described by an aom_image_t; the source image
+ * is const and the result is delivered through a separate destination image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in] grain_params Grain parameters
+ * \param[in] src Source image
+ * \param[out] dst Resulting image with grain
+ */
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+ const aom_image_t *src, aom_image_t *dst);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c
new file mode 100644
index 0000000000..288d69a224
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/inspection.h"
+#include "av1/common/enums.h"
+#include "av1/common/cdef.h"
+
+// Stores the grid dimensions in 'fd' and allocates one insp_mi_data entry per
+// grid cell (mi_rows * mi_cols entries). Note the parameter order: columns
+// first, then rows. Prints a message to stderr and aborts the process if the
+// allocation fails.
+static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) {
+  fd->mi_cols = mi_cols;
+  fd->mi_rows = mi_rows;
+
+  const size_t grid_bytes = sizeof(insp_mi_data) * fd->mi_rows * fd->mi_cols;
+  fd->mi_grid = (insp_mi_data *)aom_malloc(grid_bytes);
+  if (fd->mi_grid) return;
+
+  fprintf(stderr, "Error allocating inspection data\n");
+  abort();
+}
+
+// Initializes 'fd' for a frame of the given pixel dimensions. Each dimension
+// is aligned with ALIGN_POWER_OF_TWO(..., 3) and then shifted right by
+// MI_SIZE_LOG2 to obtain the grid size handed to ifd_init_mi_rc().
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(frame_width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(frame_height, 3);
+  ifd_init_mi_rc(fd, aligned_width >> MI_SIZE_LOG2,
+                 aligned_height >> MI_SIZE_LOG2);
+}
+
+// Releases the mode-info grid held by 'fd' and leaves fd->mi_grid set to
+// NULL. The stored dimensions are not touched.
+void ifd_clear(insp_frame_data *fd) {
+  insp_mi_data *const grid = fd->mi_grid;
+  fd->mi_grid = NULL;
+  aom_free(grid);
+}
+
+/* Copies frame-level and per-mode-info data from the decoder into 'fd'.
+ *
+ * 'decoder' must point to a struct AV1Decoder. If the decoder's mode-info
+ * grid dimensions differ from the ones stored in 'fd', the grid in 'fd' is
+ * freed and reallocated first. When 'skip_not_transform' is nonzero, tx_size
+ * is reported as -1 for blocks flagged as skipped, and tx_type is reported as
+ * -1 for blocks flagged as skipped or whose tx_skip entry is set.
+ *
+ * Always returns 1.
+ *
+ * TODO(negge) This function may be called by more than one thread when using
+ * a multi-threaded decoder and this may cause a data race. */
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
+  struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
+  AV1_COMMON *const cm = &pbi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+
+  if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) {
+    ifd_clear(fd);
+    // ifd_init_mi_rc() takes (mi_cols, mi_rows) in that order. Passing them
+    // swapped would store transposed dimensions in 'fd' and force a
+    // reallocation on every call for non-square grids.
+    ifd_init_mi_rc(fd, mi_params->mi_cols, mi_params->mi_rows);
+  }
+  fd->show_existing_frame = cm->show_existing_frame;
+  fd->frame_number = cm->current_frame.frame_number;
+  fd->show_frame = cm->show_frame;
+  fd->frame_type = cm->current_frame.frame_type;
+  fd->base_qindex = quant_params->base_qindex;
+  // Set width and height of the first tile until generic support can be added
+  TileInfo tile_info;
+  av1_tile_set_row(&tile_info, cm, 0);
+  av1_tile_set_col(&tile_info, cm, 0);
+  fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start;
+  fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start;
+  fd->delta_q_present_flag = cm->delta_q_info.delta_q_present_flag;
+  fd->delta_q_res = cm->delta_q_info.delta_q_res;
+#if CONFIG_ACCOUNTING
+  fd->accounting = &pbi->accounting;
+#endif
+  // TODO(negge): copy per frame CDEF data
+  int i, j;
+  for (i = 0; i < MAX_SEGMENTS; i++) {
+    for (j = 0; j < 2; j++) {
+      fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j];
+      fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j];
+      fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j];
+    }
+  }
+  for (j = 0; j < mi_params->mi_rows; j++) {
+    for (i = 0; i < mi_params->mi_cols; i++) {
+      // The decoder grid is addressed with mi_stride, the inspection grid is
+      // densely packed with mi_cols entries per row.
+      const MB_MODE_INFO *mbmi =
+          mi_params->mi_grid_base[j * mi_params->mi_stride + i];
+      insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i];
+      // Segment
+      mi->segment_id = mbmi->segment_id;
+      // Motion Vectors
+      mi->mv[0].row = mbmi->mv[0].as_mv.row;
+      mi->mv[0].col = mbmi->mv[0].as_mv.col;
+      mi->mv[1].row = mbmi->mv[1].as_mv.row;
+      mi->mv[1].col = mbmi->mv[1].as_mv.col;
+      // Reference Frames
+      mi->ref_frame[0] = mbmi->ref_frame[0];
+      mi->ref_frame[1] = mbmi->ref_frame[1];
+      // Prediction Mode
+      mi->mode = mbmi->mode;
+      mi->intrabc = (int16_t)mbmi->use_intrabc;
+      mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0];
+      mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1];
+      // Prediction Mode for Chromatic planes
+      if (mi->mode < INTRA_MODES) {
+        mi->uv_mode = mbmi->uv_mode;
+      } else {
+        mi->uv_mode = UV_MODE_INVALID;
+      }
+
+      mi->motion_mode = mbmi->motion_mode;
+      mi->compound_type = mbmi->interinter_comp.type;
+
+      // Block Size
+      mi->bsize = mbmi->bsize;
+      // Skip Flag
+      mi->skip = mbmi->skip_txfm;
+      mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
+      mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
+      mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
+
+      // Transform
+      // TODO(anyone): extract tx type info from mbmi->txk_type[].
+
+      const BLOCK_SIZE bsize = mbmi->bsize;
+      const int c = i % mi_size_wide[bsize];
+      const int r = j % mi_size_high[bsize];
+      // Keep the real transform size in a local. mi->tx_size may be replaced
+      // by the -1 sentinel below, and that sentinel must never be used to
+      // index the tx_size_*_unit tables.
+      const int tx_size =
+          (is_inter_block(mbmi) || is_intrabc_block(mbmi))
+              ? mbmi->inter_tx_size[av1_get_txb_size_index(bsize, r, c)]
+              : mbmi->tx_size;
+      mi->tx_size = (int16_t)((skip_not_transform && mi->skip) ? -1 : tx_size);
+
+      if (mi->skip) {
+        const int tx_type_row = j - j % tx_size_high_unit[tx_size];
+        const int tx_type_col = i - i % tx_size_wide_unit[tx_size];
+        const int tx_type_map_idx =
+            tx_type_row * mi_params->mi_stride + tx_type_col;
+        mi->tx_type = mi_params->tx_type_map[tx_type_map_idx];
+      } else {
+        mi->tx_type = 0;
+      }
+
+      if (skip_not_transform &&
+          (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)]))
+        mi->tx_type = -1;
+
+      mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] /
+                       CDEF_SEC_STRENGTHS;
+      mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] %
+                          CDEF_SEC_STRENGTHS;
+
+      // A remainder of 3 is reported as 4.
+      mi->cdef_strength += mi->cdef_strength == 3;
+      if (mbmi->uv_mode == UV_CFL_PRED) {
+        mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
+        mi->cfl_alpha_sign = mbmi->cfl_alpha_signs;
+      } else {
+        mi->cfl_alpha_idx = 0;
+        mi->cfl_alpha_sign = 0;
+      }
+      // delta_q
+      mi->current_qindex = mbmi->current_qindex;
+    }
+  }
+  return 1;
+}
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
new file mode 100644
index 0000000000..70b1c80fab
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include "av1/common/seg_common.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#ifndef AOM_AOM_AOMDX_H_
+typedef void (*aom_inspect_cb)(void *decoder, void *data);
+#endif
+
+// Motion vector stored as a (row, col) pair of 16-bit components.
+typedef struct insp_mv insp_mv;
+
+struct insp_mv {
+ int16_t row;
+ int16_t col;
+};
+
+// Inspection data for one mode-info grid cell. ifd_inspect() fills one of
+// these per cell from the decoder's MB_MODE_INFO for that cell.
+typedef struct insp_mi_data insp_mi_data;
+
+struct insp_mi_data {
+ // Motion vectors copied from mv[0] and mv[1] of the block.
+ insp_mv mv[2];
+ int16_t ref_frame[2];
+ // Luma prediction mode.
+ int16_t mode;
+ // Chroma prediction mode; UV_MODE_INVALID when mode is not below
+ // INTRA_MODES.
+ int16_t uv_mode;
+ int16_t bsize;
+ // Copy of the block's skip_txfm flag.
+ int16_t skip;
+ int16_t segment_id;
+ // filter[0] * 3 + filter[1].
+ int16_t dual_filter_type;
+ // Interpolation filters extracted with av1_extract_interp_filter() for
+ // indices 0 and 1.
+ int16_t filter[2];
+ // -1 when ifd_inspect() is called with skip_not_transform set and the
+ // block is flagged as skipped or has its tx_skip entry set.
+ int16_t tx_type;
+ // -1 when ifd_inspect() is called with skip_not_transform set and the
+ // block is flagged as skipped.
+ int16_t tx_size;
+ // Quotient of the selected CDEF strength entry by CDEF_SEC_STRENGTHS.
+ int16_t cdef_level;
+ // Remainder of the selected CDEF strength entry modulo
+ // CDEF_SEC_STRENGTHS, with a remainder of 3 reported as 4.
+ int16_t cdef_strength;
+ // CfL parameters; both are 0 unless the block's uv_mode is UV_CFL_PRED.
+ int16_t cfl_alpha_idx;
+ int16_t cfl_alpha_sign;
+ int16_t current_qindex;
+ // Copy of interinter_comp.type.
+ int16_t compound_type;
+ int16_t motion_mode;
+ // Copy of use_intrabc.
+ int16_t intrabc;
+ // palette_size[0] of the block's palette mode info.
+ int16_t palette;
+ // palette_size[1] of the block's palette mode info.
+ int16_t uv_palette;
+};
+
+// Per-frame inspection data filled in by ifd_inspect().
+typedef struct insp_frame_data insp_frame_data;
+
+struct insp_frame_data {
+#if CONFIG_ACCOUNTING
+ // Points at the decoder's accounting data; not owned by this struct.
+ Accounting *accounting;
+#endif
+ // Grid of mi_rows * mi_cols entries, indexed as row * mi_cols + col.
+ // Allocated by ifd_init() / ifd_inspect() and released by ifd_clear().
+ insp_mi_data *mi_grid;
+ // NOTE(review): 16-bit field assigned from the decoder's frame_number;
+ // presumably the source is wider and gets truncated, confirm.
+ int16_t frame_number;
+ int show_frame;
+ int frame_type;
+ int base_qindex;
+ int mi_rows;
+ int mi_cols;
+ // Dimensions of the first tile only (tile row 0, tile column 0).
+ int tile_mi_rows;
+ int tile_mi_cols;
+ // First two dequantizer entries per segment, copied from the decoder's
+ // y/u/v_dequant_QTX tables.
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t u_dequant[MAX_SEGMENTS][2];
+ int16_t v_dequant[MAX_SEGMENTS][2];
+ // TODO(negge): add per frame CDEF data
+ int delta_q_present_flag;
+ int delta_q_res;
+ int show_existing_frame;
+};
+
+// Allocates the mode-info grid of 'fd' for a frame of the given pixel
+// dimensions. Aborts the process if the allocation fails.
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
+// Frees the mode-info grid of 'fd' and sets the pointer to NULL.
+void ifd_clear(insp_frame_data *fd);
+// Fills 'fd' from 'decoder', which must point to a struct AV1Decoder. A
+// nonzero 'skip_not_transform' reports tx_size/tx_type as -1 for skipped
+// blocks. Always returns 1.
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
new file mode 100644
index 0000000000..0e31ce9404
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.c
@@ -0,0 +1,1101 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_ports/mem_ops.h"
+
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+#include "av1/common/timing.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+// Derives the number of spatial and temporal layers from
+// 'operating_point_idc' by counting set bits: the low
+// MAX_NUM_TEMPORAL_LAYERS bits count temporal layers and the following
+// MAX_NUM_SPATIAL_LAYERS bits count spatial layers. An idc of 0 reports one
+// layer of each kind. Returns AOM_CODEC_INVALID_PARAM if either output
+// pointer is NULL, AOM_CODEC_OK otherwise.
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+    int operating_point_idc, unsigned int *number_spatial_layers,
+    unsigned int *number_temporal_layers) {
+  if (!(number_spatial_layers && number_temporal_layers))
+    return AOM_CODEC_INVALID_PARAM;
+
+  if (operating_point_idc == 0) {
+    *number_temporal_layers = 1;
+    *number_spatial_layers = 1;
+    return AOM_CODEC_OK;
+  }
+
+  *number_spatial_layers = 0;
+  *number_temporal_layers = 0;
+
+  int bit = 0;
+  while (bit < MAX_NUM_SPATIAL_LAYERS) {
+    const int shift = bit + MAX_NUM_TEMPORAL_LAYERS;
+    *number_spatial_layers += (operating_point_idc >> shift) & 0x1;
+    ++bit;
+  }
+  for (bit = 0; bit < MAX_NUM_TEMPORAL_LAYERS; ++bit) {
+    *number_temporal_layers += (operating_point_idc >> bit) & 0x1;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+// Returns 1 if the OBU described by 'obu_header' belongs to the decoder's
+// current operating point, 0 otherwise. Every OBU is accepted when the
+// current operating point is 0 or the header has no extension. Otherwise the
+// bit indexed by temporal_layer_id and the bit indexed by
+// spatial_layer_id + 8 must both be set in the current operating point.
+static int is_obu_in_current_operating_point(AV1Decoder *pbi,
+                                             const ObuHeader *obu_header) {
+  if (!pbi->current_operating_point) return 1;
+  if (!obu_header->has_extension) return 1;
+
+  const int temporal_bit = obu_header->temporal_layer_id;
+  if (!((pbi->current_operating_point >> temporal_bit) & 0x1)) return 0;
+
+  const int spatial_bit = obu_header->spatial_layer_id + 8;
+  return ((pbi->current_operating_point >> spatial_bit) & 0x1) ? 1 : 0;
+}
+
+// Consumes bits from 'rb' until its bit offset is a multiple of 8. Every
+// consumed bit must be zero: on a nonzero bit the error code of 'cm' is set
+// to AOM_CODEC_CORRUPT_FRAME and -1 is returned. Returns 0 on success.
+static int byte_alignment(AV1_COMMON *const cm,
+                          struct aom_read_bit_buffer *const rb) {
+  while ((rb->bit_offset & 7) != 0) {
+    if (!aom_rb_read_bit(rb)) continue;
+    cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
+    return -1;
+  }
+  return 0;
+}
+
+// Reads nothing and always returns 0.
+static uint32_t read_temporal_delimiter_obu(void) {
+  return 0;
+}
+
+// Reads a LEVEL_BITS-wide level index from 'rb' into '*seq_level_idx'.
+// Returns 1 if is_valid_seq_level_idx() accepts the value and 0 otherwise;
+// '*seq_level_idx' is written in both cases.
+static int read_bitstream_level(AV1_LEVEL *seq_level_idx,
+                                struct aom_read_bit_buffer *rb) {
+  *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+  return is_valid_seq_level_idx(*seq_level_idx) ? 1 : 0;
+}
+
+// Returns nonzero when the two sequence headers agree with each other.
+// Only the bytes that precede the 'op_params' field are compared, because
+// Section 7.5 of the spec says:
+//   Within a particular coded video sequence, the contents of
+//   sequence_header_obu must be bit-identical each time the sequence header
+//   appears except for the contents of operating_parameters_info.
+static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
+                                      const SequenceHeader *seq_params_new) {
+  const size_t compared_bytes = offsetof(SequenceHeader, op_params);
+  return memcmp(seq_params_old, seq_params_new, compared_bytes) == 0;
+}
+
+// Parses a sequence header OBU from 'rb'.
+// On success, copies the parsed header into *cm->seq_params, sets
+// pbi->sequence_header_ready to 1 and returns the number of bytes read from
+// 'rb' (bits consumed, rounded up to whole bytes).
+// On failure, sets pbi->error.error_code and returns 0. Some failures are
+// reported through aom_internal_error() instead; whether that call returns
+// here depends on how the error handler is set up (not visible here).
+// NOTE(review): pbi->current_operating_point and the layer counts are updated
+// before parsing has finished, so a later failure leaves them changed while
+// *cm->seq_params keeps its old value; confirm this is intended.
+static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const uint32_t saved_bit_offset = rb->bit_offset;
+
+ // Verify rb has been configured to report errors.
+ assert(rb->error_handler);
+
+ // Use a local variable to store the information as we decode. At the end,
+ // if no errors have occurred, cm->seq_params is updated.
+ SequenceHeader sh = *cm->seq_params;
+ SequenceHeader *const seq_params = &sh;
+
+ seq_params->profile = av1_read_profile(rb);
+ if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ // Still picture or not
+ seq_params->still_picture = aom_rb_read_bit(rb);
+ seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
+ // Video must have reduced_still_picture_hdr = 0
+ if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ if (seq_params->reduced_still_picture_hdr) {
+ // Reduced header: only the level of a single operating point is read;
+ // every other timing / operating point field is set to a fixed value.
+ seq_params->timing_info_present = 0;
+ seq_params->decoder_model_info_present_flag = 0;
+ seq_params->display_model_info_present_flag = 0;
+ seq_params->operating_points_cnt_minus_1 = 0;
+ seq_params->operating_point_idc[0] = 0;
+ if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ seq_params->tier[0] = 0;
+ seq_params->op_params[0].decoder_model_param_present_flag = 0;
+ seq_params->op_params[0].display_model_param_present_flag = 0;
+ } else {
+ seq_params->timing_info_present = aom_rb_read_bit(rb);
+ if (seq_params->timing_info_present) {
+ av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb);
+
+ seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
+ if (seq_params->decoder_model_info_present_flag)
+ av1_read_decoder_model_info(&seq_params->decoder_model_info, rb);
+ } else {
+ seq_params->decoder_model_info_present_flag = 0;
+ }
+ seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
+ seq_params->operating_points_cnt_minus_1 =
+ aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+ // Per operating point: idc, level, optional tier, then optional decoder
+ // model and display model parameters.
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ seq_params->operating_point_idc[i] =
+ aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+ if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
+ // is equivalent to level 3.3.
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
+ seq_params->tier[i] = aom_rb_read_bit(rb);
+ else
+ seq_params->tier[i] = 0;
+ if (seq_params->decoder_model_info_present_flag) {
+ seq_params->op_params[i].decoder_model_param_present_flag =
+ aom_rb_read_bit(rb);
+ if (seq_params->op_params[i].decoder_model_param_present_flag)
+ av1_read_op_parameters_info(&seq_params->op_params[i],
+ seq_params->decoder_model_info
+ .encoder_decoder_buffer_delay_length,
+ rb);
+ } else {
+ seq_params->op_params[i].decoder_model_param_present_flag = 0;
+ }
+ if (seq_params->timing_info_present &&
+ (seq_params->timing_info.equal_picture_interval ||
+ seq_params->op_params[i].decoder_model_param_present_flag)) {
+ seq_params->op_params[i].bitrate = av1_max_level_bitrate(
+ seq_params->profile, seq_params->seq_level_idx[i],
+ seq_params->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
+ // the check
+ if (seq_params->op_params[i].bitrate == 0)
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of "
+ "profile, level, and tier.");
+ // Buffer size in bits/s is bitrate in bits/s * 1 s
+ seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
+ }
+ if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->op_params[i].decoder_model_param_present_flag) {
+ // When the decoder_model_parameters are not sent for this op, set
+ // the default ones that can be used with the resource availability mode
+ seq_params->op_params[i].decoder_buffer_delay = 70000;
+ seq_params->op_params[i].encoder_buffer_delay = 20000;
+ seq_params->op_params[i].low_delay_mode_flag = 0;
+ }
+
+ if (seq_params->display_model_info_present_flag) {
+ seq_params->op_params[i].display_model_param_present_flag =
+ aom_rb_read_bit(rb);
+ if (seq_params->op_params[i].display_model_param_present_flag) {
+ // The 4-bit field is coded as the delay minus 1.
+ seq_params->op_params[i].initial_display_delay =
+ aom_rb_read_literal(rb, 4) + 1;
+ if (seq_params->op_params[i].initial_display_delay > 10)
+ aom_internal_error(
+ &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support more than 10 decoded frames delay");
+ } else {
+ seq_params->op_params[i].initial_display_delay = 10;
+ }
+ } else {
+ seq_params->op_params[i].display_model_param_present_flag = 0;
+ seq_params->op_params[i].initial_display_delay = 10;
+ }
+ }
+ }
+ // This decoder supports all levels. Choose operating point provided by
+ // external means
+ // An out-of-range request falls back to operating point 0.
+ int operating_point = pbi->operating_point;
+ if (operating_point < 0 ||
+ operating_point > seq_params->operating_points_cnt_minus_1)
+ operating_point = 0;
+ pbi->current_operating_point =
+ seq_params->operating_point_idc[operating_point];
+ if (aom_get_num_layers_from_operating_point_idc(
+ pbi->current_operating_point, &pbi->number_spatial_layers,
+ &pbi->number_temporal_layers) != AOM_CODEC_OK) {
+ pbi->error.error_code = AOM_CODEC_ERROR;
+ return 0;
+ }
+
+ av1_read_sequence_header(cm, rb, seq_params);
+
+ av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error);
+ // Accept only the (subsampling_x, subsampling_y) pairs (0,0), (1,1), (1,0).
+ if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
+ aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
+ "%d %d subsampling is not supported.\n",
+ seq_params->subsampling_x, seq_params->subsampling_y);
+ }
+
+ seq_params->film_grain_params_present = aom_rb_read_bit(rb);
+
+ if (av1_check_trailing_bits(pbi, rb) != 0) {
+ // pbi->error.error_code is already set.
+ return 0;
+ }
+
+ // If a sequence header has been decoded before, we check if the new
+ // one is consistent with the old one.
+ if (pbi->sequence_header_ready) {
+ if (!are_seq_headers_consistent(cm->seq_params, seq_params))
+ pbi->sequence_header_changed = 1;
+ }
+
+ // Commit the fully parsed header.
+ *cm->seq_params = *seq_params;
+ pbi->sequence_header_ready = 1;
+
+ // Bits consumed since entry, rounded up to whole bytes.
+ return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+// Decodes the frame header via av1_decode_frame_headers_and_setup() and
+// returns its size. On failure, aom_internal_error is called and this
+// function does not return. For a show-existing-frame header, '*p_data_end'
+// is moved to the end of the header so that data processing stops there.
+static uint32_t read_frame_header_obu(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      const uint8_t *data,
+                                      const uint8_t **p_data_end,
+                                      int trailing_bits_present) {
+  const uint32_t header_bytes =
+      av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present);
+  if (pbi->common.show_existing_frame) *p_data_end = data + header_bytes;
+  return header_bytes;
+}
+
+// Parses a tile group header and reports the inclusive tile range through
+// 'start_tile' and 'end_tile'. On success, returns the header size in bytes
+// (bits consumed, rounded up) and advances pbi->next_start_tile, wrapping to
+// 0 after the last tile. On failure, calls aom_internal_error() and
+// returns -1.
+static int32_t read_tile_group_header(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      int *start_tile, int *end_tile,
+                                      int tile_start_implicit) {
+  AV1_COMMON *const cm = &pbi->common;
+  CommonTileParams *const tiles = &cm->tiles;
+  const uint32_t first_bit_offset = rb->bit_offset;
+  const int num_tiles = tiles->rows * tiles->cols;
+  int range_is_coded = 0;
+
+  // The presence flag is only read for multi-tile, non-large-scale frames.
+  if (num_tiles > 1 && !tiles->large_scale) {
+    range_is_coded = aom_rb_read_bit(rb);
+    if (tile_start_implicit && range_is_coded) {
+      aom_internal_error(
+          &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
+      return -1;
+    }
+  }
+
+  if (range_is_coded) {
+    const int tile_bits = tiles->log2_rows + tiles->log2_cols;
+    *start_tile = aom_rb_read_literal(rb, tile_bits);
+    *end_tile = aom_rb_read_literal(rb, tile_bits);
+  } else {
+    // Without an explicit range the tile group covers every tile.
+    *start_tile = 0;
+    *end_tile = num_tiles - 1;
+  }
+
+  // Tile groups must arrive in order and stay inside the frame.
+  if (*start_tile != pbi->next_start_tile) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "tg_start (%d) must be equal to %d", *start_tile,
+                       pbi->next_start_tile);
+    return -1;
+  }
+  if (*start_tile > *end_tile) {
+    aom_internal_error(
+        &pbi->error, AOM_CODEC_CORRUPT_FRAME,
+        "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
+        *start_tile);
+    return -1;
+  }
+  if (*end_tile >= num_tiles) {
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+                       "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
+                       num_tiles);
+    return -1;
+  }
+
+  if (*end_tile == num_tiles - 1) {
+    pbi->next_start_tile = 0;
+  } else {
+    pbi->next_start_tile = *end_tile + 1;
+  }
+
+  return ((rb->bit_offset - first_bit_offset + 7) >> 3);
+}
+
+// On success, returns the tile group OBU size (header plus tile payload, in bytes). On failure, returns 0;
+// the error is reported through pbi->error (read_tile_group_header() calls aom_internal_error(&pbi->error, ...)).
+static uint32_t read_one_tile_group_obu(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg,
+ const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end,
+ int *is_last_tg, int tile_start_implicit) {
+ AV1_COMMON *const cm = &pbi->common;
+ int start_tile, end_tile;
+ int32_t header_size, tg_payload_size;
+
+ assert((rb->bit_offset & 7) == 0);  // the reader must be byte aligned on entry
+ assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);  // and positioned exactly at 'data'
+
+ header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
+ tile_start_implicit);
+ if (header_size == -1 || byte_alignment(cm, rb)) return 0;  // header error, or byte_alignment() returned nonzero
+ data += header_size;
+ av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
+ end_tile, is_first_tg);
+
+ tg_payload_size = (uint32_t)(*p_data_end - data);  // presumably *p_data_end was set by the call above to the end of the decoded tile data — confirm
+
+ *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1;  // true when this group ends on the frame's final tile
+ return header_size + tg_payload_size;
+}
+
+static void alloc_tile_list_buffer(AV1Decoder *pbi) {
+ // Allocates pbi->tile_list_outbuf, the frame that receives the decoded tile list. The resolution of the output frame is read out from the bitstream. The data
+ // are stored in the order of Y plane, U plane and V plane. As an example, for
+ // image format 4:2:0, the U plane and the V plane are each 1/4 of the size of
+ // the Y plane.
+ AV1_COMMON *const cm = &pbi->common;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_width_in_pixels = tile_width * MI_SIZE;  // tile size is scaled by MI_SIZE to get pixels
+ const int tile_height_in_pixels = tile_height * MI_SIZE;
+ const int output_frame_width =
+ (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels;
+ const int output_frame_height =
+ (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels;
+ // The output frame is used to store the decoded tile list. The decoded tile
+ // list has to fit into 1 output frame.
+ assert((pbi->tile_count_minus_1 + 1) <=
+ (pbi->output_frame_width_in_tiles_minus_1 + 1) *
+ (pbi->output_frame_height_in_tiles_minus_1 + 1));
+
+ // Allocate the tile list output buffer.
+ // Note: if cm->seq_params->use_highbitdepth is 1 and
+ // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8
+ // bits/pixel.
+ if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width,
+ output_frame_height, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y,
+ (cm->seq_params->use_highbitdepth &&
+ (cm->seq_params->bit_depth > AOM_BITS_8)),  // NOTE(review): presumably the high-bitdepth flag, so 8-bit content gets an 8-bit buffer — confirm against aom_alloc_frame_buffer()
+ 0, cm->features.byte_alignment, 0, 0))
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,  // a nonzero return from the allocator is reported as a memory error
+ "Failed to allocate the tile list output buffer");
+}
+
+static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1,
+ int hend1, int vstart1, int vend1,
+ YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2,
+ int plane) {  // Copies the rectangle [hstart1, hend1) x [vstart1, vend1) of 'plane' from src (16-bit samples) to dst (8-bit samples) at (hstart2, vstart2).
+ const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0];  // planes 1 and 2 both use strides[1]
+ const int dst_stride = (plane > 0) ? dst->strides[1] : dst->strides[0];
+ int row, col;
+
+ assert(src->flags & YV12_FLAG_HIGHBITDEPTH);  // src must be a high-bitdepth buffer
+ assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH));  // dst must not be
+
+ const uint16_t *src16 =
+ CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1);
+ uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2;
+
+ for (row = vstart1; row < vend1; ++row) {
+ for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++);  // the cast keeps the low 8 bits of each sample
+ src16 += src_stride - (hend1 - hstart1);  // advance to the start of the next source row
+ dst8 += dst_stride - (hend1 - hstart1);  // advance to the start of the next destination row
+ }
+ return;
+}
+
+static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
+ int tile_idx) {  // Copies the tile at (pbi->dec_tile_row, pbi->dec_tile_col) of the current frame into slot 'tile_idx' of pbi->tile_list_outbuf.
+ AV1_COMMON *const cm = &pbi->common;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_width_in_pixels = tile_width * MI_SIZE;
+ const int tile_height_in_pixels = tile_height * MI_SIZE;
+ const int ssy = cm->seq_params->subsampling_y;
+ const int ssx = cm->seq_params->subsampling_x;
+ const int num_planes = av1_num_planes(cm);
+
+ YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf;
+ const int tr = tile_idx / (pbi->output_frame_width_in_tiles_minus_1 + 1);  // output slots are numbered row by row across the output frame
+ const int tc = tile_idx % (pbi->output_frame_width_in_tiles_minus_1 + 1);
+ int plane;
+
+ // Copy decoded tile to the tile list output buffer.
+ for (plane = 0; plane < num_planes; ++plane) {
+ const int shift_x = plane > 0 ? ssx : 0;  // only planes 1 and 2 apply the subsampling shifts
+ const int shift_y = plane > 0 ? ssy : 0;
+ const int h = tile_height_in_pixels >> shift_y;
+ const int w = tile_width_in_pixels >> shift_x;
+
+ // src offset (within the current decoded frame)
+ int vstart1 = pbi->dec_tile_row * h;
+ int vend1 = vstart1 + h;
+ int hstart1 = pbi->dec_tile_col * w;
+ int hend1 = hstart1 + w;
+ // dst offset (within the tile list output buffer)
+ int vstart2 = tr * h;
+ int hstart2 = tc * w;
+
+ if (cm->seq_params->use_highbitdepth &&
+ cm->seq_params->bit_depth == AOM_BITS_8) {  // 8-bit content in a high-bitdepth frame: narrow the samples while copying
+ yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1,
+ &pbi->tile_list_outbuf, hstart2, vstart2, plane);
+ } else {
+ switch (plane) {
+ case 0:
+ aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1,
+ &pbi->tile_list_outbuf, hstart2, vstart2);
+ break;
+ case 1:
+ aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1,
+ &pbi->tile_list_outbuf, hstart2, vstart2);
+ break;
+ case 2:
+ aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1,
+ &pbi->tile_list_outbuf, hstart2, vstart2);
+ break;
+ default: assert(0);  // only planes 0, 1 and 2 are handled
+ }
+ }
+ }
+}
+
+// Only called while large_scale_tile = 1.
+//
+// On success, returns the tile list OBU size. On failure, sets
+// pbi->error.error_code and returns 0.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end,
+ int *frame_decoding_finished) {
+ AV1_COMMON *const cm = &pbi->common;
+ uint32_t tile_list_payload_size = 0;
+ const int num_tiles = cm->tiles.cols * cm->tiles.rows;
+ const int start_tile = 0;
+ const int end_tile = num_tiles - 1;
+ int i = 0;
+
+ // Process the tile list info.
+ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+ if (pbi->tile_count_minus_1 > MAX_TILES - 1) {  // reject tile lists with more than MAX_TILES entries
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ // Allocate output frame buffer for the tile list.
+ alloc_tile_list_buffer(pbi);
+
+ uint32_t tile_list_info_bytes = 4;  // the 8 + 8 + 16 bits read above
+ tile_list_payload_size += tile_list_info_bytes;
+ data += tile_list_info_bytes;
+
+ int tile_idx = 0;
+ for (i = 0; i <= pbi->tile_count_minus_1; i++) {
+ // Process 1 tile.
+ // Reset the bit reader.
+ rb->bit_offset = 0;
+ rb->bit_buffer = data;
+
+ // Read out the tile info.
+ uint32_t tile_info_bytes = 5;  // the 8 + 8 + 8 + 16 bits of per-tile info read below
+ // Set reference for each tile.
+ int ref_idx = aom_rb_read_literal(rb, 8);
+ if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1,
+ &pbi->ext_refs.refs[ref_idx]);
+
+ pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
+ pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
+ if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
+ pbi->dec_tile_row >= cm->tiles.rows ||
+ pbi->dec_tile_col >= cm->tiles.cols) {  // the requested tile must lie inside the frame's tile grid
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;  // the coded 16-bit value is the size minus 1
+ data += tile_info_bytes;
+ if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {  // the tile data must fit in what remains of the buffer
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size,
+ p_data_end, start_tile, end_tile, 0);
+ uint32_t tile_payload_size = (uint32_t)(*p_data_end - data);
+
+ tile_list_payload_size += tile_info_bytes + tile_payload_size;
+
+ // Update data ptr for next tile decoding.
+ data = *p_data_end;
+ assert(data <= data_end);
+
+ // Copy the decoded tile to the tile list output buffer.
+ copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx);
+ tile_idx++;
+ }
+
+ *frame_decoding_finished = 1;
+ return tile_list_payload_size;
+}
+
+// Returns the last nonzero byte index in 'data'. If there is no nonzero byte in
+// 'data' (including when 'sz' is 0), returns -1.
+static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) {
+ // Scan backward and return on the first nonzero byte.
+ int i = (int)sz - 1;  // -1 when sz == 0, so the loop is skipped and -1 is returned
+ while (i >= 0 && data[i] == 0) {
+ --i;
+ }
+ return i;
+}
+
+// Allocates metadata that was read and adds it to the decoder's metadata array. Calls aom_internal_error() with AOM_CODEC_MEM_ERROR if an allocation fails.
+static void alloc_read_metadata(AV1Decoder *const pbi,
+ OBU_METADATA_TYPE metadata_type,
+ const uint8_t *data, size_t sz,
+ aom_metadata_insert_flags_t insert_flag) {
+ if (!pbi->metadata) {  // the array is created on first use
+ pbi->metadata = aom_img_metadata_array_alloc(0);
+ if (!pbi->metadata) {
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate metadata array");
+ }
+ }
+ aom_metadata_t *metadata =
+ aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
+ if (!metadata) {
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating metadata");
+ }
+ aom_metadata_t **metadata_array =
+ (aom_metadata_t **)realloc(pbi->metadata->metadata_array,
+ (pbi->metadata->sz + 1) * sizeof(metadata));  // room for one more aom_metadata_t* entry
+ if (!metadata_array) {
+ aom_img_metadata_free(metadata);  // the new entry was never attached, so release it before reporting
+ aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
+ "Error growing metadata array");
+ }
+ pbi->metadata->metadata_array = metadata_array;
+ pbi->metadata->metadata_array[pbi->metadata->sz] = metadata;  // append at the end
+ pbi->metadata->sz++;
+}
+
+// Validates an ITU-T T.35 metadata payload and stores the bytes that precede its trailing 0x80 byte via alloc_read_metadata(). On failure, calls aom_internal_error() and does not return.
+static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
+ size_t sz) {
+ if (sz == 0) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "itu_t_t35_country_code is missing");
+ }
+ int country_code_size = 1;
+ if (*data == 0xFF) {  // a first byte of 0xFF requires a second (extension) byte
+ if (sz == 1) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "itu_t_t35_country_code_extension_byte is missing");
+ }
+ ++country_code_size;
+ }
+ int end_index = get_last_nonzero_byte_index(data, sz);
+ if (end_index < country_code_size) {  // no nonzero byte after the country code bytes (also covers end_index == -1)
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "No trailing bits found in ITU-T T.35 metadata OBU");
+ }
+ // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
+ // itu_t_t35_payload_bytes shall be bytes containing data registered as
+ // specified in Recommendation ITU-T T.35.
+ // Therefore the first trailing byte should be 0x80.
+ if (data[end_index] != 0x80) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "The last nonzero byte of the ITU-T T.35 metadata OBU "
+ "is 0x%02x, should be 0x80.",
+ data[end_index]);
+ }
+ alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index,  // size end_index: bytes [0, end_index), excluding the 0x80 byte
+ AOM_MIF_ANY_FRAME);
+}
+
+// Stores the first kHdrCllPayloadSize bytes of 'data' as HDR CLL metadata. On success, returns the number of bytes read from 'data'. On failure, calls
+// aom_internal_error() and does not return.
+static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data,
+ size_t sz) {
+ const size_t kHdrCllPayloadSize = 4;  // fixed payload size in bytes; any bytes beyond it are not consumed here
+ if (sz < kHdrCllPayloadSize) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Incorrect HDR CLL metadata payload size");
+ }
+ alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
+ AOM_MIF_ANY_FRAME);
+ return kHdrCllPayloadSize;
+}
+
+// Stores the first kMdcvPayloadSize bytes of 'data' as HDR MDCV metadata. On success, returns the number of bytes read from 'data'. On failure, calls
+// aom_internal_error() and does not return.
+static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data,
+ size_t sz) {
+ const size_t kMdcvPayloadSize = 24;  // fixed payload size in bytes; any bytes beyond it are not consumed here
+ if (sz < kMdcvPayloadSize) {
+ aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
+ "Incorrect HDR MDCV metadata payload size");
+ }
+ alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
+ AOM_MIF_ANY_FRAME);
+ return kMdcvPayloadSize;
+}
+
+static void scalability_structure(struct aom_read_bit_buffer *rb) {  // Reads a scalability structure from 'rb'; the values only steer parsing and are otherwise discarded.
+ const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2);
+ const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+ const int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+ const int temporal_group_description_present_flag = aom_rb_read_bit(rb);
+ // scalability_structure_reserved_3bits must be set to zero and be ignored by
+ // decoders.
+ aom_rb_read_literal(rb, 3);
+
+ if (spatial_layer_dimensions_present_flag) {
+ for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
+ aom_rb_read_literal(rb, 16);  // two 16-bit fields per spatial layer (presumably its dimensions — confirm against the spec)
+ aom_rb_read_literal(rb, 16);
+ }
+ }
+ if (spatial_layer_description_present_flag) {
+ for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) {
+ aom_rb_read_literal(rb, 8);  // one 8-bit field per spatial layer
+ }
+ }
+ if (temporal_group_description_present_flag) {
+ const int temporal_group_size = aom_rb_read_literal(rb, 8);
+ for (int i = 0; i < temporal_group_size; i++) {
+ aom_rb_read_literal(rb, 3);
+ aom_rb_read_bit(rb);
+ aom_rb_read_bit(rb);
+ const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
+ for (int j = 0; j < temporal_group_ref_cnt; j++) {
+ aom_rb_read_literal(rb, 8);  // one 8-bit field per reference of this group entry
+ }
+ }
+ }
+}
+
+static void read_metadata_scalability(struct aom_read_bit_buffer *rb) {  // Reads the 8-bit scalability_mode_idc and, for SCALABILITY_SS, the scalability structure that follows.
+ const int scalability_mode_idc = aom_rb_read_literal(rb, 8);
+ if (scalability_mode_idc == SCALABILITY_SS) {  // only this mode carries an explicit structure
+ scalability_structure(rb);
+ }
+}
+
+static void read_metadata_timecode(struct aom_read_bit_buffer *rb) {  // Reads a timecode from 'rb'; the values only steer parsing and are otherwise discarded.
+ aom_rb_read_literal(rb, 5); // counting_type f(5)
+ const int full_timestamp_flag =
+ aom_rb_read_bit(rb); // full_timestamp_flag f(1)
+ aom_rb_read_bit(rb); // discontinuity_flag f(1)
+ aom_rb_read_bit(rb); // cnt_dropped_flag f(1)
+ aom_rb_read_literal(rb, 9); // n_frames f(9)
+ if (full_timestamp_flag) {
+ aom_rb_read_literal(rb, 6); // seconds_value f(6)
+ aom_rb_read_literal(rb, 6); // minutes_value f(6)
+ aom_rb_read_literal(rb, 5); // hours_value f(5)
+ } else {
+ const int seconds_flag = aom_rb_read_bit(rb); // seconds_flag f(1)
+ if (seconds_flag) {
+ aom_rb_read_literal(rb, 6); // seconds_value f(6)
+ const int minutes_flag = aom_rb_read_bit(rb); // minutes_flag f(1)
+ if (minutes_flag) {
+ aom_rb_read_literal(rb, 6); // minutes_value f(6)
+ const int hours_flag = aom_rb_read_bit(rb); // hours_flag f(1)
+ if (hours_flag) {
+ aom_rb_read_literal(rb, 5); // hours_value f(5)
+ }
+ }
+ }
+ }
+ // time_offset_length f(5)
+ const int time_offset_length = aom_rb_read_literal(rb, 5);
+ if (time_offset_length) {
+ // time_offset_value f(time_offset_length); absent when the length is 0
+ aom_rb_read_literal(rb, time_offset_length);
+ }
+}
+
+// Returns the last nonzero byte in 'data'. If there is no nonzero byte in
+// 'data' (including when 'sz' is 0), returns 0.
+//
+// Call this function to check the following requirement in the spec:
+// This implies that when any payload data is present for this OBU type, at
+// least one byte of the payload data (including the trailing bit) shall not
+// be equal to 0.
+static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) {
+ // Scan backward and return on the first nonzero byte.
+ size_t i = sz;  // unsigned index: decremented before use instead of testing i >= 0
+ while (i != 0) {
+ --i;
+ if (data[i] != 0) return data[i];
+ }
+ return 0;
+}
+
+// Checks a metadata OBU payload for correct syntax. ITU-T T.35, HDR CLL and HDR MDCV payloads are stored through their read_metadata_*() helpers; all other types are parsed or skipped and discarded.
+//
+// On success, returns the number of bytes read from 'data'. On failure, sets
+// pbi->error.error_code and returns 0, or calls aom_internal_error()
+// and does not return.
+static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
+ size_t type_length;
+ uint64_t type_value;
+ if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {  // the metadata type is coded first, as a leb128 value
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
+ if (metadata_type == 0 || metadata_type >= 6) {
+ // If metadata_type is reserved for future use or a user private value,
+ // ignore the entire OBU and just check trailing bits.
+ if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ return sz;
+ }
+ if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
+ // read_metadata_itut_t35() checks trailing bits.
+ read_metadata_itut_t35(pbi, data + type_length, sz - type_length);
+ return sz;
+ } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
+ size_t bytes_read =
+ type_length +
+ read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
+ if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {  // what follows the payload must be exactly the 0x80 trailing byte plus zero bytes
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ return sz;
+ } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
+ size_t bytes_read =
+ type_length +
+ read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
+ if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {  // same trailing-byte check as for HDR CLL
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ return sz;
+ }
+
+ struct aom_read_bit_buffer rb;
+ av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz);  // the remaining types are parsed bit by bit
+ if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
+ read_metadata_scalability(&rb);
+ } else {
+ assert(metadata_type == OBU_METADATA_TYPE_TIMECODE);
+ read_metadata_timecode(&rb);
+ }
+ if (av1_check_trailing_bits(pbi, &rb) != 0) {
+ // pbi->error.error_code is already set.
+ return 0;
+ }
+ assert((rb.bit_offset & 7) == 0);
+ return type_length + (rb.bit_offset >> 3);  // bytes consumed: the type field plus the byte-aligned bits read
+}
+
+// On success, returns 'sz'. On failure, sets cm->error->error_code and
+// returns 0.
+static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data,
+ size_t sz) {
+ // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So
+ // check trailing bits only if sz > 0.
+ if (sz > 0) {
+ // The payload of a padding OBU is byte aligned. Therefore the first
+ // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+ const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
+ if (last_nonzero_byte != 0x80) {  // also rejects an all-zero payload, for which the helper returns 0
+ cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ }
+ return sz;
+}
+
+// Decodes OBUs from [data, data_end) until one frame is finished or the data runs out. On success, returns a boolean that indicates whether the decoding of the
+// current frame is finished. On failure, sets pbi->error.error_code and
+// returns -1.
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ int frame_decoding_finished = 0;
+ int is_first_tg_obu_received = 1;  // cleared once the first tile group OBU has been decoded
+ // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the
+ // beginning of the frame_header_obu and frame_header_size is set to its
+ // size. This allows us to check if a redundant frame_header_obu is a copy
+ // of the previous frame_header_obu.
+ //
+ // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang
+ // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is
+ // passed as an argument to a 'nonnull' parameter of memcmp(). The initial
+ // value will not be used.
+ const uint8_t *frame_header = data;
+ uint32_t frame_header_size = 0;
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ pbi->seen_frame_header = 0;
+ pbi->next_start_tile = 0;
+ pbi->num_tile_groups = 0;
+
+ if (data_end < data) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0.
+ if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
+
+ // decode frame as a series of OBUs
+ while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) {
+ struct aom_read_bit_buffer rb;
+ size_t payload_size = 0;
+ size_t decoded_payload_size = 0;
+ size_t obu_payload_offset = 0;
+ size_t bytes_read = 0;
+ const size_t bytes_available = data_end - data;
+
+ if (bytes_available == 0 && !pbi->seen_frame_header) {  // out of data with no frame in progress: stop without error
+ *p_data_end = data;
+ pbi->error.error_code = AOM_CODEC_OK;
+ break;
+ }
+
+ aom_codec_err_t status =
+ aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb,
+ &obu_header, &payload_size, &bytes_read);
+
+ if (status != AOM_CODEC_OK) {
+ pbi->error.error_code = status;
+ return -1;
+ }
+
+ // Record obu size header information.
+ pbi->obu_size_hdr.data = data + obu_header.size;
+ pbi->obu_size_hdr.size = bytes_read - obu_header.size;
+
+ // Note: aom_read_obu_header_and_size() takes care of checking that this
+ // doesn't cause 'data' to advance past 'data_end'.
+ data += bytes_read;
+
+ if ((size_t)(data_end - data) < payload_size) {  // the signalled payload must fit in the remaining data
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ cm->temporal_layer_id = obu_header.temporal_layer_id;
+ cm->spatial_layer_id = obu_header.spatial_layer_id;
+
+ if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+ obu_header.type != OBU_SEQUENCE_HEADER) {
+ // don't decode obu if it's not in current operating mode
+ if (!is_obu_in_current_operating_point(pbi, &obu_header)) {
+ data += payload_size;  // skip the whole OBU
+ continue;
+ }
+ }
+
+ av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
+
+ switch (obu_header.type) {
+ case OBU_TEMPORAL_DELIMITER:
+ decoded_payload_size = read_temporal_delimiter_obu();
+ if (pbi->seen_frame_header) {
+ // A new temporal unit has started, but the frame in the previous
+ // temporal unit is incomplete.
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ break;
+ case OBU_SEQUENCE_HEADER:
+ decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ // The sequence header should not change in the middle of a frame.
+ if (pbi->sequence_header_changed && pbi->seen_frame_header) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ break;
+ case OBU_FRAME_HEADER:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_FRAME:
+ if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
+ if (!pbi->seen_frame_header) {  // a redundant header needs an original header before it
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ } else {
+ // OBU_FRAME_HEADER or OBU_FRAME.
+ if (pbi->seen_frame_header) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ }
+ // Only decode first frame header received
+ if (!pbi->seen_frame_header ||
+ (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) {
+ frame_header_size = read_frame_header_obu(
+ pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+ frame_header = data;
+ pbi->seen_frame_header = 1;
+ if (!pbi->ext_tile_debug && cm->tiles.large_scale)
+ pbi->camera_frame_header_ready = 1;
+ } else {
+ // Verify that the frame_header_obu is identical to the original
+ // frame_header_obu.
+ if (frame_header_size > payload_size ||
+ memcmp(data, frame_header, frame_header_size) != 0) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ assert(rb.bit_offset == 0);
+ rb.bit_offset = 8 * frame_header_size;  // skip over the already-verified copy of the header
+ }
+
+ decoded_payload_size = frame_header_size;
+ pbi->frame_header_size = frame_header_size;
+ cm->cur_frame->temporal_id = obu_header.temporal_layer_id;
+ cm->cur_frame->spatial_id = obu_header.spatial_layer_id;
+
+ if (cm->show_existing_frame) {
+ if (obu_header.type == OBU_FRAME) {  // show_existing_frame is rejected inside an OBU_FRAME
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+ frame_decoding_finished = 1;
+ pbi->seen_frame_header = 0;
+
+ if (cm->show_frame &&
+ !cm->seq_params->order_hint_info.enable_order_hint) {
+ ++cm->current_frame.frame_number;
+ }
+ break;
+ }
+
+ // In large scale tile coding, decode the common camera frame header
+ // before any tile list OBU.
+ if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
+ frame_decoding_finished = 1;
+ // Skip the rest of the frame data.
+ decoded_payload_size = payload_size;
+ // Update *p_data_end.
+ *p_data_end = data_end;
+ break;
+ }
+
+ if (obu_header.type != OBU_FRAME) break;
+ obu_payload_offset = frame_header_size;  // in an OBU_FRAME the tile group follows the frame header
+ // Byte align the reader before reading the tile group.
+ // byte_alignment() has set pbi->error.error_code if it returns -1.
+ if (byte_alignment(cm, &rb)) return -1;
+ AOM_FALLTHROUGH_INTENDED; // fall through to read tile group.
+ case OBU_TILE_GROUP:
+ if (!pbi->seen_frame_header) {  // tile data without a preceding frame header is invalid
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ if (obu_payload_offset > payload_size) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ decoded_payload_size += read_one_tile_group_obu(
+ pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
+ data + payload_size, p_data_end, &frame_decoding_finished,
+ obu_header.type == OBU_FRAME);
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ is_first_tg_obu_received = 0;
+ if (frame_decoding_finished) {  // last tile group decoded: reset the per-frame state
+ pbi->seen_frame_header = 0;
+ pbi->next_start_tile = 0;
+ }
+ pbi->num_tile_groups++;
+ break;
+ case OBU_METADATA:
+ decoded_payload_size = read_metadata(pbi, data, payload_size);
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ break;
+ case OBU_TILE_LIST:
+ if (CONFIG_NORMAL_TILE_MODE) {
+ pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+
+ // This OBU type is purely for the large scale tile coding mode.
+ // The common camera frame header has to be already decoded.
+ if (!pbi->camera_frame_header_ready) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ cm->tiles.large_scale = 1;
+ av1_set_single_tile_decoding_mode(cm);
+ decoded_payload_size =
+ read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
+ p_data_end, &frame_decoding_finished);
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ break;
+ case OBU_PADDING:
+ decoded_payload_size = read_padding(cm, data, payload_size);
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ break;
+ default:
+ // Skip unrecognized OBUs
+ if (payload_size > 0 &&
+ get_last_nonzero_byte(data, payload_size) == 0) {  // a non-empty payload must contain at least one nonzero byte
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ decoded_payload_size = payload_size;
+ break;
+ }
+
+ // Check that the signalled OBU size matches the actual amount of data read
+ if (decoded_payload_size > payload_size) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ // If there are extra padding bytes, they should all be zero
+ while (decoded_payload_size < payload_size) {
+ uint8_t padding_byte = data[decoded_payload_size++];
+ if (padding_byte != 0) {
+ pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ }
+
+ data += payload_size;  // advance to the next OBU
+ }
+
+ if (pbi->error.error_code != AOM_CODEC_OK) return -1;
+ return frame_decoding_finished;
+}
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
new file mode 100644
index 0000000000..d8ebe368e6
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
+
+#include "aom/aom_codec.h"
+#include "av1/decoder/decoder.h"
+
+// Try to decode one frame from the OBUs in the buffer [data, data_end).
+// Returns 1 if we decoded a frame,
+// 0 if we didn't decode a frame but that's okay
+// (e.g., if there was a frame but we skipped it),
+// or -1 on error (pbi->error.error_code is then set).
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end);
+
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+ int operating_point_idc, unsigned int *number_spatial_layers,
+ unsigned int *number_temporal_layers);  // NOTE(review): defined outside this chunk; presumably writes the layer counts implied by operating_point_idc — confirm
+
+#endif // AOM_AV1_DECODER_OBU_H_
diff --git a/third_party/aom/av1/encoder/allintra_vis.c b/third_party/aom/av1/encoder/allintra_vis.c
new file mode 100644
index 0000000000..8dcef5fc85
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.c
@@ -0,0 +1,1055 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_TFLITE
+#include "tensorflow/lite/c/c_api.h"
+#include "av1/encoder/deltaq4_model.c"
+#endif
+
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
+#define MB_WIENER_PRED_BUF_STRIDE 128
+
+// Allocates td->wiener_tmp_pred_buf, the 32-byte aligned scratch buffer that
+// receives intra predictions of MB_WIENER_PRED_BLOCK_SIZE. The byte size is
+// doubled when the current buffer is high bitdepth. Allocation failure is
+// reported through CHECK_MEM_ERROR on `cm`.
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
+  assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
+  const int pred_w = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
+  const int pred_h = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
+  // The row stride used by the consumers is a compile-time constant and must
+  // agree with the block width.
+  assert(pred_w == MB_WIENER_PRED_BUF_STRIDE);
+
+  const int hbd_shift = is_cur_buf_hbd(&td->mb.e_mbd);
+  size_t num_bytes = pred_w * pred_h * sizeof(*td->wiener_tmp_pred_buf);
+  num_bytes <<= hbd_shift;
+  CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, num_bytes));
+}
+
+// Frees the prediction scratch buffer held in td->wiener_tmp_pred_buf and
+// resets the pointer to NULL so it does not dangle.
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
+ aom_free(td->wiener_tmp_pred_buf);
+ td->wiener_tmp_pred_buf = NULL;
+}
+
+// Selects the block size used for the Weber statistics and lazily allocates
+// the per-frame arrays that depend on it:
+//   - cpi->mb_weber_stats        (always),
+//   - cpi->prep_rate_estimates   (only with enable_rate_guide_deltaq),
+//   - cpi->ext_rate_distribution (only with enable_rate_guide_deltaq).
+// Each array holds mi_rows * mi_cols zero-initialised entries. A buffer that
+// already exists is kept as is; only the missing ones are allocated, so a
+// partially initialised state (e.g. rate guidance enabled after the Weber
+// stats were created) no longer overwrites, and thereby leaks, a live pointer.
+// Allocation failures are reported through CHECK_MEM_ERROR.
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  // This block size is also used to determine number of workers in
+  // multi-threading. If it is changed, one needs to change it accordingly in
+  // "compute_num_ai_workers()".
+  cpi->weber_bsize = BLOCK_8X8;
+
+  if (cpi->mb_weber_stats == NULL) {
+    CHECK_MEM_ERROR(
+        cm, cpi->mb_weber_stats,
+        aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+                   sizeof(*cpi->mb_weber_stats)));
+  }
+
+  // The remaining buffers are only needed for rate guided delta q.
+  if (!cpi->oxcf.enable_rate_guide_deltaq) return;
+
+  if (cpi->prep_rate_estimates == NULL) {
+    CHECK_MEM_ERROR(
+        cm, cpi->prep_rate_estimates,
+        aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+                   sizeof(*cpi->prep_rate_estimates)));
+  }
+
+  if (cpi->ext_rate_distribution == NULL) {
+    CHECK_MEM_ERROR(
+        cm, cpi->ext_rate_distribution,
+        aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+                   sizeof(*cpi->ext_rate_distribution)));
+  }
+}
+
+// Returns the mean SATD of the Weber-stat blocks covered by the bsize window
+// at (mi_row, mi_col). Only the part of the window inside the frame
+// contributes; the result is never smaller than 1.
+static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                        int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int step = mi_size_wide[cpi->weber_bsize];
+  const int stride = cpi->frame_info.mi_cols;
+  // Clip the window to the frame once instead of testing every position.
+  const int row_end =
+      AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+  const int col_end =
+      AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+
+  int64_t sum = 0;
+  int num_blocks = 0;
+  for (int r = mi_row; r < row_end; r += step) {
+    for (int c = mi_col; c < col_end; c += step) {
+      sum += cpi->mb_weber_stats[(r / step) * stride + (c / step)].satd;
+      ++num_blocks;
+    }
+  }
+
+  if (num_blocks > 0) sum = (int)(sum / num_blocks);
+  if (sum < 1) sum = 1;
+
+  return (int)sum;
+}
+
+// Returns the mean distortion of the Weber-stat blocks covered by the bsize
+// window at (mi_row, mi_col). Only the part of the window inside the frame
+// contributes; the result is never smaller than 1.
+static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int step = mi_size_wide[cpi->weber_bsize];
+  const int stride = cpi->frame_info.mi_cols;
+  // Clip the window to the frame once instead of testing every position.
+  const int row_end =
+      AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+  const int col_end =
+      AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+
+  int64_t sum = 0;
+  int num_blocks = 0;
+  for (int r = mi_row; r < row_end; r += step) {
+    for (int c = mi_col; c < col_end; c += step) {
+      sum += cpi->mb_weber_stats[(r / step) * stride + (c / step)].distortion;
+      ++num_blocks;
+    }
+  }
+
+  if (num_blocks > 0) sum = (int)(sum / num_blocks);
+  if (sum < 1) sum = 1;
+
+  return (int)sum;
+}
+
+// Returns the smallest max_scale among the in-frame Weber-stat blocks of the
+// bsize window at (mi_row, mi_col). Blocks whose max_scale is below 1.0 are
+// ignored, and the result starts from (and never exceeds) 10.0.
+static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int step = mi_size_wide[cpi->weber_bsize];
+  const int stride = cpi->frame_info.mi_cols;
+  const int row_end =
+      AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+  const int col_end =
+      AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+
+  double smallest = 10.0;
+  for (int r = mi_row; r < row_end; r += step) {
+    for (int c = mi_col; c < col_end; c += step) {
+      const WeberStats *const stats =
+          &cpi->mb_weber_stats[(r / step) * stride + (c / step)];
+      if (stats->max_scale >= 1.0 && stats->max_scale < smallest) {
+        smallest = stats->max_scale;
+      }
+    }
+  }
+  return smallest;
+}
+
+// Computes the Wiener-variance estimate of the bsize window at
+// (mi_row, mi_col) from the per-block Weber statistics.
+//
+// For every weber_bsize block of the window that lies inside the frame, three
+// sums (each seeded with 1) are accumulated:
+//   base_num: distortion * sqrt(src_variance) * rec_pix_max
+//   base_den: |rec_pix_max * sqrt(src_variance) -
+//              src_pix_max * sqrt(rec_variance)|
+//   base_reg: sqrt(distortion) * sqrt(src_pix_max) * 0.1
+// The result is ((base_num + base_reg) / (base_den + base_reg)) / block count,
+// truncated to int and clamped to at least 1.
+//
+// If no block of the window lies inside the frame the function returns 1 (the
+// lower clamp) instead of dividing by a zero block count.
+static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int sb_wiener_var = 0;
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  double base_num = 1;
+  double base_den = 1;
+  double base_reg = 1;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+
+      base_num += ((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_variance) *
+                  weber_stats->rec_pix_max;
+
+      base_den += fabs(
+          weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) -
+          weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance));
+
+      base_reg += sqrt((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_pix_max) * 0.1;
+      ++mb_count;
+    }
+  }
+
+  // Window entirely outside the frame: dividing by mb_count would yield an
+  // infinite value whose conversion to int is undefined.
+  if (mb_count == 0) return 1;
+
+  sb_wiener_var =
+      (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count);
+  sb_wiener_var = AOMMAX(1, sb_wiener_var);
+
+  return (int)sb_wiener_var;
+}
+
+// Returns the minimum Wiener variance over the bsize window at
+// (mi_row, mi_col) and up to four windows shifted by half a block
+// up / down / left / right. A shifted window is only considered when the
+// corresponding bounds test on mi_row / mi_col passes.
+static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int half_w = mi_wide / 2;
+  const int half_h = mi_high / 2;
+
+  int min_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col);
+  int candidate;
+
+  // Window shifted up.
+  if (mi_row >= half_h) {
+    candidate = get_window_wiener_var(cpi, bsize, mi_row - half_h, mi_col);
+    if (candidate < min_var) min_var = candidate;
+  }
+  // Window shifted down.
+  if (mi_row <= (cm->mi_params.mi_rows - mi_high - half_h)) {
+    candidate = get_window_wiener_var(cpi, bsize, mi_row + half_h, mi_col);
+    if (candidate < min_var) min_var = candidate;
+  }
+  // Window shifted left.
+  if (mi_col >= half_w) {
+    candidate = get_window_wiener_var(cpi, bsize, mi_row, mi_col - half_w);
+    if (candidate < min_var) min_var = candidate;
+  }
+  // Window shifted right.
+  if (mi_col <= (cm->mi_params.mi_cols - mi_wide - half_w)) {
+    candidate = get_window_wiener_var(cpi, bsize, mi_row, mi_col + half_w);
+    if (candidate < min_var) min_var = candidate;
+  }
+
+  return min_var;
+}
+
+// Rough rate estimate for the first `eob` quantized coefficients of a
+// transform block, visited in DCT_DCT scan order. Starting from 1, every
+// coefficient adds floor(log2(1 + |level|)) + 1, plus one more when the level
+// is non-zero. The total is returned scaled by 1 << AV1_PROB_COST_SHIFT.
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+  const SCAN_ORDER *const order = &av1_scan_orders[tx_size][DCT_DCT];
+
+  int cost = 1;
+  int pos = 0;
+  while (pos < eob) {
+    const int level = abs(qcoeff[order->scan[pos]]);
+    cost += (int)(log1p(level) / log(2.0)) + 1;
+    if (level > 0) ++cost;
+    ++pos;
+  }
+
+  return (cost << AV1_PROB_COST_SHIFT);
+}
+
+// Computes the Weber statistics for one row of weber_bsize blocks.
+//
+// For every block of the row starting at mi_row the function
+//  1. tries every intra mode in [INTRA_MODE_START, INTRA_MODE_END), predicting
+//     from the source frame, and keeps the mode with the lowest SATD of the
+//     transformed residual,
+//  2. re-predicts with that mode, transforms and quantizes the residual,
+//     optionally stores a rate estimate (enable_rate_guide_deltaq) and
+//     reconstructs the block into the prediction buffer,
+//  3. gathers source/reconstruction pixel statistics into the block's
+//     WeberStats entry.
+//
+// src_diff, coeff, qcoeff and dqcoeff are caller-provided scratch buffers and
+// pred_buffer is the prediction/reconstruction scratch buffer (row stride
+// MB_WIENER_PRED_BUF_STRIDE). *sum_rec_distortion and *sum_est_rate are only
+// accumulated when auto_intra_tools_off is set. xd->mi is pointed at a local
+// MB_MODE_INFO for the duration of the call and reset to NULL on exit.
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer) {
+ AV1_COMMON *const cm = &cpi->common;
+ uint8_t *buffer = cpi->source->y_buffer;
+ int buf_stride = cpi->source->y_stride;
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int block_size = tx_size_wide[tx_size];
+ const int coeff_count = block_size * block_size;
+ const int mb_step = mi_size_wide[bsize];
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ const int mi_cols = cm->mi_params.mi_cols;
+ // Index of this block row; passed to the row-sync callbacks below.
+ const int mt_thread_id = mi_row / mb_step;
+ // TODO(chengchen): test different unit step size
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+ int mt_unit_col = 0;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+
+ uint8_t *dst_buffer = pred_buffer;
+ const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE;
+
+ // For high bitdepth the scratch buffer holds 16-bit samples and is passed
+ // around through the CONVERT_TO_BYTEPTR pointer representation.
+ if (is_high_bitdepth) {
+ uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer;
+ dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16);
+ }
+
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) {
+ // At the start of every sync unit, call the row-sync read callback and, in
+ // multi-threaded builds, check whether another worker requested an exit.
+ if (mi_col % mt_unit_step == 0) {
+ intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col);
+#if CONFIG_MULTITHREAD
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ if (num_workers > 1) {
+ const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool exit = enc_row_mt->mb_wiener_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Stop further processing in case any worker has encountered an error.
+ if (exit) break;
+ }
+#endif
+ }
+
+ PREDICTION_MODE best_mode = DC_PRED;
+ int best_intra_cost = INT_MAX;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows),
+ AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols));
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+ // Set above and left mbmi to NULL as they are not available in the
+ // preprocessing stage.
+ // They are used to determine intra edge filter types in intra prediction.
+ if (xd->up_available) {
+ xd->above_mbmi = NULL;
+ }
+ if (xd->left_available) {
+ xd->left_mbmi = NULL;
+ }
+ // Top-left source pixel of the current block (MI_SIZE pixels per mi unit).
+ uint8_t *mb_buffer =
+ buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+ // Mode search: keep the intra mode with the lowest SATD.
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+ ++mode) {
+ // TODO(chengchen): Here we use src instead of reconstructed frame as
+ // the intra predictor to make single and multithread version match.
+ // Ideally we want to use the reconstructed.
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+ int intra_cost = aom_satd(coeff, coeff_count);
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+
+ // Redo prediction, residual and transform with the winning mode so that
+ // dst_buffer and coeff correspond to best_mode.
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ uint16_t eob;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ QUANT_PARAM quant_param;
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ }
+#else
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order,
+ &quant_param);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ const int rate_cost = rate_estimator(qcoeff, eob, tx_size);
+ cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)] = rate_cost;
+ }
+
+ // Add the dequantized residual to the prediction held in dst_buffer; the
+ // pixel loop below reads the result as the reconstruction.
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+ dst_buffer_stride, eob, 0);
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+
+ weber_stats->rec_pix_max = 1;
+ weber_stats->rec_variance = 0;
+ weber_stats->src_pix_max = 1;
+ weber_stats->src_variance = 0;
+ weber_stats->distortion = 0;
+
+ int64_t src_mean = 0;
+ int64_t rec_mean = 0;
+ int64_t dist_mean = 0;
+
+ // Accumulate plain sums, sums of squares and maxima over the block. The
+ // *_mean variables hold sums here; the mean terms are removed further down.
+ for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+ for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+ int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+ uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+ src_pix = src[pix_row * buf_stride + pix_col];
+ rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+ } else {
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+ }
+#else
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+ src_mean += src_pix;
+ rec_mean += rec_pix;
+ dist_mean += src_pix - rec_pix;
+ weber_stats->src_variance += src_pix * src_pix;
+ weber_stats->rec_variance += rec_pix * rec_pix;
+ weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+ weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+ weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
+ }
+ }
+
+ // Note: this uses the raw sum of squared errors, i.e. before the mean
+ // term is subtracted from weber_stats->distortion below.
+ if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ *sum_rec_distortion += weber_stats->distortion;
+ int est_block_rate = 0;
+ int64_t est_block_dist = 0;
+ model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
+ pix_num, &est_block_rate,
+ &est_block_dist);
+ *sum_est_rate += est_block_rate;
+ }
+
+ // sum(x^2) - sum(x)^2 / N: turns each sum of squares into a sum of squared
+ // deviations from the mean (not divided by the pixel count).
+ weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+ weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+ weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+ weber_stats->satd = best_intra_cost;
+
+ // max_scale is the largest magnitude among qcoeff[1 .. coeff_count - 1];
+ // entry 0 is cleared and excluded from the search.
+ qcoeff[0] = 0;
+ int max_scale = 0;
+ for (int idx = 1; idx < coeff_count; ++idx) {
+ const int abs_qcoeff = abs(qcoeff[idx]);
+ max_scale = AOMMAX(max_scale, abs_qcoeff);
+ }
+ weber_stats->max_scale = max_scale;
+
+ // Signal completion of a sync unit at its last block or at the row end.
+ if ((mi_col + mb_step) % mt_unit_step == 0 ||
+ (mi_col + mb_step) >= mi_cols) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col, mt_unit_cols);
+ ++mt_unit_col;
+ }
+ }
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+}
+
+// Single-threaded driver: runs av1_calc_mb_wiener_var_row() on every row of
+// weber_bsize blocks in the frame, using stack scratch buffers and the
+// prediction buffer owned by cpi->td. The two sums are forwarded unchanged to
+// the row function.
+static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
+                               double *sum_est_rate) {
+  // Scratch space for residual, transform and (de)quantized coefficients.
+  DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+
+  MACROBLOCK *const mb = &cpi->td.mb;
+  const int row_step = mi_size_wide[cpi->weber_bsize];
+
+  int mi_row = 0;
+  while (mi_row < cpi->frame_info.mi_rows) {
+    av1_calc_mb_wiener_var_row(cpi, mb, &mb->e_mbd, mi_row, src_diff, coeff,
+                               qcoeff, dqcoeff, sum_rec_distortion,
+                               sum_est_rate, cpi->td.wiener_tmp_pred_buf);
+    mi_row += row_step;
+  }
+}
+
+// Estimates the frame-level normalisation factor for the Wiener variance.
+//
+// The frame is walked in steps of norm_block_size. For every block the
+// perceptual Wiener variance, the mean SATD and the mean SSE are fetched, and
+// log(variance) is accumulated with weight satd / sqrt(sse). The result is
+// exp(weighted mean of the logs), i.e. a weighted geometric mean, truncated to
+// int64_t and clamped to at least 1. If the total weight is not positive the
+// factor stays at 1.
+static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi,
+ const BLOCK_SIZE norm_block_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int64_t norm_factor = 1;
+ assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128);
+ const int norm_step = mi_size_wide[norm_block_size];
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+ const int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ // get_satd() and get_sse() both return at least 1, so the weight is
+ // finite and positive.
+ const double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+ if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count));
+ norm_factor = AOMMAX(1, norm_factor);
+
+ return norm_factor;
+}
+
+// Disables the smooth, paeth, CfL and diagonal intra tools when the frame
+// looks like a high-quality encode: base qindex below 128, estimated bits per
+// pixel above 2.0 and reconstruction distortion per pixel below 4.0.
+// Does nothing unless auto_intra_tools_off is enabled.
+//
+// sum_rec_distortion and sum_est_rate are the frame totals accumulated by the
+// Weber-statistics pass.
+static void automatic_intra_tools_off(AV1_COMP *cpi,
+                                      const double sum_rec_distortion,
+                                      const double sum_est_rate) {
+  if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return;
+
+  // Thresholds
+  const int high_quality_qindex = 128;
+  const double high_quality_bpp = 2.0;
+  const double high_quality_dist_per_pix = 4.0;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int qindex = cm->quant_params.base_qindex;
+  // Compute the pixel count in double precision: width * height * 100 does
+  // not fit in an int for very large frames (e.g. 7680x4320).
+  const double num_pixels = (double)cm->width * (double)cm->height;
+  const double dist_per_pix = sum_rec_distortion / num_pixels;
+  // The estimate bpp is not accurate, an empirical constant 100 is divided.
+  const double estimate_bpp = sum_est_rate / (num_pixels * 100.0);
+
+  if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp &&
+      dist_per_pix < high_quality_dist_per_pix) {
+    cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0;
+    cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0;
+  }
+}
+
+// Reads the external rate distribution (one float per 16x16 block, in raster
+// order) from the text file named by cpi->oxcf.rate_distribution_info into
+// cpi->ext_rate_distribution, and sets cpi->ext_rate_scale to the ratio
+//   (sum of the encoder's own rate estimates) / (sum of the external rates).
+//
+// On any failure (no file name, file cannot be opened, too few values, or a
+// non-positive external rate sum) the function asserts in debug builds and
+// returns early, leaving cpi->ext_rate_scale untouched.
+static void ext_rate_guided_quantization(AV1_COMP *cpi) {
+  // Calculation uses 8x8.
+  const int mb_step = mi_size_wide[cpi->weber_bsize];
+  // Accumulate to 16x16, step size is in the unit of mi.
+  const int block_step = 4;
+
+  const char *filename = cpi->oxcf.rate_distribution_info;
+  // Passing a null file name to fopen() is undefined behaviour.
+  FILE *pfile = (filename != NULL) ? fopen(filename, "r") : NULL;
+  if (pfile == NULL) {
+    assert(pfile != NULL);
+    return;
+  }
+
+  double ext_rate_sum = 0.0;
+  for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+    for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+      float val;
+      const int fields_converted = fscanf(pfile, "%f", &val);
+      if (fields_converted != 1) {
+        assert(fields_converted == 1);
+        fclose(pfile);
+        return;
+      }
+      ext_rate_sum += val;
+      cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols +
+                                 (col / mb_step)] = val;
+    }
+  }
+  fclose(pfile);
+
+  // 64-bit accumulator: the per-block estimates are scaled by
+  // 1 << AV1_PROB_COST_SHIFT, so their sum over a large frame can exceed
+  // INT_MAX.
+  int64_t uniform_rate_sum = 0;
+  for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+    for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+      int rate_sum = 0;
+      for (int r = 0; r < block_step; r += mb_step) {
+        for (int c = 0; c < block_step; c += mb_step) {
+          const int mi_row = row + r;
+          const int mi_col = col + c;
+          rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) *
+                                                   cpi->frame_info.mi_cols +
+                                               (mi_col / mb_step)];
+        }
+      }
+      uniform_rate_sum += rate_sum;
+    }
+  }
+
+  // A zero (or otherwise non-positive / NaN) external sum would turn the
+  // scale into inf or NaN and poison every later delta q computation.
+  if (!(ext_rate_sum > 0.0)) {
+    assert(ext_rate_sum > 0.0);
+    return;
+  }
+
+  const double scale = (double)uniform_rate_sum / ext_rate_sum;
+  cpi->ext_rate_scale = scale;
+}
+
+// Frame-level entry point of the Wiener-variance analysis.
+//
+// Steps, in order:
+//  1. (Re)allocate the current frame buffer and the prediction scratch buffer.
+//  2. Point xd->mi at a local MB_MODE_INFO, set base_qindex to the configured
+//     cq_level and initialise the quantizer.
+//  3. Compute the Weber statistics for the whole frame, multi-threaded when
+//     more than one worker is available and auto_intra_tools_off is not set.
+//  4. Optionally disable some intra tools and read the external rate
+//     distribution.
+//  5. Estimate cpi->norm_wiener_variance and refine it in two passes.
+//  6. Reset xd->mi and free the buffers allocated in step 1.
+void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
+ cpi->norm_wiener_variance = 0;
+
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ // xd->mi needs to be setup since it is used in av1_frame_init_quantizer.
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+ av1_frame_init_quantizer(cpi);
+
+ double sum_rec_distortion = 0.0;
+ double sum_est_rate = 0.0;
+
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ // Default to the dummy sync callbacks; the real ones are installed only on
+ // the multi-threaded path below.
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy;
+ // Calculate differential contrast for each block for the entire image.
+ // TODO(chengchen): properly accumulate the distortion and rate in
+ // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if
+ // auto_intra_tools_off is true.
+ if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write;
+ av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion,
+ &sum_est_rate);
+ } else {
+ calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate);
+ }
+
+ // Determine whether to turn off several intra coding tools.
+ automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate);
+
+ // Read external rate distribution and use it to guide delta quantization
+ if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi);
+
+ const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size;
+ cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size);
+ const int norm_step = mi_size_wide[norm_block_size];
+
+ // Two refinement passes: recompute the weighted geometric mean using, for
+ // every superblock, the variance implied by beta clamped to [0.25, 4].
+ // Superblocks with beta < 1 / min_max_scale are left out of the mean.
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
+ sb_wiener_log = 0;
+ sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += norm_step) {
+ int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+
+ double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+ double min_max_scale = AOMMAX(
+ 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+
+ beta = AOMMIN(beta, 4);
+ beta = AOMMAX(beta, 0.25);
+
+ if (beta < 1 / min_max_scale) continue;
+
+ sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
+
+ int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+
+ if (sb_count > 0)
+ cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+ cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+ }
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+ aom_free_frame_buffer(&cm->cur_frame->buf);
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+}
+
+// Derives the qindex of the bsize block at (mi_row, mi_col) from the ratio
+// between the (scaled) external rate distribution and the encoder's own rate
+// estimates for that block.
+//
+// The rates are accumulated over 16x16 units; positions outside the frame are
+// skipped. The resulting scale is limited by the block's max_scale, converted
+// to a delta q offset, clamped to +/- (delta_q_res * 10 - 1), and the final
+// qindex is clamped to [MINQ, MAXQ] (and to at least MINQ + 1 when the base
+// qindex is above MINQ).
+static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                     int mi_row, int mi_col) {
+  // Calculation uses 8x8.
+  const int mb_step = mi_size_wide[cpi->weber_bsize];
+  // Accumulate to 16x16
+  const int block_step = mi_size_wide[BLOCK_16X16];
+  // The rate arrays are sized and indexed with the frame_info dimensions.
+  const int mi_rows = cpi->frame_info.mi_rows;
+  const int mi_cols = cpi->frame_info.mi_cols;
+  double sb_rate_hific = 0.0;
+  double sb_rate_uniform = 0.0;
+  // Rows span the block height and columns the block width.
+  for (int row = mi_row; row < mi_row + mi_size_high[bsize];
+       row += block_step) {
+    for (int col = mi_col; col < mi_col + mi_size_wide[bsize];
+         col += block_step) {
+      // Out-of-frame entries are never written; skipping them also keeps the
+      // index inside the arrays for frames smaller than the block.
+      if (row >= mi_rows || col >= mi_cols) continue;
+
+      sb_rate_hific +=
+          cpi->ext_rate_distribution[(row / mb_step) * mi_cols +
+                                     (col / mb_step)];
+
+      for (int r = 0; r < block_step; r += mb_step) {
+        for (int c = 0; c < block_step; c += mb_step) {
+          const int this_row = row + r;
+          const int this_col = col + c;
+          if (this_row >= mi_rows || this_col >= mi_cols) continue;
+          sb_rate_uniform +=
+              cpi->prep_rate_estimates[(this_row / mb_step) * mi_cols +
+                                       (this_col / mb_step)];
+        }
+      }
+    }
+  }
+  sb_rate_hific *= cpi->ext_rate_scale;
+
+  const double weight = 1.0;
+  // Without any own rate estimate there is nothing to compare against; use a
+  // neutral difference instead of dividing by zero.
+  const double rate_diff =
+      sb_rate_uniform > 0.0
+          ? weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform
+          : 0.0;
+  double scale = pow(2, rate_diff);
+
+  scale = scale * scale;
+  double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+  scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale);
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  int offset =
+      av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale);
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  const int max_offset = delta_q_info->delta_q_res * 10;
+  offset = AOMMIN(offset, max_offset - 1);
+  offset = AOMMAX(offset, -max_offset + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+  if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+  return qindex;
+}
+
+// Returns the qindex to use for the bsize block at (mi_row, mi_col) in
+// perceptual all-intra mode.
+//
+// With enable_rate_guide_deltaq the decision is delegated to
+// get_rate_guided_quantizer(). Otherwise beta = norm_wiener_variance / block
+// Wiener variance is limited by the block's max_scale, clamped to [0.25, 4],
+// converted to a delta q offset, clamped to +/- (delta_q_res * 20 - 1) and
+// added to the base qindex. The result is clamped to [MINQ, MAXQ], and to at
+// least MINQ + 1 when the base qindex is above MINQ.
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col) {
+  if (cpi->oxcf.enable_rate_guide_deltaq) {
+    return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col);
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  const int wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col);
+  const double max_scale =
+      AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+
+  double beta = (double)cpi->norm_wiener_variance / wiener_var;
+  const double inv_beta = 1.0 / beta;
+  beta = 1.0 / AOMMIN(inv_beta, max_scale);
+
+  // Cap beta such that the delta q value is not much far away from the base q.
+  beta = AOMMIN(beta, 4);
+  beta = AOMMAX(beta, 0.25);
+
+  const int offset_limit = cm->delta_q_info.delta_q_res * 20 - 1;
+  int offset =
+      av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+  offset = AOMMIN(offset, offset_limit);
+  offset = AOMMAX(offset, -offset_limit);
+
+  // Never fall to MINQ unless the base qindex itself is MINQ.
+  const int lowest_qindex = (base_qindex > MINQ) ? MINQ + 1 : MINQ;
+  int qindex = base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, lowest_qindex);
+
+  return qindex;
+}
+
+// Lazily allocates cpi->mb_delta_q with mb_rows * mb_cols zero-initialised
+// entries. An existing buffer is left untouched; allocation failure is
+// reported through CHECK_MEM_ERROR.
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) {
+  if (cpi->mb_delta_q == NULL) {
+    AV1_COMMON *cm = &cpi->common;
+    CHECK_MEM_ERROR(
+        cm, cpi->mb_delta_q,
+        aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols,
+                   sizeof(*cpi->mb_delta_q)));
+  }
+}
+
+#if CONFIG_TFLITE
+// Runs the delta-q TFLite model on every block_size block of the luma plane.
+//
+// For each of the num_rows x num_cols blocks the luma samples are normalised
+// to [0, 1] (divided by (1 << bit_depth) - 1), copied into the model's input
+// tensor and the interpreter is invoked. The two output values are stored in
+// predicts0[row * num_cols + col] and predicts1[row * num_cols + col].
+//
+// Returns 0 on success and 1 on any failure. All TFLite objects and the
+// staging buffer are released on every path.
+static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows,
+                         int bit_depth, uint8_t *y_buffer, int y_stride,
+                         float *predicts0, float *predicts1) {
+  // Block dimensions in pixels (4 pixels per mi unit).
+  const int block_w = mi_size_wide[block_size] << 2;
+  const int block_h = mi_size_high[block_size] << 2;
+  const float base = (float)((1 << bit_depth) - 1);
+  TfLiteInterpreterOptions *options = NULL;
+  TfLiteInterpreter *interpreter = NULL;
+  TfLiteTensor *input_tensor = NULL;
+  float *input_data = NULL;
+  size_t input_size = 0;
+  int status = 1;  // Failure until everything has succeeded.
+
+  // Create the model and interpreter options.
+  TfLiteModel *model =
+      TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize);
+  if (model == NULL) return 1;
+
+  options = TfLiteInterpreterOptionsCreate();
+  // Check before use: the options must not be configured through NULL.
+  if (options == NULL) goto cleanup;
+  TfLiteInterpreterOptionsSetNumThreads(options, 2);
+
+  // Create the interpreter.
+  interpreter = TfLiteInterpreterCreate(model, options);
+  if (interpreter == NULL) goto cleanup;
+
+  // Allocate tensors and populate the input tensor data.
+  if (TfLiteInterpreterAllocateTensors(interpreter) != kTfLiteOk) goto cleanup;
+  input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0);
+  if (input_tensor == NULL) goto cleanup;
+
+  input_size = TfLiteTensorByteSize(input_tensor);
+  // The staging buffer is filled with one float per block pixel; refuse a
+  // tensor that is too small to hold a whole block.
+  if (input_size < (size_t)block_w * (size_t)block_h * sizeof(*input_data)) {
+    goto cleanup;
+  }
+  input_data = aom_calloc(input_size, 1);
+  if (input_data == NULL) goto cleanup;
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      uint8_t *buf = y_buffer + (row * block_h) * y_stride + col * block_w;
+      int pos = 0;
+      for (int r = 0; r < block_h; ++r) {
+        for (int c = 0; c < block_w; ++c) {
+          input_data[pos++] = bit_depth > 8
+                                  ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base
+                                  : (float)*(buf + c) / base;
+        }
+        buf += y_stride;
+      }
+      if (TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size) !=
+          kTfLiteOk) {
+        goto cleanup;
+      }
+
+      // Execute inference.
+      if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) goto cleanup;
+
+      // Extract the output tensor data.
+      const TfLiteTensor *output_tensor =
+          TfLiteInterpreterGetOutputTensor(interpreter, 0);
+      if (output_tensor == NULL) goto cleanup;
+
+      float output_data[2];
+      // The local buffer holds exactly two floats; any other tensor size
+      // would overflow it or leave it partly uninitialised.
+      if (TfLiteTensorByteSize(output_tensor) != sizeof(output_data)) {
+        goto cleanup;
+      }
+      if (TfLiteTensorCopyToBuffer(output_tensor, output_data,
+                                   sizeof(output_data)) != kTfLiteOk) {
+        goto cleanup;
+      }
+      predicts0[row * num_cols + col] = output_data[0];
+      predicts1[row * num_cols + col] = output_data[1];
+    }
+  }
+  status = 0;
+
+cleanup:
+  // Dispose of the staging buffer, the interpreter objects and the model.
+  aom_free(input_data);
+  if (interpreter != NULL) TfLiteInterpreterDelete(interpreter);
+  if (options != NULL) TfLiteInterpreterOptionsDelete(options);
+  TfLiteModelDelete(model);
+  return status;
+}
+
+// TFLite variant: fills cpi->mb_delta_q with one delta q value per superblock.
+//
+// The model is evaluated on every superblock of the source luma plane. Its
+// first output is averaged over the frame, and each superblock's deviation
+// from that average is scaled by deltaq_strength / 100, MAXQ and a factor
+// derived from the configured cq_level, then rounded with RINT.
+//
+// On allocation or model failure the temporary buffers are released, the
+// error is reported through aom_internal_error() and the function returns
+// without touching cpi->mb_delta_q.
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = cpi->common.seq_params->sb_size;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+  // TODO(sdeng): fit a better model_1; disable it at this time.
+  float delta_q_avg0 = 0.0f;
+  float *mb_delta_q0 = aom_calloc(num_rows * num_cols, sizeof(float));
+  float *mb_delta_q1 = aom_calloc(num_rows * num_cols, sizeof(float));
+  if (mb_delta_q0 == NULL || mb_delta_q1 == NULL) {
+    // Release whichever buffer was obtained before reporting the error;
+    // aom_internal_error() may not return.
+    aom_free(mb_delta_q0);
+    aom_free(mb_delta_q1);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate mb_delta_q buffers");
+    return;
+  }
+
+  if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer,
+                    y_stride, mb_delta_q0, mb_delta_q1)) {
+    aom_free(mb_delta_q0);
+    aom_free(mb_delta_q1);
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to call TFlite functions.");
+    // Only reached if the error handler returns; the predictions are unusable.
+    return;
+  }
+
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      delta_q_avg0 += mb_delta_q0[index];
+    }
+  }
+
+  delta_q_avg0 /= (float)(num_rows * num_cols);
+
+  float scaling_factor;
+  const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ;
+  if (cq_level < delta_q_avg0) {
+    scaling_factor = cq_level / delta_q_avg0;
+  } else {
+    scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0);
+  }
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->mb_delta_q[index] =
+          RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ *
+               scaling_factor * (mb_delta_q0[index] - delta_q_avg0));
+    }
+  }
+
+  aom_free(mb_delta_q0);
+  aom_free(mb_delta_q1);
+}
+#else // !CONFIG_TFLITE
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = cpi->common.seq_params->sb_size;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+ int *mb_delta_q[2];
+ CHECK_MEM_ERROR(cm, mb_delta_q[0],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+ CHECK_MEM_ERROR(cm, mb_delta_q[1],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+ // Approximates the model change between current version (Sept 2021) and the
+ // baseline (July 2021).
+ const double model_change[] = { 3.0, 3.0 };
+ // The following parameters are fitted from user labeled data.
+ const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+ const double b[] = { 0.004898, 0.003093 };
+ const double c[] = { (29.932 + model_change[0]) * 4.0,
+ (42.100 + model_change[1]) * 4.0 };
+ int delta_q_avg[2] = { 0, 0 };
+ // Loop through each SB block: evaluate both models, a * exp(-b * var) + c.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block (step of 2 mi units) inside the frame.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 2;  // mi units -> luma pixels.
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ unsigned int block_variance;
+ block_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y);
+
+ block_variance = AOMMAX(block_variance, 1);  // Keeps log() finite.
+ var += log((double)block_variance);
+ num_of_var += 1.0;
+ }
+ }
+ var = exp(var / num_of_var);  // Geometric mean of the 8x8 variances.
+ mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]);
+ mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]);
+ delta_q_avg[0] += mb_delta_q[0][index];
+ delta_q_avg[1] += mb_delta_q[1][index];
+ }
+ }
+
+ delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+ delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
+
+ int model_idx;  // 0 or 1: use that model alone; 2: blend of both models.
+ double scaling_factor;
+ const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+ if (cq_level < delta_q_avg[0]) {
+ model_idx = 0;
+ scaling_factor = (double)cq_level / delta_q_avg[0];
+ } else if (cq_level < delta_q_avg[1]) {
+ model_idx = 2;
+ scaling_factor =
+ (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+ } else {
+ model_idx = 1;
+ scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+ }
+
+ const double new_delta_q_avg =
+ delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ if (model_idx == 2) {
+ const double delta_q =
+ mb_delta_q[0][index] +
+ scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+ cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength /
+ 100.0 * (delta_q - new_delta_q_avg));
+ } else {
+ cpi->mb_delta_q[index] = RINT(
+ (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor *
+ (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+ }
+ }
+ }
+
+ aom_free(mb_delta_q[0]);
+ aom_free(mb_delta_q[1]);
+}
+#endif
+
+// Returns the qindex for the superblock containing (mi_row, mi_col): the frame
+// base_qindex plus that superblock's cpi->mb_delta_q entry, kept within
+// [MINQ + 1, MAXQ]. A base_qindex equal to MINQ or MAXQ is returned unchanged.
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex;
+
+  const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+  const int sb_mi_w = mi_size_wide[bsize];
+  const int sb_mi_h = mi_size_high[bsize];
+  // Superblocks per row, rounded up; mb_delta_q is indexed in raster order.
+  const int sb_cols = (cm->mi_params.mi_cols + sb_mi_w - 1) / sb_mi_w;
+  const int sb_index = (mi_row / sb_mi_h) * sb_cols + (mi_col / sb_mi_w);
+  const int delta_q = cpi->mb_delta_q[sb_index];
+
+  const int qindex = AOMMIN(base_qindex + delta_q, MAXQ);
+  return AOMMAX(qindex, MINQ + 1);
+}
diff --git a/third_party/aom/av1/encoder/allintra_vis.h b/third_party/aom/av1/encoder/allintra_vis.h
new file mode 100644
index 0000000000..0d34ce0841
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+
+#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer);
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col);
+
+// User rating based mode
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col);
+
+#endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 0000000000..4cf6bd572d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+  // Strength level from the approximate base quantizer (AC quant / 4, truncated
+  // to int): 0 when it is <= 10, 1 when it is in (10, 25], 2 when it is > 25.
+  const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4;
+  return (base_quant > 25) ? 2 : (base_quant > 10) ? 1 : 0;
+}
+
+static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ refresh_frame->alt_ref_frame ||
+ (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+// Segmentation only pays off when the target bits per SB reach 256 or more;
+// below that the overheads will usually outweigh any benefit.
+static bool is_sb_aq_enabled(const AV1_COMP *const cpi) {
+  return !(cpi->rc.sb64_target_rate < 256);
+}
+
+// Sets up complexity-AQ segmentation for the frame: resets the segment map and
+// gives each non-default segment a delta-q relative to the base qindex.
+// Segmentation is disabled on a resolution change or when the per-SB rate gate
+// fails; frames that do not pass the frame-level gate are left untouched.
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  const int base_qindex = cm->quant_params.base_qindex;
+  const int map_size = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+  // Make SURE use of floating point in this function is safe.
+
+  // A change of frame size: clear the map and switch segmentation off.
+  if (cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height)) {
+    memset(cpi->enc_seg.map, 0, map_size);
+    av1_clearall_segfeatures(seg);
+    av1_disable_segmentation(seg);
+    return;
+  }
+
+  // Nothing to set up for frames that do not use AQ.
+  if (!is_frame_aq_enabled(cpi)) return;
+
+  const int aq_strength =
+      get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
+
+  // Clear down the segment map.
+  memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, map_size);
+  av1_clearall_segfeatures(seg);
+
+  if (!is_sb_aq_enabled(cpi)) {
+    av1_disable_segmentation(seg);
+    return;
+  }
+
+  av1_enable_segmentation(seg);
+
+  // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+  av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+  // Use some of the segments for in frame Q adjustment.
+  for (int segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+    if (segment == DEFAULT_AQ2_SEG) continue;
+
+    int qindex_delta = av1_compute_qdelta_by_rate(
+        cpi, cm->current_frame.frame_type, base_qindex,
+        aq_c_q_adj_factor[aq_strength][segment]);
+
+    // For AQ complexity mode, we don't allow Q0 in a segment if the base
+    // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+    // Q delta is sometimes applied without going back around the rd loop.
+    // This could lead to an illegal combination of partition size and q.
+    if (base_qindex != 0 && base_qindex + qindex_delta == 0) {
+      qindex_delta = 1 - base_qindex;
+    }
+    if (base_qindex + qindex_delta > 0) {
+      av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+      av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+    }
+  }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
+  if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
+  // Block extent in mi units, clipped at the right and bottom frame edges.
+  const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
+
+  // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+  // It is converted to bits << AV1_PROB_COST_SHIFT units. The product is
+  // formed in 64 bits: sb64_target_rate * xmis * ymis can exceed INT_MAX.
+  const int64_t num = ((int64_t)cpi->rc.sb64_target_rate * xmis * ymis)
+                      << AV1_PROB_COST_SHIFT;
+  const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
+  const int target_rate = (int)(num / denom);
+  const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+                                            cm->seq_params->bit_depth);
+
+  // Low-variance threshold: taken from twopass_frame.mb_av_energy when
+  // is_stat_consumption_stage_twopass() holds, otherwise a fixed default.
+  const double low_var_thresh =
+      (is_stat_consumption_stage_twopass(cpi))
+          ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+          : DEFAULT_LV_THRESH;
+
+  av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+  const double logvar = av1_log_block_var(cpi, mb, bs);
+
+  unsigned char segment = AQ_C_SEGMENTS - 1;  // Used if no break out below.
+  for (int i = 0; i < AQ_C_SEGMENTS; ++i) {
+    // Test rate against a threshold value and variance against a threshold.
+    // Increasing segment number (higher variance and complexity) = higher Q.
+    if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+        (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+      segment = i;
+      break;
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this SB64.
+  const int mi_stride = cm->mi_params.mi_cols;
+  set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment);
+}
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..3421d74c93
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..f48ff11e51
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/pred_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Allocates a zeroed CYCLIC_REFRESH and its map; returns NULL on failure.
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+  if (cr == NULL) return NULL;
+  cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  if (cr->map == NULL) {
+    av1_cyclic_refresh_free(cr);
+    return NULL;
+  }
+  cr->counter_encode_maxq_scene_change = 0;
+  cr->percent_refresh_adjustment = 5;
+  cr->rate_ratio_qdelta_adjustment = 0.25;
+  return cr;
+}
+
+// Releases the refresh map and then the CYCLIC_REFRESH; NULL is a no-op.
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr == NULL) return;
+  aom_free(cr->map);
+  aom_free(cr);
+}
+
+// Classifies a coding block of size bsize for refresh (lower-qp coding) and
+// returns its segment id: CR_SEGMENT_ID_BASE when rejected, otherwise
+// CR_SEGMENT_ID_BOOST1 or CR_SEGMENT_ID_BOOST2. The decision uses the coding
+// mode, the motion vector, the projected rate/distortion and the noise level.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi, int64_t rate,
+                                int64_t dist, BLOCK_SIZE bsize,
+                                int noise_level) {
+  const MV mv = mbmi->mv[0].as_mv;
+  const int is_compound = has_second_ref(mbmi);
+
+  // Reject a non-compound block whose projected distortion is above the
+  // threshold when it uses a large mv or is coded with an intra mode.
+  if (!is_compound && dist > cr->thresh_dist_sb &&
+      (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+       mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+       !is_inter_block(mbmi))) {
+    return CR_SEGMENT_ID_BASE;
+  }
+
+  // More aggressive delta-q for compound blocks below medium noise, and for
+  // bigger low-rate inter blocks with zero motion when the boost is high.
+  if ((is_compound && noise_level < kMedium) ||
+      (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+       is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+       cr->rate_boost_fac > 10)) {
+    return CR_SEGMENT_ID_BOOST2;
+  }
+  return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for qindex q at the given rate factor. A negative delta-q is
+// limited in magnitude to cr->max_qdelta_perc percent of q.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int deltaq = av1_compute_qdelta_by_rate(
+      cpi, cpi->common.current_frame.frame_type, q, rate_factor);
+
+  const int too_negative = (-deltaq) > cr->max_qdelta_perc * q / 100;
+  return too_negative ? -cr->max_qdelta_perc * q / 100 : deltaq;
+}
+
+// Estimates the frame bits at the base qindex as the weighted average of the
+// estimates for the base segment and the two boosted segments.
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+                                          double correction_factor) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int base_qindex = cm->quant_params.base_qindex;
+  const int q_seg1 = base_qindex + cr->qindex_delta[1];
+  const int q_seg2 = base_qindex + cr->qindex_delta[2];
+  const int num4x4bl = cm->mi_params.MBs << 4;
+  double weight_segment1, weight_segment2;
+  if (cpi->rc.rtc_external_ratectrl) {
+    weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows *
+                               cm->mi_params.mi_cols / 100) /
+                      num4x4bl;
+    weight_segment2 = 0;
+  } else {
+    // Actual 4x4 block counts refreshed in the previous/just encoded frame.
+    weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+    weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+  }
+  // Take segment weighted average for estimated bits.
+  return (int)((1.0 - weight_segment1 - weight_segment2) *
+                   av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
+               weight_segment1 *
+                   av1_estimate_bits_at_q(cpi, q_seg1, correction_factor) +
+               weight_segment2 *
+                   av1_estimate_bits_at_q(cpi, q_seg2, correction_factor));
+}
+
+// Returns the bits per mb at qindex i as the weighted average of the value at
+// i (base segment) and the value at i + deltaq (boosted segment).
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+                                      double correction_factor) {
+  const AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const int num4x4bl = cm->mi_params.MBs << 4;
+  const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate;
+  double weight_segment;
+  if (cpi->rc.rtc_external_ratectrl) {
+    weight_segment = (double)((cr->target_num_seg_blocks +
+                               cr->percent_refresh * cm->mi_params.mi_rows *
+                                   cm->mi_params.mi_cols / 100) >>
+                              1) /
+                     num4x4bl;
+  } else {
+    // Weight for segment prior to encoding: take the average of the target
+    // number for the frame to be encoded and the actual from the previous frame.
+    weight_segment =
+        (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+                  cr->actual_num_seg2_blocks) >>
+                 1) /
+        num4x4bl;
+  }
+  // Compute delta-q corresponding to qindex i.
+  const int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+  // Take segment weighted average for bits per mb.
+  return (int)((1.0 - weight_segment) *
+                   av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+                                      correction_factor, accurate_estimate) +
+               weight_segment * av1_rc_bits_per_mb(
+                                    cpi, cm->current_frame.frame_type, i + deltaq,
+                                    correction_factor, accurate_estimate));
+}
+
+// Re-derives mbmi->segment_id from the spatial prediction (unless
+// cr->skip_over4x4 is set), rewrites the block's map entries when the id
+// changed, and when dry_run is zero removes the block from the boost counters.
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                   RUN_TYPE dry_run) {
+  const AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int prev_segment_id = mbmi->segment_id;
+  const int mi_stride = cm->mi_params.mi_cols;
+  // Block extent in mi units, clipped at the right and bottom frame edges.
+  const int xmis = AOMMIN(mi_stride - mi_col, mi_size_wide[bsize]);
+  const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bsize]);
+
+  assert(cm->seg.enabled);
+  if (!cr->skip_over4x4) {
+    int cdf_num;
+    mbmi->segment_id =
+        av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4);
+    if (prev_segment_id != mbmi->segment_id) {
+      const uint8_t segment_id = mbmi->segment_id;
+      int map_offset = mi_row * mi_stride + mi_col;
+      for (int mi_y = 0; mi_y < ymis; mi_y++, map_offset += mi_stride) {
+        memset(&cr->map[map_offset], 0, xmis);
+        memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+        memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+      }
+    }
+  }
+  if (dry_run) return;
+  const int block_area = xmis * ymis;
+  if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+    x->actual_num_seg1_blocks -= block_area;
+  else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+    x->actual_num_seg2_blocks -= block_area;
+}
+
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);  // Clipped width.
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);  // Clipped height.
+ const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ int noise_level = 0;
+ if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level);
+ int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;  // mi row step below.
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+ const uint8_t segment_id = mbmi->segment_id;
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the -ve influences how long before we consider
+ // it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else if it is accepted as candidate for refresh, and has not already
+ // been refreshed (marked as 1) then mark it as a candidate for cleanup
+ // for future time (marked as 0), otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map (every sh-th mi row).
+ const int mi_stride = cm->mi_params.mi_cols;
+ for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], new_map_value, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+ }
+
+ // Accumulate cyclic refresh update counters (skipped when dry_run is set).
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks += xmis * ymis;
+ else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks += xmis * ymis;
+ }
+}
+
+// Resets both boosted-segment block counters of this MACROBLOCK to zero.
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) {
+  x->actual_num_seg2_blocks = 0;
+  x->actual_num_seg1_blocks = 0;
+}
+
+// Adds the block counters gathered in x to the totals kept in cyclic_refresh.
+void av1_accumulate_cyclic_refresh_counters(
+    CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) {
+  cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks;
+  cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks;
+}
+
+// Sets p_rc->baseline_gf_interval from the cyclic refresh period.
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // Set minimum gf_interval for GF update to a multiple of the refresh period,
+  // with some max limit. Depending on past encoding stats, GF flag may be
+  // reset and update may not occur until next baseline_gf_interval.
+  const int gf_length_mult[2] = { 8, 4 };
+  if (cr->percent_refresh <= 0)
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+  else
+    p_rc->baseline_gf_interval =
+        AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] *
+                   (100 / cr->percent_refresh),
+               MAX_GF_INTERVAL_RT);
+  if (cpi->rc.avg_frame_low_motion && cpi->rc.avg_frame_low_motion < 40)
+    p_rc->baseline_gf_interval = 16;
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ uint64_t sb_sad = 0;
+ uint64_t thresh_sad_low = 0;
+ uint64_t thresh_sad = INT64_MAX;
+ const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols;
+ const int mi_stride = mi_cols;
+ memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols);
+ sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * mi_rows * mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through whole frame.
+ if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->last_sb_index = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->seq_params->mib_size;
+ int mi_col = sb_col_index * cm->seq_params->mib_size;
+ assert(mi_row >= 0 && mi_row < mi_rows);
+ assert(mi_col >= 0 && mi_col < mi_cols);
+ bl_index = mi_row * mi_stride + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size);
+ ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size);
+ if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 &&
+ cr->counter_encode_maxq_scene_change > 30 &&
+ cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index];
+ int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8;
+ int scale_low = 2;
+ thresh_sad = (scale * 64 * 64);
+ thresh_sad_low = (scale_low * 64 * 64);
+ // For temporal layers: the base temporal layer (temporal_layer_id = 0)
+ // has larger frame separation (2 or 4 frames apart), so use larger sad
+ // thresholds to compensate for larger frame sad. The larger thresholds
+ // also increase the amount of refresh, which is needed for the base
+ // temporal layer.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == 0) {
+ thresh_sad <<= 4;
+ thresh_sad_low <<= 2;
+ }
+ }
+ // cr_map only needed at 8x8 blocks.
+ for (y = 0; y < ymis; y += 2) {
+ for (x = 0; x < xmis; x += 2) {
+ const int bl_index2 = bl_index + y * mi_stride + x;
+ // If the block is a candidate for clean up then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if block gets coded anything other than low motion.
+ // If the block_sad (sb_sad) is very low label it for refresh anyway.
+ if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) {
+ sum_map += 4;  // Each 8x8 step covers 2x2 = 4 mi units.
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ // Enforce that block sad (sb_sad) is not too high.
+ if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) {
+ set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride,
+ CR_SEGMENT_ID_BOOST1);
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;  // Wrap around to the first superblock.
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;  // The next call starts from this superblock.
+ if (cr->target_num_seg_blocks == 0) {
+ // Disable segmentation, seg_map is already set to 0 above.
+ av1_disable_segmentation(&cm->seg);
+ }
+}
+
+static int is_scene_change_detected(AV1_COMP *const cpi) {
+ return cpi->rc.high_source_sad;  // Non-zero is treated as a scene change.
+}
+
+// Set cyclic refresh parameters.
+//
+// Decides whether cyclic refresh is applied to the current frame
+// (cr->apply_cyclic_refresh) and, if so, fills in the refresh percentage,
+// the delta-q rate ratios and the per-block thresholds stored in
+// cpi->cyclic_refresh. When refresh is disabled the function returns early
+// and the remaining parameters keep whatever values they had before.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ // TODO(marpan): Parameters need to be tuned.
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ SVC *const svc = &cpi->svc;
+ // Refresh is disabled when the average inter-frame qindex is below
+ // qp_thresh, i.e. at least 16 and at least best_quality + 4.
+ const int qp_thresh = AOMMAX(16, rc->best_quality + 4);
+ // Upper qindex bound: (118 * MAXQ) / 128, i.e. roughly 92% of MAXQ.
+ const int qp_max_thresh = 118 * MAXQ >> 7;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const int is_screen_content =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+ // A scene change or key frame marks the start of a cyclic refresh cycle.
+ // For non-SVC screen content the distance is additionally capped by
+ // cr->counter_encode_maxq_scene_change.
+ const int frames_since_scene_change =
+ (cpi->ppi->use_svc || !is_screen_content)
+ ? cpi->rc.frames_since_key
+ : AOMMIN(cpi->rc.frames_since_key,
+ cr->counter_encode_maxq_scene_change);
+
+ // Cases to reset the cyclic refresh adjustment parameters.
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ // Reset adaptive elements for intra only frames and scene changes.
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ }
+
+ // Although this segment feature for RTC is only used for
+ // blocks >= 8X8, for more efficient coding of the seg map
+ // cur_frame->seg_map needs to set at 4x4 along with the
+ // function av1_cyclic_reset_segment_skip(). Skipping over
+ // 4x4 will therefore have small bdrate loss (~0.2%), so
+ // we use it only for speed > 9 for now.
+ // Also if loop-filter deltas is applied via segment, then
+ // we need to set cr->skip_over4x4 = 1.
+ cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
+
+ // Decide whether cyclic refresh should be enabled on this frame. It starts
+ // enabled and is switched off by any of the conditions below.
+ cr->apply_cyclic_refresh = 1;
+ if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
+ scene_change_detected || svc->temporal_layer_id > 0 ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers ||
+ p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+ (svc->number_spatial_layers > 1 &&
+ svc->layer_context[svc->temporal_layer_id].is_key_frame) ||
+ (frames_since_scene_change > 20 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+ (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 &&
+ frames_since_scene_change > 40) ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
+
+ // Increase the amount of refresh for #temporal_layers > 2
+ if (svc->number_temporal_layers > 2)
+ cr->percent_refresh = 15;
+ else
+ cr->percent_refresh = 10 + cr->percent_refresh_adjustment;
+
+ // Baseline values; some are overridden further down for low resolutions
+ // and for VBR.
+ cr->max_qdelta_perc = 60;
+ cr->time_for_refresh = 0;
+ // Block-SAD scene detection is only used for non-screen content with
+ // 64x64 superblocks.
+ cr->use_block_sad_scene_det =
+ (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ cm->seq_params->sb_size == BLOCK_64X64)
+ ? 1
+ : 0;
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15;
+
+ // Use larger delta-qp (increase rate_ratio_qdelta) for first few
+ // refresh cycles after a key frame (svc) or scene change (non svc).
+ // For non svc screen content, after a scene change gradually reduce
+ // this boost and suppress it further if either of the previous two
+ // frames overshot.
+ // The percent_refresh > 0 test also guards the division below.
+ if (cr->percent_refresh > 0) {
+ if (cpi->ppi->use_svc || !is_screen_content) {
+ if (frames_since_scene_change <
+ ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) {
+ cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment;
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ } else {
+ // Steps up by 0.1 for every 10 frames since the scene change, capped at
+ // 0.75, and is subtracted from the boost.
+ double distance_from_sc_factor =
+ AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1);
+ cr->rate_ratio_qdelta =
+ 3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor;
+ if ((frames_since_scene_change < 10) &&
+ ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) {
+ cr->rate_ratio_qdelta -= 0.25;
+ }
+ }
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ // Adjust some parameters for low resolutions (at most 352x288 pixels).
+ if (cm->width * cm->height <= 352 * 288) {
+ if (cpi->svc.number_temporal_layers > 1) {
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 13;
+ } else {
+ if (rc->avg_frame_bandwidth < 3000) {
+ cr->motion_thresh = 16;
+ cr->rate_boost_fac = 13;
+ } else {
+ cr->max_qdelta_perc = 50;
+ cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0);
+ }
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+ // For now use smaller qp-delta (than CBR), no second boosted seg, and
+ // turn-off (no refresh) on golden refresh (since it's already boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
+ cr->rate_boost_fac = 10;
+ if (cpi->refresh_frame.golden_frame) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
+ }
+ // With external RTC rate control, set the segment-1 block count to
+ // percent_refresh percent of mi_rows * mi_cols and the segment-2 count to 0.
+ if (rc->rtc_external_ratectrl) {
+ cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100;
+ cr->actual_num_seg2_blocks = 0;
+ }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+//
+// When cr->apply_cyclic_refresh is 0 the encoder segment map is cleared and
+// segmentation is disabled. Otherwise the rate/distortion thresholds, the
+// per-segment q deltas and the rd multiplier are computed, and the refresh
+// map is updated through cyclic_refresh_update_map().
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Both values are clamped (boost_index <= 15, layer_depth <= 6) before
+ // being passed to av1_compute_rd_mult() below.
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Set resolution_change flag: for svc only set it when the
+ // number of spatial layers has not changed.
+ const int resolution_change =
+ cm->prev_frame &&
+ (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height) &&
+ cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers;
+
+ // Note: the reset below clears cr->apply_cyclic_refresh, so a resolution
+ // change always takes the "disabled" path that follows.
+ if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
+ if (!cr->apply_cyclic_refresh) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ // Restart the refresh cycle and clear the counters on intra-only frames,
+ // scene changes and bias-recovery frames.
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->sb_index = 0;
+ cr->last_sb_index = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ }
+ return;
+ } else {
+ cr->counter_encode_maxq_scene_change++;
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ // NOTE(review): the code shifts left by 2 after the 256 scaling, i.e. a
+ // factor of 4; confirm whether "2" above refers to the shift amount.
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+ // For low-resoln or lower speeds, the rate/dist thresholds need to be
+ // tuned/updated.
+ if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) {
+ cr->thresh_dist_sb = 0;
+ cr->thresh_rate_sb = INT64_MAX;
+ }
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. For error_resilient mode on the
+ // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to 0 previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ int qindex_delta =
+ compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1, using the segment's qindex (base
+ // qindex + luma DC delta + BOOST1 delta) clamped to [0, MAXQ].
+ const int qindex2 = clamp(
+ quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
+ 0, MAXQ);
+ cr->rdmult = av1_compute_rd_mult(
+ qindex2, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2. The rate
+ // ratio is rate_boost_fac / 10 times the BOOST1 ratio, capped at
+ // CR_MAX_RATE_TARGET_RATIO.
+ qindex_delta = compute_deltaq(
+ cpi, quant_params->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+// Accessor for the rd multiplier stored in cr->rdmult (computed for segment
+// BOOST1 by av1_cyclic_refresh_setup()).
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  const int segment1_rdmult = cr->rdmult;
+  return segment1_rdmult;
+}
+
+// Resets the cyclic refresh state after a change of frame size: clears the
+// refresh map, restarts the superblock cycle, restores the default adaptive
+// adjustments, disables refresh for the current frame and requests a golden
+// frame refresh.
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const refresh = cpi->cyclic_refresh;
+  const AV1_COMMON *const common = &cpi->common;
+  const int num_mi = common->mi_params.mi_rows * common->mi_params.mi_cols;
+
+  // One map entry per mode-info unit of the frame.
+  memset(refresh->map, 0, num_mi);
+
+  // Restart the superblock cycle.
+  refresh->last_sb_index = 0;
+  refresh->sb_index = 0;
+
+  cpi->refresh_frame.golden_frame = true;
+
+  // No refresh on this frame; counters and adaptive terms back to defaults.
+  refresh->apply_cyclic_refresh = 0;
+  refresh->counter_encode_maxq_scene_change = 0;
+  refresh->rate_ratio_qdelta_adjustment = 0.25;
+  refresh->percent_refresh_adjustment = 5;
+}
+
+// Returns 1 when either of the two conditions below holds, 0 otherwise.
+// Both conditions require low source SAD and a qindex below a bound derived
+// from rc.worst_quality; the second, looser one only applies when
+// sf.rt_sf.skip_lf_screen > 1.
+int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
+  const CYCLIC_REFRESH *const refresh = cpi->cyclic_refresh;
+  const int base_q = cpi->common.quant_params.base_qindex;
+
+  // Conservative rule. The percent_refresh > 0 test guards the division.
+  if (cpi->rc.frames_since_key > 30 && refresh->percent_refresh > 0) {
+    if (refresh->counter_encode_maxq_scene_change >
+            300 / refresh->percent_refresh &&
+        cpi->rc.frame_source_sad < 1000 &&
+        base_q < 7 * (cpi->rc.worst_quality >> 3)) {
+      return 1;
+    }
+  }
+
+  // More aggressive skip.
+  if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+      cpi->rc.frame_source_sad < 50000 && base_q < cpi->rc.worst_quality) {
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..10974f018b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+/*!
+ * \brief The structure of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
+struct CYCLIC_REFRESH {
+ /*!
+ * Percentage of blocks per frame that are targeted as candidates
+ * for cyclic refresh.
+ */
+ int percent_refresh;
+
+ /*!
+ * Active adjustment delta for cyclic refresh for rate control.
+ */
+ int percent_refresh_adjustment;
+
+ /*!
+ * Maximum q-delta as percentage of base q.
+ */
+ int max_qdelta_perc;
+ /*!
+ * Superblock starting index for cycling through the frame.
+ */
+ int sb_index;
+ /*!
+ * Superblock index at which the cyclic refresh started on the last frame.
+ */
+ int last_sb_index;
+ /*!
+ * Controls how long block will need to wait to be refreshed again, in
+ * excess of the cycle time, i.e., in the case of all zero motion, block
+ * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ */
+ int time_for_refresh;
+ /*!
+ * Target number of (4x4) blocks that are set for delta-q.
+ */
+ int target_num_seg_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * RD mult. parameters for segment 1.
+ */
+ int rdmult;
+ /*!
+ * Cyclic refresh map.
+ */
+ int8_t *map;
+ /*!
+ * Threshold applied to the projected rate of the coding block,
+ * when deciding whether block should be refreshed.
+ */
+ int64_t thresh_rate_sb;
+ /*!
+ * Threshold applied to the projected distortion of the coding block,
+ * when deciding whether block should be refreshed.
+ */
+ int64_t thresh_dist_sb;
+ /*!
+ * Threshold applied to the motion vector (in units of 1/8 pel) of the
+ * coding block, when deciding whether block should be refreshed.
+ */
+ int16_t motion_thresh;
+ /*!
+ * Rate target ratio to set q delta.
+ */
+ double rate_ratio_qdelta;
+
+ /*!
+ * Active adjustment of qdelta rate ratio for enhanced rate control
+ */
+ double rate_ratio_qdelta_adjustment;
+
+ /*!
+ * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ */
+ int rate_boost_fac;
+
+ /*!\cond */
+ // Per-segment qindex deltas; av1_cyclic_refresh_setup() writes entries 1
+ // and 2 with the BOOST1 and BOOST2 deltas.
+ int qindex_delta[3];
+ // Nonzero when cyclic refresh is applied to the current frame; decided in
+ // av1_cyclic_refresh_update_parameters().
+ int apply_cyclic_refresh;
+ // Set to 1 by av1_cyclic_refresh_update_parameters() when speed > 9.
+ int skip_over4x4;
+ // Incremented on every frame where refresh is applied; reset to 0 on
+ // intra-only frames, scene changes, bias-recovery frames and resize resets.
+ int counter_encode_maxq_scene_change;
+ // Set to 1 for non-screen content with 64x64 superblocks, 0 otherwise.
+ int use_block_sad_scene_det;
+ /*!\endcond */
+};
+
+struct AV1_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] i q index
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+/*!\brief Update segment_id for blocks that are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] rate Projected block rate from pickmode
+ * \param[in] dist Projected block dist from pickmode
+ * \param[in] skip Skip flag set from pickmode
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cm->cpi->enc_seg.map.
+ */
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run);
+
+/*!\brief Initialize counters used for cyclic refresh.
+ *
+ * Initializes cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c x->actual_num_seg1_blocks and the
+ * \c x->actual_num_seg2_blocks.
+ */
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
+
+/*!\brief Accumulate cyclic refresh counters.
+ *
+ * Accumulates cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks from the MACROBLOCK structure to the CYCLIC_REFRESH
+ * structure.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the
+ * \c cyclic_refresh->actual_num_seg2_blocks.
+ */
+void av1_accumulate_cyclic_refresh_counters(
+ CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x);
+
+/*!\brief Set golden frame update interval based on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Returns the interval in \c cpi->rc.baseline_gf_interval.
+ */
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets
+ * the amount/percent of refresh, and the amount of boost applied to
+ * the two segments (set by rate_ratio_qdelta and rate_boost_fac).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the settings.
+ */
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+/*!\brief Setup the cyclic background refresh.
+ *
+ * Set the delta q for the segment(s), and set the segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * parameters and the \c cm->seg with the segmentation data.
+ */
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi);
+
+// Returns 1 when segment_id is one of the two boosted cyclic refresh segments
+// (CR_SEGMENT_ID_BOOST1 or CR_SEGMENT_ID_BOOST2), 0 otherwise.
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  switch (segment_id) {
+    case CR_SEGMENT_ID_BOOST1:
+    case CR_SEGMENT_ID_BOOST2: return 1;
+    default: return 0;
+  }
+}
+
+// Maps an arbitrary segment id onto the cyclic refresh segments: the two
+// boosted ids are passed through unchanged, everything else becomes
+// CR_SEGMENT_ID_BASE.
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  const int is_boosted = (segment_id == CR_SEGMENT_ID_BOOST1) ||
+                         (segment_id == CR_SEGMENT_ID_BOOST2);
+  return is_boosted ? segment_id : CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 0000000000..086928a118
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+
+// Per-segment rate ratios used by av1_vaq_frame_setup(): segment i gets the
+// q delta matching rate_ratio[i] divided by the ratio of the average-energy
+// segment.
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+// Rate ratios used by av1_compute_q_from_energy_level_deltaq_mode(), indexed
+// by the rate level derived from the block energy level.
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+// Inclusive range of block energy levels, and the number of levels in it.
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+// Asserts that an energy level lies within [ENERGY_MIN, ENERGY_MAX].
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+// All-zero reference rows passed to the variance function in
+// av1_log_block_var() (8-bit and high-bitdepth variants).
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+// Rate level for each energy level, from ENERGY_MIN (index 0) up to
+// ENERGY_MAX.
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+// Looks up the rate level for energy level i; i must be within
+// [ENERGY_MIN, ENERGY_MAX] for the index to be in range.
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+// Frame-level setup for variance-based adaptive quantization.
+//
+// On a resolution change the encoder segment map is cleared and segmentation
+// is switched off. Otherwise, on intra-only frames, in error-resilient mode,
+// and on alt-ref or (non source-alt-ref) golden refreshes, segmentation is
+// enabled and every segment gets a q delta derived from rate_ratio[],
+// normalised so that the segment matching the average energy has ratio 1.0.
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  const RefreshFrameInfo *const refresh = &cpi->refresh_frame;
+  const int base_q = cm->quant_params.base_qindex;
+  int seg_idx;
+
+  const int size_changed =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  if (size_changed) {
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_clearall_segfeatures(seg);
+    av1_disable_segmentation(seg);
+    return;
+  }
+
+  const int update_segments =
+      frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      refresh->alt_ref_frame ||
+      (refresh->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+  if (!update_segments) return;
+
+  // Segment index matching the average frame energy, kept within [0, 7].
+  const int avg_energy =
+      clamp((int)(cpi->twopass_frame.mb_av_energy - 2), 0, 7);
+  const double avg_ratio = rate_ratio[avg_energy];
+
+  cpi->vaq_refresh = 1;
+
+  av1_enable_segmentation(seg);
+  av1_clearall_segfeatures(seg);
+
+  for (seg_idx = 0; seg_idx < MAX_SEGMENTS; ++seg_idx) {
+    // Set up avg segment id to be 1.0 and adjust the other segments around
+    // it.
+    int q_delta =
+        av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_q,
+                                   rate_ratio[seg_idx] / avg_ratio);
+
+    // We don't allow qindex 0 in a segment if the base value is not 0.
+    // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+    // Q delta is sometimes applied without going back around the rd loop.
+    // This could lead to an illegal combination of partition size and q.
+    if (base_q != 0 && base_q + q_delta == 0) {
+      q_delta = -base_q + 1;
+    }
+
+    av1_set_segdata(seg, seg_idx, SEG_LVL_ALT_Q, q_delta);
+    av1_enable_segfeature(seg, seg_idx, SEG_LVL_ALT_Q);
+  }
+}
+
+// Returns a score for the local variance of block (x, bs): the average over
+// the visible 4x4 sub-blocks of log1p(variance / 16), capped at 7 and
+// truncated to an integer.
+//
+// Working on 4x4 sub-blocks avoids marking a large block with a gentle
+// gradient as high variance when each sub-block has low variance, so the same
+// sort of area gets the same segment number however the partitioning goes.
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const uint8_t *const src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  // All-zero reference, so the variance function measures the source alone.
+  const uint8_t *const zero_ref = is_cur_buf_hbd(xd)
+                                      ? CONVERT_TO_BYTEPTR(av1_highbd_all_zeros)
+                                      : av1_all_zeros;
+
+  // Pixels of the block that fall beyond the right/bottom frame edge
+  // (mb_to_*_edge is shifted right by 3 to get pixels).
+  const int cols_past_edge =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int rows_past_edge =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+  const int bw = MI_SIZE * mi_size_wide[bs] - cols_past_edge;
+  const int bh = MI_SIZE * mi_size_high[bs] - rows_past_edge;
+
+  double log_var = 0;
+  unsigned int sse;
+  int row, col;
+  for (row = 0; row < bh; row += 4) {
+    for (col = 0; col < bw; col += 4) {
+      log_var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                           src + row * src_stride + col, src_stride, zero_ref,
+                           0, &sse) /
+                       16.0);
+    }
+  }
+
+  // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561.
+  log_var /= (bw / 4 * bh / 4);
+  if (log_var > 7) log_var = 7;
+
+  return (int)(log_var);
+}
+
+// Returns the average luma sample value of the block at (mi_row, mi_col),
+// counting only samples that lie inside the picture; returns 0 when no sample
+// does.
+// NOTE(review): the source buffer is always read as 16-bit samples and is
+// indexed with absolute picture coordinates; confirm callers only use this
+// for high-bitdepth buffers whose src.buf is the frame origin.
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col) {
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+  const int first_row = mi_row << MI_SIZE_LOG2;
+  const int first_col = mi_col << MI_SIZE_LOG2;
+
+  // Exclusive end coordinates, clipped to the picture size.
+  int row_end = first_row + MI_SIZE * mi_size_high[bs];
+  int col_end = first_col + MI_SIZE * mi_size_wide[bs];
+  if (row_end > cpi->common.height) row_end = cpi->common.height;
+  if (col_end > cpi->common.width) col_end = cpi->common.width;
+
+  unsigned int total = 0;
+  unsigned int count = 0;
+  int r, c;
+  for (r = first_row; r < row_end; ++r) {
+    for (c = first_col; c < col_end; ++c) {
+      total += src16[r * x->plane[0].src.stride + c];
+      ++count;
+    }
+  }
+
+  return (count != 0) ? (int)(total / count) : 0;
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+// Returns the Haar-wavelet AC energy of the luma source block (x, bs): the
+// value from av1_haar_ac_sad_mxn_uint8_input() over the block's 8x8 units,
+// scaled by 256 and divided by the pixel count
+// (>> num_pels_log2_lookup[bs]).
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int num_8x8_cols = block_size_wide[bs] / 8;
+  const int num_8x8_rows = block_size_high[bs] / 8;
+  const int hbd = is_cur_buf_hbd(xd);
+
+  int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+                                                num_8x8_cols);
+
+  // Scale and normalise in 64 bits and narrow only afterwards. A cast binds
+  // tighter than >>, so casting first would drop the high bits of var * 256
+  // before the shift whenever the product exceeds 32 bits.
+  return (unsigned int)(((uint64_t)var * 256) >> num_pels_log2_lookup[bs]);
+}
+
+// Log-domain wavelet energy of the block: log1p() of haar_ac_energy().
+static double log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  return log1p(haar_ac_energy(x, bs));
+}
+
+// Returns the block's wavelet energy level: its log wavelet energy relative
+// to a midpoint, rounded and clamped to [ENERGY_MIN, ENERGY_MAX]. The
+// midpoint is the frame average Haar energy in the two-pass stat consumption
+// stage and DEFAULT_E_MIDPOINT otherwise.
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  double midpoint = DEFAULT_E_MIDPOINT;
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    midpoint = cpi->twopass_frame.frame_avg_haar_energy;
+  }
+
+  const double relative_energy = log_block_wavelet_energy(x, bs) - midpoint;
+  return clamp((int)round(relative_energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+// Returns the qindex for a block with the given energy level: the frame's
+// base qindex plus the q delta matching deltaq_rate_ratio[] for the block's
+// rate level. With DELTA_Q_PERCEPTUAL_MODULATION == 1 the rate level is
+// looked up from the energy level; otherwise the level is used directly.
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int base_q = cm->quant_params.base_qindex;
+
+  int rate_level = block_var_level;
+  if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+    ENERGY_IN_BOUNDS(block_var_level);
+    rate_level = SEGMENT_ID(block_var_level);
+  }
+
+  int q_delta = av1_compute_qdelta_by_rate(
+      cpi, cm->current_frame.frame_type, base_q, deltaq_rate_ratio[rate_level]);
+
+  // A nonzero base qindex must not be moved onto qindex 0; use 1 instead.
+  const int lands_on_zero = (base_q != 0) && ((base_q + q_delta) == 0);
+  if (lands_on_zero) {
+    q_delta = -base_q + 1;
+  }
+
+  return base_q + q_delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 0000000000..aa0535ad72
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ int mi_row, int mi_col);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+ int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
new file mode 100644
index 0000000000..91fc1e00a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
+#include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+// Folds sizeof(type)-byte chunks into crc with intrinsic op for as long as at
+// least one full chunk remains, advancing buf and decreasing len as it goes.
+// buf and len are modified in place, so both must be lvalues. The data is
+// read through a plain (type *) cast of buf.
+#define CRC_LOOP(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+// Same as CRC_LOOP, but consumes at most one sizeof(type)-byte chunk.
+#define CRC_SINGLE(op, crc, type, buf, len) \
+ if ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+/* Return 32-bit CRC for the input buffer.
+ * Polynomial is 0x1EDC6F41.
+ *
+ * crc_calculator is unused by this implementation. p points to the data and
+ * len is its size in bytes. The CRC starts from 0xFFFFFFFF and the final
+ * value is returned bit-inverted.
+ */
+
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+#if !AOM_ARCH_AARCH64
+ // Align input to 8-byte boundary (only necessary for 32-bit builds.)
+ while (len && ((uintptr_t)buf & 7)) {
+ crc = __crc32cb(crc, *buf++);
+ len--;
+ }
+#endif
+
+ // Consume the bulk in 8-byte chunks, then at most one 4-, 2- and 1-byte
+ // chunk for the remaining tail.
+ CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
+ CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
+ CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
+ CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
+
+ return ~crc;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 0000000000..26d06b46fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Returns the sum of squared differences between coeff[] and dqcoeff[] over
+// block_size entries and stores the sum of squared coeff[] values in *ssz.
+// block_size must be a non-zero multiple of 16 (asserted).
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  uint64x2_t sse_acc = vdupq_n_u64(0);
+  int64x2_t energy_acc = vdupq_n_s64(0);
+
+  do {
+    const int16x8_t coeff_a = load_tran_low_to_s16q(coeff);
+    const int16x8_t coeff_b = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t dq_a = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t dq_b = load_tran_low_to_s16q(dqcoeff + 8);
+
+    // Per-lane absolute difference, reinterpreted as unsigned 16-bit.
+    const uint16x8_t absdiff_a =
+        vreinterpretq_u16_s16(vabdq_s16(coeff_a, dq_a));
+    const uint16x8_t absdiff_b =
+        vreinterpretq_u16_s16(vabdq_s16(coeff_b, dq_b));
+    const uint16x4_t ad_a_lo = vget_low_u16(absdiff_a);
+    const uint16x4_t ad_a_hi = vget_high_u16(absdiff_a);
+    const uint16x4_t ad_b_lo = vget_low_u16(absdiff_b);
+    const uint16x4_t ad_b_hi = vget_high_u16(absdiff_b);
+
+    // Working on unsigned values lets four squared differences share one
+    // 32-bit lane before the pairwise widening add into 64 bits.
+    uint32x4_t sq_diff = vmull_u16(ad_a_lo, ad_a_lo);
+    sq_diff = vmlal_u16(sq_diff, ad_a_hi, ad_a_hi);
+    sq_diff = vmlal_u16(sq_diff, ad_b_lo, ad_b_lo);
+    sq_diff = vmlal_u16(sq_diff, ad_b_hi, ad_b_hi);
+    sse_acc = vpadalq_u32(sse_acc, sq_diff);
+
+    // The coefficient squares are signed products, so only two of them are
+    // accumulated per 32-bit lane before widening.
+    const int16x4_t ca_lo = vget_low_s16(coeff_a);
+    const int16x4_t ca_hi = vget_high_s16(coeff_a);
+    const int32x4_t sq_a = vmlal_s16(vmull_s16(ca_lo, ca_lo), ca_hi, ca_hi);
+    energy_acc = vpadalq_s32(energy_acc, sq_a);
+
+    const int16x4_t cb_lo = vget_low_s16(coeff_b);
+    const int16x4_t cb_hi = vget_high_s16(coeff_b);
+    const int32x4_t sq_b = vmlal_s16(vmull_s16(cb_lo, cb_lo), cb_hi, cb_hi);
+    energy_acc = vpadalq_s32(energy_acc, sq_b);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = horizontal_add_s64x2(energy_acc);
+  return (int64_t)horizontal_add_u64x2(sse_acc);
+}
+
+// Returns the sum of squared differences between the 16-bit arrays coeff[]
+// and dqcoeff[] over block_size entries. block_size must be a non-zero
+// multiple of 16 (asserted).
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  uint64x2_t sse_acc = vdupq_n_u64(0);
+
+  do {
+    const int16x8_t coeff_a = vld1q_s16(coeff);
+    const int16x8_t coeff_b = vld1q_s16(coeff + 8);
+    const int16x8_t dq_a = vld1q_s16(dqcoeff);
+    const int16x8_t dq_b = vld1q_s16(dqcoeff + 8);
+
+    // Per-lane absolute difference, reinterpreted as unsigned 16-bit.
+    const uint16x8_t absdiff_a =
+        vreinterpretq_u16_s16(vabdq_s16(coeff_a, dq_a));
+    const uint16x8_t absdiff_b =
+        vreinterpretq_u16_s16(vabdq_s16(coeff_b, dq_b));
+    const uint16x4_t ad_a_lo = vget_low_u16(absdiff_a);
+    const uint16x4_t ad_a_hi = vget_high_u16(absdiff_a);
+    const uint16x4_t ad_b_lo = vget_low_u16(absdiff_b);
+    const uint16x4_t ad_b_hi = vget_high_u16(absdiff_b);
+
+    // Working on unsigned values lets four squared differences share one
+    // 32-bit lane before the pairwise widening add into 64 bits.
+    uint32x4_t sq_diff = vmull_u16(ad_a_lo, ad_a_lo);
+    sq_diff = vmlal_u16(sq_diff, ad_a_hi, ad_a_hi);
+    sq_diff = vmlal_u16(sq_diff, ad_b_lo, ad_b_lo);
+    sq_diff = vmlal_u16(sq_diff, ad_b_hi, ad_b_hi);
+    sse_acc = vpadalq_u32(sse_acc, sq_diff);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return (int64_t)horizontal_add_u64x2(sse_acc);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
new file mode 100644
index 0000000000..63aad0b785
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+// Returns the sum of squared differences between coeff[] and dqcoeff[] over
+// block_size entries and stores the sum of squared coeff[] values in *ssz.
+// block_size must be a non-zero multiple of 16 (asserted).
+int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz) {
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  // Two independent accumulators per quantity, one for each 8-lane half of
+  // the 16 coefficients handled per iteration.
+  int64x2_t err_a = vdupq_n_s64(0);
+  int64x2_t err_b = vdupq_n_s64(0);
+  int64x2_t energy_a = vdupq_n_s64(0);
+  int64x2_t energy_b = vdupq_n_s64(0);
+
+  do {
+    const int16x8_t coeff_a = load_tran_low_to_s16q(coeff);
+    const int16x8_t coeff_b = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t dq_a = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t dq_b = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t delta_a = vsubq_s16(coeff_a, dq_a);
+    const int16x8_t delta_b = vsubq_s16(coeff_b, dq_b);
+
+    // Dot product of each vector with itself, accumulated into 64 bits.
+    err_a = aom_sdotq_s16(err_a, delta_a, delta_a);
+    err_b = aom_sdotq_s16(err_b, delta_b, delta_b);
+    energy_a = aom_sdotq_s16(energy_a, coeff_a, coeff_a);
+    energy_b = aom_sdotq_s16(energy_b, coeff_b, coeff_b);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  const int64x2_t energy_total = vaddq_s64(energy_a, energy_b);
+  const int64x2_t err_total = vaddq_s64(err_a, err_b);
+  *ssz = vaddvq_s64(energy_total);
+  return vaddvq_s64(err_total);
+}
+
+// Returns the sum of squared differences between the 16-bit arrays coeff[]
+// and dqcoeff[] over block_size entries.
+//
+// block_size must be a non-zero multiple of 16. Multiples of 32 take a path
+// that processes 32 coefficients per iteration; the remaining path is only
+// expected to see block_size == 16 (asserted below).
+int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
+                               int block_size) {
+  // Same preconditions as the other block-error kernels. Without them a
+  // block_size of 0 would satisfy `% 32 == 0` and enter the do/while below,
+  // reading past the end of both buffers.
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  if (block_size % 32 == 0) {
+    // Four independent accumulators, one per 8-lane vector of the 32
+    // coefficients handled per iteration.
+    int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                           vdupq_n_s64(0) };
+
+    do {
+      const int16x8_t c0 = vld1q_s16(coeff);
+      const int16x8_t c1 = vld1q_s16(coeff + 8);
+      const int16x8_t c2 = vld1q_s16(coeff + 16);
+      const int16x8_t c3 = vld1q_s16(coeff + 24);
+      const int16x8_t d0 = vld1q_s16(dqcoeff);
+      const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+      const int16x8_t d2 = vld1q_s16(dqcoeff + 16);
+      const int16x8_t d3 = vld1q_s16(dqcoeff + 24);
+
+      const int16x8_t diff0 = vsubq_s16(c0, d0);
+      const int16x8_t diff1 = vsubq_s16(c1, d1);
+      const int16x8_t diff2 = vsubq_s16(c2, d2);
+      const int16x8_t diff3 = vsubq_s16(c3, d3);
+
+      // Dot product of each difference vector with itself.
+      error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+      error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+      error[2] = aom_sdotq_s16(error[2], diff2, diff2);
+      error[3] = aom_sdotq_s16(error[3], diff3, diff3);
+
+      coeff += 32;
+      dqcoeff += 32;
+      block_size -= 32;
+    } while (block_size != 0);
+
+    // Pairwise reduction of the four accumulators, then across lanes.
+    error[0] = vaddq_s64(error[0], error[1]);
+    error[2] = vaddq_s64(error[2], error[3]);
+    error[0] = vaddq_s64(error[0], error[2]);
+    return vaddvq_s64(error[0]);
+  }
+  assert(block_size == 16);
+
+  int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    const int16x8_t c0 = vld1q_s16(coeff);
+    const int16x8_t c1 = vld1q_s16(coeff + 8);
+    const int16x8_t d0 = vld1q_s16(dqcoeff);
+    const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+    const int16x8_t diff0 = vsubq_s16(c0, d0);
+    const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+    error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+    error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000000..5148ee74a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,3090 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+#define TXFM_COS_BIT_MAX 13
+
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+// | | | ^ Weights are applied as indices 0, 2, 3, 1
+// | | | (see more detail below)
+// | | ^ (int32)x4 input/output parameters
+// | ^ 32-bit accumulators internally
+// ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+// in0 in1
+// /----------
+// out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1
+// out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0
+// out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0)
+
+// 32-bit butterfly on four lanes using weight lanes (0, 1, 1, 2):
+//   out0 = in0 * w[0] + in1 * w[1]
+//   out1 = in0 * w[1] + in1 * w[2]
+// Both results are rounded and shifted right by TXFM_COS_BIT_MAX.
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  // Widen the weights to 32 bits: lanes 0-1 end up in the low half, lanes 2-3
+  // in the high half.
+  const int32x4_t w = vmovl_s16(w0101_s16);
+  const int32x2_t w_01 = vget_low_s32(w);
+  const int32x2_t w_23 = vget_high_s32(w);
+
+  const int32x4_t acc0 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 0), in1, w_01, 1);
+  const int32x4_t acc1 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 1), in1, w_23, 0);
+
+  *out0 = vrshrq_n_s32(acc0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(acc1, TXFM_COS_BIT_MAX);
+}
+
+// 32-bit butterfly on four lanes using weight lanes (0, 3, 3, 2):
+//   out0 = in0 * w[0] + in1 * w[3]
+//   out1 = in0 * w[3] + in1 * w[2]
+// Both results are rounded and shifted right by TXFM_COS_BIT_MAX.
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  // Widen the weights to 32 bits: lanes 0-1 end up in the low half, lanes 2-3
+  // in the high half.
+  const int32x4_t w = vmovl_s16(w0101_s16);
+  const int32x2_t w_01 = vget_low_s32(w);
+  const int32x2_t w_23 = vget_high_s32(w);
+
+  const int32x4_t acc0 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 0), in1, w_23, 1);
+  const int32x4_t acc1 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_23, 1), in1, w_23, 0);
+
+  *out0 = vrshrq_n_s32(acc0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(acc1, TXFM_COS_BIT_MAX);
+}
+
+// 32-bit butterfly on four lanes using weight lanes (1, 0, 0, 3):
+//   out0 = in0 * w[1] + in1 * w[0]
+//   out1 = in0 * w[0] + in1 * w[3]
+// Both results are rounded and shifted right by TXFM_COS_BIT_MAX.
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  // Widen the weights to 32 bits: lanes 0-1 end up in the low half, lanes 2-3
+  // in the high half.
+  const int32x4_t w = vmovl_s16(w0101_s16);
+  const int32x2_t w_01 = vget_low_s32(w);
+  const int32x2_t w_23 = vget_high_s32(w);
+
+  const int32x4_t acc0 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 1), in1, w_01, 0);
+  const int32x4_t acc1 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 0), in1, w_23, 1);
+
+  *out0 = vrshrq_n_s32(acc0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(acc1, TXFM_COS_BIT_MAX);
+}
+
+// 32-bit butterfly on four lanes using weight lanes (1, 2, 2, 3):
+//   out0 = in0 * w[1] + in1 * w[2]
+//   out1 = in0 * w[2] + in1 * w[3]
+// Both results are rounded and shifted right by TXFM_COS_BIT_MAX.
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+    int32x4_t *out0, int32x4_t *out1) {
+  // Widen the weights to 32 bits: lanes 0-1 end up in the low half, lanes 2-3
+  // in the high half.
+  const int32x4_t w = vmovl_s16(w0101_s16);
+  const int32x2_t w_01 = vget_low_s32(w);
+  const int32x2_t w_23 = vget_high_s32(w);
+
+  const int32x4_t acc0 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_01, 1), in1, w_23, 0);
+  const int32x4_t acc1 =
+      vmlaq_lane_s32(vmulq_lane_s32(in0, w_23, 0), in1, w_23, 1);
+
+  *out0 = vrshrq_n_s32(acc0, TXFM_COS_BIT_MAX);
+  *out1 = vrshrq_n_s32(acc1, TXFM_COS_BIT_MAX);
+}
+
+// Generic 4-lane butterfly on 16-bit inputs with 32-bit accumulation:
+//   *out0 = in0 * wvec[lane0] + in1 * wvec[lane1]
+//   *out1 = in0 * wvec[lane2] + in1 * wvec[lane3]
+// The 32-bit sums are rounded, shifted right by TXFM_COS_BIT_MAX and narrowed
+// to 16 bits with saturation (vqrshrn). The lane arguments must be
+// compile-time constants; out0/out1 are pointers to int16x4_t.
+#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \
+ *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ } while (0)
+
+// 4-lane 16-bit butterfly using weight lanes (0, 1, 1, 2):
+//   out0 = in0 * w[0] + in1 * w[1],  out1 = in0 * w[1] + in1 * w[2]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+// 4-lane 16-bit butterfly using weight lanes (0, 3, 3, 2):
+//   out0 = in0 * w[0] + in1 * w[3],  out1 = in0 * w[3] + in1 * w[2]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+// 4-lane 16-bit butterfly using weight lanes (1, 0, 0, 3):
+//   out0 = in0 * w[1] + in1 * w[0],  out1 = in0 * w[0] + in1 * w[3]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+// 4-lane 16-bit butterfly using weight lanes (1, 2, 2, 3):
+//   out0 = in0 * w[1] + in1 * w[2],  out1 = in0 * w[2] + in1 * w[3]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+// Generic 8-lane butterfly on 16-bit inputs with 32-bit accumulation. The
+// low and high halves of in0/in1 are processed separately:
+//   *out0 = in0 * wvec[lane0] + in1 * wvec[lane1]
+//   *out1 = in0 * wvec[lane2] + in1 * wvec[lane3]
+// The 32-bit sums are rounded, shifted right by TXFM_COS_BIT_MAX and narrowed
+// to 16 bits with vrshrn (a non-saturating narrow, unlike the x4 macro which
+// uses vqrshrn). The lane arguments must be compile-time constants;
+// out0/out1 are pointers to int16x8_t.
+#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \
+ int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \
+ u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \
+ int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \
+ v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \
+ const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \
+ const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \
+ *out0 = vcombine_s16(c0, c1); \
+ *out1 = vcombine_s16(d0, d1); \
+ } while (0)
+
+// 8-lane 16-bit butterfly using weight lanes (0, 1, 1, 2):
+//   out0 = in0 * w[0] + in1 * w[1],  out1 = in0 * w[1] + in1 * w[2]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+// 8-lane 16-bit butterfly using weight lanes (0, 3, 3, 2):
+//   out0 = in0 * w[0] + in1 * w[3],  out1 = in0 * w[3] + in1 * w[2]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+// 8-lane 16-bit butterfly using weight lanes (1, 0, 0, 3):
+//   out0 = in0 * w[1] + in1 * w[0],  out1 = in0 * w[0] + in1 * w[3]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+// 8-lane 16-bit butterfly using weight lanes (1, 2, 2, 3):
+//   out0 = in0 * w[1] + in1 * w[2],  out1 = in0 * w[2] + in1 * w[3]
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+// Copies the `size` vectors of `in` into `out` in reverse order.
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+                                             int size) {
+  int dst = size;
+  for (int src = 0; src < size; ++src) {
+    --dst;
+    out[dst] = in[src];
+  }
+}
+
+// Copies the `size` vectors of `in` into `out` in reverse order.
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+                                             int size) {
+  int dst = size;
+  for (int src = 0; src < size; ++src) {
+    --dst;
+    out[dst] = in[src];
+  }
+}
+
+// Writes out_size rows of eight int32 values: row i starts at out + stride * i
+// and holds in1[i] in its first four elements and in2[i] in the next four.
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+    const int stride, const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst = out + stride * row;
+    vst1q_s32(dst, in1[row]);
+    vst1q_s32(dst + 4, in2[row]);
+  }
+}
+
+// Loads out_size rows of four int16 values; row i is read from
+// in + i * stride into out[i].
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+                                                const int stride,
+                                                int16x4_t *const out,
+                                                const int out_size) {
+  const int16_t *src = in;
+  int row = 0;
+  while (row < out_size) {
+    out[row] = vld1_s16(src);
+    src += stride;
+    ++row;
+  }
+}
+
+// Loads out_size rows of eight int16 values; row i is read from
+// in + i * stride into out[i].
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+                                                int16x8_t *out, int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    const int16_t *const src = in + row * stride;
+    out[row] = vld1q_s16(src);
+  }
+}
+
+// Sign-extends each 4-lane int16 vector in[i] to int32 and stores it at
+// out + i * stride, for i in [0, out_size).
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    const int32x4_t widened = vmovl_s16(in[row]);
+    vst1q_s32(out + row * stride, widened);
+  }
+}
+
+// Sign-extends each 8-lane int16 vector in[i] to int32 and stores the eight
+// results contiguously at out + i * stride, for i in [0, out_size).
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+                                                 int32_t *const out,
+                                                 const int stride,
+                                                 const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst = out + row * stride;
+    const int32x4_t lo = vmovl_s16(vget_low_s16(in[row]));
+    const int32x4_t hi = vmovl_s16(vget_high_s16(in[row]));
+    vst1q_s32(dst + 0, lo);
+    vst1q_s32(dst + 4, hi);
+  }
+}
+
+// A note on naming:
+// round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+// | | | ^ 1 => a single vector
+// | | | n => an array of vectors
+// | | | ^ input/output vector element count
+// | | ^ output type
+// | ^ input type
+// ^ multiplicand and shift identifier
+
+// Multiplies each lane by NewSqrt2 in 32 bits, then rounds, shifts right by
+// NewSqrt2Bits and narrows back to int16 with saturation.
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  const int32x4_t scaled = vmull_n_s16(a, NewSqrt2);
+  return vqrshrn_n_s32(scaled, NewSqrt2Bits);
+}
+
+// 8-lane variant: applies round_shift_sqrt2_s16_s16_4x1_neon to the low and
+// high halves and recombines them.
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  const int16x4_t lo = round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a));
+  const int16x4_t hi = round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a));
+  return vcombine_s16(lo, hi);
+}
+
+// Multiplies each lane by 2 * NewSqrt2 in 32 bits, then rounds, shifts right
+// by NewSqrt2Bits and narrows back to int16 with saturation.
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+  const int32x4_t scaled = vmull_n_s16(a, 2 * NewSqrt2);
+  return vqrshrn_n_s32(scaled, NewSqrt2Bits);
+}
+
+// 8-lane variant: applies round_shift_2sqrt2_s16_s16_4x1_neon to the low and
+// high halves and recombines them.
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+  const int16x4_t lo = round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a));
+  const int16x4_t hi = round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a));
+  return vcombine_s16(lo, hi);
+}
+
+// Multiplies each int16 lane by NewSqrt2 into 32 bits, then rounds and shifts
+// right by NewSqrt2Bits; the result stays 32-bit.
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+  const int32x4_t scaled = vmull_n_s16(a, NewSqrt2);
+  return vrshrq_n_s32(scaled, NewSqrt2Bits);
+}
+
+// Multiplies each int32 lane by NewSqrt2 (32-bit product), then rounds and
+// shifts right by NewSqrt2Bits.
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+  const int32x4_t scaled = vmulq_n_s32(a, NewSqrt2);
+  return vrshrq_n_s32(scaled, NewSqrt2Bits);
+}
+
+// Defines a static inline function `name(in, out, size)` that applies the
+// single-vector helper `fn` to each of the `size` elements of `in` (element
+// type `type0`) and stores the results in `out` (element type `type1`).
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \
+ static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
+ for (int i = 0; i < size; ++i) { \
+ out[i] = fn(in[i]); \
+ } \
+ }
+
+// Array ("xn") variants of the single-vector ("x1") round-shift helpers.
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+ int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
+
+// For each i in [0, out_size): scales the 4-lane int16 vector in[i] with
+// round_shift_sqrt2_s16_s32_4x1_neon and stores the four int32 results at
+// out + i * stride.
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    const int32x4_t scaled = round_shift_sqrt2_s16_s32_4x1_neon(in[row]);
+    vst1q_s32(out + row * stride, scaled);
+  }
+}
+
+// For each i in [0, out_size): scales both halves of the 8-lane int16 vector
+// in[i] with round_shift_sqrt2_s16_s32_4x1_neon and stores the eight int32
+// results contiguously at out + i * stride.
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+                                                      int32_t *const out,
+                                                      const int stride,
+                                                      const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst = out + row * stride;
+    const int32x4_t lo =
+        round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[row]));
+    const int32x4_t hi =
+        round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[row]));
+    vst1q_s32(dst + 0, lo);
+    vst1q_s32(dst + 4, hi);
+  }
+}
+
+// 4-point forward ADST applied to four 4-lane int16 vectors.
+// input[0..3] are the four transform inputs, output[0..3] receive the results;
+// cos_bit selects the weight table returned by sinpi_arr_q13(). Below, s0..s3
+// denote lanes 0..3 of that table. Products are accumulated in 32 bits and
+// narrowed with a rounding shift by TXFM_COS_BIT_MAX.
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ int32x4_t u[6], v[6];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ // Saturating 16-bit sum in0 + in1, used for v[2].
+ const int16x4_t u01 = vqadd_s16(input[0], input[1]);
+
+ // v[5] = in2 * s2 is shared by v[1], v[4] and u[3].
+ v[5] = vmull_lane_s16(input[2], sinpi, 2);
+ // v[0] = in1 * s1 + in0 * s0
+ v[0] = vmull_lane_s16(input[1], sinpi, 1);
+ v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+ // v[1] = in2 * s2 + in3 * s3
+ v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+ // v[2] = (in0 + in1) * s2
+ v[2] = vmull_lane_s16(u01, sinpi, 2);
+ // v[3] = in0 * s3 - in1 * s0
+ v[3] = vmull_lane_s16(input[0], sinpi, 3);
+ v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+ // v[4] = in2 * s2 - in3 * s1
+ v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
+
+ u[0] = vaddq_s32(v[0], v[1]);
+ // u[1] = (in0 + in1 - in3) * s2
+ u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
+ u[2] = vsubq_s32(v[3], v[4]);
+ // u[3] = u[2] - u[0] + 3 * (in2 * s2)
+ u[3] = vsubq_s32(u[2], u[0]);
+ u[3] = vmlaq_n_s32(u[3], v[5], 3);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+// 8-point forward ADST applied to eight 4-lane int16 vectors.
+// input[0..7] are the transform inputs, output[0..7] receive the results;
+// cos_bit selects the weight table returned by cospi_arr_q13(). All additions
+// and subtractions between stages are saturating 16-bit operations.
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 128-bit load fetches two adjacent 4-element weight groups from the
+ // table; they are split into separate 64-bit vectors below.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1-2
+ // Only x2[2], x2[3], x2[6] and x2[7] are produced; the remaining stage-2
+ // values are taken directly from input[] in stage 3.
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3
+ int16x4_t x3[8];
+ x3[0] = vqadd_s16(input[0], x2[2]);
+ x3[1] = vqsub_s16(x2[3], input[7]);
+ x3[2] = vqsub_s16(input[0], x2[2]);
+ x3[3] = vqadd_s16(input[7], x2[3]);
+ x3[4] = vqsub_s16(x2[6], input[1]);
+ x3[5] = vqadd_s16(input[6], x2[7]);
+ x3[6] = vqadd_s16(input[1], x2[6]);
+ x3[7] = vqsub_s16(input[6], x2[7]);
+
+ // stage 4
+ // Only x4[4..7] are produced; x3[0..3] pass through to stage 5 unchanged.
+ int16x4_t x4[8];
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);
+
+ // stage 5
+ int16x4_t x5[8];
+ x5[0] = vqadd_s16(x3[0], x4[4]);
+ x5[1] = vqadd_s16(x3[1], x4[5]);
+ x5[2] = vqadd_s16(x3[2], x4[6]);
+ x5[3] = vqsub_s16(x4[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x4[4]);
+ x5[5] = vqsub_s16(x3[1], x4[5]);
+ x5[6] = vqsub_s16(x3[2], x4[6]);
+ x5[7] = vqadd_s16(x3[3], x4[7]);
+
+ // stage 6-7
+ // The butterflies write straight into output[] at their final positions.
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+// 4-point forward ADST applied to four 8-lane int16 vectors.
+// input[0..3] are the transform inputs, output[0..3] receive the results;
+// cos_bit selects the weight table returned by sinpi_arr_q13(). Each 8-lane
+// vector is processed as a low half (u_lo) and a high half (u_hi) with
+// 32-bit accumulators. Below, s0..s3 denote lanes 0..3 of the weight table.
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ int32x4_t u_lo[4], u_hi[4];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ // Saturating 16-bit sum in0 + in1, used for u[1].
+ const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
+
+ // u[0] = in1 * s1 + in0 * s0 + in3 * s3 + in2 * s2
+ u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
+ u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);
+
+ // u[1] = (in0 + in1) * s2; the in3 * s2 term is subtracted further down.
+ u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
+ u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);
+
+ // u[2] = in0 * s3 - in1 * s0 + in3 * s1 - in2 * s2
+ u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
+ u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);
+
+ u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
+ u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);
+
+ u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
+ u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);
+
+ // u[3] = u[2] - u[0] + in2 * (3 * s2)
+ u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
+ u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
+
+ // The weights are tripled in 16-bit arithmetic (vmul_n_s16) before the
+ // widening multiply-accumulate.
+ const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
+ u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
+ u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);
+
+ // Rounding shift by TXFM_COS_BIT_MAX, narrow to 16 bits, rejoin the halves.
+ output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
+ output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
+ output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
+ output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
+}
+
+// 4-point forward DCT applied to four 4-lane int16 vectors.
+// input[0..3] are the transform inputs, output[0..3] receive the results;
+// cos_bit selects the weight table returned by cospi_arr_q13().
+// Note: the input sums/differences here use plain (wrapping) vadd/vsub rather
+// than the saturating forms used by the other transforms in this file.
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // Second 4-element weight group of the table; lanes 0 and 1 are used below.
+ const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
+
+ int16x4_t in12a = vadd_s16(input[1], input[2]);
+ int16x4_t in12s = vsub_s16(input[1], input[2]);
+ int16x4_t in03a = vadd_s16(input[0], input[3]);
+ int16x4_t in03s = vsub_s16(input[0], input[3]);
+
+ // Both sums are scaled by the first table entry, cospi[0].
+ int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+ int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
+
+ int32x4_t u[4];
+ u[0] = vaddq_s32(u0ad1, u0ad2);
+ u[1] = vsubq_s32(u0ad2, u0ad1);
+ // u[2] = in12s * w[1] + in03s * w[0]
+ u[2] = vmull_lane_s16(in12s, cospi16, 1);
+ u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
+ // u[3] = in03s * w[1] - in12s * w[0]
+ u[3] = vmull_lane_s16(in03s, cospi16, 1);
+ u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);
+
+ // Outputs are written in the order u[0], u[2], u[1], u[3].
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
+
+// First half of output: saturating sums of mirrored pairs input[i] and
+// input[n-1-i]. Second half: saturating differences taken outwards from the
+// centre, input[n/2-1-i] - input[n/2+i].
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
+                                                      int16x4_t *output,
+                                                      int n) {
+  const int half = n / 2;
+  for (int lo = 0, hi = n - 1; lo < half; ++lo, --hi) {
+    output[lo] = vqadd_s16(input[lo], input[hi]);
+  }
+  for (int k = 0; k < half; ++k) {
+    output[half + k] = vqsub_s16(input[half - 1 - k], input[half + k]);
+  }
+}
+
+// First half of output: saturating sums of mirrored pairs input[i] and
+// input[n-1-i]. Second half: saturating differences taken outwards from the
+// centre, input[n/2-1-i] - input[n/2+i].
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
+                                                      int16x8_t *output,
+                                                      int n) {
+  const int half = n / 2;
+  for (int lo = 0, hi = n - 1; lo < half; ++lo, --hi) {
+    output[lo] = vqaddq_s16(input[lo], input[hi]);
+  }
+  for (int k = 0; k < half; ++k) {
+    output[half + k] = vqsubq_s16(input[half - 1 - k], input[half + k]);
+  }
+}
+
+// First half of output: saturating sums of mirrored pairs input[i] and
+// input[n-1-i]. Second half: saturating differences taken outwards from the
+// centre, input[n/2-1-i] - input[n/2+i].
+static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
+                                                      int32x4_t *output,
+                                                      int n) {
+  const int half = n / 2;
+  for (int lo = 0, hi = n - 1; lo < half; ++lo, --hi) {
+    output[lo] = vqaddq_s32(input[lo], input[hi]);
+  }
+  for (int k = 0; k < half; ++k) {
+    output[half + k] = vqsubq_s32(input[half - 1 - k], input[half + k]);
+  }
+}
+
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+// Combines in0[] and in1[] into output[] in four quarter-length passes using
+// saturating arithmetic: sums, differences, differences, sums.
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
+                                                       const int16x4_t *in1,
+                                                       int16x4_t *output,
+                                                       int n) {
+  const int quarter = n / 4;
+  const int half = n / 2;
+  const int three_quarters = (3 * n) / 4;
+
+  for (int k = 0; k < quarter; ++k) {
+    output[k] = vqadd_s16(in0[k], in1[half - 1 - k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[quarter + k] = vqsub_s16(in0[quarter - 1 - k], in1[quarter + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[half + k] = vqsub_s16(in0[n - 1 - k], in1[half + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[three_quarters + k] =
+        vqadd_s16(in0[three_quarters + k], in1[three_quarters - 1 - k]);
+  }
+}
+
+// Combines in0[] and in1[] into output[] in four quarter-length passes using
+// saturating arithmetic: sums, differences, differences, sums.
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
+                                                       const int16x8_t *in1,
+                                                       int16x8_t *output,
+                                                       int n) {
+  const int quarter = n / 4;
+  const int half = n / 2;
+  const int three_quarters = (3 * n) / 4;
+
+  for (int k = 0; k < quarter; ++k) {
+    output[k] = vqaddq_s16(in0[k], in1[half - 1 - k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[quarter + k] = vqsubq_s16(in0[quarter - 1 - k], in1[quarter + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[half + k] = vqsubq_s16(in0[n - 1 - k], in1[half + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[three_quarters + k] =
+        vqaddq_s16(in0[three_quarters + k], in1[three_quarters - 1 - k]);
+  }
+}
+
+// Combines in0[] and in1[] into output[] in four quarter-length passes using
+// saturating arithmetic: sums, differences, differences, sums.
+static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
+                                                       const int32x4_t *in1,
+                                                       int32x4_t *output,
+                                                       int n) {
+  const int quarter = n / 4;
+  const int half = n / 2;
+  const int three_quarters = (3 * n) / 4;
+
+  for (int k = 0; k < quarter; ++k) {
+    output[k] = vqaddq_s32(in0[k], in1[half - 1 - k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[quarter + k] = vqsubq_s32(in0[quarter - 1 - k], in1[quarter + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[half + k] = vqsubq_s32(in0[n - 1 - k], in1[half + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[three_quarters + k] =
+        vqaddq_s32(in0[three_quarters + k], in1[three_quarters - 1 - k]);
+  }
+}
+
+// 4-point forward DCT applied to four 8-lane int16 vectors.
+// input[0..3] are the transform inputs, output[0..3] receive the results;
+// cos_bit selects the weight table returned by cospi_arr_q13().
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+                                          int16x8_t *output, int cos_bit) {
+  const int16_t *const weights = cospi_arr_q13(cos_bit);
+
+  // The first two 4-element weight groups of the table.
+  const int16x4_t cospi32 = vld1_s16(&weights[4 * 0]);
+  const int16x4_t cospi16 = vld1_s16(&weights[4 * 0] + 4);
+
+  // stage 1: sums and differences of mirrored inputs.
+  int16x8_t stage1[4];
+  butterfly_dct_pre_s16_x8(input, stage1, 4);
+
+  // stage 2
+  int16x8_t even0, even1, odd0, odd1;
+  butterfly_s16_s32_x8_0112_neon(cospi32, stage1[0], stage1[1], &even0, &even1);
+  butterfly_s16_s32_x8_0112_neon(cospi16, stage1[3], stage1[2], &odd0, &odd1);
+
+  // stage 3: interleave the even and odd results.
+  output[0] = even0;
+  output[1] = odd0;
+  output[2] = even1;
+  output[3] = odd1;
+}
+
+// 8-point forward DCT applied to eight 4-lane int16 vectors.
+// input[0..7] are the transform inputs, output[0..7] receive the results;
+// cos_bit selects the weight table returned by cospi_arr_q13().
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 128-bit load fetches two adjacent 4-element weight groups from the
+ // table; they are split into separate 64-bit vectors below.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x4_t x1[8];
+ butterfly_dct_pre_s16_x4(input, x1, 8);
+
+ // stage 2
+ // x2[0..3] come from the pre-butterfly of x1[0..3]; only x2[5] and x2[6] of
+ // the upper half are written. x2[4] and x2[7] stay unset and are not read:
+ // stage 3 takes those positions from x1 instead.
+ int16x4_t x2[8];
+ butterfly_dct_pre_s16_x4(x1, x2, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ // Even outputs are final here; x3[4..7] feed the last stage.
+ int16x4_t x3[8];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+// 8-point forward DCT applied to eight 8-lane int16 vectors.
+// input[0..7] are the transform inputs, output[0..7] receive the results;
+// cos_bit selects the weight table returned by cospi_arr_q13().
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 128-bit load fetches two adjacent 4-element weight groups from the
+ // table; they are split into separate 64-bit vectors below.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x8_t x1[8];
+ butterfly_dct_pre_s16_x8(input, x1, 8);
+
+ // stage 2
+ // x2[0..3] come from the pre-butterfly of x1[0..3]; only x2[5] and x2[6] of
+ // the upper half are written. x2[4] and x2[7] stay unset and are not read:
+ // stage 3 takes those positions from x1 instead.
+ int16x8_t x2[8];
+ butterfly_dct_pre_s16_x8(x1, x2, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ // Even outputs are final here; x3[4..7] feed the last stage.
+ int16x8_t x3[8];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 16-point forward DCT on int16x4_t vectors; every output[0..15] is assigned below. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1: pre-butterfly helper over the 16 inputs, result in x1
+ int16x4_t x1[16];
+ butterfly_dct_pre_s16_x4(input, x1, 16);
+
+ // stage 2: pre-butterfly helper (n = 8) into x2; cospi32 butterflies write
+ // x2[10], x2[11], x2[12] and x2[13]
+ int16x4_t x2[16];
+ butterfly_dct_pre_s16_x4(x1, x2, 8);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3: pre-butterfly helper (n = 4) into x3; cospi32 butterfly writes
+ // x3[5] and x3[6]; post-butterfly helper fills x3 from index 8 (n = 8)
+ int16x4_t x3[16];
+ butterfly_dct_pre_s16_x4(x2, x3, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4: outputs 0, 8, 4 and 12 are final here; cospi16 butterflies write
+ // x4[9], x4[10], x4[13] and x4[14]
+ int16x4_t x4[16];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5: outputs 2, 14, 10 and 6 are final here; two 4-wide post-butterfly
+ // calls build x5 from index 8 and from index 12
+ int16x4_t x5[16];
+ butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7: the eight odd outputs, each butterfly writing directly to its
+ // final output slot
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 16-point forward DCT on int16x8_t vectors; every output[0..15] is assigned below. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1: pre-butterfly helper over the 16 inputs, result in x1
+ int16x8_t x1[16];
+ butterfly_dct_pre_s16_x8(input, x1, 16);
+
+ // stage 2: pre-butterfly helper (n = 8) into x2; cospi32 butterflies write
+ // x2[10], x2[11], x2[12] and x2[13]
+ int16x8_t x2[16];
+ butterfly_dct_pre_s16_x8(x1, x2, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3: pre-butterfly helper (n = 4) into x3; cospi32 butterfly writes
+ // x3[5] and x3[6]; post-butterfly helper fills x3 from index 8 (n = 8)
+ int16x8_t x3[16];
+ butterfly_dct_pre_s16_x8(x2, x3, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4: outputs 0, 8, 4 and 12 are final here; cospi16 butterflies write
+ // x4[9], x4[10], x4[13] and x4[14]
+ int16x8_t x4[16];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5: outputs 2, 14, 10 and 6 are final here; two 4-wide post-butterfly
+ // calls build x5 from index 8 and from index 12
+ int16x8_t x5[16];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7: the eight odd outputs, each butterfly writing directly to its
+ // final output slot
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 32-point forward DCT on int16x8_t vectors; every output[0..31] is assigned below. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 1: pre-butterfly helper over the 32 inputs, result in x1
+ int16x8_t x1[32];
+ butterfly_dct_pre_s16_x8(input, x1, 32);
+
+ // stage 2: pre-butterfly helper (n = 16) into x2; cospi32 butterflies write
+ // x2[20..27]
+ int16x8_t x2[32];
+ butterfly_dct_pre_s16_x8(x1, x2, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);
+
+ // stage 3: pre-butterfly helper (n = 8) into x3; cospi32 butterflies write
+ // x3[10..13]; post-butterfly helper fills x3 from index 16 (n = 16)
+ int16x8_t x3[32];
+ butterfly_dct_pre_s16_x8(x2, x3, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
+ butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);
+
+ // stage 4: pre-butterfly helper (n = 4) into x4; post-butterfly helper from
+ // index 8; cospi16 butterflies write x4[18..21] and x4[26..29]
+ int16x8_t x4[32];
+ butterfly_dct_pre_s16_x8(x3, x4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
+ butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);
+
+ // stage 5: outputs 0, 16, 8 and 24 are final here; the rest of the stage
+ // feeds x5
+ int16x8_t x5[32];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
+ &output[16]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
+ &output[24]);
+ butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
+ butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
+ butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);
+
+ // stage 6: outputs 4, 28, 20 and 12 are final here; the rest of the stage
+ // feeds x6
+ int16x8_t x6[32];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
+ butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);
+
+ // stage 7: outputs 2, 30, 18, 14, 10, 22, 26 and 6 are final here; four
+ // 4-wide post-butterfly calls build x7 from index 16 upward
+ int16x8_t x7[32];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
+ &output[30]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
+ &output[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
+ &output[22]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
+ butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
+ butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
+ butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);
+ // final butterflies: the 16 odd-indexed outputs, taken from x7[16..31]
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
+ &output[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
+ &output[23]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
+ &output[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
+ &output[19]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
+ &output[3]);
+}
+
+// 64-point forward DCT on int16x8_t vectors.
+//
+// input   - source vectors; handed to the stage-1 helper with n = 64.
+// output  - 64 result vectors. Odd indices are written by the stage-10
+//           butterflies, even indices by the stage-11 copies.
+// cos_bit - forwarded to cospi_arr_q13() to select the cosine table.
+//
+// Stage 11 places working values at the 6-bit bit-reversal of their working
+// index (e.g. x9[18] -> output[18], x9[17] -> output[34], x7[4] -> output[8]).
+// output[50] (0b110010, reversed 0b010011 = 19) used to be skipped, which left
+// that vector uninitialised and x9[19] unused; it is now assigned.
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 8-lane load spans two adjacent 4-entry table groups.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ // Split every load into the 4-lane weight vectors the butterflies take.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int16x8_t x1[64];
+ butterfly_dct_pre_s16_x8(input, x1, 64);
+
+ // stage 2
+ int16x8_t x2[64];
+ butterfly_dct_pre_s16_x8(x1, x2, 32);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ // x3[16..19] and x3[28..31] are plain copies of x2, so stage 4 can pass x3
+ // as both source arrays of its post-butterfly call.
+ int16x8_t x3[64];
+ butterfly_dct_pre_s16_x8(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int16x8_t x4[64];
+ butterfly_dct_pre_s16_x8(x3, x4, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int16x8_t x5[64];
+ butterfly_dct_pre_s16_x8(x4, x5, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ // From here on, finished values stay in the working arrays (x6[0..3],
+ // x7[4..7], x8[8..15], x9[16..31]) and are copied out in stage 11.
+ int16x8_t x6[64];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int16x8_t x7[64];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int16x8_t x8[64];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int16x8_t x9[64];
+ butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10: the 32 odd-indexed outputs, written straight to their final slots
+ butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+ &output[63]);
+ butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+ &output[31]);
+ butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+ &output[47]);
+ butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+ &output[55]);
+ butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+ &output[23]);
+ butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+ &output[39]);
+ butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+ &output[59]);
+ butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+ &output[27]);
+ butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+ &output[43]);
+ butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+ &output[51]);
+ butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+ &output[19]);
+ butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+ &output[35]);
+ butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+ &output[3]);
+
+ // stage 11: copy the even-indexed outputs from the working arrays, each to
+ // the 6-bit bit-reversal of its working index
+ output[0] = x6[0];
+ output[2] = x9[16];
+ output[4] = x8[8];
+ output[6] = x9[24];
+ output[8] = x7[4];
+ output[10] = x9[20];
+ output[12] = x8[12];
+ output[14] = x9[28];
+ output[16] = x6[2];
+ output[18] = x9[18];
+ output[20] = x8[10];
+ output[22] = x9[26];
+ output[24] = x7[6];
+ output[26] = x9[22];
+ output[28] = x8[14];
+ output[30] = x9[30];
+ output[32] = x6[1];
+ output[34] = x9[17];
+ output[36] = x8[9];
+ output[38] = x9[25];
+ output[40] = x7[5];
+ output[42] = x9[21];
+ output[44] = x8[13];
+ output[46] = x9[29];
+ output[48] = x6[3];
+ output[50] = x9[19];  // was missing: output[50] stayed uninitialised
+ output[52] = x8[11];
+ output[54] = x9[27];
+ output[56] = x7[7];
+ output[58] = x9[23];
+ output[60] = x8[15];
+ output[62] = x9[31];
+}
+
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 8-point forward ADST on int16x8_t vectors; all of output[0..7] is written in stage 6. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 2: there is no separate stage-1 array, input[] is read directly.
+ // cospi32 butterflies on inputs (4, 3) and (2, 5) fill x2[2], x2[3], x2[6]
+ // and x2[7]; x2[0], x2[1], x2[4] and x2[5] are never written or read.
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3: saturating add/sub of the untouched inputs 0, 1, 6 and 7 with the
+ // stage-2 results
+ int16x8_t x3[8];
+ x3[0] = vqaddq_s16(input[0], x2[2]);
+ x3[1] = vqsubq_s16(x2[3], input[7]);
+ x3[2] = vqsubq_s16(input[0], x2[2]);
+ x3[3] = vqaddq_s16(input[7], x2[3]);
+ x3[4] = vqsubq_s16(x2[6], input[1]);
+ x3[5] = vqaddq_s16(input[6], x2[7]);
+ x3[6] = vqaddq_s16(input[1], x2[6]);
+ x3[7] = vqsubq_s16(input[6], x2[7]);
+
+ // stage 4: cospi16 butterflies update x3[4..7] in place (the output pointers
+ // alias the input elements)
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+
+ // stage 5: saturating add/sub pairing x3[i] with x3[i + 4]
+ int16x8_t x5[8];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+
+ // stage 6: final butterflies, each writing straight to its permuted output
+ // slot
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 16-point forward ADST on int16x4_t vectors; all of output[0..15] is written in stage 8. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2: there is no separate stage-1 array, input[] is read directly.
+ // cospi32 butterflies on inputs 4..11 fill x2[0..7].
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3: saturating add/sub of the untouched inputs 0..3 and 12..15 with
+ // the stage-2 results
+ int16x4_t x3[16];
+ x3[0] = vqadd_s16(input[0], x2[0]);
+ x3[1] = vqsub_s16(x2[1], input[15]);
+ x3[2] = vqsub_s16(input[0], x2[0]);
+ x3[3] = vqadd_s16(input[15], x2[1]);
+ x3[4] = vqsub_s16(x2[2], input[3]);
+ x3[5] = vqadd_s16(input[12], x2[3]);
+ x3[6] = vqadd_s16(input[3], x2[2]);
+ x3[7] = vqsub_s16(input[12], x2[3]);
+ x3[8] = vqsub_s16(x2[4], input[1]);
+ x3[9] = vqadd_s16(input[14], x2[5]);
+ x3[10] = vqadd_s16(input[1], x2[4]);
+ x3[11] = vqsub_s16(input[14], x2[5]);
+ x3[12] = vqadd_s16(input[2], x2[6]);
+ x3[13] = vqsub_s16(x2[7], input[13]);
+ x3[14] = vqsub_s16(input[2], x2[6]);
+ x3[15] = vqadd_s16(input[13], x2[7]);
+
+ // stage 4: cospi16 butterflies update x3[4..7] and x3[12..15] in place (the
+ // output pointers alias the input elements)
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5: saturating add/sub pairing x3[i] with x3[i + 4] inside each half
+ int16x4_t x5[16];
+ x5[0] = vqadd_s16(x3[0], x3[4]);
+ x5[1] = vqadd_s16(x3[1], x3[5]);
+ x5[2] = vqadd_s16(x3[2], x3[6]);
+ x5[3] = vqsub_s16(x3[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x3[4]);
+ x5[5] = vqsub_s16(x3[1], x3[5]);
+ x5[6] = vqsub_s16(x3[2], x3[6]);
+ x5[7] = vqadd_s16(x3[3], x3[7]);
+ x5[8] = vqadd_s16(x3[8], x3[12]);
+ x5[9] = vqadd_s16(x3[9], x3[13]);
+ x5[10] = vqsub_s16(x3[14], x3[10]);
+ x5[11] = vqadd_s16(x3[11], x3[15]);
+ x5[12] = vqsub_s16(x3[8], x3[12]);
+ x5[13] = vqsub_s16(x3[9], x3[13]);
+ x5[14] = vqadd_s16(x3[10], x3[14]);
+ x5[15] = vqsub_s16(x3[11], x3[15]);
+
+ // stage 6: cospi8 / cospi24 butterflies update x5[8..15] in place
+ butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7: saturating add/sub pairing x5[i] with x5[i + 8]
+ int16x4_t x7[16];
+ x7[0] = vqadd_s16(x5[0], x5[8]);
+ x7[1] = vqadd_s16(x5[1], x5[9]);
+ x7[2] = vqadd_s16(x5[2], x5[10]);
+ x7[3] = vqadd_s16(x5[3], x5[11]);
+ x7[4] = vqadd_s16(x5[4], x5[12]);
+ x7[5] = vqadd_s16(x5[5], x5[13]);
+ x7[6] = vqadd_s16(x5[6], x5[14]);
+ x7[7] = vqsub_s16(x5[15], x5[7]);
+ x7[8] = vqsub_s16(x5[0], x5[8]);
+ x7[9] = vqsub_s16(x5[1], x5[9]);
+ x7[10] = vqsub_s16(x5[2], x5[10]);
+ x7[11] = vqsub_s16(x5[3], x5[11]);
+ x7[12] = vqsub_s16(x5[4], x5[12]);
+ x7[13] = vqsub_s16(x5[5], x5[13]);
+ x7[14] = vqsub_s16(x5[6], x5[14]);
+ x7[15] = vqadd_s16(x5[7], x5[15]);
+
+ // stage 8: final butterflies, each writing straight to its permuted output
+ // slot
+ butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ // 16-point forward ADST on int16x8_t vectors; all of output[0..15] is written in stage 8. cos_bit picks the table.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ // Each 8-lane load spans two adjacent 4-entry table groups; split them into per-angle weight vectors.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2: there is no separate stage-1 array, input[] is read directly.
+ // cospi32 butterflies on inputs 4..11 fill x2[0..7].
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3: saturating add/sub of the untouched inputs 0..3 and 12..15 with
+ // the stage-2 results
+ int16x8_t x3[16];
+ x3[0] = vqaddq_s16(input[0], x2[0]);
+ x3[1] = vqsubq_s16(x2[1], input[15]);
+ x3[2] = vqsubq_s16(input[0], x2[0]);
+ x3[3] = vqaddq_s16(input[15], x2[1]);
+ x3[4] = vqsubq_s16(x2[2], input[3]);
+ x3[5] = vqaddq_s16(input[12], x2[3]);
+ x3[6] = vqaddq_s16(input[3], x2[2]);
+ x3[7] = vqsubq_s16(input[12], x2[3]);
+ x3[8] = vqsubq_s16(x2[4], input[1]);
+ x3[9] = vqaddq_s16(input[14], x2[5]);
+ x3[10] = vqaddq_s16(input[1], x2[4]);
+ x3[11] = vqsubq_s16(input[14], x2[5]);
+ x3[12] = vqaddq_s16(input[2], x2[6]);
+ x3[13] = vqsubq_s16(x2[7], input[13]);
+ x3[14] = vqsubq_s16(input[2], x2[6]);
+ x3[15] = vqaddq_s16(input[13], x2[7]);
+
+ // stage 4: cospi16 butterflies update x3[4..7] and x3[12..15] in place (the
+ // output pointers alias the input elements)
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5: saturating add/sub pairing x3[i] with x3[i + 4] inside each half
+ int16x8_t x5[16];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+ x5[8] = vqaddq_s16(x3[8], x3[12]);
+ x5[9] = vqaddq_s16(x3[9], x3[13]);
+ x5[10] = vqsubq_s16(x3[14], x3[10]);
+ x5[11] = vqaddq_s16(x3[11], x3[15]);
+ x5[12] = vqsubq_s16(x3[8], x3[12]);
+ x5[13] = vqsubq_s16(x3[9], x3[13]);
+ x5[14] = vqaddq_s16(x3[10], x3[14]);
+ x5[15] = vqsubq_s16(x3[11], x3[15]);
+
+ // stage 6: cospi8 / cospi24 butterflies update x5[8..15] in place
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7: saturating add/sub pairing x5[i] with x5[i + 8]
+ int16x8_t x7[16];
+ x7[0] = vqaddq_s16(x5[0], x5[8]);
+ x7[1] = vqaddq_s16(x5[1], x5[9]);
+ x7[2] = vqaddq_s16(x5[2], x5[10]);
+ x7[3] = vqaddq_s16(x5[3], x5[11]);
+ x7[4] = vqaddq_s16(x5[4], x5[12]);
+ x7[5] = vqaddq_s16(x5[5], x5[13]);
+ x7[6] = vqaddq_s16(x5[6], x5[14]);
+ x7[7] = vqsubq_s16(x5[15], x5[7]);
+ x7[8] = vqsubq_s16(x5[0], x5[8]);
+ x7[9] = vqsubq_s16(x5[1], x5[9]);
+ x7[10] = vqsubq_s16(x5[2], x5[10]);
+ x7[11] = vqsubq_s16(x5[3], x5[11]);
+ x7[12] = vqsubq_s16(x5[4], x5[12]);
+ x7[13] = vqsubq_s16(x5[5], x5[13]);
+ x7[14] = vqsubq_s16(x5[6], x5[14]);
+ x7[15] = vqaddq_s16(x5[7], x5[15]);
+
+ // stage 8: final butterflies, each writing straight to its permuted output
+ // slot
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
+// 1-D identity kernel for four int16x4_t vectors. All of the work is done by
+// round_shift_sqrt2_s16_s16_4xn_neon (by its name a sqrt(2) scale with
+// rounding -- confirm against the helper's definition).
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+                                               int16x4_t *const output,
+                                               const int cos_bit) {
+  const int num_vecs = 4;
+  round_shift_sqrt2_s16_s16_4xn_neon(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for four int16x8_t vectors; the 8-lane counterpart of
+// the 4x4 identity. Delegates to round_shift_sqrt2_s16_s16_8xn_neon.
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+                                               int16x8_t *const output,
+                                               const int cos_bit) {
+  const int num_vecs = 4;
+  round_shift_sqrt2_s16_s16_8xn_neon(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for eight int16x4_t vectors. Delegates to
+// shift_left_1_s16_x4 (by its name a left shift by one bit, i.e. x2 --
+// confirm against the helper's definition).
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+                                               int16x4_t *output, int cos_bit) {
+  const int num_vecs = 8;
+  shift_left_1_s16_x4(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for eight int16x8_t vectors. Delegates to
+// shift_left_1_s16_x8.
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+                                               int16x8_t *output, int cos_bit) {
+  const int num_vecs = 8;
+  shift_left_1_s16_x8(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for sixteen int16x4_t vectors. Delegates to
+// round_shift_2sqrt2_s16_s16_4xn_neon (by its name a 2*sqrt(2) scale with
+// rounding -- confirm against the helper's definition).
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+                                                int16x4_t *output,
+                                                int cos_bit) {
+  const int num_vecs = 16;
+  round_shift_2sqrt2_s16_s16_4xn_neon(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for sixteen int16x8_t vectors. Delegates to
+// round_shift_2sqrt2_s16_s16_8xn_neon.
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
+  const int num_vecs = 16;
+  round_shift_2sqrt2_s16_s16_8xn_neon(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// 1-D identity kernel for thirty-two int16x8_t vectors. Delegates to
+// shift_left_2_s16_x8 (by its name a left shift by two bits, i.e. x4 --
+// confirm against the helper's definition).
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
+  const int num_vecs = 32;
+  shift_left_2_s16_x8(input, output, num_vecs);
+  // cos_bit is accepted only so the signature matches the other 1-D kernels.
+  (void)cos_bit;
+}
+
+// TRANSFORM_COL(name, tw, n) defines `name##_col_neon`, the column-pass
+// wrapper around the 1-D kernel `name##_neon`:
+//   name - kernel base name (e.g. fdct8x8)
+//   tw   - lanes per vector (4 or 8); selects int16x<tw>_t and the matching
+//          load_buffer_s16_x<tw> / shift_left_2_s16_x<tw> helpers
+//   n    - number of vectors loaded and transformed
+// The generated function loads n vectors from `input` using `stride`, passes
+// them in place through shift_left_2_s16_x<tw> (presumably a left shift by
+// two bits -- confirm in the helper), then runs the kernel into `output`.
+#define TRANSFORM_COL(name, tw, n) \
+ static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ load_buffer_s16_x##tw(input, stride, buf0, n); \
+ shift_left_2_s16_x##tw(buf0, buf0, n); \
+ name##_neon(buf0, output, cos_bit); \
+ }
+
+// Column-pass wrappers (<kernel>_col_neon) for every ADST, DCT and identity
+// kernel. Only the DCT and identity kernels have a 32-vector variant; no
+// fadst8x32 is instantiated.
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
+// TRANSFORM_ROW(name, tw, n) defines `name##_row_neon`, the row-pass wrapper
+// around the 1-D kernel `name##_neon`. The generated function runs the kernel
+// on `input` into a local array of n int16x<tw>_t vectors, then writes that
+// array to the int32_t `output` with store_buffer_s16_x<tw>, passing `stride`
+// and n through to the store helper.
+#define TRANSFORM_ROW(name, tw, n) \
+ static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+// TRANSFORM_ROW_RECT(name, tw, n) defines `name##_row_rect_neon`. It is the
+// same as the TRANSFORM_ROW wrapper except that the result is written with
+// store_rect_buffer_s16_x<tw> instead of store_buffer_s16_x<tw>.
+// NOTE(review): the rect store presumably applies the extra scaling used for
+// 2:1 rectangular blocks -- confirm in the store helper.
+#define TRANSFORM_ROW_RECT(name, tw, n) \
+ static void name##_row_rect_neon(const int16x##tw##_t *input, \
+ int32_t *output, int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_rect_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+// Row-pass wrappers (<kernel>_row_neon) using the plain store. The 4x8
+// kernels are not instantiated here; they only get a _row_rect_neon variant.
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
+
+// Row-pass wrappers (<kernel>_row_rect_neon) using the rect store. The 4x4
+// and 4x16 kernels have no rect variant.
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
+
+// Function-pointer types for the 1-D transform stages. The _4 / _8 suffix is
+// the lane count of the NEON vectors involved (int16x4_t / int16x8_t).
+
+// Register-to-register 1-D kernel: vector array in, vector array out.
+typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int16x4_t *output, int cos_bit);
+typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int16x8_t *output, int cos_bit);
+
+// Column pass: reads int16_t samples from memory using `stride` and produces
+// vectors. Matches the functions generated by TRANSFORM_COL.
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+ int16x4_t *output, int stride,
+ int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+ int16x8_t *output, int stride,
+ int cos_bit);
+
+// Row pass: consumes vectors and writes int32_t coefficients to memory using
+// `stride`. Matches the functions generated by TRANSFORM_ROW and
+// TRANSFORM_ROW_RECT.
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+
+// Column-pass kernel per TX_TYPE for 4-lane vectors, 8 vectors long. Entries
+// are positional: the trailing comment names the TX_TYPE each slot serves.
+// Indexed by tx_type in lowbd_fwd_txfm2d_4x8_neon.
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_col_neon, // DCT_DCT
+ fadst4x8_col_neon, // ADST_DCT
+ fdct4x8_col_neon, // DCT_ADST
+ fadst4x8_col_neon, // ADST_ADST
+ fadst4x8_col_neon, // FLIPADST_DCT
+ fdct4x8_col_neon, // DCT_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_FLIPADST
+ fadst4x8_col_neon, // ADST_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_ADST
+ fidentity4x8_col_neon, // IDTX
+ fdct4x8_col_neon, // V_DCT
+ fidentity4x8_col_neon, // H_DCT
+ fadst4x8_col_neon, // V_ADST
+ fidentity4x8_col_neon, // H_ADST
+ fadst4x8_col_neon, // V_FLIPADST
+ fidentity4x8_col_neon // H_FLIPADST
+};
+
+// Row-pass kernel (plain store) per TX_TYPE for 8-lane vectors, 4 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_4x16_neon.
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_neon, // DCT_DCT
+ fdct8x4_row_neon, // ADST_DCT
+ fadst8x4_row_neon, // DCT_ADST
+ fadst8x4_row_neon, // ADST_ADST
+ fdct8x4_row_neon, // FLIPADST_DCT
+ fadst8x4_row_neon, // DCT_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_neon, // ADST_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_ADST
+ fidentity8x4_row_neon, // IDTX
+ fidentity8x4_row_neon, // V_DCT
+ fdct8x4_row_neon, // H_DCT
+ fidentity8x4_row_neon, // V_ADST
+ fadst8x4_row_neon, // H_ADST
+ fidentity8x4_row_neon, // V_FLIPADST
+ fadst8x4_row_neon // H_FLIPADST
+};
+
+// Row-pass kernel (rect store) per TX_TYPE for 8-lane vectors, 4 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_4x8_neon.
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_rect_neon, // DCT_DCT
+ fdct8x4_row_rect_neon, // ADST_DCT
+ fadst8x4_row_rect_neon, // DCT_ADST
+ fadst8x4_row_rect_neon, // ADST_ADST
+ fdct8x4_row_rect_neon, // FLIPADST_DCT
+ fadst8x4_row_rect_neon, // DCT_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_rect_neon, // ADST_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_ADST
+ fidentity8x4_row_rect_neon, // IDTX
+ fidentity8x4_row_rect_neon, // V_DCT
+ fdct8x4_row_rect_neon, // H_DCT
+ fidentity8x4_row_rect_neon, // V_ADST
+ fadst8x4_row_rect_neon, // H_ADST
+ fidentity8x4_row_rect_neon, // V_FLIPADST
+ fadst8x4_row_rect_neon // H_FLIPADST
+};
+
+// Column-pass kernel per TX_TYPE for 8-lane vectors, 4 vectors long. Entries
+// are positional. Indexed by tx_type in lowbd_fwd_txfm2d_8x4_neon and
+// lowbd_fwd_txfm2d_16x4_neon.
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_col_neon, // DCT_DCT
+ fadst8x4_col_neon, // ADST_DCT
+ fdct8x4_col_neon, // DCT_ADST
+ fadst8x4_col_neon, // ADST_ADST
+ fadst8x4_col_neon, // FLIPADST_DCT
+ fdct8x4_col_neon, // DCT_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_FLIPADST
+ fadst8x4_col_neon, // ADST_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_ADST
+ fidentity8x4_col_neon, // IDTX
+ fdct8x4_col_neon, // V_DCT
+ fidentity8x4_col_neon, // H_DCT
+ fadst8x4_col_neon, // V_ADST
+ fidentity8x4_col_neon, // H_ADST
+ fadst8x4_col_neon, // V_FLIPADST
+ fidentity8x4_col_neon // H_FLIPADST
+};
+
+// Row-pass kernel (rect store) per TX_TYPE for 4-lane vectors, 8 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_8x4_neon.
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_row_rect_neon, // DCT_DCT
+ fdct4x8_row_rect_neon, // ADST_DCT
+ fadst4x8_row_rect_neon, // DCT_ADST
+ fadst4x8_row_rect_neon, // ADST_ADST
+ fdct4x8_row_rect_neon, // FLIPADST_DCT
+ fadst4x8_row_rect_neon, // DCT_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst4x8_row_rect_neon, // ADST_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_ADST
+ fidentity4x8_row_rect_neon, // IDTX
+ fidentity4x8_row_rect_neon, // V_DCT
+ fdct4x8_row_rect_neon, // H_DCT
+ fidentity4x8_row_rect_neon, // V_ADST
+ fadst4x8_row_rect_neon, // H_ADST
+ fidentity4x8_row_rect_neon, // V_FLIPADST
+ fadst4x8_row_rect_neon // H_FLIPADST
+};
+
+// Column-pass kernel per TX_TYPE for 8-lane vectors, 8 vectors long. Entries
+// are positional. Indexed by tx_type in lowbd_fwd_txfm2d_16x8_neon and
+// lowbd_fwd_txfm2d_32x8_neon.
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_col_neon, // DCT_DCT
+ fadst8x8_col_neon, // ADST_DCT
+ fdct8x8_col_neon, // DCT_ADST
+ fadst8x8_col_neon, // ADST_ADST
+ fadst8x8_col_neon, // FLIPADST_DCT
+ fdct8x8_col_neon, // DCT_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_FLIPADST
+ fadst8x8_col_neon, // ADST_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_ADST
+ fidentity8x8_col_neon, // IDTX
+ fdct8x8_col_neon, // V_DCT
+ fidentity8x8_col_neon, // H_DCT
+ fadst8x8_col_neon, // V_ADST
+ fidentity8x8_col_neon, // H_ADST
+ fadst8x8_col_neon, // V_FLIPADST
+ fidentity8x8_col_neon, // H_FLIPADST
+};
+
+// Row-pass kernel (plain store) per TX_TYPE for 8-lane vectors, 8 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_8x32_neon.
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_neon, // DCT_DCT
+ fdct8x8_row_neon, // ADST_DCT
+ fadst8x8_row_neon, // DCT_ADST
+ fadst8x8_row_neon, // ADST_ADST
+ fdct8x8_row_neon, // FLIPADST_DCT
+ fadst8x8_row_neon, // DCT_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_neon, // ADST_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_ADST
+ fidentity8x8_row_neon, // IDTX
+ fidentity8x8_row_neon, // V_DCT
+ fdct8x8_row_neon, // H_DCT
+ fidentity8x8_row_neon, // V_ADST
+ fadst8x8_row_neon, // H_ADST
+ fidentity8x8_row_neon, // V_FLIPADST
+ fadst8x8_row_neon // H_FLIPADST
+};
+
+// Row-pass kernel (rect store) per TX_TYPE for 8-lane vectors, 8 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_8x16_neon.
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_rect_neon, // DCT_DCT
+ fdct8x8_row_rect_neon, // ADST_DCT
+ fadst8x8_row_rect_neon, // DCT_ADST
+ fadst8x8_row_rect_neon, // ADST_ADST
+ fdct8x8_row_rect_neon, // FLIPADST_DCT
+ fadst8x8_row_rect_neon, // DCT_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_rect_neon, // ADST_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_ADST
+ fidentity8x8_row_rect_neon, // IDTX
+ fidentity8x8_row_rect_neon, // V_DCT
+ fdct8x8_row_rect_neon, // H_DCT
+ fidentity8x8_row_rect_neon, // V_ADST
+ fadst8x8_row_rect_neon, // H_ADST
+ fidentity8x8_row_rect_neon, // V_FLIPADST
+ fadst8x8_row_rect_neon // H_FLIPADST
+};
+
+// Column-pass kernel per TX_TYPE for 4-lane vectors, 16 vectors long. Entries
+// are positional. Indexed by tx_type in lowbd_fwd_txfm2d_4x16_neon.
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_col_neon, // DCT_DCT
+ fadst4x16_col_neon, // ADST_DCT
+ fdct4x16_col_neon, // DCT_ADST
+ fadst4x16_col_neon, // ADST_ADST
+ fadst4x16_col_neon, // FLIPADST_DCT
+ fdct4x16_col_neon, // DCT_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_FLIPADST
+ fadst4x16_col_neon, // ADST_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_ADST
+ fidentity4x16_col_neon, // IDTX
+ fdct4x16_col_neon, // V_DCT
+ fidentity4x16_col_neon, // H_DCT
+ fadst4x16_col_neon, // V_ADST
+ fidentity4x16_col_neon, // H_ADST
+ fadst4x16_col_neon, // V_FLIPADST
+ fidentity4x16_col_neon // H_FLIPADST
+};
+
+// Row-pass kernel (plain store) per TX_TYPE for 4-lane vectors, 16 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_16x4_neon.
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_row_neon, // DCT_DCT
+ fdct4x16_row_neon, // ADST_DCT
+ fadst4x16_row_neon, // DCT_ADST
+ fadst4x16_row_neon, // ADST_ADST
+ fdct4x16_row_neon, // FLIPADST_DCT
+ fadst4x16_row_neon, // DCT_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_FLIPADST
+ fadst4x16_row_neon, // ADST_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_ADST
+ fidentity4x16_row_neon, // IDTX
+ fidentity4x16_row_neon, // V_DCT
+ fdct4x16_row_neon, // H_DCT
+ fidentity4x16_row_neon, // V_ADST
+ fadst4x16_row_neon, // H_ADST
+ fidentity4x16_row_neon, // V_FLIPADST
+ fadst4x16_row_neon // H_FLIPADST
+};
+
+// Column-pass kernel per TX_TYPE for 8-lane vectors, 16 vectors long. Entries
+// are positional. Indexed by tx_type in the 8x16, 16x16 and 32x16 drivers.
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_col_neon, // DCT_DCT
+ fadst8x16_col_neon, // ADST_DCT
+ fdct8x16_col_neon, // DCT_ADST
+ fadst8x16_col_neon, // ADST_ADST
+ fadst8x16_col_neon, // FLIPADST_DCT
+ fdct8x16_col_neon, // DCT_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_FLIPADST
+ fadst8x16_col_neon, // ADST_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_ADST
+ fidentity8x16_col_neon, // IDTX
+ fdct8x16_col_neon, // V_DCT
+ fidentity8x16_col_neon, // H_DCT
+ fadst8x16_col_neon, // V_ADST
+ fidentity8x16_col_neon, // H_ADST
+ fadst8x16_col_neon, // V_FLIPADST
+ fidentity8x16_col_neon // H_FLIPADST
+};
+
+// Row-pass kernel (plain store) per TX_TYPE for 8-lane vectors, 16 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_16x16_neon.
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_neon, // DCT_DCT
+ fdct8x16_row_neon, // ADST_DCT
+ fadst8x16_row_neon, // DCT_ADST
+ fadst8x16_row_neon, // ADST_ADST
+ fdct8x16_row_neon, // FLIPADST_DCT
+ fadst8x16_row_neon, // DCT_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_neon, // ADST_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_ADST
+ fidentity8x16_row_neon, // IDTX
+ fidentity8x16_row_neon, // V_DCT
+ fdct8x16_row_neon, // H_DCT
+ fidentity8x16_row_neon, // V_ADST
+ fadst8x16_row_neon, // H_ADST
+ fidentity8x16_row_neon, // V_FLIPADST
+ fadst8x16_row_neon // H_FLIPADST
+};
+
+// Row-pass kernel (rect store) per TX_TYPE for 8-lane vectors, 16 vectors
+// long. Entries are positional. Indexed by tx_type in
+// lowbd_fwd_txfm2d_16x8_neon and lowbd_fwd_txfm2d_16x32_neon.
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_rect_neon, // DCT_DCT
+ fdct8x16_row_rect_neon, // ADST_DCT
+ fadst8x16_row_rect_neon, // DCT_ADST
+ fadst8x16_row_rect_neon, // ADST_ADST
+ fdct8x16_row_rect_neon, // FLIPADST_DCT
+ fadst8x16_row_rect_neon, // DCT_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_rect_neon, // ADST_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_ADST
+ fidentity8x16_row_rect_neon, // IDTX
+ fidentity8x16_row_rect_neon, // V_DCT
+ fdct8x16_row_rect_neon, // H_DCT
+ fidentity8x16_row_rect_neon, // V_ADST
+ fadst8x16_row_rect_neon, // H_ADST
+ fidentity8x16_row_rect_neon, // V_FLIPADST
+ fadst8x16_row_rect_neon // H_FLIPADST
+};
+
+// Row-pass kernel (plain store) per TX_TYPE for 8-lane vectors, 32 vectors
+// long. Entries are positional. Only DCT and identity kernels exist at this
+// length, so every slot that would need an ADST row kernel is NULL; callers
+// must check the entry before calling it. Indexed by tx_type in
+// lowbd_fwd_txfm2d_32x8_neon and lowbd_fwd_txfm2d_32x32_neon.
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_neon, // IDTX
+ fidentity8x32_row_neon, // V_DCT
+ fdct8x32_row_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Row-pass kernel (rect store) per TX_TYPE for 8-lane vectors, 32 vectors
+// long. Entries are positional. Slots with no 32-point kernel are NULL;
+// callers must check the entry before calling it. Indexed by tx_type in
+// lowbd_fwd_txfm2d_32x16_neon.
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_rect_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_rect_neon, // IDTX
+ fidentity8x32_row_rect_neon, // V_DCT
+ fdct8x32_row_rect_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Column-pass kernel per TX_TYPE for 8-lane vectors, 32 vectors long. Entries
+// are positional. Slots with no 32-point kernel are NULL; callers must check
+// the entry before calling it. Indexed by tx_type in the 8x32, 16x32 and
+// 32x32 drivers.
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_col_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_col_neon, // IDTX
+ fdct8x32_col_neon, // V_DCT
+ fidentity8x32_col_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// 4x4 forward 2-D transform. Runs a column pass, a 4x4 transpose and a row
+// pass; the kernels are called directly (no table lookup) in two switches,
+// one per pass.
+//   input/stride - source int16_t samples and their row stride
+//   output       - int32_t coefficients, written with a stride of 4
+//   tx_type      - which column/row kernel pair to use
+//   bd           - unused
+static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+  int16x4_t col_out[4], row_in[4];
+
+  // Pass 1: column kernel.
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT: fdct4x4_col_neon(input, col_out, stride, 13); break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_ADST:
+    case V_FLIPADST: fadst4x4_col_neon(input, col_out, stride, 13); break;
+    case IDTX:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST: fidentity4x4_col_neon(input, col_out, stride, 13); break;
+    // Any other value: do nothing, leaving output untouched.
+    default: return;
+  }
+
+  transpose_arrays_s16_4x4(col_out, row_in);
+
+  // Pass 2: row kernel. The *_FLIPADST row types reverse the vector order
+  // first, reusing col_out as scratch.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT: fdct4x4_row_neon(row_in, output, 4, 13); break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case H_ADST: fadst4x4_row_neon(row_in, output, 4, 13); break;
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      flip_buf_4_neon(row_in, col_out, 4);
+      fadst4x4_row_neon(col_out, output, 4, 13);
+      break;
+    case IDTX:
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST: fidentity4x4_row_neon(row_in, output, 4, 13); break;
+    default: break;
+  }
+}
+
+// 4x8 forward 2-D transform: 8-point column pass on 4-lane vectors, a
+// rounding right shift, a 4x8 transpose, then the rect row pass on four
+// 8-lane vectors. `bd` is unused.
+static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column pass.
+  int16x4_t col_out[8];
+  col_txfm4x8_arr[tx_type](input, col_out, stride, 13);
+  shift_right_1_round_s16_x4(col_out, col_out, 8);
+
+  int16x8_t transposed[8];
+  transpose_arrays_s16_4x8(col_out, transposed);
+
+  // Row pass; for left/right-flipped types the four vectors are reversed
+  // into a scratch buffer first.
+  int16x8_t flipped[8];
+  const int16x8_t *row_in = transposed;
+  if (lr_flip) {
+    flip_buf_8_neon(transposed, flipped, 4);
+    row_in = flipped;
+  }
+  row_rect_txfm8x4_arr[tx_type](row_in, output, 8, 13);
+}
+
+// 4x16 forward 2-D transform: 16-point column pass on 4-lane vectors, a
+// rounding right shift, two 4x8 transposes, then the row pass on each
+// transposed half. `bd` is unused.
+static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass.
+  int16x4_t col_out[16];
+  col_txfm4x16_arr[tx_type](input, col_out, stride, 13);
+  shift_right_1_round_s16_x4(col_out, col_out, 16);
+
+  // Each group of eight 4-lane vectors is transposed separately.
+  int16x8_t transposed[16];
+  for (int half = 0; half < 2; ++half) {
+    transpose_arrays_s16_4x8(col_out + 8 * half, transposed + 8 * half);
+  }
+
+  // Row pass: one call per half, written 8 coefficients apart in output.
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
+  int16x8_t flipped[16];
+  for (int half = 0; half < 2; ++half) {
+    const int16x8_t *row_in = transposed + 8 * half;
+    if (lr_flip) {
+      flip_buf_8_neon(transposed + 8 * half, flipped, 4);
+      row_in = flipped;
+    }
+    row_txfm(row_in, output + 8 * half, 16, 12);
+  }
+}
+
+// 8x4 forward 2-D transform: 4-point column pass on 8-lane vectors, a
+// rounding right shift, an 8x4 transpose, then the rect row pass on eight
+// 4-lane vectors. `bd` is unused.
+static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+  // Column pass.
+  int16x8_t col_out[8];
+  col_txfm8x4_arr[tx_type](input, col_out, stride, 13);
+  shift_right_1_round_s16_x8(col_out, col_out, 4);
+
+  int16x4_t transposed[8];
+  transpose_arrays_s16_8x4(col_out, transposed);
+
+  // Row pass; for left/right-flipped types the eight vectors are reversed
+  // into a scratch buffer first.
+  int16x4_t flipped[8];
+  const int16x4_t *row_in = transposed;
+  if (lr_flip) {
+    flip_buf_4_neon(transposed, flipped, 8);
+    row_in = flipped;
+  }
+  row_rect_txfm4x8_arr[tx_type](row_in, output, 4, 13);
+}
+
+// 8x8 forward 2-D transform. Runs a column pass, a rounding right shift, an
+// 8x8 transpose and a row pass; the kernels are called directly (no table
+// lookup) in two switches, one per pass.
+//   input/stride - source int16_t samples and their row stride
+//   output       - int32_t coefficients, written with a stride of 8
+//   tx_type      - which column/row kernel pair to use
+//   bd           - unused
+static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+                                      int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  int16x8_t col_out[8], row_in[8];
+
+  // Pass 1: column kernel.
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT: fdct8x8_col_neon(input, col_out, stride, 13); break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_ADST:
+    case V_FLIPADST: fadst8x8_col_neon(input, col_out, stride, 13); break;
+    case IDTX:
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST: fidentity8x8_col_neon(input, col_out, stride, 13); break;
+    // Any other value: do nothing, leaving output untouched.
+    default: return;
+  }
+
+  // Shared by every transform type between the two passes.
+  shift_right_1_round_s16_x8(col_out, col_out, 8);
+  transpose_arrays_s16_8x8(col_out, row_in);
+
+  // Pass 2: row kernel. The *_FLIPADST row types reverse the vector order
+  // first, reusing col_out as scratch.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT: fdct8x8_row_neon(row_in, output, 8, 13); break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case H_ADST: fadst8x8_row_neon(row_in, output, 8, 13); break;
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      flip_buf_8_neon(row_in, col_out, 8);
+      fadst8x8_row_neon(col_out, output, 8, 13);
+      break;
+    case IDTX:
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST: fidentity8x8_row_neon(row_in, output, 8, 13); break;
+    default: break;
+  }
+}
+
+// 8x16 forward 2-D transform: 16-point column pass on 8-lane vectors, a
+// rounding right shift, two 8x8 transposes, then the rect row pass on each
+// transposed half. `bd` is unused.
+static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass.
+  int16x8_t work[16], transposed[16];
+  col_txfm8x16_arr[tx_type](input, work, stride, 13);
+  shift_right_2_round_s16_x8(work, work, 16);
+
+  for (int half = 0; half < 2; ++half) {
+    transpose_arrays_s16_8x8(work + 8 * half, transposed + 8 * half);
+  }
+
+  // Row pass: one call per half. `work` is free after the transposes and is
+  // reused as the flip scratch buffer.
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
+  for (int half = 0; half < 2; ++half) {
+    const int16x8_t *row_in = transposed + 8 * half;
+    if (lr_flip) {
+      flip_buf_8_neon(transposed + 8 * half, work, 8);
+      row_in = work;
+    }
+    row_txfm(row_in, output + 8 * half, 16, 13);
+  }
+}
+
+// 8x32 forward 2-D transform: 32-point column pass on 8-lane vectors, a
+// rounding right shift, four 8x8 transposes, then an 8-point row pass on
+// each transposed quarter.
+//   input/stride - source int16_t samples and their row stride
+//   output       - int32_t coefficients
+//   tx_type      - selects the column/row kernels from the dispatch tables
+//   bd           - only forwarded to the C fallback
+//
+// col_txfm8x32_arr holds NULL for every tx_type that has no 32-point column
+// kernel. Those types are handed to the C implementation instead of calling
+// through a NULL pointer, matching the 16x32 and 32x32 drivers that use the
+// same table.
+// NOTE(review): av1_fwd_txfm2d_8x32_c is assumed to be declared alongside
+// the other av1_fwd_txfm2d_*_c fallbacks used in this file -- confirm.
+static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_8x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+
+  // Column pass.
+  col_txfm(input, buf0, stride, 12);
+  shift_right_2_round_s16_x8(buf0, buf0, 32);
+  transpose_arrays_s16_8x8(buf0, buf1);
+  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+  transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
+  transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
+
+  // Row pass: one call per group of eight vectors. buf0 is free after the
+  // transposes and is reused as the flip scratch buffer.
+  for (int i = 0; i < 4; i++) {
+    if (lr_flip) {
+      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+      row_txfm(buf0, output + 8 * i, 32, 12);
+    } else {
+      int16x8_t *buf = buf1 + 8 * i;
+      row_txfm(buf, output + 8 * i, 32, 12);
+    }
+  }
+}
+
+// 16x4 forward 2-D transform: the block is processed as two 8-column
+// strips (4-point column pass, rounding right shift, 8x4 transpose each),
+// then a single 16-point row pass on 4-lane vectors. `bd` is unused.
+static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+  // Column pass, one 8-wide strip at a time.
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+  int16x8_t strip[16];
+  int16x4_t transposed[16];
+  for (int s = 0; s < 2; ++s) {
+    col_txfm(input + 8 * s, strip, stride, 13);
+    shift_right_1_round_s16_x8(strip, strip, 4);
+    transpose_arrays_s16_8x4(strip, transposed + 8 * s);
+  }
+
+  // Row pass; for left/right-flipped types all sixteen vectors are reversed
+  // into a scratch buffer first.
+  int16x4_t flipped[16];
+  const int16x4_t *row_in = transposed;
+  if (lr_flip) {
+    flip_buf_4_neon(transposed, flipped, 16);
+    row_in = flipped;
+  }
+  row_txfm4x16_arr[tx_type](row_in, output, 4, 13);
+}
+
+// 16x8 forward 2-D transform: the block is processed as two 8-column
+// strips (8-point column pass, rounding right shift, 8x8 transpose each),
+// then a single 16-point rect row pass. `bd` is unused.
+static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column pass, one 8-wide strip at a time.
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  int16x8_t work[16], transposed[16];
+  for (int s = 0; s < 2; ++s) {
+    col_txfm(input + 8 * s, work, stride, 13);
+    shift_right_2_round_s16_x8(work, work, 8);
+    transpose_arrays_s16_8x8(work, transposed + 8 * s);
+  }
+
+  // Row pass. `work` is free after the column loop and is reused as the
+  // flip scratch buffer.
+  const int16x8_t *row_in = transposed;
+  if (lr_flip) {
+    flip_buf_8_neon(transposed, work, 16);
+    row_in = work;
+  }
+  row_rect_txfm8x16_arr[tx_type](row_in, output, 8, 13);
+}
+
+// 16x16 forward 2-D transform: two 8-column strips go through the 16-point
+// column pass, a rounding right shift and two 8x8 transposes each; the two
+// resulting groups of 16 vectors then go through the 16-point row pass.
+// `bd` is unused.
+static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass. Tile `t` of strip `s` lands at transposed[16 * t + 8 * s].
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  int16x8_t work[16], transposed[32];
+  for (int s = 0; s < 2; ++s) {
+    col_txfm(input + 8 * s, work, stride, 13);
+    shift_right_2_round_s16_x8(work, work, 16);
+    for (int t = 0; t < 2; ++t) {
+      transpose_arrays_s16_8x8(work + 8 * t, transposed + 16 * t + 8 * s);
+    }
+  }
+
+  // Row pass: one call per group of 16 vectors. `work` is reused as the
+  // flip scratch buffer.
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+  for (int g = 0; g < 2; ++g) {
+    const int16x8_t *row_in = transposed + 16 * g;
+    if (lr_flip) {
+      flip_buf_8_neon(transposed + 16 * g, work, 16);
+      row_in = work;
+    }
+    row_txfm(row_in, output + 8 * g, 16, 12);
+  }
+}
+
+// 16x32 forward 2-D transform: two 8-column strips go through the 32-point
+// column pass, a rounding right shift and four 8x8 transposes each; the
+// four resulting groups of 16 vectors then go through the 16-point rect row
+// pass. Transform types with no NEON kernel in the dispatch tables are
+// handed to av1_fwd_txfm2d_16x32_c. `bd` is only used by that fallback.
+static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+
+  // Column pass. Tile `t` of strip `s` lands at transposed[16 * t + 8 * s].
+  int16x8_t work[32], transposed[64];
+  for (int s = 0; s < 2; ++s) {
+    col_txfm(input + 8 * s, work, stride, 12);
+    shift_right_4_round_s16_x8(work, work, 32);
+    for (int t = 0; t < 4; ++t) {
+      transpose_arrays_s16_8x8(work + 8 * t, transposed + 16 * t + 8 * s);
+    }
+  }
+
+  // Row pass: one call per group of 16 vectors. `work` is reused as the
+  // flip scratch buffer.
+  for (int g = 0; g < 4; ++g) {
+    const int16x8_t *row_in = transposed + 16 * g;
+    if (lr_flip) {
+      flip_buf_8_neon(transposed + 16 * g, work, 16);
+      row_in = work;
+    }
+    row_txfm(row_in, output + 8 * g, 32, 13);
+  }
+}
+
+// 32x8 forward 2-D transform: four 8-column strips go through the 8-point
+// column pass, a rounding right shift and an 8x8 transpose each; the
+// resulting 32 vectors then go through a single 32-point row pass.
+//   input/stride - source int16_t samples and their row stride
+//   output       - int32_t coefficients
+//   tx_type      - selects the column/row kernels from the dispatch tables
+//   bd           - only forwarded to the C fallback
+//
+// row_txfm8x32_arr holds NULL for every tx_type that has no 32-point row
+// kernel; those types are handed to the C implementation. The fallback must
+// be the 32x8 routine: this block previously called the 32x16 one, which is
+// the transform for a different block size.
+// NOTE(review): av1_fwd_txfm2d_32x8_c is assumed to be declared alongside
+// the other av1_fwd_txfm2d_*_c fallbacks used in this file -- confirm.
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t buf0[32], buf1[32];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column pass, one 8-wide strip at a time; strip i lands at buf1[8 * i].
+  for (int i = 0; i < 4; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 8);
+    transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
+  }
+
+  // Row pass. buf0 is free after the column loop and is reused as the flip
+  // scratch buffer.
+  if (lr_flip) {
+    flip_buf_8_neon(buf1, buf0, 32);
+    row_txfm(buf0, output, 8, 12);
+  } else {
+    row_txfm(buf1, output, 8, 12);
+  }
+}
+
+// 32x16 forward 2-D transform: four 8-column strips go through the 16-point
+// column pass, a rounding right shift and two 8x8 transposes each; the two
+// resulting groups of 32 vectors then go through the 32-point rect row
+// pass. Transform types with no NEON kernel in the dispatch tables are
+// handed to av1_fwd_txfm2d_32x16_c. `bd` is only used by that fallback.
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
+
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass. Tile `t` of strip `s` lands at transposed[32 * t + 8 * s].
+  int16x8_t work[32], transposed[64];
+  for (int s = 0; s < 4; ++s) {
+    col_txfm(input + 8 * s, work, stride, 13);
+    shift_right_4_round_s16_x8(work, work, 16);
+    for (int t = 0; t < 2; ++t) {
+      transpose_arrays_s16_8x8(work + 8 * t, transposed + 32 * t + 8 * s);
+    }
+  }
+
+  // Row pass: one call per group of 32 vectors. `work` is reused as the
+  // flip scratch buffer.
+  for (int g = 0; g < 2; ++g) {
+    const int16x8_t *row_in = transposed + 32 * g;
+    if (lr_flip) {
+      flip_buf_8_neon(transposed + 32 * g, work, 32);
+      row_in = work;
+    }
+    row_txfm(row_in, output + 8 * g, 16, 13);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a 32x32 block (Neon).
+//
+// Kernels come from col_txfm8x32_arr / row_txfm8x32_arr, indexed by tx_type.
+// When either entry is NULL the C implementation is used instead.
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int16x8_t col_out[32], rows[128];
+  const col_transform_1d_lbd_8_neon col_fn = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_fn = row_txfm8x32_arr[tx_type];
+
+  if (col_fn == NULL || row_fn == NULL) {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+
+  // Column pass: each 8-column strip produces four 8x8 tiles, which are
+  // transposed into four consecutive 32-vector groups of `rows`.
+  for (int strip = 0; strip < 4; ++strip) {
+    col_fn(input + 8 * strip, col_out, stride, 12);
+    shift_right_4_round_s16_x8(col_out, col_out, 32);
+    for (int tile = 0; tile < 4; ++tile) {
+      transpose_arrays_s16_8x8(col_out + 8 * tile,
+                               rows + 32 * tile + 8 * strip);
+    }
+  }
+
+  // Row pass over the four groups, mirrored first when lr_flip is set.
+  for (int group = 0; group < 4; ++group) {
+    int16x8_t *src = rows + 32 * group;
+    if (lr_flip) {
+      flip_buf_8_neon(src, col_out, 32);
+      src = col_out;
+    }
+    row_fn(src, output + 8 * group, 32, 12);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a 64x16 block (Neon), DCT_DCT only.
+//
+// Columns go through fdct8x16_neon, rows through fdct8x64_neon. The row pass
+// stores 32 entries per group and the remaining 16 * 32 output coefficients
+// are cleared to zero.
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  int16x8_t col_buf[64], row_buf[128];
+
+  // Column pass over eight 8-column strips.
+  for (int strip = 0; strip < 8; ++strip) {
+    load_buffer_s16_x8(input + 8 * strip, stride, col_buf, 16);
+    shift_left_2_s16_x8(col_buf, col_buf, 16);
+    fdct8x16_neon(col_buf, col_buf, 13);
+    shift_right_4_round_s16_x8(col_buf, col_buf, 16);
+    transpose_arrays_s16_8x8(col_buf, row_buf + 8 * strip);
+    transpose_arrays_s16_8x8(col_buf + 8, row_buf + 64 + 8 * strip);
+  }
+
+  // Row pass over the two 64-vector groups, transformed in place.
+  for (int group = 0; group < 2; ++group) {
+    int16x8_t *const rows = row_buf + 64 * group;
+    fdct8x64_neon(rows, rows, 12);
+    store_buffer_s16_x8(rows, output + 8 * group, 16, 32);
+  }
+
+  // Clear the second half of the output (16 * 32 coefficients).
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+// Low-bitdepth forward 2D transform for a 16x64 block (Neon), DCT_DCT only.
+//
+// Columns go through fdct8x64_neon, rows through fdct8x16_neon. All eight
+// 8x8 tiles of each column strip are transposed, but only the first four
+// 16-vector groups are row-transformed and stored.
+static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  int16x8_t col_buf[64], row_buf[128];
+
+  // Column pass over the two 8-column strips.
+  for (int strip = 0; strip < 2; ++strip) {
+    load_buffer_s16_x8(input + 8 * strip, stride, col_buf, 64);
+    fdct8x64_neon(col_buf, col_buf, 13);
+    shift_right_2_round_s16_x8(col_buf, col_buf, 64);
+    for (int tile = 0; tile < 8; ++tile) {
+      transpose_arrays_s16_8x8(col_buf + 8 * tile,
+                               row_buf + 16 * tile + 8 * strip);
+    }
+  }
+
+  // Row pass, in place, over the first four groups.
+  for (int group = 0; group < 4; ++group) {
+    int16x8_t *const rows = row_buf + 16 * group;
+    fdct8x16_neon(rows, rows, 12);
+    store_buffer_s16_x8(rows, output + 8 * group, 32, 16);
+  }
+}
+
+// 32-point 1D forward DCT butterfly network on vectors of four 32-bit lanes.
+//
+// input:   32 vectors, consumed by the stage-1 butterfly.
+// output:  32 vectors, written only in the final stage (stage 9).
+// cos_bit: forwarded to cospi_arr_q13() to select the cosine table.
+//
+// The stages ping-pong between the local buffers buf0 and buf1; `output` is
+// not touched until every intermediate value has been computed.
+// NOTE(review): the exact lane semantics of the butterfly_* helpers (which
+// table lane each "_0112"/"_1223"/"_1003"/"_0332" variant uses) are defined
+// elsewhere in this file - confirm there before changing any call.
+static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 8-lane load covers two consecutive 4-entry groups of the table; the
+ // variable names give the pair of cosine indices held by each vector.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ // Split every pair into its low and high 4-lane halves.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre_s32_x4(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre_s32_x4(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
+ &buf0[23]);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre_s32_x4(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
+ &buf1[11]);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre_s32_x4(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
+ buf0[7] = buf1[7];
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
+ &buf0[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
+ butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
+ &buf1[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
+ &buf0[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
+ &buf1[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
+ &buf1[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
+ &buf1[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
+ &buf1[12]);
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
+ &buf0[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
+ &buf0[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
+ &buf0[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
+ &buf0[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
+ &buf0[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
+ &buf0[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
+ &buf0[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
+ &buf0[24]);
+
+ // stage 9: output[k] takes buf0[r], where r is the 5-bit bit-reversal of k.
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+// 64-point 1D forward DCT butterfly network on vectors of four 32-bit lanes.
+//
+// input:   64 vectors, consumed by the stage-1 butterfly.
+// output:  only entries 0..31 are written (final stage); entries 32..63 of
+//          the 64-point result are never produced by this function.
+// cos_bit: forwarded to cospi_arr_q13() to select the cosine table.
+//
+// Every stage writes a fresh local array (x1..x10). Later stages read values
+// from the most recent array that computed a given index, which is why the
+// butterfly_dct_post_s32_x4 calls take two different source arrays.
+// `output` is written only after all stages are done, so it is not read back.
+// NOTE(review): the lane semantics of the butterfly_* helper variants are
+// defined elsewhere in this file - confirm there before changing any call.
+static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ // Each 8-lane load covers two consecutive 4-entry groups of the table; the
+ // variable names give the pair of cosine indices held by each vector.
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ // Split every pair into its low and high 4-lane halves.
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre_s32_x4(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre_s32_x4(x1, x2, 32);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre_s32_x4(x2, x3, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre_s32_x4(x3, x4, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre_s32_x4(x4, x5, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int32x4_t x7[64];
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int32x4_t x9[64];
+ butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
+ butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
+ butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
+ butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
+ butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
+ butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
+ butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
+ butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
+ butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
+ butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
+ butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
+ butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
+ butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
+ butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
+ butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
+ butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);
+
+ // stage 11, only store into the low 32 output indices.
+ // output[k] takes index r = 6-bit bit-reversal of k, read from whichever
+ // stage array (x6..x10) last computed that index.
+ output[0] = x6[0];
+ output[1] = x10[32];
+ output[2] = x9[16];
+ output[3] = x10[48];
+ output[4] = x8[8];
+ output[5] = x10[40];
+ output[6] = x9[24];
+ output[7] = x10[56];
+ output[8] = x7[4];
+ output[9] = x10[36];
+ output[10] = x9[20];
+ output[11] = x10[52];
+ output[12] = x8[12];
+ output[13] = x10[44];
+ output[14] = x9[28];
+ output[15] = x10[60];
+ output[16] = x6[2];
+ output[17] = x10[34];
+ output[18] = x9[18];
+ output[19] = x10[50];
+ output[20] = x8[10];
+ output[21] = x10[42];
+ output[22] = x9[26];
+ output[23] = x10[58];
+ output[24] = x7[6];
+ output[25] = x10[38];
+ output[26] = x9[22];
+ output[27] = x10[54];
+ output[28] = x8[14];
+ output[29] = x10[46];
+ output[30] = x9[30];
+ output[31] = x10[62];
+}
+
+// Low-bitdepth forward 2D transform for a 64x64 block (Neon), DCT_DCT only.
+//
+// Column pass: fdct8x64_neon on eight 8-column strips; only the first four
+// 8x8 tiles of each strip are kept and transposed into buf1.
+// Row pass: each 64-vector group is widened to 32 bits (low and high halves
+// separately), run through fdct64_neon, rounded, and stored interleaved.
+static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  // buf1 holds 4 groups of 64 vectors: the highest index touched below is
+  // 3 * 64 + 8 * 7 + 7 = 255, so 256 entries suffice (same as the 64x32
+  // variant, which uses identical indexing).
+  int16x8_t buf0[64], buf1[256];
+  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+  for (int i = 0; i < 8; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+    col_txfm(buf0, buf0, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 64);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+    }
+  }
+  for (int i = 0; i < 4; i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + 64 * i;
+    // Sign-extend the low lanes into bufA and the high lanes into bufB.
+    for (int j = 0; j < 64; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    // fdct64_neon only produces 32 outputs, hence the count of 32 below.
+    fdct64_neon(bufA, bufA, 10);
+    fdct64_neon(bufB, bufB, 10);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a 64x32 block (Neon), DCT_DCT only.
+//
+// Column pass: kernel from col_txfm8x32_arr on eight 8-column strips.
+// Row pass: each 64-vector group is widened to 32 bits, run through
+// fdct64_neon, rounded, passed through round_shift_sqrt2_s32_s32_4xn_neon
+// and stored interleaved.
+static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // Check the supported transform type before the table-selected column
+  // kernel is invoked, so a debug build fails on the assert rather than on a
+  // call through an unsupported table entry.
+  assert(tx_type == DCT_DCT);
+  int16x8_t buf0[64], buf1[256];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+
+  for (int i = 0; i < 8; i++) {
+    col_txfm(input + 8 * i, buf0, stride, 12);
+    shift_right_4_round_s16_x8(buf0, buf0, 32);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+    }
+  }
+  for (int i = 0; i < 4; i++) {
+    int32x4_t bufA[64];
+    int32x4_t bufB[64];
+    int16x8_t *buf = buf1 + 64 * i;
+    // Sign-extend the low lanes into bufA and the high lanes into bufB.
+    for (int j = 0; j < 64; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    // fdct64_neon only produces 32 outputs, hence the count of 32 below.
+    fdct64_neon(bufA, bufA, 11);
+    fdct64_neon(bufB, bufB, 11);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a 32x64 block (Neon), DCT_DCT only.
+//
+// Column pass: fdct8x64_neon on four 8-column strips; only the first four
+// 8x8 tiles of each strip are kept and transposed into buf1.
+// Row pass: each 32-vector group is widened to 32 bits, run through
+// fdct32_neon, rounded, passed through round_shift_sqrt2_s32_s32_4xn_neon
+// and stored interleaved.
+static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  // buf1 holds 4 groups of 32 vectors: the highest index touched below is
+  // 3 * 32 + 8 * 3 + 7 = 127, so 128 entries suffice (same as the 32x32
+  // variant, which uses identical indexing).
+  int16x8_t buf0[64], buf1[128];
+  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+  for (int i = 0; i < 4; i++) {
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+    col_txfm(buf0, buf0, 13);
+    shift_right_2_round_s16_x8(buf0, buf0, 64);
+    for (int j = 0; j < 4; ++j) {
+      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
+    }
+  }
+
+  for (int i = 0; i < 4; i++) {
+    int32x4_t bufA[32];
+    int32x4_t bufB[32];
+    int16x8_t *buf = buf1 + 32 * i;
+    // Sign-extend the low lanes into bufA and the high lanes into bufB.
+    for (int j = 0; j < 32; ++j) {
+      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+    }
+    fdct32_neon(bufA, bufA, 11);
+    fdct32_neon(bufB, bufB, 11);
+    shift_right_2_round_s32_x4(bufA, bufA, 32);
+    shift_right_2_round_s32_x4(bufB, bufB, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+    round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+  }
+}
+
+// Dispatch table of low-bitdepth forward 2D transforms, indexed by TX_SIZE.
+// The table is only ever read, so it is declared const.
+static const FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
+  lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
+  lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
+  lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
+  lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
+  lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
+  lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
+  lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
+  lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
+  lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
+  lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
+  lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
+  lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
+  lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
+  lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
+  lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
+  lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
+  lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
+  lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
+};
+
+// Entry point for the low-bitdepth forward transform.
+// Lossless 4x4 blocks are handed to the C implementation; every other case
+// is dispatched through lowbd_fwd_txfm_func_ls using txfm_param->tx_size.
+void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+    return;
+  }
+
+  lowbd_fwd_txfm_func_ls[txfm_param->tx_size](
+      src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
new file mode 100644
index 0000000000..11d3def16b
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/av1_quantize.h"
+
+// Quantizes and dequantizes four coefficients.
+// Loads 4 values from coeff_ptr, stores 4 quantized values to qcoeff_ptr and
+// 4 dequantized values to dqcoeff_ptr, and returns a per-lane mask (narrowed
+// to 16 bits) that is all-ones where the quantized magnitude is non-zero.
+static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    int32x4_t v_quant_s32,
+                                    int32x4_t v_dequant_s32,
+                                    int32x4_t v_round_s32, int log_scale) {
+  const int32x4_t coeff = vld1q_s32(coeff_ptr);
+  // All-ones (-1) in lanes holding a negative coefficient, zero elsewhere.
+  const int32x4_t sign =
+      vreinterpretq_s32_u32(vcltq_s32(coeff, vdupq_n_s32(0)));
+  const int32x4_t scale = vdupq_n_s32(log_scale);
+  const int32x4_t abs_coeff = vabsq_s32(coeff);
+
+  // Lanes pass when (abs_coeff << (1 + log_scale)) >= dequant.
+  const int32x4_t abs_scaled =
+      vshlq_s32(abs_coeff, vdupq_n_s32(1 + log_scale));
+  const uint32x4_t pass = vcgeq_s32(abs_scaled, v_dequant_s32);
+
+  // tmp = pass ? abs_coeff + round : 0
+  const int32x4_t rounded = vaddq_s32(abs_coeff, v_round_s32);
+  const int32x4_t tmp = vandq_s32(rounded, vreinterpretq_s32_u32(pass));
+
+  // abs_q = (tmp * quant) >> (16 - log_scale), computed as a doubling
+  // high-half multiply of (tmp << log_scale) with the pre-shifted quant.
+  const int32x4_t tmp_scaled = vshlq_s32(tmp, scale);
+  const int32x4_t abs_q = vqdmulhq_s32(tmp_scaled, v_quant_s32);
+
+  // abs_dq = (abs_q * dequant) >> log_scale; a negative vshlq_s32 count
+  // shifts right.
+  const int32x4_t product = vmulq_s32(abs_q, v_dequant_s32);
+  const int32x4_t abs_dq = vshlq_s32(product, vnegq_s32(scale));
+
+  // Restore the sign: (x ^ sign) - sign.
+  const int32x4_t q = vsubq_s32(veorq_s32(abs_q, sign), sign);
+  const int32x4_t dq = vsubq_s32(veorq_s32(abs_dq, sign), sign);
+
+  vst1q_s32(qcoeff_ptr, q);
+  vst1q_s32(dqcoeff_ptr, dq);
+
+  // Non-zero mask, used by the caller to locate the end of block.
+  return vmovn_u32(vcgtq_s32(abs_q, vdupq_n_s32(0)));
+}
+
+// Updates the per-lane running maximum of (iscan + 1) over the lanes selected
+// by v_mask. Lanes whose mask bits are clear contribute 0.
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t one = vdupq_n_s16(1);
+  const int16x8_t pos_plus1 = vaddq_s16(vld1q_s16(iscan), one);
+  // Selecting between a value and zero is a bitwise AND with the mask.
+  const int16x8_t candidate =
+      vandq_s16(pos_plus1, vreinterpretq_s16_u16(v_mask));
+  return vmaxq_s16(v_eobmax, candidate);
+}
+
+// Returns the maximum of the eight signed 16-bit lanes of v_eobmax, cast to
+// uint16_t.
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+  const int16_t max_lane = vmaxvq_s16(v_eobmax);
+  return (uint16_t)max_lane;
+#else
+  // 8 -> 4 lanes: max of the low and high halves.
+  int16x4_t m = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  // 4 -> 2 lanes: shift lanes 2..3 down onto lanes 0..1 and take the max.
+  m = vmax_s16(
+      m, vreinterpret_s16_s64(vshr_n_s64(vreinterpret_s64_s16(m), 32)));
+  // 2 -> 1 lane: shift lane 1 down onto lane 0 and take the max.
+  m = vmax_s16(
+      m, vreinterpret_s16_s64(vshr_n_s64(vreinterpret_s64_s16(m), 16)));
+  return (uint16_t)vget_lane_s16(m, 0);
+#endif
+}
+
+// High-bitdepth "fp" quantizer (Neon).
+//
+// Quantizes `count` coefficients from coeff_ptr into qcoeff_ptr/dqcoeff_ptr,
+// eight at a time, and writes the end-of-block position to *eob_ptr.
+// scan, zbin_ptr and quant_shift_ptr are unused. Lane 0 of the round/quant/
+// dequant tables is applied to the first coefficient only; lane 1 is used for
+// every other coefficient.
+// NOTE(review): the loop shape (first 8 handled up front, then a do/while)
+// presumably relies on count being a multiple of 8 and at least 16 - confirm
+// against the callers.
+void av1_highbd_quantize_fp_neon(
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  const int16x4_t zero16 = vdup_n_s16(0);
+  const int16x4_t quant16 = vld1_s16(quant_ptr);
+  const int16x4_t dequant16 = vld1_s16(dequant_ptr);
+  const int16x4_t round_raw = vld1_s16(round_ptr);
+
+  // When log_scale > 0 the rounding offset is scaled down with a rounding
+  // doubling multiply by 2^(15 - log_scale); otherwise it is used unchanged.
+  const uint16x4_t use_scaled = vcgt_s16(vdup_n_s16(log_scale), zero16);
+  const int16x4_t round_scaled =
+      vqrdmulh_n_s16(round_raw, (int16_t)(1 << (15 - log_scale)));
+  const int16x4_t round16 = vbsl_s16(use_scaled, round_scaled, round_raw);
+
+  // Widen to 32 bits; quant is pre-shifted by 15 for the doubling multiply
+  // performed in quantize_4().
+  int32x4_t round32 = vmovl_s16(round16);
+  int32x4_t quant32 = vshlq_n_s32(vmovl_s16(quant16), 15);
+  int32x4_t dequant32 = vmovl_s16(dequant16);
+
+  int16x8_t eob_lanes = vdupq_n_s16(-1);
+  uint16x4_t nz_lo, nz_hi;
+
+  // First four coefficients: lane 0 carries the DC constants.
+  nz_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, quant32, dequant32,
+                     round32, log_scale);
+
+  // From here on every lane uses the AC constants (lane 1).
+  round32 = vdupq_lane_s32(vget_low_s32(round32), 1);
+  quant32 = vdupq_lane_s32(vget_low_s32(quant32), 1);
+  dequant32 = vdupq_lane_s32(vget_low_s32(dequant32), 1);
+
+  nz_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, quant32,
+                     dequant32, round32, log_scale);
+  eob_lanes = get_max_lane_eob(iscan, eob_lanes, vcombine_u16(nz_lo, nz_hi));
+
+  // Remaining coefficients, eight per iteration.
+  count -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+
+    nz_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, quant32, dequant32,
+                       round32, log_scale);
+    nz_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                       quant32, dequant32, round32, log_scale);
+    eob_lanes =
+        get_max_lane_eob(iscan, eob_lanes, vcombine_u16(nz_lo, nz_hi));
+
+    count -= 8;
+  } while (count);
+
+  *eob_ptr = get_max_eob(eob_lanes);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
new file mode 100644
index 0000000000..d13cc65ae0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
+  // Returns { a0^2 + a1^2, a2^2 + a3^2, a4^2 + a5^2, a6^2 + a7^2 } as 32 bit.
+  const int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+  const int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+#if AOM_ARCH_AARCH64
+  return vpaddq_s32(sq_lo, sq_hi);
+#else
+  return vcombine_s32(vpadd_s32(vget_low_s32(sq_lo), vget_high_s32(sq_lo)),
+                      vpadd_s32(vget_low_s32(sq_hi), vget_high_s32(sq_hi)));
+#endif
+}
+
+void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  // Broadcast every centroid across a full vector once, up front.
+  int16x8_t centroid_vec[PALETTE_MAX_SIZE];
+  for (int c = 0; c < k; ++c) centroid_vec[c] = vdupq_n_s16(centroids[c]);
+
+  // Sum of squared distances; only accumulated when total_dist is non-NULL.
+  int64x2_t dist_acc = vdupq_n_s64(0);
+
+  // Each iteration classifies 8 samples.
+  for (int pos = 0; pos < n; pos += 8) {
+    const int16x8_t px = vld1q_s16(data + pos);
+    // Start by assuming centroid 0 is the closest one.
+    uint16x8_t best_idx = vdupq_n_u16(0);
+    int16x8_t best_dist = vabdq_s16(px, centroid_vec[0]);
+
+    for (int c = 1; c < k; ++c) {
+      const int16x8_t cand = vabdq_s16(px, centroid_vec[c]);
+      // Lanes where this centroid is strictly closer switch to index c.
+      const uint16x8_t closer = vcgtq_s16(best_dist, cand);
+      best_idx = vbslq_u16(closer, vdupq_n_u16(c), best_idx);
+      best_dist = vminq_s16(best_dist, cand);
+    }
+
+    if (total_dist) {
+      // Square the 8 minimal distances into four 32-bit lanes (low half plus
+      // high half), then widen pairwise into the 64-bit accumulator.
+      const int16x4_t lo = vget_low_s16(best_dist);
+      const int16x4_t hi = vget_high_s16(best_dist);
+      dist_acc = vpadalq_s32(dist_acc, vmlal_s16(vmull_s16(lo, lo), hi, hi));
+    }
+
+    // Narrow the indices to 8 bits and store them.
+    vst1_u8(indices + pos, vmovn_u16(best_idx));
+  }
+
+  if (total_dist) {
+    *total_dist = horizontal_add_s64x2(dist_acc);
+  }
+}
+
+void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  // Replicate each (x, y) centroid four times so that it lines up with the
+  // four interleaved (x, y) points held in one input vector.
+  int16x8_t centroid_vec[PALETTE_MAX_SIZE];
+  for (int c = 0; c < k; ++c) {
+    const int16_t x = centroids[2 * c];
+    const int16_t y = centroids[2 * c + 1];
+    const int16_t xy_x4[8] = { x, y, x, y, x, y, x, y };
+    centroid_vec[c] = vld1q_s16(xy_x4);
+  }
+
+  int64x2_t dist_acc = vdupq_n_s64(0);
+
+  // Each outer iteration handles 8 points as two vectors of four points.
+  for (int pt = 0; pt < n; pt += 8) {
+    uint32x4_t best_idx[2];
+    for (int half = 0; half < 2; ++half) {
+      const int16x8_t xy = vld1q_s16(data);
+      data += 8;
+      // Seed the search with centroid 0.
+      uint32x4_t idx = vdupq_n_u32(0);
+      int32x4_t best_dist =
+          k_means_multiply_add_neon(vsubq_s16(xy, centroid_vec[0]));
+      for (int c = 1; c < k; ++c) {
+        const int32x4_t cand =
+            k_means_multiply_add_neon(vsubq_s16(xy, centroid_vec[c]));
+        // Strictly closer lanes take index c; ties keep the earlier centroid.
+        idx = vbslq_u32(vcgtq_s32(best_dist, cand), vdupq_n_u32(c), idx);
+        best_dist = vminq_s32(best_dist, cand);
+      }
+      best_idx[half] = idx;
+      // Widen pairwise to 64 bits and accumulate the squared distances.
+      if (total_dist) dist_acc = vpadalq_s32(dist_acc, best_dist);
+    }
+
+    // Narrow both index vectors to 8 bits and store all 8 indices at once.
+    const uint16x8_t idx16 =
+        vcombine_u16(vmovn_u32(best_idx[0]), vmovn_u32(best_idx[1]));
+    vst1_u8(indices + pt, vmovn_u16(idx16));
+  }
+
+  if (total_dist) *total_dist = horizontal_add_s64x2(dist_acc);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
new file mode 100644
index 0000000000..18cd0ce4c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB (sum of all 16 lanes).
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if AOM_ARCH_AARCH64
+  return vaddlvq_s8(v_sum_diff_total);
+#else
+  // Widen pairwise: 16 x s8 -> 8 x s16 -> 4 x s32 -> 2 x s64.
+  const int16x8_t sum_s16 = vpaddlq_s8(v_sum_diff_total);
+  const int32x4_t sum_s32 = vpaddlq_s16(sum_s16);
+  const int64x2_t sum_s64 = vpaddlq_s32(sum_s32);
+  const int64x1_t total =
+      vqadd_s64(vget_high_s64(sum_s64), vget_low_s64(sum_s64));
+  return vget_lane_s32(vreinterpret_s32_s64(total), 0);
+#endif
+}
+
+// Denoise 16 pixels: stores to running_avg_y, returns the updated sums.
+static INLINE int8x16_t denoiser_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+ const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+ const uint8x16_t v_delta_level_1_and_2,
+ const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Absolute difference; pos mask = sig < mc avg, neg mask = sig > mc avg. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+ /* A level applies in lanes where its threshold <= the absolute diff. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+ /* Cumulative adjustment: level 1 value plus the level 2 and 3 deltas. */
+ const uint8x16_t v_level2_adjustment =
+ vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment =
+ vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment =
+ vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment =
+ vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+ /* Select the adjustment magnitude: the absolute difference itself in
+ * level 0 (below the level 1 threshold), else the level 1, 2 and 3 value.
+ */
+ const uint8x16_t v_abs_adjustment =
+ vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. The sum (7 * 16) of adjustments below eight fits in a
+ * signed char. NOTE(review): level 1+2+3 may reach 8 (5 + 1 + 2); confirm.
+ */
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store the 16 denoised pixels. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ /* Add this row's signed adjustments (positive minus negative) to the
+ * running per-lane totals passed in by the caller.
+ */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+  // Correction pass over 16 pixels: move the stored running_avg_y by at most
+  // k_delta per pixel in the direction of sig relative to mc_running_avg_y,
+  // and fold the signed corrections into v_sum_diff_total.
+  const uint8x16_t src = vld1q_u8(sig);
+  const uint8x16_t mc_avg = vld1q_u8(mc_running_avg_y);
+  const uint8x16_t avg = vld1q_u8(running_avg_y);
+
+  // Correction magnitude: |sig - mc_avg| clamped to k_delta.
+  const uint8x16_t abs_diff = vabdq_u8(src, mc_avg);
+  const uint8x16_t magnitude = vminq_u8(abs_diff, k_delta);
+
+  // Split the magnitude by the sign of the difference.
+  const uint8x16_t lower_by = vandq_u8(vcltq_u8(src, mc_avg), magnitude);
+  const uint8x16_t raise_by = vandq_u8(vcgtq_u8(src, mc_avg), magnitude);
+
+  // Lanes with sig < mc_avg are lowered and lanes with sig > mc_avg are
+  // raised, both with unsigned saturation.
+  const uint8x16_t lowered = vqsubq_u8(avg, lower_by);
+  const uint8x16_t adjusted = vqaddq_u8(lowered, raise_by);
+
+  // Write the corrected pixels back in place.
+  vst1q_u8(running_avg_y, adjusted);
+
+  // Accumulate (raise_by - lower_by) per lane into the running totals.
+  const int8x16_t raise_s8 = vreinterpretq_s8_u8(raise_by);
+  const int8x16_t lower_s8 = vreinterpretq_s8_u8(lower_by);
+  const int8x16_t correction = vqsubq_s8(raise_s8, lower_s8);
+
+  return vaddq_s8(v_sum_diff_total, correction);
+}
+
+// Denoise 8x8 and 8x16 blocks, two rows per 16-lane vector pass.
+static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude,
+ int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+ // Thresholds and adjustment steps handed to denoiser_16x1_neon.
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_height = block_size_high[bs] >> 1; // number of row pairs
+
+ int8x16_t v_sum_diff_total = vdupq_n_s8(0); // per-lane adjustment sums
+ // First pass: gather two rows per iteration, denoise, scatter them back.
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ v_sum_diff_total = denoiser_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ v_level1_threshold, v_level2_threshold, v_level3_threshold,
+ v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total);
+ { // NOTE(review): low/high halves map to the two rows only if width == 8
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Step all three pointers down two rows.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ // Before returning to copy the block (i.e., apply no denoising),
+ // check if we can still apply some (weaker) temporal filtering to
+ // this block, that would otherwise not be denoised at all. Simplest
+ // is to apply an additional adjustment to running_avg_y to bring it
+ // closer to sig. The adjustment is capped by a maximum delta, and
+ // chosen such that in most cases the resulting sum_diff will be
+ // within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1); // back to row 0
+ for (r = 0; r < b_height; ++r) {
+ v_sum_diff_total = denoiser_adjust_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high =
+ vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low =
+ vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Step down two rows; sig and mc are read from the packed buffers.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ return FILTER_BLOCK;
+}
+
+// Denoise blocks from 16x8 up to 128x128, one 16-pixel vector at a time.
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4; // number of 16-pixel columns
+
+ int8x16_t v_sum_diff_total[8][8]; // indexed [16-pixel column][16-row band]
+ int r, c, sum_diff = 0;
+
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r] = vdupq_n_s8(0);
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+ sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+ v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+ v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total[c][r >> 4]);
+
+ // Advance to the next 16-pixel column.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ // Band complete (16 rows, or all 8 rows of a 16x8 block): fold it in.
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ // Rewind to the start of the row and step down one row.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) { // over budget: try a weaker pass
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vdupq_n_u8(delta);
+ sig -= sig_stride * b_height; // rewind all pointers to the first row
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0; // re-summed below from the updated per-lane totals
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] =
+ denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+ k_delta, v_sum_diff_total[c][r >> 4]);
+
+ // Advance to the next 16-pixel column.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+ // Rewind to the start of the row and step down one row.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // 16-wide and larger sizes use the NxM kernel, 8x8 / 8x16 the 8xN kernel.
+  const int wide = bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
+                   bs == BLOCK_32X16 || bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
+                   bs == BLOCK_64X32 || bs == BLOCK_64X64 || bs == BLOCK_64X128 ||
+                   bs == BLOCK_128X64 || bs == BLOCK_128X128;
+  if (wide)
+    return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude);
+  if (bs == BLOCK_8X8 || bs == BLOCK_8X16)
+    return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude, 8);
+  return COPY_BLOCK;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/cnn_neon.c b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
new file mode 100644
index 0000000000..8e686260d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_cnn_weights.h"
+
+// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are
+// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in
+// partition_cnn_weights.h. However, to enable linear memory access, rearrange
+// the weight tables here.
+static const float weights_layer_1[] = {
+ 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f,
+ -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f,
+ 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f,
+ -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f,
+ 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f,
+ 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f,
+ 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f,
+ -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f,
+ -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f,
+ 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f,
+ -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f,
+ 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f,
+ 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f,
+ -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f,
+ 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f,
+ -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f,
+ -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f,
+ -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f,
+ -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f,
+ -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f,
+ 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f,
+ 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f,
+ -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f,
+ -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f,
+ -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f,
+ 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f,
+ 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f,
+ 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f,
+ 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f,
+ 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f,
+ 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f,
+ 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f,
+ 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f,
+ 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f,
+ 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f,
+ 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f,
+ 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f,
+ 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f,
+ 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f,
+ 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f,
+ 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f,
+ 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f,
+ -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f,
+ 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f,
+ 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f,
+ 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f,
+ 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f,
+ 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f,
+ 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f,
+ -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f,
+ 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f,
+ -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f,
+ -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f,
+ 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f,
+ 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f,
+ -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f,
+ -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f,
+ -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f,
+ 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f,
+ -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f,
+ 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f,
+ -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f,
+ 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f,
+ -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f,
+ 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f,
+ -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f,
+ 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f,
+ -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f,
+ -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f,
+ 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f,
+ 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f,
+ 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f,
+ 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f,
+ -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f,
+ 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f,
+ 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f,
+ 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f,
+ -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f,
+ 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f,
+ 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f,
+ 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f,
+ 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f,
+ 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f,
+ 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f,
+ -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f,
+ 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f,
+ 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f,
+ 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f,
+ 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f,
+ -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f,
+ -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f,
+ 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f,
+ -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f,
+ -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f,
+ 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f,
+ 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f,
+ 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f,
+ 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f,
+ 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f,
+ -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f,
+ 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f,
+ -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f,
+ -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f,
+ -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f,
+ 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f,
+ 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f,
+ 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f,
+ -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f,
+ -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f,
+ -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f,
+ 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f,
+ 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f,
+ -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f,
+ -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f,
+ 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f,
+ 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f,
+ 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f,
+ 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f,
+ 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f,
+ -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f,
+ 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f,
+ -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f,
+ 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f,
+ -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f,
+ -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f,
+ 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f,
+ -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f,
+ 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f,
+ -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f,
+ -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f,
+ -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f,
+ -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f,
+ -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f,
+ 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f,
+ -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f,
+ 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f,
+ -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f,
+ 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f,
+ -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f,
+ -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f,
+ -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f,
+ -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f,
+ -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f,
+ 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f,
+ -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f,
+ 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f,
+ -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f,
+ -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f,
+ 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f,
+ 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f,
+ -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f,
+ -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f,
+ 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f,
+ -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f,
+ -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f,
+ 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f,
+ -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f,
+ -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f,
+ -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f,
+ 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f,
+ -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f,
+ -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f,
+ 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f,
+ 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f,
+ -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f,
+ -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f,
+ -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f,
+ -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f,
+ -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f,
+ 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f,
+ -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f,
+ 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f,
+ -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f,
+ 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f,
+ -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f,
+ 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f,
+ -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f,
+ 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f,
+ 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f,
+ 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f,
+ 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f,
+ 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f,
+ 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f,
+ 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f,
+ 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f,
+ 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f,
+ 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f,
+ -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f,
+ 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f,
+ 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f,
+ -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f,
+ 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f,
+ 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f,
+ -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f,
+ -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f,
+ -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f,
+ -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f,
+ 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f,
+ -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f,
+ 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f,
+ 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f,
+ 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f,
+ 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f,
+ 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f,
+ -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f,
+ 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f,
+ -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f,
+ 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f,
+ 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f,
+ 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f,
+ 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f,
+ -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f,
+ -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f,
+ -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f,
+ -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f,
+ -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f,
+ 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f,
+ 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f,
+ -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f,
+ 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f,
+ -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f,
+ -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f,
+ 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f,
+ 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f,
+ -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f,
+ 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f,
+ -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f,
+ 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f,
+ -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f,
+ 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f,
+ 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f,
+ 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f,
+ 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f,
+ 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f,
+ 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f,
+ 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f,
+ 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f,
+ -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f,
+ -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f,
+ -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f,
+ -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f,
+ 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f,
+ -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f,
+ -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f,
+ 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f,
+ -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f,
+ 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f,
+ 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f,
+ -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f,
+ 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f,
+ 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f,
+ -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f,
+ 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f,
+ -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f,
+ 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f,
+ -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f,
+ 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f,
+ -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f,
+ -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f,
+ -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f,
+ -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f,
+ -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f,
+ 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f,
+ 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f,
+ -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f,
+ 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f,
+ 0.018088f, 0.115791f, -0.079165f, 0.139388f,
+};
+
+static const float weights_layer_2[] = {
+ 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f,
+ 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f,
+ 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f,
+ -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f,
+ -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f,
+ -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f,
+ 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f,
+ -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f,
+ 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f,
+ 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f,
+ 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f,
+ -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f,
+ 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f,
+ 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f,
+ 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f,
+ -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f,
+ -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f,
+ 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f,
+ 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f,
+ 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f,
+ 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f,
+ 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f,
+ 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f,
+ -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f,
+ 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f,
+ 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f,
+ 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f,
+ 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f,
+ -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f,
+ 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f,
+ 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f,
+ -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f,
+ -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f,
+ 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f,
+ -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f,
+ -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f,
+ 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f,
+ 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f,
+ 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f,
+ -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f,
+ -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f,
+ 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f,
+ -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f,
+ 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f,
+ 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f,
+ -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f,
+ -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f,
+ -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f,
+ -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f,
+ -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f,
+ -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f,
+ -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f,
+ -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f,
+ -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f,
+ 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f,
+ 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f,
+ 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f,
+ -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f,
+ 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f,
+ -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f,
+ 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f,
+ -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f,
+ 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f,
+ -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f,
+ -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f,
+ -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f,
+ 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f,
+ -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f,
+ 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f,
+ -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f,
+ -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f,
+ -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f,
+ -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f,
+ -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f,
+ -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f,
+ -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f,
+ 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f,
+ -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f,
+ -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f,
+ -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f,
+ 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f,
+ 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f,
+ 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f,
+ 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f,
+ 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f,
+ -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f,
+ 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f,
+ 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f,
+ 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f,
+ 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f,
+ 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f,
+ 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f,
+ 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f,
+ -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f,
+ -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f,
+ -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f,
+ 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f,
+ -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f,
+ 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f,
+ -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f,
+ -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f,
+ -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f,
+ 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f,
+ -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f,
+ -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f,
+ 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f,
+ -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f,
+ -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f,
+ 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f,
+ -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f,
+ -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f,
+ 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f,
+ -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f,
+ -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f,
+ -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f,
+ -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f,
+ 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f,
+ -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f,
+ 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f,
+ -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f,
+ -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f,
+ -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f,
+ -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f,
+ -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f,
+ 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f,
+ -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f,
+ -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f,
+ 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f,
+ -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f,
+ -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f,
+ -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f,
+ -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f,
+ -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f,
+ 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f,
+ 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f,
+ -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f,
+ -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f,
+ -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f,
+ 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f,
+ -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f,
+ 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f,
+ -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f,
+ 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f,
+ -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f,
+ 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f,
+ 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f,
+ 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f,
+ 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f,
+ -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f,
+ 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f,
+ -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f,
+ -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f,
+ -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f,
+ 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f,
+ -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f,
+ 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f,
+ -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f,
+ 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f,
+ 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f,
+ 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f,
+ 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f,
+ 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f,
+ 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f,
+ 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f,
+ -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f,
+ -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f,
+ 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f,
+ -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f,
+ 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f,
+ 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f,
+ 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f,
+ -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f,
+ 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f,
+ 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f,
+ -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f,
+ 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f,
+ 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f,
+ -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f,
+ -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f,
+ -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f,
+ 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f,
+ -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f,
+ 0.089161f, 0.355811f, -0.078245f, -0.148490f, -0.407301f, -1.296870f,
+ -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f,
+ 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f,
+ -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f,
+ -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f,
+ 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f,
+ 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f,
+ 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f,
+ 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f,
+ -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f,
+ 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f,
+ 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f,
+ -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f,
+ 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f,
+ -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f,
+ 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f,
+ -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f,
+ 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f,
+ 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f,
+ 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f,
+ 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f,
+ -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f,
+ 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f,
+ -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f,
+ 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f,
+ 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f,
+ 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f,
+ -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f,
+ 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f,
+ -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f,
+ 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f,
+ -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f,
+ -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f,
+ 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f,
+ -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f,
+ -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f,
+ 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f,
+ -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f,
+ 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f,
+ 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f,
+ -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f,
+ 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f,
+ -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f,
+ 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f,
+ -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f,
+ 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f,
+ -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f,
+ 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f,
+ -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f,
+ 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f,
+ -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f,
+ 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f,
+ -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f,
+ -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f,
+ -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f,
+ 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f,
+ 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f,
+ 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f,
+ -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f,
+ -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f,
+ -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f,
+ -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f,
+ -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f,
+ 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f,
+ -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f,
+ -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f,
+ 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f,
+ 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f,
+ -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f,
+ 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f,
+ -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f,
+ -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f,
+ -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f,
+ 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f,
+ 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f,
+ 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f,
+ 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f,
+ 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f,
+ 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f,
+ 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f,
+ 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f,
+ 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f,
+ -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f,
+ -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f,
+ -0.212384f, -0.229157f, -0.283428f, -0.184891f,
+};
+
+static const float weights_layer_3[] = {
+ -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f,
+ 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f,
+ -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f,
+ 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f,
+ -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f,
+ -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f,
+ -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f,
+ -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f,
+ -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f,
+ 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f,
+ -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f,
+ -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f,
+ 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f,
+ -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f,
+ 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f,
+ -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f,
+ 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f,
+ -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f,
+ -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f,
+ 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f,
+ 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f,
+ 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f,
+ 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f,
+ -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f,
+ -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f,
+ -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f,
+ -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f,
+ -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f,
+ -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f,
+ 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f,
+ 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f,
+ 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f,
+ 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f,
+ -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f,
+ 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f,
+ -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f,
+ 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f,
+ 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f,
+ 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f,
+ -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f,
+ -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f,
+ 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f,
+ 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f,
+ -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f,
+ 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f,
+ -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f,
+ -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f,
+ 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f,
+ 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f,
+ 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f,
+ -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f,
+ -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f,
+ -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f,
+ -0.071383f, -0.075005f,
+};
+
+static const float weights_layer_4[] = {
+ -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f,
+ -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f,
+ 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f,
+ -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f,
+ -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f,
+ -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f,
+ -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f,
+ -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f,
+ -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f,
+ -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f,
+ 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f,
+ -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f,
+ -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f,
+ -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f,
+ -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f,
+ -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f,
+ -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f,
+ -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f,
+ 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f,
+ 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f,
+ -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f,
+ -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f,
+ -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f,
+ -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f,
+ -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f,
+ -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f,
+ 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f,
+ 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f,
+ 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f,
+ 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f,
+ 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f,
+ -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f,
+ 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f,
+ 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f,
+ -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f,
+ -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f,
+ 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f,
+ 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f,
+ -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f,
+ 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f,
+ -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f,
+ 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f,
+ -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f,
+ 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f,
+ 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f,
+ 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f,
+ -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f,
+ -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f,
+ -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f,
+ 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f,
+ -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f,
+ -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f,
+ 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f,
+ -0.421885f, -0.293573f,
+};
+
+static const float weights_layer_5[] = {
+ 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f,
+ 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f,
+ 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f,
+ 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f,
+ -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f,
+ 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f,
+ -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f,
+ 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f,
+ 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f,
+ -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f,
+ -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f,
+ 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f,
+ 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f,
+ -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f,
+ -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f,
+ 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f,
+ -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f,
+ -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f,
+ 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f,
+ 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f,
+ -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f,
+ -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f,
+ -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f,
+ 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f,
+ 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f,
+ -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f,
+ 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f,
+ -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f,
+ -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f,
+ -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f,
+ -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f,
+ -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f,
+ -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f,
+ -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f,
+ -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f,
+ 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f,
+ -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f,
+ -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f,
+ -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f,
+ 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f,
+ -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f,
+ 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f,
+ -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f,
+ -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f,
+ 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f,
+ 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f,
+ 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f,
+ -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f,
+ -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f,
+ 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f,
+ 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f,
+ 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f,
+ 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f,
+ 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f,
+ 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f,
+ -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f,
+ 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f,
+ 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f,
+ -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f,
+ -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f,
+ -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f,
+ -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f,
+ 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f,
+ 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f,
+ 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f,
+ -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f,
+ -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f,
+ -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f,
+ 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f,
+ 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f,
+ -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f,
+ 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f,
+ 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f,
+ -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f,
+ -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f,
+ 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f,
+ 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f,
+ -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f,
+ -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f,
+ -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f,
+ -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f,
+ -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f,
+ -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f,
+ 0.565984f, 0.592690f,
+};
+
+// Sums four float32x4 vectors lane-wise as (a[0] + a[1]) + (a[2] + a[3]).
+static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) {
+  const float32x4_t pair_hi = vaddq_f32(a[2], a[3]);
+  return vaddq_f32(vaddq_f32(a[0], a[1]), pair_hi);
+}
+// 2x2, stride-2 "valid" convolution for in_width >= 16: 8 outputs per step.
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width >= 16);
+ const int in_size = in_height * in_width;  // used as the per-channel step
+ // NOTE(review): plane step uses in_width, not in_stride - confirm intended.
+ do {  // one output channel per pass: next bias, output plane and weights
+ const float32x4_t bias_v = vdupq_n_f32(bias[0]);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;  // only the first input pointer is read
+ float *out_ptr0 = *output;
+ int h = 0;
+ // Row loop: h advances by skip_height input rows per output row.
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+ // Column loop: each pass reads 16 input columns and writes 8 outputs.
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+ float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+ // Channel loop: two input channels (8 weights) are accumulated per pass.
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+ // vld2q splits even/odd columns: val[0] meets tap 0, val[1] meets tap 1.
+ const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi_0 =
+ vld2q_f32(in_ptr2 + in_size + in_stride);
+ // sum0[0..3]: ch k row 0, ch k row 1, ch k+1 row 0, ch k+1 row 1.
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0);
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1);
+
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0);
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1);
+
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0);
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1);
+
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0);
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1);
+ // Same taps for the next 8 input columns; results go to sum1[].
+ const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8);
+ const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8);
+ const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8);
+ const float32x4x2_t in1_hi_1 =
+ vld2q_f32(in_ptr2 + in_size + in_stride + 8);
+
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0);
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1);
+
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0);
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1);
+
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0);
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1);
+
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0);
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1);
+ // Step to the next pair of input channels and their 8 weights.
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+ // Add the four partial sums (bias was seeded in element 0) and store.
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum0));
+ vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1));
+
+ out_ptr1 += 8;
+ in_ptr1 += 8 * skip_width;
+ w += 8 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+// 2x2, stride-2 "valid" convolution for in_width == 8: 4 outputs per step.
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width == 8);
+ const int in_size = in_height * in_width;  // used as the per-channel step
+ do {  // one output channel per pass: next bias, output plane and weights
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;  // only the first input pointer is read
+ float *out_ptr0 = *output;
+ int h = 0;
+ // Row loop: h advances by skip_height input rows per output row.
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+ // Column loop: each pass reads 8 input columns and writes 4 outputs.
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+ // Channel loop: two input channels (8 weights) are accumulated per pass.
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+ // vld2q splits even/odd columns: val[0] meets tap 0, val[1] meets tap 1.
+ const float32x4x2_t in0_lo = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride);
+ // sum[0..3]: ch k row 0, ch k row 1, ch k+1 row 0, ch k+1 row 1.
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1);
+
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1);
+
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0);
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1);
+
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0);
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1);
+ // Step to the next pair of input channels and their 8 weights.
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+ // Add the four partial sums (bias was seeded in element 0) and store.
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+// 5x5, stride-4 "valid" convolution of one input channel: 4 outputs per step.
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 5 && filter_width == 5);
+ assert(skip_width == 4 && skip_height == 4);
+ assert(in_width >= 16);
+ assert(in_channels == 1);
+ (void)in_channels;  // only referenced by the assert above
+ // One pass of the outer loop produces one output channel (25 weights).
+ do {
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *in_ptr0 = *input;
+ const float *weights_ptr0 = weights;
+ float *out_ptr0 = *output;
+ int h = 0;
+ // Row loop: h advances by skip_height input rows per output row.
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+ // Column loop: each pass reads 17 input columns and writes 4 outputs.
+ do {
+ float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) };
+ // Weights 0..23 as six vectors; weight 24 is read separately below.
+ const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0);
+ const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4);
+ const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8);
+ const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12);
+ const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16);
+ const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20);
+ // Split each vector into 2-lane halves for vmlaq_lane_f32.
+ const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3);
+ const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3);
+ const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7);
+ const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7);
+ const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11);
+ const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11);
+ const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15);
+ const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15);
+ const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19);
+ const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19);
+ const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23);
+ const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23);
+ // vld4q de-interleaves by 4: val[t] holds columns t, t+4, t+8, t+12.
+ const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride);
+ const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride);
+ const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride);
+ const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride);
+ const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride);
+ // Fifth tap: columns 4, 8, 12, 16 (val[0] shifted one lane + column 16).
+ const float32x4_t in0_4 = vextq_f32(
+ in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1);
+ const float32x4_t in1_4 = vextq_f32(
+ in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1);
+ const float32x4_t in2_4 = vextq_f32(
+ in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1);
+ const float32x4_t in3_4 = vextq_f32(
+ in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1);
+ const float32x4_t in4_4 = vextq_f32(
+ in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1);
+ // Taps alternate between sum[0] and sum[1]; the two are added at the end.
+ // Kernel row 0.
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0);
+
+ // Kernel row 1.
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1);
+
+ // Kernel row 2.
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0);
+
+ // Kernel row 3.
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1);
+
+ // Kernel row 4.
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1);
+ sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4);
+ // Combine both accumulators (bias was seeded in sum[0]); store 4 outputs.
+ vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1]));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++output;
+ ++bias;
+ weights += 25;  // 5x5 taps per output channel
+ } while (++start_idx < out_channels);
+}
+
+// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// As per the current encoder, av1_cnn_convolve function gets called for
+// block size equal to 64x64. av1_cnn_convolve() uses layer config values
+// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few
+// details related to each layer's config parameters.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+// 0 64x64 16x16 5 5 4 4
+// 1 16x16 8x8 2 2 2 2
+// 2 8x8 4x4 2 2 2 2
+// 3 4x4 2x2 2 2 2 2
+// 4 2x2 1x1 2 2 2 2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
+void av1_cnn_convolve_no_maxpool_padding_valid_neon(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+         !layer_config->maxpool);
+  assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+  assert(layer_config->pad == PADDING_VALID);
+  assert(channel_step == 1);
+  assert(cstep == layer_config->in_channels * layer_config->out_channels);
+
+  const int filter_w = layer_config->filter_width;
+  const int filter_h = layer_config->filter_height;
+  const int skip_w = layer_config->skip_width;
+  const int skip_h = layer_config->skip_height;
+  const int is_5x5_s4 =
+      filter_w == 5 && filter_h == 5 && skip_w == 4 && skip_h == 4;
+  const int is_2x2_s2 =
+      filter_w == 2 && filter_h == 2 && skip_w == 2 && skip_h == 2;
+  // 5x5 filter with stride 4: dedicated kernel fed from weights_layer_5.
+  if (is_5x5_s4) {
+    av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+        input, in_width, in_height, in_stride, layer_config->bias, skip_w,
+        skip_h, filter_w, filter_h, layer_config->in_channels,
+        layer_config->out_channels, output, out_stride, start_idx,
+        weights_layer_5);
+    return;
+  }
+  // No Neon kernel for this shape (not 2x2/stride-2, or width < 16 and != 8).
+  if (!is_2x2_s2 || (in_width < 16 && in_width != 8)) {
+    av1_cnn_convolve_no_maxpool_padding_valid_c(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+    return;
+  }
+  // Table choice: output_num matched against layers 2..4, else layer 1.
+  const float *weights = weights_layer_1;
+  if (layer_config->output_num ==
+      av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) {
+    weights = weights_layer_2;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[3]
+                 .output_num) {
+    weights = weights_layer_3;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[4]
+                 .output_num) {
+    weights = weights_layer_4;
+  }
+  if (in_width >= 16) {
+    av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+        input, in_width, in_height, in_stride, layer_config->bias, skip_w,
+        skip_h, filter_w, filter_h, layer_config->in_channels,
+        layer_config->out_channels, output, out_stride, start_idx, weights);
+  } else {  // in_width == 8 is the only case left
+    av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+        input, in_width, in_height, in_stride, layer_config->bias, skip_w,
+        skip_h, filter_w, filter_h, layer_config->in_channels,
+        layer_config->out_channels, output, out_stride, start_idx, weights);
+  }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
new file mode 100644
index 0000000000..582863a27c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+
+// Stores |coeff| (capped at 127 by the saturating 16-to-8-bit narrow) into
+// `levels` and zeroes its padding; rows are `height` long, `width` rows total.
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = height + TX_PAD_HOR;
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * width, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  const int32x4_t zeros = vdupq_n_s32(0);
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (height == 4) {  // two rows, each followed by 4 zero bytes, per pass
+    do {
+      const int32x4_t coeffA = vld1q_s32(cf);
+      const int32x4_t coeffB = vld1q_s32(cf + height);
+      const int16x8_t coeffAB =
+          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const int8x8_t absABs = vqmovn_s16(absAB);
+#if AOM_ARCH_AARCH64
+      const int8x16_t absAB8 =
+          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+      const int32x2x2_t absAB8 =
+          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+      const uint8x16_t lsAB =
+          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+      vst1q_u8(ls, lsAB);
+      ls += (stride << 1);
+      cf += (height << 1);
+    } while ((i += 2) < width);
+  } else if (height == 8) {  // one row per pass, 8 zero bytes stored after it
+    do {
+      const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+      const int16x8_t absAB = vqabsq_s16(coeffAB);
+      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+      vst1q_u8(ls, absAB8);
+      ls += stride;
+      cf += height;
+    } while (++i < width);
+  } else {  // rows are processed in 16-entry chunks
+    do {
+      int j = 0;
+      do {
+        const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+        const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
+        const int16x8_t absAB = vqabsq_s16(coeffAB);
+        const int16x8_t absCD = vqabsq_s16(coeffCD);
+        const uint8x16_t absABCD = vreinterpretq_u8_s8(
+            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+        vst1q_u8((ls + j), absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < height);
+      // Zero 4 bytes after the row; memset avoids a type-punned int32_t store.
+      memset(ls + height, 0, sizeof(int32_t));
+      ls += stride;
+    } while (++i < width);
+  }
+}
+
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+ { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+ { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010 \
+ (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+ ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (width == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (width < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (width > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+ { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+ { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// end of coefficients declaration area
+
+// Packs four 4-byte rows, byte_stride apart, into one 128-bit register.
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#if !AOM_ARCH_AARCH64
+  return load_unaligned_u8q(src, byte_stride);
+#else
+  uint32x4_t rows = vld1q_u32((uint32_t *)src);  // lanes 1..3 replaced below
+  rows = vld1q_lane_u32((uint32_t *)(src + byte_stride), rows, 1);
+  rows = vld1q_lane_u32((uint32_t *)(src + byte_stride * 2), rows, 2);
+  rows = vld1q_lane_u32((uint32_t *)(src + byte_stride * 3), rows, 3);
+  return vreinterpretq_u8_u32(rows);
+#endif
+}
+
+// Packs two 8-byte rows, byte_stride apart, into one 128-bit register.
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+  const uint8_t *const next_row = src + byte_stride;
+#if AOM_ARCH_AARCH64
+  // A 16-byte load supplies row 0 in lane 0; lane 1 is replaced by row 1.
+  const uint64x2_t row0 = vld1q_u64((uint64_t *)src);
+  const uint64x2_t both = vld1q_lane_u64((uint64_t *)next_row, row0, 1);
+  return vreinterpretq_u8_u64(both);
+#else
+  // Two separate 8-byte loads, joined as low and high halves.
+  return vcombine_u8(vld1_u8(src), vld1_u8(next_row));
+#endif
+}
+
+// Loads one 16-byte row; byte_stride is accepted but deliberately unused.
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+                                                 const int byte_stride) {
+  return (void)byte_stride, vld1q_u8(src);
+}
+
+// Loads five 4x4 tiles, starting at src+1, src+stride and src+offsets[0..2].
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_4x4_to_1_reg(src + 1, stride);
+  level[1] = load_8bit_4x4_to_1_reg(src + stride, stride);
+  for (int n = 0; n < 3; ++n)
+    level[2 + n] = load_8bit_4x4_to_1_reg(src + offsets[n], stride);
+}
+
+// Loads five 8x2 tiles, starting at src+1, src+stride and src+offsets[0..2].
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_8x2_to_1_reg(src + 1, stride);
+  level[1] = load_8bit_8x2_to_1_reg(src + stride, stride);
+  for (int n = 0; n < 3; ++n)
+    level[2 + n] = load_8bit_8x2_to_1_reg(src + offsets[n], stride);
+}
+
+// Loads five 16x1 rows, starting at src+1, src+stride and src+offsets[0..2].
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+                                      const int stride,
+                                      const ptrdiff_t *const offsets,
+                                      uint8x16_t *const level) {
+  level[0] = load_8bit_16x1_to_1_reg(src + 1, stride);
+  level[1] = load_8bit_16x1_to_1_reg(src + stride, stride);
+  for (int n = 0; n < 3; ++n)
+    level[2 + n] = load_8bit_16x1_to_1_reg(src + offsets[n], stride);
+}
+
+// Returns, per lane, min(4, (s + 1) >> 1) where s is the sum of the five
+// level[] vectors after each has been clamped to 3.
+// Side effect: level[1..4] are overwritten with their clamped values;
+// level[0] is only read.
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+  const uint8x16_t clamp_hi = vdupq_n_u8(3);
+  const uint8x16_t cap = vdupq_n_u8(4);
+
+  // Each term is at most 3, so the 8-bit sum (<= 15) cannot wrap.
+  uint8x16_t total = vminq_u8(level[0], clamp_hi);
+  for (int n = 1; n < 5; ++n) {
+    level[n] = vminq_u8(level[n], clamp_hi);
+    total = vaddq_u8(total, level[n]);
+  }
+
+  // vrshrq_n_u8(x, 1) is the rounding halve (x + 1) >> 1.
+  const uint8x16_t halved = vrshrq_n_u8(total, 1);
+  return vminq_u8(halved, cap);
+}
+
+// Writes one context byte per coefficient position for rows of 4 entries:
+// the value from get_coeff_contexts_kernel() plus a position offset.
+// Sixteen positions (four rows) are produced per loop iteration.
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *const coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  uint8x16_t level[5];
+
+  assert(!(width % 4));
+
+  // Only the first 16 positions use a table (chosen by width == 4 or not);
+  // every later group adds a flat offset of 21.
+  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_2d[width == 4 ? 0 : 1]);
+  uint8_t *dst = coeff_contexts;
+  int remaining = width;
+
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    vst1q_u8(dst, vaddq_u8(get_coeff_contexts_kernel(level), pos_to_offset));
+    pos_to_offset = vdupq_n_u8(21);
+    levels += 4 * stride;
+    dst += 16;
+    remaining -= 4;
+  } while (remaining);
+
+  // The very first position is forced to context 0.
+  coeff_contexts[0] = 0;
+}
+
+// Writes 16 contexts per iteration for a block whose level rows are
+// 4 + TX_PAD_HOR bytes apart. Every group adds the same offset vector:
+// SIG_COEF_CONTEXTS_2D_X4_051010 replicated into all four 32-bit lanes.
+//
+// width is asserted to be a multiple of 4; the loop body runs at least once.
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int width,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t ctx_base =
+      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+  uint8x16_t neighbours[5];
+
+  assert(!(width % 4));
+
+  int remaining = width;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, neighbours);
+    const uint8x16_t ctx =
+        vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base);
+    vst1q_u8(coeff_contexts, ctx);
+
+    coeff_contexts += 16;
+    levels += 4 * stride;
+    remaining -= 4;
+  } while (remaining != 0);
+}
+
+// Writes 16 contexts per iteration for a block whose level rows are
+// 4 + TX_PAD_HOR bytes apart. The first group of 16 adds the offset vector
+// loaded from c_4_po_hor; every later group adds SIG_COEF_CONTEXTS_2D + 10.
+//
+// width is asserted to be a multiple of 4; the loop body runs at least once.
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int width,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t ctx_base_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+  uint8x16_t ctx_base = vld1q_u8(c_4_po_hor);
+  uint8x16_t neighbours[5];
+
+  assert(!(width % 4));
+
+  int remaining = width;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, neighbours);
+    const uint8x16_t ctx =
+        vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base);
+    vst1q_u8(coeff_contexts, ctx);
+
+    ctx_base = ctx_base_rest;
+    coeff_contexts += 16;
+    levels += 4 * stride;
+    remaining -= 4;
+  } while (remaining != 0);
+}
+
+// Writes 16 contexts per iteration for a block whose level rows are
+// 8 + TX_PAD_HOR bytes apart. The first two groups of 16 add offset vectors
+// from a table chosen by width (c_8_po_2d_8 when width == 8, c_8_po_2d_l when
+// width < 8, c_8_po_2d_g otherwise); every later group adds the constant 21.
+// Finally coeff_contexts[0] is forced to 0.
+//
+// width is asserted to be a multiple of 2; the loop body runs at least once.
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int width,
+                                           const ptrdiff_t *const offsets,
+                                           uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t ctx_base_rest = vdupq_n_u8(21);
+  uint8x16_t ctx_base_cur;
+  uint8x16_t ctx_base_next;
+  uint8x16_t neighbours[5];
+  uint8_t *dst = coeff_contexts;
+
+  assert(!(width % 2));
+
+  if (width < 8) {
+    ctx_base_cur = vld1q_u8(c_8_po_2d_l[0]);
+    ctx_base_next = vld1q_u8(c_8_po_2d_l[1]);
+  } else if (width > 8) {
+    ctx_base_cur = vld1q_u8(c_8_po_2d_g[0]);
+    ctx_base_next = vld1q_u8(c_8_po_2d_g[1]);
+  } else {
+    ctx_base_cur = vld1q_u8(c_8_po_2d_8[0]);
+    ctx_base_next = vld1q_u8(c_8_po_2d_8[1]);
+  }
+
+  int remaining = width;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, neighbours);
+    const uint8x16_t ctx =
+        vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base_cur);
+    vst1q_u8(dst, ctx);
+
+    // Shift the offset pipeline by one group.
+    ctx_base_cur = ctx_base_next;
+    ctx_base_next = ctx_base_rest;
+    dst += 16;
+    levels += 2 * stride;
+    remaining -= 2;
+  } while (remaining != 0);
+
+  coeff_contexts[0] = 0;
+}
+
+// Writes 16 contexts per iteration for a block whose level rows are
+// 8 + TX_PAD_HOR bytes apart. Every group adds the same offset vector, loaded
+// once from c_8_po_ver.
+//
+// width is asserted to be a multiple of 2; the loop body runs at least once.
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t ctx_base = vld1q_u8(c_8_po_ver);
+  uint8x16_t neighbours[5];
+
+  assert(!(width % 2));
+
+  int remaining = width;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, neighbours);
+    const uint8x16_t ctx =
+        vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base);
+    vst1q_u8(coeff_contexts, ctx);
+
+    coeff_contexts += 16;
+    levels += 2 * stride;
+    remaining -= 2;
+  } while (remaining != 0);
+}
+
+// Writes 16 contexts per iteration for a block whose level rows are
+// 8 + TX_PAD_HOR bytes apart. The first group of 16 adds
+// SIG_COEF_CONTEXTS_2D + 0 to its low eight bytes and SIG_COEF_CONTEXTS_2D + 5
+// to its high eight bytes; every later group adds SIG_COEF_CONTEXTS_2D + 10.
+//
+// width is asserted to be a multiple of 2; the loop body runs at least once.
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t ctx_base_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+  const uint8x8_t base_lo = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+  const uint8x8_t base_hi = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+  uint8x16_t ctx_base = vcombine_u8(base_lo, base_hi);
+  uint8x16_t neighbours[5];
+
+  assert(!(width % 2));
+
+  int remaining = width;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, neighbours);
+    const uint8x16_t ctx =
+        vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base);
+    vst1q_u8(coeff_contexts, ctx);
+
+    ctx_base = ctx_base_rest;
+    coeff_contexts += 16;
+    levels += 2 * stride;
+    remaining -= 2;
+  } while (remaining != 0);
+}
+
+// Writes contexts for a block whose level rows are height + TX_PAD_HOR bytes
+// apart, 16 bytes at a time: the outer loop runs `width` times and the inner
+// loop covers `height` bytes in steps of 16 (height is asserted to be a
+// multiple of 16).
+//
+// Two small pipelines of offset vectors are kept:
+//  - pos_to_offset[0..4]: offset for the FIRST 16-byte group of the current
+//    and the next four outer iterations;
+//  - pos_to_offset_large[0..2]: offset for every LATER 16-byte group of the
+//    current and the next two outer iterations.
+// Both are shifted by one entry after each outer iteration. Which tables seed
+// them depends on how real_width compares with real_height.
+//
+// coeff_contexts[0] is forced to 0 at the end.
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ uint8_t *cc = coeff_contexts;
+ int col = width;
+ uint8x16_t pos_to_offset[5];
+ uint8x16_t pos_to_offset_large[3];
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ // Steady-state offset; it is never overwritten and is what both pipelines
+ // eventually drain to.
+ pos_to_offset_large[2] = vdupq_n_u8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width < real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+ vld1q_u8(c_16_po_2d_g[2]);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width > real_height
+ pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
+ }
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(cc, count);
+ levels += 16;
+ cc += 16;
+ h -= 16;
+ // After the first 16 bytes, switch to the "later group" offset.
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (h);
+
+ // Shift both pipelines by one entry for the next outer iteration.
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ // Step over the padding that separates consecutive level rows.
+ levels += TX_PAD_HOR;
+ } while (--col);
+
+ coeff_contexts[0] = 0;
+}
+
+// Writes contexts for a block whose level rows are height + TX_PAD_HOR bytes
+// apart, 16 bytes at a time. In every outer iteration the first 16-byte group
+// adds the offset vector from c_16_po_ver and all later groups add
+// SIG_COEF_CONTEXTS_2D + 10.
+//
+// height is asserted to be a multiple of 16; both loops run at least once.
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = height + TX_PAD_HOR;
+  const uint8x16_t ctx_base_first = vld1q_u8(c_16_po_ver);
+  const uint8x16_t ctx_base_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+  uint8x16_t neighbours[5];
+
+  assert(!(height % 16));
+
+  int cols_left = width;
+  do {
+    uint8x16_t ctx_base = ctx_base_first;
+    int rows_left = height;
+
+    do {
+      load_levels_16x1x5(levels, stride, offsets, neighbours);
+      const uint8x16_t ctx =
+          vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base);
+      vst1q_u8(coeff_contexts, ctx);
+
+      ctx_base = ctx_base_rest;
+      coeff_contexts += 16;
+      levels += 16;
+      rows_left -= 16;
+    } while (rows_left != 0);
+
+    // Step over the padding that separates consecutive level rows.
+    levels += TX_PAD_HOR;
+    --cols_left;
+  } while (cols_left != 0);
+}
+
+// Writes contexts for a block whose level rows are height + TX_PAD_HOR bytes
+// apart, 16 bytes at a time. All groups in one outer iteration add the same
+// constant: SIG_COEF_CONTEXTS_2D + 0 in the first outer iteration,
+// SIG_COEF_CONTEXTS_2D + 5 in the second and SIG_COEF_CONTEXTS_2D + 10 from
+// the third onwards.
+//
+// height is asserted to be a multiple of 16; both loops run at least once.
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = height + TX_PAD_HOR;
+  const uint8x16_t ctx_base_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+  uint8x16_t ctx_base_cur = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+  uint8x16_t ctx_base_next = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+  uint8x16_t neighbours[5];
+
+  assert(!(height % 16));
+
+  int cols_left = width;
+  do {
+    int rows_left = height;
+
+    do {
+      load_levels_16x1x5(levels, stride, offsets, neighbours);
+      const uint8x16_t ctx =
+          vaddq_u8(get_coeff_contexts_kernel(neighbours), ctx_base_cur);
+      vst1q_u8(coeff_contexts, ctx);
+
+      coeff_contexts += 16;
+      levels += 16;
+      rows_left -= 16;
+    } while (rows_left != 0);
+
+    // Shift the offset pipeline by one entry.
+    ctx_base_cur = ctx_base_next;
+    ctx_base_next = ctx_base_rest;
+    // Step over the padding that separates consecutive level rows.
+    levels += TX_PAD_HOR;
+    --cols_left;
+  } while (cols_left != 0);
+}
+
+// Computes the contexts for all positions of a transform block and stores
+// them, one byte each, in coeff_contexts.
+//
+// When eob == 1 only coeff_contexts[0] is written (with 0). Otherwise a
+// size/class specific helper fills the whole buffer, after which the entry at
+// scan[eob - 1] is overwritten with 1, 2 or 3 depending on how eob - 1
+// compares with 1/8 and 1/4 of (width << bhl).
+//
+// coeff_contexts must be 16-byte aligned (asserted below).
+// NOTE(review): eob == 0 is not handled; it would reach scan[-1]. Presumably
+// callers guarantee eob >= 1 -- confirm.
+//
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ // The helpers work on unsigned bytes; this is the same buffer.
+ uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = height + TX_PAD_HOR;
+ // Offsets (relative to the current position in levels) of three of the five
+ // neighbours; the load_levels_* helpers add the two fixed ones (+1, +stride).
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (height == 4) {
+ get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coefficients);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (height == 4) {
+ get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (height == 4) {
+ get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+ }
+ }
+
+ // Overwrite the context of the last coefficient in scan order.
+ const int bhl = get_txb_bhl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (width << bhl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (width << bhl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
new file mode 100644
index 0000000000..aa64a38902
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -0,0 +1,2619 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+// Transposes `in` into `out` one 4x4 tile at a time.
+//
+// Unlike the transposes in transpose_neon.h, only the low 64x32 sub-matrix is
+// written, because the row transform that follows never reads the rest.
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+                                                        int32x4_t *out) {
+  for (int tile_a = 0; tile_a < 8; ++tile_a) {
+    for (int tile_b = 0; tile_b < 16; ++tile_b) {
+      const int32x4_t *src = in + 64 * tile_b + 4 * tile_a;
+      int32x4_t *dst = out + 64 * tile_a + 4 * tile_b;
+      transpose_arrays_s32_4x4(src, dst);
+    }
+  }
+}
+
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0321_neon
+// ^ Weights are applied as indices 0, 3, 2, 1
+// (see more detail below)
+//
+// Weight indices are treated as an index into the 4-tuple of the weight
+// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+// in0 in1
+// /------------
+// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1]
+// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0321 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1)
+// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0)
+
+// Computes one output of a butterfly:
+//   *out = vrshlq_s32(in0 * W[lane0] + in1 * W[lane1], v_bit)
+// where W is the 4-entry tuple { wvec[0], wvec[1], -wvec[0], -wvec[1] }.
+// lane0 and lane1 must be compile-time constants in [0, 3]: they select both
+// the half of W (lane / 2) and the lane inside it (lane % 2).
+// The macro declares locals named `wvecs` and `x`, so arguments must not use
+// those names.
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \
+ do { \
+ int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \
+ int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+ x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \
+ *out = vrshlq_s32(x, v_bit); \
+ } while (false)
+
+// With (w0, w1) = the two consecutive entries at cospi[2 * widx0]:
+//   *out0 = n0 * w0 + n1 * w1
+//   *out1 = n0 * w1 - n1 * w0
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * widx0]);
+
+  butterfly_half_neon(weight_pair, 0, 1, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 1, 2, n0, n1, out1, v_bit);
+}
+
+// With (w0, w1) = the two consecutive entries at cospi[2 * widx0]:
+//   *out0 = -n0 * w0 - n1 * w1
+//   *out1 =  n0 * w1 - n1 * w0
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * widx0]);
+
+  butterfly_half_neon(weight_pair, 2, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 1, 2, n0, n1, out1, v_bit);
+}
+
+// With (w0, w1) = the two consecutive entries at cospi[2 * widx0]:
+//   *out0 =  n0 * w0 - n1 * w1
+//   *out1 = -n0 * w1 - n1 * w0
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * widx0]);
+
+  butterfly_half_neon(weight_pair, 0, 3, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 3, 2, n0, n1, out1, v_bit);
+}
+
+// With (w0, w1) = the two consecutive entries at cospi[2 * widx0]:
+//   *out0 =  n0 * w0 + n1 * w1
+//   *out1 = -n0 * w1 + n1 * w0
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_0130_neon(
+    const int32_t *cospi, const int widx0, const int32x4_t n0,
+    const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+    const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * widx0]);
+
+  butterfly_half_neon(weight_pair, 0, 1, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 3, 0, n0, n1, out1, v_bit);
+}
+
+// With w = cospi[2 * 32] (only the first entry of the loaded pair is used):
+//   *out0 = (n0 + n1) * w
+//   *out1 = (n0 - n1) * w
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
+    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * 32]);
+
+  butterfly_half_neon(weight_pair, 0, 0, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 0, 2, n0, n1, out1, v_bit);
+}
+
+// With w = cospi[2 * 32] (only the first entry of the loaded pair is used):
+//   *out0 =  (n0 - n1) * w
+//   *out1 = -(n0 + n1) * w
+// Each result is then shifted with vrshlq_s32(..., v_bit).
+static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
+    const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+    int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+  const int32x2_t weight_pair = vld1_s32(&cospi[2 * 32]);
+
+  butterfly_half_neon(weight_pair, 0, 2, n0, n1, out0, v_bit);
+  butterfly_half_neon(weight_pair, 2, 2, n0, n1, out1, v_bit);
+}
+
+// output[i] = round_shift(input[i] * NewSqrt2, NewSqrt2Bits) for i in
+// [0, size). The first element is always processed, even when size <= 0.
+static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
+                                                       int32x4_t *output,
+                                                       const int size) {
+  const int32x4_t scale = vdupq_n_s32(NewSqrt2);
+  int idx = 0;
+  do {
+    output[idx] = vrshrq_n_s32(vmulq_s32(input[idx], scale), NewSqrt2Bits);
+    ++idx;
+  } while (idx < size);
+}
+
+// For i in [0, size): rounding-shift input[i] right by 2, multiply by
+// NewSqrt2, then rounding-shift right by NewSqrt2Bits. The first element is
+// always processed, even when size <= 0.
+static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
+    const int32x4_t *input, int32x4_t *output, const int size) {
+  const int32x4_t scale = vdupq_n_s32(NewSqrt2);
+  int idx = 0;
+  do {
+    const int32x4_t pre_shifted = vrshrq_n_s32(input[idx], 2);
+    output[idx] = vrshrq_n_s32(vmulq_s32(pre_shifted, scale), NewSqrt2Bits);
+    ++idx;
+  } while (idx < size);
+}
+
+// Defines load_buffer_4x<h>(input, in, stride, fliplr): reads h rows of four
+// int16 values (row i at input + i * stride), widens them to int32 while
+// shifting left by 2, and stores row i in in[i]. When fliplr is non-zero the
+// four values of every row are reversed before widening.
+#define LOAD_BUFFER_4XH(h) \
+ static AOM_FORCE_INLINE void load_buffer_4x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ a = vrev64_s16(a); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } \
+ }
+
+// Widens the int16x4_t `a` to int32x4_t and shifts it left by `shift`.
+// A shift of zero is routed to vmovl_s16 (plain widening).
+//
+// AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to
+// avoid the expression even though the compiler can prove that the code path
+// is never taken if `shift == 0`.
+#define shift_left_long_s16(a, shift) \
+ ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
+
+// Defines load_buffer_<w>x<h>(input, in, stride, fliplr) for w >= 8.
+// Every row is read eight int16 values at a time, widened to int32 and
+// shifted left by `shift`. The output is stored as groups of four columns:
+// the vector for row i and column group g goes to in[i + h * g].
+// When fliplr is non-zero each row is mirrored: the eight values are reversed
+// and written to the mirrored pair of column groups.
+#define LOAD_BUFFER_WXH(w, h, shift) \
+ static AOM_FORCE_INLINE void load_buffer_##w##x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ assert(w >= 8); \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ a = vrev64q_s16(a); \
+ int j2 = (w) / 8 - j - 1; \
+ in[i + (h) * (2 * j2 + 0)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ in[i + (h) * (2 * j2 + 1)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ } \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ in[i + (h) * (2 * j + 0)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ in[i + (h) * (2 * j + 1)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ } \
+ } \
+ } \
+ }
+
+// Loader instantiations. For LOAD_BUFFER_WXH the third argument is the left
+// shift applied while widening; the 4-wide loaders always shift by 2.
+LOAD_BUFFER_4XH(4)
+LOAD_BUFFER_4XH(8)
+LOAD_BUFFER_4XH(16)
+LOAD_BUFFER_4XH(32)
+LOAD_BUFFER_WXH(8, 8, 2)
+LOAD_BUFFER_WXH(16, 16, 2)
+LOAD_BUFFER_WXH(32, 64, 0)
+LOAD_BUFFER_WXH(64, 32, 2)
+LOAD_BUFFER_WXH(64, 64, 0)
+
+// These two sizes are only compiled when the build is not realtime-only.
+#if !CONFIG_REALTIME_ONLY
+LOAD_BUFFER_WXH(16, 64, 0)
+LOAD_BUFFER_WXH(64, 16, 2)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Defines store_buffer_<w>x<h>(in, out, stride): for i in [0, w) and j in
+// [0, h / 4), writes the vector in[i + j * w] to the four int32 values
+// starting at out[i * stride + j * 4].
+#define STORE_BUFFER_WXH(w, h) \
+ static AOM_FORCE_INLINE void store_buffer_##w##x##h( \
+ const int32x4_t *in, int32_t *out, int stride) { \
+ for (int i = 0; i < (w); ++i) { \
+ for (int j = 0; j < (h) / 4; ++j) { \
+ vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
+ } \
+ } \
+ }
+
+// Store-helper instantiations, one per (w, h) pair.
+STORE_BUFFER_WXH(4, 4)
+STORE_BUFFER_WXH(8, 4)
+STORE_BUFFER_WXH(8, 8)
+STORE_BUFFER_WXH(16, 4)
+STORE_BUFFER_WXH(16, 16)
+STORE_BUFFER_WXH(32, 4)
+STORE_BUFFER_WXH(32, 32)
+STORE_BUFFER_WXH(64, 32)
+
+// These two sizes are only compiled when the build is not realtime-only.
+#if !CONFIG_REALTIME_ONLY
+STORE_BUFFER_WXH(16, 32)
+STORE_BUFFER_WXH(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 4-point forward DCT applied to the four lanes of in[0..3] at once.
+// With w32 = cospi[2 * 32] and (wa, wb) = the pair loaded at &cospi[2 * 16]:
+//   out[0] = ((in0 + in3) + (in1 + in2)) * w32
+//   out[2] = ((in0 + in3) - (in1 + in2)) * w32
+//   out[1] = (in1 - in2) * wb + (in0 - in3) * wa
+//   out[3] = (in0 - in3) * wb - (in1 - in2) * wa
+// each rounding-shifted right by `bit`. All inputs are read before any output
+// is written, so `out` may be the same array as `in`.
+static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
+ const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t a1 = vsubq_s32(in[0], in[3]);
+ const int32x4_t a2 = vaddq_s32(in[1], in[2]);
+ const int32x4_t a3 = vsubq_s32(in[1], in[2]);
+
+ const int32x4_t b0 = vmulq_s32(a0, cospi32);
+ const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
+ const int32x4_t b2 = vmulq_s32(a2, cospi32);
+ const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vsubq_s32(b0, b2);
+ const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
+ const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);
+
+ // A negative count makes vrshlq_s32 a rounding right shift by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t d0 = vrshlq_s32(c0, v_bit);
+ const int32x4_t d1 = vrshlq_s32(c1, v_bit);
+ const int32x4_t d2 = vrshlq_s32(c2, v_bit);
+ const int32x4_t d3 = vrshlq_s32(c3, v_bit);
+
+ // Note the interleaved output order: d0, d2, d1, d3.
+ out[0] = d0;
+ out[1] = d2;
+ out[2] = d1;
+ out[3] = d3;
+}
+
+// 4-point forward ADST applied to the four lanes of in[0..3] at once.
+// The four weights are loaded from sinpi_arr(bit)[1..4]; below they are
+// addressed as low lane 0/1 and high lane 0/1 of `sinpi`. Every output is
+// rounding-shifted right by `bit`. All inputs are read before any output is
+// written, so `out` may be the same array as `in`.
+static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[1]);
+ const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
+ const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
+ const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);
+
+ const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
+ const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
+ const int32x4_t b2 = vsubq_s32(a0, in[3]);
+
+ const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
+ const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
+ const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);
+
+ const int32x4_t d0 = vaddq_s32(c0, a3);
+ const int32x4_t d1 = vsubq_s32(c1, a3);
+ const int32x4_t d2 = vsubq_s32(c1, c0);
+
+ const int32x4_t e0 = vaddq_s32(d2, a3);
+
+ // A negative count makes vrshlq_s32 a rounding right shift by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ out[0] = vrshlq_s32(d0, v_bit);
+ out[1] = vrshlq_s32(c2, v_bit);
+ out[2] = vrshlq_s32(d1, v_bit);
+ out[3] = vrshlq_s32(e0, v_bit);
+}
+
+// 4-point identity transform: every element of in[0..3] is multiplied by
+// NewSqrt2 and rounding-shifted right by NewSqrt2Bits. `bit` is ignored; it is
+// only present so the signature matches the other 4-point transforms.
+static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
+                                                       int32x4_t *out,
+                                                       int bit) {
+  (void)bit;
+  const int32x4_t scale = vdupq_n_s32(NewSqrt2);
+
+  for (int row = 0; row < 4; ++row) {
+    out[row] = vrshrq_n_s32(vmulq_s32(in[row], scale), NewSqrt2Bits);
+  }
+}
+
+// Forward 2D transform of a 4x4 block: reads int16 samples from `input`
+// (rows input_stride apart) and writes 16 int32 coefficients to `coeff`.
+//
+// Every transform type follows the same pipeline:
+//   load (optionally mirrored left/right) -> first 1D pass -> 4x4 transpose
+//   -> second 1D pass -> store with a stride of 4.
+// An up/down flip is applied beforehand through ud_adjust_input_and_stride().
+//
+// The first pass always takes av1_fwd_cos_bit_col[0][0] and the second pass
+// av1_fwd_cos_bit_row[0][0]. The identity pass ignores its bit argument, but
+// it is passed the matching table entry anyway so that all cases read alike.
+//
+// `bd` is unused. An unknown tx_type trips assert(0).
+void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
+                             int input_stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);
+
+  // Workspace for column/row-wise transforms.
+  int32x4_t buf[4];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case ADST_ADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, buf, input_stride, 0);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, buf, input_stride, 1);
+      highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+      transpose_arrays_s32_4x4(buf, buf);
+      highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+      store_buffer_4x4(buf, coeff, /*stride=*/4);
+      break;
+    default: assert(0);
+  }
+}
+
+// Butterfly pre-processing: the first half of the output holds the sums of
+// mirrored input pairs, the second half holds their differences.
+// e.g. n=4:
+//   out[0] = in[0] + in[3]
+//   out[1] = in[1] + in[2]
+//   out[2] = in[1] - in[2]
+//   out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
+                                               int32x4_t *output, int n) {
+  const int half = n / 2;
+
+  // Sums: walk inwards from both ends.
+  for (int k = 0; k < half; ++k) {
+    output[k] = vaddq_s32(input[k], input[n - 1 - k]);
+  }
+  // Differences: walk outwards from the middle.
+  for (int k = 0; k < half; ++k) {
+    output[half + k] = vsubq_s32(input[half - 1 - k], input[half + k]);
+  }
+}
+
+// Butterfly post-processing: combines in0 and in1 in four runs of n / 4
+// elements each (add, subtract, subtract, add).
+// e.g. n=8:
+//   out[0] = in0[0] + in1[3];
+//   out[1] = in0[1] + in1[2];
+//   out[2] = in0[1] - in1[2];
+//   out[3] = in0[0] - in1[3];
+//   out[4] = in0[7] - in1[4];
+//   out[5] = in0[6] - in1[5];
+//   out[6] = in0[6] + in1[5];
+//   out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
+                                                const int32x4_t *in1,
+                                                int32x4_t *output, int n) {
+  const int quarter = n / 4;
+  const int half = n / 2;
+  const int three_quarters = (3 * n) / 4;
+
+  for (int k = 0; k < quarter; ++k) {
+    output[k] = vaddq_s32(in0[k], in1[half - 1 - k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[quarter + k] = vsubq_s32(in0[quarter - 1 - k], in1[quarter + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[half + k] = vsubq_s32(in0[n - 1 - k], in1[half + k]);
+  }
+  for (int k = 0; k < quarter; ++k) {
+    output[three_quarters + k] =
+        vaddq_s32(in0[three_quarters + k], in1[three_quarters - 1 - k]);
+  }
+}
+
+// 8-point forward DCT applied to the four lanes of in[0..7] at once.
+// Butterfly outputs are rounding-shifted right by `bit`. All eight inputs are
+// consumed in stage 1 before anything is written to `out`, so `out` may be
+// the same array as `in`.
+static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ // A negative count makes vrshlq_s32 a rounding right shift by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ // stage 1
+ int32x4_t a[8];
+ butterfly_dct_pre(in, a, 8);
+
+ // stage 2
+ // Only b[0..3], b[5] and b[6] are written; b[4] and b[7] are never read
+ // because stage 3 takes the outer pair from a[4] and a[7].
+ int32x4_t b[8];
+ butterfly_dct_pre(a, b, 4);
+ butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);
+
+ // stage 3
+ int32x4_t c[8];
+ butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
+ butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
+ butterfly_dct_post(a + 4, b + 4, c + 4, 4);
+
+ // stage 4-5
+ // The odd outputs are written directly in their final positions.
+ butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
+ butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);
+
+ // The even outputs come from stage 3: c[0], c[2], c[1], c[3].
+ out[0] = c[0];
+ out[2] = c[2];
+ out[4] = c[1];
+ out[6] = c[3];
+}
+
+// 8-point forward ADST applied to the four lanes of in[0..7] at once.
+// The computation ping-pongs between the u* and v* register sets, one set per
+// stage. Butterfly outputs are rounding-shifted right by `bit`. All eight
+// inputs are copied into u0..u7 before anything is written to `out`, so `out`
+// may be the same array as `in`.
+static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ // A negative count makes vrshlq_s32 a rounding right shift by `bit`.
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+ int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+
+ // stage 0-1
+ // Input permutation.
+ u0 = in[0];
+ u1 = in[7];
+ u2 = in[3];
+ u3 = in[4];
+ u4 = in[1];
+ u5 = in[6];
+ u6 = in[2];
+ u7 = in[5];
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+ butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
+ v4 = u4;
+ v5 = u5;
+ // Note the swapped output order: out0 goes to v7, out1 to v6.
+ butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);
+
+ // stage 3
+ u0 = vaddq_s32(v0, v2);
+ u1 = vsubq_s32(v3, v1);
+ u2 = vsubq_s32(v0, v2);
+ u3 = vaddq_s32(v1, v3);
+ u4 = vsubq_s32(v6, v4);
+ u5 = vaddq_s32(v5, v7);
+ u6 = vaddq_s32(v4, v6);
+ u7 = vsubq_s32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);
+
+ // stage 5
+ u0 = vaddq_s32(v0, v4);
+ u1 = vaddq_s32(v1, v5);
+ u2 = vaddq_s32(v2, v6);
+ u3 = vsubq_s32(v7, v3);
+ u4 = vsubq_s32(v0, v4);
+ u5 = vsubq_s32(v1, v5);
+ u6 = vsubq_s32(v2, v6);
+ u7 = vaddq_s32(v3, v7);
+
+ // stage 6
+ butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
+ butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
+ butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);
+
+ // stage 7
+ // Output permutation.
+ out[0] = v1;
+ out[1] = v6;
+ out[2] = v3;
+ out[3] = v4;
+ out[4] = v5;
+ out[5] = v2;
+ out[6] = v7;
+ out[7] = v0;
+}
+
+// 8-point identity transform: every element of in[0..7] is doubled (shifted
+// left by one). `bit` is ignored; it is only present so the signature matches
+// the other 8-point transforms.
+static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
+                                                       int32x4_t *out,
+                                                       int bit) {
+  (void)bit;
+  for (int row = 0; row < 8; ++row) {
+    out[row] = vshlq_n_s32(in[row], 1);
+  }
+}
+
+// Runs highbd_fdct8_x4_neon() on `howmany` consecutive groups of eight
+// vectors. The first group is always processed, even when howmany <= 0.
+static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
+                                                  int32x4_t *out, int bit,
+                                                  int howmany) {
+  const int32x4_t *src = in;
+  int32x4_t *dst = out;
+  int done = 0;
+  do {
+    highbd_fdct8_x4_neon(src, dst, bit);
+    src += 8;
+    dst += 8;
+    ++done;
+  } while (done < howmany);
+}
+
+// Runs highbd_fadst8_x4_neon() on `howmany` consecutive groups of eight
+// vectors. The first group is always processed, even when howmany <= 0.
+static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
+                                                   int32x4_t *out, int bit,
+                                                   int howmany) {
+  const int32x4_t *src = in;
+  int32x4_t *dst = out;
+  int done = 0;
+  do {
+    highbd_fadst8_x4_neon(src, dst, bit);
+    src += 8;
+    dst += 8;
+    ++done;
+  } while (done < howmany);
+}
+
+// Runs highbd_fidentity8_x4_neon() on `howmany` consecutive groups of eight
+// vectors. The first group is always processed, even when howmany <= 0.
+//
+// `bit` is simply forwarded to the per-group helper; no unused-parameter
+// cast is needed here because the parameter is in fact used.
+static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
+                                                       int32x4_t *out, int bit,
+                                                       int howmany) {
+  const int stride = 8;
+  int i = 0;
+  do {
+    highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
+  } while (++i < howmany);
+}
+
+// Forward 2D transform of one 8x8 block (high-bitdepth NEON path).
+//
+//   input   - int16_t samples, row pitch `stride` (in elements)
+//   coeff   - receives the result via store_buffer_8x8() with a pitch of 8
+//   tx_type - selects the column (vertical) and row (horizontal) 1D kernels
+//   bd      - unused
+//
+// Every case runs the same pipeline: load_buffer_8x8 -> column kernel ->
+// shift_right_1_round_s32_x4 -> transpose_arrays_s32_8x8 -> row kernel ->
+// store_buffer_8x8. Up/down flipping is delegated to
+// ud_adjust_input_and_stride(); the last argument of load_buffer_8x8 is 1
+// only for the types whose horizontal component is FLIPADST (presumably the
+// left/right flip flag -- confirm against load_buffer_8x8).
+//
+// Types whose column kernel, row kernel and load flag coincide share a single
+// case body. An unknown tx_type trips the assert and leaves `coeff` untouched.
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  (void)bd;
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column kernels take their cosine precision from av1_fwd_cos_bit_col and
+  // row kernels from av1_fwd_cos_bit_row, for every transform type (the same
+  // convention av1_fwd_txfm2d_16x16_neon follows).
+  const int col_bit = av1_fwd_cos_bit_col[1][1];
+  const int row_bit = av1_fwd_cos_bit_row[1][1];
+
+  // Workspaces for column/row-wise transforms.
+  int32x4_t buf0[16], buf1[16];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case ADST_DCT:
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case ADST_ADST:
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fdct8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fadst8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fdct8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fdct8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case V_ADST:
+    case V_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fadst8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fidentity8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, buf0, stride, 0);
+      highbd_fidentity8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, buf0, stride, 1);
+      highbd_fidentity8_xn_neon(buf0, buf0, col_bit, 2);
+      shift_right_1_round_s32_x4(buf0, buf0, 16);
+      transpose_arrays_s32_8x8(buf0, buf1);
+      highbd_fadst8_xn_neon(buf1, buf1, row_bit, 2);
+      store_buffer_8x8(buf1, coeff, /*stride=*/8);
+      break;
+    default: assert(0);
+  }
+}
+/* 16-point forward DCT over arrays of 16 int32x4_t vectors (presumably one
+ * independent transform per 32-bit lane, hence "_x4" -- confirm).
+ *
+ *   in  - 16 input vectors, read only.
+ *   out - 16 output vectors; out[k] receives the final-stage value
+ *         v[bitrev4(k)], i.e. the results are stored in 4-bit bit-reversed
+ *         order.
+ *   bit - precision selector: used to fetch the cosine table through
+ *         cospi_arr_s32() and, negated, handed to every rotation helper.
+ *
+ * The stages ping-pong between the local arrays u[] and v[]; each stage
+ * reads the array the previous stage wrote. Statement order within a stage
+ * matters only through that data flow.
+ */
+static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);  // -bit in every lane; presumably a shift count for the helpers -- TODO confirm
+
+ int32x4_t u[16], v[16];
+
+ // stage 1: size-16 butterfly_dct_pre, in[] -> u[]
+ butterfly_dct_pre(in, u, 16);
+
+ // stage 2: size-8 butterfly_dct_pre on u[] -> v[]; entries 8, 9, 14, 15
+ // pass through; pairs (13,10) and (12,11) go through cospi32 butterflies
+ butterfly_dct_pre(u, v, 8);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
+ butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3: size-4 butterfly_dct_pre on v[] -> u[]; entries 4 and 7 pass
+ // through; pair (6,5) gets a cospi32 butterfly; size-8 butterfly_dct_post
+ // handles entries starting at index 8
+ butterfly_dct_pre(v, u, 4);
+ u[4] = v[4];
+ butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
+ u[7] = v[7];
+ butterfly_dct_post(v + 8, v + 8, u + 8, 8);
+
+ // stage 4: rotations on pairs (0,1) and (3,2); size-4 butterfly_dct_post
+ // from index 4; cospi index 16 rotations on (14,9) and (13,10); entries
+ // 8, 11, 12, 15 pass through
+ butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
+ butterfly_dct_post(u + 4, u + 4, v + 4, 4);
+ v[8] = u[8];
+ butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
+ butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5: entries 0..3 pass through; cospi index 8 / 24 rotations on
+ // (7,4) and (5,6); two size-4 butterfly_dct_post calls cover 8..15
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
+ butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
+ butterfly_dct_post(v + 8, v + 8, u + 8, 4);
+ butterfly_dct_post(v + 12, v + 12, u + 12, 4);
+
+ // stage 6: entries 0..7 are copied unchanged; entries 8..15 receive their
+ // last rotations (cospi indices 4, 28, 20, 12)
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
+ butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
+ butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
+ butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);
+
+ out[0] = v[0];  // output reorder: out[k] = v[k with its 4 index bits reversed]
+ out[1] = v[8];
+ out[2] = v[4];
+ out[3] = v[12];
+ out[4] = v[2];
+ out[5] = v[10];
+ out[6] = v[6];
+ out[7] = v[14];
+ out[8] = v[1];
+ out[9] = v[9];
+ out[10] = v[5];
+ out[11] = v[13];
+ out[12] = v[3];
+ out[13] = v[11];
+ out[14] = v[7];
+ out[15] = v[15];
+}
+/* 16-point forward ADST over arrays of 16 int32x4_t vectors (presumably one
+ * independent transform per 32-bit lane, hence "_x4" -- confirm).
+ *
+ *   in  - 16 input vectors, read only; they are permuted on entry.
+ *   out - 16 output vectors, filled from the final-stage array v[] through
+ *         the fixed permutation written out in stage 9.
+ *   bit - precision selector: used to fetch the cosine table through
+ *         cospi_arr_s32() and, negated, handed to every rotation helper.
+ *
+ * Stages alternate between rotation stages (butterfly_* helpers, u[] -> v[])
+ * and add/subtract stages (v[] -> u[]).
+ * NOTE(review): several differences are taken with swapped operands (for
+ * example u[1] = v[3] - v[1]) and some helper calls swap their output
+ * pointers; presumably this folds the sign changes of the reference ADST
+ * into the data flow -- confirm before "tidying" any operand order.
+ */
+static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);  // -bit in every lane; presumably a shift count for the helpers -- TODO confirm
+
+ int32x4_t u[16], v[16];
+
+ // stage 0-1: input permutation, u[] takes in[] in the order
+ // 0, 15, 7, 8, 3, 12, 4, 11, 1, 14, 6, 9, 2, 13, 5, 10
+ u[0] = in[0];
+ u[1] = in[15];
+ u[2] = in[7];
+ u[3] = in[8];
+ u[4] = in[3];
+ u[5] = in[12];
+ u[6] = in[4];
+ u[7] = in[11];
+ u[8] = in[1];
+ u[9] = in[14];
+ u[10] = in[6];
+ u[11] = in[9];
+ u[12] = in[2];
+ u[13] = in[13];
+ u[14] = in[5];
+ u[15] = in[10];
+
+ // stage 2: in every group of four, the first two entries pass through and
+ // the last two go through a cospi32 butterfly
+ v[0] = u[0];
+ v[1] = u[1];
+ butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
+ v[4] = u[4];
+ v[5] = u[5];
+ butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
+ v[12] = u[12];
+ v[13] = u[13];
+ butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);
+
+ // stage 3: sums and differences of entries two apart within each group of
+ // four
+ u[0] = vaddq_s32(v[0], v[2]);
+ u[1] = vsubq_s32(v[3], v[1]);
+ u[2] = vsubq_s32(v[0], v[2]);
+ u[3] = vaddq_s32(v[1], v[3]);
+ u[4] = vsubq_s32(v[6], v[4]);
+ u[5] = vaddq_s32(v[5], v[7]);
+ u[6] = vaddq_s32(v[4], v[6]);
+ u[7] = vsubq_s32(v[5], v[7]);
+ u[8] = vsubq_s32(v[10], v[8]);
+ u[9] = vaddq_s32(v[9], v[11]);
+ u[10] = vaddq_s32(v[8], v[10]);
+ u[11] = vsubq_s32(v[9], v[11]);
+ u[12] = vaddq_s32(v[12], v[14]);
+ u[13] = vsubq_s32(v[15], v[13]);
+ u[14] = vsubq_s32(v[12], v[14]);
+ u[15] = vaddq_s32(v[13], v[15]);
+
+ // stage 4: entries 0..3 and 8..11 pass through; pairs (4,5), (6,7),
+ // (12,13), (14,15) are rotated with cospi index 16
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
+ butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 5: sums and differences of entries four apart within each group
+ // of eight
+ u[0] = vaddq_s32(v[0], v[4]);
+ u[1] = vaddq_s32(v[1], v[5]);
+ u[2] = vaddq_s32(v[2], v[6]);
+ u[3] = vsubq_s32(v[7], v[3]);
+ u[4] = vsubq_s32(v[0], v[4]);
+ u[5] = vsubq_s32(v[1], v[5]);
+ u[6] = vsubq_s32(v[2], v[6]);
+ u[7] = vaddq_s32(v[3], v[7]);
+ u[8] = vaddq_s32(v[8], v[12]);
+ u[9] = vaddq_s32(v[9], v[13]);
+ u[10] = vsubq_s32(v[14], v[10]);
+ u[11] = vaddq_s32(v[11], v[15]);
+ u[12] = vsubq_s32(v[8], v[12]);
+ u[13] = vsubq_s32(v[9], v[13]);
+ u[14] = vaddq_s32(v[10], v[14]);
+ u[15] = vsubq_s32(v[11], v[15]);
+
+ // stage 6: entries 0..7 pass through; entries 8..15 are rotated with cospi
+ // indices 8 and 24
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
+ butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);
+
+ // stage 7: sums and differences of entries eight apart
+ u[0] = vaddq_s32(v[0], v[8]);
+ u[1] = vaddq_s32(v[1], v[9]);
+ u[2] = vaddq_s32(v[2], v[10]);
+ u[3] = vaddq_s32(v[3], v[11]);
+ u[4] = vaddq_s32(v[4], v[12]);
+ u[5] = vaddq_s32(v[5], v[13]);
+ u[6] = vaddq_s32(v[6], v[14]);
+ u[7] = vsubq_s32(v[15], v[7]);
+ u[8] = vsubq_s32(v[0], v[8]);
+ u[9] = vsubq_s32(v[1], v[9]);
+ u[10] = vsubq_s32(v[2], v[10]);
+ u[11] = vsubq_s32(v[3], v[11]);
+ u[12] = vsubq_s32(v[4], v[12]);
+ u[13] = vsubq_s32(v[5], v[13]);
+ u[14] = vsubq_s32(v[6], v[14]);
+ u[15] = vaddq_s32(v[7], v[15]);
+
+ // stage 8: final rotation of every adjacent pair, with cospi indices
+ // 2, 10, 18, 26, 30, 22, 14, 6
+ butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
+ butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
+ butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
+ butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 9: output permutation, out[] takes v[] in the order
+ // 1, 14, 3, 12, 5, 10, 7, 8, 9, 6, 11, 4, 13, 2, 15, 0
+ out[0] = v[1];
+ out[1] = v[14];
+ out[2] = v[3];
+ out[3] = v[12];
+ out[4] = v[5];
+ out[5] = v[10];
+ out[6] = v[7];
+ out[7] = v[8];
+ out[8] = v[9];
+ out[9] = v[6];
+ out[10] = v[11];
+ out[11] = v[4];
+ out[12] = v[13];
+ out[13] = v[2];
+ out[14] = v[15];
+ out[15] = v[0];
+}
+
+// 16-point identity kernel: every lane of each of the 16 input vectors is
+// multiplied by 2 * NewSqrt2, biased by 1 << (NewSqrt2Bits - 1) and shifted
+// right by NewSqrt2Bits. `bit` is unused. Element i is read before element i
+// is written, so `in` and `out` may be the same array.
+static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
+                                       int bit) {
+  (void)bit;
+  const int32x4_t scale = vdupq_n_s32(2 * NewSqrt2);
+  const int32x4_t rounding = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+
+  int idx = 0;
+  while (idx < 16) {
+    const int32x4_t scaled = vmulq_s32(in[idx], scale);
+    out[idx] = vshrq_n_s32(vaddq_s32(scaled, rounding), NewSqrt2Bits);
+    ++idx;
+  }
+}
+
+// Applies the 16-point forward DCT kernel to `howmany` back-to-back groups
+// of sixteen vectors; group k starts at in[16 * k] and is written to
+// out[16 * k]. The kernel always runs at least once.
+static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+                                  const int howmany) {
+  int group = 0;
+  int offset = 0;  // Always equal to group * 16.
+  do {
+    highbd_fdct16_x4_neon(&in[offset], &out[offset], bit);
+    offset += 16;
+  } while (++group < howmany);
+}
+
+// Applies the 16-point forward ADST kernel to `howmany` back-to-back groups
+// of sixteen vectors; group k starts at in[16 * k] and is written to
+// out[16 * k]. The kernel always runs at least once.
+static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+                                   int howmany) {
+  int group = 0;
+  int offset = 0;  // Always equal to group * 16.
+  do {
+    highbd_fadst16_x4_neon(&in[offset], &out[offset], bit);
+    offset += 16;
+  } while (++group < howmany);
+}
+
+// Applies the 16-point identity kernel to `howmany` back-to-back groups of
+// sixteen vectors; group k starts at in[16 * k] and is written to
+// out[16 * k]. The kernel always runs at least once.
+static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
+                                       int bit, int howmany) {
+  int group = 0;
+  int offset = 0;  // Always equal to group * 16.
+  do {
+    highbd_fidentity16_x4_neon(&in[offset], &out[offset], bit);
+    offset += 16;
+  } while (++group < howmany);
+}
+
+// Forward 2D transform of one 16x16 block (high-bitdepth NEON path).
+//
+//   input   - int16_t samples, row pitch `stride` (in elements)
+//   coeff   - receives the result via store_buffer_16x16() with a pitch of 16
+//   tx_type - selects the column (vertical) and row (horizontal) 1D kernels
+//   bd      - unused
+//
+// Every case runs the same pipeline: load_buffer_16x16 -> column kernel ->
+// shift_right_2_round_s32_x4 -> transpose_arrays_s32_16x16 -> row kernel ->
+// store_buffer_16x16. Up/down flipping is delegated to
+// ud_adjust_input_and_stride(); the last argument of load_buffer_16x16 is 1
+// only for the types whose horizontal component is FLIPADST.
+//
+// Types whose column kernel, row kernel and load flag coincide share a single
+// case body. An unknown tx_type trips the assert and leaves `coeff` untouched.
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Cosine-table precision for the column and row passes respectively.
+  const int col_bit = av1_fwd_cos_bit_col[2][2];
+  const int row_bit = av1_fwd_cos_bit_row[2][2];
+
+  // Scratch space: buf0 holds the column pass, buf1 the transposed row pass.
+  int32x4_t buf0[64], buf1[64];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case ADST_DCT:
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case ADST_ADST:
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fdct16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fadst16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case IDTX:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fdct16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fdct16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case V_ADST:
+    case V_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fadst16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fidentity16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, buf0, stride, 0);
+      highbd_fidentity16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, buf0, stride, 1);
+      highbd_fidentity16_xn_neon(buf0, buf0, col_bit, 4);
+      shift_right_2_round_s32_x4(buf0, buf0, 64);
+      transpose_arrays_s32_16x16(buf0, buf1);
+      highbd_fadst16_xn_neon(buf1, buf1, row_bit, 4);
+      store_buffer_16x16(buf1, coeff, /*stride=*/16);
+      break;
+    default: assert(0);
+  }
+}
+/* Function-pointer types for the fused 1D kernels that the TRANSFORM_*
+ * macros generate.
+ *
+ * Column-pass kernels read int16_t samples (`in`, pitch `stride`) and write
+ * int32x4_t vectors; `bit` is forwarded to the 1D kernel and `lr_flip` to the
+ * load helper. The "_many" flavour additionally takes `howmany` (number of
+ * kernel invocations) and `hm_stride` (output advance, in vectors, between
+ * invocations).
+ */
+typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
+ int stride, int bit, int lr_flip);
+typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
+ int32x4_t *out, int stride,
+ int bit, int lr_flip,
+ int howmany, int hm_stride);
+/* Row-pass kernels read int32x4_t vectors and store int32_t coefficients
+ * (`out`, pitch `stride`). The "_many" flavour additionally takes `howmany`
+ * and `hm_stride` (input advance, in vectors, between invocations). */
+typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
+ int bit, int stride);
+typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
+ int32_t *out, int bit,
+ int howmany, int hm_stride,
+ int stride);
+
+// Construct component kernels that include the load_buffer and store_buffer
+// stages to avoid the need to spill loaded data to the stack between these and
+// the txfm kernel calls.
+// The TRANSFORM_*_ONE cases are only ever called in situations where the
+// howmany parameter would be one, so no need for the loop at all in these
+// cases.
+
+// Defines highbd_<name>_col_neon(): loads one 4-wide, n-tall strip of samples
+// with load_buffer_4x<n>() into a local array and feeds it straight to
+// highbd_<name>_x4_neon(), which writes `output`.
+#define TRANSFORM_COL_ONE(name, n)                                    \
+  static void highbd_##name##_col_neon(const int16_t *input,          \
+                                       int32x4_t *output, int stride, \
+                                       int cos_bit, int lr_flip) {    \
+    int32x4_t loaded[n];                                              \
+    load_buffer_4x##n(input, loaded, stride, lr_flip);                \
+    highbd_##name##_x4_neon(loaded, output, cos_bit);                 \
+  }
+
+// Defines highbd_<name>_col_many_neon(): repeats the load + 1D kernel pair
+// `howmany` times (at least once). Invocation k reads the strip starting at
+// input + 4 * k and writes its result at output + k * hm_stride.
+#define TRANSFORM_COL_MANY(name, n)                                        \
+  static void highbd_##name##_col_many_neon(                               \
+      const int16_t *input, int32x4_t *output, int stride, int cos_bit,    \
+      int lr_flip, int howmany, int hm_stride) {                           \
+    int strip = 0;                                                         \
+    int in_offset = 0;  /* always strip * 4 */                             \
+    int out_offset = 0; /* always strip * hm_stride */                     \
+    do {                                                                   \
+      int32x4_t loaded[n];                                                 \
+      load_buffer_4x##n(&input[in_offset], loaded, stride, lr_flip);       \
+      highbd_##name##_x4_neon(loaded, &output[out_offset], cos_bit);       \
+      in_offset += 4;                                                      \
+      out_offset += hm_stride;                                             \
+    } while (++strip < howmany);                                           \
+  }
+
+// Defines highbd_<name>_row_neon(): runs highbd_<name>_x4_neon() on `input`
+// into a local array and stores that array with store_buffer_<n>x4().
+#define TRANSFORM_ROW_ONE(name, n)                                        \
+  static void highbd_##name##_row_neon(                                   \
+      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+    int32x4_t transformed[n];                                             \
+    highbd_##name##_x4_neon(input, transformed, cos_bit);                 \
+    store_buffer_##n##x4(transformed, output, stride);                    \
+  }
+
+// Defines highbd_<name>_row_rect_neon(): like the plain row kernel, but the
+// transformed vectors pass through round_rect_array_s32_neon() (in place)
+// before being stored with store_buffer_<n>x4().
+#define TRANSFORM_ROW_RECT_ONE(name, n)                                   \
+  static void highbd_##name##_row_rect_neon(                              \
+      const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+    int32x4_t transformed[n];                                             \
+    highbd_##name##_x4_neon(input, transformed, cos_bit);                 \
+    round_rect_array_s32_neon(transformed, transformed, (n));             \
+    store_buffer_##n##x4(transformed, output, stride);                    \
+  }
+
+// Defines highbd_<name>_row_many_neon(): repeats the 1D kernel + store pair
+// `howmany` times (at least once). Invocation k reads its vectors at
+// input + k * hm_stride and stores its result at output + 4 * k.
+#define TRANSFORM_ROW_MANY(name, n)                                        \
+  static void highbd_##name##_row_many_neon(                               \
+      const int32x4_t *input, int32_t *output, int cos_bit, int howmany,   \
+      int hm_stride, int stride) {                                         \
+    int strip = 0;                                                         \
+    int in_offset = 0;  /* always strip * hm_stride */                     \
+    int out_offset = 0; /* always strip * 4 */                             \
+    do {                                                                   \
+      int32x4_t transformed[n];                                            \
+      highbd_##name##_x4_neon(&input[in_offset], transformed, cos_bit);    \
+      store_buffer_##n##x4(transformed, &output[out_offset], stride);      \
+      in_offset += hm_stride;                                              \
+      out_offset += 4;                                                     \
+    } while (++strip < howmany);                                           \
+  }
+
+// Defines highbd_<name>_row_rect_many_neon(): repeats the 1D kernel,
+// round_rect_array_s32_neon() and store sequence `howmany` times (at least
+// once). Invocation k reads its vectors at input + k * hm_stride and stores
+// its result at output + 4 * k.
+#define TRANSFORM_ROW_RECT_MANY(name, n)                                   \
+  static void highbd_##name##_row_rect_many_neon(                          \
+      const int32x4_t *input, int32_t *output, int cos_bit, int howmany,   \
+      int hm_stride, int stride) {                                         \
+    int strip = 0;                                                         \
+    int in_offset = 0;  /* always strip * hm_stride */                     \
+    int out_offset = 0; /* always strip * 4 */                             \
+    do {                                                                   \
+      int32x4_t transformed[n];                                            \
+      highbd_##name##_x4_neon(&input[in_offset], transformed, cos_bit);    \
+      round_rect_array_s32_neon(transformed, transformed, (n));            \
+      store_buffer_##n##x4(transformed, &output[out_offset], stride);      \
+      in_offset += hm_stride;                                              \
+      out_offset += 4;                                                     \
+    } while (++strip < howmany);                                           \
+  }
+/* Single-invocation column kernels: highbd_{fdct8,fadst8,fidentity8}_col_neon. */
+TRANSFORM_COL_ONE(fdct8, 8)
+TRANSFORM_COL_ONE(fadst8, 8)
+TRANSFORM_COL_ONE(fidentity8, 8)
+/* Looping column kernels (highbd_<name>_col_many_neon) for 4-, 8- and
+ * 16-point DCT, ADST and identity. */
+TRANSFORM_COL_MANY(fdct4, 4)
+TRANSFORM_COL_MANY(fdct8, 8)
+TRANSFORM_COL_MANY(fdct16, 16)
+TRANSFORM_COL_MANY(fadst4, 4)
+TRANSFORM_COL_MANY(fadst8, 8)
+TRANSFORM_COL_MANY(fadst16, 16)
+TRANSFORM_COL_MANY(fidentity4, 4)
+TRANSFORM_COL_MANY(fidentity8, 8)
+TRANSFORM_COL_MANY(fidentity16, 16)
+/* Single-invocation 16-point row kernels: highbd_<name>_row_neon. */
+TRANSFORM_ROW_ONE(fdct16, 16)
+TRANSFORM_ROW_ONE(fadst16, 16)
+TRANSFORM_ROW_ONE(fidentity16, 16)
+/* Single-invocation 8-point row kernels with the extra
+ * round_rect_array_s32_neon() step: highbd_<name>_row_rect_neon. */
+TRANSFORM_ROW_RECT_ONE(fdct8, 8)
+TRANSFORM_ROW_RECT_ONE(fadst8, 8)
+TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
+/* Looping row kernels without the rect rounding step; compiled only when
+ * CONFIG_REALTIME_ONLY is 0. */
+#if !CONFIG_REALTIME_ONLY
+TRANSFORM_ROW_MANY(fdct4, 4)
+TRANSFORM_ROW_MANY(fdct8, 8)
+TRANSFORM_ROW_MANY(fadst4, 4)
+TRANSFORM_ROW_MANY(fadst8, 8)
+TRANSFORM_ROW_MANY(fidentity4, 4)
+TRANSFORM_ROW_MANY(fidentity8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+/* Looping row kernels with the rect rounding step
+ * (highbd_<name>_row_rect_many_neon); always compiled. */
+TRANSFORM_ROW_RECT_MANY(fdct4, 4)
+TRANSFORM_ROW_RECT_MANY(fdct8, 8)
+TRANSFORM_ROW_RECT_MANY(fdct16, 16)
+TRANSFORM_ROW_RECT_MANY(fadst4, 4)
+TRANSFORM_ROW_RECT_MANY(fadst8, 8)
+TRANSFORM_ROW_RECT_MANY(fadst16, 16)
+TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
+TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
+TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
+/* TX_TYPE -> looping 8-point column kernel (load + 1D transform). Only the
+ * vertical component of each 2D type matters here: DCT -> fdct8, ADST and
+ * FLIPADST -> fadst8, identity (IDTX and H_*) -> fidentity8.
+ * The initializer is positional: entry order must match the TX_TYPE values
+ * named in the per-line comments (the enum is declared elsewhere). */
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_col_many_neon, // DCT_DCT
+ highbd_fadst8_col_many_neon, // ADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_ADST
+ highbd_fadst8_col_many_neon, // ADST_ADST
+ highbd_fadst8_col_many_neon, // FLIPADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_many_neon, // ADST_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_many_neon, // IDTX
+ highbd_fdct8_col_many_neon, // V_DCT
+ highbd_fidentity8_col_many_neon, // H_DCT
+ highbd_fadst8_col_many_neon, // V_ADST
+ highbd_fidentity8_col_many_neon, // H_ADST
+ highbd_fadst8_col_many_neon, // V_FLIPADST
+ highbd_fidentity8_col_many_neon // H_FLIPADST
+ };
+/* TX_TYPE -> single-invocation 8-point column kernel (the non-looping
+ * counterparts of the "_col_many" kernels). Selection follows the vertical
+ * component of each type. Positional initializer: entry order must match
+ * the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_col_neon, // DCT_DCT
+ highbd_fadst8_col_neon, // ADST_DCT
+ highbd_fdct8_col_neon, // DCT_ADST
+ highbd_fadst8_col_neon, // ADST_ADST
+ highbd_fadst8_col_neon, // FLIPADST_DCT
+ highbd_fdct8_col_neon, // DCT_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_neon, // ADST_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_neon, // IDTX
+ highbd_fdct8_col_neon, // V_DCT
+ highbd_fidentity8_col_neon, // H_DCT
+ highbd_fadst8_col_neon, // V_ADST
+ highbd_fidentity8_col_neon, // H_ADST
+ highbd_fadst8_col_neon, // V_FLIPADST
+ highbd_fidentity8_col_neon // H_FLIPADST
+};
+/* TX_TYPE -> looping 16-point column kernel. Selection follows the vertical
+ * component of each type (DCT -> fdct16, ADST/FLIPADST -> fadst16,
+ * identity -> fidentity16). Positional initializer: entry order must match
+ * the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_col_many_neon, // DCT_DCT
+ highbd_fadst16_col_many_neon, // ADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_ADST
+ highbd_fadst16_col_many_neon, // ADST_ADST
+ highbd_fadst16_col_many_neon, // FLIPADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_col_many_neon, // ADST_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_col_many_neon, // IDTX
+ highbd_fdct16_col_many_neon, // V_DCT
+ highbd_fidentity16_col_many_neon, // H_DCT
+ highbd_fadst16_col_many_neon, // V_ADST
+ highbd_fidentity16_col_many_neon, // H_ADST
+ highbd_fadst16_col_many_neon, // V_FLIPADST
+ highbd_fidentity16_col_many_neon // H_FLIPADST
+ };
+/* TX_TYPE -> looping 4-point column kernel. Selection follows the vertical
+ * component of each type (DCT -> fdct4, ADST/FLIPADST -> fadst4,
+ * identity -> fidentity4). Positional initializer: entry order must match
+ * the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_col_many_neon, // DCT_DCT
+ highbd_fadst4_col_many_neon, // ADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_ADST
+ highbd_fadst4_col_many_neon, // ADST_ADST
+ highbd_fadst4_col_many_neon, // FLIPADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_col_many_neon, // ADST_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_col_many_neon, // IDTX
+ highbd_fdct4_col_many_neon, // V_DCT
+ highbd_fidentity4_col_many_neon, // H_DCT
+ highbd_fadst4_col_many_neon, // V_ADST
+ highbd_fidentity4_col_many_neon, // H_ADST
+ highbd_fadst4_col_many_neon, // V_FLIPADST
+ highbd_fidentity4_col_many_neon // H_FLIPADST
+ };
+/* TX_TYPE -> 16-point row kernel (1D transform + store). Selection follows
+ * the horizontal component of each type: DCT -> fdct16, ADST/FLIPADST ->
+ * fadst16, identity (IDTX and V_*) -> fidentity16.
+ * NOTE: despite the "_xn_" in the name, the element type is the
+ * single-invocation fwd_transform_1d_row_neon, not the "_many" type.
+ * Positional initializer: entry order must match the TX_TYPE values named
+ * in the per-line comments. */
+static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_neon, // DCT_DCT
+ highbd_fdct16_row_neon, // ADST_DCT
+ highbd_fadst16_row_neon, // DCT_ADST
+ highbd_fadst16_row_neon, // ADST_ADST
+ highbd_fdct16_row_neon, // FLIPADST_DCT
+ highbd_fadst16_row_neon, // DCT_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_neon, // ADST_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_neon, // IDTX
+ highbd_fidentity16_row_neon, // V_DCT
+ highbd_fdct16_row_neon, // H_DCT
+ highbd_fidentity16_row_neon, // V_ADST
+ highbd_fadst16_row_neon, // H_ADST
+ highbd_fidentity16_row_neon, // V_FLIPADST
+ highbd_fadst16_row_neon // H_FLIPADST
+};
+/* TX_TYPE -> looping 16-point row kernel with the
+ * round_rect_array_s32_neon() step ("row_rect"). Selection follows the
+ * horizontal component of each type. Positional initializer: entry order
+ * must match the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_rect_many_neon, // DCT_DCT
+ highbd_fdct16_row_rect_many_neon, // ADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_ADST
+ highbd_fadst16_row_rect_many_neon, // ADST_ADST
+ highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_rect_many_neon, // IDTX
+ highbd_fidentity16_row_rect_many_neon, // V_DCT
+ highbd_fdct16_row_rect_many_neon, // H_DCT
+ highbd_fidentity16_row_rect_many_neon, // V_ADST
+ highbd_fadst16_row_rect_many_neon, // H_ADST
+ highbd_fidentity16_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst16_row_rect_many_neon // H_FLIPADST
+ };
+/* TX_TYPE -> looping 8-point row kernel without the rect rounding step.
+ * Compiled only when CONFIG_REALTIME_ONLY is 0, like the kernels it
+ * references. Selection follows the horizontal component of each type.
+ * Positional initializer: entry order must match the TX_TYPE values named
+ * in the per-line comments. */
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_many_neon, // DCT_DCT
+ highbd_fdct8_row_many_neon, // ADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_ADST
+ highbd_fadst8_row_many_neon, // ADST_ADST
+ highbd_fdct8_row_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_many_neon, // IDTX
+ highbd_fidentity8_row_many_neon, // V_DCT
+ highbd_fdct8_row_many_neon, // H_DCT
+ highbd_fidentity8_row_many_neon, // V_ADST
+ highbd_fadst8_row_many_neon, // H_ADST
+ highbd_fidentity8_row_many_neon, // V_FLIPADST
+ highbd_fadst8_row_many_neon // H_FLIPADST
+ };
+#endif  // !CONFIG_REALTIME_ONLY
+/* TX_TYPE -> looping 8-point row kernel with the
+ * round_rect_array_s32_neon() step ("row_rect"). Selection follows the
+ * horizontal component of each type. Positional initializer: entry order
+ * must match the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_many_neon, // DCT_DCT
+ highbd_fdct8_row_rect_many_neon, // ADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_ADST
+ highbd_fadst8_row_rect_many_neon, // ADST_ADST
+ highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_many_neon, // IDTX
+ highbd_fidentity8_row_rect_many_neon, // V_DCT
+ highbd_fdct8_row_rect_many_neon, // H_DCT
+ highbd_fidentity8_row_rect_many_neon, // V_ADST
+ highbd_fadst8_row_rect_many_neon, // H_ADST
+ highbd_fidentity8_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_many_neon // H_FLIPADST
+ };
+/* TX_TYPE -> single-invocation 8-point row kernel. Selection follows the
+ * horizontal component of each type.
+ * NOTE: although the table name does not say "rect", every entry is a
+ * "_row_rect_" kernel, i.e. one that applies round_rect_array_s32_neon()
+ * before storing.
+ * Positional initializer: entry order must match the TX_TYPE values named
+ * in the per-line comments. */
+static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_neon, // DCT_DCT
+ highbd_fdct8_row_rect_neon, // ADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_ADST
+ highbd_fadst8_row_rect_neon, // ADST_ADST
+ highbd_fdct8_row_rect_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_neon, // IDTX
+ highbd_fidentity8_row_rect_neon, // V_DCT
+ highbd_fdct8_row_rect_neon, // H_DCT
+ highbd_fidentity8_row_rect_neon, // V_ADST
+ highbd_fadst8_row_rect_neon, // H_ADST
+ highbd_fidentity8_row_rect_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_neon // H_FLIPADST
+};
+/* TX_TYPE -> looping 4-point row kernel without the rect rounding step.
+ * Compiled only when CONFIG_REALTIME_ONLY is 0, like the kernels it
+ * references. Selection follows the horizontal component of each type.
+ * Positional initializer: entry order must match the TX_TYPE values named
+ * in the per-line comments. */
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_many_neon, // DCT_DCT
+ highbd_fdct4_row_many_neon, // ADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_ADST
+ highbd_fadst4_row_many_neon, // ADST_ADST
+ highbd_fdct4_row_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_many_neon, // IDTX
+ highbd_fidentity4_row_many_neon, // V_DCT
+ highbd_fdct4_row_many_neon, // H_DCT
+ highbd_fidentity4_row_many_neon, // V_ADST
+ highbd_fadst4_row_many_neon, // H_ADST
+ highbd_fidentity4_row_many_neon, // V_FLIPADST
+ highbd_fadst4_row_many_neon // H_FLIPADST
+ };
+#endif  // !CONFIG_REALTIME_ONLY
+/* TX_TYPE -> looping 4-point row kernel with the
+ * round_rect_array_s32_neon() step ("row_rect"). Selection follows the
+ * horizontal component of each type. Positional initializer: entry order
+ * must match the TX_TYPE values named in the per-line comments. */
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_rect_many_neon, // DCT_DCT
+ highbd_fdct4_row_rect_many_neon, // ADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_ADST
+ highbd_fadst4_row_rect_many_neon, // ADST_ADST
+ highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_rect_many_neon, // IDTX
+ highbd_fidentity4_row_rect_many_neon, // V_DCT
+ highbd_fdct4_row_rect_many_neon, // H_DCT
+ highbd_fidentity4_row_rect_many_neon, // V_ADST
+ highbd_fadst4_row_rect_many_neon, // H_ADST
+ highbd_fidentity4_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst4_row_rect_many_neon // H_FLIPADST
+ };
+/*
+ * Forward 32-point DCT over an array of 32 int32x4_t vectors. Each vector
+ * presumably carries one sample of four independent transforms, one per lane.
+ *
+ * input   : 32 vectors; read only by stage 1.
+ * output  : 32 vectors; written only by stage 9, so output may alias input
+ *           (callers in this file run it in place).
+ * cos_bit : selects the cosine table through cospi_arr_s32() and is handed,
+ *           negated and broadcast, to every butterfly helper.
+ *
+ * Stages alternate between two local 32-vector buffers (buf1 after odd
+ * stages, buf0 after even stages). Stage 9 stores the result in 5-bit
+ * bit-reversed order: output[i] = buf0[bitrev5(i)].
+ */
+static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); // negated; presumably a right-shift count for the butterflies -- TODO confirm
+
+ // Workspaces for intermediate transform steps.
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
+ v_cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
+ v_cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
+ v_cos_bit);
+ buf0[7] = buf1[7];
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
+ v_cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+
+ butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+
+ butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
+ v_cos_bit);
+
+ // stage 9: pure reordering, output[i] = buf0[bit-reversed 5-bit index of i]
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+/*
+ * Forward 64-point DCT over an array of 64 int32x4_t vectors. Each vector
+ * presumably carries one sample of four independent transforms, one per lane.
+ *
+ * input   : 64 vectors; read only by stage 1.
+ * output  : 64 vectors; written only by stage 11, so output may alias input
+ *           (callers in this file run it in place).
+ * cos_bit : selects the cosine table through cospi_arr_s32() and is handed,
+ *           negated and broadcast, to every butterfly helper.
+ *
+ * Each of stages 1-10 writes its own 64-vector local array (x1..x10), which
+ * is 10 * 64 * 16 bytes = 10 KiB of locals before any compiler reuse.
+ * Stage 11 stores the result in 6-bit bit-reversed order:
+ * output[i] = x10[bitrev6(i)].
+ * NOTE(review): cos_bit is int8_t here but int in highbd_fdct32_x4_neon.
+ */
+static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int8_t cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); // negated; presumably a right-shift count for the butterflies -- TODO confirm
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre(x1, x2, 32);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre(x3, x4, 8);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre(x4, x5, 4);
+ x5[4] = x4[4];
+ butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
+ x5[7] = x4[7];
+ butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
+ butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
+ x6[8] = x5[8];
+ butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
+ butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ int32x4_t x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
+ butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
+ x7[16] = x6[16];
+ butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
+ butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
+ butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
+ butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
+ butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
+ x8[32] = x7[32];
+ butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
+ butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
+ butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
+ butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
+ butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ int32x4_t x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
+ butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
+ butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
+ butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
+ butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
+ butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
+ butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
+ butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
+ butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
+ butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
+ butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
+ butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
+ butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
+ butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
+ butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
+ butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
+ butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
+ butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
+ butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
+ butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
+ butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
+ butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
+ butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
+ butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
+
+ // stage 11: pure reordering, output[i] = x10[bit-reversed 6-bit index of i]
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+// 32-point identity "transform": each of the 32 input vectors is shifted left
+// by two bits (multiplied by four) and stored at the same index. cos_bit is
+// accepted only so the signature matches the other 1D kernels.
+static void highbd_fidentity32_x4_neon(const int32x4_t *input,
+                                       int32x4_t *output, int cos_bit) {
+  const int32x4_t *src = input;
+  int32x4_t *dst = output;
+  int remaining = 32;
+  (void)cos_bit;
+  do {
+    *dst++ = vshlq_n_s32(*src++, 2);
+  } while (--remaining != 0);
+}
+/*
+ * Column-pass dispatch for 32-point transforms, indexed by TX_TYPE.
+ * The two TRANSFORM_COL_MANY lines presumably generate the
+ * highbd_*_col_many_neon wrappers referenced below (the macro is defined
+ * outside this chunk -- confirm there).
+ * Only DCT_DCT and IDTX have a kernel; every other entry is NULL. The callers
+ * visible in this file invoke the selected entry without a NULL check, so
+ * they rely on tx_type being one of those two.
+ */
+TRANSFORM_COL_MANY(fdct32, 32)
+TRANSFORM_COL_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_col_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_col_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+/*
+ * Row-pass dispatch for 32-point transforms, indexed by TX_TYPE.
+ * The two TRANSFORM_ROW_MANY lines presumably generate the
+ * highbd_*_row_many_neon wrappers referenced below (the macro is defined
+ * outside this chunk -- confirm there).
+ * Only DCT_DCT and IDTX have a kernel; every other entry is NULL, and the
+ * callers visible in this file invoke the selected entry without a NULL
+ * check.
+ */
+TRANSFORM_ROW_MANY(fdct32, 32)
+TRANSFORM_ROW_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+/*
+ * Row-pass dispatch for 32-point transforms in the "rect" flavour, indexed by
+ * TX_TYPE. The two TRANSFORM_ROW_RECT_MANY lines presumably generate the
+ * highbd_*_row_rect_many_neon wrappers referenced below (the macro is defined
+ * outside this chunk -- confirm there, including what extra scaling "rect"
+ * implies).
+ * Only DCT_DCT and IDTX have a kernel; every other entry is NULL, and the
+ * callers visible in this file invoke the selected entry without a NULL
+ * check.
+ */
+TRANSFORM_ROW_RECT_MANY(fdct32, 32)
+TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_rect_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_rect_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+// Forward 2D transform, 16x8 block: 8-point column kernels, then a
+// shift/transpose, then 16-point "rect" row kernels. The kernels are looked
+// up by tx_type; `bd` is unused.
+void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // One cos_bit value (taken from the column table) drives both passes.
+  const int cos_bit = av1_fwd_cos_bit_col[2][1];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm8_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_rect_highbd_txfm16_xn_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column pass over four groups. With a left/right flip the groups are
+  // written starting from the last one and stepping backwards.
+  int32x4_t col_out[32];
+  int32x4_t *const col_dst = lr_flip ? col_out + 3 * 8 : col_out;
+  const int group_step = lr_flip ? -8 : 8;
+  col_fn(input, col_dst, stride, cos_bit, /*lr_flip=*/lr_flip ? 1 : 0,
+         /*howmany=*/4, /*hm_stride=*/group_step);
+  shift_right_2_round_s32_x4(col_out, col_out, 32);
+
+  int32x4_t row_in[32];
+  transpose_arrays_s32_16x8(col_out, row_in);
+
+  // Row pass over two groups, written straight to coeff.
+  row_fn(row_in, coeff, cos_bit, /*howmany=*/2, /*hm_stride=*/16,
+         /*stride=*/8);
+}
+
+// Forward 2D transform, 8x16 block: 16-point column kernels, then a
+// shift/transpose, then 8-point "rect" row kernels. The kernels are looked up
+// by tx_type; `bd` is unused.
+void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // One cos_bit value (taken from the column table) drives both passes.
+  const int cos_bit = av1_fwd_cos_bit_col[1][2];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_rect_highbd_txfm8_xn_arr[tx_type];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass over two groups. With a left/right flip the second group
+  // slot is filled first and the step is negative.
+  int32x4_t col_out[32];
+  int32x4_t *const col_dst = lr_flip ? col_out + 16 : col_out;
+  const int group_step = lr_flip ? -16 : 16;
+  col_fn(input, col_dst, stride, cos_bit, /*lr_flip=*/lr_flip ? 1 : 0,
+         /*howmany=*/2, /*hm_stride=*/group_step);
+  shift_right_2_round_s32_x4(col_out, col_out, 32);
+
+  int32x4_t row_in[32];
+  transpose_arrays_s32_8x16(col_out, row_in);
+
+  // Row pass over four groups, written straight to coeff.
+  row_fn(row_in, coeff, cos_bit, /*howmany=*/4, /*hm_stride=*/8,
+         /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2D transform, 4x16 block: one group of 16-point column kernels,
+// then a shift/transpose, then 4-point row kernels. `bd` is unused.
+void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_highbd_txfm4_xn_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[0][2];
+  const int row_bit = av1_fwd_cos_bit_row[0][2];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass. There is a single group, so the destination and group
+  // stride are the same whether or not the block is mirrored; only the
+  // lr_flip argument differs.
+  int32x4_t col_out[16];
+  col_fn(input, col_out, stride, col_bit, /*lr_flip=*/lr_flip ? 1 : 0,
+         /*howmany=*/1, /*hm_stride=*/0);
+  shift_right_1_round_s32_x4(col_out, col_out, 16);
+
+  int32x4_t row_in[16];
+  transpose_arrays_s32_4x16(col_out, row_in);
+
+  // Row pass over four groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/4, /*hm_stride=*/4,
+         /*stride=*/16);
+}
+#endif
+
+// Forward 2D transform, 16x4 block: 4-point column kernels over four groups,
+// then a shift and an in-place transpose, then a single 16-point row kernel
+// call. `bd` is unused.
+void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm4_xn_arr[tx_type];
+  const fwd_transform_1d_row_neon row_fn = row_highbd_txfm16_xn_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[2][0];
+  const int row_bit = av1_fwd_cos_bit_row[2][0];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+  // Column pass. With a left/right flip the groups are written starting from
+  // the last one and stepping backwards.
+  int32x4_t work[16];
+  int32x4_t *const col_dst = lr_flip ? work + 3 * 4 : work;
+  const int group_step = lr_flip ? -4 : 4;
+  col_fn(input, col_dst, stride, col_bit, /*lr_flip=*/lr_flip ? 1 : 0,
+         /*howmany=*/4, /*hm_stride=*/group_step);
+
+  // The same buffer is shifted and transposed in place.
+  shift_right_1_round_s32_x4(work, work, 16);
+  transpose_arrays_s32_4x16(work, work);
+
+  // Row pass, written straight to coeff.
+  row_fn(work, coeff, row_bit, /*stride=*/4);
+}
+
+// Forward 2D transform, 16x32 block: 32-point column kernels, then a
+// shift/transpose, then 16-point "rect" row kernels. No flip handling is
+// done here; the 32-point table only has DCT_DCT and IDTX kernels. `bd` is
+// unused.
+void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[2][3];
+  const int row_bit = av1_fwd_cos_bit_row[2][3];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_rect_highbd_txfm16_xn_arr[tx_type];
+
+  // Column pass over four groups.
+  int32x4_t col_out[128];
+  col_fn(input, col_out, stride, col_bit, /*lr_flip=*/0, /*howmany=*/4,
+         /*hm_stride=*/32);
+  shift_right_4_round_s32_x4(col_out, col_out, 128);
+
+  int32x4_t row_in[128];
+  transpose_arrays_s32_16x32(col_out, row_in);
+
+  // Row pass over eight groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/8, /*hm_stride=*/16,
+         /*stride=*/32);
+}
+
+// Forward 2D transform, 32x64 block, DCT only (tx_type and bd are ignored).
+// Columns go through the 64-point DCT, rows through the 32-point DCT, and
+// the result is stored with store_buffer_32x32.
+void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  const int col_bit = av1_fwd_cos_bit_col[3][4];
+  const int row_bit = av1_fwd_cos_bit_row[3][4];
+
+  // Column pass: eight in-place 64-point DCTs, 64 vectors apart.
+  int32x4_t col_buf[512];
+  load_buffer_32x64(input, col_buf, stride, 0);
+  for (int32x4_t *grp = col_buf; grp != col_buf + 8 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, col_bit);
+  }
+  shift_right_2_round_s32_x4(col_buf, col_buf, 512);
+
+  int32x4_t row_buf[512];
+  transpose_arrays_s32_32x64(col_buf, row_buf);
+
+  // Row pass: sixteen in-place 32-point DCTs, 32 vectors apart.
+  for (int32x4_t *grp = row_buf; grp != row_buf + 16 * 32; grp += 32) {
+    highbd_fdct32_x4_neon(grp, grp, row_bit);
+  }
+  round_shift2_rect_array_s32_neon(row_buf, row_buf, 512);
+  store_buffer_32x32(row_buf, coeff, /*stride=*/32);
+}
+
+// Forward 2D transform, 64x32 block, DCT only (tx_type and bd are ignored).
+// Columns go through the 32-point DCT, rows through the 64-point DCT, and
+// the result is stored with store_buffer_64x32.
+void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  const int col_bit = av1_fwd_cos_bit_col[4][3];
+  const int row_bit = av1_fwd_cos_bit_row[4][3];
+
+  // Column pass: sixteen in-place 32-point DCTs, 32 vectors apart.
+  int32x4_t col_buf[512];
+  load_buffer_64x32(input, col_buf, stride, 0);
+  for (int32x4_t *grp = col_buf; grp != col_buf + 16 * 32; grp += 32) {
+    highbd_fdct32_x4_neon(grp, grp, col_bit);
+  }
+  shift_right_4_round_s32_x4(col_buf, col_buf, 512);
+
+  int32x4_t row_buf[512];
+  transpose_arrays_s32_64x32(col_buf, row_buf);
+
+  // Row pass: eight in-place 64-point DCTs, 64 vectors apart.
+  for (int32x4_t *grp = row_buf; grp != row_buf + 8 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, row_bit);
+  }
+  round_shift2_rect_array_s32_neon(row_buf, row_buf, 512);
+  store_buffer_64x32(row_buf, coeff, /*stride=*/32);
+}
+
+// Forward 2D transform, 32x16 block: 16-point column kernels, then a
+// shift/transpose, then 32-point "rect" row kernels. No flip handling is
+// done here; the 32-point table only has DCT_DCT and IDTX kernels. `bd` is
+// unused.
+void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[3][2];
+  const int row_bit = av1_fwd_cos_bit_row[3][2];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm16_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_rect_highbd_txfm32_x4_arr[tx_type];
+
+  // Column pass over eight groups.
+  int32x4_t col_out[128];
+  col_fn(input, col_out, stride, col_bit, /*lr_flip=*/0, /*howmany=*/8,
+         /*hm_stride=*/16);
+  shift_right_4_round_s32_x4(col_out, col_out, 128);
+
+  int32x4_t row_in[128];
+  transpose_arrays_s32_32x16(col_out, row_in);
+
+  // Row pass over four groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/4, /*hm_stride=*/32,
+         /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2D transform, 8x32 block: 32-point column kernels, then a
+// shift/transpose, then 8-point row kernels. No flip handling is done here;
+// the 32-point table only has DCT_DCT and IDTX kernels. `bd` is unused.
+void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[1][3];
+  const int row_bit = av1_fwd_cos_bit_row[1][3];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_highbd_txfm8_xn_arr[tx_type];
+
+  // Column pass over two groups.
+  int32x4_t col_out[64];
+  col_fn(input, col_out, stride, col_bit, /*lr_flip=*/0, /*howmany=*/2,
+         /*hm_stride=*/32);
+  shift_right_2_round_s32_x4(col_out, col_out, 64);
+
+  int32x4_t row_in[64];
+  transpose_arrays_s32_8x32(col_out, row_in);
+
+  // Row pass over eight groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/8, /*hm_stride=*/8,
+         /*stride=*/32);
+}
+
+// Forward 2D transform, 32x8 block: 8-point column kernels, then a
+// shift/transpose, then 32-point row kernels. No flip handling is done here;
+// the 32-point table only has DCT_DCT and IDTX kernels. `bd` is unused.
+void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[3][1];
+  const int row_bit = av1_fwd_cos_bit_row[3][1];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm8_xn_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_highbd_txfm32_x4_arr[tx_type];
+
+  // Column pass over eight groups.
+  int32x4_t col_out[64];
+  col_fn(input, col_out, stride, col_bit, /*lr_flip=*/0, /*howmany=*/8,
+         /*hm_stride=*/8);
+  shift_right_2_round_s32_x4(col_out, col_out, 64);
+
+  int32x4_t row_in[64];
+  transpose_arrays_s32_32x8(col_out, row_in);
+
+  // Row pass over two groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/2, /*hm_stride=*/32,
+         /*stride=*/8);
+}
+#endif
+
+// Forward 2D transform, 4x8 block: a single 8-point column kernel call, then
+// a shift/transpose, then 4-point "rect" row kernels. `bd` is unused.
+void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const fwd_transform_1d_col_neon col_fn = col_highbd_txfm8_x4_arr[tx_type];
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_rect_highbd_txfm4_xn_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[0][1];
+  const int row_bit = av1_fwd_cos_bit_row[0][1];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+  // Column pass; the flip flag is forwarded unchanged to the kernel.
+  int32x4_t col_out[8];
+  col_fn(input, col_out, stride, col_bit, lr_flip);
+  shift_right_1_round_s32_x4(col_out, col_out, 8);
+
+  int32x4_t row_in[8];
+  transpose_arrays_s32_4x8(col_out, row_in);
+
+  // Row pass over two groups, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*howmany=*/2, /*hm_stride=*/4,
+         /*stride=*/8);
+}
+
+// Forward 2D transform, 8x4 block: 4-point column kernels over two groups,
+// then a shift/transpose, then a single 8-point row kernel call. `bd` is
+// unused.
+void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm4_xn_arr[tx_type];
+  const fwd_transform_1d_row_neon row_fn = row_highbd_txfm8_x4_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[1][0];
+  const int row_bit = av1_fwd_cos_bit_row[1][0];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+  // Column pass. With a left/right flip the second group slot is filled
+  // first and the step is negative.
+  int32x4_t col_out[8];
+  int32x4_t *const col_dst = lr_flip ? col_out + 4 : col_out;
+  const int group_step = lr_flip ? -4 : 4;
+  col_fn(input, col_dst, stride, col_bit, /*lr_flip=*/lr_flip ? 1 : 0,
+         /*howmany=*/2, /*hm_stride=*/group_step);
+
+  shift_right_1_round_s32_x4(col_out, col_out, 8);
+
+  int32x4_t row_in[8];
+  transpose_arrays_s32_8x4(col_out, row_in);
+
+  // Row pass, written straight to coeff.
+  row_fn(row_in, coeff, row_bit, /*stride=*/4);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2D transform, 16x64 block: 64-point DCT on the columns, then a
+// shift/transpose, then highbd_fdct16_xn_neon on the rows, stored with
+// store_buffer_16x32. tx_type is only used to derive the flip flags; `bd` is
+// unused.
+void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[2][4];
+  const int row_bit = av1_fwd_cos_bit_row[2][4];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
+
+  // Column pass: four in-place 64-point DCTs, 64 vectors apart.
+  int32x4_t col_buf[256];
+  load_buffer_16x64(input, col_buf, stride, lr_flip);
+  for (int32x4_t *grp = col_buf; grp != col_buf + 4 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, col_bit);
+  }
+  shift_right_2_round_s32_x4(col_buf, col_buf, 256);
+
+  int32x4_t row_buf[256];
+  transpose_arrays_s32_16x64(col_buf, row_buf);
+
+  // Row pass, in place, then store.
+  highbd_fdct16_xn_neon(row_buf, row_buf, row_bit, 8);
+  store_buffer_16x32(row_buf, coeff, /*stride=*/32);
+}
+
+// Forward 2D transform, 64x16 block: highbd_fdct16_xn_neon on the columns,
+// then a shift/transpose, then 64-point DCT on the rows. After the store,
+// the 16 * 32 coefficients starting at coeff + 16 * 32 are cleared to zero.
+// tx_type is only used to derive the flip flags; `bd` is unused.
+void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int col_bit = av1_fwd_cos_bit_col[4][2];
+  const int row_bit = av1_fwd_cos_bit_row[4][2];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+  // Column pass, in place.
+  int32x4_t col_buf[256];
+  load_buffer_64x16(input, col_buf, stride, lr_flip);
+  highbd_fdct16_xn_neon(col_buf, col_buf, col_bit, 16);
+  shift_right_4_round_s32_x4(col_buf, col_buf, 256);
+
+  int32x4_t row_buf[256];
+  transpose_arrays_s32_64x16(col_buf, row_buf);
+
+  // Row pass: four in-place 64-point DCTs, 64 vectors apart.
+  for (int32x4_t *grp = row_buf; grp != row_buf + 4 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, row_bit);
+  }
+  store_buffer_64x16(row_buf, coeff, /*stride=*/16);
+  memset(&coeff[16 * 32], 0, 16 * 32 * sizeof(coeff[0]));
+}
+#endif
+
+// Forward 2D transform, 32x32 block: 32-point column kernels, then a
+// shift/transpose, then 32-point row kernels. Both passes use a fixed
+// cos_bit of 12. The 32-point tables only have DCT_DCT and IDTX kernels.
+// `bd` is unused.
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const fwd_transform_1d_row_many_neon row_fn =
+      row_highbd_txfm32_x4_arr[tx_type];
+  const fwd_transform_1d_col_many_neon col_fn =
+      col_highbd_txfm32_x4_arr[tx_type];
+
+  // Column pass over eight groups.
+  int32x4_t col_out[256];
+  col_fn(input, col_out, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
+         /*hm_stride=*/32);
+  shift_right_4_round_s32_x4(col_out, col_out, 256);
+
+  int32x4_t row_in[256];
+  transpose_arrays_s32_32x32(col_out, row_in);
+
+  // Row pass over eight groups, written straight to output.
+  row_fn(row_in, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
+         /*stride=*/32);
+}
+
+// Forward 2D transform, 64x64 block, DCT only (tx_type and bd are ignored).
+// All sixteen column groups are transformed with cos_bit 13; after the
+// transpose only the first eight row groups (512 vectors) are transformed,
+// with cos_bit 10, then shifted and stored with store_buffer_64x32.
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+
+  // Column pass: sixteen in-place 64-point DCTs, 64 vectors apart.
+  int32x4_t col_buf[1024];
+  load_buffer_64x64(input, col_buf, stride, 0);
+  for (int32x4_t *grp = col_buf; grp != col_buf + 16 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, 13);
+  }
+  shift_right_2_round_s32_x4(col_buf, col_buf, 1024);
+
+  int32x4_t row_buf[1024];
+  transpose_arrays_s32_64x64(col_buf, row_buf);
+
+  // Row pass: eight in-place 64-point DCTs over the first half of row_buf.
+  for (int32x4_t *grp = row_buf; grp != row_buf + 8 * 64; grp += 64) {
+    highbd_fdct64_x4_neon(grp, grp, 10);
+  }
+  shift_right_2_round_s32_x4(row_buf, row_buf, 512);
+  store_buffer_64x32(row_buf, output, /*stride=*/32);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
new file mode 100644
index 0000000000..47b5f5cfb7
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -0,0 +1,1207 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void highbd_calc_proj_params_r0_r1_neon(  // Fills all of H, C.
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);  // The inner loop consumes 8 pixels per iteration.
+ const int size = width * height;  // Read before `height` is counted down.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);  // 64-bit running sums, two per output.
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {  // One row per iteration.
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {  // 8 pixels per iteration.
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =  // u = dat << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =  // s = src << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);  // s -= u
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);  // f0 -= u
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+ f1_lo = vsubq_s32(f1_lo, u_lo);  // f1 -= u
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));  // h00 += f0 * f0
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));  // h11 += f1 * f1
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));  // h01 += f0 * f1
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));  // c0 += f0 * s
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));  // c1 += f1 * s
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;  // Sum / pixel count.
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];  // H is symmetric.
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r0_neon(  // Only H[0][0] and C[0].
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);  // The inner loop consumes 8 pixels per iteration.
+ const int size = width * height;  // Read before `height` is counted down.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);  // 64-bit running sums, two per output.
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {  // One row per iteration.
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {  // 8 pixels per iteration.
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int32x4_t u_lo =  // u = dat << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =  // s = src << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);  // s -= u
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);  // f0 -= u
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));  // h00 += f0 * f0
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));  // c0 += f0 * s
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;  // Sum / pixel count.
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;  // Other entries untouched.
+}
+
+static INLINE void highbd_calc_proj_params_r1_neon(  // Only H[1][1] and C[1].
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);  // The inner loop consumes 8 pixels per iteration.
+ const int size = width * height;  // Read before `height` is counted down.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h11_lo = vdupq_n_s64(0);  // 64-bit running sums, two per output.
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {  // One row per iteration.
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {  // 8 pixels per iteration.
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =  // u = dat << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =  // s = src << SGRPROJ_RST_BITS
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);  // s -= u
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f1_lo = vsubq_s32(f1_lo, u_lo);  // f1 -= u
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));  // h11 += f1 * f1
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));  // c1 += f1 * s
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;  // Sum / pixel count.
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;  // Other entries untouched.
+}
+
+// Dispatches to one of three helpers, depending on which of params->r[0] and
+// params->r[1] are positive:
+// 1) Both params->r[0] > 0 and params->r[1] > 0: all elements of C and H are
+// computed.
+// 2) Only params->r[0] > 0: only H[0][0] and C[0] are computed.
+// 3) Only params->r[1] > 0: only H[1][1] and C[1] are computed.
+// Entries not listed are not written here; if neither is positive, none are.
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {  // flt1 is not read on this path.
+ highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {  // flt0 is not read on this path.
+ highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64  // One 16-byte look-up into the 32-byte table {a, b}.
+ uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } };
+ return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx));
+#else  // Two 8-byte look-ups (low and high half of idx) into the same table.
+ uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
+ vtbl4_u8(table, vget_high_u8(idx))));
+#endif
+}
+
+static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c,
+ uint8x16_t idx) {
+#if AOM_ARCH_AARCH64  // One 16-byte look-up into the 48-byte table {a, b, c}.
+ uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b),
+ vreinterpretq_u8_s16(c) } };
+ return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx));
+#else
+ // Not a general look-up, it only suits the wiener_win == 5 shuffles: idx
+ // bytes 0-7 must be < 24 (a, low b), bytes 8-15 in [16, 40) (b, low c).
+ uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)) } };
+ uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)),
+ vreinterpret_u8_s16(vget_low_s16(c)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(
+ vtbl3_u8(table_lo, vget_low_u8(idx)),
+ vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16)))));  // Rebase on b.
+#endif
+}
+
+static INLINE int64_t div_shift_s64(int64_t x, int power) {  // x / 2^power.
+ return (x < 0 ? x + (1ll << power) - 1 : x) >> power;  // Bias negatives so the shift truncates toward 0.
+}
+
+// The M matrix is accumulated in a bitdepth-dependent number of steps to
+// speed up the computation. This function computes the final M as the sum of
+// the accumulated (src_s64) and residual (src_s32) parts, scaled down by
+// 2^shift, and transposes it as the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int shift) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;  // Transposed source index.
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in a bitdepth-dependent number of
+// steps to speed up the computation, so the final H is the sum of the
+// accumulated (src_s64) and residual (src_s32) parts, scaled down by 2^shift.
+// Only the upper triangle of the sources is read (rows are `stride` apart);
+// dst is written densely, including the mirrored lower triangle.
+static INLINE void update_H(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int stride, int shift) {
+ // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+ // `wiener_win2` is 9, the M matrix is 3x3:
+ // 0, 3, 6
+ // 1, 4, 7
+ // 2, 5, 8
+ //
+ // This is viewed as a vector to compute H (9x9) by vector outer product:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8
+ //
+ // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+ // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+ // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+ // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+ // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+ // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+ // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+ // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+ // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Loop through the indices according to the remapping above, along the
+ // columns:
+ // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + wiener_win, ...,
+ // wiener_win - 1, wiener_win - 1 + wiener_win, ...
+ // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = i; j < wiener_win2; j += wiener_win) {
+ // These two inner loops are the same as the two outer loops, but running
+ // along rows instead of columns. For the 3x3 case `l` will be:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int k = 0; k < wiener_win; ++k) {
+ for (int l = k; l < wiener_win2; l += wiener_win) {
+ // The nominal double transpose indexing would be:
+ // int idx = stride * j + l;
+ // However only the upper-right triangle is read, so the smaller of
+ // (j, l) is used as the row and the larger as the column.
+ int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
+
+ // Resulting matrix is filled by combining the 64-bit and the residual
+ // 32-bit matrices together with scaling.
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+ }
+ }
+}
+
+// Load 8 elements from each of 7 consecutive rows into 7 128-bit vectors; the
+// last load starts at src - 1 to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
+ dst[4] = vld1q_s16(src);
+ src += stride;
+ dst[5] = vld1q_s16(src);
+ src += stride;
+ dst[6] = vld1q_s16(src - 1);  // Lane k holds src[k - 1] for this row only.
+}
+
+static INLINE void highbd_compute_stats_win7_neon(  // M and H, 7x7 window.
+ const uint16_t *dgd, const uint16_t *src, int avg, int width, int height,
+ int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
+ // matrices. Rows 0-5 serve the first pixel, rows 6-11 the second pixel.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+ 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80);
+ const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96);
+ const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112);
+ const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128);
+ const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144);
+ const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160);
+ const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176);
+
+ // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+ // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+ // be as high as 32768/2048/128 for the compute stats.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;  // Step from row end to next row.
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+ dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16);
+ dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16);
+
+ // Re-arrange the combined 8x7 matrix to have the 2 whole 7x7 matrices (1
+ // for each of the 2 pixels) separated into distinct int16x8_t[6] arrays.
+ // These arrays contain 48 elements of the 49 (7x7); `avg` has already been
+ // subtracted above. Each DGD_AVG buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6);
+ dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7);
+ dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8);
+ dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3);
+ dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9);
+ dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4);
+ dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10);
+ dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5);
+ dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;
+ lh32 += WIENER_WIN2_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {  // NOTE(review): this pixel is not counted in acc_cnt.
+ // Load an 8x7 matrix exactly as in the loop above; only the first 7x7
+ // window is extracted from it here (lut0-lut5).
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ ++dgd;
+
+ // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix tightly
+ // packed into a int16x8_t[6] array. This array contains 48 elements of
+ // the 49 (7x7). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+ // buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16);
+ dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16);
+ dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16);
+ dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;  // Results are scaled by 2^-shift.
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift);
+}
+
+// Load 8 elements from each of 5 consecutive rows into 5 128-bit vectors; the
+// last load starts at src - 3 to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
+ dst[4] = vld1q_s16(src - 3);  // Lane k holds src[k - 3] for this row only.
+}
+
+static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
+ const uint16_t *src, int avg,
+ int width, int height,
+ int dgd_stride, int src_stride,
+ int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x3 matrix with consecutive elements from 5x5
+ // matrix.
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33,
+ 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35,
+ 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31,
+ };
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
+
+ // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+ // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+ // be as high as 32768/2048/128 for the compute stats.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+
+ // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1
+ // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays.
+ // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg`
+ // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+ dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1);
+ dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4);
+ dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
+ // matrix tightly packed into a int16x8_t[3] array. This array contains
+ // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(
+ tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ bit_depth_shift);
+}
+
+// Returns the truncated integer mean of a width x height block of 16-bit
+// samples starting at `src`; consecutive rows are `src_stride` samples apart.
+static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
+                                         int width, int height) {
+  assert(width > 0);
+  assert(height > 0);
+
+  // Vector partial sums are folded into `wide_total` once per row. The scalar
+  // tail of each row (fewer than 4 samples) is added to `tail_total` directly.
+  uint64x2_t wide_total = vdupq_n_u64(0);
+  uint64_t tail_total = 0;
+  int rows_left = height;
+
+  do {
+    // Two independent 32-bit accumulators for the current row.
+    uint32x4_t row_acc0 = vdupq_n_u32(0);
+    uint32x4_t row_acc1 = vdupq_n_u32(0);
+    const uint16_t *px = src;
+    int left = width;
+
+    // 32 samples per iteration: neighbouring vectors are first added in 16
+    // bits, then pairwise-widened into the 32-bit accumulators.
+    // NOTE(review): the 16-bit add assumes the sum of two samples fits in
+    // 16 bits (true for 8/10/12-bit content) - confirm for other inputs.
+    for (; left >= 32; left -= 32, px += 32) {
+      const uint16x8_t pair01 = vaddq_u16(vld1q_u16(px), vld1q_u16(px + 8));
+      const uint16x8_t pair23 =
+          vaddq_u16(vld1q_u16(px + 16), vld1q_u16(px + 24));
+
+      row_acc0 = vpadalq_u16(row_acc0, pair01);
+      row_acc1 = vpadalq_u16(row_acc1, pair23);
+    }
+
+    // At most one 16-, one 8- and one 4-sample step remain after that.
+    if (left >= 16) {
+      const uint16x8_t pair01 = vaddq_u16(vld1q_u16(px), vld1q_u16(px + 8));
+      row_acc0 = vpadalq_u16(row_acc0, pair01);
+      px += 16;
+      left -= 16;
+    }
+
+    if (left >= 8) {
+      row_acc1 = vpadalq_u16(row_acc1, vld1q_u16(px));
+      px += 8;
+      left -= 8;
+    }
+
+    if (left >= 4) {
+      // The upper half is zero-filled so only four real samples contribute.
+      const uint16x8_t quad = vcombine_u16(vld1_u16(px), vdup_n_u16(0));
+      row_acc0 = vpadalq_u16(row_acc0, quad);
+      px += 4;
+      left -= 4;
+    }
+
+    // Scalar tail.
+    for (int i = 0; i < left; ++i) {
+      tail_total += px[i];
+    }
+
+    wide_total = vpadalq_u32(wide_total, vaddq_u32(row_acc0, row_acc1));
+    src += src_stride;
+  } while (--rows_left != 0);
+
+  return (uint16_t)((horizontal_add_u64x2(wide_total) + tail_total) /
+                    (height * width));
+}
+
+// High bit-depth Wiener statistics entry point: works out the region geometry
+// and the mean of the dgd region, then hands off to the 7x7 or 5x5 kernel.
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+                                   const uint8_t *src8, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth) {
+  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+  const uint16_t *const dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+  const int region_w = h_end - h_start;
+  const int region_h = v_end - v_start;
+  const int half_win = wiener_win >> 1;
+
+  // First sample of the region in each buffer.
+  const uint16_t *const dgd_region = dgd16 + h_start + v_start * dgd_stride;
+  const uint16_t *const src_region = src16 + h_start + v_start * src_stride;
+  // The window is centred on every pixel of the region, so the kernels start
+  // reading dgd half a window above and to the left of the region origin.
+  const uint16_t *const dgd_window =
+      dgd16 + (h_start - half_win) + (v_start - half_win) * dgd_stride;
+  const uint16_t mean =
+      highbd_find_average_neon(dgd_region, dgd_stride, region_w, region_h);
+
+  if (wiener_win != WIENER_WIN) {
+    highbd_compute_stats_win5_neon(dgd_window, src_region, mean, region_w,
+                                   region_h, dgd_stride, src_stride, M, H,
+                                   bit_depth);
+    return;
+  }
+  highbd_compute_stats_win7_neon(dgd_window, src_region, mean, region_w,
+                                 region_h, dgd_stride, src_stride, M, H,
+                                 bit_depth);
+}
+
+// Returns the sum of squared errors between `src` and the self-guided
+// projection of `dat`. With shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS, each
+// pixel x contributes e * e where
+//   v = (1 << (shift - 1)) + sum_i xq[i] * (flt_i[x] - (dat[x] << 4))
+//   e = (v >> shift) + dat[x] - src[x]
+// and i runs over the filters with params->r[i] > 0. With no active filter
+// the result is the plain SSE of `dat` against `src`. `height` must be > 0.
+int64_t av1_highbd_pixel_proj_error_neon(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  int64_t sse = 0;                     // Scalar (row tail) contributions.
+  int64x2_t sse_s64 = vdupq_n_s64(0);  // Vector contributions.
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    // Both filters are active.
+    int32x2_t xq_v = vld1_s32(xq);
+    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      // Eight pixels per iteration. Rows narrower than 8 go straight to the
+      // scalar tail instead of loading past the end of the row.
+      for (; j <= width - 8; j += 8) {
+        const uint16x8_t d = vld1q_u16(&dat[j]);
+        const uint16x8_t s = vld1q_u16(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+        int32x4_t d_s32_lo = vreinterpretq_s32_u32(
+            vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+        int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
+            vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+        int32x4_t v0 = vsubq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+            d_s32_lo);
+        int32x4_t v1 = vsubq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+            d_s32_hi);
+        v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0);
+        v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0);
+        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+                                vreinterpretq_s16_u16(vsubq_u16(d, s)));
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+      }
+
+      // Scalar tail for the pixels not covered by the vector loop.
+      for (int k = j; k < width; ++k) {
+        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]);
+        v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4);
+        int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += ((int64_t)e * e);
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    } while (--height != 0);
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    // Exactly one filter is active; select its weight and buffer.
+    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    int32x4_t xq_v = vdupq_n_s32(xq_active);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      for (; j <= width - 8; j += 8) {
+        const uint16x8_t d0 = vld1q_u16(&dat[j]);
+        const uint16x8_t s0 = vld1q_u16(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);
+        uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
+        int32x4_t sub0 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
+        int32x4_t sub1 = vreinterpretq_s32_u32(
+            vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
+        int32x4_t v0 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
+            xq_v);
+        int32x4_t v1 = vmlaq_s32(
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
+            xq_v);
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+                                vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+      }
+
+      // Scalar tail: flt is indexed with k so each pixel uses its own sample.
+      for (int k = j; k < width; ++k) {
+        int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
+        const int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += ((int64_t)e * e);
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      flt += flt_stride;
+      src += src_stride;
+    } while (--height != 0);
+  } else {
+    // No filter is active: plain sum of squared differences.
+    do {
+      int j = 0;
+
+      for (; j <= width - 8; j += 8) {
+        const uint16x8_t d = vld1q_u16(&dat[j]);
+        const uint16x8_t s = vld1q_u16(&src[j]);
+        uint16x8_t diff = vabdq_u16(d, s);
+        uint16x4_t diff_lo = vget_low_u16(diff);
+        uint16x4_t diff_hi = vget_high_u16(diff);
+        uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
+        uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
+
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
+        sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
+      }
+
+      for (int k = j; k < width; ++k) {
+        int32_t e = dat[k] - src[k];
+        sse += (int64_t)e * e;
+      }
+
+      dat += dat_stride;
+      src += src_stride;
+    } while (--height != 0);
+  }
+
+  sse += horizontal_add_s64x2(sse_s64);
+  return sse;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 0000000000..4bf7ae6ce4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Returns the sum of squared (coeff - dqcoeff) over `block_size` coefficients
+// and stores the sum of squared coeff in `*ssz`. Both sums are rounded and
+// shifted right by 2 * (bd - 8) bits.
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  uint64x2_t sq_err = vdupq_n_u64(0);
+  int64x2_t sq_coeff = vdupq_n_s64(0);
+
+  do {
+    const int32x4_t c = vld1q_s32(coeff);
+    const uint32x4_t abs_diff =
+        vreinterpretq_u32_s32(vabdq_s32(c, vld1q_s32(dqcoeff)));
+    const uint32x2_t diff_lo = vget_low_u32(abs_diff);
+    const uint32x2_t diff_hi = vget_high_u32(abs_diff);
+    const int32x2_t c_lo = vget_low_s32(c);
+    const int32x2_t c_hi = vget_high_s32(c);
+    sq_err = vmlal_u32(vmlal_u32(sq_err, diff_lo, diff_lo), diff_hi, diff_hi);
+    sq_coeff = vmlal_s32(vmlal_s32(sq_coeff, c_lo, c_lo), c_hi, c_hi);
+    coeff += 4;
+    dqcoeff += 4;
+    block_size -= 4;
+  } while (block_size != 0);
+
+  *ssz = (horizontal_add_s64x2(sq_coeff) + rounding) >> shift;
+  return ((int64_t)horizontal_add_u64x2(sq_err) + rounding) >> shift;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..88e176f56c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// Writes the squared difference of every frame1 / frame2 sample pair of a
+// block into frame_sse. Each row is handled 8 samples at a time and at least
+// one group is always processed, so whole groups of 8 are read and written
+// even when block_width is not a multiple of 8.
+static INLINE void get_squared_error(
+    const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+    const uint32_t stride2, const uint32_t block_width,
+    const uint32_t block_height, uint32_t *frame_sse,
+    const unsigned int dst_stride) {
+  uint32_t *out_row = frame_sse;
+  uint32_t y = 0;
+  do {
+    const uint16_t *const row1 = frame1 + y * stride1;
+    const uint16_t *const row2 = frame2 + y * stride2;
+    uint32_t x = 0;
+
+    do {
+      const uint16x8_t abs_diff =
+          vabdq_u16(vld1q_u16(row1 + x), vld1q_u16(row2 + x));
+      const uint16x4_t lo = vget_low_u16(abs_diff);
+      const uint16x4_t hi = vget_high_u16(abs_diff);
+
+      vst1q_u32(out_row + x, vmull_u16(lo, lo));
+      vst1q_u32(out_row + x + 4, vmull_u16(hi, hi));
+      x += 8;
+    } while (x < block_width);
+
+    out_row += dst_stride;
+  } while (++y < block_height);
+}
+
+// Returns horizontal_add_u32x4() of sum_r vsrc[r][0] * mask_single, r = 0..4.
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+                                          const uint32x4_t mask_single) {
+  uint32x4_t acc = vmulq_u32(vsrc[0][0], mask_single);
+  for (int r = 1; r < 5; ++r) {
+    acc = vmlaq_u32(acc, vsrc[r][0], mask_single);
+  }
+  return horizontal_add_u32x4(acc);
+}
+
+// Returns, per lane, the sum over rows r = 0..4 of
+// vsrc[r][0] * mask1 + vsrc[r][1] * mask2.
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+                                            const uint32x4_t mask1,
+                                            const uint32x4_t mask2) {
+  // First-column vectors are accumulated before second-column ones.
+  uint32x4_t acc = vmulq_u32(vsrc[0][0], mask1);
+  for (int r = 1; r < 5; ++r) {
+    acc = vmlaq_u32(acc, vsrc[r][0], mask1);
+  }
+  for (int r = 0; r < 5; ++r) {
+    acc = vmlaq_u32(acc, vsrc[r][1], mask2);
+  }
+  return acc;
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame, const unsigned int stride,
+ const uint32_t block_width, const uint32_t block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+ int bd) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+ // Pass 1: acc_5x5_neon[r][c] = sum of frame_sse over the 5x5 window centred on (r, c), edges replicated.
+ uint32_t acc_5x5_neon[BH][BW] = { 0 };
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+ // vsrc[r][0] / vsrc[r][1]: window row r for the current / the next group of 4 columns.
+ uint32x4_t vsrc[5][2] = { 0 };
+ const uint32x4_t k0000 = vdupq_n_u32(0);
+ const uint32x4_t k1111 = vdupq_n_u32(1);
+ const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };  // Last column: last lane counted 3 times.
+ const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };  // Second-to-last column: last lane counted twice.
+ const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };  // Second column: first lane counted twice.
+ const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };  // First column: first lane counted 3 times.
+ const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+ const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+ const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+ const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+ // vmask1[n] / vmask2[n] keep the 5 consecutive lanes that start at lane n of vsrc[r][0].
+ uint32x4_t vmask1[4], vmask2[4];
+ vmask1[0] = k1111;
+ vmask2[0] = vextq_u32(k1111, k0000, 3);
+ vmask1[1] = vextq_u32(k0000, k1111, 3);
+ vmask2[1] = vextq_u32(k1111, k0000, 2);
+ vmask1[2] = vextq_u32(k0000, k1111, 2);
+ vmask2[2] = vextq_u32(k1111, k0000, 1);
+ vmask1[3] = vextq_u32(k0000, k1111, 1);
+ vmask2[3] = k1111;
+ // Each row is walked left to right in steps of 4 columns.
+ uint32_t row = 0;
+ do {
+ uint32_t col = 0;
+ const uint32_t *src = frame_sse + row * frame_sse_stride;
+ if (row == 0) {
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[1][0] = vsrc[2][0];
+ } else if (row == 1) {
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][0] = vsrc[1][0];
+ } else if (row == block_height - 2) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+ // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][0] = vsrc[3][0];
+ } else if (row == block_height - 1) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][0] = vsrc[2][0];
+ vsrc[4][0] = vsrc[2][0];
+ } else {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+ // Left edge: columns 0 and 1 only need the first vector; the masks stand in for the missing left neighbours.
+ acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+ acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+ // vsrc[r][0] now holds columns 0-3; move on to the next group of 4.
+ col += 4;
+ src += 4;
+ // Traverse 4 columns at a time.
+ do {
+ if (row == 0) {
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][1] = vsrc[2][1];
+ } else if (row == 1) {
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][1] = vsrc[1][1];
+ } else if (row == block_height - 2) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+ // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][1] = vsrc[3][1];
+ } else if (row == block_height - 1) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][1] = vsrc[2][1];
+ vsrc[4][1] = vsrc[2][1];
+ } else {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+ // Four 5-lane windows -> acc columns col - half_window .. + 3 (presumably horizontal_add_4d_u32x4 returns the four totals in order - confirm).
+ uint32x4_t sums[4];
+ sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]);
+ sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]);
+ sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]);
+ sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]);
+ vst1q_u32(&acc_5x5_neon[row][col - half_window],
+ horizontal_add_4d_u32x4(sums));
+ // Slide: the right-hand vectors become the left-hand ones for the next step.
+ vsrc[0][0] = vsrc[0][1];
+ vsrc[1][0] = vsrc[1][1];
+ vsrc[2][0] = vsrc[2][1];
+ vsrc[3][0] = vsrc[3][1];
+ vsrc[4][0] = vsrc[4][1];
+
+ src += 4;
+ col += 4;
+ } while (col <= block_width - 4);
+ // Right edge: the last two columns reuse the final vector; the masks stand in for the missing right neighbours.
+ acc_5x5_neon[row][col - half_window] =
+ sum_kernel5x5_mask_single(vsrc, k2111);
+ acc_5x5_neon[row][col - half_window + 1] =
+ sum_kernel5x5_mask_single(vsrc, k3110);
+
+ row++;
+ } while (row < block_height);
+
+ // Perform filtering: level 0 computes the weight with exp(), other levels with approx_exp() and iroundpf().
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+// Neon high bit-depth temporal filter for one block: per plane, computes the
+// squared frame/prediction error and accumulates into accum / count.
+void av1_highbd_apply_temporal_filter_neon(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+  assert(is_high_bitdepth);
+
+  // Luma block size and the shorter side of the cropped luma frame.
+  const int blk_h = block_size_high[block_size];
+  const int blk_w = block_size_wide[block_size];
+  const int min_side =
+      AOMMIN(frame_to_filter->y_crop_height, frame_to_filter->y_crop_width);
+
+  // Factors that combine the window error with the block error.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+
+  // Quantizer decay: below TF_QINDEX_CUTOFF the squared ratio is clipped to
+  // [1e-5, 1]; from the cutoff upwards the unclipped 0.5 * (q / 64)^2 is used.
+  double q_decay;
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  } else {
+    q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+    q_decay = CLIP(q_decay, 1e-5, 1);
+  }
+
+  // Strength decay: a smaller strength gives a smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[BW * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+  // Larger motion vector -> smaller weight; one threshold for all sub-blocks.
+  double distance_threshold = min_side * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int sb = 0; sb < 4; sb++) {
+    const MV mv = subblock_mvs[sb];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    const double ratio = distance / distance_threshold;
+    d_factor[sb] = AOMMAX(ratio, 1);
+  }
+
+  // Planes are handled in order and sit back to back in pred, accum and count.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = blk_h >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = blk_w >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride =
+        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const uint32_t frame_sse_stride = plane_w;
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    // Chroma windows additionally count the co-located luma samples.
+    int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
+    if (plane) {
+      num_ref_pixels += 1 << (ss_x_shift + ss_y_shift);
+    }
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger weight; n, q and s decays form the decay factor.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // At the U plane frame_sse still holds the luma squared errors. Sum the
+    // luma samples under each chroma pixel; the V plane reuses these sums.
+    if (plane == AOM_PLANE_U) {
+      const int luma_w = frame_sse_stride << ss_x_shift;  // Width of Y-plane.
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          uint32_t *const luma_sum = &luma_sse_sum[i * BW + j];
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              *luma_sum += frame_sse[yy * luma_w + xx];
+            }
+          }
+        }
+      }
+    }
+
+    get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+                      plane_h, frame_sse, frame_sse_stride);
+    highbd_apply_temporal_filter(
+        pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+        accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
+        luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+    plane_offset += plane_h * plane_w;
+  }
+}
+
+// Estimates the noise level of one high-bitdepth plane.
+//
+// For every interior pixel (rows 1..height-2, columns 1..width-2) the 3x3
+// Sobel gradient magnitude |gx| + |gy| is computed and shifted right, with
+// rounding, by (bitdepth - 8). Pixels whose gradient is below `edge_thresh`
+// are counted and contribute the absolute value of a 3x3 Laplacian (shifted
+// the same way) to an accumulator.
+//
+// Each row is handled 8 pixels at a time, then at most one group of 4, then
+// one pixel at a time for whatever is left.
+//
+// Returns -1.0 when fewer than 16 pixels were under the threshold, otherwise
+// accumulator / (6 * count) * SQRT_PI_BY_2.
+//
+// NOTE(review): the row loop is a do/while, so its body runs once even when
+// height < 3 - presumably callers guarantee a large enough plane; confirm.
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+ int height, int width,
+ int stride,
+ int bitdepth,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint64x2_t acc = vdupq_n_u64(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ // Scalar-tail totals; the vector totals are folded in after the row loop.
+ int final_count = 0;
+ uint64_t final_acc = 0;
+ // First interior pixel: row 1, column 1.
+ const uint16_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint16_t *src_ptr = src_start;
+
+ // 8 pixels per iteration. The right-hand neighbour loads read columns
+ // w + 1 .. w + 8, so the bound keeps w + 8 <= width - 1.
+ while (w <= (width - 1) - 8) {
+ uint16x8_t mat[3][3];
+ mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u16(src_ptr - stride);
+ mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u16(src_ptr - 1);
+ mat[1][1] = vld1q_u16(src_ptr);
+ mat[1][2] = vld1q_u16(src_ptr + 1);
+ mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u16(src_ptr + stride);
+ mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ // gxa / gxb are the weighted left / right column sums and gya / gyb the
+ // weighted top / bottom row sums; |gx| and |gy| are taken as absolute
+ // differences so everything stays unsigned.
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+ // ga = |gxa - gxb| + |gya - gyb|.
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+ // The shift count 8 - bitdepth is passed as-is; the scalar tail below
+ // uses ROUND_POWER_OF_TWO(..., bitdepth - 8) for the same step.
+ ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ // Laplacian: |4 * center + diagonals - 2 * adjacents|.
+ uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+ v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+ uint32x4_t v_u32 = vpaddlq_u16(v);
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ // At most one 4-pixel group: same computation on 64-bit vectors.
+ if (w <= (width - 1) - 4) {
+ uint16x4_t mat[3][3];
+ mat[0][0] = vld1_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1_u16(src_ptr - stride);
+ mat[0][2] = vld1_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1_u16(src_ptr - 1);
+ mat[1][1] = vld1_u16(src_ptr);
+ mat[1][2] = vld1_u16(src_ptr + 1);
+ mat[2][0] = vld1_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1_u16(src_ptr + stride);
+ mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+ uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+ gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+ gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+ uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+ gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+ gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+ uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+ ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+ uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+ uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+ uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+ uint16x4_t adj = vadd_u16(adj0, adj1);
+ adj = vadd_u16(adj, adj);
+
+ uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+ uint16x4_t diag = vadd_u16(diag0, diag1);
+
+ uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+ // Here the mask is applied before the shift (the 8-wide path masks
+ // after); masked-out lanes are zero either way.
+ v = vand_u16(v, thresh_u16);
+ uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ // Scalar tail for the remaining interior pixels of this row.
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_add_u64x2(acc);
+ // Fewer than 16 qualifying pixels: report -1.0 instead of an estimate.
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..6cf835a243
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+// Transposes a 4x4 block of 16-bit values.
+//
+// The four input rows are packed two per vector: in[0] = { row0 | row1 } and
+// in[1] = { row2 | row3 } (low half | high half). On return out[k] holds lane
+// k of each of those four rows, in the same row order.
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  // Step 1: interleave the two vectors at 32-bit (lane pair) granularity.
+  const int32x4_t packed0 = vreinterpretq_s32_s16(in[0]);
+  const int32x4_t packed1 = vreinterpretq_s32_s16(in[1]);
+  const int32x4x2_t pairs = vtrnq_s32(packed0, packed1);
+
+  // Step 2: split each result into halves and interleave at 16-bit
+  // granularity to finish the transpose.
+  const int16x4_t even_lo = vreinterpret_s16_s32(vget_low_s32(pairs.val[0]));
+  const int16x4_t even_hi = vreinterpret_s16_s32(vget_high_s32(pairs.val[0]));
+  const int16x4_t odd_lo = vreinterpret_s16_s32(vget_low_s32(pairs.val[1]));
+  const int16x4_t odd_hi = vreinterpret_s16_s32(vget_high_s32(pairs.val[1]));
+
+  const int16x4x2_t cols01 = vtrn_s16(even_lo, even_hi);
+  const int16x4x2_t cols23 = vtrn_s16(odd_lo, odd_hi);
+
+  out[0] = cols01.val[0];
+  out[1] = cols01.val[1];
+  out[2] = cols23.val[0];
+  out[3] = cols23.val[1];
+}
+
+// One 4-point Walsh-Hadamard lifting pass, applied independently to each of
+// the four lanes and updating a, b, c and d in place.
+static void fwht4_lifting_pass(int16x4_t *a, int16x4_t *b, int16x4_t *c,
+                               int16x4_t *d) {
+  *a = vadd_s16(*a, *b);
+  *d = vsub_s16(*d, *c);
+  // Halving subtract: (a - d) >> 1 without intermediate overflow.
+  const int16x4_t half_diff = vhsub_s16(*a, *d);
+  *b = vsub_s16(half_diff, *b);
+  *c = vsub_s16(half_diff, *c);
+  *a = vsub_s16(*a, *c);
+  *d = vadd_s16(*d, *b);
+}
+
+// Forward 4x4 Walsh-Hadamard transform.
+//
+// Reads four rows of four int16 values from `input` (rows are `stride`
+// elements apart) and writes 16 coefficients to `output`, each shifted left by
+// UNIT_QUANT_SHIFT.
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x4_t row0 = vld1_s16(input);
+  int16x4_t row1 = vld1_s16(input + stride);
+  int16x4_t row2 = vld1_s16(input + 2 * stride);
+  int16x4_t row3 = vld1_s16(input + 3 * stride);
+
+  // First pass: every lane is one column of the block, so this mixes the four
+  // rows of each column.
+  fwht4_lifting_pass(&row0, &row1, &row2, &row3);
+
+  // The pass leaves its results in a, c, d, b order; pack them that way so the
+  // transpose hands the second pass its inputs already ordered.
+  int16x8_t packed[2];
+  packed[0] = vcombine_s16(row0, row2);
+  packed[1] = vcombine_s16(row3, row1);
+
+  int16x4_t t[4];
+  transpose4x4(packed, t);
+
+  // Second pass on the transposed data.
+  fwht4_lifting_pass(&t[0], &t[1], &t[2], &t[3]);
+
+  // Store in a, c, d, b order, widening to 32 bits with the final scaling.
+  vst1q_s32(output, vshll_n_s16(t[0], UNIT_QUANT_SHIFT));
+  vst1q_s32(output + 4, vshll_n_s16(t[2], UNIT_QUANT_SHIFT));
+  vst1q_s32(output + 8, vshll_n_s16(t[3], UNIT_QUANT_SHIFT));
+  vst1q_s32(output + 12, vshll_n_s16(t[1], UNIT_QUANT_SHIFT));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/ml_neon.c b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000000..be6ddfd763
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+// ReLU on eight floats held in two vectors: each lane of *out_h and *out_l is
+// replaced by max(lane, *zero).
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+                         const float32x4_t *zero) {
+  const float32x4_t relu_h = vmaxq_f32(*out_h, *zero);
+  *out_h = relu_h;
+  const float32x4_t relu_l = vmaxq_f32(*out_l, *zero);
+  *out_l = relu_l;
+}
+
+// ReLU on four floats: each lane of *x is replaced by max(lane, *zero).
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+  const float32x4_t relu = vmaxq_f32(*x, *zero);
+  *x = relu;
+}
+
+// Clamps the lvalue `x` to be non-negative in place (scalar ReLU) and yields
+// the clamped value. `x` is expanded several times, so it must be a
+// side-effect-free lvalue; the argument is parenthesised so that it expands
+// safely whatever expression is passed.
+#define CLAMP_0(x) ((x) = ((x) > 0 ? (x) : 0))
+
+// Computes one output node as bias + dot(inputs, weights), consuming the
+// inputs eight at a time (the loop always reads whole groups of eight floats).
+// ReLU is applied unless `output_layer` is set.
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t acc = vdupq_n_f32(0);
+  const float *in_ptr = inputs;
+  const float *w_ptr = weights;
+  int remaining = num_inputs;
+
+  while (remaining > 0) {
+    const float32x4_t in_hi = vld1q_f32(in_ptr + 4);
+    const float32x4_t in_lo = vld1q_f32(in_ptr);
+    const float32x4_t w_hi = vld1q_f32(w_ptr + 4);
+    const float32x4_t w_lo = vld1q_f32(w_ptr);
+
+    // High half first, then low half: keep this order so the floating-point
+    // rounding sequence is unchanged.
+    acc = vmlaq_f32(acc, in_hi, w_hi);
+    acc = vmlaq_f32(acc, in_lo, w_lo);
+
+    in_ptr += 8;
+    w_ptr += 8;
+    remaining -= 8;
+  }
+
+  // Horizontal sum of the four partial sums, added on top of the bias.
+  float total = *layer_bias;
+#if AOM_ARCH_AARCH64
+  total += vaddvq_f32(acc);
+#else
+  float32x2_t half = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
+  half = vpadd_f32(half, half);
+  total += vget_lane_f32(half, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+// Computes one output node as bias + dot(inputs, weights) for an arbitrary
+// input count: whole groups of eight go through NEON, the remainder through a
+// scalar loop. This kernel has no `output_layer` flag; ReLU is always applied
+// to the result.
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes) {
+  float32x4_t acc = vdupq_n_f32(0);
+  float total = *layer_bias;
+  int in = 0;
+
+  // Vector part: as long as at least eight inputs remain.
+  for (; num_inputs - in > 7; in += 8) {
+    const float32x4_t in_hi = vld1q_f32(inputs + in + 4);
+    const float32x4_t in_lo = vld1q_f32(inputs + in);
+    const float32x4_t w_hi = vld1q_f32(weights + in + 4);
+    const float32x4_t w_lo = vld1q_f32(weights + in);
+
+    acc = vmlaq_f32(acc, in_hi, w_hi);
+    acc = vmlaq_f32(acc, in_lo, w_lo);
+  }
+
+  // Reduce the four partial sums before the scalar tail is added.
+#if AOM_ARCH_AARCH64
+  total += vaddvq_f32(acc);
+#else
+  float32x2_t half = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
+  half = vpadd_f32(half, half);
+  total += vget_lane_f32(half, 0);
+#endif
+
+  // Scalar tail for the last (num_inputs % 8) inputs.
+  while (in < num_inputs) {
+    total += weights[in] * inputs[in];
+    in++;
+  }
+
+  *output_nodes = CLAMP_0(total);
+}
+
+// Computes one output node as bias + dot(inputs, weights) for a small input
+// count. On AArch64 the first four products are formed with a single vector
+// multiply (so at least four inputs are read); elsewhere everything is scalar.
+// This kernel has no `output_layer` flag; ReLU is always applied.
+static void nn_propagate_xsto1(int num_inputs, const float *const inputs,
+                               const float *const weights,
+                               const float *layer_bias,
+                               float *const output_nodes) {
+  float total = *layer_bias;
+  int in = 0;
+#if AOM_ARCH_AARCH64
+  const float32x4_t products = vmulq_f32(vld1q_f32(inputs), vld1q_f32(weights));
+  total += vaddvq_f32(products);
+  in = 4;
+#endif
+  while (in < num_inputs) {
+    total += weights[in] * inputs[in];
+    in++;
+  }
+
+  *output_nodes = CLAMP_0(total);
+}
+
+// Computes one output node as bias + dot(inputs, weights), consuming the
+// inputs four at a time (the loop always reads whole groups of four floats).
+// ReLU is applied unless `output_layer` is set.
+static void nn_propagate_4to1(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  float32x4_t acc = vdupq_n_f32(0);
+  const float *in_ptr = inputs;
+  const float *w_ptr = weights;
+
+  for (int remaining = num_inputs; remaining > 0; remaining -= 4) {
+    const float32x4_t v_in = vld1q_f32(in_ptr);
+    const float32x4_t v_w = vld1q_f32(w_ptr);
+    acc = vmlaq_f32(acc, v_in, v_w);
+    in_ptr += 4;
+    w_ptr += 4;
+  }
+
+  // Horizontal sum of the four partial sums, added on top of the bias.
+  float total = *layer_bias;
+#if AOM_ARCH_AARCH64
+  total += vaddvq_f32(acc);
+#else
+  float32x2_t half = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
+  half = vpadd_f32(half, half);
+  total += vget_lane_f32(half, 0);
+#endif
+
+  if (!output_layer) CLAMP_0(total);
+  *output_nodes = total;
+}
+
+// Computes four output nodes at once. `weights` is indexed as
+// weights[node * num_inputs + in], and the inputs are consumed four at a time.
+// Each node gets its own vector accumulator; the four accumulators are then
+// reduced with pairwise adds so that lane n of the result is the dot product
+// for node n. ReLU is applied unless `output_layer` is set.
+static void nn_propagate_4to4(int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t node_acc[4] = { zero, zero, zero, zero };
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(inputs + in);
+    for (int node = 0; node < 4; node++) {
+      const float32x4_t v_weight = vld1q_f32(&weights[in + node * num_inputs]);
+      node_acc[node] = vmlaq_f32(node_acc[node], v_weight, v_input);
+    }
+  }
+
+  // Two rounds of pairwise addition: 4 x 4 partial sums -> 4 dot products.
+#if AOM_ARCH_AARCH64
+  const float32x4_t sum01 = vpaddq_f32(node_acc[0], node_acc[1]);
+  const float32x4_t sum23 = vpaddq_f32(node_acc[2], node_acc[3]);
+  const float32x4_t hh = vpaddq_f32(sum01, sum23);
+#else
+  const float32x4_t sum01 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[0]), vget_high_f32(node_acc[0])),
+      vpadd_f32(vget_low_f32(node_acc[1]), vget_high_f32(node_acc[1])));
+  const float32x4_t sum23 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[2]), vget_high_f32(node_acc[2])),
+      vpadd_f32(vget_low_f32(node_acc[3]), vget_high_f32(node_acc[3])));
+  const float32x4_t hh =
+      vcombine_f32(vpadd_f32(vget_low_f32(sum01), vget_high_f32(sum01)),
+                   vpadd_f32(vget_low_f32(sum23), vget_high_f32(sum23)));
+#endif
+
+  float32x4_t outputs = vaddq_f32(vld1q_f32(layer_bias), hh);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+// Computes eight output nodes at once. `weights` is indexed as
+// weights[node * num_inputs + in], and the inputs are consumed four at a time.
+// Each node gets its own vector accumulator; pairwise adds then reduce nodes
+// 0-3 into one vector and nodes 4-7 into another. ReLU is applied unless
+// `output_layer` is set.
+static void nn_propagate_4to8(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t node_acc[8] = { zero, zero, zero, zero, zero, zero, zero, zero };
+
+  for (int in = 0; in < num_inputs; in += 4) {
+    const float32x4_t v_input = vld1q_f32(inputs + in);
+    for (int node = 0; node < 8; node++) {
+      const float32x4_t v_weight = vld1q_f32(&weights[in + node * num_inputs]);
+      node_acc[node] = vmlaq_f32(node_acc[node], v_input, v_weight);
+    }
+  }
+
+  // Two rounds of pairwise addition: 8 x 4 partial sums -> 8 dot products.
+#if AOM_ARCH_AARCH64
+  const float32x4_t sum01 = vpaddq_f32(node_acc[0], node_acc[1]);
+  const float32x4_t sum23 = vpaddq_f32(node_acc[2], node_acc[3]);
+  const float32x4_t sum45 = vpaddq_f32(node_acc[4], node_acc[5]);
+  const float32x4_t sum67 = vpaddq_f32(node_acc[6], node_acc[7]);
+  const float32x4_t hh0 = vpaddq_f32(sum01, sum23);
+  const float32x4_t hh1 = vpaddq_f32(sum45, sum67);
+#else
+  const float32x4_t sum01 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[0]), vget_high_f32(node_acc[0])),
+      vpadd_f32(vget_low_f32(node_acc[1]), vget_high_f32(node_acc[1])));
+  const float32x4_t sum23 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[2]), vget_high_f32(node_acc[2])),
+      vpadd_f32(vget_low_f32(node_acc[3]), vget_high_f32(node_acc[3])));
+  const float32x4_t sum45 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[4]), vget_high_f32(node_acc[4])),
+      vpadd_f32(vget_low_f32(node_acc[5]), vget_high_f32(node_acc[5])));
+  const float32x4_t sum67 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[6]), vget_high_f32(node_acc[6])),
+      vpadd_f32(vget_low_f32(node_acc[7]), vget_high_f32(node_acc[7])));
+  const float32x4_t hh0 =
+      vcombine_f32(vpadd_f32(vget_low_f32(sum01), vget_high_f32(sum01)),
+                   vpadd_f32(vget_low_f32(sum23), vget_high_f32(sum23)));
+  const float32x4_t hh1 =
+      vcombine_f32(vpadd_f32(vget_low_f32(sum45), vget_high_f32(sum45)),
+                   vpadd_f32(vget_low_f32(sum67), vget_high_f32(sum67)));
+#endif
+
+  float32x4_t out_h = vaddq_f32(vld1q_f32(&layer_bias[4]), hh1);
+  float32x4_t out_l = vaddq_f32(vld1q_f32(layer_bias), hh0);
+
+  if (!output_layer) nn_activate8(&out_h, &out_l, &zero);
+  vst1q_f32(&output_nodes[4], out_h);
+  vst1q_f32(output_nodes, out_l);
+}
+
+// Computes four output nodes at once, consuming the inputs eight at a time.
+// `weights` is indexed as weights[node * num_inputs + in]. Each node gets its
+// own vector accumulator; pairwise adds then reduce them so that lane n of the
+// result is the dot product for node n. ReLU is applied unless `output_layer`
+// is set.
+static void nn_propagate_8to4(const int num_inputs, const float *const inputs,
+                              const float *const weights,
+                              const float *layer_bias,
+                              float *const output_nodes, bool output_layer) {
+  const float32x4_t zero = vdupq_n_f32(0);
+  float32x4_t node_acc[4] = { zero, zero, zero, zero };
+
+  for (int in = 0; in < num_inputs; in += 8) {
+    const float32x4_t in_lo = vld1q_f32(inputs + in);
+    const float32x4_t in_hi = vld1q_f32(inputs + in + 4);
+
+    for (int node = 0; node < 4; node++) {
+      const float *const w_row = &weights[in + node * num_inputs];
+      const float32x4_t w_lo = vld1q_f32(w_row);
+      const float32x4_t w_hi = vld1q_f32(w_row + 4);
+      // Low half first, then high half: keep this order so the floating-point
+      // rounding sequence is unchanged.
+      node_acc[node] = vmlaq_f32(node_acc[node], in_lo, w_lo);
+      node_acc[node] = vmlaq_f32(node_acc[node], in_hi, w_hi);
+    }
+  }
+
+  // Two rounds of pairwise addition: 4 x 4 partial sums -> 4 dot products.
+#if AOM_ARCH_AARCH64
+  const float32x4_t sum23 = vpaddq_f32(node_acc[2], node_acc[3]);
+  const float32x4_t sum01 = vpaddq_f32(node_acc[0], node_acc[1]);
+  const float32x4_t dot = vpaddq_f32(sum01, sum23);
+#else
+  const float32x4_t sum23 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[2]), vget_high_f32(node_acc[2])),
+      vpadd_f32(vget_low_f32(node_acc[3]), vget_high_f32(node_acc[3])));
+  const float32x4_t sum01 = vcombine_f32(
+      vpadd_f32(vget_low_f32(node_acc[0]), vget_high_f32(node_acc[0])),
+      vpadd_f32(vget_low_f32(node_acc[1]), vget_high_f32(node_acc[1])));
+  const float32x4_t dot =
+      vcombine_f32(vpadd_f32(vget_low_f32(sum01), vget_high_f32(sum01)),
+                   vpadd_f32(vget_low_f32(sum23), vget_high_f32(sum23)));
+#endif
+
+  float32x4_t outputs = vaddq_f32(vld1q_f32(layer_bias), dot);
+  if (!output_layer) nn_activate4(&outputs, &zero);
+  vst1q_f32(output_nodes, outputs);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+//
+// input_nodes: nn_config->num_inputs input features.
+// nn_config:   layer sizes, weights and biases. Weights of a layer are indexed
+//              as weights[out * num_inputs + in].
+// reduce_prec: when non-zero, av1_nn_output_prec_reduce() is applied to the
+//              final outputs.
+// output:      receives nn_config->num_outputs values.
+//
+// Every hidden layer is followed by ReLU; the output layer is not. A kernel is
+// picked per layer from the divisibility of its input and output counts.
+void av1_nn_predict_neon(const float *input_nodes,
+                         const NN_CONFIG *const nn_config, int reduce_prec,
+                         float *const output) {
+  // Ping-pong buffers holding the activations of consecutive hidden layers.
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int buf_index = 0;
+  int num_inputs = nn_config->num_inputs;
+  // Hidden layers, except the final iteration is the output layer.
+  for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    bool output_layer = (layer == nn_config->num_hidden_layers);
+    float *const output_nodes = output_layer ? output : buf[buf_index];
+    const int num_outputs = output_layer ? nn_config->num_outputs
+                                         : nn_config->num_hidden_nodes[layer];
+
+    if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out += 8) {
+        nn_propagate_4to8(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_8to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out += 4) {
+        nn_propagate_4to4(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 8 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_8to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (num_inputs % 4 == 0) {
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_4to1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out], output_layer);
+      }
+    } else if (!output_layer && num_inputs > 8) {
+      // nn_propagate_xto1() always applies ReLU, so it is only used for
+      // hidden layers.
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xto1(num_inputs, input_nodes,
+                          &layer_weights[out * num_inputs], &layer_bias[out],
+                          &output_nodes[out]);
+      }
+    } else if (!output_layer && num_inputs >= 4) {
+      // nn_propagate_xsto1() always applies ReLU, so it is only used for
+      // hidden layers.
+      for (int out = 0; out < num_outputs; out++) {
+        nn_propagate_xsto1(num_inputs, input_nodes,
+                           &layer_weights[out * num_inputs], &layer_bias[out],
+                           &output_nodes[out]);
+      }
+    } else {
+      // Scalar fallback: tiny layers, and output layers whose input count is
+      // not a multiple of four. Like the vector kernels above, it leaves the
+      // output layer unclamped so the result does not depend on which kernel
+      // happened to be selected.
+      for (int node = 0; node < num_outputs; ++node) {
+        float val = layer_bias[node];
+        for (int i = 0; i < num_inputs; ++i)
+          val += layer_weights[node * num_inputs + i] * input_nodes[i];
+        // ReLU as activation function (hidden layers only).
+        if (!output_layer) val = val > 0.0f ? val : 0.0f;
+        output_nodes[node] = val;
+      }
+    }
+    // This layer's outputs feed the next layer.
+    input_nodes = output_nodes;
+    num_inputs = num_outputs;
+    buf_index = 1 - buf_index;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
new file mode 100644
index 0000000000..2e4761f9a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+// Returns the sum of squared errors between `src` and the self-guided
+// projection of `dat` over a width x height block of 8-bit pixels.
+//
+// Depending on which radii in `params->r` are positive, the per-pixel error is
+//   both:    round((xq[0] * flt0 + xq[1] * flt1 - u * (xq[0] + xq[1])) >> S)
+//            + dat - src
+//   one:     round((xq_active * (flt - u)) >> S) + dat - src
+//   neither: dat - src
+// with u = dat << SGRPROJ_RST_BITS and S = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS.
+//
+// Rows are processed in full vectors (8 pixels, or 16 in the unfiltered case)
+// with a scalar loop for the remainder. Rows narrower than one vector are
+// handled entirely by the scalar loop, and a non-positive `height` yields 0.
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src, int width, int height, int src_stride,
+    const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int64_t sse = 0;
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    int32x2_t xq_v = vld1_s32(xq);
+    // (xq[0] + xq[1]) << SGRPROJ_RST_BITS, so that multiplying by dat gives
+    // u * (xq[0] + xq[1]).
+    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);
+
+    for (int i = 0; i < height; ++i) {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      // Only enter the vector loop when a full group of 8 pixels is available.
+      while (j <= width - 8) {
+        const uint8x8_t d = vld1_u8(&dat[j]);
+        const uint8x8_t s = vld1_u8(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+        // Rounding offset for the final narrowing shift.
+        int32x4_t offset =
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+        int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
+        int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);
+
+        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+        int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
+        v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+        v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      }
+
+      // Scalar tail for the remaining (< 8) pixels of the row.
+      for (int k = j; k < width; ++k) {
+        int32_t u = (dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
+                    xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
+        int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += e * e;
+      }
+
+      // Widen the per-row 32-bit sums into the 64-bit running total.
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    }
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    // Exactly one filter is active: pick its coefficient, buffer and stride.
+    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    int32x2_t xq_v = vdup_n_s32(xq_active);
+
+    for (int i = 0; i < height; ++i) {
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+      int j = 0;
+
+      // Only enter the vector loop when a full group of 8 pixels is available.
+      while (j <= width - 8) {
+        const uint8x8_t d = vld1_u8(&dat[j]);
+        const uint8x8_t s = vld1_u8(&src[j]);
+        int32x4_t flt_0 = vld1q_s32(&flt[j]);
+        int32x4_t flt_1 = vld1q_s32(&flt[j + 4]);
+        int16x8_t d_s16 =
+            vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+
+        int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16));
+        int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16));
+
+        // Rounding offset for the final narrowing shift.
+        int32x4_t offset =
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+        int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0);
+        int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      }
+
+      // Scalar tail for the remaining (< 8) pixels of the row.
+      for (int k = j; k < width; ++k) {
+        int32_t u = dat[k] << SGRPROJ_RST_BITS;
+        int32_t v = xq_active * (flt[k] - u);
+        int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) +
+                    dat[k] - src[k];
+        sse += e * e;
+      }
+
+      // Widen the per-row 32-bit sums into the 64-bit running total.
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+    }
+  } else {
+    // No filter active: plain sum of squared differences. The 32-bit lanes
+    // accumulate over the whole block and are widened once at the end.
+    uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+    for (int i = 0; i < height; ++i) {
+      int j = 0;
+
+      // Only enter the vector loop when a full group of 16 pixels is
+      // available.
+      while (j <= width - 16) {
+        const uint8x16_t d = vld1q_u8(&dat[j]);
+        const uint8x16_t s = vld1q_u8(&src[j]);
+
+        uint8x16_t diff = vabdq_u8(d, s);
+        uint8x8_t diff_lo = vget_low_u8(diff);
+        uint8x8_t diff_hi = vget_high_u8(diff);
+
+        sse_u32 = vpadalq_u16(sse_u32, vmull_u8(diff_lo, diff_lo));
+        sse_u32 = vpadalq_u16(sse_u32, vmull_u8(diff_hi, diff_hi));
+
+        j += 16;
+      }
+
+      // Scalar tail for the remaining (< 16) pixels of the row.
+      for (int k = j; k < width; ++k) {
+        int32_t e = dat[k] - src[k];
+        sse += e * e;
+      }
+
+      dat += dat_stride;
+      src += src_stride;
+    }
+
+    sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_u32));
+  }
+
+  // Combine the vector total with the scalar-tail total.
+  sse += horizontal_add_s64x2(sse_s64);
+  return sse;
+}
+
+// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are
+// processing 2 pixels at a time, so the accumulator max can be as high as 32768
+// for the compute stats.
+// compute_stats_win7_neon() uses this value to initialise its `acc_cnt`
+// counter.
+#define STAT_ACCUMULATOR_MAX 32768
+
+// Byte table look-up into the 32-byte table formed by `a` followed by `b`:
+// each byte of `idx` selects one table byte. AArch64 has a direct two-register
+// look-up; elsewhere the table is passed as four 8-byte pieces.
+static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
+#if AOM_ARCH_AARCH64
+  uint8x16x2_t lut;
+  lut.val[0] = a;
+  lut.val[1] = b;
+  return vqtbl2_u8(lut, idx);
+#else
+  uint8x8x4_t lut;
+  lut.val[0] = vget_low_u8(a);
+  lut.val[1] = vget_high_u8(a);
+  lut.val[2] = vget_low_u8(b);
+  lut.val[3] = vget_high_u8(b);
+  return vtbl4_u8(lut, idx);
+#endif
+}
+
+// 16-index variant of tbl2(): byte table look-up into the 32-byte table formed
+// by `a` followed by `b`. Without AArch64 the look-up is done separately for
+// the low and high halves of `idx` and the results are recombined.
+static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+  uint8x16x2_t lut;
+  lut.val[0] = a;
+  lut.val[1] = b;
+  return vqtbl2q_u8(lut, idx);
+#else
+  uint8x8x4_t lut;
+  lut.val[0] = vget_low_u8(a);
+  lut.val[1] = vget_high_u8(a);
+  lut.val[2] = vget_low_u8(b);
+  lut.val[3] = vget_high_u8(b);
+  const uint8x8_t res_lo = vtbl4_u8(lut, vget_low_u8(idx));
+  const uint8x8_t res_hi = vtbl4_u8(lut, vget_high_u8(idx));
+  return vcombine_u8(res_lo, res_hi);
+#endif
+}
+
+// Folds the two partial M accumulators into the final M. M is accumulated in
+// STAT_ACCUMULATOR_MAX steps to speed up the computation, which leaves a
+// 64-bit part (src_s64) and a 32-bit residual part (src_s32). Their sum is
+// multiplied by `scale` and added to `dst`, transposing on the way because the
+// output has to be column-major: dst[i][j] += (s64[j][i] + s32[j][i]) * scale.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+                                   const int32_t *src_s32, const int wiener_win,
+                                   int scale) {
+  for (int row = 0; row < wiener_win; ++row) {
+    int64_t *const dst_row = dst + row * wiener_win;
+    for (int col = 0; col < wiener_win; ++col) {
+      // Element (row, col) of the output comes from (col, row) of the sources.
+      const int src_idx = col * wiener_win + row;
+      dst_row[col] += (int64_t)(src_s64[src_idx] + src_s32[src_idx]) * scale;
+    }
+  }
+}
+
+// Folds the two partial H accumulators into the final H.
+//
+// H is accumulated as a column-major matrix from the transposed (column-major)
+// samples of the filter kernel (5x5 or 7x7) viewed as a single vector; for the
+// 7x7 case H(49x49) = [49 x 1] x [1 x 49]. It is accumulated in
+// STAT_ACCUMULATOR_MAX steps, which leaves a 64-bit part (src_s64) and a
+// 32-bit residual part (src_s32), and only its upper triangle is computed.
+// This function
+//   - sums both parts and multiplies by `scale`,
+//   - undoes the transposition of rows and of columns (double transpose),
+//   - mirrors the upper triangle into the lower one,
+// adding the result to `dst`, which is written sequentially. `stride` is the
+// row pitch of the source matrices.
+static void update_H(int64_t *dst, const int64_t *src_s64,
+                     const int32_t *src_s32, const int wiener_win, int stride,
+                     int scale) {
+  // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+  // `wiener_win2` is 9, the M matrix is 3x3:
+  // 0, 3, 6
+  // 1, 4, 7
+  // 2, 5, 8
+  //
+  // This is viewed as a vector to compute H (9x9) by vector outer product:
+  // 0, 3, 6, 1, 4, 7, 2, 5, 8
+  //
+  // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+  // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+  // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+  // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+  // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+  // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+  // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+  // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+  // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+  // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+
+  // Source row index, visited in the remapped order
+  // (0, 3, 6, 1, 4, 7, 2, 5, 8 in the 3x3 example).
+  for (int row_lo = 0; row_lo < wiener_win; ++row_lo) {
+    for (int row_hi = 0; row_hi < wiener_win; ++row_hi) {
+      const int src_row = row_lo + row_hi * wiener_win;
+
+      // Source column index, visited in the same remapped order.
+      for (int col_lo = 0; col_lo < wiener_win; ++col_lo) {
+        for (int col_hi = 0; col_hi < wiener_win; ++col_hi) {
+          const int src_col = col_lo + col_hi * wiener_win;
+
+          // Only the upper triangle of the source is valid, so always read
+          // the element whose row index is the smaller of the two.
+          int tr_idx;
+          if (src_row <= src_col) {
+            tr_idx = stride * src_row + src_col;
+          } else {
+            tr_idx = stride * src_col + src_row;
+          }
+
+          // Combine the 64-bit and residual 32-bit parts, then scale.
+          *dst += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
+          ++dst;
+        }
+      }
+    }
+  }
+}
+
+// Loads seven consecutive rows of eight bytes and packs them two rows per
+// 128-bit vector: dst[0] = rows 0-1, dst[1] = rows 2-3, dst[2] = rows 4-5.
+// The last row goes into the low half of dst[3] (high half zeroed) and is
+// loaded from one byte earlier, i.e. it holds bytes -1..6 of row 6, to prevent
+// an out-of-bounds access.
+static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
+                                        ptrdiff_t stride) {
+  const uint8_t *const row0 = src;
+  const uint8_t *const row2 = src + 2 * stride;
+  const uint8_t *const row4 = row2 + 2 * stride;
+  const uint8_t *const row6 = row4 + 2 * stride;
+
+  dst[0] = vcombine_u8(vld1_u8(row0), vld1_u8(row0 + stride));
+  dst[1] = vcombine_u8(vld1_u8(row2), vld1_u8(row2 + stride));
+  dst[2] = vcombine_u8(vld1_u8(row4), vld1_u8(row4 + stride));
+  dst[3] = vcombine_u8(vld1_u8(row6 - 1), vdup_n_u8(0));
+}
+
+ static INLINE void compute_stats_win7_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Accumulates the 7x7-window statistics M and H over `height` rows of `width` pixels, stepping `downsample_factor` strides between rows. Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);  // `dgd - avg` window of the first pixel
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);  // `dgd - avg` window of the second pixel
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);  // 32-bit partial sums of M
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);  // 64-bit running sums of M
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);  // 32-bit partial sums of H
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);  // 64-bit running sums of H
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
+ // matrices. Each index addresses a byte of a pair of packed row vectors.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17,  // lut0..lut2: first (left) 7x7 matrix
+ 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
+ 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22,
+ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18,  // lut3..lut5: second (right) matrix, same indices + 1
+ 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
+ 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;  // two-pixel iterations left before the 32-bit sums are widened
+ const int src_next = downsample_factor * src_stride - width;  // from the end of one processed row to the start of the next
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;  // seventh (last) row of the window
+ dgd += 2;  // advance to the next pair of pixels
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49
+ // consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8));
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4);
+
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8));
+ dgd_avg1[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5);
+
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8));
+ dgd_avg1[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;  // k rounded down to a multiple of 4
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;  // next row of H_s64
+ lh32 += WIENER_WIN2_ALIGN2;  // next row of H_s32
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {  // note: this path does not decrement acc_cnt
+ // Load the same 8x7 block as in the two-pixel loop; only the first (left)
+ // 7x7 matrix, selected by lut0..lut2, is used for the odd pixel.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;  // seventh (last) row of the window
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7
+ // matrix tightly packed into a int16x8_t[6] array. This array contains
+ // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices, so
+ // we accumulate into a row-major matrix H_s32. At the end of the algorithm
+ // a double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;  // net advance per row: downsample_factor * src_stride
+ dgd += dgd_next;  // net advance per row: downsample_factor * dgd_stride
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);  // presumably merges the 64-bit and residual 32-bit sums into M with scaling - TODO confirm in acc_transpose_M
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);  // presumably the same merge for H, restoring the expected layout - TODO confirm in update_H
+ }
+
+ // Gathers five rows of eight bytes each into three 128-bit vectors. Rows are
+ // packed two per vector (even row in the low half, odd row in the high half);
+ // the fifth row fills the low half of dst[2] and the high half is zeroed.
+ // The fifth row is read starting three bytes to the left of the row start to
+ // prevent out-of-bounds access, so its bytes sit three lanes higher than those
+ // of the other rows.
+ static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
+                                         ptrdiff_t stride) {
+   for (int pair = 0; pair < 2; ++pair) {
+     const uint8_t *row = src + 2 * pair * stride;
+     dst[pair] = vcombine_u8(vld1_u8(row), vld1_u8(row + stride));
+   }
+
+   const uint8_t *last_row = src + 4 * stride;
+   dst[2] = vcombine_u8(vld1_u8(last_row - 3), vdup_n_u8(0));
+ }
+
+ static INLINE void compute_stats_win5_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Accumulates the 5x5-window statistics M and H over `height` rows of `width` pixels, stepping `downsample_factor` strides between rows. Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);  // `dgd - avg` window of the first pixel
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);  // `dgd - avg` window of the second pixel
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);  // 32-bit partial sums of M
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);  // 64-bit running sums of M
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);  // 32-bit partial sums of H
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);  // 64-bit running sums of H
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5
+ // matrices. Each index addresses a byte of a pair of packed row vectors.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = {
+ 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24,  // lut0: first 16 elements of the first (left) matrix
+ 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25,  // lut1: first 16 elements of the second (right) matrix
+ 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23,  // lut2: elements 16..23, low half = first matrix, high half = second
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;  // two-pixel iterations left before the 32-bit sums are widened
+ const int src_next = downsample_factor * src_stride - width;  // from the end of one processed row to the start of the next
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;  // fifth (last) row of the window
+ dgd += 2;  // advance to the next pair of pixels
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25
+ // consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;  // k rounded down to a multiple of 4
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;  // next row of H_s64
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;  // next row of H_s32
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {  // note: this path does not decrement acc_cnt
+ // Load the same 6x5 block as in the two-pixel loop; only the first (left)
+ // 5x5 matrix, selected by lut0 and the low half of lut2, is used here.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;  // fifth (last) row of the window
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
+ // matrix tightly packed into a int16x8_t[3] array. This array contains
+ // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices, so
+ // we accumulate into a row-major matrix H_s32. At the end of the algorithm
+ // a double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;  // net advance per row: downsample_factor * src_stride
+ dgd += dgd_next;  // net advance per row: downsample_factor * dgd_stride
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);  // presumably merges the 64-bit and residual 32-bit sums into M with scaling - TODO confirm in acc_transpose_M
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ downsample_factor);  // presumably the same merge for H, restoring the expected layout - TODO confirm in update_H
+ }
+
+ // Returns the truncated integer mean of a `width` x `height` block of 8-bit
+ // pixels whose rows are `src_stride` bytes apart.
+ //
+ // Blocks at least 8 pixels wide are summed with NEON: bytes are added into
+ // 16-bit lanes, which are flushed into 32-bit lanes once per batch of rows.
+ // Columns that do not fill a vector are added to a scalar sum. Narrower
+ // blocks are summed entirely in scalar code.
+ static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+                                         int width, int height) {
+   uint64_t sum = 0;
+
+   if (width >= 16) {
+     int h = 0;
+     // We can accumulate up to 257 8-bit values in a 16-bit value
+     // (257 * 255 == 65535). Given that each 16-bit vector has 8 elements, that
+     // means we can process up to int(257*8/width) rows before we need to
+     // widen to 32-bit vector elements.
+     int h_overflow = 257 * 8 / width;
+     int h_limit = height > h_overflow ? h_overflow : height;
+     uint32x4_t avg_u32 = vdupq_n_u32(0);
+     do {
+       uint16x8_t avg_u16 = vdupq_n_u16(0);
+       do {
+         int j = width;
+         const uint8_t *src_ptr = src;
+         do {
+           uint8x16_t s = vld1q_u8(src_ptr);
+           avg_u16 = vpadalq_u8(avg_u16, s);
+           j -= 16;
+           src_ptr += 16;
+         } while (j >= 16);
+         if (j >= 8) {
+           uint8x8_t s = vld1_u8(src_ptr);
+           avg_u16 = vaddw_u8(avg_u16, s);
+           j -= 8;
+         }
+         // Scalar tail case.
+         while (j > 0) {
+           sum += src[width - j];
+           j--;
+         }
+         src += src_stride;
+       } while (++h < h_limit);
+       avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+       // Open the next batch of up to h_overflow rows, clamped to the block
+       // height. The clamp has to use the advanced h_limit: clamping against
+       // h_overflow would discard the increment and shrink every later batch
+       // to a single row.
+       h_limit += h_overflow;
+       h_limit = height > h_limit ? h_limit : height;
+     } while (h < height);
+     return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+                      (width * height));
+   }
+   if (width >= 8) {
+     int h = 0;
+     // We can accumulate up to 257 8-bit values in a 16-bit value, given
+     // that each 16-bit vector has 4 elements, that means we can process up to
+     // int(257*4/width) rows before we need to widen to 32-bit vector
+     // elements.
+     int h_overflow = 257 * 4 / width;
+     int h_limit = height > h_overflow ? h_overflow : height;
+     uint32x2_t avg_u32 = vdup_n_u32(0);
+     do {
+       uint16x4_t avg_u16 = vdup_n_u16(0);
+       do {
+         // The first 8 pixels of the row go through the vector accumulator.
+         int j = width - 8;
+         avg_u16 = vpadal_u8(avg_u16, vld1_u8(src));
+         // Scalar tail case.
+         while (j > 0) {
+           sum += src[width - j];
+           j--;
+         }
+         src += src_stride;
+       } while (++h < h_limit);
+       avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+       // Same batching rule as above: advance the limit, then clamp it to
+       // the block height.
+       h_limit += h_overflow;
+       h_limit = height > h_limit ? h_limit : height;
+     } while (h < height);
+     return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+                      (width * height));
+   }
+   // Blocks narrower than 8 pixels: plain scalar sum.
+   int i = height;
+   do {
+     int j = 0;
+     do {
+       sum += src[j];
+     } while (++j < width);
+     src += src_stride;
+   } while (--i != 0);
+   return (uint8_t)(sum / (width * height));
+ }
+
+ // Computes the Wiener statistics M (wiener_win^2 entries) and H
+ // (wiener_win^2 x wiener_win^2 entries) for the region
+ // [h_start, h_end) x [v_start, v_end) of `dgd` and `src`. Both outputs are
+ // zeroed first. `dgd_avg` and `src_avg` are unused by this implementation.
+ // When `use_downsampled_wiener_stats` is non-zero, only every
+ // WIENER_STATS_DOWNSAMPLE_FACTOR-th row is visited, and the rows left over at
+ // the bottom are covered by one extra single-row pass.
+ void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+                             const uint8_t *src, int16_t *dgd_avg,
+                             int16_t *src_avg, int h_start, int h_end,
+                             int v_start, int v_end, int dgd_stride,
+                             int src_stride, int64_t *M, int64_t *H,
+                             int use_downsampled_wiener_stats) {
+   assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+   assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+   (void)dgd_avg;
+   (void)src_avg;
+
+   const int region_w = h_end - h_start;
+   const int region_h = v_end - v_start;
+   const int half_win = wiener_win >> 1;
+   const int taps = wiener_win * wiener_win;
+
+   // First pixel of the region in each plane.
+   const uint8_t *const dgd_region = dgd + h_start + v_start * dgd_stride;
+   const uint8_t *const src_region = src + h_start + v_start * src_stride;
+
+   // The window is centered on each pixel, so the data read from dgd starts
+   // half a window above and to the left of the region and spans
+   // 2 * half_win more rows and columns than the region itself.
+   const int win_row = v_start - half_win;
+   const int win_col = h_start - half_win;
+   const uint8_t *const dgd_window = dgd + win_col + win_row * dgd_stride;
+
+   const uint8_t avg =
+       find_average_neon(dgd_region, dgd_stride, region_w, region_h);
+
+   // The height is not necessarily a multiple of the row step; the leftover
+   // rows are handled by a final pass that receives their count as its factor.
+   int row_step = 1;
+   if (use_downsampled_wiener_stats) row_step = WIENER_STATS_DOWNSAMPLE_FACTOR;
+   const int stepped_rows = region_h / row_step;
+   const int leftover_rows = region_h % row_step;
+   const int leftover_start = region_h - leftover_rows;
+
+   memset(M, 0, taps * sizeof(*M));
+   memset(H, 0, taps * taps * sizeof(*H));
+
+   if (wiener_win == WIENER_WIN) {
+     // 7x7 window: main pass, then the leftover rows.
+     if (stepped_rows > 0) {
+       compute_stats_win7_neon(dgd_window, src_region, region_w, stepped_rows,
+                               dgd_stride, src_stride, avg, M, H, row_step);
+     }
+     if (leftover_rows > 0) {
+       compute_stats_win7_neon(dgd_window + leftover_start * dgd_stride,
+                               src_region + leftover_start * src_stride,
+                               region_w, 1, dgd_stride, src_stride, avg, M, H,
+                               leftover_rows);
+     }
+   } else {
+     // 5x5 window: main pass, then the leftover rows.
+     if (stepped_rows > 0) {
+       compute_stats_win5_neon(dgd_window, src_region, region_w, stepped_rows,
+                               dgd_stride, src_stride, avg, M, H, row_step);
+     }
+     if (leftover_rows > 0) {
+       compute_stats_win5_neon(dgd_window + leftover_start * dgd_stride,
+                               src_region + leftover_start * src_stride,
+                               region_w, 1, dgd_stride, src_stride, avg, M, H,
+                               leftover_rows);
+     }
+   }
+ }
+
+ // Computes the projection statistics when both filter outputs are in use.
+ // With u = dat << SGRPROJ_RST_BITS, s = src << SGRPROJ_RST_BITS,
+ // f0 = flt0 - u and f1 = flt1 - u, the sums over the whole block are:
+ //   H[0][0] = sum(f0 * f0), H[1][1] = sum(f1 * f1),
+ //   H[0][1] = H[1][0] = sum(f0 * f1),
+ //   C[0] = sum(f0 * (s - u)), C[1] = sum(f1 * (s - u)),
+ // each divided by width * height. `width` must be a multiple of 8.
+ static INLINE void calc_proj_params_r0_r1_neon(
+     const uint8_t *src8, int width, int height, int src_stride,
+     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+     int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+   assert(width % 8 == 0);
+   const int size = width * height;
+
+   // Each statistic has two 64-bit accumulators: `_a` takes pixels 0..3 of
+   // every group of eight, `_b` takes pixels 4..7.
+   int64x2_t h00_a = vdupq_n_s64(0);
+   int64x2_t h00_b = vdupq_n_s64(0);
+   int64x2_t h11_a = vdupq_n_s64(0);
+   int64x2_t h11_b = vdupq_n_s64(0);
+   int64x2_t h01_a = vdupq_n_s64(0);
+   int64x2_t h01_b = vdupq_n_s64(0);
+   int64x2_t c0_a = vdupq_n_s64(0);
+   int64x2_t c0_b = vdupq_n_s64(0);
+   int64x2_t c1_a = vdupq_n_s64(0);
+   int64x2_t c1_b = vdupq_n_s64(0);
+
+   int rows_left = height;
+   do {
+     int x = 0;
+     do {
+       // Source and degraded pixels scaled up by SGRPROJ_RST_BITS.
+       const int16x8_t u = vreinterpretq_s16_u16(
+           vshll_n_u8(vld1_u8(dat8 + x), SGRPROJ_RST_BITS));
+       const int16x8_t s = vreinterpretq_s16_u16(
+           vshll_n_u8(vld1_u8(src8 + x), SGRPROJ_RST_BITS));
+       const int16x4_t u_lo = vget_low_s16(u);
+       const int16x4_t u_hi = vget_high_s16(u);
+
+       // 32-bit differences against u.
+       const int32x4_t e_a = vsubl_s16(vget_low_s16(s), u_lo);
+       const int32x4_t e_b = vsubl_s16(vget_high_s16(s), u_hi);
+       const int32x4_t f0_a = vsubw_s16(vld1q_s32(flt0 + x), u_lo);
+       const int32x4_t f0_b = vsubw_s16(vld1q_s32(flt0 + x + 4), u_hi);
+       const int32x4_t f1_a = vsubw_s16(vld1q_s32(flt1 + x), u_lo);
+       const int32x4_t f1_b = vsubw_s16(vld1q_s32(flt1 + x + 4), u_hi);
+
+       // f0 * f0
+       h00_a = vmlal_s32(h00_a, vget_low_s32(f0_a), vget_low_s32(f0_a));
+       h00_a = vmlal_s32(h00_a, vget_high_s32(f0_a), vget_high_s32(f0_a));
+       h00_b = vmlal_s32(h00_b, vget_low_s32(f0_b), vget_low_s32(f0_b));
+       h00_b = vmlal_s32(h00_b, vget_high_s32(f0_b), vget_high_s32(f0_b));
+
+       // f1 * f1
+       h11_a = vmlal_s32(h11_a, vget_low_s32(f1_a), vget_low_s32(f1_a));
+       h11_a = vmlal_s32(h11_a, vget_high_s32(f1_a), vget_high_s32(f1_a));
+       h11_b = vmlal_s32(h11_b, vget_low_s32(f1_b), vget_low_s32(f1_b));
+       h11_b = vmlal_s32(h11_b, vget_high_s32(f1_b), vget_high_s32(f1_b));
+
+       // f0 * f1
+       h01_a = vmlal_s32(h01_a, vget_low_s32(f0_a), vget_low_s32(f1_a));
+       h01_a = vmlal_s32(h01_a, vget_high_s32(f0_a), vget_high_s32(f1_a));
+       h01_b = vmlal_s32(h01_b, vget_low_s32(f0_b), vget_low_s32(f1_b));
+       h01_b = vmlal_s32(h01_b, vget_high_s32(f0_b), vget_high_s32(f1_b));
+
+       // f0 * (s - u)
+       c0_a = vmlal_s32(c0_a, vget_low_s32(f0_a), vget_low_s32(e_a));
+       c0_a = vmlal_s32(c0_a, vget_high_s32(f0_a), vget_high_s32(e_a));
+       c0_b = vmlal_s32(c0_b, vget_low_s32(f0_b), vget_low_s32(e_b));
+       c0_b = vmlal_s32(c0_b, vget_high_s32(f0_b), vget_high_s32(e_b));
+
+       // f1 * (s - u)
+       c1_a = vmlal_s32(c1_a, vget_low_s32(f1_a), vget_low_s32(e_a));
+       c1_a = vmlal_s32(c1_a, vget_high_s32(f1_a), vget_high_s32(e_a));
+       c1_b = vmlal_s32(c1_b, vget_low_s32(f1_b), vget_low_s32(e_b));
+       c1_b = vmlal_s32(c1_b, vget_high_s32(f1_b), vget_high_s32(e_b));
+
+       x += 8;
+     } while (x != width);
+
+     src8 += src_stride;
+     dat8 += dat_stride;
+     flt0 += flt0_stride;
+     flt1 += flt1_stride;
+   } while (--rows_left != 0);
+
+   // Reduce the accumulators and normalize by the pixel count.
+   H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_a, h00_b)) / size;
+   H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_a, h01_b)) / size;
+   H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_a, h11_b)) / size;
+   H[1][0] = H[0][1];
+   C[0] = horizontal_add_s64x2(vaddq_s64(c0_a, c0_b)) / size;
+   C[1] = horizontal_add_s64x2(vaddq_s64(c1_a, c1_b)) / size;
+ }
+
+ // Computes the projection statistics when only the first filter output is
+ // in use. With u = dat << SGRPROJ_RST_BITS, s = src << SGRPROJ_RST_BITS and
+ // f0 = flt0 - u, the sums over the whole block are:
+ //   H[0][0] = sum(f0 * f0), C[0] = sum(f0 * (s - u)),
+ // each divided by width * height. No other entry of H or C is written.
+ // `width` must be a multiple of 8.
+ static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width,
+                                             int height, int src_stride,
+                                             const uint8_t *dat8, int dat_stride,
+                                             int32_t *flt0, int flt0_stride,
+                                             int64_t H[2][2], int64_t C[2]) {
+   assert(width % 8 == 0);
+   const int size = width * height;
+
+   // Each statistic has two 64-bit accumulators: `_a` takes pixels 0..3 of
+   // every group of eight, `_b` takes pixels 4..7.
+   int64x2_t h00_a = vdupq_n_s64(0);
+   int64x2_t h00_b = vdupq_n_s64(0);
+   int64x2_t c0_a = vdupq_n_s64(0);
+   int64x2_t c0_b = vdupq_n_s64(0);
+
+   int rows_left = height;
+   do {
+     int x = 0;
+     do {
+       // Source and degraded pixels scaled up by SGRPROJ_RST_BITS.
+       const int16x8_t u = vreinterpretq_s16_u16(
+           vshll_n_u8(vld1_u8(dat8 + x), SGRPROJ_RST_BITS));
+       const int16x8_t s = vreinterpretq_s16_u16(
+           vshll_n_u8(vld1_u8(src8 + x), SGRPROJ_RST_BITS));
+       const int16x4_t u_lo = vget_low_s16(u);
+       const int16x4_t u_hi = vget_high_s16(u);
+
+       // 32-bit differences against u.
+       const int32x4_t e_a = vsubl_s16(vget_low_s16(s), u_lo);
+       const int32x4_t e_b = vsubl_s16(vget_high_s16(s), u_hi);
+       const int32x4_t f0_a = vsubw_s16(vld1q_s32(flt0 + x), u_lo);
+       const int32x4_t f0_b = vsubw_s16(vld1q_s32(flt0 + x + 4), u_hi);
+
+       // f0 * f0
+       h00_a = vmlal_s32(h00_a, vget_low_s32(f0_a), vget_low_s32(f0_a));
+       h00_a = vmlal_s32(h00_a, vget_high_s32(f0_a), vget_high_s32(f0_a));
+       h00_b = vmlal_s32(h00_b, vget_low_s32(f0_b), vget_low_s32(f0_b));
+       h00_b = vmlal_s32(h00_b, vget_high_s32(f0_b), vget_high_s32(f0_b));
+
+       // f0 * (s - u)
+       c0_a = vmlal_s32(c0_a, vget_low_s32(f0_a), vget_low_s32(e_a));
+       c0_a = vmlal_s32(c0_a, vget_high_s32(f0_a), vget_high_s32(e_a));
+       c0_b = vmlal_s32(c0_b, vget_low_s32(f0_b), vget_low_s32(e_b));
+       c0_b = vmlal_s32(c0_b, vget_high_s32(f0_b), vget_high_s32(e_b));
+
+       x += 8;
+     } while (x != width);
+
+     src8 += src_stride;
+     dat8 += dat_stride;
+     flt0 += flt0_stride;
+   } while (--rows_left != 0);
+
+   // Reduce the accumulators and normalize by the pixel count.
+   H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_a, h00_b)) / size;
+   C[0] = horizontal_add_s64x2(vaddq_s64(c0_a, c0_b)) / size;
+ }
+
+// Computes the projection statistics needed when only the second filter
+// output (flt1) is in use. With u = dat8 << SGRPROJ_RST_BITS and
+// s = src8 << SGRPROJ_RST_BITS, this writes
+//   H[1][1] = sum((flt1 - u) * (flt1 - u)) / (width * height)
+//   C[1]    = sum((flt1 - u) * (s - u))    / (width * height)
+// and leaves every other element of H and C untouched.
+// width must be a multiple of 8 (asserted).
+static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width,
+                                            int height, int src_stride,
+                                            const uint8_t *dat8, int dat_stride,
+                                            int32_t *flt1, int flt1_stride,
+                                            int64_t H[2][2], int64_t C[2]) {
+  assert(width % 8 == 0);
+  const int num_pixels = width * height;
+
+  // Each sum is split over two accumulators: *_a takes pixels 0-3 of every
+  // group of eight, *_b takes pixels 4-7.
+  int64x2_t sum_ff_a = vdupq_n_s64(0);
+  int64x2_t sum_ff_b = vdupq_n_s64(0);
+  int64x2_t sum_fe_a = vdupq_n_s64(0);
+  int64x2_t sum_fe_b = vdupq_n_s64(0);
+
+  do {
+    int x = 0;
+
+    do {
+      const int16x8_t u = vreinterpretq_s16_u16(
+          vshll_n_u8(vld1_u8(dat8 + x), SGRPROJ_RST_BITS));
+      const int16x8_t s = vreinterpretq_s16_u16(
+          vshll_n_u8(vld1_u8(src8 + x), SGRPROJ_RST_BITS));
+
+      // e = s - u and f = flt1 - u, both widened to 32 bits.
+      const int32x4_t e_a = vsubl_s16(vget_low_s16(s), vget_low_s16(u));
+      const int32x4_t e_b = vsubl_s16(vget_high_s16(s), vget_high_s16(u));
+      const int32x4_t f_a = vsubw_s16(vld1q_s32(flt1 + x), vget_low_s16(u));
+      const int32x4_t f_b =
+          vsubw_s16(vld1q_s32(flt1 + x + 4), vget_high_s16(u));
+
+      sum_ff_a = vmlal_s32(sum_ff_a, vget_low_s32(f_a), vget_low_s32(f_a));
+      sum_ff_a = vmlal_s32(sum_ff_a, vget_high_s32(f_a), vget_high_s32(f_a));
+      sum_ff_b = vmlal_s32(sum_ff_b, vget_low_s32(f_b), vget_low_s32(f_b));
+      sum_ff_b = vmlal_s32(sum_ff_b, vget_high_s32(f_b), vget_high_s32(f_b));
+
+      sum_fe_a = vmlal_s32(sum_fe_a, vget_low_s32(f_a), vget_low_s32(e_a));
+      sum_fe_a = vmlal_s32(sum_fe_a, vget_high_s32(f_a), vget_high_s32(e_a));
+      sum_fe_b = vmlal_s32(sum_fe_b, vget_low_s32(f_b), vget_low_s32(e_b));
+      sum_fe_b = vmlal_s32(sum_fe_b, vget_high_s32(f_b), vget_high_s32(e_b));
+
+      x += 8;
+    } while (x != width);
+
+    src8 += src_stride;
+    dat8 += dat_stride;
+    flt1 += flt1_stride;
+  } while (--height != 0);
+
+  H[1][1] = horizontal_add_s64x2(vaddq_s64(sum_ff_a, sum_ff_b)) / num_pixels;
+  C[1] = horizontal_add_s64x2(vaddq_s64(sum_fe_a, sum_fe_b)) / num_pixels;
+}
+
+// Dispatches to one of three kernels depending on which radii are enabled:
+//  - params->r[0] > 0 and params->r[1] > 0: all elements of C and H are
+//    computed.
+//  - only params->r[0] > 0: only H[0][0] and C[0] are non-zero and computed.
+//  - only params->r[1] > 0: only H[1][1] and C[1] are non-zero and computed.
+// When neither radius is positive, H and C are left as passed in.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  const int use_r0 = params->r[0] > 0;
+  const int use_r1 = params->r[1] > 0;
+
+  if (use_r0 && use_r1) {
+    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+    return;
+  }
+
+  if (use_r0) {
+    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+    return;
+  }
+
+  if (use_r1) {
+    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 0000000000..7b72dca34d
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/restoration.h"
+
+// Aligned sizes for Wiener filters.
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
+#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
+#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
+#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
+#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
+
+// Accumulate 8 values of M (cross correlation) for a single source pixel:
+// M_s32[k] += dgd_avg[k] * src_avg[k % 4] for k = 0..7.
+static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
+                                   int16x8_t dgd_avg) {
+  const int32x4_t m_lo =
+      vmlal_s16(vld1q_s32(M_s32), vget_low_s16(dgd_avg), src_avg);
+  const int32x4_t m_hi =
+      vmlal_s16(vld1q_s32(M_s32 + 4), vget_high_s16(dgd_avg), src_avg);
+
+  vst1q_s32(M_s32, m_lo);
+  vst1q_s32(M_s32 + 4, m_hi);
+}
+
+// Accumulate 8 values of M (cross correlation) for two source pixels in one
+// load/store round trip: each M_s32[k] receives the product for pixel 0 and
+// the product for pixel 1.
+static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
+                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
+                                    int16x8_t dgd_avg1) {
+  int32x4_t m_lo = vld1q_s32(M_s32);
+  m_lo = vmlal_s16(m_lo, vget_low_s16(dgd_avg0), src_avg0);
+  m_lo = vmlal_s16(m_lo, vget_low_s16(dgd_avg1), src_avg1);
+
+  int32x4_t m_hi = vld1q_s32(M_s32 + 4);
+  m_hi = vmlal_s16(m_hi, vget_high_s16(dgd_avg0), src_avg0);
+  m_hi = vmlal_s16(m_hi, vget_high_s16(dgd_avg1), src_avg1);
+
+  vst1q_s32(M_s32, m_lo);
+  vst1q_s32(M_s32 + 4, m_hi);
+}
+
+// Accumulates dgd_avg[i] * dgd_avg[j] into H_s32 for a single pixel. H_s32 is
+// addressed as rows of `width` int32 values; rows are handled four at a time
+// and, for each group of rows starting at i, only columns j >= i are updated.
+static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
+                                   int width, int height) {
+  for (int row = 0; row < height; row += 4) {
+    const int16x4_t d_row = vld1_s16(dgd_avg + row);
+    int32_t *const r0 = H_s32 + (row + 0) * width;
+    int32_t *const r1 = H_s32 + (row + 1) * width;
+    int32_t *const r2 = H_s32 + (row + 2) * width;
+    int32_t *const r3 = H_s32 + (row + 3) * width;
+
+    for (int col = row; col < width; col += 4) {
+      const int16x4_t d_col = vld1_s16(dgd_avg + col);
+
+      int32x4_t acc0 = vld1q_s32(r0 + col);
+      int32x4_t acc1 = vld1q_s32(r1 + col);
+      int32x4_t acc2 = vld1q_s32(r2 + col);
+      int32x4_t acc3 = vld1q_s32(r3 + col);
+
+      // Row k of the tile is scaled by lane k of the row vector.
+      acc0 = vmlal_lane_s16(acc0, d_col, d_row, 0);
+      acc1 = vmlal_lane_s16(acc1, d_col, d_row, 1);
+      acc2 = vmlal_lane_s16(acc2, d_col, d_row, 2);
+      acc3 = vmlal_lane_s16(acc3, d_col, d_row, 3);
+
+      vst1q_s32(r0 + col, acc0);
+      vst1q_s32(r1 + col, acc1);
+      vst1q_s32(r2 + col, acc2);
+      vst1q_s32(r3 + col, acc3);
+    }
+  }
+}
+
+// Accumulates, for two pixels at once, the products dgd_avg[i] * dgd_avg[j]
+// into H_s32, which is addressed as rows of WIENER_WIN2_REDUCED_ALIGN2 int32
+// values. Work is done in 4x4 tiles and, for each group of rows starting at
+// i, only the column tiles starting at j >= i are updated.
+// NOTE(review): the outer loop covers rows 0..23 only; presumably the caller
+// handles the remaining row of the 5x5 window - confirm.
+static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 24; i += 4) {
+ // Row operands for both pixels; lane k scales row i + k.
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
+ // Column operands for both pixels and the four accumulator rows.
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+
+ // Each row receives the contribution of pixel 0 and then pixel 1.
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
+ }
+ // Advance to the next group of four rows.
+ H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
+ }
+}
+
+// Accumulates, for two pixels at once, the products dgd_avg[i] * dgd_avg[j]
+// into H_s32, addressed as rows of WIENER_WIN2_ALIGN2 int32 values. Rows are
+// processed four at a time (i = 0, 4, ..., 44) and, for each group, only the
+// 4-wide column tiles starting at j >= i are updated.
+// NOTE(review): rows 48 and above are not touched here; presumably the caller
+// handles the last tap of the 7x7 window - confirm.
+static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 48; i += 4) {
+ // Row operands for both pixels; lane k scales row i + k.
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ // Diagonal tile (j == i): the row vectors double as the column vectors, so
+ // no additional dgd_avg loads are needed.
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);
+
+ h0 = vmlal_lane_s16(h0, di0, di0, 0);
+ h0 = vmlal_lane_s16(h0, di1, di1, 0);
+ h1 = vmlal_lane_s16(h1, di0, di0, 1);
+ h1 = vmlal_lane_s16(h1, di1, di1, 1);
+ h2 = vmlal_lane_s16(h2, di0, di0, 2);
+ h2 = vmlal_lane_s16(h2, di1, di1, 2);
+ h3 = vmlal_lane_s16(h3, di0, di0, 3);
+ h3 = vmlal_lane_s16(h3, di1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);
+
+ // Remaining tiles to the right of the diagonal.
+ for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
+ h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
+ h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
+ h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);
+
+ // Each row receives the contribution of pixel 0 and then pixel 1.
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
+ }
+ // Advance to the next group of four rows.
+ H_s32 += 4 * WIENER_WIN2_ALIGN2;
+ }
+}
+
+// Adds each 32-bit element of src, sign-extended to 64 bits, onto the matching
+// element of dst, and zeroes src as it goes. Elements are handled four at a
+// time and at least one group of four is always processed.
+static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src,
+                                        int length) {
+  int i = 0;
+  do {
+    const int32x4_t partial = vld1q_s32(src + i);
+    vst1q_s32(src + i, vdupq_n_s32(0));
+
+    const int64x2_t total_lo = vld1q_s64(dst + i);
+    const int64x2_t total_hi = vld1q_s64(dst + i + 2);
+
+    vst1q_s64(dst + i, vaddw_s32(total_lo, vget_low_s32(partial)));
+    vst1q_s64(dst + i + 2, vaddw_s32(total_hi, vget_high_s32(partial)));
+
+    i += 4;
+  } while (i < length);
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..c3b57ce206
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+// Returns the largest of the eight signed 16-bit lanes of v_eobmax, converted
+// to uint16_t.
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ // A single across-vector maximum is available on AArch64.
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ // Reduce 8 lanes to 4 by taking the max of the low and high halves.
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ // Shift the 64-bit view right by 32 bits so lanes 3,2 line up with lanes 1,0.
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ // Shift by a further 16 bits so lane 1 lines up with lane 0.
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ // Lane 0 now holds the maximum over all eight input lanes.
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+// Folds one group of eight coefficients into the running per-lane maximum:
+// lanes whose v_mask bits are set contribute iscan[k] + 1, the remaining
+// lanes contribute 0. Returns the lane-wise max with v_eobmax.
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+                                         int16x8_t v_eobmax,
+                                         uint16x8_t v_mask) {
+  const int16x8_t v_pos_plus1 = vaddq_s16(vld1q_s16(iscan), vdupq_n_s16(1));
+  // Selecting against an all-zero vector is the same as AND-ing with the mask.
+  const int16x8_t v_candidate =
+      vandq_s16(v_pos_plus1, vreinterpretq_s16_u16(v_mask));
+  return vmaxq_s16(v_eobmax, v_candidate);
+}
+
+// Quantizes and dequantizes eight coefficients with 16-bit lane arithmetic.
+// Writes eight values to qcoeff_ptr and dqcoeff_ptr and returns a per-lane
+// mask that is all-ones where the quantized magnitude is greater than zero.
+static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ // Saturating add so abs + round cannot wrap in 16 bits.
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+ // vqdmulh yields the high half of the doubled product; the extra shift by 1
+ // undoes the doubling, giving (tmp * quant) >> 16.
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ // (x ^ sign) - sign restores the sign of the original coefficient.
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ // Low 16 bits of qcoeff * dequant.
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+ return v_nz_mask;
+}
+
+// Quantizes `count` coefficients in groups of eight with quantize_fp_8 and
+// writes the end-of-block position to *eob_ptr.
+// round_ptr, quant_ptr and dequant_ptr are each read as 8 x int16_t. The first
+// group uses the loaded lanes as they are; every later group uses lane 1
+// broadcast to all lanes. zbin_ptr, quant_shift_ptr and scan are unused.
+// NOTE(review): the tail loop is a do-while, so at least 16 coefficients are
+// always processed; count is presumably a multiple of 8 and >= 16 - confirm
+// against callers.
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ // Quantization pass: every group of eight coefficients is quantized; this
+ // function has no early-out for trailing zeros.
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ int16x8_t v_round = vld1q_s16(round_ptr);
+ // Running per-lane maximum of (iscan + 1) over lanes with non-zero output.
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ uint16x8_t v_nz_mask;
+ // process dc and the first seven ac coeffs
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ count -= 8;
+ // now process the rest of the ac coeffs
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ count -= 8;
+ } while (count > 0);
+ // Reduce the per-lane maxima to the single end-of-block value.
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+// Quantizes and dequantizes eight int16_t coefficients. Writes eight values to
+// qcoeff_ptr and dqcoeff_ptr and returns a per-lane mask that is all-ones
+// where the quantized magnitude is greater than zero.
+static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
+ int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, int16x8_t v_quant,
+ int16x8_t v_dequant, int16x8_t v_round,
+ int16x8_t v_zero) {
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ // Saturating add so abs + round cannot wrap in 16 bits.
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+ // vqdmulh yields the high half of the doubled product; the extra shift by 1
+ // undoes the doubling, giving (tmp * quant) >> 16.
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ // (x ^ sign) - sign restores the sign of the original coefficient.
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ // Low 16 bits of qcoeff * dequant.
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ vst1q_s16(qcoeff_ptr, v_qcoeff);
+ vst1q_s16(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+// Quantizes n_coeffs 16-bit coefficients in groups of eight with
+// quantize_lp_8 and stores the end-of-block position in *eob_ptr.
+//
+// round_ptr, quant_ptr and dequant_ptr are each read as 8 x int16_t. The
+// first group uses the loaded lanes as they are (DC constant in lane 0); all
+// later groups use lane 1 broadcast to every lane. scan is unused.
+//
+// n_coeffs must be a multiple of 8 and at least 16: the AC loop below always
+// runs at least once.
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+  // Make the implicit size contract explicit; anything else would read and
+  // write past the end of the coefficient buffers.
+  assert(n_coeffs >= 16 && n_coeffs % 8 == 0);
+
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  int16x8_t v_quant = vld1q_s16(quant_ptr);
+  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+  int16x8_t v_round = vld1q_s16(round_ptr);
+  // Running per-lane maximum of (iscan + 1) over lanes with non-zero output.
+  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+  uint16x8_t v_nz_mask;
+  intptr_t count = n_coeffs;
+
+  // Process dc and the first seven ac coeffs.
+  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                            v_dequant, v_round, v_zero);
+  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+  // Overwrite the dc constants with ac constants.
+  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+  count -= 8;
+  // Now process the rest of the ac coeffs.
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+                              v_dequant, v_round, v_zero);
+    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+    count -= 8;
+    // Same guard as av1_quantize_fp_neon: `> 0` still terminates if the
+    // count is ever not a multiple of 8, whereas `!= 0` would not.
+  } while (count > 0);
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+// Quantizes and dequantizes eight coefficients for a transform that uses a
+// non-zero log_scale. Lanes whose magnitude is below
+// dequant >> (1 + log_scale) are forced to zero. Writes eight values to
+// qcoeff_ptr and dqcoeff_ptr and returns a per-lane mask that is all-ones
+// where the quantized magnitude is greater than zero.
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero, int log_scale) {
+ const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
+ const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ // A negative shift count makes vshlq_s16 shift right, so the threshold is
+ // dequant >> (1 + log_scale).
+ const uint16x8_t v_mask =
+ vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ // Pre-shifting by (log_scale - 1) before the doubling high multiply gives
+ // (tmp * quant) >> (16 - log_scale).
+ const int16x8_t v_tmp2 =
+ vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ // (x ^ sign) - sign restores the sign of the original coefficient.
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
+ // shifting right. (vshlq_u16 shifts right when the shift value is negative)
+ const uint16x8_t v_abs_dqcoeff =
+ vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
+ vdupq_n_s16(-log_scale));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
+ v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+// Variant of the log-scaled quantizer with the shift amounts hard-coded for
+// log_scale == 2. Products that need more than 16 bits are rebuilt by OR-ing
+// the high half (vqdmulhq_s16) with the low half (vmulq_s16) after shifting
+// each into place. Writes eight values to qcoeff_ptr and dqcoeff_ptr and
+// returns a per-lane mask that is all-ones where the quantized magnitude is
+// greater than zero.
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ // Unsigned compare of (abs << 1) against (dequant >> 2).
+ const uint16x8_t v_mask =
+ vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
+ vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
+ // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ // High part: (product >> 15) << 1; low part: low 16 product bits >> 14.
+ const int16x8_t v_tmp2 =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ // (x ^ sign) - sign restores the sign of the original coefficient.
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ // High part: (product >> 15) << 13; low part: low 16 product bits >> 2.
+ const int16x8_t v_abs_dqcoeff =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+// Shared driver for the log-scaled FP quantizers (no quantization matrix).
+// 1. Scans the coefficients backwards in groups of 16 and drops every
+//    trailing group whose magnitudes are all below the AC threshold
+//    dequant[1] >> (1 + log_scale).
+// 2. Zeroes the dropped tail of qcoeff_ptr and dqcoeff_ptr.
+// 3. Quantizes the remaining coefficients in groups of eight, using
+//    quantize_fp_logscale2_8 when log_scale == 2 and quantize_fp_logscale_8
+//    otherwise, and writes the end-of-block position to *eob_ptr.
+// The first group of eight is always quantized, even when the pre-scan has
+// dropped every group.
+static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
+ int log_scale) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
+ // Rounding doubling high multiply by 2^(15 - log_scale): scales the round
+ // values down by log_scale bits, with rounding.
+ int16x8_t v_round =
+ vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ // Running per-lane maximum of (iscan + 1) over lanes with non-zero output.
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ intptr_t non_zero_count = n_coeffs;
+
+ assert(n_coeffs > 16);
+ // Pre-scan pass
+ const int16x8_t v_dequant_scaled =
+ vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
+ // Lane 1 (the AC entry) is used as the threshold for every coefficient.
+ const int16x8_t v_zbin_s16 =
+ vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
+ intptr_t i = n_coeffs;
+ do {
+ const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
+ const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
+ const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
+ const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
+ const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
+ const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
+ // If the coefficient is in the base ZBIN range, then discard.
+ // NOTE(review): horizontal_long_add_u16x8 presumably sums all lanes of both
+ // masks, so a zero result means no lane reached the threshold - confirm.
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
+ non_zero_count -= 16;
+ } else {
+ break;
+ }
+ i -= 16;
+ } while (i > 0);
+
+ // Clear the outputs for the trailing coefficients that were discarded.
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // process dc and the first seven ac coeffs
+ uint16x8_t v_nz_mask;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ // Remaining groups of eight, up to the count kept by the pre-scan.
+ for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ }
+ // Reduce the per-lane maxima to the single end-of-block value.
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+// 32x32 variant of the FP quantizer: forwards to the shared no-qmatrix kernel
+// with log_scale = 1. zbin_ptr, quant_shift_ptr and scan are accepted but not
+// used.
+void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+
+  (void)scan;
+  (void)quant_shift_ptr;
+  (void)zbin_ptr;
+
+  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+                              iscan, log_scale);
+}
+
+// 64x64 variant of the FP quantizer: forwards to the shared no-qmatrix kernel
+// with log_scale = 2. zbin_ptr, quant_shift_ptr and scan are accepted but not
+// used.
+void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+
+  (void)scan;
+  (void)quant_shift_ptr;
+  (void)zbin_ptr;
+
+  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+                              iscan, log_scale);
+}
+
+// "b" quantizer: zeroes qcoeff_ptr and dqcoeff_ptr, then for every group of
+// eight coefficients that has at least one lane with |coeff| >= zbin, writes
+// the quantized and dequantized values for the lanes that pass the zbin test.
+// Index 0 of zbin/round/quant/quant_shift/dequant is applied to the first
+// coefficient only; index 1 is applied to all others. *eob_ptr receives one
+// plus the largest iscan value among the non-zero quantized lanes, or 0 when
+// there are none. scan is unused.
+void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ // NOTE(review): quant_shift_ptr is read further down, so this cast is
+ // redundant (harmless).
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ // zero == zero is all-ones in every lane, i.e. -1: "no non-zero coefficient".
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ // Start from the AC (index 1) constants in every lane.
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+
+ // Lane 0 of the first group uses the index-0 zbin.
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+
+ uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
+ // Narrow the mask to 8 bytes and read it as one 64-bit value: non-zero when
+ // any lane passed the zbin test.
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ // Patch lane 0 with the index-0 constants for this group only.
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ // tmp + ((tmp * quant) >> 16): shift-right-by-1-and-accumulate of the
+ // doubling high multiply.
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ // (tmp2 * quant_shift) >> 16.
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+
+ // Restore the sign, then keep the new value only in lanes that passed zbin.
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ // Put the index-1 constants back into lane 0 for the remaining groups.
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ // Track, per lane, the largest iscan among lanes that passed zbin and
+ // quantized to a value greater than zero.
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ // Remaining groups: same steps with the index-1 constants in every lane.
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+ vcond = vcgeq_s16(v_abs, vzbins);
+
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ // The lanes hold iscan values (or -1), so add 1 to turn the max into a count.
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+// Per-lane (x0 * x1) >> AOM_QM_BITS for int16x8_t x0 and uint16x8_t x1, with
+// the result kept in 16 bits. The product is rebuilt from two halves: the
+// doubling high multiply (product >> 15) shifted left by 15 - AOM_QM_BITS,
+// OR-ed with the low 16 bits of the product shifted right by AOM_QM_BITS.
+// Both arguments are evaluated twice, so they must be free of side effects.
+#define QM_MULL_SHIFT(x0, x1) \
+ vreinterpretq_s16_u16(vorrq_u16( \
+ vreinterpretq_u16_s16(vshlq_n_s16( \
+ vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
+ vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
+
+static void aom_quantize_b_helper_16x16_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+// Quantizes and dequantizes n_coeffs coefficients with log_scale == 1 (the
+// variant selected for 32x32 by aom_quantize_b_helper_neon), processing eight
+// coefficients per step.
+//
+// coeff_ptr        input transform coefficients.
+// n_coeffs         number of coefficients; consumed in groups of eight.
+// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
+//                  two-entry tables: entry 0 is applied to the first
+//                  coefficient only, entry 1 to every other coefficient.
+// qcoeff_ptr       output quantized coefficients (zeroed first).
+// dqcoeff_ptr      output dequantized coefficients (zeroed first).
+// eob_ptr          output: get_max_eob() of the tracked iscan[] values, + 1.
+// scan             unused.
+// iscan            per-coefficient scan position used for the eob search.
+// qm_ptr, iqm_ptr  optional quantization / inverse quantization matrices;
+//                  NULL selects the flat path.
+//
+// NOTE(review): QM_MULL_SHIFT is defined outside this block; the matrix path
+// is kept as it was except that the dequantizer is no longer overwritten.
+static void aom_quantize_b_helper_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt;
+  const int log_scale = 1;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  // Per-lane running maximum of iscan[] over non-zero outputs; starts at -1.
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  // A negative count makes vshlq_u16() shift right by log_scale.
+  const int16x8_t v_log_scale = vdupq_n_s16(-log_scale);
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  // First group of eight: lane 0 uses the entry-0 parameters.
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  // Skip all the arithmetic when no lane reaches its zero-bin threshold.
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    // vqdmulhq_s16 keeps (2 * a * b) >> 16, i.e. (a * b) >> (16 - log_scale).
+    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    // Scale the dequantizer into a temporary: writing the result back into
+    // vdequant would compound the matrix weights across groups.
+    int16x8_t vdequant_qm = vdequant;
+    if (iqm_ptr != NULL) {
+      const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant_qm = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_qm)), v_log_scale));
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    // Put the entry-1 parameters back into lane 0 for the remaining groups.
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  // Remaining groups: every lane uses the entry-1 parameters.
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      // Per-group scaled dequantizer; vdequant itself stays unscaled.
+      int16x8_t vdequant_qm = vdequant;
+      if (iqm_ptr != NULL) {
+        const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant_qm = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_qm)), v_log_scale));
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+// Quantizes and dequantizes n_coeffs coefficients with log_scale == 2 (the
+// variant selected for 64x64 by aom_quantize_b_helper_neon), processing eight
+// coefficients per step.
+//
+// coeff_ptr        input transform coefficients.
+// n_coeffs         number of coefficients; consumed in groups of eight.
+// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
+//                  two-entry tables: entry 0 is applied to the first
+//                  coefficient only, entry 1 to every other coefficient.
+// qcoeff_ptr       output quantized coefficients (zeroed first).
+// dqcoeff_ptr      output dequantized coefficients (zeroed first).
+// eob_ptr          output: get_max_eob() of the tracked iscan[] values, + 1.
+// scan             unused.
+// iscan            per-coefficient scan position used for the eob search.
+// qm_ptr, iqm_ptr  optional quantization / inverse quantization matrices;
+//                  NULL selects the flat path.
+//
+// NOTE(review): QM_MULL_SHIFT is defined outside this block; the matrix path
+// is kept as it was except that the dequantizer is no longer overwritten.
+static void aom_quantize_b_helper_64x64_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  (void)scan;
+
+  uint16x8_t vwt;
+  const int log_scale = 2;
+  // A negative count makes vshlq_u16() shift right by log_scale. Every lane
+  // holds 0xFFFE, the same bit pattern the 64-bit literal used to provide.
+  const int16x8_t v_log_scale = vdupq_n_s16(-log_scale);
+
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  const int16x8_t zero = vdupq_n_s16(0);
+  // Per-lane running maximum of iscan[] over non-zero outputs; starts at -1.
+  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+  // All lanes 1: used both as a shift count and as a single-bit mask.
+  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
+
+  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+  // First group of eight: lane 0 uses the entry-0 parameters.
+  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  int16x8_t v_abs = vabsq_s16(v_coeff);
+  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+  uint16x8_t vcond;
+  if (qm_ptr == NULL) {
+    vcond = vcgeq_s16(v_abs, vzbins);
+  } else {
+    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+  }
+  // Skip all the arithmetic when no lane reaches its zero-bin threshold.
+  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+  if (nz_check) {
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+    int16x8_t vtmp2;
+    if (qm_ptr == NULL) {
+      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+    } else {
+      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+      vtmp2 = vaddq_s16(vtmp2, vtmp);
+    }
+
+    // Build (vtmp2 * quant_shift) >> 14: the doubling high-half multiply
+    // shifted left by one supplies the upper bits, and bit 14 of the low-half
+    // product supplies the least significant bit.
+    int16x8_t ones =
+        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+    vtmp2 =
+        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+    int16x8_t coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+    // Scale the dequantizer into a temporary: writing the result back into
+    // vdequant would compound the matrix weights across groups.
+    int16x8_t vdequant_qm = vdequant;
+    if (iqm_ptr != NULL) {
+      const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+      vdequant_qm = QM_MULL_SHIFT(vdequant, viwt);
+    }
+    // (vtmp2 * dequant) >> log_scale: low-half product shifted right, OR-ed
+    // with the high-half product moved up into the top bits.
+    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_qm)), v_log_scale));
+    v_deq_abs =
+        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant_qm), 13), v_deq_abs);
+    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+    coeff_nz_mask =
+        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+    // Put the entry-1 parameters back into lane 0 for the remaining groups.
+    vround =
+        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+  }
+  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+  // Remaining groups: every lane uses the entry-1 parameters.
+  for (int i = 8; i < n_coeffs; i += 8) {
+    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+    v_abs = vabsq_s16(v_coeff);
+
+    if (qm_ptr == NULL) {
+      vcond = vcgeq_s16(v_abs, vzbins);
+    } else {
+      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+    }
+    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+    if (nz_check) {
+      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+      int16x8_t vtmp2;
+      if (qm_ptr == NULL) {
+        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+      } else {
+        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+        vtmp2 = vaddq_s16(vtmp2, vtmp);
+      }
+
+      int16x8_t ones =
+          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+      vtmp2 =
+          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+      int16x8_t coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+      // Per-group scaled dequantizer; vdequant itself stays unscaled.
+      int16x8_t vdequant_qm = vdequant;
+      if (iqm_ptr != NULL) {
+        const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+        vdequant_qm = QM_MULL_SHIFT(vdequant, viwt);
+      }
+      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_qm)), v_log_scale));
+      v_deq_abs = vorrq_s16(
+          vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant_qm), 13), v_deq_abs);
+      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+      coeff_nz_mask =
+          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+    }
+  }
+  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+// Dispatches to the size-specific NEON quantizer selected by log_scale:
+// 0 -> 16x16 helper, 1 -> 32x32 helper, 2 -> 64x64 helper. All other
+// arguments are forwarded unchanged. Any other log_scale value is ignored and
+// nothing is written.
+void aom_quantize_b_helper_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, const int log_scale) {
+  // log_scale for the AV1 encoder can only be 0, 1 or 2.
+  if (log_scale == 0) {
+    aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                     quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                     dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                     iscan, qm_ptr, iqm_ptr);
+  } else if (log_scale == 1) {
+    aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                     quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                     dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                     iscan, qm_ptr, iqm_ptr);
+  } else if (log_scale == 2) {
+    aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                                     quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                                     dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+                                     iscan, qm_ptr, iqm_ptr);
+  }
+}
+
+// 32x32 entry point: forwards to aom_quantize_b_helper_neon with no
+// quantization matrices and log_scale fixed at 1.
+void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  const qm_val_t *const no_qm = NULL;
+  const qm_val_t *const no_iqm = NULL;
+  const int log_scale = 1;
+
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             no_qm, no_iqm, log_scale);
+}
+
+// 64x64 entry point: forwards to aom_quantize_b_helper_neon with no
+// quantization matrices and log_scale fixed at 2.
+void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  const qm_val_t *const no_qm = NULL;
+  const qm_val_t *const no_iqm = NULL;
+  const int log_scale = 2;
+
+  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                             no_qm, no_iqm, log_scale);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000000..7d3bd4c606
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include "av1/encoder/rdopt.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Accumulates horizontal and vertical correlation terms for one 4x4 window of
+// `diff` (row pitch `stride`) into the four 32-bit accumulators.
+//
+// Only the top-left 3x3 pixels of the window act as the "current" pixel; the
+// fourth row and column are read solely as right/below neighbours. Callers
+// therefore step the window by 3 pixels so that consecutive windows overlap
+// by one row/column.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          int32x4_t *xy_sum_32,
+                                          int32x4_t *xz_sum_32,
+                                          int32x4_t *x_sum_32,
+                                          int32x4_t *x2_sum_32) {
+  // Window layout:   row0 = [ a b c d ]
+  //                  row1 = [ e f g h ]
+  //                  row2 = [ i j k l ]
+  //                  row3 = [ m n o p ]
+  const int16x4_t row0 = vld1_s16(diff + (0 * stride));
+  const int16x4_t row1 = vld1_s16(diff + (1 * stride));
+  const int16x4_t row2 = vld1_s16(diff + (2 * stride));
+  const int16x4_t row3 = vld1_s16(diff + (3 * stride));
+
+  // Each row moved up by one lane with a zero shifted in, e.g.
+  // row0_sh = [ 0 a b c ]. Lane n of rowN_sh is the left neighbour of lane n
+  // of rowN, and the last column drops out of the sums.
+  const int16x4_t row0_sh =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(row0), 16));
+  const int16x4_t row1_sh =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(row1), 16));
+  const int16x4_t row2_sh =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(row2), 16));
+  const int16x4_t row3_sh =
+      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(row3), 16));
+
+  // Horizontal products (pixel * right neighbour) for rows 0..2.
+  int32x4_t xy_acc = *xy_sum_32;
+  xy_acc = vmlal_s16(xy_acc, row0, row0_sh);
+  xy_acc = vmlal_s16(xy_acc, row1, row1_sh);
+  xy_acc = vmlal_s16(xy_acc, row2, row2_sh);
+  *xy_sum_32 = xy_acc;
+
+  // Vertical products (pixel * below neighbour) for the row pairs
+  // 0/1, 1/2 and 2/3, restricted to the first three columns.
+  int32x4_t xz_acc = *xz_sum_32;
+  xz_acc = vmlal_s16(xz_acc, row0_sh, row1_sh);
+  xz_acc = vmlal_s16(xz_acc, row2_sh, row1_sh);
+  xz_acc = vmlal_s16(xz_acc, row2_sh, row3_sh);
+  *xz_sum_32 = xz_acc;
+
+  // Plain sum of the 3x3 body: x_sum += a+b+c + e+f+g + i+j+k.
+  // Rows 2 and 0 are added pairwise, then row 1 lane by lane.
+  const int16x8_t rows_2_0_sh = vcombine_s16(row2_sh, row0_sh);
+  int32x4_t x_acc = vpadalq_s16(*x_sum_32, rows_2_0_sh);
+  x_acc = vaddw_s16(x_acc, row1_sh);
+  *x_sum_32 = x_acc;
+
+  // Sum of squares over the same 3x3 body.
+  int32x4_t x2_acc = *x2_sum_32;
+  x2_acc = vmlal_s16(x2_acc, row0_sh, row0_sh);
+  x2_acc = vmlal_s16(x2_acc, row1_sh, row1_sh);
+  x2_acc = vmlal_s16(x2_acc, row2_sh, row2_sh);
+  *x2_sum_32 = x2_acc;
+}
+
+// Returns the sum of the two 64-bit lanes of v.
+INLINE static int64_t horver_hadd_s64x2(const int64x2_t v) {
+#if AOM_ARCH_AARCH64
+  return vaddvq_s64(v);
+#else
+  return vget_lane_s64(vadd_s64(vget_low_s64(v), vget_high_s64(v)), 0);
+#endif
+}
+
+// Returns the sum of the four 32-bit lanes of v, widened to 64 bits before
+// the lanes are combined so the reduction itself cannot wrap.
+INLINE static int64_t horver_hadd_long_s32x4(const int32x4_t v) {
+  return horver_hadd_s64x2(vpaddlq_s32(v));
+}
+
+// Computes the horizontal and vertical neighbour correlation of a
+// width x height block of residuals.
+//
+// diff    top-left sample of the block.
+// stride  distance, in samples, between vertically adjacent rows of diff.
+// width   block width; the code assumes a power of two (see below).
+// height  block height; the code assumes a power of two (see below).
+// hcorr   output: correlation between each pixel and its right neighbour,
+//         clamped below at 0; 1.0 when either variance term is not positive.
+// vcorr   output: the same for each pixel and the pixel below it.
+//
+// The bulk of the block is covered by overlapping 4x4 windows stepped by 3;
+// the remaining 1-2 rows and 1-2 columns are handled separately.
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+                                          int width, int height, float *hcorr,
+                                          float *vcorr) {
+  // The following notation is used:
+  // x - current pixel
+  // y - right neighbour pixel
+  // z - below neighbour pixel
+  // w - down-right neighbour pixel
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int16x8_t zero_s16 = vreinterpretq_s16_s32(zero);
+  int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
+  int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
+
+  // Process horizontal and vertical correlations through the body in 4x4
+  // blocks. This excludes the final row and column and possibly one extra
+  // column depending how 3 divides into width and height.
+  for (int i = 0; i <= height - 4; i += 3) {
+    // 32-bit accumulators are restarted for every band of rows and folded
+    // into the 64-bit totals once the band is done.
+    int32x4_t xy_sum_32 = zero;
+    int32x4_t xz_sum_32 = zero;
+    int32x4_t x_sum_32 = zero;
+    int32x4_t x2_sum_32 = zero;
+    for (int j = 0; j <= width - 4; j += 3) {
+      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+                             &xz_sum_32, &x_sum_32, &x2_sum_32);
+    }
+    v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
+    v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
+    v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
+    v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
+  }
+  int64_t xy_sum = horver_hadd_s64x2(v_xy_sum);
+  int64_t xz_sum = horver_hadd_s64x2(v_xz_sum);
+  int64_t x2_sum = horver_hadd_s64x2(v_x2_sum);
+  int64_t x_sum = horver_hadd_s64x2(v_x_sum);
+
+  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+  // Do we have 2 rows remaining or just the one? Note that width and height
+  // are powers of 2, so each modulo 3 must be 1 or 2.
+  if (height % 3 == 1) {  // Just horiz corrs on the final row
+    const int16_t *const row = &diff[(height - 1) * stride];
+    const int16_t x0 = row[0];
+    x_sum += x0;
+    x_finalrow += x0;
+    x2_sum += x0 * x0;
+    x2_finalrow += x0 * x0;
+    if (width >= 8) {
+      int32x4_t v_y_sum = zero;
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int j = 0;
+      // Full groups of eight (x, y) pairs; the last 7 pairs are left for the
+      // tail below so that no load reaches past the end of the row.
+      while (j + 8 < width - 1) {
+        const int16x8_t v_x = vld1q_s16(row + j);
+        const int16x8_t v_y = vld1q_s16(row + j + 1);
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+        v_y_sum = vpadalq_s16(v_y_sum, v_y);
+        j += 8;
+      }
+
+      // Tail: one load of the last eight samples. v_x keeps samples 0..6 and
+      // v_y keeps samples 1..7, each padded with a zero in the top lane.
+      const int16x8_t v_l = vld1q_s16(row + j);
+      const int16x8_t v_x = vextq_s16(vextq_s16(zero_s16, v_l, 7), zero_s16, 1);
+      const int16x8_t v_y = vextq_s16(v_l, zero_s16, 1);
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+      v_y_sum = vpadalq_s16(v_y_sum, v_y);
+
+      xy_sum += horver_hadd_long_s32x4(v_xy_sum_a);
+      const int64_t y = horver_hadd_long_s32x4(v_y_sum);
+      const int64_t y2 = horver_hadd_long_s32x4(v_y2_sum);
+      x_sum += y;
+      x2_sum += y2;
+      x_finalrow += y;
+      x2_finalrow += y2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = row[j];
+        const int16_t y = row[j + 1];
+        xy_sum += x * y;
+        x_sum += y;
+        x2_sum += y * y;
+        x_finalrow += y;
+        x2_finalrow += y * y;
+      }
+    }
+  } else {  // Two rows remaining to do
+    const int16_t *const row_x = &diff[(height - 2) * stride];
+    const int16_t *const row_z = &diff[(height - 1) * stride];
+    const int16_t x0 = row_x[0];
+    const int16_t z0 = row_z[0];
+    x_sum += x0 + z0;
+    x2_sum += x0 * x0 + z0 * z0;
+    x_finalrow += z0;
+    x2_finalrow += z0 * z0;
+    if (width >= 8) {
+      int32x4_t v_y2_sum = zero;
+      int32x4_t v_w2_sum = zero;
+      int32x4_t v_xy_sum_a = zero;
+      int32x4_t v_xz_sum_a = zero;
+      int32x4_t v_x_sum_a = zero;
+      int32x4_t v_w_sum = zero;
+      int j = 0;
+      // Full groups of eight; the last 7 pairs are left for the tail below.
+      while (j + 8 < width - 1) {
+        const int16x8_t v_x = vld1q_s16(row_x + j);
+        const int16x8_t v_y = vld1q_s16(row_x + j + 1);
+        const int16x8_t v_z = vld1q_s16(row_z + j);
+        const int16x8_t v_w = vld1q_s16(row_z + j + 1);
+
+        const int16x4_t v_x_lo = vget_low_s16(v_x);
+        const int16x4_t v_y_lo = vget_low_s16(v_y);
+        const int16x4_t v_z_lo = vget_low_s16(v_z);
+        const int16x4_t v_w_lo = vget_low_s16(v_w);
+        const int16x4_t v_x_hi = vget_high_s16(v_x);
+        const int16x4_t v_y_hi = vget_high_s16(v_y);
+        const int16x4_t v_z_hi = vget_high_s16(v_z);
+        const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+        // Horizontal products on both rows.
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+        // Vertical products between the two rows.
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+        v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+        v_w_sum = vpadalq_s16(v_w_sum, v_w);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+        j += 8;
+      }
+
+      // Tail: the last eight samples of each row, with x/z keeping samples
+      // 0..6 and y/w keeping samples 1..7 (top lane zeroed).
+      const int16x8_t v_l = vld1q_s16(row_x + j);
+      const int16x8_t v_x = vextq_s16(vextq_s16(zero_s16, v_l, 7), zero_s16, 1);
+      const int16x8_t v_y = vextq_s16(v_l, zero_s16, 1);
+      const int16x8_t v_l_2 = vld1q_s16(row_z + j);
+      const int16x8_t v_z =
+          vextq_s16(vextq_s16(zero_s16, v_l_2, 7), zero_s16, 1);
+      const int16x8_t v_w = vextq_s16(v_l_2, zero_s16, 1);
+
+      const int16x4_t v_x_lo = vget_low_s16(v_x);
+      const int16x4_t v_y_lo = vget_low_s16(v_y);
+      const int16x4_t v_z_lo = vget_low_s16(v_z);
+      const int16x4_t v_w_lo = vget_low_s16(v_w);
+      const int16x4_t v_x_hi = vget_high_s16(v_x);
+      const int16x4_t v_y_hi = vget_high_s16(v_y);
+      const int16x4_t v_z_hi = vget_high_s16(v_z);
+      const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+      v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+      v_w_sum = vpadalq_s16(v_w_sum, v_w);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+      xy_sum += horver_hadd_long_s32x4(v_xy_sum_a);
+      xz_sum += horver_hadd_long_s32x4(v_xz_sum_a);
+      x_sum += horver_hadd_long_s32x4(v_x_sum_a);
+      x_finalrow += horver_hadd_long_s32x4(v_w_sum);
+      const int64_t y2 = horver_hadd_long_s32x4(v_y2_sum);
+      const int64_t w2 = horver_hadd_long_s32x4(v_w2_sum);
+      x2_sum += y2 + w2;
+      x2_finalrow += w2;
+    } else {
+      for (int j = 0; j < width - 1; ++j) {
+        const int16_t x = row_x[j];
+        const int16_t y = row_x[j + 1];
+        const int16_t z = row_z[j];
+        const int16_t w = row_z[j + 1];
+
+        // Horizontal and vertical correlations for the penultimate row:
+        xy_sum += x * y;
+        xz_sum += x * z;
+
+        // Now just horizontal correlations for the final row:
+        xy_sum += z * w;
+
+        x_sum += y + w;
+        x2_sum += y * y + w * w;
+        x_finalrow += w;
+        x2_finalrow += w * w;
+      }
+    }
+  }
+
+  // Do we have 2 columns remaining or just the one?
+  if (width % 3 == 1) {  // Just vert corrs on the final col
+    const int16_t x0 = diff[width - 1];
+    x_sum += x0;
+    x_finalcol += x0;
+    x2_sum += x0 * x0;
+    x2_finalcol += x0 * x0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 1];
+      xz_sum += x * z;
+      x_finalcol += z;
+      x2_finalcol += z * z;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z;
+        x2_sum += z * z;
+      }
+    }
+  } else {  // Two cols remaining
+    const int16_t x0 = diff[width - 2];
+    const int16_t y0 = diff[width - 1];
+    x_sum += x0 + y0;
+    x2_sum += x0 * x0 + y0 * y0;
+    x_finalcol += y0;
+    x2_finalcol += y0 * y0;
+    for (int i = 0; i < height - 1; ++i) {
+      const int16_t x = diff[i * stride + width - 2];
+      const int16_t y = diff[i * stride + width - 1];
+      const int16_t z = diff[(i + 1) * stride + width - 2];
+      const int16_t w = diff[(i + 1) * stride + width - 1];
+
+      // Horizontal and vertical correlations for the penultimate col:
+      // Skip these on the last iteration of this loop if we also had two
+      // rows remaining, otherwise the final horizontal and vertical correlation
+      // get erroneously processed twice
+      if (i < height - 2 || height % 3 == 1) {
+        xy_sum += x * y;
+        xz_sum += x * z;
+      }
+
+      x_finalcol += w;
+      x2_finalcol += w * w;
+      // So the bottom-right elements don't get counted twice:
+      if (i < height - (height % 3 == 1 ? 2 : 3)) {
+        x_sum += z + w;
+        x2_sum += z * z + w * w;
+      }
+
+      // Now just vertical correlations for the final column:
+      xz_sum += y * w;
+    }
+  }
+
+  // Calculate the simple sums and squared-sums
+  int64_t x_firstrow = 0, x_firstcol = 0;
+  int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+  if (width >= 8) {
+    int32x4_t v_x_firstrow = zero;
+    int32x4_t v_x2_firstrow = zero;
+    for (int j = 0; j < width; j += 8) {
+      const int16x8_t v_diff = vld1q_s16(diff + j);
+      const int16x4_t v_diff_lo = vget_low_s16(v_diff);
+      const int16x4_t v_diff_hi = vget_high_s16(v_diff);
+      v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
+      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
+    }
+    x_firstrow += horver_hadd_long_s32x4(v_x_firstrow);
+    x2_firstrow += horver_hadd_long_s32x4(v_x2_firstrow);
+  } else {
+    for (int j = 0; j < width; ++j) {
+      x_firstrow += diff[j];
+      x2_firstrow += diff[j] * diff[j];
+    }
+  }
+  for (int i = 0; i < height; ++i) {
+    x_firstcol += diff[i * stride];
+    x2_firstcol += diff[i * stride] * diff[i * stride];
+  }
+
+  // Each correlation uses the whole-block totals minus the one row or column
+  // that has no neighbour in the relevant direction.
+  int64_t xhor_sum = x_sum - x_finalcol;
+  int64_t xver_sum = x_sum - x_finalrow;
+  int64_t y_sum = x_sum - x_firstcol;
+  int64_t z_sum = x_sum - x_firstrow;
+  int64_t x2hor_sum = x2_sum - x2_finalcol;
+  int64_t x2ver_sum = x2_sum - x2_finalrow;
+  int64_t y2_sum = x2_sum - x2_firstcol;
+  int64_t z2_sum = x2_sum - x2_firstrow;
+
+  // Number of horizontal / vertical neighbour pairs in the block.
+  const float num_hor = (float)(height * (width - 1));
+  const float num_ver = (float)((height - 1) * width);
+
+  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+  if (xhor_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  } else {
+    *hcorr = 1.0;
+  }
+  if (xver_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
+  } else {
+    *vcorr = 1.0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
new file mode 100644
index 0000000000..3d17723224
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,  // Fills comp_pred (row pitch == width) with a width x height predictor.
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];  // IntraBC blocks use the identity scale factors.
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {  // Scaled reference: build the predictor generically and return early.
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);  // Fixed filter; subpel_search is not consulted on this path.
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter_params = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {  // Both offsets zero: plain copy from ref to comp_pred.
+ if (width > 8) {  // Copy 16 bytes per inner iteration.
+ assert(width % 16 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t r = vld1q_u8(ref + j);
+ vst1q_u8(comp_pred + j, r);
+ j += 16;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 8) {  // One 8-byte vector per row.
+ int i = height;
+ do {
+ uint8x8_t r = vld1_u8(ref);
+ vst1_u8(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ int i = height / 2;  // Two rows are handled per iteration.
+ do {
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ vst1_u8(comp_pred, r);  // 8 bytes == two consecutive 4-wide output rows (output pitch is width).
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {  // Horizontal offset only: single horizontal pass.
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);  // Offset is doubled for the lookup; presumably q3 -> q4 precision - confirm.
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {  // Vertical offset only: single vertical pass.
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+ 16, width, height);
+ } else {  // Both offsets non-zero: horizontal pass into im_block, then vertical pass into comp_pred.
+ DECLARE_ALIGNED(16, uint8_t,
+ im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+
+ const int im_stride = MAX_SB_SIZE;
+ const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS;  // Number of intermediate rows produced by the horizontal pass.
+
+ const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1);  // Horizontal pass starts this far above ref.
+ const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);  // NOTE(review): uses filter_params->taps while ref_vert_offset uses SUBPEL_TAPS; equal only if taps == SUBPEL_TAPS - confirm.
+
+ assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);  // Same row count as the im_block declaration.
+ aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+ MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height);
+ aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width,
+ NULL, -1, filter_y, 16, width, height);
+ }
+}
+
+void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd,  // Upsampled prediction followed by aom_comp_avg_pred_neon with pred.
+ const AV1_COMMON *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,  // Step 1: write the upsampled prediction into comp_pred.
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width);  // Step 2: comp_pred is passed as both 1st and 5th argument (pitch width); presumably an in-place average with pred - confirm against the callee.
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(  // Upsampled prediction followed by aom_dist_wtd_comp_avg_pred_neon with pred and jcp_param.
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,  // Step 1: write the upsampled prediction into comp_pred.
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred,  // Step 2: comp_pred is passed as both 1st and 5th argument; presumably combined in place with pred - confirm against the callee.
+ width, jcp_param);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd,  // High bit-depth variant: fills comp_pred8 (row pitch == width) with a width x height predictor.
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];  // IntraBC blocks use the identity scale factors.
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {  // Scaled reference: build the predictor generically and return early.
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);  // Fixed filter; subpel_search is not consulted on this path.
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {  // Both offsets zero: plain copy of 16-bit samples.
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width > 4) {  // Copy 8 samples per inner iteration.
+ assert(width % 8 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t r = vld1q_u16(ref + j);
+ vst1q_u16(comp_pred + j, r);
+ j += 8;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 4) {  // One 4-sample vector per row.
+ int i = height;
+ do {
+ uint16x4_t r = vld1_u16(ref);
+ vst1_u16(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 2);
+ int i = height / 2;  // Two rows are handled per iteration.
+ do {
+ uint16x4_t r = load_u16_2x2(ref, ref_stride);
+ store_u16x2_strided_x2(comp_pred, width, r);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {  // Horizontal offset only: single horizontal pass.
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);  // Offset is doubled for the lookup; presumably q3 -> q4 precision - confirm.
+ aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {  // Vertical offset only: single vertical pass.
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL,
+ -1, kernel, 16, width, height, bd);
+ } else {  // Both offsets non-zero: horizontal pass into temp, then vertical pass into comp_pred8.
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);  // (MAX_SB_SIZE + 32) rows of MAX_SB_SIZE samples.
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;  // Number of intermediate rows produced by the horizontal pass.
+ assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);  // Bound matches the row capacity of temp so the assert actually guards the buffer.
+ aom_highbd_convolve8_horiz_neon(
+ ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride,  // Start (taps / 2 - 1) rows above ref8.
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_neon(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),  // Skip the same number of rows in temp.
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_neon(  // High bit-depth upsampled prediction followed by aom_highbd_comp_avg_pred_neon with pred8.
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,  // Step 1: write the upsampled prediction into comp_pred8.
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8,  // Step 2: comp_pred8 is passed as both 1st and 5th argument; presumably an in-place average with pred8 - confirm against the callee.
+ width);
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(  // High bit-depth upsampled prediction followed by aom_highbd_dist_wtd_comp_avg_pred_neon with pred8 and jcp_param.
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,  // Step 1: write the upsampled prediction into comp_pred8.
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,  // Step 2: comp_pred8 is passed as both 1st and 5th argument; presumably combined in place with pred8 - confirm against the callee.
+ comp_pred8, width, jcp_param);
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/arm/neon/shift_neon.h b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 0000000000..d73aef2f25
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) /* Defines name(in, out, size): out[i] = intrinsic(in[i], arg) for each vector i. */ \
+ static AOM_INLINE void name(const type *in, type *out, int size) { \
+ int i = 0; \
+ do { /* Body runs before the size check, so at least one vector is always processed. */ \
+ out[i] = intrinsic(in[i], arg); \
+ } while (++i < size); \
+ }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)  // Left shift by 2, 4 x s16 vectors.
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)  // Left shift by 2, 8 x s16 vectors.
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)  // Left shift by 2, 4 x s32 vectors.
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)  // Rounding right shift by 2.
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)  // Rounding right shift by 2.
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)  // Rounding right shift by 4.
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)  // Rounding right shift by 4.
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])  // arg expands to in[i] inside the loop: x + x == x << 1.
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])  // arg expands to in[i] inside the loop: x + x == x << 1.
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+ vdup_n_s16(0))  // Rounding halving add with zero: (x + 0 + 1) >> 1.
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+ vdupq_n_s16(0))  // Rounding halving add with zero: (x + 0 + 1) >> 1.
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+ vdupq_n_s32(0))  // Rounding halving add with zero: (x + 0 + 1) >> 1.
+
+#undef SHIFT_LOOP_HELPER
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
new file mode 100644
index 0000000000..986f143864
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits.
+DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {  // Four 8-lane masks, each selecting 5 consecutive lanes; loaded as one uint16x8x4_t.
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,  // Mask 0: lanes 0-4.
+ 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,  // Mask 1: lanes 1-5.
+ 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,  // Mask 2: lanes 2-6.
+ 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF  // Mask 3: lanes 3-7.
+};
+
+static INLINE void get_squared_error(  // Writes (frame1 - frame2)^2 per pixel as uint16_t into frame_sse, 16 pixels at a time.
+ const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint16_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint16x8_t sse_lo =
+ vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));  // Widening multiply: |diff|^2 for pixels 0-7.
+ uint16x8_t sse_hi =
+ vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));  // |diff|^2 for pixels 8-15.
+
+ vst1q_u16(dst + j + 2, sse_lo);  // Output is offset by 2: the first two entries of each row are not written here.
+ vst1q_u16(dst + j + 10, sse_hi);
+
+ j += 16;
+ } while (j < block_width);  // Always processes whole groups of 16 columns.
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,  // Loads 8 values from src and replicates an edge value when col is the first or last group of 4 columns.
+ const uint32_t block_width) {
+ uint16x8_t s = vld1q_u16(src);
+
+ if (col == 0) {  // First column group: overwrite lanes 0-1 with lane 2.
+ const uint16_t lane2 = vgetq_lane_u16(s, 2);
+ s = vsetq_lane_u16(lane2, s, 0);
+ s = vsetq_lane_u16(lane2, s, 1);
+ } else if (col >= block_width - 4) {  // Last column group: overwrite lanes 6-7 with lane 5.
+ const uint16_t lane5 = vgetq_lane_u16(s, 5);
+ s = vsetq_lane_u16(lane5, s, 6);
+ s = vsetq_lane_u16(lane5, s, 7);
+ }
+ return s;
+}
+
+static void apply_temporal_filter(  // Sums frame_sse over a 5x5 window per pixel, converts that to a weight, and adds weight * pixel into accumulator and weight into count.
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];  // Windowed sum for each (row, column).
+ const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint16x8_t vsrc[5];  // Sliding window of 5 rows, 8 lanes each.
+ const uint16_t *src = frame_sse + col;
+
+ // Load and pad (for first and last two columns) 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ for (int i = 0; i < 4; i++) {  // One output column per mask.
+ uint32x4_t vsum = vdupq_n_u32(0);
+ for (int j = 0; j < 5; j++) {
+ vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i]));  // Keep the 5 selected lanes of each window row and accumulate pairwise into 32-bit lanes.
+ }
+ acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum);
+ }
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ if (row <= block_height - 4) {  // Rows 0-2 were preloaded, so the next row to load is row + 3, which must be < block_height.
+ // Load next row into the bottom of the sliding window.
+ vsrc[4] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4] = vsrc[3];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {  // Level 0: weight via exp() and truncation to int.
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {  // k is the running index into accumulator and count.
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];  // luma_sse_sum is indexed with pitch BW regardless of block_width.
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);  // Quadrant: 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);  // Cap the exponent argument at 7.
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {  // Other levels: same computation, but weight via approx_exp() and iroundpf().
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_neon(  // Per plane: computes squared error between the source block and pred, then accumulates weighted pred pixels into accum and count.
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;  // Only read by the assert above.
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {  // At or above the cutoff the clipped value is replaced by the formula below.
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };  // Zero-initialised once; rewritten per plane by get_squared_error.
+ uint32_t luma_sse_sum[BW * BH] = { 0 };  // Stays zero until the AOM_PLANE_U iteration fills it.
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);  // Factor is never below 1.
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);  // Non-zero planes also count the co-located samples summed into luma_sse_sum.
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {  // frame_sse still holds the previous plane's squared errors here; it is overwritten by get_squared_error below.
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];  // + 2 matches the two-entry offset get_squared_error uses when storing each row.
+ }
+ }
+ }
+ }
+ }
+
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,  // pred is read with row pitch plane_w.
+ plane_h, frame_sse, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;  // Planes are stored back to back in pred, accum and count.
+ }
+}
+
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,  // Averages |Laplacian| over interior pixels whose Sobel gradient magnitude is below edge_thresh; returns -1.0 if fewer than 16 qualify.
+ int width, int stride,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint32x4_t acc = vdupq_n_u32(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;  // Scalar-tail count; the vector count is folded in at the end.
+ int64_t final_acc = 0;  // Scalar-tail sum; the vector sum is folded in at the end.
+ const uint8_t *src_start = src + stride + 1;  // Row 1, column 1: the outermost rows and columns are never used as centres.
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint8_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 16) {  // 16 centre pixels per iteration.
+ uint8x16_t mat[3][3];  // mat[r][c]: 3x3 neighbourhood, one vector per position.
+ mat[0][0] = vld1q_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u8(src_ptr - stride);
+ mat[0][2] = vld1q_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u8(src_ptr - 1);
+ mat[1][1] = vld1q_u8(src_ptr);
+ mat[1][2] = vld1q_u8(src_ptr + 1);
+ mat[2][0] = vld1q_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u8(src_ptr + stride);
+ mat[2][2] = vld1q_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));  // gxa: left column weighted 1, 2, 1.
+ uint16x8_t gxa_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
+ uint16x8_t gxb_lo =
+ vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));  // gxb: right column weighted 1, 2, 1.
+ uint16x8_t gxb_hi =
+ vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
+ gxa_lo = vaddq_u16(
+ gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
+ gxa_hi = vaddq_u16(
+ gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
+ gxb_lo = vaddq_u16(
+ gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
+ gxb_hi = vaddq_u16(
+ gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));
+
+ uint16x8_t gya_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));  // gya: top row weighted 1, 2, 1.
+ uint16x8_t gya_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t gyb_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));  // gyb: bottom row weighted 1, 2, 1.
+ uint16x8_t gyb_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ gya_lo = vaddq_u16(
+ gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
+ gya_hi = vaddq_u16(
+ gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
+ gyb_lo = vaddq_u16(
+ gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
+ gyb_hi = vaddq_u16(
+ gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));
+
+ uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);  // ga = |gxa - gxb| + |gya - gyb|.
+ uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
+ uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);
+
+ uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);  // 4 * centre.
+ uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);
+
+ uint16x8_t adj0_lo =
+ vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
+ uint16x8_t adj0_hi =
+ vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
+ uint16x8_t adj1_lo =
+ vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
+ uint16x8_t adj1_hi =
+ vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
+ uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
+ adj_lo = vaddq_u16(adj_lo, adj_lo);  // 2 * (up + down + left + right).
+ uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
+ adj_hi = vaddq_u16(adj_hi, adj_hi);
+
+ uint16x8_t diag0_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t diag0_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t diag1_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t diag1_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);  // Sum of the four corners.
+ uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);
+
+ uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
+ v_lo = vabdq_u16(v_lo, adj_lo);  // v = |4 * centre + corners - 2 * adjacent|.
+ uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
+ v_hi = vabdq_u16(v_hi, adj_hi);
+
+ acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));  // Lanes at or above the threshold contribute zero.
+ acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));
+
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));
+
+ w += 16;
+ src_ptr += 16;
+ }
+
+ if (w <= (width - 1) - 8) {  // At most one 8-pixel step after the 16-pixel loop.
+ uint8x8_t mat[3][3];
+ mat[0][0] = vld1_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1_u8(src_ptr - stride);
+ mat[0][2] = vld1_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1_u8(src_ptr - 1);
+ mat[1][1] = vld1_u8(src_ptr);
+ mat[1][2] = vld1_u8(src_ptr + 1);
+ mat[2][0] = vld1_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1_u8(src_ptr + stride);
+ mat[2][2] = vld1_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);  // ga = |gxa - gxb| + |gya - gyb|.
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);  // v = |4 * centre + corners - 2 * adjacent|.
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {  // At most one 4-pixel step; only the low 4 lanes are valid.
+ uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));  // All-ones in lanes 0-3, zero in lanes 4-7.
+ uint8x8_t mat[3][3];
+ mat[0][0] = load_u8_4x1(src_ptr - stride - 1);
+ mat[0][1] = load_u8_4x1(src_ptr - stride);
+ mat[0][2] = load_u8_4x1(src_ptr - stride + 1);
+ mat[1][0] = load_u8_4x1(src_ptr - 1);
+ mat[1][1] = load_u8_4x1(src_ptr);
+ mat[1][2] = load_u8_4x1(src_ptr + 1);
+ mat[2][0] = load_u8_4x1(src_ptr + stride - 1);
+ mat[2][1] = load_u8_4x1(src_ptr + stride);
+ mat[2][2] = load_u8_4x1(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);  // Mask forces lanes 4-7 to "not under threshold" so they add nothing to acc or count.
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {  // Scalar tail for the remaining centre pixels of this row.
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = abs(gx) + abs(gy);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += abs(v) * is_under;
+ final_count += is_under;  // Scalar path counts positively, unlike the vector count.
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);  // do/while: row 1 is always processed before the height check.
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_long_add_u32x4(acc);
+ return (final_count < 16)  // Fewer than 16 qualifying pixels: report -1.0 instead of an estimate.
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000000..5a52e701a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+// Four 8-byte masks; mask k (k = 0..3) keeps bytes k..k+4 of an 8-byte row, i.e. one 5-wide window.
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,  // window 0: bytes 0..4
+ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,  // window 1: bytes 1..5
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,  // window 2: bytes 2..6
+ 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF   // window 3: bytes 3..7
+};
+
+// clang-format on
+// Writes |frame1 - frame2| for every pixel of the block into frame_abs_diff, 16 pixels at a time.
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+ const uint8_t *frame2, const uint32_t stride2,
+ const uint32_t block_width,
+ const uint32_t block_height,
+ uint8_t *frame_abs_diff,
+ const unsigned int dst_stride) {
+ uint8_t *dst = frame_abs_diff;  // start of the current output row
+
+ uint32_t i = 0;  // row index
+ do {
+ uint32_t j = 0;  // column index, advanced 16 at a time
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+ uint8x16_t abs_diff = vabdq_u8(s, r);  // per-byte absolute difference
+ vst1q_u8(dst + j + 2, abs_diff);  // stored 2 bytes in; the first 2 bytes of each row are not written
+ j += 16;
+ } while (j < block_width);  // do-while: one full 16-byte pass runs even if block_width < 16
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+// Loads 8 bytes at src, replicates the edge sample at the block's left/right border, duplicates them.
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint8x8_t s = vld1_u8(src);
+
+ if (col == 0) {  // leftmost group: lanes 0-1 take the value of lane 2
+ const uint8_t lane2 = vget_lane_u8(s, 2);
+ s = vset_lane_u8(lane2, s, 0);
+ s = vset_lane_u8(lane2, s, 1);
+ } else if (col >= block_width - 4) {  // rightmost group: lanes 6-7 take the value of lane 5
+ const uint8_t lane5 = vget_lane_u8(s, 5);
+ s = vset_lane_u8(lane5, s, 6);
+ s = vset_lane_u8(lane5, s, 7);
+ }
+ return vcombine_u8(s, s);  // both 8-byte halves hold the same padded row
+}
+// Builds a per-pixel 5x5 sum of squared frame_abs_diff values, then adds weighted pixels to accumulator/count.
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];  // windowed sum of squares for each pixel
+ const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);  // 32 mask bytes as two 16-byte vectors
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint8x16_t vsrc[5][2];  // 5-row sliding window; [i][0] / [i][1] = row i masked with val[0] / val[1]
+ const uint8_t *src = frame_abs_diff + col;  // rows are read with stride SSE_STRIDE
+
+ // Load, pad (for first and last two columns) and mask 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ const uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[i][0] = vandq_u8(s, vmask.val[0]);  // windows for pixels col and col + 1
+ vsrc[i][1] = vandq_u8(s, vmask.val[1]);  // windows for pixels col + 2 and col + 3
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ uint32x4_t sum_01 = vdupq_n_u32(0);  // partial sums for pixels col, col + 1
+ uint32x4_t sum_23 = vdupq_n_u32(0);  // partial sums for pixels col + 2, col + 3
+
+ sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);  // row dotted with itself = sum of squares
+ sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+ sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+ vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));  // pairwise add -> 4 window sums
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {  // i.e. row + 3 is still inside the block
+ // Load next row into the bottom of the sliding window.
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4][0] = vsrc[3][0];  // vsrc[3] now holds what was the bottom row before the shift
+ vsrc[4][1] = vsrc[3][1];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {  // level 0: weight from exp(); otherwise approx_exp() + iroundpf()
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {  // k = linear index into accumulator/count
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);  // quadrant: 0 TL, 1 TR, 2 BL, 3 BR
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);  // cap the exponent at 7
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);  // truncated toward zero
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {  // same loop as above, different weight function
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);  // quadrant: 0 TL, 1 TR, 2 BL, 3 BR
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);  // cap the exponent at 7
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);  // NOTE(review): rounding mode is defined by iroundpf elsewhere
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+// Per plane: computes |frame_to_filter - pred|, then adds weighted pred pixels into accum/count.
+void av1_apply_temporal_filter_neon_dotprod(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;  // the variable is otherwise only read by the assert above
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {  // at or above the cutoff the clipped value is replaced
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };  // one distance factor per sub-block motion vector
+ uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };  // rewritten for every plane by get_abs_diff()
+ uint32_t luma_sse_sum[BW * BH] = { 0 };  // only filled in the plane == AOM_PLANE_U branch below
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));  // Euclidean length of the MV
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);  // keeps the divisor at 1 or more
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);  // factor never drops below 1
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;  // running offset into pred / accum / count, advanced after each plane
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];  // index 0 for Y, index 1 otherwise
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);  // non-zero planes add the luma samples summed below
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {  // frame_abs_diff still holds the previous plane's differences here
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *  // +2 matches the store offset in get_abs_diff()
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
+ }
+ }
+ }
+ }
+
+ get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_abs_diff, SSE_STRIDE);  // pred rows are read with stride plane_w
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_abs_diff, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;  // next plane starts right after this one
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/txfm_neon.h b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
new file mode 100644
index 0000000000..635364f46a
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+// When ud_flip is non-zero, moves *input to row (out_size - 1) and negates *stride; otherwise a no-op.
+static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
+ const int16_t **input,
+ int *stride, int out_size) {
+ if (ud_flip) {
+ *input = *input + (out_size - 1) * *stride;  // advance by (out_size - 1) rows of the old stride
+ *stride = -*stride;  // stepping by the new stride now walks the rows in reverse
+ }
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
new file mode 100644
index 0000000000..1b35269b33
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c for details of the parameters and computation. Here each
+ * element contributes sat16((r1 << WEDGE_WEIGHT_BITS) + m * d)^2; the total is round-shifted at the end.
+ */
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ assert(N % 64 == 0);
+
+ uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };  // two independent 64-bit accumulators
+
+ int i = 0;  // element index; 16 elements are consumed per iteration
+ do {
+ int32x4_t sum[4];
+ int32x4_t sse[2];
+ int16x4_t sum_s16[4];
+
+ const int16x8_t r1_l = vld1q_s16(r1 + i);
+ const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+ const int16x8_t d_l = vld1q_s16(d + i);
+ const int16x8_t d_h = vld1q_s16(d + i + 8);
+ // The following three lines are a bit inelegant compared to using a pair
+ // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair -
+ // which can be executed in parallel with the subsequent SSHL instructions.
+ // (SSHL can only be executed on half of the Neon pipes in modern Arm
+ // cores, whereas ZIP1/2 can be executed on all of them.)
+ const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));  // interleave mask bytes with zeros
+ const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
+ const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
+
+ sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);  // widen to 32 bits and shift left
+ sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+ sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));  // sum += m * d
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+ sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+ sum_s16[0] = vqmovn_s32(sum[0]);  // saturating narrow back to 16 bits
+ sum_s16[1] = vqmovn_s32(sum[1]);
+ sum_s16[2] = vqmovn_s32(sum[2]);
+ sum_s16[3] = vqmovn_s32(sum[3]);
+
+ sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);  // squares, with two vectors folded into each sse[]
+ sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
+ sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
+ sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
+
+ v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));  // pairwise add into 64-bit lanes
+ v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
+
+ i += 16;
+ } while (i < N);  // do-while: the first 16 elements are processed before N is compared
+
+ uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+// Returns 1 when the sum of ds[i] * m[i] over all N elements is greater than limit, else 0.
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };  // four independent 32-bit accumulators
+
+ do {
+ int16x8_t ds_l = vld1q_s16(ds);
+ int16x8_t ds_h = vld1q_s16(ds + 8);
+
+ int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));  // NOTE(review): assumes every m[i] <= 127 - confirm
+ int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+ int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));  // acc += ds * m
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+ ds += 16;
+ m += 16;
+ N -= 16;
+ } while (N != 0);  // NOTE(review): only terminates if N is a positive multiple of 16; not asserted here
+
+ int64x2_t sum = vpaddlq_s32(acc[0]);  // widen to 64 bits before combining the accumulators
+ sum = vpadalq_s32(sum, acc[1]);
+ sum = vpadalq_s32(sum, acc[2]);
+ sum = vpadalq_s32(sum, acc[3]);
+
+ return (horizontal_add_s64x2(sum) > limit);
+}
+// Stores sat16(a[i] * a[i] - b[i] * b[i]) into d_ptr[i] for N elements, 8 elements per iteration.
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+ const int16_t *b_ptr, int N) {
+ do {
+ int16x8_t a = vld1q_s16(a_ptr);
+ int16x8_t b = vld1q_s16(b_ptr);
+
+ int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));  // a * a in 32 bits
+ int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+ sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));  // subtract b * b
+ sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+ int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));  // saturating narrow to 16 bits
+
+ vst1q_s16(d_ptr, res);
+
+ d_ptr += 8;
+ a_ptr += 8;
+ b_ptr += 8;
+ N -= 8;
+ } while (N != 0);  // NOTE(review): only terminates if N is a positive multiple of 8; not asserted here
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 0000000000..6601c19ab3
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+// 4-point 1-D forward DCT (per the function name) on int32 data; stages alternate output <-> step.
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;  // table returned by cospi_arr(cos_bit)
+
+ int32_t stage = 0;  // index into stage_range, incremented before each stage
+ int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+ int32_t step[4];  // scratch buffer
+
+ // stage 0: av1_range_check_buf on the untouched input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums (first half) and differences (second half) of mirrored input pairs
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: half_btf combinations weighted by cospi[] entries (output -> step)
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: copy step to output in bit-reversed index order (0, 2, 1, 3)
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+// 8-point 1-D forward DCT (per the function name) on int32 data; stages alternate output <-> step.
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;  // table returned by cospi_arr(cos_bit)
+
+ int32_t stage = 0;  // index into stage_range, incremented before each stage
+ int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+ int32_t step[8];  // scratch buffer
+
+ // stage 0: av1_range_check_buf on the untouched input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums (first half) and differences (second half) of mirrored input pairs
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2 (output -> step): sums/differences on 0..3, half_btf on 5 and 6, 4 and 7 copied
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3 (step -> output): half_btf on 0..3, sums/differences on 4..7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4 (output -> step): 0..3 copied, half_btf on 4..7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: copy step to output in bit-reversed index order (0, 4, 2, 6, 1, 5, 3, 7)
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+// 16-point 1-D forward DCT (per the function name) on int32 data; stages alternate output <-> step.
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;  // table returned by cospi_arr(cos_bit)
+
+ int32_t stage = 0;  // index into stage_range, incremented before each stage
+ int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+ int32_t step[16];  // scratch buffer
+
+ // stage 0: av1_range_check_buf on the untouched input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums (first half) and differences (second half) of mirrored input pairs
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2 (output -> step): sums/differences on 0..7, half_btf on 10..13, 8, 9, 14, 15 copied
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3 (step -> output): sums/differences on 0..3 and 8..15, half_btf on 5 and 6, 4 and 7 copied
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4 (output -> step): half_btf on 0..3, 9, 10, 13, 14; sums/differences on 4..7; rest copied
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5 (step -> output): 0..3 copied, half_btf on 4..7, sums/differences on 8..15
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6 (output -> step): 0..7 copied, half_btf on 8..15
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: copy step to output in bit-reversed index order (0, 8, 4, 12, 2, 10, ...)
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {  // 4-point transform (forward ADST, going by the name): reads input[0..3], writes output[0..3].
+ int bit = cos_bit;  // Fixed-point precision: picks the sinpi table and is the final round_shift amount.
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t x0, x1, x2, x3;  // Hold the inputs first; reused as accumulators from stage 3 on.
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;  // Stage 1 products; s0..s3 are reused for the results in stage 5.
+
+ // stage 0: range-check the raw input, then copy it into locals (in0..in3 below).
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {  // All four inputs are zero: emit zeros and skip the arithmetic.
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;  // The final av1_range_check_buf() call is not reached on this path.
+ }
+
+ // stage 1: multiply the inputs by sinpi[1..4]; s7 starts the (in0 + in1 - in3) term.
+ s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);  // NOTE(review): range_check_value() is defined elsewhere; presumably it validates the value against the given bit width and returns it.
+ s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
+ s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
+ s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
+ s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
+ s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
+ s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
+ s7 = range_check_value(x0 + x1, stage_range[1]);  // Unscaled sum, so no `bit` is added to the range here.
+
+ // stage 2: s7 = in0 + in1 - in3 (x3 still holds the raw input[3] here).
+ s7 = range_check_value(s7 - x3, stage_range[2]);
+
+ // stage 3: x0..x3 stop holding inputs and become partial sums of the products.
+ x0 = range_check_value(s0 + s2, bit + stage_range[3]);  // sinpi[1]*in0 + sinpi[2]*in1
+ x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);  // sinpi[3]*(in0 + in1 - in3)
+ x2 = range_check_value(s1 - s3, bit + stage_range[3]);  // sinpi[4]*in0 - sinpi[1]*in1
+ x3 = range_check_value(s4, bit + stage_range[3]);  // sinpi[3]*in2
+
+ // stage 4: fold the in3 products into x0 and x2.
+ x0 = range_check_value(x0 + s5, bit + stage_range[4]);  // ... + sinpi[4]*in3
+ x2 = range_check_value(x2 + s6, bit + stage_range[4]);  // ... + sinpi[2]*in3
+
+ // stage 5: form the four results, still scaled up by `bit` fractional bits.
+ s0 = range_check_value(x0 + x3, bit + stage_range[5]);  // sinpi[1]*in0 + sinpi[2]*in1 + sinpi[3]*in2 + sinpi[4]*in3
+ s1 = range_check_value(x1, bit + stage_range[5]);  // sinpi[3]*(in0 + in1 - in3)
+ s2 = range_check_value(x2 - x3, bit + stage_range[5]);  // sinpi[4]*in0 - sinpi[1]*in1 - sinpi[3]*in2 + sinpi[2]*in3
+ s3 = range_check_value(x2 - x0, bit + stage_range[5]);  // Partial value; completed in stage 6.
+
+ // stage 6: finish s3 = x2 - x0 + x3.
+ s3 = range_check_value(s3 + x3, bit + stage_range[6]);
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = round_shift(s0, bit);  // round_shift() is defined elsewhere; it is applied with the same `bit` used for the sinpi table.
+ output[1] = round_shift(s1, bit);
+ output[2] = round_shift(s2, bit);
+ output[3] = round_shift(s3, bit);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
+}
+
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {  // 8-point transform (forward ADST, going by the name): reads input[0..7], leaves the result in output[0..7].
+ const int32_t size = 8;  // Element count handed to every av1_range_check_buf() call.
+ const int32_t *cospi;  // Weight table returned by cospi_arr(cos_bit).
+
+ int32_t stage = 0;  // Current stage number; also the index into stage_range[].
+ int32_t *bf0, *bf1;  // bf0 = buffer read by the current stage, bf1 = buffer it writes.
+ int32_t step[8];  // Scratch buffer; stages alternate between `output` and `step`.
+
+ // stage 0: range-check the raw input only; nothing is written.
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: copy the input into output in permuted order, negating some entries.
+ stage++;
+ assert(output != input);  // output is written while input is still being read, so they must be distinct buffers.
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: pairs (2,3) and (6,7) are combined with cospi[32] weights; the rest pass through.
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);  // NOTE(review): half_btf() is defined elsewhere; presumably a rounded (w0*in0 + w1*in1) >> cos_bit.
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: sum/difference of elements two apart, within each group of four.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: pairs (4,5) and (6,7) are combined with cospi[16]/cospi[48] weights; 0..3 pass through.
+ stage++;
+ cospi = cospi_arr(cos_bit);  // Same cos_bit as before, so this fetches the same table again.
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sum/difference of elements four apart.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: every adjacent pair is combined; weight index pairs are (4,60), (20,44), (36,28), (52,12).
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: reorder step[] into the final output order; the result ends up in `output`.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {  // 16-point transform (forward ADST, going by the name): reads input[0..15], leaves the result in output[0..15].
+ const int32_t size = 16;  // Element count handed to every av1_range_check_buf() call.
+ const int32_t *cospi;  // Weight table returned by cospi_arr(cos_bit).
+
+ int32_t stage = 0;  // Current stage number; also the index into stage_range[].
+ int32_t *bf0, *bf1;  // bf0 = buffer read by the current stage, bf1 = buffer it writes.
+ int32_t step[16];  // Scratch buffer; stages alternate between `output` and `step`.
+
+ // stage 0: range-check the raw input only; nothing is written.
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: copy the input into output in permuted order, negating some entries.
+ stage++;
+ assert(output != input);  // output is written while input is still being read, so they must be distinct buffers.
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: pairs (2,3), (6,7), (10,11), (14,15) are combined with cospi[32] weights; the rest pass through.
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);  // NOTE(review): half_btf() is defined elsewhere; presumably a rounded (w0*in0 + w1*in1) >> cos_bit.
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: sum/difference of elements two apart, within each group of four.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: pairs (4,5), (6,7), (12,13), (14,15) are combined with cospi[16]/cospi[48] weights; the rest pass through.
+ stage++;
+ cospi = cospi_arr(cos_bit);  // Same cos_bit as before, so this fetches the same table again.
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sum/difference of elements four apart, within each group of eight.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: adjacent pairs in 8..15 are combined with cospi[8]/[56] and cospi[40]/[24] weights; 0..7 pass through.
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: sum/difference of elements eight apart.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8: every adjacent pair is combined; weight index pairs are (2,62), (10,54), (18,46), (26,38), (34,30), (42,22), (50,14), (58,6).
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9: reorder step[] into the final output order; the result ends up in `output`.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  // 4-point identity transform: each sample is multiplied by NewSqrt2 in
+  // 64-bit arithmetic and brought back down with round_shift() by
+  // NewSqrt2Bits. No cosine table is involved, so cos_bit is ignored.
+  // NOTE(review): NewSqrt2 / 2^NewSqrt2Bits presumably approximates sqrt(2);
+  // both constants are defined outside this function.
+  const int txfm_len = 4;
+  int idx = 0;
+  (void)cos_bit;
+  while (idx < txfm_len) {
+    const int64_t scaled = (int64_t)input[idx] * NewSqrt2;
+    output[idx] = round_shift(scaled, NewSqrt2Bits);
+    ++idx;
+  }
+  // The declared stage-0 range plus the scaling bits must fit in 32 bits.
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+  av1_range_check_buf(0, input, output, txfm_len, stage_range[0]);
+}
+
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  // 8-point identity transform: each output sample is the matching input
+  // sample doubled. No cosine table is involved, so cos_bit is ignored.
+  const int txfm_len = 8;
+  int idx = 0;
+  (void)cos_bit;
+  while (idx < txfm_len) {
+    output[idx] = input[idx] * 2;
+    ++idx;
+  }
+  av1_range_check_buf(0, input, output, txfm_len, stage_range[0]);
+}
+
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  // 16-point identity transform: each sample is doubled, multiplied by
+  // NewSqrt2 (all in 64-bit arithmetic) and brought back down with
+  // round_shift() by NewSqrt2Bits. cos_bit is ignored.
+  const int txfm_len = 16;
+  int idx = 0;
+  (void)cos_bit;
+  while (idx < txfm_len) {
+    const int64_t doubled = (int64_t)input[idx] * 2;
+    const int64_t scaled = doubled * NewSqrt2;
+    output[idx] = round_shift(scaled, NewSqrt2Bits);
+    ++idx;
+  }
+  // The declared stage-0 range plus the scaling bits must fit in 32 bits.
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+  av1_range_check_buf(0, input, output, txfm_len, stage_range[0]);
+}
+
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  // 32-point identity transform: each output sample is the matching input
+  // sample multiplied by four. No cosine table is involved, so cos_bit is
+  // ignored.
+  const int txfm_len = 32;
+  int idx = 0;
+  (void)cos_bit;
+  while (idx < txfm_len) {
+    output[idx] = input[idx] * 4;
+    ++idx;
+  }
+  av1_range_check_buf(0, input, output, txfm_len, stage_range[0]);
+}
+
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 0000000000..9ef54fe4de
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 1-D forward transform kernels. Every function takes the same four
+// arguments (input samples, output buffer, cos_bit, per-stage range table),
+// which lets av1_fwd_txfm2d.c hand any of them out through one TxfmFunc
+// pointer type.
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
new file mode 100644
index 0000000000..2777cc25bc
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+// One pointer per transform size (TX_SIZES_ALL entries); each entry of the
+// definition in av1_fwd_txfm2d.c points at a three-element shift table.
+extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL];
+// 5x5 cos_bit lookup tables. The column table's definition labels its
+// dimensions as [txw_idx][txh_idx]; NOTE(review): the row table's definition
+// is not shown alongside it - presumably indexed the same way, confirm.
+extern const int8_t av1_fwd_cos_bit_col[5][5];
+extern const int8_t av1_fwd_cos_bit_row[5][5];
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 0000000000..12a9535a7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+// Maps a 1-D transform type to the forward kernel that implements it.
+// An unrecognised type trips assert(0) and, when asserts are compiled out,
+// yields NULL.
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  TxfmFunc kernel = NULL;
+
+  switch (txfm_type) {
+    // Discrete cosine transforms.
+    case TXFM_TYPE_DCT4: kernel = av1_fdct4; break;
+    case TXFM_TYPE_DCT8: kernel = av1_fdct8; break;
+    case TXFM_TYPE_DCT16: kernel = av1_fdct16; break;
+    case TXFM_TYPE_DCT32: kernel = av1_fdct32; break;
+    case TXFM_TYPE_DCT64: kernel = av1_fdct64; break;
+    // Asymmetric discrete sine transforms.
+    case TXFM_TYPE_ADST4: kernel = av1_fadst4; break;
+    case TXFM_TYPE_ADST8: kernel = av1_fadst8; break;
+    case TXFM_TYPE_ADST16: kernel = av1_fadst16; break;
+    // Identity transforms.
+    case TXFM_TYPE_IDENTITY4: kernel = av1_fidentity4_c; break;
+    case TXFM_TYPE_IDENTITY8: kernel = av1_fidentity8_c; break;
+    case TXFM_TYPE_IDENTITY16: kernel = av1_fidentity16_c; break;
+    case TXFM_TYPE_IDENTITY32: kernel = av1_fidentity32_c; break;
+    default: assert(0); break;
+  }
+
+  return kernel;
+}
+
+// Fills the per-stage range tables handed to the column and row kernels.
+//   stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1
+//   stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1
+// where shift is cfg->shift. At most MAX_TXFM_STAGE_NUM entries of each output
+// array are written, even if cfg reports more stages.
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+                             const TXFM_2D_FLIP_CFG *cfg, int bd) {
+  const int8_t *const shift = cfg->shift;
+  int stage;
+
+  // Column pass. The explicit MAX_TXFM_STAGE_NUM bound keeps compilers from
+  // emitting array-bounds warnings.
+  stage = 0;
+  while (stage < cfg->stage_num_col && stage < MAX_TXFM_STAGE_NUM) {
+    stage_range_col[stage] = cfg->stage_range_col[stage] + shift[0] + bd + 1;
+    ++stage;
+  }
+
+  // Row pass, bounded the same way; it additionally accounts for shift[1].
+  stage = 0;
+  while (stage < cfg->stage_num_row && stage < MAX_TXFM_STAGE_NUM) {
+    stage_range_row[stage] =
+        cfg->stage_range_row[stage] + shift[0] + shift[1] + bd + 1;
+    ++stage;
+  }
+}
+
+// Generic forward 2-D transform: a column pass followed by a row pass.
+//   input  - source samples, read as input[r * stride + c].
+//   output - receives the result at output[c * txfm_size_row + r]; it is also
+//            used as scratch space during the column pass.
+//   stride - distance, in elements, between consecutive rows of input.
+//   cfg    - supplies the transform size, shifts, flip flags, cos bits, stage
+//            counts and the 1-D transform types for both passes.
+//   buf    - intermediate storage; txfm_size_row * txfm_size_col entries are
+//            written, laid out as buf[r * txfm_size_col + c].
+//   bd     - forwarded to av1_gen_fwd_stage_range() (presumably the bit
+//            depth - confirm against callers).
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf, int bd) {
+ int c, r;
+ // The block width and height are looked up from cfg->tx_size through the
+ // tx_size_wide / tx_size_high tables, so rectangular sizes get distinct
+ // column and row counts; for square sizes both values are equal.
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
+ // Take the shift from the larger dimension in the rectangular case.
+ // shift[0] is applied before the column transform, shift[1] after it and
+ // shift[2] after the row transform (each negated at the call site).
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ // use output buffer as temp buffer: one column of input is staged at
+ // output[0..], and the column kernel writes right after it.
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size_row;
+
+ // Columns
+ for (c = 0; c < txfm_size_col; ++c) {
+ // Gather column c of the input, optionally reading the rows bottom-up.
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+ }
+ av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ // Scatter the transformed column into buf, optionally mirrored so that
+ // column c lands at column (txfm_size_col - c - 1).
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ buf[r * txfm_size_col + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip from left to right
+ buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+ }
+ }
+
+ DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]);
+
+ // Rows
+ for (r = 0; r < txfm_size_row; ++r) {
+ txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row,
+ stage_range_row);
+ av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]);
+ if (abs(rect_type) == 1) {
+ // Multiply everything by Sqrt2 if the transform is rectangular and the
+ // size difference is a factor of 2.
+ for (c = 0; c < txfm_size_col; ++c) {
+ row_buffer[c] =
+ round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits);
+ }
+ }
+ // Store row r transposed: element c of the row goes to
+ // output[c * txfm_size_row + r].
+ for (c = 0; c < txfm_size_col; ++c) {
+ output[c * txfm_size_row + r] = row_buffer[c];
+ }
+ }
+}
+
+// Forward 2-D transform entry point for TX_4X8: fetches the configuration for
+// tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a 4 * 8
+// entry stack buffer (declared via DECLARE_ALIGNED(32, ...)) as intermediate.
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_8X4: fetches the configuration for
+// tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with an 8 * 4
+// entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_8X16: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with an
+// 8 * 16 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)).
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_16X8: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 16 * 8 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_16X32: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 16 * 32 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)).
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_32X16: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 32 * 16 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_4X16: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 4 * 16 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)).
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_16X4: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 16 * 4 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_8X32: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 32 * 8 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)).
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_32X8: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 32 * 8 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_4X4: fetches the configuration for
+// tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a 4 * 4
+// entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_8X8: fetches the configuration for
+// tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with an 8 * 8
+// entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_16X16: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 16 * 16 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_32X32: fetches the configuration
+// for tx_type with av1_get_fwd_txfm_cfg() and runs fwd_txfm2d_c() with a
+// 32 * 32 entry stack buffer as intermediate storage.
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+// Forward 2-D transform entry point for TX_64X64. Runs fwd_txfm2d_c() with a
+// 64 * 64 entry stack buffer, then keeps only the first 32 entries of each of
+// the first 32 lines (64 entries apart) of the result and compacts those runs
+// so that run `col` starts at output[col * 32].
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out top-right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ // Line 0 is already in place, so the copy starts at col = 1.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+// Forward 2-D transform entry point for TX_32X64. Runs fwd_txfm2d_c() with a
+// 32 * 64 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)), then
+// keeps only the first 32 entries of each of the 32 lines (64 entries apart)
+// and compacts those runs so that run `col` starts at output[col * 32].
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ // Line 0 is already in place, so the copy starts at col = 1.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+// Forward 2-D transform entry point for TX_64X32. Runs fwd_txfm2d_c() with a
+// 64 * 32 entry stack buffer, then clears the 32 * 32 entries that start at
+// output[32 * 32], leaving the first 32 * 32 entries untouched.
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 32x32 area.
+ memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
+// Forward 2-D transform entry point for TX_16X64. Runs fwd_txfm2d_c() with a
+// 64 * 16 entry stack buffer (declared via DECLARE_ALIGNED(32, ...)), then
+// keeps only the first 32 entries of each of the 16 lines (64 entries apart)
+// and compacts those runs so that run `row` starts at output[row * 32].
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x16 area.
+ for (int row = 0; row < 16; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ // Line 0 is already in place, so the copy starts at row = 1.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
+// Forward 2-D transform for TX_64X16. Runs the generic transform and then
+// clears the 16 * 32 output entries that follow the first 16 * 32, so only
+// the leading 16 * 32 coefficients are kept.
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+                            TX_TYPE tx_type, int bd) {
+  // Scratch buffer is aligned the same way as in the 32x64 / 16x64 variants.
+  DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+  // Note: no repacking needed here.
+}
+
+// Per-transform-size shift triples for the forward 2-D transform. Each table
+// is published through av1_fwd_txfm_shift_ls[] and ends up in cfg->shift.
+// NOTE(review): the three entries are presumably the shifts applied before
+// the first pass, between passes and after the second pass, with negative
+// values meaning a right shift -- confirm against fwd_txfm2d_c().
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+// Lookup of the shift triple by transform size; av1_get_fwd_txfm_cfg() indexes
+// it with its tx_size argument, so the entry order must follow TX_SIZE.
+const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+ fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
+ fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
+ fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+ fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
+ fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
+};
+
+// cos_bit value for the column pass, indexed [txw_idx][txh_idx] as returned by
+// get_txw_idx() / get_txh_idx(); copied into cfg->cos_bit_col by
+// av1_get_fwd_txfm_cfg().
+// NOTE(review): the 0 entries presumably mark width/height combinations that
+// are not valid transform sizes -- confirm.
+const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 13, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 13, 12, 13 },
+ { 0, 13, 13, 12, 13 },
+ { 0, 0, 13, 12, 13 }
+ };
+
+// cos_bit value for the row pass, with the same [txw_idx][txh_idx] indexing;
+// copied into cfg->cos_bit_row by av1_get_fwd_txfm_cfg().
+const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 12, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 12, 13, 12 },
+ { 0, 12, 13, 12, 11 },
+ { 0, 0, 12, 11, 10 }
+ };
+
+// Per-stage range values for each 1-D forward transform type, stored doubled
+// ("mult2"): set_fwd_txfm_non_scale_range() halves them with rounding
+// ((x + 1) >> 1) when filling the cfg stage_range arrays. Each array has one
+// entry per transform stage.
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10,
+ 11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+// Range table lookup, indexed by cfg->txfm_type_col / cfg->txfm_type_row; the
+// entry order must therefore match the TXFM type enumeration.
+static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+ fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
+ fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
+ fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+ fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
+// Fills cfg->stage_range_col and cfg->stage_range_row from the doubled range
+// tables. Column stages use their own table entry; row stages add the final
+// column-stage value to their table entry. Both are halved with rounding.
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+
+  const int num_col_stages = cfg->stage_num_col;
+  const int8_t *const col_mult2 =
+      fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+  // The explicit MAX_TXFM_STAGE_NUM bound keeps -Wstringop-overflow quiet.
+  for (int s = 0; s < num_col_stages && s < MAX_TXFM_STAGE_NUM; ++s) {
+    cfg->stage_range_col[s] = (col_mult2[s] + 1) >> 1;
+  }
+
+  const int num_row_stages = cfg->stage_num_row;
+  const int8_t *const row_mult2 =
+      fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+  // The explicit MAX_TXFM_STAGE_NUM bound keeps -Wstringop-overflow quiet.
+  for (int s = 0; s < num_row_stages && s < MAX_TXFM_STAGE_NUM; ++s) {
+    const int sum_mult2 = col_mult2[num_col_stages - 1] + row_mult2[s];
+    cfg->stage_range_row[s] = (sum_mult2 + 1) >> 1;
+  }
+}
+
+// Populates `cfg` for a forward 2-D transform of the given type and size:
+// flip flags, shift triple, cos bits, 1-D transform types, stage counts and
+// per-stage ranges. `cfg` must not be NULL.
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+  cfg->tx_size = tx_size;
+  set_flip_cfg(tx_type, cfg);
+
+  // Size-dependent parameters.
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  cfg->shift = av1_fwd_txfm_shift_ls[tx_size];
+  cfg->cos_bit_col = av1_fwd_cos_bit_col[w_idx][h_idx];
+  cfg->cos_bit_row = av1_fwd_cos_bit_row[w_idx][h_idx];
+
+  // The column transform is selected by the height index and the vertical
+  // 1-D type; the row transform by the width index and the horizontal type.
+  const TX_TYPE_1D col_type_1d = vtx_tab[tx_type];
+  const TX_TYPE_1D row_type_1d = htx_tab[tx_type];
+  cfg->txfm_type_col = av1_txfm_type_ls[h_idx][col_type_1d];
+  assert(cfg->txfm_type_col != TXFM_TYPE_INVALID);
+  cfg->txfm_type_row = av1_txfm_type_ls[w_idx][row_type_1d];
+  assert(cfg->txfm_type_row != TXFM_TYPE_INVALID);
+
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+  set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_ml_partition_models.h b/third_party/aom/av1/encoder/av1_ml_partition_models.h
new file mode 100644
index 0000000000..2572b138d5
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_ml_partition_models.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(kyslov): Replace with proper weights after training AV1 models
+
+// Number of input features shared by all models in this header; undefined
+// again at the end of the file.
+#define FEATURES 6
+// "64" model: FEATURES inputs -> 8 hidden nodes -> 1 output.
+// NOTE(review): the 64 suffix presumably refers to the block size the model is
+// applied to -- confirm at the use site.
+// Hidden-layer weights (FEATURES * 8 values).
+static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+ 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f,
+ 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f,
+ -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f,
+ 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f,
+ -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f,
+ 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f,
+ -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f,
+ 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f,
+ 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f,
+ -0.33805936f, -0.02449707f, 0.67203692f
+};
+
+// Hidden-layer biases, one per hidden node.
+static const float av1_var_part_nn_bias_64_layer0[8] = {
+ 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f,
+ 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f
+};
+
+// Output-layer weights, one per hidden node.
+static const float av1_var_part_nn_weights_64_layer1[8] = {
+ -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f,
+ 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f
+};
+
+// Output-layer bias.
+static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f };
+
+// One value per input feature.
+// NOTE(review): presumably the mean / variance used to normalize each feature
+// before inference -- confirm against the caller.
+static const float av1_var_part_means_64[FEATURES] = {
+ 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f
+};
+static const float av1_var_part_vars_64[FEATURES] = {
+ 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f
+};
+
+// Network description tying the tables above together.
+static const NN_CONFIG av1_var_part_nnconfig_64 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_64_layer0,
+ av1_var_part_nn_weights_64_layer1,
+ }, // weights, in layer order
+ {
+ av1_var_part_nn_bias_64_layer0,
+ av1_var_part_nn_bias_64_layer1,
+ }, // biases, in layer order
+};
+
+// "32" model: FEATURES inputs -> 8 hidden nodes -> 1 output.
+// NOTE(review): the 32 suffix presumably refers to the block size the model is
+// applied to -- confirm at the use site.
+// Hidden-layer weights (FEATURES * 8 values).
+static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+ 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f,
+ 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f,
+ -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f,
+ 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f,
+ -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f,
+ -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f,
+ 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f,
+ -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f,
+ -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f,
+ -1.36576732f, -1.30257508f, -1.30575106f
+};
+
+// Hidden-layer biases, one per hidden node.
+static const float av1_var_part_nn_bias_32_layer0[8] = {
+ -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f,
+ -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f
+};
+
+// Output-layer weights, one per hidden node.
+static const float av1_var_part_nn_weights_32_layer1[8] = {
+ 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f,
+ -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f
+};
+
+// Output-layer bias.
+static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f };
+
+// One value per input feature.
+// NOTE(review): presumably the mean / variance used to normalize each feature
+// before inference -- confirm against the caller.
+static const float av1_var_part_means_32[FEATURES] = {
+ 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f
+};
+
+static const float av1_var_part_vars_32[FEATURES] = {
+ 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f
+};
+
+// Network description tying the tables above together.
+static const NN_CONFIG av1_var_part_nnconfig_32 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_32_layer0,
+ av1_var_part_nn_weights_32_layer1,
+ }, // weights, in layer order
+ {
+ av1_var_part_nn_bias_32_layer0,
+ av1_var_part_nn_bias_32_layer1,
+ }, // biases, in layer order
+};
+
+// "16" model: FEATURES inputs -> 8 hidden nodes -> 1 output.
+// NOTE(review): the 16 suffix presumably refers to the block size the model is
+// applied to -- confirm at the use site.
+// Hidden-layer weights (FEATURES * 8 values).
+static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+ 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f,
+ 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f,
+ -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f,
+ 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f,
+ 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f,
+ -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f,
+ 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f,
+ 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f,
+ 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f,
+ 0.15059544f, 0.09596755f, 0.26247133f
+};
+
+// Hidden-layer biases, one per hidden node.
+static const float av1_var_part_nn_bias_16_layer0[8] = {
+ 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f,
+ 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f
+};
+
+// Output-layer weights, one per hidden node.
+static const float av1_var_part_nn_weights_16_layer1[8] = {
+ -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f,
+ -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f
+};
+
+// Output-layer bias.
+static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f };
+
+// One value per input feature.
+// NOTE(review): presumably the mean / variance used to normalize each feature
+// before inference -- confirm against the caller.
+static const float av1_var_part_means_16[FEATURES] = {
+ 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f
+};
+
+static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f,
+ 0.01958579f, 0.02437927f,
+ 0.02420755f, 0.0192003f };
+
+// Network description tying the tables above together.
+static const NN_CONFIG av1_var_part_nnconfig_16 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_16_layer0,
+ av1_var_part_nn_weights_16_layer1,
+ }, // weights, in layer order
+ {
+ av1_var_part_nn_bias_16_layer0,
+ av1_var_part_nn_bias_16_layer1,
+ }, // biases, in layer order
+};
+
+#undef FEATURES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.c b/third_party/aom/av1/encoder/av1_noise_estimate.c
new file mode 100644
index 0000000000..25007bb6d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+// Returns 1 when noise estimation applies to the current layer: always when
+// SVC is off, otherwise only on the top spatial layer.
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
+  if (!cpi->ppi->use_svc) return 1;
+  const int top_spatial_layer = cpi->svc.number_spatial_layers - 1;
+  return cpi->svc.spatial_layer_id == top_spatial_layer;
+}
+#endif
+
+// Resets the noise estimator and picks resolution-dependent defaults: the
+// starting level and the noise threshold both step up with the frame area.
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+  const int64_t num_pixels = (int64_t)width * height;
+  const int at_least_720p = num_pixels >= 1280 * 720;
+
+  // Threshold by resolution tier, from largest to smallest.
+  int thresh;
+  if (num_pixels >= 1920 * 1080) {
+    thresh = 200;
+  } else if (at_least_720p) {
+    thresh = 140;
+  } else if (num_pixels >= 640 * 360) {
+    thresh = 115;
+  } else {
+    thresh = 90;
+  }
+
+  ne->enabled = 0;
+  ne->level = at_least_720p ? kLow : kLowLow;
+  ne->value = 0;
+  ne->count = 0;
+  ne->last_w = 0;
+  ne->last_h = 0;
+  ne->thresh = thresh;
+  ne->num_frames_estimate = 15;
+  // Fast-adaptation trigger sits at 1.5x the base threshold.
+  ne->adapt_thresh = (3 * thresh) >> 1;
+}
+
+// Decides whether noise estimation should run for the current frame.
+// Returns 0 for high-bitdepth sequences (when that build option is on),
+// 1 when temporal denoising is active on a large enough frame, and otherwise
+// 1 only for the 1-pass CBR / cyclic-refresh configuration described below.
+static int enable_noise_estimation(AV1_COMP *const cpi) {
+  const int resize_pending = is_frame_resize_pending(cpi);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (cpi->common.seq_params->use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+      cpi->common.width >= 320 && cpi->common.height >= 180)
+    return 1;
+#endif
+  // Compute the area in 64 bits, as av1_noise_estimate_init() does, so that
+  // very large frame dimensions cannot overflow a signed int product.
+  const int64_t area = (int64_t)cpi->common.width * cpi->common.height;
+
+  // Only allow noise estimate under certain encoding mode.
+  // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
+  // Not enabled for SVC mode and screen_content_mode.
+  // Not enabled for low resolutions.
+  if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+      resize_pending == 0 && !cpi->ppi->use_svc &&
+      cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && area >= 640 * 360)
+    return 1;
+  return 0;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// Copies the luma plane of `src` into `dest` row by row, honouring each
+// buffer's own stride. Both buffers must have the same luma dimensions.
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+                       const YV12_BUFFER_CONFIG *const src) {
+  const uint8_t *src_row = src->y_buffer;
+  uint8_t *dst_row = dest->y_buffer;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  int row = 0;
+  while (row < dest->y_height) {
+    memcpy(dst_row, src_row, dest->y_width);
+    dst_row += dest->y_stride;
+    src_row += src->y_stride;
+    ++row;
+  }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+// Maps the running noise value onto a discrete level using multiples of
+// ne->thresh: above 2x -> kHigh, above 1x -> kMedium, above 0.5x -> kLow,
+// otherwise kLowLow.
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  if (ne->value > (ne->thresh << 1)) return kHigh;
+  if (ne->value > ne->thresh) return kMedium;
+  if (ne->value > (ne->thresh >> 1)) return kLow;
+  return kLowLow;
+}
+
+// Updates cpi->noise_estimate from the luma difference between the current
+// source frame and the previous source frame.
+//
+// Three outcomes are possible:
+//  1. Early return without a new estimate when estimation is disabled, the
+//     frame counter is not a multiple of frame_period, no last source exists,
+//     or (single spatial layer only) the resolution changed.
+//  2. Early return with the level forced to kLowLow when the content is judged
+//     to be high motion.
+//  3. Otherwise a histogram of 16x16 block variances is built over blocks with
+//     consecutive zero/low motion, smoothed, and its peak bin is blended into
+//     ne->value. Once ne->count reaches ne->num_frames_estimate the level is
+//     re-derived from ne->value.
+// When temporal denoising is compiled in and active, the current source is
+// also saved as the denoiser's last source.
+void av1_update_noise_estimate(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ // Estimate of noise level every frame_period frames.
+ int frame_period = 8;
+ int thresh_consec_zeromv = 2;
+ int frame_counter = cm->current_frame.frame_number;
+ // Estimate is between current source and last source.
+ YV12_BUFFER_CONFIG *last_source = cpi->last_source;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+ last_source = &cpi->denoiser.last_source;
+ // Tune these thresholds for different resolutions when denoising is
+ // enabled.
+ // NOTE(review): this assigns the same value thresh_consec_zeromv was
+ // initialized with, so it currently has no effect.
+ if (cm->width > 640 && cm->width <= 1920) {
+ thresh_consec_zeromv = 2;
+ }
+ }
+#endif
+ ne->enabled = enable_noise_estimation(cpi);
+ // With multiple spatial layers, count superframes instead of frames.
+ if (cpi->svc.number_spatial_layers > 1)
+ frame_counter = cpi->svc.current_superframe;
+ if (!ne->enabled || frame_counter % frame_period != 0 ||
+ last_source == NULL ||
+ (cpi->svc.number_spatial_layers == 1 &&
+ (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+ // Remember the resolution so a later size change can be detected.
+ if (last_source != NULL) {
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ }
+ return;
+ } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+ cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+ // Force noise estimation to 0 and denoiser off if content has high motion.
+ ne->level = kLowLow;
+ ne->count = 0;
+ ne->num_frames_estimate = 10;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->svc.current_superframe > 1) {
+ av1_denoiser_set_noise_level(cpi, ne->level);
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+ }
+#endif
+ return;
+ } else {
+ // Variance histogram: each bin covers bin_size variance units.
+ unsigned int bin_size = 100;
+ unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+ unsigned int hist_avg[MAX_VAR_HIST_BINS];
+ unsigned int max_bin = 0;
+ unsigned int max_bin_count = 0;
+ unsigned int bin_cnt;
+ BLOCK_SIZE bsize = BLOCK_16X16;
+ // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+ // been encoded as zero/small mv at least x consecutive frames, compute
+ // the variance to update estimate of noise in the source.
+ const uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const uint8_t *last_src_y = last_source->y_buffer;
+ const int last_src_ystride = last_source->y_stride;
+ int mi_row, mi_col;
+ int num_low_motion = 0;
+ int frame_low_motion = 1;
+ // First pass: count the consec_zero_mv[] entries above the threshold to
+ // decide whether the frame as a whole is low motion.
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+ num_low_motion++;
+ }
+ }
+ if (num_low_motion <
+ (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3))
+ frame_low_motion = 0;
+ // Second pass: walk every mi position so that src_y / last_src_y track the
+ // current position (4 pixels per mi unit), sampling only selected blocks.
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+ // 16x16 blocks, 1/4 sample of frame.
+ if (mi_row % 8 == 0 && mi_col % 8 == 0 &&
+ mi_row < mi_params->mi_rows - 3 &&
+ mi_col < mi_params->mi_cols - 3) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + (mi_params->mi_cols >> 1);
+ int bl_index3 = bl_index2 + 1;
+ int consec_zeromv =
+ AOMMIN(cpi->consec_zero_mv[bl_index],
+ AOMMIN(cpi->consec_zero_mv[bl_index1],
+ AOMMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+ // Only consider blocks that are likely steady background. i.e, have
+ // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+ // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+ // 4 sub-blocks for 16x16 block. And exclude this frame if
+ // high_source_sad is true (i.e., scene/content change).
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+ !cpi->rc.high_source_sad) {
+ unsigned int sse;
+ // Compute variance between co-located blocks from current and
+ // last input frames.
+ unsigned int variance = cpi->ppi->fn_ptr[bsize].vf(
+ src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+ unsigned int hist_index = variance / bin_size;
+ if (hist_index < MAX_VAR_HIST_BINS)
+ hist[hist_index]++;
+ else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+ hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail
+ }
+ }
+ src_y += 4;
+ last_src_y += 4;
+ }
+ // Advance to the start of the next mi row (4 pixel rows down).
+ src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
+ last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
+ }
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ // Adjust histogram to account for effect that histogram flattens
+ // and shifts to zero as scene darkens.
+ if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+ hist[0] = 0;
+ hist[1] >>= 2;
+ hist[2] >>= 2;
+ hist[3] >>= 2;
+ hist[4] >>= 1;
+ hist[5] >>= 1;
+ hist[6] = 3 * hist[6] >> 1;
+ hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+ }
+
+ // Average hist[] and find largest bin
+ for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+ if (bin_cnt == 0)
+ hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+ hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+ hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+ (hist[bin_cnt + 1] >> 1) + 2) >>
+ 2;
+ else
+ hist_avg[bin_cnt] =
+ (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+ 2;
+
+ if (hist_avg[bin_cnt] > max_bin_count) {
+ max_bin_count = hist_avg[bin_cnt];
+ max_bin = bin_cnt;
+ }
+ }
+ // Blend the new peak into the running value: 3/4 old + 1/4 new.
+ // Scale by 40 to work with existing thresholds
+ ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+ // Quickly increase VNR strength when the noise level increases suddenly.
+ if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+ ne->count = ne->num_frames_estimate;
+ } else {
+ ne->count++;
+ }
+ if (ne->count == ne->num_frames_estimate) {
+ // Reset counter and check noise level condition.
+ ne->num_frames_estimate = 30;
+ ne->count = 0;
+ ne->level = av1_noise_estimate_extract_level(ne);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ av1_denoiser_set_noise_level(cpi, ne->level);
+#endif
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+}
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.h b/third_party/aom/av1/encoder/av1_noise_estimate.h
new file mode 100644
index 0000000000..85530666f6
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of bins in the variance histogram built by
+// av1_update_noise_estimate().
+#define MAX_VAR_HIST_BINS 20
+
+// Discrete noise classes, listed from least to most noisy.
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+// State of the source-noise estimator.
+typedef struct noise_estimate {
+ // Non-zero when estimation is enabled for the current frame.
+ int enabled;
+ // Most recently derived noise class.
+ NOISE_LEVEL level;
+ // Running (smoothed) noise value that is compared against thresh.
+ int value;
+ // Resolution-dependent base threshold chosen at init time.
+ int thresh;
+ // Threshold (1.5 * thresh) above which the level is re-derived immediately.
+ int adapt_thresh;
+ // Number of estimates accumulated since the level was last re-derived.
+ int count;
+ // Frame width / height recorded by the last update.
+ int last_w;
+ int last_h;
+ // Number of estimates to accumulate before re-deriving the level.
+ int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+// Forward declaration; the full encoder type is not needed by this header.
+struct AV1_COMP;
+
+// Resets `ne` and selects resolution-dependent thresholds for a frame of the
+// given width and height.
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+// Converts ne->value into a NOISE_LEVEL by comparing it with multiples of
+// ne->thresh.
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+// Updates the noise estimate stored in `cpi` from the current and previous
+// source frames.
+void av1_update_noise_estimate(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 0000000000..110d17f434
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+// "Quantizes" a block that is skipped entirely: both coefficient buffers are
+// cleared and the end-of-block position is reported as 0.
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  *eob_ptr = 0;
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(dqcoeff_ptr[0]));
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(qcoeff_ptr[0]));
+}
+
+// Quantizes `coeff_count` coefficients in scan order without a quantization
+// matrix. The two-entry parameter arrays hold the value for the coefficient
+// at position 0 in entry [0] and for every other position in entry [1].
+// Both output buffers are cleared first; the return value is the
+// end-of-block position (scan index of the last non-zero level, plus one).
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+                               const int16_t dequant_ptr[2],
+                               const int16_t round_ptr[2], int log_scale,
+                               const int16_t *scan, int coeff_count,
+                               const tran_low_t *coeff_ptr,
+                               tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr) {
+  memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr));
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  int eob = 0;
+  for (int i = 0; i < coeff_count; i++) {
+    const int rc = scan[i];
+    const int is_ac = (rc != 0);
+    const int32_t thresh = (int32_t)(dequant_ptr[is_ac]);
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+    // Coefficients below the threshold quantize to zero; outputs stay cleared.
+    if ((abs_coeff << (1 + log_scale)) < thresh) continue;
+
+    abs_coeff = clamp64(abs_coeff + rounding[is_ac], INT16_MIN, INT16_MAX);
+    const int level = (int)((abs_coeff * quant_ptr[is_ac]) >> (16 - log_scale));
+    if (level == 0) continue;
+
+    qcoeff_ptr[rc] = (level ^ coeff_sign) - coeff_sign;
+    const tran_low_t abs_dqcoeff = (level * dequant_ptr[is_ac]) >> log_scale;
+    dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+    eob = i + 1;
+  }
+  return eob;
+}
+
+// Shared C implementation of the "fp" quantizer.
+//
+// When neither qm_ptr nor iqm_ptr is supplied, the work is delegated to
+// av1_quantize_fp_no_qmatrix(). Otherwise each coefficient is weighted by the
+// quantization matrix (a missing matrix is treated as the flat value
+// 1 << AOM_QM_BITS) before quantization, and the dequantizer is scaled by the
+// inverse matrix. *eob_ptr receives the scan index of the last non-zero level
+// plus one. zbin_ptr, quant_shift_ptr and iscan are accepted for signature
+// compatibility and are not used.
+static void quantize_fp_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, int log_scale) {
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  if (qm_ptr == NULL && iqm_ptr == NULL) {
+    // av1_quantize_fp_no_qmatrix() clears both output buffers and derives its
+    // own rounding values, so neither is repeated here.
+    *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+                                          log_scale, scan, (int)n_coeffs,
+                                          coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
+    return;
+  }
+
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  int i, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Quantization pass over all coefficients in scan order.
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+    const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+    // Dequantizer scaled by the inverse matrix weight, with rounding.
+    const int dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+        AOM_QM_BITS;
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32 = 0;
+    if (abs_coeff * wt >=
+        (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+      abs_coeff += rounding[rc != 0];
+      abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+                    (16 - log_scale + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+    }
+
+    if (tmp32) eob = i;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of the "fp" quantizer helper.
+//
+// With a quantization matrix (qm_ptr or iqm_ptr non-NULL) each coefficient is
+// weighted before quantization; without one a simpler unweighted path is
+// used. Unlike paths that memset the outputs up front, every visited position
+// is written explicitly (zero when below the threshold). *eob_ptr receives the
+// scan index of the last non-zero level plus one. zbin_ptr, quant_shift_ptr
+// and iscan are unused.
+static void highbd_quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i;
+ int eob = -1;
+ const int shift = 16 - log_scale;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ if (qm_ptr || iqm_ptr) {
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ // NOTE(review): no zero_flag variable exists in this function; the comment
+ // above looks inherited from another quantizer.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ // A missing matrix is treated as the flat weight 1 << AOM_QM_BITS.
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ // Dequantizer scaled by the inverse matrix weight, with rounding.
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int abs_qcoeff = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ const int64_t tmp =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_qcoeff =
+ (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = i;
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ } else {
+ // No quantization matrix: precompute the scaled rounding values for the
+ // position-0 entry ([0]) and all other positions ([1]).
+ const int log_scaled_round_arr[2] = {
+ ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+ };
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int rc01 = (rc != 0);
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int log_scaled_round = log_scaled_round_arr[rc01];
+ if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+ const int quant = quant_ptr[rc01];
+ const int dequant = dequant_ptr[rc01];
+ // Widen before adding/multiplying so the product is formed in 64 bits.
+ const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int abs_qcoeff = (int)((tmp * quant) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ if (abs_qcoeff) eob = i;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// C reference "fp" quantizer entry point: no quantization matrices and a
+// log_scale of 0.
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 0;
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, /*qm_ptr=*/NULL, /*iqm_ptr=*/NULL,
+                       log_scale);
+}
+
+void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs,  // Quantizes 16-bit coefficients in scan order; fills qcoeff_ptr, dqcoeff_ptr and *eob_ptr.
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)iscan;  // iscan is not used by this implementation.
+ int eob = -1;  // Scan index of the last nonzero quantized value; stays -1 if there is none.
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Quantization pass: every coefficient is visited in scan order (no zero-bin
+ // skip). Table index 0 is used for position rc == 0, index 1 otherwise.
+ for (int i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);  // presumably a 0 / -1 sign mask; AOMSIGN is defined elsewhere - TODO confirm
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Magnitude of coeff, given a 0 / -1 mask.
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);  // Add rounding, then limit to the int16_t range.
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;  // Scale by the quant table entry and drop the low 16 bits.
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // Re-apply the sign of the input coefficient.
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp) eob = i;
+ }
+ *eob_ptr = eob + 1;  // Last nonzero scan index plus one; 0 when everything quantized to zero.
+}
+
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // C "fp" quantizer entry point: no quant matrices, log_scale 1.
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,  // Every argument is forwarded unchanged to the shared helper.
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);  // NULL, NULL: no qm/iqm matrices; trailing 1 is log_scale.
+}
+
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // C "fp" quantizer entry point: no quant matrices, log_scale 2.
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,  // Every argument is forwarded unchanged to the shared helper.
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);  // NULL, NULL: no qm/iqm matrices; trailing 2 is log_scale.
+}
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Picks the "fp" quantizer variant from qparam and runs it with plane p's fp tables.
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Both matrices present: use the C helper, which takes them as arguments.
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {  // No matrices: one dedicated function per log_scale value.
+ case 0:
+ av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);  // Only log_scale 0, 1 and 2 are handled; outputs are not written on this path.
+ }
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Picks the "b" quantizer variant (adaptive or regular, with or without matrices) from qparam.
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {  // Adaptive path; compiled out when CONFIG_REALTIME_ONLY is set.
+ // TODO(sarahparker) These quantize_b optimizations need SIMD
+ // implementations
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Both matrices present: C helper that accepts them.
+ aom_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {  // No matrices: one dedicated function per log_scale value.
+ case 0:
+ aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);  // Only log_scale 0, 1 and 2 are handled.
+ }
+ }
+ return;  // The adaptive path is complete; skip the regular quantizers below.
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Regular path with both matrices: C helper that accepts them.
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {  // Regular path without matrices: one function per log_scale value.
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);  // Only log_scale 0, 1 and 2 are handled.
+ }
+ }
+}
+
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,  // Quantizes only coefficient 0; every other output entry is left zero.
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,  // Despite its name, dequant_ptr is a scalar step size here.
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int rc = 0;  // Fixed position: only index 0 is processed.
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);  // presumably a 0 / -1 sign mask; AOMSIGN is defined elsewhere - TODO confirm
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Magnitude of coeff, given a 0 / -1 mask.
+ int64_t tmp;
+ int eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // Clear all n_coeffs outputs first.
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);  // Default weight when no matrix is supplied.
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),  // rc is 0, so this reads round_ptr[0].
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));  // Quantized magnitude.
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // Re-apply the sign.
+ dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;  // Step size weighted by iwt, with rounding.
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;  // 1 if the coefficient quantized to nonzero, otherwise 0.
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Runs quantize_dc() with plane p's tables and qparam's matrices / log_scale.
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;  // Always 0, so the coefficient is always quantized.
+ (void)sc;  // The scan order is not needed for a single coefficient.
+ assert(qparam->log_scale >= 0 && qparam->log_scale < (3));
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,  // Uses round_QTX with the first entries of quant_fp_QTX and dequant_QTX.
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+ eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,  // High-bitdepth "fp" dispatcher: matrix-aware C helper or av1_highbd_quantize_fp.
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Both matrices present: C helper that accepts them.
+ highbd_quantize_fp_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,  // log_scale is an argument here, so no switch is needed.
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qparam->log_scale);
+ }
+}
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,  // High-bitdepth "b" dispatcher: adaptive or regular, with or without matrices.
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {  // Adaptive path; compiled out when CONFIG_REALTIME_ONLY is set.
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Both matrices present: C helper that accepts them.
+ aom_highbd_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {  // No matrices: one dedicated function per log_scale value.
+ case 0:
+ aom_highbd_quantize_b_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);  // Only log_scale 0, 1 and 2 are handled.
+ }
+ }
+ return;  // The adaptive path is complete; skip the regular quantizers below.
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {  // Regular path with both matrices: C helper that accepts them.
+ aom_highbd_quantize_b_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {  // Regular path without matrices: one function per log_scale value.
+ case 0:
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);  // Only log_scale 0, 1 and 2 are handled.
+ }
+ }
+}
+
+static INLINE void highbd_quantize_dc(  // Quantizes only coefficient 0 using 64-bit intermediates; other outputs stay zero.
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,  // Despite its name, dequant_ptr is a scalar step size here.
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));  // Clear all n_coeffs outputs first.
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);  // Default weight when no matrix is supplied.
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = AOMSIGN(coeff);  // presumably a 0 / -1 sign mask; AOMSIGN is defined elsewhere - TODO confirm
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // Magnitude of coeff, given a 0 / -1 mask.
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);  // Rounded magnitude; no clamp is applied here.
+ const int64_t tmpw = tmp * wt;
+ const int abs_qcoeff =
+ (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));  // Quantized magnitude.
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);  // Re-apply the sign.
+ const int dequant =
+ (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;  // Step size weighted by iwt, with rounding.
+
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;  // 1 if the coefficient quantized to nonzero, otherwise 0.
+}
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,  // Runs highbd_quantize_dc() with plane p's tables and qparam's matrices / log_scale.
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;  // Always 0, so the coefficient is always quantized.
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ (void)sc;  // The scan order is not needed for a single coefficient.
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,  // Uses round_QTX with the first entries of quant_fp_QTX and dequant_QTX.
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr,
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,  // C high-bitdepth "fp" entry point: no quant matrices, caller-supplied log_scale.
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ int log_scale) {
+ highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,  // Every argument is forwarded unchanged to the shared helper.
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,  // NULL, NULL: no qm/iqm matrices.
+ log_scale);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {  // Derives a (quant, shift) pair from step size d; d is used as a divisor, so it must be nonzero.
+ uint32_t t;
+ int l, m;
+ t = d;
+ l = get_msb(t);  // presumably the index of the highest set bit of t - get_msb is defined elsewhere, TODO confirm
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));  // Store m with 2^16 subtracted, truncated to int16_t.
+ *shift = 1 << (16 - l);
+}
+
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {  // Returns the zbin factor: 64 for q == 0, else 84 or 80 depending on the DC step size.
+ const int quant = av1_dc_quant_QTX(q, 0, bit_depth);  // DC step size for q with a delta of 0.
+ switch (bit_depth) {  // The 84/80 threshold is 4x larger for each higher bit depth (148, 592, 2368).
+ case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;  // Reached only for an unsupported bit depth when asserts are disabled.
+ }
+}
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,  // Fills every row (q in [0, QINDEX_RANGE)) of the Y/U/V tables in *quants and *deq.
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq) {
+ int i, q, quant_QTX;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ const int qzbin_factor = get_qzbin_factor(q, bit_depth);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ for (i = 0; i < 2; ++i) {  // i == 0: DC entry, i == 1: AC entry.
+ const int qrounding_factor_fp = 64;  // With the >> 7 below this yields half the step size.
+ // y quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, 0, bit_depth);  // Luma AC always uses a delta of 0.
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+ quant_QTX);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;  // 2^16 divided by the step size.
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->y_dequant_QTX[q][i] = quant_QTX;  // The dequant table stores the step size itself.
+
+ // u quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+ invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+ quant_QTX);
+ quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->u_dequant_QTX[q][i] = quant_QTX;
+
+ // v quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+ invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+ quant_QTX);
+ quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->v_dequant_QTX[q][i] = quant_QTX;
+ }
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
+ quants->y_quant[q][i] = quants->y_quant[q][1];  // Entries 2..7 replicate the AC entry (index 1).
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+
+ quants->u_quant[q][i] = quants->u_quant[q][1];
+ quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+ quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+ quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+ quants->u_zbin[q][i] = quants->u_zbin[q][1];
+ quants->u_round[q][i] = quants->u_round[q][1];
+ deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+
+ quants->v_quant[q][i] = quants->v_quant[q][1];
+ quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+ quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+ quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+ quants->v_zbin[q][i] = quants->v_zbin[q][1];
+ quants->v_round[q][i] = quants->v_round[q][1];
+ deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+ }
+ }
+}
+
+static INLINE bool deltaq_params_have_changed(  // True if any of the five dc/ac delta-q values differs between the two structs.
+ const DeltaQuantParams *prev_deltaq_params,
+ const CommonQuantParams *quant_params) {
+ return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q ||
+ prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q ||
+ prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q ||
+ prev_deltaq_params->u_ac_delta_q != quant_params->u_ac_delta_q ||
+ prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q);
+}
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,  // Rebuilds the cached quant/dequant tables when a delta-q value has changed.
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth) {
+ DeltaQuantParams *const prev_deltaq_params =
+ &enc_quant_dequant_params->prev_deltaq_params;
+
+ // Re-initialize the quantizer only if any of the dc/ac deltaq parameters
+ // change.
+ if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return;  // NOTE(review): bit_depth is not part of this check - confirm callers never change it alone.
+ QUANTS *const quants = &enc_quant_dequant_params->quants;
+ Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+ quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+ quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+ quants, dequants);
+
+ // Record the state of deltaq parameters.
+ prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q;
+ prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q;
+ prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q;
+ prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q;
+ prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q;
+}
+
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,  // Points x's per-plane table pointers at row qindex of the cached tables.
+ int qindex, MACROBLOCK *x) {
+ const QUANTS *const quants = &enc_quant_dequant_params->quants;
+ const Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ x->qindex = qindex;  // Remember which row is currently selected.
+ x->seg_skip_block =
+ 0; // TODO(angiebird): Find a proper place to init this variable.
+
+ // Y
+ x->plane[0].quant_QTX = quants->y_quant[qindex];  // Pointers into the cache; no table data is copied.
+ x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+ x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+ x->plane[0].round_QTX = quants->y_round[qindex];
+ x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
+
+ // U
+ x->plane[1].quant_QTX = quants->u_quant[qindex];
+ x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+ x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+ x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+ x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+ x->plane[1].round_QTX = quants->u_round[qindex];
+ x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
+
+ // V
+ x->plane[2].quant_QTX = quants->v_quant[qindex];
+ x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+ x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+ x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+ x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+ x->plane[2].round_QTX = quants->v_round[qindex];
+ x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
+}
+
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,  // Copies each plane's quant / inverse-quant matrix entry for segment_id into xd.
+ MACROBLOCKD *xd) {
+ const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+ const int qmlevel_y =
+ use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;  // Level NUM_QM_LEVELS - 1 is used when matrices are off.
+ const int qmlevel_u =
+ use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+ const int qmlevel_v =
+ use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+ const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v };  // Indexed by plane in the loop below.
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ const int qmlevel = qmlevel_ls[i];
+ memcpy(&xd->plane[i].seg_qmatrix[segment_id],
+ quant_params->gqmatrix[qmlevel][i],
+ sizeof(quant_params->gqmatrix[qmlevel][i]));  // Copy size is that of one gqmatrix[level][plane] element.
+ memcpy(&xd->plane[i].seg_iqmatrix[segment_id],
+ quant_params->giqmatrix[qmlevel][i],
+ sizeof(quant_params->giqmatrix[qmlevel][i]));
+ }
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,  // Selects x's qindex and quant tables for segment_id and refreshes RD / SAD cost factors.
+ int segment_id, const int do_update) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));  // gfu_boost / 100, capped at 15.
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);  // Capped at 6.
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int qindex_rd;
+
+ const int current_qindex = AOMMAX(  // base_qindex (plus x->delta_qindex when delta-q is present), limited to [0, QINDEX_RANGE - 1].
+ 0,
+ AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex + x->delta_qindex
+ : quant_params->base_qindex));
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+
+ if (cpi->oxcf.sb_qp_sweep) {  // With sb_qp_sweep, the RD qindex is derived from x->rdmult_delta_qindex instead.
+ const int current_rd_qindex =
+ AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex +
+ x->rdmult_delta_qindex
+ : quant_params->base_qindex));
+ qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex);
+ } else {
+ qindex_rd = qindex;
+ }
+
+ const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q;
+ const int rdmult = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int qindex_change = x->qindex != qindex;
+ if (qindex_change || do_update) {  // Re-point the table pointers only when needed or when forced by the caller.
+ av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if ((segment_id != x->prev_segment_id) ||  // Refresh matrices on a segment change or whenever matrices are in use.
+ av1_use_qmatrix(quant_params, xd, segment_id)) {
+ av1_set_qmatrix(quant_params, segment_id, xd);
+ }
+
+ x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd);
+
+ x->prev_segment_id = segment_id;  // Remembered for the segment-change test on the next call.
+}
+
+void av1_frame_init_quantizer(AV1_COMP *cpi) {  // Initializes the quantizer state of cpi->td.mb with a forced update.
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ x->prev_segment_id = -1;  // presumably never equal to a real segment id, so the segment-change test fires - TODO confirm
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1);  // Final argument do_update = 1.
+}
+
+static int adjust_hdr_cb_deltaq(int base_qindex) {  // Computes the Cb delta-q for HDR from base_qindex; the result is never positive.
+ double baseQp = base_qindex / QP_SCALE_FACTOR;  // NOTE(review): integer vs. floating division depends on QP_SCALE_FACTOR's type - confirm.
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));  // Round to nearest, halves away from zero.
+ dqpCb = AOMMIN(0, dqpCb);  // Positive offsets are replaced by 0.
+ dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {  // Computes the Cr delta-q for HDR from base_qindex; the result is never positive.
+ double baseQp = base_qindex / QP_SCALE_FACTOR;  // NOTE(review): integer vs. floating division depends on QP_SCALE_FACTOR's type - confirm.
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));  // Round to nearest, halves away from zero.
+ dqpCr = AOMMIN(0, dqpCr);  // Positive offsets are replaced by 0.
+ dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCr;
+}
+
+void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,  // Sets base_qindex, the dc/ac delta-q values and the qmatrix levels in cm->quant_params.
+ int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
+ // quantizer has to be reinitialized with av1_init_quantizer() if any
+ // delta_q changes.
+ CommonQuantParams *quant_params = &cm->quant_params;
+ quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);  // base_qindex is never below the flag value.
+ quant_params->y_dc_delta_q = 0;
+
+ if (enable_chroma_deltaq) {
+ // TODO(aomedia:2717): need to design better delta
+ quant_params->u_dc_delta_q = 2;
+ quant_params->u_ac_delta_q = 2;
+ quant_params->v_dc_delta_q = 2;
+ quant_params->v_ac_delta_q = 2;
+ } else {
+ quant_params->u_dc_delta_q = 0;
+ quant_params->u_ac_delta_q = 0;
+ quant_params->v_dc_delta_q = 0;
+ quant_params->v_ac_delta_q = 0;
+ }
+
+ // following section 8.3.2 in T-REC-H.Sup15 document
+ // to apply to AV1 qindex in the range of [0, 255]
+ if (enable_hdr_deltaq) {  // HDR deltas overwrite the chroma deltas chosen above.
+ int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+ int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+ quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+ quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+ if (dqpCb != dqpCr) {
+ cm->seq_params->separate_uv_delta_q = 1;  // Writes sequence-level state; it is only ever set to 1 here, never cleared.
+ }
+ }
+
+ quant_params->qmatrix_level_y =
+ aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
+ quant_params->qmatrix_level_u =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+
+ if (!cm->seq_params->separate_uv_delta_q)
+ quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;  // V reuses the U level unless U and V deltas are separate.
+ else
+ quant_params->qmatrix_level_v =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+}
+
+// Table that converts the external 0-63 quantizer range to the qindex range
+// used internally: 64 entries, in steps of 4 up to 244, then 249 and 255.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int av1_quantizer_to_qindex(int quantizer) {  // Maps an external quantizer to its internal qindex by table lookup.
+ return quantizer_to_qindex[quantizer];  // No bounds check: quantizer must be within 0..63 (the table has 64 entries).
+}
+
+int av1_qindex_to_quantizer(int qindex) {  // Returns the smallest quantizer whose table qindex is >= qindex.
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)  // Linear search over the ascending 64-entry table.
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;  // qindex exceeds every table entry: return the last quantizer.
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 0000000000..040973376d
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct QUANT_PARAM {  // Per-call options passed to the AV1_QUANT_FACADE functions.
+ int log_scale;  // Values 0, 1 and 2 each select a distinct quantizer variant in av1_quantize.c.
+ TX_SIZE tx_size;
+ const qm_val_t *qmatrix;  // May be NULL; matrix-aware helpers run only when both matrices are non-NULL.
+ const qm_val_t *iqmatrix;  // May be NULL; see qmatrix.
+ int use_quant_b_adapt;  // Nonzero selects the adaptive "b" quantizers (unless CONFIG_REALTIME_ONLY).
+ int use_optimize_b;
+ int xform_quant_idx;
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,  // Common signature of the *_quantize_*_facade functions declared below.
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling at TX.
+typedef struct {
+ // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): in progress of re-working the quantization. will decide
+ // if we want to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c. One row per qindex and plane (Y, U, V); each row has 8 entries.
+// Fields are suffixed according to whether or not they're expressed in
+// the same coefficient shift/precision as TX or a fixed Q3 format.
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t,
+ y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+} Dequants;
+
+// The DeltaQuantParams structure holds the dc/ac deltaq parameters.
+typedef struct {
+ int y_dc_delta_q;  // Luma DC delta; this struct has no luma AC delta field.
+ int u_dc_delta_q;
+ int u_ac_delta_q;
+ int v_dc_delta_q;
+ int v_ac_delta_q;
+} DeltaQuantParams;
+
+typedef struct {  // Encoder-side cache of the quantizer tables plus the delta-q values they were built from.
+ // Quantization parameters for internal quantizer setup.
+ QUANTS quants;
+ // Dequantization parameters for internal quantizer setup.
+ Dequants dequants;
+ // Deltaq parameters to track the state of the dc/ac deltaq parameters in
+ // cm->quant_params. It is used to decide whether the quantizer tables need
+ // to be re-initialized.
+ DeltaQuantParams prev_deltaq_params;
+} EncQuantDequantParams;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id, const int do_update);
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq);
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth);
+
+void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
+ int max_qmlevel, int q, int enable_chroma_deltaq,
+ int enable_hdr_deltaq);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+/*!\brief Quantize transform coefficients without using qmatrix
+ *
+ * quant_ptr, dequant_ptr and round_ptr are size 2 arrays,
+ * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs.
+ *
+ * \param[in] quant_ptr 16-bit fixed point representation of inverse
+ * quantize step size, i.e. 2^16/dequant
+ * \param[in] dequant_ptr quantize step size
+ * \param[in] round_ptr rounding
+ * \param[in] log_scale the relative log scale of the transform
+ * coefficients
+ * \param[in] scan scan[i] indicates the position of ith to-be-coded
+ * coefficient
+ * \param[in] coeff_count number of coefficients
+ * \param[out] qcoeff_ptr quantized coefficients
+ * \param[out] dqcoeff_ptr dequantized coefficients
+ *
+ * \return The last non-zero coefficient's scan index plus 1
+ */
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+ const int16_t dequant_ptr[2],
+ const int16_t round_ptr[2], int log_scale,
+ const int16_t *scan, int coeff_count,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+/*!\brief Update quantize parameters in MACROBLOCK
+ *
+ * \param[in] enc_quant_dequant_params Caches the quantize and
+ * dequantize parameters for all q
+ * indices.
+ * \param[in] qindex Quantize index used for the current
+ * superblock.
+ * \param[out] x A superblock data structure for
+ * encoder.
+ */
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+ int qindex, MACROBLOCK *x);
+
+/*!\brief Update quantize matrix in MACROBLOCKD based on segment id
+ *
+ * \param[in] quant_params Quantize parameters used by encoder and decoder
+ * \param[in] segment_id Segment id.
+ * \param[out] xd A superblock data structure used by encoder and
+ * decoder.
+ */
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+ MACROBLOCKD *xd);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.c b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
new file mode 100644
index 0000000000..3012df6311
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ return 3 + (increase_denoising ? 1 : 0);
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 625;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+ int motion_magnitude) {
+ if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) {
+ if (increase_denoising)
+ return (1 << num_pels_log2_lookup[bs]) << 2;
+ else
+ return 0;
+ } else {
+ return (1 << num_pels_log2_lookup[bs]) << 4;
+ }
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+
+// TODO(kyslov): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with corresponding sse2 code.
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ int r, c;
+ const uint8_t *sig_start = sig;
+ const uint8_t *mc_avg_start = mc_avg;
+ uint8_t *avg_start = avg;
+ int diff, adj, absdiff, delta;
+ int adj_val[] = { 3, 4, 6 };
+ int total_adj = 0;
+ int shift_inc = 1;
+
+ // If motion_magnitude is small, making the denoiser more aggressive by
+ // increasing the adjustment for each level. Add another increment for
+ // blocks that are labeled for increase denoising.
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
+
+ // First attempt to apply a strong temporal denoising filter.
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ absdiff = abs(diff);
+
+ if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+ avg[c] = mc_avg[c];
+ total_adj += diff;
+ } else {
+ switch (absdiff) {
+ case 4:
+ case 5:
+ case 6:
+ case 7: adj = adj_val[0]; break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15: adj = adj_val[1]; break;
+ default: adj = adj_val[2];
+ }
+ if (diff > 0) {
+ avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj);
+ total_adj += adj;
+ } else {
+ avg[c] = AOMMAX(0, sig[c] - adj);
+ total_adj -= adj;
+ }
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // If the strong filter did not modify the signal too much, we're all set.
+ if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+
+ // Otherwise, we try to dampen the filter if the delta is not too high.
+ delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >>
+ num_pels_log2_lookup[bs]) +
+ 1;
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
+ return COPY_BLOCK;
+ }
+
+ mc_avg = mc_avg_start;
+ avg = avg_start;
+ sig = sig_start;
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ adj = abs(diff);
+ if (adj > delta) {
+ adj = delta;
+ }
+ if (diff > 0) {
+ // Diff positive means we made positive adjustment above
+ // (in first try/attempt), so now make negative adjustment to bring
+ // denoised signal down.
+ avg[c] = AOMMAX(0, avg[c] - adj);
+ total_adj -= adj;
+ } else {
+ // Diff negative means we made negative adjustment above
+ // (in first try/attempt), so now make positive adjustment to bring
+ // denoised signal up.
+ avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj);
+ total_adj += adj;
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // We can use the filter if it has been sufficiently dampened
+ if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+ return COPY_BLOCK;
+}
+
+static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
+ int mi_col) {
+ return framebuf + (stride * mi_row << 2) + (mi_col << 2);
+}
+
+static AV1_DENOISER_DECISION perform_motion_compensation(
+ AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
+ int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width,
+ int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer,
+ int use_gf_temporal_ref) {
+ const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+ ? 0
+ : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+ int frame;
+ int denoise_layer_idx = 0;
+ MACROBLOCKD *filter_mbd = &mb->e_mbd;
+ MB_MODE_INFO *mi = filter_mbd->mi[0];
+ MB_MODE_INFO saved_mi;
+ int i;
+ struct buf_2d saved_dst[MAX_MB_PLANE];
+ struct buf_2d saved_pre[MAX_MB_PLANE];
+ // const RefBuffer *saved_block_refs[2];
+ MV_REFERENCE_FRAME saved_frame;
+
+ frame = ctx->best_reference_frame;
+
+ saved_mi = *mi;
+
+ // Avoid denoising small blocks. 16x16 blocks are denoised only when noise >
+ // kDenLow or frame width <= 480.
+ if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+ (bs == BLOCK_16X16 && width > 480 &&
+ denoiser->denoising_level <= kDenLow))
+ return COPY_BLOCK;
+
+ // If the best reference frame uses inter-prediction and there is enough of a
+ // difference in sum-squared-error, use it.
+ if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
+ sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+ mi->ref_frame[0] = ctx->best_reference_frame;
+ mi->mode = ctx->best_sse_inter_mode;
+ mi->mv[0] = ctx->best_sse_mv;
+ } else {
+ // Otherwise, use the zero reference frame.
+ frame = ctx->best_zeromv_reference_frame;
+ ctx->newmv_sse = ctx->zeromv_sse;
+ // Bias to last reference.
+ if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+ frame == ALTREF_FRAME ||
+ (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+ (frame != LAST_FRAME &&
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+ denoiser->denoising_level >= kDenHigh))) {
+ frame = LAST_FRAME;
+ ctx->newmv_sse = ctx->zeromv_lastref_sse;
+ }
+ mi->ref_frame[0] = frame;
+ mi->mode = GLOBALMV;
+ mi->mv[0].as_int = 0;
+ ctx->best_sse_inter_mode = GLOBALMV;
+ ctx->best_sse_mv.as_int = 0;
+ *zeromv_filter = 1;
+ if (denoiser->denoising_level > kDenMedium) {
+ motion_magnitude = 0;
+ }
+ }
+
+ saved_frame = frame;
+ // When using SVC, we need to map REF_FRAME to the frame buffer index.
+ if (use_svc) {
+ if (frame == LAST_FRAME)
+ frame = lst_fb_idx + 1;
+ else if (frame == GOLDEN_FRAME)
+ frame = gld_fb_idx + 1;
+ // Shift for the second spatial layer.
+ if (num_spatial_layers - spatial_layer == 2)
+ frame = frame + denoiser->num_ref_frames;
+ denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+ }
+
+ // Force copy (no denoise, copy source in denoised buffer) if
+ // running_avg_y[frame] is NULL.
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+ if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ saved_pre[i] = filter_mbd->plane[i].pre[0];
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+
+ // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+ // struct.
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+ av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row,
+ mi_col, filter_mbd->block_ref_scale_factors[0], 1);
+ av1_setup_dst_planes(filter_mbd->plane, bs,
+ &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row,
+ mi_col, 0, 1);
+
+ av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col);
+
+ // Restore everything to its original state
+ *mi = saved_mi;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ filter_mbd->plane[i].pre[0] = saved_pre[i];
+ filter_mbd->plane[i].dst = saved_dst[i];
+ }
+
+ return FILTER_BLOCK;
+}
+
+void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
+ BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref) {
+ int mv_col, mv_row;
+ int motion_magnitude = 0;
+ int zeromv_filter = 0;
+ AV1_DENOISER *denoiser = &cpi->denoiser;
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+
+ const int shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? denoiser->num_ref_frames
+ : 0;
+ YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+ const int denoise_layer_index =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+ YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
+ uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
+ uint8_t *mc_avg_start =
+ block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
+ struct buf_2d src = mb->plane[0].src;
+ int increase_denoising = 0;
+ int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG;
+ mv_col = ctx->best_sse_mv.as_mv.col;
+ mv_row = ctx->best_sse_mv.as_mv.row;
+ motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+ if (denoiser->denoising_level == kDenHigh) increase_denoising = 1;
+
+ // Copy block if LAST_FRAME is not a reference.
+ // Last doesn't always exist when SVC layers are dynamically changed, e.g. top
+ // spatial layer doesn't have last reference when it's brought up for the
+ // first time on the fly.
+ if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+ !ctx->sb_skip_denoising)
+ decision = perform_motion_compensation(
+ &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+ motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
+ cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0],
+ cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc,
+ cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+
+ if (decision == FILTER_BLOCK) {
+ decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
+ mc_avg.y_stride, avg_start, avg.y_stride,
+ increase_denoising, bs, motion_magnitude);
+ }
+
+ if (decision == FILTER_BLOCK) {
+ aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ block_size_wide[bs], block_size_high[bs]);
+ } else { // COPY_BLOCK
+ aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ block_size_wide[bs], block_size_high[bs]);
+ }
+ *denoiser_decision = decision;
+ if (decision == FILTER_BLOCK && zeromv_filter == 1)
+ *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ int r;
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
+ YV12_BUFFER_CONFIG *const src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+ const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
+ // Copy source into denoised reference buffers on KEY_FRAME or
+ // if the just encoded frame was resized. For SVC, copy source if the base
+ // spatial layer was key frame.
+ if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+ svc_refresh_denoiser_buffers) {
+ int i;
+ // Start at 1 so as not to overwrite the INTRA_FRAME
+ for (i = 1; i < denoiser->num_ref_frames; ++i) {
+ if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+ copy_frame(&denoiser->running_avg_y[i + shift], &src);
+ }
+ denoiser->reset = 0;
+ return;
+ }
+
+ if (rtc_ref->set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i))
+ copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ // If more than one refresh occurs, must copy frame buffer.
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+ 1) {
+ if (refresh_alt_ref_frame) {
+ copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ }
+ }
+}
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+ ctx->zeromv_sse = INT64_MAX;
+ ctx->newmv_sse = INT64_MAX;
+ ctx->zeromv_lastref_sse = INT64_MAX;
+ ctx->best_sse_mv.as_int = 0;
+}
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx) {
+ if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+ ctx->zeromv_sse = sse;
+ ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+ if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse;
+ }
+
+ if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+ ctx->newmv_sse = sse;
+ ctx->best_sse_inter_mode = mode;
+ ctx->best_sse_mv = mi->mv[0];
+ ctx->best_reference_frame = mi->ref_frame[0];
+ }
+}
+
+static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm,
+ AV1_DENOISER *denoiser, int fb_idx) {
+ int fail = 0;
+ if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+ fail = aom_alloc_frame_buffer(
+ &denoiser->running_avg_y[fb_idx], cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// Allocates the denoiser buffers that the current frame is about to refresh
+// and that are not allocated yet. Returns 0 on success, or 1 as soon as one
+// allocation fails.
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+                             struct RTC_REF *rtc_ref, struct SVC *svc,
+                             int svc_buf_shift, int refresh_alt,
+                             int refresh_gld, int refresh_lst, int alt_fb_idx,
+                             int gld_fb_idx, int lst_fb_idx) {
+  // Increase the frame buffer index by 1 to map it to the buffer index in
+  // the denoiser.
+  const int base = 1 + svc_buf_shift;
+  if (rtc_ref->set_ref_frame_config) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      if (cm->current_frame.frame_type == KEY_FRAME ||
+          rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) {
+        // The helper frees the whole denoiser on failure, so going on would
+        // index the released running_avg_y array.
+        if (av1_denoiser_realloc_svc_helper(cm, denoiser, i + base)) return 1;
+      }
+    }
+  } else {
+    if (refresh_alt &&
+        av1_denoiser_realloc_svc_helper(cm, denoiser, alt_fb_idx + base)) {
+      return 1;
+    }
+    if (refresh_gld &&
+        av1_denoiser_realloc_svc_helper(cm, denoiser, gld_fb_idx + base)) {
+      return 1;
+    }
+    if (refresh_lst &&
+        av1_denoiser_realloc_svc_helper(cm, denoiser, lst_fb_idx + base)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Allocates the denoiser's frame buffers: for every denoised layer the
+// running-average buffers and a motion-compensated buffer, plus last_source.
+// Returns 0 on success, or 1 (denoiser freed) if a buffer allocation fails.
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy, int use_highbitdepth, int border) {
+  int i, layer, fail;
+  const int init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES;
+  const int legacy_byte_alignment = 0;
+  int num_layers = 1;
+  int scaled_width = width;
+  int scaled_height = height;
+  if (use_svc) {
+    LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+                                                svc->number_temporal_layers +
+                                            svc->temporal_layer_id];
+    av1_get_layer_resolution(width, height, lc->scaling_factor_num,
+                             lc->scaling_factor_den, &scaled_width,
+                             &scaled_height);
+    // For SVC: only denoise at most 2 spatial (highest) layers: start one layer
+    // below the top when noise_sen >= 2, otherwise denoise the top layer only.
+    svc->first_layer_denoise =
+        AOMMAX(svc->number_spatial_layers - (noise_sen >= 2 ? 2 : 1), 0);
+    num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+  }
+  assert(denoiser != NULL);
+  denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+  denoiser->num_layers = num_layers;
+  CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+                  aom_calloc(denoiser->num_ref_frames * num_layers,
+                             sizeof(denoiser->running_avg_y[0])));
+  CHECK_MEM_ERROR(
+      cm, denoiser->mc_running_avg_y,
+      aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+  for (layer = 0; layer < num_layers; ++layer) {
+    const int denoise_width = (layer == 0) ? width : scaled_width;
+    const int denoise_height = (layer == 0) ? height : scaled_height;
+    for (i = 0; i < init_num_ref_frames; ++i) {
+      const int idx = i + denoiser->num_ref_frames * layer;
+      fail = aom_alloc_frame_buffer(
+          &denoiser->running_avg_y[idx], denoise_width, denoise_height, ssx,
+          ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0);
+      if (fail) {
+        av1_denoiser_free(denoiser);
+        return 1;
+      }
+#ifdef OUTPUT_YUV_DENOISED
+      make_grayscale(&denoiser->running_avg_y[idx]);
+#endif
+    }
+    fail = aom_alloc_frame_buffer(
+        &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx,
+        ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0);
+    if (fail) {
+      av1_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+
+  // denoiser->last_source only used for noise_estimation, so only for top
+  // layer.
+  fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
+                                use_highbitdepth, border, legacy_byte_alignment,
+                                0, 0);
+  if (fail) {
+    av1_denoiser_free(denoiser);
+    return 1;
+  }
+#ifdef OUTPUT_YUV_DENOISED
+  // The loop index is stale here; gray the buffer that was just allocated.
+  make_grayscale(&denoiser->last_source);
+#endif
+  denoiser->frame_buffer_initialized = 1;
+  denoiser->denoising_level = kDenMedium;
+  denoiser->prev_denoising_level = kDenMedium;
+  denoiser->reset = 0;
+  denoiser->current_denoiser_frame = 0;
+  return 0;
+}
+
+// Frees all denoiser buffers; arrays already NULL (freed earlier) are skipped.
+void av1_denoiser_free(AV1_DENOISER *denoiser) {
+  if (denoiser == NULL) return;
+  denoiser->frame_buffer_initialized = 0;
+  if (denoiser->running_avg_y != NULL) {
+    for (int i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
+      aom_free_frame_buffer(&denoiser->running_avg_y[i]);
+    }
+    aom_free(denoiser->running_avg_y);
+    denoiser->running_avg_y = NULL;
+  }
+  if (denoiser->mc_running_avg_y != NULL) {
+    for (int i = 0; i < denoiser->num_layers; ++i) {
+      aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+    }
+    aom_free(denoiser->mc_running_avg_y);
+    denoiser->mc_running_avg_y = NULL;
+  }
+  aom_free_frame_buffer(&denoiser->last_source);
+}
+
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+static void force_refresh_longterm_ref(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ // If long term reference is used, force refresh of that slot, so
+ // denoiser buffer for long term reference stays in sync.
+ if (svc->use_gf_temporal_ref_current_layer) {
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->refresh_alt_ref_frame = 1;
+ }
+}
+#endif
+
+void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) {
+ AV1_DENOISER *const denoiser = &cpi->denoiser;
+ denoiser->denoising_level = noise_level;
+ if (denoiser->denoising_level > kDenLowLow &&
+ denoiser->prev_denoising_level == kDenLowLow) {
+ denoiser->reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ } else {
+ denoiser->reset = 0;
+ }
+ denoiser->prev_denoising_level = denoiser->denoising_level;
+}
+
+// Scale/increase the partition threshold
+// for denoiser speed-up.
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id) {
+ if ((content_state.source_sad_nonrd <= kLowSad &&
+ content_state.low_sumdiff) ||
+ (content_state.source_sad_nonrd == kHighSad &&
+ content_state.low_sumdiff) ||
+ (content_state.lighting_change && !content_state.low_sumdiff) ||
+ (noise_level == kDenHigh) || (temporal_layer_id != 0)) {
+ int64_t scaled_thr =
+ (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+ return scaled_thr;
+ } else {
+ return (5 * threshold) >> 2;
+ }
+}
+
+// Scale/increase the ac skip threshold for
+// denoiser speed-up.
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id) {
+ if (noise_level >= kDenLow && abs_sumdiff < 5)
+ threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
+ return threshold;
+}
+
+void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
+ if (/*av1_denoise_svc_non_key(cpi) &&*/
+ cpi->denoiser.current_denoiser_frame == 0) {
+ cpi->denoiser.reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ }
+}
+
+void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ SVC *const svc = &cpi->svc;
+
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ int svc_refresh_denoiser_buffers = 0;
+ int denoise_svc_second_layer = 0;
+ FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME
+ ? KEY_FRAME
+ : cm->current_frame.frame_type;
+ cpi->denoiser.current_denoiser_frame++;
+ const int resize_pending = is_frame_resize_pending(cpi);
+
+ if (cpi->ppi->use_svc) {
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ const int svc_buf_shift =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2
+ ? cpi->denoiser.num_ref_frames
+ : 0;
+ int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc_refresh_denoiser_buffers =
+ lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+ denoise_svc_second_layer =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+ // Check if we need to allocate extra buffers in the denoiser
+ // for refreshed frames.
+ if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref,
+ svc, svc_buf_shift,
+ cpi->refresh_alt_ref_frame,
+ cpi->refresh_golden_frame,
+ cpi->refresh_last_frame, cpi->alt_fb_idx,
+ cpi->gld_fb_idx, cpi->lst_fb_idx))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to re-allocate denoiser for SVC");
+#endif
+ }
+ av1_denoiser_update_frame_info(
+ &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type,
+ cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1,
+ rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0],
+ resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer);
+ }
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+ int r, c;
+ uint8_t *u = yuv->u_buffer;
+ uint8_t *v = yuv->v_buffer;
+
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
+ u[c] = UINT8_MAX / 2;
+ v[c] = UINT8_MAX / 2;
+ }
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
+ }
+}
+
+// Debug helper: writes the Y, U and V planes of |s| to |yuv_file|, one fwrite
+// per row (y_crop_height luma rows, then uv_crop_height rows per chroma plane).
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+  unsigned char *src = s->y_buffer;
+  int h = s->y_crop_height;
+  // Test the row count before writing so a plane with no rows is skipped.
+  for (; h > 0; --h) {
+    fwrite(src, s->y_width, 1, yuv_file);
+    src += s->y_stride;
+  }
+
+  src = s->u_buffer;
+  h = s->uv_crop_height;
+  for (; h > 0; --h) {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  }
+
+  src = s->v_buffer;
+  h = s->uv_crop_height;
+  for (; h > 0; --h) {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  }
+}
+#endif
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.h b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
new file mode 100644
index 0000000000..14dcccce69
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+// Denoiser is used in non svc real-time mode which does not use alt-ref, so no
+// need to allocate for it, and hence we need REF_FRAMES - 1 buffers.
+#define NONSVC_REF_FRAMES (REF_FRAMES - 1)
+
+// Number of frame buffers when SVC is used. [0] for current denoised buffer and
+// [1..8] for REF_FRAMES
+#define SVC_REF_FRAMES 9
+
+typedef enum av1_denoiser_decision {
+ COPY_BLOCK,
+ FILTER_BLOCK,
+ FILTER_ZEROMV_BLOCK
+} AV1_DENOISER_DECISION;
+
+typedef enum av1_denoiser_level {
+ kDenLowLow,
+ kDenLow,
+ kDenMedium,
+ kDenHigh
+} AV1_DENOISER_LEVEL;
+
+typedef struct av1_denoiser {
+ YV12_BUFFER_CONFIG *running_avg_y;
+ YV12_BUFFER_CONFIG *mc_running_avg_y;
+ YV12_BUFFER_CONFIG last_source;
+ int frame_buffer_initialized;
+ int reset;
+ int num_ref_frames;
+ int num_layers;
+ unsigned int current_denoiser_frame;
+ AV1_DENOISER_LEVEL denoising_level;
+ AV1_DENOISER_LEVEL prev_denoising_level;
+} AV1_DENOISER;
+
+typedef struct {
+ int64_t zero_last_cost_orig;
+ unsigned int *ref_frame_cost;
+ int_mv (*frame_mv)[REF_FRAMES];
+ int reuse_inter_pred;
+ TX_SIZE best_tx_size;
+ PREDICTION_MODE best_mode;
+ MV_REFERENCE_FRAME best_ref_frame;
+ int_interpfilters best_pred_filter;
+ uint8_t best_mode_skip_txfm;
+} AV1_PICKMODE_CTX_DEN;
+
+struct AV1_COMP;
+struct SVC;
+struct RTC_REF;
+
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer);
+
+void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref);
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx);
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+ struct RTC_REF *rtc, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx);
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy, int use_highbitdepth, int border);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// This function is used by both c and sse2 denoiser implementations.
+// Define it as a static function within the scope where av1_denoiser.h
+// is referenced.
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+ int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
+
+void av1_denoiser_free(AV1_DENOISER *denoiser);
+
+void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level);
+
+void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi);
+
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id);
+
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id);
+
+void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi);
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 0000000000..219784fedf
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,4248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile
+
+// Writes v with a bit count derived from l = get_unsigned_bits(n): values
+// below m = (1 << l) - n are sent in l - 1 bits, all others in l bits
+// (l - 1 bits followed by one extra bit). Nothing is written when l is 0.
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+  const int num_bits = get_unsigned_bits(n);
+  if (num_bits == 0) return;
+
+  const int short_code_count = (1 << num_bits) - n;
+  if (v >= short_code_count) {
+    const int excess = v - short_code_count;
+    aom_write_literal(w, short_code_count + (excess >> 1), num_bits - 1);
+    aom_write_literal(w, excess & 1, 1);
+    return;
+  }
+  aom_write_literal(w, v, num_bits - 1);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts);
+#endif
+
+// Codes `mode` as one of INTRA_MODES symbols using the CDF that
+// get_y_mode_cdf() picks from the above and left neighbours. `mi` is only
+// consulted by the assert.
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                             const MB_MODE_INFO *mi,
+                                             const MB_MODE_INFO *above_mi,
+                                             const MB_MODE_INFO *left_mi,
+                                             PREDICTION_MODE mode,
+                                             aom_writer *w) {
+  assert(!is_intrabc_block(mi));
+  (void)mi;
+  aom_cdf_prob *const y_mode_cdf =
+      get_y_mode_cdf(frame_ctx, above_mi, left_mi);
+  aom_write_symbol(w, mode, y_mode_cdf, INTRA_MODES);
+}
+
+// Signals an inter mode as a cascade of up to three binary symbols:
+// "not NEWMV", then "not GLOBALMV", then "not NEARESTMV". The cascade stops
+// at the first stage whose mode matches. Each stage selects its CDF from a
+// different bit field of mode_ctx.
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+                                        FRAME_CONTEXT *ec_ctx,
+                                        const int16_t mode_ctx) {
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+  aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+  if (mode == NEWMV) return;
+
+  const int16_t globalmv_ctx =
+      (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[globalmv_ctx], 2);
+  if (mode == GLOBALMV) return;
+
+  const int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+}
+
+// Signals mbmi->ref_mv_idx as a run of binary "keep looking" symbols.
+// NEWMV / NEW_NEWMV scan list positions 0 and 1; modes accepted by
+// have_nearmv_in_inter_mode() scan positions 1 and 2 and compare against
+// position - 1. A position is only coded when ref_mv_count exceeds
+// position + 1. Other modes write nothing.
+static AOM_INLINE void write_drl_idx(
+    FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+  assert(mbmi->ref_mv_idx < 3);
+
+  const int is_new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+  if (!is_new_mv && !have_nearmv_in_inter_mode(mbmi->mode)) return;
+
+  // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
+  const int first_pos = is_new_mv ? 0 : 1;
+  for (int pos = first_pos; pos < first_pos + 2; ++pos) {
+    if (mbmi_ext_frame->ref_mv_count <= pos + 1) continue;
+
+    const uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, pos);
+    const int is_selected = (mbmi->ref_mv_idx == (pos - first_pos));
+    aom_write_symbol(w, !is_selected, ec_ctx->drl_cdf[drl_ctx], 2);
+    if (is_selected) return;
+  }
+}
+
+// Codes a compound inter mode as INTER_COMPOUND_OFFSET(mode), using the
+// tile context's inter_compound_mode_cdf row selected by mode_ctx.
+static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+                                                 PREDICTION_MODE mode,
+                                                 const int16_t mode_ctx) {
+  assert(is_inter_compound_mode(mode));
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const int symbol = INTER_COMPOUND_OFFSET(mode);
+  aom_write_symbol(w, symbol, ec_ctx->inter_compound_mode_cdf[mode_ctx],
+                   INTER_COMPOUND_MODES);
+}
+
+// Recursively signals the transform partitioning of a block. At each node one
+// binary symbol is written: 0 when tx_size equals the size stored in
+// mbmi->inter_tx_size for this position, 1 when the node is split into
+// sub_tx_size_map[tx_size] pieces. The above/left txfm contexts are updated
+// as the tree is walked.
+//   tx_size          candidate transform size at this node
+//   depth            current recursion depth (0 at the root call)
+//   blk_row/blk_col  position of this node, in the units used by
+//                    max_block_high()/max_block_wide()
+static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ aom_writer *w) {
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
+
+ // Nodes at or beyond the limits returned above are not coded at all.
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ // At the maximum depth no symbol is written; only the contexts are updated.
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->bsize, tx_size);
+ const int txb_size_index =
+ av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col);
+ // NOTE: despite its name, this flag is true when the candidate size matches
+ // the stored size, i.e. when "no split" (symbol 0) is signalled.
+ const int write_txfm_partition =
+ tx_size == mbmi->inter_tx_size[txb_size_index];
+ if (write_txfm_partition) {
+ aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ // TODO(yuec): set correct txfm partition update for qttx
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ // A split down to TX_4X4 ends the recursion: the contexts are updated for
+ // the whole tx_size area and nothing further is written.
+ if (sub_txs == TX_4X4) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ // Visit every sub-block of size sub_txs inside this node, row by row.
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetc = blk_col + col;
+ write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+ }
+ }
+ }
+}
+
+// Codes the transform size of the current block (xd->mi[0]) as a depth
+// symbol. Nothing is written when block_signals_txsize() rejects the block
+// size.
+static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
+                                              aom_writer *w) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  if (!block_signals_txsize(bsize)) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const TX_SIZE tx_size = mbmi->tx_size;
+  const int size_ctx = get_tx_size_context(xd);
+  const int depth = tx_size_to_depth(tx_size, bsize);
+  const int max_depths = bsize_to_max_depth(bsize);
+  const int32_t size_cat = bsize_to_tx_size_cat(bsize);
+
+  assert(depth >= 0 && depth <= max_depths);
+  assert(!is_inter_block(mbmi));
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+
+  // The alphabet has one symbol per allowed depth, 0..max_depths.
+  aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[size_cat][size_ctx],
+                   max_depths + 1);
+}
+
+// Codes the block's skip_txfm flag and returns it. When SEG_LVL_SKIP is
+// active for segment_id nothing is written and 1 is returned.
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      uint8_t segment_id, const MB_MODE_INFO *mi,
+                      aom_writer *w) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  const int skip_flag = mi->skip_txfm;
+  const int ctx = av1_get_skip_txfm_context(xd);
+  aom_write_symbol(w, skip_flag, xd->tile_ctx->skip_txfm_cdfs[ctx], 2);
+  return skip_flag;
+}
+
+// Codes the block's skip_mode flag and returns it. Returns 0 without writing
+// when the frame does not enable skip mode, when SEG_LVL_SKIP is active, or
+// when compound references are unavailable for this block.
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                           uint8_t segment_id, const MB_MODE_INFO *mi,
+                           aom_writer *w) {
+  if (!cm->current_frame.skip_mode_info.skip_mode_flag ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+    return 0;
+  }
+
+  const int skip_mode = mi->skip_mode;
+  // Skip mode implies compound reference. A block size that disallows
+  // compound refs, or a segment feature that implies single-reference mode
+  // (SEG_LVL_REF_FRAME / SEG_LVL_GLOBALMV), is mutually exclusive with it,
+  // so skip_mode is implicitly 0 and is not transmitted.
+  const int compound_unavailable =
+      !is_comp_ref_allowed(mi->bsize) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV);
+  if (compound_unavailable) {
+    assert(!skip_mode);
+    return 0;
+  }
+
+  const int ctx = av1_get_skip_mode_context(xd);
+  aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);
+  return skip_mode;
+}
+
+// Codes the intra/inter flag of a block. Nothing is written when either
+// SEG_LVL_REF_FRAME or SEG_LVL_GLOBALMV is active for the segment; in the
+// GLOBALMV case the block is asserted to be inter.
+static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
+                                      const MACROBLOCKD *xd, uint8_t segment_id,
+                                      aom_writer *w, const int is_inter) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) return;
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+    assert(is_inter);
+    return;
+  }
+
+  const int ctx = av1_get_intra_inter_context(xd);
+  aom_write_symbol(w, is_inter, xd->tile_ctx->intra_inter_cdf[ctx], 2);
+}
+
+// Codes mbmi->motion_mode. The alphabet depends on the last motion mode
+// allowed for the block: nothing is written for SIMPLE_TRANSLATION, a
+// binary OBMC flag for OBMC_CAUSAL, and a full MOTION_MODES symbol
+// otherwise.
+static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         const MB_MODE_INFO *mbmi,
+                                         aom_writer *w) {
+  MOTION_MODE last_allowed = SIMPLE_TRANSLATION;
+  if (cm->features.switchable_motion_mode) {
+    last_allowed = motion_mode_allowed(cm->global_motion, xd, mbmi,
+                                       cm->features.allow_warped_motion);
+  }
+  assert(mbmi->motion_mode <= last_allowed);
+
+  if (last_allowed == SIMPLE_TRANSLATION) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  if (last_allowed == OBMC_CAUSAL) {
+    aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+                     ec_ctx->obmc_cdf[mbmi->bsize], 2);
+  } else {
+    aom_write_symbol(w, mbmi->motion_mode,
+                     ec_ctx->motion_mode_cdf[mbmi->bsize], MOTION_MODES);
+  }
+}
+
+// Codes a quantizer index delta. The magnitude, capped at DELTA_Q_SMALL, is
+// sent as a symbol. Magnitudes at or above the cap append a 3-bit length
+// field (rem_bits - 1) and a rem_bits-wide remainder. Any nonzero delta
+// ends with a sign bit.
+static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
+                                          int delta_qindex, aom_writer *w) {
+  const int is_negative = delta_qindex < 0;
+  const int magnitude = is_negative ? -delta_qindex : delta_qindex;
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+
+  aom_write_symbol(w, AOMMIN(magnitude, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+                   DELTA_Q_PROBS + 1);
+
+  if (magnitude >= DELTA_Q_SMALL) {
+    const int rem_bits = get_msb(magnitude - 1);
+    const int thr = (1 << rem_bits) + 1;
+    aom_write_literal(w, rem_bits - 1, 3);
+    aom_write_literal(w, magnitude - thr, rem_bits);
+  }
+
+  if (magnitude > 0) aom_write_bit(w, is_negative);
+}
+
+// Codes one loop-filter level delta. The magnitude, capped at
+// DELTA_LF_SMALL, is sent as a symbol using the per-filter CDF when
+// delta_lf_multi is set and the shared CDF otherwise. Magnitudes at or
+// above the cap append a 3-bit length field and a remainder. Any nonzero
+// delta ends with a sign bit.
+static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
+                                           const MACROBLOCKD *xd, int lf_id,
+                                           int delta_lflevel,
+                                           int delta_lf_multi, aom_writer *w) {
+  (void)cm;  // Only referenced by the assert below.
+  const int is_negative = delta_lflevel < 0;
+  const int magnitude = is_negative ? -delta_lflevel : delta_lflevel;
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  aom_cdf_prob *level_cdf;
+
+  if (delta_lf_multi) {
+    assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+                                                         : FRAME_LF_COUNT - 2));
+    level_cdf = ec_ctx->delta_lf_multi_cdf[lf_id];
+  } else {
+    level_cdf = ec_ctx->delta_lf_cdf;
+  }
+  aom_write_symbol(w, AOMMIN(magnitude, DELTA_LF_SMALL), level_cdf,
+                   DELTA_LF_PROBS + 1);
+
+  if (magnitude >= DELTA_LF_SMALL) {
+    const int rem_bits = get_msb(magnitude - 1);
+    const int thr = (1 << rem_bits) + 1;
+    aom_write_literal(w, rem_bits - 1, 3);
+    aom_write_literal(w, magnitude - thr, rem_bits);
+  }
+
+  if (magnitude > 0) aom_write_bit(w, is_negative);
+}
+
+// Writes `num` colour-index tokens starting at *tp. The first goes through
+// write_uniform(); each of the rest is a symbol over n values whose CDF is
+// chosen by the token's color_ctx. On return *tp points just past the last
+// token consumed.
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
+                                       int n, int num, MapCdf map_pb_cdf) {
+  const TokenExtra *tok = *tp;
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
+
+  write_uniform(w, n, tok->token);  // The first color index.
+  ++tok;
+
+  for (int remaining = num - 1; remaining > 0; --remaining, ++tok) {
+    assert((tok->color_ctx >= 0) &&
+           (tok->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS));
+    aom_cdf_prob *color_map_cdf =
+        map_pb_cdf[palette_size_idx][tok->color_ctx];
+    aom_write_symbol(w, tok->token, color_map_cdf, n);
+  }
+  *tp = tok;
+}
+
+// Writes the coefficients of the transform block at (blk_row, blk_col) in
+// `plane`. If tx_size differs from the size recorded for this position
+// (luma only), the block is split into sub_tx_size_map[tx_size] pieces and
+// each piece is handled recursively. `block` is the index passed to
+// av1_write_coeffs_txb() and is advanced by one sub-block's unit count per
+// recursive call. tp, tok_end and bit_depth are only forwarded to the
+// recursive call; this function does not otherwise read them.
+static AOM_INLINE void pack_txb_tokens(
+ aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp,
+ const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
+ int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ // Positions at or beyond the limits returned above are not coded.
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Chroma uses the maximum UV transform size for the block; luma uses the
+ // per-position size stored in inter_tx_size.
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ // Chroma planes are always written at the given tx_size without splitting.
+ if (tx_size == plane_tx_size || plane) {
+ av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size);
+#if CONFIG_RD_DEBUG
+ // NOTE(review): tmp_token_stats is initialised here and never passed to a
+ // writer, so this adds only whatever init_token_stats() stored in .cost.
+ // Looks like a leftover - confirm.
+ TOKEN_STATS tmp_token_stats;
+ init_token_stats(&tmp_token_stats);
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ // Clamp the sub-block scan to the part of this node inside the limits.
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int r = 0; r < row_end; r += bsh) {
+ const int offsetr = blk_row + r;
+ for (int c = 0; c < col_end; c += bsw) {
+ const int offsetc = blk_col + c;
+ pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+ bit_depth, block, offsetr, offsetc, sub_txs,
+ token_stats);
+ block += step;
+ }
+ }
+ }
+}
+
+// Stores segment_id into segment_ids for the area covered by a block of
+// size bsize at (mi_row, mi_col), clipped to the mi_rows x mi_cols grid.
+static INLINE void set_spatial_segment_id(
+    const CommonModeInfoParams *const mi_params, uint8_t *segment_ids,
+    BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) {
+  const int mi_stride = mi_params->mi_cols;
+  const int mi_offset = mi_row * mi_stride + mi_col;
+
+  // Clip the block extent to what is left of the grid in each direction.
+  const int cols_left = mi_params->mi_cols - mi_col;
+  const int rows_left = mi_params->mi_rows - mi_row;
+  const int xmis = AOMMIN(cols_left, mi_size_wide[bsize]);
+  const int ymis = AOMMIN(rows_left, mi_size_high[bsize]);
+
+  set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id);
+}
+
+// Remaps x (0 <= x < max) relative to a reference value `ref`, so that
+// values close to ref get small codes: within a window around ref, a
+// positive difference d maps to 2*d - 1 and a non-positive one to -2*d.
+// ref == 0 returns x unchanged; ref >= max - 1 returns max - 1 - x.
+// Outside the window, x is returned as-is when 2 * ref < max, and
+// max - 1 - x otherwise.
+int av1_neg_interleave(int x, int ref, int max) {
+  assert(x < max);
+  if (ref == 0) return x;
+  if (ref >= max - 1) return max - 1 - x;
+
+  const int delta = x - ref;
+  const int distance = abs(delta);
+  const int ref_in_lower_half = (2 * ref < max);
+  // The window is limited by the distance from ref to the nearer end of
+  // the range.
+  const int in_window =
+      ref_in_lower_half ? (distance <= ref) : (distance < max - ref);
+
+  if (in_window) return (delta > 0) ? (delta << 1) - 1 : ((-delta) << 1);
+  return ref_in_lower_half ? x : (max - x) - 1;
+}
+
+// Codes the segment id of a block relative to a spatial prediction.
+// Does nothing unless segmentation is enabled and the map is being updated.
+// When skip_txfm is set no symbol is written: the predicted id is stored
+// into both segment maps and into mbmi itself. Otherwise the id is remapped
+// around the prediction with av1_neg_interleave(), written as a symbol, and
+// stored into cm->cur_frame->seg_map.
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w,
+ const struct segmentation *seg,
+ struct segmentation_probs *segp,
+ int skip_txfm) {
+ if (!seg->enabled || !seg->update_map) return;
+
+ AV1_COMMON *const cm = &cpi->common;
+ // cdf_num is an output of av1_get_spatial_seg_pred() and selects the CDF
+ // used below.
+ int cdf_num;
+ const uint8_t pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ if (skip_txfm) {
+ // Still need to transmit tx size for intra blocks even if skip_txfm is
+ // true. Changing segment_id may make the tx size become invalid, e.g
+ // changing from lossless to lossy.
+ assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
+
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ /* mbmi is read only but we need to update segment_id */
+ ((MB_MODE_INFO *)mbmi)->segment_id = pred;
+ return;
+ }
+
+ // The third argument is the number of segment ids in use
+ // (last_active_segid + 1); the symbol alphabet itself is MAX_SEGMENTS.
+ const int coded_id =
+ av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+ // Only the current frame's map is updated on this path; cpi->enc_seg.map
+ // is left untouched here.
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, mbmi->segment_id);
+}
+
+// Writes the binary symbol `bname` using the CDF returned by
+// av1_get_pred_cdf_<pname>(xd). Relies on `w` and `xd` being in scope at the
+// expansion site.
+#define WRITE_REF_BIT(bname, pname) \
+ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
+
+// Encodes the reference frame(s) of the current block (xd->mi[0]).
+// Nothing is written when a segment feature already determines the
+// reference. Otherwise an optional single/compound flag is followed by a
+// tree of binary symbols that identifies ref_frame[0] (and ref_frame[1] for
+// compound prediction).
+static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ const uint8_t segment_id = mbmi->segment_id;
+
+ // When the segment fixes the reference frame (SEG_LVL_REF_FRAME), or when
+ // SEG_LVL_SKIP / SEG_LVL_GLOBALMV is active, nothing is transmitted; the
+ // asserts only check that the block agrees with what the segment implies.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ } else {
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(mbmi->bsize))
+ aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
+ } else {
+ assert((!is_compound) ==
+ (cm->current_frame.reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ // Compound prediction: first signal uni- vs bi-directional.
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+ 2);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ // Unidirectional pair: bit selects ref_frame[0] == BWDREF_FRAME (the
+ // second ref is then asserted to be ALTREF_FRAME); otherwise
+ // ref_frame[0] is LAST_FRAME and up to two more bits pick ref_frame[1].
+ const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;
+ WRITE_REF_BIT(bit, uni_comp_ref_p);
+
+ if (!bit) {
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, uni_comp_ref_p2);
+ }
+ } else {
+ assert(mbmi->ref_frame[1] == ALTREF_FRAME);
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ // Bidirectional pair: two bits identify ref_frame[0], then one or two
+ // bits identify ref_frame[1].
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ WRITE_REF_BIT(bit, comp_ref_p);
+
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+ WRITE_REF_BIT(bit1, comp_ref_p1);
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, comp_ref_p2);
+ }
+
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+ if (!bit_bwd) {
+ WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+ }
+
+ } else {
+ // Single reference: bit0 separates the BWDREF_FRAME..ALTREF_FRAME range
+ // from the remaining frames; the following bits narrow the choice down
+ // within the selected group.
+ const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
+ mbmi->ref_frame[0] >= BWDREF_FRAME);
+ WRITE_REF_BIT(bit0, single_ref_p1);
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit1, single_ref_p2);
+
+ if (!bit1) {
+ WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+ }
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ WRITE_REF_BIT(bit2, single_ref_p3);
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ WRITE_REF_BIT(bit3, single_ref_p4);
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ WRITE_REF_BIT(bit4, single_ref_p5);
+ }
+ }
+ }
+ }
+}
+
+// Codes the filter-intra information of a block: a use_filter_intra flag
+// and, when set, the filter-intra mode. Nothing is written when
+// av1_filter_intra_allowed() rejects the block.
+static AOM_INLINE void write_filter_intra_mode_info(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+    aom_writer *w) {
+  if (!av1_filter_intra_allowed(cm, mbmi)) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  aom_write_symbol(w, use_filter_intra, ec_ctx->filter_intra_cdfs[mbmi->bsize],
+                   2);
+  if (!use_filter_intra) return;
+
+  const FILTER_INTRA_MODE fi_mode =
+      mbmi->filter_intra_mode_info.filter_intra_mode;
+  aom_write_symbol(w, fi_mode, ec_ctx->filter_intra_mode_cdf,
+                   FILTER_INTRA_MODES);
+}
+
+// Codes angle_delta after offsetting it by MAX_ANGLE_DELTA, as one of
+// 2 * MAX_ANGLE_DELTA + 1 symbols.
+static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta,
+                                         aom_cdf_prob *cdf) {
+  const int symbol_count = 2 * MAX_ANGLE_DELTA + 1;
+  const int symbol = angle_delta + MAX_ANGLE_DELTA;
+  aom_write_symbol(w, symbol, cdf, symbol_count);
+}
+
+// Codes the interpolation filter(s) of the current block when the frame
+// uses SWITCHABLE filters. One filter is written when enable_dual_filter
+// is 0 and two (dir 0, then dir 1) otherwise. Each written filter also
+// bumps td->interp_filter_selected. When av1_is_interp_needed() is false
+// nothing is written.
+static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
+                                              ThreadData *td, aom_writer *w) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  if (!av1_is_interp_needed(xd)) {
+    // The block must already carry the frame's unswitchable filter.
+    int_interpfilters expected = av1_broadcast_interp_filter(
+        av1_unswitchable_filter(cm->features.interp_filter));
+    assert(mbmi->interp_filters.as_int == expected.as_int);
+    (void)expected;
+    return;
+  }
+
+  if (cm->features.interp_filter != SWITCHABLE) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const int num_dirs = (cm->seq_params->enable_dual_filter == 0) ? 1 : 2;
+  for (int dir = 0; dir < num_dirs; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    const InterpFilter filter =
+        av1_extract_interp_filter(mbmi->interp_filters, dir);
+    aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+                     SWITCHABLE_FILTERS);
+    ++td->interp_filter_selected[filter];
+  }
+}
+
+// Delta-codes `num` palette colours. The first colour is sent as a
+// bit_depth-wide literal. Then a 2-bit field gives the delta width relative
+// to bit_depth - 3, followed by each successive difference minus min_val.
+// "min_val" is the smallest difference that can occur. The width shrinks as
+// the remaining value range shrinks.
+static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num,
+                                                   int bit_depth, int min_val,
+                                                   aom_writer *w) {
+  if (num <= 0) return;
+  assert(colors[0] < (1 << bit_depth));
+  aom_write_literal(w, colors[0], bit_depth);
+  if (num == 1) return;
+
+  // First pass: find the largest difference so the width can be chosen.
+  int largest_delta = 0;
+  for (int i = 1; i < num; ++i) {
+    assert(colors[i] < (1 << bit_depth));
+    const int step = colors[i] - colors[i - 1];
+    assert(step >= min_val);
+    if (step > largest_delta) largest_delta = step;
+  }
+
+  const int min_bits = bit_depth - 3;
+  int bits = AOMMAX(av1_ceil_log2(largest_delta + 1 - min_val), min_bits);
+  assert(bits <= bit_depth);
+  aom_write_literal(w, bits - min_bits, 2);
+
+  // Second pass: emit the differences, narrowing the width as `range`
+  // decreases.
+  int range = (1 << bit_depth) - colors[0] - min_val;
+  for (int i = 1; i < num; ++i) {
+    const int step = colors[i] - colors[i - 1];
+    aom_write_literal(w, step - min_val, bits);
+    range -= step;
+    bits = AOMMIN(bits, av1_ceil_log2(range));
+  }
+}
+
+// Writes the luma palette colours. One bit per colour-cache entry tells
+// whether that entry is used, stopping once all n colours are accounted
+// for. Colours not found in the cache follow, delta-coded with a minimum
+// difference of 1.
+static AOM_INLINE void write_palette_colors_y(
+    const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+    int bit_depth, aom_writer *w) {
+  const int n = pmi->palette_size[0];
+  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+  uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+  int out_cache_colors[PALETTE_MAX_SIZE];
+
+  const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+  const int n_out_cache =
+      av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+                            cache_color_found, out_cache_colors);
+
+  int n_in_cache = 0;
+  int cache_idx = 0;
+  while (cache_idx < n_cache && n_in_cache < n) {
+    const int found = cache_color_found[cache_idx];
+    aom_write_bit(w, found);
+    n_in_cache += found;
+    ++cache_idx;
+  }
+  assert(n_in_cache + n_out_cache == n);
+
+  delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w);
+}
+
+// Write chroma palette color values. U channel is handled similarly to the luma
+// channel (cache-hit bits followed by delta coding, here with a minimum delta
+// of 0 instead of 1). For v channel, either use delta encoding or transmit raw
+// values directly, whichever costs less; a leading bit tells which was used.
+static AOM_INLINE void write_palette_colors_uv(
+ const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[1];
+ // palette_colors holds three consecutive runs of PALETTE_MAX_SIZE entries;
+ // the U and V runs start at offsets PALETTE_MAX_SIZE and
+ // 2 * PALETTE_MAX_SIZE.
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ // One bit per cache entry, stopping once n colors have been found.
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w);
+
+ // V channel colors. Don't use color cache as the colors are not sorted.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ // Cost of the delta form: 2 bits for the width field, bit_depth bits for the
+ // first color, then bits_v magnitude bits plus a sign bit for each remaining
+ // color, minus zero_count (presumably the number of zero deltas, which carry
+ // no sign bit - see the loop below).
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ assert(colors_v[0] < (1 << bit_depth));
+ aom_write_bit(w, 1);
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (int i = 1; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ // Send whichever is shorter: the direct distance, or the wrap-around
+ // distance max_val - delta with the sign inverted.
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);
+ for (int i = 0; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ aom_write_literal(w, colors_v[i], bit_depth);
+ }
+ }
+}
+
+// Codes the palette information of a block. For luma (only when the mode
+// is DC_PRED) and for chroma (only when there is more than one plane,
+// uv_mode is UV_DC_PRED and the block is a chroma reference), a flag says
+// whether a palette is used. If so, the palette size and its colours
+// follow.
+static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
+                                               const MACROBLOCKD *xd,
+                                               const MB_MODE_INFO *const mbmi,
+                                               aom_writer *w) {
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+
+  if (mbmi->mode == DC_PRED) {
+    const int luma_size = pmi->palette_size[0];
+    const int has_luma_palette = luma_size > 0;
+    const int y_mode_ctx = av1_get_palette_mode_ctx(xd);
+    aom_write_symbol(w, has_luma_palette,
+                     ec_ctx->palette_y_mode_cdf[bsize_ctx][y_mode_ctx], 2);
+    if (has_luma_palette) {
+      aom_write_symbol(w, luma_size - PALETTE_MIN_SIZE,
+                       ec_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES);
+      write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w);
+    }
+  }
+
+  if (num_planes <= 1 || mbmi->uv_mode != UV_DC_PRED || !xd->is_chroma_ref) {
+    return;
+  }
+
+  const int chroma_size = pmi->palette_size[1];
+  const int has_chroma_palette = chroma_size > 0;
+  // The chroma flag is conditioned on whether luma uses a palette.
+  const int uv_mode_ctx = (pmi->palette_size[0] > 0);
+  aom_write_symbol(w, has_chroma_palette,
+                   ec_ctx->palette_uv_mode_cdf[uv_mode_ctx], 2);
+  if (has_chroma_palette) {
+    aom_write_symbol(w, chroma_size - PALETTE_MIN_SIZE,
+                     ec_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES);
+    write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w);
+  }
+}
+
+// Codes the transform type of a transform block. Nothing is written when
+// only one type is available for this size, when the applicable qindex is
+// not positive, when the block is flagged skip_txfm, or when SEG_LVL_SKIP
+// is active. Inter and intra blocks use separate CDF tables; the intra
+// table is additionally indexed by the intra direction.
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+                       TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const FeatureFlags *const features = &cm->features;
+  const int is_inter = is_inter_block(mbmi);
+
+  if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) <= 1)
+    return;
+
+  // With segmentation the per-segment qindex applies; otherwise the frame
+  // base qindex does.
+  const int qindex = cm->seg.enabled ? xd->qindex[mbmi->segment_id]
+                                     : cm->quant_params.base_qindex;
+  if (qindex <= 0) return;
+  if (mbmi->skip_txfm) return;
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+  const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter, features->reduced_tx_set_used);
+  const int eset =
+      get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used);
+  // eset == 0 should correspond to a set with only DCT_DCT and there
+  // is no need to send the tx_type
+  assert(eset > 0);
+  assert(av1_ext_tx_used[tx_set_type][tx_type]);
+
+  const int symbol = av1_ext_tx_ind[tx_set_type][tx_type];
+  const int num_symbols = av1_num_ext_tx_set[tx_set_type];
+  aom_cdf_prob *tx_cdf;
+  if (is_inter) {
+    tx_cdf = ec_ctx->inter_ext_tx_cdf[eset][square_tx_size];
+  } else {
+    // Filter-intra blocks map their filter mode to an intra direction.
+    const PREDICTION_MODE intra_dir =
+        mbmi->filter_intra_mode_info.use_filter_intra
+            ? fimode_to_intradir[mbmi->filter_intra_mode_info
+                                     .filter_intra_mode]
+            : mbmi->mode;
+    tx_cdf = ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir];
+  }
+  aom_write_symbol(w, symbol, tx_cdf, num_symbols);
+}
+
+// Codes `mode` as one of INTRA_MODES symbols using the y_mode_cdf row
+// selected by size_group_lookup[bsize].
+static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx,
+                                                BLOCK_SIZE bsize,
+                                                PREDICTION_MODE mode,
+                                                aom_writer *w) {
+  const int size_group = size_group_lookup[bsize];
+  aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group], INTRA_MODES);
+}
+
+// Codes the chroma intra mode. The CDF is selected by cfl_allowed and the
+// luma mode; the alphabet has one symbol fewer when cfl_allowed is 0.
+static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+                                           UV_PREDICTION_MODE uv_mode,
+                                           PREDICTION_MODE y_mode,
+                                           CFL_ALLOWED_TYPE cfl_allowed,
+                                           aom_writer *w) {
+  const int num_symbols = cfl_allowed ? UV_INTRA_MODES : UV_INTRA_MODES - 1;
+  aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+                   num_symbols);
+}
+
+// Codes the CfL parameters: the joint sign symbol first, then, for U and V
+// in that order, a magnitude index, but only for a plane whose sign is not
+// CFL_SIGN_ZERO.
+static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx,
+                                        uint8_t idx, int8_t joint_sign,
+                                        aom_writer *w) {
+  aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+
+  const int u_is_zero = (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO);
+  if (!u_is_zero) {
+    aom_cdf_prob *u_cdf = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+    aom_write_symbol(w, CFL_IDX_U(idx), u_cdf, CFL_ALPHABET_SIZE);
+  }
+
+  const int v_is_zero = (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO);
+  if (!v_is_zero) {
+    aom_cdf_prob *v_cdf = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+    aom_write_symbol(w, CFL_IDX_V(idx), v_cdf, CFL_ALPHABET_SIZE);
+  }
+}
+
+// Writes the CDEF strength of the CDEF unit containing the current block,
+// once per unit, at the first coding block for which `skip` is 0. Nothing
+// is written for coded-lossless frames or when intrabc is allowed.
+static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
+                                  aom_writer *w, int skip) {
+  if (cm->features.coded_lossless || cm->features.allow_intrabc) return;
+
+  // At the start of a superblock, mark that no CDEF strength has been
+  // written yet for any of the CDEF units it contains.
+  const int sb_mask = cm->seq_params->mib_size - 1;
+  const int at_sb_origin =
+      (xd->mi_row & sb_mask) == 0 && (xd->mi_col & sb_mask) == 0;
+  if (at_sb_origin) {
+    for (int unit = 0; unit < 4; ++unit) xd->cdef_transmitted[unit] = false;
+  }
+
+  // CDEF unit size is 64x64 irrespective of the superblock size.
+  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+  // Index of this CDEF unit within the superblock. Only 128x128
+  // superblocks hold more than one unit.
+  int index = 0;
+  if (cm->seq_params->sb_size == BLOCK_128X128) {
+    const int unit_row = (xd->mi_row & cdef_size) != 0;
+    const int unit_col = (xd->mi_col & cdef_size) != 0;
+    index = unit_col + 2 * unit_row;
+  }
+
+  if (xd->cdef_transmitted[index] || skip) return;
+
+  // The strength for this CDEF unit is stored in the MB_MODE_INFO of the
+  // first block of the unit, found by aligning the position down to a
+  // multiple of cdef_size.
+  const int align_mask = ~(cdef_size - 1);
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int grid_idx = get_mi_grid_idx(mi_params, xd->mi_row & align_mask,
+                                       xd->mi_col & align_mask);
+  const MB_MODE_INFO *const first_mbmi = mi_params->mi_grid_base[grid_idx];
+  aom_write_literal(w, first_mbmi->cdef_strength, cm->cdef_info.cdef_bits);
+  xd->cdef_transmitted[index] = true;
+}
+
+// Codes the segment id of the current block for the pass selected by
+// `preskip`. It acts only when the segmentation map is being updated and
+// the pass matches seg->segid_preskip. In the non-preskip pass a skipped
+// block takes the predicted id without coding it. Otherwise, with temporal
+// update, a prediction flag is written first and the id is coded only when
+// that flag is 0.
+static AOM_INLINE void write_inter_segment_id(
+    AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w,
+    const struct segmentation *const seg, struct segmentation_probs *const segp,
+    int skip, int preskip) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  if (!seg->update_map) return;
+
+  // The preskip pass handles segid_preskip streams only, and the other
+  // pass handles everything else.
+  if ((preskip != 0) != (seg->segid_preskip != 0)) return;
+
+  if (!preskip && skip) {
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 1);
+    if (seg->temporal_update) mbmi->seg_id_predicted = 0;
+    return;
+  }
+
+  if (!seg->temporal_update) {
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+    return;
+  }
+
+  const int pred_flag = mbmi->seg_id_predicted;
+  aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+  aom_write_symbol(w, pred_flag, pred_cdf, 2);
+  if (pred_flag) {
+    // Predicted id is used as-is: just record it in the current map.
+    AV1_COMMON *const cm = &cpi->common;
+    set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+                           xd->mi_row, xd->mi_col, mbmi->segment_id);
+  } else {
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+  }
+}
+
+// Writes the delta q index when delta q is enabled for the frame, followed by
+// the delta loop filter level(s) when those are enabled too. The values are
+// coded as differences against the running predictors kept in xd
+// (current_base_qindex, delta_lf[], delta_lf_from_base), which are updated to
+// the block's values after each write.
+static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm,
+                                            MACROBLOCKD *const xd, int skip,
+                                            aom_writer *w) {
+  const DeltaQInfo *const dq = &cm->delta_q_info;
+  if (!dq->delta_q_present_flag) return;
+
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Only the block sitting at the top-left corner of a superblock signals.
+  const int sb_mask = cm->seq_params->mib_size - 1;
+  if ((xd->mi_row & sb_mask) != 0 || (xd->mi_col & sb_mask) != 0) return;
+
+  // A skipped block that spans the whole superblock signals nothing.
+  if (mbmi->bsize == cm->seq_params->sb_size && skip != 0) return;
+
+  assert(mbmi->current_qindex > 0);
+  const int qindex_steps =
+      (mbmi->current_qindex - xd->current_base_qindex) / dq->delta_q_res;
+  write_delta_qindex(xd, qindex_steps, w);
+  xd->current_base_qindex = mbmi->current_qindex;
+
+  if (!dq->delta_lf_present_flag) return;
+
+  if (!dq->delta_lf_multi) {
+    // Single loop filter delta shared by all filter levels.
+    const int lf_steps =
+        (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / dq->delta_lf_res;
+    write_delta_lflevel(cm, xd, -1, lf_steps, 0, w);
+    xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+    return;
+  }
+
+  // One delta per loop filter level; two fewer levels without chroma planes.
+  const int lf_count =
+      av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+  for (int lf_id = 0; lf_id < lf_count; ++lf_id) {
+    const int lf_steps =
+        (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / dq->delta_lf_res;
+    write_delta_lflevel(cm, xd, lf_id, lf_steps, 1, w);
+    xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+  }
+}
+
+// Writes the intra prediction syntax of the current block (xd->mi[0]) in this
+// order: luma mode, luma angle delta, chroma mode (plus CfL alphas and chroma
+// angle delta), palette info, filter-intra info.
+// is_keyframe selects between the two luma mode writers
+// (write_intra_y_mode_kf / write_intra_y_mode_nonkf).
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
+                                                    MACROBLOCKD *const xd,
+                                                    int is_keyframe,
+                                                    aom_writer *w) {
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PREDICTION_MODE y_mode = mbmi->mode;
+  const BLOCK_SIZE bsize = mbmi->bsize;
+
+  // Luma mode.
+  if (!is_keyframe) {
+    write_intra_y_mode_nonkf(fc, bsize, y_mode, w);
+  } else {
+    write_intra_y_mode_kf(fc, mbmi, xd->above_mbmi, xd->left_mbmi, y_mode, w);
+  }
+
+  // Luma angle delta: directional modes only, and only when the block size
+  // permits angle deltas.
+  const int angle_delta_allowed = av1_use_angle_delta(bsize);
+  if (angle_delta_allowed && av1_is_directional_mode(y_mode)) {
+    write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+                      fc->angle_delta_cdf[y_mode - V_PRED]);
+  }
+
+  // Chroma mode, CfL alphas and chroma angle delta. Skipped for monochrome
+  // sequences and for blocks that are not a chroma reference.
+  const int codes_chroma = !cm->seq_params->monochrome && xd->is_chroma_ref;
+  if (codes_chroma) {
+    const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+    write_intra_uv_mode(fc, uv_mode, y_mode, is_cfl_allowed(xd), w);
+    if (uv_mode == UV_CFL_PRED) {
+      write_cfl_alphas(fc, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+    }
+    const PREDICTION_MODE uv_as_intra_mode = get_uv_mode(uv_mode);
+    if (angle_delta_allowed && av1_is_directional_mode(uv_as_intra_mode)) {
+      write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+                        fc->angle_delta_cdf[uv_as_intra_mode - V_PRED]);
+    }
+  }
+
+  // Palette.
+  if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+    write_palette_mode_info(cm, xd, mbmi, w);
+  }
+
+  // Filter intra.
+  write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
+
+// Derives the context used to code the inter mode from a packed mode context.
+// When the second reference is not an inter frame (rf[1] <= INTRA_FRAME) the
+// packed value is returned unchanged; otherwise the context is looked up in
+// compound_mode_ctx_map from the refmv and (clamped) newmv sub-contexts.
+static INLINE int16_t mode_context_analyzer(
+    const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) {
+  const int has_second_inter_ref = rf[1] > INTRA_FRAME;
+  if (!has_second_inter_ref) return mode_context;
+
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int16_t clamped_newmv_ctx = AOMMIN(newmv_ctx, COMP_NEWMV_CTXS - 1);
+
+  return compound_mode_ctx_map[refmv_ctx >> 1][clamped_newmv_ctx];
+}
+
+// Returns the reference MV for reference slot ref_idx taken from entry
+// ref_mv_idx of the block's reference MV stack.
+// - Second reference is an inter frame: returns this_mv (ref_idx == 0) or
+//   comp_mv (ref_idx == 1) of that stack entry.
+// - Otherwise: returns this_mv of the entry, or the global MV of the
+//   reference frame type when ref_mv_idx is not below ref_mv_count.
+static INLINE int_mv get_ref_mv_from_stack(
+    int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) {
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const CANDIDATE_MV *const mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+  if (ref_frame[1] > INTRA_FRAME) {
+    assert(ref_idx == 0 || ref_idx == 1);
+    if (ref_idx) return mv_stack[ref_mv_idx].comp_mv;
+    return mv_stack[ref_mv_idx].this_mv;
+  }
+
+  assert(ref_idx == 0);
+  if (ref_mv_idx >= mbmi_ext_frame->ref_mv_count) {
+    return mbmi_ext_frame->global_mvs[ref_frame_type];
+  }
+  return mv_stack[ref_mv_idx].this_mv;
+}
+
+// Returns the reference MV of the current block for reference slot ref_idx.
+// For NEAR_NEWMV and NEW_NEARMV the stack entry one past mbmi->ref_mv_idx is
+// used.
+static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  const int pairs_near_with_new =
+      mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV;
+
+  if (pairs_near_with_new) {
+    assert(has_second_ref(mbmi));
+  }
+
+  const int stack_idx = mbmi->ref_mv_idx + (pairs_near_with_new ? 1 : 0);
+  return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, stack_idx,
+                               x->mbmi_ext_frame);
+}
+
+// Writes the mode info syntax of the current block (xd->mi[0]); called from
+// write_mbmi_b() for frames that are not intra-only.
+// Order of the syntax: segment id (pre-skip pass), skip mode, skip flag,
+// segment id (post-skip pass), CDEF strength, delta q/lf, is_inter flag.
+// Intra blocks then write their intra prediction modes; inter blocks write
+// reference frames, inter mode, DRL index, motion vectors, inter-intra info,
+// motion mode, compound type info and the interpolation filter.
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const uint8_t segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int allow_hp = cm->features.allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ // Pre-skip pass: writes the segment id only if it is signalled before the
+ // skip flag.
+ write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm));
+ // No skip flag is written for a skip_mode block; skip is taken to be 1.
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ // Post-skip pass: writes the segment id only if it is signalled after the
+ // skip flag.
+ write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0);
+
+ write_cdef(cm, xd, w, skip);
+
+ write_delta_q_params(cm, xd, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ // Nothing further is written for a skip_mode block.
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cm, xd, 0, w);
+ } else {
+ int16_t mode_ctx;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ write_ref_frames(cm, xd, w);
+
+ mode_ctx =
+ mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame);
+
+ // If segment skip is not enabled code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(xd, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+ if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+ write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+ }
+
+ // Motion vectors: only the NEW component(s) of the mode are passed to
+ // av1_encode_mv(), each together with its reference MV from get_ref_mv().
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, ref);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 1);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 0);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+
+ // Inter-intra: flag, then mode and optional wedge parameters. Not coded
+ // when the frame's reference mode is COMPOUND_REFERENCE, when the sequence
+ // disables inter-intra, or when is_interintra_allowed() rejects the block.
+ if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE &&
+ cpi->common.seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
+ if (interintra) {
+ aom_write_symbol(w, mbmi->interintra_mode,
+ ec_ctx->interintra_mode_cdf[bsize_group],
+ INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+ aom_write_symbol(w, mbmi->use_wedge_interintra,
+ ec_ctx->wedge_interintra_cdf[bsize], 2);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_symbol(w, mbmi->interintra_wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ }
+ }
+ }
+ }
+
+ // Motion mode is not written for inter-intra blocks.
+ if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
+
+ // First write idx to indicate current compound inter prediction mode group
+ // Group A (0): dist_wtd_comp, compound_average
+ // Group B (1): interintra, compound_diffwtd, wedge
+ if (has_second_ref(mbmi)) {
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ aom_write_symbol(w, mbmi->comp_group_idx,
+ ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+ } else {
+ assert(mbmi->comp_group_idx == 0);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (mbmi->compound_idx)
+ assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+ // compound_idx is only coded when distance-weighted compound is enabled
+ // for the sequence; otherwise it must be 1.
+ if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ aom_write_symbol(w, mbmi->compound_idx,
+ ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+ } else {
+ assert(mbmi->compound_idx == 1);
+ }
+ } else {
+ assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+ // compound_diffwtd, wedge
+ assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+ // The masked compound type is only coded when wedge is usable for this
+ // block size; otherwise only COMPOUND_DIFFWTD can reach the else branch
+ // below (see the assert in the wedge branch).
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+ aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ ec_ctx->compound_type_cdf[bsize],
+ MASKED_COMPOUND_TYPES);
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ aom_write_literal(w, mbmi->interinter_comp.mask_type,
+ MAX_DIFFWTD_MASK_BITS);
+ }
+ }
+ }
+ write_mb_interp_filter(cm, td, w);
+ }
+}
+
+// Writes the use_intrabc flag of the current block and, when the block uses
+// intra block copy, its displacement vector. The vector is passed to
+// av1_encode_dv() together with the first entry of the block's reference MV
+// stack.
+static AOM_INLINE void write_intrabc_info(
+    MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+    aom_writer *w) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_intrabc = is_intrabc_block(mbmi);
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+
+  aom_write_symbol(w, is_intrabc, fc->intrabc_cdf, 2);
+  if (!is_intrabc) return;
+
+  // An intrabc block is expected to carry these fixed mode fields.
+  assert(mbmi->mode == DC_PRED);
+  assert(mbmi->uv_mode == UV_DC_PRED);
+  assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+  int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv;
+  av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc);
+}
+
+// Writes the mode info syntax of the current block (xd->mi[0]); called from
+// write_mbmi_b() for intra-only frames. Order: segment id (if signalled
+// before skip), skip flag, segment id (if signalled after skip), CDEF
+// strength, delta q/lf, intrabc info (when allowed), intra prediction modes.
+static AOM_INLINE void write_mb_modes_kf(
+    AV1_COMP *cpi, MACROBLOCKD *xd,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+  AV1_COMMON *const cm = &cpi->common;
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+  const struct segmentation *const seg = &cm->seg;
+  struct segmentation_probs *const segp = &fc->seg;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Segment id ahead of the skip flag.
+  if (seg->update_map && seg->segid_preskip) {
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+  }
+
+  const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+  // Segment id after the skip flag; the writer is told whether we skipped.
+  if (seg->update_map && !seg->segid_preskip) {
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, skip);
+  }
+
+  write_cdef(cm, xd, w, skip);
+  write_delta_q_params(cm, xd, skip, w);
+
+  if (av1_allow_intrabc(cm)) {
+    write_intrabc_info(xd, mbmi_ext_frame, w);
+    // An intrabc block carries no intra prediction modes.
+    if (is_intrabc_block(mbmi)) return;
+  }
+
+  write_intra_prediction_modes(cm, xd, 1, w);
+}
+
+#if CONFIG_RD_DEBUG
+// Debug helper (CONFIG_RD_DEBUG only): prints the position, block size,
+// transform size and prediction mode of a mode info to stdout, one field per
+// line. The output is phrased as a chain of "a == b && ..." conditions.
+static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
+ printf("\nmi->mi_row == %d\n", mi->mi_row);
+ printf("&& mi->mi_col == %d\n", mi->mi_col);
+ printf("&& mi->bsize == %d\n", mi->bsize);
+ printf("&& mi->tx_size == %d\n", mi->tx_size);
+ printf("&& mi->mode == %d\n", mi->mode);
+}
+
+// Debug helper (CONFIG_RD_DEBUG only): compares the coefficient cost recorded
+// in rd_stats for the given plane with the cost accumulated in token_stats.
+// Returns 0 when they agree; otherwise prints both values and returns 1.
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+                                   int plane) {
+  const int costs_agree =
+      rd_stats->txb_coeff_cost[plane] == token_stats->cost;
+  if (costs_agree) return 0;
+
+  printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+         plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+  return 1;
+}
+#endif
+
+#if ENC_MISMATCH_DEBUG
+// Debug helper (ENC_MISMATCH_DEBUG only): prints one line describing the
+// inter block at (mi_row, mi_col) to stdout. Nothing is printed unless the
+// block is an inter block, the frame number equals FRAME_TO_CHECK and the
+// frame is shown.
+static AOM_INLINE void enc_dump_logs(
+    const AV1_COMMON *const cm,
+    const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const MB_MODE_INFO *const mbmi =
+      mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col];
+  const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame =
+      &mbmi_ext_info->frame_base[get_mi_ext_idx(mi_row, mi_col,
+                                                mi_params->mi_alloc_bsize,
+                                                mbmi_ext_info->stride)];
+
+  if (!is_inter_block(mbmi)) return;
+
+#define FRAME_TO_CHECK 11
+  if (cm->current_frame.frame_number != FRAME_TO_CHECK ||
+      cm->show_frame != 1) {
+    return;
+  }
+
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int is_comp_ref = has_second_ref(mbmi);
+
+  // mv[1] stays zero for single-reference blocks.
+  int_mv mv[2] = { 0 };
+  mv[0].as_mv = mbmi->mv[0].as_mv;
+  if (is_comp_ref) {
+    mv[1].as_mv = mbmi->mv[1].as_mv;
+  } else {
+    mv[1].as_int = 0;
+  }
+
+  int16_t mode_ctx = 0;
+  if (!is_comp_ref) {
+    mode_ctx =
+        mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame);
+  }
+
+  // Sub-contexts that do not apply to the block's mode are reported as -1.
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+  int16_t zeromv_ctx = -1;
+  int16_t refmv_ctx = -1;
+  if (mbmi->mode != NEWMV) {
+    zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    if (mbmi->mode != GLOBALMV) {
+      refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+    }
+  }
+
+  printf(
+      "=== ENCODER ===: "
+      "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+      "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+      "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+      "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+      cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode,
+      mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+      mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+      mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+      zeromv_ctx, refmv_ctx, mbmi->tx_size);
+}
+#endif // ENC_MISMATCH_DEBUG
+
+// Writes the mode info of the current block, dispatching to the intra-only
+// writer (write_mb_modes_kf) or the inter writer (pack_inter_mode_mvs)
+// depending on the frame type.
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td,
+                                    aom_writer *w) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  if (frame_is_intra_only(cm)) {
+    write_mb_modes_kf(cpi, xd, x->mbmi_ext_frame, w);
+    return;
+  }
+
+  // has_subpel_mv_component needs the ref frame buffers set up to look
+  // up if they are scaled. has_subpel_mv_component is in turn needed by
+  // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+#if ENC_MISMATCH_DEBUG
+  enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
+#endif  // ENC_MISMATCH_DEBUG
+
+  pack_inter_mode_mvs(cpi, td, w);
+}
+
+// Writes the transform block tokens of one plane of an inter block for the
+// BLOCK_64X64-sized unit that starts at (row, col), given in luma mi units.
+// The unit is walked in steps of the largest transform size for the plane;
+// *block is advanced by the number of 4x4 units in that transform after each
+// step.
+static AOM_INLINE void write_inter_txb_coeff(
+    AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
+    aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end,
+    TOKEN_STATS *token_stats, const int row, const int col, int *block,
+    const int plane) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  assert(bsize < BLOCK_SIZES_ALL);
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+
+  // Step sizes, in 4x4 units, of the largest transform for this plane.
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+  const int tx_cols = tx_size_wide_unit[max_tx_size];
+  const int tx_rows = tx_size_high_unit[max_tx_size];
+  const int block_step = tx_cols * tx_rows;
+
+  // Extent of this unit in the plane, clipped to the plane block size.
+  const BLOCK_SIZE max_unit_bsize =
+      get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+  const int first_row = row >> ss_y;
+  const int first_col = col >> ss_x;
+  const int plane_rows = mi_size_high[plane_bsize];
+  const int plane_cols = mi_size_wide[plane_bsize];
+  const int unit_rows = mi_size_high[max_unit_bsize];
+  const int unit_cols = mi_size_wide[max_unit_bsize];
+  const int row_end = AOMMIN(unit_rows + first_row, plane_rows);
+  const int col_end = AOMMIN(unit_cols + first_col, plane_cols);
+
+  for (int r = first_row; r < row_end; r += tx_rows) {
+    for (int c = first_col; c < col_end; c += tx_cols) {
+      pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+                      cm->seq_params->bit_depth, *block, r, c, max_tx_size,
+                      token_stats);
+      *block += block_step;
+    }
+  }
+}
+
+// Writes the coefficient tokens of the current block (xd->mi[0]), which must
+// not be a skip_txfm block. Intra blocks are handed to
+// av1_write_intra_coeffs_mb(); inter blocks are walked in units of at most
+// BLOCK_64X64, and within each unit every coded plane is written through
+// write_inter_txb_coeff().
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x,
+                                      aom_writer *w, const TokenExtra **tok,
+                                      const TokenExtra *const tok_end) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+
+  assert(!mbmi->skip_txfm);
+
+  if (!is_inter_block(mbmi)) {
+    av1_write_intra_coeffs_mb(cm, x, w, bsize);
+    return;
+  }
+
+  // Running transform block index, one per plane.
+  int block[MAX_MB_PLANE] = { 0 };
+  assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                       xd->plane[0].subsampling_y));
+  const int blk_cols = mi_size_wide[bsize];
+  const int blk_rows = mi_size_high[bsize];
+  TOKEN_STATS token_stats;
+  init_token_stats(&token_stats);
+
+  const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+  assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64,
+                                                xd->plane[0].subsampling_x,
+                                                xd->plane[0].subsampling_y));
+  // Unit step in mi units, never larger than the block itself.
+  const int unit_cols = AOMMIN(blk_cols, mi_size_wide[max_unit_bsize]);
+  const int unit_rows = AOMMIN(blk_rows, mi_size_high[max_unit_bsize]);
+
+  const int num_planes = av1_num_planes(cm);
+  for (int row = 0; row < blk_rows; row += unit_rows) {
+    for (int col = 0; col < blk_cols; col += unit_cols) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        // Chroma planes are only written for chroma reference blocks.
+        if (plane && !xd->is_chroma_ref) break;
+        write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row,
+                              col, &block[plane], plane);
+      }
+    }
+  }
+#if CONFIG_RD_DEBUG
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (mbmi->bsize >= BLOCK_8X8 &&
+        rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+      dump_mode_info(mbmi);
+      assert(0);
+    }
+  }
+#endif  // CONFIG_RD_DEBUG
+}
+
+// Writes one coding block located at (mi_row, mi_col).
+// Steps: point xd and td->mb at the block's mode info, extended mode info and
+// tx type map; set up position and transform contexts; write the mode info
+// (write_mbmi_b); write palette color map tokens; write the transform size
+// (or just update the transform contexts); and finally, unless the block is
+// skip_txfm, write the coefficient tokens.
+// td->coefficient_size is increased by the growth of aom_tell_size(w) across
+// the coefficient write.
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
+ const TileInfo *const tile, aom_writer *w,
+ const TokenExtra **tok,
+ const TokenExtra *const tok_end,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *xd = &td->mb.e_mbd;
+ FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
+ // Make xd->mi[0] refer to this block, and locate its per-block side data.
+ const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
+ xd->mi = mi_params->mi_grid_base + grid_idx;
+ td->mb.mbmi_ext_frame =
+ cpi->mbmi_ext_info.frame_base +
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+ cpi->mbmi_ext_info.stride);
+ xd->tx_type_map = mi_params->tx_type_map + grid_idx;
+ xd->tx_type_map_stride = mi_params->mi_stride;
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize <= cm->seq_params->sb_size ||
+ (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
+
+ const int bh = mi_size_high[bsize];
+ const int bw = mi_size_wide[bsize];
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+ mi_params->mi_cols);
+
+ // Above context is indexed by absolute column within the tile row's buffer;
+ // left context by the row masked with MAX_MIB_MASK.
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ write_mbmi_b(cpi, td, w);
+
+ // Palette color index maps: at most two passes (plane 0, then plane 1 when
+ // the frame has more than one plane), written only for a non-zero palette
+ // size.
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ assert(!mbmi->skip_mode || !palette_size_plane);
+ if (palette_size_plane > 0) {
+ assert(mbmi->use_intrabc == 0);
+ assert(av1_allow_palette(cm->features.allow_screen_content_tools,
+ mbmi->bsize));
+ assert(!plane || xd->is_chroma_ref);
+ int rows, cols;
+ av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ MapCdf map_pb_cdf = plane ? tile_ctx->palette_uv_color_index_cdf
+ : tile_ctx->palette_y_color_index_cdf;
+ pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf);
+ }
+ }
+
+ // Transform size. It is only coded when the frame uses TX_MODE_SELECT, the
+ // block size signals a tx size, the block is not a skipped inter block and
+ // the segment is not lossless; in every other case only the transform
+ // contexts are updated.
+ const int is_inter_tx = is_inter_block(mbmi);
+ const int skip_txfm = mbmi->skip_txfm;
+ const uint8_t segment_id = mbmi->segment_id;
+ if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
+ if (is_inter_tx) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int txbh = tx_size_high_unit[max_tx_size];
+ const int txbw = tx_size_wide_unit[max_tx_size];
+ const int width = mi_size_wide[bsize];
+ const int height = mi_size_high[bsize];
+ for (int idy = 0; idy < height; idy += txbh) {
+ for (int idx = 0; idx < width; idx += txbw) {
+ write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+ }
+ }
+ } else {
+ write_selected_tx_size(xd, w);
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+ skip_txfm && is_inter_tx, xd);
+ }
+
+ // Coefficients, with the writer's size growth added to the thread's
+ // coefficient size counter.
+ if (!mbmi->skip_txfm) {
+ int start = aom_tell_size(w);
+
+ write_tokens_b(cpi, &td->mb, w, tok, tok_end);
+
+ const int end = aom_tell_size(w);
+ td->coefficient_size += end - start;
+ }
+}
+
+// Writes the partition type p of the bsize block at (mi_row, mi_col).
+// Nothing is written for blocks smaller than BLOCK_8X8, nor when the block's
+// second half (offset hbs) lies outside the frame in both directions, in
+// which case the partition must be PARTITION_SPLIT. When only one direction
+// is inside the frame, a binary "is split" decision is coded with a CDF
+// gathered from the full partition CDF.
+static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
+                                       const MACROBLOCKD *const xd, int hbs,
+                                       int mi_row, int mi_col, PARTITION_TYPE p,
+                                       BLOCK_SIZE bsize, aom_writer *w) {
+  if (bsize < BLOCK_8X8) return;
+
+  const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+
+  if (!has_rows && !has_cols) {
+    assert(p == PARTITION_SPLIT);
+    return;
+  }
+
+  if (has_rows && has_cols) {
+    aom_write_symbol(w, p, fc->partition_cdf[ctx],
+                     partition_cdf_length(bsize));
+    return;
+  }
+
+  // Exactly one of has_rows / has_cols holds from here on.
+  aom_cdf_prob split_cdf[2];
+  if (has_cols) {
+    assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+    assert(bsize > BLOCK_8X8);
+    partition_gather_vert_alike(split_cdf, fc->partition_cdf[ctx], bsize);
+  } else {
+    assert(has_rows && !has_cols);
+    assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+    assert(bsize > BLOCK_8X8);
+    partition_gather_horz_alike(split_cdf, fc->partition_cdf[ctx], bsize);
+  }
+  aom_write_cdf(w, p == PARTITION_SPLIT, split_cdf, 2);
+}
+
+// Recursively writes the bsize block at (mi_row, mi_col): loop restoration
+// coefficients for restoration units reported for this block (omitted in
+// CONFIG_REALTIME_ONLY builds), the partition type, and then every
+// sub-block of the partition. PARTITION_SPLIT recurses with the sub-size;
+// all other partition types write their coding blocks through
+// write_modes_b(). The partition context is updated at the end.
+// Returns without writing anything when (mi_row, mi_col) is outside the frame.
+static AOM_INLINE void write_modes_sb(
+ AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile,
+ aom_writer *const w, const TokenExtra **tok,
+ const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ // NOTE(review): get_partition() is called before the frame bounds check
+ // below; presumably it tolerates out-of-frame positions - confirm.
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+#if !CONFIG_REALTIME_ONLY
+ // Loop restoration coefficients are written ahead of the partition, for
+ // every restoration unit in the range that
+ // av1_loop_restoration_corners_in_sb() reports for this block.
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane,
+ td->counts);
+ }
+ }
+ }
+ }
+#endif
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+ // For HORZ, VERT, HORZ_4 and VERT_4, sub-blocks that start outside the frame
+ // are not written.
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < mi_params->mi_rows)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < mi_params->mi_cols)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+ case PARTITION_HORZ_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+
+ // update partition context
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Sets *tok / *tok_end to the start and end of the token list recorded for
+// the given superblock row of a tile. Both are set to NULL when token_info
+// has not been allocated.
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info,
+                                          const int tile_row, int tile_col,
+                                          const int sb_row_in_tile,
+                                          const TokenExtra **tok,
+                                          const TokenExtra **tok_end) {
+  const TokenExtra *first = NULL;
+  const TokenExtra *past_last = NULL;
+
+  if (is_token_info_allocated(token_info)) {
+    first = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start;
+    past_last =
+        first + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count;
+  }
+
+  *tok = first;
+  *tok_end = past_last;
+}
+
+// Writes every superblock of one tile in raster order. The above context is
+// reset once per tile and the left context once per superblock row; the
+// delta q / delta loop filter predictors in xd are reset at the start of the
+// tile when those tools are enabled. Each superblock row consumes exactly
+// the token list obtained from get_token_pointers() for that row.
+static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td,
+                                   const TileInfo *const tile,
+                                   aom_writer *const w, int tile_row,
+                                   int tile_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  const TokenInfo *const token_info = &cpi->token_info;
+  const int row_begin = tile->mi_row_start;
+  const int row_limit = tile->mi_row_end;
+  const int col_begin = tile->mi_col_start;
+  const int col_limit = tile->mi_col_end;
+  const int num_planes = av1_num_planes(cm);
+
+  av1_zero_above_context(cm, xd, col_begin, col_limit, tile->tile_row);
+  av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
+
+  const DeltaQInfo *const dq = &cm->delta_q_info;
+  if (dq->delta_q_present_flag) {
+    xd->current_base_qindex = cm->quant_params.base_qindex;
+    if (dq->delta_lf_present_flag) {
+      av1_reset_loop_filter_delta(xd, num_planes);
+    }
+  }
+
+  int mi_row = row_begin;
+  while (mi_row < row_limit) {
+    const int sb_row_in_tile =
+        (mi_row - row_begin) >> cm->seq_params->mib_size_log2;
+    const TokenExtra *tok;
+    const TokenExtra *tok_end;
+    get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok,
+                       &tok_end);
+
+    av1_zero_left_context(xd);
+
+    int mi_col = col_begin;
+    while (mi_col < col_limit) {
+      td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+      write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col,
+                     cm->seq_params->sb_size);
+      mi_col += cm->seq_params->mib_size;
+    }
+    // Every token of the row must have been consumed.
+    assert(tok == tok_end);
+
+    mi_row += cm->seq_params->mib_size;
+  }
+}
+
+// Writes the frame-level loop restoration parameters to the bit buffer wb.
+// Nothing is written when the sequence disables restoration or when the frame
+// allows intrabc. Otherwise, in order:
+// 1. two bits per plane giving the restoration type (in write order:
+// NONE = 0,0; WIENER = 1,0; SGRPROJ = 1,1; SWITCHABLE = 0,1);
+// 2. if any plane uses restoration, the luma restoration unit size: one bit
+// for "> 64" (only with 64x64 superblocks) and, if the size is > 64, one
+// bit for "> 128";
+// 3. if there is more than one plane, both chroma subsampling values are
+// non-zero and a chroma plane uses restoration, one bit telling whether the
+// chroma unit size differs from the luma unit size.
+static AOM_INLINE void encode_restoration_mode(
+ AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.all_lossless);
+ if (!cm->seq_params->enable_restoration) return;
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ // all_none: no plane uses restoration. chroma_none: no plane other than
+ // plane 0 uses restoration.
+ int all_none = 1, chroma_none = 1;
+ for (int p = 0; p < num_planes; ++p) {
+ RestorationInfo *rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ all_none = 0;
+ chroma_none &= p == 0;
+ }
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SGRPROJ:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+ }
+ if (!all_none) {
+ assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+ cm->seq_params->sb_size == BLOCK_128X128);
+ const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
+
+ RestorationInfo *rsi = &cm->rst_info[0];
+
+ assert(rsi->restoration_unit_size >= sb_size);
+ assert(RESTORATION_UNITSIZE_MAX == 256);
+
+ // With 128x128 superblocks the unit size is asserted to be at least 128, so
+ // the "> 64" bit is not written in that case.
+ if (sb_size == 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 64);
+ }
+ if (rsi->restoration_unit_size > 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 128);
+ }
+ }
+
+ if (num_planes > 1) {
+ // s is non-zero only when chroma is subsampled in both directions.
+ int s =
+ AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
+ if (s && !chroma_none) {
+ aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size ||
+ cm->rst_info[1].restoration_unit_size ==
+ (cm->rst_info[0].restoration_unit_size >> s));
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ } else if (!s) {
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+ // Codes the taps of one Wiener filter into wb.
+ //
+ // The vertical kernel is written first, then the horizontal one. For each
+ // kernel, taps 0..2 are coded with aom_write_primitive_refsubexpfin(), using
+ // the same tap of ref_wiener_info as the reference value. Tap 0 is only coded
+ // when wiener_win == WIENER_WIN; otherwise the outer taps are asserted to be
+ // zero. On return *ref_wiener_info has been overwritten with *wiener_info.
+ static AOM_INLINE void write_wiener_filter(int wiener_win,
+                                            const WienerInfo *wiener_info,
+                                            WienerInfo *ref_wiener_info,
+                                            aom_writer *wb) {
+   // Number of distinct values each tap can take.
+   const int tap0_range = WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1;
+   const int tap1_range = WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1;
+   const int tap2_range = WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1;
+   const int has_outer_tap = (wiener_win == WIENER_WIN);
+
+   // Vertical kernel.
+   if (has_outer_tap) {
+     aom_write_primitive_refsubexpfin(
+         wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+         ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+         wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+   } else {
+     assert(wiener_info->vfilter[0] == 0 &&
+            wiener_info->vfilter[WIENER_WIN - 1] == 0);
+   }
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // Horizontal kernel.
+   if (has_outer_tap) {
+     aom_write_primitive_refsubexpfin(
+         wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+         ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+         wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+   } else {
+     assert(wiener_info->hfilter[0] == 0 &&
+            wiener_info->hfilter[WIENER_WIN - 1] == 0);
+   }
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // The filter just written becomes the reference for the next one.
+   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+ }
+
+ // Codes one self-guided projection filter into wb.
+ //
+ // The parameter-set index ep is written as a literal first. The projection
+ // coefficients xqd[0] and xqd[1] follow (in that order), each coded relative to
+ // ref_sgrproj_info, but a coefficient is skipped when the selected parameter
+ // set makes it unnecessary:
+ //   r[0] == 0            -> only xqd[1] is coded (xqd[0] is asserted zero)
+ //   r[0] != 0, r[1] == 0 -> only xqd[0] is coded
+ //   otherwise            -> both are coded
+ // On return *ref_sgrproj_info has been overwritten with *sgrproj_info.
+ static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+                                             SgrprojInfo *ref_sgrproj_info,
+                                             aom_writer *wb) {
+   aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+
+   const sgr_params_type *const set = &av1_sgr_params[sgrproj_info->ep];
+   const int code_xqd0 = (set->r[0] != 0);
+   const int code_xqd1 = (set->r[0] == 0) || (set->r[1] != 0);
+
+   if (code_xqd0) {
+     aom_write_primitive_refsubexpfin(
+         wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+         ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+         sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+   } else {
+     assert(sgrproj_info->xqd[0] == 0);
+   }
+
+   if (code_xqd1) {
+     aom_write_primitive_refsubexpfin(
+         wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+         ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+         sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+   }
+
+   // The filter just written becomes the reference for the next one.
+   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+ }
+ /* Writes the loop-restoration side information of one restoration unit.
+  *
+  * cm        : frame state; rst_info[plane] supplies the frame-level
+  *             restoration type and the per-unit info for runit_idx.
+  * xd        : supplies the reference Wiener / self-guided filters for the
+  *             plane (updated by the write_*_filter helpers) and the CDFs in
+  *             tile_ctx used to code the unit type.
+  * runit_idx : index of the restoration unit within the plane.
+  * w         : symbol writer receiving the output.
+  * plane     : plane index; planes > 0 use WIENER_WIN_CHROMA.
+  * counts    : only touched when CONFIG_ENTROPY_STATS is enabled.
+  *
+  * The frame-level type must not be RESTORE_NONE and the frame must not be
+  * all-lossless (both are asserted).
+  */
+ static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts) {
+ const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx];
+ const RestorationInfo *rsi = cm->rst_info + plane;
+ RestorationType frame_rtype = rsi->frame_restoration_type;
+ assert(frame_rtype != RESTORE_NONE);
+
+ (void)counts;  // silences unused-parameter warnings when stats are off
+ assert(!cm->features.all_lossless);
+
+ const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;  // chroma planes use the chroma window
+ WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
+ SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane];
+ RestorationType unit_rtype = rui->restoration_type;
+
+ if (frame_rtype == RESTORE_SWITCHABLE) {  // unit type itself is coded as a multi-symbol
+ aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ ++counts->switchable_restore[unit_rtype];
+#endif
+ switch (unit_rtype) {
+ case RESTORE_WIENER:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ ref_wiener_info,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ break;
+ case RESTORE_SGRPROJ:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(&ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx]
+ .sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ break;
+ default: assert(unit_rtype == RESTORE_NONE); break;  // no filter parameters follow
+ }
+ } else if (frame_rtype == RESTORE_WIENER) {  // binary on/off symbol, then the taps if on
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->wiener_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(
+ !memcmp(ref_wiener_info,
+ &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ }
+ } else if (frame_rtype == RESTORE_SGRPROJ) {  // binary on/off symbol, then the projection if on
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->sgrproj_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ &ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Decides whether the loop filter ref/mode delta section should be written.
+ //
+ // Returns true only if lf->mode_ref_delta_update is set AND at least one entry
+ // of lf->ref_deltas or lf->mode_deltas differs from the values it would be
+ // predicted from: those stored in the primary reference frame buffer or, when
+ // there is none, the defaults produced by av1_set_default_ref_deltas() and
+ // av1_set_default_mode_deltas(). Returns false otherwise, in which case no
+ // delta needs to be signalled.
+ static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
+   struct loopfilter *lf = &cm->lf;
+   if (!lf->mode_ref_delta_update) {
+     return false;
+   }
+
+   // Build the baseline the deltas are compared against.
+   const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+   int8_t last_ref_deltas[REF_FRAMES];
+   int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+   if (buf == NULL) {
+     av1_set_default_ref_deltas(last_ref_deltas);
+     av1_set_default_mode_deltas(last_mode_deltas);
+   } else {
+     memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+     memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+   }
+
+   // Any single difference makes the section worth writing.
+   for (int i = 0; i < REF_FRAMES; i++) {
+     if (lf->ref_deltas[i] != last_ref_deltas[i]) {
+       return true;
+     }
+   }
+   for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+     if (lf->mode_deltas[i] != last_mode_deltas[i]) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ // Writes the frame-level deblocking loop filter parameters to wb.
+ //
+ // Nothing is written when intra block copy is allowed. Otherwise the two luma
+ // filter levels are written, followed by the U and V levels (multi-plane frames
+ // with a non-zero luma level only), the sharpness level, the
+ // mode_ref_delta_enabled flag and a flag telling whether delta updates follow.
+ // When they do, every ref delta and then every mode delta is written as a
+ // "changed" bit plus, if changed, the new value as a signed literal.
+ static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
+                                          struct aom_write_bit_buffer *wb) {
+   assert(!cm->features.coded_lossless);
+   if (cm->features.allow_intrabc) return;
+
+   const int has_chroma = av1_num_planes(cm) > 1;
+   struct loopfilter *const lf = &cm->lf;
+
+   // Filter levels and sharpness.
+   aom_wb_write_literal(wb, lf->filter_level[0], 6);
+   aom_wb_write_literal(wb, lf->filter_level[1], 6);
+   if (has_chroma && (lf->filter_level[0] || lf->filter_level[1])) {
+     aom_wb_write_literal(wb, lf->filter_level_u, 6);
+     aom_wb_write_literal(wb, lf->filter_level_v, 6);
+   }
+   aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+   aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+   // The delta section is only sent when at least one delta actually changed.
+   const int send_deltas = is_mode_ref_delta_meaningful(cm);
+   aom_wb_write_bit(wb, send_deltas);
+   if (send_deltas) {
+     // Baseline: deltas of the primary reference frame, or the defaults.
+     int8_t prev_ref_deltas[REF_FRAMES];
+     int8_t prev_mode_deltas[MAX_MODE_LF_DELTAS];
+     const RefCntBuffer *const primary = get_primary_ref_frame_buf(cm);
+     if (primary != NULL) {
+       memcpy(prev_ref_deltas, primary->ref_deltas, REF_FRAMES);
+       memcpy(prev_mode_deltas, primary->mode_deltas, MAX_MODE_LF_DELTAS);
+     } else {
+       av1_set_default_ref_deltas(prev_ref_deltas);
+       av1_set_default_mode_deltas(prev_mode_deltas);
+     }
+
+     for (int ref = 0; ref < REF_FRAMES; ++ref) {
+       const int value = lf->ref_deltas[ref];
+       const int differs = (value != prev_ref_deltas[ref]);
+       aom_wb_write_bit(wb, differs);
+       if (differs) aom_wb_write_inv_signed_literal(wb, value, 6);
+     }
+     for (int mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+       const int value = lf->mode_deltas[mode];
+       const int differs = (value != prev_mode_deltas[mode]);
+       aom_wb_write_bit(wb, differs);
+       if (differs) aom_wb_write_inv_signed_literal(wb, value, 6);
+     }
+   }
+ }
+
+ // Writes the frame-level CDEF parameters to wb.
+ //
+ // Nothing is written when CDEF is disabled for the sequence or when intra
+ // block copy is allowed. Otherwise: damping (stored minus 3), cdef_bits, and
+ // for each of the nb_cdef_strengths entries the luma strength followed, on
+ // multi-plane frames, by the chroma strength.
+ static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
+                                    struct aom_write_bit_buffer *wb) {
+   assert(!cm->features.coded_lossless);
+   if (!cm->seq_params->enable_cdef || cm->features.allow_intrabc) return;
+
+   const int has_chroma = av1_num_planes(cm) > 1;
+
+   aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2);
+   aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2);
+   for (int idx = 0; idx < cm->cdef_info.nb_cdef_strengths; ++idx) {
+     aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[idx],
+                          CDEF_STRENGTH_BITS);
+     if (has_chroma) {
+       aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[idx],
+                            CDEF_STRENGTH_BITS);
+     }
+   }
+ }
+
+ // Writes one quantizer delta: a presence bit, followed by the value as a
+ // signed literal only when the delta is non-zero.
+ static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb,
+                                      int delta_q) {
+   const int is_present = (delta_q != 0);
+   aom_wb_write_bit(wb, is_present);
+   if (is_present) aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ }
+
+ // Writes the frame quantization parameters to wb.
+ //
+ // Order: base_qindex, luma DC delta, then (multi-plane only) an optional
+ // "U and V differ" bit (sent only if separate_uv_delta_q), the U DC/AC deltas
+ // and, when U and V differ, the V DC/AC deltas. Finally the using_qmatrix flag
+ // and, if set, the Y and U matrix levels plus the V level when
+ // separate_uv_delta_q is set (otherwise V is asserted equal to U).
+ static AOM_INLINE void encode_quantization(
+     const CommonQuantParams *const quant_params, int num_planes,
+     bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) {
+   aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS);
+   write_delta_q(wb, quant_params->y_dc_delta_q);
+
+   if (num_planes > 1) {
+     const int uv_differ =
+         quant_params->u_dc_delta_q != quant_params->v_dc_delta_q ||
+         quant_params->u_ac_delta_q != quant_params->v_ac_delta_q;
+     if (separate_uv_delta_q) {
+       aom_wb_write_bit(wb, uv_differ);
+     }
+     write_delta_q(wb, quant_params->u_dc_delta_q);
+     write_delta_q(wb, quant_params->u_ac_delta_q);
+     if (uv_differ) {
+       write_delta_q(wb, quant_params->v_dc_delta_q);
+       write_delta_q(wb, quant_params->v_ac_delta_q);
+     }
+   }
+
+   aom_wb_write_bit(wb, quant_params->using_qmatrix);
+   if (!quant_params->using_qmatrix) return;
+
+   aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS);
+   aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS);
+   if (separate_uv_delta_q) {
+     aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS);
+   } else {
+     assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v);
+   }
+ }
+
+ // Writes the segmentation parameters to wb.
+ //
+ // The enabled flag always goes out. When segmentation is enabled and the frame
+ // has a primary reference frame, the update_map / temporal_update /
+ // update_data flags follow. When update_data is set, every (segment, feature)
+ // pair gets an "active" bit and, if active, its value clamped to the feature's
+ // range and written as a signed or unsigned literal depending on the feature.
+ static AOM_INLINE void encode_segmentation(AV1_COMMON *cm,
+                                            struct aom_write_bit_buffer *wb) {
+   struct segmentation *const seg = &cm->seg;
+
+   aom_wb_write_bit(wb, seg->enabled);
+   if (!seg->enabled) return;
+
+   // Update flags are only coded when there is a primary reference frame.
+   if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
+     aom_wb_write_bit(wb, seg->update_map);
+     if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update);
+     aom_wb_write_bit(wb, seg->update_data);
+   }
+
+   if (!seg->update_data) return;
+
+   // Per-segment feature data.
+   for (int seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
+     for (int feature = 0; feature < SEG_LVL_MAX; ++feature) {
+       const int is_active = segfeature_active(seg, seg_id, feature);
+       aom_wb_write_bit(wb, is_active);
+       if (!is_active) continue;
+
+       const int limit = av1_seg_feature_data_max(feature);
+       const int num_bits = get_unsigned_bits(limit);
+       const int value =
+           clamp(get_segdata(seg, seg_id, feature), -limit, limit);
+
+       if (av1_is_segfeature_signed(feature)) {
+         aom_wb_write_inv_signed_literal(wb, value, num_bits);
+       } else {
+         aom_wb_write_literal(wb, value, num_bits);
+       }
+     }
+   }
+ }
+
+ // Writes the frame-level interpolation filter: one bit telling whether the
+ // filter is SWITCHABLE, followed by the explicit filter value when it is not.
+ static AOM_INLINE void write_frame_interp_filter(
+     InterpFilter filter, struct aom_write_bit_buffer *wb) {
+   const int is_switchable = (filter == SWITCHABLE);
+   aom_wb_write_bit(wb, is_switchable);
+   if (!is_switchable) {
+     aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+   }
+ }
+
+ // Same coding as write_uniform, but writing to the uncompressed header bit
+ // buffer wb: codes v out of n possible values. With l = get_unsigned_bits(n)
+ // and m = (1 << l) - n, values below m take l - 1 bits and the remaining
+ // values take l bits. Nothing is written when l == 0.
+ static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+                                         int v) {
+   const int num_bits = get_unsigned_bits(n);
+   if (num_bits == 0) return;
+
+   const int short_codes = (1 << num_bits) - n;
+   if (v < short_codes) {
+     aom_wb_write_literal(wb, v, num_bits - 1);
+     return;
+   }
+
+   const int excess = v - short_codes;
+   aom_wb_write_literal(wb, short_codes + (excess >> 1), num_bits - 1);
+   aom_wb_write_literal(wb, excess & 1, 1);
+ }
+
+ // Writes the tile layout to wb.
+ //
+ // A uniform_spacing bit comes first. With uniform spacing, log2_cols and
+ // log2_rows are each coded as a run of 1 bits counting the increment over the
+ // minimum, terminated by a 0 bit unless the maximum was reached. Otherwise the
+ // size of every tile column and then every tile row is coded, in superblock
+ // units minus one, with wb_write_uniform() bounded by the smaller of the
+ // remaining frame extent and the maximum tile size.
+ static AOM_INLINE void write_tile_info_max_tile(
+     const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+   const CommonTileParams *const tiles = &cm->tiles;
+
+   aom_wb_write_bit(wb, tiles->uniform_spacing);
+
+   if (!tiles->uniform_spacing) {
+     // Explicit tiles with configurable tile widths and heights.
+     int cols_left_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols,
+                                          cm->seq_params->mib_size_log2);
+     int rows_left_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
+                                          cm->seq_params->mib_size_log2);
+
+     // Columns.
+     for (int col = 0; col < tiles->cols; ++col) {
+       const int tile_width_sb =
+           tiles->col_start_sb[col + 1] - tiles->col_start_sb[col];
+       wb_write_uniform(wb, AOMMIN(cols_left_sb, tiles->max_width_sb),
+                        tile_width_sb - 1);
+       cols_left_sb -= tile_width_sb;
+     }
+     assert(cols_left_sb == 0);
+
+     // Rows.
+     for (int row = 0; row < tiles->rows; ++row) {
+       const int tile_height_sb =
+           tiles->row_start_sb[row + 1] - tiles->row_start_sb[row];
+       wb_write_uniform(wb, AOMMIN(rows_left_sb, tiles->max_height_sb),
+                        tile_height_sb - 1);
+       rows_left_sb -= tile_height_sb;
+     }
+     assert(rows_left_sb == 0);
+     return;
+   }
+
+   // Uniform spacing: columns.
+   for (int ones = tiles->log2_cols - tiles->min_log2_cols; ones != 0; --ones) {
+     aom_wb_write_bit(wb, 1);
+   }
+   if (tiles->log2_cols < tiles->max_log2_cols) {
+     aom_wb_write_bit(wb, 0);
+   }
+
+   // Uniform spacing: rows.
+   for (int ones = tiles->log2_rows - tiles->min_log2_rows; ones != 0; --ones) {
+     aom_wb_write_bit(wb, 1);
+   }
+   if (tiles->log2_rows < tiles->max_log2_rows) {
+     aom_wb_write_bit(wb, 0);
+   }
+ }
+
+ // Writes the tile layout, then snapshots the bit-buffer position into
+ // *saved_wb. When the frame has more than one tile, two further fields are
+ // written after the snapshot: a zero tile id (used for cdf update) of
+ // log2_cols + log2_rows bits, and the value 3 in 2 bits (number of bytes in
+ // the tile size, minus 1).
+ static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+                                        struct aom_write_bit_buffer *saved_wb,
+                                        struct aom_write_bit_buffer *wb) {
+   write_tile_info_max_tile(cm, wb);
+
+   *saved_wb = *wb;
+
+   const int num_tiles = cm->tiles.rows * cm->tiles.cols;
+   if (num_tiles <= 1) return;
+
+   // Tile id used for cdf update.
+   aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
+   // Number of bytes in tile size - 1.
+   aom_wb_write_literal(wb, 3, 2);
+ }
+
+ // Writes the tile size information used with extended tiles.
+ //
+ // The information is stored as a separate byte, so wb is first zero-padded to
+ // a byte boundary. The resulting position is snapshotted into *saved_wb, and
+ // for multi-tile frames two 2-bit zero fields are then written: the number of
+ // bytes in the tile column size minus 1 and the number of bytes in the tile
+ // size minus 1.
+ static AOM_INLINE void write_ext_tile_info(
+     const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+     struct aom_write_bit_buffer *wb) {
+   // Pad with zero bits up to the next byte boundary.
+   const int bits_into_byte = wb->bit_offset % CHAR_BIT;
+   if (bits_into_byte > 0) {
+     aom_wb_write_literal(wb, 0, CHAR_BIT - bits_into_byte);
+   }
+   assert(aom_wb_is_byte_aligned(wb));
+
+   *saved_wb = *wb;
+
+   const int num_tiles = cm->tiles.rows * cm->tiles.cols;
+   if (num_tiles <= 1) return;
+
+   // Note that the last item in the uncompressed header is the data
+   // describing tile configuration.
+   aom_wb_write_literal(wb, 0, 2);  // Number of bytes in tile column size - 1
+   aom_wb_write_literal(wb, 0, 2);  // Number of bytes in tile size - 1
+ }
+
+ // Looks for an already-written tile whose payload is byte-identical to the
+ // tile at (tile_row, tile_col).
+ //
+ // Each tile buffer starts with a 4-byte little-endian header followed by the
+ // payload. Bit 31 of the header is the tile-copy-mode flag; when it is set,
+ // bits 30..24 hold the row offset of the tile that is being copied.
+ //
+ // Returns the row offset (> 0, < 128) from the current tile to the identical
+ // tile, or 0 when there is none. Tiles in row 0 never have a match.
+ static INLINE int find_identical_tile(
+     const int tile_row, const int tile_col,
+     TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+   // (TODO: yunqingwang) For now, only above tile is checked and used.
+   // More candidates such as left tile can be added later.
+   const MV32 candidate_offset[1] = { { 1, 0 } };
+   const int num_candidates =
+       (int)(sizeof(candidate_offset) / sizeof(candidate_offset[0]));
+   const uint8_t *const cur_tile_data =
+       tile_buffers[tile_row][tile_col].data + 4;
+   const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+   if (tile_row == 0) return 0;
+
+   for (int i = 0; i < num_candidates; i++) {
+     // Index with i (not 0) so that additional candidates are actually used.
+     int row_offset = candidate_offset[i].row;
+     const int col_offset = candidate_offset[i].col;
+     int row = tile_row - row_offset;
+     const int col = tile_col - col_offset;
+
+     if (row < 0 || col < 0) continue;
+
+     const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
+
+     // Read out tile-copy-mode bit:
+     if ((tile_hdr >> 31) == 1) {
+       // The candidate is a copy tile itself: the offset is stored in bits
+       // 30 through 24 inclusive.
+       row_offset += (tile_hdr >> 24) & 0x7f;
+       row = tile_row - row_offset;
+     }
+
+     // Validate the final position before touching the buffer: the offset must
+     // still be representable and the row must exist.
+     if (row_offset >= 128 || row < 0) continue;
+
+     const TileBufferEnc *const candidate = &tile_buffers[row][col];
+     if (candidate->size != cur_tile_size) continue;
+     if (memcmp(candidate->data + 4, cur_tile_data, cur_tile_size) != 0)
+       continue;
+
+     // Identical tile found
+     assert(row_offset > 0);
+     return row_offset;
+   }
+
+   // No identical tile found
+   return 0;
+ }
+
+ // Writes the render size: a flag taken from av1_resize_scaled(cm), followed by
+ // render_width - 1 and render_height - 1 (16 bits each) when the flag is set.
+ static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+                                          struct aom_write_bit_buffer *wb) {
+   const int has_render_size = av1_resize_scaled(cm);
+   aom_wb_write_bit(wb, has_render_size);
+   if (!has_render_size) return;
+
+   aom_wb_write_literal(wb, cm->render_width - 1, 16);
+   aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+
+ // Writes the super-resolution scale of the frame.
+ //
+ // Nothing is written when superres is disabled for the sequence (the
+ // denominator is then asserted to equal SCALE_NUMERATOR). Otherwise one bit
+ // says whether the frame is scaled, and if so the denominator follows as an
+ // offset from SUPERRES_SCALE_DENOMINATOR_MIN in SUPERRES_SCALE_BITS bits.
+ static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+                                             struct aom_write_bit_buffer *wb) {
+   if (!cm->seq_params->enable_superres) {
+     assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+     return;
+   }
+
+   // First bit: whether to scale or not.
+   const int is_scaled = (cm->superres_scale_denominator != SCALE_NUMERATOR);
+   aom_wb_write_bit(wb, is_scaled);
+   if (!is_scaled) return;
+
+   // Scaling: write the scale factor.
+   assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+   assert(cm->superres_scale_denominator <
+          SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+   aom_wb_write_literal(
+       wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+       SUPERRES_SCALE_BITS);
+ }
+
+ // Writes the frame size information.
+ //
+ // When frame_size_override is non-zero, the upscaled width and height (each
+ // minus 1) are written using the bit counts from the sequence header. The
+ // superres scale and the render size always follow.
+ static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+                                         int frame_size_override,
+                                         struct aom_write_bit_buffer *wb) {
+   if (frame_size_override) {
+     const SequenceHeader *const seq = cm->seq_params;
+     aom_wb_write_literal(wb, cm->superres_upscaled_width - 1,
+                          seq->num_bits_width);
+     aom_wb_write_literal(wb, cm->superres_upscaled_height - 1,
+                          seq->num_bits_height);
+   }
+
+   write_superres_scale(cm, wb);
+   write_render_size(cm, wb);
+ }
+
+ // Writes the frame size, reusing the size of a reference frame when possible.
+ //
+ // For each reference from LAST_FRAME to ALTREF_FRAME one bit is written telling
+ // whether that reference has the same upscaled size and render size as the
+ // current frame. At the first match only the superres scale is written and
+ // the function is done. If no reference matches, the size is written
+ // explicitly through write_frame_size() with frame_size_override = 1.
+ static AOM_INLINE void write_frame_size_with_refs(
+     const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+   for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+        ++ref_frame) {
+     const YV12_BUFFER_CONFIG *const cfg =
+         get_ref_frame_yv12_buf(cm, ref_frame);
+
+     // A missing reference buffer never matches.
+     const int same_size =
+         cfg != NULL && cm->superres_upscaled_width == cfg->y_crop_width &&
+         cm->superres_upscaled_height == cfg->y_crop_height &&
+         cm->render_width == cfg->render_width &&
+         cm->render_height == cfg->render_height;
+
+     aom_wb_write_bit(wb, same_size);
+     if (same_size) {
+       write_superres_scale(cm, wb);
+       return;
+     }
+   }
+
+   // No reference matched: the override flag is always 1 on this path.
+   write_frame_size(cm, 1, wb);
+ }
+
+ // Writes the bitstream profile as a PROFILE_BITS-wide literal. The profile
+ // must lie in [PROFILE_0, MAX_PROFILES).
+ static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+                                      struct aom_write_bit_buffer *wb) {
+   assert(PROFILE_0 <= profile && profile < MAX_PROFILES);
+   aom_wb_write_literal(wb, profile, PROFILE_BITS);
+ }
+
+ // Writes the bit depth of the sequence.
+ //   Profile 0/1: [0] for 8 bit, [1] 10-bit
+ //   Profile 2:   [0] for 8 bit, [10] 10-bit, [11] - 12-bit
+ static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+                                       struct aom_write_bit_buffer *wb) {
+   const int is_high_bitdepth = (seq_params->bit_depth != AOM_BITS_8);
+   aom_wb_write_bit(wb, is_high_bitdepth);
+
+   // Only profile 2 needs a second bit to tell 10-bit from 12-bit.
+   if (is_high_bitdepth && seq_params->profile == PROFILE_2) {
+     aom_wb_write_bit(wb, seq_params->bit_depth != AOM_BITS_10);
+   }
+ }
+ /* Writes the color configuration of the sequence to wb.
+  *
+  * In order: bit depth, the monochrome bit (not coded for PROFILE_1), a
+  * "color description present" bit followed by the three 8-bit CICP codes
+  * when any of them is specified, then the color range, the chroma
+  * subsampling bits (only where the profile leaves a choice), the chroma
+  * sample position (4:2:0 only) and finally separate_uv_delta_q. Monochrome
+  * sequences stop after the color range. Combinations the code does not
+  * expect are caught by asserts rather than written.
+  */
+ static AOM_INLINE void write_color_config(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ write_bitdepth(seq_params, wb);
+ const int is_monochrome = seq_params->monochrome;
+ // monochrome bit
+ if (seq_params->profile != PROFILE_1)
+ aom_wb_write_bit(wb, is_monochrome);
+ else
+ assert(!is_monochrome);  // PROFILE_1 is expected to carry chroma, so no bit is sent
+ if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+ aom_wb_write_bit(wb, 0); // No color description present
+ } else {
+ aom_wb_write_bit(wb, 1); // Color description present
+ aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+ aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+ aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
+ }
+ if (is_monochrome) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ return;  // no chroma-related fields for monochrome
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);  // nothing is written in this branch
+ assert(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12));
+ } else {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ } else if (seq_params->profile == PROFILE_2) {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ // 420, 444 or 422
+ aom_wb_write_bit(wb, seq_params->subsampling_x);
+ if (seq_params->subsampling_x == 0) {
+ assert(seq_params->subsampling_y == 0 &&
+ "4:4:0 subsampling not allowed in AV1");
+ } else {
+ aom_wb_write_bit(wb, seq_params->subsampling_y);  // only coded when subsampling_x is 1
+ }
+ } else {
+ // 422 only
+ assert(seq_params->subsampling_x == 1 &&
+ seq_params->subsampling_y == 0);
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ }
+ if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+ aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
+ }
+ }
+ aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
+}
+
+ // Writes the timing info: num_units_in_display_tick and time_scale as 32-bit
+ // literals, the equal_picture_interval flag, and when that flag is set
+ // num_ticks_per_picture - 1 as a uvlc code.
+ static AOM_INLINE void write_timing_info_header(
+     const aom_timing_info_t *const timing_info,
+     struct aom_write_bit_buffer *wb) {
+   aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick,
+                                 32);
+   aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32);
+
+   aom_wb_write_bit(wb, timing_info->equal_picture_interval);
+   if (!timing_info->equal_picture_interval) return;
+
+   aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1);
+ }
+
+ // Writes the decoder model info. The three length fields are each stored as
+ // (length - 1) in 5 bits; num_units_in_decoding_tick is a 32-bit literal and
+ // sits between the buffer-delay length and the buffer-removal-time length.
+ static AOM_INLINE void write_decoder_model_info(
+     const aom_dec_model_info_t *const decoder_model_info,
+     struct aom_write_bit_buffer *wb) {
+   const aom_dec_model_info_t *const info = decoder_model_info;
+
+   aom_wb_write_literal(wb, info->encoder_decoder_buffer_delay_length - 1, 5);
+   aom_wb_write_unsigned_literal(wb, info->num_units_in_decoding_tick, 32);
+   aom_wb_write_literal(wb, info->buffer_removal_time_length - 1, 5);
+   aom_wb_write_literal(wb, info->frame_presentation_time_length - 1, 5);
+ }
+
+ // Writes the decoder model parameters of one operating point: the decoder and
+ // encoder buffer delays, each buffer_delay_length bits wide, followed by the
+ // low_delay_mode_flag bit.
+ static AOM_INLINE void write_dec_model_op_parameters(
+     const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length,
+     struct aom_write_bit_buffer *wb) {
+   const int delay_bits = buffer_delay_length;
+
+   aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay,
+                                 delay_bits);
+   aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay,
+                                 delay_bits);
+   aom_wb_write_bit(wb, op_params->low_delay_mode_flag);
+ }
+
+ // Writes the frame presentation time, using the bit width given by
+ // frame_presentation_time_length in the sequence's decoder model info.
+ static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm,
+                                          struct aom_write_bit_buffer *wb) {
+   const int pts_bits =
+       cm->seq_params->decoder_model_info.frame_presentation_time_length;
+   aom_wb_write_unsigned_literal(wb, cm->frame_presentation_time, pts_bits);
+ }
+ /* Writes the film grain parameters of the current frame to wb.
+  *
+  * The parameters come from cm->cur_frame->film_grain_params. Only the
+  * apply_grain bit is written when grain is off. Otherwise the random seed
+  * follows and, on inter frames, the update_parameters bit. Without an
+  * update, the index of a reference slot holding equivalent parameters is
+  * written and the function returns. With an update, the full parameter set
+  * is written: scaling points, scaling shift, AR lag and coefficients, AR
+  * shift, grain scale shift, chroma multipliers/offsets, and the overlap and
+  * clip flags.
+  */
+ static AOM_INLINE void write_film_grain_params(
+ const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
+ aom_wb_write_bit(wb, pars->apply_grain);
+ if (!pars->apply_grain) return;
+
+ aom_wb_write_literal(wb, pars->random_seed, 16);
+
+ if (cm->current_frame.frame_type == INTER_FRAME)
+ aom_wb_write_bit(wb, pars->update_parameters);  // the flag is only coded on inter frames
+
+ if (!pars->update_parameters) {
+ int ref_frame, ref_idx;
+ for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {  // stop at the first reference with equivalent grain parameters
+ ref_idx = get_ref_frame_map_idx(cm, ref_frame);
+ assert(ref_idx != INVALID_IDX);
+ const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
+ if (buf->film_grain_params_present &&
+ aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
+ break;
+ }
+ }
+ assert(ref_frame < REF_FRAMES);  // a matching reference is expected to exist
+ aom_wb_write_literal(wb, ref_idx, 3);
+ return;
+ }
+
+ // Scaling functions parameters
+ aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14
+ for (int i = 0; i < pars->num_y_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+ }
+
+ if (!cm->seq_params->monochrome) {
+ aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+ } else {
+ assert(!pars->chroma_scaling_from_luma);
+ }
+
+ if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((cm->seq_params->subsampling_x == 1) &&
+ (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
+ assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);  // chroma points are not coded in these cases
+ } else {
+ aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+ }
+
+ aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+ }
+ }
+
+ aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value
+
+ // AR coefficients
+ // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);  // number of luma AR coefficients for this lag
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;  // chroma gets one extra coefficient when luma points exist
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);  // coefficients are stored with a +128 bias
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+ aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 6 + value
+
+ aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+ if (pars->num_cb_points) {
+ aom_wb_write_literal(wb, pars->cb_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_offset, 9);
+ }
+
+ if (pars->num_cr_points) {
+ aom_wb_write_literal(wb, pars->cr_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_offset, 9);
+ }
+
+ aom_wb_write_bit(wb, pars->overlap_flag);
+
+ aom_wb_write_bit(wb, pars->clip_to_restricted_range);
+}
+
+ // Writes the superblock size as a single bit: 1 for BLOCK_128X128, 0 for
+ // BLOCK_64X64. The asserts check that mib_size and mib_size_log2 agree with
+ // sb_size and that no other superblock size is in use.
+ static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params,
+                                      struct aom_write_bit_buffer *wb) {
+   assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
+   assert(seq_params->mib_size == 1 << seq_params->mib_size_log2);
+   assert(seq_params->sb_size == BLOCK_128X128 ||
+          seq_params->sb_size == BLOCK_64X64);
+
+   const int uses_128x128 = (seq_params->sb_size == BLOCK_128X128);
+   aom_wb_write_bit(wb, uses_128x128);
+ }
+ /* Writes the frame-size limits and the tool-enable flags of the sequence
+  * header to wb.
+  *
+  * In order: the bit counts used for frame width/height (minus 1, 4 bits
+  * each), the maximum frame width/height (minus 1), the frame-id fields, the
+  * superblock size, the intra tool flags, the inter tool flags, order-hint
+  * settings, screen-content and integer-mv controls, and finally the
+  * superres, CDEF and loop-restoration enable bits. The frame-id fields and
+  * the inter-tool section are skipped when reduced_still_picture_hdr is set.
+  */
+ static AOM_INLINE void write_sequence_header(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
+ seq_params->num_bits_width);
+ aom_wb_write_literal(wb, seq_params->max_frame_height - 1,
+ seq_params->num_bits_height);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+ aom_wb_write_literal(
+ wb,
+ seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+ 3);
+ }
+ }
+
+ write_sb_size(seq_params, wb);
+
+ aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+ aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+ aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+ aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+ aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint);
+
+ if (seq_params->order_hint_info.enable_order_hint) {
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp);
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs);
+ }
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, 1);  // the value 2 is signalled by a single 1 bit
+ } else {
+ aom_wb_write_bit(wb, 0);  // otherwise a 0 bit, then the explicit value
+ aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+ }
+ if (seq_params->force_screen_content_tools > 0) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, 1);  // same scheme as force_screen_content_tools
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_integer_mv);
+ }
+ } else {
+ assert(seq_params->force_integer_mv == 2);  // not coded in this case; expected to be 2
+ }
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3);
+ }
+
+ aom_wb_write_bit(wb, seq_params->enable_superres);
+ aom_wb_write_bit(wb, seq_params->enable_cdef);
+ aom_wb_write_bit(wb, seq_params->enable_restoration);
+}
+ /* Writes one global motion model to wb, coded relative to ref_params.
+  *
+  * params     : model to write; its wmtype must not be TRANSLATION.
+  * ref_params : model supplying the reference value for each coefficient.
+  * wb         : destination bit buffer.
+  * allow_hp   : only affects the bit count and precision of the translation
+  *              terms of a TRANSLATION-type model.
+  *
+  * The model type is coded first with up to three bits. The matrix entries
+  * required by the type follow, each reduced in precision by a right shift
+  * and written with aom_wb_write_signed_primitive_refsubexpfin().
+  */
+ static AOM_INLINE void write_global_motion_params(
+ const WarpedMotionParams *params, const WarpedMotionParams *ref_params,
+ struct aom_write_bit_buffer *wb, int allow_hp) {
+ const TransformationType type = params->wmtype;
+
+ // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION
+ // type models. Check here that we don't accidentally pick one somehow.
+ // See comments in gm_get_motion_vector() for details on the bug we're
+ // working around here
+ assert(type != TRANSLATION);
+
+ aom_wb_write_bit(wb, type != IDENTITY);
+ if (type != IDENTITY) {
+ aom_wb_write_bit(wb, type == ROTZOOM);
+ if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);  // third bit separates TRANSLATION from the remaining types
+ }
+
+ if (type >= ROTZOOM) {  // wmmat[2] is coded minus (1 << GM_ALPHA_PREC_BITS), wmmat[3] as is
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ }
+
+ if (type >= AFFINE) {  // wmmat[4] as is, wmmat[5] coded minus (1 << GM_ALPHA_PREC_BITS)
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+
+ if (type >= TRANSLATION) {  // translation terms wmmat[0] and wmmat[1]
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ }
+}
+
+ // Writes the global motion parameters of every reference frame from
+ // LAST_FRAME to ALTREF_FRAME. Each model is coded relative to the model
+ // stored for the same reference in cm->prev_frame, or relative to
+ // default_warp_params when there is no previous frame.
+ static AOM_INLINE void write_global_motion(AV1_COMP *cpi,
+                                            struct aom_write_bit_buffer *wb) {
+   AV1_COMMON *const cm = &cpi->common;
+   for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+     const WarpedMotionParams *ref_params = &default_warp_params;
+     if (cm->prev_frame) ref_params = &cm->prev_frame->global_motion[frame];
+
+     write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
+                                cm->features.allow_high_precision_mv);
+     // TODO(sarahparker, debargha): The disabled optimization below does not
+     // work currently and causes mismatches when resize is on. Fix it before
+     // turning it back on.
+     /*
+     YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame);
+     if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+         cpi->source->y_crop_height == ref_buf->y_crop_height) {
+       write_global_motion_params(&cm->global_motion[frame],
+                                  &cm->prev_frame->global_motion[frame], wb,
+                                  cm->features.allow_high_precision_mv);
+     } else {
+       assert(cm->global_motion[frame].wmtype == IDENTITY &&
+              "Invalid warp type for frames of different resolutions");
+     }
+     */
+     /*
+     printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+            cm->current_frame.frame_number, cm->show_frame, frame,
+            cm->global_motion[frame].wmmat[0],
+            cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+            cm->global_motion[frame].wmmat[3]);
+     */
+   }
+ }
+
+ // Decides whether frame_refs_short_signaling may be used for the current
+ // frame. Returns 1 when it may, 0 otherwise.
+ //
+ // With `enable_ref_short_signaling` set (rtc case when res < 360p and
+ // speed >= 9) only the relation between the current frame and the GOLDEN
+ // reference is examined. Otherwise the reference assignment that the decoder
+ // would derive is computed and compared against the encoder's own choice.
+ static int check_frame_refs_short_signaling(AV1_COMMON *const cm,
+                                             bool enable_ref_short_signaling) {
+   if (enable_ref_short_signaling) {
+     const int gld_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+     const int group_size =
+         1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+     const int cur_group = cm->current_frame.display_order_hint / group_size;
+     const int gld_group =
+         cm->ref_frame_map[gld_idx]->display_order_hint / group_size;
+     const int dist =
+         cm->current_frame.order_hint - cm->ref_frame_map[gld_idx]->order_hint;
+
+     // Allowed only when the current frame and GOLDEN share an order_hint
+     // group and GOLDEN is at most 64 frames behind the current frame.
+     return cur_group == gld_group && dist >= 0 && dist <= 64;
+   }
+
+   // Count how many different buffers the references point to.
+   const RefCntBuffer *distinct_bufs[INTER_REFS_PER_FRAME] = { NULL };
+   int num_distinct = 0;
+   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+     if (buf == NULL) continue;
+
+     int i = 0;
+     while (i < num_distinct && distinct_bufs[i] != buf) ++i;
+     if (i == num_distinct) distinct_bufs[num_distinct++] = buf;
+   }
+
+   // Fewer distinct buffers than references means that at least two
+   // references share a buffer; short signaling is not used in that case.
+   if (num_distinct < INTER_REFS_PER_FRAME) return 0;
+
+   // Derive the reference mapping according to the
+   // frame_refs_short_signaling policy, seeded with LAST and GOLDEN.
+   int remapped_ref_idx_decoder[REF_FRAMES];
+   const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+   const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+   av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx);
+
+   // Short signaling is usable only if every reference resolves to the same
+   // buffer under both the encoder's choice and the derived mapping.
+   int frame_refs_short_signaling = 1;
+   for (int ref_idx = 0;
+        ref_idx < INTER_REFS_PER_FRAME && frame_refs_short_signaling;
+        ++ref_idx) {
+     const int dec_map_idx = remapped_ref_idx_decoder[ref_idx];
+     RefCntBuffer *const dec_buf =
+         (dec_map_idx != INVALID_IDX) ? cm->ref_frame_map[dec_map_idx] : NULL;
+     if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != dec_buf) {
+       frame_refs_short_signaling = 0;
+     }
+   }
+
+ #if 0   // For debug
+   printf("\nFrame=%d: \n", cm->current_frame.frame_number);
+   printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling);
+   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+     printf("enc_ref(map_idx=%d)=%d, vs. "
+         "dec_ref(map_idx=%d)=%d\n",
+         get_ref_frame_map_idx(cm, ref_frame), ref_frame,
+         cm->remapped_ref_idx[ref_frame - LAST_FRAME],
+         ref_frame);
+   }
+ #endif  // 0
+
+   return frame_refs_short_signaling;
+ }
+
+ // New function based on HLS R18
+ //
+ // Writes the uncompressed frame header of the current frame into `wb`.
+ // `saved_wb` is forwarded to write_tile_info() and write_ext_tile_info().
+ // `xd` gets current_base_qindex set when delta_q is signalled and is passed
+ // to av1_reset_loop_filter_delta() when delta_lf is signalled. When the
+ // frame is coded as show_existing_frame, only that short header is written
+ // and the function returns early. The order of the writes below defines the
+ // bitstream layout and must not be changed.
+ static AOM_INLINE void write_uncompressed_header_obu(
+ AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+
+ // Provisional decision on frame_refs_short_signaling. When it is set here
+ // and the frame is an inter frame or S-frame, it is re-evaluated further
+ // down by check_frame_refs_short_signaling().
+ if (!cpi->sf.rt_sf.enable_ref_short_signaling ||
+ !seq_params->order_hint_info.enable_order_hint ||
+ seq_params->order_hint_info.enable_ref_frame_mvs) {
+ current_frame->frame_refs_short_signaling = 0;
+ } else {
+ current_frame->frame_refs_short_signaling = 1;
+ }
+
+ if (seq_params->still_picture) {
+ assert(cm->show_existing_frame == 0);
+ assert(cm->show_frame == 1);
+ assert(current_frame->frame_type == KEY_FRAME);
+ }
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (encode_show_existing_frame(cm)) {
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0) {
+ write_tu_pts_info(cm, wb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ }
+ // Nothing else is written for a show_existing_frame header.
+ return;
+ } else {
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+ }
+
+ aom_wb_write_literal(wb, current_frame->frame_type, 2);
+
+ aom_wb_write_bit(wb, cm->show_frame);
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0)
+ write_tu_pts_info(cm, wb);
+ } else {
+ aom_wb_write_bit(wb, cm->showable_frame);
+ }
+ // error_resilient_mode is not written for S-frames (it must be set) nor
+ // for shown key frames.
+ if (frame_is_sframe(cm)) {
+ assert(features->error_resilient_mode);
+ } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) {
+ aom_wb_write_bit(wb, features->error_resilient_mode);
+ }
+ }
+ aom_wb_write_bit(wb, features->disable_cdf_update);
+
+ // A sequence-level value of 2 means the flag is signalled per frame;
+ // otherwise the frame-level value must equal the sequence-level one.
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, features->allow_screen_content_tools);
+ } else {
+ assert(features->allow_screen_content_tools ==
+ seq_params->force_screen_content_tools);
+ }
+
+ if (features->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, features->cur_frame_force_integer_mv);
+ } else {
+ assert(features->cur_frame_force_integer_mv ==
+ seq_params->force_integer_mv);
+ }
+ } else {
+ assert(features->cur_frame_force_integer_mv == 0);
+ }
+
+ // Stays 0 with the reduced still picture header. Otherwise it is 1 for
+ // S-frames (and then not written) or when the upscaled frame size differs
+ // from the sequence maximum.
+ int frame_size_override_flag = 0;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->superres_upscaled_width == seq_params->max_frame_width &&
+ cm->superres_upscaled_height == seq_params->max_frame_height);
+ } else {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+
+ if (cm->superres_upscaled_width > seq_params->max_frame_width ||
+ cm->superres_upscaled_height > seq_params->max_frame_height) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Frame dimensions are larger than the maximum values");
+ }
+
+ frame_size_override_flag =
+ frame_is_sframe(cm)
+ ? 1
+ : (cm->superres_upscaled_width != seq_params->max_frame_width ||
+ cm->superres_upscaled_height != seq_params->max_frame_height);
+ if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, current_frame->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+ aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS);
+ }
+ }
+
+ // Buffer removal times: one value is written for each operating point that
+ // has decoder model parameters and whose idc is either 0 or selects both
+ // the current temporal layer and the current spatial layer. The stored
+ // value is incremented after it has been written.
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present);
+ if (cpi->ppi->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
+ if (seq_params->operating_point_idc[op_num] == 0 ||
+ ((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1 &&
+ (seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1)) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->buffer_removal_times[op_num],
+ seq_params->decoder_model_info.buffer_removal_time_length);
+ cm->buffer_removal_times[op_num]++;
+ if (cm->buffer_removal_times[op_num] == 0) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Shown key frames and switch frames automatically refresh all reference
+ // frames. For all other frame types, we need to write refresh_frame_flags.
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
+ current_frame->frame_type == INTER_FRAME ||
+ current_frame->frame_type == INTRA_ONLY_FRAME)
+ aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
+
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
+ // Write all ref frame order hints if error_resilient_mode == 1
+ if (features->error_resilient_mode &&
+ seq_params->order_hint_info.enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ aom_wb_write_literal(
+ wb, cm->ref_frame_map[ref_idx]->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+ }
+ }
+ }
+
+ // Frame size, and then either allow_intrabc (key and intra-only frames) or
+ // the reference frame selection and motion vector settings (inter frames
+ // and S-frames).
+ if (current_frame->frame_type == KEY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else if (current_frame->frame_type == INTER_FRAME ||
+ frame_is_sframe(cm)) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // NOTE: Error resilient mode turns off frame_refs_short_signaling
+ // automatically.
+ #define FRAME_REFS_SHORT_SIGNALING 0
+ #if FRAME_REFS_SHORT_SIGNALING
+ current_frame->frame_refs_short_signaling =
+ seq_params->order_hint_info.enable_order_hint;
+ #endif // FRAME_REFS_SHORT_SIGNALING
+
+ if (current_frame->frame_refs_short_signaling) {
+ // In rtc case when cpi->sf.rt_sf.enable_ref_short_signaling is true,
+ // we turn on frame_refs_short_signaling when the current frame and
+ // golden frame are in the same order_hint group, and their relative
+ // distance is <= 64 (in order to be decodable).
+
+ // For other cases, an example solution for encoder-side
+ // implementation on frame_refs_short_signaling is also provided in
+ // this function, where frame_refs_short_signaling is only turned on
+ // when the encoder side decision on ref frames is identical to that
+ // at the decoder side.
+
+ current_frame->frame_refs_short_signaling =
+ check_frame_refs_short_signaling(
+ cm, cpi->sf.rt_sf.enable_ref_short_signaling);
+ }
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling);
+
+ // With short signaling only the LAST and GOLDEN map indices are written;
+ // otherwise the loop below writes one map index per reference.
+ if (current_frame->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME);
+ aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
+
+ const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX);
+ if (!current_frame->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame),
+ REF_FRAMES_LOG2);
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cm, ref_frame);
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ // Distance from the reference's frame id to the current frame id,
+ // modulo 2^frame_id_len, minus one.
+ int delta_frame_id_minus_1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ if (delta_frame_id_minus_1 < 0 ||
+ delta_frame_id_minus_1 >= (1 << diff_len)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
+ aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+ }
+ }
+
+ if (!features->error_resilient_mode && frame_size_override_flag) {
+ write_frame_size_with_refs(cm, wb);
+ } else {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ }
+
+ if (!features->cur_frame_force_integer_mv)
+ aom_wb_write_bit(wb, features->allow_high_precision_mv);
+ write_frame_interp_filter(features->interp_filter, wb);
+ aom_wb_write_bit(wb, features->switchable_motion_mode);
+ if (frame_might_allow_ref_frame_mvs(cm)) {
+ aom_wb_write_bit(wb, features->allow_ref_frame_mvs);
+ } else {
+ assert(features->allow_ref_frame_mvs == 0);
+ }
+ }
+ }
+
+ // The refresh-frame-context bit is written only when neither
+ // reduced_still_picture_hdr nor disable_cdf_update is set.
+ const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+ !(features->disable_cdf_update);
+ if (cm->tiles.large_scale)
+ assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+
+ if (might_bwd_adapt) {
+ aom_wb_write_bit(
+ wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+ }
+
+ write_tile_info(cm, saved_wb, wb);
+ encode_quantization(quant_params, av1_num_planes(cm),
+ cm->seq_params->separate_uv_delta_q, wb);
+ encode_segmentation(cm, wb);
+
+ // delta_q / delta_lf signalling is only present when base_qindex > 0.
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
+ if (quant_params->base_qindex > 0) {
+ aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
+ if (delta_q_info->delta_q_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
+ xd->current_base_qindex = quant_params->base_qindex;
+ if (features->allow_intrabc)
+ assert(delta_q_info->delta_lf_present_flag == 0);
+ else
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag);
+ if (delta_q_info->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2);
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_multi);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+ }
+
+ // Loop filter and CDEF parameters are skipped when coded_lossless is set;
+ // all three in-loop filter sections are skipped when all_lossless is set.
+ if (features->all_lossless) {
+ assert(!av1_superres_scaled(cm));
+ } else {
+ if (!features->coded_lossless) {
+ encode_loopfilter(cm, wb);
+ encode_cdef(cm, wb);
+ }
+ encode_restoration_mode(cm, wb);
+ }
+
+ // Write TX mode
+ if (features->coded_lossless)
+ assert(features->tx_mode == ONLY_4X4);
+ else
+ aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT);
+
+ if (!frame_is_intra_only(cm)) {
+ const int use_hybrid_pred =
+ current_frame->reference_mode == REFERENCE_MODE_SELECT;
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+ }
+
+ if (current_frame->skip_mode_info.skip_mode_allowed)
+ aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag);
+
+ if (frame_might_allow_warped_motion(cm))
+ aom_wb_write_bit(wb, features->allow_warped_motion);
+ else
+ assert(!features->allow_warped_motion);
+
+ aom_wb_write_bit(wb, features->reduced_tx_set_used);
+
+ if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame))
+ write_film_grain_params(cpi, wb);
+
+ if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb);
+ }
+
+ // Returns how many bytes (1 to 4) are needed to store `size` in a field
+ // whose `spare_msbs` most significant bits are reserved for other use, or
+ // -1 when `size` does not fit into the remaining bits of a 4-byte field.
+ static int choose_size_bytes(uint32_t size, int spare_msbs) {
+   // Reject sizes that would overlap the reserved top bits of 32 bits.
+   if (spare_msbs > 0 && (size >> (32 - spare_msbs)) != 0) return -1;
+
+   // Shift the value up so that the reserved bits are accounted for.
+   const uint32_t widened = size << spare_msbs;
+
+   // Drop leading all-zero bytes, always keeping at least one byte.
+   int num_bytes = 4;
+   while (num_bytes > 1 && (widened >> (8 * (num_bytes - 1))) == 0) {
+     --num_bytes;
+   }
+   return num_bytes;
+ }
+
+ // Stores `val` at `dst` using `sz` bytes (1 to 4). Sizes 2 to 4 go through
+ // the mem_put_le16/24/32 helpers; any other size trips an assertion.
+ static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz,
+                                        const int val) {
+   if (sz == 1) {
+     dst[0] = (uint8_t)(val & 0xff);
+   } else if (sz == 2) {
+     mem_put_le16(dst, val);
+   } else if (sz == 3) {
+     mem_put_le24(dst, val);
+   } else if (sz == 4) {
+     mem_put_le32(dst, val);
+   } else {
+     assert(0 && "Invalid size");
+   }
+ }
+
+ // Compacts the tile data stored at `dst` (`data_size` bytes) in place.
+ // On input every tile size field, and for large scale tiles every tile
+ // column size field, occupies 4 bytes. The fields are rewritten with the
+ // smallest widths that can hold `max_tile_size` / `max_tile_col_size`, and
+ // the tile payloads are moved down to close the gaps.
+ // The chosen widths are returned through `tile_size_bytes` and
+ // `tile_col_size_bytes`. The return value is the size of the compacted
+ // data, which equals `data_size` when both widths remain 4.
+ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+ // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+ int tsb;
+ int tcsb;
+
+ if (tiles->large_scale) {
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ tsb = choose_size_bytes(max_tile_size, 1);
+ tcsb = choose_size_bytes(max_tile_col_size, 0);
+ } else {
+ tsb = choose_size_bytes(max_tile_size, 0);
+ tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+ }
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+ // Nothing to shrink: the data already uses 4-byte fields everywhere.
+ if (tsb == 4 && tcsb == 4) return data_size;
+
+ // Write and read offsets into dst. Fields only ever shrink, so the write
+ // offset never gets ahead of the read offset.
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+ if (tiles->large_scale) {
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ // All but the last column has a column header
+ if (tile_col < tiles->cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * tiles->rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ // All, including the last row has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ // The header stores the tile size minus AV1_MIN_TILE_SIZE_BYTES;
+ // add it back to get the number of payload bytes to move.
+ tile_header += AV1_MIN_TILE_SIZE_BYTES;
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+ // Regular (non large scale) tiles: every tile except the last one is
+ // preceded by a size field.
+ const int n_tiles = tiles->cols * tiles->rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ // The last tile has no size field; it takes up the rest of the data.
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ tile_size += AV1_MIN_TILE_SIZE_BYTES;
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+
+ // Writes an OBU header for `obu_type` at `dst` and returns the number of
+ // bytes written. A non-zero `obu_extension` sets the extension flag and its
+ // low 8 bits are appended as the extension byte. The has-size-field bit is
+ // always set. `*frame_header_count` is bumped for OBU_FRAME and
+ // OBU_FRAME_HEADER when level statistics are being kept.
+ uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+                               int *frame_header_count, OBU_TYPE obu_type,
+                               int obu_extension, uint8_t *const dst) {
+   const int carries_frame_header =
+       (obu_type == OBU_FRAME) || (obu_type == OBU_FRAME_HEADER);
+   if (level_params->keep_level_stats && carries_frame_header) {
+     *frame_header_count += 1;
+   }
+
+   struct aom_write_bit_buffer wb = { dst, 0 };
+   const int has_extension = (obu_extension != 0);
+
+   aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
+   aom_wb_write_literal(&wb, (int)obu_type, 4);
+   aom_wb_write_literal(&wb, has_extension, 1);
+   aom_wb_write_literal(&wb, 1, 1);  // obu_has_size_field
+   aom_wb_write_literal(&wb, 0, 1);  // reserved
+
+   if (has_extension) aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+
+   return aom_wb_bytes_written(&wb);
+ }
+
+ // Encodes `obu_payload_size` with aom_uleb_encode() into `dest`, starting
+ // `obu_header_size` bytes in. Returns AOM_CODEC_OK on success and
+ // AOM_CODEC_ERROR when the encoding fails. The payload size is expected to
+ // fit in 32 bits.
+ int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+                             uint8_t *dest) {
+   const uint32_t obu_size = (uint32_t)obu_payload_size;
+   assert(obu_size == obu_payload_size);
+
+   uint8_t *const size_field = dest + obu_header_size;
+   size_t coded_obu_size = 0;
+   const int encode_failed =
+       aom_uleb_encode(obu_size, sizeof(obu_size), size_field,
+                       &coded_obu_size) != 0;
+
+   return encode_failed ? AOM_CODEC_ERROR : AOM_CODEC_OK;
+ }
+
+ // Makes room for the OBU size field: the payload that currently follows the
+ // header directly is moved up by the number of bytes that
+ // aom_uleb_size_in_bytes() reports for `obu_payload_size`. Returns that
+ // number of bytes.
+ size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size,
+                        uint8_t *data) {
+   const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+   uint8_t *const payload = data + obu_header_size;
+
+   memmove(payload + length_field_size, payload, obu_payload_size);
+   return length_field_size;
+ }
+
+ // Appends the trailing bits to `wb`: a single 1 bit when the buffer is in
+ // the middle of a byte, or a whole 0x80 byte when it is byte aligned.
+ static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+   if (!aom_wb_is_byte_aligned(wb)) {
+     // The remaining bits of the current byte are assumed to be 0 already.
+     aom_wb_write_bit(wb, 1);
+     return;
+   }
+   aom_wb_write_literal(wb, 0x80, 8);
+ }
+
+ // Writes `seq_level_idx` to `wb` as a LEVEL_BITS wide literal. The index
+ // must satisfy is_valid_seq_level_idx().
+ static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx,
+                                              struct aom_write_bit_buffer *wb) {
+   assert(is_valid_seq_level_idx(seq_level_idx));
+   const int level_literal = seq_level_idx;
+   aom_wb_write_literal(wb, level_literal, LEVEL_BITS);
+ }
+
+ // Writes the payload of a sequence header OBU for `seq_params` at `dst`,
+ // including the trailing bits, and returns the number of bytes written.
+ // The order of the writes below defines the bitstream layout.
+ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ write_profile(seq_params->profile, &wb);
+
+ // Still picture or not
+ aom_wb_write_bit(&wb, seq_params->still_picture);
+ assert(IMPLIES(!seq_params->still_picture,
+ !seq_params->reduced_still_picture_hdr));
+ // whether to use reduced still picture header
+ aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr);
+
+ if (seq_params->reduced_still_picture_hdr) {
+ // Reduced header: only the level of the first operating point is written.
+ assert(seq_params->timing_info_present == 0);
+ assert(seq_params->decoder_model_info_present_flag == 0);
+ assert(seq_params->display_model_info_present_flag == 0);
+ write_bitstream_level(seq_params->seq_level_idx[0], &wb);
+ } else {
+ aom_wb_write_bit(
+ &wb, seq_params->timing_info_present); // timing info present flag
+
+ if (seq_params->timing_info_present) {
+ // timing_info
+ write_timing_info_header(&seq_params->timing_info, &wb);
+ aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag);
+ if (seq_params->decoder_model_info_present_flag) {
+ write_decoder_model_info(&seq_params->decoder_model_info, &wb);
+ }
+ }
+ aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag);
+ aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1,
+ OP_POINTS_CNT_MINUS_1_BITS);
+ int i;
+ // Per operating point: idc, level, tier (only for levels at or above
+ // SEQ_LEVEL_4_0), then the optional decoder model and display model
+ // parameters.
+ for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ aom_wb_write_literal(&wb, seq_params->operating_point_idc[i],
+ OP_POINTS_IDC_BITS);
+ write_bitstream_level(seq_params->seq_level_idx[i], &wb);
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
+ aom_wb_write_bit(&wb, seq_params->tier[i]);
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].decoder_model_param_present_flag);
+ if (seq_params->op_params[i].decoder_model_param_present_flag) {
+ write_dec_model_op_parameters(
+ &seq_params->op_params[i],
+ seq_params->decoder_model_info
+ .encoder_decoder_buffer_delay_length,
+ &wb);
+ }
+ }
+ if (seq_params->display_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].display_model_param_present_flag);
+ if (seq_params->op_params[i].display_model_param_present_flag) {
+ // initial_display_delay is coded minus one in 4 bits.
+ assert(seq_params->op_params[i].initial_display_delay >= 1);
+ assert(seq_params->op_params[i].initial_display_delay <= 10);
+ aom_wb_write_literal(
+ &wb, seq_params->op_params[i].initial_display_delay - 1, 4);
+ }
+ }
+ }
+ }
+ write_sequence_header(seq_params, &wb);
+
+ write_color_config(seq_params, &wb);
+
+ aom_wb_write_bit(&wb, seq_params->film_grain_params_present);
+
+ add_trailing_bits(&wb);
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+ }
+
+ // Writes the uncompressed frame header at `dst`, optionally followed by the
+ // trailing bits when `append_trailing_bits` is non-zero. Returns the number
+ // of bytes written.
+ static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd,
+                                        struct aom_write_bit_buffer *saved_wb,
+                                        uint8_t *const dst,
+                                        int append_trailing_bits) {
+   struct aom_write_bit_buffer wb = { dst, 0 };
+
+   write_uncompressed_header_obu(cpi, xd, saved_wb, &wb);
+   if (append_trailing_bits != 0) {
+     add_trailing_bits(&wb);
+   }
+
+   const uint32_t bytes_written = aom_wb_bytes_written(&wb);
+   return bytes_written;
+ }
+
+ // Writes a tile group header at `dst` and returns its size in bytes.
+ // Nothing is written (and 0 is returned) when `tiles_log2` is 0. Otherwise
+ // the start/end present flag is written, followed by `start_tile` and
+ // `end_tile` as `tiles_log2` wide literals when the flag is set.
+ static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile,
+                                         int end_tile, int tiles_log2,
+                                         int tile_start_and_end_present_flag) {
+   if (tiles_log2 == 0) return 0;
+
+   struct aom_write_bit_buffer wb = { dst, 0 };
+   aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+   if (tile_start_and_end_present_flag) {
+     aom_wb_write_literal(&wb, start_tile, tiles_log2);
+     aom_wb_write_literal(&wb, end_tile, tiles_log2);
+   }
+
+   return aom_wb_bytes_written(&wb);
+ }
+
+ // Declared here, defined outside this part of the file. It is called by
+ // init_large_scale_tile_obu_header() with the just written frame header
+ // bytes and a file name when tile_cfg.enable_ext_tile_debug is set.
+ extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+ const char *filename);
+
+ // Sizes of the headers that precede the tile data in the large scale tile
+ // path; filled in by init_large_scale_tile_obu_header().
+ typedef struct {
+ uint32_t tg_hdr_size; // Value returned by av1_write_obu_header().
+ uint32_t frame_header_size; // Value returned by write_frame_header_obu().
+ } LargeTileFrameOBU;
+
+ // Starts the OBU for the large scale tile case: writes the OBU header and
+ // the frame header (without trailing bits) at `*data`, advances `*data` past
+ // both, and records the two sizes in `lst_obu`. Returns the frame header
+ // size.
+ static uint32_t init_large_scale_tile_obu_header(
+     AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb,
+     LargeTileFrameOBU *lst_obu) {
+   // There is always exactly one tile group in the large_scale_tile case, so
+   // everything goes into a single OBU_FRAME.
+   lst_obu->tg_hdr_size =
+       av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+                            OBU_FRAME, 0, *data);
+   *data += lst_obu->tg_hdr_size;
+
+   uint8_t *const frame_header_start = *data;
+   const uint32_t frame_header_size = write_frame_header_obu(
+       cpi, &cpi->td.mb.e_mbd, saved_wb, frame_header_start, 0);
+   lst_obu->frame_header_size = frame_header_size;
+   *data += frame_header_size;
+
+   // (yunqing) This test ensures the correctness of large scale tile coding.
+   if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) {
+     const CurrentFrame *const current_frame = &cpi->common.current_frame;
+     // File name is "./fh" followed by three characters derived from the
+     // hundreds, tens and units of the frame number.
+     char fn[20] = "./fh";
+     fn[4] = current_frame->frame_number / 100 + '0';
+     fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+     fn[6] = (current_frame->frame_number % 10) + '0';
+     fn[7] = '\0';
+     av1_print_uncompressed_frame_header(frame_header_start, frame_header_size,
+                                         fn);
+   }
+   return frame_header_size;
+ }
+
+ // Finishes the OBU for the large scale tile case. When tiles are in use the
+ // tile data at `data` is compacted with remux_tiles(). The OBU size field is
+ // then inserted after the OBU header at `dst`, `*total_size` is updated to
+ // the final OBU size, and the tile size byte counts are patched into the
+ // frame header through `saved_wb`.
+ static void write_large_scale_tile_obu_size(
+     const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data,
+     struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu,
+     int have_tiles, uint32_t *total_size, int max_tile_size,
+     int max_tile_col_size) {
+   const uint32_t obu_hdr_size = lst_obu->tg_hdr_size;
+   const uint32_t frame_hdr_size = lst_obu->frame_header_size;
+   int tile_size_bytes = 0;
+   int tile_col_size_bytes = 0;
+
+   if (have_tiles) {
+     // Only the tile data (everything after the frame header) is compacted.
+     const uint32_t tile_data_size = *total_size - frame_hdr_size;
+     *total_size =
+         remux_tiles(tiles, data, tile_data_size, max_tile_size,
+                     max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes);
+     *total_size += frame_hdr_size;
+   }
+
+   // The EXT_TILE case uses a single tile group. Per the OBU syntax its size
+   // is written ahead of the tile data (tile column headers included) and
+   // does not count the bytes that store the size itself.
+   const uint32_t obu_payload_size = *total_size;
+   *total_size += obu_hdr_size;
+
+   const size_t length_field_size =
+       av1_obu_memmove(obu_hdr_size, obu_payload_size, dst);
+   const int status =
+       av1_write_uleb_obu_size(obu_hdr_size, obu_payload_size, dst);
+   assert(status == AOM_CODEC_OK);
+   (void)status;
+
+   *total_size += (uint32_t)length_field_size;
+   // The frame header moved together with the payload; follow it.
+   saved_wb->bit_buffer += length_field_size;
+
+   // Fill in the fields of the uncompressed header that were left open.
+   if (have_tiles) {
+     assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+     aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+     assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+     aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+   }
+ }
+
+ // Encodes every tile of a large scale tile frame, column by column, into
+ // `dst` after the headers already accounted for in `*total_size` and
+ // `lst_obu->tg_hdr_size`. `*total_size` grows by everything written here.
+ // When `have_tiles` is set each tile is preceded by a 4-byte header, and
+ // every column except the last is preceded by a 4-byte column size. The
+ // largest tile size and tile column size seen are reported through
+ // `max_tile_size` / `max_tile_col_size`, and the index of the largest tile
+ // through `largest_tile_id`.
+ static void write_large_scale_tile_obu(
+ AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu,
+ int *const largest_tile_id, uint32_t *total_size, const int have_tiles,
+ unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ unsigned int tile_size = 0;
+
+ av1_reset_pack_bs_thread_data(&cpi->td);
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ // Offset at which this column starts; its size field is filled in below.
+ const uint32_t col_offset = *total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) *total_size += 4;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ aom_writer mode_bc;
+
+ buf->data = dst + *total_size + lst_obu->tg_hdr_size;
+
+ // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even for the last one, unless no tiling is used at all.
+ *total_size += data_offset;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ // CDF updates are allowed only when large scale tiles are off and the
+ // frame does not disable the CDF update.
+ mode_bc.allow_update_cdf = !tiles->large_scale;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
+ if (aom_stop_encode(&mode_bc) < 0) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes");
+ }
+ tile_size = mode_bc.pos;
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ if (tile_size > *max_tile_size) {
+ *max_tile_size = tile_size;
+ *largest_tile_id = tile_cols * tile_row + tile_col;
+ }
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+ const int tile_copy_mode =
+ ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1
+ : 0;
+
+ // If tile_copy_mode = 1, check if this tile is a copy tile.
+ // Very low chances to have copy tiles on the key frames, so don't
+ // search on key frames to reduce unnecessary search.
+ if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
+ const int identical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ // Indicate a copy-tile by setting the most significant bit.
+ // The row-offset to copy from is stored in the highest byte.
+ // remux_tiles will move these around later
+ if (identical_tile_offset > 0) {
+ // A copy tile contributes no payload bytes to the total size.
+ tile_size = 0;
+ tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
+ }
+ }
+
+ mem_put_le32(buf->data, (MEM_VALUE_T)tile_header);
+ }
+
+ *total_size += tile_size;
+ }
+ if (!is_last_col) {
+ // Column size excludes the 4-byte column size field itself.
+ uint32_t col_size = *total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size);
+
+ // Record the maximum tile column size we see.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+ av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+ }
+
+// Packs a frame that uses large-scale tiles. Three stages run in order:
+// init_large_scale_tile_obu_header(), write_large_scale_tile_obu() and
+// write_large_scale_tile_obu_size(). Returns the byte count accumulated by
+// those stages. '*largest_tile_id' is filled in by the tile writing stage.
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus(
+    AV1_COMP *const cpi, uint8_t *const dst,
+    struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+  // Per-tile headers are only written when the frame has more than one tile.
+  const int have_tiles = (tiles->cols * tiles->rows) > 1;
+  LargeTileFrameOBU lst_obu;
+  // Cursor passed by address to the header stage, which may advance it.
+  uint8_t *payload = dst;
+  unsigned int biggest_tile = 0;
+  unsigned int biggest_tile_col = 0;
+  uint32_t bytes_written = 0;
+
+  bytes_written +=
+      init_large_scale_tile_obu_header(cpi, &payload, saved_wb, &lst_obu);
+
+  write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id,
+                             &bytes_written, have_tiles, &biggest_tile,
+                             &biggest_tile_col);
+
+  write_large_scale_tile_obu_size(tiles, dst, payload, saved_wb, &lst_obu,
+                                  have_tiles, &bytes_written, biggest_tile,
+                                  biggest_tile_col);
+
+  return bytes_written;
+}
+
+// Writes the headers that open the tile group starting at tile 'tile_idx':
+// the OBU header, the frame header (only when the frame has a single tile
+// group, in which case an OBU_FRAME is emitted) and the tile group header.
+// The OBU header size is stored in 'pack_bs_params->obu_header_size', the
+// combined size of all headers in 'pack_bs_params->curr_tg_hdr_size', and the
+// latter is also added to '*pack_bs_params->total_size'.
+void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd,
+                                   PackBSParams *const pack_bs_params,
+                                   const int tile_idx) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+  const int single_tg = (cpi->num_tg == 1);
+  const int num_tiles = tiles->rows * tiles->cols;
+  // Tiles per tile group, rounded up.
+  const int tg_size = (num_tiles + cpi->num_tg - 1) / cpi->num_tg;
+  // Last tile of this group, clamped to the last tile of the frame.
+  const int tg_last_tile = AOMMIN(tile_idx + tg_size - 1, num_tiles - 1);
+  uint8_t *const tg_start = pack_bs_params->tile_data_curr;
+  const OBU_TYPE tg_obu_type = single_tg ? OBU_FRAME : OBU_TILE_GROUP;
+
+  // OBU header.
+  int hdr_bytes = av1_write_obu_header(
+      &cpi->ppi->level_params, &cpi->frame_header_count, tg_obu_type,
+      pack_bs_params->obu_extn_header, tg_start);
+  pack_bs_params->obu_header_size = hdr_bytes;
+
+  // Frame header, only when it shares the OBU with the tile data.
+  if (single_tg) {
+    hdr_bytes += write_frame_header_obu(cpi, xd, pack_bs_params->saved_wb,
+                                        tg_start + hdr_bytes, 0);
+  }
+
+  // Tile group header.
+  hdr_bytes += write_tile_group_header(
+      tg_start + hdr_bytes, tile_idx, tg_last_tile,
+      (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
+
+  pack_bs_params->curr_tg_hdr_size = hdr_bytes;
+  *pack_bs_params->total_size += hdr_bytes;
+}
+
+// Packs the data of one tile (pack_bs_params->tile_row / tile_col) into
+// 'pack_bs_params->dst' at offset '*pack_bs_params->total_size'.
+// Every tile except the last one of its tile group is preceded by a 4-byte
+// little-endian field holding (tile size - AV1_MIN_TILE_SIZE_BYTES).
+// '*total_size' is advanced only by that 4-byte field; the tile's own size is
+// reported through 'pack_bs_params->buf.size' for the caller to add.
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td,
+                        PackBSParams *const pack_bs_params) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int row = pack_bs_params->tile_row;
+  const int col = pack_bs_params->tile_col;
+  // The last tile of the tile group does not have a header.
+  const int has_size_field = !pack_bs_params->is_last_tile_in_tg;
+  uint32_t *const total_size = pack_bs_params->total_size;
+  TileBufferEnc *const tile_buf = &pack_bs_params->buf;
+
+  TileInfo tile_info;
+  av1_tile_set_col(&tile_info, cm, col);
+  av1_tile_set_row(&tile_info, cm, row);
+
+  aom_writer writer;
+  writer.allow_update_cdf = !cm->features.disable_cdf_update;
+
+  av1_reset_loop_restoration(&td->mb.e_mbd, av1_num_planes(cm));
+
+  // The tile's region starts where the size field (if any) will be written.
+  tile_buf->data = pack_bs_params->dst + *total_size;
+  if (has_size_field) *total_size += 4;
+
+  // Pack tile data
+  aom_start_encode(&writer, pack_bs_params->dst + *total_size);
+  write_modes(cpi, td, &tile_info, &writer, row, col);
+  if (aom_stop_encode(&writer) < 0) {
+    aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR,
+                       "Error writing modes");
+  }
+
+  const unsigned int tile_size = writer.pos;
+  assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+  tile_buf->size = tile_size;
+
+  // Write tile size
+  if (has_size_field) {
+    mem_put_le32(tile_buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+  }
+}
+
+// Finalizes the tile group that starts at 'curr_tg_start' once its last tile
+// has been packed:
+//  - makes room for, and writes, the OBU size field after the OBU header,
+//    growing '*curr_tg_data_size', '*total_size' and '*tile_data_start' by
+//    the length of that field;
+//  - when the frame has a single tile group (frame header in the same OBU),
+//    shifts 'saved_wb->bit_buffer' by the same amount;
+//  - for every tile group after the first one in error resilient mode,
+//    inserts a copy of the Frame Header OBU described by 'fh_info' in front
+//    of the group, retyped as OBU_REDUNDANT_FRAME_HEADER, and resets
+//    '*largest_tile_id' to 0;
+//  - clears '*is_first_tg'.
+// A failure to write the OBU size field is reported through
+// aom_internal_error() on 'cpi->common.error'.
+void av1_write_last_tile_info(
+    AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+    struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+    uint8_t *curr_tg_start, uint32_t *const total_size,
+    uint8_t **tile_data_start, int *const largest_tile_id,
+    int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) {
+  // write current tile group size
+  const uint32_t obu_payload_size =
+      (uint32_t)(*curr_tg_data_size) - obu_header_size;
+  const size_t length_field_size =
+      av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start);
+  if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                              curr_tg_start) != AOM_CODEC_OK) {
+    // An assert alone would let release builds continue with an invalid size
+    // field, so report the failure explicitly.
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Error writing tile group OBU size");
+  }
+  *curr_tg_data_size += (int)length_field_size;
+  *total_size += (uint32_t)length_field_size;
+  *tile_data_start += length_field_size;
+  if (cpi->num_tg == 1) {
+    // if this tg is combined with the frame header then update saved
+    // frame header base offset according to length field size
+    saved_wb->bit_buffer += length_field_size;
+  }
+
+  if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) {
+    // Make room for a duplicate Frame Header OBU.
+    memmove(curr_tg_start + fh_info->total_length, curr_tg_start,
+            *curr_tg_data_size);
+
+    // Insert a copy of the Frame Header OBU.
+    memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length);
+
+    // Force context update tile to be the first tile in error
+    // resilient mode as the duplicate frame headers will have
+    // context_update_tile_id set to 0
+    *largest_tile_id = 0;
+
+    // Rewrite the OBU header to change the OBU type to Redundant Frame
+    // Header.
+    av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+                         OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
+                         &curr_tg_start[fh_info->obu_header_byte_offset]);
+
+    *curr_tg_data_size += (int)(fh_info->total_length);
+    *total_size += (uint32_t)(fh_info->total_length);
+  }
+  *is_first_tg = 0;
+}
+
+// Clears the per-thread statistics that av1_accumulate_pack_bs_thread_data()
+// later folds into the encoder state, so a new accumulation can start.
+void av1_reset_pack_bs_thread_data(ThreadData *const td) {
+  av1_zero(td->interp_filter_selected);
+  td->max_mv_magnitude = 0;
+  td->coefficient_size = 0;
+}
+
+// Folds the bitstream-packing statistics gathered in 'td' into 'cpi':
+// the coefficient size, the maximum MV magnitude and the per-filter
+// interpolation filter counts of the current frame.
+void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi,
+                                        ThreadData const *td) {
+  // max_mv_magnitude is only tracked when automatic MV step sizing is on, and
+  // the update is disabled for parallel frames via do_frame_data_update.
+  const int track_max_mv =
+      cpi->sf.mv_sf.auto_mv_step_size && cpi->do_frame_data_update;
+
+  cpi->rc.coefficient_size += td->coefficient_size;
+
+  if (track_max_mv) {
+    cpi->mv_search_params.max_mv_magnitude =
+        AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude);
+  }
+
+  for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) {
+    cpi->common.cur_frame->interp_filter_selected[filter] +=
+        td->interp_filter_selected[filter];
+  }
+}
+
+// Packs every tile of the frame, in raster order, on the calling thread
+// (default, i.e. non large-scale, tile case).
+// Tiles are split into cpi->num_tg tile groups of 'tg_size' tiles (the last
+// group may be shorter). The first tile of a group gets the OBU / frame /
+// tile group headers; after the last tile of a group the group is finalized
+// by av1_write_last_tile_info(). Outputs:
+//   *total_size       advanced by all bytes written to 'dst'
+//   *largest_tile_id  index of the largest tile seen so far (may be reset by
+//                     av1_write_last_tile_info())
+//   *max_tile_size    size in bytes of that tile
+//   *obu_header_size  OBU header size of the most recently opened group
+//   *tile_data_start  advanced past each group's headers and size field
+static void write_tile_obu(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+  const int tile_cols = tiles->cols;
+  const int num_tiles = tiles->rows * tile_cols;
+  // Fixed size tile groups for the moment
+  const int tg_size = (num_tiles + cpi->num_tg - 1) / cpi->num_tg;
+  uint8_t *tg_start = dst;  // First byte of the current tile group
+  size_t tg_bytes = 0;      // Bytes written for the current tile group
+  int tiles_in_tg = 0;
+  int new_tg = 1;
+  int is_first_tg = 1;
+
+  av1_reset_pack_bs_thread_data(&cpi->td);
+
+  // A single raster-order index replaces the row/column loop nest.
+  for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    if (new_tg) {
+      tg_start = dst + *total_size;
+      tiles_in_tg = 0;
+    }
+    tiles_in_tg++;
+
+    const int is_last_tile_in_tg =
+        (tiles_in_tg == tg_size) || (tile_idx == num_tiles - 1);
+
+    xd->tile_ctx = &cpi->tile_data[tile_idx].tctx;
+
+    // PackBSParams stores all parameters required to pack tile and header
+    // info.
+    PackBSParams pack_bs_params;
+    pack_bs_params.saved_wb = saved_wb;
+    pack_bs_params.total_size = total_size;
+    pack_bs_params.dst = dst;
+    pack_bs_params.tile_data_curr = tg_start;
+    pack_bs_params.obu_extn_header = obu_extn_header;
+    pack_bs_params.obu_header_size = 0;
+    pack_bs_params.curr_tg_hdr_size = 0;
+    pack_bs_params.tile_row = tile_idx / tile_cols;
+    pack_bs_params.tile_col = tile_idx % tile_cols;
+    pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg;
+    pack_bs_params.new_tg = new_tg;
+
+    if (new_tg) {
+      av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx);
+    }
+    av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params);
+
+    const size_t tile_bytes = pack_bs_params.buf.size;
+
+    if (new_tg) {
+      tg_bytes = pack_bs_params.curr_tg_hdr_size;
+      *tile_data_start += pack_bs_params.curr_tg_hdr_size;
+      *obu_header_size = pack_bs_params.obu_header_size;
+    }
+    // The next tile opens a new group exactly when this one closes a group.
+    new_tg = is_last_tile_in_tg;
+
+    // Every tile but the last of a group carries a 4-byte size field.
+    tg_bytes += (tile_bytes + (is_last_tile_in_tg ? 0 : 4));
+
+    if (tile_bytes > *max_tile_size) {
+      *largest_tile_id = tile_idx;
+      *max_tile_size = (unsigned int)tile_bytes;
+    }
+
+    if (is_last_tile_in_tg) {
+      av1_write_last_tile_info(cpi, fh_info, saved_wb, &tg_bytes, tg_start,
+                               total_size, tile_data_start, largest_tile_id,
+                               &is_first_tg, *obu_header_size,
+                               obu_extn_header);
+    }
+    *total_size += (uint32_t)tile_bytes;
+  }
+
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
+
+// Post-processes the packed tile data of the default tile case.
+// Always overwrites context_update_tile_id in the saved frame header with
+// 'largest_tile_id'. When the frame has a single tile group it additionally
+// runs remux_tiles() on the tile data, overwrites the 2-bit
+// (tile_size_bytes - 1) field, and rewrites the OBU size field (closing any
+// gap left by a shorter size field) if the total size changed.
+// '*total_size' is updated to the final byte count.
+static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst,
+                                struct aom_write_bit_buffer *saved_wb,
+                                int largest_tile_id, uint32_t *const total_size,
+                                unsigned int max_tile_size,
+                                uint32_t obu_header_size,
+                                uint8_t *tile_data_start) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+
+  // Fill in context_update_tile_id indicating the tile to use for the
+  // cdf update. The encoder currently sets it to the largest tile
+  // (but is up to the encoder)
+  aom_wb_overwrite_literal(saved_wb, largest_tile_id,
+                           (tiles->log2_cols + tiles->log2_rows));
+
+  // With more than one tile group, tile_size_bytes keeps its default value
+  // of 4 and nothing else needs to be patched.
+  if (cpi->num_tg != 1) return;
+
+  // Bytes that precede the tile data (headers and size field).
+  const uint32_t hdr_bytes = (uint32_t)(tile_data_start - dst);
+  int tile_size_bytes = 4;
+  int unused;
+  const uint32_t remuxed_bytes =
+      remux_tiles(tiles, tile_data_start, *total_size - hdr_bytes,
+                  max_tile_size, 0, &tile_size_bytes, &unused);
+  *total_size = remuxed_bytes + hdr_bytes;
+  assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+  aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+  // Update the OBU length if remux_tiles() reduced the size.
+  uint64_t payload_size;
+  size_t length_field_size;
+  int res =
+      aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size,
+                      &payload_size, &length_field_size);
+  assert(res == 0);
+  (void)res;
+
+  const uint64_t new_payload_size =
+      *total_size - obu_header_size - length_field_size;
+  if (new_payload_size == payload_size) return;
+
+  size_t new_length_field_size;
+  res = aom_uleb_encode(new_payload_size, length_field_size,
+                        dst + obu_header_size, &new_length_field_size);
+  assert(res == 0);
+  if (new_length_field_size >= length_field_size) return;
+
+  // The size field got shorter: slide the payload down to close the gap.
+  // The length moved is the previously coded payload size.
+  memmove(dst + obu_header_size + new_length_field_size,
+          dst + obu_header_size + length_field_size, (size_t)payload_size);
+  *total_size -= (int)(length_field_size - new_length_field_size);
+}
+
+// Chooses how many workers to use for bitstream packing.
+// As per the experiments, single-thread packing is better for frames with a
+// smaller bitstream size, because the setup overhead of multithreading then
+// exceeds the time needed to pack the frame. Each candidate worker count from
+// 'avail_workers' down to 2 is scored as
+//   (n - 1) / n * sum(abs_sum_level) - n * SETUP_TIME_OH_CONST
+//     - num_tiles * JOB_DISP_TIME_OH_CONST / n
+// and the best strictly positive score wins (ties keep the larger count).
+// Returns 1 when multithreaded packing is disabled or no score is positive.
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+                            int avail_workers, bool pack_bs_mt_enabled) {
+  if (!pack_bs_mt_enabled) return 1;
+
+  uint64_t frame_abs_sum_level = 0;
+  for (int i = 0; i < num_tiles; ++i) {
+    frame_abs_sum_level += tile_data[i].abs_sum_level;
+  }
+
+  const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+  int best_workers = 1;
+  float best_score = 0.0;
+
+  for (int n = avail_workers; n > 1; --n) {
+    const float fas_per_worker_const =
+        ((float)(n - 1) / n) * frame_abs_sum_level;
+    const float setup_time_const = (float)n * SETUP_TIME_OH_CONST;
+    const float score =
+        fas_per_worker_const - setup_time_const - job_disp_time_const / n;
+
+    if (score > best_score) {
+      best_score = score;
+      best_workers = n;
+    }
+  }
+  return best_workers;
+}
+
+// Packs all tiles of a frame in the default (non large-scale) tile case.
+// Uses the multi-threaded writer when calc_pack_bs_mt_workers() asks for more
+// than one worker, the single-threaded write_tile_obu() otherwise. When the
+// frame has more than one tile, write_tile_obu_size() is run afterwards.
+// Returns the number of bytes written to 'dst'.
+static INLINE uint32_t pack_tiles_in_tg_obus(
+    AV1_COMP *const cpi, uint8_t *const dst,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id) {
+  const CommonTileParams *const tiles = &cpi->common.tiles;
+  const int num_tiles = tiles->rows * tiles->cols;
+  uint32_t bytes_written = 0;
+  uint32_t obu_header_size = 0;
+  unsigned int max_tile_size = 0;
+  uint8_t *tile_data_start = dst;
+
+  const int num_workers = calc_pack_bs_mt_workers(
+      cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS],
+      cpi->mt_info.pack_bs_mt_enabled);
+
+  if (num_workers <= 1) {
+    write_tile_obu(cpi, dst, &bytes_written, saved_wb, obu_extension_header,
+                   fh_info, largest_tile_id, &max_tile_size, &obu_header_size,
+                   &tile_data_start);
+  } else {
+    av1_write_tile_obu_mt(cpi, dst, &bytes_written, saved_wb,
+                          obu_extension_header, fh_info, largest_tile_id,
+                          &max_tile_size, &obu_header_size, &tile_data_start,
+                          num_workers);
+  }
+
+  if (num_tiles > 1) {
+    write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &bytes_written,
+                        max_tile_size, obu_header_size, tile_data_start);
+  }
+  return bytes_written;
+}
+
+// Writes the tile data of the frame to 'dst' and returns the byte count.
+// Resets '*largest_tile_id' to 0, decides whether the segmentation map is
+// coded with temporal prediction, then dispatches to the large-scale or the
+// default tile packer.
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       uint8_t obu_extension_header,
+                                       const FrameHeaderInfo *fh_info,
+                                       int *const largest_tile_id) {
+  AV1_COMMON *const cm = &cpi->common;
+  *largest_tile_id = 0;
+
+  // Select the coding strategy (temporal or spatial): temporal prediction is
+  // used only when a primary reference frame exists and the cost in
+  // seg_tmp_pred_cost[0] is not lower than the one in seg_tmp_pred_cost[1].
+  if (cm->seg.enabled && cm->seg.update_map) {
+    const int has_primary_ref =
+        cm->features.primary_ref_frame != PRIMARY_REF_NONE;
+    cm->seg.temporal_update =
+        has_primary_ref && !(cpi->td.rd_counts.seg_tmp_pred_cost[0] <
+                             cpi->td.rd_counts.seg_tmp_pred_cost[1]);
+  }
+
+  if (!cm->tiles.large_scale) {
+    return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header,
+                                 fh_info, largest_tile_id);
+  }
+  return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb,
+                                           largest_tile_id);
+}
+
+// Writes the payload of a metadata OBU to 'dst': the metadata type coded with
+// aom_uleb_encode(), the raw metadata payload, and one trailing-bits byte
+// (0x80). Returns the number of bytes written, or 0 if the type could not be
+// encoded.
+static size_t av1_write_metadata_obu(const aom_metadata_t *metadata,
+                                     uint8_t *const dst) {
+  const uint64_t type = (uint64_t)metadata->type;
+  size_t type_bytes = 0;
+  if (aom_uleb_encode(type, sizeof(type), dst, &type_bytes) != 0) return 0;
+
+  uint8_t *const payload = dst + type_bytes;
+  memcpy(payload, metadata->payload, metadata->sz);
+  // Add trailing bits.
+  payload[metadata->sz] = 0x80;
+  return (uint32_t)(type_bytes + metadata->sz + 1);
+}
+
+// Writes one OBU_METADATA per eligible entry of the source frame's metadata
+// array to 'dst' and returns the total number of bytes written (0 when there
+// is no source frame or no metadata array). An entry is eligible when it has
+// a payload and its insert_flag matches the current frame type
+// (AOM_MIF_KEY_FRAME, AOM_MIF_NON_KEY_FRAME or AOM_MIF_ANY_FRAME).
+// A failure to write an OBU size is reported through aom_internal_error().
+static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
+  if (!cpi->source) return 0;
+  aom_metadata_array_t *const arr = cpi->source->metadata;
+  if (!arr) return 0;
+
+  const int is_key_frame =
+      cpi->common.current_frame.frame_type == KEY_FRAME;
+  size_t total_bytes_written = 0;
+
+  for (size_t i = 0; i < arr->sz; i++) {
+    aom_metadata_t *const md = arr->metadata_array[i];
+    if (!md || !md->payload) continue;
+
+    const int wanted =
+        (is_key_frame && md->insert_flag == AOM_MIF_KEY_FRAME) ||
+        (!is_key_frame && md->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
+        md->insert_flag == AOM_MIF_ANY_FRAME;
+    if (!wanted) continue;
+
+    const size_t obu_header_size = av1_write_obu_header(
+        &cpi->ppi->level_params, &cpi->frame_header_count, OBU_METADATA, 0,
+        dst);
+    const size_t obu_payload_size =
+        av1_write_metadata_obu(md, dst + obu_header_size);
+    const size_t length_field_size =
+        av1_obu_memmove(obu_header_size, obu_payload_size, dst);
+
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) !=
+        AOM_CODEC_OK) {
+      aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                         "Error writing metadata OBU size");
+      continue;
+    }
+
+    const size_t obu_bytes =
+        obu_header_size + obu_payload_size + length_field_size;
+    dst += obu_bytes;
+    total_bytes_written += obu_bytes;
+  }
+  return total_bytes_written;
+}
+
+// Packs the bitstream for one frame into 'dst', in this order:
+//   1. a Sequence Header OBU, on key frames and intra-only frames;
+//   2. metadata OBUs, when the frame is shown;
+//   3. a standalone Frame Header OBU, when the frame uses more than one tile
+//      group or is a show-existing frame;
+//   4. the tile data (skipped for show-existing frames).
+// Each OBU written here gets its size field from av1_write_uleb_obu_size().
+// On success '*size' holds the number of bytes written and AOM_CODEC_OK is
+// returned; AOM_CODEC_ERROR is returned if an OBU size cannot be written.
+// '*largest_tile_id' is set by the tile writer.
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+                       int *const largest_tile_id) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
+  const uint8_t obu_extension_header =
+      cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+  uint8_t *wr = dst;  // Write cursor into 'dst'
+  uint32_t obu_header_size = 0;
+  uint32_t obu_payload_size = 0;
+  FrameHeaderInfo fh_info = { NULL, 0, 0 };
+
+  // If no non-zero delta_q has been used, reset delta_q_present_flag
+  if (cpi->deltaq_used == 0) cm->delta_q_info.delta_q_present_flag = 0;
+
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_reset_write();
+#endif
+
+  cpi->frame_header_count = 0;
+
+  // The TD is now written outside the frame encode loop
+
+  // Write the sequence header OBU at each key frame or intra_only frame.
+  const int needs_seq_header =
+      cm->current_frame.frame_type == KEY_FRAME ||
+      cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+  if (needs_seq_header) {
+    obu_header_size = av1_write_obu_header(
+        level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, wr);
+    obu_payload_size =
+        av1_write_sequence_header_obu(cm->seq_params, wr + obu_header_size);
+    const size_t seq_length_field =
+        av1_obu_memmove(obu_header_size, obu_payload_size, wr);
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, wr) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+
+    wr += obu_header_size + obu_payload_size + seq_length_field;
+  }
+
+  // write metadata obus before the frame obu that has the show_frame flag set
+  if (cm->show_frame) wr += av1_write_metadata_array(cpi, wr);
+
+  const int write_frame_header =
+      (cpi->num_tg > 1 || encode_show_existing_frame(cm));
+  struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+  size_t length_field = 0;
+  if (write_frame_header) {
+    // Write Frame Header OBU.
+    fh_info.frame_header = wr;
+    obu_header_size =
+        av1_write_obu_header(level_params, &cpi->frame_header_count,
+                             OBU_FRAME_HEADER, obu_extension_header, wr);
+    obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+                                              wr + obu_header_size, 1);
+
+    length_field = av1_obu_memmove(obu_header_size, obu_payload_size, wr);
+    if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, wr) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+
+    fh_info.obu_header_byte_offset = 0;
+    fh_info.total_length = obu_header_size + obu_payload_size + length_field;
+    wr += fh_info.total_length;
+  }
+
+  // A show-existing frame carries no tile data.
+  uint32_t data_size = 0;
+  if (!encode_show_existing_frame(cm)) {
+    // Since length_field is determined adaptively after frame header
+    // encoding, saved_wb must be adjusted accordingly.
+    if (saved_wb.bit_buffer != NULL) {
+      saved_wb.bit_buffer += length_field;
+    }
+
+    data_size = write_tiles_in_tg_obus(cpi, wr, &saved_wb,
+                                       obu_extension_header, &fh_info,
+                                       largest_tile_id);
+  }
+  wr += data_size;
+  *size = wr - dst;
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..12e8a630db
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
+
+struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream. Used for
+// later identifying identical tiles.
+typedef struct {
+ uint8_t *data; // Start of the tile's bytes in the output buffer; the tile size field, when one is written, is stored here
+ size_t size; // Size of the packed tile data in bytes, taken from the bit writer position
+} TileBufferEnc;
+
+typedef struct { // Location of an already written Frame Header OBU, kept so it can be duplicated later
+ uint8_t *frame_header; // Start of the Frame Header OBU in the output buffer
+ size_t obu_header_byte_offset; // Offset of the OBU header byte within that OBU
+ size_t total_length; // OBU header + OBU size field + payload, in bytes
+} FrameHeaderInfo;
+
+typedef struct {
+ struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure
+ TileBufferEnc buf; // Structure to hold bitstream buffer and size
+ uint32_t *total_size; // Number of bytes written to 'dst' so far; the next write starts at dst + *total_size
+ uint8_t *dst; // Base address of tile bitstream buffer
+ uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer
+ size_t tile_buf_size; // Available bitstream buffer for the tile in bytes
+ uint8_t obu_extn_header; // OBU extension header byte passed to av1_write_obu_header()
+ uint32_t obu_header_size; // Size of the OBU header
+ int curr_tg_hdr_size; // Combined size of the obu, frame and tile group headers
+ int tile_size_mi; // Tile size in mi units
+ int tile_row; // Row index of the tile being packed
+ int tile_col; // Column index of the tile being packed
+ int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group
+ int new_tg; // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+ uint64_t abs_sum_level; // NOTE(review): presumably the tile's abs_sum_level (cf. TileDataEnc) - confirm
+ uint16_t tile_idx; // NOTE(review): presumably the tile's index within the frame - confirm
+} PackBSTileOrder;
+
+// Job dispatch state for pack bitstream multi-threading: tile order, next job index and abort flag.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+ // Tile order structure of pack bitstream multithreading.
+ PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+ // Index of next job to be processed.
+ int next_job_idx;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool pack_bs_mt_exit;
+} AV1EncPackBSSync;
+
+/*!\endcond */
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+ int *frame_header_count, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst);
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *dest);
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td,
+ PackBSParams *const pack_bs_params);
+
+void av1_write_last_tile_info(
+ struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+ struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+ uint8_t *curr_tg_start, uint32_t *const total_size,
+ uint8_t **tile_data_start, int *const largest_tile_id,
+ int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header);
+
+/*!\brief Pack the bitstream for one frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ */
+int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id);
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
+
+void av1_reset_pack_bs_thread_data(struct ThreadData *const td);
+
+void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi,
+ struct ThreadData const *td);
+
+void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi,
+ MACROBLOCKD *const xd,
+ PackBSParams *const pack_bs_params,
+ const int tile_idx);
+
+int av1_neg_interleave(int x, int ref, int max);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 0000000000..33d2d8c2a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Declares various structs used to encode the current partition block.
+ */
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#include "av1/common/enums.h"
+#include "av1/common/mvref_common.h"
+
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp_structs.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+
+#include "av1/encoder/hash_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl blocks along one side of a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
+//! Number of txfm hash records kept for the partition block.
+#define RD_RECORD_BUFFER_LEN 8
+
+/*! Maximum value taken by transform type probabilities */
+#define MAX_TX_TYPE_PROB 1024
+
+//! Compute color sensitivity index for given plane
+#define COLOR_SENS_IDX(plane) ((plane)-1)
+
+//! Enable timer statistics of mode search in non-rd
+#define COLLECT_NONRD_PICK_MODE_STAT 0
+
+/*!\cond */
+#if COLLECT_NONRD_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
+
+typedef struct _mode_search_stat_nonrd {
+ int32_t num_blocks[BLOCK_SIZES];
+ int64_t total_block_times[BLOCK_SIZES];
+ int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
+ struct aom_usec_timer timer1;
+ struct aom_usec_timer timer2;
+ struct aom_usec_timer bsize_timer;
+} mode_search_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+/*!\endcond */
+
+/*! \brief Superblock level encoder info
+ *
+ * SuperblockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
+typedef struct {
+ //! Minimum partition size for the sb.
+ BLOCK_SIZE min_partition_size;
+ //! Maximum partition size for the sb.
+ BLOCK_SIZE max_partition_size;
+
+ /*****************************************************************************
+ * \name TPL Info
+ *
+ * Information gathered from tpl_model at tpl block precision for the
+ * superblock to speed up the encoding process.
+ ****************************************************************************/
+ /**@{*/
+ //! Number of TPL blocks in this superblock.
+ int tpl_data_count;
+ //! TPL's estimate of inter cost for each tpl block.
+ int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! TPL's estimate of intra cost for each tpl block.
+ int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! Motion vectors found by TPL model for each tpl block.
+ int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+ //! TPL's stride for the arrays in this struct.
+ int tpl_stride;
+ /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+ //! The mbmi used to reconstruct the winner mode.
+ MB_MODE_INFO mbmi;
+ //! Rdstats of the winner mode.
+ RD_STATS rd_cost;
+ //! Rdcost of the winner mode
+ int64_t rd;
+ //! Luma rate of the winner mode.
+ int rate_y;
+ //! Chroma rate of the winner mode.
+ int rate_uv;
+ //! The color map needed to reconstruct palette mode.
+ uint8_t color_index_map[MAX_SB_SQUARE];
+ //! The current winner mode.
+ THR_MODES mode_index;
+} WinnerModeStats;
+
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
+typedef struct macroblock_plane {
+ //! Stores source - pred so the txfm can be computed later
+ int16_t *src_diff;
+ //! Dequantized coefficients
+ tran_low_t *dqcoeff;
+ //! Quantized coefficients
+ tran_low_t *qcoeff;
+ //! Transformed coefficients
+ tran_low_t *coeff;
+ //! Location of the end of qcoeff (end of block).
+ uint16_t *eobs;
+ //! Contexts used to code the transform coefficients.
+ uint8_t *txb_entropy_ctx;
+ //! A buffer containing the source frame.
+ struct buf_2d src;
+
+ /*! \name Quantizer Settings
+ *
+ * \attention These are used/accessed only in the quantization process.
+ * RDO does not and *must not* depend on any of these values.
+ * All values below share the coefficient scale/shift used in TX.
+ */
+ /**@{*/
+ //! Quantization step size used by AV1_XFORM_QUANT_FP.
+ const int16_t *quant_fp_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
+ const int16_t *round_fp_QTX;
+ //! Quantization step size used by AV1_XFORM_QUANT_B.
+ const int16_t *quant_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
+ const int16_t *round_QTX;
+ //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
+ const int16_t *quant_shift_QTX;
+ //! Size of the quantization bin around 0. Only Used by QUANT_B
+ const int16_t *zbin_QTX;
+ //! Dequantizer
+ const int16_t *dequant_QTX;
+ /**@}*/
+} MACROBLOCK_PLANE;
+
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, dc_sign,
+ */
+typedef struct {
+ //! Cost to skip txfm for the current txfm block.
+ int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+ /*! \brief Cost for encoding the base_eob of a level.
+ *
+ * Decoder uses base_eob to derive the base_level as base_eob := base_eob+1.
+ */
+ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+ /*! \brief Cost for encoding the base level of a coefficient.
+ *
+ * Decoder derives coeff_base as coeff_base := base_eob + 1.
+ */
+ int base_cost[SIG_COEF_CONTEXTS][8];
+ /*! \brief Cost for encoding the last non-zero coefficient.
+ *
+ * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1
+ */
+ int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+ //! Cost for encoding the dc_sign
+ int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+ //! Cost for encoding an increment to the coefficient
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;
+
+/*! \brief Costs for encoding the eob.
+ */
+typedef struct {
+ //! eob_cost.
+ int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+/*! \brief Stores the transforms coefficients for the whole superblock.
+ */
+typedef struct {
+ //! The transformed coefficients.
+ tran_low_t *tcoeff[MAX_MB_PLANE];
+ //! Where the transformed coefficients end.
+ uint16_t *eobs[MAX_MB_PLANE];
+ /*! \brief Transform block entropy contexts.
+ *
+ * Each element is used as a bit field.
+ * - Bits 0~3: txb_skip_ctx
+ * - Bits 4~5: dc_sign_ctx.
+ */
+ uint8_t *entropy_ctx[MAX_MB_PLANE];
+} CB_COEFF_BUFFER;
+
+/*! \brief Extended mode info derived from mbmi.
+ */
+typedef struct {
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ //! The reference mv list for the current block.
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! The weights used to compute the ref mvs.
+ uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! Number of ref mvs in the drl.
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ //! Global mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! Context used to encode the current mode.
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame level in here is used in bitstream preparation stage. The
+ * information in \ref MB_MODE_INFO_EXT are copied to this struct to save
+ * memory.
+ */
+typedef struct {
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
+ CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::weight
+ uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_count
+ uint8_t ref_mv_count;
+ // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+ //! \copydoc MB_MODE_INFO_EXT::global_mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! \copydoc MB_MODE_INFO_EXT::mode_context
+ int16_t mode_context;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+} MB_MODE_INFO_EXT_FRAME;
+
+/*! \brief Inter-mode txfm results for a partition block.
+ */
+typedef struct {
+ //! Txfm size used if the current mode is intra mode.
+ TX_SIZE tx_size;
+ //! Txfm sizes used if the current mode is inter mode.
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ //! Map showing which txfm block skips the txfm process.
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Map showing the txfm types for each block.
+ uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Rd_stats for the whole partition block.
+ RD_STATS rd_stats;
+ //! Hash value of the current record.
+ uint32_t hash_value;
+} MB_RD_INFO;
+
+/*! \brief Hash records of the inter-mode transform results
+ *
+ * Hash records of the inter-mode transform results for a whole partition block
+ * based on the residue. Since this operates on the partition block level, this
+ * can give us a whole txfm partition tree.
+ */
+typedef struct {
+ /*! Circular buffer that stores the inter-mode txfm results of a partition
+ * block.
+ */
+ MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN];
+ //! Index to insert the newest rd record.
+ int index_start;
+ //! Number of info stored in this record.
+ int num;
+ //! Hash function
+ CRC32C crc_calculator;
+} MB_RD_RECORD;
+
+//! Number of compound rd stats
+#define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
+typedef struct {
+ //! Rate of the compound modes.
+ int32_t rate[COMPOUND_TYPES];
+ //! Distortion of the compound modes.
+ int64_t dist[COMPOUND_TYPES];
+ //! Estimated rate of the compound modes.
+ int32_t model_rate[COMPOUND_TYPES];
+ //! Estimated distortion of the compound modes.
+ int64_t model_dist[COMPOUND_TYPES];
+ //! Rate needed to send the mask type.
+ int comp_rs2[COMPOUND_TYPES];
+ //! Motion vector for each predictor.
+ int_mv mv[2];
+ //! Ref frame for each predictor.
+ MV_REFERENCE_FRAME ref_frames[2];
+ //! Current prediction mode.
+ PREDICTION_MODE mode;
+ //! Current interpolation filter.
+ int_interpfilters filter;
+ //! Refmv index in the drl.
+ int ref_mv_idx;
+ //! Whether the predictors are GLOBALMV.
+ int is_global[2];
+ //! Current parameters for interinter mode.
+ INTERINTER_COMPOUND_DATA interinter_comp;
+} COMP_RD_STATS;
+
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
+typedef struct {
+ /*! \brief A new source weighted with the above and left predictors.
+ *
+ * Used to efficiently construct multiple obmc predictors during rdopt.
+ */
+ int32_t *wsrc;
+ /*! \brief A new mask constructed from the original horz/vert mask.
+ *
+ * \copydetails wsrc
+ */
+ int32_t *mask;
+ /*! \brief Prediction from the above predictor.
+ *
+ * Used to build the obmc predictor.
+ */
+ uint8_t *above_pred;
+ /*! \brief Prediction from the left predictor.
+ *
+ * \copydetails above_pred
+ */
+ uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+ //! The best color map found.
+ uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+ //! A temporary buffer used for k-means clustering.
+ int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
+typedef struct {
+ //! First prediction.
+ uint8_t *pred0;
+ //! Second prediction.
+ uint8_t *pred1;
+ //! Source - first prediction.
+ int16_t *residual1;
+ //! Second prediction - first prediction.
+ int16_t *diff10;
+ //! Backup of the best segmentation mask.
+ uint8_t *tmp_best_mask_buf;
+} CompoundTypeRdBuffers;
+
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
+#if !CONFIG_REALTIME_ONLY
+ // The following 4 parameters are used for cnn-based partitioning on intra
+ // frame.
+ /*! \brief Current index on the partition block quad tree.
+ *
+ * Used to index into the cnn buffer for partition decision.
+ */
+ int quad_tree_idx;
+ //! Whether the CNN buffer contains valid output.
+ int cnn_output_valid;
+ //! A buffer used by our segmentation CNN for intra-frame partitioning.
+ float cnn_buffer[CNN_OUT_BUF_SIZE];
+ //! log of the quantization parameter of the ancestor BLOCK_64X64.
+ float log_q;
+#endif
+
+ /*! \brief Variance of the subblocks in the superblock.
+ *
+ * This is used by rt mode for variance based partitioning.
+ * The indices corresponds to the following block sizes:
+ * - 0 - 128x128
+ * - 1-2 - 128x64
+ * - 3-4 - 64x128
+ * - 5-8 - 64x64
+ * - 9-16 - 64x32
+ * - 17-24 - 32x64
+ * - 25-40 - 32x32
+ * - 41-104 - 16x16
+ */
+ uint8_t variance_low[105];
+} PartitionSearchInfo;
+
+/*!\cond */
+enum {
+ /**
+ * Do not prune transform depths.
+ */
+ TX_PRUNE_NONE = 0,
+ /**
+ * Prune largest transform (depth 0) based on NN model.
+ */
+ TX_PRUNE_LARGEST = 1,
+ /**
+ * Prune split transforms (depth>=1) based on NN model.
+ */
+ TX_PRUNE_SPLIT = 2,
+} UENUM1BYTE(TX_PRUNE_TYPE);
+/*!\endcond */
+
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+ /*! \brief Whether to limit the intra txfm search type to the default txfm.
+ *
+ * This could be a result of either the sequence parameter or speed
+ * features.
+ */
+ int use_default_intra_tx_type;
+
+ /*! Probability threshold used for conditionally forcing tx type*/
+ int default_inter_tx_type_prob_thresh;
+
+ //! Whether to prune 2d transforms based on 1d transform results.
+ int prune_2d_txfm_mode;
+
+ /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+ *
+ * See the documentation for \ref WinnerModeParams for more detail.
+ */
+ unsigned int coeff_opt_thresholds[2];
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int tx_domain_dist_threshold;
+ /*! \copydoc coeff_opt_thresholds */
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int use_transform_domain_distortion;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int skip_txfm_level;
+
+ /*! \brief How to search for the optimal tx_size
+ *
+ * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+ * the current partition block; if TX_MODE_SELECT, search through the whole
+ * tree.
+ *
+ * \attention
+ * Although this looks suspiciously similar to a bitstream element, this
+ * tx_mode_search_type is only used internally by the encoder, and is *not*
+ * written to the bitstream. It determines what kind of tx_mode would be
+ * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+ * candidate, then code it as TX_MODE_SELECT.
+ */
+ TX_MODE tx_mode_search_type;
+
+ /*!
+ * Determines whether a block can be predicted as transform skip or DC only
+ * based on residual mean and variance.
+ * Type 0 : No skip block or DC only block prediction
+ * Type 1 : Prediction of skip block based on residual mean and variance
+ * Type 2 : Prediction of skip block or DC only block based on residual mean
+ * and variance
+ */
+ unsigned int predict_dc_level;
+
+ /*!
+ * Whether or not we should use the quantization matrix as weights for PSNR
+ * during RD search.
+ */
+ int use_qm_dist_metric;
+
+ /*!
+ * Keep track of previous mode evaluation stage type. This will be used to
+ * reset mb rd hash record when mode evaluation type changes.
+ */
+ int mode_eval_type;
+
+#if !CONFIG_REALTIME_ONLY
+ //! Indicates the transform depths for which RD evaluation is skipped.
+ TX_PRUNE_TYPE nn_prune_depths_for_intra_tx;
+
+ /*! \brief Indicates if NN model should be invoked to prune transform depths.
+ *
+ * Used to signal whether NN model should be evaluated to prune the R-D
+ * evaluation of specific transform depths.
+ */
+ bool enable_nn_prune_intra_tx_depths;
+#endif
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+ //! Whether to skip transform and quantization on a partition block level.
+ uint8_t skip_txfm;
+
+ /*! \brief Whether to skip transform and quantization on a txfm block level.
+ *
+ * Skips transform and quantization on a transform block level inside the
+ * current partition block. Each element of this array is used as a bit-field.
+ * So for example, if we are skipping on the luma plane, then the last bit
+ * would be set to 1.
+ */
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Transform types inside the partition block
+ *
+ * Keeps a record of what kind of transform to use for each of the transform
+ * block inside the partition block.
+ * \attention The buffer here is *never* directly used. Instead, this just
+ * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+ * partition block. So if we need to save memory, we could move the allocation
+ * to pick_sb_mode instead.
+ */
+ uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ //! Txfm hash records of inter-modes.
+ MB_RD_RECORD *mb_rd_record;
+
+ /*! \brief Number of txb splits.
+ *
+ * Keep track of how many times we've used split tx partition for transform
+ * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+ * of the count of the current block. Instead, it's a cumulative count across
+ * the whole frame. The main usage is that if txb_split_count is zero, then
+ * we can signal TX_MODE_LARGEST at frame level.
+ */
+ // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+ // as ThreadData.
+ unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+ //! For debugging. Used to check how many txfm searches we are doing.
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
+
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Partition Costs
+ ****************************************************************************/
+ /**@{*/
+ //! Cost for coding the partition.
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: General
+ ****************************************************************************/
+ /**@{*/
+ //! Luma mode cost for inter frame.
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ //! Luma mode cost for intra frame.
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ //! Chroma mode cost
+ int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ //! filter_intra_cost
+ int filter_intra_cost[BLOCK_SIZES_ALL][2];
+ //! filter_intra_mode_cost
+ int filter_intra_mode_cost[FILTER_INTRA_MODES];
+ //! angle_delta_cost
+ int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+
+ //! Rate associated with each alpha codeword
+ int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: Screen Contents
+ ****************************************************************************/
+ /**@{*/
+ //! intrabc_cost
+ int intrabc_cost[2];
+
+ //! palette_y_size_cost
+ int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_uv_size_cost
+ int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_y_color_cost
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_uv_color_cost
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_y_mode_cost
+ int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ //! palette_uv_mode_cost
+ int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: MV Modes
+ ****************************************************************************/
+ /**@{*/
+ //! skip_mode_cost
+ int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+ //! newmv_mode_cost
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ //! zeromv_mode_cost
+ int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+ //! refmv_mode_cost
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ //! drl_mode_cost0
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Ref Frame Types
+ ****************************************************************************/
+ /**@{*/
+ //! single_ref_cost
+ int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ //! comp_inter_cost
+ int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+ //! comp_ref_type_cost
+ int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(COMP_REFERENCE_TYPES)];
+ //! uni_comp_ref_cost
+ int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode
+ *
+ * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME.
+ */
+ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+ /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode
+ *
+ * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME.
+ */
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Types
+ ****************************************************************************/
+ /**@{*/
+ //! intra_inter_cost
+ int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+ //! inter_compound_mode_cost
+ int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ //! compound_type_cost
+ int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ //! wedge_idx_cost
+ int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+ //! interintra_cost
+ int interintra_cost[BLOCK_SIZE_GROUPS][2];
+ //! wedge_interintra_cost
+ int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+ //! interintra_mode_cost
+ int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Masks
+ ****************************************************************************/
+ /**@{*/
+ //! comp_idx_cost
+ int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+ //! comp_group_idx_cost
+ int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Motion Modes/Filters
+ ****************************************************************************/
+ /**@{*/
+ //! motion_mode_cost
+ int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+ //! motion_mode_cost1
+ int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+ //! switchable_interp_costs
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! skip_txfm_cost
+ int skip_txfm_cost[SKIP_CONTEXTS][2];
+ //! tx_size_cost
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+ //! txfm_partition_cost
+ int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+ //! inter_tx_type_costs
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ //! intra_tx_type_costs
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Restoration Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! switchable_restore_cost
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+ //! wiener_restore_cost
+ int wiener_restore_cost[2];
+ //! sgrproj_restore_cost
+ int sgrproj_restore_cost[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Segmentation Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! tmp_pred_cost
+ int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2];
+ //! spatial_pred_cost
+ int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS];
+ /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Encoding Costs
+ * Here are the entropy costs needed to encode a given mv.
+ * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that hold
+ * the memory for holding the mv cost. But since the motion vectors can be
+ * negative, we shift them to the middle and store the resulting pointer in
+ * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref
+ * mv_cost_stack points to the \ref nmv_cost with the mv precision we are
+ * currently working with. In essence, only \ref mv_cost_stack is needed for
+ * motion search, the other can be considered private.
+ ****************************************************************************/
+ /**@{*/
+ //! Costs for coding the zero components.
+ int nmv_joint_cost[MV_JOINTS];
+
+ //! Allocates memory for 1/4-pel motion vector costs.
+ int nmv_cost_alloc[2][MV_VALS];
+ //! Allocates memory for 1/8-pel motion vector costs.
+ int nmv_cost_hp_alloc[2][MV_VALS];
+ //! Points to the middle of \ref nmv_cost_alloc
+ int *nmv_cost[2];
+ //! Points to the middle of \ref nmv_cost_hp_alloc
+ int *nmv_cost_hp[2];
+ //! Points to whichever of \ref nmv_cost or \ref nmv_cost_hp is in use.
+ int **mv_cost_stack;
+ /**@}*/
+} MvCosts;
+
+/*! \brief Holds mv costs for intrabc.
+ */
+typedef struct {
+ /*! Costs for coding the joint mv. */
+ int joint_mv[MV_JOINTS];
+
+ /*! \brief Cost of transmitting the actual motion vector.
+ * dv_costs_alloc[0][i] is the cost of motion vector with vertical
+ * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of
+ * motion vector with horizontal component (mv_col) equal to i - MV_MAX.
+ */
+ int dv_costs_alloc[2][MV_VALS];
+
+ /*! Points to the middle of \ref dv_costs_alloc. */
+ int *dv_costs[2];
+} IntraBCMVCosts;
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+ //! Costs for coding the coefficients.
+ LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+ //! Costs for coding the eobs.
+ LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+ //! Number of samples.
+ int num;
+ //! Sample locations in current frame.
+ int pts[16];
+ //! Sample location in the reference frame.
+ int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+ kZeroSad = 0,
+ kVeryLowSad = 1,
+ kLowSad = 2,
+ kMedSad = 3,
+ kHighSad = 4
+} SOURCE_SAD;
+
+typedef struct {
+ //! SAD levels in non-rd path
+ SOURCE_SAD source_sad_nonrd;
+ //! SAD levels in rd-path for var-based part qindex thresholds
+ SOURCE_SAD source_sad_rd;
+ int lighting_change;
+ int low_sumdiff;
+} CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+ uint16_t abs_dx_abs_dy_sum;
+ int8_t hist_bin_idx;
+ bool is_dx_zero;
+} PixelLevelGradientInfo;
+
+// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks.
+typedef struct {
+ double log_var;
+ int var;
+} Block4x4VarInfo;
+
+#ifndef NDEBUG
+typedef struct SetOffsetsLoc {
+ int mi_row;
+ int mi_col;
+ BLOCK_SIZE bsize;
+} SetOffsetsLoc;
+#endif // NDEBUG
+
+/*!\endcond */
+
+/*! \brief Encoder's parameters related to the current coding block.
+ *
+ * This struct contains most of the information the encoder needs to encode the
+ * current coding block. This includes the src and pred buffer, a copy of the
+ * decoder's view of the current block, the txfm coefficients. This struct also
+ * contains various buffers and data used to speed up the encoding process.
+ */
+typedef struct macroblock {
+ /*****************************************************************************
+ * \name Source, Buffers and Decoder
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Each of the encoding plane.
+ *
+ * An array holding the src buffer for each plane of the current block. It
+ * also contains the txfm and quantized txfm coefficients.
+ */
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ /*! \brief Decoder's view of current coding block.
+ *
+ * Contains the encoder's copy of what the decoder sees in the current block.
+ * Most importantly, this struct contains pointers to mbmi that is used in
+ * final bitstream packing.
+ */
+ MACROBLOCKD e_mbd;
+
+ /*! \brief Derived coding information.
+ *
+ * Contains extra information not transmitted in the bitstream but are
+ * derived. For example, this contains the stack of ref_mvs.
+ */
+ MB_MODE_INFO_EXT mbmi_ext;
+
+ /*! \brief Finalized mbmi_ext for the whole frame.
+ *
+ * Contains the finalized info in mbmi_ext that gets used at the frame level
+ * for bitstream packing.
+ */
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+
+ //! Entropy context for the current row.
+ FRAME_CONTEXT *row_ctx;
+ /*! \brief Entropy context for the current tile.
+ *
+ * This context will be used to update color_map_cdf pointer which would be
+ * used during pack bitstream. For single thread and tile-multithreading case
+ * this pointer will be same as xd->tile_ctx, but for the case of row-mt:
+ * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
+ * to the accurate tile context.
+ */
+ FRAME_CONTEXT *tile_pb_ctx;
+
+ /*! \brief Buffer of transformed coefficients
+ *
+ * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized
+ * coefficients. This is here to conveniently copy the best coefficients to
+ * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a
+ * superblock level, we need to combine it with cb_offset to get the proper
+ * position for the current coding block.
+ */
+ CB_COEFF_BUFFER *cb_coef_buff;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+
+ //! Modified source and masks used for fast OBMC search.
+ OBMCBuffer obmc_buffer;
+ //! Buffer to store the best palette map.
+ PALETTE_BUFFER *palette_buffer;
+ //! Buffer used for compound_type_rd().
+ CompoundTypeRdBuffers comp_rd_buffer;
+ //! Buffer to store convolution during averaging process in compound mode.
+ CONV_BUF_TYPE *tmp_conv_dst;
+
+ /*! \brief Temporary buffer to hold prediction.
+ *
+ * Points to a buffer that is used to hold temporary prediction results. This
+ * is used in two ways:
+ * - This is a temporary buffer used to ping-pong the prediction in
+ * handle_inter_mode.
+ * - xd->tmp_obmc_bufs also points to this buffer, and is used in obmc
+ * prediction.
+ */
+ uint8_t *tmp_pred_bufs[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rdopt Costs
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Quantization index for the current partition block.
+ *
+ * This is used as the index to find quantization parameter for luma and
+ * chroma transformed coefficients.
+ */
+ int qindex;
+
+ /*! \brief Difference between frame-level qindex and current qindex.
+ *
+ * This is used to track whether a non-zero delta for qindex is used at least
+ * once in the current frame.
+ */
+ int delta_qindex;
+
+ /*! \brief Difference between frame-level qindex and qindex used to
+ * compute rdmult (lambda).
+ *
+ * rdmult_delta_qindex is assigned the same as delta_qindex before qp sweep.
+ * During qp sweep, delta_qindex is changed and used to calculate the actual
+ * quant params, while rdmult_delta_qindex remains the same, and is used to
+ * calculate the rdmult in "set_deltaq_rdmult".
+ */
+ int rdmult_delta_qindex;
+
+ /*! \brief Current qindex (before being adjusted by delta_q_res) used to
+ * derive rdmult_delta_qindex.
+ */
+ int rdmult_cur_qindex;
+
+ /*! \brief Rate-distortion multiplier.
+ *
+ * The rd multiplier used to determine the rate-distortion trade-off. This is
+ * roughly proportional to the inverse of q-index for a given frame, but this
+ * can be manipulated for better rate-control. For example, in tune_ssim
+ * mode, this is scaled by a factor related to the variance of the current
+ * block.
+ */
+ int rdmult;
+
+ //! Intra only, per sb rd adjustment.
+ int intra_sb_rdmult_modifier;
+
+ //! Superblock level distortion propagation factor.
+ double rb;
+
+ //! Energy in the current source coding block. Used to calculate \ref rdmult
+ int mb_energy;
+ //! Energy in the current source superblock. Used to calculate \ref rdmult
+ int sb_energy_level;
+
+ //! The rate needed to signal a mode to the bitstream.
+ ModeCosts mode_costs;
+
+ //! The rate needed to encode a new motion vector to the bitstream and some
+ //! multipliers for motion search.
+ MvCosts *mv_costs;
+
+ /*! The rate needed to encode a new motion vector to the bitstream in intrabc
+ * mode.
+ */
+ IntraBCMVCosts *dv_costs;
+
+ //! The rate needed to signal the txfm coefficients to the bitstream.
+ CoeffCosts coeff_costs;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rate to Distortion Multipliers
+ ****************************************************************************/
+ /**@{*/
+ //! A multiplier that converts mv cost to l2 error.
+ int errorperbit;
+ //! A multiplier that converts mv cost to l1 error.
+ int sadperbit;
+ /**@}*/
+
+ /******************************************************************************
+ * \name Segmentation
+ *****************************************************************************/
+ /**@{*/
+ /*! \brief Skip mode for the segment
+ *
+ * A syntax element of the segmentation mode. In skip_block mode, all mvs are
+ * set 0 and all txfms are skipped.
+ */
+ int seg_skip_block;
+
+ /*! \brief Number of segment 1 blocks
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+
+ /*!\brief Number of segment 2 blocks
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+
+ /*!\brief Number of zero motion vectors
+ */
+ int cnt_zeromv;
+
+ /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path.
+ *
+ * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks
+ * in the superblock may be marked as zeromv-skip at block level.
+ */
+ int force_zeromv_skip_for_sb;
+
+ /*!\brief Flag to force zeromv-skip at block level, for nonrd path.
+ */
+ int force_zeromv_skip_for_blk;
+
+ /*! \brief Previous segment id for which qmatrices were updated.
+ * This is used to bypass setting of qmatrices if no change in qindex.
+ */
+ int prev_segment_id;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Superblock
+ ****************************************************************************/
+ /**@{*/
+ //! Information on a whole superblock level.
+ // TODO(chiyotsai@google.com): Refactor this out of macroblock
+ SuperBlockEnc sb_enc;
+
+ /*! \brief Characteristics of the current superblock.
+ *
+ * Characteristics like whether the block has high sad, low sad, etc. This is
+ * only used by av1 realtime mode.
+ */
+ CONTENT_STATE_SB content_state_sb;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Reference Frame Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Sum absolute distortion of the predicted mv for each ref frame.
+ *
+ * This is used to measure how viable a reference frame is.
+ */
+ int pred_mv_sad[REF_FRAMES];
+ /*! \brief The minimum of \ref pred_mv_sad.
+ *
+ * Index 0 stores the minimum \ref pred_mv_sad across past reference frames.
+ * Index 1 stores the minimum \ref pred_mv_sad across future reference frames.
+ */
+ int best_pred_mv_sad[2];
+ //! The sad of the 1st mv ref (nearest).
+ int pred_mv0_sad[REF_FRAMES];
+ //! The sad of the 2nd mv ref (near).
+ int pred_mv1_sad[REF_FRAMES];
+
+ /*! \brief Disables certain ref frame pruning based on tpl.
+ *
+ * Determines whether a given ref frame is "good" based on data from the TPL
+ * model. If so, this stops selective_ref frame from pruning the given ref
+ * frame at block level.
+ */
+ uint8_t tpl_keep_ref_frame[REF_FRAMES];
+
+ /*! \brief Warp motion samples buffer.
+ *
+ * Store the motion samples used for warp motion.
+ */
+ WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES];
+
+ /*! \brief Reference frames picked by the square subblocks in a superblock.
+ *
+ * Keeps track of ref frames that are selected by square partition blocks
+ * within a superblock, in MI resolution. They can be used to prune ref frames
+ * for rectangular blocks.
+ */
+ int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Prune ref frames in real-time mode.
+ *
+ * Determines whether to prune reference frames in real-time mode. For the
+ * most part, this is the same as nonrd_prune_ref_frame_search in
+ * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively
+ * turned off if the only frame available is GOLDEN_FRAME.
+ */
+ int nonrd_prune_ref_frame_search;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Partition Search
+ ****************************************************************************/
+ /**@{*/
+ //! Stores some partition-search related buffers.
+ PartitionSearchInfo part_search_info;
+
+ /*! \brief Whether to disable some features to force a mode in current block.
+ *
+ * In some cases, our speed features can be overly aggressive and remove all
+ * modes search in the superblock. When this happens, we set
+ * must_find_valid_partition to 1 to reduce the number of speed features, and
+ * recode the superblock again.
+ */
+ int must_find_valid_partition;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Prediction Mode Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Inter skip mode.
+ *
+ * Skip mode tries to use the closest forward and backward references for
+ * inter prediction. Skip here means to skip transmitting the reference
+ * frames, not to be confused with skip_txfm.
+ */
+ int skip_mode;
+
+ /*! \brief Factors used for rd-thresholding.
+ *
+ * Determines a rd threshold to determine whether to continue searching the
+ * current mode. If the current best rd is already <= threshold, then we skip
+ * the current mode.
+ */
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+ /*! \brief Tracks the winner modes in the current coding block.
+ *
+ * Winner mode is a two-pass strategy to find the best prediction mode. In the
+ * first pass, we search the prediction modes with a limited set of txfm
+ * options, and keep the top modes. These modes are called the winner modes.
+ * In the second pass, we retry the winner modes with more thorough txfm
+ * options.
+ */
+ WinnerModeStats *winner_mode_stats;
+ //! Tracks how many winner modes there are.
+ int winner_mode_count;
+
+ /*! \brief The model used for rd-estimation to avoid txfm
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+ * follow up with a second pass to find the best transform for the mode.
+ * Determines if one would go with reduced complexity transform block
+ * search model to select prediction modes, or full complexity model
+ * to select transform kernel.
+ */
+ TXFM_RD_MODEL rd_model;
+
+ /*! \brief Stores the inter mode information needed to build an rd model.
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+ * follow up with a second pass to find the best transform for the mode.
+ */
+ // TODO(any): try to consolidate this speed feature with winner mode
+ // processing.
+ struct inter_modes_info *inter_modes_info;
+
+ //! How to blend the compound predictions.
+ uint8_t compound_idx;
+
+ //! A cache of results of compound type search so they can be reused later.
+ COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+ //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
+ int comp_rd_stats_idx;
+
+ /*! \brief Whether to recompute the luma prediction.
+ *
+ * In interpolation search, we can usually skip recalculating the luma
+ * prediction because it is already calculated by a previous predictor. This
+ * flag signifies that some modes might have been skipped, so we need to
+ * rebuild the prediction.
+ */
+ int recalc_luma_mc_data;
+
+ /*! \brief Data structure to speed up intrabc search.
+ *
+ * Contains the hash table, hash function, and buffer used for intrabc.
+ */
+ IntraBCHashInfo intrabc_hash_info;
+
+ /*! \brief Whether to reuse the mode stored in mb_mode_cache. */
+ int use_mb_mode_cache;
+ /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
+ * \ref av1_rd_pick_inter_mode. */
+ const MB_MODE_INFO *mb_mode_cache;
+ /*! \brief Pointer to the buffer which caches gradient information.
+ *
+ * Pointer to the array of structures to store gradient information of each
+ * pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level
+ * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ */
+ PixelLevelGradientInfo *pixel_gradient_info;
+ /*! \brief Flags indicating the availability of cached gradient info. */
+ bool is_sb_gradient_cached[PLANE_TYPES];
+
+ /*! \brief Flag to reuse predicted samples of inter block. */
+ bool reuse_inter_pred;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name MV Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Context used to determine the initial step size in motion search.
+ *
+ * This context is defined as the \f$l_\infty\f$ norm of the best ref_mvs for
+ * each frame.
+ */
+ unsigned int max_mv_context[REF_FRAMES];
+
+ /*! \brief Limit for the range of motion vectors.
+ *
+ * These define limits to motion vector components to prevent them from
+ * extending outside the UMV borders
+ */
+ FullMvLimits mv_limits;
+
+ /*! \brief Buffer for storing the search site config.
+ *
+ * When resize mode or super resolution mode is on, the stride of the
+ * reference frame does not always match what's specified in \ref
+ * MotionVectorSearchParams::search_site_cfg. When this happens, we update the
+ * search_site_config buffer here and use it for motion search.
+ */
+ search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Parameters that control how txfm search is done.
+ *
+ * Stores various txfm search related parameters such as txfm_type, txfm_size,
+ * trellis eob search, etc.
+ */
+ TxfmSearchParams txfm_search_params;
+
+ /*! \brief Results of the txfm searches that have been done.
+ *
+ * Caches old txfm search results and keeps the current txfm decisions to
+ * facilitate rdopt.
+ */
+ TxfmSearchInfo txfm_search_info;
+
+ /*! \brief Whether there is a strong color activity.
+ *
+ * Used in REALTIME coding mode to enhance the visual quality at the boundary
+ * of moving color objects.
+ */
+ uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for golden reference.
+ uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for altref reference.
+ uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the coding block.
+ uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+ //! Coding block distortion value for uv/color, minimum over the inter modes.
+ int64_t min_dist_inter_uv;
+
+ //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane
+ // so we can keep dqcoeff of the best tx_type.
+ tran_low_t *dqcoeff_buf;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Misc
+ ****************************************************************************/
+ /**@{*/
+ //! Variance of the source frame.
+ unsigned int source_variance;
+ //! Flag to indicate coding block is zero sad.
+ int block_is_zero_sad;
+ //! Flag to indicate superblock ME in variance partition is determined to be
+ // good/reliable, and so the superblock MV will be tested in the
+ // nonrd_pickmode. This is only used for LAST_FRAME.
+ int sb_me_partition;
+ //! Flag to indicate to test the superblock MV for the coding block in the
+ // nonrd_pickmode.
+ int sb_me_block;
+ //! Motion vector from superblock MV derived from int_pro_motion() in
+ // the variance_partitioning.
+ int_mv sb_me_mv;
+ //! SSE of the current predictor.
+ unsigned int pred_sse[REF_FRAMES];
+ //! Prediction for ML based partition.
+#if CONFIG_RT_ML_PARTITIONING
+ DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
+#endif
+ /**@}*/
+
+ /*! \brief NONE partition evaluated for merge.
+ *
+ * In variance based partitioning scheme, NONE & SPLIT partitions are
+ * evaluated to check the SPLIT can be merged as NONE. This flag signifies the
+ * partition is evaluated in the scheme.
+ */
+ int try_merge_partition;
+
+ /*! \brief Pointer to buffer which caches sub-block variances in a superblock.
+ *
+ * Pointer to the array of structures to store source variance information of
+ * each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ * store source variance and log of source variance of each 4x4 sub-block.
+ */
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+#ifndef NDEBUG
+ /*! \brief A hash to make sure av1_set_offsets is called */
+ SetOffsetsLoc last_set_offsets_loc;
+#endif // NDEBUG
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ mode_search_stat_nonrd ms_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ /*!\brief Number of pixels in current thread that choose palette mode in the
+ * fast encoding stage for screen content tool determination.
+ */
+ int palette_pixels;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * sb-level multi-pass encoding.
+ */
+ struct SB_FIRST_PASS_STATS *sb_stats_cache;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * first-pass when superblock is searched twice consecutively.
+ */
+ struct SB_FIRST_PASS_STATS *sb_fp_stats;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ /*!\brief Pointer to RD_STATS structure to be used in
+ * av1_rd_partition_search().
+ */
+ RD_STATS *rdcost;
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+} MACROBLOCK;
+#undef SINGLE_REF_MODES
+
+/*!\cond */
+// Clears the first 'n_stats' entries of the winner-mode array 'stats'.
+// For 'color_index_map' only the leading block_width * block_height entries
+// are cleared (the area of 'bsize'), not the whole array.
+static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats,
+                                          WinnerModeStats *stats) {
+  // The array is not allocated when winner mode stats are not required; the
+  // pointer is NULL in that case and there is nothing to clear.
+  if (!stats) return;
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+
+  for (int idx = 0; idx < n_stats; ++idx) {
+    WinnerModeStats *const entry = stats + idx;
+    memset(&entry->mode_index, 0, sizeof(entry->mode_index));
+    // Clearing the full map for every entry would be CPU intensive, so only
+    // the part covered by the current block size is reset.
+    memset(&entry->color_index_map, 0,
+           bw * bh * sizeof(entry->color_index_map[0]));
+    memset(&entry->rate_uv, 0, sizeof(entry->rate_uv));
+    memset(&entry->rate_y, 0, sizeof(entry->rate_y));
+    memset(&entry->rd, 0, sizeof(entry->rd));
+    memset(&entry->rd_cost, 0, sizeof(entry->rd_cost));
+    memset(&entry->mbmi, 0, sizeof(entry->mbmi));
+  }
+}
+
+// Returns 1 when rectangular transforms may be used for block size 'bsize'
+// and 0 otherwise. The table is indexed by BLOCK_SIZE: the square sizes and
+// the 64X128 / 128X64 / 128X128 sizes map to 0, every other size maps to 1.
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+  static const char rect_tx_allowed[BLOCK_SIZES_ALL] = {
+    0, 1, 1,  // BLOCK_4X4,   BLOCK_4X8,    BLOCK_8X4
+    0, 1, 1,  // BLOCK_8X8,   BLOCK_8X16,   BLOCK_16X8
+    0, 1, 1,  // BLOCK_16X16, BLOCK_16X32,  BLOCK_32X16
+    0, 1, 1,  // BLOCK_32X32, BLOCK_32X64,  BLOCK_64X32
+    0, 0, 0,  // BLOCK_64X64, BLOCK_64X128, BLOCK_128X64
+    0,        // BLOCK_128X128
+    1, 1,     // BLOCK_4X16,  BLOCK_16X4
+    1, 1,     // BLOCK_8X32,  BLOCK_32X8
+    1, 1,     // BLOCK_16X64, BLOCK_64X16
+  };
+
+  const int allowed = rect_tx_allowed[bsize];
+  return allowed;
+}
+
+// Returns 1 when the block described by 'mbmi' may use rectangular
+// transforms: its block size must allow them and its segment must not be
+// coded losslessly. Returns 0 otherwise.
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi) {
+  if (!is_rect_tx_allowed_bsize(mbmi->bsize)) return 0;
+  return xd->lossless[mbmi->segment_id] ? 0 : 1;
+}
+
+// Returns how many times the largest rectangular transform size of 'bsize'
+// has to be stepped down through sub_tx_size_map to reach 'tx_size'.
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+  int depth = 0;
+  for (TX_SIZE cur = max_txsize_rect_lookup[bsize]; cur != tx_size;
+       cur = sub_tx_size_map[cur]) {
+    ++depth;
+    // The walk is expected to terminate within MAX_TX_DEPTH steps.
+    assert(depth <= MAX_TX_DEPTH);
+  }
+  return depth;
+}
+
+// Stores the skip flag of 'plane' for transform block 'blk_idx' in bit
+// 'plane' of txb_skip[blk_idx] (set when 'skip' is non-zero, cleared
+// otherwise). In builds without NDEBUG, bit 'plane + 4' is used as a
+// "not yet initialized" marker that is_blk_skip() asserts on.
+static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx,
+                                int skip) {
+  uint8_t *const flags = &txb_skip[blk_idx];
+  const unsigned long skip_bit = 1UL << plane;
+  *flags = (uint8_t)(skip ? (*flags | skip_bit) : (*flags & ~skip_bit));
+#ifndef NDEBUG
+  // Writing the luma flag marks both chroma planes as uninitialized so that
+  // a chroma flag that is never written afterwards gets detected.
+  if (plane == 0) *flags |= (1UL << (1 + 4)) | (1UL << (2 + 4));
+
+  // This plane now carries a valid value: drop its marker bit.
+  *flags &= ~(1UL << (plane + 4));
+#endif
+}
+
+// Returns the skip flag (0 or 1) stored for 'plane' in txb_skip[blk_idx].
+static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) {
+  const uint8_t flags = txb_skip[blk_idx];
+#ifndef NDEBUG
+  // The flag must have been written by set_blk_skip() for this plane.
+  assert(!(flags & (1UL << (plane + 4))));
+
+  // Only the bits in 0x77 are ever used; anything in 0x88 is garbage data.
+  assert((flags & 0x88) == 0);
+#endif
+  return (flags >> plane) & 1;
+}
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 0000000000..6ad2ddaf25
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+// Edge response across the boundary between s[-1] and s[0] along a row:
+// -2 * s[-2] + 6 * s[-1] - 6 * s[0] + 2 * s[1].
+static int horizontal_filter(const uint8_t *s) {
+  const int outer = s[1] - s[-2];
+  const int inner = s[-1] - s[0];
+  return 2 * outer + 6 * inner;
+}
+
+// Edge response across the boundary between s[-p] and s[0] along a column
+// with pitch 'p': -2 * s[-2p] + 6 * s[-p] - 6 * s[0] + 2 * s[p].
+static int vertical_filter(const uint8_t *s, int p) {
+  const int outer = s[p] - s[-2 * p];
+  const int inner = s[-p] - s[0];
+  return 2 * outer + 6 * inner;
+}
+
+// Integer variance estimate: mean of squares minus square of the mean, with
+// both means computed by truncating integer division by 'size'.
+static int variance(int sum, int sum_squared, int size) {
+  const int mean = sum / size;
+  const int mean_of_squares = sum_squared / size;
+  return mean_of_squares - mean * mean;
+}
+// Calculates a blockiness level for a vertical block edge, i.e. the boundary
+// between column -1 and column 0 relative to 's' and 'r', over 'size' rows.
+// 's' (pitch 'sp') is the source buffer, 'r' (pitch 'rp') the reconstructed
+// buffer.
+//
+// For every row the four pixels around the edge,
+//
+//   x[-2] x[-1] | x[0] x[1]
+//
+// contribute x[-2]*-2 + x[-1]*6 + x[0]*-6 + x[1]*2, and these values are
+// summed over all rows, separately for the source and the reconstruction.
+//
+//   excess = max(abs(reconstructed sum) - abs(source sum), 0)
+//
+// The assumption is that flat blocks are much more visible than high
+// contrast blocks, so the excess is scaled down by the variance of the
+// source pixels on either side of the edge:
+//   var_0 = sum(x[0]^2) / size - (sum(x[0]) / size)^2
+//   var_1 = sum(x[-1]^2) / size - (sum(x[-1]) / size)^2
+// (integer divisions). The returned blockiness is
+//   excess / (1 + var_0 + var_1)
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+                               int rp, int size) {
+  int src_edge = 0, rec_edge = 0;
+  int sum_0 = 0, sum_sq_0 = 0;  // Source column 0.
+  int sum_1 = 0, sum_sq_1 = 0;  // Source column -1.
+
+  for (int row = 0; row < size; ++row) {
+    const uint8_t *const src_row = s + row * sp;
+    const uint8_t *const rec_row = r + row * rp;
+    const int cur = src_row[0];
+    const int prev = src_row[-1];
+
+    src_edge += horizontal_filter(src_row);
+    rec_edge += horizontal_filter(rec_row);
+    sum_0 += cur;
+    sum_sq_0 += cur * cur;
+    sum_1 += prev;
+    sum_sq_1 += prev * prev;
+  }
+
+  const int var_0 = variance(sum_0, sum_sq_0, size);
+  const int var_1 = variance(sum_1, sum_sq_1, size);
+  const int rec_mag = abs(rec_edge);
+  const int src_mag = abs(src_edge);
+
+  // Only edges that are stronger in the reconstruction than in the source
+  // count as blockiness.
+  if (rec_mag <= src_mag) return 0;
+  return (rec_mag - src_mag) / (1 + var_0 + var_1);
+}
+
+// Calculates a blockiness level for a horizontal block edge, i.e. the
+// boundary between row -1 and row 0 relative to 's' and 'r', over 'size'
+// columns. Same metric as blockiness_vertical(), with the filter applied
+// down each column (see vertical_filter()) and the variances taken over
+// source row 0 and source row -1.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+                                 int rp, int size) {
+  int src_edge = 0, rec_edge = 0;
+  int sum_0 = 0, sum_sq_0 = 0;  // Source row 0.
+  int sum_1 = 0, sum_sq_1 = 0;  // Source row -1.
+
+  for (int col = 0; col < size; ++col) {
+    const int cur = s[col];
+    const int prev = s[col - sp];
+
+    src_edge += vertical_filter(s + col, sp);
+    rec_edge += vertical_filter(r + col, rp);
+    sum_0 += cur;
+    sum_sq_0 += cur * cur;
+    sum_1 += prev;
+    sum_sq_1 += prev * prev;
+  }
+
+  const int var_0 = variance(sum_0, sum_sq_0, size);
+  const int var_1 = variance(sum_1, sum_sq_1, size);
+  const int rec_mag = abs(rec_edge);
+  const int src_mag = abs(src_edge);
+
+  // Only edges that are stronger in the reconstruction than in the source
+  // count as blockiness.
+  if (rec_mag <= src_mag) return 0;
+  return (rec_mag - src_mag) / (1 + var_0 + var_1);
+}
+
+// Returns the blockiness of an entire frame: the vertical and horizontal
+// edge blockiness of every interior 4x4 block boundary, summed and divided
+// by the number of 4x4 blocks in the frame (width * height / 16, integer
+// division).
+//
+// img1/img1_pitch are handed to the edge helpers as the source buffer and
+// img2/img2_pitch as the reconstructed buffer. Frames with fewer than 16
+// pixels (or non-positive dimensions) have no interior boundary and yield
+// 0.0.
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch, int width,
+                          int height) {
+  // Computed in 64 bits: width * height may not fit in an int for very large
+  // frames.
+  const int64_t num_blocks = (int64_t)width * height / 16;
+  // Avoid 0 / 0 (NaN) for frames smaller than one 4x4 block.
+  if (num_blocks <= 0) return 0.0;
+
+  double blockiness = 0;
+  // Row 0 and column 0 are the frame border, not a boundary between two
+  // blocks, so the scan starts at the first interior multiple of 4.
+  // NOTE(review): when width or height is not a multiple of 4 the last
+  // block still reads up to 3 rows/columns beyond the given dimensions,
+  // exactly as before; presumably callers pass padded buffers - confirm.
+  for (int i = 4; i < height; i += 4) {
+    img1 += img1_pitch * 4;
+    img2 += img2_pitch * 4;
+    for (int j = 4; j < width; j += 4) {
+      blockiness +=
+          blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+      blockiness +=
+          blockiness_horizontal(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+    }
+  }
+  return blockiness / (double)num_blocks;
+}
diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c
new file mode 100644
index 0000000000..598b362753
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))  // Clamps a to [0, hi - 1]; both arguments may be evaluated more than once.
+
+typedef struct {  // Bundle of convolution arguments; NOTE(review): field uses are outside this view.
+ const float **input;  // presumably one plane pointer per input channel - TODO confirm
+ int in_width;
+ int in_height;
+ int in_stride;
+ const CNN_LAYER_CONFIG *layer_config;
+ float **output;  // presumably one plane pointer per output channel - TODO confirm
+ int out_stride;
+ int start_idx;  // NOTE(review): start_idx/th_step look like a per-thread work split - confirm
+ int th_step;
+} CONVOLVE_OPS;
+
+// Softsign activation: x / (|x| + 1).
+static INLINE float softsign(float x) {
+  const float denom = fabsf(x) + 1.0f;
+  return x / denom;
+}
+
+// Rectified linear activation: 0 for negative inputs, x otherwise.
+static INLINE float relu(float x) {
+  if (x < 0) return 0;
+  return x;
+}
+
+typedef struct {  // Multi-channel float image used by the CNN code in this file.
+ int allocsize;  // Number of floats owned through buf[0]; 0 when the planes are not owned.
+ int channels;
+ int width, height, stride;  // stride is the row pitch in floats.
+ float *buf[CNN_MAX_CHANNELS];  // One plane per channel; owned planes are slices of buf[0].
+} TENSOR;
+
+// Resets every field of 'tensor' to zero (no storage, no channels).
+static void init_tensor(TENSOR *tensor) {
+  memset(tensor, 0, sizeof(TENSOR));
+}
+
+// Releases the storage owned by 'tensor', if any. Tensors with an allocsize
+// of 0 do not own their planes and are left untouched.
+static void free_tensor(TENSOR *tensor) {
+  if (tensor->allocsize == 0) return;
+
+  aom_free(tensor->buf[0]);
+  tensor->buf[0] = NULL;
+  tensor->allocsize = 0;
+}
+
+// Gives 'tensor' the shape channels x height x width with a row stride equal
+// to 'width'. The owned storage is reallocated only when the current
+// allocation is too small; existing contents are not preserved in that case.
+// Returns false if the allocation fails, true otherwise.
+static bool realloc_tensor(TENSOR *tensor, int channels, int width,
+                           int height) {
+  const int needed = channels * width * height;
+  if (needed > tensor->allocsize) {
+    free_tensor(tensor);
+    float *const storage = (float *)aom_malloc(sizeof(float) * needed);
+    tensor->buf[0] = storage;
+    if (storage == NULL) return false;
+    tensor->allocsize = needed;
+  }
+
+  tensor->channels = channels;
+  tensor->stride = width;
+  tensor->height = height;
+  tensor->width = width;
+
+  // All planes live back to back inside the single buf[0] allocation.
+  float *const base = tensor->buf[0];
+  for (int c = 1; c < channels; ++c) {
+    tensor->buf[c] = &base[c * width * height];
+  }
+  return true;
+}
+
+// Copies the first 'copy_channels' planes of 'src' into 'dst', starting at
+// plane 'dst_offset' of 'dst'. Both tensors must have the same width and
+// height; their strides may differ.
+static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
+                        TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+  assert(copy_channels <= src->channels);
+
+  // When neither tensor has row padding a plane is one contiguous run and
+  // can be copied with a single memcpy.
+  const int contiguous =
+      src->stride == dst->width && dst->stride == dst->width;
+
+  for (int c = 0; c < copy_channels; ++c) {
+    if (contiguous) {
+      memcpy(dst->buf[dst_offset + c], src->buf[c],
+             sizeof(*dst->buf[0]) * src->width * src->height);
+      continue;
+    }
+    // Otherwise copy row by row, honoring each tensor's own stride.
+    for (int row = 0; row < dst->height; ++row) {
+      float *const to = &dst->buf[dst_offset + c][row * dst->stride];
+      const float *const from = &src->buf[c][row * src->stride];
+      memcpy(to, from, dst->width * sizeof(*dst->buf[c]));
+    }
+  }
+}
+
+// Makes 'tensor' describe externally provided planes: the shape fields are
+// set from the arguments and allocsize is set to 0 so that free_tensor()
+// never releases them. A NULL 'buf' sets every plane pointer to NULL.
+static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
+                          int channels, int width, int height, int stride) {
+  tensor->allocsize = 0;
+  tensor->stride = stride;
+  tensor->height = height;
+  tensor->width = width;
+  tensor->channels = channels;
+
+  for (int c = 0; c < channels; ++c) {
+    tensor->buf[c] = buf ? buf[c] : NULL;
+  }
+}
+
+// Exchanges the complete contents (shape, ownership and plane pointers) of
+// the two tensors.
+static void swap_tensor(TENSOR *t1, TENSOR *t2) {
+  const TENSOR saved = *t2;
+  *t2 = *t1;
+  *t1 = saved;
+}
+
+// Appends the channels of 'src' to 'dst': afterwards 'dst' holds its
+// original channels followed by the channels of 'src', and dst->channels is
+// the combined count. Both tensors must have the same width and height.
+// Returns false (leaving 'dst' unchanged) if larger storage is needed and
+// cannot be allocated, true otherwise.
+static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
+  assert(src->width == dst->width);
+  assert(src->height == dst->height);
+
+  const int dst_channels = dst->channels;
+  const int channels = dst->channels + src->channels;
+  const int newallocsize = channels * dst->width * dst->height;
+  if (dst->allocsize < newallocsize) {
+    TENSOR t;
+    init_tensor(&t);
+    // Allocate new buffers and copy the existing dst channels first.
+    if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
+    copy_tensor(dst, dst->channels, 0, &t);
+    // Swap the tensors and free the old buffers.
+    swap_tensor(dst, &t);
+    free_tensor(&t);
+  }
+  // Record the combined channel count. The reallocation path above already
+  // does this through realloc_tensor(), but when the existing storage is
+  // reused the count would otherwise stay at its old value, and a further
+  // concatenation into 'dst' would overwrite the channels appended here.
+  dst->channels = channels;
+  for (int c = 1; c < channels; ++c)
+    dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
+  // Copy the channels in src after the first dst_channels channels.
+  copy_tensor(src, src->channels, dst_channels, dst);
+  return true;
+}
+
+// Returns 1 when both tensors have the same width and height, 0 otherwise.
+int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
+  if (t1->width != t2->width) return 0;
+  return t1->height == t2->height;
+}
+
+// Returns 1 when both tensors have the same channel count, width and height,
+// 0 otherwise.
+int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
+  if (t1->channels != t2->channels) return 0;
+  if (t1->width != t2->width) return 0;
+  return t1->height == t2->height;
+}
+
+// Computes the output dimensions of one CNN layer for an input of
+// in_width x in_height and stores them in *out_width / *out_height.
+//
+//   padding  convolution                    deconvolution
+//   SAME_*   ceil(in / skip)                in * skip
+//   VALID    (in - filter + skip) / skip    (in - 1) * skip + filter
+//
+// An unknown padding type trips an assert; when asserts are compiled out the
+// outputs are left unwritten in that case.
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+                                    const CNN_LAYER_CONFIG *layer_config,
+                                    int *out_width, int *out_height) {
+  assert(layer_config->skip_width > 0);
+  assert(layer_config->skip_height > 0);
+
+  switch (layer_config->pad) {
+    case PADDING_SAME_ZERO:
+    case PADDING_SAME_REPLICATE:
+      if (layer_config->deconvolve) {
+        *out_width = in_width * layer_config->skip_width;
+        *out_height = in_height * layer_config->skip_height;
+      } else {
+        *out_width = (in_width + layer_config->skip_width - 1) /
+                     layer_config->skip_width;
+        *out_height = (in_height + layer_config->skip_height - 1) /
+                      layer_config->skip_height;
+      }
+      break;
+    case PADDING_VALID:
+      if (layer_config->deconvolve) {
+        *out_width = (in_width - 1) * layer_config->skip_width +
+                     layer_config->filter_width;
+        *out_height = (in_height - 1) * layer_config->skip_height +
+                      layer_config->filter_height;
+      } else {
+        *out_width =
+            (in_width - layer_config->filter_width + layer_config->skip_width) /
+            layer_config->skip_width;
+        *out_height = (in_height - layer_config->filter_height +
+                       layer_config->skip_height) /
+                      layer_config->skip_height;
+      }
+      break;
+    default: assert(0 && "Unknown padding type");
+  }
+}
+
+// Updates 'channels_per_branch' with the channel counts that result from
+// running the layer described by 'layer_config':
+//  - every other branch that receives a copy from this layer gets the
+//    layer's input channels, output channels, or output channels plus the
+//    channels of the combined branches, depending on branch_copy_type;
+//  - the layer's own branch gets the output channels plus the channels of
+//    all branches combined into it.
+void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
+                           int channels_per_branch[]) {
+  const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+  int branch = layer_config->branch;
+
+  for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+    if (b == branch) continue;
+    if (!(branch_config->input_to_branches & (1 << b))) continue;
+
+    switch (layer_config->branch_copy_type) {
+      case BRANCH_INPUT:
+        channels_per_branch[b] = layer_config->in_channels;
+        break;
+      case BRANCH_OUTPUT:
+        channels_per_branch[b] = layer_config->out_channels;
+        break;
+      case BRANCH_COMBINED:
+        channels_per_branch[b] = layer_config->out_channels;
+        // Accumulated in place, in ascending branch order.
+        for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+          if (c == branch) continue;
+          if (!(branch_config->branches_to_combine & (1 << c))) continue;
+          assert(channels_per_branch[c] > 0);
+          channels_per_branch[b] += channels_per_branch[c];
+        }
+        break;
+      default: break;
+    }
+  }
+
+  channels_per_branch[branch] = layer_config->out_channels;
+  for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+    if (c == branch) continue;
+    if (!(branch_config->branches_to_combine & (1 << c))) continue;
+    assert(channels_per_branch[c] > 0);
+    channels_per_branch[branch] += channels_per_branch[c];
+  }
+}
+
+#if CONFIG_DEBUG
+// Returns 1 if at least one layer of the network is an output layer
+// (output_num != -1), 0 otherwise.
+static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
+  const CNN_LAYER_CONFIG *layer = cnn_config->layer_config;
+
+  for (int remaining = cnn_config->num_layers; remaining > 0;
+       --remaining, ++layer) {
+    if (layer->output_num != -1) return 1;
+  }
+  return 0;
+}
+#endif
+
+// Walks all layers of `cnn_config` and, for every output layer, stores the
+// width, height and channel count it produces in out_width[], out_height[]
+// and out_channels[], indexed by the layer's output_num. The network input is
+// in_width x in_height extended by ext_width / ext_height on each side.
+void av1_find_cnn_output_size(int in_width, int in_height,
+                              const CNN_CONFIG *cnn_config, int *out_width,
+                              int *out_height, int *out_channels) {
+  int branch_channels[CNN_MAX_BRANCHES] = { 0 };
+  int branch_w[CNN_MAX_BRANCHES] = { 0 };
+  int branch_h[CNN_MAX_BRANCHES] = { 0 };
+  branch_w[0] = in_width + cnn_config->ext_width * 2;
+  branch_h[0] = in_height + cnn_config->ext_height * 2;
+
+#if CONFIG_DEBUG
+  assert(cnn_has_at_least_one_output(cnn_config));
+#endif
+
+  for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+    const CNN_LAYER_CONFIG *const cfg = &cnn_config->layer_config[layer];
+    const CNN_BRANCH_CONFIG *const bcfg = &cfg->branch_config;
+    const int cur = cfg->branch;
+
+    // Branches fed from this layer's input inherit the current input size.
+    if (cfg->branch_copy_type == BRANCH_INPUT) {
+      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+        if (b == cur || !(bcfg->input_to_branches & (1 << b))) continue;
+        assert(branch_w[cur] > 0 && branch_h[cur] > 0);
+        branch_w[b] = branch_w[cur];
+        branch_h[b] = branch_h[cur];
+      }
+    }
+
+    int layer_w = 0;
+    int layer_h = 0;
+    av1_find_cnn_layer_output_size(branch_w[cur], branch_h[cur], cfg, &layer_w,
+                                   &layer_h);
+    branch_w[cur] = layer_w;
+    branch_h[cur] = layer_h;
+
+    // Branches fed from this layer's output inherit the new size.
+    if (cfg->branch_copy_type == BRANCH_OUTPUT) {
+      for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+        if (b == cur || !(bcfg->input_to_branches & (1 << b))) continue;
+        branch_w[b] = layer_w;
+        branch_h[b] = layer_h;
+      }
+    }
+
+    find_cnn_out_channels(cfg, branch_channels);
+
+    // Only output layers report their size.
+    if (cfg->output_num == -1) continue;
+    const int slot = cfg->output_num;
+    out_width[slot] = layer_w;
+    out_height[slot] = layer_h;
+    out_channels[slot] = branch_channels[cur];
+  }
+}
+
+// Returns the index of the first input sample at which the filter is applied
+// along one dimension, derived from the input size, the filter size and the
+// stride. The result never exceeds (filt_width - 1) / 2.
+static INLINE int get_start_shift_convolve(int width, int filt_width,
+                                           int stride) {
+  const int remainder = width % stride;
+  const int gap = (remainder != 0) ? remainder - 1 : stride - 1;
+  const int centered = (gap + (filt_width % 2)) / 2;
+  const int half_filter = (filt_width - 1) / 2;
+  return AOMMIN(centered, half_filter);
+}
+
+// Adds `add` to `output` element by element, in place. Both are arrays of
+// `channels` planes; only the width x height region of each plane is touched
+// and both planes are addressed with the same row stride.
+void av1_cnn_add_c(float **output, int channels, int width, int height,
+                   int stride, const float **add) {
+  for (int ch = 0; ch < channels; ++ch) {
+    for (int row = 0; row < height; ++row) {
+      const int row_start = row * stride;
+      for (int col = 0; col < width; ++col) {
+        const int pos = row_start + col;
+        output[ch][pos] += add[ch][pos];
+      }
+    }
+  }
+}
+
+// Applies the layer's activation function in place to the width x height
+// region of every channel plane. RELU and SOFTSIGN are implemented, NONE is
+// a no-op, SIGMOID and any other value trip an assert.
+void av1_cnn_activate_c(float **output, int channels, int width, int height,
+                        int stride, ACTIVATION layer_activation) {
+  switch (layer_activation) {
+    case NONE: break;
+    case RELU:
+      for (int ch = 0; ch < channels; ++ch) {
+        for (int row = 0; row < height; ++row) {
+          for (int col = 0; col < width; ++col) {
+            const int pos = row * stride + col;
+            output[ch][pos] = relu(output[ch][pos]);
+          }
+        }
+      }
+      break;
+    case SOFTSIGN:
+      for (int ch = 0; ch < channels; ++ch) {
+        for (int row = 0; row < height; ++row) {
+          for (int col = 0; col < width; ++col) {
+            const int pos = row * stride + col;
+            output[ch][pos] = softsign(output[ch][pos]);
+          }
+        }
+      }
+      break;
+    case SIGMOID:
+      assert(0 && "Sigmoid has not been supported in CNN.");  // TODO
+      break;
+    default: assert(0 && "Unknown activation type"); break;
+  }
+}
+
+// Copies the layer's active tensor into branch_output[b] for every branch b
+// (other than `branch`) selected in input_to_branches. The number of channels
+// copied is channels_to_copy when that is positive, otherwise all channels of
+// the active tensor. Returns false as soon as a destination tensor cannot be
+// (re)allocated, true otherwise.
+static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
+                                           const CNN_LAYER_CONFIG *layer_config,
+                                           int branch, TENSOR branch_output[]) {
+  const CNN_BRANCH_CONFIG *const cfg = &layer_config->branch_config;
+
+  for (int dst = 0; dst < CNN_MAX_BRANCHES; ++dst) {
+    if (dst == branch) continue;
+    if (!(cfg->input_to_branches & (1 << dst))) continue;
+
+    // The copy lands in the output tensor of branch dst; it becomes the input
+    // of that branch's next layer, which is never the first layer.
+    int num_channels;
+    if (cfg->channels_to_copy > 0) {
+      num_channels = cfg->channels_to_copy;
+    } else {
+      num_channels = layer_active_tensor->channels;
+    }
+
+    const bool allocated =
+        realloc_tensor(&branch_output[dst], num_channels,
+                       layer_active_tensor->width, layer_active_tensor->height);
+    if (!allocated) return false;
+
+    copy_tensor(layer_active_tensor, num_channels, 0, &branch_output[dst]);
+  }
+  return true;
+}
+
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_SAME_ZERO.
+//
+// For every output channel i and every skip_height x skip_width window of the
+// input (clipped at the bottom/right image edge), the filter response is
+// evaluated at each position of the window and the maximum is stored in
+// output[i][u * out_stride + v], where (u, v) is the window index.
+// Filter taps that fall outside the image are skipped, i.e. contribute zero.
+// cstep is the distance in weights[] between consecutive filter taps of the
+// same (input channel, output channel) pair.
+static void convolve_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ // (h, w): top-left corner of the pooling window; (u, v): output position.
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ // (hh, ww): position inside the pooling window.
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ // off advances by cstep for every tap, including skipped ones, so
+ // it always addresses the weight of tap (l, m).
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii = hh + l - filter_height_half;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj = ww + m - filter_width_half;
+ // Zero padding: taps outside the image add nothing.
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ // The first position of the window initializes the maximum.
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_SAME_REPLICATE.
+//
+// Same traversal as the zero-padding variant: for every output channel and
+// every skip_height x skip_width window (clipped at the bottom/right edge),
+// the filter response is evaluated at each window position and the maximum is
+// stored at the window's output position. Here the sample coordinates are
+// passed through CLAMPINDEX before the input is read, so every tap
+// contributes.
+static void convolve_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ // (h, w): top-left corner of the pooling window; (u, v): output position.
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ // off advances by cstep per tap and addresses the weight of tap
+ // (l, m) for input channel k and output channel i.
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii =
+ CLAMPINDEX(hh + l - filter_height_half, in_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj =
+ CLAMPINDEX(ww + m - filter_width_half, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ // The first position of the window initializes the maximum.
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height
+// greater than 1 and padding equal to PADDING_VALID.
+//
+// For every output channel i and every skip_height x skip_width pooling
+// window, the filter response is evaluated at each position of the window and
+// the maximum is stored in output[i][u * out_stride + v], where (u, v) is the
+// window index. Only positions where the whole filter lies inside the image
+// are evaluated. cstep is the distance in weights[] between consecutive
+// filter taps of the same (input channel, output channel) pair.
+static void convolve_maxpool_padding_valid(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    const int cstep) {
+  // Number of filter positions per dimension that keep the filter inside the
+  // image.
+  const int valid_height = in_height - layer_config->filter_height + 1;
+  const int valid_width = in_width - layer_config->filter_width + 1;
+  for (int i = 0; i < layer_config->out_channels; ++i) {
+    for (int h = 0, u = 0; h < valid_height;
+         h += layer_config->skip_height, ++u) {
+      // Clip the pooling window to the valid positions, not to the image
+      // size: a position hh >= valid_height would read rows hh + l that are
+      // at or beyond in_height.
+      const int pool_end_h =
+          AOMMIN(valid_height, h + layer_config->skip_height);
+      for (int w = 0, v = 0; w < valid_width;
+           w += layer_config->skip_width, ++v) {
+        const int pool_end_w =
+            AOMMIN(valid_width, w + layer_config->skip_width);
+        for (int hh = h; hh < pool_end_h; ++hh) {
+          for (int ww = w; ww < pool_end_w; ++ww) {
+            float sum = layer_config->bias[i];
+            for (int k = 0; k < layer_config->in_channels; ++k) {
+              // off advances by cstep per tap and addresses the weight of
+              // tap (l, m) for input channel k and output channel i.
+              int off = k * layer_config->out_channels + i;
+              for (int l = 0; l < layer_config->filter_height; ++l) {
+                const int ii = hh + l;
+                for (int m = 0; m < layer_config->filter_width;
+                     ++m, off += cstep) {
+                  const int jj = ww + m;
+                  assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+                  sum += layer_config->weights[off] *
+                         input[k][ii * in_stride + jj];
+                }
+              }
+            }
+            const float a = sum;
+            // The first position of the window initializes the maximum.
+            if (h == hh && w == ww)
+              output[i][u * out_stride + v] = a;
+            else
+              output[i][u * out_stride + v] =
+                  AOMMAX(output[i][u * out_stride + v], a);
+          }
+        }
+      }
+    }
+  }
+}
+
+// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width
+// equal to 1.
+//
+// Each output sample is the bias plus the weighted sum of the co-located
+// input sample over all input channels. The work is split by output column:
+// this call produces columns start_idx, start_idx + step, ... of every output
+// row (step values below 1 are treated as 1), so several calls with different
+// start_idx and a common step together cover the whole output.
+static void convolve_element_wise(const float **input, int in_width,
+ int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config,
+ float **output, int out_stride, int start_idx,
+ int step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ // First input column handled by this call: the common start shift plus
+ // start_idx output columns, each skip_width input columns wide.
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) +
+ start_idx * layer_config->skip_width;
+ const int out_w_step = AOMMAX(step, 1);
+ const int in_w_step = layer_config->skip_width * out_w_step;
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int in_h = h * in_stride;
+ const int out_h = u * out_stride + start_idx;
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += in_w_step, out_index += out_w_step) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ // Single-tap filter: one weight per (input, output) channel pair.
+ sum += layer_config->weights[k * layer_config->out_channels + i] *
+ input[k][in_h + w];
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_SAME_ZERO.
+//
+// Produces output channels start_idx, start_idx + channel_step, ... Instead
+// of testing every tap against the image bounds, the loops run only over the
+// input samples inside the image and the weight offset is advanced past the
+// taps that fall outside it (above, left of and right of the image).
+// cstep is the distance in weights[] between consecutive filter taps.
+static void convolve_no_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int filter_width_half,
+ const int filter_height_half, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ const int start_w = get_start_shift_convolve(
+ in_width, layer_config->filter_width, layer_config->skip_width);
+ // The filter centred at (h, w) covers rows [h - ii_shift, h + end_ii_shift)
+ // and columns [w - jj_shift, w + end_jj_shift).
+ const int end_ii_shift = filter_height_half + 1;
+ const int end_jj_shift = filter_width_half + 1;
+ // *_filter_margin stores the number of pixels along a dimension in the
+ // intersection of the complement of the image in the extended image
+ // and the filter.
+ const int top_filter_margin = layer_config->filter_width * ii_shift;
+ const int right_filter_margin = end_jj_shift - in_width;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ // Weight offset of the first tap whose row is inside the image: skips
+ // filter_width taps for every filter row above the image, then selects
+ // output channel i.
+ const int top_cstep =
+ AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
+ cstep +
+ i;
+ const int start_ii = AOMMAX(0, h - ii_shift);
+ const int end_ii = AOMMIN(in_height, h + end_ii_shift);
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += layer_config->skip_width, ++out_index) {
+ // Weights to skip per filter row for taps left / right of the image.
+ const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
+ const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
+ const int start_jj = AOMMAX(0, w - jj_shift);
+ const int end_jj = AOMMIN(in_width, w + end_jj_shift);
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + top_cstep;
+ for (int ii = start_ii; ii < end_ii; ++ii) {
+ off += left_cstep;
+ for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ }
+ off += right_cstep;
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_SAME_REPLICATE.
+//
+// Produces output channels start_idx, start_idx + channel_step, ... Every
+// filter tap contributes; sample coordinates are passed through CLAMPINDEX
+// before the input is read. cstep is the distance in weights[] between
+// consecutive filter taps.
+static void convolve_no_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ // h and w are shifted to an offset coordinate system to reduce in-loop
+ // computation.
+ // In this system (h, w) is the coordinate of the filter's first tap, so the
+ // taps cover rows [h, h + filter_height) and columns [w, w + filter_width);
+ // both may be negative or exceed the image before clamping.
+ const int start_h =
+ get_start_shift_convolve(in_height, layer_config->filter_height,
+ layer_config->skip_height) -
+ ii_shift;
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) -
+ jj_shift;
+ const int end_h = in_height - ii_shift;
+ const int end_w = in_width - jj_shift;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < end_h;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = start_w, out_index = out_h; w < end_w;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ // off advances by cstep per tap, in row-major tap order.
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ const int clamped_ii = CLAMPINDEX(ii, in_height);
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ const int clamped_jj = CLAMPINDEX(jj, in_width);
+ assert(clamped_ii >= 0 && clamped_ii < in_height &&
+ clamped_jj >= 0 && clamped_jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][clamped_ii * in_stride + clamped_jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve specific to maxpool set as 0 and padding equal to
+// PADDING_VALID.
+//
+// Produces output channels start_idx, start_idx + channel_step, ... The
+// filter is applied only where it lies entirely inside the image, with (h, w)
+// the coordinate of its first tap, stepping by skip_height / skip_width.
+// cstep is the distance in weights[] between consecutive filter taps.
+// The asserts require a filter larger than 1x1 and, when maxpool is set,
+// unit strides.
+void av1_cnn_convolve_no_maxpool_padding_valid_c(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+ !layer_config->maxpool);
+ assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+ assert(layer_config->pad == PADDING_VALID);
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = 0, out_index = out_h;
+ w < in_width - layer_config->filter_width + 1;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ // off advances by cstep per tap, in row-major tap order.
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// Dispatches the convolution of one (non-deconvolution) layer to the
+// specialised routine for its configuration:
+//  - maxpool with a stride above 1 in either dimension: one of the
+//    convolve_maxpool_padding_* routines, chosen by padding type;
+//  - otherwise a 1x1 filter: convolve_element_wise;
+//  - otherwise: one of the no-maxpool routines, chosen by padding type.
+// start_idx and step select the slice of the work handled by this call; the
+// maxpool routines ignore them.
+static void av1_cnn_convolve(const float **input, int in_width, int in_height,
+                             int in_stride,
+                             const CNN_LAYER_CONFIG *layer_config,
+                             float **output, int out_stride, int start_idx,
+                             int step) {
+  assert(!layer_config->deconvolve);
+  // Distance in weights[] between consecutive filter taps.
+  const int weight_step = layer_config->in_channels * layer_config->out_channels;
+  const int half_h = layer_config->filter_height >> 1;
+  const int half_w = layer_config->filter_width >> 1;
+  const int strided =
+      layer_config->skip_height > 1 || layer_config->skip_width > 1;
+
+  if (layer_config->maxpool && strided) {
+    switch (layer_config->pad) {
+      case PADDING_SAME_ZERO:
+        convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
+                                      layer_config, output, out_stride,
+                                      weight_step, half_w, half_h);
+        break;
+      case PADDING_SAME_REPLICATE:
+        convolve_maxpool_padding_replicate(input, in_width, in_height,
+                                           in_stride, layer_config, output,
+                                           out_stride, weight_step, half_w,
+                                           half_h);
+        break;
+      case PADDING_VALID:
+        convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
+                                       layer_config, output, out_stride,
+                                       weight_step);
+        break;
+      default: assert(0 && "Unknown padding type");
+    }
+    return;
+  }
+
+  // Results in element-wise matrix multiplication.
+  if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
+    convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
+                          output, out_stride, start_idx, step);
+    return;
+  }
+
+  const int channel_stride = AOMMAX(step, 1);
+  const int row_shift = half_h - (layer_config->filter_height - 1) % 2;
+  const int col_shift = half_w - (layer_config->filter_width - 1) % 2;
+  switch (layer_config->pad) {
+    case PADDING_SAME_ZERO:
+      convolve_no_maxpool_padding_zero(input, in_width, in_height, in_stride,
+                                       layer_config, output, out_stride,
+                                       start_idx, weight_step, half_w, half_h,
+                                       row_shift, col_shift, channel_stride);
+      break;
+    case PADDING_SAME_REPLICATE:
+      convolve_no_maxpool_padding_replicate(
+          input, in_width, in_height, in_stride, layer_config, output,
+          out_stride, start_idx, weight_step, row_shift, col_shift,
+          channel_stride);
+      break;
+    case PADDING_VALID:
+      av1_cnn_convolve_no_maxpool_padding_valid(
+          input, in_width, in_height, in_stride, layer_config, output,
+          out_stride, start_idx, weight_step, channel_stride);
+      break;
+    default: assert(0 && "Unknown padding type");
+  }
+}
+
+// Worker hook: unpacks the CONVOLVE_OPS job passed as arg1 and runs the
+// convolution it describes. arg2 is unused. Always returns 1.
+static int convolve_layer(void *arg1, void *arg2) {
+  (void)arg2;
+  const CONVOLVE_OPS *const job = arg1;
+  av1_cnn_convolve(job->input, job->in_width, job->in_height, job->in_stride,
+                   job->layer_config, job->output, job->out_stride,
+                   job->start_idx, job->th_step);
+  return 1;
+}
+
+// Runs the convolution of one layer on the workers in `thread_data`.
+//
+// Each worker th receives a CONVOLVE_OPS job with start_idx = th and
+// th_step = number of workers used, so the workers process interleaved slices
+// of the output. The last worker is run through execute() and all others
+// through launch(); the function returns after every worker has been synced.
+static void convolve_layer_mt(const float **input, int in_width, int in_height,
+                              int in_stride,
+                              const CNN_LAYER_CONFIG *layer_config,
+                              const CNN_THREAD_DATA *thread_data,
+                              float **output, int out_stride) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  // Only CNN_MAX_THREADS job descriptors exist, so never use more workers
+  // than that. The clamped count must also be the step of every job: with the
+  // unclamped count, the slices belonging to workers that are never started
+  // would be left unprocessed.
+  const int num_workers = AOMMIN(thread_data->num_workers, CNN_MAX_THREADS);
+  assert(thread_data->workers);
+
+  CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
+  for (int th = 0; th < num_workers; ++th) {
+    AVxWorker *const worker = &thread_data->workers[th];
+    winterface->reset(worker);
+
+    CONVOLVE_OPS convolve_op = { input,      in_width,     in_height,
+                                 in_stride,  layer_config, output,
+                                 out_stride, th,           num_workers };
+    convolve_ops[th] = convolve_op;
+    worker->hook = convolve_layer;
+    worker->data1 = &(convolve_ops[th]);
+    worker->data2 = NULL;
+
+    // Start convolving.
+    if (th == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait until all workers have finished.
+  for (int th = 0; th < num_workers; ++th) {
+    winterface->sync(&thread_data->workers[th]);
+  }
+}
+
+// Returns half of the amount by which the filter size exceeds the stride
+// (rounded down), or 0 when the filter is not larger than the stride.
+static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
+  if (filt_width <= stride) return 0;
+  return (filt_width - stride) / 2;
+}
+
+// Applies batch normalization in place to the width x height region of every
+// channel plane of `image`:
+//   x = gamma[ch] * (x - mean[ch]) / std[ch] + beta[ch]
+// gamma, beta, mean and std each hold one value per channel and must all be
+// non-NULL.
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
+                         int stride, const float *gamma, const float *beta,
+                         const float *mean, const float *std) {
+  // All four parameter arrays are dereferenced below; `mean` was previously
+  // missing from this check (`beta` was tested twice instead).
+  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
+  for (int ch = 0; ch < channels; ch++) {
+    const float ch_gamma = gamma[ch];
+    const float ch_beta = beta[ch];
+    const float ch_mean = mean[ch];
+    const float ch_std = std[ch];
+    float *image_row = image[ch];
+
+    for (int row = 0; row < height; row++) {
+      for (int col = 0; col < width; col++) {
+        image_row[col] =
+            ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
+      }
+      image_row += stride;
+    }
+  }
+}
+
+// Deconvolution (transposed convolution) of one layer.
+//
+// The output size is obtained from av1_find_cnn_layer_output_size(). For each
+// output sample (u, v) of each output channel i, the routine gathers: for
+// every filter tap (l, m) it computes the upsampled-input coordinate (h, w)
+// that tap would come from, keeps it only if it is a multiple of the stride,
+// and accumulates weight * input[h / skip_height][w / skip_width].
+// The three padding cases differ in the start shift added to (h, w) and in
+// how coordinates outside the input are treated (skipped or clamped).
+// cstep is the distance in weights[] between consecutive filter taps.
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_LAYER_CONFIG *layer_config,
+ float **output, int out_stride) {
+ assert(layer_config->deconvolve);
+
+ const int cstep = layer_config->in_channels * layer_config->out_channels;
+
+ int out_width = 0;
+ int out_height = 0;
+ av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
+ &out_height);
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ // Only coordinates that are multiples of the stride map to an
+ // input sample. Negative multiples also pass this test (C's %
+ // yields 0 for them) and are rejected by the bounds check below.
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ // Zero padding: samples outside the input add nothing.
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_SAME_REPLICATE:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ // Replicate padding: coordinates outside the input are clamped
+ // instead of skipped.
+ const int ii =
+ CLAMPINDEX(h / layer_config->skip_height, in_height);
+ const int jj =
+ CLAMPINDEX(w / layer_config->skip_width, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_VALID:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ // No start shift in the VALID case.
+ const int h = u - l;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w = v - m;
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+}
+
+// Runs the whole network described by `cnn_config` on `input` and writes the
+// result of every output layer into `output_struct`.
+//
+// Two tensors are kept per branch: tensor1[b] is the input of the layer
+// currently being evaluated on branch b and tensor2[b] receives its output.
+// For each layer, in order: the input/output tensors are set up, the input is
+// optionally copied to other branches, the layer is convolved or
+// deconvolved, the output is optionally copied to other branches, other
+// branches are optionally added, the activation and optional batch
+// normalization are applied, other branches are optionally concatenated, and
+// the combined result is optionally copied to other branches.
+//
+// Returns true on success and false if a tensor allocation or concatenation
+// fails. All branch tensors are released through free_tensor() on both paths.
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output_struct) {
+ bool success = false;
+ TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+ TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
+
+ // output[n] points at the first channel buffer of output n: the channel
+ // buffers of all outputs are laid out back to back in output_buffer.
+ // NOTE(review): this array has CNN_MAX_BRANCHES entries but is indexed by
+ // output number - confirm num_outputs can never exceed CNN_MAX_BRANCHES.
+ float **output[CNN_MAX_BRANCHES];
+ const int *out_chs = output_struct->output_channels;
+ output[0] = output_struct->output_buffer;
+ for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
+ output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
+ }
+
+ int i_width = in_width;
+ int i_height = in_height;
+ int o_width = 0, o_height = 0;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ init_tensor(&tensor1[b]);
+ init_tensor(&tensor2[b]);
+ }
+
+ const int *out_stride = output_struct->output_strides;
+ for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+ const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+ const int branch = layer_config->branch;
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+
+ // Allocate input tensor
+ if (layer == 0) { // First layer
+ assert(branch == 0); // First layer must be primary branch
+ // The caller's input planes are used directly as the first input tensor.
+ assign_tensor(&tensor1[branch], (float **)input,
+ layer_config->in_channels, in_width, in_height, in_stride);
+ } else { // Non-first layer
+ // Swap tensor1 and tensor2
+ // The previous output of this branch becomes the input of this layer.
+ swap_tensor(&tensor1[branch], &tensor2[branch]);
+
+ i_width = tensor1[branch].width;
+ i_height = tensor1[branch].height;
+ }
+
+ // Allocate output tensor
+ av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
+ &o_height);
+ const int output_num = layer_config->output_num;
+ if (output_num == -1) { // Non-output layer
+ if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
+ o_height)) {
+ goto Error;
+ }
+ } else { // Output layer
+ // Output layers write straight into the caller's buffers.
+ free_tensor(&tensor2[branch]);
+ assign_tensor(&tensor2[branch], output[output_num],
+ layer_config->out_channels, o_width, o_height,
+ out_stride[output_num]);
+ }
+
+ // If we are combining branches make sure that the branch to combine
+ // is different from the current branch.
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
+ !(branch_config->branches_to_combine & (1 << branch))));
+
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ // Check consistency of input and output channels
+ assert(tensor1[branch].channels == layer_config->in_channels);
+ assert(tensor2[branch].channels == layer_config->out_channels);
+
+ // Convolve/Deconvolve
+ if (!cnn_config->layer_config[layer].deconvolve) {
+ if (thread_data->num_workers > 1) {
+ convolve_layer_mt((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config, thread_data,
+ tensor2[branch].buf, tensor2[branch].stride);
+ } else {
+ // Single-threaded: start index 0 with step 1 covers all the work.
+ av1_cnn_convolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride, 0, 1);
+ }
+ } else {
+ av1_cnn_deconvolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride);
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+
+ // Add tensors from other branches if needed
+ if (layer_config->branch_combine_type == BRANCH_ADD) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
+ av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, (const float **)tensor2[b].buf);
+ }
+ }
+ }
+
+ // Non-linearity
+ av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, layer_config->activation);
+
+ // Batch normalization runs only when bn_gamma is set.
+ if (layer_config->bn_params.bn_gamma) {
+ av1_cnn_batchnorm(
+ tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
+ tensor2[branch].height, tensor2[branch].stride,
+ layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
+ layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
+ }
+
+ // Concatenate tensors
+ if (layer_config->branch_combine_type == BRANCH_CAT) {
+ if (output_num == -1) { // Non-output layer
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ assert(tensor2[b].channels > 0);
+ if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
+ }
+ }
+ } else { // Output layer
+ // The output tensor lives in the caller's buffers, so it is re-assigned
+ // with the combined channel count and the other branches are copied in
+ // after the channels already written by this layer.
+ const int existing_channels = tensor2[branch].channels;
+ int num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ // Needed only to assign the new channel buffers
+ num_chs += tensor2[b].channels;
+ }
+ }
+ assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
+ o_height, out_stride[output_num]);
+
+ num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ // Copy branch b's channels in at channel offset num_chs.
+ copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
+ &tensor2[branch]);
+ num_chs += tensor2[b].channels;
+ }
+ }
+ }
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ }
+
+ success = true;
+Error:
+ // Shared cleanup for the success and failure paths.
+ // NOTE(review): some of these tensors wrap caller-owned buffers set through
+ // assign_tensor(); presumably free_tensor() leaves those alone - confirm.
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ free_tensor(&tensor1[b]);
+ free_tensor(&tensor2[b]);
+ }
+ return success;
+}
+
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+//
+// Converts the 8-bit planes in `dgd` to float planes scaled by 1/255,
+// extended by ext_width / ext_height on every side, and runs
+// av1_cnn_predict() on them.
+// With strict_bounds set, the border is filled by replicating the edge
+// samples of the width x height image; otherwise the border samples are read
+// from `dgd` itself, i.e. the caller's planes must be readable ext_height
+// rows and ext_width columns beyond the image on every side.
+// Returns false if the first layer has more than CNN_MAX_CHANNELS input
+// channels, if the temporary buffer cannot be allocated, or if the
+// prediction fails.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   CNN_MULTI_OUT *output) {
+  const float max_val = 255.0;
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  // inputs[] only has room for CNN_MAX_CHANNELS plane pointers.
+  if (in_channels > CNN_MAX_CHANNELS) return false;
+  float *inputs[CNN_MAX_CHANNELS];
+  // Sizes are computed in size_t so that large frames cannot overflow int.
+  const size_t plane_size = (size_t)in_width * in_height;
+  float *input_ =
+      (float *)aom_malloc(plane_size * in_channels * sizeof(*input_));
+  if (!input_) return false;
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * plane_size;
+    // `input` points at the top-left sample of the image inside the extended
+    // plane, so border samples have negative row/column indices.
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}
+
+// Assume output already has proper allocation
+// Assume input image buffers all have same resolution and strides
+//
+// High bit-depth variant: converts the 16-bit planes in `dgd` to float planes
+// scaled by 1 / ((1 << bit_depth) - 1), extended by ext_width / ext_height on
+// every side, and runs av1_cnn_predict() on them.
+// With strict_bounds set, the border is filled by replicating the edge
+// samples of the width x height image; otherwise the border samples are read
+// from `dgd` itself, i.e. the caller's planes must be readable ext_height
+// rows and ext_width columns beyond the image on every side.
+// Returns false if the first layer has more than CNN_MAX_CHANNELS input
+// channels, if the temporary buffer cannot be allocated, or if the
+// prediction fails.
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth,
+                                          CNN_MULTI_OUT *output) {
+  const float max_val = (float)((1 << bit_depth) - 1);
+
+  const int in_width = width + 2 * cnn_config->ext_width;
+  const int in_height = height + 2 * cnn_config->ext_height;
+  const int in_channels = cnn_config->layer_config[0].in_channels;
+  // inputs[] only has room for CNN_MAX_CHANNELS plane pointers.
+  if (in_channels > CNN_MAX_CHANNELS) return false;
+  float *inputs[CNN_MAX_CHANNELS];
+  // Sizes are computed in size_t so that large frames cannot overflow int.
+  const size_t plane_size = (size_t)in_width * in_height;
+  float *input_ =
+      (float *)aom_malloc(plane_size * in_channels * sizeof(*input_));
+  if (!input_) return false;
+  const int in_stride = in_width;
+
+  for (int c = 0; c < in_channels; ++c) {
+    inputs[c] = input_ + c * plane_size;
+    // `input` points at the top-left sample of the image inside the extended
+    // plane, so border samples have negative row/column indices.
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}
diff --git a/third_party/aom/av1/encoder/cnn.h b/third_party/aom/av1/encoder/cnn.h
new file mode 100644
index 0000000000..df6401f73f
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CNN_H_
+#define AOM_AV1_ENCODER_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+ { 0, 0, 0 }
+#define NO_BN_PARAMS \
+ { NULL, NULL, NULL, NULL }
+
+enum {
+ PADDING_SAME_ZERO, // TensorFlow's SAME padding with pixels outside
+ // the image area assumed to be 0 (default)
+ PADDING_SAME_REPLICATE, // TensorFlow's SAME padding with pixels outside
+ // the image area replicated from the closest edge
+ PADDING_VALID // TensorFlow's VALID padding
+} UENUM1BYTE(PADDING_TYPE); // padding mode of a layer (CNN_LAYER_CONFIG::pad)
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+// tensor. If no combinations happen at this layer, then this option
+// has the same effect as BRANCH_OUTPUT.
+enum {
+ BRANCH_NO_COPY,
+ BRANCH_INPUT,
+ BRANCH_OUTPUT,
+ BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Ways of combining stored branch tensors with the output of the current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: add previously stored branch tensor to output of layer
+// BRANCH_CAT: concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all NULL or all valid. If all NULL (see
+// NO_BN_PARAMS), batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+ const float *bn_gamma; // per-channel scale applied to the normalized value
+ const float *bn_beta; // per-channel offset added after scaling
+ const float *bn_mean; // per-channel mean subtracted from the input
+ const float *bn_std; // per-channel divisor used for normalization
+};
+
+struct CNN_BRANCH_CONFIG {
+ int input_to_branches; // If nonzero, copy the active tensor to the current
+ // layer and store for future use in branches
+ // specified in the field as a binary mask. For
+ // example, if input_to_branches = 0x06, it means
+ // the input tensor to the current branch is copied
+ // to branches 1 and 2 (where 0 represents the
+ // primary branch), i.e. bit b of the mask selects
+ // branch b. One restriction is that the mask
+ // cannot indicate copying to the current
+ // branch.
+ int channels_to_copy; // If greater than 0, only the channels up to this
+ // index are copied to the branches in input_to_branches.
+ int branches_to_combine; // mask of branches to combine with output of
+ // current layer, if
+ // branch_combine_type != BRANCH_NOC
+ // For example, if branches_to_combine = 0x0A,
+ // it means that branches 1 and 3 are combined
+ // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+ int in_channels;
+ int filter_width;
+ int filter_height;
+ int out_channels;
+ int skip_width; // horizontal skip; values > 1 change resolution (see deconvolve)
+ int skip_height; // vertical skip; values > 1 change resolution (see deconvolve)
+ int maxpool; // whether to use maxpool or not (only effective when
+ // skip_width or skip_height are > 1)
+ const float *weights; // array of length filter_height x filter_width x
+ // in_channels x out_channels where the inner-most
+ // scan is out_channels and the outer most scan is
+ // filter_height.
+ const float *bias; // array of length out_channels
+ PADDING_TYPE pad; // padding type
+ ACTIVATION activation; // the activation function to use after convolution
+ int deconvolve; // whether this is a deconvolution layer.
+ // 0: If skip_width or skip_height are > 1, then we
+ // reduce resolution
+ // 1: If skip_width or skip_height are > 1, then we
+ // increase resolution
+ int branch; // branch index in [0, CNN_MAX_BRANCHES - 1], where
+ // 0 refers to the primary branch.
+ BRANCH_COPY branch_copy_type; // when the tensor is copied to other branches
+ BRANCH_COMBINE branch_combine_type; // how branches are merged into the output
+ struct CNN_BRANCH_CONFIG branch_config;
+ struct CNN_BATCHNORM_PARAMS
+ bn_params; // A struct that contains the parameters
+ // used for batch normalization.
+ int output_num; // The output buffer idx to which the layer output is
+ // written. Set to -1 to disable writing it to the output. In
+ // the case that branch_combine_type is BRANCH_CAT, all
+ // concatenated channels will be written to output. In the
+ // case of BRANCH_ADD, the output will be the result of
+ // summation.
+};
+
+struct CNN_CONFIG {
+ int num_layers; // number of CNN layers ( = number of hidden layers + 1)
+ int is_residue; // whether the output activation is a residue
+ int ext_width, ext_height; // extension horizontally and vertically
+ int strict_bounds; // whether the input bounds are strict or not.
+ // If strict, the extension area is filled by
+ // replication; if not strict, image data is
+ // assumed available beyond the bounds.
+ CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS]; // per-layer settings; presumably only the first num_layers entries are used - TODO confirm
+};
+
+struct CNN_THREAD_DATA {
+ int num_workers; // presumably the number of entries in workers - TODO confirm
+ AVxWorker *workers; // worker objects handed to the CNN prediction functions
+};
+
+struct CNN_MULTI_OUT {
+ int num_outputs; // number of output buffers
+ const int *output_channels; // presumably one channel count per output - TODO confirm
+ const int *output_strides; // presumably one row stride per output - TODO confirm
+ float **output_buffer; // output buffers (see CNN_LAYER_CONFIG::output_num for the index a layer writes to)
+};
+
+// Computes the output size (width, height and channel count) of the network.
+void av1_find_cnn_output_size(int in_width, int in_height,
+ const CNN_CONFIG *cnn_config, int *out_width,
+ int *out_height, int *out_channels);
+
+// Function to return output width and output height of given layer.
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+ const CNN_LAYER_CONFIG *layer_config,
+ int *out_width, int *out_height);
+
+// Prediction functions from set of input image buffers. These functions support
+// CNNs with multiple outputs; the _highbd variant takes 16-bit samples.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+ int stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ struct CNN_MULTI_OUT *output);
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+ int stride,
+ const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ int bit_depth, CNN_MULTI_OUT *output);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CNN_H_
diff --git a/third_party/aom/av1/encoder/compound_type.c b/third_party/aom/av1/encoder/compound_type.c
new file mode 100644
index 0000000000..3b0ee88241
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.c
@@ -0,0 +1,1678 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10,
+ uint64_t *best_sse); // matches the parameter lists of pick_interinter_wedge() and pick_interinter_seg()
+
+// Compares a cached compound-search record against the current block.
+// Returns 1 when `st` was recorded with the same interpolation filter,
+// reference frames, motion vectors and global-motion status as `mi`, and 0
+// otherwise. On a match, the cached figures of every compound type whose data
+// may be reused are copied into the comp_* output arrays.
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+                                   const MACROBLOCK *const x,
+                                   const COMP_RD_STATS *st,
+                                   const MB_MODE_INFO *const mi,
+                                   int32_t *comp_rate, int64_t *comp_dist,
+                                   int32_t *comp_model_rate,
+                                   int64_t *comp_model_dist, int *comp_rs2) {
+  // TODO(ranjit): Ensure that compound type search use regular filter always
+  // and check if following check can be removed
+  // The interpolation filter has to be the one used for the cached search.
+  if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+  // Both references must agree in frame, motion vector and global-mv status.
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  for (int ref = 0; ref < 2; ++ref) {
+    if (st->ref_frames[ref] != mi->ref_frame[ref]) return 0;
+    if (st->mv[ref].as_int != mi->mv[ref].as_int) return 0;
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[ref]) return 0;
+  }
+
+  // Nonzero when neither the current nor the cached mode contains a NEWMV.
+  const int no_newmv = !have_newmv_in_inter_mode(mi->mode) &&
+                       !have_newmv_in_inter_mode(st->mode);
+
+  // The first two compound types can always reuse the cached data.
+  int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 };
+  // Wedge: reusable without NEWMV on either side, or when the wedge newmv
+  // search is disabled.
+  if (no_newmv || cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)
+    reuse_data[COMPOUND_WEDGE] = 1;
+  // Diffwtd: reusable without NEWMV on either side, or when the fast compound
+  // mode search is enabled.
+  if (no_newmv || cpi->sf.inter_sf.enable_fast_compound_mode_search)
+    reuse_data[COMPOUND_DIFFWTD] = 1;
+
+  // Hand back the cached stats of every reusable compound type.
+  for (int type = COMPOUND_AVERAGE; type < COMPOUND_TYPES; type++) {
+    if (!reuse_data[type]) continue;
+    comp_rate[type] = st->rate[type];
+    comp_dist[type] = st->dist[type];
+    comp_model_rate[type] = st->model_rate[type];
+    comp_model_dist[type] = st->model_dist[type];
+    comp_rs2[type] = st->comp_rs2[type];
+  }
+  return 1;
+}
+
+// Scans the compound rd records stored in `x` for one that matches the current
+// block. Returns 1 and writes the record position to *match_index on the first
+// match (the comp_* arrays are filled by is_comp_rd_match()); returns 0 when
+// no record matches.
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int32_t *comp_model_rate,
+                                        int64_t *comp_model_dist, int *comp_rs2,
+                                        int *match_index) {
+  const int num_records = x->comp_rd_stats_idx;
+  for (int idx = 0; idx < num_records; ++idx) {
+    const int matched = is_comp_rd_match(
+        cpi, x, &x->comp_rd_stats[idx], mbmi, comp_rate, comp_dist,
+        comp_model_rate, comp_model_dist, comp_rs2);
+    if (!matched) continue;
+    *match_index = idx;
+    return 1;
+  }
+  return 0;  // no match result found
+}
+
+// Wedge search is enabled only when the source variance of the block exceeds
+// the given threshold.
+static INLINE bool enable_wedge_search(
+    MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) {
+  return disable_wedge_var_thresh < x->source_variance;
+}
+
+// Inter-inter wedge search requires both the variance gate (speed-feature
+// threshold) and the encoder configuration switch.
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  if (!enable_wedge_search(
+          x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh))
+    return false;
+  return cpi->oxcf.comp_type_cfg.enable_interinter_wedge;
+}
+
+// Inter-intra wedge search requires both the variance gate (speed-feature
+// threshold) and the encoder configuration switch.
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  if (!enable_wedge_search(
+          x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh))
+    return false;
+  return cpi->oxcf.comp_type_cfg.enable_interintra_wedge;
+}
+
+// Estimates the wedge sign from the two single-reference predictors without
+// evaluating any wedge mask.
+//
+// The luma source block is compared against each predictor on the top-left
+// and on the bottom-right quarter-size sub-block, using the value the variance
+// function writes through its last argument (presumably the SSE).
+//
+//   pred0/stride0: predictor 0 and its row stride
+//   pred1/stride1: predictor 1 and its row stride
+//
+// Returns 1 when (tl + br) > 0 and 0 otherwise, where
+//   tl = esq(pred0, top-left)     - esq(pred1, top-left)
+//   br = esq(pred1, bottom-right) - esq(pred0, bottom-right)
+// bsize must have a valid quarter size in split_qtr (asserted).
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const BLOCK_SIZE bsize, const uint8_t *pred0,
+                                  int stride0, const uint8_t *pred1,
+                                  int stride1) {
+  // Block size covering a quarter (half width x half height) of each bsize.
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    //                            4X4
+    BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128,     128x64,        128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16,       16X4,          8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8,       16X64,         64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int bw_by2 = bw >> 1;
+  const int bh_by2 = bh >> 1;
+  uint32_t esq[2][2];  // [predictor][0: top-left quarter, 1: bottom-right]
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Residual variance computation over relevant quadrants in order to
+  // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+  // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+  // The 2nd and 3rd quadrants cancel out in TL + BR
+  // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+  // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+  // for all codebooks; experiment with other quadrant combinations for
+  // 0, 90 and 135 degrees also.
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                               &esq[0][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  // pred1 must be walked with its own stride, both for the offset and for the
+  // stride handed to the variance function.
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                               &esq[1][1]);
+
+  tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+  br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+  return (tl + br > 0);
+}
+
+// Picks the best wedge index and sign; returns the best rd minus the index cost.
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int8_t *const best_wedge_sign,
+ int8_t *const best_wedge_index, uint64_t *best_sse) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh; // number of pixels in the block
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int8_t wedge_index;
+ int8_t wedge_sign;
+ const int8_t wedge_types = get_wedge_types_lookup(bsize);
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; // rounding shift applied to the SSE for high bit depth
+
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0, stored with stride bw
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+ }
+#else
+ (void)hbd;
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ int16_t *ds = residual0; // delta squares overwrite residual0 in place
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); // sign-0 mask, used only to derive the sign
+
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); // mask for the derived sign
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ // int rate2;
+ // int64_t dist2;
+ // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+ // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+ // sse, rate, dist, rate2, dist2); dist = dist2;
+ // rate = rate2;
+
+ rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; // include the cost of the wedge index
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ *best_sse = sse;
+ }
+ }
+
+ return best_rd -
+ RDCOST(x->rdmult,
+ x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); // index cost is removed again from the returned rd
+}
+
+// Chooses the best wedge index for the given, fixed wedge sign.
+// Writes the winning index to *best_wedge_index and its (bit-depth rounded)
+// SSE to *best_sse. The returned rd excludes the cost of the wedge index.
+static int64_t pick_wedge_fixed_sign(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const BLOCK_SIZE bsize, const int16_t *const residual1,
+    const int16_t *const diff10, const int8_t wedge_sign,
+    int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int N = block_size_wide[bsize] * block_size_high[bsize];
+  assert(N >= 64);
+  const int8_t num_wedges = get_wedge_types_lookup(bsize);
+  // High bit depth SSE is rounded by twice the number of extra bits.
+  const int bd_round = is_cur_buf_hbd(xd) ? (xd->bd - 8) * 2 : 0;
+  int64_t best_rd = INT64_MAX;
+  int rate;
+  int64_t dist;
+
+  for (int8_t idx = 0; idx < num_wedges; ++idx) {
+    const uint8_t *const mask =
+        av1_get_contiguous_soft_mask(idx, wedge_sign, bsize);
+    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    rate += x->mode_costs.wedge_idx_cost[bsize][idx];
+    const int64_t rd = RDCOST(x->rdmult, rate, dist);
+
+    if (rd >= best_rd) continue;
+    *best_wedge_index = idx;
+    best_rd = rd;
+    *best_sse = sse;
+  }
+
+  // Take the index signalling cost back out of the reported rd.
+  return best_rd -
+         RDCOST(x->rdmult,
+                x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Selects wedge index and sign for inter-inter wedge compound prediction and
+// stores them in the block's mode info. With the fast sign estimate enabled
+// the sign is estimated from the predictors and only the index is searched;
+// otherwise sign and index are searched together. Returns the rd of the pick.
+static int64_t pick_interinter_wedge(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint64_t *best_sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  int8_t wedge_index = -1;
+  int8_t wedge_sign = 0;
+  int64_t rd;
+
+  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+  assert(cpi->common.seq_params->enable_masked_compound);
+
+  if (!cpi->sf.inter_sf.fast_wedge_sign_estimate) {
+    rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+                    &wedge_index, best_sse);
+  } else {
+    // Both predictors are stored with a stride of bw.
+    wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+    rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+                               &wedge_index, best_sse);
+  }
+
+  mbmi->interinter_comp.wedge_sign = wedge_sign;
+  mbmi->interinter_comp.wedge_index = wedge_index;
+  return rd;
+}
+
+// Tries every DIFFWTD mask type and keeps the one with the lowest modelled rd.
+// The mask of the first type is built directly in xd->seg_mask, the mask of
+// the second type in a local buffer; if DIFFWTD_38_INV wins, the local buffer
+// is copied to xd->seg_mask. Stores the chosen type in the block's mode info,
+// writes its (bit-depth rounded) SSE to *best_sse and returns its rd.
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+                                   MACROBLOCK *const x, const BLOCK_SIZE bsize,
+                                   const uint8_t *const p0,
+                                   const uint8_t *const p1,
+                                   const int16_t *const residual1,
+                                   const int16_t *const diff10,
+                                   uint64_t *best_sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int N = 1 << num_pels_log2_lookup[bsize];
+  const int hbd = is_cur_buf_hbd(xd);
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+  int rate;
+  int64_t dist;
+  int64_t best_rd = INT64_MAX;
+  DIFFWTD_MASK_TYPE best_mask_type = 0;
+  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+  // One destination buffer per mask type.
+  uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
+
+  // try each mask type and its inverse
+  for (DIFFWTD_MASK_TYPE type = 0; type < DIFFWTD_MASK_TYPES; type++) {
+    uint8_t *const mask = tmp_mask[type];
+
+    // build mask and inverse
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (hbd) {
+      av1_build_compound_diffwtd_mask_highbd(mask, type, CONVERT_TO_BYTEPTR(p0),
+                                             bw, CONVERT_TO_BYTEPTR(p1), bw, bh,
+                                             bw, xd->bd);
+    } else {
+      av1_build_compound_diffwtd_mask(mask, type, p0, bw, p1, bw, bh, bw);
+    }
+#else
+    (void)hbd;
+    av1_build_compound_diffwtd_mask(mask, type, p0, bw, p1, bw, bh, bw);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // compute rd for mask
+    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
+
+    if (rd0 >= best_rd) continue;
+    best_mask_type = type;
+    best_rd = rd0;
+    *best_sse = sse;
+  }
+
+  mbmi->interinter_comp.mask_type = best_mask_type;
+  // The inverse mask lives in the local buffer; publish it if it won.
+  if (best_mask_type == DIFFWTD_38_INV) {
+    memcpy(xd->seg_mask, seg_mask, N * 2);
+  }
+  return best_rd;
+}
+
+// Picks the wedge index for inter-intra wedge prediction (the sign is fixed to
+// 0) from the two predictors p0 and p1, both stored with a stride of bw.
+// Stores the index in mbmi->interintra_wedge_index and returns the rd
+// reported by pick_wedge_fixed_sign().
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(av1_is_wedge_used(bsize));
+  assert(cpi->common.seq_params->enable_interintra_compound);
+
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
+  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p1), bw);
+    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+                              CONVERT_TO_BYTEPTR(p0), bw);
+  } else
+#endif
+  {
+    aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+    aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+  }
+
+  int8_t wedge_index = -1;
+  uint64_t sse;  // required by the callee; not used here
+  const int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0,
+                                           &wedge_index, &sse);
+
+  mbmi->interintra_wedge_index = wedge_index;
+  return rd;
+}
+
+// Builds the two single-buffer inter predictors used by the masked compound
+// modes and derives the residual buffers from them (both with stride bw):
+//   residual1 = src - preds1[0]
+//   diff10    = preds1[0] - preds0[0]
+static AOM_INLINE void get_inter_predictors_masked_compound(
+    MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1,
+    int16_t *residual1, int16_t *diff10, int *strides) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+
+  // get inter predictors to use for masked compound modes
+  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0,
+                                                   strides);
+  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1,
+                                                   strides);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(*preds1), bw);
+    aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+                              bw, CONVERT_TO_BYTEPTR(*preds0), bw);
+    return;
+  }
+#endif
+  aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw);
+  aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+}
+
+// Evaluates one inter-intra mode: builds the intra predictor, combines it with
+// the inter prediction in tmp_buf, models the rd cost (including the mode
+// signalling cost) and updates *best_interintra_rd / *best_interintra_mode
+// when this mode is strictly better. mbmi->interintra_mode is left set to
+// interintra_mode.
+static INLINE void compute_best_interintra_mode(
+    const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+    MACROBLOCK *const x, const int *const interintra_mode_cost,
+    const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf,
+    INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd,
+    INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  const int rmode = interintra_mode_cost[interintra_mode];
+  int rate;
+  int64_t dist, skip_sse_sb;
+  uint8_t skip_txfm_sb;
+
+  mbmi->interintra_mode = interintra_mode;
+  av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                            intrapred, bw);
+  av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+  model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist,
+                                          &skip_txfm_sb, &skip_sse_sb, NULL,
+                                          NULL, NULL);
+
+  const int64_t rd = RDCOST(x->rdmult, rate + rmode, dist);
+  if (rd >= *best_interintra_rd) return;
+  *best_interintra_rd = rd;
+  *best_interintra_mode = mbmi->interintra_mode;
+}
+
+// Estimates the luma rd of the block with av1_estimate_txfm_yrd() after
+// subtracting the prediction for plane 0. Returns INT64_MAX right away when
+// ref_best_rd is negative. When an estimate is available, the skip-txfm flag
+// cost is folded into rd_stats->rate: it replaces the rate for a skipped
+// transform and is added to it otherwise.
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+                                   MACROBLOCK *x, int64_t ref_best_rd,
+                                   RD_STATS *rd_stats) {
+  if (ref_best_rd < 0) return INT64_MAX;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  av1_subtract_plane(x, bs, 0);
+  const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
+                                           max_txsize_rect_lookup[bs]);
+  if (rd == INT64_MAX) return rd;
+
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  if (rd_stats->skip_txfm) {
+    const int skip_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+    rd_stats->rate = skip_rate;
+  } else {
+    const int no_skip_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+    rd_stats->rate += no_skip_rate;
+  }
+  return rd;
+}
+
+// Computes the rd threshold for the smooth interintra rd search: the scaled
+// threshold derived from ref_best_rd, reduced by the rd cost of the mode rate.
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x,
+                                            int total_mode_rate,
+                                            int64_t ref_best_rd) {
+  const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0);
+  const int64_t scaled_thresh = get_rd_thresh_from_best_rd(
+      ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT),
+      INTER_INTRA_RD_THRESH_SCALE);
+  return scaled_thresh - mode_rd;
+}
+
+// Exhaustively searches all inter-intra modes, picking the best wedge index
+// for each, and reports the (mode, wedge index) pair with the lowest total rd
+// (wedge rd plus the cost of signalling mode and wedge index). The returned
+// value is the wedge rd of that pair without the signalling cost.
+// mbmi->interintra_mode and mbmi->interintra_wedge_index are left at the
+// values of the last mode evaluated.
+static AOM_INLINE int64_t compute_best_wedge_interintra(
+    const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+    MACROBLOCK *const x, const int *const interintra_mode_cost,
+    const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_,
+    int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+  int64_t best_total_rd = INT64_MAX;
+  int64_t best_interintra_rd_wedge = INT64_MAX;
+
+  for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
+    mbmi->interintra_mode = mode;
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+                                              intrapred, bw);
+    const int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+    const int rate_overhead =
+        interintra_mode_cost[mode] +
+        x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+    const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
+    if (total_rd >= best_total_rd) continue;
+
+    best_total_rd = total_rd;
+    best_interintra_rd_wedge = rd;
+    *best_mode = mbmi->interintra_mode;
+    *best_wedge_index = mbmi->interintra_wedge_index;
+  }
+  return best_interintra_rd_wedge;
+}
+
+static int handle_smooth_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv,
+ INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd,
+ int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf,
+ uint8_t *intrapred, HandleInterModeArgs *args) { // non-wedge inter-intra search; returns 0 or IGNORE_MODE
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+
+ mbmi->use_wedge_interintra = 0;
+
+ if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+ *best_interintra_mode == INTERINTRA_MODES) { // search when reuse is off or, presumably, no mode has been chosen yet
+ int64_t best_interintra_rd = INT64_MAX;
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ cur_mode == II_SMOOTH_PRED)
+ continue; // smooth intra is disabled, so skip II_SMOOTH_PRED
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf,
+ best_interintra_mode, &best_interintra_rd, cur_mode, bsize);
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; // remember the pick for this reference frame
+ }
+ assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra,
+ *best_interintra_mode != II_SMOOTH_PRED));
+ // Recompute prediction if required (always done once a valid mode is set)
+ bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode ||
+ *best_interintra_mode != INTERINTRA_MODES;
+ if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+
+ // Compute rd cost for best smooth_interintra
+ RD_STATS rd_stats;
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int rmode =
+ interintra_mode_cost[*best_interintra_mode] +
+ (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0); // "no wedge" flag cost only when wedge is available for bsize
+ const int total_mode_rate = rmode + *rate_mv;
+ const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+ int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+ } else {
+ return IGNORE_MODE; // no luma rd estimate available (INT64_MAX)
+ }
+ *best_rd = rd;
+ *best_mode_rate = rmode;
+ // Return early if best rd not good enough
+ if (ref_best_rd < INT64_MAX &&
+ (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE >
+ ref_best_rd) {
+ return IGNORE_MODE;
+ }
+ return 0;
+}
+
+// Evaluates the wedge variant of inter-intra prediction for the current block.
+//
+// Chooses mbmi->interintra_mode and mbmi->interintra_wedge_index with one of
+// three strategies (exhaustive search, fast search without a prior smooth
+// result, or wedge selection for the mode already stored in mbmi), optionally
+// refines the motion vector for modes that contain NEWMV, and finally calls
+// estimate_yrd_for_sb() to obtain an rd estimate for the wedge mode.
+//
+// Outputs:
+//   *best_rd        rd of the wedge mode; INT64_MAX if estimate_yrd_for_sb()
+//                   returned INT64_MAX.
+//   *rate_overhead  interintra mode cost + wedge index cost + wedge flag cost.
+//   *tmp_mv,
+//   *tmp_rate_mv    refined motion vector and its rate, or the incoming
+//                   mbmi->mv[0] and *rate_mv when the refinement is not kept.
+// mbmi->use_wedge_interintra is set to 1. mbmi->mv[0] is overwritten when the
+// refined vector differs and is not restored inside this function.
+// Returns IGNORE_MODE only when the final estimate is INT64_MAX and *best_rd
+// was INT64_MAX just before it; returns 0 otherwise.
+static int handle_wedge_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode,
+ int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_,
+ uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred,
+ HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead,
+ int_mv *tmp_mv, int64_t best_rd_no_wedge) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ mbmi->use_wedge_interintra = 1;
+
+ if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+ // Exhaustive search of all wedge and mode combinations.
+ int best_mode = 0;
+ int best_wedge_index = 0;
+ *best_rd = compute_best_wedge_interintra(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_,
+ &best_mode, &best_wedge_index, bsize);
+ mbmi->interintra_mode = best_mode;
+ mbmi->interintra_wedge_index = best_wedge_index;
+ // NOTE(review): presumably the search leaves the predictor of the last mode
+ // tried (INTERINTRA_MODES - 1) in the buffer, so it is rebuilt only for any
+ // other winner - confirm against compute_best_wedge_interintra().
+ if (best_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else if (!try_smooth_interintra) {
+ // NOTE(review): INTERINTRA_MODES appears to act as the "no mode chosen yet"
+ // value for *best_interintra_mode - confirm against the caller's cache.
+ if (*best_interintra_mode == INTERINTRA_MODES) {
+ mbmi->interintra_mode = INTERINTRA_MODES - 1;
+ *best_interintra_mode = INTERINTRA_MODES - 1;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ // Pick wedge mask based on INTERINTRA_MODES - 1
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ // Find the best interintra mode for the chosen wedge mask
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+ tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize);
+ }
+ // Remember the winner per reference frame so later calls can reuse it.
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+ mbmi->interintra_mode = *best_interintra_mode;
+
+ // Recompute prediction if required
+ if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode (reused)
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode from smooth_interintra
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+
+ // Signaling cost of the wedge mode: interintra mode, wedge index and the
+ // "use wedge" flag. It is added, together with the MV rate, to *best_rd.
+ *rate_overhead =
+ interintra_mode_cost[mbmi->interintra_mode] +
+ mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+ mode_costs->wedge_interintra_cost[bsize][1];
+ *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0);
+
+ // rd stays INT64_MAX unless the refinement below yields a different MV, which
+ // guarantees the fallback branch (rd >= *best_rd) runs in every other case.
+ int64_t rd = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ // Refine motion vector for NEWMV case.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ int rate_sum;
+ uint8_t skip_txfm_sb;
+ int64_t dist_sum, skip_sse_sb;
+ // get negative of mask
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+ av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred,
+ mask, bw, tmp_rate_mv, 0);
+ if (mbmi->mv[0].as_int != tmp_mv->as_int) {
+ mbmi->mv[0].as_int = tmp_mv->as_int;
+ // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+ // predictor is not calculated again in av1_enc_build_inter_predictor().
+ mbmi->ref_frame[1] = NONE_FRAME;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+ xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw);
+ // Model-based rd of the refined candidate, compared with *best_rd below.
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+ &skip_sse_sb, NULL, NULL, NULL);
+ rd =
+ RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum);
+ }
+ }
+ // Refinement was skipped, left the MV unchanged, or did not beat *best_rd:
+ // report the original MV and rate and rebuild the combined predictor from the
+ // inter prediction held in tmp_buf.
+ if (rd >= *best_rd) {
+ tmp_mv->as_int = mv0.as_int;
+ *tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ RD_STATS rd_stats;
+ const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0);
+ // Budget that remains after the mode overhead, measured against the best
+ // non-wedge rd; handed to estimate_yrd_for_sb() as its threshold.
+ const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate,
+ rd_stats.dist);
+ } else {
+ if (*best_rd == INT64_MAX) return IGNORE_MODE;
+ }
+ // May store INT64_MAX; the caller compares it with the non-wedge rd.
+ *best_rd = rd;
+ return 0;
+}
+
+// Searches the inter-intra prediction variants for the current block and keeps
+// the better of smooth and wedge.
+//
+// The luma inter predictor is first built, with ref_frame[1] temporarily set
+// to NONE_FRAME, into a local buffer. Smooth inter-intra is evaluated when
+// cpi->oxcf.comp_type_cfg.enable_smooth_interintra is set; wedge inter-intra
+// is evaluated when av1_is_wedge_used(bsize) and
+// enable_wedge_interintra_search() both hold.
+//
+// On success (return value 0):
+//   - if wedge has the lower rd, mbmi->mv[0] and *rate_mv take the values
+//     found by the wedge search and *tmp_rate2 is adjusted by the change in
+//     MV rate;
+//   - otherwise, when both searches ran, the smooth settings are written back
+//     to mbmi and the luma inter predictor is rebuilt;
+//   - *tmp_rate2 is increased by the winning variant's mode rate;
+//   - the chroma inter predictors are built when there is more than one
+//     plane.
+// Returns IGNORE_MODE when either helper requests it or when neither variant
+// produced an rd below INT64_MAX.
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst) {
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int try_wedge_interintra =
+ is_wedge_used && enable_wedge_interintra_search(x, cpi);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ // NOTE(review): the buffers are presumably sized 2x so they can also hold
+ // 16-bit samples, with get_buf_by_bd() selecting the matching pointer view -
+ // confirm against get_buf_by_bd().
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Single reference inter prediction
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the buffers for intra prediction
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ // Start from the interintra mode cached for this reference frame.
+ INTERINTRA_MODE best_interintra_mode =
+ args->inter_intra_mode[mbmi->ref_frame[0]];
+
+ // Compute smooth_interintra
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int best_mode_rate = INT_MAX;
+ if (try_smooth_interintra) {
+ int ret = handle_smooth_inter_intra_mode(
+ cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf,
+ intrapred, args);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ // Compute wedge interintra
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ int_mv tmp_mv = mv0;
+ int tmp_rate_mv = 0;
+ int rate_overhead = 0;
+ if (try_wedge_interintra) {
+ int ret = handle_wedge_inter_intra_mode(
+ cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_,
+ intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv,
+ best_interintra_rd_nowedge);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ // Neither variant produced a usable rd (this also covers the case where
+ // neither search was enabled).
+ if (best_interintra_rd_nowedge == INT64_MAX &&
+ best_interintra_rd_wedge == INT64_MAX) {
+ return IGNORE_MODE;
+ }
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ // Wedge wins: adopt its motion vector and swap the MV rate inside
+ // *tmp_rate2 for the wedge search's MV rate.
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ best_mode_rate = rate_overhead;
+ } else if (try_smooth_interintra && try_wedge_interintra) {
+ // If smooth was best, but we over-wrote the values when evaluating the
+ // wedge mode, we need to recompute the smooth values.
+ mbmi->use_wedge_interintra = 0;
+ mbmi->interintra_mode = best_interintra_mode;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ *tmp_rate2 += best_mode_rate;
+
+ // Chroma predictors are built only once the luma decision is final.
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ return 0;
+}
+
+// Builds the list of compound types that should be evaluated for this block.
+// The selected types are written to valid_comp_types[] in increasing enum
+// order and the number of entries written is returned.
+static INLINE int compute_valid_comp_types(MACROBLOCK *x,
+                                           const AV1_COMP *const cpi,
+                                           BLOCK_SIZE bsize,
+                                           int masked_compound_used,
+                                           int mode_search_mask,
+                                           COMPOUND_TYPE *valid_comp_types) {
+  const AV1_COMMON *cm = &cpi->common;
+  int num_valid = 0;
+
+  const int allow_average = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+  // Distance-weighted compound additionally needs the sequence-level enable
+  // and must not be disabled by the speed feature.
+  const int allow_distwtd =
+      ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+       cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 &&
+       cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+
+  // Non-masked types: COMPOUND_AVERAGE and COMPOUND_DISTWTD.
+  for (int type = COMPOUND_AVERAGE; type <= COMPOUND_DISTWTD; ++type) {
+    const int allowed =
+        (type == COMPOUND_AVERAGE) ? allow_average : allow_distwtd;
+    if (!allowed) continue;
+    if (!is_interinter_compound_used(type, bsize)) continue;
+    valid_comp_types[num_valid++] = type;
+  }
+
+  // Masked types are only considered when masked compound is in use.
+  if (!masked_compound_used) return num_valid;
+
+  // Slot 0 gates COMPOUND_WEDGE, slot 1 gates COMPOUND_DIFFWTD.
+  int8_t masked_type_enabled[MASKED_COMPOUND_TYPES] = { 0, 0 };
+  masked_type_enabled[0] = enable_wedge_interinter_search(x, cpi);
+  masked_type_enabled[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp;
+
+  for (int type = COMPOUND_WEDGE; type <= COMPOUND_DIFFWTD; ++type) {
+    if (!(mode_search_mask & (1 << type))) continue;
+    if (!is_interinter_compound_used(type, bsize)) continue;
+    if (!masked_type_enabled[type - COMPOUND_WEDGE]) continue;
+    valid_comp_types[num_valid++] = type;
+  }
+  return num_valid;
+}
+
+// Fills masked_type_cost[] with the rate needed to signal each compound type:
+// the compound group index (only when masked compound is in use) plus the
+// compound index (average/distwtd) or the compound type (wedge/diffwtd).
+static INLINE void calc_masked_type_cost(
+    const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx,
+    int comp_index_ctx, int masked_compound_used, int *masked_type_cost) {
+  // Group index 0 covers average and distwtd, group index 1 covers wedge and
+  // diffwtd. The group index costs nothing when masked compound is not used.
+  int plain_group_cost = 0;
+  int masked_group_cost = 0;
+  if (masked_compound_used) {
+    plain_group_cost = mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0];
+    masked_group_cost = mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1];
+  }
+
+  // Clear the whole table first so every entry starts from zero.
+  av1_zero_array(masked_type_cost, COMPOUND_TYPES);
+
+  masked_type_cost[COMPOUND_AVERAGE] =
+      plain_group_cost + mode_costs->comp_idx_cost[comp_index_ctx][1];
+  masked_type_cost[COMPOUND_DISTWTD] =
+      plain_group_cost + mode_costs->comp_idx_cost[comp_index_ctx][0];
+  masked_type_cost[COMPOUND_WEDGE] =
+      masked_group_cost + mode_costs->compound_type_cost[bsize][0];
+  masked_type_cost[COMPOUND_DIFFWTD] =
+      masked_group_cost + mode_costs->compound_type_cost[bsize][1];
+}
+
+// Writes the compound type into mbmi together with the two syntax fields that
+// follow from it: comp_group_idx is 1 for wedge/diffwtd, and compound_idx is 0
+// only for COMPOUND_DISTWTD.
+static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi,
+                                                 COMPOUND_TYPE cur_type) {
+  const int is_masked_type = (cur_type >= COMPOUND_WEDGE);
+  const int is_distwtd = (cur_type == COMPOUND_DISTWTD);
+
+  mbmi->interinter_comp.type = cur_type;
+  mbmi->comp_group_idx = is_masked_type;
+  mbmi->compound_idx = !is_distwtd;
+}
+
+// Reuses a previously stored compound-type decision (record match_index in
+// x->comp_rd_stats). When stats exist for the stored winner, mbmi is updated
+// with that type and its compound data, *rd is recomputed from the cached rate
+// and distortion, the motion vectors are set from cur_mv, and the cached
+// signaling cost of the winner is returned. When no stats exist for the
+// winner, nothing is modified and the cost held in best_type_stats is returned.
+static INLINE int populate_reuse_comp_type_data(
+    const MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate,
+    int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd,
+    int match_index) {
+  const COMP_RD_STATS *const match = &x->comp_rd_stats[match_index];
+  const int winner = match->interinter_comp.type;
+
+  // INT_MAX marks a type for which no rate was recorded.
+  if (comp_rate[winner] == INT_MAX) {
+    return best_type_stats->best_compmode_interinter_cost;
+  }
+
+  update_mbmi_for_compound_type(mbmi, winner);
+  mbmi->interinter_comp = match->interinter_comp;
+
+  const int total_rate = comp_rs2[winner] + *rate_mv + comp_rate[winner];
+  *rd = RDCOST(x->rdmult, total_rate, comp_dist[winner]);
+
+  for (int ref = 0; ref < 2; ++ref) {
+    mbmi->mv[ref].as_int = cur_mv[ref].as_int;
+  }
+  return comp_rs2[winner];
+}
+
+// Records the current compound type as the best one seen so far: snapshots
+// its compound data, signaling cost and model rd into best_type_stats and
+// stores its rd cost in *rd.
+static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd,
+                                    BEST_COMP_TYPE_STATS *best_type_stats,
+                                    int64_t best_rd_cur,
+                                    int64_t comp_model_rd_cur, int rs2) {
+  best_type_stats->best_compound_data = mbmi->interinter_comp;
+  best_type_stats->best_compmode_interinter_cost = rs2;
+  best_type_stats->comp_best_model_rd = comp_model_rd_cur;
+  *rd = best_rd_cur;
+}
+
+// Saves the two motion vectors currently held in mbmi, and the MV rate that
+// goes with them, as the best found for masked compound types.
+static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
+                                       int_mv *best_mv, int *best_tmp_rate_mv,
+                                       int tmp_rate_mv) {
+  for (int ref = 0; ref < 2; ++ref) {
+    best_mv[ref].as_int = mbmi->mv[ref].as_int;
+  }
+  *best_tmp_rate_mv = tmp_rate_mv;
+}
+
+// Appends the compound-type search results of the current mode to
+// x->comp_rd_stats[] and advances x->comp_rd_stats_idx. Each record stores the
+// per-type rate/distortion tables together with the mode, reference frames,
+// motion vectors, interpolation filter, ref_mv_idx, global-motion flags and
+// compound data of the block. Does nothing once MAX_COMP_RD_STATS records
+// have been stored.
+static INLINE void save_comp_rd_search_stat(
+    MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate,
+    const int64_t *comp_dist, const int32_t *comp_model_rate,
+    const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) {
+  const int slot = x->comp_rd_stats_idx;
+  if (slot >= MAX_COMP_RD_STATS) return;
+
+  COMP_RD_STATS *const record = &x->comp_rd_stats[slot];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Description of the mode the stats belong to.
+  record->mode = mbmi->mode;
+  record->filter = mbmi->interp_filters;
+  record->ref_mv_idx = mbmi->ref_mv_idx;
+  memcpy(record->ref_frames, mbmi->ref_frame, sizeof(record->ref_frames));
+  memcpy(record->mv, cur_mv, sizeof(record->mv));
+  for (int ref = 0; ref < 2; ++ref) {
+    const WarpedMotionParams *const gm_params =
+        &xd->global_motion[mbmi->ref_frame[ref]];
+    record->is_global[ref] = is_global_mv_block(mbmi, gm_params->wmtype);
+  }
+  memcpy(&record->interinter_comp, &mbmi->interinter_comp,
+         sizeof(record->interinter_comp));
+
+  // Per-compound-type rate and distortion tables.
+  memcpy(record->rate, comp_rate, sizeof(record->rate));
+  memcpy(record->dist, comp_dist, sizeof(record->dist));
+  memcpy(record->model_rate, comp_model_rate, sizeof(record->model_rate));
+  memcpy(record->model_dist, comp_model_dist, sizeof(record->model_dist));
+  memcpy(record->comp_rs2, comp_rs2, sizeof(record->comp_rs2));
+
+  ++x->comp_rd_stats_idx;
+}
+
+// Returns the rate needed to signal the mask of a masked compound type.
+// Must only be called for COMPOUND_WEDGE and COMPOUND_DIFFWTD.
+//   COMPOUND_DIFFWTD: one literal bit.
+//   COMPOUND_WEDGE:   one literal bit plus the wedge index cost, or 0 when
+//                     wedges are not used for this block size.
+static INLINE int get_interinter_compound_mask_rate(
+    const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) {
+  const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+
+  if (compound_type != COMPOUND_WEDGE) {
+    assert(compound_type == COMPOUND_DIFFWTD);
+    return av1_cost_literal(1);
+  }
+
+  if (!av1_is_wedge_used(mbmi->bsize)) return 0;
+
+  const int wedge_index = mbmi->interinter_comp.wedge_index;
+  return av1_cost_literal(1) +
+         mode_costs->wedge_idx_cost[mbmi->bsize][wedge_index];
+}
+
+// Stores the results obtained for compound type cur_type in the per-type
+// tables so they can be reused later: the rate/distortion from rd_stats, the
+// model rate/distortion (rate_sum, dist_sum) and the signaling cost rs2.
+static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate,
+                                int64_t *comp_dist, int32_t *comp_model_rate,
+                                int64_t *comp_model_dist, int rate_sum,
+                                int64_t dist_sum, RD_STATS *rd_stats,
+                                int *comp_rs2, int rs2) {
+  // Results taken from rd_stats.
+  comp_rate[cur_type] = rd_stats->rate;
+  comp_dist[cur_type] = rd_stats->dist;
+
+  // Model-based estimate.
+  comp_model_rate[cur_type] = rate_sum;
+  comp_model_dist[cur_type] = dist_sum;
+
+  // Signaling cost of the compound type.
+  comp_rs2[cur_type] = rs2;
+}
+
+// Returns 1 when the outcome of a mask search should be stored for reuse:
+// either reuse_level is nonzero or the mode is NEW_NEWMV. Returns 0 otherwise.
+static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode,
+                                           const int reuse_level) {
+  return reuse_level || (this_mode == NEW_NEWMV);
+}
+
+// Decides whether the current compound mode is worth further evaluation,
+// based on its skip rd (mode_rate combined with the luma SSE scaled by 16).
+// Returns 1 when the gating level is 0; otherwise returns the result of
+// check_txfm_eval() against ref_skip_rd.
+static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
+                                        MACROBLOCK *x, MACROBLOCKD *xd,
+                                        const BLOCK_SIZE bsize,
+                                        int64_t ref_skip_rd, int mode_rate) {
+  const int gate_level =
+      get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound,
+                             cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+                             TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+  // Gating disabled: always evaluate.
+  if (!gate_level) return 1;
+
+  const int64_t luma_sse = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+  const int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (luma_sse << 4));
+  return check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, gate_level, 1);
+}
+
+// Evaluates one masked compound type (COMPOUND_WEDGE or COMPOUND_DIFFWTD, read
+// from mbmi->interinter_comp.type) and returns its rd cost, or INT64_MAX when
+// the type is pruned or the rd estimate fails.
+//
+// In/out parameters:
+//   *rs2    caller-supplied signaling rate; the mask rate from
+//           get_interinter_compound_mask_rate() is added to it (not reached
+//           when the early wedge similarity check returns).
+//   *out_rate_mv  MV rate that goes with the returned rd. It is not written
+//           on the prune paths that return before the cache lookup.
+//   *calc_pred_masked_compound  when nonzero, the single predictors,
+//           residual1 and diff10 are computed first and the flag is cleared.
+//   comp_rate / comp_dist / comp_model_rate / comp_model_dist / comp_rs2
+//           per-type caches; comp_rate[type] == INT_MAX means "no record",
+//           in which case the cost is computed and, if the estimate succeeds,
+//           backed up into these arrays.
+//   *comp_model_rd_cur  model rd of this type; INT64_MAX on every prune path.
+// rd_thresh bounds the mode overhead and the rd estimate; comp_best_model_rd
+// is the best model rd so far, used for pruning when
+// cpi->sf.inter_sf.prune_comp_type_by_model_rd is set.
+static int64_t masked_compound_type_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound,
+ int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate,
+ int64_t *comp_model_dist, const int64_t comp_best_model_rd,
+ int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+ assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD);
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+ // Indexed by (compound_type - COMPOUND_WEDGE): wedge first, then diffwtd.
+ pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge,
+ pick_interinter_seg };
+
+ // TODO(any): Save pred and mask calculation as well into records. However
+ // this may increase memory requirements as compound segment mask needs to be
+ // stored in each record.
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+ diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+ if (compound_type == COMPOUND_WEDGE) {
+ // Measure how different the two single predictors are; only the sse output
+ // of the variance function is used, its return value is discarded.
+ unsigned int sse;
+ if (is_cur_buf_hbd(xd))
+ (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+ CONVERT_TO_BYTEPTR(*preds1), *strides,
+ &sse);
+ else
+ (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+ &sse);
+ // Per-pixel mean squared difference between the two predictors.
+ const unsigned int mse =
+ ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+ // If two predictors are very similar, skip wedge compound mode search
+ if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+ // Function pointer to pick the appropriate mask
+ // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+ // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+ uint64_t cur_sse = UINT64_MAX;
+ best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+ cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+ // From here on *rs2 includes the mask signaling rate, also on the prune
+ // paths below.
+ *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+ assert(cur_sse != UINT64_MAX);
+ // Skip rd uses the same (sse << 4) scaling as prune_mode_by_skip_rd().
+ int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+ // Although the true rate_mv might be different after motion search, but it
+ // is unlikely to be the best mode considering the transform rd cost and other
+ // mode overhead cost
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > rd_thresh) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+
+ // Check if the mode is good enough based on skip rd
+ // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+ // setting
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+ txfm_rd_gate_level, 1);
+ if (!eval_txfm) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+
+ // Compute cost if matching record not found, else, reuse data
+ if (comp_rate[compound_type] == INT_MAX) {
+ // Check whether new MV search for wedge is to be done
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE) &&
+ (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+
+ // Search for new MV if needed and build predictor
+ if (wedge_newmv_search) {
+ *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ }
+ // Get the RD cost from model RD
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+ &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ *comp_model_rd_cur = rd;
+ // Override with best if current is worse than best for new MV
+ if (wedge_newmv_search) {
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ *comp_model_rd_cur = best_rd_cur;
+ }
+ }
+ // Optional pruning: give up when the model rd is worse than the best model
+ // rd recorded so far (if any).
+ if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+ (*comp_model_rd_cur > comp_best_model_rd) &&
+ comp_best_model_rd != INT64_MAX) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ // Compute RD cost for the current type
+ RD_STATS rd_stats;
+ const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+ const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ // On failure rd stays INT64_MAX and nothing is cached for this type.
+ if (rd != INT64_MAX) {
+ rd =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+ // Backup rate and distortion for future reuse
+ backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+ *rs2);
+ }
+ } else {
+ // Reuse data as matching record is found
+ assert(comp_dist[compound_type] != INT64_MAX);
+ // When disable_interinter_wedge_newmv_search is set, motion refinement is
+ // disabled. Hence rate and distortion can be reused in this case as well
+ assert(IMPLIES((have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE)),
+ cpi->sf.inter_sf.disable_interinter_wedge_newmv_search));
+ assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+ assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
+ *out_rate_mv = rate_mv;
+ // Calculate RD cost based on stored stats
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+ comp_dist[compound_type]);
+ // Recalculate model rdcost with the updated rate
+ *comp_model_rd_cur =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type],
+ comp_model_dist[compound_type]);
+ }
+ return rd;
+}
+
+// Scaling values used for gating wedge/compound segment evaluation based on
+// the best approximate rd. The two tables are presumably paired by index, with
+// entry i forming the ratio mul[i] / div[i] (1/3, 11/16 and 12/16).
+// NOTE(review): the meaning of the index is not visible here - confirm at the
+// use site.
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ int ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ const int bw = block_size_wide[bsize];
+ int rs2;
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ BEST_COMP_TYPE_STATS best_type_stats;
+ // Initializing BEST_COMP_TYPE_STATS
+ best_type_stats.best_compound_data.type = COMPOUND_AVERAGE;
+ best_type_stats.best_compmode_interinter_cost = 0;
+ best_type_stats.comp_best_model_rd = INT64_MAX;
+
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ COMPOUND_TYPE cur_type;
+ // Local array to store the mask cost for different compound types
+ int masked_type_cost[COMPOUND_TYPES];
+
+ int calc_pred_masked_compound = 1;
+ int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX };
+ int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int match_index = 0;
+ const int match_found =
+ find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, comp_rs2, &match_index);
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+
+ // Local array to store the valid compound types to be evaluated in the core
+ // loop
+ COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = {
+ COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
+ };
+ int valid_type_count = 0;
+ // compute_valid_comp_types() returns the number of valid compound types to be
+ // evaluated and populates the same in the local array valid_comp_types[].
+ // It also sets the flag 'try_average_and_distwtd_comp'
+ valid_type_count = compute_valid_comp_types(
+ x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
+
+ // The following context indices are independent of compound type
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+
+ // Populates masked_type_cost local array for the 4 compound types
+ calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+ comp_index_ctx, masked_compound_used, masked_type_cost);
+
+ int64_t comp_model_rd_cur = INT64_MAX;
+ int64_t best_rd_cur = ref_best_rd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // If the match is found, calculate the rd cost using the
+ // stored stats and update the mbmi appropriately.
+ if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) {
+ return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
+ comp_rate, comp_dist, comp_rs2,
+ rate_mv, rd, match_index);
+ }
+
+ // If COMPOUND_AVERAGE is not valid, use the spare buffer
+ if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+
+ // Loop over valid compound types
+ for (int i = 0; i < valid_type_count; i++) {
+ cur_type = valid_comp_types[i];
+
+ if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+ if (cur_type == COMPOUND_WEDGE) continue;
+ }
+
+ comp_model_rd_cur = INT64_MAX;
+ tmp_rate_mv = *rate_mv;
+ best_rd_cur = INT64_MAX;
+ ref_best_rd = AOMMIN(ref_best_rd, *rd);
+ update_mbmi_for_compound_type(mbmi, cur_type);
+ rs2 = masked_type_cost[cur_type];
+
+ int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd) continue;
+
+ // Derive the flags to indicate enabling/disabling of MV refinement process.
+ const int enable_fast_compound_mode_search =
+ cpi->sf.inter_sf.enable_fast_compound_mode_search;
+ const bool skip_mv_refinement_for_avg_distwtd =
+ enable_fast_compound_mode_search == 3 ||
+ (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV));
+ const bool skip_mv_refinement_for_diffwtd =
+ (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD);
+
+ // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
+ if (cur_type < COMPOUND_WEDGE) {
+ if (skip_mv_refinement_for_avg_distwtd) {
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+
+ // Reuse data if matching record is found
+ if (comp_rate[cur_type] == INT_MAX) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+ // Compute RD cost for the current type
+ RD_STATS est_rd_stats;
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
+ int64_t est_rd = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ // Evaluate further if skip rd is low enough
+ if (eval_txfm) {
+ est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
+ &est_rd_stats);
+ }
+ if (est_rd != INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ // Backup rate and distortion for future reuse
+ backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
+ comp_rs2, rs2);
+ }
+ } else {
+ // Calculate RD cost based on stored stats
+ assert(comp_dist[cur_type] != INT64_MAX);
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+ comp_dist[cur_type]);
+ // Recalculate model rdcost with the updated rate
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
+ comp_model_dist[cur_type]);
+ }
+ } else {
+ tmp_rate_mv = *rate_mv;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ InterPredParams inter_pred_params;
+ av1_dist_wtd_comp_weight_assign(
+ &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
+ int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ // use spare buffer for following compound type try
+ if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+ } else if (cur_type == COMPOUND_WEDGE) {
+ int best_mask_index = 0;
+ int best_wedge_sign = 0;
+ int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
+ int best_rs2 = 0;
+ int best_rate_mv = *rate_mv;
+ int wedge_mask_size = get_wedge_types_lookup(bsize);
+ int need_mask_search = args->wedge_index == -1;
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search;
+
+ if (need_mask_search && !wedge_newmv_search) {
+ // short cut repeated single reference block build
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0,
+ preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1,
+ preds1, strides);
+ }
+
+ for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+ ++wedge_mask) {
+ for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.wedge_index = wedge_mask;
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd / 2) continue;
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(
+ cpi, x, cur_mv, bsize, this_mode);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+ bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ }
+
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur = estimate_yrd_for_sb(
+ cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur =
+ RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ if (this_rd_cur < best_rd_cur) {
+ best_mask_index = wedge_mask;
+ best_wedge_sign = wedge_sign;
+ best_rd_cur = this_rd_cur;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = rs2;
+ }
+ }
+ // Consider the asymmetric partitions for oblique angle only if the
+ // corresponding symmetric partition is the best so far.
+ // Note: For horizontal and vertical types, both symmetric and
+ // asymmetric partitions are always considered.
+ if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) {
+ // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16]
+ // correspond to symmetric partitions of the 4 oblique angles, the
+ // next 4 entries correspond to the vertical/horizontal
+ // symmetric/asymmetric partitions and the last 8 entries correspond
+ // to the asymmetric partitions of oblique types.
+ const int idx_before_asym_oblique = 7;
+ const int last_oblique_sym_idx = 3;
+ if (wedge_mask == idx_before_asym_oblique) {
+ if (best_mask_index > last_oblique_sym_idx) {
+ break;
+ } else {
+ // Asymmetric (Index-1) map for the corresponding oblique masks.
+ // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9
+ // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13
+ // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15
+ // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11
+ const int asym_mask_idx[4] = { 7, 11, 13, 9 };
+ wedge_mask = asym_mask_idx[best_mask_index];
+ wedge_mask_size = wedge_mask + 3;
+ }
+ }
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(
+ this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) {
+ args->wedge_index = best_mask_index;
+ args->wedge_sign = best_wedge_sign;
+ }
+ } else {
+ mbmi->interinter_comp.wedge_index = args->wedge_index;
+ mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+
+ best_mask_index = args->wedge_index;
+ best_wedge_sign = args->wedge_sign;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = masked_type_cost[cur_type];
+ best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ best_rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+ best_rd_cur =
+ RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.wedge_index = best_mask_index;
+ mbmi->interinter_comp.wedge_sign = best_wedge_sign;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ tmp_rate_mv = best_rate_mv;
+ rs2 = best_rs2;
+ } else if (skip_mv_refinement_for_diffwtd) {
+ int_mv tmp_mv[2];
+ int best_mask_index = 0;
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int need_mask_search = args->diffwtd_index == -1;
+
+ for (int mask_index = 0; mask_index < 2 && need_mask_search;
+ ++mask_index) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.mask_type = mask_index;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // hard coded number for diff wtd
+ int mask_value = mask_index == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+
+ if (this_rd_cur < best_rd_cur) {
+ best_rd_cur = this_rd_cur;
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(this_mode, 0))
+ args->diffwtd_index = best_mask_index;
+ } else {
+ mbmi->interinter_comp.mask_type = args->diffwtd_index;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.mask_type = best_mask_index;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ } else {
+ // Handle masked compound types
+ bool eval_masked_comp_type = true;
+ if (*rd != INT64_MAX) {
+ // Factors to control gating of compound type selection based on best
+ // approximate rd so far
+ const int max_comp_type_rd_threshold_mul =
+ comp_type_rd_threshold_mul[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+ const int max_comp_type_rd_threshold_div =
+ comp_type_rd_threshold_div[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+ // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is
+ // within threshold
+ const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) *
+ max_comp_type_rd_threshold_mul);
+ if (approx_rd >= ref_best_rd) eval_masked_comp_type = false;
+ }
+
+ if (eval_masked_comp_type) {
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh);
+ best_rd_cur = masked_compound_type_rd(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound,
+ comp_rate, comp_dist, comp_model_rate, comp_model_dist,
+ best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2,
+ ref_skip_rd);
+ }
+ }
+
+ // Update stats for best compound type
+ if (best_rd_cur < *rd) {
+ update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+ comp_model_rd_cur, rs2);
+ if (have_newmv_in_inter_mode(this_mode))
+ update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv);
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+
+ mbmi->comp_group_idx =
+ (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+ mbmi->compound_idx =
+ !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+ mbmi->interinter_comp = best_type_stats.best_compound_data;
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+
+ if (this_mode == NEW_NEWMV)
+ args->cmp_mode[ref_frame] = mbmi->interinter_comp.type;
+
+ restore_dst_buf(xd, *orig_dst, 1);
+ if (!match_found)
+ save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, cur_mv, comp_rs2);
+ return best_type_stats.best_compmode_interinter_cost;
+}
diff --git a/third_party/aom/av1/encoder/compound_type.h b/third_party/aom/av1/encoder/compound_type.h
new file mode 100644
index 0000000000..a028a35093
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Structure to store the compound type related stats for best compound type
+ typedef struct {
+ INTERINTER_COMPOUND_DATA best_compound_data;  // copied to mbmi->interinter_comp
+ int64_t comp_best_model_rd;  // handed to masked_compound_type_rd()
+ int best_compmode_interinter_cost;  // return value of av1_compound_type_rd()
+ } BEST_COMP_TYPE_STATS;
+
+ #define IGNORE_MODE (-1)  // parenthesized: safe inside larger expressions
+// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode
+// is found, 0 otherwise.
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst);
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 0000000000..aafe55d2d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+#include <assert.h>
+
+ // Copies the mode decision stored in src_ctx into dst_ctx. The blk_skip and
+ // tx_type_map arrays are copied by value (num_4x4_blk entries each).
+ void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+                            PICK_MODE_CONTEXT *src_ctx) {
+   const int blk_cnt = src_ctx->num_4x4_blk;
+   memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, sizeof(uint8_t) * blk_cnt);
+   av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map, blk_cnt);
+   dst_ctx->num_4x4_blk = blk_cnt;
+
+   // Plain members.
+   dst_ctx->mic = src_ctx->mic;
+   dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
+   dst_ctx->skippable = src_ctx->skippable;
+   dst_ctx->rd_stats = src_ctx->rd_stats;
+   dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+ #if CONFIG_INTERNAL_STATS
+   dst_ctx->best_mode_index = src_ctx->best_mode_index;
+ #endif  // CONFIG_INTERNAL_STATS
+ }
+
+ // Allocates the per-plane coefficient buffers that PICK_MODE_CONTEXTs share.
+ void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                    PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                    struct aom_internal_error_info *error) {
+   const int ss_shift = seq_params->subsampling_x + seq_params->subsampling_y;
+   const int sb_pels_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+   const int sb_pels_uv = sb_pels_y >> ss_shift;
+   const int plane_cnt = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+   for (int i = 0; i < plane_cnt; i++) {
+     const int max_num_pix = (i == AOM_PLANE_Y) ? sb_pels_y : sb_pels_uv;
+     AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+                         aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+     AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+                         aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+     AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+                         aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+   }
+ }
+
+ // Frees all MAX_MB_PLANE entries of each shared buffer array and clears them.
+ void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+   for (int i = 0; i < MAX_MB_PLANE; i++) {
+     aom_free(shared_bufs->coeff_buf[i]);
+     aom_free(shared_bufs->qcoeff_buf[i]);
+     aom_free(shared_bufs->dqcoeff_buf[i]);
+     shared_bufs->coeff_buf[i] = shared_bufs->qcoeff_buf[i] = NULL;
+     shared_bufs->dqcoeff_buf[i] = NULL;
+   }
+ }
+
+ PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs) {  // returns NULL on failure
+ PICK_MODE_CONTEXT *volatile ctx = NULL;  // volatile: read after setjmp returns
+ const AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info error;
+ // Re-entered via error.jmp, presumably when an allocation below fails.
+ if (setjmp(error.jmp)) {
+ av1_free_pmc(ctx, av1_num_planes(cm));
+ return NULL;
+ }
+ error.setjmp = 1;
+ // Assuming aom_calloc zero-fills like calloc, unset pointers stay NULL.
+ AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+ ctx->rd_mode_is_ready = 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
+ const int num_blk = num_pix / 16;  // 16 pixels per 4x4 unit
+
+ AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+ aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+ AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+ aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
+ ctx->num_4x4_blk = num_blk;
+ // coeff/qcoeff/dqcoeff alias shared_bufs; eobs/txb_entropy_ctx are allocated.
+ for (int i = 0; i < num_planes; ++i) {
+ ctx->coeff[i] = shared_bufs->coeff_buf[i];
+ ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+ ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
+ AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+ }
+ // Palette index maps: small blocks only, and only with screen content tools.
+ if (num_pix <= MAX_PALETTE_SQUARE) {
+ for (int i = 0; i < 2; ++i) {
+ if (cm->features.allow_screen_content_tools) {
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ } else {
+ ctx->color_index_map[i] = NULL;
+ }
+ }
+ }
+
+ av1_invalid_rd_stats(&ctx->rd_stats);
+
+ return ctx;
+ }
+
+ void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) {  // nothing is freed here
+   av1_invalid_rd_stats(&ctx->rd_stats);
+   av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk);  // per-4x4 maps are
+   av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk);     // cleared in place
+ }
+
+ // Releases a context built by av1_alloc_pmc(); a NULL ctx is accepted. The
+ // coeff/qcoeff/dqcoeff pointers are only cleared, never freed here.
+ void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) {
+   if (!ctx) return;
+
+   aom_free(ctx->blk_skip);
+   ctx->blk_skip = NULL;
+   aom_free(ctx->tx_type_map);
+   for (int plane = 0; plane < num_planes; ++plane) {
+     ctx->coeff[plane] = NULL;
+     ctx->qcoeff[plane] = NULL;
+     ctx->dqcoeff[plane] = NULL;
+     aom_free(ctx->eobs[plane]);
+     ctx->eobs[plane] = NULL;
+     aom_free(ctx->txb_entropy_ctx[plane]);
+     ctx->txb_entropy_ctx[plane] = NULL;
+   }
+   for (int k = 0; k < 2; ++k) {
+     if (ctx->color_index_map[k] == NULL) continue;
+     aom_free(ctx->color_index_map[k]);
+     ctx->color_index_map[k] = NULL;
+   }
+
+   aom_free(ctx);
+ }
+
+ // Allocates a PC_TREE node for 'bsize' with aom_calloc; NULL on failure.
+ PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+   PC_TREE *const node = aom_calloc(1, sizeof(*node));
+   if (node) {
+     node->block_size = bsize;
+     node->partitioning = PARTITION_NONE;
+   }
+   return node;
+ }
+
+ #define FREE_PMC_NODE(CTX) \
+ do { \
+ av1_free_pmc(CTX, num_planes); \
+ CTX = NULL; \
+ } while (0)
+ // Frees the contexts below pc_tree; the node itself only if nothing is kept.
+ void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type) {
+ if (pc_tree == NULL) return;
+
+ // Avoid freeing of extended partitions as they are not supported when
+ // partition_search_type is VAR_BASED_PARTITION.
+ if (partition_search_type == VAR_BASED_PARTITION && !keep_best &&
+ !keep_none) {
+ FREE_PMC_NODE(pc_tree->none);
+
+ for (int i = 0; i < 2; ++i) {
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+ // Extended partition contexts must not exist here (checked in debug builds).
+ #if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ assert(pc_tree->horizontala[i] == NULL);
+ assert(pc_tree->horizontalb[i] == NULL);
+ assert(pc_tree->verticala[i] == NULL);
+ assert(pc_tree->verticalb[i] == NULL);
+ }
+ for (int i = 0; i < 4; ++i) {
+ assert(pc_tree->horizontal4[i] == NULL);
+ assert(pc_tree->vertical4[i] == NULL);
+ }
+ #endif
+ // Children are released completely (keep_best = keep_none = 0).
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ aom_free(pc_tree);
+ return;
+ }
+
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ // 'none' survives if keep_none, or if keep_best and it is the chosen type.
+ if (!keep_none && (!keep_best || (partition != PARTITION_NONE)))
+ FREE_PMC_NODE(pc_tree->none);
+ // With keep_best, only the contexts of the chosen partition type survive.
+ for (int i = 0; i < 2; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ))
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ if (!keep_best || (partition != PARTITION_VERT))
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+ #if !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_A))
+ FREE_PMC_NODE(pc_tree->horizontala[i]);
+ if (!keep_best || (partition != PARTITION_HORZ_B))
+ FREE_PMC_NODE(pc_tree->horizontalb[i]);
+ if (!keep_best || (partition != PARTITION_VERT_A))
+ FREE_PMC_NODE(pc_tree->verticala[i]);
+ if (!keep_best || (partition != PARTITION_VERT_B))
+ FREE_PMC_NODE(pc_tree->verticalb[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_4))
+ FREE_PMC_NODE(pc_tree->horizontal4[i]);
+ if (!keep_best || (partition != PARTITION_VERT_4))
+ FREE_PMC_NODE(pc_tree->vertical4[i]);
+ }
+ #endif
+ if (!keep_best || (partition != PARTITION_SPLIT)) {
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+ // The node itself goes away only when nothing inside it was to be kept.
+ if (!keep_best && !keep_none) aom_free(pc_tree);
+ }
+
+ int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+ // The structure 'sms_tree' is used to store the simple motion search data for
+ // partition pruning in inter frames. Hence, the memory allocations and
+ // initializations related to it are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ int sms_tree_index = 0;
+ SIMPLE_MOTION_DATA_TREE *this_sms;  // next node to be attached as a child
+ int square_index = 1;  // square[] entry of the level being filled
+ int nodes;
+ // Replace any earlier tree; all nodes live in one array of tree_nodes.
+ aom_free(td->sms_tree);
+ td->sms_tree =
+ (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree));
+ if (!td->sms_tree) return -1;  // NOTE(review): td->sms_root is left as is
+ this_sms = &td->sms_tree[0];
+
+ if (!stat_generation_stage) {
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
+ const int leaf_nodes = 256 * leaf_factor;
+
+ // Sets up all the leaf nodes in the tree.
+ for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[0];
+ }
+
+ // Each parent node has 4 children; fill each block_size level of the tree
+ // from the leaves to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (int i = 0; i < nodes; ++i) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[square_index];
+ for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+ ++sms_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ square_index = 2;
+ tree->block_size = square[square_index];
+ }
+
+ // Set up the root node for the largest superblock size
+ td->sms_root = &td->sms_tree[tree_nodes - 1];
+ return 0;
+ }
+
+ void av1_free_sms_tree(ThreadData *td) {  // releases the whole node array
+   aom_free(td->sms_tree);
+   td->sms_tree = td->sms_root = NULL;  // sms_root pointed into sms_tree
+ }
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..0be7ccbb54
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+ typedef struct {  // filled by av1_setup_shared_coeff_buffer()
+ tran_low_t *coeff_buf[MAX_MB_PLANE];  // one buffer per plane
+ tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+ tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+ } PC_TREE_SHARED_BUFFERS;
+
+ // Structure to hold snapshot of coding context during the mode picking process
+ typedef struct PICK_MODE_CONTEXT {
+ MB_MODE_INFO mic;
+ MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
+ uint8_t *color_index_map[2];  // optional; may be NULL (see av1_alloc_pmc)
+ uint8_t *blk_skip;  // num_4x4_blk entries
+ // coeff/qcoeff/dqcoeff are taken from PC_TREE_SHARED_BUFFERS in av1_alloc_pmc.
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+ uint8_t *tx_type_map;  // num_4x4_blk entries
+
+ int num_4x4_blk;  // block width * height / 16 (set in av1_alloc_pmc)
+ // For current partition, only if all Y, U, and V transform blocks'
+ // coefficients are quantized to 0, skippable is set to 1.
+ int skippable;
+ #if CONFIG_INTERNAL_STATS
+ THR_MODES best_mode_index;
+ #endif // CONFIG_INTERNAL_STATS
+ RD_STATS rd_stats;
+
+ int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
+ // been made.
+ #if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t newmv_sse;
+ int64_t zeromv_sse;
+ int64_t zeromv_lastref_sse;
+ PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ int sb_skip_denoising;
+ #endif
+ } PICK_MODE_CONTEXT;
+
+ typedef struct PC_TREE {  // one node; members hold contexts per partition type
+ PARTITION_TYPE partitioning;  // PARTITION_NONE right after allocation
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT *none;
+ PICK_MODE_CONTEXT *horizontal[2];
+ PICK_MODE_CONTEXT *vertical[2];
+ #if !CONFIG_REALTIME_ONLY
+ PICK_MODE_CONTEXT *horizontala[3];
+ PICK_MODE_CONTEXT *horizontalb[3];
+ PICK_MODE_CONTEXT *verticala[3];
+ PICK_MODE_CONTEXT *verticalb[3];
+ PICK_MODE_CONTEXT *horizontal4[4];
+ PICK_MODE_CONTEXT *vertical4[4];
+ #endif
+ struct PC_TREE *split[4];  // child nodes; freed recursively
+ int index;
+ } PC_TREE;
+
+ typedef struct SIMPLE_MOTION_DATA_TREE {  // element of td->sms_tree
+ BLOCK_SIZE block_size;
+ PARTITION_TYPE partitioning;
+ struct SIMPLE_MOTION_DATA_TREE *split[4];  // linked in av1_setup_sms_tree()
+
+ // Simple motion search features
+ FULLPEL_MV start_mvs[REF_FRAMES];
+ unsigned int sms_none_feat[2];
+ unsigned int sms_rect_feat[8];
+ int sms_none_valid;
+ int sms_rect_valid;
+ } SIMPLE_MOTION_DATA_TREE;
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+ PC_TREE_SHARED_BUFFERS *shared_bufs,
+ struct aom_internal_error_info *error);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx);
+
+ static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {  // ascending sizes
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+ };
+
+ static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+                                             int stat_generation_stage) {
+   // Stats generation uses a single node. Otherwise: 256 leaves (1024 more for
+   // a 128x128 superblock) plus every parent level of the 4-ary tree.
+   if (stat_generation_stage) return 1;
+   return (is_sb_size_128 ? 1024 : 0) + 256 + 64 + 16 + 4 + 1;
+ }
+
+// Returns 0 on success, -1 on memory allocation failure.
+int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..323e2aed58
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+ // [i - 128] = round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)), i = 128..255.
+const uint16_t av1_prob_cost[128] = {
+ 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+ 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+ 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+ 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+ 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+ 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73,
+ 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26,
+ 23, 20, 18, 15, 12, 9, 6, 3,
+};
+
+ // Writes the cost of each symbol of 'cdf' into 'costs'. With a non-NULL
+ // 'inv_map', the cost of symbol i is stored at costs[inv_map[i]] instead.
+ void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+                               const int *inv_map) {
+   aom_cdf_prob cum_prev = 0;
+   int sym = -1;
+   do {
+     ++sym;
+     // Symbol probability = step between consecutive AOM_ICDF() values,
+     // raised to EC_MIN_PROB when smaller.
+     const aom_cdf_prob cum_cur = AOM_ICDF(cdf[sym]);
+     aom_cdf_prob p15 = cum_cur - cum_prev;
+     if (p15 < EC_MIN_PROB) p15 = EC_MIN_PROB;
+     cum_prev = cum_cur;
+     costs[inv_map ? inv_map[sym] : sym] = av1_cost_symbol(p15);
+     // The entry equal to AOM_ICDF(CDF_PROB_TOP) terminates the CDF.
+   } while (cdf[sym] != AOM_ICDF(CDF_PROB_TOP));
+ }
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..be0241a820
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+ // Cost, in av1_prob_cost units, of a symbol with probability p15 / 2^15.
+ static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+ // p15 may arrive outside [1, CDF_PROB_TOP - 1], so it is clamped first.
+ // Without the clamp, p15 == CDF_PROB_TOP would give shift == -1 and
+ // "p15 << shift" would be wrong.
+ p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
+ assert(0 < p15 && p15 < CDF_PROB_TOP);
+ const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
+ const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
+ assert(prob >= 128);  // av1_prob_cost[] is indexed with prob - 128
+ return av1_prob_cost[prob - 128] + av1_cost_literal(shift);  // + shift bits
+ }
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/deltaq4_model.c b/third_party/aom/av1/encoder/deltaq4_model.c
new file mode 100644
index 0000000000..60a7e6d2cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/deltaq4_model.c
@@ -0,0 +1,7776 @@
+/* Embedded file: model.tflite */
+const int av1_deltaq4_model_fsize = 101032;
+const unsigned char av1_deltaq4_model_file[101032] = {
+ 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00,
+ 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00,
+ 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00,
+ 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72,
+ 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65,
+ 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+ 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69,
+ 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00,
+ 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01,
+ 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00,
+ 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc,
+ 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00,
+ 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00,
+ 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
+ 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff,
+ 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64,
+ 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77,
+ 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd,
+ 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60,
+ 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a,
+ 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7,
+ 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd,
+ 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb,
+ 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76,
+ 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02,
+ 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c,
+ 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9,
+ 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68,
+ 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a,
+ 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d,
+ 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31,
+ 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00,
+ 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba,
+ 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd,
+ 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf,
+ 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5,
+ 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7,
+ 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d,
+ 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d,
+ 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab,
+ 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e,
+ 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d,
+ 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71,
+ 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63,
+ 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45,
+ 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd,
+ 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37,
+ 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c,
+ 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3,
+ 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c,
+ 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74,
+ 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32,
+ 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16,
+ 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd,
+ 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee,
+ 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31,
+ 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26,
+ 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 0x7b, 0xef, 0xc8, 0x3d,
+ 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7,
+ 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94,
+ 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3,
+ 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd,
+ 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf,
+ 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c,
+ 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a,
+ 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba,
+ 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90,
+ 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a,
+ 0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf,
+ 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc,
+ 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66,
+ 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79,
+ 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17,
+ 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc,
+ 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1,
+ 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c,
+ 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f,
+ 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c,
+ 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4,
+ 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a,
+ 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68,
+ 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd,
+ 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb,
+ 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88,
+ 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54,
+ 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d,
+ 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b,
+ 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58,
+ 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20,
+ 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d,
+ 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f,
+ 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03,
+ 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3,
+ 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc,
+ 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40,
+ 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29,
+ 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb,
+ 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d,
+ 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b,
+ 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c,
+ 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5,
+ 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d,
+ 0x30, 0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda,
+ 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88,
+ 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96,
+ 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c,
+ 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72,
+ 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad,
+ 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54,
+ 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d,
+ 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47,
+ 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27,
+ 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd,
+ 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd,
+ 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45,
+ 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd,
+ 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1,
+ 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d,
+ 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e,
+ 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc,
+ 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7,
+ 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d,
+ 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad,
+ 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38,
+ 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b,
+ 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d,
+ 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76,
+ 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f,
+ 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde,
+ 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd,
+ 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf,
+ 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a,
+ 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68,
+ 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d,
+ 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d,
+ 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3,
+ 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f,
+ 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c,
+ 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21,
+ 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82,
+ 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2,
+ 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d,
+ 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6,
+ 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8,
+ 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2,
+ 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d,
+ 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55,
+ 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24,
+ 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba,
+ 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d,
+ 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32,
+ 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59,
+ 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99,
+ 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd,
+ 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16,
+ 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f,
+ 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52,
+ 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d,
+ 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e,
+ 0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8,
+ 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef,
+ 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d,
+ 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06,
+ 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e,
+ 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f,
+ 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d,
+ 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a,
+ 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69,
+ 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5,
+ 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d,
+ 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac,
+ 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0,
+ 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5,
+ 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e,
+ 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe,
+ 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54,
+ 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c,
+ 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d,
+ 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe,
+ 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea,
+ 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94,
+ 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d,
+ 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38,
+ 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68,
+ 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f,
+ 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd,
+ 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea,
+ 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b,
+ 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d,
+ 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d,
+ 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 0x3d, 0xd6,
+ 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8,
+ 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73,
+ 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd,
+ 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89,
+ 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68,
+ 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a,
+ 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd,
+ 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0,
+ 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7,
+ 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c,
+ 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc,
+ 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a,
+ 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12,
+ 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04,
+ 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9,
+ 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f,
+ 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55,
+ 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05,
+ 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd,
+ 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0,
+ 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47,
+ 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90,
+ 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd,
+ 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30,
+ 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34,
+ 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99,
+ 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d,
+ 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26,
+ 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a,
+ 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2,
+ 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe,
+ 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94,
+ 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa,
+ 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac,
+ 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd,
+ 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2,
+ 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40,
+ 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2,
+ 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d,
+ 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c,
+ 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4,
+ 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85,
+ 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d,
+ 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16,
+ 0x46, 0x04, 0xbd, 0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4,
+ 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70,
+ 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd,
+ 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7,
+ 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc,
+ 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6,
+ 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d,
+ 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f,
+ 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c,
+ 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f,
+ 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d,
+ 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7,
+ 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0,
+ 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e,
+ 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd,
+ 0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4,
+ 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89,
+ 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f,
+ 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d,
+ 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9,
+ 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b,
+ 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb,
+ 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d,
+ 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3,
+ 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b,
+ 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac,
+ 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd,
+ 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a,
+ 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e,
+ 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde,
+ 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba,
+ 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06,
+ 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d,
+ 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1,
+ 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d,
+ 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c,
+ 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf,
+ 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1,
+ 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d,
+ 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a,
+ 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76,
+ 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73,
+ 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc,
+ 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a,
+ 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 0xd6, 0x4a, 0xbd, 0xb2, 0xb3,
+ 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63,
+ 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d,
+ 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c,
+ 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e,
+ 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92,
+ 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd,
+ 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8,
+ 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5,
+ 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf,
+ 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d,
+ 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17,
+ 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19,
+ 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1,
+ 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c,
+ 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b,
+ 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd,
+ 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba,
+ 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd,
+ 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0,
+ 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c,
+ 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4,
+ 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd,
+ 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e,
+ 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22,
+ 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c,
+ 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d,
+ 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7,
+ 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19,
+ 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7,
+ 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c,
+ 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d,
+ 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37,
+ 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf,
+ 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd,
+ 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13,
+ 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19,
+ 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf,
+ 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd,
+ 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26,
+ 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe,
+ 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5,
+ 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc,
+ 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c,
+ 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29,
+ 0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56,
+ 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc,
+ 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22,
+ 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf,
+ 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04,
+ 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc,
+ 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e,
+ 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95,
+ 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c,
+ 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd,
+ 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb,
+ 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2,
+ 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b,
+ 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d,
+ 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52,
+ 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97,
+ 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f,
+ 0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c,
+ 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd,
+ 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca,
+ 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15,
+ 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc,
+ 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35,
+ 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf,
+ 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c,
+ 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd,
+ 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f,
+ 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6,
+ 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48,
+ 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd,
+ 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d,
+ 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16,
+ 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45,
+ 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d,
+ 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda,
+ 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb,
+ 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4,
+ 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd,
+ 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01,
+ 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f,
+ 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7,
+ 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d,
+ 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b,
+ 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32,
+ 0x8f, 0xbd, 0xed, 0x11, 0xbe, 0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee,
+ 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d,
+ 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a,
+ 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba,
+ 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc,
+ 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd,
+ 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce,
+ 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4,
+ 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a,
+ 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd,
+ 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44,
+ 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31,
+ 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e,
+ 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d,
+ 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71,
+ 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f,
+ 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e,
+ 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b,
+ 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f,
+ 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99,
+ 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f,
+ 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd,
+ 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97,
+ 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe,
+ 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf,
+ 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d,
+ 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9,
+ 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85,
+ 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55,
+ 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd,
+ 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8,
+ 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71,
+ 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a,
+ 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b,
+ 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51,
+ 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2,
+ 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53,
+ 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d,
+ 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa,
+ 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb,
+ 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96,
+ 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d,
+ 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f,
+ 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce,
+ 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 0x35, 0x5e, 0x9e,
+ 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd,
+ 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2,
+ 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d,
+ 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22,
+ 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d,
+ 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16,
+ 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda,
+ 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0,
+ 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd,
+ 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46,
+ 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6,
+ 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3,
+ 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc,
+ 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f,
+ 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9,
+ 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d,
+ 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d,
+ 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d,
+ 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0,
+ 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0,
+ 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9,
+ 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c,
+ 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8,
+ 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f,
+ 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d,
+ 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1,
+ 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18,
+ 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11,
+ 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d,
+ 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06,
+ 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67,
+ 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68,
+ 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b,
+ 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae,
+ 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde,
+ 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a,
+ 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d,
+ 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed,
+ 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f,
+ 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62,
+ 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d,
+ 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5,
+ 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd,
+ 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a,
+ 0xbd, 0xfe, 0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd,
+ 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77,
+ 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1,
+ 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5,
+ 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd,
+ 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b,
+ 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62,
+ 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97,
+ 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd,
+ 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea,
+ 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b,
+ 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17,
+ 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd,
+ 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20,
+ 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc,
+ 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08,
+ 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d,
+ 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57,
+ 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36,
+ 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e,
+ 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe,
+ 0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6,
+ 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40,
+ 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04,
+ 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd,
+ 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43,
+ 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23,
+ 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24,
+ 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd,
+ 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88,
+ 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8,
+ 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e,
+ 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd,
+ 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4,
+ 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0,
+ 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd,
+ 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd,
+ 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a,
+ 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e,
+ 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce,
+ 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d,
+ 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc,
+ 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85,
+ 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd,
+ 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd,
+ 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd,
+ 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc,
+ 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a,
+ 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d,
+ 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54,
+ 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61,
+ 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94,
+ 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd,
+ 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8,
+ 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52,
+ 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a,
+ 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d,
+ 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c,
+ 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81,
+ 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa,
+ 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd,
+ 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b,
+ 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96,
+ 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88,
+ 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd,
+ 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad,
+ 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01,
+ 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46,
+ 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d,
+ 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f,
+ 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6,
+ 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc,
+ 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd,
+ 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24,
+ 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08,
+ 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5,
+ 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd,
+ 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43,
+ 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc,
+ 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51,
+ 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c,
+ 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5,
+ 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f,
+ 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88,
+ 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d,
+ 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60,
+ 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e,
+ 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b,
+ 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 0xbc,
+ 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0,
+ 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04,
+ 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98,
+ 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd,
+ 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68,
+ 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83,
+ 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a,
+ 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d,
+ 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f,
+ 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21,
+ 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93,
+ 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d,
+ 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6,
+ 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea,
+ 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3,
+ 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e,
+ 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1,
+ 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15,
+ 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c,
+ 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d,
+ 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5,
+ 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35,
+ 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf,
+ 0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd,
+ 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19,
+ 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d,
+ 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5,
+ 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd,
+ 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85,
+ 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c,
+ 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88,
+ 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd,
+ 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d,
+ 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15,
+ 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb,
+ 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd,
+ 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15,
+ 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2,
+ 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d,
+ 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e,
+ 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51,
+ 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7,
+ 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2,
+ 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd,
+ 0x8c, 0xfc, 0xca, 0xbc, 0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa,
+ 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca,
+ 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c,
+ 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe,
+ 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a,
+ 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa,
+ 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e,
+ 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d,
+ 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f,
+ 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf,
+ 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57,
+ 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d,
+ 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31,
+ 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24,
+ 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03,
+ 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d,
+ 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85,
+ 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47,
+ 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7,
+ 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe,
+ 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11,
+ 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75,
+ 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61,
+ 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d,
+ 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a,
+ 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f,
+ 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa,
+ 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc,
+ 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35,
+ 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57,
+ 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07,
+ 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d,
+ 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92,
+ 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01,
+ 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f,
+ 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d,
+ 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89,
+ 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e,
+ 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff,
+ 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d,
+ 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63,
+ 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d,
+ 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6,
+ 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc,
+ 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 0xe2, 0x4f, 0xbd, 0x54,
+ 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39,
+ 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56,
+ 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd,
+ 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51,
+ 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75,
+ 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad,
+ 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc,
+ 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5,
+ 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a,
+ 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5,
+ 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd,
+ 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63,
+ 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2,
+ 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5,
+ 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd,
+ 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e,
+ 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6,
+ 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67,
+ 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d,
+ 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff,
+ 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf,
+ 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b,
+ 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d,
+ 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0,
+ 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f,
+ 0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7,
+ 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b,
+ 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0,
+ 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93,
+ 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2,
+ 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d,
+ 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24,
+ 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53,
+ 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16,
+ 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc,
+ 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72,
+ 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6,
+ 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb,
+ 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd,
+ 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0,
+ 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42,
+ 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f,
+ 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb,
+ 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43,
+ 0x2a, 0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8,
+ 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4,
+ 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d,
+ 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7,
+ 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb,
+ 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef,
+ 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c,
+ 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea,
+ 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22,
+ 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8,
+ 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb,
+ 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e,
+ 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45,
+ 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2,
+ 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e,
+ 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25,
+ 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec,
+ 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53,
+ 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd,
+ 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7,
+ 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd,
+ 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76,
+ 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d,
+ 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0,
+ 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed,
+ 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7,
+ 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d,
+ 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3,
+ 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf,
+ 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21,
+ 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc,
+ 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89,
+ 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe,
+ 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0,
+ 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d,
+ 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3,
+ 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36,
+ 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29,
+ 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d,
+ 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e,
+ 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75,
+ 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca,
+ 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb,
+ 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63,
+ 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a,
+ 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f,
+ 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc,
+ 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41,
+ 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63,
+ 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78,
+ 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd,
+ 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4,
+ 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4,
+ 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8,
+ 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd,
+ 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09,
+ 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11,
+ 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7,
+ 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c,
+ 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4,
+ 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc,
+ 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a,
+ 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb,
+ 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2,
+ 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d,
+ 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97,
+ 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d,
+ 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2,
+ 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e,
+ 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46,
+ 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d,
+ 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea,
+ 0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0,
+ 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f,
+ 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c,
+ 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6,
+ 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01,
+ 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce,
+ 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c,
+ 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5,
+ 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15,
+ 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16,
+ 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc,
+ 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b,
+ 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d,
+ 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe,
+ 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc,
+ 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9,
+ 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 0x88, 0x2f,
+ 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93,
+ 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc,
+ 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87,
+ 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f,
+ 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90,
+ 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c,
+ 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3,
+ 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16,
+ 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f,
+ 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d,
+ 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1,
+ 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87,
+ 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5,
+ 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd,
+ 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac,
+ 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95,
+ 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c,
+ 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd,
+ 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba,
+ 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1,
+ 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c,
+ 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d,
+ 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90,
+ 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23,
+ 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e,
+ 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd,
+ 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32,
+ 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b,
+ 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b,
+ 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d,
+ 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b,
+ 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17,
+ 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5,
+ 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc,
+ 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e,
+ 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9,
+ 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92,
+ 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d,
+ 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76,
+ 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28,
+ 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32,
+ 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd,
+ 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2,
+ 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2,
+ 0x23, 0x3d, 0x17, 0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08,
+ 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc,
+ 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f,
+ 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b,
+ 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16,
+ 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc,
+ 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9,
+ 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef,
+ 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34,
+ 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d,
+ 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf,
+ 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6,
+ 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde,
+ 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd,
+ 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec,
+ 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78,
+ 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1,
+ 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe,
+ 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79,
+ 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e,
+ 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa,
+ 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd,
+ 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89,
+ 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d,
+ 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24,
+ 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd,
+ 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81,
+ 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c,
+ 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41,
+ 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d,
+ 0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20,
+ 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5,
+ 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c,
+ 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d,
+ 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed,
+ 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf,
+ 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9,
+ 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd,
+ 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e,
+ 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb,
+ 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d,
+ 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc,
+ 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96,
+ 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b,
+ 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 0x12, 0x3d, 0x14, 0x07, 0x96,
+ 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d,
+ 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07,
+ 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6,
+ 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1,
+ 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d,
+ 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e,
+ 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88,
+ 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0,
+ 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc,
+ 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60,
+ 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3,
+ 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce,
+ 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c,
+ 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e,
+ 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2,
+ 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00,
+ 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c,
+ 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49,
+ 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24,
+ 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5,
+ 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d,
+ 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d,
+ 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01,
+ 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00,
+ 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b,
+ 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0,
+ 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b,
+ 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f,
+ 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d,
+ 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b,
+ 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66,
+ 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48,
+ 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd,
+ 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26,
+ 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23,
+ 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc,
+ 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d,
+ 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6,
+ 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18,
+ 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a,
+ 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc,
+ 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05,
+ 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58,
+ 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00,
+ 0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd,
+ 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81,
+ 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac,
+ 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0,
+ 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc,
+ 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27,
+ 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d,
+ 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c,
+ 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d,
+ 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91,
+ 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e,
+ 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8,
+ 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc,
+ 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d,
+ 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7,
+ 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e,
+ 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd,
+ 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d,
+ 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33,
+ 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42,
+ 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d,
+ 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9,
+ 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45,
+ 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2,
+ 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd,
+ 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7,
+ 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90,
+ 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06,
+ 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d,
+ 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64,
+ 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1,
+ 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7,
+ 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d,
+ 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e,
+ 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69,
+ 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64,
+ 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd,
+ 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a,
+ 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8,
+ 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d,
+ 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc,
+ 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38,
+ 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f,
+ 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb,
+ 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb,
+ 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65,
+ 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d,
+ 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1,
+ 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd,
+ 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0,
+ 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a,
+ 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce,
+ 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe,
+ 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10,
+ 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f,
+ 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf,
+ 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd,
+ 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8,
+ 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49,
+ 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f,
+ 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c,
+ 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c,
+ 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99,
+ 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03,
+ 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d,
+ 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a,
+ 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c,
+ 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2,
+ 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd,
+ 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe,
+ 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78,
+ 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b,
+ 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c,
+ 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88,
+ 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26,
+ 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb,
+ 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe,
+ 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5,
+ 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc,
+ 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c,
+ 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd,
+ 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41,
+ 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55,
+ 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96,
+ 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d,
+ 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8,
+ 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5,
+ 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18,
+ 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 0x72, 0xf3, 0x3d,
+ 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44,
+ 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1,
+ 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0,
+ 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d,
+ 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52,
+ 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45,
+ 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b,
+ 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d,
+ 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21,
+ 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1,
+ 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf,
+ 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d,
+ 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68,
+ 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b,
+ 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09,
+ 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d,
+ 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8,
+ 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24,
+ 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd,
+ 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd,
+ 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44,
+ 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86,
+ 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26,
+ 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd,
+ 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6,
+ 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13,
+ 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14,
+ 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d,
+ 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec,
+ 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4,
+ 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99,
+ 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d,
+ 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3,
+ 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06,
+ 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac,
+ 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd,
+ 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5,
+ 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63,
+ 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb,
+ 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd,
+ 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a,
+ 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e,
+ 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6,
+ 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d,
+ 0x14, 0xc4, 0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42,
+ 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d,
+ 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1,
+ 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd,
+ 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4,
+ 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a,
+ 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4,
+ 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d,
+ 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9,
+ 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12,
+ 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93,
+ 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c,
+ 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09,
+ 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42,
+ 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b,
+ 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d,
+ 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92,
+ 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65,
+ 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01,
+ 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d,
+ 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33,
+ 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4,
+ 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc,
+ 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc,
+ 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37,
+ 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27,
+ 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96,
+ 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe,
+ 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92,
+ 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f,
+ 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3,
+ 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd,
+ 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe,
+ 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe,
+ 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75,
+ 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd,
+ 0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca,
+ 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d,
+ 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3,
+ 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb,
+ 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc,
+ 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73,
+ 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab,
+ 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c,
+ 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c,
+ 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27,
+ 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee,
+ 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8,
+ 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45,
+ 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab,
+ 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b,
+ 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c,
+ 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2,
+ 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7,
+ 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40,
+ 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd,
+ 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf,
+ 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a,
+ 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31,
+ 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d,
+ 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c,
+ 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5,
+ 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3,
+ 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d,
+ 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b,
+ 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12,
+ 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14,
+ 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b,
+ 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21,
+ 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb,
+ 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85,
+ 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d,
+ 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e,
+ 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e,
+ 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3,
+ 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb,
+ 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18,
+ 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b,
+ 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45,
+ 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d,
+ 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62,
+ 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46,
+ 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95,
+ 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d,
+ 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d,
+ 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4,
+ 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12,
+ 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd,
+ 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 0x94,
+ 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25,
+ 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1,
+ 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b,
+ 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23,
+ 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24,
+ 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe,
+ 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c,
+ 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39,
+ 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63,
+ 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50,
+ 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd,
+ 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50,
+ 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97,
+ 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96,
+ 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e,
+ 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b,
+ 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65,
+ 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf,
+ 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd,
+ 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0,
+ 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83,
+ 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b,
+ 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd,
+ 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf,
+ 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46,
+ 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30,
+ 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d,
+ 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3,
+ 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14,
+ 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b,
+ 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd,
+ 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda,
+ 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b,
+ 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84,
+ 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd,
+ 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28,
+ 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34,
+ 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42,
+ 0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c,
+ 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed,
+ 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9,
+ 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab,
+ 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d,
+ 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57,
+ 0xf4, 0xdc, 0x3d, 0x25, 0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7,
+ 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb,
+ 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd,
+ 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32,
+ 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6,
+ 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79,
+ 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd,
+ 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b,
+ 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58,
+ 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf,
+ 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d,
+ 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc,
+ 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb,
+ 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24,
+ 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd,
+ 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4,
+ 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8,
+ 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96,
+ 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d,
+ 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62,
+ 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e,
+ 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4,
+ 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc,
+ 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf,
+ 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54,
+ 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1,
+ 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d,
+ 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d,
+ 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a,
+ 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84,
+ 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d,
+ 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07,
+ 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5,
+ 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba,
+ 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d,
+ 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f,
+ 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b,
+ 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a,
+ 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb,
+ 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b,
+ 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6,
+ 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86,
+ 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d,
+ 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83,
+ 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 0xd5, 0x3c, 0x25, 0xec,
+ 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91,
+ 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c,
+ 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89,
+ 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd,
+ 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b,
+ 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c,
+ 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80,
+ 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1,
+ 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9,
+ 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c,
+ 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6,
+ 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f,
+ 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97,
+ 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd,
+ 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3,
+ 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb,
+ 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca,
+ 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba,
+ 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7,
+ 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1,
+ 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20,
+ 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b,
+ 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda,
+ 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27,
+ 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82,
+ 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd,
+ 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d,
+ 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c,
+ 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70,
+ 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc,
+ 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e,
+ 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80,
+ 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4,
+ 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d,
+ 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf,
+ 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d,
+ 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73,
+ 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d,
+ 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9,
+ 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae,
+ 0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf,
+ 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc,
+ 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81,
+ 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27,
+ 0xa9, 0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51,
+ 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd,
+ 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57,
+ 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b,
+ 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20,
+ 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d,
+ 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4,
+ 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71,
+ 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01,
+ 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d,
+ 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e,
+ 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d,
+ 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84,
+ 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e,
+ 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5,
+ 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3,
+ 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d,
+ 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd,
+ 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98,
+ 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28,
+ 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f,
+ 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e,
+ 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b,
+ 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba,
+ 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1,
+ 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d,
+ 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88,
+ 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6,
+ 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0,
+ 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd,
+ 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6,
+ 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33,
+ 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09,
+ 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d,
+ 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4,
+ 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff,
+ 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02,
+ 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc,
+ 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12,
+ 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54,
+ 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23,
+ 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d,
+ 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d,
+ 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d,
+ 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde,
+ 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd,
+ 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9,
+ 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c,
+ 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89,
+ 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc,
+ 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4,
+ 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98,
+ 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e,
+ 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d,
+ 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52,
+ 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95,
+ 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5,
+ 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d,
+ 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea,
+ 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba,
+ 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32,
+ 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd,
+ 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55,
+ 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde,
+ 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83,
+ 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc,
+ 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae,
+ 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2,
+ 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6,
+ 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d,
+ 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85,
+ 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0,
+ 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6,
+ 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d,
+ 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a,
+ 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b,
+ 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12,
+ 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b,
+ 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec,
+ 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f,
+ 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59,
+ 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d,
+ 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71,
+ 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c,
+ 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53,
+ 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd,
+ 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3,
+ 0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc,
+ 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 0x69, 0xa1,
+ 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d,
+ 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb,
+ 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e,
+ 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae,
+ 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc,
+ 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab,
+ 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07,
+ 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8,
+ 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d,
+ 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29,
+ 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a,
+ 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d,
+ 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d,
+ 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa,
+ 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24,
+ 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9,
+ 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d,
+ 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87,
+ 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9,
+ 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab,
+ 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd,
+ 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b,
+ 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7,
+ 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7,
+ 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd,
+ 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0,
+ 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04,
+ 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c,
+ 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d,
+ 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f,
+ 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b,
+ 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7,
+ 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d,
+ 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e,
+ 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91,
+ 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e,
+ 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd,
+ 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88,
+ 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a,
+ 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9,
+ 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d,
+ 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13,
+ 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc,
+ 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80,
+ 0xbd, 0x65, 0x2e, 0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d,
+ 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca,
+ 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf,
+ 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab,
+ 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d,
+ 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06,
+ 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06,
+ 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc,
+ 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc,
+ 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d,
+ 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4,
+ 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea,
+ 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d,
+ 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2,
+ 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94,
+ 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6,
+ 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd,
+ 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d,
+ 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83,
+ 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf,
+ 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c,
+ 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb,
+ 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0,
+ 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4,
+ 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e,
+ 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a,
+ 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95,
+ 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5,
+ 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c,
+ 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13,
+ 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66,
+ 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d,
+ 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d,
+ 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3,
+ 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9,
+ 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a,
+ 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d,
+ 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48,
+ 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f,
+ 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8,
+ 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d,
+ 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95,
+ 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79,
+ 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88,
+ 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 0x3d, 0xb4, 0xe9, 0x59, 0x3d,
+ 0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca,
+ 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02,
+ 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef,
+ 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd,
+ 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9,
+ 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d,
+ 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f,
+ 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd,
+ 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1,
+ 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37,
+ 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02,
+ 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd,
+ 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0,
+ 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42,
+ 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2,
+ 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d,
+ 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda,
+ 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed,
+ 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70,
+ 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd,
+ 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b,
+ 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e,
+ 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29,
+ 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d,
+ 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc,
+ 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3,
+ 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a,
+ 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd,
+ 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9,
+ 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3,
+ 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10,
+ 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd,
+ 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8,
+ 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4,
+ 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a,
+ 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd,
+ 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe,
+ 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84,
+ 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83,
+ 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd,
+ 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd,
+ 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36,
+ 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83,
+ 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd,
+ 0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25,
+ 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc,
+ 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a,
+ 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd,
+ 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00,
+ 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9,
+ 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43,
+ 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb,
+ 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35,
+ 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2,
+ 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a,
+ 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d,
+ 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe,
+ 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc,
+ 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88,
+ 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d,
+ 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f,
+ 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40,
+ 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f,
+ 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe,
+ 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80,
+ 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92,
+ 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50,
+ 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e,
+ 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad,
+ 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b,
+ 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9,
+ 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd,
+ 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74,
+ 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5,
+ 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab,
+ 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d,
+ 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb,
+ 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f,
+ 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b,
+ 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e,
+ 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1,
+ 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a,
+ 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f,
+ 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d,
+ 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81,
+ 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb,
+ 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3,
+ 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d,
+ 0xf1, 0xf8, 0x16, 0x3e, 0x24, 0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d,
+ 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4,
+ 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb,
+ 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe,
+ 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc,
+ 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4,
+ 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7,
+ 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d,
+ 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20,
+ 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92,
+ 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5,
+ 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d,
+ 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34,
+ 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43,
+ 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13,
+ 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d,
+ 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20,
+ 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea,
+ 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98,
+ 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e,
+ 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f,
+ 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88,
+ 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b,
+ 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e,
+ 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01,
+ 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c,
+ 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e,
+ 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe,
+ 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8,
+ 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd,
+ 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e,
+ 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd,
+ 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f,
+ 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0,
+ 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b,
+ 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d,
+ 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e,
+ 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa,
+ 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde,
+ 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe,
+ 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda,
+ 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f,
+ 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27,
+ 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd,
+ 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 0x0f, 0xbe, 0x21,
+ 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0,
+ 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b,
+ 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe,
+ 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28,
+ 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0,
+ 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc,
+ 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd,
+ 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37,
+ 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2,
+ 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b,
+ 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e,
+ 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54,
+ 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6,
+ 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2,
+ 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d,
+ 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89,
+ 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2,
+ 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0,
+ 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d,
+ 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c,
+ 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa,
+ 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82,
+ 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd,
+ 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f,
+ 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0,
+ 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5,
+ 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe,
+ 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9,
+ 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b,
+ 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea,
+ 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc,
+ 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6,
+ 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49,
+ 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12,
+ 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e,
+ 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80,
+ 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33,
+ 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22,
+ 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe,
+ 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68,
+ 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64,
+ 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0,
+ 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e,
+ 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97,
+ 0x05, 0x1c, 0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30,
+ 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c,
+ 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d,
+ 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51,
+ 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb,
+ 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8,
+ 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d,
+ 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21,
+ 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a,
+ 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf,
+ 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e,
+ 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86,
+ 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78,
+ 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99,
+ 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb,
+ 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37,
+ 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb,
+ 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95,
+ 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d,
+ 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1,
+ 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6,
+ 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46,
+ 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd,
+ 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83,
+ 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84,
+ 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23,
+ 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d,
+ 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde,
+ 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5,
+ 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07,
+ 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d,
+ 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4,
+ 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81,
+ 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76,
+ 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e,
+ 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93,
+ 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7,
+ 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21,
+ 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc,
+ 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1,
+ 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3,
+ 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b,
+ 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd,
+ 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05,
+ 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90,
+ 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c,
+ 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd,
+ 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9,
+ 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9,
+ 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7,
+ 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d,
+ 0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3,
+ 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6,
+ 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29,
+ 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd,
+ 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9,
+ 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31,
+ 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff,
+ 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d,
+ 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a,
+ 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f,
+ 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b,
+ 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d,
+ 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12,
+ 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb,
+ 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d,
+ 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd,
+ 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53,
+ 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01,
+ 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce,
+ 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe,
+ 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d,
+ 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07,
+ 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a,
+ 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb,
+ 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48,
+ 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90,
+ 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7,
+ 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d,
+ 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11,
+ 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26,
+ 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c,
+ 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e,
+ 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a,
+ 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e,
+ 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48,
+ 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c,
+ 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8,
+ 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 0x87,
+ 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a,
+ 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd,
+ 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d,
+ 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84,
+ 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26,
+ 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe,
+ 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce,
+ 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8,
+ 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b,
+ 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd,
+ 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74,
+ 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18,
+ 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde,
+ 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd,
+ 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78,
+ 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce,
+ 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda,
+ 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d,
+ 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a,
+ 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84,
+ 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6,
+ 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d,
+ 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0,
+ 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d,
+ 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9,
+ 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd,
+ 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17,
+ 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38,
+ 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20,
+ 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd,
+ 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d,
+ 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5,
+ 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5,
+ 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe,
+ 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27,
+ 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde,
+ 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab,
+ 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd,
+ 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7,
+ 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24,
+ 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01,
+ 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd,
+ 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50,
+ 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c,
+ 0x76, 0x3b, 0x99, 0x36, 0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39,
+ 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a,
+ 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5,
+ 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74,
+ 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4,
+ 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d,
+ 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c,
+ 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a,
+ 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f,
+ 0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d,
+ 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16,
+ 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a,
+ 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09,
+ 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e,
+ 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74,
+ 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1,
+ 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10,
+ 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb,
+ 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e,
+ 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5,
+ 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4,
+ 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe,
+ 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc,
+ 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29,
+ 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e,
+ 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c,
+ 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b,
+ 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4,
+ 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe,
+ 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a,
+ 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4,
+ 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf,
+ 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25,
+ 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d,
+ 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19,
+ 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1,
+ 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d,
+ 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d,
+ 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef,
+ 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34,
+ 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75,
+ 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc,
+ 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40,
+ 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62,
+ 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 0x3a, 0x54, 0x99, 0xea,
+ 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd,
+ 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20,
+ 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4,
+ 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1,
+ 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e,
+ 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a,
+ 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b,
+ 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9,
+ 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe,
+ 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48,
+ 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9,
+ 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88,
+ 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe,
+ 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53,
+ 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd,
+ 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4,
+ 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e,
+ 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33,
+ 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b,
+ 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd,
+ 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd,
+ 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81,
+ 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2,
+ 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45,
+ 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c,
+ 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68,
+ 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83,
+ 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe,
+ 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c,
+ 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f,
+ 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe,
+ 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e,
+ 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12,
+ 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7,
+ 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01,
+ 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c,
+ 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8,
+ 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83,
+ 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30,
+ 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc,
+ 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20,
+ 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd,
+ 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12,
+ 0x3b, 0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb,
+ 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa,
+ 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d,
+ 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe,
+ 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c,
+ 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1,
+ 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c,
+ 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29,
+ 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd,
+ 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7,
+ 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97,
+ 0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f,
+ 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b,
+ 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79,
+ 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9,
+ 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b,
+ 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd,
+ 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0,
+ 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c,
+ 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c,
+ 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd,
+ 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91,
+ 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4,
+ 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83,
+ 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c,
+ 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95,
+ 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74,
+ 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb,
+ 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd,
+ 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c,
+ 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1,
+ 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64,
+ 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c,
+ 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4,
+ 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06,
+ 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e,
+ 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c,
+ 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0,
+ 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79,
+ 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d,
+ 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd,
+ 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68,
+ 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d,
+ 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d,
+ 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c,
+ 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95,
+ 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6,
+ 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03,
+ 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d,
+ 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8,
+ 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b,
+ 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66,
+ 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd,
+ 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc,
+ 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1,
+ 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f,
+ 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c,
+ 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1,
+ 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab,
+ 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26,
+ 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd,
+ 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47,
+ 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf,
+ 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a,
+ 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd,
+ 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e,
+ 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68,
+ 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47,
+ 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd,
+ 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4,
+ 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2,
+ 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c,
+ 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c,
+ 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde,
+ 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6,
+ 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed,
+ 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc,
+ 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd,
+ 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf,
+ 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58,
+ 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c,
+ 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48,
+ 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f,
+ 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52,
+ 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd,
+ 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9,
+ 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90,
+ 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8,
+ 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 0xe3, 0xbc,
+ 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4,
+ 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d,
+ 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09,
+ 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a,
+ 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d,
+ 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a,
+ 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44,
+ 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc,
+ 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55,
+ 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59,
+ 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62,
+ 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c,
+ 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34,
+ 0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4,
+ 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f,
+ 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d,
+ 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71,
+ 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a,
+ 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79,
+ 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d,
+ 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3,
+ 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e,
+ 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80,
+ 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d,
+ 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd,
+ 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43,
+ 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb,
+ 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d,
+ 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61,
+ 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf,
+ 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80,
+ 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd,
+ 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54,
+ 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde,
+ 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43,
+ 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b,
+ 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8,
+ 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c,
+ 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd,
+ 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba,
+ 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57,
+ 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4,
+ 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c,
+ 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd,
+ 0x68, 0xc2, 0x24, 0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43,
+ 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3,
+ 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70,
+ 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd,
+ 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41,
+ 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3,
+ 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80,
+ 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c,
+ 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a,
+ 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a,
+ 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8,
+ 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba,
+ 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc,
+ 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5,
+ 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b,
+ 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d,
+ 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2,
+ 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b,
+ 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91,
+ 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c,
+ 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52,
+ 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a,
+ 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1,
+ 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d,
+ 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97,
+ 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce,
+ 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01,
+ 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d,
+ 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10,
+ 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09,
+ 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78,
+ 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d,
+ 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98,
+ 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17,
+ 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f,
+ 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc,
+ 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8,
+ 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62,
+ 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e,
+ 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd,
+ 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42,
+ 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3,
+ 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb,
+ 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d,
+ 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 0xf8, 0x65, 0xdd, 0x3b, 0x1c,
+ 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4,
+ 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10,
+ 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd,
+ 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d,
+ 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9,
+ 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83,
+ 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d,
+ 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b,
+ 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0,
+ 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06,
+ 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd,
+ 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc,
+ 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1,
+ 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c,
+ 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c,
+ 0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc,
+ 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57,
+ 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f,
+ 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d,
+ 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f,
+ 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6,
+ 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23,
+ 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc,
+ 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd,
+ 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1,
+ 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c,
+ 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d,
+ 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4,
+ 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d,
+ 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6,
+ 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc,
+ 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30,
+ 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f,
+ 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b,
+ 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd,
+ 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02,
+ 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3,
+ 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18,
+ 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c,
+ 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84,
+ 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99,
+ 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9,
+ 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb,
+ 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa,
+ 0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f,
+ 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20,
+ 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d,
+ 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd,
+ 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53,
+ 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9,
+ 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc,
+ 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68,
+ 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a,
+ 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56,
+ 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd,
+ 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89,
+ 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47,
+ 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66,
+ 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d,
+ 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e,
+ 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09,
+ 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85,
+ 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d,
+ 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd,
+ 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87,
+ 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3,
+ 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb,
+ 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11,
+ 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91,
+ 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c,
+ 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d,
+ 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c,
+ 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b,
+ 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f,
+ 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc,
+ 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4,
+ 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b,
+ 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91,
+ 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb,
+ 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29,
+ 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90,
+ 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde,
+ 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d,
+ 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd,
+ 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41,
+ 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51,
+ 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd,
+ 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58,
+ 0x94, 0x53, 0xbd, 0xa0, 0xcf, 0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40,
+ 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49,
+ 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd,
+ 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75,
+ 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56,
+ 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05,
+ 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd,
+ 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9,
+ 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64,
+ 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90,
+ 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b,
+ 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e,
+ 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf,
+ 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0,
+ 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b,
+ 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1,
+ 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6,
+ 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96,
+ 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c,
+ 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9,
+ 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59,
+ 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99,
+ 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb,
+ 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70,
+ 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4,
+ 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3,
+ 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba,
+ 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd,
+ 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7,
+ 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87,
+ 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd,
+ 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb,
+ 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22,
+ 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70,
+ 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd,
+ 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf,
+ 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b,
+ 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7,
+ 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c,
+ 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40,
+ 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2,
+ 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d,
+ 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c,
+ 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84,
+ 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 0x3d, 0x90, 0xf8,
+ 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65,
+ 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc,
+ 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5,
+ 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb,
+ 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83,
+ 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d,
+ 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2,
+ 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc,
+ 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d,
+ 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc,
+ 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e,
+ 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83,
+ 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29,
+ 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc,
+ 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd,
+ 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06,
+ 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e,
+ 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d,
+ 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a,
+ 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf,
+ 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9,
+ 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb,
+ 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31,
+ 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e,
+ 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7,
+ 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd,
+ 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90,
+ 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc,
+ 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59,
+ 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd,
+ 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10,
+ 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8,
+ 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f,
+ 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc,
+ 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0,
+ 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec,
+ 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6,
+ 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c,
+ 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1,
+ 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7,
+ 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44,
+ 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc,
+ 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0,
+ 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54,
+ 0x10, 0xbd, 0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d,
+ 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d,
+ 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4,
+ 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98,
+ 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f,
+ 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b,
+ 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9,
+ 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4,
+ 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd,
+ 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d,
+ 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c,
+ 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02,
+ 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75,
+ 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd,
+ 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40,
+ 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e,
+ 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50,
+ 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c,
+ 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93,
+ 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e,
+ 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47,
+ 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d,
+ 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88,
+ 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4,
+ 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15,
+ 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd,
+ 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2,
+ 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26,
+ 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53,
+ 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd,
+ 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8,
+ 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3,
+ 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a,
+ 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd,
+ 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16,
+ 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf,
+ 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b,
+ 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd,
+ 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c,
+ 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15,
+ 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83,
+ 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b,
+ 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53,
+ 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f,
+ 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09,
+ 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c,
+ 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6,
+ 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb,
+ 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d,
+ 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c,
+ 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c,
+ 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22,
+ 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82,
+ 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d,
+ 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb,
+ 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6,
+ 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92,
+ 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc,
+ 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e,
+ 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81,
+ 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3,
+ 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c,
+ 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80,
+ 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4,
+ 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b,
+ 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd,
+ 0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f,
+ 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f,
+ 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00,
+ 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd,
+ 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46,
+ 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3,
+ 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85,
+ 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d,
+ 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb,
+ 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe,
+ 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7,
+ 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d,
+ 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84,
+ 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e,
+ 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c,
+ 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c,
+ 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11,
+ 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f,
+ 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e,
+ 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd,
+ 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a,
+ 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d,
+ 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 0x30,
+ 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd,
+ 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76,
+ 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a,
+ 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46,
+ 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc,
+ 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a,
+ 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a,
+ 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb,
+ 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c,
+ 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2,
+ 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4,
+ 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34,
+ 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd,
+ 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9,
+ 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46,
+ 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86,
+ 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc,
+ 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46,
+ 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e,
+ 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f,
+ 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd,
+ 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c,
+ 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79,
+ 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48,
+ 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c,
+ 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f,
+ 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8,
+ 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88,
+ 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd,
+ 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a,
+ 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc,
+ 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71,
+ 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd,
+ 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3,
+ 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18,
+ 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28,
+ 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc,
+ 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47,
+ 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67,
+ 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45,
+ 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c,
+ 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02,
+ 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11,
+ 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96,
+ 0xbd, 0xa4, 0xbe, 0x57, 0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb,
+ 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e,
+ 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8,
+ 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76,
+ 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d,
+ 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2,
+ 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8,
+ 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe,
+ 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c,
+ 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d,
+ 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb,
+ 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05,
+ 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd,
+ 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58,
+ 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c,
+ 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9,
+ 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9,
+ 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2,
+ 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7,
+ 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd,
+ 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc,
+ 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9,
+ 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf,
+ 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89,
+ 0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d,
+ 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e,
+ 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e,
+ 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c,
+ 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd,
+ 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf,
+ 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b,
+ 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d,
+ 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd,
+ 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54,
+ 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe,
+ 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c,
+ 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd,
+ 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf,
+ 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f,
+ 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62,
+ 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c,
+ 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a,
+ 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8,
+ 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c,
+ 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 0xdc, 0xa1, 0x10, 0xbd,
+ 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75,
+ 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c,
+ 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35,
+ 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc,
+ 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65,
+ 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd,
+ 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b,
+ 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb,
+ 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05,
+ 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9,
+ 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69,
+ 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd,
+ 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96,
+ 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d,
+ 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe,
+ 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc,
+ 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2,
+ 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63,
+ 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4,
+ 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb,
+ 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62,
+ 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d,
+ 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a,
+ 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d,
+ 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27,
+ 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf,
+ 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a,
+ 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd,
+ 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27,
+ 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b,
+ 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e,
+ 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c,
+ 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4,
+ 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6,
+ 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59,
+ 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c,
+ 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8,
+ 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f,
+ 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42,
+ 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d,
+ 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69,
+ 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04,
+ 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77,
+ 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd,
+ 0x8b, 0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5,
+ 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69,
+ 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60,
+ 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc,
+ 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40,
+ 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e,
+ 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09,
+ 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc,
+ 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28,
+ 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65,
+ 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa,
+ 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd,
+ 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12,
+ 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25,
+ 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e,
+ 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd,
+ 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b,
+ 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7,
+ 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92,
+ 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d,
+ 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8,
+ 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4,
+ 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee,
+ 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd,
+ 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4,
+ 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35,
+ 0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f,
+ 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d,
+ 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a,
+ 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5,
+ 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8,
+ 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d,
+ 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67,
+ 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c,
+ 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54,
+ 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d,
+ 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18,
+ 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a,
+ 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23,
+ 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd,
+ 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde,
+ 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c,
+ 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c,
+ 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d,
+ 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e,
+ 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a,
+ 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4,
+ 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c,
+ 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5,
+ 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0,
+ 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a,
+ 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d,
+ 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e,
+ 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71,
+ 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91,
+ 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b,
+ 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62,
+ 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57,
+ 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3,
+ 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd,
+ 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c,
+ 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45,
+ 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45,
+ 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b,
+ 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d,
+ 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac,
+ 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94,
+ 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d,
+ 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e,
+ 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20,
+ 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95,
+ 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd,
+ 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94,
+ 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b,
+ 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad,
+ 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c,
+ 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba,
+ 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11,
+ 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb,
+ 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd,
+ 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10,
+ 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc,
+ 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15,
+ 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c,
+ 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34,
+ 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12,
+ 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0,
+ 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d,
+ 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 0xbc, 0x78,
+ 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03,
+ 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84,
+ 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd,
+ 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9,
+ 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1,
+ 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91,
+ 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb,
+ 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70,
+ 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa,
+ 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96,
+ 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c,
+ 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d,
+ 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13,
+ 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56,
+ 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c,
+ 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa,
+ 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c,
+ 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6,
+ 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd,
+ 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80,
+ 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20,
+ 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef,
+ 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd,
+ 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38,
+ 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82,
+ 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a,
+ 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd,
+ 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3,
+ 0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6,
+ 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94,
+ 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd,
+ 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3,
+ 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99,
+ 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52,
+ 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd,
+ 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30,
+ 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f,
+ 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3,
+ 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d,
+ 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17,
+ 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b,
+ 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b,
+ 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d,
+ 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a,
+ 0x60, 0x70, 0x3d, 0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83,
+ 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e,
+ 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd,
+ 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02,
+ 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b,
+ 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59,
+ 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd,
+ 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf,
+ 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49,
+ 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff,
+ 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c,
+ 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee,
+ 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e,
+ 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20,
+ 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d,
+ 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87,
+ 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94,
+ 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c,
+ 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb,
+ 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42,
+ 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2,
+ 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b,
+ 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd,
+ 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c,
+ 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a,
+ 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54,
+ 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd,
+ 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6,
+ 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47,
+ 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f,
+ 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c,
+ 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee,
+ 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1,
+ 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45,
+ 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd,
+ 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68,
+ 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97,
+ 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9,
+ 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd,
+ 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8,
+ 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae,
+ 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f,
+ 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc,
+ 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea,
+ 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 0xd6, 0x6a, 0x3c, 0xe0, 0x3d,
+ 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3,
+ 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd,
+ 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0,
+ 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33,
+ 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08,
+ 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d,
+ 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00,
+ 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf,
+ 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1,
+ 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb,
+ 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca,
+ 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc,
+ 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03,
+ 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd,
+ 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c,
+ 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a,
+ 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe,
+ 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c,
+ 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0,
+ 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92,
+ 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a,
+ 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b,
+ 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8,
+ 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14,
+ 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e,
+ 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd,
+ 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50,
+ 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b,
+ 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63,
+ 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd,
+ 0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29,
+ 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97,
+ 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5,
+ 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c,
+ 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a,
+ 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17,
+ 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f,
+ 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd,
+ 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48,
+ 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b,
+ 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10,
+ 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd,
+ 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31,
+ 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d,
+ 0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24,
+ 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c,
+ 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6,
+ 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6,
+ 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45,
+ 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd,
+ 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e,
+ 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e,
+ 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83,
+ 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc,
+ 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0,
+ 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28,
+ 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f,
+ 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd,
+ 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4,
+ 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e,
+ 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29,
+ 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c,
+ 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8,
+ 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41,
+ 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f,
+ 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd,
+ 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0,
+ 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf,
+ 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84,
+ 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd,
+ 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0,
+ 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6,
+ 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb,
+ 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd,
+ 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2,
+ 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05,
+ 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47,
+ 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d,
+ 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8,
+ 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a,
+ 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf,
+ 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c,
+ 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae,
+ 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f,
+ 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06,
+ 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c,
+ 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0,
+ 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f,
+ 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18,
+ 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c,
+ 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a,
+ 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8,
+ 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b,
+ 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd,
+ 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0,
+ 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b,
+ 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a,
+ 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd,
+ 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59,
+ 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16,
+ 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08,
+ 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d,
+ 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2,
+ 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f,
+ 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33,
+ 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd,
+ 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc,
+ 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27,
+ 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3,
+ 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c,
+ 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c,
+ 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e,
+ 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd,
+ 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c,
+ 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6,
+ 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04,
+ 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21,
+ 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d,
+ 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0,
+ 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78,
+ 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18,
+ 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd,
+ 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66,
+ 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19,
+ 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef,
+ 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a,
+ 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64,
+ 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf,
+ 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a,
+ 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c,
+ 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4,
+ 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a,
+ 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 0x3b, 0x40, 0x91,
+ 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc,
+ 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28,
+ 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1,
+ 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67,
+ 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d,
+ 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87,
+ 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5,
+ 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f,
+ 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd,
+ 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e,
+ 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1,
+ 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12,
+ 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d,
+ 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24,
+ 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86,
+ 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8,
+ 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd,
+ 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c,
+ 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12,
+ 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f,
+ 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc,
+ 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58,
+ 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e,
+ 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5,
+ 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39,
+ 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec,
+ 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a,
+ 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6,
+ 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c,
+ 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e,
+ 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa,
+ 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c,
+ 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d,
+ 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62,
+ 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94,
+ 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65,
+ 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd,
+ 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce,
+ 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a,
+ 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40,
+ 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd,
+ 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c,
+ 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62,
+ 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b,
+ 0xbd, 0xfa, 0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c,
+ 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66,
+ 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d,
+ 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68,
+ 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d,
+ 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60,
+ 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32,
+ 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc,
+ 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc,
+ 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b,
+ 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92,
+ 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a,
+ 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc,
+ 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4,
+ 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b,
+ 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6,
+ 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c,
+ 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00,
+ 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e,
+ 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c,
+ 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a,
+ 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40,
+ 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb,
+ 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a,
+ 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c,
+ 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d,
+ 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5,
+ 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a,
+ 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd,
+ 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17,
+ 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85,
+ 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21,
+ 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc,
+ 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1,
+ 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d,
+ 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f,
+ 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b,
+ 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80,
+ 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31,
+ 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08,
+ 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd,
+ 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98,
+ 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd,
+ 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e,
+ 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d,
+ 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72,
+ 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5,
+ 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f,
+ 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc,
+ 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae,
+ 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a,
+ 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f,
+ 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc,
+ 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf,
+ 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65,
+ 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40,
+ 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb,
+ 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b,
+ 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67,
+ 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee,
+ 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d,
+ 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50,
+ 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5,
+ 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90,
+ 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb,
+ 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b,
+ 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5,
+ 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05,
+ 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc,
+ 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61,
+ 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e,
+ 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84,
+ 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d,
+ 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a,
+ 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81,
+ 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87,
+ 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc,
+ 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5,
+ 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7,
+ 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79,
+ 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c,
+ 0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38,
+ 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1,
+ 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b,
+ 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c,
+ 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54,
+ 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19,
+ 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42,
+ 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 0xbd,
+ 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0,
+ 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf,
+ 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c,
+ 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c,
+ 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc,
+ 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16,
+ 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5,
+ 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc,
+ 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1,
+ 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea,
+ 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a,
+ 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d,
+ 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00,
+ 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb,
+ 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81,
+ 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c,
+ 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8,
+ 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5,
+ 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e,
+ 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c,
+ 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82,
+ 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a,
+ 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78,
+ 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd,
+ 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0,
+ 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08,
+ 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f,
+ 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b,
+ 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05,
+ 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3,
+ 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b,
+ 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd,
+ 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa,
+ 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e,
+ 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8,
+ 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb,
+ 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76,
+ 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45,
+ 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2,
+ 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba,
+ 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc,
+ 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc,
+ 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31,
+ 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc,
+ 0xce, 0x5b, 0x7a, 0xbd, 0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90,
+ 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a,
+ 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19,
+ 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc,
+ 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b,
+ 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17,
+ 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f,
+ 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc,
+ 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12,
+ 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55,
+ 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44,
+ 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc,
+ 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa,
+ 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8,
+ 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90,
+ 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd,
+ 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7,
+ 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56,
+ 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0,
+ 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d,
+ 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53,
+ 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3,
+ 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c,
+ 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd,
+ 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3,
+ 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb,
+ 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d,
+ 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d,
+ 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb,
+ 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7,
+ 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91,
+ 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd,
+ 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e,
+ 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4,
+ 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64,
+ 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d,
+ 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22,
+ 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e,
+ 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80,
+ 0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc,
+ 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88,
+ 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab,
+ 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04,
+ 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d,
+ 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 0x93, 0x8e, 0x3d, 0x9b,
+ 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3,
+ 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9,
+ 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c,
+ 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90,
+ 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b,
+ 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71,
+ 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d,
+ 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6,
+ 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78,
+ 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4,
+ 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d,
+ 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c,
+ 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e,
+ 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35,
+ 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc,
+ 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6,
+ 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36,
+ 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae,
+ 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb,
+ 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2,
+ 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad,
+ 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69,
+ 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd,
+ 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d,
+ 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b,
+ 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a,
+ 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d,
+ 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d,
+ 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf,
+ 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02,
+ 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d,
+ 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02,
+ 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f,
+ 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14,
+ 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a,
+ 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4,
+ 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5,
+ 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23,
+ 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc,
+ 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf,
+ 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0,
+ 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b,
+ 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb,
+ 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4,
+ 0xaf, 0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99,
+ 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17,
+ 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d,
+ 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93,
+ 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e,
+ 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48,
+ 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d,
+ 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e,
+ 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42,
+ 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a,
+ 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc,
+ 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e,
+ 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13,
+ 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47,
+ 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c,
+ 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58,
+ 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca,
+ 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43,
+ 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd,
+ 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23,
+ 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31,
+ 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49,
+ 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c,
+ 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50,
+ 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56,
+ 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13,
+ 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc,
+ 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37,
+ 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41,
+ 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7,
+ 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc,
+ 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42,
+ 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb,
+ 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62,
+ 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc,
+ 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa,
+ 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1,
+ 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99,
+ 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd,
+ 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70,
+ 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8,
+ 0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02,
+ 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d,
+ 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3,
+ 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41,
+ 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70,
+ 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d,
+ 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11,
+ 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc,
+ 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0,
+ 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc,
+ 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89,
+ 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c,
+ 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb,
+ 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc,
+ 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9,
+ 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3,
+ 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d,
+ 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc,
+ 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12,
+ 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37,
+ 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21,
+ 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c,
+ 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff,
+ 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39,
+ 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d,
+ 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c,
+ 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba,
+ 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d,
+ 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e,
+ 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c,
+ 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41,
+ 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95,
+ 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c,
+ 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c,
+ 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99,
+ 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8,
+ 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15,
+ 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc,
+ 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99,
+ 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e,
+ 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52,
+ 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c,
+ 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e,
+ 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05,
+ 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83,
+ 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd,
+ 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00,
+ 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 0x8b, 0x0e,
+ 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5,
+ 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd,
+ 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7,
+ 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07,
+ 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a,
+ 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd,
+ 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3,
+ 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42,
+ 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a,
+ 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d,
+ 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8,
+ 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98,
+ 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f,
+ 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd,
+ 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c,
+ 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc,
+ 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90,
+ 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc,
+ 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69,
+ 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23,
+ 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e,
+ 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c,
+ 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82,
+ 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34,
+ 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a,
+ 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd,
+ 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba,
+ 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a,
+ 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49,
+ 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c,
+ 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e,
+ 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9,
+ 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9,
+ 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c,
+ 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d,
+ 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40,
+ 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27,
+ 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd,
+ 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53,
+ 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80,
+ 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23,
+ 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd,
+ 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5,
+ 0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b,
+ 0x58, 0x3d, 0x86, 0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d,
+ 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c,
+ 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a,
+ 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9,
+ 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29,
+ 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc,
+ 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d,
+ 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4,
+ 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e,
+ 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd,
+ 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca,
+ 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81,
+ 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23,
+ 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd,
+ 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03,
+ 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a,
+ 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a,
+ 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c,
+ 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb,
+ 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb,
+ 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89,
+ 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc,
+ 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2,
+ 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f,
+ 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83,
+ 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c,
+ 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30,
+ 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8,
+ 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d,
+ 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd,
+ 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90,
+ 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b,
+ 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40,
+ 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc,
+ 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce,
+ 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49,
+ 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70,
+ 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd,
+ 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e,
+ 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc,
+ 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d,
+ 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d,
+ 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c,
+ 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17,
+ 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 0x5f, 0xbc, 0x8a, 0xde, 0x57,
+ 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc,
+ 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a,
+ 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f,
+ 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc,
+ 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d,
+ 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a,
+ 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f,
+ 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e,
+ 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d,
+ 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16,
+ 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f,
+ 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42,
+ 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc,
+ 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32,
+ 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14,
+ 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74,
+ 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc,
+ 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6,
+ 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f,
+ 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c,
+ 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd,
+ 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5,
+ 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf,
+ 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19,
+ 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d,
+ 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b,
+ 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f,
+ 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc,
+ 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c,
+ 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07,
+ 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3,
+ 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79,
+ 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d,
+ 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73,
+ 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44,
+ 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91,
+ 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd,
+ 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91,
+ 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a,
+ 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91,
+ 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b,
+ 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78,
+ 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38,
+ 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5,
+ 0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d,
+ 0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55,
+ 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19,
+ 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c,
+ 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c,
+ 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b,
+ 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a,
+ 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57,
+ 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d,
+ 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3,
+ 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e,
+ 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05,
+ 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d,
+ 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e,
+ 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65,
+ 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b,
+ 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd,
+ 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b,
+ 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c,
+ 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f,
+ 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd,
+ 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38,
+ 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6,
+ 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f,
+ 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d,
+ 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86,
+ 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80,
+ 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78,
+ 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc,
+ 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79,
+ 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46,
+ 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8,
+ 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd,
+ 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38,
+ 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01,
+ 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f,
+ 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d,
+ 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda,
+ 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52,
+ 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff,
+ 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd,
+ 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c,
+ 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1,
+ 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32,
+ 0x3d, 0xec, 0xf1, 0x87, 0xbd, 0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d,
+ 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6,
+ 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83,
+ 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e,
+ 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb,
+ 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f,
+ 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f,
+ 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb,
+ 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd,
+ 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7,
+ 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62,
+ 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97,
+ 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba,
+ 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02,
+ 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6,
+ 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba,
+ 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d,
+ 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99,
+ 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2,
+ 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3,
+ 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d,
+ 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b,
+ 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2,
+ 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59,
+ 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d,
+ 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18,
+ 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc,
+ 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70,
+ 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc,
+ 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c,
+ 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e,
+ 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e,
+ 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd,
+ 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd,
+ 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1,
+ 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a,
+ 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd,
+ 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33,
+ 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21,
+ 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a,
+ 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd,
+ 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15,
+ 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95,
+ 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99,
+ 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 0xea, 0xb6, 0x3d,
+ 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7,
+ 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c,
+ 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b,
+ 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc,
+ 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53,
+ 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe,
+ 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80,
+ 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d,
+ 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a,
+ 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a,
+ 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb,
+ 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d,
+ 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb,
+ 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32,
+ 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77,
+ 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd,
+ 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61,
+ 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f,
+ 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7,
+ 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d,
+ 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed,
+ 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52,
+ 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8,
+ 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d,
+ 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4,
+ 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b,
+ 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f,
+ 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd,
+ 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4,
+ 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14,
+ 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb,
+ 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d,
+ 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2,
+ 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62,
+ 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8,
+ 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c,
+ 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e,
+ 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca,
+ 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80,
+ 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc,
+ 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52,
+ 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67,
+ 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a,
+ 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd,
+ 0xa5, 0x08, 0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07,
+ 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99,
+ 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44,
+ 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9,
+ 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77,
+ 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29,
+ 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15,
+ 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd,
+ 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7,
+ 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e,
+ 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82,
+ 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd,
+ 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa,
+ 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90,
+ 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e,
+ 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c,
+ 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf,
+ 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59,
+ 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0,
+ 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb,
+ 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c,
+ 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7,
+ 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf,
+ 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c,
+ 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9,
+ 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf,
+ 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d,
+ 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c,
+ 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a,
+ 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20,
+ 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28,
+ 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd,
+ 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71,
+ 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8,
+ 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad,
+ 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc,
+ 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92,
+ 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4,
+ 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4,
+ 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c,
+ 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd,
+ 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6,
+ 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0,
+ 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c,
+ 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82,
+ 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52,
+ 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c,
+ 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc,
+ 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba,
+ 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72,
+ 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f,
+ 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c,
+ 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1,
+ 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10,
+ 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35,
+ 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d,
+ 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00,
+ 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9,
+ 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa,
+ 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d,
+ 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40,
+ 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2,
+ 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b,
+ 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d,
+ 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7,
+ 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06,
+ 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f,
+ 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc,
+ 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00,
+ 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65,
+ 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95,
+ 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c,
+ 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13,
+ 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7,
+ 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0,
+ 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd,
+ 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42,
+ 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe,
+ 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99,
+ 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d,
+ 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92,
+ 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81,
+ 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28,
+ 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd,
+ 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd,
+ 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82,
+ 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a,
+ 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb,
+ 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 0xa5,
+ 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19,
+ 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb,
+ 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d,
+ 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73,
+ 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30,
+ 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82,
+ 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd,
+ 0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0,
+ 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd,
+ 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e,
+ 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d,
+ 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55,
+ 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00,
+ 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a,
+ 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c,
+ 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e,
+ 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54,
+ 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30,
+ 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39,
+ 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17,
+ 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea,
+ 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f,
+ 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c,
+ 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1,
+ 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a,
+ 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25,
+ 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d,
+ 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32,
+ 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6,
+ 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96,
+ 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c,
+ 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f,
+ 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e,
+ 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99,
+ 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d,
+ 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c,
+ 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06,
+ 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee,
+ 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd,
+ 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12,
+ 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed,
+ 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25,
+ 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc,
+ 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f,
+ 0x74, 0xea, 0x3c, 0xfc, 0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1,
+ 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76,
+ 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d,
+ 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c,
+ 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c,
+ 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f,
+ 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd,
+ 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54,
+ 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b,
+ 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0,
+ 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb,
+ 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7,
+ 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20,
+ 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6,
+ 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd,
+ 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e,
+ 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a,
+ 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65,
+ 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d,
+ 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd,
+ 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6,
+ 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10,
+ 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d,
+ 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52,
+ 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc,
+ 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc,
+ 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb,
+ 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e,
+ 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05,
+ 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0,
+ 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc,
+ 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5,
+ 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d,
+ 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75,
+ 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd,
+ 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc,
+ 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a,
+ 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3,
+ 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd,
+ 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0,
+ 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18,
+ 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f,
+ 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d,
+ 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39,
+ 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 0x20, 0xbd, 0x1a, 0x64,
+ 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b,
+ 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd,
+ 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d,
+ 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00,
+ 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23,
+ 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d,
+ 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c,
+ 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d,
+ 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3,
+ 0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8,
+ 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3,
+ 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28,
+ 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11,
+ 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc,
+ 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32,
+ 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb,
+ 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90,
+ 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d,
+ 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73,
+ 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c,
+ 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0,
+ 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d,
+ 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2,
+ 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25,
+ 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0,
+ 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb,
+ 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f,
+ 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8,
+ 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86,
+ 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d,
+ 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f,
+ 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9,
+ 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a,
+ 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd,
+ 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c,
+ 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff,
+ 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc,
+ 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc,
+ 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67,
+ 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12,
+ 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7,
+ 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc,
+ 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae,
+ 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef,
+ 0xe9, 0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37,
+ 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c,
+ 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2,
+ 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7,
+ 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf,
+ 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d,
+ 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe,
+ 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48,
+ 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d,
+ 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c,
+ 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0,
+ 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31,
+ 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91,
+ 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb,
+ 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5,
+ 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9,
+ 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0,
+ 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d,
+ 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a,
+ 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda,
+ 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b,
+ 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd,
+ 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0,
+ 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9,
+ 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d,
+ 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d,
+ 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16,
+ 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c,
+ 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d,
+ 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd,
+ 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4,
+ 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6,
+ 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09,
+ 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c,
+ 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6,
+ 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15,
+ 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6,
+ 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd,
+ 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32,
+ 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7,
+ 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9,
+ 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c,
+ 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95,
+ 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50,
+ 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81,
+ 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c,
+ 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96,
+ 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49,
+ 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07,
+ 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc,
+ 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec,
+ 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c,
+ 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3,
+ 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d,
+ 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e,
+ 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72,
+ 0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e,
+ 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d,
+ 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2,
+ 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8,
+ 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32,
+ 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd,
+ 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99,
+ 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae,
+ 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56,
+ 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd,
+ 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90,
+ 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05,
+ 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a,
+ 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c,
+ 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7,
+ 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0,
+ 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16,
+ 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb,
+ 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec,
+ 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4,
+ 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45,
+ 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc,
+ 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2,
+ 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a,
+ 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f,
+ 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d,
+ 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d,
+ 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45,
+ 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86,
+ 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c,
+ 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08,
+ 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98,
+ 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 0x48, 0x17,
+ 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d,
+ 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa,
+ 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac,
+ 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55,
+ 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d,
+ 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0,
+ 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d,
+ 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae,
+ 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c,
+ 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98,
+ 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf,
+ 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86,
+ 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c,
+ 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c,
+ 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54,
+ 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46,
+ 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d,
+ 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f,
+ 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7,
+ 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a,
+ 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c,
+ 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec,
+ 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35,
+ 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17,
+ 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc,
+ 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1,
+ 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81,
+ 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a,
+ 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd,
+ 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06,
+ 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6,
+ 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f,
+ 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c,
+ 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7,
+ 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39,
+ 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8,
+ 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd,
+ 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7,
+ 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25,
+ 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38,
+ 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd,
+ 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc,
+ 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8,
+ 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13,
+ 0xb9, 0xc0, 0x4b, 0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c,
+ 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90,
+ 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60,
+ 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b,
+ 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc,
+ 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc,
+ 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74,
+ 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42,
+ 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b,
+ 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80,
+ 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c,
+ 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52,
+ 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc,
+ 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b,
+ 0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50,
+ 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e,
+ 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd,
+ 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70,
+ 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f,
+ 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf,
+ 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc,
+ 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55,
+ 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f,
+ 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a,
+ 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b,
+ 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b,
+ 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09,
+ 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c,
+ 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc,
+ 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab,
+ 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf,
+ 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f,
+ 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc,
+ 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02,
+ 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a,
+ 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc,
+ 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd,
+ 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1,
+ 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e,
+ 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68,
+ 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c,
+ 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf,
+ 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51,
+ 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98,
+ 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 0x3d, 0x48, 0xf3, 0x8b, 0xbc,
+ 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31,
+ 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53,
+ 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90,
+ 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd,
+ 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e,
+ 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c,
+ 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44,
+ 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d,
+ 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf,
+ 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51,
+ 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5,
+ 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd,
+ 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22,
+ 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5,
+ 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24,
+ 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc,
+ 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a,
+ 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae,
+ 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d,
+ 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb,
+ 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c,
+ 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8,
+ 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64,
+ 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a,
+ 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb,
+ 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83,
+ 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19,
+ 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd,
+ 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc,
+ 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a,
+ 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f,
+ 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd,
+ 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99,
+ 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42,
+ 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05,
+ 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c,
+ 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd,
+ 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62,
+ 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84,
+ 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c,
+ 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28,
+ 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1,
+ 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5,
+ 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb,
+ 0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea,
+ 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06,
+ 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26,
+ 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc,
+ 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42,
+ 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e,
+ 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f,
+ 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd,
+ 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78,
+ 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4,
+ 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d,
+ 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d,
+ 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f,
+ 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4,
+ 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb,
+ 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd,
+ 0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3,
+ 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64,
+ 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d,
+ 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb,
+ 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb,
+ 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd,
+ 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9,
+ 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d,
+ 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e,
+ 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57,
+ 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85,
+ 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d,
+ 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03,
+ 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac,
+ 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89,
+ 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d,
+ 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6,
+ 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36,
+ 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e,
+ 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d,
+ 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e,
+ 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2,
+ 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13,
+ 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc,
+ 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d,
+ 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40,
+ 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63,
+ 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d,
+ 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff,
+ 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73,
+ 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d,
+ 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd,
+ 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9,
+ 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0,
+ 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78,
+ 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d,
+ 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc,
+ 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a,
+ 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58,
+ 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d,
+ 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae,
+ 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e,
+ 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7,
+ 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd,
+ 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2,
+ 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8,
+ 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c,
+ 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c,
+ 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59,
+ 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93,
+ 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9,
+ 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d,
+ 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d,
+ 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a,
+ 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56,
+ 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb,
+ 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f,
+ 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5,
+ 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e,
+ 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd,
+ 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2,
+ 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99,
+ 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2,
+ 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb,
+ 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5,
+ 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea,
+ 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f,
+ 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd,
+ 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d,
+ 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f,
+ 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f,
+ 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d,
+ 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 0x2d, 0xbd, 0xce,
+ 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c,
+ 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2,
+ 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c,
+ 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67,
+ 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7,
+ 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78,
+ 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd,
+ 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74,
+ 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a,
+ 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d,
+ 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd,
+ 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae,
+ 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab,
+ 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28,
+ 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d,
+ 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18,
+ 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72,
+ 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f,
+ 0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc,
+ 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40,
+ 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12,
+ 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97,
+ 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd,
+ 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a,
+ 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc,
+ 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41,
+ 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d,
+ 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf,
+ 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06,
+ 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97,
+ 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d,
+ 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60,
+ 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9,
+ 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38,
+ 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d,
+ 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72,
+ 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed,
+ 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea,
+ 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d,
+ 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a,
+ 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27,
+ 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04,
+ 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd,
+ 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd,
+ 0x0e, 0x4e, 0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5,
+ 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01,
+ 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c,
+ 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01,
+ 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68,
+ 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f,
+ 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d,
+ 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac,
+ 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e,
+ 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64,
+ 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d,
+ 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15,
+ 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02,
+ 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e,
+ 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd,
+ 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e,
+ 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba,
+ 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87,
+ 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc,
+ 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e,
+ 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02,
+ 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13,
+ 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c,
+ 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd,
+ 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9,
+ 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23,
+ 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc,
+ 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0,
+ 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80,
+ 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b,
+ 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc,
+ 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b,
+ 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3,
+ 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e,
+ 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d,
+ 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20,
+ 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6,
+ 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d,
+ 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d,
+ 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62,
+ 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa,
+ 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9,
+ 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d,
+ 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16,
+ 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa,
+ 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c,
+ 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd,
+ 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1,
+ 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c,
+ 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97,
+ 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd,
+ 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5,
+ 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15,
+ 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc,
+ 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c,
+ 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc,
+ 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3,
+ 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85,
+ 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d,
+ 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51,
+ 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30,
+ 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c,
+ 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c,
+ 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28,
+ 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24,
+ 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0,
+ 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d,
+ 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4,
+ 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7,
+ 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa,
+ 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd,
+ 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb,
+ 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c,
+ 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d,
+ 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd,
+ 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57,
+ 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d,
+ 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88,
+ 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd,
+ 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27,
+ 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18,
+ 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90,
+ 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc,
+ 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe,
+ 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e,
+ 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e,
+ 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb,
+ 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff,
+ 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 0x6e,
+ 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30,
+ 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c,
+ 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60,
+ 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0,
+ 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20,
+ 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd,
+ 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd,
+ 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5,
+ 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4,
+ 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd,
+ 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49,
+ 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3,
+ 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1,
+ 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c,
+ 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2,
+ 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f,
+ 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c,
+ 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d,
+ 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c,
+ 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf,
+ 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7,
+ 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c,
+ 0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90,
+ 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38,
+ 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93,
+ 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd,
+ 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88,
+ 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88,
+ 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60,
+ 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd,
+ 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8,
+ 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79,
+ 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79,
+ 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d,
+ 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43,
+ 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41,
+ 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4,
+ 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd,
+ 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6,
+ 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34,
+ 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1,
+ 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c,
+ 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c,
+ 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff,
+ 0x52, 0x3d, 0xa2, 0xf2, 0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58,
+ 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d,
+ 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92,
+ 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8,
+ 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d,
+ 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c,
+ 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90,
+ 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3,
+ 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c,
+ 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc,
+ 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2,
+ 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d,
+ 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b,
+ 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a,
+ 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa,
+ 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c,
+ 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e,
+ 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd,
+ 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3,
+ 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17,
+ 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88,
+ 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd,
+ 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7,
+ 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f,
+ 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79,
+ 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd,
+ 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4,
+ 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5,
+ 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87,
+ 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c,
+ 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50,
+ 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12,
+ 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b,
+ 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d,
+ 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55,
+ 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8,
+ 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82,
+ 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd,
+ 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60,
+ 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c,
+ 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66,
+ 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd,
+ 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e,
+ 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81,
+ 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 0xbd, 0x5f, 0xcd, 0x9b,
+ 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd,
+ 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26,
+ 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88,
+ 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72,
+ 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc,
+ 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc,
+ 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76,
+ 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98,
+ 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b,
+ 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5,
+ 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf,
+ 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd,
+ 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d,
+ 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00,
+ 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0,
+ 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14,
+ 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c,
+ 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3,
+ 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b,
+ 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a,
+ 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d,
+ 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b,
+ 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80,
+ 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29,
+ 0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc,
+ 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07,
+ 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69,
+ 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf,
+ 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c,
+ 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc,
+ 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e,
+ 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84,
+ 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d,
+ 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40,
+ 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e,
+ 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e,
+ 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d,
+ 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92,
+ 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb,
+ 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29,
+ 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc,
+ 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd,
+ 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5,
+ 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22,
+ 0xbc, 0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc,
+ 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b,
+ 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec,
+ 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32,
+ 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc,
+ 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce,
+ 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61,
+ 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda,
+ 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d,
+ 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2,
+ 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82,
+ 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b,
+ 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd,
+ 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a,
+ 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f,
+ 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c,
+ 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d,
+ 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee,
+ 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5,
+ 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17,
+ 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc,
+ 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56,
+ 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6,
+ 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08,
+ 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d,
+ 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35,
+ 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79,
+ 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29,
+ 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc,
+ 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46,
+ 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69,
+ 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19,
+ 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c,
+ 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8,
+ 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29,
+ 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a,
+ 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd,
+ 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c,
+ 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0,
+ 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6,
+ 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d,
+ 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c,
+ 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f,
+ 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c,
+ 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c,
+ 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a,
+ 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4,
+ 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0,
+ 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d,
+ 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4,
+ 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92,
+ 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0,
+ 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd,
+ 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf,
+ 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb,
+ 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd,
+ 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c,
+ 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1,
+ 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5,
+ 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53,
+ 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd,
+ 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c,
+ 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61,
+ 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23,
+ 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd,
+ 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5,
+ 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf,
+ 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6,
+ 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d,
+ 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe,
+ 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05,
+ 0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20,
+ 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b,
+ 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c,
+ 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43,
+ 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85,
+ 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd,
+ 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4,
+ 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f,
+ 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13,
+ 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d,
+ 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6,
+ 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95,
+ 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba,
+ 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd,
+ 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03,
+ 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a,
+ 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52,
+ 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 0x7f, 0xbd,
+ 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1,
+ 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8,
+ 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8,
+ 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd,
+ 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90,
+ 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0,
+ 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10,
+ 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd,
+ 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68,
+ 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa,
+ 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48,
+ 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd,
+ 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b,
+ 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d,
+ 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81,
+ 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d,
+ 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78,
+ 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50,
+ 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63,
+ 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d,
+ 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7,
+ 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87,
+ 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b,
+ 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c,
+ 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4,
+ 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58,
+ 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff,
+ 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd,
+ 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30,
+ 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0,
+ 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab,
+ 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b,
+ 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c,
+ 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70,
+ 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71,
+ 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd,
+ 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6,
+ 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0,
+ 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb,
+ 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd,
+ 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf,
+ 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a,
+ 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76,
+ 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d,
+ 0xba, 0x1a, 0x81, 0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4,
+ 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed,
+ 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f,
+ 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd,
+ 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10,
+ 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a,
+ 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c,
+ 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd,
+ 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c,
+ 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3,
+ 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2,
+ 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b,
+ 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e,
+ 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07,
+ 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d,
+ 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd,
+ 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57,
+ 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5,
+ 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f,
+ 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd,
+ 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81,
+ 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a,
+ 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5,
+ 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd,
+ 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e,
+ 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0,
+ 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95,
+ 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d,
+ 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd,
+ 0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44,
+ 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05,
+ 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c,
+ 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4,
+ 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d,
+ 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34,
+ 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc,
+ 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c,
+ 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2,
+ 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99,
+ 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd,
+ 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb,
+ 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22,
+ 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf,
+ 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c,
+ 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 0xdb, 0x1f, 0x31, 0xbd, 0x91,
+ 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4,
+ 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53,
+ 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd,
+ 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a,
+ 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84,
+ 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5,
+ 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc,
+ 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c,
+ 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78,
+ 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84,
+ 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c,
+ 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9,
+ 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f,
+ 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b,
+ 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d,
+ 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64,
+ 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00,
+ 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb,
+ 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d,
+ 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc,
+ 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08,
+ 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a,
+ 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d,
+ 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf,
+ 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d,
+ 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00,
+ 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d,
+ 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20,
+ 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5,
+ 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b,
+ 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc,
+ 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67,
+ 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e,
+ 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a,
+ 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d,
+ 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f,
+ 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29,
+ 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55,
+ 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd,
+ 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9,
+ 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf,
+ 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a,
+ 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc,
+ 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85,
+ 0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c,
+ 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44,
+ 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d,
+ 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19,
+ 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09,
+ 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6,
+ 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d,
+ 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40,
+ 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f,
+ 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d,
+ 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d,
+ 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9,
+ 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0,
+ 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7,
+ 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd,
+ 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12,
+ 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4,
+ 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90,
+ 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d,
+ 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5,
+ 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55,
+ 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51,
+ 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d,
+ 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92,
+ 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03,
+ 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03,
+ 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c,
+ 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e,
+ 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12,
+ 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63,
+ 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d,
+ 0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2,
+ 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15,
+ 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58,
+ 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc,
+ 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b,
+ 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f,
+ 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7,
+ 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c,
+ 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d,
+ 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22,
+ 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86,
+ 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c,
+ 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d,
+ 0xfa, 0x80, 0x3d, 0x6b, 0x86, 0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b,
+ 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7,
+ 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d,
+ 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a,
+ 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c,
+ 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e,
+ 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d,
+ 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43,
+ 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94,
+ 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a,
+ 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd,
+ 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f,
+ 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e,
+ 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf,
+ 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc,
+ 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85,
+ 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31,
+ 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f,
+ 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb,
+ 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51,
+ 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08,
+ 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88,
+ 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd,
+ 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b,
+ 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c,
+ 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47,
+ 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d,
+ 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39,
+ 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13,
+ 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82,
+ 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd,
+ 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7,
+ 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f,
+ 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09,
+ 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc,
+ 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b,
+ 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69,
+ 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad,
+ 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d,
+ 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf,
+ 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c,
+ 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde,
+ 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c,
+ 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40,
+ 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 0x3d, 0x9b, 0x3f,
+ 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3,
+ 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc,
+ 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15,
+ 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7,
+ 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15,
+ 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c,
+ 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16,
+ 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab,
+ 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85,
+ 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d,
+ 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde,
+ 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5,
+ 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87,
+ 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c,
+ 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42,
+ 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc,
+ 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7,
+ 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d,
+ 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae,
+ 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92,
+ 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87,
+ 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d,
+ 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21,
+ 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d,
+ 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40,
+ 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b,
+ 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff,
+ 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e,
+ 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a,
+ 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c,
+ 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34,
+ 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3,
+ 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4,
+ 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd,
+ 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f,
+ 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3,
+ 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82,
+ 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb,
+ 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27,
+ 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c,
+ 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f,
+ 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb,
+ 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8,
+ 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6,
+ 0xbb, 0xba, 0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e,
+ 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc,
+ 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90,
+ 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c,
+ 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e,
+ 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd,
+ 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14,
+ 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a,
+ 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28,
+ 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d,
+ 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f,
+ 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d,
+ 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f,
+ 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d,
+ 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47,
+ 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9,
+ 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d,
+ 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d,
+ 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb,
+ 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77,
+ 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18,
+ 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd,
+ 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5,
+ 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88,
+ 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b,
+ 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc,
+ 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b,
+ 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8,
+ 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0,
+ 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d,
+ 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe,
+ 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa,
+ 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13,
+ 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d,
+ 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25,
+ 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8,
+ 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a,
+ 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d,
+ 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4,
+ 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e,
+ 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93,
+ 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc,
+ 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20,
+ 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20,
+ 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01,
+ 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc,
+ 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86,
+ 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f,
+ 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41,
+ 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd,
+ 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b,
+ 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a,
+ 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf,
+ 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d,
+ 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3,
+ 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae,
+ 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73,
+ 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c,
+ 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4,
+ 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50,
+ 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d,
+ 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd,
+ 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70,
+ 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15,
+ 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf,
+ 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd,
+ 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f,
+ 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e,
+ 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f,
+ 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c,
+ 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02,
+ 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf,
+ 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1,
+ 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d,
+ 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47,
+ 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21,
+ 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e,
+ 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d,
+ 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38,
+ 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6,
+ 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55,
+ 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd,
+ 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36,
+ 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a,
+ 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d,
+ 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d,
+ 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef,
+ 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7,
+ 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 0x0f,
+ 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d,
+ 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2,
+ 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c,
+ 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e,
+ 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c,
+ 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a,
+ 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4,
+ 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff,
+ 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d,
+ 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51,
+ 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58,
+ 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21,
+ 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b,
+ 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e,
+ 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd,
+ 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44,
+ 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc,
+ 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07,
+ 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e,
+ 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1,
+ 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c,
+ 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43,
+ 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90,
+ 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24,
+ 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc,
+ 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0,
+ 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45,
+ 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31,
+ 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd,
+ 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8,
+ 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51,
+ 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c,
+ 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd,
+ 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38,
+ 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45,
+ 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79,
+ 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc,
+ 0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e,
+ 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92,
+ 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87,
+ 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd,
+ 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc,
+ 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91,
+ 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9,
+ 0xbb, 0xc4, 0x0b, 0x56, 0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d,
+ 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c,
+ 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a,
+ 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f,
+ 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd,
+ 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c,
+ 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f,
+ 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b,
+ 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b,
+ 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c,
+ 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0,
+ 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f,
+ 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c,
+ 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22,
+ 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28,
+ 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31,
+ 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd,
+ 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69,
+ 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19,
+ 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e,
+ 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d,
+ 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda,
+ 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56,
+ 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f,
+ 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc,
+ 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18,
+ 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22,
+ 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90,
+ 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c,
+ 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66,
+ 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46,
+ 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09,
+ 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd,
+ 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e,
+ 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c,
+ 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f,
+ 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd,
+ 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0,
+ 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76,
+ 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c,
+ 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c,
+ 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00,
+ 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b,
+ 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05,
+ 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 0x48, 0x17, 0xe9, 0x3c,
+ 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40,
+ 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71,
+ 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31,
+ 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba,
+ 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68,
+ 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81,
+ 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde,
+ 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d,
+ 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14,
+ 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0,
+ 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0,
+ 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd,
+ 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f,
+ 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9,
+ 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a,
+ 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd,
+ 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24,
+ 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9,
+ 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83,
+ 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d,
+ 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3,
+ 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f,
+ 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11,
+ 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc,
+ 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00,
+ 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a,
+ 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32,
+ 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a,
+ 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce,
+ 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f,
+ 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70,
+ 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d,
+ 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8,
+ 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f,
+ 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a,
+ 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd,
+ 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13,
+ 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef,
+ 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29,
+ 0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc,
+ 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc,
+ 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6,
+ 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b,
+ 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c,
+ 0xb6, 0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0,
+ 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f,
+ 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85,
+ 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd,
+ 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1,
+ 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57,
+ 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8,
+ 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b,
+ 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c,
+ 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5,
+ 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90,
+ 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc,
+ 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e,
+ 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae,
+ 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84,
+ 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd,
+ 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4,
+ 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab,
+ 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54,
+ 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b,
+ 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8,
+ 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4,
+ 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac,
+ 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c,
+ 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8,
+ 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b,
+ 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48,
+ 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd,
+ 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e,
+ 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54,
+ 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59,
+ 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c,
+ 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79,
+ 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba,
+ 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f,
+ 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd,
+ 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90,
+ 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13,
+ 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2,
+ 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd,
+ 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28,
+ 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8,
+ 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a,
+ 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd,
+ 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0,
+ 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7,
+ 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68,
+ 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d,
+ 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0,
+ 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde,
+ 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19,
+ 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c,
+ 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52,
+ 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32,
+ 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b,
+ 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c,
+ 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa,
+ 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75,
+ 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3,
+ 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d,
+ 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81,
+ 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd,
+ 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea,
+ 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d,
+ 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6,
+ 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66,
+ 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e,
+ 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb,
+ 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84,
+ 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d,
+ 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80,
+ 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c,
+ 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3,
+ 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a,
+ 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c,
+ 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d,
+ 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70,
+ 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb,
+ 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8,
+ 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d,
+ 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd,
+ 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c,
+ 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c,
+ 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd,
+ 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c,
+ 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed,
+ 0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1,
+ 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c,
+ 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 0x3b, 0x78,
+ 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1,
+ 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15,
+ 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd,
+ 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c,
+ 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa,
+ 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66,
+ 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d,
+ 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00,
+ 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32,
+ 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e,
+ 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d,
+ 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70,
+ 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf,
+ 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80,
+ 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b,
+ 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0,
+ 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b,
+ 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37,
+ 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd,
+ 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02,
+ 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad,
+ 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58,
+ 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d,
+ 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a,
+ 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04,
+ 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c,
+ 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd,
+ 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30,
+ 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42,
+ 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca,
+ 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d,
+ 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96,
+ 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0,
+ 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99,
+ 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d,
+ 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda,
+ 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a,
+ 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8,
+ 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c,
+ 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e,
+ 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f,
+ 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff,
+ 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd,
+ 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68,
+ 0xd8, 0x3e, 0x3c, 0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26,
+ 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18,
+ 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d,
+ 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70,
+ 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6,
+ 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf,
+ 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c,
+ 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80,
+ 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28,
+ 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2,
+ 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd,
+ 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0,
+ 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c,
+ 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87,
+ 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c,
+ 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba,
+ 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde,
+ 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d,
+ 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc,
+ 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10,
+ 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f,
+ 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a,
+ 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d,
+ 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2,
+ 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69,
+ 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6,
+ 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc,
+ 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30,
+ 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37,
+ 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0,
+ 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc,
+ 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0,
+ 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e,
+ 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66,
+ 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d,
+ 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc,
+ 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6,
+ 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89,
+ 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c,
+ 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e,
+ 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3,
+ 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec,
+ 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd,
+ 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78,
+ 0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 0x7e, 0xbd, 0x3a, 0x8c, 0x61,
+ 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b,
+ 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc,
+ 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96,
+ 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10,
+ 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81,
+ 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d,
+ 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80,
+ 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13,
+ 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60,
+ 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b,
+ 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0,
+ 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90,
+ 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6,
+ 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d,
+ 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb,
+ 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50,
+ 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79,
+ 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d,
+ 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40,
+ 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57,
+ 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85,
+ 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd,
+ 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85,
+ 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9,
+ 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70,
+ 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd,
+ 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02,
+ 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a,
+ 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a,
+ 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d,
+ 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96,
+ 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06,
+ 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05,
+ 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd,
+ 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68,
+ 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2,
+ 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61,
+ 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c,
+ 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8,
+ 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78,
+ 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c,
+ 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b,
+ 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4,
+ 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9,
+ 0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb,
+ 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c,
+ 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8,
+ 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a,
+ 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01,
+ 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c,
+ 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90,
+ 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90,
+ 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11,
+ 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c,
+ 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40,
+ 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e,
+ 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d,
+ 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd,
+ 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2,
+ 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc,
+ 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75,
+ 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc,
+ 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca,
+ 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf,
+ 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85,
+ 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc,
+ 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0,
+ 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b,
+ 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6,
+ 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d,
+ 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5,
+ 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4,
+ 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17,
+ 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c,
+ 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc,
+ 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64,
+ 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8,
+ 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd,
+ 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3,
+ 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b,
+ 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87,
+ 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd,
+ 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6,
+ 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1,
+ 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c,
+ 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a,
+ 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c,
+ 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a,
+ 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d,
+ 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd,
+ 0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4,
+ 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7,
+ 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d,
+ 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd,
+ 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30,
+ 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd,
+ 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac,
+ 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d,
+ 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e,
+ 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38,
+ 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe,
+ 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d,
+ 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24,
+ 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde,
+ 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6,
+ 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b,
+ 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a,
+ 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d,
+ 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e,
+ 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba,
+ 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e,
+ 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40,
+ 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f,
+ 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c,
+ 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30,
+ 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8,
+ 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88,
+ 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b,
+ 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10,
+ 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92,
+ 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e,
+ 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d,
+ 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42,
+ 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1,
+ 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6,
+ 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc,
+ 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2,
+ 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b,
+ 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0,
+ 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c,
+ 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb,
+ 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf,
+ 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 0x32, 0x25, 0x5d,
+ 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c,
+ 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb,
+ 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d,
+ 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e,
+ 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d,
+ 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea,
+ 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8,
+ 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2,
+ 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc,
+ 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4,
+ 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b,
+ 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f,
+ 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c,
+ 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27,
+ 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32,
+ 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa,
+ 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c,
+ 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f,
+ 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad,
+ 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b,
+ 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd,
+ 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04,
+ 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15,
+ 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a,
+ 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b,
+ 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac,
+ 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92,
+ 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8,
+ 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd,
+ 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc,
+ 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51,
+ 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6,
+ 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc,
+ 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed,
+ 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9,
+ 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f,
+ 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc,
+ 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d,
+ 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb,
+ 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd,
+ 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b,
+ 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74,
+ 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e,
+ 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd,
+ 0xbd, 0xc1, 0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc,
+ 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd,
+ 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c,
+ 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98,
+ 0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d,
+ 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe,
+ 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b,
+ 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51,
+ 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c,
+ 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b,
+ 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15,
+ 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b,
+ 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd,
+ 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90,
+ 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14,
+ 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c,
+ 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd,
+ 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c,
+ 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61,
+ 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89,
+ 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc,
+ 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55,
+ 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4,
+ 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4,
+ 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c,
+ 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12,
+ 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd,
+ 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90,
+ 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc,
+ 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5,
+ 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d,
+ 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79,
+ 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd,
+ 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a,
+ 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f,
+ 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad,
+ 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd,
+ 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25,
+ 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97,
+ 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2,
+ 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d,
+ 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6,
+ 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a,
+ 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1,
+ 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d,
+ 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e,
+ 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e,
+ 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78,
+ 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c,
+ 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84,
+ 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0,
+ 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7,
+ 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb,
+ 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5,
+ 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10,
+ 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b,
+ 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d,
+ 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28,
+ 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1,
+ 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66,
+ 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d,
+ 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d,
+ 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3,
+ 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10,
+ 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a,
+ 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72,
+ 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda,
+ 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c,
+ 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd,
+ 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea,
+ 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd,
+ 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3,
+ 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c,
+ 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99,
+ 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56,
+ 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2,
+ 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc,
+ 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1,
+ 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec,
+ 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45,
+ 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd,
+ 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b,
+ 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48,
+ 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3,
+ 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d,
+ 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9,
+ 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec,
+ 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09,
+ 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 0x3c,
+ 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26,
+ 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0,
+ 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e,
+ 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c,
+ 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87,
+ 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97,
+ 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee,
+ 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd,
+ 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a,
+ 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2,
+ 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92,
+ 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c,
+ 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0,
+ 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88,
+ 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd,
+ 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c,
+ 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8,
+ 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f,
+ 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7,
+ 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc,
+ 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f,
+ 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e,
+ 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27,
+ 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe,
+ 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6,
+ 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37,
+ 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7,
+ 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc,
+ 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e,
+ 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4,
+ 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87,
+ 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c,
+ 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e,
+ 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99,
+ 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f,
+ 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd,
+ 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01,
+ 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59,
+ 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a,
+ 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb,
+ 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3,
+ 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52,
+ 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0,
+ 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd,
+ 0xad, 0x80, 0xdf, 0x3c, 0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b,
+ 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b,
+ 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7,
+ 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d,
+ 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b,
+ 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7,
+ 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5,
+ 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd,
+ 0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2,
+ 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47,
+ 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52,
+ 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c,
+ 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a,
+ 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85,
+ 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b,
+ 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d,
+ 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5,
+ 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30,
+ 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd,
+ 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd,
+ 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1,
+ 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26,
+ 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92,
+ 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c,
+ 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6,
+ 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f,
+ 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4,
+ 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b,
+ 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f,
+ 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16,
+ 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6,
+ 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc,
+ 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74,
+ 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6,
+ 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43,
+ 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c,
+ 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39,
+ 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9,
+ 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f,
+ 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d,
+ 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b,
+ 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac,
+ 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03,
+ 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b,
+ 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 0x10, 0x8c, 0x3c, 0x5e,
+ 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5,
+ 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1,
+ 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d,
+ 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f,
+ 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63,
+ 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3,
+ 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc,
+ 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6,
+ 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98,
+ 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8,
+ 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd,
+ 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95,
+ 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8,
+ 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b,
+ 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d,
+ 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a,
+ 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e,
+ 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53,
+ 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c,
+ 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f,
+ 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce,
+ 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec,
+ 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d,
+ 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90,
+ 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37,
+ 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7,
+ 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d,
+ 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0,
+ 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65,
+ 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29,
+ 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d,
+ 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02,
+ 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef,
+ 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3,
+ 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a,
+ 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc,
+ 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83,
+ 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c,
+ 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d,
+ 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54,
+ 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9,
+ 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8,
+ 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d,
+ 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e,
+ 0x7e, 0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee,
+ 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc,
+ 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c,
+ 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f,
+ 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4,
+ 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f,
+ 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b,
+ 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65,
+ 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac,
+ 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60,
+ 0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c,
+ 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a,
+ 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10,
+ 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e,
+ 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd,
+ 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45,
+ 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe,
+ 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92,
+ 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd,
+ 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96,
+ 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4,
+ 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde,
+ 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d,
+ 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52,
+ 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe,
+ 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86,
+ 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d,
+ 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1,
+ 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1,
+ 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48,
+ 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d,
+ 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46,
+ 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac,
+ 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15,
+ 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb,
+ 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0,
+ 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17,
+ 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c,
+ 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d,
+ 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9,
+ 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77,
+ 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d,
+ 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d,
+ 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53,
+ 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1,
+ 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05,
+ 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc,
+ 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd,
+ 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d,
+ 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06,
+ 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd,
+ 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff,
+ 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e,
+ 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d,
+ 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb,
+ 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6,
+ 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91,
+ 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85,
+ 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc,
+ 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b,
+ 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5,
+ 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33,
+ 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c,
+ 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99,
+ 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43,
+ 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6,
+ 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d,
+ 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41,
+ 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20,
+ 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38,
+ 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd,
+ 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d,
+ 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3,
+ 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22,
+ 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb,
+ 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08,
+ 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e,
+ 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04,
+ 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd,
+ 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe,
+ 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa,
+ 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89,
+ 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd,
+ 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8,
+ 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53,
+ 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b,
+ 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c,
+ 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14,
+ 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 0x8b, 0x8e,
+ 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88,
+ 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd,
+ 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4,
+ 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2,
+ 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1,
+ 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d,
+ 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e,
+ 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2,
+ 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c,
+ 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c,
+ 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe,
+ 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c,
+ 0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a,
+ 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c,
+ 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b,
+ 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe,
+ 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e,
+ 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c,
+ 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d,
+ 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05,
+ 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44,
+ 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d,
+ 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c,
+ 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75,
+ 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85,
+ 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d,
+ 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39,
+ 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f,
+ 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98,
+ 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39,
+ 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb,
+ 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4,
+ 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78,
+ 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d,
+ 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8,
+ 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03,
+ 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b,
+ 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb,
+ 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67,
+ 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42,
+ 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b,
+ 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb,
+ 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92,
+ 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0,
+ 0x8b, 0x3c, 0x4a, 0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39,
+ 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d,
+ 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30,
+ 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a,
+ 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30,
+ 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc,
+ 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00,
+ 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a,
+ 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4,
+ 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39,
+ 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c,
+ 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06,
+ 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59,
+ 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd,
+ 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d,
+ 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d,
+ 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91,
+ 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc,
+ 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8,
+ 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c,
+ 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33,
+ 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d,
+ 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0,
+ 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38,
+ 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4,
+ 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc,
+ 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42,
+ 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a,
+ 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12,
+ 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c,
+ 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24,
+ 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42,
+ 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01,
+ 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c,
+ 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22,
+ 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69,
+ 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36,
+ 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd,
+ 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43,
+ 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91,
+ 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53,
+ 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9,
+ 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75,
+ 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5,
+ 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 0x91, 0x3c, 0x29, 0x21, 0x40,
+ 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d,
+ 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49,
+ 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98,
+ 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91,
+ 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c,
+ 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25,
+ 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5,
+ 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54,
+ 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b,
+ 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd,
+ 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17,
+ 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19,
+ 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c,
+ 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67,
+ 0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97,
+ 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e,
+ 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc,
+ 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32,
+ 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd,
+ 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a,
+ 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d,
+ 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea,
+ 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8,
+ 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a,
+ 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c,
+ 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e,
+ 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7,
+ 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48,
+ 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc,
+ 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85,
+ 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25,
+ 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a,
+ 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd,
+ 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2,
+ 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee,
+ 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35,
+ 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d,
+ 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74,
+ 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd,
+ 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0,
+ 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d,
+ 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03,
+ 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc,
+ 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d,
+ 0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c,
+ 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4,
+ 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f,
+ 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2,
+ 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d,
+ 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15,
+ 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0,
+ 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf,
+ 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc,
+ 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3,
+ 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89,
+ 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda,
+ 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c,
+ 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba,
+ 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3,
+ 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab,
+ 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd,
+ 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9,
+ 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a,
+ 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b,
+ 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc,
+ 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1,
+ 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0,
+ 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26,
+ 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd,
+ 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd,
+ 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09,
+ 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79,
+ 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd,
+ 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd,
+ 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54,
+ 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31,
+ 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c,
+ 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0,
+ 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce,
+ 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12,
+ 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c,
+ 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7,
+ 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2,
+ 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a,
+ 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d,
+ 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c,
+ 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e,
+ 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab,
+ 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c,
+ 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0,
+ 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81,
+ 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70,
+ 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd,
+ 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46,
+ 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06,
+ 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59,
+ 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c,
+ 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a,
+ 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e,
+ 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1,
+ 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c,
+ 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10,
+ 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22,
+ 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11,
+ 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c,
+ 0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e,
+ 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5,
+ 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59,
+ 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c,
+ 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d,
+ 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5,
+ 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd,
+ 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c,
+ 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8,
+ 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62,
+ 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15,
+ 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc,
+ 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b,
+ 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd,
+ 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4,
+ 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd,
+ 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74,
+ 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7,
+ 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82,
+ 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc,
+ 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22,
+ 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa,
+ 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8,
+ 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd,
+ 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd,
+ 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f,
+ 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e,
+ 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 0xf8, 0x7e, 0xbd,
+ 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86,
+ 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca,
+ 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e,
+ 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc,
+ 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb,
+ 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58,
+ 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94,
+ 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c,
+ 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76,
+ 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72,
+ 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a,
+ 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c,
+ 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1,
+ 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0,
+ 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1,
+ 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd,
+ 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c,
+ 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde,
+ 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e,
+ 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd,
+ 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c,
+ 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a,
+ 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18,
+ 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd,
+ 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50,
+ 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8,
+ 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7,
+ 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d,
+ 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e,
+ 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b,
+ 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18,
+ 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c,
+ 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35,
+ 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8,
+ 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86,
+ 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d,
+ 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16,
+ 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf,
+ 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e,
+ 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d,
+ 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08,
+ 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d,
+ 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4,
+ 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd,
+ 0x4b, 0xbd, 0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41,
+ 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae,
+ 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d,
+ 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd,
+ 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62,
+ 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19,
+ 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31,
+ 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b,
+ 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8,
+ 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0,
+ 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec,
+ 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc,
+ 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e,
+ 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7,
+ 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d,
+ 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b,
+ 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71,
+ 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c,
+ 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77,
+ 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc,
+ 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41,
+ 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14,
+ 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7,
+ 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d,
+ 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9,
+ 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42,
+ 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d,
+ 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c,
+ 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea,
+ 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe,
+ 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86,
+ 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c,
+ 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0,
+ 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6,
+ 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20,
+ 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d,
+ 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3,
+ 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81,
+ 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd,
+ 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c,
+ 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88,
+ 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22,
+ 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a,
+ 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c,
+ 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57,
+ 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01,
+ 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b,
+ 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c,
+ 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88,
+ 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf,
+ 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81,
+ 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb,
+ 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6,
+ 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda,
+ 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a,
+ 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc,
+ 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba,
+ 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05,
+ 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6,
+ 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb,
+ 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d,
+ 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7,
+ 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd,
+ 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d,
+ 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b,
+ 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1,
+ 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3,
+ 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd,
+ 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73,
+ 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d,
+ 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f,
+ 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c,
+ 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe,
+ 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2,
+ 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d,
+ 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39,
+ 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8,
+ 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02,
+ 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9,
+ 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd,
+ 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03,
+ 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28,
+ 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51,
+ 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d,
+ 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28,
+ 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f,
+ 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e,
+ 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d,
+ 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 0xa4,
+ 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7,
+ 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8,
+ 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d,
+ 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0,
+ 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68,
+ 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a,
+ 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d,
+ 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb,
+ 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9,
+ 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d,
+ 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd,
+ 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1,
+ 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c,
+ 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7,
+ 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc,
+ 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d,
+ 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d,
+ 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6,
+ 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d,
+ 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46,
+ 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20,
+ 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54,
+ 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc,
+ 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52,
+ 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda,
+ 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf,
+ 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d,
+ 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63,
+ 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b,
+ 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87,
+ 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd,
+ 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa,
+ 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39,
+ 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25,
+ 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc,
+ 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69,
+ 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84,
+ 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c,
+ 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d,
+ 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d,
+ 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89,
+ 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e,
+ 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb,
+ 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f,
+ 0xb4, 0xf5, 0xbc, 0x5c, 0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff,
+ 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94,
+ 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b,
+ 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65,
+ 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64,
+ 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96,
+ 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd,
+ 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce,
+ 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45,
+ 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7,
+ 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d,
+ 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86,
+ 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f,
+ 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99,
+ 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb,
+ 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31,
+ 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9,
+ 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c,
+ 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd,
+ 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52,
+ 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40,
+ 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d,
+ 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a,
+ 0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09,
+ 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65,
+ 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f,
+ 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc,
+ 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e,
+ 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4,
+ 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58,
+ 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd,
+ 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28,
+ 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96,
+ 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85,
+ 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd,
+ 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4,
+ 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64,
+ 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94,
+ 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd,
+ 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde,
+ 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23,
+ 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9,
+ 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d,
+ 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0,
+ 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 0x89, 0xbd, 0x61, 0x99,
+ 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f,
+ 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d,
+ 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86,
+ 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71,
+ 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21,
+ 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d,
+ 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f,
+ 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49,
+ 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4,
+ 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd,
+ 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84,
+ 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9,
+ 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86,
+ 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d,
+ 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83,
+ 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb,
+ 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b,
+ 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc,
+ 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79,
+ 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd,
+ 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0,
+ 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd,
+ 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb,
+ 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69,
+ 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78,
+ 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc,
+ 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26,
+ 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7,
+ 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88,
+ 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d,
+ 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59,
+ 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8,
+ 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b,
+ 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c,
+ 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1,
+ 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e,
+ 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07,
+ 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c,
+ 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19,
+ 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58,
+ 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91,
+ 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d,
+ 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66,
+ 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac,
+ 0x59, 0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34,
+ 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d,
+ 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e,
+ 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7,
+ 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6,
+ 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d,
+ 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65,
+ 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1,
+ 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b,
+ 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d,
+ 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac,
+ 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71,
+ 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34,
+ 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc,
+ 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04,
+ 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c,
+ 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49,
+ 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d,
+ 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe,
+ 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9,
+ 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91,
+ 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d,
+ 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe,
+ 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8,
+ 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59,
+ 0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc,
+ 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78,
+ 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7,
+ 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85,
+ 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d,
+ 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c,
+ 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc,
+ 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4,
+ 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c,
+ 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91,
+ 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e,
+ 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76,
+ 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d,
+ 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d,
+ 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e,
+ 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42,
+ 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd,
+ 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c,
+ 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86,
+ 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6,
+ 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c,
+ 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08,
+ 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf,
+ 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7,
+ 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d,
+ 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30,
+ 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01,
+ 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12,
+ 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c,
+ 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c,
+ 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4,
+ 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c,
+ 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d,
+ 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd,
+ 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3,
+ 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50,
+ 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc,
+ 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19,
+ 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76,
+ 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30,
+ 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd,
+ 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f,
+ 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b,
+ 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09,
+ 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb,
+ 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a,
+ 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e,
+ 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59,
+ 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b,
+ 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2,
+ 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec,
+ 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82,
+ 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd,
+ 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5,
+ 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5,
+ 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22,
+ 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d,
+ 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3,
+ 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e,
+ 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21,
+ 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd,
+ 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4,
+ 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee,
+ 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 0x65, 0x41,
+ 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd,
+ 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad,
+ 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06,
+ 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76,
+ 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d,
+ 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae,
+ 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2,
+ 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1,
+ 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc,
+ 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15,
+ 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd,
+ 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb,
+ 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd,
+ 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1,
+ 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82,
+ 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87,
+ 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d,
+ 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69,
+ 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b,
+ 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00,
+ 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd,
+ 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0,
+ 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd,
+ 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79,
+ 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd,
+ 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52,
+ 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b,
+ 0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02,
+ 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd,
+ 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9,
+ 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0,
+ 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e,
+ 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc,
+ 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07,
+ 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9,
+ 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52,
+ 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d,
+ 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93,
+ 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9,
+ 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80,
+ 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc,
+ 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1,
+ 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3,
+ 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86,
+ 0x3c, 0xbf, 0x14, 0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c,
+ 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff,
+ 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea,
+ 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30,
+ 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d,
+ 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb,
+ 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2,
+ 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85,
+ 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc,
+ 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7,
+ 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1,
+ 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4,
+ 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d,
+ 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40,
+ 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31,
+ 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80,
+ 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd,
+ 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10,
+ 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9,
+ 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c,
+ 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d,
+ 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48,
+ 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a,
+ 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e,
+ 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc,
+ 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04,
+ 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0,
+ 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f,
+ 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc,
+ 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b,
+ 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95,
+ 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84,
+ 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd,
+ 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28,
+ 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5,
+ 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad,
+ 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c,
+ 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2,
+ 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d,
+ 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06,
+ 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd,
+ 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6,
+ 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a,
+ 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38,
+ 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 0x3d, 0x61, 0x4d, 0x88, 0xbd,
+ 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40,
+ 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06,
+ 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70,
+ 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d,
+ 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49,
+ 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8,
+ 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0,
+ 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc,
+ 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70,
+ 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a,
+ 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40,
+ 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d,
+ 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee,
+ 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41,
+ 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c,
+ 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd,
+ 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6,
+ 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d,
+ 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec,
+ 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc,
+ 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1,
+ 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71,
+ 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c,
+ 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d,
+ 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0,
+ 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9,
+ 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b,
+ 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38,
+ 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2,
+ 0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86,
+ 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84,
+ 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb,
+ 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99,
+ 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb,
+ 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88,
+ 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd,
+ 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40,
+ 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95,
+ 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf,
+ 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c,
+ 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48,
+ 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2,
+ 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a,
+ 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d,
+ 0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69,
+ 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f,
+ 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a,
+ 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb,
+ 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8,
+ 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d,
+ 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a,
+ 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d,
+ 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e,
+ 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0,
+ 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40,
+ 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d,
+ 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12,
+ 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71,
+ 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6,
+ 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c,
+ 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75,
+ 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a,
+ 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60,
+ 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c,
+ 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78,
+ 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c,
+ 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e,
+ 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd,
+ 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91,
+ 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb,
+ 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a,
+ 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d,
+ 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84,
+ 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67,
+ 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a,
+ 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc,
+ 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0,
+ 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5,
+ 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0,
+ 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd,
+ 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47,
+ 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20,
+ 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47,
+ 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd,
+ 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8,
+ 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02,
+ 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f,
+ 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd,
+ 0x59, 0x7e, 0x04, 0xbd, 0xf1, 0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a,
+ 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf,
+ 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3,
+ 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd,
+ 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a,
+ 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05,
+ 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd,
+ 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc,
+ 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe,
+ 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4,
+ 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6,
+ 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c,
+ 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd,
+ 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb,
+ 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87,
+ 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd,
+ 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e,
+ 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47,
+ 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3,
+ 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc,
+ 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44,
+ 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4,
+ 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b,
+ 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d,
+ 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5,
+ 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42,
+ 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82,
+ 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd,
+ 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44,
+ 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01,
+ 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32,
+ 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc,
+ 0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c,
+ 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86,
+ 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2,
+ 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d,
+ 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc,
+ 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30,
+ 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d,
+ 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b,
+ 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf,
+ 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5,
+ 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42,
+ 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc,
+ 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 0x32, 0xbd, 0x1f,
+ 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1,
+ 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f,
+ 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d,
+ 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58,
+ 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63,
+ 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59,
+ 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc,
+ 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61,
+ 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f,
+ 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa,
+ 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d,
+ 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74,
+ 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec,
+ 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf,
+ 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb,
+ 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda,
+ 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84,
+ 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98,
+ 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d,
+ 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2,
+ 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89,
+ 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25,
+ 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b,
+ 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32,
+ 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b,
+ 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80,
+ 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc,
+ 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03,
+ 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde,
+ 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4,
+ 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc,
+ 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e,
+ 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99,
+ 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87,
+ 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d,
+ 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5,
+ 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43,
+ 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40,
+ 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d,
+ 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3,
+ 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f,
+ 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd,
+ 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc,
+ 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d,
+ 0x1c, 0xed, 0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8,
+ 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78,
+ 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd,
+ 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4,
+ 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b,
+ 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18,
+ 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d,
+ 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68,
+ 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85,
+ 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2,
+ 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc,
+ 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b,
+ 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35,
+ 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d,
+ 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d,
+ 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76,
+ 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63,
+ 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19,
+ 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd,
+ 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03,
+ 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f,
+ 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98,
+ 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b,
+ 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6,
+ 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54,
+ 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28,
+ 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d,
+ 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81,
+ 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd,
+ 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82,
+ 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd,
+ 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce,
+ 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b,
+ 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a,
+ 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd,
+ 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a,
+ 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b,
+ 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61,
+ 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c,
+ 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c,
+ 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba,
+ 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77,
+ 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd,
+ 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17,
+ 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26,
+ 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1,
+ 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b,
+ 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0,
+ 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56,
+ 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d,
+ 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd,
+ 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb,
+ 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb,
+ 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b,
+ 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc,
+ 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68,
+ 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69,
+ 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57,
+ 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd,
+ 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2,
+ 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71,
+ 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd,
+ 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd,
+ 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a,
+ 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2,
+ 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21,
+ 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd,
+ 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4,
+ 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae,
+ 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40,
+ 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d,
+ 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d,
+ 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7,
+ 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99,
+ 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c,
+ 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90,
+ 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85,
+ 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65,
+ 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d,
+ 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79,
+ 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa,
+ 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b,
+ 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d,
+ 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d,
+ 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a,
+ 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa,
+ 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd,
+ 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69,
+ 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 0x79,
+ 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89,
+ 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c,
+ 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d,
+ 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2,
+ 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac,
+ 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc,
+ 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d,
+ 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3,
+ 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37,
+ 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc,
+ 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6,
+ 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb,
+ 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10,
+ 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c,
+ 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6,
+ 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b,
+ 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1,
+ 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd,
+ 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e,
+ 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28,
+ 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45,
+ 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd,
+ 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49,
+ 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40,
+ 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17,
+ 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b,
+ 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d,
+ 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f,
+ 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10,
+ 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd,
+ 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d,
+ 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77,
+ 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06,
+ 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd,
+ 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49,
+ 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd,
+ 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5,
+ 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c,
+ 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12,
+ 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4,
+ 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81,
+ 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d,
+ 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53,
+ 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d,
+ 0x29, 0x3d, 0xd6, 0xf7, 0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7,
+ 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc,
+ 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2,
+ 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17,
+ 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72,
+ 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c,
+ 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f,
+ 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae,
+ 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3,
+ 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d,
+ 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65,
+ 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35,
+ 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6,
+ 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b,
+ 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67,
+ 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6,
+ 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02,
+ 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d,
+ 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d,
+ 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad,
+ 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94,
+ 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc,
+ 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7,
+ 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36,
+ 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5,
+ 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c,
+ 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72,
+ 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14,
+ 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85,
+ 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc,
+ 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82,
+ 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a,
+ 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91,
+ 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d,
+ 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f,
+ 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20,
+ 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a,
+ 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39,
+ 0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f,
+ 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7,
+ 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d,
+ 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d,
+ 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a,
+ 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc,
+ 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 0x3b, 0xe9, 0xd7, 0x07,
+ 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c,
+ 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02,
+ 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e,
+ 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c,
+ 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d,
+ 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10,
+ 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27,
+ 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65,
+ 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d,
+ 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d,
+ 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9,
+ 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80,
+ 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d,
+ 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d,
+ 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b,
+ 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08,
+ 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc,
+ 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda,
+ 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3,
+ 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12,
+ 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c,
+ 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f,
+ 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8,
+ 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2,
+ 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d,
+ 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d,
+ 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba,
+ 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a,
+ 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc,
+ 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10,
+ 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc,
+ 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30,
+ 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd,
+ 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1,
+ 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4,
+ 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63,
+ 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc,
+ 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca,
+ 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83,
+ 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79,
+ 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc,
+ 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10,
+ 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39,
+ 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc,
+ 0xbc, 0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd,
+ 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00,
+ 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7,
+ 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46,
+ 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc,
+ 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc,
+ 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89,
+ 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff,
+ 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d,
+ 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b,
+ 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22,
+ 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92,
+ 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb,
+ 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44,
+ 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4,
+ 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a,
+ 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc,
+ 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68,
+ 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9,
+ 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63,
+ 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba,
+ 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c,
+ 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d,
+ 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd,
+ 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d,
+ 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83,
+ 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61,
+ 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b,
+ 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c,
+ 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe,
+ 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23,
+ 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb,
+ 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d,
+ 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2,
+ 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e,
+ 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20,
+ 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc,
+ 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c,
+ 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e,
+ 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86,
+ 0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb,
+ 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80,
+ 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8,
+ 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72,
+ 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd,
+ 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c,
+ 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20,
+ 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c,
+ 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d,
+ 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60,
+ 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea,
+ 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab,
+ 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd,
+ 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10,
+ 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c,
+ 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5,
+ 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd,
+ 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca,
+ 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a,
+ 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14,
+ 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd,
+ 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47,
+ 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00,
+ 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec,
+ 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b,
+ 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0,
+ 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f,
+ 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98,
+ 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d,
+ 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd,
+ 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0,
+ 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11,
+ 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c,
+ 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec,
+ 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7,
+ 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2,
+ 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd,
+ 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0,
+ 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3,
+ 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65,
+ 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd,
+ 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4,
+ 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8,
+ 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05,
+ 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d,
+ 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0,
+ 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1,
+ 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83,
+ 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 0x05, 0x3d,
+ 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24,
+ 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1,
+ 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75,
+ 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd,
+ 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30,
+ 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0,
+ 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10,
+ 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc,
+ 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8,
+ 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73,
+ 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91,
+ 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd,
+ 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00,
+ 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d,
+ 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4,
+ 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc,
+ 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13,
+ 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d,
+ 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e,
+ 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d,
+ 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35,
+ 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d,
+ 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19,
+ 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c,
+ 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16,
+ 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79,
+ 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35,
+ 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd,
+ 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c,
+ 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e,
+ 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8,
+ 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd,
+ 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84,
+ 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71,
+ 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37,
+ 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b,
+ 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10,
+ 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44,
+ 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94,
+ 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c,
+ 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50,
+ 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42,
+ 0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69,
+ 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd,
+ 0x00, 0xc4, 0x5e, 0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c,
+ 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82,
+ 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7,
+ 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd,
+ 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e,
+ 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e,
+ 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59,
+ 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a,
+ 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0,
+ 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68,
+ 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80,
+ 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c,
+ 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde,
+ 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f,
+ 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48,
+ 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd,
+ 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc,
+ 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c,
+ 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97,
+ 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d,
+ 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32,
+ 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29,
+ 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b,
+ 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d,
+ 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80,
+ 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b,
+ 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69,
+ 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c,
+ 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc,
+ 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61,
+ 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e,
+ 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d,
+ 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30,
+ 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b,
+ 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85,
+ 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a,
+ 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80,
+ 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6,
+ 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02,
+ 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d,
+ 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30,
+ 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18,
+ 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30,
+ 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d,
+ 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 0x98, 0x96, 0x21, 0xbc, 0xbe,
+ 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40,
+ 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05,
+ 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d,
+ 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0,
+ 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93,
+ 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d,
+ 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd,
+ 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3,
+ 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4,
+ 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82,
+ 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a,
+ 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb,
+ 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c,
+ 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21,
+ 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd,
+ 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12,
+ 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9,
+ 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08,
+ 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d,
+ 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53,
+ 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13,
+ 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44,
+ 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc,
+ 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f,
+ 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89,
+ 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d,
+ 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b,
+ 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8,
+ 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0,
+ 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54,
+ 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c,
+ 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0,
+ 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca,
+ 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71,
+ 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb,
+ 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a,
+ 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e,
+ 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62,
+ 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d,
+ 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e,
+ 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c,
+ 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b,
+ 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d,
+ 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8,
+ 0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f,
+ 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62,
+ 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd,
+ 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50,
+ 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73,
+ 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77,
+ 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d,
+ 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c,
+ 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87,
+ 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d,
+ 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd,
+ 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08,
+ 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1,
+ 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab,
+ 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c,
+ 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a,
+ 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03,
+ 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34,
+ 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb,
+ 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa,
+ 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26,
+ 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52,
+ 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b,
+ 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2,
+ 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58,
+ 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f,
+ 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd,
+ 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e,
+ 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98,
+ 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27,
+ 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d,
+ 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70,
+ 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d,
+ 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7,
+ 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d,
+ 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41,
+ 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66,
+ 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a,
+ 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c,
+ 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66,
+ 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c,
+ 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf,
+ 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c,
+ 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd,
+ 0x84, 0x22, 0xbd, 0xb2, 0x0a, 0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8,
+ 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71,
+ 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc,
+ 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0,
+ 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85,
+ 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9,
+ 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c,
+ 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0,
+ 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23,
+ 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a,
+ 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d,
+ 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00,
+ 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06,
+ 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a,
+ 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d,
+ 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2,
+ 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab,
+ 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44,
+ 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d,
+ 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94,
+ 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e,
+ 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6,
+ 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc,
+ 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c,
+ 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb,
+ 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2,
+ 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d,
+ 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50,
+ 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8,
+ 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83,
+ 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc,
+ 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81,
+ 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a,
+ 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f,
+ 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd,
+ 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a,
+ 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea,
+ 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84,
+ 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd,
+ 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0,
+ 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76,
+ 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a,
+ 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c,
+ 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa,
+ 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 0xbd, 0x9a, 0xd4,
+ 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92,
+ 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd,
+ 0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78,
+ 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15,
+ 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f,
+ 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d,
+ 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5,
+ 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0,
+ 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38,
+ 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc,
+ 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8,
+ 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75,
+ 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6,
+ 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc,
+ 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0,
+ 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e,
+ 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e,
+ 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c,
+ 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80,
+ 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc,
+ 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82,
+ 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c,
+ 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02,
+ 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e,
+ 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a,
+ 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd,
+ 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10,
+ 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3,
+ 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0,
+ 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd,
+ 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02,
+ 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63,
+ 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69,
+ 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c,
+ 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae,
+ 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8,
+ 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40,
+ 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd,
+ 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a,
+ 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80,
+ 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0,
+ 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc,
+ 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7,
+ 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1,
+ 0x5c, 0x3c, 0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1,
+ 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c,
+ 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84,
+ 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f,
+ 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05,
+ 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd,
+ 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b,
+ 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8,
+ 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33,
+ 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb,
+ 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c,
+ 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b,
+ 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b,
+ 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc,
+ 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7,
+ 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78,
+ 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1,
+ 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d,
+ 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed,
+ 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84,
+ 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb,
+ 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd,
+ 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe,
+ 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8,
+ 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a,
+ 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc,
+ 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69,
+ 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54,
+ 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9,
+ 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d,
+ 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a,
+ 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c,
+ 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99,
+ 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d,
+ 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8,
+ 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78,
+ 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b,
+ 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c,
+ 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc,
+ 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb,
+ 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d,
+ 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd,
+ 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09,
+ 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0,
+ 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8,
+ 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd,
+ 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e,
+ 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6,
+ 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29,
+ 0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d,
+ 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21,
+ 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96,
+ 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04,
+ 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd,
+ 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38,
+ 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6,
+ 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84,
+ 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd,
+ 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac,
+ 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec,
+ 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03,
+ 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba,
+ 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae,
+ 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b,
+ 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce,
+ 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c,
+ 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31,
+ 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd,
+ 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21,
+ 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb,
+ 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32,
+ 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42,
+ 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0,
+ 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c,
+ 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47,
+ 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34,
+ 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3,
+ 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c,
+ 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a,
+ 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a,
+ 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43,
+ 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d,
+ 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9,
+ 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3,
+ 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28,
+ 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d,
+ 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e,
+ 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7,
+ 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 0xcd,
+ 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d,
+ 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6,
+ 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9,
+ 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50,
+ 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd,
+ 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa,
+ 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45,
+ 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c,
+ 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd,
+ 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1,
+ 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47,
+ 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62,
+ 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d,
+ 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb,
+ 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7,
+ 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77,
+ 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb,
+ 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73,
+ 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66,
+ 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82,
+ 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d,
+ 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07,
+ 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01,
+ 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34,
+ 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd,
+ 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3,
+ 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4,
+ 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95,
+ 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c,
+ 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0,
+ 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4,
+ 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f,
+ 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c,
+ 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd,
+ 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4,
+ 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16,
+ 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d,
+ 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92,
+ 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb,
+ 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95,
+ 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c,
+ 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58,
+ 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd,
+ 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc,
+ 0xbd, 0x22, 0xf9, 0x61, 0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd,
+ 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24,
+ 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14,
+ 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43,
+ 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd,
+ 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42,
+ 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67,
+ 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5,
+ 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd,
+ 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27,
+ 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b,
+ 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12,
+ 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc,
+ 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52,
+ 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f,
+ 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f,
+ 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b,
+ 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50,
+ 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35,
+ 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65,
+ 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c,
+ 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c,
+ 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff,
+ 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31,
+ 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c,
+ 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00,
+ 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48,
+ 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce,
+ 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c,
+ 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8,
+ 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57,
+ 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e,
+ 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc,
+ 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec,
+ 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c,
+ 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d,
+ 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc,
+ 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67,
+ 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc,
+ 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29,
+ 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd,
+ 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02,
+ 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24,
+ 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46,
+ 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 0x53, 0x7b, 0xa6, 0x3d,
+ 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b,
+ 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23,
+ 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f,
+ 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc,
+ 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad,
+ 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6,
+ 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a,
+ 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d,
+ 0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11,
+ 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49,
+ 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e,
+ 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc,
+ 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87,
+ 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02,
+ 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d,
+ 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb,
+ 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e,
+ 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd,
+ 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c,
+ 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d,
+ 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89,
+ 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08,
+ 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7,
+ 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc,
+ 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e,
+ 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f,
+ 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26,
+ 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b,
+ 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a,
+ 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52,
+ 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a,
+ 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd,
+ 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99,
+ 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98,
+ 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42,
+ 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd,
+ 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05,
+ 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca,
+ 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37,
+ 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c,
+ 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9,
+ 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e,
+ 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a,
+ 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d,
+ 0x1b, 0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9,
+ 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb,
+ 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1,
+ 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd,
+ 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35,
+ 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1,
+ 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e,
+ 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc,
+ 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b,
+ 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70,
+ 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44,
+ 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc,
+ 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac,
+ 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34,
+ 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b,
+ 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb,
+ 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10,
+ 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1,
+ 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76,
+ 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd,
+ 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15,
+ 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b,
+ 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47,
+ 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd,
+ 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9,
+ 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50,
+ 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54,
+ 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd,
+ 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84,
+ 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e,
+ 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c,
+ 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c,
+ 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3,
+ 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9,
+ 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18,
+ 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd,
+ 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f,
+ 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01,
+ 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81,
+ 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d,
+ 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5,
+ 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4,
+ 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24,
+ 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc,
+ 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6,
+ 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b,
+ 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57,
+ 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd,
+ 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f,
+ 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85,
+ 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73,
+ 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc,
+ 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe,
+ 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0,
+ 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb,
+ 0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd,
+ 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a,
+ 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1,
+ 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc,
+ 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d,
+ 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88,
+ 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37,
+ 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0,
+ 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c,
+ 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f,
+ 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70,
+ 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4,
+ 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd,
+ 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b,
+ 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84,
+ 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b,
+ 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b,
+ 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c,
+ 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69,
+ 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3,
+ 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c,
+ 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d,
+ 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70,
+ 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26,
+ 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b,
+ 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c,
+ 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa,
+ 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f,
+ 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d,
+ 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87,
+ 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1,
+ 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8,
+ 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc,
+ 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 0xbc, 0xb1,
+ 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4,
+ 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65,
+ 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc,
+ 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07,
+ 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9,
+ 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b,
+ 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd,
+ 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f,
+ 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69,
+ 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4,
+ 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d,
+ 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8,
+ 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae,
+ 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad,
+ 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d,
+ 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a,
+ 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c,
+ 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5,
+ 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d,
+ 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d,
+ 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d,
+ 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24,
+ 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d,
+ 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca,
+ 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44,
+ 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd,
+ 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d,
+ 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50,
+ 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05,
+ 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0,
+ 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c,
+ 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa,
+ 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d,
+ 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96,
+ 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c,
+ 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e,
+ 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55,
+ 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93,
+ 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d,
+ 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb,
+ 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab,
+ 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6,
+ 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc,
+ 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b,
+ 0x90, 0x1a, 0xbb, 0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d,
+ 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83,
+ 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c,
+ 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03,
+ 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b,
+ 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39,
+ 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd,
+ 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15,
+ 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46,
+ 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00,
+ 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd,
+ 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64,
+ 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11,
+ 0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a,
+ 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d,
+ 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09,
+ 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f,
+ 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e,
+ 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c,
+ 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47,
+ 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88,
+ 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9,
+ 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd,
+ 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84,
+ 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19,
+ 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8,
+ 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc,
+ 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d,
+ 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6,
+ 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e,
+ 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc,
+ 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f,
+ 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93,
+ 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25,
+ 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d,
+ 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2,
+ 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef,
+ 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8,
+ 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd,
+ 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d,
+ 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8,
+ 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82,
+ 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c,
+ 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54,
+ 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 0x95, 0x8d, 0x3d, 0xd1, 0x4f,
+ 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06,
+ 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc,
+ 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a,
+ 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44,
+ 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d,
+ 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d,
+ 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e,
+ 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d,
+ 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57,
+ 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d,
+ 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6,
+ 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2,
+ 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84,
+ 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd,
+ 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c,
+ 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76,
+ 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2,
+ 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd,
+ 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74,
+ 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a,
+ 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b,
+ 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd,
+ 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8,
+ 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4,
+ 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1,
+ 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd,
+ 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60,
+ 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46,
+ 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e,
+ 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc,
+ 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c,
+ 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26,
+ 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f,
+ 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc,
+ 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7,
+ 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d,
+ 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85,
+ 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb,
+ 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a,
+ 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0,
+ 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b,
+ 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d,
+ 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24,
+ 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66,
+ 0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22,
+ 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9,
+ 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71,
+ 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5,
+ 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46,
+ 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c,
+ 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a,
+ 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70,
+ 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6,
+ 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd,
+ 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a,
+ 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc,
+ 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c,
+ 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c,
+ 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2,
+ 0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a,
+ 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a,
+ 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c,
+ 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69,
+ 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a,
+ 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b,
+ 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc,
+ 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5,
+ 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6,
+ 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67,
+ 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d,
+ 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03,
+ 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a,
+ 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92,
+ 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd,
+ 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60,
+ 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78,
+ 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43,
+ 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc,
+ 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50,
+ 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46,
+ 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05,
+ 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d,
+ 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef,
+ 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2,
+ 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77,
+ 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc,
+ 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2,
+ 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36,
+ 0x77, 0x3d, 0xd9, 0xb5, 0x27, 0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74,
+ 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d,
+ 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce,
+ 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b,
+ 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d,
+ 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc,
+ 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8,
+ 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7,
+ 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14,
+ 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c,
+ 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11,
+ 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84,
+ 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71,
+ 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d,
+ 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08,
+ 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92,
+ 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a,
+ 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c,
+ 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7,
+ 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7,
+ 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a,
+ 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c,
+ 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1,
+ 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d,
+ 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a,
+ 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd,
+ 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0,
+ 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe,
+ 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44,
+ 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd,
+ 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19,
+ 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7,
+ 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec,
+ 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc,
+ 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4,
+ 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3,
+ 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e,
+ 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd,
+ 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e,
+ 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e,
+ 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38,
+ 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd,
+ 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6,
+ 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d,
+ 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 0x0a, 0xad, 0x8d,
+ 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd,
+ 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab,
+ 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50,
+ 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30,
+ 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd,
+ 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9,
+ 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab,
+ 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b,
+ 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc,
+ 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8,
+ 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c,
+ 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca,
+ 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d,
+ 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c,
+ 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe,
+ 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45,
+ 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d,
+ 0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda,
+ 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa,
+ 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90,
+ 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b,
+ 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72,
+ 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0,
+ 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee,
+ 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c,
+ 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77,
+ 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22,
+ 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62,
+ 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd,
+ 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81,
+ 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5,
+ 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94,
+ 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d,
+ 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09,
+ 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9,
+ 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24,
+ 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc,
+ 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4,
+ 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16,
+ 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86,
+ 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc,
+ 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a,
+ 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde,
+ 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb,
+ 0x3c, 0x51, 0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd,
+ 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10,
+ 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c,
+ 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe,
+ 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d,
+ 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9,
+ 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4,
+ 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46,
+ 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d,
+ 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b,
+ 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01,
+ 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17,
+ 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd,
+ 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41,
+ 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03,
+ 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4,
+ 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd,
+ 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47,
+ 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe,
+ 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90,
+ 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc,
+ 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d,
+ 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c,
+ 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05,
+ 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b,
+ 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd,
+ 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1,
+ 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8,
+ 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c,
+ 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13,
+ 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e,
+ 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18,
+ 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc,
+ 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77,
+ 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34,
+ 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b,
+ 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d,
+ 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24,
+ 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63,
+ 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31,
+ 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d,
+ 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb,
+ 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f,
+ 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36,
+ 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d,
+ 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4,
+ 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78,
+ 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3,
+ 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd,
+ 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d,
+ 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87,
+ 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80,
+ 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd,
+ 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7,
+ 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d,
+ 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89,
+ 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc,
+ 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61,
+ 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26,
+ 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77,
+ 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd,
+ 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80,
+ 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f,
+ 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58,
+ 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c,
+ 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf,
+ 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf,
+ 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c,
+ 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc,
+ 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3,
+ 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6,
+ 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac,
+ 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd,
+ 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3,
+ 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39,
+ 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43,
+ 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d,
+ 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13,
+ 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87,
+ 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6,
+ 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d,
+ 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c,
+ 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04,
+ 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07,
+ 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd,
+ 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa,
+ 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4,
+ 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34,
+ 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 0x3d,
+ 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1,
+ 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55,
+ 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86,
+ 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d,
+ 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62,
+ 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5,
+ 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40,
+ 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc,
+ 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66,
+ 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda,
+ 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c,
+ 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c,
+ 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45,
+ 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55,
+ 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13,
+ 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd,
+ 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4,
+ 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93,
+ 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34,
+ 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d,
+ 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5,
+ 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6,
+ 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b,
+ 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc,
+ 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13,
+ 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50,
+ 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00,
+ 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0,
+ 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20,
+ 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5,
+ 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4,
+ 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59,
+ 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65,
+ 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd,
+ 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf,
+ 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a,
+ 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a,
+ 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1,
+ 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf,
+ 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20,
+ 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c,
+ 0x2e, 0x73, 0x1a, 0xbc, 0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9,
+ 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16,
+ 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23,
+ 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c,
+ 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e,
+ 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a,
+ 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30,
+ 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d,
+ 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00,
+ 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00,
+ 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c,
+ 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d,
+ 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64,
+ 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00,
+ 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00,
+ 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c,
+ 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe,
+ 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00,
+ 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff,
+ 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13,
+ 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00,
+ 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+ 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00,
+ 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a,
+ 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+ 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00,
+ 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0,
+ 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+ 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff,
+ 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03,
+ 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c,
+ 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00,
+ 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00,
+ 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08,
+ 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00,
+ 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00,
+ 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c,
+ 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04,
+ 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00,
+ 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00,
+ 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00,
+ 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69,
+ 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00,
+ 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+ 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8,
+ 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64,
+ 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75,
+ 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
+ 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80,
+ 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66,
+ 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73,
+ 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00,
+ 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88,
+ 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70,
+ 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f,
+ 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00,
+ 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33,
+ 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64,
+ 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+ 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24,
+ 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00,
+ 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff,
+ 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c,
+ 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78,
+ 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34,
+ 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00,
+ 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73,
+ 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65,
+ 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c,
+ 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71,
+ 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f,
+ 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76,
+ 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00,
+ 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67,
+ 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f,
+ 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00,
+ 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff,
+ 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d,
+ 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff,
+ 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75,
+ 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e,
+ 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32,
+ 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69,
+ 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e,
+ 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b,
+ 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 0x1f, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff,
+ 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32,
+ 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd,
+ 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61,
+ 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00,
+ 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f,
+ 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00,
+ 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
+ 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22,
+ 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65,
+ 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4,
+ 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00,
+ 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 0000000000..2fab99dd8b
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Row analysis: splits x[0..length) into length / 2 lowpass and length / 2 highpass outputs. Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+ // Pass 1: lowpass = 2 * even sample; highpass = odd sample - rounded mean of its two even neighbours.
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {  // all pairs but the last; length < 2 would drive n negative here
+ *a++ = (r = *x++) * 2;
+ *b++ = *x - ((r + x[1] + 1) >> 1);
+ x++;
+ }
+ *a = (r = *x++) * 2;
+ *b = *x - r;  // last pair has no right neighbour, so only the left one is subtracted
+ // Pass 2: add to each lowpass value the rounded mean of the highpass values on either side of it.
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;  // the first output has no left neighbour, so highpass[0] is used twice
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+ // Column counterpart of analysis_53_row. Pass 1: lowpass = even sample (not doubled); highpass = (2 * odd - left - right + 2) >> 2.
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {  // all pairs but the last; length < 2 would drive n negative here
+ *a++ = (r = *x++);
+ *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+ x++;
+ }
+ *a = (r = *x++);
+ *b = (*x - r + 1) >> 1;  // last pair has no right neighbour, so only the left one is subtracted
+ // Pass 2: add to each lowpass value the rounded mean of the highpass values on either side of it.
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;  // the first output has no left neighbour, so highpass[0] is used twice
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+ const uint8_t *x, int pitch_x,
+ tran_low_t *c, int pitch_c,
+ int dwt_scale_bits, int hbd) {
+ int lv, i, j, nh, nw, hh = height, hw = width;
+ tran_low_t buffer[2 * DWT_MAX_LENGTH];  // scratch: a row pass uses nw entries, a column pass uses 2 * nh
+ // Copy the width x height source block into c, left-shifted by dwt_scale_bits. With hbd set, x goes through CONVERT_TO_SHORTPTR and is read as uint16_t samples.
+ if (hbd) {
+ const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ }
+ // Each level transforms the current nw x nh top-left region in place (rows, then columns); the next level works on its hw x hh lowpass corner.
+ for (lv = 0; lv < levels; lv++) {
+ nh = hh;
+ hh = (hh + 1) >> 1;
+ nw = hw;
+ hw = (hw + 1) >> 1;
+ if ((nh < 2) || (nw < 2)) return;  // stop early once a dimension is too small to split
+ for (i = 0; i < nh; i++) {
+ memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));  // the row is rewritten in place, so read from a copy
+ analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);  // lowpass from column 0, highpass from column hw
+ }
+ for (j = 0; j < nw; j++) {
+ for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];  // gather the column into the upper part of buffer
+ analysis_53_col(nh, buffer + nh, buffer, buffer + hh);  // lowpass to buffer[0..), highpass from buffer[hh]
+ for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+ }
+ }
+}
+
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd) {  // 8x8 forward transform; output is written with pitch 8
+ dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);  // levels=4, width=8, height=8, pitch_c=8, dwt_scale_bits=2
+}
+
+static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
+  // Accumulate |coeff| for every position outside the top-left quadrant.
+  const int half_w = bw / 2, half_h = bh / 2;
+  int total = 0;
+  for (int row = 0; row < bh; ++row)
+    for (int col = (row < half_h) ? half_w : 0; col < bw; ++col)
+      total += abs(output[row * stride + col]);
+  return total;
+}
+
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+                                       int hbd) {
+  enum { kDim = 8 };  // block is kDim x kDim, coefficients stored with pitch kDim
+  tran_low_t coeffs[kDim * kDim];
+  av1_fdwt8x8_uint8_input_c(input, coeffs, stride, hbd);
+  return haar_ac_sad(coeffs, kDim, kDim, kDim);
+}
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols) {
+  // Sum the per-block AC SAD over a num_8x8_rows x num_8x8_cols grid of 8x8s.
+  int64_t sum = 0;
+  for (int brow = 0; brow < num_8x8_rows; ++brow)
+    for (int bcol = 0; bcol < num_8x8_cols; ++bcol) {
+      const uint8_t *const blk = input + bcol * 8 + brow * 8 * stride;
+      sum += haar_ac_sad_8x8_uint8_input(blk, stride, hbd);
+    }
+  return sum;
+}
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..443b6bc12c
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
+#define DWT_MAX_LENGTH 64
+// Forward 8x8 DWT of `input` (row stride `stride`); writes 64 coefficients to `output` with pitch 8. Non-zero hbd reads the input as 16-bit samples.
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd);
+// Over a num_8x8_rows x num_8x8_cols grid of 8x8 blocks starting at `input`, sums the absolute 8x8 DWT coefficients that lie outside each block's top-left 4x4.
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+ int hbd, int num_8x8_rows,
+ int num_8x8_cols);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h
new file mode 100644
index 0000000000..20cefa16a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/enc_enums.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_
+#define AOM_AV1_ENCODER_ENC_ENUMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code.
+enum {
+ THR_NEARESTMV,  // first entry; SINGLE_REF_MODE_START and THR_MODE_START alias it
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,  // first THR_COMP_* entry; SINGLE_REF_MODE_END aliases it
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,  // THR_INTER_MODE_END aliases this entry, so the inter range stops just before it
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,  // follows the last mode, so it equals the number of mode entries above
+ SINGLE_REF_MODE_START = THR_NEARESTMV,
+ SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,  // exclusive end of the single-reference range
+ NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+ THR_MODE_START = THR_NEARESTMV,
+ THR_MODE_END = MAX_MODES,  // exclusive end of the whole mode list
+ THR_INTER_MODE_START = THR_MODE_START,
+ THR_INTER_MODE_END = THR_DC,  // exclusive end of the inter range
+ THR_INVALID = 255  // NOTE(review): presumably the largest value of the 1-byte type behind UENUM1BYTE - confirm
+} UENUM1BYTE(THR_MODES);
+
+enum {  // NOTE(review): entries look like per-reference (and reference-pair) thresholds - confirm at use sites
+ THR_LAST,
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+ THR_ALTR2,
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+ THR_COMP_GA,
+
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+
+ THR_COMP_LA2,
+ THR_COMP_L2A2,
+ THR_COMP_L3A2,
+ THR_COMP_GA2,
+
+ THR_INTRA,
+
+ MAX_REFS  // follows the last entry, so it equals the number of entries above
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+enum {  // NOTE(review): presumably selects how transform RD cost is evaluated - confirm at use sites
+ FULL_TXFM_RD,  // value 0
+ LOW_TXFM_RD,  // value 1
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {  // NOTE(review): presumably the transform-size search strategies - confirm at use sites
+ USE_FULL_RD = 0,  // explicit 0; the entries below take 1 and 2
+ USE_FAST_RD,
+ USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/third_party/aom/av1/encoder/encode_strategy.c b/third_party/aom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000000..35ca83c3f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif // CONFIG_THREE_PASS
+#include "av1/encoder/tpl_model.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
+static INLINE void set_refresh_frame_flags(
+    RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+    bool refresh_arf) {  // writes all three refresh flags in one call
+  refresh_frame->alt_ref_frame = refresh_arf;
+  refresh_frame->bwd_ref_frame = refresh_bwdref;
+  refresh_frame->golden_frame = refresh_gf;
+}
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all) {
+ // NOTE(weitinglin): Should we define another function to take care of
+ // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &cpi->ext_flags.refresh_frame;
+ cpi->rc.is_src_frame_alt_ref = 0;  // set back to 1 only by the two overlay cases below
+ // Default refresh flags per update type; arguments are (golden, bwd_ref, alt_ref).
+ switch (type) {
+ case KF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ break;
+
+ case LF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ break;
+
+ case GF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+ break;
+
+ case OVERLAY_UPDATE:
+ if (refbuf_state == REFBUF_RESET)  // a reset refreshes all three
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+ if (refbuf_state == REFBUF_RESET)  // a reset refreshes all three
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, false, false, true);
+
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case INTNL_ARF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, true, false);
+ break;
+
+ default: assert(0); break;  // unexpected update type
+ }
+ // A pending external request replaces the defaults, except during the stats-generation stage.
+ if (ext_refresh_frame_flags->update_pending &&
+ (!is_stat_generation_stage(cpi))) {
+ set_refresh_frame_flags(refresh_frame,
+ ext_refresh_frame_flags->golden_frame,
+ ext_refresh_frame_flags->bwd_ref_frame,
+ ext_refresh_frame_flags->alt_ref_frame);
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (ext_refresh_frame_flags->golden_frame)  // the three checks write the same slot, so the last true one wins
+ gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE;
+ if (ext_refresh_frame_flags->alt_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE;
+ if (ext_refresh_frame_flags->bwd_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE;
+ }
+ // force_refresh_all overrides both the defaults and any external request.
+ if (force_refresh_all)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+                                       unsigned int *const frame_flags) {
+  // OR in the output flags implied by the current frame's properties. Bits
+  // already present in *frame_flags are left untouched.
+  const int intra_only = frame_is_intra_only(cm) != 0;
+  const int s_frame = frame_is_sframe(cm) != 0;
+  const int error_res = cm->features.error_resilient_mode != 0;
+
+  if (intra_only) *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  if (s_frame) *frame_flags |= FRAMEFLAGS_SWITCH;
+  if (error_res) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+}
+
+static void set_ext_overrides(AV1_COMMON *const cm,
+ EncodeFrameParams *const frame_params,
+ ExternalFlags *const ext_flags) {
+ // Overrides the defaults with the values supplied externally through
+ // av1_update_reference() and av1_update_entropy() calls.
+ // Note: The overrides are valid only for the next frame passed
+ // to av1_encode_lowlevel()
+ // An external S-frame request replaces whatever frame type was chosen so far.
+ if (ext_flags->use_s_frame) {
+ frame_params->frame_type = S_FRAME;
+ }
+ // A pending refresh_frame_context override is applied once, then its pending flag is cleared here.
+ if (ext_flags->refresh_frame_context_pending) {
+ cm->features.refresh_frame_context = ext_flags->refresh_frame_context;
+ ext_flags->refresh_frame_context_pending = 0;
+ }
+ cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs;
+ // Start from the external error-resilient setting, then adjust it by frame type.
+ frame_params->error_resilient_mode = ext_flags->use_error_resilient;
+ // A keyframe is already error resilient, and a keyframe with
+ // error_resilient_mode interferes with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
+ // For bitstream conformance, s-frames must be error-resilient
+ frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
+}
+
+static int choose_primary_ref_frame(
+ AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Returns a reference index counted from LAST_FRAME (ref_frame - LAST_FRAME), or PRIMARY_REF_NONE.
+ const int intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ if (intra_only || frame_params->error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ return PRIMARY_REF_NONE;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {  // the wanted buffer comes from gf_group.primary_ref_idx
+ int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index];
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb)
+ return ref_frame - LAST_FRAME;  // first reference mapped to that buffer
+ }
+
+ return PRIMARY_REF_NONE;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // In large scale case, always use Last frame's frame contexts.
+ // Note(yunqing): In other cases, primary_ref_frame is chosen based on
+ // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
+ // frame bit allocation.
+ if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
+ // SVC, or an externally set reference configuration, delegates to av1_svc_primary_ref_frame().
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config)
+ return av1_svc_primary_ref_frame(cpi);
+
+ // Find the most recent reference frame with the same reference type as the
+ // current frame
+ const int current_ref_type = get_current_frame_ref_type(cpi);
+ int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type];
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index];
+ // Book keep wanted_fb of frame_parallel_level 1 frame in an FP2 set.
+ if (frame_level == 1) {
+ cpi->wanted_fb = wanted_fb;
+ }
+ // Use the wanted_fb of level 1 frame in an FP2 for a level 2 frame in the
+ // set.
+ if (frame_level == 2 &&
+ gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1);
+ wanted_fb = cpi->wanted_fb;
+ }
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+ primary_ref_frame = ref_frame - LAST_FRAME;  // no break: the last matching reference wins
+ }
+ }
+
+ return primary_ref_frame;
+}
+
+// Re-estimates the frame rate from the timestamps of the frame being
+// submitted, hands it to av1_new_framerate(), and records the timestamps in
+// cpi->time_stamps for the next call.
+static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
+  TimeStamps *const stamps = &cpi->time_stamps;
+  // Number of timestamp ticks in one second.
+  const double ticks_per_second = 10000000.0;
+  int64_t this_duration;
+  int step = 0;
+
+  if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config &&
+      cpi->svc.number_spatial_layers > 1) {
+    // ts_start is the timestamp for the current frame and ts_end is the
+    // expected next timestamp given the duration passed into codec_encode().
+    // See the setting in encoder_encode() in av1_cx_iface.c:
+    // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol),
+    // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol +
+    // duration). So the difference ts_end - ts_start is the duration passed
+    // in by the user. For spatial layers SVC set the framerate based directly
+    // on the duration, and bypass the adjustments below.
+    this_duration = ts_end - ts_start;
+    if (this_duration > 0) {
+      cpi->new_framerate = ticks_per_second / this_duration;
+      av1_new_framerate(cpi, cpi->new_framerate);
+      stamps->prev_ts_start = ts_start;
+      stamps->prev_ts_end = ts_end;
+      return;
+    }
+  }
+
+  const int is_first_frame = (ts_start == stamps->first_ts_start);
+  if (is_first_frame) {
+    this_duration = ts_end - ts_start;
+    step = 1;
+  } else {
+    const int64_t last_duration = stamps->prev_ts_end - stamps->prev_ts_start;
+
+    this_duration = ts_end - stamps->prev_ts_end;
+
+    // do a step update if the duration changes by 10%
+    if (last_duration != 0)
+      step = (int)((this_duration - last_duration) * 10 / last_duration);
+  }
+
+  if (this_duration != 0) {
+    if (step != 0) {
+      cpi->new_framerate = ticks_per_second / this_duration;
+      av1_new_framerate(cpi, cpi->new_framerate);
+    } else {
+      // Average this frame's rate into the last second's average
+      // frame rate. If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double elapsed = (double)(ts_end - stamps->first_ts_start);
+      const double interval = AOMMIN(elapsed, ticks_per_second);
+      double avg_duration = ticks_per_second / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+      cpi->new_framerate = (ticks_per_second / avg_duration);
+      // For parallel frames update cpi->framerate with new_framerate
+      // during av1_post_encode_updates()
+      const int is_parallel_frame =
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0;
+      const double framerate =
+          is_parallel_frame ? cpi->framerate : cpi->new_framerate;
+      av1_new_framerate(cpi, framerate);
+    }
+  }
+
+  stamps->prev_ts_start = ts_start;
+  stamps->prev_ts_end = ts_end;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead
+// buffer.
+//
+// Inspects lookahead entries 0..up_to_index (inclusive) for the given
+// compressor stage.
+// Returns the index of the first entry whose flags have AOM_EFLAG_FORCE_KF
+// set, or -1 when no such entry is found (including when the lookahead
+// buffer ends before up_to_index).
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage) {
+  for (int i = 0; i <= up_to_index; i++) {
+    const struct lookahead_entry *e =
+        av1_lookahead_peek(lookahead, i, compressor_stage);
+    // We have reached the end of the lookahead buffer and not early-returned
+    // so there isn't a forced key-frame pending.
+    if (e == NULL) return -1;
+    // flags is a bit mask: test the bit rather than comparing the whole
+    // word, so a forced key frame that also carries other per-frame flags is
+    // still detected.
+    if (e->flags & AOM_EFLAG_FORCE_KF) return i;
+  }
+  // Every entry up to up_to_index was inspected without finding a forced
+  // key frame (or up_to_index was negative).
+  return -1;
+}
+
+// Chooses the lookahead entry to encode next: the ARF / internal ARF source
+// when the GF group calls for one, otherwise the frame at the head of the
+// lookahead buffer. Also performs some setup tied to the chosen source.
+// *flush, *pop_lookahead, *last_source and *show_frame are outputs.
+// Returns the frame source, or NULL if one could not be found.
+static struct lookahead_entry *choose_frame_source(
+    AV1_COMP *const cpi, int *const flush, int *pop_lookahead,
+    struct lookahead_entry **last_source, int *const show_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int gf_idx = cpi->gf_frame_index;
+
+  // Source index in lookahead buffer.
+  int src_index = gf_group->arf_src_offset[gf_idx];
+
+  // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
+  if (src_index != 0 &&
+      is_forced_keyframe_pending(cpi->ppi->lookahead, src_index,
+                                 cpi->compressor_stage) != -1 &&
+      cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) {
+    src_index = 0;
+    *flush = 1;
+  }
+
+  // If the current frame is arf, then we should not pop from the lookahead
+  // buffer. If the current frame is not arf, then pop it. This assumes the
+  // first frame in the GF group is not arf. May need to change if it is not
+  // true.
+  *pop_lookahead = (src_index == 0);
+
+  // If this is a key frame and keyframe filtering is enabled with overlay,
+  // then do not pop.
+  if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 &&
+      gf_group->update_type[gf_idx] == ARF_UPDATE &&
+      !is_stat_generation_stage(cpi) && cpi->ppi->lookahead &&
+      cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz &&
+      (*flush ||
+       cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz ==
+           cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) {
+    *pop_lookahead = 0;
+  }
+
+  // LAP stage does not have ARFs or forward key-frames,
+  // hence, always pop_lookahead here.
+  if (is_stat_generation_stage(cpi)) {
+    *pop_lookahead = 1;
+    src_index = 0;
+  }
+
+  *show_frame = *pop_lookahead;
+
+  // Future frame in parallel encode set. With CONFIG_FPMT_TEST the offset
+  // is only honoured when the unit-test configuration is PARALLEL_ENCODE.
+#if CONFIG_FPMT_TEST
+  const int honour_src_offset =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE);
+#else
+  const int honour_src_offset = 1;
+#endif  // CONFIG_FPMT_TEST
+  if (honour_src_offset && gf_group->src_offset[gf_idx] != 0 &&
+      !is_stat_generation_stage(cpi)) {
+    src_index = gf_group->src_offset[gf_idx];
+  }
+
+  // A shown frame also reports the entry just before it as the last source
+  // (skipped for the very first frame).
+  if (*show_frame && cm->current_frame.frame_number > 0) {
+    *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1,
+                                      cpi->compressor_stage);
+  }
+
+  // Read in the source frame.
+  struct lookahead_entry *const source = av1_lookahead_peek(
+      cpi->ppi->lookahead, src_index, cpi->compressor_stage);
+
+  // no show frames are arf frames
+  if (!*show_frame && source != NULL) cm->showable_frame = 1;
+
+  return source;
+}
+
+// Decides whether a show_existing_frame may be emitted now. It must not
+// coincide with an error resilient frame or an S-Frame, except when the
+// frame is a keyframe, which does not depend on any previous frames.
+// Returns 1 when allowed, 0 otherwise.
+static int allow_show_existing(const AV1_COMP *const cpi,
+                               unsigned int frame_flags) {
+  // Never on the very first frame.
+  if (cpi->common.current_frame.frame_number == 0) return 0;
+
+  const struct lookahead_entry *const next_src =
+      av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+  // With no source at the head of the lookahead there is nothing to conflict
+  // with.
+  if (next_src == NULL) return 1;
+
+  // Keyframes are exempt from the restrictions below.
+  if (cpi->rc.frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) return 1;
+
+  if (cpi->oxcf.tool_cfg.error_resilient_mode ||
+      (next_src->flags & AOM_EFLAG_ERROR_RESILIENT))
+    return 0;
+
+  if (cpi->oxcf.kf_cfg.enable_sframe ||
+      (next_src->flags & AOM_EFLAG_SET_S_FRAME))
+    return 0;
+
+  return 1;
+}
+
+// Rewrites the GOLDEN / BWDREF / ALTREF / KEY bits of *frame_flags so the
+// encoder's caller can tell what sort of frame was encoded. All other bits
+// are left untouched. For a show-existing frame all four bits are cleared.
+static void update_frame_flags(const AV1_COMMON *const cm,
+                               const RefreshFrameInfo *const refresh_frame,
+                               unsigned int *frame_flags) {
+  const uint32_t managed_bits =
+      (uint32_t)FRAMEFLAGS_GOLDEN | (uint32_t)FRAMEFLAGS_BWDREF |
+      (uint32_t)FRAMEFLAGS_ALTREF | (uint32_t)FRAMEFLAGS_KEY;
+  uint32_t reported_bits = 0;
+
+  if (!encode_show_existing_frame(cm)) {
+    if (refresh_frame->golden_frame)
+      reported_bits |= (uint32_t)FRAMEFLAGS_GOLDEN;
+    if (refresh_frame->alt_ref_frame)
+      reported_bits |= (uint32_t)FRAMEFLAGS_ALTREF;
+    if (refresh_frame->bwd_ref_frame)
+      reported_bits |= (uint32_t)FRAMEFLAGS_BWDREF;
+    if (cm->current_frame.frame_type == KEY_FRAME)
+      reported_bits |= (uint32_t)FRAMEFLAGS_KEY;
+  }
+
+  // Clear the four managed bits, then set the ones that apply.
+  *frame_flags = (*frame_flags & ~managed_bits) | reported_bits;
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+// Debug helper: writes the Y, U and V planes of ref_buf to file_name, one
+// row at a time. Luma rows are cm->width bytes wide over cm->height rows;
+// chroma rows use half of each dimension.
+// Returns AOM_CODEC_OK on success, or AOM_CODEC_MEM_ERROR when ref_buf is
+// NULL or the file cannot be opened.
+static int dump_one_image(AV1_COMMON *cm,
+                          const YV12_BUFFER_CONFIG *const ref_buf,
+                          char *file_name) {
+  if (ref_buf == NULL) {
+    printf("Frame data buffer is NULL.\n");
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  FILE *const out = fopen(file_name, "wb");
+  if (out == NULL) {
+    printf("Unable to open file %s to write.\n", file_name);
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  // --- Y ---
+  for (int row = 0; row < cm->height; ++row)
+    fwrite(ref_buf->y_buffer + row * ref_buf->y_stride, 1, cm->width, out);
+
+  // --- U ---
+  for (int row = 0; row < (cm->height >> 1); ++row)
+    fwrite(ref_buf->u_buffer + row * ref_buf->uv_stride, 1, (cm->width >> 1),
+           out);
+
+  // --- V ---
+  for (int row = 0; row < (cm->height >> 1); ++row)
+    fwrite(ref_buf->v_buffer + row * ref_buf->uv_stride, 1, (cm->width >> 1),
+           out);
+
+  fclose(out);
+
+  return AOM_CODEC_OK;
+}
+
+// Debug helper: dumps every named reference frame (LAST_FRAME through
+// ALTREF_FRAME) of the current frame to its own file under /tmp.
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  MV_REFERENCE_FRAME ref = LAST_FRAME;
+  while (ref <= ALTREF_FRAME) {
+    char path[256] = "";
+    snprintf(path, sizeof(path), "/tmp/enc_F%d_ref_%d.yuv",
+             cm->current_frame.frame_number, ref);
+    dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref), path);
+    ++ref;
+  }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+// Returns the index of the lowest set bit among the first REF_FRAMES bits of
+// refresh_frame_flags, or INVALID_IDX when none of them is set.
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    if ((refresh_frame_flags >> slot) & 1) return slot;
+  }
+  return INVALID_IDX;
+}
+
+// Returns the first slot of ref_map_pairs whose disp_order is -1 (i.e. not
+// holding a frame), or INVALID_IDX when every slot is occupied.
+static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) {
+  int slot = 0;
+  while (slot < REF_FRAMES) {
+    if (ref_map_pairs[slot].disp_order == -1) return slot;
+    ++slot;
+  }
+  return INVALID_IDX;
+}
+
+// Picks the reference buffer slot to overwrite with the current frame when
+// no slot is free.
+//
+// Only slots holding frames at least three display positions behind
+// cur_frame_disp are candidates. When enable_refresh_skip is set, frames
+// whose display order is listed in gf_group->skip_frame_refresh[gf_index]
+// are excluded as well.
+//
+// update_arf: non-zero when the current frame is an ARF; in that case the
+//             oldest level 1 frame is evicted once more than two level 1
+//             candidates exist.
+//
+// Returns the chosen map index, or -1 when no slot is eligible (this also
+// trips an assert in debug builds).
+static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                           int update_arf, GF_GROUP *gf_group, int gf_index,
+                           int enable_refresh_skip, int cur_frame_disp) {
+  int arf_count = 0;
+  int oldest_arf_order = INT32_MAX;
+  int oldest_arf_idx = -1;
+
+  int oldest_frame_order = INT32_MAX;
+  int oldest_idx = -1;
+
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+    if (ref_pair.disp_order == -1) continue;
+    const int frame_order = ref_pair.disp_order;
+    const int reference_frame_level = ref_pair.pyr_level;
+    // Keep future frames and three closest previous frames in output order.
+    if (frame_order > cur_frame_disp - 3) continue;
+
+    if (enable_refresh_skip) {
+      int skip_frame = 0;
+      // Prevent refreshing a frame in gf_group->skip_frame_refresh.
+      for (int i = 0; i < REF_FRAMES; i++) {
+        int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i];
+        if (frame_to_skip == INVALID_IDX) break;
+        if (frame_order == frame_to_skip) {
+          skip_frame = 1;
+          break;
+        }
+      }
+      if (skip_frame) continue;
+    }
+
+    // Level 1 frames are tracked separately from all other candidates.
+    if (reference_frame_level == 1) {
+      // Remember the oldest level 1 frame and count how many there are.
+      if (frame_order < oldest_arf_order) {
+        oldest_arf_order = frame_order;
+        oldest_arf_idx = map_idx;
+      }
+      arf_count++;
+      continue;
+    }
+
+    // Update the overall oldest reference frame.
+    if (frame_order < oldest_frame_order) {
+      oldest_frame_order = frame_order;
+      oldest_idx = map_idx;
+    }
+  }
+
+  // If there are more than 2 level 1 frames in the reference list and the
+  // current frame is an ARF, discard the oldest of them.
+  if (update_arf && arf_count > 2) return oldest_arf_idx;
+  // Otherwise prefer the oldest non-level-1 frame, falling back to the
+  // oldest level 1 frame.
+  if (oldest_idx >= 0) return oldest_idx;
+  if (oldest_arf_idx >= 0) return oldest_arf_idx;
+
+  // Neither kind of candidate was found: both indices are still -1, which
+  // also means arf_count is 0. Callers shift by the returned value, so this
+  // is treated as a logic error.
+  assert(0 && "No valid refresh index found");
+  return -1;
+}
+
+// Computes the reference refresh index for an INTNL_ARF_UPDATE frame.
+// A free reference slot is used when one exists; otherwise the slot is
+// chosen by get_refresh_idx() using the display index of the frame at
+// gf_index.
+int av1_calc_refresh_idx_for_intnl_arf(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+    int gf_index) {
+  // Search for the open slot to store the current frame.
+  const int free_slot = get_free_ref_map_index(ref_frame_map_pairs);
+  if (free_slot != INVALID_IDX) return free_slot;
+
+  // No free slot: an existing reference has to be replaced.
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+  return get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index,
+                         enable_refresh_skip, gf_group->display_idx[gf_index]);
+}
+
+// Returns flag shifted to the bit of the buffer slot currently mapped to
+// ref_frame, or 0 when that reference has no valid slot.
+static int ext_refresh_bit(const AV1_COMMON *const cm, int ref_frame,
+                           int flag) {
+  const int slot = get_ref_frame_map_idx(cm, ref_frame);
+  return (slot == INVALID_IDX) ? 0 : (flag << slot);
+}
+
+// Computes the refresh_frame_flags bit mask for the current frame: bit i set
+// means reference buffer slot i is overwritten by this frame.
+int av1_get_refresh_frame_flags(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const ExtRefreshFrameFlagsInfo *const ext_flags =
+      &cpi->ext_flags.refresh_frame;
+  const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+
+  // A reference buffer reset overwrites every slot.
+  // TODO(jingning): Deprecate the following operations.
+  // Switch frames and shown key-frames overwrite all reference slots
+  if (gf_group->refbuf_state[gf_index] == REFBUF_RESET ||
+      frame_params->frame_type == S_FRAME)
+    return SELECT_ALL_BUF_SLOTS;
+
+  // show_existing_frames don't actually send refresh_frame_flags so set the
+  // flags to 0 to keep things consistent. Droppable frames refresh nothing
+  // either.
+  if (frame_params->show_existing_frame ||
+      is_frame_droppable(rtc_ref, ext_flags))
+    return 0;
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->use_ducky_encode &&
+      cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+    // The GF group states directly which slot (if any) to refresh.
+    const int new_fb_map_idx = gf_group->update_ref_idx[gf_index];
+    return (new_fb_map_idx == INVALID_IDX) ? 0 : (1 << new_fb_map_idx);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  if (ext_flags->update_pending) {
+    int mask = 0;
+
+    if (rtc_ref->set_ref_frame_config ||
+        use_rtc_reference_structure_one_layer(cpi)) {
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        const int slot = rtc_ref->ref_idx[i];
+        mask |= rtc_ref->refresh[slot] << slot;
+      }
+      return mask;
+    }
+
+    // Unfortunately the encoder interface reflects the old refresh_*_frame
+    // flags so we have to replicate the old refresh_frame_flags logic here in
+    // order to preserve the behaviour of the flag overrides.
+    const AV1_COMMON *const cm = &cpi->common;
+    mask |= ext_refresh_bit(cm, LAST_FRAME, ext_flags->last_frame);
+    mask |= ext_refresh_bit(cm, EXTREF_FRAME, ext_flags->bwd_ref_frame);
+    mask |= ext_refresh_bit(cm, ALTREF2_FRAME, ext_flags->alt2_ref_frame);
+
+    if (frame_update_type == OVERLAY_UPDATE) {
+      // For an overlay the golden flag is applied to the ALTREF slot.
+      mask |= ext_refresh_bit(cm, ALTREF_FRAME, ext_flags->golden_frame);
+    } else {
+      mask |= ext_refresh_bit(cm, GOLDEN_FRAME, ext_flags->golden_frame);
+      mask |= ext_refresh_bit(cm, ALTREF_FRAME, ext_flags->alt_ref_frame);
+    }
+    return mask;
+  }
+
+  // No refresh necessary for these frame types.
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE)
+    return 0;
+
+  // If there is an open slot, refresh that one instead of replacing a
+  // reference.
+  const int free_slot = get_free_ref_map_index(ref_frame_map_pairs);
+  if (free_slot != INVALID_IDX) return 1 << free_slot;
+
+  // Otherwise pick an existing reference to replace.
+  const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+  const int refresh_idx = get_refresh_idx(
+      ref_frame_map_pairs, frame_update_type == ARF_UPDATE, gf_group, gf_index,
+      enable_refresh_skip, cur_disp_order);
+  return 1 << refresh_idx;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Prepares the main thread's macroblock state (cpi->td.mb) for src: sets up
+// the source planes and the block planes, then sets the mode-info offsets,
+// passing 0, 0 as the position arguments in each case.
+void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const mb = &cpi->td.mb;
+  const int planes = av1_num_planes(cm);
+
+  av1_setup_src_planes(mb, src, 0, 0, planes, cm->seq_params->sb_size);
+
+  av1_setup_block_planes(&mb->e_mbd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, planes);
+
+  set_mi_offsets(&cm->mi_params, &mb->e_mbd, 0, 0);
+}
+
+// Apply temporal filtering to source frames and encode the filtered frame.
+// If the current frame does not require filtering, this function is identical
+// to av1_encode() except that tpl is not performed.
+//
+// dest, frame_input, frame_params and frame_results are forwarded to
+// av1_encode(). frame_input->source may be redirected to a filtered buffer
+// before encoding; the original source pointer is kept locally and restored
+// into cpi for PSNR calculation afterwards.
+// Returns AOM_CODEC_OK on success, or AOM_CODEC_ERROR if av1_encode() fails.
+static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time);
+#endif
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int is_second_arf =
+ av1_gop_is_second_arf(gf_group, cpi->gf_frame_index);
+
+ // Decide whether to apply temporal filtering to the source frame.
+ int apply_filtering =
+ av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi);
+ // Only KF_UPDATE, ARF_UPDATE and second-ARF frames are candidates.
+ if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) {
+ apply_filtering = 0;
+ }
+ if (apply_filtering) {
+ if (frame_params->frame_type == KEY_FRAME) {
+ // TODO(angiebird): Move the noise level check to av1_tf_info_filtering.
+ // Decide whether it is allowed to perform key frame filtering
+ int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering &&
+ !frame_params->show_existing_frame &&
+ !is_lossless_requested(&oxcf->rc_cfg);
+ if (allow_kf_filtering) {
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(
+ frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD);
+ // Filter the key frame only when the estimated luma noise is positive.
+ apply_filtering = y_noise_level > 0;
+ } else {
+ apply_filtering = 0;
+ }
+ // If we are doing kf filtering, set up a few things.
+ if (apply_filtering) {
+ av1_setup_past_independence(cm);
+ }
+ } else if (is_second_arf) {
+ apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering;
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
+#endif
+ // Save the pointer to the original source image.
+ YV12_BUFFER_CONFIG *source_buffer = frame_input->source;
+ // apply filtering to frame
+ if (apply_filtering) {
+ int show_existing_alt_ref = 0;
+ FRAME_DIFF frame_diff;
+ int top_index = 0;
+ int bottom_index = 0;
+ const int q_index = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+ cpi->gf_frame_index, &bottom_index, &top_index);
+
+ // TODO(bohanli): figure out why we need frame_type in cm here.
+ cm->current_frame.frame_type = frame_params->frame_type;
+ if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+ // Fetch the filtered buffer recorded in tf_info for this GF index; when
+ // one exists it replaces the source for encoding.
+ YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf(
+ &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff);
+ if (tf_buf != NULL) {
+ frame_input->source = tf_buf;
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ if (show_existing_alt_ref) {
+ cpi->common.showable_frame |= 1;
+ } else {
+ cpi->common.showable_frame = 0;
+ }
+ }
+ if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) {
+ cpi->ppi->show_existing_alt_ref = show_existing_alt_ref;
+ }
+ }
+
+ if (is_second_arf) {
+ // Allocate the memory for tf_buf_second_arf buffer, only when it is
+ // required.
+ int ret = aom_realloc_frame_buffer(
+ &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0);
+ if (ret)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate tf_buf_second_arf");
+
+ YV12_BUFFER_CONFIG *tf_buf_second_arf =
+ &cpi->ppi->tf_info.tf_buf_second_arf;
+ // We didn't apply temporal filtering for second arf ahead in
+ // av1_tf_info_filtering().
+ const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ // Right now, we are still using tf_buf_second_arf due to
+ // implementation complexity.
+ // TODO(angiebird): Reuse tf_info->tf_buf here.
+ av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff,
+ tf_buf_second_arf);
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ // The filtered second ARF replaces the source only when the check above
+ // passes; otherwise the original source is encoded.
+ if (show_existing_alt_ref) {
+ aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm));
+ frame_input->source = tf_buf_second_arf;
+ }
+ // Currently INTNL_ARF_UPDATE only do show_existing.
+ cpi->common.showable_frame |= 1;
+ }
+
+ // Copy source metadata to the temporal filtered frame
+ if (source_buffer->metadata &&
+ aom_copy_metadata_to_frame_buffer(frame_input->source,
+ source_buffer->metadata)) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to copy source metadata to the temporal filtered frame");
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time);
+#endif
+
+ // MV search parameters are refreshed for key frames and ARF/GF updates.
+ int set_mv_params = frame_params->frame_type == KEY_FRAME ||
+ update_type == ARF_UPDATE || update_type == GF_UPDATE;
+ cm->show_frame = frame_params->show_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ // TODO(bohanli): Why is this? what part of it is necessary?
+ av1_set_frame_size(cpi, cm->width, cm->height);
+ if (set_mv_params) av1_set_mv_search_params(cpi);
+
+#if CONFIG_RD_COMMAND
+ if (frame_params->frame_type == KEY_FRAME) {
+ char filepath[] = "rd_command.txt";
+ av1_read_rd_command(filepath, &cpi->rd_command);
+ }
+#endif // CONFIG_RD_COMMAND
+ // TPL setup runs only for the first frame of a GF group and not during
+ // stat generation.
+ if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) {
+ // perform tpl after filtering
+ int allow_tpl =
+ oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model;
+ if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) {
+ allow_tpl = 0;
+ }
+ if (frame_params->frame_type != KEY_FRAME) {
+ // In rare case, it's possible to have non ARF/GF update_type here.
+ // We should set allow_tpl to zero in the situation
+ allow_tpl =
+ allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+ (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL));
+ }
+
+ if (allow_tpl) {
+ if (!cpi->skip_tpl_setup_stats) {
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+ av1_tpl_setup_stats(cpi, 0, frame_params);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group, cm->seq_params->bit_depth);
+#endif
+ }
+ } else {
+ av1_init_tpl_stats(&cpi->ppi->tpl_data);
+ }
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS &&
+ cpi->second_pass_log_stream != NULL) {
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data);
+ av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ }
+
+ if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Set frame_input source to true source for psnr calculation.
+ if (apply_filtering && is_psnr_calc_enabled(cpi)) {
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0,
+ false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ cpi->unscaled_source = source_buffer;
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time);
+#endif
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+/*!\cond */
+// Struct to keep track of relevant reference frame data.
+typedef struct {
+ int map_idx; // Slot index of this frame in the reference frame map.
+ int disp_order; // Display order of the frame held in that slot.
+ int pyr_level; // Pyramid level of the frame held in that slot.
+ int used; // Non-zero once the frame is assigned to a named slot or excluded.
+} RefBufMapData;
+/*!\endcond */
+
+// qsort() comparator ordering RefBufMapData entries by ascending display
+// order. Returns -1, 0 or 1.
+static int compare_map_idx_pair_asc(const void *a, const void *b) {
+  const int lhs_order = ((const RefBufMapData *)a)->disp_order;
+  const int rhs_order = ((const RefBufMapData *)b)->disp_order;
+  return (lhs_order > rhs_order) - (lhs_order < rhs_order);
+}
+
+// Returns 1 if any of the first n_frames entries of map already has the
+// given display order, 0 otherwise.
+static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) {
+  const RefBufMapData *entry = map;
+  const RefBufMapData *const end = map + (n_frames > 0 ? n_frames : 0);
+  while (entry != end) {
+    if (entry->disp_order == disp_order) return 1;
+    ++entry;
+  }
+  return 0;
+}
+
+// Assigns the buffer described by ref to the named reference slot frame
+// (LAST_FRAME-based) and marks the buffer as used.
+static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx,
+                            int frame) {
+  const int slot = frame - LAST_FRAME;
+  ref->used = 1;
+  remapped_ref_idx[slot] = ref->map_idx;
+}
+
+// Threshold dictating when we are allowed to start considering
+// leaving lowest level frames unmapped.
+#define LOW_LEVEL_FRAMES_TR 5
+
+// Find which reference buffer should be left out of the named mapping.
+// This is because there are 8 reference buffers and only 7 named slots.
+//
+// Among the not-yet-used buffers, the one farthest in display order from
+// cur_frame_disp is marked as used so that later assignment skips it.
+// Buffers at min_level are only eligible once n_min_level_refs reaches
+// LOW_LEVEL_FRAMES_TR. Nothing is done when n_bufs does not exceed
+// ALTREF_FRAME.
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs,
+                             int n_min_level_refs, int min_level,
+                             int cur_frame_disp) {
+  int max_dist = 0;
+  int unmapped_idx = -1;
+  if (n_bufs <= ALTREF_FRAME) return;
+  for (int i = 0; i < n_bufs; i++) {
+    if (buffer_map[i].used) continue;
+    if (buffer_map[i].pyr_level != min_level ||
+        n_min_level_refs >= LOW_LEVEL_FRAMES_TR) {
+      int dist = abs(cur_frame_disp - buffer_map[i].disp_order);
+      if (dist > max_dist) {
+        max_dist = dist;
+        unmapped_idx = i;
+      }
+    }
+  }
+  assert(unmapped_idx >= 0 && "Unmapped reference not found");
+  // assert() is compiled out when NDEBUG is defined; never write through
+  // index -1 in that case.
+  if (unmapped_idx < 0) return;
+  buffer_map[unmapped_idx].used = 1;
+}
+
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]) {
+ int buf_map_idx = 0;
+
+ // Initialize reference frame mappings.
+ for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) {
+ if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) {
+ remapped_ref_idx[rf - LAST_FRAME] =
+ cpi->ppi->gf_group.ref_frame_list[gf_index][rf];
+ }
+ }
+
+ int valid_rf_idx = 0;
+ static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = {
+ GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME
+ };
+ for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) {
+ int rf = ref_frame_type_order[i];
+ if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) {
+ valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME];
+ break;
+ }
+ }
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (remapped_ref_idx[i] == INVALID_IDX) {
+ remapped_ref_idx[i] = valid_rf_idx;
+ }
+ }
+
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ RefBufMapData buffer_map[REF_FRAMES];
+ int n_bufs = 0;
+ memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0]));
+ int min_level = MAX_ARF_LAYERS;
+ int max_level = 0;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int skip_ref_unmapping = 0;
+ int is_one_pass_rt = is_one_pass_rt_params(cpi);
+
+ // Go through current reference buffers and store display order, pyr level,
+ // and map index.
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ // Get reference frame buffer.
+ RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+ if (ref_pair.disp_order == -1) continue;
+ const int frame_order = ref_pair.disp_order;
+ // Avoid duplicates.
+ if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue;
+ const int reference_frame_level = ref_pair.pyr_level;
+
+ // Keep track of the lowest and highest levels that currently exist.
+ if (reference_frame_level < min_level) min_level = reference_frame_level;
+ if (reference_frame_level > max_level) max_level = reference_frame_level;
+
+ buffer_map[n_bufs].map_idx = map_idx;
+ buffer_map[n_bufs].disp_order = frame_order;
+ buffer_map[n_bufs].pyr_level = reference_frame_level;
+ buffer_map[n_bufs].used = 0;
+ n_bufs++;
+ }
+
+ // Sort frames in ascending display order.
+ qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc);
+
+ int n_min_level_refs = 0;
+ int closest_past_ref = -1;
+ int golden_idx = -1;
+ int altref_idx = -1;
+
+ // Find the GOLDEN_FRAME and BWDREF_FRAME.
+ // Also collect various stats about the reference frames for the remaining
+ // mappings.
+ for (int i = n_bufs - 1; i >= 0; i--) {
+ if (buffer_map[i].pyr_level == min_level) {
+ // Keep track of the number of lowest level frames.
+ n_min_level_refs++;
+ if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+ remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for GOLDEN.
+ golden_idx = i;
+ } else if (buffer_map[i].disp_order > cur_frame_disp &&
+ altref_idx == -1 &&
+ remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for ALTREF.
+ altref_idx = i;
+ }
+ } else if (buffer_map[i].disp_order == cur_frame_disp) {
+ // Map the BWDREF_FRAME if this is the show_existing_frame.
+ add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
+ }
+
+ // During parallel encodes of lower layer frames, exclude the first frame
+ // (frame_parallel_level 1) from being used for the reference assignment of
+ // the second frame (frame_parallel_level 2).
+ if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+ gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+ gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+#if CONFIG_FPMT_TEST
+ is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE)
+ ? is_parallel_encode
+ : 0;
+#endif // CONFIG_FPMT_TEST
+ // If parallel cpis are active, use ref_idx_to_skip, else, use display
+ // index.
+ assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+ assert(IMPLIES(!is_parallel_encode,
+ gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+ buffer_map[i].used = is_parallel_encode
+ ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+ : (buffer_map[i].disp_order ==
+ gf_group->skip_frame_as_ref[gf_index]);
+ // In case a ref frame is excluded from being used during assignment,
+ // skip the call to set_unmapped_ref(). Applicable in steady state.
+ if (buffer_map[i].used) skip_ref_unmapping = 1;
+ }
+
+ // Keep track of where the frames change from being past frames to future
+ // frames.
+ if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+ closest_past_ref = i;
+ }
+
+ // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+ // frames have the same level.
+ if (n_min_level_refs <= n_bufs) {
+ // Map the GOLDEN_FRAME.
+ if (golden_idx > -1)
+ add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+ // Map the ALTREF_FRAME.
+ if (altref_idx > -1)
+ add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+ }
+
+ // Find the buffer to be excluded from the mapping.
+ if (!skip_ref_unmapping)
+ set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+ cur_frame_disp);
+
+ // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+ for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in decreasing ouptut order relative to current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MIN;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order > next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+ for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in increasing ouptut order relative to current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MAX;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order < next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining past frames.
+ buf_map_idx = closest_past_ref;
+ for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining future frames.
+ buf_map_idx = n_bufs - 1;
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Fill any slots that are empty (should only happen for the first 7 frames).
+ for (int i = 0; i < REF_FRAMES; ++i)
+ if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0;
+}
+
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush) {
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+ EncodeFrameInput frame_input;
+ EncodeFrameParams frame_params;
+ EncodeFrameResults frame_results;
+ memset(&frame_input, 0, sizeof(frame_input));
+ memset(&frame_params, 0, sizeof(frame_params));
+ memset(&frame_results, 0, sizeof(frame_results));
+
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info;
+ if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) {
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ av1_open_second_pass_log(cpi, 1);
+ FILE *second_pass_log_stream = cpi->second_pass_log_stream;
+ fseek(second_pass_log_stream, 0, SEEK_END);
+ size_t file_size = ftell(second_pass_log_stream);
+ rewind(second_pass_log_stream);
+ size_t read_size = 0;
+ while (read_size < file_size) {
+ THIRD_PASS_GOP_INFO gop_info;
+ struct aom_internal_error_info *error = cpi->common.error;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error);
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, second_pass_log_stream, error);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info,
+ gop_info.num_frames, error);
+ av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info);
+ read_size = ftell(second_pass_log_stream);
+ aom_free(tpl_info);
+ }
+ av1_close_second_pass_log(cpi);
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level;
+ av1_vbr_rc_compute_q_indices(
+ vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count,
+ vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth,
+ vbr_rc_info->q_index_list);
+ } else {
+ vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+ vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth,
+ vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count,
+ vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list,
+ vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+ }
+ vbr_rc_info->ready = 1;
+#if CONFIG_RATECTRL_LOG
+ rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index,
+ vbr_rc_info->total_frame_count);
+#endif // CONFIG_RATECTRL_LOG
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+
+ // When not flushing, wait until the lookahead holds enough source frames.
+ if (flush == 0) {
+ int srcbuf_size =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ int pop_size =
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage);
+
+ // Lookahead depth is below the pop size: keep buffering and return -1.
+ if (srcbuf_size < pop_size) return -1;
+ }
+
+ if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) {
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1; // The lookahead peek returned no entry, so nothing to encode.
+ }
+
+ // TODO(sarahparker) finish bit allocation for one pass pyramid
+ if (has_no_stats_stage(cpi)) {
+ gf_cfg->gf_max_pyr_height =
+ AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+ gf_cfg->gf_min_pyr_height =
+ AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
+ }
+
+ // Allocate the mi (mode info) buffers.
+ alloc_mb_mode_info_buffers(cpi);
+
+ cpi->skip_tpl_setup_stats = 0;
+#if !CONFIG_REALTIME_ONLY
+ if (oxcf->pass != AOM_RC_FIRST_PASS) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (tpl_data->tpl_stats_pool[0] == NULL) {
+ av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, 0,
+ oxcf->gf_cfg.lag_in_frames);
+ }
+ }
+ cpi->twopass_frame.this_frame = NULL;
+ const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
+ if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_second_pass_params_time);
+#endif
+
+ // Initialise frame_level_rate_correction_factors with the values from
+ // before the parallel frames.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++) {
+ cpi->rc.frame_level_rate_correction_factors[i] =
+#if CONFIG_FPMT_TEST
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+ ? cpi->ppi->p_rc.temp_rate_correction_factors[i]
+ :
+#endif // CONFIG_FPMT_TEST
+ cpi->ppi->p_rc.rate_correction_factors[i];
+ }
+ }
+
+ // Copy mv_stats from ppi to the frame-level cpi.
+ cpi->mv_stats = cpi->ppi->mv_stats;
+ av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_second_pass_params_time);
+#endif
+ }
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ // TODO(jingning): fwd key frame always uses show existing frame?
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
+ gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ frame_params.show_existing_frame = 1;
+ } else {
+ frame_params.show_existing_frame =
+ (cpi->ppi->show_existing_alt_ref &&
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE;
+ }
+ frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+
+ // Special handling to reset 'show_existing_frame' in case of dropped
+ // frames.
+ if (oxcf->rc_cfg.drop_frames_water_mark &&
+ (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) {
+ // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop
+ // over the gf group to check if the corresponding
+ // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped.
+ int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index];
+ for (int idx = 0; idx < cpi->gf_frame_index; idx++) {
+ if (cur_disp_idx == gf_group->display_idx[idx]) {
+ assert(IMPLIES(
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE,
+ gf_group->update_type[idx] == ARF_UPDATE));
+ assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE,
+ gf_group->update_type[idx] == INTNL_ARF_UPDATE));
+ // Reset show_existing_frame and set cpi->is_dropped_frame to true if
+ // the frame was dropped during its first encode.
+ if (gf_group->is_frame_dropped[idx]) {
+ frame_params.show_existing_frame = 0;
+ assert(!cpi->is_dropped_frame);
+ cpi->is_dropped_frame = true;
+ }
+ break;
+ }
+ }
+ }
+
+ // Reset show_existing_alt_ref decision to 0 after it is used.
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ cpi->ppi->show_existing_alt_ref = 0;
+ }
+ } else {
+ frame_params.show_existing_frame = 0;
+ }
+
+ struct lookahead_entry *source = NULL;
+ struct lookahead_entry *last_source = NULL;
+ if (frame_params.show_existing_frame) {
+ source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+ *pop_lookahead = 1;
+ frame_params.show_frame = 1;
+ } else {
+ source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
+ &frame_params.show_frame);
+ }
+
+ if (source == NULL) { // If no source was found, we can't encode a frame.
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1;
+ }
+
+ // Reset src_offset so that the actual encode call for this frame gets its
+ // source.
+ gf_group->src_offset[cpi->gf_frame_index] = 0;
+
+ // Source may be changed if temporal filtered later.
+ frame_input.source = &source->img;
+ if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) &&
+ last_source != NULL)
+ av1_svc_set_last_source(cpi, &frame_input, &last_source->img);
+ else
+ frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+ frame_input.ts_duration = source->ts_end - source->ts_start;
+ // Save unfiltered source. It is used in av1_get_second_pass_params().
+ cpi->unfiltered_source = frame_input.source;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ if (source->ts_start < cpi->time_stamps.first_ts_start) {
+ cpi->time_stamps.first_ts_start = source->ts_start;
+ cpi->time_stamps.prev_ts_end = source->ts_start;
+ }
+
+ av1_apply_encoding_flags(cpi, source->flags);
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->framerate = cpi->temp_framerate;
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+
+ // Shown frames and arf-overlay frames require a frame-rate adjustment.
+ if (frame_params.show_frame)
+ adjust_frame_rate(cpi, source->ts_start, source->ts_end);
+
+ if (!frame_params.show_existing_frame) {
+ if (cpi->film_grain_table) {
+ cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ } else {
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params->film_grain_params_present;
+ }
+ // Only one operating point is supported for now.
+ const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+
+ cm->frame_presentation_time = (uint32_t)pts64;
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+#if CONFIG_REALTIME_ONLY
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+#else
+ if (use_one_pass_rt_params) {
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+ }
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ if (frame_params.show_existing_frame &&
+ frame_params.frame_type != KEY_FRAME) {
+ // Force show-existing frames to be INTER, except forward keyframes
+ frame_params.frame_type = INTER_FRAME;
+ }
+
+ // Per-frame encode speed. In theory this can vary, but things may have
+ // been written assuming speed-level will not change within a sequence, so
+ // this parameter should be used with caution.
+ frame_params.speed = oxcf->speed;
+
+#if !CONFIG_REALTIME_ONLY
+ // Set forced key frames when necessary. For two-pass encoding / lap mode,
+ // this is already handled by av1_get_second_pass_params. However when no
+ // stats are available, we still need to check if the new frame is a keyframe.
+ // For one pass rt, this is already checked in av1_get_one_pass_rt_params.
+ if (!use_one_pass_rt_params &&
+ (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) {
+ // Current frame is coded as a key-frame for any of the following cases:
+ // 1) First frame of a video
+ // 2) For all-intra frame encoding
+ // 3) When a key-frame is forced
+ const int kf_requested =
+ (cm->current_frame.frame_number == 0 ||
+ oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY));
+ if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ frame_params.frame_type = KEY_FRAME;
+ } else if (is_stat_generation_stage(cpi)) {
+ // For stats generation, set the frame type to inter here.
+ frame_params.frame_type = INTER_FRAME;
+ }
+ }
+#endif
+
+ // Work out some encoding parameters specific to the pass:
+ if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_parameters(cpi);
+ } else if (is_stat_generation_stage(cpi)) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
+ } else if (is_stat_consumption_stage(cpi)) {
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+ cm->txcoeff_cost_timer = 0;
+ cm->txcoeff_cost_count = 0;
+#endif
+ }
+
+ if (!is_stat_generation_stage(cpi))
+ set_ext_overrides(cm, &frame_params, ext_flags);
+
+ // Shown keyframes and S frames refresh all reference buffers
+ const int force_refresh_all =
+ ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) ||
+ frame_params.frame_type == S_FRAME) &&
+ !frame_params.show_existing_frame;
+
+ av1_configure_buffer_updates(
+ cpi, &frame_params.refresh_frame, frame_update_type,
+ gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
+
+ if (!is_stat_generation_stage(cpi)) {
+ const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+ const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+ const int cur_frame_disp =
+ cpi->common.current_frame.frame_number + order_offset;
+
+ int get_ref_frames = 0;
+#if CONFIG_FPMT_TEST
+ get_ref_frames =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (get_ref_frames ||
+ gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!ext_flags->refresh_frame.update_pending) {
+ av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi,
+ cpi->gf_frame_index, 1, cm->remapped_ref_idx);
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i];
+ }
+ }
+
+ // Collect the reference frame buffers and note whether any is available.
+ bool has_ref_frames = false;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const RefCntBuffer *ref_frame =
+ get_ref_frame_buf(cm, ref_frame_priority_order[i]);
+ ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL;
+ if (ref_frame != NULL) has_ref_frames = true;
+ }
+ if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME ||
+ frame_params.frame_type == S_FRAME)) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Work out which reference frame slots may be used.
+ frame_params.ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf,
+ ext_flags->ref_frame_flags);
+
+ // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE.
+ if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) {
+ frame_params.primary_ref_frame = PRIMARY_REF_NONE;
+ } else {
+ frame_params.primary_ref_frame =
+ choose_primary_ref_frame(cpi, &frame_params);
+ }
+
+ frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+
+ // Call av1_get_refresh_frame_flags() if refresh index not available.
+ if (!cpi->refresh_idx_available) {
+ frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+ cpi, &frame_params, frame_update_type, cpi->gf_frame_index,
+ cur_frame_disp, ref_frame_map_pairs);
+ } else {
+ assert(cpi->ref_refresh_index != INVALID_IDX);
+ frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index);
+ }
+
+ // Frames marked is_frame_non_ref refresh no reference buffers.
+ if (gf_group->is_frame_non_ref[cpi->gf_frame_index])
+ frame_params.refresh_frame_flags = 0;
+
+ frame_params.existing_fb_idx_to_show = INVALID_IDX;
+ // Find the frame buffer to show based on display order.
+ if (frame_params.show_existing_frame) {
+ for (int frame = 0; frame < REF_FRAMES; frame++) {
+ const RefCntBuffer *const buf = cm->ref_frame_map[frame];
+ if (buf == NULL) continue;
+ const int frame_order = (int)buf->display_order_hint;
+ if (frame_order == cur_frame_disp)
+ frame_params.existing_fb_idx_to_show = frame;
+ }
+ }
+ }
+
+ // The way frame_params->remapped_ref_idx is setup is a placeholder.
+ // Currently, reference buffer assignment is done by update_ref_frame_map()
+ // which is called by high-level strategy AFTER encoding a frame. It
+ // modifies cm->remapped_ref_idx. If you want to use an alternative method
+ // to determine reference buffer assignment, just put your assignments into
+ // frame_params->remapped_ref_idx here and they will be used when encoding
+ // this frame. If frame_params->remapped_ref_idx is setup independently of
+ // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0;
+
+ if (!frame_params.show_existing_frame) {
+ cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
+ }
+
+ const int is_intra_frame = frame_params.frame_type == KEY_FRAME ||
+ frame_params.frame_type == INTRA_ONLY_FRAME;
+ FeatureFlags *const features = &cm->features;
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) &&
+ is_intra_frame) {
+ av1_set_screen_content_options(cpi, features);
+ }
+
+#if CONFIG_REALTIME_ONLY
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#else
+ if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+ gf_cfg->lag_in_frames == 0) {
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+ &frame_results) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_REALTIME_ONLY
+
+ // In the rtc temporal filter case, use the true (original) source for the
+ // PSNR calculation.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cpi->common.current_frame.frame_type != KEY_FRAME) {
+ assert(cpi->orig_source.buffer_alloc_sz > 0);
+ cpi->source = &cpi->orig_source;
+ }
+
+ if (!is_stat_generation_stage(cpi)) {
+ // First pass doesn't modify reference buffer assignment or produce frame
+ // flags
+ update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+ set_additional_frame_flags(cm, frame_flags);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if TXCOEFF_COST_TIMER
+ if (!is_stat_generation_stage(cpi)) {
+ cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+ fprintf(stderr,
+ "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+ "in us\n",
+ cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+ cm->cum_txcoeff_cost_timer);
+ }
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_TUNE_VMAF
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ av1_update_vmaf_curve(cpi);
+ }
+#endif
+
+ // Unpack frame_results:
+ *size = frame_results.size;
+
+ // Signal to a higher-level caller whether this frame is droppable.
+ if (*size > 0) {
+ cpi->droppable =
+ is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame);
+ }
+
+ // For SVC, or when frame-dropper is enabled:
+ // keep track of the (unscaled) source corresponding to the refresh of LAST
+ // reference (base temporal layer - TL0). Copy only for the
+ // top spatial enhancement layer so all spatial layers of the next
+ // superframe have last_source to be aligned with previous TL0 superframe.
+ // Avoid cases where resolution changes for unscaled source (top spatial
+ // layer). Only needs to be done for frames that are encoded (size > 0).
+ if (*size > 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width &&
+ cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) {
+ aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ }
+
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/encode_strategy.h b/third_party/aom/av1/encoder/encode_strategy.h
new file mode 100644
index 0000000000..c1d14d134c
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares frame encoding functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function will implement high-level encode strategy, choosing frame type,
+ * frame placement, etc. It populates an EncodeFrameParams struct with the
+ * results of these decisions and then encodes the frame. The caller should use
+ * the output parameters *time_stamp and *time_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out] size Bitstream size
+ * \param[in] dest Bitstream output
+ * \param[in,out] frame_flags Flags to decide how to encode the frame
+ * \param[out] time_stamp Time stamp of the frame
+ * \param[out] time_end Time end
+ * \param[in] timestamp_ratio Time base
+ * \param[out] pop_lookahead Decide to pop the source frame from queue
+ * \param[in] flush Decide to encode one frame or the rest of frames
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush);
+
+/*!\cond */
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all);
+
+int av1_get_refresh_frame_flags(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out] remapped_ref_idx An array for storing indices of reference
+ * frames. The index is used to retrieve a
+ * reference frame buffer from ref_frame_map
+ * in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index,
+ const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+ const RTC_REF *const rtc_ref,
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+ // Returns nonzero if the frame is droppable. Droppable frames are only used
+ // with external refresh flags; VoD settings won't trigger this use case.
+ if (rtc_ref->set_ref_frame_config)
+ return rtc_ref->non_reference_frame; // Taken from the RTC ref config.
+ else if (ext_refresh_frame_flags->update_pending)
+ return !(ext_refresh_frame_flags->alt_ref_frame ||
+ ext_refresh_frame_flags->alt2_ref_frame ||
+ ext_refresh_frame_flags->bwd_ref_frame ||
+ ext_refresh_frame_flags->golden_frame ||
+ ext_refresh_frame_flags->last_frame); // 1 only if no flag is set.
+ else
+ return 0; // Neither a ref config nor pending flags: not droppable.
+}
+
+static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) {
+ // The reference "type" of this frame is derived from the layer depth of the
+ // current GF group entry (gf_group.layer_depth[gf_frame_index]): depths 0 and
+ // 1 map to types 0 and 1, MAX_ARF_LAYERS and MAX_ARF_LAYERS + 1 map to 4, and
+ // every other depth maps to 7. This is just used to choose the
+ // primary_ref_frame (the most recent reference buffer of the same type).
+
+ switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) {
+ case 0: return 0;
+ case 1: return 1;
+ case MAX_ARF_LAYERS:
+ case MAX_ARF_LAYERS + 1: return 4;
+ default: return 7;
+ }
+}
+
+int av1_calc_refresh_idx_for_intnl_arf(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int gf_index);
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 0000000000..e2213a8355
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,2408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+/*!\cond */
+// Constant 8-bit array in which every entry is 128. It is used as the
+// reference when computing the source variance for the purposes of activity
+// masking: get_var_offs() returns it and av1_get_perpixel_variance() passes
+// it to the variance function.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// 16-bit counterparts of AV1_VAR_OFFS, one per supported bit depth. Every
+// entry of a table holds the same value: 128 for 8-bit, 128 * 4 for 10-bit and
+// 128 * 16 for 12-bit. get_var_offs() selects among them with (bd - 8) >> 1.
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16
+};
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\endcond */
+
+// Returns the constant offset array that matches the requested bit depth.
+// The array assists the calculation of source block variance, which is then
+// used to decide adaptive quantizers. With use_hbd set, one of the 16-bit
+// tables is returned through CONVERT_TO_BYTEPTR; otherwise the 8-bit table is
+// returned.
+static const uint8_t *get_var_offs(int use_hbd, int bd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    assert(bd == 8 || bd == 10 || bd == 12);
+    // Bit depths 8, 10 and 12 select table slots 0, 1 and 2 respectively.
+    static const uint16_t *hbd_tables[3] = { AV1_HIGH_VAR_OFFS_8,
+                                             AV1_HIGH_VAR_OFFS_10,
+                                             AV1_HIGH_VAR_OFFS_12 };
+    const int slot = (bd - 8) >> 1;
+    return CONVERT_TO_BYTEPTR(hbd_tables[slot]);
+  }
+#else
+  // Only the 8-bit path exists in this build configuration.
+  (void)use_hbd;
+  (void)bd;
+  assert(!use_hbd);
+#endif
+  assert(bd == 8);
+  return AV1_VAR_OFFS;
+}
+
+// Resets the per-macroblock counters used for realtime coding: the cyclic
+// refresh counters (via av1_init_cyclic_refresh_counters) and the zero-MV
+// count x->cnt_zeromv.
+void av1_init_rtc_counters(MACROBLOCK *const x) {
+ av1_init_cyclic_refresh_counters(x);
+ x->cnt_zeromv = 0;
+}
+
+// Folds the counters gathered in x into the encoder-level totals held by cpi.
+void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) {
+  // Cyclic refresh counters are only accumulated when that AQ mode is active.
+  const int cyclic_refresh_on = (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ);
+  if (cyclic_refresh_on) {
+    av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x);
+  }
+
+  // The zero-MV count is accumulated unconditionally.
+  cpi->rc.cnt_zeromv = cpi->rc.cnt_zeromv + x->cnt_zeromv;
+}
+
+// Returns the variance of the block in `ref`, measured against the constant
+// offset array for the current bit depth and scaled down by the number of
+// pixels in the block (with rounding).
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+                                       const MACROBLOCKD *xd,
+                                       const struct buf_2d *ref,
+                                       BLOCK_SIZE bsize, int plane,
+                                       int use_hbd) {
+  // Block size of this plane once its subsampling is taken into account.
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, xd->plane[plane].subsampling_x,
+                           xd->plane[plane].subsampling_y);
+
+  // The offset array is handed over with 0 as the following argument
+  // (presumably its stride, so every row would read the same data).
+  const uint8_t *const offs = get_var_offs(use_hbd, xd->bd);
+  unsigned int unused_sse;
+  const unsigned int block_var = cpi->ppi->fn_ptr[plane_bsize].vf(
+      ref->buf, ref->stride, offs, 0, &unused_sse);
+
+  return ROUND_POWER_OF_TWO(block_var, num_pels_log2_lookup[plane_bsize]);
+}
+
+// Convenience wrapper around av1_get_perpixel_variance() that derives the
+// use_hbd argument from the current buffer of xd.
+unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi,
+                                              const MACROBLOCKD *xd,
+                                              const struct buf_2d *ref,
+                                              BLOCK_SIZE bsize, int plane) {
+  return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane,
+                                   is_cur_buf_hbd(xd));
+}
+
+// Points the source plane descriptors of x at the frame `src` for the block
+// located at (mi_row, mi_col) and records `src` as the current buffer.
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col, const int num_planes,
+                          BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Set current frame pointer.
+  xd->cur_buf = src;
+
+  // The bound is AOMMIN(num_planes, MAX_MB_PLANE) rather than plain
+  // num_planes to quiet the static analysis warnings.
+  for (int plane = 0; plane < AOMMIN(num_planes, MAX_MB_PLANE); ++plane) {
+    // Crop sizes and strides are indexed 0 for the first plane and 1 for
+    // every later plane.
+    const int uv_idx = plane > 0;
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    setup_pred_plane(&x->plane[plane].src, bsize, src->buffers[plane],
+                     src->crop_widths[uv_idx], src->crop_heights[uv_idx],
+                     src->strides[uv_idx], mi_row, mi_col, NULL, ss_x, ss_y);
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+/*!\brief Assigns different quantization parameters to each super
+ * block based on its TPL weight.
+ *
+ * The qindex source depends on the configuration: the ducky-encode per
+ * superblock table, the perceptual / objective (TPL) / perceptual-AI /
+ * user-rating delta-q modes, or the HDR delta-q setting. If none applies, the
+ * frame base qindex is kept.
+ *
+ * \ingroup tpl_modelling
+ *
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in,out] td Thread data structure
+ * \param[in,out] x Macro block level data for this block.
+ * \param[in] tile_info Tile information / identification
+ * \param[in] mi_row Block row (in "MI_SIZE" units) index
+ * \param[in] mi_col Block column (in "MI_SIZE" units) index
+ * \param[in] num_planes Number of image planes (e.g. Y,U,V)
+ *
+ * \remark No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
+static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
+ MACROBLOCK *const x,
+ const TileInfo *const tile_info,
+ int mi_row, int mi_col, int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ // Delta-q modulation based on variance
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+ const int delta_q_res = delta_q_info->delta_q_res;
+ // Start from the frame base qindex; the first matching mode below overrides
+ // it.
+ int current_qindex = cm->quant_params.base_qindex;
+ if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode ==
+ DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ // Look up the qindex in the per-superblock table, indexed in raster order
+ // (sb_row * sb_cols + sb_col).
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int sb_index = sb_row * sb_cols + sb_col;
+ current_qindex =
+ cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index];
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+ // Both variants store the measured level in x->sb_energy_level and map it
+ // to a qindex with the same helper; they differ only in the measure used.
+ if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+ const int block_wavelet_energy_level =
+ av1_block_wavelet_energy_level(cpi, x, sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
+ current_qindex = av1_compute_q_from_energy_level_deltaq_mode(
+ cpi, block_wavelet_energy_level);
+ } else {
+ const int block_var_level = av1_log_block_var(cpi, x, sb_size);
+ x->sb_energy_level = block_var_level;
+ current_qindex =
+ av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
+ }
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cpi->oxcf.algo_cfg.enable_tpl_model) {
+ // Setup deltaq based on tpl stats
+ current_qindex =
+ av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+ current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+ current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ // Remember the qindex chosen before the delta_q_res adjustment below.
+ x->rdmult_cur_qindex = current_qindex;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // NOTE(review): presumably snaps the delta from xd->current_base_qindex to
+ // the delta_q_res granularity -- confirm against the helper.
+ const int adjusted_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+ if (cpi->use_ducky_encode) {
+ assert(adjusted_qindex == current_qindex);
+ }
+ current_qindex = adjusted_qindex;
+
+ // Store the delta relative to the frame base qindex.
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+ x->rdmult_delta_qindex = x->delta_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ // Mask that clears the bits below delta_lf_res; assumes delta_lf_res is a
+ // power of two -- TODO confirm.
+ const int lfmask = ~(delta_lf_res - 1);
+ // Derive the loop filter delta from a quarter of the q delta, offset by
+ // half a step before masking.
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // pre-set the delta lf for loop filter. Note that this value is set
+ // before mi is assigned for each block in current superblock
+ // The AOMMIN bounds keep the loops inside the frame for superblocks that
+ // extend past mi_rows / mi_cols.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+}
+
+// Fills td->mb.tpl_keep_ref_frame for the superblock at (mi_row, mi_col),
+// marking which reference frames are kept. The decision ranks the non-LAST
+// inter references by the TPL prediction-error reduction they accumulate over
+// the superblock. The array is left all zero when TPL stats are not ready,
+// the frame is not TPL eligible, or an AQ mode is in use.
+static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCK *x = &td->mb;
+ const int frame_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ av1_zero(x->tpl_keep_ref_frame);
+
+ if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+ // Overlay frames keep every reference: all entries are set to 1.
+ const int is_overlay =
+ cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+ if (is_overlay) {
+ memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
+ return;
+ }
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ // Per-reference sum of prediction-error reductions over the superblock.
+ int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
+ const int step = 1 << block_mis_log2;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ // Rows are walked in coded mi units, clipped to the frame. Columns are
+ // converted with coded_to_superres_mi() and clipped to the superres
+ // upscaled width.
+ const int mi_row_end =
+ AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size],
+ cm->superres_scale_denominator),
+ mi_cols_sr);
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row_end; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ const TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
+ // Find the winner ref frame idx for the current block
+ // Entries equal to 0 are never selected over index 0.
+ int64_t best_inter_cost = this_stats->pred_error[0];
+ int best_rf_idx = 0;
+ for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((this_stats->pred_error[idx] < best_inter_cost) &&
+ (this_stats->pred_error[idx] != 0)) {
+ best_inter_cost = this_stats->pred_error[idx];
+ best_rf_idx = idx;
+ }
+ }
+ // tpl_pred_error is the pred_error reduction of best_ref w.r.t.
+ // LAST_FRAME.
+ // Only the winner's slot is non-zero, and it is negative whenever the
+ // winner is not index 0.
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] -
+ this_stats->pred_error[LAST_FRAME - 1];
+
+ // Index 0 is skipped: it is the baseline the reductions are measured
+ // against.
+ for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx)
+ inter_cost[rf_idx] += tpl_pred_error[rf_idx];
+ }
+ }
+
+ // Insertion sort of reference indices 1..INTER_REFS_PER_FRAME-1 by
+ // ascending inter_cost, so the largest reduction (most negative sum) comes
+ // first.
+ int rank_index[INTER_REFS_PER_FRAME - 1];
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ rank_index[idx] = idx + 1;
+ for (int i = idx; i > 0; --i) {
+ if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) {
+ const int tmp = rank_index[i - 1];
+ rank_index[i - 1] = rank_index[i];
+ rank_index[i] = tmp;
+ }
+ }
+ }
+
+ // INTRA_FRAME and LAST_FRAME are always kept.
+ x->tpl_keep_ref_frame[INTRA_FRAME] = 1;
+ x->tpl_keep_ref_frame[LAST_FRAME] = 1;
+
+ // The three best ranked references (idx 0..2) are always kept. From the
+ // fourth onwards the cutoff test below may drop a reference and all the
+ // lower ranked ones after it.
+ int cutoff_ref = 0;
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+ if (idx > 2) {
+ if (!cutoff_ref) {
+ // If the predictive coding gains are smaller than the previous more
+ // relevant frame over certain amount, discard this frame and all the
+ // frames afterwards.
+ if (llabs(inter_cost[rank_index[idx]]) <
+ llabs(inter_cost[rank_index[idx - 1]]) / 8 ||
+ inter_cost[rank_index[idx]] == 0)
+ cutoff_ref = 1;
+ }
+
+ if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+ }
+ }
+}
+
+// Replaces x->rdmult with the value returned by av1_get_rdmult_delta() for
+// the superblock at (mi_row, mi_col). This happens only when the TPL model is
+// enabled, no AQ or delta-q mode is active, and the current frame is an
+// ARF_UPDATE at a GF group index greater than 0.
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+                                               int mi_row, int mi_col) {
+  const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+  const int base_rdmult = cpi->rd.RDMULT;
+
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int gf_idx = cpi->gf_frame_index;
+
+  // Bail out as soon as one precondition fails (checked in the same order
+  // as a single && chain would evaluate them).
+  if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+  if (cpi->oxcf.q_cfg.deltaq_mode != NO_DELTA_Q) return;
+  if (gf_idx <= 0) return;
+  if (cpi->ppi->gf_group.update_type[gf_idx] != ARF_UPDATE) return;
+
+  const int adjusted_rdmult =
+      av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, base_rdmult);
+  x->rdmult = adjusted_rdmult;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_RT_ML_PARTITIONING
+// Get a prediction (stored in x->est_pred) for the whole superblock.
+// On non-key frames the prediction is built from LAST_FRAME with a zero
+// motion vector and the BILINEAR interpolation filter. On intra-only frames
+// the buffer is filled with a constant instead. Only 64x64 superblocks are
+// supported.
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ // TODO(kyslov) Extend to 128x128
+ assert(cm->seq_params->sb_size == BLOCK_64X64);
+
+ av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+ if (!is_key_frame) {
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+ assert(yv12 != NULL);
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, LAST_FRAME), 1);
+ // Single-reference 64x64 block predicted from LAST_FRAME with zero motion.
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->bsize = BLOCK_64X64;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+ // Redirect the luma destination so the predictor is written to est_pred.
+ xd->plane[0].dst.buf = x->est_pred;
+ xd->plane[0].dst.stride = 64;
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ // NOTE(review): memset() converts its fill value to unsigned char, so
+ // 128 * 4 and 128 * 16 both store 0 in every byte rather than 512 / 2048.
+ // Confirm the intended fill for 10- and 12-bit input. There is also no
+ // default case, so any other bit depth leaves est_pred untouched.
+ switch (xd->bd) {
+ case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+ case 10:
+ memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ case 12:
+ memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ }
+#else
+ memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock by a pre-determined partition pattern, only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only used
+ * by realtime encoding.
+ *
+ * The initial pattern is either a fixed-size partition or a variance-based
+ * partition. When CONFIG_RT_ML_PARTITIONING is enabled and the ML-based
+ * partition search type is selected, the superblock is handled by
+ * av1_nonrd_pick_partition() instead and the function returns early.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ // First mode-info grid entry of this superblock.
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ PC_TREE *const pc_root = td->pc_root;
+
+#if CONFIG_RT_ML_PARTITIONING
+ if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+ RD_STATS dummy_rdc;
+ get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+ return;
+ }
+#endif
+ // Set the partition
+ // A fixed partition is used when configured, when the segment is skipped,
+ // or when use_fast_fixed_part is on and the superblock source SAD is below
+ // kMedSad.
+ if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kMedSad)) {
+ // set a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
+ // With use_fast_fixed_part, a source SAD below kLowSad selects 64x64.
+ if (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kLowSad) {
+ bsize_select = BLOCK_64X64;
+ }
+ // seg_skip overrides the selection with the full superblock size.
+ const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // set a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+ }
+ assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+ set_cb_offsets(td->mb.cb_offset, 0, 0);
+
+ // Initialize the flag to skip cdef to 1.
+ if (sf->rt_sf.skip_cdef_sb) {
+ const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+ // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
+ // "blocks".
+ for (int r = 0; r < block64_in_sb; ++r) {
+ for (int c = 0; c < block64_in_sb; ++c) {
+ // Grid offset of the top-left mi unit of each 64x64 sub-block.
+ const int idx_in_sb =
+ r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+ // Entries without an assigned mode-info pointer are skipped.
+ if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1;
+ }
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_use_partition_time);
+#endif
+ av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_use_partition_time);
+#endif
+}
+
+// This function initializes the stats for encode_rd_sb.
+// It seeds the simple motion search MVs when any speed feature needs them,
+// sets up the per-superblock TPL state (reference frame pruning and, when
+// gather_tpl_data is set and delta-q is present, the delta-q and rdmult
+// setup), and resets the per-superblock search state in x and rd_cost.
+static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ RD_STATS *rd_cost, int mi_row, int mi_col,
+ int gather_tpl_data) {
+ const AV1_COMMON *cm = &cpi->common;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *x = &td->mb;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ // Simple motion search is needed if any of these features is enabled and
+ // the frame is not intra-only.
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root,
+ mi_row, mi_col);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Skipped for realtime mode with no stats stage and zero lag_in_frames.
+ if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
+ init_ref_frame_space(cpi, td, mi_row, mi_col);
+ x->sb_energy_level = 0;
+ x->part_search_info.cnn_output_valid = 0;
+ if (gather_tpl_data) {
+ if (cm->delta_q_info.delta_q_present_flag) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+ av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ // TODO(jingning): revisit this function.
+ // The "&& (0)" keeps this call permanently disabled.
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) {
+ adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+ }
+ }
+ }
+#else
+ (void)tile_info;
+ (void)mi_row;
+ (void)mi_col;
+ (void)gather_tpl_data;
+#endif
+
+ // Reset the per-superblock search state.
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Prepares the superblock at (mi_row, mi_col) for one trial of the QP sweep:
+// applies delta_qp_ofs on top of x->rdmult_cur_qindex, re-initializes the
+// plane quantizers (and the delta loop filter values when enabled), and
+// resets the per-superblock search state and rd_cost. The delta-q and
+// delta-lf handling repeats the corresponding steps of setup_delta_q().
+static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_STATS *rd_cost, int mi_row,
+ int mi_col, int delta_qp_ofs) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+ const int delta_q_res = delta_q_info->delta_q_res;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ // Same simple motion search condition as in init_encode_rd_sb().
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree,
+ mi_row, mi_col);
+ }
+
+ // Trial qindex: the qindex recorded by setup_delta_q() plus the sweep
+ // offset.
+ int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ current_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ // Mask that clears the bits below delta_lf_res; assumes delta_lf_res is a
+ // power of two -- TODO confirm.
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // pre-set the delta lf for loop filter. Note that this value is set
+ // before mi is assigned for each block in current superblock
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+
+ // Reset the per-superblock search state, as init_encode_rd_sb() does.
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+// Tries a range of qindex offsets for the superblock at (mi_row, mi_col) and
+// returns the delta qindex with the lowest RD cost. Each trial restores the
+// superblock state from sb_org_stats and runs a dry-pass partition search.
+// The offsets span -20..20 on key frames and -12..12 otherwise, in steps of
+// delta_q_res. If no trial beats the initial invalid RD stats, the starting
+// td->mb.rdmult_delta_qindex is returned.
+static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ SB_FIRST_PASS_STATS *sb_org_stats) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ RD_STATS rdc_winner, cur_rdc;
+ av1_invalid_rd_stats(&rdc_winner);
+
+ // Despite its name this holds a delta qindex: it starts from
+ // rdmult_delta_qindex and is later set to rdmult_delta_qindex + offset.
+ int best_qindex = td->mb.rdmult_delta_qindex;
+ const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12;
+ const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12;
+ const int step = cm->delta_q_info.delta_q_res;
+
+ for (int sweep_qp_delta = start; sweep_qp_delta <= end;
+ sweep_qp_delta += step) {
+ // Also resets cur_rdc to invalid stats.
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row,
+ mi_col, sweep_qp_delta);
+
+ // Preserve the trial qindex stored in the first mode-info entry across
+ // the mbmi reset and superblock state restore below.
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col);
+ av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
+
+ // NOTE(review): a new PC_TREE root is allocated on every iteration and
+ // is not released in this function; presumably av1_rd_pick_partition()
+ // frees it -- confirm.
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL,
+ SB_DRY_PASS, NULL);
+
+ // Keep the trial with the lowest RD cost; on an exact tie prefer the
+ // offset with the smaller magnitude.
+ if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
+ (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) &&
+ rdc_winner.rdcost == cur_rdc.rdcost)) {
+ rdc_winner = cur_rdc;
+ best_qindex = x->rdmult_delta_qindex + sweep_qp_delta;
+ }
+ }
+
+ return best_qindex;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+/*!\brief Encode a superblock (RD-search-based)
+ *
+ * \ingroup partition_search
+ * Conducts partition search for a superblock, based on rate-distortion costs,
+ * from scratch or adjusting from a pre-calculated partition pattern.
+ */
+static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int num_planes = av1_num_planes(cm);
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
+
+#if CONFIG_REALTIME_ONLY
+ (void)seg_skip; // only read in the !CONFIG_REALTIME_ONLY branches below
+#endif // CONFIG_REALTIME_ONLY
+
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
+ 1);
+
+ /* Encode the superblock. The strategy is selected by
+  * sf->part_sf.partition_search_type:
+  *  - VAR_BASED_PARTITION: variance-based partitioning followed by
+  *    av1_rd_use_partition().
+  *  - FIXED_PARTITION, or seg_skip set (compiled only when
+  *    !CONFIG_REALTIME_ONLY): fixed-size partitioning followed by
+  *    av1_rd_use_partition().
+  *  - anything else (compiled only when !CONFIG_REALTIME_ONLY): the recursive
+  *    search of av1_rd_pick_partition(), or av1_rd_partition_search() when
+  *    CONFIG_PARTITION_SEARCH_ORDER is set and the external partition
+  *    controller is ready on a non-intra-only frame.
+  */
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // partition search starting from a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_use_partition_time);
+#endif
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_use_partition_time);
+#endif
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+ // partition search by adjusting a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ const BLOCK_SIZE bsize =
+ seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+ } else {
+ /* The most exhaustive recursive partition search.
+  * NOTE(review): unlike the two branches above, td->pc_root is neither
+  * freed nor reset to NULL in this function after av1_rd_pick_partition();
+  * presumably the callee releases the tree -- confirm.
+  */
+ SuperBlockEnc *sb_enc = &x->sb_enc;
+ // No stats for overlay frames. Exclude key frame.
+ av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc);
+
+ // Reset the tree for simple motion search data
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
+
+ // Estimate the maximum square partition block size, which will be used
+ // as the starting block size for partitioning the sb
+ set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col);
+
+ // The superblock can be searched only once, or twice consecutively for
+ // better quality. Note that the meaning of passes here is different from
+ // the general concept of 1-pass/2-pass encoders.
+ const int num_passes =
+ cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
+
+ if (cpi->oxcf.sb_qp_sweep && // optional per-superblock QP sweep
+ !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
+ cm->delta_q_info.delta_q_present_flag) {
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_stats_cache,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache)));
+ av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+ assert(x->rdmult_delta_qindex == x->delta_qindex);
+
+ const int best_qp_diff =
+ sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
+ td->mb.sb_stats_cache) -
+ x->rdmult_delta_qindex;
+
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
+ mi_row, mi_col, best_qp_diff);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ backup_current_qindex; // value saved before av1_reset_mbmi() above
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ }
+ if (num_passes == 1) {
+#if CONFIG_PARTITION_SEARCH_ORDER
+ if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
+ av1_reset_part_sf(&cpi->sf.part_sf);
+ av1_reset_sf_for_ext_part(cpi);
+ RD_STATS this_rdc;
+ av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
+ mi_col, sb_size, &this_rdc);
+ } else {
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root,
+ NULL, SB_SINGLE_PASS, NULL);
+ }
+#else
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_SINGLE_PASS, NULL);
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+ } else {
+ // First pass (SB_DRY_PASS), run after backing up the SB state
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_fp_stats,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats)));
+ av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_DRY_PASS, NULL);
+
+ // Second pass (SB_WET_PASS): re-init, restore the backup, search again
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
+ mi_col, 0);
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+ av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_WET_PASS, NULL);
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+ }
+
+ // Reset to 0 so that it wouldn't be used elsewhere mistakenly.
+ sb_enc->tpl_data_count = 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Update the inter rd model (only when the frame has a single tile)
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ cm->tiles.cols == 1 && cm->tiles.rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+}
+
+// Returns nonzero only when the cost update levels of coeff, mode and dv are
+// all set to tile or off (i.e. none exceeds INTERNAL_COST_UPD_TILE).
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+    const AV1_COMP *const cpi) {
+  const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+
+  // Bail out on the first level that is updated more often than per tile.
+  if (inter_sf->coeff_cost_upd_level > INTERNAL_COST_UPD_TILE) return 0;
+  if (inter_sf->mode_cost_upd_level > INTERNAL_COST_UPD_TILE) return 0;
+  return cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE;
+}
+
+// With row-mt enabled and the cost update frequencies set to off/tile, the
+// current SB may start before the top-right SB has finished. This helper
+// returns 1 when waiting for the top SB alone is sufficient, 0 when the
+// top-right SB must be waited for.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+  const MODE mode = cpi->oxcf.mode;
+
+  // Only the ALLINTRA and REALTIME modes can relax the dependency.
+  if (mode != ALLINTRA && mode != REALTIME) return 0;
+
+  const int upd_tile_or_off = is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+  if (mode == ALLINTRA) return upd_tile_or_off;
+
+  // REALTIME additionally requires the mv cost update level to be tile/off.
+  return (upd_tile_or_off &&
+          cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
+
+/*!\brief Derive the superblock-level source SAD from the 64x64 block SADs
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Returns UINT64_MAX when no 64x64 SAD buffer is available, when a 128x128
+ * superblock does not have all four 64x64 blocks inside the 64x64 grid, or
+ * when the superblock size is neither 128x128 nor 64x64.
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+                                             int mi_col) {
+  if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_sb_128 = (cm->seq_params->sb_size == BLOCK_128X128);
+  // Width/height of one 64x64 block in mi units.
+  const int mis_per_blk = is_sb_128 ? (cm->seq_params->mib_size >> 1)
+                                    : cm->seq_params->mib_size;
+  const int blk_cols = (cm->mi_params.mi_cols + mis_per_blk - 1) / mis_per_blk;
+  const int blk_rows = (cm->mi_params.mi_rows + mis_per_blk - 1) / mis_per_blk;
+  const int blk_col = mi_col / mis_per_blk;
+  const int blk_row = mi_row / mis_per_blk;
+  const uint64_t *const blk_sad =
+      &cpi->src_sad_blk_64x64[blk_col + blk_row * blk_cols];
+
+  if (is_sb_128) {
+    // All four 64x64 blocks must exist to accumulate the SB source SAD.
+    if (blk_col + 1 < blk_cols && blk_row + 1 < blk_rows) {
+      return blk_sad[0] + blk_sad[1] + blk_sad[blk_cols] +
+             blk_sad[blk_cols + 1];
+    }
+    return UINT64_MAX;
+  }
+
+  if (cm->seq_params->sb_size == BLOCK_64X64) return blk_sad[0];
+  return UINT64_MAX;
+}
+
+/*!\brief Decide from the SAD statistics whether content grading can be skipped
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Returns true when the source content of the superblock still has to be
+ * computed. When false is returned, x->content_state_sb.source_sad_nonrd has
+ * been set here (kZeroSad or kMedSad).
+ */
+static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
+                                                  MACROBLOCK *const x,
+                                                  int mi_row, int mi_col) {
+  if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+    return true;
+
+  const uint64_t sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+  if (sb_sad == UINT64_MAX) return true;
+  if (sb_sad == 0) {
+    x->content_state_sb.source_sad_nonrd = kZeroSad;
+    return false;
+  }
+
+  if (cpi->oxcf.speed < 9) return true;
+
+  AV1_COMMON *const cm = &cpi->common;
+  // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+  if (AOMMIN(cm->width, cm->height) >= 360) return true;
+
+  // Average 64x64 block source SAD derived from the SB source SAD.
+  uint64_t avg_blk_sad = sb_sad;
+  if (cm->seq_params->sb_size == BLOCK_128X128)
+    avg_blk_sad = (sb_sad + 2) >> 2;
+
+  // The thresholds are determined based on kLowSad and kHighSad thresholds
+  // and test results.
+  const uint64_t thresh_low = 15000;
+  const uint64_t thresh_high = 40000;
+  if (avg_blk_sad <= thresh_low || avg_blk_sad >= thresh_high) return true;
+
+  // Note: set x->content_state_sb.source_sad_rd as well if this is extended
+  // to RTC rd path.
+  x->content_state_sb.source_sad_nonrd = kMedSad;
+  return false;
+}
+
+/*!\brief Grade the source content of a superblock when sf and frame stats
+ * require it
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+// TODO(any): consolidate sfs to make interface cleaner
+static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
+                                               MACROBLOCK *const x,
+                                               TileDataEnc *tile_data,
+                                               int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_key =
+      cm->current_frame.frame_type == KEY_FRAME ||
+      (cpi->ppi->use_svc &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame);
+  if (is_key) {
+    // Key frames keep the kMedSad values the caller is expected to have set.
+    assert(x->content_state_sb.source_sad_nonrd == kMedSad);
+    assert(x->content_state_sb.source_sad_rd == kMedSad);
+    return;
+  }
+
+  const bool frame_sad_nonzero = cpi->rc.frame_source_sad > 0;
+  bool need_grading = false;
+
+  if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+    if (cpi->sf.rt_sf.check_scene_detection && !frame_sad_nonzero) {
+      x->content_state_sb.source_sad_nonrd = kZeroSad;
+    } else {
+      need_grading = is_calc_src_content_needed(cpi, x, mi_row, mi_col);
+    }
+  } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
+             (cm->width * cm->height <= 352 * 288)) {
+    if (frame_sad_nonzero) {
+      need_grading = true;
+    } else {
+      x->content_state_sb.source_sad_rd = kZeroSad;
+    }
+  }
+
+  if (need_grading) av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+}
+
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ *
+ * For each superblock the function: waits on the row-mt sync object, restores
+ * or blends the CDF context from x->row_ctx when CDF update is enabled with
+ * row-mt, updates the cost tables, resets the per-superblock state in the
+ * macroblock context, derives the segment skip flag, grades the source
+ * content, and then calls encode_nonrd_sb() or encode_rd_sb() depending on
+ * cpi->sf.rt_sf.use_nonrd_pick_mode. Finally it stores the context for the
+ * next superblock row and signals completion through sync_write_ptr.
+ *
+ * \param cpi        Top-level encoder structure
+ * \param td         Thread data; td->mb is the macroblock context used here
+ * \param tile_data  Encoder data of the tile the superblock row belongs to
+ * \param mi_row     mi row of the superblock row (tile_info->mi_row_start is
+ *                   subtracted to obtain the row index inside the tile)
+ * \param tp         Token pointer forwarded to the superblock encoders
+ */
+static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TokenExtra **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ bool row_mt_enabled = mt_info->row_mt_enabled;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size = cm->seq_params->mib_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_row_time);
+#endif
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+ /* Reset delta for quantizer and loop filters at the beginning of every tile
+  * (or of every SB row when row-mt is enabled)
+  */
+ if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
+ if (cm->delta_q_info.delta_q_present_flag)
+ xd->current_base_qindex = cm->quant_params.base_qindex;
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ reset_thresh_freq_fact(x);
+
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
+ // In realtime/allintra mode and when frequency of cost updates is off/tile,
+ // wait for the top superblock to finish encoding. Otherwise, wait for the
+ // top-right superblock to finish encoding.
+ enc_row_mt->sync_read_ptr(
+ row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+
+#if CONFIG_MULTITHREAD
+ if (row_mt_enabled) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (row_mt_exit) return;
+ }
+#endif
+
+ const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
+ if (update_cdf && (tile_info->mi_row_start != mi_row)) {
+ if ((tile_info->mi_col_start == mi_col)) {
+ // restore the context saved in x->row_ctx at the 1st column sb
+ memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
+ } else {
+ // update context: av1_avg_cdf_symbols() on tile_ctx and a row_ctx entry
+ int wt_left = AVG_CDF_WEIGHT_LEFT;
+ int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
+ if (tile_info->mi_col_end > (mi_col + mib_size))
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile,
+ wt_left, wt_tr);
+ else
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+ wt_left, wt_tr);
+ }
+ }
+
+ // Update the rate cost tables for some symbols
+ av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
+
+ // Reset color coding related parameters
+ av1_zero(x->color_sensitivity_sb);
+ av1_zero(x->color_sensitivity_sb_g);
+ av1_zero(x->color_sensitivity_sb_alt);
+ av1_zero(x->color_sensitivity);
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ x->content_state_sb.source_sad_rd = kMedSad;
+ x->content_state_sb.lighting_change = 0;
+ x->content_state_sb.low_sumdiff = 0;
+ x->force_zeromv_skip_for_sb = 0;
+ x->sb_me_block = 0;
+ x->sb_me_partition = 0;
+ x->sb_me_mv.as_int = 0;
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ x->intra_sb_rdmult_modifier = 128;
+ }
+
+ xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
+ x->source_variance = UINT_MAX;
+ td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+
+ // Get segment id and skip flag (segment id falls back to 0 without a map)
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ const uint8_t segment_id =
+ map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col)
+ : 0;
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col);
+
+ init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks,
+ sb_size);
+
+ // Grade the temporal variation of the sb, the grade will be used to decide
+ // fast mode search strategy for coding blocks
+ grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+
+ // encode the superblock
+ if (use_nonrd_mode) {
+ encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ } else {
+ encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ }
+
+ // Update the top-right context in row_mt coding (skipped on the last row)
+ if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) {
+ if (sb_cols_in_tile == 1)
+ memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+ else if (sb_col_in_tile >= 1)
+ memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
+ sizeof(*xd->tile_ctx));
+ }
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile,
+ sb_cols_in_tile);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_row_time);
+#endif
+}
+
+// Point the frame-level macroblock context (cpi->td.mb) at the source planes
+// and configure its block planes from the sequence subsampling parameters.
+static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const mb = &cpi->td.mb;
+  const int plane_count = av1_num_planes(cm);
+
+  // Copy data over into macro block data structures.
+  av1_setup_src_planes(mb, cpi->source, 0, 0, plane_count,
+                       cm->seq_params->sb_size);
+
+  av1_setup_block_planes(&mb->e_mbd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, plane_count);
+}
+
+// (Re)allocate cpi->tile_data for the current tile layout. Any previous
+// row-mt memory and tile data are released first; the row-mt sync state and
+// row_ctx pointer of every new tile are cleared.
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int num_tiles = tile_cols * tile_rows;
+
+  // Drop the old allocation and mark everything as unallocated before the
+  // new allocation is attempted.
+  av1_row_mt_mem_dealloc(cpi);
+  aom_free(cpi->tile_data);
+  cpi->allocated_tiles = 0;
+  enc_row_mt->allocated_tile_cols = 0;
+  enc_row_mt->allocated_tile_rows = 0;
+
+  CHECK_MEM_ERROR(cm, cpi->tile_data,
+                  aom_memalign(32, num_tiles * sizeof(*cpi->tile_data)));
+
+  cpi->allocated_tiles = num_tiles;
+  enc_row_mt->allocated_tile_cols = tile_cols;
+  enc_row_mt->allocated_tile_rows = tile_rows;
+
+  for (int tile_idx = 0; tile_idx < num_tiles; ++tile_idx) {
+    TileDataEnc *const tile = &cpi->tile_data[tile_idx];
+    av1_zero(tile->row_mt_sync);
+    tile->row_ctx = NULL;
+  }
+}
+
+// Initialize the per-tile encoder data for the current frame: tile geometry,
+// the token and token-list start pointers of each tile (when token info is
+// allocated), the CDF update permission and the starting tile context.
+void av1_init_tile_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  TokenInfo *const token_info = &cpi->token_info;
+  // Running start positions; each tile starts where the previous one ends.
+  TokenExtra *tok_cursor = token_info->tile_tok[0][0];
+  TokenList *tplist_cursor = token_info->tplist[0][0];
+  unsigned int prev_tile_tokens = 0;
+  int prev_tile_sb_rows = 0;
+
+  if (!is_stat_generation_stage(cpi) &&
+      cm->features.allow_screen_content_tools) {
+    // Number of tokens for which token info needs to be allocated.
+    const unsigned int tokens_required =
+        get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+                        MAX_SB_SIZE_LOG2, num_planes);
+    // Allocate/reallocate the token memory when more tokens are required than
+    // are currently allocated, which happens when:
+    // 1) the memory is not yet allocated, or
+    // 2) the frame dimensions have changed.
+    if (tokens_required > token_info->tokens_allocated) {
+      free_token_info(token_info);
+      alloc_token_info(cm, token_info, tokens_required);
+      tok_cursor = token_info->tile_tok[0][0];
+      tplist_cursor = token_info->tplist[0][0];
+    }
+  }
+
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileInfo *const tile_info = &tile_data->tile_info;
+
+      av1_tile_init(tile_info, cm, tile_row, tile_col);
+      tile_data->firstpass_top_mv = kZeroMv;
+      tile_data->abs_sum_level = 0;
+
+      if (is_token_info_allocated(token_info)) {
+        // Advance past the previous tile's share, then record this tile's
+        // start and remember how much it will consume.
+        tok_cursor += prev_tile_tokens;
+        token_info->tile_tok[tile_row][tile_col] = tok_cursor;
+        prev_tile_tokens = allocated_tokens(
+            tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+            num_planes);
+
+        tplist_cursor += prev_tile_sb_rows;
+        token_info->tplist[tile_row][tile_col] = tplist_cursor;
+        prev_tile_sb_rows = av1_get_sb_rows_in_tile(cm, tile_info);
+      }
+
+      tile_data->allow_update_cdf = !cm->tiles.large_scale &&
+                                    !cm->features.disable_cdf_update &&
+                                    !delay_wait_for_top_right_sb(cpi);
+      tile_data->tctx = *cm->fc;
+    }
+  }
+}
+
+// Before an SB row is encoded, fetch its start token into *tp and record it
+// in the tile's token list. Does nothing when token info is not allocated.
+static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info,
+                                       int tile_row, int tile_col, int mi_row,
+                                       TokenExtra **tp) {
+  const TokenInfo *token_info = &cpi->token_info;
+  if (!is_token_info_allocated(token_info)) return;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int sb_size_log2 = cm->seq_params->mib_size_log2 + MI_SIZE_LOG2;
+  const int sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+  TokenList *const tplist = token_info->tplist[tile_row][tile_col];
+
+  get_start_tok(cpi, tile_row, tile_col, mi_row, tp, sb_size_log2,
+                av1_num_planes(cm));
+  assert(tplist != NULL);
+  tplist[sb_row_in_tile].start = *tp;
+}
+
+// After an SB row is encoded, store how many tokens it produced (distance
+// from the row's recorded start to tok). Does nothing when token info is not
+// allocated.
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi,
+                                            const TileInfo *tile_info,
+                                            int tile_row, int tile_col,
+                                            int mi_row, TokenExtra *tok) {
+  const TokenInfo *token_info = &cpi->token_info;
+  if (!is_token_info_allocated(token_info)) return;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
+  const int sb_row_in_tile = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+  TokenList *const row_entry =
+      &token_info->tplist[tile_row][tile_col][sb_row_in_tile];
+
+  // Sizes used only to bound the count in the assertion below.
+  const int tile_mb_cols =
+      (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+  const int num_mb_rows_in_sb = ((1 << (mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+
+  row_entry->count = (unsigned int)(tok - row_entry->start);
+
+  assert((unsigned int)(tok - row_entry->start) <=
+         get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+                         mib_size_log2 + MI_SIZE_LOG2, num_planes));
+
+  // Silence unused-variable warnings when assertions are compiled out.
+  (void)num_planes;
+  (void)tile_mb_cols;
+  (void)num_mb_rows_in_sb;
+}
+
+/*!\brief Encode a superblock row
+ *
+ * \ingroup partition_search
+ *
+ * Looks up the tile data for (tile_row, tile_col), fetches the start token of
+ * the row, encodes the row and then records the row's token count.
+ */
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                       int tile_col, int mi_row) {
+  AV1_COMMON *const cm = &cpi->common;
+  TileDataEnc *tile_data = &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  TokenExtra *tok_ptr = NULL;
+
+  get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok_ptr);
+  encode_sb_row(cpi, td, tile_data, mi_row, &tok_ptr);
+  populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok_ptr);
+}
+
+/*!\brief Encode a tile
+ *
+ * \ingroup partition_search
+ *
+ * Prepares the per-tile state held in td (above context, CfL, CRC
+ * calculator), encodes every superblock row of the tile and finally copies
+ * td->abs_sum_level into the tile data.
+ */
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                     int tile_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileDataEnc *const tile_data =
+      &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+  const TileInfo *const tile_info = &tile_data->tile_info;
+
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    av1_inter_mode_data_init(tile_data);
+  }
+
+  av1_zero_above_context(cm, xd, tile_info->mi_col_start,
+                         tile_info->mi_col_end, tile_row);
+  av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+                         xd);
+
+  if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) {
+    cfl_init(&xd->cfl, cm->seq_params);
+  }
+
+  if (x->txfm_search_info.mb_rd_record != NULL) {
+    av1_crc32c_calculator_init(
+        &x->txfm_search_info.mb_rd_record->crc_calculator);
+  }
+
+  // Walk the tile one superblock row at a time.
+  int mi_row = tile_info->mi_row_start;
+  while (mi_row < tile_info->mi_row_end) {
+    av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+    mi_row += cm->seq_params->mib_size;
+  }
+  tile_data->abs_sum_level = td->abs_sum_level;
+}
+
+/*!\brief Break one frame into tiles and encode the tiles
+ *
+ * \ingroup partition_search
+ *
+ * Tiles are encoded one after another with the frame-level thread data
+ * (cpi->td); per-tile counters are reset before each tile and folded back
+ * into cpi afterwards.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  ThreadData *const td = &cpi->td;
+  MACROBLOCK *const mb = &td->mb;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int num_tiles = tile_cols * tile_rows;
+
+  assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < num_tiles));
+  if (cpi->allocated_tiles < num_tiles) av1_alloc_tile_data(cpi);
+
+  av1_init_tile_data(cpi);
+  av1_alloc_mb_data(cpi, mb);
+
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      TileDataEnc *const tile_data =
+          &cpi->tile_data[tile_row * tile_cols + tile_col];
+
+      // Reset the per-tile accumulators kept in the thread data.
+      td->intrabc_used = 0;
+      td->deltaq_used = 0;
+      td->abs_sum_level = 0;
+      td->rd_counts.seg_tmp_pred_cost[0] = 0;
+      td->rd_counts.seg_tmp_pred_cost[1] = 0;
+      mb->e_mbd.tile_ctx = &tile_data->tctx;
+      mb->tile_pb_ctx = &tile_data->tctx;
+      av1_init_rtc_counters(mb);
+      mb->palette_pixels = 0;
+
+      av1_encode_tile(cpi, td, tile_row, tile_col);
+
+      // Fold the tile results back into the frame-level state.
+      if (!frame_is_intra_only(cm)) av1_accumulate_rtc_counters(cpi, mb);
+      cpi->palette_pixel_num += mb->palette_pixels;
+      cpi->intrabc_used |= td->intrabc_used;
+      cpi->deltaq_used |= td->deltaq_used;
+    }
+  }
+
+  av1_dealloc_mb_data(mb, av1_num_planes(cm));
+}
+
+// For every reference frame enabled in ref_frame_flags, store its distance
+// relative to the current frame, and track the nearest reference with a
+// negative distance (past) and with a positive distance (future). Disabled
+// references get a distance of 0.
+static AOM_INLINE void set_rel_frame_dist(
+    const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info,
+    const int ref_frame_flags) {
+  int nearest_past = INT32_MAX;
+  int nearest_future = INT32_MAX;
+
+  ref_frame_dist_info->nearest_past_ref = NONE_FRAME;
+  ref_frame_dist_info->nearest_future_ref = NONE_FRAME;
+
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    const int slot = ref_frame - LAST_FRAME;
+    ref_frame_dist_info->ref_relative_dist[slot] = 0;
+    if (!(ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) continue;
+
+    const int dist = av1_encoder_get_relative_dist(
+        cm->cur_frame->ref_display_order_hint[slot],
+        cm->current_frame.display_order_hint);
+    ref_frame_dist_info->ref_relative_dist[slot] = dist;
+
+    if (dist < 0) {
+      // Candidate for the nearest ref_frame in the past
+      if (abs(dist) < nearest_past) {
+        ref_frame_dist_info->nearest_past_ref = ref_frame;
+        nearest_past = abs(dist);
+      }
+    } else if (dist > 0) {
+      // Candidate for the nearest ref_frame in the future
+      if (dist < nearest_future) {
+        ref_frame_dist_info->nearest_future_ref = ref_frame;
+        nearest_future = dist;
+      }
+    }
+  }
+}
+
+// Returns 1 when no available reference buffer has a positive distance
+// relative to the current frame (i.e. there is no bwd reference), else 0.
+static INLINE int refs_are_one_sided(const AV1_COMMON *cm) {
+  assert(!frame_is_intra_only(cm));
+
+  const int cur_display_order_hint = cm->current_frame.display_order_hint;
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+    if (buf == NULL) continue;
+
+    const int dist = av1_encoder_get_relative_dist(buf->display_order_hint,
+                                                   cur_display_order_hint);
+    if (dist > 0) return 0;  // bwd reference
+  }
+  return 1;
+}
+
+// Write the order hints of the two skip-mode reference buffers into
+// ref_order_hint[0..1]; both entries are 0 when skip mode is not allowed.
+static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+                                             int ref_order_hint[2]) {
+  const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+
+  ref_order_hint[0] = 0;
+  ref_order_hint[1] = 0;
+  if (!skip_mode_info->skip_mode_allowed) return;
+
+  const RefCntBuffer *const first_buf =
+      get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0);
+  const RefCntBuffer *const second_buf =
+      get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1);
+  assert(first_buf != NULL && second_buf != NULL);
+
+  ref_order_hint[0] = first_buf->order_hint;
+  ref_order_hint[1] = second_buf->order_hint;
+}
+
+// Decide whether skip mode is enabled for the current frame. Runs
+// av1_setup_skip_mode_allowed() first, then applies the encoder-side
+// restrictions below. Returns 1 when enabled, 0 otherwise.
+static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+
+  av1_setup_skip_mode_allowed(cm);
+  if (!skip_mode_info->skip_mode_allowed) return 0;
+
+  // Turn off skip mode if the temporal distances of the reference pair to the
+  // current frame are different by more than 1 frame.
+  const int cur_offset = (int)cm->current_frame.order_hint;
+  int ref_offset[2];
+  get_skip_mode_ref_offsets(cm, ref_offset);
+  const int dist_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info,
+                                             cur_offset, ref_offset[0]);
+  const int dist_to_ref1 = abs(get_relative_dist(
+      &cm->seq_params->order_hint_info, cur_offset, ref_offset[1]));
+  if (abs(dist_to_ref0 - dist_to_ref1) > 1) return 0;
+
+  // High Latency: Turn off skip mode if all refs are fwd.
+  if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0;
+
+  // Both skip-mode references must be enabled in ref_frame_flags.
+  const int ref0 = skip_mode_info->ref_frame_idx_0 + LAST_FRAME;
+  const int ref1 = skip_mode_info->ref_frame_idx_1 + LAST_FRAME;
+  const int ref0_enabled =
+      (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref0]) != 0;
+  const int ref1_enabled =
+      (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref1]) != 0;
+
+  return ref0_enabled && ref1_enabled;
+}
+
+// Pick the default interpolation-filter skip flags from the plane count:
+// INTERP_SKIP_LUMA_EVAL_CHROMA for a single plane, otherwise
+// INTERP_SKIP_LUMA_SKIP_CHROMA.
+static AOM_INLINE void set_default_interp_skip_flags(
+    const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
+  if (av1_num_planes(cm) == 1) {
+    interp_search_flags->default_interp_skip_flags =
+        INTERP_SKIP_LUMA_EVAL_CHROMA;
+  } else {
+    interp_search_flags->default_interp_skip_flags =
+        INTERP_SKIP_LUMA_SKIP_CHROMA;
+  }
+}
+
+// Build cpi->prune_ref_frame_mask, the set of compound reference entries
+// (indices REF_FRAMES .. MODE_CTX_REF_FRAMES - 1) that are pruned.
+static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
+  const int onesided_comp_off = !cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+                                cpi->sf.inter_sf.disable_onesided_comp;
+  if (onesided_comp_off && cpi->all_one_sided_refs) {
+    // Disable all compound references
+    cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES);
+    return;
+  }
+
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode ||
+      cpi->sf.inter_sf.selective_ref_frame < 2) {
+    return;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int cur_hint = cm->current_frame.display_order_hint;
+  unsigned int *ref_hints = cm->cur_frame->ref_display_order_hint;
+  const int arf2_dist = av1_encoder_get_relative_dist(
+      ref_hints[ALTREF2_FRAME - LAST_FRAME], cur_hint);
+  const int bwd_dist = av1_encoder_get_relative_dist(
+      ref_hints[BWDREF_FRAME - LAST_FRAME], cur_hint);
+
+  // ALTREF2_FRAME is dropped when BWDREF_FRAME is enabled, both are future
+  // references, and BWDREF_FRAME is at least as close to the current frame.
+  const int drop_altref2 =
+      cpi->sf.inter_sf.selective_ref_frame >= 4 &&
+      (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME]) &&
+      arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist;
+
+  for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
+    MV_REFERENCE_FRAME rf[2];
+    av1_set_ref_frame(rf, ref_idx);
+
+    // Only pairs whose two references are both enabled are considered.
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]])) continue;
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) continue;
+
+    if (!cpi->all_one_sided_refs) {
+      const int dist0 = av1_encoder_get_relative_dist(
+          ref_hints[rf[0] - LAST_FRAME], cur_hint);
+      const int dist1 = av1_encoder_get_relative_dist(
+          ref_hints[rf[1] - LAST_FRAME], cur_hint);
+
+      // One-sided compound is used only when all reference frames are
+      // one-sided.
+      if ((dist0 > 0) == (dist1 > 0)) {
+        cpi->prune_ref_frame_mask |= 1 << ref_idx;
+      }
+    }
+
+    if (drop_altref2 && (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME)) {
+      cpi->prune_ref_frame_mask |= 1 << ref_idx;
+    }
+  }
+}
+
+// Returns nonzero when delta-q mode should be allowed. Outside realtime-only
+// builds this sums the delta RD cost reported by
+// av1_get_q_for_deltaq_objective() over all superblocks and allows the mode
+// only when the total is negative; realtime-only builds always return 1.
+static int allow_deltaq_mode(AV1_COMP *cpi) {
+#if !CONFIG_REALTIME_ONLY
+  AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int sb_mi_w = mi_size_wide[sb_size];
+  const int sb_mi_h = mi_size_high[sb_size];
+  int64_t total_delta_rdcost = 0;
+
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sb_mi_h) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sb_mi_w) {
+      int64_t sb_delta_rdcost = 0;
+      av1_get_q_for_deltaq_objective(cpi, &cpi->td, &sb_delta_rdcost, sb_size,
+                                     mi_row, mi_col);
+      total_delta_rdcost += sb_delta_rdcost;
+    }
+  }
+  return total_delta_rdcost < 0;
+#else
+  (void)cpi;
+  return 1;
+#endif  // !CONFIG_REALTIME_ONLY
+}
+
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates block level thresholds for force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+  if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+  // The block threshold scales with the square root of the block area:
+  //   thresh(bsize) = thresh(128x128) * sqrt(area(bsize) / area(128x128))
+  // For 128x128 blocks the threshold is 10000 (0.6103 per pixel); for 64x64
+  // blocks it is 10000 * sqrt(1/4) = 5000 (1.221 per pixel), allowing
+  // slightly higher error for smaller blocks. The per pixel thresholds for
+  // 32x32, 16x16,... therefore become 2.442, 4.884,.... As the per pixel
+  // error tends to be higher for small blocks, the same is clipped to 4.
+  const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+  const int num_128x128_pix =
+      block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+    const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+    // Area-scaled threshold, rounded to the nearest integer.
+    unsigned int blk_thresh =
+        (unsigned int)(thresh_exit_128x128_part *
+                           sqrt((double)num_block_pix / num_128x128_pix) +
+                       0.5);
+
+    // Clip to the maximum allowed per pixel difference.
+    const unsigned int blk_thresh_cap =
+        (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix);
+    if (blk_thresh_cap < blk_thresh) blk_thresh = blk_thresh_cap;
+
+    cpi->zeromv_skip_thresh_exit_part[bsize] = blk_thresh;
+  }
+}
+
+// Releases, via aom_free(), every scratch buffer referenced by the two buffer
+// sets: two hash-value buffers and three is-block-same buffers per set.
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+                                    int8_t *is_block_same[2][3]) {
+  for (int set = 0; set < 2; ++set) {
+    uint32_t **const hash_bufs = block_hash_values[set];
+    int8_t **const same_bufs = is_block_same[set];
+
+    aom_free(hash_bufs[0]);
+    aom_free(hash_bufs[1]);
+
+    aom_free(same_bufs[0]);
+    aom_free(same_bufs[1]);
+    aom_free(same_bufs[2]);
+  }
+}
+
+/*!\brief Encoder setup (only for the current frame), encoding, and
+ * reconstruction for a single frame
+ *
+ * \ingroup high_level_algo
+ */
+static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+#if CONFIG_FPMT_TEST
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+#endif
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
+ int i;
+
+ // The setup_mi callback is only invoked when non-RD pick mode is off.
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ mi_params->setup_mi(mi_params);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+ // Clear the per-frame counters that are accumulated while encoding.
+ av1_zero(*td->counts);
+ av1_zero(rdc->tx_type_used);
+ av1_zero(rdc->obmc_used);
+ av1_zero(rdc->warped_used);
+ av1_zero(rdc->seg_tmp_pred_cost);
+
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ features->allow_intrabc = 0;
+ }
+
+ // IntraBC additionally requires enable_intrabc in the key-frame config.
+ features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
+
+ // Turn warped motion off for this frame when the tracked warped probability
+ // for the current update type is below the pruning threshold.
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int warped_probability =
+#if CONFIG_FPMT_TEST
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? temp_frame_probs->warped_probs[update_type]
+ :
+#endif // CONFIG_FPMT_TEST
+ frame_probs->warped_probs[update_type];
+ if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh)
+ features->allow_warped_motion = 0;
+ }
+
+ // Build the intraBC hash table from the source frame. This is skipped in the
+ // stat generation stage, when hash ME is not in use, and in non-RD pick mode.
+ int hash_table_created = 0;
+ if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ // TODO(any): move this outside of the recoding loop to avoid recalculating
+ // the hash table.
+ // add to hash table
+ const int pic_width = cpi->source->y_crop_width;
+ const int pic_height = cpi->source->y_crop_height;
+ uint32_t *block_hash_values[2][2] = { { NULL } };
+ int8_t *is_block_same[2][3] = { { NULL } };
+ int k, j;
+ bool error = false;
+
+ // Allocate two sets of scratch buffers, each buffer holding
+ // pic_width * pic_height entries. Allocation stops at the first failure.
+ for (k = 0; k < 2 && !error; ++k) {
+ for (j = 0; j < 2; ++j) {
+ block_hash_values[k][j] = (uint32_t *)aom_malloc(
+ sizeof(*block_hash_values[0][0]) * pic_width * pic_height);
+ if (!block_hash_values[k][j]) {
+ error = true;
+ break;
+ }
+ }
+
+ for (j = 0; j < 3 && !error; ++j) {
+ is_block_same[k][j] = (int8_t *)aom_malloc(
+ sizeof(*is_block_same[0][0]) * pic_width * pic_height);
+ if (!is_block_same[k][j]) error = true;
+ }
+ }
+
+ av1_hash_table_init(intrabc_hash_info);
+ // On any allocation failure, free whatever scratch buffers were obtained
+ // before reporting the error.
+ if (error ||
+ !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+ free_block_hash_buffers(block_hash_values, is_block_same);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating intrabc_hash_table and buffers");
+ }
+ // Remembered so the table is destroyed at the end of this function.
+ hash_table_created = 1;
+ av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
+ block_hash_values[0], is_block_same[0]);
+ // Hash data generated for screen contents is used for intraBC ME
+ const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
+ const int max_sb_size =
+ (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
+ int src_idx = 0;
+ // Block size doubles on every pass; the two buffer sets alternate between
+ // source and destination roles (src_idx flips each iteration).
+ for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
+ const int dst_idx = !src_idx;
+ av1_generate_block_hash_value(
+ intrabc_hash_info, cpi->source, size, block_hash_values[src_idx],
+ block_hash_values[dst_idx], is_block_same[src_idx],
+ is_block_same[dst_idx]);
+ // Only sizes of at least the mi allocation block width are added to the
+ // hash table.
+ if (size >= min_alloc_size) {
+ if (!av1_add_to_hash_map_by_row_with_precal_data(
+ &intrabc_hash_info->intrabc_hash_table,
+ block_hash_values[dst_idx], is_block_same[dst_idx][2],
+ pic_width, pic_height, size)) {
+ error = true;
+ break;
+ }
+ }
+ }
+
+ // The scratch buffers are released before the error (if any) is reported.
+ free_block_hash_buffers(block_hash_values, is_block_same);
+
+ if (error) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error adding data to intrabc_hash_table");
+ }
+ }
+
+ // Derive, per segment, the qindex, the lossless flag and the trellis
+ // optimization setting.
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex =
+ cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex)
+ : quant_params->base_qindex;
+ // A segment is lossless only when its qindex and all dc/ac delta-q values
+ // checked here are zero.
+ xd->lossless[i] =
+ qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+ quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+ quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
+ if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1;
+ xd->qindex[i] = qindex;
+ if (xd->lossless[i]) {
+ cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT;
+ } else {
+ cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients;
+ }
+ }
+ features->coded_lossless = is_coded_lossless(cm, xd);
+ features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
+
+ // Fix delta q resolution for the moment
+
+ cm->delta_q_info.delta_q_res = 0;
+ if (cpi->use_ducky_encode) {
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE;
+ } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_HDR)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ // Set delta_q_present_flag before it is used for the first time
+ cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
+ cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
+
+ // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q
+ // is used for ineligible frames. That effectively will turn off row_mt
+ // usage. Note objective delta_q and tpl eligible frames are only altref
+ // frames currently.
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (cm->delta_q_info.delta_q_present_flag) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE)
+ cm->delta_q_info.delta_q_present_flag = 0;
+
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cm->delta_q_info.delta_q_present_flag) {
+ cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi);
+ }
+ }
+
+ // Reset delta_q_used flag
+ cpi->deltaq_used = 0;
+
+ cm->delta_q_info.delta_lf_present_flag =
+ cm->delta_q_info.delta_q_present_flag &&
+ oxcf->tool_cfg.enable_deltalf_mode;
+ cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+ // update delta_q_present_flag and delta_lf_present_flag based on
+ // base_qindex
+ cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+ cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+ } else if (cpi->cyclic_refresh->apply_cyclic_refresh ||
+ cpi->svc.number_temporal_layers == 1) {
+ // Cyclic-refresh AQ path: reset the per-frame segment block counters.
+ cpi->cyclic_refresh->actual_num_seg1_blocks = 0;
+ cpi->cyclic_refresh->actual_num_seg2_blocks = 0;
+ }
+ cpi->rc.cnt_zeromv = 0;
+
+ av1_frame_init_quantizer(cpi);
+ init_encode_frame_mb_context(cpi);
+ set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+
+ // The previous frame's segmentation map is only used when that frame had
+ // segmentation enabled.
+ if (cm->prev_frame && cm->prev_frame->seg.enabled)
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ else
+ cm->last_frame_seg_map = NULL;
+ // Loop-filter deltas: defaults for intrabc/coded-lossless frames, otherwise
+ // inherited from the previous frame when one exists. The result is then
+ // stored in the current frame buffer.
+ if (features->allow_intrabc || features->coded_lossless) {
+ av1_set_default_ref_deltas(cm->lf.ref_deltas);
+ av1_set_default_mode_deltas(cm->lf.mode_deltas);
+ } else if (cm->prev_frame) {
+ memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+ cpi->all_one_sided_refs =
+ frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+ cpi->prune_ref_frame_mask = 0;
+ // Figure out which ref frames can be skipped at frame level.
+ setup_prune_ref_frame_mask(cpi);
+
+ x->txfm_search_info.txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+ x->txfm_search_info.tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_compute_global_motion_time);
+#endif
+ av1_compute_global_motion_facade(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_compute_global_motion_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_setup_motion_field_time);
+#endif
+ av1_calculate_ref_frame_side(cm);
+ if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_setup_motion_field_time);
+#endif
+
+ cm->current_frame.skip_mode_info.skip_mode_flag =
+ check_skip_mode_enabled(cpi);
+
+ // Initialization of skip mode cost depends on the value of
+ // 'skip_mode_flag'. This initialization happens in the function
+ // av1_fill_mode_rates(), which is in turn called in
+ // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts()
+ // has to be called after 'skip_mode_flag' is initialized.
+ av1_initialize_rd_consts(cpi);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+ populate_thresh_to_force_zeromv_skip(cpi);
+
+ // Start with the dummy row-sync callbacks and row-MT disabled; both are
+ // overridden below when row-MT is actually used.
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+ mt_info->row_mt_enabled = 0;
+ mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS],
+ cm->tiles.cols * cm->tiles.rows) > 1;
+
+ // Encode the tiles: row-MT when configured and more than one worker is
+ // available, tile-MT when more than one worker and tile can be used,
+ // otherwise on the calling thread.
+ if (oxcf->row_mt && (mt_info->num_workers > 1)) {
+ mt_info->row_mt_enabled = 1;
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_encode_tiles_row_mt(cpi);
+ } else {
+ if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) {
+ av1_encode_tiles_mt(cpi);
+ } else {
+ // Preallocate the pc_tree for realtime coding to reduce the cost of
+ // memory allocation.
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ if (use_nonrd_mode) {
+ td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ td->pc_root = NULL;
+ }
+
+ encode_tiles(cpi);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ }
+ }
+
+ // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+ if (features->allow_intrabc && !cpi->intrabc_used) {
+ features->allow_intrabc = 0;
+ }
+ if (features->allow_intrabc) {
+ cm->delta_q_info.delta_lf_present_flag = 0;
+ }
+
+ // Likewise drop delta_q_present_flag when delta-q was never used.
+ if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+ cm->delta_q_info.delta_q_present_flag = 0;
+ }
+
+ // Set the transform size appropriately before bitstream creation
+ const MODE_EVAL_TYPE eval_type =
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch
+ ? WINNER_MODE_EVAL
+ : DEFAULT_EVAL;
+ const TX_SIZE_SEARCH_METHOD tx_search_type =
+ cpi->winner_mode_params.tx_size_search_methods[eval_type];
+ assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
+ features->tx_mode = select_tx_mode(cm, tx_search_type);
+
+ // Retain the frame level probability update conditions for parallel frames.
+ // These conditions will be consumed during postencode stage to update the
+ // probability.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] =
+ cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats;
+ cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] =
+ (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX);
+ cpi->do_update_frame_probs_warp[cpi->num_frame_recode] =
+ (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0);
+ cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] =
+ (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE);
+ }
+
+ // Update the frame-level tx-type probabilities from this frame's usage
+ // counts. The stored value becomes the average of the old probability and
+ // the newly measured one; index 0 absorbs the remainder so that each row
+ // sums to MAX_TX_TYPE_PROB.
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats ||
+ ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh !=
+ INT_MAX) &&
+ (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ int sum = 0;
+ int j;
+ int left = MAX_TX_TYPE_PROB;
+
+ for (j = 0; j < TX_TYPES; j++)
+ sum += cpi->td.rd_counts.tx_type_used[i][j];
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ int update_txtype_frameprobs = 1;
+ // With no samples, all probability mass goes to index 0.
+ const int new_prob =
+ sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum
+ : (j ? 0 : MAX_TX_TYPE_PROB);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] =
+ prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ update_txtype_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_txtype_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .tx_type_probs[update_type][i][j] = new_prob;
+ }
+ if (update_txtype_frameprobs) {
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+
+ // Segment-map temporal prediction is enabled unless seg_tmp_pred_cost[0] is
+ // lower than seg_tmp_pred_cost[1].
+ if (cm->seg.enabled) {
+ cm->seg.temporal_update = 1;
+ if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1])
+ cm->seg.temporal_update = 0;
+ }
+
+ // Update the per-block-size OBMC probabilities (scale 128) by averaging the
+ // stored value with the fraction measured on this frame.
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ int sum = 0;
+ int update_obmc_frameprobs = 1;
+ for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
+
+ const int new_prob =
+ sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ }
+ update_obmc_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_obmc_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] =
+ new_prob;
+ }
+ if (update_obmc_frameprobs) {
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+ }
+ }
+ }
+
+ // Update the warped-motion probability (scale 128) in the same way.
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int update_warp_frameprobs = 1;
+ int sum = 0;
+ for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
+ const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ }
+ }
+ update_warp_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_warp_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] =
+ new_prob;
+ }
+ if (update_warp_frameprobs) {
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+ }
+ }
+
+ // Update the switchable interpolation filter probabilities (scale 1536).
+ // As for tx types, index 0 absorbs the remainder so each context sums to
+ // 1536.
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int sum = 0;
+ int j;
+ int left = 1536;
+
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ sum += cpi->td.counts->switchable_interp[i][j];
+ }
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ int update_interpfilter_frameprobs = 1;
+ const int new_prob =
+ sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
+ : (j ? 0 : 1536);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ update_interpfilter_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_interpfilter_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .switchable_interp_probs[update_type][i][j] = new_prob;
+ }
+ if (update_interpfilter_frameprobs) {
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+ // Release the intraBC hash table if one was built earlier in this call.
+ if (hash_table_created) {
+ av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+ }
+}
+
+/*!\brief Setup reference frame buffers and encode a frame
+ *
+ * Clamps the segmentation map to the last active segment id, prepares the
+ * reference buffers and sign biases, and then runs encode_frame_internal().
+ * When frame parameter update (or non-RD compound reference) is enabled, the
+ * reference mode, skip mode and tx mode are re-evaluated after encoding based
+ * on what was actually used.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    cpi    Top-level encoder structure
+ */
+void av1_encode_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Selects the default reduced ext-tx set instead of the potential full set
+  // of 16 transforms.
+  features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set;
+
+  // Clamp every segment id in the map to last_active_segid.
+  if (cm->seg.enabled && cm->seg.update_map) {
+    const int mi_rows = cm->mi_params.mi_rows;
+    const int mi_cols = cm->mi_params.mi_cols;
+    const int last_active_segid = cm->seg.last_active_segid;
+    uint8_t *seg_row = cpi->enc_seg.map;
+    for (int r = 0; r < mi_rows; ++r, seg_row += mi_cols) {
+      for (int c = 0; c < mi_cols; ++c) {
+        if (seg_row[c] > last_active_segid) {
+          seg_row[c] = (uint8_t)last_active_segid;
+        }
+      }
+    }
+  }
+
+  av1_setup_frame_buf_refs(cm);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags,
+                         cm->cur_frame->ref_display_order_hint,
+                         cm->current_frame.display_order_hint);
+  set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info,
+                     cpi->ref_frame_flags);
+  av1_setup_frame_sign_bias(cm);
+
+  // With global motion enabled, every buffer used as a source or a reference
+  // must have an image pyramid allocated. Verify that here so problems are
+  // caught early in debug builds.
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+  if (cpi->image_pyramid_levels > 0) {
+    assert(cpi->source->y_pyramid);
+    for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+      assert(buf == NULL || buf->buf.y_pyramid);
+    }
+  }
+#endif  // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_reset_frame(av1_num_planes(cm));
+#endif
+
+  rdc->newmv_or_intra_blocks = 0;
+  cpi->palette_pixel_num = 0;
+
+  const int select_frame_params = cpi->sf.hl_sf.frame_parameter_update ||
+                                  cpi->sf.rt_sf.use_comp_ref_nonrd;
+  if (!select_frame_params) {
+    // Needed when the real-time speed setting changes on the fly from one
+    // using compound prediction to one using single reference.
+    if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+      current_frame->reference_mode = SINGLE_REFERENCE;
+    encode_frame_internal(cpi);
+    return;
+  }
+
+  current_frame->reference_mode =
+      frame_is_intra_only(cm) ? SINGLE_REFERENCE : REFERENCE_MODE_SELECT;
+
+  features->interp_filter =
+      cm->tiles.large_scale ? EIGHTTAP_REGULAR : SWITCHABLE;
+
+  features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+      features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+  rdc->compound_ref_used_flag = 0;
+  rdc->skip_mode_used_flag = 0;
+
+  encode_frame_internal(cpi);
+
+  // Fall back to single reference when no block used compound prediction
+  // (the flag covers 4x4 blocks as well).
+  if (current_frame->reference_mode == REFERENCE_MODE_SELECT &&
+      rdc->compound_ref_used_flag == 0) {
+    current_frame->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+    av1_zero(cpi->td.counts->comp_inter);
+#endif  // CONFIG_ENTROPY_STATS
+  }
+
+  // The reference mode may have changed above, so re-check skip mode.
+  SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+  if (frame_is_intra_only(cm) ||
+      current_frame->reference_mode == SINGLE_REFERENCE) {
+    skip_mode_info->skip_mode_allowed = 0;
+    skip_mode_info->skip_mode_flag = 0;
+  }
+  if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+    skip_mode_info->skip_mode_flag = 0;
+
+  // TX_MODE_SELECT is pointless when no transform block was ever split.
+  if (!cm->tiles.large_scale && features->tx_mode == TX_MODE_SELECT &&
+      cpi->td.mb.txfm_search_info.txb_split_count == 0) {
+    features->tx_mode = TX_MODE_LARGEST;
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..ce32fb47e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#include "av1/encoder/global_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DELTA_Q_PERCEPTUAL_MODULATION \
+ 1 // 0: variance based
+ // 1: wavelet AC energy based
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+void av1_init_rtc_counters(struct macroblock *const x);
+
+void av1_accumulate_rtc_counters(struct AV1_COMP *cpi,
+ const struct macroblock *const x);
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col, const int num_planes, BLOCK_SIZE bsize);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.c b/third_party/aom/av1/encoder/encodeframe_utils.c
new file mode 100644
index 0000000000..949837184a
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.c
@@ -0,0 +1,1775 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/rdopt.h"
+
+// Scales '*rdmult' by the geometric mean of the SSIM rdmult scaling factors
+// (cpi->ssim_rdmult_scaling_factors, stored on a 16x16 grid) covered by the
+// block of size 'bsize' at (mi_row, mi_col). The result is clamped to be
+// non-negative, written back to '*rdmult', and '*errorperbit' is refreshed
+// from it through av1_set_error_per_bit().
+//
+// Must only be called when the encoder is tuned for SSIM (asserted below).
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const BLOCK_SIZE bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // Rows are measured in units of the base block height and columns in units
+  // of the base block width. (The two were previously swapped, which only
+  // worked because bsize_base is square.)
+  const int row_start = mi_row / num_mi_h;
+  const int col_start = mi_col / num_mi_w;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 1.0;
+
+  // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least
+  // BLOCK_8X8.
+  //
+  // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would
+  // iterate 256 times. Considering the maximum value of
+  // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()),
+  // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX
+  // (maximum value a double data type can hold). If bsize_base is modified to
+  // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up
+  // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow.
+  assert(bsize_base >= BLOCK_8X8);
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+
+  // Accumulate the product of the factors of every base block that lies
+  // both inside the frame grid and inside the current block.
+  for (row = row_start; row < num_rows && row < row_start + num_brows; ++row) {
+    for (col = col_start; col < num_cols && col < col_start + num_bcols;
+         ++col) {
+      const int index = row * num_cols + col;
+      assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0);
+      geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index];
+      num_of_mi += 1.0;
+    }
+  }
+  // N-th root of the product gives the geometric mean.
+  geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi));
+
+  // Round to nearest and keep the multiplier non-negative.
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(errorperbit, *rdmult);
+}
+
+#if CONFIG_SALIENCY_MAP
+// Multiplies '*rdmult' by the saliency-map scaling factor of the block of
+// size 'bsize' that contains (mi_row, mi_col). cpi->sm_scaling_factor is
+// indexed in raster order on a grid whose cells are 'bsize' blocks. The
+// product is truncated to int, clamped to be non-negative and stored back in
+// '*rdmult'; '*errorperbit' is then refreshed via av1_set_error_per_bit().
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+                                      int *errorperbit, const BLOCK_SIZE bsize,
+                                      const int mi_row, const int mi_col,
+                                      int *const rdmult) {
+  const int blk_mi_w = mi_size_wide[bsize];
+  const int blk_mi_h = mi_size_high[bsize];
+  // Number of 'bsize' blocks per frame row (rounded up).
+  const int blks_per_row =
+      (cpi->common.mi_params.mi_cols + blk_mi_w - 1) / blk_mi_w;
+  const int blk_index = (mi_row / blk_mi_h) * blks_per_row + (mi_col / blk_mi_w);
+
+  const int scaled = (int)(*rdmult * cpi->sm_scaling_factor[blk_index]);
+  *rdmult = (scaled > 0) ? scaled : 0;
+
+  av1_set_error_per_bit(errorperbit, *rdmult);
+}
+#endif
+
+// TODO(angiebird): Move these functions to tpl_model.c
+#if !CONFIG_REALTIME_ONLY
+// Returns the end column (exclusive) of the superblock containing 'mi_col',
+// expressed in TPL blocks that are 'num_mi_w' mi units wide. Both the
+// superblock start and its width are mapped to the superres-upscaled domain
+// before being added, then the sum is divided by 'num_mi_w' rounding up.
+static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
+                                         int num_mi_w) {
+  const int sb_log2 = cm->seq_params->mib_size_log2;
+
+  // First mi column of this superblock: mi_col aligned down to the superblock
+  // size, then converted to the upscaled domain.
+  const int sb_start_sr = coded_to_superres_mi((mi_col >> sb_log2) << sb_log2,
+                                               cm->superres_scale_denominator);
+
+  // Superblock width in mi units, also converted to the upscaled domain.
+  const int sb_width_sr = coded_to_superres_mi(
+      mi_size_wide[cm->seq_params->sb_size], cm->superres_scale_denominator);
+
+  // One-past-the-end mi column of the superblock.
+  const int sb_end_sr = sb_start_sr + sb_width_sr;
+
+  // Ceiling division converts mi units to TPL block units.
+  return (sb_end_sr + num_mi_w - 1) / num_mi_w;
+}
+
+// Computes an rdmult for the coding block of size 'bsize' at (mi_row, mi_col).
+//
+// The starting point is set_rdmult(cpi, x, -1). When TPL statistics are
+// usable it is scaled by rk / x->rb, where rk is exp() of the
+// srcrf_dist-weighted mean of
+//   log(dist_scaled) - log(3 * dist_scaled + mc_dep_delta)
+// taken over the TPL blocks covered by the coding block. The scaled value is
+// truncated to int and clamped to be at least 1.
+//
+// The unscaled starting value is returned as-is when: the TPL stats for the
+// current gf_frame_index are not ready, superres_scale_denominator differs
+// from SCALE_NUMERATOR, an AQ mode other than NO_AQ is selected, x->rb is 0,
+// or the accumulated weight (sum of srcrf_dist) is 0.
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ int deltaq_rdmult = set_rdmult(cpi, x, -1);
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+ if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+ // x->rb is used as a divisor below.
+ if (x->rb == 0) return deltaq_rdmult;
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 0;
+ // Distance, in mi units, between consecutive TPL stats entries.
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += step) {
+ // Skip positions that fall outside the frame.
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ // Weight of this TPL block in the weighted log-mean.
+ double cbcmp = (double)this_stats->srcrf_dist;
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ // NOTE(review): log(dist_scaled) is -inf when recrf_dist is 0; confirm
+ // whether recrf_dist is guaranteed to be positive here.
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ // No weight accumulated: nothing to scale by.
+ if (cbcmp_base == 0) return deltaq_rdmult;
+
+ double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+ deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb));
+
+ return AOMMAX(deltaq_rdmult, 1);
+}
+
+// Returns 'orig_rdmult' scaled by the geometric mean of the TPL rdmult
+// scaling factors (cpi->ppi->tpl_sb_rdmult_scaling_factors, stored on a 16x16
+// grid in the superres-upscaled domain) covered by the block of size 'bsize'
+// at (mi_row, mi_col). The result is rounded to nearest, clamped to be
+// non-negative, and x->errorperbit is updated from it.
+//
+// When the TPL stats for the current gf_frame_index are not ready, the frame
+// is not TPL eligible, or an AQ mode other than NO_AQ is selected, the value
+// of set_rdmult(cpi, x, -1) is returned instead and x->errorperbit is not
+// touched by this function.
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const int tpl_idx = cpi->gf_frame_index;
+  const int deltaq_rdmult = set_rdmult(cpi, x, -1);
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
+    return deltaq_rdmult;
+  if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+
+  // Horizontal quantities are converted to the superres-upscaled domain.
+  const int mi_col_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int block_mi_width_sr =
+      coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
+
+  const BLOCK_SIZE bsize_base = BLOCK_16X16;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // This is required because the end col of superblock may be off by 1 in case
+  // of superres.
+  const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w);
+  // Rows are measured in units of the base block height and columns in units
+  // of the base block width. (The two were previously swapped, which only
+  // worked because bsize_base is square.)
+  const int row_start = mi_row / num_mi_h;
+  const int col_start = mi_col_sr / num_mi_w;
+  int row, col;
+  double base_block_count = 0.0;
+  double geom_mean_of_scale = 0.0;
+  // Sum the logs of the factors; exp(mean of logs) is the geometric mean.
+  for (row = row_start; row < num_rows && row < row_start + num_brows; ++row) {
+    for (col = col_start; col < num_cols && col < col_start + num_bcols &&
+                          col < sb_bcol_end;
+         ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]);
+      base_block_count += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+  int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+  rdmult = AOMMAX(rdmult, 0);
+  av1_set_error_per_bit(&x->errorperbit, rdmult);
+#if !CONFIG_RD_COMMAND
+  // Consistency check: at superblock size the result is expected to match
+  // what set_rdmult() computes.
+  if (bsize == cm->seq_params->sb_size) {
+    const int rdmult_sb = set_rdmult(cpi, x, -1);
+    assert(rdmult_sb == rdmult);
+    (void)rdmult_sb;
+  }
+#endif  // !CONFIG_RD_COMMAND
+  return rdmult;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Increments counts->switchable_interp[ctx][filter] once for each of the two
+// filter directions, where 'filter' is extracted from mbmi->interp_filters
+// and 'ctx' is the switchable-interp prediction context for that direction.
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+                                                const MACROBLOCKD *xd,
+                                                const MB_MODE_INFO *mbmi) {
+  for (int dir = 0; dir < 2; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    const InterpFilter filter =
+        av1_extract_interp_filter(mbmi->interp_filters, dir);
+
+    // The counter table only has room for the switchable filter types.
+    assert(filter < SWITCHABLE_FILTERS);
+    counts->switchable_interp[ctx][filter] += 1;
+  }
+}
+
+// Copies the best reference-mode information held in 'mbmi_ext_best'
+// (MB_MODE_INFO_EXT_FRAME) into 'mbmi_ext' (MB_MODE_INFO_EXT). The MV stack,
+// weights, mode context and MV count go into the slot selected by
+// 'ref_frame_type'; the global MVs are copied as a whole.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+  // Byte size of a single per-reference row of each table.
+  const size_t mv_stack_row_bytes = sizeof(mbmi_ext->ref_mv_stack[0]);
+  const size_t weight_row_bytes = sizeof(mbmi_ext->weight[0]);
+
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         mv_stack_row_bytes);
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         weight_row_bytes);
+
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+// Commits the mode decision stored in 'ctx' for the block of size 'bsize' at
+// (mi_row, mi_col) into the encoder state reachable from 'td'.
+//
+// Regardless of 'dry_run' it: copies ctx->mic into xd->mi[0], restores the
+// reference-MV info, block-skip flags and tx-type map, handles segment ids
+// when segmentation is enabled, points the plane coefficient buffers at the
+// ones stored in 'ctx', and fills the mi grid entries covered by the block.
+// When 'dry_run' is zero it additionally copies tx types into the frame-level
+// buffer, accumulates segment-id prediction costs, the zero-MV counter and
+// filter-type counts, and (if enabled) stores frame MVs for later reference.
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const MB_MODE_INFO *const mi = &ctx->mic;
+ MB_MODE_INFO *const mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int bw = mi_size_wide[mi->bsize];
+ const int bh = mi_size_high[mi->bsize];
+ const int mis = mi_params->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ assert(mi->bsize == bsize);
+
+ // Install the picked mode info and its associated reference-MV data.
+ *mi_addr = *mi;
+ copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best,
+ av1_ref_frame_type(ctx->mic.ref_frame));
+
+ memcpy(txfm_info->blk_skip, ctx->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+
+ txfm_info->skip_txfm = ctx->rd_stats.skip_txfm;
+
+ // By default xd reads tx types straight from the context's block-local map.
+ xd->tx_type_map = ctx->tx_type_map;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+ // If not dry_run, copy the transform type data into the frame level buffer.
+ // Encoder will fetch tx types when writing bitstream.
+ if (!dry_run) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+ const int mi_stride = mi_params->mi_stride;
+ for (int blk_row = 0; blk_row < bh; ++blk_row) {
+ av1_copy_array(tx_type_map + blk_row * mi_stride,
+ xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+ }
+ // From here on xd refers to the frame-level copy.
+ xd->tx_type_map = tx_type_map;
+ xd->tx_type_map_stride = mi_stride;
+ }
+
+ // If segmentation in use
+ if (seg->enabled) {
+ // For in frame complexity AQ copy the segment id from the segment map.
+ if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ mi_addr->segment_id =
+ map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+ }
+ // Else for cyclic refresh mode update the segment map, set the segment id
+ // and then update the quantizer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ !cpi->rc.rtc_external_ratectrl) {
+ av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize,
+ ctx->rd_stats.rate, ctx->rd_stats.dist,
+ txfm_info->skip_txfm, dry_run);
+ }
+ // Fall back to DC prediction for chroma when CfL is not allowed here.
+ if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+ mi_addr->uv_mode = UV_DC_PRED;
+
+ // Accumulate the cost of signalling the segment id with spatial
+ // prediction ([0]) and with temporal prediction ([1]).
+ if (!dry_run && !mi_addr->skip_txfm) {
+ int cdf_num;
+ const uint8_t spatial_pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const uint8_t coded_id = av1_neg_interleave(
+ mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1);
+ int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id];
+ td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost;
+
+ const int pred_segment_id =
+ cm->last_frame_seg_map
+ ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+ mi_col)
+ : 0;
+ const int use_tmp_pred = pred_segment_id == mi_addr->segment_id;
+ const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
+ td->rd_counts.seg_tmp_pred_cost[1] +=
+ x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred];
+ // When temporal prediction misses, the spatial cost is added on top.
+ if (!use_tmp_pred) {
+ td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost;
+ }
+ }
+ }
+
+ // Count zero motion vector.
+ if (!dry_run && !frame_is_intra_only(cm)) {
+ const MV mv = mi->mv[0].as_mv;
+ // "Zero" here means both MV components have magnitude below 8 and the
+ // block predicts from LAST_FRAME.
+ if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME &&
+ abs(mv.row) < 8 && abs(mv.col) < 8) {
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+ // Accumulate low_content_frame.
+ for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1;
+ }
+ }
+
+ // Point the per-plane coefficient buffers at the ones stored in 'ctx'.
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+ // Restore the coding context of the MB to that which was in place
+ // when the mode was picked for it
+
+ // Number of mi columns/rows of the block to fill, limited using
+ // xd->mb_to_right_edge and xd->mb_to_bottom_edge.
+ const int cols =
+ AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width);
+ const int rows = AOMMIN(
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height);
+ // Make every covered mi grid entry refer to this block's mode info.
+ for (y = 0; y < rows; y++) {
+ for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.q_cfg.aq_mode)
+ av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0);
+
+ // Everything below only runs for the final (non dry-run) encode.
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ // Maps an intra prediction mode to its THR_* mode index.
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D113_PRED /*D113_PRED*/,
+ THR_D157_PRED /*D157_PRED*/,
+ THR_D203_PRED /*D203_PRED*/,
+ THR_D67_PRED /*D67_PRED*/,
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+ THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+ THR_PAETH /*PAETH_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+ } else {
+ // Note how often each mode chosen as best
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) {
+ // When the frame interp filter is SWITCHABLE, several cases that always
+ // use the default type (EIGHTTAP_REGULAR) are described in
+ // av1_is_interp_needed(). Here, we should keep the counts for all
+ // applicable blocks, so the frame filter resetting decision in
+ // fix_interp_filter() is made correctly.
+ update_filter_type_count(td->counts, xd, mi_addr);
+ }
+ }
+
+ // Block extent in mi units, clipped to the frame dimensions.
+ const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+ if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+// Adapts the inter-mode CDFs in 'fc' for a coded single-reference inter mode.
+// The mode is signalled as a cascade of up to three binary decisions, each
+// with its own context extracted from 'mode_context':
+//   1. NEWMV or not          (newmv_cdf)
+//   2. GLOBALMV or not       (zeromv_cdf)
+//   3. NEARESTMV or not      (refmv_cdf)
+// The cascade stops at the first decision that identifies the mode. When
+// CONFIG_ENTROPY_STATS is enabled the matching counters in 'counts' are
+// incremented as well; otherwise 'counts' is unused.
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                 PREDICTION_MODE mode, int16_t mode_context) {
+  (void)counts;
+
+  // Decision 1: symbol 0 means NEWMV.
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int not_newmv = (mode != NEWMV);
+#if CONFIG_ENTROPY_STATS
+  ++counts->newmv_mode[newmv_ctx][not_newmv];
+#endif
+  update_cdf(fc->newmv_cdf[newmv_ctx], not_newmv, 2);
+  if (!not_newmv) return;
+
+  // Decision 2: symbol 0 means GLOBALMV.
+  const int16_t globalmv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  const int not_globalmv = (mode != GLOBALMV);
+#if CONFIG_ENTROPY_STATS
+  ++counts->zeromv_mode[globalmv_ctx][not_globalmv];
+#endif
+  update_cdf(fc->zeromv_cdf[globalmv_ctx], not_globalmv, 2);
+  if (!not_globalmv) return;
+
+  // Decision 3: symbol 0 means NEARESTMV.
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+  ++counts->refmv_mode[refmv_ctx][mode != NEARESTMV];
+#endif
+  update_cdf(fc->refmv_cdf[refmv_ctx], mode != NEARESTMV, 2);
+}
+
+// Adapts the palette CDFs of the tile context (xd->tile_ctx) for 'mbmi'.
+// For luma (only when the Y mode is DC_PRED) and for chroma (only when the UV
+// mode is UV_DC_PRED) it updates the "palette used" flag CDF and, when a
+// palette is used, the palette-size CDF. With CONFIG_ENTROPY_STATS enabled
+// the matching counters in 'counts' are incremented too; otherwise 'counts'
+// is unused.
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                               FRAME_COUNTS *counts) {
+  FRAME_CONTEXT *const tile_fc = xd->tile_ctx;
+  const PALETTE_MODE_INFO *const palette = &mbmi->palette_mode_info;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(mbmi->bsize);
+
+  (void)counts;
+
+  // Luma palette.
+  if (mbmi->mode == DC_PRED) {
+    const int y_colors = palette->palette_size[0];
+    const int y_mode_ctx = av1_get_palette_mode_ctx(xd);
+    const int y_uses_palette = y_colors > 0;
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_y_mode[bsize_ctx][y_mode_ctx][y_uses_palette];
+#endif
+    update_cdf(tile_fc->palette_y_mode_cdf[bsize_ctx][y_mode_ctx],
+               y_uses_palette, 2);
+    if (y_uses_palette) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_y_size[bsize_ctx][y_colors - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(tile_fc->palette_y_size_cdf[bsize_ctx],
+                 y_colors - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+
+  // Chroma palette; its flag context is whether luma uses a palette.
+  if (mbmi->uv_mode == UV_DC_PRED) {
+    const int uv_colors = palette->palette_size[1];
+    const int uv_mode_ctx = (palette->palette_size[0] > 0);
+    const int uv_uses_palette = uv_colors > 0;
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_uv_mode[uv_mode_ctx][uv_uses_palette];
+#endif
+    update_cdf(tile_fc->palette_uv_mode_cdf[uv_mode_ctx], uv_uses_palette, 2);
+
+    if (uv_uses_palette) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_uv_size[bsize_ctx][uv_colors - PALETTE_MIN_SIZE];
+#endif
+      update_cdf(tile_fc->palette_uv_size_cdf[bsize_ctx],
+                 uv_colors - PALETTE_MIN_SIZE, PALETTE_SIZES);
+    }
+  }
+}
+
+// Adapts the tile CDFs (xd->tile_ctx) for the intra syntax elements of
+// 'mbmi' and, when CONFIG_ENTROPY_STATS is enabled, increments the matching
+// counters in 'counts' (otherwise 'counts' is unused).
+//
+// Elements covered, in order: Y mode (keyed on the above/left neighbours when
+// 'intraonly' is set, on the block size group otherwise), filter-intra flag
+// and mode, Y angle delta, and - only if xd->is_chroma_ref - UV mode, CfL
+// signs/alphas, UV angle delta and palette information.
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ (void)counts;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ // Y prediction mode.
+ if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+ }
+
+ // Filter-intra flag, plus the filter-intra mode when the flag is set.
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+ ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode];
+ if (use_filter_intra_mode) {
+ ++counts
+ ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+ }
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2);
+ if (use_filter_intra_mode) {
+ update_cdf(fc->filter_intra_mode_cdf,
+ mbmi->filter_intra_mode_info.filter_intra_mode,
+ FILTER_INTRA_MODES);
+ }
+ }
+ // Y angle delta; the symbol is the delta offset by MAX_ANGLE_DELTA.
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[mbmi->mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+
+ // The remaining elements are skipped unless xd->is_chroma_ref is set.
+ if (!xd->is_chroma_ref) return;
+
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
+#endif // CONFIG_ENTROPY_STATS
+ // The UV mode alphabet has one symbol fewer when CfL is not allowed.
+ update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+ UV_INTRA_MODES - !cfl_allowed);
+ if (uv_mode == UV_CFL_PRED) {
+ const int8_t joint_sign = mbmi->cfl_alpha_signs;
+ const uint8_t idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_sign[joint_sign];
+#endif
+ update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+ // An alpha is only accounted for a plane whose sign is non-zero.
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+ update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+ update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+ }
+ }
+ // UV angle delta, using the intra mode that corresponds to the UV mode.
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[intra_mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ update_palette_cdf(xd, mbmi, counts);
+ }
+}
+
+// Restores the above/left contexts of the block of size 'bsize' at
+// (mi_row, mi_col) from the snapshot in 'ctx': the per-plane entropy
+// contexts, the partition contexts, and the transform-size contexts. The
+// txfm context pointers of 'xd' are first reset to the ones saved in 'ctx'
+// and the saved values are then copied through them.
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int blk_w = mi_size_wide[bsize];
+  const int blk_h = mi_size_high[bsize];
+  // Left contexts are addressed relative to the superblock row.
+  const int sb_row_off = mi_row & MAX_MIB_MASK;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    // Each plane owns a blk_w / blk_h sized slice of the snapshot; offsets
+    // and copy sizes into xd shrink with the plane's subsampling.
+    memcpy(xd->above_entropy_context[plane] + (mi_col >> ss_x),
+           ctx->a + blk_w * plane, (sizeof(ENTROPY_CONTEXT) * blk_w) >> ss_x);
+    memcpy(xd->left_entropy_context[plane] + (sb_row_off >> ss_y),
+           ctx->l + blk_h * plane, (sizeof(ENTROPY_CONTEXT) * blk_h) >> ss_y);
+  }
+
+  memcpy(xd->above_partition_context + mi_col, ctx->sa,
+         sizeof(*xd->above_partition_context) * blk_w);
+  memcpy(xd->left_partition_context + sb_row_off, ctx->sl,
+         sizeof(xd->left_partition_context[0]) * blk_h);
+
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * blk_w);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * blk_h);
+}
+
+// Snapshots into 'ctx' the above/left contexts of the block of size 'bsize'
+// at (mi_row, mi_col): the per-plane entropy contexts, the partition
+// contexts, the transform-size contexts, and the current txfm context
+// pointers of 'xd' so that av1_restore_context() can reinstate them.
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                      int mi_row, int mi_col, BLOCK_SIZE bsize,
+                      const int num_planes) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int blk_w = mi_size_wide[bsize];
+  const int blk_h = mi_size_high[bsize];
+  // Left contexts are addressed relative to the superblock row.
+  const int sb_row_off = mi_row & MAX_MIB_MASK;
+
+  // Buffer the above/left entropy contexts of the block in search; each
+  // plane gets its own blk_w / blk_h sized slice of the snapshot.
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    memcpy(ctx->a + blk_w * plane,
+           xd->above_entropy_context[plane] + (mi_col >> ss_x),
+           (sizeof(ENTROPY_CONTEXT) * blk_w) >> ss_x);
+    memcpy(ctx->l + blk_h * plane,
+           xd->left_entropy_context[plane] + (sb_row_off >> ss_y),
+           (sizeof(ENTROPY_CONTEXT) * blk_h) >> ss_y);
+  }
+
+  memcpy(ctx->sa, xd->above_partition_context + mi_col,
+         sizeof(*xd->above_partition_context) * blk_w);
+  memcpy(ctx->sl, xd->left_partition_context + sb_row_off,
+         sizeof(xd->left_partition_context[0]) * blk_h);
+
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * blk_w);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * blk_h);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+}
+
+// Assigns block sizes inside a superblock that is only partly covered by the
+// mi_rows_remaining x mi_cols_remaining area. Walks the superblock in steps
+// of (bh, bw); at each visited position (r, c) it points the grid entry in
+// 'mib' at the matching entry of 'mi' and sets its bsize to the result of
+// find_partition_size() for the area remaining from that position.
+//
+// 'mi' and 'mib' are indexed relative to the superblock origin: (r, c) are
+// offsets inside the superblock, not frame coordinates.
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+ MB_MODE_INFO *mi, int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MB_MODE_INFO **mib) {
+ // bh is initialised once and carried across rows; bw is reset to bw_in at
+ // the start of every row. Both are passed by address to
+ // find_partition_size(), which presumably rewrites them to the dimensions
+ // of the size it picked - TODO confirm against its definition.
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < cm->seq_params->mib_size; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < cm->seq_params->mib_size; c += bw) {
+ const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+ const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+ mib[grid_index] = mi + mi_index;
+ mib[grid_index]->bsize = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// Sets every mode info entry of the superblock at (mi_row, mi_col) to the
+// same partition size 'bsize'.
+// If the superblock extends past the bottom or right end of the tile, the
+// requested size may not fit; in that case the work is delegated to
+// set_partial_sb_partition(), which chooses sizes that fit the remaining
+// area.
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
+  MB_MODE_INFO *const sb_mi_base =
+      mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  const int blk_mi_h = mi_size_high[bsize];
+  const int blk_mi_w = mi_size_wide[bsize];
+  const int sb_mi_size = cm->seq_params->mib_size;
+
+  assert(bsize >= mi_params->mi_alloc_bsize &&
+         "Attempted to use bsize < mi_params->mi_alloc_bsize");
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+  // Partial superblock: the requested size cannot be used everywhere.
+  if (mi_cols_remaining < sb_mi_size || mi_rows_remaining < sb_mi_size) {
+    set_partial_sb_partition(cm, sb_mi_base, blk_mi_h, blk_mi_w,
+                             mi_rows_remaining, mi_cols_remaining, bsize, mib);
+    return;
+  }
+
+  // The superblock is entirely "in image": tile it with 'bsize' blocks.
+  for (int sb_row = 0; sb_row < sb_mi_size; sb_row += blk_mi_h) {
+    for (int sb_col = 0; sb_col < sb_mi_size; sb_col += blk_mi_w) {
+      MB_MODE_INFO **const grid_slot =
+          &mib[get_mi_grid_idx(mi_params, sb_row, sb_col)];
+      *grid_slot = sb_mi_base + get_alloc_mi_idx(mi_params, sb_row, sb_col);
+      (*grid_slot)->bsize = bsize;
+    }
+  }
+}
+
+// Returns 1 when all four quadrants of the 'bsize' block at (mi_row, mi_col)
+// start inside the frame and none of them is partitioned further, i.e. for
+// each quadrant either get_partition() reports PARTITION_NONE or the quadrant
+// size is BLOCK_8X8. Returns 0 otherwise. 'bsize' must be at least BLOCK_8X8.
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  const int half_mi = mi_size_wide[bsize] / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int quad = 0; quad < 4; ++quad) {
+    // Quadrants are visited in raster order: bit 0 selects the column half,
+    // bit 1 the row half.
+    const int quad_row = mi_row + (quad >> 1) * half_mi;
+    const int quad_col = mi_col + (quad & 1) * half_mi;
+
+    if (quad_row >= cm->mi_params.mi_rows) return 0;
+    if (quad_col >= cm->mi_params.mi_cols) return 0;
+
+    const int is_split_further =
+        get_partition(cm, quad_row, quad_col, subsize) != PARTITION_NONE &&
+        subsize != BLOCK_8X8;
+    if (is_split_further) return 0;
+  }
+  return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Derives an rdmult for the block of size 'bsize' at (mi_row, mi_col) from
+// the TPL statistics of the current frame.
+//
+// Sums, over the TPL blocks covered by the block, the intra cost
+// (recrf_dist << RDDIV_BITS) and the motion-compensated dependency cost
+// (intra cost plus RDCOST of mc_dep_rate / mc_dep_dist). With
+// rk = intra_cost / mc_dep_cost and beta = cpi->rd.r0 / rk (beta stays 1.0
+// unless both sums are positive), the result is
+// av1_get_adaptive_rdmult(cpi, beta) clamped to
+// [orig_rdmult / 2, orig_rdmult * 3 / 2] and to a minimum of 1.
+//
+// Returns 'orig_rdmult' unchanged when the TPL stats are not ready or the
+// frame is not TPL eligible.
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult) {
+ AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ int64_t intra_cost = 0;
+ int64_t mc_dep_cost = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+
+#ifndef NDEBUG
+ // Debug-only count of visited TPL blocks, checked after the loop.
+ int mi_count = 0;
+#endif
+ // Columns are walked in the superres-upscaled domain; rows are not scaled.
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ // Skip positions outside the (upscaled) frame.
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+ mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ // beta keeps its neutral value unless both accumulated costs are positive,
+ // which also protects the division below.
+ double beta = 1.0;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ const double rk = (double)intra_cost / mc_dep_cost;
+ beta = (r0 / rk);
+ }
+
+ int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+ // Limit the adjustment to between 50% and 150% of the original rdmult.
+ rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+ rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+ rdmult = AOMMAX(1, rdmult);
+
+ return rdmult;
+}
+
+// Reports whether a horizontal edge of the active image area lies within the
+// mi rows [mi_row, mi_row + mi_step). Normally the edges are the first row
+// and the row just past the last one; in the two-pass stats consumption stage
+// they are moved inwards by the inactive zone recorded in the first-pass
+// stats (formatting bars embedded in the stream).
+//
+// Returns 1 if the top or bottom edge is in range, 0 otherwise.
+// NOTE(review): when the first-pass stats for the frame are unavailable this
+// returns AOM_CODEC_ERROR, whose value is not visible here; callers that
+// treat the result as a boolean should be checked.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const FIRSTPASS_STATS *const frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cpi->common.current_frame.display_order_hint);
+    if (frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units, hence the
+    // factor of 4. The image edge is in the following MB row.
+    const int inactive_mi_rows = (int)(frame_stats->inactive_zone_rows * 4);
+    top_edge += inactive_mi_rows;
+    bottom_edge -= inactive_mi_rows;
+    // Never let the bottom edge cross above the top edge.
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  const int mi_row_end = mi_row + mi_step;
+  const int top_in_range = (top_edge >= mi_row) && (top_edge < mi_row_end);
+  const int bottom_in_range =
+      (bottom_edge >= mi_row) && (bottom_edge < mi_row_end);
+  return top_in_range || bottom_in_range;
+}
+
+// Reports whether a vertical edge of the active image area lies within the
+// mi columns [mi_col, mi_col + mi_step). Normally the edges are the first
+// column and the column just past the last one; in the two-pass stats
+// consumption stage they are moved inwards by the inactive zone recorded in
+// the first-pass stats (formatting bars embedded in the stream).
+//
+// Returns 1 if the left or right edge is in range, 0 otherwise.
+// NOTE(review): when the first-pass stats for the frame are unavailable this
+// returns AOM_CODEC_ERROR, whose value is not visible here; callers that
+// treat the result as a boolean should be checked.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+
+  // For two pass account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const FIRSTPASS_STATS *const frame_stats = read_one_frame_stats(
+        &cpi->ppi->twopass, cpi->common.current_frame.display_order_hint);
+    if (frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units, hence the
+    // factor of 4. The image edge is in the following MB column.
+    const int inactive_mi_cols = (int)(frame_stats->inactive_zone_cols * 4);
+    left_edge += inactive_mi_cols;
+    right_edge -= inactive_mi_cols;
+    // Never let the right edge cross to the left of the left edge.
+    right_edge = AOMMAX(left_edge, right_edge);
+  }
+
+  const int mi_col_end = mi_col + mi_step;
+  const int left_in_range = (left_edge >= mi_col) && (left_edge < mi_col_end);
+  const int right_in_range =
+      (right_edge >= mi_col) && (right_edge < mi_col_end);
+  return left_in_range || right_in_range;
+}
+
+// Copies the TPL inter/intra costs and motion vectors that cover the bsize
+// block at (mi_row, mi_col) into sb_enc, one entry per TPL motion-estimation
+// unit. sb_enc->tpl_data_count stays 0 when the TPL model is disabled, the
+// frame is a key frame or an overlay, or the TPL stats are not ready.
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col, SuperBlockEnc *sb_enc) {
+  sb_enc->tpl_data_count = 0;
+
+  if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
+
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  const int is_overlay =
+      update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE;
+  if (is_overlay) return;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  const int frame_idx = cpi->gf_frame_index;
+  if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
+
+  AV1_COMMON *const cm = &cpi->common;
+  TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
+  const int tpl_stride = tpl_frame->stride;
+
+  // Column range of the block, converted to superres mi units.
+  const int col_begin_sr =
+      coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+  const int col_end_sr = coded_to_superres_mi(mi_col + mi_size_wide[bsize],
+                                              cm->superres_scale_denominator);
+  // mi_cols in the superres case.
+  const int frame_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+  const int row_end = mi_row + mi_size_high[bsize];
+
+  // TPL store unit size is not the same as the motion estimation unit size.
+  // Always step by the motion estimation size to avoid getting repetitive
+  // inter/intra cost.
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+  assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+  const int row_step = mi_size_high[tpl_bsize];
+  const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize],
+                                               cm->superres_scale_denominator);
+
+  // Stride is only based on SB size, and values are filled in for every
+  // motion estimation unit in the SB.
+  sb_enc->tpl_stride = (col_end_sr - col_begin_sr) / col_step_sr;
+
+  int valid_blocks = 0;
+  int slot = 0;
+  for (int row = mi_row; row < row_end; row += row_step) {
+    for (int col = col_begin_sr; col < col_end_sr; col += col_step_sr) {
+      assert(slot < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+      const int inside_frame =
+          row < cm->mi_params.mi_rows && col < frame_cols_sr;
+      if (inside_frame) {
+        const TplDepStats *const blk = &tpl_stats[av1_tpl_ptr_pos(
+            row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+        sb_enc->tpl_inter_cost[slot] = blk->inter_cost
+                                       << TPL_DEP_COST_SCALE_LOG2;
+        sb_enc->tpl_intra_cost[slot] = blk->intra_cost
+                                       << TPL_DEP_COST_SCALE_LOG2;
+        memcpy(sb_enc->tpl_mv[slot], blk->mv, sizeof(blk->mv));
+        ++valid_blocks;
+      } else {
+        // Partial SB: mark the entry so that no invalid values are used later.
+        sb_enc->tpl_inter_cost[slot] = INT64_MAX;
+        sb_enc->tpl_intra_cost[slot] = INT64_MAX;
+        for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+          sb_enc->tpl_mv[slot][ref].as_int = INVALID_MV;
+        }
+      }
+      ++slot;
+    }
+  }
+
+  assert(valid_blocks <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+  sb_enc->tpl_data_count = valid_blocks;
+}
+
+// Picks the qindex for the bsize block at (mi_row, mi_col): base_qindex plus a delta-q offset derived from the TPL stats (intra_cost vs. mc_dep_cost) of the frame at cpi->gf_frame_index.
+// Returns base_qindex unchanged when tpl_idx >= MAX_TPL_FRAME_IDX, the TPL frame is not valid, or either accumulated cost is not positive.
+// When delta_dist is non-NULL it additionally receives a cost delta computed from the source-reference distortion/rate rescaled to the adjusted q step; td->mb.rb is updated as a side effect.
+// NOTE(review): older notes here listed "analysis_type" 0/1/2 modes, but this function has no such parameter; only the mc_dep_cost / intra_cost variant (type 0) is implemented below.
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ double intra_cost = 0;
+ double mc_dep_reg = 0;
+ double mc_dep_cost = 0;
+ double cbcmp_base = 1;  // sum of the per-block weights; starts at 1, so the divisions below never see 0
+ double srcrf_dist = 0;
+ double srcrf_sse = 0;
+ double srcrf_rate = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int base_qindex = cm->quant_params.base_qindex;
+
+ if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex;
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+ if (!tpl_frame->is_valid) return base_qindex;
+
+#ifndef NDEBUG
+ int mi_count = 0;
+#endif
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;  // skip TPL units outside the frame
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ double cbcmp = (double)this_stats->srcrf_dist;  // weight applied to this block's log terms below
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost += log(dist_scaled) * cbcmp;
+ mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp;
+ mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
+ srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
+ srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ cbcmp_base += cbcmp;
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ int offset = 0;
+ double beta = 1.0;
+ double rk;  // only assigned on the path that does not return early below
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ rk = exp((intra_cost - mc_dep_cost) / cbcmp_base);  // exp of the cbcmp-weighted mean log difference
+ td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base);
+ beta = (r0 / rk);
+ assert(beta > 0.0);
+ } else {
+ return base_qindex;
+ }
+ offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);  // limit offset to +/-(delta_q_res * 9 - 1)
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth);
+ int sbs_qstep =
+ av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth);
+
+ if (delta_dist) {
+ double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0);  // distortion scaled by the squared q-step ratio
+ double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep);  // rate scaled by the inverse q-step ratio
+ sbs_dist = AOMMIN(sbs_dist, srcrf_sse);
+ *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0);
+ }
+ return qindex;
+}
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// Offset table defined in Table 3 of the ITU-T H.Sup15 document: level i covers block luma averages in [hdr_thres[i], hdr_thres[i + 1]) and maps to hdr10_qp_offset[i].
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567,  // HDR_QP_LEVELS + 1 interval boundaries
+ 634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1,  // one QP offset per level, before QP_SCALE_FACTOR is applied
+ -2, -3, -4, -5, -6 };
+#endif
+
+// Returns the qindex to use for the bsize block at (mi_row, mi_col) of a
+// 10-bit HDR source. The block's log luma average selects a level in
+// hdr_thres[], whose hdr10_qp_offset[] entry (scaled by QP_SCALE_FACTOR) is
+// clamped and added to base_qindex. When DISABLE_HDR_LUMA_DELTAQ is set the
+// function simply returns base_qindex.
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+  (void)x;
+  (void)bsize;
+  (void)mi_row;
+  (void)mi_col;
+  return cm->quant_params.base_qindex;
+#else
+  // Average of the pixel block.
+  const int luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+
+  // Find the level whose [hdr_thres[level], hdr_thres[level + 1]) interval
+  // holds the average; the offset stays 0 when no interval matches.
+  int offset = 0;
+  for (int level = 0; level < HDR_QP_LEVELS; ++level) {
+    if (luma_avg < hdr_thres[level] || luma_avg >= hdr_thres[level + 1])
+      continue;
+    offset = (int)(hdr10_qp_offset[level] * QP_SCALE_FACTOR);
+    break;
+  }
+
+  const DeltaQInfo *const dq = &cm->delta_q_info;
+  offset = AOMMIN(offset, dq->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -dq->delta_q_res * 9 + 1);
+
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+  return qindex;
+#endif
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Recursively sets the partitioning of sms_tree and of all its split children
+// to PARTITION_NONE. A NULL tree is ignored; recursion stops below BLOCK_8X8.
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                            BLOCK_SIZE bsize) {
+  if (sms_tree == NULL) return;
+
+  sms_tree->partitioning = PARTITION_NONE;
+  if (bsize < BLOCK_8X8) return;
+
+  const BLOCK_SIZE child_bsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  for (int child = 0; child < 4; ++child) {
+    av1_reset_simple_motion_tree_partition(sms_tree->split[child], child_bsize);
+  }
+}
+
+// Records ref_type as picked for every mi position covered by the square
+// block bsize at (mi_row, mi_col). Positions are taken relative to the
+// superblock (mib_size mi units per side) and the mask is indexed with a fixed
+// row pitch of 32 entries.
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+                                       BLOCK_SIZE bsize, int mib_size,
+                                       int mi_row, int mi_col) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  const int blk_mi = mi_size_wide[bsize];
+  const int row_begin = mi_row & (mib_size - 1);
+  const int col_begin = mi_col & (mib_size - 1);
+  const int ref_bit = 1 << ref_type;
+
+  for (int r = row_begin; r < row_begin + blk_mi; ++r) {
+    const int row_base = r * 32;
+    for (int c = col_begin; c < col_begin + blk_mi; ++c) {
+      x->picked_ref_frames_mask[row_base + c] |= ref_bit;
+    }
+  }
+}
+
+// Replaces each of the num_cdfs CDFs in cdf_ptr_left with the rounded weighted
+// average of itself (weight wt_left) and the matching CDF in cdf_ptr_tr
+// (weight wt_tr). Consecutive CDFs are cdf_stride entries apart and entries
+// 0..nsymbs (inclusive) of each one are averaged.
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+                           int num_cdfs, int cdf_stride, int nsymbs,
+                           int wt_left, int wt_tr) {
+  const int wt_total = wt_left + wt_tr;
+  const int rounding = wt_total / 2;
+
+  for (int cdf = 0; cdf < num_cdfs; ++cdf) {
+    aom_cdf_prob *const left = cdf_ptr_left + cdf * cdf_stride;
+    const aom_cdf_prob *const tr = cdf_ptr_tr + cdf * cdf_stride;
+    for (int s = 0; s <= nsymbs; ++s) {
+      const int blended =
+          ((int)left[s] * wt_left + (int)tr[s] * wt_tr + rounding) / wt_total;
+      left[s] = (aom_cdf_prob)blended;
+      assert(left[s] >= 0 && left[s] < CDF_PROB_TOP);
+    }
+  }
+}
+
+// Averages every CDF of the array cname_left with the matching CDF of
+// cname_tr, for CDFs that hold nsymbs symbols and are CDF_SIZE(nsymbs)
+// entries apart. See AVG_CDF_STRIDE for the requirements on the call site.
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+  AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+// Same as AVERAGE_CDF but with an explicit distance (cdf_stride, in entries)
+// between consecutive CDFs, for arrays whose rows are wider than the number of
+// symbols actually used.
+// - cname_left must be an array expression (not a pointer): the number of
+//   CDFs is derived from sizeof(cname_left).
+// - The variables wt_left and wt_tr must be in scope at the call site.
+// Macro parameters are parenthesized where they are used as operands and the
+// element count is cast as a whole, so the expansion does not depend on the
+// precedence of the argument expressions.
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride)         \
+  do {                                                                   \
+    aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)(cname_left);           \
+    aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)(cname_tr);               \
+    int array_size = (int)(sizeof(cname_left) / sizeof(aom_cdf_prob));   \
+    int num_cdfs = array_size / (cdf_stride);                            \
+    avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, (cdf_stride),     \
+                   (nsymbs), wt_left, wt_tr);                            \
+  } while (0)
+
+// Averages all CDFs of the motion vector context nmv_tr into nmv_left using
+// the weights wt_left and wt_tr (both are picked up by the AVERAGE_CDF macro
+// by name).
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+                    int wt_tr) {
+  AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+
+  // Indices 0 and 1 of comps[] are processed identically.
+  for (int comp = 0; comp < 2; ++comp) {
+    AVERAGE_CDF(nmv_left->comps[comp].classes_cdf,
+                nmv_tr->comps[comp].classes_cdf, MV_CLASSES);
+    AVERAGE_CDF(nmv_left->comps[comp].class0_fp_cdf,
+                nmv_tr->comps[comp].class0_fp_cdf, MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[comp].fp_cdf, nmv_tr->comps[comp].fp_cdf,
+                MV_FP_SIZE);
+    AVERAGE_CDF(nmv_left->comps[comp].sign_cdf, nmv_tr->comps[comp].sign_cdf,
+                2);
+    AVERAGE_CDF(nmv_left->comps[comp].class0_hp_cdf,
+                nmv_tr->comps[comp].class0_hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[comp].hp_cdf, nmv_tr->comps[comp].hp_cdf, 2);
+    AVERAGE_CDF(nmv_left->comps[comp].class0_cdf,
+                nmv_tr->comps[comp].class0_cdf, CLASS0_SIZE);
+    AVERAGE_CDF(nmv_left->comps[comp].bits_cdf, nmv_tr->comps[comp].bits_cdf,
+                2);
+  }
+}
+
+// With row-based multi-threading the encoder always keeps a top-right sync,
+// so the top-right SB's CDFs can be averaged with the left SB's CDFs and the
+// result used for the current SB's encoding to improve the performance. This
+// function performs that weighted average (weights wt_left and wt_tr) and
+// writes the result into ctx_left. It is used only when row-mt is enabled.
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr) {
+ AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);  // last argument of each call is the symbol count of that CDF
+ AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+ AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+ AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+ AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+ AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+ AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+ AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+ AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+ ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+ MASKED_COMPOUND_TYPES);
+ AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+ AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+ INTERINTRA_MODES);
+ AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+ AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+ PALETTE_SIZES);
+ AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+ PALETTE_SIZES);
+ for (int j = 0; j < PALETTE_SIZES; j++) {  // entry j uses j + PALETTE_MIN_SIZE symbols while rows stay CDF_SIZE(PALETTE_COLORS) apart
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+ ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+ ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+ AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+ AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+ AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+ AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+ AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+ AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2);
+ AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+ avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+ avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+ AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+ ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+ AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+ AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+ AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],  // index 0 uses one symbol fewer than index 1 -- presumably the set without CfL, TODO confirm
+ UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+ AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {  // symbol count depends on the context: 4 for i < 4, 10 for i < 16, 8 otherwise; rows are always CDF_SIZE(10) apart
+ if (i < 4) {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+ CDF_SIZE(10));
+ } else if (i < 16) {
+ AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+ } else {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+ CDF_SIZE(10));
+ }
+ }
+ AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+ SWITCHABLE_FILTERS);
+ AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+ AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+ AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,  // index 0 uses MAX_TX_DEPTH symbols, indices 1..3 use MAX_TX_DEPTH + 1
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+ AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+ DELTA_LF_PROBS + 1);
+ }
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,  // ext-tx sets differ in symbol count but share a row width of CDF_SIZE(TX_TYPES); index 0 is not averaged here
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+ CDF_SIZE(TX_TYPES));
+ AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+ CFL_ALPHABET_SIZE);
+}
+
+// Checks the motion of the above and left neighbors of the block at mi.
+// Returns 1 when neither neighbor is a block whose mode is >= INTRA_MODE_END
+// and whose first motion vector exceeds the threshold (24) in either
+// component, and 0 otherwise. A neighbor that would lie outside the tile is
+// treated as low motion.
+static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride,
+                                 const TileInfo *const tile_info, int mi_row,
+                                 int mi_col) {
+  const int mv_thresh = 24;
+
+  // Above block.
+  if (mi_row > tile_info->mi_row_start) {
+    const MB_MODE_INFO *const above = mi[-mi_stride];
+    const int_mv mv = above->mv[0];
+    const int is_large_mv =
+        abs(mv.as_mv.row) > mv_thresh || abs(mv.as_mv.col) > mv_thresh;
+    if (above->mode >= INTRA_MODE_END && is_large_mv) return 0;
+  }
+
+  // Left block.
+  if (mi_col > tile_info->mi_col_start) {
+    const MB_MODE_INFO *const left = mi[-1];
+    const int_mv mv = left->mv[0];
+    const int is_large_mv =
+        abs(mv.as_mv.row) > mv_thresh || abs(mv.as_mv.col) > mv_thresh;
+    if (left->mode >= INTRA_MODE_END && is_large_mv) return 0;
+  }
+
+  return 1;
+}
+
+// Fast check of this superblock's motion. Compares the SAD against the
+// collocated block of the last source, scaled by 5/8, with the SADs at the
+// four positions one pixel away (above, left, right, below). Returns 1 when
+// the scaled collocated SAD is strictly smaller than all four, i.e. the block
+// looks stationary, and 0 otherwise.
+static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y,
+                                       int src_ystride,
+                                       const uint8_t *last_src_y,
+                                       int last_src_ystride, int mi_row,
+                                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_bsize = cm->seq_params->sb_size;
+  unsigned int zero_mv_sad;
+
+  if (cpi->src_sad_blk_64x64 == NULL) {
+    zero_mv_sad = cpi->ppi->fn_ptr[sb_bsize].sdf(src_y, src_ystride,
+                                                 last_src_y, last_src_ystride);
+  } else {
+    // Reuse the stored SAD; for 128x128 superblocks the stored grid uses half
+    // the superblock size per entry.
+    int mi_per_entry = cm->seq_params->mib_size;
+    if (sb_bsize == BLOCK_128X128) mi_per_entry >>= 1;
+    const int entry_cols =
+        (cm->mi_params.mi_cols + mi_per_entry - 1) / mi_per_entry;
+    const int entry_col = mi_col / mi_per_entry;
+    const int entry_row = mi_row / mi_per_entry;
+    zero_mv_sad =
+        (unsigned int)cpi->src_sad_blk_64x64[entry_col + entry_row * entry_cols];
+  }
+
+  // Search 4 1-away points.
+  const uint8_t *const neighbors[4] = {
+    last_src_y - last_src_ystride,
+    last_src_y - 1,
+    last_src_y + 1,
+    last_src_y + last_src_ystride,
+  };
+  unsigned int neighbor_sad[4];
+  cpi->ppi->fn_ptr[sb_bsize].sdx4df(src_y, src_ystride, neighbors,
+                                    last_src_ystride, neighbor_sad);
+
+  const unsigned int scaled_sad = (zero_mv_sad * 5) >> 3;
+  for (int k = 0; k < 4; ++k) {
+    if (scaled_sad >= neighbor_sad[k]) return 0;
+  }
+  return 1;
+}
+
+// Grade the temporal variation of the source by comparing the current sb and its collocated block in the last frame; the grades are stored in x->content_state_sb.
+// When the rtc temporal filter is enabled and the block looks static, the current source sb is additionally averaged in place with the last source.
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ if (cpi->last_source->y_width != cpi->source->y_width ||  // nothing is graded when the two sources differ in size
+ cpi->last_source->y_height != cpi->source->y_height)
+ return;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;  // high-bitdepth buffers are not analysed here
+#endif
+
+ unsigned int tmp_sse;
+ unsigned int tmp_variance;
+ const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+ uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2);  // mi position converted to luma pixels (<< 2)
+ uint8_t *last_src_y = cpi->last_source->y_buffer;
+ const int last_src_ystride = cpi->last_source->y_stride;
+ const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64)
+ uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64)
+ 36000 }; // ~3*3*(64*64)
+
+ uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64)
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {  // doubles the nonrd thresholds; low[1] (the rd threshold) is left unchanged
+ avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1;
+ avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1;
+ avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1;
+ }
+ uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64*64)) ~1.5
+ src_y += src_offset;
+ last_src_y += last_src_offset;
+ tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &tmp_sse);
+ // rd thresholds
+ if (tmp_sse < avg_source_sse_threshold_low[1])
+ x->content_state_sb.source_sad_rd = kLowSad;
+
+ // nonrd thresholds
+ if (tmp_sse == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return;  // identical blocks: none of the later checks or the filter run
+ }
+ if (tmp_sse < avg_source_sse_threshold_verylow)
+ x->content_state_sb.source_sad_nonrd = kVeryLowSad;
+ else if (tmp_sse < avg_source_sse_threshold_low[0])
+ x->content_state_sb.source_sad_nonrd = kLowSad;
+ else if (tmp_sse > avg_source_sse_threshold_high)
+ x->content_state_sb.source_sad_nonrd = kHighSad;
+
+ // Detect large lighting change.
+ // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+ if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+ x->content_state_sb.lighting_change = 1;
+ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+ x->content_state_sb.low_sumdiff = 1;
+
+ if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad ||  // everything below is the optional rtc temporal filter
+ cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1)
+ return;
+
+ // In-place temporal filter. If psnr calculation is enabled, we store the
+ // source for that.
+ AV1_COMMON *const cm = &cpi->common;
+ // Calculate n*mean^2
+ const unsigned int nmean2 = tmp_sse - tmp_variance;
+ const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME],
+ 0, cm->seq_params->bit_depth);
+
+ const unsigned int threshold =  // variance limit for filtering, scaled by the current ac q step
+ (cpi->sf.rt_sf.use_rtc_tf == 1)
+ ? (clamp(avg_q_step, 250, 1000)) * ac_q_step
+ : 250 * ac_q_step;
+
+ // TODO(yunqing): use a weighted sum instead of averaging in filtering.
+ if (tmp_variance <= threshold && nmean2 <= 15) {
+ // Check neighbor blocks. If neighbor blocks aren't low-motion blocks,
+ // skip temporal filtering for this block.
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ const int is_neighbor_blocks_low_motion = check_neighbor_blocks(
+ mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col);
+ if (!is_neighbor_blocks_low_motion) return;
+
+ // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB
+ // size.
+ // Test several nearby points. If non-zero mv exists, don't do temporal
+ // filtering.
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion(
+ cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col);
+
+ if (!is_this_blk_low_motion) return;
+
+ const int shift_x[2] = { 0, cpi->source->subsampling_x };  // index 0: luma, index 1: chroma
+ const int shift_y[2] = { 0, cpi->source->subsampling_y };
+ const uint8_t h = block_size_high[bsize];
+ const uint8_t w = block_size_wide[bsize];
+
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ uint8_t *src = cpi->source->buffers[plane];
+ const int src_stride = cpi->source->strides[plane != 0];
+ uint8_t *last_src = cpi->last_source->buffers[plane];
+ const int last_src_stride = cpi->last_source->strides[plane != 0];
+ src += src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+ last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+
+ for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) {
+ for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) {
+ src[j] = (last_src[j] + src[j]) >> 1;  // overwrite the current source with the (truncating) average of both frames
+ }
+ src += src_stride;
+ last_src += last_src_stride;
+ }
+ }
+ }
+}
+
+// Zeroes the mode info of the superblock at (mi_row, mi_col): its entries in
+// mi_grid_base and tx_type_map for every mi row, and its entries in mi_alloc
+// for every row that starts an allocation unit.
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+                    int mi_row, int mi_col) {
+  // Superblock dimensions in units of mi (BLOCK_4X4).
+  const int sb_mi_wide = mi_size_wide[sb_size];
+  const int sb_mi_high = mi_size_high[sb_size];
+  // Side of one allocated mi unit, in mi.
+  const int alloc_unit_mi = mi_size_wide[mi_params->mi_alloc_bsize];
+  // Superblock width in units of the allocated mi size.
+  const int sb_alloc_wide = mi_size_wide[sb_size] / alloc_unit_mi;
+  assert(mi_params->mi_alloc_stride % sb_alloc_wide == 0 &&
+         "mi is not allocated as a multiple of sb!");
+  assert(mi_params->mi_stride % sb_mi_wide == 0 &&
+         "mi_grid_base is not allocated as a multiple of sb!");
+
+  for (int r = 0; r < sb_mi_high; ++r) {
+    const int cur_row = mi_row + r;
+    assert(get_mi_grid_idx(mi_params, 0, mi_col + alloc_unit_mi) <
+           mi_params->mi_stride);
+    const int grid_idx = get_mi_grid_idx(mi_params, cur_row, mi_col);
+    const int alloc_idx = get_alloc_mi_idx(mi_params, cur_row, mi_col);
+
+    memset(&mi_params->mi_grid_base[grid_idx], 0,
+           sb_mi_wide * sizeof(*mi_params->mi_grid_base));
+    memset(&mi_params->tx_type_map[grid_idx], 0,
+           sb_mi_wide * sizeof(*mi_params->tx_type_map));
+
+    // mi_alloc is coarser than the grid: clear it once per allocation row.
+    if (r % alloc_unit_mi != 0) continue;
+    memset(&mi_params->mi_alloc[alloc_idx], 0,
+           sb_alloc_wide * sizeof(*mi_params->mi_alloc));
+  }
+}
+
+// Saves into sb_fp_stats the encoder state that av1_restore_sb_state() puts
+// back: the entropy/txfm context of the superblock at (mi_row, mi_col), the
+// rd and txb split counters, the frame counts, the inter mode rd models (only
+// when inter_mode_rd_model_estimation == 1), thresh_freq_fact and the current
+// qindex of the superblock's first mi.
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+                         ThreadData *td, const TileDataEnc *tile_data,
+                         int mi_row, int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Point the txfm contexts at this superblock before saving the context.
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_data->tile_info.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col,
+                   cm->seq_params->sb_size, av1_num_planes(cm));
+
+  sb_fp_stats->rd_count = td->rd_counts;
+  sb_fp_stats->split_count = x->txfm_search_info.txb_split_count;
+  sb_fp_stats->fc = *td->counts;
+
+  // Don't copy in row_mt case, otherwise run into data race. No behavior
+  // change in row_mt case.
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+    memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+           sizeof(sb_fp_stats->inter_mode_rd_models));
+  }
+
+  memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int first_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  sb_fp_stats->current_qindex =
+      cm->mi_params.mi_alloc[first_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+// Restores the encoder state captured in sb_fp_stats by
+// av1_backup_sb_state(): the superblock context, the rd and txb split
+// counters, the frame counts, the inter mode rd models (only when
+// inter_mode_rd_model_estimation == 1), thresh_freq_fact and the current
+// qindex of the superblock's first mi.
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+                          ThreadData *td, TileDataEnc *tile_data, int mi_row,
+                          int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+
+  av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col,
+                      cm->seq_params->sb_size, av1_num_planes(cm));
+
+  td->rd_counts = sb_fp_stats->rd_count;
+  x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+  *td->counts = sb_fp_stats->fc;
+
+  if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+    memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+           sizeof(sb_fp_stats->inter_mode_rd_models));
+  }
+
+  memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+         sizeof(sb_fp_stats->thresh_freq_fact));
+
+  const int first_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+  cm->mi_params.mi_alloc[first_mi_idx].current_qindex =
+      sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+  memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+         sizeof(sb_fp_stats->mode_chosen_counts));
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+/*! Checks whether to skip updating the entropy cost based on tile info.
+ *
+ * Common code used to decide whether the cost update of coeff, mode, mv and
+ * dv symbols is skipped for the superblock at (mi_row, mi_col).
+ * Returns 1 when the update should be skipped and 0 when it should be done.
+ */
+static int skip_cost_update(const SequenceHeader *seq_params,
+                            const TileInfo *const tile_info, const int mi_row,
+                            const int mi_col,
+                            INTERNAL_COST_UPDATE_TYPE upd_level) {
+  if (upd_level == INTERNAL_COST_UPD_OFF) return 1;
+  if (upd_level == INTERNAL_COST_UPD_SB) return 0;
+
+  // The remaining levels update at most once per sb row in a tile, i.e. only
+  // at the first column of the tile.
+  if (mi_col != tile_info->mi_col_start) return 1;
+  if (upd_level != INTERNAL_COST_UPD_SBROW_SET) return 0;
+
+  // INTERNAL_COST_UPD_SBROW_SET: the cost update nominally happens once per
+  // 2 sb rows (sb size 128) or 4 sb rows (sb size 64). As that would not be
+  // equally spaced in smaller resolutions, the number of sb rows between two
+  // updates is recomputed from the tile height.
+  const int sb_pixels = seq_params->mib_size * MI_SIZE;
+  const int tile_height =
+      (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+  const int nominal_sb_rows = (sb_pixels != MAX_SB_SIZE) ? 4 : 2;
+  const int nominal_rows = sb_pixels * nominal_sb_rows;
+  // Both divisions below round up to the next integer.
+  const int updates_per_tile = (tile_height + nominal_rows - 1) / nominal_rows;
+  const int update_rows = updates_per_tile * sb_pixels;
+  const int sb_rows_per_update = (tile_height + update_rows - 1) / update_rows;
+
+  const int sb_row =
+      (mi_row - tile_info->mi_row_start) >> seq_params->mib_size_log2;
+  return (sb_row % sb_rows_per_update) != 0;
+}
+
+// Returns 1 when the mv cost update can be skipped for the superblock at
+// (mi_row, mi_col), 0 otherwise.
+static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+                               const int mi_row, const int mi_col) {
+  const AV1_COMMON *const common = &cpi->common;
+  // The mv cdfs are not updated while encoding intra frames, so there is no
+  // new mv cost to compute for them.
+  return frame_is_intra_only(common)
+             ? 1
+             : skip_cost_update(common->seq_params, tile_info, mi_row, mi_col,
+                                cpi->sf.inter_sf.mv_cost_upd_level);
+}
+
+// Returns 1 when the dv cost update can be skipped for the superblock at
+// (mi_row, mi_col), 0 otherwise.
+static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+                               const int mi_row, const int mi_col) {
+  const AV1_COMMON *const common = &cpi->common;
+  // dv costs only matter when intrabc is allowed (intra frames only) and the
+  // encoder is not in its stat generation stage.
+  const int dv_costs_in_use =
+      av1_allow_intrabc(common) && !is_stat_generation_stage(cpi);
+  if (!dv_costs_in_use) return 1;
+
+  return skip_cost_update(common->seq_params, tile_info, mi_row, mi_col,
+                          cpi->sf.intra_sf.dv_cost_upd_level);
+}
+
+// Refreshes the coefficient, mode, mv and dv rate costs held in the thread's
+// macroblock, each one at the frequency selected by its own speed feature.
+// Nothing is refreshed when cdf updates are disabled for the frame.
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+                           const TileInfo *const tile_info, const int mi_row,
+                           const int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+
+  if (cm->features.disable_cdf_update) return;
+
+  // Coefficient costs.
+  switch (cpi->sf.inter_sf.coeff_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (!skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                            cpi->sf.inter_sf.coeff_cost_upd_level)) {
+        av1_fill_coeff_costs(&mb->coeff_costs, mbd->tile_ctx, num_planes);
+      }
+      break;
+    default: assert(0);
+  }
+
+  // Mode costs.
+  switch (cpi->sf.inter_sf.mode_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (!skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+                            cpi->sf.inter_sf.mode_cost_upd_level)) {
+        av1_fill_mode_rates(cm, &mb->mode_costs, mbd->tile_ctx);
+      }
+      break;
+    default: assert(0);
+  }
+
+  // Motion vector costs (never refreshed on intra-only frames).
+  switch (cpi->sf.inter_sf.mv_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (!skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) {
+        av1_fill_mv_costs(&mbd->tile_ctx->nmvc,
+                          cm->features.cur_frame_force_integer_mv,
+                          cm->features.allow_high_precision_mv, mb->mv_costs);
+      }
+      break;
+    default: assert(0);
+  }
+
+  // Displacement vector costs (only refreshed when intrabc is allowed).
+  switch (cpi->sf.intra_sf.dv_cost_upd_level) {
+    case INTERNAL_COST_UPD_OFF:
+    case INTERNAL_COST_UPD_TILE:  // Tile level
+      break;
+    case INTERNAL_COST_UPD_SBROW_SET:  // SB row set level in tile
+    case INTERNAL_COST_UPD_SBROW:      // SB row level in tile
+    case INTERNAL_COST_UPD_SB:         // SB level
+      if (!skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) {
+        av1_fill_dv_costs(&mbd->tile_ctx->ndvc, mb->dv_costs);
+      }
+      break;
+    default: assert(0);
+  }
+}
+
+// Frees the src_diff buffer of the first num_planes planes of mb and resets
+// each pointer to NULL.
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) {
+  int p = 0;
+  while (p < num_planes) {
+    aom_free(mb->plane[p].src_diff);
+    mb->plane[p].src_diff = NULL;
+    ++p;
+  }
+}
+
+// Allocates a 32-byte aligned src_diff buffer for every plane of mb. Plane 0
+// gets MAX_SB_SQUARE entries; the other planes get that count shifted right
+// by the sum of the two chroma subsampling values. The buffers are expected
+// to be unallocated on entry (asserted in debug builds).
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) {
+  const int plane_count = av1_num_planes(cm);
+#ifndef NDEBUG
+  for (int p = 0; p < plane_count; ++p) {
+    assert(!mb->plane[p].src_diff);
+  }
+#endif
+  for (int p = 0; p < plane_count; ++p) {
+    int shift = 0;
+    if (p != 0) {
+      shift = cm->seq_params->subsampling_x + cm->seq_params->subsampling_y;
+    }
+    const int num_entries = MAX_SB_SQUARE >> shift;
+    CHECK_MEM_ERROR(cm, mb->plane[p].src_diff,
+                    (int16_t *)aom_memalign(
+                        32, sizeof(*mb->plane[p].src_diff) * num_entries));
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.h b/third_party/aom/av1/encoder/encodeframe_utils.h
new file mode 100644
index 0000000000..14c71b8802
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.h
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Compile-time switch, off (0) by default.
+// NOTE(review): presumably enables dumping of partition-search features to a
+// file -- confirm at the use sites.
+#define WRITE_FEATURE_TO_FILE 0
+
+// Sizes tied to the models named in each macro.
+// NOTE(review): presumably feature-vector lengths (and, for the last one, a
+// class count) -- confirm against the code that uses them.
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+// Single-bit flags (bits 0, 1 and 2) that the composite masks below combine.
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+// Mask with the NONE, SPLIT and RECT flags all set.
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+// Mask with only the NONE and SPLIT flags set.
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4-way partition types. NUM_PART4_TYPES is the number of entries before it.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB partition types. NUM_AB_PARTS is the number of entries before it.
+enum {
+ HORZ_A = 0,
+ HORZ_B,
+ VERT_A,
+ VERT_B,
+ NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular partition types. NUM_RECT_PARTS is the number of entries before
+// it and is used in this header as the size of the per-direction arrays.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+// rect_part_win has one entry per rectangular partition type (NUM_RECT_PARTS),
+// i.e. it is indexed by HORZ / VERT.
+typedef struct {
+ int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
+// Mode selection flavours.
+// NOTE(review): presumably RD-based versus non-RD mode picking -- confirm at
+// the use sites.
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+// Kind of pass being run in superblock-level multi-pass encoding.
+enum {
+ SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally
+ SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs
+ SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+// Storage for entropy, partition and transform contexts. This is the context
+// type taken by av1_save_context() and av1_restore_context() declared in this
+// header.
+// NOTE(review): the a/l, sa/sl and ta/tl pairs presumably hold the above and
+// left contexts respectively -- confirm against av1_save_context().
+typedef struct {
+ // Entropy contexts; sized for MAX_MIB_SIZE entries per plane.
+ ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+ // Partition contexts.
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+ // Transform context pointers.
+ // NOTE(review): presumably remember where ta/tl were copied from -- confirm.
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ // Transform contexts.
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass. It is the snapshot type handled by
+// av1_backup_sb_state() and av1_restore_sb_state().
+typedef struct SB_FIRST_PASS_STATS {
+ // Saved macroblock contexts.
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ // Saved RD counters.
+ RD_COUNTS rd_count;
+
+ // Saved value of the macroblock's txfm_search_info.txb_split_count.
+ int split_count;
+ // Saved copy of the thread's frame counts.
+ FRAME_COUNTS fc;
+ // Saved copy of the tile's inter mode RD models.
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ // Saved copy of the macroblock's thresh_freq_fact table.
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+ // Saved current_qindex of the mode info at the superblock position.
+ int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ // Saved copy of the encoder's mode_chosen_counts.
+ unsigned int mode_chosen_counts[MAX_MODES];
+#endif // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
+// This structure contains block size related
+// variables for use in rd_pick_partition(). It describes the position and
+// geometry of the block currently being partitioned.
+typedef struct {
+ // Half of block width to determine block edge.
+ int mi_step;
+
+ // Block row and column indices.
+ int mi_row;
+ int mi_col;
+
+ // Block edge row and column indices.
+ int mi_row_edge;
+ int mi_col_edge;
+
+ // Block width of current partition block.
+ int width;
+
+ // Block width of minimum partition size allowed.
+ int min_partition_size_1d;
+
+ // Flag to indicate if partition is 8x8 or higher size.
+ int bsize_at_least_8x8;
+
+ // Indicates edge blocks in frame.
+ // NOTE(review): presumably non-zero when the block's lower / right half
+ // still lies inside the frame -- confirm where these are set.
+ int has_rows;
+ int has_cols;
+
+ // Block size of current partition.
+ BLOCK_SIZE bsize;
+
+ // Size of current sub-partition.
+ BLOCK_SIZE subsize;
+
+ // Size of split partition.
+ BLOCK_SIZE split_bsize2;
+} PartitionBlkParams;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+// Partition search statistics, only compiled in when
+// CONFIG_COLLECT_PARTITION_STATS is enabled. Each array holds
+// EXT_PARTITION_TYPES entries.
+typedef struct PartitionTimingStats {
+ // Tracks the number of partition decisions made in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_decisions[EXT_PARTITION_TYPES];
+ // Tracks the number of partition blocks searched in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_attempts[EXT_PARTITION_TYPES];
+ // Tracks the time spent on each partition search in the current call to \ref
+ // av1_rd_pick_partition
+ int64_t partition_times[EXT_PARTITION_TYPES];
+ // Tracks the rdcost spent on each partition search in the current call to
+ // \ref av1_rd_pick_partition
+ int64_t partition_rdcost[EXT_PARTITION_TYPES];
+ // Timer used to time the partitions.
+ struct aom_usec_timer timer;
+ // Whether the timer is on
+ int timer_is_on;
+} PartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+// Structure holding state variables for partition search. The inline helpers
+// defined after it in this header (av1_disable_rect_partitions() and friends)
+// edit its "allowed" / "do_*" flags.
+typedef struct {
+ // Intra partitioning related info.
+ PartitionSearchInfo *intra_part_info;
+
+ // Parameters related to partition block size.
+ PartitionBlkParams part_blk_params;
+
+ // Win flags for HORZ and VERT partition evaluations, one set per
+ // sub-block of the split partition.
+ RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+ // RD cost for the current block of given partition type.
+ RD_STATS this_rdc;
+
+ // RD cost summed across all blocks of partition type.
+ RD_STATS sum_rdc;
+
+ // Array holding partition type cost.
+ int tmp_partition_cost[PARTITION_TYPES];
+
+ // Pointer to partition cost buffer
+ int *partition_cost;
+
+ // RD costs for different partition types.
+ int64_t none_rd;
+ int64_t split_rd[SUB_PARTITIONS_SPLIT];
+ // RD costs for rectangular partitions.
+ // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+ // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+ // Flags indicating if the corresponding partition was winner or not.
+ // Used to bypass similar blocks during AB partition evaluation.
+ int is_split_ctx_is_ready[2];
+ int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+ // If true, skips the rest of partition evaluation at the current bsize level.
+ int terminate_partition_search;
+
+ // If false, skips rdopt on PARTITION_NONE.
+ int partition_none_allowed;
+
+ // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+ // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
+ int partition_rect_allowed[NUM_RECT_PARTS];
+
+ // If false, skips searching rectangular partition unless some logic related
+ // to edge detection holds.
+ int do_rectangular_split;
+
+ // If false, skips searching PARTITION_SPLIT.
+ int do_square_split;
+
+ // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+ // this does not directly affect the extended partitions, so this can be used
+ // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+ // PARTITION_HORZ_AB4, etc.
+ int prune_rect_part[NUM_RECT_PARTS];
+
+ // Chroma subsampling in x and y directions.
+ int ss_x;
+ int ss_y;
+
+ // Partition plane context index.
+ int pl_ctx_idx;
+
+ // This flag will be set if best partition is found from the search.
+ bool found_best_partition;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Statistics of the partition search, see PartitionTimingStats.
+ PartitionTimingStats part_timing_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+} PartitionSearchState;
+
+// Turns off the search of PARTITION_SPLIT by clearing do_square_split.
+static AOM_INLINE void av1_disable_square_split_partition(
+    PartitionSearchState *part_state) {
+  part_state->do_square_split = 0;
+}
+
+// Turns off every rectangular split. PARTITION_AB4 is covered as well since
+// those partitions depend on the matching partition_rect_allowed entry.
+static AOM_INLINE void av1_disable_rect_partitions(
+    PartitionSearchState *part_state) {
+  part_state->do_rectangular_split = 0;
+  // Covers both directions: HORZ (0) and VERT (1).
+  for (int dir = 0; dir < NUM_RECT_PARTS; ++dir) {
+    part_state->partition_rect_allowed[dir] = 0;
+  }
+}
+
+// Turns off both the rectangular and the square splits, leaving
+// PARTITION_NONE as the only partition that *might* still be allowed.
+static AOM_INLINE void av1_disable_all_splits(
+    PartitionSearchState *part_state) {
+  av1_disable_rect_partitions(part_state);
+  av1_disable_square_split_partition(part_state);
+}
+
+// Restricts the search to PARTITION_SPLIT: rectangular partitions and
+// PARTITION_NONE are disabled while the square split is enabled.
+static AOM_INLINE void av1_set_square_split_only(
+    PartitionSearchState *part_state) {
+  av1_disable_rect_partitions(part_state);
+  part_state->do_square_split = 1;
+  part_state->partition_none_allowed = 0;
+}
+
+// Returns true when both has_rows and has_cols are set for the block.
+static AOM_INLINE bool av1_blk_has_rows_and_cols(
+    const PartitionBlkParams *blk_params) {
+  if (!blk_params->has_rows) return false;
+  return blk_params->has_cols != 0;
+}
+
+// Returns true when the block, starting at (mi_row, mi_col) and spanning
+// mi_size_high[bsize] x mi_size_wide[bsize] mi units, ends within the
+// mi_rows x mi_cols extent given by mi_params.
+static AOM_INLINE bool av1_is_whole_blk_in_frame(
+    const PartitionBlkParams *blk_params,
+    const CommonModeInfoParams *mi_params) {
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  const int row_end = blk_params->mi_row + mi_size_high[bsize];
+  const int col_end = blk_params->mi_col + mi_size_wide[bsize];
+  return row_end <= mi_params->mi_rows && col_end <= mi_params->mi_cols;
+}
+
+// Updates the switchable interpolation filter cdf of the tile context with the
+// filter(s) of mbmi. Only direction 0 is processed unless dual_filter is set,
+// in which case direction 1 is processed too.
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+                                              const MB_MODE_INFO *mbmi,
+                                              int dual_filter) {
+  const int num_dirs = dual_filter ? 2 : 1;
+  for (int dir = 0; dir < num_dirs; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+    update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+               SWITCHABLE_FILTERS);
+  }
+}
+
+// Computes the RD multiplier for the current frame. With a non-negative
+// segment_id the q index is the one of that segment; otherwise it is the base
+// q index adjusted by the macroblock's rdmult_delta_qindex and by
+// y_dc_delta_q.
+static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *const qparams = &cm->quant_params;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  int qindex;
+  if (segment_id < 0) {
+    qindex = qparams->base_qindex + x->rdmult_delta_qindex +
+             qparams->y_dc_delta_q;
+  } else {
+    qindex = av1_get_qindex(&cm->seg, segment_id, qparams->base_qindex);
+  }
+
+  const aom_bit_depth_t depth = cm->seq_params->bit_depth;
+  const FRAME_UPDATE_TYPE upd_type = gf_group->update_type[cpi->gf_frame_index];
+  const FRAME_TYPE frm_type = cm->current_frame.frame_type;
+  // Both indices are clamped: the boost index to 15 and the layer depth to 6.
+  const int boost_idx = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+  const int depth_idx = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+  return av1_compute_rd_mult(qindex, depth, upd_type, depth_idx, boost_idx,
+                             frm_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+                             is_stat_consumption_stage(cpi));
+}
+
+// Returns 1 for BLOCK_16X16 and BLOCK_32X32, 0 for every other block size.
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_16X16:
+    case BLOCK_32X32: return 1;
+    default: return 0;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Returns the first pass stats entry number frm of the stats buffer, or NULL
+// when frm is negative or the entry lies beyond stats_in_end.
+static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p,
+                                                              int frm) {
+  assert(frm >= 0);
+  if (frm < 0) return NULL;
+
+  const FIRSTPASS_STATS *const entry = p->stats_buf_ctx->stats_in_start + frm;
+  if (entry > p->stats_buf_ctx->stats_in_end) return NULL;
+
+  return entry;
+}
+
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult);
+
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step);
+
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, SuperBlockEnc *sb_enc);
+
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col);
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int orig_rdmult);
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult);
+
+#if CONFIG_SALIENCY_MAP
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+ int *errorperbit, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ int *const rdmult);
+#endif
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context);
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly);
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+ BLOCK_SIZE bsize);
+
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col);
+
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr);
+
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+ int mi_row, int mi_col);
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+ ThreadData *td, const TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+ ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col);
+
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col);
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes);
+
+// Frees the buffers owned by mb that av1_alloc_mb_data() allocates
+// (mb_rd_record, inter_modes_info, the per-plane src_diff buffers, seg_mask,
+// winner_mode_stats and dqcoeff_buf). Every pointer is reset to NULL after it
+// is freed. num_planes is the number of planes whose src_diff buffer is
+// released.
+static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb,
+ int num_planes) {
+ aom_free(mb->txfm_search_info.mb_rd_record);
+ mb->txfm_search_info.mb_rd_record = NULL;
+
+ aom_free(mb->inter_modes_info);
+ mb->inter_modes_info = NULL;
+
+ // Frees and clears src_diff for each of the num_planes planes.
+ av1_dealloc_src_diff_buf(mb, num_planes);
+
+ aom_free(mb->e_mbd.seg_mask);
+ mb->e_mbd.seg_mask = NULL;
+
+ aom_free(mb->winner_mode_stats);
+ mb->winner_mode_stats = NULL;
+
+ aom_free(mb->dqcoeff_buf);
+ mb->dqcoeff_buf = NULL;
+}
+
+// Allocates mb->winner_mode_stats with as many entries as the current
+// multi_winner_mode_type allows. Nothing is allocated when the buffer is not
+// required.
+static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
+                                                  struct macroblock *mb) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+
+  // Cases in which the winner_mode_stats buffer is not required.
+  if (is_stat_generation_stage(cpi)) return;
+  if (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) return;
+  if (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF)
+    return;
+
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_entries =
+      winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+  CHECK_MEM_ERROR(cm, mb->winner_mode_stats,
+                  (WinnerModeStats *)aom_malloc(
+                      num_entries * sizeof(mb->winner_mode_stats[0])));
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb);
+
+// Allocates the buffers owned by mb: mb_rd_record and inter_modes_info (both
+// only outside of nonrd pick mode), the per-plane src_diff buffers, seg_mask,
+// winner_mode_stats and dqcoeff_buf.
+static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
+                                         struct macroblock *mb) {
+  const AV1_COMMON *cm = &cpi->common;
+  const SPEED_FEATURES *sf = &cpi->sf;
+  const int rd_pick_mode = !sf->rt_sf.use_nonrd_pick_mode;
+
+  // mb_rd_record is needed only when the use_mb_rd_hash sf is enabled.
+  if (rd_pick_mode && sf->rd_sf.use_mb_rd_hash) {
+    CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record,
+                    (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD)));
+  }
+  // inter_modes_info is not allocated for intra-only frames.
+  if (rd_pick_mode && !frame_is_intra_only(cm)) {
+    CHECK_MEM_ERROR(
+        cm, mb->inter_modes_info,
+        (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
+  }
+
+  av1_alloc_src_diff_buf(cm, mb);
+
+  CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
+                  (uint8_t *)aom_memalign(
+                      16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
+
+  allocate_winner_mode_stats(cpi, mb);
+
+  // dqcoeff_buf holds one coefficient per pixel of the sequence's superblock.
+  const int sb_num_pels = 1 << num_pels_log2_lookup[cm->seq_params->sb_size];
+  CHECK_MEM_ERROR(
+      cm, mb->dqcoeff_buf,
+      (tran_low_t *)aom_memalign(32, sb_num_pels * sizeof(tran_low_t)));
+}
+
+// Computes the number of reference frames to be disabled based on the
+// selective_ref_frame speed feature.
+//
+// cpi                      Encoder instance; supplies the speed feature and,
+//                          in two-pass builds, the first pass stats.
+// ref_frame_flags          Flags of the currently enabled references.
+// ref_display_order_hint   Display order hints of the references, indexed by
+//                          (ref frame - LAST_FRAME).
+// cur_frame_display_index  Display index of the current frame.
+//
+// Returns 0 when selective_ref_frame < 3; otherwise at least 1, plus 2 more
+// at level 6 and above, or possibly 1 more for LAST2_FRAME at level 5.
+static AOM_INLINE unsigned int get_num_refs_to_disable(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  unsigned int num_refs_to_disable = 0;
+  if (cpi->sf.inter_sf.selective_ref_frame >= 3) {
+    num_refs_to_disable++;
+    if (cpi->sf.inter_sf.selective_ref_frame >= 6) {
+      // Disable LAST2_FRAME and ALTREF2_FRAME
+      num_refs_to_disable += 2;
+    } else if (cpi->sf.inter_sf.selective_ref_frame == 5 &&
+               *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) {
+      const int last2_frame_dist = av1_encoder_get_relative_dist(
+          ref_display_order_hint[LAST2_FRAME - LAST_FRAME],
+          cur_frame_display_index);
+      // Disable LAST2_FRAME if it is a temporally distant frame
+      if (abs(last2_frame_dist) > 2) {
+        num_refs_to_disable++;
+      }
+#if !CONFIG_REALTIME_ONLY
+      else if (is_stat_consumption_stage_twopass(cpi)) {
+        const FIRSTPASS_STATS *const this_frame_stats =
+            read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index);
+        // read_one_frame_stats() returns NULL when the index falls outside
+        // the stats buffer. Without stats for this frame, LAST2_FRAME is kept
+        // rather than dereferencing a NULL pointer.
+        if (this_frame_stats != NULL) {
+          const double coded_error_per_mb = this_frame_stats->coded_error;
+          // Disable LAST2_FRAME if the coded error of the current frame based
+          // on first pass stats is very low.
+          if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+        }
+      }
+#endif  // !CONFIG_REALTIME_ONLY
+    }
+  }
+  return num_refs_to_disable;
+}
+
+// Returns the maximum number of reference frames the current frame may use:
+// the smaller of the user-configured max_reference_frames and of
+// INTER_REFS_PER_FRAME minus the number of references disabled by the speed
+// features.
+static INLINE int get_max_allowed_ref_frames(
+    const AV1_COMP *cpi, const int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  const unsigned int user_limit = cpi->oxcf.ref_frm_cfg.max_reference_frames;
+  const unsigned int disabled_by_speed = get_num_refs_to_disable(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+  const unsigned int speed_limit = INTER_REFS_PER_FRAME - disabled_by_speed;
+  return AOMMIN(speed_limit, user_limit);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+//
+// Counts the references enabled in *ref_frame_flags and, while that count
+// exceeds get_max_allowed_ref_frames(), disables enabled references in the
+// order given by disable_order[].
+//
+// cpi                      Encoder instance.
+// ref_frame_flags          In/out: flags of the enabled reference frames.
+// ref_display_order_hint   Display order hints of the references.
+// cur_frame_display_index  Display index of the current frame.
+static AOM_INLINE void enforce_max_ref_frames(
+    AV1_COMP *cpi, int *ref_frame_flags,
+    const unsigned int *ref_display_order_hint,
+    unsigned int cur_frame_display_index) {
+  MV_REFERENCE_FRAME ref_frame;
+  int total_valid_refs = 0;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      total_valid_refs++;
+    }
+  }
+
+  const int max_allowed_refs = get_max_allowed_ref_frames(
+      cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+  // NOTE(review): the bound of 4 is presumably the number of entries in
+  // disable_order[] -- confirm against its definition.
+  for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+    const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+    const int flag_to_disable = av1_ref_frame_flag_list[ref_frame_to_disable];
+
+    // Nothing to do when this reference is not enabled.
+    if (!(*ref_frame_flags & flag_to_disable)) {
+      continue;
+    }
+
+    // Clear exactly the flag that was tested above. Clearing a different flag
+    // (previously AOM_GOLD_FLAG for BWDREF_FRAME) leaves the tested reference
+    // enabled and makes total_valid_refs drift from the real flag count.
+    *ref_frame_flags &= ~flag_to_disable;
+    --total_valid_refs;
+  }
+  assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..c78761dd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+
+// Dispatches a rows x cols block subtraction (source `src8`, prediction
+// `pred8`, output `diff`) to aom_highbd_subtract_block() or
+// aom_subtract_block().
+//
+// The high-bitdepth kernel is only reachable when the build defines
+// CONFIG_AV1_HIGHBITDEPTH and `bd_info.use_highbitdepth_buf` is set; in every
+// other case the regular kernel is called. Blocks must be at least 4x4
+// (asserted).
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ assert(rows >= 4 && cols >= 4);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride);
+ return;
+ }
+#endif
+ // Silences the unused-parameter warning when the high-bitdepth path is
+ // compiled out.
+ (void)bd_info;
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
+
+// Runs av1_subtract_block() on a single transform block of `plane`.
+//
+// `blk_row` / `blk_col` locate the transform block inside the plane block;
+// they are scaled by MI_SIZE_LOG2 to obtain sample offsets into the source,
+// prediction (dst) and residual (src_diff) buffers. The residual buffer uses
+// the width of `plane_bsize` as its stride.
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+                      int blk_col, int blk_row, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const enc_plane = &x->plane[plane];
+  const struct macroblockd_plane *const dec_plane = &xd->plane[plane];
+
+  const int residual_stride = block_size_wide[plane_bsize];
+  const int source_stride = enc_plane->src.stride;
+  const int pred_stride = dec_plane->dst.stride;
+
+  // Sample offsets of the transform block inside each buffer.
+  const int pred_pos = (blk_row * pred_stride + blk_col) << MI_SIZE_LOG2;
+  const int source_pos = (blk_row * source_stride + blk_col) << MI_SIZE_LOG2;
+  const int residual_pos = (blk_row * residual_stride + blk_col)
+                           << MI_SIZE_LOG2;
+
+  uint8_t *const pred = dec_plane->dst.buf + pred_pos;
+  uint8_t *const source = enc_plane->src.buf + source_pos;
+  int16_t *const residual = enc_plane->src_diff + residual_pos;
+
+  av1_subtract_block(get_bit_depth_info(xd), tx_size_high[tx_size],
+                     tx_size_wide[tx_size], residual, residual_stride, source,
+                     source_stride, pred, pred_stride);
+}
+
+// Runs av1_subtract_block() over the whole `plane_bsize` block of `plane`,
+// writing into the plane's src_diff buffer with the block width as stride.
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
+  assert(plane_bsize < BLOCK_SIZES_ALL);
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblock_plane *const enc_plane = &x->plane[plane];
+  const struct macroblockd_plane *const dec_plane = &xd->plane[plane];
+  const int width = block_size_wide[plane_bsize];
+  const int height = block_size_high[plane_bsize];
+
+  av1_subtract_block(get_bit_depth_info(xd), height, width,
+                     enc_plane->src_diff, width, enc_plane->src.buf,
+                     enc_plane->src.stride, dec_plane->dst.buf,
+                     dec_plane->dst.stride);
+}
+
+// Entry point for trellis optimization of one transform block.
+//
+// av1_optimize_txb() is only invoked when the block has coefficients
+// (eob != 0), optimization is enabled for the block's segment and the segment
+// is not lossless. In every other case `*rate_cost` is set from
+// av1_cost_skip_txb() and the unchanged eob is returned.
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int *rate_cost) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int seg = xd->mi[0]->segment_id;
+  const int eob = x->plane[plane].eobs[block];
+
+  const int run_trellis =
+      eob != 0 && cpi->optimize_seg_arr[seg] && !xd->lossless[seg];
+  if (run_trellis) {
+    return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                            rate_cost, cpi->oxcf.algo_cfg.sharpness);
+  }
+
+  *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size);
+  return eob;
+}
+
+// Hyper-parameters for dropout optimization, based on the following logic.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+// NOT required. For example, `5 0 0 0 7` is treated as two continuous
+// coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+// quantization index. Both bounds are inclusive: av1_dropout_qcoeff()
+// returns early only when qindex is above the max or below the min.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+// coefficients to zero. The key logic on determining whether a coefficient
+// should be dropped is to check the number of continuous zeros before AND
+// after this coefficient. The exact number of zeros for judgement depends
+// on block size and quantization index. More concretely, block size
+// determines the base number of zeros, while quantization index determines
+// the multiplier. Intuitively, larger block requires more zeros and larger
+// quantization index also requires more zeros (more information is lost
+// when using larger quantization index).
+// The base is the larger transform dimension clipped to [MIN, MAX]; the
+// multiplier is qindex / DROPOUT_MULTIPLIER_Q_BASE clipped to [MIN, MAX].
+const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier.
+
+// Derives the zero-run thresholds for dropout optimization from the
+// transform size and `qindex`, then delegates to av1_dropout_qcoeff_num().
+//
+// Nothing is done when `qindex` lies outside [DROPOUT_Q_MIN, DROPOUT_Q_MAX].
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                        TX_TYPE tx_type, int qindex) {
+  const int q_in_range = qindex >= DROPOUT_Q_MIN && qindex <= DROPOUT_Q_MAX;
+  if (!q_in_range) return;
+
+  // The larger transform dimension sets the base zero count; qindex sets the
+  // multiplier applied to it.
+  const int longer_side =
+      AOMMAX(tx_size_wide[tx_size], tx_size_high[tx_size]);
+  const int zero_run_scale =
+      CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE, DROPOUT_MULTIPLIER_MIN,
+           DROPOUT_MULTIPLIER_MAX);
+
+  const int zeros_before =
+      zero_run_scale *
+      CLIP(longer_side, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+  const int zeros_after =
+      zero_run_scale *
+      CLIP(longer_side, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+  av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, zeros_before,
+                         zeros_after);
+}
+
+// Dropout optimization: zeroes small, isolated quantized coefficients.
+//
+// Walks the coefficients of `block` in scan order up to the current eob. A
+// run of at most DROPOUT_CONTINUITY_MAX non-zero coefficients whose magnitude
+// does not exceed DROPOUT_COEFF_MAX is cleared (in both qcoeff and dqcoeff)
+// when it is preceded by at least `dropout_num_before` zeros and followed by
+// at least `dropout_num_after` zeros. Positions past the original eob count
+// as trailing zeros. When the resulting eob differs from the stored one, the
+// eob and the transform block entropy context are updated.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after) {
+ const struct macroblock_plane *const p = &mb->plane[plane];
+ tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ const int max_eob = av1_get_max_eob(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+ // Early return if there are not enough non-zero coefficients.
+ if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before ||
+ max_eob <= dropout_num_before + dropout_num_after) {
+ return;
+ }
+
+ int count_zeros_before = 0;
+ int count_zeros_after = 0;
+ int count_nonzeros = 0;
+ // Index of the first non-zero coefficient after sufficient number of
+ // continuous zeros. If equals to `-1`, it means number of leading zeros
+ // hasn't reached `dropout_num_before`.
+ int idx = -1;
+ int eob = 0; // New end of block.
+
+ for (int i = 0; i < p->eobs[block]; ++i) {
+ const int scan_idx = scan_order->scan[i];
+ if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) {
+ // Keep large coefficients.
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ idx = -1;
+ eob = i + 1;
+ } else if (qcoeff[scan_idx] == 0) { // Count zeros.
+ if (idx == -1) {
+ ++count_zeros_before;
+ } else {
+ ++count_zeros_after;
+ }
+ } else { // Count non-zeros.
+ if (count_zeros_before >= dropout_num_before) {
+ // Start (or continue) a candidate run of droppable coefficients.
+ idx = (idx == -1) ? i : idx;
+ ++count_nonzeros;
+ } else {
+ // Not enough leading zeros: this coefficient is kept.
+ count_zeros_before = 0;
+ eob = i + 1;
+ }
+ }
+
+ // Handle continuity.
+ if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ idx = -1;
+ eob = i + 1;
+ }
+
+ // Handle the trailing zeros after original end of block.
+ if (idx != -1 && i == p->eobs[block] - 1) {
+ count_zeros_after += (max_eob - p->eobs[block]);
+ }
+
+ // Set redundant coefficients to zeros if needed.
+ // NOTE(review): count_zeros_after only grows while idx != -1, so with a
+ // positive `dropout_num_after` the inner loop starts at a valid index. If
+ // a caller passed `dropout_num_after` <= 0 this branch could run with
+ // idx == -1 and index scan[-1]. av1_dropout_qcoeff() in this file always
+ // passes at least DROPOUT_MULTIPLIER_MIN * DROPOUT_AFTER_BASE_MIN; confirm
+ // for other callers.
+ if (count_zeros_after >= dropout_num_after) {
+ for (int j = idx; j <= i; ++j) {
+ qcoeff[scan_order->scan[j]] = 0;
+ dqcoeff[scan_order->scan[j]] = 0;
+ }
+ // The cleared span now counts as leading zeros for what follows.
+ count_zeros_before += (i - idx + 1);
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ } else if (i == p->eobs[block] - 1) {
+ eob = i + 1;
+ }
+ }
+
+ if (eob != p->eobs[block]) {
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ }
+}
+
+// Settings for optimization type. NOTE: To set optimization type for all intra
+// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set.
+// TODO(yjshen): These settings are hard-coded and look okay for now. They
+// should be made configurable later.
+// Blocks of key frames ONLY.
+// (Selected in av1_encode_block_intra() when frame_is_intra_only() is true.)
+const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of intra frames (key frames EXCLUSIVE).
+// (Selected in av1_encode_block_intra() when frame_is_intra_only() is false.)
+const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default
+// if trellis optimization is on for inter frames.)
+// (Read by encode_block(), which runs dropout only when trellis was not used.)
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
+// Column index into quant_func_list when high bitdepth support is compiled
+// in; av1_quant() indexes that column with txfm_param->is_hbd.
+enum {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} UENUM1BYTE(QUANT_FUNC);
+
+// Quantizer dispatch table, indexed by AV1_XFORM_QUANT value (fp, b, dc,
+// skip-quant). The last row is NULL; av1_quant() never dereferences it
+// because it skips quantization for AV1_XFORM_QUANT_SKIP_QUANT.
+#if CONFIG_AV1_HIGHBITDEPTH
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL }
+ };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+ av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
+
+// Writes the transform output of a DC-only block: every coefficient of the
+// block is cleared and coefficient 0 is set to `per_px_mean` scaled by the
+// per-transform-size entry of dc_coeff_scale (a 12-bit fixed-point factor, as
+// shown by the >> 12).
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean) {
+  assert(per_px_mean != INT64_MAX);
+
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  tran_low_t *const out = x->plane[plane].coeff + BLOCK_OFFSET(block);
+  const int64_t scaled_dc = (per_px_mean * dc_coeff_scale[tx_size]) >> 12;
+
+  memset(out, 0, sizeof(out[0]) * av1_get_max_eob(tx_size));
+  out[0] = (tran_low_t)scaled_dc;
+}
+
+// Convenience wrapper: forward-transforms the residual of one transform block
+// with av1_xform() and then quantizes the result with av1_quant().
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+ av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+// Forward-transforms the residual of one transform block.
+//
+// The input is read from the plane's src_diff buffer at (blk_row, blk_col),
+// using the width of `plane_bsize` as stride; the output goes to the plane's
+// coeff buffer at BLOCK_OFFSET(block).
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
+  const struct macroblock_plane *const enc_plane = &x->plane[plane];
+  const int residual_stride = block_size_wide[plane_bsize];
+  const int residual_pos = (blk_row * residual_stride + blk_col)
+                           << MI_SIZE_LOG2;
+
+  const int16_t *const residual = enc_plane->src_diff + residual_pos;
+  tran_low_t *const out = enc_plane->coeff + BLOCK_OFFSET(block);
+
+  av1_fwd_txfm(residual, out, residual_stride, txfm_param);
+}
+
+// Quantizes the transform coefficients of one transform block.
+//
+// Unless qparam->xform_quant_idx is AV1_XFORM_QUANT_SKIP_QUANT, the quantizer
+// selected from quant_func_list is run (or av1_quantize_skip() when
+// x->seg_skip_block is set), filling qcoeff, dqcoeff and the block's eob.
+// The transform block entropy context is then either reset to 0 (when
+// trellis optimization will follow) or computed from the quantized result.
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param->tx_size, txfm_param->tx_type);
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ tran_low_t *const qcoeff = p->qcoeff + block_offset;
+ tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+ uint16_t *const eob = &p->eobs[block];
+
+ if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ if (LIKELY(!x->seg_skip_block)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Two-dimensional table: the second index picks the low/high bitdepth
+ // variant of the quantizer.
+ quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+ quant_func_list[qparam->xform_quant_idx](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
+ } else {
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+ }
+ }
+ // When use_optimize_b is true, av1_optimize_b will be called afterwards,
+ // so the entropy ctx cannot be updated now (it is performed in optimize_b).
+ if (qparam->use_optimize_b) {
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+ }
+}
+
+// Fills `txfm_param` for a transform of the given size and type on the
+// current block: lossless flag of the block's segment, extended transform
+// set type, bit depth and whether the current buffer is high bitdepth.
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Transform identity.
+  txfm_param->tx_size = tx_size;
+  txfm_param->tx_type = tx_type;
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+  // Properties inherited from the current block and buffer.
+  txfm_param->lossless = xd->lossless[mbmi->segment_id];
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
+  txfm_param->bd = xd->bd;
+}
+// Fills `qparam` for quantizing a transform block of size `tx_size`.
+//
+// The quantization matrices are left NULL here; av1_setup_qmatrix() installs
+// them separately.
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam) {
+  // No quantization matrices until av1_setup_qmatrix() is called.
+  qparam->iqmatrix = NULL;
+  qparam->qmatrix = NULL;
+
+  qparam->tx_size = tx_size;
+  qparam->log_scale = av1_get_tx_scale(tx_size);
+  qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and quantization idx has relationship,
+  // but is kind of buried and complicated in different encoding stages.
+  // Should have a unified function to derive quant_idx, rather than
+  // determine and pass in the quant_idx
+  qparam->xform_quant_idx = xform_quant_idx;
+  qparam->use_optimize_b = use_optimize_b;
+}
+// Installs the quantization matrix and its inverse counterpart into `qparam`,
+// as returned by av1_get_qmatrix() / av1_get_iqmatrix() for the given plane,
+// transform size and transform type.
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam) {
+ qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type);
+ qparam->iqmatrix =
+ av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type);
+}
+
+// Encodes one transform block of an inter-coded block.
+//
+// Unless the block is flagged in blk_skip or the block uses skip mode, the
+// residual is transformed and quantized, followed by either trellis
+// optimization (when enabled for this call) or dropout optimization.
+// Otherwise the eob and entropy context are cleared. The above/left entropy
+// contexts are then updated, and when coefficients remain the block is
+// reconstructed into the destination buffer through the inverse transform.
+// `arg` must point to a struct encode_b_args.
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+ RUN_TYPE dry_run) {
+ (void)dry_run;
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ // Rate output required by av1_optimize_b(); the value is not used here.
+ int dummy_rate_cost = 0;
+
+ const int bw = mi_size_wide[plane_bsize];
+ dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int blk_skip_idx = blk_row * bw + blk_col;
+ if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) &&
+ !mbmi->skip_mode) {
+ tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
+ int quant_idx;
+ // With trellis the FP quantizer is used; otherwise the choice between the
+ // B and FP quantizers is fixed by USE_B_QUANT_NO_TRELLIS.
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for inter frames.
+ const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+ const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ // Dropout is mutually exclusive with trellis on this path.
+ if (!quant_param.use_optimize_b && do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ } else {
+ p->eobs[block] = 0;
+ p->txb_entropy_ctx[block] = 0;
+ }
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) {
+ // As long as any YUV plane has non-zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 0.
+ mbmi->skip_txfm = 0;
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block],
+ cm->features.reduced_tx_set_used);
+ } else {
+ // Only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ // (`&= 1` leaves a 0/1 flag unchanged, so an earlier 0 is preserved.)
+ mbmi->skip_txfm &= 1;
+ }
+
+ // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
+ // case. It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+ if (p->eobs[block] == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+ args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+ // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+ // enable_optimize_b is true to detect potential RD bug.
+ const uint8_t disable_txk_check = args->enable_optimize_b;
+ if (!disable_txk_check) {
+ assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
+ DCT_DCT);
+ }
+ }
+#endif
+ // A luma block without coefficients is recorded as DCT_DCT.
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+ blk_row, pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+ plane, pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+// Recursively walks the transform partition of an inter block and calls
+// encode_block() on each leaf transform block.
+//
+// For luma the leaf size comes from mbmi->inter_tx_size; when `tx_size` is
+// larger than that, the block is split with sub_tx_size_map and each part is
+// visited in raster order. Chroma planes are always encoded at the `tx_size`
+// passed in. Positions outside the visible part of the plane block are
+// skipped.
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg, RUN_TYPE dry_run) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ if (!plane) {
+ assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+ tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+ }
+
+ if (tx_size == plane_tx_size || plane) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+ dry_run);
+ } else {
+ assert(tx_size < TX_SIZES_ALL);
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ // This is the square transform block partition entry point.
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ // Number of coefficient-block units covered by one sub transform.
+ const int step = bsh * bsw;
+ // Clamp the walk to the visible part of the plane block.
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg, dry_run);
+ block += step;
+ }
+ }
+ }
+}
+
+// Calls `visit` once for every transform block of `plane` inside a block of
+// size `plane_bsize`.
+//
+// The transform size is the one av1_get_tx_size() reports for the plane. The
+// block is traversed in units no larger than the plane-level size of
+// BLOCK_64X64, each unit in raster order. The block index handed to `visit`
+// starts at 0 and grows by the transform area (in 4x4 units) per visit.
+// `arg` is passed through to `visit` unchanged.
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+ // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+ // transform size varies per plane, look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ // Call visit() directly with zero offsets if the current block size is the
+ // same as the transform block size.
+ if (plane_bsize == tx_bsize) {
+ visit(plane, 0, 0, 0, plane_bsize, tx_size, arg);
+ return;
+ }
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ const int mu_blocks_wide =
+ AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide);
+ const int mu_blocks_high =
+ AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ int i = 0;
+ for (int r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+ // Check if visit() is invoked at least once.
+ assert(i >= 1);
+}
+
+// Argument bundle handed to encode_block_pass1() through the `void *arg` of
+// av1_foreach_transformed_block_in_plane().
+typedef struct encode_block_pass1_args {
+ AV1_COMP *cpi; // Encoder instance.
+ MACROBLOCK *x; // Macroblock being encoded.
+} encode_block_pass1_args;
+
+// Visitor used by av1_encode_sby_pass1(): transforms and quantizes one
+// transform block with DCT_DCT and the B quantizer (no trellis), then adds
+// the inverse transform to the destination buffer when any coefficient
+// remains.
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  encode_block_pass1_args *const pass1 = (encode_block_pass1_args *)arg;
+  AV1_COMP *const cpi = pass1->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = pass1->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const enc_plane = &x->plane[plane];
+  struct macroblockd_plane *const dec_plane = &xd->plane[plane];
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B,
+                  cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+  av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
+                    &quant_param);
+
+  av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                  &quant_param);
+
+  // Nothing to reconstruct when every coefficient quantized to zero.
+  if (enc_plane->eobs[block] == 0) return;
+
+  tran_low_t *const dqcoeff = enc_plane->dqcoeff + BLOCK_OFFSET(block);
+  const int dst_stride = dec_plane->dst.stride;
+  uint8_t *const dst =
+      dec_plane->dst.buf + ((blk_row * dst_stride + blk_col) << MI_SIZE_LOG2);
+
+  txfm_param.eob = enc_plane->eobs[block];
+  if (txfm_param.is_hbd) {
+    av1_highbd_inv_txfm_add(dqcoeff, dst, dst_stride, &txfm_param);
+  } else {
+    av1_inv_txfm_add(dqcoeff, dst, dst_stride, &txfm_param);
+  }
+}
+
+// Encodes plane 0 of a block of size `bsize`: computes the residual for the
+// whole plane block, then runs encode_block_pass1() on each of its transform
+// blocks.
+void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  encode_block_pass1_args pass1_args;
+  pass1_args.cpi = cpi;
+  pass1_args.x = x;
+
+  av1_subtract_plane(x, bsize, 0);
+  av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                         encode_block_pass1, &pass1_args);
+}
+
+// Encodes the residual of an inter block of size `bsize` for every plane.
+//
+// For each plane (chroma only when the block is a chroma reference) the
+// entropy contexts are loaded, the residual is computed for the whole plane
+// block, and encode_block_inter() is run on every largest-transform-size
+// block, traversing the plane in units no larger than the plane-level size
+// of BLOCK_64X64. Returns right after initializing mbmi->skip_txfm when the
+// transform search already marked the block as skipped.
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // In the current encoder implementation, for inter blocks,
+ // only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ // For intra blocks, this flag is set to 0 since skipped blocks are so rare
+ // that transmitting skip_txfm = 1 is very expensive.
+ // mbmi->skip_txfm is init to 1, and will be modified in encode_block() based
+ // on transform, quantization, and (if exists) trellis optimization.
+ mbmi->skip_txfm = 1;
+ if (x->txfm_search_info.skip_txfm) return;
+
+ struct optimize_ctx ctx;
+ // ta/tl are filled in per plane inside the loop below.
+ struct encode_b_args arg = {
+ cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+ };
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int subsampling_x = pd->subsampling_x;
+ const int subsampling_y = pd->subsampling_y;
+ // Chroma is only coded for blocks that are chroma references.
+ if (plane && !xd->is_chroma_ref) break;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ // Block-index advance per largest transform block.
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
+ av1_subtract_plane(x, plane_bsize, plane);
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ int blk_row, blk_col;
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
+ max_tx_size, &arg, dry_run);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+// Visitor used by av1_encode_intra_block_plane(): encodes one intra
+// transform block and then updates the above/left entropy contexts stored in
+// the struct encode_b_args that `arg` points to.
+static void encode_block_intra_and_set_context(int plane, int block,
+                                               int blk_row, int blk_col,
+                                               BLOCK_SIZE plane_bsize,
+                                               TX_SIZE tx_size, void *arg) {
+  av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                         arg);
+
+  struct encode_b_args *const enc_args = arg;
+  av1_set_txb_context(enc_args->x, plane, block, tx_size,
+                      enc_args->ta + blk_col, enc_args->tl + blk_row);
+}
+
+// Predicts and encodes one transform block of an intra-coded block.
+//
+// The intra prediction is always generated first. A luma block flagged in
+// blk_skip gets eob 0; every other block has its residual computed,
+// transformed and quantized, followed by trellis optimization (when enabled
+// for this call) and dropout optimization. Unlike encode_block(), dropout is
+// not gated on trellis being off here. When coefficients remain, the block
+// is reconstructed through the inverse transform. mbmi->skip_txfm is always
+// cleared. `arg` must point to a struct encode_b_args.
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ // Rate output required by av1_optimize_b(); the value is not used here.
+ int dummy_rate_cost = 0;
+
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int bw = mi_size_wide[plane_bsize];
+ // Only luma consults the blk_skip map on this path.
+ if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane,
+ blk_row * bw + blk_col)) {
+ *eob = 0;
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis =
+ is_trellis_used(args->enable_optimize_b, args->dry_run);
+ int quant_idx;
+ // With trellis the FP quantizer is used; otherwise the choice between the
+ // B and FP quantizers is fixed by USE_B_QUANT_NO_TRELLIS.
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for key frames and
+ // intra frames.
+ // KEY_BLOCK_OPT_TYPE applies when frame_is_intra_only() is true,
+ // INTRA_BLOCK_OPT_TYPE otherwise.
+ const bool do_trellis = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+ const bool do_dropout = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ // Dropout runs whether or not trellis was applied above.
+ if (do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ }
+
+ if (*eob) {
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, *eob,
+ cm->features.reduced_tx_set_used);
+ }
+
+ // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+ // It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+ if (*eob == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ
+ && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+ assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] ==
+ DCT_DCT);
+ }
+#endif
+ // A luma block without coefficients is recorded as DCT_DCT.
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+ // For intra mode, skipped blocks are so rare that transmitting
+ // skip_txfm = 1 is very expensive.
+ mbmi->skip_txfm = 0;
+
+ // Hand the luma block to CfL when the CfL state asks for it to be stored.
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+}
+
+// Encodes every transform block of one plane of an intra block.
+//
+// Chroma planes are ignored unless the block is a chroma reference. The
+// above/left entropy contexts start zeroed and are loaded from the plane
+// only when `enable_optimize_b` is not NO_TRELLIS_OPT.
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+                                  TRELLIS_OPT_TYPE enable_optimize_b) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (plane != 0 && !xd->is_chroma_ref) return;
+
+  const struct macroblockd_plane *const dec_plane = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(
+      bsize, dec_plane->subsampling_x, dec_plane->subsampling_y);
+
+  ENTROPY_CONTEXT above_ctx[MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT left_ctx[MAX_MIB_SIZE] = { 0 };
+  if (enable_optimize_b != NO_TRELLIS_OPT) {
+    av1_get_entropy_contexts(plane_bsize, dec_plane, above_ctx, left_ctx);
+  }
+
+  struct encode_b_args arg = {
+    cpi, x, NULL, above_ctx, left_ctx, dry_run, enable_optimize_b
+  };
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..f97bf8f517
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,  // Implicitly 3 (previous enumerator + 1).
+ AV1_XFORM_QUANT_TYPES,  // Implicitly 4: the number of enumerators above.
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPE and TRELLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+ NONE_OPT = 0, // No optimization.
+ TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`.
+ DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`.
+ TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization. Numerically equal to TRELLIS_OPT | DROPOUT_OPT.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+ NO_TRELLIS_OPT, // No trellis optimization. Implicitly 0; callers such as av1_encode_intra_block_plane() test this type for truthiness, so it must stay first.
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass (see is_trellis_used())
+ NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];  // One array of MAX_MIB_SIZE contexts per plane; presumably the "above" contexts - TODO confirm.
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];  // One array of MAX_MIB_SIZE contexts per plane; presumably the "left" contexts - TODO confirm.
+};
+
+struct encode_b_args {
+ const struct AV1_COMP *cpi;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;  // May be NULL: av1_encode_intra_block_plane() passes NULL here.
+ ENTROPY_CONTEXT *ta;  // Caller-owned context array (see optimize_ctx::ta for the per-plane form).
+ ENTROPY_CONTEXT *tl;  // Caller-owned context array (see optimize_ctx::tl for the per-plane form).
+ RUN_TYPE dry_run;
+ TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+ TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+ int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its LOW time
+// complexity. The rationale is to drop a possibly redundant quantized
+// coefficient that is surrounded by a run of zeros. NOTE: This algorithm is not
+// as accurate as trellis optimization since the hyper-parameters are hard-coded
+// rather than found by dynamic search. More adaptive logic may improve the
+// performance. This function can be applied to all or only some block cells.
+// Inputs:
+// mb: Pointer to the MACROBLOCK to perform dropout on.
+// plane: Index of the plane to which the target block belongs.
+// block: Index of the target block.
+// tx_size: Transform size of the target block.
+// tx_type: Transform type of the target block. This field is particularly
+// used to find out the scan order of the block.
+// qindex: Quantization index used for target block. In general, all blocks
+// in a same plane share the same quantization index. This field is
+// particularly used to determine how many zeros should be used to
+// drop out a coefficient.
+// Returns:
+// Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+// `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ TX_TYPE tx_type, int qindex);
+// Same as above, with the number of zeroes needed before/after a coeff to drop
+// it explicitly passed in, instead of being derived from qindex.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after);
+
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
+
+// Copies the entropy context recorded for transform block `block` of `plane`
+// into the first tx_size_wide_unit[tx_size] entries of `a` and the first
+// tx_size_high_unit[tx_size] entries of `l`.
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+                                       TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+                                       ENTROPY_CONTEXT *l) {
+  const uint8_t txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+  const size_t a_bytes = tx_size_wide_unit[tx_size] * sizeof(*a);
+  const size_t l_bytes = tx_size_high_unit[tx_size] * sizeof(*l);
+  memset(a, txb_ctx, a_bytes);
+  memset(l, txb_ctx, l_bytes);
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+ TRELLIS_OPT_TYPE enable_optimize_b);
+
+// Returns non-zero when trellis optimization applies for this run:
+// never for NO_TRELLIS_OPT, only when dry_run == OUTPUT_ENABLED for
+// FINAL_PASS_TRELLIS_OPT, and always for every other type.
+static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b,
+                                  RUN_TYPE dry_run) {
+  switch (optimize_b) {
+    case NO_TRELLIS_OPT: return false;
+    case FINAL_PASS_TRELLIS_OPT: return dry_run == OUTPUT_ENABLED;
+    default: return true;
+  }
+}
+
+// Scaling terms (precision of 12 bits) to perform tx-size specific
+// normalization that is used in DCT_DCT forward transform.
+// For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used
+// For transform blocks of 1:4 and 4:1 - factor of 2 is used
+// For transform blocks TX_8x8 and below - an additional factor of 2 is used
+// For transform blocks max(width,height)=64 - currently not supported
+
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+ 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896,
+ 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 0000000000..7cae72c159
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+// Updates the CDFs in `mvcomp` with the symbols that coding the non-zero
+// component difference `comp` would produce: sign, class, integer bits and,
+// depending on `precision`, the fractional and high-precision symbols.
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+                                      MvSubpelPrecision precision) {
+  assert(comp != 0);
+  const int is_negative = comp < 0;
+  const int magnitude = is_negative ? -comp : comp;
+  int offset;
+  const int mv_class = av1_get_mv_class(magnitude - 1, &offset);
+  const int in_class0 = (mv_class == MV_CLASS_0);
+  const int int_part = offset >> 3;         // integer mv data
+  const int frac_part = (offset >> 1) & 3;  // fractional mv data
+  const int hp_bit = offset & 1;            // high precision mv data
+
+  update_cdf(mvcomp->sign_cdf, is_negative, 2);
+  update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+  // Integer part: one symbol for class 0, otherwise one binary symbol per bit.
+  if (in_class0) {
+    update_cdf(mvcomp->class0_cdf, int_part, CLASS0_SIZE);
+  } else {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < num_bits; ++bit) {
+      update_cdf(mvcomp->bits_cdf[bit], (int_part >> bit) & 1, 2);
+    }
+  }
+
+  // Fractional part, only when sub-pel precision is in use.
+  if (precision > MV_SUBPEL_NONE) {
+    aom_cdf_prob *frac_cdf = mvcomp->fp_cdf;
+    if (in_class0) frac_cdf = mvcomp->class0_fp_cdf[int_part];
+    update_cdf(frac_cdf, frac_part, MV_FP_SIZE);
+  }
+
+  // High-precision bit, only above low precision.
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_cdf_prob *hp_cdf = mvcomp->hp_cdf;
+    if (in_class0) hp_cdf = mvcomp->class0_hp_cdf;
+    update_cdf(hp_cdf, hp_bit, 2);
+  }
+}
+
+// Updates the joint CDF and, for each non-zero component of (mv - ref), the
+// per-component CDFs in `mvctx`. comps[0] receives the row difference and
+// comps[1] the column difference.
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE joint = av1_get_mv_joint(&diff);
+  nmv_component *const row_comp = &mvctx->comps[0];
+  nmv_component *const col_comp = &mvctx->comps[1];
+
+  update_cdf(mvctx->joints_cdf, joint, MV_JOINTS);
+
+  if (mv_joint_vertical(joint)) {
+    update_mv_component_stats(diff.row, row_comp, precision);
+  }
+  if (mv_joint_horizontal(joint)) {
+    update_mv_component_stats(diff.col, col_comp, precision);
+  }
+}
+
+// Writes the non-zero component difference `comp` to `w` as a sequence of
+// symbols: sign, class, integer bits and, depending on `precision`, the
+// fractional and high-precision symbols.
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+                                MvSubpelPrecision precision) {
+  assert(comp != 0);
+  const int is_negative = comp < 0;
+  const int magnitude = is_negative ? -comp : comp;
+  int offset;
+  const int mv_class = av1_get_mv_class(magnitude - 1, &offset);
+  const int in_class0 = (mv_class == MV_CLASS_0);
+  const int int_part = offset >> 3;         // integer mv data
+  const int frac_part = (offset >> 1) & 3;  // fractional mv data
+  const int hp_bit = offset & 1;            // high precision mv data
+
+  aom_write_symbol(w, is_negative, mvcomp->sign_cdf, 2);
+  aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+  // Integer part: one symbol for class 0, otherwise one binary symbol per bit.
+  if (in_class0) {
+    aom_write_symbol(w, int_part, mvcomp->class0_cdf, CLASS0_SIZE);
+  } else {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < num_bits; ++bit) {
+      aom_write_symbol(w, (int_part >> bit) & 1, mvcomp->bits_cdf[bit], 2);
+    }
+  }
+
+  // Fractional part, only when sub-pel precision is in use.
+  if (precision > MV_SUBPEL_NONE) {
+    aom_cdf_prob *frac_cdf = mvcomp->fp_cdf;
+    if (in_class0) frac_cdf = mvcomp->class0_fp_cdf[int_part];
+    aom_write_symbol(w, frac_part, frac_cdf, MV_FP_SIZE);
+  }
+
+  // High-precision bit, only above low precision.
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_cdf_prob *hp_cdf = mvcomp->hp_cdf;
+    if (in_class0) hp_cdf = mvcomp->class0_hp_cdf;
+    aom_write_symbol(w, hp_bit, hp_cdf, 2);
+  }
+}
+
+/* Fills `mvcost` with the cost of coding every value of one motion vector
+ * component, derived from the CDFs in `mvcomp`.
+ *
+ * mvcost:    Output table. It is written at both positive indices `v` and
+ *            negative indices `-v`, so it must point into the middle of the
+ *            underlying buffer rather than at its start; entry 0 is set to 0.
+ * mvcomp:    Source of the sign, class, integer-bit, fractional and
+ *            high-precision CDFs.
+ * precision: Fractional costs are taken from the CDFs only above
+ *            MV_SUBPEL_NONE and high-precision costs only above
+ *            MV_SUBPEL_LOW_PRECISION; otherwise those cost arrays stay zero.
+ *
+ * TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This
+ * is more than most L1D caches and is a significant chunk of L2. Write
+ * SIMD that uses streaming writes to avoid loading all of that into L1, or
+ * just don't update the larger component costs every time this is called
+ * (or both).
+ */
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision) {
+ int i, j, v, o, mantissa;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 },
+ fp_cost[MV_FP_SIZE] = { 0 };
+ int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 };
+
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+ }
+
+ if (precision > MV_SUBPEL_NONE) {
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+ NULL);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+ }
+
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+ }
+
+ // Instead of accumulating the cost of each vector component's bits
+ // individually, compute the costs based on smaller vectors. Costs for
+ // [2^exp, 2 * 2^exp - 1] are calculated based on [0, 2^exp - 1]
+ // respectively. Offsets are maintained to swap both 1) class costs when
+ // treated as a complete vector component with the highest set bit when
+ // treated as a mantissa (significand) and 2) leading zeros to account for
+ // the current exponent.
+
+ // Cost offsets
+ int cost_swap[MV_OFFSET_BITS] = { 0 };
+ // Delta to convert positive vector to negative vector costs
+ int negate_sign = sign_cost[1] - sign_cost[0];
+
+ // Initialize with offsets to swap the class costs with the costs of the
+ // highest set bit.
+ for (i = 1; i < MV_OFFSET_BITS; ++i) {
+ cost_swap[i] = bits_cost[i - 1][1];
+ if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS];
+ }
+
+ // Seed the fractional costs onto the output (overwritten later).
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ for (hp = 0; hp < 2; ++hp) {
+ v = 2 * o + hp + 1;
+ mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0];
+ }
+ }
+
+ mvcost[0] = 0;
+ // Fill the costs for each exponent's vectors, using the costs set in the
+ // previous exponents.
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ const int exponent = (2 * MV_FP_SIZE) << i;
+
+ int class = 0;
+ if (i >= CLASS0_BITS) {
+ class = class_cost[i - CLASS0_BITS + 1];
+ }
+
+ // Iterate through mantissas, keeping track of the location
+ // of the highest set bit for the mantissa.
+ // To be clear: in the outer loop, the position of the highest set bit
+ // (exponent) is tracked and, in this loop, the highest set bit of the
+ // mantissa is tracked.
+ mantissa = 0;
+ for (j = 0; j <= i; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ cost_swap[j] += bits_cost[i][0];
+ }
+ }
+
+ // Special case to avoid buffer overrun
+ {
+ int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS;
+ int class = class_cost[MV_CLASSES - 1];
+ mantissa = 0;
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+ // At this point: mantissa = exponent >> 1
+
+ // Manually calculate the final cost offset
+ int cost_swap_hi =
+ bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2];
+ for (; mantissa < exponent - 1; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap_hi;
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+
+ // Fill costs for class0 vectors, overwriting previous placeholder values
+ // used for calculating the costs of the larger vectors.
+ for (i = 0; i < CLASS0_SIZE; ++i) {
+ const int top = i * 2 * MV_FP_SIZE;
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i];
+ for (hp = 0; hp < 2; ++hp) {
+ v = top + 2 * o + hp + 1;
+ mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0];
+ mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1];
+ }
+ }
+ }
+}
+
+// Writes the motion vector difference (mv - ref) to `w` and, when the
+// auto_mv_step_size speed feature is on, records the largest motion vector
+// component seen so far in `td->max_mv_magnitude`.
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+                   const MV *ref, nmv_context *mvctx, int usehp) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE joint = av1_get_mv_joint(&diff);
+  // If the mv_diff is zero, then we should have used near or nearest instead.
+  assert(joint != MV_JOINT_ZERO);
+
+  // Frames that force integer motion vectors carry no sub-pel symbols.
+  const int precision = cpi->common.features.cur_frame_force_integer_mv
+                            ? MV_SUBPEL_NONE
+                            : usehp;
+
+  aom_write_symbol(w, joint, mvctx->joints_cdf, MV_JOINTS);
+  if (mv_joint_vertical(joint)) {
+    encode_mv_component(w, diff.row, &mvctx->comps[0], precision);
+  }
+  if (mv_joint_horizontal(joint)) {
+    encode_mv_component(w, diff.col, &mvctx->comps[1], precision);
+  }
+
+  // If auto_mv_step_size is enabled then keep track of the largest
+  // motion vector component used.
+  if (!cpi->sf.mv_sf.auto_mv_step_size) return;
+  const int abs_row = abs(mv->row);
+  const int abs_col = abs(mv->col);
+  const int maxv = AOMMAX(abs_row, abs_col) >> 3;
+  td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude);
+}
+
+// Writes the difference (mv - ref) to `w` with no sub-pel symbols
+// (MV_SUBPEL_NONE). Both vectors are asserted to have their low three bits
+// clear.
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+                   nmv_context *mvctx) {
+  // DV and ref DV should not have sub-pel.
+  assert((mv->col & 7) == 0);
+  assert((mv->row & 7) == 0);
+  assert((ref->col & 7) == 0);
+  assert((ref->row & 7) == 0);
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE joint = av1_get_mv_joint(&diff);
+  nmv_component *const row_comp = &mvctx->comps[0];
+  nmv_component *const col_comp = &mvctx->comps[1];
+
+  aom_write_symbol(w, joint, mvctx->joints_cdf, MV_JOINTS);
+  if (mv_joint_vertical(joint)) {
+    encode_mv_component(w, diff.row, row_comp, MV_SUBPEL_NONE);
+  }
+  if (mv_joint_horizontal(joint)) {
+    encode_mv_component(w, diff.col, col_comp, MV_SUBPEL_NONE);
+  }
+}
+
+// Builds the joint cost table in `mvjoint` and the two per-component cost
+// tables mvcost[0] and mvcost[1] from the CDFs in `ctx`.
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context *ctx,
+                              MvSubpelPrecision precision) {
+  av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
+  for (int comp = 0; comp < 2; ++comp) {
+    av1_build_nmv_component_cost_table(mvcost[comp], &ctx->comps[comp],
+                                       precision);
+  }
+}
+
+// Returns the reference MV at position `ref_mv_idx` of the candidate stack
+// selected by `ref_frame`.
+//  - When ref_frame[1] > INTRA_FRAME: returns comp_mv for ref_idx 1 and
+//    this_mv for ref_idx 0.
+//  - Otherwise (ref_idx must be 0): returns this_mv when `ref_mv_idx` is below
+//    the stored candidate count, else the global MV for that reference type.
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+                                 const MV_REFERENCE_FRAME *ref_frame,
+                                 int ref_mv_idx,
+                                 const MB_MODE_INFO_EXT *mbmi_ext) {
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const CANDIDATE_MV *const mv_stack = mbmi_ext->ref_mv_stack[ref_frame_type];
+  const int second_ref_is_inter = ref_frame[1] > INTRA_FRAME;
+
+  if (!second_ref_is_inter) {
+    assert(ref_idx == 0);
+    if (ref_mv_idx >= mbmi_ext->ref_mv_count[ref_frame_type]) {
+      return mbmi_ext->global_mvs[ref_frame_type];
+    }
+    return mv_stack[ref_mv_idx].this_mv;
+  }
+
+  assert(ref_idx == 0 || ref_idx == 1);
+  if (ref_idx) return mv_stack[ref_mv_idx].comp_mv;
+  return mv_stack[ref_mv_idx].this_mv;
+}
+
+// Returns the reference MV for `ref_idx` of the current block (xd->mi[0]).
+// For NEAR_NEWMV and NEW_NEARMV the stack index is the block's ref_mv_idx
+// plus one.
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  const int use_next_entry =
+      mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV;
+  if (use_next_entry) {
+    assert(has_second_ref(mbmi));
+  }
+  const int stack_idx = mbmi->ref_mv_idx + (use_next_entry ? 1 : 0);
+  return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, stack_idx,
+                                   &x->mbmi_ext);
+}
+
+// Fetches stack entries 0 and 1 for the single reference `ref_frame` into
+// `nearest_mv` and `near_mv` respectively, lowering the precision of each
+// according to `allow_hp` and `is_integer`.
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+                                      const MB_MODE_INFO_EXT *mbmi_ext,
+                                      MV_REFERENCE_FRAME ref_frame,
+                                      int_mv *nearest_mv, int_mv *near_mv,
+                                      int is_integer) {
+  MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+  int_mv *const outputs[2] = { nearest_mv, near_mv };
+  for (int stack_idx = 0; stack_idx < 2; ++stack_idx) {
+    int_mv *const out = outputs[stack_idx];
+    *out = av1_get_ref_mv_from_stack(0, ref_frames, stack_idx, mbmi_ext);
+    lower_mv_precision(&out->as_mv, allow_hp, is_integer);
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 0000000000..c39001a5a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+ const MV *ref, nmv_context *mvctx, int usehp);
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+ MvSubpelPrecision precision);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx,
+ MvSubpelPrecision precision);
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision);
+
+void av1_update_mv_count(ThreadData *td);
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer);
+
+// Classifies `mv` by which components are non-zero. Bit 0 of the result is
+// set when col != 0 and bit 1 when row != 0:
+//   0: MV_JOINT_ZERO    (row zero,     col zero)
+//   1: MV_JOINT_HNZVZ   (row zero,     col non-zero)
+//   2: MV_JOINT_HZVNZ   (row non-zero, col zero)
+//   3: MV_JOINT_HNZVNZ  (row non-zero, col non-zero)
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+  const int col_nonzero = mv->col != 0;
+  const int row_nonzero = mv->row != 0;
+  return (MV_JOINT_TYPE)(col_nonzero | (row_nonzero << 1));
+}
+
+// Returns the smallest offset belonging to class `c`: 0 for class 0,
+// otherwise CLASS0_SIZE << (c + 2).
+static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) {
+  if (c == 0) return 0;
+  return CLASS0_SIZE << (c + 2);
+}
+
+// Returns floor(log2(n)) for n != 0, and 0 for n == 0.
+static INLINE uint8_t av1_log_in_base_2(unsigned int n) {
+  // get_msb() must not be called with 0, so that case is answered directly.
+  if (n != 0) return get_msb(n);
+  return 0;
+}
+
+// Returns the class of the non-negative value `z`, i.e. floor(log2(z >> 3))
+// (0 when z >> 3 is 0). When `offset` is non-NULL it receives `z` minus the
+// base value of that class.
+static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
+  assert(z >= 0);
+  const MV_CLASS_TYPE mv_class = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+  assert(mv_class <= MV_CLASS_10);
+  if (offset) {
+    const int class_base = av1_mv_class_base(mv_class);
+    *offset = z - class_base;
+  }
+  return mv_class;
+}
+
+// Returns 0 when a motion vector that the current prediction mode codes as
+// "new" is identical to its reference MV, and 1 otherwise (including every
+// mode that codes no new MV). `cm` is unused.
+static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm,
+                                                MACROBLOCK *const x) {
+  (void)cm;
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+
+  switch (mbmi->mode) {
+    case NEW_NEWMV: {
+      // Both vectors are new: each must differ from its reference.
+      const int_mv ref0 = av1_get_ref_mv(x, 0);
+      const int_mv ref1 = av1_get_ref_mv(x, 1);
+      const int mv0_same = mbmi->mv[0].as_int == ref0.as_int;
+      const int mv1_same = mbmi->mv[1].as_int == ref1.as_int;
+      return !(mv0_same || mv1_same);
+    }
+    case NEAREST_NEWMV:
+    case NEAR_NEWMV: {
+      // Only the second vector is new.
+      const int_mv ref1 = av1_get_ref_mv(x, 1);
+      return mbmi->mv[1].as_int != ref1.as_int;
+    }
+    case NEW_NEARESTMV:
+    case NEW_NEARMV:
+    case NEWMV: {
+      // Only the first vector is new.
+      const int_mv ref0 = av1_get_ref_mv(x, 0);
+      return mbmi->mv[0].as_int != ref0.as_int;
+    }
+    default: return 1;
+  }
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 0000000000..4732ad435b
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,5409 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+
+#include "av1/common/scale.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aomcx.h"
+
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#if CONFIG_SALIENCY_MAP
+#include "av1/encoder/saliency_map.h"
+#endif
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+
+// Converts a scaling mode into the ratio *hr / *hs. Unknown modes produce
+// 1/1 and trip an assert in debug builds.
+static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) {
+  // Start from 1/1, which is both the AOME_NORMAL ratio and the fallback.
+  *hr = 1;
+  *hs = 1;
+  switch (mode) {
+    case AOME_NORMAL: break;
+    case AOME_FOURFIVE:
+      *hr = 4;
+      *hs = 5;
+      break;
+    case AOME_THREEFIVE:
+      *hr = 3;
+      *hs = 5;
+      break;
+    case AOME_THREEFOUR:
+      *hr = 3;
+      *hs = 4;
+      break;
+    case AOME_ONEFOUR:
+      *hr = 1;
+      *hs = 4;
+      break;
+    case AOME_ONEEIGHT:
+      *hr = 1;
+      *hs = 8;
+      break;
+    case AOME_ONETWO:
+      *hr = 1;
+      *hs = 2;
+      break;
+    case AOME_TWOTHREE:
+      *hr = 2;
+      *hs = 3;
+      break;
+    case AOME_ONETHREE:
+      *hr = 1;
+      *hs = 3;
+      break;
+    default: assert(0); break;
+  }
+}
+
+// Installs a caller-supplied active map.
+//
+// new_map_16x16: rows x cols array with one entry per 16x16 block; a non-zero
+//                entry marks the block active. May be NULL, in which case
+//                only `active_map.update` is cleared.
+// rows, cols:    Must equal mi_params->mb_rows / mb_cols.
+//
+// Returns 0 on success and -1 when the dimensions do not match.
+//
+// Each 16x16 entry is expanded to every mode-info unit it covers. The loops
+// walk (mi_rows >> row_scale) x (mi_cols >> col_scale) blocks, so one block
+// spans (1 << row_scale) x (1 << col_scale) units. The previous code wrote a
+// fixed 2x2 patch at row 2 * r and column c, so adjacent columns overwrote
+// each other and part of the map was never written.
+// NOTE(review): assumes active_map.map holds mi_rows * mi_cols entries, as
+// the mi_cols stride suggests - confirm against its allocation.
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows != mi_params->mb_rows || cols != mi_params->mb_cols) return -1;
+
+  unsigned char *const active_map_4x4 = cpi->active_map.map;
+  const int mi_rows = mi_params->mi_rows;
+  const int mi_cols = mi_params->mi_cols;
+  const int row_scale = mi_size_high_log2[BLOCK_16X16];
+  const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+  const int blk_mi_high = 1 << row_scale;
+  const int blk_mi_wide = 1 << col_scale;
+  cpi->active_map.update = 0;
+  assert(mi_rows % 2 == 0);
+  assert(mi_cols % 2 == 0);
+  if (new_map_16x16) {
+    for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+      for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+        const uint8_t val = new_map_16x16[r * cols + c]
+                                ? AM_SEGMENT_ID_ACTIVE
+                                : AM_SEGMENT_ID_INACTIVE;
+        // Top-left mode-info unit of this 16x16 block. The loop bounds
+        // guarantee the whole footprint lies inside mi_rows x mi_cols.
+        const int mi_row0 = r << row_scale;
+        const int mi_col0 = c << col_scale;
+        for (int y = 0; y < blk_mi_high; ++y) {
+          unsigned char *const dst_row =
+              active_map_4x4 + (mi_row0 + y) * mi_cols + mi_col0;
+          memset(dst_row, val, blk_mi_wide);
+        }
+      }
+    }
+    cpi->active_map.enabled = 1;
+  }
+  return 0;
+}
+
+// Reports the active map at 16x16 granularity.
+//
+// new_map_16x16: rows x cols output array; must be non-NULL.
+// rows, cols:    Must equal mi_params->mb_rows / mb_cols.
+//
+// Returns 0 on success, -1 on a NULL output or a dimension mismatch.
+// When the active map is disabled every entry is set to 1. Otherwise an entry
+// becomes 1 when any of the four sampled segment-map entries for that block
+// differs from AM_SEGMENT_ID_INACTIVE.
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  const int dims_match =
+      rows == mi_params->mb_rows && cols == mi_params->mb_cols;
+  if (!dims_match || !new_map_16x16) return -1;
+
+  unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+  const int mi_rows = mi_params->mi_rows;
+  const int mi_cols = mi_params->mi_cols;
+  const int blk_rows = mi_rows >> mi_size_high_log2[BLOCK_16X16];
+  const int blk_cols = mi_cols >> mi_size_wide_log2[BLOCK_16X16];
+  assert(mi_rows % 2 == 0);
+  assert(mi_cols % 2 == 0);
+
+  memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+  if (!cpi->active_map.enabled) return 0;
+
+  for (int r = 0; r < blk_rows; ++r) {
+    for (int c = 0; c < blk_cols; ++c) {
+      // Cyclic refresh segments are considered active despite not having
+      // AM_SEGMENT_ID_ACTIVE, hence the "!= INACTIVE" test.
+      uint8_t any_active = 0;
+      for (int dr = 0; dr < 2; ++dr) {
+        for (int dc = 0; dc < 2; ++dc) {
+          const int idx = (2 * r + dr) * mi_cols + (2 * c + dc);
+          any_active |= seg_map_8x8[idx] != AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+      new_map_16x16[r * cols + c] |= any_active;
+    }
+  }
+  return 0;
+}
+
+// Performs the encoder's global initialization: the three run-time dispatch
+// tables, intra predictors and motion-estimation LUTs always; wedge masks
+// unless `usage` is ALLINTRA; the rate-control min-q LUTs unless the
+// encoder is all-intra with `end_usage` == AOM_Q.
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
+  const bool has_inter_frames = usage != ALLINTRA;
+
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+  av1_init_intra_predictors();
+  av1_init_me_luts();
+  if (has_inter_frames) {
+    av1_init_wedge_masks();
+  }
+  if (has_inter_frames || end_usage != AOM_Q) {
+    av1_rc_init_minq_luts();
+  }
+}
+
+// Stores a new frame rate in `cpi` and refreshes the rate-control state for
+// the current frame dimensions. Rates below 0.1 are replaced by 30.
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+  double effective_rate = framerate;
+  if (framerate < 0.1) effective_rate = 30;
+  cpi->framerate = effective_rate;
+  av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+}
+
+// Returns the ratio of the uncompressed frame size to the encoded size.
+// The uncompressed size is (upscaled width * height * profile factor) >> 3,
+// with factor 15 for PROFILE_0, 30 for PROFILE_1 and 36 otherwise. The
+// encoded size has 128 subtracted when it exceeds 129 and is treated as 1
+// otherwise.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+                                 size_t encoded_frame_size) {
+  const int64_t luma_pic_size =
+      (int64_t)cm->superres_upscaled_width * cm->height;
+  const BITSTREAM_PROFILE profile = cm->seq_params->profile;
+
+  int pic_size_profile_factor;
+  if (profile == PROFILE_0) {
+    pic_size_profile_factor = 15;
+  } else if (profile == PROFILE_1) {
+    pic_size_profile_factor = 30;
+  } else {
+    pic_size_profile_factor = 36;
+  }
+
+  const size_t payload_size =
+      encoded_frame_size > 129 ? encoded_frame_size - 128 : 1;
+  const int64_t uncompressed_frame_size =
+      (luma_pic_size * pic_size_profile_factor) >> 3;
+  return (double)uncompressed_frame_size / payload_size;
+}
+
+// Splits `num_sbs` superblocks into roughly (1 << num_tiles_lg) tiles of
+// near-equal size and stores the tile start positions.
+//
+// num_sbs:       Number of superblocks along the direction being split.
+// num_tiles_lg:  log2 of the requested tile count.
+// tile_col_row:  Non-zero to fill the column layout (col_start_sb / cols),
+//                zero to fill the row layout (row_start_sb / rows).
+//
+// The first `num_tiles - remainder` tiles get floor(num_sbs / num_tiles)
+// superblocks and the rest get one more. Also clears uniform_spacing.
+//
+// The tile-count bound and the per-tile size cap are chosen per direction.
+// The row path was previously bounded by MAX_TILE_COLS and capped by
+// max_width_sb, unlike the explicit row path in set_tile_info(), which uses
+// MAX_TILE_ROWS and max_height_sb.
+// NOTE(review): relies on max_height_sb being valid when rows are balanced,
+// as the explicit row path in set_tile_info() already assumes - confirm.
+static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
+                                     int num_tiles_lg, int tile_col_row) {
+  CommonTileParams *const tiles = &cm->tiles;
+  const int max_tiles = tile_col_row ? MAX_TILE_COLS : MAX_TILE_ROWS;
+  const int max_size_sb =
+      tile_col_row ? tiles->max_width_sb : tiles->max_height_sb;
+  int i, start_sb;
+  int size_sb = num_sbs >> num_tiles_lg;
+  int res_sbs = num_sbs - (size_sb << num_tiles_lg);
+  int num_tiles = 1 << num_tiles_lg;
+  // Index of the first tile that receives one extra superblock.
+  int inc_index = num_tiles - res_sbs;
+
+  tiles->uniform_spacing = 0;
+
+  for (i = 0, start_sb = 0; start_sb < num_sbs && i < max_tiles; ++i) {
+    if (i == inc_index) ++size_sb;
+    if (tile_col_row)
+      tiles->col_start_sb[i] = start_sb;
+    else
+      tiles->row_start_sb[i] = start_sb;
+
+    start_sb += AOMMIN(size_sb, max_size_sb);
+  }
+
+  // Record the tile count and terminate the start list with the total size.
+  if (tile_col_row) {
+    tiles->cols = i;
+    tiles->col_start_sb[i] = num_sbs;
+  } else {
+    tiles->rows = i;
+    tiles->row_start_sb[i] = num_sbs;
+  }
+}
+
+static void set_tile_info(AV1_COMMON *const cm,
+ const TileConfig *const tile_cfg) { /* Fills in cm->tiles from 'tile_cfg'.
+ * Tile columns are configured first, then tile rows. Each direction uses
+ * either uniform spacing (a log2 tile count), auto_tile_size_balancing(),
+ * or an explicit list of tile sizes that is cycled through as needed. */
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+
+ av1_get_tile_limits(cm);
+
+ int sb_cols =
+ CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
+ // configure tile columns
+ if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
+ tiles->uniform_spacing = 1;  // either explicit size list is empty
+ tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
+ // Add a special case to handle super resolution
+ sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator);
+ int min_log2_cols = 0;
+ for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) {
+ }  // min_log2_cols: smallest shift with (max_width_sb << shift) > sb_cols
+ tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols);
+
+ tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
+ } else if (tile_cfg->tile_widths[0] < 0) {  // negative first width: balance
+ auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1);
+ } else {
+ int size_sb, j = 0;
+ tiles->uniform_spacing = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+ tiles->col_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_widths[j++];
+ if (j >= tile_cfg->tile_width_count) j = 0;  // wrap around the width list
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+ tiles->cols = i;
+ tiles->col_start_sb[i] = sb_cols;  // terminating entry
+ }
+ av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols,
+ tiles);
+
+ // configure tile rows
+ int sb_rows =
+ CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
+ if (tiles->uniform_spacing) {
+ tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
+ tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
+ } else if (tile_cfg->tile_heights[0] < 0) {  // negative first height: balance
+ auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0);
+ } else {
+ int size_sb, j = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+ tiles->row_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_heights[j++];
+ if (j >= tile_cfg->tile_height_count) j = 0;  // wrap around the height list
+ start_sb += AOMMIN(size_sb, tiles->max_height_sb);
+ }
+ tiles->rows = i;
+ tiles->row_start_sb[i] = sb_rows;  // terminating entry
+ }
+ av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles);
+}
+
+// Refreshes the state that depends on the frame dimensions in cpi->common:
+// the mode-info layout, the macroblockd setup, the superblock size (only
+// while the sequence parameters are not locked) and the tile configuration.
+void av1_update_frame_size(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Setup mi_params here in case we need more mi's.
+  mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+                       cpi->sf.part_sf.default_min_partition_size);
+
+  av1_init_macroblockd(cm, &cpi->td.mb.e_mbd);
+
+  if (!cpi->ppi->seq_params_locked) {
+    set_sb_size(cm->seq_params,
+                av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+                                   cpi->ppi->number_spatial_layers));
+  }
+
+  set_tile_info(cm, &cpi->oxcf.tile_cfg);
+}
+
+// Returns 1 when a (width x height @ fps) stream fits within the given level
+// limits, 0 otherwise. All of the following must hold:
+//   - the luma sample count does not exceed lvl_width * lvl_height,
+//   - the luma sample count times fps does not exceed the level's count
+//     times lvl_fps,
+//   - width and height do not exceed lvl_width and lvl_height respectively,
+//     each scaled by lvl_dim_mult.
+static INLINE int does_level_match(int width, int height, double fps,
+                                   int lvl_width, int lvl_height,
+                                   double lvl_fps, int lvl_dim_mult) {
+  const int64_t frame_pels = (int64_t)width * height;
+  const int64_t limit_pels = (int64_t)lvl_width * lvl_height;
+  if (!(frame_pels <= limit_pels)) return 0;
+
+  const double frame_rate = frame_pels * fps;
+  const double limit_rate = limit_pels * lvl_fps;
+  if (!(frame_rate <= limit_rate)) return 0;
+
+  if (!(width <= lvl_width * lvl_dim_mult)) return 0;
+  return height <= lvl_height * lvl_dim_mult;
+}
+
+// Infers a sequence level from the frame dimensions and initial frame rate,
+// then sets, for every operating point, seq_level_idx and the bitrate and
+// buffer-size parameters. A target level from ppi->level_params replaces the
+// inferred one when it is below SEQ_LEVELS and above the inferred level.
+// Reports AOM_CODEC_UNSUP_BITSTREAM through aom_internal_error() when
+// av1_max_level_bitrate() returns 0.
+static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
+                                     int height, double init_framerate) {
+  // One row per candidate level, in the order in which they are tested.
+  typedef struct {
+    int lvl_width;
+    int lvl_height;
+    double lvl_fps;
+    int lvl_dim_mult;
+    AV1_LEVEL level;
+  } LevelCandidate;
+  static const LevelCandidate base_levels[] = {
+    { 512, 288, 30.0, 4, SEQ_LEVEL_2_0 },
+    { 704, 396, 30.0, 4, SEQ_LEVEL_2_1 },
+    { 1088, 612, 30.0, 4, SEQ_LEVEL_3_0 },
+    { 1376, 774, 30.0, 4, SEQ_LEVEL_3_1 },
+    { 2048, 1152, 30.0, 3, SEQ_LEVEL_4_0 },
+    { 2048, 1152, 60.0, 3, SEQ_LEVEL_4_1 },
+    { 4096, 2176, 30.0, 2, SEQ_LEVEL_5_0 },
+    { 4096, 2176, 60.0, 2, SEQ_LEVEL_5_1 },
+    { 4096, 2176, 120.0, 2, SEQ_LEVEL_5_2 },
+    { 8192, 4352, 30.0, 2, SEQ_LEVEL_6_0 },
+    { 8192, 4352, 60.0, 2, SEQ_LEVEL_6_1 },
+    { 8192, 4352, 120.0, 2, SEQ_LEVEL_6_2 },
+  };
+  const int num_base_levels =
+      (int)(sizeof(base_levels) / sizeof(base_levels[0]));
+
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const AV1LevelParams *const level_params = &ppi->level_params;
+
+  // TODO(any): This is a placeholder function that only addresses dimensions
+  // and max display sample rates.
+  // Need to add checks for max bit rate, max decoded luma sample rate, header
+  // rate, etc. that are not covered by this function.
+  AV1_LEVEL level = SEQ_LEVEL_MAX;
+  int base_level_found = 0;
+  for (int k = 0; k < num_base_levels; ++k) {
+    const LevelCandidate *const cand = &base_levels[k];
+    if (does_level_match(width, height, init_framerate, cand->lvl_width,
+                         cand->lvl_height, cand->lvl_fps,
+                         cand->lvl_dim_mult)) {
+      level = cand->level;
+      base_level_found = 1;
+      break;
+    }
+  }
+#if CONFIG_CWG_C013
+  // TODO(bohanli): currently target level is only working for the 0th operating
+  // point, so scalable coding is not supported.
+  if (!base_level_found &&
+      level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 &&
+      level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) {
+    // Only use level 7.x to 8.x when explicitly asked to.
+    static const LevelCandidate ext_levels[] = {
+      { 16384, 8704, 30.0, 2, SEQ_LEVEL_7_0 },
+      { 16384, 8704, 60.0, 2, SEQ_LEVEL_7_1 },
+      { 16384, 8704, 120.0, 2, SEQ_LEVEL_7_2 },
+      { 32768, 17408, 30.0, 2, SEQ_LEVEL_8_0 },
+      { 32768, 17408, 60.0, 2, SEQ_LEVEL_8_1 },
+      { 32768, 17408, 120.0, 2, SEQ_LEVEL_8_2 },
+    };
+    const int num_ext_levels =
+        (int)(sizeof(ext_levels) / sizeof(ext_levels[0]));
+    for (int k = 0; k < num_ext_levels; ++k) {
+      const LevelCandidate *const cand = &ext_levels[k];
+      if (does_level_match(width, height, init_framerate, cand->lvl_width,
+                           cand->lvl_height, cand->lvl_fps,
+                           cand->lvl_dim_mult)) {
+        level = cand->level;
+        break;
+      }
+    }
+  }
+#else
+  (void)base_level_found;
+#endif
+
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) ||
+           level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS);
+    // If a higher target level is specified, it is then used rather than the
+    // inferred one from resolution and framerate.
+    if (level_params->target_seq_level_idx[i] < SEQ_LEVELS &&
+        level_params->target_seq_level_idx[i] > level) {
+      seq_params->seq_level_idx[i] = level_params->target_seq_level_idx[i];
+    } else {
+      seq_params->seq_level_idx[i] = level;
+    }
+    // Set the maximum parameters for bitrate and buffer size for this profile,
+    // level, and tier
+    seq_params->op_params[i].bitrate = av1_max_level_bitrate(
+        seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]);
+    // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
+    // check
+    if (seq_params->op_params[i].bitrate == 0) {
+      aom_internal_error(
+          &ppi->error, AOM_CODEC_UNSUP_BITSTREAM,
+          "AV1 does not support this combination of profile, level, and tier.");
+    }
+    // Buffer size in bits/s is bitrate in bits/s * 1 s
+    seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
+  }
+}
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf,
+ int disable_frame_id_numbers) { /* Initializes the sequence-level coding
+ * tool flags, maximum frame dimensions, level/tier data and operating
+ * point descriptors in ppi->seq_params from the encoder configuration.
+ *   ppi                       primary encoder state that owns seq_params.
+ *   oxcf                      encoder configuration that is read.
+ *   disable_frame_id_numbers  non-zero forces frame_id_numbers_present_flag
+ *                             to 0. */
+ SequenceHeader *const seq = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+ seq->still_picture =
+ !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1);
+ seq->reduced_still_picture_hdr =
+ seq->still_picture && !tool_cfg->full_still_picture_hdr;
+ seq->force_screen_content_tools = 2;  // NOTE(review): 2 presumably means "select per frame" - confirm
+ seq->force_integer_mv = 2;  // NOTE(review): same assumption as above
+ seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint;
+ seq->frame_id_numbers_present_flag =
+ !seq->reduced_still_picture_hdr &&
+ !oxcf->tile_cfg.enable_large_scale_tile &&
+ tool_cfg->error_resilient_mode && !disable_frame_id_numbers;
+ if (seq->reduced_still_picture_hdr) {
+ seq->order_hint_info.enable_order_hint = 0;
+ seq->force_screen_content_tools = 2;  // same value as assigned above
+ seq->force_integer_mv = 2;  // same value as assigned above
+ }
+ seq->order_hint_info.order_hint_bits_minus_1 =
+ seq->order_hint_info.enable_order_hint
+ ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
+ : -1;
+
+ seq->max_frame_width = frm_dim_cfg->forced_max_frame_width
+ ? frm_dim_cfg->forced_max_frame_width
+ : frm_dim_cfg->width;
+ seq->max_frame_height = frm_dim_cfg->forced_max_frame_height
+ ? frm_dim_cfg->forced_max_frame_height
+ : frm_dim_cfg->height;
+ seq->num_bits_width =
+ (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
+ seq->num_bits_height =
+ (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1;
+ assert(seq->num_bits_width <= 16);
+ assert(seq->num_bits_height <= 16);
+
+ seq->frame_id_length = FRAME_ID_LENGTH;
+ seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+ seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+ seq->order_hint_info.enable_dist_wtd_comp =
+ oxcf->comp_type_cfg.enable_dist_wtd_comp;
+ seq->order_hint_info.enable_dist_wtd_comp &=
+ seq->order_hint_info.enable_order_hint;  // masked off without order hints
+ seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
+ seq->order_hint_info.enable_ref_frame_mvs &=
+ seq->order_hint_info.enable_order_hint;  // masked off without order hints
+ seq->enable_superres = oxcf->superres_cfg.enable_superres;
+ seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
+ seq->enable_restoration = tool_cfg->enable_restoration;
+ seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+ seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+ seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+ seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+ seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
+
+ set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
+ oxcf->input_cfg.init_framerate);
+
+ if (seq->operating_points_cnt_minus_1 == 0) {
+ seq->operating_point_idc[0] = 0;
+ } else {
+ // Set operating_point_idc[] such that the i=0 point corresponds to the
+ // highest quality operating point (all layers), and subsequent
+ // operating points (i > 0) are lower quality corresponding to
+ // skip decoding enhancement layers (temporal first).
+ int i = 0;
+ assert(seq->operating_points_cnt_minus_1 ==
+ (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1));
+ for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+ for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
+ seq->operating_point_idc[i] =  // spatial-layer mask in bits 8 and up, temporal mask below
+ (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+ ~(~0u << (ppi->number_temporal_layers - tl));
+ i++;
+ }
+ }
+ }
+}
+
+// Performs the first-time setup of ppi->seq_params from 'oxcf': layer counts,
+// stream format, colour description, timing and decoder model information and
+// chroma subsampling. Finishes by calling av1_change_config_seq().
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+                                 const AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq = &ppi->seq_params;
+  const DecoderModelCfg *const dm_cfg = &oxcf->dec_model_cfg;
+  const ColorCfg *const colors = &oxcf->color_cfg;
+
+  // Start with a single spatial and a single temporal layer, SVC disabled.
+  ppi->use_svc = 0;
+  ppi->number_spatial_layers = 1;
+  ppi->number_temporal_layers = 1;
+
+  // Stream format.
+  seq->profile = oxcf->profile;
+  seq->bit_depth = oxcf->tool_cfg.bit_depth;
+  seq->use_highbitdepth = oxcf->use_highbitdepth;
+  seq->monochrome = oxcf->tool_cfg.enable_monochrome;
+
+  // Colour description.
+  seq->color_primaries = colors->color_primaries;
+  seq->transfer_characteristics = colors->transfer_characteristics;
+  seq->matrix_coefficients = colors->matrix_coefficients;
+  seq->chroma_sample_position = colors->chroma_sample_position;
+  seq->color_range = colors->color_range;
+
+  // Timing information.
+  seq->timing_info_present = dm_cfg->timing_info_present;
+  seq->timing_info.num_units_in_display_tick =
+      dm_cfg->timing_info.num_units_in_display_tick;
+  seq->timing_info.time_scale = dm_cfg->timing_info.time_scale;
+  seq->timing_info.equal_picture_interval =
+      dm_cfg->timing_info.equal_picture_interval;
+  seq->timing_info.num_ticks_per_picture =
+      dm_cfg->timing_info.num_ticks_per_picture;
+
+  // Display / decoder model.
+  seq->display_model_info_present_flag =
+      dm_cfg->display_model_info_present_flag;
+  seq->decoder_model_info_present_flag =
+      dm_cfg->decoder_model_info_present_flag;
+  if (dm_cfg->decoder_model_info_present_flag) {
+    // set the decoder model parameters in schedule mode
+    seq->decoder_model_info.num_units_in_decoding_tick =
+        dm_cfg->num_units_in_decoding_tick;
+    ppi->buffer_removal_time_present = 1;
+    av1_set_aom_dec_model_info(&seq->decoder_model_info);
+    av1_set_dec_model_op_parameters(&seq->op_params[0]);
+  } else if (seq->timing_info_present &&
+             seq->timing_info.equal_picture_interval &&
+             !seq->decoder_model_info_present_flag) {
+    // set the decoder model parameters in resource availability mode
+    av1_set_resource_availability_parameters(&seq->op_params[0]);
+  } else {
+    // Default value (not signaled)
+    seq->op_params[0].initial_display_delay = 10;
+  }
+
+  // Chroma subsampling, decided by the first matching rule.
+  int ss_x, ss_y;
+  if (seq->monochrome) {
+    ss_x = 1;
+    ss_y = 1;
+  } else if (seq->color_primaries == AOM_CICP_CP_BT_709 &&
+             seq->transfer_characteristics == AOM_CICP_TC_SRGB &&
+             seq->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+    ss_x = 0;
+    ss_y = 0;
+  } else if (seq->profile == 0) {
+    ss_x = 1;
+    ss_y = 1;
+  } else if (seq->profile == 1) {
+    ss_x = 0;
+    ss_y = 0;
+  } else if (seq->bit_depth == AOM_BITS_12) {
+    // Only here is the subsampling taken from the input configuration.
+    ss_x = oxcf->input_cfg.chroma_subsampling_x;
+    ss_y = oxcf->input_cfg.chroma_subsampling_y;
+  } else {
+    ss_x = 1;
+    ss_y = 0;
+  }
+  seq->subsampling_x = ss_x;
+  seq->subsampling_y = ss_y;
+
+  av1_change_config_seq(ppi, oxcf, NULL);
+}
+
+static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { /*
+ * First-time configuration of a frame-level encoder instance: copies
+ * 'oxcf' into cpi, sets the frame dimensions, allocates the compressor
+ * data, resets the SVC and rtc_ref state, and then runs av1_change_config()
+ * for the settings shared with later reconfiguration. */
+ AV1_COMMON *const cm = &cpi->common;
+ ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->input_cfg.init_framerate;
+
+ cm->width = oxcf->frm_dim_cfg.width;
+ cm->height = oxcf->frm_dim_cfg.height;
+ cpi->is_dropped_frame = false;
+
+ alloc_compressor_data(cpi);
+
+ cpi->data_alloc_width = cm->width;  // dimensions the data above was allocated for
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cpi->counts;
+
+ // Init SVC parameters.
+ cpi->svc.number_spatial_layers = 1;
+ cpi->svc.number_temporal_layers = 1;
+ cm->spatial_layer_id = 0;
+ cm->temporal_layer_id = 0;
+ // Init rtc_ref parameters.
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[0] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[1] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[2] = 0;
+
+ // change includes all joint functionality
+ av1_change_config(cpi, oxcf, false);
+
+ cpi->ref_frame_flags = 0;
+
+ // Reset resize pending flags
+ resize_pending_params->width = 0;
+ resize_pending_params->height = 0;
+
+ // Setup identity scale factor
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1);
+
+ init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+}
+
+void av1_change_config_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ bool *is_sb_size_changed) { /* Applies a (new) encoder configuration to the
+ * sequence-level state in 'ppi'.
+ *   ppi                 primary encoder state; ppi->seq_params is updated.
+ *   oxcf                configuration to apply.
+ *   is_sb_size_changed  optional (may be NULL); set to true when the
+ *                       superblock size differs from its previous value.
+ *                       It is never written as false here.
+ * Superblock size, tiers, operating point count and the sequence coding
+ * tools are only touched while ppi->seq_params_locked is not set. */
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+ const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+ seq_params->color_primaries = color_cfg->color_primaries;
+ seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+ seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+ seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+ seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+ seq_params->color_range = color_cfg->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ dec_model_cfg->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ dec_model_cfg->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ dec_model_cfg->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ dec_model_cfg->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ dec_model_cfg->decoder_model_info_present_flag;
+ if (dec_model_cfg->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ dec_model_cfg->num_units_in_decoding_tick;
+ ppi->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ av1_update_film_grain_parameters_seq(ppi, oxcf);
+
+ int sb_size = seq_params->sb_size;  // remembered to detect a change below
+ // Superblock size should not be updated after the first key frame.
+ if (!ppi->seq_params_locked) {
+ set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width,
+ frm_dim_cfg->height,
+ ppi->number_spatial_layers));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;  // bit i of tier_mask
+ }
+ if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size)
+ *is_sb_size_changed = true;
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!ppi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
+ ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
+ : 0;
+ av1_init_seq_coding_tools(
+ ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config);
+ }
+ seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ set_primary_rc_buffer_sizes(oxcf, ppi);
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool is_sb_size_changed) { /* Applies a (new) encoder configuration to the
+ * frame-level encoder state.
+ *   cpi                 encoder instance; cpi->oxcf is overwritten with *oxcf.
+ *   oxcf                configuration to apply.
+ *   is_sb_size_changed  when true, the frame-size dependent buffers are
+ *                       freed and reallocated even if the frame did not grow.
+ * Buffers that are still NULL (level info, palette buffer, convolve and
+ * prediction scratch buffers) are allocated here; allocation failures are
+ * handled by CHECK_MEM_ERROR. */
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;  // points into cpi->oxcf, so it sees the copy made below
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ FeatureFlags *const features = &cm->features;
+
+ // in case of LAP, lag in frames is set according to number of lap buffers
+ // calculated at init time. This stores and restores LAP's lag in frames to
+ // prevent override by new cfg.
+ int lap_lag_in_frames = -1;
+ if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+ lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+ }
+
+ cpi->oxcf = *oxcf;
+
+ av1_update_film_grain_parameters(cpi, oxcf);
+
+ // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
+ // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
+ // that any analysis (e.g. TPL) happening outside the main encoding loop still
+ // happens at full resolution.
+ // This value will later be set appropriately just before main encoding loop.
+ cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO
+ ? AOM_SUPERRES_NONE
+ : oxcf->superres_cfg.superres_mode; // default
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+ sizeof(level_params->target_seq_level_idx));
+ level_params->keep_level_stats = 0;  // rebuilt below as a bit mask, one bit per operating point
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (level_params->target_seq_level_idx[i] < SEQ_LEVELS ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) {
+ level_params->keep_level_stats |= 1u << i;
+ if (!level_params->level_info[i]) {  // allocate once, reuse afterwards
+ CHECK_MEM_ERROR(cm, level_params->level_info[i],
+ aom_calloc(1, sizeof(*level_params->level_info[i])));
+ }
+ }
+ }
+
+ // TODO(huisu@): level targeting currently only works for the 0th operating
+ // point, so scalable coding is not supported yet.
+ if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+ // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0],
+ seq_params->tier[0]);
+ }
+
+ if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) {
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else if (!is_one_pass_rt_params(cpi) ||
+ cm->current_frame.frame_number == 0) {
+ // For rtc mode: logic for setting the baseline_gf_interval is done
+ // in av1_get_one_pass_rt_params(), and it should not be reset here in
+ // change_config(), unless after init_config (first frame).
+ p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ refresh_frame->golden_frame = false;
+ refresh_frame->bwd_ref_frame = false;
+
+ features->refresh_frame_context =
+ (oxcf->tool_cfg.frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames
+ // to store intermediate inter mode prediction results and are not required
+ // for allintra encoding mode. Hence, the memory allocations for these buffers
+ // are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ if (x->comp_rd_buffer.pred0 == NULL)
+ alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer);
+
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_pred_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_pred_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+ }
+ }
+ }
+
+ av1_reset_segment_features(cm);
+
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+
+ // Set up frame rate and related parameters rate control values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = rc_cfg->worst_allowed_q;
+ rc->best_quality = rc_cfg->best_allowed_q;
+
+ // If lossless has been requested make sure average Q accumulators are reset.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ int i;
+ for (i = 0; i < FRAME_TYPES; ++i) {
+ p_rc->avg_frame_qindex[i] = 0;
+ }
+ }
+
+ features->interp_filter =
+ oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+ if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
+ cm->render_width = frm_dim_cfg->render_width;
+ cm->render_height = frm_dim_cfg->render_height;
+ } else {  // no explicit render size: use the coded frame size
+ cm->render_width = frm_dim_cfg->width;
+ cm->render_height = frm_dim_cfg->height;
+ }
+ cm->width = frm_dim_cfg->width;
+ cm->height = frm_dim_cfg->height;
+
+ if (cm->width > cpi->data_alloc_width ||
+ cm->height > cpi->data_alloc_height || is_sb_size_changed) {  // existing allocation no longer fits
+ av1_free_context_buffers(cm);
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+ av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ cpi->td.firstpass_ctx = NULL;
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+ }
+ av1_update_frame_size(cpi);
+
+ rc->is_src_frame_alt_ref = 0;
+
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config)
+ cpi->ext_flags.refresh_frame.update_pending = 0;
+ cpi->ext_flags.refresh_frame_context_pending = 0;
+
+ if (cpi->ppi->use_svc)
+ av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
+
+ check_reset_rc_flag(cpi);
+
+ // restore the value of lag_in_frame for LAP stage.
+ if (lap_lag_in_frames != -1) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+ }
+
+#if CONFIG_REALTIME_ONLY
+ assert(!oxcf->tool_cfg.enable_global_motion);
+ cpi->image_pyramid_levels = 0;
+#else
+ if (oxcf->tool_cfg.enable_global_motion) {
+ cpi->image_pyramid_levels =
+ global_motion_pyr_levels[default_global_motion_method];
+ } else {
+ cpi->image_pyramid_levels = 0;
+ }
+#endif // CONFIG_REALTIME_ONLY
+}
+
+// Copies the frame geometry and sample format of 'cm' into 'frame_info'.
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+                                   const AV1_COMMON *const cm) {
+  const SequenceHeader *const seq = cm->seq_params;
+  const CommonModeInfoParams *const mi = &cm->mi_params;
+
+  // Frame size in pixels.
+  frame_info->frame_width = cm->width;
+  frame_info->frame_height = cm->height;
+
+  // Frame size in mode-info units and in macroblocks.
+  frame_info->mi_rows = mi->mi_rows;
+  frame_info->mi_cols = mi->mi_cols;
+  frame_info->mb_rows = mi->mb_rows;
+  frame_info->mb_cols = mi->mb_cols;
+  frame_info->num_mbs = mi->MBs;
+
+  // Sample format taken from the sequence header.
+  frame_info->bit_depth = seq->bit_depth;
+  frame_info->subsampling_x = seq->subsampling_x;
+  frame_info->subsampling_y = seq->subsampling_y;
+}
+
+// Resets the shown-frame counter of 'frame_index_set' to zero.
+static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) {
+  FRAME_INDEX_SET *const index_set = frame_index_set;
+  index_set->show_frame_count = 0;
+}
+
+// Advances both per-shown-frame counters: the shown-frame count in
+// cpi->frame_index_set and the frame number in cpi->common.current_frame.
+// The caller must only invoke this for a frame that is shown (asserted).
+static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(cm->show_frame);
+  cm->current_frame.frame_number += 1;
+  cpi->frame_index_set.show_frame_count += 1;
+}
+
+AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf) {
+ AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY));
+ if (!ppi) return NULL;
+ av1_zero(*ppi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(ppi->error.jmp)) {
+ ppi->error.setjmp = 0;
+ av1_remove_primary_compressor(ppi);
+ return 0;
+ }
+ ppi->error.setjmp = 1;
+
+ ppi->seq_params_locked = 0;
+ ppi->lap_enabled = num_lap_buffers > 0;
+ ppi->output_pkt_list = pkt_list_head;
+ ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+ ppi->frames_left = oxcf->input_cfg.limit;
+ ppi->num_fp_contexts = 1;
+
+ init_config_sequence(ppi, oxcf);
+
+#if CONFIG_ENTROPY_STATS
+ av1_zero(ppi->aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ av1_primary_rc_init(oxcf, &ppi->p_rc);
+
+ // For two pass and lag_in_frames > 33 in LAP.
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+ if (ppi->lap_enabled) {
+ if ((num_lap_buffers <
+ (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
+ num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
+ /*
+ * For lag in frames >= 19 and <33, enable scenecut
+ * with limited future frame prediction.
+ */
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+ } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
+ // Disable scenecut when lag_in_frames < 19.
+ ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT;
+ }
+ }
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF;
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+#endif
+
+#define SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d)
+ SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d)
+ SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d)
+ SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d)
+ SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d)
+
+ SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d)
+ SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d)
+ SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d)
+
+ SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d)
+ SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d)
+ SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d)
+ SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d)
+ SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d)
+
+ SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+ SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d)
+ SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d)
+ SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d)
+ SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d)
+ SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d)
+#endif
+#undef SDSFP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ {
+ // As cm->mi_params is a part of the frame level context (cpi), it is
+ // unavailable at this point. mi_params is created as a local temporary
+ // variable, to be passed into the functions used for allocating tpl
+ // buffers. The values in this variable are populated according to initial
+ // width and height of the frame.
+ CommonModeInfoParams mi_params;
+ enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ BLOCK_4X4);
+
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params.mi_cols + w - 1) / w;
+ const int num_rows = (mi_params.mi_rows + h - 1) / h;
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, ppi->tpl_sb_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*ppi->tpl_sb_rdmult_scaling_factors)));
+
+#if CONFIG_INTERNAL_STATS
+ ppi->b_calculate_blockiness = 1;
+ ppi->b_calculate_consistency = 1;
+
+ for (int i = 0; i <= STAT_ALL; i++) {
+ ppi->psnr[0].stat[i] = 0;
+ ppi->psnr[1].stat[i] = 0;
+
+ ppi->fastssim.stat[i] = 0;
+ ppi->psnrhvs.stat[i] = 0;
+ }
+
+ ppi->psnr[0].worst = 100.0;
+ ppi->psnr[1].worst = 100.0;
+ ppi->worst_ssim = 100.0;
+ ppi->worst_ssim_hbd = 100.0;
+
+ ppi->count[0] = 0;
+ ppi->count[1] = 0;
+ ppi->total_bytes = 0;
+
+ if (ppi->b_calculate_psnr) {
+ ppi->total_sq_error[0] = 0;
+ ppi->total_samples[0] = 0;
+ ppi->total_sq_error[1] = 0;
+ ppi->total_samples[1] = 0;
+ ppi->total_recode_hits = 0;
+ ppi->summed_quality = 0;
+ ppi->summed_weights = 0;
+ ppi->summed_quality_hbd = 0;
+ ppi->summed_weights_hbd = 0;
+ }
+
+ ppi->fastssim.worst = 100.0;
+ ppi->psnrhvs.worst = 100.0;
+
+ if (ppi->b_calculate_blockiness) {
+ ppi->total_blockiness = 0;
+ ppi->worst_blockiness = 0.0;
+ }
+
+ ppi->total_inconsistency = 0;
+ ppi->worst_consistency = 100.0;
+ if (ppi->b_calculate_consistency) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars,
+ aom_malloc(sizeof(*ppi->ssim_vars) * 4 *
+ mi_params.mi_rows * mi_params.mi_cols));
+ }
+#endif
+ }
+
+ ppi->error.setjmp = 0;
+ return ppi;
+}
+
+AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ BufferPool *const pool, COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames) {  // Allocates and initializes a frame-level AV1_COMP bound to ppi; returns it, or NULL if an early allocation fails or the setjmp() error path below is taken.
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));  // 32-byte aligned; presumably volatile because it is read again in the setjmp() error path
+
+ if (!cpi) return NULL;
+
+ av1_zero(*cpi);
+
+ cpi->ppi = ppi;
+
+ AV1_COMMON *volatile const cm = &cpi->common;
+ cm->seq_params = &ppi->seq_params;  // sequence parameters are shared with the primary context, not copied
+ cm->error =
+ (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
+ if (!cm->error) {
+ aom_free(cpi);  // setjmp() is not armed yet, so clean up by hand
+ return NULL;
+ }
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {  // non-zero return: control arrived here through a longjmp to cm->error->jmp
+ cm->error->setjmp = 0;
+ av1_remove_compressor(cpi);  // releases whatever was set up so far, including cpi itself
+ return NULL;
+ }
+
+ cm->error->setjmp = 1;
+ cpi->compressor_stage = stage;
+
+ cpi->do_frame_data_update = true;
+
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->free_mi = enc_free_mi;
+ mi_params->setup_mi = enc_setup_mi;
+ mi_params->set_mb_mi =
+ (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE)  // first pass and LAP stage get the stat_stage callback
+ ? stat_stage_set_mb_mi
+ : enc_set_mb_mi;
+
+ mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));  // both frame contexts are cleared explicitly after allocation
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ if (cpi->compressor_stage == LAP_STAGE) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;  // LAP stage overrides the configured lag with the caller-supplied value
+ }
+
+ av1_rc_init(&cpi->oxcf, &cpi->rc);
+
+ init_frame_info(&cpi->frame_info, cm);
+ init_frame_index_set(&cpi->frame_index_set);
+
+ cm->current_frame.frame_number = 0;
+ cpi->rc.frame_number_encoded = 0;
+ cpi->rc.prev_frame_is_dropped = 0;
+ cpi->rc.max_consec_drop = INT_MAX;
+ cpi->rc.drop_count_consec = 0;
+ cm->current_frame_id = -1;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf = NULL;
+ realloc_segmentation_maps(cpi);
+
+ cpi->refresh_frame.alt_ref_frame = false;
+
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+ cpi->time_stamps.first_ts_start = INT64_MAX;
+
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");  // debug dump file; the result is not checked here
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+ yuv_denoised_file = fopen("denoised.yuv", "wb");  // debug dump file; the result is not checked here
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_stat_consumption_stage(cpi)) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);  // number of whole FIRSTPASS_STATS records in the supplied buffer
+
+ if (!cpi->ppi->lap_enabled) {
+ /*Re-initialize to stats buffer, populated by application in the case of
+ * two pass*/
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start =
+ oxcf->twopass_stats_in.buf;
+ cpi->twopass_frame.stats_in =
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end =
+ &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1];  // points at the last record (index packets - 1)
+
+ // The buffer size is packets - 1 because the last packet is total_stats.
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info,
+ oxcf->twopass_stats_in.buf, packets - 1);
+ av1_init_second_pass(cpi);
+ } else {
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0);
+ av1_init_single_pass_lap(cpi);
+ }
+ }
+#endif
+
+ // The buffer "obmc_buffer" is used in inter frames for fast obmc search.
+ // Hence, the memory allocation for the same is avoided for allintra encoding
+ // mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0)
+ alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error);
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+ cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ int max_mi_cols = mi_params->mi_cols;
+ int max_mi_rows = mi_params->mi_rows;
+ if (oxcf->frm_dim_cfg.forced_max_frame_width) {  // a non-zero forced maximum replaces the current mi_cols for the sizing below
+ max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width);
+ }
+ if (oxcf->frm_dim_cfg.forced_max_frame_height) {  // same for rows
+ max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height);
+ }
+
+ const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2;  // one entry per four mi units
+ CHECK_MEM_ERROR(
+ cm, cpi->consec_zero_mv,
+ aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size;
+
+ cpi->mb_weber_stats = NULL;
+ cpi->mb_delta_q = NULL;
+ cpi->palette_pixel_num = 0;
+ cpi->scaled_last_source_available = 0;
+
+ {
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (max_mi_cols + w - 1) / w;  // ceiling division: 16x16 blocks per row
+ const int num_rows = (max_mi_rows + h - 1) / h;  // ceiling division: 16x16 blocks per column
+ CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->ssim_rdmult_scaling_factors)));
+ CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_rdmult_scaling_factors)));
+ }
+
+#if CONFIG_TUNE_VMAF
+ {
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;  // sized from the current mi dimensions, not max_mi_cols/max_mi_rows
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+ for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+ cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+ cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+ cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+ }
+ cpi->vmaf_info.original_qindex = -1;
+ cpi->vmaf_info.vmaf_model = NULL;
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ {
+ const int w = mi_size_wide[butteraugli_rdo_bsize];
+ const int h = mi_size_high[butteraugli_rdo_bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(
+ cm, cpi->butteraugli_info.rdmult_scaling_factors,
+ aom_malloc(num_rows * num_cols *  // note: aom_malloc here, whereas the other scaling-factor buffers use aom_calloc
+ sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
+ memset(&cpi->butteraugli_info.source, 0,
+ sizeof(cpi->butteraugli_info.source));
+ memset(&cpi->butteraugli_info.resized_source, 0,
+ sizeof(cpi->butteraugli_info.resized_source));
+ cpi->butteraugli_info.recon_set = false;
+ }
+#endif
+
+#if CONFIG_SALIENCY_MAP
+ {
+ CHECK_MEM_ERROR(cm, cpi->saliency_map,
+ (uint8_t *)aom_calloc(cm->height * cm->width,  // one byte per pixel of the current frame size
+ sizeof(*cpi->saliency_map)));
+ // Buffer initialization based on MIN_MIB_SIZE_LOG2 to ensure that
+ // cpi->sm_scaling_factor buffer is allocated big enough, since we have no
+ // idea of the actual superblock size we are going to use yet.
+ const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int max_sb_cols =
+ (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb;
+ const int max_sb_rows =
+ (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb;
+ CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor,
+ (double *)aom_calloc(max_sb_rows * max_sb_cols,
+ sizeof(*cpi->sm_scaling_factor)));
+ }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ av1_zero(cpi->partition_stats);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Initialize the members of DeltaQuantParams with INT_MAX to ensure that
+ // the quantizer tables are correctly initialized using the default deltaq
+ // parameters when av1_init_quantizer is called for the first time.
+ DeltaQuantParams *const prev_deltaq_params =
+ &cpi->enc_quant_dequant_params.prev_deltaq_params;
+ prev_deltaq_params->y_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_dc_delta_q = INT_MAX;
+ prev_deltaq_params->v_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_ac_delta_q = INT_MAX;
+ prev_deltaq_params->v_ac_delta_q = INT_MAX;
+
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->frm_dim_cfg.width;
+ cm->superres_upscaled_height = oxcf->frm_dim_cfg.height;
+#if !CONFIG_REALTIME_ONLY
+ av1_loop_restoration_precal();
+#endif
+
+ cpi->third_pass_ctx = NULL;
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+ av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL);
+ }
+
+ cpi->second_pass_log_stream = NULL;
+ cpi->use_ducky_encode = 0;
+
+ cm->error->setjmp = 0;  // disarm before returning: the jmp_buf is only valid inside this function
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS  // string-append helpers, only defined when internal stats are compiled in
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))  // appends T (passed as the format string) after the current contents of H; sizeof(H) is used as the capacity, so H is expected to be a char array rather than a pointer
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))  // same as SNPRINT, with one value V supplied for format string T
+#endif // CONFIG_INTERNAL_STATS
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {  // Releases the resources held by the primary encoder context and then ppi itself; a NULL ppi is a no-op.
+ if (!ppi) return;
+#if !CONFIG_REALTIME_ONLY
+ av1_tf_info_free(&ppi->tf_info);
+#endif // !CONFIG_REALTIME_ONLY
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ aom_free(ppi->level_params.level_info[i]);  // one level_info entry per operating point
+ }
+ av1_lookahead_destroy(ppi->lookahead);
+
+ aom_free(ppi->tpl_sb_rdmult_scaling_factors);
+ ppi->tpl_sb_rdmult_scaling_factors = NULL;
+
+ TplParams *const tpl_data = &ppi->tpl_data;
+ aom_free(tpl_data->txfm_stats_list);
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {  // per-frame TPL stats and reconstruction buffers
+ aom_free(tpl_data->tpl_stats_pool[frame]);
+ aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+ tpl_data->tpl_stats_pool[frame] = NULL;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
+
+ av1_terminate_workers(ppi);  // workers are terminated before their thread data and arrays are freed below
+ free_thread_data(ppi);
+
+ aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
+ aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
+
+ aom_free(ppi);  // ppi must not be used by the caller after this point
+}
+
+// Releases a frame-level compressor.
+//
+// Frees the error info, the thread context, the multithreading sync objects,
+// the third-pass context, the second-pass log, the remaining compressor data
+// and finally `cpi` itself. A NULL `cpi` is a no-op; `cpi` must not be used
+// after this call. This is also invoked from the setjmp() error path of
+// av1_create_compressor(), i.e. possibly on a partially initialized `cpi`.
+void av1_remove_compressor(AV1_COMP *cpi) {
+  if (!cpi) return;
+#if CONFIG_RATECTRL_LOG
+  if (cpi->oxcf.pass == 3) {
+    rc_log_show(&cpi->rc_log);
+  }
+#endif  // CONFIG_RATECTRL_LOG
+
+  AV1_COMMON *cm = &cpi->common;
+  // Statistics are only printed once at least one frame has been counted.
+  if (cm->current_frame.frame_number > 0) {
+#if CONFIG_SPEED_STATS
+    if (!is_stat_generation_stage(cpi)) {
+      fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+    }
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    if (!is_stat_generation_stage(cpi)) {
+      av1_print_fr_partition_timing_stats(&cpi->partition_stats,
+                                          "fr_part_timing_data.csv");
+    }
+#endif
+  }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  av1_denoiser_free(&(cpi->denoiser));
+#endif
+
+  if (cm->error) {
+    // Help detect use after free of the error detail string.
+    memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1);
+    cm->error->detail[sizeof(cm->error->detail) - 1] = '\0';
+    aom_free(cm->error);
+  }
+  aom_free(cpi->td.tctx);
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+#if CONFIG_MULTITHREAD
+  // Each sync object is destroyed and freed only if it was actually created.
+  pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+  pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
+  pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_;
+  pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
+  if (enc_row_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(enc_row_mt_mutex_);
+    aom_free(enc_row_mt_mutex_);
+  }
+  if (enc_row_mt_cond_ != NULL) {
+    pthread_cond_destroy(enc_row_mt_cond_);
+    aom_free(enc_row_mt_cond_);
+  }
+  if (gm_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(gm_mt_mutex_);
+    aom_free(gm_mt_mutex_);
+  }
+  if (tpl_error_mutex_ != NULL) {
+    pthread_mutex_destroy(tpl_error_mutex_);
+    aom_free(tpl_error_mutex_);
+  }
+  if (pack_bs_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(pack_bs_mt_mutex_);
+    aom_free(pack_bs_mt_mutex_);
+  }
+#endif
+  av1_row_mt_mem_dealloc(cpi);
+
+  // The remaining sync structures are only released when more than one
+  // worker was configured.
+  if (mt_info->num_workers > 1) {
+    av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+    av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+    av1_cdef_mt_dealloc(&mt_info->cdef_sync);
+#if !CONFIG_REALTIME_ONLY
+    av1_loop_restoration_dealloc(&mt_info->lr_row_sync);
+    av1_tf_mt_dealloc(&mt_info->tf_sync);
+#endif
+  }
+
+  av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+
+  av1_close_second_pass_log(cpi);
+
+  dealloc_compressor_data(cpi);
+
+  av1_ext_part_delete(&cpi->ext_part_controller);
+
+  av1_remove_common(cm);
+
+  aom_free(cpi);
+
+#ifdef OUTPUT_YUV_REC
+  // The handle is NULL when fopen() failed or when this runs from the
+  // creation error path before the file was opened; fclose(NULL) is undefined
+  // behavior. Resetting it also prevents a second compressor teardown from
+  // closing the same stream twice.
+  if (yuv_rec_file != NULL) {
+    fclose(yuv_rec_file);
+    yuv_rec_file = NULL;
+  }
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+  // Same guard as for the reconstruction dump above.
+  if (yuv_denoised_file != NULL) {
+    fclose(yuv_denoised_file);
+    yuv_denoised_file = NULL;
+  }
+#endif
+}
+
+// Computes PSNR statistics between cpi->source and the current reconstructed
+// frame and appends them to the output packet list as an AOM_CODEC_PSNR_PKT.
+static void generate_psnr_packet(AV1_COMP *cpi) {
+  struct aom_codec_cx_pkt pkt;
+  PSNR_STATS psnr;
+
+  // Start from an all-zero packet. The *_hbd fields below are only written
+  // under a high-bitdepth condition, yet the whole struct is handed to
+  // aom_codec_pkt_list_add(); without this, those fields (and any other
+  // unwritten members) would carry indeterminate stack contents.
+  memset(&pkt, 0, sizeof(pkt));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
+
+  // Four entries per statistic are copied.
+  for (int i = 0; i < 4; ++i) {
+    pkt.data.psnr.samples[i] = psnr.samples[i];
+    pkt.data.psnr.sse[i] = psnr.sse[i];
+    pkt.data.psnr.psnr[i] = psnr.psnr[i];
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // The high-bitdepth variants are only reported for a high-bitdepth source
+  // whose input bit depth is lower than the internal one.
+  if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+      (in_bit_depth < bit_depth)) {
+    for (int i = 0; i < 4; ++i) {
+      pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i];
+      pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i];
+      pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i];
+    }
+  }
+#endif
+
+  pkt.kind = AOM_CODEC_PSNR_PKT;
+  aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt);
+}
+
+// Stores `ref_frame_flags` into `*ext_ref_frame_flags`.
+// Returns 0 on success, or -1 (leaving the output untouched) when the value
+// exceeds (1 << INTER_REFS_PER_FRAME) - 1.
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
+  const int largest_valid_flags = (1 << INTER_REFS_PER_FRAME) - 1;
+
+  if (ref_frame_flags <= largest_valid_flags) {
+    *ext_ref_frame_flags = ref_frame_flags;
+    return 0;
+  }
+  return -1;
+}
+
+// Copies the reference frame selected by `idx` into `sd`.
+// Returns 0 on success, or -1 when get_ref_frame() yields no buffer for `idx`.
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const common = &cpi->common;
+  const int plane_count = av1_num_planes(common);
+  YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame(common, idx);
+
+  // Guard clause: nothing to copy from.
+  if (!ref_buf) return -1;
+
+  aom_yv12_copy_frame(ref_buf, sd, plane_count);
+  return 0;
+}
+
+// Overwrites the reference frame selected by `idx` with the contents of `sd`.
+// Returns 0 on success, or -1 when get_ref_frame() yields no buffer for `idx`.
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const common = &cpi->common;
+  const int plane_count = av1_num_planes(common);
+  YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame(common, idx);
+
+  // Guard clause: no destination buffer to write into.
+  if (!ref_buf) return -1;
+
+  aom_yv12_copy_frame(sd, ref_buf, plane_count);
+  return 0;
+}
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {  // Debug helper: appends the Y, U and V planes of s, row by row, to yuv_rec_file; does nothing if that file is not open.
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;  // NOTE(review): luma row count comes from cm while width and stride come from s — confirm the two always agree
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {  // high-bitdepth buffers are written through a 16-bit pointer, then return early
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {  // NOTE(review): do/while runs the body before testing --h, so a zero row count would not stop after zero rows — confirm it cannot occur
+ fwrite(src16, s->y_width, 2, yuv_rec_file);  // y_width * 2 bytes per row
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {  // 8-bit path: same plane order, one byte per sample
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);  // flushed after every frame
+}
+#endif // OUTPUT_YUV_REC
+
+// Chooses the motion-vector search step for the current frame.
+//
+// The step always starts from a default derived from the larger frame
+// dimension. When the auto_mv_step_size speed feature is enabled it may then
+// be narrowed using the maximum mv magnitude recorded earlier, and that
+// recorded magnitude is re-seeded or invalidated for the next frame.
+void av1_set_mv_search_params(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MotionVectorSearchParams *const params = &cpi->mv_search_params;
+  const int largest_dim = AOMMAX(cm->width, cm->height);
+
+  // Resolution-based default.
+  params->mv_step_param = av1_init_search_range(largest_dim);
+
+  if (!cpi->sf.mv_sf.auto_mv_step_size) return;
+
+  if (frame_is_intra_only(cm)) {
+    // Seed max_mv_magnitude so the first inter frame after a key/intra-only
+    // frame has a value to work with.
+    params->max_mv_magnitude = largest_dim;
+    return;
+  }
+
+  // Adaptive steps apply to shown frames and internal ARFs, and only when a
+  // magnitude from an earlier frame is available and the feature level is
+  // at least 2.
+  const FRAME_UPDATE_TYPE update_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  const int frame_qualifies =
+      cm->show_frame || update_type == INTNL_ARF_UPDATE;
+  if (frame_qualifies && params->max_mv_magnitude != -1 &&
+      cpi->sf.mv_sf.auto_mv_step_size >= 2) {
+    // Cover twice the previous maximum magnitude, capped by the
+    // resolution-based default.
+    params->mv_step_param = av1_init_search_range(
+        AOMMIN(largest_dim, 2 * params->max_mv_magnitude));
+  }
+
+  // Reset max_mv_magnitude based on update flag.
+  if (cpi->do_frame_data_update) params->max_mv_magnitude = -1;
+}
+
+void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {  // Sets allow_screen_content_tools and allow_intrabc in *features; on some paths also updates cpi->is_screen_content_type and cpi->use_screen_content_tools.
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (cm->seq_params->force_screen_content_tools != 2) {  // any value other than 2 is copied straight into both flags
+ features->allow_screen_content_tools = features->allow_intrabc =
+ cm->seq_params->force_screen_content_tools;
+ return;
+ }
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {  // content explicitly declared as screen content
+ features->allow_screen_content_tools = 1;
+ features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 0 : 1;  // intrabc stays off in REALTIME mode
+ cpi->is_screen_content_type = 1;
+ cpi->use_screen_content_tools = 1;
+ return;
+ }
+
+ if (cpi->oxcf.mode == REALTIME) {  // REALTIME without the explicit screen setting: both flags off, no detection
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+ // Screen content tools are not evaluated in non-RD encoding mode unless
+ // content type is not set explicitly, i.e., when
+ // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1
+ // and hybrid_intra_pickmode = 0. Hence, screen content detection is
+ // disabled.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ !cpi->sf.rt_sf.hybrid_intra_pickmode) {
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+ // Estimate if the source frame is screen content, based on the portion of
+ // blocks that have few luma colors.
+ const uint8_t *src = cpi->unfiltered_source->y_buffer;
+ assert(src != NULL);
+ const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = cpi->unfiltered_source->y_stride;
+ const int width = cpi->unfiltered_source->y_width;
+ const int height = cpi->unfiltered_source->y_height;
+ const int64_t area = (int64_t)width * height;  // widened so the comparisons below are done in 64 bits
+ const int bd = cm->seq_params->bit_depth;
+ const int blk_w = 16;
+ const int blk_h = 16;
+ // These threshold values are selected experimentally.
+ const int color_thresh = 4;
+ const unsigned int var_thresh = 0;
+ // Counts of blocks with no more than color_thresh colors.
+ int64_t counts_1 = 0;
+ // Counts of blocks with no more than color_thresh colors and variance larger
+ // than var_thresh.
+ int64_t counts_2 = 0;
+
+ for (int r = 0; r + blk_h <= height; r += blk_h) {  // only whole 16x16 blocks are visited; partial blocks at the right/bottom edge are skipped
+ for (int c = 0; c + blk_w <= width; c += blk_w) {
+ int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ const uint8_t *const this_src = src + r * stride + c;
+ int n_colors;
+ if (use_hbd)
+ av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+ count_buf, &n_colors, NULL);
+ else
+ av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
+ if (n_colors > 1 && n_colors <= color_thresh) {  // single-color blocks are not counted
+ ++counts_1;
+ struct buf_2d buf;
+ buf.stride = stride;
+ buf.buf = (uint8_t *)this_src;
+ const unsigned int var = av1_get_perpixel_variance(
+ cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd);
+ if (var > var_thresh) ++counts_2;
+ }
+ }
+ }
+
+ // The threshold values are selected experimentally.
+ features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area;  // few-color blocks cover more than 1/10 of the luma area
+ // IntraBC would force loop filters off, so we use more strict rules that also
+ // requires that the block has high variance.
+ features->allow_intrabc = features->allow_screen_content_tools &&
+ counts_2 * blk_h * blk_w * 12 > area;  // and the above-threshold-variance subset covers more than 1/12
+ cpi->use_screen_content_tools = features->allow_screen_content_tools;
+ cpi->is_screen_content_type =
+ features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 &&  // or: few-color coverage above 4/10 ...
+ counts_2 * blk_h * blk_w * 30 > area);  // ... together with above-threshold-variance coverage above 1/30
+}
+
+static void init_motion_estimation(AV1_COMP *cpi) {  // (Re)builds the motion-search site configurations in cpi->mv_search_params when they are uninitialized or the luma stride has changed.
+ AV1_COMMON *const cm = &cpi->common;
+ MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+ const int aligned_width = (cm->width + 7) & ~7;  // width rounded up to a multiple of 8
+ const int y_stride =
+ aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels);
+ const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||  // frame size differs from the configured size, or superres scaling is active: use the computed stride
+ cpi->oxcf.frm_dim_cfg.height != cm->height) ||
+ av1_superres_scaled(cm))
+ ? y_stride
+ : cpi->ppi->lookahead->buf->img.y_stride;  // otherwise use the lookahead buffer's own stride
+ int fpf_y_stride =
+ cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride;  // falls back to y_stride when there is no current frame
+
+ // Update if search_site_cfg is uninitialized or the current frame has a new
+ // stride
+ const int should_update =
+ !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride ||
+ !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride ||
+ (y_stride !=
+ mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride);  // only the SS_CFG_SRC stride is compared; y_stride_src and fpf_y_stride are not checked
+
+ if (!should_update) {
+ return;
+ }
+
+ // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS.
+ for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+ const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0;  // these two methods are initialized with level 1, all others with level 0
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level);
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src,
+ level);
+ }
+
+ // First pass search site config initialization.
+ av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ fpf_y_stride);
+ for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) {  // every method from NSTEP onward gets a copy of the DIAMOND first-pass config
+ memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i],
+ &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ sizeof(search_site_config));
+ }
+}
+
+ static void init_ref_frame_bufs(AV1_COMP *cpi) {
+  // Drops the references held through cm->cur_frame and through every slot of
+  // cm->ref_frame_map, leaving all of those pointers NULL.
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (cm->cur_frame != NULL) {
+    --cm->cur_frame->ref_count;
+    cm->cur_frame = NULL;
+  }
+
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    if (cm->ref_frame_map[slot] == NULL) continue;
+    --cm->ref_frame_map[slot]->ref_count;
+    cm->ref_frame_map[slot] = NULL;
+  }
+
+#ifndef NDEBUG
+  // Debug builds only: after the releases above, no buffer in the pool is
+  // expected to still be referenced.
+  BufferPool *const pool = cm->buffer_pool;
+  for (int idx = 0; idx < pool->num_frame_bufs; ++idx) {
+    assert(pool->frame_bufs[idx].ref_count == 0);
+  }
+#endif
+}
+
+// TODO(chengchen): consider renaming this function as it is necessary
+// for the encoder to setup critical parameters, and it does not
+// deal with initial width any longer.
+//
+// Runs the frame-size related encoder setup the first time it is called, and
+// again whenever the bit-depth flag or chroma subsampling differs from what
+// the sequence header currently holds. Returns AOM_CODEC_MEM_ERROR when
+// av1_tf_info_alloc() reports failure, AOM_CODEC_OK otherwise.
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                                        int subsampling_x, int subsampling_y) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq = cm->seq_params;
+
+  const int format_unchanged = seq->use_highbitdepth == use_highbitdepth &&
+                               seq->subsampling_x == subsampling_x &&
+                               seq->subsampling_y == subsampling_y;
+  // Setup already done for exactly this format: nothing to redo.
+  if (cpi->frame_size_related_setup_done && format_unchanged) {
+    return AOM_CODEC_OK;
+  }
+
+  seq->subsampling_x = subsampling_x;
+  seq->subsampling_y = subsampling_y;
+  seq->use_highbitdepth = use_highbitdepth;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+  av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) {
+      return AOM_CODEC_MEM_ERROR;
+    }
+#endif  // !CONFIG_REALTIME_ONLY
+  }
+  init_ref_frame_bufs(cpi);
+
+  init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+  cpi->initial_mbs = cm->mi_params.MBs;
+  cpi->frame_size_related_setup_done = true;
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ static void setup_denoiser_buffer(AV1_COMP *cpi) {
+  // Allocates the temporal denoiser buffers the first time they are needed,
+  // i.e. when noise sensitivity is enabled and the denoiser has not flagged
+  // its frame buffers as initialized. Allocation failure is reported through
+  // aom_internal_error().
+  if (cpi->oxcf.noise_sensitivity <= 0) return;
+  if (cpi->denoiser.frame_buffer_initialized) return;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq = cm->seq_params;
+  if (av1_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc,
+                         cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+                         seq->subsampling_x, seq->subsampling_y,
+                         seq->use_highbitdepth, AOM_BORDER_IN_PIXELS)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate denoiser");
+  }
+}
+#endif
+
+// Returns 1 if the assigned width or height was <= 0.
+// Otherwise stores the new dimensions in cm, grows the size-dependent
+// compressor buffers when the frame exceeds the currently allocated
+// dimensions, refreshes the frame-size dependent state and returns 0.
+ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+  AV1_COMMON *cm = &cpi->common;
+  const SequenceHeader *const seq = cm->seq_params;
+
+  const aom_codec_err_t status = av1_check_initial_width(
+      cpi, seq->use_highbitdepth, seq->subsampling_x, seq->subsampling_y);
+  if (status != AOM_CODEC_OK) {
+    aom_internal_error(cm->error, status, "av1_check_initial_width() failed");
+  }
+
+  if (!(width > 0 && height > 0)) return 1;
+
+  cm->width = width;
+  cm->height = height;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+  const int exceeds_allocation = cm->width > cpi->data_alloc_width ||
+                                 cm->height > cpi->data_alloc_height;
+  if (exceeds_allocation) {
+    // Release everything sized for the old dimensions before reallocating.
+    av1_free_context_buffers(cm);
+    av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+    av1_free_sms_tree(&cpi->td);
+    av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+    cpi->td.firstpass_ctx = NULL;
+
+    alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+
+    cpi->data_alloc_width = cm->width;
+    cpi->data_alloc_height = cm->height;
+    // Makes the next av1_check_initial_width() call redo its setup.
+    cpi->frame_size_related_setup_done = false;
+  }
+
+  alloc_mb_mode_info_buffers(cpi);
+  av1_update_frame_size(cpi);
+  return 0;
+}
+
+// Applies the coded frame size (width x height) to the encoder state.
+// When the size differs from cm->width / cm->height, set_size_literal() is
+// called and the size-dependent state touched here (all_lossless flag, noise
+// estimate and, if enabled, the temporal denoiser) is refreshed. Regardless
+// of a size change, the function then handles the current frame's MV buffer,
+// the above-context buffers, the current frame buffer and (when used) the
+// loop-restoration buffers, re-runs init_motion_estimation(), and sets up
+// the scale factors of every reference frame relative to the frame size.
+// Allocation failures and the absence of any validly scaled reference on a
+// non-intra frame are reported through aom_internal_error().
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int ref_frame;
+
+ if (width != cm->width || height != cm->height) {
+ // There has been a change in the encoded frame size
+ // NOTE(review): the return value of set_size_literal() (1 for a
+ // non-positive dimension) is not checked here.
+ set_size_literal(cpi, width, height);
+ // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+ cm->features.all_lossless =
+ cm->features.coded_lossless && !av1_superres_scaled(cm);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Reset the denoiser on the resized frame.
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ av1_denoiser_free(&(cpi->denoiser));
+ setup_denoiser_buffer(cpi);
+ }
+#endif
+ }
+ if (is_stat_consumption_stage(cpi)) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ }
+
+ alloc_frame_mvs(cm, cm->cur_frame);
+
+ // Allocate above context buffers
+ // They are only reallocated when the existing ones are too small in any of
+ // the three dimensions compared below.
+ CommonContexts *const above_contexts = &cm->above_contexts;
+ if (above_contexts->num_planes < av1_num_planes(cm) ||
+ above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+ above_contexts->num_tile_rows < cm->tiles.rows) {
+ av1_free_above_context_buffers(above_contexts);
+ if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+ cm->mi_params.mi_cols,
+ av1_num_planes(cm)))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ // The border size is recomputed before the frame buffer is reallocated,
+ // because the reallocation below reads cpi->oxcf.border_in_pixels.
+ AV1EncoderConfig *oxcf = &cpi->oxcf;
+ oxcf->border_in_pixels = av1_get_enc_border_size(
+ av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0,
+ cm->seq_params->sb_size);
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Every plane starts from RESTORE_NONE before the buffers are allocated.
+ for (int i = 0; i < num_planes; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+ const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter;
+ av1_alloc_restoration_buffers(cm, is_sgr_enabled);
+ // Store the allocated restoration buffers in MT object.
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ av1_init_lr_mt_buffers(cpi);
+ }
+ }
+#endif
+
+ init_motion_estimation(cpi);
+
+ // Set up per-reference scale factors and remember whether at least one
+ // reference has a valid scale relative to the current frame size.
+ int has_valid_ref_frame = 0;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
+ buf->buf.y_crop_height, cm->width,
+ cm->height);
+ has_valid_ref_frame |= av1_is_valid_scale(sf);
+ // Borders are extended only for references that are actually scaled.
+ if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
+ }
+ }
+ if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Can't find at least one reference frame with valid size");
+ }
+
+ // Identity scale factors: source and destination sizes are both the
+ // current frame size.
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+ static INLINE int extend_borders_mt(const AV1_COMP *cpi,
+                                    MULTI_THREADED_MODULES stage, int plane) {
+  // Returns non-zero when frame border extension for `plane` is handled
+  // together with the multi-threaded `stage`; always 0 when that stage has
+  // fewer than two workers.
+  const AV1_COMMON *const cm = &cpi->common;
+  int handled_with_stage = 0;
+
+  if (cpi->mt_info.num_mod_workers[stage] >= 2) {
+    switch (stage) {
+      case MOD_LPF:
+        // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are
+        // disabled, multi-thread frame border extension along with loop
+        // filter frame. As loop-filtering of a superblock row modifies the
+        // pixels of the above superblock row, border extension requires that
+        // loop filtering of the current and above superblock row is complete.
+        break;
+      case MOD_CDEF:
+        handled_with_stage = is_cdef_used(cm) &&
+                             !cpi->ppi->rtc_ref.non_reference_frame &&
+                             !is_restoration_used(cm) &&
+                             !av1_superres_scaled(cm);
+        break;
+      case MOD_LR:
+        handled_with_stage =
+            is_restoration_used(cm) &&
+            (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE);
+        break;
+      default: assert(0); break;
+    }
+  }
+
+  return handled_with_stage;
+}
+
+/*!\brief Select and apply cdef filters and switchable restoration filters
+ *
+ * The CDEF search and the restoration filter pick always run when the
+ * corresponding tool is enabled; only the application of each filter (and of
+ * the superres post-encode step) can be suppressed through
+ * \c skip_apply_postproc_filters.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] cm Common structure; the filters operate on cm->cur_frame->buf
+ * \param[in] xd Macroblock descriptor passed to the CDEF routines
+ * \param[in] use_restoration Non-zero to run loop restoration (unused when
+ * CONFIG_REALTIME_ONLY is set)
+ * \param[in] use_cdef Non-zero to run CDEF
+ * \param[in] skip_apply_postproc_filters Bitmask tested against
+ * SKIP_APPLY_CDEF, SKIP_APPLY_SUPERRES and SKIP_APPLY_RESTORATION
+ */
+ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int use_restoration,
+ int use_cdef,
+ unsigned int skip_apply_postproc_filters) {
+#if !CONFIG_REALTIME_ONLY
+ // First boundary-line save (last argument 0) happens before CDEF runs.
+ if (use_restoration)
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+#else
+ (void)use_restoration;
+#endif
+
+ if (use_cdef) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, cdef_time);
+#endif
+ const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
+ // Find CDEF parameters
+ av1_cdef_search(cpi);
+
+ // Apply the filter
+ if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with cdef.
+ const int do_extend_border =
+ extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0);
+ av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
+ cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
+ num_workers, av1_cdef_init_fb_row_mt,
+ do_extend_border);
+ } else {
+ av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, cdef_time);
+#endif
+ }
+
+ // Superres post-encode step runs between CDEF and loop restoration.
+ const int use_superres = av1_superres_scaled(cm);
+ if (use_superres) {
+ if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) {
+ av1_superres_post_encode(cpi);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_restoration_time);
+#endif
+ if (use_restoration) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers = mt_info->num_mod_workers[MOD_LR];
+ // Second boundary-line save (last argument 1) happens after CDEF and
+ // superres.
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
+ av1_pick_filter_restoration(cpi->source, cpi);
+ // Filter only when application is not skipped and at least one of the
+ // three planes selected a restoration type other than RESTORE_NONE.
+ if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 &&
+ (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with loop
+ // restoration filter.
+ const int do_extend_border = 1;
+ av1_loop_restoration_filter_frame_mt(
+ &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
+ &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border);
+ } else {
+ av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
+ &cpi->lr_ctxt);
+ }
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_restoration_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+ static void extend_frame_borders(AV1_COMP *cpi) {
+  // Extends the borders of each plane of the current frame, except for planes
+  // that extend_borders_mt() reports as already handled together with the
+  // multi-threaded CDEF or loop-restoration stage.
+  const AV1_COMMON *const cm = &cpi->common;
+  // TODO(debargha): Fix mv search range on encoder side
+  for (int p = 0; p < av1_num_planes(cm); ++p) {
+    if (extend_borders_mt(cpi, MOD_CDEF, p)) continue;
+    if (extend_borders_mt(cpi, MOD_LR, p)) continue;
+
+    const YV12_BUFFER_CONFIG *const frame_buf = &cm->cur_frame->buf;
+    // crop_heights[] is indexed 0 for plane 0 and 1 for every other plane.
+    const int height_idx = p > 0;
+    aom_extend_frame_borders_plane_row(frame_buf, p, 0,
+                                       frame_buf->crop_heights[height_idx]);
+  }
+}
+
+/*!\brief Select and apply deblocking filters, cdef filters, and restoration
+ * filters.
+ *
+ * The deblocking level is picked and applied here; CDEF, superres and loop
+ * restoration are delegated to cdef_restoration_frame().
+ *
+ * \ingroup high_level_algo
+ */
+ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  const int lpf_workers = mt->num_mod_workers[MOD_LPF];
+  const int plane_count = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
+                 cm->features.coded_lossless && cm->features.all_lossless));
+
+  // Deblocking is not run here when it is pipelined with encoding.
+  const int use_loopfilter =
+      is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc;
+  const int use_cdef = is_cdef_used(cm);
+  const int use_superres = av1_superres_scaled(cm);
+  const int use_restoration = is_restoration_used(cm);
+
+  const unsigned int skip_mask = derive_skip_apply_postproc_filters(
+      cpi, use_loopfilter, use_cdef, use_superres, use_restoration);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_filter_time);
+#endif
+  if (use_loopfilter) {
+    // The level must be picked before cm->lf.filter_level[] is inspected.
+    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+    const struct loopfilter *const lf = &cm->lf;
+    const int any_level_set = lf->filter_level[0] || lf->filter_level[1];
+    const int apply_allowed = (skip_mask & SKIP_APPLY_LOOPFILTER) == 0;
+    if (any_level_set && apply_allowed) {
+      assert(!cpi->ppi->rtc_ref.non_reference_frame);
+      // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+      // lpf_opt_level is set to 1 if transform size search depth in inter
+      // blocks is limited to one as quad loop filtering assumes that all the
+      // transform blocks within a 16x8/8x16/16x16 prediction block are of the
+      // same size. lpf_opt_level = 2 : Filters both chroma planes together, in
+      // addition to enabling dual/quad loop-filtering. This is enabled when lpf
+      // pick method is LPF_PICK_FROM_Q as u and v plane filter levels are
+      // equal.
+      const int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, plane_count, 0,
+                               mt->workers, lpf_workers, &mt->lf_row_sync,
+                               lpf_opt_level);
+    }
+  }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_filter_time);
+#endif
+
+  cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef, skip_mask);
+}
+
+ static void update_motion_stat(AV1_COMP *const cpi) {
+  // Updates rc->avg_frame_low_motion from the zero-MV count, expressed as a
+  // percentage of mi_rows * mi_cols. With SVC the update happens only on the
+  // top spatial layer of a non-key frame, and the new value is then copied to
+  // all lower spatial layers.
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi = &cm->mi_params;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+
+  const int zeromv_pct = 100 * rc->cnt_zeromv / (mi->mi_rows * mi->mi_cols);
+  const int is_top_spatial_layer =
+      svc->spatial_layer_id == svc->number_spatial_layers - 1;
+
+  if (cpi->ppi->use_svc) {
+    if (svc->layer_context[svc->temporal_layer_id].is_key_frame) return;
+    if (!is_top_spatial_layer) return;
+  }
+
+  // First sample seeds the average; afterwards new samples get weight 1/4.
+  if (rc->avg_frame_low_motion == 0) {
+    rc->avg_frame_low_motion = zeromv_pct;
+  } else {
+    rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + zeromv_pct) / 4;
+  }
+
+  // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+  // to all lower spatial layers.
+  if (!cpi->ppi->use_svc || !is_top_spatial_layer) return;
+  for (int sl = 0; sl < svc->number_spatial_layers - 1; ++sl) {
+    const int layer = LAYER_IDS_TO_IDX(sl, svc->temporal_layer_id,
+                                       svc->number_temporal_layers);
+    svc->layer_context[layer].rc.avg_frame_low_motion =
+        rc->avg_frame_low_motion;
+  }
+}
+
+/*!\brief Encode a frame without the recode loop, usually used in one-pass
+ * encoding and realtime coding.
+ *
+ * The frame is encoded exactly once: the quantizer index is chosen up front
+ * (and possibly overridden once by the CBR overshoot check) and
+ * av1_encode_frame() is called a single time.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ *
+ * NOTE(review): the visible body only ever returns #AOM_CODEC_OK; failures
+ * are reported through aom_internal_error() / CHECK_MEM_ERROR instead.
+ * Confirm whether #AOM_CODEC_ERROR is still reachable.
+ */
+ static int encode_without_recode(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+ SVC *const svc = &cpi->svc;
+ const int resize_pending = is_frame_resize_pending(cpi);
+ int top_index = 0, bottom_index = 0, q = 0;
+ YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+ // Default scaling filter and phase: taken from the SVC layer configuration
+ // when SVC is in use, otherwise EIGHTTAP_SMOOTH with phase 0 (both may be
+ // overridden below for the non-SVC case).
+ InterpFilter filter_scaler =
+ cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+ : EIGHTTAP_SMOOTH;
+ int phase_scaler = cpi->ppi->use_svc
+ ? svc->downsample_filter_phase[svc->spatial_layer_id]
+ : 0;
+
+ set_size_independent_vars(cpi);
+ av1_setup_frame_size(cpi);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ av1_set_mv_search_params(cpi);
+
+ // On the very first frame of temporal layer 0, allocate source_last_TL0
+ // when SVC or frame dropping is enabled.
+ if (cm->current_frame.frame_number == 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.temporal_layer_id == 0) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ if (aom_alloc_frame_buffer(
+ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer for source_last_TL0");
+ }
+ }
+
+ // Non-SVC: pick the scaling filter from the ratio between the coded size
+ // and the unscaled source size. Ratios not listed keep the default filter.
+ if (!cpi->ppi->use_svc) {
+ phase_scaler = 8;
+ // 2:1 scaling.
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height) {
+ filter_scaler = BILINEAR;
+ // For lower resolutions use eighttap_smooth.
+ if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == unscaled->y_crop_width &&
+ (cm->height << 2) == unscaled->y_crop_height) {
+ // 4:1 scaling.
+ filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+ (cm->height << 2) == 3 * unscaled->y_crop_height) {
+ // 4:3 scaling.
+ filter_scaler = EIGHTTAP_REGULAR;
+ }
+ }
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+ copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_setup_butteraugli_rdmult(cpi);
+ }
+#endif
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+ false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ // On intra-only frames or a pending resize, clear the consec_zero_mv map,
+ // growing it first when an existing allocation is too small.
+ if (frame_is_intra_only(cm) || resize_pending != 0) {
+ const int current_size =
+ (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2;
+ if (cpi->consec_zero_mv &&
+ (cpi->consec_zero_mv_alloc_size < current_size)) {
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv_alloc_size = 0;
+ CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+ aom_malloc(current_size * sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = current_size;
+ }
+ // NOTE(review): a NULL consec_zero_mv is not allocated here; the buffer
+ // is assumed to have been allocated elsewhere -- confirm.
+ assert(cpi->consec_zero_mv != NULL);
+ memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv));
+ }
+
+ // Reuse the scaled copy saved at the end of the previous call when
+ // available; otherwise scale the unscaled last source if there is one.
+ if (cpi->scaled_last_source_available) {
+ cpi->last_source = &cpi->scaled_last_source;
+ cpi->scaled_last_source_available = 0;
+ } else if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+ phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+ av1_update_noise_estimate(cpi);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
+ av1_denoiser_reset_on_first_frame(cpi);
+#endif
+
+ // For 1 spatial layer encoding: if the (non-LAST) reference has different
+ // resolution from the source then disable that reference. This is to avoid
+ // significant increase in encode time from scaling the references in
+ // av1_scale_references. Note GOLDEN is forced to update on the (first/trigger)
+ // resized frame and ALTREF will be refreshed ~4 frames later, so both
+ // references become available again after few frames.
+ // For superres: don't disable golden reference.
+ if (svc->number_spatial_layers == 1) {
+ if (!cpi->oxcf.superres_cfg.enable_superres) {
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
+ }
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+ }
+ }
+
+ // References are scaled for inter frames when the frame-parallel level of
+ // this frame is 0, or always under the FPMT parallel-simulation test mode.
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+ }
+ }
+
+ // Apply the chosen q: quantizer, q-dependent speed features, quantizer
+ // tables and variance-partition thresholds, then set up the frame.
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ av1_setup_frame(cpi);
+
+ // Check if this high_source_sad (scene/slide change) frame should be
+ // encoded at high/max QP, and if so, set the q and adjust some rate
+ // control parameters.
+ if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+ cpi->rc.high_source_sad) {
+ if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
+ // q was changed: repeat the q-dependent setup done above.
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cm->features.primary_ref_frame == PRIMARY_REF_NONE)
+ av1_setup_frame(cpi);
+ }
+ }
+
+ if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+ suppress_active_map(cpi);
+ av1_cyclic_refresh_setup(cpi);
+ }
+ av1_apply_active_map(cpi);
+ // Segmentation: inherit the previous frame's features when no data update
+ // is signalled and a previous frame exists, otherwise recompute; when
+ // segmentation is disabled the whole structure is zeroed.
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ // This is for rtc temporal filtering case.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cm->current_frame.frame_type != KEY_FRAME) {
+ const SequenceHeader *seq_params = cm->seq_params;
+
+ if (cpi->orig_source.buffer_alloc_sz == 0 ||
+ cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height) {
+ // Allocate a source buffer to store the true source for psnr calculation.
+ if (aom_alloc_frame_buffer(
+ &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled buffer");
+ }
+
+ // Keep a copy of all three planes of the source in orig_source.
+ aom_yv12_copy_y(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_u(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_v(cpi->source, &cpi->orig_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+ if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm))
+ update_motion_stat(cpi);
+
+ // Adjust the refresh of the golden (longer-term) reference based on QP
+ // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+ if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+ svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+ sf->rt_sf.gf_refresh_based_on_qp)
+ av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
+ // For non-svc: if scaling is required, copy scaled_source
+ // into scaled_last_source.
+ // The copy is flagged through scaled_last_source_available so the next
+ // call can use it as last_source without rescaling.
+ if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc &&
+ cpi->scaled_source.y_buffer != NULL &&
+ cpi->scaled_last_source.y_buffer != NULL &&
+ cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width &&
+ cpi->scaled_source.y_crop_height ==
+ cpi->scaled_last_source.y_crop_height &&
+ (cm->width != cpi->unscaled_source->y_crop_width ||
+ cm->height != cpi->unscaled_source->y_crop_height)) {
+ cpi->scaled_last_source_available = 1;
+ aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. The purpose of encoding one frame
+ * multiple times can be to approach a target bitrate or to adjust the usage
+ * of global motions.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] size Bitstream size
+ * \param[in] dest Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
+ // Must allow recode if minimum compression ratio is set.
+ assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
+
+ set_size_independent_vars(cpi);
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search)
+ cpi->interp_search_flags.interp_filter_search_mask =
+ av1_setup_interp_filter_search_mask(cpi);
+
+ av1_setup_frame_size(cpi);
+
+ if (av1_superres_in_recode_allowed(cpi) &&
+ cpi->superres_mode != AOM_SUPERRES_NONE &&
+ cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ // Superres mode is currently enabled, but the denominator selected will
+ // disable superres. So no need to continue, as we will go through another
+ // recode loop for full-resolution after this anyway.
+ return -1;
+ }
+
+ int top_index = 0, bottom_index = 0;
+ int q = 0, q_low = 0, q_high = 0;
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ q_low = bottom_index;
+ q_high = top_index;
+
+ av1_set_mv_search_params(cpi);
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if !CONFIG_RD_COMMAND
+ // Determine whether to use screen content tools using two fast encoding.
+ if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode)
+ av1_determine_sc_tools_with_encoding(cpi, q);
+#endif // !CONFIG_RD_COMMAND
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ cpi->butteraugli_info.recon_set = false;
+ int original_q = 0;
+#endif
+
+ cpi->num_frame_recode = 0;
+
+ // Loop variables
+ int loop = 0;
+ int loop_count = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int low_cr_seen = 0;
+ int last_loop_allow_hp = 0;
+
+ do {
+ loop = 0;
+ int do_mv_stats_collection = 1;
+
+ // if frame was scaled calculate global_motion_search again if already
+ // done
+ if (loop_count > 0 && cpi->source && gm_info->search_done) {
+ if (cpi->source->y_crop_width != cm->width ||
+ cpi->source->y_crop_height != cm->height) {
+ gm_info->search_done = 0;
+ }
+ }
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+ false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ if (loop_count == 0) {
+ original_q = q;
+ // TODO(sdeng): different q here does not make big difference. Use a
+ // faster pass instead.
+ q = 96;
+ av1_setup_butteraugli_source(cpi);
+ } else {
+ q = original_q;
+ }
+ }
+#endif
+
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ }
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ cpi->vmaf_info.original_qindex = q;
+ q = av1_get_vmaf_base_qindex(cpi, q);
+ }
+#endif
+
+#if CONFIG_RD_COMMAND
+ RD_COMMAND *rd_command = &cpi->rd_command;
+ RD_OPTION option = rd_command->option_ls[rd_command->frame_index];
+ if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+ q = rd_command->q_index_ls[rd_command->frame_index];
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+#if CONFIG_THREE_PASS
+ if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) {
+ q = cpi->vbr_rc_info.q_index_list[frame_coding_idx];
+ } else {
+ // TODO(angiebird): Investigate why sometimes there is an extra frame
+ // after the last GOP.
+ q = cpi->vbr_rc_info.base_q_index;
+ }
+ }
+#else
+ if (cpi->vbr_rc_info.q_index_list_ready) {
+ q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index];
+ }
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx];
+ FRAME_UPDATE_TYPE update_type =
+ cpi->vbr_rc_info.update_type_list[frame_coding_idx];
+ rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q,
+ update_type);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+
+ if (cpi->use_ducky_encode) {
+ const DuckyEncodeFrameInfo *frame_info =
+ &cpi->ducky_encode_info.frame_info;
+ if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ q = frame_info->q_index;
+ cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled;
+ }
+ }
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+
+ // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
+ // cm->current_frame.frame_number, cm->show_frame, q,
+ // cm->current_frame.frame_type, cm->superres_scale_denominator);
+
+ if (loop_count == 0) {
+ av1_setup_frame(cpi);
+ } else if (get_primary_ref_frame_buf(cm) == NULL) {
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ av1_default_coef_probs(cm);
+ av1_setup_frame_contexts(cm);
+ }
+
+ if (q_cfg->aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) {
+ av1_pick_and_set_high_precision_mv(cpi, q);
+
+ // If the precision has changed during different iteration of the loop,
+ // then we need to reset the global motion vectors
+ if (loop_count > 0 &&
+ cm->features.allow_high_precision_mv != last_loop_allow_hp) {
+ gm_info->search_done = 0;
+ }
+ last_loop_allow_hp = cm->features.allow_high_precision_mv;
+ }
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+ // Disable mv_stats collection for parallel frames based on update flag.
+ if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+
+ // Reset the mv_stats in case we are interrupted by an intraframe or an
+ // overlay frame.
+ if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats);
+
+ // Gather the mv_stats for the next frame
+ if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+ av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
+ av1_collect_mv_stats(cpi, q);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ const int do_dummy_pack = 1;
+#else // CONFIG_BITRATE_ACCURACY
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ const int do_dummy_pack =
+ (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+ oxcf->rc_cfg.mode != AOM_Q) ||
+ oxcf->rc_cfg.min_cr > 0;
+#endif // CONFIG_BITRATE_ACCURACY
+ if (do_dummy_pack) {
+ av1_finalize_encoded_frame(cpi);
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ rc->coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // bits used for this frame
+ rc->projected_frame_size = (int)(*size) << 3;
+#if CONFIG_RD_COMMAND
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT,
+ rc->projected_frame_size, psnr.sse[0]);
+ ++rd_command->frame_index;
+ if (rd_command->frame_index == rd_command->frame_count) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx,
+ rc->projected_frame_size, rc->coefficient_size);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ q = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ if (allow_recode) {
+ // Update q and decide whether to do a recode loop
+ recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
+ bottom_index, &undershoot_seen, &overshoot_seen,
+ &low_cr_seen, loop_count);
+ }
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ loop = 1;
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4);
+ }
+#endif
+
+ if (cpi->use_ducky_encode) {
+ // Ducky encode currently does not support recode loop.
+ loop = 0;
+ }
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+
+ if (loop) {
+ ++loop_count;
+ cpi->num_frame_recode =
+ (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1))
+ ? (cpi->num_frame_recode + 1)
+ : (NUM_RECODES_PER_FRAME - 1);
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
+ } while (loop);
+
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// TODO(jingning, paulwilkins): Set up high grain level to test
+// hardware decoders. Need to adapt the actual noise variance
+// according to the difference between reconstructed frame and the
+// source signal.
+//
+// Fills cm->film_grain_params with a fixed synthetic configuration: grain is
+// applied, parameters are flagged as updated, a 16-bit seed is drawn from
+// rand(), and every plane that is present gets exactly one scaling point
+// (128, 100). Monochrome sequences get zero chroma points.
+static void set_grain_syn_params(AV1_COMMON *cm) {
+  aom_film_grain_t *const fg = &cm->film_grain_params;
+  // One scaling point per chroma plane, or none when there is no chroma.
+  const int chroma_points = cm->seq_params->monochrome ? 0 : 1;
+
+  // Frame-level switches.
+  fg->apply_grain = 1;
+  fg->update_parameters = 1;
+  fg->random_seed = rand() & 0xffff;
+
+  // Luma: a single scaling point.
+  fg->num_y_points = 1;
+  fg->scaling_points_y[0][0] = 128;
+  fg->scaling_points_y[0][1] = 100;
+
+  // Chroma: mirror the luma point when chroma planes exist.
+  fg->num_cb_points = chroma_points;
+  fg->num_cr_points = chroma_points;
+  if (chroma_points) {
+    fg->scaling_points_cb[0][0] = 128;
+    fg->scaling_points_cb[0][1] = 100;
+    fg->scaling_points_cr[0][0] = 128;
+    fg->scaling_points_cr[0][1] = 100;
+  }
+
+  // Remaining synthesis parameters.
+  fg->chroma_scaling_from_luma = 0;
+  fg->scaling_shift = 1;
+  fg->ar_coeff_lag = 0;
+  fg->ar_coeff_shift = 1;
+  fg->overlap_flag = 1;
+  fg->grain_scale_shift = 0;
+}
+
+/*!\brief Recode loop or a single loop for encoding one frame, followed by
+ * in-loop deblocking filters, CDEF filters, and restoration filters.
+ *
+ * After the encode stage succeeds, this function copies the sequence-level
+ * color/render metadata into the current frame buffer, runs the in-loop
+ * filters (unless intra block copy is allowed), packs the final bitstream and
+ * optionally reports distortion and rate to the caller.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in,out] cpi Top-level encoder structure
+ * \param[in,out] size Bitstream size in bytes; handed to
+ * av1_pack_bitstream() and read back here to derive
+ * \c rate
+ * \param[in] dest Bitstream output buffer handed to
+ * av1_pack_bitstream()
+ * \param[out] sse Total distortion of the frame (Y-plane SSE between
+ * cpi->source and the current frame buffer). May be
+ * NULL. Set to INT64_MAX when the encode stage
+ * returns -1.
+ * \param[out] rate Total rate of the frame: bits in the packed frame,
+ * scaled by << 5. May be NULL. Set to INT64_MAX when
+ * the encode stage returns -1.
+ * \param[out] largest_tile_id Tile id of the last tile, as reported by
+ * av1_pack_bitstream(); set to 0 when the encode
+ * stage returns -1. Must not be NULL.
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest, int64_t *sse,
+ int64_t *rate,
+ int *largest_tile_id) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ // Clear the per-recode-iteration "update frame probabilities" flags before
+ // the encode stage runs.
+ for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) {
+ cpi->do_update_frame_probs_txtype[i] = 0;
+ cpi->do_update_frame_probs_obmc[i] = 0;
+ cpi->do_update_frame_probs_warp[i] = 0;
+ cpi->do_update_frame_probs_interpfilter[i] = 0;
+ }
+
+ cpi->do_update_vbr_bits_off_target_fast = 0;
+ int err;
+ // Realtime-only builds always take the no-recode path; otherwise the
+ // recode_loop speed feature selects between the two encode paths.
+#if CONFIG_REALTIME_ONLY
+ err = encode_without_recode(cpi);
+#else
+ if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE)
+ err = encode_without_recode(cpi);
+ else
+ err = encode_with_recode_loop(cpi, size, dest);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ if (err != AOM_CODEC_OK) {
+ if (err == -1) {
+ // special case as described in encode_with_recode_loop().
+ // Encoding was skipped.
+ err = AOM_CODEC_OK;
+ if (sse != NULL) *sse = INT64_MAX;
+ if (rate != NULL) *rate = INT64_MAX;
+ *largest_tile_id = 0;
+ }
+ return err;
+ }
+
+#ifdef OUTPUT_YUV_DENOISED
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+ aom_write_yuv_frame(yuv_denoised_file,
+ &cpi->denoiser.running_avg_y[INTRA_FRAME]);
+ }
+#endif
+
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if it is the frame before
+ // the force key frame
+ if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (seq_params->use_highbitdepth) {
+ cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+
+ // Copy sequence-level color description and the render size into the
+ // current frame buffer.
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
+
+ if (!cpi->mt_info.pipeline_lpf_mt_with_enc)
+ set_postproc_filter_default_params(&cpi->common);
+
+ // In-loop filtering is skipped entirely when intra block copy is allowed.
+ if (!cm->features.allow_intrabc) {
+ loopfilter_frame(cpi, cm);
+ }
+
+ if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) {
+ extend_frame_borders(cpi);
+ }
+
+#ifdef OUTPUT_YUV_REC
+ aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
+#endif
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) {
+ set_grain_syn_params(cm);
+ }
+
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+
+ // Compute sse and rate.
+ if (sse != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ *sse = (seq_params->use_highbitdepth)
+ ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+ : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+ *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+ if (rate != NULL) {
+ // *size is in bytes; convert to bits first.
+ const int64_t bits = (*size << 3);
+ *rate = (bits << 5); // To match scale.
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Report the per-frame result (q index, rdmult, rate in bits, and the first
+ // entries of the PSNR stats) back to the ducky-encode interface.
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ return AOM_CODEC_OK;
+}
+
+// Encodes the current frame both with and without super-resolution, compares
+// the projected RD costs, and leaves the cheaper encoding in dest/size.
+//
+// Two search strategies are supported, selected by
+// cpi->sf.hl_sf.superres_auto_search_type:
+//  - SUPERRES_AUTO_ALL: try every denominator in
+//    (SCALE_NUMERATOR, 2 * SCALE_NUMERATOR], except on overlay frames where
+//    the superres candidates are marked as infinitely expensive.
+//  - SUPERRES_AUTO_DUAL: try a single superres encode.
+// In both cases the full-resolution encode is run afterwards, so when superres
+// wins the frame is re-encoded once more with superres to regenerate the
+// winning bitstream.
+//
+// cpi              Top-level encoder structure; the coding context is saved
+//                  on entry and restored between trial encodes.
+// size, dest       Forwarded to encode_with_recode_loop_and_filter().
+// largest_tile_id  Receives the largest tile id of the selected encoding.
+//
+// Returns AOM_CODEC_OK on success, or the error reported by the failing
+// encode.
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+                                            uint8_t *dest,
+                                            int *largest_tile_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params->enable_superres);
+  assert(av1_superres_in_recode_allowed(cpi));
+  aom_codec_err_t err = AOM_CODEC_OK;
+  av1_save_all_coding_context(cpi);
+
+  int64_t sse1 = INT64_MAX;
+  int64_t rate1 = INT64_MAX;
+  int largest_tile_id1 = 0;
+  int64_t sse2 = INT64_MAX;
+  int64_t rate2 = INT64_MAX;
+  // Initialized for symmetry with largest_tile_id1 so it never holds an
+  // indeterminate value.
+  int largest_tile_id2 = 0;
+  double proj_rdcost1 = DBL_MAX;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+  const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+
+  // Encode with superres.
+  if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
+    SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg;
+    int64_t superres_sses[SCALE_NUMERATOR];
+    int64_t superres_rates[SCALE_NUMERATOR];
+    int superres_largest_tile_ids[SCALE_NUMERATOR];
+    // Use superres for Key-frames and Alt-ref frames only.
+    if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) {
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        superres_cfg->superres_scale_denominator = denom;
+        superres_cfg->superres_kf_scale_denominator = denom;
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+
+        cpi->superres_mode = AOM_SUPERRES_AUTO;  // Super-res on for this loop.
+        err = encode_with_recode_loop_and_filter(
+            cpi, size, dest, &superres_sses[this_index],
+            &superres_rates[this_index],
+            &superres_largest_tile_ids[this_index]);
+        cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+        if (err != AOM_CODEC_OK) return err;
+        restore_all_coding_context(cpi);
+      }
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      // No superres trial on overlay frames: mark every candidate as
+      // infinitely expensive. The tile ids must be initialized as well,
+      // because the selection loop below reads them for every denominator.
+      for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+           ++denom) {
+        const int this_index = denom - (SCALE_NUMERATOR + 1);
+        superres_sses[this_index] = INT64_MAX;
+        superres_rates[this_index] = INT64_MAX;
+        superres_largest_tile_ids[this_index] = 0;
+      }
+    }
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
+
+    // Find the best rdcost among all superres denoms.
+    int best_denom = -1;
+    for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+         ++denom) {
+      const int this_index = denom - (SCALE_NUMERATOR + 1);
+      const int64_t this_sse = superres_sses[this_index];
+      const int64_t this_rate = superres_rates[this_index];
+      const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+      const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+          rdmult, this_rate, this_sse, bit_depth);
+      if (this_rdcost < proj_rdcost1) {
+        sse1 = this_sse;
+        rate1 = this_rate;
+        largest_tile_id1 = this_largest_tile_id;
+        proj_rdcost1 = this_rdcost;
+        best_denom = denom;
+      }
+    }
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      // Again, temporarily force the best denom.
+      superres_cfg->superres_scale_denominator = best_denom;
+      superres_cfg->superres_kf_scale_denominator = best_denom;
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+      // Reset.
+      superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+      superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
+  } else {
+    assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL);
+    cpi->superres_mode =
+        AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+                                             &largest_tile_id1);
+    cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+    if (err != AOM_CODEC_OK) return err;
+    restore_all_coding_context(cpi);
+    // Encode without superres.
+    assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+    err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+                                             &largest_tile_id2);
+    if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use common rdmult based on base qindex of fullres.
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
+    proj_rdcost1 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+    // Re-encode with superres if it's better.
+    if (proj_rdcost1 < proj_rdcost2) {
+      restore_all_coding_context(cpi);
+      // TODO(urvang): We should avoid rerunning the recode loop by saving
+      // previous output+state, or running encode only for the selected 'q' in
+      // previous step.
+      int64_t sse3 = INT64_MAX;
+      int64_t rate3 = INT64_MAX;
+      cpi->superres_mode =
+          AOM_SUPERRES_AUTO;  // Super-res on for this recode loop.
+      err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+                                               largest_tile_id);
+      cpi->superres_mode = AOM_SUPERRES_NONE;  // Reset to default (full-res).
+      assert(sse1 == sse3);
+      assert(rate1 == rate3);
+      assert(largest_tile_id1 == *largest_tile_id);
+    } else {
+      *largest_tile_id = largest_tile_id2;
+    }
+  }
+
+  return err;
+}
+
+// Decides whether CDF update should be disabled for the current frame when the
+// real-time "selective" CDF update mode is in use. Handles layered encoding,
+// scene changes and pending resizes.
+// Returns 1 to disable the CDF update, 0 to keep it enabled.
+static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  // Single spatial and single temporal layer.
+  if (cpi->svc.number_spatial_layers == 1 &&
+      cpi->svc.number_temporal_layers == 1) {
+    // Keep the update enabled on intra-only frames and resized frames.
+    if (frame_is_intra_only(cm)) return 0;
+    if (is_frame_resize_pending(cpi)) return 0;
+    // Keep it enabled on a scene change (high_source_sad = 1).
+    if (rc->high_source_sad) return 0;
+    // To avoid quality loss, force-enable the update for ~30 frames after a
+    // key frame or a scene/slide change.
+    if (rc->frames_since_key < 30) return 0;
+    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30)
+      return 0;
+    // Also force-enable once more than 8 frames have passed since the last
+    // update, provided frame_source_sad > 0.
+    if (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0)
+      return 0;
+    return 1;
+  }
+
+  // Multiple temporal layers: disable only on the top temporal enhancement
+  // layer for now.
+  if (cpi->svc.number_temporal_layers > 1) {
+    return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
+  }
+
+  return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Computes the Haar wavelet AC energy of the unfiltered source luma plane and
+// stores log1p(energy / num_mbs) in cpi->twopass_frame.frame_avg_haar_energy.
+// Does nothing for one-pass real-time parameters, when deltaq_mode is not
+// DELTA_Q_PERCEPTUAL, or when is_fp_wavelet_energy_invalid() returns 0 for the
+// accumulated first-pass stats.
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+  const FIRSTPASS_STATS *const total_stats =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats;
+
+  if (is_one_pass_rt_params(cpi)) return;
+  if (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) return;
+  if (!is_fp_wavelet_energy_invalid(total_stats)) return;
+
+  // With resizing enabled, normalize by the initial MB count instead of the
+  // current one.
+  int num_mbs;
+  if (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) {
+    num_mbs = cpi->initial_mbs;
+  } else {
+    num_mbs = cpi->common.mi_params.MBs;
+  }
+
+  const YV12_BUFFER_CONFIG *const frame_buf = cpi->unfiltered_source;
+  const uint8_t *const luma = frame_buf->y_buffer;
+  const int is_hbd = frame_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int luma_stride = frame_buf->y_stride;
+
+  // Frame dimensions in first-pass units, then converted to 8x8 blocks.
+  const BLOCK_SIZE fp_bsize = get_fp_block_size(cpi->is_screen_content_type);
+  const int unit_w = block_size_wide[fp_bsize];
+  const int unit_h = block_size_high[fp_bsize];
+  const int unit_cols = get_num_blocks(frame_buf->y_crop_width, unit_w);
+  const int unit_rows = get_num_blocks(frame_buf->y_crop_height, unit_h);
+  const int cols_8x8 = unit_cols * (unit_w / 8);
+  const int rows_8x8 = unit_rows * (unit_h / 8);
+
+  const int64_t wavelet_energy = av1_haar_ac_sad_mxn_uint8_input(
+      luma, luma_stride, is_hbd, rows_8x8, cols_8x8);
+
+  cpi->twopass_frame.frame_avg_haar_energy =
+      log1p((double)wavelet_energy / num_mbs);
+}
+#endif
+
+// Declared locally rather than pulled in from a header; the definition is not
+// in this part of the file. NOTE(review): presumably a debug helper that dumps
+// a FRAME_CONTEXT to the named file - confirm against its definition.
+extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
+ const char *filename);
+
+/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack
+ * the bitstream
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] size Bitstream size
+ * \param[in] dest Bitstream output
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ FeatureFlags *const features = &cm->features;
+ const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ assert(cpi->source != NULL);
+ cpi->td.mb.e_mbd.cur_buf = cpi->source;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ calculate_frame_avg_haar_energy(cpi);
+#endif
+
+ // frame type has been decided outside of this function call
+ cm->cur_frame->frame_type = current_frame->frame_type;
+
+ cm->tiles.large_scale = tile_cfg->enable_large_scale_tile;
+ cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding;
+
+ features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+ // features->allow_ref_frame_mvs needs to be written into the frame header
+ // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case
+ // is separated from frame_might_allow_ref_frame_mvs().
+ features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
+
+ features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion &&
+ frame_might_allow_warped_motion(cm);
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (frame_is_intra_only(cm)) {
+ cpi->frames_since_last_update = 0;
+ }
+
+ if (frame_is_sframe(cm)) {
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ // S frame will wipe out any previously encoded altref so we cannot place
+ // an overlay frame
+ gf_group->update_type[gf_group->size] = GF_UPDATE;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_encode_param(
+ &cpi->rc_log, frame_coding_idx, 1, 255,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index]);
+ }
+#endif
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ if (seq_params->frame_id_numbers_present_flag &&
+ current_frame->frame_type == KEY_FRAME) {
+ // Displaying a forward key-frame, so reset the ref buffer IDs
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ for (int i = 0; i < REF_FRAMES; i++)
+ cm->ref_frame_id[i] = display_frame_id;
+ }
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+ // for the purpose to verify no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ av1_set_target_rate(cpi, cm->width, cm->height);
+
+ if (is_psnr_calc_enabled(cpi)) {
+ cpi->source =
+ realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
+ cm->cur_frame->buf.y_crop_height);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result =
+ &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ update_counters_for_show_frame(cpi);
+ return AOM_CODEC_OK;
+ }
+
+ // Work out whether to force_integer_mv this frame
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->common.features.allow_screen_content_tools &&
+ !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (cpi->common.seq_params->force_integer_mv == 2) {
+ // Adaptive mode: see what previous frame encoded did
+ if (cpi->unscaled_last_source != NULL) {
+ features->cur_frame_force_integer_mv = av1_is_integer_mv(
+ cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv =
+ cpi->common.seq_params->force_integer_mv;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+
+ // This is used by av1_pack_bitstream. So this needs to be set in case of
+ // row-mt where the encoding code will use a temporary structure.
+ cpi->td.mb.e_mbd.cur_frame_force_integer_mv =
+ cpi->common.features.cur_frame_force_integer_mv;
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ // Set various flags etc to special state if it is a key frame.
+ if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+ }
+ if (tile_cfg->mtu == 0) {
+ cpi->num_tg = tile_cfg->num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cpi->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+
+ // For 1 pass CBR mode: check if we are dropping this frame.
+ if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+ // Always drop for spatial enhancement layer if layer bandwidth is 0.
+ // Otherwise check for frame-dropping based on buffer level in
+ // av1_rc_drop_frame().
+ if ((cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi)) {
+ cpi->is_dropped_frame = true;
+ }
+ if (cpi->is_dropped_frame) {
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+ av1_rc_postencode_update_drop_frame(cpi);
+ release_scaled_references(cpi);
+ cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true;
+ // A dropped frame might not be shown but it always takes a slot in the gf
+ // group. Therefore, even when it is not shown, we still need to update
+ // the relevant frame counters.
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+ return AOM_CODEC_OK;
+ }
+ }
+
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_mb_ssim_rdmult_scaling(cpi);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP &&
+ !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+ if (av1_set_saliency_map(cpi) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+#if !CONFIG_REALTIME_ONLY
+ double motion_ratio = av1_setup_motion_ratio(cpi);
+#else
+ double motion_ratio = 1.0;
+#endif
+ if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_mb_vmaf_rdmult_scaling(cpi);
+ }
+#endif
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode == 0) {
+ av1_init_mb_wiener_var_buffer(cpi);
+ av1_set_mb_wiener_variance(cpi);
+ }
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ av1_init_mb_ur_var_buffer(cpi);
+ av1_set_mb_ur_variance(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+ /* quasi-random initialization of current_frame_id for a key frame */
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+ }
+ cm->current_frame_id =
+ ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
+
+ // S_frame is meant for stitching together streams of different
+ // resolutions, so current_frame_id must be the same across the
+ // different streams of the same content rather than random.
+ // 0x37 is the number chosen as the starting point.
+ if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
+ } else {
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+ (1 << seq_params->frame_id_length);
+ }
+ }
+
+ switch (oxcf->algo_cfg.cdf_update_mode) {
+ case 0: // No CDF update for any frames(4~6% compression loss).
+ features->disable_cdf_update = 1;
+ break;
+ case 1: // Enable CDF update for all frames.
+ if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
+ cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
+ features->disable_cdf_update = 1;
+ else if (cpi->sf.rt_sf.selective_cdf_update)
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ else
+ features->disable_cdf_update = 0;
+ break;
+ case 2:
+ // Strategically determine at which frames to do CDF update.
+ // Currently only enable CDF update for all-intra and no-show frames(1.5%
+ // compression loss) for good quality or allintra mode.
+ if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+ features->disable_cdf_update =
+ (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+ } else {
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ }
+ break;
+ }
+
+ // Disable cdf update for the INTNL_ARF_UPDATE frame with
+ // frame_parallel_level 1.
+ if (!cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+ features->disable_cdf_update = 1;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) {
+ // Flush any stale global motion information, which may be left over
+ // from a previous frame
+ aom_invalidate_pyramid(cpi->source->y_pyramid);
+ av1_invalidate_corner_list(cpi->source->corners);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ int largest_tile_id = 0;
+ if (av1_superres_in_recode_allowed(cpi)) {
+ if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ const aom_superres_mode orig_superres_mode = cpi->superres_mode; // save
+ cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
+ if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+ &largest_tile_id) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ cpi->superres_mode = orig_superres_mode; // restore
+ }
+
+ // Update reference frame ids for reference frames this frame will overwrite
+ if (seq_params->frame_id_numbers_present_flag) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((current_frame->refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.num_encoded_top_layer++;
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.enabled) {
+ if (cm->seg.update_map == 0 && cm->last_frame_seg_map) {
+ memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
+ cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+ sizeof(*cm->cur_frame->seg_map));
+ }
+ }
+
+ int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+ release_scaled_refs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (release_scaled_refs ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+ // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+ // for the purpose to verify no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+ if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ *cm->fc = cpi->tile_data[largest_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+ if (!cm->tiles.large_scale) {
+ cm->cur_frame->frame_context = *cm->fc;
+ }
+
+ if (tile_cfg->enable_ext_tile_debug) {
+ // (yunqing) This test ensures the correctness of large scale tile coding.
+ if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
+ char fn[20] = "./fc";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+ }
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (cm->features.disable_cdf_update) {
+ cpi->frames_since_last_update++;
+ } else {
+ cpi->frames_since_last_update = 1;
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers;
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+// Encodes a single frame, or runs the first pass when the encoder is in the
+// stats-generation stage.
+//
+// The per-frame inputs (frame_input) and parameters (frame_params) are copied
+// into 'cpi' and its common state, the order hints and pyramid level of the
+// current frame are derived, and the call is then dispatched on the stage:
+//  - stats generation: first-pass processing (compiled out when
+//    CONFIG_REALTIME_ONLY is set);
+//  - one-pass, or second pass and later: encode_frame_to_data_rate(), which
+//    writes the compressed frame to 'dest' and its size to
+//    frame_results->size.
+//
+// Returns AOM_CODEC_OK on success. Returns AOM_CODEC_ERROR when
+// encode_frame_to_data_rate() fails or when cpi->oxcf.pass is none of the
+// values handled above.
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  // Source buffers for this frame.
+  cpi->source = frame_input->source;
+  cpi->unscaled_source = frame_input->source;
+  cpi->unscaled_last_source = frame_input->last_source;
+
+  // Frame parameters that live in the common (codec-level) state.
+  current_frame->frame_type = frame_params->frame_type;
+  current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+  cm->show_frame = frame_params->show_frame;
+  cm->show_existing_frame = frame_params->show_existing_frame;
+  cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+  cm->features.primary_ref_frame = frame_params->primary_ref_frame;
+  memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+         REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+  // Frame parameters that live in the encoder instance.
+  cpi->speed = frame_params->speed;
+  cpi->ref_frame_flags = frame_params->ref_frame_flags;
+  cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+  memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
+         sizeof(cpi->refresh_frame));
+
+  // A key frame that resets the reference buffers restarts frame numbering.
+  if (current_frame->frame_type == KEY_FRAME &&
+      cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+    current_frame->frame_number = 0;
+  }
+
+  // display_order_hint keeps the full value; order_hint is reduced modulo
+  // 2^(order_hint_bits_minus_1 + 1).
+  current_frame->order_hint =
+      current_frame->frame_number + frame_params->order_offset;
+  current_frame->display_order_hint = current_frame->order_hint;
+  current_frame->order_hint =
+      current_frame->order_hint %
+      (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1));
+
+  current_frame->pyramid_level = get_true_pyr_level(
+      cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+      current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (cpi->oxcf.q_cfg.use_fixed_qp_offsets) {
+      av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+    } else {
+      av1_first_pass(cpi, frame_input->ts_duration);
+    }
+#endif
+    return AOM_CODEC_OK;
+  }
+
+  // Only one-pass encoding and the second (or a later) pass are handled here.
+  if (cpi->oxcf.pass != AOM_RC_ONE_PASS &&
+      cpi->oxcf.pass < AOM_RC_SECOND_PASS) {
+    return AOM_CODEC_ERROR;
+  }
+
+  if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+      AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+// Runs aom_denoise_and_model_run() on 'sd' and, when it reports success and
+// the resulting film grain parameters have apply_grain set, appends those
+// parameters to cpi->film_grain_table for the [time_stamp, end_time] range.
+//
+// cpi->denoise_and_model and cpi->film_grain_table are allocated on first
+// use; the table is zero-filled after allocation.
+//
+// Returns 0 on success, or -1 after reporting AOM_CODEC_MEM_ERROR through
+// aom_set_error() when either allocation fails.
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (cpi->denoise_and_model == NULL) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params->bit_depth, block_size, noise_level);
+    if (cpi->denoise_and_model == NULL) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating denoise and model");
+      return -1;
+    }
+  }
+
+  if (cpi->film_grain_table == NULL) {
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    if (cpi->film_grain_table == NULL) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating grain table");
+      return -1;
+    }
+    memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+  }
+
+  // apply_grain is only inspected after the run call, which receives
+  // &cm->film_grain_params.
+  if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+                                &cm->film_grain_params,
+                                cpi->oxcf.enable_dnl_denoising) &&
+      cm->film_grain_params.apply_grain) {
+    aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+                                &cm->film_grain_params);
+  }
+  return 0;
+}
+#endif
+
+// Accepts one raw source frame 'sd' covering [time_stamp, end_time].
+//
+// Depending on the build configuration and encoder settings the frame may
+// first be VMAF-preprocessed and/or denoised; it is then pushed into the
+// lookahead queue, and finally the sequence profile is checked against the
+// frame's chroma subsampling.
+//
+// Returns 0 on success and -1 if denoising, the lookahead push, or any
+// profile check failed. A failure does not stop processing: the remaining
+// steps in this function still run and the result is reported at the end.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+#if CONFIG_TUNE_VMAF
+ // VMAF preprocessing is skipped in the stats-generation stage.
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+ av1_vmaf_frame_preprocessing(cpi, sd);
+ }
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+ av1_vmaf_blk_preprocessing(cpi, sd);
+ }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ // Time spent in this function is added to ppi->total_time_receive_data
+ // after the lookahead push below.
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
+#if CONFIG_DENOISE
+ // even if denoise_noise_level is > 0, we don't need to denoise on pass
+ // 1 of 2 if enable_dnl_denoising is disabled since the 2nd pass will be
+ // encoding the original (non-denoised) frame
+ if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+ !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+ // Choose a synthetic noise level for still images for enhanced perceptual
+ // quality based on an estimated noise level in the source, but only if
+ // the noise level is set on the command line to > 0.
+ if (cpi->oxcf.mode == ALLINTRA) {
+ // No noise synthesis if source is very clean.
+ // Uses a low edge threshold to focus on smooth areas.
+ // Increase output noise setting a little compared to measured value.
+ // Note: this overwrites cpi->oxcf.noise_level with
+ // min(5.0, max(0, estimate - 0.1) + 0.5 if that is > 0).
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, 16);
+ cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
+ cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
+ if (cpi->oxcf.noise_level > 0.0) {
+ cpi->oxcf.noise_level += (float)0.5;
+ }
+ cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level);
+ }
+#endif
+
+ if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+ cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+ res = -1;
+ }
+#endif // CONFIG_DENOISE
+
+ // A non-zero return from av1_lookahead_push() is treated as failure.
+ if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
+ use_highbitdepth, cpi->image_pyramid_levels,
+ frame_flags)) {
+ aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed");
+ res = -1;
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&timer);
+ cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
+#endif
+
+ // Note: Regarding profile setting, the following checks are added to help
+ // choose a proper profile for the input video. The criterion is that all
+ // bitstreams must be designated as the lowest profile that match its content.
+ // E.G. A bitstream that contains 4:4:4 video must be designated as High
+ // Profile in the seq header, and likewise a bitstream that contains 4:2:2
+ // bitstream must be designated as Professional Profile in the sequence
+ // header.
+ // Profile 0: non-monochrome input must have subsampling_x == 1 and
+ // subsampling_y == 1.
+ if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 2");
+ res = -1;
+ }
+ // Profile 1: both subsampling factors must be 0.
+ if ((seq_params->profile == PROFILE_1) &&
+ !(subsampling_x == 0 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 1 requires 4:4:4 color format");
+ res = -1;
+ }
+ // Profile 2 at bit depth <= 10: subsampling_x must be 1 and
+ // subsampling_y must be 0.
+ if ((seq_params->profile == PROFILE_2) &&
+ (seq_params->bit_depth <= AOM_BITS_10) &&
+ !(subsampling_x == 1 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
+ res = -1;
+ }
+
+ return res;
+}
+
+#if CONFIG_ENTROPY_STATS
+// Dumps the aggregated frame context (ppi->aggregate_fc) as raw bytes to the
+// file "counts.stt" in the current working directory.
+//
+// Nothing is written when ppi->cpi is NULL, when the configured pass is 1, or
+// when the current frame number is still 0. If the file cannot be opened or
+// the write is short, a message is printed to stderr and the function
+// returns without touching the file further.
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+  if (!ppi->cpi) return;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    fprintf(stderr, "Writing counts.stt\n");
+    FILE *f = fopen("counts.stt", "wb");
+    // fopen() returns NULL on failure (e.g. read-only directory); passing
+    // that to fwrite()/fclose() would be undefined behavior.
+    if (f == NULL) {
+      fprintf(stderr, "Unable to open counts.stt for writing\n");
+      return;
+    }
+    if (fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f) != 1) {
+      fprintf(stderr, "Error writing counts.stt\n");
+    }
+    fclose(f);
+  }
+}
+#endif // CONFIG_ENTROPY_STATS
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+// Folds one frame's measurements into the running statistics in 's':
+// 'y', 'u', 'v' and 'all' are added to the matching s->stat[] totals, and
+// s->worst is lowered to 'all' when 'all' is smaller.
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  // Track the minimum combined value seen so far.
+  s->worst = AOMMIN(s->worst, all);
+
+  // Running sums for the combined value and each plane.
+  s->stat[STAT_ALL] += all;
+  s->stat[STAT_Y] += y;
+  s->stat[STAT_U] += u;
+  s->stat[STAT_V] += v;
+}
+
+// Accumulates per-frame statistics for the frame just encoded into 'ppi'
+// (and the byte count into cpi->bytes) for later reporting.
+//
+// frame_bytes is added to cpi->bytes. For shown frames the function
+// compares cpi->source against the reconstructed frame and updates, as
+// enabled: PSNR and SSIM totals (b_calculate_psnr), blockiness and
+// consistency (b_calculate_blockiness / b_calculate_consistency), and
+// always the fast-SSIM and PSNR-HVS totals.
+//
+// Returns early, without updating anything, when SVC is in use and the
+// current spatial layer is not the top one, and (with
+// CONFIG_INTER_STATS_ONLY) for key frames.
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+ AV1_PRIMARY *const ppi = cpi->ppi;
+ AV1_COMMON *const cm = &cpi->common;
+ // Sample count of the current frame; only set in the PSNR branch below.
+ double samples = 0.0;
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+ if (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return;
+
+#if CONFIG_INTER_STATS_ONLY
+ if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame
+#endif
+ cpi->bytes += frame_bytes;
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ double y, u, v, frame_all;
+
+ // Both counters advance for every shown frame; index 1 is used as the
+ // divisor for the psnr[1] (stream bit depth) averages when reporting.
+ ppi->count[0]++;
+ ppi->count[1]++;
+ if (cpi->ppi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double weight[2] = { 0.0, 0.0 };
+ double frame_ssim2[2] = { 0.0, 0.0 };
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ // psnr.psnr[] is passed as [1..3] = Y/U/V and [0] = combined.
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &(ppi->psnr[0]));
+ ppi->total_sq_error[0] += psnr.sse[0];
+ ppi->total_samples[0] += psnr.samples[0];
+ samples = psnr.samples[0];
+
+ aom_calc_ssim(orig, recon, bit_depth, in_bit_depth,
+ cm->seq_params->use_highbitdepth, weight, frame_ssim2);
+
+ ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]);
+ ppi->summed_quality += frame_ssim2[0] * weight[0];
+ ppi->summed_weights += weight[0];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Compute PSNR based on stream bit depth
+ if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (in_bit_depth < bit_depth)) {
+ adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
+ psnr.psnr_hbd[0], &ppi->psnr[1]);
+ ppi->total_sq_error[1] += psnr.sse_hbd[0];
+ ppi->total_samples[1] += psnr.samples_hbd[0];
+
+ ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]);
+ ppi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+ ppi->summed_weights_hbd += weight[1];
+ }
+#endif
+
+ // Disabled debug dump of per-frame PSNR/SSIM values.
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ double y2 = psnr.psnr[1];
+ double u2 = psnr.psnr[2];
+ double v2 = psnr.psnr[3];
+ double frame_psnr2 = psnr.psnr[0];
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cm->current_frame.frame_number, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (ppi->b_calculate_blockiness) {
+ // Blockiness is only measured for non-high-bitdepth sequences.
+ if (!cm->seq_params->use_highbitdepth) {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness);
+ ppi->total_blockiness += frame_blockiness;
+ }
+
+ // NOTE(review): the consistency metric is nested inside the blockiness
+ // check, so it is only computed when b_calculate_blockiness is also
+ // set - confirm this is intended.
+ if (ppi->b_calculate_consistency) {
+ if (!cm->seq_params->use_highbitdepth) {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1);
+
+ // NOTE(review): 'consistency' is derived from total_inconsistency
+ // before this frame's value is added, and 'samples' is 0.0 unless
+ // the PSNR branch above ran - confirm both are intended.
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, ppi->total_inconsistency);
+ if (consistency > 0.0)
+ ppi->worst_consistency =
+ AOMMIN(ppi->worst_consistency, consistency);
+ ppi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ // Fast SSIM and PSNR-HVS are accumulated for every shown frame.
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs);
+ }
+}
+
+// Appends a summary of the accumulated encode statistics to "opsnr.stt" in
+// the current working directory and then frees ppi->ssim_vars.
+//
+// The summary is two lines (tab/space separated column headings, then the
+// matching values) and is only produced when ppi->b_calculate_psnr is set.
+// Nothing happens when ppi->cpi is NULL, when the configured pass is 1, or
+// when the current frame number is still 0.
+//
+// If the file cannot be opened, a message is printed to stderr and the
+// summary is skipped; ppi->ssim_vars is still released.
+void print_internal_stats(AV1_PRIMARY *ppi) {
+  if (!ppi->cpi) return;
+  AV1_COMP *const cpi = ppi->cpi;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    char headings[512] = { 0 };
+    char results[512] = { 0 };
+    FILE *f = fopen("opsnr.stt", "a");
+    // fopen() returns NULL on failure; every use of 'f' below is guarded so
+    // that a NULL stream is never passed to fprintf()/fclose().
+    if (f == NULL) {
+      fprintf(stderr, "Unable to open opsnr.stt for appending\n");
+    }
+    double time_encoded =
+        (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
+        10000000.000;
+    double total_encode_time =
+        (ppi->total_time_receive_data + ppi->total_time_compress_data) /
+        1000.000;
+    // Achieved data rate: total bits / 1000 per unit of time_encoded.
+    const double dr =
+        (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded;
+    const double peak =
+        (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1);
+    const double target_rate =
+        (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000;
+    // Rate error as a percentage of the target rate.
+    const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+    if (f != NULL && ppi->b_calculate_psnr) {
+      const double total_psnr = aom_sse_to_psnr(
+          (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]);
+      const double total_ssim =
+          100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0);
+      snprintf(headings, sizeof(headings),
+               "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+               "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+               "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+               "AVPsrnY\tAPsnrCb\tAPsnrCr");
+      snprintf(results, sizeof(results),
+               "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f",
+               dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               total_ssim, total_ssim,
+               ppi->fastssim.stat[STAT_ALL] / ppi->count[0],
+               ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst,
+               ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst,
+               ppi->psnr[0].stat[STAT_Y] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_U] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_V] / ppi->count[0]);
+
+      if (ppi->b_calculate_blockiness) {
+        SNPRINT(headings, "\t Block\tWstBlck");
+        SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness);
+      }
+
+      if (ppi->b_calculate_consistency) {
+        double consistency =
+            aom_sse_to_psnr((double)ppi->total_samples[0], peak,
+                            (double)ppi->total_inconsistency);
+
+        SNPRINT(headings, "\tConsist\tWstCons");
+        SNPRINT2(results, "\t%7.3f", consistency);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_consistency);
+      }
+
+      SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+      SNPRINT2(results, "\t%8.0f", total_encode_time);
+      SNPRINT2(results, " %7.2f", rate_err);
+      SNPRINT2(results, " %7.2f", fabs(rate_err));
+
+      // Weighted average PSNR with Y:U:V weights of 6:1:1.
+      SNPRINT(headings, "\tAPsnr611");
+      SNPRINT2(results, " %7.3f",
+               (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] +
+                ppi->psnr[0].stat[STAT_V]) /
+                   (ppi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth;
+      const uint32_t bit_depth = ppi->seq_params.bit_depth;
+      // Since cpi->source->flags is not available here, but total_samples[1]
+      // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was
+      // true in compute_internal_stats
+      if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) {
+        const double peak_hbd = (double)((1 << bit_depth) - 1);
+        const double total_psnr_hbd =
+            aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd,
+                            (double)ppi->total_sq_error[1]);
+        const double total_ssim_hbd =
+            100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0);
+        SNPRINT(headings,
+                "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+                " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+                " AOMSSIMH VPSSIMPH WstSsimH");
+        SNPRINT2(results, "\t%7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", total_psnr_hbd);
+        SNPRINT2(results, " %7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", total_psnr_hbd);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].worst);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd);
+      }
+#endif
+      fprintf(f, "%s\n", headings);
+      fprintf(f, "%s\n", results);
+    }
+
+    if (f != NULL) fclose(f);
+
+    // Released regardless of whether the summary could be written.
+    aom_free(ppi->ssim_vars);
+    ppi->ssim_vars = NULL;
+  }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+// Advances the key-frame related counters after a frame has been encoded.
+//
+// Does nothing unless the frame is shown and cpi->rc.frames_to_key is
+// non-zero. Otherwise the first-pass stats cursor is moved forward (unless
+// built with CONFIG_REALTIME_ONLY) and, for the top spatial layer only,
+// frames_since_key is incremented while frames_to_key and frames_to_fwd_kf
+// are decremented.
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (!cpi->common.show_frame || !cpi->rc.frames_to_key) return;
+
+#if !CONFIG_REALTIME_ONLY
+  FIRSTPASS_INFO *const fp_info = &cpi->ppi->twopass.firstpass_info;
+  if (fp_info->past_stats_count <= FIRSTPASS_INFO_STATS_PAST_MIN) {
+    // When there is not enough past stats, we move the current
+    // index without popping the past stats
+    av1_firstpass_info_move_cur_index(fp_info);
+  } else {
+    av1_firstpass_info_move_cur_index_and_pop(fp_info);
+  }
+#endif
+
+  // The counters below are only advanced on the top spatial layer.
+  if (cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1) return;
+
+  cpi->rc.frames_since_key++;
+  cpi->rc.frames_to_key--;
+  cpi->rc.frames_to_fwd_kf--;
+}
+
+// Counts down cpi->rc.frames_till_gf_update_due (never below zero) when the
+// frame is shown or is_frame_droppable() reports it as droppable.
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (!cpi->common.show_frame &&
+      !is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
+    return;
+  }
+
+  // Already at (or below) zero: nothing left to count down.
+  if (cpi->rc.frames_till_gf_update_due <= 0) return;
+
+  cpi->rc.frames_till_gf_update_due--;
+}
+
+// Advances cpi->gf_frame_index so it is ready for the next frame.
+//
+// For one-pass real-time parameters on the top spatial layer the index wraps
+// back to 0 when it reaches MAX_STATIC_GF_GROUP_LENGTH; in every other case
+// it is simply incremented.
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Decide on wrapping before the index is touched, matching the order in
+  // which the condition and the increment were originally evaluated.
+  const int wrap_at_max =
+      is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1;
+
+  ++cpi->gf_frame_index;
+
+  if (wrap_at_max && cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH) {
+    cpi->gf_frame_index = 0;
+  }
+}
+
+// Refreshes fb_of_context_type[], the per-reference-type record of which
+// frame buffer slot holds the context for that type (see encoder.h for the
+// full explanation).
+//
+// Step 1: for intra-only frames, error-resilient frames, or when
+// use_primary_ref_none is set, every entry is reset to -1 and the entry for
+// the current frame's reference type is pointed at the GOLDEN_FRAME map
+// index (shown frame) or the ALTREF_FRAME map index (hidden frame).
+// Step 2: unless an existing frame is being shown, the entry for the current
+// reference type is set to 0 for key frames, or otherwise to the lowest slot
+// that this frame refreshes (left unchanged if it refreshes none).
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+                                      int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int ref_type = get_current_frame_ref_type(cpi);
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    for (int slot = 0; slot < REF_FRAMES; ++slot) {
+      fb_of_context_type[slot] = -1;
+    }
+    fb_of_context_type[ref_type] = get_ref_frame_map_idx(
+        cm, cm->show_frame ? GOLDEN_FRAME : ALTREF_FRAME);
+  }
+
+  if (encode_show_existing_frame(cm)) return;
+
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    // All ref frames are refreshed, pick one that will live long enough
+    fb_of_context_type[ref_type] = 0;
+    return;
+  }
+
+  // If more than one frame is refreshed, it doesn't matter which one we
+  // pick so pick the first. LST sometimes doesn't refresh any: this is ok
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    if ((cm->current_frame.refresh_frame_flags >> slot) & 1) {
+      fb_of_context_type[ref_type] = slot;
+      return;
+    }
+  }
+}
+
+// Updates the rate-control counters after a frame has been encoded: the
+// key-frame counters, the golden-frame countdown, and finally the GF group
+// index. Note that the last call advances cpi->gf_frame_index, so callers
+// that need the index of the frame just encoded must account for that.
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ update_gf_group_index(cpi);
+}
+
+// Copies end-of-frame state from the frame-level encoder (cpi) to the
+// primary encoder (cpi->ppi).
+//
+// The loop filter levels are copied only when do_frame_data_update is set
+// and the frame is not a show-existing frame; the motion vector statistics
+// are copied unconditionally.
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+
+  // Store current frame loopfilter levels in ppi, if update flag is set.
+  if (cpi->do_frame_data_update && !cpi->common.show_existing_frame) {
+    const struct loopfilter *const lf = &cpi->common.lf;
+    ppi->filter_level[0] = lf->filter_level[0];
+    ppi->filter_level[1] = lf->filter_level[1];
+    ppi->filter_level_u = lf->filter_level_u;
+    ppi->filter_level_v = lf->filter_level_v;
+  }
+
+  // Store frame level mv_stats from cpi to ppi.
+  ppi->mv_stats = cpi->mv_stats;
+}
+
+// Updates frame level stats related to global motion.
+//
+// Determines whether any reference frame of the current frame has a
+// non-IDENTITY global motion model and merges that flag into
+// ppi->valid_gm_model_found[] for the current frame update type. An entry
+// that still holds INT32_MAX is overwritten with the flag; any other value
+// has the flag OR-ed into it.
+//
+// With CONFIG_FPMT_TEST and PARALLEL_SIMULATION_ENCODE the flag is merged
+// into ppi->temp_valid_gm_model_found[] instead, and the whole temporary
+// array is copied to valid_gm_model_found[] when do_frame_data_update is 1
+// and the frame is not an internal overlay followed by a
+// frame_parallel_level 2 frame.
+static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  const FRAME_UPDATE_TYPE update_type =
+      ppi->gf_group.update_type[cpi->gf_frame_index];
+
+  // Check if the current frame has any valid global motion model across its
+  // reference frames; the scan stops at the first one found.
+  int is_gm_present = 0;
+  for (int ref = 0; ref < REF_FRAMES && !is_gm_present; ++ref) {
+    is_gm_present = cpi->common.global_motion[ref].wmtype != IDENTITY;
+  }
+
+  int update_actual_stats = 1;
+#if CONFIG_FPMT_TEST
+  if (ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    update_actual_stats = 0;
+
+    if (ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) {
+      ppi->temp_valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      ppi->temp_valid_gm_model_found[update_type] |= is_gm_present;
+    }
+
+    const int show_existing_between_parallel_frames =
+        (update_type == INTNL_OVERLAY_UPDATE &&
+         ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+    if (cpi->do_frame_data_update == 1 &&
+        !show_existing_between_parallel_frames) {
+      for (int type = 0; type < FRAME_UPDATE_TYPES; type++) {
+        ppi->valid_gm_model_found[type] =
+            ppi->temp_valid_gm_model_found[type];
+      }
+    }
+  }
+#endif
+  if (update_actual_stats) {
+    if (ppi->valid_gm_model_found[update_type] == INT32_MAX) {
+      ppi->valid_gm_model_found[update_type] = is_gm_present;
+    } else {
+      ppi->valid_gm_model_found[update_type] |= is_gm_present;
+    }
+  }
+}
+
+// Performs the bookkeeping that follows the encoding of one frame.
+//
+// In order: updates global-motion stats, the remaining two-pass stats,
+// reference frame maps and rate control (skipped for dropped frames and in
+// the stats-generation stage), pops the lookahead entry when requested,
+// records the timestamps of the last shown frame, updates level info,
+// context-type / counter / end-of-frame state, third-pass info, SVC state,
+// emits a PSNR packet and internal stats where enabled, and finally writes
+// the per-frame second-pass info.
+//
+// cpi_data supplies the frame size, timestamps and the pop_lookahead/flush
+// flags that were produced while encoding the frame.
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  AV1_COMMON *const cm = &cpi->common;
+
+  update_gm_stats(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+  // Update the total stats remaining structure.
+  if (cpi->twopass_frame.this_frame != NULL &&
+      ppi->twopass.stats_buf_ctx->total_left_stats) {
+    subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+                   cpi->twopass_frame.this_frame);
+  }
+#endif
+
+#if CONFIG_OUTPUT_FRAME_SIZE
+  // Debug aid: append "<frame size in bits>,<base qindex>" to
+  // frame_sizes.csv. The line is silently skipped when the file cannot be
+  // opened, instead of passing a NULL stream to fprintf()/fclose().
+  FILE *f = fopen("frame_sizes.csv", "a");
+  if (f != NULL) {
+    fprintf(f, "%d,", 8 * (int)cpi_data->frame_size);
+    fprintf(f, "%d\n", cm->quant_params.base_qindex);
+    fclose(f);
+  }
+#endif  // CONFIG_OUTPUT_FRAME_SIZE
+
+  if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+    // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+    // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel
+    // encode set of lower layer frames.
+    // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+    // copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+        ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+            INTNL_ARF_UPDATE) {
+      memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+             sizeof(cm->ref_frame_map));
+    }
+    refresh_reference_frames(cpi);
+    // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+    // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+             sizeof(cm->ref_frame_map));
+    }
+    av1_rc_postencode_update(cpi, cpi_data->frame_size);
+  }
+
+  if (cpi_data->pop_lookahead == 1) {
+    av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+                      cpi->compressor_stage);
+  }
+  // Remember the time span of the most recent shown frame.
+  if (cpi->common.show_frame) {
+    cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+    cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+  }
+  if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info. at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+                          cpi_data->ts_frame_end);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+    update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+    // update_rc_counts() advances cpi->gf_frame_index.
+    update_rc_counts(cpi);
+    update_end_of_frame_stats(cpi);
+  }
+
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+    av1_pop_third_pass_info(cpi->third_pass_ctx);
+  }
+
+  if (ppi->rtc_ref.set_ref_frame_config) {
+    av1_svc_update_buffer_slot_refreshed(cpi);
+    av1_svc_set_reference_was_previous(cpi);
+  }
+
+  if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note *size = 0 indicates a dropped frame for which psnr is not calculated
+  if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  if (!is_stat_generation_stage(cpi)) {
+    compute_internal_stats(cpi, (int)cpi_data->frame_size);
+  }
+#endif  // CONFIG_INTERNAL_STATS
+
+  // Write frame info. Subtract 1 from frame index since it was incremented in
+  // update_rc_counts.
+  av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ return cm->error->error_code;
+ }
+ cm->error->setjmp = 1;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->frame_recode_hits = 0;
+ cpi->time_compress_data = 0;
+ cpi->bytes = 0;
+#endif
+#if CONFIG_ENTROPY_STATS
+ if (cpi->compressor_stage == ENCODE_STAGE) {
+ av1_zero(cpi->counts);
+ }
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads <= 1 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+
+ if (cm->seq_params->order_hint_info.enable_order_hint) {
+ aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
+ cm->show_frame);
+ } else {
+ // This is currently used in RTC encoding. cm->show_frame is always 1.
+ aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number);
+ }
+#endif
+ if (cpi->ppi->use_svc) {
+ av1_one_pass_cbr_svc_start_layer(cpi);
+ }
+
+ cpi->is_dropped_frame = false;
+ cm->showable_frame = 0;
+ cpi_data->frame_size = 0;
+ cpi->available_bs_size = cpi_data->cx_data_sz;
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer cmptimer;
+ aom_usec_timer_start(&cmptimer);
+#endif
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Normal defaults
+ cm->features.refresh_frame_context =
+ oxcf->tool_cfg.frame_parallel_decoding_mode
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (assign_cur_frame_new_fb(cm) == NULL) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to allocate new cur_frame");
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case.
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ start_timing(cpi, av1_encode_strategy_time);
+#endif
+
+ const int result = av1_encode_strategy(
+ cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags,
+ &cpi_data->ts_frame_start, &cpi_data->ts_frame_end,
+ cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ end_timing(cpi, av1_encode_strategy_time);
+
+ // Print out timing information.
+ // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+ // show_existing_frame and lag-in-frames.
+ if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) &&
+ cpi->frame_component_time[0] > 100) {
+ int i;
+ uint64_t frame_total = 0, total = 0;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ fprintf(stderr,
+ "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update "
+ "Type: %d, Q: %d\n",
+ cm->current_frame.frame_number,
+ get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame,
+ frame_update_type, cm->quant_params.base_qindex);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ // Use av1_encode_strategy_time (i = 0) as the total time.
+ if (i == 0) {
+ frame_total = cpi->frame_component_time[0];
+ total = cpi->component_time[0];
+ }
+ fprintf(stderr,
+ " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+ " us [%6.2f%%])\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ (float)((float)cpi->frame_component_time[i] * 100.0 /
+ (float)frame_total),
+ cpi->component_time[i],
+ (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+ cpi->frame_component_time[i] = 0;
+ }
+ }
+#endif
+
+ // Reset the flag to 0 afer encoding.
+ cpi->rc.use_external_qp_one_pass = 0;
+
+ if (result == -1) {
+ cm->error->setjmp = 0;
+ // Returning -1 indicates no frame encoded; more input is required
+ return -1;
+ }
+ if (result != AOM_CODEC_OK) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to encode frame");
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+ cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+ cpi->td.mb.txfm_search_info.tx_search_count = 0;
+ }
+#endif // CONFIG_SPEED_STATS
+
+ cm->error->setjmp = 0;
+ return AOM_CODEC_OK;
+}
+
+// Fills cpi->scaled_ref_buf[] for one frame of a parallel encode set and
+// records in the bitmask '*ref_buffers_used_map' which buffer-pool entries
+// those references occupy (bit i <=> cm->buffer_pool->frame_bufs[i]).
+// Bits are only ever added to the mask; the caller owns its initial value.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    // The slot for a reference frame lives at index (ref_frame - 1).
+    RefCntBuffer **const slot = &cpi->scaled_ref_buf[ref_frame - 1];
+
+    // Reference not enabled for this frame: clear the slot unless the
+    // encoder runs without a stats stage.
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+      if (!has_no_stats_stage(cpi)) *slot = NULL;
+      continue;
+    }
+
+    const YV12_BUFFER_CONFIG *const ref_yv12 =
+        get_ref_frame_yv12_buf(cm, ref_frame);
+    if (ref_yv12 == NULL) {
+      *slot = NULL;
+      continue;
+    }
+
+    // FPMT does not support scaling yet.
+    assert(ref_yv12->y_crop_width == cm->width &&
+           ref_yv12->y_crop_height == cm->height);
+
+    RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, ref_frame);
+    *slot = ref_buf;
+
+    // Flag the pool entry (or entries) that this reference points at.
+    for (int idx = 0; idx < cm->buffer_pool->num_frame_bufs; ++idx) {
+      if (&cm->buffer_pool->frame_bufs[idx] != ref_buf) continue;
+      *ref_buffers_used_map |= (1 << idx);
+    }
+  }
+}
+
+// Takes one additional reference on every frame buffer whose bit is set in
+// 'ref_buffers_used_map' (bit i corresponds to buffer_pool->frame_bufs[i]).
+// Used for the buffers referenced by the frames of a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map) {
+  for (int idx = 0; idx < buffer_pool->num_frame_bufs; ++idx) {
+    const int in_use = ref_buffers_used_map & (1 << idx);
+    if (!in_use) continue;
+    buffer_pool->frame_bufs[idx].ref_count++;
+  }
+}
+
+// Clears every non-NULL entry of cpi->scaled_ref_buf[] for a frame of a
+// parallel encode set. No reference counts are touched here; see
+// av1_decrement_ref_counts_fpmt() for that.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+    if (cpi->scaled_ref_buf[ref] == NULL) continue;
+    cpi->scaled_ref_buf[ref] = NULL;
+  }
+}
+
+// Drops one reference from every frame buffer whose bit is set in
+// 'ref_buffers_used_map' (bit i corresponds to buffer_pool->frame_bufs[i]);
+// the counterpart of av1_increment_scaled_ref_counts_fpmt().
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map) {
+  for (int idx = 0; idx < buffer_pool->num_frame_bufs; ++idx) {
+    const int in_use = ref_buffers_used_map & (1 << idx);
+    if (!in_use) continue;
+    buffer_pool->frame_bufs[idx].ref_count--;
+  }
+}
+
+// Copies the screen-content decisions of the primary context (ppi->cpi) into
+// the parallel frame contexts ppi->parallel_cpi[1 .. num_fp_contexts - 1].
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi) {
+  const AV1_COMP *const src = ppi->cpi;
+
+  for (int ctx = 1; ctx < ppi->num_fp_contexts; ++ctx) {
+    AV1_COMP *const dst = ppi->parallel_cpi[ctx];
+
+    // Encoder-level decisions.
+    dst->is_screen_content_type = src->is_screen_content_type;
+    dst->use_screen_content_tools = src->use_screen_content_tools;
+
+    // Frame-level feature flags.
+    dst->common.features.allow_intrabc = src->common.features.allow_intrabc;
+    dst->common.features.allow_screen_content_tools =
+        src->common.features.allow_screen_content_tools;
+  }
+}
+
+// Retrieves the result of a frame that was already encoded ahead of time by
+// one of the parallel contexts.
+//
+// The parallel context whose gf_frame_index equals that of ppi->cpi is swapped
+// into ppi->cpi / ppi->parallel_cpi[0], and its stored output
+// (ppi->parallel_frames_data[]) is copied into 'first_cpi_data': lib_flags,
+// timestamps, the compressed bytes and their size. pop_lookahead is set to 1
+// when the selected frame is a shown frame. Returns the new ppi->cpi.
+//
+// NOTE(review): a missing match and an undersized cx_data buffer are only
+// caught by assert(); confirm callers guarantee both in non-debug builds.
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data) {
+  // Find the first parallel context (index >= 1) that processed the current
+  // gf_frame_index ahead of time; 0 means "not found".
+  int match = 0;
+  for (int ctx = 1; ctx < ppi->num_fp_contexts && match == 0; ++ctx) {
+    if (ppi->parallel_cpi[ctx]->gf_frame_index == ppi->cpi->gf_frame_index) {
+      match = ctx;
+    }
+  }
+
+  assert(match > 0);
+  assert(!ppi->parallel_cpi[match]->common.show_existing_frame);
+
+  // Release the previously-used frame-buffer.
+  AV1_COMMON *const prev_cm = &ppi->cpi->common;
+  if (prev_cm->cur_frame != NULL) {
+    --prev_cm->cur_frame->ref_count;
+    prev_cm->cur_frame = NULL;
+  }
+
+  // Swap the matching context with parallel_cpi[0] and make it current.
+  AV1_COMP *const selected = ppi->parallel_cpi[match];
+  ppi->parallel_cpi[match] = ppi->parallel_cpi[0];
+  ppi->parallel_cpi[0] = selected;
+  ppi->cpi = selected;
+
+  // Hand the stored output of that context to the caller. The output of
+  // parallel_cpi[k] is kept in parallel_frames_data[k - 1].
+  const AV1_COMP_DATA *const stored = &ppi->parallel_frames_data[match - 1];
+  assert(stored->frame_size > 0);
+  assert(first_cpi_data->cx_data_sz > stored->frame_size);
+
+  memcpy(first_cpi_data->cx_data, stored->cx_data, stored->frame_size);
+  first_cpi_data->frame_size = stored->frame_size;
+  first_cpi_data->lib_flags = stored->lib_flags;
+  first_cpi_data->ts_frame_start = stored->ts_frame_start;
+  first_cpi_data->ts_frame_end = stored->ts_frame_end;
+  if (selected->common.show_frame) first_cpi_data->pop_lookahead = 1;
+
+  return selected;
+}
+
+/* Initialises the encoder contexts of the frames belonging to a parallel
+ * encode set.
+ *
+ * ppi->cpi encodes the frame_parallel_level 1 frame at its current
+ * gf_frame_index (asserted below). The frame_parallel_level 2 entries of
+ * ppi->gf_group that follow it are assigned, in order, to
+ * ppi->parallel_cpi[1], [2], ... and their output descriptors to
+ * ppi->parallel_frames_data[0], [1], ...
+ *
+ * first_cpi_data: data of the level 1 frame; its timestamp_ratio and flush
+ *     (and, for a set that starts with an INTNL_ARF_UPDATE frame, its
+ *     lib_flags) are copied to the level 2 frames.
+ * ppi: primary encoder instance holding the GF group, the lookahead and the
+ *     parallel contexts.
+ * ref_buffers_used_map: bitmask that must be 0 on entry (asserted). It is
+ *     filled by av1_scale_references_fpmt() for every frame of the set, and
+ *     the ref_count of every flagged buffer is incremented before returning.
+ *
+ * Returns the number of frames in the set, the level 1 frame included.
+ */
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map) {
+ AV1_COMP *const first_cpi = ppi->cpi;
+ GF_GROUP *const gf_group = &ppi->gf_group;
+ int gf_index_start = first_cpi->gf_frame_index;
+ assert(gf_group->frame_parallel_level[gf_index_start] == 1);
+ int parallel_frame_count = 0; // Frames added to the set so far.
+ // Running counters, seeded from the level 1 frame and advanced in the loop
+ // below for each frame that follows a frame with arf_src_offset == 0.
+ int cur_frame_num = first_cpi->common.current_frame.frame_number;
+ int show_frame_count = first_cpi->frame_index_set.show_frame_count;
+ int frames_since_key = first_cpi->rc.frames_since_key;
+ int frames_to_key = first_cpi->rc.frames_to_key;
+ int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf;
+ int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start];
+ const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in;
+
+ assert(*ref_buffers_used_map == 0);
+
+ // Release the previously used frame-buffer by a frame_parallel_level 1 frame.
+ if (first_cpi->common.cur_frame != NULL) {
+ --first_cpi->common.cur_frame->ref_count;
+ first_cpi->common.cur_frame = NULL;
+ }
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; // Working copy, edited below.
+ RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES]; // Unmodified snapshot.
+ init_ref_map_pair(first_cpi, first_ref_frame_map_pairs);
+ memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs,
+ sizeof(RefFrameMapPair) * REF_FRAMES);
+
+ // Store the reference refresh index of frame_parallel_level 1 frame in a
+ // parallel encode set of lower layer frames.
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(
+ first_cpi, ref_frame_map_pairs, gf_index_start);
+ assert(first_cpi->ref_refresh_index != INVALID_IDX &&
+ first_cpi->ref_refresh_index < REF_FRAMES);
+ first_cpi->refresh_idx_available = true;
+ // Update ref_frame_map_pairs.
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order =
+ gf_group->display_idx[gf_index_start];
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level =
+ gf_group->layer_depth[gf_index_start];
+ }
+
+ // Set do_frame_data_update flag as false for frame_parallel_level 1 frame.
+ first_cpi->do_frame_data_update = false;
+ if (gf_group->arf_src_offset[gf_index_start] == 0) {
+ first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame;
+ first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame;
+ }
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi,
+ gf_index_start, 1, first_cpi->common.remapped_ref_idx);
+
+ av1_scale_references_fpmt(first_cpi, ref_buffers_used_map);
+ parallel_frame_count++; // The level 1 frame itself.
+
+ // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2
+ // frames which are part of the current parallel encode set and initialize the
+ // required cpi elements.
+ for (int i = gf_index_start + 1; i < gf_group->size; i++) {
+ // Update frame counters if previous frame was show frame or show existing
+ // frame.
+ if (gf_group->arf_src_offset[i - 1] == 0) {
+ cur_frame_num++;
+ show_frame_count++;
+ if (frames_to_fwd_kf <= 0)
+ frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist;
+ if (frames_to_key) {
+ frames_since_key++;
+ frames_to_key--;
+ frames_to_fwd_kf--;
+ }
+ stats_in++;
+ }
+ cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i];
+ if (gf_group->frame_parallel_level[i] == 2) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count];
+ AV1_COMP_DATA *cur_cpi_data =
+ &ppi->parallel_frames_data[parallel_frame_count - 1];
+ cur_cpi->gf_frame_index = i;
+ cur_cpi->framerate = first_cpi->framerate;
+ cur_cpi->common.current_frame.frame_number = cur_frame_num;
+ cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i];
+ cur_cpi->frame_index_set.show_frame_count = show_frame_count;
+ cur_cpi->rc.frames_since_key = frames_since_key;
+ cur_cpi->rc.frames_to_key = frames_to_key;
+ cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
+ cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+ cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+ cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+ cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+ cur_cpi->rc.intervals_till_gf_calculate_due =
+ first_cpi->rc.intervals_till_gf_calculate_due;
+ cur_cpi->mv_search_params.max_mv_magnitude =
+ first_cpi->mv_search_params.max_mv_magnitude;
+ if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ cur_cpi->common.lf.mode_ref_delta_enabled = 1;
+ }
+ cur_cpi->do_frame_data_update = false;
+ // Initialize prev_ts_start and prev_ts_end for show frame(s) and show
+ // existing frame(s).
+ if (gf_group->arf_src_offset[i] == 0) {
+ // Choose source of prev frame.
+ int src_index = gf_group->src_offset[i];
+ struct lookahead_entry *prev_source = av1_lookahead_peek(
+ ppi->lookahead, src_index - 1, cur_cpi->compressor_stage);
+ // Save timestamps of prev frame. NOTE(review): prev_source is used without
+ // a NULL check; confirm av1_lookahead_peek() cannot return NULL here.
+ cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start;
+ cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end;
+ }
+ cur_cpi->time_stamps.first_ts_start =
+ first_cpi->time_stamps.first_ts_start;
+
+ memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map,
+ sizeof(first_cpi->common.ref_frame_map));
+ cur_cpi_data->lib_flags = 0;
+ cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio;
+ cur_cpi_data->flush = first_cpi_data->flush;
+ cur_cpi_data->frame_size = 0;
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ // If the first frame in a parallel encode set is INTNL_ARF_UPDATE
+ // frame, initialize lib_flags of frame_parallel_level 2 frame in the
+ // set with that of frame_parallel_level 1 frame.
+ cur_cpi_data->lib_flags = first_cpi_data->lib_flags;
+ // Store the reference refresh index of frame_parallel_level 2 frame in
+ // a parallel encode set of lower layer frames.
+ cur_cpi->ref_refresh_index =
+ av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i);
+ cur_cpi->refresh_idx_available = true;
+ // Skip the reference frame which will be refreshed by
+ // frame_parallel_level 1 frame in a parallel encode set of lower layer
+ // frames.
+ cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index;
+ } else {
+ cur_cpi->ref_idx_to_skip = INVALID_IDX;
+ cur_cpi->ref_refresh_index = INVALID_IDX;
+ cur_cpi->refresh_idx_available = false;
+ }
+ cur_cpi->twopass_frame.stats_in = stats_in;
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i,
+ 1, cur_cpi->common.remapped_ref_idx);
+ av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map);
+ parallel_frame_count++;
+ }
+
+ // Set do_frame_data_update to true for the last frame_parallel_level 2
+ // frame in the current parallel encode set. The set ends at the end of the
+ // GF group, before a level 0 ARF / internal ARF, or before the next level 1
+ // frame.
+ if (i == (gf_group->size - 1) ||
+ (gf_group->frame_parallel_level[i + 1] == 0 &&
+ (gf_group->update_type[i + 1] == ARF_UPDATE ||
+ gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) ||
+ gf_group->frame_parallel_level[i + 1] == 1) {
+ ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true;
+ break;
+ }
+ }
+
+ av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool,
+ *ref_buffers_used_map);
+
+ // Return the number of frames in the parallel encode set.
+ return parallel_frame_count;
+}
+
+// Copies the buffer descriptor of the current frame into '*dest' so it can be
+// previewed, with the luma/chroma dimensions overridden by the coded frame
+// size (chroma derived through the sequence subsampling shifts).
+//
+// Returns 0 on success and -1 when the frame is not shown, there is no
+// current frame buffer, or post-processing filtering is being skipped.
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (!cm->show_frame) return -1;
+  if (cm->cur_frame == NULL) return -1;
+  if (cpi->oxcf.algo_cfg.skip_postproc_filtering) return -1;
+
+  *dest = cm->cur_frame->buf;
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
+  dest->uv_width = cm->width >> cm->seq_params->subsampling_x;
+  dest->uv_height = cm->height >> cm->seq_params->subsampling_y;
+  return 0;
+}
+
+// Copies the buffer descriptor of the last shown frame into '*frame'.
+// Returns 0 on success, or -1 if no such frame is recorded or if
+// post-processing filtering is being skipped.
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+  if (cpi->last_show_frame_buf == NULL) return -1;
+  if (cpi->oxcf.algo_cfg.skip_postproc_filtering) return -1;
+
+  *frame = cpi->last_show_frame_buf->buf;
+  return 0;
+}
+
+// Copies 'new_frame' into 'sd' when both buffers have equal dimensions and
+// border; otherwise reports AOM_CODEC_ERROR through cm->error. In either case
+// the function returns the error code currently stored in cm->error.
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+
+  if (equal_dimensions_and_border(new_frame, sd)) {
+    aom_yv12_copy_frame(new_frame, sd, num_planes);
+  } else {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+
+  return cm->error->error_code;
+}
+
+// Computes the pending resize dimensions for the requested horizontal and
+// vertical scaling modes and stores them in '*resize_pending_params'.
+//
+// Each mode is turned into a ratio by Scale2Ratio(); the configured frame
+// width/height is multiplied by it and rounded up. If either mode differs
+// from AOME_NORMAL, the config is switched to RESIZE_FIXED and the TPL model
+// is disabled.
+//
+// Returns 0 on success, -1 if a mode is greater than AOME_ONETHREE.
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING_MODE horiz_mode,
+                          AOM_SCALING_MODE vert_mode) {
+  // Checks for invalid AOM_SCALING_MODE values.
+  if (horiz_mode > AOME_ONETHREE) return -1;
+  if (vert_mode > AOME_ONETHREE) return -1;
+
+  int horiz_num = 0, horiz_den = 0;
+  int vert_num = 0, vert_den = 0;
+  Scale2Ratio(horiz_mode, &horiz_num, &horiz_den);
+  Scale2Ratio(vert_mode, &vert_num, &vert_den);
+
+  // Ceiling division: always go to the next whole number.
+  resize_pending_params->width =
+      (horiz_den - 1 + oxcf->frm_dim_cfg.width * horiz_num) / horiz_den;
+  resize_pending_params->height =
+      (vert_den - 1 + oxcf->frm_dim_cfg.height * vert_num) / vert_den;
+
+  const int is_scaled = horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL;
+  if (is_scaled) {
+    oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
+    oxcf->algo_cfg.enable_tpl_model = 0;
+  }
+  return 0;
+}
+
+// Returns the base quantizer index (base_qindex) stored in the encoder's
+// common quantization parameters.
+int av1_get_quantizer(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  return cm->quant_params.base_qindex;
+}
+
+// Converts, in place, a sequence of OBUs that each carry their own size field
+// (obu_has_size_field set, payload size stored after the OBU header) into the
+// length-prefixed form where every OBU is preceded by a leb128 value covering
+// header + payload and the obu_has_size_field bit is cleared.
+//
+// buffer:     holds *frame_size bytes of input; rewritten in place.
+// frame_size: in: number of input bytes; out: number of output bytes. Left
+//             untouched when an error is returned (the buffer contents may
+//             already be partially rewritten in that case).
+//
+// Returns AOM_CODEC_OK on success, AOM_CODEC_ERROR if a size field cannot be
+// decoded/encoded or if an OBU claims more bytes than remain in the input.
+//
+// NOTE(review): an OBU can grow by a byte when header + payload needs a longer
+// leb128 than the payload alone. This function is not told the capacity of
+// 'buffer', so the caller must provide enough slack -- confirm at call sites.
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+  size_t output_size = 0;
+  size_t remaining_size = *frame_size;  // Unconverted input bytes left.
+  uint8_t *buff_ptr = buffer;           // Start of the OBU being converted.
+
+  // go through each OBUs
+  while (remaining_size > 0) {
+    uint8_t saved_obu_header[2];
+    uint64_t obu_payload_size;
+    size_t length_of_payload_size;
+    // Bit 2 of the first header byte selects a 2-byte header.
+    const uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+    // A 2-byte header must not run past the end of the input.
+    if (obu_header_size > remaining_size) return AOM_CODEC_ERROR;
+    size_t obu_bytes_read = obu_header_size;  // bytes read for current obu
+
+    // save the obu header (1 or 2 bytes)
+    memmove(saved_obu_header, buff_ptr, obu_header_size);
+    // clear the obu_has_size_field
+    saved_obu_header[0] = saved_obu_header[0] & (~0x2);
+
+    // get the payload_size and length of payload_size. Only the bytes that
+    // follow the header are available to the decoder.
+    if (aom_uleb_decode(buff_ptr + obu_header_size,
+                        remaining_size - obu_header_size, &obu_payload_size,
+                        &length_of_payload_size) != 0) {
+      return AOM_CODEC_ERROR;
+    }
+    obu_bytes_read += length_of_payload_size;
+
+    // Reject an OBU whose header, size field and payload do not fit in the
+    // remaining input; otherwise the size arithmetic below would wrap.
+    if (obu_bytes_read > remaining_size ||
+        obu_payload_size > remaining_size - obu_bytes_read) {
+      return AOM_CODEC_ERROR;
+    }
+
+    // calculate the length of size of the obu header plus payload
+    const uint64_t obu_size = obu_header_size + obu_payload_size;
+    const size_t length_of_obu_size = aom_uleb_size_in_bytes(obu_size);
+
+    // move the rest of data to new location, leaving room in front for the
+    // new size field followed by the header
+    memmove(buff_ptr + length_of_obu_size + obu_header_size,
+            buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+    obu_bytes_read += (size_t)obu_payload_size;
+
+    // write the new obu size into exactly the space reserved for it
+    size_t coded_obu_size;
+    if (aom_uleb_encode(obu_size, length_of_obu_size, buff_ptr,
+                        &coded_obu_size) != 0 ||
+        coded_obu_size != length_of_obu_size) {
+      return AOM_CODEC_ERROR;
+    }
+
+    // write the saved (modified) obu_header following obu size
+    memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+    remaining_size -= obu_bytes_read;
+    buff_ptr += length_of_obu_size + (size_t)obu_size;
+    output_size += length_of_obu_size + (size_t)obu_size;
+  }
+
+  *frame_size = output_size;
+  return AOM_CODEC_OK;
+}
+
+// Derives the external refresh flags from the RTC reference configuration:
+// each named reference is refreshed iff the buffer slot it maps to
+// (rtc_ref->ref_idx[]) is marked in rtc_ref->refresh[]. Also marks the frame
+// as a non-reference frame when no slot at all has refresh == 1.
+static void rtc_set_updates_ref_frame_config(
+    ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags,
+    RTC_REF *const rtc_ref) {
+  // ref_idx[] positions: LAST(0), GOLDEN(3), BWDREF(4), ALTREF2(5), ALTREF(6).
+  ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]];
+  ext_refresh_frame_flags->golden_frame =
+      rtc_ref->refresh[rtc_ref->ref_idx[3]];
+  ext_refresh_frame_flags->bwd_ref_frame =
+      rtc_ref->refresh[rtc_ref->ref_idx[4]];
+  ext_refresh_frame_flags->alt2_ref_frame =
+      rtc_ref->refresh[rtc_ref->ref_idx[5]];
+  ext_refresh_frame_flags->alt_ref_frame =
+      rtc_ref->refresh[rtc_ref->ref_idx[6]];
+  ext_refresh_frame_flags->update_pending = 1;
+
+  int any_slot_refreshed = 0;
+  for (int slot = 0; slot < REF_FRAMES && !any_slot_refreshed; ++slot) {
+    any_slot_refreshed = (rtc_ref->refresh[slot] == 1);
+  }
+  rtc_ref->non_reference_frame = !any_slot_refreshed;
+}
+
+// Builds the reference-frame mask for the RTC reference configuration: starts
+// from AOM_REFFRAME_ALL and toggles bit i for every reference i that
+// cpi->ppi->rtc_ref.reference[] leaves disabled.
+static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
+  // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+  // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+  const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+  int ref_mask = AOM_REFFRAME_ALL;
+
+  for (int idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+    if (rtc_ref->reference[idx]) continue;
+    ref_mask ^= (1 << idx);
+  }
+  return ref_mask;
+}
+
+// Translates the per-frame encode flags (AOM_EFLAG_*) into cpi->ext_flags:
+//  - which reference frames may be used,
+//  - which reference buffers get refreshed,
+//  - the ref-frame-MV, error-resilient, S-frame and primary_ref_none switches,
+//  - and, for AOM_EFLAG_NO_UPD_ENTROPY, the entropy-context refresh.
+// When no NO_REF_* (respectively NO_UPD_*) flag is given, the RTC reference
+// configuration in cpi->ppi->rtc_ref is applied instead if it is enabled.
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+  // TODO(yunqingwang): For what references to use, external encoding flags
+  // should be consistent with internal reference frame selection. Need to
+  // ensure that there is not conflict between the two. In AV1 encoder, the
+  // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3,
+  // GOLDEN, BWDREF, ALTREF2.
+
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  ExtRefreshFrameFlagsInfo *const refresh = &ext_flags->refresh_frame;
+
+  // ---- Reference frames that may be used ----
+  const int any_no_ref_flag =
+      (flags & (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 |
+                AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF |
+                AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+                AOM_EFLAG_NO_REF_ARF2)) != 0;
+
+  ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
+  if (any_no_ref_flag) {
+    const int drop_all_arf = (flags & AOM_EFLAG_NO_REF_ARF) != 0;
+    int ref = AOM_REFFRAME_ALL;
+
+    if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+    // NO_REF_ARF removes ALTREF together with BWDREF and ALTREF2; without it
+    // BWDREF and ALTREF2 are removed only by their own flags.
+    if (drop_all_arf) ref ^= AOM_ALT_FLAG;
+    if (drop_all_arf || (flags & AOM_EFLAG_NO_REF_BWD)) ref ^= AOM_BWD_FLAG;
+    if (drop_all_arf || (flags & AOM_EFLAG_NO_REF_ARF2)) ref ^= AOM_ALT2_FLAG;
+
+    av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+  } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+    const int ref = rtc_set_references_external_ref_frame_config(cpi);
+    av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+  }
+
+  // ---- Reference buffers that get refreshed ----
+  const int any_no_upd_flag =
+      (flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                AOM_EFLAG_NO_UPD_ARF)) != 0;
+
+  if (any_no_upd_flag) {
+    int upd = AOM_REFFRAME_ALL;
+
+    // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+    if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG;
+    if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
+    // NO_UPD_ARF covers ALTREF, BWDREF and ALTREF2 at once.
+    if (flags & AOM_EFLAG_NO_UPD_ARF) {
+      upd ^= AOM_ALT_FLAG ^ AOM_BWD_FLAG ^ AOM_ALT2_FLAG;
+    }
+
+    refresh->update_pending = 1;
+    refresh->last_frame = (upd & AOM_LAST_FLAG) != 0;
+    refresh->golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+    refresh->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+    refresh->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+    refresh->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+  } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+    rtc_set_updates_ref_frame_config(refresh, &cpi->ppi->rtc_ref);
+  } else {
+    refresh->update_pending = 0;
+  }
+
+  // ---- Per-frame tool switches ----
+  // A flag can only turn ref-frame MVs off; the other two can only be turned
+  // on in addition to what the encoder configuration requests.
+  const int allow_ref_frame_mvs = (flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0;
+  const int force_error_resilient = (flags & AOM_EFLAG_ERROR_RESILIENT) != 0;
+  const int force_s_frame = (flags & AOM_EFLAG_SET_S_FRAME) != 0;
+
+  ext_flags->use_ref_frame_mvs =
+      cpi->oxcf.tool_cfg.enable_ref_frame_mvs & allow_ref_frame_mvs;
+  ext_flags->use_error_resilient =
+      cpi->oxcf.tool_cfg.error_resilient_mode | force_error_resilient;
+  ext_flags->use_s_frame = cpi->oxcf.kf_cfg.enable_sframe | force_s_frame;
+  ext_flags->use_primary_ref_none =
+      (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
+  if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+    update_entropy(&ext_flags->refresh_frame_context,
+                   &ext_flags->refresh_frame_context_pending, 0);
+  }
+}
+
+// Builds a standalone sequence header OBU (1-byte OBU header, leb128 size
+// field, sequence header payload) from ppi->seq_params and returns it in a
+// newly malloc()ed aom_fixed_buf_t whose 'buf' is malloc()ed as well.
+//
+// Returns NULL if 'ppi' is NULL, if any part of the OBU cannot be written, if
+// it does not fit the 512-byte scratch buffer, or if an allocation fails.
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) {
+  if (ppi == NULL) return NULL;
+
+  // The payload is first written at the start of the scratch buffer...
+  uint8_t scratch[512] = { 0 };
+  const uint32_t seq_hdr_bytes =
+      av1_write_sequence_header_obu(&ppi->seq_params, scratch);
+  assert(seq_hdr_bytes <= sizeof(scratch));
+  if (seq_hdr_bytes == 0) return NULL;
+
+  const size_t obu_header_size = 1;
+  const size_t size_field_bytes = aom_uleb_size_in_bytes(seq_hdr_bytes);
+  const size_t payload_offset = obu_header_size + size_field_bytes;
+  const size_t total_bytes = payload_offset + seq_hdr_bytes;
+
+  // ...then shifted back to make room for the OBU header and size field.
+  if (total_bytes > sizeof(scratch)) return NULL;
+  memmove(scratch + payload_offset, scratch, seq_hdr_bytes);
+
+  if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count,
+                           OBU_SEQUENCE_HEADER, 0,
+                           scratch) != obu_header_size) {
+    return NULL;
+  }
+
+  size_t coded_size_field_bytes = 0;
+  if (aom_uleb_encode(seq_hdr_bytes, size_field_bytes,
+                      scratch + obu_header_size,
+                      &coded_size_field_bytes) != 0) {
+    return NULL;
+  }
+  assert(coded_size_field_bytes == size_field_bytes);
+
+  // Copy the finished OBU into heap storage owned by the returned object.
+  aom_fixed_buf_t *const result = (aom_fixed_buf_t *)malloc(sizeof(*result));
+  if (result == NULL) return NULL;
+
+  result->buf = malloc(total_bytes);
+  if (result->buf == NULL) {
+    free(result);
+    return NULL;
+  }
+
+  memcpy(result->buf, scratch, total_bytes);
+  result->sz = total_bytes;
+  return result;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 0000000000..5f6f67eda8
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,4512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+#include "av1/encoder/tune_butteraugli.h"
+#endif
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
+// Number of frames required to test for scene cut detection.
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Lookahead index threshold to enable temporal filtering for the second ARF.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10 // NOTE(review): this and the constants below are undocumented.
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46 // Negative, unparenthesised literal.
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
+// Rational number with an int64 numerator and an int denominator.
+// This structure holds a fractional value, num / den.
+typedef struct aom_rational64 {
+ int64_t num; // fraction numerator
+ int den; // fraction denominator
+} aom_rational64_t; // alias for struct aom_rational64
+
+enum {
+ // Good-quality encoding. The encoder balances quality against the amount of
+ // time it takes to encode the output; the speed setting controls how fast.
+ GOOD,
+ // Realtime (fast) encoding. Will force some restrictions on bitrate
+ // constraints.
+ REALTIME,
+ // All-intra mode. All the frames are coded as intra frames.
+ ALLINTRA
+} UENUM1BYTE(MODE); // NOTE(review): UENUM1BYTE is defined outside this chunk.
+
+enum { // Each flag occupies its own bit, so several can be OR-ed together.
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+ FRAMEFLAGS_ALTREF = 1 << 3,
+ FRAMEFLAGS_INTRAONLY = 1 << 4,
+ FRAMEFLAGS_SWITCH = 1 << 5,
+ FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, // Highest bit in use (bit 6).
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+#if CONFIG_FPMT_TEST
+enum { // Only declared when CONFIG_FPMT_TEST is enabled.
+ PARALLEL_ENCODE = 0,
+ PARALLEL_SIMULATION_ENCODE, // == 1
+ NUM_FPMT_TEST_ENCODES // Number of configurations above (== 2).
+} UENUM1BYTE(FPMT_TEST_ENC_CFG);
+#endif // CONFIG_FPMT_TEST
+// Level 0 frames are sometimes used for rate control purposes, but for
+// reference mapping purposes the lowest level handed out is 1.
+#define MIN_PYR_LEVEL 1
+
+// Maps a frame's nominal pyramid level to the level used for reference
+// mapping:
+//   - keyframe (frame_order == 0) or altref (frame_level ==
+//     MAX_ARF_LAYERS + 1): MIN_PYR_LEVEL
+//   - leaf (frame_level == MAX_ARF_LAYERS): max_layer_depth
+//   - anything else: frame_level, but never below MIN_PYR_LEVEL
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+                                     int max_layer_depth) {
+  const int is_keyframe = (frame_order == 0);
+  const int is_altref = (frame_level == (MAX_ARF_LAYERS + 1));
+
+  if (is_keyframe || is_altref) return MIN_PYR_LEVEL;
+  if (frame_level == MAX_ARF_LAYERS) return max_layer_depth;
+  return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
+
+enum { // Values are explicit and consecutive, starting at 0.
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ AQ_MODE_COUNT // Number of modes (== 4). Must stay the last member of the enum.
+} UENUM1BYTE(AQ_MODE);
+enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_OBJECTIVE = 1, // Modulation to improve objective quality
+ DELTA_Q_PERCEPTUAL = 2, // Modulation to improve video perceptual quality
+ DELTA_Q_PERCEPTUAL_AI = 3, // Perceptual quality optimization for all intra mode
+ DELTA_Q_USER_RATING_BASED = 4, // User rating based delta q mode
+ DELTA_Q_HDR = 5, // QP adjustment based on HDR block pixel average
+ DELTA_Q_MODE_COUNT // Number of modes (== 6). Must stay the last member of the enum.
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified scale.
+ RESIZE_RANDOM = 2, // All frames are coded at a random scale.
+ RESIZE_DYNAMIC = 3, // Frames coded at lower scale based on rate control.
+ RESIZE_MODES // Number of resize modes above (== 4).
+} UENUM1BYTE(RESIZE_MODE);
+
+enum { // NOTE(review): meaning of the individual entries is not visible here.
+ SS_CFG_SRC = 0,
+ SS_CFG_LOOKAHEAD = 1,
+ SS_CFG_FPF = 2,
+ SS_CFG_TOTAL = 3 // Number of entries above.
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+enum {
+ DISABLE_SCENECUT, // For LAP, lag_in_frames < 19
+ ENABLE_SCENECUT_MODE_1, // For LAP, lag_in_frames >= 19 and < 33
+ ENABLE_SCENECUT_MODE_2 // For twopass and LAP, lag_in_frames >= 33
+} UENUM1BYTE(SCENECUT_MODE);
+
+#define MAX_VBR_CORPUS_COMPLEXITY 10000 // NOTE(review): undocumented; confirm units at use sites.
+
+typedef enum {
+ MOD_FP, // First pass
+ MOD_TF, // Temporal filtering
+ MOD_TPL, // TPL
+ MOD_GME, // Global motion estimation
+ MOD_ENC, // Encode stage
+ MOD_LPF, // Deblocking loop filter
+ MOD_CDEF_SEARCH, // CDEF search
+ MOD_CDEF, // CDEF frame
+ MOD_LR, // Loop restoration filtering
+ MOD_PACK_BS, // Pack bitstream
+ MOD_FRAME_ENC, // Frame Parallel encode
+ MOD_AI, // All intra
+ NUM_MT_MODULES // Number of modules listed above; keep as the last member.
+} MULTI_THREADED_MODULES;
+
+/*!\endcond */
+
+/*!\enum COST_UPDATE_TYPE
+ * \brief This enum controls how often the entropy costs should be updated.
+ *
+ * The first three values are ordered from the most frequent update (every
+ * superblock) to the least frequent one (every tile).
+ * \warning In case of any modifications/additions done to the enum
+ * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as
+ * well.
+ */
+typedef enum {
+ COST_UPD_SB, /*!< Update every superblock (sb). */
+ COST_UPD_SBROW, /*!< Update every superblock row inside a tile. */
+ COST_UPD_TILE, /*!< Update every tile. */
+ COST_UPD_OFF, /*!< Turn off cost updates. */
+ NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. */
+} COST_UPDATE_TYPE;
+
+/*!\enum LOOPFILTER_CONTROL
+ * \brief This enum controls to which frames the loopfilter is applied.
+ */
+typedef enum {
+ LOOPFILTER_NONE = 0, /*!< Disable loopfilter on all frames. */
+ LOOPFILTER_ALL = 1, /*!< Enable loopfilter for all frames. */
+ LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non-reference frames. */
+ LOOPFILTER_SELECTIVELY =
+ 3, /*!< Disable loopfilter on frames with low motion. */
+} LOOPFILTER_CONTROL;
+
+/*!\enum SKIP_APPLY_POSTPROC_FILTER
+ * \brief This enum controls the application of post-processing filters on a
+ * reconstructed frame.
+ *
+ * Every enumerator is a distinct bit, so several of them can be combined in
+ * one mask.
+ */
+typedef enum {
+ SKIP_APPLY_RESTORATION = 1 << 0, /*!< Bit 0: skip applying restoration. */
+ SKIP_APPLY_SUPERRES = 1 << 1, /*!< Bit 1: skip applying superres. */
+ SKIP_APPLY_CDEF = 1 << 2, /*!< Bit 2: skip applying CDEF. */
+ SKIP_APPLY_LOOPFILTER = 1 << 3, /*!< Bit 3: skip applying the loopfilter. */
+} SKIP_APPLY_POSTPROC_FILTER;
+
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+ /*!
+ * Indicates the frame resize mode to be used by the encoder.
+ */
+ RESIZE_MODE resize_mode;
+ /*!
+ * Indicates the denominator for resize of inter frames, assuming 8 as the
+ * numerator. Its value ranges from 8 to 16.
+ */
+ uint8_t resize_scale_denominator;
+ /*!
+ * Indicates the denominator for resize of key frames, assuming 8 as the
+ * numerator. Its value ranges from 8 to 16.
+ */
+ uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
+
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if rectangular partitions should be enabled.
+ */
+ bool enable_rect_partitions;
+ /*!
+ * Flag to indicate if AB partitions should be enabled.
+ */
+ bool enable_ab_partitions;
+ /*!
+ * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+ */
+ bool enable_1to4_partitions;
+ /*!
+ * Indicates the minimum partition size that should be allowed. Both width and
+ * height of a partition cannot be smaller than the min_partition_size.
+ */
+ BLOCK_SIZE min_partition_size;
+ /*!
+ * Indicates the maximum partition size that should be allowed. Both width and
+ * height of a partition cannot be larger than the max_partition_size.
+ */
+ BLOCK_SIZE max_partition_size;
+} PartitionCfg;
+
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if intra edge filtering process should be enabled.
+ */
+ bool enable_intra_edge_filter;
+ /*!
+ * Flag to indicate if recursive filtering based intra prediction should be
+ * enabled.
+ */
+ bool enable_filter_intra;
+ /*!
+ * Flag to indicate if smooth intra prediction modes should be enabled.
+ */
+ bool enable_smooth_intra;
+ /*!
+ * Flag to indicate if PAETH intra prediction mode should be enabled.
+ */
+ bool enable_paeth_intra;
+ /*!
+ * Flag to indicate if CFL uv intra mode should be enabled.
+ */
+ bool enable_cfl_intra;
+ /*!
+ * Flag to indicate if directional modes should be enabled.
+ */
+ bool enable_directional_intra;
+ /*!
+ * Flag to indicate if the subset of directional modes from D45 to D203 intra
+ * should be enabled. Has no effect if directional modes are disabled.
+ */
+ bool enable_diagonal_intra;
+ /*!
+ * Flag to indicate if delta angles for directional intra prediction should be
+ * enabled.
+ */
+ bool enable_angle_delta;
+ /*!
+ * Flag to indicate whether to automatically turn off several intra coding
+ * tools.
+ * This flag is only used when "--deltaq-mode=3" is set.
+ * When set to 1, the encoder will analyze the reconstruction quality
+ * as compared to the source image in the preprocessing pass.
+ * If the reconstruction quality is considered high enough, we disable
+ * the following intra coding tools, for better encoding speed:
+ * "--enable_smooth_intra",
+ * "--enable_paeth_intra",
+ * "--enable_cfl_intra",
+ * "--enable_diagonal_intra".
+ */
+ bool auto_intra_tools_off;
+} IntraModeCfg;
+
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if 64-point transforms should be enabled.
+ */
+ bool enable_tx64;
+ /*!
+ * Flag to indicate if flip and identity transform types should be enabled.
+ */
+ bool enable_flip_idtx;
+ /*!
+ * Flag to indicate if rectangular transforms should be enabled.
+ */
+ bool enable_rect_tx;
+ /*!
+ * Flag to indicate whether or not to use a default reduced set for ext-tx
+ * rather than the potential full set of 16 transforms.
+ */
+ bool reduced_tx_type_set;
+ /*!
+ * Flag to indicate if transform type for intra blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_intra_dct_only;
+ /*!
+ * Flag to indicate if transform type for inter blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_inter_dct_only;
+ /*!
+ * Flag to indicate if intra blocks should use default transform type
+ * (mode-dependent) only.
+ */
+ bool use_intra_default_tx_only;
+ /*!
+ * Flag to indicate if transform size search should be enabled.
+ */
+ bool enable_tx_size_search;
+} TxfmSizeTypeCfg;
+
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if distance-weighted compound type should be enabled.
+ */
+ bool enable_dist_wtd_comp;
+ /*!
+ * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+ * enabled.
+ * NOTE(review): how this flag interacts with enable_diff_wtd_comp and
+ * enable_interinter_wedge below is not visible in this header; confirm
+ * against the code that reads them.
+ */
+ bool enable_masked_comp;
+ /*!
+ * Flag to indicate if smooth interintra mode should be enabled.
+ */
+ bool enable_smooth_interintra;
+ /*!
+ * Flag to indicate if difference-weighted compound type should be enabled.
+ */
+ bool enable_diff_wtd_comp;
+ /*!
+ * Flag to indicate if inter-inter wedge compound type should be enabled.
+ */
+ bool enable_interinter_wedge;
+ /*!
+ * Flag to indicate if inter-intra wedge compound type should be enabled.
+ */
+ bool enable_interintra_wedge;
+} CompoundTypeCfg;
+
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+ /*!
+ * Indicates the qindex-based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for inter frames.
+ */
+ int superres_qthresh;
+ /*!
+ * Indicates the qindex-based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for key frames.
+ */
+ int superres_kf_qthresh;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for inter frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_scale_denominator;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for key frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_kf_scale_denominator;
+ /*!
+ * Indicates the super-resolution mode to be used by the encoder.
+ */
+ aom_superres_mode superres_mode;
+ /*!
+ * Flag to indicate if super-resolution should be enabled for the sequence.
+ */
+ bool enable_superres;
+} SuperResCfg;
+
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+ /*!
+ * Indicates the minimum distance to a key frame.
+ */
+ int key_freq_min;
+
+ /*!
+ * Indicates the maximum distance to a key frame.
+ */
+ int key_freq_max;
+
+ /*!
+ * Indicates if temporal filtering should be applied on key frames.
+ */
+ int enable_keyframe_filtering;
+
+ /*!
+ * Indicates the number of frames after which a frame may be coded as an
+ * S-Frame.
+ */
+ int sframe_dist;
+
+ /*!
+ * Indicates how an S-Frame should be inserted.
+ * 1: the considered frame will be made into an S-Frame only if it is an
+ *    altref frame.
+ * 2: the next altref frame will be made into an S-Frame.
+ */
+ int sframe_mode;
+
+ /*!
+ * Indicates if encoder should autodetect cut scenes and set the keyframes.
+ */
+ bool auto_key;
+
+ /*!
+ * Indicates the forward key frame distance.
+ */
+ int fwd_kf_dist;
+
+ /*!
+ * Indicates if forward keyframe reference should be enabled.
+ */
+ bool fwd_kf_enabled;
+
+ /*!
+ * Indicates if S-Frames should be enabled for the sequence.
+ */
+ bool enable_sframe;
+
+ /*!
+ * Indicates if intra block copy prediction mode should be enabled or not.
+ */
+ bool enable_intrabc;
+} KeyFrameCfg;
+
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+ /*!\cond */
+ // BUFFERING PARAMETERS
+ /*!\endcond */
+ /*!
+ * Indicates the amount of data that will be buffered by the decoding
+ * application prior to beginning playback, and is expressed in units of
+ * time (milliseconds).
+ */
+ int64_t starting_buffer_level_ms;
+ /*!
+ * Indicates the amount of data that the encoder should try to maintain in the
+ * decoder's buffer, and is expressed in units of time (milliseconds).
+ */
+ int64_t optimal_buffer_level_ms;
+ /*!
+ * Indicates the maximum amount of data that may be buffered by the decoding
+ * application, and is expressed in units of time (milliseconds).
+ */
+ int64_t maximum_buffer_size_ms;
+
+ /*!
+ * Indicates the bandwidth to be used in bits per second.
+ */
+ int64_t target_bandwidth;
+
+ /*!
+ * Indicates average complexity of the corpus in single pass vbr based on
+ * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+ */
+ unsigned int vbr_corpus_complexity_lap;
+ /*!
+ * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+ * target.
+ */
+ unsigned int max_intra_bitrate_pct;
+ /*!
+ * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+ * target.
+ */
+ unsigned int max_inter_bitrate_pct;
+ /*!
+ * Indicates the percentage of rate boost for golden frame in CBR mode.
+ */
+ unsigned int gf_cbr_boost_pct;
+ /*!
+ * min_cr / 100 indicates the target minimum compression ratio for each
+ * frame.
+ */
+ unsigned int min_cr;
+ /*!
+ * Indicates the frame drop threshold.
+ */
+ int drop_frames_water_mark;
+ /*!
+ * under_shoot_pct indicates the tolerance of the VBR algorithm to
+ * undershoot and is used as a trigger threshold for more aggressive
+ * adaptation of Q. Its value can range from 0-100.
+ */
+ int under_shoot_pct;
+ /*!
+ * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
+ * Its value can range from 0-1000.
+ */
+ int over_shoot_pct;
+ /*!
+ * Indicates the maximum qindex that can be used by the quantizer i.e. the
+ * worst quality qindex.
+ */
+ int worst_allowed_q;
+ /*!
+ * Indicates the minimum qindex that can be used by the quantizer i.e. the
+ * best quality qindex.
+ */
+ int best_allowed_q;
+ /*!
+ * Indicates the Constant/Constrained Quality level.
+ */
+ int cq_level;
+ /*!
+ * Indicates if the encoding mode is vbr, cbr, constrained quality or
+ * constant quality.
+ */
+ enum aom_rc_mode mode;
+ /*!
+ * Indicates the bias (expressed on a scale of 0 to 100) for determining
+ * target size for the current frame. The value 0 indicates the optimal CBR
+ * mode value should be used, and 100 indicates the optimal VBR mode value
+ * should be used.
+ */
+ int vbrbias;
+ /*!
+ * Indicates the minimum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmin_section;
+ /*!
+ * Indicates the maximum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmax_section;
+} RateControlCfg;
+
+/*!\cond */
+// Configuration related to Group of frames (GF/ARF intervals, pyramid height
+// and automatic ARF/BRF selection).
+typedef struct {
+ // Indicates the number of frames lag before encoding is started.
+ int lag_in_frames;
+ // Indicates the minimum gf/arf interval to be used.
+ int min_gf_interval;
+ // Indicates the maximum gf/arf interval to be used.
+ int max_gf_interval;
+ // Indicates the minimum height for GF group pyramid structure to be used.
+ int gf_min_pyr_height;
+ // Indicates the maximum height for GF group pyramid structure to be used.
+ int gf_max_pyr_height;
+ // Indicates if automatic set and use of altref frames should be enabled.
+ bool enable_auto_arf;
+ // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+ // enabled.
+ bool enable_auto_brf;
+} GFConfig;
+
+// Tile related configuration parameters.
+typedef struct {
+ // Indicates the number of tile groups.
+ unsigned int num_tile_groups;
+ // Indicates the MTU size for a tile group. If mtu is non-zero,
+ // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+ unsigned int mtu;
+ // Indicates the number of tile columns in log2.
+ int tile_columns;
+ // Indicates the number of tile rows in log2.
+ int tile_rows;
+ // Indicates the number of widths in the tile_widths[] array.
+ int tile_width_count;
+ // Indicates the number of heights in the tile_heights[] array.
+ int tile_height_count;
+ // Indicates the tile widths, and may be empty.
+ int tile_widths[MAX_TILE_COLS];
+ // Indicates the tile heights, and may be empty.
+ int tile_heights[MAX_TILE_ROWS];
+ // Indicates if large scale tile coding should be used.
+ bool enable_large_scale_tile;
+ // Indicates if single tile decoding mode should be enabled.
+ bool enable_single_tile_decoding;
+ // Indicates if EXT_TILE_DEBUG should be enabled.
+ bool enable_ext_tile_debug;
+} TileConfig;
+
+// Configuration related to frame dimensions.
+typedef struct {
+ // Indicates the width of the input frame.
+ int width;
+ // Indicates the height of the input frame.
+ int height;
+ // If forced_max_frame_width is non-zero then it is used to force the maximum
+ // frame width written in write_sequence_header().
+ int forced_max_frame_width;
+ // If forced_max_frame_height is non-zero then it is used to force the maximum
+ // frame height written in write_sequence_header().
+ int forced_max_frame_height;
+ // Indicates the frame width after applying both super-resolution and resize
+ // to the coded frame.
+ int render_width;
+ // Indicates the frame height after applying both super-resolution and resize
+ // to the coded frame.
+ int render_height;
+} FrameDimensionCfg;
+
+// Flags related to motion mode.
+typedef struct {
+ // Indicates if warped motion should be enabled.
+ bool enable_warped_motion;
+ // Indicates if warped motion should be evaluated or not.
+ bool allow_warped_motion;
+ // Indicates if OBMC motion should be enabled.
+ bool enable_obmc;
+} MotionModeCfg;
+
+// Configuration related to decoder model.
+typedef struct {
+ // Timing info for each frame.
+ aom_timing_info_t timing_info;
+ // Indicates the number of time units of a decoding clock.
+ uint32_t num_units_in_decoding_tick;
+ // Indicates if decoder model information is present in the coded sequence
+ // header.
+ bool decoder_model_info_present_flag;
+ // Indicates if display model information is present in the coded sequence
+ // header.
+ bool display_model_info_present_flag;
+ // Indicates if timing info for each frame is present.
+ bool timing_info_present;
+} DecoderModelCfg;
+
+// Configuration related to frequency of cost update; one COST_UPDATE_TYPE per
+// cost category.
+typedef struct {
+ // Indicates the update frequency for coeff costs.
+ COST_UPDATE_TYPE coeff;
+ // Indicates the update frequency for mode costs.
+ COST_UPDATE_TYPE mode;
+ // Indicates the update frequency for mv costs.
+ COST_UPDATE_TYPE mv;
+ // Indicates the update frequency for dv costs.
+ COST_UPDATE_TYPE dv;
+} CostUpdateFreq;
+
+// Configuration related to reference frames.
+typedef struct {
+ // Indicates the maximum number of reference frames allowed per frame.
+ unsigned int max_reference_frames;
+ // Indicates if the reduced set of references should be enabled.
+ bool enable_reduced_reference_set;
+ // Indicates if one-sided compound should be enabled.
+ bool enable_onesided_comp;
+} RefFrameCfg;
+
+// Configuration related to color.
+typedef struct {
+ // Indicates the color primaries that should be used.
+ aom_color_primaries_t color_primaries;
+ // Indicates the characteristics of transfer function to be used.
+ aom_transfer_characteristics_t transfer_characteristics;
+ // Indicates the matrix coefficients to be used.
+ // NOTE(review): an earlier wording tied these to "the transfer function";
+ // confirm the intended description against the aom_matrix_coefficients_t
+ // documentation.
+ aom_matrix_coefficients_t matrix_coefficients;
+ // Indicates the chroma 4:2:0 sample position info.
+ aom_chroma_sample_position_t chroma_sample_position;
+ // Indicates if a limited color range or full color range should be used.
+ aom_color_range_t color_range;
+} ColorCfg;
+
+// Configuration related to unit tests.
+typedef struct {
+ // Indicates if extreme motion vector unit test should be enabled or not.
+ unsigned int motion_vector_unit_test;
+ // Indicates if superblock multipass unit test should be enabled or not.
+ unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+// Configuration related to Tune.
+typedef struct {
+ // Indicates the file path to the VMAF model.
+ const char *vmaf_model_path;
+ // Indicates the path to the film grain parameters.
+ const char *film_grain_table_filename;
+ // Indicates the visual tuning metric.
+ aom_tune_metric tuning;
+ // Indicates if the current content is screen or default type.
+ aom_tune_content content;
+ // Indicates the film grain test vector.
+ // NOTE(review): presumably selects one of a set of built-in film grain
+ // parameter sets -- confirm against the code that reads this field.
+ int film_grain_test_vector;
+ // Indicates the in-block distortion metric to use.
+ aom_dist_metric dist_metric;
+} TuneCfg;
+
+// Configuration related to the input video.
+typedef struct {
+ // Indicates the framerate of the input video.
+ double init_framerate;
+ // Indicates the bit-depth of the input video.
+ unsigned int input_bit_depth;
+ // Indicates the maximum number of frames to be encoded.
+ unsigned int limit;
+ // Indicates the chroma subsampling x value.
+ unsigned int chroma_subsampling_x;
+ // Indicates the chroma subsampling y value.
+ unsigned int chroma_subsampling_y;
+} InputCfg;
+
+// Configuration related to Quantization.
+typedef struct {
+ // If true, encoder will use fixed QP offsets, that are either:
+ // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+ // - Picked automatically from cq_level.
+ int use_fixed_qp_offsets;
+ // Indicates the minimum flatness of the quantization matrix.
+ int qm_minlevel;
+ // Indicates the maximum flatness of the quantization matrix.
+ int qm_maxlevel;
+ // Indicates if adaptive quantize_b should be enabled.
+ int quant_b_adapt;
+ // Indicates the Adaptive Quantization mode to be used.
+ AQ_MODE aq_mode;
+ // Indicates the delta q mode to be used.
+ DELTAQ_MODE deltaq_mode;
+ // Indicates the delta q mode strength.
+ // NOTE(review): declared with the DELTAQ_MODE type although it holds a
+ // strength rather than a mode; confirm that type is wide enough for the
+ // full range of accepted strength values.
+ DELTAQ_MODE deltaq_strength;
+ // Indicates if delta quantization should be enabled in chroma planes.
+ bool enable_chroma_deltaq;
+ // Indicates if delta quantization should be enabled for hdr video.
+ bool enable_hdr_deltaq;
+ // Indicates if encoding with quantization matrices should be enabled.
+ bool using_qm;
+} QuantizationCfg;
+
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+ /*!
+ * Controls the level at which rate-distortion optimization of transform
+ * coefficients favours sharpness in the block. Has no impact on RD when set
+ * to zero (default). For values 1-7, eob and skip block optimization are
+ * avoided and rdmult is adjusted in favour of block sharpness.
+ */
+ int sharpness;
+
+ /*!
+ * Indicates the trellis optimization mode of quantized coefficients.
+ * 0: disabled
+ * 1: enabled
+ * 2: enabled for rd search
+ * 3: true for estimate yrd search
+ * NOTE(review): the field name has a "disable_" prefix while the values
+ * above are described as enable modes; confirm the intended polarity against
+ * the code that reads this field.
+ */
+ int disable_trellis_quant;
+
+ /*!
+ * The maximum number of frames used to create an arf.
+ */
+ int arnr_max_frames;
+
+ /*!
+ * The temporal filter strength for arf used when creating ARFs.
+ */
+ int arnr_strength;
+
+ /*!
+ * Indicates the CDF update mode
+ * 0: no update
+ * 1: update on every frame (default)
+ * 2: selectively update
+ */
+ uint8_t cdf_update_mode;
+
+ /*!
+ * Indicates if RDO based on frame temporal dependency should be enabled.
+ */
+ bool enable_tpl_model;
+
+ /*!
+ * Indicates if coding of overlay frames for filtered ALTREF frames is
+ * enabled.
+ */
+ bool enable_overlay;
+
+ /*!
+ * Controls loop filtering
+ * 0: Loop filter is disabled for all frames
+ * 1: Loop filter is enabled for all frames
+ * 2: Loop filter is disabled for non-reference frames
+ * 3: Loop filter is disabled for the frames with low motion
+ */
+ LOOPFILTER_CONTROL loopfilter_control;
+
+ /*!
+ * Indicates if the application of post-processing filters should be skipped
+ * on reconstructed frame.
+ */
+ bool skip_postproc_filtering;
+} AlgoCfg;
+/*!\cond */
+
+// Configuration related to encoder toolsets.
+typedef struct {
+ // Indicates the codec bit-depth.
+ aom_bit_depth_t bit_depth;
+ // Indicates the superblock size that should be used by the encoder.
+ aom_superblock_size_t superblock_size;
+ // Indicates if loopfilter modulation should be enabled.
+ bool enable_deltalf_mode;
+ // Indicates how CDEF should be applied.
+ CDEF_CONTROL cdef_control;
+ // Indicates if loop restoration filter should be enabled.
+ bool enable_restoration;
+ // When enabled, video mode should be used even for single frame input.
+ bool force_video_mode;
+ // Indicates if the error resiliency features should be enabled.
+ bool error_resilient_mode;
+ // Indicates if frame parallel decoding feature should be enabled.
+ bool frame_parallel_decoding_mode;
+ // Indicates if the input should be encoded as monochrome.
+ bool enable_monochrome;
+ // When enabled, the encoder will use a full header even for still pictures.
+ // When disabled, a reduced header is used for still pictures.
+ bool full_still_picture_hdr;
+ // Indicates if dual interpolation filters should be enabled.
+ bool enable_dual_filter;
+ // Indicates if frame order hint should be enabled or not.
+ bool enable_order_hint;
+ // Indicates if ref_frame_mvs should be enabled at the sequence level.
+ bool ref_frame_mvs_present;
+ // Indicates if ref_frame_mvs should be enabled at the frame level.
+ bool enable_ref_frame_mvs;
+ // Indicates if interintra compound mode is enabled.
+ bool enable_interintra_comp;
+ // Indicates if global motion should be enabled.
+ bool enable_global_motion;
+ // Indicates if palette should be enabled.
+ bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+ /*!\cond */
+ // Configuration related to the input video.
+ InputCfg input_cfg;
+
+ // Configuration related to frame-dimensions.
+ FrameDimensionCfg frm_dim_cfg;
+
+ /*!\endcond */
+ /*!
+ * Encoder algorithm configuration.
+ */
+ AlgoCfg algo_cfg;
+
+ /*!
+ * Configuration related to key-frames.
+ */
+ KeyFrameCfg kf_cfg;
+
+ /*!
+ * Rate control configuration
+ */
+ RateControlCfg rc_cfg;
+ /*!\cond */
+
+ // Configuration related to Quantization.
+ QuantizationCfg q_cfg;
+
+ // Internal frame size scaling.
+ ResizeCfg resize_cfg;
+
+ // Frame Super-Resolution size scaling.
+ SuperResCfg superres_cfg;
+
+ /*!\endcond */
+ /*!
+ * stats_in buffer contains all of the stats packets produced in the first
+ * pass, concatenated.
+ */
+ aom_fixed_buf_t twopass_stats_in;
+ /*!\cond */
+
+ // Configuration related to encoder toolsets.
+ ToolCfg tool_cfg;
+
+ // Configuration related to Group of frames.
+ GFConfig gf_cfg;
+
+ // Tile related configuration parameters.
+ TileConfig tile_cfg;
+
+ // Configuration related to Tune.
+ TuneCfg tune_cfg;
+
+ // Configuration related to color.
+ ColorCfg color_cfg;
+
+ // Configuration related to decoder model.
+ DecoderModelCfg dec_model_cfg;
+
+ // Configuration related to reference frames.
+ RefFrameCfg ref_frm_cfg;
+
+ // Configuration related to unit tests.
+ UnitTestCfg unit_test_cfg;
+
+ // Flags related to motion mode.
+ MotionModeCfg motion_mode_cfg;
+
+ // Flags related to intra mode search.
+ IntraModeCfg intra_mode_cfg;
+
+ // Flags related to transform size/type.
+ TxfmSizeTypeCfg txfm_cfg;
+
+ // Flags related to compound type.
+ CompoundTypeCfg comp_type_cfg;
+
+ // Partition related information.
+ PartitionCfg part_cfg;
+
+ // Configuration related to frequency of cost update.
+ CostUpdateFreq cost_upd_freq;
+
+#if CONFIG_DENOISE
+ // Indicates the noise level.
+ float noise_level;
+ // Indicates the denoiser's block size.
+ int noise_block_size;
+ // Indicates whether to apply denoising to the frame to be encoded.
+ int enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Noise sensitivity.
+ int noise_sensitivity;
+#endif
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
+
+ // Indicates the number of pixels off the edge of a reference frame we're
+ // allowed to go when forming an inter prediction.
+ int border_in_pixels;
+
+ // Indicates the maximum number of threads that may be used by the encoder.
+ int max_threads;
+
+ // Indicates the speed preset to be used.
+ int speed;
+
+ // Indicates the target sequence level index for each operating point(OP).
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+ // Indicates the bitstream profile to be used.
+ BITSTREAM_PROFILE profile;
+
+ /*!\endcond */
+ /*!
+ * Indicates the current encoder pass :
+ * AOM_RC_ONE_PASS = One pass encode,
+ * AOM_RC_FIRST_PASS = First pass of multiple-pass
+ * AOM_RC_SECOND_PASS = Second pass of multiple-pass
+ * AOM_RC_THIRD_PASS = Third pass of multiple-pass
+ */
+ enum aom_enc_pass pass;
+ /*!\cond */
+
+ // Total number of encoding passes.
+ int passes;
+
+ // The name of the second pass output file when passes > 2.
+ const char *two_pass_output;
+
+ // The name of the second pass log file when passes > 2.
+ const char *second_pass_log;
+
+ // Indicates if the encoding is GOOD or REALTIME.
+ MODE mode;
+
+ // Indicates if row-based multi-threading should be enabled or not.
+ bool row_mt;
+
+ // Indicates if frame parallel multi-threading should be enabled or not.
+ bool fp_mt;
+
+ // Indicates if 16bit frame buffers are to be used i.e., the content is >
+ // 8-bit.
+ bool use_highbitdepth;
+
+ // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
+ // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B
+ // format.
+ bool save_as_annexb;
+
+ // The path for partition stats reading and writing, used in the experiment
+ // CONFIG_PARTITION_SEARCH_ORDER.
+ const char *partition_info_path;
+
+ // The flag that indicates whether we use an external rate distribution to
+ // guide adaptive quantization. It requires --deltaq-mode=3. The rate
+ // distribution map file name is stored in |rate_distribution_info|.
+ unsigned int enable_rate_guide_deltaq;
+
+ // The input file of rate distribution information used in all intra mode
+ // to determine delta quantization.
+ const char *rate_distribution_info;
+
+ // Exit the encoder when it fails to encode to a given level.
+ int strict_level_conformance;
+
+ // Max depth for the GOP after a key frame.
+ int kf_max_pyr_height;
+
+ // A flag to control if we enable the superblock qp sweep for a given lambda.
+ int sb_qp_sweep;
+ /*!\endcond */
+} AV1EncoderConfig;
+
+/*!\cond */
+// Returns 1 when both qindex bounds (best_allowed_q and worst_allowed_q) of
+// the rate control config are zero, and 0 otherwise.
+static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) {
+ if (rc_cfg->best_allowed_q != 0) return 0;
+ return rc_cfg->worst_allowed_q == 0;
+}
+/*!\endcond */
+
+/*!
+ * \brief Encoder-side probabilities for pruning of various AV1 tools
+ */
+typedef struct {
+ /*!
+ * obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+ * the jth block size and the ith frame update type, averaged over past
+ * frames. If obmc_probs[i][j] < thresh, then OBMC search is pruned.
+ */
+ int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+ /*!
+ * warped_probs[i] is the probability of warped motion being the best motion
+ * mode for the ith frame update type, averaged over past frames. If
+ * warped_probs[i] < thresh, then warped motion search is pruned.
+ */
+ int warped_probs[FRAME_UPDATE_TYPES];
+
+ /*!
+ * tx_type_probs[i][j][k] is the probability of the kth tx_type being the best
+ * for the jth transform size and the ith frame update type, averaged over
+ * past frames. If tx_type_probs[i][j][k] < thresh, then transform search for
+ * that type is pruned.
+ */
+ int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
+
+ /*!
+ * switchable_interp_probs[i][j][k] is the probability of the kth
+ * interpolation filter being the best for the jth filter context and the ith
+ * frame update type, averaged over past frames. If
+ * switchable_interp_probs[i][j][k] < thresh, then interpolation filter search
+ * is pruned for that case.
+ */
+ int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FrameProbInfo;
+
+/*!\cond */
+
+// Per-symbol counters. Every field except switchable_interp is compiled in
+// only when CONFIG_ENTROPY_STATS is enabled.
+typedef struct FRAME_COUNTS {
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS
+ unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+ unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ unsigned int cfl_sign[CFL_JOINT_SIGNS];
+ unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+ unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+ unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_y_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int palette_uv_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][2];
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+ [2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+ unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+ unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+ unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+ unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+ unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+ unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+ unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [LEVEL_CONTEXTS][BR_CDF_SIZE];
+ unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+ unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+ unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+ unsigned int obmc[BLOCK_SIZES_ALL][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+ unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+ unsigned int intrabc[2];
+
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+ unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+ unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+ unsigned int skip_txfm[SKIP_CONTEXTS][2];
+ unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+ unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+ unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+ unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+ unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+ unsigned int wiener_restore[2];
+ unsigned int sgrproj_restore[2];
+#endif // CONFIG_ENTROPY_STATS
+
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+// State for an inter-mode RD model: a ready flag, two parameters (a, b), a
+// sample count (num) and matching *_mean / *_sum accumulators.
+// NOTE(review): presumably a and b are fitted from the accumulated dist/ld/sse
+// statistics once enough samples exist -- confirm against the code that
+// updates this struct.
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+// Pairs an index (idx) with an rd value (rd).
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure what is
+// the maximum number of modes.
+#define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
+typedef struct inter_modes_info {
+ /*!
+ * The number of inter modes for which data was stored in each of the
+ * following arrays.
+ */
+ int num;
+ /*!
+ * Mode info struct for each of the candidate modes.
+ */
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ /*!
+ * The rate for each of the candidate modes.
+ */
+ int mode_rate_arr[MAX_INTER_MODES];
+ /*!
+ * The sse of the predictor for each of the candidate modes.
+ */
+ int64_t sse_arr[MAX_INTER_MODES];
+ /*!
+ * The estimated rd of the predictor for each of the candidate modes.
+ */
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ /*!
+ * The rd value and mode index pair (RdIdxPair) for each of the candidate modes.
+ */
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats for each of the candidate modes.
+ */
+ RD_STATS rd_cost_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of luma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of chroma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
+} InterModesInfo;
+
+/*!\cond */
+typedef struct {  // Leaf accumulator of the variance-partitioning tree (VPVariance, VP4x4).
+ // TODO(kyslov): consider changing to 64bit
+
+ // This struct is used for computing variance in choose_partitioning(), where
+ // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+ // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32
+ // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;  // presumably log2 of the sample count behind the sums - TODO confirm
+ int variance;
+} VPartVar;
+
+typedef struct {  // Variance entries for one tree node: one unsplit, two horz, two vert.
+ VPartVar none;
+ VPartVar horz[2];
+ VPartVar vert[2];
+} VPVariance;
+
+typedef struct {  // Lowest tree level: its four children are plain VPartVar accumulators.
+ VPVariance part_variances;
+ VPartVar split[4];
+} VP4x4;
+
+typedef struct {  // Tree level whose four children are VP4x4 nodes.
+ VPVariance part_variances;
+ VP4x4 split[4];
+} VP8x8;
+
+typedef struct {  // Tree level whose four children are VP8x8 nodes.
+ VPVariance part_variances;
+ VP8x8 split[4];
+} VP16x16;
+
+typedef struct {  // Tree level whose four children are VP16x16 nodes.
+ VPVariance part_variances;
+ VP16x16 split[4];
+} VP32x32;
+
+typedef struct {  // Tree level whose four children are VP32x32 nodes.
+ VPVariance part_variances;
+ VP32x32 split[4];
+} VP64x64;
+
+typedef struct {  // Top tree level; children are reached through a pointer.
+ VPVariance part_variances;
+ VP64x64 *split;  // pointer, not an embedded array as in the smaller levels; owner not visible here
+} VP128x128;
+
+/*!\endcond */
+
+/*!
+ * \brief Thresholds for variance based partitioning.
+ */
+typedef struct {
+ /*!
+ * If block variance > threshold, then that block is forced to split.
+ * thresholds[0] - threshold for 128x128;
+ * thresholds[1] - threshold for 64x64;
+ * thresholds[2] - threshold for 32x32;
+ * thresholds[3] - threshold for 16x16;
+ * thresholds[4] - threshold for 8x8;
+ */
+ int64_t thresholds[5];
+
+ /*!
+ * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+ * minmax > threshold_minmax, the 16x16 is forced to split.
+ */
+ int64_t threshold_minmax;
+} VarBasedPartitionInfo;
+
+/*!
+ * \brief Encoder parameters for synchronization of row based multi-threading
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /**
+ * \name Synchronization objects for top-right dependency.
+ */
+ /**@{*/
+ pthread_mutex_t *mutex_; /*!< Mutex lock object */
+ pthread_cond_t *cond_; /*!< Condition variable */
+ /**@}*/
+#endif // CONFIG_MULTITHREAD
+ /*!
+ * Buffer to store the superblock whose encoding is complete.
+ * num_finished_cols[i] stores the number of superblocks which finished
+ * encoding in the ith superblock row.
+ */
+ int *num_finished_cols;
+ /*!
+ * Denotes the superblock interval at which conditional signalling should
+ * happen. Also denotes the minimum number of extra superblocks of the top row
+ * to be complete to start encoding the current superblock. A value of 1
+ * indicates top-right dependency.
+ */
+ int sync_range;
+ /*!
+ * Denotes the additional number of superblocks in the previous row to be
+ * complete to start encoding the current superblock when intraBC tool is
+ * enabled. This additional top-right delay is required to satisfy the
+ * hardware constraints for intraBC tool when row multithreading is enabled.
+ */
+ int intrabc_extra_top_right_sb_delay;
+ /*!
+ * Number of superblock rows.
+ */
+ int rows;
+ /*!
+ * The superblock row (in units of MI blocks) to be processed next.
+ */
+ int next_mi_row;
+ /*!
+ * Number of threads processing the current tile.
+ */
+ int num_threads_working;
+} AV1EncRowMultiThreadSync;
+
+/*!\cond */
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {  // Encoder-side data kept per tile.
+ TileInfo tile_info;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);  // embedded FRAME_CONTEXT, 16-byte aligned
+ FRAME_CONTEXT *row_ctx;  // presumably per-row contexts for row-mt - TODO confirm
+ uint64_t abs_sum_level;
+ uint8_t allow_update_cdf;
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];  // one model per block size
+ AV1EncRowMultiThreadSync row_mt_sync;  // row based multi-threading synchronization data
+ MV firstpass_top_mv;
+} TileDataEnc;
+
+typedef struct RD_COUNTS {  // Usage flags and counters; ThreadData embeds one as rd_counts.
+ int compound_ref_used_flag;
+ int skip_mode_used_flag;
+ int tx_type_used[TX_SIZES_ALL][TX_TYPES];  // indexed [tx size][tx type]
+ int obmc_used[BLOCK_SIZES_ALL][2];  // indexed [block size][0/1]
+ int warped_used[2];
+ int newmv_or_intra_blocks;
+ uint64_t seg_tmp_pred_cost[2];
+} RD_COUNTS;
+
+typedef struct ThreadData {  // Encoder working state; the name suggests one instance per thread.
+ MACROBLOCK mb;
+ MvCosts *mv_costs_alloc;
+ IntraBCMVCosts *dv_costs_alloc;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+ PC_TREE_SHARED_BUFFERS shared_coeff_buf;
+ SIMPLE_MOTION_DATA_TREE *sms_tree;
+ SIMPLE_MOTION_DATA_TREE *sms_root;
+ uint32_t *hash_value_buffer[2][2];
+ OBMCBuffer obmc_buffer;
+ PALETTE_BUFFER *palette_buffer;
+ CompoundTypeRdBuffers comp_rd_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint64_t abs_sum_level;
+ uint8_t *tmp_pred_bufs[2];
+ uint8_t *wiener_tmp_pred_buf;
+ int intrabc_used;
+ int deltaq_used;
+ int coefficient_size;
+ int max_mv_magnitude;
+ int interp_filter_selected[SWITCHABLE];
+ FRAME_CONTEXT *tctx;
+ VP64x64 *vt64x64;  // variance-partitioning tree node(s) at the 64x64 level
+ int32_t num_64x64_blocks;  // presumably the element count of vt64x64 - TODO confirm
+ PICK_MODE_CONTEXT *firstpass_ctx;
+ TemporalFilterData tf_data;
+ TplBuffers tpl_tmp_buffers;
+ TplTxfmStats tpl_txfm_stats;
+ GlobalMotionData gm_data;
+ // Pointer to the array of structures to store gradient information of each
+ // pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel level
+ // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ PixelLevelGradientInfo *pixel_gradient_info;
+ // Pointer to the array of structures to store source variance information of
+ // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ // store source variance and log of source variance of each 4x4 sub-block
+ // for subsequent retrieval.
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+ // Pointer to pc tree root.
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder data related to row-based multi-threading
+ */
+typedef struct {
+ /*!
+ * Number of tile rows for which row synchronization memory is allocated.
+ */
+ int allocated_tile_rows;
+ /*!
+ * Number of tile cols for which row synchronization memory is allocated.
+ */
+ int allocated_tile_cols;
+ /*!
+ * Number of rows for which row synchronization memory is allocated
+ * per tile. During first-pass/look-ahead stage this equals the
+ * maximum number of macroblock rows in a tile. During encode stage,
+ * this equals the maximum number of superblock rows in a tile.
+ */
+ int allocated_rows;
+ /*!
+ * Number of columns for which entropy context memory is allocated
+ * per tile. During encode stage, this equals the maximum number of
+ * superblock columns in a tile minus 1. The entropy context memory
+ * is not allocated during first-pass/look-ahead stage.
+ */
+ int allocated_cols;
+
+ /*!
+ * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread.
+ */
+ int thread_id_to_tile_id[MAX_NUM_THREADS];
+
+ /*!
+ * num_tile_cols_done[i] indicates the number of tile columns whose encoding
+ * is complete in the ith superblock row.
+ */
+ int *num_tile_cols_done;
+
+ /*!
+ * Number of superblock rows in a frame for which 'num_tile_cols_done' is
+ * allocated.
+ */
+ int allocated_sb_rows;
+
+ /*!
+ * Initialized to false, set to true by the worker thread that encounters an
+ * error in order to abort the processing of other worker threads.
+ */
+ bool row_mt_exit;
+
+ /*!
+ * Initialized to false, set to true during first pass encoding by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool firstpass_mt_exit;
+
+ /*!
+ * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool mb_wiener_mt_exit;
+
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers.
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Encoder data related to multi-threading for allintra deltaq-mode=3
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers for all intra mode
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncAllIntraMultiThreadInfo;
+
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+ /*!
+ * Backup of original CDEF srcbuf.
+ */
+ uint16_t *cdef_srcbuf;
+
+ /*!
+ * Backup of original CDEF colbuf.
+ */
+ uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+ /*!
+ * Backup of original LR rst_tmpbuf.
+ */
+ int32_t *rst_tmpbuf;
+
+ /*!
+ * Backup of original LR rlbs.
+ */
+ RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+
+/*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+ /*!
+ * Stores the best coefficients for Wiener restoration.
+ */
+ WienerInfo wiener;
+
+ /*!
+ * Stores the best coefficients for Sgrproj restoration.
+ */
+ SgrprojInfo sgrproj;
+
+ /*!
+ * The rtype to use for this unit given a frame rtype as index. Indices:
+ * WIENER, SGRPROJ, SWITCHABLE.
+ */
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold search parameter per restoration unit and
+ * intermediate buffer of Wiener filter used in pick filter stage of Loop
+ * restoration.
+ */
+typedef struct {
+ /*!
+ * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+ * restoration types.
+ */
+ RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+ /*!
+ * Buffer used to hold dgd-avg data during SIMD call of Wiener filter.
+ */
+ int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * CDEF row multi-threading data.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Primary(Level 1) Synchronization object used to launch job in the worker
+ * thread.
+ */
+ AVxWorker *p_workers[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Number of primary workers created for multi-threading.
+ */
+ int p_num_workers;
+
+ /*!
+ * Tracks the number of workers in encode stage multi-threading.
+ */
+ int prev_num_enc_workers;
+} PrimaryMultiThreadInfo;
+
+/*!
+ * \brief Encoder parameters related to multi-threading.
+ */
+typedef struct MultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * When set, indicates that row based multi-threading of the encoder is
+ * enabled.
+ */
+ bool row_mt_enabled;
+
+ /*!
+ * When set, indicates that multi-threading for bitstream packing is enabled.
+ */
+ bool pack_bs_mt_enabled;
+
+ /*!
+ * Encoder row multi-threading data.
+ */
+ AV1EncRowMultiThreadInfo enc_row_mt;
+
+ /*!
+ * Encoder multi-threading data for allintra mode in the preprocessing stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncAllIntraMultiThreadInfo intra_mt;
+
+ /*!
+ * Tpl row multi-threading data.
+ */
+ AV1TplRowMultiThreadInfo tpl_row_mt;
+
+ /*!
+ * Loop Filter multi-threading object.
+ */
+ AV1LfSync lf_row_sync;
+
+ /*!
+ * Loop Restoration multi-threading object.
+ */
+ AV1LrSync lr_row_sync;
+
+ /*!
+ * Pack bitstream multi-threading object.
+ */
+ AV1EncPackBSSync pack_bs_sync;
+
+ /*!
+ * Global Motion multi-threading object.
+ */
+ AV1GlobalMotionSync gm_sync;
+
+ /*!
+ * Temporal Filter multi-threading object.
+ */
+ AV1TemporalFilterSync tf_sync;
+
+ /*!
+ * CDEF search multi-threading object.
+ */
+ AV1CdefSync cdef_sync;
+
+ /*!
+ * Pointer to CDEF row multi-threading data for the frame.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Buffers to be stored/restored before/after parallel encode.
+ */
+ RestoreStateBuffers restore_state_buf;
+
+ /*!
+ * In multi-threaded realtime encoding with row-mt enabled, pipeline
+ * loop-filtering after encoding.
+ */
+ int pipeline_lpf_mt_with_enc;
+} MultiThreadInfo;
+
+/*!\cond */
+
+typedef struct ActiveMap {
+ int enabled;  // presumably nonzero while the map is in use - TODO confirm
+ int update;  // NOTE(review): meaning not visible in this header - document at the writer
+ unsigned char *map;  // map storage; layout and ownership are not visible here
+} ActiveMap;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder info used for decision on forcing integer motion vectors.
+ */
+typedef struct {
+ /*!
+ * cs_rate_array[i] is the fraction of blocks in a frame which either match
+ * with the collocated block or are smooth, where i is the rate_index.
+ */
+ double cs_rate_array[32];
+ /*!
+ * rate_index is used to index cs_rate_array.
+ */
+ int rate_index;
+ /*!
+ * rate_size is the total number of entries populated in cs_rate_array.
+ */
+ int rate_size;
+} ForceIntegerMVInfo;
+
+/*!\cond */
+
+#if CONFIG_INTERNAL_STATS
+// types of stats
+enum {  // Indexes ImageStat::stat, which is sized by NUM_STAT_TYPES.
+ STAT_Y,
+ STAT_U,
+ STAT_V,
+ STAT_ALL,
+ NUM_STAT_TYPES // This should always be the last member of the enum
+} UENUM1BYTE(StatType);
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];  // one value per StatType (STAT_Y, STAT_U, STAT_V, STAT_ALL)
+ double worst;  // presumably the worst value observed so far - TODO confirm
+} ImageStat;
+#endif // CONFIG_INTERNAL_STATS
+
+typedef struct {  // A YV12 frame buffer paired with a reference count.
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+/*!\endcond */
+
+/*!
+ * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level
+ *
+ * This is used for bitstream preparation.
+ */
+typedef struct {
+ /*!
+ * frame_base[mi_row * stride + mi_col] stores the mode information of
+ * block (mi_row,mi_col).
+ */
+ MB_MODE_INFO_EXT_FRAME *frame_base;
+ /*!
+ * Size of frame_base buffer.
+ */
+ int alloc_size;
+ /*!
+ * Stride of frame_base buffer.
+ */
+ int stride;
+} MBMIExtFrameBufferInfo;
+
+/*!\cond */
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct FramePartitionTimingStats {  // Only built when CONFIG_COLLECT_PARTITION_STATS is set.
+ int partition_decisions[6][EXT_PARTITION_TYPES];  // NOTE(review): meaning of the leading 6 is not visible here
+ int partition_attempts[6][EXT_PARTITION_TYPES];  // same [6][partition type] shape as above
+ int64_t partition_times[6][EXT_PARTITION_TYPES];  // same shape; time unit not visible here
+
+ int partition_redo;
+} FramePartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "aom_ports/aom_timer.h"
+// Adjust the following to add new components.
+enum {  // Every component listed here needs a matching name in get_component_name().
+ av1_encode_strategy_time,
+ av1_get_one_pass_rt_params_time,
+ av1_get_second_pass_params_time,
+ denoise_and_encode_time,
+ apply_filtering_time,
+ av1_tpl_setup_stats_time,
+ encode_frame_to_data_rate_time,
+ encode_with_or_without_recode_time,
+ loop_filter_time,
+ cdef_time,
+ loop_restoration_time,
+ av1_pack_bitstream_final_time,
+ av1_encode_frame_time,
+ av1_compute_global_motion_time,
+ av1_setup_motion_field_time,
+ encode_sb_row_time,
+
+ rd_pick_partition_time,
+ rd_use_partition_time,
+ choose_var_based_partitioning_time,
+ av1_prune_partitions_time,
+ none_partition_search_time,
+ split_partition_search_time,
+ rectangular_partition_search_time,
+ ab_partitions_search_time,
+ rd_pick_4partition_time,
+ encode_sb_time,
+
+ rd_pick_sb_modes_time,
+ av1_rd_pick_intra_mode_sb_time,
+ av1_rd_pick_inter_mode_sb_time,
+ set_params_rd_pick_inter_mode_time,
+ skip_inter_mode_time,
+ handle_inter_mode_time,
+ evaluate_motion_mode_for_winner_candidates_time,
+ do_tx_search_time,
+ handle_intra_mode_time,
+ refine_winner_mode_tx_time,
+ av1_search_palette_mode_time,
+ handle_newmv_time,
+ compound_type_rd_time,
+ interpolation_filter_search_time,
+ motion_mode_rd_time,
+
+ nonrd_use_partition_time,
+ pick_sb_modes_nonrd_time,
+ hybrid_intra_mode_search_time,
+ nonrd_pick_inter_mode_sb_time,
+ encode_b_nonrd_time,
+
+ kTimingComponents,  // presumably the component count; get_component_name() has no name for it - keep last
+} UENUM1BYTE(TIMING_COMPONENT);
+
+// Returns the identifier of a TIMING_COMPONENT value spelled as a string.
+// A value with no entry in the table below (kTimingComponents included) trips
+// assert(0); when asserts are compiled out the function returns "error".
+static INLINE char const *get_component_name(int index) {
+  // Id/name pairs, listed in the same order as the TIMING_COMPONENT enum.
+  static const struct {
+    int component;
+    char const *name;
+  } kComponentNames[] = {
+    { av1_encode_strategy_time, "av1_encode_strategy_time" },
+    { av1_get_one_pass_rt_params_time, "av1_get_one_pass_rt_params_time" },
+    { av1_get_second_pass_params_time, "av1_get_second_pass_params_time" },
+    { denoise_and_encode_time, "denoise_and_encode_time" },
+    { apply_filtering_time, "apply_filtering_time" },
+    { av1_tpl_setup_stats_time, "av1_tpl_setup_stats_time" },
+    { encode_frame_to_data_rate_time, "encode_frame_to_data_rate_time" },
+    { encode_with_or_without_recode_time,
+      "encode_with_or_without_recode_time" },
+    { loop_filter_time, "loop_filter_time" },
+    { cdef_time, "cdef_time" },
+    { loop_restoration_time, "loop_restoration_time" },
+    { av1_pack_bitstream_final_time, "av1_pack_bitstream_final_time" },
+    { av1_encode_frame_time, "av1_encode_frame_time" },
+    { av1_compute_global_motion_time, "av1_compute_global_motion_time" },
+    { av1_setup_motion_field_time, "av1_setup_motion_field_time" },
+    { encode_sb_row_time, "encode_sb_row_time" },
+
+    { rd_pick_partition_time, "rd_pick_partition_time" },
+    { rd_use_partition_time, "rd_use_partition_time" },
+    { choose_var_based_partitioning_time,
+      "choose_var_based_partitioning_time" },
+    { av1_prune_partitions_time, "av1_prune_partitions_time" },
+    { none_partition_search_time, "none_partition_search_time" },
+    { split_partition_search_time, "split_partition_search_time" },
+    { rectangular_partition_search_time, "rectangular_partition_search_time" },
+    { ab_partitions_search_time, "ab_partitions_search_time" },
+    { rd_pick_4partition_time, "rd_pick_4partition_time" },
+    { encode_sb_time, "encode_sb_time" },
+
+    { rd_pick_sb_modes_time, "rd_pick_sb_modes_time" },
+    { av1_rd_pick_intra_mode_sb_time, "av1_rd_pick_intra_mode_sb_time" },
+    { av1_rd_pick_inter_mode_sb_time, "av1_rd_pick_inter_mode_sb_time" },
+    { set_params_rd_pick_inter_mode_time,
+      "set_params_rd_pick_inter_mode_time" },
+    { skip_inter_mode_time, "skip_inter_mode_time" },
+    { handle_inter_mode_time, "handle_inter_mode_time" },
+    { evaluate_motion_mode_for_winner_candidates_time,
+      "evaluate_motion_mode_for_winner_candidates_time" },
+    { do_tx_search_time, "do_tx_search_time" },
+    { handle_intra_mode_time, "handle_intra_mode_time" },
+    { refine_winner_mode_tx_time, "refine_winner_mode_tx_time" },
+    { av1_search_palette_mode_time, "av1_search_palette_mode_time" },
+    { handle_newmv_time, "handle_newmv_time" },
+    { compound_type_rd_time, "compound_type_rd_time" },
+    { interpolation_filter_search_time, "interpolation_filter_search_time" },
+    { motion_mode_rd_time, "motion_mode_rd_time" },
+
+    { nonrd_use_partition_time, "nonrd_use_partition_time" },
+    { pick_sb_modes_nonrd_time, "pick_sb_modes_nonrd_time" },
+    { hybrid_intra_mode_search_time, "hybrid_intra_mode_search_time" },
+    { nonrd_pick_inter_mode_sb_time, "nonrd_pick_inter_mode_sb_time" },
+    { encode_b_nonrd_time, "encode_b_nonrd_time" },
+  };
+  const int num_names =
+      (int)(sizeof(kComponentNames) / sizeof(kComponentNames[0]));
+
+  for (int i = 0; i < num_names; ++i) {
+    if (kComponentNames[i].component == index) return kComponentNames[i].name;
+  }
+
+  // Unknown component: same fallback as an unmatched switch default.
+  assert(0);
+  return "error";
+}
+#endif
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to global motion search
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if global motion search needs to be rerun.
+ * NOTE(review): the name suggests that true means the search has already
+ * been done; confirm the polarity against the code that reads this flag.
+ */
+ bool search_done;
+
+ /*!
+ * Array of pointers to the frame buffers holding the reference frames.
+ * ref_buf[i] stores the pointer to the reference frame of the ith
+ * reference frame type.
+ */
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+
+ /*!
+ * Holds the number of valid reference frames in past and future directions
+ * w.r.t. the current frame. num_ref_frames[i] stores the total number of
+ * valid reference frames in 'i' direction.
+ */
+ int num_ref_frames[MAX_DIRECTIONS];
+
+ /*!
+ * Array of structure which stores the valid reference frames in past and
+ * future directions and their corresponding distance from the source frame.
+ * reference_frames[i][j] holds the jth valid reference frame type in the
+ * direction 'i' and its temporal distance from the source frame.
+ */
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+ /**
+ * \name Dimensions for which segment map is allocated.
+ */
+ /**@{*/
+ int segment_map_w; /*!< segment map width */
+ int segment_map_h; /*!< segment map height */
+ /**@}*/
+} GlobalMotionInfo;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+ /*!
+ * Stores the default value of skip flag depending on chroma format.
+ * Set as 1 for monochrome and 3 for other color formats.
+ */
+ int default_interp_skip_flags;
+ /*!
+ * Filter mask to allow certain interp_filter type.
+ */
+ uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+/*!
+ * \brief Parameters for motion vector search process
+ */
+typedef struct {
+ /*!
+ * Largest MV component used in a frame.
+ * The value from the previous frame is used to set the full pixel search
+ * range for the current frame.
+ */
+ int max_mv_magnitude;
+ /*!
+ * Parameter indicating initial search window to be used in full-pixel search.
+ * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+ */
+ int mv_step_param;
+ /*!
+ * Pointer to sub-pixel search function.
+ * In encoder: av1_find_best_sub_pixel_tree
+ * av1_find_best_sub_pixel_tree_pruned
+ * av1_find_best_sub_pixel_tree_pruned_more
+ * In MV unit test: av1_return_max_sub_pixel_mv
+ * av1_return_min_sub_pixel_mv
+ */
+ fractional_mv_step_fp *find_fractional_mv_step;
+ /*!
+ * Search site configuration for full-pel MV search.
+ * - search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop,
+ *   simple motion search.
+ * - search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter.
+ * - search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead.
+ */
+ search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
+} MotionVectorSearchParams;
+
+/*!
+ * \brief Refresh frame flags for different type of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+ int width; /*!< Desired resized width */
+ int height; /*!< Desired resized height */
+} ResizePendingParams;
+
+/*!
+ * \brief Reference frame distance related variables.
+ */
+typedef struct {
+ /*!
+ * True relative distance of reference frames w.r.t. the current frame.
+ */
+ int ref_relative_dist[INTER_REFS_PER_FRAME];
+ /*!
+ * The nearest reference w.r.t. current frame in the past.
+ */
+ int8_t nearest_past_ref;
+ /*!
+ * The nearest reference w.r.t. current frame in the future.
+ */
+ int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the "winner mode". In the second pass, we perform a more thorough tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable
+ * (Eg : IntraBc).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation
+ * Index 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+ /*!
+ * Threshold to determine if trellis optimization is to be enabled
+ * based on :
+ * 0 : dist threshold
+ * 1 : satd threshold
+ * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+ */
+ unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
+
+ /*!
+ * Determines the tx size search method during rdopt.
+ * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+ */
+ TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should approximate prediction error with tx
+ * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+ * search only. If 2, then always.
+ * Corresponds to tx_domain_dist_level speed feature.
+ */
+ unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+ /*!
+ * Threshold to approximate pixel domain distortion with transform domain
+ * distortion. This is only used if use_transform_domain_distortion is on.
+ * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+ */
+ unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should try to skip the transform process based on
+ * result from dct.
+ * Corresponds to use_skip_flag_prediction speed feature.
+ */
+ unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+ /*!
+ * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+ * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+ */
+ unsigned int predict_dc_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
+typedef struct {
+ bool last_frame; /*!< Refresh flag for last frame */
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+ /*!
+ * Flag indicating if the update of refresh frame flags is pending.
+ */
+ bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+ /*!
+ * Bit mask to disable certain reference frame types.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Frame refresh flags set by the external interface.
+ */
+ ExtRefreshFrameFlagsInfo refresh_frame;
+
+ /*!
+ * Flag to enable the update of frame contexts at the end of a frame decode.
+ */
+ bool refresh_frame_context;
+
+ /*!
+ * Flag to indicate that update of refresh_frame_context from external
+ * interface is pending.
+ */
+ bool refresh_frame_context_pending;
+
+ /*!
+ * Flag to enable temporal MV prediction.
+ */
+ bool use_ref_frame_mvs;
+
+ /*!
+ * Indicates whether the current frame is to be coded as error resilient.
+ */
+ bool use_error_resilient;
+
+ /*!
+ * Indicates whether the current frame is to be coded as s-frame.
+ */
+ bool use_s_frame;
+
+ /*!
+ * Indicates whether the current frame's primary_ref_frame is set to
+ * PRIMARY_REF_NONE.
+ */
+ bool use_primary_ref_none;
+} ExternalFlags;
+
+/*!\cond */
+
+typedef struct {  // Motion-vector statistics; CODING_CONTEXT stores one as mv_stats.
+ // Some misc info
+ int high_prec;
+ int q;
+ int order;
+
+ // MV counters
+ int inter_count;
+ int intra_count;
+ int default_mvs;
+ int mv_joint_count[4];  // NOTE(review): the 4 presumably matches the number of MV joint types - confirm
+ int last_bit_zero;
+ int last_bit_nonzero;
+
+ // Keep track of the rates
+ int total_mv_rate;
+ int hp_total_mv_rate;
+ int lp_total_mv_rate;
+
+ // Texture info
+ int horz_text;
+ int vert_text;
+ int diag_text;
+
+ // Whether the current struct contains valid data
+ int valid;
+} MV_STATS;
+
+typedef struct WeberStats {  // NOTE(review): presumably one entry per macroblock (see mb_wiener_variance) - confirm
+ int64_t mb_wiener_variance;
+ int64_t src_variance;  // src_/rec_ presumably mean source / reconstruction - TODO confirm
+ int64_t rec_variance;
+ int16_t src_pix_max;
+ int16_t rec_pix_max;
+ int64_t distortion;
+ int64_t satd;
+ double max_scale;
+} WeberStats;
+
+typedef struct {  // Bundles loop-filter, CDEF, rate-control and MV-stat state with a frame buffer.
+ struct loopfilter lf;
+ CdefInfo cdef_info;
+ YV12_BUFFER_CONFIG copy_buffer;
+ RATE_CONTROL rc;
+ MV_STATS mv_stats;
+} CODING_CONTEXT;
+
+typedef struct {  // Frame dimensions in several units, plus bit depth and chroma subsampling.
+ int frame_width;
+ int frame_height;
+ int mi_rows;  // presumably the frame height in mode-info units - TODO confirm
+ int mi_cols;  // presumably the frame width in mode-info units - TODO confirm
+ int mb_rows;
+ int mb_cols;
+ int num_mbs;  // presumably mb_rows * mb_cols - TODO confirm
+ aom_bit_depth_t bit_depth;
+ int subsampling_x;
+ int subsampling_y;
+} FRAME_INFO;
+
+/*!
+ * \brief This structure stores different types of frame indices.
+ */
+typedef struct {
+ int show_frame_count;  // presumably a running count of shown frames - TODO confirm
+} FRAME_INDEX_SET;
+
+/*!\endcond */
+
+/*!
+ * \brief Segmentation related information for the current frame.
+ */
+typedef struct {
+ /*!
+ * 3-bit number containing the segment affiliation for each 4x4 block in the
+ * frame. map[y * stride + x] contains the segment id of the 4x4 block at
+ * (x,y) position.
+ */
+ uint8_t *map;
+ /*!
+ * Flag to indicate if current frame has lossless segments or not.
+ * 1: frame has at least one lossless segment.
+ * 0: frame has no lossless segments.
+ */
+ bool has_lossless_segment;
+} EncSegmentationInfo;
+
+/*!
+ * \brief Frame time stamps.
+ */
+typedef struct {
+ /*!
+ * Start time stamp of the previous frame
+ */
+ int64_t prev_ts_start;
+ /*!
+ * End time stamp of the previous frame
+ */
+ int64_t prev_ts_end;
+ /*!
+ * Start time stamp of the first frame
+ */
+ int64_t first_ts_start;
+} TimeStamps;
+
+/*!
+ * Pointers to the memory allocated for frame level transform coeff related
+ * info.
+ */
+typedef struct {
+ /*!
+ * Pointer to the transformed coefficients buffer.
+ */
+ tran_low_t *tcoeff;
+ /*!
+ * Pointer to the eobs buffer.
+ */
+ uint16_t *eobs;
+ /*!
+ * Pointer to the entropy_ctx buffer.
+ */
+ uint8_t *entropy_ctx;
+} CoeffBufferPool;
+
+#if !CONFIG_REALTIME_ONLY
+/*!\cond */
+// DUCKY_ENCODE_FRAME_MODE is the C version of EncodeFrameMode
+enum {
+ DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1
+ // determines rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and
+ // rdmult
+} UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE);
+
+enum {
+ DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP
+ DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP
+} UENUM1BYTE(DUCKY_ENCODE_GOP_MODE);
+
+typedef struct DuckyEncodeFrameInfo {  // Per-frame controls; held in DuckyEncodeInfo::frame_info.
+ DUCKY_ENCODE_FRAME_MODE qp_mode;  // Who determines q index / rdmult (see DUCKY_ENCODE_FRAME_MODE).
+ DUCKY_ENCODE_GOP_MODE gop_mode;  // Who decides the GOP (see DUCKY_ENCODE_GOP_MODE).
+ int q_index;
+ int rdmult;
+ // These two arrays are equivalent to std::vector<SuperblockEncodeParameters>
+ int *superblock_encode_qindex;
+ int *superblock_encode_rdmult;
+ int delta_q_enabled;
+} DuckyEncodeFrameInfo;
+
+typedef struct DuckyEncodeFrameResult {  // Per-frame result; held in DuckyEncodeInfo::frame_result.
+ int global_order_idx;
+ int q_index;
+ int rdmult;
+ int rate;  // units not visible here -- TODO confirm (bits vs. bytes)
+ int64_t dist;
+ double psnr;
+} DuckyEncodeFrameResult;
+
+typedef struct DuckyEncodeInfo {  // Stored as AV1_COMP::ducky_encode_info (only when !CONFIG_REALTIME_ONLY).
+ DuckyEncodeFrameInfo frame_info;  // per-frame controls (q index, rdmult, GOP mode)
+ DuckyEncodeFrameResult frame_result;  // per-frame results (rate, dist, psnr)
+} DuckyEncodeInfo;
+/*!\endcond */
+#endif
+
+/*!\cond */
+typedef struct RTC_REF {  // Reference structure for RTC; stored as AV1_PRIMARY::rtc_ref.
+ /*!
+ * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ */
+ int reference[INTER_REFS_PER_FRAME];
+ int ref_idx[INTER_REFS_PER_FRAME];  // presumably the buffer slot used by each reference -- TODO confirm
+ int refresh[REF_FRAMES];  // one entry per reference buffer slot
+ int set_ref_frame_config;
+ int non_reference_frame;
+ int ref_frame_comp[3];
+ int gld_idx_1layer;
+ /*!
+ * Frame number of the last frame that refreshed the buffer slot.
+ */
+ unsigned int buffer_time_index[REF_FRAMES];
+ /*!
+ * Spatial layer id of the last frame that refreshed the buffer slot.
+ */
+ unsigned char buffer_spatial_layer[REF_FRAMES];
+ /*!
+ * Flag to indicate whether closest reference was the previous frame.
+ */
+ bool reference_was_previous_frame;
+ /*!
+ * Flag to indicate this frame is based on longer term reference only,
+ * for recovery from past loss, and it should be biased for improved coding.
+ */
+ bool bias_recovery_frame;
+} RTC_REF;
+/*!\endcond */
+
+/*!
+ * \brief Structure to hold data corresponding to an encoded frame.
+ */
+typedef struct AV1_COMP_DATA {
+ /*!
+ * Buffer to store packed bitstream data of a frame.
+ */
+ unsigned char *cx_data;
+
+ /*!
+ * Allocated size of the cx_data buffer.
+ */
+ size_t cx_data_sz;
+
+ /*!
+ * Size of data written in the cx_data buffer.
+ */
+ size_t frame_size;
+
+ /*!
+ * Flags for the frame.
+ */
+ unsigned int lib_flags;
+
+ /*!
+ * Time stamp for start of frame.
+ */
+ int64_t ts_frame_start;
+
+ /*!
+ * Time stamp for end of frame.
+ */
+ int64_t ts_frame_end;
+
+ /*!
+ * Flag to indicate flush call.
+ */
+ int flush;
+
+ /*!
+ * Time base for sequence.
+ */
+ const aom_rational64_t *timestamp_ratio;
+
+ /*!
+ * Decide to pop the source for this frame from input buffer queue.
+ */
+ int pop_lookahead;
+
+ /*!
+ * Display order hint of frame whose packed data is in cx_data buffer.
+ */
+ int frame_display_order_hint;
+} AV1_COMP_DATA;
+
+/*!
+ * \brief Top level primary encoder structure
+ */
+typedef struct AV1_PRIMARY {
+ /*!
+ * Array of frame level encoder stage top level structures
+ */
+ struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Array of structures to hold data of frames encoded in a given parallel
+ * encode set.
+ */
+ struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1];
+#if CONFIG_FPMT_TEST
+ /*!
+ * Flag which enables/disables simulation path for fpmt unit test.
+ * 0 - FPMT integration
+ * 1 - FPMT simulation
+ */
+ FPMT_TEST_ENC_CFG fpmt_unit_test_cfg;
+
+ /*!
+ * Temporary variable simulating the delayed frame_probability update.
+ */
+ FrameProbInfo temp_frame_probs;
+
+ /*!
+ * Temporary variable holding the updated frame probability across
+ * frames. Copy its value to temp_frame_probs for frame_parallel_level 0
+ * frames or last frame in parallel encode set.
+ */
+ FrameProbInfo temp_frame_probs_simulation;
+
+ /*!
+ * Temporary variable simulating the delayed update of valid global motion
+ * model across frames.
+ */
+ int temp_valid_gm_model_found[FRAME_UPDATE_TYPES];
+#endif // CONFIG_FPMT_TEST
+ /*!
+ * Copy of cm->ref_frame_map maintained to facilitate sequential update of
+ * ref_frame_map by lower layer depth frames encoded ahead of time in a
+ * parallel encode set.
+ */
+ RefCntBuffer *ref_frame_map_copy[REF_FRAMES];
+
+ /*!
+ * Start time stamp of the last encoded show frame
+ */
+ int64_t ts_start_last_show_frame;
+
+ /*!
+ * End time stamp of the last encoded show frame
+ */
+ int64_t ts_end_last_show_frame;
+
+ /*!
+ * Number of frame level contexts(cpis)
+ */
+ int num_fp_contexts;
+
+ /*!
+ * Loopfilter levels of the previous encoded frame.
+ */
+ int filter_level[2];
+
+ /*!
+ * U plane (chroma) loopfilter level of the previous encoded frame.
+ */
+ int filter_level_u;
+
+ /*!
+ * V plane (chroma) loopfilter level of the previous encoded frame.
+ */
+ int filter_level_v;
+
+ /*!
+ * Encode stage top level structure
+ * During frame parallel encode, this is the same as parallel_cpi[0]
+ */
+ struct AV1_COMP *cpi;
+
+ /*!
+ * Lookahead processing stage top level structure
+ */
+ struct AV1_COMP *cpi_lap;
+
+ /*!
+ * Look-ahead context.
+ */
+ struct lookahead_ctx *lookahead;
+
+ /*!
+ * Sequence parameters have been transmitted already and locked
+ * or not. Once locked av1_change_config cannot change the seq
+ * parameters.
+ */
+ int seq_params_locked;
+
+ /*!
+ * Pointer to the list of output packets (struct aom_codec_pkt_list), the
+ * data structure manipulated by the internal aom_codec_* utility functions.
+ */
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ /*!
+ * When set, indicates that internal ARFs are enabled.
+ */
+ int internal_altref_allowed;
+
+ /*!
+ * Tell if OVERLAY frame shows existing alt_ref frame.
+ */
+ int show_existing_alt_ref;
+
+ /*!
+ * Information related to a gf group.
+ */
+ GF_GROUP gf_group;
+
+ /*!
+ * Track prior gf group state.
+ */
+ GF_STATE gf_state;
+
+ /*!
+ * Flag indicating whether look ahead processing (LAP) is enabled.
+ */
+ int lap_enabled;
+
+ /*!
+ * Parameters for AV1 bitstream levels.
+ */
+ AV1LevelParams level_params;
+
+ /*!
+ * Calculates PSNR on each frame when set to 1.
+ */
+ int b_calculate_psnr;
+
+ /*!
+ * Number of frames left to be encoded, is 0 if limit is not set.
+ */
+ int frames_left;
+
+ /*!
+ * Information related to two pass encoding.
+ */
+ TWO_PASS twopass;
+
+ /*!
+ * Rate control related parameters.
+ */
+ PRIMARY_RATE_CONTROL p_rc;
+
+ /*!
+ * Info and resources used by temporal filtering.
+ */
+ TEMPORAL_FILTER_INFO tf_info;
+ /*!
+ * Elements part of the sequence header, that are applicable for all the
+ * frames in the video.
+ */
+ SequenceHeader seq_params;
+
+ /*!
+ * Indicates whether to use SVC.
+ */
+ int use_svc;
+
+ /*!
+ * If true, buffer removal times are present.
+ */
+ bool buffer_removal_time_present;
+
+ /*!
+ * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_temporal_layers;
+
+ /*!
+ * Number of spatial layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_spatial_layers;
+
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info error;
+
+ /*!
+ * Function pointers to variants of sse/sad/variance computation functions.
+ * fn_ptr[i] indicates the list of function pointers corresponding to block
+ * size i.
+ */
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+ /*!
+ * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_sb_rdmult_scaling_factors;
+
+ /*!
+ * Parameters related to tpl.
+ */
+ TplParams tpl_data;
+
+ /*!
+ * Motion vector stats of the previous encoded frame.
+ */
+ MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t total_time_receive_data;
+ uint64_t total_time_compress_data;
+
+ unsigned int total_mode_chosen_counts[MAX_MODES];
+
+ int count[2];
+ uint64_t total_sq_error[2];
+ uint64_t total_samples[2];
+ ImageStat psnr[2];
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int total_bytes;
+ double summed_quality;
+ double summed_weights;
+ double summed_quality_hbd;
+ double summed_weights_hbd;
+ unsigned int total_recode_hits;
+ double worst_ssim;
+ double worst_ssim_hbd;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+ /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+ /*!
+ * Aggregates frame counts for the sequence.
+ */
+ FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+ /*!
+ * For each type of reference frame, this contains the index of a reference
+ * frame buffer for a reference frame of the same type. We use this to
+ * choose our primary reference frame (which is the most recent reference
+ * frame of the same type as the current frame).
+ */
+ int fb_of_context_type[REF_FRAMES];
+
+ /*!
+ * Primary Multi-threading parameters.
+ */
+ PrimaryMultiThreadInfo p_mt_info;
+
+ /*!
+ * Probabilities for pruning of various AV1 tools.
+ */
+ FrameProbInfo frame_probs;
+
+ /*!
+ * Indicates if a valid global motion model has been found in the different
+ * frame update types of a GF group.
+ * valid_gm_model_found[i] indicates if valid global motion model has been
+ * found in the frame update type with enum value equal to i
+ */
+ int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+ /*!
+ * Struct for the reference structure for RTC.
+ */
+ RTC_REF rtc_ref;
+
+ /*!
+ * Struct for all intra mode row multi threading in the preprocess stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncRowMultiThreadSync intra_row_mt_sync;
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
+typedef struct AV1_COMP {
+ /*!
+ * Pointer to top level primary encoder structure
+ */
+ AV1_PRIMARY *ppi;
+
+ /*!
+ * Quantization and dequantization parameters for internal quantizer setup
+ * in the encoder.
+ */
+ EncQuantDequantParams enc_quant_dequant_params;
+
+ /*!
+ * Structure holding thread specific variables.
+ */
+ ThreadData td;
+
+ /*!
+ * Statistics collected at frame level.
+ */
+ FRAME_COUNTS counts;
+
+ /*!
+ * Holds buffer storing mode information at 4x4/8x8 level.
+ */
+ MBMIExtFrameBufferInfo mbmi_ext_info;
+
+ /*!
+ * Buffer holding the transform block related information.
+ * coeff_buffer_base[i] stores the transform block related information of the
+ * ith superblock in raster scan order.
+ */
+ CB_COEFF_BUFFER *coeff_buffer_base;
+
+ /*!
+ * Structure holding pointers to frame level memory allocated for transform
+ * block related information.
+ */
+ CoeffBufferPool coeff_buffer_pool;
+
+ /*!
+ * Structure holding variables common to encoder and decoder.
+ */
+ AV1_COMMON common;
+
+ /*!
+ * Encoder configuration related parameters.
+ */
+ AV1EncoderConfig oxcf;
+
+ /*!
+ * Stores the trellis optimization type at segment level.
+ * optimize_seg_arr[i] stores the trellis opt type for ith segment.
+ */
+ TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
+
+ /*!
+ * Pointer to the frame buffer holding the source frame to be used during the
+ * current stage of encoding. It can be the raw input, temporally filtered
+ * input or scaled input.
+ */
+ YV12_BUFFER_CONFIG *source;
+
+ /*!
+ * Pointer to the frame buffer holding the last raw source frame.
+ * last_source is NULL for the following cases:
+ * 1) First frame
+ * 2) Alt-ref frames
+ * 3) All frames for all-intra frame encoding.
+ */
+ YV12_BUFFER_CONFIG *last_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled source frame.
+ * It can be either the raw input or temporally filtered input.
+ */
+ YV12_BUFFER_CONFIG *unscaled_source;
+
+ /*!
+ * Frame buffer holding the resized source frame (cropping / superres).
+ */
+ YV12_BUFFER_CONFIG scaled_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled last source frame.
+ */
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+
+ /*!
+ * Frame buffer holding the resized last source frame.
+ */
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ /*!
+ * Pointer to the original source frame. This is used to determine if the
+ * content is screen.
+ */
+ YV12_BUFFER_CONFIG *unfiltered_source;
+
+ /*!
+ * Frame buffer holding the orig source frame for PSNR calculation in rtc tf
+ * case.
+ */
+ YV12_BUFFER_CONFIG orig_source;
+
+ /*!
+ * Skip tpl setup when tpl data from gop length decision can be reused.
+ */
+ int skip_tpl_setup_stats;
+
+ /*!
+ * Scaling factors used in the RD multiplier modulation.
+ * TODO(sdeng): consider merge the following arrays.
+ * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+ * intermediate scaling factors which are used in the calculation of
+ * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+ * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_rdmult_scaling_factors;
+
+ /*!
+ * Temporal filter context.
+ */
+ TemporalFilterCtx tf_ctx;
+
+ /*!
+ * Pointer to CDEF search context.
+ */
+ CdefSearchCtx *cdef_search_ctx;
+
+ /*!
+ * Variables related to forcing integer mv decisions for the current frame.
+ */
+ ForceIntegerMVInfo force_intpel_info;
+
+ /*!
+ * Pointer to the buffer holding the scaled reference frames.
+ * scaled_ref_buf[i] holds the scaled reference frame of type i.
+ */
+ RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Pointer to the buffer holding the last show frame.
+ */
+ RefCntBuffer *last_show_frame_buf;
+
+ /*!
+ * Refresh frame flags for golden, bwd-ref and alt-ref frames.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Flag to reduce the number of reference frame buffers used in rt.
+ */
+ int rt_reduce_num_ref_buffers;
+
+ /*!
+ * Flags signalled by the external interface at frame level.
+ */
+ ExternalFlags ext_flags;
+
+ /*!
+ * Temporary frame buffer used to store the non-loop filtered reconstructed
+ * frame during the search of loop filter level.
+ */
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ /*!
+ * Temporary frame buffer used to store the loop restored frame during loop
+ * restoration search.
+ */
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ /*!
+ * Ambient reconstruction err target for force key frames.
+ */
+ int64_t ambient_err;
+
+ /*!
+ * Parameters related to rate distortion optimization.
+ */
+ RD_OPT rd;
+
+ /*!
+ * Temporary coding context used to save and restore when encoding with and
+ * without super-resolution.
+ */
+ CODING_CONTEXT coding_context;
+
+ /*!
+ * Parameters related to global motion search.
+ */
+ GlobalMotionInfo gm_info;
+
+ /*!
+ * Parameters related to winner mode processing.
+ */
+ WinnerModeParams winner_mode_params;
+
+ /*!
+ * Frame time stamps.
+ */
+ TimeStamps time_stamps;
+
+ /*!
+ * Rate control related parameters.
+ */
+ RATE_CONTROL rc;
+
+ /*!
+ * Frame rate of the video.
+ */
+ double framerate;
+
+ /*!
+ * Bitmask indicating which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * speed is passed as a per-frame parameter into the encoder.
+ */
+ int speed;
+
+ /*!
+ * sf contains fine-grained config set internally based on speed.
+ */
+ SPEED_FEATURES sf;
+
+ /*!
+ * Parameters for motion vector search process.
+ */
+ MotionVectorSearchParams mv_search_params;
+
+ /*!
+ * When set, indicates that all reference frames are forward references,
+ * i.e., all the reference frames are output before the current frame.
+ */
+ int all_one_sided_refs;
+
+ /*!
+ * Segmentation related information for current frame.
+ */
+ EncSegmentationInfo enc_seg;
+
+ /*!
+ * Parameters related to cyclic refresh aq-mode.
+ */
+ CYCLIC_REFRESH *cyclic_refresh;
+ /*!
+ * Parameters related to active map. Active maps indicate
+ * if there is any activity on a 4x4 block basis.
+ */
+ ActiveMap active_map;
+
+ /*!
+ * The frame processing order within a GOP.
+ */
+ unsigned char gf_frame_index;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t time_compress_data;
+
+ unsigned int mode_chosen_counts[MAX_MODES];
+ int bytes;
+ unsigned int frame_recode_hits;
+ /*!\endcond */
+#endif
+
+#if CONFIG_SPEED_STATS
+ /*!
+ * For debugging: number of transform searches we have performed.
+ */
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ /*!
+ * When set, indicates that the frame is droppable, i.e., this frame
+ * does not update any reference buffers.
+ */
+ int droppable;
+
+ /*!
+ * Stores the frame parameters during encoder initialization.
+ */
+ FRAME_INFO frame_info;
+
+ /*!
+ * Stores different types of frame indices.
+ */
+ FRAME_INDEX_SET frame_index_set;
+
+ /*!
+ * Store the cm->width in the last call of alloc_compressor_data(). Help
+ * determine whether compressor data should be reallocated when cm->width
+ * changes.
+ */
+ int data_alloc_width;
+
+ /*!
+ * Store the cm->height in the last call of alloc_compressor_data(). Help
+ * determine whether compressor data should be reallocated when cm->height
+ * changes.
+ */
+ int data_alloc_height;
+
+ /*!
+ * Number of MBs in the full-size frame; to be used to
+ * normalize the firstpass stats. This will differ from the
+ * number of MBs in the current frame when the frame is
+ * scaled.
+ */
+ int initial_mbs;
+
+ /*!
+ * Flag to indicate whether the frame size information has been
+ * setup and propagated to associated allocations.
+ */
+ bool frame_size_related_setup_done;
+
+ /*!
+ * The width of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_width;
+
+ /*!
+ * The height of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_height;
+
+ /*!
+ * Resize related parameters.
+ */
+ ResizePendingParams resize_pending_params;
+
+ /*!
+ * Pointer to struct holding adaptive data/contexts/models for the tile during
+ * encoding.
+ */
+ TileDataEnc *tile_data;
+ /*!
+ * Number of tiles for which memory has been allocated for tile_data.
+ */
+ int allocated_tiles;
+
+ /*!
+ * Structure to store the palette token related information.
+ */
+ TokenInfo token_info;
+
+ /*!
+ * VARIANCE_AQ segment map refresh.
+ */
+ int vaq_refresh;
+
+ /*!
+ * Thresholds for variance based partitioning.
+ */
+ VarBasedPartitionInfo vbp_info;
+
+ /*!
+ * Number of recodes in the frame.
+ */
+ int num_frame_recode;
+
+ /*!
+ * Current frame probability of parallel frames, across recodes.
+ */
+ FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for transform type frame_probability calculation
+ */
+ int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for obmc frame_probability calculation
+ */
+ int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for warped motion frame_probability calculation
+ */
+ int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for interpolation filter frame_probability calculation
+ */
+ int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable for simulation.
+ * Previous frame's framerate.
+ */
+ double temp_framerate;
+#endif
+ /*!
+ * Updated framerate for the current parallel frame.
+ * cpi->framerate is updated with new_framerate during
+ * post encode updates for parallel frames.
+ */
+ double new_framerate;
+
+ /*!
+ * Retain condition for fast_extra_bits calculation.
+ */
+ int do_update_vbr_bits_off_target_fast;
+
+ /*!
+ * Multi-threading parameters.
+ */
+ MultiThreadInfo mt_info;
+
+ /*!
+ * Specifies the frame to be output. It is valid only if show_existing_frame
+ * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+ * INVALID_IDX.
+ */
+ int existing_fb_idx_to_show;
+
+ /*!
+ * A flag to indicate if intrabc is ever used in current frame.
+ */
+ int intrabc_used;
+
+ /*!
+ * Mark which ref frames can be skipped for encoding current frame during RDO.
+ */
+ int prune_ref_frame_mask;
+
+ /*!
+ * Loop Restoration context.
+ */
+ AV1LrStruct lr_ctxt;
+
+ /*!
+ * Loop Restoration context used during pick stage.
+ */
+ AV1LrPickStruct pick_lr_ctxt;
+
+ /*!
+ * Pointer to list of tables with film grain parameters.
+ */
+ aom_film_grain_table_t *film_grain_table;
+
+#if CONFIG_DENOISE
+ /*!
+ * Pointer to structure holding the denoised image buffers and the helper
+ * noise models.
+ */
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+ /*!
+ * Flags related to interpolation filter search.
+ */
+ InterpSearchFlags interp_search_flags;
+
+ /*!
+ * Turn on screen content tools flag.
+ * Note that some videos are not screen content videos, but
+ * screen content tools could also improve coding efficiency.
+ * For example, videos with large flat regions, gaming videos that look
+ * like natural videos.
+ */
+ int use_screen_content_tools;
+
+ /*!
+ * A flag to indicate "real" screen content videos.
+ * For example, screen shares, screen editing.
+ * When this flag is true, |use_screen_content_tools| must also be true.
+ * In addition, rate control strategy is adjusted when this flag is true.
+ */
+ int is_screen_content_type;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ /*!
+ * Accumulates the partition timing stat over the whole frame.
+ */
+ FramePartitionTimingStats partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] is initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct aom_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] are initialized to zero at beginning of each frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+
+ /*!
+ * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+ */
+ int frame_header_count;
+
+ /*!
+ * Whether any non-zero delta_q was actually used.
+ */
+ int deltaq_used;
+
+ /*!
+ * Reference frame distance related variables.
+ */
+ RefFrameDistanceInfo ref_frame_dist_info;
+
+ /*!
+ * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+ * RD multiplier modulation when SSIM tuning is enabled.
+ */
+ double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+ /*!
+ * Parameters for VMAF tuning.
+ */
+ TuneVMAFInfo vmaf_info;
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ /*!
+ * Parameters for Butteraugli tuning.
+ */
+ TuneButteraugliInfo butteraugli_info;
+#endif
+
+ /*!
+ * Parameters for scalable video coding.
+ */
+ SVC svc;
+
+ /*!
+ * Indicates whether current processing stage is encode stage or LAP stage.
+ */
+ COMPRESSOR_STAGE compressor_stage;
+
+ /*!
+ * Frame type of the last frame. May be used in some heuristics for speeding
+ * up the encoding.
+ */
+ FRAME_TYPE last_frame_type;
+
+ /*!
+ * Number of tile-groups.
+ */
+ int num_tg;
+
+ /*!
+ * Super-resolution mode currently being used by the encoder.
+ * This may / may not be same as user-supplied mode in oxcf->superres_mode
+ * (when we are recoding to try multiple options for example).
+ */
+ aom_superres_mode superres_mode;
+
+ /*!
+ * First pass related data.
+ */
+ FirstPassData firstpass_data;
+
+ /*!
+ * Temporal Noise Estimate
+ */
+ NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ /*!
+ * Temporal Denoiser
+ */
+ AV1_DENOISER denoiser;
+#endif
+
+ /*!
+ * Count on how many consecutive times a block uses small/zeromv for encoding
+ * in a scale of 8x8 block.
+ */
+ uint8_t *consec_zero_mv;
+
+ /*!
+ * Allocated memory size for |consec_zero_mv|.
+ */
+ int consec_zero_mv_alloc_size;
+
+ /*!
+ * Block size of first pass encoding
+ */
+ BLOCK_SIZE fp_block_size;
+
+ /*!
+ * The counter of encoded super block, used to differentiate block names.
+ * This number starts from 0 and increases whenever a super block is encoded.
+ */
+ int sb_counter;
+
+ /*!
+ * Available bitstream buffer size in bytes
+ */
+ size_t available_bs_size;
+
+ /*!
+ * The controller of the external partition model.
+ * It is used to do partition type selection based on external models.
+ */
+ ExtPartController ext_part_controller;
+
+ /*!
+ * Motion vector stats of the current encoded frame, used to update the
+ * ppi->mv_stats during postencode.
+ */
+ MV_STATS mv_stats;
+ /*!
+ * Stores the reference refresh index for the current frame.
+ */
+ int ref_refresh_index;
+
+ /*!
+ * A flag to indicate if the reference refresh index is available for the
+ * current frame.
+ */
+ bool refresh_idx_available;
+
+ /*!
+ * Reference frame index corresponding to the frame to be excluded from being
+ * used as a reference by frame_parallel_level 2 frame in a parallel
+ * encode set of lower layer frames.
+ */
+ int ref_idx_to_skip;
+#if CONFIG_FPMT_TEST
+ /*!
+ * Stores the wanted frame buffer index for choosing primary ref frame by a
+ * frame_parallel_level 2 frame in a parallel encode set of lower layer
+ * frames.
+ */
+
+ int wanted_fb;
+#endif // CONFIG_FPMT_TEST
+
+ /*!
+ * A flag to indicate frames that will update their data to the primary
+ * context at the end of the encode. It is set for non-parallel frames and the
+ * last frame in encode order in a given parallel encode set.
+ */
+ bool do_frame_data_update;
+
+#if CONFIG_RD_COMMAND
+ /*!
+ * A structure for assigning external q_index / rdmult for experiments
+ */
+ RD_COMMAND rd_command;
+#endif // CONFIG_RD_COMMAND
+
+ /*!
+ * Buffer to store MB variance after Wiener filter.
+ */
+ WeberStats *mb_weber_stats;
+
+ /*!
+ * Buffer to store rate cost estimates for each macro block (8x8) in the
+ * preprocessing stage used in allintra mode.
+ */
+ int *prep_rate_estimates;
+
+ /*!
+ * Buffer to store rate cost estimates for each 16x16 block read
+ * from an external file, used in allintra mode.
+ */
+ double *ext_rate_distribution;
+
+ /*!
+ * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate.
+ */
+ double ext_rate_scale;
+
+ /*!
+ * Block size used for the Wiener variance statistics (presumably the block granularity of mb_weber_stats; TODO confirm).
+ */
+ BLOCK_SIZE weber_bsize;
+
+ /*!
+ * Frame level Wiener filter normalization.
+ */
+ int64_t norm_wiener_variance;
+
+ /*!
+ * Buffer to store delta-q values for delta-q mode 4.
+ */
+ int *mb_delta_q;
+
+ /*!
+ * Flag to indicate that current frame is dropped.
+ */
+ bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+ /*!
+ * Structure stores information needed for bitrate accuracy experiment.
+ */
+ VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+#if CONFIG_RATECTRL_LOG
+ /*!
+ * Structure stores information of rate control decisions.
+ */
+ RATECTRL_LOG rc_log;
+#endif // CONFIG_RATECTRL_LOG
+
+ /*!
+ * Frame level twopass status and control data
+ */
+ TWO_PASS_FRAME twopass_frame;
+
+ /*!
+ * Context needed for third pass encoding.
+ */
+ THIRD_PASS_DEC_CTX *third_pass_ctx;
+
+ /*!
+ * File pointer to second pass log
+ */
+ FILE *second_pass_log_stream;
+
+ /*!
+ * Buffer to store 64x64 SAD
+ */
+ uint64_t *src_sad_blk_64x64;
+
+ /*!
+ * SSE between the current frame and the reconstructed last frame
+ * It is only used for CBR mode.
+ * It is not used if the reference frame has a different frame size.
+ */
+ uint64_t rec_sse;
+
+ /*!
+ * A flag to indicate whether the encoder is controlled by DuckyEncode or not.
+ * 1:yes 0:no
+ */
+ int use_ducky_encode;
+
+#if !CONFIG_REALTIME_ONLY
+ /*! A structure that facilitates the communication between DuckyEncode and AV1
+ * encoder.
+ */
+ DuckyEncodeInfo ducky_encode_info;
+#endif // !CONFIG_REALTIME_ONLY
+
+ /*!
+ * Frames since last frame with cdf update.
+ */
+ int frames_since_last_update;
+
+ /*!
+ * Block level thresholds to force zeromv-skip at partition level.
+ */
+ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
+
+ /*!
+ * Number of downsampling pyramid levels to allocate for each frame
+ * This is currently only used for global motion
+ */
+ int image_pyramid_levels;
+
+#if CONFIG_SALIENCY_MAP
+ /*!
+ * Pixel level saliency map for each frame.
+ */
+ uint8_t *saliency_map;
+
+ /*!
+ * Superblock level rdmult scaling factor driven by saliency map.
+ */
+ double *sm_scaling_factor;
+#endif
+
+ /*!
+ * Number of pixels that choose palette mode for luma in the
+ * fast encoding pass in av1_determine_sc_tools_with_encoding().
+ */
+ int palette_pixel_num;
+
+ /*!
+ * Flag to indicate scaled_last_source is available,
+ * so scaling is not needed for last_source.
+ */
+ int scaled_last_source_available;
+} AV1_COMP;
+
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+ /*!\cond */
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source;
+ int64_t ts_duration;
+ /*!\endcond */
+} EncodeFrameInput;
+
+/*!
+ * \brief contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+ /*!
+ * Is error resilient mode enabled
+ */
+ int error_resilient_mode;
+ /*!
+ * Frame type (eg KF vs inter frame etc)
+ */
+ FRAME_TYPE frame_type;
+
+ /*!\cond */
+ int primary_ref_frame;
+ int order_offset;
+
+ /*!\endcond */
+ /*!
+ * Should the current frame be displayed after being decoded
+ */
+ int show_frame;
+
+ /*!\cond */
+ int refresh_frame_flags;
+
+ int show_existing_frame;
+ int existing_fb_idx_to_show;
+
+ /*!\endcond */
+ /*!
+ * Bitmask of which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Reference buffer assignment for this frame.
+ */
+ int remapped_ref_idx[REF_FRAMES];
+
+ /*!
+ * Flags which determine which reference buffers are refreshed by this
+ * frame.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Speed level to use for this frame: Bigger number means faster.
+ */
+ int speed;
+} EncodeFrameParams;
+
+/*!\cond */
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+ size_t size; // Size of resulting bitstream
+} EncodeFrameResults;
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
+
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ BufferPool *const pool,
+ COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf);
+
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool sb_size_changed);
+
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y);
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data);
+
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map);
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_flags Flags to decide how to encode the frame
+ * \param[in,out] sd Contain raw frame data
+ * \param[in] time_stamp Time stamp of the frame
+ * \param[in] end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in,out] cpi_data Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * No frame encoded; more input is required.
+ * \retval "A nonzero (positive) aom_codec_err_t code"
+ * The encoding failed with the error. Sets the error code and error message
+ * in \c cpi->common.error.
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
+
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results);
+
+/*!\cond */
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+void av1_set_mv_search_params(AV1_COMP *cpi);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+ ResizePendingParams *resize_pending_params,
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+// cpi->common.features.allow_screen_content_tools
+// cpi->common.features.allow_intrabc
+// cpi->use_screen_content_tools
+// cpi->is_screen_content_type
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
+ FeatureFlags *features);
+
+void av1_update_frame_size(AV1_COMP *cpi);
+
+typedef struct {
+ int pyr_level; // buf->pyramid_level of the mapped buffer; -1 for a free slot
+ int disp_order; // buf->display_order_hint of the buffer; -1 for a free slot
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+  const size_t table_bytes = sizeof(*ref_frame_map_pairs) * REF_FRAMES;
+  const int is_key_frame =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
+  // A key frame update reports every slot as free (all fields -1).
+  // Otherwise start from zero and fill the slots in below.
+  memset(ref_frame_map_pairs, is_key_frame ? -1 : 0, table_bytes);
+  if (is_key_frame) return;
+
+  for (int slot = 0; slot < REF_FRAMES; slot++) {
+    RefFrameMapPair *const pair = &ref_frame_map_pairs[slot];
+    const RefCntBuffer *const buf = cpi->common.ref_frame_map[slot];
+    // Already marked free as a duplicate of an earlier slot.
+    if (pair->disp_order == -1) continue;
+    if (buf == NULL) {
+      pair->disp_order = -1;
+      pair->pyr_level = -1;
+      continue;
+    }
+    // Once a keyframe is coded all slots point at the same buffer. Keep only
+    // the first occurrence and flag every later slot sharing it as free.
+    if (buf->ref_count > 1) {
+      for (int later = slot + 1; later < REF_FRAMES; ++later) {
+        if (cpi->common.ref_frame_map[later] != buf) continue;
+        ref_frame_map_pairs[later].disp_order = -1;
+        ref_frame_map_pairs[later].pyr_level = -1;
+      }
+    }
+    pair->disp_order = (int)buf->display_order_hint;
+    pair->pyr_level = buf->pyramid_level;
+  }
+}
+
+#if CONFIG_FPMT_TEST
+static AOM_INLINE void calc_frame_data_update_flag(
+    GF_GROUP *const gf_group, int gf_frame_index,
+    bool *const do_frame_data_update) {
+  const int cur_level = gf_group->frame_parallel_level[gf_frame_index];
+  // Level-1 frames never perform the update. Frames at any level other than
+  // 2 are settled here as well; they keep the update enabled.
+  *do_frame_data_update = (cur_level != 1);
+  if (cur_level != 2) return;
+
+  // A level-2 frame performs the update only if it is the last level-2 frame
+  // of its parallel encode set. The scan stops at the next level-1 frame or
+  // at the next level-0 ARF / internal ARF.
+  for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
+    const int level = gf_group->frame_parallel_level[i];
+    const int is_arf = gf_group->update_type[i] == ARF_UPDATE ||
+                       gf_group->update_type[i] == INTNL_ARF_UPDATE;
+    if ((level == 0 && is_arf) || level == 1) return;
+    if (level == 2) {
+      *do_frame_data_update = false;
+      return;
+    }
+  }
+}
+#endif
+
+// av1 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t timebase_units_to_ticks(
+    const aom_rational64_t *timestamp_ratio, int64_t n) {
+  return (n * timestamp_ratio->num) / timestamp_ratio->den;  // mul, then div
+}
+
+static INLINE int64_t ticks_to_timebase_units(
+    const aom_rational64_t *timestamp_ratio, int64_t n) {
+  const int64_t half = timestamp_ratio->num / 2;
+  const int64_t bias = (half > 0) ? half - 1 : half;  // rounding offset
+  return (n * timestamp_ratio->den + bias) / timestamp_ratio->num;
+}
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  // True for intra-only frames and for ARF/GF updates of the current GF group.
+  if (frame_is_intra_only(&cpi->common)) return 1;
+
+  const FRAME_UPDATE_TYPE type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  return type == ARF_UPDATE || type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+  if (!cpi->common.features.allow_screen_content_tools) return 0;
+  if (!cpi->common.features.allow_intrabc) return 0;
+  return frame_is_intra_only(&cpi->common) != 0;
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+    const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+  const RefCntBuffer *const ref = get_ref_frame_buf(cm, ref_frame);
+  return (ref == NULL) ? NULL : &ref->buf;  // NULL when the lookup finds none
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+ assert(buf != NULL);
+ ensure_mv_buffer(buf, cm); // defined elsewhere; presumably (re)allocates MVs
+ buf->width = cm->width; // record the current frame dimensions on the buffer
+ buf->height = cm->height;
+}
+
+// Number of tokens allocated for one tile. The tile extent (end - start, per
+// direction) goes through ROUND_POWER_OF_TWO(., 2) and is then handed to
+// get_token_alloc(), i.e. the same calculation as the frame token allocation.
+static INLINE unsigned int allocated_tokens(const TileInfo *tile,
+                                            int sb_size_log2, int num_planes) {
+  const int mi_rows = tile->mi_row_end - tile->mi_row_start;
+  const int mi_cols = tile->mi_col_end - tile->mi_col_start;
+  return get_token_alloc(ROUND_POWER_OF_TWO(mi_rows, 2),
+                         ROUND_POWER_OF_TWO(mi_cols, 2), sb_size_log2,
+                         num_planes);
+}
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TokenExtra **tok, int sb_size_log2,
+                                 int num_planes) {
+  const int tile_idx = tile_row * cpi->common.tiles.cols + tile_col;
+  const TileInfo *const tile = &cpi->tile_data[tile_idx].tile_info;
+
+  // Tile width and the row offset of 'mi_row' inside the tile, each rounded
+  // with (x + 2) >> 2.
+  const int mb_cols = (tile->mi_col_end - tile->mi_col_start + 2) >> 2;
+  const int mb_row = (mi_row - tile->mi_row_start + 2) >> 2;
+
+  // The start pointer lies that many allocated tokens into the tile buffer.
+  TokenExtra *const tile_base = cpi->token_info.tile_tok[tile_row][tile_col];
+  *tok = tile_base + get_token_alloc(mb_row, mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+  return enable_auto_arf && lag_in_frames >= ALT_MIN_LAG;  // both are required
+}
+
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+  if (gf_cfg->gf_min_pyr_height != 0) return 0;
+  return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf);
+}
+
+// Blocks of size mb_length needed to span frame_length (division rounds up).
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + (mb_length - 1)) / mb_length;
+}
+
+// Stats are generated in the first pass, or in the LAP compressor stage.
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+  const int in_lap_stage = cpi->compressor_stage == LAP_STAGE;
+  assert(IMPLIES(in_lap_stage, cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+                                   cpi->ppi->lap_enabled));
+  return in_lap_stage || cpi->oxcf.pass == AOM_RC_FIRST_PASS;
+}
+// True when the configured pass is AOM_RC_SECOND_PASS or any later one.
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+  return !(cpi->oxcf.pass < AOM_RC_SECOND_PASS);
+}
+
+// Stats are consumed in 2-pass mode or in a LAP-enabled 1-pass encode stage.
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+  if (is_stat_consumption_stage_twopass(cpi)) return 1;
+  return cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+         cpi->compressor_stage == ENCODE_STAGE && cpi->ppi->lap_enabled;
+}
+
+// 'dv_costs' are needed only with intrabc, outside nonrd and stats stages.
+static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) {
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode) return false;
+  return av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi);
+}
+
+/*!\endcond */
+/*!\brief Check if the current stage has no statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ *
+ * \return 1 if the current stage has no stats (one pass, LAP disabled), else 0
+ */
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+ assert(
+ IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+ return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
+}
+
+/*!\cond */
+
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+  if (!has_no_stats_stage(cpi)) return 0;
+  return cpi->oxcf.mode == REALTIME && cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
+
+// Default/internal reference structure: single-layer RTC, no ref config set.
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) {
+  const AV1_PRIMARY *const p = cpi->ppi;
+  if (!is_one_pass_rt_params(cpi) || p->rtc_ref.set_ref_frame_config) return 0;
+  return p->number_spatial_layers == 1 && p->number_temporal_layers == 1;
+}
+
+// Size of the frame stats buffer: num_lap_buffer + 1 when lookahead buffers
+// are in use (num_lap_buffer > 0), otherwise num_lag_buffer.
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  return (num_lap_buffer <= 0) ? num_lag_buffer : num_lap_buffer + 1;
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  const int idx0 = (ref0 >= LAST_FRAME) ? ref0 : 1;  // below LAST_FRAME -> 1
+  const int idx1 = (ref1 >= LAST_FRAME) ? ref1 : 1;
+  xd->block_ref_scale_factors[0] = get_ref_scale_factors_const(cm, idx0);
+  xd->block_ref_scale_factors[1] = get_ref_scale_factors_const(cm, idx1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+  return frame_index & 1;  // lowest bit of the frame index
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+                                              const int *cost_list) {
+  // The list is only passed on for non-SUBPEL_TREE search with cost lists on.
+  if (cpi->sf.mv_sf.subpel_search_method == SUBPEL_TREE) return NULL;
+  return cpi->sf.mv_sf.use_fullpel_costlist ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  const int tree_search = cpi->sf.mv_sf.subpel_search_method == SUBPEL_TREE;
+  if (tree_search || !cpi->sf.mv_sf.use_fullpel_costlist) return NULL;
+  return cost_list;
+}
+
+// Compression ratio of current frame.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+ size_t encoded_frame_size);
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+void av1_setup_frame_size(AV1_COMP *cpi);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Returns 1 if the upscaled size differs from the render size, 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+  return !(cm->superres_upscaled_width == cm->render_width &&
+           cm->superres_upscaled_height == cm->render_height);
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+  return av1_superres_scaled(cm) ? 1 : av1_resize_scaled(cm);
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame && (!cm->features.error_resilient_mode ||
+ cm->current_frame.frame_type == KEY_FRAME);
+}
+
+// Map ('mi_row', 'mi_col') to an index into the
+// 'cpi->mbmi_ext_info.frame_base' array. Both coordinates are divided by
+// mi_size_wide[mi_alloc_bsize] before the row-major index is formed.
+static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col,
+                                 const BLOCK_SIZE mi_alloc_bsize,
+                                 const int mbmi_ext_stride) {
+  const int unit = mi_size_wide[mi_alloc_bsize];
+  // Row-major: row index times stride, plus column index.
+  return (mi_row / unit) * mbmi_ext_stride + (mi_col / unit);
+}
+
+// Lightweight counterpart of set_offsets: only the mode info pointers are
+// set up, including the 'mbmi_ext_frame' pointer of 'x'.
+static INLINE void set_mode_info_offsets(
+    const CommonModeInfoParams *const mi_params,
+    const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x,
+    MACROBLOCKD *const xd, int mi_row, int mi_col) {
+  set_mi_offsets(mi_params, xd, mi_row, mi_col);
+  x->mbmi_ext_frame = mbmi_ext_info->frame_base +
+                      get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize,
+                                     mbmi_ext_info->stride);
+}
+
+// Shrink 'bsize' until it fits into the 'rows_left' x 'cols_left' mi block
+// rows/columns remaining in the image, and return the resulting size. '*bh'
+// and '*bw' receive the dimensions of the last size examined; they are left
+// untouched when the function returns early.
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                             int cols_left, int *bh, int *bw) {
+  // Nothing left in at least one direction: cap the size at BLOCK_8X8.
+  if (rows_left <= 0 || cols_left <= 0) return AOMMIN(bsize, BLOCK_8X8);
+
+  int candidate = (int)bsize;
+  // Candidates are tried in steps of 3 enum values, while the value is > 0.
+  while (candidate > 0) {
+    *bh = mi_size_high[candidate];
+    *bw = mi_size_wide[candidate];
+    if (*bh <= rows_left && *bw <= cols_left) break;
+    candidate -= 3;
+  }
+  return (BLOCK_SIZE)candidate;
+}
+
+// Reference frame flags in the order 0, LAST, LAST2, LAST3, GOLD, BWD, ALT2,
+// ALT. Entry 0 is 0, i.e. it carries no flag.
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = {
+  0,
+  AOM_LAST_FLAG,  AOM_LAST2_FLAG, AOM_LAST3_FLAG,
+  AOM_GOLD_FLAG,  AOM_BWD_FLAG,
+  AOM_ALT2_FLAG,  AOM_ALT_FLAG,
+};
+
+// When more than 'max_allowed_refs' reference frames are available, they are
+// disabled one at a time in the order listed here.
+static const MV_REFERENCE_FRAME disable_order[] = {
+ LAST3_FRAME, // disabled first
+ LAST2_FRAME,
+ ALTREF2_FRAME,
+ BWDREF_FRAME, // disabled last
+};
+
+static const MV_REFERENCE_FRAME
+ ref_frame_priority_order[INTER_REFS_PER_FRAME] = { // highest priority first
+ LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, // used by get_ref_frame_flags()
+ };
+
+static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+                                      const int use_one_pass_rt_params,
+                                      const YV12_BUFFER_CONFIG **ref_frames,
+                                      const int ext_ref_frame_flags) {
+  // Begin with the reference types that the external interface allows
+  // (cpi->ext_flags.ref_frame_flags, set by av1_apply_encoding_flags()), then
+  // clear the flag of every reference that duplicates an earlier, enabled
+  // one. Entry i of 'ref_frames' owns the flag bit of
+  // ref_frame_priority_order[i].
+  int enabled = ext_ref_frame_flags;
+
+  for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
+    const int this_bit = 1 << (ref_frame_priority_order[i] - 1);
+
+    // Normally every earlier entry is a candidate for a match. In one-pass
+    // real-time mode GOLDEN_FRAME is only compared against LAST_FRAME, plus
+    // ALTREF_FRAME if ALTREF is being used in nonrd.
+    int num_earlier = i;
+    if (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+      num_earlier = 1 + sf->rt_sf.use_nonrd_altref_frame;
+
+    for (int j = 0; j < num_earlier; ++j) {
+      const int earlier_bit = 1 << (ref_frame_priority_order[j] - 1);
+      // A duplicate only counts if the earlier reference is itself enabled.
+      if (ref_frames[j] != ref_frames[i]) continue;
+      if (!(enabled & earlier_bit)) continue;
+      enabled &= ~this_bit;
+      break;
+    }
+  }
+  return enabled;
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the memory must be freed by the caller. Both the buf member of the
+// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory
+// returned must be freed via call to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi);
+
+#define MAX_GFUBOOST_FACTOR 10.0
+#define MIN_GFUBOOST_FACTOR 4.0
+
+static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group,
+                                        uint8_t index) {
+  const FRAME_UPDATE_TYPE type = gf_group->update_type[index];
+  // Only KF, GF and ARF updates qualify.
+  return type == KF_UPDATE || type == GF_UPDATE || type == ARF_UPDATE;
+}
+
+static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group,
+                                                    int selective_ref_frame,
+                                                    int prune_ref_frames,
+                                                    int gf_index) {
+  if (selective_ref_frame <= 0 || prune_ref_frames <= 0) return 0;
+  return !is_frame_tpl_eligible(gf_group, gf_index);
+}
+
+// Get the update type stored in 'gf_group' for index 'gf_frame_index'.
+static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->update_type[gf_frame_index];
+}
+
+static INLINE int av1_pixels_to_mi(int pixels) {
+ return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; // align by 2^3 first
+}
+
+static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
+  // PSNR is computed only when b_calculate_psnr is set, for shown frames,
+  // and outside the stats generation stage.
+  if (!cpi->ppi->b_calculate_psnr || is_stat_generation_stage(cpi)) return 0;
+  return cpi->common.show_frame != 0;
+}
+
+static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) {
+  const ResizePendingParams *const pending = &cpi->resize_pending_params;
+  // No resize is pending unless both pending dimensions are non-zero.
+  if (!pending->width || !pending->height) return 0;
+  return pending->width != cpi->common.width ||
+         pending->height != cpi->common.height;
+}
+
+// Loop filter is used unless coded_lossless or tiles.large_scale is set.
+static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) {
+  return !(cm->features.coded_lossless || cm->tiles.large_scale);
+}
+
+// CDEF needs enable_cdef; it is off if coded_lossless or large_scale is set.
+static INLINE int is_cdef_used(const AV1_COMMON *const cm) {
+  if (!cm->seq_params->enable_cdef) return 0;
+  return !(cm->features.coded_lossless || cm->tiles.large_scale);
+}
+
+// Restoration needs enable_restoration; off if all_lossless or large_scale.
+static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
+  if (!cm->seq_params->enable_restoration) return 0;
+  return !(cm->features.all_lossless || cm->tiles.large_scale);
+}
+
+// Decides which post-processing filters the encoder may skip applying to the
+// reconstructed frame. The result is a combination of SKIP_APPLY_* values;
+// 0 means that nothing is skipped.
+// NOTE: Only the application of a filter is skipped. The computation of the
+// filter parameters that are signaled in the bitstream is still required.
+static INLINE unsigned int derive_skip_apply_postproc_filters(
+    const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres,
+    int use_restoration) {
+  // In svc real-time encoding mode the CDEF strength values are calculated
+  // from pixels that are not loop-filtered (even though CDEF parameter
+  // selection should depend on deblocked pixels for cdef_pick_method <=
+  // CDEF_FAST_SEARCH_LVL5), so this case is handled separately up front.
+  if (cpi->ppi->rtc_ref.non_reference_frame)
+    return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF);
+
+  // Skipping must be requested in the config and is never done when PSNR
+  // is being calculated.
+  const int skipping_requested = cpi->oxcf.algo_cfg.skip_postproc_filtering;
+  if (!skipping_requested || cpi->ppi->b_calculate_psnr) return 0;
+  assert(cpi->oxcf.mode == ALLINTRA);
+
+  // The filters run in the order deblocking->cdef->superres->restoration.
+  // With ALLINTRA encoding the reconstructed frame is not used as a
+  // reference, so the application of a stage can be skipped when
+  // 1. the parameters of the later stages do not depend on its output, or
+  // 2. the later stages are disabled.
+  // The checks below therefore start from the last stage.
+  if (use_restoration) return SKIP_APPLY_RESTORATION;
+  if (use_superres) return SKIP_APPLY_SUPERRES;
+  if (use_cdef) {
+    // With CDEF_PICK_FROM_Q the CDEF parameter selection does not depend on
+    // the deblocked frame, so deblocking can be skipped together with CDEF.
+    const int cdef_from_q = cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q;
+    if (cdef_from_q && use_loopfilter)
+      return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF);
+    return SKIP_APPLY_CDEF;
+  }
+  if (use_loopfilter) return SKIP_APPLY_LOOPFILTER;
+
+  // Reaching this point means every post-processing stage is disabled, so
+  // there is nothing to skip.
+  return 0;
+}
+
+
+static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) {
+  // Loop filter: level 0 for both filter_level entries.
+  cm->lf.filter_level[0] = 0;
+  cm->lf.filter_level[1] = 0;
+
+  // CDEF: one strength entry, with cdef_strengths/cdef_uv_strengths at 0.
+  cm->cdef_info.cdef_bits = 0;
+  cm->cdef_info.nb_cdef_strengths = 1;
+  cm->cdef_info.cdef_strengths[0] = 0;
+  cm->cdef_info.cdef_uv_strengths[0] = 0;
+
+  // Loop restoration: RESTORE_NONE for the first three rst_info entries.
+  for (int i = 0; i < 3; ++i)
+    cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+}
+
+static INLINE int is_inter_tx_size_search_level_one(
+    const TX_SPEED_FEATURES *tx_sf) {
+  if (tx_sf->inter_tx_size_search_init_depth_rect < 1) return 0;
+  return tx_sf->inter_tx_size_search_init_depth_sqr >= 1;
+}
+
+static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) {
+  // Level 0 unless the inter tx size search is at level one; in that case
+  // level 2 when lpf_pick is LPF_PICK_FROM_Q, else level 1.
+  if (!is_inter_tx_size_search_level_one(&sf->tx_sf)) return 0;
+  return (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
+}
+
+// Enable switchable motion mode if warped motion or OBMC (or both) is allowed
+static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion,
+ bool enable_obmc) {
+ return (allow_warped_motion || enable_obmc);
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
+  // Without SVC always 1; with SVC only from first_layer_denoise upward.
+  if (!cpi->ppi->use_svc) return 1;
+  return cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise;
+}
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+static INLINE void av1_print_fr_partition_timing_stats(
+    const FramePartitionTimingStats *part_stats, const char *filename) {
+  // Writes the partition statistics as CSV into 'filename', one row per
+  // block size. Nothing is written if the file cannot be opened. The times
+  // are cast to long long so the format is correct wherever long is 32 bits.
+  FILE *f = fopen(filename, "w");
+  if (!f) return;
+
+  fprintf(f, "bsize,redo,");
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "decision_%d,", part);
+  }
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "attempt_%d,", part);
+  }
+  for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+    fprintf(f, "time_%d,", part);
+  }
+  fprintf(f, "\n");
+  static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+  for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+    fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+    }
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+    }
+    for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+      fprintf(f, "%lld,",
+              (long long)part_stats->partition_times[bsize_idx][part]);
+    }
+    fprintf(f, "\n");
+  }
+  fclose(f);
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS == 2
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+  // Only the six sizes listed in the assert have a partition stats index.
+  assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+         bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
+         bsize == BLOCK_4X4);
+  if (bsize == BLOCK_128X128) return 0;
+  if (bsize == BLOCK_64X64) return 1;
+  if (bsize == BLOCK_32X32) return 2;
+  if (bsize == BLOCK_16X16) return 3;
+  if (bsize == BLOCK_8X8) return 4;
+  if (bsize == BLOCK_4X4) return 5;
+  assert(0 && "Invalid bsize for partition_stats.");
+  return -1;
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_start(&cpi->component_timer[component]); // begin interval
+}
+static INLINE void end_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_mark(&cpi->component_timer[component]); // end of interval
+ cpi->frame_component_time[component] += // add elapsed time to the total
+ aom_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+  // Names of the frame type values 0..3, indexed by value.
+  static const char *const kFrameTypeNames[4] = {
+    "KEY_FRAME", "INTER_FRAME", "INTRA_ONLY_FRAME", "S_FRAME"
+  };
+  if (type >= 0 && type < 4) return kFrameTypeNames[type];
+  // Any other value trips the assert; with asserts off "error" is returned.
+  assert(0);
+  return "error";
+}
+#endif
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encoder_alloc.h b/third_party/aom/av1/encoder/encoder_alloc.h
new file mode 100644
index 0000000000..ce48496d48
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_alloc.h
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/pickcdef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Releases the buffer held in `mbmi_ext_info->frame_base` and records that
+// nothing is allocated any more.
+static AOM_INLINE void dealloc_context_buffers_ext(
+    MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  mbmi_ext_info->alloc_size = 0;
+  aom_free(mbmi_ext_info->frame_base);
+  mbmi_ext_info->frame_base = NULL;
+}
+
+// Ensures `mbmi_ext_info->frame_base` can hold one entry per allocation unit
+// of the current frame. The buffer is reallocated only when the required
+// entry count exceeds `alloc_size`; otherwise the existing buffer is kept.
+static AOM_INLINE void alloc_context_buffers_ext(
+    AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Frame dimensions expressed in allocation units, rounded up.
+  const int unit = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int unit_rows = (mi_params->mi_rows + unit - 1) / unit;
+  const int unit_cols = (mi_params->mi_cols + unit - 1) / unit;
+  const int required = unit_rows * unit_cols;
+
+  if (required > mbmi_ext_info->alloc_size) {
+    dealloc_context_buffers_ext(mbmi_ext_info);
+    CHECK_MEM_ERROR(cm, mbmi_ext_info->frame_base,
+                    aom_malloc(required * sizeof(*mbmi_ext_info->frame_base)));
+    mbmi_ext_info->alloc_size = required;
+  }
+
+  // Refresh the stride even when the existing buffer was reused.
+  mbmi_ext_info->stride = unit_cols;
+}
+
+// Sets up the mode-info parameters for the current frame size and allocates
+// the main thread's encoding buffers held in `cpi->td`. Failures are reported
+// through CHECK_MEM_ERROR / aom_internal_error().
+static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  ThreadData *const td = &cpi->td;
+
+  // Setup mi_params
+  mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+                       cpi->sf.part_sf.default_min_partition_size);
+
+  // The txb buffers are skipped during the stat generation stage.
+  if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
+
+  // Always drop the previous 'mv_costs_alloc'. A fresh one is allocated only
+  // when key_freq_max is non-zero, i.e. not for allintra encoding mode.
+  aom_free(td->mv_costs_alloc);
+  td->mv_costs_alloc = NULL;
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+    CHECK_MEM_ERROR(cm, td->mv_costs_alloc,
+                    (MvCosts *)aom_calloc(1, sizeof(*td->mv_costs_alloc)));
+    td->mb.mv_costs = td->mv_costs_alloc;
+  }
+
+  av1_setup_shared_coeff_buffer(cm->seq_params, &td->shared_coeff_buf,
+                                cm->error);
+
+  if (av1_setup_sms_tree(cpi, td)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate SMS tree");
+  }
+
+  td->firstpass_ctx = av1_alloc_pmc(cpi, BLOCK_16X16, &td->shared_coeff_buf);
+  if (td->firstpass_ctx == NULL) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PICK_MODE_CONTEXT");
+  }
+}
+
+// Allocates the mbmi buffers that store mode information at block level for
+// the current frame size. The extended buffer in `cpi->mbmi_ext_info` is
+// skipped during the stat generation stage.
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (av1_alloc_context_buffers(
+          cm, cm->width, cm->height,
+          cpi->sf.part_sf.default_min_partition_size) != 0) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
+
+  if (is_stat_generation_stage(cpi)) return;
+  alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+}
+
+// Re-creates the maps that are sized from the current mi grid: the encoder
+// segmentation map, the cyclic refresh data and the active map. The two byte
+// maps come from aom_calloc() and therefore start out zeroed.
+static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mi_rows = mi_params->mi_rows;
+  const int mi_cols = mi_params->mi_cols;
+
+  // Encoder segmentation map, all entries set to 0.
+  aom_free(cpi->enc_seg.map);
+  CHECK_MEM_ERROR(cm, cpi->enc_seg.map, aom_calloc(mi_rows * mi_cols, 1));
+
+  // Map used for cyclic background refresh.
+  if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  av1_cyclic_refresh_alloc(mi_rows, mi_cols));
+
+  // Map used to mark inactive areas.
+  aom_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map, aom_calloc(mi_rows * mi_cols, 1));
+}
+
+// Allocates the four buffers of `obmc_buffer`, each 16-byte aligned: `wsrc`
+// and `mask` hold MAX_SB_SQUARE entries, `above_pred` and `left_pred` hold
+// MAX_MB_PLANE * MAX_SB_SQUARE entries. Allocation failures are reported
+// through `error` by AOM_CHECK_MEM_ERROR.
+static AOM_INLINE void alloc_obmc_buffers(
+    OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) {
+  const size_t wsrc_bytes = MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc);
+  const size_t mask_bytes = MAX_SB_SQUARE * sizeof(*obmc_buffer->mask);
+  const size_t above_bytes =
+      MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred);
+  const size_t left_bytes =
+      MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred);
+
+  AOM_CHECK_MEM_ERROR(error, obmc_buffer->wsrc,
+                      (int32_t *)aom_memalign(16, wsrc_bytes));
+  AOM_CHECK_MEM_ERROR(error, obmc_buffer->mask,
+                      (int32_t *)aom_memalign(16, mask_bytes));
+  AOM_CHECK_MEM_ERROR(error, obmc_buffer->above_pred,
+                      (uint8_t *)aom_memalign(16, above_bytes));
+  AOM_CHECK_MEM_ERROR(error, obmc_buffer->left_pred,
+                      (uint8_t *)aom_memalign(16, left_bytes));
+}
+
+// Frees the four buffers of `obmc_buffer` and resets each pointer to NULL.
+static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) {
+  aom_free(obmc_buffer->wsrc);
+  obmc_buffer->wsrc = NULL;
+
+  aom_free(obmc_buffer->mask);
+  obmc_buffer->mask = NULL;
+
+  aom_free(obmc_buffer->above_pred);
+  obmc_buffer->above_pred = NULL;
+
+  aom_free(obmc_buffer->left_pred);
+  obmc_buffer->left_pred = NULL;
+}
+
+// Allocates the five buffers of `bufs`. `pred0` and `pred1` are 16-byte
+// aligned with 2 * MAX_SB_SQUARE entries, `residual1` and `diff10` are
+// 32-byte aligned with MAX_SB_SQUARE entries, and `tmp_best_mask_buf` is a
+// plain aom_malloc() of 2 * MAX_SB_SQUARE entries. Allocation failures are
+// reported through `error` by AOM_CHECK_MEM_ERROR.
+static AOM_INLINE void alloc_compound_type_rd_buffers(
+    struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) {
+  const size_t pred0_bytes = 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0);
+  const size_t pred1_bytes = 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1);
+  const size_t residual_bytes = MAX_SB_SQUARE * sizeof(*bufs->residual1);
+  const size_t diff_bytes = MAX_SB_SQUARE * sizeof(*bufs->diff10);
+  const size_t mask_bytes =
+      2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf);
+
+  AOM_CHECK_MEM_ERROR(error, bufs->pred0,
+                      (uint8_t *)aom_memalign(16, pred0_bytes));
+  AOM_CHECK_MEM_ERROR(error, bufs->pred1,
+                      (uint8_t *)aom_memalign(16, pred1_bytes));
+  AOM_CHECK_MEM_ERROR(error, bufs->residual1,
+                      (int16_t *)aom_memalign(32, residual_bytes));
+  AOM_CHECK_MEM_ERROR(error, bufs->diff10,
+                      (int16_t *)aom_memalign(32, diff_bytes));
+  AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf,
+                      (uint8_t *)aom_malloc(mask_bytes));
+}
+
+// Frees the five buffers set up by alloc_compound_type_rd_buffers() and then
+// zeroes `*bufs`.
+static AOM_INLINE void release_compound_type_rd_buffers(
+    CompoundTypeRdBuffers *const bufs) {
+  aom_free(bufs->tmp_best_mask_buf);
+  aom_free(bufs->diff10);
+  aom_free(bufs->residual1);
+  aom_free(bufs->pred1);
+  aom_free(bufs->pred0);
+  // Clear the whole struct so that every pointer in it reads as NULL.
+  av1_zero(*bufs);
+}
+
+// Frees the buffers of the compressor instance `cpi` that are handled here,
+// including the main thread's data in `cpi->td`. Every pointer released with
+// aom_free() is reset to NULL afterwards, so no stale address is left behind
+// in `cpi` and a repeated aom_free() on it is harmless.
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  TokenInfo *token_info = &cpi->token_info;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int num_planes = av1_num_planes(cm);
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+  cpi->allocated_tiles = 0;
+  enc_row_mt->allocated_tile_cols = 0;
+  enc_row_mt->allocated_tile_rows = 0;
+
+  // Delete segmentation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+  cpi->vmaf_info.rdmult_scaling_factors = NULL;
+  aom_close_vmaf_model(cpi->vmaf_info.vmaf_model);
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  aom_free(cpi->butteraugli_info.rdmult_scaling_factors);
+  cpi->butteraugli_info.rdmult_scaling_factors = NULL;
+  aom_free_frame_buffer(&cpi->butteraugli_info.source);
+  aom_free_frame_buffer(&cpi->butteraugli_info.resized_source);
+#endif
+
+#if CONFIG_SALIENCY_MAP
+  // Reset the pointers like everywhere else in this function, so that they
+  // do not dangle after the free.
+  aom_free(cpi->saliency_map);
+  cpi->saliency_map = NULL;
+  aom_free(cpi->sm_scaling_factor);
+  cpi->sm_scaling_factor = NULL;
+#endif
+
+  release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+  aom_free(cpi->td.mv_costs_alloc);
+  cpi->td.mv_costs_alloc = NULL;
+  aom_free(cpi->td.dv_costs_alloc);
+  cpi->td.dv_costs_alloc = NULL;
+
+  aom_free(cpi->td.mb.sb_stats_cache);
+  cpi->td.mb.sb_stats_cache = NULL;
+
+  aom_free(cpi->td.mb.sb_fp_stats);
+  cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+  aom_free(cpi->td.mb.rdcost);
+  cpi->td.mb.rdcost = NULL;
+#endif
+
+  av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  cpi->td.pc_root = NULL;
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+
+  av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  aom_free(cpi->td.pixel_gradient_info);
+  cpi->td.pixel_gradient_info = NULL;
+
+  aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+  cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
+
+  aom_free(cpi->td.vt64x64);
+  cpi->td.vt64x64 = NULL;
+
+  av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
+  cpi->td.firstpass_ctx = NULL;
+
+  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+  // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+  // in av1_temporal_filter() for single-threaded encode are freed in case an
+  // error is encountered during temporal filtering (due to early termination
+  // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+  tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+  // This call ensures that tpl_tmp_buffers for single-threaded encode are freed
+  // in case of an error during tpl.
+  tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+  // This call ensures that the global motion (gm) data buffers for
+  // single-threaded encode are freed in case of an error during gm.
+  gm_dealloc_data(&cpi->td.gm_data);
+
+  // This call ensures that CDEF search context buffers are deallocated in case
+  // of an error during cdef search.
+  av1_cdef_dealloc_data(cpi->cdef_search_ctx);
+  aom_free(cpi->cdef_search_ctx);
+  cpi->cdef_search_ctx = NULL;
+
+  av1_dealloc_mb_data(&cpi->td.mb, num_planes);
+
+  av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+
+  av1_free_txb_buf(cpi);
+  av1_free_context_buffers(cm);
+
+  aom_free_frame_buffer(&cpi->last_frame_uf);
+#if !CONFIG_REALTIME_ONLY
+  av1_free_restoration_buffers(cm);
+  av1_free_firstpass_data(&cpi->firstpass_data);
+#endif
+
+  if (!is_stat_generation_stage(cpi)) {
+    av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker,
+                          &cpi->mt_info.cdef_sync);
+  }
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+    cpi->pick_lr_ctxt.rusi[plane] = NULL;
+  }
+  aom_free(cpi->pick_lr_ctxt.dgd_avg);
+  cpi->pick_lr_ctxt.dgd_avg = NULL;
+
+  aom_free_frame_buffer(&cpi->trial_frame_rst);
+  aom_free_frame_buffer(&cpi->scaled_source);
+  aom_free_frame_buffer(&cpi->scaled_last_source);
+  aom_free_frame_buffer(&cpi->orig_source);
+  aom_free_frame_buffer(&cpi->svc.source_last_TL0);
+
+  free_token_info(token_info);
+
+  av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+  av1_free_sms_tree(&cpi->td);
+
+  // The remaining macroblock scratch buffers are reset to NULL as well, so
+  // that they do not dangle after the free.
+  aom_free(cpi->td.mb.palette_buffer);
+  cpi->td.mb.palette_buffer = NULL;
+  release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
+  aom_free(cpi->td.mb.tmp_conv_dst);
+  cpi->td.mb.tmp_conv_dst = NULL;
+  for (int j = 0; j < 2; ++j) {
+    aom_free(cpi->td.mb.tmp_pred_bufs[j]);
+    cpi->td.mb.tmp_pred_bufs[j] = NULL;
+  }
+
+#if CONFIG_DENOISE
+  if (cpi->denoise_and_model) {
+    aom_denoise_and_model_free(cpi->denoise_and_model);
+    cpi->denoise_and_model = NULL;
+  }
+#endif
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+  aom_free(cpi->svc.layer_context);
+  cpi->svc.layer_context = NULL;
+
+  aom_free(cpi->consec_zero_mv);
+  cpi->consec_zero_mv = NULL;
+  cpi->consec_zero_mv_alloc_size = 0;
+
+  aom_free(cpi->src_sad_blk_64x64);
+  cpi->src_sad_blk_64x64 = NULL;
+
+  aom_free(cpi->mb_weber_stats);
+  cpi->mb_weber_stats = NULL;
+
+  if (cpi->oxcf.enable_rate_guide_deltaq) {
+    aom_free(cpi->prep_rate_estimates);
+    cpi->prep_rate_estimates = NULL;
+
+    aom_free(cpi->ext_rate_distribution);
+    cpi->ext_rate_distribution = NULL;
+  }
+
+  aom_free(cpi->mb_delta_q);
+  cpi->mb_delta_q = NULL;
+}
+
+// When gradient caching for HOG is enabled, makes sure the pixel gradient
+// buffer in `cpi->td` exists (allocating it on first use) and points the
+// main thread's macroblock state `cpi->td.mb` at it. Does nothing otherwise.
+static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) {
+  if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+  if (cpi->td.pixel_gradient_info == NULL) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+    CHECK_MEM_ERROR(cm, cpi->td.pixel_gradient_info,
+                    aom_malloc(sizeof(*cpi->td.pixel_gradient_info) *
+                               plane_types * MAX_SB_SQUARE));
+  }
+
+  cpi->td.mb.pixel_gradient_info = cpi->td.pixel_gradient_info;
+}
+
+// When caching of source variance for 4x4 sub-blocks is enabled, makes sure
+// the buffer in `cpi->td` exists (allocating one entry per mi unit of a
+// superblock on first use) and points `cpi->td.mb` at it. Does nothing
+// otherwise.
+static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) {
+  if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+  if (cpi->td.src_var_info_of_4x4_sub_blocks == NULL) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+    const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+    CHECK_MEM_ERROR(
+        cm, cpi->td.src_var_info_of_4x4_sub_blocks,
+        aom_malloc(sizeof(*cpi->td.src_var_info_of_4x4_sub_blocks) *
+                   mi_count_in_sb));
+  }
+
+  cpi->td.mb.src_var_info_of_4x4_sub_blocks =
+      cpi->td.src_var_info_of_4x4_sub_blocks;
+}
+
+// Makes sure `cpi->td.vt64x64` holds one entry per 64x64 block of a
+// superblock: 1 when the superblock size is BLOCK_64X64, 4 otherwise. An
+// existing buffer is kept when its entry count already matches.
+static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  int num_64x64_blocks = 4;
+  if (cm->seq_params->sb_size == BLOCK_64X64) num_64x64_blocks = 1;
+
+  // Drop a buffer that was sized for a different entry count.
+  if (cpi->td.vt64x64 && num_64x64_blocks != cpi->td.num_64x64_blocks) {
+    aom_free(cpi->td.vt64x64);
+    cpi->td.vt64x64 = NULL;
+  }
+
+  if (cpi->td.vt64x64) return;
+
+  CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+                  aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+  cpi->td.num_64x64_blocks = num_64x64_blocks;
+}
+
+// Returns the source frame at the requested dimensions. If the unscaled
+// source already has that cropped luma size it is returned directly.
+// Otherwise `cpi->scaled_source` is (re)allocated, filled from the unscaled
+// source by av1_resize_and_extend_frame_nonnormative(), and returned.
+// Failures are reported through aom_internal_error().
+static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
+    AV1_COMP *cpi, int scaled_width, int scaled_height) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  const int needs_scaling =
+      scaled_width != cpi->unscaled_source->y_crop_width ||
+      scaled_height != cpi->unscaled_source->y_crop_height;
+  if (!needs_scaling) return cpi->unscaled_source;
+
+  if (aom_realloc_frame_buffer(
+          &cpi->scaled_source, scaled_width, scaled_height,
+          cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+          cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->features.byte_alignment, NULL, NULL, NULL,
+          cpi->image_pyramid_levels, 0)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to reallocate scaled source buffer");
+  }
+  assert(cpi->scaled_source.y_crop_width == scaled_width);
+  assert(cpi->scaled_source.y_crop_height == scaled_height);
+
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->unscaled_source, &cpi->scaled_source,
+          (int)cm->seq_params->bit_depth, num_planes)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to reallocate buffers during resize");
+  }
+  return &cpi->scaled_source;
+}
+
+// Deallocates the ThreadData of every worker with index 1 and above in
+// ppi->p_mt_info.tile_thr_data, together with the buffers each one owns.
+// Index 0 is not visited by the loop. Workers whose original_td is NULL are
+// skipped.
+static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const int num_tf_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers);
+ const int num_tpl_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers);
+ const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
+ const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+ for (int t = 1; t < p_mt_info->num_workers; ++t) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
+ // td is reset to original_td first, so everything freed below is reached
+ // through original_td regardless of what td pointed at before.
+ thread_data->td = thread_data->original_td;
+ ThreadData *const td = thread_data->td;
+ if (!td) continue;
+ aom_free(td->tctx);
+ aom_free(td->palette_buffer);
+ aom_free(td->tmp_conv_dst);
+ release_compound_type_rd_buffers(&td->comp_rd_buffer);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(td->tmp_pred_bufs[j]);
+ }
+ aom_free(td->pixel_gradient_info);
+ aom_free(td->src_var_info_of_4x4_sub_blocks);
+ release_obmc_buffers(&td->obmc_buffer);
+ aom_free(td->vt64x64);
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(td->hash_value_buffer[x][y]);
+ td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(td->mv_costs_alloc);
+ td->mv_costs_alloc = NULL;
+ aom_free(td->dv_costs_alloc);
+ td->dv_costs_alloc = NULL;
+ aom_free(td->counts);
+ av1_free_pmc(td->firstpass_ctx, num_planes);
+ td->firstpass_ctx = NULL;
+ av1_free_shared_coeff_buffer(&td->shared_coeff_buf);
+ av1_free_sms_tree(td);
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in prepare_tf_workers() for MT encode are freed in case an error is
+ // encountered during temporal filtering (due to early termination
+ // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be
+ // invoked).
+ if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ // This call ensures that tpl_tmp_buffers for MT encode are freed in case of
+ // an error during tpl.
+ if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ // This call ensures that the buffers in gm_data for MT encode are freed in
+ // case of an error during gm.
+ gm_dealloc_data(&td->gm_data);
+ av1_dealloc_mb_data(&td->mb, num_planes);
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(td->mb.rdcost);
+ td->mb.rdcost = NULL;
+#endif
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION);
+ td->pc_root = NULL;
+ av1_dealloc_mb_wiener_var_pred_buf(td);
+ // The members have been released above; free the ThreadData itself last
+ // and clear both pointers that referred to it.
+ aom_free(td);
+ thread_data->td = NULL;
+ thread_data->original_td = NULL;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_
diff --git a/third_party/aom/av1/encoder/encoder_utils.c b/third_party/aom/av1/encoder/encoder_utils.c
new file mode 100644
index 0000000000..c35873d207
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.c
@@ -0,0 +1,1503 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aomcx.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+ { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+ { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+ { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+ { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+ { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+ { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+ { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+ { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+ { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+ { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+ { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+ { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+ { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+ { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+ { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+ { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+ { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+ { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+ { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+ { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+ { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+ { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+ { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+ { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+ { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+ { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+ { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+ { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+ { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+ { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+ { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+ { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+ { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+ { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+ { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+ { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+ { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+ { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+ { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+ { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+ { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+ { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+ { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+ { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28,
+ 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30,
+ 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 },
+ { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16,
+ 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34,
+ 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+ 64, 64, 64 };
+
+// TODO(yunqing): the default probs can be trained later for better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS] = {
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } }
+ };
+
+// Configures segmentation state for static-region coding according to the
+// role of the current frame:
+//  - key frame: clears the encoder segmentation map, disables segmentation
+//    and clears all segment features;
+//  - alt-ref refresh: performs the same reset, then (only if seg->enabled is
+//    still set) programs Q and loop-filter deltas for segment 1;
+//  - any other frame with segmentation enabled: resets everything on the
+//    first frame after a golden refresh, forces ALTREF_FRAME on segments 0
+//    and 1 when coding over an alt-ref source frame, and otherwise requests
+//    no map/data update.
+static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ double avg_q;
+#if CONFIG_FPMT_TEST
+ // In the frame-parallel simulation test mode, parallel frames read the
+ // temporary copy of the average q.
+ avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+ ? cpi->ppi->p_rc.temp_avg_q
+ : cpi->ppi->p_rc.avg_q;
+#else
+ avg_q = cpi->ppi->p_rc.avg_q;
+#endif
+
+ // Only consulted below to decide whether the skip feature is enabled when
+ // coding over an alt-ref source frame.
+ int high_q = (int)(avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_frame.alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ // NOTE(review): av1_disable_segmentation() was called just above; if it
+ // clears seg->enabled this branch can never run -- confirm.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ // Q delta corresponding to lowering the average q to 87.5% of its value.
+ qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875,
+ cm->seq_params->bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->enc_seg.map, 0,
+ cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+ // Segment coding disabled for compred testing
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+ // No updates.. leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+// Merges the active map into the encoder segmentation map and toggles the
+// skip / loop-filter features of the inactive segment to match. Intra-only
+// frames always switch the active map off and flag a pending update. Nothing
+// is touched unless an update is pending; the pending flag is cleared on exit.
+void av1_apply_active_map(AV1_COMP *cpi) {
+  // Loop-filter features handled for the inactive segment, in call order.
+  static const int kInactiveLfFeatures[] = { SEG_LVL_ALT_LF_Y_H,
+                                             SEG_LVL_ALT_LF_Y_V,
+                                             SEG_LVL_ALT_LF_U,
+                                             SEG_LVL_ALT_LF_V };
+  const int num_lf_features =
+      (int)(sizeof(kInactiveLfFeatures) / sizeof(kInactiveLfFeatures[0]));
+
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->enc_seg.map;
+  const unsigned char *const active_map = cpi->active_map.map;
+
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  // Nothing to do unless an update is pending.
+  if (!cpi->active_map.update) return;
+
+  if (cpi->active_map.enabled) {
+    const int num_mis =
+        cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+    // Only entries currently marked active take the value from the active map.
+    for (int idx = 0; idx < num_mis; ++idx) {
+      if (seg_map[idx] == AM_SEGMENT_ID_ACTIVE) seg_map[idx] = active_map[idx];
+    }
+
+    av1_enable_segmentation(seg);
+    av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+    for (int k = 0; k < num_lf_features; ++k) {
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE,
+                            kInactiveLfFeatures[k]);
+    }
+    for (int k = 0; k < num_lf_features; ++k) {
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, kInactiveLfFeatures[k],
+                      -MAX_LOOP_FILTER);
+    }
+  } else {
+    av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+    for (int k = 0; k < num_lf_features; ++k) {
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE,
+                             kInactiveLfFeatures[k]);
+    }
+    // Segmentation still in use: request that data and map be updated.
+    if (seg->enabled) {
+      seg->update_data = 1;
+      seg->update_map = 1;
+    }
+  }
+
+  cpi->active_map.update = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Derives cpi->rd.r0 from the TPL statistics of the frame at
+// cpi->gf_frame_index and, for TPL-eligible frames, blends a TPL-derived
+// boost into p_rc.gfu_boost. Does nothing when the TPL frame is not valid,
+// and marks it invalid when the accumulated dependency cost is exactly zero.
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size));
+
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ if (tpl_frame->is_valid) {
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ // Sum of the per-block weights; it is the divisor of the exponent below.
+ // Starts at 1, presumably so that divisor is never zero.
+ double cbcmp_base = 1;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int row_step = step;
+ // Columns are walked in super-res upscaled mi units.
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+ for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ // Per-block weight applied to both log-cost accumulators.
+ double cbcmp = (double)(this_stats->srcrf_dist);
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ if (mc_dep_cost_base == 0) {
+ tpl_frame->is_valid = 0;
+ } else {
+ // r0 = exp of the weighted mean difference of the two log costs.
+ cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ if (cpi->ppi->lap_enabled) {
+ double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
+ const int gfu_boost = get_gfu_boost_from_r0_lap(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+ cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
+ // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+ // gfu_boost);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost,
+ cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
+ } else {
+ // TPL may only look at a subset of frame in the gf group when the
+ // speed feature 'reduce_num_frames' is on, which affects the r0
+ // calculation. Thus, to compensate for TPL not using all frames a
+ // factor to adjust r0 is used.
+ const int gfu_boost =
+ (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+ }
+ }
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Sets up the per-frame state that depends on the frame dimensions and picks
+// the frame quantizer.
+//  cpi:          encoder instance.
+//  q:            out, selected q index.
+//  bottom_index: out, lower q bound; collapsed to *q when an override applies.
+//  top_index:    out, upper q bound; collapsed to *q when an override applies.
+// Outside realtime-only builds, the rate-control choice may be overridden in
+// AOM_Q mode by the TPL model or by the fixed QP offsets.
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // Setup variables that depend on the dimensions of the frame.
+ av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ // Fold the TPL statistics into r0 / gfu_boost and the rdmult setup before q
+ // is chosen.
+ if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+ av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ process_tpl_stats_frame(cpi);
+ av1_tpl_rdmult_setup(cpi);
+ }
+#endif
+
+ // Decide q and q bounds.
+ *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+ bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+ // Constant-quality mode with valid TPL data (and not lossless): use the
+ // TPL-derived q, clamped to the configured range, as the only allowed q.
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+ !is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int tpl_q = av1_tpl_get_q_index(
+ &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality,
+ cm->seq_params->bit_depth);
+ *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ }
+
+ // Fixed QP offsets in constant-quality mode. This runs after the TPL
+ // override above, so when both apply this result is the one that sticks.
+ if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ const double qratio_grad =
+ cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3;
+ const double qstep_ratio =
+ 0.2 +
+ (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad;
+ *q = av1_get_q_index_from_qstep_ratio(
+ cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ } else if (gf_group->layer_depth[cpi->gf_frame_index] <
+ gf_group->max_layer_depth) {
+ // Start from the stored arf_q and average it with cq_level (rounding
+ // up) once per layer above depth 1.
+ int this_height = gf_group->layer_depth[cpi->gf_frame_index];
+ int arf_q = cpi->ppi->p_rc.arf_q;
+ while (this_height > 1) {
+ arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2;
+ --this_height;
+ }
+ *top_index = *bottom_index = *q = arf_q;
+ }
+ }
+#endif
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.hl_sf.static_segmentation)
+ configure_static_seg_features(cpi);
+}
+
+// Zeroes every chroma-related field of |pars|: the Cb and Cr scaling points,
+// multipliers and AR coefficients, plus the chroma-scaling-from-luma flag.
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  // Cb plane.
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+
+  // Cr plane.
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+
+  // Shared chroma setting.
+  pars->chroma_scaling_from_luma = 0;
+}
+
+// Updates the sequence-level film_grain_params_present flag from the encoder
+// configuration. The flag is set when a film grain test vector, a film grain
+// table file or film content tuning is configured; otherwise it follows the
+// denoise noise level (CONFIG_DENOISE builds) or is cleared.
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+                                          const AV1EncoderConfig *oxcf) {
+  const TuneCfg *const tune = &oxcf->tune_cfg;
+  const int grain_requested = tune->film_grain_test_vector ||
+                              tune->film_grain_table_filename ||
+                              tune->content == AOM_CONTENT_FILM;
+
+  if (grain_requested) {
+    ppi->seq_params.film_grain_params_present = 1;
+    return;
+  }
+
+#if CONFIG_DENOISE
+  ppi->seq_params.film_grain_params_present = (oxcf->noise_level > 0);
+#else
+  ppi->seq_params.film_grain_params_present = 0;
+#endif
+}
+
+// Refreshes the frame-level film grain state of |cpi| from |oxcf|.
+// Any previously loaded film grain table is released first. Then, in
+// priority order:
+//  - test vector configured: on key frames, copies the selected test vector
+//    into cm->film_grain_params and adjusts it for monochrome, bit depth and
+//    full color range;
+//  - table filename configured: allocates and reads the table (errors are
+//    reported through cm->error);
+//  - film content tuning: updates bit depth, monochrome and range fields of
+//    the existing parameters in place;
+//  - otherwise: zeroes cm->film_grain_params.
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ // Drop the old table; it is only re-created in the table-filename branch.
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (tune_cfg->film_grain_test_vector) {
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // film_grain_test_vector is 1-based: value k selects entry k - 1.
+ // NOTE(review): no upper-bound check on the index is visible here --
+ // presumably validated by the caller; confirm.
+ memcpy(&cm->film_grain_params,
+ film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
+ sizeof(cm->film_grain_params));
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) {
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ }
+ }
+ } else if (tune_cfg->film_grain_table_filename) {
+ CHECK_MEM_ERROR(cm, cpi->film_grain_table,
+ aom_calloc(1, sizeof(*cpi->film_grain_table)));
+
+ aom_film_grain_table_read(cpi->film_grain_table,
+ tune_cfg->film_grain_table_filename, cm->error);
+ } else if (tune_cfg->content == AOM_CONTENT_FILM) {
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE)
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ } else {
+ memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+ }
+}
+
+// Fills cpi->scaled_ref_buf[] for every reference frame LAST_FRAME through
+// ALTREF_FRAME.
+//  - Reference enabled in cpi->ref_frame_flags with a size different from the
+//    current frame: a buffer of the current frame size is (re)allocated as
+//    needed and the reference is scaled into it.
+//  - Reference enabled with the same size: the reference buffer itself is
+//    stored and its ref_count is incremented.
+//  - Reference not enabled: the slot is cleared, except when
+//    has_no_stats_stage(cpi) is true.
+//  filter, phase:        forwarded to av1_resize_and_extend_frame().
+//  use_optimized_scaler: allows the optimized scaler when the size ratios
+//                        support it (and, in high-bitdepth builds, only for
+//                        8-bit content).
+// Failures are reported through aom_internal_error() on cm->error.
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ continue;
+ }
+
+ // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the
+ // motion search can be skipped for the references: last, golden, altref.
+ // If so, we can skip scaling that reference.
+ if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref &&
+ cpi->ppi->rtc_ref.set_ref_frame_config) {
+ if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue;
+ if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue;
+ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref)
+ continue;
+ }
+ // For RTC with superres on: golden reference only needs to be scaled
+ // if it was refreshed in previous frame.
+ if (is_one_pass_rt_params(cpi) &&
+ cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME &&
+ cpi->rc.frame_num_last_gf_refresh <
+ (int)cm->current_frame.frame_number - 1) {
+ continue;
+ }
+
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ // Replace the reference buffer with a copy having a thicker border,
+ // if the reference buffer is higher resolution than the current
+ // frame, and the border is thin.
+ if ((ref->y_crop_width > cm->width ||
+ ref->y_crop_height > cm->height) &&
+ ref->border < AOM_BORDER_IN_PIXELS) {
+ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+ if (aom_yv12_realloc_with_new_border(
+ &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, cpi->image_pyramid_levels,
+ num_planes) != 0) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ }
+ // Set when a new buffer is taken from the pool, which must always be
+ // allocated and filled regardless of its recorded size.
+ int force_scaling = 0;
+ RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+ if (new_fb == NULL) {
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ force_scaling = 1;
+ new_fb = &pool->frame_bufs[new_fb_idx];
+ }
+
+ // An existing scaled buffer that already has the current frame size is
+ // kept as is.
+ if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+ new_fb->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb->buf, cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) {
+ if (force_scaling) {
+ // Release the reference acquired in the get_free_fb() call above.
+ --new_fb->ref_count;
+ }
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ // The optimized scaler must support the luma ratio and, when chroma
+ // planes exist, the chroma ratio as well.
+ bool has_optimized_scaler = av1_has_optimized_scaler(
+ ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width,
+ new_fb->buf.y_crop_height);
+ if (num_planes > 1) {
+ has_optimized_scaler =
+ has_optimized_scaler &&
+ av1_has_optimized_scaler(
+ ref->uv_crop_width, ref->uv_crop_height,
+ new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height);
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ // High-bitdepth builds additionally restrict the optimized scaler to
+ // 8-bit sequences.
+ if (use_optimized_scaler && has_optimized_scaler &&
+ cm->seq_params->bit_depth == AOM_BITS_8) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#else
+ if (use_optimized_scaler && has_optimized_scaler) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#endif
+ cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+ } else {
+ // Same size as the current frame: reference the buffer directly and
+ // take an extra reference on it.
+ // NOTE(review): |ref| and |buf| are both looked up from the same
+ // ref_frame, so the two assignments below may be self-copies -- confirm.
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
+ ++buf->ref_count;
+ }
+ } else {
+ if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ }
+ }
+}
+
+// Chooses the superblock size (BLOCK_64X64 or BLOCK_128X128).
+//  oxcf:                  encoder configuration. An explicit 64x64 or 128x128
+//                         superblock_size setting is honored directly;
+//                         otherwise the setting must be dynamic and the
+//                         heuristics below apply.
+//  width, height:         frame dimensions used by the dynamic heuristics.
+//  number_spatial_layers: when > 1 (or when resize is enabled) the decision
+//                         is based on the configured frame dimensions in
+//                         oxcf rather than on width/height.
+// Returns the selected superblock size; defaults to BLOCK_128X128 when no
+// heuristic picks 64x64.
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+                              int height, int number_spatial_layers) {
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) {
+    return BLOCK_64X64;
+  }
+  if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) {
+    return BLOCK_128X128;
+  }
+#if CONFIG_TFLITE
+  if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64;
+#endif
+  // Force 64x64 superblock size to increase resolution in perceptual
+  // AQ mode.
+  if (oxcf->mode == ALLINTRA &&
+      (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI ||
+       oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) {
+    return BLOCK_64X64;
+  }
+  assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+  if (number_spatial_layers > 1 ||
+      oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
+    // Use the configured size (top resolution) for spatial layers or
+    // on resize.
+    return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720
+               ? BLOCK_128X128
+               : BLOCK_64X64;
+  } else if (oxcf->mode == REALTIME) {
+    if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
+      const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+      const int num_tiles =
+          (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows);
+      // The pixel count is computed in 64 bits: width * height does not fit
+      // in a 32-bit int for very large frames.
+      const int64_t num_pixels = (int64_t)width * height;
+      // For multi-thread encode: if the number of (128x128) superblocks
+      // per tile is low use 64X64 superblock.
+      if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 &&
+          oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 &&
+          num_pixels / (128 * 128 * num_tiles) <= 38)
+        return BLOCK_64X64;
+      else
+        return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
+    } else {
+      return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+    }
+  }
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  // When superres / resize is on, 'cm->width / height' can change between
+  // calls, so we don't apply this heuristic there.
+  // Things break if superblock size changes between the first pass and second
+  // pass encoding, which is why this heuristic is not configured as a
+  // speed-feature.
+  if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
+      oxcf->resize_cfg.resize_mode == RESIZE_NONE) {
+    int is_480p_or_lesser = AOMMIN(width, height) <= 480;
+    if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64;
+
+    // For 1080p and lower resolutions, choose SB size adaptively based on
+    // resolution and speed level for multi-thread encode.
+    int is_1080p_or_lesser = AOMMIN(width, height) <= 1080;
+    if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD &&
+        oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5)
+      return BLOCK_64X64;
+
+    // For allintra encode, since the maximum partition size is set to 32X32 for
+    // speed>=6, superblock size is set to 64X64 instead of 128X128. This
+    // improves the multithread performance due to reduction in top right delay
+    // and thread sync wastage. Currently, this setting is selectively enabled
+    // only for speed>=9 and resolutions less than 4k since cost update
+    // frequency is set to INTERNAL_COST_UPD_OFF in these cases.
+    const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
+    if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger)
+      return BLOCK_64X64;
+  }
+  return BLOCK_128X128;
+}
+
+// Prepares per-frame coding state before encoding: resets or inherits the
+// entropy context, (re)selects the superblock size on shown key frames and
+// S-frames while the sequence parameters are not locked, clears the current
+// frame's interpolation filter statistics, sets cm->prev_frame from the
+// primary reference buffer and clears cpi->vaq_refresh.
+void av1_setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ av1_setup_past_independence(cm);
+ }
+
+ if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+ frame_is_sframe(cm)) {
+ if (!cpi->ppi->seq_params_locked) {
+ set_sb_size(cm->seq_params,
+ av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+ cpi->ppi->number_spatial_layers));
+ }
+ } else {
+ const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+ if (primary_ref_buf == NULL) {
+ // No primary reference to inherit from: start from independent state
+ // and request that the segmentation map and data be updated.
+ av1_setup_past_independence(cm);
+ cm->seg.update_map = 1;
+ cm->seg.update_data = 1;
+ } else {
+ // Inherit the frame context stored with the primary reference.
+ *cm->fc = primary_ref_buf->frame_context;
+ }
+ }
+
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ cpi->vaq_refresh = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Returns the interp_filter_selected[] entry for |ifilter| stored with the
+// buffer of reference |ref|, or 0 when that reference has no buffer.
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+                                      MV_REFERENCE_FRAME ref,
+                                      InterpFilter ifilter) {
+  const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, ref);
+  return (ref_buf != NULL) ? ref_buf->interp_filter_selected[ifilter] : 0;
+}
+
+// Builds the mask of interpolation filters to search for the current frame.
+// Starts from ALLOW_ALL_INTERP_FILT_MASK and removes a filter when its
+// recorded usage is low both on LAST_FRAME and, weighted, across the other
+// references. Returns the unmodified mask when the previous frame was a key
+// frame or the alt-ref frame is being refreshed.
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Per-reference totals of the selection counts over all filters.
+ int ref_total[REF_FRAMES] = { 0 };
+ uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+ if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame)
+ return mask;
+
+ for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+ }
+ }
+ // Combined total over every reference except LAST_FRAME.
+ int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+ ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+ ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ // The filter is a removal candidate only if LAST_FRAME has statistics and
+ // 30x this filter's count on LAST_FRAME does not exceed LAST_FRAME's total.
+ int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+ if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+ // Weighted usage on the remaining references (20 for LAST2/LAST3/GOLDEN,
+ // 10 for BWDREF/ALTREF2/ALTREF).
+ int filter_score =
+ get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+ if (filter_score < ref_total_total) {
+ // Only the dual-filter type using |ifilter| in both positions is
+ // removed from the mask.
+ DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+ reset_interp_filter_allowed_mask(&mask, filt_type);
+ }
+ }
+ }
+ return mask;
+}
+
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+//
+// Called once after each trial encode:
+//  cpi:                  encoder instance; holds the result of the trial
+//                        encode that just ran.
+//  *_orig_decision:      values of the screen content settings before the
+//                        trials, restored when the trial with tools does not
+//                        show a clear gain.
+//  pass:                 index of the trial that just ran (0 or 1).
+//  projected_size_pass:  out, projected frame size recorded per pass.
+//  psnr:                 out, PSNR statistics recorded per pass.
+// The decision itself is only taken after pass 1; pass 0 only records data.
+static void screen_content_tools_determination(
+    AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+    const int allow_intrabc_orig_decision,
+    const int use_screen_content_tools_orig_decision,
+    const int is_screen_content_type_orig_decision, const int pass,
+    int *projected_size_pass, PSNR_STATS *psnr) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+
+#if CONFIG_FPMT_TEST
+  projected_size_pass[pass] =
+      ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+       (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+          ? cpi->ppi->p_rc.temp_projected_frame_size
+          : cpi->rc.projected_frame_size;
+#else
+  projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+  aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+                       bit_depth, in_bit_depth);
+#else
+  aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+  // Both trials are needed for the comparison below.
+  if (pass != 1) return;
+
+  const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  // Calculate % of palette mode to be chosen in a frame from mode decision.
+  // The frame area is formed in double precision: height * width does not
+  // fit in a 32-bit int for very large frames.
+  const double frame_area = (double)cm->height * (double)cm->width;
+  const double palette_ratio = (double)cpi->palette_pixel_num / frame_area;
+  const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH);
+  // The ratio test is skipped for negligible palette usage, which also keeps
+  // the division away from a zero denominator.
+  const int ratio_is_large =
+      ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4));
+  const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large);
+  if (is_sc_encoding_much_better) {
+    // Use screen content tools, if we get coding gain.
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->use_screen_content_tools = 1;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Prepares one of the two quick trial encodes used for the screen content
+// decision. Pass 0 runs with screen content tools and intrabc switched off;
+// pass 1 runs with screen content tools switched on and leaves intrabc as it
+// is. Both passes force a fixed 32x32 partition for speed.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(pass == 0 || pass == 1);
+  const int enable_sc_tools = (pass != 0);
+
+  cm->features.allow_screen_content_tools = enable_sc_tools;
+  cpi->use_screen_content_tools = enable_sc_tools;
+
+  // intrabc is only written by the tools-off trial.
+  // TODO(chengchen): turn intrabc on could lead to data race issue.
+  // cm->allow_intrabc = 1;
+  if (!enable_sc_tools) cm->features.allow_intrabc = 0;
+
+  // Same fixed block size in both trials for fast encoding.
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
+//
+//  cpi:    encoder instance.
+//  q_orig: q index chosen for the real encode; the trial encodes use at
+//          least 244, or q_orig itself when lossless coding is requested.
+// Runs two trial encodes of the key frame (without, then with, screen
+// content tools) and lets screen_content_tools_determination() decide. The
+// partition speed features changed for the trials are restored afterwards.
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ // Variables to help determine if we should allow screen content tools.
+ int projected_size_pass[3] = { 0 };
+ PSNR_STATS psnr[3];
+ const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+ const int allow_screen_content_tools_orig_decision =
+ cm->features.allow_screen_content_tools;
+ const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+ const int use_screen_content_tools_orig_decision =
+ cpi->use_screen_content_tools;
+ const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+ // Turn off the encoding trial for forward key frame and superres.
+ // The trial is also skipped for non-RD pick mode, realtime mode, non-key
+ // frames, and when screen content tools are already in use.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled ||
+ cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME ||
+ use_screen_content_tools_orig_decision || !is_key_frame) {
+ return;
+ }
+
+ // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+ // Find a better way to determine whether screen content tools should be used
+ // for lossless coding.
+ // Use a high q and a fixed partition to do quick encoding.
+ const int q_for_screen_content_quick_run =
+ is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244);
+ // Saved so the partition speed features can be restored after the trials.
+ const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_block_size_orig =
+ cpi->sf.part_sf.fixed_partition_size;
+
+ // Setup necessary params for encoding, including frame source, etc.
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ av1_setup_frame(cpi);
+
+ // Bring the segmentation state up to date and mirror it into the current
+ // frame buffer.
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ // The two encoding passes aim to help determine whether to use screen
+ // content tools, with a high q and fixed partition.
+ for (int pass = 0; pass < 2; ++pass) {
+ set_encoding_params_for_screen_content(cpi, pass);
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+ q_for_screen_content_quick_run,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+ 0);
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+ // Screen content decision
+ screen_content_tools_determination(
+ cpi, allow_screen_content_tools_orig_decision,
+ allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+ is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+ }
+
+ // Set partition speed feature back.
+ cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+
+ // Free token related info if screen content coding tools are not enabled.
+ if (!cm->features.allow_screen_content_tools)
+ free_token_info(&cpi->token_info);
+}
+#endif // CONFIG_REALTIME_ONLY
+
+ // When the frame-level filter is SWITCHABLE but the accumulated symbol counts
+ // show that only a single filter type was ever chosen, replaces
+ // *interp_filter with that filter. In every other case *interp_filter is left
+ // unchanged.
+ static void fix_interp_filter(InterpFilter *const interp_filter,
+                               const FRAME_COUNTS *const counts) {
+   if (*interp_filter != SWITCHABLE) return;
+
+   // Per-filter usage, summed over all switchable-filter contexts.
+   int usage[SWITCHABLE_FILTERS] = { 0 };
+   int distinct_filters = 0;
+   for (int filter = 0; filter < SWITCHABLE_FILTERS; ++filter) {
+     for (int ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx) {
+       usage[filter] += counts->switchable_interp[ctx][filter];
+     }
+     if (usage[filter] > 0) ++distinct_filters;
+   }
+   if (distinct_filters != 1) return;
+
+   // Exactly one filter is in use: signal it at frame level instead.
+   int filter = 0;
+   while (filter < SWITCHABLE_FILTERS && !usage[filter]) ++filter;
+   if (filter < SWITCHABLE_FILTERS) *interp_filter = filter;
+ }
+
+ // Finalizes frame-level state for the frame held in cpi->common:
+ //  - for a show-existing frame (when the sequence does not use the reduced
+ //    still picture header), points cm->cur_frame at the reference buffer
+ //    selected by cpi->existing_fb_idx_to_show;
+ //  - otherwise, when film grain parameters are present and the frame is shown
+ //    or showable, stores them in the current frame buffer and advances the
+ //    random seed;
+ //  - copies the frame context *cm->fc into every tile's tctx;
+ //  - for frames that are not intra-only, lets fix_interp_filter() adjust
+ //    cm->features.interp_filter based on cpi->td.counts.
+ void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+
+ if (!cm->seq_params->reduced_still_picture_hdr &&
+ encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_to_show =
+ cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ // NOTE(review): frame_to_show is dereferenced right after this check, so
+ // the code relies on aom_internal_error() not returning -- confirm.
+ if (frame_to_show == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a reconstructed frame");
+ }
+ assert(frame_to_show->ref_count > 0);
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ }
+
+ if (!encode_show_existing_frame(cm) &&
+ cm->seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+ // Copy the current frame's film grain params to its corresponding
+ // RefCntBuffer slot.
+ cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+ // We must update the parameters if this is not an INTER_FRAME
+ if (current_frame->frame_type != INTER_FRAME)
+ cm->cur_frame->film_grain_params.update_parameters = 1;
+
+ // Iterate the random seed for the next frame. A seed of 0 is replaced by
+ // 7391.
+ cm->film_grain_params.random_seed += 3381;
+ if (cm->film_grain_params.random_seed == 0)
+ cm->film_grain_params.random_seed = 7391;
+ }
+
+ // Initialise all tiles' contexts from the global frame context
+ for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+ for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+ // tile_data is indexed in row-major order.
+ const int tile_idx = tile_row * cm->tiles.cols + tile_col;
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ }
+ }
+
+ if (!frame_is_intra_only(cm))
+ fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
+ }
+
+ // Compares the luma plane of cur_picture with last_picture in
+ // non-overlapping FORCE_INT_MV_DECISION_BLOCK_SIZE x
+ // FORCE_INT_MV_DECISION_BLOCK_SIZE blocks and returns 1 or 0 (presumably
+ // "force integer motion vectors" or not, going by the names -- confirm with
+ // the callers).
+ //   T: number of complete blocks examined (partial blocks at the right and
+ //      bottom edges are skipped).
+ //   C: blocks identical to the collocated block in last_picture.
+ //   S: non-matching blocks accepted by av1_hash_is_horizontal_perfect() or
+ //      av1_hash_is_vertical_perfect().
+ // The ratio (C + S) / T is stored in the circular history held by
+ // force_intpel_info (up to max_history_size entries) before any decision is
+ // taken.
+ int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info) {
+ // Index used by the history-averaging loop near the end of the function.
+ int k;
+
+ const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
+ const double threshold_current = 0.8;
+ const double threshold_average = 0.95;
+ const int max_history_size = 32;
+ int T = 0; // total block
+ int C = 0; // match with collocated block
+ int S = 0; // smooth region but not match with collocated block
+
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+ // check whether collocated block match with current
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ // Only cur_picture's flags are consulted here.
+ // NOTE(review): assumes last_picture has the same bit-depth layout --
+ // confirm.
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ // Both loops stop at the first differing sample.
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ // Both loops stop at the first differing sample.
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ // A block is counted in at most one of C and S.
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+ }
+ }
+
+ // NOTE(review): T is 0 when the picture is smaller than one block in either
+ // dimension. With assertions compiled out the division below would then be
+ // 0.0 / 0.0 -- confirm that callers never pass such pictures.
+ assert(T > 0);
+ double cs_rate = ((double)(C + S)) / ((double)(T));
+
+ // Record the ratio in the circular history before deciding anything.
+ force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
+
+ force_intpel_info->rate_index =
+ (force_intpel_info->rate_index + 1) % max_history_size;
+ force_intpel_info->rate_size++;
+ force_intpel_info->rate_size =
+ AOMMIN(force_intpel_info->rate_size, max_history_size);
+
+ if (cs_rate < threshold_current) {
+ return 0;
+ }
+
+ // Every examined block matched its collocated block.
+ if (C == T) {
+ return 1;
+ }
+
+ double cs_average = 0.0;
+
+ // Average of the ratios currently held in the history.
+ for (k = 0; k < force_intpel_info->rate_size; k++) {
+ cs_average += force_intpel_info->cs_rate_array[k];
+ }
+ cs_average /= force_intpel_info->rate_size;
+
+ if (cs_average < threshold_average) {
+ return 0;
+ }
+
+ // NOTE(review): each block increments at most one of C and S, so C + S <= T
+ // and this condition does not appear reachable.
+ if ((T - C - S) < 0) {
+ return 1;
+ }
+
+ // NOTE(review): every ratio stored by this function is at most 1.0, so the
+ // average presumably never exceeds 1.01 -- confirm nothing else writes
+ // cs_rate_array.
+ if (cs_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
+ }
+
+ // Fills cpi->ssim_rdmult_scaling_factors with one factor per 16x16 block of
+ // the source luma plane. For each block the per-pixel variances of its 8x8
+ // sub-blocks are averaged and mapped through a fitted exponential curve; the
+ // resulting factors are then divided by their geometric mean.
+ void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = BLOCK_16X16;
+
+ // Block dimensions in MI units, and the number of blocks needed to cover the
+ // frame (rounded up).
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+ double log_sum = 0.0;
+
+ // Loop through each 16x16 block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block. Sub-blocks starting outside the frame's MI
+ // area are skipped.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ // MI position scaled by 4 to get the luma sample offset (presumably 4
+ // luma pixels per MI unit -- confirm against MI_SIZE).
+ const int row_offset_y = mi_row << 2;
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8,
+ AOM_PLANE_Y);
+ num_of_var += 1.0;
+ }
+ }
+ // Mean variance over the 8x8 sub-blocks visited above.
+ var = var / num_of_var;
+
+ // Curve fitting with an exponential model on all 16x16 blocks from the
+ // midres dataset.
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+
+ // As per the above computation, var will be in the range of
+ // [17.492222, 84.527656], assuming the data type is of infinite
+ // precision. The following assert conservatively checks if var is in the
+ // range of [17.0, 85.0] to avoid any issues due to the precision of the
+ // relevant data type.
+ assert(var > 17.0 && var < 85.0);
+ cpi->ssim_rdmult_scaling_factors[index] = var;
+ log_sum += log(var);
+ }
+ }
+
+ // Up to this point log_sum is the sum of log(var) over all blocks. After the
+ // exp() below it holds the geometric mean of the factors, which lies in
+ // [17.492222, 84.527656]. Hence, in the loop that follows, the value of
+ // cpi->ssim_rdmult_scaling_factors[index] would be in the range
+ // [0.2069, 4.8323].
+ log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+ // Normalize every factor by the geometric mean.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+ }
+ }
+ }
+
+ // Coding context that only needs to be saved when recode loop includes
+ // filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+ // restoration).
+ // Copies the loop filter and CDEF state from cpi->common, the rate control
+ // state from cpi->rc and the MV statistics from cpi->ppi into
+ // cpi->coding_context.
+ static void save_extra_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+
+ cc->lf = cm->lf;
+ cc->cdef_info = cm->cdef_info;
+ cc->rc = cpi->rc;
+ cc->mv_stats = cpi->ppi->mv_stats;
+ }
+
+ // Saves the extra coding context and, for frames that are not intra-only,
+ // releases the scaled reference buffers.
+ void av1_save_all_coding_context(AV1_COMP *cpi) {
+   save_extra_coding_context(cpi);
+
+   if (frame_is_intra_only(&cpi->common)) return;
+   release_scaled_references(cpi);
+ }
+
+ #if DUMP_RECON_FRAMES == 1
+
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ // Prints the reference-frame state of the current frame and, for shown
+ // frames, writes its Y, U and V planes to /tmp/enc_filtered_recon.yuv. The
+ // file is created/truncated when frame_number is 0 and appended to otherwise.
+ // NOTE(review): the plane loops write one byte per sample and halve both
+ // chroma dimensions, so they presumably expect 8-bit 4:2:0 buffers -- confirm
+ // before using this with other formats.
+ void av1_dump_filtered_recon_frames(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const CurrentFrame *const current_frame = &cm->current_frame;
+
+   // Test the frame buffer pointer itself. The address of its 'buf' member
+   // cannot serve as the readiness check: forming it from a NULL cur_frame is
+   // undefined and does not reliably compare equal to NULL.
+   if (cm->cur_frame == NULL) {
+     printf("Frame %d is not ready.\n", current_frame->frame_number);
+     return;
+   }
+   const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
+
+   // Reference flag for each reference frame index; slot 0 is unused.
+   static const int flag_list[REF_FRAMES] = { 0,
+                                              AOM_LAST_FLAG,
+                                              AOM_LAST2_FLAG,
+                                              AOM_LAST3_FLAG,
+                                              AOM_GOLD_FLAG,
+                                              AOM_BWD_FLAG,
+                                              AOM_ALT2_FLAG,
+                                              AOM_ALT_FLAG };
+   printf(
+       "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+       "show_existing_frame=%d) "
+       "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+       current_frame->frame_number, current_frame->order_hint, cm->show_frame,
+       cm->show_existing_frame);
+   // Print each reference's order hint (-1 when absent) and whether it is
+   // enabled in cpi->ref_frame_flags.
+   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+     const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+     printf(" %d(%c)", ref_offset,
+            (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
+   }
+   printf(" ]\n");
+
+   if (!cm->show_frame) {
+     printf("Frame %d is a no show frame, so no image dump.\n",
+            current_frame->frame_number);
+     return;
+   }
+
+   int h;
+   char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+   FILE *f_recon = NULL;
+
+   // The first frame starts a fresh file; later frames are appended to it.
+   if (current_frame->frame_number == 0) {
+     if ((f_recon = fopen(file_name, "wb")) == NULL) {
+       printf("Unable to open file %s to write.\n", file_name);
+       return;
+     }
+   } else {
+     if ((f_recon = fopen(file_name, "ab")) == NULL) {
+       printf("Unable to open file %s to append.\n", file_name);
+       return;
+     }
+   }
+   printf(
+       "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+       "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+       "refresh_alt_ref_frame=%d, "
+       "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+       current_frame->frame_number, cpi->gf_frame_index,
+       cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+       current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
+       cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame,
+       recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+ #if 0
+   int ref_frame;
+   printf("get_ref_frame_map_idx: [");
+   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+     printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
+   printf(" ]\n");
+ #endif  // 0
+
+   // --- Y ---
+   for (h = 0; h < cm->height; ++h) {
+     fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+            f_recon);
+   }
+   // --- U ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+     fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+            f_recon);
+   }
+   // --- V ---
+   for (h = 0; h < (cm->height >> 1); ++h) {
+     fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+            f_recon);
+   }
+
+   fclose(f_recon);
+ }
+ #endif  // DUMP_RECON_FRAMES
diff --git a/third_party/aom/av1/encoder/encoder_utils.h b/third_party/aom/av1/encoder/encoder_utils.h
new file mode 100644
index 0000000000..113f62aa59
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.h
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+ [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+
+ // Mark all inactive blocks as active. Other segmentation features may be set
+ // so memset cannot be used, instead only inactive blocks should be reset.
+ // Nothing is touched unless the active map is enabled or flagged for update.
+ static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+   if (!cpi->active_map.enabled && !cpi->active_map.update) return;
+
+   unsigned char *const seg_map = cpi->enc_seg.map;
+   const int num_mis =
+       cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+   for (int mi = 0; mi < num_mis; ++mi) {
+     if (seg_map[mi] == AM_SEGMENT_ID_INACTIVE) {
+       seg_map[mi] = AM_SEGMENT_ID_ACTIVE;
+     }
+   }
+ }
+
+ // Returns 'size' in the number of Mode Info (MI) units. 'size' is either the
+ // width or height.
+ static AOM_INLINE int size_in_mi(int size) {
+   // The size is first aligned with ALIGN_POWER_OF_TWO(size, 3) so that the
+   // decoded width and height are both multiples of 8 luma pixels (note: this
+   // may only be a multiple of 4 chroma pixels if subsampling is used). This
+   // simplifies the implementation of various experiments, eg. cdef, which
+   // operates on units of 8x8 luma pixels.
+   return (ALIGN_POWER_OF_TWO(size, 3)) >> MI_SIZE_LOG2;
+ }
+
+ // Derives the mode-info geometry in 'mi_params' from a frame of 'width' x
+ // 'height'. Reads mi_params->mi_alloc_bsize, which must already be set by the
+ // caller.
+ static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height) {
+ // Frame dimensions in MI units, and the stride derived from the column
+ // count by calc_mi_size().
+ mi_params->mi_cols = size_in_mi(width);
+ mi_params->mi_rows = size_in_mi(height);
+ mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+ // Macroblock counts derived from the MI dimensions (presumably 4 MI units
+ // per macroblock side -- confirm).
+ mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+ mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
+ mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+ // Stride of the allocation grid: mi_stride divided by the allocation block
+ // width in MI units, rounded up.
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ mi_params->mi_alloc_stride =
+ (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+ // The allocation block size is expected to be square.
+ assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+ mi_size_high[mi_params->mi_alloc_bsize]);
+ }
+
+ // Releases the mode-info buffers owned by 'mi_params' (mi_alloc,
+ // mi_grid_base and tx_type_map), clears the pointers and resets the recorded
+ // allocation and grid sizes to 0.
+ static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
+   aom_free(mi_params->mi_alloc);
+   aom_free(mi_params->mi_grid_base);
+   aom_free(mi_params->tx_type_map);
+
+   mi_params->mi_alloc = NULL;
+   mi_params->mi_grid_base = NULL;
+   mi_params->tx_type_map = NULL;
+
+   mi_params->mi_alloc_size = 0;
+   mi_params->mi_grid_size = 0;
+ }
+
+ // Sets the MI allocation block size to 'min_partition_size' and then derives
+ // the remaining mode-info geometry for a 'width' x 'height' frame via
+ // set_mb_mi().
+ static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height,
+ BLOCK_SIZE min_partition_size) {
+ mi_params->mi_alloc_bsize = min_partition_size;
+
+ set_mb_mi(mi_params, width, height);
+ }
+
+ // Variant of enc_set_mb_mi() with the same signature that ignores
+ // 'min_partition_size' and always uses BLOCK_16X16 as the MI allocation block
+ // size (presumably used for the stats-gathering stage, going by the name --
+ // confirm).
+ static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params,
+ int width, int height,
+ BLOCK_SIZE min_partition_size) {
+ (void)min_partition_size;
+ mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+ set_mb_mi(mi_params, width, height);
+ }
+
+ // Zero-fills the mode-info buffers of 'mi_params': mi_alloc over
+ // mi_alloc_size entries, and mi_grid_base and tx_type_map over
+ // mi_stride * calc_mi_size(mi_rows) entries each.
+ static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) {
+   const int grid_entries =
+       mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+   const size_t alloc_bytes =
+       mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc);
+   const size_t grid_bytes = grid_entries * sizeof(*mi_params->mi_grid_base);
+   const size_t tx_map_bytes = grid_entries * sizeof(*mi_params->tx_type_map);
+
+   memset(mi_params->mi_alloc, 0, alloc_bytes);
+   memset(mi_params->mi_grid_base, 0, grid_bytes);
+   memset(mi_params->tx_type_map, 0, tx_map_bytes);
+ }
+
+ // Resets 'remapped_ref_idx' to the identity mapping over REF_FRAMES entries
+ // and empties the rate history tracked in 'force_intpel_info'.
+ static AOM_INLINE void init_buffer_indices(
+     ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) {
+   for (int slot = 0; slot < REF_FRAMES; ++slot) {
+     remapped_ref_idx[slot] = slot;
+   }
+
+   force_intpel_info->rate_index = 0;
+   force_intpel_info->rate_size = 0;
+ }
+
+ // Stores the given function pointers into ppi->fn_ptr[BT] (sdf, sdaf, vf,
+ // svf, svaf, sdx4df, sdx3df, jsdaf, jsvaf). Requires a variable named 'ppi'
+ // in scope at the point of use. The expansion is a sequence of statements,
+ // not a single statement, so it must not be used as the unbraced body of an
+ // if/else or loop.
+ #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF;
+
+ // Invokes HIGHBD_BFP for BLOCK_<WIDTH>X<HEIGHT>, building the function names
+ // for bit depth BD by token pasting: the SAD entries use the *_bits<BD>
+ // wrappers and the variance entries use the aom_highbd_<BD>_* functions.
+ #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_BFP( \
+ BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \
+ aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
+
+ // Defines three static wrappers around the SAD function 'fnname':
+ // fnname_bits8 returns its result unchanged, fnname_bits10 returns it shifted
+ // right by 2 and fnname_bits12 shifted right by 4 (presumably to bring 10-
+ // and 12-bit SADs back to the 8-bit scale -- confirm).
+ #define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
+
+ // Same scheme as MAKE_BFP_SAD_WRAPPER for SAD functions that take an extra
+ // 'second_pred' argument: defines fnname_bits8 (result unchanged),
+ // fnname_bits10 (result >> 2) and fnname_bits12 (result >> 4).
+ #define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+ // Defines three static wrappers around the multi-reference SAD function
+ // 'fnname', which writes its results into 'sad_array'. fnname_bits8 forwards
+ // the call unchanged; fnname_bits10 and fnname_bits12 additionally shift
+ // sad_array[0..3] right by 2 and 4 respectively.
+ // NOTE(review): this macro is also instantiated for the *x3d functions, and
+ // the wrappers always shift four entries -- confirm that those functions
+ // leave sad_array[3] in a state where the shift is harmless.
+ #define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+ // Same scheme as MAKE_BFP_SADAVG_WRAPPER for SAD functions that additionally
+ // take a 'jcp_param' (DIST_WTD_COMP_PARAMS) argument: defines fnname_bits8
+ // (result unchanged), fnname_bits10 (result >> 2) and fnname_bits12
+ // (result >> 4).
+ #define MAKE_BFP_JSADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
+#endif
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // Stores the masked SAD (msdf) and masked sub-pixel variance (msvf) function
+ // pointers into ppi->fn_ptr[BT]. Requires 'ppi' in scope; expands to two
+ // statements, so it must not be used as an unbraced if/else or loop body.
+ #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+ // Invokes HIGHBD_MBFP for BLOCK_<WIDTH>X<HEIGHT> with the masked SAD
+ // *_bits<BD> wrapper and the aom_highbd_<BD>_masked_sub_pixel_variance
+ // function for that block size.
+ #define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+ // Defines three static wrappers around the masked compound SAD function
+ // 'fnname': fnname_bits8 returns its result unchanged, fnname_bits10 returns
+ // it shifted right by 2 and fnname_bits12 shifted right by 4. All arguments
+ // are forwarded as given.
+ #define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#if !CONFIG_REALTIME_ONLY
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
+#endif
+
+ // Stores the skip-SAD (sdsf) and 4-reference skip-SAD (sdsx4df) function
+ // pointers into ppi->fn_ptr[BT]. Requires 'ppi' in scope; expands to two
+ // statements, so it must not be used as an unbraced if/else or loop body.
+ #define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ // Invokes HIGHBD_SDSFP for BLOCK_<WIDTH>X<HEIGHT> with the
+ // aom_highbd_sad_skip_* wrappers generated for bit depth BD.
+ #define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
+ // Defines three static wrappers around the skip-SAD function 'fnname':
+ // fnname_bits8 returns its result unchanged, fnname_bits10 returns it shifted
+ // right by 2 and fnname_bits12 shifted right by 4.
+ #define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 4; \
+ }
+
+ // Defines three static wrappers around the multi-reference skip-SAD function
+ // 'fnname', which writes its results into 'sad_array'. fnname_bits8 forwards
+ // the call unchanged; fnname_bits10 and fnname_bits12 additionally shift
+ // sad_array[0..3] right by 2 and 4 respectively.
+ #define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+#endif // !CONFIG_REALTIME_ONLY
+// x4d variants: wrappers that write four SAD values into sad_array.
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+// HIGHBD_OBFP_WRAPPER_8(W, H): installs the 8-bit OBMC callbacks for BLOCK_WxH.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
+ aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+// HIGHBD_OBFP: stores the three OBMC callbacks (osdf, ovf, osvf) for block BT.
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+// Like HIGHBD_OBFP_WRAPPER_8, but pastes the bit depth BD into each name.
+#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+// Defines fnname##_bits8/_bits10/_bits12: fnname's result, >> 2, and >> 4.
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ if (seq_params->use_highbitdepth) { // nothing is installed otherwise
+ switch (seq_params->bit_depth) { // one set of wrapper macros per bit depth
+ case AOM_BITS_8:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 32, 8)
+ HIGHBD_BFP_WRAPPER(16, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 16, 8)
+#endif // !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(32, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 32, 8)
+ HIGHBD_BFP_WRAPPER(32, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 64, 8)
+ HIGHBD_BFP_WRAPPER(16, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 16, 8)
+ HIGHBD_BFP_WRAPPER(8, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 8, 8)
+ HIGHBD_BFP_WRAPPER(4, 4, 8)
+ HIGHBD_BFP_WRAPPER(128, 128, 8)
+ HIGHBD_BFP_WRAPPER(128, 64, 8)
+ HIGHBD_BFP_WRAPPER(64, 128, 8)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 8)
+ HIGHBD_MBFP_WRAPPER(128, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 128, 8)
+ HIGHBD_MBFP_WRAPPER(64, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 16, 8)
+ HIGHBD_MBFP_WRAPPER(8, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 8, 8)
+ HIGHBD_MBFP_WRAPPER(4, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 4, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 16, 8)
+#endif // !CONFIG_REALTIME_ONLY
+
+// OBMC callbacks (osdf/ovf/osvf) are excluded from realtime-only builds.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER_8(128, 128)
+ HIGHBD_OBFP_WRAPPER_8(128, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 128)
+ HIGHBD_OBFP_WRAPPER_8(64, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 16)
+ HIGHBD_OBFP_WRAPPER_8(8, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 8)
+ HIGHBD_OBFP_WRAPPER_8(4, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 4)
+ HIGHBD_OBFP_WRAPPER_8(64, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 8)
+#endif // !CONFIG_REALTIME_ONLY
+ break;
+
+ case AOM_BITS_10:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 32, 10)
+ HIGHBD_BFP_WRAPPER(16, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 16, 10)
+#endif // !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(32, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 32, 10)
+ HIGHBD_BFP_WRAPPER(32, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 64, 10)
+ HIGHBD_BFP_WRAPPER(16, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 16, 10)
+ HIGHBD_BFP_WRAPPER(8, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 8, 10)
+ HIGHBD_BFP_WRAPPER(4, 4, 10)
+ HIGHBD_BFP_WRAPPER(128, 128, 10)
+ HIGHBD_BFP_WRAPPER(128, 64, 10)
+ HIGHBD_BFP_WRAPPER(64, 128, 10)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 10)
+ HIGHBD_MBFP_WRAPPER(128, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 128, 10)
+ HIGHBD_MBFP_WRAPPER(64, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 16, 10)
+ HIGHBD_MBFP_WRAPPER(8, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 8, 10)
+ HIGHBD_MBFP_WRAPPER(4, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 4, 10)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 16, 10)
+#endif // !CONFIG_REALTIME_ONLY
+
+// OBMC callbacks (osdf/ovf/osvf) are excluded from realtime-only builds.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 10)
+ HIGHBD_OBFP_WRAPPER(128, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 128, 10)
+ HIGHBD_OBFP_WRAPPER(64, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 16, 10)
+ HIGHBD_OBFP_WRAPPER(8, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 8, 10)
+ HIGHBD_OBFP_WRAPPER(4, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 4, 10)
+ HIGHBD_OBFP_WRAPPER(64, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 16, 10)
+#endif // !CONFIG_REALTIME_ONLY
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 10)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 10)
+#endif // !CONFIG_REALTIME_ONLY
+ break;
+
+ case AOM_BITS_12:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 32, 12)
+ HIGHBD_BFP_WRAPPER(16, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 16, 12)
+#endif // !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(32, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 32, 12)
+ HIGHBD_BFP_WRAPPER(32, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 64, 12)
+ HIGHBD_BFP_WRAPPER(16, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 16, 12)
+ HIGHBD_BFP_WRAPPER(8, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 8, 12)
+ HIGHBD_BFP_WRAPPER(4, 4, 12)
+ HIGHBD_BFP_WRAPPER(128, 128, 12)
+ HIGHBD_BFP_WRAPPER(128, 64, 12)
+ HIGHBD_BFP_WRAPPER(64, 128, 12)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 12)
+ HIGHBD_MBFP_WRAPPER(128, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 128, 12)
+ HIGHBD_MBFP_WRAPPER(64, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 16, 12)
+ HIGHBD_MBFP_WRAPPER(8, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 8, 12)
+ HIGHBD_MBFP_WRAPPER(4, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 4, 12)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 16, 12)
+#endif // !CONFIG_REALTIME_ONLY
+
+// OBMC callbacks (osdf/ovf/osvf) are excluded from realtime-only builds.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 12)
+ HIGHBD_OBFP_WRAPPER(128, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 128, 12)
+ HIGHBD_OBFP_WRAPPER(64, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 16, 12)
+ HIGHBD_OBFP_WRAPPER(8, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 8, 12)
+ HIGHBD_OBFP_WRAPPER(4, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 4, 12)
+ HIGHBD_OBFP_WRAPPER(64, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 16, 12)
+#endif // !CONFIG_REALTIME_ONLY
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 12)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 12)
+#endif // !CONFIG_REALTIME_ONLY
+ break;
+
+ default:
+ assert(0 &&
+ "cm->seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Resets frame-level probability tables to their defaults. Each table is only
+// reset when the matching speed-feature setting is active. With
+// CONFIG_FPMT_TEST and PARALLEL_SIMULATION_ENCODE, the two temporary copies
+// held in ppi (temp_frame_probs, temp_frame_probs_simulation) are reset with
+// the same conditions, after the primary copy.
+static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
+  // Decide once which tables need their defaults.
+  const int reset_tx_type =
+      cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats != 0;
+  const int reset_obmc = cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+                         cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX;
+  const int reset_warped = cpi->sf.inter_sf.prune_warped_prob_thresh > 0;
+  const int reset_interp =
+      cpi->sf.interp_sf.adaptive_interp_filter_search == 2;
+
+  // Collect the FrameProbInfo instances to reset, primary copy first.
+  FrameProbInfo *targets[3];
+  int num_targets = 0;
+  targets[num_targets++] = &cpi->ppi->frame_probs;
+#if CONFIG_FPMT_TEST
+  if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+    targets[num_targets++] = &cpi->ppi->temp_frame_probs;
+    targets[num_targets++] = &cpi->ppi->temp_frame_probs_simulation;
+  }
+#endif  // CONFIG_FPMT_TEST
+
+  for (int idx = 0; idx < num_targets; ++idx) {
+    FrameProbInfo *const probs = targets[idx];
+    if (reset_tx_type) {
+      av1_copy(probs->tx_type_probs, default_tx_type_probs);
+    }
+    if (reset_obmc) {
+      av1_copy(probs->obmc_probs, default_obmc_probs);
+    }
+    if (reset_warped) {
+      av1_copy(probs->warped_probs, default_warped_probs);
+    }
+    if (reset_interp) {
+      av1_copy(probs->switchable_interp_probs,
+               default_switchable_interp_probs);
+    }
+  }
+}
+
+// Copies the CDEF fields kept in the coding context from src to dst:
+// cdef_bits, cdef_damping, nb_cdef_strengths and the cdef_strengths /
+// cdef_uv_strengths tables.
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+                                                   const CdefInfo *const src) {
+  // Scalar fields.
+  dst->cdef_bits = src->cdef_bits;
+  dst->cdef_damping = src->cdef_damping;
+  dst->nb_cdef_strengths = src->nb_cdef_strengths;
+
+  // Strength tables (whole-array copies).
+  av1_copy(dst->cdef_strengths, src->cdef_strengths);
+  av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+}
+
+// Restores the part of the coding context that only matters when the recode
+// loop includes filtering (deblocking, CDEF, superres post-encode upscale
+// and/or loop restoration).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+  const CODING_CONTEXT *const saved = &cpi->coding_context;
+  AV1_COMMON *const cm = &cpi->common;
+
+  // State stored in the common struct: lf and the CDEF parameters.
+  cm->lf = saved->lf;
+  restore_cdef_coding_context(&cm->cdef_info, &saved->cdef_info);
+
+  // State stored in the encoder instance: rc and the shared mv_stats.
+  cpi->rc = saved->rc;
+  cpi->ppi->mv_stats = saved->mv_stats;
+}
+
+// Returns 1 when buffers a and b have the same y/uv width and height, the
+// same y/uv strides, the same border, and agree on YV12_FLAG_HIGHBITDEPTH;
+// returns 0 otherwise.
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                                  const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height) return 0;
+  if (a->y_width != b->y_width) return 0;
+  if (a->uv_height != b->uv_height) return 0;
+  if (a->uv_width != b->uv_width) return 0;
+  if (a->y_stride != b->y_stride) return 0;
+  if (a->uv_stride != b->uv_stride) return 0;
+  if (a->border != b->border) return 0;
+  // Only the high-bitdepth bit of the flags has to match.
+  return (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+         (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+// Stores `update` in *ext_refresh_frame_context and flags the change as
+// pending in *ext_refresh_frame_context_pending. Always returns 0.
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+                                     bool *ext_refresh_frame_context_pending,
+                                     bool update) {
+  *ext_refresh_frame_context = update;
+  *ext_refresh_frame_context_pending = true;
+
+  return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Blends prior_boost and tpl_boost. The blend weight is sqrt(frames_to_key)
+// clamped to [min_factor, max_factor]: at min_factor the result equals
+// tpl_boost, at max_factor it equals prior_boost, with a linear mix between.
+// (max_factor - min_factor) is used as a divisor, so the two must differ.
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+                                                   double max_factor,
+                                                   int prior_boost,
+                                                   int tpl_boost,
+                                                   int frames_to_key) {
+  const double range = max_factor - min_factor;
+  const double root = sqrt((double)frames_to_key);
+  const double capped = AOMMIN(root, max_factor);
+  const double clamped = AOMMAX(capped, min_factor);
+  // Weight of prior_boost, in [0, range].
+  const double weight = clamped - min_factor;
+
+  return (int)((weight * prior_boost + (range - weight) * tpl_boost) / range);
+}
+#endif
+
+// Resets encoder state that does not depend on the frame size: the global
+// motion parameters of every reference from LAST_FRAME to ALTREF_FRAME, the
+// gm_info.search_done flag, the frame-size-independent speed features and RD
+// thresholds, and the interp_filter / switchable_motion_mode feature flags.
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  FeatureFlags *const features = &cm->features;
+
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    cm->global_motion[ref] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+
+  features->interp_filter = SWITCHABLE;
+  features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+      features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc);
+}
+
+// Drops the encoder's references to the scaled reference buffers.
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // Scaled references should only need to be released under certain
+  // conditions: if the reference will be updated, or if the scaled reference
+  // has the same resolution. For now this is only applied to Golden for
+  // non-svc RTC mode; every other scaled reference is always released.
+  AV1_COMMON *const cm = &cpi->common;
+  const bool golden_refreshed = cpi->refresh_frame.golden_frame != 0;
+  bool release_golden = true;
+
+  for (int slot = 0; slot < INTER_REFS_PER_FRAME; ++slot) {
+    RefCntBuffer *const scaled = cpi->scaled_ref_buf[slot];
+    const int is_golden = (slot == GOLDEN_FRAME - 1);
+
+    if (is_golden && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc &&
+        scaled != NULL) {
+      const RefCntBuffer *const golden = get_ref_frame_buf(cm, GOLDEN_FRAME);
+      const bool same_resoln =
+          scaled->buf.y_crop_width == golden->buf.y_crop_width &&
+          scaled->buf.y_crop_height == golden->buf.y_crop_height;
+      release_golden = golden_refreshed || same_resoln;
+    }
+
+    if (scaled == NULL) continue;
+    // Keep the scaled golden buffer when it may still be reused.
+    if (is_golden && !release_golden) continue;
+
+    --scaled->ref_count;
+    cpi->scaled_ref_buf[slot] = NULL;
+  }
+}
+
+// Restores the extra coding context and, unless the current frame is
+// intra-only, releases the scaled reference buffers.
+static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_extra_coding_context(cpi);
+
+  if (frame_is_intra_only(&cpi->common)) return;
+  release_scaled_references(cpi);
+}
+
+// Returns 1 when all of the following hold, 0 otherwise (checked in this
+// order): one-pass RT parameters, the one-layer RTC reference structure,
+// order hints disabled in the sequence header, and
+// cpi->rt_reduce_num_ref_buffers set.
+static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) {
+  const SequenceHeader *const seq_params = cpi->common.seq_params;
+
+  if (!is_one_pass_rt_params(cpi)) return 0;
+  if (!use_rtc_reference_structure_one_layer(cpi)) return 0;
+  if (seq_params->order_hint_info.enable_order_hint != 0) return 0;
+  return cpi->rt_reduce_num_ref_buffers != 0;
+}
+
+// Refresh reference frame buffers according to refresh_frame_flags.
+static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // All buffers are refreshed for shown keyframes and S-frames.
+  // In case of RT, the golden frame refreshes the 6th slot and the other
+  // reference frames refresh slots 0 to 5. Slot 7 is not refreshed by any
+  // reference frame, so only 7 buffers (instead of 8) are refreshed for
+  // keyframes and S-frames.
+  const int skip_last_slot = reduce_num_ref_buffers(cpi);
+  if (skip_last_slot) {
+    const int refresh_all_bufs =
+        (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET ||
+         frame_is_sframe(cm));
+    assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1,
+                   refresh_all_bufs));
+    (void)refresh_all_bufs;
+  }
+  const int num_ref_buffers = skip_last_slot ? REF_FRAMES - 1 : REF_FRAMES;
+
+  // Point every flagged slot of the reference map at the current frame.
+  for (int slot = 0; slot < num_ref_buffers; slot++) {
+    const int refresh_slot =
+        ((cm->current_frame.refresh_frame_flags >> slot) & 1) == 1;
+    if (!refresh_slot) continue;
+    assign_frame_buffer_p(&cm->ref_frame_map[slot], cm->cur_frame);
+  }
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf);
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf);
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler);
+
+void av1_setup_frame(AV1_COMP *cpi);
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+ int height, int number_spatial_layers);
+
+void av1_apply_active_map(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi);
+
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig);
+#endif
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index);
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi);
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info);
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_save_all_coding_context(AV1_COMP *cpi);
+
+#if DUMP_RECON_FRAMES == 1
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi);
+#endif
+
+// Selects the frame border size used by the encoder:
+//  - `resize` set:    AOM_BORDER_IN_PIXELS;
+//  - `all_intra` set: AOM_ENC_ALLINTRA_BORDER;
+//  - otherwise:       the width of sb_size plus 32.
+static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra,
+                                              BLOCK_SIZE sb_size) {
+  int border;
+
+  if (resize) {
+    border = AOM_BORDER_IN_PIXELS;
+  } else if (all_intra) {
+    // For allintra encoding mode, inter-frame motion search is not applicable
+    // and the intraBC motion vectors are restricted within the tile
+    // boundaries, hence the smaller border.
+    border = AOM_ENC_ALLINTRA_BORDER;
+  } else {
+    border = block_size_wide[sb_size] + 32;
+  }
+  return border;
+}
+
+// Returns true when either resize_cfg.resize_mode or
+// superres_cfg.superres_mode of the encoder configuration is non-zero.
+static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) {
+  if (oxcf->resize_cfg.resize_mode) return true;
+  return oxcf->superres_cfg.superres_mode != 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 0000000000..5fe2a497c7
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,886 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) { // (re)allocates the coefficient pools
+ AV1_COMMON *cm = &cpi->common;
+ CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+ const int num_sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ const int num_sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int size = num_sb_rows * num_sb_cols; // entries in coeff_buffer_base
+ const int num_planes = av1_num_planes(cm);
+ const int subsampling_x = cm->seq_params->subsampling_x;
+ const int subsampling_y = cm->seq_params->subsampling_y;
+ const int luma_max_sb_square =
+ 1 << num_pels_log2_lookup[cm->seq_params->sb_size];
+ const int chroma_max_sb_square =
+ luma_max_sb_square >> (subsampling_x + subsampling_y);
+ const int num_tcoeffs =
+ size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square);
+ const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN; // coeffs per eob / entropy_ctx entry
+ // Release any previous allocation before building the new pools.
+ av1_free_txb_buf(cpi);
+ // TODO(jingning): This should be further reduced.
+ CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+ aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->tcoeff,
+ aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->eobs,
+ aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size));
+ CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx,
+ aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) *
+ num_tcoeffs / txb_unit_size));
+ // Carve the three shared pools into consecutive per-entry, per-plane slices.
+ tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+ uint16_t *eob_ptr = coeff_buf_pool->eobs;
+ uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+ for (int i = 0; i < size; i++) {
+ for (int plane = 0; plane < num_planes; plane++) {
+ const int max_sb_square =
+ (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
+ cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+ cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+ cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+ tcoeff_ptr += max_sb_square; // advance past this plane's slice
+ eob_ptr += max_sb_square / txb_unit_size;
+ entropy_ctx_ptr += max_sb_square / txb_unit_size;
+ }
+ }
+}
+
+// Frees cpi->coeff_buffer_base and the three arrays of the coefficient buffer
+// pool (tcoeff, eobs, entropy_ctx), resetting each pointer to NULL afterwards.
+void av1_free_txb_buf(AV1_COMP *cpi) {
+  CoeffBufferPool *const pool = &cpi->coeff_buffer_pool;
+
+  // Per-entry table of plane pointers into the pools.
+  aom_free(cpi->coeff_buffer_base);
+  cpi->coeff_buffer_base = NULL;
+
+  // The shared pools themselves.
+  aom_free(pool->tcoeff);
+  pool->tcoeff = NULL;
+
+  aom_free(pool->eobs);
+  pool->eobs = NULL;
+
+  aom_free(pool->entropy_ctx);
+  pool->entropy_ctx = NULL;
+}
+
+// Writes level + 1 with a zero-run prefix: (length - 1) zero bits followed by
+// the `length` bits of level + 1, most significant bit first, where `length`
+// is the number of significant bits in level + 1.
+static void write_golomb(aom_writer *w, int level) {
+  const int value = level + 1;
+
+  // Count the significant bits of value.
+  int length = 0;
+  for (int rest = value; rest; rest >>= 1) ++length;
+  assert(length > 0);
+
+  // Prefix: length - 1 zeros.
+  for (int n = 0; n < length - 1; ++n) aom_write_bit(w, 0);
+
+  // Payload: value itself, MSB first.
+  for (int shift = length - 1; shift >= 0; --shift) {
+    aom_write_bit(w, (value >> shift) & 0x01);
+  }
+}
+
+static const int8_t eob_to_pos_small[33] = { // position token for eob 0..32, indexed by eob
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = { // indexed by min((eob - 1) >> 5, 16) for eob >= 33
+ 6, // placeholder: index 0 would be eob <= 32, which uses eob_to_pos_small
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
+// Maps an end-of-block position to its position token and stores in *extra
+// the offset of eob from the start of that token's group
+// (av1_eob_group_start[token]). eob values below 33 use eob_to_pos_small;
+// larger values use eob_to_pos_large indexed by (eob - 1) >> 5, capped at 16.
+int av1_get_eob_pos_token(const int eob, int *const extra) {
+  const int token = (eob < 33)
+                        ? eob_to_pos_small[eob]
+                        : eob_to_pos_large[AOMMIN((eob - 1) >> 5, 16)];
+
+  *extra = eob - av1_eob_group_start[token];
+  return token;
+}
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, PLANE_TYPE plane,
+ FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+ uint8_t allow_update_cdf) {
+#endif // CONFIG_ENTROPY_STATS
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ // eob_multi_size selects the eob_flag CDF table; eob_multi_ctx is 0 for TX_CLASS_2D, else 1.
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ // Count and/or adapt the CDF for the position token; the coded symbol is eob_pt - 1.
+ switch (eob_multi_size) {
+ case 0:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+ break;
+ case 1:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+ break;
+ case 2:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+ break;
+ case 3:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+ 8);
+ }
+ break;
+ case 4:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+ 9);
+ }
+ break;
+ case 5:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+ 10);
+ }
+ break;
+ case 6:
+ default: // any other size shares the 1024 table with case 6
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+ 11);
+ }
+ break;
+ }
+ // Tokens that carry offset bits also adapt on the most significant offset bit of eob_extra.
+ if (av1_eob_offset_bits[eob_pt] > 0) {
+ int eob_ctx = eob_pt - 3;
+ int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+#if CONFIG_ENTROPY_STATS
+ counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; // NOTE(review): indexed by eob_pt while the CDF below uses eob_ctx (eob_pt - 3) -- confirm intended
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
+ }
+}
+
+// Returns the context used to code whether a coefficient is non-zero.
+// For the end-of-block coefficient the context depends only on how far into
+// the scan it sits (0 at the first position, then 1/2/3 by eighths and
+// quarters of width << bhl). Otherwise it is derived from the neighbouring
+// level magnitudes via get_nz_mag / get_nz_map_ctx_from_stats.
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+                                 const int coeff_idx, const int bhl,
+                                 const int width, const int scan_idx,
+                                 const int is_eob, const TX_SIZE tx_size,
+                                 const TX_CLASS tx_class) {
+  if (!is_eob) {
+    const int stats =
+        get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class);
+    return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
+  }
+
+  const int num_coeffs = width << bhl;
+  if (scan_idx == 0) return 0;
+  if (scan_idx <= num_coeffs / 8) return 1;
+  if (scan_idx <= num_coeffs / 4) return 2;
+  return 3;
+}
+
+// Fills the padded `levels` buffer from `coeff`. The buffer is laid out as
+// `width` rows of stride height + TX_PAD_HOR: each row holds `height`
+// magnitudes (|coeff| clamped to [0, INT8_MAX]) followed by TX_PAD_HOR zeros.
+// The TX_PAD_BOTTOM * stride + TX_PAD_END bytes after the last row are zeroed.
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+                           const int height, uint8_t *const levels) {
+  const int stride = height + TX_PAD_HOR;
+
+  for (int row = 0; row < width; row++) {
+    const tran_low_t *const src = coeff + row * height;
+    uint8_t *const dst = levels + row * stride;
+    for (int col = 0; col < height; col++) {
+      dst[col] = (uint8_t)clamp(abs(src[col]), 0, INT8_MAX);
+    }
+    // Horizontal padding at the end of the row.
+    memset(dst + height, 0, sizeof(*levels) * TX_PAD_HOR);
+  }
+
+  // Bottom padding after the last row.
+  memset(levels + stride * width, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+}
+
+// Computes the non-zero-map context of the first `eob` coefficients in scan
+// order and stores each one in coeff_contexts at the coefficient's position
+// (scan[i]). The last of them is treated as the end-of-block coefficient.
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+                               const int16_t *const scan, const uint16_t eob,
+                               const TX_SIZE tx_size, const TX_CLASS tx_class,
+                               int8_t *const coeff_contexts) {
+  const int bhl = get_txb_bhl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int last_scan_idx = eob - 1;
+
+  for (int scan_idx = 0; scan_idx <= last_scan_idx; ++scan_idx) {
+    const int pos = scan[scan_idx];
+    const int is_eob = (scan_idx == last_scan_idx);
+    coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, scan_idx,
+                                         is_eob, tx_size, tx_class);
+  }
+}
+
+// Writes the coefficients of one transform block to the bitstream `w`, in
+// this order: the all-zero flag, the transform type (plane 0 only), the
+// end-of-block position, the coefficient magnitudes in reverse scan order,
+// and finally the signs and Golomb-coded remainders in forward scan order.
+// The eob, entropy context and coefficients are read back from
+// x->cb_coef_buff rather than recomputed here.
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ // cb_offset is divided by the minimum transform area to index the
+ // per-block eob and entropy-context arrays.
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ const uint16_t eob = eob_txb[block];
+ const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ // The low bits of the stored entropy context hold the skip context; the DC
+ // sign context is extracted from the same byte further below.
+ const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ // All-zero flag: symbol 1 means the block has no coefficients to code.
+ aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
+ if (eob == 0) return;
+
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ // Only y plane's tx_type is transmitted
+ if (plane == 0) {
+ av1_write_tx_type(cm, xd, tx_type, tx_size, w);
+ }
+
+ // The eob is coded as a position token (eob_pt) plus optional extra bits.
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ // eob_multi_size selects the CDF table; the alphabet size grows from 5
+ // symbols (case 0) to 11 symbols (default).
+ switch (eob_multi_size) {
+ case 0:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+ break;
+ case 1:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+ break;
+ case 2:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+ break;
+ case 3:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+ break;
+ case 4:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+ break;
+ case 5:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+ break;
+ default:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+ break;
+ }
+
+ // Extra eob bits: the most significant one is coded with an adaptive CDF,
+ // the remaining ones are written with aom_write_bit(), high bit first.
+ const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int eob_shift = eob_offset_bits - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_symbol(w, bit,
+ ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+ for (int i = 1; i < eob_offset_bits; i++) {
+ eob_shift = eob_offset_bits - 1 - i;
+ bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_bit(w, bit);
+ }
+ }
+
+ // Build the padded magnitude buffer and the per-coefficient contexts that
+ // the magnitude loop below consumes.
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ const tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ // Magnitudes, from the last coded coefficient back to the first.
+ const int bhl = get_txb_bhl(tx_size);
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = tcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ // The coefficient at eob - 1 uses a 3-symbol alphabet with the symbol
+ // shifted down by one; all others use the 4-symbol base alphabet.
+ if (c == eob - 1) {
+ aom_write_symbol(
+ w, AOMMIN(level, 3) - 1,
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
+ } else {
+ aom_write_symbol(w, AOMMIN(level, 3),
+ ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+ 4);
+ }
+ if (level > NUM_BASE_LEVELS) {
+ // level is above 1.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
+ // The range is coded in chunks of at most BR_CDF_SIZE - 1; coding stops
+ // at the first chunk that is not saturated, or once COEFF_BASE_RANGE
+ // has been covered.
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Loop to code all signs in the transform block,
+ // starting with the sign of DC (if applicable)
+ for (int c = 0; c < eob; ++c) {
+ const tran_low_t v = tcoeff[scan[c]];
+ const tran_low_t level = abs(v);
+ const int sign = (v < 0) ? 1 : 0;
+ if (level) {
+ // Only the first scan position uses an adaptive sign CDF; every other
+ // sign is written with aom_write_bit().
+ if (c == 0) {
+ const int dc_sign_ctx =
+ (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+ aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ // Whatever exceeds the range covered by the loop above is Golomb coded.
+ if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+ }
+}
+
+// Writes the coefficients of every transform block of an intra block. The
+// block is walked in units of at most 64x64; inside each unit all planes are
+// visited in turn and av1_write_coeffs_txb() is called for each transform
+// block in raster order.
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+                               aom_writer *w, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane_count = av1_num_planes(cm);
+  assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                       xd->plane[0].subsampling_y));
+
+  const int blocks_wide = max_block_wide(xd, bsize, 0);
+  const int blocks_high = max_block_high(xd, bsize, 0);
+  // Size of one processing unit, capped by the block's own extent.
+  const int unit_cols = AOMMIN(blocks_wide, mi_size_wide[BLOCK_64X64]);
+  const int unit_rows = AOMMIN(blocks_high, mi_size_high[BLOCK_64X64]);
+
+  // Running block index per plane; it persists across processing units.
+  int next_block[MAX_MB_PLANE] = { 0 };
+
+  for (int row = 0; row < blocks_high; row += unit_rows) {
+    const int row_end = AOMMIN(unit_rows + row, blocks_high);
+    for (int col = 0; col < blocks_wide; col += unit_cols) {
+      const int col_end = AOMMIN(unit_cols + col, blocks_wide);
+      for (int plane = 0; plane < plane_count; ++plane) {
+        // Planes other than the first are only coded for chroma references.
+        if (plane > 0 && !xd->is_chroma_ref) break;
+
+        const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+        const int row_step = tx_size_high_unit[tx_size];
+        const int col_step = tx_size_wide_unit[tx_size];
+        const int units_per_txb = row_step * col_step;
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const int ss_x = pd->subsampling_x;
+        const int ss_y = pd->subsampling_y;
+        // Unit bounds scaled to this plane's resolution.
+        const int plane_row_end = ROUND_POWER_OF_TWO(row_end, ss_y);
+        const int plane_col_end = ROUND_POWER_OF_TWO(col_end, ss_x);
+
+        for (int blk_row = row >> ss_y; blk_row < plane_row_end;
+             blk_row += row_step) {
+          for (int blk_col = col >> ss_x; blk_col < plane_col_end;
+               blk_col += col_step) {
+            av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
+                                 next_block[plane], tx_size);
+            next_block[plane] += units_per_txb;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Packs the entropy context of a transform block into one byte: the sum of
+// the coefficient magnitudes, saturated at COEFF_CONTEXT_MASK, combined with
+// the DC sign information added by set_dc_sign(). Returns 0 when eob is 0.
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                    const SCAN_ORDER *scan_order, int eob) {
+  const int16_t *const scan = scan_order->scan;
+  if (eob == 0) return 0;
+
+  int level_sum = 0;
+  for (int idx = 0; idx < eob; ++idx) {
+    level_sum += abs(qcoeff[scan[idx]]);
+    // Once the sum exceeds the mask, further coefficients cannot matter.
+    if (level_sum > COEFF_CONTEXT_MASK) break;
+  }
+  if (level_sum > COEFF_CONTEXT_MASK) level_sum = COEFF_CONTEXT_MASK;
+
+  set_dc_sign(&level_sum, qcoeff[0]);
+  return (uint8_t)level_sum;
+}
+
+// Updates the transform-type statistics for one luma transform block: the
+// adaptive CDF when allow_update_cdf is set, and the counters in `counts`
+// when CONFIG_ENTROPY_STATS is enabled. It returns without doing anything
+// for plane > 0. The asserts check that the chosen tx_type agrees with the
+// encoder's transform configuration flags.
+static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int blk_row, int blk_col,
+ int plane, TX_SIZE tx_size,
+ FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int is_inter = is_inter_block(mbmi);
+ const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+ // `counts` is only referenced inside CONFIG_ENTROPY_STATS sections.
+ (void)counts;
+#endif // !CONFIG_ENTROPY_STATS
+
+ // Only y plane's tx_type is updated
+ if (plane > 0) return;
+ const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
+ tx_size, reduced_tx_set_used);
+ if (is_inter) {
+ if (cpi->oxcf.txfm_cfg.use_inter_dct_only) {
+ assert(tx_type == DCT_DCT);
+ }
+ } else {
+ if (cpi->oxcf.txfm_cfg.use_intra_dct_only) {
+ assert(tx_type == DCT_DCT);
+ } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) {
+ const TX_TYPE default_type = get_default_tx_type(
+ PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
+ // default_type is only read by the assert below.
+ (void)default_type;
+ // TODO(kyslov): We don't always respect use_intra_default_tx_only flag in
+ // NonRD and REALTIME case. Specifically we ignore it in hybrid intra mode
+ // search, when picking up intra mode in nonRD inter mode search and in RD
+ // REALTIME mode when we limit TX type usage.
+ // We need to fix txfm cfg for these cases. Meanwhile relieving the
+ // assert.
+ assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+ cpi->oxcf.mode == REALTIME);
+ }
+ }
+
+ // Statistics are only gathered when more than one transform type is
+ // available, base_qindex is positive, and the block is not skipped (neither
+ // through skip_txfm nor through the SEG_LVL_SKIP segment feature).
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (eset > 0) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (allow_update_cdf) {
+ update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ // Intra statistics are additionally indexed by a prediction direction:
+ // the one mapped from the filter-intra mode when filter intra is used,
+ // otherwise the block's own mode.
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(
+ fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+ }
+ }
+}
+
+// Per-transform-block visitor; `arg` is a struct tokenize_b_args.
+//
+// When args->dry_run == OUTPUT_ENABLED it stores the block's eob, its entropy
+// context and a copy of its quantized coefficients in x->cb_coef_buff, and
+// adapts the coefficient CDFs when args->allow_update_cdf is set (plus the
+// counters in td->counts under CONFIG_ENTROPY_STATS). For any other run type
+// only the above/left entropy contexts are refreshed from qcoeff.
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ // Set in both branches below: the stored copy when output is enabled,
+ // qcoeff itself otherwise.
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const uint8_t allow_update_cdf = args->allow_update_cdf;
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+ eob == 0, 2);
+ }
+
+ // Record the skip context and eob for this block in the coefficient
+ // buffer; av1_write_coeffs_txb() reads both from there.
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ // An all-zero block only needs its entropy contexts set to 0.
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ // Note that seg_eob coefficients are copied, not just the first eob.
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, allow_update_cdf);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+ // av1_update_eob_context() takes two extra arguments (cdf_idx and the
+ // counters) when entropy stats are compiled in.
+#if CONFIG_ENTROPY_STATS
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, allow_update_cdf);
+#else
+ av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+ allow_update_cdf);
+#endif
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+
+ // Walk the coefficients in reverse scan order, the same order in which
+ // av1_write_coeffs_txb() codes the magnitudes.
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+ /* abs_sum_level is needed to decide the job scheduling order of
+ * pack bitstream multi-threading. This data is not needed if
+ * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+ if (allow_update_cdf) {
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ update_cdf(
+ ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3) - 1, 3);
+ } else {
+ update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3), 4);
+ }
+ }
+ // NOTE: the #if below spans the `} else {`. Without CONFIG_ENTROPY_STATS
+ // this statement reduces to an `if` whose body holds only the assert.
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+#endif
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ // One iteration per base-range chunk; stops at the first chunk that is
+ // not saturated (k < BR_CDF_SIZE - 1).
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx],
+ k, BR_CDF_SIZE);
+ }
+ // Without CONFIG_ENTROPY_STATS this inner loop has no effect besides
+ // counting lps up to k.
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+#endif // CONFIG_ENTROPY_STATS
+ if (lps == k) break;
+ }
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+#endif
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+ // The DC sign context shares the stored byte with the skip context.
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ // Reached for non-empty blocks and for every dry run: derive the packed
+ // context from the coefficients and store it in the above/left contexts.
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
+// Per-transform-block visitor; `arg` is a struct tokenize_b_args. This
+// variant never adapts CDFs.
+//
+// When args->dry_run == OUTPUT_ENABLED it stores the block's eob, its entropy
+// context and a copy of its quantized coefficients in x->cb_coef_buff (plus
+// the counters in td->counts under CONFIG_ENTROPY_STATS). For any other run
+// type only the above/left entropy contexts are refreshed from qcoeff.
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ // Set in both branches below: the stored copy when output is enabled,
+ // qcoeff itself otherwise.
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+ // The locals in this section are only needed for the statistics code.
+#if CONFIG_ENTROPY_STATS
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+
+ // Record the skip context and eob for this block in the coefficient
+ // buffer; av1_write_coeffs_txb() reads both from there.
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ // An all-zero block only needs its entropy contexts set to 0.
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ // Note that seg_eob coefficients are copied, not just the first eob.
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ // With entropy stats the coefficient loop below always runs; otherwise it
+ // only accumulates abs_sum_level, so it is skipped unless bitstream-packing
+ // multi-threading is enabled.
+#if CONFIG_ENTROPY_STATS
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const bool do_coeff_scan = true;
+#else
+ const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled;
+#endif
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+#endif
+
+ // do_coeff_scan is loop-invariant: when false the loop body never runs.
+ for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+ /* abs_sum_level is needed to decide the job scheduling order of
+ * pack bitstream multi-threading. This data is not needed if
+ * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+#if CONFIG_ENTROPY_STATS
+ const int coeff_ctx = coeff_contexts[pos];
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ // One iteration per base-range chunk; stops at the first chunk that is
+ // not saturated (k < BR_CDF_SIZE - 1).
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+ if (lps == k) break;
+ }
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+#endif
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ // The DC sign context shares the stored byte with the skip context.
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ // Reached for non-empty blocks and for every dry run: derive the packed
+ // context from the coefficients and store it in the above/left contexts.
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
+// Visits every transform block of the current intra block and records its
+// coefficient context. Blocks flagged skip_txfm only get their entropy
+// context reset. The visitor that also adapts the CDFs is used when
+// allow_update_cdf is non-zero; otherwise the record-only visitor is used.
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                                     uint8_t allow_update_cdf) {
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  const int plane_count = av1_num_planes(&cpi->common);
+
+  if (xd->mi[0]->skip_txfm) {
+    av1_reset_entropy_context(xd, bsize, plane_count);
+    return;
+  }
+
+  struct tokenize_b_args visitor_args = { cpi, td, 0, allow_update_cdf,
+                                          dry_run };
+  foreach_transformed_block_visitor visitor = av1_record_txb_context;
+  if (allow_update_cdf) visitor = av1_update_and_record_txb_context;
+
+  for (int plane = 0; plane < plane_count; ++plane) {
+    // Planes other than the first are only visited for chroma references.
+    if (plane > 0 && !xd->is_chroma_ref) break;
+
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visitor,
+                                           &visitor_args);
+  }
+}
+
+// Returns the CB_COEFF_BUFFER entry for the position (mi_row, mi_col).
+// Entries are laid out row by row, one per (1 << mib_size_log2) mi units in
+// each direction, starting at cpi->coeff_buffer_base.
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+                                         int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int mib_log2 = cm->seq_params->mib_size_log2;
+  // Number of entries per row: mi_cols divided by the unit size, rounded up.
+  const int entries_per_row =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, mib_log2);
+  const int entry_row = mi_row >> mib_log2;
+  const int entry_col = mi_col >> mib_log2;
+  return &cpi->coeff_buffer_base[entry_row * entries_per_row + entry_col];
+}
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 0000000000..67b94046b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define TXB_SKIP_CTX_MASK 15
+#define DC_SIGN_CTX_SHIFT 4
+#define DC_SIGN_CTX_MASK 3
+
+int av1_get_eob_pos_token(const int eob, int *const extra);
+
+/*!\endcond */
+/*!\brief Allocate the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * Each macro block will need a \ref CB_COEFF_BUFFER to store information for
+ * rate-distortion optimization and entropy coding of transform coefficients.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_free_txb_buf(AV1_COMP *cpi);
+
+/*!\brief Write quantized coefficients in a transform block into bitstream using
+ * entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block
+ * \param[in] tx_size The given transform size
+ */
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function uses \ref av1_write_coeffs_txb() to code each transform block
+ * in raster order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] bsize Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into a uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in] qcoeff Buffer of quantized coefficients
+ * \param[in] scan_order Coding order of coefficients in the transform
+ * block
+ * \param[in] eob The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macroblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Top-level multithreading structure
+ * \param[in] dry_run Whether this is a dry run.
+ * \param[in] bsize Block size of the current macroblock
+ * \param[in] allow_update_cdf Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * There are regular mode and dry run for this function.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * There are regular mode and dry run for this function.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * The macroblock's location is described by mi_row and mi_col, row and column
+ * mi indexes in the coding frame.
+ *
+ * Each mi unit is a 4x4 pixel block.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] mi_row Row mi index of the current transform block
+ * in the frame.
+ * \param[in] mi_col Column mi index of the current transform
+ * block in the frame.
+ * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated
+ * to this macroblock.
+ */
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+ int mi_col);
+
+/*!\brief Returns the entropy cost associated with skipping the current
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] coeff_costs Table of entropy cost for coefficient coding.
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] plane The index of the current plane
+ * \param[in] tx_size The transform size
+ */
+static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs,
+                                    const TXB_CTX *const txb_ctx, int plane,
+                                    TX_SIZE tx_size) {
+  // The cost table is selected by transform-size context and plane type.
+  const TX_SIZE size_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE type = get_plane_type(plane);
+  const int skip_ctx = txb_ctx->txb_skip_ctx;
+  // Entry [1] is presumably the cost of signalling the skip flag as set.
+  return coeff_costs->coeff_costs[size_ctx][type].txb_skip_cost[skip_ctx][1];
+}
+
+/*!\cond */
+// These numbers are empirically obtained.
+// The table is indexed as plane_rd_mult[ref type][plane type], following the
+// REF_TYPES x PLANE_TYPES dimensions of its declaration.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 17, 13 },
+ { 16, 10 },
+};
+/*!\endcond */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 0000000000..d6a806d504
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,3469 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/thread_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/ethread.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/firstpass.h"
+#endif
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/rdopt.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+// Folds the RD statistics gathered in td_t into td: the "used" flags are
+// OR-ed together and every counter is summed.
+static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int m, n;
+
+  // Flags: set in the destination if set in either thread.
+  td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+  td->rd_counts.compound_ref_used_flag |=
+      td_t->rd_counts.compound_ref_used_flag;
+
+  // Scalar counter.
+  td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks;
+
+  // Two-entry counters.
+  for (m = 0; m < 2; ++m) {
+    td->rd_counts.warped_used[m] += td_t->rd_counts.warped_used[m];
+    td->rd_counts.seg_tmp_pred_cost[m] += td_t->rd_counts.seg_tmp_pred_cost[m];
+  }
+
+  // Per block size OBMC usage.
+  for (m = 0; m < BLOCK_SIZES_ALL; ++m) {
+    for (n = 0; n < 2; ++n) {
+      td->rd_counts.obmc_used[m][n] += td_t->rd_counts.obmc_used[m][n];
+    }
+  }
+
+  // Per transform size / transform type usage.
+  for (m = 0; m < TX_SIZES_ALL; ++m) {
+    for (n = 0; n < TX_TYPES; ++n) {
+      td->rd_counts.tx_type_used[m][n] += td_t->rd_counts.tx_type_used[m][n];
+    }
+  }
+}
+
+// Walks every tile of the frame in superblock steps (mib_size mi units in both
+// directions) and reconciles the delta loop-filter values stored in the first
+// mode info of each superblock with the running values kept in
+// cpi->td.mb.e_mbd:
+//  - a superblock whose first mode info has skip_txfm == 1 and covers the
+//    whole superblock inherits the running values;
+//  - any other superblock updates the running values from its own.
+// NOTE(review): presumably needed because row-MT encoding does not visit
+// superblocks in the order this propagation requires - confirm with callers.
+static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ const int mib_size = cm->seq_params->mib_size;
+ // Two fewer delta entries are handled when the frame has a single plane.
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int row = 0; row < cm->tiles.rows; row++) {
+ for (int col = 0; col < cm->tiles.cols; col++) {
+ TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += mib_size) {
+ // The running values are reset once per tile, before its first
+ // superblock row is visited.
+ if (mi_row == tile_info->mi_row_start)
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ for (int mi_col = tile_info->mi_col_start;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size) {
+ // Mode info at the top-left mi unit of this superblock.
+ const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
+ MB_MODE_INFO *mbmi = mi[0];
+ if (mbmi->skip_txfm == 1 &&
+ (mbmi->bsize == cm->seq_params->sb_size)) {
+ // Skipped full-size superblock: overwrite its deltas with the
+ // running values.
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ } else {
+ // Otherwise this superblock's deltas become the running values;
+ // which field is tracked depends on delta_lf_multi.
+ if (cm->delta_q_info.delta_lf_multi) {
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ } else {
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+// No-op stand-in with the same signature as av1_row_mt_sync_read(); every
+// argument is ignored.
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                int c) {
+  (void)c;
+  (void)r;
+  (void)row_mt_sync;
+}
+
+// No-op stand-in with the same signature as av1_row_mt_sync_write(); every
+// argument is ignored.
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+                                 int c, int cols) {
+  (void)cols;
+  (void)c;
+  (void)r;
+  (void)row_mt_sync;
+}
+
+// Blocks the caller until the row above (r - 1) has progressed far enough for
+// column c of row r to be processed.
+//
+// row_mt_sync: per-tile row synchronization state.
+// r:           index of the row about to be processed; row 0 never waits.
+// c:           index of the column about to be processed.
+//
+// The wait ends once
+//   c <= num_finished_cols[r - 1] - sync_range -
+//        intrabc_extra_top_right_sb_delay.
+// Without CONFIG_MULTITHREAD this function does nothing.
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+
+ if (r) {
+ // Progress of row r - 1 is protected by that row's own mutex.
+ pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ // The predicate is re-checked after every wake-up.
+ while (c > row_mt_sync->num_finished_cols[r - 1] - nsync -
+ row_mt_sync->intrabc_extra_top_right_sb_delay) {
+ pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Publishes the progress of row r after column c has been processed and wakes
+// a thread waiting on that row in av1_row_mt_sync_read().
+//
+// row_mt_sync: per-tile row synchronization state.
+// r:           index of the row that made progress.
+// c:           index of the column that has just been completed.
+// cols:        total number of columns in the row.
+//
+// Without CONFIG_MULTITHREAD this function does nothing.
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+ int cur;
+ // Only signal when there are enough encoded blocks for next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ // Intermediate column: publish it only on multiples of sync_range.
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ // Last column: publish a value large enough that the wait predicate in
+ // av1_row_mt_sync_read() is satisfied for every column index below cols.
+ cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // num_finished_cols[r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ row_mt_sync->num_finished_cols[r] =
+ AOMMAX(row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Allocates the row synchronization state for `rows` rows: one mutex and one
+// condition variable per row (multi-threaded builds only) plus the per-row
+// finished-column counters. Allocation failures are reported through
+// CHECK_MEM_ERROR on cm.
+static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
+                                  AV1_COMMON *cm, int rows) {
+#if CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+                  aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+  if (row_mt_sync->mutex_ != NULL) {
+    for (int row = 0; row < rows; ++row)
+      pthread_mutex_init(&row_mt_sync->mutex_[row], NULL);
+  }
+
+  CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+                  aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+  if (row_mt_sync->cond_ != NULL) {
+    for (int row = 0; row < rows; ++row)
+      pthread_cond_init(&row_mt_sync->cond_[row], NULL);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+                  aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
+
+  // Remember how many rows were allocated and use a sync range (nsync) of 1.
+  row_mt_sync->rows = rows;
+  row_mt_sync->sync_range = 1;
+}
+
+// Releases everything row_mt_sync_mem_alloc() set up: the per-row mutexes and
+// condition variables (multi-threaded builds only) and the finished-column
+// counters. A NULL argument is accepted and ignored.
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+  if (row_mt_sync == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  if (row_mt_sync->mutex_ != NULL) {
+    for (int row = 0; row < row_mt_sync->rows; ++row)
+      pthread_mutex_destroy(&row_mt_sync->mutex_[row]);
+    aom_free(row_mt_sync->mutex_);
+  }
+  if (row_mt_sync->cond_ != NULL) {
+    for (int row = 0; row < row_mt_sync->rows; ++row)
+      pthread_cond_destroy(&row_mt_sync->cond_[row]);
+    aom_free(row_mt_sync->cond_);
+  }
+#endif  // CONFIG_MULTITHREAD
+  aom_free(row_mt_sync->num_finished_cols);
+
+  // Zero the whole structure: this call may be triggered by a dynamic change
+  // in tiles, in which case it is followed by an _alloc() that may fail.
+  av1_zero(*row_mt_sync);
+}
+
+// Number of superblock rows in the frame: mi_rows divided by the superblock
+// size in mi units (a power of two), rounded up.
+static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) {
+  const int frame_mi_rows = cm->mi_params.mi_rows;
+  const int sb_mi_log2 = cm->seq_params->mib_size_log2;
+  return CEIL_POWER_OF_TWO(frame_mi_rows, sb_mi_log2);
+}
+
+// (Re)allocates the memory used by row-based multi-threading.
+//
+// cpi:           top-level encoder structure.
+// max_rows:      number of rows to allocate sync state for in every tile.
+// max_cols:      number of columns used to size the per-tile row contexts.
+// alloc_row_ctx: when non-zero, also allocate this_tile->row_ctx per tile.
+//
+// Any previous allocation is released first. Allocation failures are reported
+// through CHECK_MEM_ERROR on cpi->common.
+static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
+ int alloc_row_ctx) {
+ struct AV1Common *cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+
+ av1_row_mt_mem_dealloc(cpi);
+
+ // Allocate memory for row based multi-threading
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+ row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);
+
+ if (alloc_row_ctx) {
+ assert(max_cols > 0);
+ // One context fewer than the number of columns, but never fewer than
+ // one; the buffer is 16-byte aligned.
+ const int num_row_ctx = AOMMAX(1, (max_cols - 1));
+ CHECK_MEM_ERROR(cm, this_tile->row_ctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, num_row_ctx * sizeof(*this_tile->row_ctx)));
+ }
+ }
+ }
+ // One counter per superblock row of the frame.
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ CHECK_MEM_ERROR(
+ cm, enc_row_mt->num_tile_cols_done,
+ aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));
+
+ // Record the allocated dimensions. Note that allocated_cols stores
+ // max_cols - 1, not max_cols.
+ enc_row_mt->allocated_rows = max_rows;
+ enc_row_mt->allocated_cols = max_cols - 1;
+ enc_row_mt->allocated_sb_rows = sb_rows;
+}
+
+// Frees the row-based multi-threading memory of every allocated tile (sync
+// state and, when cdf_update_mode is enabled, the row contexts), then frees
+// the per-superblock-row counters and clears the recorded sizes.
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int num_tile_rows = enc_row_mt->allocated_tile_rows;
+  const int num_tile_cols = enc_row_mt->allocated_tile_cols;
+
+  // Tiles are stored row-major, so a running index visits them in order.
+  int tile_idx = 0;
+  for (int tr = 0; tr < num_tile_rows; ++tr) {
+    for (int tc = 0; tc < num_tile_cols; ++tc, ++tile_idx) {
+      TileDataEnc *const tile = &cpi->tile_data[tile_idx];
+
+      av1_row_mt_sync_mem_dealloc(&tile->row_mt_sync);
+
+      if (cpi->oxcf.algo_cfg.cdf_update_mode) {
+        aom_free(tile->row_ctx);
+        tile->row_ctx = NULL;
+      }
+    }
+  }
+
+  aom_free(enc_row_mt->num_tile_cols_done);
+  enc_row_mt->num_tile_cols_done = NULL;
+
+  enc_row_mt->allocated_sb_rows = 0;
+  enc_row_mt->allocated_cols = 0;
+  enc_row_mt->allocated_rows = 0;
+}
+
+// Fills thread_id_to_tile_id[0 .. num_workers - 1] with tile ids handed out
+// round-robin: 0, 1, ..., num_tiles - 1, 0, 1, ...
+static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
+                                             int num_tiles, int num_workers) {
+  int next_tile = 0;
+
+  for (int worker = 0; worker < num_workers; ++worker) {
+    thread_id_to_tile_id[worker] = next_tile;
+    // Wrap around once every tile has been handed out.
+    if (++next_tile == num_tiles) next_tile = 0;
+  }
+}
+
+// Hands out the next unprocessed row of tile_data, if any.
+//
+// On success stores the row's mi position in *current_mi_row, advances the
+// tile's cursor by mib_size, counts the caller as working on the tile and
+// returns 1. Returns 0, leaving everything untouched, when the tile has no
+// rows left.
+static AOM_INLINE int get_next_job(TileDataEnc *const tile_data,
+                                   int *current_mi_row, int mib_size) {
+  AV1EncRowMultiThreadSync *const sync = &tile_data->row_mt_sync;
+
+  if (sync->next_mi_row >= tile_data->tile_info.mi_row_end) return 0;
+
+  *current_mi_row = sync->next_mi_row;
+  sync->next_mi_row += mib_size;
+  sync->num_threads_working++;
+  return 1;
+}
+
+// Picks another tile for the calling thread to work on and fetches a job from
+// it.
+//
+// cm:             common encoder state (tile layout, sequence parameters).
+// tile_data:      array of all tiles, indexed tile_row * tile_cols + tile_col.
+// cur_tile_id:    out; set to the chosen tile when one is found.
+// current_mi_row: out; set to the mi row of the job fetched from that tile.
+// end_of_frame:   out; set to 1 when no tile has a job to offer.
+// is_firstpass:   non-zero to count rows/columns in first-pass units of
+//                 fp_block_size instead of superblocks (ignored when
+//                 CONFIG_REALTIME_ONLY is set).
+// fp_block_size:  first-pass unit block size.
+static AOM_INLINE void switch_tile_and_get_next_job(
+ AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
+ int *current_mi_row, int *end_of_frame, int is_firstpass,
+ const BLOCK_SIZE fp_block_size) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ int tile_id = -1; // Stores the tile ID with minimum proc done
+ int max_mis_to_encode = 0;
+ int min_num_threads_working = INT_MAX;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+#if CONFIG_REALTIME_ONLY
+ int num_b_rows_in_tile =
+ av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#else
+ int num_b_rows_in_tile =
+ is_firstpass
+ ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ is_firstpass
+ ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#endif
+ // A tile is only a candidate while fewer threads than this are working
+ // on it: at most one per two columns (rounded up) and one per row.
+ int theoretical_limit_on_threads =
+ AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
+ int num_threads_working = row_mt_sync->num_threads_working;
+
+ if (num_threads_working < theoretical_limit_on_threads) {
+ int num_mis_to_encode =
+ this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;
+
+ // Tile to be processed by this thread is selected on the basis of
+ // availability of jobs:
+ // 1) If jobs are available, tile to be processed is chosen on the
+ // basis of minimum number of threads working for that tile. If two or
+ // more tiles have same number of threads working for them, then the
+ // tile with maximum number of jobs available will be chosen.
+ // 2) If no jobs are available, then end_of_frame is reached.
+ if (num_mis_to_encode > 0) {
+ if (num_threads_working < min_num_threads_working) {
+ // New minimum: restart the tie-break on remaining work.
+ min_num_threads_working = num_threads_working;
+ max_mis_to_encode = 0;
+ }
+ if (num_threads_working == min_num_threads_working &&
+ num_mis_to_encode > max_mis_to_encode) {
+ tile_id = tile_index;
+ max_mis_to_encode = num_mis_to_encode;
+ }
+ }
+ }
+ }
+ }
+ if (tile_id == -1) {
+ *end_of_frame = 1;
+ } else {
+ // Update the current tile id to the tile id that will be processed next,
+ // which will be the least processed tile.
+ *cur_tile_id = tile_id;
+ // Jobs advance by one first-pass unit or one superblock, in mi units.
+ const int unit_height = mi_size_high[fp_block_size];
+ get_next_job(&tile_data[tile_id], current_mi_row,
+ is_firstpass ? unit_height : cm->seq_params->mib_size);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Marks every first-pass unit row of every tile as completely processed by
+// issuing a "last column done" sync write for it.
+//
+// In case of multithreading of firstpass encode, due to top-right dependency,
+// the worker on a firstpass row waits for the completion of the firstpass
+// processing of the top and top-right fp_blocks. Hence, in case a thread
+// (main/worker) encounters an error, the firstpass processing of every row in
+// the frame is flagged as complete in order to avoid dependent workers waiting
+// indefinitely.
+static void set_firstpass_encode_done(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+  const int unit_height = mi_size_high[fp_block_size];
+  const int num_tile_rows = cm->tiles.rows;
+  const int num_tile_cols = cm->tiles.cols;
+
+  // Tiles are stored row-major, so a running index visits them in order.
+  int tile_idx = 0;
+  for (int tr = 0; tr < num_tile_rows; ++tr) {
+    for (int tc = 0; tc < num_tile_cols; ++tc, ++tile_idx) {
+      TileDataEnc *const tile_data = &cpi->tile_data[tile_idx];
+      TileInfo *tile = &tile_data->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+      const int unit_cols_in_tile =
+          av1_get_unit_cols_in_tile(tile, fp_block_size);
+
+      int unit_row_in_tile = 0;
+      for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+           mi_row += unit_height) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+                                   unit_cols_in_tile - 1, unit_cols_in_tile);
+        ++unit_row_in_tile;
+      }
+    }
+  }
+}
+
+// Worker entry point for row-based multi-threading of the first pass.
+//
+// arg1:   EncWorkerData of the calling worker.
+// unused: ignored.
+//
+// Repeatedly fetches a first-pass unit row (from the worker's assigned tile
+// first, then from any other tile) and processes it with av1_first_pass_row()
+// until no jobs remain or another worker has flagged an error.
+// Returns 1 on normal completion and 0 when this worker hit an error that was
+// reported through error_info (longjmp).
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ // Errors raised through xd are routed to this worker's own error_info.
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: tell the other workers to stop taking jobs and release any
+ // worker blocked on a row dependency.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->firstpass_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_firstpass_encode_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ AV1_COMMON *const cm = &cpi->common;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+ int end_of_frame = 0;
+ while (1) {
+ int current_mi_row = -1;
+ // Job selection reads and updates shared per-tile state, so it is done
+ // under the row-MT mutex.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+ &current_mi_row, unit_height)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 1,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When firstpass_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (firstpass_mt_exit || end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ ThreadData *td = thread_data->td;
+
+ assert(current_mi_row != -1 &&
+ current_mi_row < this_tile->tile_info.mi_row_end);
+
+ // Convert the mi row into a first-pass unit row index.
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
+ fp_block_size);
+ // The row is finished: this thread no longer counts as working on the
+ // tile.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+#endif
+
+// Runs loop-filter jobs from the calling encode worker.
+//
+// cm:            common encoder state.
+// thread_data:   worker data; supplies lf_sync, lf_data and error_info.
+// enc_row_mt:    row-MT state; supplies the mutex, condition variable,
+//                row_mt_exit flag and per-SB-row completion counters.
+// mib_size_log2: log2 of the superblock size in mi units.
+//
+// Jobs are taken from lf_sync until none remain. In multi-threaded builds each
+// job first waits until the job's superblock row and the one below it have
+// been encoded in all tile columns. Returns early if row_mt_exit is set.
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+ AV1EncRowMultiThreadInfo *enc_row_mt,
+ int mib_size_log2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ AV1LfMTInfo *cur_job_info;
+ bool row_mt_exit = false;
+ // enc_row_mt is only used when CONFIG_MULTITHREAD is enabled.
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+
+ while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+ LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+ const int lpf_opt_level = cur_job_info->lpf_opt_level;
+ // sb_rows is only used when CONFIG_MULTITHREAD is enabled.
+ (void)sb_rows;
+#if CONFIG_MULTITHREAD
+ const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
+ // Clamped so that the last superblock row only waits on itself.
+ const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
+ // Wait for current and next superblock row to finish encoding.
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ while (!enc_row_mt->row_mt_exit &&
+ (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+ enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
+ pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
+ }
+ row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // Stop filtering altogether if row_mt_exit was flagged.
+ if (row_mt_exit) return;
+
+ av1_thread_loop_filter_rows(
+ lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+ cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+ lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+ lf_data->tx_buf, mib_size_log2);
+ }
+}
+
+// Marks every superblock row of every tile as completely encoded by issuing a
+// "last column done" sync write for it.
+//
+// In case of row-multithreading, due to top-right dependency, the worker on
+// an SB row waits for the completion of the encode of the top and top-right
+// SBs. Hence, in case a thread (main/worker) encounters an error, encoding of
+// every SB row in the frame is flagged as complete in order to avoid the
+// dependent workers of every tile from waiting indefinitely.
+static void set_encoding_done(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+  const int mib_size = cm->seq_params->mib_size;
+  const int num_tile_rows = cm->tiles.rows;
+  const int num_tile_cols = cm->tiles.cols;
+
+  // Tiles are stored row-major, so a running index visits them in order.
+  int tile_idx = 0;
+  for (int tr = 0; tr < num_tile_rows; ++tr) {
+    for (int tc = 0; tc < num_tile_cols; ++tc, ++tile_idx) {
+      TileDataEnc *const this_tile = &cpi->tile_data[tile_idx];
+      const TileInfo *const tile_info = &this_tile->tile_info;
+      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+      const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+
+      int sb_row_in_tile = 0;
+      for (int mi_row = tile_info->mi_row_start;
+           mi_row < tile_info->mi_row_end; mi_row += mib_size) {
+        enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+                                   sb_cols_in_tile - 1, sb_cols_in_tile);
+        ++sb_row_in_tile;
+      }
+    }
+  }
+}
+
+// Returns true when pipelining is requested (pipeline_lpf_mt_with_enc is
+// non-zero) and at least one of the two filter levels is non-zero.
+static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
+                                    const int filter_level[2]) {
+  if (!pipeline_lpf_mt_with_enc) return false;
+  return filter_level[0] != 0 || filter_level[1] != 0;
+}
+
+// Worker entry point for row-based multi-threaded encoding.
+//
+// arg1:   EncWorkerData of the calling worker.
+// unused: ignored.
+//
+// Repeatedly fetches a superblock row (from the worker's assigned tile first,
+// then from any other tile) and encodes it with av1_encode_sb_row(). When
+// loop filtering is pipelined with encoding, the worker then also runs
+// loop-filter jobs.
+// Returns 0 when this worker hit an error that was reported through
+// error_info (longjmp), and 1 otherwise, including when it stops early
+// because row_mt_exit was set.
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ AV1LfSync *const lf_sync = thread_data->lf_sync;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ // Errors raised through xd are routed to this worker's own error_info.
+ xd->error_info = error_info;
+ // These two locals are declared volatile and are read again inside the
+ // setjmp() error branch below.
+ AV1_COMMON *volatile const cm = &cpi->common;
+ volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled(
+ cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->row_mt_exit = true;
+ // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+ // case of an error.
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // Release any worker blocked on a top/top-right row dependency.
+ set_encoding_done(cpi);
+
+ if (do_pipelined_lpf_mt_with_enc) {
+ // Stop the pipelined loop filter as well and flag its vertical
+ // filtering as done.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+ lf_sync->lf_mt_exit = true;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+ av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+ cpi->common.seq_params->mib_size_log2);
+ }
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ int end_of_frame = 0;
+ bool row_mt_exit = false;
+
+ // When master thread does not have a valid job to process, xd->tile_ctx
+ // is not set and it contains NULL pointer. This can result in NULL pointer
+ // access violation if accessed beyond the encode stage. Hence,
+ // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+ // context to avoid NULL pointer access in subsequent stages.
+ thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
+ while (1) {
+ int current_mi_row = -1;
+ // Job selection reads and updates shared per-tile state, so it is done
+ // under the row-MT mutex.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_exit = enc_row_mt->row_mt_exit;
+ // row_mt_exit check here can be avoided as it is checked after
+ // sync_read_ptr() in encode_sb_row(). However, checking row_mt_exit here,
+ // tries to return before calling the function get_next_job().
+ if (!row_mt_exit &&
+ !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+ cm->seq_params->mib_size)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 0,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When row_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ // NOTE(review): this early return does not free td->pc_root here -
+ // confirm it is released elsewhere.
+ if (row_mt_exit) {
+ error_info->setjmp = 0;
+ return 1;
+ }
+
+ if (end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ const int tile_row = tile_info->tile_row;
+ const int tile_col = tile_info->tile_col;
+ ThreadData *td = thread_data->td;
+ const int sb_row = current_mi_row >> mib_size_log2;
+
+ assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
+ // Per-row setup: work on the thread's own context copy (td->tctx).
+ td->mb.e_mbd.tile_ctx = td->tctx;
+ td->mb.tile_pb_ctx = &this_tile->tctx;
+ td->abs_sum_level = 0;
+
+ // With CDF update allowed, the tile context is copied in only for the
+ // first row of the tile; otherwise it is copied in for every row.
+ if (this_tile->allow_update_cdf) {
+ td->mb.row_ctx = this_tile->row_ctx;
+ if (current_mi_row == tile_info->mi_row_start)
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ } else {
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ }
+
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+ &td->mb.e_mbd);
+
+ cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+ if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+ }
+
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
+ // Publish the result of this row under the row-MT mutex and wake the
+ // threads waiting on the condition variable.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ this_tile->abs_sum_level += td->abs_sum_level;
+ row_mt_sync->num_threads_working--;
+ enc_row_mt->num_tile_cols_done[sb_row]++;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ if (do_pipelined_lpf_mt_with_enc) {
+ // Loop-filter a superblock row if encoding of the current and next
+ // superblock row is complete.
+ // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+ // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
+ }
+ // Release the pc_tree preallocated above (if any).
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Worker entry point for tile-based multi-threaded encoding.
+//
+// arg1:   EncWorkerData of the calling worker.
+// unused: ignored.
+//
+// Encodes tiles thread_data->start, start + num_workers, start + 2 *
+// num_workers, ... with av1_encode_tile().
+// Returns 1 on normal completion and 0 when an error was reported through
+// error_info (longjmp).
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int t;
+
+ (void)unused;
+
+ // Errors raised through xd are routed to this worker's own error_info.
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ // Tiles are distributed across workers with a stride of num_workers.
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->mt_info.num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ // Both context pointers refer directly to the tile's own context.
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+ thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ // Release the pc_tree preallocated above (if any).
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Copies the worker pool description (workers, worker count, per-thread data)
+// from the primary multi-thread info into cpi->mt_info, and caps each
+// module's worker count, from MOD_FP up to NUM_MT_MODULES, at the total
+// number of workers.
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
+  const PrimaryMultiThreadInfo *const src = &ppi->p_mt_info;
+  MultiThreadInfo *const dst = &cpi->mt_info;
+
+  dst->workers = src->workers;
+  dst->num_workers = src->num_workers;
+  dst->tile_thr_data = src->tile_thr_data;
+
+  for (int mod = MOD_FP; mod < NUM_MT_MODULES; ++mod) {
+    dst->num_mod_workers[mod] =
+        AOMMIN(dst->num_workers, src->num_mod_workers[mod]);
+  }
+}
+
+// Allocates the CDEF multi-threading buffers in the primary multi-thread info
+// and makes cpi->mt_info.cdef_worker refer to them.
+void av1_init_cdef_worker(AV1_COMP *cpi) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+
+  // The allocation is done only for level 0 parallel frames. No change
+  // in config is supported in the middle of a parallel encode set, since the
+  // rest of the MT modules also do not support dynamic change of config.
+  if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
+
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int workers_to_alloc =
+      av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);
+
+  av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
+                         &mt_info->cdef_sync, workers_to_alloc, 1);
+  mt_info->cdef_worker = p_mt_info->cdef_worker;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Points the last loop-restoration worker's scratch buffers (rst_tmpbuf and
+// rlbs) at the ones owned by cm. Does nothing when lr_sync->sync_range is
+// zero or when the current frame has a frame-parallel level above 0.
+void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
+ if (lr_sync->sync_range) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ return;
+ int num_lr_workers =
+ av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+ assert(num_lr_workers <= lr_sync->num_workers);
+ // NOTE(review): assumes num_lr_workers >= 1; a value of 0 would index
+ // lrworkerdata[-1] - confirm with callers.
+ lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
+ lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
+ }
+}
+#endif
+
+#if CONFIG_MULTITHREAD
+// Lazily creates the synchronization objects used by the multi-threaded
+// encoder modules.
+//
+// cpi:           top-level encoder structure.
+// is_first_pass: non-zero when called for the first pass; then only the
+//                row-MT mutex and condition variable are created.
+//
+// Each mutex / condition variable is allocated only while its pointer is
+// still NULL. Allocation failures are reported through CHECK_MEM_ERROR on cm,
+// which is caught by the setjmp() below.
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ // Error path: forward the error recorded in cm->error to the primary
+ // encoder's error info.
+ // NOTE(review): if aom_internal_error_copy() returns instead of jumping,
+ // execution continues with the initialization below - confirm that it never
+ // returns here.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ aom_internal_error_copy(&cpi->ppi->error, cm->error);
+ }
+ cm->error->setjmp = 1;
+ // Initialize enc row MT object.
+ if (is_first_pass || cpi->oxcf.row_mt == 1) {
+ AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+ if (enc_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+ aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+ if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
+ }
+ if (enc_row_mt->cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
+ aom_malloc(sizeof(*(enc_row_mt->cond_))));
+ if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
+ }
+ }
+
+ if (!is_first_pass) {
+ // Initialize global motion MT object.
+ AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+ if (gm_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+ aom_malloc(sizeof(*(gm_sync->mutex_))));
+ if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+ }
+#if !CONFIG_REALTIME_ONLY
+ // Initialize temporal filtering MT object.
+ AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+ if (tf_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+ aom_malloc(sizeof(*tf_sync->mutex_)));
+ if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ // Initialize CDEF MT object.
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ if (cdef_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+ aom_malloc(sizeof(*(cdef_sync->mutex_))));
+ if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+ }
+
+ // Initialize loop filter MT object.
+ AV1LfSync *lf_sync = &mt_info->lf_row_sync;
+ // Number of superblock rows
+ // (derived from the frame height using MAX_MIB_SIZE_LOG2).
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2);
+ PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+ int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);
+
+ // Reallocate when nothing is allocated yet, the row count changed, or more
+ // workers are needed than were allocated.
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_lf_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
+ }
+
+ // Initialize tpl MT object.
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
+ if (tpl_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
+ aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
+ if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Initialize loop restoration MT object.
+ AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+ int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
+ int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+ // Reallocate only when the existing allocation is missing or too small.
+ if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+ num_lr_workers > lr_sync->num_workers ||
+ MAX_MB_PLANE > lr_sync->num_planes) {
+ av1_loop_restoration_dealloc(lr_sync);
+ av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+ MAX_MB_PLANE, cm->width);
+ }
+ }
+#endif
+
+ // Initialization of pack bitstream MT object.
+ AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
+ if (pack_bs_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+ aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+ if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+ }
+ }
+ cm->error->setjmp = 0;
+}
+#endif // CONFIG_MULTITHREAD
+
+ // Returns the worker count that memory allocations for the multi-threaded
+ // module 'mod_name' should be sized for under FPMT.
+ int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+                                       MULTI_THREADED_MODULES mod_name) {
+   const int fpmt_active = p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1;
+   if (!fpmt_active) return p_mt_info->num_mod_workers[mod_name];
+
+   // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
+   // As frame parallel jobs will only perform multi-threading for the encode
+   // stage, we can limit the allocations according to num_enc_workers per
+   // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]).
+   return p_mt_info->num_workers;
+ }
+
+ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {  // Allocates per-worker ThreadData and its buffers; failures are reported through ppi->error.
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ assert(p_mt_info->workers != NULL);
+ assert(p_mt_info->tile_thr_data != NULL);
+
+ int num_workers = p_mt_info->num_workers;
+ int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
+ assert(num_enc_workers <= num_workers);
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+ if (i > 0) {
+ // Allocate thread data. Worker 0 gets no ThreadData of its own in this function.
+ ThreadData *td;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td)));
+ av1_zero(*td);
+ thread_data->original_td = thread_data->td = td;  // Both fields start out pointing at the new allocation.
+
+ // Set up shared coeff buffers.
+ av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf,
+ &ppi->error);
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*td->tmp_conv_dst)));
+
+ if (i < p_mt_info->num_mod_workers[MOD_FP]) {  // Only workers within the MOD_FP worker count get a firstpass context.
+ // Set up firstpass PICK_MODE_CONTEXT.
+ td->firstpass_ctx =
+ av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf);
+ if (!td->firstpass_ctx)
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+
+ if (!is_first_pass && i < num_enc_workers) {  // Encode-stage buffers: skipped in first pass and for workers beyond num_enc_workers.
+ // Set up sms_tree.
+ if (av1_setup_sms_tree(ppi->cpi, td)) {
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate SMS tree");
+ }
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*td->hash_value_buffer[0][0])));
+
+ // Allocate frame counters in thread data.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->counts,
+ aom_calloc(1, sizeof(*td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer,
+ aom_memalign(16, sizeof(*td->palette_buffer)));
+
+ // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are
+ // used in inter frames to store intermediate inter mode prediction
+ // results and are not required for allintra encoding mode. Hence, the
+ // memory allocations for these buffers are avoided for allintra
+ // encoding mode.
+ if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ alloc_obmc_buffers(&td->obmc_buffer, &ppi->error);
+
+ alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer);
+
+ for (int j = 0; j < 2; ++j) {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->tmp_pred_bufs[j],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*td->tmp_pred_bufs[j])));
+ }
+ }
+
+ if (is_gradient_caching_for_hog_enabled(ppi->cpi)) {
+ const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info,
+ aom_malloc(sizeof(*td->pixel_gradient_info) *
+ plane_types * MAX_SB_SQUARE));
+ }
+
+ if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) {
+ const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size;
+ const int mi_count_in_sb =
+ mi_size_wide[sb_size] * mi_size_high[sb_size];
+
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->src_var_info_of_4x4_sub_blocks,
+ aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) *
+ mi_count_in_sb));
+ }
+
+ if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ const int num_64x64_blocks =
+ (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->vt64x64,
+ aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks));
+ }
+ }
+ }
+
+ if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {  // tctx is allocated only when row_mt == 1.
+ if (i == 0) {  // Index 0: allocate tctx in the 'td' of every parallel_cpi[] context instead of a worker ThreadData.
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
+ }
+ } else {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, thread_data->td->tctx,
+ (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
+ }
+ }
+ }
+
+ // Record the number of workers in encode stage multi-threading for which
+ // allocation is done.
+ p_mt_info->prev_num_enc_workers = num_enc_workers;
+ }
+
+ // Allocates the worker array and the per-worker data array for 'num_workers'
+ // workers, then initializes the workers in index order. reset() is called for
+ // every worker except worker 0; a failure there is reported through
+ // ppi->error. p_mt_info->num_workers is incremented once per worker set up.
+ void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
+   PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   assert(p_mt_info->num_workers == 0);
+
+   AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
+                       aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
+
+   AOM_CHECK_MEM_ERROR(
+       &ppi->error, p_mt_info->tile_thr_data,
+       aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));
+
+   int idx = 0;
+   while (idx < num_workers) {
+     AVxWorker *const cur_worker = p_mt_info->workers + idx;
+     EncWorkerData *const cur_data = p_mt_info->tile_thr_data + idx;
+
+     winterface->init(cur_worker);
+     cur_worker->thread_name = "aom enc worker";
+
+     // The starting tile of each thread is set to its thread id.
+     cur_data->thread_id = idx;
+     cur_data->start = idx;
+
+     // Create threads for all workers but the first.
+     const int needs_thread = idx > 0;
+     if (needs_thread && !winterface->reset(cur_worker)) {
+       aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
+                          "Tile encoder thread creation failed");
+     }
+     winterface->sync(cur_worker);
+
+     p_mt_info->num_workers += 1;
+     ++idx;
+   }
+ }
+
+ // Calls the worker interface's end() on every worker counted in
+ // ppi->p_mt_info.num_workers. This changes the state and frees the mutex of
+ // the corresponding workers and terminates the object. The object can not be
+ // re-used unless a call to reset() is made.
+ void av1_terminate_workers(AV1_PRIMARY *ppi) {
+   PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+   int idx = 0;
+   while (idx < p_mt_info->num_workers) {
+     aom_get_worker_interface()->end(p_mt_info->workers + idx);
+     ++idx;
+   }
+ }
+
+ // Returns 1 if frame parallel encode is supported for the current
+ // configuration, 0 otherwise. Every condition below must hold.
+ static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+   // FPMT is enabled for AOM_Q and AOM_VBR.
+   const int rc_mode_supported =
+       oxcf->rc_cfg.mode != AOM_CBR && oxcf->rc_cfg.mode != AOM_CQ;
+
+   // TODO(Tarun): Test and enable resize config.
+   return rc_mode_supported &&                            //
+          !ppi->use_svc &&                                //
+          !oxcf->tile_cfg.enable_large_scale_tile &&      //
+          !oxcf->dec_model_cfg.timing_info_present &&     //
+          oxcf->mode == GOOD &&                           //
+          !oxcf->tool_cfg.error_resilient_mode &&         //
+          !oxcf->resize_cfg.resize_mode &&                //
+          oxcf->pass == AOM_RC_SECOND_PASS &&             //
+          oxcf->max_threads >= 2 &&                       //
+          oxcf->fp_mt;
+ }
+
+ // Returns 1 when is_fpmt_config() accepts the configuration. Otherwise
+ // returns 0 and, if more than one frame parallel context was in use, resets
+ // the frame parallel state: the extra contexts drop their current frame
+ // buffer, the remaining gf_group entries are cleared and
+ // ppi->num_fp_contexts falls back to 1.
+ int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
+                           AV1EncoderConfig *const oxcf) {
+   if (is_fpmt_config(ppi, oxcf)) return 1;
+
+   // Reset frame parallel configuration for unsupported config. Nothing to
+   // undo when only a single context is in use.
+   if (ppi->num_fp_contexts <= 1) return 0;
+
+   // Release the previously-used frame-buffer of each additional context.
+   for (int ctx = 1; ctx < ppi->num_fp_contexts; ctx++) {
+     AV1_COMMON *const ctx_cm = &ppi->parallel_cpi[ctx]->common;
+     if (ctx_cm->cur_frame == NULL) continue;
+     --ctx_cm->cur_frame->ref_count;
+     ctx_cm->cur_frame = NULL;
+   }
+
+   // Clear the per-frame FPMT fields from the current gf index to the end of
+   // the group.
+   const int first_idx = ppi->cpi->gf_frame_index;
+   const int num_entries = AOMMAX(0, ppi->gf_group.size - first_idx);
+   av1_zero_array(&ppi->gf_group.frame_parallel_level[first_idx], num_entries);
+   av1_zero_array(&ppi->gf_group.is_frame_non_ref[first_idx], num_entries);
+   av1_zero_array(&ppi->gf_group.src_offset[first_idx], num_entries);
+   memset(&ppi->gf_group.skip_frame_refresh[first_idx][0], INVALID_IDX,
+          sizeof(ppi->gf_group.skip_frame_refresh[first_idx][0]) *
+              num_entries * REF_FRAMES);
+   memset(&ppi->gf_group.skip_frame_as_ref[first_idx], INVALID_IDX,
+          sizeof(ppi->gf_group.skip_frame_as_ref[first_idx]) * num_entries);
+   ppi->num_fp_contexts = 1;
+   return 0;
+ }
+
+ // A large value for threads used to compute the max num_enc_workers
+ // possible for each resolution. NOTE(review): no use of this macro is visible in the nearby code - confirm it is still needed.
+ #define MAX_THREADS 100
+
+ // Computes the max number of enc workers possible for each resolution: the
+ // smaller of the superblock row count and half the superblock column count
+ // (rounded up).
+ static AOM_INLINE int compute_max_num_enc_workers(
+     CommonModeInfoParams *const mi_params, int mib_size_log2) {
+   const int sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
+   const int sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);
+   const int half_sb_cols = (sb_cols + 1) >> 1;
+
+   return (half_sb_cols < sb_rows) ? half_sb_cols : sb_rows;
+ }
+
+ // Computes the number of frame parallel(fp) contexts to be created
+ // based on the number of max_enc_workers.
+ //
+ // Side effect: ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] is reset to 0 and
+ // is only given a non-zero worker count when more than one context is
+ // returned. Returns 1 when av1_check_fpmt_config() rejects the configuration.
+ // Requires ppi->cpi to be valid once the configuration has been accepted.
+ int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+   ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
+   if (!av1_check_fpmt_config(ppi, oxcf)) {
+     return 1;
+   }
+   // The frame-level context is read unconditionally below, so state the
+   // precondition once here instead of NULL-checking it after it has already
+   // been dereferenced.
+   assert(ppi->cpi != NULL);
+   AV1_COMMON *const cm = &ppi->cpi->common;
+   int max_num_enc_workers = compute_max_num_enc_workers(
+       &cm->mi_params, cm->seq_params->mib_size_log2);
+   // Scaling factors and rounding factors used to tune worker_per_frame
+   // computation.
+   int rounding_factor[2] = { 2, 4 };
+   int scaling_factor[2] = { 4, 8 };
+   int is_480p_or_lesser =
+       AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480;
+   int is_sb_64 = cm->seq_params->sb_size == BLOCK_64X64;
+   // A parallel frame encode has at least 1/4th the
+   // theoretical limit of max enc workers in default case. For resolutions
+   // larger than 480p, if SB size is 64x64, optimal performance is obtained with
+   // limit of 1/8.
+   int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0;
+   int workers_per_frame =
+       AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) /
+                     scaling_factor[index]);
+   int max_threads = oxcf->max_threads;
+   int num_fp_contexts = max_threads / workers_per_frame;
+   // Based on empirical results, FPMT gains with multi-tile are significant when
+   // more parallel frames are available. Use FPMT with multi-tile encode only
+   // when sufficient threads are available for parallel encode of
+   // MAX_PARALLEL_FRAMES frames.
+   if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
+     if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1;
+   }
+
+   num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES));
+   // Limit recalculated num_fp_contexts to ppi->num_fp_contexts.
+   num_fp_contexts = (ppi->num_fp_contexts == 1)
+                         ? num_fp_contexts
+                         : AOMMIN(num_fp_contexts, ppi->num_fp_contexts);
+   if (num_fp_contexts > 1) {
+     ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
+         AOMMIN(max_num_enc_workers * num_fp_contexts, max_threads);
+   }
+   return num_fp_contexts;
+ }
+
+ // Computes the number of workers to process each of the parallel frames:
+ // the number of level 2 workers per frame context, using floor division.
+ static AOM_INLINE int compute_num_workers_per_frame(
+     const int num_workers, const int parallel_frame_count) {
+   return num_workers / parallel_frame_count;
+ }
+
+ static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared);  // Forward declaration: prepare_fpmt_workers() calls this on its error path ahead of the definition.
+
+ // Prepare level 1 workers. This function is only called for
+ // parallel_frame_count > 1. This function populates the mt_info structure of
+ // frame level contexts appropriately by dividing the total number of available
+ // workers amongst the frames as level 2 workers. It also populates the hook and
+ // data members of level 1 workers. An error raised through a frame's 'common.error' restores the prepared frames and is copied to ppi->error.
+ static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
+ AV1_COMP_DATA *first_cpi_data,
+ AVxWorkerHook hook,
+ int parallel_frame_count) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ int num_workers = p_mt_info->num_workers;
+
+ volatile int frame_idx = 0;
+ volatile int i = 0;  // Read inside the setjmp() error branch below, i.e. after a longjmp().
+ while (i < num_workers) {
+ // Assign level 1 worker: the first worker of this frame's range doubles as the frame worker.
+ AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
+ &p_mt_info->workers[i];
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ // This 'aom_internal_error_info' pointer is not derived from the local
+ // pointer ('AV1_COMMON *const cm') to silence the compiler warning
+ // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]".
+ struct aom_internal_error_info *const error = cur_cpi->common.error;
+
+ // The jmp_buf is valid only within the scope of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error->jmp)) {
+ error->setjmp = 0;
+ restore_workers_after_fpmt(ppi, parallel_frame_count, i);  // Undo the buffer swaps for the 'i' workers handed out so far.
+ aom_internal_error_copy(&ppi->error, error);
+ }
+ error->setjmp = 1;
+
+ AV1_COMMON *const cm = &cur_cpi->common;
+ // Assign start of level 2 worker pool
+ mt_info->workers = &p_mt_info->workers[i];
+ mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
+ // Assign number of workers for each frame in the parallel encode set: remaining workers divided (floor) by remaining frames.
+ mt_info->num_workers = compute_num_workers_per_frame(
+ num_workers - i, parallel_frame_count - frame_idx);
+ for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
+ mt_info->num_mod_workers[j] =
+ AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]);
+ }
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker = &p_mt_info->cdef_worker[i];
+
+ // Back up the original cdef_worker pointers; restore_workers_after_fpmt() writes them back.
+ mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->restore_state_buf.cdef_colbuf[plane] =
+ mt_info->cdef_worker->colbuf[plane];
+ }
+ #if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Back up the original LR buffers before update.
+ int idx = i + mt_info->num_workers - 1;  // Index of the last worker in this frame's range.
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->restore_state_buf.rst_tmpbuf =
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+ mt_info->restore_state_buf.rlbs =
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+
+ // Update LR buffers.
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
+ }
+ #endif
+
+ i += mt_info->num_workers;  // Advance past this frame's workers before the allocation below, which may raise an error.
+
+ // At this stage, the thread specific CDEF buffers for the current frame's
+ // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has
+ // already been allocated across parallel frames.
+ av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
+ p_mt_info->num_workers, 0);
+
+ frame_worker->hook = hook;
+ frame_worker->data1 = cur_cpi;
+ frame_worker->data2 = (frame_idx == 0)
+ ? first_cpi_data
+ : &ppi->parallel_frames_data[frame_idx - 1];
+ frame_idx++;
+ error->setjmp = 0;
+ }
+ p_mt_info->p_num_workers = parallel_frame_count;
+ }
+
+ // Launch level 1 workers to perform frame parallel encode. Workers are
+ // started from the highest index down: every worker above index 0 goes
+ // through launch(), and worker 0 is run last through execute().
+ static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   const int frame_worker_count = ppi->p_mt_info.p_num_workers;
+
+   for (int idx = frame_worker_count - 1; idx > 0; --idx) {
+     winterface->launch(ppi->p_mt_info.p_workers[idx]);
+   }
+   if (frame_worker_count > 0) {
+     winterface->execute(ppi->p_mt_info.p_workers[0]);
+   }
+ }
+
+ // Restore worker states after parallel encode: writes back the cdef_worker and LR buffer pointers saved by prepare_fpmt_workers() for the frames covering the first 'num_fpmt_workers_prepared' workers.
+ static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+ (void)parallel_frame_count;  // Otherwise referenced only by the assert above.
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ int frame_idx = 0;
+ int i = 0;  // Index of the first worker belonging to frame 'frame_idx'.
+ while (i < num_fpmt_workers_prepared) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ const AV1_COMMON *const cm = &cur_cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the original cdef_worker pointers.
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->cdef_worker->colbuf[plane] =
+ mt_info->restore_state_buf.cdef_colbuf[plane];
+ }
+ #if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Restore the original LR buffers.
+ int idx = i + mt_info->num_workers - 1;  // Same index expression as the backup in prepare_fpmt_workers().
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+ mt_info->restore_state_buf.rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+ mt_info->restore_state_buf.rlbs;
+ }
+ #endif
+
+ frame_idx++;
+ i += mt_info->num_workers;  // Step to the first worker of the next frame.
+ }
+ }
+
+ // Synchronize level 1 workers: waits for every frame worker, restores the
+ // worker state changed for the parallel encode and, if any worker failed,
+ // copies that frame's error to ppi->error through aom_internal_error_copy().
+ static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi,
+                                          int frames_in_parallel_set) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   int num_workers = ppi->p_mt_info.p_num_workers;
+   int had_error = 0;
+   // Points to error in the earliest display order frame in the parallel set.
+   // Initialized so that it never holds an indeterminate value when no worker
+   // fails.
+   const struct aom_internal_error_info *error = NULL;
+
+   // Encoding ends. Workers are synced from the highest index down, so a
+   // failure at a lower index overwrites one recorded at a higher index.
+   for (int i = num_workers - 1; i >= 0; --i) {
+     AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+     if (!winterface->sync(worker)) {
+       had_error = 1;
+       error = ppi->parallel_cpi[i]->common.error;
+     }
+   }
+
+   // Worker state is restored before the error is propagated.
+   restore_workers_after_fpmt(ppi, frames_in_parallel_set,
+                              ppi->p_mt_info.num_workers);
+
+   if (had_error) aom_internal_error_copy(&ppi->error, error);
+ }
+
+ // Worker hook that runs av1_get_compressed_data() on the AV1_COMP passed as
+ // 'arg1' with the AV1_COMP_DATA passed as 'arg2'. Returns 1 when the call
+ // reports no error and 0 otherwise.
+ static int get_compressed_data_hook(void *arg1, void *arg2) {
+   AV1_COMP *const frame_cpi = (AV1_COMP *)arg1;
+   AV1_COMP_DATA *const frame_data = (AV1_COMP_DATA *)arg2;
+
+   // AOM_CODEC_OK(0) means no error.
+   return av1_get_compressed_data(frame_cpi, frame_data) == 0;
+ }
+
+ // Encodes the raw frame data for each frame in the parallel encode set and
+ // outputs the frame bit stream to the designated buffers. The steps are:
+ // initialize the parallel frame contexts, prepare, launch and sync the level 1
+ // workers, then release the references held for the set.
+ void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+                                   AV1_COMP_DATA *const first_cpi_data) {
+   // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
+   // corresponding to frames in the current parallel encode set.
+   int used_ref_bufs_mask = 0;
+   const int set_size = av1_init_parallel_frame_context(first_cpi_data, ppi,
+                                                        &used_ref_bufs_mask);
+
+   prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
+                        set_size);
+   launch_fpmt_workers(ppi);
+   sync_fpmt_workers(ppi, set_size);
+
+   // Release cpi->scaled_ref_buf corresponding to frames in the current
+   // parallel encode set.
+   int frame = 0;
+   while (frame < set_size) {
+     av1_release_scaled_references_fpmt(ppi->parallel_cpi[frame]);
+     ++frame;
+   }
+   av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
+                                 used_ref_bufs_mask);
+ }
+
+ // Clears 'had_error' on each of the first 'num_workers' workers and starts
+ // them from the highest index down: workers above index 0 go through
+ // launch(), and worker 0 is run last through execute().
+ static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
+                                       int num_workers) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+   for (int idx = num_workers - 1; idx > 0; --idx) {
+     AVxWorker *const cur = mt_info->workers + idx;
+     cur->had_error = 0;
+     winterface->launch(cur);
+   }
+   if (num_workers > 0) {
+     AVxWorker *const first = mt_info->workers;
+     first->had_error = 0;
+     winterface->execute(first);
+   }
+ }
+
+ // Waits for workers 1..num_workers-1 and, if worker 0 or any synced worker
+ // reported an error, copies one recorded error_info to cm->error through
+ // aom_internal_error_copy(). Afterwards the main thread's xd->error_info is
+ // pointed back at cm->error.
+ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
+                                         AV1_COMMON *const cm, int num_workers) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   const AVxWorker *const main_worker = &mt_info->workers[0];
+   struct aom_internal_error_info reported_error;
+   int any_error = main_worker->had_error;
+
+   // Read the error_info of main thread.
+   if (any_error) {
+     reported_error = ((EncWorkerData *)main_worker->data1)->error_info;
+   }
+
+   // Encoding ends. Workers are synced from the highest index down, so a
+   // failure at a lower index overwrites one recorded earlier.
+   int idx = num_workers - 1;
+   while (idx > 0) {
+     AVxWorker *const cur = mt_info->workers + idx;
+     if (!winterface->sync(cur)) {
+       any_error = 1;
+       reported_error = ((EncWorkerData *)cur->data1)->error_info;
+     }
+     --idx;
+   }
+
+   if (any_error) aom_internal_error_copy(cm->error, &reported_error);
+
+   // Restore xd->error_info of the main thread back to cm->error so that the
+   // multithreaded code, when executed using a single thread, has a valid
+   // xd->error_info.
+   EncWorkerData *const main_data = (EncWorkerData *)main_worker->data1;
+   main_data->td->mb.e_mbd.error_info = cm->error;
+ }
+
+ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,  // Folds per-worker results into cpi and frees the per-frame cost tables and mb data set up in prepare_enc_workers().
+ int num_workers) {
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used;
+ cpi->deltaq_used |= thread_data->td->deltaq_used;
+ // Accumulate rtc counters.
+ if (!frame_is_intra_only(&cpi->common))
+ av1_accumulate_rtc_counters(cpi, &thread_data->td->mb);
+ cpi->palette_pixel_num += thread_data->td->mb.palette_pixels;
+ if (thread_data->td != &cpi->td) {  // Only worker-owned ThreadData carries private cost-table copies.
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in prepare_enc_workers().
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->mv_costs_alloc);
+ thread_data->td->mv_costs_alloc = NULL;
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->dv_costs_alloc);
+ thread_data->td->dv_costs_alloc = NULL;
+ }
+ }
+ av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));
+
+ // Accumulate counters. Worker 0 is skipped: prepare_enc_workers() points its td at cpi->td itself.
+ if (i > 0) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txfm_search_info.txb_split_count +=
+ thread_data->td->mb.txfm_search_info.txb_split_count;
+ #if CONFIG_SPEED_STATS
+ cpi->td.mb.txfm_search_info.tx_search_count +=
+ thread_data->td->mb.txfm_search_info.tx_search_count;
+ #endif // CONFIG_SPEED_STATS
+ }
+ }
+ }
+
+ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,  // Binds 'hook' and per-thread data to the first 'num_workers' workers and seeds each worker's ThreadData from cpi->td.
+ int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1_COMMON *const cm = &cpi->common;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;  // Worker 0 works directly on the frame-level thread data.
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ thread_data->td->intrabc_used = 0;
+ thread_data->td->deltaq_used = 0;
+ thread_data->td->abs_sum_level = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;  // Struct copy; the pointer members patched below are redirected to this thread's own buffers.
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in accumulate_counters_enc_workers(), which frees the copies allocated here.
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mv_costs_alloc,
+ (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc)));
+ thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc;
+ memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
+ sizeof(MvCosts));
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ // Reset dv_costs to NULL for worker threads when dv cost update is
+ // enabled so that only dv_cost_upd_level needs to be checked before the
+ // aom_free() call for the same.
+ thread_data->td->mb.dv_costs = NULL;
+ if (av1_need_dv_costs(cpi)) {
+ CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(
+ sizeof(*thread_data->td->dv_costs_alloc)));
+ thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc;
+ memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+ sizeof(IntraBCMVCosts));
+ }
+ }
+ }
+ av1_alloc_mb_data(cpi, &thread_data->td->mb);
+
+ // Reset rtc counters.
+ av1_init_rtc_counters(&thread_data->td->mb);
+
+ thread_data->td->mb.palette_pixels = 0;
+
+ if (thread_data->td->counts != &cpi->counts) {  // Skip the copy when the thread's counts pointer is cpi->counts itself.
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
+ }
+
+ if (i > 0) {  // Point the copied 'mb' at this worker's own scratch buffers.
+ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+ thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_pred_bufs[j] =
+ thread_data->td->tmp_pred_bufs[j];
+ }
+ thread_data->td->mb.pixel_gradient_info =
+ thread_data->td->pixel_gradient_info;
+
+ thread_data->td->mb.src_var_info_of_4x4_sub_blocks =
+ thread_data->td->src_var_info_of_4x4_sub_blocks;
+
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_pred_bufs[j];
+ }
+ }
+ }
+ }
+
+ #if !CONFIG_REALTIME_ONLY
+ // First pass counterpart of prepare_enc_workers(): binds 'hook' and the
+ // per-thread data to each of the first 'num_workers' workers (highest index
+ // first), copies cpi->td.mb into every worker-owned ThreadData and allocates
+ // the source diff buffer for each thread's 'mb'.
+ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                               int num_workers) {
+   AV1_COMMON *const cm = &cpi->common;
+   MultiThreadInfo *const mt_info = &cpi->mt_info;
+   int idx = num_workers;
+   while (idx-- > 0) {
+     AVxWorker *const cur_worker = mt_info->workers + idx;
+     EncWorkerData *const cur_data = mt_info->tile_thr_data + idx;
+
+     cur_worker->hook = hook;
+     cur_worker->data1 = cur_data;
+     cur_worker->data2 = NULL;
+
+     // The starting tile of each thread is set to its thread id.
+     cur_data->thread_id = idx;
+     cur_data->start = idx;
+     cur_data->cpi = cpi;
+
+     // Worker 0 uses cpi->td directly; the others use their own ThreadData.
+     const int is_main = (idx == 0);
+     cur_data->td = is_main ? &cpi->td : cur_data->original_td;
+     // Before encoding a frame, copy the thread data from cpi.
+     if (!is_main) cur_data->td->mb = cpi->td.mb;
+
+     av1_alloc_src_diff_buf(cm, &cur_data->td->mb);
+   }
+ }
+ #endif
+
+ // Computes the number of workers for row multi-threading of encoding stage.
+ // Each tile contributes min(ceil(sb_cols / 2), sb_rows); the sum over all
+ // tiles is capped at 'max_threads'.
+ static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
+                                                      int max_threads) {
+   const int num_tile_rows = cm->tiles.rows;
+   const int num_tile_cols = cm->tiles.cols;
+   TileInfo tile;
+   int worker_total = 0;
+
+   for (int r = 0; r < num_tile_rows; r++) {
+     for (int c = 0; c < num_tile_cols; c++) {
+       av1_tile_init(&tile, cm, r, c);
+       const int sb_rows = av1_get_sb_rows_in_tile(cm, &tile);
+       const int sb_cols = av1_get_sb_cols_in_tile(cm, &tile);
+       const int half_sb_cols = (sb_cols + 1) >> 1;
+       worker_total += (half_sb_cols < sb_rows) ? half_sb_cols : sb_rows;
+     }
+   }
+   return (max_threads < worker_total) ? max_threads : worker_total;
+ }
+
+ // Computes the number of workers for tile multi-threading of encoding stage:
+ // the tile count, capped at 'max_threads'.
+ static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
+                                                       int max_threads) {
+   const int tile_count = cm->tiles.cols * cm->tiles.rows;
+   return (max_threads < tile_count) ? max_threads : tile_count;
+ }
+
+ // Finds the largest per-module worker count across all MT stages and returns
+ // it capped at cpi->oxcf.max_threads.
+ int av1_get_max_num_workers(const AV1_COMP *cpi) {
+   int largest = 0;
+   for (int mod = MOD_FP; mod < NUM_MT_MODULES; mod++) {
+     const int mod_workers = cpi->ppi->p_mt_info.num_mod_workers[mod];
+     if (mod_workers > largest) largest = mod_workers;
+   }
+   assert(largest >= 1);
+   return (largest < cpi->oxcf.max_threads) ? largest : cpi->oxcf.max_threads;
+ }
+
+ // Computes the number of workers for encoding stage (row/tile
+ // multi-threading). Returns 1 when 'max_workers' is 1 or less; otherwise the
+ // row-mt or the tile-mt count is used depending on cpi->oxcf.row_mt.
+ int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {
+   if (max_workers <= 1) return 1;
+   return cpi->oxcf.row_mt
+              ? compute_num_enc_row_mt_workers(&cpi->common, max_workers)
+              : compute_num_enc_tile_mt_workers(&cpi->common, max_workers);
+ }
+
+ // Encodes the tiles of the current frame with the encode-stage workers:
+ // (re)allocates tile data if fewer tiles are allocated than the frame has,
+ // initializes the tile data, then prepares, launches and syncs the workers
+ // and accumulates their counters into 'cpi'.
+ void av1_encode_tiles_mt(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   MultiThreadInfo *const mt_info = &cpi->mt_info;
+   const int tile_count = cm->tiles.cols * cm->tiles.rows;
+   const int enc_mod_workers = mt_info->num_mod_workers[MOD_ENC];
+
+   assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < tile_count));
+   if (cpi->allocated_tiles < tile_count) av1_alloc_tile_data(cpi);
+
+   av1_init_tile_data(cpi);
+   // Never use more workers than this frame context has been given.
+   const int num_workers = (enc_mod_workers < mt_info->num_workers)
+                               ? enc_mod_workers
+                               : mt_info->num_workers;
+
+   prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+   launch_workers(mt_info, num_workers);
+   sync_enc_workers(mt_info, cm, num_workers);
+   accumulate_counters_enc_workers(cpi, num_workers);
+ }
+
+ // Adds every counter in 'counts' to the matching counter in 'acc_counts'.
+ // FRAME_COUNTS consist solely of 'unsigned int' members, so both structures
+ // are treated as arrays and summed over their whole length.
+ void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+                                  const FRAME_COUNTS *counts) {
+   unsigned int *dst = (unsigned int *)acc_counts;
+   const unsigned int *src = (const unsigned int *)counts;
+   const unsigned int *const src_end =
+       src + sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+   for (; src != src_end; ++src, ++dst) *dst += *src;
+ }
+
+ // Computes the maximum number of sb rows and sb_cols across tiles which are
+ // used to allocate memory for multi-threaded encoding with row-mt=1. The two
+ // outputs are only ever raised, never lowered, so the caller provides their
+ // starting values.
+ static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm,
+                                                 int *max_sb_rows_in_tile,
+                                                 int *max_sb_cols_in_tile) {
+   const int mib_log2 = cm->seq_params->mib_size_log2;
+
+   // Tile rows.
+   const int num_tile_rows = cm->tiles.rows;
+   const int total_mi_rows = cm->mi_params.mi_rows;
+   const int *const sb_row_starts = cm->tiles.row_start_sb;
+   for (int r = 0; r < num_tile_rows; r++) {
+     const int start_mi = sb_row_starts[r] << mib_log2;
+     const int next_start_mi = sb_row_starts[r + 1] << mib_log2;
+     // The last tile may end before a full superblock row.
+     const int end_mi =
+         (next_start_mi < total_mi_rows) ? next_start_mi : total_mi_rows;
+     const int sb_rows = CEIL_POWER_OF_TWO(end_mi - start_mi, mib_log2);
+     if (sb_rows > *max_sb_rows_in_tile) *max_sb_rows_in_tile = sb_rows;
+   }
+
+   // Tile columns.
+   const int num_tile_cols = cm->tiles.cols;
+   const int total_mi_cols = cm->mi_params.mi_cols;
+   const int *const sb_col_starts = cm->tiles.col_start_sb;
+   for (int c = 0; c < num_tile_cols; c++) {
+     const int start_mi = sb_col_starts[c] << mib_log2;
+     const int next_start_mi = sb_col_starts[c + 1] << mib_log2;
+     const int end_mi =
+         (next_start_mi < total_mi_cols) ? next_start_mi : total_mi_cols;
+     const int sb_cols = CEIL_POWER_OF_TWO(end_mi - start_mi, mib_log2);
+     if (sb_cols > *max_sb_cols_in_tile) *max_sb_cols_in_tile = sb_cols;
+   }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+// Returns the number of workers to use for the firstpass stage (row/tile
+// multi-threading). Each tile contributes
+// min((unit columns + 1) / 2, unit rows) workers, and the sum is capped at
+// cpi->oxcf.max_threads. Returns 1 when at most one thread is allowed.
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_tile_cols = cm->tiles.cols;
+  const int num_tile_rows = cm->tiles.rows;
+  TileInfo tile;
+  int workers_needed = 0;
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
+
+  for (int r = 0; r < num_tile_rows; ++r) {
+    for (int c = 0; c < num_tile_cols; ++c) {
+      av1_tile_init(&tile, cm, r, c);
+      const int unit_rows =
+          av1_get_unit_rows_in_tile(&tile, cpi->fp_block_size);
+      const int unit_cols =
+          av1_get_unit_cols_in_tile(&tile, cpi->fp_block_size);
+      const int half_cols_rounded_up = (unit_cols + 1) >> 1;
+      workers_needed += AOMMIN(half_cols_rounded_up, unit_rows);
+    }
+  }
+  return AOMMIN(cpi->oxcf.max_threads, workers_needed);
+}
+
+// Returns the largest number of firstpass unit rows (units of height
+// 'fp_block_size') found in any tile row. Used for row multi-threading of the
+// firstpass stage.
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm,
+                                             BLOCK_SIZE fp_block_size) {
+  const int sb_log2 = cm->seq_params->mib_size_log2;
+  const int unit_log2 = mi_size_high_log2[fp_block_size];
+  const int frame_mi_rows = cm->mi_params.mi_rows;
+  const int num_tile_rows = cm->tiles.rows;
+  const int *const sb_row_starts = cm->tiles.row_start_sb;
+
+  int largest = 0;
+  for (int r = 0; r < num_tile_rows; ++r) {
+    // Tile extent in mi units, with the end clamped to the frame height.
+    const int first_mi = sb_row_starts[r] << sb_log2;
+    const int last_mi = AOMMIN(sb_row_starts[r + 1] << sb_log2, frame_mi_rows);
+    const int unit_rows = CEIL_POWER_OF_TWO(last_mi - first_mi, unit_log2);
+    largest = AOMMAX(largest, unit_rows);
+  }
+  return largest;
+}
+#endif
+
+// Decides whether loop-filtering is pipelined with the encode stage for the
+// current frame and stores the decision in mt_info->pipeline_lpf_mt_with_enc.
+// When pipelining is enabled and the loop-filter is in use, the filter level
+// is picked before encoding and the loop-filter data of workers
+// [0, num_workers) is set up.
+static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) {
+ // Pipelining of loop-filtering after encoding is enabled when loop-filter
+ // level is chosen based on quantizer and frame type. It is disabled in case
+ // of 'LOOPFILTER_SELECTIVELY' as the stats collected during encoding stage
+ // decides the filter level. Loop-filtering is disabled in case
+ // of non-reference frames and for frames with intra block copy tool enabled.
+ AV1_COMMON *cm = &cpi->common;
+ const int use_loopfilter = is_loopfilter_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_cdef = is_cdef_used(cm);
+ const int use_restoration = is_restoration_used(cm);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+ // All of the following must hold for pipelining: realtime mode, speed >= 5,
+ // filter level picked from Q, loop-filter control not selective, frame is
+ // not a non-reference frame, intrabc not allowed, and the loop-filter is
+ // not flagged to be skipped.
+ mt_info->pipeline_lpf_mt_with_enc =
+ (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) &&
+ (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) &&
+ (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) &&
+ !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc &&
+ ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0);
+
+ if (!mt_info->pipeline_lpf_mt_with_enc) return;
+
+ set_postproc_filter_default_params(cm);
+
+ if (!use_loopfilter) return;
+
+ const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick;
+ assert(method == LPF_PICK_FROM_Q);
+ assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY);
+
+ // The filter level is chosen here, i.e. before the encode workers run.
+ av1_pick_filter_level(cpi->source, cpi, method);
+
+ struct loopfilter *lf = &cm->lf;
+ const int plane_start = 0;
+ const int plane_end = av1_num_planes(cm);
+ int planes_to_lf[MAX_MB_PLANE];
+ if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc,
+ lf->filter_level)) {
+ set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ assert(lpf_opt_level == 2);
+
+ // The whole frame is covered: mi rows [0, mi_rows).
+ const int start_mi_row = 0;
+ const int end_mi_row = start_mi_row + cm->mi_params.mi_rows;
+
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ // The encode and loop-filter modules are expected to use the same number
+ // of workers when they are pipelined.
+ assert(mt_info->num_mod_workers[MOD_ENC] ==
+ mt_info->num_mod_workers[MOD_LPF]);
+ loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf,
+ mt_info->num_mod_workers[MOD_LPF],
+ &mt_info->lf_row_sync, lpf_opt_level,
+ cm->seq_params->mib_size_log2);
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ // Initialize loopfilter data
+ // Worker i uses entry i of the shared lf_row_sync's lfdata array.
+ thread_data->lf_sync = &mt_info->lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[i];
+ loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd);
+ }
+ }
+}
+
+// Encodes the tiles of the current frame with row-based multi-threading.
+// (Re)allocates tile data and row-mt memory when needed, resets the per-tile
+// row synchronization state, assigns tiles to threads, then launches the
+// encode workers, waits for them and accumulates their counters.
+void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int sb_rows_in_frame = get_sb_rows_in_frame(cm);
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0;
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+ compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile);
+ // Row-mt memory is reallocated whenever the tile layout or any of the
+ // recorded dimensions differs from what was allocated previously.
+ // NOTE(review): allocated_cols is compared against max_sb_cols_in_tile - 1;
+ // presumably row_mt_mem_alloc() records that value - confirm.
+ const bool alloc_row_mt_mem =
+ (enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_sb_rows_in_tile ||
+ enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) ||
+ enc_row_mt->allocated_sb_rows != sb_rows_in_frame);
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ // Whenever tile data is reallocated, row-mt memory is expected to be
+ // reallocated as well.
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile,
+ cpi->oxcf.algo_cfg.cdf_update_mode);
+ }
+
+ // Never use more workers than have been created.
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ lpf_pipeline_mt_init(cpi, num_workers);
+
+ av1_init_tile_data(cpi);
+
+ // memset with -1 sets every byte to 0xFF, i.e. each int entry becomes -1.
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame);
+ enc_row_mt->row_mt_exit = false;
+
+ // Reset the row synchronization state of every tile.
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+ row_mt_sync->intrabc_extra_top_right_sb_delay =
+ av1_get_intrabc_extra_top_right_sb_delay(cm);
+
+ av1_inter_mode_data_init(this_tile);
+ av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
+ this_tile->tile_info.mi_col_start,
+ this_tile->tile_info.mi_col_end, tile_row);
+ }
+ }
+
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ // The delta-lf update runs only after all workers have been synced.
+ if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Releases the src_diff buffers of every worker in [0, num_workers) whose
+// thread data is not the main thread's (cpi->td).
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  for (int idx = num_workers - 1; idx >= 0; --idx) {
+    ThreadData *const worker_td = mt->tile_thr_data[idx].td;
+    if (worker_td == &cpi->td) continue;
+    av1_dealloc_src_diff_buf(&worker_td->mb, av1_num_planes(&cpi->common));
+  }
+}
+
+// Runs the firstpass stage with row-based multi-threading. (Re)allocates tile
+// data and row-mt memory when needed, resets the per-tile row synchronization
+// state, assigns tiles to threads, launches the firstpass workers, waits for
+// them and finally frees the workers' src_diff buffers.
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int num_workers = 0;
+ int max_mb_rows = 0;
+
+ max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
+ // Row-mt memory is reallocated when the tile layout or the maximum number
+ // of unit rows per tile differs from the previous allocation.
+ const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_mb_rows;
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ // NOTE(review): -1 and 0 are passed where av1_encode_tiles_row_mt() passes
+ // the max sb column count and the cdf update mode; presumably they mean
+ // "not needed for firstpass" - confirm against row_mt_mem_alloc().
+ row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+ }
+
+ av1_init_tile_data(cpi);
+
+ // For pass = 1, compute the no. of workers needed. For single-pass encode
+ // (pass = 0), no. of workers are already computed.
+ if (mt_info->num_mod_workers[MOD_FP] == 0)
+ num_workers = av1_fp_compute_num_enc_workers(cpi);
+ else
+ num_workers = mt_info->num_mod_workers[MOD_FP];
+
+ // memset with -1 sets every byte to 0xFF, i.e. each int entry becomes -1.
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ enc_row_mt->firstpass_mt_exit = false;
+
+ // Reset the row synchronization state of every tile.
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+
+ // intraBC mode is not evaluated during first-pass encoding. Hence, no
+ // additional top-right delay is required.
+ row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ }
+ }
+
+ // Never use more workers than have been created.
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ dealloc_thread_data_src_diff_buf(cpi, num_workers);
+}
+
+// No-op variant of the tpl row-mt sync-read function: it ignores all of its
+// arguments and returns immediately.
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                    int r, int c) {
+  (void)c;
+  (void)r;
+  (void)tpl_mt_sync;
+}
+
+// No-op variant of the tpl row-mt sync-write function: it ignores all of its
+// arguments and returns immediately.
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+                                     int r, int c, int cols) {
+  (void)cols;
+  (void)c;
+  (void)r;
+  (void)tpl_mt_sync;
+}
+
+// Blocks the caller, which is about to process column 'c' of row 'r', until
+// the row above has progressed far enough, i.e. until
+// c <= num_finished_cols[r - 1] - sync_range. Row 0 has no row above it and
+// never waits. Without CONFIG_MULTITHREAD this function does nothing.
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+
+ if (r) {
+ // The mutex and condition variable of the row above guard its progress.
+ pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ // The condition is re-checked in a loop after every wake-up.
+ while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
+ pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Publishes that column 'c' of row 'r' (which has 'cols' columns) has been
+// processed and signals the condition variable of row 'r'. For columns before
+// the last one, the update is made only when c is a multiple of sync_range.
+// Without CONFIG_MULTITHREAD this function does nothing.
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c, int cols) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+ int cur;
+ // Only signal when there are enough encoded blocks for next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ // Last column of the row: publish cols + nsync so that the wait condition
+ // in av1_tpl_row_mt_sync_read() is satisfied for every column up to
+ // 'cols'.
+ cur = cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // num_finished_cols[r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ tpl_row_mt_sync->num_finished_cols[r] =
+ AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Marks the tpl processing of every block row in the frame as complete by
+// invoking the row-mt sync-write callback with the last column of each row.
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+  const int row_step = mi_size_high[bsize];
+  const int num_tplb_cols =
+      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+  const int last_tplb_col = num_tplb_cols - 1;
+
+  // With tpl row multi-threading, the worker on an mb_row waits for the tpl
+  // processing of the top and top-right blocks because of the top-right
+  // dependency. If any thread (main/worker) hits an error, every mb_row is
+  // reported as finished here so that dependent workers do not wait
+  // indefinitely.
+  int tplb_row = 0;
+  for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += row_step) {
+    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  last_tplb_col, num_tplb_cols);
+    ++tplb_row;
+  }
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+// 'arg1' is the worker's EncWorkerData; the second argument is unused.
+// Block rows are interleaved across workers: a worker starts at row
+// thread_data->start and advances by the number of active workers.
+// Returns 1 on success and 0 when an error was raised through
+// error_info->jmp.
+static int tpl_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+ TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+ // Errors raised through xd->error_info are reported into this worker's own
+ // error_info.
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: flag the exit under the mutex, then report every row as
+ // finished so that other workers do not wait on this one.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tpl_error_mutex_);
+ tpl_row_mt->tpl_mt_exit = true;
+ pthread_mutex_unlock(tpl_error_mutex_);
+#endif
+ set_mode_estimation_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ TX_SIZE tx_size = max_txsize_lookup[bsize];
+ int mi_height = mi_size_high[bsize];
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
+ mi_row += num_active_workers * mi_height) {
+ // Motion estimation row boundary
+ av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+ cpi->oxcf.border_in_pixels);
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
+ bsize, tx_size);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Deallocate tpl synchronization related mutex and data.
+// Destroys and frees the per-row mutexes and condition variables (when
+// CONFIG_MULTITHREAD is enabled), frees num_finished_cols and zeroes the
+// whole structure. 'tpl_sync' must not be NULL.
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
+ assert(tpl_sync != NULL);
+
+#if CONFIG_MULTITHREAD
+ // tpl_sync->rows is the number of mutex / condition variable entries.
+ if (tpl_sync->mutex_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_mutex_destroy(&tpl_sync->mutex_[i]);
+ aom_free(tpl_sync->mutex_);
+ }
+ if (tpl_sync->cond_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_cond_destroy(&tpl_sync->cond_[i]);
+ aom_free(tpl_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+
+ aom_free(tpl_sync->num_finished_cols);
+ // clear the structure as the source of this call may be a resize in which
+ // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*tpl_sync);
+}
+
+// Allocate memory for tpl row synchronization.
+// Allocates 'mb_rows' mutexes and condition variables (when
+// CONFIG_MULTITHREAD is enabled) and 'mb_rows' num_finished_cols entries,
+// records mb_rows in tpl_sync->rows and sets sync_range to 1. Allocation
+// failures are reported through CHECK_MEM_ERROR on 'cm'.
+void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
+ int mb_rows) {
+ tpl_sync->rows = mb_rows;
+#if CONFIG_MULTITHREAD
+ {
+ CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
+ aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
+ if (tpl_sync->mutex_) {
+ // NOTE(review): the return value of pthread_mutex_init() is not checked.
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
+ }
+
+ CHECK_MEM_ERROR(cm, tpl_sync->cond_,
+ aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
+ if (tpl_sync->cond_) {
+ // NOTE(review): the return value of pthread_cond_init() is not checked.
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_cond_init(&tpl_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ // The entries are not initialized here; av1_mc_flow_dispenser_mt() sets
+ // them to -1 before launching the workers.
+ CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
+ aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));
+
+ // Set up nsync.
+ tpl_sync->sync_range = 1;
+}
+
+// Each worker is prepared by assigning the hook function and individual thread
+// data. Worker 0 uses the main thread data (cpi->td); every other worker uses
+// its own 'original_td', which receives a copy of cpi->td.mb and freshly
+// allocated tpl temporary buffers.
+static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Index of the first tpl block row handled by this worker;
+ // tpl_worker_hook() starts at row 'start' and strides by the worker count.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tpl, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
+ cpi->ppi->tpl_data.tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+ // The struct copy above made mb.tmp_conv_dst refer to the main thread's
+ // buffer; point it back at this worker's own buffer.
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ }
+ }
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// Folds the tpl transform stats gathered by every worker thread into the
+// stats of the main thread ('main_td'). The main thread's own entry is
+// skipped.
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+                                      const MultiThreadInfo *mt_info,
+                                      int num_workers) {
+  TplTxfmStats *const total = &main_td->tpl_txfm_stats;
+  for (int idx = num_workers - 1; idx >= 0; --idx) {
+    AVxWorker *const worker = &mt_info->workers[idx];
+    EncWorkerData *const data = (EncWorkerData *)worker->data1;
+    ThreadData *const worker_td = data->td;
+    if (worker_td == main_td) continue;
+    av1_accumulate_tpl_txfm_stats(&worker_td->tpl_txfm_stats, total);
+  }
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Implements multi-threading for tpl.
+// Reallocates the row synchronization data when the number of MB rows
+// changed, resets the per-row progress, runs tpl_worker_hook() on the workers
+// and frees the workers' temporary tpl buffers afterwards.
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
+ int mb_rows = mi_params->mb_rows;
+ // Never use more workers than have been created.
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
+
+ if (mb_rows != tpl_sync->rows) {
+ av1_tpl_dealloc(tpl_sync);
+ av1_tpl_alloc(tpl_sync, cm, mb_rows);
+ }
+ // tpl_worker_hook() reads num_threads_working as its row stride.
+ tpl_sync->num_threads_working = num_workers;
+ mt_info->tpl_row_mt.tpl_mt_exit = false;
+
+ // Initialize num_finished_cols to -1 for all MB rows.
+ memset(tpl_sync->num_finished_cols, -1,
+ sizeof(*tpl_sync->num_finished_cols) * mb_rows);
+
+ prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+#if CONFIG_BITRATE_ACCURACY
+ tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
+#endif // CONFIG_BITRATE_ACCURACY
+ // Free the temporary buffers that prepare_tpl_workers() allocated for the
+ // non-main threads.
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ }
+}
+
+// Deallocate memory for temporal filter multi-thread synchronization.
+// Destroys and frees the mutex (when one was allocated) and resets the row
+// dispenser (next_tf_row) to 0. The mutex pointer is cleared after the free
+// so that a repeated call, or a later NULL check on tf_sync->mutex_, cannot
+// touch the released memory. 'tf_sync' must not be NULL.
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
+  assert(tf_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (tf_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(tf_sync->mutex_);
+    aom_free(tf_sync->mutex_);
+    // Avoid leaving a dangling pointer behind (double destroy/free).
+    tf_sync->mutex_ = NULL;
+  }
+#endif  // CONFIG_MULTITHREAD
+  tf_sync->next_tf_row = 0;
+}
+
+// Checks if a job is available. If a job is available, stores the row to
+// process in *current_mb_row, advances next_tf_row and returns 1; otherwise
+// returns 0. No job is handed out once tf_mt_exit is set or all 'mb_rows'
+// rows have been dispensed. With CONFIG_MULTITHREAD the check and update are
+// done under tf_mt_sync->mutex_.
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+ int *current_mb_row, int mb_rows) {
+ int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+ pthread_mutex_lock(tf_mutex_);
+#endif
+ if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
+ *current_mb_row = tf_mt_sync->next_tf_row;
+ tf_mt_sync->next_tf_row++;
+ do_next_row = 1;
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+// 'arg1' is the worker's EncWorkerData; the second argument is unused.
+// Rows are fetched one at a time through tf_get_next_job() and filtered with
+// av1_tf_do_filtering_row(). Returns 1 on success and 0 when an error was
+// raised through error_info->jmp.
+static int tf_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ ThreadData *td = thread_data->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+ const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: flag the exit under the mutex so that tf_get_next_job()
+ // stops handing out rows.
+ // NOTE(review): this path returns without calling tf_restore_state().
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tf_mutex_);
+ tf_sync->tf_mt_exit = true;
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ // Save the macroblockd state before it is set up for filtering; it is
+ // restored once all rows have been processed.
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ int current_mb_row = -1;
+
+ while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+ av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+// Also resets the shared row dispenser (next_tf_row) and the exit flag.
+// Worker 0 uses the main thread data (cpi->td); every other worker uses its
+// own 'original_td', which receives a copy of cpi->td.mb and freshly
+// allocated temporal filter data.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers, int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->tf_sync.next_tf_row = 0;
+ mt_info->tf_sync.tf_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the start index to the worker index.
+ // NOTE(review): tf_worker_hook() does not read 'start'; rows are handed
+ // out by tf_get_next_job().
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
+ cpi->tf_ctx.num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ }
+ }
+}
+
+// Frees the temporal filter data owned by each worker in [0, num_workers).
+// The main thread's data (cpi->td) is left untouched.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+                                   int is_highbitdepth) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  for (int idx = num_workers - 1; idx >= 0; --idx) {
+    ThreadData *const worker_td = mt->tile_thr_data[idx].td;
+    if (worker_td == &cpi->td) continue;
+    tf_dealloc_data(&worker_td->tf_data, is_highbitdepth);
+  }
+}
+
+// Adds the sse and sum collected by each worker thread during temporal
+// filtering to the totals kept in the main thread's data (cpi->td).
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+  FRAME_DIFF *const total = &cpi->td.tf_data.diff;
+  for (int idx = num_workers - 1; idx >= 0; --idx) {
+    AVxWorker *const worker = &cpi->mt_info.workers[idx];
+    EncWorkerData *const data = (EncWorkerData *)worker->data1;
+    ThreadData *const worker_td = data->td;
+    // The main thread already accumulates directly into 'total'.
+    if (worker_td == &cpi->td) continue;
+
+    const FRAME_DIFF *const part = &worker_td->tf_data.diff;
+    total->sse += part->sse;
+    total->sum += part->sum;
+  }
+}
+
+// Runs the temporal filter with multiple threads: prepares the workers,
+// launches them, waits for completion, accumulates the per-thread frame
+// differences and frees the per-thread filter data.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  AV1_COMMON *const common = &cpi->common;
+  const int high_bitdepth = cpi->tf_ctx.is_highbitdepth;
+
+  // Never use more workers than have been created.
+  const int workers = AOMMIN(mt->num_mod_workers[MOD_TF], mt->num_workers);
+
+  prepare_tf_workers(cpi, tf_worker_hook, workers, high_bitdepth);
+  launch_workers(mt, workers);
+  sync_enc_workers(mt, common, workers);
+  tf_accumulate_frame_diff(cpi, workers);
+  tf_dealloc_thread_data(cpi, workers, high_bitdepth);
+}
+
+// Looks for a pending global motion job in direction 'cur_dir'. When one
+// exists, writes its reference frame to *frame_idx, advances the direction's
+// job cursor and returns 1. Returns 0 when the direction has no frames left
+// or its early-exit flag is set.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+                                      int cur_dir) {
+  JobInfo *const jobs = &cpi->mt_info.gm_sync.job_info;
+  GlobalMotionInfo *const gm = &cpi->gm_info;
+
+  const int num_refs = gm->num_ref_frames[cur_dir];
+  const int8_t next = jobs->next_frame_to_process[cur_dir];
+
+  if (next >= num_refs || jobs->early_exit[cur_dir]) return 0;
+
+  *frame_idx = gm->reference_frames[cur_dir][next].frame;
+  jobs->next_frame_to_process[cur_dir] += 1;
+  return 1;
+}
+
+// Unless the speed feature 'prune_ref_frame_for_gm_search' is set, flips
+// *cur_dir to the other direction and asks get_next_gm_job() for a job in
+// that direction.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+                                        int *cur_dir) {
+  if (!cpi->sf.gm_sf.prune_ref_frame_for_gm_search) {
+    const int other_dir = !(*cur_dir);
+    *cur_dir = other_dir;
+    get_next_gm_job(cpi, frame_idx, *cur_dir);
+  }
+}
+
+// Hook function for each thread in global motion multi-threading.
+// 'arg1' is the worker's EncWorkerData; the second argument is unused.
+// The worker repeatedly fetches a reference frame for its direction (or the
+// other direction when its own is exhausted) and computes global motion for
+// it, until no job is left or gm_mt_exit is set. Returns 1 on success and 0
+// when an error was raised through error_info->jmp.
+static int gm_mt_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+ JobInfo *job_info = &gm_sync->job_info;
+ int thread_id = thread_data->thread_id;
+ GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
+#endif
+
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: flag the exit under the mutex so that the other workers
+ // stop fetching jobs.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+ gm_sync->gm_mt_exit = true;
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ // Direction assigned to this thread by assign_thread_to_dir().
+ int cur_dir = job_info->thread_id_to_dir[thread_id];
+ bool gm_mt_exit = false;
+ while (1) {
+ // Stays -1 when no job could be fetched.
+ int ref_buf_idx = -1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+
+ // The exit flag is sampled while the mutex is held.
+ gm_mt_exit = gm_sync->gm_mt_exit;
+ // Populates ref_buf_idx(the reference frame type) for which global motion
+ // estimation will be done.
+ if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+ // No jobs are available for the current direction. Switch
+ // to other direction and get the next job, if available.
+ switch_direction(cpi, &ref_buf_idx, &cur_dir);
+ }
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+
+ // When gm_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (gm_mt_exit || ref_buf_idx == -1) break;
+
+ // Compute global motion for the given ref_buf_idx.
+ // This call is made without holding the mutex.
+ av1_compute_gm_for_valid_ref_frames(
+ cpi, error_info, gm_info->ref_buf, ref_buf_idx,
+ gm_thread_data->motion_models, gm_thread_data->segment_map,
+ gm_info->segment_map_w, gm_info->segment_map_h);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+ // If global motion w.r.t. current ref frame is
+ // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+ // the remaining ref frames in that direction.
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+ cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION)
+ job_info->early_exit[cur_dir] = 1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Sets up each of the first 'num_workers' workers for global motion
+// estimation: installs the hook, attaches the per-thread data and allocates
+// global motion data for every thread other than the main one. Also clears
+// the shared gm_mt_exit flag.
+static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                          int num_workers) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  mt->gm_sync.gm_mt_exit = false;
+
+  for (int idx = num_workers - 1; idx >= 0; --idx) {
+    AVxWorker *const worker = &mt->workers[idx];
+    EncWorkerData *const data = &mt->tile_thr_data[idx];
+
+    worker->hook = hook;
+    worker->data1 = data;
+    worker->data2 = NULL;
+
+    // Both the thread id and the start index equal the worker index.
+    data->thread_id = idx;
+    data->start = idx;
+    data->cpi = cpi;
+
+    // Worker 0 shares the main thread data; the others use their own.
+    data->td = (idx == 0) ? &cpi->td : data->original_td;
+
+    if (data->td != &cpi->td) {
+      gm_alloc_data(cpi, &data->td->gm_data);
+    }
+  }
+}
+
+// Distributes the available threads over the past/future directions in
+// round-robin order: thread i gets direction i modulo MAX_DIRECTIONS.
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+                                            int num_workers) {
+  int8_t dir = 0;
+  int idx = 0;
+  while (idx < num_workers) {
+    thread_id_to_dir[idx] = dir;
+    ++dir;
+    // Wrap around once every direction has been handed out.
+    if (dir == MAX_DIRECTIONS) dir = 0;
+    ++idx;
+  }
+}
+
+// Returns the number of workers for global motion multi-threading: one per
+// reference frame, limited to MAX_DIRECTIONS when
+// 'prune_ref_frame_for_gm_search' is set, and never more than the number of
+// created workers.
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+  const int refs_in_both_dirs =
+      cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+
+  int wanted;
+  if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) {
+    wanted = AOMMIN(MAX_DIRECTIONS, refs_in_both_dirs);
+  } else {
+    wanted = refs_in_both_dirs;
+  }
+
+  wanted = AOMMIN(wanted, cpi->mt_info.num_workers);
+  return wanted;
+}
+
+// Releases the global motion data of every worker in [0, num_workers) whose
+// thread data is not the main thread's (cpi->td).
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  int idx = 0;
+  while (idx < num_workers) {
+    ThreadData *const worker_td = mt->tile_thr_data[idx].td;
+    if (worker_td != &cpi->td) {
+      gm_dealloc_data(&worker_td->gm_data);
+    }
+    ++idx;
+  }
+}
+
+// Runs global motion estimation with multiple threads: clears the job
+// bookkeeping, assigns a direction to each thread, runs gm_mt_worker_hook()
+// on the workers, waits for them and frees the per-thread data.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  JobInfo *const jobs = &mt->gm_sync.job_info;
+
+  // Start from a clean job state.
+  av1_zero(*jobs);
+
+  const int workers = compute_gm_workers(cpi);
+
+  assign_thread_to_dir(jobs->thread_id_to_dir, workers);
+  prepare_gm_workers(cpi, gm_mt_worker_hook, workers);
+  launch_workers(mt, workers);
+  sync_enc_workers(mt, &cpi->common, workers);
+  gm_dealloc_thread_data(cpi, workers);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Claims the next mi row to process. On success stores the row in
+// *current_mi_row, advances next_mi_row by mib_size, increments
+// num_threads_working and returns 1. Returns 0 once next_mi_row has reached
+// mi_row_end. The function itself takes no lock.
+static AOM_INLINE int get_next_job_allintra(
+    AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end,
+    int *current_mi_row, int mib_size) {
+  if (row_mt_sync->next_mi_row >= mi_row_end) return 0;
+
+  *current_mi_row = row_mt_sync->next_mi_row;
+  row_mt_sync->next_mi_row += mib_size;
+  row_mt_sync->num_threads_working++;
+  return 1;
+}
+
+// Configures the first num_workers workers for the wiener variance stage.
+// Worker 0 runs on the main ThreadData; every other worker runs on its
+// original_td, which receives a copy of the main MACROBLOCK state and its own
+// wiener variance prediction buffer.
+static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
+                                                  AVxWorkerHook hook,
+                                                  const int num_workers) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  ThreadData *const main_td = &cpi->td;
+
+  for (int idx = num_workers; idx-- > 0;) {
+    EncWorkerData *const data = &mt->tile_thr_data[idx];
+    AVxWorker *const w = &mt->workers[idx];
+
+    w->hook = hook;
+    w->data1 = data;
+    w->data2 = NULL;
+
+    data->thread_id = idx;
+    // This preprocessing stage does not work on tiles, so the starting tile
+    // is always 0.
+    data->start = 0;
+    data->cpi = cpi;
+    data->td = (idx == 0) ? main_td : data->original_td;
+
+    if (data->td == main_td) continue;
+    data->td->mb = main_td->mb;
+    av1_alloc_mb_wiener_var_pred_buf(&cpi->common, data->td);
+  }
+}
+
+// Marks the wiener variance computation of every row in the frame as
+// complete, so that workers depending on a row never wait indefinitely.
+// Called from the worker hook's error path.
+static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {
+  assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL);
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  const int row_step = mi_size_wide[cpi->weber_bsize];
+  const int unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+  const int unit_cols = (mi_params->mi_cols + (unit_step >> 1)) / unit_step;
+  const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+  AV1EncRowMultiThreadSync *const sync = &cpi->ppi->intra_row_mt_sync;
+
+  // Signal the last column of each row through the sync-write callback.
+  int row_idx = 0;
+  int mi_row = 0;
+  while (mi_row < mi_params->mi_rows) {
+    intra_mt->intra_sync_write_ptr(sync, row_idx, unit_cols - 1, unit_cols);
+    mi_row += row_step;
+    ++row_idx;
+  }
+}
+
+// Worker hook that computes the wiener variance of the frame row by row.
+// arg1 is the worker's EncWorkerData; the second argument is unused.
+// Each worker repeatedly claims the next unprocessed mi row and processes it
+// with av1_calc_mb_wiener_var_row() until no rows remain or
+// mb_wiener_mt_exit has been set by a failing worker.
+// Returns 1 on normal completion and 0 if an error longjmp'ed back here.
+static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
+#endif
+
+ // Route errors raised through xd to this worker's own error_info.
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: ask the other workers to stop taking jobs and mark all rows
+ // as done so that nobody blocks waiting on this worker.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+ enc_row_mt->mb_wiener_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ set_mb_wiener_var_calc_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+ // Per-worker scratch buffers handed to av1_calc_mb_wiener_var_row().
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ // Local accumulators only; they are not reported back to the caller (see
+ // the TODO below).
+ double sum_rec_distortion = 0;
+ double sum_est_rate = 0;
+ while (1) {
+ int current_mi_row = -1;
+ // Claim the next row under the mutex; no job is handed out once the exit
+ // flag is set.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ int has_jobs = enc_row_mt->mb_wiener_mt_exit
+ ? 0
+ : get_next_job_allintra(intra_row_mt_sync,
+ cpi->common.mi_params.mi_rows,
+ &current_mi_row, mb_step);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ if (!has_jobs) break;
+ // TODO(chengchen): properly accumulate the distortion and rate.
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+ qcoeff, dqcoeff, &sum_rec_distortion,
+ &sum_est_rate,
+ thread_data->td->wiener_tmp_pred_buf);
+ // Undo the increment done by get_next_job_allintra() for this row.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ intra_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Frees the row synchronization memory of the wiener variance stage and the
+// prediction buffer of every worker that does not run on the main ThreadData.
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+  av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+  EncWorkerData *const workers_data = cpi->mt_info.tile_thr_data;
+  for (int idx = 0; idx < num_workers; ++idx) {
+    ThreadData *const worker_td = workers_data[idx].td;
+    if (worker_td == &cpi->td) continue;
+    av1_dealloc_mb_wiener_var_pred_buf(worker_td);
+  }
+}
+
+// Multi-threaded computation of the wiener variance.
+// The wiener variance is used in allintra mode (1 pass) and is computed before
+// the frame is encoded, so the tile layout does not matter here: all available
+// workers are assigned to this stage.
+// sum_rec_distortion and sum_est_rate are currently unused.
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+                               double *sum_rec_distortion,
+                               double *sum_est_rate) {
+  (void)sum_est_rate;
+  (void)sum_rec_distortion;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncRowMultiThreadSync *const sync = &cpi->ppi->intra_row_mt_sync;
+
+  // TODO(chengchen): the memory usage could be improved.
+  const int mi_rows = cm->mi_params.mi_rows;
+  row_mt_sync_mem_alloc(sync, cm, mi_rows);
+
+  // Reset the row dispenser and the per-row progress counters.
+  sync->next_mi_row = 0;
+  sync->num_threads_working = num_workers;
+  sync->intrabc_extra_top_right_sb_delay = 0;
+  memset(sync->num_finished_cols, -1,
+         sizeof(*sync->num_finished_cols) * mi_rows);
+  cpi->mt_info.enc_row_mt.mb_wiener_mt_exit = false;
+
+  prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
+  launch_workers(&cpi->mt_info, num_workers);
+  sync_enc_workers(&cpi->mt_info, cm, num_workers);
+  dealloc_mb_wiener_var_mt_data(cpi, num_workers);
+}
+
+// qsort() comparator ordering tiles by descending absolute sum of tx coeffs
+// (abs_sum_level); ties are broken by ascending tile_idx.
+// Returns a negative value if a sorts before b, a positive value if a sorts
+// after b, and 0 when both keys are equal (e.g. an element compared with
+// itself), as required of a consistent qsort() comparison function.
+static int compare_tile_order(const void *a, const void *b) {
+  const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+  const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+  // Primary key: the tile with the larger abs_sum_level comes first.
+  if (tile_a->abs_sum_level != tile_b->abs_sum_level)
+    return tile_a->abs_sum_level > tile_b->abs_sum_level ? -1 : 1;
+
+  // Secondary key: the lower tile index comes first.
+  return (tile_a->tile_idx > tile_b->tile_idx) -
+         (tile_a->tile_idx < tile_b->tile_idx);
+}
+
+// Returns the index of the next tile to pack, following the order stored in
+// pack_bs_tile_order, and advances next_job_idx. Returns -1 once all
+// num_tiles jobs have been handed out.
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+    AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+  assert(pack_bs_sync->next_job_idx <= num_tiles);
+  if (pack_bs_sync->next_job_idx != num_tiles) {
+    const PackBSTileOrder *const entry =
+        &pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx];
+    pack_bs_sync->next_job_idx++;
+    return entry->tile_idx;
+  }
+  return -1;
+}
+
+// Returns the size of the bitstream chunk assigned to one tile or tile group.
+// A non-final chunk receives max_buf_size scaled by
+// tg_or_tile_size / frame_or_tg_size; the final chunk receives whatever is
+// left. *remain_buf_size is reduced by the returned size.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+                                           const int frame_or_tg_size,
+                                           size_t *remain_buf_size,
+                                           size_t max_buf_size,
+                                           int is_last_chunk) {
+  size_t chunk;
+  assert(*remain_buf_size > 0);
+  if (!is_last_chunk) {
+    // 64-bit intermediate product for the proration.
+    const uint64_t scaled = (uint64_t)max_buf_size * tg_or_tile_size;
+    chunk = (size_t)(scaled / frame_or_tg_size);
+    *remain_buf_size -= chunk;
+    assert(*remain_buf_size > 0);
+  } else {
+    chunk = *remain_buf_size;
+    *remain_buf_size = 0;
+  }
+  assert(chunk > 0);
+  return chunk;
+}
+
+// Initializes params required for pack bitstream tile.
+// Fills one PackBSParams entry per tile in pack_bs_params_arr in three steps:
+// 1. records tile group membership flags, tile position and tile size;
+// 2. prorates the bitstream buffer starting at dst (cpi->available_bs_size
+// bytes) among the tile groups by their size in mi units and writes the
+// headers of each group via av1_write_obu_tg_tile_headers();
+// 3. prorates each tile group buffer among the tiles of the group and sets
+// the destination pointers of every tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ PackBSParams *const pack_bs_params_arr,
+ uint8_t obu_extn_header) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ // Tile group size in terms of number of tiles.
+ const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+ uint8_t *tile_dst = dst;
+ uint8_t *tile_data_curr = dst;
+ // Max tile group count can not be more than MAX_TILES.
+ int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units
+ int tile_idx;
+ int tg_idx = 0;
+ int tile_count_in_tg = 0;
+ int new_tg = 1;
+
+ // Populate pack bitstream params of all tiles.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ // Calculate tile size in mi units.
+ const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+ (tile_info->mi_row_end - tile_info->mi_row_start);
+ int is_last_tile_in_tg = 0;
+ tile_count_in_tg++;
+ // A tile group ends when it is full or when the frame has no more tiles.
+ if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+ is_last_tile_in_tg = 1;
+
+ // Populate pack bitstream params of this tile.
+ pack_bs_params->curr_tg_hdr_size = 0;
+ pack_bs_params->obu_extn_header = obu_extn_header;
+ pack_bs_params->saved_wb = saved_wb;
+ pack_bs_params->obu_header_size = 0;
+ pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params->new_tg = new_tg;
+ pack_bs_params->tile_col = tile_info->tile_col;
+ pack_bs_params->tile_row = tile_info->tile_row;
+ pack_bs_params->tile_size_mi = tile_size_mi;
+ tg_size_mi[tg_idx] += tile_size_mi;
+
+ // new_tg is set only for the first tile of a tile group.
+ if (new_tg) new_tg = 0;
+ if (is_last_tile_in_tg) {
+ tile_count_in_tg = 0;
+ new_tg = 1;
+ tg_idx++;
+ }
+ }
+
+ assert(cpi->available_bs_size > 0);
+ size_t tg_buf_size[MAX_TILES] = { 0 };
+ size_t max_buf_size = cpi->available_bs_size;
+ size_t remain_buf_size = max_buf_size;
+ const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+ tile_idx = 0;
+ // Prepare obu, tile group and frame header of each tile group.
+ // NOTE(review): tile_idx advances by tg_size_in_tiles for cpi->num_tg
+ // iterations; presumably callers guarantee that every visited index refers
+ // to a populated first-tile entry -- confirm against the call sites.
+ for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ int is_last_tg = tg_idx == cpi->num_tg - 1;
+ // Prorate bitstream buffer size based on tile group size and available
+ // buffer size. This buffer will be used to store headers and tile data.
+ tg_buf_size[tg_idx] =
+ get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+ max_buf_size, is_last_tg);
+
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_dst;
+
+ // Write obu, tile group and frame header at first tile in the tile
+ // group.
+ av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+ tile_dst += tg_buf_size[tg_idx];
+
+ // Exclude headers from tile group buffer size.
+ tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+ tile_idx += tg_size_in_tiles;
+ }
+
+ tg_idx = 0;
+ // Calculate bitstream buffer size of each tile in the tile group.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+ // Restart the proration budget at the beginning of every tile group.
+ if (pack_bs_params->new_tg) {
+ max_buf_size = tg_buf_size[tg_idx];
+ remain_buf_size = max_buf_size;
+ }
+
+ // Prorate bitstream buffer size of this tile based on tile size and
+ // available buffer size. For this proration, header size is not accounted.
+ const size_t tile_buf_size = get_bs_chunk_size(
+ pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+ max_buf_size, pack_bs_params->is_last_tile_in_tg);
+ pack_bs_params->tile_buf_size = tile_buf_size;
+
+ // Update base address of bitstream buffer for tile and tile group.
+ if (pack_bs_params->new_tg) {
+ tile_dst = pack_bs_params->dst;
+ tile_data_curr = pack_bs_params->tile_data_curr;
+ // Account header size in first tile of a tile group.
+ pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+ } else {
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_data_curr;
+ }
+
+ if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+ tile_dst += pack_bs_params->tile_buf_size;
+ }
+}
+
+// Worker hook function of pack bitstream multithreading.
+// arg1 is the worker's EncWorkerData and arg2 the PackBSParams array indexed
+// by tile. Each worker repeatedly fetches the next tile index and packs that
+// tile with av1_pack_tile_info() until no tiles remain or pack_bs_mt_exit is
+// set. Returns 1 on normal completion and 0 if an error longjmp'ed back here.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+ AV1_COMP *const cpi = thread_data->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+ // Route errors raised through xd to this worker's own error_info.
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: tell the other workers to stop fetching jobs.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+ pack_bs_sync->pack_bs_mt_exit = true;
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ while (1) {
+ // Fetch the next job while holding the mutex.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+#endif
+ const int tile_idx =
+ pack_bs_sync->pack_bs_mt_exit
+ ? -1
+ : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ // When pack_bs_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (tile_idx == -1) break;
+ // Point the worker's entropy context at the context of the tile to pack.
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+ av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+ }
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Prepares thread data and workers of pack bitstream multithreading.
+// Every worker gets the hook, its EncWorkerData and the shared PackBSParams
+// array. The job queue is then rebuilt: tiles are ordered by descending
+// abs_sum_level (ties by ascending tile index) so that the tiles with the
+// largest coefficient sums are dispatched first.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+                                    PackBSParams *const pack_bs_params,
+                                    AVxWorkerHook hook, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int idx = num_workers; idx-- > 0;) {
+    EncWorkerData *const data = &mt_info->tile_thr_data[idx];
+    AVxWorker *const w = &mt_info->workers[idx];
+
+    // Worker 0 runs on the main ThreadData; the others start from a copy of
+    // the main MACROBLOCK state.
+    data->td = (idx == 0) ? &cpi->td : data->original_td;
+    if (data->td != &cpi->td) data->td->mb = cpi->td.mb;
+
+    data->cpi = cpi;
+    data->start = idx;
+    data->thread_id = idx;
+    av1_reset_pack_bs_thread_data(data->td);
+
+    w->hook = hook;
+    w->data1 = data;
+    w->data2 = pack_bs_params;
+  }
+
+  AV1EncPackBSSync *const sync = &mt_info->pack_bs_sync;
+  const uint16_t tile_count = cpi->common.tiles.rows * cpi->common.tiles.cols;
+  sync->pack_bs_mt_exit = false;
+  sync->next_job_idx = 0;
+
+  // Rebuild the tile order table from scratch.
+  PackBSTileOrder *const order = sync->pack_bs_tile_order;
+  av1_zero_array(order, tile_count);
+  for (uint16_t t = 0; t < tile_count; ++t) {
+    order[t].tile_idx = t;
+    order[t].abs_sum_level = cpi->tile_data[t].abs_sum_level;
+  }
+
+  // Sort tiles in descending order of abs_sum_level.
+  qsort(order, tile_count, sizeof(*order), compare_tile_order);
+}
+
+// Accumulates data after pack bitstream processing.
+// Walks the tiles in index order, compacts the per-tile bitstream chunks so
+// that they are contiguous in dst, tracks the largest tile, lets
+// av1_write_last_tile_info() finish each tile group, adds every tile's size
+// to *total_size and finally accumulates the per-worker thread data.
+static void accumulate_pack_bs_data(
+ AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+ uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id, unsigned int *max_tile_size,
+ uint32_t *const obu_header_size, uint8_t **tile_data_start,
+ const int num_workers) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_count = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ size_t curr_tg_data_size = 0;
+ int is_first_tg = 1;
+ uint8_t *curr_tg_start = dst;
+ // src_offset: where the tile's chunk was written (advances by the reserved
+ // tile_buf_size); dst_offset: where it belongs after compaction (advances
+ // by the bytes actually used).
+ size_t src_offset = 0;
+ size_t dst_offset = 0;
+
+ for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ uint32_t tile_size = 0;
+
+ // A new tile group starts at the current end of the compacted data.
+ if (pack_bs_params->new_tg) {
+ curr_tg_start = dst + *total_size;
+ curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params->obu_header_size;
+ }
+ // Every tile but the last of a tile group contributes 4 extra bytes.
+ curr_tg_data_size +=
+ pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+ if (pack_bs_params->buf.size > *max_tile_size) {
+ *largest_tile_id = tile_idx;
+ *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+ }
+ tile_size +=
+ (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+ // Pack all the chunks of tile bitstreams together
+ if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+ if (pack_bs_params->is_last_tile_in_tg)
+ av1_write_last_tile_info(
+ cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+ curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+ &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+ src_offset += pack_bs_params->tile_buf_size;
+ dst_offset += tile_size;
+ *total_size += tile_size;
+ }
+
+ // Accumulate thread data
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int idx = num_workers - 1; idx >= 0; idx--) {
+ ThreadData const *td = mt_info->tile_thr_data[idx].td;
+ av1_accumulate_pack_bs_thread_data(cpi, td);
+ }
+}
+
+// Multi-threaded tile OBU writer: sets up the per-tile pack parameters,
+// runs the pack bitstream workers and then merges their output into dst.
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start, const int num_workers) {
+  PackBSParams pack_bs_params[MAX_TILES];
+  uint32_t tile_size[MAX_TILES] = { 0 };
+
+  // Give every tile its own size accumulator.
+  for (int i = 0; i < MAX_TILES; ++i)
+    pack_bs_params[i].total_size = tile_size + i;
+
+  init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+  prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+                          num_workers);
+
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, &cpi->common, num_workers);
+
+  accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+                          largest_tile_id, max_tile_size, obu_header_size,
+                          tile_data_start, num_workers);
+}
+
+// Deallocate memory for CDEF search multi-thread synchronization.
+// In CONFIG_MULTITHREAD builds this destroys and frees the mutex owned by
+// cdef_sync and resets the pointer to NULL, so that a repeated call is a
+// no-op instead of a second destroy/free of the same mutex.
+// cdef_sync must not be NULL.
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
+  (void)cdef_sync;
+  assert(cdef_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(cdef_sync->mutex_);
+    aom_free(cdef_sync->mutex_);
+    // Do not leave a dangling pointer behind.
+    cdef_sync->mutex_ = NULL;
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Advances (fbr, fbc) to the next block in raster order over a grid of
+// nvfb rows by nhfb columns. Sets end_of_frame once the last row has been
+// completed.
+static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
+  // Still inside the current row: nothing else to update.
+  if (++cdef_sync->fbc != nhfb) return;
+
+  if (++cdef_sync->fbr == nvfb) {
+    cdef_sync->end_of_frame = 1;
+  } else {
+    cdef_sync->fbc = 0;
+  }
+}
+
+// Resets the CDEF search job dispenser to the first block of the frame and
+// clears the exit flag.
+// NOTE(review): the mutex is (re)initialized on every call; confirm that it
+// is not already initialized where it is allocated.
+static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif  // CONFIG_MULTITHREAD
+  cdef_sync->cdef_mt_exit = false;
+  cdef_sync->fbc = 0;
+  cdef_sync->fbr = 0;
+  cdef_sync->end_of_frame = 0;
+}
+
+// Fetches the next CDEF search job while holding the cdef_sync mutex.
+// Blocks flagged as skip are passed over. When a job is available, its row,
+// column and running block count are stored in *cur_fbr, *cur_fbc and
+// *sb_count, the dispenser is advanced, and 1 is returned. Returns 0 when
+// the frame is finished or cdef_mt_exit is set.
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+                                        CdefSearchCtx *cdef_search_ctx,
+                                        volatile int *cur_fbr,
+                                        volatile int *cur_fbc,
+                                        volatile int *sb_count) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  const int num_fb_rows = cdef_search_ctx->nvfb;
+  const int num_fb_cols = cdef_search_ctx->nhfb;
+
+  // Skip over blocks that do not need processing.
+  for (;;) {
+    if (cdef_sync->cdef_mt_exit || cdef_sync->end_of_frame) break;
+    if (!cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+                      cdef_sync->fbc)) {
+      break;
+    }
+    update_next_job_info(cdef_sync, num_fb_rows, num_fb_cols);
+  }
+
+  // Hand out the current block and move the dispenser to the next one.
+  const int job_found =
+      !cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0;
+  if (job_found) {
+    *cur_fbr = cdef_sync->fbr;
+    *cur_fbc = cdef_sync->fbc;
+    *sb_count = cdef_search_ctx->sb_count++;
+    update_next_job_info(cdef_sync, num_fb_rows, num_fb_cols);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  return job_found;
+}
+
+// Hook function for each thread in CDEF search multi-threading.
+// arg1 is the worker's EncWorkerData and arg2 the shared AV1CdefSync. The
+// worker keeps fetching blocks with cdef_get_next_job() and runs
+// av1_cdef_mse_calc_block() on each of them. Returns 1 on normal completion
+// and 0 if an error longjmp'ed back here.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ // Error path: tell the other workers to stop fetching jobs.
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(cdef_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ // Declared volatile, presumably because they are modified between setjmp()
+ // and a possible longjmp() -- confirm.
+ volatile int cur_fbr, cur_fbc, sb_count;
+ while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+ &sb_count)) {
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+ sb_count);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Binds the CDEF search hook, the per-worker EncWorkerData and the shared
+// cdef_sync object to each of the first num_workers workers.
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+                                 int num_workers) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  AV1CdefSync *const shared_sync = &mt->cdef_sync;
+
+  for (int idx = num_workers; idx-- > 0;) {
+    EncWorkerData *const data = &mt->tile_thr_data[idx];
+    AVxWorker *const w = &mt->workers[idx];
+
+    data->cpi = cpi;
+    w->hook = hook;
+    w->data1 = data;
+    w->data2 = shared_sync;
+  }
+}
+
+// Multi-threaded CDEF search: resets the job dispenser, then runs the
+// workers configured for MOD_CDEF_SEARCH until all blocks are processed.
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+  MultiThreadInfo *const mt = &cpi->mt_info;
+  const int worker_count = mt->num_mod_workers[MOD_CDEF_SEARCH];
+
+  cdef_reset_job_info(&mt->cdef_sync);
+  prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, worker_count);
+  launch_workers(mt, worker_count);
+  sync_enc_workers(mt, &cpi->common, worker_count);
+}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
+  // For passes before AOM_RC_SECOND_PASS (e.g. single-pass encode), sizing
+  // the worker count by the tf block size was not found to improve speed, so
+  // the assignment follows av1_compute_num_enc_workers().
+  if (cpi->oxcf.pass < AOM_RC_SECOND_PASS) {
+    return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  }
+
+  if (cpi->oxcf.max_threads <= 1) return 1;
+
+  // One worker per row of temporal filter blocks, capped at max_threads.
+  const int mb_rows =
+      get_num_blocks(cpi->common.height, block_size_high[TF_BLOCK_SIZE]);
+  return AOMMIN(cpi->oxcf.max_threads, mb_rows);
+}
+
+// Computes num_workers for tpl multi-threading.
+// The count is the one returned by av1_compute_num_enc_workers() for
+// max_threads.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+  const int num_workers =
+      av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return num_workers;
+}
+
+// Computes num_workers for loop filter multi-threading.
+// The count is the one returned by av1_compute_num_enc_workers() for
+// max_threads.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+  const int num_workers =
+      av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return num_workers;
+}
+
+// Computes num_workers for cdef multi-threading.
+// The count is the one returned by av1_compute_num_enc_workers() for
+// max_threads.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+  const int num_workers =
+      av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return num_workers;
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+// The count is the one returned by av1_compute_num_enc_workers() for
+// max_threads.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+  const int num_workers =
+      av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return num_workers;
+}
+
+// Computes num_workers for pack bitstream multi-threading.
+// A single worker is used when max_threads <= 1; otherwise the count comes
+// from compute_num_enc_tile_mt_workers().
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+  return (cpi->oxcf.max_threads <= 1)
+             ? 1
+             : compute_num_enc_tile_mt_workers(&cpi->common,
+                                               cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for all intra multi-threading.
+// Returns 1 when multi-threading or row-mt is disabled. Otherwise sets
+// cpi->weber_bsize to BLOCK_8X8 as a side effect and returns one worker per
+// row of such blocks, capped at max_threads.
+static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) {
+  // The multi-threading implementation of deltaq-mode = 3 in allintra
+  // mode is based on row multi threading, so it also requires row_mt.
+  if (cpi->oxcf.max_threads <= 1 || !cpi->oxcf.row_mt) return 1;
+
+  cpi->weber_bsize = BLOCK_8X8;
+  const int mb_step = mi_size_wide[cpi->weber_bsize];
+  const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step;
+  return AOMMIN(num_mb_rows, cpi->oxcf.max_threads);
+}
+
+// Returns the number of workers to use for the multi-threaded module
+// mod_name. Unknown modules trigger an assert and yield 0.
+static int compute_num_mod_workers(AV1_COMP *cpi,
+                                   MULTI_THREADED_MODULES mod_name) {
+  switch (mod_name) {
+    case MOD_FP:
+      // No first-pass workers from the second pass onwards.
+      return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
+                 ? 0
+                 : av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+    case MOD_TF: return compute_num_tf_workers(cpi);
+    case MOD_TPL: return compute_num_tpl_workers(cpi);
+    case MOD_GME: return 1;
+    case MOD_ENC:
+      return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+    case MOD_LPF: return compute_num_lf_workers(cpi);
+    case MOD_CDEF_SEARCH: return compute_num_cdef_workers(cpi);
+    case MOD_CDEF: return compute_num_cdef_workers(cpi);
+    case MOD_LR: return compute_num_lr_workers(cpi);
+    case MOD_PACK_BS: return compute_num_pack_bs_workers(cpi);
+    case MOD_FRAME_ENC:
+      // Keep the value already stored in the primary MT info.
+      return cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
+    case MOD_AI:
+      // All intra workers are only used for one-pass encoding.
+      return (cpi->oxcf.pass == AOM_RC_ONE_PASS) ? compute_num_ai_workers(cpi)
+                                                 : 0;
+    default: assert(0); break;
+  }
+  return 0;
+}
+// Computes the number of workers for each MT module in the encoder and
+// stores the results in cpi->ppi->p_mt_info.num_mod_workers.
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+  for (int mod = MOD_FP; mod < NUM_MT_MODULES; ++mod) {
+    const int workers =
+        compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)mod);
+    cpi->ppi->p_mt_info.num_mod_workers[mod] = workers;
+  }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 0000000000..468e120776
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+// Per-worker data handed to the encoder's multi-threading hooks.
+typedef struct EncWorkerData {
+ // Encoder instance this worker operates on.
+ struct AV1_COMP *cpi;
+ // Thread data used for the current job.
+ struct ThreadData *td;
+ // Thread data owned by this worker.
+ // NOTE(review): presumably td is reset to this (or to the main thread
+ // data for worker 0) before each job -- confirm in ethread.c.
+ struct ThreadData *original_td;
+ // Error state of this worker.
+ struct aom_internal_error_info error_info;
+ // NOTE(review): presumably loop filter synchronization and job data --
+ // confirm against the loop filter MT code.
+ AV1LfSync *lf_sync;
+ LFWorkerData *lf_data;
+ // NOTE(review): presumably the starting tile index of the worker -- confirm.
+ int start;
+ // NOTE(review): presumably the index of the worker -- confirm.
+ int thread_id;
+} EncWorkerData;
+
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c);
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols);
+
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c);
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c, int cols);
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+void av1_encode_tiles_row_mt(struct AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi);
+
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi);
+#endif
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ const struct FRAME_COUNTS *counts);
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync);
+
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c);
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c, int cols);
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c);
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c, int cols);
+
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi);
+
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
+
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate);
+
+void av1_tf_do_filtering_mt(AV1_COMP *cpi);
+
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
+
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(const AV1_COMP *cpi);
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
+
+void av1_terminate_workers(AV1_PRIMARY *ppi);
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
+
+void av1_init_cdef_worker(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi);
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
+#endif // CONFIG_MULTITHREAD
+
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+ MULTI_THREADED_MODULES mod_name);
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
+
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi);
+
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
+
+void av1_write_tile_obu_mt(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start, const int num_workers);
+
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
+
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf);
+
+void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..e1b1e69ca7
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+// Copies a w x h plane of 8-bit samples from src to dst and pads dst by
+// replicating edge samples: extend_left / extend_right columns on every row,
+// then extend_top / extend_bottom complete rows above and below.
+// chroma_step is the distance, in bytes, between consecutive source samples
+// on a row (1 means the source row is contiguous).
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+                                  uint8_t *dst, int dst_pitch, int w, int h,
+                                  int extend_top, int extend_left,
+                                  int extend_bottom, int extend_right,
+                                  int chroma_step) {
+  // Pass 1: produce each output row together with its left/right padding.
+  const uint8_t *src_row = src;
+  uint8_t *out_row = dst - extend_left;
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const last_sample = src_row + (w - 1) * chroma_step;
+    uint8_t *const body = out_row + extend_left;
+
+    memset(out_row, src_row[0], extend_left);
+    if (chroma_step != 1) {
+      // Strided source: gather one sample every chroma_step bytes.
+      for (int col = 0; col < w; ++col) {
+        body[col] = src_row[chroma_step * col];
+      }
+    } else {
+      memcpy(body, src_row, w);
+    }
+    memset(body + w, last_sample[0], extend_right);
+
+    src_row += src_pitch;
+    out_row += dst_pitch;
+  }
+
+  // Pass 2: replicate the first and last finished rows (padding included)
+  // into the top and bottom margins.
+  const int linesize = extend_left + extend_right + w;
+  assert(linesize <= dst_pitch);
+  const uint8_t *const first_row = dst - extend_left;
+  const uint8_t *const last_row = dst + dst_pitch * (h - 1) - extend_left;
+
+  uint8_t *above = dst + dst_pitch * (-extend_top) - extend_left;
+  for (int n = 0; n < extend_top; ++n, above += dst_pitch) {
+    memcpy(above, first_row, linesize);
+  }
+
+  uint8_t *below = dst + dst_pitch * h - extend_left;
+  for (int n = 0; n < extend_bottom; ++n, below += dst_pitch) {
+    memcpy(below, last_row, linesize);
+  }
+}
+
+// 16-bit counterpart of the 8-bit plane copy: src8/dst8 are converted with
+// CONVERT_TO_SHORTPTR() and all pitches, widths and extensions are counted in
+// 16-bit samples. Each row is copied with its left/right edge samples
+// replicated into the side borders, then the first and last finished rows are
+// replicated into the top and bottom borders.
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+                                         uint8_t *dst8, int dst_pitch, int w,
+                                         int h, int extend_top, int extend_left,
+                                         int extend_bottom, int extend_right) {
+  uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Pass 1: rows plus their left/right padding.
+  const uint16_t *src_row = src;
+  uint16_t *out_row = dst - extend_left;
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const body = out_row + extend_left;
+
+    aom_memset16(out_row, src_row[0], extend_left);
+    memcpy(body, src_row, w * sizeof(src_row[0]));
+    aom_memset16(body + w, src_row[w - 1], extend_right);
+
+    src_row += src_pitch;
+    out_row += dst_pitch;
+  }
+
+  // Pass 2: top and bottom margins are copies of the first / last padded row.
+  const int linesize = extend_left + extend_right + w;
+  assert(linesize <= dst_pitch);
+  const uint16_t *const first_row = dst - extend_left;
+  const uint16_t *const last_row = dst + dst_pitch * (h - 1) - extend_left;
+
+  uint16_t *above = dst + dst_pitch * (-extend_top) - extend_left;
+  for (int n = 0; n < extend_top; ++n, above += dst_pitch) {
+    memcpy(above, first_row, linesize * sizeof(first_row[0]));
+  }
+
+  uint16_t *below = dst + dst_pitch * h - extend_left;
+  for (int n = 0; n < extend_bottom; ++n, below += dst_pitch) {
+    memcpy(below, last_row, linesize * sizeof(last_row[0]));
+  }
+}
+
+// Copies all planes of `src` into `dst` and fills dst's borders by edge
+// replication.
+// Luma is extended by dst->border on the top and left. On the right/bottom it
+// is extended up to the larger of (source dimension + border) and
+// ALIGN_POWER_OF_TWO(source dimension, 6), measured from the cropped source
+// dimension. Chroma extensions are the luma ones shifted right by the
+// subsampling factors. Chroma planes are skipped for monochrome sources.
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  const int border = dst->border;
+  const int ss_x = src->subsampling_x;
+  const int ss_y = src->subsampling_y;
+
+  // Luma extension sizes.
+  const int top_y = border;
+  const int left_y = border;
+  const int right_y =
+      AOMMAX(src->y_width + border, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+      src->y_crop_width;
+  const int bottom_y =
+      AOMMAX(src->y_height + border, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+      src->y_crop_height;
+
+  // Chroma extension sizes.
+  const int top_uv = top_y >> ss_y;
+  const int left_uv = left_y >> ss_x;
+  const int bottom_uv = bottom_y >> ss_y;
+  const int right_uv = right_y >> ss_x;
+
+  if (!(src->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+                          dst->y_stride, src->y_crop_width, src->y_crop_height,
+                          top_y, left_y, bottom_y, right_y, 1);
+    if (src->monochrome) return;
+
+    // detect nv12 format: without a separate V buffer the U buffer is read
+    // with a step of 2 and V starts one byte after U.
+    const int chroma_step = src->v_buffer ? 1 : 2;
+    const uint8_t *const v_plane =
+        src->v_buffer ? src->v_buffer : src->u_buffer + 1;
+    copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+                          dst->uv_stride, src->uv_crop_width,
+                          src->uv_crop_height, top_uv, left_uv, bottom_uv,
+                          right_uv, chroma_step);
+    copy_and_extend_plane(v_plane, src->uv_stride, dst->v_buffer,
+                          dst->uv_stride, src->uv_crop_width,
+                          src->uv_crop_height, top_uv, left_uv, bottom_uv,
+                          right_uv, chroma_step);
+    return;
+  }
+
+  // High bit depth path.
+  highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+                               dst->y_stride, src->y_crop_width,
+                               src->y_crop_height, top_y, left_y, bottom_y,
+                               right_y);
+  if (src->monochrome) return;
+  highbd_copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+                               dst->uv_stride, src->uv_crop_width,
+                               src->uv_crop_height, top_uv, left_uv, bottom_uv,
+                               right_uv);
+  highbd_copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+                               dst->uv_stride, src->uv_crop_width,
+                               src->uv_crop_height, top_uv, left_uv, bottom_uv,
+                               right_uv);
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 0000000000..b8cc5b9d28
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, // Frame to copy from (read-only).
+ YV12_BUFFER_CONFIG *dst); // Receives the copied planes; its borders are filled by replicating edge samples.
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/external_partition.c b/third_party/aom/av1/encoder/external_partition.c
new file mode 100644
index 0000000000..79f8b4c8a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/encoder/external_partition.h"
+
+// Stores the callback table and configuration in the controller and asks the
+// create_model callback to build the external model.
+// Returns:
+//   AOM_CODEC_INVALID_PARAM  ext_part_controller is NULL.
+//   AOM_CODEC_ERROR          create_model reported AOM_EXT_PART_ERROR.
+//   AOM_CODEC_OK             otherwise. When create_model reports
+//                            AOM_EXT_PART_TEST the controller is flagged
+//                            test_mode = 1 / ready = 0; in the remaining case
+//                            it is flagged ready = 1.
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller) {
+  ExtPartController *const ctrl = ext_part_controller;
+  if (!ctrl) return AOM_CODEC_INVALID_PARAM;
+
+  ctrl->funcs = funcs;
+  ctrl->config = config;
+  const aom_ext_part_status_t status =
+      ctrl->funcs.create_model(ctrl->funcs.priv, &ctrl->config, &ctrl->model);
+
+  if (status == AOM_EXT_PART_ERROR) return AOM_CODEC_ERROR;
+
+  if (status == AOM_EXT_PART_TEST) {
+    ctrl->test_mode = 1;
+    ctrl->ready = 0;
+    return AOM_CODEC_OK;
+  }
+
+  assert(status == AOM_EXT_PART_OK);
+  ctrl->ready = 1;
+  return AOM_CODEC_OK;
+}
+
+// Resets an external partition controller to the all-zero state (ready,
+// test_mode, config, model and callback table cleared).
+// Returns AOM_CODEC_INVALID_PARAM when ext_part_controller is NULL and
+// AOM_CODEC_OK otherwise.
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  // av1_zero() clears sizeof(arg) bytes at &arg, so it must be given the
+  // structure itself. Passing the pointer only wiped the local pointer
+  // variable and left the controller (notably `ready` and the model handle)
+  // untouched.
+  av1_zero(*ext_part_controller);
+  return AOM_CODEC_OK;
+}
+
+// Releases the external model, if any, and then hands the controller to
+// av1_ext_part_init().
+// The delete_model callback is invoked only when the controller is marked
+// ready. Returns AOM_CODEC_INVALID_PARAM for a NULL controller,
+// AOM_CODEC_ERROR when delete_model does not report AOM_EXT_PART_OK, and
+// otherwise whatever av1_ext_part_init() returns.
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) {
+  ExtPartController *const ctrl = ext_part_controller;
+  if (!ctrl) return AOM_CODEC_INVALID_PARAM;
+
+  if (ctrl->ready &&
+      ctrl->funcs.delete_model(ctrl->model) != AOM_EXT_PART_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+  return av1_ext_part_init(ctrl);
+}
+
+// Asks the external model for a partition decision via the
+// get_partition_decision callback. The controller must be non-NULL and ready,
+// and `decision` must be non-NULL (checked by assertions only).
+// Returns true iff the callback reports AOM_EXT_PART_OK.
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(decision != NULL);
+  return ext_part_controller->funcs.get_partition_decision(
+             ext_part_controller->model, decision) == AOM_EXT_PART_OK;
+}
+
+// Forwards partition statistics to the external model via the
+// send_partition_stats callback. The controller must be non-NULL and ready,
+// and `stats` must be non-NULL (checked by assertions only).
+// Returns true iff the callback reports AOM_EXT_PART_OK.
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(stats != NULL);
+  return ext_part_controller->funcs.send_partition_stats(
+             ext_part_controller->model, stats) == AOM_EXT_PART_OK;
+}
+
+// Forwards partition features to the external model via the send_features
+// callback. The controller must be non-NULL and ready, and `features` must be
+// non-NULL (checked by assertions only).
+// Returns true iff the callback reports AOM_EXT_PART_OK.
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(features != NULL);
+  return ext_part_controller->funcs.send_features(ext_part_controller->model,
+                                                  features) == AOM_EXT_PART_OK;
+}
+
+// Returns the decision mode recorded in the controller's callback table.
+// The controller pointer is dereferenced without a NULL check.
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller) {
+  const aom_ext_part_funcs_t *const funcs = &ext_part_controller->funcs;
+  return funcs->decision_mode;
+}
diff --git a/third_party/aom/av1/encoder/external_partition.h b/third_party/aom/av1/encoder/external_partition.h
new file mode 100644
index 0000000000..f74973e9eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController { // State kept by the encoder for an external partition model.
+ int ready; // Set to 1 by av1_ext_part_create() when create_model reports AOM_EXT_PART_OK; 0 in test mode.
+ int test_mode; // Set to 1 by av1_ext_part_create() when create_model reports AOM_EXT_PART_TEST.
+ aom_ext_part_config_t config; // Copy of the configuration; its address is passed to create_model.
+ aom_ext_part_model_t model; // Model handle filled in by create_model and passed to the other callbacks.
+ aom_ext_part_funcs_t funcs; // Callback table (create_model, delete_model, send_*, get_partition_decision, ...).
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+ aom_ext_part_config_t config,
+ ExtPartController *ext_part_controller); // Stores funcs/config in the controller and calls create_model; AOM_CODEC_INVALID_PARAM on NULL controller.
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller); // Controller reset entry point; AOM_CODEC_INVALID_PARAM on NULL, else AOM_CODEC_OK.
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); // Calls delete_model when the controller is ready, then av1_ext_part_init().
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+ aom_partition_decision_t *decision); // True iff the get_partition_decision callback reports AOM_EXT_PART_OK.
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+ const aom_partition_stats_t *stats); // True iff the send_partition_stats callback reports AOM_EXT_PART_OK.
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+ const aom_partition_features_t *features); // True iff the send_features callback reports AOM_EXT_PART_OK.
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+ const ExtPartController *ext_part_controller); // Returns funcs.decision_mode of the controller.
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..e20b6c177e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,1600 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+
+#define FIRST_PASS_Q 10.0
+#define INTRA_MODE_PENALTY 1024
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
+// Publishes one first pass stats record as an AOM_CODEC_STATS_PKT packet.
+//   stats:   record to publish; the packet's buffer pointer refers to it.
+//   pktlist: destination packet list; nothing is queued when it is NULL.
+// When OUTPUT_FPF is enabled the record is also appended as one text line to
+// "firstpass.stt" for debugging.
+static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
+                                    struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt pkt;
+  pkt.kind = AOM_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+  {
+    FILE *fpfile;
+    fpfile = fopen("firstpass.stt", "a");
+
+    // fopen() returns NULL on failure (e.g. unwritable directory); skip the
+    // debug dump instead of handing a NULL stream to fprintf()/fclose().
+    if (fpfile != NULL) {
+      fprintf(fpfile,
+              "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+              stats->frame, stats->weight, stats->intra_error,
+              stats->coded_error, stats->sr_coded_error, stats->pcnt_inter,
+              stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral,
+              stats->intra_skip_pct, stats->inactive_zone_rows,
+              stats->inactive_zone_cols, stats->MVr, stats->mvr_abs,
+              stats->MVc, stats->mvc_abs, stats->MVrv, stats->MVcv,
+              stats->mv_in_out_count, stats->new_mv_count, stats->count,
+              stats->duration);
+      fclose(fpfile);
+    }
+  }
+#endif
+}
+
+// Resets a first pass stats record to its neutral starting values: every
+// accumulator is cleared, while `duration` and `cor_coeff` start at 1.0.
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
+  // Frame bookkeeping.
+  section->frame = 0.0;
+  section->weight = 0.0;
+  section->count = 0.0;
+  section->duration = 1.0;
+
+  // Error measures.
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+  section->log_intra_error = 0.0;
+  section->log_coded_error = 0.0;
+  section->frame_avg_wavelet_energy = 0.0;
+
+  // Block classification percentages and inactive zones.
+  section->pcnt_inter = 0.0;
+  section->pcnt_motion = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->intra_skip_pct = 0.0;
+  section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
+
+  // Motion vector statistics.
+  section->MVr = 0.0;
+  section->mvr_abs = 0.0;
+  section->MVc = 0.0;
+  section->mvc_abs = 0.0;
+  section->MVrv = 0.0;
+  section->MVcv = 0.0;
+  section->mv_in_out_count = 0.0;
+  section->new_mv_count = 0.0;
+
+  // Remaining per-frame fields.
+  section->is_flash = 0;
+  section->noise_var = 0;
+  section->cor_coeff = 1.0;
+}
+
+// Adds the stats of one frame into a running section total, field by field.
+// The two log_* fields accumulate log1p() of the frame's intra / coded error
+// rather than the frame's own log_* fields. is_flash, noise_var and cor_coeff
+// are not accumulated.
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+                          const FIRSTPASS_STATS *frame) {
+  // Frame bookkeeping.
+  section->frame += frame->frame;
+  section->weight += frame->weight;
+  section->count += frame->count;
+  section->duration += frame->duration;
+
+  // Error measures.
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+  section->log_intra_error += log1p(frame->intra_error);
+  section->log_coded_error += log1p(frame->coded_error);
+  section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
+
+  // Block classification percentages and inactive zones.
+  section->pcnt_inter += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->intra_skip_pct += frame->intra_skip_pct;
+  section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
+
+  // Motion vector statistics.
+  section->MVr += frame->MVr;
+  section->mvr_abs += frame->mvr_abs;
+  section->MVc += frame->MVc;
+  section->mvc_abs += frame->mvc_abs;
+  section->MVrv += frame->MVrv;
+  section->MVcv += frame->MVcv;
+  section->mv_in_out_count += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+}
+
+// Converts a row count expressed in 16x16 blocks into a row count expressed
+// in first pass blocks of size fp_block_size, by shifting with the difference
+// of the two block heights' log2 (in mi units).
+static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) {
+  // Positive when the first pass block is taller than BLOCK_16X16.
+  const int log2_delta =
+      mi_size_high_log2[fp_block_size] - mi_size_high_log2[BLOCK_16X16];
+  return (log2_delta > 0) ? (mb_rows >> log2_delta) : (mb_rows << -log2_delta);
+}
+
+// Converts a column count expressed in 16x16 blocks into a column count
+// expressed in first pass blocks of size fp_block_size, by shifting with the
+// difference of the two block widths' log2 (in mi units).
+static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) {
+  // Positive when the first pass block is wider than BLOCK_16X16.
+  const int log2_delta =
+      mi_size_wide_log2[fp_block_size] - mi_size_wide_log2[BLOCK_16X16];
+  return (log2_delta > 0) ? (mb_cols >> log2_delta) : (mb_cols << -log2_delta);
+}
+
+// TODO(chengchen): can we simplify it even if resize has to be considered?
+// Converts a count of 16x16 blocks into a count of first pass blocks of size
+// fp_block_size by shifting with the summed width and height log2 differences.
+static int get_num_mbs(const BLOCK_SIZE fp_block_size,
+                       const int num_mbs_16X16) {
+  const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+  const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+  // TODO(chengchen): Now this function assumes a square block is used.
+  // It does not support rectangular block sizes.
+  assert(width_mi_log2 == height_mi_log2);
+
+  // Positive deltas mean the first pass block is larger than BLOCK_16X16.
+  const int width_delta = width_mi_log2 - mi_size_wide_log2[BLOCK_16X16];
+  const int height_delta = height_mi_log2 - mi_size_high_log2[BLOCK_16X16];
+
+  // Only the width decides the shift direction, as in the square-block
+  // assumption above.
+  if (width_delta > 0) {
+    return num_mbs_16X16 >> (width_delta + height_delta);
+  }
+  return num_mbs_16X16 << (-width_delta - height_delta);
+}
+
+// Emits the accumulated total stats at the end of the first pass. Nothing is
+// emitted when there is no total stats record or when lap_enabled is set.
+void av1_end_first_pass(AV1_COMP *cpi) {
+  if (!cpi->ppi->twopass.stats_buf_ctx->total_stats) return;
+  if (cpi->ppi->lap_enabled) return;
+
+  output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats,
+               cpi->ppi->output_pkt_list);
+}
+
+// Selects the 8-bit MSE function for a block size. Sizes other than 8x8,
+// 16x8 and 8x16 use the 16x16 function.
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_8X8) return aom_mse8x8;
+  if (bsize == BLOCK_16X8) return aom_mse16x8;
+  if (bsize == BLOCK_8X16) return aom_mse8x16;
+  return aom_mse16x16;
+}
+
+// Returns the sum of squared errors between the source block and the
+// reference block, as written to the sse output of the MSE function chosen
+// for bsize.
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int sse;
+  get_block_variance_fn(bsize)(src->buf, src->stride, ref->buf, ref->stride,
+                               &sse);
+  return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Selects the high bit depth MSE function for a block size and bit depth.
+// Bit depths other than 10 and 12 use the 8-bit high bit depth functions;
+// block sizes other than 8x8, 16x8 and 8x16 use the 16x16 function.
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  if (bd == 10) {
+    if (bsize == BLOCK_8X8) return aom_highbd_10_mse8x8;
+    if (bsize == BLOCK_16X8) return aom_highbd_10_mse16x8;
+    if (bsize == BLOCK_8X16) return aom_highbd_10_mse8x16;
+    return aom_highbd_10_mse16x16;
+  }
+  if (bd == 12) {
+    if (bsize == BLOCK_8X8) return aom_highbd_12_mse8x8;
+    if (bsize == BLOCK_16X8) return aom_highbd_12_mse16x8;
+    if (bsize == BLOCK_8X16) return aom_highbd_12_mse8x16;
+    return aom_highbd_12_mse16x16;
+  }
+  if (bsize == BLOCK_8X8) return aom_highbd_8_mse8x8;
+  if (bsize == BLOCK_16X8) return aom_highbd_8_mse16x8;
+  if (bsize == BLOCK_8X16) return aom_highbd_8_mse8x16;
+  return aom_highbd_8_mse16x16;
+}
+
+// High bit depth variant: returns the sum of squared errors between the
+// source block and the reference block, using the MSE function chosen for
+// bsize and bit depth bd.
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sse;
+  highbd_get_block_variance_fn(bsize, bd)(src->buf, src->stride, ref->buf,
+                                          ref->stride, &sse);
+  return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Refine the motion search range according to the frame dimension
+// for first pass test.
+// Returns the smallest shift sr >= 0 for which
+// (min(width, height) << sr) is no longer below MAX_FULL_PEL_VAL.
+static int get_search_range(int width, int height) {
+  const int dim = AOMMIN(width, height);
+  int sr;
+  for (sr = 0; (dim << sr) < MAX_FULL_PEL_VAL; ++sr) {
+  }
+  return sr;
+}
+
+// Returns the search site configuration to use for first pass motion search
+// with the current reference stride.
+// The compressor level cache is returned when its first pass entry has the
+// right stride; otherwise the per-MACROBLOCK buffer is returned, after being
+// refreshed for the (looked-up) search method if its stride is stale.
+static AOM_INLINE const search_site_config *
+av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x,
+                                      SEARCH_METHODS search_method) {
+  const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+  // For AVIF applications, even the source frames can have changing resolution,
+  // so we need to manually check for the strides :(
+  // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+  // that's shared by multiple threads. In most cases where all frames have the
+  // same resolution, the cache contains the search site config that we need.
+  const search_site_config *const shared_cfg =
+      cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+  if (shared_cfg->stride == ref_stride) return shared_cfg;
+
+  // If the cache does not contain the correct stride, then we will need to rely
+  // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+  // thread level config doesn't match, then we need to update it.
+  search_method = search_method_lookup[search_method];
+  assert(search_method_lookup[search_method] == search_method &&
+         "The search_method_lookup table should be idempotent.");
+  const int local_stride = x->search_site_cfg_buf[search_method].stride;
+  if (local_stride != ref_stride) {
+    av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+                                   ref_stride);
+  }
+
+  return x->search_site_cfg_buf;
+}
+
+// Runs a full-pel NSTEP motion search around ref_mv for the current block.
+// When the search yields an error below INT_MAX, the candidate is re-scored
+// as av1_get_mvpred_sse() plus NEW_MV_MODE_PENALTY. *best_mv and
+// *best_motion_err are overwritten only if that score is lower than the
+// incoming *best_motion_err.
+static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                                const MV *ref_mv,
+                                                FULLPEL_MV *best_mv,
+                                                int *best_motion_err) {
+  AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE bsize = x->e_mbd.mi[0]->bsize;
+  const FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+
+  // Step parameter: speed feature setting plus a frame-size dependent term.
+  const int step_param = cpi->sf.fp_sf.reduce_mv_step_param +
+                         get_search_range(cm->width, cm->height);
+
+  const search_site_config *const search_sites =
+      av1_get_first_pass_search_site_config(cpi, x, NSTEP);
+  const int fine_search_interval =
+      cpi->is_screen_content_type && cm->features.allow_intrabc;
+
+  FULLPEL_MOTION_SEARCH_PARAMS ms_params;
+  av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
+                                     start_mv, search_sites, NSTEP,
+                                     fine_search_interval);
+
+  FULLPEL_MV candidate_mv;
+  FULLPEL_MV_STATS candidate_stats;
+  int motion_err =
+      av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
+                            &candidate_mv, &candidate_stats, NULL);
+
+  if (motion_err < INT_MAX) {
+    aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
+    motion_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, candidate_mv,
+                                    &v_fn_ptr, ms_params.ms_buffers.src,
+                                    ms_params.ms_buffers.ref) +
+                 NEW_MV_MODE_PENALTY;
+  }
+
+  if (motion_err >= *best_motion_err) return;
+  *best_motion_err = motion_err;
+  *best_mv = candidate_mv;
+}
+
+// Returns the block size to use for the first pass unit at
+// (unit_row, unit_col). A unit whose midpoint reaches or passes the frame
+// edge (in mi units) in a direction is reduced to the subsize_lookup entry of
+// the matching partition: SPLIT for both directions, VERT for width only,
+// HORZ for height only. Otherwise fp_block_size is returned unchanged.
+static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params,
+                            const BLOCK_SIZE fp_block_size, const int unit_row,
+                            const int unit_col) {
+  const int unit_width = mi_size_wide[fp_block_size];
+  const int unit_height = mi_size_high[fp_block_size];
+  const int is_half_width =
+      unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols;
+  const int is_half_height =
+      unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows;
+  const int max_dimension =
+      AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]);
+
+  // Index of the square size among 4X4, 8X8, 16X16, 32X32, 64X64, 128X128.
+  int square_block_size = 0;
+  int is_supported = 0;
+  for (int idx = 0; idx <= 5; ++idx) {
+    if ((4 << idx) == max_dimension) {
+      square_block_size = idx;
+      is_supported = 1;
+      break;
+    }
+  }
+  if (!is_supported) {
+    assert(0 && "First pass block size is not supported!");
+  }
+
+  if (!is_half_width && !is_half_height) return fp_block_size;
+
+  int partition;
+  if (is_half_width) {
+    partition = is_half_height ? PARTITION_SPLIT : PARTITION_VERT;
+  } else {
+    partition = PARTITION_HORZ;
+  }
+  return subsize_lookup[partition][square_block_size];
+}
+
+// Looks up the qindex for FIRST_PASS_Q at the given bit depth, searching the
+// full range [0, QINDEX_RANGE - 1].
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+  const int lowest_qindex = 0;
+  const int highest_qindex = QINDEX_RANGE - 1;
+  return av1_find_qindex(FIRST_PASS_Q, bit_depth, lowest_qindex,
+                         highest_qindex);
+}
+
+// Returns the population standard deviation of the first
+// raw_motion_err_counts entries of raw_motion_err_list, or 0 when the count
+// is 0.
+static double raw_motion_error_stdev(int *raw_motion_err_list,
+                                     int raw_motion_err_counts) {
+  const int count = raw_motion_err_counts;
+  if (count == 0) return 0;
+
+  // Mean of the raw errors; the sum is kept in 64 bits.
+  int64_t total = 0;
+  for (int k = 0; k < count; ++k) total += raw_motion_err_list[k];
+  const double mean = (double)total / count;
+
+  // Sum of squared deviations from the mean.
+  double sq_dev_sum = 0;
+  for (int k = 0; k < count; ++k) {
+    const int err = raw_motion_err_list[k];
+    sq_dev_sum += (err - mean) * (err - mean);
+  }
+
+  // Calculate the standard deviation for the motion error of all the inter
+  // blocks of the 0,0 motion using the last source
+  // frame as the reference.
+  return sqrt(sq_dev_sum / count);
+}
+
+// Returns 1 when the configured delta-q mode is DELTA_Q_PERCEPTUAL and 0
+// otherwise.
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+  const int is_perceptual_deltaq =
+      (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL);
+  return is_perceptual_deltaq;
+}
+typedef struct intra_pred_block_pass1_args { // Argument bundle handed to the per-transform-block first pass intra callback.
+ const SequenceHeader *seq_params; // Sequence header; the callback reads sb_size and enable_intra_edge_filter from it.
+ MACROBLOCK *x; // Macroblock whose planes are predicted and whose residual is computed.
+} intra_pred_block_pass1_args;
+
+// Copies a width x height rectangle from src to dst. When high bit depth
+// support is compiled in and use_hbd is non-zero, both pointers are converted
+// with CONVERT_TO_SHORTPTR() and the high bit depth copy is used; otherwise
+// the 8-bit copy is used.
+static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
+                             int sstride, int width, int height, int use_hbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride,
+                             CONVERT_TO_SHORTPTR(dst), dstride, width, height);
+    return;
+  }
+#else
+  (void)use_hbd;
+#endif
+  aom_convolve_copy(src, sstride, dst, dstride, width, height);
+}
+
+// Per-transform-block callback for the first pass: builds the intra
+// prediction for the block at (blk_row, blk_col) of `plane` into the
+// destination buffer, then computes the residual with av1_subtract_txb().
+// `arg` must point to an intra_pred_block_pass1_args. `block` is unused.
+// Note that the source buffer is what is handed to av1_predict_intra_block()
+// as the reference.
+static void first_pass_intra_pred_and_calc_diff(int plane, int block,
+                                                int blk_row, int blk_col,
+                                                BLOCK_SIZE plane_bsize,
+                                                TX_SIZE tx_size, void *arg) {
+  (void)block;
+  struct intra_pred_block_pass1_args *const args = arg;
+  const SequenceHeader *seq_params = args->seq_params;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Sample offsets of this transform block inside each buffer.
+  const int dst_stride = pd->dst.stride;
+  const int src_stride = p->src.stride;
+  uint8_t *dst =
+      pd->dst.buf + ((blk_row * dst_stride + blk_col) << MI_SIZE_LOG2);
+  uint8_t *src =
+      p->src.buf + ((blk_row * src_stride + blk_col) << MI_SIZE_LOG2);
+
+  av1_predict_intra_block(
+      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+      pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
+      src_stride, dst, dst_stride, blk_col, blk_row, plane);
+
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+}
+
+// First pass intra prediction for the luma plane only: runs the
+// predict-and-subtract callback over every transform block of the plane, then
+// copies the source block into the destination (recon) buffer.
+static void first_pass_predict_intra_block_for_luma_plane(
+    const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int plane = AOM_PLANE_Y;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  const MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+  // Capture the buffer positions before running the per-block callback.
+  uint8_t *const recon = pd->dst.buf;
+  const int recon_stride = pd->dst.stride;
+  const uint8_t *const source = p->src.buf;
+  const int source_stride = p->src.stride;
+
+  intra_pred_block_pass1_args args = { seq_params, x };
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);
+
+  // copy source data to recon buffer, as the recon buffer will be used as a
+  // reference frame subsequently.
+  copy_rect(recon, recon_stride, source, source_stride, block_size_wide[bsize],
+            block_size_high[bsize], seq_params->use_highbitdepth);
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+// Computes and returns the intra pred error of a block.
+// intra pred error: sum of squared error of the intra predicted residual.
+// Inputs:
+// cpi: the encoder setting. Only a few params in it will be used.
+// this_frame: the current frame buffer.
+// tile: tile information (not used in first pass, already init to zero)
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// y_offset: the offset of y frame buffer, indicating the starting point of
+// the current block.
+// uv_offset: the offset of u and v frame buffer, indicating the starting
+// point of the current block.
+// fp_block_size: first pass block size.
+// qindex: quantization step size to encode the frame.
+// stats: frame encoding stats.
+// Modifies:
+// stats->intra_skip_count
+// stats->image_data_start_row
+// stats->intra_factor
+// stats->brightness_factor
+// stats->intra_error
+// stats->frame_avg_wavelet_energy
+// Returns:
+// this_intra_error.
+static int firstpass_intra_prediction(
+ AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame,
+ const TileInfo *const tile, const int unit_row, const int unit_col,
+ const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
+ const int qindex, FRAME_STATS *const stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Number of mode-info columns spanned by one first-pass unit; also used
+ // below to scale unit_row (the same factor is applied to rows and columns).
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
+
+ // Point the block descriptor at this unit: mode-info slot plus the
+ // destination pointers of each plane inside this_frame.
+ set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
+ xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
+ if (num_planes > 1) {
+ xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
+ xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+ }
+ xd->left_available = (unit_col != 0);
+ // Describe the block as a DC_PRED intra block in segment 0 with 4x4
+ // transforms. The segment is flagged lossless only when qindex is 0.
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize],
+ unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows,
+ mi_params->mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+ xd->mi[0]->segment_id = 0;
+ xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size = TX_4X4;
+
+ // With reconstruction disabled only the luma prediction is run; otherwise
+ // plane 0 goes through the regular intra encode. Either way the error is
+ // then read from the luma src_diff buffer.
+ if (cpi->sf.fp_sf.disable_recon)
+ first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+ else
+ av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+ int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
+ // High bit depth: scale the error down (>> 4 for 10-bit, >> 8 for 12-bit),
+ // presumably to keep it comparable with 8-bit content.
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_intra_error >>= 4; break;
+ case AOM_BITS_12: this_intra_error >>= 8; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ // Only reachable when assertions are compiled out.
+ return -1;
+ }
+ }
+
+ // Count very low error blocks as "intra skip". The first non-skip block
+ // found at unit_col > 0 fixes the first row that contains image data.
+ if (this_intra_error < UL_INTRA_THRESH) {
+ ++stats->intra_skip_count;
+ } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+ stats->image_data_start_row = unit_row;
+ }
+
+ // Every block adds at least 1.0 to intra_factor; blocks whose log error is
+ // below 10.0 add up to an extra 0.5.
+ double log_intra = log1p(this_intra_error);
+ if (log_intra < 10.0) {
+ stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ } else {
+ stats->intra_factor += 1.0;
+ }
+
+ // The first luma source sample of the block is used as its brightness level.
+ int level_sample;
+ if (seq_params->use_highbitdepth) {
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ } else {
+ level_sample = x->plane[0].src.buf[0];
+ }
+
+ // Scale the sample down for 10-bit (>> 2) and 12-bit (>> 4) input.
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: level_sample >>= 2; break;
+ case AOM_BITS_12: level_sample >>= 4; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ // Only reachable when assertions are compiled out.
+ return -1;
+ }
+ }
+ // Dark blocks with a low log error get extra brightness weight,
+ // proportional to how far the sample is below DARK_THRESH.
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+ stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ } else {
+ stats->brightness_factor += 1.0;
+ }
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_intra_error += INTRA_MODE_PENALTY;
+
+ // Accumulate the intra error.
+ stats->intra_error += (int64_t)this_intra_error;
+
+ // Stats based on wavelet energy is used in the following cases :
+ // 1. ML model which predicts if a flat structure (golden-frame only structure
+ // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+ // constant quality mode under certain conditions.
+ // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+ // Thus, wavelet energy calculation is enabled for the above cases.
+ if (calc_wavelet_energy(&cpi->oxcf)) {
+ const int hbd = is_cur_buf_hbd(xd);
+ const int stride = x->plane[0].src.stride;
+ const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+ const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+ const uint8_t *buf = x->plane[0].src.buf;
+ stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input(
+ buf, stride, hbd, num_8x8_rows, num_8x8_cols);
+ } else {
+ // Overwrites (does not accumulate) the field with a sentinel value.
+ stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
+ }
+
+ // The returned error already includes INTRA_MODE_PENALTY.
+ return this_intra_error;
+}
+
+// Returns the sum of square error between source and reference blocks.
+static int get_prediction_error_bitdepth(const int is_high_bitdepth,
+ const int bitdepth,
+ const BLOCK_SIZE block_size,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ (void)is_high_bitdepth;
+ (void)bitdepth;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_high_bitdepth) {
+ return highbd_get_prediction_error(block_size, src, ref, bitdepth);
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ return get_prediction_error(block_size, src, ref);
+}
+
+// Folds one block's motion vector into the frame-level MV statistics.
+// Zero vectors are ignored. For a non-zero vector this updates mv_count,
+// new_mv_count and sum_in_vectors in "stats" and stores best_mv in
+// *last_non_zero_mv.
+static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
+                                const int mb_row, const int mb_col,
+                                const int mb_rows, const int mb_cols,
+                                MV *last_non_zero_mv, FRAME_STATS *stats) {
+  if (is_zero_mv(&best_mv)) return;
+
+  // A vector is "new" when it differs from the previous non-zero vector.
+  const int differs_from_last = !is_equal_mv(&best_mv, last_non_zero_mv);
+  ++stats->mv_count;
+  if (differs_from_last) ++stats->new_mv_count;
+  *last_non_zero_mv = best_mv;
+
+  // Vertical component: blocks on the middle row and zero components do not
+  // contribute. Above the middle a positive row decrements the sum and a
+  // negative row increments it; below the middle the signs are reversed.
+  const int mid_row = mb_rows / 2;
+  if (mb_row != mid_row && mv.row != 0) {
+    const int above_mid = (mb_row < mid_row);
+    const int positive_row = (mv.row > 0);
+    if (above_mid == positive_row) {
+      --stats->sum_in_vectors;
+    } else {
+      ++stats->sum_in_vectors;
+    }
+  }
+
+  // Horizontal component: same rule, relative to the middle column.
+  const int mid_col = mb_cols / 2;
+  if (mb_col != mid_col && mv.col != 0) {
+    const int left_of_mid = (mb_col < mid_col);
+    const int positive_col = (mv.col > 0);
+    if (left_of_mid == positive_col) {
+      --stats->sum_in_vectors;
+    } else {
+      ++stats->sum_in_vectors;
+    }
+  }
+}
+
+// Computes and returns the inter prediction error from the last frame.
+// Computes inter prediction errors from the last and golden frames and
+// updates stats accordingly.
+// Inputs:
+// cpi: the encoder setting. Only a few params in it will be used.
+// td: thread data; its macroblock (td->mb) is used as the working block.
+// last_frame: the frame buffer of the last frame.
+// golden_frame: the frame buffer of the golden frame (may be NULL).
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// recon_yoffset: the y offset of the reconstructed frame buffer,
+// indicating the starting point of the current block.
+// recon_uvoffset: the u/v offset of the reconstructed frame buffer,
+// indicating the starting point of the current block.
+// src_yoffset: the y offset of the source frame buffer.
+// fp_block_size: first pass block size.
+// this_intra_error: the intra prediction error of this block.
+// raw_motion_err_counts: index at which this block's raw motion error is
+// stored in raw_motion_err_list.
+// raw_motion_err_list: the array that records the raw motion error.
+// ref_mv: the reference used to start the motion search
+// best_mv: the best mv found
+// last_non_zero_mv: the last non zero mv found in this tile row.
+// stats: frame encoding stats.
+// Modifies:
+// raw_motion_err_list
+// best_mv
+// last_non_zero_mv
+// stats: many member params in it.
+// Returns:
+// this_inter_error: the motion error when it is not larger than
+// this_intra_error, otherwise this_intra_error unchanged.
+static int firstpass_inter_prediction(
+ AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
+ const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
+ const int unit_col, const int recon_yoffset, const int recon_uvoffset,
+ const int src_yoffset, const BLOCK_SIZE fp_block_size,
+ const int this_intra_error, const int raw_motion_err_counts,
+ int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+ MV *last_non_zero_mv, FRAME_STATS *stats) {
+ // Default result: intra is kept unless inter turns out at least as good.
+ int this_inter_error = this_intra_error;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+ const int bitdepth = xd->bd;
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
+ // NOTE(review): named "height" but read from block_size_wide; the two only
+ // agree if fp_block_size is square - confirm against get_fp_block_size().
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+ // Assume 0,0 motion with no mv overhead.
+ FULLPEL_MV mv = kZeroFullMv;
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width,
+ fp_block_size_height >> MI_SIZE_LOG2,
+ cpi->oxcf.border_in_pixels);
+
+ // Error of the co-located block in the last frame buffer.
+ int motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+ // TODO(chiyotsai): The unscaled last source might be different dimension
+ // as the current source. See BUG=aomedia:3413
+ struct buf_2d unscaled_last_source_buf_2d;
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + src_yoffset;
+ unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+ const int raw_motion_error = get_prediction_error_bitdepth(
+ is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ // The raw error is recorded for every block, whether or not a search runs.
+ raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+ const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf;
+
+ if (raw_motion_error > fp_sf->skip_motion_search_threshold) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ int tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+ // Keep the zero-centred result only if it is strictly better.
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+ }
+
+ // Motion search in 2nd reference frame.
+ // Without a usable golden frame gf_motion_error stays equal to
+ // motion_error, so the second_ref_count test below cannot fire.
+ int gf_motion_error = motion_error;
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ // Assume 0,0 motion with no mv overhead.
+ av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1);
+ xd->plane[0].pre[0].buf += recon_yoffset;
+ gf_motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+ // The golden-frame vector (tmp_mv) is discarded; only the error is kept.
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error);
+ }
+ if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) {
+ ++stats->second_ref_count;
+ }
+ // In accumulating a score for the 2nd reference frame take the
+ // best of the motion predicted score and the intra coded error
+ // (just as will be done for) accumulation of "coded_error" for
+ // the last frame.
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error);
+ } else {
+ // TODO(chengchen): I believe logically this should also be changed to
+ // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error).
+ stats->sr_coded_error += motion_error;
+ }
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ if (av1_num_planes(&cpi->common) > 1) {
+ xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+ }
+
+ // Start by assuming that intra mode is best.
+ *best_mv = kZeroMv;
+
+ // Inter is chosen on ties (<=).
+ if (motion_error <= this_intra_error) {
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) &&
+ (this_intra_error < (2 * INTRA_MODE_PENALTY))) {
+ stats->neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) &&
+ (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ stats->neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
+ }
+
+ // Commit the inter decision: single-reference NEWMV from LAST_FRAME.
+ *best_mv = get_mv_from_fullmv(&mv);
+ this_inter_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = *best_mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+ // Build the luma predictor and encode the block unless reconstruction
+ // is disabled for the first pass.
+ if (fp_sf->disable_recon == 0) {
+ av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+ unit_col * unit_scale, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ av1_encode_sby_pass1(cpi, x, bsize);
+ }
+ // Running sums used later for the mean / variance of the vectors.
+ stats->sum_mvr += best_mv->row;
+ stats->sum_mvr_abs += abs(best_mv->row);
+ stats->sum_mvc += best_mv->col;
+ stats->sum_mvc_abs += abs(best_mv->col);
+ stats->sum_mvrs += best_mv->row * best_mv->row;
+ stats->sum_mvcs += best_mv->col * best_mv->col;
+ ++stats->inter_count;
+
+ accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+ last_non_zero_mv, stats);
+ }
+
+ return this_inter_error;
+}
+
+// Normalizes the accumulated first pass stats of one frame in place.
+// Error terms, wavelet energy and new_mv_count are divided by the number of
+// 16x16 macroblocks; the log errors are then derived from the normalized
+// errors. Motion vector terms are divided by the frame dimension they are
+// measured along (squared for the variance terms).
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+                                      double num_mbs_16x16, double f_w,
+                                      double f_h) {
+  // Per-macroblock quantities.
+  fps->intra_error /= num_mbs_16x16;
+  fps->coded_error /= num_mbs_16x16;
+  fps->sr_coded_error /= num_mbs_16x16;
+  fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+  fps->new_mv_count /= num_mbs_16x16;
+
+  // Log errors are taken after normalization.
+  fps->log_intra_error = log1p(fps->intra_error);
+  fps->log_coded_error = log1p(fps->coded_error);
+
+  // Row (vertical) motion is relative to the frame height.
+  fps->MVr /= f_h;
+  fps->mvr_abs /= f_h;
+  fps->MVrv /= (f_h * f_h);
+
+  // Column (horizontal) motion is relative to the frame width.
+  fps->MVc /= f_w;
+  fps->mvc_abs /= f_w;
+  fps->MVcv /= (f_w * f_w);
+}
+
+// Updates the first pass stats of this frame.
+// Input:
+// cpi: the encoder setting. Only a few params in it will be used.
+// stats: stats accumulated for this frame.
+// raw_err_stdev: the standard deviation for the motion error of all the
+// inter blocks of the (0,0) motion using the last source
+// frame as the reference.
+// frame_number: current frame number.
+// ts_duration: Duration of the frame / collection of frames.
+// fp_block_size: first pass block size; used to convert the 16x16
+// macroblock count into the number of first pass units.
+// Updates:
+// twopass->total_stats: the accumulated stats.
+// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats,
+// update its value and its position
+// in the buffer.
+static void update_firstpass_stats(AV1_COMP *cpi,
+ const FRAME_STATS *const stats,
+ const double raw_err_stdev,
+ const int frame_number,
+ const int64_t ts_duration,
+ const BLOCK_SIZE fp_block_size) {
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ // Slot in the stats buffer that receives this frame's stats.
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ FIRSTPASS_STATS fps;
+ // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+ // Number of actual units used in the first pass, it can be other square
+ // block sizes than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ const double min_err = 200 * sqrt(num_mbs);
+
+ fps.weight = stats->intra_factor * stats->brightness_factor;
+ fps.frame = frame_number;
+ // Accumulated errors are scaled down by 256 (>> 8) before the floor
+ // min_err is added.
+ fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
+ fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
+ fps.count = 1.0;
+ // Block counters become fractions of the number of first pass units.
+ fps.pcnt_inter = (double)stats->inter_count / num_mbs;
+ fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)stats->image_data_start_row;
+ fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported.
+ fps.raw_error_stdev = raw_err_stdev;
+ fps.is_flash = 0;
+ fps.noise_var = 0.0;
+ fps.cor_coeff = 1.0;
+ // The log errors are filled in by normalize_firstpass_stats() below.
+ fps.log_coded_error = 0.0;
+ fps.log_intra_error = 0.0;
+
+ // Motion vector means and variances are only defined when at least one
+ // non-zero vector was counted; otherwise every MV field is zeroed.
+ if (stats->mv_count > 0) {
+ fps.MVr = (double)stats->sum_mvr / stats->mv_count;
+ fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count;
+ fps.MVc = (double)stats->sum_mvc / stats->mv_count;
+ fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count;
+ // Variance computed as (sum of squares - (sum)^2 / n) / n.
+ fps.MVrv = ((double)stats->sum_mvrs -
+ ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) /
+ stats->mv_count;
+ fps.MVcv = ((double)stats->sum_mvcs -
+ ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) /
+ stats->mv_count;
+ fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2);
+ fps.new_mv_count = stats->new_mv_count;
+ fps.pcnt_motion = (double)stats->mv_count / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)ts_duration;
+
+ // Note: normalization uses the 16x16 macroblock count, not num_mbs.
+ normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height);
+
+ // We will store the stats inside the persistent twopass struct (and NOT the
+ // local variable 'fps'), and then cpi->output_pkt_list will point to it.
+ *this_frame_stats = fps;
+ // Without LAP the stats go to the output packet list; with LAP they are
+ // pushed into twopass->firstpass_info instead.
+ if (!cpi->ppi->lap_enabled) {
+ output_stats(this_frame_stats, cpi->ppi->output_pkt_list);
+ } else {
+ av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats);
+ }
+ if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) {
+ av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps);
+ }
+ // Advance the write position to the next slot.
+ twopass->stats_buf_ctx->stats_in_end++;
+ // When ducky encode is on, we always use linear buffer for stats_buf_ctx.
+ if (cpi->use_ducky_encode == 0) {
+ // TODO(angiebird): Figure out why first pass uses circular buffer.
+ /* In the case of two pass, first pass uses it as a circular buffer,
+ * when LAP is enabled it is used as a linear buffer*/
+ if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+ (twopass->stats_buf_ctx->stats_in_end >=
+ twopass->stats_buf_ctx->stats_in_buf_end)) {
+ twopass->stats_buf_ctx->stats_in_end =
+ twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+}
+
+// Debug helper: when do_print is non-zero, writes the whole allocation of
+// "last_frame" (buffer_alloc, frame_size bytes) to "encNNNN.yuv", where NNNN
+// is frame_number. The file is opened with "wb" for frame 0 and with "ab"
+// for every other frame. If the file cannot be opened the dump is skipped,
+// so a missing or read-only directory cannot crash the encoder.
+static void print_reconstruction_frame(
+    const YV12_BUFFER_CONFIG *const last_frame, int frame_number,
+    int do_print) {
+  if (!do_print) return;
+
+  char filename[512];
+  snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number);
+
+  FILE *const recon_file = fopen(filename, (frame_number == 0) ? "wb" : "ab");
+  // fopen() returns NULL on failure; passing that to fwrite()/fclose() is
+  // undefined behavior, so bail out instead.
+  if (recon_file == NULL) return;
+
+  fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file);
+  fclose(recon_file);
+}
+
+// Sums the per-unit stats in "mb_stats" (an mb_rows x mb_cols array stored
+// row by row) into a single frame-level FRAME_STATS and returns it.
+// image_data_start_row is not summed: it takes the first value, in raster
+// order, that is not INVALID_ROW, and stays INVALID_ROW if there is none.
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+                                          int mb_cols) {
+  FRAME_STATS total = { 0 };
+  total.image_data_start_row = INVALID_ROW;
+
+  for (int row = 0; row < mb_rows; ++row) {
+    for (int col = 0; col < mb_cols; ++col) {
+      const FRAME_STATS *const unit = &mb_stats[row * mb_cols + col];
+
+      // First valid start row wins.
+      if (total.image_data_start_row == INVALID_ROW &&
+          unit->image_data_start_row != INVALID_ROW) {
+        total.image_data_start_row = unit->image_data_start_row;
+      }
+
+      // Error terms and weighting factors.
+      total.coded_error += unit->coded_error;
+      total.sr_coded_error += unit->sr_coded_error;
+      total.intra_error += unit->intra_error;
+      total.intra_factor += unit->intra_factor;
+      total.brightness_factor += unit->brightness_factor;
+      total.frame_avg_wavelet_energy += unit->frame_avg_wavelet_energy;
+
+      // Block classification counters.
+      total.inter_count += unit->inter_count;
+      total.intra_skip_count += unit->intra_skip_count;
+      total.neutral_count += unit->neutral_count;
+      total.second_ref_count += unit->second_ref_count;
+
+      // Motion vector statistics.
+      total.mv_count += unit->mv_count;
+      total.new_mv_count += unit->new_mv_count;
+      total.sum_in_vectors += unit->sum_in_vectors;
+      total.sum_mvr += unit->sum_mvr;
+      total.sum_mvr_abs += unit->sum_mvr_abs;
+      total.sum_mvrs += unit->sum_mvrs;
+      total.sum_mvc += unit->sum_mvc;
+      total.sum_mvc_abs += unit->sum_mvc_abs;
+      total.sum_mvcs += unit->sum_mvcs;
+    }
+  }
+  return total;
+}
+
+// Allocates the per-frame first pass buffers in "firstpass_data": one raw
+// motion error and one FRAME_STATS entry per first pass unit, both zero
+// initialized by aom_calloc(). Allocation goes through CHECK_MEM_ERROR.
+// Every stats entry then has image_data_start_row set to INVALID_ROW.
+static void setup_firstpass_data(AV1_COMMON *const cm,
+                                 FirstPassData *firstpass_data,
+                                 const int unit_rows, const int unit_cols) {
+  const int num_units = unit_rows * unit_cols;
+
+  CHECK_MEM_ERROR(
+      cm, firstpass_data->raw_motion_err_list,
+      aom_calloc(num_units, sizeof(*firstpass_data->raw_motion_err_list)));
+  CHECK_MEM_ERROR(cm, firstpass_data->mb_stats,
+                  aom_calloc(num_units, sizeof(*firstpass_data->mb_stats)));
+
+  // Mark every unit as "no image data row found yet".
+  for (int row = 0; row < unit_rows; ++row) {
+    const int row_base = row * unit_cols;
+    for (int col = 0; col < unit_cols; ++col) {
+      firstpass_data->mb_stats[row_base + col].image_data_start_row =
+          INVALID_ROW;
+    }
+  }
+}
+
+// Releases both first pass buffers and clears the pointers so that a later
+// call (or a later setup) does not see stale addresses.
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
+  // Per-unit stats.
+  aom_free(firstpass_data->mb_stats);
+  firstpass_data->mb_stats = NULL;
+
+  // Per-unit raw motion errors.
+  aom_free(firstpass_data->raw_motion_err_list);
+  firstpass_data->raw_motion_err_list = NULL;
+}
+
+// Number of first pass unit rows covered by "tile": the tile height in
+// mode-info units passed through CEIL_POWER_OF_TWO with the unit height
+// (log2) as the shift.
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int tile_mi_rows = tile->mi_row_end - tile->mi_row_start;
+  return CEIL_POWER_OF_TWO(tile_mi_rows, mi_size_high_log2[fp_block_size]);
+}
+
+// Number of first pass unit columns covered by "tile": the tile width in
+// mode-info units passed through CEIL_POWER_OF_TWO with the unit width
+// (log2) as the shift.
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+                              const BLOCK_SIZE fp_block_size) {
+  const int tile_mi_cols = tile->mi_col_end - tile->mi_col_start;
+  return CEIL_POWER_OF_TWO(tile_mi_cols, mi_size_wide_log2[fp_block_size]);
+}
+
+#define FIRST_PASS_ALT_REF_DISTANCE 16
+// Runs the first pass over every unit row of one tile, top to bottom.
+// The tile is walked in mode-info rows, one first pass unit at a time, and
+// each position is converted to a unit row index for av1_first_pass_row().
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+                            TileDataEnc *tile_data,
+                            const BLOCK_SIZE fp_block_size) {
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  const int mi_step = mi_size_high[fp_block_size];
+  const int row_shift = mi_size_high_log2[fp_block_size];
+
+  int mi_row = tile_info->mi_row_start;
+  while (mi_row < tile_info->mi_row_end) {
+    av1_first_pass_row(cpi, td, tile_data, mi_row >> row_shift, fp_block_size);
+    mi_row += mi_step;
+  }
+}
+
+// Single-threaded first pass: allocates the src_diff buffer of the main
+// thread's macroblock, then processes the tiles in raster order using the
+// main thread data (cpi->td).
+static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_tile_rows = cm->tiles.rows;
+  const int num_tile_cols = cm->tiles.cols;
+
+  av1_alloc_src_diff_buf(cm, &cpi->td.mb);
+
+  for (int r = 0; r < num_tile_rows; ++r) {
+    const int row_base = r * num_tile_cols;
+    for (int c = 0; c < num_tile_cols; ++c) {
+      first_pass_tile(cpi, &cpi->td, &cpi->tile_data[row_base + c],
+                      fp_block_size);
+    }
+  }
+}
+
+// Runs the first pass over one row of first pass units inside a tile.
+// cpi: the encoder instance.
+// td: thread data; td->mb is the working macroblock.
+// tile_data: the tile being processed (tile info, row sync state and
+// the saved firstpass_top_mv).
+// unit_row: frame-level row index in units of fp_block_size; the row
+// index inside the tile is derived from the tile start.
+// fp_block_size: first pass block size.
+// For every unit in the row the intra error is computed and, for non
+// intra-only frames, the inter error as well; results are accumulated in
+// cpi->firstpass_data.mb_stats and cpi->firstpass_data.raw_motion_err_list.
+// Returns early, leaving the rest of the row unprocessed, when another
+// worker has flagged firstpass_mt_exit.
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int unit_row, const BLOCK_SIZE fp_block_size) {
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *tile = &tile_data->tile_info;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
+ // NOTE(review): width is read from block_size_high and height from
+ // block_size_wide; the two only agree if fp_block_size is square - confirm.
+ const int fp_block_size_width = block_size_high[fp_block_size];
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ // Row stride (in units) of the frame-level stats arrays. The factor 4
+ // presumably is the number of mode-info columns per 16x16 macroblock.
+ const int unit_cols = mi_params->mb_cols * 4 / unit_width;
+ int raw_motion_err_counts = 0;
+ int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2);
+ int unit_col_start = tile->mi_col_start >> unit_width_log2;
+ int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+
+ // Prefer the scaled reference buffers; fall back to the unscaled ones.
+ const YV12_BUFFER_CONFIG *last_frame =
+ av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (!last_frame) {
+ last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ }
+ const YV12_BUFFER_CONFIG *golden_frame =
+ av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (!golden_frame) {
+ golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+ PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+ // Both output arrays are positioned at the first unit of this row inside
+ // this tile.
+ FRAME_STATS *mb_stats =
+ cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start;
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+ unit_row * unit_cols + unit_col_start;
+ MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+ // Route the macroblock's coefficient buffers to the first pass context.
+ for (int i = 0; i < num_planes; ++i) {
+ x->plane[i].coeff = ctx->coeff[i];
+ x->plane[i].qcoeff = ctx->qcoeff[i];
+ x->plane[i].eobs = ctx->eobs[i];
+ x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ x->plane[i].dqcoeff = ctx->dqcoeff[i];
+ }
+
+ const int src_y_stride = cpi->source->y_stride;
+ const int recon_y_stride = this_frame->y_stride;
+ const int recon_uv_stride = this_frame->uv_stride;
+ // Chroma block height: halved when the chroma plane is shorter than luma.
+ const int uv_mb_height =
+ fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+
+ MV best_ref_mv = kZeroMv;
+ // Assigned from *first_top_mv at the first column of the loop below.
+ MV last_mv;
+
+ // Reset above block coeffs.
+ xd->up_available = (unit_row_in_tile != 0);
+ int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ // NOTE(review): the chroma column step reuses uv_mb_height, i.e. assumes
+ // the chroma block is as wide as it is tall - confirm for 4:2:2 input.
+ int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) +
+ (unit_col_start * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_row_limits(
+ mi_params, &x->mv_limits, (unit_row << unit_height_log2),
+ (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels);
+
+ av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2,
+ tile->mi_col_start, num_planes, fp_block_size);
+
+ // Fix - zero the 16x16 block first. This ensures correct this_intra_error for
+ // block sizes smaller than 16x16.
+ av1_zero_array(x->plane[0].src_diff, 256);
+
+ for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile;
+ unit_col_in_tile++) {
+ const int unit_col = unit_col_start + unit_col_in_tile;
+
+ // Row synchronisation hook; the callback is installed by the caller.
+ enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ // Read the shared exit flag under the row-mt mutex.
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (firstpass_mt_exit) return;
+ }
+#endif
+
+ // Seed the "last non-zero mv" of this row from the tile's saved top mv.
+ if (unit_col_in_tile == 0) {
+ last_mv = *first_top_mv;
+ }
+ int this_intra_error = firstpass_intra_prediction(
+ cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, fp_block_size, qindex, mb_stats);
+
+ if (!frame_is_intra_only(cm)) {
+ // best_ref_mv is passed both as the search start (by value) and as the
+ // output, so each block starts from the previous block's best mv.
+ const int this_inter_error = firstpass_inter_prediction(
+ cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, src_yoffset, fp_block_size, this_intra_error,
+ raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv,
+ &last_mv, mb_stats);
+ // Save the first column's result as the tile's top mv for the next row.
+ if (unit_col_in_tile == 0) {
+ *first_top_mv = last_mv;
+ }
+ mb_stats->coded_error += this_inter_error;
+ ++raw_motion_err_counts;
+ } else {
+ // Intra-only frame: both error sums take the intra error.
+ mb_stats->sr_coded_error += this_intra_error;
+ mb_stats->coded_error += this_intra_error;
+ }
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += fp_block_size_width;
+ if (num_planes > 1) {
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+ }
+
+ recon_yoffset += fp_block_size_width;
+ src_yoffset += fp_block_size_width;
+ recon_uvoffset += uv_mb_height;
+ mb_stats++;
+
+ // Signal that this column of the row is done.
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile,
+ unit_cols_in_tile);
+ }
+}
+
+// Emits first pass stats for a frame without analysing any block.
+// The per-unit stats are allocated for a 16x16 unit grid (sized from the
+// forced maximum frame dimensions when those are set), summed in their
+// freshly initialized state, freed again, and the result is pushed through
+// update_firstpass_stats() with a raw error standard deviation of 1.0.
+void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  // Grid size in 16x16 macroblocks; a forced maximum dimension overrides
+  // the current frame's value.
+  int mb_rows_limit = mi_params->mb_rows;
+  int mb_cols_limit = mi_params->mb_cols;
+  if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+    const int mi_rows_limit =
+        size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+    mb_rows_limit = ROUND_POWER_OF_TWO(mi_rows_limit, 2);
+  }
+  if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+    const int mi_cols_limit =
+        size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+    mb_cols_limit = ROUND_POWER_OF_TWO(mi_cols_limit, 2);
+  }
+
+  const int unit_rows = get_unit_rows(BLOCK_16X16, mb_rows_limit);
+  const int unit_cols = get_unit_cols(BLOCK_16X16, mb_cols_limit);
+
+  setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+  FRAME_STATS frame_stats = accumulate_frame_stats(
+      cpi->firstpass_data.mb_stats, unit_rows, unit_cols);
+  av1_free_firstpass_data(&cpi->firstpass_data);
+
+  update_firstpass_stats(cpi, &frame_stats, 1.0,
+                         cm->current_frame.frame_number, ts_duration,
+                         BLOCK_16X16);
+}
+
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
+ const int ref_frame_flags_backup = cpi->ref_frame_flags;
+ cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] |
+ av1_ref_frame_flag_list[GOLDEN_FRAME];
+
+ // Detect if the key frame is screen content type.
+ if (frame_is_intra_only(cm)) {
+ FeatureFlags *const features = &cm->features;
+ assert(cpi->source != NULL);
+ xd->cur_buf = cpi->source;
+ av1_set_screen_content_options(cpi, features);
+ }
+
+ // Prepare the speed features
+ av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+
+ // Unit size for the first pass encoding.
+ const BLOCK_SIZE fp_block_size =
+ get_fp_block_size(cpi->is_screen_content_type);
+
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+
+ // Number of rows in the unit size.
+ // Note max_mb_rows and max_mb_cols are in the unit of 16x16.
+ const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols);
+
+ // Set fp_block_size, for the convenience of multi-thread usage.
+ cpi->fp_block_size = fp_block_size;
+
+ setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+ FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+ // multi threading info
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ if (cpi->allocated_tiles < tile_cols * tile_rows) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ av1_init_tile_data(cpi);
+
+ const YV12_BUFFER_CONFIG *last_frame = NULL;
+ const YV12_BUFFER_CONFIG *golden_frame = NULL;
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+ : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME)
+ : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+ // First pass code requires valid last and new frame buffers.
+ assert(this_frame != NULL);
+ assert(frame_is_intra_only(cm) || (last_frame != NULL));
+
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+ xd->mi[0]->bsize = fp_block_size;
+
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+
+ av1_set_quantizer(
+ cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+ cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
+
+ av1_setup_block_planes(xd, seq_params->subsampling_x,
+ seq_params->subsampling_y, num_planes);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
+ av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
+ num_planes);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+ // Don't store luma on the first pass since chroma is not computed
+ xd->cfl.store_y = 0;
+ av1_frame_init_quantizer(cpi);
+
+ av1_default_coef_probs(cm);
+ av1_init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ av1_initialize_rd_consts(cpi);
+
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+
+ if (mt_info->num_workers > 1) {
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_fp_encode_tiles_row_mt(cpi);
+ } else {
+ first_pass_tiles(cpi, fp_block_size);
+ }
+
+ FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+ int total_raw_motion_err_count =
+ frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
+ const double raw_err_stdev =
+ raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
+
+ // Clamp the image start to rows/2. This number of rows is discarded top
+ // and bottom as dead data so rows / 2 means the frame is blank.
+ if ((stats.image_data_start_row > unit_rows / 2) ||
+ (stats.image_data_start_row == INVALID_ROW)) {
+ stats.image_data_start_row = unit_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (stats.image_data_start_row > 0) {
+ stats.intra_skip_count =
+ AOMMAX(0, stats.intra_skip_count -
+ (stats.image_data_start_row * unit_cols * 2));
+ }
+
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+ // Number of actual units used in the first pass, it can be other square
+ // block sizes than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ stats.intra_factor = stats.intra_factor / (double)num_mbs;
+ stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ update_firstpass_stats(cpi, &stats, raw_err_stdev,
+ current_frame->frame_number, ts_duration,
+ fp_block_size);
+
+ // Copy the previous Last Frame back into gf buffer if the prediction is good
+ // enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((current_frame->frame_number > 0) &&
+ (this_frame_stats->pcnt_inter > 0.20) &&
+ ((this_frame_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) {
+ if (golden_frame != NULL) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(this_frame, num_planes);
+
+ // The frame we just compressed now becomes the last frame.
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame);
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (current_frame->frame_number == 0 &&
+ get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+
+ print_reconstruction_frame(last_frame, current_frame->frame_number,
+ /*do_print=*/0);
+
+ ++current_frame->frame_number;
+ cpi->ref_frame_flags = ref_frame_flags_backup;
+ if (!frame_is_intra_only(cm)) {
+ release_scaled_references(cpi);
+ }
+}
+
+// Initializes the first pass stats ring buffer bookkeeping.
+//
+// When ext_stats_buf is NULL the internal static_stats_buf is selected and the
+// buffer starts out empty; AOM_CODEC_ERROR is returned if a non-zero size was
+// supplied together with the NULL buffer. Otherwise ext_stats_buf is treated
+// as already holding ext_stats_buf_size entries: all of them are counted as
+// future stats and are accumulated into total_stats.
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size) {
+  assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0));
+
+  // Fields that start out identical in both modes.
+  firstpass_info->start_index = 0;
+  firstpass_info->cur_index = 0;
+  firstpass_info->past_stats_count = 0;
+  av1_zero(firstpass_info->total_stats);
+
+  if (ext_stats_buf == NULL) {
+    // Internal buffer: nothing has been pushed yet.
+    firstpass_info->stats_buf = firstpass_info->static_stats_buf;
+    firstpass_info->stats_buf_size =
+        sizeof(firstpass_info->static_stats_buf) /
+        sizeof(firstpass_info->static_stats_buf[0]);
+    firstpass_info->stats_count = 0;
+    firstpass_info->future_stats_count = 0;
+    return (ext_stats_buf_size == 0) ? AOM_CODEC_OK : AOM_CODEC_ERROR;
+  }
+
+  // External buffer: every entry is available as a future stat.
+  firstpass_info->stats_buf = ext_stats_buf;
+  firstpass_info->stats_buf_size = ext_stats_buf_size;
+  firstpass_info->stats_count = ext_stats_buf_size;
+  firstpass_info->future_stats_count = ext_stats_buf_size;
+  for (int idx = 0; idx < ext_stats_buf_size; ++idx) {
+    av1_accumulate_stats(&firstpass_info->total_stats, &ext_stats_buf[idx]);
+  }
+  return AOM_CODEC_OK;
+}
+
+// Advances cur_index by one slot (wrapping at stats_buf_size), turning the
+// current entry into a past entry. Returns AOM_CODEC_ERROR without changing
+// anything when at most one future entry (the current one) is left.
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info) {
+  assert(firstpass_info->future_stats_count +
+             firstpass_info->past_stats_count ==
+         firstpass_info->stats_count);
+  if (firstpass_info->future_stats_count <= 1) return AOM_CODEC_ERROR;
+
+  const int advanced = firstpass_info->cur_index + 1;
+  firstpass_info->cur_index = advanced % firstpass_info->stats_buf_size;
+  firstpass_info->future_stats_count -= 1;
+  firstpass_info->past_stats_count += 1;
+  return AOM_CODEC_OK;
+}
+
+// Drops the oldest entry by moving start_index forward one slot (wrapping at
+// stats_buf_size). Only past entries can be dropped: AOM_CODEC_ERROR is
+// returned when the buffer is empty or holds no past entry. total_stats is
+// left untouched.
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) {
+  const int has_stats = firstpass_info->stats_count > 0;
+  const int has_past = firstpass_info->past_stats_count > 0;
+  if (!has_stats || !has_past) return AOM_CODEC_ERROR;
+
+  firstpass_info->start_index =
+      (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size;
+  firstpass_info->stats_count -= 1;
+  firstpass_info->past_stats_count -= 1;
+  return AOM_CODEC_OK;
+}
+
+// Advances cur_index and, only if that succeeded, pops the oldest entry.
+// The status of the first failing step (or of the pop) is returned.
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info) {
+  const aom_codec_err_t move_status =
+      av1_firstpass_info_move_cur_index(firstpass_info);
+  return (move_status == AOM_CODEC_OK) ? av1_firstpass_info_pop(firstpass_info)
+                                       : move_status;
+}
+
+// Copies *input_stats into the first free slot after the stored entries,
+// counts it as a future stat and adds it to total_stats. Returns
+// AOM_CODEC_ERROR, leaving the buffer untouched, when the buffer is full.
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats) {
+  if (firstpass_info->stats_count >= firstpass_info->stats_buf_size) {
+    return AOM_CODEC_ERROR;
+  }
+
+  // Slot right behind the newest stored entry, wrapping around the ring.
+  const int write_pos =
+      (firstpass_info->start_index + firstpass_info->stats_count) %
+      firstpass_info->stats_buf_size;
+  FIRSTPASS_STATS *const slot = &firstpass_info->stats_buf[write_pos];
+  *slot = *input_stats;
+
+  firstpass_info->stats_count += 1;
+  firstpass_info->future_stats_count += 1;
+  av1_accumulate_stats(&firstpass_info->total_stats, input_stats);
+  return AOM_CODEC_OK;
+}
+
+// Returns the stats stored offset_from_cur entries away from the current
+// entry, or NULL when the offset lies outside
+// [-past_stats_count, future_stats_count).
+//
+// Negative offsets address past entries. Because stats_buf is a ring buffer,
+// cur_index may be smaller than the number of past entries (for example
+// cur_index == 0 right after wrapping), so cur_index + offset_from_cur can be
+// negative. In C the result of % takes the sign of the dividend, so the
+// remainder has to be wrapped back into [0, stats_buf_size) explicitly;
+// otherwise the returned pointer would lie before the start of stats_buf.
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+  if (offset_from_cur < -firstpass_info->past_stats_count ||
+      offset_from_cur >= firstpass_info->future_stats_count) {
+    return NULL;
+  }
+
+  const int buf_size = firstpass_info->stats_buf_size;
+  int index = (firstpass_info->cur_index + offset_from_cur) % buf_size;
+  if (index < 0) index += buf_size;  // wrap a negative remainder into range
+  return &firstpass_info->stats_buf[index];
+}
+
+// Number of future entries from the entry at offset_from_cur onwards (that
+// entry included); 0 when the offset is at or beyond the last future entry.
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur) {
+  const int future = firstpass_info->future_stats_count;
+  if (offset_from_cur >= future) return 0;
+  return future - offset_from_cur;
+}
+
+// Number of stored entries that precede the entry at offset_from_cur (that
+// entry excluded); 0 when the offset lies before the oldest past entry.
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur) {
+  const int past = firstpass_info->past_stats_count;
+  return (offset_from_cur < -past) ? 0 : offset_from_cur + past;
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..d01363a80e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+#define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
+
+/*!
+ * \brief The structure of accumulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
+ */
+typedef struct FIRSTPASS_STATS {
+ /*!
+ * Frame number in display order, if stats are for a single frame.
+ * No real meaning for a collection of frames.
+ */
+ double frame;
+ /*!
+ * Weight assigned to this frame (or total weight for the collection of
+ * frames) currently based on intra factor and brightness factor. This is used
+ * to distribute bits between easier and harder frames.
+ */
+ double weight;
+ /*!
+ * Intra prediction error.
+ */
+ double intra_error;
+ /*!
+ * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ */
+ double frame_avg_wavelet_energy;
+ /*!
+ * Best of intra pred error and inter pred error using last frame as ref.
+ */
+ double coded_error;
+ /*!
+ * Best of intra pred error and inter pred error using golden frame as ref.
+ */
+ double sr_coded_error;
+ /*!
+ * Percentage of blocks with inter pred error < intra pred error.
+ */
+ double pcnt_inter;
+ /*!
+ * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+ */
+ double pcnt_motion;
+ /*!
+ * Percentage of blocks where golden frame was better than last or intra:
+ * inter pred error using golden frame < inter pred error using last frame and
+ * inter pred error using golden frame < intra pred error
+ */
+ double pcnt_second_ref;
+ /*!
+ * Percentage of blocks where intra and inter prediction errors were very
+ * close. Note that this is a 'weighted count', that is, the blocks may be
+ * weighted by how close the two errors were.
+ */
+ double pcnt_neutral;
+ /*!
+ * Percentage of blocks that have almost no intra error residual
+ * (i.e. are in effect completely flat and untextured in the intra
+ * domain). In natural videos this is uncommon, but it is much more
+ * common in animations, graphics and screen content, so may be used
+ * as a signal to detect these types of content.
+ */
+ double intra_skip_pct;
+ /*!
+ * Image mask rows top and bottom.
+ */
+ double inactive_zone_rows;
+ /*!
+ * Image mask columns at left and right edges.
+ */
+ double inactive_zone_cols;
+ /*!
+ * Average of row motion vectors.
+ */
+ double MVr;
+ /*!
+ * Mean of absolute value of row motion vectors.
+ */
+ double mvr_abs;
+ /*!
+ * Mean of column motion vectors.
+ */
+ double MVc;
+ /*!
+ * Mean of absolute value of column motion vectors.
+ */
+ double mvc_abs;
+ /*!
+ * Variance of row motion vectors.
+ */
+ double MVrv;
+ /*!
+ * Variance of column motion vectors.
+ */
+ double MVcv;
+ /*!
+ * Value in range [-1,1] indicating fraction of row and column motion vectors
+ * that point inwards (negative MV value) or outwards (positive MV value).
+ * For example, a value of 1 indicates that all row/column MVs are inwards.
+ */
+ double mv_in_out_count;
+ /*!
+ * Count of unique non-zero motion vectors.
+ */
+ double new_mv_count;
+ /*!
+ * Duration of the frame / collection of frames.
+ */
+ double duration;
+ /*!
+ * 1.0 if stats are for a single frame, OR
+ * Number of frames in this collection for which the stats are accumulated.
+ */
+ double count;
+ /*!
+ * standard deviation for (0, 0) motion prediction error
+ */
+ double raw_error_stdev;
+ /*!
+ * Whether the frame contains a flash
+ */
+ int64_t is_flash;
+ /*!
+ * Estimated noise variance
+ */
+ double noise_var;
+ /*!
+ * Correlation coefficient with the previous frame
+ */
+ double cor_coeff;
+ /*!
+ * log of intra_error
+ */
+ double log_intra_error;
+ /*!
+ * log of coded_error
+ */
+ double log_coded_error;
+} FIRSTPASS_STATS;
+
+// We want to keep one past stats for key frame detection
+// in test_candidate_kf()
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
+
+// The size of static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+ (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief Data structure used for managing first pass stats
+ */
+typedef struct {
+ /*!
+ * A static buffer that will be used when no ext_stats_buf is assigned. The
+ * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+ * already has a pre-existing firstpass stats that is stored in an external
+ * buffer. The ext_stats_buf is usually used in two pass mode. When using one
+ * pass mode, we generate "firstpass" stats and encode the video in the same
+ * pass. In this scenario, the stats will be pushed and popped from
+ * static_stats_buf.
+ */
+ FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+ /*!
+ * A pointer to first pass stats.
+ * Note that this buffer will be used as ring buffer.
+ */
+ FIRSTPASS_STATS *stats_buf;
+ /*!
+ * size of stats_buf
+ */
+ int stats_buf_size;
+ /*!
+ * start index of the available frame stats
+ * Note that start_index doesn't always point to
+ * current frame's stats because we need to
+ * keep past stats as well. To access current
+ * frame's stats, please use cur_index.
+ */
+ int start_index;
+
+ /*!
+ * count available stats stored in stats_buf
+ * the following condition should stay true
+ * stats_count = future_stats_count + past_stats_count
+ */
+ int stats_count;
+
+ /*!
+ * index of the current frame's stats
+ */
+ int cur_index;
+
+ /*!
+ * count available future stats including current stats
+ */
+ int future_stats_count;
+
+ /*!
+ * count available past stats EXCLUDING current stats
+ */
+ int past_stats_count;
+
+ /*!
+ * Accumulation of the stats being pushed into firstpass_info
+ */
+ FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during encoding
+ * process.
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in] ext_stats_buf external stats buffer. Pass in NULL if
+ * choose to use internal static_stats_buf.
+ * \param[in] ext_stats_buf_size external stats buffer size. Pass in 0 if
+ * choose to use internal static_stats_buf. \return status
+ */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+ FIRSTPASS_STATS *ext_stats_buf,
+ int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in] input_stats input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+ const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats from firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ * offset_from_cur is out of range.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+ const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur target stats's index offset
+ * from cur_index.
+ * \return Number of stats in the future after the target stats
+ * including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur target stats's index offset
+ * from cur_index.
+ * \return Number of stats in the past before the target stats
+ * excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\cond */
+#define FC_ANIMATION_THRESH 0.15
+enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
+/*!\endcond */
+
+/*!
+ * \brief Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
+typedef struct GF_GROUP {
+ /*!\cond */
+ // Frame update type, e.g. ARF/GF/LF/Overlay
+ FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
+ unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // The number of frames displayed so far within the GOP at a given coding
+ // frame.
+ unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
+ int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
+ int max_layer_depth;
+ int max_layer_depth_allowed;
+ // This is currently only populated for AOM_Q mode
+ int q_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
+ // The frame coding type - inter/intra frame
+ FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+ // The reference frame buffer control - update or reset
+ REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+ int arf_index; // the index in the gf group of ARF, if no arf, then -1
+ int size; // The total length of a GOP
+
+ // The offset into lookahead_ctx for choosing
+ // source of frame parallel encodes.
+ int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // Stores the display order hint of each frame in the current GF_GROUP.
+ int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // The reference frame list maps the reference frame indexes to its
+ // buffer index in the decoded buffer. A value of -1 means the
+ // corresponding reference frame index doesn't point towards any
+ // previously decoded frame.
+ int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Update frame index
+ int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ // The map_idx of primary reference
+ int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Indicates the level of parallelism in frame parallel encodes.
+ // 0 : frame is independently encoded (not part of parallel encodes).
+ // 1 : frame is the first in encode order in a given parallel encode set.
+ // 2 : frame occurs later in encode order in a given parallel encode set.
+ int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame should act as non-reference frame.
+ bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame is dropped.
+ bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Stores the display order hint of the frames not to be
+ // refreshed by the current frame.
+ int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Stores the display order hint of the frame to be excluded during reference
+ // assignment.
+ int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ /*!\endcond */
+} GF_GROUP;
+/*!\cond */
+
+typedef struct {
+ // Track if the last frame in a GOP has higher quality.
+ int arf_gf_boost_lst;
+} GF_STATE;
+
+typedef struct {
+ FIRSTPASS_STATS *stats_in_start;
+ FIRSTPASS_STATS *stats_in_end;
+ FIRSTPASS_STATS *stats_in_buf_end;
+ FIRSTPASS_STATS *total_stats;
+ FIRSTPASS_STATS *total_left_stats;
+} STATS_BUFFER_CTX;
+
+/*!\endcond */
+
+/*!
+ * \brief Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ unsigned int section_intra_rating;
+ // Circular queue of first pass stats stored for most recent frames.
+ // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
+ // here.
+ FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
+ int frame_stats_next_idx; // Index to next unused element in frame_stats_arr.
+ STATS_BUFFER_CTX *stats_buf_ctx;
+ FIRSTPASS_INFO firstpass_info; // This is the first pass data structure
+ // intended to replace stats_in
+ int first_pass_done;
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ double kf_group_error_left;
+
+ // Over time correction for bits per macro block estimation
+ double bpm_factor;
+
+ // Record of target and actual bits spent in current ARF group
+ int rolling_arf_group_target_bits;
+ int rolling_arf_group_actual_bits;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int extend_minq;
+ int extend_maxq;
+ /*!\endcond */
+} TWO_PASS;
+
+/*!
+ * \brief Frame level Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ const FIRSTPASS_STATS *stats_in;
+ // Pointer to the stats of the current frame.
+ const FIRSTPASS_STATS *this_frame;
+ double mb_av_energy;
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+ double frame_avg_haar_energy;
+ /*!\endcond */
+} TWO_PASS_FRAME;
+
+/*!\cond */
+
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+ // Intra prediction error.
+ int64_t intra_error;
+ // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ int64_t frame_avg_wavelet_energy;
+ // Best of intra pred error and inter pred error using last frame as ref.
+ int64_t coded_error;
+ // Best of intra pred error and inter pred error using golden frame as ref.
+ int64_t sr_coded_error;
+ // Count of motion vector.
+ int mv_count;
+ // Count of blocks that pick inter prediction (inter pred error is smaller
+ // than intra pred error).
+ int inter_count;
+ // Count of blocks that pick second ref (golden frame).
+ int second_ref_count;
+ // Count of blocks where the inter and intra are very close and very low.
+ double neutral_count;
+ // Count of blocks where intra error is very small.
+ int intra_skip_count;
+ // Start row.
+ int image_data_start_row;
+ // Count of unique non-zero motion vectors.
+ int new_mv_count;
+ // Sum of inward motion vectors.
+ int sum_in_vectors;
+ // Sum of motion vector row.
+ int sum_mvr;
+ // Sum of motion vector column.
+ int sum_mvc;
+ // Sum of absolute value of motion vector row.
+ int sum_mvr_abs;
+ // Sum of absolute value of motion vector column.
+ int sum_mvc_abs;
+ // Sum of the square of motion vector row.
+ int64_t sum_mvrs;
+ // Sum of the square of motion vector column.
+ int64_t sum_mvcs;
+ // A factor calculated using intra pred error.
+ double intra_factor;
+ // A factor that measures brightness.
+ double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+ // Buffer holding frame stats for all MACROBLOCKs.
+ // mb_stats[i] stores the FRAME_STATS of the ith
+ // MB in raster scan order.
+ FRAME_STATS *mb_stats;
+ // Buffer to store the prediction error of the (0,0) motion
+ // vector using the last source frame as the reference.
+ // raw_motion_err_list[i] stores the raw_motion_err of
+ // the ith MB in raster scan order.
+ int *raw_motion_err_list;
+} FirstPassData;
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
+struct TileDataEnc;
+
+// Returns non-zero when the frame's average wavelet energy is negative.
+static INLINE int is_fp_wavelet_energy_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  assert(fp_stats != NULL);
+  const double wavelet_energy = fp_stats->frame_avg_wavelet_energy;
+  return wavelet_energy < 0;
+}
+
+// First pass unit size: 8x8 for screen content, 16x16 otherwise.
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+  if (is_screen_content_type) return BLOCK_8X8;
+  return BLOCK_16X16;
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ struct TileDataEnc *tile_data, const int mb_row,
+ const BLOCK_SIZE fp_block_size);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function is the first encoding pass for the two pass encoding mode.
+ * It encodes the whole video and collects essential information.
+ * Two pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high performance encoding. The first pass is a fast encoding
+ * process to collect essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats are used
+ * in rate control, for example, to determine frame cut, the position of
+ * alternative reference frame (ARF), etc.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] ts_duration Duration of the frame / collection of frames
+ *
+ * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+
+void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..73910de121
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+/* clang-format off */
+// Error metric used for global motion evaluation.
+// For 8-bit input, the pixel error used to index this table will always
+// be between -255 and +255. But for 10- and 12-bit input, we use interpolation
+// which means that we need to support indices of -256 and +256 as well.
+// Therefore, the table is offset so that logical index 0 corresponds to
+// error_measure_lut[256].
+const int error_measure_lut[513] = {
+ // pow 0.7
+ 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113,
+ 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749,
+ 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381,
+ 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010,
+ 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634,
+ 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254,
+ 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870,
+ 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481,
+ 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088,
+ 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689,
+ 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285,
+ 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875,
+ 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458,
+ 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036,
+ 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606,
+ 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168,
+ 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723,
+ 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268,
+ 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804,
+ 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329,
+ 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842,
+ 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341,
+ 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826,
+ 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293,
+ 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741,
+ 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164,
+ 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558,
+ 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916,
+ 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224,
+ 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461,
+ 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577,
+ 1452, 1323, 1187, 1045, 894, 731, 550, 339,
+ 0, 339, 550, 731, 894, 1045, 1187, 1323,
+ 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255,
+ 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041,
+ 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748,
+ 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401,
+ 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015,
+ 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599,
+ 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157,
+ 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695,
+ 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214,
+ 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718,
+ 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208,
+ 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686,
+ 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153,
+ 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610,
+ 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058,
+ 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
+ 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
+ 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
+ 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
+ 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
+ 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
+ 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
+ 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
+ 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
+ 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
+ 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
+ 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
+ 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
+ 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
+ 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
+ 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
+ 16384,
+};
+/* clang-format on */
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+ if (!(best_erroradvantage < erroradv_tr)) return 0; // ratio test failed
+ return best_erroradvantage * params_cost < erroradv_prod_tr; // cost-weighted
+}
+
+static void convert_to_params(const double *params, int32_t *model) { // quantize params[0..5] into fixed-point model[0..5]
+ int i;
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); // scale, then round to nearest
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * // clamp first, then apply the decode factor
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < 6; ++i) {
+ const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0); // entries 2 and 5 are clamped relative to 1.0
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+ model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; // restore the 1.0 offset before rescaling
+ }
+}
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model) {
+ convert_to_params(params, model->wmmat); // fill the fixed-point matrix first
+ model->wmtype = get_wmtype(model); // called after wmmat is written; presumably classifies the model from it (get_wmtype is defined elsewhere)
+ model->invalid = 0;
+}
+
+// Adds 'offset' to the global motion parameter with index 'param_index' and
+// handles all of the necessary precision shifts, clamping, and
+// zero-centering. Returns the updated parameter at warped-model precision.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF };
+ const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX };
+ // type of param: 0 - translation, 1 - affine
+ const int param_type = (param_index < 2 ? 0 : 1);
+ const int is_one_centered = (param_index == 2 || param_index == 5);
+
+ // Make parameter zero-centered and offset the shift that was done to make
+ // it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[param_type];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[param_type],
+ clamp_vals[param_type]);
+ // Rescale the parameter to WARPEDMODEL_PREC_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[param_type]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
+
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { // overwrite the wmmat entries that 'wmtype' constrains
+ switch (wmtype) {
+ case IDENTITY: // zero entries 0-1, then apply the TRANSLATION constraints too
+ wm->wmmat[0] = 0;
+ wm->wmmat[1] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION: // entries 2-3 become (1.0, 0) in WARPEDMODEL_PREC_BITS fixed point
+ wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ wm->wmmat[3] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM: // entries 4-5 are derived from entries 3 and 2
+ wm->wmmat[4] = -wm->wmmat[3];
+ wm->wmmat[5] = wm->wmmat[2];
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: break; // no entry is overwritten for AFFINE
+ default: assert(0);
+ }
+ wm->wmtype = wmtype;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches no larger than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0; // sum of absolute differences over the patch
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32 // the full-block path below hard-codes a 32x32 SAD
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_segmented_frame_error(
+ const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
+ int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
+ int segment_map_stride) {
+ (void)bd; // bd is not used by this function
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0; // SAD accumulated over all selected blocks
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // clip the patch to p_width x p_height to avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else { // clipped (partial) patch: use the generic C loop
+ sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride,
+ patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32 // the full-block path below hard-codes a 32x32 SAD
+#error "Need to change SAD call size in highbd_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_warp_error(WarpedMotionParams *wm,
+ const uint16_t *const ref, int ref_width,
+ int ref_height, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_col, int p_row, int p_width,
+ int p_height, int subsampling_x,
+ int subsampling_y, int bd, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ int64_t gm_sumerr = 0; // SAD accumulated over all warped blocks so far
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); // holds one warped block
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ conv_params.use_dist_wtd_comp_avg = 0;
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // limit the block to the extent given by ref_width/ref_height, to avoid
+ // warping extra pixels in the padded region of the frame
+ const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
+ warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
+ subsampling_y, bd, &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else { // clipped (partial) block: use the generic C loop
+ gm_sumerr +=
+ generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX; // early exit: already worse than best_error
+ }
+ }
+ return gm_sumerr;
+}
+#endif
+
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches no larger than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0; // sum of absolute differences over the patch
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ sum_error +=
+ generic_sad(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32 // the full-block path below hard-codes a 32x32 SAD
+#error "Need to change SAD call size in warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+ int ref_width, int ref_height, int ref_stride,
+ const uint8_t *const dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height,
+ int subsampling_x, int subsampling_y,
+ int64_t best_error, uint8_t *segment_map,
+ int segment_map_stride) {
+ int64_t gm_sumerr = 0; // SAD accumulated over all warped blocks so far
+ int warp_w, warp_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); // holds one warped block
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
+ conv_params.use_dist_wtd_comp_avg = 0;
+
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // limit the block to the extent given by ref_width/ref_height, to avoid
+ // warping extra pixels in the padded region of the frame
+ warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w,
+ warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+ &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+ dst + j + i * dst_stride, dst_stride);
+ } else { // clipped (partial) block: use the generic C loop
+ gm_sumerr +=
+ generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX; // early exit: already worse than best_error
+ }
+ }
+ return gm_sumerr;
+}
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) { // dispatch to the 16-bit sample implementation
+ return highbd_segmented_frame_error(
+ CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_width, p_height, bd, segment_map, segment_map_stride);
+ }
+#endif
+ (void)use_hbd; // both are otherwise unused when CONFIG_AV1_HIGHBITDEPTH is off
+ (void)bd;
+ return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width,
+ p_height, segment_map, segment_map_stride);
+}
+
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ if (!av1_get_shear_params(wm)) return INT64_MAX; // av1_get_shear_params() reported failure: return the worst possible error
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) // dispatch to the 16-bit sample implementation
+ return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width,
+ ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_col, p_row, p_width, p_height,
+ subsampling_x, subsampling_y, bd, best_error,
+ segment_map, segment_map_stride);
+#endif
+ (void)use_hbd; // both are otherwise unused when CONFIG_AV1_HIGHBITDEPTH is off
+ (void)bd;
+ return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride,
+ p_col, p_row, p_width, p_height, subsampling_x,
+ subsampling_y, best_error, segment_map, segment_map_stride);
+}
+
+// Greedy per-parameter search around the integerized model in 'wm' (modified
+// in place). Returns the best warp error, or INT64_MAX on early rejection.
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) {
+ static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ int64_t step_error, best_error;
+ int32_t step, curr_param, best_param;
+ int32_t *param;
+
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_wmtype(wm);
+
+ if (n_refinements == 0) {
+ // Compute the maximum error value that will be accepted, so av1_warp_error
+ // can terminate early once the model cannot be accepted. llrint (not lrint)
+ // is used because 'long' is only 32 bits wide on LLP64 targets.
+ int64_t selection_threshold = (int64_t)llrint(ref_frame_error * erroradv_tr);
+ return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, selection_threshold, segment_map,
+ segment_map_stride);
+ }
+
+ // When refining, use a slightly higher threshold for the initial error
+ // calculation - see comment above erroradv_early_tr for why.
+ int64_t selection_threshold =
+ (int64_t)llrint(ref_frame_error * erroradv_early_tr);
+ best_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border, border,
+ d_width - 2 * border, d_height - 2 * border, 0, 0,
+ selection_threshold, segment_map, segment_map_stride);
+
+ if (best_error > selection_threshold) {
+ return INT64_MAX;
+ }
+
+ step = 1 << (n_refinements - 1); // step size is halved after every refinement pass
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ // Note: We have to use force_wmtype() to keep the proper symmetry for
+ // ROTZOOM type models
+ *param = add_param_offset(p, curr_param, -step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+
+ // look to the direction chosen above repeatedly until error increases
+ // for the biggest step size
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ step_dir = 0; // no strict improvement: stop walking
+ }
+ }
+
+ // Restore best parameter value so far
+ *param = best_param;
+ force_wmtype(wm, wmtype);
+ }
+ }
+
+ wm->wmtype = get_wmtype(wm);
+ return best_error;
+}
+
+#define FEAT_COUNT_TR 3 // min inliers for a block to be marked in the map
+#define SEG_COUNT_TR 48 // min marked blocks needed to keep the segmentation
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers) {
+ int seg_count = 0;
+ memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+ for (int i = 0; i < num_inliers; i++) { // inliers[] holds (x, y) pairs
+ const int seg_x = inliers[i * 2] >> WARP_ERROR_BLOCK_LOG;
+ const int seg_y = inliers[i * 2 + 1] >> WARP_ERROR_BLOCK_LOG;
+ uint8_t *const count = &segment_map[seg_y * width + seg_x];
+ // Saturate: a plain += 1 would wrap the uint8_t count to 0 at 256 inliers
+ if (*count < FEAT_COUNT_TR) ++*count;
+ }
+
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ uint8_t feat_count = segment_map[i * width + j];
+ segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR); // count -> 0/1 flag
+ seg_count += (segment_map[i * width + j]);
+ }
+ }
+
+ // If this motion does not make up a large enough portion of the frame,
+ // use the unsegmented version of the error metric
+ if (seg_count < SEG_COUNT_TR)
+ memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 0000000000..8c9c60f0f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+#define GM_MAX_REFINEMENT_STEPS 5
+#define MAX_DIRECTIONS 2
+
+// The structure holds a valid reference frame type and its temporal distance
+// from the source frame.
+typedef struct {
+ int distance; // temporal distance from the source frame
+ MV_REFERENCE_FRAME frame; // the reference frame type
+} FrameDistPair;
+
+typedef struct {
+ // Array of structure which holds the global motion parameters for a given
+ // motion model. motion_models[i] holds the parameters for a given motion
+ // model for the ith ransac motion.
+ MotionModel motion_models[RANSAC_NUM_MOTIONS];
+
+ // Pointer to hold inliers from motion model.
+ uint8_t *segment_map;
+} GlobalMotionData;
+
+typedef struct {
+ // Holds the mapping of each thread to past/future direction.
+ // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1)
+ // assigned to the ith thread.
+ int8_t thread_id_to_dir[MAX_NUM_THREADS];
+
+ // A flag which holds the early exit status based on the speed feature
+ // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed
+ // feature based early exit happens in the direction 'i'.
+ int8_t early_exit[MAX_DIRECTIONS];
+
+ // Counter for the next reference frame to be processed.
+ // next_frame_to_process[i] will hold the count of next reference frame to be
+ // processed in the direction 'i'.
+ int8_t next_frame_to_process[MAX_DIRECTIONS];
+} JobInfo;
+
+typedef struct {
+ // Data related to assigning jobs for global motion multi-threading.
+ JobInfo job_info;
+
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool gm_mt_exit;
+} AV1GlobalMotionSync;
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model);
+
+// Criteria for accepting a global motion model
+static const double erroradv_tr = 0.65;
+static const double erroradv_prod_tr = 20000;
+
+// Early exit threshold for global motion refinement
+// This is set slightly higher than erroradv_tr, as a compromise between
+// two factors:
+//
+// 1) By rejecting un-promising models early, we can reduce the encode time
+// spent trying to refine them
+//
+// 2) When we refine a model, its error may decrease to below the acceptance
+// threshold even if the model is initially above the threshold
+static const double erroradv_early_tr = 0.70;
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
+
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers);
+
+extern const int error_measure_lut[513];
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[256 + err]; // +256: the table is offset so that err == 0 maps to entry 256
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8; // number of bits of depth beyond 8
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+
+ // Split error into two parts and do an interpolated table lookup
+ // To compute the table index and interpolation value, we want to calculate
+ // the quotient and remainder of err / 2^b. But it is very important that
+ // the division must round down, and the remainder must be positive,
+ // ie. in the range [0, 2^b).
+ //
+ // In C, the >> and & operators do what we want, but the / and % operators
+ // give the wrong results for negative inputs. So we must use >> and & here.
+ //
+ // For example, if bd == 10 and err == -5, compare the results:
+ // (-5) >> 2 = -2, (-5) & 3 = 3
+ // vs. (-5) / 4 = -1, (-5) % 4 = -1
+ const int e1 = err >> b;
+ const int e2 = err & bmask;
+ return error_measure_lut[256 + e1] * (v - e2) +
+ error_measure_lut[257 + e1] * e2; // weights (v - e2) and e2 sum to v, so the result is scaled by 2^b
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/global_motion_facade.c b/third_party/aom/av1/encoder/global_motion_facade.c
new file mode 100644
index 0000000000..02a4e70ed3
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.c
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_writer.h"
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/pyramid.h"
+#include "av1/common/warped_motion.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/global_motion_facade.h"
+
+// Range of model types to search
+#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM
+#define LAST_GLOBAL_TRANS_TYPE ROTZOOM
+
+// Returns the cost of the warp parameters in 'gm', measured against 'ref_gm'.
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+ const WarpedMotionParams *ref_gm, int allow_hp) {
+ int params_cost = 0;  // sum of the aom_count_*() results below
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {  // cases fall through from AFFINE down to IDENTITY
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[2]
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,  // wmmat[2]/[5] get a (1 << GM_ALPHA_PREC_BITS) offset
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[3]
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {  // wmmat[4] and wmmat[5] are costed only here
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[4]
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[5]
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)  // TRANSLATION-only models use
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp  // their own bit count, reduced by
+ : GM_ABS_TRANS_BITS;  // one when allow_hp is 0
+ trans_prec_diff = (gm->wmtype == TRANSLATION)  // and their own shift,
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp  // increased by one when
+ : GM_TRANS_PREC_DIFF;  // allow_hp is 0
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[0]
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(  // wmmat[1]
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ AOM_FALLTHROUGH_INTENDED;
+ case IDENTITY: break;  // IDENTITY adds nothing to the cost
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);  // scaled by AV1_PROB_COST_SHIFT
+}
+
+// For the given reference frame, computes the global motion parameters for
+// different motion models and finds the best.
+static AOM_INLINE void compute_global_motion_for_ref_frame(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+ const int segment_map_h, const WarpedMotionParams *ref_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int src_width = cpi->source->y_crop_width;
+ int src_height = cpi->source->y_crop_height;
+ int src_stride = cpi->source->y_stride;
+ assert(ref_buf[frame] != NULL);
+ int bit_depth = cpi->common.seq_params->bit_depth;
+ GlobalMotionMethod global_motion_method = default_global_motion_method;
+ int num_refinements = cpi->sf.gm_sf.num_refinement_steps;
+ bool mem_alloc_failed = false;  // out-parameter of aom_compute_global_motion()
+
+ // Select the best model based on fractional error reduction.
+ // By initializing this to erroradv_tr, the same logic which is used to
+ // select the best model will automatically filter out any model which
+ // doesn't meet the required quality threshold
+ double best_erroradv = erroradv_tr;
+ for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE;
+ model <= LAST_GLOBAL_TRANS_TYPE; ++model) {
+ if (!aom_compute_global_motion(
+ model, cpi->source, ref_buf[frame], bit_depth, global_motion_method,
+ motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) {
+ if (mem_alloc_failed) {  // the failure was an allocation failure
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate global motion buffers");
+ }
+ continue;  // no candidates for this model type; try the next type
+ }
+
+ for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (motion_models[i].num_inliers == 0) continue;  // candidate has no inliers
+
+ WarpedMotionParams tmp_wm_params;
+ av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params);
+
+ // Skip models that we won't use (IDENTITY or TRANSLATION)
+ //
+ // For IDENTITY type models, we don't need to evaluate anything because
+ // all the following logic is effectively comparing the estimated model
+ // to an identity model.
+ //
+ // For TRANSLATION type global motion models, gm_get_motion_vector() gives
+ // the wrong motion vector (see comments in that function for details).
+ // As translation-type models do not give much gain, we can avoid this bug
+ // by never choosing a TRANSLATION type model
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ av1_compute_feature_segmentation_map(
+ segment_map, segment_map_w, segment_map_h, motion_models[i].inliers,
+ motion_models[i].num_inliers);
+
+ int64_t ref_frame_error = av1_segmented_frame_error(
+ is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+ ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride,
+ src_width, src_height, segment_map, segment_map_w);
+
+ if (ref_frame_error == 0) continue;  // also keeps the division below defined
+
+ const int64_t warp_error = av1_refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width,
+ ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, src_width, src_height, src_stride,
+ num_refinements, ref_frame_error, segment_map, segment_map_w);
+
+ // av1_refine_integerized_param() can return a simpler model type than
+ // its input, so re-check model type here
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ double erroradvantage = (double)warp_error / ref_frame_error;  // lower wins
+
+ if (erroradvantage < best_erroradv) {
+ best_erroradv = erroradvantage;
+ // Save the wm_params modified by
+ // av1_refine_integerized_param() rather than motion index to
+ // avoid rerunning refine() below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+
+ if (!av1_get_shear_params(&cm->global_motion[frame]))  // returned 0:
+ cm->global_motion[frame] = default_warp_params;  // fall back to the default
+
+#if 0
+ // We never choose translational models, so this code is disabled
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+#endif
+
+ if (cm->global_motion[frame].wmtype == IDENTITY) return;  // nothing to validate
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!av1_is_enough_erroradvantage(
+ best_erroradv,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->features.allow_high_precision_mv))) {
+ cm->global_motion[frame] = default_warp_params;
+ }
+}
+
+// Computes global motion for the given reference frame.
+void av1_compute_gm_for_valid_ref_frames(
+    AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+    MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+    int segment_map_h) {
+  // The reference parameters are the previous frame's model for the same
+  // reference when a previous frame exists, else default_warp_params.
+  const WarpedMotionParams *ref_params = &default_warp_params;
+  if (cpi->common.prev_frame)
+    ref_params = &cpi->common.prev_frame->global_motion[frame];
+  compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame,
+                                      motion_models, segment_map, segment_map_w,
+                                      segment_map_h, ref_params);
+}
+
+// Loops over valid reference frames and computes global motion estimation.
+static AOM_INLINE void compute_global_motion_for_references(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+    FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+    MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+    const int segment_map_h) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct aom_internal_error_info *const error_info =
+      cpi->td.mb.e_mbd.error_info;
+
+  // Visit the candidates of one direction in the order given by
+  // 'reference_frame', i.e. starting from the nearest reference frame.
+  for (int idx = 0; idx < num_ref_frames; ++idx) {
+    const int ref = reference_frame[idx].frame;
+    av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref,
+                                        motion_models, segment_map,
+                                        segment_map_w, segment_map_h);
+    // With prune_ref_frame_for_gm_search set, an INVALID/TRANSLATION/IDENTITY
+    // result for this reference ends the search: the remaining references in
+    // the same direction are not evaluated.
+    const int no_warp_model = cm->global_motion[ref].wmtype <= TRANSLATION;
+    if (no_warp_model && cpi->sf.gm_sf.prune_ref_frame_for_gm_search) break;
+  }
+}
+
+// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to
+// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise.
+static int compare_distance(const void *a, const void *b) {
+  // qsort() passes pointers to const elements; keep the qualifier on the casts.
+  const FrameDistPair *const lhs = (const FrameDistPair *)a;
+  const FrameDistPair *const rhs = (const FrameDistPair *)b;
+  // Compare directly instead of subtracting, so the result cannot overflow.
+  if (lhs->distance > rhs->distance) return 1;
+  if (lhs->distance < rhs->distance) return -1;
+  return 0;
+}
+
+static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) {
+  // The number of GM models is only checked in GF groups with ARF frames.
+  // GM parameter estimation is always done for GF groups with no ARF frames
+  // (flat GOPs), so the search is never disabled for them.
+  if (!(cpi->ppi->gf_group.arf_index > -1)) return 0;
+
+  // valid_gm_model_found[] is initialized to INT32_MAX at the beginning of
+  // every GF group. Therefore, GM parameter estimation is always done for all
+  // frames until at least one frame each of ARF_UPDATE, INTNL_ARF_UPDATE and
+  // LF_UPDATE has been encoded in the GF group. For subsequent frames,
+  // estimation is disabled if no valid model has been found for any of the
+  // three update types.
+  const int no_arf_model = cpi->ppi->valid_gm_model_found[ARF_UPDATE] == 0;
+  const int no_intnl_arf_model =
+      cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] == 0;
+  const int no_lf_model = cpi->ppi->valid_gm_model_found[LF_UPDATE] == 0;
+
+  // Non-zero means "skip the GM search for this frame".
+  return no_arf_model && no_intnl_arf_model && no_lf_model;
+}
+
+// Prunes reference frames for global motion estimation based on the speed
+// feature 'gm_search_type'.
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
+  // LAST2_FRAME and LAST3_FRAME are skipped by both reduced-reference modes.
+  const int is_last2_or_last3 = (frame == LAST2_FRAME || frame == LAST3_FRAME);
+
+  switch (sf->gm_sf.gm_search_type) {
+    case GM_DISABLE_SEARCH: return 0;
+    case GM_FULL_SEARCH:
+    case GM_SEARCH_CLOSEST_REFS_ONLY: return 1;
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: return !is_last2_or_last3;
+    case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+      return !(is_last2_or_last3 || frame == ALTREF2_FRAME);
+    default: assert(0);
+  }
+  return 1;  // Only reached for an unknown search type when asserts are off.
+}
+
+// Populates valid reference frames in past/future directions in
+// 'reference_frames' and their count in 'num_ref_frames'.
+static AOM_INLINE void update_valid_ref_frames_for_gm(
+ AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1],
+ int *num_ref_frames) {
+ AV1_COMMON *const cm = &cpi->common;
+ int *num_past_ref_frames = &num_ref_frames[0];  // entries in reference_frames[0][]
+ int *num_future_ref_frames = &num_ref_frames[1];  // entries in reference_frames[1][]
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index);
+ int cur_frame_gm_disabled = 0;  // stays 0 unless the stats-based check is on
+ int pyr_lvl = cm->cur_frame->pyramid_level;
+
+ if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) {
+ cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi);
+ }
+
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {  // ALTREF down to LAST
+ const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };  // for the pruning helper
+ RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+ ref_buf[frame] = NULL;  // every reference starts with no buffer and the
+ cm->global_motion[frame] = default_warp_params;  // default warp model
+ // Skip global motion estimation for invalid ref frames
+ if (buf == NULL ||
+ (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {  // disabled refs are kept under DISALLOW_RECODE
+ continue;
+ } else {
+ ref_buf[frame] = &buf->buf;
+ }
+
+ int prune_ref_frames =
+ ref_pruning_enabled &&
+ prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame,
+ cm->cur_frame->ref_display_order_hint);
+ int ref_pyr_lvl = buf->pyramid_level;
+
+ if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&  // same size as source,
+ ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+ do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames &&  // not pruned,
+ ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) {  // pyramid level not above current
+ assert(ref_buf[frame] != NULL);  // NOTE(review): already dereferenced above
+ const int relative_frame_dist = av1_encoder_get_relative_dist(
+ buf->display_order_hint, cm->cur_frame->display_order_hint);
+ // Populate past and future ref frames.
+ // reference_frames[0][] indicates past direction and
+ // reference_frames[1][] indicates future direction.
+ if (relative_frame_dist == 0) {
+ // Skip global motion estimation for frames at the same nominal instant.
+ // This will generally be either a "real" frame coded against a
+ // temporal filtered version, or a higher spatial layer coded against
+ // a lower spatial layer. In either case, the optimal motion model will
+ // be IDENTITY, so we don't need to search explicitly.
+ } else if (relative_frame_dist < 0) {  // negative distance: past reference
+ reference_frames[0][*num_past_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[0][*num_past_ref_frames].frame = frame;
+ (*num_past_ref_frames)++;
+ } else {  // positive distance: future reference
+ reference_frames[1][*num_future_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[1][*num_future_ref_frames].frame = frame;
+ (*num_future_ref_frames)++;
+ }
+ }
+ }
+}
+
+// Initializes parameters used for computing global motion.
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const YV12_BUFFER_CONFIG *const src = cpi->source;
+  int *const num_refs = gm_info->num_ref_frames;
+
+  // Segment map size: the cropped source dimensions plus WARP_ERROR_BLOCK - 1,
+  // shifted right by WARP_ERROR_BLOCK_LOG.
+  gm_info->segment_map_w =
+      (src->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+  gm_info->segment_map_h =
+      (src->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+
+  // Fill every reference_frames entry with 0xFF bytes and zero the counts
+  // before collecting the candidates for this frame.
+  memset(gm_info->reference_frames, -1,
+         sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS *
+             (REF_FRAMES - 1));
+  av1_zero(gm_info->num_ref_frames);
+
+  // Populate ref_buf and the per-direction candidate lists.
+  update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf,
+                                 gm_info->reference_frames, num_refs);
+
+  // Order each direction by ascending distance from the current frame:
+  // reference_frames[0] holds past references, reference_frames[1] future.
+  qsort(gm_info->reference_frames[0], num_refs[0],
+        sizeof(gm_info->reference_frames[0][0]), compare_distance);
+  qsort(gm_info->reference_frames[1], num_refs[1],
+        sizeof(gm_info->reference_frames[1][0]), compare_distance);
+
+  if (cpi->sf.gm_sf.gm_search_type != GM_SEARCH_CLOSEST_REFS_ONLY) return;
+
+  // Keep only the nearest two references, preferring one past plus one future
+  // reference over two past references even if the second past one is closer.
+  // Without any future reference, up to two past references are kept.
+  const int max_past_refs = (num_refs[1] > 0) ? 1 : 2;
+  num_refs[0] = AOMMIN(num_refs[0], max_past_refs);
+  num_refs[1] = AOMMIN(num_refs[1], 1);
+}
+
+// Computes global motion w.r.t. valid reference frames.
+static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  GlobalMotionData *const gm_data = &cpi->td.gm_data;
+  // Handle the past and the future reference lists in turn; a direction
+  // without any candidate reference frame is skipped.
+  for (int d = 0; d < MAX_DIRECTIONS; ++d) {
+    const int ref_count = gm_info->num_ref_frames[d];
+    if (ref_count <= 0) continue;
+    compute_global_motion_for_references(
+        cpi, gm_info->ref_buf, gm_info->reference_frames[d], ref_count,
+        gm_data->motion_models, gm_data->segment_map, gm_info->segment_map_w,
+        gm_info->segment_map_h);
+  }
+}
+
+// Computes global motion estimation for the current frame. This computation
+// happens once per frame and the winner motion model parameters are stored in
+// cm->cur_frame->global_motion.
+void av1_compute_global_motion_facade(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  GlobalMotionInfo *const gm_info = &cpi->gm_info;
+  const int gm_enabled = cpi->oxcf.tool_cfg.enable_global_motion != 0;
+  // Reset the per-update-type GM statistics at the first frame of a GF group.
+  if (gm_enabled && cpi->gf_frame_index == 0) {
+    for (int type = 0; type < FRAME_UPDATE_TYPES; ++type) {
+      cpi->ppi->valid_gm_model_found[type] = INT32_MAX;
+#if CONFIG_FPMT_TEST
+      if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+        cpi->ppi->temp_valid_gm_model_found[type] = INT32_MAX;
+#endif
+    }
+  }
+
+  const int search_needed =
+      cm->current_frame.frame_type == INTER_FRAME && cpi->source &&
+      gm_enabled && !gm_info->search_done &&
+      cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH;
+  if (search_needed) setup_global_motion_info_params(cpi);
+  // Terminate early if the total number of reference frames is zero.
+  if (search_needed &&
+      (gm_info->num_ref_frames[0] || gm_info->num_ref_frames[1])) {
+    gm_alloc_data(cpi, &cpi->td.gm_data);
+    if (cpi->mt_info.num_workers > 1)
+      av1_global_motion_estimation_mt(cpi);
+    else
+      global_motion_estimation(cpi);
+    gm_dealloc_data(&cpi->td.gm_data);
+    gm_info->search_done = 1;
+  }
+  memcpy(cm->cur_frame->global_motion, cm->global_motion,
+         sizeof(cm->cur_frame->global_motion));
+}
diff --git a/third_party/aom/av1/encoder/global_motion_facade.h b/third_party/aom/av1/encoder/global_motion_facade.h
new file mode 100644
index 0000000000..f13989aa25
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Allocates memory for members of GlobalMotionData.
+static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) {
+ AV1_COMMON *cm = &cpi->common;  // first argument of CHECK_MEM_ERROR below
+ GlobalMotionInfo *gm_info = &cpi->gm_info;  // supplies segment_map_w / segment_map_h
+
+ CHECK_MEM_ERROR(cm, gm_data->segment_map,  // segment_map_w * segment_map_h elements
+ aom_malloc(sizeof(*gm_data->segment_map) *
+ gm_info->segment_map_w * gm_info->segment_map_h));
+
+ av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS);  // clear all models first
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {  // one inlier buffer per model
+ CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers,  // 2 * MAX_CORNERS elements
+ aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 *
+ MAX_CORNERS));
+ }
+}  // NOTE(review): CHECK_MEM_ERROR is defined elsewhere; confirm how it reports failure
+
+// Deallocates the memory allocated for members of GlobalMotionData.
+static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) {
+  for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+    aom_free(gm_data->motion_models[i].inliers);
+    gm_data->motion_models[i].inliers = NULL;  // leave no dangling pointer
+  }
+  aom_free(gm_data->segment_map);
+  gm_data->segment_map = NULL;
+}
+
+void av1_compute_gm_for_valid_ref_frames(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+ int segment_map_h);
+void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
diff --git a/third_party/aom/av1/encoder/gop_structure.c b/third_party/aom/av1/encoder/gop_structure.c
new file mode 100644
index 0000000000..5078098450
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.c
@@ -0,0 +1,867 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+
+// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based
+// on the value of parallel_frame_count.
+static void set_frame_parallel_level(int *frame_parallel_level,
+                                     int *parallel_frame_count,
+                                     int max_parallel_frames) {
+  const int count = *parallel_frame_count;
+  assert(count > 0);
+  // A count of 1 marks the first frame of a parallel encode set (level 1);
+  // any larger count marks a subsequent frame of the same set (level 2).
+  *frame_parallel_level = (count > 1) ? 2 : 1;
+  // Advance the count, wrapping to 1 once it exceeds max_parallel_frames.
+  *parallel_frame_count = (count + 1 > max_parallel_frames) ? 1 : count + 1;
+}
+
+// This function sets gf_group->src_offset based on frame_parallel_level.
+// Outputs are gf_group->src_offset and first_frame_index
+static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index,
+                           int cur_frame_idx, int frame_ind) {
+  const int level = gf_group->frame_parallel_level[frame_ind];
+  // Frames with a parallel level of 0 or less keep their src_offset unchanged.
+  if (!(level > 0)) return;
+
+  // A level-1 frame starts a parallel encode set, so it becomes the frame
+  // that later offsets in the set are measured from.
+  if (level == 1) *first_frame_index = cur_frame_idx;
+
+  // Offset of this frame in the lookahead queue: its own position (current
+  // frame index plus ARF source offset) minus the position of the first frame
+  // of the parallel encode set.
+  gf_group->src_offset[frame_ind] =
+      cur_frame_idx + gf_group->arf_src_offset[frame_ind] - *first_frame_index;
+}
+
+// Sets the GF_GROUP params for LF_UPDATE frames.
+static AOM_INLINE void set_params_for_leaf_frames(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
+    int layer_depth, int start, int end) {
+  const int idx = *frame_ind;
+  // A leaf is an INTER_FRAME at layer depth MAX_ARF_LAYERS with no ARF offset.
+  gf_group->update_type[idx] = LF_UPDATE;
+  gf_group->frame_type[idx] = INTER_FRAME;
+  gf_group->refbuf_state[idx] = REFBUF_UPDATE;
+  gf_group->layer_depth[idx] = MAX_ARF_LAYERS;
+  gf_group->arf_src_offset[idx] = 0;
+  gf_group->cur_frame_idx[idx] = *cur_frame_idx;
+  gf_group->display_idx[idx] = *cur_disp_index;
+  gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
+  gf_group->arf_boost[idx] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                         end - start, 0, NULL, NULL, 0);
+  if (do_frame_parallel_encode) {
+    // Assign the parallel level; LF_UPDATE frames are non-reference frames.
+    set_frame_parallel_level(&gf_group->frame_parallel_level[idx],
+                             parallel_frame_count, max_parallel_frames);
+    gf_group->is_frame_non_ref[idx] = true;
+  }
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, idx);
+  // Advance the GF_GROUP slot, the coding index and the display index.
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+  ++(*cur_disp_index);
+}
+
+// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
+static AOM_INLINE void set_params_for_intnl_overlay_frames(
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *first_frame_index, int *cur_disp_index, int layer_depth) {
+  const int idx = *frame_ind;
+  // The overlay is an INTER_FRAME at 'layer_depth' with no ARF source offset.
+  gf_group->update_type[idx] = INTNL_OVERLAY_UPDATE;
+  gf_group->frame_type[idx] = INTER_FRAME;
+  gf_group->refbuf_state[idx] = REFBUF_UPDATE;
+  gf_group->layer_depth[idx] = layer_depth;
+  gf_group->arf_src_offset[idx] = 0;
+  gf_group->cur_frame_idx[idx] = *cur_frame_idx;
+  gf_group->display_idx[idx] = (*cur_disp_index)++;
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, idx);
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
+static AOM_INLINE void set_params_for_internal_arfs(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *parallel_frame_count, int max_parallel_frames,
+ int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
+ int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+ int f_frames, int b_frames) {
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =  // current display index + ARF offset
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+ f_frames, b_frames, NULL, NULL, 0);
+
+ if (do_frame_parallel_encode) {
+ if (depth_thr != INT_MAX) {  // INT_MAX: parallel levels are not set per layer
+ assert(depth_thr == 3 || depth_thr == 4);
+ assert(IMPLIES(depth_thr == 3, layer_depth == 4));
+ assert(IMPLIES(depth_thr == 4, layer_depth == 5));
+ // Set frame_parallel_level of the first frame in the given layer to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {  // previous entry
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+ // Set frame_parallel_level of the consecutive frame in the same given
+ // layer to 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ // Store the display order hints of the past 2 INTNL_ARF_UPDATE
+ // frames which would not have been displayed at the time of the encode
+ // of current frame.
+ gf_group->skip_frame_refresh[*frame_ind][0] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ gf_group->skip_frame_refresh[*frame_ind][1] =
+ gf_group->display_idx[(*frame_ind) - 2];
+ // Set the display_idx of frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;  // reset whether or not level 2 was assigned
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);  // note: *cur_frame_idx and *cur_disp_idx are not advanced here
+}
+
+// Recursively sets GF_GROUP parameters for positions 'start' .. 'end' - 1.
+static void set_multi_layer_params_for_fp(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,  // 'rc' is only forwarded
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+ // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {  // one leaf frame per position in [start, end)
+ set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+ gf_group, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth, start, end);
+ ++start;
+ }
+ } else {
+ const int m = (start + end - 1) / 2;  // middle position becomes the internal ARF
+
+ // Internal ARF.
+ int arf_src_offset = m - start;
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+ layer_depth, arf_src_offset, m, end - m, m - start);
+
+ // If encode reordering is enabled, configure the multi-layers accordingly
+ // and return. For e.g., the encode order for gf-interval 16 after
+ // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10->
+ // 14-> 9-> 11-> 13-> 15.
+ if (layer_depth >= depth_thr) {
+ int m1 = (m + start - 1) / 2;  // middle of the part before m
+ int m2 = (m + 1 + end) / 2;  // middle of the part after m
+ int arf_src_offsets[2] = { m1 - start, m2 - start };
+ // Parameters to compute arf_boost.
+ int offset[2] = { m1, m2 };
+ int f_frames[2] = { m - m1, end - m2 };
+ int b_frames[2] = { m1 - start, m2 - (m + 1) };
+
+ // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
+ for (int i = 0; i < 2; i++) {
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr,
+ cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+ f_frames[i], b_frames[i]);
+ }
+
+ // Initialize the start and end indices to configure LF_UPDATE frames.
+ int start_idx[4] = { start, m1 + 1, m + 1, end - 1 };
+ int end_idx[4] = { m1, m, m2, end };
+ int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth,  // overlay depth after
+ layer_depth + 1, INVALID_IDX };  // each range; none after the last
+
+ // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE
+ // frames after reordering.
+ for (int i = 0; i < 4; i++) {
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+ start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+ first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
+ if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
+ set_params_for_intnl_overlay_frames(
+ gf_group, cur_frame_idx, frame_ind, first_frame_index,
+ cur_disp_idx, layer_depth_for_intnl_overlay[i]);
+ }
+ return;  // the reordered path has handled all of [start, end)
+ }
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+
+ // Overlay for internal ARF.
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_idx,
+ layer_depth);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+ }
+}
+
+// Bookkeeping entry for one ARF node: the 'start' and 'end' indices stored
+// with it and its display index. Used to configure INTNL_ARF_UPDATE frames.
+typedef struct {
+ int start;  // Lower index of the frame range recorded for this node.
+ int end;  // Upper index of the frame range recorded for this node.
+ int display_index;  // Display-order index of the node itself.
+} FRAME_REORDER_INFO;
+
+// Stores start, end and display_idx in arf_frame_stats[arf_frame_index].
+static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats,
+ int arf_frame_index,
+ int display_idx, int start,
+ int end) {
+ arf_frame_stats[arf_frame_index].start = start;
+ arf_frame_stats[arf_frame_index].end = end;
+ arf_frame_stats[arf_frame_index].display_index = display_idx;
+}
+
+// Sets GF_GROUP params for the INTNL_ARF_UPDATE frame at the midpoint of
+// 'start' and 'end'. Also populates doh_gf_index_map and arf_frame_stats.
+static AOM_INLINE void set_params_for_internal_arfs_in_gf14(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int start, int end,
+ int layer_depth, int layer_with_parallel_encodes) {
+ int index = (start + end - 1) / 2;  // Midpoint used as this frame's index.
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = index - 1;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+
+ // Record the gf index of this frame in the slot for its display index.
+ doh_gf_index_map[index] = *frame_ind;
+ if (layer_with_parallel_encodes) {
+ assert(layer_depth == 4);
+ // The previous gf_group entry is at a different layer depth, so this is
+ // the first frame of the layer: set its frame_parallel_level to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+ // The previous entry is in the same layer depth: mark this frame as
+ // frame_parallel_level 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ // Set the display_idx of frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ ++(*frame_ind);
+
+ // Append this frame's 'start', 'end' and index to arf_frame_stats.
+ fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end);
+ ++(*count_arf_frames);
+}
+
+// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer
+// depth.
+static AOM_INLINE void set_params_for_cur_layer_frames(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start,
+ int node_end, int layer_depth) {
+ assert(num_dir < 3);
+ int start, end;
+ // Iterate through the nodes in the previous layer depth.
+ for (int i = node_start; i < node_end; i++) {
+ // For each node, check if a frame can be coded as INTNL_ARF_UPDATE frame on
+ // either direction.
+ for (int dir = 0; dir < num_dir; dir++) {
+ // Checks for a frame to the left of current node.
+ if (dir == 0) {
+ start = arf_frame_stats[i].start;
+ end = arf_frame_stats[i].display_index;
+ } else {
+ // Checks for a frame to the right of current node.
+ start = arf_frame_stats[i].display_index + 1;
+ end = arf_frame_stats[i].end;
+ }
+ const int num_frames_to_process = end - start;
+ // Checks if a frame can be coded as INTNL_ARF_UPDATE frame. If
+ // num_frames_to_process is less than 3, then there are not enough frames
+ // between 'start' and 'end' to create another level.
+ if (num_frames_to_process >= 3) {
+ // Flag to indicate the lower layer depths for which parallel encoding
+ // is enabled. Currently enabled for layer 4 frames.
+ int layer_with_parallel_encodes = layer_depth == 4;
+ set_params_for_internal_arfs_in_gf14(
+ gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind,
+ count_arf_frames, doh_gf_index_map, start, end, layer_depth,
+ layer_with_parallel_encodes);
+ }
+ }
+ }
+}
+
+// Configures multi-layers of the GF_GROUP when consecutive encode of frames in
+// the same layer depth is enabled.
+static AOM_INLINE void set_multi_layer_params_for_gf14(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
+ int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
+ int *cur_disp_index, int gf_interval, int layer_depth,
+ int max_parallel_frames) {
+ assert(layer_depth == 2);
+ assert(gf_group->max_layer_depth_allowed >= 4);
+ int layer, node_start, node_end = 0;
+ // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only
+ // for gf-interval 14.
+ const int max_layer_depth = 4;
+ // Iterate through each layer depth starting from 2 till 'max_layer_depth'.
+ for (layer = layer_depth; layer <= max_layer_depth; layer++) {
+ // 'node_start' and 'node_end' indicate the number of nodes from the
+ // previous layer depth to be considered. It also corresponds to the indices
+ // of arf_frame_stats.
+ node_start = node_end;
+ node_end = (*count_arf_frames);
+ // 'num_dir' indicates the number of directions to traverse w.r.t. a given
+ // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would
+ // have only one frame and hence needs to traverse only in the left
+ // direction w.r.t the node in the previous layer.
+ int num_dir = layer == 2 ? 1 : 2;
+ set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx,
+ cur_disp_index, frame_ind, count_arf_frames,
+ doh_gf_index_map, num_dir, node_start,
+ node_end, layer);
+ }
+
+ for (int i = 1; i < gf_interval; i++) {
+ // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE
+ // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an
+ // LF_UPDATE frame.
+ if (doh_gf_index_map[i] == INVALID_IDX) {
+ // LF_UPDATE frames. Note: 'layer' is max_layer_depth + 1 at this point.
+ // TODO(Remya): Correct start and end parameters passed to
+ // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+ // is enabled for parallel encode of lower layer frames.
+ set_params_for_leaf_frames(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames, 1,
+ first_frame_index, cur_disp_index, layer, 0, 0);
+ } else {
+ // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+ // the gf index of corresponding INTNL_ARF_UPDATE frames.
+ int intnl_arf_index = doh_gf_index_map[i];
+ int ld = gf_group->layer_depth[intnl_arf_index];
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_index,
+ ld);
+ }
+ }
+}
+
+// Set parameters for frames with indices in the range ['start', 'end').
+static void set_multi_layer_params(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+ // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+ end - start, 0, NULL, NULL, 0);
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->max_layer_depth =
+ AOMMAX(gf_group->max_layer_depth, layer_depth);
+ // Set the level of parallelism for the LF_UPDATE frame.
+ if (do_frame_parallel_encode) {
+ set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+ parallel_frame_count, max_parallel_frames);
+ // Set LF_UPDATE frames as non-reference frames.
+ gf_group->is_frame_non_ref[*frame_ind] = true;
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+ ++start;
+ }
+ } else {
+ const int m = (start + end - 1) / 2;
+
+ // Internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - start;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] =
+ *cur_disp_idx + gf_group->arf_src_offset[*frame_ind];
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ if (do_frame_parallel_encode) {
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+ // Get the boost factor for intermediate ARF frames.
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m,
+ m - start, NULL, NULL, 0);
+ ++(*frame_ind);
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, start, m, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+
+ // Overlay for internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->arf_boost[*frame_ind] = 0;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, m + 1, end, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+ }
+}
+
+static int construct_multi_layer_gf_structure(
+ AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
+ RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval,
+ FRAME_UPDATE_TYPE first_frame_update_type) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ // TODO(angiebird): Why do we need "-1" here?
+ const int gf_interval = baseline_gf_interval - 1;
+ int frame_index = 0;
+ int cur_frame_index = 0;
+
+ // Set the display order hint for the first frame in the GF_GROUP.
+ int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+ ? 0
+ : cpi->common.current_frame.frame_number;
+
+ // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref,
+ // gf_group->src_offset and gf_group->is_frame_dropped with 0.
+ memset(gf_group->frame_parallel_level, 0,
+ sizeof(gf_group->frame_parallel_level));
+ memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref));
+ memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset));
+ memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped));
+ // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref
+ // with INVALID_IDX.
+ memset(gf_group->skip_frame_refresh, INVALID_IDX,
+ sizeof(gf_group->skip_frame_refresh));
+ memset(gf_group->skip_frame_as_ref, INVALID_IDX,
+ sizeof(gf_group->skip_frame_as_ref));
+
+ int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+ // This is a patch that fixes https://crbug.com/aomedia/3163
+ // enable_keyframe_filtering > 1 will introduce an extra overlay frame at
+ // key frame location. However when
+ // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't
+ // afford to have an extra overlay frame. Otherwise, the gf_group->size will
+ // become MAX_STATIC_GF_GROUP_LENGTH + 1, which causes memory error.
+ // A cheap solution is to turn off kf_decomp here.
+ // TODO(angiebird): Find a systematic way to solve this issue.
+ if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+ kf_decomp = 0;
+ }
+ if (first_frame_update_type == KF_UPDATE) {
+ gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ if (!kf_decomp) cur_disp_index++;
+ ++frame_index;
+
+ if (kf_decomp) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ cur_frame_index++;
+ }
+
+ if (first_frame_update_type == GF_UPDATE) {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ ++cur_frame_index;
+ }
+
+ // ALTREF.
+ const int use_altref = gf_group->max_layer_depth_allowed > 0;
+ int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 1;
+ gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+ gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 1;
+ gf_group->arf_index = frame_index;
+ gf_group->display_idx[frame_index] =
+ cur_disp_index + gf_group->arf_src_offset[frame_index];
+ ++frame_index;
+ } else {
+ gf_group->arf_index = -1;
+ }
+
+ // Flag to indicate if multi-layer configuration is complete.
+ int is_multi_layer_configured = 0;
+
+ // Running count of no. of frames that is part of a given parallel
+ // encode set in a gf_group. Value of 1 indicates no parallel encode.
+ int parallel_frame_count = 1;
+ // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+ // structure with minimum 4 layers.
+ int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+ gf_group->max_layer_depth_allowed >= 4);
+
+ int first_frame_index = cur_frame_index;
+ if (do_frame_parallel_encode) {
+ // construct_multi_layer_gf_structure() takes the input parameter
+ // 'gf_interval' as p_rc->baseline_gf_interval - 1 . Below code computes the
+ // actual GF_GROUP length by compensating for this offset.
+ int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+ (first_frame_update_type == GF_UPDATE))
+ ? gf_interval
+ : gf_interval + 1;
+
+ // In order to facilitate parallel encoding of frames in lower layer depths,
+ // encode reordering is done. Currently encode reordering is enabled only
+ // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+ // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+ // limitation on the number of hidden frames possible at any given point and
+ // hence the reordering is enabled only for gf-intervals 16 and 32.
+ // Disabling encode reordering for gf-interval 14 since some cross-frame
+ // dependencies related to temporal filtering for FPMT is currently not
+ // handled.
+ int disable_gf14_reorder = 1;
+ if (actual_gf_length == 14 && !disable_gf14_reorder) {
+ // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+ // corresponding to their display order hint. This is used while
+ // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+ int doh_gf_index_map[FIXED_GF_INTERVAL];
+ // Initialize doh_gf_index_map with INVALID_IDX.
+ memset(&doh_gf_index_map[0], INVALID_IDX,
+ (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+ FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+ // Store the stats corresponding to layer 1 frame.
+ fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+ actual_gf_length);
+ int count_arf_frames = 1;
+
+ // Sets multi-layer params for gf-interval 14 to consecutively encode
+ // frames in the same layer depth, i.e., encode order would be 0-> 14->
+ // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+ // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+ set_multi_layer_params_for_gf14(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+ arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+ doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+ &cur_disp_index, actual_gf_length, use_altref + 1,
+ cpi->ppi->num_fp_contexts);
+
+ // Set gf_group->skip_frame_refresh.
+ for (int i = 0; i < actual_gf_length; i++) {
+ int count = 0;
+ if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+ for (int j = 0; j < i; j++) {
+ // Store the display order hint of the frames which would not
+ // have been displayed at the encode call of frame 'i'.
+ if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+ gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+ gf_group->skip_frame_refresh[i][count++] =
+ gf_group->display_idx[j];
+ }
+ }
+ }
+ }
+ } else {
+ // Set layer depth threshold for reordering as per the gf length.
+ int depth_thr = (actual_gf_length == 16) ? 3
+ : (actual_gf_length == 32) ? 4
+ : INT_MAX;
+
+ set_multi_layer_params_for_fp(
+ twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+ cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index, depth_thr,
+ &cur_disp_index, use_altref + 1);
+ }
+ is_multi_layer_configured = 1;
+ }
+
+ // Rest of the frames.
+ if (!is_multi_layer_configured)
+ set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc,
+ frame_info, cur_frame_index, gf_interval,
+ &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index,
+ &cur_disp_index, use_altref + 1);
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] =
+ is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ ++frame_index;
+ } else {
+ for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+ frame_index);
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ }
+ if (do_frame_parallel_encode) {
+ // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+ // a frame is marked as frame_parallel_level 1 with no subsequent
+ // frame_parallel_level 2 frame(s).
+ int level1_frame_idx = INT_MAX;
+ int level2_frame_count = 0;
+ for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+ if (gf_group->frame_parallel_level[frame_idx] == 1) {
+ // Set frame_parallel_level to 0 if only one frame is present in a
+ // parallel encode set.
+ if (level1_frame_idx != INT_MAX && !level2_frame_count)
+ gf_group->frame_parallel_level[level1_frame_idx] = 0;
+ // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+ // count of frame_parallel_level 2 frames in the corresponding parallel
+ // encode set.
+ level1_frame_idx = frame_idx;
+ level2_frame_count = 0;
+ }
+ if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+ }
+ // If frame_parallel_level is set to 1 for the last LF_UPDATE
+ // frame in the gf_group, reset it to zero since there are no subsequent
+ // frames in the gf_group.
+ if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+ assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+ gf_group->frame_parallel_level[frame_index - 2] = 0;
+ }
+ }
+
+ for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+ ++gf_idx) {
+ gf_group->update_type[gf_idx] = LF_UPDATE;
+ gf_group->arf_src_offset[gf_idx] = 0;
+ gf_group->cur_frame_idx[gf_idx] = gf_idx;
+ gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+ gf_group->frame_type[gf_idx] = INTER_FRAME;
+ gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ }
+
+ return frame_index;  // Number of gf_group entries configured above.
+}
+
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+ int log_gop_length = 0;  // Smallest k such that (1 << k) >= gop_length.
+ while ((1 << log_gop_length) < gop_length) {
+ ++log_gop_length;
+ }
+
+ for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+ int count = 0;
+ // Count the trailing zero bits of gf_index, capped at MAX_ARF_LAYERS.
+ for (; count < MAX_ARF_LAYERS; ++count) {
+ if ((gf_index >> count) & 0x01) break;
+ }
+ gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+ }
+ gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ const int key_frame = rc->frames_since_key == 0;
+ FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;  // Default.
+
+ if (key_frame) {
+ first_frame_update_type = KF_UPDATE;
+ if (cpi->oxcf.kf_max_pyr_height != -1) {  // -1: keep the allowed depth.
+ gf_group->max_layer_depth_allowed = AOMMIN(
+ cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+ }
+ } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
+ first_frame_update_type = GF_UPDATE;
+ }
+
+ gf_group->size = construct_multi_layer_gf_structure(
+ cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
+ first_frame_update_type);
+
+ if (gf_group->max_layer_depth_allowed == 0)  // No ARF layers allowed.
+ set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
+
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&  // Key frame
+ gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;  // w/o reset.
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+ const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+ // TODO(angiebird): when gf_group->size == 32, it's possible to
+ // have two "second" arfs. Check if this is acceptable.
+ if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+ arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+ return 1;
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/gop_structure.h b/third_party/aom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000000..ff22f54136
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updating entries in
+ * cpi->ppi->gf_group.update_type[].
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ *
+ * \remark No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in] use_arf Are ARF frames enabled or is this a GF only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second arf
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is the second arf, otherwise return 0
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that the bit depth is derived from the bitstream and is
+ * not signaled in the film grain metadata. The parameters are valid
+ * for any bit depth.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+ /* Test 1 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 16, 0 },
+ { 25, 136 },
+ { 33, 144 },
+ { 41, 160 },
+ { 48, 168 },
+ { 56, 136 },
+ { 67, 128 },
+ { 82, 144 },
+ { 97, 152 },
+ { 113, 144 },
+ { 128, 176 },
+ { 143, 168 },
+ { 158, 176 },
+ { 178, 184 } },
+ 14 /* num_points_y */,
+ { { 16, 0 },
+ { 20, 64 },
+ { 28, 88 },
+ { 60, 104 },
+ { 90, 136 },
+ { 105, 160 },
+ { 134, 168 },
+ { 168, 208 } },
+ 8 /* num_cb_points */,
+ { { 16, 0 },
+ { 28, 96 },
+ { 56, 80 },
+ { 66, 96 },
+ { 80, 104 },
+ { 108, 96 },
+ { 122, 112 },
+ { 137, 112 },
+ { 169, 176 } },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 247 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 18 /* cb_offset */,
+ 229 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 54 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /* chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 2 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cb_points */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 3 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 192 }, { 255, 192 } },
+ 2 /* num_points_y */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cb_points */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 1 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 4 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 137 },
+ { 53, 146 },
+ { 63, 155 },
+ { 78, 155 },
+ { 107, 150 },
+ { 122, 147 },
+ { 136, 147 },
+ { 166, 153 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 72 },
+ { 27, 82 },
+ { 33, 91 },
+ { 69, 121 },
+ { 95, 143 },
+ { 108, 154 },
+ { 134, 169 },
+ { 147, 177 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 24, 95 },
+ { 54, 93 },
+ { 65, 94 },
+ { 79, 98 },
+ { 109, 107 },
+ { 124, 119 },
+ { 139, 136 },
+ { 169, 170 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 5 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 96 },
+ { 32, 90 },
+ { 64, 83 },
+ { 96, 76 },
+ { 128, 68 },
+ { 159, 59 },
+ { 191, 48 },
+ { 223, 34 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 34 },
+ { 64, 48 },
+ { 96, 59 },
+ { 128, 68 },
+ { 159, 76 },
+ { 191, 83 },
+ { 223, 90 },
+ { 255, 96 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2,
+ -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0,
+ },
+ {
+ -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+ 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1063 /* random_seed */
+ },
+ /* Test 6 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 96 },
+ { 20, 92 },
+ { 39, 88 },
+ { 59, 84 },
+ { 78, 80 },
+ { 98, 75 },
+ { 118, 70 },
+ { 137, 65 },
+ { 157, 60 },
+ { 177, 53 },
+ { 196, 46 },
+ { 216, 38 },
+ { 235, 27 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 2754 /* random_seed */
+ },
+ /* Test 7 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 27 },
+ { 39, 38 },
+ { 59, 46 },
+ { 78, 53 },
+ { 98, 60 },
+ { 118, 65 },
+ { 137, 70 },
+ { 157, 75 },
+ { 177, 80 },
+ { 196, 84 },
+ { 216, 88 },
+ { 235, 92 },
+ { 255, 96 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 8 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cb_points */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 9 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 10 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 11 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 48 },
+ { 32, 45 },
+ { 64, 42 },
+ { 96, 38 },
+ { 128, 34 },
+ { 159, 29 },
+ { 191, 24 },
+ { 223, 17 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 17 },
+ { 64, 24 },
+ { 96, 29 },
+ { 128, 34 },
+ { 159, 38 },
+ { 191, 42 },
+ { 223, 45 },
+ { 255, 48 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1357 /* random_seed */
+ },
+ /* Test 12 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 49 },
+ { 39, 69 },
+ { 46, 84 },
+ { 53, 91 },
+ { 63, 100 },
+ { 78, 114 },
+ { 92, 134 },
+ { 164, 139 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 31 },
+ { 26, 42 },
+ { 33, 54 },
+ { 40, 65 },
+ { 47, 72 },
+ { 56, 85 },
+ { 84, 123 },
+ { 152, 157 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 25, 14 },
+ { 39, 33 },
+ { 47, 40 },
+ { 54, 47 },
+ { 64, 62 },
+ { 79, 76 },
+ { 94, 83 },
+ { 167, 101 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 13 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 48 },
+ { 20, 46 },
+ { 39, 44 },
+ { 59, 42 },
+ { 78, 40 },
+ { 98, 38 },
+ { 118, 35 },
+ { 137, 33 },
+ { 157, 30 },
+ { 177, 27 },
+ { 196, 23 },
+ { 216, 19 },
+ { 235, 13 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 14 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 13 },
+ { 39, 19 },
+ { 59, 23 },
+ { 78, 27 },
+ { 98, 30 },
+ { 118, 33 },
+ { 137, 35 },
+ { 157, 38 },
+ { 177, 40 },
+ { 196, 42 },
+ { 216, 44 },
+ { 235, 46 },
+ { 255, 48 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 15 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 1 /* num_points_y */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cb_points */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+ { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+ { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 1 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 16 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 58, 126 },
+ { 87, 120 },
+ { 97, 122 },
+ { 112, 125 },
+ { 126, 131 },
+ { 141, 139 },
+ { 199, 153 },
+ },
+ 8 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 59, 68 },
+ { 66, 76 },
+ { 73, 82 },
+ { 79, 85 },
+ { 86, 86 },
+ { 151, 95 },
+ { 192, 101 },
+ },
+ 8 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 59, 64 },
+ { 89, 80 },
+ { 99, 86 },
+ { 114, 90 },
+ { 129, 93 },
+ { 144, 97 },
+ { 203, 85 },
+ },
+ 8 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 2 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+};
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 0000000000..8037b59bef
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+#include "config/av1_rtcd.h"
+
+// Feeds dataLength bytes from pData into the running CRC held in
+// p_crc_calculator->remainder, one byte per step, using the 256-entry table.
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+                                        uint8_t *pData, uint32_t dataLength) {
+  // Shift that brings the top byte of the `bits`-wide remainder down to
+  // bits 0..7 so it can be combined with the next input byte.
+  const uint32_t top_byte_shift = p_crc_calculator->bits - 8;
+  for (uint32_t n = 0; n != dataLength; ++n) {
+    const uint8_t index =
+        (uint8_t)((p_crc_calculator->remainder >> top_byte_shift) ^ pData[n]);
+    p_crc_calculator->remainder =
+        (p_crc_calculator->remainder << 8) ^ p_crc_calculator->table[index];
+  }
+}
+
+// Clears the running remainder so that a new CRC computation starts from 0.
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+  p_crc_calculator->remainder = 0u;
+}
+
+// Returns the current remainder restricted to the bits selected by
+// final_result_mask.
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+  const uint32_t result_mask = p_crc_calculator->final_result_mask;
+  return p_crc_calculator->remainder & result_mask;
+}
+
+// Fills p_crc_calculator->table with the CRC remainder of every byte value
+// (0..255) for the configured width (bits) and polynomial (trunc_poly).
+// Each byte is consumed most-significant bit first.
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+  // Shift an unsigned 1: with a signed literal, `1 << 31` (bits == 32) is
+  // undefined behavior.
+  const uint32_t high_bit = (uint32_t)1 << (p_crc_calculator->bits - 1);
+  const uint32_t byte_high_bit = (uint32_t)1 << (8 - 1);
+
+  for (uint32_t value = 0; value < 256; value++) {
+    uint32_t remainder = 0;
+    for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+      if (value & mask) {
+        remainder ^= high_bit;
+      }
+
+      // Shift the top bit out; fold in the polynomial when it was set.
+      const int top_bit_was_set = (remainder & high_bit) != 0;
+      remainder <<= 1;
+      if (top_bit_was_set) {
+        remainder ^= p_crc_calculator->trunc_poly;
+      }
+    }
+    p_crc_calculator->table[value] = remainder;
+  }
+}
+
+// Initializes a CRC calculator.
+//   p_crc_calculator: state to initialize (remainder, width, polynomial,
+//                     result mask and lookup table are all written).
+//   bits:             CRC width in bits.
+//   truncPoly:        generator polynomial, stored as trunc_poly.
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+                             uint32_t truncPoly) {
+  p_crc_calculator->remainder = 0;
+  p_crc_calculator->bits = bits;
+  p_crc_calculator->trunc_poly = truncPoly;
+  // Mask with the low `bits` bits set. Computed in unsigned arithmetic, with
+  // the full-width case handled explicitly, because `1 << bits` on a signed
+  // int is undefined for bits >= 31.
+  p_crc_calculator->final_result_mask =
+      (bits >= 32) ? ~(uint32_t)0 : (((uint32_t)1 << bits) - 1);
+  crc_calculator_init_table(p_crc_calculator);
+}
+
+// Computes the CRC of the `length` bytes at p from scratch: the calculator's
+// remainder is reset, the bytes are processed, and the masked result is
+// returned. `length` is converted to uint32_t for the byte loop.
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+                           int length) {
+  const uint32_t byte_count = (uint32_t)length;
+  crc_calculator_reset(p_crc_calculator);
+  crc_calculator_process_data(p_crc_calculator, p, byte_count);
+  return crc_calculator_get_crc(p_crc_calculator);
+}
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Build the eight 256-entry lookup tables for the software CRC-32C. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+  /* table[0][n]: result of pushing byte value n through eight single-bit
+     steps (shift right, XOR with POLY when the bit shifted out was 1). */
+  for (int n = 0; n < 256; n++) {
+    uint32_t byte_crc = (uint32_t)n;
+    for (int bit = 0; bit < 8; bit++) {
+      const uint32_t lsb_was_set = byte_crc & 1;
+      byte_crc >>= 1;
+      if (lsb_was_set) byte_crc ^= POLY;
+    }
+    p_crc32c->table[0][n] = byte_crc;
+  }
+  /* table[k][n] for k = 1..7: each entry is the previous table's entry
+     advanced by one more byte step through table[0]. */
+  for (int n = 0; n < 256; n++) {
+    uint32_t chained = p_crc32c->table[0][n];
+    for (int k = 1; k < 8; k++) {
+      chained = p_crc32c->table[0][chained & 0xff] ^ (chained >> 8);
+      p_crc32c->table[k][n] = chained;
+    }
+  }
+}
+
+/* Table-driven software version as a fall-back. This is about 15 times slower
+   than using the hardware instructions. This assumes little-endian integers,
+   as is the case on Intel processors that the assembler code here is for.
+
+   c   : pointer to an initialized CRC32C table set.
+   buf : input bytes.
+   len : number of input bytes.
+   Returns the CRC-32C of buf[0..len), using an initial value and a final XOR
+   of 0xffffffff. */
+uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) {
+  CRC32C *const tables = (CRC32C *)c;
+  const uint8_t *cursor = (const uint8_t *)(buf);
+  uint64_t crc = 0 ^ 0xffffffff;
+
+  /* Byte-wise until the read pointer is 8-byte aligned. */
+  for (; len && ((uintptr_t)cursor & 7) != 0; len--) {
+    crc = tables->table[0][(crc ^ *cursor++) & 0xff] ^ (crc >> 8);
+  }
+  /* Eight bytes per step, one table lookup per byte of the folded word. */
+  for (; len >= 8; cursor += 8, len -= 8) {
+    crc ^= *(uint64_t *)cursor;
+    crc = tables->table[7][crc & 0xff] ^
+          tables->table[6][(crc >> 8) & 0xff] ^
+          tables->table[5][(crc >> 16) & 0xff] ^
+          tables->table[4][(crc >> 24) & 0xff] ^
+          tables->table[3][(crc >> 32) & 0xff] ^
+          tables->table[2][(crc >> 40) & 0xff] ^
+          tables->table[1][(crc >> 48) & 0xff] ^ tables->table[0][crc >> 56];
+  }
+  /* Remaining tail bytes (fewer than eight). */
+  for (; len; len--) {
+    crc = tables->table[0][(crc ^ *cursor++) & 0xff] ^ (crc >> 8);
+  }
+  return (uint32_t)crc ^ 0xffffffff;
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 0000000000..d8e8cc3a0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {  // state for a table-driven CRC of configurable width
+ uint32_t remainder;  // running CRC state; set to 0 by av1_crc_calculator_init()
+ uint32_t trunc_poly;  // generator polynomial given to av1_crc_calculator_init()
+ uint32_t bits;  // CRC width in bits given to av1_crc_calculator_init()
+ uint32_t table[256];  // one precomputed remainder per input byte value
+ uint32_t final_result_mask;  // low `bits` bits set; applied to the remainder to form the result
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly);
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+ int length);
+
+// Lookup tables for the software CRC32C; POLY = 0x82f63b78.
+typedef struct _CRC32C {
+ /* Tables for a quadword-at-a-time software crc: 8 tables of 256 entries. */
+ uint32_t table[8][256];
+} CRC32C;
+
+// init table for software version crc32c
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 0000000000..8b04e22d6c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
+
+// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
+// If yes, fix this function
+//
+// Copies the 2x2 block of 8-bit pixels whose top-left sample is y_src (rows
+// are `stride` samples apart) into p_pixels_in1D in row-major order:
+// { row0[0], row0[1], row1[0], row1[1] }.
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+                                                     int stride,
+                                                     uint8_t *p_pixels_in1D) {
+  const uint8_t *const top_row = y_src;
+  const uint8_t *const bottom_row = y_src + stride;
+  p_pixels_in1D[0] = top_row[0];
+  p_pixels_in1D[1] = top_row[1];
+  p_pixels_in1D[2] = bottom_row[0];
+  p_pixels_in1D[3] = bottom_row[1];
+}
+
+// 16-bit counterpart of the 2x2 gather: copies the block whose top-left
+// sample is y_src (rows `stride` samples apart) into p_pixels_in1D as
+// { row0[0], row0[1], row1[0], row1[1] }.
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
+                                                      int stride,
+                                                      uint16_t *p_pixels_in1D) {
+  const uint16_t *const top_row = y_src;
+  const uint16_t *const bottom_row = y_src + stride;
+  p_pixels_in1D[0] = top_row[0];
+  p_pixels_in1D[1] = top_row[1];
+  p_pixels_in1D[2] = bottom_row[0];
+  p_pixels_in1D[3] = bottom_row[1];
+}
+
+// p holds a 2x2 block as { row0[0], row0[1], row1[0], row1[1] }. Returns 1
+// when the two samples within each row are equal, otherwise 0.
+static int is_block_2x2_row_same_value(const uint8_t *p) {
+  return p[0] == p[1] && p[2] == p[3];
+}
+
+// 16-bit variant: p holds a 2x2 block as { row0[0], row0[1], row1[0],
+// row1[1] }. Returns 1 when the two samples within each row are equal,
+// otherwise 0.
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
+  return p[0] == p[1] && p[2] == p[3];
+}
+
+// p holds a 2x2 block as { row0[0], row0[1], row1[0], row1[1] }. Returns 1
+// when the two samples within each column are equal, otherwise 0.
+static int is_block_2x2_col_same_value(const uint8_t *p) {
+  return (p[0] == p[2]) && (p[1] == p[3]);
+}
+
+// 16-bit variant: p holds a 2x2 block as { row0[0], row0[1], row1[0],
+// row1[1] }. Returns 1 when the two samples within each column are equal,
+// otherwise 0.
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
+  return (p[0] == p[2]) && (p[1] == p[3]);
+}
+
+// The hash value (hash_value1) consists of two parts: the first 3 bits relate
+// to the block size and the remaining 16 bits are the crc value. This
+// function is used to get the first 3 bits: it maps block sizes 4, 8, 16, 32,
+// 64 and 128 to 0..5 and returns -1 for any other size.
+static int hash_block_size_to_index(int block_size) {
+  static const int kSupportedSizes[] = { 4, 8, 16, 32, 64, 128 };
+  const int num_sizes =
+      (int)(sizeof(kSupportedSizes) / sizeof(kSupportedSizes[0]));
+  for (int idx = 0; idx < num_sizes; idx++) {
+    if (kSupportedSizes[idx] == block_size) return idx;
+  }
+  return -1;
+}
+
+// Prepares intrabc_hash_info for use: sets up the two 24-bit CRC calculators
+// the first time it is called (tracked by g_crc_initialized) and always
+// resets the lookup-table pointer to NULL.
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+  const int crc_setup_pending = !intrabc_hash_info->g_crc_initialized;
+  if (crc_setup_pending) {
+    CRC_CALCULATOR *const first = &intrabc_hash_info->crc_calculator1;
+    CRC_CALCULATOR *const second = &intrabc_hash_info->crc_calculator2;
+    av1_crc_calculator_init(first, 24, 0x5D6DCB);
+    av1_crc_calculator_init(second, 24, 0x864CFB);
+    intrabc_hash_info->g_crc_initialized = 1;
+  }
+  intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+// Destroys and frees every non-NULL bucket vector in the lookup table and
+// sets its slot back to NULL. The table array itself is kept. Does nothing
+// when the table has not been allocated.
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table != NULL) {
+    for (int addr = 0; addr < kMaxAddr; addr++) {
+      if (p_hash_table->p_lookup_table[addr] == NULL) continue;
+      aom_vector_destroy(p_hash_table->p_lookup_table[addr]);
+      aom_free(p_hash_table->p_lookup_table[addr]);
+      p_hash_table->p_lookup_table[addr] = NULL;
+    }
+  }
+}
+
+// Releases all bucket vectors, then the lookup-table array itself, and leaves
+// p_lookup_table set to NULL.
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+  hash_table *const table = p_hash_table;
+  av1_hash_table_clear_all(table);
+  aom_free(table->p_lookup_table);
+  table->p_lookup_table = NULL;
+}
+
+// Makes the hash table ready for a fresh set of entries.
+// If the lookup table already exists, its buckets are cleared and true is
+// returned. Otherwise a zero-initialized array of kMaxAddr bucket pointers is
+// allocated; returns false if that allocation fails.
+bool av1_hash_table_create(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table == NULL) {
+    p_hash_table->p_lookup_table = (Vector **)aom_calloc(
+        kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+    return p_hash_table->p_lookup_table != NULL;
+  }
+  av1_hash_table_clear_all(p_hash_table);
+  return true;
+}
+
+// Appends *curr_block_hash to the bucket selected by hash_value, creating the
+// bucket (a vector with an initial capacity request of 10 elements) on first
+// use. Returns false if the bucket allocation, the vector setup or the
+// push_back fails, and true otherwise.
+static bool hash_table_add_to_table(hash_table *p_hash_table,
+                                    uint32_t hash_value,
+                                    block_hash *curr_block_hash) {
+  const bool bucket_missing =
+      p_hash_table->p_lookup_table[hash_value] == NULL;
+  if (bucket_missing) {
+    p_hash_table->p_lookup_table[hash_value] =
+        aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
+    if (p_hash_table->p_lookup_table[hash_value] == NULL) return false;
+    if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+                         sizeof(curr_block_hash[0])) == VECTOR_ERROR) {
+      return false;
+    }
+  }
+  // Shared tail: the entry is pushed whether the bucket is new or not.
+  return aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+                              curr_block_hash) != VECTOR_ERROR;
+}
+
+// Returns the number of entries stored in the bucket for hash_value, or 0
+// when that bucket has not been created.
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+                             uint32_t hash_value) {
+  return (p_hash_table->p_lookup_table[hash_value] != NULL)
+             ? (int32_t)(p_hash_table->p_lookup_table[hash_value]->size)
+             : 0;
+}
+
+// Returns an iterator positioned at the first entry of the bucket for
+// hash_value. The bucket must be non-empty; this is checked with an assert
+// only.
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value) {
+  const int32_t entry_count = av1_hash_table_count(p_hash_table, hash_value);
+  assert(entry_count > 0);
+  (void)entry_count;  // unused when assertions are compiled out
+  return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]);
+}
+
+// Returns 1 if the bucket for hash_value1 contains an entry whose hash_value2
+// field equals hash_value2, and 0 otherwise (including when the bucket does
+// not exist).
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2) {
+  if (p_hash_table->p_lookup_table[hash_value1] == NULL) return 0;
+
+  Iterator cursor =
+      aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+  Iterator past_end =
+      aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
+  while (!aom_iterator_equals(&cursor, &past_end)) {
+    const block_hash *const candidate =
+        (block_hash *)aom_iterator_get(&cursor);
+    if (candidate->hash_value2 == hash_value2) return 1;
+    aom_iterator_increment(&cursor);
+  }
+  return 0;
+}
+
+// Computes, for every 2x2 luma block position inside the cropped picture,
+// two CRC hashes of its four samples and two flags:
+//   pic_block_hash[0][pos], pic_block_hash[1][pos]
+//       CRCs from crc_calculator1 and crc_calculator2 respectively.
+//   pic_block_same_info[0][pos], pic_block_same_info[1][pos]
+//       1 when each row / each column of the block holds equal samples.
+// pic_block_same_info[2] is not written here. `pos` advances by one per block
+// and by one extra at the end of each row of blocks. 16-bit samples are used
+// when the YV12_FLAG_HIGHBITDEPTH flag is set on the picture.
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info,
+                                       const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3]) {
+  const int blk_w = 2;
+  const int blk_h = 2;
+  const int num_cols = picture->y_crop_width - blk_w + 1;
+  const int num_rows = picture->y_crop_height - blk_h + 1;
+  CRC_CALCULATOR *const crc_a = &intrabc_hash_info->crc_calculator1;
+  CRC_CALCULATOR *const crc_b = &intrabc_hash_info->crc_calculator2;
+  const int use_highbitdepth =
+      (picture->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+  int pos = 0;
+  for (int row = 0; row < num_rows; row++, pos += blk_w - 1) {
+    for (int col = 0; col < num_cols; col++, pos++) {
+      if (use_highbitdepth) {
+        uint16_t samples[4];
+        get_pixels_in_1D_short_array_by_block_2x2(
+            CONVERT_TO_SHORTPTR(picture->y_buffer) + row * picture->y_stride +
+                col,
+            picture->y_stride, samples);
+        pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(samples);
+        pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(samples);
+
+        // The CRC runs over the raw bytes of the four 16-bit samples.
+        pic_block_hash[0][pos] =
+            av1_get_crc_value(crc_a, (uint8_t *)samples, sizeof(samples));
+        pic_block_hash[1][pos] =
+            av1_get_crc_value(crc_b, (uint8_t *)samples, sizeof(samples));
+      } else {
+        uint8_t samples[4];
+        get_pixels_in_1D_char_array_by_block_2x2(
+            picture->y_buffer + row * picture->y_stride + col,
+            picture->y_stride, samples);
+        pic_block_same_info[0][pos] = is_block_2x2_row_same_value(samples);
+        pic_block_same_info[1][pos] = is_block_2x2_col_same_value(samples);
+
+        pic_block_hash[0][pos] =
+            av1_get_crc_value(crc_a, samples, sizeof(samples));
+        pic_block_hash[1][pos] =
+            av1_get_crc_value(crc_b, samples, sizeof(samples));
+      }
+    }
+  }
+}
+
+void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info,  // supplies the two CRC calculators
+ const YV12_BUFFER_CONFIG *picture,  // only y_crop_width / y_crop_height are read
+ int block_size,  // side of the blocks being hashed
+ uint32_t *src_pic_block_hash[2],  // input: hashes of the (block_size / 2) blocks
+ uint32_t *dst_pic_block_hash[2],  // output: hashes of the block_size blocks
+ int8_t *src_pic_block_same_info[3],  // input flags; only [0] and [1] are read
+ int8_t *dst_pic_block_same_info[3]) {  // output flags; [2] is written only when block_size >= 4
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
+ const int pic_width = picture->y_crop_width;  // row pitch of the per-position arrays
+ const int x_end = picture->y_crop_width - block_size + 1;  // number of block positions per row
+ const int y_end = picture->y_crop_height - block_size + 1;  // number of block rows
+
+ const int src_size = block_size >> 1;  // side of the source (half-size) blocks
+ const int quad_size = block_size >> 2;  // quarter of the block side
+
+ uint32_t p[4];  // the four sub-block hashes fed to the CRC
+ const int length = sizeof(p);  // byte count of p
+
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ p[0] = src_pic_block_hash[0][pos];  // top-left sub-block
+ p[1] = src_pic_block_hash[0][pos + src_size];  // top-right sub-block
+ p[2] = src_pic_block_hash[0][pos + src_size * pic_width];  // bottom-left sub-block
+ p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];  // bottom-right sub-block
+ dst_pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)p, length);  // first hash: CRC of the four first hashes
+
+ p[0] = src_pic_block_hash[1][pos];  // same four positions, second hash plane
+ p[1] = src_pic_block_hash[1][pos + src_size];
+ p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)p, length);  // second hash: CRC of the four second hashes
+
+ dst_pic_block_same_info[0][pos] =  // flag [0]: set only if all six sampled source flags [0] are set
+ src_pic_block_same_info[0][pos] &&
+ src_pic_block_same_info[0][pos + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + src_size];
+
+ dst_pic_block_same_info[1][pos] =  // flag [1]: set only if all six sampled source flags [1] are set
+ src_pic_block_same_info[1][pos] &&
+ src_pic_block_same_info[1][pos + src_size] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width + src_size];
+ pos++;
+ }
+ pos += block_size - 1;  // skip the positions at the end of the row that have no full block
+ }
+
+ if (block_size >= 4) {
+ const int size_minus_1 = block_size - 1;  // used as an alignment mask; assumes block_size is a power of two
+ pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ dst_pic_block_same_info[2][pos] =  // flag [2]: neither flag [0] nor [1] is set, or the position is block-aligned
+ (!dst_pic_block_same_info[0][pos] &&
+ !dst_pic_block_same_info[1][pos]) ||
+ (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0));
+ pos++;
+ }
+ pos += block_size - 1;  // same row-end skip as in the first pass
+ }
+ }
+}
+
+// Inserts every flagged block position of one block size into the hash table.
+//
+// pic_hash[0] / pic_hash[1] hold the two precomputed hashes and pic_is_same the
+// per-position "add this entry" flag, all indexed by y * pic_width + x.
+// Positions are visited column by column. The table key is the low kSrcBits
+// bits of the first hash plus the block-size index shifted left by kSrcBits;
+// the second hash is stored in the entry itself.
+//
+// Returns false as soon as hash_table_add_to_table() fails, true otherwise.
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size) {
+  // Exclusive bounds for top-left corners of blocks that fit in the picture.
+  const int num_cols = pic_width - block_size + 1;
+  const int num_rows = pic_height - block_size + 1;
+
+  const uint32_t *const first_hash = pic_hash[0];
+  const uint32_t *const second_hash = pic_hash[1];
+
+  const int size_index = hash_block_size_to_index(block_size);
+  assert(size_index >= 0);
+  const int size_tag = size_index << kSrcBits;
+  const int low_bits_mask = (1 << kSrcBits) - 1;
+
+  for (int col = 0; col < num_cols; ++col) {
+    for (int row = 0; row < num_rows; ++row) {
+      const int idx = row * pic_width + col;
+      // Only flagged positions carry valid data to insert.
+      if (!pic_is_same[idx]) continue;
+
+      block_hash entry;
+      entry.x = col;
+      entry.y = row;
+      entry.hash_value2 = second_hash[idx];
+
+      const uint32_t key = (first_hash[idx] & low_bits_mask) + size_tag;
+      if (!hash_table_add_to_table(p_hash_table, key, &entry)) return false;
+    }
+  }
+  return true;
+}
+
+// Returns 1 when, in every row of the block_size x block_size luma block whose
+// top-left corner is (x_start, y_start), all samples equal the first sample of
+// that row; returns 0 at the first mismatch. Different rows may hold different
+// values. Handles both 8-bit and high-bit-depth buffers.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *row8 = picture->y_buffer + y_start * stride + x_start;
+
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bit depth: samples are 16 bits wide.
+    const uint16_t *row16 = CONVERT_TO_SHORTPTR(row8);
+    for (int row = 0; row < block_size; ++row, row16 += stride) {
+      for (int col = 1; col < block_size; ++col) {
+        if (row16[col] != row16[0]) return 0;
+      }
+    }
+    return 1;
+  }
+
+  for (int row = 0; row < block_size; ++row, row8 += stride) {
+    for (int col = 1; col < block_size; ++col) {
+      if (row8[col] != row8[0]) return 0;
+    }
+  }
+  return 1;
+}
+
+// Returns 1 when, in every column of the block_size x block_size luma block
+// whose top-left corner is (x_start, y_start), all samples equal the sample in
+// the block's first row; returns 0 at the first mismatch. Different columns
+// may hold different values. Handles both 8-bit and high-bit-depth buffers.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *top8 = picture->y_buffer + y_start * stride + x_start;
+
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bit depth: samples are 16 bits wide.
+    const uint16_t *top16 = CONVERT_TO_SHORTPTR(top8);
+    for (int col = 0; col < block_size; ++col) {
+      for (int row = 1; row < block_size; ++row) {
+        if (top16[row * stride + col] != top16[col]) return 0;
+      }
+    }
+    return 1;
+  }
+
+  for (int col = 0; col < block_size; ++col) {
+    for (int row = 1; row < block_size; ++row) {
+      if (top8[row * stride + col] != top8[col]) return 0;
+    }
+  }
+  return 1;
+}
+
+// Computes the two hash values of the block_size x block_size block at y_src.
+//
+// Step 1 hashes every 2x2 sub-block with both CRC calculators into buffer 0 of
+// intrabc_hash_info->hash_value_buffer. Step 2 repeatedly replaces each 2x2
+// group of hashes by the CRC of the group, alternating between buffer 0 and
+// buffer 1, until a single hash per calculator remains.
+//
+// Outputs:
+//  - *hash_value1: low kSrcBits bits of the first CRC plus the block-size index
+//    (hash_block_size_to_index) shifted left by kSrcBits.
+//  - *hash_value2: the full second CRC.
+//
+// use_highbitdepth selects 16-bit samples (y_src is then converted with
+// CONVERT_TO_SHORTPTR) instead of 8-bit samples.
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth) {
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= kSrcBits;
+ const int crc_mask = (1 << kSrcBits) - 1;
+
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+ // buf_1 / buf_2 each address the pair of ping-pong buffers for the first /
+ // second hash.
+ uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0];
+ uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1];
+
+ // Step 1: hash every 2x2 sub-block of the current block into buffer 0,
+ // stored row-major with sub_block_in_width entries per row.
+ int sub_block_in_width = (block_size >> 1);
+ if (use_highbitdepth) {
+ uint16_t pixel_to_hash[4];
+ uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_short_array_by_block_2x2(
+ y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ }
+ }
+ } else {
+ uint8_t pixel_to_hash[4];
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+ stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] =
+ av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash));
+ buf_2[0][pos] =
+ av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash));
+ }
+ }
+ }
+
+ // From here on, src_sub_block_in_width is the row length of the level being
+ // read and sub_block_in_width the row length of the level being written.
+ int src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+
+ // The indices are flipped at the top of each pass, so the first pass reads
+ // buffer 0 and writes buffer 1. If no pass runs (block_size < 4), dst_idx
+ // stays 0 and the final read below takes the 2x2 hash from buffer 0.
+ int src_idx = 1;
+ int dst_idx = 0;
+
+ // Step 2: merge 2x2 groups of hashes level by level (4x4, 8x8, ...) up to
+ // the full block size.
+ uint32_t to_hash[4];
+ for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+ src_idx = 1 - src_idx;
+ dst_idx = 1 - dst_idx;
+
+ int dst_pos = 0;
+ for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+ for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+ // Top-left entry of the 2x2 group in the source level.
+ int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+
+ assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(srcPos + src_sub_block_in_width + 1 <
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ to_hash[0] = buf_1[src_idx][srcPos];
+ to_hash[1] = buf_1[src_idx][srcPos + 1];
+ to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1];
+
+ buf_1[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash));
+
+ to_hash[0] = buf_2[src_idx][srcPos];
+ to_hash[1] = buf_2[src_idx][srcPos + 1];
+ to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1];
+ buf_2[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash));
+ dst_pos++;
+ }
+ }
+
+ src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+ }
+
+ // Entry 0 of the last written buffer holds the hash of the whole block.
+ *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = buf_2[dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 0000000000..8974ba27cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/hash.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Block size used for force_integer_mv decisions
+#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8
+
+// Stores one block's hash info.
+// x and y are the block's position measured from the top left of the picture;
+// they are stored as 16-bit signed values.
+// hash_value2 stores the second hash value of the block.
+typedef struct _block_hash {
+ int16_t x;
+ int16_t y;
+ uint32_t hash_value2;
+} block_hash;
+
+// Hash table handle: an array of pointers to Vector objects.
+// NOTE(review): presumably one Vector of block_hash entries per first-hash
+// key; the code that fills it is not part of this header, so confirm there.
+typedef struct _hash_table {
+ Vector **p_lookup_table;
+} hash_table;
+
+struct intrabc_hash_info;
+
+// State shared by the block-hash routines: scratch buffers, the hash table and
+// the two CRC calculators used to produce the first and second hash.
+typedef struct intrabc_hash_info {
+ // Scratch buffers for the hash value calculation of a single block;
+ // used only in av1_get_block_hash_value().
+ // First index: first hash / second hash.
+ // Second index: the two buffers that are used in ping-pong fashion.
+ uint32_t *hash_value_buffer[2][2];
+ hash_table intrabc_hash_table;
+
+ // Calculators for the first and the second hash value, respectively.
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ // NOTE(review): presumably non-zero once the calculators above have been
+ // initialized; the code that sets it is not in this header, so confirm.
+ int g_crc_initialized;
+} IntraBCHashInfo;
+
+void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+bool av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3]);
+// Builds the hash and flag tables for block_size x block_size blocks
+// (dst_*) from the tables of the half-size blocks (src_*). Tables are indexed
+// by y * y_crop_width + x of the block's top-left corner.
+void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3]);
+// Adds every position whose pic_is_same flag is non-zero to the hash table,
+// using the precomputed hashes in pic_hash. Returns false if an insertion
+// fails, true otherwise.
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size);
+
+// Check whether, in the block_size x block_size block whose top-left corner is
+// (x_start, y_start), every row consists of a single value (rows may differ
+// from one another). Returns 1 if so, 0 otherwise.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+// Check whether, in the block_size x block_size block whose top-left corner is
+// (x_start, y_start), every column consists of a single value (columns may
+// differ from one another). Returns 1 if so, 0 otherwise.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+
+// Computes both hash values of the block_size x block_size block that starts
+// at y_src (row pitch `stride`), using the scratch buffers and CRC calculators
+// in intrabc_hash_info. *hash_value1 combines the low bits of the first CRC
+// with the block-size index; *hash_value2 is the second CRC. A non-zero
+// use_highbitdepth selects 16-bit samples.
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..a108e8148c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel.
+ Shared for both high and low bit depth.
+
+ input:  4x4 block of residuals, `stride` elements between rows.
+ output: 16 coefficients. Pass 1 transforms each input column and stores it
+         as a row of output; pass 2 transforms output in place along the
+         other dimension and scales the results by UNIT_QUANT_FACTOR.
+ */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ // Pass 1: read column i of the input (four samples `stride` apart) and
+ // write the transformed values as four consecutive output entries.
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[1] = (tran_low_t)c1;
+ op[2] = (tran_low_t)d1;
+ op[3] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op += 4;
+ }
+ // Pass 2 reads and writes the same buffer: ip and op both walk `output`.
+ ip = output;
+ op = output;
+
+ // Pass 2: for each i, load the four entries spaced 4 apart, apply the same
+ // butterfly, and store the scaled results back to the same four positions.
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ b1 = ip[4 * 1];
+ c1 = ip[4 * 2];
+ d1 = ip[4 * 3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip++;
+ op++;
+ }
+}
+
+// 4x4 forward transform. Lossless blocks use the Walsh-Hadamard transform
+// (av1_fwht4x4, DCT_DCT only); all others go through av1_fwd_txfm2d_4x4().
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  if (!txfm_param->lossless) {
+    av1_fwd_txfm2d_4x4(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+    return;
+  }
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwht4x4(src_diff, coeff, diff_stride);
+}
+
+// 4x8 forward transform: forwards to av1_fwd_txfm2d_4x8() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_4x8(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 8x4 forward transform: forwards to av1_fwd_txfm2d_8x4() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_8x4(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 8x16 forward transform: forwards to av1_fwd_txfm2d_8x16() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_8x16(src_diff, (int32_t *)coeff, diff_stride,
+                      txfm_param->tx_type, txfm_param->bd);
+}
+
+// 16x8 forward transform: forwards to av1_fwd_txfm2d_16x8() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_16x8(src_diff, (int32_t *)coeff, diff_stride,
+                      txfm_param->tx_type, txfm_param->bd);
+}
+
+// 16x32 forward transform: forwards to av1_fwd_txfm2d_16x32() with the
+// transform type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_16x32(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 32x16 forward transform: forwards to av1_fwd_txfm2d_32x16() with the
+// transform type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_32x16(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// 16x4 forward transform: forwards to av1_fwd_txfm2d_16x4() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_16x4(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 4x16 forward transform: forwards to av1_fwd_txfm2d_4x16() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_4x16(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 32x8 forward transform: forwards to av1_fwd_txfm2d_32x8() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_32x8(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+
+// 8x32 forward transform: forwards to av1_fwd_txfm2d_8x32() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_fwd_txfm2d_8x32(src_diff, (int32_t *)coeff, diff_stride, tx_type, bd);
+}
+#endif
+
+// 8x8 forward transform: forwards to av1_fwd_txfm2d_8x8() with the transform
+// type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_8x8(src_diff, (int32_t *)coeff, diff_stride,
+                     txfm_param->tx_type, txfm_param->bd);
+}
+
+// 16x16 forward transform: forwards to av1_fwd_txfm2d_16x16() with the
+// transform type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_16x16(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+// 32x32 forward transform: forwards to av1_fwd_txfm2d_32x32() with the
+// transform type and bit depth taken from txfm_param.
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_32x32(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+// 32x64 forward transform. Only DCT_DCT is expected (asserted); the transform
+// type from txfm_param is still what gets passed to av1_fwd_txfm2d_32x64().
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_32x64(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+// 64x32 forward transform. Only DCT_DCT is expected (asserted); the transform
+// type from txfm_param is still what gets passed to av1_fwd_txfm2d_64x32().
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x32(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// 16x64 forward transform. Only DCT_DCT is expected (asserted), and DCT_DCT is
+// passed to av1_fwd_txfm2d_16x64() unconditionally.
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_16x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                       txfm_param->bd);
+}
+
+// 64x16 forward transform. Only DCT_DCT is expected (asserted), and DCT_DCT is
+// passed to av1_fwd_txfm2d_64x16() unconditionally.
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x16(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                       txfm_param->bd);
+}
+#endif
+
+// 64x64 forward transform. Only DCT_DCT is expected (asserted), and DCT_DCT is
+// passed to av1_fwd_txfm2d_64x64() unconditionally.
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                       txfm_param->bd);
+}
+
+// Forward-transform entry point: a bit depth of exactly 8 selects
+// av1_lowbd_fwd_txfm(); every other bit depth uses av1_highbd_fwd_txfm().
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+                  TxfmParam *txfm_param) {
+  if (txfm_param->bd != 8) {
+    av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+    return;
+  }
+  av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+// Low-bit-depth forward transform, plain C version: it has no dedicated code
+// path and forwards all arguments unchanged to av1_highbd_fwd_txfm().
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+// Dispatches the forward transform to the wrapper matching
+// txfm_param->tx_size. The (tx_set_type, tx_type) combination must be marked
+// as used in av1_ext_tx_used (asserted). Sizes handled only when
+// CONFIG_REALTIME_ONLY is off, and any unknown size, hit assert(0) otherwise.
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  switch (txfm_param->tx_size) {
+    // Square sizes.
+    case TX_4X4:
+      highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X8:
+      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X16:
+      highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X32:
+      highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X64:
+      highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+
+    // 2:1 and 1:2 rectangles.
+    case TX_4X8:
+      highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X4:
+      highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X16:
+      highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X8:
+      highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X32:
+      highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X16:
+      highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X64:
+      highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X32:
+      highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+
+#if !CONFIG_REALTIME_ONLY
+    // 4:1 and 1:4 rectangles.
+    case TX_4X16:
+      highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X4:
+      highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X32:
+      highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X8:
+      highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X64:
+      highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X16:
+      highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+#endif
+    default: assert(0); return;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Hadamard transform for high-bit-depth buffers, selected by tx_size. Only the
+// square sizes 4x4 through 32x32 are supported; anything else hits assert(0).
+static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+                                       ptrdiff_t src_stride,
+                                       tran_low_t *coeff) {
+  if (tx_size == TX_4X4) {
+    // As the output transform co-efficients of 4x4 Hadamard transform can be
+    // represented using 15 bits (for 12-bit clip) use lowbd variant of
+    // hadamard_4x4.
+    aom_hadamard_4x4(src_diff, src_stride, coeff);
+  } else if (tx_size == TX_8X8) {
+    aom_highbd_hadamard_8x8(src_diff, src_stride, coeff);
+  } else if (tx_size == TX_16X16) {
+    aom_highbd_hadamard_16x16(src_diff, src_stride, coeff);
+  } else if (tx_size == TX_32X32) {
+    aom_highbd_hadamard_32x32(src_diff, src_stride, coeff);
+  } else {
+    assert(0);
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Hadamard transform for low-bit-depth buffers, selected by tx_size. Only the
+// square sizes 4x4 through 32x32 are supported; anything else hits assert(0).
+static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+                                ptrdiff_t src_stride, tran_low_t *coeff) {
+  switch (tx_size) {
+    case TX_4X4:
+      aom_hadamard_4x4(src_diff, src_stride, coeff);
+      return;
+    case TX_8X8:
+      aom_hadamard_8x8(src_diff, src_stride, coeff);
+      return;
+    case TX_16X16:
+      aom_hadamard_16x16(src_diff, src_stride, coeff);
+      return;
+    case TX_32X32:
+      aom_hadamard_32x32(src_diff, src_stride, coeff);
+      return;
+    default:
+      assert(0);
+      return;
+  }
+}
+
+// Applies either a Hadamard transform (use_hadamard != 0) or a DCT_DCT forward
+// transform to src_diff and writes the coefficients to coeff.
+//
+// The DCT path fills only the TxfmParam fields set below (tx_type, tx_size,
+// lossless, bd, is_hbd, tx_set_type) before calling av1_fwd_txfm(). The
+// Hadamard path uses the high-bit-depth variant only when high bit depth is
+// compiled in and bd_info.use_highbitdepth_buf is set.
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+                    const int16_t *src_diff, int src_stride,
+                    tran_low_t *coeff) {
+  if (!use_hadamard) {
+    TxfmParam dct_param;
+    dct_param.tx_type = DCT_DCT;
+    dct_param.tx_size = tx_size;
+    dct_param.lossless = 0;
+    dct_param.bd = bd_info.bit_depth;
+    dct_param.is_hbd = bd_info.use_highbitdepth_buf;
+    dct_param.tx_set_type = EXT_TX_SET_ALL16;
+    av1_fwd_txfm(src_diff, coeff, src_stride, &dct_param);
+    return;
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (bd_info.use_highbitdepth_buf) {
+    highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+}
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000000..30f8a2258b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward transform of the residual block src_diff (row pitch diff_stride)
+// into coeff. Uses the low-bit-depth path when txfm_param->bd is 8 and the
+// high-bit-depth path otherwise.
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param);
+
+// Forward transform that selects the per-size implementation from
+// txfm_param->tx_size.
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+/*!\brief Apply Hadamard or DCT transform
+ *
+ * \callergraph
+ * DCT and Hadamard transforms are commonly used for quick RD score estimation.
+ * The coeff buffer's size should be equal to the number of pixels
+ * corresponding to tx_size.
+ */
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+ const int16_t *src_diff, int src_stride, tran_low_t *coeff);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/interp_search.c b/third_party/aom/av1/encoder/interp_search.c
new file mode 100644
index 0000000000..27235303c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.c
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+
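+ // Returns the summed absolute MV difference, or INT_MAX when the reference frames (or, at skip_level 1, the compound settings) differ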
+static INLINE int is_interp_filter_good_match(
+ const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi,
+ int skip_level) {
+ const int is_comp = has_second_ref(mi);
+ int i;
+
+ for (i = 0; i < 1 + is_comp; ++i) {
+ if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX;
+ }
+
+ if (skip_level == 1 && is_comp) {
+ if (st->comp_type != mi->interinter_comp.type) return INT_MAX;
+ if (st->compound_idx != mi->compound_idx) return INT_MAX;
+ }
+
+ int mv_diff = 0;
+ for (i = 0; i < 1 + is_comp; ++i) {
+ mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) +
+ abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col);
+ }
+ return mv_diff;
+}
+
+static INLINE int save_interp_filter_search_stat(
+ MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) {
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type,
+ mbmi->compound_idx,
+ rd,
+ pred_sse };
+ interp_filter_stats[interp_filter_stats_idx] = stat;
+ interp_filter_stats_idx++;
+ }
+ return interp_filter_stats_idx;
+}
+
+static INLINE int find_interp_filter_in_stats(
+ MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx, int skip_level) {
+ // [skip_levels][single or comp]
+ const int thr[2][2] = { { 0, 0 }, { 3, 7 } };
+ const int is_comp = has_second_ref(mbmi);
+
+ // Find good enough match.
+ // TODO(yunqing): Separate single-ref mode and comp mode stats for fast
+ // search.
+ int best = INT_MAX;
+ int match = -1;
+ for (int j = 0; j < interp_filter_stats_idx; ++j) {
+ const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j];
+ const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level);
+ // Exact match is found.
+ if (mv_diff == 0) {
+ match = j;
+ break;
+ } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) {
+ best = mv_diff;
+ match = j;
+ }
+ }
+
+ if (match != -1) {
+ mbmi->interp_filters = interp_filter_stats[match].filters;
+ return match;
+ }
+ return -1; // no match result found
+}
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ int match_found_idx = -1;
+ if (cpi->sf.interp_sf.use_interp_filter && need_search)
+ match_found_idx = find_interp_filter_in_stats(
+ mbmi, interp_filter_stats, interp_filter_stats_idx,
+ cpi->sf.interp_sf.use_interp_filter);
+
+ if (!need_search || match_found_idx == -1)
+ set_default_interp_filters(mbmi, assign_filter);
+ return match_found_idx;
+}
+
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+ const int_interpfilters filters,
+ const int ctx[2], int dual_filter) {
+ const InterpFilter filter0 = filters.as_filters.y_filter;
+ int inter_filter_cost =
+ x->mode_costs.switchable_interp_costs[ctx[0]][filter0];
+ if (dual_filter) {
+ const InterpFilter filter1 = filters.as_filters.x_filter;
+ inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
+// Build inter predictor and calculate model rd
+// for a given plane.
+static INLINE void interp_model_rd_eval(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int plane_from, int plane_to,
+ RD_STATS *rd_stats, int is_skip_build_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+
+ // Skip inter predictor if the predictor is already available.
+ if (!is_skip_build_pred) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane_from, plane_to);
+ }
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
+ &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL,
+ NULL, NULL);
+
+ av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
+}
+
+// calculate the rdcost of given interpolation_filter
+static INLINE int64_t interpolation_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
+ RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2],
+ const int skip_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS this_rd_stats_luma, this_rd_stats;
+
+ // Initialize rd_stats structures to default values.
+ av1_init_rd_stats(&this_rd_stats_luma);
+ this_rd_stats = *rd_stats_luma;
+ const int_interpfilters last_best = mbmi->interp_filters;
+ mbmi->interp_filters = filter_sets[filter_idx];
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
+ if (min_rd > *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+
+ (void)tile_data;
+
+ assert(skip_pred != 2);
+ assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
+ assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
+ assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
+ assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1));
+ assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1));
+ assert((skip_pred >= 0) &&
+ (skip_pred <= interp_search_flags->default_interp_skip_flags));
+
+ // When skip_pred is equal to default_interp_skip_flags,
+ // skip both luma and chroma MC.
+ // For mono-chrome images:
+ // num_planes = 1 and interp_search_flags->default_interp_skip_flags = 1,
+ // skip_pred = 1: skip both luma and chroma
+ // skip_pred = 0: Evaluate luma and, as num_planes = 1,
+ // skip chroma evaluation
+ int tmp_skip_pred =
+ (skip_pred == interp_search_flags->default_interp_skip_flags)
+ ? INTERP_SKIP_LUMA_SKIP_CHROMA
+ : skip_pred;
+
+ switch (tmp_skip_pred) {
+ case INTERP_EVAL_LUMA_EVAL_CHROMA:
+ // skip_pred = 0: Evaluate both luma and chroma.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &this_rd_stats_luma, 0);
+ this_rd_stats = this_rd_stats_luma;
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ AOM_FALLTHROUGH_INTENDED;
+ case INTERP_SKIP_LUMA_EVAL_CHROMA:
+ // skip_pred = 1: skip luma evaluation (retain previous best luma stats)
+ // and do chroma evaluation.
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane,
+ &this_rd_stats, 0);
+ }
+ break;
+ case INTERP_SKIP_LUMA_SKIP_CHROMA:
+ // both luma and chroma evaluation is skipped
+ this_rd_stats = *rd_stats;
+ break;
+ case INTERP_EVAL_INVALID:
+ default: assert(0); return 0;
+ }
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = tmp_rs;
+ if (skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) {
+ // Overwrite the data as current filter is the best one
+ *rd_stats_luma = this_rd_stats_luma;
+ *rd_stats = this_rd_stats;
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ *rd_stats = this_rd_stats;
+ // As luma MC data is not recomputed and current filter is the best,
+ // indicate the possibility of recomputing MC data
+ // If current buffer contains valid MC data, toggle to indicate that
+ // luma MC data needs to be recomputed
+ x->recalc_luma_mc_data ^= 1;
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ return 1;
+ }
+ mbmi->interp_filters = last_best;
+ return 0;
+}
+
+static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed(
+ const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int_interpfilters *af, int_interpfilters *lf) {
+ const AV1_COMMON *cm = &cpi->common;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int bsl = mi_size_wide_log2[bsize];
+ int is_horiz_eq = 0, is_vert_eq = 0;
+
+ if (above_mbmi && is_inter_block(above_mbmi))
+ *af = above_mbmi->interp_filters;
+
+ if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters;
+
+ if (af->as_filters.x_filter != INTERP_INVALID)
+ is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter;
+ if (af->as_filters.y_filter != INTERP_INVALID)
+ is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter;
+
+ INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int pred_filter_enable =
+ cpi->sf.interp_sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1
+ : 0;
+ pred_filter_enable &= is_horiz_eq || is_vert_eq;
+ // pred_filter_search = 0: pred_filter is disabled
+ // pred_filter_search = 1: pred_filter is enabled and only horz pred matching
+ // pred_filter_search = 2: pred_filter is enabled and only vert pred matching
+ // pred_filter_search = 3: pred_filter is enabled and
+ // both vert, horz pred matching
+ return pred_filter_enable * pred_filter_type;
+}
+
+static DUAL_FILTER_TYPE find_best_interp_rd_facade(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) {
+ int tmp_skip_pred = skip_pred;
+ DUAL_FILTER_TYPE best_filt_type = REG_REG;
+
+ // If no filters are set to be evaluated, return from the function
+ if (allow_interp_mask == 0x0) return best_filt_type;
+ // When block width or height is 4, skip the pred evaluation of SHARP_SHARP
+ tmp_skip_pred = is_w4_or_h4
+ ? cpi->interp_search_flags.default_interp_skip_flags
+ : skip_pred;
+
+ // Loop over all the filter types and evaluate only the allowed filter types
+ for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) {
+ const int is_filter_allowed =
+ get_interp_filter_allowed_mask(allow_interp_mask, filt_type);
+ if (is_filter_allowed)
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, filt_type, switchable_ctx,
+ tmp_skip_pred))
+ best_filt_type = filt_type;
+ tmp_skip_pred = skip_pred;
+ }
+ return best_filt_type;
+}
+
+static INLINE void pred_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af,
+ int_interpfilters *lf) {
+ (void)lf;
+ assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ);
+ assert(pred_filt_type < INTERP_PRED_TYPE_ALL);
+ uint16_t allowed_interp_mask = 0;
+
+ if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) {
+ // pred_filter_search = 1: Only horizontal filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter];
+ } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) {
+ // pred_filter_search = 2: Only vertical filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter];
+ } else {
+ // pred_filter_search = 3: Both horizontal and vertical filter are matching
+ int filt_type =
+ af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS;
+ set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type);
+ }
+ // REG_REG has already been evaluated in the beginning
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y,
+ rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask, 0);
+}
+// Evaluate dual filter type
+// a) Using above, left block interp filter
+// b) Find the best horizontal filter and
+// then evaluate corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_hor, const int skip_ver) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ if (!have_newmv_in_inter_mode(mbmi->mode)) {
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ }
+
+ if (pred_filter_type) {
+ pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, (skip_hor & skip_ver),
+ pred_filter_type, &af, &lf);
+ } else {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int best_dual_mode = 0;
+ int skip_pred =
+ bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+ // TODO(any): Make use of find_best_interp_rd_facade()
+ // if speed impact is negligible
+ for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, i, switchable_ctx, skip_pred)) {
+ best_dual_mode = i;
+ }
+ skip_pred = skip_hor;
+ }
+ // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+ skip_pred =
+ bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+ for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+ i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+ skip_pred = skip_ver;
+ }
+ }
+}
+
+// Find the best interp filter if dual_interp_filter = 0
+static INLINE void find_best_non_dual_interp_filter(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_ver, const int skip_hor) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ int8_t i;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ uint16_t interp_filter_search_mask =
+ interp_search_flags->interp_filter_search_mask;
+
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+ const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+ int use_actual_frame_probs = 1;
+ const int *switchable_interp_p0;
+ const int *switchable_interp_p1;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx1];
+ }
+#endif
+ if (use_actual_frame_probs) {
+ switchable_interp_p0 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1];
+ }
+ static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+ const int thresh = thr[update_type];
+ for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+ // For non-dual case, the 2 dir's prob should be identical.
+ assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+ if (switchable_interp_p0[i] < thresh &&
+ switchable_interp_p1[i] < thresh) {
+ DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+ reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+ }
+ }
+ }
+
+ // Regular filter evaluation should have been done and hence the same should
+ // be the winner
+ assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+ if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) {
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ if (pred_filter_type) {
+ assert(af.as_filters.x_filter != INTERP_INVALID);
+ int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+ // This assert tells that (filter_x == filter_y) for non-dual filter case
+ assert(filter_sets[filter_idx].as_filters.x_filter ==
+ filter_sets[filter_idx].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+ filter_idx))) {
+ return;
+ }
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ filter_idx, switchable_ctx,
+ (skip_hor & skip_ver));
+ }
+ return;
+ }
+ }
+ // Reuse regular filter's modeled rd data for sharp filter for following
+ // cases
+ // 1) When bsize is 4x4
+ // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical
+ // direction is full-pel
+ // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and MV in horizontal
+ // direction is full-pel
+ // TODO(any): Optimize cases 2 and 3 further if luma MV in relevant direction
+ // alone is full-pel
+
+ if ((bsize == BLOCK_4X4) ||
+ (block_size_wide[bsize] == 4 &&
+ skip_ver == interp_search_flags->default_interp_skip_flags) ||
+ (block_size_high[bsize] == 4 &&
+ skip_hor == interp_search_flags->default_interp_skip_flags)) {
+ int skip_pred = skip_hor & skip_ver;
+ uint16_t allowed_interp_mask = 0;
+
+ // REG_REG filter type is evaluated beforehand, hence skip it
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+ allowed_interp_mask &= interp_filter_search_mask;
+
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask,
+ 1);
+ } else {
+ int skip_pred = (skip_hor & skip_ver);
+ for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+ i += (SWITCHABLE_FILTERS + 1)) {
+ // This assert tells that (filter_x == filter_y) for non-dual filter case
+ assert(filter_sets[i].as_filters.x_filter ==
+ filter_sets[i].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+ continue;
+ }
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+ // In the first iteration, the smooth filter is evaluated. If the smooth
+ // filter (which is less sharp) is the winner among regular and smooth filters,
+ // sharp filter evaluation is skipped
+ // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+ // accounting switchable filter rate)
+ if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+ skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+ break;
+ }
+ }
+ }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+ const AV1_COMP *const cpi,
+ int *skip_hor, int *skip_ver) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ const int is_compound = has_second_ref(mbmi);
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ *skip_hor = 0;
+ *skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[ref].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+ ++plane_idx) {
+ struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ skip_hor_plane |= ((sub_x == 0) << plane_idx);
+ skip_ver_plane |= ((sub_y == 0) << plane_idx);
+ }
+ *skip_hor &= skip_hor_plane;
+ *skip_ver &= skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(*skip_hor != 2);
+ assert(*skip_ver != 2);
+ }
+ // When compound prediction type is compound segment wedge, luma MC and chroma
+ // MC need to go hand in hand as mask generated during luma MC is required for
+ // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during
+ // vertical filter decision may be incorrect as temporary MC evaluation
+ // overwrites the mask. Make skip_ver as 0 for this case so that mask is
+ // populated during luma MC
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+ }
+}
+
+/*!\brief AV1 interpolation filter search
+ *
+ * \ingroup inter_mode_search
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] rd The RD cost associated with the selected
+ * interpolation filter parameters.
+ * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE
+ * filter mode.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is nonzero, the inter
+ * predictor has already been built and thus we
+ * can avoid repeating computation.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ *
+ * \return Returns INT64_MAX if the filter parameters are invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * parameter search is a success.
+ */
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int need_search = av1_is_interp_needed(xd);
+ const int ref_frame = xd->mi[0]->ref_frame[0];
+ RD_STATS rd_stats_luma, rd_stats;
+
+ // Initialization of rd_stats structures with default values
+ av1_init_rd_stats(&rd_stats_luma);
+ av1_init_rd_stats(&rd_stats);
+
+ int match_found_idx = -1;
+ const InterpFilter assign_filter = cm->features.interp_filter;
+
+ match_found_idx = av1_find_interp_filter_match(
+ mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+
+ if (match_found_idx != -1) {
+ *rd = args->interp_filter_stats[match_found_idx].rd;
+ x->pred_sse[ref_frame] =
+ args->interp_filter_stats[match_found_idx].pred_sse;
+ *skip_build_pred = 0;
+ return 0;
+ }
+
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ // Do MC evaluation for default filter_type.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ // Chroma MC
+ if (num_planes > 1) {
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+ &rd_stats, *skip_build_pred);
+ }
+ *skip_build_pred = 1;
+
+ av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+ assert(rd_stats.rate >= 0);
+
+ *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+ return 0;
+ }
+ if (!need_search) {
+ int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ return 0;
+ }
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+ // skip_flag=xx (in binary form)
+ // Setting the 0th bit corresponds to skipping luma MC and setting the 1st
+ // bit corresponds to skipping chroma MC. skip_flag=0 corresponds to "Don't
+ // skip luma and chroma MC". skip_flag=1 corresponds to "Skip luma MC only".
+ // skip_flag=2 is not a valid case.
+ // skip_flag=3 corresponds to "Skip both luma and chroma MC".
+ int skip_hor = interp_search_flags->default_interp_skip_flags;
+ int skip_ver = interp_search_flags->default_interp_skip_flags;
+ calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+ // do interp_filter search
+ restore_dst_buf(xd, *tmp_dst, num_planes);
+ const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+ // Evaluate dual interp filters
+ if (cm->seq_params->enable_dual_filter) {
+ if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+ fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx, skip_hor, skip_ver);
+ } else {
+ // Use full interpolation filter search
+ uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+ // REG_REG filter type is evaluated beforehand, so loop is repeated over
+ // REG_SMOOTH to SHARP_SHARP for full interpolation filter search
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx,
+ (skip_hor & skip_ver), allowed_interp_mask, 0);
+ }
+ } else {
+ // Evaluate non-dual interp filters
+ find_best_non_dual_interp_filter(
+ x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+ switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+ // Recomputing final luma MC data is required only if the same was skipped
+ // in either of the directions. The condition below is necessary, but not
+ // sufficient
+ assert((skip_hor == 1) || (skip_ver == 1));
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ // save search results
+ if (cpi->sf.interp_sf.use_interp_filter) {
+ assert(match_found_idx == -1);
+ args->interp_filter_stats_idx = save_interp_filter_search_stat(
+ mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/interp_search.h b/third_party/aom/av1/encoder/interp_search.h
new file mode 100644
index 0000000000..9815e0bcfb
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_INTERP_FILTER_STATS 128
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+
+typedef struct {
+ int_interpfilters filters;
+ int_mv mv[2];
+ int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
+ int compound_idx;
+ int64_t rd;
+ unsigned int pred_sse;
+} INTERPOLATION_FILTER_STATS;
+/*!\endcond */
+
+/*!\brief Miscellaneous arguments for inter mode search.
+ */
+typedef struct HandleInterModeArgs {
+ /*!
+ * Buffer for the above predictor in OBMC
+ */
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the above predictor in OBMC
+ */
+ int above_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Buffer for the left predictor in OBMC
+ */
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the left predictor in OBMC
+ */
+ int left_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Pointer to the first member in a 2D array which holds
+ * single reference mode motion vectors to be used as a starting
+ * point in the mv search for compound modes. Each array is length REF_FRAMES,
+ * meaning there is a slot for a single reference motion vector for
+ * each possible reference frame. The 2D array consists of N of these arrays,
+ * where N is the length of the reference mv stack computed for the single
+ * reference case for that particular reference frame.
+ */
+ int_mv (*single_newmv)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold the rate
+ * corresponding to each of the single reference mode motion vectors
+ * held in single_newmv.
+ */
+ int (*single_newmv_rate)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold a 0 or 1
+ * validity value corresponding to each of the single reference mode motion
+ * vectors held in single_newmv.
+ */
+ int (*single_newmv_valid)[REF_FRAMES];
+ /*!
+ * Pointer to the first array in a 3D array of predicted rate-distortion.
+ * The dimensions of this structure are:
+ * (number of possible inter modes) X
+ * (number of reference MVs) X
+ * (number of reference frames).
+ */
+ int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * Holds an estimated entropy cost for picking the current reference frame.
+ * This is used to compute an rd estimate.
+ */
+ int ref_frame_cost;
+ /*!
+ * Holds an estimated entropy cost for picking single or compound
+ * reference. This is used to compute an rd estimate.
+ */
+ int single_comp_cost;
+ /*!
+ * Pointer to the first element in a 3D array holding rd's of
+ * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref
+ * modes used to determine compound ref modes. The full structure is:
+ * (number of inter modes) X (length of refmv list) X (number of ref frames)
+ */
+ int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature.
+ */
+ int skip_motion_mode;
+ /*!
+ * Initialized to false. If true, skips interpolation filter search and uses
+ * the default EIGHTTAP_REGULAR.
+ */
+ bool skip_ifs;
+ /*!
+ * A pointer to the first element in an array of INTERINTRA_MODE types. This
+ * contains the best inter_intra mode for each reference frame.
+ */
+ INTERINTRA_MODE *inter_intra_mode;
+ /*!
+ * Array of saved interpolation filter stats collected to avoid repeating
+ * an interpolation filter search when the mv and ref_frame are the same
+ * as a previous search.
+ */
+ INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+
+ /*!
+ * Stack to store full pixel search start mv of NEWMV mode.
+ */
+ FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Stack to store ref_mv_idx of NEWMV mode.
+ */
+ uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Count of mvs in start mv stack.
+ */
+ int start_mv_cnt;
+
+ /*!
+ * Index of the last set of saved stats in the interp_filter_stats array.
+ */
+ int interp_filter_stats_idx;
+ /*!
+ * Estimated wedge index.
+ */
+ int wedge_index;
+ /*!
+ * Estimated wedge sign.
+ */
+ int wedge_sign;
+ /*!
+ * Estimated diff wtd index.
+ */
+ int diffwtd_index;
+ /*!
+ * Estimated cmp mode.
+ */
+ int cmp_mode[MODE_CTX_REF_FRAMES];
+ /*!
+ * The best sse during single new_mv search. Note that the sse here comes from
+ * single_motion_search, and not from interpolation_filter_search. This has
+ * two implications:
+ * 1. The mv used to calculate the sse here does not have to be the best sse
+ * found in handle_inter_mode.
+ * 2. Even if the mvs agree, the sse here can differ from the sse in \ref
+ * MACROBLOCK::pred_sse due to different interpolation filter used.
+ */
+ unsigned int best_single_sse_in_refs[REF_FRAMES];
+ /*!
+ * Holds the sse of best mode so far in the mode evaluation process. This is
+ * used in intermediate termination of NEWMV mode evaluation.
+ */
+ unsigned int best_pred_sse;
+} HandleInterModeArgs;
+
+/*!\cond */
+static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+ { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0
+ { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1
+ { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2
+};
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx);
+
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd);
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search.c b/third_party/aom/av1/encoder/intra_mode_search.c
new file mode 100644
index 0000000000..99b0af2f8e
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.c
@@ -0,0 +1,1739 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tx_search.h"
+
+// Even though there are 7 delta angles, this macro is set to 9 to facilitate
+// the rd threshold check to prune -3 and 3 delta angles.
+#define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3)
+
+// The order for evaluating delta angles while processing the luma directional
+// intra modes. Currently, this order of evaluation is applicable only when
+// speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case,
+// even angles are evaluated first in order to facilitate the pruning of odd
+// delta angles based on the rd costs of the neighboring delta angles.
+static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = {
+ -2, 2, -3, -1, 1, 3,
+};
+
+/*!\cond */
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+ DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED,
+ SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED,
+ D67_PRED, D113_PRED, D45_PRED,
+};
+
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+ UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED,
+ UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+ UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED,
+ UV_D113_PRED, UV_D45_PRED,
+};
+
+// The bitmask corresponds to the filter intra modes as defined in enums.h
+// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding filter intra mode. The table
+// av1_derived_filter_intra_mode_used_flag is used when speed feature
+// prune_filter_intra_level is 1. The evaluated filter intra modes are union
+// of the following:
+// 1) FILTER_DC_PRED
+// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED,
+// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED).
+static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = {
+ 0x01, // DC_PRED: 0000 0001
+ 0x03, // V_PRED: 0000 0011
+ 0x05, // H_PRED: 0000 0101
+ 0x01, // D45_PRED: 0000 0001
+ 0x01, // D135_PRED: 0000 0001
+ 0x01, // D113_PRED: 0000 0001
+ 0x09, // D157_PRED: 0000 1001
+ 0x01, // D203_PRED: 0000 0001
+ 0x01, // D67_PRED: 0000 0001
+ 0x01, // SMOOTH_PRED: 0000 0001
+ 0x01, // SMOOTH_V_PRED: 0000 0001
+ 0x01, // SMOOTH_H_PRED: 0000 0001
+ 0x11 // PAETH_PRED: 0001 0001
+};
+
+// The bitmask corresponds to the chroma intra modes as defined in enums.h
+// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding chroma intra mode. The table
+// av1_derived_chroma_intra_mode_used_flag is used when speed feature
+// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma
+// intra modes are union of the following:
+// 1) UV_DC_PRED
+// 2) UV_SMOOTH_PRED
+// 3) UV_CFL_PRED
+// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma
+// intra mode winner is V_PRED).
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = {
+ 0x2201, // DC_PRED: 0010 0010 0000 0001
+ 0x2203, // V_PRED: 0010 0010 0000 0011
+ 0x2205, // H_PRED: 0010 0010 0000 0101
+ 0x2209, // D45_PRED: 0010 0010 0000 1001
+ 0x2211, // D135_PRED: 0010 0010 0001 0001
+ 0x2221, // D113_PRED: 0010 0010 0010 0001
+ 0x2241, // D157_PRED: 0010 0010 0100 0001
+ 0x2281, // D203_PRED: 0010 0010 1000 0001
+ 0x2301, // D67_PRED: 0010 0011 0000 0001
+ 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001
+ 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001
+ 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001
+ 0x3201 // PAETH_PRED: 0011 0010 0000 0001
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+ highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd) {
+ unsigned int sse;
+
+ if (is_hbd)
+ return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse);
+ else
+ return vf(buf, stride, all_zeros, 0, &sse);
+}
+
+// Computes average of log(1 + variance) across 4x4 sub-blocks for source and
+// reconstructed blocks. Accumulates into both outputs: pass them in as 0.0.
+static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const BLOCK_SIZE bs,
+ double *avg_log_src_variance,
+ double *avg_log_recon_variance) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+ const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+ const int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ const int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+ const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow);
+ const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow);
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ for (int i = 0; i < bh; i += MI_SIZE) {
+ const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2);
+ for (int j = 0; j < bw; j += MI_SIZE) {
+ const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2);
+ const int mi_offset = r * mi_size_wide[sb_size] + c;
+ Block4x4VarInfo *block_4x4_var_info =
+ &x->src_var_info_of_4x4_sub_blocks[mi_offset];
+ int src_var = block_4x4_var_info->var;
+ double log_src_var = block_4x4_var_info->log_var;
+ // Compute average of log(1 + variance) for the source block from 4x4
+ // sub-block variance values. Calculate and store 4x4 sub-block variance
+ // and log(1 + variance), if the values present in
+ // src_var_info_of_4x4_sub_blocks are invalid (negative). Reuse the same
+ // if it is readily available with valid values.
+ if (src_var < 0) {
+ src_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, is_hbd);
+ block_4x4_var_info->var = src_var;
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ } else {
+ // When source variance is already calculated and available for
+ // retrieval, check if log(1 + variance) is also available. If it is
+ // available, then retrieve from buffer. Else, calculate the same and
+ // store to the buffer.
+ if (log_src_var < 0) {
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ }
+ }
+ *avg_log_src_variance += log_src_var;
+
+ const int recon_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+ xd->plane[0].dst.stride, is_hbd);
+ *avg_log_recon_variance += log1p(recon_var / 16.0);
+ }
+ }
+
+ const int blocks = (bw * bh) / 16;
+ *avg_log_src_variance /= (double)blocks;
+ *avg_log_recon_variance /= (double)blocks;
+}
+
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed);
+ // For non-positive threshold values, the comparison of source and
+ // reconstructed variances with threshold evaluates to false
+ // (src_var < threshold / rec_var < threshold) as these metrics are greater
+ // than 0. Hence further calculations are skipped.
+ if (threshold <= 0) return 1.0;
+
+ double variance_rd_factor = 1.0;
+ double avg_log_src_variance = 0.0;
+ double avg_log_recon_variance = 0.0;
+ double var_diff = 0.0;
+
+ compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance,
+ &avg_log_recon_variance);
+
+ // Don't allow 0, to prevent division by zero below.
+ avg_log_src_variance += 0.000001;
+ avg_log_recon_variance += 0.000001;
+
+ if (avg_log_src_variance >= avg_log_recon_variance) {
+ var_diff = (avg_log_src_variance - avg_log_recon_variance);
+ if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) {
+ variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance);
+ }
+ } else {
+ var_diff = (avg_log_recon_variance - avg_log_src_variance);
+ if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) {
+ variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance));
+ }
+ }
+
+ // Cap the adjustment factor at 3.0.
+ variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+
+ return variance_rd_factor;
+}
+/*!\endcond */
+
+/*!\brief Search for the best filter_intra mode when coding intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int mode_cost,
+ PREDICTION_MODE best_mode_so_far,
+ int64_t *best_rd, int64_t *best_model_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ // Skip the evaluation of filter intra modes.
+ if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int filter_intra_selected_flag = 0;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_8X8;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+
+ // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have
+ // filter-intra as winner.
+ if (x->use_mb_mode_cache &&
+ !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra)
+ return 0;
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+ mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+ if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) &&
+ !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] &
+ (1 << mode)))
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache &&
+ mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode)
+ continue;
+
+ if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
+ continue;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ const int this_rate =
+ tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mbmi->tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info = filter_intra_mode_info;
+ av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors) {
+ const int max_pix_val = 1 << 8;
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int this_val = src[r * stride + c];
+ assert(this_val < max_pix_val);
+ ++val_count[this_val];
+ }
+ }
+ int n = 0;
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+}
+
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *bin_val_count, int *num_color_bins,
+ int *num_colors) {
+ assert(bit_depth >= 8 && bit_depth <= 12);  // shift below needs >= 8
+ const int max_bin_val = 1 << 8;
+ const int max_pix_val = 1 << bit_depth;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ memset(bin_val_count, 0, max_bin_val * sizeof(bin_val_count[0]));
+ if (val_count != NULL)
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ /*
+ * Down-convert the pixels to 8-bit domain before counting.
+ * This provides consistency of behavior for palette search
+ * between lbd and hbd encodes. These down-converted pixels
+ * are only used for calculating the threshold (n).
+ */
+ const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+ assert(this_val < max_bin_val);
+ if (this_val >= max_bin_val) continue;
+ ++bin_val_count[this_val];
+ if (val_count != NULL) ++val_count[(src[r * stride + c])];
+ }
+ }
+ int n = 0;
+ // Count the colors based on 8-bit domain used to gate the palette path
+ for (int i = 0; i < max_bin_val; ++i) {
+ if (bin_val_count[i]) ++n;
+ }
+ *num_color_bins = n;
+
+ // Count the actual hbd colors used to create top_colors
+ n = 0;
+ if (val_count != NULL) {  // otherwise *num_colors is left unwritten
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+ }
+}
+
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval) {
+ if (mode_idx < INTRA_MODE_END) {
+ mbmi->mode = intra_rd_search_mode_order[mode_idx];
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ } else {
+ mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+ int delta_angle_eval_idx =
+ (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
+ if (reorder_delta_angle_eval) {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ luma_delta_angles_order[delta_angle_eval_idx];
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3)
+ : (delta_angle_eval_idx - 2));
+ }
+ }
+}
+
+static AOM_INLINE int get_model_rd_index_for_pruning(
+ const MACROBLOCK *const x,
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf) {
+ const int top_intra_model_count_allowed =
+ intra_sf->top_intra_model_count_allowed;
+ if (!intra_sf->adapt_top_model_rd_count_using_neighbors)
+ return top_intra_model_count_allowed - 1;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const PREDICTION_MODE mode = xd->mi[0]->mode;
+ int model_rd_index_for_pruning = top_intra_model_count_allowed - 1;
+ int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0;
+ if (xd->left_available)
+ is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode;
+ if (xd->up_available)
+ is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode;
+ // The pruning of luma intra modes is made more aggressive at lower quantizers
+ // and vice versa. The value for model_rd_index_for_pruning is derived as
+ // follows.
+ // qidx 0 to 127: Reduce the index of a candidate used for comparison only if
+ // the current mode does not match either of the available neighboring modes.
+ // qidx 128 to 255: Reduce the index of a candidate used for comparison only
+ // if the current mode does not match both the available neighboring modes.
+ if (x->qindex <= 127) {
+ if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ } else {
+ if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ }
+ return model_rd_index_for_pruning;
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning) {
+ const double thresh_best = 1.50;
+ const double thresh_top = 1.00;
+ for (int i = 0; i < max_model_cnt_allowed; i++) {
+ if (this_model_rd < top_intra_model_rd[i]) {
+ for (int j = max_model_cnt_allowed - 1; j > i; j--) {
+ top_intra_model_rd[j] = top_intra_model_rd[j - 1];
+ }
+ top_intra_model_rd[i] = this_model_rd;
+ break;
+ }
+ }
+ if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX &&
+ this_model_rd >
+ thresh_top * top_intra_model_rd[model_rd_index_for_pruning])
+ return 1;
+
+ if (this_model_rd != INT64_MAX &&
+ this_model_rd > thresh_best * (*best_model_rd))
+ return 1;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ return 0;
+}
+
+// Run RD calculation with given chroma intra prediction angle, and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t pick_intra_angle_routine_sbuv(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+ int *best_angle_delta, int64_t *best_rd) {
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ assert(!is_inter_block(mbmi));
+ int this_rate;
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+ return INT64_MAX;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
+ }
+ return this_rd;
+}
+
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns 1 if an angle delta with rdcost below best_rd was found, else 0.
+ */
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip_txfm = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+
+static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign,
+ int *cfl_alpha) {
+ int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO;
+ if (cfl_linear_idx == 0) {
+ *cfl_sign = CFL_SIGN_ZERO;
+ *cfl_alpha = 0;
+ } else {
+ *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG;
+ *cfl_alpha = abs(cfl_linear_idx) - 1;
+ }
+}
+
+static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int cfl_idx,
+ int fast_mode, RD_STATS *rd_stats) {
+ assert(IMPLIES(fast_mode, rd_stats == NULL));
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int cfl_plane = get_cfl_pred_type(plane);
+ CFL_SIGN_TYPE cfl_sign;
+ int cfl_alpha;
+ cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
+ // We only build CFL for a given plane; the other plane's sign is a dummy.
+ int dummy_sign = CFL_SIGN_NEG;
+ const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
+ const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
+ mbmi->cfl_alpha_signs =
+ PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
+ mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
+ int64_t cfl_cost;
+ if (fast_mode) {
+ cfl_cost =
+ intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+ } else {
+ av1_init_rd_stats(rd_stats);
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
+ tx_size, FTXS_NONE, 0);
+ av1_rd_cost_update(x->rdmult, rd_stats);
+ cfl_cost = rd_stats->rdcost;
+ }
+ mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;  // restore the saved CFL state
+ mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;      // overwritten above
+ return cfl_cost;
+}
+
+static const int cfl_dir_ls[2] = { 1, -1 };
+
+// If cfl_search_range is CFL_MAGS_SIZE, return CFL_INDEX_ZERO (zero alpha).
+// Otherwise return the index of the best alpha found using intra_model_rd().
+static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ int cfl_search_range) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+
+ if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->uv_mode == UV_CFL_PRED);
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ int est_best_cfl_idx = CFL_INDEX_ZERO;
+ int fast_mode = 1;
+ int start_cfl_idx = CFL_INDEX_ZERO;
+ int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ start_cfl_idx, fast_mode, NULL);
+ for (int si = 0; si < 2; ++si) {
+ const int dir = cfl_dir_ls[si];
+ for (int i = 1; i < CFL_MAGS_SIZE; ++i) {
+ int cfl_idx = start_cfl_idx + dir * i;
+ if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+ int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ cfl_idx, fast_mode, NULL);
+ if (cfl_cost < best_cfl_cost) {
+ best_cfl_cost = cfl_cost;
+ est_best_cfl_idx = cfl_idx;
+ } else {
+ break;
+ }
+ }
+ }
+ return est_best_cfl_idx;
+}
+
+static AOM_INLINE void set_invalid_cfl_parameters(
+ uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) {
+ *best_cfl_alpha_idx = 0;
+ *best_cfl_alpha_signs = 0;
+}
+
+// Runs the full (non-fast) RD evaluation of one chroma plane for the CfL
+// indices around est_best_cfl_idx.
+//
+// Every entry of cfl_rd_arr is invalidated first. The entry at
+// est_best_cfl_idx is then computed, followed by up to cfl_search_range - 1
+// neighbours in each direction listed in cfl_dir_ls, stopping at the array
+// bounds. Entries that are never visited stay invalid.
+static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              int plane, TX_SIZE tx_size, int cfl_search_range,
+                              RD_STATS cfl_rd_arr[CFL_MAGS_SIZE],
+                              int est_best_cfl_idx) {
+  assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(mbmi->uv_mode == UV_CFL_PRED);
+  const MACROBLOCKD_PLANE *const plane_data = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(
+      mbmi->bsize, plane_data->subsampling_x, plane_data->subsampling_y);
+  const int fast_mode = 0;
+
+  for (RD_STATS *entry = cfl_rd_arr; entry != cfl_rd_arr + CFL_MAGS_SIZE;
+       ++entry) {
+    av1_invalid_rd_stats(entry);
+  }
+
+  // The centre of the search window is always evaluated.
+  cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, est_best_cfl_idx,
+                 fast_mode, &cfl_rd_arr[est_best_cfl_idx]);
+
+  // With cfl_search_range == 1 the inner loop never iterates, so only the
+  // centre index is evaluated.
+  for (int dir_idx = 0; dir_idx < 2; ++dir_idx) {
+    const int step = cfl_dir_ls[dir_idx];
+    int cand = est_best_cfl_idx + step;
+    for (int dist = 1; dist < cfl_search_range; ++dist, cand += step) {
+      if (cand < 0 || cand >= CFL_MAGS_SIZE) break;
+      cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cand, fast_mode,
+                     &cfl_rd_arr[cand]);
+    }
+  }
+}
+
+/*!\brief Pick the optimal parameters for Chroma from Luma (CfL) component
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
+ * Side effect:
+ * We use the buffers in x->plane[] and xd->plane[] as throw-away buffers for RD
+ * search. xd->cfl.use_dc_pred_cache is set while the search runs, and
+ * clear_cfl_dc_pred_cache_flags() is called before every return.
+ *
+ * \param[in] x Encoder prediction block structure.
+ * \param[in] cpi Top-level encoder instance structure.
+ * \param[in] tx_size Transform size.
+ * \param[in] ref_best_rd Reference best RD.
+ * \param[in] cfl_search_range The search range of full RD search near the
+ * estimated best CFL parameter.
+ *
+ * \param[out] best_rd_stats RD stats of the best CFL parameter
+ * \param[out] best_cfl_alpha_idx Best CFL alpha index
+ * \param[out] best_cfl_alpha_signs Best CFL joint signs
+ *
+ * \return Returns 1 if a CfL parameter pair with an RD cost below ref_best_rd
+ * was found. Otherwise returns 0, with best_rd_stats invalidated and
+ * best_cfl_alpha_idx / best_cfl_alpha_signs reset to 0.
+ */
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+ TX_SIZE tx_size, int64_t ref_best_rd,
+ int cfl_search_range, RD_STATS *best_rd_stats,
+ uint8_t *best_cfl_alpha_idx,
+ int8_t *best_cfl_alpha_signs) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+ RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int est_best_cfl_idx_u, est_best_cfl_idx_v;
+
+ av1_invalid_rd_stats(best_rd_stats);  // Start from "nothing found".
+
+ // As the dc pred data is same for different values of alpha, enable the
+ // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before
+ // returning to avoid the unintentional usage of cached dc pred data.
+ xd->cfl.use_dc_pred_cache = true;
+ // Evaluate alpha parameter of each chroma plane.
+ est_best_cfl_idx_u =
+ cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range);
+ est_best_cfl_idx_v =
+ cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range);
+
+ if (cfl_search_range == 1) {
+ // For cfl_search_range=1, further refinement of alpha is not enabled. Hence
+ // CfL index=0 for both the chroma planes implies invalid CfL mode.
+ if (est_best_cfl_idx_u == CFL_INDEX_ZERO &&
+ est_best_cfl_idx_v == CFL_INDEX_ZERO) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+
+ int cfl_alpha_u, cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u);
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v);
+ const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ // Compute alpha and mode signaling rate.
+ const int rate_overhead =
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] +
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] +
+ mode_costs
+ ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED];
+ // Skip the CfL mode evaluation if the RD cost derived using the rate needed
+ // to signal the CfL mode and alpha parameter exceeds the ref_best_rd.
+ if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+ }
+
+ // Compute the rd cost of each chroma plane using the alpha parameters which
+ // were already evaluated.
+ cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u,
+ est_best_cfl_idx_u);
+ cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v,
+ est_best_cfl_idx_v);
+
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+
+ for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
+ if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;  // No valid U stats here.
+ int cfl_alpha_u;
+ CFL_SIGN_TYPE cfl_sign_u;
+ cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
+ for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
+ if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;  // No valid V stats here.
+ int cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_v;
+ cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
+ // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
+ // valid parameter for CFL
+ if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
+ int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ RD_STATS rd_stats = cfl_rd_arr_u[ui];
+ av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+ if (rd_stats.rate != INT_MAX) {
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+ }
+ av1_rd_cost_update(x->rdmult, &rd_stats);
+ if (rd_stats.rdcost < best_rd_stats->rdcost) {  // New best (U, V) pair.
+ *best_rd_stats = rd_stats;
+ *best_cfl_alpha_idx =
+ (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+ *best_cfl_alpha_signs = joint_sign;
+ }
+ }
+ }
+ if (best_rd_stats->rdcost >= ref_best_rd) {
+ av1_invalid_rd_stats(best_rd_stats);
+ // Set invalid CFL parameters here since the rdcost is not better than
+ // ref_best_rd.
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ return 0;
+ }
+ return 1;
+}
+
+// Decides whether UV_SMOOTH_PRED can be skipped for this block.
+//
+// Returns false when the prune_smooth_intra_mode_for_chroma speed feature is
+// off. Otherwise returns true only if every chroma plane has a source
+// variance below 20 (an empirically derived threshold).
+static bool should_prune_chroma_smooth_pred_based_on_source_variance(
+    const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) {
+  bool prune = cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma != 0;
+
+  // The loop ends as soon as one plane reaches the threshold.
+  for (int plane = AOM_PLANE_U;
+       prune && plane < av1_num_planes(&cpi->common); ++plane) {
+    const unsigned int plane_variance = av1_get_perpixel_variance_facade(
+        cpi, &x->e_mbd, &x->plane[plane].src, bsize, plane);
+    prune = plane_variance < 20;
+  }
+  return prune;
+}
+/* Searches the chroma (UV) intra prediction modes of the current block.
+ *
+ * Modes are visited in uv_rd_search_mode_order; palette is tried afterwards
+ * when it is enabled and allowed for the block size. The mode info of the
+ * best candidate is written back to xd->mi[0] before returning.
+ *
+ * rate, rate_tokenonly, distortion and skippable receive the statistics of
+ * the best candidate found. max_tx_size selects the entry of the
+ * intra_uv_mode_mask speed feature that is used to filter modes.
+ *
+ * Returns the best RD cost. If the block is not a chroma reference, the
+ * outputs are set to zero rate/distortion with skippable = 1 and INT64_MAX is
+ * returned.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_rd = INT64_MAX, this_rd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+
+ init_sbuv_mode(mbmi);
+
+ // Return if the current block does not correspond to a chroma block.
+ if (!xd->is_chroma_ref) {
+ *rate = 0;
+ *rate_tokenonly = 0;
+ *distortion = 0;
+ *skippable = 1;
+ return INT64_MAX;
+ }
+
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+ // this function everytime we search through uv modes. There is some
+ // potential speed up here if we cache the result to avoid redundant
+ // computation.
+ av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y,
+ DRY_RUN_NORMAL,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ xd->cfl.store_y = 0;
+ }
+ IntraModeSearchState intra_search_state;
+ init_intra_mode_search_state(&intra_search_state);
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+
+ // Search through all non-palette modes.
+ for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx];
+
+ // Skip the current mode evaluation if the RD cost derived using the mode
+ // signaling rate exceeds the best_rd so far.
+ const int mode_rate =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue;
+
+ PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode);
+ const int is_directional_mode = av1_is_directional_mode(intra_mode);
+
+ if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
+ continue;
+ if (is_directional_mode &&
+ !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+ continue;
+
+ if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << uv_mode)))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED &&
+ uv_mode <= UV_SMOOTH_H_PRED)
+ continue;
+
+ if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED)
+ continue;
+
+ assert(mbmi->mode < INTRA_MODES);
+ if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
+ !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)))
+ continue;
+
+ mbmi->uv_mode = uv_mode;
+
+ // Init variables for cfl and angle delta
+ const SPEED_FEATURES *sf = &cpi->sf;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ if (uv_mode == UV_CFL_PRED) {
+ if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue;
+ assert(!is_directional_mode);
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd,
+ sf->intra_sf.cfl_search_range, &tokenonly_rd_stats,
+ &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) {
+ continue;
+ }
+ } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+ intra_mode_cfg->enable_angle_delta) {
+ if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+ !intra_search_state.dir_mode_skip_mask_ready) {
+ static const float thresh[2][4] = {
+ { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe
+ { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe
+ };
+ const int is_chroma = 1;
+ const int is_intra_frame = frame_is_intra_only(cm);
+ prune_intra_mode_with_hog(
+ x, bsize, cm->seq_params->sb_size,
+ thresh[is_intra_frame]
+ [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+ intra_search_state.directional_mode_skip_mask, is_chroma);
+ intra_search_state.dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state.directional_mode_skip_mask[uv_mode]) {
+ continue;
+ }
+
+ // Search through angle delta
+ const int rate_overhead =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+ if (uv_mode == UV_SMOOTH_PRED &&
+ should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x,
+ bsize))
+ continue;
+
+ // Predict directly if we don't need to search for angle delta.
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+ continue;
+ }
+ }
+ const int mode_cost =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;  // Snapshot the mode info of the new best mode.
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+
+ // Search palette mode
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ if (try_palette) {
+ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ av1_rd_pick_palette_intra_sbuv(
+ cpi, x,
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+ distortion, skippable);
+ }
+
+ *mbmi = best_mbmi;  // Commit the winning mode info to the block.
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+/* Evaluates luma palette mode for the current block and, when the frame has
+ * more than one plane, adds the chroma intra statistics held in
+ * intra_search_state (running av1_rd_pick_intra_sbuv_mode() first if
+ * rate_uv_intra is still INT_MAX).
+ *
+ * On success this_rd_cost->rate, ->dist and ->rdcost are filled in and the
+ * combined skip flag is returned. If the luma palette search produces no
+ * valid result, this_rd_cost->rdcost is set to INT64_MAX and 0 is returned.
+ * NOTE(review): this_rd_cost->skip_txfm is not written here; callers
+ * presumably rely on the return value instead - confirm.
+ */
+// Searches palette mode for luma channel in inter frame.
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate2 = 0;
+ int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd;
+ int skippable = 0;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const intra_mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ av1_zero(pmi->palette_size);
+
+ RD_STATS rd_stats_y;
+ av1_invalid_rd_stats(&rd_stats_y);
+ av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map,
+ &best_rd_palette, &rd_stats_y.rate, NULL,
+ &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+ ctx, best_blk_skip, best_tx_type_map);
+ if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+ this_rd_cost->rdcost = INT64_MAX;
+ return skippable;  // Still 0 at this point.
+ }
+
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+
+ skippable = rd_stats_y.skip_txfm;
+ distortion2 = rd_stats_y.dist;
+ rate2 = rd_stats_y.rate + ref_frame_cost;
+ if (num_planes > 1) {
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // We have not found any good uv mode yet, so we need to search for it.
+ TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ }
+
+ // We have found at least one good uv mode before, so copy and paste it
+ // over.
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+ skippable = skippable && intra_search_state->skip_uvs;
+ distortion2 += intra_search_state->dist_uvs;
+ rate2 += intra_search_state->rate_uv_intra;
+ }
+
+ if (skippable) {
+ rate2 -= rd_stats_y.rate;  // Drop the luma rate when the block is skipped.
+ if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+ } else {
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ this_rd_cost->rate = rate2;
+ this_rd_cost->dist = distortion2;
+ this_rd_cost->rdcost = this_rd;
+ return skippable;
+}
+
+// Luma-only variant of the palette search: evaluates palette mode for the Y
+// plane and reports the resulting rate, distortion, RD cost and skip flag in
+// this_rd_cost. If no valid palette is found, only this_rd_cost->rdcost is
+// written (set to INT64_MAX).
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  RD_STATS *this_rd_cost, int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const ModeCosts *const mode_costs = &x->mode_costs;
+  uint8_t *const best_palette_color_map =
+      x->palette_buffer->best_palette_color_map;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  int64_t best_rd_palette = best_rd;
+  // Taken before the mode fields below are overwritten.
+  MB_MODE_INFO best_mbmi_palette = *mbmi;
+
+  // Start from a plain intra DC block without any palette.
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  av1_zero(pmi->palette_size);
+
+  RD_STATS y_stats;
+  av1_invalid_rd_stats(&y_stats);
+  av1_rd_pick_palette_intra_sby(
+      cpi, x, bsize, mode_costs->mbmode_cost[size_group_lookup[bsize]][DC_PRED],
+      &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+      &y_stats.rate, NULL, &y_stats.dist, &y_stats.skip_txfm, NULL, ctx,
+      best_blk_skip, best_tx_type_map);
+
+  const int palette_found =
+      y_stats.rate != INT_MAX && pmi->palette_size[0] != 0;
+  if (!palette_found) {
+    this_rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  // Publish the winning transform / colour-index state.
+  memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+  av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+  const int num_rows = block_size_high[bsize];
+  const int num_cols = block_size_wide[bsize];
+  memcpy(color_map, best_palette_color_map,
+         num_rows * num_cols * sizeof(best_palette_color_map[0]));
+
+  // A skipped block pays only the reference-frame and skip-flag costs;
+  // otherwise both are added on top of the luma rate.
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  if (y_stats.skip_txfm) {
+    y_stats.rate = ref_frame_cost + mode_costs->skip_txfm_cost[skip_ctx][1];
+  } else {
+    y_stats.rate += ref_frame_cost;
+    y_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+  }
+
+  this_rd_cost->rate = y_stats.rate;
+  this_rd_cost->dist = y_stats.dist;
+  this_rd_cost->rdcost = RDCOST(x->rdmult, y_stats.rate, y_stats.dist);
+  this_rd_cost->skip_txfm = y_stats.skip_txfm;
+}
+
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * When the candidate beats *best_rd, the output parameters and the blk_skip /
+ * tx_type_map arrays in ctx are updated; otherwise they are left untouched.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
+static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, const int *bmode_costs,
+                                      int64_t *best_rd, int *rate,
+                                      int *rate_tokenonly, int64_t *distortion,
+                                      uint8_t *skippable,
+                                      MB_MODE_INFO *best_mbmi,
+                                      PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Winner-mode evaluation avoids RD-based breakouts in the transform search
+  // unless use_rd_based_breakout_for_intra_tx_search asks for them, so the
+  // reference RD defaults to "no limit".
+  int64_t tx_search_ref_rd = INT64_MAX;
+  if (cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search) {
+    tx_search_ref_rd = *best_rd;
+  }
+
+  RD_STATS y_stats;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &y_stats, bsize, tx_search_ref_rd);
+  if (y_stats.rate == INT_MAX) return 0;
+
+  // The transform search includes the tx_size cost in its rate. For intra
+  // blocks that cost belongs to the full rate only, so it is removed from the
+  // token-only figure whenever tx_size is actually signalled.
+  int tokenonly_rate = y_stats.rate;
+  const int tx_size_is_signaled =
+      !xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize);
+  if (tx_size_is_signaled) {
+    tokenonly_rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+
+  const int mode_info_rate = intra_mode_info_cost_y(
+      cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+  const int total_rate = y_stats.rate + mode_info_rate;
+  const int64_t candidate_rd = RDCOST(x->rdmult, total_rate, y_stats.dist);
+  if (candidate_rd >= *best_rd) return 0;
+
+  // New winner: record its mode info, statistics and transform state.
+  *best_mbmi = *mbmi;
+  *best_rd = candidate_rd;
+  *rate = total_rate;
+  *rate_tokenonly = tokenonly_rate;
+  *distortion = y_stats.dist;
+  *skippable = y_stats.skip_txfm;
+  av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+                 ctx->num_4x4_blk);
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+  return 1;
+}
+
+/*!\brief Search for the best filter_intra mode when coding inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ * The loop stops early once half of a mode's RD cost exceeds best_rd.
+ *
+ * A filter_intra mode is selected only if its RD cost is lower than
+ * best_rd_so_far; rd_stats_y is overwritten only in that case. On return,
+ * mbmi->tx_size, xd->tx_type_map and x->txfm_search_info.blk_skip hold the
+ * values belonging to the selected mode, or the values they had on entry if
+ * no mode was selected. use_filter_intra is cleared when nothing was selected.
+ *
+ * \param[in] cpi Top-level encoder instance structure.
+ * \param[in] x Encoder prediction block structure.
+ * \param[in] bsize Current block size.
+ * \param[in] ctx Supplies num_4x4_blk, the number of entries copied for
+ * blk_skip and tx_type_map.
+ * \param[in,out] rd_stats_y Luma RD stats; replaced by those of the selected
+ * filter_intra mode.
+ * \param[in] mode_cost Mode signaling cost passed to
+ * intra_mode_info_cost_y().
+ * \param[in] best_rd Passed to the transform search and used for the
+ * early-exit test.
+ * \param[in] best_rd_so_far RD cost a filter_intra mode has to beat.
+ *
+ * \remark Returns nothing, but updates the mbmi and rd_stats.
+ */
+static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ const PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats_y, int mode_cost,
+ int64_t best_rd,
+ int64_t best_rd_so_far) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->mode == DC_PRED &&
+ av1_filter_intra_allowed_bsize(&cpi->common, bsize));
+
+ RD_STATS rd_stats_y_fi;
+ int filter_intra_selected_flag = 0;
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
+ ++fi_mode) {
+ mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
+ if (rd_stats_y_fi.rate == INT_MAX) continue;
+ const int this_rate_tmp =
+ rd_stats_y_fi.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int64_t this_rd_tmp =
+ RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+ break;  // Far worse than best_rd: stop trying further modes.
+ }
+ if (this_rd_tmp < best_rd_so_far) {
+ best_tx_size = mbmi->tx_size;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ best_fi_mode = fi_mode;
+ *rd_stats_y = rd_stats_y_fi;
+ filter_intra_selected_flag = 1;
+ best_rd_so_far = this_rd_tmp;
+ }
+ }
+
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+
+ if (filter_intra_selected_flag) {
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+ } else {
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ }
+}
+/* Evaluates the luma intra mode currently stored in xd->mi[0]->mode.
+ *
+ * Returns 1 if the mode survives all pruning checks, in which case
+ * rd_stats_y, *mode_cost_y and *rd_y describe it. Returns 0 if the mode is
+ * rejected; on some rejection paths intra_search_state->skip_intra_modes is
+ * also set to 1. For DC_PRED on block sizes that allow it, filter-intra
+ * modes are searched as well and may replace the plain DC_PRED result.
+ */
+// Evaluate a given luma intra-mode in inter frames.
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+ const PREDICTION_MODE mode = mbmi->mode;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+ int known_rate = mode_cost;
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
+ cm->seq_params->bit_depth);
+
+ if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
+ mode_costs->skip_txfm_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > best_rd) {  // Signaling cost alone already exceeds best_rd.
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+
+ const int is_directional_mode = av1_is_directional_mode(mode);
+ if (is_directional_mode && av1_use_angle_delta(bsize) &&
+ cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
+ if (intra_sf->intra_pruning_with_hog &&
+ !intra_search_state->dir_mode_skip_mask_ready) {
+ const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ intra_search_state->directional_mode_skip_mask,
+ is_chroma);
+ intra_search_state->dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
+ }
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ return 0;
+ av1_init_rd_stats(rd_stats_y);
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
+
+ // Pick filter intra modes.
+ if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ int try_filter_intra = 1;
+ int64_t best_rd_so_far = INT64_MAX;
+ if (rd_stats_y->rate != INT_MAX) {
+ // best_rd_so_far is the rdcost of DC_PRED without using filter_intra.
+ // Later, in filter intra search, best_rd_so_far is used for comparison.
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ const int tmp_rate =
+ rd_stats_y->rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = (best_rd_so_far / 2) <= best_rd;
+ } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) {
+ // As rd cost of luma intra dc mode is more than best_rd (i.e.,
+ // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes.
+ try_filter_intra = 0;
+ }
+
+ if (try_filter_intra) {
+ handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
+ best_rd, best_rd_so_far);
+ }
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;
+
+ *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int rate_y = rd_stats_y->skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : rd_stats_y->rate;
+ *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist);
+ if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) {
+ intra_search_state->skip_intra_modes = 1;  // Luma RD > 1.25 * best_rd.
+ return 0;
+ }
+
+ return 1;
+}
+/* Obtains chroma intra statistics for an intra block in an inter frame.
+ *
+ * If intra_search_state holds no chroma result yet (rate_uv_intra ==
+ * INT_MAX), av1_rd_pick_intra_sbuv_mode() is run and its outcome is cached in
+ * intra_search_state. When the token-only chroma RD cost of that result
+ * exceeds best_rd, skip_intra_modes is set and 0 is returned.
+ *
+ * Otherwise the cached result is copied into rd_stats_uv and xd->mi[0],
+ * rd_stats->skip_txfm is set to the AND of the luma and chroma skip flags,
+ * and 1 is returned.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+
+ // TODO(chiyotsai@google.com): Consolidate the chroma search code here with
+ // the one in av1_search_palette_mode.
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize);
+
+ assert(intra_search_state->rate_uv_intra == INT_MAX);
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // If no good uv-predictor had been found, search for it.
+ const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ if (try_palette) intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = intra_search_state->rate_uv_tokenonly;
+ const int64_t uv_dist = intra_search_state->dist_uvs;
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > best_rd) {
+ // If there is no good intra uv-mode available, we can skip all intra
+ // modes.
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+ }
+
+ // If we are here, then the encoder has found at least one good intra uv
+ // predictor, so we can directly copy its statistics over.
+ // TODO(any): the stats here is not right if the best uv mode is CFL but the
+ // best y mode is palette.
+ rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+ rd_stats_uv->dist = intra_search_state->dist_uvs;
+ rd_stats_uv->skip_txfm = intra_search_state->skip_uvs;
+ rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm;
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ if (try_palette) {
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+
+ return 1;
+}
+
+// Decides whether an odd luma delta angle can be skipped, based on the RD
+// costs already recorded for the neighbouring even delta angles of the same
+// directional mode.
+//
+// Returns 0 (do not prune) when the feature is off, the mode is not
+// directional, the delta angle is even, or best_rd is still INT64_MAX.
+// Otherwise returns non-zero only if both neighbouring costs exceed
+// best_rd + best_rd / 8.
+//
+// The two costs read are intra_modes_rd_cost[delta + MAX_ANGLE_DELTA] and
+// intra_modes_rd_cost[delta + MAX_ANGLE_DELTA + 2], which are meant to cover:
+//   Delta angle        Delta angle rdcost
+//   to be pruned       to be considered
+//       -3                   -2
+//       -1                   -2, 0
+//        1                    0, 2
+//        3                    2
+static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost(
+    const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost,
+    int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) {
+  const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+
+  if (!prune_luma_odd_delta_angles_in_intra) return 0;
+  if (!av1_is_directional_mode(mbmi->mode)) return 0;
+  const int delta_is_odd = abs(luma_delta_angle) & 1;
+  if (!delta_is_odd) return 0;
+  if (best_rd == INT64_MAX) return 0;
+
+  const int64_t rd_thresh = best_rd + (best_rd >> 3);
+  const int lower_idx = luma_delta_angle + MAX_ANGLE_DELTA;
+
+  // The upper neighbour is only read if the lower one is above the threshold.
+  if (intra_modes_rd_cost[lower_idx] <= rd_thresh) return 0;
+  return intra_modes_rd_cost[lower_idx + 2] > rd_thresh;
+}
+
+/* Finds the best non-intrabc luma mode on an intra frame.
+ *
+ * Search order: the regular intra modes (including their angle deltas), then
+ * palette, then filter_intra, followed by optional winner-mode refinement.
+ * In the regular-mode loop a candidate that beats best_rd overwrites *rate,
+ * *rate_tokenonly, *distortion, *skippable and the blk_skip / tx_type_map
+ * arrays in ctx; the palette, filter_intra and winner-mode helpers receive
+ * the same output pointers.
+ * Returns the best rd cost found, or INT64_MAX when no candidate beat the
+ * best_rd that was passed in.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int64_t best_model_rd = INT64_MAX;
+ int is_directional_mode;
+ uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 };
+ // Flag to check rd of any intra mode is better than best_rd passed to this
+ // function
+ int beat_best_rd = 0;
+ const int *bmode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ uint8_t *best_palette_color_map =
+ try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ if (intra_sf->intra_pruning_with_hog) {
+ // Less aggressive thresholds are used here than those used in inter frame
+ // encoding in av1_handle_intra_y_mode() because we want key frames/intra
+ // frames to have higher quality.
+ const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ directional_mode_skip_mask, is_chroma);
+ }
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ pmi->palette_size[0] = 0;
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+
+ // Searches the intra-modes except for intrabc, palette, and filter_intra.
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+
+ // Initialize the rdcost corresponding to all the directional and
+ // non-directional intra modes.
+ // 1. For directional modes, it stores the rdcost values for delta angles -4,
+ // -3, ..., 3, 4.
+ // 2. The rdcost value for luma_delta_angle is stored at index
+ // luma_delta_angle + MAX_ANGLE_DELTA + 1.
+ // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4
+ // (array indices 0 and 8) are always set to INT64_MAX (the initial value).
+ int64_t intra_modes_rd_cost[INTRA_MODE_END]
+ [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY];
+ for (int i = 0; i < INTRA_MODE_END; i++) {
+ for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) {
+ intra_modes_rd_cost[i][j] = INT64_MAX;
+ }
+ }
+
+ for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+ ++mode_idx) {
+ set_y_mode_and_delta_angle(mode_idx, mbmi,
+ intra_sf->prune_luma_odd_delta_angles_in_intra);
+ RD_STATS this_rd_stats;
+ int this_rate, this_rate_tokenonly, s;
+ int is_diagonal_mode;
+ int64_t this_distortion, this_rd;
+ const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+
+ is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode);
+ if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !intra_mode_cfg->enable_directional_intra)
+ continue;
+
+ // The smooth prediction mode appears to be more frequently picked
+ // than horizontal / vertical smooth prediction modes. Hence treat
+ // them differently in speed features.
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ intra_sf->disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED)
+ continue;
+
+ // The functionality of filter intra modes and smooth prediction
+ // overlap. Hence smooth prediction is pruned only if all the
+ // filter intra modes are enabled.
+ if (intra_sf->disable_smooth_intra &&
+ intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED)
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue;
+
+ is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+ if (is_directional_mode &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ luma_delta_angle != 0)
+ continue;
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] &
+ (1 << mbmi->mode)))
+ continue;
+
+ if (prune_luma_odd_delta_angles_using_rd_cost(
+ mbmi, intra_modes_rd_cost[mbmi->mode], best_rd,
+ intra_sf->prune_luma_odd_delta_angles_in_intra))
+ continue;
+
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ continue;
+
+ // Builds the actual prediction. The prediction from
+ // intra_model_rd() above was just an estimation that did not take into
+ // account the effect of txfm pipeline, so we need to redo it for real
+ // here.
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+ this_rate_tokenonly = this_rd_stats.rate;
+ this_distortion = this_rd_stats.dist;
+ s = this_rd_stats.skip_txfm;
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+ // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+ // coded (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ this_rate =
+ this_rd_stats.rate + // unadjusted rate: still includes the tx_size cost
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+ this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] =
+ this_rd; // read back by prune_luma_odd_delta_angles_using_rd_cost()
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ // Setting beat_best_rd flag because current mode rd is better than
+ // best_rd passed to this function
+ beat_best_rd = 1;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ // Searches palette
+ if (try_palette) {
+ av1_rd_pick_palette_intra_sby(
+ cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+ &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+ ctx, ctx->blk_skip, ctx->tx_type_map);
+ }
+
+ // Searches filter_intra
+ if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+ if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, bmode_costs[DC_PRED],
+ best_mbmi.mode, &best_rd, &best_model_rd,
+ ctx)) {
+ best_mbmi = *mbmi;
+ }
+ }
+
+ // No mode is identified with less rd value than best_rd passed to this
+ // function. In such cases winner mode processing is not necessary and return
+ // best_rd as INT64_MAX to indicate best mode is not identified
+ if (!beat_best_rd) return INT64_MAX;
+
+ // In multi-winner mode processing, perform tx search for few best modes
+ // identified during mode evaluation. Winner mode processing uses best tx
+ // configuration for tx search.
+ if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
+ int best_mode_idx = 0;
+ int block_width, block_height;
+ uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+
+ for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
+ *mbmi = x->winner_mode_stats[mode_idx].mbmi;
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Restore color_map of palette mode before winner mode processing
+ if (mbmi->palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // Winner mode processing
+ // If previous searches use only the default tx type/no R-D optimization
+ // of quantized coeffs, do an extra search for the best tx type/better
+ // R-D optimization of quantized coeffs
+ if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi,
+ ctx))
+ best_mode_idx = mode_idx;
+ }
+ }
+ // Copy color_map of palette mode for final winner mode
+ if (best_mbmi.palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[best_mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ } else {
+ // If previous searches use only the default tx type/no R-D optimization of
+ // quantized coeffs, do an extra search for the best tx type/better R-D
+ // optimization of quantized coeffs
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+ *mbmi = best_mbmi; // NOTE(review): the check above ran on the pre-restore *mbmi; confirm intended
+ intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi, ctx);
+ }
+ }
+ *mbmi = best_mbmi;
+ av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+ return best_rd;
+}
diff --git a/third_party/aom/av1/encoder/intra_mode_search.h b/third_party/aom/av1/encoder/intra_mode_search.h
new file mode 100644
index 0000000000..75289c4e3c
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.h
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes a histogram of gradient speed features and a cache of uv
+ * prediction to avoid repeated search of chroma prediction.
+ */
+typedef struct IntraModeSearchState {
+ /*!
+ * \brief The best luma intra-mode found so far
+ */
+ PREDICTION_MODE best_intra_mode;
+
+ /** \name Speed feature variables
+ * Variables to help with pruning some luma intra-modes during inter frame
+ * coding process.
+ */
+ /**@{*/
+ /*!
+ * \brief Whether to terminate all intra mode search.
+ */
+ int skip_intra_modes;
+ /*!
+ * \brief Whether a directional mode is pruned.
+ */
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ /*!
+ * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+ */
+ int dir_mode_skip_mask_ready;
+ /**@}*/
+
+ /** \name Chroma mode search cache
+ * A cache of the best chroma prediction mode to avoid having to search for
+ * chroma predictions repeatedly in \ref
+ * av1_search_intra_uv_modes_in_interframe()
+ */
+ /**@{*/
+ int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */
+ int rate_uv_tokenonly; /*!< \brief Rate to transmit txfm tokens */
+ int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */
+ uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */
+ UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+ PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */
+ int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv directional */
+ /**@}*/
+} IntraModeSearchState;
+
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks
+ * to copy tx_type and txfm_skip arrays.
+ * \param[out] rd_stats_y Struct to keep track of the current
+ * intra-mode's rd_stats (luma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[out] mode_cost_y The cost needed to signal the current
+ * intra mode.
+ * \param[out] rd_y The rdcost of the chosen mode.
+ * \param[in] best_model_rd Best model RD seen for this block so far
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ *
+ * \note (review) best_model_rd and top_intra_model_rd are passed through
+ * non-const pointers although they are documented as [in]; confirm against
+ * the definition whether they are updated.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with directional
+ * mode, a prune_mask computed with histogram of gradient is also stored in
+ * intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]);
+
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[out] rd_stats Struct to keep track of the current
+ * intra-mode's rd_stats (all planes).
+ * \param[in] rd_stats_y The current intra-mode's rd_stats
+ * (luma only); passed as a pointer to
+ * const, so it is read but not modified.
+ * \param[out] rd_stats_uv Struct to keep track of the current
+ * intra-mode's rd_stats (chroma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats and rd_stats_uv are
+ * also updated. Moreover, in the first invocation of the function, the chroma
+ * intra mode result is cached in intra_search_state to be used in subsequent
+ * calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in,out] intra_search_state Structure to hold the best luma intra mode
+ * and cache chroma prediction for speed up.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in,out] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in,out] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_cost, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame. Unlike av1_search_palette_mode(), it takes no
+ * IntraModeSearchState argument and returns nothing.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \note (review) x, ctx and this_rd_cost are non-const pointers and the
+ * function returns void, so results are presumably reported through them;
+ * confirm against the definition.
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out] rate The total rate needed to predict the current
+ * luma block.
+ * \param[out] rate_tokenonly The rate without the cost of sending the
+ * prediction modes.
+ * \param[out] distortion The luma distortion of the best prediction
+ * after the reconstruction.
+ * \param[out] skippable Whether we can skip txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[in,out] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays; the
+ * arrays themselves are written on success.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx. The output arguments are
+ * only written when a mode better than best_rd is found.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out] rate The total rate needed to predict the current
+ * chroma block.
+ * \param[out] rate_tokenonly The rate without the cost of sending the
+ * prediction modes.
+ * \param[out] distortion The chroma distortion of the best prediction
+ * after the reconstruction.
+ * \param[out] skippable Whether we can skip txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] max_tx_size The maximum tx_size available
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate and the distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Count the number of colors in src. Used by palette mode.
+ *
+ * The function itself returns void; results are reported through the
+ * val_count and num_colors pointers (presumably per-value counts and the
+ * number of distinct colors; confirm against the definition).
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ *
+ * Compared with the low bit depth variant this takes an extra bit_depth
+ * argument and two extra non-const pointers, val_count_8bit and
+ * num_color_bins (presumably additional outputs; their exact meaning is not
+ * visible from this declaration).
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *val_count_8bit, int *num_color_bins,
+ int *num_colors);
+
+/*! \brief Resets an \ref IntraModeSearchState before a new search.
+ *
+ * Every byte of the structure is cleared, after which rate_uv_intra is set
+ * to INT_MAX.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+    IntraModeSearchState *intra_search_state) {
+  IntraModeSearchState *const state = intra_search_state;
+  memset(state, 0, sizeof(IntraModeSearchState));
+  state->rate_uv_intra = INT_MAX;
+}
+
+/*! \brief Set the luma intra mode and delta angle for a given mode index.
+ * The total number of luma intra modes is LUMA_MODE_COUNT = 61.
+ * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional
+ * modes. Each of the main 8 directional modes has 6 = MAX_ANGLE_DELTA * 2
+ * delta angles.
+ * \param[in] mode_idx mode index in intra mode decision
+ * process.
+ * \param[in,out] mbmi Pointer to structure holding the mode
+ * info for the current macroblock; the
+ * selected mode and delta angle are
+ * written here.
+ * \param[in] reorder_delta_angle_eval Indicates whether to reorder the
+ * evaluation of delta angle modes.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval);
+
+/*! \brief Prune luma intra mode based on the model rd.
+ * \param[in] this_model_rd model rd for current mode.
+ * \param[in] best_model_rd Best model RD seen for this block so
+ * far.
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ * \param[in] max_model_cnt_allowed The maximum number of top intra
+ * model RD allowed.
+ * \param[in] model_rd_index_for_pruning Index of the candidate used for
+ * pruning based on model rd.
+ *
+ * \note (review) best_model_rd and top_intra_model_rd are non-const although
+ * documented as [in]; presumably they are updated with the new model rd.
+ * Confirm against the definition.
+ *
+ * \return The caller in av1_rd_pick_intra_sby_mode() skips the current mode
+ * when the returned value is nonzero.
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search_utils.h b/third_party/aom/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000000..107c2236f8
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+// Macro for computing the speed-preset dependent threshold which is used for
+// deciding whether to enable/disable variance calculations in
+// intra_rd_variance_factor().
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X)))
+
+#define BINS 32
+static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = {
+ 0.450578f, 0.695518f, -0.717944f, -0.639894f,
+ -0.602019f, -0.453454f, 0.055857f, -0.465480f,
+};
+
+static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+ -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+ -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+ -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f,
+ 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f,
+ -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+ -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f,
+ -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+ -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+ -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+ -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+ -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f,
+ -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+ -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+ -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+ 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f,
+ 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f,
+ -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f,
+ 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f,
+ 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+ -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+ -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+ -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+ -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f,
+ 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f,
+ -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+ -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+ -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+ 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f,
+ -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+ -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+ -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+ -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+ -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+ -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+ -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+ -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+ 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f,
+ 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+ -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+ -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+ -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f,
+ 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f,
+ -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static const NN_CONFIG av1_intra_hog_model_nnconfig = { // BINS HOG inputs -> DIRECTIONAL_MODES outputs, no hidden layer
+ BINS, // num_inputs
+ DIRECTIONAL_MODES, // num_outputs
+ 0, // num_hidden_layers
+ { 0 }, // nothing to size with zero hidden layers (presumably num_hidden_nodes; confirm against NN_CONFIG)
+ {
+ av1_intra_hog_model_weights, // BINS * DIRECTIONAL_MODES weights of the only layer
+ },
+ {
+ av1_intra_hog_model_bias, // DIRECTIONAL_MODES biases, one per output
+ },
+};
+
+#define FIX_PREC_BITS (16)
+// Maps a gradient (dx, dy) to a histogram bin in [0, BINS) based on the
+// fixed-point slope dy / dx (FIX_PREC_BITS fractional bits). dx is used as a
+// divisor, so it must be nonzero.
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+  // Inclusive upper bound of each bin, in ascending order.
+  static const int thresholds[BINS] = {
+    -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+    -59392,   -48579,  -39272,  -30982,  -23445,  -16400,  -9715,  -3194,
+    3227,     9748,    16433,   23478,   31015,   39305,   48611,  59425,
+    72336,    88392,   109364,  138593,  183191,  261638,  441831, INT32_MAX
+  };
+  const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+  // First select the group of 8 consecutive bins that contains the slope,
+  // then scan that group linearly. Dividing into segments of size 8 gives
+  // better performance than binary search here.
+  int first_idx;
+  if (ratio <= thresholds[7]) {
+    first_idx = 0;
+  } else if (ratio <= thresholds[15]) {
+    first_idx = 8;
+  } else if (ratio <= thresholds[23]) {
+    first_idx = 16;
+  } else {
+    first_idx = 24;
+  }
+  const int last_idx = first_idx + 7;
+
+  int idx = first_idx;
+  while (idx <= last_idx) {
+    if (ratio <= thresholds[idx]) return idx;
+    ++idx;
+  }
+  assert(0 && "No valid histogram bin found!");
+  return BINS - 1;
+}
+#undef FIX_PREC_BITS
+
+// Divides each of the BINS histogram entries in hist by total.
+static AOM_INLINE void normalize_hog(float total, float *hist) {
+  float *const hist_end = hist + BINS;
+  for (float *bin = hist; bin != hist_end; ++bin) {
+    *bin /= total;
+  }
+}
+
+// Accumulates a histogram of gradient directions over the interior pixels of
+// an 8-bit block (the first/last row and column are skipped because the 3x3
+// Sobel window needs all eight neighbours) and normalizes it. hist is only
+// added to, never cleared, so the caller provides its initial contents.
+static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride,
+                                          int rows, int cols, float *hist) {
+  // Starts above zero, which keeps the final division well-defined even when
+  // no gradient is found.
+  float total = 0.1f;
+  const uint8_t *row = src + stride;
+  for (int r = 1; r < rows - 1; ++r, row += stride) {
+    const uint8_t *const prev = row - stride;
+    const uint8_t *const next = row + stride;
+    for (int c = 1; c < cols - 1; ++c) {
+      // Calculate gradient using Sobel filters.
+      const int dx = (prev[c + 1] + 2 * row[c + 1] + next[c + 1]) -
+                     (prev[c - 1] + 2 * row[c - 1] + next[c - 1]);
+      const int dy = (next[c - 1] + 2 * next[c] + next[c + 1]) -
+                     (prev[c - 1] + 2 * prev[c] + prev[c + 1]);
+      // The magnitude is zero exactly when dx and dy are both zero.
+      const int grad_mag = abs(dx) + abs(dy);
+      if (grad_mag == 0) continue;
+      total += grad_mag;
+      if (dx != 0) {
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += grad_mag;
+      } else {
+        // A vertical gradient is shared between the two outermost bins
+        // (integer halves).
+        const int half = grad_mag / 2;
+        hist[0] += half;
+        hist[BINS - 1] += half;
+      }
+    }
+  }
+
+  normalize_hog(total, hist);
+}
+
+// Fills x->pixel_gradient_info for one plane of a superblock (LBD encode).
+// For every interior pixel it records whether dx is zero, |dx| + |dy|, and
+// the histogram bin index (-1 when dx is zero). Entries for the first/last
+// row and column are not written.
+static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                      BLOCK_SIZE sb_size,
+                                                      PLANE_TYPE plane) {
+  const int stride = x->plane[plane].src.stride;
+  const int sb_width =
+      block_size_wide[sb_size] >> x->e_mbd.plane[plane].subsampling_x;
+  const int sb_height =
+      block_size_high[sb_size] >> x->e_mbd.plane[plane].subsampling_y;
+  PixelLevelGradientInfo *const grad_info_sb =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+
+  const uint8_t *row = x->plane[plane].src.buf;
+  row += stride;
+  for (int r = 1; r < sb_height - 1; ++r, row += stride) {
+    const uint8_t *const prev = row - stride;
+    const uint8_t *const next = row + stride;
+    PixelLevelGradientInfo *const grad_row = grad_info_sb + r * sb_width;
+    for (int c = 1; c < sb_width - 1; ++c) {
+      // Calculate gradient using Sobel filters.
+      const int dx = (prev[c + 1] + 2 * row[c + 1] + next[c + 1]) -
+                     (prev[c - 1] + 2 * row[c - 1] + next[c - 1]);
+      const int dy = (next[c - 1] + 2 * next[c] + next[c + 1]) -
+                     (prev[c - 1] + 2 * prev[c] + prev[c + 1]);
+      PixelLevelGradientInfo *const info = &grad_row[c];
+      info->is_dx_zero = (dx == 0);
+      info->abs_dx_abs_dy_sum = (uint16_t)(abs(dx) + abs(dy));
+      info->hist_bin_idx = (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride,
+                                           int rows, int cols, float *hist) {
+  // High bit-depth variant: 'src8' is converted with CONVERT_TO_SHORTPTR and
+  // read as 16-bit samples. Builds a gradient-magnitude-weighted orientation
+  // histogram over the interior pixels and passes it to normalize_hog().
+  const uint16_t *const base = CONVERT_TO_SHORTPTR(src8);
+  float grad_total = 0.1f;
+  for (int r = 1; r < rows - 1; ++r) {
+    const uint16_t *const cur = base + r * stride;
+    const uint16_t *const up = cur - stride;
+    const uint16_t *const down = cur + stride;
+    for (int c = 1; c < cols - 1; ++c) {
+      // Sobel responses: horizontal (dx) and vertical (dy).
+      const int dx = (up[c + 1] + 2 * cur[c + 1] + down[c + 1]) -
+                     (up[c - 1] + 2 * cur[c - 1] + down[c - 1]);
+      const int dy = (down[c - 1] + 2 * down[c] + down[c + 1]) -
+                     (up[c - 1] + 2 * up[c] + up[c + 1]);
+      // |dx| + |dy| is zero exactly when both components are zero.
+      const int mag = abs(dx) + abs(dy);
+      if (mag == 0) continue;
+      grad_total += mag;
+      if (dx != 0) {
+        const int bin = get_hist_bin_idx(dx, dy);
+        assert(bin >= 0 && bin < BINS);
+        hist[bin] += mag;
+      } else {
+        // dx == 0: half of the magnitude (integer division) goes to each of
+        // the first and the last bin.
+        hist[0] += mag / 2;
+        hist[BINS - 1] += mag / 2;
+      }
+    }
+  }
+
+  normalize_hog(grad_total, hist);
+}
+
+// Fills the per-pixel gradient cache of one plane type for a high bit-depth
+// superblock. For every interior pixel it records whether dx is zero, the sum
+// |dx| + |dy|, and the histogram bin index (-1 when dx is zero).
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                       BLOCK_SIZE sb_size,
+                                                       PLANE_TYPE plane) {
+  const uint16_t *const base = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf);
+  const int stride = x->plane[plane].src.stride;
+  const int sb_width =
+      block_size_wide[sb_size] >> x->e_mbd.plane[plane].subsampling_x;
+  const int sb_height =
+      block_size_high[sb_size] >> x->e_mbd.plane[plane].subsampling_y;
+  // Each plane type owns a MAX_SB_SQUARE-sized slice of the cache.
+  PixelLevelGradientInfo *const cache =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+  for (int r = 1; r < sb_height - 1; ++r) {
+    const uint16_t *const cur = base + r * stride;
+    const uint16_t *const up = cur - stride;
+    const uint16_t *const down = cur + stride;
+    PixelLevelGradientInfo *const cache_row = cache + r * sb_width;
+    for (int c = 1; c < sb_width - 1; ++c) {
+      // Sobel responses: horizontal (dx) and vertical (dy).
+      const int dx = (up[c + 1] + 2 * cur[c + 1] + down[c + 1]) -
+                     (up[c - 1] + 2 * cur[c - 1] + down[c - 1]);
+      const int dy = (down[c - 1] + 2 * down[c] + down[c + 1]) -
+                     (up[c - 1] + 2 * up[c] + up[c + 1]);
+      PixelLevelGradientInfo *const entry = &cache_row[c];
+      entry->is_dx_zero = (dx == 0);
+      entry->abs_dx_abs_dy_sum = (uint16_t)(abs(dx) + abs(dy));
+      entry->hist_bin_idx = (dx == 0) ? -1 : get_hist_bin_idx(dx, dy);
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows,
+                                    int cols, float *hist, int highbd) {
+  // Selects the histogram builder matching the buffer's bit depth. The high
+  // bit-depth builder is only compiled in when CONFIG_AV1_HIGHBITDEPTH is set.
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (!highbd) {
+    lowbd_generate_hog(src8, stride, rows, cols, hist);
+  } else {
+    highbd_generate_hog(src8, stride, rows, cols, hist);
+  }
+#else
+  (void)highbd;  // Only the 8-bit path exists in this configuration.
+  lowbd_generate_hog(src8, stride, rows, cols, hist);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}
+
+static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x,
+                                                BLOCK_SIZE sb_size,
+                                                PLANE_TYPE plane) {
+  // Selects the superblock gradient-cache builder matching the bit depth of
+  // the current buffer.
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (!is_cur_buf_hbd(&x->e_mbd)) {
+    lowbd_compute_gradient_info_sb(x, sb_size, plane);
+  } else {
+    highbd_compute_gradient_info_sb(x, sb_size, plane);
+  }
+#else
+  lowbd_compute_gradient_info_sb(x, sb_size, plane);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Gradient caching at superblock level is allowed only if all of the following
+// conditions are satisfied:
+// (1) The current frame is an intra only frame
+// (2) Non-RD mode decisions are not enabled
+// (3) The sf partition_search_type is set to SEARCH_PARTITION
+// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled
+//
+// SB level caching of gradient data may not help in speedup for the following
+// cases:
+// (1) Inter frames (due to early intra gating)
+// (2) When partition_search_type is not SEARCH_PARTITION
+// Hence, gradient data is computed at block level in such cases.
+static AOM_INLINE bool is_gradient_caching_for_hog_enabled(
+    const AV1_COMP *const cpi) {
+  // Each guard rejects one violated requirement; the final expression checks
+  // that at least one of the two HOG pruning features is enabled.
+  if (!frame_is_intra_only(&cpi->common)) return false;
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode) return false;
+  if (cpi->sf.part_sf.partition_search_type != SEARCH_PARTITION) return false;
+  return cpi->sf.intra_sf.intra_pruning_with_hog ||
+         cpi->sf.intra_sf.chroma_intra_pruning_with_hog;
+}
+
+// Computes and caches per-pixel gradient data for the superblock at
+// (mi_row, mi_col). 'is_sb_gradient_cached' is cleared for both plane types
+// up front and set again only for the plane types whose data is generated.
+static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x,
+                                                BLOCK_SIZE sb_size, int mi_row,
+                                                int mi_col) {
+  // Start from "nothing cached" so stale flags never survive.
+  x->is_sb_gradient_cached[PLANE_TYPE_UV] = false;
+  x->is_sb_gradient_cached[PLANE_TYPE_Y] = false;
+  if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+  const int num_planes = av1_num_planes(&cpi->common);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+  const int prune_luma = cpi->sf.intra_sf.intra_pruning_with_hog;
+  const int prune_chroma = cpi->sf.intra_sf.chroma_intra_pruning_with_hog;
+
+  if (prune_luma) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y);
+    x->is_sb_gradient_cached[PLANE_TYPE_Y] = true;
+  }
+  // Chroma data is only produced when the frame has more than one plane.
+  if (prune_chroma && num_planes > 1) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV);
+    x->is_sb_gradient_cached[PLANE_TYPE_UV] = true;
+  }
+}
+
+// Builds the block-level histogram from the gradient data that was cached
+// for the whole superblock, instead of recomputing gradients from pixels.
+static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x,
+                                                         int rows, int cols,
+                                                         BLOCK_SIZE sb_size,
+                                                         PLANE_TYPE plane,
+                                                         float *hist) {
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+
+  // Offset of the block inside the superblock: the mi position is masked to
+  // the superblock and scaled by MI_SIZE_LOG2 minus the plane subsampling.
+  const int row_in_sb = (x->e_mbd.mi_row & (mi_size_high[sb_size] - 1))
+                        << (MI_SIZE_LOG2 - ss_y);
+  const int col_in_sb = (x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1))
+                        << (MI_SIZE_LOG2 - ss_x);
+  const PixelLevelGradientInfo *const blk_cache =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE +
+      (sb_width * row_in_sb + col_in_sb);
+
+  // Walk the interior of the block, accumulating the cached magnitudes.
+  float grad_total = 0.1f;
+  for (int r = 1; r < rows - 1; ++r) {
+    const PixelLevelGradientInfo *const cache_row = blk_cache + r * sb_width;
+    for (int c = 1; c < cols - 1; ++c) {
+      const PixelLevelGradientInfo *const entry = &cache_row[c];
+      const uint16_t mag = entry->abs_dx_abs_dy_sum;
+      if (mag == 0) continue;
+      grad_total += mag;
+      if (!entry->is_dx_zero) {
+        const int8_t bin = entry->hist_bin_idx;
+        assert(bin >= 0 && bin < BINS);
+        hist[bin] += mag;
+      } else {
+        // dx == 0: half of the magnitude goes to each end bin.
+        hist[0] += mag >> 1;
+        hist[BINS - 1] += mag >> 1;
+      }
+    }
+  }
+  normalize_hog(grad_total, hist);
+}
+
+static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    BLOCK_SIZE sb_size, int plane, float *hog) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = xd->plane[plane].subsampling_x;
+  const int ss_y = xd->plane[plane].subsampling_y;
+
+  // Block extent before subsampling; a negative mb_to_*_edge shrinks it by
+  // (edge >> 3).
+  int full_rows = block_size_high[bsize];
+  if (xd->mb_to_bottom_edge < 0) full_rows += xd->mb_to_bottom_edge >> 3;
+  int full_cols = block_size_wide[bsize];
+  if (xd->mb_to_right_edge < 0) full_cols += xd->mb_to_right_edge >> 3;
+  const int rows = full_rows >> ss_y;
+  const int cols = full_cols >> ss_x;
+
+  // Prefer the superblock-level gradient cache; fall back to computing the
+  // gradients from the source pixels.
+  if (!x->is_sb_gradient_cached[plane]) {
+    generate_hog(x->plane[plane].src.buf, x->plane[plane].src.stride, rows,
+                 cols, hog, is_cur_buf_hbd(xd));
+  } else {
+    generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog);
+  }
+
+  // Bring luma and chroma histograms onto the same scale.
+  const int scale = (1 + ss_x) * (1 + ss_y);
+  for (int b = 0; b < BINS; ++b) hog[b] *= scale;
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+    const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th,
+    uint8_t *directional_mode_skip_mask, int is_chroma) {
+  // Gather the gradient histogram of the requested plane.
+  float hist[BINS] = { 0.0f };
+  collect_hog_data(x, bsize, sb_size, is_chroma ? AOM_PLANE_U : AOM_PLANE_Y,
+                   hist);
+
+  // Score the modes with the HOG model, then flag every mode from UV_V_PRED
+  // through UV_D67_PRED whose score does not exceed 'th'.
+  float scores[DIRECTIONAL_MODES] = { 0.0f };
+  av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
+  const int num_scored = UV_D67_PRED - UV_V_PRED + 1;
+  for (int i = 0; i < num_scored; ++i) {
+    if (scores[i] <= th) directional_mode_skip_mask[UV_V_PRED + i] = 1;
+  }
+}
+#undef BINS
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd);
+
+// Tells whether source variance of 4x4 sub-blocks may be cached.
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled(
+    const AV1_COMP *const cpi) {
+  // Caching is limited to ALLINTRA mode.
+  if (cpi->oxcf.mode != ALLINTRA) return false;
+
+  // Full partition search always allows it.
+  if (cpi->sf.part_sf.partition_search_type == SEARCH_PARTITION) return true;
+
+  // Otherwise the variance threshold must be positive and the non-RD pick
+  // mode, if in use, must be the hybrid intra variant.
+  const bool nonrd_without_hybrid = cpi->sf.rt_sf.use_nonrd_pick_mode &&
+                                    !cpi->sf.rt_sf.hybrid_intra_pickmode;
+  return INTRA_RD_VAR_THRESH(cpi->oxcf.speed) > 0 && !nonrd_without_hybrid;
+}
+
+// Resets every Block4x4VarInfo entry of a superblock to -1 (both 'var' and
+// 'log_var'). Does nothing when 4x4 source variance caching is disabled.
+static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks(
+    const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks,
+    const BLOCK_SIZE sb_size) {
+  if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+  // One entry per mi unit of the superblock.
+  Block4x4VarInfo *const end = src_var_info_of_4x4_sub_blocks +
+                               mi_size_wide[sb_size] * mi_size_high[sb_size];
+  for (Block4x4VarInfo *info = src_var_info_of_4x4_sub_blocks; info < end;
+       ++info) {
+    info->var = -1;
+    info->log_var = -1.0;
+  }
+}
+
+// Cost of signalling value 'v' of a uniformly distributed random variable
+// with 'n' possible values: the first (1 << bits) - n values cost one
+// literal bit less than the rest.
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+  const int num_bits = get_unsigned_bits(n);
+  if (num_bits == 0) return 0;
+  const int num_short_codes = (1 << num_bits) - n;
+  return av1_cost_literal(v < num_short_codes ? num_bits - 1 : num_bits);
+}
+/*!\endcond */
+
+/*!\brief Computes the rate cost of the luma prediction mode info for an
+ * intra block.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+                                             const MACROBLOCK *x,
+                                             const MB_MODE_INFO *mbmi,
+                                             BLOCK_SIZE bsize, int mode_cost,
+                                             int discount_color_cost) {
+  const ModeCosts *const costs = &x->mode_costs;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int palette_on = pmi->palette_size[0] > 0;
+  const int filter_intra_on = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int intrabc_on = mbmi->use_intrabc;
+  // At most one of: non-DC mode, palette, intrabc, filter-intra.
+  assert(((mbmi->mode != DC_PRED) + palette_on + intrabc_on +
+          filter_intra_on) <= 1);
+
+  int rate = mode_cost;
+
+  // Palette signalling is only costed for DC_PRED blocks that may use it.
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    rate += costs->palette_y_mode_cost[bsize_ctx][mode_ctx][palette_on];
+    if (palette_on) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int n_colors = pmi->palette_size[0];
+      // Palette size, first color index, then the palette colors themselves.
+      int palette_rate =
+          costs->palette_y_size_cost[bsize_ctx][n_colors - PALETTE_MIN_SIZE];
+      palette_rate += write_uniform_cost(n_colors, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_rate += av1_palette_color_cost_y(
+          pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
+      // The color index map cost can be left out on request.
+      if (!discount_color_cost) {
+        palette_rate +=
+            av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      }
+      rate += palette_rate;
+    }
+  }
+
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    rate += costs->filter_intra_cost[mbmi->bsize][filter_intra_on];
+    if (filter_intra_on) {
+      rate += costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                .filter_intra_mode];
+    }
+  }
+
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    rate += costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                   [MAX_ANGLE_DELTA +
+                                    mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+
+  if (av1_allow_intrabc(&cpi->common)) rate += costs->intrabc_cost[intrabc_on];
+  return rate;
+}
+
+/*!\brief Computes the rate cost of the chroma prediction mode info for an
+ * intra block.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              const MB_MODE_INFO *mbmi,
+                                              BLOCK_SIZE bsize, int mode_cost) {
+  const ModeCosts *const costs = &x->mode_costs;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  const int palette_on = pmi->palette_size[1] > 0;
+  // At most one of: non-DC chroma mode, chroma palette, intrabc.
+  assert(((uv_mode != UV_DC_PRED) + palette_on + mbmi->use_intrabc) <= 1);
+
+  int rate = mode_cost;
+
+  // Palette signalling is only costed for UV_DC_PRED blocks that may use it.
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+  if (try_palette && uv_mode == UV_DC_PRED) {
+    // The chroma palette flag is coded in the context of the luma palette.
+    const int luma_palette_on = pmi->palette_size[0] > 0;
+    rate += costs->palette_uv_mode_cost[luma_palette_on][palette_on];
+    if (palette_on) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int n_colors = pmi->palette_size[1];
+      const MACROBLOCKD *const xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      // Palette size, first color index, palette colors, then the index map.
+      int palette_rate =
+          costs->palette_uv_size_cost[bsize_ctx][n_colors - PALETTE_MIN_SIZE];
+      palette_rate += write_uniform_cost(n_colors, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_rate += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
+      palette_rate +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      rate += palette_rate;
+    }
+  }
+
+  const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+  if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
+    rate += costs->angle_delta_cost[intra_mode - V_PRED]
+                                   [mbmi->angle_delta[PLANE_TYPE_UV] +
+                                    MAX_ANGLE_DELTA];
+  }
+  return rate;
+}
+
+/*!\cond */
+// Makes a quick intra prediction and estimate the rdcost with a model without
+// going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
+                              int plane, BLOCK_SIZE plane_bsize,
+                              TX_SIZE tx_size, int use_hadamard) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  assert(!is_inter_block(xd->mi[0]));
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int col_step = tx_size_wide_unit[tx_size];
+  const int row_step = tx_size_high_unit[tx_size];
+  const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int blocks_high = max_block_high(xd, plane_bsize, plane);
+  int64_t total_satd = 0;
+
+  // Predict, subtract and transform one transform block at a time, summing
+  // the SATD of the resulting coefficients.
+  for (int blk_row = 0; blk_row < blocks_high; blk_row += row_step) {
+    for (int blk_col = 0; blk_col < blocks_wide; blk_col += col_step) {
+      av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+      // The << 2 scales the block row/col position into a buffer offset.
+      const int src_off = ((blk_row * p->src.stride) + blk_col) << 2;
+      const int dst_off = ((blk_row * pd->dst.stride) + blk_col) << 2;
+      // p->src_diff and p->coeff serve as scratch space for the residue and
+      // the coefficients. They are consumed within this iteration, so they
+      // are always written from their start without a per-block offset.
+      av1_subtract_block(bd_info, tx_h, tx_w, p->src_diff, diff_stride,
+                         p->src.buf + src_off, p->src.stride,
+                         pd->dst.buf + dst_off, pd->dst.stride);
+      av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, diff_stride,
+                     p->coeff);
+      total_satd += aom_satd(p->coeff, tx_size_2d[tx_size]);
+    }
+  }
+  return total_satd;
+}
+/*!\endcond */
+
+/*!\brief Estimates the luma rdcost of the current intra mode and decides
+ * whether the mode can be pruned.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * A quick luma prediction is made and its rdcost is estimated with a model,
+ * without running the full transform path. The mode is pruned when the
+ * estimate exceeds 1.25 * best_model_rd; otherwise best_model_rd is lowered
+ * to the estimate if the estimate is smaller.
+ *
+ * \return Returns 1 if the given mode is pruned; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int64_t *best_model_rd) {
+  // The model runs on plane 0 with the largest transform size allowed for the
+  // block, capped at 32x32, using the Hadamard transform.
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int64_t this_model_rd = intra_model_rd(&cpi->common, x, /*plane=*/0,
+                                               bsize, tx_size,
+                                               /*use_hadamard=*/1);
+  const int64_t best_so_far = *best_model_rd;
+  // best + (best >> 2) is the 1.25x pruning threshold.
+  if (best_so_far != INT64_MAX &&
+      this_model_rd > best_so_far + (best_so_far >> 2)) {
+    return 1;
+  }
+  if (this_model_rd < best_so_far) *best_model_rd = this_model_rd;
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..4be2038a6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// The goal is to find the smallest L2 norm. With a single dimension that is
+// equivalent to finding the smallest L1 norm and squaring it afterwards,
+// which is preferable for speed, especially on the SIMD side. Hence the
+// 1-D variant returns |p1 - p2| while the others return the squared distance.
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) {
+#if AV1_K_MEANS_DIM != 1
+  int sq_dist = 0;
+  for (int d = 0; d < AV1_K_MEANS_DIM; ++d) {
+    const int delta = p1[d] - p2[d];
+    sq_dist += delta * delta;
+  }
+  return sq_dist;
+#else
+  return abs(p1[0] - p2[0]);
+#endif
+}
+
+// Assigns each of the 'n' data points to its nearest of the 'k' centroids and
+// writes the chosen centroid index to 'indices'. Ties go to the lowest
+// centroid index because a later centroid only wins when strictly closer.
+// When 'dist' is non-NULL it receives the sum, over all points, of the
+// squared distance to the assigned centroid.
+void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids,
+                              uint8_t *indices, int64_t *dist, int n, int k) {
+  int64_t total_dist = 0;
+  for (int i = 0; i < n; ++i) {
+    const int16_t *const point = data + i * AV1_K_MEANS_DIM;
+    int min_dist = RENAME(calc_dist)(point, centroids);
+    indices[i] = 0;
+    for (int j = 1; j < k; ++j) {
+      const int this_dist =
+          RENAME(calc_dist)(point, centroids + j * AV1_K_MEANS_DIM);
+      if (this_dist < min_dist) {
+        min_dist = this_dist;
+        indices[i] = j;
+      }
+    }
+#if AV1_K_MEANS_DIM == 1
+    // In one dimension calc_dist() returns |diff|, which still has to be
+    // squared. Widen first: the product of two ints can exceed INT_MAX for
+    // large int16_t differences, and signed overflow is undefined.
+    total_dist += (int64_t)min_dist * min_dist;
+#else
+    total_dist += min_dist;
+#endif
+  }
+  if (dist) {
+    *dist = total_dist;
+  }
+}
+
+// Recomputes the 'k' centroids as the rounded mean of the data points
+// assigned to them by 'indices'. A centroid without any assigned point is
+// replaced by a pseudo-randomly picked data point.
+static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids,
+                                   const uint8_t *indices, int n, int k) {
+  int members[PALETTE_MAX_SIZE] = { 0 };
+  int sums[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+  // The generator is seeded from the first data value, so the picks are
+  // reproducible for a given input.
+  unsigned int rand_state = (unsigned int)data[0];
+  assert(n <= 32768);
+  for (int s = 0; s < k * AV1_K_MEANS_DIM; ++s) sums[s] = 0;
+
+  // Per-cluster member count and per-dimension coordinate sums.
+  for (int i = 0; i < n; ++i) {
+    const int cluster = indices[i];
+    assert(cluster < k);
+    const int16_t *const point = data + i * AV1_K_MEANS_DIM;
+    int *const cluster_sum = sums + cluster * AV1_K_MEANS_DIM;
+    ++members[cluster];
+    for (int d = 0; d < AV1_K_MEANS_DIM; ++d) cluster_sum[d] += point[d];
+  }
+
+  for (int c = 0; c < k; ++c) {
+    int16_t *const centroid = centroids + c * AV1_K_MEANS_DIM;
+    if (members[c] != 0) {
+      for (int d = 0; d < AV1_K_MEANS_DIM; ++d) {
+        centroid[d] =
+            DIVIDE_AND_ROUND(sums[c * AV1_K_MEANS_DIM + d], members[c]);
+      }
+      continue;
+    }
+    // Empty cluster: reseed it from a pseudo-randomly chosen data point.
+    memcpy(centroid, data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+           sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+  }
+}
+
+// Runs up to 'max_itr' refinement iterations of k-means on 'n' data points
+// with 'k' centroids. 'centroids' holds the initial centroids on entry and
+// the selected centroids on return; 'indices' receives the per-point centroid
+// assignment. Two centroid/index buffer pairs are used alternately: pair 0 is
+// the caller's buffers, pair 1 is local scratch.
+void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int max_itr) {
+ int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+ int16_t *meta_centroids[2] = { centroids, centroids_tmp };
+ uint8_t *meta_indices[2] = { indices, indices_tmp };
+ // l: pair written in the current iteration; prev_l: pair written in the
+ // previous one; best_l: pair that is returned to the caller.
+ int i, l = 0, prev_l, best_l = 0;
+ int64_t this_dist;
+
+ assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
+
+ // Initial assignment against the caller-provided centroids.
+ // NOTE(review): the index computation is called by its fixed dim1/dim2 name
+ // rather than through RENAME(); presumably so that a dispatched
+ // implementation can be picked up - confirm.
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k);
+#else
+ av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k);
+#endif
+
+ for (i = 0; i < max_itr; ++i) {
+ const int64_t prev_dist = this_dist;
+ prev_l = l;
+ l = (l == 1) ? 0 : 1;
+
+ // New centroids from the previous assignment; stop once they no longer
+ // change.
+ // NOTE(review): best_l is still 0 on this break. If it triggers with
+ // l == 0, centroid buffer 0 has just been rewritten while index buffer 0
+ // still holds an older assignment than index buffer 1 - looks like the
+ // returned indices can be stale; confirm callers recompute them.
+ RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k);
+ if (!memcmp(meta_centroids[l], meta_centroids[prev_l],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) {
+ break;
+ }
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#else
+ av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#endif
+
+ // The total distance got worse: keep the previous pair and stop.
+ if (this_dist > prev_dist) {
+ best_l = prev_l;
+ break;
+ }
+ }
+ // All iterations ran without an early exit: the last pair is the result.
+ if (i == max_itr) best_l = l;
+ // Copy the result back when it lives in the scratch pair.
+ if (best_l != 0) {
+ memcpy(centroids, meta_centroids[1],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(indices, meta_indices[1], sizeof(indices[0]) * n);
+ }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/third_party/aom/av1/encoder/level.c b/third_party/aom/av1/encoder/level.c
new file mode 100644
index 0000000000..5d5fe9ce96
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/level.h"
+
+#define UNDEFINED_LEVEL \
+ { \
+ .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \
+ .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \
+ .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \
+ .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \
+ }
+
+static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
+ { .level = SEQ_LEVEL_2_0,
+ .max_picture_size = 147456,
+ .max_h_size = 2048,
+ .max_v_size = 1152,
+ .max_display_rate = 4423680L,
+ .max_decode_rate = 5529600L,
+ .max_header_rate = 150,
+ .main_mbps = 1.5,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ { .level = SEQ_LEVEL_2_1,
+ .max_picture_size = 278784,
+ .max_h_size = 2816,
+ .max_v_size = 1584,
+ .max_display_rate = 8363520L,
+ .max_decode_rate = 10454400L,
+ .max_header_rate = 150,
+ .main_mbps = 3.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_3_0,
+ .max_picture_size = 665856,
+ .max_h_size = 4352,
+ .max_v_size = 2448,
+ .max_display_rate = 19975680L,
+ .max_decode_rate = 24969600L,
+ .max_header_rate = 150,
+ .main_mbps = 6.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ { .level = SEQ_LEVEL_3_1,
+ .max_picture_size = 1065024,
+ .max_h_size = 5504,
+ .max_v_size = 3096,
+ .max_display_rate = 31950720L,
+ .max_decode_rate = 39938400L,
+ .max_header_rate = 150,
+ .main_mbps = 10.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_4_0,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 70778880L,
+ .max_decode_rate = 77856768L,
+ .max_header_rate = 300,
+ .main_mbps = 12.0,
+ .high_mbps = 30.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_4_1,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 141557760L,
+ .max_decode_rate = 155713536L,
+ .max_header_rate = 300,
+ .main_mbps = 20.0,
+ .high_mbps = 50.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_5_0,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 267386880L,
+ .max_decode_rate = 273715200L,
+ .max_header_rate = 300,
+ .main_mbps = 30.0,
+ .high_mbps = 100.0,
+ .main_cr = 6.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_1,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 534773760L,
+ .max_decode_rate = 547430400L,
+ .max_header_rate = 300,
+ .main_mbps = 40.0,
+ .high_mbps = 160.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_2,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1094860800L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_3,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_6_0,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_1,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 2139095040L,
+ .max_decode_rate = 2189721600L,
+ .max_header_rate = 300,
+ .main_mbps = 100.0,
+ .high_mbps = 480.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_2,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4379443200L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_3,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+#if CONFIG_CWG_C013
+ { .level = SEQ_LEVEL_7_0,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_1,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 8556380160L,
+ .max_decode_rate = 8758886400L,
+ .max_header_rate = 300,
+ .main_mbps = 200.0,
+ .high_mbps = 960.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_2,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 17517772800L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_3,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_8_0,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_1,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 34225520640L,
+ .max_decode_rate = 34910031052L,
+ .max_header_rate = 300,
+ .main_mbps = 400.0,
+ .high_mbps = 1920.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_2,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 69820062105L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_3,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 75296145408L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+#else // !CONFIG_CWG_C013
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+#endif // CONFIG_CWG_C013
+};
+
+// Identifiers for the individual reasons a target level check can fail.
+// The values before TARGET_LEVEL_FAIL_IDS are listed in the same order as the
+// strings in level_fail_messages[], which is sized by TARGET_LEVEL_FAIL_IDS;
+// keep the two lists in sync when adding an entry.
+typedef enum {
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_LARGE,
+ LUMA_PIC_V_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_SMALL,
+ LUMA_PIC_V_SIZE_TOO_SMALL,
+ TOO_MANY_TILE_COLUMNS,
+ TOO_MANY_TILES,
+ TILE_RATE_TOO_HIGH,
+ TILE_TOO_LARGE,
+ SUPERRES_TILE_WIDTH_TOO_LARGE,
+ CROPPED_TILE_WIDTH_TOO_SMALL,
+ CROPPED_TILE_HEIGHT_TOO_SMALL,
+ TILE_WIDTH_INVALID,
+ FRAME_HEADER_RATE_TOO_HIGH,
+ DISPLAY_RATE_TOO_HIGH,
+ DECODE_RATE_TOO_HIGH,
+ CR_TOO_SMALL,
+ TILE_SIZE_HEADER_RATE_TOO_HIGH,
+ BITRATE_TOO_HIGH,
+ DECODER_MODEL_FAIL,
+
+ // Number of failure identifiers above; not a failure reason itself.
+ TARGET_LEVEL_FAIL_IDS,
+ // Listed after the count, so it has no entry in level_fail_messages[].
+ TARGET_LEVEL_OK,
+} TARGET_LEVEL_FAIL_ID;
+
+// Human-readable description for each failure identifier. The strings appear
+// in the same order as the TARGET_LEVEL_FAIL_ID values that precede
+// TARGET_LEVEL_FAIL_IDS (presumably the array is indexed by that enum -
+// confirm at the use sites).
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The picture size is too large.",
+ "The picture width is too large.",
+ "The picture height is too large.",
+ "The picture width is too small.",
+ "The picture height is too small.",
+ "Too many tile columns are used.",
+ "Too many tiles are used.",
+ "The tile rate is too high.",
+ "The tile size is too large.",
+ "The superres tile width is too large.",
+ "The cropped tile width is less than 8.",
+ "The cropped tile height is less than 8.",
+ "The tile width is invalid.",
+ "The frame header rate is too high.",
+ "The display luma sample rate is too high.",
+ "The decoded luma sample rate is too high.",
+ "The compression ratio is too small.",
+ "The product of max tile size and header rate is too high.",
+ "The bitrate is too high.",
+ "The decoder model fails.",
+};
+
+// Returns the maximum bitrate, in bits per second, that `level_spec` allows
+// for the given tier and profile. The high tier only applies from level 4.0
+// upwards; lower levels always use the main-tier figure.
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+                              BITSTREAM_PROFILE profile) {
+  const int use_high_tier = (level_spec->level >= SEQ_LEVEL_4_0) && tier;
+  const double mbps =
+      use_high_tier ? level_spec->high_mbps : level_spec->main_mbps;
+
+  double profile_scale;
+  if (profile == PROFILE_0) {
+    profile_scale = 1.0;
+  } else if (profile == PROFILE_1) {
+    profile_scale = 2.0;
+  } else {
+    profile_scale = 3.0;
+  }
+
+  return (mbps * 1e6) * profile_scale;
+}
+
+// Public wrapper: maximum bitrate for the level at `level_index` in the
+// built-in level table. `level_index` must be a valid sequence level index.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const spec = av1_level_defs + level_index;
+  return get_max_bitrate(spec, tier, profile);
+}
+
+// Writes the tile-count and tile-column limits of the level at `level_index`
+// to `*max_tiles` and `*max_tile_cols`.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols) {
+  assert(is_valid_seq_level_idx(level_index));
+  *max_tiles = av1_level_defs[level_index].max_tiles;
+  *max_tile_cols = av1_level_defs[level_index].max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// Returns pool slot `idx` to the unused state: no references, no display
+// index and no presentation time. This is "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+  assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+  FRAME_BUFFER *const slot = decoder_model->frame_buffer_pool + idx;
+  slot->presentation_time = INVALID_TIME;
+  slot->display_index = -1;
+  slot->player_ref_count = 0;
+  slot->decoder_ref_count = 0;
+}
+
+// Marks every frame buffer as free and every virtual buffer index (vbi) as
+// unassigned (-1).
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+  int n;
+  for (n = 0; n < BUFFER_POOL_MAX_SIZE; n++) release_buffer(decoder_model, n);
+  for (n = 0; n < REF_FRAMES; n++) decoder_model->vbi[n] = -1;
+}
+
+// Returns the lowest pool index referenced by neither the decoder nor the
+// player, or -1 when every buffer is in use.
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+  const FRAME_BUFFER *const pool = decoder_model->frame_buffer_pool;
+  int found = -1;
+  for (int slot = 0; slot < BUFFER_POOL_MAX_SIZE && found < 0; ++slot) {
+    const int in_use =
+        pool[slot].decoder_ref_count != 0 || pool[slot].player_ref_count != 0;
+    if (!in_use) found = slot;
+  }
+  return found;
+}
+
+// Points every reference slot selected by `refresh_frame_flags` (bit i selects
+// slot i) at pool buffer `idx`. The buffer previously held by a slot loses
+// one decoder reference and buffer `idx` gains one per refreshed slot.
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+                               int refresh_frame_flags) {
+  FRAME_BUFFER *const pool = decoder_model->frame_buffer_pool;
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    if (!(refresh_frame_flags & (1 << slot))) continue;
+
+    const int old_idx = decoder_model->vbi[slot];
+    if (old_idx != -1) pool[old_idx].decoder_ref_count -= 1;
+    decoder_model->vbi[slot] = idx;
+    pool[idx].decoder_ref_count += 1;
+  }
+}
+
+// The time (in seconds) required to decode a frame.
+//
+// A shown-existing frame needs no decoding, so it costs 0. Key and intra-only
+// frames are charged for their own (superres-upscaled) luma area; all other
+// frames are charged for the maximum frame area in the sequence header.
+// `max_decode_rate` is the decode rate in luma samples per second.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+                                   int64_t max_decode_rate) {
+  if (cm->show_existing_frame) return 0.0;
+
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  // Computed in 64 bits: width * height can exceed INT_MAX for very large
+  // frame dimensions, and signed overflow would corrupt the decode time.
+  int64_t luma_samples = 0;
+  if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+    luma_samples = (int64_t)cm->superres_upscaled_width * cm->height;
+  } else {
+    const int spatial_layer_dimensions_present_flag = 0;
+    if (spatial_layer_dimensions_present_flag) {
+      assert(0 && "Spatial layer dimensions not supported yet.");
+    } else {
+      const SequenceHeader *const seq_params = cm->seq_params;
+      const int max_frame_width = seq_params->max_frame_width;
+      const int max_frame_height = seq_params->max_frame_height;
+      luma_samples = (int64_t)max_frame_width * max_frame_height;
+    }
+  }
+
+  return luma_samples / (double)max_decode_rate;
+}
+
+// Drops the player's hold on every buffer whose presentation time is valid
+// and has been reached by `removal_time`, then frees the buffer if the decoder
+// does not reference it either.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+                                     double removal_time) {
+  FRAME_BUFFER *const pool = decoder_model->frame_buffer_pool;
+  for (int slot = 0; slot < BUFFER_POOL_MAX_SIZE; ++slot) {
+    FRAME_BUFFER *const buf = &pool[slot];
+    if (buf->player_ref_count <= 0) continue;
+
+    const int already_presented = buf->presentation_time >= 0.0 &&
+                                  buf->presentation_time <= removal_time;
+    if (!already_presented) continue;
+
+    buf->player_ref_count = 0;
+    if (buf->decoder_ref_count == 0) release_buffer(decoder_model, slot);
+  }
+}
+
+// Counts the pool buffers still referenced by the decoder or the player.
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+  const FRAME_BUFFER *const pool = decoder_model->frame_buffer_pool;
+  int occupied = 0;
+  for (int slot = 0; slot < BUFFER_POOL_MAX_SIZE; ++slot) {
+    const int held =
+        pool[slot].decoder_ref_count > 0 || pool[slot].player_ref_count > 0;
+    occupied += held ? 1 : 0;
+  }
+  return occupied;
+}
+
+// Returns the presentation time of the frame with the given display index,
+// or INVALID_TIME when it cannot be determined yet (the initial presentation
+// delay is still unknown) or the model runs in the unsupported SCHEDULE_MODE.
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+                                    int display_index) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+    return INVALID_TIME;
+  }
+
+  const double start_delay = decoder_model->initial_presentation_delay;
+  // A negative delay means "not known yet".
+  if (start_delay < 0.0) return INVALID_TIME;
+
+  return start_delay + display_index * decoder_model->num_ticks_per_picture *
+                           decoder_model->display_clock_tick;
+}
+
+#define MAX_TIME 1e16
+// Returns the earliest time at which a frame buffer is available for the next
+// decode. For the very first decoded frame this is the decoder buffer delay
+// (converted from 90 kHz ticks to seconds). Otherwise it is `current_time` if
+// some buffer is already unreferenced, else the smallest valid presentation
+// time among buffers held only by the player, or INVALID_TIME if there is no
+// such buffer.
+static double time_next_buffer_is_free(int num_decoded_frame,
+                                       int decoder_buffer_delay,
+                                       const FRAME_BUFFER *frame_buffer_pool,
+                                       double current_time) {
+  if (num_decoded_frame == 0) {
+    return (double)decoder_buffer_delay / 90000.0;
+  }
+
+  double earliest = MAX_TIME;
+  for (int slot = 0; slot < BUFFER_POOL_MAX_SIZE; ++slot) {
+    const FRAME_BUFFER *const buf = frame_buffer_pool + slot;
+    // Buffers still needed as references can never be reclaimed here.
+    if (buf->decoder_ref_count != 0) continue;
+    if (buf->player_ref_count == 0) return current_time;
+
+    const double shown_at = buf->presentation_time;
+    if (shown_at >= 0.0 && shown_at < earliest) earliest = shown_at;
+  }
+
+  if (earliest < MAX_TIME) return earliest;
+  return INVALID_TIME;
+}
+#undef MAX_TIME
+
+// Returns the time at which the next frame is removed from the smoothing
+// buffer for decoding. Outside SCHEDULE_MODE this is the time the next frame
+// buffer becomes free; SCHEDULE_MODE is unsupported and yields INVALID_TIME.
+static double get_removal_time(int mode, int num_decoded_frame,
+                               int decoder_buffer_delay,
+                               const FRAME_BUFFER *frame_buffer_pool,
+                               double current_time) {
+  if (mode != SCHEDULE_MODE) {
+    return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay,
+                                    frame_buffer_pool, current_time);
+  }
+
+  assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+  return INVALID_TIME;
+}
+
+// Debug helper: prints the model's counters and timing state to stdout,
+// followed by one line per frame buffer in the pool.
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+  printf(
+      "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+      "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+      "presentation delay %6.2f, total interval %6.2f\n",
+      decoder_model->status, decoder_model->num_frame,
+      decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+      decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+      decoder_model->initial_presentation_delay,
+      decoder_model->dfg_interval_queue.total_interval);
+  // Walk the whole pool using its declared size rather than a literal count,
+  // so this stays in bounds and complete if the pool size ever changes.
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+           i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+           this_buffer->presentation_time);
+  }
+}
+
+// Resets `decoder_model` so it can check conformance to `level`.
+//
+// cpi:           encoder instance; its sequence header supplies tier, profile,
+//                timing info and initial display delay.
+// level:         level whose bitrate and decode-rate limits the model uses.
+// op_index:      the operating point index.
+// decoder_model: model state to initialize (status becomes DECODER_MODEL_OK).
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model) {
+  decoder_model->status = DECODER_MODEL_OK;
+  decoder_model->level = level;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  decoder_model->bit_rate = get_max_bitrate(
+      av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+  // TODO(huisu or anyone): implement SCHEDULE_MODE.
+  decoder_model->mode = RESOURCE_MODE;
+  // Buffer delays are divided by 90000.0 wherever they are used, i.e. they
+  // are expressed in 90 kHz ticks.
+  decoder_model->encoder_buffer_delay = 20000;
+  decoder_model->decoder_buffer_delay = 70000;
+  decoder_model->is_low_delay_mode = false;
+
+  decoder_model->first_bit_arrival_time = 0.0;
+  decoder_model->last_bit_arrival_time = 0.0;
+  decoder_model->coded_bits = 0;
+
+  decoder_model->removal_time = INVALID_TIME;
+  decoder_model->presentation_time = INVALID_TIME;
+  decoder_model->decode_samples = 0;
+  decoder_model->display_samples = 0;
+  decoder_model->max_decode_rate = 0.0;
+  decoder_model->max_display_rate = 0.0;
+
+  // Counters start at -1 so that the first processed frame has index 0.
+  decoder_model->num_frame = -1;
+  decoder_model->num_decoded_frame = -1;
+  decoder_model->num_shown_frame = -1;
+  decoder_model->current_time = 0.0;
+
+  initialize_buffer_pool(decoder_model);
+
+  DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+      &decoder_model->dfg_interval_queue;
+  dfg_interval_queue->total_interval = 0.0;
+  dfg_interval_queue->head = 0;
+  dfg_interval_queue->size = 0;
+
+  if (seq_params->timing_info_present) {
+    decoder_model->num_ticks_per_picture =
+        seq_params->timing_info.num_ticks_per_picture;
+    // Divide in floating point. With integral operands the quotient would be
+    // truncated (usually to 0), making every presentation time collapse onto
+    // the initial presentation delay.
+    // NOTE(review): the timing_info field types are not visible here; the
+    // cast is a no-op if they are already floating point.
+    decoder_model->display_clock_tick =
+        (double)seq_params->timing_info.num_units_in_display_tick /
+        seq_params->timing_info.time_scale;
+  } else {
+    // No timing info: one tick per picture at the configured frame rate.
+    decoder_model->num_ticks_per_picture = 1;
+    decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+  }
+
+  decoder_model->initial_display_delay =
+      seq_params->op_params[op_index].initial_display_delay;
+  // Unknown until enough frames have been buffered.
+  decoder_model->initial_presentation_delay = INVALID_TIME;
+  decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+// Dry-run of the smoothing-buffer checks for a frame of `coded_bits` bits.
+//
+// Works on local copies only, so `decoder_model` is never modified. Returns
+// DECODER_MODEL_OK when the frame would pass, when the model is NULL or
+// already in a non-OK state, or when the frame is a show-existing frame;
+// otherwise returns DECODE_FRAME_BUF_UNAVAILABLE, SMOOTHING_BUFFER_UNDERFLOW
+// or SMOOTHING_BUFFER_OVERFLOW.
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+    const AV1_COMP *const cpi, size_t coded_bits,
+    const DECODER_MODEL *const decoder_model) {
+  DECODER_MODEL_STATUS status = DECODER_MODEL_OK;
+
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) {
+    return status;
+  }
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  size_t cur_coded_bits = decoder_model->coded_bits + coded_bits;
+  int num_decoded_frame = decoder_model->num_decoded_frame;
+  if (!show_existing_frame) ++num_decoded_frame;
+
+  if (show_existing_frame) {
+    return status;
+  } else {
+    const double removal_time = get_removal_time(
+        decoder_model->mode, num_decoded_frame,
+        decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+        decoder_model->current_time);
+    if (removal_time < 0.0) {
+      status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return status;
+    }
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    const double first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    const double last_bit_arrival_time =
+        first_bit_arrival_time +
+        (double)cur_coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      status = SMOOTHING_BUFFER_UNDERFLOW;
+      return status;
+    }
+
+    // Check if the smoothing buffer overflows.
+    const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+
+    double total_interval = queue->total_interval;
+    int qhead = queue->head;
+    int qsize = queue->size;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    // Test for emptiness first so a stale or never-written queue entry is
+    // not read once the queue has been drained.
+    while (qsize > 0 &&
+           queue->buf[qhead].removal_time <= last_bit_arrival_time) {
+      if (queue->buf[qhead].removal_time - first_bit_arrival_time +
+              total_interval >
+          1.0) {
+        status = SMOOTHING_BUFFER_OVERFLOW;
+        return status;
+      }
+      total_interval -= queue->buf[qhead].last_bit_arrival_time -
+                        queue->buf[qhead].first_bit_arrival_time;
+      qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --qsize;
+    }
+    total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (total_interval > 1.0) {
+      status = SMOOTHING_BUFFER_OVERFLOW;
+      return status;
+    }
+
+    return status;
+  }
+}
+
+// Advances the decoder model by one coded frame of `coded_bits` bits.
+//
+// Does nothing when the model is NULL or already in a non-OK state. For a
+// newly decoded frame it computes the removal time, runs the smoothing-buffer
+// underflow/overflow checks, claims a frame buffer and updates the reference
+// slots; for a show-existing frame it looks up the buffer to display. Shown
+// frames are then scheduled for presentation. Any violation is recorded in
+// decoder_model->status and processing of the frame stops there.
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model) {
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+  const int show_existing_frame = cm->show_existing_frame;
+  const int show_frame = cm->show_frame || show_existing_frame;
+  ++decoder_model->num_frame;
+  if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+  if (show_frame) ++decoder_model->num_shown_frame;
+  decoder_model->coded_bits += coded_bits;
+
+  int display_idx = -1;
+  if (show_existing_frame) {
+    display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+    if (display_idx < 0) {
+      decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+      return;
+    }
+    // Showing an existing key frame refreshes every reference slot.
+    if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+      update_ref_buffers(decoder_model, display_idx, 0xFF);
+    }
+  } else {
+    const double removal_time = get_removal_time(
+        decoder_model->mode, decoder_model->num_decoded_frame,
+        decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+        decoder_model->current_time);
+    if (removal_time < 0.0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+
+    // Track the peak decode rate: samples of the previous frame divided by
+    // the time between the two removals.
+    const int previous_decode_samples = decoder_model->decode_samples;
+    const double previous_removal_time = decoder_model->removal_time;
+    assert(previous_removal_time < removal_time);
+    decoder_model->removal_time = removal_time;
+    decoder_model->decode_samples = luma_pic_size;
+    const double this_decode_rate =
+        previous_decode_samples / (removal_time - previous_removal_time);
+    decoder_model->max_decode_rate =
+        AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    decoder_model->first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    decoder_model->last_bit_arrival_time =
+        decoder_model->first_bit_arrival_time +
+        (double)decoder_model->coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (decoder_model->last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+      return;
+    }
+    // Reset the coded bits for the next DFG.
+    decoder_model->coded_bits = 0;
+
+    // Check if the smoothing buffer overflows.
+    DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+    const double first_bit_arrival_time = decoder_model->first_bit_arrival_time;
+    const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    // Test for emptiness first so a stale or never-written queue entry is
+    // not read once the queue has been drained.
+    while (queue->size > 0 &&
+           queue->buf[queue->head].removal_time <= last_bit_arrival_time) {
+      if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+              queue->total_interval >
+          1.0) {
+        decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+        return;
+      }
+      queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+                               queue->buf[queue->head].first_bit_arrival_time;
+      queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --queue->size;
+    }
+    // Push current DFG into the queue.
+    const int queue_index =
+        (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+    queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+    queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+    queue->buf[queue_index].removal_time = removal_time;
+    queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (queue->total_interval > 1.0) {
+      decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+      return;
+    }
+
+    release_processed_frames(decoder_model, removal_time);
+    decoder_model->current_time =
+        removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+    const int cfbi = get_free_buffer(decoder_model);
+    if (cfbi < 0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        cm->current_frame.frame_type;
+    display_idx = cfbi;
+    update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+    if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after required number of frames have been buffered.
+      if (frames_in_buffer_pool(decoder_model) >=
+          decoder_model->initial_display_delay - 1) {
+        decoder_model->initial_presentation_delay = decoder_model->current_time;
+        // Update presentation time for each shown frame in the frame buffer.
+        for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+          FRAME_BUFFER *const this_buffer =
+              &decoder_model->frame_buffer_pool[i];
+          if (this_buffer->player_ref_count == 0) continue;
+          assert(this_buffer->display_index >= 0);
+          this_buffer->presentation_time =
+              get_presentation_time(decoder_model, this_buffer->display_index);
+        }
+      }
+    }
+  }
+
+  // Display.
+  if (show_frame) {
+    assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+    FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[display_idx];
+    ++this_buffer->player_ref_count;
+    this_buffer->display_index = decoder_model->num_shown_frame;
+    const double presentation_time =
+        get_presentation_time(decoder_model, this_buffer->display_index);
+    this_buffer->presentation_time = presentation_time;
+    // The frame is late if decoding finishes after it is due on screen.
+    if (presentation_time >= 0.0 &&
+        decoder_model->current_time > presentation_time) {
+      decoder_model->status = DISPLAY_FRAME_LATE;
+      return;
+    }
+
+    // Track the peak display rate, using only valid presentation times.
+    const int previous_display_samples = decoder_model->display_samples;
+    const double previous_presentation_time = decoder_model->presentation_time;
+    decoder_model->display_samples = luma_pic_size;
+    decoder_model->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+      assert(previous_presentation_time < presentation_time);
+      const double this_display_rate =
+          previous_display_samples /
+          (presentation_time - previous_presentation_time);
+      decoder_model->max_display_rate =
+          AOMMAX(decoder_model->max_display_rate, this_display_rate);
+    }
+  }
+}
+
+// Resets the level bookkeeping of every allocated operating point: clears the
+// stats, seeds the running minima, empties the frame window, and sets up one
+// decoder model per level. A model is disabled outright when the current
+// frame dimensions already exceed that level's picture-size limits.
+void av1_init_level_info(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int frame_w = cm->superres_upscaled_width;
+  const int frame_h = cm->height;
+  const int frame_area = frame_w * frame_h;
+
+  for (int op = 0; op < MAX_NUM_OPERATING_POINTS; ++op) {
+    AV1LevelInfo *const info = cpi->ppi->level_params.level_info[op];
+    if (info == NULL) continue;
+
+    memset(info, 0, sizeof(*info));
+    info->level_spec.level = SEQ_LEVEL_MAX;
+
+    AV1LevelStats *const stats = &info->level_stats;
+    stats->min_cropped_tile_width = INT_MAX;
+    stats->min_cropped_tile_height = INT_MAX;
+    stats->min_frame_width = INT_MAX;
+    stats->min_frame_height = INT_MAX;
+    stats->tile_width_is_valid = 1;
+    stats->min_cr = 1e8;
+
+    info->frame_window_buffer.num = 0;
+    info->frame_window_buffer.start = 0;
+
+    for (AV1_LEVEL lvl = SEQ_LEVEL_2_0; lvl < SEQ_LEVELS; ++lvl) {
+      const AV1LevelSpec *const limits = &av1_level_defs[lvl];
+      DECODER_MODEL *const model = &info->decoder_models[lvl];
+      const int fits = frame_w <= limits->max_h_size &&
+                       frame_h <= limits->max_v_size &&
+                       frame_area <= limits->max_picture_size;
+      if (fits) {
+        av1_decoder_model_init(cpi, lvl, op, model);
+      } else {
+        // Turn off decoder model for this level as the frame size already
+        // exceeds level constraints.
+        model->status = DECODER_MODEL_DISABLED;
+      }
+    }
+  }
+}
+
+// Returns the minimum compression ratio required by `level_spec`. Still
+// pictures always use 0.8. Otherwise the tier's basis ratio is scaled by
+// decoded_sample_rate / max_display_rate and floored at 0.8. The high tier is
+// only honoured from level 4.0 upwards.
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+                         int is_still_picture, int64_t decoded_sample_rate) {
+  if (is_still_picture) return 0.8;
+
+  const int use_high_tier = (level_spec->level >= SEQ_LEVEL_4_0) && tier;
+  const double basis =
+      use_high_tier ? level_spec->high_cr : level_spec->main_cr;
+  const double rate_ratio =
+      (double)decoded_sample_rate / level_spec->max_display_rate;
+  const double scaled = basis * rate_ratio;
+  return AOMMAX(scaled, 0.8);
+}
+
+// Public wrapper: minimum compression ratio for the level at `level_index`,
+// evaluated at that level's own maximum decode rate.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const spec = av1_level_defs + level_index;
+  const int64_t sample_rate = spec->max_decode_rate;
+  return get_min_cr(spec, tier, is_still_picture, sample_rate);
+}
+
+// Writes the temporal-parallelism ratio (numerator / denominator) for the
+// given scalability mode.
+//
+// Only the non-scalable case (scalability_mode_idc < 0) is implemented and it
+// yields 1/1. Both outputs are always written, so a caller that later passes
+// a real scalability mode never divides by an uninitialized value.
+static void get_temporal_parallel_params(int scalability_mode_idc,
+                                         int *temporal_parallel_num,
+                                         int *temporal_parallel_denom) {
+  *temporal_parallel_num = 1;
+  *temporal_parallel_denom = 1;
+  if (scalability_mode_idc < 0) return;
+
+  // TODO(huisu@): handle scalability cases. Until then they fall back to the
+  // 1/1 default set above.
+  if (scalability_mode_idc == SCALABILITY_SS) {
+    (void)scalability_mode_idc;
+  } else {
+    (void)scalability_mode_idc;
+  }
+}
+
+// Fixed limits used by check_level_constraints(), independent of the target
+// level: smallest allowed cropped tile and frame dimensions.
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+// Upper bound on (max tile size * max header rate), checked only for target
+// levels above 5.1.
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+// Checks the statistics accumulated in `level_info` against the limits of
+// the target `level`.
+//
+// level_info:       measured spec values, stats and per-level decoder models.
+// level:            target level to test against (index into av1_level_defs).
+// tier:             tier passed through to the min-CR and bitrate limits.
+// is_still_picture: nonzero selects the still-picture compression ratio.
+// profile:          bitstream profile, scales the bitrate limit.
+// check_bitrate:    nonzero also checks the average bitrate.
+//
+// Returns TARGET_LEVEL_OK when every constraint holds, otherwise the id of
+// the first violated constraint in the order tested below.
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+    const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+    int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+  // A decoder model that ran and failed is an immediate failure; a disabled
+  // model is ignored here.
+  const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level];
+  const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+  if (decoder_model_status != DECODER_MODEL_OK &&
+      decoder_model_status != DECODER_MODEL_DISABLED) {
+    return DECODER_MODEL_FAIL;
+  }
+
+  const AV1LevelSpec *const level_spec = &level_info->level_spec;
+  const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+  const AV1LevelStats *const level_stats = &level_info->level_stats;
+  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+  // Single-pass do/while so that each failed check can `break` to the end.
+  do {
+    if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+      fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_h_size > target_level_spec->max_h_size) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_v_size > target_level_spec->max_v_size) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+      fail_id = TOO_MANY_TILE_COLUMNS;
+      break;
+    }
+
+    if (level_spec->max_tiles > target_level_spec->max_tiles) {
+      fail_id = TOO_MANY_TILES;
+      break;
+    }
+
+    if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+      fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (decoder_model->max_display_rate >
+        (double)target_level_spec->max_display_rate) {
+      fail_id = DISPLAY_RATE_TOO_HIGH;
+      break;
+    }
+
+    // TODO(huisu): we are not using max decode rate calculated by the decoder
+    // model because the model in resource availability mode always returns
+    // MaxDecodeRate(as in the level definitions) as the max decode rate.
+    if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+      fail_id = DECODE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+      fail_id = TILE_RATE_TOO_HIGH;
+      break;
+    }
+
+#if CONFIG_CWG_C013
+    const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3)
+                                  ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE
+                                  : MAX_TILE_AREA;
+#else
+    const int max_tile_size = MAX_TILE_AREA;
+#endif
+    if (level_stats->max_tile_size > max_tile_size) {
+      fail_id = TILE_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+      fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
+      fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
+      fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (!level_stats->tile_width_is_valid) {
+      fail_id = TILE_WIDTH_INVALID;
+      break;
+    }
+
+    const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+                                     level_spec->max_decode_rate);
+    if (level_stats->min_cr < min_cr) {
+      fail_id = CR_TOO_SMALL;
+      break;
+    }
+
+    if (check_bitrate) {
+      // Check average bitrate instead of max_bitrate.
+      const double bitrate_limit =
+          get_max_bitrate(target_level_spec, tier, profile);
+      const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+                                 level_stats->total_time_encoded;
+      if (avg_bitrate > bitrate_limit) {
+        fail_id = BITRATE_TOO_HIGH;
+        break;
+      }
+    }
+
+    if (target_level_spec->level > SEQ_LEVEL_5_1) {
+      int temporal_parallel_num;
+      int temporal_parallel_denom;
+      const int scalability_mode_idc = -1;
+      get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num,
+                                   &temporal_parallel_denom);
+      // Multiply in 64 bits: tile size times header rate can exceed INT_MAX,
+      // and a wrapped (negative) product would wrongly pass the check.
+      const int64_t val = (int64_t)level_stats->max_tile_size *
+                          level_spec->max_header_rate *
+                          temporal_parallel_denom / temporal_parallel_num;
+      if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+        fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+        break;
+      }
+    }
+  } while (0);
+
+  return fail_id;
+}
+
+// Walks every tile of the current frame and reports:
+//   *max_tile_size            largest tile area (width * height),
+//   *max_superres_tile_width  largest tile width after superres scaling,
+//   *min_cropped_tile_width   smallest (frame width - tile start column),
+//   *min_cropped_tile_height  smallest (frame height - tile start row),
+//   *tile_width_valid         1 unless some tile other than the right-most
+//                             is narrower than 64 (128 with superres).
+static void get_tile_stats(const AV1_COMMON *const cm,
+                           const TileDataEnc *const tile_data,
+                           int *max_tile_size, int *max_superres_tile_width,
+                           int *min_cropped_tile_width,
+                           int *min_cropped_tile_height,
+                           int *tile_width_valid) {
+  const int num_cols = cm->tiles.cols;
+  const int num_rows = cm->tiles.rows;
+  const int sr_denom = cm->superres_scale_denominator;
+
+  *max_tile_size = 0;
+  *max_superres_tile_width = 0;
+  *min_cropped_tile_width = INT_MAX;
+  *min_cropped_tile_height = INT_MAX;
+  *tile_width_valid = 1;
+
+  for (int r = 0; r < num_rows; ++r) {
+    for (int c = 0; c < num_cols; ++c) {
+      const TileInfo *const ti = &tile_data[r * num_cols + c].tile_info;
+
+      // Tile dimensions, converted from mode-info units via MI_SIZE.
+      const int w = (ti->mi_col_end - ti->mi_col_start) * MI_SIZE;
+      const int h = (ti->mi_row_end - ti->mi_row_start) * MI_SIZE;
+      const int area = w * h;
+      if (area > *max_tile_size) *max_tile_size = area;
+
+      const int upscaled_w = w * sr_denom / SCALE_NUMERATOR;
+      if (upscaled_w > *max_superres_tile_width) {
+        *max_superres_tile_width = upscaled_w;
+      }
+
+      const int cropped_w = cm->width - ti->mi_col_start * MI_SIZE;
+      const int cropped_h = cm->height - ti->mi_row_start * MI_SIZE;
+      if (cropped_w < *min_cropped_tile_width) {
+        *min_cropped_tile_width = cropped_w;
+      }
+      if (cropped_h < *min_cropped_tile_height) {
+        *min_cropped_tile_height = cropped_h;
+      }
+
+      // The right-most tile is exempt from the minimum-width rule.
+      if (ti->mi_col_end != cm->mi_params.mi_cols) {
+        const int min_width = av1_superres_scaled(cm) ? 128 : 64;
+        *tile_width_valid &= w >= min_width;
+      }
+    }
+  }
+}
+
+// Appends a frame record to the circular window `buffer`. While the window is
+// not full it grows by one; once full, the start index advances so the oldest
+// record is overwritten. Returns the index the record was written to.
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+                              size_t encoded_size, int pic_size,
+                              int frame_header_count, int tiles, int show_frame,
+                              int show_existing_frame,
+                              FrameWindowBuffer *const buffer) {
+  if (buffer->num >= FRAME_WINDOW_SIZE) {
+    buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+  } else {
+    buffer->num += 1;
+  }
+
+  const int slot = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  FrameRecord *const rec = buffer->buf + slot;
+  rec->ts_start = ts_start;
+  rec->ts_end = ts_end;
+  rec->encoded_size_in_bytes = encoded_size;
+  rec->pic_size = pic_size;
+  rec->frame_header_count = frame_header_count;
+  rec->tiles = tiles;
+  rec->show_frame = show_frame;
+  rec->show_existing_frame = show_existing_frame;
+  return slot;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+//
+// Starts from the newest record (which must be a shown frame) and walks
+// backwards through the circular window. The walk stops at the first shown
+// frame whose ts_start is before (newest ts_end - duration), clamped to 0;
+// that frame is not counted. Returns the number of records counted, including
+// the newest one.
+static int count_frames(const FrameWindowBuffer *const buffer,
+ int64_t duration) {
+ const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ // Assume current frame is shown frame.
+ assert(buffer->buf[current_idx].show_frame);
+
+ const int64_t current_time = buffer->buf[current_idx].ts_end;
+ const int64_t time_limit = AOMMAX(current_time - duration, 0);
+ // The newest record is always counted.
+ int num_frames = 1;
+ int index = current_idx - 1;
+ // i only bounds the number of older records visited; index is the actual
+ // (wrapping) position in the circular buffer.
+ for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ const FrameRecord *const record = &buffer->buf[index];
+ // Non-shown frames skip the time check but are still counted, because
+ // `continue` runs the ++num_frames in the loop header.
+ if (!record->show_frame) continue;
+ const int64_t ts_start = record->ts_start;
+ // `break` bypasses the loop header, so this frame is not counted.
+ if (ts_start < time_limit) break;
+ }
+
+ return num_frames;
+}
+
+// Sums header counts, decoded/displayed samples, tiles and bytes over the
+// most recent frames in `buffer` (at most `num_frames_to_scan`, newest first)
+// and raises the running maxima in `level_spec` / `level_stats` accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+                             int num_frames_to_scan,
+                             AV1LevelSpec *const level_spec,
+                             AV1LevelStats *const level_stats) {
+  const int frames_available = buffer->num;
+  const int frames_to_visit = AOMMIN(frames_available, num_frames_to_scan);
+
+  int header_total = 0;
+  int tile_total = 0;
+  int64_t shown_samples = 0;
+  int64_t decoded_samples = 0;
+  size_t byte_total = 0;
+
+  // Start at the newest record and step backwards, wrapping at index 0.
+  int pos = (buffer->start + frames_available - 1) % FRAME_WINDOW_SIZE;
+  for (int visited = 0; visited < frames_to_visit; ++visited) {
+    const FrameRecord *const rec = buffer->buf + pos;
+    // Show-existing frames are not decoded again, so they add no headers or
+    // decoded samples.
+    if (!rec->show_existing_frame) {
+      header_total += rec->frame_header_count;
+      decoded_samples += rec->pic_size;
+    }
+    if (rec->show_frame) shown_samples += rec->pic_size;
+    tile_total += rec->tiles;
+    byte_total += rec->encoded_size_in_bytes;
+
+    pos = (pos - 1 < 0) ? FRAME_WINDOW_SIZE - 1 : pos - 1;
+  }
+
+  level_spec->max_header_rate =
+      AOMMAX(level_spec->max_header_rate, header_total);
+  // TODO(huisu): we can now compute max display rate with the decoder model, so
+  // these couple of lines can be removed. Keep them here for a while for
+  // debugging purpose.
+  level_spec->max_display_rate =
+      AOMMAX(level_spec->max_display_rate, shown_samples);
+  level_spec->max_decode_rate =
+      AOMMAX(level_spec->max_decode_rate, decoded_samples);
+  level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tile_total);
+
+  // Bits in the window, saturated to INT_MAX before narrowing to int.
+  const size_t bit_total = byte_total * 8;
+  const int capped_bits = (int)AOMMIN(bit_total, (size_t)INT_MAX);
+  level_stats->max_bitrate = AOMMAX(level_stats->max_bitrate, capped_bits);
+}
+
+// Folds the frame that was just encoded into the level statistics of every
+// operating point that contains it and has level tracking enabled.
+//
+//   cpi      - encoder instance; frame geometry, tiling and layer ids are read
+//              from cpi->common.
+//   size     - compressed size of the frame in bytes.
+//   ts_start - start timestamp stored with the frame record.
+//   ts_end   - end timestamp stored with the frame record.
+//
+// When a target level is set for an operating point and
+// cpi->oxcf.strict_level_conformance is on, a violated constraint is reported
+// through aom_internal_error().
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int width = cm->width;
+  const int height = cm->height;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int tiles = tile_cols * tile_rows;
+  const int luma_pic_size = upscaled_width * height;
+  const int frame_header_count = cpi->frame_header_count;
+  const int show_frame = cm->show_frame;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  int max_tile_size;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int max_superres_tile_width;
+  int tile_width_is_valid;
+  get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
+                 &min_cropped_tile_width, &min_cropped_tile_height,
+                 &tile_width_is_valid);
+
+  const double compression_ratio = av1_get_compression_ratio(cm, size);
+
+  const int temporal_layer_id = cm->temporal_layer_id;
+  const int spatial_layer_id = cm->spatial_layer_id;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int is_still_picture = seq_params->still_picture;
+  // update level_stats
+  // TODO(kyslov@) fix the implementation according to buffer model
+  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+    // Skip operating points that do not contain this frame's layers, and
+    // those whose bit in keep_level_stats is clear.
+    if (!is_in_operating_point(seq_params->operating_point_idc[i],
+                               temporal_layer_id, spatial_layer_id) ||
+        !((level_params->keep_level_stats >> i) & 1)) {
+      continue;
+    }
+
+    AV1LevelInfo *const level_info = level_params->level_info[i];
+    assert(level_info != NULL);
+    AV1LevelStats *const level_stats = &level_info->level_stats;
+
+    level_stats->max_tile_size =
+        AOMMAX(level_stats->max_tile_size, max_tile_size);
+    level_stats->max_superres_tile_width =
+        AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+    level_stats->min_cropped_tile_width =
+        AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+    level_stats->min_cropped_tile_height =
+        AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+    level_stats->tile_width_is_valid &= tile_width_is_valid;
+    level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+    level_stats->min_frame_height =
+        AOMMIN(level_stats->min_frame_height, height);
+    level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+    level_stats->total_compressed_size += (double)size;
+
+    // update level_spec
+    // TODO(kyslov@) update all spec fields
+    AV1LevelSpec *const level_spec = &level_info->level_spec;
+    level_spec->max_picture_size =
+        AOMMAX(level_spec->max_picture_size, luma_pic_size);
+    level_spec->max_h_size = AOMMAX(level_spec->max_h_size, upscaled_width);
+    level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+    level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+    level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    // Store info. of current frame into FrameWindowBuffer.
+    FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+    store_frame_record(ts_start, ts_end, size, luma_pic_size,
+                       frame_header_count, tiles, show_frame,
+                       show_existing_frame, buffer);
+    if (show_frame) {
+      // Count the number of frames encoded in the past 1 second. show_frame
+      // is known to be set here, so no further check is needed.
+      const int encoded_frames_in_last_second =
+          count_frames(buffer, TICKS_PER_SEC);
+      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+                       level_stats);
+      // Timestamps are in ticks; dividing by TICKS_PER_SEC yields seconds.
+      level_stats->total_time_encoded +=
+          (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) /
+          (double)TICKS_PER_SEC;
+    }
+
+    // Feed the frame (size converted from bytes to bits) to the decoder
+    // model kept for every level.
+    DECODER_MODEL *const decoder_models = level_info->decoder_models;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
+    }
+
+    // Check whether target level is met.
+    const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+    if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) {
+      assert(is_valid_seq_level_idx(target_level));
+      const int tier = seq_params->tier[i];
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          level_info, target_level, tier, is_still_picture, profile, 0);
+      if (fail_id != TARGET_LEVEL_OK) {
+        // A level index packs major/minor as (major - 2) * 4 + minor.
+        const int target_level_major = 2 + (target_level >> 2);
+        const int target_level_minor = target_level & 3;
+        aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                           "Failed to encode to the target level %d_%d. %s",
+                           target_level_major, target_level_minor,
+                           level_fail_messages[fail_id]);
+      }
+    }
+  }
+}
+
+// For each operating point, writes the first valid level index (searching
+// upwards from 0) whose constraints are all met into seq_level_idx[op].
+// Operating points without level stats, or with no passing level, get
+// SEQ_LEVEL_MAX. Always returns AOM_CODEC_OK.
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx) {
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int is_still_picture = seq_params->still_picture;
+
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    int chosen_level = (int)SEQ_LEVEL_MAX;
+
+    if ((level_params->keep_level_stats >> op) & 1) {
+      const int tier = seq_params->tier[op];
+      const AV1LevelInfo *const info = level_params->level_info[op];
+      assert(info != NULL);
+
+      for (int candidate = 0; candidate < SEQ_LEVELS; ++candidate) {
+        if (is_valid_seq_level_idx(candidate) &&
+            check_level_constraints(info, candidate, tier, is_still_picture,
+                                    profile, 1) == TARGET_LEVEL_OK) {
+          chosen_level = candidate;
+          break;
+        }
+      }
+    }
+
+    seq_level_idx[op] = chosen_level;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+// Copies the configured target level of each operating point into
+// target_seq_level_idx[op]; operating points whose bit in keep_level_stats
+// is clear get SEQ_LEVEL_MAX instead. Always returns AOM_CODEC_OK.
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+                                             const AV1LevelParams *level_params,
+                                             int *target_seq_level_idx) {
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    const int stats_kept = (level_params->keep_level_stats >> op) & 1;
+    target_seq_level_idx[op] =
+        stats_kept ? (int)level_params->target_seq_level_idx[op]
+                   : (int)SEQ_LEVEL_MAX;
+  }
+
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/level.h b/third_party/aom/av1/encoder/level.h
new file mode 100644
index 0000000000..ebf2a1c19d
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+//
+// For the instance kept per operating point (AV1LevelInfo::level_spec), the
+// max_* fields below are running maxima raised by av1_update_level_info()
+// and scan_past_frames() after each frame.
+typedef struct {
+ AV1_LEVEL level;
+ int max_picture_size; // Max of (superres upscaled width * height).
+ int max_h_size; // Max of the superres upscaled frame width.
+ int max_v_size; // Max of the frame height.
+ int max_header_rate; // Max frame-header count summed over the scanned window.
+ int max_tile_rate; // Max tile count summed over the scanned window.
+ int max_tiles; // Max of (tile columns * tile rows) in a frame.
+ int max_tile_cols; // Max number of tile columns in a frame.
+ int64_t max_display_rate; // Max pic_size summed over shown frames in window.
+ int64_t max_decode_rate; // Max pic_size summed over non-show-existing frames.
+ // NOTE(review): the four fields below are not written by the code visible
+ // here; presumably main/high tier bitrate (Mbps) and compression-ratio
+ // limits -- confirm against the level definition table.
+ double main_mbps;
+ double high_mbps;
+ double main_cr;
+ double high_cr;
+} AV1LevelSpec;
+
+// Data remembered for one encoded frame. av1_update_level_info() stores one
+// record per frame (through store_frame_record()) and scan_past_frames()
+// sums these fields over the most recent records.
+typedef struct {
+ int64_t ts_start;
+ int64_t ts_end;
+ size_t encoded_size_in_bytes;
+ int pic_size; // Presumably upscaled width * height, as passed by the caller.
+ int frame_header_count;
+ int tiles;
+ int show_frame; // Non-zero: pic_size counts towards displayed samples.
+ int show_existing_frame; // Non-zero: no headers/decoded samples are counted.
+} FrameRecord;
+
+// Record frame info. in a rolling window.
+// scan_past_frames() treats buf as circular: the newest record is at
+// (start + num - 1) % FRAME_WINDOW_SIZE and older ones precede it, wrapping
+// from index 0 to FRAME_WINDOW_SIZE - 1.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+ FrameRecord buf[FRAME_WINDOW_SIZE];
+ int num; // Number of FrameRecord stored in the buffer.
+ int start; // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+// Running statistics for one operating point. Every field is a running
+// max, min or sum that av1_update_level_info() and scan_past_frames()
+// update as frames are encoded.
+typedef struct {
+ int max_bitrate; // Max bitrate in any 1-second window, in bps.
+ int max_tile_size; // Running max over frames.
+ int max_superres_tile_width; // Running max over frames.
+ int min_cropped_tile_width; // Running min over frames.
+ int min_cropped_tile_height; // Running min over frames.
+ int tile_width_is_valid; // AND of the per-frame validity flag.
+ int min_frame_width; // Running min of the frame width.
+ int min_frame_height; // Running min of the frame height.
+ double total_compressed_size; // In bytes.
+ double total_time_encoded; // In seconds.
+ double min_cr; // Running min of the per-frame compression ratio.
+} AV1LevelStats;
+
+// The following data structures are for the decoder model.
+// FRAME_BUFFER is the element type of DECODER_MODEL::frame_buffer_pool.
+// NOTE(review): the field meanings below are inferred from their names only;
+// confirm against the decoder model implementation.
+typedef struct {
+ int decoder_ref_count; // Presumably references held by the decoding process.
+ int player_ref_count; // Presumably references held by the display process.
+ int display_index;
+ FRAME_TYPE frame_type;
+ double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bits transmission for a DFG(Decodable Frame Group).
+typedef struct {
+ double first_bit_arrival_time; // Time when the first bit arrives.
+ double last_bit_arrival_time; // Time when the last bit arrives.
+ // Removal time means the time when the bits to be decoded are removed from
+ // the smoothing buffer. Removal time is essentially the time when the
+ // decoding of the frame starts.
+ double removal_time;
+} DFG_INTERVAL;
+
+// Fixed-capacity store of DFG intervals, kept in
+// DECODER_MODEL::dfg_interval_queue.
+// NOTE(review): head/size/total_interval semantics are inferred from the
+// names (index of the oldest entry, number of queued entries, summed
+// duration); confirm against the queue helpers in level.c.
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+ int head;
+ int size;
+ double total_interval;
+ DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+// Operating mode of the decoder model (stored in DECODER_MODEL::mode).
+enum {
+ RESOURCE_MODE = 0, // Resource availability mode.
+ SCHEDULE_MODE // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+// Status codes of the decoder model. Stored in DECODER_MODEL::status and
+// used as the return type of av1_decoder_model_try_smooth_buf(), which
+// checks for SMOOTHING_BUFFER_UNDERFLOW / SMOOTHING_BUFFER_OVERFLOW.
+enum {
+ DECODER_MODEL_OK = 0,
+ DECODE_BUFFER_AVAILABLE_LATE,
+ DECODE_FRAME_BUF_UNAVAILABLE,
+ DECODE_EXISTING_FRAME_BUF_EMPTY,
+ DISPLAY_FRAME_LATE,
+ SMOOTHING_BUFFER_UNDERFLOW,
+ SMOOTHING_BUFFER_OVERFLOW,
+ DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+// Number of entries in DECODER_MODEL::frame_buffer_pool.
+#define BUFFER_POOL_MAX_SIZE 10
+// State of the decoder model for one level. AV1LevelInfo keeps one instance
+// per level index (decoder_models[SEQ_LEVELS]); av1_update_level_info()
+// passes every encoded frame to each of them through
+// av1_decoder_model_process_frame().
+typedef struct {
+ DECODER_MODEL_STATUS status;
+ DECODER_MODEL_MODE mode;
+ bool is_low_delay_mode;
+ AV1_LEVEL level;
+ int encoder_buffer_delay; // In units of 1/90000 seconds.
+ int decoder_buffer_delay; // In units of 1/90000 seconds.
+ int num_ticks_per_picture;
+ int initial_display_delay; // In units of frames.
+ int64_t decode_rate;
+ double display_clock_tick; // In units of seconds.
+ double current_time; // In units of seconds.
+ double initial_presentation_delay; // In units of seconds.
+ double bit_rate; // Bits per second.
+
+ int num_frame;
+ int num_decoded_frame;
+ int num_shown_frame;
+ int vbi[REF_FRAMES]; // Virtual buffer index.
+ FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+ DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+ // Information for the DFG(Decodable Frame Group) being processed.
+ double first_bit_arrival_time;
+ double last_bit_arrival_time;
+ size_t coded_bits;
+
+ // Information for the frame being processed.
+ double removal_time;
+ double presentation_time;
+ int decode_samples;
+ int display_samples;
+
+ double max_display_rate;
+ double max_decode_rate;
+} DECODER_MODEL;
+
+// All level-tracking state for one operating point; AV1LevelParams holds a
+// pointer to one of these per operating point.
+typedef struct {
+ AV1LevelStats level_stats; // Running statistics over encoded frames.
+ AV1LevelSpec level_spec; // Running maxima of the spec quantities.
+ FrameWindowBuffer frame_window_buffer; // Recent per-frame records.
+ DECODER_MODEL decoder_models[SEQ_LEVELS]; // One decoder model per level.
+} AV1LevelInfo;
+
+// Level configuration and tracking state, indexed by operating point.
+typedef struct AV1LevelParams {
+ // Specifies the level that the coded video sequence conforms to for each
+ // operating point.
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to indicate whether to keep level stats for corresponding
+ // operating points.
+ // Bit i corresponds to operating point i (tested as
+ // (keep_level_stats >> i) & 1).
+ uint32_t keep_level_stats;
+ // Level information for each operating point.
+ // The callers in level.c assert that the entry is non-NULL for every
+ // operating point whose keep_level_stats bit is set.
+ AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+} AV1LevelParams;
+
+// Returns 1 if a frame with the given temporal and spatial layer ids belongs
+// to the operating point described by the bit mask operating_point, else 0.
+// A mask of 0 matches every frame. Otherwise bit temporal_layer_id and bit
+// (spatial_layer_id + 8) must both be set.
+static INLINE int is_in_operating_point(int operating_point,
+                                        int temporal_layer_id,
+                                        int spatial_layer_id) {
+  if (operating_point == 0) return 1;
+
+  // Temporal layer bit; the spatial bit is only examined when this one is set.
+  if (!((operating_point >> temporal_layer_id) & 1)) return 0;
+
+  return (operating_point >> (spatial_layer_id + 8)) & 1;
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *seq_level_idx);
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *target_seq_level_idx);
+
+// Print the status of the decoder model(for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+ int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+ size_t coded_bits,
+ DECODER_MODEL *const decoder_model);
+
+// This function uses the decoder model to check whether there could be
+// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not
+// update the content of decoder_model, and can be used to target certain
+// encoding level in the recode loop.
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+ const struct AV1_COMP *const cpi, size_t coded_bits,
+ const DECODER_MODEL *const decoder_model);
+
+// Return max bitrate(bps) for given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+ BITSTREAM_PROFILE profile);
+
+// Get max number of tiles and tile columns for given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+ int *const max_tile_cols);
+
+// Return minimum compression ratio for given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+ int is_still_picture);
+#endif // AOM_AV1_ENCODER_LEVEL_H_
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 0000000000..9ef9b88675
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Fetch the entry at *idx, then advance *idx by one slot, wrapping back to
+ * the start of the queue once it reaches ctx->max_sz. */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+  const int current = *idx;
+  struct lookahead_entry *const entry = ctx->buf + current;
+
+  assert(current < ctx->max_sz);
+
+  const int next = current + 1;
+  *idx = (next >= ctx->max_sz) ? next - ctx->max_sz : next;
+  return entry;
+}
+
+/* Releases every frame buffer in the queue, the entry array and the context
+ * itself. A NULL ctx is accepted and ignored. */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx == NULL) return;
+
+  if (ctx->buf != NULL) {
+    for (int slot = 0; slot < ctx->max_sz; ++slot) {
+      aom_free_frame_buffer(&ctx->buf[slot].img);
+    }
+    free(ctx->buf);
+  }
+  free(ctx);
+}
+
+/* Allocates a lookahead context together with its frame buffers.
+ * Returns NULL if any allocation fails; partially built state is released
+ * through av1_lookahead_destroy(). */
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+    bool is_all_intra, int num_pyramid_levels) {
+  // Pop threshold of the LAP stage, taken before depth is adjusted below.
+  const int lag_in_frames = AOMMAX(1, depth);
+
+  // For all-intra frame encoding, previous source frames are not required.
+  // Hence max_pre_frames is set to 0 in this case. As previous source frames
+  // are accessed using a negative index to av1_lookahead_peek(), setting
+  // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a
+  // negative index.
+  const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES;
+
+  // Add the lags to depth and clamp
+  depth = clamp(depth + num_lap_buffers, 1, MAX_TOTAL_BUFFERS);
+
+  // Allocate memory to keep previous source frames available.
+  depth += max_pre_frames;
+
+  // Allocate the lookahead structures
+  struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx));
+  if (ctx == NULL) return NULL;
+
+  ctx->max_sz = depth;
+  ctx->push_frame_count = 0;
+  ctx->max_pre_frames = max_pre_frames;
+
+  struct read_ctx *const encode_ctx = &ctx->read_ctxs[ENCODE_STAGE];
+  encode_ctx->pop_sz = ctx->max_sz - ctx->max_pre_frames;
+  encode_ctx->valid = 1;
+
+  if (num_lap_buffers) {
+    struct read_ctx *const lap_ctx = &ctx->read_ctxs[LAP_STAGE];
+    lap_ctx->pop_sz = lag_in_frames;
+    lap_ctx->valid = 1;
+  }
+
+  ctx->buf = calloc(depth, sizeof(*ctx->buf));
+  if (ctx->buf == NULL) {
+    av1_lookahead_destroy(ctx);
+    return NULL;
+  }
+
+  for (unsigned int slot = 0; slot < depth; ++slot) {
+    const int alloc_failed = aom_realloc_frame_buffer(
+        &ctx->buf[slot].img, width, height, subsampling_x, subsampling_y,
+        use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, NULL,
+        num_pyramid_levels, 0);
+    if (alloc_failed) {
+      av1_lookahead_destroy(ctx);
+      return NULL;
+    }
+  }
+
+  return ctx;
+}
+
+/* Returns non-zero once the encode-stage queue holds at least pop_sz
+ * frames. */
+int av1_lookahead_full(const struct lookahead_ctx *ctx) {
+  // TODO(angiebird): Test this function.
+  const struct read_ctx *const encode_ctx = &ctx->read_ctxs[ENCODE_STAGE];
+  return encode_ctx->sz >= encode_ctx->pop_sz;
+}
+
+/* Copies src into the next write slot of the lookahead queue and records its
+ * timestamps, flags and display index (the running push count).
+ *
+ * Returns 0 on success. Returns 1 when the queue has no free slot, when
+ * aom_alloc_frame_buffer() reports an error while growing the slot for a
+ * larger frame, or when aom_copy_metadata_to_frame_buffer() reports an
+ * error.
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       int num_pyramid_levels, aom_enc_frame_flags_t flags) {
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+
+  assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+  // Accept the frame only if it fits next to the max_pre_frames slots that
+  // are reserved for past frames. The "+ 1" is the frame being pushed:
+  // without it a push was still accepted at sz == max_sz - max_pre_frames
+  // (the point at which av1_lookahead_full() already reports "full"), and
+  // the write would land on a reserved past-frame slot or, when
+  // max_pre_frames is 0, on the oldest unread frame.
+  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + ctx->max_pre_frames > ctx->max_sz)
+    return 1;
+
+  // NOTE(review): the queue sizes and write index are advanced here, before
+  // the fallible steps below; the later "return 1" paths do not undo this.
+  ctx->read_ctxs[ENCODE_STAGE].sz++;
+  if (ctx->read_ctxs[LAP_STAGE].valid) {
+    ctx->read_ctxs[LAP_STAGE].sz++;
+  }
+
+  struct lookahead_entry *buf = pop(ctx, &ctx->write_idx);
+
+  // new_dimensions: the cropped size differs from what the slot last held.
+  // larger_dimensions: the frame does not fit in the slot's allocation.
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+  if (larger_dimensions) {
+    // Allocate the replacement first so the old buffer survives a failure.
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y, use_highbitdepth,
+                               AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0))
+      return 1;
+    aom_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    // The allocation is big enough; only the crop/subsampling fields change.
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
+  }
+  // Partial copy not implemented yet
+  av1_copy_and_extend_frame(src, &buf->img);
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->display_idx = ctx->push_frame_count;
+  buf->flags = flags;
+  ++ctx->push_frame_count;
+  // Drop metadata left in the reused slot before copying the new frame's.
+  aom_remove_metadata_from_frame_buffer(&buf->img);
+  if (src->metadata &&
+      aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) {
+    return 1;
+  }
+  return 0;
+}
+
+/* Removes and returns the next entry for the given stage. Without drain, an
+ * entry is returned only when the stage holds exactly pop_sz frames; with
+ * drain, any non-empty queue yields one. Returns NULL otherwise, or when ctx
+ * is NULL. */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage) {
+  if (ctx == NULL) return NULL;
+
+  struct read_ctx *const stage_ctx = &ctx->read_ctxs[stage];
+  assert(stage_ctx->valid == 1);
+
+  if (stage_ctx->sz == 0) return NULL;
+  if (!drain && stage_ctx->sz != stage_ctx->pop_sz) return NULL;
+
+  struct lookahead_entry *const entry = pop(ctx, &stage_ctx->read_idx);
+  stage_ctx->sz--;
+  return entry;
+}
+
+/* Returns the entry at the given offset from the stage's read position
+ * without removing it. Offsets >= 0 look ahead and must be below the stage's
+ * current size; negative offsets look back and may reach at most
+ * max_pre_frames entries. Returns NULL when the offset is out of range or
+ * ctx is NULL. */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage) {
+  if (ctx == NULL) return NULL;
+
+  struct read_ctx *const stage_ctx = &ctx->read_ctxs[stage];
+  assert(stage_ctx->valid == 1);
+
+  if (index >= 0) {
+    // Forward peek
+    if (index >= stage_ctx->sz) return NULL;
+    int slot = index + stage_ctx->read_idx;
+    if (slot >= ctx->max_sz) slot -= ctx->max_sz;
+    return ctx->buf + slot;
+  }
+
+  // Backward peek
+  if (-index > ctx->max_pre_frames) return NULL;
+  int slot = index + stage_ctx->read_idx;
+  if (slot < 0) slot += ctx->max_sz;
+  return ctx->buf + slot;
+}
+
+/* Returns the number of frames currently queued for the given stage. */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage) {
+  assert(ctx != NULL);
+
+  const struct read_ctx *const stage_ctx = &ctx->read_ctxs[stage];
+  assert(stage_ctx->valid == 1);
+
+  return stage_ctx->sz;
+}
+
+/* Returns the pop threshold (pop_sz) configured for the given stage. */
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+  assert(ctx != NULL);
+
+  const struct read_ctx *const stage_ctx = &ctx->read_ctxs[stage];
+  assert(stage_ctx->valid == 1);
+
+  return stage_ctx->pop_sz;
+}
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 0000000000..c0e6d222f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes look ahead buffer operations.
+ */
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include <stdbool.h>
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_LAG_BUFFERS 48
+#define MAX_LAP_BUFFERS 48
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
+
+/* One slot of the lookahead queue; filled by av1_lookahead_push(). */
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img; /* Copy of the pushed source frame */
+ int64_t ts_start; /* Start timestamp given at push time */
+ int64_t ts_end; /* End timestamp given at push time */
+ int display_idx; /* Value of push_frame_count when the frame was pushed */
+ aom_enc_frame_flags_t flags; /* Flags given at push time */
+};
+
+// The maximum number of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+/* Read-side state of the queue for one COMPRESSOR_STAGE. */
+struct read_ctx {
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ /* av1_lookahead_pop() without drain returns a frame only when sz equals
+ * pop_sz. */
+ int pop_sz; /* Size to check for pop condition */
+ int valid; /* Is this ctx valid? */
+};
+
+/* The lookahead queue: a circular array of max_sz entries with one write
+ * index and one read context per stage. av1_lookahead_init() sizes it so that
+ * max_sz includes max_pre_frames extra slots for past frames. */
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int write_idx; /* Write index */
+ struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+ struct lookahead_entry *buf; /* Buffer list */
+ int push_frame_count; /* Number of frames that have been pushed in the queue*/
+ uint8_t
+ max_pre_frames; /* Maximum number of past frames allowed in the queue */
+};
+/*!\endcond */
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+ bool is_all_intra, int num_pyramid_levels);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Check if lookahead buffer is full
+ */
+int av1_lookahead_full(const struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Tell if HBD is used
+ * \param[in] num_pyramid_levels Number of pyramid levels to allocate
+ *                                  for each frame buffer
+ * \param[in] flags Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ int num_pyramid_levels, aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ * \param[in] stage Encoder stage
+ *
+ * \retval Return NULL, if drain set and queue is empty, or if drain not set and
+ * queue not of the configured depth.
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ * \param[in] stage Encoder stage
+ *
+ * \retval Return NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get pop_sz value
+ */
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 0000000000..4e53447379
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,3998 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// Fills 'mv_cost_params' with the reference mv (in both subpel and fullpel
+// units), the per-bit weights and the entropy cost model. The cost tables are
+// taken from 'mv_costs' when it is available.
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+                                       const MvCosts *mv_costs,
+                                       const MV *ref_mv, int errorperbit,
+                                       int sadperbit) {
+  // Weights and cost model.
+  mv_cost_params->sad_per_bit = sadperbit;
+  mv_cost_params->error_per_bit = errorperbit;
+  mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+
+  // Reference motion vector, kept in subpel and fullpel form.
+  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+  mv_cost_params->ref_mv = ref_mv;
+
+  // For allintra encoding mode, 'mv_costs' is not allocated, so mvjcost and
+  // mvcost are left untouched here. In case of IntraBC, these values are
+  // populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+  if (mv_costs == NULL) return;
+
+  mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+  for (int comp = 0; comp < 2; ++comp) {
+    mv_cost_params->mvcost[comp] = mv_costs->mv_cost_stack[comp];
+  }
+}
+
+// Points 'ms_buffers' at the plane 0 source buffer and the first plane 0
+// prediction ('pre[0]') buffer of 'x', and attaches the OBMC weighted-source
+// and mask buffers of 'x'.
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+ ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+ ms_buffers->src = &x->plane[0].src;
+
+ // NULL / 0 arguments: presumably "no second predictor and no mask" -- see
+ // av1_set_ms_compound_refs() for the exact meaning.
+ av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+ ms_buffers->wsrc = x->obmc_buffer.wsrc;
+ ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+// Resets the wsrc, mask, above_pred and left_pred pointers of 'obmc_buffer'
+// to NULL. Nothing is allocated or freed here.
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+  // Prediction buffers.
+  obmc_buffer->left_pred = NULL;
+  obmc_buffer->above_pred = NULL;
+
+  // Weighted source and mask.
+  obmc_buffer->mask = NULL;
+  obmc_buffer->wsrc = NULL;
+}
+
+// Populates 'ms_params' with the default settings for a full-pixel motion
+// search of a 'bsize' block in 'x':
+//  - variance/SAD function pointers for 'bsize' and the src/ref buffers,
+//  - the search method and its search sites,
+//  - mesh search settings taken from the mv speed features,
+//  - the mv limits of 'x' narrowed around 'ref_mv',
+//  - the mv cost parameters,
+//  - the SAD functions, optionally replaced by the row-skipping ('sds*')
+//    variants depending on mv_sf->use_downsampled_sad.
+// 'start_mv' is only used to decide whether the row-skipping SAD is accurate
+// enough when use_downsampled_sad == 1.
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval) {
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const int is_key_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
+
+ // High level params
+ ms_params->bsize = bsize;
+ ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
+
+ init_ms_buffers(&ms_params->ms_buffers, x);
+
+ av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+ // Index 0 holds the regular mesh patterns, index 1 the IntraBC ones.
+ ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+ ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+ ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+ ms_params->prune_mesh_search =
+ (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0;
+ ms_params->mesh_search_mv_diff_threshold = 4;
+ ms_params->run_mesh_search = 0;
+ ms_params->fine_search_interval = fine_search_interval;
+
+ ms_params->is_intra_mode = 0;
+
+ ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
+
+ // Start from the limits of 'x' and intersect them with the window allowed
+ // around 'ref_mv'.
+ ms_params->mv_limits = x->mv_limits;
+ av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ // Default to the regular SAD functions; the branches below may swap in the
+ // row-skipping variants.
+ ms_params->sdf = ms_params->vfp->sdf;
+ ms_params->sdx4df = ms_params->vfp->sdx4df;
+ ms_params->sdx3df = ms_params->vfp->sdx3df;
+
+ if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) {
+ // Level 2: always use the row-skipping SAD for blocks at least 16 rows high.
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ // Skip version of sadx3 is not available yet
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 &&
+ !is_key_frame) {
+ // Level 1: use the row-skipping SAD only if, at the (clamped) start mv, the
+ // SAD of the even rows is close to the SAD of the odd rows.
+ FULLPEL_MV start_mv_clamped = start_mv;
+ // adjust start_mv to make sure it is within MV range
+ clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits);
+
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped);
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ // The odd-row SAD is obtained by starting both buffers one row further down.
+ unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows;
+ start_mv_sad_even_rows =
+ ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride);
+ start_mv_sad_odd_rows =
+ ms_params->vfp->sdsf(src_buf + src_stride, src_stride,
+ best_address + ref_stride, ref_stride);
+
+ // If the absolute SAD difference computed between the pred-to-src of even
+ // and odd rows is small, skip every other row in sad computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 4;
+ // "Small" means less than 1 / mult_thresh of the even-row SAD.
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ // As above, the 4-way skip function stands in for the 3-way one.
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ }
+ }
+}
+
+// Marks 'ms_params' as an intra-mode search and points its joint and
+// per-component mv cost tables at the tables held in 'dv_costs'.
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const IntraBCMVCosts *dv_costs) {
+  MV_COST_PARAMS *const cost_params = &ms_params->mv_cost_params;
+
+  ms_params->is_intra_mode = 1;
+
+  cost_params->mvjcost = dv_costs->joint_mv;
+  for (int comp = 0; comp < 2; ++comp) {
+    cost_params->mvcost[comp] = dv_costs->dv_costs[comp];
+  }
+}
+
+// Populates 'ms_params' with the default settings for a subpel motion search
+// of a 'bsize' block in 'x': precision and iteration controls from the frame
+// features and mv speed features, the subpel mv limits around 'ref_mv', the mv
+// cost parameters, and the variance functions / block dimensions / buffers
+// used to evaluate candidates. 'cost_list' is passed through
+// cond_cost_list_const() before being stored.
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list) {
+ const AV1_COMMON *cm = &cpi->common;
+ // High level params
+ ms_params->allow_hp = cm->features.allow_high_precision_mv;
+ ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+ ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+ ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+ // Derive the subpel limits from the fullpel limits of 'x' and 'ref_mv'.
+ av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ // Subpel variance params
+ ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
+ ms_params->var_params.subpel_search_type =
+ cpi->sf.mv_sf.use_accurate_subpel_search;
+ ms_params->var_params.w = block_size_wide[bsize];
+ ms_params->var_params.h = block_size_high[bsize];
+
+ // Ref and src buffers
+ MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+ init_ms_buffers(ms_buffers, x);
+}
+
+// Narrows 'mv_limits' (fullpel units) to the window that is searchable around
+// the subpel motion vector 'mv'.
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+  // Outermost fullpel MVs that are still representable.
+  const int fullpel_low = (MV_LOW >> 3) + 1;
+  const int fullpel_upp = (MV_UPP >> 3) - 1;
+
+  // Calculate the outermost full-pixel MVs which are inside the limits set by
+  // av1_set_subpel_mv_search_range().
+  //
+  // The subpel limits are mv->col +/- 8*MAX_FULL_PEL_VAL (likewise for
+  // mv->row); dividing by 8 gives the fullpel limits. To keep these bounds at
+  // least as tight as the subpel ones, the minimum is rounded up and the
+  // maximum is rounded down by the division.
+  const int col_min =
+      AOMMAX(((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL, fullpel_low);
+  const int row_min =
+      AOMMAX(((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL, fullpel_low);
+  const int col_max = AOMMIN((mv->col >> 3) + MAX_FULL_PEL_VAL, fullpel_upp);
+  const int row_max = AOMMIN((mv->row >> 3) + MAX_FULL_PEL_VAL, fullpel_upp);
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  mv_limits->col_min = AOMMAX(mv_limits->col_min, col_min);
+  mv_limits->col_max = AOMMIN(mv_limits->col_max, col_max);
+  mv_limits->row_min = AOMMAX(mv_limits->row_min, row_min);
+  mv_limits->row_max = AOMMIN(mv_limits->row_max, row_max);
+
+  // Never leave an inverted range: the max is at least the min.
+  mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max);
+  mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max);
+}
+
+// Returns the smallest shift 'sr' for which (max(size, 16) << sr) reaches
+// MAX_FULL_PEL_VAL, capped at MAX_MVSEARCH_STEPS - 2.
+int av1_init_search_range(int size) {
+  // Minimum search size no matter what the passed in value.
+  const int search_size = AOMMAX(16, size);
+
+  int shift = 0;
+  for (; (search_size << shift) < MAX_FULL_PEL_VAL; ++shift) {
+  }
+
+  return AOMMIN(shift, MAX_MVSEARCH_STEPS - 2);
+}
+
+// ============================================================================
+// Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are the multipliers used to perform regularization in motion
+// compensation when mv_cost_type is set to one of the MV_COST_L1_* types.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32 // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15 // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8 // Used by mvsad_err_cost during full pixel search
+
+// Rate of coding the motion vector 'mv': the cost of its joint type, looked up
+// in 'joint_cost', plus the cost of its row and column components, looked up
+// in 'comp_cost[0]' and 'comp_cost[1]' respectively.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          const int *const comp_cost[2]) {
+  const int *const row_cost = comp_cost[0];
+  const int *const col_cost = comp_cost[1];
+  const int joint_rate = joint_cost[av1_get_mv_joint(mv)];
+
+  return joint_rate + row_cost[mv->row] + col_cost[mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Cost of coding the difference between '*mv' and '*ref_mv': the rate of the
+// difference (see mv_cost()) multiplied by 'weight', then divided by 2 ** 7
+// with rounding.
+// This is NOT used during motion compensation.
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+                    int *const mvcost[2], int weight) {
+  const MV mv_diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+  const int rate = mv_cost(&mv_diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost));
+
+  return ROUND_POWER_OF_TWO(rate * weight, 7);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// used when var is used as the error metric.
+//
+// The cost is computed on diff := *mv - *ref_mv and depends on mv_cost_type:
+//  - MV_COST_ENTROPY: the rate of diff (see mv_cost()) multiplied by
+//    error_per_bit and scaled down with rounding; 0 if 'mvcost' is NULL.
+//  - MV_COST_L1_{LOWRES,MIDRES,HDRES}: the L1 norm of diff multiplied by the
+//    matching SSE_LAMBDA_* constant, divided by 8.
+//  - MV_COST_NONE: 0.
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv,
+                              const int *mvjcost, const int *const mvcost[2],
+                              int error_per_bit, MV_COST_TYPE mv_cost_type) {
+  const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+  const MV abs_diff = { abs(diff.row), abs(diff.col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      if (mvcost) {
+        // The product is formed in 64 bits before the rounding shift.
+        return (int)ROUND_POWER_OF_TWO_64(
+            (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+            RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+                PIXEL_TRANSFORM_ERROR_SCALE);
+      }
+      return 0;
+    case MV_COST_L1_LOWRES:
+      return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_L1_MIDRES:
+      return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_L1_HDRES:
+      return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3;
+    case MV_COST_NONE: return 0;
+    // The switch is on mv_cost_type, so name that in the diagnostic.
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+  }
+}
+
+// Convenience wrapper around mv_err_cost() that takes its settings from
+// 'mv_cost_params'. Returns 0 right away when the cost type is MV_COST_NONE.
+static INLINE int mv_err_cost_(const MV *mv,
+                               const MV_COST_PARAMS *mv_cost_params) {
+  const MV_COST_TYPE cost_type = mv_cost_params->mv_cost_type;
+
+  return (cost_type == MV_COST_NONE)
+             ? 0
+             : mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
+                           mv_cost_params->mvcost,
+                           mv_cost_params->error_per_bit, cost_type);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// only used during full pixel motion search when sad is used as the error
+// metric.
+//
+// The fullpel difference *mv - *ref_mv is first converted to subpel units
+// with GET_MV_SUBPEL(). The cost then depends on mv_cost_type:
+//  - MV_COST_ENTROPY: the rate of the difference (see mv_cost()) multiplied
+//    by sad_per_bit and shifted down by AV1_PROB_COST_SHIFT with rounding.
+//  - MV_COST_L1_{LOWRES,MIDRES,HDRES}: the L1 norm of the difference
+//    multiplied by the matching SAD_LAMBDA_* constant, divided by 8.
+//  - MV_COST_NONE: 0.
+static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv,
+                                 const int *mvjcost, const int *const mvcost[2],
+                                 int sad_per_bit, MV_COST_TYPE mv_cost_type) {
+  const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row),
+                    GET_MV_SUBPEL(mv->col - ref_mv->col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      // 'mvcost' already has the const-qualified type mv_cost() expects, so
+      // no cast is needed. The product is formed in unsigned arithmetic.
+      return ROUND_POWER_OF_TWO(
+          (unsigned)mv_cost(&diff, mvjcost, mvcost) * sad_per_bit,
+          AV1_PROB_COST_SHIFT);
+    case MV_COST_L1_LOWRES:
+      return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_L1_MIDRES:
+      return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_L1_HDRES:
+      return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+    case MV_COST_NONE: return 0;
+    // The switch is on mv_cost_type, so name that in the diagnostic.
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+  }
+}
+
+// Convenience wrapper around mvsad_err_cost() that takes the fullpel reference
+// mv, the cost tables, the SAD weight and the cost type from 'mv_cost_params'.
+static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv,
+                                  const MV_COST_PARAMS *mv_cost_params) {
+  const FULLPEL_MV *const full_ref_mv = &mv_cost_params->full_ref_mv;
+
+  return mvsad_err_cost(mv, full_ref_mv, mv_cost_params->mvjcost,
+                        mv_cost_params->mvcost, mv_cost_params->sad_per_bit,
+                        mv_cost_params->mv_cost_type);
+}
+
+// =============================================================================
+// Fullpixel Motion Search: Translational
+// =============================================================================
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
+// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
+//
+// Stages are filled from index MAX_MVSEARCH_STEPS - 1 downwards. Each stage
+// holds the center (site 0) followed by 8 sites at the stage radius: four
+// along the axes and four along the diagonals.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level) {
+  const int num_pts = 8;
+  int stage = MAX_MVSEARCH_STEPS - 1;
+  int steps = 0;
+
+  cfg->stride = stride;
+  cfg->site[stage][0].mv.row = 0;
+  cfg->site[stage][0].mv.col = 0;
+  cfg->site[stage][0].offset = 0;
+
+  // Choose the initial step size depending on level.
+  int radius = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
+
+  while (radius > 0) {
+    const FULLPEL_MV stage_mvs[] = {
+      { 0, 0 },           { -radius, 0 },      { radius, 0 },
+      { 0, -radius },     { 0, radius },       { -radius, -radius },
+      { radius, radius }, { -radius, radius }, { radius, -radius },
+    };
+
+    // Site 0 is the center, sites 1..num_pts are the candidates.
+    for (int pt = 0; pt <= num_pts; ++pt) {
+      search_site *const site = &cfg->site[stage][pt];
+      site->mv = stage_mvs[pt];
+      site->offset = get_offset_from_fullmv(&site->mv, stride);
+    }
+    cfg->searches_per_step[stage] = num_pts;
+    cfg->radius[stage] = radius;
+
+    // Update the search radius based on level: level 0 halves it after every
+    // stage, any other level only halves it once the stage index is below 9.
+    if (level == 0 || stage < 9) radius /= 2;
+
+    --stage;
+    ++steps;
+  }
+  cfg->num_search_steps = steps;
+}
+
+// Search site initialization that fills the stages from index
+// MAX_MVSEARCH_STEPS - 1 downwards, halving the radius from MAX_FIRST_STEP
+// down to 1. Each stage holds the center (site 0) followed by 12 candidate
+// sites, or 8 when the radius is 1.
+void av1_init_motion_fpf(search_site_config *cfg, int stride) {
+ int num_search_steps = 0;
+ int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+ cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+ cfg->site[stage_index][0].offset = 0;
+ cfg->stride = stride;
+
+ for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+ // Generate offsets for 12 search sites per step (8 when radius == 1): four
+ // on the axes at 'radius', the rest offset by 'tan_radius' on the other
+ // axis.
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ if (radius == 1) num_search_pts = 8;
+
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ // Entry 0 is the center; entries 1..num_search_pts are the candidates.
+ int i;
+ for (i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ --stage_index;
+ ++num_search_steps;
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for NSTEP / NSTEP_8PT search methods.
+// level = 0: NSTEP, level = 1: NSTEP_8PT.
+//
+// Unlike the diamond initializers, stages are filled from index 0 upwards
+// with a growing radius that starts at 1. level > 0 uses 16 stages with 8
+// candidate sites each; otherwise 15 stages are used, with 8 candidate sites
+// while radius <= 5 and 12 beyond that.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level) {
+ int num_search_steps = 0;
+ int stage_index = 0;
+ cfg->stride = stride;
+ int radius = 1;
+ const int num_stages = (level > 0) ? 16 : 15;
+ for (stage_index = 0; stage_index < num_stages; ++stage_index) {
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ // With 8 points, tan_radius == radius, so sites 5..8 lie on the diagonals.
+ if ((radius <= 5) || (level > 0)) {
+ tan_radius = radius;
+ num_search_pts = 8;
+ }
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ // Entry 0 is the center; entries 1..num_search_pts are the candidates.
+ for (int i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ ++num_search_steps;
+ // Grow the radius by about 1.5x (and by at least 1) per stage; it stops
+ // growing once stage_index reaches 12.
+ if (stage_index < 12)
+ radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
+// search methods.
+// Copies the fixed candidate table below into 'cfg' (one stage per pattern
+// scale) and precomputes each site's buffer offset for 'stride'. 'level' is
+// unused.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+ // First scale has 4-closest points, the rest have 8 points in diamond
+ // shape at increasing scales
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // BIGDIA search method candidates.
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const FULLPEL_MV
+ site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
+ { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
+ { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ // The recorded radius doubles per scale, starting at 1.
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = bigdia_num_candidates[i];
+ cfg->radius[i] = radius;
+ // All MAX_PATTERN_CANDIDATES entries are copied, including the { 0, 0 }
+ // padding of the first scale; searches_per_step records how many are used.
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = site_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for SQUARE search method.
+// Copies the fixed candidate table below into 'cfg' (one stage per pattern
+// scale) and precomputes each site's buffer offset for 'stride'. 'level' is
+// unused.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+ // All scales have 8 closest points in square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // Square search method candidates.
+ // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
+ { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
+ { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
+ { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ // The recorded radius doubles per scale, starting at 1.
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = square_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = square_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for HEX / FAST_HEX search methods.
+// Copies the fixed candidate table below into 'cfg' (one stage per pattern
+// scale) and precomputes each site's buffer offset for 'stride'. 'level' is
+// unused.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+ // First scale has 8-closest points, the rest have 6 points in hex shape
+ // at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale.
+ // Rows after the first list only 6 of the MAX_PATTERN_CANDIDATES entries;
+ // the remaining entries are zero-initialized and never copied below.
+ /* clang-format off */
+ static const FULLPEL_MV
+ hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
+ { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
+ { -64, 128 }, { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
+ { -128, 256 }, { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
+ { -256, 512 }, { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ // The recorded radius doubles per scale, starting at 1.
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = hex_num_candidates[i];
+ cfg->radius[i] = radius;
+ // Only the candidates that belong to this scale are copied.
+ for (int j = 0; j < hex_num_candidates[i]; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = hex_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Table of search site initializers, one per distinct search method.
+// NOTE(review): the entry order presumably follows the search method
+// enumeration used to index this table (the diamond and nstep initializers
+// each appear twice, matching their level 0 / level 1 variants) -- confirm
+// against the enum definition.
+const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
+ av1_init_dsmotion_compensation, av1_init_motion_compensation_nstep,
+ av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation,
+ av1_init_motion_compensation_hex, av1_init_motion_compensation_bigdia,
+ av1_init_motion_compensation_square
+ };
+
+// Returns 1 when (row, col) can move by up to 'range' full pixels in either
+// direction along both axes without leaving 'mv_limits', and 0 otherwise.
+static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
+                               int range) {
+  // The comparisons are combined with bitwise '&', so all four are always
+  // evaluated and no short-circuit branches are introduced.
+  const int rows_ok = (row - range >= mv_limits->row_min) &
+                      (row + range <= mv_limits->row_max);
+  const int cols_ok = (col - range >= mv_limits->col_min) &
+                      (col + range <= mv_limits->col_max);
+
+  return rows_ok & cols_ok;
+}
+
+// Evaluates the fullpel candidate 'this_mv' with the variance function of
+// 'ms_params'. Stores the sse, the variance (as 'distortion') and the mv error
+// cost in 'mv_stats', and returns variance + mv error cost.
+static INLINE int get_mvpred_var_cost(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+    FULLPEL_MV_STATS *mv_stats) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+
+  // Variance between the source block and the reference block at 'this_mv'.
+  const int var =
+      ms_params->vfp->vf(src->buf, src->stride,
+                         get_buf_from_fullmv(ref, this_mv), ref->stride,
+                         &mv_stats->sse);
+  mv_stats->distortion = var;
+
+  // The mv cost is computed in subpel units.
+  const MV subpel_mv = get_mv_from_fullmv(this_mv);
+  mv_stats->err_cost = mv_err_cost_(&subpel_mv, &ms_params->mv_cost_params);
+
+  int total_cost = var;
+  total_cost += mv_stats->err_cost;
+  return total_cost;
+}
+
+// Returns the SAD between the source block 'src' and the reference block at
+// 'ref_address', using the SAD function selected in 'ms_params'. No mv cost is
+// added.
+static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct buf_2d *const src,
+                                 const uint8_t *const ref_address,
+                                 const int ref_stride) {
+  return ms_params->sdf(src->buf, src->stride, ref_address, ref_stride);
+}
+
+// Compound-aware counterpart of get_mvpred_var_cost(). Evaluates the fullpel
+// candidate 'this_mv' and returns variance + mv error cost, storing the sse,
+// the variance (as 'distortion') and the mv error cost in 'mv_stats'.
+// The variance function is chosen from the buffers in 'ms_params':
+//  - a mask is set: masked sub-pixel variance (msvf) with the second
+//    predictor,
+//  - only a second predictor is set: sub-pixel average variance (svaf),
+//  - neither: plain variance (vf).
+static INLINE int get_mvpred_compound_var_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *mask = ms_params->ms_buffers.mask;
+ const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+ const int mask_stride = ms_params->ms_buffers.mask_stride;
+ const int invert_mask = ms_params->ms_buffers.inv_mask;
+ int bestsme;
+
+ // The sub-pixel variants are called with 0, 0 as the two arguments after the
+ // reference stride (presumably the x/y subpel offsets, i.e. none for a
+ // fullpel mv).
+ if (mask) {
+ bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, second_pred, mask, mask_stride,
+ invert_mask, &mv_stats->sse);
+ } else if (second_pred) {
+ bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, &mv_stats->sse, second_pred);
+ } else {
+ bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+ ref_stride, &mv_stats->sse);
+ }
+ mv_stats->distortion = bestsme;
+
+ // The mv cost is computed in subpel units.
+ const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
+
+ return bestsme;
+}
+
+// Compound-aware counterpart of get_mvpred_sad(). Returns the SAD between the
+// source block and the reference block at 'ref_address':
+//  - masked SAD (msdf) with the second predictor when a mask is set,
+//  - averaging SAD (sdaf) when only a second predictor is set,
+//  - the SAD function selected in 'ms_params' otherwise.
+// No mv cost is added.
+static INLINE int get_mvpred_compound_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const struct buf_2d *const src, const uint8_t *const ref_address,
+    const int ref_stride) {
+  const MSBuffers *const bufs = &ms_params->ms_buffers;
+
+  if (bufs->mask) {
+    return ms_params->vfp->msdf(src->buf, src->stride, ref_address, ref_stride,
+                                bufs->second_pred, bufs->mask,
+                                bufs->mask_stride, bufs->inv_mask);
+  }
+
+  if (bufs->second_pred) {
+    return ms_params->vfp->sdaf(src->buf, src->stride, ref_address, ref_stride,
+                                bufs->second_pred);
+  }
+
+  return ms_params->sdf(src->buf, src->stride, ref_address, ref_stride);
+}
+
+// Calculates and returns a sad+mvcost list around an integer best pel during
+// fullpixel motion search. The resulting list can be used to speed up subpel
+// motion search later.
+#define USE_SAD_COSTLIST 1
+
+// calc_int_cost_list uses var to populate the costlist, which is more accurate
+// than sad but slightly slower.
+//
+// Layout of 'cost_list' on return: [0] is the cost at 'best_mv', [1..4] are
+// the costs at the four neighbors { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+// (row, col offsets), in that order. Each cost is variance + mv error cost; a
+// neighbor outside the mv limits gets INT_MAX.
+static AOM_FORCE_INLINE void calc_int_cost_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ // 'mv_stats' is only scratch space here; its contents are not returned.
+ FULLPEL_MV_STATS mv_stats;
+ cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
+
+ // Fast path: one check covers all four neighbors, so the per-neighbor range
+ // test of the slow path below is skipped.
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ }
+ }
+}
+
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+//
+// Layout of 'cost_list' on return: [0] is the cost at 'best_mv', [1..4] are
+// the costs at the four neighbors { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+// (row, col offsets), in that order. Each cost is sad + mv sad cost; a
+// neighbor outside the mv limits is INT_MAX. When 'costlist_has_sad' is
+// nonzero, the raw SADs already present in 'cost_list' are reused and only the
+// mv cost is added.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list, int costlist_has_sad) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+ // Refresh the costlist if it does not contain valid sad
+ if (!costlist_has_sad) {
+ cost_list[0] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+ // Fast path: one check covers all four neighbors, so the per-neighbor
+ // range test of the slow path below is skipped.
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ }
+ }
+ }
+
+ // Add the mv cost on top of the raw SADs.
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+ for (int idx = 0; idx < 4; idx++) {
+ // INT_MAX marks an out-of-range neighbor; leave the sentinel untouched.
+ if (cost_list[idx + 1] != INT_MAX) {
+ const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+ bc + neighbors[idx].col };
+ cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+ }
+ }
+}
+
+// Tests a candidate motion vector against the current best and, if it wins,
+// records it as the new best.
+// Inputs:
+//   this_sad: raw sad of the candidate (no mv cost included).
+//   mv: the candidate motion vector.
+//   mv_cost_params: information needed to compute the mv cost.
+//   best_sad: the current best sad, mv cost included.
+//   raw_best_sad (optional): the raw sad of the current best.
+//   best_mv: the current best motion vector.
+//   second_best_mv (optional): the previous best motion vector.
+// Modifies:
+//   best_sad, raw_best_sad, best_mv, second_best_mv -- only when the
+//   candidate's sad plus mv cost is lower than *best_sad.
+// Returns:
+//   1 if the candidate became the new best, 0 otherwise.
+static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad,
+                                         const FULLPEL_MV *mv,
+                                         const MV_COST_PARAMS *mv_cost_params,
+                                         unsigned int *best_sad,
+                                         unsigned int *raw_best_sad,
+                                         FULLPEL_MV *best_mv,
+                                         FULLPEL_MV *second_best_mv) {
+  // The raw sad alone already fails to beat the best: skip the mv cost.
+  if (this_sad >= *best_sad) return 0;
+
+  // Add the motion vector cost.
+  const unsigned int sad_with_mv_cost =
+      this_sad + mvsad_err_cost_(mv, mv_cost_params);
+  if (sad_with_mv_cost >= *best_sad) return 0;
+
+  // New best: the old best mv becomes the second best.
+  if (second_best_mv != NULL) *second_best_mv = *best_mv;
+  *best_mv = *mv;
+
+  if (raw_best_sad != NULL) *raw_best_sad = this_sad;
+  *best_sad = sad_with_mv_cost;
+  return 1;
+}
+
+// Evaluates four consecutive candidates of one search step with a single
+// 4-way sad call (ms_params->sdx4df) and updates the best mv information.
+// The candidates are site[cand_start] .. site[cand_start + 3] of the given
+// search_step. No range check is done here. When cost_list is non-NULL the
+// four raw sads are written directly into cost_list[1..4].
+static AOM_INLINE void calc_sad4_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    const FULLPEL_MV center_mv, const uint8_t *center_address,
+    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+    int *best_site, int cand_start, int *cost_list) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+  // First of the four candidates handled by this call.
+  const search_site *const cands = site + cand_start;
+
+  // Destination of the sads: the caller's cost list (skipping the center
+  // entry) if there is one, a scratch buffer otherwise.
+  unsigned int scratch_sads[4];
+  unsigned int *const sads =
+      cost_list ? (unsigned int *)(cost_list + 1) : scratch_sads;
+
+  // Reference block address of every candidate.
+  unsigned char const *block_offset[4];
+  for (int n = 0; n < 4; ++n) {
+    block_offset[n] = cands[n].offset + center_address;
+  }
+
+  // 4-point sad calculation.
+  ms_params->sdx4df(src->buf, src->stride, block_offset, ref->stride, sads);
+
+  for (int n = 0; n < 4; ++n) {
+    const FULLPEL_MV cand_mv = { center_mv.row + cands[n].mv.row,
+                                 center_mv.col + cands[n].mv.col };
+    if (update_mvs_and_sad(sads[n], &cand_mv, mv_cost_params, bestsad,
+                           raw_bestsad, best_mv, /*second_best_mv=*/NULL)) {
+      *best_site = cand_start + n;
+    }
+  }
+}
+
+// Evaluates the three candidates selected by chkpts_indices with a single
+// 3-way sad call (ms_params->sdx3df) and updates the best mv information.
+// No range check is done here. On a win, *best_site receives the position
+// (0..2) inside chkpts_indices, not the site index itself. When cost_list is
+// non-NULL, the raw sad of candidate chkpts_indices[j] is stored at
+// cost_list[chkpts_indices[j] + 1].
+static AOM_INLINE void calc_sad3_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad,
+    unsigned int *raw_bestsad, int search_step, int *best_site,
+    const int *chkpts_indices, int *cost_list) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+
+  // Four pointer slots are handed to sdx3df; the last one is filled with the
+  // center address.
+  unsigned char const *block_offset[4];
+  for (int n = 0; n < 3; ++n) {
+    block_offset[n] = center_address + site[chkpts_indices[n]].offset;
+  }
+  block_offset[3] = center_address;
+
+  unsigned int sads[4];
+  ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
+
+  for (int n = 0; n < 3; ++n) {
+    const search_site *const cand = &site[chkpts_indices[n]];
+    const FULLPEL_MV cand_mv = { center_mv.row + cand->mv.row,
+                                 center_mv.col + cand->mv.col };
+    if (update_mvs_and_sad(sads[n], &cand_mv, mv_cost_params, bestsad,
+                           raw_bestsad, best_mv, /*second_best_mv=*/NULL)) {
+      *best_site = n;
+    }
+  }
+
+  if (cost_list == NULL) return;
+  for (int n = 0; n < 3; ++n) {
+    cost_list[chkpts_indices[n] + 1] = sads[n];
+  }
+}
+
+// Evaluates candidates one at a time and updates the best mv information.
+// Candidates site[cand_start] .. site[num_candidates - 1] of the given
+// search_step are visited; num_candidates is therefore an exclusive upper
+// bound on the site index, not a count. Candidates outside the mv limits are
+// skipped. When cost_list is non-NULL the raw sad of candidate i is stored
+// at cost_list[i + 1].
+static AOM_INLINE void calc_sad_update_bestmv(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    const FULLPEL_MV center_mv, const uint8_t *center_address,
+    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+    int *best_site, const int num_candidates, int cand_start, int *cost_list) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+
+  for (int cand = cand_start; cand < num_candidates; ++cand) {
+    const FULLPEL_MV cand_mv = { center_mv.row + site[cand].mv.row,
+                                 center_mv.col + site[cand].mv.col };
+    if (av1_is_fullmv_in_range(&ms_params->mv_limits, cand_mv)) {
+      int thissad = get_mvpred_sad(
+          ms_params, src, center_address + site[cand].offset, ref->stride);
+      if (cost_list) cost_list[cand + 1] = thissad;
+
+      if (update_mvs_and_sad(thissad, &cand_mv, mv_cost_params, bestsad,
+                             raw_bestsad, best_mv, /*second_best_mv=*/NULL)) {
+        *best_site = cand;
+      }
+    }
+  }
+}
+
+// Evaluates, one at a time, the num_candidates candidates whose site indices
+// are listed in chkpts_indices, and updates the best mv information.
+// Candidates outside the mv limits are skipped; if cost_list is non-NULL
+// their entry is set to INT_MAX. For evaluated candidates the raw sad is
+// stored at cost_list[index + 1]. On a win, *best_site receives the position
+// inside chkpts_indices, not the site index itself.
+static AOM_INLINE void calc_sad_update_bestmv_with_indices(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+    const FULLPEL_MV center_mv, const uint8_t *center_address,
+    unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+    int *best_site, const int num_candidates, const int *chkpts_indices,
+    int *cost_list) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site *site = ms_params->search_sites->site[search_step];
+
+  for (int pos = 0; pos < num_candidates; ++pos) {
+    const int site_idx = chkpts_indices[pos];
+    const FULLPEL_MV cand_mv = { center_mv.row + site[site_idx].mv.row,
+                                 center_mv.col + site[site_idx].mv.col };
+
+    if (av1_is_fullmv_in_range(&ms_params->mv_limits, cand_mv)) {
+      const int thissad = get_mvpred_sad(
+          ms_params, src, center_address + site[site_idx].offset, ref->stride);
+      if (cost_list) cost_list[site_idx + 1] = thissad;
+
+      if (update_mvs_and_sad(thissad, &cand_mv, mv_cost_params, bestsad,
+                             raw_bestsad, best_mv, /*second_best_mv=*/NULL)) {
+        *best_site = pos;
+      }
+    } else if (cost_list) {
+      // Out-of-range candidate: flag its cost as unavailable.
+      cost_list[site_idx + 1] = INT_MAX;
+    }
+  }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of
+// candidates, as described by ms_params->search_sites (searches_per_step
+// gives the candidate count of each scale, site the candidate offsets).
+// Inputs:
+//   start_mv: starting point; clamped to ms_params->mv_limits first.
+//   search_step: index into the search_steps table below (clamped to
+//     MAX_MVSEARCH_STEPS - 1) selecting the largest scale to use.
+//   do_init_search: if nonzero, first probe every scale from 0 up to the
+//     selected one around the start point and continue from the scale that
+//     produced the best candidate.
+//   cost_list (optional): receives the cost/sad at the best mv and at its
+//     four one-away neighbors.
+//   best_mv: receives the best full-pel mv found.
+//   best_mv_stats: forwarded to get_mvpred_var_cost.
+// Returns:
+//   The value of get_mvpred_var_cost at the best mv.
+static int pattern_search(FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          int search_step, const int do_init_search,
+                          int *cost_list, FULLPEL_MV *best_mv,
+                          FULLPEL_MV_STATS *best_mv_stats) {
+  static const int search_steps[MAX_MVSEARCH_STEPS] = {
+    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  };
+  int i, s, t;
+
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const search_site_config *search_sites = ms_params->search_sites;
+  const int *num_candidates = search_sites->searches_per_step;
+  const int ref_stride = ref->stride;
+  const int last_is_4 = num_candidates[0] == 4;
+  int br, bc;
+  unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
+  // Index of the winning candidate of the most recent move; used to pick the
+  // three neighboring candidates checked during refinement.
+  int k = -1;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
+  assert(search_step >= 0);
+  int best_init_s = search_steps[search_step];
+  // adjust ref_mv to make sure it is within MV range
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  br = start_mv.row;
+  bc = start_mv.col;
+  if (cost_list != NULL) {
+    cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+        INT_MAX;
+  }
+  int costlist_has_sad = 0;
+
+  // Work out the start point for the search
+  raw_bestsad = get_mvpred_sad(ms_params, src,
+                               get_buf_from_fullmv(ref, &start_mv), ref_stride);
+  bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params);
+
+  // Search all possible scales up to the search param around the center point
+  // pick the scale of the point that is best as the starting scale of
+  // further steps around it.
+  const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv);
+  if (do_init_search) {
+    s = best_init_s;
+    best_init_s = -1;
+    for (t = 0; t <= s; ++t) {
+      int best_site = -1;
+      FULLPEL_MV center_mv = { br, bc };
+      if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
+        // Call 4-point sad for multiples of 4 candidates.
+        const int no_of_4_cand_loops = num_candidates[t] >> 2;
+        for (i = 0; i < no_of_4_cand_loops; i++) {
+          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                  center_address, &bestsad, &raw_bestsad, t,
+                                  &best_site, i * 4, /*cost_list=*/NULL);
+        }
+        // Rest of the candidates. calc_sad_update_bestmv iterates over
+        // [cand_start, num_candidates), so the bound passed here must be the
+        // total candidate count. Passing the remainder (count % 4) made the
+        // loop empty and silently dropped the trailing candidates of scales
+        // whose count is not a multiple of 4.
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                               center_address, &bestsad, &raw_bestsad, t,
+                               &best_site, num_candidates[t],
+                               no_of_4_cand_loops * 4, NULL);
+      } else {
+        calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                               center_address, &bestsad, &raw_bestsad, t,
+                               &best_site, num_candidates[t], 0, NULL);
+      }
+      if (best_site == -1) {
+        continue;
+      } else {
+        best_init_s = t;
+        k = best_site;
+      }
+    }
+    if (best_init_s != -1) {
+      br += search_sites->site[best_init_s][k].mv.row;
+      bc += search_sites->site[best_init_s][k].mv.col;
+      center_address += search_sites->site[best_init_s][k].offset;
+    }
+  }
+
+  // If the center point is still the best, just skip this and move to
+  // the refinement step.
+  if (best_init_s != -1) {
+    // When the smallest scale has 4 candidates and a cost list is wanted,
+    // scale 0 is handled separately below so that its sads can be reused for
+    // the cost list.
+    const int last_s = (last_is_4 && cost_list != NULL);
+    int best_site = -1;
+    s = best_init_s;
+
+    for (; s >= last_s; s--) {
+      // No need to search all points the 1st time if initial search was used
+      if (!do_init_search || s != best_init_s) {
+        FULLPEL_MV center_mv = { br, bc };
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          // Call 4-point sad for multiples of 4 candidates.
+          const int no_of_4_cand_loops = num_candidates[s] >> 2;
+          for (i = 0; i < no_of_4_cand_loops; i++) {
+            calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+                                    center_mv, center_address, &bestsad,
+                                    &raw_bestsad, s, &best_site, i * 4,
+                                    /*cost_list=*/NULL);
+          }
+          // Rest of the candidates: the bound is the total candidate count
+          // (exclusive end index), not the remainder.
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                 center_address, &bestsad, &raw_bestsad, s,
+                                 &best_site, num_candidates[s],
+                                 no_of_4_cand_loops * 4, NULL);
+        } else {
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                 center_address, &bestsad, &raw_bestsad, s,
+                                 &best_site, num_candidates[s], 0, NULL);
+        }
+
+        if (best_site == -1) {
+          continue;
+        } else {
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
+          center_address += search_sites->site[s][best_site].offset;
+          k = best_site;
+        }
+      }
+
+      // Keep moving at this scale, checking only the last winning direction
+      // and its two neighbors in the candidate ring, until nothing improves.
+      do {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+        FULLPEL_MV center_mv = { br, bc };
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                  center_address, &bestsad, &raw_bestsad, s,
+                                  &best_site, next_chkpts_indices, NULL);
+        } else {
+          calc_sad_update_bestmv_with_indices(
+              ms_params, mv_cost_params, best_mv, center_mv, center_address,
+              &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+              next_chkpts_indices, NULL);
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
+          center_address += search_sites->site[s][k].offset;
+        }
+      } while (best_site != -1);
+    }
+    // Note: If we enter the if below, then cost_list must be non-NULL.
+    if (s == 0) {
+      cost_list[0] = raw_bestsad;
+      costlist_has_sad = 1;
+      assert(num_candidates[s] == 4);
+      if (!do_init_search || s != best_init_s) {
+        FULLPEL_MV center_mv = { br, bc };
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                  center_address, &bestsad, &raw_bestsad, s,
+                                  &best_site, 0, cost_list);
+        } else {
+          calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                 center_address, &bestsad, &raw_bestsad, s,
+                                 &best_site, /*num_candidates=*/4,
+                                 /*cand_start=*/0, cost_list);
+        }
+
+        if (best_site != -1) {
+          br += search_sites->site[s][best_site].mv.row;
+          bc += search_sites->site[s][best_site].mv.col;
+          center_address += search_sites->site[s][best_site].offset;
+          k = best_site;
+        }
+      }
+      while (best_site != -1) {
+        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+        best_site = -1;
+        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+        next_chkpts_indices[1] = k;
+        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+        // The center just moved in direction k: the old center is now the
+        // neighbor on the opposite side, so its sad is carried over there.
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = raw_bestsad;
+
+        FULLPEL_MV center_mv = { br, bc };
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          assert(PATTERN_CANDIDATES_REF == 3);
+          calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+                                  center_address, &bestsad, &raw_bestsad, s,
+                                  &best_site, next_chkpts_indices, cost_list);
+        } else {
+          calc_sad_update_bestmv_with_indices(
+              ms_params, mv_cost_params, best_mv, center_mv, center_address,
+              &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+              next_chkpts_indices, cost_list);
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += search_sites->site[s][k].mv.row;
+          bc += search_sites->site[s][k].mv.col;
+          center_address += search_sites->site[s][k].offset;
+        }
+      }
+    }
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  assert(center_address == get_buf_from_fullmv(ref, best_mv) &&
+         "center address is out of sync with best_mv!\n");
+
+  // Returns the one-away integer pel cost/sad around the best as follows:
+  // cost_list[0]: cost/sad at the best integer pel
+  // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+
+  const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
+  return var_cost;
+}
+
+// The foo_search functions below all share the same arguments:
+// start_mv: where the motion search starts.
+// ms_params: a collection of motion search parameters.
+// search_step: how many steps to skip in the motion search. For example,
+//   a value of 3 suggests that 3 search steps have already taken place prior
+//   to this function call, so we jump directly to step 4 of the search
+//   process.
+// do_init_search: if on, do an initial search of all possible scales around
+//   the start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it
+//   to speed up subpel search later.
+// best_mv: the best mv found in the motion search.
+// best_mv_stats: passed through to pattern_search.
+//
+// hex_search forwards every argument unchanged to pattern_search; the
+// candidate pattern itself is taken from ms_params->search_sites.
+static int hex_search(const FULLPEL_MV start_mv,
+                      const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                      const int search_step, const int do_init_search,
+                      int *cost_list, FULLPEL_MV *best_mv,
+                      FULLPEL_MV_STATS *best_mv_stats) {
+  const int var_cost =
+      pattern_search(start_mv, ms_params, search_step, do_init_search,
+                     cost_list, best_mv, best_mv_stats);
+  return var_cost;
+}
+
+// Forwards every argument unchanged to pattern_search; the candidate pattern
+// is taken from ms_params->search_sites.
+static int bigdia_search(const FULLPEL_MV start_mv,
+                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                         const int search_step, const int do_init_search,
+                         int *cost_list, FULLPEL_MV *best_mv,
+                         FULLPEL_MV_STATS *best_mv_stats) {
+  const int var_cost =
+      pattern_search(start_mv, ms_params, search_step, do_init_search,
+                     cost_list, best_mv, best_mv_stats);
+  return var_cost;
+}
+
+// Forwards every argument unchanged to pattern_search; the candidate pattern
+// is taken from ms_params->search_sites.
+static int square_search(const FULLPEL_MV start_mv,
+                         const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                         const int search_step, const int do_init_search,
+                         int *cost_list, FULLPEL_MV *best_mv,
+                         FULLPEL_MV_STATS *best_mv_stats) {
+  const int var_cost =
+      pattern_search(start_mv, ms_params, search_step, do_init_search,
+                     cost_list, best_mv, best_mv_stats);
+  return var_cost;
+}
+
+// Same as hex_search, except that search_step is raised to at least
+// MAX_MVSEARCH_STEPS - 2 before the search starts.
+static int fast_hex_search(const FULLPEL_MV start_mv,
+                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                           const int search_step, const int do_init_search,
+                           int *cost_list, FULLPEL_MV *best_mv,
+                           FULLPEL_MV_STATS *best_mv_stats) {
+  const int first_step = AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step);
+  return hex_search(start_mv, ms_params, first_step, do_init_search, cost_list,
+                    best_mv, best_mv_stats);
+}
+
+// Same as bigdia_search, except that search_step is raised to at least
+// MAX_MVSEARCH_STEPS - 1 before the search starts.
+static int vfast_dia_search(const FULLPEL_MV start_mv,
+                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                            const int search_step, const int do_init_search,
+                            int *cost_list, FULLPEL_MV *best_mv,
+                            FULLPEL_MV_STATS *best_mv_stats) {
+  const int first_step = AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step);
+  return bigdia_search(start_mv, ms_params, first_step, do_init_search,
+                       cost_list, best_mv, best_mv_stats);
+}
+
+// Same as bigdia_search, except that search_step is raised to at least
+// MAX_MVSEARCH_STEPS - 2 before the search starts.
+static int fast_dia_search(const FULLPEL_MV start_mv,
+                           const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                           const int search_step, const int do_init_search,
+                           int *cost_list, FULLPEL_MV *best_mv,
+                           FULLPEL_MV_STATS *best_mv_stats) {
+  const int first_step = AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step);
+  return bigdia_search(start_mv, ms_params, first_step, do_init_search,
+                       cost_list, best_mv, best_mv_stats);
+}
+
+// Same as bigdia_search, except that search_step is raised to at least
+// MAX_MVSEARCH_STEPS - 3 before the search starts.
+static int fast_bigdia_search(const FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int search_step, const int do_init_search,
+                              int *cost_list, FULLPEL_MV *best_mv,
+                              FULLPEL_MV_STATS *best_mv_stats) {
+  const int first_step = AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step);
+  return bigdia_search(start_mv, ms_params, first_step, do_init_search,
+                       cost_list, best_mv, best_mv_stats);
+}
+
+// Step-wise full-pel search over the candidate sites of
+// ms_params->search_sites, going from step (num_search_steps - search_step - 1)
+// down to step 0.
+// Inputs:
+//   start_mv: starting motion vector; used as given (no clamping here).
+//   start_mv_sad: initial value of the best sad, supplied by the caller.
+//   search_step: number of leading steps to skip.
+// Outputs:
+//   num00: receives the number of center steps counted by UPDATE_SEARCH_STEP.
+//   best_mv: receives the best motion vector found.
+//   second_best_mv (optional): receives the center that was current just
+//     before the last move; left at its incoming value if the search never
+//     moves.
+// Returns:
+//   The best sad found (with mv cost added for every candidate that replaced
+//   the initial value).
+static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, int *num00,
+ FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+// Bookkeeping executed at the end of every step:
+//  - if a non-center site won (best_site != 0), remember the old center as
+//    the second best mv and move the center to that site;
+//  - as long as the search has never left the start position, count the step
+//    in num_center_steps;
+//  - if this step kept the center and step > 2, skip the following steps
+//    whose radius equals the current one, counting each of them in
+//    num_center_steps as well. Note that this modifies the loop variable
+//    `step` of the enclosing for loop.
+#define UPDATE_SEARCH_STEP \
+ do { \
+ if (best_site != 0) { \
+ tmp_second_best_mv = *best_mv; \
+ best_mv->row += site[best_site].mv.row; \
+ best_mv->col += site[best_site].mv.col; \
+ best_address += site[best_site].offset; \
+ is_off_center = 1; \
+ } \
+ \
+ if (is_off_center == 0) num_center_steps++; \
+ \
+ if (best_site == 0 && step > 2) { \
+ int next_step_size = cfg->radius[step - 1]; \
+ while (next_step_size == cfg->radius[step] && step > 2) { \
+ num_center_steps++; \
+ --step; \
+ next_step_size = cfg->radius[step - 1]; \
+ } \
+ } \
+ } while (0)
+
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+ const search_site_config *cfg = ms_params->search_sites;
+
+ // Set to 1 once the search center has moved away from start_mv.
+ int is_off_center = 0;
+ // Number of times that we have stayed in the middle. This is used to skip
+ // search steps in the future if diamond_search_sad is called again.
+ int num_center_steps = 0;
+
+ // search_step determines the length of the initial step and hence the number
+ // of iterations.
+ const int tot_steps = cfg->num_search_steps - search_step;
+ // Working copy of the second best mv; only read back if second_best_mv is
+ // non-NULL, so it may stay uninitialized otherwise.
+ FULLPEL_MV tmp_second_best_mv;
+ if (second_best_mv) {
+ tmp_second_best_mv = *second_best_mv;
+ }
+
+ *best_mv = start_mv;
+
+ // Check the starting position
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
+ unsigned int bestsad = start_mv_sad;
+
+ // TODO(chiyotsai@google.com): Implement 4 points search for msdf&sdaf
+ if (ms_params->ms_buffers.second_pred) {
+ // A second predictor is present: candidates are evaluated one at a time
+ // with the compound sad.
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ // 0 means "stay at the center"; candidate sites are indexed from 1.
+ int best_site = 0;
+
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_compound_sad(ms_params, src, check_here, ref_stride);
+
+ // Only add the mv cost if the raw sad alone already beats the best.
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ } else {
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ // 0 means "stay at the center"; candidate sites are indexed from 1.
+ int best_site = 0;
+
+ int all_in = 1;
+ // Trap illegal vectors
+ // NOTE(review): this relies on sites 1..4 being the extreme top, bottom,
+ // left and right candidates of the step -- confirm against the site table
+ // initialization.
+ all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
+ all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
+ all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
+ all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
+
+ if (all_in) {
+ // No per-candidate range check needed: evaluate four candidates per
+ // sdx4df call. The loop reads site[idx .. idx + 3], i.e. it always
+ // consumes candidates in groups of four.
+ for (int idx = 1; idx <= num_searches; idx += 4) {
+ unsigned char const *block_offset[4];
+ unsigned int sads[4];
+
+ for (int j = 0; j < 4; j++)
+ block_offset[j] = site[idx + j].offset + best_address;
+
+ ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride,
+ sads);
+ for (int j = 0; j < 4; j++) {
+ if (sads[j] < bestsad) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
+ best_mv->col +
+ site[idx + j].mv.col };
+ unsigned int thissad =
+ sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx + j;
+ }
+ }
+ }
+ }
+ } else {
+ // Some candidates may fall outside the mv limits: check each one
+ // individually and skip those that do.
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_sad(ms_params, src, check_here, ref_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ }
+
+ *num00 = num_center_steps;
+ if (second_best_mv) {
+ *second_best_mv = tmp_second_best_mv;
+ }
+
+ return bestsad;
+
+#undef UPDATE_SEARCH_STEP
+}
+
+// Returns the mv cost of start_mv plus the sad of the reference block it
+// points to. The compound sad is used when a second predictor is present in
+// ms_params->ms_buffers, the plain sad otherwise. start_mv is used as given;
+// no clamping is done here.
+static INLINE unsigned int get_start_mvpred_sad_cost(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) {
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const uint8_t *const start_address = get_buf_from_fullmv(ref, &start_mv);
+
+  const unsigned int mv_cost =
+      mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params);
+
+  const unsigned int block_sad =
+      ms_params->ms_buffers.second_pred
+          ? get_mvpred_compound_sad(ms_params, src, start_address, ref->stride)
+          : get_mvpred_sad(ms_params, src, start_address, ref->stride);
+
+  return mv_cost + block_sad;
+}
+
+// Runs diamond_search_sad from the (clamped) start_mv, first with step_param
+// and then repeatedly with more leading steps skipped, keeping the result
+// with the lowest get_mvpred_compound_var_cost.
+// Outputs:
+//   cost_list (optional): filled for the final best mv.
+//   best_mv / best_mv_stats: the best mv found and its stats.
+//   second_best_mv (optional): handed to every diamond_search_sad call.
+// Returns:
+//   The compound variance cost of the best mv.
+static int full_pixel_diamond(FULLPEL_MV start_mv,
+                              const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const int step_param, int *cost_list,
+                              FULLPEL_MV *best_mv,
+                              FULLPEL_MV_STATS *best_mv_stats,
+                              FULLPEL_MV *second_best_mv) {
+  const search_site_config *cfg = ms_params->search_sites;
+
+  // Clamp start mv and calculate the cost
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  const unsigned int start_mv_sad =
+      get_start_mvpred_sad_cost(ms_params, start_mv);
+
+  // n is written by the first search: number of steps it spent on the center.
+  int n;
+  diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
+                     second_best_mv);
+
+  int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  const int further_steps = cfg->num_search_steps - 1 - step_param;
+  int num00 = 0;
+  while (n < further_steps) {
+    ++n;
+
+    // TODO(chiyotsai@google.com): There is another bug here where the second
+    // best mv gets incorrectly overwritten. Fix it later.
+    FULLPEL_MV tmp_best_mv;
+    FULLPEL_MV_STATS tmp_best_mv_stats;
+    diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
+                       &num00, &tmp_best_mv, second_best_mv);
+
+    const int thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
+                                                     &tmp_best_mv_stats);
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      *best_mv = tmp_best_mv;
+      *best_mv_stats = tmp_best_mv_stats;
+    }
+
+    // Advance the loop by the num00 steps reported by this search.
+    n += num00;
+    num00 = 0;
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      const int costlist_has_sad = 0;
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  return bestsme;
+}
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+// Inputs:
+//   start_mv: centre of the search; clamped to ms_params->mv_limits first.
+//   range: maximum distance from the centre, in full pels, searched in each
+//     direction (further limited by the mv limits).
+//   step: spacing between checked positions; must be >= 1. With step == 1
+//     every position is checked, four columns per sad call where possible.
+// Outputs:
+//   best_mv: receives the best motion vector found.
+//   second_best_mv (optional): receives the previous best each time best_mv
+//     is replaced.
+// Returns:
+//   The best sad (mv cost included).
+static int exhaustive_mesh_search(FULLPEL_MV start_mv,
+                                  const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                  const int range, const int step,
+                                  FULLPEL_MV *best_mv,
+                                  FULLPEL_MV *second_best_mv) {
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  // With step == 1 columns are consumed four at a time by the 4-way sad.
+  const int col_step = (step > 1) ? step : 4;
+
+  assert(step >= 1);
+
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  *best_mv = start_mv;
+  best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
+                            ref_stride);
+  best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
+  // Search window relative to start_mv; all four bounds are inclusive.
+  start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
+  start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
+  end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
+  end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+        unsigned int sad = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+        update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                           /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            addrs[i] = get_buf_from_fullmv(ref, &mv);
+          }
+
+          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+              update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+                                 /*raw_best_sad=*/NULL, best_mv,
+                                 second_best_mv);
+            }
+          }
+        } else {
+          // Fewer than four columns remain in this row. end_col is inclusive
+          // (see the loop conditions above), so columns c .. end_col must all
+          // be visited; the previous bound `i < end_col - c` skipped the last
+          // column of every row.
+          for (i = 0; i <= end_col - c; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            unsigned int sad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+            update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+                               /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+// Runs a limited-range exhaustive mesh search using the given mesh patterns
+// (per the original comment, these are set according to the encode speed
+// profile).
+// Outputs:
+//   cost_list (optional): filled for the final best mv.
+//   best_mv / mv_stats: the best mv found and its stats.
+//   second_best_mv (optional): forwarded to exhaustive_mesh_search.
+// Returns:
+//   INT_MAX if the first pattern's range/interval are invalid (in which case
+//   best_mv is start_mv and cost_list is left untouched); otherwise the
+//   variance cost of the best mv.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct MESH_PATTERN *const mesh_patterns,
+                                 int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV_STATS *mv_stats,
+                                 FULLPEL_MV *second_best_mv) {
+  const int kMinRange = 7;
+  const int kMaxRange = 256;
+  const int kMinInterval = 1;
+
+  int interval = mesh_patterns[0].interval;
+  int range = mesh_patterns[0].range;
+
+  // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref
+  // version of sad and variance function. We still need to check the
+  // performance when compound ref exhaustive search is enabled.
+  assert(!ms_params->ms_buffers.second_pred &&
+         "Mesh search does not support compound mode!");
+
+  *best_mv = start_mv;
+
+  // Trap illegal values for interval and range for this function.
+  const int pattern_is_valid = range >= kMinRange && range <= kMaxRange &&
+                               interval >= kMinInterval && interval <= range;
+  if (!pattern_is_valid) return INT_MAX;
+
+  const int baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point: widen the range to 5/4 of the larger mv
+  // component if needed, capped at kMaxRange, and scale the interval so the
+  // range/interval ratio does not grow.
+  const int max_abs_mv = AOMMAX(abs(best_mv->row), abs(best_mv->col));
+  range = AOMMAX(range, (5 * max_abs_mv) / 4);
+  range = AOMMIN(range, kMaxRange);
+  interval = AOMMAX(interval, range / baseline_interval_divisor);
+  // Use a small search step/interval for certain kinds of clips, for
+  // example screen content clips with a lot of text. A large interval could
+  // lead to a false matching position, and the best global candidate cannot
+  // be found in the following iterations because of their reduced search
+  // range. Using a small search interval in the beginning reduces the chance
+  // of missing the best candidate.
+  if (ms_params->fine_search_interval) {
+    interval = AOMMIN(interval, 4);
+  }
+
+  // initial search
+  int bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+                                       best_mv, second_best_mv);
+
+  if (interval > kMinInterval && range > kMinRange) {
+    // Progressive searches using the remaining mesh patterns, each one
+    // centred on the best mv so far. Stop after the pattern whose interval
+    // is 1.
+    for (int i = 1; i < MAX_MESH_STEP; ++i) {
+      const struct MESH_PATTERN *const pattern = &mesh_patterns[i];
+      bestsme =
+          exhaustive_mesh_search(*best_mv, ms_params, pattern->range,
+                                 pattern->interval, best_mv, second_best_mv);
+
+      if (pattern->interval == 1) break;
+    }
+  }
+
+  // Replace the sad-based score by the variance cost of the best mv.
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      const int costlist_has_sad = 0;
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  return bestsme;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+//
+// Starting from the clamped start_mv, repeatedly evaluates the 8 one-pel
+// neighbors of the current best mv with the compound sad and moves to the
+// best improving one, for at most SEARCH_RANGE_8P iterations. A small grid
+// records which positions were already visited so that none is evaluated
+// twice.
+// Returns the best sad found (mv cost included); best_mv receives the mv.
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
+  // Each entry: mv delta { row, col } and the matching offset in the visited
+  // grid. The four edge neighbors come first, then the four diagonals.
+  static const search_neighbors neighbors[8] = {
+    { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+  };
+
+  // 1 marks grid positions that have already been considered.
+  uint8_t visited[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] = { 0 };
+  int grid_center = SEARCH_GRID_CENTER_8P;
+
+  const MV_COST_PARAMS *const mv_cost_params = &ms_params->mv_cost_params;
+  const FullMvLimits *const mv_limits = &ms_params->mv_limits;
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+
+  *best_mv = start_mv;
+  clamp_fullmv(best_mv, mv_limits);
+
+  unsigned int best_sad = get_mvpred_compound_sad(
+      ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
+  best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
+
+  visited[grid_center] = 1;
+
+  for (int iter = 0; iter < SEARCH_RANGE_8P; ++iter) {
+    int best_site = -1;
+
+    for (int j = 0; j < 8; ++j) {
+      const int grid_coord = grid_center + neighbors[j].coord_offset;
+      if (visited[grid_coord] == 1) continue;
+      // Mark the position as visited whether or not it is in range.
+      visited[grid_coord] = 1;
+
+      const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
+                              best_mv->col + neighbors[j].coord.col };
+      if (!av1_is_fullmv_in_range(mv_limits, mv)) continue;
+
+      unsigned int sad = get_mvpred_compound_sad(
+          ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+      // Only price the mv if the raw sad alone already beats the best.
+      if (sad >= best_sad) continue;
+
+      sad += mvsad_err_cost_(&mv, mv_cost_params);
+      if (sad < best_sad) {
+        best_sad = sad;
+        best_site = j;
+      }
+    }
+
+    // No neighbor improved on the current best: the search has converged.
+    if (best_site == -1) break;
+
+    best_mv->row += neighbors[best_site].coord.row;
+    best_mv->col += neighbors[best_site].coord.col;
+    grid_center += neighbors[best_site].coord_offset;
+  }
+  return best_sad;
+}
+
+// Top-level full-pel motion search.
+//
+// Runs the pattern search selected by ms_params->search_method starting from
+// start_mv with initial step step_param, optionally follows it with an
+// exhaustive mesh search, and returns the resulting cost `var`.
+//
+// Outputs:
+//   best_mv         best full-pel MV found (marked invalid on entry).
+//   best_mv_stats   stats written by the search routine that produced best_mv.
+//   second_best_mv  optional; marked invalid on entry and forwarded to the
+//                   diamond and exhaustive searches.
+//   cost_list       optional; entries [0..4] are reset to INT_MAX on entry and
+//                   then filled by the search routines.
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const BLOCK_SIZE bsize = ms_params->bsize;
+ const SEARCH_METHODS search_method = ms_params->search_method;
+
+ const int is_intra_mode = ms_params->is_intra_mode;
+ int run_mesh_search = ms_params->run_mesh_search;
+
+ int var = 0;
+ MARK_MV_INVALID(best_mv);
+ if (second_best_mv) {
+ MARK_MV_INVALID(second_best_mv);
+ }
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
+ assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width);
+
+ // Dispatch to the pattern search matching the requested method. Only the
+ // diamond family receives second_best_mv.
+ switch (search_method) {
+ case FAST_BIGDIA:
+ var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case VFAST_DIAMOND:
+ var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_DIAMOND:
+ var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case HEX:
+ var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
+ best_mv_stats);
+ break;
+ case SQUARE:
+ var = square_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case BIGDIA:
+ var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case NSTEP:
+ case NSTEP_8PT:
+ case DIAMOND:
+ case CLAMPED_DIAMOND:
+ var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+ best_mv, best_mv_stats, second_best_mv);
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+
+ // Should we allow a follow on exhaustive search?
+ if (!run_mesh_search &&
+ ((search_method == NSTEP) || (search_method == NSTEP_8PT)) &&
+ !ms_params->ms_buffers.second_pred) {
+ int exhaustive_thr = ms_params->force_mesh_thresh;
+ // Scale the threshold down by the block-size-dependent shift below.
+ exhaustive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhaustive_thr) run_mesh_search = 1;
+ }
+
+ // TODO(yunqing): the following is used to reduce mesh search in temporal
+ // filtering. Can extend it to intrabc.
+ if (!is_intra_mode && ms_params->prune_mesh_search) {
+ // Largest per-component distance between the start MV and the result.
+ const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+ abs(start_mv.col - best_mv->col));
+ if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) {
+ run_mesh_search = 0;
+ }
+ }
+
+ if (ms_params->sdf != ms_params->vfp->sdf) {
+ // If we are skipping rows when we perform the motion search, we need to
+ // check the quality of skipping. If it's bad, then we run mesh search with
+ // skip row features off.
+ // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+ // offset of 1 before we hit this statement to avoid having to redo
+ // motion search.
+ const struct buf_2d *src = ms_params->ms_buffers.src;
+ const struct buf_2d *ref = ms_params->ms_buffers.ref;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *src_address = src->buf;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+ const int sad =
+ ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+ const int skip_sad =
+ ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+ // We will keep the result of skipping rows if it's good enough. Here, good
+ // enough means the error is less than 1 per pixel.
+ const int kSADThresh =
+ 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ // The copy below swaps in the vfp SAD functions, so the recursive call
+ // does not take this branch again.
+ FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+ new_ms_params.sdf = new_ms_params.vfp->sdf;
+ new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+ new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
+
+ return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+ cost_list, best_mv, best_mv_stats,
+ second_best_mv);
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ FULLPEL_MV tmp_mv_ex;
+ FULLPEL_MV_STATS tmp_mv_stats;
+ // Pick the mesh pattern for exhaustive search based on the toolset (intraBC
+ // or non-intraBC)
+ // TODO(chiyotsai@google.com): There is a bug here where the second best mv
+ // gets overwritten without actually comparing the rdcost.
+ const MESH_PATTERN *const mesh_patterns =
+ ms_params->mesh_patterns[is_intra_mode];
+ // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+ // full_pixel_exhaustive, which can incorrectly override it.
+ var_ex =
+ full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
+ &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
+ // Adopt the exhaustive result only when it is strictly better.
+ if (var_ex < var) {
+ var = var_ex;
+ *best_mv_stats = tmp_mv_stats;
+ *best_mv = tmp_mv_ex;
+ }
+ }
+
+ return var;
+}
+
+// Hash-based full-pel search for intra block copy.
+//
+// Hashes the source block, walks every hash-table entry sharing the first
+// hash value, and for entries whose second hash also matches evaluates the
+// displacement to that entry. The cheapest valid, in-range displacement is
+// stored in *best_mv and its cost is returned.
+//
+// Returns INT_MAX (leaving *best_mv untouched) when hash ME is disabled, the
+// block is not square, the table holds at most one entry for the hash, or no
+// candidate passes the validity checks.
+int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                            const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                            IntraBCHashInfo *intrabc_hash_info,
+                            FULLPEL_MV *best_mv) {
+  if (!av1_use_hash_me(cpi)) return INT_MAX;
+
+  const BLOCK_SIZE bsize = ms_params->bsize;
+  const int blk_w = block_size_wide[bsize];
+  const int blk_h = block_size_high[bsize];
+
+  // Only square blocks are looked up.
+  if (blk_w != blk_h) return INT_MAX;
+
+  const FullMvLimits *limits = &ms_params->mv_limits;
+  const struct buf_2d *src_buf = ms_params->ms_buffers.src;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  // Pixel position of the current block.
+  const int cur_x = mi_col * MI_SIZE;
+  const int cur_y = mi_row * MI_SIZE;
+
+  hash_table *table = &intrabc_hash_info->intrabc_hash_table;
+
+  uint32_t key1, key2;
+  av1_get_block_hash_value(intrabc_hash_info, src_buf->buf, src_buf->stride,
+                           blk_w, &key1, &key2, is_cur_buf_hbd(xd));
+
+  const int num_entries = av1_hash_table_count(table, key1);
+  if (num_entries <= 1) return INT_MAX;
+
+  int lowest_cost = INT_MAX;
+  Iterator it = av1_hash_get_first_iterator(table, key1);
+  for (int n = 0; n < num_entries; n++, aom_iterator_increment(&it)) {
+    block_hash entry = *(block_hash *)(aom_iterator_get(&it));
+    if (key2 != entry.hash_value2) continue;
+
+    // Make sure the prediction is from valid area.
+    const MV dv = { GET_MV_SUBPEL(entry.y - cur_y),
+                    GET_MV_SUBPEL(entry.x - cur_x) };
+    if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
+                         cpi->common.seq_params->mib_size_log2))
+      continue;
+
+    FULLPEL_MV cand;
+    cand.col = entry.x - cur_x;
+    cand.row = entry.y - cur_y;
+    if (!av1_is_fullmv_in_range(limits, cand)) continue;
+
+    FULLPEL_MV_STATS unused_stats;
+    const int cand_cost = get_mvpred_var_cost(ms_params, &cand, &unused_stats);
+    if (cand_cost < lowest_cost) {
+      lowest_cost = cand_cost;
+      *best_mv = cand;
+    }
+  }
+
+  return lowest_cost;
+}
+
+// Refinement step for vector_match(): evaluates the two positions
+// `offset - step` and `offset + step` in `ref` and returns whichever of
+// {offset, offset - step, offset + step} has the lowest cost. Positions outside
+// [0, bw] are skipped. *best_sad is the running minimum; it is updated when a
+// tested position beats it.
+static int vector_match_refine_pair(int16_t *ref, int16_t *src, int bwl,
+                                    int bw, int offset, int step,
+                                    int *best_sad) {
+  int center = offset;
+  for (int d = -step; d <= step; d += 2 * step) {
+    const int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    const int this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < *best_sad) {
+      *best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  return center;
+}
+
+// Finds the displacement at which the 1-D vector `src` best matches `ref`.
+//
+//   ref          reference vector; positions 0 .. 2 * search_size are tested.
+//   src          source vector compared at each position.
+//   bwl          forwarded unchanged to aom_vector_var().
+//   search_size  search radius; the window is [0, 2 * search_size].
+//   full_search  non-zero: test every position in the window. Zero: test every
+//                16th position, then refine around the winner with steps of
+//                8, 4, 2 and 1.
+//   sad          receives the cost of the best position.
+//
+// Returns the best position relative to the window centre, i.e. a value in
+// [-search_size, search_size].
+static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
+                        int full_search, int *sad) {
+  const int bw = search_size << 1;
+  // Stride of the initial scan over the window.
+  const int coarse_step = full_search ? 1 : 16;
+  int best_sad = INT_MAX;
+  int center = 0;
+
+  for (int d = 0; d <= bw; d += coarse_step) {
+    const int this_sad = aom_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = d;
+    }
+  }
+
+  if (!full_search) {
+    // Halve the step each round, recentring on the best position so far.
+    for (int step = 8; step >= 1; step >>= 1) {
+      center = vector_match_refine_pair(ref, src, bwl, bw, center, step,
+                                        &best_sad);
+    }
+  }
+
+  *sad = best_sad;
+  return (center - (bw >> 1));
+}
+
+// A special fast version of motion search used in rt mode.
+// The search window along columns and row is given by:
+// +/- me_search_size_col/row.
+//
+// The horizontal and vertical MV components are estimated independently by
+// matching 1-D reference sets (built with aom_int_pro_row / aom_int_pro_col)
+// of the source block against those of the reference search area. The result
+// is compared against the zero MV, optionally refined with a 4-neighbour and
+// one diagonal check, clamped to the MV search range around ref_mv, and
+// stored (converted by convert_fullmv_to_mv) in xd->mi[0]->mv[0].
+//
+// When xd->bd != 8 the search is skipped: the MV is set to zero and the
+// zero-MV SAD is returned.
+//
+//   y_sad_zero  receives the SAD of the zero MV on every return path.
+//
+// Returns the SAD of the selected MV.
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, const MV *ref_mv,
+                                           unsigned int *y_sad_zero,
+                                           int me_search_size_col,
+                                           int me_search_size_row) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int idx;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+  const int full_search = is_screen;
+  const bool screen_scroll_superblock =
+      is_screen && bsize == cm->seq_params->sb_size;
+  // Keep border a multiple of 16.
+  const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+  int search_size_width = me_search_size_col;
+  int search_size_height = me_search_size_row;
+  // Adjust based on boundary.
+  if (((mi_col << 2) - search_size_width < -border) ||
+      ((mi_col << 2) + search_size_width > cm->width + border))
+    search_size_width = border;
+  if (((mi_row << 2) - search_size_height < -border) ||
+      ((mi_row << 2) + search_size_height > cm->height + border))
+    search_size_height = border;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  int_mv *best_int_mv = &xd->mi[0]->mv[0];
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  int best_sad_col, best_sad_row;
+  const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+  const int col_norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+  static const MV search_pos[4] = {
+    { -1, 0 },
+    { 0, -1 },
+    { 0, 1 },
+    { 1, 0 },
+  };
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+                         MAX_MB_PLANE);
+  }
+
+  if (xd->bd != 8) {
+    best_int_mv->as_fullmv = kZeroFullMv;
+    best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                           xd->plane[0].pre[0].buf, ref_stride);
+    // The selected MV is the zero MV, so its SAD is also the zero-MV SAD.
+    // Report it so the caller never reads an unset *y_sad_zero.
+    *y_sad_zero = best_sad;
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return best_sad;
+  }
+  const int width_ref_buf = (search_size_width << 1) + bw;
+  const int height_ref_buf = (search_size_height << 1) + bh;
+  int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
+  int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
+  int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+  int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+  if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+    // Release whatever was allocated before reporting the failure.
+    aom_free(hbuf);
+    aom_free(vbuf);
+    aom_free(src_hbuf);
+    aom_free(src_vbuf);
+    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+  }
+
+  // Set up prediction 1-D reference set for rows.
+  ref_buf = xd->plane[0].pre[0].buf - search_size_width;
+  aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
+                  row_norm_factor);
+
+  // Set up prediction 1-D reference set for cols
+  ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
+  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
+                  col_norm_factor);
+
+  // Set up src 1-D reference set
+  src_buf = x->plane[0].src.buf;
+  aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+  aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
+
+  // Find the best match per 1-D search
+  best_int_mv->as_fullmv.col =
+      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
+                   full_search, &best_sad_col);
+  best_int_mv->as_fullmv.row =
+      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
+                   full_search, &best_sad_row);
+
+  // For screen: select between horiz or vert motion.
+  if (is_screen) {
+    if (best_sad_col < best_sad_row)
+      best_int_mv->as_fullmv.row = 0;
+    else
+      best_int_mv->as_fullmv.col = 0;
+  }
+
+  FULLPEL_MV this_mv = best_int_mv->as_fullmv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+  best_sad =
+      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  // Evaluate zero MV if found MV is non-zero.
+  if (best_int_mv->as_int != 0) {
+    tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                          xd->plane[0].pre[0].buf, ref_stride);
+    *y_sad_zero = tmp_sad;
+    if (tmp_sad < best_sad) {
+      best_int_mv->as_fullmv = kZeroFullMv;
+      this_mv = best_int_mv->as_fullmv;
+      ref_buf = xd->plane[0].pre[0].buf;
+      best_sad = tmp_sad;
+    }
+  } else {
+    *y_sad_zero = best_sad;
+  }
+
+  if (!screen_scroll_superblock) {
+    // Up, left, right, down neighbours, in the same order as search_pos[].
+    const uint8_t *const pos[4] = {
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
+    };
+
+    cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
+                                   this_sad);
+
+    for (idx = 0; idx < 4; ++idx) {
+      if (this_sad[idx] < best_sad) {
+        best_sad = this_sad[idx];
+        best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+        best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+      }
+    }
+
+    // Step one pel toward the cheaper vertical and horizontal neighbour to
+    // form a single diagonal candidate.
+    if (this_sad[0] < this_sad[3])
+      this_mv.row -= 1;
+    else
+      this_mv.row += 1;
+
+    if (this_sad[1] < this_sad[2])
+      this_mv.col -= 1;
+    else
+      this_mv.col += 1;
+
+    ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+
+    tmp_sad =
+        cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+    if (best_sad > tmp_sad) {
+      best_int_mv->as_fullmv = this_mv;
+      best_sad = tmp_sad;
+    }
+  }
+
+  FullMvLimits mv_limits = x->mv_limits;
+  av1_set_mv_search_range(&mv_limits, ref_mv);
+  clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits);
+
+  convert_fullmv_to_mv(best_int_mv);
+
+  if (scaled_ref_frame) {
+    int i;
+    // Restore the reference planes swapped out above.
+    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  aom_free(hbuf);
+  aom_free(vbuf);
+  aom_free(src_hbuf);
+  aom_free(src_vbuf);
+  return best_sad;
+}
+
+// =============================================================================
+// Fullpixel Motion Search: OBMC
+// =============================================================================
+// Returns the OBMC cost of this_mv: the value of vfp->ovf() on the reference
+// block addressed by this_mv (against the weighted source and OBMC mask) plus
+// the MV cost from mv_err_cost_(). The SSE reported by ovf() is discarded.
+static INLINE int get_obmc_mvpred_var(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+  const MSBuffers *bufs = &ms_params->ms_buffers;
+  const struct buf_2d *ref = bufs->ref;
+  // mv_err_cost_() takes an MV, so convert the full-pel vector first.
+  const MV subpel_mv = get_mv_from_fullmv(this_mv);
+  unsigned int sse_unused;
+
+  return ms_params->vfp->ovf(get_buf_from_fullmv(ref, this_mv), ref->stride,
+                             bufs->wsrc, bufs->obmc_mask, &sse_unused) +
+         mv_err_cost_(&subpel_mv, &ms_params->mv_cost_params);
+}
+
+// 4-neighbour refinement of *best_mv using the OBMC SAD (vfp->osdf).
+//
+// Starting from the MV already stored in *best_mv, tests the up, left, right
+// and down neighbours for at most eight rounds, moving to the cheapest
+// improving neighbour each round and stopping early when none improves.
+// *best_mv is updated in place; the best cost (SAD plus MV cost) is returned.
+static int obmc_refining_search_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const MV_COST_PARAMS *cost_params = &ms_params->mv_cost_params;
+  const MSBuffers *bufs = &ms_params->ms_buffers;
+  const int32_t *weighted_src = bufs->wsrc;
+  const int32_t *obmc_mask = bufs->obmc_mask;
+  const struct buf_2d *ref = bufs->ref;
+  const FULLPEL_MV kSteps[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+  const int kMaxRounds = 8;
+
+  // Cost of the starting position.
+  unsigned int lowest = vfp->osdf(get_buf_from_fullmv(ref, best_mv),
+                                  ref->stride, weighted_src, obmc_mask) +
+                        mvsad_err_cost_(best_mv, cost_params);
+
+  for (int round = 0; round < kMaxRounds; round++) {
+    int winner = -1;
+
+    for (int k = 0; k < 4; k++) {
+      const FULLPEL_MV cand = { best_mv->row + kSteps[k].row,
+                                best_mv->col + kSteps[k].col };
+      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, cand)) continue;
+
+      unsigned int cand_sad = vfp->osdf(get_buf_from_fullmv(ref, &cand),
+                                        ref->stride, weighted_src, obmc_mask);
+      // Add the MV cost only if the raw SAD is already competitive.
+      if (cand_sad >= lowest) continue;
+
+      cand_sad += mvsad_err_cost_(&cand, cost_params);
+      if (cand_sad < lowest) {
+        lowest = cand_sad;
+        winner = k;
+      }
+    }
+
+    if (winner == -1) break;
+
+    best_mv->row += kSteps[winner].row;
+    best_mv->col += kSteps[winner].col;
+  }
+  return lowest;
+}
+
+// Multi-step pattern search using the OBMC SAD (vfp->osdf).
+//
+//   start_mv     starting MV; clamped to ms_params->mv_limits before use.
+//   best_mv      receives the best MV found.
+//   search_step  number of the largest steps to skip; the search runs
+//                cfg->num_search_steps - search_step steps, largest first.
+//   num00        receives the number of steps during which the best position
+//                was still the starting position and no site improved on it.
+//
+// Returns the best cost (SAD plus MV cost).
+static int obmc_diamond_search_sad(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
+    FULLPEL_MV *best_mv, int search_step, int *num00) {
+  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+  const search_site_config *sites_cfg = ms_params->search_sites;
+  const MV_COST_PARAMS *cost_params = &ms_params->mv_cost_params;
+  const MSBuffers *bufs = &ms_params->ms_buffers;
+  const int32_t *weighted_src = bufs->wsrc;
+  const int32_t *obmc_mask = bufs->obmc_mask;
+  const struct buf_2d *const ref = bufs->ref;
+
+  // search_step determines the length of the initial step and hence the number
+  // of iterations.
+  const int num_steps = sites_cfg->num_search_steps - search_step;
+
+  clamp_fullmv(&start_mv, &ms_params->mv_limits);
+  const uint8_t *const origin = get_buf_from_fullmv(ref, &start_mv);
+  const uint8_t *cur_addr = origin;
+  *num00 = 0;
+  *best_mv = start_mv;
+
+  // Check the starting position
+  int lowest = vfp->osdf(cur_addr, ref->stride, weighted_src, obmc_mask) +
+               mvsad_err_cost_(best_mv, cost_params);
+
+  for (int s = num_steps - 1; s >= 0; --s) {
+    const search_site *const step_sites = sites_cfg->site[s];
+    const int num_sites = sites_cfg->searches_per_step[s];
+    int winner = 0;
+
+    // Index 0 is never evaluated here; candidates start at index 1.
+    for (int k = 1; k <= num_sites; ++k) {
+      const FULLPEL_MV cand = { best_mv->row + step_sites[k].mv.row,
+                                best_mv->col + step_sites[k].mv.col };
+      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, cand)) continue;
+
+      int cand_sad = vfp->osdf(cur_addr + step_sites[k].offset, ref->stride,
+                               weighted_src, obmc_mask);
+      if (cand_sad >= lowest) continue;
+
+      cand_sad += mvsad_err_cost_(&cand, cost_params);
+      if (cand_sad < lowest) {
+        lowest = cand_sad;
+        winner = k;
+      }
+    }
+
+    if (winner != 0) {
+      best_mv->row += step_sites[winner].mv.row;
+      best_mv->col += step_sites[winner].mv.col;
+      cur_addr += step_sites[winner].offset;
+    } else if (cur_addr == origin) {
+      // Nothing has beaten the starting position so far.
+      (*num00)++;
+    }
+  }
+  return lowest;
+}
+
+// Runs obmc_diamond_search_sad() from start_mv with the initial step
+// step_param, then repeats it with successively later starting steps, keeping
+// the MV with the lowest get_obmc_mvpred_var() cost in *best_mv.
+//
+// The num00 value reported by a search is used to skip that many of the
+// following repeats. Returns the best cost found.
+static int obmc_full_pixel_diamond(
+    const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
+    int step_param, FULLPEL_MV *best_mv) {
+  const search_site_config *sites_cfg = ms_params->search_sites;
+  FULLPEL_MV cand_mv;
+  // The first search stores its num00 count directly in steps_done.
+  int steps_done;
+  int to_skip = 0;
+
+  int lowest = obmc_diamond_search_sad(ms_params, start_mv, &cand_mv,
+                                       step_param, &steps_done);
+  if (lowest < INT_MAX) lowest = get_obmc_mvpred_var(ms_params, &cand_mv);
+  *best_mv = cand_mv;
+
+  // Number of additional, smaller starting steps that may still be tried.
+  const int further_steps = sites_cfg->num_search_steps - 1 - step_param;
+
+  while (steps_done < further_steps) {
+    ++steps_done;
+
+    if (to_skip) {
+      to_skip--;
+      continue;
+    }
+
+    int cand_cost = obmc_diamond_search_sad(
+        ms_params, start_mv, &cand_mv, step_param + steps_done, &to_skip);
+    if (cand_cost < INT_MAX) {
+      cand_cost = get_obmc_mvpred_var(ms_params, &cand_mv);
+    }
+
+    if (cand_cost < lowest) {
+      lowest = cand_cost;
+      *best_mv = cand_mv;
+    }
+  }
+
+  return lowest;
+}
+
+// Full-pel OBMC motion search entry point.
+//
+// With ms_params->fast_obmc_search set, start_mv is clamped to the MV limits
+// and refined by obmc_refining_search_sad() only. Otherwise the multi-step
+// search in obmc_full_pixel_diamond() is used. The best MV is written to
+// *best_mv and its cost is returned.
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+                               const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                               const int step_param, FULLPEL_MV *best_mv) {
+  if (ms_params->fast_obmc_search) {
+    *best_mv = start_mv;
+    clamp_fullmv(best_mv, &ms_params->mv_limits);
+
+    int sme = obmc_refining_search_sad(ms_params, best_mv);
+    // Replace the SAD-based cost by the variance-based one when valid.
+    if (sme < INT_MAX) sme = get_obmc_mvpred_var(ms_params, best_mv);
+    return sme;
+  }
+
+  return obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
+}
+
+// =============================================================================
+// Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty of reads that cross a cache line, preload the reference
+ * area into a small buffer that is aligned so that no read from it crosses a
+ * cache line. This reduces the CPU cycles spent on reading ref data in
+ * sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// Extracts the subpel offset (the low three bits of an MV coordinate) used by
+// the various subpel variance functions [m]sv[a]f.
+static INLINE int get_subpel_part(int x) {
+  const int subpel_mask = 7;
+  return x & subpel_mask;
+}
+
+// Returns the address in buf of the full-pel sample addressed by the subpel
+// MV mv. The row and column are rounded toward -infinity by dropping their
+// three fractional bits.
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV mv) {
+  const int full_row = mv.row >> 3;
+  const int full_col = mv.col >> 3;
+  const int offset = full_row * buf->stride + full_col;
+  return &buf->buf[offset];
+}
+
+// Estimates the variance of the prediction residue at this_mv for fast search
+// (per the original note, the subpel variance functions use a bilinear
+// filter). Dispatches on the compound configuration in var_params:
+//   - no second predictor:             vfp->svf
+//   - second predictor with a mask:    vfp->msvf
+//   - second predictor without a mask: vfp->svaf
+// The SSE is written to *sse and the variance is returned.
+static INLINE int estimated_pref_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    unsigned int *sse) {
+  const aom_variance_fn_ptr_t *fns = var_params->vfp;
+  const MSBuffers *bufs = &var_params->ms_buffers;
+
+  const uint8_t *src_px = bufs->src->buf;
+  const int src_stride = bufs->src->stride;
+  const uint8_t *ref_px = get_buf_from_mv(bufs->ref, *this_mv);
+  const int ref_stride = bufs->ref->stride;
+
+  // Fractional MV parts select the interpolation phase.
+  const int frac_x = get_subpel_part(this_mv->col);
+  const int frac_y = get_subpel_part(this_mv->row);
+
+  const uint8_t *second_pred = bufs->second_pred;
+  if (second_pred == NULL) {
+    return fns->svf(ref_px, ref_stride, frac_x, frac_y, src_px, src_stride,
+                    sse);
+  }
+
+  if (bufs->mask) {
+    return fns->msvf(ref_px, ref_stride, frac_x, frac_y, src_px, src_stride,
+                     second_pred, bufs->mask, bufs->mask_stride, bufs->inv_mask,
+                     sse);
+  }
+
+  return fns->svaf(ref_px, ref_stride, frac_x, frac_y, src_px, src_stride, sse,
+                   second_pred);
+}
+
+// Calculates the variance of the prediction residue at this_mv.
+//
+// Builds the prediction with the aom_*upsampled_pred() routine matching the
+// compound configuration in var_params (single reference, averaged with a
+// second predictor, or masked blend with a second predictor), then measures
+// it against the source with vfp->vf. High-bitdepth buffers use the
+// aom_highbd_* variants when CONFIG_AV1_HIGHBITDEPTH is enabled.
+//
+// The SSE is written to *sse and the variance is returned.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                const MV *this_mv,
+                                const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const uint8_t *src = ms_buffers->src->buf;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int src_stride = ms_buffers->src->stride;
+  const int ref_stride = ms_buffers->ref->stride;
+  const uint8_t *second_pred = ms_buffers->second_pred;
+  const uint8_t *mask = ms_buffers->mask;
+  const int mask_stride = ms_buffers->mask_stride;
+  const int invert_mask = ms_buffers->inv_mask;
+  const int w = var_params->w;
+  const int h = var_params->h;
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  unsigned int besterr;
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+    if (second_pred != NULL) {
+      if (mask) {
+        aom_highbd_comp_mask_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+            invert_mask, xd->bd, subpel_search_type);
+      } else {
+        aom_highbd_comp_avg_upsampled_pred(
+            xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+            subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+            subpel_search_type);
+      }
+    } else {
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                                xd->bd, subpel_search_type);
+    }
+    besterr = vfp->vf(pred8, w, src, src_stride, sse);
+    return besterr;
+  }
+#endif
+
+  // Low-bitdepth path, shared by builds with and without
+  // CONFIG_AV1_HIGHBITDEPTH.
+  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+  if (second_pred != NULL) {
+    if (mask) {
+      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                   second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                   ref, ref_stride, mask, mask_stride,
+                                   invert_mask, subpel_search_type);
+    } else {
+      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+                                  second_pred, w, h, subpel_x_q3, subpel_y_q3,
+                                  ref, ref_stride, subpel_search_type);
+    }
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+  }
+
+  besterr = vfp->vf(pred, w, src, src_stride, sse);
+  return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv, taking both the
+// prediction error and the MV cost into account. It is suffixed "fast"
+// because, unless is_scaled is set, the prediction error comes from
+// estimated_pref_error() rather than upsampled_pref_error().
+//
+// If this_mv is inside mv_limits and its total cost is below *besterr, then
+// *besterr, *best_mv, *distortion and *sse1 are updated and bit 0 of
+// *has_better_mv is set. Returns the total cost of this_mv, or INT_MAX when
+// it is out of range.
+static INLINE unsigned int check_better_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
+  if (!av1_is_subpelmv_in_range(mv_limits, *this_mv)) return INT_MAX;
+
+  unsigned int cand_sse;
+  int pred_err;
+  if (is_scaled) {
+    pred_err = upsampled_pref_error(xd, cm, this_mv, var_params, &cand_sse);
+  } else {
+    pred_err = estimated_pref_error(this_mv, var_params, &cand_sse);
+  }
+
+  unsigned int total = mv_err_cost_(this_mv, mv_cost_params);
+  total += pred_err;
+
+  if (total < *besterr) {
+    *besterr = total;
+    *best_mv = *this_mv;
+    *distortion = pred_err;
+    *sse1 = cand_sse;
+    *has_better_mv |= 1;
+  }
+  return total;
+}
+
+// Checks whether this_mv is better than best_mv, taking both the prediction
+// error (from upsampled_pref_error()) and the MV cost into account.
+//
+// If this_mv is inside mv_limits and its total cost is below *besterr, then
+// *besterr, *best_mv, *distortion and *sse1 are updated and bit 0 of
+// *is_better is set. Returns the total cost of this_mv, or INT_MAX when it is
+// out of range.
+static AOM_FORCE_INLINE unsigned int check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *is_better) {
+  if (!av1_is_subpelmv_in_range(mv_limits, *this_mv)) return INT_MAX;
+
+  unsigned int cand_sse;
+  const int pred_err =
+      upsampled_pref_error(xd, cm, this_mv, var_params, &cand_sse);
+
+  unsigned int total = mv_err_cost_(this_mv, mv_cost_params);
+  total += pred_err;
+
+  if (total < *besterr) {
+    *besterr = total;
+    *best_mv = *this_mv;
+    *distortion = pred_err;
+    *sse1 = cand_sse;
+    *is_better |= 1;
+  }
+  return total;
+}
+
+// Picks the diagonal step pointing toward the cheaper side on each axis.
+// The row component is -step_size when up_cost <= down_cost, else +step_size;
+// the column component is -step_size when left_cost <= right_cost, else
+// +step_size. Ties go to the negative direction.
+static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
+                                    unsigned int right_cost,
+                                    unsigned int up_cost,
+                                    unsigned int down_cost) {
+  const int row_step = (up_cost <= down_cost) ? -step_size : step_size;
+  const int col_step = (left_cost <= right_cost) ? -step_size : step_size;
+  const MV step = { row_step, col_step };
+
+  return step;
+}
+
+// Searches the four cardinal directions at distance hstep from this_mv for a
+// better mv, then checks the single diagonal in the best quadrant. Every
+// candidate goes through check_better_fast(), which updates *best_mv,
+// *besterr, *sse1 and *distortion whenever a candidate wins.
+//
+// Returns the diagonal step that was chosen, so a follow-up search can
+// continue in the same quadrant.
+static AOM_FORCE_INLINE MV first_level_check_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int is_scaled) {
+  // Candidates in evaluation order: left, right, up, down.
+  const MV cardinal[4] = {
+    { this_mv.row, this_mv.col - hstep },
+    { this_mv.row, this_mv.col + hstep },
+    { this_mv.row - hstep, this_mv.col },
+    { this_mv.row + hstep, this_mv.col },
+  };
+  unsigned int cost[4];
+  int unused = 0;
+
+  for (int k = 0; k < 4; ++k) {
+    cost[k] = check_better_fast(xd, cm, &cardinal[k], best_mv, mv_limits,
+                                var_params, mv_cost_params, besterr, sse1,
+                                distortion, &unused, is_scaled);
+  }
+
+  // Arguments are (left, right, up, down) costs.
+  const MV diag_step =
+      get_best_diag_step(hstep, cost[0], cost[1], cost[2], cost[3]);
+  const MV diag_mv = { this_mv.row + diag_step.row,
+                       this_mv.col + diag_step.col };
+
+  // Check the diagonal direction with the best mv
+  check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                    mv_cost_params, besterr, sse1, distortion, &unused,
+                    is_scaled);
+
+  return diag_step;
+}
+
+// Performs a follow-up search after first_level_check_fast is called. This
+// performs extra chess pattern searches in the best quadrant.
+//
+// this_mv is the center that first_level_check_fast searched around and
+// diag_step is the step it returned. The candidates depend on how *best_mv
+// (sampled once on entry as br/bc) differs from this_mv:
+//  - row and col both differ: two candidates, one a further diag_step.col
+//    along the row of best_mv and one a further diag_step.row along its column;
+//  - only col differs: two candidates a further diag_step.col away at rows
+//    br + hstep and br - hstep, then one candidate at row br - diag_step.row;
+//  - only row differs: the same pattern with rows and columns swapped;
+//  - best_mv equals this_mv: nothing is searched.
+static AOM_FORCE_INLINE void second_level_check_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+ MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(diag_step.row == hstep || diag_step.row == -hstep);
+ assert(diag_step.col == hstep || diag_step.col == -hstep);
+ const int tr = this_mv.row;
+ const int tc = this_mv.col;
+ // Snapshot of the best mv on entry; all candidates below are built from it.
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+ // Receives check_better_fast()'s last output argument; never read here.
+ int dummy = 0;
+ if (tr != br && tc != bc) {
+ // The best mv is the diagonal candidate of the first level.
+ assert(diag_step.col == bc - tc);
+ assert(diag_step.row == br - tr);
+ const MV chess_mv_1 = { br, bc + diag_step.col };
+ const MV chess_mv_2 = { br + diag_step.row, bc };
+ check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr == br && tc != bc) {
+ // The best mv differs from the center in its column only.
+ assert(diag_step.col == bc - tc);
+ // Continue searching in the best direction
+ const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
+ const MV top_long_mv = { br - hstep, bc + diag_step.col };
+ check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br - diag_step.row, bc };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr != br && tc == bc) {
+ // The best mv differs from the center in its row only.
+ assert(diag_step.row == br - tr);
+ // Continue searching in the best direction
+ const MV right_long_mv = { br + diag_step.row, bc + hstep };
+ const MV left_long_mv = { br + diag_step.row, bc - hstep };
+ check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br, bc - diag_step.col };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ }
+}
+
+// Runs first_level_check_fast() around this_mv and, when more than one
+// iteration per step is requested (iters > 1), follows it with
+// second_level_check_fast() using the diagonal step the first level returned.
+static AOM_FORCE_INLINE void two_level_checks_fast(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+    int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int iters, int is_scaled) {
+  const MV quadrant_step = first_level_check_fast(
+      xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
+      besterr, sse1, distortion, is_scaled);
+
+  // A single iteration per step stops after the first level.
+  if (iters <= 1) return;
+
+  second_level_check_fast(xd, cm, this_mv, quadrant_step, best_mv, hstep,
+                          mv_limits, var_params, mv_cost_params, besterr, sse1,
+                          distortion, is_scaled);
+}
+
+// Counterpart of first_level_check_fast that scores candidates with
+// check_better() instead of check_better_fast() and takes no is_scaled flag.
+// Probes the left, right, top and bottom neighbors of this_mv at distance
+// hstep, then the diagonal neighbor picked by get_best_diag_step() from the
+// four values check_better() returned. best_mv, besterr, sse1 and distortion
+// are shared with check_better(). Returns the diagonal step that was probed.
+static AOM_FORCE_INLINE MV
+first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv,
+ MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ // Receives check_better()'s last output argument; never read here.
+ int dummy = 0;
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+ const unsigned int left =
+ check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int right =
+ check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int up =
+ check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int down =
+ check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ // The diagonal is taken relative to this_mv, whatever *best_mv holds now.
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+}
+
+// A newer version of second level check that gives better quality.
+//
+// this_mv is the center of the preceding first-level search and diag_step the
+// step it returned (passed by value, so the sign flips below stay local).
+// Relative to *best_mv, up to three candidates are tried: one offset by
+// diag_step.row, one offset by diag_step.col, and, only if one of those two
+// set has_better_mv, one offset by both. Candidates are scored with
+// check_better() unless var_params->subpel_search_type is USE_2_TAPS_ORIG, in
+// which case check_better_fast() is used. Returns immediately when *best_mv
+// still equals this_mv.
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
+// from av1_find_best_sub_pixel_tree
+static AOM_FORCE_INLINE void second_level_check_v2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+ MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(best_mv->row == this_mv.row + diag_step.row ||
+ best_mv->col == this_mv.col + diag_step.col);
+ if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+ return;
+ } else if (this_mv.row == best_mv->row) {
+ // Search away from diagonal step since diagonal search did not provide any
+ // improvement
+ diag_step.row *= -1;
+ } else if (this_mv.col == best_mv->col) {
+ // Same reasoning for a best mv that only moved in its row: flip the column.
+ diag_step.col *= -1;
+ }
+
+ // All three candidates are built from the best mv as it is at this point.
+ const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+ const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+ const MV diag_bias_mv = { best_mv->row + diag_step.row,
+ best_mv->col + diag_step.col };
+ int has_better_mv = 0;
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ }
+ } else {
+ check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+ check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv, is_scaled);
+ }
+ }
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+//
+// Evaluates vfp->vf() between the source block and the reference block located
+// at *bestmv via get_buf_from_mv(). When ms_buffers->second_pred is non-NULL
+// the reference is first combined with it into a local comp_pred buffer, using
+// the masked predictor if ms_buffers->mask is set and the averaging predictor
+// otherwise. *sse1 is filled in by vfp->vf(), *distortion receives its return
+// value, and the function returns that value plus mv_err_cost_() for *bestmv.
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv);
+ const int src_stride = ms_buffers->src->stride;
+ const int y_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+
+ unsigned int besterr;
+
+ if (second_pred != NULL) {
+ // Compound case: build the combined prediction, then measure it with a
+ // stride of w because comp_pred is written as a packed w x h block.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ // High bit depth path: 16-bit scratch buffer passed around as a byte ptr.
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+ if (mask) {
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
+ mask, mask_stride, invert_mask);
+ } else {
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+#else
+ // Without high bit depth support xd is not consulted at all.
+ (void)xd;
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif
+ } else {
+ // Single prediction: compare the reference block directly.
+ besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
+ }
+ // Report the raw error before the mv cost is added to the return value.
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int upsampled_setup_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+// Integer division of n by d with the quotient rounded to the nearest integer:
+// half of d is added to n when n and d share a sign and subtracted otherwise,
+// before the truncating division.
+static INLINE int divide_and_round(int n, int d) {
+  const int half_d = d / 2;
+  const int signs_differ = (n < 0) != (d < 0);
+
+  if (signs_differ) {
+    return (n - half_d) / d;
+  }
+  return (n + half_d) / d;
+}
+
+// Returns 1 when the center cost, cost_list[0], is strictly smaller than each
+// of the four surrounding costs cost_list[1..4], and 0 otherwise.
+static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
+  for (int i = 1; i <= 4; ++i) {
+    if (cost_list[0] >= cost_list[i]) return 0;
+  }
+  return 1;
+}
+
+// Estimates where the cost surface reaches its minimum, in units of 1/2^bits.
+// The surface is modelled as S = A(x - x0)^2 + B(y - y0)^2 + C. With the costs
+// S0, S1, S2, S3, S4 sampled at
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the minimum (x0, y0) is located at:
+//   x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+//   y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// This is the integer form of those expressions; *ic receives the column (x)
+// offset and *ir the row (y) offset.
+static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
+                                         int bits) {
+  const int s_center = cost_list[0];
+  const int s_left = cost_list[1];
+  const int s_below = cost_list[2];
+  const int s_right = cost_list[3];
+  const int s_above = cost_list[4];
+  // The leading factor 1/2 of the formulas is folded into the scale.
+  const int half_scale = 1 << (bits - 1);
+
+  *ic = divide_and_round((s_left - s_right) * half_scale,
+                         (s_left - 2 * s_center + s_right));
+  *ir = divide_and_round((s_above - s_below) * half_scale,
+                         (s_above - 2 * s_center + s_below));
+}
+
+// Detects whether current_mv was already the search center of iteration iter
+// in the previous search. Returns 1 if last_mv_search_list[iter] holds the same
+// mv. Otherwise records current_mv in that slot and returns 0. A NULL list
+// disables the check and always yields 0.
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+                                               const MV current_mv, int iter) {
+  if (!last_mv_search_list) return 0;
+
+  if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) return 1;
+
+  last_mv_search_list[iter].as_mv = current_mv;
+  return 0;
+}
+
+static AOM_INLINE int setup_center_error_facade(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+ int is_scaled) {
+ if (is_scaled) {
+ return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
+}
+
+// Sub-pixel motion search that starts from start_mv and refines *bestmv in up
+// to three steps (hstep = INIT_SUBPEL_STEP_SIZE, then halved twice).
+//
+// The first step uses the cost-surface minimum estimated from
+// ms_params->cost_list when all five costs are valid and well behaved (a single
+// candidate is then checked); otherwise it falls back to
+// two_level_checks_fast(). The later steps always use two_level_checks_fast().
+// The second step runs when forced_stop < HALF_PEL and the third when allow_hp
+// is set and forced_stop == EIGHTH_PEL.
+//
+// start_mv_stats, when non-NULL and the reference is not scaled, supplies the
+// center error so it does not have to be recomputed. *distortion and *sse1 are
+// updated alongside *bestmv. Returns the best error found, or INT_MAX as soon
+// as check_repeated_mv_and_update() reports that the current center was
+// already searched (last_mv_search_list may be NULL to disable that check).
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    // Reuse the statistics already gathered for the starting mv.
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    // Jump straight to the estimated minimum of the cost surface, if it is not
+    // the center itself.
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+                        mv_cost_params, &besterr, sse1, distortion, &dummy,
+                        is_scaled);
+    }
+  } else {
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Each subsequent iteration re-centers on the best mv found so far and
+  // searches with half the previous step size. This one uses hstep = 2.
+  if (forced_stop < HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Final step, only taken when high precision mvs are allowed.
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  return besterr;
+}
+
+// Sub-pixel motion search that starts from start_mv and refines *bestmv in up
+// to three steps (hstep = INIT_SUBPEL_STEP_SIZE, then halved twice).
+//
+// When all five entries of ms_params->cost_list are valid, the first step only
+// probes one quadrant around start_mv: the horizontal neighbor, the vertical
+// neighbor and the diagonal neighbor on the cheaper side of each axis.
+// Otherwise the first step falls back to two_level_checks_fast(). The later
+// steps always use two_level_checks_fast(). The second step runs when
+// forced_stop < HALF_PEL and the third when allow_hp is set and
+// forced_stop == EIGHTH_PEL.
+//
+// start_mv_stats, when non-NULL and the reference is not scaled, supplies the
+// center error so it does not have to be recomputed. *distortion and *sse1 are
+// updated alongside *bestmv. Returns the best error found, or INT_MAX as soon
+// as check_repeated_mv_and_update() reports that the current center was
+// already searched (last_mv_search_list may be NULL to disable that check).
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    // Reuse the statistics already gathered for the starting mv.
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    besterr =
+        setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+                                  sse1, distortion, is_scaled);
+  }
+
+  // If forced_stop is FULL_PEL, return.
+  if (forced_stop == FULL_PEL) return besterr;
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    // Pick the quadrant from the neighboring costs: cost_list[1] vs
+    // cost_list[3] decides left vs right, cost_list[2] vs cost_list[4] decides
+    // bottom vs top. Ties go to the right and to the top respectively.
+    const int col_step = (cost_list[1] < cost_list[3]) ? -hstep : hstep;
+    const int row_step = (cost_list[2] < cost_list[4]) ? hstep : -hstep;
+
+    const MV horiz_mv = { start_mv.row, start_mv.col + col_step };
+    const MV vert_mv = { start_mv.row + row_step, start_mv.col };
+    const MV diag_mv = { start_mv.row + row_step, start_mv.col + col_step };
+
+    int dummy = 0;
+
+    // Probe order within the quadrant: horizontal, vertical, then diagonal.
+    check_better_fast(xd, cm, &horiz_mv, bestmv, mv_limits, var_params,
+                      mv_cost_params, &besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &vert_mv, bestmv, mv_limits, var_params,
+                      mv_cost_params, &besterr, sse1, distortion, &dummy,
+                      is_scaled);
+    check_better_fast(xd, cm, &diag_mv, bestmv, mv_limits, var_params,
+                      mv_cost_params, &besterr, sse1, distortion, &dummy,
+                      is_scaled);
+  } else {
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Each subsequent iteration re-centers on the best mv found so far and
+  // searches with half the previous step size. This one uses hstep = 2.
+  if (forced_stop < HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  // Final step, only taken when high precision mvs are allowed.
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+                          var_params, mv_cost_params, &besterr, sse1,
+                          distortion, iters_per_step, is_scaled);
+  }
+
+  return besterr;
+}
+
+// Sub-pixel motion search that starts from start_mv and refines *bestmv over
+// `round` steps, halving the step size (initially INIT_SUBPEL_STEP_SIZE) after
+// each one. Every step runs a first-level check around the current best mv
+// and, if that moved the best mv and iters_per_step > 1, a
+// second_level_check_v2(). Candidates are scored with the *_fast helpers when
+// the subpel search type is USE_2_TAPS_ORIG and with the regular ones
+// otherwise.
+//
+// start_mv_stats, when non-NULL and the reference is not scaled, supplies the
+// center error so it does not have to be recomputed. *distortion and *sse1 are
+// updated alongside *bestmv. Returns the best error found, or INT_MAX as soon
+// as check_repeated_mv_and_update() reports that the current center was
+// already searched (last_mv_search_list may be NULL to disable that check).
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 MV start_mv,
+                                 const FULLPEL_MV_STATS *start_mv_stats,
+                                 MV *bestmv, int *distortion,
+                                 unsigned int *sse1,
+                                 int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const SUBPEL_SEARCH_TYPE subpel_search_type =
+      ms_params->var_params.subpel_search_type;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  // How many steps to take. A round of 0 means fullpel search only, 1 means
+  // half-pel, and so on.
+  const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+
+  unsigned int besterr = INT_MAX;
+
+  *bestmv = start_mv;
+
+  const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+                                             ? &cm->sf_identity
+                                             : xd->block_ref_scale_factors[0];
+  const int is_scaled = av1_is_scaled(sf);
+
+  if (start_mv_stats != NULL && !is_scaled) {
+    // Reuse the statistics already gathered for the starting mv.
+    besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+    *distortion = start_mv_stats->distortion;
+    *sse1 = start_mv_stats->sse;
+  } else {
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+                                             mv_cost_params, sse1, distortion);
+    } else {
+      besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                                   distortion);
+    }
+  }
+
+  // No sub-pixel step requested (round is 0): return the center error.
+  if (!round) return besterr;
+
+  for (int iter = 0; iter < round; ++iter) {
+    // Center of this step; *bestmv may move away from it below.
+    MV iter_center_mv = *bestmv;
+    if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
+                                     iter)) {
+      return INT_MAX;
+    }
+
+    MV diag_step;
+    if (subpel_search_type != USE_2_TAPS_ORIG) {
+      diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+                                    mv_limits, var_params, mv_cost_params,
+                                    &besterr, sse1, distortion);
+    } else {
+      diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
+                                         mv_limits, var_params, mv_cost_params,
+                                         &besterr, sse1, distortion, is_scaled);
+    }
+
+    // Check diagonal sub-pixel position
+    if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+      second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+                            mv_limits, var_params, mv_cost_params, &besterr,
+                            sse1, distortion, is_scaled);
+    }
+
+    hstep >>= 1;
+  }
+
+  return besterr;
+}
+
+// Note(yunqingwang): This function and av1_return_min_sub_pixel_mv are only
+// used in the motion vector unit test, which return extreme motion vectors
+// allowed by the MV limits.
+// Returns the maximum MV: *bestmv is set to (row_max, col_max) of
+// ms_params->mv_limits and then passed through lower_mv_precision(). The
+// reported error is always 0; all other arguments are ignored.
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                MV start_mv,
+                                const FULLPEL_MV_STATS *start_mv_stats,
+                                MV *bestmv, int *distortion, unsigned int *sse1,
+                                int_mv *last_mv_search_list) {
+  (void)xd;
+  (void)cm;
+  (void)start_mv;
+  (void)start_mv_stats;
+  (void)sse1;
+  (void)distortion;
+  (void)last_mv_search_list;
+
+  bestmv->row = ms_params->mv_limits.row_max;
+  bestmv->col = ms_params->mv_limits.col_max;
+
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
+  lower_mv_precision(bestmv, ms_params->allow_hp, 0);
+
+  return 0;
+}
+
+// Returns the minimum MV: *bestmv is set to (row_min, col_min) of
+// ms_params->mv_limits and then passed through lower_mv_precision(). The
+// reported error is always 0; all other arguments are ignored.
+int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                MV start_mv,
+                                const FULLPEL_MV_STATS *start_mv_stats,
+                                MV *bestmv, int *distortion, unsigned int *sse1,
+                                int_mv *last_mv_search_list) {
+  (void)xd;
+  (void)cm;
+  (void)start_mv;
+  (void)start_mv_stats;
+  (void)sse1;
+  (void)distortion;
+  (void)last_mv_search_list;
+
+  bestmv->row = ms_params->mv_limits.row_min;
+  bestmv->col = ms_params->mv_limits.col_min;
+
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
+  lower_mv_precision(bestmv, ms_params->allow_hp, 0);
+
+  return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Computes the cost of the current predictor by running the complete
+// av1_enc_build_inter_predictor pipeline for the luma plane and comparing the
+// result in xd->plane[0].dst against the source block with vfp->vf(). This is
+// mainly used by warped mv during motion_mode_rd. The full pipeline is used
+// because the interpolation filter, etc. might have been changed before
+// motion_mode_rd is called. The returned value is the vfp->vf() result plus
+// mv_err_cost_() for *this_mv.
+static INLINE unsigned int compute_motion_cost(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize,
+    const MV *this_mv) {
+  // Build the luma prediction for the block into xd->plane[0].dst.
+  av1_enc_build_inter_predictor(cm, xd, xd->mi_row, xd->mi_col, NULL, bsize,
+                                AOM_PLANE_Y, AOM_PLANE_Y);
+
+  const MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+  const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;
+
+  // The sse output is required by vf() but not used by this function.
+  unsigned int unused_sse;
+  unsigned int cost =
+      vfp->vf(xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+              ms_buffers->src->buf, ms_buffers->src->stride, &unused_sse);
+
+  cost += mv_err_cost_(this_mv, &ms_params->mv_cost_params);
+  return cost;
+}
+
+// Refines MV in a small range
+
+// Macros to build bitmasks which help us avoid redundant computations
+//
+// To explain the idea here, imagine that on the first iteration of the
+// loop below, we step rightwards. Then, on the second iteration, the neighbors
+// to consider are:
+// . . .
+// 0 1 .
+// . . .
+// Where 0 is the initial search point, 1 is the best candidate found in the
+// first iteration, and the dots are the other neighbors of point 1.
+//
+// Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and
+// the seven points marked with dots), and compare them to see where to move
+// next. However, we already evaluated 5 of those 8 neighbors in the last
+// iteration, and decided that they are worse than point 1. So we don't need
+// to re-consider these points. We only really need to consider the three
+// points which are adjacent to point 1 but *not* to point 0.
+//
+// As the algorithm goes on, there are other ways that redundant evaluations
+// can happen, if the search path curls back around on itself.
+//
+// To avoid all possible redundancies, we'd have to build a set containing
+// every point we have already checked, and this would be quite expensive.
+//
+// So instead, we apply a 95%-effective solution with a much lower overhead:
+// we prune out the points which were considered during the previous
+// iteration, but we don't worry about any prior iteration. This can be done
+// as follows:
+//
+// We build a static table, called neighbor_mask, which answers the question
+// "if we moved in direction X last time, which neighbors are new, and which
+// were scanned last iteration?"
+// Then we can query this table to quickly determine which points we need to
+// evaluate, and which we can skip.
+//
+// To query the table, the logic is simply:
+// neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration,
+// do we need to scan neighbor j this iteration?"
+// Bit j of the resulting mask corresponds to neighbor j: left is bit 0, down
+// bit 1, right bit 2, up bit 3 and, for the square pattern, down_left bit 4,
+// down_right bit 5, up_left bit 6 and up_right bit 7. Every parameter is
+// parenthesized so that an expression argument expands as a single operand.
+#define NEIGHBOR_MASK_DIA(left, down, right, up) \
+  ((left) | ((down) << 1) | ((right) << 2) | ((up) << 3))
+
+#define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \
+                          up_left, up_right)                            \
+  ((left) | ((down) << 1) | ((right) << 2) | ((up) << 3) |              \
+   ((down_left) << 4) | ((down_right) << 5) | ((up_left) << 6) |        \
+   ((up_right) << 7))
+
+// Search patterns for av1_refine_warped_mv(), indexed by WARP_SEARCH_METHOD.
+// neighbors[] lists the (row, col) offsets of the candidate points, and
+// neighbor_mask[i] is the bitmask of neighbors that still need scanning when
+// the previous iteration moved to neighbor i. The arguments of the
+// NEIGHBOR_MASK_* macros follow the same order as the entries of neighbors[].
+static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = {
+ // WARP_SEARCH_DIAMOND
+ {
+ .num_neighbors = 4,
+ // Order: left, down, right, up
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ .neighbor_mask = {
+ // If we stepped left last time, consider all points except right
+ NEIGHBOR_MASK_DIA(1, 1, 0, 1),
+ // If we stepped down last time, consider all points except up
+ NEIGHBOR_MASK_DIA(1, 1, 1, 0),
+ // Stepped right last time: consider all points except left
+ NEIGHBOR_MASK_DIA(0, 1, 1, 1),
+ // Stepped up last time: consider all points except down
+ NEIGHBOR_MASK_DIA(1, 0, 1, 1),
+ },
+ },
+ // WARP_SEARCH_SQUARE
+ {
+ .num_neighbors = 8,
+ // Order: left, down, right, up, down+left, down+right, up+left, up+right
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+ { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } },
+ .neighbor_mask = {
+ // If we stepped left last time, then we only need to consider 3 points:
+ // left, down+left, up+left
+ NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0),
+ // If we stepped down last time, then we only need to consider 3 points:
+ // down, down+left, down+right
+ NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0),
+ // Stepped right last time: right, down+right, up+right
+ NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1),
+ // Stepped up last time: up, up+left, up+right
+ NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1),
+
+ // If we stepped down+left last time, then we need to consider 5 points:
+ // left, down, down+left, down+right, up+left
+ NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0),
+ // Stepped down+right last time: down, right, down+left, down+right,
+ // up+right
+ NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1),
+ // Stepped up+left last time: left, up, down+left, up+left, up+right
+ NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1),
+ // Stepped up+right last time: right, up, down+right, up+left, up+right
+ NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1),
+ },
+ },
+};
+
+// Refines the warped-motion mv of the current block, xd->mi[0]->mv[0], in a
+// small range.
+//
+// For up to num_iterations iterations, the neighbors of the current best mv
+// given by search_method (see warp_search_info) are evaluated: the sample
+// lists pts0 / pts_inref0 are copied, re-selected for the candidate mv when
+// total_samples > 1, a warp model is fitted with av1_find_projection(), and
+// the candidate is scored with compute_motion_cost(). The search moves to the
+// best improving neighbor and stops early when no neighbor improves. From the
+// second iteration on, only neighbors not already covered by the previous
+// iteration are scanned.
+//
+// On return the block's mv holds the best mv found and mbmi->wm_params /
+// mbmi->num_proj_ref hold the matching warp model. Returns the best cost.
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                  BLOCK_SIZE bsize, const int *pts0,
+                                  const int *pts_inref0, int total_samples,
+                                  WARP_SEARCH_METHOD search_method,
+                                  int num_iterations) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+
+  const MV *neighbors = warp_search_info[search_method].neighbors;
+  const int num_neighbors = warp_search_info[search_method].num_neighbors;
+  const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask;
+
+  MV *best_mv = &mbmi->mv[0].as_mv;
+
+  // The candidate loop overwrites mbmi->wm_params and mbmi->num_proj_ref, so
+  // the best model seen so far is tracked separately and restored at the end.
+  WarpedMotionParams best_wm_params = mbmi->wm_params;
+  int best_num_proj_ref = mbmi->num_proj_ref;
+  unsigned int bestmse;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+  // Distance between neighboring candidates: 1 when high precision mvs are
+  // allowed, 2 otherwise.
+  const int mv_step = 1 << (ms_params->allow_hp ? 0 : 1);
+
+  // Calculate the center position's error
+  assert(av1_is_subpelmv_in_range(mv_limits, *best_mv));
+  bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv);
+
+  // MV search
+  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  // Each sample occupies two ints; the copies below must fit the local arrays.
+  assert(total_samples >= 0 && total_samples * 2 <= SAMPLES_ARRAY_SIZE);
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // First step always scans all neighbors
+  uint8_t valid_neighbors = UINT8_MAX;
+
+  for (int ite = 0; ite < num_iterations; ++ite) {
+    int best_idx = -1;
+
+    for (int idx = 0; idx < num_neighbors; ++idx) {
+      // Skip neighbors that the previous iteration already evaluated.
+      if ((valid_neighbors & (1 << idx)) == 0) {
+        continue;
+      }
+
+      unsigned int thismse;
+
+      MV this_mv = { best_mv->row + neighbors[idx].row * mv_step,
+                     best_mv->col + neighbors[idx].col * mv_step };
+      if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
+        // Start every candidate from a fresh copy of the caller's sample
+        // lists, since the local arrays are handed to av1_selectSamples().
+        memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+        memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+        if (total_samples > 1) {
+          mbmi->num_proj_ref =
+              av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+        }
+
+        // A zero return from av1_find_projection() means the model is usable.
+        if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                                 this_mv.row, this_mv.col, &mbmi->wm_params,
+                                 mi_row, mi_col)) {
+          thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv);
+
+          if (thismse < bestmse) {
+            best_idx = idx;
+            best_wm_params = mbmi->wm_params;
+            best_num_proj_ref = mbmi->num_proj_ref;
+            bestmse = thismse;
+          }
+        }
+      }
+    }
+
+    // No neighbor improved on the current best mv: the search has converged.
+    if (best_idx == -1) break;
+
+    // Move to the best neighbor and restrict the next scan to the points that
+    // were not neighbors of the previous center.
+    best_mv->row += neighbors[best_idx].row * mv_step;
+    best_mv->col += neighbors[best_idx].col * mv_step;
+    valid_neighbors = neighbor_mask[best_idx];
+  }
+
+  mbmi->wm_params = best_wm_params;
+  mbmi->num_proj_ref = best_num_proj_ref;
+  return bestmse;
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+// =============================================================================
+// Subpixel Motion Search: OBMC
+// =============================================================================
+// Returns the value of vfp->osvf for the reference block addressed by this_mv
+// (sub-pixel offsets taken from the MV) against the weighted OBMC source and
+// mask. The matching sse is written to *sse.
+static INLINE int estimate_obmc_pref_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    unsigned int *sse) {
+  const MSBuffers *bufs = &var_params->ms_buffers;
+  const struct buf_2d *ref_buf = bufs->ref;
+
+  const uint8_t *ref_ptr = get_buf_from_mv(ref_buf, *this_mv);
+  const int ref_stride = ref_buf->stride;
+  const int frac_x = get_subpel_part(this_mv->col);
+  const int frac_y = get_subpel_part(this_mv->row);
+
+  return var_params->vfp->osvf(ref_ptr, ref_stride, frac_x, frac_y, bufs->wsrc,
+                               bufs->obmc_mask, sse);
+}
+
+// Builds the upsampled prediction for this_mv into a local buffer and returns
+// the value of vfp->ovf for it against the weighted OBMC source and mask. The
+// matching sse is written to *sse.
+static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+                                     const MV *this_mv,
+                                     const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+                                     unsigned int *sse) {
+  const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+  const SUBPEL_SEARCH_TYPE filter_type = var_params->subpel_search_type;
+  const int width = var_params->w;
+  const int height = var_params->h;
+
+  const MSBuffers *bufs = &var_params->ms_buffers;
+  const int32_t *wsrc = bufs->wsrc;
+  const int32_t *mask = bufs->obmc_mask;
+  const uint8_t *ref = get_buf_from_mv(bufs->ref, *this_mv);
+  const int ref_stride = bufs->ref->stride;
+
+  const int frac_x = get_subpel_part(this_mv->col);
+  const int frac_y = get_subpel_part(this_mv->row);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  unsigned int err;
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, width,
+                              height, frac_x, frac_y, ref, ref_stride, xd->bd,
+                              filter_type);
+    err = vfp->ovf(pred8, width, wsrc, mask, sse);
+    return err;
+  }
+#endif
+  // Low bitdepth path; it is the only path when high bitdepth is compiled out.
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, width, height,
+                     frac_x, frac_y, ref, ref_stride, filter_type);
+  err = vfp->ovf(pred, width, wsrc, mask, sse);
+  return err;
+}
+
+// Computes the starting error of the OBMC sub-pixel search: the value of
+// vfp->ovf at the reference buffer origin plus the mv cost of this_mv. The
+// ovf value alone is stored in *distortion and its sse in *sse1.
+static unsigned int setup_obmc_center_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+  // get_buf_from_mv(ref, *this_mv).
+  const MSBuffers *bufs = &var_params->ms_buffers;
+  const struct buf_2d *ref_buf = bufs->ref;
+  const uint8_t *ref = ref_buf->buf;
+  const int ref_stride = ref_buf->stride;
+
+  unsigned int err =
+      var_params->vfp->ovf(ref, ref_stride, bufs->wsrc, bufs->obmc_mask, sse1);
+  *distortion = err;
+
+  err += mv_err_cost_(this_mv, mv_cost_params);
+  return err;
+}
+
+// Same as setup_obmc_center_error(), but the prediction error comes from
+// upsampled_obmc_pref_error(). The prediction error alone is stored in
+// *distortion; the return value additionally includes the mv cost.
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  unsigned int err =
+      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+  *distortion = err;
+
+  err += mv_err_cost_(this_mv, mv_cost_params);
+  return err;
+}
+
+// Estimates the cost of coding this_mv relative to mv_cost_params->ref_mv.
+// Only MV_COST_ENTROPY and MV_COST_NONE are handled; any other cost type
+// trips an assert and yields 0.
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+                                       const MV_COST_PARAMS *mv_cost_params) {
+  const MV *ref_mv = mv_cost_params->ref_mv;
+  const int *mvjcost = mv_cost_params->mvjcost;
+  const int *const *mvcost = mv_cost_params->mvcost;
+  const int error_per_bit = mv_cost_params->error_per_bit;
+  const MV_COST_TYPE cost_type = mv_cost_params->mv_cost_type;
+  const MV mv_delta = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+                        GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
+  if (cost_type == MV_COST_ENTROPY) {
+    // Rate scaled by error_per_bit, rounded to nearest at 13 bits.
+    return (unsigned)((mv_cost(&mv_delta, mvjcost,
+                               CONVERT_TO_CONST_MVCOST(mvcost)) *
+                           error_per_bit +
+                       4096) >>
+                      13);
+  }
+
+  if (cost_type != MV_COST_NONE) {
+    assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+  }
+  return 0;
+}
+
+// Evaluates this_mv with the estimated OBMC prediction error plus estimated
+// mv cost. If the total beats *besterr, then *besterr, *best_mv, *distortion
+// and *sse1 are updated and bit 0 of *has_better_mv is set. Returns the total
+// cost, or INT_MAX when this_mv is outside mv_limits.
+static INLINE unsigned int obmc_check_better_fast(
+    const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  if (!av1_is_subpelmv_in_range(mv_limits, *this_mv)) return INT_MAX;
+
+  unsigned int this_sse;
+  const int pred_err = estimate_obmc_pref_error(this_mv, var_params, &this_sse);
+
+  unsigned int total = estimate_obmc_mvcost(this_mv, mv_cost_params);
+  total += pred_err;
+
+  if (total < *besterr) {
+    *besterr = total;
+    *best_mv = *this_mv;
+    *distortion = pred_err;
+    *sse1 = this_sse;
+    *has_better_mv |= 1;
+  }
+  return total;
+}
+
+// Evaluates this_mv with the upsampled OBMC prediction error plus the mv
+// cost. If the total beats *besterr, then *besterr, *best_mv, *distortion and
+// *sse1 are updated and bit 0 of *has_better_mv is set. Returns the total
+// cost, or INT_MAX when this_mv is outside mv_limits.
+static INLINE unsigned int obmc_check_better(
+    MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+    const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion, int *has_better_mv) {
+  if (!av1_is_subpelmv_in_range(mv_limits, *this_mv)) return INT_MAX;
+
+  unsigned int this_sse;
+  const int pred_err =
+      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &this_sse);
+
+  unsigned int total = mv_err_cost_(this_mv, mv_cost_params);
+  total += pred_err;
+
+  if (total < *besterr) {
+    *besterr = total;
+    *best_mv = *this_mv;
+    *distortion = pred_err;
+    *sse1 = this_sse;
+    *has_better_mv |= 1;
+  }
+  return total;
+}
+
+// First level of one OBMC sub-pixel refinement round. Checks the four MVs at
+// distance hstep left, right, above and below this_mv (in that order), then
+// the diagonal chosen by get_best_diag_step() from those four costs. Every
+// check may update *best_mv, *besterr, *sse1 and *distortion. The estimated
+// error is used when subpel_search_type is USE_2_TAPS_ORIG, the upsampled
+// error otherwise. Returns the chosen diagonal step.
+static AOM_FORCE_INLINE MV obmc_first_level_check(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
+    const int hstep, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  const int use_fast = var_params->subpel_search_type == USE_2_TAPS_ORIG;
+  int unused_flag = 0;
+
+  // Candidates in evaluation order: left, right, up, down.
+  const MV cross[4] = {
+    { this_mv.row, this_mv.col - hstep },
+    { this_mv.row, this_mv.col + hstep },
+    { this_mv.row - hstep, this_mv.col },
+    { this_mv.row + hstep, this_mv.col },
+  };
+  unsigned int cost[4];
+
+  for (int i = 0; i < 4; ++i) {
+    if (use_fast) {
+      cost[i] = obmc_check_better_fast(&cross[i], best_mv, mv_limits,
+                                       var_params, mv_cost_params, besterr,
+                                       sse1, distortion, &unused_flag);
+    } else {
+      cost[i] = obmc_check_better(xd, cm, &cross[i], best_mv, mv_limits,
+                                  var_params, mv_cost_params, besterr, sse1,
+                                  distortion, &unused_flag);
+    }
+  }
+
+  const MV diag_step =
+      get_best_diag_step(hstep, cost[0], cost[1], cost[2], cost[3]);
+  const MV diag_mv = { this_mv.row + diag_step.row,
+                       this_mv.col + diag_step.col };
+
+  // Check the diagonal direction with the best mv
+  if (use_fast) {
+    obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &unused_flag);
+  } else {
+    obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &unused_flag);
+  }
+
+  return diag_step;
+}
+
+// A newer version of second level check for obmc that gives better quality.
+// Called after obmc_first_level_check() with its center (this_mv) and its
+// diagonal step. Returns at once if *best_mv still equals this_mv. Otherwise
+// it probes one step along the row and one along the column from the current
+// best mv, and if either improves on it, the corresponding diagonal as well.
+// All three probe positions are derived from the best mv as it is on entry.
+static AOM_FORCE_INLINE void obmc_second_level_check_v2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+    MV *best_mv, const SubpelMvLimits *mv_limits,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+    unsigned int *sse1, int *distortion) {
+  assert(best_mv->row == this_mv.row + diag_step.row ||
+         best_mv->col == this_mv.col + diag_step.col);
+  if (CHECK_MV_EQUAL(this_mv, *best_mv)) return;
+
+  if (this_mv.row == best_mv->row) {
+    // Search away from diagonal step since diagonal search did not provide any
+    // improvement
+    diag_step.row = -diag_step.row;
+  } else if (this_mv.col == best_mv->col) {
+    diag_step.col = -diag_step.col;
+  }
+
+  // Snapshot the center now: the checks below may move *best_mv.
+  const MV center = *best_mv;
+  const MV row_probe = { center.row + diag_step.row, center.col };
+  const MV col_probe = { center.row, center.col + diag_step.col };
+  const MV diag_probe = { center.row + diag_step.row,
+                          center.col + diag_step.col };
+
+  const int use_fast = var_params->subpel_search_type == USE_2_TAPS_ORIG;
+  int improved = 0;
+
+  if (use_fast) {
+    obmc_check_better_fast(&row_probe, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &improved);
+    obmc_check_better_fast(&col_probe, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &improved);
+  } else {
+    obmc_check_better(xd, cm, &row_probe, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &improved);
+    obmc_check_better(xd, cm, &col_probe, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &improved);
+  }
+
+  // Do an additional search if the second iteration gives a better mv
+  if (!improved) return;
+
+  if (use_fast) {
+    obmc_check_better_fast(&diag_probe, best_mv, mv_limits, var_params,
+                           mv_cost_params, besterr, sse1, distortion,
+                           &improved);
+  } else {
+    obmc_check_better(xd, cm, &diag_probe, best_mv, mv_limits, var_params,
+                      mv_cost_params, besterr, sse1, distortion, &improved);
+  }
+}
+
+// OBMC sub-pixel motion search. Starts at start_mv and runs up to
+// min(FULL_PEL - forced_stop, 3 - !allow_hp) refinement rounds, halving the
+// step size after each round. A round always runs the first level check; the
+// second level check runs only if the first moved the best mv and
+// iters_per_step > 1. The best mv is written to *bestmv, its prediction error
+// to *distortion and its sse to *sse1. Returns the best total error.
+// start_mv_stats and last_mv_search_list are unused.
+int av1_find_best_obmc_sub_pixel_tree_up(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+    const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+    unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)last_mv_search_list;
+  (void)start_mv_stats;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const int allow_second_level = ms_params->iters_per_step > 1;
+  const int forced_stop = ms_params->forced_stop;
+  const int allow_hp = ms_params->allow_hp;
+  const int num_rounds = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+
+  *bestmv = start_mv;
+
+  // Error at the search center.
+  unsigned int besterr;
+  if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+    besterr = upsampled_setup_obmc_center_error(
+        xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion);
+  } else {
+    besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1,
+                                      distortion);
+  }
+
+  int hstep = INIT_SUBPEL_STEP_SIZE;
+  for (int rounds_left = num_rounds; rounds_left > 0; --rounds_left) {
+    const MV center = *bestmv;
+    const MV diag_step = obmc_first_level_check(
+        xd, cm, center, bestmv, hstep, mv_limits, var_params, mv_cost_params,
+        &besterr, sse1, distortion);
+
+    if (allow_second_level && !CHECK_MV_EQUAL(center, *bestmv)) {
+      obmc_second_level_check_v2(xd, cm, center, diag_step, bestmv, mv_limits,
+                                 var_params, mv_cost_params, &besterr, sse1,
+                                 distortion);
+    }
+    hstep >>= 1;
+  }
+
+  return besterr;
+}
+
+// =============================================================================
+// Public cost function: mv_cost + pred error
+// =============================================================================
+// Returns the sse between the source block and the reference block addressed
+// by the full-pel best_mv, plus the mv cost of best_mv.
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+                       const FULLPEL_MV best_mv,
+                       const aom_variance_fn_ptr_t *vfp,
+                       const struct buf_2d *src, const struct buf_2d *pre) {
+  const MV mv = get_mv_from_fullmv(&best_mv);
+  const uint8_t *ref = get_buf_from_fullmv(pre, &best_mv);
+  unsigned int sse;
+
+  // Only the sse output is wanted; the returned variance is discarded.
+  (void)vfp->vf(src->buf, src->stride, ref, pre->stride, &sse);
+
+  return sse + mv_err_cost_(&mv, mv_cost_params);
+}
+
+// Returns vfp->svaf (zero sub-pixel offsets) for the reference block at the
+// full-pel best_mv combined with second_pred, plus the mv cost of best_mv.
+static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
+                                    const FULLPEL_MV best_mv,
+                                    const uint8_t *second_pred,
+                                    const aom_variance_fn_ptr_t *vfp,
+                                    const struct buf_2d *src,
+                                    const struct buf_2d *pre) {
+  const MV mv = get_mv_from_fullmv(&best_mv);
+  const uint8_t *ref = get_buf_from_fullmv(pre, &best_mv);
+  unsigned int sse_unused;
+
+  return vfp->svaf(ref, pre->stride, 0, 0, src->buf, src->stride, &sse_unused,
+                   second_pred) +
+         mv_err_cost_(&mv, mv_cost_params);
+}
+
+// Returns vfp->msvf (zero sub-pixel offsets) for the reference block at the
+// full-pel best_mv combined with second_pred through mask, plus the mv cost
+// of best_mv.
+static INLINE int get_mvpred_mask_var(
+    const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv,
+    const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+    int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
+    const struct buf_2d *pre) {
+  const MV mv = get_mv_from_fullmv(&best_mv);
+  const uint8_t *ref = get_buf_from_fullmv(pre, &best_mv);
+  unsigned int sse_unused;
+
+  return vfp->msvf(ref, pre->stride, 0, 0, src->buf, src->stride, second_pred,
+                   mask, mask_stride, invert_mask, &sse_unused) +
+         mv_err_cost_(&mv, mv_cost_params);
+}
+
+// Compound-prediction cost at the full-pel best_mv: dispatches to the masked
+// variant when a mask is supplied and to the averaging variant otherwise.
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+                                const FULLPEL_MV best_mv,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const struct buf_2d *src,
+                                const struct buf_2d *pre) {
+  if (!mask) {
+    return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src,
+                             pre);
+  }
+  return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask,
+                             mask_stride, invert_mask, vfp, src, pre);
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 0000000000..87b9309b61
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/common/mv.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/rd.h"
+
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+// =============================================================================
+// Cost functions
+// =============================================================================
+
+enum { // Selects how the cost of a motion vector is computed
+ MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost
+ MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p)
+ MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p)
+ MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p)
+ MV_COST_NONE // Use 0 as the cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
+typedef struct { // Inputs needed to compute the cost of a motion vector
+ // The reference mv used to compute the mv cost
+ const MV *ref_mv;
+ FULLPEL_MV full_ref_mv; // NOTE(review): presumably ref_mv in full-pel units -- confirm
+ MV_COST_TYPE mv_cost_type; // Which cost model to apply
+ const int *mvjcost; // Passed to mv_cost() together with mvcost
+ const int *mvcost[2];
+ int error_per_bit; // Multiplier applied to the mv rate in the entropy cost
+ // A multiplier used to convert rate to sad cost
+ int sad_per_bit;
+} MV_COST_PARAMS;
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+ int *const mvcost[2], int weight);
+
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src, const struct buf_2d *pre);
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre);
+
+// =============================================================================
+// Motion Search
+// =============================================================================
+typedef struct { // Buffers read by the motion search error functions
+ // The reference buffer
+ const struct buf_2d *ref;
+
+ // The source and predictors/mask used by translational search
+ const struct buf_2d *src;
+ const uint8_t *second_pred;
+ const uint8_t *mask;
+ int mask_stride;
+ int inv_mask; // Set from invert_mask by av1_set_ms_compound_refs()
+
+ // The weighted source and mask used by OBMC
+ const int32_t *wsrc;
+ const int32_t *obmc_mask;
+} MSBuffers;
+
+// Stores the compound-prediction inputs (second predictor, mask, mask stride
+// and mask inversion flag) in ms_buffers. No other field is touched.
+static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
+                                            const uint8_t *second_pred,
+                                            const uint8_t *mask,
+                                            int mask_stride, int invert_mask) {
+  // Second predictor.
+  ms_buffers->second_pred = second_pred;
+
+  // Mask description.
+  ms_buffers->mask = mask;
+  ms_buffers->inv_mask = invert_mask;
+  ms_buffers->mask_stride = mask_stride;
+}
+
+// =============================================================================
+// Fullpixel Motion Search
+// =============================================================================
+// This struct holds fullpixel motion search parameters that should be constant
+// during the search
+typedef struct {
+ BLOCK_SIZE bsize; // NOTE(review): presumably the size of the block being searched -- confirm
+ // A function pointer to the simd function for fast computation
+ const aom_variance_fn_ptr_t *vfp;
+
+ MSBuffers ms_buffers; // Source, reference, compound and OBMC buffers
+
+ // WARNING: search_method should be regarded as a private variable and should
+ // not be modified directly so it is in sync with search_sites. To modify it,
+ // use av1_set_mv_search_method.
+ SEARCH_METHODS search_method;
+ const search_site_config *search_sites; // Set by av1_set_mv_search_method
+ FullMvLimits mv_limits;
+
+ int run_mesh_search; // Sets mesh search unless it got pruned by
+ // prune_mesh_search.
+ int prune_mesh_search; // Disables mesh search if the best_mv after a normal
+ // search is close to the start_mv.
+ int mesh_search_mv_diff_threshold; // mv diff threshold to enable
+ // prune_mesh_search
+ int force_mesh_thresh; // Forces mesh search if the residue variance is
+ // higher than the threshold.
+ const struct MESH_PATTERN *mesh_patterns[2];
+
+ // Use maximum search interval of 4 if true. This helps motion search to find
+ // the best motion vector for screen content types.
+ int fine_search_interval;
+
+ int is_intra_mode; // NOTE(review): presumably set by av1_set_ms_to_intra_mode -- confirm
+
+ int fast_obmc_search;
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+ // Stores the function used to compute the sad. This can be different from the
+ // sdf in vfp (e.g. downsampled sad and not sad) to allow speed up.
+ aom_sad_fn_t sdf;
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+typedef struct { // Statistics reported for the best mv by av1_full_pixel_search()
+ int err_cost; // NOTE(review): presumably the error including mv cost -- confirm
+ unsigned int distortion; // NOTE(review): presumably the error without mv cost -- confirm
+ unsigned int sse;
+} FULLPEL_MV_STATS;
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const IntraBCMVCosts *dv_costs);
+
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for firstpass motion search.
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level);
+
+/*! Function pointer to search site config initialization of different search
+ * method functions. */
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+ int level);
+
+/*! Array of function pointers used to set the motion search config. */
+extern const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS];
+
+// Maps each search method to the method whose search site config it uses:
+// methods with the same entry share candidates and differ only in step count.
+static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+ DIAMOND, // DIAMOND
+ NSTEP, // NSTEP
+ NSTEP_8PT, // NSTEP_8PT
+ CLAMPED_DIAMOND, // CLAMPED_DIAMOND
+ HEX, // HEX
+ BIGDIA, // BIGDIA
+ SQUARE, // SQUARE
+ HEX, // FAST_HEX
+ BIGDIA, // FAST_DIAMOND
+ BIGDIA, // FAST_BIGDIA
+ BIGDIA // VFAST_DIAMOND
+};
+
+// Reinitializes the search site config used by search_method for a reference
+// buffer with stride ref_stride. The config is the entry of ss_cfg_buf
+// selected through search_method_lookup. The level passed to the init
+// function is 1 for NSTEP_8PT and CLAMPED_DIAMOND and 0 otherwise.
+static AOM_INLINE void av1_refresh_search_site_config(
+    search_site_config *ss_cfg_buf, SEARCH_METHODS search_method,
+    const int ref_stride) {
+  // level depends on the requested method, not on the one it maps to.
+  const int level =
+      (search_method == NSTEP_8PT) || (search_method == CLAMPED_DIAMOND);
+  const SEARCH_METHODS base_method = search_method_lookup[search_method];
+  search_site_config *const cfg = &ss_cfg_buf[base_method];
+
+  av1_init_motion_compensation[base_method](cfg, ref_stride, level);
+}
+
+// Sets the search method of ms_params and points search_sites at the matching
+// entry of search_sites (selected through search_method_lookup), so that the
+// two fields stay in sync.
+static INLINE void av1_set_mv_search_method(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method) {
+  const SEARCH_METHODS base_method = search_method_lookup[search_method];
+
+  ms_params->search_method = search_method;
+  ms_params->search_sites = &search_sites[base_method];
+}
+
+// Sets mv_limits->row_min and mv_limits->row_max for a block of mi_height
+// units at row mi_row.
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_height, int border) {
+  // Lower limit: the larger of the two candidate minima.
+  const int lo_a = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int lo_b = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->row_min = AOMMAX(lo_a, lo_b);
+
+  // Upper limit: the smaller of the two candidate maxima.
+  const int hi_a = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int hi_b =
+      (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->row_max = AOMMIN(hi_a, hi_b);
+}
+
+// Sets mv_limits->col_min and mv_limits->col_max for a block of mi_width
+// units at column mi_col. Column counterpart of av1_set_mv_row_limits().
+static INLINE void av1_set_mv_col_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_col, int mi_width, int border) {
+  // Lower limit: the larger of the two candidate minima.
+  const int lo_a = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int lo_b = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->col_min = AOMMAX(lo_a, lo_b);
+
+  // Upper limit: the smaller of the two candidate maxima.
+  const int hi_a = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+                   2 * AOM_INTERP_EXTEND;
+  const int hi_b =
+      (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->col_max = AOMMIN(hi_a, hi_b);
+}
+
+// Sets all four full-pel mv limits of mv_limits for the block at
+// (mi_row, mi_col) with size mi_height x mi_width.
+static INLINE void av1_set_mv_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+  // Vertical range first, then horizontal range.
+  av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+
+  av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+ int me_search_size_col, int me_search_size_row);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ IntraBCHashInfo *intrabc_hash_info,
+ FULLPEL_MV *best_mv);
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, FULLPEL_MV *best_mv);
+
+// Returns 1 if both components of the full-pel mv lie within mv_limits
+// (bounds inclusive), 0 otherwise.
+static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
+                                         FULLPEL_MV mv) {
+  const int col_ok =
+      (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max);
+  if (!col_ok) return 0;
+
+  return (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+// =============================================================================
+// Subpixel Motion Search
+// =============================================================================
+enum { // OBMC subpel search runs at most (FULL_PEL - value) refinement rounds
+ EIGHTH_PEL, // up to 3 rounds
+ QUARTER_PEL, // up to 2 rounds
+ HALF_PEL, // up to 1 round
+ FULL_PEL // no subpel refinement
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
+
+typedef struct { // Inputs of the subpel prediction error computation
+ const aom_variance_fn_ptr_t *vfp; // Variance function table for the block
+ SUBPEL_SEARCH_TYPE subpel_search_type; // USE_2_TAPS_ORIG selects the estimated error path
+ // Source and reference buffers
+ MSBuffers ms_buffers;
+ int w, h; // Width and height passed to the upsampled prediction functions
+} SUBPEL_SEARCH_VAR_PARAMS;
+
+// This struct holds subpixel motion search parameters that should be constant
+// during the search
+typedef struct {
+ // High level motion search settings
+ int allow_hp; // When zero, the OBMC search runs at most 2 refinement rounds
+ const int *cost_list;
+ SUBPEL_FORCE_STOP forced_stop; // Caps the number of refinement rounds
+ int iters_per_step; // Values > 1 enable the second level check of each round
+ SubpelMvLimits mv_limits; // Candidates outside these limits are rejected
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+ // Distortion calculation params
+ SUBPEL_SEARCH_VAR_PARAMS var_params;
+} SUBPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list);
+
+typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
+ unsigned int *sse1,
+ int_mv *last_mv_search_list);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
+
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ BLOCK_SIZE bsize, const int *pts0,
+ const int *pts_inref0, int total_samples,
+ WARP_SEARCH_METHOD search_method,
+ int num_iterations);
+
+// Marks the first three entries of fractional_best_mv as INVALID_MV.
+static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
+  int_mv *const end = fractional_best_mv + 3;
+  for (int_mv *entry = fractional_best_mv; entry != end; ++entry) {
+    entry->as_int = INVALID_MV;
+  }
+}
+
+// Derives the sub-pel mv limits from the full-pel limits. Each range is the
+// full-pel range converted with GET_MV_SUBPEL, intersected with the window of
+// +/- GET_MV_SUBPEL(MAX_FULL_PEL_VAL) around ref_mv, forced to be non-empty
+// (max >= min), and finally clamped to [MV_LOW + 1, MV_UPP - 1].
+static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits,
+                                                  const FullMvLimits *mv_limits,
+                                                  const MV *ref_mv) {
+  const int reach = GET_MV_SUBPEL(MAX_FULL_PEL_VAL);
+
+  const int col_lo =
+      AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - reach);
+  int col_hi = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + reach);
+  const int row_lo =
+      AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - reach);
+  int row_hi = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + reach);
+
+  // Keep each range non-empty.
+  col_hi = AOMMAX(col_lo, col_hi);
+  row_hi = AOMMAX(row_lo, row_hi);
+
+  subpel_limits->col_min = AOMMAX(MV_LOW + 1, col_lo);
+  subpel_limits->col_max = AOMMIN(MV_UPP - 1, col_hi);
+  subpel_limits->row_min = AOMMAX(MV_LOW + 1, row_lo);
+  subpel_limits->row_max = AOMMIN(MV_UPP - 1, row_hi);
+}
+
+// Returns 1 if both components of the sub-pel mv lie within mv_limits (bounds
+// inclusive), 0 otherwise.
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
+                                           MV mv) {
+  const int col_ok =
+      (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max);
+  if (!col_ok) return 0;
+
+  return (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+
+// Returns the offset, in buffer elements, of the position addressed by the
+// full-pel mv in a buffer with the given stride.
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+  const int row_offset = mv->row * stride;
+  return row_offset + mv->col;
+}
+
+// Returns a pointer to the position in buf addressed by the full-pel mv.
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+                                                 const FULLPEL_MV *mv) {
+  const int offset = get_offset_from_fullmv(mv, buf->stride);
+  return buf->buf + offset;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mcomp_structs.h b/third_party/aom/av1/encoder/mcomp_structs.h
new file mode 100644
index 0000000000..06660cf4a6
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp_structs.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+#define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+
+#include "av1/common/mv.h"
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full pel mv specified in the unit of full pixel
+// Enable the use of motion vector in range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Maximum number of neighbors to scan per iteration during
+// WARPED_CAUSAL refinement
+// Note: The elements of warp_search_config.neighbor_mask must be at least
+// MAX_WARP_SEARCH_NEIGHBORS many bits wide. So the type may need to be
+// widened if this value is increased.
+#define MAX_WARP_SEARCH_NEIGHBORS 8
+
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
+typedef struct {
+ FULLPEL_MV coord;
+ int coord_offset;
+} search_neighbors;
+// motion search site
+typedef struct search_site {
+ FULLPEL_MV mv;
+ int offset;
+} search_site;
+
+typedef struct search_site_config {
+ search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1];
+ // Number of search steps.
+ int num_search_steps;
+ int searches_per_step[MAX_MVSEARCH_STEPS * 2];
+ int radius[MAX_MVSEARCH_STEPS * 2];
+ int stride;
+} search_site_config;
+
+enum {
+ // Search 8-points in the radius grid around center, up to 11 search stages.
+ DIAMOND = 0,
+ // Search 12-points in the radius/tan_radius grid around center,
+ // up to 15 search stages.
+ NSTEP = 1,
+ // Search 8-points in the radius grid around center, up to 16 search stages.
+ NSTEP_8PT = 2,
+ // Search 8-points in the radius grid around center, upto 11 search stages
+ // with clamping of search radius.
+ CLAMPED_DIAMOND = 3,
+ // Search maximum 8-points in the radius grid around center,
+ // up to 11 search stages. First stage consists of 8 search points
+ // and the rest with 6 search points each in hex shape.
+ HEX = 4,
+ // Search maximum 8-points in the radius grid around center,
+ // up to 11 search stages. First stage consists of 4 search
+ // points and the rest with 8 search points each.
+ BIGDIA = 5,
+ // Search 8-points in the square grid around center, up to 11 search stages.
+ SQUARE = 6,
+ // HEX search with up to 2 stages.
+ FAST_HEX = 7,
+ // BIGDIA search with up to 2 stages.
+ FAST_DIAMOND = 8,
+ // BIGDIA search with up to 3 stages.
+ FAST_BIGDIA = 9,
+ // BIGDIA search with up to 1 stage.
+ VFAST_DIAMOND = 10,
+ // Total number of search methods.
+ NUM_SEARCH_METHODS,
+ // Number of distinct search methods.
+ NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1,
+} UENUM1BYTE(SEARCH_METHODS);
+
+typedef struct warp_search_config {
+ int num_neighbors;
+ MV neighbors[MAX_WARP_SEARCH_NEIGHBORS];
+ // Bitmask which is used to prune the search neighbors at one iteration
+ // based on which direction we chose in the previous iteration.
+ // See comments in av1_refine_warped_mv for details.
+ uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS];
+} warp_search_config;
+
+// Methods for refining WARPED_CAUSAL motion vectors
+enum {
+ // Search 4 adjacent points in a diamond shape at each iteration
+ WARP_SEARCH_DIAMOND,
+ // Search 8 adjacent points in a square at each iteration
+ WARP_SEARCH_SQUARE,
+ WARP_SEARCH_METHODS
+} UENUM1BYTE(WARP_SEARCH_METHOD);
+
+#endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
diff --git a/third_party/aom/av1/encoder/misc_model_weights.h b/third_party/aom/av1/encoder/misc_model_weights.h
new file mode 100644
index 0000000000..f00aeabcf6
--- /dev/null
+++ b/third_party/aom/av1/encoder/misc_model_weights.h
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define MV_PREC_FEATURE_SIZE 18
+
+#define NUM_DNN_LAYERS 1
+#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE
+#define MV_PREC_LAYER_SIZE_0 32
+#define NUM_LOGITS 1
+
+const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f,
+ 141.6251917346238f,
+ 0.36313633945679064f,
+ 0.0028162791958822085f,
+ 0.000484820537626698f,
+ 0.002769969388939025f,
+ 0.0f,
+ 0.00031274626720947577f,
+ 0.00020578555375160075f,
+ 0.0007075246732697733f,
+ 0.000539641029909925f,
+ 0.0013939401375906984f,
+ 4.985394760423499f,
+ 4.985394760423499f,
+ 4.9992148717283085f,
+ 5.143739822380163f,
+ 5.518483124004564f,
+ 87.63597847427077f };
+
+const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f,
+ 68.04472572607503f,
+ 13.23247674430399f,
+ 0.0029123438396921955f,
+ 0.0015331406169374737f,
+ 0.0029149813096313775f,
+ 1.0f,
+ 0.00047501102871357813f,
+ 0.00030025962993117947f,
+ 0.0009861163580391207f,
+ 0.0012157593528004055f,
+ 0.002004954948490521f,
+ 6.539447500484038f,
+ 6.539447500484038f,
+ 6.396589058279465f,
+ 3.4870155874262516f,
+ 3.8911353973740535f,
+ 112.07985259573601f };
+
+const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f,
+ -0.1483527373474774f,
+ 0.08112076098858864f,
+ -0.9582568679627453f,
+ -0.34794757171071206f,
+ 0.6465225723304947f,
+ 0.0f,
+ 0.06754171885839604f,
+ 0.27156803620541214f,
+ 0.10635231245664407f,
+ -0.031183926995968583f,
+ 0.048122572260291f,
+ -0.19498534230045128f,
+ -0.2614116319273316f,
+ -0.3223762845136331f,
+ -1.2063368350609205f,
+ -0.523333556911706f,
+ 1.075632260890728f,
+ 0.48989726814387946f,
+ -0.34816466111070477f,
+ 0.41668357610256473f,
+ -1.0973562848791671f,
+ 0.04183921854389494f,
+ -0.9123815389260476f,
+ 0.0f,
+ 0.859965047744027f,
+ 0.1962095804679813f,
+ 0.2606564339077058f,
+ 0.26695868715184895f,
+ 0.5319308568326692f,
+ -0.23717505799723165f,
+ -0.43127224481782567f,
+ -0.3214545776203726f,
+ 0.5850852241402176f,
+ -0.26705531612587813f,
+ -0.5786016766610093f,
+ 0.9360519909983003f,
+ 0.20771329289016555f,
+ -0.027614159544811823f,
+ -1.175022807046164f,
+ -0.07578967497693835f,
+ 0.6890172485324256f,
+ 0.0f,
+ -0.008008338164988263f,
+ -0.08064800010158935f,
+ -0.22606910981666667f,
+ 0.4541586669210879f,
+ 0.07731527661370792f,
+ -0.6744475941247964f,
+ -0.2625842448396184f,
+ 1.7018613444303785f,
+ -0.08622229073162656f,
+ 0.041858142814941275f,
+ -0.24575964090386415f,
+ -0.046626044730994964f,
+ 0.7608713064175202f,
+ -0.23330119070907146f,
+ -0.10115510984500826f,
+ 0.9722537349192069f,
+ 0.11718554254290829f,
+ 0.0f,
+ 0.2075123446014759f,
+ 0.09465167310768637f,
+ 0.7609896851963016f,
+ 0.4441038581385328f,
+ 0.26064144727430955f,
+ -0.14678625366485035f,
+ -0.03597014452200524f,
+ 0.3128680867196166f,
+ 1.102496797385966f,
+ 0.06642253233084111f,
+ -1.2665494483407629f,
+ 0.09049412632000911f,
+ -1.1160621999565095f,
+ 0.043420275255913035f,
+ -0.8811412259978966f,
+ 0.21076234632287777f,
+ 0.16571534463543866f,
+ 0.0f,
+ -0.7324075176473275f,
+ -0.3677622514459495f,
+ 0.3273532243056415f,
+ 0.22922161936797775f,
+ 0.8204766691058087f,
+ 0.02982161033720488f,
+ 0.5266419954188112f,
+ -1.0032154963302191f,
+ 0.7007602969763729f,
+ 0.37196355167990885f,
+ -0.7608579453228548f,
+ 0.08568111584781847f,
+ 0.07011061059123677f,
+ 0.3233263598082507f,
+ -0.08249928295410253f,
+ 0.08220165761319252f,
+ 0.22148722752246794f,
+ 0.0f,
+ 0.6122392701743506f,
+ -0.26429838296378333f,
+ 0.31958081620005463f,
+ -0.006027177397853826f,
+ -0.3088310785887994f,
+ -0.5436192046707807f,
+ -0.011080356757423306f,
+ 0.12632650770008413f,
+ -0.45097913215234525f,
+ 1.8008072867127298f,
+ -0.7630029654575501f,
+ -0.4054774329826579f,
+ 0.40386074452544535f,
+ -0.18541426257453025f,
+ 0.2444879765079863f,
+ -0.6216724756115081f,
+ 0.27030299321302f,
+ 0.0f,
+ -0.6835848952967989f,
+ -0.7914184320964815f,
+ -0.6761595019582928f,
+ -1.009565565604081f,
+ -0.1904242439353305f,
+ 0.4463417126318631f,
+ 0.6025503823452971f,
+ 0.5149990860115566f,
+ 1.0242970663937634f,
+ 0.037947306826401385f,
+ 0.07039339786212848f,
+ 0.14273796789711987f,
+ 0.168103961425691f,
+ 1.6596066376811978f,
+ 0.19321092229384657f,
+ -0.3710750388148514f,
+ -0.01717015559410288f,
+ 0.0f,
+ 0.3005688477942597f,
+ 0.23877080653829577f,
+ 0.2718594552971173f,
+ 0.3885402571589898f,
+ 0.32999531945669247f,
+ -0.6134460954213243f,
+ -0.13972265462799183f,
+ -0.07180089575716991f,
+ -1.014572598188105f,
+ 0.0717207322809836f,
+ 0.34896157745155615f,
+ -0.27127687591403f,
+ -0.5058651212773623f,
+ -1.5442435628306925f,
+ -0.6399784724734707f,
+ 0.6274301429074947f,
+ -0.4645750072767051f,
+ 0.0f,
+ -0.2406726815244178f,
+ -0.06321214115916597f,
+ 0.312856714253404f,
+ 0.16459514124116134f,
+ 0.3993579604809623f,
+ -0.15232044351561913f,
+ -0.5613743948568469f,
+ 0.7219801372223262f,
+ 0.2936857469624009f,
+ 0.7823466656034087f,
+ -0.12416947814098349f,
+ -0.36413756654028345f,
+ -0.07992098796866462f,
+ -0.7395722879842416f,
+ 0.8639913543220514f,
+ -0.311931773757945f,
+ -1.7308240470400613f,
+ 0.0f,
+ 0.394499716712104f,
+ 0.6511462819539963f,
+ -0.0722425275974144f,
+ 0.13490818194661386f,
+ 0.055319135836378035f,
+ 0.15389577508097013f,
+ 0.28958598328870605f,
+ -0.14608429470539772f,
+ 0.09488817462478298f,
+ -0.17231294096622088f,
+ 0.6721115415911466f,
+ -0.05664621150536103f,
+ 0.03291799673669331f,
+ 0.02845382711057482f,
+ -0.9953563446999164f,
+ -0.17994298220605923f,
+ 0.6560824519337476f,
+ 0.0f,
+ -0.30990646375917935f,
+ 0.17215517202874f,
+ 0.2026816225170481f,
+ 0.22011958747715601f,
+ 0.3562520768889686f,
+ -0.18436559057189175f,
+ 0.1733377147302066f,
+ 0.02818276995640877f,
+ -0.29703005574859076f,
+ -0.3310652639215064f,
+ -1.6091173258529277f,
+ 0.45461585790028003f,
+ -0.5078643334592593f,
+ -0.338997374732338f,
+ 0.4688619590359733f,
+ 0.627099126828289f,
+ -0.5249801376494249f,
+ 0.0f,
+ 0.34465498218272883f,
+ 0.009891680630908135f,
+ -0.27244020967349f,
+ 0.05404589867626979f,
+ -0.06220329325739666f,
+ -0.13365376464759104f,
+ -0.13098573553512366f,
+ 0.11434198976289106f,
+ 0.6740951247574676f,
+ 1.3381727185724581f,
+ -1.4865773213251936f,
+ 0.05809898701966341f,
+ 0.25380780261023456f,
+ 1.2716367496512722f,
+ 0.1768290070780598f,
+ -0.07554828135356352f,
+ 0.8180570085344856f,
+ 0.0f,
+ 1.0788448980077463f,
+ 0.0651938742459459f,
+ 0.3807672030015587f,
+ 0.6144792680268445f,
+ 0.011660612214908059f,
+ -0.018306023765580288f,
+ 0.44140813809926516f,
+ -0.13411994195502386f,
+ 0.15920368955127778f,
+ -0.19382358417849888f,
+ -0.08802147969690055f,
+ -0.019731052733814477f,
+ 0.1104744229169665f,
+ -0.195834419735958f,
+ -0.5005295046454347f,
+ -0.17041241868229032f,
+ -0.471942117351489f,
+ 0.0f,
+ -0.3599073304761372f,
+ -0.2745532782968519f,
+ -0.8323064841106417f,
+ -0.88355885384943f,
+ -0.02826466859020679f,
+ 0.06977870308805256f,
+ 0.11926112095374196f,
+ 1.367382707959643f,
+ -0.06119843162964051f,
+ -0.5331395268889569f,
+ -1.2155531584240624f,
+ -0.01896651779524327f,
+ 0.10591845408571081f,
+ -0.010632842156504733f,
+ 0.6150787968629282f,
+ -0.4191690185896091f,
+ -0.9961718918346271f,
+ 0.0f,
+ 0.23370364516013867f,
+ 0.4156033072362998f,
+ 0.1261005546633433f,
+ 0.0812413884532226f,
+ -0.008894337353937203f,
+ 0.07984447025056046f,
+ -0.1258098052766725f,
+ -0.40245475467767916f,
+ 1.78188906675019f,
+ -1.1544387954232302f,
+ -0.41768781481273387f,
+ 0.6791211165341995f,
+ -0.4175127856183446f,
+ -0.07353219159767788f,
+ -0.2888813577574072f,
+ -0.7107767892597061f,
+ -1.0450031091195449f,
+ 0.0f,
+ -0.9221599545079143f,
+ -0.6747876356740621f,
+ 0.30241454354872105f,
+ 0.4924965303373908f,
+ -0.14042722740054084f,
+ 0.27744210409350445f,
+ -0.14788270997426836f,
+ -0.9081467469237995f,
+ -0.04513115674995093f,
+ -0.5254168669125793f,
+ -0.6999012037974789f,
+ 0.434661246306547f,
+ -0.7193303957246092f,
+ -0.9117952623409744f,
+ -1.5097267865916142f,
+ -0.20779888103770922f,
+ 0.4935562480901218f,
+ 0.0f,
+ 0.18303393908923593f,
+ 0.34753722677570037f,
+ 0.29291001533177663f,
+ 0.3832351878354224f,
+ 0.3295194956120599f,
+ -0.32398033003617527f,
+ -0.31570906736433746f,
+ 0.23657779050372962f,
+ 0.9510794465234161f,
+ -0.5122243902568278f,
+ 0.08652112725315658f,
+ 0.2246634353717998f,
+ -0.9032595595582497f,
+ -0.8936484034533545f,
+ 0.6012969720865752f,
+ -0.6454216646117924f,
+ -1.1753786049658332f,
+ 0.0f,
+ -0.4360545677728656f,
+ -0.6586237455328507f,
+ -0.34347301697886656f,
+ -0.8909724651992144f,
+ -0.24378721818350263f,
+ 0.6179733359297576f,
+ 0.0661661181742234f,
+ -0.14120142044993794f,
+ -0.07732699885498932f,
+ 1.0221355882357506f,
+ 0.44514798994115284f,
+ -0.7371569579959046f,
+ -0.7212499572378936f,
+ 0.7453626921081045f,
+ 0.5478757761345768f,
+ -0.39411232789985384f,
+ 0.7200542656743857f,
+ 0.0f,
+ -0.11790869453118827f,
+ -0.12317030713581928f,
+ -0.4207902738133338f,
+ 0.15895105878327986f,
+ 0.304261777102111f,
+ 0.11450744587017621f,
+ -0.11470709991317944f,
+ 0.5949222371739038f,
+ 0.6549518619412444f,
+ -0.24390606570422838f,
+ -0.4212796009440803f,
+ -0.6269666206320964f,
+ -0.5421193969807078f,
+ -0.12297772128652287f,
+ 0.021517257619930424f,
+ 0.25462855095544523f,
+ -0.22107798187348246f,
+ 0.0f,
+ 0.5204516300095662f,
+ 0.2837402841862462f,
+ 0.11310823283285916f,
+ 0.8944351685018025f,
+ 0.17487203235834015f,
+ -0.5271221928634433f,
+ -0.19516594503423199f,
+ 0.452456617580365f,
+ 1.2456272242706414f,
+ 0.24166615894862817f,
+ 0.09411429305204502f,
+ -0.2730072283327243f,
+ -0.8129383770918172f,
+ -0.24093254193486136f,
+ 0.5696499174142177f,
+ -0.11110805836073044f,
+ -0.3968204166235694f,
+ 0.0f,
+ -0.04388165369378549f,
+ -0.005631266017272595f,
+ -0.02574211858479705f,
+ 0.06230399626660669f,
+ 0.17677671232932785f,
+ 0.5172871274400965f,
+ 0.4919150085620063f,
+ -1.597656637582941f,
+ 0.02415185715719143f,
+ -0.17945446376668306f,
+ -0.39340600199798886f,
+ 0.25013205256886845f,
+ 0.05972330340308685f,
+ 0.1359911505596489f,
+ -0.02341033271820833f,
+ 0.15726074644063684f,
+ 0.47512625913020357f,
+ 0.0f,
+ 0.7327341664835779f,
+ -0.3689092312320013f,
+ 0.4571824787436036f,
+ 0.6215465537945456f,
+ 0.0944111296842023f,
+ -0.12571956176607574f,
+ -0.2507235674395462f,
+ -0.09579602654351593f,
+ 1.4463357293728496f,
+ 0.749153535856049f,
+ -0.5553955120807588f,
+ -0.09622771929369946f,
+ -0.2598697420394813f,
+ -0.964691815299676f,
+ -0.8289963178173902f,
+ 0.7112949291983329f,
+ -0.8667009730492162f,
+ 0.0f,
+ -0.48698304169042794f,
+ -0.18786095669893707f,
+ -0.11425249263203247f,
+ -0.3693391011684809f,
+ 0.09933145842585253f,
+ 0.2568559685298844f,
+ 0.7048512233651738f,
+ 0.6056238412407038f,
+ -0.4355558119826642f,
+ 0.17318931883915484f,
+ 0.6481333496429564f,
+ -0.45728823054344486f,
+ -0.006325004538589701f,
+ 0.45609864075494927f,
+ -0.6199385981116988f,
+ 0.035105808783046165f,
+ 0.1203147963894839f,
+ 0.0f,
+ 0.383402190836527f,
+ 0.048429009055370106f,
+ 0.5887186439275204f,
+ -0.20538767641607814f,
+ -0.031237879611002117f,
+ 0.3140759860883231f,
+ 0.24447070584999556f,
+ 0.7271263905705878f,
+ 0.8432799162434237f,
+ -0.11530577554199217f,
+ -0.7781023892314718f,
+ 0.05359488822710336f,
+ 0.5624870388700809f,
+ 0.5134656523208906f,
+ 0.18304041423438375f,
+ -0.04237421156328257f,
+ -0.20759809886942207f,
+ 0.0f,
+ -0.06249337454975615f,
+ 0.10081284533873777f,
+ 0.3894374350259183f,
+ 1.518217777528342f,
+ -0.9100037950171563f,
+ 0.17796906121831477f,
+ -0.2892167255357892f,
+ 0.6117902467884032f,
+ 0.13332120964959573f,
+ -0.3487155932849374f,
+ -0.32920583745734694f,
+ 0.08242631209809854f,
+ -0.24920225708110588f,
+ 0.8401757259392635f,
+ 0.11729108681358365f,
+ 0.11222925752499184f,
+ -0.027078490721459958f,
+ 0.0f,
+ 0.726132375517389f,
+ 0.72220359881096f,
+ 0.5721582611845177f,
+ 0.15139162075524315f,
+ 0.6676549461551197f,
+ -0.321449586554697f,
+ -0.10141104515219895f,
+ -0.09711123988777906f,
+ 0.9623356184776928f,
+ -0.7941822373167173f,
+ -0.9373923554119346f,
+ 0.4573241832354059f,
+ -0.42029139056126147f,
+ 0.2675223459380999f,
+ -0.5487300191551386f,
+ 0.2236621891916084f,
+ 0.11692039230044018f,
+ 0.0f,
+ 0.1758399202780961f,
+ 0.676447587678781f,
+ 0.5945412815881029f,
+ 0.5669863357359594f,
+ 0.8433565415303922f,
+ -0.30300550790708036f,
+ -0.43332881999693673f,
+ -0.4996522695731392f,
+ -0.2084930815451962f,
+ 0.27765278702463786f,
+ 1.0886848763946915f,
+ -0.0739433655813831f,
+ -0.4762801579229192f,
+ -0.2490825339320731f,
+ -1.8820479350439439f,
+ -0.4251592225775914f,
+ -0.3992922365484464f,
+ 0.0f,
+ 0.19598917760218867f,
+ 0.4860238022746914f,
+ 0.3364528828641281f,
+ 0.3350950865226741f,
+ 0.2773654548632006f,
+ -0.30547262140782566f,
+ 0.028649620490728344f,
+ -0.11763407628280315f,
+ 0.6237318502627169f,
+ -0.3958952632477945f,
+ 0.14797171297835243f,
+ 0.45821729624747465f,
+ -0.8687137170773626f,
+ 0.06989667196937126f,
+ -0.5752606929478727f,
+ 0.16986945686358412f,
+ 0.6925071596817824f,
+ 0.0f,
+ 0.4991250796183003f,
+ 0.03424654896322111f,
+ 0.6153698611882319f,
+ 0.5070872444849457f,
+ 0.43615747516328135f,
+ -0.7870352838659244f,
+ -0.6424101231965247f,
+ -0.7005774876651399f,
+ 0.79983115431488f,
+ 0.15720357955596242f,
+ -1.408372612176309f,
+ -0.039294695217213765f,
+ 0.6979415372962309f,
+ 0.27403316751965656f,
+ 1.2844596102619275f,
+ -0.2781534150257364f,
+ 0.3248437714908865f,
+ 0.0f,
+ 0.4364362371752831f,
+ -0.2548580911485434f,
+ -0.19578001373349452f,
+ -0.04597194387828005f,
+ -0.010035156855533233f,
+ 0.0415941475251266f,
+ 0.07929549739797387f,
+ -0.060629652912508866f,
+ 0.5977303008711333f,
+ -1.4404008068066554f,
+ 0.8555694790197376f,
+ -0.03693438534401856f,
+ 0.17761411164512408f,
+ -0.11858304304109235f,
+ -1.4241324353471327f,
+ 0.1533849765389186f,
+ 0.7650643783126995f,
+ 0.0f,
+ -0.0639949379280401f,
+ 0.4288617817939563f,
+ 0.4235508646885404f,
+ 0.3419843254383798f,
+ -0.015992360660098768f,
+ -0.773247697505441f,
+ -0.4908452922015917f,
+ 0.9868134897291486f,
+ -0.5078689994742608f,
+ 1.05632043744864f,
+ -0.38867419409275117f,
+ -0.0065547696858664194f,
+ -0.3056003173415037f,
+ -0.333762331930102f,
+ 0.4459671174011671f,
+ 0.08219092584580244f,
+ -0.08099158579518179f,
+ 0.0f,
+ -0.1568180656346373f,
+ -0.061962372393910135f,
+ 0.14065868174859464f,
+ -0.055925712798972765f,
+ 0.05136117465820622f,
+ 0.0907831030477633f,
+ 0.19518110495319604f,
+ -0.7470794578145956f,
+ 1.5945999734733545f,
+ -0.4351697502345834f,
+ -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+ -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f,
+ 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f,
+ -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f,
+ 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f,
+ 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f,
+ 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f,
+ 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f,
+ -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f,
+ 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f,
+ -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f,
+ -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+ 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f,
+ -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+ 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f,
+ -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+ -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f,
+ -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f,
+ 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f,
+ -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+ 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f,
+ 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f,
+ -1.249673977776904f, -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+ NUM_DNN_FEATURES,
+ NUM_LOGITS,
+ NUM_DNN_LAYERS,
+ { MV_PREC_LAYER_SIZE_0 },
+ {
+ av1_mv_prec_nn_weights_layer_0,
+ av1_mv_prec_nn_weights_layer_1,
+ },
+ {
+ av1_mv_prec_nn_bias_layer_0,
+ av1_mv_prec_nn_bias_layer_1,
+ },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef NUM_LAYER_0_UNITS  // NOTE(review): not #defined in this header
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 0000000000..94cd56c5d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+ const int prec_bits = 9;
+ const int prec = 1 << prec_bits;
+ const float inv_prec = (float)(1.0 / prec);
+ for (int i = 0; i < num_output; i++) {
+ output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+ }
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Each hidden layer must have fewer than NN_MAX_NODES_PER_LAYER nodes (this is
+// asserted below); the output layer is written directly to `output`.
+void av1_nn_predict_c(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ int num_input_nodes = nn_config->num_inputs;
+ int buf_index = 0;
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+
+ // Propagate hidden layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int layer = 0; layer < num_layers; ++layer) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ float *output_nodes = buf[buf_index];
+ const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+ assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+ for (int node = 0; node < num_output_nodes; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ }
+ num_input_nodes = num_output_nodes;
+ input_nodes = output_nodes;
+ buf_index = 1 - buf_index;
+ }
+
+ // Final output layer.
+ const float *layer_weights = nn_config->weights[num_layers];
+ const float *layer_bias = nn_config->bias[num_layers];
+ for (int node = 0; node < nn_config->num_outputs; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ output[node] = val;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+#if CONFIG_NN_V2
+// Applies the ReLu activation to one fc layer
+// output[i] = Max(input[i],0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ layer->output[i] = AOMMAX(input[i], 0.0f);
+ }
+
+ return layer->output;
+}
+
+// Applies the Sigmoid activation to one fc layer
+// output[i] = 1/(1+exp(-x)), where x is input[i] clamped to [-10, 10]
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+ layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+ }
+
+ return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_v2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+ const float *weights = layer->weights;
+ const float *bias = layer->bias;
+ assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+ // fc
+ for (int node = 0; node < layer->num_outputs; ++node) {
+ float val = bias[node];
+ for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+ layer->output[node] = val;
+ weights += layer->num_inputs;
+ }
+
+ // activation
+ switch (layer->activation) {
+ case NONE: return layer->output;
+ case RELU: return nn_relu(layer->output, layer);
+ case SIGMOID: return nn_sigmoid(layer->output, layer);
+ case SOFTSIGN:
+ assert(0 && "Softsign has not been supported in NN."); // TO DO
+ return NULL;
+ default:
+ assert(0 && "Unknown activation"); // Unknown activation
+ return NULL;
+ }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output) {
+ const float *input_nodes = feature;
+
+ // Propagate the layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int i = 0; i < num_layers; ++i) {
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+ assert(nn_config->layer[i + 1].num_inputs ==
+ nn_config->layer[i].num_outputs);
+ }
+
+ // Final layer
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+ assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+ // Copy the final layer output
+ memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif // CONFIG_NN_V2
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+ // Softmax function is invariant to adding the same constant
+ // to all input values, so we subtract the maximum input to avoid
+ // possible overflow.
+ float max_input = input[0];
+ for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = expf(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
+
+void av1_nn_fast_softmax_16_c(const float *input, float *output) {
+ const int kNumClasses = 16;
+ float max_input = input[0];
+ for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < kNumClasses; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = approx_exp(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out;
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 0000000000..566f9271dd
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/av1_rtcd.h"
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+struct NN_CONFIG {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
+
+#if CONFIG_NN_V2
+// Fully-connected layer configuration
+struct FC_LAYER {
+ const int num_inputs; // Number of input nodes, i.e. features.
+ const int num_outputs; // Number of output nodes.
+
+ float *weights; // Weight parameters.
+ float *bias; // Bias parameters.
+ const ACTIVATION activation; // Activation function.
+
+ float *output; // The output array.
+ float *dY; // Gradient of outputs
+ float *dW; // Gradient of weights.
+ float *db; // Gradient of bias
+};
+
+// NN configure structure V2
+struct NN_CONFIG_V2 {
+ const int num_hidden_layers; // Number of hidden layers, max = 10.
+ FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array
+ const int num_logits; // Number of output nodes.
+ float *logits; // Raw prediction (same as output of final layer)
+ const LOSS loss; // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output);
+#endif // CONFIG_NN_V2
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+// A faster but less accurate version of av1_nn_softmax(input, output, 16)
+void av1_nn_fast_softmax_16_c(const float *input, float *output);
+
+// Applies a precision reduction to output of av1_nn_predict to prevent
+// mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/mode_prune_model_weights.h b/third_party/aom/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 0000000000..98ec36808a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_HIDDEN_LAYERS_12 1
+#define NUM_FEATURES_12 6
+#define NUM_LAYER_0_UNITS_12 24
+#define NUM_LOGITS_12 2
+
+static const float av1_intrap_hiddenlayer_0_kernel_12[] = {
+ 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f,
+ -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f,
+ 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f,
+ -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f,
+ 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f,
+ 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f,
+ 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f,
+ 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f,
+ -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f,
+ 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f,
+ 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f,
+ 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f,
+ 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f,
+ -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f,
+ 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f,
+ -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f,
+ -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f,
+ 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f,
+ 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f,
+ -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f,
+ 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f,
+ -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f,
+ -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f,
+ 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f,
+ -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f,
+ -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f,
+ -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f,
+ 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f,
+ 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f
+};
+
+static const float av1_intrap_hiddenlayer_0_bias_12[] = {
+ -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f,
+ -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f,
+ 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f,
+ -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f
+};
+
+static const float av1_intrap_logits_kernel_12[] = {
+ 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f,
+ 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f,
+ 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f,
+ 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f,
+ 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f,
+ -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f,
+ -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f,
+ -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f,
+ -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f,
+ -0.30020145f, 0.066221856f, 0.37213042f
+};
+
+static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f };
+
+static const NN_CONFIG av1_intrap_nn_config = {
+ NUM_FEATURES_12, // 6
+ NUM_LOGITS_12, // 2
+ NUM_HIDDEN_LAYERS_12, // 1
+ {
+ NUM_LAYER_0_UNITS_12, // 24
+ },
+ {
+ av1_intrap_hiddenlayer_0_kernel_12, // Hidden layer 0 kernel table.
+ av1_intrap_logits_kernel_12, // Logits layer kernel table.
+ },
+ {
+ av1_intrap_hiddenlayer_0_bias_12, // Hidden layer 0 bias table.
+ av1_intrap_logits_bias_12, // Logits layer bias table.
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_12
+#undef NUM_FEATURES_12
+#undef NUM_LAYER_0_UNITS_12
+#undef NUM_LOGITS_12
+
+#define NUM_HIDDEN_LAYERS_15 1
+#define NUM_FEATURES_15 6
+#define NUM_LAYER_0_UNITS_15 24
+#define NUM_LOGITS_15 2
+
+static const float av1_intraph_hiddenlayer_0_kernel_15[] = {
+ -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f,
+ -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f,
+ -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f,
+ -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f,
+ -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f,
+ -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f,
+ -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f,
+ -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f,
+ -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f,
+ -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f,
+ -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f,
+ 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f,
+ 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f,
+ 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f,
+ -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f,
+ -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f,
+ 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f,
+ 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f,
+ 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f,
+ -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f,
+ -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f,
+ -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f,
+ 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f,
+ -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f,
+ -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f,
+ -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f,
+ -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f,
+ -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f,
+ -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f
+};
+
+static const float av1_intraph_hiddenlayer_0_bias_15[] = {
+ 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f,
+ 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f,
+ 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f,
+ 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f
+};
+
+static const float av1_intraph_logits_kernel_15[] = {
+ 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f,
+ 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f,
+ 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f,
+ 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f,
+ -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f,
+ -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f,
+ -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f,
+ -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f,
+ -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f,
+ -0.18334487f, -0.42385718f, -0.08033409f
+};
+
+static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f };
+
+static const NN_CONFIG av1_intrap_hd_nn_config = {
+ NUM_FEATURES_15, // 6
+ NUM_LOGITS_15, // 2
+ NUM_HIDDEN_LAYERS_15, // 1
+ {
+ NUM_LAYER_0_UNITS_15, // 24
+ },
+ {
+ av1_intraph_hiddenlayer_0_kernel_15, // Hidden layer 0 kernel table.
+ av1_intraph_logits_kernel_15, // Logits layer kernel table.
+ },
+ {
+ av1_intraph_hiddenlayer_0_bias_15, // Hidden layer 0 bias table.
+ av1_intraph_logits_bias_15, // Logits layer bias table.
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_15
+#undef NUM_FEATURES_15
+#undef NUM_LAYER_0_UNITS_15
+#undef NUM_LOGITS_15
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/model_rd.h b/third_party/aom/av1/encoder/model_rd.h
new file mode 100644
index 0000000000..f7e8b96b5b
--- /dev/null
+++ b/third_party/aom/av1/encoder/model_rd.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODEL_RD_H_
+#define AOM_AV1_ENCODER_MODEL_RD_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "config/aom_dsp_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
+
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
+// Returns the SSE between p->src and pd->dst over a bw x bh area, passed
+// through ROUND_POWER_OF_TWO(sse, 2 * (xd->bd - 8)) before it is returned.
+// Declared AOM_INLINE like the other helpers defined in this header.
+static AOM_INLINE int64_t calculate_sse(MACROBLOCKD *const xd,
+                                        const struct macroblock_plane *p,
+                                        struct macroblockd_plane *pd,
+                                        const int bw, const int bh) {
+  const int shift = xd->bd - 8;
+  int64_t sse;
+#if CONFIG_AV1_HIGHBITDEPTH
+  sse = is_cur_buf_hbd(xd)
+            ? aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                             pd->dst.stride, bw, bh)
+            : aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                      bw, bh);
+#else
+  sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+#endif
+  return ROUND_POWER_OF_TWO(sse, shift * 2);
+}
+
+// Computes the SSE of one plane of a block of size bsize. The plane block size
+// is derived from the plane's subsampling, and the measured width and height
+// are the ones reported by get_txb_dimensions().
+static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd,
+                                            int plane, const BLOCK_SIZE bsize) {
+  const struct macroblock_plane *const src_plane = &x->plane[plane];
+  struct macroblockd_plane *const dst_plane = &xd->plane[plane];
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(
+      bsize, dst_plane->subsampling_x, dst_plane->subsampling_y);
+  int width, height;
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                     &width, &height);
+  return calculate_sse(xd, src_plane, dst_plane, width, height);
+}
+
+// SSE-driven rate/distortion estimate for one plane. Uses either a cheap
+// closed-form approximation (rd_sf.simple_model_rd_from_var) or
+// av1_model_rd_from_var_lapndz(). *dist is left-shifted by 4 before returning.
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+                                         const MACROBLOCK *const x,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         int64_t sse, int num_samples,
+                                         int *rate, int64_t *dist) {
+  (void)num_samples;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int dequant_shift = is_cur_buf_hbd(xd) ? xd->bd - 5 : 3;
+  // Quantizer value from dequant_QTX[1], scaled down by dequant_shift.
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+
+  if (!cpi->sf.rd_sf.simple_model_rd_from_var) {
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], qstep,
+                                 rate, dist);
+  } else {
+    // Fast approximation: the rate is zero once qstep reaches 120.
+    *rate = 0;
+    if (qstep < 120) {
+      const int64_t cost = (sse * (280 - qstep)) >> (16 - AV1_PROB_COST_SHIFT);
+      *rate = (int)AOMMIN(cost, INT_MAX);
+    }
+    assert(*rate >= 0);
+    *dist = (sse * qstep) >> 8;
+  }
+  *dist <<= 4;
+}
+
+// Curve-fit model: estimates rate and distortion for one plane from the
+// feature log2(sse_norm / qstep^2), where sse_norm = sse / num_samples.
+// Reports a skip (rate 0, dist = sse << 4) when the fitted rate is 0 or when
+// skipping has no higher RD cost. Either of rate and dist may be NULL.
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *const x,
+                                             BLOCK_SIZE plane_bsize, int plane,
+                                             int64_t sse, int num_samples,
+                                             int *rate, int64_t *dist) {
+  (void)cpi;
+  // A zero SSE reports zero rate and zero distortion.
+  int rate_i = 0;
+  int64_t dist_i = 0;
+
+  if (sse != 0) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    const int dequant_shift = is_cur_buf_hbd(xd) ? xd->bd - 5 : 3;
+    const int qstep =
+        AOMMAX(x->plane[plane].dequant_QTX[1] >> dequant_shift, 1);
+    const int64_t skip_dist = sse << 4;
+    const double sse_norm = (double)sse / num_samples;
+    const double xqr = log2(sse_norm / ((double)qstep * qstep));
+    double fit_rate, fit_dist_by_sse_norm;
+    av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &fit_rate,
+                         &fit_dist_by_sse_norm);
+
+    // Scale the fitted values by num_samples, rounding to the nearest integer.
+    const double fit_dist = fit_dist_by_sse_norm * sse_norm;
+    rate_i = (int)(AOMMAX(0.0, fit_rate * num_samples) + 0.5);
+    dist_i = (int64_t)(AOMMAX(0.0, fit_dist * num_samples) + 0.5);
+
+    // Check if skip is better.
+    if (rate_i == 0 || RDCOST(x->rdmult, rate_i, dist_i) >=
+                           RDCOST(x->rdmult, 0, skip_dist)) {
+      rate_i = 0;
+      dist_i = skip_dist;
+    }
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+// Legacy model for a block: accumulates rate, distortion and SSE over planes
+// plane_from..plane_to using model_rd_from_sse(). Stops at the first non-zero
+// plane when xd->is_chroma_ref is unset. The optional outputs
+// (skip_txfm_sb, skip_sse_sb, plane_rate, plane_sse, plane_dist) may be NULL.
+static AOM_INLINE void model_rd_for_sb(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+  int64_t sse_total = 0;
+  int64_t rate_total = 0;
+  int64_t dist_total = 0;
+
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+
+    const int64_t sse = calculate_sse(xd, &x->plane[plane], pd, bw, bh);
+    int rate;
+    int64_t dist;
+    model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+    // The plane 0 SSE is stored per reference frame, saturated to UINT_MAX.
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+
+    sse_total += sse;
+    rate_total += rate;
+    dist_total += dist;
+    assert(rate_total >= 0);
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = (sse_total == 0);
+  if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+  *out_rate_sum = (int)AOMMIN(rate_total, INT_MAX);
+  *out_dist_sum = dist_total;
+}
+
+// Curve-fit counterpart of model_rd_for_sb(): sums model_rd_with_curvfit()
+// results over planes plane_from..plane_to. skip_txfm_sb, skip_sse_sb,
+// plane_rate, plane_sse and plane_dist are optional and may be NULL.
+static AOM_INLINE void model_rd_for_sb_with_curvfit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  const int ref = xd->mi[0]->ref_frame[0];
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    int bw, bh;
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &bw, &bh);
+    const int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+    int rate;
+    int64_t dist;
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                          &dist);
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    total_sse += sse;
+    rate_sum += rate;
+    dist_sum += dist;
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0;
+  if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+  // Saturate like model_rd_for_sb(): the 64-bit sum may not fit in an int.
+  *out_rate_sum = (int)AOMMIN(rate_sum, INT_MAX);
+  *out_dist_sum = dist_sum;
+}
+
+enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType);
+// Whole-block model functions; entries are in ModelRdType order.
+static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit
+};
+// Single-plane SSE model functions; entries are in ModelRdType order.
+static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_MODEL_RD_H_
diff --git a/third_party/aom/av1/encoder/motion_search_facade.c b/third_party/aom/av1/encoder/motion_search_facade.c
new file mode 100644
index 0000000000..e7eec29dc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.c
@@ -0,0 +1,1071 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+
+#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
+
+typedef struct {
+ int_mv fmv; // Candidate start MV; filled in through fmv.as_fullmv.
+ int weight; // Ranking weight; candidates are sorted by descending weight.
+} cand_mv_t;
+
+// qsort() comparator that orders cand_mv_t entries by descending weight.
+static int compare_weight(const void *a, const void *b) {
+  const int weight_a = ((const cand_mv_t *)a)->weight;
+  const int weight_b = ((const cand_mv_t *)b)->weight;
+  // Compare instead of subtracting so that the result cannot overflow; the
+  // heavier entry sorts first (returns -1 when a outweighs b, 1 when lighter).
+  return (weight_a < weight_b) - (weight_a > weight_b);
+}
+
+// Returns 1 only for screen content on an ARF_UPDATE frame at speed <= 2.
+static int use_fine_search_interval(const AV1_COMP *const cpi) {
+  const int speed_ok = cpi->oxcf.speed <= 2;
+  return cpi->is_screen_content_type && speed_ok &&
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE;
+}
+
+// Collects full-pel start-MV candidates for reference `ref` from the TPL MVs
+// that cover the current block. On entry cand[0] holds the start MV and
+// *cand_count the number of entries in use. TPL MVs that agree with an existing
+// candidate after RIGHT_SHIFT_MV() bump that candidate's weight; the rest are
+// appended. If every TPL MV is valid, *total_cand_weight is set and lists of
+// more than two entries are sorted by descending weight.
+static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *x,
+                                             BLOCK_SIZE bsize, int ref,
+                                             cand_mv_t *cand, int *cand_count,
+                                             int *total_cand_weight) {
+  const SuperBlockEnc *const sb_enc = &x->sb_enc;
+  if (!sb_enc->tpl_data_count) return;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const BLOCK_SIZE tpl_bsize =
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+  const int tpl_w = mi_size_wide[tpl_bsize];
+  const int tpl_h = mi_size_high[tpl_bsize];
+
+  // Number of TPL units the block spans in each direction; nothing to collect
+  // when the block is smaller than one TPL unit.
+  const int cols = mi_size_wide[bsize] / tpl_w;
+  const int rows = mi_size_high[bsize] / tpl_h;
+  if (cols < 1 || rows < 1) return;
+
+  // Index of the block's first TPL unit within sb_enc->tpl_mv, computed from
+  // the block's offset inside its superblock.
+  const int row_off = xd->mi_row % mi_size_high[cm->seq_params->sb_size];
+  const int col_off = xd->mi_col % mi_size_wide[cm->seq_params->sb_size];
+  const int first = row_off / tpl_h * sb_enc->tpl_stride + col_off / tpl_w;
+
+  // Assign large weight to start_mv, so it is always tested.
+  cand[0].weight = cols * rows;
+
+  for (int r = 0; r < rows; r++) {
+    for (int c = 0; c < cols; c++) {
+      const int_mv mv =
+          sb_enc->tpl_mv[first + r * sb_enc->tpl_stride + c][ref - LAST_FRAME];
+      // One invalid TPL MV abandons the scan: entries gathered so far stay in
+      // cand[], but *total_cand_weight is not written and nothing is sorted.
+      if (mv.as_int == INVALID_MV) return;
+
+      const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                               GET_MV_RAWPEL(mv.as_mv.col) };
+
+      // Find the first existing candidate that matches after RIGHT_SHIFT_MV().
+      int m = 0;
+      for (; m < *cand_count; m++) {
+        const FULLPEL_MV *const seen = &cand[m].fmv.as_fullmv;
+        if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(seen->row) &&
+            RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(seen->col)) {
+          break;
+        }
+      }
+
+      if (m < *cand_count) {
+        cand[m].weight++;
+      } else {
+        // No match: append the MV as a new candidate (m == *cand_count here).
+        cand[m].fmv.as_fullmv = fmv;
+        cand[m].weight = 1;
+        (*cand_count)++;
+      }
+    }
+  }
+
+  *total_cand_weight = 2 * rows * cols;
+  if (*cand_count > 2)
+    qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight);
+}
+
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ const int ref = mbmi->ref_frame[ref_idx];
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const MvCosts *mv_costs = x->mv_costs;
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
+ int step_param;
+ if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) {
+ // Take the weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param = (av1_init_search_range(x->max_mv_context[ref]) +
+ mv_search_params->mv_step_param) /
+ 2;
+ } else {
+ step_param = mv_search_params->mv_step_param;
+ }
+
+ const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+ FULLPEL_MV start_mv;
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+ start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+ else
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ // cand stores start_mv and all possible MVs in a SB.
+ cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1];
+ av1_zero(cand);
+ cand[0].fmv.as_fullmv = start_mv;
+ int cnt = 1;
+ int total_weight = 0;
+
+ if (!cpi->sf.mv_sf.full_pixel_search_level &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight);
+ }
+
+ const int cand_cnt = AOMMIN(2, cnt);
+ // TODO(any): Test the speed feature for OBMC_CAUSAL mode.
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ const int stack_size = args->start_mv_cnt;
+ for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) {
+ int_mv *fmv_cand = &cand[cand_idx].fmv;
+ int skip_cand_mv = 0;
+
+ // Check difference between mvs in the stack and candidate mv.
+ for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) {
+ const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx];
+ const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx];
+ const int this_newmv_valid =
+ args->single_newmv_valid[this_ref_mv_idx][ref];
+ const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row);
+ const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col);
+
+ if (!this_newmv_valid) continue;
+
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) {
+ // Prunes the current start_mv candidate, if the absolute mv
+ // difference of both row and column are <= 1.
+ if (row_diff <= 1 && col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) {
+ // Prunes the current start_mv candidate, if the sum of the absolute
+ // mv difference of row and column is <= 1.
+ if (row_diff + col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ }
+ }
+ if (skip_cand_mv) {
+ // Ensure atleast one full-pel motion search is not pruned.
+ assert(mbmi->ref_mv_idx != 0);
+ // Mark the candidate mv as invalid so that motion search gets skipped.
+ cand[cand_idx].fmv.as_int = INVALID_MV;
+ } else {
+ // Store start_mv candidate and corresponding ref_mv_idx of full-pel
+ // search in the mv stack (except last ref_mv_idx).
+ if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) {
+ assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2);
+ args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv;
+ args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx;
+ args->start_mv_cnt++;
+ }
+ }
+ }
+ }
+
+ // Hot fix for asan complaints when resize mode is on. When resize mode is on,
+ // the stride of the reference frame can be different from indicated by
+ // MotionVectorSearchParams::search_site_cfg. When this happens, we need to
+ // readjust the stride.
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_site_cfg =
+ av1_get_search_site_config(cpi, x, search_method);
+
+ // Further reduce the search range.
+ if (search_range < INT_MAX) {
+ const search_site_config *search_site_cfg =
+ &src_search_site_cfg[search_method_lookup[search_method]];
+ // Max step_param is search_site_cfg->num_search_steps.
+ if (search_range < 1) {
+ step_param = search_site_cfg->num_search_steps;
+ } else {
+ while (search_site_cfg->radius[search_site_cfg->num_search_steps -
+ step_param - 1] > (search_range << 1) &&
+ search_site_cfg->num_search_steps - step_param - 1 > 0)
+ step_param++;
+ }
+ }
+
+ int cost_list[5];
+ FULLPEL_MV_STATS best_mv_stats;
+ int_mv second_best_mv;
+ best_mv->as_int = second_best_mv.as_int = INVALID_MV;
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION: {
+ // Perform a search with the top 2 candidates
+ int sum_weight = 0;
+ for (int m = 0; m < cand_cnt; m++) {
+ int_mv smv = cand[m].fmv;
+ FULLPEL_MV this_best_mv, this_second_best_mv;
+ FULLPEL_MV_STATS this_mv_stats;
+
+ if (smv.as_int == INVALID_MV) continue;
+
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
+ src_search_site_cfg, search_method, fine_search_interval);
+
+ const int thissme =
+ av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &this_best_mv,
+ &this_mv_stats, &this_second_best_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_mv->as_fullmv = this_best_mv;
+ best_mv_stats = this_mv_stats;
+ second_best_mv.as_fullmv = this_second_best_mv;
+ }
+
+ sum_weight += cand[m].weight;
+ if (4 * sum_weight > 3 * total_weight) break;
+ }
+ } break;
+ case OBMC_CAUSAL:
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv, start_mv, src_search_site_cfg,
+ search_method, fine_search_interval);
+
+ bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
+ step_param, &best_mv->as_fullmv);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+ if (best_mv->as_int == INVALID_MV) return;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
+ // Terminate search with the current ref_idx based on fullpel mv, rate cost,
+ // and other know cost.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ int_mv this_mv;
+ this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ const int this_mv_rate =
+ av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+ mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+ mode_info[ref_mv_idx].full_mv_bestsme = bestsme;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ // Check if the motion search result same as previous results
+ if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+ // Compare the rate cost
+ const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost, then
+ // we terminate the search. Since av1_single_motion_search is only
+ // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the
+ // best_mv to INVALID mv to signal that we wish to terminate search
+ // for the current mode.
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+
+ // Terminate the evaluation of current ref_mv_idx based on bestsme and
+ // drl_cost.
+ const int psme = mode_info[prev_ref_idx].full_mv_bestsme;
+ if (psme == INT_MAX) continue;
+ const int thr =
+ cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme;
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 &&
+ mode_info[ref_mv_idx].full_mv_bestsme > thr &&
+ mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) {
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(best_mv);
+ }
+
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ int best_mv_rate = 0;
+ int mv_rate_calculated = 0;
+ if (use_fractional_mv) {
+ int_mv fractional_ms_list[3];
+ av1_set_fractional_mv(fractional_ms_list);
+ int dis; /* TODO: use dis in distortion calculation later. */
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+ if (mv_sf->use_accurate_subpel_search) {
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv->as_int &&
+ (mv_sf->disable_second_mv <= 1);
+ const int best_mv_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list);
+
+ if (try_second) {
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ int64_t rd = INT64_MAX;
+ if (!mv_sf->disable_second_mv) {
+ // Calculate actual rd cost.
+ mbmi->mv[0].as_mv = best_mv->as_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int this_mv_rate = av1_mv_bit_cost(
+ &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+ this_rd_stats.dist);
+ }
+
+ MV this_best_mv;
+ subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
+ subpel_start_mv)) {
+ unsigned int sse;
+ const int this_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv,
+ &dis, &sse, fractional_ms_list);
+
+ if (!mv_sf->disable_second_mv) {
+ // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
+ // to choose the better MV.
+ mbmi->mv[0].as_mv = this_best_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int tmp_mv_rate = av1_mv_bit_cost(
+ &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+ tmp_rd_stats.dist);
+ if (tmp_rd < rd) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ } else {
+ // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the
+ // best MV.
+ if (this_var < best_mv_var) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ }
+ }
+ }
+ } else {
+ mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL);
+ }
+ break;
+ case OBMC_CAUSAL:
+ av1_find_best_obmc_sub_pixel_tree_up(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis,
+ &x->pred_sse[ref], NULL);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+
+ // Terminate search with the current ref_idx based on subpel mv and rate
+ // cost.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ best_mv_rate =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mv_rate_calculated = 1;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ if (!args->single_newmv_valid[prev_ref_idx][ref]) continue;
+ // Check if the motion vectors are the same.
+ if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) {
+ // Skip this evaluation if the previous one is skipped.
+ if (mode_info[prev_ref_idx].skip) {
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ // Compare the rate cost that we current know.
+ const int prev_rate_cost =
+ args->single_newmv_rate[prev_ref_idx][ref] +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ best_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost,
+ // then we terminate the search for this ref_mv_idx.
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (mv_rate_calculated) {
+ *rate_mv = best_mv_rate;
+ } else {
+ *rate_mv =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+}
+
+ // Jointly refines both motion vectors of a compound block.
+ //
+ // The two references are refined alternately: each iteration keeps the MV of
+ // the 'other' reference fixed, builds its prediction into a local buffer and
+ // then runs a full-pixel search followed (when allowed) by a sub-pixel search
+ // on the current reference against that prediction.
+ //
+ // cur_mv is updated in place for every iteration that lowers the
+ // search error of the reference being refined.
+ // mask / mask_stride are forwarded to av1_set_ms_compound_refs().
+ // rate_mv receives the summed MV bit cost of cur_mv[0] and
+ // cur_mv[1] relative to their reference MVs.
+ // allow_second_mv, when non-zero, also lets the sub-pixel search
+ // start from the second best full-pixel MV.
+ // joint_me_num_refine_iter bounds the loop at twice this value.
+ //
+ // Returns the smaller of the two per-reference best errors; an entry stays at
+ // INT_MAX if no iteration improved that reference.
+ int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ const int plane = 0;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ // Snapshot of the incoming MVs, used by the early-termination test below.
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ const MvCosts *mv_costs = x->mv_costs;
+ int_mv ref_mv[2];
+ int ite, ref;
+
+ // Get the prediction block from the 'other' reference frame.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ InterPredParams inter_pred_params;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ // Best error found so far for each reference; INT_MAX means "none yet".
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+ // Prediction buffer from second frame.
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+
+ int_mv best_mv, second_best_mv;
+
+ // Allow joint search multiple times iteratively for each reference frame
+ // and break out of the search loop if it couldn't find a better mv.
+ for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+ // From the second pass over a reference onwards: if the other MV has not
+ // moved from its initial value and this MV is unchanged too (exactly, or
+ // after dropping the three fractional bits), stop refining.
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = av1_get_ref_mv(x, ref);
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ if (scaled_ref_frame[ref]) {
+ int i;
+ for (i = 0; i < num_planes; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL, num_planes);
+ }
+ }
+
+ assert(IMPLIES(scaled_ref_frame[0] != NULL,
+ cm->width == scaled_ref_frame[0]->y_crop_width &&
+ cm->height == scaled_ref_frame[0]->y_crop_height));
+ assert(IMPLIES(scaled_ref_frame[1] != NULL,
+ cm->width == scaled_ref_frame[1]->y_crop_width &&
+ cm->height == scaled_ref_frame[1]->y_crop_height));
+
+ // Initialize based on (possibly scaled) prediction buffers.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+ av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &cm->sf_identity, &ref_yv12[!id], interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ // Since we have scaled the reference frames to match the size of the
+ // current frame we must use a unit scaling factor during mode selection.
+ av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
+ &inter_pred_params);
+
+ // Do full-pixel compound motion search on the current reference frame.
+ // The search is run against slot 0, so the second reference is temporarily
+ // placed there when id == 1.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, id);
+
+ // Small-range full-pixel motion search.
+ if (!mv_sf->disable_extensive_joint_motion_search &&
+ mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats,
+ &second_best_mv.as_fullmv);
+ } else {
+ bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+ &best_mv.as_fullmv);
+ // No separate runner-up on this path; making it equal to best_mv turns
+ // try_second off below.
+ second_best_mv = best_mv;
+ }
+
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv.as_int &&
+ allow_second_mv;
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ // Re-initialize based on unscaled prediction buffers.
+ ref_yv12[ref] = xd->plane[plane].pre[ref];
+ }
+ }
+
+ // Do sub-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ if (bestsme < INT_MAX &&
+ cpi->common.features.cur_frame_force_integer_mv == 0) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, id);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL);
+
+ // Optionally repeat the sub-pixel search from the runner-up full-pixel MV
+ // and keep whichever result has the lower error.
+ if (try_second) {
+ MV this_best_mv;
+ MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
+ const int thissme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis,
+ &sse, NULL);
+ if (thissme < bestsme) {
+ best_mv.as_mv = this_best_mv;
+ bestsme = thissme;
+ }
+ }
+ }
+ }
+
+ // Restore the pointer to the first prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+ // Accept the new MV only if it improves on the best error recorded for this
+ // reference; otherwise the whole refinement stops.
+ if (bestsme < last_besterr[id]) {
+ cur_mv[id] = best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ // Accumulate the bit cost of both final MVs.
+ for (ref = 0; ref < 2; ++ref) {
+ const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+ mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+
+ return AOMMIN(last_besterr[0], last_besterr[1]);
+ }
+
+ // Search for the best mv for one component of a compound,
+ // given that the other component is fixed.
+ //
+ // this_mv is the starting MV on entry; it is overwritten with the
+ // search result when the final error is below INT_MAX.
+ // second_pred is the prediction of the fixed component, forwarded
+ // together with mask / mask_stride to
+ // av1_set_ms_compound_refs().
+ // rate_mv receives the bit cost of *this_mv relative to the
+ // reference MV of ref_idx.
+ // ref_idx selects which of the block's references is searched.
+ //
+ // Returns the error of the last search stage that ran (sub-pixel when it is
+ // performed, otherwise full-pixel).
+ int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int ref = mbmi->ref_frame[ref_idx];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const MvCosts *mv_costs = x->mv_costs;
+
+ struct buf_2d backup_yv12[MAX_MB_PLANE];
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ // Only plane 0 is swapped here; orig_yv12 is left unset (and unused) when
+ // ref_idx == 0.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+ orig_yv12 = pd->pre[0];
+ pd->pre[0] = pd->pre[ref_idx];
+ }
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ // NOTE(review): the backup is read from slot ref_idx but restored into slot
+ // 0 further down. For plane 0 both slots hold the same buffer at this
+ // point; for the other planes with ref_idx == 1 they need not, and only
+ // plane 0 is reset at the end of the function. Confirm that no caller
+ // relies on pre[0] of the remaining planes afterwards.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // The index below needs to be 0 instead of ref_idx since we assume the
+ // 0th slot to be used for subsequent searches. Note that the ref_idx
+ // reference buffer has been copied to the 0th slot in the code above.
+ // Now we need to swap the reference frame for the 0th slot.
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ int bestsme = INT_MAX;
+ int_mv best_mv;
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, ref_idx);
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search for the 0th slot.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ // Sub-pixel refinement runs only if the full-pixel search produced a result
+ // and the frame is not restricted to integer MVs.
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv,
+ NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, ref_idx);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis,
+ &sse, NULL);
+ }
+
+ // Restore the pointer to the first unscaled prediction buffer.
+ if (ref_idx) pd->pre[0] = orig_yv12;
+
+ // Leave the caller's MV untouched when no valid result was found.
+ if (bestsme < INT_MAX) *this_mv = best_mv.as_mv;
+
+ *rate_mv = 0;
+
+ *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ return bestsme;
+ }
+
+ // Builds the luma prediction of the reference that is NOT being searched
+ // (index !ref_idx) using other_mv, and writes it to second_pred with a row
+ // stride equal to the block width.
+ static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                const MV *other_mv, int ref_idx,
+                                                uint8_t *second_pred) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const MB_MODE_INFO *const mi = xd->mi[0];
+
+   // Only compound blocks have an 'other' reference to predict from.
+   assert(has_second_ref(mi));
+
+   const AV1_COMMON *const cm = &cpi->common;
+   const int plane = 0;
+   const struct macroblockd_plane *const luma = &xd->plane[plane];
+
+   // Block dimensions and position, the latter adjusted by the plane's
+   // subsampling shifts.
+   const int blk_w = block_size_wide[bsize];
+   const int blk_h = block_size_high[bsize];
+   const int pos_row = (xd->mi_row * MI_SIZE) >> luma->subsampling_y;
+   const int pos_col = (xd->mi_col * MI_SIZE) >> luma->subsampling_x;
+
+   // Local copy of the other reference's buffer descriptor.
+   struct buf_2d other_ref = luma->pre[!ref_idx];
+
+   // Scale factors are derived from the reference buffer size versus the
+   // current frame size.
+   struct scale_factors scale;
+   av1_setup_scale_factors_for_frame(&scale, other_ref.width, other_ref.height,
+                                     cm->width, cm->height);
+
+   InterPredParams pred_params;
+   av1_init_inter_params(&pred_params, blk_w, blk_h, pos_row, pos_col,
+                         luma->subsampling_x, luma->subsampling_y, xd->bd,
+                         is_cur_buf_hbd(xd), 0, &scale, &other_ref,
+                         mi->interp_filters);
+   pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+
+   // Produce the prediction block for the 'other' reference frame.
+   av1_enc_build_one_inter_predictor(second_pred, blk_w, other_mv,
+                                     &pred_params);
+ }
+
+ // Convenience entry point for av1_compound_single_motion_search() when the
+ // fixed component is itself an inter prediction: the prediction for
+ // cur_mv[!ref_idx] is built into a local buffer first, then cur_mv[ref_idx]
+ // is refined against it. Returns the value of the underlying search.
+ int av1_compound_single_motion_search_interinter(
+     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+     const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   // Compound modes only.
+   assert(has_second_ref(xd->mi[0]));
+
+   // Storage for the fixed component's prediction. It is declared as 16-bit
+   // samples and viewed through the pointer form matching the buffer type.
+   DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+   uint8_t *const second_pred = is_cur_buf_hbd(xd)
+                                    ? CONVERT_TO_BYTEPTR(second_pred_alloc_16)
+                                    : (uint8_t *)second_pred_alloc_16;
+
+   const int fixed_idx = !ref_idx;
+   build_second_inter_pred(cpi, x, bsize, &cur_mv[fixed_idx].as_mv, ref_idx,
+                           second_pred);
+
+   return av1_compound_single_motion_search(cpi, x, bsize,
+                                            &cur_mv[ref_idx].as_mv, second_pred,
+                                            mask, mask_stride, rate_mv, ref_idx);
+ }
+
+ // Runs the masked compound motion search selected by 'which':
+ //   0 - refine only the first MV,
+ //   1 - refine only the second MV,
+ //   2 - refine both MVs jointly.
+ // tmp_mv is seeded from cur_mv and receives the refined vectors; any other
+ // value of 'which' leaves tmp_mv equal to cur_mv and rate_mv untouched.
+ static AOM_INLINE void do_masked_motion_search_indexed(
+     const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+     const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+     int_mv *tmp_mv, int *rate_mv, int which) {
+   const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+
+   // The mask is looked up with the block size stored in the mode info, while
+   // its stride is the width of the bsize argument.
+   const uint8_t *const mask =
+       av1_get_compound_type_mask(comp_data, mbmi->bsize);
+   const int mask_stride = block_size_wide[bsize];
+
+   for (int i = 0; i < 2; ++i) tmp_mv[i].as_int = cur_mv[i].as_int;
+
+   switch (which) {
+     case 0:
+     case 1:
+       av1_compound_single_motion_search_interinter(
+           cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv, which);
+       break;
+     case 2: {
+       const int fast_level = cpi->sf.inter_sf.enable_fast_compound_mode_search;
+       const int joint_me_num_refine_iter = (fast_level == 2)
+                                                ? REDUCED_JOINT_ME_REFINE_ITER
+                                                : NUM_JOINT_ME_REFINE_ITER;
+       const int allow_second_mv = !cpi->sf.mv_sf.disable_second_mv;
+       av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride,
+                               rate_mv, allow_second_mv,
+                               joint_me_num_refine_iter);
+       break;
+     }
+     default: break;
+   }
+ }
+
+ // Refines the NEWMV component(s) of a compound inter mode and stores the
+ // result in the block's mode info. Returns the MV rate reported by the
+ // search, or 0 when this_mode has no NEWMV component handled here.
+ int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                           MACROBLOCK *x,
+                                           const int_mv *const cur_mv,
+                                           const BLOCK_SIZE bsize,
+                                           const PREDICTION_MODE this_mode) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   INTERINTER_COMPOUND_DATA *const comp = &mbmi->interinter_comp;
+
+   // TODO(jingning): The average compound mode has proper SAD and variance
+   // functions implemented, and is triggered by setting the mask pointer to
+   // NULL. Need to further implement those for frame distance weighted mode.
+   if (comp->type == COMPOUND_AVERAGE) {
+     comp->seg_mask = NULL;
+   } else {
+     comp->seg_mask = xd->seg_mask;
+   }
+
+   // Decide which component(s) to search:
+   //   2 - both (NEW_NEWMV),
+   //   1 - second only (NEAREST_NEWMV, NEAR_NEWMV),
+   //   0 - first only  (NEW_NEARESTMV, NEW_NEARMV).
+   int which;
+   if (this_mode == NEW_NEWMV) {
+     which = 2;
+   } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) {
+     which = (NEWMV == compound_ref1_mode(this_mode));
+   } else {
+     return 0;
+   }
+
+   int_mv refined_mv[2];
+   int rate = 0;
+   do_masked_motion_search_indexed(cpi, x, cur_mv, comp, bsize, refined_mv,
+                                   &rate, which);
+
+   if (which == 2) {
+     mbmi->mv[0].as_int = refined_mv[0].as_int;
+     mbmi->mv[1].as_int = refined_mv[1].as_int;
+   } else {
+     mbmi->mv[which].as_int = refined_mv[which].as_int;
+   }
+   return rate;
+ }
+
+ // Runs a single-reference SIMPLE_TRANSLATION motion search for the block at
+ // (mi_row, mi_col) against reference 'ref', starting from start_mv, and
+ // reports the resulting prediction error.
+ //
+ // num_planes must be 1 (asserted below).
+ // use_subpixel, when non-zero, allows sub-pixel refinement; it is still
+ // skipped for integer-MV frames, when the speed feature
+ // forces FULL_PEL, or when the full-pixel search returned
+ // INT_MAX.
+ // sse / var are outputs: with sub-pixel search they come from the variance
+ // function applied to the rebuilt prediction, otherwise from
+ // the full-pixel search statistics.
+ //
+ // Side effects visible here: the mode info of the block (bsize, ref_frame,
+ // motion_mode, interp_filters and, on the sub-pixel path, mv[0]) is
+ // overwritten, and x->pred_sse[ref] is written on the sub-pixel path.
+ //
+ // Returns the best MV found (converted from full-pixel units when no
+ // sub-pixel search was done).
+ int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ FULLPEL_MV start_mv, int num_planes,
+ int use_subpixel, unsigned int *sse,
+ unsigned int *var) {
+ assert(num_planes == 1 &&
+ "Currently simple_motion_search only supports luma plane");
+ assert(!frame_is_intra_only(&cpi->common) &&
+ "Simple motion search only enabled for non-key frames");
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ // Configure the block as a plain single-reference inter block.
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->bsize = bsize;
+ mbmi->ref_frame[0] = ref;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ struct buf_2d backup_yv12;
+ // ref_mv is used to calculate the cost of the motion vector
+ const MV ref_mv = kZeroMv;
+ // Start step: the configured step plus the speed-feature reduction, capped
+ // at MAX_MVSEARCH_STEPS - 2.
+ const int step_param =
+ AOMMIN(cpi->mv_search_params.mv_step_param +
+ cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+ MAX_MVSEARCH_STEPS - 2);
+ int cost_list[5];
+ const int ref_idx = 0;
+ int bestsme;
+ int_mv best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, ref), num_planes);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // With a scaled reference available, use it for the full-pixel search and
+ // remember the luma buffer so it can be put back afterwards.
+ if (scaled_ref_frame) {
+ backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+ start_mv, src_search_sites, search_method,
+ fine_search_interval);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ const int use_subpel_search =
+ bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+ use_subpixel &&
+ (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL);
+ // Put the original luma reference buffer back before any sub-pixel search.
+ if (scaled_ref_frame) {
+ xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+ }
+ if (use_subpel_search) {
+ int not_used = 0;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
+ ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv,
+ &not_used, &x->pred_sse[ref], NULL);
+
+ mbmi->mv[0] = best_mv;
+
+ // Get a copy of the prediction output
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ // Variance between the source block and the prediction just built; the
+ // variance function also writes *sse.
+ *var = cpi->ppi->fn_ptr[bsize].vf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, sse);
+ } else {
+ // Manually convert from units of pixel to 1/8-pixels if we are not doing
+ // subpel search
+ convert_fullmv_to_mv(&best_mv);
+ *var = best_mv_stats.distortion;
+ *sse = best_mv_stats.sse;
+ }
+
+ return best_mv;
+ }
diff --git a/third_party/aom/av1/encoder/motion_search_facade.h b/third_party/aom/av1/encoder/motion_search_facade.h
new file mode 100644
index 0000000000..d1fa915bca
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_
+#define AOM_AV1_ENCODER_MOTION_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_JOINT_ME_REFINE_ITER 2
+#define REDUCED_JOINT_ME_REFINE_ITER 1
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_modes_info, which makes this terribly confusing.
+ // Per ref_mv_idx record shared between the NEWMV searches of one block, so a
+ // later candidate can be compared against, or skipped because of, an earlier
+ // one (see av1_single_motion_search()).
+ typedef struct {
+ // Rate cost associated with this ref_mv_idx; compared across candidates.
+ int drl_cost;
+ // presumably the MV found by the full-pixel search - TODO confirm
+ int_mv full_search_mv;
+ // presumably the bit cost of full_search_mv - TODO confirm
+ int full_mv_rate;
+ // Full-pixel search error for this candidate; INT_MAX is treated as "no
+ // result" by av1_single_motion_search().
+ int full_mv_bestsme;
+ // Set to 1 when the search for this ref_mv_idx is terminated early.
+ int skip;
+ } inter_mode_info;
+
+struct HandleInterModeArgs;
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args);
+
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter);
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode);
+
+int av1_compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
+
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx);
+
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref and calculates the sse and var of the residue. Note that this sets the
+// offset of mbmi, so we will need to reset it after calling this function.
+int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ const FULLPEL_MV start_mv,
+ int num_planes, int use_subpixel,
+ unsigned int *sse, unsigned int *var);
+
+ // Returns a search site config array whose stride matches the luma stride of
+ // the block's first reference buffer.
+ //
+ // The compressor-level caches (source, then lookahead) are tried first. If
+ // neither matches, the per-thread buffer in MACROBLOCK is returned, after
+ // being refreshed for the canonical search method when its stride is stale.
+ static AOM_INLINE const search_site_config *av1_get_search_site_config(
+     const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
+   const int wanted_stride = x->e_mbd.plane[0].pre[0].stride;
+
+   // AV1_COMP::mv_search_params.search_site_config is shared by all threads.
+   // When every frame has the same resolution it normally already holds the
+   // config we are after.
+   const MotionVectorSearchParams *const params = &cpi->mv_search_params;
+
+   const search_site_config *const src_cfg =
+       params->search_site_cfg[SS_CFG_SRC];
+   if (src_cfg->stride == wanted_stride) return src_cfg;
+
+   const search_site_config *const lookahead_cfg =
+       params->search_site_cfg[SS_CFG_LOOKAHEAD];
+   if (lookahead_cfg->stride == wanted_stride) return lookahead_cfg;
+
+   // Cache miss: fall back to the thread-level buffer
+   // MACROBLOCK::search_site_cfg_buf, bringing it up to date if needed.
+   const SEARCH_METHODS canonical_method = search_method_lookup[search_method];
+   assert(search_method_lookup[canonical_method] == canonical_method &&
+          "The search_method_lookup table should be idempotent.");
+
+   search_site_config *const thread_cfg = x->search_site_cfg_buf;
+   if (thread_cfg[canonical_method].stride != wanted_stride) {
+     av1_refresh_search_site_config(thread_cfg, canonical_method,
+                                    wanted_stride);
+   }
+   return thread_cfg;
+ }
+
+ // Maps a search method to the next faster one. Methods that have no faster
+ // alternative in this table map to themselves.
+ static AOM_INLINE SEARCH_METHODS
+ av1_get_faster_search_method(SEARCH_METHODS search_method) {
+   // Search methods ordered from most to least accurate:
+   //   1. NSTEP
+   //   2. DIAMOND
+   //   3. BIGDIA \approx SQUARE
+   //   4. HEX.
+   //   5. FAST_HEX \approx FAST_DIAMOND
+   switch (search_method) {
+     case NSTEP:
+     case NSTEP_8PT: return DIAMOND;
+
+     case DIAMOND:
+     case CLAMPED_DIAMOND: return BIGDIA;
+
+     case BIGDIA:
+     case SQUARE: return HEX;
+
+     case HEX:
+     case FAST_HEX: return FAST_HEX;
+
+     case FAST_DIAMOND:
+     case VFAST_DIAMOND: return VFAST_DIAMOND;
+
+     case FAST_BIGDIA: return FAST_BIGDIA;
+
+     default: assert(0 && "Invalid search method!"); return DIAMOND;
+   }
+ }
+
+ // Picks the motion search method for a block: the speed-feature default, or
+ // the next faster method when the block-size dependent speed feature applies.
+ static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method(
+     const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) {
+   const int level = mv_sf->use_bsize_dependent_search_method;
+   const int shortest_side =
+       AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+
+   bool go_faster;
+   if (level == 1) {
+     // Level 1: blocks whose shorter side is at least 32.
+     go_faster = shortest_side >= 32;
+   } else if (level >= 2) {
+     // Level 2 and up: shorter side at least 16, source SAD no higher than
+     // kMedSad, and qindex outside the top quarter of its range.
+     const int qband = x->qindex >> (QINDEX_BITS - 2);
+     go_faster = shortest_side >= 16 &&
+                 x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3;
+   } else {
+     go_faster = false;
+   }
+
+   return go_faster ? av1_get_faster_search_method(mv_sf->search_method)
+                    : mv_sf->search_method;
+ }
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/mv_prec.c b/third_party/aom/av1/encoder/mv_prec.c
new file mode 100644
index 0000000000..b64f4dcd0e
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encodemv.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/misc_model_weights.h"
+#endif // !CONFIG_REALTIME_ONLY
+#include "av1/encoder/mv_prec.h"
+
+#if !CONFIG_REALTIME_ONLY
+// Returns the ref mv that the mv for `ref_idx` is differenced against.
+static AOM_INLINE int_mv get_ref_mv_for_mv_stats(
+    const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+    int ref_idx) {
+  // NEAR_NEWMV and NEW_NEARMV use the stack entry after mbmi->ref_mv_idx.
+  const int uses_near = mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV;
+  assert(!uses_near || has_second_ref(mbmi));
+  const int stack_idx = mbmi->ref_mv_idx + uses_near;
+  const CANDIDATE_MV *const mv_stack = mbmi_ext_frame->ref_mv_stack;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+  if (mbmi->ref_frame[1] > INTRA_FRAME) {
+    // Two references: pick the candidate's mv for the requested reference.
+    assert(ref_idx == 0 || ref_idx == 1);
+    if (ref_idx) return mv_stack[stack_idx].comp_mv;
+    return mv_stack[stack_idx].this_mv;
+  }
+
+  // One reference: use the global mv when the stack has no such entry.
+  assert(ref_idx == 0);
+  if (stack_idx >= mbmi_ext_frame->ref_mv_count)
+    return mbmi_ext_frame->global_mvs[ref_frame_type];
+  return mv_stack[stack_idx].this_mv;
+}
+
+// Rate of `symbol`: av1_cost_symbol() of its CDF step, floored at EC_MIN_PROB.
+static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) {
+  const aom_cdf_prob upper = AOM_ICDF(cdf[symbol]);
+  const aom_cdf_prob lower = (symbol == 0) ? 0 : AOM_ICDF(cdf[symbol - 1]);
+  const aom_cdf_prob prob = AOMMAX(upper - lower, EC_MIN_PROB);
+  return av1_cost_symbol(prob);
+}
+
+// Computes the rate of one non-zero mv difference component and returns it.
+// rates[0..4] receive the sign, class, integer, fractional and high precision
+// rates. Each CDF of the tile context is updated right after it is costed.
+static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val,
+                                         int comp_idx, const AV1_COMP *cpi,
+                                         int *rates) {
+  assert(comp_val != 0 && "mv component should not have zero value!");
+  const int use_hp = cpi->common.features.allow_high_precision_mv;
+  nmv_component *const comp = &cpi->td.mb.e_mbd.tile_ctx->nmvc.comps[comp_idx];
+
+  // Decompose |comp_val| - 1 into its class and the offset within the class.
+  const int is_negative = comp_val < 0;
+  const int magnitude = is_negative ? -comp_val : comp_val;
+  int offset;
+  const int mv_class = av1_get_mv_class(magnitude - 1, &offset);
+  const int int_part = offset >> 3;         // int mv data
+  const int frac_part = (offset >> 1) & 3;  // fractional mv data
+  const int high_part = offset & 1;         // high precision mv data
+
+  // A zero mv_class selects the class0 variants of the fp/hp CDFs.
+  aom_cdf_prob *const fp_cdf =
+      mv_class ? comp->fp_cdf : comp->class0_fp_cdf[int_part];
+  aom_cdf_prob *const hp_cdf = mv_class ? comp->hp_cdf : comp->class0_hp_cdf;
+
+  // Sign.
+  const int sign_rate = get_symbol_cost(comp->sign_cdf, is_negative);
+  update_cdf(comp->sign_cdf, is_negative, 2);
+
+  // Class.
+  const int class_rate = get_symbol_cost(comp->classes_cdf, mv_class);
+  update_cdf(comp->classes_cdf, mv_class, MV_CLASSES);
+
+  // Integer part: one symbol for class 0, otherwise one binary symbol per bit.
+  int int_rate = 0;
+  if (mv_class != MV_CLASS_0) {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < num_bits; ++bit) {
+      const int bit_val = (int_part >> bit) & 1;
+      int_rate += get_symbol_cost(comp->bits_cdf[bit], bit_val);
+      update_cdf(comp->bits_cdf[bit], bit_val, 2);
+    }
+  } else {
+    int_rate = get_symbol_cost(comp->class0_cdf, int_part);
+    update_cdf(comp->class0_cdf, int_part, CLASS0_SIZE);
+  }
+
+  // Fractional part.
+  const int frac_rate = get_symbol_cost(fp_cdf, frac_part);
+  update_cdf(fp_cdf, frac_part, MV_FP_SIZE);
+
+  // High precision bit: only costed when high precision mvs are allowed.
+  int hp_rate = 0;
+  if (use_hp) {
+    hp_rate = get_symbol_cost(hp_cdf, high_part);
+    update_cdf(hp_cdf, high_part, 2);
+  }
+
+  rates[0] = sign_rate;
+  rates[1] = class_rate;
+  rates[2] = int_rate;
+  rates[3] = frac_rate;
+  rates[4] = hp_rate;
+  // The hp bit is tallied regardless of use_hp.
+  mv_stats->last_bit_zero += !high_part;
+  mv_stats->last_bit_nonzero += high_part;
+  return sign_rate + class_rate + int_rate + frac_rate + hp_rate;
+}
+
+// Adds the rate of coding `cur_mv` relative to `ref_mv` to `mv_stats`, along
+// with high precision (hp) and low precision (lp) rate estimates, and updates
+// the tile context's mv CDFs with the coded symbols.
+static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv,
+                                        const MV *cur_mv, const AV1_COMP *cpi) {
+  const int use_hp = cpi->common.features.allow_high_precision_mv;
+  aom_cdf_prob *const joint_cdf = cpi->td.mb.e_mbd.tile_ctx->nmvc.joints_cdf;
+
+  const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col };
+  // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp
+  const MV hp_diff = diff;
+
+  // With hp in use, the lp estimate rounds each component toward zero to even.
+  MV lp_diff = diff;
+  if (use_hp) {
+    lp_diff.row = (diff.row / 2) * 2;
+    lp_diff.col = (diff.col / 2) * 2;
+  }
+
+  // All three joint rates are read before the joint CDF is updated.
+  const int mv_joint = av1_get_mv_joint(&diff);
+  mv_stats->total_mv_rate += get_symbol_cost(joint_cdf, mv_joint);
+  mv_stats->hp_total_mv_rate +=
+      get_symbol_cost(joint_cdf, av1_get_mv_joint(&hp_diff));
+  mv_stats->lp_total_mv_rate +=
+      get_symbol_cost(joint_cdf, av1_get_mv_joint(&lp_diff));
+  update_cdf(joint_cdf, mv_joint, MV_JOINTS);
+  mv_stats->mv_joint_count[mv_joint]++;
+
+  // Per-component rates: index 0 is the row, index 1 the column.
+  const int comp_vals[2] = { diff.row, diff.col };
+  const int hp_comp_vals[2] = { hp_diff.row, hp_diff.col };
+  const int lp_comp_vals[2] = { lp_diff.row, lp_diff.col };
+  for (int comp_idx = 0; comp_idx < 2; comp_idx++) {
+    int rates[5] = { 0 };
+    int comp_rate = 0;
+    if (comp_vals[comp_idx]) {
+      comp_rate = keep_one_comp_stat(mv_stats, comp_vals[comp_idx], comp_idx,
+                                     cpi, rates);
+    }
+    // rates[4] is the high precision rate, which the lp estimate leaves out.
+    const int lp_rate = rates[0] + rates[1] + rates[2] + rates[3];
+    // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false
+    const int hp_rate = lp_rate + rates[4];
+
+    mv_stats->total_mv_rate += comp_rate;
+    mv_stats->hp_total_mv_rate += hp_comp_vals[comp_idx] ? hp_rate : 0;
+    mv_stats->lp_total_mv_rate += lp_comp_vals[comp_idx] ? lp_rate : 0;
+  }
+}
+
+// Collects inter/intra, mv rate and source texture statistics for one block.
+static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats,
+                                          const AV1_COMP *cpi, int mi_row,
+                                          int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  const MB_MODE_INFO *mbmi =
+      mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col];
+  const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame =
+      cpi->mbmi_ext_info.frame_base +
+      get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+                     cpi->mbmi_ext_info.stride);
+
+  if (!is_inter_block(mbmi)) {
+    mv_stats->intra_count++;
+    return;
+  }
+  mv_stats->inter_count++;
+
+  const PREDICTION_MODE mode = mbmi->mode;
+  const int is_compound = has_second_ref(mbmi);
+  switch (mode) {
+    case NEWMV:
+    case NEW_NEWMV:
+      // All mvs are new
+      for (int ref_idx = 0; ref_idx <= is_compound; ++ref_idx) {
+        const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+        const int_mv ref_mv =
+            get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx);
+        keep_one_mv_stat(mv_stats, &ref_mv.as_mv, &cur_mv, cpi);
+      }
+      break;
+    case NEAREST_NEWMV:
+    case NEAR_NEWMV:
+    case NEW_NEARESTMV:
+    case NEW_NEARMV: {
+      // has exactly one new_mv; the other mv is tallied as a default mv
+      const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV);
+      const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+      const int_mv ref_mv =
+          get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx);
+      mv_stats->default_mvs += 1;
+      keep_one_mv_stat(mv_stats, &ref_mv.as_mv, &cur_mv, cpi);
+      break;
+    }
+    default:
+      // No new_mv
+      mv_stats->default_mvs += 1 + is_compound;
+  }
+
+  // Add texture information
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int num_rows = block_size_high[bsize];
+  const int num_cols = block_size_wide[bsize];
+  const int y_stride = cpi->source->y_stride;
+  const int px_offset = (4 * mi_row) * y_stride + 4 * mi_col;
+  if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bitdepth differences are shifted down by (bit_depth - 8).
+    const int shift = cm->seq_params->bit_depth - 8;
+    const uint16_t *const src =
+        CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_offset;
+    for (int row = 0; row < num_rows - 1; row++) {
+      const uint16_t *const cur = src + row * y_stride;
+      for (int col = 0; col < num_cols - 1; col++) {
+        const int horz_diff = abs(cur[col + 1] - cur[col]) >> shift;
+        const int vert_diff = abs(cur[col + y_stride] - cur[col]) >> shift;
+        mv_stats->horz_text += horz_diff;
+        mv_stats->vert_text += vert_diff;
+        mv_stats->diag_text += horz_diff * vert_diff;
+      }
+    }
+  } else {
+    const uint8_t *const src = cpi->source->y_buffer + px_offset;
+    for (int row = 0; row < num_rows - 1; row++) {
+      const uint8_t *const cur = src + row * y_stride;
+      for (int col = 0; col < num_cols - 1; col++) {
+        const int horz_diff = abs(cur[col + 1] - cur[col]);
+        const int vert_diff = abs(cur[col + y_stride] - cur[col]);
+        mv_stats->horz_text += horz_diff;
+        mv_stats->vert_text += vert_diff;
+        mv_stats->diag_text += horz_diff * vert_diff;
+      }
+    }
+  }
+}
+
+// Walks the partition tree of the bsize block at (mi_row, mi_col) and collects
+// stats for each coding block; PARTITION_SPLIT recurses into the quadrants.
+static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
+                                           const AV1_COMP *cpi, int mi_row,
+                                           int mi_col, BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *cm = &cpi->common;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+    return;
+  }
+
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  const int half = mi_size_wide[bsize] / 2;     // half the block, in mi units
+  const int quarter = mi_size_wide[bsize] / 4;  // a quarter of the block
+  const int mid_row = mi_row + half;
+  const int mid_col = mi_col + half;
+  switch (partition) {
+    case PARTITION_NONE:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mi_col);
+      break;
+    case PARTITION_VERT:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mid_col);
+      break;
+    case PARTITION_SPLIT:
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mid_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mid_row, mi_col, subsize);
+      collect_mv_stats_sb(mv_stats, cpi, mid_row, mid_col, subsize);
+      break;
+    case PARTITION_HORZ_A:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mid_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mid_col);
+      break;
+    case PARTITION_VERT_A:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mid_col);
+      break;
+    case PARTITION_VERT_B:
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+      collect_mv_stats_b(mv_stats, cpi, mi_row, mid_col);
+      collect_mv_stats_b(mv_stats, cpi, mid_row, mid_col);
+      break;
+    case PARTITION_HORZ_4:
+      for (int k = 0; k < 4; ++k) {
+        collect_mv_stats_b(mv_stats, cpi, mi_row + k * quarter, mi_col);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (int k = 0; k < 4; ++k) {
+        collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + k * quarter);
+      }
+      break;
+    default: assert(0);
+  }
+}
+
+// Collects mv stats for each superblock of the tile, scanning superblock rows
+// from top to bottom and left to right within a row.
+static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats,
+                                             const AV1_COMP *cpi,
+                                             const TileInfo *tile_info) {
+  const AV1_COMMON *cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int sb_mi = cm->seq_params->mib_size;
+  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+       mi_row += sb_mi) {
+    for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+         mi_col += sb_mi) {
+      collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
+    }
+  }
+}
+
+// Gathers mv statistics of the current frame into cpi->mv_stats. Each tile
+// starts from its own copy of the frame context (cm->fc).
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MV_STATS *const mv_stats = &cpi->mv_stats;
+  for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, tile_row);
+    for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+      FRAME_CONTEXT *const tctx =
+          &cpi->tile_data[tile_row * cm->tiles.cols + tile_col].tctx;
+      av1_tile_set_col(&tile_info, cm, tile_col);
+      *tctx = *cm->fc;
+      cpi->td.mb.e_mbd.tile_ctx = tctx;
+      collect_mv_stats_tile(mv_stats, cpi, &tile_info);
+    }
+  }
+
+  mv_stats->q = current_q;
+  mv_stats->order = cm->current_frame.order_hint;
+  mv_stats->valid = 1;
+}
+
+// Runs the mv precision model on `mv_stats` and the current frame's q and
+// order hint. Returns 1 when high precision mvs should be used, 0 otherwise.
+static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
+                                        int current_q) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int order_diff = (int)cm->current_frame.order_hint - mv_stats->order;
+  // Multiply in float: width * height can overflow int for very large frames.
+  const float area = (float)cm->width * (float)cm->height;
+  float features[MV_PREC_FEATURE_SIZE] = {
+    (float)current_q,
+    (float)mv_stats->q,
+    (float)order_diff,
+    mv_stats->inter_count / area,
+    mv_stats->intra_count / area,
+    mv_stats->default_mvs / area,
+    mv_stats->mv_joint_count[0] / area,
+    mv_stats->mv_joint_count[1] / area,
+    mv_stats->mv_joint_count[2] / area,
+    mv_stats->mv_joint_count[3] / area,
+    mv_stats->last_bit_zero / area,
+    mv_stats->last_bit_nonzero / area,
+    mv_stats->total_mv_rate / area,
+    mv_stats->hp_total_mv_rate / area,
+    mv_stats->lp_total_mv_rate / area,
+    mv_stats->horz_text / area,
+    mv_stats->vert_text / area,
+    mv_stats->diag_text / area,
+  };
+
+  // Normalize each feature with the model's mean and standard deviation.
+  for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) {
+    features[f_idx] =
+        (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx];
+  }
+  float score = 0.0f;
+  av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score);
+  return score >= 0.0f;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Decides whether the frame uses high precision mvs and applies the choice.
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) {
+  // Default rule: high precision only below the q threshold.
+  int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH;
+  const int usage = cpi->sf.hl_sf.high_precision_mv_usage;
+  if (usage == QTR_ONLY) {
+    use_hp = 0;
+  }
+#if !CONFIG_REALTIME_ONLY
+  else if (usage == LAST_MV_DATA && av1_frame_allows_smart_mv(cpi) &&
+           cpi->mv_stats.valid) {
+    // Let the model decide from the stored mv statistics.
+    use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  av1_set_high_precision_mv(cpi, use_hp,
+                            cpi->common.features.cur_frame_force_integer_mv);
+}
diff --git a/third_party/aom/av1/encoder/mv_prec.h b/third_party/aom/av1/encoder/mv_prec.h
new file mode 100644
index 0000000000..55108b6cdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MV_PREC_H_
+#define AOM_AV1_ENCODER_MV_PREC_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+
+// Q threshold for high precision mv.
+#define HIGH_PRECISION_MV_QTHRESH 128
+#if !CONFIG_REALTIME_ONLY
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q);
+
+// Returns non-zero unless the frame is intra only or is an (internal) overlay
+// update in the gf group.
+static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
+  const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  if (frame_is_intra_only(&cpi->common)) return 0;
+  return update_type != INTNL_OVERLAY_UPDATE && update_type != OVERLAY_UPDATE;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Records whether high precision mvs are in effect for the frame and points
+// the mv cost tables at the matching precision.
+static AOM_INLINE void av1_set_high_precision_mv(
+    AV1_COMP *cpi, int allow_high_precision_mv,
+    int cur_frame_force_integer_mv) {
+  MvCosts *const costs = cpi->td.mb.mv_costs;
+  // Avoid accessing 'mv_costs' when it is not allocated. The frame's
+  // allow_high_precision_mv flag is left unchanged in that case too.
+  if (!costs) return;
+  const int use_hp = allow_high_precision_mv && !cur_frame_force_integer_mv;
+  cpi->common.features.allow_high_precision_mv = use_hp;
+  for (int i = 0; i < 2; ++i) {
+    costs->nmv_cost[i] = &costs->nmv_cost_alloc[i][MV_MAX];
+    costs->nmv_cost_hp[i] = &costs->nmv_cost_hp_alloc[i][MV_MAX];
+  }
+  costs->mv_cost_stack = use_hp ? costs->nmv_cost_hp : costs->nmv_cost;
+}
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
+
+#endif // AOM_AV1_ENCODER_MV_PREC_H_
diff --git a/third_party/aom/av1/encoder/nonrd_opt.c b/third_party/aom/av1/encoder/nonrd_opt.c
new file mode 100644
index 0000000000..651ca43a2e
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.c
@@ -0,0 +1,933 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/rdopt.h"
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = {
+ av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16
+};
+
+#define DECLARE_BLOCK_YRD_BUFFERS() \
+ DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \
+ uint16_t eob[1];
+
+#define DECLARE_BLOCK_YRD_VARS() \
+ /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \
+ * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
+ * as a non-const so we can reassign it to macroblock_plane::coeff. */ \
+ int16_t *low_coeff = (int16_t *)coeff_buf; \
+ int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \
+ int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \
+ const int diff_stride = bw;
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD() \
+ const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+// Folds one transform block into the running skip, eob cost, rate and dist.
+static AOM_FORCE_INLINE void update_yrd_loop_vars(
+    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+    int16_t *const low_coeff, int16_t *const low_qcoeff,
+    int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    int tx_blk_id) {
+  const int all_zero = !ncoeffs;
+  *skippable &= all_zero;
+  x->txfm_search_info.blk_skip[tx_blk_id] = all_zero;
+  *eob_cost += get_msb(ncoeffs + 1);
+  if (ncoeffs > 1)
+    this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+  else if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(low_qcoeff[0]);
+  this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+}
+
+// Applies the dual 8x8 lp Hadamard to pairs of adjacent 8x8 luma blocks.
+// NOTE(review): results are packed back to back in p->coeff; presumably this
+// must match the reader's row stride when clipped at the right edge - confirm.
+static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
+                                                int max_blocks_high,
+                                                int max_blocks_wide,
+                                                int num_4x4_w, int step,
+                                                int block_step) {
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  const int bw = 4 * num_4x4_w;
+  const int cols = AOMMIN(num_4x4_w, max_blocks_wide);
+  int block = 0;
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0; c < cols; c += 2 * block_step, block += 2 * step) {
+      aom_hadamard_lp_8x8_dual(&p->src_diff[(r * bw + c) << 2], (ptrdiff_t)bw,
+                               (int16_t *)p->coeff + BLOCK_OFFSET(block));
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define DECLARE_BLOCK_YRD_HBD_VARS() \
+ tran_low_t *const coeff = coeff_buf; \
+ tran_low_t *const qcoeff = qcoeff_buf; \
+ tran_low_t *const dqcoeff = dqcoeff_buf;
+
+// High bitdepth variant of update_yrd_loop_vars() for tran_low_t coeffs.
+static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
+    MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+    tran_low_t *const coeff, tran_low_t *const qcoeff,
+    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+    int tx_blk_id) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int all_zero = !ncoeffs;
+  *skippable &= all_zero;
+  x->txfm_search_info.blk_skip[tx_blk_id] = all_zero;
+  *eob_cost += get_msb(ncoeffs + 1);
+  if (ncoeffs > 1)
+    this_rdc->rate += aom_satd(qcoeff, step << 4);
+  else if (ncoeffs == 1)
+    this_rdc->rate += (int)abs(qcoeff[0]);
+  int64_t unused;  // required output argument; its value is discarded
+  this_rdc->dist +=
+      av1_highbd_block_error(coeff, dqcoeff, step << 4, &unused, xd->bd) >> 2;
+}
+#endif
+
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost using Hadamard transform. For low bit depth this function
+ * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
+ * \param[in]      x          Pointer to structure holding all the data for
+                              the current macroblock
+ * \param[in,out]  this_rdc   Pointer to calculated RD Cost; its sse member is
+                              read and rescaled unless it equals INT64_MAX
+ * \param[out]     skippable  Pointer to a flag indicating possible tx skip
+ * \param[in]      bsize      Current block size
+ * \param[in]      tx_size    Transform size
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
+ * coefficients for Hadamard transform
+ */
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+                   BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y];
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int num_4x4_w = mi_size_wide[bsize];
+  const int num_4x4_h = mi_size_high[bsize];
+  const int step = 1 << (tx_size << 1);
+  const int block_step = (1 << tx_size);
+  const int row_step = step * num_4x4_w >> tx_size;
+  int block = 0;
+  const int max_blocks_wide =
+      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+  const int max_blocks_high =
+      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+  int eob_cost = 0;
+  const int bw = 4 * num_4x4_w;
+  const int bh = 4 * num_4x4_h;
+  const int use_hbd = is_cur_buf_hbd(xd);
+  int num_blk_skip_w = num_4x4_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride);
+  } else {
+    aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                       pd->dst.buf, pd->dst.stride);
+  }
+#else
+  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
+#endif
+
+  // Keep the intermediate value on the stack here. Writing directly to
+  // skippable causes speed regression due to load-and-store issues in
+  // update_yrd_loop_vars.
+  int temp_skippable = 1;
+  this_rdc->dist = 0;
+  this_rdc->rate = 0;
+  // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
+  // can be done per function call. Hence the call of Hadamard txfm is
+  // abstracted here for the specified cases.
+  int is_tx_8x8_dual_applicable =
+      (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
+       block_size_high[bsize] >= 8);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // As of now, dual implementation of hadamard txfm is available for low
+  // bitdepth.
+  if (use_hbd) is_tx_8x8_dual_applicable = 0;
+#endif
+
+  if (is_tx_8x8_dual_applicable) {
+    aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
+                                 step, block_step);
+  }
+
+  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+  DECLARE_BLOCK_YRD_BUFFERS()
+  DECLARE_BLOCK_YRD_VARS()
+#if CONFIG_AV1_HIGHBITDEPTH
+  DECLARE_BLOCK_YRD_HBD_VARS()
+#else
+  (void)use_hbd;
+#endif
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (int r = 0; r < max_blocks_high; r += block_step) {
+    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+      DECLARE_LOOP_VARS_BLOCK_YRD()
+
+      switch (tx_size) {
+#if CONFIG_AV1_HIGHBITDEPTH
+        case TX_16X16:
+          if (use_hbd) {
+            aom_hadamard_16x16(src_diff, diff_stride, coeff);
+            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                            dqcoeff, p->dequant_QTX, eob,
+                            // default_scan_fp_16x16_transpose and
+                            // av1_default_iscan_fp_16x16_transpose have to be
+                            // used together.
+                            default_scan_fp_16x16_transpose,
+                            av1_default_iscan_fp_16x16_transpose);
+          } else {
+            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+            av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+                            p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                            p->dequant_QTX, eob,
+                            // default_scan_lp_16x16_transpose and
+                            // av1_default_iscan_lp_16x16_transpose have to be
+                            // used together.
+                            default_scan_lp_16x16_transpose,
+                            av1_default_iscan_lp_16x16_transpose);
+          }
+          break;
+        case TX_8X8:
+          if (use_hbd) {
+            aom_hadamard_8x8(src_diff, diff_stride, coeff);
+            av1_quantize_fp(
+                coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+                p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+          } else {
+            if (is_tx_8x8_dual_applicable) {
+              // The coeffs are pre-computed for the whole block, so re-assign
+              // low_coeff to the appropriate location.
+              const int block_offset = BLOCK_OFFSET(block + s);
+              low_coeff = (int16_t *)p->coeff + block_offset;
+            } else {
+              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+            }
+            av1_quantize_lp(
+                low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
+                low_dqcoeff, p->dequant_QTX, eob,
+                // default_scan_8x8_transpose and
+                // av1_default_iscan_8x8_transpose have to be used together.
+                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+          }
+          break;
+        default:
+          assert(tx_size == TX_4X4);
+          // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
+          // normal coefficients order, so we don't need to change the scan
+          // order here.
+          if (use_hbd) {
+            aom_fdct4x4(src_diff, coeff, diff_stride);
+            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
+          } else {
+            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+            av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                            scan_order->scan, scan_order->iscan);
+          }
+          break;
+#else
+        case TX_16X16:
+          aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+          av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
+                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                          default_scan_lp_16x16_transpose,
+                          av1_default_iscan_lp_16x16_transpose);
+          break;
+        case TX_8X8:
+          if (is_tx_8x8_dual_applicable) {
+            // The coeffs are pre-computed for the whole block, so re-assign
+            // low_coeff to the appropriate location.
+            const int block_offset = BLOCK_OFFSET(block + s);
+            low_coeff = (int16_t *)p->coeff + block_offset;
+          } else {
+            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+          }
+          av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
+                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                          default_scan_8x8_transpose,
+                          av1_default_iscan_8x8_transpose);
+          break;
+        default:
+          aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+          av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+                          scan_order->scan, scan_order->iscan);
+          break;
+#endif
+      }
+      assert(*eob <= 1024);
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (use_hbd)
+        update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
+                                 dqcoeff, this_rdc, &eob_cost,
+                                 r * num_blk_skip_w + c);
+      else
+#endif
+        update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+                             low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+                             r * num_blk_skip_w + c);
+    }
+    block += row_step;
+  }
+
+  this_rdc->skip_txfm = *skippable = temp_skippable;
+  if (this_rdc->sse < INT64_MAX) {
+    this_rdc->sse = (this_rdc->sse << 6) >> 2;
+    if (temp_skippable) {
+      // Every transform block quantized to zero: dist is the rescaled sse.
+      this_rdc->dist = this_rdc->sse;
+      return;
+    }
+  }
+
+  // If skippable is set, rate gets clobbered later.
+  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Writes src scaled by 8 into a dense tx_width x tx_width dst buffer.
+//
+// Explicitly enumerate the cases so the compiler can generate SIMD for the
+// function. According to the disassembler, gcc generates SSE codes for each of
+// the possible block sizes. The hottest case is tx_width 16, which takes up
+// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
+// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
+// potential room of improvement for writing AVX2 optimization is only 3% * 8% =
+// 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width,
+                                             const int16_t *src,
+                                             int src_stride) {
+// A literal width per case gives each loop compile-time constant bounds.
+#define SCALE_SQUARE(width)                                          \
+  do {                                                               \
+    for (int row = 0; row < (width); ++row) {                        \
+      for (int col = 0; col < (width); ++col) {                      \
+        dst[row * (width) + col] = src[row * src_stride + col] * 8;  \
+      }                                                              \
+    }                                                                \
+  } while (0)
+
+  switch (tx_width) {
+    case 4: SCALE_SQUARE(4); break;
+    case 8: SCALE_SQUARE(8); break;
+    case 16: SCALE_SQUARE(16); break;
+    default: assert(0); break;
+  }
+
+#undef SCALE_SQUARE
+}
+
+/*!\brief Calculates RD Cost when the block uses Identity transform.
+ * Note that this function is only for low bit depth encoding, since it
+ * is called in real-time mode for now, which sets high bit depth to 0:
+ * -DCONFIG_AV1_HIGHBITDEPTH=0
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for the identity transform using the low-precision
+ * (16-bit) set of functions.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] pred_buf Pointer to the prediction buffer
+ * \param[in] pred_stride Stride for the prediction buffer
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc. \c skippable flag is set if all coefficients are zero.
+ */
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int num_blk_skip_w = num_4x4_w;
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ int tx_wd = 0;
+ const SCAN_ORDER *scan_order = NULL;
+ switch (tx_size) {
+ case TX_64X64:
+ assert(0); // Not implemented
+ break;
+ case TX_32X32:
+ assert(0); // Not used
+ break;
+ case TX_16X16:
+ scan_order = &av1_fast_idtx_scan_order_16x16;
+ tx_wd = 16;
+ break;
+ case TX_8X8:
+ scan_order = &av1_fast_idtx_scan_order_8x8;
+ tx_wd = 8;
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ scan_order = &av1_fast_idtx_scan_order_4x4;
+ tx_wd = 4;
+ break;
+ }
+ assert(scan_order != NULL);
+
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pred_buf, pred_stride);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+ scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
+ av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
+ eob, scan_order->scan, scan_order->iscan);
+ assert(*eob <= 1024);
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ }
+ }
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+ this_rdc->dist = 0;
+ this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+                               MACROBLOCK *x, MACROBLOCKD *xd,
+                               RD_STATS *this_rdc, int start_plane,
+                               int stop_plane) {
+  // Models rate/distortion of planes start_plane..stop_plane (inclusive) from
+  // their variance and SSE, accumulating into this_rdc; returns the total SSE.
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  int64_t sse_sum = 0;
+
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
+  this_rdc->skip_txfm = 0;
+
+  for (int pl = start_plane; pl <= stop_plane; ++pl) {
+    // Planes flagged as not color sensitive contribute nothing.
+    if (!x->color_sensitivity[COLOR_SENS_IDX(pl)]) continue;
+    struct macroblock_plane *const p = &x->plane[pl];
+    struct macroblockd_plane *const pd = &xd->plane[pl];
+    const int log2_pels = num_pels_log2_lookup[plane_bsize];
+    const uint32_t dc_quant = p->dequant_QTX[0];
+    const uint32_t ac_quant = p->dequant_QTX[1];
+    unsigned int plane_sse;
+    int rate;
+    int64_t dist;
+    const unsigned int plane_var = cpi->ppi->fn_ptr[plane_bsize].vf(
+        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &plane_sse);
+
+    assert(plane_sse >= plane_var);
+    sse_sum += plane_sse;
+
+    // The (sse - var) part is modelled with the DC quantizer; half of its
+    // rate and 8x its distortion are accumulated.
+    av1_model_rd_from_var_lapndz(plane_sse - plane_var, log2_pels,
+                                 dc_quant >> 3, &rate, &dist);
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
+
+    // The variance part is modelled with the AC quantizer.
+    av1_model_rd_from_var_lapndz(plane_var, log2_pels, ac_quant >> 3, &rate,
+                                 &dist);
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
+  }
+
+  if (this_rdc->rate == 0) this_rdc->skip_txfm = 1;
+
+  // Skip the transform when coding is no better than keeping the whole SSE.
+  const int64_t skip_dist = sse_sum << 4;
+  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
+      RDCOST(x->rdmult, 0, skip_dist)) {
+    this_rdc->rate = 0;
+    this_rdc->dist = skip_dist;
+    this_rdc->skip_txfm = 1;
+  }
+  return sse_sum;
+}
+
+static void compute_intra_yprediction(const AV1_COMMON *cm,
+                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      MACROBLOCK *x, MACROBLOCKD *xd) {
+  // Predicts the luma plane with 'mode', one transform block at a time; the
+  // plane's src/dst buffer pointers are moved per block and restored at the end.
+  const int plane = 0;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  uint8_t *const src_origin = p->src.buf;
+  uint8_t *const dst_origin = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  // The largest transform size allowed for bsize is used for every sub block.
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int tx_step = 1 << tx_size;  // In 4-sample units (offsets scale by 4).
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  // Bounds exclude sub blocks lying wholly in the UMV (past the frame edge).
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+  for (int blk_row = 0; blk_row < max_blocks_high; blk_row += tx_step) {
+    for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += tx_step) {
+      const int64_t src_off = 4 * (blk_row * (int64_t)src_stride + blk_col);
+      const int64_t dst_off = 4 * (blk_row * (int64_t)dst_stride + blk_col);
+      p->src.buf = src_origin + src_off;
+      pd->dst.buf = dst_origin + dst_off;
+      av1_predict_intra_block(
+          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+          0, 0, plane);
+    }
+  }
+
+  // Put the original buffer origins back.
+  p->src.buf = src_origin;
+  pd->dst.buf = dst_origin;
+}
+
+// Returns true when the intra mode at 'mode_index' should be pruned, based on
+// the 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
+// speed features.
+static INLINE bool is_prune_intra_mode(
+    AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
+    uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
+    uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
+  const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+  // A forced intra check keeps the first three modes of intra_mode_list.
+  if (force_intra_check != 0 && mode_index <= 2) return false;
+
+  // Prune modes that the per-block-size mask does not enable.
+  const int mode_bit = 1 << this_mode;
+  if ((mode_bit & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]) == 0)
+    return true;
+
+  // DC_PRED is kept, as is any mode when the src-SAD based pruning is off.
+  if (this_mode == DC_PRED ||
+      !cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad)
+    return false;
+  const bool has_color_sensitivity =
+      color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
+      color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
+  const bool keep_mode =
+      has_color_sensitivity &&
+      (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
+       cyclic_refresh_segment_id_boosted(segment_id) ||
+       source_sad_nonrd > kMedSad);
+  return !keep_mode;
+}
+
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Predicts a single TX block and estimates its rate and distortion: luma goes
+ * through av1_block_yrd(), chroma through av1_model_rd_for_sb_uv().
+ * \param[in]     plane        Color plane
+ * \param[in]     block        Index of a TX block in a prediction block (unused)
+ * \param[in]     row          Row of a current TX block
+ * \param[in]     col          Column of a current TX block
+ * \param[in]     plane_bsize  Block size of a current prediction block
+ * \param[in]     tx_size      Transform size
+ * \param[in,out] arg          Pointer to a structure that holds parameters
+ *                             for intra mode search
+ *
+ * \remark Nothing is returned. Instead, the rate and distortion of this TX
+ * block are added to \c args->rdc (or set to maximum when pruned by SAD).
+ */
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              void *arg) {
+  struct estimate_block_intra_args *const args = arg;
+  AV1_COMP *const cpi = args->cpi;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Remember the plane buffer origins; they are offset below and restored.
+  uint8_t *const src_origin = p->src.buf;
+  uint8_t *const dst_origin = pd->dst.buf;
+  const int64_t src_stride = p->src.stride;
+  const int64_t dst_stride = pd->dst.stride;
+
+  (void)block;
+
+  av1_predict_intra_block_facade(&cpi->common, xd, plane, col, row, tx_size);
+
+  if (args->prune_mode_based_on_sad) {
+    const unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+        p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+    // Allow up to best_sad + best_sad / 16 unless best_sad is still UINT_MAX.
+    unsigned int sad_limit = UINT_MAX;
+    if (args->best_sad != UINT_MAX)
+      sad_limit = args->best_sad + (args->best_sad >> 4);
+    if (this_sad > sad_limit) {
+      // Too far above the best SAD: flag the mode with maximum rate and
+      // distortion and stop. Note: args->rdc->rate is checked in
+      // av1_nonrd_pick_intra_mode() to skip the evaluation of the current mode.
+      args->rdc->rate = INT_MAX;
+      args->rdc->dist = INT64_MAX;
+      return;
+    }
+    args->best_sad = AOMMIN(args->best_sad, this_sad);
+  }
+
+  RD_STATS block_rdc;
+  av1_invalid_rd_stats(&block_rdc);
+
+  // Point both planes at the current TX block.
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  p->src.buf = src_origin + 4 * (row * src_stride + col);
+  pd->dst.buf = dst_origin + 4 * (row * dst_stride + col);
+
+  if (plane != 0) {
+    av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &block_rdc, plane, plane);
+  } else {
+    av1_block_yrd(x, &block_rdc, &args->skippable, bsize_tx,
+                  AOMMIN(tx_size, TX_16X16));
+  }
+
+  p->src.buf = src_origin;
+  pd->dst.buf = dst_origin;
+
+  // The accumulator must not already carry the "pruned" marker values.
+  assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
+  args->rdc->rate += block_rdc.rate;
+  args->rdc->dist += block_rdc.dist;
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Uses heuristics based on the best inter mode, block size and other statistics
+ * to decide whether to check intra modes. If so, estimates and selects the best
+ * intra mode from the reduced set of intra modes (max 4 intra modes checked)
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[in] best_early_term Flag, indicating that TX for the
+ * best inter mode was skipped
+ * \param[in] ref_cost_intra Cost of signalling intra mode
+ * \param[in] reuse_prediction Flag, indicating prediction re-use
+ * \param[in] orig_dst Original destination buffer
+ * \param[in] tmp_buffers Pointer to a temporary buffers for
+ * prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[in,out] best_rdc Pointer to RD cost of the best mode so
+ * far; replaced when an intra mode wins
+ * \param[in,out] best_pickmode Pointer to a structure containing
+ * best mode picked so far
+ * \param[in,out] ctx Pointer to structure holding coding
+ * contexts and modes for the block
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c best_rdc and best selected mode is placed to \c best_pickmode
+ *
+ */
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const unsigned char segment_id = mi->segment_id;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+ const bool is_screen_content =
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+
+ RD_STATS this_rdc;
+
+ int intra_cost_penalty = av1_get_intra_cost_penalty(
+ quant_params->base_qindex, quant_params->y_dc_delta_q,
+ cm->seq_params->bit_depth);
+ int64_t inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
+ int force_intra_check = 0;
+ // For spatial enhancement layer: turn off intra prediction if the
+ // previous spatial layer as golden ref is not chosen as best reference.
+ // Only do this for temporal enhancement layer and on non-key frames.
+ if (cpi->svc.spatial_layer_id > 0 &&
+ best_pickmode->best_ref_frame != GOLDEN_FRAME &&
+ cpi->svc.temporal_layer_id > 0 &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+ perform_intra_pred = 0;
+
+ int do_early_exit_rdthresh = 1;
+
+ uint32_t spatial_var_thresh = 50;
+ int motion_thresh = 32;
+ // Adjust thresholds to make intra mode likely tested if the other
+ // references (golden, alt) are skipped/not checked. For now always
+ // adjust for svc mode.
+ if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
+ rt_sf->nonrd_prune_ref_frame_search > 0)) {
+ spatial_var_thresh = 150;
+ motion_thresh = 0;
+ }
+
+ // Some adjustments to checking intra mode based on source variance.
+ if (x->source_variance < spatial_var_thresh) {
+ // If the best inter mode is large motion or non-LAST ref reduce intra cost
+ // penalty, so intra mode is more likely tested.
+ if (best_rdc->rdcost != INT64_MAX &&
+ (best_pickmode->best_ref_frame != LAST_FRAME ||
+ abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+ abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
+ intra_cost_penalty = intra_cost_penalty >> 2;
+ inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ do_early_exit_rdthresh = 0;
+ }
+ if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
+ x->content_state_sb.source_sad_nonrd >= kHighSad) ||
+ (is_screen_content && x->source_variance < 50 &&
+ ((bsize >= BLOCK_32X32 &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad) ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
+ force_intra_check = 1;
+ // For big blocks worth checking intra (since only DC will be checked),
+ // even if best_early_term is set.
+ if (bsize >= BLOCK_32X32) best_early_term = 0;
+ } else if (rt_sf->source_metrics_sb_nonrd &&
+ x->content_state_sb.source_sad_nonrd <= kLowSad) {
+ perform_intra_pred = 0;
+ }
+
+ if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
+ if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
+ perform_intra_pred = 0;
+ else if (rt_sf->skip_intra_pred == 2)
+ perform_intra_pred = 0;
+ }
+
+ if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
+ (perform_intra_pred && !best_early_term &&
+ bsize <= cpi->sf.part_sf.max_intra_bsize))) {
+ return;
+ }
+
+ // Early exit based on RD cost calculated using known rate. When
+ // is_screen_content is true, more bias is given to intra modes, so a more
+ // conservative (7/8-scaled) threshold is used for the early exit.
+ const int64_t known_rd = is_screen_content
+ ? CALC_BIASED_RDCOST(inter_mode_thresh)
+ : inter_mode_thresh;
+ if (known_rd > best_rdc->rdcost) return;
+
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ TX_SIZE intra_tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+ if (is_screen_content && cpi->rc.high_source_sad &&
+ x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
+ intra_tx_size = TX_4X4;
+
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_prediction && best_pred != NULL) {
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ if (best_pred->data == orig_dst->buf) {
+ *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
+ bh);
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+
+ for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) {
+ const PREDICTION_MODE this_mode = intra_mode_list[midx];
+ const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ const int64_t mode_rd_thresh = rd_threshes[mode_index];
+
+ if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id,
+ x->content_state_sb.source_sad_nonrd,
+ x->color_sensitivity))
+ continue;
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+ // For spatially flat blocks with zero motion only check
+ // DC mode.
+ if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->source_variance == 0 && this_mode != DC_PRED)
+ continue;
+ // Only test Intra for big blocks if spatial_variance is small.
+ else if (bsize > BLOCK_32X32 && x->source_variance > 50)
+ continue;
+ }
+
+ if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]) &&
+ (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+ continue;
+ }
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+
+ mi->mode = this_mode;
+ mi->ref_frame[0] = INTRA_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+ av1_invalid_rd_stats(&this_rdc);
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->tx_size = intra_tx_size;
+ compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+ // Look into selecting tx_size here, based on prediction residual.
+ av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size);
+ // TODO(kyslov@) Need to account for skippable
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
+ av1_estimate_block_intra, &args);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V,
+ av1_estimate_block_intra, &args);
+ }
+
+ int mode_cost = 0;
+ if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+ mode_cost +=
+ x->mode_costs.angle_delta_cost[this_mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mi->angle_delta[PLANE_TYPE_Y]];
+ }
+ if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
+ }
+ this_rdc.rate += ref_cost_intra;
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rate += mode_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+ // For blocks with low spatial variance and color sad,
+ // favor the intra-modes, only on scene/slide change.
+ if (cpi->rc.high_source_sad && x->source_variance < 800 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]))
+ this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
+ // Otherwise bias against intra for blocks with zero
+ // motion and no color, on non-scene/slide changes.
+ else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
+ }
+
+ if (this_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = this_rdc;
+ best_pickmode->best_mode = this_mode;
+ best_pickmode->best_tx_size = mi->tx_size;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
+ mi->uv_mode = this_mode;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ if (!this_rdc.skip_txfm)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ if (best_pickmode->best_ref_frame == INTRA_FRAME)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ mi->tx_size = best_pickmode->best_tx_size;
+}
diff --git a/third_party/aom/av1/encoder/nonrd_opt.h b/third_party/aom/av1/encoder/nonrd_opt.h
new file mode 100644
index 0000000000..a53578ebad
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.h
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_
+#define AOM_AV1_ENCODER_NONRD_OPT_H_
+
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/rdopt.h"
+
+#define RTC_INTER_MODES (4)  // Number of entries in inter_mode_list
+#define RTC_INTRA_MODES (4)  // Number of entries in intra_mode_list
+#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
+#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)  // 7/8 of rdcost
+#define NUM_COMP_INTER_MODES_RT (6)  // Number of entries in comp_ref_mode_set
+#define NUM_INTER_MODES 12  // Number of entries in ref_mode_set
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)  // NOTE(review): presumably the cap used when the macro above is true
+#define FILTER_SEARCH_SIZE 2
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+#endif
+
+extern int g_pick_inter_mode_cnt;
+/*!\cond */
+typedef struct {  // One prediction buffer
+ uint8_t *data;  // Prediction samples
+ int stride;  // Row stride of data
+ int in_use;  // NOTE(review): presumably set while the buffer is taken -- confirm
+} PRED_BUFFER;
+
+typedef struct {  // Description of the best mode picked so far
+ PRED_BUFFER *best_pred;  // Prediction buffer kept for the best mode
+ PREDICTION_MODE best_mode;  // Best prediction mode so far
+ TX_SIZE best_tx_size;  // Transform size used with best_mode
+ TX_TYPE tx_type;
+ MV_REFERENCE_FRAME best_ref_frame;  // INTRA_FRAME when an intra mode is best
+ MV_REFERENCE_FRAME best_second_ref_frame;  // NONE when an intra mode is best
+ uint8_t best_mode_skip_txfm;  // skip_txfm of the best mode's RD stats
+ uint8_t best_mode_initial_skip_flag;
+ int_interpfilters best_pred_filter;
+ MOTION_MODE best_motion_mode;
+ WarpedMotionParams wm_params;
+ int num_proj_ref;
+ PALETTE_MODE_INFO pmi;
+ int64_t best_sse;
+} BEST_PICKMODE;
+
+typedef struct {  // Single-reference candidate: (reference frame, mode)
+ MV_REFERENCE_FRAME ref_frame;  // Reference frame of the candidate
+ PREDICTION_MODE pred_mode;  // Prediction mode of the candidate
+} REF_MODE;
+
+typedef struct {  // Compound candidate: (reference frame pair, mode)
+ MV_REFERENCE_FRAME ref_frame[2];  // The two reference frames
+ PREDICTION_MODE pred_mode;  // Compound prediction mode
+} COMP_REF_MODE;
+
+struct estimate_block_intra_args {  // State passed to av1_estimate_block_intra()
+ AV1_COMP *cpi;  // Top-level encoder structure
+ MACROBLOCK *x;  // Current macroblock
+ PREDICTION_MODE mode;  // Intra mode being evaluated
+ int skippable;  // Skip flag handed to av1_block_yrd() for luma
+ RD_STATS *rdc;  // Accumulates rate and distortion over TX blocks
+ unsigned int best_sad;  // Smallest SAD seen so far; may be UINT_MAX
+ bool prune_mode_based_on_sad;  // Enables the SAD-based early return
+};
+/*!\endcond */
+
+/*!\brief Structure to store parameters and statistics used in non-rd inter mode
+ * evaluation.
+ */
+typedef struct {
+ //! Structure to hold best inter mode data
+ BEST_PICKMODE best_pickmode;
+ //! RD cost of the current mode
+ RD_STATS this_rdc;
+ //! RD cost of the best mode found so far
+ RD_STATS best_rdc;
+ //! Distortion of chroma planes for all modes and reference frames
+ int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+ //! Buffer to hold predicted block for all reference frames and planes
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ //! Array to hold variance of all modes and reference frames
+ unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold ref cost of single reference mode for all ref frames
+ unsigned int ref_costs_single[REF_FRAMES];
+ //! Array to hold motion vector for all modes and reference frames
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold best mv for all modes and reference frames
+ int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold inter mode cost of single ref mode for all ref frames
+ int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold use reference frame mask for each reference frame
+ int use_ref_frame_mask[REF_FRAMES];
+ //! Array to hold flags of evaluated modes for each reference frame
+ uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold flag indicating if scaled reference frame is used.
+ bool use_scaled_ref_frame[REF_FRAMES];
+} InterModeSearchStateNonrd;
+
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
+ 2, 2, 3, 3, 3, 4,
+ 4, 4, 5, 5 };  // NOTE(review): presumably log2 of block width in 4-sample units, per BLOCK_SIZE -- confirm
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1,
+ 2, 3, 2, 3, 4, 3,
+ 4, 5, 4, 5 };  // NOTE(review): presumably log2 of block height in 4-sample units, per BLOCK_SIZE -- confirm
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+ SMOOTH_PRED };  // Intra modes tried, in evaluation order
+
+static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV,
+ NEWMV };  // Single-reference inter modes (RTC_INTER_MODES entries)
+
+static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = {  // [ref frame][mode offset] -> THR_MODES
+ { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },  // Intra row, same order as intra_mode_list
+ { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+ { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+ { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+ { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG },
+ { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB },
+ { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 },
+ { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA },
+};
+
+// Single-reference inter candidates. GLOBALMV in the set below is in fact
+// ZEROMV, as we don't do global ME in RT mode.
+static const REF_MODE ref_mode_set[NUM_INTER_MODES] = {
+ { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
+ { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV },
+ { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
+ { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV },
+ { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+ { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV },
+};
+
+static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {  // Compound candidates; LAST_FRAME is always the first reference
+ { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV },
+};
+
+static const int_interpfilters filters_ref_set[9] = {  // All 3x3 pairings of REGULAR, SMOOTH and SHARP filters
+ [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },
+ [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
+ [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP },
+ [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP },
+ [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR },
+ [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
+ [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
+};
+
+enum {  // Bit masks of inter prediction modes, one bit per mode (1 << mode)
+ // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+ INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+// The original scan order (default_scan_8x8), modified to account for the extra
+// transpose in the Hadamard C implementations, i.e., aom_hadamard_lp_8x8_c and
+// aom_hadamard_8x8_c.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = {
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+// The original scan order (av1_default_iscan_8x8) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_lp_8x8_avx2 and
+// aom_hadamard_8x8_avx2. The Hadamard AVX2 implementation changes the order
+// of the coefficients, so the normal scan order is no longer guaranteed to
+// scan low coefficients first; the scan order is modified accordingly.
+// This table is the inverse of default_scan_8x8_transpose (iscan[scan[i]] == i)
+// and has to be used together with it.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_8x8_transpose[64]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63
+};
+
+// The original scan order (default_scan_16x16), modified to account for the
+// extra transpose in the Hadamard C implementation in the lp case, i.e.,
+// aom_hadamard_lp_16x16_c.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_lp_16x16_transpose[256]) = {
+ 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32,
+ 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50,
+ 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1,
+ 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100,
+ 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152,
+ 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27,
+ 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194,
+ 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49,
+ 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204,
+ 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53,
+ 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214,
+ 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115,
+ 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143,
+ 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252,
+ 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217,
+ 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221,
+ 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (default_scan_16x16), modified to account for the
+// extra shift in the Hadamard C implementation in the fp case, i.e.,
+// aom_hadamard_16x16_c. Note that the 16x16 lp and fp Hadamard transforms
+// generate different outputs, so they are handled separately.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_fp_16x16_transpose[256]) = {
+ 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32,
+ 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50,
+ 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1,
+ 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104,
+ 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148,
+ 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23,
+ 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194,
+ 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49,
+ 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204,
+ 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57,
+ 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218,
+ 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115,
+ 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143,
+ 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252,
+ 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213,
+ 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221,
+ 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251,
+ 255
+};
+#endif
+
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2.
+// The Hadamard AVX2 implementation changes the order of the coefficients,
+// so the normal scan order is no longer guaranteed to scan low
+// coefficients first; the scan order is modified accordingly. Note that
+// this one has to be used together with default_scan_lp_16x16_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_lp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11,
+ 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93,
+ 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30,
+ 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150,
+ 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22,
+ 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124,
+ 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49,
+ 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202,
+ 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77,
+ 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195,
+ 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109,
+ 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221,
+ 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104,
+ 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214,
+ 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139,
+ 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250,
+ 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Inverse 16x16 scan table for the fp hadamard path; only compiled when
+// CONFIG_AV1_HIGHBITDEPTH is set.
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2.
+// Because the hadamard AVX2 implementation reorders the coefficients, the
+// normal scan order is no longer guaranteed to scan low coefficients first,
+// so the scan order is modified accordingly. Note that this table has to be
+// used together with default_scan_fp_16x16_transpose (e.g. entry 4 here is 1
+// and entry 1 of that table is 4).
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_fp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11,
+ 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93,
+ 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30,
+ 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152,
+ 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35,
+ 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145,
+ 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59,
+ 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200,
+ 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56,
+ 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182,
+ 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99,
+ 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221,
+ 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104,
+ 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214,
+ 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139,
+ 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250,
+ 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211,
+ 255
+};
+#endif
+
+// For entropy coding, IDTX shares the scan orders of the other 2D-transforms,
+// but the fastest way to calculate the IDTX transform (i.e. no transposes)
+// results in coefficients that are a transposition of the entropy coding
+// versions. These tables are used as substitute for the scan order for the
+// faster version of IDTX.
+
+// Forward scan table for the fast 4x4 IDTX path.
+// Must be used together with av1_fast_idtx_iscan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15 };
+
+// Inverse of av1_fast_idtx_scan_4x4: for every i in [0, 16),
+// av1_fast_idtx_iscan_4x4[av1_fast_idtx_scan_4x4[i]] == i.
+// Must be used together with av1_fast_idtx_scan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12,
+ 3, 8, 11, 13, 9, 10, 14, 15 };
+
+// Bundles the two 4x4 tables above; initialized with the scan table first and
+// the iscan table second.
+static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = {
+ av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4
+};
+
+// Forward scan table for the fast 8x8 IDTX path.
+// Must be used together with av1_fast_idtx_iscan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+// Inverse lookup for av1_fast_idtx_scan_8x8 (e.g. scan entry 2 is 8 and
+// iscan entry 8 is 2).
+// Must be used together with av1_fast_idtx_scan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+// Bundles the two 8x8 tables above; initialized with the scan table first and
+// the iscan table second.
+static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = {
+ av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8
+};
+
+// Forward scan table for the fast 16x16 IDTX path.
+// Must be used together with av1_fast_idtx_iscan_16x16
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255
+};
+
+// Inverse lookup for av1_fast_idtx_scan_16x16 (e.g. scan entry 2 is 16 and
+// iscan entry 16 is 2).
+// Must be used together with av1_fast_idtx_scan_16x16
+// NOTE(review): unlike the 4x4 and 8x8 tables, no SCAN_ORDER object pairing
+// the two 16x16 tables follows them here - confirm whether one is expected.
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119,
+ 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118,
+ 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117,
+ 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116,
+ 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115,
+ 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114,
+ 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113,
+ 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112,
+ 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111,
+ 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110,
+ 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109,
+ 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108,
+ 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107,
+ 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+ 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+ 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+ 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+ 255
+};
+
+// Returns 1 for blocks whose RD model should use the special-case logic, and
+// 0 otherwise. All of the following must hold: CBR rate control, block size
+// of at least 32x32, a segment that is not cyclic-refresh boosted, a non-zero
+// base qindex, and a low-bitdepth encode.
+static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+                                    BLOCK_SIZE bsize) {
+  if (cpi->oxcf.rc_cfg.mode != AOM_CBR) return 0;
+  if (bsize < BLOCK_32X32) return 0;
+  if (cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) return 0;
+  if (cpi->common.quant_params.base_qindex == 0) return 0;
+  // Only enable for low bitdepth to mitigate issue: b/303023614.
+  return !cpi->oxcf.use_highbitdepth;
+}
+/*!\brief Finds predicted motion vectors for a block.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds predicted motion vectors for a block from a certain reference frame.
+ * First, it fills reference MV stack, then picks the best from the stack and
+ * predicts the final MV for a block for each mode.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] ref_frame Reference frame for which to find
+ * ref MVs
+ * \param[out] frame_mv Predicted MVs for a block
+ * \param[in] yv12_mb Buffer to hold predicted block
+ * \param[in] bsize Current block size
+ * \param[in] force_skip_low_temp_var Flag indicating possible mode search
+ * prune for low temporal variance block
+ * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred
+ * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference
+ * frame is used.
+ *
+ * \remark Nothing is returned. Instead, predicted MVs are placed into
+ * \c frame_mv array, and use_scaled_ref_frame is set. The per-reference SAD
+ * entries in \c x (pred_mv_sad, pred_mv0_sad, pred_mv1_sad) are reset to
+ * INT_MAX and \c num_proj_ref of the current block is set to 1.
+ */
+static INLINE void find_predictors(
+ AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+ int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame);
+ // The reference counts as scaled when its cropped luma size differs from the
+ // current frame size.
+ const bool ref_is_scaled =
+ ref->y_crop_height != cm->height || ref->y_crop_width != cm->width;
+ const YV12_BUFFER_CONFIG *scaled_ref =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ // Use the scaled copy only when the reference is scaled and a scaled buffer
+ // is available; otherwise fall back to the unscaled reference.
+ const YV12_BUFFER_CONFIG *yv12 =
+ ref_is_scaled && scaled_ref ? scaled_ref : ref;
+ const int num_planes = av1_num_planes(cm);
+ // Reset the per-reference SAD trackers and mark NEWMV as not yet searched.
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->pred_mv0_sad[ref_frame] = INT_MAX;
+ x->pred_mv1_sad[ref_frame] = INT_MAX;
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ // TODO(kyslov) this needs various further optimizations. to be continued..
+ assert(yv12 != NULL);
+ if (yv12 != NULL) {
+ // No scale factors are passed when a scaled reference buffer exists.
+ struct scale_factors *const sf =
+ scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ av1_find_best_ref_mvs_from_stack(
+ cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
+ &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+ frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
+ // Early exit for non-LAST frame if force_skip_low_temp_var is set.
+ // av1_mv_pred is also skipped for scaled references, for blocks smaller
+ // than 8x8, and when the caller sets skip_pred_mv.
+ if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+ !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ bsize);
+ }
+ }
+ if (cm->features.switchable_motion_mode) {
+ av1_count_overlappable_neighbors(cm, xd);
+ }
+ mbmi->num_proj_ref = 1;
+ // Report whether the scaled reference buffer was the one selected above.
+ *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
+}
+
+// Resets the mode info of a block to the defaults used by the non-RD mode
+// search for the given prediction mode and reference frame pair: palette and
+// filter-intra off, zero motion vectors, simple translation, DC chroma
+// prediction, and the frame-level default interpolation filters.
+static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
+                                   PREDICTION_MODE pred_mode,
+                                   MV_REFERENCE_FRAME ref_frame0,
+                                   MV_REFERENCE_FRAME ref_frame1,
+                                   const AV1_COMMON *cm) {
+  // Prediction mode and references requested by the caller.
+  mbmi->mode = pred_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = ref_frame0;
+  mbmi->ref_frame[1] = ref_frame1;
+  mbmi->ref_mv_idx = 0;
+
+  // Motion state starts from zero vectors with plain translation.
+  mbmi->mv[1].as_int = 0;
+  mbmi->mv[0].as_int = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->num_proj_ref = 1;
+  mbmi->interintra_mode = 0;
+
+  // Intra-only tools are disabled.
+  mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] = 0;
+  mbmi->palette_mode_info.palette_size[PLANE_TYPE_UV] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+// Fills *args with the starting state for an intra estimation pass: the
+// encoder and macroblock handles, DC_PRED as the mode, skippable set, no RD
+// stats attached, best SAD at its maximum and SAD-based pruning disabled.
+static INLINE void init_estimate_block_intra_args(
+    struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) {
+  // Context handles.
+  args->x = x;
+  args->cpi = cpi;
+
+  // Search state.
+  args->rdc = 0;
+  args->mode = DC_PRED;
+  args->skippable = 1;
+  args->prune_mode_based_on_sad = false;
+  args->best_sad = UINT_MAX;
+}
+
+// Claims the first entry of p[0..len) that is not in use, marks it as in use
+// and returns its index. Returns -1 when every entry is already in use (or
+// len is not positive).
+static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int free_idx = 0;
+  while (free_idx < len && p[free_idx].in_use) ++free_idx;
+  if (free_idx >= len) return -1;
+  p[free_idx].in_use = 1;
+  return free_idx;
+}
+
+// Marks a prediction buffer as no longer in use; a NULL pointer is ignored.
+static INLINE void free_pred_buffer(PRED_BUFFER *p) {
+  if (!p) return;
+  p->in_use = 0;
+}
+
+// Takes a snapshot of the current coding decision of block x into ctx so it
+// can be restored if the encoder decides to encode this way. Copies the
+// transform-skip flag, the mode info of the current block and the matching
+// per-reference mbmi_ext data. With CONFIG_INTERNAL_STATS the index of the
+// chosen mode (mode_index) is recorded as well.
+#if CONFIG_INTERNAL_STATS
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+                                              PICK_MODE_CONTEXT *ctx,
+                                              int mode_index) {
+#else
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+                                              PICK_MODE_CONTEXT *ctx) {
+#endif  // CONFIG_INTERNAL_STATS
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+  // Both skip fields mirror the transform search result; each is written
+  // exactly once.
+  ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+  ctx->skippable = txfm_info->skip_txfm;
+#if CONFIG_INTERNAL_STATS
+  ctx->best_mode_index = mode_index;
+#endif  // CONFIG_INTERNAL_STATS
+  ctx->mic = *xd->mi[0];
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane);
+
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg);
+
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx);
+
+#endif // AOM_AV1_ENCODER_NONRD_OPT_H_
diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c
new file mode 100644
index 0000000000..f939b6d1fa
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_pickmode.c
@@ -0,0 +1,3537 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+// Decides whether the inter mode search may be terminated early for
+// this_mode, based on how its SSE compares with the best SSE found so far.
+//
+// early_term_idx selects the aggressiveness row and must lie in
+// (0, EARLY_TERM_INDICES); bsize selects the column through
+// size_group_lookup. Returns 1 when threshold * this_sse > best_sse, and 0
+// otherwise. An out-of-range early_term_idx asserts in debug builds and
+// returns 0 (no early termination) when asserts are compiled out.
+static INLINE int early_term_inter_search_with_sse(int early_term_idx,
+                                                   BLOCK_SIZE bsize,
+                                                   int64_t this_sse,
+                                                   int64_t best_sse,
+                                                   PREDICTION_MODE this_mode) {
+  // Aggressiveness to terminate inter mode search early is adjusted based on
+  // speed and block size.
+  static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
+                                                  { 0.6, 0.65, 0.85, 0.9 },
+                                                  { 0.5, 0.5, 0.55, 0.6 },
+                                                  { 0.6, 0.75, 0.85, 0.85 } };
+  static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
+                                                               0.3 };
+
+  assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
+  // Validate the index before it is used for the table lookup below, so the
+  // early_term_thresh[early_term_idx - 1] access cannot go out of range when
+  // asserts are disabled.
+  if (early_term_idx <= 0 || early_term_idx >= EARLY_TERM_INDICES) return 0;
+
+  const int size_group = size_group_lookup[bsize];
+  assert(size_group < 4);
+  const double threshold =
+      ((early_term_idx == EARLY_TERM_IDX_4) &&
+       (this_mode == NEWMV || this_mode == NEARESTMV))
+          ? early_term_thresh_newmv_nearestmv[size_group]
+          : early_term_thresh[early_term_idx - 1][size_group];
+
+  // Terminate inter mode search early based on best sse so far.
+  if (threshold * this_sse > best_sse) {
+    return 1;
+  }
+  return 0;
+}
+
+// Resets *bp to its starting state before any mode has been evaluated:
+// NEARESTMV on LAST_FRAME with no second reference, maximum best SSE, TX_8X8
+// with DCT_DCT, regular 8-tap filters, simple translation, and zeroed warp
+// and palette data.
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  // Aggregate members start out fully zeroed.
+  av1_zero(bp->pmi);
+  av1_zero(bp->wm_params);
+
+  // Mode and reference defaults.
+  bp->best_mode = NEARESTMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_motion_mode = SIMPLE_TRANSLATION;
+  bp->num_proj_ref = 0;
+
+  // Transform and filter defaults.
+  bp->tx_type = DCT_DCT;
+  bp->best_tx_size = TX_8X8;
+  bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  // Nothing has been predicted or measured yet.
+  bp->best_pred = NULL;
+  bp->best_sse = INT64_MAX;
+  bp->best_mode_initial_skip_flag = 0;
+  bp->best_mode_skip_txfm = 0;
+}
+
+// Records the inter mode currently held in mi as the best one found so far.
+// Mode, motion, filter, transform size and reference data come from mi, the
+// skip flag from search_state->this_rdc, and sse_y becomes the best SSE. The
+// per-block skip map is copied into ctx only when the mode is not a
+// transform-skip mode.
+static INLINE void update_search_state_nonrd(
+    InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi,
+    TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx,
+    PREDICTION_MODE this_best_mode, const int64_t sse_y) {
+  BEST_PICKMODE *const best = &search_state->best_pickmode;
+
+  // Values handed in by the caller.
+  best->best_mode = this_best_mode;
+  best->best_sse = sse_y;
+
+  // Values taken from the current mode info.
+  best->best_ref_frame = mi->ref_frame[0];
+  best->best_second_ref_frame = mi->ref_frame[1];
+  best->best_tx_size = mi->tx_size;
+  best->best_pred_filter = mi->interp_filters;
+  best->best_motion_mode = mi->motion_mode;
+  best->num_proj_ref = mi->num_proj_ref;
+  best->wm_params = mi->wm_params;
+
+  // Skip state of the mode that was just evaluated.
+  best->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+  best->best_mode_initial_skip_flag =
+      (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm);
+
+  if (best->best_mode_skip_txfm) return;
+  memcpy(ctx->blk_skip, txfm_info->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+}
+
+// Chooses where the sub-pel motion search should stop for this block.
+// Returns FULL_PEL or HALF_PEL when one of the speed-feature heuristics below
+// fires, and cpi->sf.mv_sf.subpel_force_stop otherwise.
+//
+// mv holds the full-pel motion vector found so far, ref_mv the reference MV
+// and start_mv the full-pel start point of the search.
+static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int_mv *mv, MV ref_mv, FULLPEL_MV start_mv,
+                                bool fullpel_performed_well) {
+  const int highmotion_level =
+      cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion;
+  const int avg_low_motion = cpi->rc.avg_frame_low_motion;
+  // For this range of the frame-level motion statistic a fixed threshold of
+  // 12 replaces the block-size based one.
+  const bool use_fixed_thresh = avg_low_motion > 0 && avg_low_motion < 40;
+
+  // Reduce MV precision for higher int MV value & frame-level motion
+  if (highmotion_level >= 3) {
+    int limit;
+    if (use_fixed_thresh)
+      limit = 12;
+    else if (bsize > BLOCK_32X32)
+      limit = 2;
+    else if (bsize > BLOCK_16X16)
+      limit = 4;
+    else
+      limit = 6;
+    // Halve the threshold for low resolutions.
+    if (cpi->common.width * cpi->common.height <= 320 * 240) limit >>= 1;
+    const int row_mag = abs(mv->as_fullmv.row);
+    const int col_mag = abs(mv->as_fullmv.col);
+    if (row_mag >= limit || col_mag >= limit) return HALF_PEL;
+  } else if (highmotion_level >= 1) {
+    const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } };
+    const int th_idx = highmotion_level - 1;
+    assert(th_idx >= 0 && th_idx < 2);
+    int limit = 12;
+    if (!use_fixed_thresh) {
+      const int size_idx = (bsize >= BLOCK_32X32)   ? 0
+                           : (bsize >= BLOCK_16X16) ? 1
+                                                    : 2;
+      limit = th_vals[th_idx][size_idx];
+    }
+    const int row_mag = abs(mv->as_fullmv.row);
+    const int col_mag = abs(mv->as_fullmv.col);
+    if (row_mag >= (limit << 1) || col_mag >= (limit << 1)) return FULL_PEL;
+    if (row_mag >= limit || col_mag >= limit) return HALF_PEL;
+  }
+
+  // Reduce MV precision for relatively static (e.g. background), low-complex
+  // large areas
+  const int lowcomplex_level =
+      cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex;
+  if (lowcomplex_level >= 2) {
+    const int qband = x->qindex >> (QINDEX_BITS - 2);
+    assert(qband < 4);
+    const bool static_large_block =
+        x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
+        bsize > BLOCK_16X16 && qband != 0;
+    if (static_large_block) {
+      if (x->source_variance < 500) return FULL_PEL;
+      if (x->source_variance < 5000) return HALF_PEL;
+    }
+  } else if (lowcomplex_level >= 1) {
+    const bool zero_ref_mv = ref_mv.row == 0 && ref_mv.col == 0;
+    if (fullpel_performed_well && zero_ref_mv && start_mv.row == 0 &&
+        start_mv.col == 0) {
+      return HALF_PEL;
+    }
+  }
+  return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+// Returns true when the more aggressively pruned sub-pel search should be
+// used. This requires use_adaptive_subpel_search, a non-zero quantizer band,
+// and at least one of: the full-pel search performed well, the superblock
+// source SAD is at most kLowSad, or the source variance is below 100.
+static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
+                                                bool use_adaptive_subpel_search,
+                                                bool fullpel_performed_well) {
+  if (!use_adaptive_subpel_search) return false;
+
+  const int qband = x->qindex >> (QINDEX_BITS - 2);
+  assert(qband < 4);
+  if (qband <= 0) return false;
+
+  return fullpel_performed_well ||
+         (x->content_state_sb.source_sad_nonrd <= kLowSad) ||
+         (x->source_variance < 100);
+}
+
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. The sub-pel refinement is skipped if
+ * the RD cost of the full-pel MV rate exceeds the best RD cost found so far.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[in,out] tmp_mv Pointer to best found New MV; read as
+ * the search center when use_base_mv
+ * is set
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rd_sofar RD Cost of the best mode found so far
+ * \param[in] use_base_mv Flag, indicating that tmp_mv holds
+ * specific MV to start the search with
+ *
+ * \return Returns 0 if the final MV is equal to the reference MV, otherwise
+ * returns 1. Best New MV is placed into \c tmp_mv.
+ * Rate estimation for this vector is placed to \c rate_mv
+ */
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *tmp_mv,
+ int *rate_mv, int64_t best_rd_sofar,
+ int use_base_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ MB_MODE_INFO *mi = xd->mi[0];
+ // A non-zero speed-feature step param overrides the encoder-wide default.
+ int step_param = (sf->rt_sf.fullpel_search_step_param)
+ ? sf->rt_sf.fullpel_search_step_param
+ : cpi->mv_search_params.mv_step_param;
+ FULLPEL_MV start_mv;
+ const int ref = mi->ref_frame[0];
+ const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+ MV center_mv;
+ int dis;
+ int rv = 0;
+ int cost_list[5];
+ // Always 1 here, so the sub-pel stage is gated only by rv below.
+ int search_subpel = 1;
+
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ // The search is centered on ref_mv unless the caller supplied a base MV.
+ if (!use_base_mv)
+ center_mv = ref_mv;
+ else
+ center_mv = tmp_mv->as_mv;
+
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ const unsigned int full_var_rd = av1_full_pixel_search(
+ start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+ &tmp_mv->as_fullmv, &best_mv_stats, NULL);
+
+ // calculate the bit cost on motion vector
+ MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+
+ *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+
+ // TODO(kyslov) Account for Rate Mode!
+ // rv is non-zero when the rate-only RD cost of the full-pel MV does not
+ // exceed the best RD cost so far; it only gates the sub-pel stage.
+ rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
+
+ if (rv && search_subpel) {
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ // Block-size specific thresholds on the full-pel search result.
+ const bool fullpel_performed_well =
+ (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
+ (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
+ (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127);
+ if (sf->rt_sf.reduce_mv_pel_precision_highmotion ||
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex)
+ ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv,
+ start_mv, fullpel_performed_well);
+
+ MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ // adaptively downgrade subpel search method based on block properties
+ if (use_aggressive_subpel_search_method(
+ x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
+ av1_find_best_sub_pixel_tree_pruned_more(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ else
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ // Re-estimate the rate for the refined sub-pel MV.
+ *rate_mv =
+ av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+ // The final MV can not be equal to the reference MV as this will trigger an
+ // assert later. This can happen if both NEAREST and NEAR modes were skipped.
+ // Note that this overwrites the RD-based value of rv computed above.
+ rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
+ return rv;
+}
+
+/*!\brief Searches for the best New Motion Vector.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
+ * complexity ME for non-LAST frames (in CBR mode with \c gf_temporal_ref set)
+ * or calls \c combined_motion_search otherwise
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[out] frame_mv Array that holds MVs for all modes
+ * and ref frames
+ * \param[in] ref_frame Reference frame for which to find
+ * the best New MVs
+ * \param[in] gf_temporal_ref Flag, indicating temporal reference
+ * for GOLDEN frame
+ * \param[in] bsize Current block size
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rdc Pointer to the RD Cost for the best
+ * mode found so far
+ *
+ * \return Returns -1 if the search was not done, otherwise returns 0.
+ * Best New MV is placed into \c frame_mv array, Rate estimation for this
+ * vector is placed to \c rate_mv
+ */
+static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
+ int_mv frame_mv[][REF_FRAMES],
+ MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
+ RD_STATS *best_rdc) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ AV1_COMMON *cm = &cpi->common;
+ int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame];
+ // Filled in by av1_int_pro_motion_estimation; not read in this function.
+ unsigned int y_sad_zero;
+ // Reduced-complexity path: non-LAST reference, CBR mode and gf_temporal_ref.
+ if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ gf_temporal_ref) {
+ int tmp_sad;
+ int dis;
+
+ // This path is not used for blocks smaller than 16x16.
+ if (bsize < BLOCK_16X16) return -1;
+
+ // Search range is half the block dimension in each direction.
+ int me_search_size_col = block_size_wide[bsize] >> 1;
+ int me_search_size_row = block_size_high[bsize] >> 1;
+ tmp_sad = av1_int_pro_motion_estimation(
+ cpi, x, bsize, mi_row, mi_col,
+ &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero,
+ me_search_size_col, me_search_size_row);
+
+ // Give up when the result is worse than the predicted-MV SAD of LAST_FRAME.
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+
+ // NOTE(review): mi->mv[0] is presumably set by the call above and the
+ // shifts by 3 presumably convert it to full-pel units - confirm.
+ this_ref_frm_newmv->as_int = mi->mv[0].as_int;
+ int_mv best_mv = mi->mv[0];
+ best_mv.as_mv.row >>= 3;
+ best_mv.as_mv.col >>= 3;
+ MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
+ this_ref_frm_newmv->as_mv.row >>= 3;
+ this_ref_frm_newmv->as_mv.col >>= 3;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion ||
+ cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) {
+ FULLPEL_MV start_mv = { .row = 0, .col = 0 };
+ ms_params.forced_stop =
+ subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false);
+ }
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis,
+ &x->pred_sse[ref_frame], NULL);
+ // The refined MV replaces the value stored earlier in this branch.
+ this_ref_frm_newmv->as_int = best_mv.as_int;
+
+ // When NEWMV is same as ref_mv from the drl, it is preferred to code the
+ // MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to
+ // avoid an assert failure at a later stage. The scenario can occur if
+ // NEARESTMV was not evaluated for ALTREF.
+ if (this_ref_frm_newmv->as_mv.col == ref_mv.col &&
+ this_ref_frm_newmv->as_mv.row == ref_mv.row)
+ return -1;
+
+ *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+ rate_mv, best_rdc->rdcost, 0)) {
+ // Regular path: a zero return from combined_motion_search means no NEWMV.
+ return -1;
+ }
+
+ return 0;
+}
+
+// Fills ref_costs_single[] with the estimated cost of signalling each single
+// reference frame (INTRA, LAST, GOLDEN, ALTREF) for the current block.
+// When the segment has the SEG_LVL_REF_FRAME feature active, all REF_FRAMES
+// entries are set to zero instead.
+static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
+                                            const MACROBLOCKD *xd,
+                                            const ModeCosts *mode_costs,
+                                            int segment_id, BLOCK_SIZE bsize,
+                                            unsigned int *ref_costs_single) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+    return;
+  }
+
+  const int intra_inter_ctx = av1_get_intra_inter_context(xd);
+  ref_costs_single[INTRA_FRAME] =
+      mode_costs->intra_inter_cost[intra_inter_ctx][0];
+
+  // Cost shared by every inter reference: the "is inter" flag, plus the
+  // reference-type flag when compound prediction is selectable.
+  unsigned int inter_base = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+  if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+      is_comp_ref_allowed(bsize)) {
+    const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+    inter_base += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+  }
+
+  // add cost for last, golden, altref
+  ref_costs_single[LAST_FRAME] =
+      inter_base + mode_costs->single_ref_cost[0][0][0];
+  ref_costs_single[GOLDEN_FRAME] = inter_base +
+                                   mode_costs->single_ref_cost[0][0][1] +
+                                   mode_costs->single_ref_cost[0][1][0];
+  ref_costs_single[ALTREF_FRAME] = inter_base +
+                                   mode_costs->single_ref_cost[0][0][1] +
+                                   mode_costs->single_ref_cost[0][2][0];
+}
+
+// Sets *force_skip to 1 when the block looks like a reliable transform-skip
+// candidate. Only acts when the transform mode is TX_MODE_SELECT and the
+// tx_size_level_based_on_qstep speed feature is at least 2; *force_skip is
+// left untouched in every other case.
+static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
+                                       MACROBLOCK *const x, unsigned int sse,
+                                       int *force_skip) {
+  if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT) return;
+  if (cpi->sf.rt_sf.tx_size_level_based_on_qstep < 2) return;
+
+  const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5);
+  const unsigned int qstep_sq = qstep * qstep;
+
+  // If the sse is low for low source variance blocks, mark those as
+  // transform skip.
+  // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+  // low so that reliable early estimate of tx skip can be obtained
+  // through its comparison with sse.
+  const int low_residual = sse < qstep_sq && x->source_variance < qstep_sq;
+  if (!low_residual) return;
+  if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 0) return;
+  if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 0) return;
+  *force_skip = 1;
+}
+
+// Evaluates to true when the transform mode is not ONLY_4X4 and the block is
+// larger than BLOCK_32X32, and to false otherwise.
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+// Transform size associated with the cap above (TX_16X16).
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+
+// Picks the luma transform size for a block from its residual statistics.
+//
+// cpi        - encoder instance (speed features and aq mode are read).
+// bsize      - size of the block being evaluated.
+// x          - macroblock state (tx search params, qindex, dequant values,
+//              source variance and color sensitivity are read).
+// var, sse   - variance and sum of squared error of the luma residual.
+// force_skip - set to 1 when the block qualifies as an early transform skip;
+//              never cleared here.
+//
+// The returned size never exceeds TX_16X16.
+static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *const x, unsigned int var,
+ unsigned int sse, int *force_skip) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TX_SIZE tx_size;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
+ // Defaults used when the qstep based speed feature is off: var_thresh of 0
+ // disables the "low ac" test and is_high_var of 1 leaves the aq based
+ // reduction below unconditional.
+ int multiplier = 8;
+ unsigned int var_thresh = 0;
+ unsigned int is_high_var = 1;
+ // Use quantizer based thresholds to determine transform size.
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
+ // The top two bits of qindex select one of four multipliers.
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const int mult[4] = { 8, 7, 6, 5 };
+ assert(qband < 4);
+ multiplier = mult[qband];
+ const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ var_thresh = qstep_sq * 2;
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+ // low so that reliable early estimate of tx skip can be obtained
+ // through its comparison with sse.
+ // NOTE(review): this test duplicates set_force_skip_flag().
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ *force_skip = 1;
+ // Further lower transform size based on aq mode only if residual
+ // variance is high.
+ is_high_var = (var >= var_thresh);
+ }
+ }
+ // Choose larger transform size for blocks where dc component is dominant or
+ // the ac component is low.
+ if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ else
+ tx_size = TX_8X8;
+
+ // Blocks in a boosted cyclic refresh segment drop to 8x8 (only when the
+ // residual variance is high if the level 2 feature is active); everything
+ // else is limited to 16x16.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ } else {
+ // No search: use the largest size allowed by both the block size and the
+ // transform mode.
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ }
+
+ // Blocks larger than 32x32 always use the fixed capped size, overriding the
+ // choice made above.
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
+ tx_size = TX_SIZE_FOR_BSIZE_GT32;
+
+ return AOMMIN(tx_size, TX_16X16);
+}
+
+// Walks a w x h luma block in strips that are 32 samples wide and block_size
+// rows tall, calling aom_get_var_sse_sum_8x8_quad() once per strip. Each call
+// fills four consecutive entries of sse8x8 / sum8x8 / var8x8; *sse and *sum
+// are zeroed first and handed to every call so that block totals can be
+// gathered in them.
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride, int w, int h,
+                           unsigned int *sse, int *sum, int block_size,
+                           uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+  *sse = 0;
+  *sum = 0;
+
+  // This function is called for block sizes >= BLOCK_32x32. Every kernel call
+  // covers a strip of four 8x8 blocks placed side by side, so the block has
+  // to be at least 32 samples wide and 8 samples high.
+  assert(w >= 32);
+  assert(h >= 8);
+
+  int out_idx = 0;
+  for (int y = 0; y < h; y += block_size) {
+    const uint8_t *const src_row = src + src_stride * y;
+    const uint8_t *const ref_row = ref + ref_stride * y;
+    for (int x_off = 0; x_off < w; x_off += 32, out_idx += 4) {
+      aom_get_var_sse_sum_8x8_quad(src_row + x_off, src_stride,
+                                   ref_row + x_off, ref_stride,
+                                   &sse8x8[out_idx], &sum8x8[out_idx], sse,
+                                   sum, &var8x8[out_idx]);
+    }
+  }
+}
+
+// Walks a w x h luma block in strips that are 32 samples wide and block_size
+// rows tall, calling aom_get_var_sse_sum_16x16_dual() once per strip. Each
+// call fills two consecutive entries of sse16x16 / var16x16; *sse and *sum are
+// zeroed first and handed to every call so that block totals can be gathered
+// in them.
+static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int w,
+                                      int h, unsigned int *sse, int *sum,
+                                      int block_size, uint32_t *sse16x16,
+                                      uint32_t *var16x16) {
+  *sse = 0;
+  *sum = 0;
+
+  // This function is called for block sizes >= BLOCK_32x32. Every kernel call
+  // produces two outputs for a 32-sample wide strip (presumably two 16x16
+  // blocks side by side), so the block has to be at least 32 samples wide and
+  // 16 samples high.
+  assert(w >= 32);
+  assert(h >= 16);
+
+  int out_idx = 0;
+  for (int y = 0; y < h; y += block_size) {
+    const uint8_t *const src_row = src + src_stride * y;
+    const uint8_t *const ref_row = ref + ref_stride * y;
+    for (int x_off = 0; x_off < w; x_off += 32, out_idx += 2) {
+      aom_get_var_sse_sum_16x16_dual(src_row + x_off, src_stride,
+                                     ref_row + x_off, ref_stride,
+                                     &sse16x16[out_idx], sse, sum,
+                                     &var16x16[out_idx]);
+    }
+  }
+}
+
+// Merges every 2x2 group of input units into one output unit.
+//
+// bw, bh       - log2 width / height of the whole block, in the same units
+//                as b_width_log2_lookup / b_height_log2_lookup.
+// tx_size      - transform size of the INPUT units.
+// sse_i, sum_i - per-unit sse and sum, stored row by row.
+// var_o, sse_o, sum_o - results for the merged units, stored row by row.
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+                               unsigned int *sse_i, int *sum_i,
+                               unsigned int *var_o, unsigned int *sse_o,
+                               int *sum_o) {
+  const BLOCK_SIZE in_bsize = txsize_to_bsize[tx_size];
+  const int in_w_log2 = b_width_log2_lookup[in_bsize];
+  const int in_h_log2 = b_height_log2_lookup[in_bsize];
+  const int in_cols = 1 << (bw - in_w_log2);
+  const int in_rows = 1 << (bh - in_h_log2);
+  // Shift that divides sum^2 by the sample count of a merged unit.
+  const int norm_shift = in_w_log2 + in_h_log2 + 6;
+  int out = 0;
+
+  for (int r = 0; r < in_rows; r += 2) {
+    const int top = r * in_cols;
+    const int bot = (r + 1) * in_cols;
+    for (int c = 0; c < in_cols; c += 2, ++out) {
+      sse_o[out] = sse_i[top + c] + sse_i[top + c + 1] + sse_i[bot + c] +
+                   sse_i[bot + c + 1];
+      sum_o[out] = sum_i[top + c] + sum_i[top + c + 1] + sum_i[bot + c] +
+                   sum_i[bot + c + 1];
+      const int64_t sum_sq = (int64_t)sum_o[out] * sum_o[out];
+      var_o[out] = sse_o[out] - (uint32_t)(sum_sq >> norm_shift);
+    }
+  }
+}
+
+// Returns the multiplier applied to the ac skip threshold. The threshold is
+// only scaled up when speed >= 8 and the normalized sum is below 5: by 4 for
+// frames no larger than 640x480 and by 2 for anything bigger. Otherwise the
+// factor is 1.
+static int ac_thr_factor(int speed, int width, int height, int norm_sum) {
+  const int scale_up = (speed >= 8) && (norm_sum < 5);
+  if (!scale_up) return 1;
+
+  const int is_small_frame = (width <= 640) && (height <= 480);
+  return is_small_frame ? 4 : 2;
+}
+
+// Sets early_term flag based on chroma planes prediction
+//
+// First checks, per transform block, that the luma residual passes both the
+// ac and the dc threshold tests. Only then are the U and V planes examined:
+// a plane counts as skippable when it is not color sensitive, or when its
+// own variance / sse pass the chroma thresholds. *early_term is set to 1
+// when both chroma planes are skippable and is left untouched otherwise.
+//
+// num_blk        - number of entries in sse_tx / var_tx.
+// sse_tx, var_tx - per transform block luma sse and variance.
+// sum, var, sse  - whole-block luma residual statistics.
+static INLINE void set_early_term_based_on_uv_plane(
+ AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row,
+ int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx,
+ const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
+ // Base luma thresholds: squared quantizer step divided by 64.
+ int64_t dc_thr = dc_quant * dc_quant >> 6;
+ int64_t ac_thr = ac_quant * ac_quant >> 6;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ int ac_test = 1;
+ int dc_test = 1;
+ const int norm_sum = abs(sum) >> (bw + bh);
+
+ // Scale the ac threshold, either through the denoiser specific helper or
+ // through ac_thr_factor().
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
+ ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+ norm_sum, cpi->svc.temporal_layer_id);
+ else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+
+#endif
+
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {
+ dc_thr = dc_thr << 1;
+ ac_thr = ac_thr << 2;
+ }
+
+ // Stop at the first transform block that fails either test.
+ for (int k = 0; k < num_blk; k++) {
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+ }
+
+ // Check if chroma can be skipped based on ac and dc test flags.
+ if (ac_test && dc_test) {
+ int skip_uv[2] = { 0 };
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+ // Transform skipping test in UV planes.
+ for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) {
+ // j indexes the per-plane arrays: 0 for U, 1 for V.
+ int j = plane - 1;
+ skip_uv[j] = 1;
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) {
+ skip_uv[j] = 0;
+ struct macroblock_plane *const puv = &x->plane[plane];
+ struct macroblockd_plane *const puvd = &xd->plane[plane];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(
+ bsize, puvd->subsampling_x, puvd->subsampling_y);
+ // Adjust these thresholds for UV.
+ const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3;
+ const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3;
+ const int64_t uv_dc_thr =
+ (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc;
+ const int64_t uv_ac_thr =
+ (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac;
+ // Build the chroma prediction for this plane only, then measure the
+ // residual against the source.
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ plane, plane);
+ var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+ puvd->dst.buf,
+ puvd->dst.stride, &sse_uv[j]);
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ // A failing U plane ends the loop with skip_uv[1] still 0, so the V
+ // plane is not evaluated.
+ break;
+ }
+ }
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+ }
+}
+
+// Fills in the modelled luma rate and distortion of a block.
+//
+// Nothing is written unless calculate_rd is non-zero. When *early_term is
+// set the block is treated as a skip: the rate is 0 and the distortion is
+// sse * 16. Otherwise rate and distortion come from the curve-fit model
+// evaluated on rd_stats->sse.
+static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
+                                              RD_STATS *rd_stats,
+                                              int calculate_rd, int *early_term,
+                                              BLOCK_SIZE bsize,
+                                              unsigned int sse) {
+  if (!calculate_rd) return;
+
+  if (*early_term) {
+    rd_stats->rate = 0;
+    // Widen before shifting: sse is a 32-bit unsigned value and sse << 4 can
+    // wrap for large blocks. This matches the skip path of
+    // model_rd_for_sb_y().
+    rd_stats->dist = (int64_t)sse << 4;
+    return;
+  }
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
+                        &rd_stats->rate, &rd_stats->dist);
+}
+
+// Variant of model_skip_for_sb_y_large() for blocks larger than 32x32, whose
+// transform size is already fixed at TX_16X16 (both are asserted below). The
+// 16x16 level statistics come straight from block_variance_16x16_dual().
+//
+// rd_stats     - receives sse, and rate / dist when calculate_rd is set.
+// early_term   - receives the skip decision.
+// best_sse     - best sse so far, used by the sse based early termination.
+// var_output   - optional; receives the whole-block variance.
+// var_prune_threshold - when var_output is given and the variance exceeds
+//                this value the function returns right after storing the
+//                variance, without touching rd_stats or early_term.
+static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ assert(xd->mi[0]->tx_size == TX_16X16);
+ assert(bsize > BLOCK_32X32);
+
+ // Calculate variance for whole partition, and also save 16x16 blocks'
+ // variance to be used in following transform skipping test.
+ block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
+ sse16x16, var16x16);
+
+ // The block holds 2^(bw + bh + 4) samples, hence the shift.
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ set_force_skip_flag(cpi, x, sse, early_term);
+ // The per transform block skip test is bypassed when the sse based early
+ // termination fires (only consulted when no rd is calculated) or when the
+ // block was already forced to skip above.
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ const unsigned int *sse_tx = sse16x16;
+ const unsigned int *var_tx = var16x16;
+ // 1 << (bw + bh - 2) is the count of 8x8 units; a quarter of that is the
+ // count of 16x16 units.
+ const unsigned int num_block = (1 << (bw + bh - 2)) >> 2;
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_block, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+// Models the luma residual of a large block and decides whether its
+// transform can be skipped.
+//
+// rd_stats     - receives sse, and rate / dist when calculate_rd is set.
+// early_term   - receives the skip decision.
+// best_sse     - best sse so far, used by the sse based early termination.
+// var_output   - optional; receives the whole-block variance.
+// var_prune_threshold - when var_output is given and the variance exceeds
+//                this value the function returns right after storing the
+//                variance, without touching rd_stats or early_term.
+//
+// Also writes the selected transform size to xd->mi[0]->tx_size.
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ // Forced zero-mv skip: report a free, distortion-less skip and do no
+ // further work (var_output is not written on this path).
+ if (x->force_zeromv_skip_for_blk) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ return;
+ }
+
+ // For block sizes greater than 32x32, the transform size is always 16x16.
+ // This function avoids calling calculate_variance() for tx_size 16x16 cases
+ // by directly populating variance at tx_size level from
+ // block_variance_16x16_dual() function.
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) {
+ xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32;
+ model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats,
+ early_term, calculate_rd, best_sse, var_output,
+ var_prune_threshold);
+ return;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse8x8[256] = { 0 };
+ int sum8x8[256] = { 0 };
+ unsigned int var8x8[256] = { 0 };
+ TX_SIZE tx_size;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+ // The block holds 2^(bw + bh + 4) samples, hence the shift.
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ // calculate_tx_size() may also set *early_term (its force_skip output).
+ tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
+ assert(tx_size <= TX_16X16);
+ // The code below for setting skip flag assumes transform size of at least
+ // 8x8, so force this lower limit on transform.
+ if (tx_size < TX_8X8) tx_size = TX_8X8;
+ xd->mi[0]->tx_size = tx_size;
+
+ // The per transform block skip test is bypassed when the sse based early
+ // termination fires (only consulted when no rd is calculated) or when the
+ // block was already forced to skip above.
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ unsigned int sse16x16[64] = { 0 };
+ int sum16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ // Default to the 8x8 statistics; num_blks is the count of 8x8 units.
+ const unsigned int *sse_tx = sse8x8;
+ const unsigned int *var_tx = var8x8;
+ unsigned int num_blks = 1 << (bw + bh - 2);
+
+ if (tx_size >= TX_16X16) {
+ // TX_8X8 names the size of the INPUT units; calculate_variance() merges
+ // them 2x2 into 16x16 statistics.
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ sse_tx = sse16x16;
+ var_tx = var16x16;
+ num_blks = num_blks >> 2;
+ }
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_blks, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+// Models rate and distortion of the luma residual of a block.
+//
+// rd_stats     - receives sse, rate, dist and skip_txfm.
+// var_out      - optional; receives the residual variance.
+// calculate_rd - when 0, rate and dist are set to INT_MAX placeholders.
+// early_term   - optional; only written on the forced zero-mv skip path.
+//
+// Also writes xd->mi[0]->tx_size and x->pred_sse[] for the first reference
+// frame of the block.
+static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *rd_stats, unsigned int *var_out,
+ int calculate_rd, int *early_term) {
+ // NOTE(review): unlike model_skip_for_sb_y_large() there is no return
+ // here, so the rate / dist / sse stored in this branch are overwritten at
+ // the end of the function and only *early_term survives. Confirm this is
+ // intended.
+ if (x->force_zeromv_skip_for_blk && early_term != NULL) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+
+ // Variance and sse of source against the current prediction.
+ unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
+ int force_skip = 0;
+ xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
+ if (var_out) {
+ *var_out = var;
+ }
+
+ // The curve-fit model is skipped for forced-skip inter blocks; intra blocks
+ // are always modelled when calculate_rd is set.
+ if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
+ const int bwide = block_size_wide[bsize];
+ const int bhigh = block_size_high[bsize];
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
+ &dist);
+ } else {
+ rate = INT_MAX; // this will be overwritten later with av1_block_yrd
+ dist = INT_MAX;
+ }
+ rd_stats->sse = sse;
+ // NOTE(review): sse is already an unsigned int, so this clamp looks like a
+ // no-op.
+ x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ // A forced skip on an inter block costs no rate; its distortion is
+ // sse * 16, computed in 64 bits.
+ if (force_skip && ref > INTRA_FRAME) {
+ rate = 0;
+ dist = (int64_t)sse << 4;
+ }
+
+ assert(rate >= 0);
+
+ rd_stats->skip_txfm = (rate == 0);
+ // NOTE(review): rate is an int, so this clamp looks like a no-op.
+ rate = AOMMIN(rate, INT_MAX);
+ rd_stats->rate = rate;
+ rd_stats->dist = dist;
+}
+
+// Returns the cost of signalling ref_mv_idx for the given prediction mode.
+//
+// NEWMV / NEW_NEWMV walk candidate positions 0 and 1, modes containing a
+// NEARMV component walk positions 1 and 2, and every other mode costs 0.
+// For each position that exists in the candidate list one flag is charged,
+// and the walk stops at the position that is actually selected.
+static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx,
+                               const MB_MODE_INFO_EXT *mbmi_ext,
+                               const int (*const drl_mode_cost0)[2],
+                               int8_t ref_frame_type) {
+  int first_idx;
+  if (this_mode == NEWMV || this_mode == NEW_NEWMV) {
+    first_idx = 0;
+  } else if (have_nearmv_in_inter_mode(this_mode)) {
+    first_idx = 1;
+  } else {
+    return 0;
+  }
+
+  int total = 0;
+  for (int idx = first_idx; idx < first_idx + 2; ++idx) {
+    // No flag is coded when the list has no candidate past this position.
+    if (mbmi_ext->ref_mv_count[ref_frame_type] <= idx + 1) continue;
+
+    const uint8_t drl_ctx =
+        av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+    const int is_selected = (ref_mv_idx == idx - first_idx);
+    total += drl_mode_cost0[drl_ctx][!is_selected];
+    if (is_selected) break;
+  }
+  return total;
+}
+
+// Returns the cost of signalling an inter prediction mode.
+//
+// Compound modes are looked up directly. Single reference modes are charged
+// as a chain of binary decisions: NEWMV or not, then GLOBALMV or not, then
+// NEARESTMV or not, each with its own context taken from mode_context.
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+                       int16_t mode_context) {
+  if (is_inter_compound_mode(mode)) {
+    return mode_costs
+        ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+  }
+
+  assert(is_inter_mode(mode));
+
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) return mode_costs->newmv_mode_cost[newmv_ctx][0];
+
+  int cost = mode_costs->newmv_mode_cost[newmv_ctx][1];
+
+  const int16_t zeromv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) {
+    cost += mode_costs->zeromv_mode_cost[zeromv_ctx][0];
+    return cost;
+  }
+  cost += mode_costs->zeromv_mode_cost[zeromv_ctx][1];
+
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  cost += mode_costs->refmv_mode_cost[refmv_ctx][mode != NEARESTMV];
+  return cost;
+}
+
+// Penalizes the rd cost of a mode based on its motion vector.
+//
+// For NEWMV the cost is raised when the vector is large on a big, flat,
+// non high-sad block, or when it differs a lot from the average of the
+// valid above / left neighbour vectors. For every other mode a milder bias
+// is applied at speed >= 8 to large vectors on low variance blocks.
+static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
+                            RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
+                            int mv_col, int speed, uint32_t spatial_variance,
+                            CONTENT_STATE_SB content_state_sb) {
+  if (this_mode != NEWMV) {
+    // Bias for speed >= 8 for low spatial variance.
+    const int mv_beyond_64 =
+        mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64;
+    if (speed >= 8 && spatial_variance < 150 && mv_beyond_64)
+      this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+    return;
+  }
+
+  // Large, flat blocks without high source sad: quadruple the cost of any
+  // vector beyond +/-16 and stop.
+  const int mv_beyond_16 =
+      mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16;
+  if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
+      spatial_variance < 300 && mv_beyond_16) {
+    this_rdc->rdcost = this_rdc->rdcost << 2;
+    return;
+  }
+
+  // Bias against MVs that are very different from top/left neighbors.
+  const int above_ok =
+      xd->above_mbmi && xd->above_mbmi->mv[0].as_int != INVALID_MV;
+  const int left_ok =
+      xd->left_mbmi && xd->left_mbmi->mv[0].as_int != INVALID_MV;
+
+  // Reference vector: rounded average of the valid neighbours, the single
+  // valid neighbour, or zero when neither is valid.
+  int ref_row = 0;
+  int ref_col = 0;
+  if (above_ok && left_ok) {
+    const int above_row = xd->above_mbmi->mv[0].as_mv.row;
+    const int above_col = xd->above_mbmi->mv[0].as_mv.col;
+    const int left_row = xd->left_mbmi->mv[0].as_mv.row;
+    const int left_col = xd->left_mbmi->mv[0].as_mv.col;
+    ref_row = (above_row + left_row + 1) >> 1;
+    ref_col = (above_col + left_col + 1) >> 1;
+  } else if (above_ok) {
+    ref_row = xd->above_mbmi->mv[0].as_mv.row;
+    ref_col = xd->above_mbmi->mv[0].as_mv.col;
+  } else if (left_ok) {
+    ref_row = xd->left_mbmi->mv[0].as_mv.row;
+    ref_col = xd->left_mbmi->mv[0].as_mv.col;
+  }
+
+  const int row_diff = ref_row - mv_row;
+  const int col_diff = ref_col - mv_col;
+  const int far_from_neighbors =
+      row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80;
+  if (!far_from_neighbors) return;
+
+  if (bsize >= BLOCK_32X32) {
+    this_rdc->rdcost = this_rdc->rdcost << 1;
+  } else {
+    this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+  }
+}
+
+// Adapts the rd threshold frequency factor of one (ref_frame, mode) pair.
+//
+// The factor is updated for block sizes from bsize - 3 to bsize + 6 in steps
+// of 3, clamped to [BLOCK_4X4, BLOCK_128X128]. It decays by 1/16 when the
+// pair is the chosen best mode and otherwise grows by RD_THRESH_INC up to a
+// cap derived from the adaptive_rd_thresh speed feature.
+static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize,
+                                           MV_REFERENCE_FRAME ref_frame,
+                                           THR_MODES best_mode_idx,
+                                           PREDICTION_MODE mode) {
+  const THR_MODES this_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  const int is_best_mode = (this_mode_idx == best_mode_idx);
+  const BLOCK_SIZE first_size = AOMMAX(bsize - 3, BLOCK_4X4);
+  const BLOCK_SIZE last_size = AOMMIN(bsize + 6, BLOCK_128X128);
+
+  BLOCK_SIZE bs = first_size;
+  while (bs <= last_size) {
+    int *const fact = &x->thresh_freq_fact[bs][this_mode_idx];
+    if (is_best_mode) {
+      *fact -= (*fact >> 4);
+    } else {
+      *fact = AOMMIN(*fact + RD_THRESH_INC,
+                     cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+    }
+    bs += 3;
+  }
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// Captures the outcome of the mode search in the denoiser context.
+//
+// The cost table and motion vector table are stored by pointer, not copied,
+// so the caller's arrays have to stay alive for as long as ctx_den is used.
+static void av1_pickmode_ctx_den_update(
+    AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
+    unsigned int ref_frame_cost[REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred,
+    BEST_PICKMODE *bp) {
+  // Best mode found by the search.
+  ctx_den->best_mode = bp->best_mode;
+  ctx_den->best_ref_frame = bp->best_ref_frame;
+  ctx_den->best_tx_size = bp->best_tx_size;
+  ctx_den->best_pred_filter = bp->best_pred_filter;
+  ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
+
+  // Search state that is needed again after denoising.
+  ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+  ctx_den->ref_frame_cost = ref_frame_cost;
+  ctx_den->frame_mv = frame_mv;
+  ctx_den->reuse_inter_pred = reuse_inter_pred;
+}
+
+// Re-evaluates GLOBALMV on LAST_FRAME after the denoiser has run and keeps
+// whichever of that candidate and the previous best mode has the lower rd
+// cost.
+//
+// mi            - mode info of the block; overwritten with the candidate and
+//                 restored from best_pickmode if the candidate loses.
+// decision      - decision reported by the denoiser for this block.
+// ctx_den       - state saved by av1_pickmode_ctx_den_update();
+//                 best_ref_frame is updated when the candidate wins.
+// yv12_mb       - per reference frame prediction source buffers.
+// best_rdc      - rd stats of the best mode; replaced when the candidate
+//                 wins.
+// best_pickmode - best mode found by the search on the original source.
+static void recheck_zeromv_after_denoising(
+ AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+ AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den,
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+ // denoised result. Only do this under noise conditions, and if rdcost of
+ // ZEROMV on original source is not significantly higher than rdcost of best
+ // mode.
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
+ ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+ ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+ (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+ cpi->svc.number_spatial_layers == 1 &&
+ decision == FILTER_ZEROMV_BLOCK))) {
+ // Check if we should pick ZEROMV on denoised signal.
+ AV1_COMMON *const cm = &cpi->common;
+ RD_STATS this_rdc;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+
+ // Set up the candidate: GLOBALMV, single reference LAST_FRAME, zero motion
+ // vector, regular 8-tap filter.
+ mi->mode = GLOBALMV;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ unsigned int var;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
+
+ // Add the mode and reference frame signalling cost to the modelled rate.
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);
+
+ this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ txfm_info->skip_txfm = this_rdc.skip_txfm;
+ // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is higher than best_ref mode (on original source).
+ if (this_rdc.rdcost > best_rdc->rdcost) {
+ // Candidate lost: restore the previous best mode in mi.
+ this_rdc = *best_rdc;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ mi->mv[0].as_int = INVALID_MV;
+ } else {
+ mi->mv[0].as_int = ctx_den
+ ->frame_mv[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ // The entry condition only admits INTRA or GOLDEN as best reference,
+ // so a non-intra best mode here refers to GOLDEN_FRAME; rebuild its
+ // prediction when predictions are being reused.
+ if (ctx_den->reuse_inter_pred) {
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ }
+ }
+ mi->tx_size = best_pickmode->best_tx_size;
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ } else {
+ // Candidate won: record LAST_FRAME as best reference and adopt its
+ // stats.
+ ctx_den->best_ref_frame = LAST_FRAME;
+ *best_rdc = this_rdc;
+ }
+ }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+/*!\brief Searches for the best interpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR,
+ * EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects
+ * the one that gives lowest RD cost. RD cost is calculated using curvfit model.
+ * Support for dual filters (different filters in the x & y directions) is
+ * allowed if sf.interp_sf.disable_dual_filter = 0.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[out] this_rdc Pointer to calculated RD Cost
+ * \param[in] inter_pred_params_sr Pointer to structure holding parameters of
+ inter prediction for single reference
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[in] tmp_buffer Pointer to a temporary buffer for
+ * prediction re-use
+ * \param[in] bsize Current block size
+ * \param[in] reuse_inter_pred Flag, indicating prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[out] this_early_term Flag, indicating that transform can be
+ * skipped
+ * \param[out] var The residue variance of the current
+ * predictor.
+ * \param[in] use_model_yrd_large Flag, indicating special logic to handle
+ * large blocks
+ * \param[in] best_sse Best sse so far.
+ * \param[in] is_single_pred Flag, indicating single mode.
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c this_rdc and best filter is placed to \c mi->interp_filters. In case
+ * \c reuse_inter_pred flag is set, this function also outputs
+ * \c this_mode_pred. Also \c this_early_term is set if transform can be
+ * skipped
+ */
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+ InterPredParams *inter_pred_params_sr, int mi_row,
+ int mi_col, PRED_BUFFER *tmp_buffer,
+ BLOCK_SIZE bsize, int reuse_inter_pred,
+ PRED_BUFFER **this_mode_pred,
+ int *this_early_term, unsigned int *var,
+ int use_model_yrd_large, int64_t best_sse,
+ int is_single_pred) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ // dim_factor is only used in the assert and in the final rebuild check
+ // below.
+ int dim_factor =
+ (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+ // Per-candidate results, indexed like filters_ref_set.
+ RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ PRED_BUFFER *current_pred = *this_mode_pred;
+ int best_skip = 0;
+ int best_early_term = 0;
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+
+ SubpelParams subpel_params;
+ // Initialize inter prediction params at mode level for single reference
+ // mode.
+ if (is_single_pred)
+ init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE;
+ ++filter_idx) {
+ int64_t cost;
+ // With dual filters disabled, skip every candidate whose x and y filters
+ // differ.
+ if (cpi->sf.interp_sf.disable_dual_filter &&
+ filters_ref_set[filter_idx].as_filters.x_filter !=
+ filters_ref_set[filter_idx].as_filters.y_filter)
+ continue;
+
+ // Build the luma prediction with this candidate filter.
+ mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int;
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ unsigned int curr_var = UINT_MAX;
+ // The UINT_MAX prune threshold means the large-block model never returns
+ // early on variance. The other model is called with a NULL early_term, so
+ // *this_early_term keeps the caller's value on that path.
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[filter_idx], this_early_term, 1,
+ best_sse, &curr_var, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var,
+ 1, NULL);
+ pf_rd_stats[filter_idx].rate += av1_get_switchable_rate(
+ x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate,
+ pf_rd_stats[filter_idx].dist);
+ // Both models write mi->tx_size; remember it per candidate.
+ pf_tx_size[filter_idx] = mi->tx_size;
+ if (cost < best_cost) {
+ *var = curr_var;
+ best_filter_index = filter_idx;
+ best_cost = cost;
+ best_skip = pf_rd_stats[filter_idx].skip_txfm;
+ best_early_term = *this_early_term;
+ if (reuse_inter_pred) {
+ // Keep the buffer holding the new best prediction in *this_mode_pred
+ // (releasing the previous best) and direct the next candidate to a
+ // fresh buffer.
+ if (*this_mode_pred != current_pred) {
+ free_pred_buffer(*this_mode_pred);
+ *this_mode_pred = current_pred;
+ }
+ current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+ }
+ assert(best_filter_index >= 0 &&
+ best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
+ // Release the scratch buffer that does not hold the best prediction.
+ if (reuse_inter_pred && *this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ // Publish the winning candidate.
+ mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int;
+ mi->tx_size = pf_tx_size[best_filter_index];
+ this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+ this_rdc->dist = pf_rd_stats[best_filter_index].dist;
+ this_rdc->sse = pf_rd_stats[best_filter_index].sse;
+ this_rdc->skip_txfm = (best_skip || best_early_term);
+ *this_early_term = best_early_term;
+ if (reuse_inter_pred) {
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = (*this_mode_pred)->stride;
+ } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
+ // Without buffer re-use the destination holds the prediction of the last
+ // candidate evaluated, so rebuild it with the best filter (selected in
+ // mi->interp_filters above).
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+}
+#if !CONFIG_REALTIME_ONLY
+
+// Returns 1 when WARPED_CAUSAL may be searched for this block and 0
+// otherwise. Warping is ruled out by the extra_prune_warped speed feature,
+// by compound prediction, and when the frame does not allow switchable
+// motion modes.
+static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi,
+                                             MACROBLOCK *const x,
+                                             const MB_MODE_INFO *mbmi) {
+  if (cpi->sf.inter_sf.extra_prune_warped || has_second_ref(mbmi)) return 0;
+
+  // Without switchable motion modes only SIMPLE_TRANSLATION is available.
+  const FeatureFlags *const frame_features = &cpi->common.features;
+  if (!frame_features->switchable_motion_mode) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MOTION_MODE highest_allowed = motion_mode_allowed(
+      xd->global_motion, xd, mbmi, frame_features->allow_warped_motion);
+  return highest_allowed == WARPED_CAUSAL;
+}
+
+// Sets mi->num_proj_ref. The value is 1 unless WARPED_CAUSAL is allowed for
+// the block, in which case it is the sample count cached for the block's
+// first reference frame. That count is computed with av1_findSamples() the
+// first time it is needed, which is signalled by a negative cached value.
+static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  mi->num_proj_ref = 1;
+
+  // Without switchable motion modes only SIMPLE_TRANSLATION is available.
+  if (!cm->features.switchable_motion_mode) return;
+
+  const MOTION_MODE highest_allowed = motion_mode_allowed(
+      xd->global_motion, xd, mi, cm->features.allow_warped_motion);
+  if (highest_allowed != WARPED_CAUSAL) return;
+
+  WARP_SAMPLE_INFO *const samples = &x->warp_sample_info[mi->ref_frame[0]];
+  if (samples->num < 0) {
+    samples->num = av1_findSamples(cm, xd, samples->pts, samples->pts_inref);
+  }
+  mi->num_proj_ref = samples->num;
+}
+
+// Evaluates SIMPLE_TRANSLATION and, when is_warped_mode_allowed() permits it
+// and projection samples exist, WARPED_CAUSAL for the current block using the
+// model-based luma RD estimate. The cheaper candidate is left in xd->mi[0].
+//
+// Outputs:
+//   this_rdc         rate, dist, sse and skip_txfm of the winning candidate.
+//   this_early_term  value of *this_early_term captured when the winning
+//                    candidate was evaluated.
+//   rate_mv          overwritten when the warped MV refinement changes a
+//                    NEWMV vector (even if the warped candidate later loses).
+static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               int *this_early_term, int use_model_yrd_large,
+                               int *rate_mv, int64_t best_sse) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const FeatureFlags *const features = &cm->features;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_mode_index = -1;
+  const int interp_filter = features->interp_filter;
+
+  // Candidates in evaluation order; index 1 is only reached when allowed.
+  const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
+    SIMPLE_TRANSLATION, WARPED_CAUSAL
+  };
+  int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1;
+
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+
+  const int total_samples = mi->num_proj_ref;
+  if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
+    mode_search_size = 1;
+  }
+
+  // Every candidate starts from the same snapshot of the mode info.
+  const MB_MODE_INFO base_mbmi = *mi;
+  MB_MODE_INFO best_mbmi;
+
+  for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) {
+    int64_t cost = INT64_MAX;
+    MOTION_MODE motion_mode = motion_modes[mode_index];
+    *mi = base_mbmi;
+    mi->motion_mode = motion_mode;
+    if (motion_mode == SIMPLE_TRANSLATION) {
+      mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Luma-only predictor is enough for the Y model RD estimate.
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      if (use_model_yrd_large)
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                  &pf_rd_stats[mode_index], this_early_term, 1,
+                                  best_sse, NULL, UINT_MAX);
+      else
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1,
+                          NULL);
+      pf_rd_stats[mode_index].rate +=
+          av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+                                  cm->seq_params->enable_dual_filter);
+      cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+                    pf_rd_stats[mode_index].dist);
+    } else if (motion_mode == WARPED_CAUSAL) {
+      int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+      const ModeCosts *mode_costs = &x->mode_costs;
+      mi->wm_params.wmtype = DEFAULT_WMTYPE;
+      mi->interp_filters =
+          av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+      // Work on local copies so sample selection leaves the cache untouched.
+      memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+      memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+      // Select the samples according to motion vector difference
+      if (mi->num_proj_ref > 1) {
+        mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
+                                             mi->num_proj_ref, bsize);
+      }
+
+      // Compute the warped motion parameters with a least squares fit
+      // using the collected samples
+      if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
+                               mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
+                               &mi->wm_params, mi_row, mi_col)) {
+        if (mi->mode == NEWMV) {
+          const int_mv mv0 = mi->mv[0];
+          const WarpedMotionParams wm_params0 = mi->wm_params;
+          const int num_proj_ref0 = mi->num_proj_ref;
+
+          const int_mv ref_mv = av1_get_ref_mv(x, 0);
+          SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+          av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+                                            &ref_mv.as_mv, NULL);
+
+          // Refine MV in a small range.
+          av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+                               total_samples, cpi->sf.mv_sf.warp_search_method,
+                               cpi->sf.mv_sf.warp_search_iters);
+          // A refined MV equal to the reference MV drops this candidate.
+          if (mi->mv[0].as_int == ref_mv.as_int) {
+            continue;
+          }
+
+          if (mv0.as_int != mi->mv[0].as_int) {
+            // Keep the refined MV and WM parameters.
+            int tmp_rate_mv = av1_mv_bit_cost(
+                &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+                x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+            *rate_mv = tmp_rate_mv;
+          } else {
+            // Restore the old MV and WM parameters.
+            mi->mv[0] = mv0;
+            mi->wm_params = wm_params0;
+            mi->num_proj_ref = num_proj_ref0;
+          }
+        }
+        // Build the warped predictor
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                      AOM_PLANE_Y, av1_num_planes(cm) - 1);
+        if (use_model_yrd_large)
+          model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                    &pf_rd_stats[mode_index], this_early_term,
+                                    1, best_sse, NULL, UINT_MAX);
+        else
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL,
+                            1, NULL);
+
+        pf_rd_stats[mode_index].rate +=
+            mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+        cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+                      pf_rd_stats[mode_index].dist);
+      } else {
+        // Projection fit was rejected: never pick this candidate.
+        cost = INT64_MAX;
+      }
+    }
+    if (cost < best_cost) {
+      best_mode_index = mode_index;
+      best_cost = cost;
+      best_skip = pf_rd_stats[mode_index].skip_txfm;
+      best_early_term = *this_early_term;
+      best_mbmi = *mi;
+    }
+  }
+  // best_mode_index indexes pf_rd_stats[], which holds
+  // MOTION_MODE_SEARCH_SIZE entries.
+  assert(best_mode_index >= 0 && best_mode_index < MOTION_MODE_SEARCH_SIZE);
+
+  *mi = best_mbmi;
+  this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+  this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+  this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
+  *this_early_term = best_early_term;
+  // When the winner is not the last candidate in motion_modes[], the
+  // prediction buffer may hold a later candidate; rebuild the luma predictor
+  // from the restored mode info.
+  if (best_mode_index < MOTION_MODE_SEARCH_SIZE - 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+                                  AOM_PLANE_Y, AOM_PLANE_Y);
+  }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#define COLLECT_NON_SQR_STAT 0
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+
+// Prints one stage's accumulated time and its share of total_time in percent.
+// The 64-bit counter is cast to long long and printed with %lld so the format
+// matches the argument on every data model (int64_t is not always long).
+static AOM_INLINE void print_stage_time(const char *stage_name,
+                                        int64_t stage_time,
+                                        int64_t total_time) {
+  printf("    %s: %lld (%f%%)\n", stage_name, (long long)stage_time,
+         100 * stage_time / (float)total_time);
+}
+
+// Dumps the statistics gathered in ms_stat. Output is produced only for the
+// block whose bottom and right edges reach or pass mi_rows / mi_cols.
+// Non-square block sizes are listed only when COLLECT_NON_SQR_STAT is set.
+// Time counters are cast to long long and printed with %lld so the format
+// matches the argument regardless of how int64_t is defined.
+static void print_time(const mode_search_stat_nonrd *const ms_stat,
+                       BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row,
+                       int mi_col) {
+  if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
+      (mi_col + mi_size_wide[bsize] >= mi_cols)) {
+    int64_t total_time = 0;
+    int32_t total_blocks = 0;
+    for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+      total_time += ms_stat->total_block_times[bs];
+      total_blocks += ms_stat->num_blocks[bs];
+    }
+
+    printf("\n");
+    for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+      // Block sizes that were never coded are skipped; this also keeps the
+      // average below from dividing by zero.
+      if (ms_stat->num_blocks[bs] == 0) {
+        continue;
+      }
+      if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
+        continue;
+      }
+
+      printf("BLOCK_%dX%d Num %d, Time: %lld (%f%%), Avg_time %f:\n",
+             block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
+             (long long)ms_stat->total_block_times[bs],
+             100 * ms_stat->total_block_times[bs] / (float)total_time,
+             (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
+      for (int j = 0; j < MB_MODE_COUNT; j++) {
+        if (ms_stat->nonskipped_search_times[bs][j] == 0) {
+          continue;
+        }
+
+        int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
+        printf("  Mode %d, %d/%d tps %f\n", j,
+               ms_stat->num_nonskipped_searches[bs][j],
+               ms_stat->num_searches[bs][j],
+               ms_stat->num_nonskipped_searches[bs][j] > 0
+                   ? (float)ms_stat->nonskipped_search_times[bs][j] /
+                         ms_stat->num_nonskipped_searches[bs][j]
+                   : 0.0f);
+        if (j >= INTER_MODE_START) {
+          // For inter modes the total is the sum of the per-stage timers.
+          total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
+                            ms_stat->model_rd_time[bs][j] +
+                            ms_stat->txfm_time[bs][j];
+          print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
+                           total_time);
+          print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
+                           total_time);
+          print_stage_time("Model    RD   Time", ms_stat->model_rd_time[bs][j],
+                           total_time);
+          print_stage_time("Tranfm Search Time", ms_stat->txfm_time[bs][j],
+                           total_time);
+        }
+        print_stage_time("Total  Mode   Time", total_mode_time, total_time);
+      }
+      printf("\n");
+    }
+    printf("Total time = %lld. Total blocks = %d\n", (long long)total_time,
+           total_blocks);
+  }
+}
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+// Tells whether this_mode is a pruning candidate based on the modes of the
+// above and left neighbors. Returns true only when the feature is enabled,
+// this_mode is not DC_PRED, both neighbors are available, and this_mode
+// differs from each neighbor's mode.
+static bool should_prune_intra_modes_using_neighbors(
+    const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors,
+    PREDICTION_MODE this_mode, PREDICTION_MODE above_mode,
+    PREDICTION_MODE left_mode) {
+  // DC_PRED is never pruned: per the nonrd intra statistics it is the most
+  // probable winner.
+  if (!enable_intra_mode_pruning_using_neighbors || this_mode == DC_PRED) {
+    return false;
+  }
+
+  const bool differs_from_above =
+      xd->up_available && this_mode != above_mode;
+  const bool differs_from_left =
+      xd->left_available && this_mode != left_mode;
+  return differs_from_above && differs_from_left;
+}
+
+// Picks the luma intra prediction mode for the block in the non-RD path.
+// Each candidate in intra_mode_list is costed with av1_estimate_block_intra;
+// the winner is written to xd->mi[0] (chroma is fixed to UV_DC_PRED), its
+// stats are copied to *rd_cost, and the coding context is stored in ctx.
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ RD_STATS this_rdc, best_rdc;
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ // Transform size: the smaller of the block's maximum and the largest size
+ // permitted by the current tx mode search type.
+ mi->tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4));
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size];
+
+ // If the current block size is the same as the transform block size, enable
+ // mode pruning based on the best SAD so far.
+ if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize)
+ args.prune_mode_based_on_sad = true;
+
+ int *bmode_costs;
+ PREDICTION_MODE best_mode = DC_PRED;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ const unsigned int source_variance = x->source_variance;
+ // Mode cost table selected by the contexts of the above and left modes.
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ av1_invalid_rd_stats(&best_rdc);
+ av1_invalid_rd_stats(&this_rdc);
+
+ init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+ mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) {
+ PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+
+ // As per the statistics generated for intra mode evaluation in the nonrd
+ // path, it is found that the probability of H_PRED mode being the winner is
+ // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED).
+ // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply
+ // the presence of a vertically dominant pattern. Hence, H_PRED mode is not
+ // evaluated.
+ if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far &&
+ this_mode == H_PRED && best_mode == V_PRED)
+ continue;
+
+ if (should_prune_intra_modes_using_neighbors(
+ xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors,
+ this_mode, A, L)) {
+ // Prune V_PRED and H_PRED if source variance of the block is less than
+ // or equal to 50. The source variance threshold is obtained empirically.
+ if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50)
+ continue;
+
+ // As per the statistics, probability of SMOOTH_PRED being the winner is
+ // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and
+ // H_PRED). Hence, SMOOTH_PRED mode is not evaluated.
+ if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue;
+ }
+
+ // Reset the per-candidate accumulators before estimating this mode.
+ this_rdc.dist = this_rdc.rate = 0;
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->mode = this_mode;
+ av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y,
+ av1_estimate_block_intra, &args);
+
+ // A rate of INT_MAX marks a candidate that must not be considered.
+ if (this_rdc.rate == INT_MAX) continue;
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ // When skippable, the estimated rate is replaced (not incremented) by the
+ // cost of signalling skip; otherwise the no-skip cost is added on top.
+ if (args.skippable) {
+ this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ } else {
+ this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ }
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ if (!this_rdc.skip_txfm) {
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+
+ mi->mode = best_mode;
+ // Keep DC for UV since mode test is based on Y channel only.
+ mi->uv_mode = UV_DC_PRED;
+ *rd_cost = best_rdc;
+
+ // For lossless: always force the skip flags off.
+ // Even though the blk_skip is set to 0 above in the rdcost comparison,
+ // do it here again in case the above logic changes.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ x->txfm_search_info.skip_txfm = 0;
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+
+ // store_coding_context_nonrd takes the chosen mode as an extra argument
+ // only when internal stats are compiled in.
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+// Returns 1 when the LAST and GOLDEN reference scale factors have identical
+// x_scale_fp and y_scale_fp, 0 otherwise.
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+  const struct scale_factors *const last_scale =
+      get_ref_scale_factors(cm, LAST_FRAME);
+  const struct scale_factors *const golden_scale =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+
+  if (last_scale->x_scale_fp != golden_scale->x_scale_fp) return 0;
+  return last_scale->y_scale_fp == golden_scale->y_scale_fp;
+}
+
+// Decides which of LAST_FRAME, GOLDEN_FRAME and ALTREF_FRAME are searched for
+// this block and writes the 0/1 decisions into use_ref_frame[].
+// *force_skip_low_temp_var is written only when the short-circuit low
+// temporal variance path below is taken; otherwise the caller's value is kept.
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int gf_temporal_ref,
+ int use_ref_frame[],
+ int *force_skip_low_temp_var) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+
+ // When the ref_frame_config is used to set the reference frame structure
+ // then the usage of alt_ref is determined by the ref_frame_flags
+ // (and not the speed feature use_nonrd_altref_frame).
+ int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame;
+
+ int use_golden_ref_frame = 1;
+ int use_last_ref_frame = 1;
+
+ // When the ref_frame_config is used to set the reference frame structure:
+ // check if LAST is used as a reference. And only remove golden and altref
+ // references below if last is used as a reference.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ use_last_ref_frame =
+ cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
+
+ // frames_since_golden is not used when the user sets the reference structure.
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
+ cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+ use_golden_ref_frame = 0;
+ }
+
+ if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+ x->nonrd_prune_ref_frame_search) {
+ if (is_small_sb)
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ else
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set, skip the golden and altref references.
+ if (*force_skip_low_temp_var) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+ }
+
+ // Aggressive pruning levels, forced zero-MV skip, or moderate pruning on
+ // blocks larger than 64x64 drop both non-LAST references.
+ if (use_last_ref_frame &&
+ (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+
+ // A segment that pins the reference to GOLDEN_FRAME overrides the pruning
+ // decisions made above.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+ use_golden_ref_frame = 1;
+ use_alt_ref_frame = 0;
+ }
+
+ // Skip golden/altref reference if color is set, on flat blocks with motion.
+ // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set)
+ // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
+ // may be set in the variance partition when golden is a much better
+ // reference than last, in which case it may not be worth skipping
+ // golden/altref completely.
+ // Condition on use_last_ref to make sure there remains at least one
+ // reference.
+ if (use_last_ref_frame &&
+ ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->nonrd_prune_ref_frame_search != 0) ||
+ (x->source_variance < 200 &&
+ x->content_state_sb.source_sad_nonrd >= kLowSad))) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_golden_ref_frame = 0;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_alt_ref_frame = 0;
+ }
+
+ // For non-screen: if golden and altref are not being selected as references
+ // (use_golden_ref_frame/use_alt_ref_frame = 0) check to allow golden back
+ // based on the sad of nearest/nearmv of LAST ref. If this block sad is large,
+ // keep golden as reference. Only do this for the aggressive pruning mode and
+ // avoid it when color is set for golden reference.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+ !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+ x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
+ int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
+ // Normalize the SAD by the shift derived from the block width/height log2
+ // lookups before comparing against the threshold.
+ int pred = x->pred_mv_sad[LAST_FRAME] >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (pred > thr) use_golden_ref_frame = 1;
+ }
+
+ // A reference can only be used if its flag is set in ref_frame_flags.
+ use_alt_ref_frame =
+ cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+ use_golden_ref_frame =
+ cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+ // For spatial layers: enable golden ref if it is set by user and
+ // corresponds to the lower spatial layer.
+ if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+ x->content_state_sb.source_sad_nonrd < kHighSad) {
+ const int buffslot_golden =
+ cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+ if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] ==
+ cpi->svc.current_superframe)
+ use_golden_ref_frame = 1;
+ }
+
+ use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+ use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+ use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+ // Keep this assert on, as only 3 references are used in nonrd_pickmode
+ // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this
+ // frame must be an intra-only frame and hence should never enter the
+ // pickmode here for inter frames.
+ assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
+}
+
+// Decides whether interpolation filter search runs for this block.
+// Returns 0 when the search is disabled, 1 when it is enabled purely from
+// mode properties or by the chessboard / boosted-segment rule, and 2 when
+// the neighbors give no usable filter prediction. *filt_select is written
+// only on the chessboard path, and only when the left neighbor's x filter is
+// EIGHTTAP_SMOOTH.
+static AOM_INLINE int is_filter_search_enabled_blk(
+    AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize,
+    int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) {
+  const AV1_COMMON *const common = &cpi->common;
+
+  // Search switched off by the speed feature.
+  if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0;
+  // No chessboard prediction: decide from mode properties alone.
+  if (!cb_pred_filter_search) return 1;
+
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  const MB_MODE_INFO *const left = mbd->left_mbmi;
+  const MB_MODE_INFO *const above = mbd->above_mbmi;
+
+  // Neighbor information missing.
+  if (!left || !above) return 2;
+  // At least one neighbor is intra coded.
+  if (!is_inter_block(left) || !is_inter_block(above)) return 2;
+  // The two neighbors disagree on the filter pair.
+  if (left->interp_filters.as_int != above->interp_filters.as_int) return 2;
+  // Level 1 additionally requires the regular filter on the left neighbor.
+  if (cb_pred_filter_search == 1 &&
+      left->interp_filters.as_filters.x_filter != EIGHTTAP_REGULAR) {
+    return 2;
+  }
+
+  // Predict from the neighbors and alternate the search in a chessboard
+  // pattern across blocks and frames.
+  if (left->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH) {
+    *filt_select = EIGHTTAP_SMOOTH;
+  }
+  const int width_log2 = mi_size_wide_log2[bsize];
+  const int parity =
+      (((mi_row + mi_col) >> width_log2) +
+       get_chessboard_index(common->current_frame.frame_number)) &
+      0x1;
+  if (cyclic_refresh_segment_id_boosted(segment_id)) return 1;
+  return parity != 0;
+}
+
+// Returns 1 when a mode with a non-zero MV should be skipped because
+// rd_less_than_thresh() holds for best_cost against the (scaled) RD threshold
+// of this mode/reference pair; returns 0 otherwise.
+static AOM_INLINE int skip_mode_by_threshold(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
+    int frames_since_golden, const int *const rd_threshes,
+    const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
+    int extra_shift) {
+  const THR_MODES thr_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
+
+  // One extra doubling of the base threshold when the best mode so far skips.
+  const int base_shift = best_skip ? extra_shift + 1 : extra_shift;
+  int64_t scaled_thresh = ((int64_t)rd_threshes[thr_index]) << base_shift;
+
+  // Increase the threshold for non-LAST references for improved encoding
+  // speed.
+  if (ref_frame != LAST_FRAME) {
+    scaled_thresh <<= 1;
+    if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4) {
+      scaled_thresh <<= (extra_shift + 1);
+    }
+  }
+
+  return rd_less_than_thresh(best_cost, scaled_thresh,
+                             rd_thresh_freq_fact[thr_index]) &&
+         mv.as_int != 0;
+}
+
+// Returns 1 when the mode should be skipped because force_skip_low_temp_var
+// is set, 0 otherwise. Nothing is skipped while the flag is clear.
+static AOM_INLINE int skip_mode_by_low_temp(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
+  if (!force_skip_low_temp_var) return 0;
+
+  // Non-zero MV candidates on non-LAST references are skipped. If nearestmv
+  // for golden frame is 0, zeromv mode will be skipped later.
+  const int nonzero_mv_on_non_last = ref_frame != LAST_FRAME && mv.as_int != 0;
+  if (nonzero_mv_on_non_last) return 1;
+
+  // NEWMV is skipped on blocks of 64x64 and above unless the superblock
+  // source SAD is kHighSad.
+  return content_state_sb.source_sad_nonrd != kHighSad &&
+         bsize >= BLOCK_64X64 && mode == NEWMV;
+}
+
+// Returns 1 when the (mode, ref_frame) candidate is pruned based on block
+// size, reference frame, the normalized zero-MV SSE and the pruning levels;
+// returns 0 otherwise.
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+    int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+  const unsigned int golden_skip_sse_limit = 500;
+  const int non_last = ref_frame != LAST_FRAME;
+  const int is_newmv = mode == NEWMV;
+  const int is_nearmv = mode == NEARMV;
+
+  // NEWMV on a non-LAST reference is dropped when the zero-MV SSE is small.
+  if (non_last && sse_zeromv_norm < golden_skip_sse_limit && is_newmv) {
+    return 1;
+  }
+
+  // NEWMV is never searched on 128x128 blocks.
+  if (bsize == BLOCK_128X128 && is_newmv) return 1;
+
+  // The remaining rules apply only when extra pruning is requested.
+  if (!extra_prune) return 0;
+
+  if (extra_prune > 1 && non_last && bsize > BLOCK_16X16 && is_newmv) {
+    return 1;
+  }
+  if (non_last && is_nearmv) return 1;
+  if (more_prune && bsize >= BLOCK_32X32 && is_nearmv) return 1;
+
+  return 0;
+}
+
+// Computes the luma SAD between the source block and yv12_mb and sets
+// x->block_is_zero_sad when that SAD is zero. The flag is never cleared here.
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                 struct buf_2d *yv12_mb) {
+  struct macroblock_plane *const luma_plane = &x->plane[0];
+  const int luma_sad = cpi->ppi->fn_ptr[bsize].sdf(
+      luma_plane->src.buf, luma_plane->src.stride, yv12_mb->buf,
+      yv12_mb->stride);
+  if (!luma_sad) {
+    x->block_is_zero_sad = 1;
+  }
+}
+
+// Updates x->color_sensitivity[] for the U and V planes of the current block.
+// At superblock size the existing level 2 is resolved to 0/1 from the
+// superblock source SAD; for smaller blocks the chroma SAD between the source
+// and yv12_mb[plane] is compared against thresholds derived from y_sad and
+// source_variance.
+static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int y_sad,
+ unsigned int source_variance,
+ struct buf_2d yv12_mb[MAX_MB_PLANE]) {
+ const int subsampling_x = cpi->common.seq_params->subsampling_x;
+ const int subsampling_y = cpi->common.seq_params->subsampling_y;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+ if (bsize == cpi->common.seq_params->sb_size) {
+ // At superblock level color_sensitivity is already set to 0, 1, or 2.
+ // 2 is middle/uncertain level. To avoid additional sad
+ // computations when bsize = sb_size force level 2 to 1 (certain color)
+ // for motion areas.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ return;
+ }
+ // Default thresholds; adjusted below for motion, resolution and screen
+ // content.
+ int shift = 3;
+ unsigned int source_var_thr = 50;
+ int uv_sad_thr = 100;
+ // NOTE(review): this reads x->source_variance while the other checks in
+ // this function use the source_variance parameter - confirm they are meant
+ // to be the same value.
+ if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
+ shift = 4;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (cpi->rc.high_source_sad) shift = 6;
+ if (source_sad_nonrd > kMedSad) {
+ source_var_thr = 1200;
+ uv_sad_thr = 10;
+ }
+ }
+ NOISE_LEVEL noise_level = kLow;
+ int norm_sad =
+ y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000;
+ // If the spatial source variance is high and the normalized y_sad
+ // is low, then y-channel is likely good for mode estimation, so keep
+ // color_sensitivity off. For low noise content for now, since there is
+ // some bdrate regression for noisy color clip.
+ if (cpi->noise_estimate.enabled)
+ noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kLow && source_variance > thresh_spatial &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0;
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0;
+ return;
+ }
+ const int num_planes = av1_num_planes(&cpi->common);
+
+ for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+ // Always check if level = 2. If level = 0 check again for
+ // motion areas for higher resolns, where color artifacts
+ // are more noticeable.
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
+ (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+ source_sad_nonrd >= kMedSad && high_res)) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+
+ const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+
+ const int norm_uv_sad =
+ uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+ // Sensitive when the chroma SAD is large both relative to the scaled
+ // luma SAD and in normalized terms.
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] =
+ uv_sad > (y_sad >> shift) && norm_uv_sad > 40;
+ // Low source variance with a large normalized chroma SAD forces the
+ // level to 1 regardless of the test above.
+ if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr)
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1;
+ }
+ }
+}
+
+// Prepares the compound reference pair rf[] for prediction: sets up the
+// prediction block of rf[1] when use_ref_frame_mask[] shows it was not set
+// up already, rebuilds the reference MV list for the compound reference
+// type, and stores mbmi->ref_mv_idx + 1 in *ref_mv_idx.
+static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
+                                      struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+                                      const int *use_ref_frame_mask,
+                                      const MV_REFERENCE_FRAME *rf,
+                                      int *ref_mv_idx) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  MB_MODE_INFO *const cur_mi = mbd->mi[0];
+  MB_MODE_INFO_EXT *const ext = &x->mbmi_ext;
+  const MV_REFERENCE_FRAME second_ref = rf[1];
+
+  if (!use_ref_frame_mask[second_ref]) {
+    // Need to setup pred_block, if it hasn't been done in find_predictors.
+    const YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cm, second_ref);
+    const int plane_count = av1_num_planes(cm);
+    if (ref_buf != NULL) {
+      const struct scale_factors *const scale =
+          get_ref_scale_factors_const(cm, second_ref);
+      av1_setup_pred_block(mbd, yv12_mb[second_ref], ref_buf, scale, scale,
+                           plane_count);
+    }
+  }
+
+  const MV_REFERENCE_FRAME comp_type = av1_ref_frame_type(rf);
+  ext->mode_context[comp_type] = 0;
+  ext->ref_mv_count[comp_type] = UINT8_MAX;
+  av1_find_mv_refs(cm, mbd, cur_mi, comp_type, ext->ref_mv_count,
+                   mbd->ref_mv_stack, mbd->weight, NULL, ext->global_mvs,
+                   ext->mode_context);
+  av1_copy_usable_ref_mv_stack_and_weight(mbd, ext, comp_type);
+  *ref_mv_idx = cur_mi->ref_mv_idx + 1;
+}
+
+// Configures xd->mi[0] for averaged compound prediction from ref_frame and
+// ref_frame2 and fills frame_mv[this_mode][] for both references. Zero MVs
+// are used for GLOBAL_GLOBALMV, entry 0 of the compound MV stack for
+// NEAREST_NEARESTMV, and entry ref_mv_idx for NEAR_NEARMV. Other modes leave
+// frame_mv untouched.
+static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+                              MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx,
+                              int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+                              PREDICTION_MODE this_mode) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  MB_MODE_INFO *const cur_mi = mbd->mi[0];
+
+  cur_mi->ref_frame[0] = ref_frame;
+  cur_mi->ref_frame[1] = ref_frame2;
+  cur_mi->compound_idx = 1;
+  cur_mi->comp_group_idx = 0;
+  cur_mi->interinter_comp.type = COMPOUND_AVERAGE;
+
+  const MV_REFERENCE_FRAME comp_type = av1_ref_frame_type(cur_mi->ref_frame);
+  int_mv *const mv_first = &frame_mv[this_mode][ref_frame];
+  int_mv *const mv_second = &frame_mv[this_mode][ref_frame2];
+
+  switch (this_mode) {
+    case GLOBAL_GLOBALMV:
+      mv_first->as_int = 0;
+      mv_second->as_int = 0;
+      break;
+    case NEAREST_NEARESTMV:
+      mv_first->as_int = mbd->ref_mv_stack[comp_type][0].this_mv.as_int;
+      mv_second->as_int = mbd->ref_mv_stack[comp_type][0].comp_mv.as_int;
+      break;
+    case NEAR_NEARMV:
+      mv_first->as_int =
+          mbd->ref_mv_stack[comp_type][ref_mv_idx].this_mv.as_int;
+      mv_second->as_int =
+          mbd->ref_mv_stack[comp_type][ref_mv_idx].comp_mv.as_int;
+      break;
+    default: break;
+  }
+}
+
+// Prune compound modes when the smallest variance recorded over all single
+// inter modes and reference frames falls below a block-size dependent
+// threshold. Block sizes without a threshold are never pruned.
+static bool skip_comp_based_on_var(
+    const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
+  unsigned int min_var = UINT_MAX;
+  for (int mode_pos = 0; mode_pos < RTC_INTER_MODES; ++mode_pos) {
+    for (int ref_pos = 0; ref_pos < REF_FRAMES; ++ref_pos) {
+      if (single_vars[mode_pos][ref_pos] < min_var) {
+        min_var = single_vars[mode_pos][ref_pos];
+      }
+    }
+  }
+
+  const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
+  const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
+
+  // Currently, the thresh for 128 and 16 are not well-tuned. We are using the
+  // results from 64 and 32 as an heuristic.
+  unsigned int limit;
+  switch (bsize) {
+    case BLOCK_128X128: limit = 4 * thresh_64; break;
+    case BLOCK_64X64: limit = thresh_64; break;
+    case BLOCK_32X32: limit = thresh_32; break;
+    case BLOCK_16X16: limit = thresh_32 / 4; break;
+    default: return false;
+  }
+  return min_var < limit;
+}
+
+// Fills single_inter_mode_costs[mode offset][ref] with cost_mv_ref() for the
+// modes NEARESTMV..NEWMV, but only for reference frames that appear in
+// reference_mode_set[0..num_inter_modes). Entries of other reference frames
+// are left untouched.
+static AOM_FORCE_INLINE void fill_single_inter_mode_costs(
+    int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes,
+    const REF_MODE *reference_mode_set, const ModeCosts *mode_costs,
+    const int16_t *mode_context) {
+  // Mark every reference frame that the mode set refers to.
+  bool ref_in_use[REF_FRAMES] = { false };
+  for (int i = 0; i < num_inter_modes; ++i) {
+    ref_in_use[reference_mode_set[i].ref_frame] = true;
+  }
+
+  for (int ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
+    if (ref_in_use[ref]) {
+      const MV_REFERENCE_FRAME rf[2] = { ref, NONE_FRAME };
+      const int16_t ctx = av1_mode_context_analyzer(mode_context, rf);
+      for (PREDICTION_MODE mode = NEARESTMV; mode <= NEWMV; mode++) {
+        single_inter_mode_costs[INTER_OFFSET(mode)][ref] =
+            cost_mv_ref(mode_costs, mode, ctx);
+      }
+    }
+  }
+}
+
+// Returns true when the signalling cost of this_mode (mode cost plus rate_mv,
+// plus a DRL cost for NEWMV and NEARMV) exceeds the mode cost of GLOBALMV for
+// the same reference frame.
+static AOM_INLINE bool is_globalmv_better(
+    PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv,
+    const ModeCosts *mode_costs,
+    const int (*single_inter_mode_costs)[REF_FRAMES],
+    const MB_MODE_INFO_EXT *mbmi_ext) {
+  int candidate_cost =
+      rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame];
+
+  switch (this_mode) {
+    case NEWMV:
+    case NEARMV: {
+      const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME };
+      candidate_cost +=
+          get_drl_cost(NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0,
+                       av1_ref_frame_type(rf));
+      break;
+    }
+    default: break;
+  }
+
+  return candidate_cost >
+         single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame];
+}
+
+// Loads the compound candidate comp_ref_mode_set[comp_index] into *this_mode,
+// *ref_frame and *ref_frame2, and prepares the MVs and mode info for it.
+// Returns 1 when the candidate should be evaluated and 0 when it is rejected.
+// The output pointers are written even when 0 is returned.
+static AOM_INLINE int setup_compound_params_from_comp_idx(
+    const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+    PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame,
+    MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+    const int *use_ref_frame_mask, int comp_index,
+    bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame,
+    BLOCK_SIZE bsize) {
+  const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame;
+
+  *this_mode = comp_ref_mode_set[comp_index].pred_mode;
+  *ref_frame = rf[0];
+  *ref_frame2 = rf[1];
+  assert(*ref_frame == LAST_FRAME);
+  assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV);
+
+  // Optionally restrict the compound search to the zero/zero candidate.
+  if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) return 0;
+
+  // On low-variance blocks above 16x16, a superblock color sensitivity of 1
+  // on either chroma plane rules out the matching second reference.
+  const int low_var_large_blk = x->source_variance < 50 && bsize > BLOCK_16X16;
+  const int gf_color_set =
+      x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+      x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1;
+  const int alt_color_set =
+      x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+      x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1;
+  const int skip_gf = low_var_large_blk && gf_color_set;
+  const int skip_alt = low_var_large_blk && alt_color_set;
+
+  // The second reference must be enabled by the speed feature and present in
+  // the frame's reference flags.
+  switch (*ref_frame2) {
+    case GOLDEN_FRAME:
+      if (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf ||
+          !(cpi->ref_frame_flags & AOM_GOLD_FLAG)) {
+        return 0;
+      }
+      break;
+    case LAST2_FRAME:
+      if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
+          !(cpi->ref_frame_flags & AOM_LAST2_FLAG)) {
+        return 0;
+      }
+      break;
+    case ALTREF_FRAME:
+      if (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt ||
+          !(cpi->ref_frame_flags & AOM_ALT_FLAG)) {
+        return 0;
+      }
+      break;
+    default: break;
+  }
+
+  int ref_mv_idx = 0;
+  if (*last_comp_ref_frame != rf[1]) {
+    // Only needs to be done once per reference pair.
+    setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf,
+                              &ref_mv_idx);
+    *last_comp_ref_frame = rf[1];
+  }
+  set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv,
+                    *this_mode);
+
+  // A non-global candidate whose two MVs are both zero is rejected.
+  if (*this_mode == GLOBAL_GLOBALMV) return 1;
+  const int both_mvs_zero = frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+                            frame_mv[*this_mode][*ref_frame2].as_int == 0;
+  return !both_mvs_zero;
+}
+
+// Returns true when the variance stored for `mode` on `ref_frame` is more
+// than 1.125 times the smallest variance stored for that reference frame over
+// all RTC inter modes. When a chroma distortion is stored for `mode` and it is
+// not itself the smallest one, it must exceed the same multiple of the
+// smallest chroma distortion as well.
+static AOM_INLINE bool previous_mode_performed_poorly(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+    const unsigned int (*vars)[REF_FRAMES],
+    const int64_t (*uv_dist)[REF_FRAMES]) {
+  const float mult = 1.125f;
+  const unsigned int mode_var = vars[INTER_OFFSET(mode)][ref_frame];
+  const int64_t mode_uv_dist = uv_dist[INTER_OFFSET(mode)][ref_frame];
+
+  // Find the smallest variance and chroma distortion for this reference.
+  unsigned int min_var = UINT_MAX;
+  int64_t min_uv_dist = INT64_MAX;
+  for (int m = 0; m < RTC_INTER_MODES; ++m) {
+    if (vars[m][ref_frame] < min_var) min_var = vars[m][ref_frame];
+    if (uv_dist[m][ref_frame] < min_uv_dist) {
+      min_uv_dist = uv_dist[m][ref_frame];
+    }
+  }
+  assert(min_var != UINT_MAX && "Invalid variance data.");
+
+  // The luma variance has to be clearly worse than the best one.
+  if (!(mult * min_var < mode_var)) return false;
+
+  // Chroma only takes part in the decision when it is available for this mode
+  // and differs from the best chroma distortion.
+  const bool has_chroma_info =
+      mode_uv_dist < INT64_MAX && min_uv_dist != mode_uv_dist;
+  return !has_chroma_info || mult * min_uv_dist < mode_uv_dist;
+}
+
+// Decides whether a compound mode can be pruned from the results stored for
+// its two constituent single-reference modes. When both single results are
+// usable, both must have performed poorly; when only one is usable, that one
+// alone decides. Returns false when neither is usable.
+static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
+    PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
+    MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
+    const uint8_t (*mode_checked)[REF_FRAMES],
+    const unsigned int (*vars)[REF_FRAMES],
+    const int64_t (*uv_dist)[REF_FRAMES]) {
+  const PREDICTION_MODE mode0 = compound_ref0_mode(compound_mode);
+  const PREDICTION_MODE mode1 = compound_ref1_mode(compound_mode);
+
+  // A single-reference result is usable only when that mode has been checked
+  // with the same motion vector the compound mode uses and a variance was
+  // stored for it.
+  const bool ref0_usable =
+      mode_checked[mode0][ref_frame] &&
+      frame_mv[mode0][ref_frame].as_int ==
+          frame_mv[compound_mode][ref_frame].as_int &&
+      vars[INTER_OFFSET(mode0)][ref_frame] < UINT_MAX;
+  const bool ref1_usable =
+      mode_checked[mode1][ref_frame2] &&
+      frame_mv[mode1][ref_frame2].as_int ==
+          frame_mv[compound_mode][ref_frame2].as_int &&
+      vars[INTER_OFFSET(mode1)][ref_frame2] < UINT_MAX;
+
+  // An unusable reference never counts as poor.
+  const bool ref0_poor =
+      ref0_usable &&
+      previous_mode_performed_poorly(mode0, ref_frame, vars, uv_dist);
+  const bool ref1_poor =
+      ref1_usable &&
+      previous_mode_performed_poorly(mode1, ref_frame2, vars, uv_dist);
+
+  if (ref0_usable && ref1_usable) return ref0_poor && ref1_poor;
+  return ref0_poor || ref1_poor;
+}
+
+/* Sets up the per-block search state used for inter mode evaluation in non-rd.
+ *
+ * Resets search_state->vars and uv_dist, the best pickmode record, the
+ * mode_checked flags and the RD stats (best_rdc, this_rdc and rd_cost),
+ * estimates the single reference frame costs, fills use_ref_frame_mask via
+ * get_ref_frame_use_mask(), and runs find_predictors() for LAST_FRAME (when
+ * AOM_LAST_FLAG is set) and for each later reference frame enabled in the
+ * mask. The block's mode info receives bsize and NONE_FRAME references.
+ *
+ * ctx and denoise_svc_pickmode are part of the signature only when
+ * CONFIG_AV1_TEMPORAL_DENOISING is enabled; they are used to reset the
+ * denoiser frame stats.
+ */
+static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+ int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
+#endif
+) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ int skip_pred_mv = 0;
+
+ // Initialize variance and distortion (chroma) for all modes and reference
+ // frames to their "not yet evaluated" sentinels (UINT_MAX / INT64_MAX).
+ for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+ for (int ref = 0; ref < REF_FRAMES; ref++) {
+ search_state->vars[idx][ref] = UINT_MAX;
+ search_state->uv_dist[idx][ref] = INT64_MAX;
+ }
+ }
+
+ // Initialize values of color sensitivity with sb level color sensitivity
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+
+ init_best_pickmode(&search_state->best_pickmode);
+
+ // Estimate cost for single reference frames
+ estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
+ search_state->ref_costs_single);
+
+ // Reset flag to indicate modes evaluated
+ av1_zero(search_state->mode_checked);
+
+ txfm_info->skip_txfm = 0;
+
+ // Initialize mode decisions
+ av1_invalid_rd_stats(&search_state->best_rdc);
+ av1_invalid_rd_stats(&search_state->this_rdc);
+ av1_invalid_rd_stats(rd_cost);
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
+ x->warp_sample_info[ref_idx].num = -1;
+ }
+
+ mi->bsize = bsize;
+ mi->ref_frame[0] = NONE_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+ // av1_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ av1_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+ // Populate predicted motion vectors for LAST_FRAME
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
+ find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ x->force_zeromv_skip_for_blk,
+ &search_state->use_scaled_ref_frame[LAST_FRAME]);
+ }
+ // Update the mask of reference frames to use
+ get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+ search_state->use_ref_frame_mask,
+ force_skip_low_temp_var);
+
+ skip_pred_mv = x->force_zeromv_skip_for_blk ||  // passed to find_predictors() for the non-LAST references below
+ (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+ // Populate predicted motion vectors for other single reference frames
+ // Start at LAST_FRAME + 1.
+ for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
+ ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+ if (search_state->use_ref_frame_mask[ref_frame_iter]) {
+ find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ skip_pred_mv,
+ &search_state->use_scaled_ref_frame[ref_frame_iter]);
+ }
+ }
+}
+
+// Sets up the mode/reference frame(s) for candidate idx and checks whether it
+// can be skipped from mode statistics and speed features. Returns true to skip.
+static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred,
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
+ int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm,
+ int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize,
+ bool comp_use_zero_zeromv_only, bool check_globalmv) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+ // Skip compound mode based on reference frame mask and type of the mode and
+ // for allowed compound modes, setup ref mv stack and reference frame.
+ // Indices at or beyond num_inter_modes address the compound mode table.
+ if (idx >= num_inter_modes) {
+ const int comp_index = idx - num_inter_modes;
+ if (!setup_compound_params_from_comp_idx(
+ cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
+ search_state->frame_mv, search_state->use_ref_frame_mask,
+ comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame,
+ bsize)) {
+ return true;
+ }
+ *is_single_pred = 0;  // only cleared here; the single-reference path leaves the caller's value untouched
+ } else {
+ *this_mode = ref_mode_set[idx].pred_mode;
+ *ref_frame = ref_mode_set[idx].ref_frame;
+ *ref_frame2 = NONE_FRAME;
+ }
+
+ if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+ // We want to make sure to test the superblock MV:
+ // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they
+ // have this sb MV. And don't skip NEWMV_LAST: this will be set to
+ // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't
+ // have it.
+ if (*this_mode == NEARESTMV &&
+ search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEARMV &&
+ search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEWMV) {
+ return false;
+ }
+ }
+
+ // Skip the single reference mode for which mode check flag is set.
+ if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
+ return true;
+ }
+
+ // Skip GLOBALMV mode if check_globalmv flag is not enabled.
+ if (!check_globalmv && *this_mode == GLOBALMV) {
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][*this_mode]++;
+#endif
+ mi->mode = *this_mode;  // the mode info is updated even if the mode is skipped further below
+ mi->ref_frame[0] = *ref_frame;
+ mi->ref_frame[1] = *ref_frame2;
+
+ // Skip the mode if use reference frame mask flag is not set.
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+ // Skip mode for some modes and reference frames when
+ // force_zeromv_skip_for_blk flag is true: only GLOBALMV and zero-mv
+ // NEARESTMV on LAST_FRAME are kept.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ // Skip compound mode based on variance of previously evaluated single
+ // reference modes.
+ if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row),
+ // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame] = svc_mv;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int !=
+ svc_mv.as_int) {
+ return true;
+ }
+ }
+
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // If source_sad is computed: skip non-zero motion
+ // check for stationary (super)blocks. Otherwise if superblock
+ // has motion skip the modes with zero motion on last reference
+ // for flat blocks, and color is not set.
+ // For the latter condition: the same condition should apply
+ // to newmv if (0, 0), so this latter condition is repeated
+ // below after search_new_mv.
+ if (rt_sf->source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))
+ return true;
+ }
+
+ // Skip mode based on block size, reference frame mode and other block
+ // properties.
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ return true;
+
+ // Skip mode based on low temporal variance and source sad.
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ force_skip_low_temp_var))
+ return true;
+
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
+ if (rt_sf->nonrd_prune_ref_frame_search > 0 &&
+ x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
+ if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
+ }
+ }
+
+ // Check for skipping NEARMV based on pred_mv_sad: skip when the NEARMV
+ // predictor sad is more than twice the pred_mv0_sad value.
+ if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
+ x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
+ return true;
+
+ // Skip single reference mode based on rd threshold.
+ if (*is_single_pred) {
+ if (skip_mode_by_threshold(
+ *this_mode, *ref_frame,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
+ x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
+ search_state->best_pickmode.best_mode_skip_txfm,
+ (rt_sf->nonrd_aggressive_skip ? 1 : 0)))
+ return true;
+ }
+ return false;
+}
+
+/* Evaluates one inter mode (single or compound) for non-rd and, if its rd cost
+ * beats search_state->best_rdc, records it as the new best mode.
+ *
+ * Reads the reference frame(s) of this_mode from mi->ref_frame[]. The motion
+ * vector comes from search_state->frame_mv, except that NEWMV either takes
+ * x->sb_me_mv (sb_me_block on LAST_FRAME) or runs search_new_mv() unless
+ * force_mv_inter_layer is set.
+ *
+ * Returns true on every pruning exit and also after a completed evaluation.
+ * Returns false only when the best mode terminated early (with idx > 0 or
+ * nonrd_aggressive_skip) and either x->sb_me_block is off or the superblock
+ * MV has been tested.
+ * NOTE(review): presumably the caller ends the mode loop on false - confirm.
+ */
+static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred,
+ PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr,
+ int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
+#endif
+ int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+ int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+ PREDICTION_MODE this_mode, InterpFilter filt_select,
+ int cb_pred_filter_search, int reuse_inter_pred,
+ int *sb_me_has_been_tested) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const int bw = block_size_wide[bsize];
+ const InterpFilter filter_ref = cm->features.interp_filter;
+ const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0];
+ MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1];
+ int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame];
+ unsigned int var = UINT_MAX;
+ int this_early_term = 0;
+ int rate_mv = 0;
+ int is_skippable;
+ int skip_this_mv = 0;
+ unsigned int var_threshold = UINT_MAX;  // UINT_MAX means no variance-based compound pruning
+ PREDICTION_MODE this_best_mode;
+ RD_STATS nonskip_rdc;
+ av1_invalid_rd_stats(&nonskip_rdc);
+
+ if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+ // Set the NEWMV_LAST to the sb MV.
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+ } else if (this_mode == NEWMV && !force_mv_inter_layer) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Find the best motion vector for single/compound mode.
+ const bool skip_newmv = search_new_mv(
+ cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, &rate_mv, &search_state->best_rdc);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ms_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ // Skip NEWMV mode,
+ // (i). For bsize smaller than 16X16
+ // (ii). Based on sad of the predicted mv w.r.t LAST_FRAME
+ // (iii). When motion vector is same as that of reference mv
+ if (skip_newmv) {
+ return true;
+ }
+ }
+
+ // Check the current motion vector is same as that of previously evaluated
+ // motion vectors.
+ for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
+ inter_mv_mode++) {
+ if (inter_mv_mode == this_mode) continue;
+ if (is_single_pred &&
+ search_state->mode_checked[inter_mv_mode][ref_frame] &&
+ this_mv->as_int ==
+ search_state->frame_mv[inter_mv_mode][ref_frame].as_int) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+ // Skip single mode if current motion vector is same that of previously
+ // evaluated motion vectors.
+ if (skip_this_mv && is_single_pred) return true;
+
+ // For screen: for spatially flat blocks with non-zero motion,
+ // skip newmv if the motion vector is (0, 0)-LAST, and color is not set.
+ if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
+ if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+ x->block_is_zero_sad == 0 &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0)
+ return true;
+ }
+
+ mi->mode = this_mode;
+ mi->mv[0].as_int = this_mv->as_int;
+ mi->mv[1].as_int = 0;
+ if (!is_single_pred)
+ mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int;
+
+ // Set buffers to store predicted samples for reuse
+ if (reuse_inter_pred) {
+ if (!*this_mode_pred) {
+ *this_mode_pred = &tmp_buffer[3];
+ } else {
+ *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = bw;
+ }
+ }
+
+ mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+ calc_num_proj_ref(cpi, x, mi);
+ }
+#endif
+ // set variance threshold for compound mode pruning: the smaller of the
+ // variances stored for the two constituent single-reference modes
+ if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred &&
+ use_model_yrd_large) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+ }
+
+ // decide interpolation filter, build prediction signal, get sse
+ const bool is_mv_subpel =
+ (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
+ const bool enable_filt_search_this_mode =
+ (filter_search_enabled_blk == 2)
+ ? true
+ : (filter_search_enabled_blk && !force_mv_inter_layer &&
+ is_single_pred &&
+ (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search));
+ if (is_mv_subpel && enable_filt_search_this_mode) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ search_filter_ref(
+ cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col,
+ tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term,
+ &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ifs_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+#if !CONFIG_REALTIME_ONLY
+ } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
+ this_mode == NEWMV) {
+ // Find the best motion mode when current mode is NEWMV
+ search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize,
+ &this_early_term, use_model_yrd_large, &rate_mv,
+ best_pickmode->best_sse);
+ if (this_mode == NEWMV) {  // always true here: already required by the enclosing else-if
+ this_mv[0] = mi->mv[0];
+ }
+#endif
+ } else {
+ mi->interp_filters =
+ (filter_ref == SWITCHABLE)
+ ? av1_broadcast_interp_filter(default_interp_filter)
+ : av1_broadcast_interp_filter(filter_ref);
+ if (force_mv_inter_layer)
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // If it is sub-pel motion and cb_pred_filter_search is enabled, select
+ // the pre-decided filter
+ if (is_mv_subpel && cb_pred_filter_search)
+ mi->interp_filters = av1_broadcast_interp_filter(filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ if (is_single_pred) {
+ SubpelParams subpel_params;
+ // Initialize inter mode level params for single reference mode.
+ init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr,
+ &subpel_params);
+ } else {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+
+ if (use_model_yrd_large) {
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &search_state->this_rdc, &this_early_term, 0,
+ best_pickmode->best_sse, &var, var_threshold);
+ } else {
+ model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0,
+ &this_early_term);
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.model_rd_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ // update variance for single mode
+ if (is_single_pred) {
+ search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var;
+ if (this_mv->as_int == 0) {
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+ }
+ // prune compound mode based on single mode var threshold
+ if (!is_single_pred && var > var_threshold) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0) {
+ *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >>  // zero-mv LAST sse scaled down by log2(width) + log2(height)
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ }
+
+ // Perform early termination based on sse.
+ if (rt_sf->sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search,
+ bsize, search_state->this_rdc.sse,
+ best_pickmode->best_sse, this_mode)) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int64_t sse_y = search_state->this_rdc.sse;  // luma sse captured before chroma sse is added below
+
+ if (this_early_term) {
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.dist = search_state->this_rdc.sse << 4;
+ } else {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Calculates RD Cost using Hadamard transform.
+ av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize,
+ mi->tx_size);
+ if (search_state->this_rdc.skip_txfm ||
+ RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist) >=
+ RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) {
+ if (!search_state->this_rdc.skip_txfm) {
+ // Need to store "real" rdc for possible future use if UV rdc
+ // disallows tx skip
+ nonskip_rdc = search_state->this_rdc;
+ nonskip_rdc.rate += no_skip_txfm_cost;
+ }
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.dist = search_state->this_rdc.sse;
+ } else {
+ search_state->this_rdc.rate += no_skip_txfm_cost;
+ }
+
+ // Populate predicted sample for chroma planes based on color sensitivity.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ // Compute sse for chroma planes.
+ const int64_t sse_uv = av1_model_rd_for_sb_uv(
+ cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ search_state->this_rdc.sse += sse_uv;
+ // Restore Y rdc if UV rdc disallows txfm skip
+ if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+ nonskip_rdc.rate != INT_MAX)
+ search_state->this_rdc = nonskip_rdc;
+ if (is_single_pred) {
+ search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist;
+ }
+ search_state->this_rdc.rate += rdc_uv.rate;
+ search_state->this_rdc.dist += rdc_uv.dist;
+ search_state->this_rdc.skip_txfm =
+ search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm;
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.txfm_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ this_best_mode = this_mode;
+ // TODO(kyslov) account for UV prediction cost
+ search_state->this_rdc.rate += rate_mv;
+ if (!is_single_pred) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ } else {
+ // If the current mode has zeromv but is not GLOBALMV, compare the rate
+ // cost. If GLOBALMV is cheaper, use GLOBALMV instead.
+ if (this_mode != GLOBALMV &&
+ this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) {
+ if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs,
+ search_state->single_inter_mode_costs, mbmi_ext)) {
+ this_best_mode = GLOBALMV;
+ }
+ }
+
+ search_state->this_rdc.rate +=
+ search_state
+ ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
+ }
+
+ if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) {  // repeats the GLOBALMV variance update made earlier, now guarded by var < UINT_MAX
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+
+ search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame];
+
+ search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) {
+ newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.row,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.col,
+ cpi->speed, x->source_variance, x->content_state_sb);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+ // Keep track of zero_last cost.
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0)
+ *zero_last_cost_orig = search_state->this_rdc.rdcost;
+ }
+#else
+ (void)(sse_y);
+#endif
+
+ search_state->mode_checked[this_mode][ref_frame] = 1;  // mark both the tested mode and a possible GLOBALMV substitute as evaluated
+ search_state->mode_checked[this_best_mode][ref_frame] = 1;
+
+ if (*check_globalmv) {
+ int32_t abs_mv =
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) +
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col);
+ // Early exit check: if the magnitude of this_best_mode's mv is small
+ // enough, we skip GLOBALMV check in the next loop iteration.
+ if (abs_mv < 2) {
+ *check_globalmv = false;
+ }
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ if (x->sb_me_block && ref_frame == LAST_FRAME &&
+ search_state->frame_mv[this_best_mode][ref_frame].as_int ==
+ x->sb_me_mv.as_int)
+ *sb_me_has_been_tested = 1;
+
+ // Copy best mode params to search state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ search_state->best_rdc = search_state->this_rdc;
+ *best_early_term = this_early_term;
+ update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx,
+ this_best_mode, sse_y);
+
+ // This is needed for the compound modes.
+ search_state->frame_mv_best[this_best_mode][ref_frame].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame].as_int;
+ if (ref_frame2 > NONE_FRAME) {
+ search_state->frame_mv_best[this_best_mode][ref_frame2].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame2].as_int;
+ }
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pickmode->best_pred);  // release the previous best prediction before keeping this one
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ }
+
+ if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) {
+ txfm_info->skip_txfm = 1;
+ if (!x->sb_me_block || *sb_me_has_been_tested) return false;  // the only false return of this function
+ }
+ return true;
+}
+
+// Function to perform screen content mode evaluation for non-rd
+static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx,
+ PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette,
+ int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col,
+ int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit
+ // when issue 3359 is fixed.
+ if (cm->seq_params->bit_depth == 8 &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
+ is_inter_mode(best_pickmode->best_mode) &&
+ best_pickmode->best_pred != NULL &&
+ (!rt_sf->prune_idtx_nonrd ||
+ (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
+ best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) {
+ RD_STATS idtx_rdc;
+ av1_init_rd_stats(&idtx_rdc);
+ int is_skippable;
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
+ const PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc,
+ &is_skippable, bsize, mi->tx_size);
+ int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ int allow_idtx = 1;
+ // Incorporate color into rd cost.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U,
+ AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ idtx_rdc.rate += rdc_uv.rate;
+ idtx_rdc.dist += rdc_uv.dist;
+ idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm;
+ if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 &&
+ x->content_state_sb.source_sad_nonrd > kMedSad)
+ allow_idtx = 0;
+ }
+ int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->tx_type = IDTX;
+ search_state->best_rdc.rdcost = idx_rdcost;
+ best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm;
+ if (!idtx_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ xd->tx_type_map[0] = best_pickmode->tx_type;
+ memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ }
+ pd->dst = *orig_dst;
+ }
+
+ if (!try_palette) return;
+ const unsigned int intra_ref_frame_cost =
+ search_state->ref_costs_single[INTRA_FRAME];
+
+ if (!is_mode_intra(best_pickmode->best_mode)) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst->buf) {
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride, bw, bh);
+ best_pickmode->best_pred = this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+ }
+ // Search palette mode for Luma plane in inter frame.
+ av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &search_state->this_rdc,
+ search_state->best_rdc.rdcost);
+ // Update best mode data in search_state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->pmi = mi->palette_mode_info;
+ best_pickmode->best_mode = DC_PRED;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ search_state->best_rdc.rate = search_state->this_rdc.rate;
+ search_state->best_rdc.dist = search_state->this_rdc.dist;
+ search_state->best_rdc.rdcost = search_state->this_rdc.rdcost;
+ best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+ // Keep the skip_txfm off if the color_sensitivity is set.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])
+ search_state->this_rdc.skip_txfm = 0;
+ if (!search_state->this_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ if (xd->tx_type_map[0] != DCT_DCT)
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+}
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ * *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame, ref_frame2;
+ const unsigned char segment_id = mi->segment_id;
+ int best_early_term = 0;
+ int force_skip_low_temp_var = 0;
+ unsigned int sse_zeromv_norm = UINT_MAX;
+ const int num_inter_modes = NUM_INTER_MODES;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ bool check_globalmv = rt_sf->check_globalmv_on_single_ref;
+ PRED_BUFFER tmp_buffer[4];
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ PRED_BUFFER *this_mode_pred = NULL;
+ // Prediction reuse is only enabled for 8-bit content.
+ const int reuse_inter_pred =
+ rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8;
+ InterModeSearchStateNonrd search_state;
+ av1_zero(search_state.use_ref_frame_mask);
+ av1_zero(search_state.use_scaled_ref_frame);
+ BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode;
+ (void)tile_data;
+
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ const int pixels_in_block = bh * bw;
+ // Snapshot of the luma destination buffer; restored before returning.
+ struct buf_2d orig_dst = pd->dst;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+#if COLLECT_NONRD_PICK_MODE_STAT
+ // Mode statistics can be collected only when num_workers is 1
+ assert(cpi->mt_info.num_workers <= 1);
+ aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer);
+#endif
+ int64_t thresh_sad_pred = INT64_MAX;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int_mv svc_mv = { .as_int = 0 };
+ int force_mv_inter_layer = 0;
+ bool comp_use_zero_zeromv_only = 0;
+ int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ const int denoise_recheck_zeromv = 1;
+ AV1_PICKMODE_CTX_DEN ctx_den;
+ int64_t zero_last_cost_orig = INT64_MAX;
+ int denoise_svc_pickmode = 1;
+ const int resize_pending = is_frame_resize_pending(cpi);
+#endif
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // Identity scale factors (frame size to the same frame size); used below
+ // for references flagged in use_scaled_ref_frame[].
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
+ // Split pred_buf into three scratch prediction buffers with stride bw; the
+ // fourth entry aliases the current destination buffer.
+ if (reuse_inter_pred) {
+ for (int buf_idx = 0; buf_idx < 3; buf_idx++) {
+ tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx];
+ tmp_buffer[buf_idx].stride = bw;
+ tmp_buffer[buf_idx].in_use = 0;
+ }
+ tmp_buffer[3].data = pd->dst.buf;
+ tmp_buffer[3].stride = pd->dst.stride;
+ tmp_buffer[3].in_use = 0;
+ }
+
+ const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
+
+ // If the lower spatial layer uses an averaging filter for downsampling
+ // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
+ // to source, so use subpel motion vector to compensate. The nonzero motion
+ // is half pixel shifted to left and top, so (-4, -4). This has more effect
+ // on higher resolutions, so condition it on that for now.
+ // Exclude quality layers, which have the same resolution and hence no shift.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ !svc->has_lower_quality_layer &&
+ svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+ cm->width * cm->height > 640 * 480) {
+ svc_mv.as_mv.row = -4;
+ svc_mv.as_mv.col = -4;
+ }
+
+ // Setup parameters used for inter mode evaluation.
+ set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+ &force_skip_low_temp_var, mi_row, mi_col,
+ gf_temporal_ref, segment_id, bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ ctx, denoise_svc_pickmode
+#endif
+ );
+
+ if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+ // Only search compound if bsize > BLOCK_16X16.
+ if (bsize > BLOCK_16X16) {
+ comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks;
+ } else {
+ tot_num_comp_modes = 0;
+ }
+ } else {
+ tot_num_comp_modes = 0;
+ }
+
+ // Pruning threshold: twice the LAST_FRAME predicted-MV SAD (when valid).
+ if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
+ thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+ // Increase threshold for less aggressive pruning.
+ if (rt_sf->nonrd_prune_ref_frame_search == 1)
+ thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+ }
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+ // decide block-level interp filter search flags:
+ // filter_search_enabled_blk:
+ // 0: disabled
+ // 1: filter search depends on mode properties
+ // 2: filter search forced since prediction is unreliable
+ // cb_pred_filter_search 0: disabled cb prediction
+ InterpFilter filt_select = EIGHTTAP_REGULAR;
+ const int cb_pred_filter_search =
+ x->content_state_sb.source_sad_nonrd > kVeryLowSad
+ ? cpi->sf.interp_sf.cb_pred_filter_search
+ : 0;
+ const int filter_search_enabled_blk =
+ is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
+ cb_pred_filter_search, &filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_blocks[bsize]++;
+#endif
+ init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+ // Use the largest transform size allowed for this block size and tx mode,
+ // capped at TX_16X16.
+ mi->tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+
+ fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
+ num_inter_modes, ref_mode_set, mode_costs,
+ mbmi_ext->mode_context);
+
+ MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
+
+ // Initialize inter prediction params at block level for single reference
+ // mode.
+ InterPredParams inter_pred_params_sr;
+ init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
+ mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
+ /*is_intrabc=*/0);
+ inter_pred_params_sr.conv_params =
+ get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+
+ x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ !x->force_zeromv_skip_for_blk &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+ search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+ search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+ set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+ }
+
+ int sb_me_has_been_tested = 0;
+ x->sb_me_block = x->sb_me_partition;
+ // Only use this feature (force testing of superblock motion) if coding
+ // block size is large.
+ if (x->sb_me_block) {
+ if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+ x->sb_me_block = 0;
+ else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+ x->sb_me_block = 0;
+ }
+
+ x->min_dist_inter_uv = INT64_MAX;
+ // Mode loop: indices below num_inter_modes are single-reference modes; the
+ // remaining tot_num_comp_modes indices are compound modes.
+ for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
+ // If we are at the first compound mode, and the single modes already
+ // perform well, then end the search.
+ if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes &&
+ skip_comp_based_on_var(search_state.vars, bsize)) {
+ break;
+ }
+
+ int is_single_pred = 1;
+ PREDICTION_MODE this_mode;
+
+ if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+ // Set color sensitivity on first tested mode only.
+ // Use y-sad already computed in find_predictors: take the sad with motion
+ // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+ // is for zeromv.
+ // For screen: first check if golden reference is being used, if so,
+ // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1.
+ // The check in set_color_sensitivity() will then follow and check for
+ // setting the flag if the level is still 2 or 0.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+ }
+ if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+ x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
+ }
+
+ // Check the inter mode can be skipped based on mode statistics and speed
+ // features settings.
+ if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
+ &force_mv_inter_layer, &is_single_pred,
+ &this_mode, &last_comp_ref_frame, &ref_frame,
+ &ref_frame2, idx, svc_mv, force_skip_low_temp_var,
+ sse_zeromv_norm, num_inter_modes, segment_id,
+ bsize, comp_use_zero_zeromv_only, check_globalmv))
+ continue;
+
+ // Select prediction reference frames.
+ for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+ xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane];
+ if (!is_single_pred)
+ xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane];
+ }
+
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
+
+ // Check if the scaled reference frame should be used. This is set in the
+ // find_predictors() for each usable reference. If so, set the
+ // block_ref_scale_factors[] to no reference scaling.
+ if (search_state.use_scaled_ref_frame[ref_frame]) {
+ xd->block_ref_scale_factors[0] = &sf_no_scale;
+ }
+ if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) {
+ xd->block_ref_scale_factors[1] = &sf_no_scale;
+ }
+
+ // Perform inter mode evaluation for non-rd
+ if (!handle_inter_mode_nonrd(
+ cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer,
+ inter_pred_params_sr, &best_early_term, &sse_zeromv_norm,
+ &check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ &zero_last_cost_orig, denoise_svc_pickmode,
+#endif
+ idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref,
+ use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode,
+ filt_select, cb_pred_filter_search, reuse_inter_pred,
+ &sb_me_has_been_tested)) {
+ break;
+ }
+ }
+
+ // Restore mode data of best inter mode
+ mi->mode = best_pickmode->best_mode;
+ mi->motion_mode = best_pickmode->best_motion_mode;
+ mi->wm_params = best_pickmode->wm_params;
+ mi->num_proj_ref = best_pickmode->num_proj_ref;
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ mi->tx_size = best_pickmode->best_tx_size;
+ memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->mv[0].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ mi->mv[1].as_int = 0;
+ if (best_pickmode->best_second_ref_frame > INTRA_FRAME) {
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ mi->mv[1].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_second_ref_frame]
+ .as_int;
+ }
+ // Perform intra prediction search, if the best SAD is above a certain
+ // threshold.
+ mi->angle_delta[PLANE_TYPE_Y] = 0;
+ mi->angle_delta[PLANE_TYPE_UV] = 0;
+ mi->filter_intra_mode_info.use_filter_intra = 0;
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++;
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++;
+#endif
+
+ // For screen content on small blocks, force a palette test when the best
+ // inter mode leaves a large per-pixel SSE and the source variance is high.
+ int force_palette_test = 0;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ bsize <= BLOCK_16X16) {
+ unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000;
+ unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 200;
+ unsigned int best_sse_inter_motion =
+ (unsigned int)(search_state.best_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ if (best_sse_inter_motion > thresh_sse &&
+ x->source_variance > thresh_source_var)
+ force_palette_test = 1;
+ }
+
+ // Evaluate Intra modes in inter frame
+ if (!x->force_zeromv_skip_for_blk)
+ av1_estimate_intra_mode(cpi, x, bsize, best_early_term,
+ search_state.ref_costs_single[INTRA_FRAME],
+ reuse_inter_pred, &orig_dst, tmp_buffer,
+ &this_mode_pred, &search_state.best_rdc,
+ best_pickmode, ctx);
+
+ int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ !cpi->rc.high_source_sad;
+
+ int try_palette =
+ !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mi->bsize);
+ try_palette =
+ try_palette &&
+ (is_mode_intra(best_pickmode->best_mode) || force_palette_test) &&
+ x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
+ (cpi->rc.high_source_sad || x->source_variance > 300);
+
+ if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0;
+
+ // Perform screen content mode evaluation for non-rd
+ handle_screen_content_mode_nonrd(
+ cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst,
+ skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ pd->dst = orig_dst;
+ // Best mode is finalized. Restore the mode data to mbmi
+ if (try_palette) mi->palette_mode_info = best_pickmode->pmi;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ // For lossless: always force the skip flags off.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ txfm_info->skip_txfm = 0;
+ memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
+ } else {
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ }
+ if (has_second_ref(mi)) {
+ mi->comp_group_idx = 0;
+ mi->compound_idx = 1;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ }
+
+ if (!is_inter_block(mi)) {
+ mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+ } else {
+ // If inter mode is selected and ref_frame was one that uses the
+ // scaled reference frame, then we can't use reuse_inter_pred.
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+ (has_second_ref(mi) &&
+ search_state
+ .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+ x->reuse_inter_pred = 0;
+ }
+
+ // Restore the predicted samples of best mode to final buffer
+ if (reuse_inter_pred && best_pickmode->best_pred != NULL) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+ aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ }
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
+ denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+ cpi->denoiser.reset == 0) {
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ av1_pickmode_ctx_den_update(
+ &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
+ search_state.frame_mv, reuse_inter_pred, best_pickmode);
+ av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+ gf_temporal_ref);
+ if (denoise_recheck_zeromv)
+ recheck_zeromv_after_denoising(
+ cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
+ &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col);
+ best_pickmode->best_ref_frame = ctx_den.best_ref_frame;
+ }
+#endif
+
+ // Update the factors used for RD thresholding for all modes.
+ if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
+ THR_MODES best_mode_idx =
+ mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)];
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+ for (int mode_index = 0; mode_index < intra_modes; mode_index++) {
+ update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+ intra_mode_list[mode_index]);
+ }
+ } else {
+ PREDICTION_MODE this_mode;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame,
+ best_mode_idx, this_mode);
+ }
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer);
+ x->ms_stat_nonrd.total_block_times[bsize] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer);
+ print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols, mi_row, mi_col);
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ // Hand the RD stats of the selected mode back to the caller.
+ *rd_cost = search_state.best_rdc;
+
+ // Reset the xd->block_ref_scale_factors[i], as they may have
+ // been set to pointer &sf_no_scale, which becomes invalid after
+ // this function.
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+}
diff --git a/third_party/aom/av1/encoder/optical_flow.c b/third_party/aom/av1/encoder/optical_flow.c
new file mode 100644
index 0000000000..dc168e7aee
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.c
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Populates *opfl_params with the compile-time defaults: no LK parameter
+// block attached, OPFL_WARPING_STEPS warping steps and OPFL_PYRAMID_LEVELS
+// pyramid levels.
+// NOTE(review): opfl_params->flags (read by is_sparse()) is not written here;
+// presumably the caller sets it separately -- confirm.
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params) {
+  OPFL_PARAMS *const params = opfl_params;
+  params->lk_params = NULL;
+  params->warping_steps = OPFL_WARPING_STEPS;
+  params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+}
+
+// Initializes *lk_params with the default window size (OPFL_WINDOW_SIZE).
+void av1_init_lk_params(LK_PARAMS *lk_params) {
+ lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+// Returns 1 if the frame buffer has YV12_FLAG_HIGHBITDEPTH set in its flags,
+// and 0 otherwise.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  if (frame->flags & YV12_FLAG_HIGHBITDEPTH) return 1;
+  return 0;
+}
+
+// Returns 1 if OPFL_FLAG_SPARSE is set in the optical flow parameters' flags
+// (i.e. the sparse method is selected), and 0 otherwise.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+  if (opfl_params->flags & OPFL_FLAG_SPARSE) return 1;
+  return 0;
+}
+
+// Forward declaration; the definition appears further down in this file.
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv);
+
+// Bilinear interpolation on the unit square.
+// b00, b01, b10 and b11 are the values at the top-left, top-right,
+// bottom-left and bottom-right corners. Only the fractional parts of x and y
+// are used as the interpolation position. The result is rounded to the
+// nearest integer.
+static int pixel_interp(const double x, const double y, const double b00,
+                        const double b01, const double b10, const double b11) {
+  // Fractional offsets inside the unit cell.
+  const double frac_x = x - (int)x;
+  const double frac_y = y - (int)y;
+  // Corner weights; they sum to 1.
+  const double w00 = (1 - frac_x) * (1 - frac_y);
+  const double w01 = frac_x * (1 - frac_y);
+  const double w10 = (1 - frac_x) * frac_y;
+  const double w11 = frac_x * frac_y;
+  // When x and y are integers, w00 == 1 and the result is b00.
+  return (int)round(w00 * b00 + w01 * b01 + w10 * b10 + w11 * b11);
+}
+
+// Applies a 3x3 Scharr filter to frame->y_buffer centred on
+// (x_coord, y_coord) and stores the normalized response in *derivative.
+// direction == 0 selects the x-derivative kernel; any other value selects the
+// y-derivative kernel. No bounds checking is done here: the full 3x3
+// neighbourhood around the centre is read.
+static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+                             const int y_coord, const int direction,
+                             double *derivative) {
+  // Scharr kernels in row-major order (top row first).
+  const double kernel_x[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+  const double kernel_y[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+  const double *const kernel = (direction == 0) ? kernel_x : kernel_y;
+
+  double acc = 0;
+  for (int tap = 0; tap < 9; tap++) {
+    // Map the flat tap index to offsets in [-1, 1] around the centre pixel.
+    const int dy = tap / 3 - 1;
+    const int dx = tap % 3 - 1;
+    acc += kernel[tap] *
+           frame->y_buffer[(y_coord + dy) * frame->y_stride + (x_coord + dx)];
+  }
+  // normalization scaling factor for scharr
+  *derivative = acc / 32.0;
+}
+
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in original image may be at subpixel locations.
+//
+// fullpel_deriv is a row-major w x h grid of full-pel derivatives. The value
+// at (x_coord, y_coord) is obtained with pixel_interp() from the four
+// surrounding grid entries. On the last column or row, where a right/bottom
+// neighbour is missing, the full-pel value is returned instead.
+static void gradient_interp(double *fullpel_deriv, const double x_coord,
+                            const double y_coord, const int w, const int h,
+                            double *derivative) {
+  const int col = (int)x_coord;
+  const int row = (int)y_coord;
+  const int at = row * w + col;
+
+  // No right or bottom neighbour available: fall back to the full-pel sample.
+  if (col >= w - 1 || row >= h - 1) {
+    *derivative = fullpel_deriv[at];
+    return;
+  }
+
+  *derivative =
+      pixel_interp(x_coord, y_coord, fullpel_deriv[at], fullpel_deriv[at + 1],
+                   fullpel_deriv[at + w], fullpel_deriv[at + w + 1]);
+}
+
+// Computes the temporal derivative at (x_coord, y_coord): the difference
+// between the motion-compensated sample predicted from frame2 (displaced by
+// *mv) and the sample predicted from frame at the same sub-pel position
+// without displacement. The result is stored in *derivative.
+//
+// Both predictions are 2x2 blocks built with the MULTITAP_SHARP filter; only
+// the top-left sample of each is used.
+static void temporal_gradient(const YV12_BUFFER_CONFIG *frame,
+                              const YV12_BUFFER_CONFIG *frame2,
+                              const double x_coord, const double y_coord,
+                              const int bit_depth, double *derivative,
+                              LOCALMV *mv) {
+  const int w = 2;
+  const int h = 2;
+  uint8_t pred1[4];
+  uint8_t pred2[4];
+
+  // Split the position into full-pel and fractional parts.
+  const int y = (int)y_coord;
+  const int x = (int)x_coord;
+  const double ydec = y_coord - y;
+  const double xdec = x_coord - x;
+  const int is_intrabc = 0;  // Is intra-copied?
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame2);
+  const int subsampling_x = 0, subsampling_y = 0;  // for y-buffer
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+  const int plane = 0;  // y-plane
+  const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width,
+                                   frame2->y_crop_height, frame2->y_stride };
+  // Identity scaling: source and destination dimensions are the same.
+  struct scale_factors scale;
+  av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width,
+                                    frame->y_crop_height, frame->y_crop_width,
+                                    frame->y_crop_height);
+  InterPredParams inter_pred_params;
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf2, interp_filters);
+  inter_pred_params.interp_filter_params[0] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+  inter_pred_params.interp_filter_params[1] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  // The row component is the vertical one and col the horizontal one (the
+  // block position above is passed as (y, x) for the same reason). So the
+  // fractional y offset goes with row and the fractional x offset with col.
+  // The code used to pair them the other way round, which sampled the wrong
+  // sub-pel position whenever xdec != ydec.
+  MV newmv = { .row = (int16_t)round((mv->row + ydec) * 8),
+               .col = (int16_t)round((mv->col + xdec) * 8) };
+  av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params);
+  const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width,
+                                   frame->y_crop_height, frame->y_stride };
+  av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                        subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+                        &scale, &ref_buf1, interp_filters);
+  inter_pred_params.interp_filter_params[0] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+  inter_pred_params.interp_filter_params[1] =
+      &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+  inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+  // No displacement, only the sub-pel part of the sampling position.
+  MV zeroMV = { .row = (int16_t)round(ydec * 8),
+                .col = (int16_t)round(xdec * 8) };
+  av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params);
+
+  *derivative = pred2[0] - pred1[0];
+}
+
+// Numerical differentiate over window_size x window_size surrounding (x,y)
+// location. Alters ix, iy, it to contain numerical partial derivatives
+//
+// Only entries whose sample position lies inside the clamped window are
+// written; all other entries of ix, iy and it are left untouched. If the
+// temporary full-pel gradient buffers cannot be allocated, the function
+// returns without writing anything.
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+                                  const YV12_BUFFER_CONFIG *ref_frame,
+                                  const double x_coord, const double y_coord,
+                                  const int window_size, const int bit_depth,
+                                  double *ix, double *iy, double *it,
+                                  LOCALMV *mv) {
+  const int frame_width = frame->y_crop_width;
+  const int frame_height = frame->y_crop_height;
+  const double half_window = window_size / 2.0;
+
+  // Window bounds (possibly sub-pel), clamped so the gradient operators have
+  // a pixel before and after (start at 1).
+  const double left = x_coord - half_window;
+  const double top = y_coord - half_window;
+  const double x_start = AOMMAX(1, left);
+  const double y_start = AOMMAX(1, top);
+  const double x_end = AOMMIN(x_coord + half_window, frame_width - 2);
+  const double y_end = AOMMIN(y_coord + half_window, frame_height - 2);
+
+  // Full-pel grid covering the window, on which spatial gradients are
+  // evaluated before being interpolated to the sample positions.
+  const int xs = (int)AOMMAX(1, x_start - 1);
+  const int ys = (int)AOMMAX(1, y_start - 1);
+  const int xe = (int)AOMMIN(x_end + 2, frame_width - 2);
+  const int ye = (int)AOMMIN(y_end + 2, frame_height - 2);
+  const int grid_w = xe - xs;
+  const int grid_h = ye - ys;
+
+  // with normalization, gradients may be double values
+  double *fullpel_dx = aom_malloc(grid_h * grid_w * sizeof(*fullpel_dx));
+  double *fullpel_dy = aom_malloc(grid_h * grid_w * sizeof(*fullpel_dy));
+  if (!fullpel_dx || !fullpel_dy) {
+    aom_free(fullpel_dx);
+    aom_free(fullpel_dy);
+    return;
+  }
+
+  // TODO(any): This could be more efficient in the case that x_coord
+  // and y_coord are integers.. but it may look more messy.
+
+  // calculate spatial gradients at full pixel locations
+  for (int row = ys; row < ye; row++) {
+    for (int col = xs; col < xe; col++) {
+      const int cell = (row - ys) * grid_w + (col - xs);
+      spatial_gradient(frame, col, row, 0, &fullpel_dx[cell]);
+      spatial_gradient(frame, col, row, 1, &fullpel_dy[cell]);
+    }
+  }
+
+  // compute numerical differentiation for every pixel in window
+  // (this potentially includes subpixels)
+  for (double py = y_start; py < y_end; py++) {
+    for (double px = x_start; px < x_end; px++) {
+      // Position of this sample inside the window_size x window_size outputs.
+      const int out = (int)(py - top) * window_size + (int)(px - left);
+      temporal_gradient(frame, ref_frame, px, py, bit_depth, &it[out], mv);
+      gradient_interp(fullpel_dx, px - xs, py - ys, grid_w, grid_h, &ix[out]);
+      gradient_interp(fullpel_dy, px - xs, py - ys, grid_w, grid_h, &iy[out]);
+    }
+  }
+  // TODO(any): to avoid setting deriv arrays to zero for every iteration,
+  // could instead pass these two values back through function call
+  // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left);
+  // int width = window_size - ((int)(x_start - left) + (int)(left + window_size
+  // - x_end));
+
+  aom_free(fullpel_dx);
+  aom_free(fullpel_dy);
+}
+
+// To compute eigenvalues of 2x2 matrix: Solve for lambda where
+// Determinant(matrix - lambda*identity) == 0
+//
+// matrix holds the four entries in row-major order. On return eig[0] and
+// eig[1] hold the two roots ordered by increasing magnitude.
+//
+// A negative discriminant is treated as zero. For the symmetric matrices
+// passed by this file it can only go negative through floating-point rounding
+// when the two eigenvalues (nearly) coincide, and sqrt() of it would turn
+// both results into NaN. A NaN discriminant is still propagated.
+static void eigenvalues_2x2(const double *matrix, double *eig) {
+  const double a = 1;
+  const double b = -1 * matrix[0] - matrix[3];
+  const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3];
+  // quadratic formula
+  const double discriminant = b * b - 4 * a * c;
+  // "discriminant < 0" is false for NaN, so NaN still reaches sqrt().
+  const double root = (discriminant < 0) ? 0.0 : sqrt(discriminant);
+  eig[0] = (-b - root) / (2.0 * a);
+  eig[1] = (-b + root) / (2.0 * a);
+  // double check that eigenvalues are ordered by magnitude
+  if (fabs(eig[0]) > fabs(eig[1])) {
+    double tmp = eig[0];
+    eig[0] = eig[1];
+    eig[1] = tmp;
+  }
+}
+
+// Shi-Tomasi corner detection criteria
+// Returns the magnitude of the smaller-magnitude eigenvalue of the 2x2 matrix
+// built from the spatial gradients of the n x n window at (x, y).
+// i_x, i_y and i_t are caller-provided buffers of n * n doubles that are
+// handed to gradients_over_window() to be filled.
+static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter,
+                           const YV12_BUFFER_CONFIG *ref_frame, const int x,
+                           const int y, double *i_x, double *i_y, double *i_t,
+                           const int n, const int bit_depth) {
+  const int window_len = n * n;
+  LOCALMV zero_mv = { .row = 0, .col = 0 };
+  // TODO(any): technically, ref_frame and i_t are not used by corner score
+  // so these could be replaced by dummy variables,
+  // or change this to spatial gradient function over window only
+  gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+                        i_y, i_t, &zero_mv);
+  // Presumably each call is a (1 x n*n) by (n*n x 1) product, i.e. a dot
+  // product written to a single output element - confirm against
+  // multiply_mat().
+  double sum_xx[1] = { 0 }, sum_xy[1] = { 0 }, sum_yy[1] = { 0 };
+  multiply_mat(i_x, i_x, sum_xx, 1, window_len, 1);
+  multiply_mat(i_x, i_y, sum_xy, 1, window_len, 1);
+  multiply_mat(i_y, i_y, sum_yy, 1, window_len, 1);
+  // Symmetric 2x2 matrix, row-major.
+  double structure[4] = { sum_xx[0], sum_xy[0], sum_xy[0], sum_yy[0] };
+  double eig[2];
+  eigenvalues_2x2(structure, eig);
+  return fabs(eig[0]);
+}
+
+// Finds corners in frame_to_filter
+// Each accepted corner is stored in ref_corners as an (x, y) pair; the return
+// value is the number of pairs written.
+// For less strict requirements (i.e. more corners), decrease threshold
+static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+                          const YV12_BUFFER_CONFIG *ref_frame,
+                          const int maxcorners, int *ref_corners,
+                          const int bit_depth) {
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  // TODO(any): currently if maxcorners is decreased, then it only means
+  // corners will be omitted from bottom-right of image. if maxcorners
+  // is actually used, then this algorithm would need to re-iterate
+  // and choose threshold based on that
+  assert(maxcorners == frame_height * frame_width);
+  const int n = 3;
+  const int fromedge = n;
+  const int x_limit = frame_width - fromedge;
+  const int y_limit = frame_height - fromedge;
+  const double threshold = 0.1;
+  double i_x[9] = { 0 };
+  double i_y[9] = { 0 };
+  double i_t[9] = { 0 };
+  double score;
+  int countcorners = 0;
+
+  // Seed the maximum with the score of the first candidate position.
+  double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+                                  fromedge, i_x, i_y, i_t, n, bit_depth);
+  // rough estimate of max corner score in image: every column, but only
+  // rows spaced frame_height / 5 apart
+  const int coarse_step = frame_height / 5;
+  for (int x = fromedge; x < x_limit; ++x) {
+    for (int y = fromedge; y < y_limit; y += coarse_step) {
+      for (int k = 0; k < n * n; ++k) i_x[k] = i_y[k] = i_t[k] = 0;
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      max_score = (score > max_score) ? score : max_score;
+    }
+  }
+  // score all the points and choose corners over threshold
+  for (int x = fromedge; x < x_limit; ++x) {
+    for (int y = fromedge; y < y_limit && countcorners < maxcorners; ++y) {
+      for (int k = 0; k < n * n; ++k) i_x[k] = i_y[k] = i_t[k] = 0;
+      score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+                           bit_depth);
+      if (score > threshold * max_score) {
+        int *slot = &ref_corners[countcorners * 2];
+        slot[0] = x;
+        slot[1] = y;
+        countcorners++;
+      }
+    }
+  }
+  return countcorners;
+}
+
+// weights is an nxn matrix. weights is filled with a gaussian function,
+// with independent variable: distance from the center point.
+// The center is taken at index (n / 2, n / 2) using integer division.
+// When normalize == 1 every one of the n * n weights is divided by their sum
+// so that the window sums to one; any other value leaves the raw weights.
+static void gaussian(const double sigma, const int n, const int normalize,
+                     double *weights) {
+  double total_weight = 0;
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+      double weight = exp(-0.5 * pow(distance / sigma, 2));
+      weights[j * n + i] = weight;
+      total_weight += weight;
+    }
+  }
+  if (normalize == 1) {
+    // Cover the whole n x n window, not just its first row.
+    for (int k = 0; k < n * n; k++) {
+      weights[k] = weights[k] / total_weight;
+    }
+  }
+}
+
+// Returns the sum over i in [0, size) of filter[i] * img[i], accumulated in
+// ascending index order.
+static double convolve(const double *filter, const int *img, const int size) {
+  const double *tap = filter;
+  const int *sample = img;
+  double acc = 0;
+  for (int remaining = size; remaining > 0; --remaining) {
+    acc += *tap * *sample;
+    ++tap;
+    ++sample;
+  }
+  return acc;
+}
+
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+// img is read with the given stride; reduced_img is written densely with a
+// row pitch of width / 2. One output sample is produced for every second
+// input row and column.
+static void reduce(uint8_t *img, int height, int width, int stride,
+                   uint8_t *reduced_img) {
+  const int window_size = 5;
+  // filter is 5x5 so need prev and forward 2 pixels
+  const int radius = window_size / 2;
+  const int new_width = width / 2;
+  const double gaussian_filter[25] = {
+    1. / 256, 1.0 / 64, 3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16,  1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64,  1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64,  1. / 256
+  };
+  int img_section[25];
+  for (int y = 0; y < height - 1; y += 2) {
+    uint8_t *out_row = reduced_img + (y / 2) * new_width;
+    for (int x = 0; x < width - 1; x += 2) {
+      int count = 0;
+      for (int dy = -radius; dy <= radius; dy++) {
+        // copied pixels outside the boundary
+        int row = y + dy;
+        if (row < 0) row = 0;
+        if (row >= height) row = height - 1;
+        const uint8_t *src_row = img + row * stride;
+        for (int dx = -radius; dx <= radius; dx++) {
+          int col = x + dx;
+          if (col < 0) col = 0;
+          if (col >= width) col = width - 1;
+          img_section[count++] = src_row[col];
+        }
+      }
+      out_row[x / 2] = (uint8_t)convolve(gaussian_filter, img_section,
+                                         window_size * window_size);
+    }
+  }
+}
+
+// qsort() comparator for arrays of int, ascending order.
+// Returns a negative, zero or positive value. The result is computed from
+// comparisons rather than a subtraction so it cannot overflow for operands
+// of opposite sign and large magnitude.
+static int cmpfunc(const void *a, const void *b) {
+  const int lhs = *(const int *)a;
+  const int rhs = *(const int *)b;
+  return (lhs > rhs) - (lhs < rhs);
+}
+// Filters the motion field `mvs` (frame_height rows of frame_width entries)
+// with a 5x5 window whose taps are clamped to the frame at the borders.
+//   MV_FILTER_NONE:   returns immediately; mvs and localmvs are not touched.
+//   MV_FILTER_SMOOTH: each component becomes the Gaussian-weighted sum of
+//                     its window, truncated to int16_t.
+//   MV_FILTER_MEDIAN: each component becomes the median of its window.
+// The filtered value divided by 8 is stored in localmvs; once the whole
+// frame is done, mvs is overwritten with round(8 * localmvs).
+static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+                       const int frame_width, LOCALMV *localmvs, MV *mvs) {
+  if (mv_filter == MV_FILTER_NONE) return;
+
+  const int n = 5;  // window size
+  // for smoothing filter
+  const double gaussian_filter[25] = {
+    1. / 256, 1. / 64, 3. / 128, 1. / 64,  1. / 256, 1. / 64, 1. / 16,
+    3. / 32,  1. / 16, 1. / 64,  3. / 128, 3. / 32,  9. / 64, 3. / 32,
+    3. / 128, 1. / 64, 1. / 16,  3. / 32,  1. / 16,  1. / 64, 1. / 256,
+    1. / 64,  3. / 128, 1. / 64, 1. / 256
+  };
+  // for median filter
+  int mvrows[25];
+  int mvcols[25];
+  const int num_taps = (int)(sizeof(mvrows) / sizeof(mvrows[0]));
+
+  for (int y = 0; y < frame_height; y++) {
+    for (int x = 0; x < frame_width; x++) {
+      int center_idx = y * frame_width + x;
+      int i = 0;
+      double filtered_row = 0;
+      double filtered_col = 0;
+      for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+        for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+          int yvalue = yy;
+          int xvalue = xx;
+          // copied pixels outside the boundary
+          if (yvalue < 0) yvalue = 0;
+          if (xvalue < 0) xvalue = 0;
+          if (yvalue >= frame_height) yvalue = frame_height - 1;
+          if (xvalue >= frame_width) xvalue = frame_width - 1;
+          int index = yvalue * frame_width + xvalue;
+          if (mv_filter == MV_FILTER_SMOOTH) {
+            filtered_row += mvs[index].row * gaussian_filter[i];
+            filtered_col += mvs[index].col * gaussian_filter[i];
+          } else if (mv_filter == MV_FILTER_MEDIAN) {
+            mvrows[i] = mvs[index].row;
+            mvcols[i] = mvs[index].col;
+          }
+          i++;
+        }
+      }
+
+      MV mv = mvs[center_idx];
+      if (mv_filter == MV_FILTER_SMOOTH) {
+        mv.row = (int16_t)filtered_row;
+        mv.col = (int16_t)filtered_col;
+      } else if (mv_filter == MV_FILTER_MEDIAN) {
+        // The scratch arrays hold int, so the element size handed to qsort
+        // must be that of the array element, not of the int16_t MV field.
+        qsort(mvrows, num_taps, sizeof(mvrows[0]), cmpfunc);
+        qsort(mvcols, num_taps, sizeof(mvcols[0]), cmpfunc);
+        mv.row = (int16_t)mvrows[num_taps / 2];
+        mv.col = (int16_t)mvcols[num_taps / 2];
+      }
+      // Each component is converted from its own filtered value.
+      LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                          .col = ((double)mv.col) / 8 };
+      localmvs[center_idx] = localmv;
+      // if mvs array is immediately updated here, then the result may
+      // propagate to other pixels.
+    }
+  }
+  for (int i = 0; i < frame_height * frame_width; i++) {
+    MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+              .col = (int16_t)round(8 * localmvs[i].col) };
+    mvs[i] = mv;
+  }
+}
+
+// Computes optical flow at a single pyramid level,
+// using Lucas-Kanade algorithm.
+// Modifies mvs array.
+// ref_corners holds num_ref_corners (x, y) pairs in level-0 coordinates; mvs
+// is indexed at level-0 resolution with row pitch mv_stride. For every
+// corner the weighted 2x2 system M * u = b is built from the window
+// gradients and solved, and the stored vector is replaced only when the
+// smaller-magnitude eigenvalue of M exceeds a fixed threshold.
+// If any scratch allocation fails the function returns without touching mvs.
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame,
+                         const YV12_BUFFER_CONFIG *to_frame, const int level,
+                         const LK_PARAMS *lk_params, const int num_ref_corners,
+                         int *ref_corners, const int mv_stride,
+                         const int bit_depth, LOCALMV *mvs) {
+  // NOTE(review): this requires an even window size, while OPFL_WINDOW_SIZE
+  // in optical_flow.h is 15 - confirm which of the two is intended.
+  assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+  const int n = lk_params->window_size;
+  // algorithm is sensitive to window size
+  double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
+  double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
+  double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+  double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+  if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
+  const int expand_multiplier = (int)pow(2, level);
+  double sigma = 0.2 * n;
+  // normalizing doesn't really affect anything since it's applied
+  // to every component of M and b
+  gaussian(sigma, n, 0, weights);
+  for (int i = 0; i < num_ref_corners; i++) {
+    // Corner position scaled down to this pyramid level.
+    const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+    const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+    int highres_x = ref_corners[i * 2];
+    int highres_y = ref_corners[i * 2 + 1];
+    int mv_idx = highres_y * (mv_stride) + highres_x;
+    // Current estimate, scaled down to this level.
+    LOCALMV mv_old = mvs[mv_idx];
+    mv_old.row = mv_old.row / expand_multiplier;
+    mv_old.col = mv_old.col / expand_multiplier;
+    // using this instead of memset, since it's not completely
+    // clear if zero memset works on double arrays
+    for (int j = 0; j < n * n; j++) {
+      i_x[j] = 0;
+      i_y[j] = 0;
+      i_t[j] = 0;
+    }
+    gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth,
+                          i_x, i_y, i_t, &mv_old);
+    // Weighted sums of gradient products over the window.
+    double sum_xx = 0, sum_xy = 0, sum_yy = 0;
+    double sum_xt = 0, sum_yt = 0;
+    for (int j = 0; j < n * n; j++) {
+      sum_xx += weights[j] * i_x[j] * i_x[j];
+      sum_xy += weights[j] * i_x[j] * i_y[j];
+      sum_yy += weights[j] * i_y[j] * i_y[j];
+      sum_xt += weights[j] * i_x[j] * i_t[j];
+      sum_yt += weights[j] * i_y[j] * i_t[j];
+    }
+    double M[4] = { sum_xx, sum_xy, sum_xy, sum_yy };
+    double b[2] = { -1 * sum_xt, -1 * sum_yt };
+    double eig[2] = { 1, 1 };
+    eigenvalues_2x2(M, eig);
+    double threshold = 0.1;
+    if (fabs(eig[0]) > threshold) {
+      // if M is not invertible, then displacement
+      // will default to zeros
+      double u[2] = { 0, 0 };
+      linsolve(2, M, 2, b, u);
+      int mult = 1;
+      if (level != 0)
+        mult = expand_multiplier;  // mv doubles when resolution doubles
+      LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+                     .col = (mult * (u[1] + mv_old.col)) };
+      mvs[mv_idx] = mv;
+    }
+  }
+free_lk_buf:
+  aom_free(weights);
+  aom_free(i_t);
+  aom_free(i_x);
+  aom_free(i_y);
+}
+
+// Warp the src_frame to warped_frame according to mvs.
+// mvs point to src_frame
+// Every output pixel (w, h) is bilinearly interpolated from src_frame at
+// (w + mv.col, h + mv.row); the sample position and the four taps are
+// clamped to the frame.
+static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame,
+                            const YV12_BUFFER_CONFIG *src_frame,
+                            const LOCALMV *mvs, int mv_stride) {
+  const int fw = src_frame->y_crop_width;
+  const int fh = src_frame->y_crop_height;
+  const int src_fs = src_frame->y_stride;
+  const int warped_fs = warped_frame->y_stride;
+  const uint8_t *src_buf = src_frame->y_buffer;
+  uint8_t *warped_buf = warped_frame->y_buffer;
+  for (int row = 0; row < fh; row++) {
+    const LOCALMV *mv_row = mvs + row * mv_stride;
+    uint8_t *dst_row = warped_buf + row * warped_fs;
+    for (int col = 0; col < fw; col++) {
+      double pos_x = (double)col + mv_row[col].col;
+      double pos_y = (double)row + mv_row[col].row;
+      pos_x = fclamp(pos_x, 0, (double)(fw - 1));
+      pos_y = fclamp(pos_y, 0, (double)(fh - 1));
+      const int x0 = (int)floor(pos_x);
+      const int y0 = (int)floor(pos_y);
+      const double fx = pos_x - (double)x0;
+      const double fy = pos_y - (double)y0;
+      // Tap coordinates and their bilinear weights, near tap first.
+      const int x_tap[2] = { clamp(x0, 0, fw - 1), clamp(x0 + 1, 0, fw - 1) };
+      const int y_tap[2] = { clamp(y0, 0, fh - 1), clamp(y0 + 1, 0, fh - 1) };
+      const double x_wt[2] = { 1 - fx, fx };
+      const double y_wt[2] = { 1 - fy, fy };
+
+      double acc = 0;
+      for (int ty = 0; ty < 2; ty++) {
+        for (int tx = 0; tx < 2; tx++) {
+          acc += (double)src_buf[y_tap[ty] * src_fs + x_tap[tx]] * x_wt[tx] *
+                 y_wt[ty];
+        }
+      }
+      dst_row[col] = (uint8_t)round(acc);
+    }
+  }
+}
+
+// Same as warp_back_frame, but using a better interpolation filter.
+// For every pixel (w, h) of the cropped luma plane an inter predictor is
+// built from src_frame with that pixel's motion vector, and the first
+// predicted sample is stored in warped_frame.
+static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame,
+ const YV12_BUFFER_CONFIG *src_frame,
+ const LOCALMV *mvs, int mv_stride) {
+ int w, h;
+ const int fw = src_frame->y_crop_width;
+ const int fh = src_frame->y_crop_height;
+ const int warped_fs = warped_frame->y_stride;
+ uint8_t *warped_buf = warped_frame->y_buffer;
+ // A blk x blk prediction is requested per pixel (presumably the smallest
+ // size the predictor accepts - TODO confirm); temp_blk holds its 4 samples.
+ const int blk = 2;
+ uint8_t temp_blk[4];
+
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(src_frame);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+ const int plane = 0; // y-plane
+ // Reference buffer descriptor for the luma plane of src_frame.
+ const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer,
+ src_frame->y_crop_width,
+ src_frame->y_crop_height,
+ src_frame->y_stride };
+ const int bit_depth = src_frame->bit_depth;
+ struct scale_factors scale;
+ // The same dimensions are passed twice, so presumably this sets up an
+ // unscaled (1:1) reference - verify against the callee.
+ av1_setup_scale_factors_for_frame(
+ &scale, src_frame->y_crop_width, src_frame->y_crop_height,
+ src_frame->y_crop_width, src_frame->y_crop_height);
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, &scale, &ref_buf2, interp_filters);
+ // Explicitly select the x / y filter kernels chosen above.
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ // LOCALMV components are scaled by 8 and rounded to int16_t, the same
+ // conversion used elsewhere in this file to go from LOCALMV to MV.
+ MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8),
+ .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) };
+ av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv,
+ &inter_pred_params);
+ // Only the first predicted sample is kept for this pixel.
+ warped_buf[h * warped_fs + w] = temp_blk[0];
+ }
+ }
+}
+
+// Taps of the 1-D derivative filter applied by get_frame_gradients() along
+// x and along y. The coefficients are antisymmetric about the zero centre
+// tap and match the standard 7-point central-difference stencil for a first
+// derivative.
+#define DERIVATIVE_FILTER_LENGTH 7
+double filter[DERIVATIVE_FILTER_LENGTH] = { -1.0 / 60, 9.0 / 60, -45.0 / 60, 0,
+ 45.0 / 60, -9.0 / 60, 1.0 / 60 };
+
+// Get gradient of the whole frame
+// ix / iy: derivative filter applied along x / y to the average of the two
+//          frames, with tap positions clamped to the frame.
+// it:      to_frame minus from_frame at the same pixel.
+// All three outputs are indexed [h * grad_stride + w].
+static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame,
+                                const YV12_BUFFER_CONFIG *to_frame, double *ix,
+                                double *iy, double *it, int grad_stride) {
+  const int fw = from_frame->y_crop_width;
+  const int fh = from_frame->y_crop_height;
+  const int from_fs = from_frame->y_stride;
+  const int to_fs = to_frame->y_stride;
+  const uint8_t *from_buf = from_frame->y_buffer;
+  const uint8_t *to_buf = to_frame->y_buffer;
+
+  const int taps = DERIVATIVE_FILTER_LENGTH;
+  const int half = (taps - 1) / 2;
+
+  for (int row = 0; row < fh; row++) {
+    const uint8_t *from_row = from_buf + row * from_fs;
+    const uint8_t *to_row = to_buf + row * to_fs;
+    double *ix_row = ix + row * grad_stride;
+    double *iy_row = iy + row * grad_stride;
+    double *it_row = it + row * grad_stride;
+    for (int col = 0; col < fw; col++) {
+      // x
+      // if we want to make this block dependent, need to extend the
+      // boundaries using other initializations.
+      double grad_x = 0;
+      for (int k = 0; k < taps; k++) {
+        const int sx = clamp(col + k - half, 0, fw - 1);
+        grad_x +=
+            filter[k] * 0.5 * ((double)from_row[sx] + (double)to_row[sx]);
+      }
+      ix_row[col] = grad_x;
+      // y
+      // if we want to make this block dependent, need to extend the
+      // boundaries using other initializations.
+      double grad_y = 0;
+      for (int k = 0; k < taps; k++) {
+        const int sy = clamp(row + k - half, 0, fh - 1);
+        grad_y += filter[k] * 0.5 *
+                  ((double)from_buf[sy * from_fs + col] +
+                   (double)to_buf[sy * to_fs + col]);
+      }
+      iy_row[col] = grad_y;
+      // t
+      it_row[col] = (double)to_row[col] - (double)from_row[col];
+    }
+  }
+}
+
+// Appends one coefficient to the (row, col, value) triplet lists twice: at
+// (row, col) for the first block of unknowns and at
+// (row + offset, col + offset) for the second block.
+// *count is advanced by two.
+static void hs_add_entry_pair(int *row_pos, int *col_pos, double *values,
+                              int *count, int row, int col, int offset,
+                              double value) {
+  row_pos[*count] = row;
+  col_pos[*count] = col;
+  values[*count] = value;
+  (*count)++;
+  row_pos[*count] = row + offset;
+  col_pos[*count] = col + offset;
+  values[*count] = value;
+  (*count)++;
+}
+
+// Solve for linear equations given by the H-S method
+//   ix, iy, it: gradients, indexed [h * grad_stride + w].
+//   init_mvs:   initial motion field, indexed [h * init_mv_stride + w].
+//   mvs:        receives the solution, indexed [h * mv_stride + w]; it is
+//               left untouched if an allocation or a solver step fails.
+// Unknowns are ordered column-major (w * height + h). The first
+// width * height unknowns are the .col components; the .row components
+// follow, shifted by `offset`.
+static void solve_horn_schunck(const double *ix, const double *iy,
+                               const double *it, int grad_stride, int width,
+                               int height, const LOCALMV *init_mvs,
+                               int init_mv_stride, LOCALMV *mvs,
+                               int mv_stride) {
+  // Zero-initialised before any jump to the cleanup label: that label always
+  // calls av1_free_sparse_mtx_elems(&A), including when one of the
+  // allocations below fails before A has been set up.
+  SPARSE_MTX A = { 0 };
+  // TODO(bohanli): May just need to allocate the buffers once per optical flow
+  // calculation
+  // Up to 28 triplets per pixel: 2 diagonal entries, 2 for each of the up to
+  // 12 neighbours, and 2 cross terms.
+  int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos));
+  int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos));
+  double *values = aom_calloc(width * height * 28, sizeof(*values));
+  double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec));
+  double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
+  double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
+  double *b = aom_calloc(width * height * 2, sizeof(*b));
+  if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+      !b) {
+    goto free_hs_solver_buf;
+  }
+
+  // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors
+  const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
+  const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
+
+  int h, w, checkh, checkw, k, ret;
+  const int offset = height * width;
+  int c = 0;
+  const double lambda = 100;
+
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col;
+      mv_init_vec[w * height + h + offset] =
+          init_mvs[h * init_mv_stride + w].row;
+    }
+  }
+
+  // get matrix A
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      int center_num_direct = 4;
+      const int center_idx = w * height + h;
+      if (w == 0 || w == width - 1) center_num_direct--;
+      if (h == 0 || h == height - 1) center_num_direct--;
+      // diagonal entry for this row from the center pixel
+      double cor_w = center_num_direct * center_num_direct + center_num_direct;
+      hs_add_entry_pair(row_pos, col_pos, values, &c, center_idx, center_idx,
+                        offset, lambda * cor_w);
+      // k in [0, 4):  direct neighbors
+      // k in [4, 8):  neighbors on the diagonal corners
+      // k in [8, 12): neighbors with dist of 2
+      for (k = 0; k < 12; k++) {
+        checkh = h + check_locs_y[k];
+        checkw = w + check_locs_x[k];
+        if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+          continue;
+        }
+        int this_idx = checkw * height + checkh;
+        if (k < 4) {
+          int this_num_direct = 4;
+          if (checkw == 0 || checkw == width - 1) this_num_direct--;
+          if (checkh == 0 || checkh == height - 1) this_num_direct--;
+          cor_w = -center_num_direct - this_num_direct;
+        } else if (k < 8) {
+          cor_w = 2;
+        } else {
+          cor_w = 1;
+        }
+        hs_add_entry_pair(row_pos, col_pos, values, &c, center_idx, this_idx,
+                          offset, lambda * cor_w);
+      }
+    }
+  }
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
+  // subtract init mv part from b
+  av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height);
+  for (int i = 0; i < 2 * width * height; i++) {
+    b[i] = -temp_b[i];
+  }
+  av1_free_sparse_mtx_elems(&A);
+
+  // add cross terms to A and modify b with ExEt / EyEt
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      int curidx = w * height + h;
+      // modify b
+      b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w];
+      b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w];
+      // add cross terms to A
+      row_pos[c] = curidx;
+      col_pos[c] = curidx + offset;
+      values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+      c++;
+      row_pos[c] = curidx + offset;
+      col_pos[c] = curidx;
+      values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+      c++;
+    }
+  }
+  // Add diagonal terms to A
+  for (int i = 0; i < c; i++) {
+    if (row_pos[i] == col_pos[i]) {
+      if (row_pos[i] < offset) {
+        w = row_pos[i] / height;
+        h = row_pos[i] % height;
+        values[i] += pow(ix[h * grad_stride + w], 2);
+      } else {
+        w = (row_pos[i] - offset) / height;
+        h = (row_pos[i] - offset) % height;
+        values[i] += pow(iy[h * grad_stride + w], 2);
+      }
+    }
+  }
+
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
+
+  // solve for the mvs
+  ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+  if (ret < 0) goto free_hs_solver_buf;
+
+  // copy mvs
+  for (w = 0; w < width; w++) {
+    for (h = 0; h < height; h++) {
+      mvs[h * mv_stride + w].col = mv_vec[w * height + h];
+      mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset];
+    }
+  }
+free_hs_solver_buf:
+  aom_free(row_pos);
+  aom_free(col_pos);
+  aom_free(values);
+  aom_free(mv_vec);
+  aom_free(mv_init_vec);
+  aom_free(b);
+  aom_free(temp_b);
+  av1_free_sparse_mtx_elems(&A);
+}
+
+// Calculate optical flow from from_frame to to_frame using the H-S method.
+//   from_frame, to_frame: luma planes at pyramid level `level`.
+//   mvs: motion field at level-0 resolution (mv_height rows of mv_width
+//        entries, row pitch mv_stride); read as the initial estimate and
+//        updated in place.
+// At level 0 the field is refined directly in mvs. At coarser levels a
+// down-sampled copy, divided by 2^level, is refined and then copied back to
+// every level-0 position, multiplied by 2^level.
+// If an allocation fails the function returns with mvs unchanged.
+static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
+                         const YV12_BUFFER_CONFIG *to_frame, const int level,
+                         const int mv_stride, const int mv_height,
+                         const int mv_width, const OPFL_PARAMS *opfl_params,
+                         LOCALMV *mvs) {
+  // mvs are always on level 0, here we define two new mv arrays that is of size
+  // of this level.
+  const int fw = from_frame->y_crop_width;
+  const int fh = from_frame->y_crop_height;
+  const int factor = (int)pow(2, level);
+  int w, h, k;
+  int init_mv_stride = 0;
+  LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+  double *ix = NULL, *iy = NULL, *it = NULL;
+  YV12_BUFFER_CONFIG temp_frame;
+  temp_frame.y_buffer = NULL;
+  if (level == 0) {
+    init_mvs = mvs;
+    init_mv_stride = mv_stride;
+  } else {
+    init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+    if (!init_mvs) goto free_hs_buf;
+    init_mv_stride = fw;
+    for (h = 0; h < fh; h++) {
+      for (w = 0; w < fw; w++) {
+        init_mvs[h * init_mv_stride + w].row =
+            mvs[h * factor * mv_stride + w * factor].row / (double)factor;
+        init_mvs[h * init_mv_stride + w].col =
+            mvs[h * factor * mv_stride + w * factor].col / (double)factor;
+      }
+    }
+  }
+  refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+  if (!refine_mvs) goto free_hs_buf;
+  // temp frame for warping
+  temp_frame.y_buffer =
+      (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+  if (!temp_frame.y_buffer) goto free_hs_buf;
+  temp_frame.y_crop_height = fh;
+  temp_frame.y_crop_width = fw;
+  temp_frame.y_stride = fw;
+  // gradient buffers
+  ix = aom_calloc(fw * fh, sizeof(*ix));
+  iy = aom_calloc(fw * fh, sizeof(*iy));
+  it = aom_calloc(fw * fh, sizeof(*it));
+  if (!ix || !iy || !it) goto free_hs_buf;
+  // For each warping step
+  for (k = 0; k < opfl_params->warping_steps; k++) {
+    // warp to_frame with init_mvs
+    if (level == 0) {
+      warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride);
+    } else {
+      warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride);
+    }
+    // calculate frame gradients
+    get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw);
+    // form linear equations and solve mvs
+    solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride,
+                       refine_mvs, fw);
+    // update init_mvs
+    for (h = 0; h < fh; h++) {
+      for (w = 0; w < fw; w++) {
+        init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col;
+        init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row;
+      }
+    }
+  }
+  // copy back the mvs if needed
+  if (level != 0) {
+    for (h = 0; h < mv_height; h++) {
+      // When the level-0 size is not a multiple of factor, h / factor (and
+      // w / factor) can land one past the last row (column) of this level,
+      // so the source position is clamped to stay inside init_mvs.
+      const int src_h = AOMMIN(h / factor, fh - 1);
+      for (w = 0; w < mv_width; w++) {
+        const int src_w = AOMMIN(w / factor, fw - 1);
+        const LOCALMV *src = &init_mvs[src_h * init_mv_stride + src_w];
+        mvs[h * mv_stride + w].row = src->row * (double)factor;
+        mvs[h * mv_stride + w].col = src->col * (double)factor;
+      }
+    }
+  }
+free_hs_buf:
+  // At level 0 init_mvs aliases the caller's mvs and must not be freed.
+  if (level != 0) aom_free(init_mvs);
+  aom_free(refine_mvs);
+  aom_free(temp_frame.y_buffer);
+  aom_free(ix);
+  aom_free(iy);
+  aom_free(it);
+}
+
+// Apply optical flow iteratively at each pyramid level
+// Level 0 is the pair of input frames; every further level is produced by
+// reduce() from the previous one. The selected method is then run from the
+// coarsest level down to level 0, updating mvs (level-0 resolution) each
+// time. One level is dropped when the coarsest level would be smaller than
+// 50 pixels in either dimension.
+// If an allocation fails the function returns with mvs as it was at that
+// point.
+static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                                 const YV12_BUFFER_CONFIG *to_frame,
+                                 const int bit_depth,
+                                 const OPFL_PARAMS *opfl_params,
+                                 const OPTFLOW_METHOD method, LOCALMV *mvs) {
+  assert(opfl_params->pyramid_levels > 0 &&
+         opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+  int levels = opfl_params->pyramid_levels;
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  // Both dimensions of the coarsest level are checked.
+  if ((frame_height / pow(2.0, levels - 1) < 50 ||
+       frame_width / pow(2.0, levels - 1) < 50) &&
+      levels > 1)
+    levels = levels - 1;
+  uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+  uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+  int *ref_corners = NULL;
+
+  images1[0] = from_frame->y_buffer;
+  images2[0] = to_frame->y_buffer;
+  YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
+  YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2));
+  if (!buffers1 || !buffers2) goto free_pyramid_buf;
+  buffers1[0] = *from_frame;
+  buffers2[0] = *to_frame;
+  int fw = frame_width;
+  int fh = frame_height;
+  for (int i = 1; i < levels; i++) {
+    // TODO(bohanli): may need to extend buffers for better interpolation SIMD
+    images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i]));
+    images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i]));
+    if (!images1[i] || !images2[i]) goto free_pyramid_buf;
+    // Level 0 uses each frame's own stride; the reduced levels are stored
+    // densely, so their stride equals their width.
+    const int stride1 = (i == 1) ? from_frame->y_stride : fw;
+    const int stride2 = (i == 1) ? to_frame->y_stride : fw;
+    reduce(images1[i - 1], fh, fw, stride1, images1[i]);
+    reduce(images2[i - 1], fh, fw, stride2, images2[i]);
+    fh /= 2;
+    fw /= 2;
+    YV12_BUFFER_CONFIG a = { .y_buffer = images1[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    YV12_BUFFER_CONFIG b = { .y_buffer = images2[i],
+                             .y_crop_width = fw,
+                             .y_crop_height = fh,
+                             .y_stride = fw };
+    buffers1[i] = a;
+    buffers2[i] = b;
+  }
+  // Compute corners for specific frame
+  int num_ref_corners = 0;
+  if (is_sparse(opfl_params)) {
+    int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
+    ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners));
+    if (!ref_corners) goto free_pyramid_buf;
+    num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
+                                     ref_corners, bit_depth);
+  }
+  const int stop_level = 0;
+  for (int i = levels - 1; i >= stop_level; i--) {
+    if (method == LUCAS_KANADE) {
+      assert(is_sparse(opfl_params));
+      lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params,
+                   num_ref_corners, ref_corners, buffers1[0].y_crop_width,
+                   bit_depth, mvs);
+    } else if (method == HORN_SCHUNCK) {
+      assert(!is_sparse(opfl_params));
+      horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width,
+                   buffers1[0].y_crop_height, buffers1[0].y_crop_width,
+                   opfl_params, mvs);
+    }
+  }
+free_pyramid_buf:
+  // Index 0 points at the callers' frame buffers and is not owned here.
+  for (int i = 1; i < levels; i++) {
+    aom_free(images1[i]);
+    aom_free(images2[i]);
+  }
+  aom_free(ref_corners);
+  aom_free(buffers1);
+  aom_free(buffers2);
+}
+// Computes optical flow by applying algorithm at
+// multiple pyramid levels of images (lower-resolution, smoothed images)
+// This accounts for larger motions.
+// Inputs:
+//   from_frame      Frame buffer.
+//   to_frame:       Frame buffer. MVs point from_frame -> to_frame.
+//   from_frame_idx: Index of from_frame.
+//   to_frame_idx:   Index of to_frame. Return all zero MVs when idx are equal.
+//   bit_depth:      Passed through to the per-level flow computation.
+//   opfl_params:    contains algorithm-specific parameters.
+//   mv_filter:      MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN.
+//   method:         LUCAS_KANADE, HORN_SCHUNCK
+//   mvs:            pointer to MVs. Contains initialization, and modified
+//                   based on optical flow. Must have
+//                   dimensions = from_frame->y_crop_width *
+//                   from_frame->y_crop_height
+// If the working buffer cannot be allocated the function returns without
+// modifying mvs.
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+                      const YV12_BUFFER_CONFIG *to_frame,
+                      const int from_frame_idx, const int to_frame_idx,
+                      const int bit_depth, const OPFL_PARAMS *opfl_params,
+                      const MV_FILTER_TYPE mv_filter,
+                      const OPTFLOW_METHOD method, MV *mvs) {
+  const int frame_height = from_frame->y_crop_height;
+  const int frame_width = from_frame->y_crop_width;
+  const int num_mvs = frame_height * frame_width;
+  // TODO(any): deal with the case where frames are not of the same dimensions
+  assert(frame_height == to_frame->y_crop_height &&
+         frame_width == to_frame->y_crop_width);
+  if (from_frame_idx == to_frame_idx) {
+    // immediately return all zero mvs when frame indices are equal
+    const MV zero_mv = { .row = 0, .col = 0 };
+    for (int pos = 0; pos < num_mvs; pos++) mvs[pos] = zero_mv;
+    return;
+  }
+
+  // Initialize double mvs based on input parameter mvs array
+  LOCALMV *localmvs = aom_malloc(num_mvs * sizeof(*localmvs));
+  if (!localmvs) return;
+
+  // Smooth the initial field first, then load it in double precision.
+  filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs);
+
+  for (int pos = 0; pos < num_mvs; pos++) {
+    const MV mv = mvs[pos];
+    localmvs[pos].row = ((double)mv.row) / 8;
+    localmvs[pos].col = ((double)mv.col) / 8;
+  }
+  // Apply optical flow algorithm
+  pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method,
+                       localmvs);
+
+  // Update original mvs array
+  for (int row = 0; row < frame_height; row++) {
+    for (int col = 0; col < frame_width; col++) {
+      const LOCALMV *flow = &localmvs[row * frame_width + col];
+      // Vectors that point outside the frame leave the existing mv in place.
+      const int outside = row + flow->row < 0 ||
+                          row + flow->row >= frame_height ||
+                          col + flow->col < 0 ||
+                          col + flow->col >= frame_width;
+      if (!outside) {
+        MV mv = { .row = (int16_t)round(8 * flow->row),
+                  .col = (int16_t)round(8 * flow->col) };
+        mvs[row * frame_width + col] = mv;
+      }
+    }
+  }
+
+  filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs);
+
+  aom_free(localmvs);
+}
+#endif
diff --git a/third_party/aom/av1/encoder/optical_flow.h b/third_party/aom/av1/encoder/optical_flow.h
new file mode 100644
index 0000000000..2fbe474d77
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Algorithm selector for av1_optical_flow().
+typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD;
+
+// Post-filter applied to the motion field by av1_optical_flow().
+typedef enum {
+ MV_FILTER_NONE,
+ MV_FILTER_SMOOTH,
+ MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+// Motion vector with double-precision components. optical_flow.c converts
+// between MV and LOCALMV with a factor of 8 (LOCALMV = MV / 8).
+typedef struct LOCALMV {
+ double row;
+ double col;
+} LOCALMV;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+// NOTE(review): lucas_kanade() asserts that the window size is even, which
+// the value 15 below does not satisfy - confirm how this default is used.
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3 // total levels
+#define OPFL_WARPING_STEPS 3
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+ int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+ int pyramid_levels;
+ int warping_steps;
+ LK_PARAMS *lk_params;
+ int flags;
+} OPFL_PARAMS;
+
+// Bit for OPFL_PARAMS.flags; presumably what is_sparse() tests - verify in
+// optical_flow.c.
+#define OPFL_FLAG_SPARSE 1
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params);
+
+void av1_init_lk_params(LK_PARAMS *lk_params);
+
+// Computes the motion field from from_frame to to_frame; mvs is both the
+// initial estimate and the output. See optical_flow.c for the parameters.
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..7f79e9596e
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+// qsort() comparator that orders int16_t values ascending. Both operands are
+// widened to int before subtracting, so the difference cannot overflow.
+static int int16_comparer(const void *a, const void *b) {
+  const int lhs = *(const int16_t *)a;
+  const int rhs = *(const int16_t *)b;
+  return lhs - rhs;
+}
+
+// Sorts 'centroids' in ascending order and compacts it in place so that its
+// first N entries hold the distinct values; returns N. Entries past index N-1
+// are left with whatever the sort/compaction put there.
+// An empty (or negative) 'num_centroids' has no unique values: 0 is returned
+// and the array is not touched. This also keeps a negative count from being
+// converted to a huge size_t in the qsort() call.
+int av1_remove_duplicates(int16_t *centroids, int num_centroids) {
+  if (num_centroids <= 0) return 0;
+  qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer);
+  // centroids[0] is always kept, so the compaction starts at the second
+  // entry.
+  int num_unique = 1;
+  for (int i = 1; i < num_centroids; ++i) {
+    if (centroids[i] != centroids[i - 1]) {  // found a new unique centroid
+      centroids[num_unique++] = centroids[i];
+    }
+  }
+  return num_unique;
+}
+
+// Returns the number of bits charged for delta coding 'num' colors.
+// 'colors' must be ordered so that each successive difference is at least
+// 'min_val' (asserted). The first color costs 'bit_depth' bits; with two or
+// more colors a further 2 bits are charged once (presumably the delta-width
+// field -- confirm against the bitstream writer), followed by one delta per
+// remaining color. The per-delta width starts at
+// max(ceil_log2(max_delta + 1 - min_val), bit_depth - 3) and shrinks as the
+// remaining value range shrinks.
+static int delta_encode_cost(const int *colors, int num, int bit_depth,
+                             int min_val) {
+  if (num <= 0) return 0;
+  if (num == 1) return bit_depth;
+
+  // Collect successive differences and remember the largest one.
+  int step[PALETTE_MAX_SIZE];
+  int largest_step = 0;
+  for (int k = 0; k + 1 < num; ++k) {
+    const int diff = colors[k + 1] - colors[k];
+    step[k] = diff;
+    assert(diff >= min_val);
+    if (diff > largest_step) largest_step = diff;
+  }
+
+  const int floor_bits = bit_depth - 3;
+  int delta_bits =
+      AOMMAX(av1_ceil_log2(largest_step + 1 - min_val), floor_bits);
+  assert(delta_bits <= bit_depth);
+
+  int total_bits = bit_depth + 2;
+  int remaining_range = (1 << bit_depth) - colors[0] - min_val;
+  for (int k = 0; k < num - 1; ++k) {
+    total_bits += delta_bits;
+    remaining_range -= step[k];
+    delta_bits = AOMMIN(delta_bits, av1_ceil_log2(remaining_range));
+  }
+  return total_bits;
+}
+
+// Splits 'colors' into those present in 'color_cache' and those that are not.
+//  - cache_color_found[i] is set to 1 when color_cache[i] matched a color and
+//    to 0 otherwise (only written when n_cache > 0).
+//  - out_cache_colors receives, in their original order, the colors that did
+//    not match any cache entry.
+// Returns the number of entries written to out_cache_colors. With an empty
+// cache every color is copied through unchanged.
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+                          const uint16_t *colors, int n_colors,
+                          uint8_t *cache_color_found, int *out_cache_colors) {
+  if (n_cache <= 0) {
+    for (int k = 0; k < n_colors; ++k) out_cache_colors[k] = colors[k];
+    return n_colors;
+  }
+
+  memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found));
+  int hit[PALETTE_MAX_SIZE];
+  memset(hit, 0, sizeof(hit));
+  int n_hits = 0;
+  // Walk the cache; each cache entry claims at most one color. Stop early
+  // once every color has been accounted for.
+  for (int c = 0; c < n_cache && n_hits < n_colors; ++c) {
+    for (int k = 0; k < n_colors; ++k) {
+      if (colors[k] != color_cache[c]) continue;
+      hit[k] = 1;
+      cache_color_found[c] = 1;
+      ++n_hits;
+      break;
+    }
+  }
+
+  int n_out = 0;
+  for (int k = 0; k < n_colors; ++k) {
+    if (hit[k]) continue;
+    out_cache_colors[n_out++] = colors[k];
+  }
+  assert(n_out == n_colors - n_hits);
+  return n_out;
+}
+
+// Computes the per-delta bit width for the palette colors stored at offset
+// 2 * PALETTE_MAX_SIZE in pmi->palette_colors (the V colors).
+// Each delta between neighbouring colors is measured as the shorter of the
+// direct distance and the distance wrapping around 1 << bit_depth.
+//  - *min_bits is set to bit_depth - 4.
+//  - *zero_count is set to the number of zero deltas.
+// Returns max(ceil_log2(largest delta + 1), *min_bits).
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count,
+                                 int *min_bits) {
+  const uint16_t *const v_colors = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+  const int count = pmi->palette_size[1];
+  const int wrap = 1 << bit_depth;
+  int largest = 0;
+  *min_bits = bit_depth - 4;
+  *zero_count = 0;
+  for (int k = 1; k < count; ++k) {
+    const int direct = abs(v_colors[k] - v_colors[k - 1]);
+    const int shortest = AOMMIN(direct, wrap - direct);
+    if (shortest > largest) largest = shortest;
+    if (shortest == 0) ++(*zero_count);
+  }
+  return AOMMAX(av1_ceil_log2(largest + 1), *min_bits);
+}
+
+// Returns the cost (via av1_cost_literal()) of the bits needed for the luma
+// palette colors of 'pmi': one bit per color cache entry plus the delta-coded
+// cost of the colors that are not in the cache. Luma deltas are required to
+// be at least 1.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             const uint16_t *color_cache, int n_cache,
+                             int bit_depth) {
+  int new_colors[PALETTE_MAX_SIZE];
+  uint8_t cache_hits[2 * PALETTE_MAX_SIZE];
+  const int n_new = av1_index_color_cache(color_cache, n_cache,
+                                          pmi->palette_colors,
+                                          pmi->palette_size[0], cache_hits,
+                                          new_colors);
+  int bits = n_cache;
+  bits += delta_encode_cost(new_colors, n_new, bit_depth, 1);
+  return av1_cost_literal(bits);
+}
+
+// Returns the cost (via av1_cost_literal()) of the bits needed for the chroma
+// palette colors of 'pmi'.
+//  - U colors (offset PALETTE_MAX_SIZE in palette_colors): one bit per color
+//    cache entry plus the delta-coded cost of the out-of-cache colors, with
+//    zero deltas allowed.
+//  - V colors: one bit plus the cheaper of delta coding and raw coding.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              const uint16_t *color_cache, int n_cache,
+                              int bit_depth) {
+  const int n = pmi->palette_size[1];
+
+  // U channel palette color cost.
+  int new_colors[PALETTE_MAX_SIZE];
+  uint8_t cache_hits[2 * PALETTE_MAX_SIZE];
+  const int n_new = av1_index_color_cache(
+      color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n,
+      cache_hits, new_colors);
+  const int bits_u =
+      n_cache + delta_encode_cost(new_colors, n_new, bit_depth, 0);
+
+  // V channel palette color cost.
+  int zero_count = 0, min_bits_v = 0;
+  const int bits_v =
+      av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+  const int delta_coded_bits =
+      2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+  const int raw_coded_bits = bit_depth * n;
+  const int bits_v_total = 1 + AOMMIN(delta_coded_bits, raw_coded_bits);
+
+  return av1_cost_literal(bits_u + bits_v_total);
+}
+
+// Grows 'color_map' in place from an 'orig_width x orig_height' layout to a
+// 'new_width x new_height' layout. The added columns repeat the last valid
+// column of each row and the added rows repeat the last valid row.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+                                                int orig_width, int orig_height,
+                                                int new_width, int new_height) {
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  // Re-stride from the bottom row upwards so that a row is moved before its
+  // old location can be overwritten by the (wider) rows below it.
+  for (int row = orig_height - 1; row >= 0; --row) {
+    uint8_t *const dst_row = color_map + row * new_width;
+    memmove(dst_row, color_map + row * orig_width, orig_width);
+    // Replicate the last valid column into the extra columns.
+    memset(dst_row + orig_width, dst_row[orig_width - 1],
+           new_width - orig_width);
+  }
+
+  // Replicate the last valid row into the extra rows.
+  const uint8_t *const last_row = color_map + (orig_height - 1) * new_width;
+  for (int row = orig_height; row < new_height; ++row) {
+    memcpy(color_map + row * new_width, last_row, new_width);
+  }
+}
+
+// Bias toward using colors in the cache: every centroid visited (one every
+// 'stride' entries of 'centroids', n_colors in total) that lies within
+// 4 << (bit_depth - 8) of its nearest cache color is replaced by that cache
+// color. Does nothing when the cache is empty.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+                                               int n_cache, int n_colors,
+                                               int stride, int16_t *centroids,
+                                               int bit_depth) {
+  if (n_cache <= 0) return;
+  const int snap_threshold = 4 << (bit_depth - 8);
+  for (int pos = 0; pos < n_colors * stride; pos += stride) {
+    const int value = (int)centroids[pos];
+    // Linear scan for the closest cache entry; the first one wins on ties.
+    int nearest = 0;
+    int nearest_dist = abs(value - (int)color_cache[0]);
+    for (int c = 1; c < n_cache; ++c) {
+      const int dist = abs(value - (int)color_cache[c]);
+      if (dist >= nearest_dist) continue;
+      nearest_dist = dist;
+      nearest = c;
+    }
+    if (nearest_dist <= snap_threshold) centroids[pos] = color_cache[nearest];
+  }
+}
+
+/*!\brief Calculate the luma palette cost from a given color palette
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * Given the base colors as specified in centroids[], calculate the RD cost
+ * of palette mode.
+ *
+ * centroids[] is modified in place (snapped to cache colors, sorted and
+ * de-duplicated) and the resulting palette is written into
+ * mbmi->palette_mode_info. When the candidate's RD cost is lower than
+ * *best_rd, the function updates *best_rd, *best_mbmi, best_palette_color_map,
+ * blk_skip and tx_type_map, and also writes rate, rate_tokenonly, distortion,
+ * skippable, beat_best_rd and beat_best_palette_rd where those pointers are
+ * non-NULL.
+ *
+ * do_header_rd_based_breakout may be NULL only when do_header_rd_based_gating
+ * is false. When it is non-NULL it is set to true if the search was abandoned
+ * because the header cost alone already exceeded the best RD cost, and to
+ * false otherwise.
+ */
+static AOM_INLINE void palette_rd_y(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids,
+ int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+ uint8_t *tx_type_map, int *beat_best_palette_rd,
+ bool *do_header_rd_based_breakout, int discount_color_cost) {
+ if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false;
+ // Snap centroids that are close to a cached color onto it, then sort the
+ // centroids and drop duplicates.
+ optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
+ cpi->common.seq_params->bit_depth);
+ const int num_unique_colors = av1_remove_duplicates(centroids, n);
+ if (num_unique_colors < PALETTE_MIN_SIZE) {
+ // Too few unique colors to create a palette. And DC_PRED will work
+ // well for that case anyway. So skip.
+ return;
+ }
+ // Store the clipped centroids as the luma palette of this candidate.
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel_highbd(
+ (int)centroids[i], cpi->common.seq_params->bit_depth);
+ }
+ } else {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel(centroids[i]);
+ }
+ }
+ pmi->palette_size[0] = num_unique_colors;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ // Compute a palette index for each of the rows * cols data points, then pad
+ // the cols x rows index map out to block_width x block_height.
+ av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+ 1);
+ extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+ RD_STATS tokenonly_rd_stats;
+ int this_rate;
+
+ if (do_header_rd_based_gating) {
+ assert(do_header_rd_based_breakout != NULL);
+ const int palette_mode_rate = intra_mode_info_cost_y(
+ cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+ // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+ const int header_rd_shift =
+ (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
+ // Terminate further palette_size search, if the header cost corresponding
+ // to lower palette_size is more than *best_rd << header_rd_shift. This
+ // logic is implemented with a right shift in the LHS to prevent a possible
+ // overflow with the left shift in RHS.
+ if ((header_rd >> header_rd_shift) > *best_rd) {
+ *do_header_rd_based_breakout = true;
+ return;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ // A rate of INT_MAX is treated as "no valid result"; give up on this
+ // candidate.
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost,
+ discount_color_cost);
+ }
+
+ int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ // this_rd and this_rate above were computed before this adjustment; only the
+ // token-only rate reported through *rate_tokenonly has the tx-size cost
+ // removed.
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+ this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ // Setting beat_best_rd flag because current mode rd is better than best_rd.
+ // This flag needs to be updated only for palette evaluation in key frames
+ if (beat_best_rd) *beat_best_rd = 1;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ memcpy(blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+ if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+ }
+}
+
+// Returns nonzero once 'curr_idx' has reached or passed 'end_idx' when moving
+// in the direction given by the sign of 'step_size' (which must not be 0).
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+  assert(step_size != 0);
+  if (step_size > 0) return curr_idx >= end_idx;
+  return curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+// For each palette size n, the first n entries of top_colors are used as the
+// palette and evaluated with palette_rd_y().
+// The return value is the last n whose evaluation improved on *best_rd, or
+// end_n if none did. When palette_rd_y() reports a header-rd based breakout,
+// *last_n_searched is set to end_n and the search stops. When
+// prune_palette_search_level == 2, the search stops at the first size that
+// brings no improvement.
+static AOM_INLINE int perform_top_color_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data,
+ int16_t *top_colors, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int n = start_n;
+ // end_n is never searched, so it doubles as the "no winner" value.
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ // palette_rd_y() modifies its centroids argument, so it is given a fresh
+ // copy of the top colors on every iteration.
+ memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
+
+// Performs k-means based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+// For each palette size n, n centroids are seeded evenly between lower_bound
+// and upper_bound, refined with av1_k_means() over 'data_points' entries of
+// 'data', and evaluated with palette_rd_y(). 'color_map' is passed to
+// av1_k_means() as its index buffer.
+// The return value is the last n whose evaluation improved on *best_rd, or
+// end_n if none did. When palette_rd_y() reports a header-rd based breakout,
+// *last_n_searched is set to end_n and the search stops. When
+// prune_palette_search_level == 2, the search stops at the first size that
+// brings no improvement.
+static AOM_INLINE int perform_k_means_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound,
+ int upper_bound, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+ int data_points, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ const int max_itr = 50;
+ int n = start_n;
+ // end_n is never searched, so it doubles as the "no winner" value.
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ // Seed centroid i at the midpoint of the i-th of n equal slices of
+ // [lower_bound, upper_bound] (integer arithmetic).
+ for (int i = 0; i < n; ++i) {
+ centroids[i] =
+ lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
+
+// Sets the parameters to search the current number of colors +- 1.
+// On return *min_n and *max_n are the two neighbours of 'winner' (folded back
+// inside the range when 'winner' sits on a border) and *step_size is the gap
+// between them, so that a stepped search visits exactly those values.
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+                                         int winner, int end_n) {
+  // Lower neighbour: winner - 1, unless winner is already the smallest
+  // palette size, in which case winner + 1 is used instead.
+  int lower;
+  if (winner == PALETTE_MIN_SIZE) {
+    lower = PALETTE_MIN_SIZE + 1;
+  } else {
+    lower = AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+  }
+
+  // Upper neighbour: winner + 1, unless winner is already at end_n, in which
+  // case winner - 1 is used instead.
+  int upper;
+  if (winner == end_n) {
+    upper = winner - 1;
+  } else {
+    upper = AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+  }
+
+  *min_n = lower;
+  *max_n = upper;
+  // Step straight from the lower to the upper neighbour. If they coincide,
+  // use a step of 1 to avoid an infinite loop later.
+  *step_size = AOMMAX(1, upper - lower);
+}
+
+// Copies the rows x cols source block into 'data' as int16_t values, one row
+// after another with no padding, and reports the smallest and largest sample
+// through *lower_bound and *upper_bound. 'src' is read as 16-bit samples
+// (through CONVERT_TO_SHORTPTR) when 'is_high_bitdepth' is nonzero and as
+// 8-bit samples otherwise; 'src_stride' is the distance between source rows
+// in samples.
+static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src,
+                                                const int src_stride,
+                                                const int rows, const int cols,
+                                                const int is_high_bitdepth,
+                                                int16_t *data, int *lower_bound,
+                                                int *upper_bound) {
+  int lo, hi;
+  if (is_high_bitdepth) {
+    const uint16_t *row16 = CONVERT_TO_SHORTPTR(src);
+    lo = hi = row16[0];
+    for (int y = 0; y < rows; ++y, row16 += src_stride, data += cols) {
+      for (int x = 0; x < cols; ++x) {
+        const int px = row16[x];
+        data[x] = (int16_t)px;
+        if (px < lo) lo = px;
+        if (px > hi) hi = px;
+      }
+    }
+  } else {
+    // low bit depth
+    const uint8_t *row8 = src;
+    lo = hi = row8[0];
+    for (int y = 0; y < rows; ++y, row8 += src_stride, data += cols) {
+      for (int x = 0; x < cols; ++x) {
+        const int px = row8[x];
+        data[x] = (int16_t)px;
+        if (px < lo) lo = px;
+        if (px > hi) hi = px;
+      }
+    }
+  }
+  *lower_bound = lo;
+  *upper_bound = hi;
+}
+
+/*! \brief Colors are sorted by their count: the higher the better.
+ */
+struct ColorCount {
+  //! Color index in the histogram.
+  int index;
+  //! Histogram count.
+  int count;
+};
+
+// qsort() comparator for struct ColorCount.
+// Orders entries by descending count; entries with equal counts are ordered by
+// ascending index. Returns 0 when both count and index are equal, so that an
+// element compared with itself (or with an identical copy) is reported as
+// equal, as the qsort() comparator contract requires.
+int color_count_comp(const void *c1, const void *c2) {
+  const struct ColorCount *color_count1 = (const struct ColorCount *)c1;
+  const struct ColorCount *color_count2 = (const struct ColorCount *)c2;
+  if (color_count1->count != color_count2->count) {
+    return (color_count1->count > color_count2->count) ? -1 : 1;
+  }
+  if (color_count1->index != color_count2->index) {
+    return (color_count1->index < color_count2->index) ? -1 : 1;
+  }
+  return 0;
+}
+
+// Finds the n_colors most frequent colors in the histogram count_buf, which is
+// scanned over [0, 1 << bit_depth), and writes their color values to
+// top_colors[] ordered by descending count, with ties going to the smaller
+// color value.
+// The histogram must contain exactly n_colors or more nonzero bins (asserted
+// below), and n_colors must not exceed PALETTE_MAX_SIZE, the capacity of the
+// local candidate array.
+static void find_top_colors(const int *const count_buf, int bit_depth,
+ int n_colors, int16_t *top_colors) {
+ // Top color array, serving as a priority queue if more than n_colors are
+ // found.
+ struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } };
+ int n_color_count = 0;
+ for (int i = 0; i < (1 << bit_depth); ++i) {
+ if (count_buf[i] > 0) {
+ if (n_color_count < n_colors) {
+ // Keep adding to the top colors.
+ top_color_counts[n_color_count].index = i;
+ top_color_counts[n_color_count].count = count_buf[i];
+ ++n_color_count;
+ // Sort once, when the array first becomes full; from then on it is kept
+ // sorted by the insertion below.
+ if (n_color_count == n_colors) {
+ qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]),
+ color_count_comp);
+ }
+ } else {
+ // Check the worst in the sorted top.
+ if (count_buf[i] > top_color_counts[n_colors - 1].count) {
+ int j = n_colors - 1;
+ // Move up to the best one.
+ // The comparison is strict, so the new color lands after existing
+ // entries with the same count (which all have smaller color values).
+ while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j;
+ // Shift entries j .. n_colors - 2 down by one slot, dropping the last.
+ memmove(top_color_counts + j + 1, top_color_counts + j,
+ (n_colors - j - 1) * sizeof(top_color_counts[0]));
+ top_color_counts[j].index = i;
+ top_color_counts[j].count = count_buf[i];
+ }
+ }
+ }
+ }
+ assert(n_color_count == n_colors);
+
+ for (int i = 0; i < n_colors; ++i) {
+ top_colors[i] = top_color_counts[i].index;
+ }
+}
+
+// Searches luma palette modes for the current block.
+// Candidate palettes come from two sources: the most frequent source colors
+// (find_top_colors()) and 1-D k-means clustering of the source samples. Every
+// candidate is evaluated by palette_rd_y(), which updates best_mbmi,
+// best_palette_color_map, *best_rd, best_blk_skip, tx_type_map and the
+// rate/distortion/skippable/beat_best_rd outputs whenever the candidate has a
+// lower RD cost than *best_rd.
+// The search only runs when the block's color count is above 1 and at most
+// color_thresh_palette. On return *mbmi is a copy of *best_mbmi, and if
+// best_mbmi carries a luma palette the plane 0 color index map holds
+// best_palette_color_map.
+void av1_rd_pick_palette_intra_sby(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+ uint8_t *tx_type_map) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ bsize));
+ assert(PALETTE_MAX_SIZE == 8);
+ assert(PALETTE_MIN_SIZE == 2);
+
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const src = x->plane[0].src.buf;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ const int is_hbd = seq_params->use_highbitdepth;
+ const int bit_depth = seq_params->bit_depth;
+ const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ // Sink for last_n_searched outputs that the caller below does not need.
+ int unused;
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int colors, colors_threshold = 0;
+ if (is_hbd) {
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+ count_buf_8bit, &colors_threshold, &colors);
+ } else {
+ av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
+ colors_threshold = colors;
+ }
+
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int color_thresh_palette = 64;
+ // Allow for larger color_threshold for palette search, based on color,
+ // scene_change, and block source variance.
+ // Since palette is Y based, only allow larger threshold if block
+ // color_dist is below threshold.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad &&
+ x->source_variance > 50) {
+ int64_t norm_color_dist = 0;
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ norm_color_dist = x->min_dist_inter_uv >>
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (x->color_sensitivity[0] && x->color_sensitivity[1])
+ norm_color_dist = norm_color_dist >> 1;
+ }
+ if (norm_color_dist < 8000) color_thresh_palette += 20;
+ }
+ // Palette search is attempted only for blocks with more than one color and
+ // no more than color_thresh_palette colors.
+ if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) {
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int lower_bound, upper_bound;
+ fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
+ &lower_bound, &upper_bound);
+
+ mbmi->mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+ // Find the dominant colors, stored in top_colors[].
+ int16_t top_colors[PALETTE_MAX_SIZE] = { 0 };
+ find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE),
+ top_colors);
+
+ // The following are the approaches used for header rdcost based gating
+ // for early termination for different values of prune_palette_search_level.
+ // 0: Pruning based on header rdcost for ascending order palette_size
+ // search.
+ // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+ // search and for finer search do_header_rd_based_gating parameter is
+ // explicitly passed as 'false'.
+ // 2: Enabled only for ascending order palette_size search and for
+ // descending order search do_header_rd_based_gating parameter is explicitly
+ // passed as 'false'.
+ const bool do_header_rd_based_gating =
+ cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
+ // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+ // where the dominant colors and the k-means results are similar.
+ if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+ (colors > PALETTE_MIN_SIZE)) {
+ // Start index and step size below are chosen to evaluate unique
+ // candidates in neighbor search, in case a winner candidate is found in
+ // coarse search. Example,
+ // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+ // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+ // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+ // (3) and 8 (7).
+ // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+ // as for 8 colors) then step size should also be 2, to cover all
+ // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+ // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+ // coarse search will evaluate 3 and 6. For the winner, unique neighbors
+ // (3: 2,4 or 6: 5,7) would be evaluated.
+
+ // Start index for coarse palette search for dominant colors and k-means
+ const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 2,
+ 3, 3, 2 };
+ // Step size for coarse palette search for dominant colors and k-means
+ const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 3,
+ 3, 3, 3 };
+
+ // Choose the start index and step size for coarse search based on number
+ // of colors
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+ const int min_n = start_n_lookup_table[max_n];
+ const int step_size = step_size_lookup_table[max_n];
+ assert(min_n >= PALETTE_MIN_SIZE);
+ // Perform top color coarse palette search to find the winner candidate
+ const int top_color_winner = perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for dominant colors
+ // (the search returns its end_n argument, max_n + 1, when no size won).
+ if (top_color_winner <= max_n) {
+ int stage2_min_n, stage2_max_n, stage2_step_size;
+ set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size,
+ top_color_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
+ stage2_max_n + 1, stage2_step_size,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ // Perform k-means coarse palette search to find the winner candidate
+ const int k_means_winner = perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for k-means
+ if (k_means_winner <= max_n) {
+ int start_n_stage2, end_n_stage2, step_size_stage2;
+ set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2,
+ k_means_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ start_n_stage2, end_n_stage2 + 1, step_size_stage2,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, color_map, rows * cols, discount_color_cost);
+ }
+ } else {
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+ min_n = PALETTE_MIN_SIZE;
+ // Perform top color palette search in ascending order
+ int last_n_searched = min_n;
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ // The ascending search stopped early (without a header-rd breakout, which
+ // would have set last_n_searched to max_n + 1) if it did not reach max_n.
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
+ last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ if (colors == PALETTE_MIN_SIZE) {
+ // Special case: These colors automatically become the centroids.
+ assert(colors == 2);
+ centroids[0] = lower_bound;
+ centroids[1] = upper_bound;
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+ color_cache, n_cache, /*do_header_rd_based_gating=*/false,
+ best_mbmi, best_palette_color_map, best_rd, rate,
+ rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, NULL, NULL,
+ discount_color_cost);
+ } else {
+ // Perform k-means palette search in ascending order
+ last_n_searched = min_n;
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false,
+ &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
+ best_rd, rate, rate_tokenonly, distortion, skippable,
+ beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+ rows * cols, discount_color_cost);
+ }
+ }
+ }
+ }
+
+ // If best_mbmi carries a luma palette, restore its color index map; the
+ // search above used color_map as scratch space for every candidate.
+ if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ block_width * block_height * sizeof(best_palette_color_map[0]));
+ // Gather the stats to determine whether to use screen content tools in
+ // function av1_determine_sc_tools_with_encoding().
+ x->palette_pixels += (block_width * block_height);
+ }
+ // Candidates were written into *mbmi during the search; leave it holding the
+ // best mode.
+ *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable) {  // RD search over joint (U, V) palette sizes.
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v;
+ int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
+ const int src_stride = x->plane[1].src.stride;  // used for both U and V
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;  // one map for U+V
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ mbmi->uv_mode = UV_DC_PRED;  // every palette candidate below uses this mode
+ if (seq_params->use_highbitdepth) {
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_u, &colors_u);
+ av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_v, &colors_v);
+ } else {
+ int count_buf[1 << 8];  // 8-bit samples: at most (1 << 8) color levels
+ av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
+ av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
+ colors_threshold_u = colors_u;  // low bit-depth: threshold == color count
+ colors_threshold_v = colors_v;
+ }
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+ colors_threshold = colors_threshold_u > colors_threshold_v
+ ? colors_threshold_u
+ : colors_threshold_v;
+ if (colors_threshold > 1 && colors_threshold <= 64) {  // else: no search
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ int lb_u, ub_u, val_u;
+ int lb_v, ub_v, val_v;
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;  // (U, V) pairs
+ int16_t centroids[2 * PALETTE_MAX_SIZE];  // (U, V) pairs as well
+
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);  // read only if highbd
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);  // read only if highbd
+ if (seq_params->use_highbitdepth) {  // seed the bounds with sample (0, 0)
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+ }
+
+ for (r = 0; r < rows; ++r) {  // gather samples and per-channel min/max
+ for (c = 0; c < cols; ++c) {
+ if (seq_params->use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ }
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
+ const int colors = colors_u > colors_v ? colors_u : colors_v;
+ const int max_colors =
+ colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+ for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) {  // try each size n
+ for (i = 0; i < n; ++i) {  // start at midpoints of n equal sub-ranges
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+ optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
+ cpi->common.seq_params->bit_depth);
+ // Selection-sort the (U, V) pairs by U, ascending; V moves with its U.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ int min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ int temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {  // i == 1: U colors, i == 2: V colors
+ for (j = 0; j < n; ++j) {
+ if (seq_params->use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+ else
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+ const int palette_mode_rate =
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+ // Stop the palette_size search as soon as the header cost alone for the
+ // current palette_size is greater than or equal to best_rd.
+ if (header_rd >= *best_rd) break;
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;  // skip this size
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;  // skip this size
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ }
+
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {  // outputs are written only on improvement
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {  // restore best map
+ memcpy(color_map, best_palette_color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+  // Recomputes the chroma color index map of the current block from the
+  // source pixels and the U/V palette already stored in its mode info.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int num_colors = pmi->palette_size[1];
+  const int is_hbd = cpi->common.seq_params->use_highbitdepth;
+  const int stride = x->plane[1].src.stride;
+  const uint8_t *const u8 = x->plane[1].src.buf;
+  const uint8_t *const v8 = x->plane[2].src.buf;
+  // The 16-bit views are only dereferenced on the high bit-depth path.
+  const uint16_t *const u16 = CONVERT_TO_SHORTPTR(u8);
+  const uint16_t *const v16 = CONVERT_TO_SHORTPTR(v8);
+  int16_t *const uv_pairs = x->palette_buffer->kmeans_data_buf;
+  uint8_t *const index_map = xd->plane[1].color_index_map;
+  int16_t palette_uv[2 * PALETTE_MAX_SIZE];
+  int block_w, block_h, rows, cols;
+  av1_get_block_dimensions(mbmi->bsize, 1, xd, &block_w, &block_h, &rows,
+                           &cols);
+
+  // Interleave the rows x cols source samples as (U, V) pairs, one per pixel.
+  for (int row = 0; row < rows; ++row) {
+    const int src_off = row * stride;
+    int16_t *pair = uv_pairs + 2 * row * cols;
+    for (int col = 0; col < cols; ++col, pair += 2) {
+      pair[0] = is_hbd ? u16[src_off + col] : u8[src_off + col];
+      pair[1] = is_hbd ? v16[src_off + col] : v8[src_off + col];
+    }
+  }
+
+  // Lay the stored palette out the same way: entry i is (U_i, V_i).
+  for (int i = 0; i < num_colors; ++i) {
+    palette_uv[2 * i] = pmi->palette_colors[PALETTE_MAX_SIZE + i];
+    palette_uv[2 * i + 1] = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i];
+  }
+
+  // Index every pair against the palette, then extend to the plane block.
+  av1_calc_indices(uv_pairs, palette_uv, index_map, rows * cols, num_colors, 2);
+  extend_palette_color_map(index_map, cols, rows, block_w, block_h);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..7da863a0cc
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c  // f, 2 -> f_dim2_c
+
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);  // declares av1_k_means_dim1_c
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);  // declares av1_k_means_dim2_c
+/*!\endcond */
+
+/*!\brief Finds the cluster that each data point belongs to.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]  data       Data points; NUM_DATA_POINTS X DATA_DIM.
+ * \param[in]  centroids  Cluster centroids; NUM_CENTROIDS X DATA_DIM.
+ * \param[out] indices    Receives the cluster index of every data point.
+ * \param[in]  n          Number of data points (asserted positive).
+ * \param[in]  k          Number of clusters (asserted positive).
+ * \param[in]  dim        Data dimension; only 1 and 2 are implemented.
+ * \remark Returns nothing, but saves each data's cluster index in \a indices.
+ */
+static INLINE void av1_calc_indices(const int16_t *data,
+                                    const int16_t *centroids, uint8_t *indices,
+                                    int n, int k, int dim) {
+  assert(n > 0);
+  assert(k > 0);
+  switch (dim) {
+    case 1:
+      av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n,
+                            k);
+      break;
+    case 2:
+      av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n,
+                            k);
+      break;
+    default: assert(0 && "Untemplated k means dimension");
+  }
+}
+
+/*!\brief Clusters the data points with k-means.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]     data       Data points; NUM_DATA_POINTS X DATA_DIM.
+ * \param[in,out] centroids  Centroids, laid out as NUM_CENTROIDS X DATA_DIM.
+ *                           The computed centroids are stored here.
+ * \param[out]    indices    Receives the cluster index of each data point.
+ * \param[in]     n          Number of data points (asserted positive).
+ * \param[in]     k          Number of clusters (asserted positive).
+ * \param[in]     dim        Data dimension; only 1 and 2 are implemented.
+ * \param[in]     max_itr    Maximum number of iterations to run.
+ *
+ * \remark Returns nothing; results are written to \a centroids and \a indices.
+ *
+ * \attention The output centroids are rounded off to nearest integers.
+ */
+static INLINE void av1_k_means(const int16_t *data, int16_t *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  assert(n > 0);
+  assert(k > 0);
+  switch (dim) {
+    case 1:
+      AV1_K_MEANS_RENAME(av1_k_means, 1)
+      (data, centroids, indices, n, k, max_itr);
+      break;
+    case 2:
+      AV1_K_MEANS_RENAME(av1_k_means, 2)
+      (data, centroids, indices, n, k, max_itr);
+      break;
+    default: assert(0 && "Untemplated k means dimension");
+  }
+}
+
+/*!\brief Removes duplicated centroid indices.
+ *
+ * \ingroup palette_mode_search
+ * \param[in,out] centroids A list of centroid indices, deduplicated in place.
+ * \param[in] num_centroids Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique centroids
+ * at the beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
+int av1_remove_duplicates(int16_t *centroids, int num_centroids);
+
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] color_cache A cache of colors.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] colors New base colors.
+ * \param[in] n_colors Number of new colors.
+ * \param[out] cache_color_found Stores which cached colors are present in
+ * colors.
+ * \param[out] out_cache_colors Stores the colors that are not in the cache.
+ *
+ * \return Returns the number of colors that are not in cache. In addition,
+ * records whether each cache color is present in colors in cache_color_found,
+ * and stores the out of cache colors in out_cache_colors.
+ */
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors);
+
+/*!\brief Gets the number of bits used for each delta-encoded v palette color.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ * \param[out] zero_count Stores the number of zero deltas.
+ * \param[out] min_bits Minimum bits for the deltas. Set to
+ * bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta and sets zero_count to the number of deltas that are 0.
+ */
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in] color_cache Color cache present at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Gets the rate cost for transmitting chroma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in] color_cache Color cache present at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Searches for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+ const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+ int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map);
+
+/*!\brief Searches for the best palette in the chroma planes (U and V jointly).
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding. The output
+ * arguments are written only when a palette improves on \a best_rd. */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+ struct macroblock *x, int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable);
+
+/*!\brief Rebuilds the chroma palette color map from the source pixels and the
+ * U/V palette colors stored in the current block's mode info. */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_cnn_weights.h b/third_party/aom/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 0000000000..504038c63a
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/ml.h"
+
+#define CNN_BRANCH_0_OUT_CH 20
+#define CNN_BRANCH_1_OUT_CH 4
+#define CNN_BRANCH_2_OUT_CH 20
+#define CNN_BRANCH_3_OUT_CH 20
+#define CNN_TOT_OUT_CH \
+ (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \
+ (CNN_BRANCH_3_OUT_CH)))  // 20 + 4 + 20 + 20 = 64
+#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH)  // 20
+#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2)  // 4 * 2 * 2 = 16
+#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4)  // 20 * 4 * 4 = 320
+#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8)  // 20 * 8 * 8 = 1280
+#define CNN_OUT_BUF_SIZE \
+ (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \
+ (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE)))  // 1636 in total
+
+#define NUM_DNN_BRANCHES 4  // BRANCH_0 .. BRANCH_3 below
+#define NUM_CNN_LAYERS 5  // CNN_LAYER_0 .. CNN_LAYER_4 below
+#define BRANCH_0_NUM_DNN_LAYERS 2
+#define BRANCH_1_NUM_DNN_LAYERS 2
+#define BRANCH_2_NUM_DNN_LAYERS 2
+#define BRANCH_3_NUM_DNN_LAYERS 2
+#define CNN_LAYER_0_HEIGHT 5
+#define CNN_LAYER_0_WIDTH 5
+#define CNN_LAYER_0_IN_CH 1
+#define CNN_LAYER_0_OUT_CH 20
+#define CNN_LAYER_0_HORZ_STRIDE 4
+#define CNN_LAYER_0_VERT_STRIDE 4
+#define CNN_LAYER_1_HEIGHT 2
+#define CNN_LAYER_1_WIDTH 2
+#define CNN_LAYER_1_IN_CH 20
+#define CNN_LAYER_1_OUT_CH 20
+#define CNN_LAYER_1_HORZ_STRIDE 2
+#define CNN_LAYER_1_VERT_STRIDE 2
+#define CNN_LAYER_2_HEIGHT 2
+#define CNN_LAYER_2_WIDTH 2
+#define CNN_LAYER_2_IN_CH 20
+#define CNN_LAYER_2_OUT_CH 20
+#define CNN_LAYER_2_HORZ_STRIDE 2
+#define CNN_LAYER_2_VERT_STRIDE 2
+#define CNN_LAYER_3_HEIGHT 2
+#define CNN_LAYER_3_WIDTH 2
+#define CNN_LAYER_3_IN_CH 20
+#define CNN_LAYER_3_OUT_CH 4
+#define CNN_LAYER_3_HORZ_STRIDE 2
+#define CNN_LAYER_3_VERT_STRIDE 2
+#define CNN_LAYER_4_HEIGHT 2
+#define CNN_LAYER_4_WIDTH 2
+#define CNN_LAYER_4_IN_CH 4
+#define CNN_LAYER_4_OUT_CH 20
+#define CNN_LAYER_4_HORZ_STRIDE 2
+#define CNN_LAYER_4_VERT_STRIDE 2
+#define BRANCH_0_NUM_DNN_FEATURES 37
+#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_0_NUM_LOGITS 1
+#define BRANCH_1_NUM_DNN_FEATURES 25
+#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_1_NUM_LOGITS 1
+#define BRANCH_2_NUM_DNN_FEATURES 25
+#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_2_NUM_LOGITS 1
+#define BRANCH_3_NUM_DNN_FEATURES 41
+#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_3_NUM_LOGITS 1
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = {
+ 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f,
+ 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f,
+ 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f,
+ -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f,
+ 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f,
+ -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f,
+ -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f,
+ 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f,
+ 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f,
+ -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f,
+ -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f,
+ 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f,
+ -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f,
+ 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f,
+ 0.30613f, -0.39369f, 0.622438f, -0.52877f, -0.334991f,
+ 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f,
+ -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f,
+ 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f,
+ 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f,
+ -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f,
+ 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f,
+ -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f,
+ 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f,
+ 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f,
+ 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f,
+ 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f,
+ -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f,
+ -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f,
+ 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f,
+ -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f,
+ -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f,
+ -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f,
+ 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f,
+ -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f,
+ -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f,
+ -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f,
+ -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f,
+ -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f,
+ 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f,
+ 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f,
+ 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f,
+ -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f,
+ 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f,
+ 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f,
+ 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f,
+ -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f,
+ -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f,
+ 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f,
+ 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f,
+ 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f,
+ -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f,
+ 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f,
+ -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f,
+ 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f,
+ -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f,
+ 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f,
+ -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f,
+ -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f,
+ 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f,
+ 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f,
+ 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f,
+ -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f,
+ 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f,
+ -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f,
+ 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f,
+ -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f,
+ 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f,
+ -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f,
+ 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f,
+ 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f,
+ -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f,
+ 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f,
+ 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f,
+ 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f,
+ -0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f,
+ -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f,
+ -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f,
+ 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f,
+ 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f,
+ -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f,
+ 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f,
+ -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f,
+ 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f,
+ 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f,
+ 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f,
+ 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f,
+ 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f,
+ -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f,
+ 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f,
+ 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f,
+ -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f,
+ 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f,
+ 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f,
+ 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f,
+ -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f,
+ -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f,
+ -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f,
+ 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f,
+ 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f,
+ 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f
+};  // 500 values = CNN_LAYER_0 HEIGHT * WIDTH * IN_CH * OUT_CH (5 * 5 * 1 * 20)
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = {
+ 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f,
+ -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f,
+ 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f,
+ 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f
+};  // 20 values, matching CNN_LAYER_0_OUT_CH (presumably one per out channel)
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = {
+ 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f,
+ -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f,
+ -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f,
+ 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f,
+ 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f,
+ 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f,
+ -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f,
+ -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f,
+ 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f,
+ -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f,
+ -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f,
+ 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f,
+ 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f,
+ -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f,
+ 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f,
+ 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f,
+ 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f,
+ 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f,
+ -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f,
+ -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f,
+ -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f,
+ 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f,
+ -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f,
+ -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f,
+ 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f,
+ 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f,
+ -1.15856f, -0.516443f, -0.322702f, 0.15668f, 0.0075841f,
+ -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f,
+ -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f,
+ -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f,
+ -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f,
+ -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f,
+ -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f,
+ 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f,
+ -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f,
+ -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f,
+ 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f,
+ -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f,
+ -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f,
+ -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f,
+ 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f,
+ -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f,
+ -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f,
+ -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f,
+ -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f,
+ 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f,
+ -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f,
+ -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f,
+ -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f,
+ 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f,
+ -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f,
+ 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f,
+ -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f,
+ 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f,
+ -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f,
+ 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f,
+ -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f,
+ 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f,
+ -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f,
+ -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f,
+ -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f,
+ -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f,
+ -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f,
+ 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f,
+ -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f,
+ 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f,
+ -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f,
+ 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f,
+ -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f,
+ 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f,
+ -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f,
+ -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f,
+ 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f,
+ -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f,
+ -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f,
+ -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f,
+ 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f,
+ -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f,
+ -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f,
+ 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f,
+ 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f,
+ -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f,
+ 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f,
+ -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f,
+ -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f,
+ 0.689968f, -0.688546f, 1.95033f, 0.420946f, 0.0282428f,
+ 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f,
+ -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f,
+ -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f,
+ -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f,
+ 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f,
+ -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f,
+ -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f,
+ -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f,
+ 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f,
+ 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f,
+ -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f,
+ 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f,
+ 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f,
+ -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f,
+ 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f,
+ -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f,
+ -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f,
+ -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f,
+ -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f,
+ 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f,
+ 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f,
+ 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f,
+ -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f,
+ -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f,
+ -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f,
+ 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f,
+ -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f,
+ 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f,
+ 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f,
+ 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f,
+ -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f,
+ -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f,
+ 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f,
+ -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f,
+ -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f,
+ -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f,
+ -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f,
+ -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f,
+ 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f,
+ 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f,
+ -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f,
+ 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f,
+ -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f,
+ 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f,
+ -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f,
+ 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f,
+ 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f,
+ 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f,
+ 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f,
+ 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f,
+ 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f,
+ 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f,
+ 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f,
+ -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f,
+ 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f,
+ -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f,
+ 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f,
+ -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f,
+ 0.88488f, 0.306306f, -0.275613f, -0.476372f, 0.00678104f,
+ 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f,
+ 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f,
+ -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f,
+ 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f,
+ 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f,
+ -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f,
+ 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f,
+ -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f,
+ 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f,
+ 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f,
+ 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f,
+ -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f,
+ -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f,
+ 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f,
+ -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f,
+ -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f,
+ -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f,
+ 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f,
+ -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f,
+ -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f,
+ 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f,
+ 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f,
+ 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f,
+ -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f,
+ -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f,
+ 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f,
+ -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f,
+ -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f,
+ 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f,
+ 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f,
+ -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f,
+ -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f,
+ -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f,
+ -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f,
+ 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f,
+ 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f,
+ 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f,
+ 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f,
+ -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f,
+ -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f,
+ 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f,
+ -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f,
+ 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f,
+ 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f,
+ -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f,
+ 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f,
+ 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f,
+ -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f,
+ -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f,
+ -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f,
+ 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f,
+ -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f,
+ -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f,
+ 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f,
+ 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f,
+ -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f,
+ -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f,
+ 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f,
+ 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f,
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f,
+ 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f,
+ -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f,
+ 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f,
+ -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f,
+ 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f,
+ 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f,
+ -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f,
+ 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f,
+ 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f,
+ -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f,
+ 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f,
+ -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f,
+ 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f,
+ 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f,
+ -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f,
+ 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f,
+ -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f,
+ 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f,
+ -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f,
+ 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f,
+ 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f,
+ 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f,
+ -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f,
+ -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f,
+ 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f,
+ 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f,
+ 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f,
+ -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f,
+ 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f,
+ -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f,
+ 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f,
+ -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f,
+ -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f,
+ -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f,
+ -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f,
+ -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f,
+ -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f,
+ 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f,
+ 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f,
+ 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f,
+ 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f,
+ 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f,
+ -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f,
+ 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f,
+ -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f,
+ 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f,
+ 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f,
+ 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f,
+ -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f,
+ 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f,
+ -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f,
+ -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f,
+ -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f,
+ 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f,
+ -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f,
+ -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f,
+ 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f,
+ 0.268578f, -0.431123f, -0.114419f, 0.0101172f, -0.195671f,
+ 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f,
+ 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f,
+ -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f,
+ 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f,
+ -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f,
+ -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f,
+ -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f,
+ 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f,
+ -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f,
+ 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f,
+ 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f,
+ -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f,
+ -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f,
+ 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f,
+ -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f,
+ 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f,
+ 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f,
+ -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f,
+ -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f,
+ 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f,
+ 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f,
+ -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f,
+ 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f,
+ 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f,
+ -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f,
+ -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f,
+ 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f,
+ 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f,
+ -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f,
+ -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f,
+ 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f,
+ 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f,
+ -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f,
+ 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f,
+ 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f,
+ 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f,
+ 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f,
+ -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f,
+ 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f,
+ 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f,
+ -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f,
+ -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f,
+ 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f,
+ 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f,
+ 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f,
+ -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f,
+ 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f,
+ 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f,
+ -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f,
+ 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f,
+ -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f,
+ 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f,
+ -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f,
+ 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f,
+ -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f,
+ 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f,
+ 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = {
+ 0.00447951f, 0.0202534f, 0.00970833f, -0.00460874f, 0.0942288f,
+ -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f,
+ 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f,
+ -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = {
+ 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f,
+ -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f,
+ 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f,
+ 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f,
+ 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f,
+ 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f,
+ 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f,
+ 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f,
+ -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f,
+ 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f,
+ -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f,
+ -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f,
+ 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f,
+ -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f,
+ 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f,
+ 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f,
+ -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f,
+ -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f,
+ -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f,
+ -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f,
+ -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f,
+ -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f,
+ -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f,
+ 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f,
+ -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f,
+ -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f,
+ -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f,
+ 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f,
+ -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f,
+ 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f,
+ 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f,
+ -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f,
+ -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f,
+ -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f,
+ -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f,
+ 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f,
+ 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f,
+ -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f,
+ 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f,
+ 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f,
+ -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f,
+ -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f,
+ 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f,
+ 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f,
+ -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f,
+ -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f,
+ -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f,
+ 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f,
+ 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f,
+ -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f,
+ -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f,
+ 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f,
+ 0.267087f, 0.0802681f, -0.184741f, -0.558267f, 0.0437066f,
+ 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f,
+ 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f,
+ -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f,
+ -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f,
+ 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f,
+ -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f,
+ -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f,
+ 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f,
+ -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f,
+ 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f,
+ 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f,
+ -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f,
+ -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f,
+ 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f,
+ 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f,
+ 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f,
+ -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f,
+ 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f,
+ 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f,
+ 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f,
+ -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f,
+ 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f,
+ 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f,
+ 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f,
+ 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f,
+ 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f,
+ 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f,
+ 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f,
+ -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f,
+ -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f,
+ 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f,
+ 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f,
+ -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f,
+ -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f,
+ 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f,
+ -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f,
+ -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f,
+ 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f,
+ 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f,
+ 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f,
+ -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f,
+ -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f,
+ 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f,
+ -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f,
+ -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f,
+ 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f,
+ -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f,
+ -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f,
+ -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f,
+ 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f,
+ 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f,
+ -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f,
+ -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f,
+ 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f,
+ 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f,
+ -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f,
+ 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f,
+ -0.586727f, -0.0664489f, -0.631436f, -0.245828f, -0.0647894f,
+ -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f,
+ -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f,
+ -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f,
+ 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f,
+ 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f,
+ 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f,
+ -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f,
+ -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f,
+ 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f,
+ -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f,
+ -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f,
+ 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f,
+ 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f,
+ -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f,
+ -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f,
+ 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f,
+ 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f,
+ 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f,
+ 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f,
+ -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f,
+ 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f,
+ 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f,
+ 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f,
+ -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f,
+ -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f,
+ -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f,
+ 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f,
+ 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f,
+ -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f,
+ 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f,
+ -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f,
+ -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f,
+ 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f,
+ -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f,
+ -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f,
+ -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f,
+ 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f,
+ 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f,
+ -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f,
+ -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f,
+ 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f,
+ 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f,
+ -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f,
+ -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f,
+ 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f,
+ 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f,
+ 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f,
+ -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f,
+ 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f,
+ 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f,
+ -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f,
+ 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f,
+ 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f,
+ 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f,
+ -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f,
+ 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f,
+ 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f,
+ -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 0.0067527f,
+ -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f,
+ -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f,
+ -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f,
+ 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f,
+ -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f,
+ 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f,
+ 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f,
+ -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f,
+ -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f,
+ 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f,
+ -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f,
+ -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f,
+ -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f,
+ -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f,
+ 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f,
+ -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f,
+ -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f,
+ -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f,
+ 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f,
+ -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f,
+ 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f,
+ 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f,
+ -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f,
+ -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f,
+ -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f,
+ -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f,
+ 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f,
+ 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f,
+ -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f,
+ 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f,
+ 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f,
+ -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f,
+ -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f,
+ -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f,
+ 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f,
+ -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f,
+ -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f,
+ -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f,
+ 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f,
+ 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f,
+ -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f,
+ -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f,
+ 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f,
+ 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f,
+ 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f,
+ 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f,
+ -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f,
+ -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f,
+ -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f,
+ -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f,
+ -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f,
+ 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f,
+ -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f,
+ 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f,
+ -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f,
+ -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f,
+ -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f,
+ -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 0.219659f,
+ -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f,
+ 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f,
+ -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f,
+ 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f,
+ -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f,
+ 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f,
+ -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f,
+ 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f,
+ 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f,
+ 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f,
+ 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f,
+ -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f,
+ -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f,
+ 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f,
+ -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f,
+ -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f,
+ 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f,
+ 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f,
+ 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f,
+ 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f,
+ 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f,
+ -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f,
+ -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f,
+ -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f,
+ 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f,
+ 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f,
+ -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f,
+ -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f,
+ 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f,
+ -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f,
+ -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f,
+ 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f,
+ -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f,
+ -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f,
+ -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f,
+ -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f,
+ 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f,
+ -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f,
+ -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f,
+ 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f,
+ 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f,
+ -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f,
+ 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f,
+ -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f,
+ -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f,
+ -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f,
+ -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f,
+ -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f,
+ 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f,
+ 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f,
+ -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f,
+ -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f,
+ 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f,
+ -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f,
+ -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f,
+ 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f,
+ -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f,
+ -0.188536f, -0.0377482f, -0.131909f, -0.116099f, -0.236827f,
+ -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f,
+ 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f,
+ 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f,
+ 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f,
+ -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f,
+ 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f,
+ 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f,
+ 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f,
+ 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f,
+ -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f,
+ -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f,
+ -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f,
+ -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f,
+ 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f,
+ -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f,
+ 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f,
+ -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f,
+ -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f,
+ -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f,
+ -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f,
+ -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f,
+ 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f,
+ 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f,
+ 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f,
+ -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f,
+ -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f,
+ 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f,
+ 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f,
+ -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f,
+ -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f,
+ 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f,
+ 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f,
+ 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f,
+ -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f,
+ -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = {
+ 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f,
+ 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f,
+ 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f,
+ 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = {
+ -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f,
+ 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f,
+ -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f,
+ 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f,
+ -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f,
+ 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f,
+ -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f,
+ 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f,
+ 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f,
+ 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f,
+ -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f,
+ 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f,
+ -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f,
+ -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f,
+ 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f,
+ -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f,
+ 0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f,
+ 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f,
+ -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f,
+ 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f,
+ -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f,
+ 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f,
+ -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f,
+ 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f,
+ -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f,
+ 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f,
+ -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f,
+ 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f,
+ -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f,
+ 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f,
+ 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f,
+ -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f,
+ -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f,
+ 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f,
+ -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f,
+ 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f,
+ 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f,
+ 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f,
+ -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f,
+ 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f,
+ -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f,
+ 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f,
+ -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f,
+ 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f,
+ 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f,
+ 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f,
+ -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f,
+ -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f,
+ -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f,
+ 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f,
+ -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f,
+ 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f,
+ 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f,
+ 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f,
+ -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f,
+ 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f,
+ -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f,
+ 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f,
+ -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f,
+ 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f,
+ 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f,
+ -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f,
+ -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f,
+ -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = {
+ 0.0106809f, 0.136699f, 0.285316f, 0.395746f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = {
+ -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f,
+ 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f,
+ 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f,
+ -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f,
+ 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f,
+ -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f,
+ 0.195492f, 0.218548f, -0.314895f, 0.0749444f, -0.191344f,
+ 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f,
+ -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f,
+ 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f,
+ 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f,
+ -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f,
+ 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f,
+ -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f,
+ -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f,
+ -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f,
+ -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f,
+ 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f,
+ 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f,
+ -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f,
+ -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f,
+ -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f,
+ 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f,
+ 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f,
+ 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f,
+ 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f,
+ -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f,
+ -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f,
+ -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f,
+ 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f,
+ -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f,
+ -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f,
+ 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f,
+ 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f,
+ 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f,
+ -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f,
+ -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f,
+ 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f,
+ 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f,
+ 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f,
+ -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f,
+ -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f,
+ -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f,
+ 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f,
+ 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f,
+ -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f,
+ 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f,
+ -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f,
+ 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f,
+ 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f,
+ -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f,
+ -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f,
+ 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f,
+ 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f,
+ -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f,
+ 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f,
+ -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f,
+ 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f,
+ -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f,
+ 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f,
+ -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f,
+ -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f,
+ 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f,
+ -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = {
+ -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f,
+ -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f,
+ 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f,
+ -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f
+};
+
+static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = {
+ NUM_CNN_LAYERS, // num_layers
+ 0, // is_residue
+ 0, // ext_width
+ 0, // ext_height
+ 0, // strict_bounds
+ {
+ {
+ CNN_LAYER_0_IN_CH, // in_channels
+ CNN_LAYER_0_WIDTH, // filter_width
+ CNN_LAYER_0_WIDTH, // filter_height
+ CNN_LAYER_0_OUT_CH, // out_channels
+ CNN_LAYER_0_HORZ_STRIDE, // skip_width
+ CNN_LAYER_0_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ -1, // output_num
+ },
+ {
+ CNN_LAYER_1_IN_CH, // in_channels
+ CNN_LAYER_1_WIDTH, // filter_width
+ CNN_LAYER_1_WIDTH, // filter_height
+ CNN_LAYER_1_OUT_CH, // out_channels
+ CNN_LAYER_1_HORZ_STRIDE, // skip_width
+ CNN_LAYER_1_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 3, // output_num
+ },
+ {
+ CNN_LAYER_2_IN_CH, // in_channels
+ CNN_LAYER_2_WIDTH, // filter_width
+ CNN_LAYER_2_WIDTH, // filter_height
+ CNN_LAYER_2_OUT_CH, // out_channels
+ CNN_LAYER_2_HORZ_STRIDE, // skip_width
+ CNN_LAYER_2_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 2, // output_num
+ },
+ {
+ CNN_LAYER_3_IN_CH, // in_channels
+ CNN_LAYER_3_WIDTH, // filter_width
+ CNN_LAYER_3_WIDTH, // filter_height
+ CNN_LAYER_3_OUT_CH, // out_channels
+ CNN_LAYER_3_HORZ_STRIDE, // skip_width
+ CNN_LAYER_3_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 1, // output_num
+ },
+ {
+ CNN_LAYER_4_IN_CH, // in_channels
+ CNN_LAYER_4_WIDTH, // filter_width
+ CNN_LAYER_4_WIDTH, // filter_height
+ CNN_LAYER_4_OUT_CH, // out_channels
+ CNN_LAYER_4_HORZ_STRIDE, // skip_width
+ CNN_LAYER_4_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 0, // output_num
+ },
+ },
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = {
+ 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f,
+ -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f,
+ 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f,
+ -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f,
+ 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f,
+ -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f,
+ -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f,
+ 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f,
+ 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f,
+ 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f,
+ -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f,
+ -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f,
+ -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f,
+ -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f,
+ 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f,
+ 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f,
+ -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f,
+ 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f,
+ -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f,
+ -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f,
+ 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f,
+ 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f,
+ -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f,
+ 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f,
+ -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f,
+ -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f,
+ 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f,
+ -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f,
+ 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f,
+ -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f,
+ -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f,
+ -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f,
+ -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f,
+ -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f,
+ -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f,
+ -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f,
+ 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f,
+ 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f,
+ -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f,
+ -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f,
+ -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f,
+ 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f,
+ -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f,
+ 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f,
+ -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f,
+ 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f,
+ -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f,
+ -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f,
+ 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f,
+ 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f,
+ -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f,
+ -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f,
+ 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f,
+ -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f,
+ -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f,
+ 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f,
+ -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f,
+ -0.219245f, -0.0330375f, 0.367585f, -0.220391f, 0.308736f,
+ 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f,
+ -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f,
+ -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f,
+ 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f,
+ 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f,
+ 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f,
+ -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f,
+ 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f,
+ -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f,
+ 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f,
+ -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f,
+ -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f,
+ -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f,
+ -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f,
+ 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f,
+ -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f,
+ 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f,
+ -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f,
+ 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f,
+ -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f,
+ 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f,
+ 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f,
+ -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f,
+ 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f,
+ 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f,
+ -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f,
+ 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f,
+ -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f,
+ -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f,
+ 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f,
+ 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f,
+ -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f,
+ -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f,
+ -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f,
+ 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f,
+ 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f,
+ 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f,
+ -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f,
+ 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f,
+ -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f,
+ -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f,
+ 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f,
+ -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f,
+ -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f,
+ -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f,
+ 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f,
+ -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f,
+ -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f,
+ -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f,
+ 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f,
+ -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f,
+ -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f,
+ 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f,
+ 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f,
+ 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f,
+ 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f,
+ 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f,
+ 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f,
+ -0.228837f, 0.397216f, 0.581501f, 0.284376f, -0.130434f,
+ 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f,
+ 0.0146821f, 0.475679f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = {
+ 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f,
+ -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f,
+ -0.0162434f, -0.132858f, 0.543411f, -0.626599f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = {
+ 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f,
+ 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f,
+ -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f,
+ 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f,
+ 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f,
+ 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f,
+ -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f,
+ -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f,
+ -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f,
+ -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f,
+ -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f,
+ -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f,
+ -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f,
+ -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f,
+ -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f,
+ 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f,
+ -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f,
+ 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f,
+ 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f,
+ -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f,
+ 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f,
+ 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f,
+ 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f,
+ 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f,
+ -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f,
+ -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f,
+ -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f,
+ -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f,
+ -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f,
+ -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f,
+ 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f,
+ -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f,
+ -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f,
+ 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f,
+ 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f,
+ 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f,
+ 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f,
+ 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f,
+ 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f,
+ 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f,
+ 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f,
+ 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f,
+ 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f,
+ 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f,
+ 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f,
+ -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f,
+ 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f,
+ -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f,
+ 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f,
+ -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f,
+ -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f,
+ -0.200395f, -0.546839f, -0.104226f, -0.152727f, -0.56685f,
+ -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f,
+ -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f,
+ -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f,
+ -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f,
+ -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f,
+ 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f,
+ -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f,
+ -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f,
+ -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f,
+ -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f,
+ 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f,
+ -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f,
+ 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f,
+ 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f,
+ 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f,
+ -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f,
+ 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f,
+ -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f,
+ -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f,
+ -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f,
+ 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f,
+ 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f,
+ -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f,
+ -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f,
+ 0.741187f, 0.42019f, 0.0676459f, -0.230008f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = {
+ -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f,
+ 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f,
+ -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f,
+ 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = {
+ 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f,
+ 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f,
+ -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f,
+ -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = {
+ 1.14463f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = {
+ 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f,
+ -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f,
+ 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f,
+ -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f,
+ 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f,
+ 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f,
+ -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f,
+ -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f,
+ -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f,
+ -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f,
+ -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f,
+ 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f,
+ 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f,
+ -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f,
+ 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f,
+ 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f,
+ -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f,
+ -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f,
+ -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f,
+ -0.372897f, -0.75125f, 0.77698f, 1.1032f, -0.0764679f,
+ 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f,
+ -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f,
+ 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f,
+ -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f,
+ -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f,
+ -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f,
+ 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f,
+ -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f,
+ -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f,
+ -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f,
+ 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f,
+ 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f,
+ 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f,
+ 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f,
+ -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f,
+ -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f,
+ -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f,
+ -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f,
+ -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f,
+ -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f,
+ -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f,
+ 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f,
+ -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f,
+ 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f,
+ 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f,
+ -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f,
+ 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f,
+ 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f,
+ -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f,
+ 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f,
+ 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f,
+ -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f,
+ -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f,
+ 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f,
+ 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f,
+ 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f,
+ -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f,
+ -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f,
+ -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f,
+ -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f,
+ 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f,
+ 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f,
+ 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f,
+ 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f,
+ 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f,
+ 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f,
+ 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f,
+ 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f,
+ 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f,
+ 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f,
+ -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f,
+ -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f,
+ 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f,
+ 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f,
+ -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f,
+ -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f,
+ 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f,
+ 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f,
+ 0.109949f, -0.0428502f, 0.343629f, -0.722978f, -0.375269f,
+ -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = {
+ 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f,
+ -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f,
+ 0.502411f, 0.649282f, 0.15345f, -0.0109896f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = {
+ 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f,
+ 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f,
+ 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f,
+ 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f,
+ -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f,
+ -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f,
+ -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f,
+ 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f,
+ -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f,
+ -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f,
+ -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f,
+ -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f,
+ -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f,
+ -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f,
+ -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f,
+ 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f,
+ 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f,
+ -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f,
+ 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f,
+ -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f,
+ 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f,
+ 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f,
+ -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f,
+ -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f,
+ 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f,
+ 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f,
+ -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f,
+ 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f,
+ -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f,
+ -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f,
+ -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f,
+ 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f,
+ -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f,
+ -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f,
+ -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f,
+ 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f,
+ -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f,
+ 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f,
+ -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f,
+ -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f,
+ 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f,
+ 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f,
+ -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f,
+ -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f,
+ -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f,
+ 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f,
+ 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f,
+ 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f,
+ 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f,
+ 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f,
+ 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f,
+ -0.200846f, 0.784204f, 0.641112f, -0.509346f, 0.0805264f,
+ -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f,
+ 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f,
+ -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f,
+ 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f,
+ -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f,
+ 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f,
+ -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f,
+ -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f,
+ 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f,
+ -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f,
+ -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f,
+ -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f,
+ -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f,
+ 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f,
+ 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f,
+ -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f,
+ 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f,
+ -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f,
+ -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f,
+ -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f,
+ -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f,
+ 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f,
+ 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f,
+ -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f,
+ 0.395875f, -0.171812f, 0.253794f, 0.432799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = {
+ -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f,
+ 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f,
+ 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f,
+ 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = {
+ -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f,
+ 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f,
+ -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f,
+ -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = {
+ 1.33207f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = {
+ 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f,
+ -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f,
+ -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f,
+ 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f,
+ 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f,
+ -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f,
+ 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f,
+ 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f,
+ 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f,
+ 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f,
+ -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f,
+ 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f,
+ 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f,
+ -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f,
+ 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f,
+ 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f,
+ 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f,
+ 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f,
+ -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f,
+ -0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f,
+ 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f,
+ -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f,
+ -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f,
+ -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f,
+ -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f,
+ -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f,
+ 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f,
+ -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f,
+ 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f,
+ 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f,
+ 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f,
+ -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f,
+ 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f,
+ 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f,
+ -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f,
+ 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f,
+ -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f,
+ -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f,
+ 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f,
+ 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f,
+ 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f,
+ 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f,
+ -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f,
+ 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f,
+ 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f,
+ 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f,
+ -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f,
+ -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f,
+ 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f,
+ 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f,
+ 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f,
+ -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f,
+ -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f,
+ -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f,
+ -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f,
+ 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f,
+ -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f,
+ 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f,
+ -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f,
+ -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f,
+ -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f,
+ 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f,
+ 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f,
+ 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f,
+ 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f,
+ -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f,
+ -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f,
+ 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f,
+ -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f,
+ 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f,
+ 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f,
+ -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f,
+ 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f,
+ 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f,
+ 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f,
+ 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f,
+ -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f,
+ -1.33102f, -0.482856f, -0.435731f, 0.300808f, 0.346503f,
+ 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f,
+ -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = {
+ -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f,
+ 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f,
+ 0.45592f, -0.160465f, -0.634243f, 0.0829737f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = {
+ 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f,
+ 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f,
+ -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f,
+ -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f,
+ 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f,
+ -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f,
+ 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f,
+ -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f,
+ -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f,
+ -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f,
+ -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f,
+ -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f,
+ 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f,
+ 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f,
+ -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f,
+ 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f,
+ -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f,
+ -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f,
+ -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f,
+ -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f,
+ -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f,
+ -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f,
+ -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f,
+ 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f,
+ -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f,
+ -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f,
+ -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f,
+ -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f,
+ 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f,
+ 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f,
+ -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f,
+ -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f,
+ 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f,
+ 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f,
+ -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f,
+ 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f,
+ 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f,
+ 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f,
+ -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f,
+ 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f,
+ -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f,
+ -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f,
+ 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f,
+ 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f,
+ 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f,
+ -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f,
+ 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f,
+ 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f,
+ 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f,
+ -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f,
+ 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f,
+ -1.44704f, 0.360623f, 0.390298f, -0.213968f, 0.169783f,
+ -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f,
+ -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f,
+ -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f,
+ -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f,
+ -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f,
+ -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f,
+ -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f,
+ -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f,
+ -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f,
+ -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f,
+ 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f,
+ 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f,
+ -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f,
+ 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f,
+ 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f,
+ 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f,
+ 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f,
+ -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f,
+ -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f,
+ -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f,
+ 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f,
+ -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f,
+ 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f,
+ -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f,
+ -0.863307f, -0.452033f, -0.278151f, 1.86233f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = {
+ -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f,
+ -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f,
+ 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f,
+ 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = {
+ -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f,
+ -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f,
+ 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f,
+ 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = {
+ 0.953424f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = {
+ 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f,
+ 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f,
+ -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f,
+ -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f,
+ -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f,
+ 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f,
+ 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f,
+ -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f,
+ -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f,
+ -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f,
+ 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f,
+ -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f,
+ 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f,
+ 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f,
+ -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f,
+ -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f,
+ 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f,
+ 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f,
+ -0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f,
+ 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f,
+ 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f,
+ 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f,
+ -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f,
+ 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f,
+ 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f,
+ 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f,
+ 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f,
+ -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f,
+ 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f,
+ 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f,
+ 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f,
+ -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f,
+ 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f,
+ -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f,
+ -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f,
+ 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f,
+ 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f,
+ 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f,
+ -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f,
+ 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f,
+ 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f,
+ 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f,
+ -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f,
+ 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f,
+ -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f,
+ -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f,
+ -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f,
+ 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f,
+ -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f,
+ -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f,
+ 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f,
+ -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f,
+ 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f,
+ 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f,
+ 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f,
+ 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f,
+ -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f,
+ -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f,
+ -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f,
+ -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f,
+ -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f,
+ 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f,
+ -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f,
+ 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f,
+ -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f,
+ -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f,
+ -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f,
+ -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f,
+ -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f,
+ -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f,
+ -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f,
+ 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f,
+ 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f,
+ 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f,
+ 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f,
+ 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f,
+ 0.111338f, -0.227126f, 0.159296f, -0.0584685f, -0.108265f,
+ -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f,
+ -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f,
+ -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f,
+ 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f,
+ -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f,
+ -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f,
+ 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f,
+ 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f,
+ -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f,
+ -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f,
+ 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f,
+ 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f,
+ 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f,
+ -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f,
+ 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f,
+ 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f,
+ -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f,
+ 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f,
+ 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f,
+ 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f,
+ 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f,
+ 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f,
+ 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f,
+ -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f,
+ -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f,
+ -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f,
+ -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f,
+ 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f,
+ -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f,
+ 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f,
+ -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f,
+ 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f,
+ 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f,
+ -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f,
+ -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f,
+ 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f,
+ 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f,
+ 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f,
+ -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f,
+ 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f,
+ 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f,
+ -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f,
+ -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f,
+ 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f,
+ -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f,
+ 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f,
+ -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f,
+ 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f,
+ 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f,
+ 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f,
+ -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f,
+ 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f,
+ 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f,
+ -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f,
+ 0.511906f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = {
+ -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f,
+ -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f,
+ 0.184986f, -0.0719864f, 0.19748f, 0.404145f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = {
+ 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f,
+ 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f,
+ 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f,
+ 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f,
+ 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f,
+ 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f,
+ 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f,
+ 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f,
+ 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f,
+ -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f,
+ 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f,
+ -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f,
+ -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f,
+ -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f,
+ 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f,
+ -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f,
+ 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f,
+ 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f,
+ 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f,
+ 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f,
+ 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f,
+ 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f,
+ 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f,
+ 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f,
+ -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f,
+ 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f,
+ -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f,
+ 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f,
+ 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f,
+ 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f,
+ 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f,
+ -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f,
+ 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f,
+ 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f,
+ -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f,
+ 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f,
+ 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f,
+ -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f,
+ 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f,
+ -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f,
+ 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f,
+ 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f,
+ 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f,
+ 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f,
+ 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f,
+ 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f,
+ 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f,
+ -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f,
+ -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f,
+ 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f,
+ 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f,
+ 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f,
+ 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f,
+ 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f,
+ 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f,
+ -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f,
+ -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f,
+ -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f,
+ 0.795464f, 0.106099f, 1.83117f, 0.0883305f, 0.306844f,
+ -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f,
+ 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f,
+ 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f,
+ -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f,
+ 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f,
+ 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f,
+ 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f,
+ 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f,
+ 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f,
+ 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f,
+ -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f,
+ 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f,
+ 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f,
+ -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f,
+ -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f,
+ -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f,
+ 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f,
+ -1.16424f, 0.270405f, 0.219033f, -4.91105f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = {
+ -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f,
+ -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f,
+ 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f,
+ 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = {
+ -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f,
+ -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f,
+ 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f,
+ -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = {
+ -0.022787f
+};
+
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = {
+ BRANCH_0_NUM_DNN_FEATURES,
+ BRANCH_0_NUM_LOGITS,
+ BRANCH_0_NUM_DNN_LAYERS,
+ {
+ BRANCH_0_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_0_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_0_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = {
+ BRANCH_1_NUM_DNN_FEATURES,
+ BRANCH_1_NUM_LOGITS,
+ BRANCH_1_NUM_DNN_LAYERS,
+ {
+ BRANCH_1_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_1_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_1_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_1_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = {
+ BRANCH_2_NUM_DNN_FEATURES,
+ BRANCH_2_NUM_LOGITS,
+ BRANCH_2_NUM_DNN_LAYERS,
+ {
+ BRANCH_2_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_2_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_2_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_2_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = {
+ BRANCH_3_NUM_DNN_FEATURES,
+ BRANCH_3_NUM_LOGITS,
+ BRANCH_3_NUM_DNN_LAYERS,
+ {
+ BRANCH_3_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_3_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_3_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_3_logits_bias,
+ },
+};
+
+#undef NUM_DNN_BRANCHES
+#undef NUM_CNN_LAYERS
+#undef BRANCH_0_NUM_DNN_LAYERS
+#undef BRANCH_1_NUM_DNN_LAYERS
+#undef BRANCH_2_NUM_DNN_LAYERS
+#undef BRANCH_3_NUM_DNN_LAYERS
+#undef CNN_LAYER_0_HEIGHT
+#undef CNN_LAYER_0_WIDTH
+#undef CNN_LAYER_0_IN_CH
+#undef CNN_LAYER_0_OUT_CH
+#undef CNN_LAYER_0_HORZ_STRIDE
+#undef CNN_LAYER_0_VERT_STRIDE
+#undef CNN_LAYER_1_HEIGHT
+#undef CNN_LAYER_1_WIDTH
+#undef CNN_LAYER_1_IN_CH
+#undef CNN_LAYER_1_OUT_CH
+#undef CNN_LAYER_1_HORZ_STRIDE
+#undef CNN_LAYER_1_VERT_STRIDE
+#undef CNN_LAYER_2_HEIGHT
+#undef CNN_LAYER_2_WIDTH
+#undef CNN_LAYER_2_IN_CH
+#undef CNN_LAYER_2_OUT_CH
+#undef CNN_LAYER_2_HORZ_STRIDE
+#undef CNN_LAYER_2_VERT_STRIDE
+#undef CNN_LAYER_3_HEIGHT
+#undef CNN_LAYER_3_WIDTH
+#undef CNN_LAYER_3_IN_CH
+#undef CNN_LAYER_3_OUT_CH
+#undef CNN_LAYER_3_HORZ_STRIDE
+#undef CNN_LAYER_3_VERT_STRIDE
+#undef CNN_LAYER_4_HEIGHT
+#undef CNN_LAYER_4_WIDTH
+#undef CNN_LAYER_4_IN_CH
+#undef CNN_LAYER_4_OUT_CH
+#undef CNN_LAYER_4_HORZ_STRIDE
+#undef CNN_LAYER_4_VERT_STRIDE
+#undef BRANCH_0_NUM_DNN_FEATURES
+#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_0_NUM_LOGITS
+#undef BRANCH_1_NUM_DNN_FEATURES
+#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_1_NUM_LOGITS
+#undef BRANCH_2_NUM_DNN_FEATURES
+#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_2_NUM_LOGITS
+#undef BRANCH_3_NUM_DNN_FEATURES
+#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_3_NUM_LOGITS
+
+static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = {
+ 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = {
+ -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = {
+ 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = {
+ -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = {
+ 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = {
+ -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f,
+};
+
+static const float av1_intra_mode_cnn_partition_mean[1] = {
+ 1.191922f,
+};
+
+static const float av1_intra_mode_cnn_partition_std[1] = {
+ 1.730044f,
+};
+
+static const int quad_to_linear_0[1] = { 0 };  // 1x1: Z-order -> raster index
+static const int quad_to_linear_1[4] = { 0, 1, 2, 3 };  // 2x2 grid
+static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 3, 6, 7,  // 4x4 grid
+ 8, 9, 12, 13, 10, 11, 14, 15 };
+static const int quad_to_linear_3[64] = {  // 8x8 grid, row-major output index
+ 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27,
+ 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31,
+ 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59,
+ 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 0000000000..71c1ace782
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,5646 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(chiyotsai@google.com): The performance of these models is getting worse
+// due to the changes in the encoder. We should retrain the models here to get
+// better performance once we have the time.
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+ 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+ 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+ 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+ -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+ -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+ -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+ -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+ 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+ 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f,
+ 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f,
+ 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f,
+ -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f,
+ 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f,
+ -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f,
+ 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f,
+ 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f,
+ 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f,
+ 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f,
+ -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f,
+ -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f,
+ 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f,
+ -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f,
+ 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f,
+ -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f,
+ -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f,
+ -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f,
+ -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f,
+ -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f,
+ -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f,
+ 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f,
+ 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f,
+ 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+ -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f,
+ -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f,
+ 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f,
+ -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+ 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f,
+ 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f,
+ -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f,
+ -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f,
+ 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f,
+ -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f,
+ 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f,
+ -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f,
+ -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f,
+ 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f,
+ -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f,
+ 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+ 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+ -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f,
+ 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f,
+ 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f,
+ -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f,
+ 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f,
+ -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+ 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f,
+ -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f,
+ 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f,
+ -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f,
+ 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f,
+ -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f,
+ -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f,
+ -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+ 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f,
+ -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f,
+ -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+ -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+ 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f,
+ 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f,
+ -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f,
+ 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f,
+ -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f,
+ 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f,
+ 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f,
+ -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+ 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f,
+ -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+ 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f,
+ 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f,
+ -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f,
+ 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f,
+ 0.420104f, -0.072042f, -0.006404f, 0.171680f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+ 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f,
+ -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f,
+ 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f,
+ -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f,
+ 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f,
+ 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+ 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+ 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f,
+ 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f,
+ 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f,
+ 0.853918f, 0.002504f, -0.190403f, 0.452050f,
+};
+
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+ 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+ -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f,
+ 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f,
+ 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f,
+ 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f,
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f,
+ -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f,
+ -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f,
+ -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f,
+ 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f,
+ -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f,
+ 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f,
+ 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f,
+ 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f,
+ 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f,
+ 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f,
+ 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f,
+ 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f,
+ -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f,
+ 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f,
+ 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f,
+ -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f,
+ 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f,
+ 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f,
+ -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f,
+ -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f,
+ -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f,
+ 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f,
+ 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f,
+ 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f,
+ -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f,
+ -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f,
+ 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f,
+ 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f,
+ 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+ 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f,
+ -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f,
+ -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f,
+ 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f,
+ 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f,
+ 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f,
+ 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f,
+ 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+ -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f,
+ -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f,
+ -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f,
+ 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f,
+ -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f,
+ -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+ 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f,
+ -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f,
+ -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f,
+ 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f,
+ -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f,
+ -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f,
+ -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f,
+ 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f,
+ 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f,
+ 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f,
+ -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f,
+ -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f,
+ -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f,
+ 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f,
+ -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f,
+ -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f,
+ 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f,
+ -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f,
+ -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f,
+ -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f,
+ -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f,
+ -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f,
+ 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f,
+ 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f,
+ -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f,
+ -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f,
+ 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+ -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f,
+ -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f,
+ 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f,
+ 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+ 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f,
+ -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f,
+ 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f,
+ -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+ -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+ -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f,
+ 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f,
+ 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f,
+ -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f,
+ 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f,
+ 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f,
+ -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f,
+ -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f,
+ -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+ -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f,
+ 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f,
+ -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f,
+ 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f,
+ -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f,
+ 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f,
+ 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f,
+ 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+ -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f,
+ 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f,
+ -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f,
+ 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f,
+ -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f,
+ 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+ -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f,
+ -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+ -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+ -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f,
+ -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f,
+ 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f,
+ -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f,
+ -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f,
+ 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f,
+ -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f,
+ -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f,
+ -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f,
+ -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f,
+ -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f,
+ -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+ -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f,
+ -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f,
+ -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+ -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+ -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f,
+ -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f,
+ -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f,
+ -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f,
+ -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f,
+ -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f,
+ -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f,
+ -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f,
+ 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f,
+ 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f,
+ -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+ 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f,
+ -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f,
+ -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+ -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f,
+ 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f,
+ -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f,
+ -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f,
+ -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f,
+ 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f,
+ -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f,
+ -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f,
+ -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+ 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+ -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f,
+ -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f,
+ -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f,
+ 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+ 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+ -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f,
+ 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+ 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+ 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f,
+ 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f,
+ -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f,
+ -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f,
+ -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f,
+ 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f,
+ -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f,
+ 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f,
+ 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f,
+ 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f,
+ -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f,
+ 0.823643f, -0.119781f, -0.098359f, 0.093119f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f,
+ -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f,
+ 0.325655f, -0.107123f, 0.591049f, 0.358744f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_128_layer0,
+ av1_ab_partition_nn_weights_128_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_128_layer0,
+ av1_ab_partition_nn_bias_128_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+ -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f,
+ -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+ -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+ -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f,
+ 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f,
+ -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f,
+ -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+ 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+ 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f,
+ -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+ 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f,
+ 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f,
+ -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f,
+ 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f,
+ -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+ 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f,
+ 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f,
+ -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+ -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+ -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f,
+ 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f,
+ 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f,
+ -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f,
+ -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+ -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+ -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+ 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+ 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+ 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+ -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+ -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f,
+ -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+ -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f,
+ -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f,
+ -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f,
+ -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f,
+ -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f,
+ -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f,
+ 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+ -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+ -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f,
+ -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f,
+ -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f,
+ -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+ -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f,
+ 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f,
+ -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+ -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+ 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f,
+ -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+ -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f,
+ -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+ -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+ -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f,
+ -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f,
+ -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f,
+ -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f,
+ 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+ 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f,
+ -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f,
+ -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f,
+ -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+ -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+ -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f,
+ 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f,
+ -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+ -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f,
+ 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+ 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+ 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f,
+ -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f,
+ -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+ -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+ -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f,
+ -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f,
+ -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f,
+ -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+ -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f,
+ 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f,
+ -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+ -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f,
+ -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+ 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+ -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+ -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+ -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f,
+ -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+ -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f,
+ 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+ -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f,
+ 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+ -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f,
+ -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f,
+ 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f,
+ 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f,
+ -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f,
+ -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f,
+ -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f,
+ 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f,
+ 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f,
+ -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+ -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+ -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f,
+ -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f,
+ -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+ -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+ 0.230343f, -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+ -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f,
+ -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f,
+ -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+ -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+ 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f,
+ -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f,
+ -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f,
+ 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+ -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+ -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f,
+ -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+// Layer-1 weight table for the "_64" ab partition pruning model. It holds
+// 64 * LABEL_SIZE values, i.e. one weight per (hidden node, output) pair for
+// the 64 hidden nodes and LABEL_SIZE outputs declared in
+// av1_ab_partition_nnconfig_64, which lists this array second in its weight
+// group.
+// NOTE(review): the element ordering (which index varies fastest) is defined
+// by the code that consumes NN_CONFIG, not shown here -- confirm before
+// reshaping or regenerating this table.
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+ -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f,
+ -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f,
+ 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f,
+ -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f,
+ 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f,
+ 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f,
+ 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f,
+ -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f,
+ 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f,
+ 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f,
+ 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f,
+ -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f,
+ 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f,
+ -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f,
+ -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f,
+ -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f,
+ 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f,
+ 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f,
+ 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f,
+ -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f,
+ 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f,
+ 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f,
+ 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f,
+ 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f,
+ 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f,
+ -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f,
+ -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f,
+ -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f,
+ 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f,
+ -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f,
+ -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f,
+ -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f,
+ -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f,
+ 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f,
+ -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f,
+ 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f,
+ -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f,
+ -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f,
+ 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f,
+ 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f,
+ -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f,
+ -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f,
+ 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f,
+ 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f,
+ 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f,
+ 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f,
+ -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f,
+ -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f,
+ 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f,
+ 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f,
+ -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f,
+ 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f,
+ -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f,
+ -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f,
+ -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f,
+ -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f,
+ -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f,
+ -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f,
+ -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f,
+ -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f,
+ -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f,
+ -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f,
+ -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f,
+ -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f,
+ -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f,
+ 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f,
+ 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f,
+ -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f,
+ 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f,
+ 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f,
+ -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f,
+ 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f,
+ 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f,
+ -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f,
+ -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f,
+ -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f,
+ 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f,
+ -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f,
+ -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f,
+ -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f,
+ 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f,
+ 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f,
+ -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f,
+ 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f,
+ 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f,
+ 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f,
+ -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f,
+ -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f,
+ -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f,
+ -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f,
+ -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f,
+ -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f,
+ -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f,
+ -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f,
+ -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f,
+ 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f,
+ -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f,
+ -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f,
+ -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f,
+ -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f,
+ -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f,
+ -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f,
+ -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f,
+ 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f,
+ 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f,
+ -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f,
+ 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f,
+ -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f,
+ 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f,
+ -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f,
+ -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f,
+ 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f,
+ -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f,
+ -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f,
+ 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f,
+ 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f,
+ -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f,
+ -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f,
+ 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f,
+ -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f,
+ -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f,
+ -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f,
+ -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f,
+ -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f,
+ -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f,
+ 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f,
+ -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f,
+ 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f,
+ -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f,
+ 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f,
+ 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f,
+ 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f,
+ -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f,
+ 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f,
+ -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f,
+ 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f,
+ 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f,
+ -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f,
+ -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f,
+ 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f,
+ 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f,
+ -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f,
+ -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f,
+ 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f,
+ 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f,
+ -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f,
+ -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f,
+ -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f,
+ -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f,
+ -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f,
+ -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f,
+ -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f,
+ 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f,
+ -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f,
+ -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f,
+ -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f,
+ -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f,
+ -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f,
+ 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f,
+ 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f,
+ -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f,
+ -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f,
+ -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f,
+ -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f,
+ -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f,
+ -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f,
+ -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f,
+ 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f,
+ 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f,
+ -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f,
+ -0.114126f, -0.193834f, -0.025759f, 0.263183f,
+};
+
+// Layer-1 bias table for the "_64" ab partition pruning model: LABEL_SIZE
+// values (16 initializers are given), matching the num_outputs entry of
+// av1_ab_partition_nnconfig_64, which lists this array second in its bias
+// group.
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+ -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f,
+ -0.872737f, 0.718723f, 0.703398f, 2.560015f,
+};
+
+// Network description for the "_64" ab partition pruning model: FEATURE_SIZE
+// inputs, a single hidden layer of 64 nodes, and LABEL_SIZE outputs. The two
+// trailing groups bind the "_64" weight and bias tables, layer 0 first.
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ { 64 }, // num_hidden_nodes
+ // Weight tables: layer 0, then layer 1.
+ { av1_ab_partition_nn_weights_64_layer0,
+ av1_ab_partition_nn_weights_64_layer1 },
+ // Bias tables: layer 0, then layer 1.
+ { av1_ab_partition_nn_bias_64_layer0,
+ av1_ab_partition_nn_bias_64_layer1 },
+};
+
+// nn model for ab partition pruning, 32x32.
+// Layer-0 weight table of the 32x32 model. It holds FEATURE_SIZE * 64 values,
+// i.e. one weight per (input feature, hidden node) pair for the FEATURE_SIZE
+// inputs and 64 hidden nodes declared in av1_ab_partition_nnconfig_32, which
+// lists this array first in its weight group.
+// NOTE(review): the element ordering (which index varies fastest) is defined
+// by the code that consumes NN_CONFIG, not shown here -- confirm before
+// reshaping or regenerating this table.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+ -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+ -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+ 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+ 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+ -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+ 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f,
+ -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f,
+ 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f,
+ 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f,
+ 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f,
+ -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f,
+ -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+ -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f,
+ -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f,
+ 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f,
+ -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f,
+ -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f,
+ 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+ -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f,
+ -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f,
+ -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f,
+ -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f,
+ 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+ -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f,
+ -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f,
+ -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f,
+ -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+ 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f,
+ 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f,
+ -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f,
+ -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f,
+ -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f,
+ -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+ -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f,
+ 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+ -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f,
+ -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+ -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f,
+ -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f,
+ -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+ -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f,
+ 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+ -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+ -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f,
+ 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f,
+ -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f,
+ -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f,
+ 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f,
+ 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+ -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f,
+ -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+ -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f,
+ -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f,
+ -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+ -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+ 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f,
+ -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+ -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+ -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f,
+ -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+ -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+ -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f,
+ -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+ -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+ -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+ -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f,
+ -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+ 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f,
+ 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+ -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+ -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+ -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+ -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f,
+ 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+ -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+ -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+ -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f,
+ -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+ -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+ -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f,
+ -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+ -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+ -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+ -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f,
+ 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f,
+ -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+ -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+ -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+ -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+ -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f,
+ 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f,
+ -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+ -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+ -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f,
+ 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f,
+ -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f,
+ -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f,
+ -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+ -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f,
+ -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f,
+ 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f,
+ 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f,
+ -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+ -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+ -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+ -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f,
+ -0.827145f, -0.225277f, 0.275800f, 1.696635f,
+};
+
+// Layer-0 bias table of the 32x32 model: 64 values, the same count as
+// num_hidden_nodes in av1_ab_partition_nnconfig_32, which lists this array
+// first in its bias group.
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+ -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+ 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f,
+ -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+ 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f,
+ 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+ -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+ 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+ -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f,
+ -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f,
+ 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+ -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+// Layer-1 weight table of the 32x32 model: one weight per (hidden node,
+// output) pair for the 64 hidden nodes and LABEL_SIZE outputs declared in
+// av1_ab_partition_nnconfig_32, which lists this array second in its weight
+// group.
+// The size is spelled 64 * LABEL_SIZE (not a literal 16) so that it follows
+// the num_outputs entry of the config and matches the declaration style of
+// av1_ab_partition_nn_weights_64_layer1.
+// NOTE(review): the element ordering (which index varies fastest) is defined
+// by the code that consumes NN_CONFIG, not shown here -- confirm before
+// reshaping or regenerating this table.
+static const float av1_ab_partition_nn_weights_32_layer1[64 * LABEL_SIZE] = {
+ -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f,
+ -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f,
+ -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f,
+ 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f,
+ 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f,
+ -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f,
+ 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f,
+ -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f,
+ 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f,
+ -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f,
+ 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f,
+ -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f,
+ -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f,
+ 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f,
+ -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f,
+ 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f,
+ 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f,
+ -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f,
+ 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f,
+ 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f,
+ -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f,
+ 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f,
+ 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f,
+ 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f,
+ -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f,
+ 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+ -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f,
+ 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f,
+ 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f,
+ 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f,
+ -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f,
+ -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+ -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f,
+ 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+ -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f,
+ -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f,
+ -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+ -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+ -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+ -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+ -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f,
+ 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f,
+ 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f,
+ 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+ 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f,
+ 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f,
+ -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+ 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f,
+ -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+ -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f,
+ -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f,
+ -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f,
+ 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f,
+ -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f,
+ 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f,
+ -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f,
+ 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f,
+ -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f,
+ -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f,
+ 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f,
+ 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f,
+ -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f,
+ -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f,
+ -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f,
+ -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f,
+ 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f,
+ -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f,
+ -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f,
+ -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+ -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f,
+ -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+ -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+ -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+ 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f,
+ -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+ -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f,
+ 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f,
+ -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f,
+ -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+ -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f,
+ 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f,
+ -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+ -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+ 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f,
+ -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f,
+ -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+ -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f,
+ 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f,
+ -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f,
+ -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f,
+ 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+ 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+ -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f,
+ -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+ -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f,
+ -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f,
+ -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f,
+ 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+ -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f,
+ -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f,
+ 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f,
+ -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+ -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+ -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f,
+ -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f,
+ 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f,
+ 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f,
+ -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f,
+ -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+ -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f,
+ 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f,
+ -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f,
+ -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+ -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+ 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+ -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f,
+ 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f,
+ 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+ -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f,
+ -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f,
+ 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f,
+ 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f,
+ 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f,
+ -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+ 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f,
+ -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f,
+ -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f,
+ -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f,
+ -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+ -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f,
+ 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f,
+ -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f,
+ -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f,
+ -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+ -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f,
+ -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+ -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+ -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f,
+ -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f,
+ -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f,
+ -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+ 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f,
+ -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f,
+ 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f,
+ -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f,
+ -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f,
+ -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+ 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f,
+ -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+ -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+ -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f,
+ -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+ 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f,
+ -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f,
+ -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f,
+ -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f,
+ 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+ -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+ -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f,
+ -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+ -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+ 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f,
+ 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f,
+ -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f,
+ 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+ -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f,
+ -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f,
+ -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f,
+ -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f,
+ 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+ -1.251640f, -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+ 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f,
+ -0.010198f, 0.130597f, 1.276752f, 2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_32_layer0,
+ av1_ab_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_32_layer0,
+ av1_ab_partition_nn_bias_32_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+ 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f,
+ 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f,
+ 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f,
+ 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f,
+ -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f,
+ 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f,
+ -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+ 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f,
+ -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f,
+ 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f,
+ 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f,
+ 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f,
+ -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f,
+ 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f,
+ -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+ -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+ 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f,
+ -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f,
+ 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+ -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+ 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f,
+ -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+ -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+ -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f,
+ -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+ -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f,
+ -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f,
+ -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f,
+ -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+ -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f,
+ -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f,
+ -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f,
+ 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f,
+ -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f,
+ -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f,
+ 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f,
+ -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f,
+ -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+ 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f,
+ 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f,
+ -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f,
+ 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f,
+ -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+ 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f,
+ 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f,
+ 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+ -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+ -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+ -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f,
+ -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f,
+ 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f,
+ -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f,
+ 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f,
+ 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f,
+ -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f,
+ -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f,
+ -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f,
+ 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+ 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f,
+ -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f,
+ -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f,
+ -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+ -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f,
+ -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f,
+ -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f,
+ -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f,
+ -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f,
+ 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+ -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f,
+ 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f,
+ -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f,
+ -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f,
+ -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f,
+ -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f,
+ -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f,
+ -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f,
+ -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f,
+ -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+ -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f,
+ -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f,
+ 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f,
+ -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f,
+ 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f,
+ 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f,
+ 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+ -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+ -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f,
+ -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f,
+ -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+ -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f,
+ -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f,
+ -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f,
+ 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f,
+ -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f,
+ -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f,
+ 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f,
+ -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f,
+ 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f,
+ 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+ -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f,
+ -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f,
+ -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f,
+ -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f,
+ -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f,
+ -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+ -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f,
+ -0.021087f, 0.110220f, -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+ 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f,
+ -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+ 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+ 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f,
+ -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f,
+ -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f,
+ -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f,
+ 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f,
+ -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f,
+ -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f,
+ 0.123809f, -0.109797f, 0.200510f, -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+ -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f,
+ -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f,
+ -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f,
+ -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f,
+ 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f,
+ 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f,
+ 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f,
+ -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f,
+ 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f,
+ 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+ -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f,
+ 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+ -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f,
+ 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f,
+ 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f,
+ 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f,
+ -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f,
+ 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f,
+ -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f,
+ 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f,
+ 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f,
+ -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f,
+ 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+ 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f,
+ 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f,
+ 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f,
+ -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+ -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f,
+ 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f,
+ 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f,
+ 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+ 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f,
+ 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f,
+ -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f,
+ 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f,
+ 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f,
+ 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f,
+ -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+ 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f,
+ 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f,
+ 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f,
+ -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f,
+ -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f,
+ -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f,
+ -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f,
+ -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f,
+ -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f,
+ 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f,
+ 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f,
+ -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f,
+ -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f,
+ 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f,
+ -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f,
+ -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f,
+ -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f,
+ -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f,
+ -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f,
+ -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f,
+ 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f,
+ 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f,
+ -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f,
+ -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f,
+ -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f,
+ -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f,
+ 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+ 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f,
+ -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+ 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f,
+ 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f,
+ -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f,
+ -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f,
+ -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+ 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f,
+ 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+ 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f,
+ 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+ -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f,
+ 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f,
+ -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f,
+ -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f,
+ -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+ -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f,
+ -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+ 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f,
+ -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f,
+ 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f,
+ -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f,
+ 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f,
+ 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f,
+ -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f,
+ -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f,
+ 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f,
+ -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f,
+ -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f,
+ 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+ -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f,
+ 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+ -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f,
+ -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f,
+ -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+ 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f,
+ 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f,
+ 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f,
+ 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f,
+ -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+ -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f,
+ -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f,
+ 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f,
+ -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f,
+ 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f,
+ -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f,
+ -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f,
+ -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f,
+ 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f,
+ 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f,
+ -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+ -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f,
+ -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f,
+ -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+ -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f,
+ -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+ -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f,
+ -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+ -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f,
+ -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f,
+ -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f,
+ 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f,
+ 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f,
+ 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+ 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f,
+ -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f,
+ -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f,
+ 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f,
+ 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f,
+ 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f,
+ 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f,
+ -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f,
+ -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+ -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f,
+ -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+ -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f,
+ -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+ -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+ 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f,
+ -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f,
+ -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f,
+ 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f,
+ 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+ -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+ -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f,
+ -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f,
+ 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f,
+ 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f,
+ 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f,
+ 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f,
+ 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+ -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f,
+ -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f,
+ -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f,
+ -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f,
+ 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+ -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f,
+ -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f,
+ -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f,
+ -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f,
+ 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f,
+ -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f,
+ 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+ -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f,
+ -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f,
+ 0.172790f, -0.172982f, 0.041258f, -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+ 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f,
+ -0.658522f, 0.723479f, 0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_16_layer0,
+ av1_ab_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_16_layer0,
+ av1_ab_partition_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+// Layer-1 bias terms of the `_16` rect-partition network, one per output.
+// Sized with LABEL_SIZE (not a literal 3) so the length stays tied to the
+// num_outputs value passed to the matching NN_CONFIG, as the `_8` table does.
+static const float av1_rect_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+  2.68750f,
+  -1.31894f,
+  -1.36768f,
+};
+
+// Network layout of the `_16` rect-partition model: FEATURE_SIZE inputs, a
+// single hidden layer of NUM_NODES units, LABEL_SIZE outputs.
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+  FEATURE_SIZE,   // num_inputs
+  LABEL_SIZE,     // num_outputs
+  1,              // num_hidden_layers
+  { NUM_NODES },  // num_hidden_nodes
+  {
+      // weight tables, layer 0 then layer 1
+      av1_rect_partition_nn_weights_16_layer0,
+      av1_rect_partition_nn_weights_16_layer1,
+  },
+  {
+      // bias tables, same layer order
+      av1_rect_partition_nn_bias_16_layer0,
+      av1_rect_partition_nn_bias_16_layer1,
+  },
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+// Layer-1 bias terms of the `_32` rect-partition network, one per output.
+// Sized with LABEL_SIZE (not a literal 3) so the length stays tied to the
+// num_outputs value passed to the matching NN_CONFIG, as the `_8` table does.
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+  2.47332f,
+  -1.65756f,
+  -0.81573f,
+};
+
+// Network layout of the `_32` rect-partition model: FEATURE_SIZE inputs, a
+// single hidden layer of NUM_NODES units, LABEL_SIZE outputs.
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+  FEATURE_SIZE,   // num_inputs
+  LABEL_SIZE,     // num_outputs
+  1,              // num_hidden_layers
+  { NUM_NODES },  // num_hidden_nodes
+  {
+      // weight tables, layer 0 then layer 1
+      av1_rect_partition_nn_weights_32_layer0,
+      av1_rect_partition_nn_weights_32_layer1,
+  },
+  {
+      // bias tables, same layer order
+      av1_rect_partition_nn_bias_32_layer0,
+      av1_rect_partition_nn_bias_32_layer1,
+  },
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+// Layer-1 bias terms of the `_64` rect-partition network, one per output.
+// Sized with LABEL_SIZE (not a literal 3) so the length stays tied to the
+// num_outputs value passed to the matching NN_CONFIG, as the `_8` table does.
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+  0.32215f,
+  -0.57522f,
+  0.25314f,
+};
+
+// Network layout of the `_64` rect-partition model: FEATURE_SIZE inputs, a
+// single hidden layer of NUM_NODES units, LABEL_SIZE outputs.
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+  FEATURE_SIZE,   // num_inputs
+  LABEL_SIZE,     // num_outputs
+  1,              // num_hidden_layers
+  { NUM_NODES },  // num_hidden_nodes
+  {
+      // weight tables, layer 0 then layer 1
+      av1_rect_partition_nn_weights_64_layer0,
+      av1_rect_partition_nn_weights_64_layer1,
+  },
+  {
+      // bias tables, same layer order
+      av1_rect_partition_nn_bias_64_layer0,
+      av1_rect_partition_nn_bias_64_layer1,
+  },
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+// Layer-1 bias terms of the `_128` rect-partition network, one per output.
+// Sized with LABEL_SIZE (not a literal 3) so the length stays tied to the
+// num_outputs value passed to the matching NN_CONFIG, as the `_8` table does.
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+  1.09014f,
+  -0.53317f,
+  -0.55668f,
+};
+
+// Network layout of the `_128` rect-partition model: FEATURE_SIZE inputs, a
+// single hidden layer of NUM_NODES units, LABEL_SIZE outputs.
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+  FEATURE_SIZE,   // num_inputs
+  LABEL_SIZE,     // num_outputs
+  1,              // num_hidden_layers
+  { NUM_NODES },  // num_hidden_nodes
+  {
+      // weight tables, layer 0 then layer 1
+      av1_rect_partition_nn_weights_128_layer0,
+      av1_rect_partition_nn_weights_128_layer1,
+  },
+  {
+      // bias tables, same layer order
+      av1_rect_partition_nn_bias_128_layer0,
+      av1_rect_partition_nn_bias_128_layer1,
+  },
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+// Below are the models used for simple_motion_search_based_split
+// Thresholds (each trailing "p = ..." note appears to be sigmoid(threshold))
+// The first index level is for aggressiveness, the second is for frame
+// resolution, and the third is for bsize
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ 1.40402595879f, // p = 0.8028197
+ 4.72845183649f, // p = 0.99123732
+ 1.86517797783f, // p = 0.86589934
+ 1.58715223005f, // p = 0.83021506
+ 7.22695596987f, // p = 0.9992738
+ },
+ // midres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ // hdres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // lowres
+ {
+ 100.0000f, // p = 1.000000
+ 4.952535f, // p = 0.992984
+ 1.720880f, // p = 0.848242
+ 1.426233f, // p = 0.806314
+ 1.491905f, // p = 0.816364
+ },
+ // midres
+ {
+ 100.0000f, // p = 1.000000
+ 3.137263f, // p = 0.958404
+ 2.703262f, // p = 0.937219
+ 1.877166f, // p = 0.867285
+ 2.221149f, // p = 0.902133
+ },
+ // hdres
+ {
+ 4.417680f, // p = 0.988082
+ 3.086898f, // p = 0.956349
+ 3.966704f, // p = 0.981416
+ 1.532565f, // p = 0.822381
+ 3.449975f, // p = 0.969230
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ 100.000000f, // p = 1.000000
+ 1.484020f, // p = 0.815179
+ 1.866781f, // p = 0.866085
+ 1.706711f, // p = 0.846409
+ 2.080369f, // p = 0.888980
+ },
+ // midres
+ {
+ 100.000000f, // p = 1.000000
+ 3.265763f, // p = 0.963235428881
+ 2.024598f, // p = 0.883355591569
+ 1.846446f, // p = 0.863709256976
+ 2.240962f, // p = 0.903868036126
+ },
+ // hdres
+ {
+ 3.133026f, // p = 0.958234684141
+ 2.940954f, // p = 0.949834204693
+ 2.484544f, // p = 0.923051170045
+ 1.702972f, // p = 0.845922460525
+ 1.655562f, // p = 0.839641385729
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f,
+ 0.762099214988f },
+ // midres
+ { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f,
+ 0.557298794638f },
+ // hdres
+ { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f,
+ 1.86572095242f },
+ },
+};
+
+static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { // [aggressiveness][frame resolution][presumably bsize, as for the split thresholds]
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ },
+ // midres
+ {
+ -3.38168078f, // p = 0.032872917
+ -4.08610739f, // p = 0.016526795
+ -1.78302370f, // p = 0.15270848
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ },
+ // hdres
+ {
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ -2.98718897f, // p = 0.048008
+ -100.000000f, // p = 0.0
+ -3.33229488f, // p = 0.03447975
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // lowres
+ {
+ -100.0000f, // p = 0.0
+ -4.893793f, // p = 0.007437
+ -3.387766f, // p = 0.032680
+ -2.982806f, // p = 0.048209
+ -2.330372f, // p = 0.088639
+ },
+ // midres
+ {
+ -100.0000f, // p = 0.000000
+ -6.131853f, // p = 0.002168
+ -2.346579f, // p = 0.087338
+ -2.712849f, // p = 0.062219
+ -3.195430f, // p = 0.039338
+ },
+ // hdres
+ {
+ -3.491416f, // p = 0.029557
+ -2.192853f, // p = 0.100394
+ -3.620180f, // p = 0.026079
+ -2.030855f, // p = 0.116001
+ -2.797586f, // p = 0.057455
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ -100.0000f, // p = 0.0
+ -3.617350f, // p = 0.026151
+ -5.902503f, // p = 0.002725
+ -4.677840f, // p = 0.009213
+ -2.168378f, // p = 0.102626
+ },
+ // midres
+ {
+ -100.0000f, // p = 0.0
+ -3.204195f, // p = 0.0390081679555
+ -2.354128f, // p = 0.0867382128969
+ -2.523326f, // p = 0.0742390077132
+ -3.112328f, // p = 0.0426016085803
+ },
+ // hdres
+ {
+ -5.047760f, // p = 0.00638270448225
+ -3.414994f, // p = 0.0318301469487
+ -5.628090f, // p = 0.00358255438917
+ -2.122691f, // p = 0.10691083145
+ -1.972387f, // p = 0.122132728355
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f,
+ -1.0830321897f },
+ // midres
+ { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f,
+ -0.228236297886f },
+ // hdres
+ { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f,
+ -1.36741555171f },
+ },
+};
+
+static const float av1_simple_motion_search_split_mean_128[17] = {
+ 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f,
+ 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f,
+ 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f,
+};
+
+static const float av1_simple_motion_search_split_std_128[17] = {
+ 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f,
+ 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f,
+ 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f,
+};
+
+static const float av1_simple_motion_search_split_mean_64[17] = {
+ 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f,
+ 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f,
+ 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f,
+};
+
+static const float av1_simple_motion_search_split_std_64[17] = {
+ 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f,
+ 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f,
+ 1.144391f, 1.125088f, 0.261289f, 1.145059f, 1.131215f,
+};
+
+static const float av1_simple_motion_search_split_mean_32[17] = {
+ 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f,
+ 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f,
+ 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f,
+};
+
+static const float av1_simple_motion_search_split_std_32[17] = {
+ 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f,
+ 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f,
+ 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f,
+};
+
+static const float av1_simple_motion_search_split_mean_16[17] = {
+ 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f,
+ 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f,
+ 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f,
+};
+
+static const float av1_simple_motion_search_split_std_16[17] = {
+ 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f,
+ 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f,
+ 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f,
+};
+
+static const float av1_simple_motion_search_split_mean_8[17] = {
+ 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f,
+ 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f,
+ 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f,
+};
+
+static const float av1_simple_motion_search_split_std_8[17] = {
+ 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f,
+ 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f,
+ 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f,
+};
+
+// Lookup table of the five 17-entry mean tables, ordered from the `_128`
+// table down to the `_8` table.
+// NOTE(review): presumably used to normalize the 17 input features - confirm
+// against the caller.
+static const float *const av1_simple_motion_search_split_mean[5] = {
+  av1_simple_motion_search_split_mean_128,  // [0]
+  av1_simple_motion_search_split_mean_64,   // [1]
+  av1_simple_motion_search_split_mean_32,   // [2]
+  av1_simple_motion_search_split_mean_16,   // [3]
+  av1_simple_motion_search_split_mean_8     // [4]
+};
+
+// Lookup table of the five 17-entry std tables, in the same `_128` .. `_8`
+// order as the mean lookup table.
+static const float *const av1_simple_motion_search_split_std[5] = {
+  av1_simple_motion_search_split_std_128,  // [0]
+  av1_simple_motion_search_split_std_64,   // [1]
+  av1_simple_motion_search_split_std_32,   // [2]
+  av1_simple_motion_search_split_std_16,   // [3]
+  av1_simple_motion_search_split_std_8     // [4]
+};
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 17
+#define NUM_LAYER_0_UNITS_128 20
+#define NUM_LOGITS_128 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = {
+ 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f,
+ -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f,
+ -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f,
+ 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f,
+ 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f,
+ 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f,
+ 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f,
+ 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f,
+ 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f,
+ 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f,
+ -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f,
+ 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f,
+ 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f,
+ -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f,
+ -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f,
+ 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f,
+ 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, -0.284482f,
+ -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f,
+ 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f,
+ 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f,
+ 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f,
+ 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f,
+ 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f,
+ -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f,
+ 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f,
+ 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f,
+ 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f,
+ -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f,
+ -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f,
+ 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f,
+ -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f,
+ -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f,
+ -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f,
+ 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f,
+ -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f,
+ -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f,
+ -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f,
+ 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f,
+ -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f,
+ -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f,
+ 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f,
+ 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f,
+ 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f,
+ -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f,
+ -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f,
+ -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f,
+ 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f,
+ 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f,
+ -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f,
+ -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f,
+ -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f,
+ 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f,
+ 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f,
+ 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f,
+ 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f,
+ 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f,
+ -0.745167f, 0.318558f, 0.316608f, 0.568678f
+};
+
+// Hidden-layer bias terms of the `_128` split network, one per hidden unit.
+// The explicit NUM_LAYER_0_UNITS_128 bound ties the table length to the layer
+// size given to the NN_CONFIG: extra initializers no longer compile, and a
+// short list is zero-filled rather than yielding a too-small array.
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128
+    [NUM_LAYER_0_UNITS_128] = {
+      0.821344f,  1.11542f,  -1.24172f,  1.03642f,  1.13511f,
+      1.16414f,   -0.278655f, -1.35558f, -1.26788f, -1.63189f,
+      -0.323271f, 1.21319f,  -0.888415f, 0.987145f, -1.16767f,
+      0.255833f,  -0.1392f,  1.43265f,   -1.54952f, 1.65159f
+    };
+
+static const float av1_simple_motion_search_split_logits_kernel_128[] = {
+ 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f,
+ 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f,
+ -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f,
+ -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_128[] = {
+ 1.819351f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_128 = {  // split model, _128 variant
+ NUM_FEATURES_128,  // input feature count
+ NUM_LOGITS_128,  // output logit count
+ NUM_HIDDEN_LAYERS_128,  // hidden layer count
+ {
+ NUM_LAYER_0_UNITS_128,  // units in hidden layer 0
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_128,  // hidden layer 0 kernel
+ av1_simple_motion_search_split_logits_kernel_128,  // logits (output) kernel
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_128,  // hidden layer 0 bias
+ av1_simple_motion_search_split_logits_bias_128,  // logits (output) bias
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128  // the _128 shape names are #defined again later for the prune_rect model
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
+
+#define NUM_HIDDEN_LAYERS_64 1  // shape of the _64 split model: one hidden layer
+#define NUM_FEATURES_64 17  // input features
+#define NUM_LAYER_0_UNITS_64 24  // hidden layer 0 units (its kernel holds 17 * 24 = 408 floats)
+#define NUM_LOGITS_64 1  // output logits
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = {
+ -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f,
+ 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f,
+ 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f,
+ 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f,
+ 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f,
+ -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f,
+ 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f,
+ 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f,
+ 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f,
+ -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f,
+ 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f,
+ 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f,
+ -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f,
+ 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f,
+ 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f,
+ 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f,
+ 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f,
+ -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f,
+ -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f,
+ 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f,
+ -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f,
+ -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f,
+ -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f,
+ -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f,
+ -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f,
+ 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f,
+ -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f,
+ -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f,
+ 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f,
+ -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f,
+ -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f,
+ -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f,
+ 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f,
+ -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f,
+ -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f,
+ 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f,
+ 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f,
+ -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f,
+ -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f,
+ 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f,
+ -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f,
+ 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f,
+ -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f,
+ -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f,
+ 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f,
+ -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f,
+ 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f,
+ -0.0436525f, -0.380004f, -0.27187f, 0.534779f, 0.717939f,
+ 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f,
+ 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f,
+ -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f,
+ 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f,
+ 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f,
+ 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f,
+ -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f,
+ 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f,
+ 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f,
+ 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f,
+ 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f,
+ -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f,
+ -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f,
+ -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f,
+ -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f,
+ 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f,
+ -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f,
+ -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f,
+ 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f,
+ 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f,
+ -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f,
+ -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f,
+ -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f,
+ -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f,
+ 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f,
+ -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f,
+ 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f,
+ -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f,
+ 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f,
+ -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f,
+ -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f,
+ -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f,
+ -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f,
+ 0.266912f, 0.435824f, -0.123322f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = {
+ -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f,
+ -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f,
+ 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f,
+ -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_64[] = {
+ -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f,
+ -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f,
+ 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f,
+ -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_64[] = {
+ 0.699037f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = {  // split model, _64 variant
+ NUM_FEATURES_64,  // 17 input features
+ NUM_LOGITS_64,  // 1 output logit
+ NUM_HIDDEN_LAYERS_64,  // 1 hidden layer
+ {
+ NUM_LAYER_0_UNITS_64,  // 24 units in hidden layer 0
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_64,  // hidden layer 0 kernel (408 floats)
+ av1_simple_motion_search_split_logits_kernel_64,  // logits kernel (24 floats)
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_64,  // hidden layer 0 bias (24 floats)
+ av1_simple_motion_search_split_logits_bias_64,  // logits bias (1 float)
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1  // shape of the _32 split model: one hidden layer
+#define NUM_FEATURES_32 17  // input features
+#define NUM_LAYER_0_UNITS_32 20  // hidden layer 0 units (its kernel holds 17 * 20 = 340 floats)
+#define NUM_LOGITS_32 1  // output logits
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = {
+ -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f,
+ 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f,
+ -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f,
+ -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f,
+ 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f,
+ -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f,
+ 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f,
+ -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f,
+ 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f,
+ 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f,
+ 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f,
+ -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f,
+ 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f,
+ 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f,
+ 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f,
+ -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f,
+ 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f,
+ -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f,
+ 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f,
+ -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f,
+ -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f,
+ -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f,
+ -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f,
+ 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f,
+ 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f,
+ -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f,
+ -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f,
+ -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f,
+ 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f,
+ 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f,
+ 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f,
+ 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f,
+ -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f,
+ 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f,
+ 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f,
+ -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f,
+ 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f,
+ 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f,
+ 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f,
+ 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f,
+ 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f,
+ 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f,
+ -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f,
+ 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f,
+ 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f,
+ -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f,
+ -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f,
+ -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f,
+ -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f,
+ 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f,
+ 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f,
+ -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f,
+ 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f,
+ 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f,
+ 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f,
+ -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f,
+ 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f,
+ -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f,
+ -0.904296f, 0.166478f, 0.063268f, -0.302842f, -0.27179f,
+ -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f,
+ 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f,
+ 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f,
+ 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f,
+ -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f,
+ -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f,
+ -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f,
+ 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f,
+ 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = {
+ 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f,
+ -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f,
+ -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f,
+ -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_32[] = {
+ -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f,
+ 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f,
+ -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f,
+ -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_32[] = {
+ 0.208473f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = {  // split model, _32 variant
+ NUM_FEATURES_32,  // 17 input features
+ NUM_LOGITS_32,  // 1 output logit
+ NUM_HIDDEN_LAYERS_32,  // 1 hidden layer
+ {
+ NUM_LAYER_0_UNITS_32,  // 20 units in hidden layer 0
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_32,  // hidden layer 0 kernel (340 floats)
+ av1_simple_motion_search_split_logits_kernel_32,  // logits kernel (20 floats)
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_32,  // hidden layer 0 bias (20 floats)
+ av1_simple_motion_search_split_logits_bias_32,  // logits bias (1 float)
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1  // shape of the _16 split model: one hidden layer
+#define NUM_FEATURES_16 17  // input features
+#define NUM_LAYER_0_UNITS_16 20  // hidden layer 0 units (its kernel holds 17 * 20 = 340 floats)
+#define NUM_LOGITS_16 1  // output logits
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = {
+ 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f,
+ 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f,
+ -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f,
+ 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f,
+ -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f,
+ -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f,
+ -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f,
+ -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f,
+ 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f,
+ 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f,
+ 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f,
+ 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f,
+ 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f,
+ 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f,
+ 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f,
+ 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f,
+ 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f,
+ 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f,
+ -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f,
+ 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f,
+ 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f,
+ 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f,
+ 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f,
+ -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f,
+ 0.203731f, 0.328734f, 0.668104f, -0.586909f, -0.501335f,
+ -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f,
+ 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f,
+ 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f,
+ 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f,
+ 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f,
+ 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f,
+ -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f,
+ -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f,
+ -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f,
+ -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f,
+ 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f,
+ -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f,
+ 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f,
+ -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f,
+ 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f,
+ -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f,
+ -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f,
+ -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f,
+ 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f,
+ 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f,
+ 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f,
+ -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f,
+ 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f,
+ -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f,
+ -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f,
+ -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f,
+ 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f,
+ -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f,
+ 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f,
+ 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f,
+ 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f,
+ -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f,
+ -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f,
+ -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f,
+ 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f,
+ 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f,
+ 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f,
+ -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f,
+ 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f,
+ 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f,
+ -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f,
+ 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f,
+ 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = {
+ 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f,
+ 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f,
+ 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f,
+ 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_16[] = {
+ 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f,
+ -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f,
+ -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f,
+ -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_16[] = {
+ -0.783658f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = {  // split model, _16 variant
+ NUM_FEATURES_16,  // 17 input features
+ NUM_LOGITS_16,  // 1 output logit
+ NUM_HIDDEN_LAYERS_16,  // 1 hidden layer
+ {
+ NUM_LAYER_0_UNITS_16,  // 20 units in hidden layer 0
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_16,  // hidden layer 0 kernel (340 floats)
+ av1_simple_motion_search_split_logits_kernel_16,  // logits kernel (20 floats)
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_16,  // hidden layer 0 bias (20 floats)
+ av1_simple_motion_search_split_logits_bias_16,  // logits bias (1 float)
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1  // shape of the _8 split model: one hidden layer
+#define NUM_FEATURES_8 17  // input features
+#define NUM_LAYER_0_UNITS_8 20  // hidden layer 0 units (its kernel holds 17 * 20 = 340 floats)
+#define NUM_LOGITS_8 1  // output logits
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = {
+ 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f,
+ 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f,
+ -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f,
+ 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f,
+ 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f,
+ 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f,
+ -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f,
+ 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f,
+ -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f,
+ 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f,
+ 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f,
+ -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f,
+ 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f,
+ -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f,
+ 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f,
+ 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f,
+ -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f,
+ -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f,
+ 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f,
+ 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f,
+ -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f,
+ 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f,
+ 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f,
+ -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f,
+ -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f,
+ 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f,
+ 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f,
+ -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f,
+ -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f,
+ 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f,
+ 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f,
+ 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f,
+ 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f,
+ 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f,
+ -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f,
+ 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f,
+ -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f,
+ -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f,
+ -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f,
+ 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f,
+ -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f,
+ -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f,
+ 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f,
+ 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f,
+ 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f,
+ -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f,
+ 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f,
+ 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f,
+ 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f,
+ 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f,
+ 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f,
+ -0.390838f, 0.855488f, -0.596525f, -0.249093f, 0.124262f,
+ 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f,
+ -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f,
+ -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f,
+ -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f,
+ -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f,
+ 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f,
+ -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f,
+ 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f,
+ 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f,
+ 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f,
+ 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f,
+ -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f,
+ 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f,
+ 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f,
+ -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f,
+ -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = {
+ 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f,
+ -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f,
+ -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_8[] = {
+ -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f,
+ -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f,
+ 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f,
+ 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_8[] = {
+ -1.739754f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = {  // split model, _8 variant
+ NUM_FEATURES_8,  // 17 input features
+ NUM_LOGITS_8,  // 1 output logit
+ NUM_HIDDEN_LAYERS_8,  // 1 hidden layer
+ {
+ NUM_LAYER_0_UNITS_8,  // 20 units in hidden layer 0
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_8,  // hidden layer 0 kernel (340 floats)
+ av1_simple_motion_search_split_logits_kernel_8,  // logits kernel (20 floats)
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_8,  // hidden layer 0 bias (20 floats)
+ av1_simple_motion_search_split_logits_bias_8,  // logits bias (1 float)
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = {  // split models, largest size suffix first
+ &av1_simple_motion_search_split_nn_config_128,  // [0]
+ &av1_simple_motion_search_split_nn_config_64,  // [1]
+ &av1_simple_motion_search_split_nn_config_32,  // [2]
+ &av1_simple_motion_search_split_nn_config_16,  // [3]
+ &av1_simple_motion_search_split_nn_config_8,  // [4]
+};
+
+// Model based on simple_motion_search for pruning rect.
+// Thresholds, indexed as [aggressiveness][frame resolution][bsize]:
+// 4 aggressiveness levels x 3 resolutions (low/mid/HD) x 5 block sizes.
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+ // Aggressiveness = 0 (the same thresholds are used at every resolution)
+ {
+ // Lowres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Midres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Hdres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.116076f,
+ 0.049759f,
+ 0.057747f,
+ 0.006001f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.017380f,
+ 0.026077f,
+ 0.078111f,
+ 0.064477f,
+ },
+ // Hdres
+ {
+ 0.002994f,
+ 0.103093f,
+ 0.076408f,
+ 0.010456f,
+ 0.187211f,
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.003111f,
+ 0.144294f,
+ 0.144884f,
+ 0.069924f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.013696f,
+ 0.055203f,
+ 0.152271f,
+ 0.078886f,
+ },
+ // Hdres
+ {
+ 0.030577f,
+ 0.082486f,
+ 0.040690f,
+ 0.140924f,
+ 0.067608f,
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // Lowres
+ { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+ 0.287219697095f },
+ // Midres
+ { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+ 0.178833795641f },
+ // Hdres
+ { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+ 0.21329309279f },
+ },
+};
+
+// Mean and std tables for prune_rect (25 entries each), one pair per size suffix.
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+ 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+ 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+ 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+ 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+ 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+ 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+ 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+ 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+ 1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+ 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+ 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+ 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+ 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+ 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+ 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+ 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+ 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+ 1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+ 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+ 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+ 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+ 2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+ 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+ 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+ 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
+ 0.952221f, 0.188018f, 0.985295f, 0.946228f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_16[25] = {
+ 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
+ 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
+ 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
+ 2.131698f, 0.981005f, 2.110868f, 2.106539f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_16[25] = {
+ 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
+ 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
+ 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
+ 0.829935f, 0.136507f, 0.828972f, 0.808563f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_8[25] = {
+ 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
+ 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
+ 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
+ 1.531762f, 0.989606f, 1.496581f, 1.484139f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_8[25] = {
+ 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
+ 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
+ 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
+ 0.754040f, 0.101419f, 0.738239f, 0.729455f,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_mean[5] = {  // mean tables, largest size suffix first
+ av1_simple_motion_search_prune_rect_mean_128,  // [0]
+ av1_simple_motion_search_prune_rect_mean_64,  // [1]
+ av1_simple_motion_search_prune_rect_mean_32,  // [2]
+ av1_simple_motion_search_prune_rect_mean_16,  // [3]
+ av1_simple_motion_search_prune_rect_mean_8,  // [4]
+};
+
+static const float *const av1_simple_motion_search_prune_rect_std[5] = {  // std tables, same index order as the mean tables
+ av1_simple_motion_search_prune_rect_std_128,  // [0]
+ av1_simple_motion_search_prune_rect_std_64,  // [1]
+ av1_simple_motion_search_prune_rect_std_32,  // [2]
+ av1_simple_motion_search_prune_rect_std_16,  // [3]
+ av1_simple_motion_search_prune_rect_std_8,  // [4]
+};
+
+#define NUM_HIDDEN_LAYERS_128 1  // shape of the _128 prune_rect model: one hidden layer
+#define NUM_FEATURES_128 25  // input features (same length as the 25-entry mean/std tables)
+#define NUM_LAYER_0_UNITS_128 8  // hidden layer 0 units (its kernel holds 25 * 8 = 200 floats)
+#define NUM_LOGITS_128 4  // output logits (logits kernel holds 8 * 4 = 32 floats)
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = {
+ -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f,
+ -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f,
+ 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f,
+ -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f,
+ 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f,
+ 0.398452f, 0.696949f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = {
+ 1.22789f, -1.34527f, 0.759048f, 0.315086f,
+ 1.0834f, -1.58019f, -0.465158f, 1.20716f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = {
+ -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f,
+ 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f,
+ -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f,
+ 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f,
+ -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f,
+ -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f,
+ -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f,
+ 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f,
+ 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f,
+ 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f,
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f,
+ -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f,
+ 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f,
+ -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f,
+ -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f,
+ 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f,
+ -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f,
+ 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f,
+ 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f,
+ -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f,
+ 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f,
+ -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f,
+ -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f,
+ -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f,
+ 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f,
+ -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f,
+ 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f,
+ -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f,
+ 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f,
+ 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f,
+ -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f,
+ -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f,
+ 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f,
+ -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f,
+ 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f,
+ 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f,
+ -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f,
+ 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f,
+ 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f,
+ -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = {
+ 1.58571f, -4.6314f, -2.00273f, 0.543699f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = {  // bundles the "128" tables; positional init, so order must match NN_CONFIG (declared outside this chunk)
+ NUM_FEATURES_128,  // 25 -- presumably the input count; TODO confirm against the NN_CONFIG declaration
+ NUM_LOGITS_128,  // 4 -- presumably the output count; TODO confirm
+ NUM_HIDDEN_LAYERS_128,  // 1
+ {
+ NUM_LAYER_0_UNITS_128,  // 8 -- one entry, consistent with NUM_HIDDEN_LAYERS_128 == 1
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_128,  // 200 floats (25 * 8)
+ av1_simple_motion_search_prune_rect_logits_kernel_128,  // 32 floats (8 * 4)
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_128,  // 8 floats
+ av1_simple_motion_search_prune_rect_logits_bias_128,  // 4 floats
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128  // the four *_128 size macros are referenced only by the 128 config initializer; removed here
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
+
+#define NUM_HIDDEN_LAYERS_64 1  // the 64 config initializer lists exactly one hidden-layer width
+#define NUM_FEATURES_64 25  // 25 * 32 = 800 values in layer_0_kernel_64
+#define NUM_LAYER_0_UNITS_64 32  // equals the number of values in layer_0_bias_64
+#define NUM_LOGITS_64 10  // equals the number of values in logits_bias_64; 32 * 10 = 320 values in logits_kernel_64
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = {
+ 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f,
+ -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f,
+ 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f,
+ -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f,
+ 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f,
+ 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f,
+ 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f,
+ -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f,
+ -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f,
+ -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f,
+ 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f,
+ -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f,
+ -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f,
+ 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f,
+ 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f,
+ -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f,
+ -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f,
+ 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f,
+ 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f,
+ 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f,
+ -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f,
+ 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f,
+ -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f,
+ -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f,
+ -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f,
+ -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f,
+ 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f,
+ 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f,
+ 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f,
+ -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f,
+ -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f,
+ -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f,
+ -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f,
+ -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f,
+ -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f,
+ -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f,
+ -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f,
+ -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f,
+ -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f,
+ -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f,
+ -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f,
+ -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f,
+ 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f,
+ 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f,
+ -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f,
+ 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f,
+ -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f,
+ -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f,
+ -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f,
+ -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f,
+ -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f,
+ -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f,
+ -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f,
+ -0.359633f, 0.668108f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = {
+ 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f,
+ -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f,
+ 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f,
+ -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f,
+ 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f,
+ 0.656818f, 0.0169274f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = {
+ -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f,
+ 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f,
+ 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f,
+ -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f,
+ 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f,
+ 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f,
+ -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f,
+ 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f,
+ -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f,
+ 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f,
+ -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f,
+ -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f,
+ -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f,
+ 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f,
+ 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f,
+ 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f,
+ -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f,
+ -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f,
+ 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f,
+ 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f,
+ -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f,
+ 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f,
+ -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f,
+ 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f,
+ 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f,
+ -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f,
+ 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f,
+ -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f,
+ -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f,
+ 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f,
+ -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f,
+ 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f,
+ -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f,
+ -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f,
+ 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f,
+ -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f,
+ -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f,
+ -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f,
+ -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f,
+ -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f,
+ -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f,
+ -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f,
+ -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f,
+ 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f,
+ 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f,
+ 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f,
+ -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f,
+ 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f,
+ -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f,
+ -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f,
+ 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f,
+ 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f,
+ 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f,
+ -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f,
+ -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f,
+ 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f,
+ -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f,
+ 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f,
+ -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f,
+ -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f,
+ 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f,
+ 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f,
+ -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f,
+ -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f,
+ 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f,
+ -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f,
+ 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f,
+ -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f,
+ -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f,
+ -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f,
+ -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f,
+ 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f,
+ -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f,
+ 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f,
+ 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f,
+ -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f,
+ -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f,
+ -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f,
+ -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f,
+ 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f,
+ 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f,
+ 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f,
+ -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f,
+ 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f,
+ 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f,
+ -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f,
+ -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f,
+ 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f,
+ 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f,
+ -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f,
+ -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f,
+ -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f,
+ -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f,
+ 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f,
+ -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f,
+ -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f,
+ -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f,
+ -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f,
+ 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f,
+ -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f,
+ -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f,
+ 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f,
+ 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f,
+ -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f,
+ 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f,
+ -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f,
+ -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f,
+ -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f,
+ 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f,
+ -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f,
+ -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f,
+ 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f,
+ -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f,
+ 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f,
+ -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f,
+ -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f,
+ 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f,
+ -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f,
+ 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f,
+ 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f,
+ -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f,
+ -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f,
+ -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f,
+ -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f,
+ -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f,
+ 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f,
+ -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f,
+ 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f,
+ 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f,
+ 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f,
+ 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f,
+ -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f,
+ 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f,
+ 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f,
+ -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f,
+ -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f,
+ -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f,
+ -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f,
+ 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f,
+ 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f,
+ 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f,
+ 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f,
+ -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f,
+ 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f,
+ -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f,
+ -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f,
+ 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f,
+ 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f,
+ -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f,
+ 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f,
+ 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f,
+ 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f,
+ 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f,
+ 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f,
+ -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f,
+ -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f,
+ 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f,
+ -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f,
+ -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f,
+ -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = {
+ 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f,
+ -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = {  // bundles the "64" tables; positional init, so order must match NN_CONFIG (declared outside this chunk)
+ NUM_FEATURES_64,  // 25 -- presumably the input count; TODO confirm against the NN_CONFIG declaration
+ NUM_LOGITS_64,  // 10 -- presumably the output count; TODO confirm
+ NUM_HIDDEN_LAYERS_64,  // 1
+ {
+ NUM_LAYER_0_UNITS_64,  // 32 -- one entry, consistent with NUM_HIDDEN_LAYERS_64 == 1
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_64,  // 800 floats (25 * 32)
+ av1_simple_motion_search_prune_rect_logits_kernel_64,  // 320 floats (32 * 10)
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_64,  // 32 floats
+ av1_simple_motion_search_prune_rect_logits_bias_64,  // 10 floats
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64  // the four *_64 size macros are referenced only by the 64 config initializer; removed here
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1  // the 32 config initializer lists exactly one hidden-layer width
+#define NUM_FEATURES_32 25  // 25 * 28 = 700 values in layer_0_kernel_32
+#define NUM_LAYER_0_UNITS_32 28  // equals the number of values in layer_0_bias_32
+#define NUM_LOGITS_32 10  // equals the number of values in logits_bias_32; 28 * 10 = 280 values in logits_kernel_32
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = {
+ 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f,
+ 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f,
+ -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f,
+ 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f,
+ -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f,
+ -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f,
+ -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f,
+ 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f,
+ 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f,
+ 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f,
+ -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f,
+ 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f,
+ -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f,
+ 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f,
+ -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f,
+ 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f,
+ -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f,
+ 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f,
+ 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f,
+ -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f,
+ 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f,
+ -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f,
+ 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f,
+ 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f,
+ 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f,
+ -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f,
+ -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f,
+ -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f,
+ 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f,
+ -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f,
+ -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f,
+ -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f,
+ -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f,
+ 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f,
+ 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f,
+ 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f,
+ -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f,
+ -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f,
+ 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f,
+ 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f,
+ -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f,
+ 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f,
+ -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f,
+ -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f,
+ 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f,
+ 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f,
+ -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f,
+ -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f,
+ -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f,
+ -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f,
+ 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f,
+ -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f,
+ -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f,
+ -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f,
+ -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f,
+ -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = {
+ 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f,
+ 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f,
+ 0.0909503f, 0.710595f, 0.032786f, 0.525891f, -1.0232f, 0.732557f,
+ -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f,
+ 0.59681f, -0.472405f, 0.0969218f, -0.250624f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = {
+ 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f,
+ -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f,
+ -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f,
+ 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f,
+ 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f,
+ -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f,
+ 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f,
+ -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f,
+ -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f,
+ -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f,
+ 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f,
+ -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f,
+ 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f,
+ 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f,
+ -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f,
+ 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f,
+ -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f,
+ 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f,
+ 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f,
+ 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f,
+ -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f,
+ 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f,
+ -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f,
+ 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f,
+ -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f,
+ -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f,
+ -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f,
+ 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f,
+ -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f,
+ 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f,
+ -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f,
+ 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f,
+ 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f,
+ -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f,
+ 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f,
+ -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f,
+ -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f,
+ -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f,
+ 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f,
+ 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f,
+ -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f,
+ 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f,
+ -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f,
+ -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f,
+ 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f,
+ 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f,
+ -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f,
+ 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f,
+ -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f,
+ -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f,
+ 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f,
+ 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f,
+ -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f,
+ 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f,
+ -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f,
+ -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f,
+ -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f,
+ -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f,
+ -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f,
+ -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f,
+ -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f,
+ 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f,
+ -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f,
+ -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f,
+ 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f,
+ -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f,
+ 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f,
+ 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f,
+ -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f,
+ 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f,
+ -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f,
+ 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f,
+ -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f,
+ 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f,
+ 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f,
+ -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f,
+ 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f,
+ 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f,
+ -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f,
+ 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f,
+ -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f,
+ 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f,
+ -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f,
+ -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f,
+ -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f,
+ -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f,
+ 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f,
+ 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f,
+ 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f,
+ 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f,
+ -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f,
+ -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f,
+ 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f,
+ -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f,
+ 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f,
+ -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f,
+ 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f,
+ -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f,
+ -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f,
+ -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f,
+ -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f,
+ -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f,
+ -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f,
+ 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f,
+ 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f,
+ -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f,
+ 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f,
+ 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f,
+ -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f,
+ 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f,
+ 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f,
+ -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f,
+ -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f,
+ -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f,
+ 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f,
+ -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f,
+ 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f,
+ -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f,
+ 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f,
+ -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f,
+ 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f,
+ 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f,
+ -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f,
+ -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f,
+ -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f,
+ -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f,
+ -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f,
+ -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f,
+ -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f,
+ -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f,
+ -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f,
+ 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f,
+ -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f,
+ 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f,
+ 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f,
+ -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f,
+ 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f,
+ -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f,
+ 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f,
+ -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = {
+ 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f,
+ -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = {  // bundles the "32" tables; positional init, so order must match NN_CONFIG (declared outside this chunk)
+ NUM_FEATURES_32,  // 25 -- presumably the input count; TODO confirm against the NN_CONFIG declaration
+ NUM_LOGITS_32,  // 10 -- presumably the output count; TODO confirm
+ NUM_HIDDEN_LAYERS_32,  // 1
+ {
+ NUM_LAYER_0_UNITS_32,  // 28 -- one entry, consistent with NUM_HIDDEN_LAYERS_32 == 1
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_32,  // 700 floats (25 * 28)
+ av1_simple_motion_search_prune_rect_logits_kernel_32,  // 280 floats (28 * 10)
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_32,  // 28 floats
+ av1_simple_motion_search_prune_rect_logits_bias_32,  // 10 floats
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32  // the four *_32 size macros are referenced only by the 32 config initializer; removed here
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1  // NOTE(review): presumably consumed by a "16" NN_CONFIG initializer later in the file, as for 128/64/32 -- confirm
+#define NUM_FEATURES_16 25  // same value as the 128, 64 and 32 groups
+#define NUM_LAYER_0_UNITS_16 32  // same value as the 64 group
+#define NUM_LOGITS_16 10  // same value as the 64 and 32 groups
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = {
+ -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f,
+ 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f,
+ -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f,
+ 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f,
+ -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f,
+ 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f,
+ 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f,
+ -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f,
+ 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f,
+ 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f,
+ -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f,
+ 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f,
+ -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f,
+ -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f,
+ -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f,
+ -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f,
+ -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f,
+ 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f,
+ -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f,
+ -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f,
+ 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f,
+ -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f,
+ 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f,
+ 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f,
+ 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f,
+ -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f,
+ -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f,
+ -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f,
+ -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f,
+ -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f,
+ 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f,
+ -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f,
+ -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f,
+ 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f,
+ -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f,
+ -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f,
+ -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f,
+ -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f,
+ -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f,
+ 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f,
+ -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f,
+ -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f,
+ -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f,
+ -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f,
+ -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f,
+ -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f,
+ 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f,
+ -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f,
+ -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f,
+ 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f,
+ 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f,
+ -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f,
+ -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f,
+ -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f,
+ 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f,
+ -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f,
+ 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f,
+ 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f,
+ -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f,
+ 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f,
+ -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f,
+ -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f,
+ -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f,
+ -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = {
+ -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f,
+ -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f,
+ 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f,
+ 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f,
+ -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f,
+ 0.661496f, 0.95533f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = {
+ -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f,
+ 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f,
+ 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f,
+ -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f,
+ -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f,
+ -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f,
+ -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f,
+ -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f,
+ 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f,
+ 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f,
+ -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f,
+ -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f,
+ -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f,
+ 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f,
+ -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f,
+ -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f,
+ 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f,
+ 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f,
+ 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f,
+ -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f,
+ 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f,
+ -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f,
+ -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f,
+ 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f,
+ 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f,
+ 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f,
+ -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f,
+ 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f,
+ 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f,
+ 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f,
+ -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f,
+ -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f,
+ -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f,
+ -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f,
+ -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f,
+ -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f,
+ -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f,
+ 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f,
+ -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f,
+ -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f,
+ -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f,
+ 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f,
+ -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f,
+ 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f,
+ 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f,
+ -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f,
+ 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f,
+ 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f,
+ -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f,
+ 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f,
+ -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f,
+ 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f,
+ -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f,
+ 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f,
+ -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f,
+ 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f,
+ 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f,
+ -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f,
+ 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f,
+ 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f,
+ 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f,
+ 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f,
+ 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f,
+ -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f,
+ 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f,
+ -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f,
+ -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f,
+ -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f,
+ 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f,
+ 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f,
+ 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f,
+ -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f,
+ 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f,
+ -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f,
+ -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f,
+ -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f,
+ -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f,
+ -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f,
+ -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f,
+ -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f,
+ 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f,
+ -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f,
+ 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f,
+ 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f,
+ 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f,
+ -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f,
+ 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f,
+ 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f,
+ -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f,
+ 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f,
+ -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f,
+ 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f,
+ -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f,
+ -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f,
+ 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f,
+ -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f,
+ -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f,
+ -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f,
+ -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f,
+ -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f,
+ -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f,
+ 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f,
+ -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f,
+ 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f,
+ 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f,
+ -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f,
+ -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f,
+ -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f,
+ 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f,
+ -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f,
+ -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f,
+ -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f,
+ -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f,
+ 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f,
+ 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f,
+ -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f,
+ -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f,
+ -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f,
+ 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f,
+ -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f,
+ 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f,
+ 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f,
+ -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f,
+ 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f,
+ -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f,
+ -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f,
+ 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f,
+ 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f,
+ -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f,
+ -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f,
+ -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f,
+ -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f,
+ 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f,
+ -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f,
+ -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f,
+ -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f,
+ -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f,
+ 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f,
+ 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f,
+ -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f,
+ -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f,
+ 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f,
+ 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f,
+ -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f,
+ 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f,
+ -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f,
+ 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f,
+ -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f,
+ 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f,
+ 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f,
+ 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f,
+ 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f,
+ 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f,
+ -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f,
+ 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f,
+ 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f,
+ -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f,
+ -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f,
+ 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f,
+ -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = {
+ 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f,
+ -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_16,
+ av1_simple_motion_search_prune_rect_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_16,
+ av1_simple_motion_search_prune_rect_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 25
+#define NUM_LAYER_0_UNITS_8 32
+#define NUM_LOGITS_8 4
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = {
+ -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f,
+ 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f,
+ -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f,
+ 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f,
+ -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f,
+ -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f,
+ 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f,
+ -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f,
+ -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f,
+ 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f,
+ -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f,
+ 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f,
+ -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f,
+ 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f,
+ 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f,
+ -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f,
+ -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f,
+ -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f,
+ 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f,
+ -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f,
+ -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f,
+ -0.112242f, 0.295184f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = {
+ -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f,
+ -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
+ -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f,
+ 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f,
+ -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f,
+ -0.490783f, -0.415782f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = {
+ -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f,
+ 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f,
+ 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f,
+ -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f,
+ -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f,
+ -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f,
+ -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f,
+ 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f,
+ 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f,
+ 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f,
+ -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f,
+ -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f,
+ 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f,
+ 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f,
+ 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f,
+ 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f,
+ -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f,
+ -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f,
+ 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f,
+ -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f,
+ -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f,
+ -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f,
+ 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f,
+ -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f,
+ 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f,
+ -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f,
+ 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f,
+ -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f,
+ -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f,
+ 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f,
+ -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f,
+ 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f,
+ 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f,
+ 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f,
+ 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f,
+ 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f,
+ 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f,
+ -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f,
+ 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f,
+ -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f,
+ -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f,
+ 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f,
+ -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f,
+ 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f,
+ -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f,
+ -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f,
+ 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f,
+ 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f,
+ 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f,
+ 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f,
+ -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f,
+ 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f,
+ -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f,
+ -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f,
+ -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f,
+ -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f,
+ 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f,
+ 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f,
+ -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f,
+ 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f,
+ -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f,
+ 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f,
+ 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f,
+ 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f,
+ -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f,
+ -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f,
+ 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f,
+ -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f,
+ -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f,
+ -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f,
+ 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f,
+ -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f,
+ -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f,
+ 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f,
+ -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f,
+ -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f,
+ -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f,
+ 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f,
+ 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f,
+ 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f,
+ -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f,
+ 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f,
+ -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f,
+ 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f,
+ 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f,
+ -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f,
+ 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f,
+ 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f,
+ -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f,
+ -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f,
+ -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f,
+ 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f,
+ 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f,
+ -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f,
+ -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f,
+ -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f,
+ 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f,
+ -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f,
+ 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f,
+ 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f,
+ -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f,
+ 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f,
+ -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f,
+ 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f,
+ 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f,
+ -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f,
+ 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f,
+ -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f,
+ 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f,
+ 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f,
+ -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f,
+ 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f,
+ -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f,
+ 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f,
+ 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f,
+ 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f,
+ 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f,
+ 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f,
+ 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f,
+ -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f,
+ -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f,
+ -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f,
+ 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f,
+ 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f,
+ 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f,
+ -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f,
+ 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f,
+ -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f,
+ -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f,
+ -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f,
+ -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f,
+ 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f,
+ -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f,
+ 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f,
+ 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f,
+ 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f,
+ -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f,
+ 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f,
+ -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f,
+ -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f,
+ -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f,
+ -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f,
+ 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f,
+ 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f,
+ -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f,
+ -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f,
+ -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f,
+ 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f,
+ -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f,
+ 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f,
+ -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f,
+ 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f,
+ -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f,
+ -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f,
+ 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f,
+ 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f,
+ -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f,
+ -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f,
+ -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f,
+ -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = {
+ 1.63404f, -0.715866f, -1.0132f, -2.08745f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_8,
+ av1_simple_motion_search_prune_rect_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_8,
+ av1_simple_motion_search_prune_rect_logits_bias_8,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG
+ *const av1_simple_motion_search_prune_rect_nn_config[5] = {
+ &av1_simple_motion_search_prune_rect_nn_config_128,
+ &av1_simple_motion_search_prune_rect_nn_config_64,
+ &av1_simple_motion_search_prune_rect_nn_config_32,
+ &av1_simple_motion_search_prune_rect_nn_config_16,
+ &av1_simple_motion_search_prune_rect_nn_config_8,
+ };
+
+// nn model for predicting max square partition level of a superblock
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_FEATURES 13
+#define NUM_LAYER_0_UNITS 48
+#define NUM_LOGITS 4
+
+static const float av1_max_part_pred_logits_kernel[] = {
+ -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f,
+ 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f,
+ 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f,
+ 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f,
+ 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f,
+ 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f,
+ -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f,
+ 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f,
+ -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f,
+ -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f,
+ 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f,
+ 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f,
+ -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f,
+ 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f,
+ -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f,
+ -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f,
+ 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f,
+ 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f,
+ 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f,
+ 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f,
+ -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f,
+ 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f,
+ 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f,
+ 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f,
+ 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f,
+ 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f,
+ 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f,
+ 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f,
+ -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f,
+ -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f,
+ -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f,
+ 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f,
+ -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f,
+ 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f,
+ 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f,
+ -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f,
+ 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f,
+ 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f,
+ 0.208747f, 0.448697f
+};
+
+static const float av1_max_part_pred_layer_0_bias[] = {
+ -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f,
+ 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f,
+ -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f,
+ -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f,
+ -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f,
+ -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f,
+ -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f,
+ 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f
+};
+
+static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f,
+ 1.96217f, 0.728905f };
+
+static const float av1_max_part_pred_layer_0_kernel[] = {
+ 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f,
+ -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f,
+ -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f,
+ 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f,
+ -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f,
+ -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f,
+ -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f,
+ -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f,
+ 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f,
+ -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f,
+ -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f,
+ -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f,
+ -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f,
+ 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f,
+ -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f,
+ -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f,
+ 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f,
+ -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f,
+ -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f,
+ 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f,
+ -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f,
+ -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f,
+ 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f,
+ -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f,
+ -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f,
+ -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f,
+ -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f,
+ -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f,
+ -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f,
+ -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f,
+ 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f,
+ -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f,
+ -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f,
+ -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f,
+ 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f,
+ -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f,
+ -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f,
+ 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f,
+ 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f,
+ -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f,
+ -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f,
+ -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f,
+ 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f,
+ -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f,
+ -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f,
+ -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f,
+ -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f,
+ -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f,
+ 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f,
+ 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f,
+ 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f,
+ -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f,
+ -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f,
+ -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f,
+ -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f,
+ -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f,
+ 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f,
+ -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f,
+ 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f,
+ -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f,
+ 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f,
+ -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f,
+ -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f,
+ 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f,
+ 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f,
+ -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f,
+ 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f,
+ 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f,
+ -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f,
+ 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f,
+ -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f,
+ -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f,
+ 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f,
+ 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f,
+ 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f,
+ -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f,
+ -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f,
+ -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f,
+ -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f,
+ -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f,
+ 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f,
+ -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f,
+ 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f,
+ -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f,
+ -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f,
+ -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f,
+ -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f,
+ -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f,
+ -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f,
+ 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f,
+ -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f,
+ 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f,
+ 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f,
+ -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f,
+ -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f,
+ 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f,
+ -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f,
+ -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f,
+ 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f,
+ 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f,
+ -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f,
+ -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f,
+ 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f,
+ -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f,
+ 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f,
+ 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f,
+ 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f,
+ 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f,
+ -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f,
+ -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f,
+ -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f,
+ 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f,
+ 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f,
+ -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f,
+ -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f,
+ -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f,
+ 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f,
+ -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f,
+ -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f,
+ -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f,
+ 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f,
+ 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f,
+ -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f,
+ -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f,
+ 1.36966f, 0.869475f, -0.0302774f, -0.0537556f
+};
+
+static const NN_CONFIG av1_max_part_pred_nn_config = {  // Positional initializer tying the max_part_pred size macros to their kernel/bias tables.
+ NUM_FEATURES,  // 1st field: feature-count macro (presumably the network input size -- confirm against the NN_CONFIG declaration)
+ NUM_LOGITS,  // 2nd field: logits-count macro (presumably the network output size -- confirm)
+ NUM_HIDDEN_LAYERS,  // 3rd field: hidden-layer-count macro
+ {  // 4th field: unit counts per hidden layer; only layer 0 is listed
+ NUM_LAYER_0_UNITS,
+ },
+ {  // 5th field: weight tables, hidden layer 0 first, then the logits layer
+ av1_max_part_pred_layer_0_kernel,
+ av1_max_part_pred_logits_kernel,
+ },
+ {  // 6th field: bias tables, listed in the same layer order as the weights
+ av1_max_part_pred_layer_0_bias,
+ av1_max_part_pred_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS  // The four size macros used by av1_max_part_pred_nn_config are undefined here so they do not stay defined for the tables below.
+#undef NUM_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
+
+// Early termination in second pass
+static const float av1_simple_motion_search_term_none_mean_128[28] = {  // 28 mean values for the "_128" table set; presumably per-feature normalization means for 128x128 blocks -- TODO confirm at the use site.
+ 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f,
+ 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f,
+ 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f,
+ 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f,
+ 4.298179f, 8.514713f, 14.911736f, 19.825352f,
+};
+
+static const float av1_simple_motion_search_term_none_std_128[28] = {  // 28 standard-deviation values for the "_128" table set; presumably paired index-for-index with the matching _mean_128 table -- TODO confirm at the use site.
+ 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f,
+ 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f,
+ 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f,
+ 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_64[28] = {  // 28 mean values for the "_64" table set; presumably per-feature normalization means for 64x64 blocks -- TODO confirm at the use site.
+ 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f,
+ 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f,
+ 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f,
+ 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f,
+ 3.573322f, 8.807137f, 13.348477f, 18.269117f,
+};
+
+static const float av1_simple_motion_search_term_none_std_64[28] = {  // 28 standard-deviation values for the "_64" table set; presumably paired index-for-index with the matching _mean_64 table -- TODO confirm at the use site.
+ 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f,
+ 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f,
+ 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f,
+ 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_32[28] = {  // 28 mean values for the "_32" table set; presumably per-feature normalization means for 32x32 blocks -- TODO confirm at the use site.
+ 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f,
+ 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f,
+ 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f,
+ 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f,
+};
+
+static const float av1_simple_motion_search_term_none_std_32[28] = {  // 28 standard-deviation values for the "_32" table set; presumably paired index-for-index with the matching _mean_32 table -- TODO confirm at the use site.
+ 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f,
+ 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f,
+ 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f,
+ 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_16[28] = {  // 28 mean values for the "_16" table set; presumably per-feature normalization means for 16x16 blocks -- TODO confirm at the use site.
+ 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f,
+ 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f,
+ 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f,
+ 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f,
+};
+
+static const float av1_simple_motion_search_term_none_std_16[28] = {  // 28 standard-deviation values for the "_16" table set; presumably paired index-for-index with the matching _mean_16 table -- TODO confirm at the use site.
+ 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f,
+ 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f,
+ 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f,
+ 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f,
+};
+
+static const float av1_simple_motion_search_term_none_model_128[] = {  // 29 values (array size comes from the initializer): one more than the 28-entry mean/std tables, presumably 28 weights plus one bias term -- TODO confirm at the use site.
+ -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f,
+ 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f,
+ 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f,
+ 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f,
+ -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f,
+ 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f,
+ 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f,
+ -0.5493146094f,
+};
+
+static const float av1_simple_motion_search_term_none_model_64[] = {  // 29 values (array size comes from the initializer): one more than the 28-entry mean/std tables, presumably 28 weights plus one bias term -- TODO confirm at the use site.
+ -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f,
+ 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f,
+ 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f,
+ -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f,
+ -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f,
+ 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f,
+ 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f,
+ -0.4337360901f,
+};
+
+static const float av1_simple_motion_search_term_none_model_32[] = {  // 29 values (array size comes from the initializer): one more than the 28-entry mean/std tables, presumably 28 weights plus one bias term -- TODO confirm at the use site.
+ -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f,
+ 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f,
+ 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f,
+ -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f,
+ -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f,
+ 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f,
+ 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f,
+ -0.6609679881f,
+};
+
+static const float av1_simple_motion_search_term_none_model_16[] = {  // 29 values (array size comes from the initializer): one more than the 28-entry mean/std tables, presumably 28 weights plus one bias term -- TODO confirm at the use site.
+ -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f,
+ 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f,
+ 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f,
+ -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f,
+ 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f,
+ 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f,
+ 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f,
+ -0.5396254205f,
+};
+
+#define FEATURES 31  // Used as the first field of av1_early_term_after_split_nnconfig_64 (presumably the input feature count -- confirm against NN_CONFIG).
+#define HIDDEN_NODES 32  // Used as the only hidden-layer unit count listed in av1_early_term_after_split_nnconfig_64.
+static const float av1_early_term_after_split_nn_weights_64_layer0[] = {  // 992 values = FEATURES (31) * HIDDEN_NODES (32); first weight table of av1_early_term_after_split_nnconfig_64. Element ordering is not visible here -- check the NN evaluation code.
+ -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f,
+ -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f,
+ -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f,
+ -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f,
+ 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f,
+ 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f,
+ 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f,
+ 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f,
+ -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f,
+ 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f,
+ -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f,
+ -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f,
+ -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f,
+ -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f,
+ -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f,
+ -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f,
+ -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f,
+ 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f,
+ 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f,
+ -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f,
+ -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f,
+ -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f,
+ -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f,
+ -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f,
+ 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f,
+ -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f,
+ -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f,
+ -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f,
+ 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f,
+ 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f,
+ -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f,
+ -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f,
+ 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f,
+ 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f,
+ 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f,
+ 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f,
+ -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f,
+ 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f,
+ -0.227820f, 0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f,
+ -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f,
+ -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f,
+ 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f,
+ -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f,
+ -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f,
+ 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f,
+ -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f,
+ -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f,
+ 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f,
+ 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f,
+ -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f,
+ -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f,
+ -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f,
+ -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f,
+ 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f,
+ -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f,
+ -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f,
+ -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f,
+ -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f,
+ -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f,
+ -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f,
+ 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f,
+ -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f,
+ -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f,
+ 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f,
+ -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f,
+ 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f,
+ 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f,
+ 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f,
+ -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f,
+ 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f,
+ 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f,
+ -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f,
+ 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f,
+ 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f,
+ 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f,
+ -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f,
+ -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f,
+ -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f,
+ 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f,
+ 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f,
+ -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f,
+ 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f,
+ -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f,
+ -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f,
+ -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f,
+ -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f,
+ 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f,
+ 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f,
+ -0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f,
+ -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f,
+ 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f,
+ -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f,
+ 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f,
+ 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f,
+ 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f,
+ 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f,
+ -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f,
+ 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f,
+ -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f,
+ 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f,
+ -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f,
+ -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f,
+ -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f,
+ 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f,
+ -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f,
+ -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f,
+ -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f,
+ -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f,
+ 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f,
+ -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f,
+ -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f,
+ -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f,
+ 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f,
+ 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f,
+ -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f,
+ -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f,
+ 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f,
+ 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f,
+ -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f,
+ 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f,
+ -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f,
+ -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f,
+ -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f,
+ 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f,
+ -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f,
+ 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f,
+ -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f,
+ 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f,
+ 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f,
+ 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f,
+ -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f,
+ 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f,
+ 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f,
+ 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f,
+ 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f,
+ -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f,
+ -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f,
+ -0.389184f, -0.256661f, 0.160107f, 0.037127f, -0.082573f, -0.095815f,
+ -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f,
+ -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f,
+ 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f,
+ -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f,
+ 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f,
+ 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f,
+ -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f,
+ -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f,
+ 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f,
+ -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f,
+ 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f,
+ -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f,
+ -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f,
+ -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f,
+ -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f,
+ -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f,
+ 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f,
+ 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f,
+ -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f,
+ -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f,
+ 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f,
+ 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f,
+ 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f,
+ -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f,
+ -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f,
+ -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f,
+ 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f,
+ -0.279423f, -0.270683f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer0[] = {  // 32 values, matching HIDDEN_NODES; first bias table of av1_early_term_after_split_nnconfig_64.
+ -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f,
+ 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f,
+ 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f,
+ 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f,
+ 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f,
+ -0.109614f, -0.164492f,
+};
+
+static const float av1_early_term_after_split_nn_weights_64_layer1[] = {  // 32 values, matching HIDDEN_NODES; second weight table of av1_early_term_after_split_nnconfig_64.
+ -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f,
+ -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f,
+ -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f,
+ -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f,
+ -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f,
+ -0.230698f, -0.011437f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer1[] = {  // Single value; second bias table of av1_early_term_after_split_nnconfig_64.
+ -0.55403697f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = {  // Positional initializer tying the "_64" early-term-after-split sizes to their weight/bias tables.
+ FEATURES,  // 1st field: 31 (presumably the network input size -- confirm against the NN_CONFIG declaration)
+ 1,  // 2nd field: presumably the output count; consistent with the 1-entry layer1 bias table -- confirm
+ 1,  // 3rd field: presumably the hidden-layer count; consistent with the single unit count listed below -- confirm
+ {  // 4th field: unit counts per hidden layer; only one is listed
+ HIDDEN_NODES,
+ },
+ {  // 5th field: weight tables, layer0 then layer1
+ av1_early_term_after_split_nn_weights_64_layer0,
+ av1_early_term_after_split_nn_weights_64_layer1,
+ },
+ {  // 6th field: bias tables, in the same layer order as the weights
+ av1_early_term_after_split_nn_bias_64_layer0,
+ av1_early_term_after_split_nn_bias_64_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer0[] = {
+ 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f,
+ 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, -0.579545f,
+ -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f,
+ 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f,
+ -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f,
+ 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f,
+ -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f,
+ 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f,
+ -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f,
+ 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f,
+ -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f,
+ -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f,
+ 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f,
+ 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f,
+ -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f,
+ 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f,
+ -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f,
+ -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f,
+ -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f,
+ -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f,
+ 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f,
+ -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f,
+ -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f,
+ -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f,
+ -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f,
+ 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f,
+ -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f,
+ 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f,
+ 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f,
+ -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f,
+ -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f,
+ 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f,
+ 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f,
+ 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f,
+ -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f,
+ 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f,
+ -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f,
+ -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f,
+ -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f,
+ -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f,
+ -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f,
+ -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f,
+ -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f,
+ 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f,
+ 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f,
+ -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f,
+ -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f,
+ -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f,
+ -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f,
+ -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f,
+ -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f,
+ -0.293680f, -0.105049f, 0.315317f, 0.279569f, 0.220762f, 0.088161f,
+ -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f,
+ 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f,
+ 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f,
+ -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f,
+ -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f,
+ 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f,
+ 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f,
+ -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f,
+ -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f,
+ 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f,
+ -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f,
+ 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f,
+ -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f,
+ 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f,
+ 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f,
+ 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f,
+ 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f,
+ -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f,
+ -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f,
+ -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f,
+ 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f,
+ -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f,
+ -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f,
+ 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f,
+ -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f,
+ -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f,
+ -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f,
+ -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f,
+ -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f,
+ 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f,
+ -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f,
+ -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f,
+ 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f,
+ 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f,
+ 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f,
+ 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f,
+ 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f,
+ -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f,
+ -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f,
+ 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f,
+ -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f,
+ -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f,
+ -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f,
+ 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f,
+ -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f,
+ 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f,
+ -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f,
+ -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f,
+ -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f,
+ -0.286703f, -0.014739f, -0.072621f, -0.311506f, -0.048639f, 0.081621f,
+ 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f,
+ 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f,
+ 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f,
+ 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f,
+ 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f,
+ -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f,
+ -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f,
+ 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f,
+ 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f,
+ -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f,
+ -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f,
+ -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f,
+ 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f,
+ -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f,
+ 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f,
+ 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f,
+ -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f,
+ -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f,
+ -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f,
+ 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f,
+ 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f,
+ 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f,
+ -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f,
+ 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f,
+ -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f,
+ 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f,
+ 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f,
+ -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f,
+ 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f,
+ -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f,
+ -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f,
+ 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f,
+ 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f,
+ 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f,
+ -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f,
+ -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f,
+ -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f,
+ 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f,
+ -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f,
+ 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f,
+ 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f,
+ 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f,
+ -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f,
+ -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f,
+ 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f,
+ 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f,
+ -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f,
+ 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f,
+ 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f,
+ 0.105991f, -0.536757f, -0.424031f, -0.226034f, 0.213635f, -0.526754f,
+ 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f,
+ -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f,
+ -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f,
+ 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f,
+ 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f,
+ -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f,
+ 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f,
+ 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f,
+ -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f,
+ 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f,
+ 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f,
+ 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f,
+ -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f,
+ 0.129726f, -0.038396f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer0[] = {
+ -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f,
+ 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f,
+ 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f,
+ -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f,
+ -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f,
+ -0.078358f, -0.007740f,
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer1[] = {
+ 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f,
+ -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f,
+ -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f,
+ -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f,
+ -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f,
+ 0.260045f, -0.223243f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer1[] = {
+ -0.07571174f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_32_layer0,
+ av1_early_term_after_split_nn_weights_32_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_32_layer0,
+ av1_early_term_after_split_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer0[] = {
+ -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f,
+ -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f,
+ 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f,
+ -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f,
+ 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f,
+ -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f,
+ 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f,
+ -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f,
+ -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f,
+ -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f,
+ 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f,
+ 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f,
+ 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f,
+ -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f,
+ -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f,
+ -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, -0.172708f,
+ 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f,
+ 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f,
+ 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f,
+ -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f,
+ -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f,
+ -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f,
+ 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f,
+ -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f,
+ -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f,
+ 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f,
+ 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f,
+ -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f,
+ 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f,
+ -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f,
+ -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f,
+ -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f,
+ 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f,
+ 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f,
+ -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f,
+ 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f,
+ 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f,
+ -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f,
+ 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f,
+ -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f,
+ -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f,
+ -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f,
+ -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f,
+ -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f,
+ 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f,
+ -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f,
+ -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f,
+ 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f,
+ -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f,
+ 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f,
+ 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f,
+ 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f,
+ 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f,
+ -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f,
+ 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f,
+ -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f,
+ -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f,
+ -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f,
+ -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f,
+ 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f,
+ 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f,
+ -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f,
+ -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f,
+ -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f,
+ 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f,
+ 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 0.073113f,
+ -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f,
+ 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f,
+ 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f,
+ -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f,
+ -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f,
+ -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f,
+ -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f,
+ -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f,
+ -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f,
+ -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f,
+ 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f,
+ 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f,
+ 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f,
+ -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f,
+ 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f,
+ 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f,
+ -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f,
+ 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f,
+ -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f,
+ -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f,
+ 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f,
+ -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f,
+ 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f,
+ 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f,
+ -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f,
+ -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f,
+ 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f,
+ 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f,
+ -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f,
+ 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f,
+ -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f,
+ 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f,
+ 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f,
+ -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f,
+ 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f,
+ 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f,
+ 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f,
+ -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f,
+ 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f,
+ 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f,
+ 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f,
+ -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f,
+ -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f,
+ 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f,
+ 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f,
+ -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f,
+ 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f,
+ 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f,
+ -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f,
+ -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 0.145802f,
+ -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f,
+ -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f,
+ -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f,
+ -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f,
+ -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f,
+ -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f,
+ -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f,
+ 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f,
+ -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f,
+ 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f,
+ 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f,
+ -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f,
+ -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f,
+ 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f,
+ -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f,
+ 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f,
+ -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f,
+ 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f,
+ -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f,
+ -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f,
+ -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f,
+ 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f,
+ -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f,
+ -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f,
+ -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f,
+ 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f,
+ -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f,
+ 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f,
+ -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f,
+ -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f,
+ -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f,
+ -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f,
+ -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f,
+ -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f,
+ -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f,
+ 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f,
+ -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f,
+ -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f,
+ 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f,
+ 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f,
+ 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f,
+ 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f,
+ 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f,
+ -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f,
+ -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f,
+ -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f,
+ -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f,
+ -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f,
+ 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f,
+ 0.154372f, -0.483994f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer0[] = {
+ -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f,
+ 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f,
+ -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f,
+ -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f,
+ 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f,
+ -0.110874f, -0.111671f,
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer1[] = {
+ -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f,
+ 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f,
+ -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f,
+ -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f,
+ 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f,
+ 0.313278f, -0.555802f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer1[] = {
+ 0.16553f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_16_layer0,
+ av1_early_term_after_split_nn_weights_16_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_16_layer0,
+ av1_early_term_after_split_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer0[] = {
+ -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f,
+ 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f,
+ -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f,
+ -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f,
+ -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f,
+ 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f,
+ 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f,
+ 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f,
+ -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f,
+ -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f,
+ -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f,
+ -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f,
+ -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f,
+ 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f,
+ -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f,
+ 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f,
+ -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f,
+ 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f,
+ 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f,
+ 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f,
+ 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f,
+ -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f,
+ 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f,
+ -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f,
+ 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f,
+ 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f,
+ -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f,
+ -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f,
+ -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f,
+ -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f,
+ 0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f,
+ 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f,
+ 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f,
+ 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f,
+ 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f,
+ 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f,
+ -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f,
+ 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f,
+ 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f,
+ -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f,
+ 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f,
+ -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f,
+ -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f,
+ -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f,
+ 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f,
+ 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f,
+ -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f,
+ -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f,
+ -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f,
+ -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f,
+ -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f,
+ -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f,
+ -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f,
+ 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f,
+ -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f,
+ 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f,
+ -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f,
+ 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f,
+ -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f,
+ -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f,
+ -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f,
+ 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f,
+ 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f,
+ -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f,
+ 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f,
+ -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f,
+ -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f,
+ -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f,
+ 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f,
+ -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f,
+ -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f,
+ -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f,
+ -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f,
+ -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f,
+ -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f,
+ -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f,
+ -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f,
+ -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f,
+ 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f,
+ 0.124249f, -0.550804f, -0.420397f, -0.123462f, 0.333292f, -0.240230f,
+ -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f,
+ 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f,
+ 0.008714f, -0.064018f, -0.124873f, -0.334014f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer0[] = {
+ 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f,
+ 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f,
+ -0.225995f, 0.370877f, -0.214821f, -0.227752f,
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer1[] = {
+ 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f,
+ -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f,
+ 0.077994f, -0.269141f, 0.011180f, -0.019262f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer1[] = {
+ -1.29585564f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ 16,
+ },
+ {
+ av1_early_term_after_split_nn_weights_8_layer0,
+ av1_early_term_after_split_nn_weights_8_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_8_layer0,
+ av1_early_term_after_split_nn_bias_8_layer1,
+ },
+};
+#undef FEATURES
+#undef HIDDEN_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c
new file mode 100644
index 0000000000..1c17b09ee1
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.c
@@ -0,0 +1,6263 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "aom_dsp/txfm_common.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+
+// Restores every partition speed feature in |part_sf| to its default value.
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+  // Search type and partition size limits.
+  part_sf->partition_search_type = SEARCH_PARTITION;
+  part_sf->default_max_partition_size = BLOCK_LARGEST;
+  part_sf->default_min_partition_size = BLOCK_4X4;
+  part_sf->max_intra_bsize = BLOCK_LARGEST;
+  part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+  part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+  part_sf->adjust_var_based_rd_partitioning = 0;
+  // Only takes effect when partition_search_type is FIXED_PARTITION.
+  part_sf->fixed_partition_size = BLOCK_16X16;
+
+  // Breakout controls.
+  part_sf->partition_search_breakout_dist_thr = 0;
+  part_sf->partition_search_breakout_rate_thr = 0;
+  part_sf->ml_predict_breakout_level = 0;
+  // A threshold of -1 means the ML breakout is not enabled for that entry.
+  for (int bs = 0; bs < PARTITION_BLOCK_SIZES; bs++) {
+    part_sf->ml_partition_search_breakout_thresh[bs] = -1;
+  }
+
+  // Model-driven pruning levels.
+  part_sf->ml_prune_partition = 0;
+  part_sf->ml_early_term_after_part_split_level = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
+
+  // Simple-motion-search based features.
+  part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+  part_sf->simple_motion_search_split = 0;
+  part_sf->simple_motion_search_prune_rect = 0;
+  part_sf->simple_motion_search_early_term_none = 0;
+  part_sf->simple_motion_search_reduce_search_steps = 0;
+  part_sf->simple_motion_search_rect_split = 0;
+
+  // Rectangular and extended partition controls.
+  part_sf->less_rectangular_check_level = 0;
+  part_sf->prune_ext_partition_types_search_level = 0;
+  part_sf->prune_part4_search = 0;
+  part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+  part_sf->ext_part_eval_based_on_cur_best = 0;
+  part_sf->prune_ext_part_using_split_info = 0;
+  part_sf->prune_rectangular_split_based_on_qidx = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
+
+  // Remaining early-termination and result-reuse switches.
+  part_sf->early_term_after_none_split = 0;
+  part_sf->prune_sub_8x8_partition_level = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
+  part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+}
+
+// Resets speed features that work for the baseline encoding but block the
+// external partition search. Currently this clears
+// cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions.
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) {
+ cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// If input |features| is NULL, write tpl stats to file for each super block.
+// Otherwise, store tpl stats to |features|.
+// The tpl stats is computed in the unit of tpl_bsize_1d (16x16).
+// When writing to text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The fourth row contains the motion compensated dependency cost for each
+// unit.
+// The file is "<partition_info_path>/tpl_feature_sb<sb_counter>". If it cannot
+// be opened for writing, the dump for this super block is skipped.
+// Nothing is written or stored for (internal) overlay frames. When tpl stats
+// are not ready and |features| is non-NULL, the tpl features are marked as
+// unavailable.
+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col,
+                                 aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats is not established, early return
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    if (features != NULL) features->sb_features.tpl_features.available = 0;
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  // Clip the super block to the frame boundary.
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  // Number of tpl units per row / column, rounding partial units up.
+  const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+  const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+  const int num_blocks = col_steps * row_steps;
+
+  if (features == NULL) {
+    char filename[256];
+    snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+             cpi->oxcf.partition_info_path, cpi->sb_counter);
+    FILE *pfile = fopen(filename, "w");
+    // fopen() returns NULL on failure (e.g. missing or read-only directory).
+    // Passing a NULL stream to fprintf()/fclose() is undefined behavior, so
+    // skip the dump instead.
+    if (pfile == NULL) return;
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+            tpl_data->tpl_bsize_1d, num_blocks);
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        fprintf(pfile, "%.0f", (double)mc_dep_delta);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fclose(pfile);
+  } else {
+    features->sb_features.tpl_features.available = 1;
+    features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+    features->sb_features.tpl_features.num_units = num_blocks;
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        features->sb_features.tpl_features.intra_cost[count] =
+            this_stats->intra_cost;
+        features->sb_features.tpl_features.inter_cost[count] =
+            this_stats->inter_cost;
+        features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+        ++count;
+      }
+    }
+  }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Recursively walks the transform partition of the current block, starting
+// from a transform of size |tx_size| at (|blk_row|, |blk_col|), and records
+// the split / no-split decision at each level:
+// - updates the txfm_partition CDF of the tile context when
+//   |allow_update_cdf| is nonzero,
+// - increments the txfm_partition counters in |counts| (only compiled in
+//   when CONFIG_ENTROPY_STATS is set; |counts| is otherwise unused here),
+// - refreshes the above/left transform contexts via txfm_partition_update(),
+// - increments x->txfm_search_info.txb_split_count for every split.
+// |depth| is the current recursion depth; at MAX_VARTX_DEPTH no counts are
+// added. Positions outside the visible block area are ignored.
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ // The context is read before any txfm_partition_update() call below
+ // modifies the above/left context buffers.
+ int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, mbmi->bsize,
+ tx_size);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ // Transform size stored for this position in the block's mode info.
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {
+ // Don't add to counts in this case
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ if (tx_size == plane_tx_size) {
+ // No split at this level: record symbol 0.
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ } else {
+ // Split at this level: record symbol 1 and descend into the sub-blocks.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+ ++x->txfm_search_info.txb_split_count;
+
+ if (sub_txs == TX_4X4) {
+ // TX_4X4 is not split further (see the assert above), so stop here.
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = row;
+ int offsetc = col;
+
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc, allow_update_cdf);
+ }
+ }
+ }
+}
+
+// Points the above/left transform contexts of the block at its position and
+// then runs update_txfm_count() on every largest-transform-sized unit of a
+// |plane_bsize| block, in raster order.
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize,
+                                      FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const TX_SIZE largest_tx = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int row_step = tx_size_high_unit[largest_tx];
+  const int col_step = tx_size_wide_unit[largest_tx];
+  const int rows_in_mi = mi_size_high[plane_bsize];
+  const int cols_in_mi = mi_size_wide[plane_bsize];
+
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+
+  for (int r = 0; r < rows_in_mi; r += row_step) {
+    for (int c = 0; c < cols_in_mi; c += col_step) {
+      update_txfm_count(x, xd, td_counts, largest_tx, 0, r, c,
+                        allow_update_cdf);
+    }
+  }
+}
+
+// Recursively updates the above/left transform contexts (and mbmi->tx_size)
+// for the transform partition rooted at (|blk_row|, |blk_col|) with size
+// |tx_size|. No counters or CDFs are touched. Positions outside the visible
+// block area are ignored.
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->bsize;
+  const int row_limit = max_block_high(xd, bsize, 0);
+  const int col_limit = max_block_wide(xd, bsize, 0);
+  const int txb_idx = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE coded_tx_size = mbmi->inter_tx_size[txb_idx];
+
+  if (blk_row >= row_limit || blk_col >= col_limit) return;
+
+  // Leaf: the stored transform size matches, so no further split.
+  if (tx_size == coded_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  // An 8x8 transform that is split ends in 4x4; recursion stops here.
+  if (tx_size == TX_8X8) {
+    mbmi->inter_tx_size[txb_idx] = TX_4X4;
+    mbmi->tx_size = TX_4X4;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+    return;
+  }
+
+  // Otherwise descend into each sub-transform that lies inside the block.
+  const TX_SIZE child_tx = sub_tx_size_map[tx_size];
+  const int col_step = tx_size_wide_unit[child_tx];
+  const int row_step = tx_size_high_unit[child_tx];
+  const int rows = AOMMIN(tx_size_high_unit[tx_size], row_limit - blk_row);
+  const int cols = AOMMIN(tx_size_wide_unit[tx_size], col_limit - blk_col);
+  for (int r = 0; r < rows; r += row_step) {
+    for (int c = 0; c < cols; c += col_step) {
+      set_txfm_context(xd, child_tx, blk_row + r, blk_col + c);
+    }
+  }
+}
+
+// Points the above/left transform contexts of the block at its position and
+// then runs set_txfm_context() on every largest-transform-sized unit of a
+// |plane_bsize| block, in raster order.
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+  const TX_SIZE largest_tx = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int row_step = tx_size_high_unit[largest_tx];
+  const int col_step = tx_size_wide_unit[largest_tx];
+  const int rows_in_mi = mi_size_high[plane_bsize];
+  const int cols_in_mi = mi_size_wide[plane_bsize];
+
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+
+  for (int r = 0; r < rows_in_mi; r += row_step) {
+    for (int c = 0; c < cols_in_mi; c += col_step) {
+      set_txfm_context(xd, largest_tx, r, c);
+    }
+  }
+}
+
+// Updates cpi->consec_zero_mv for the area covered by the block. The map is
+// kept in the scale of 8x8 blocks. Only inter blocks that reference LAST_FRAME
+// and whose segment id does not exceed CR_SEGMENT_ID_BOOST2 are considered.
+// For those, each covered entry is incremented (saturating at 255) when both
+// motion vector components have magnitude below 10, and reset to 0 otherwise.
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+                              const MB_MODE_INFO *const mi, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize) {
+  const int eligible = mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) &&
+                       mi->segment_id <= CR_SEGMENT_ID_BOOST2;
+  if (!eligible) return;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const MV mv = mi->mv[0].as_mv;
+  // The motion vector is the same for the whole block, so test it once.
+  const int near_zero_mv = abs(mv.row) < 10 && abs(mv.col) < 10;
+
+  // Everything below is in 8x8 units (mi units halved).
+  const int map_stride = cm->mi_params.mi_cols >> 1;
+  const int num_cols =
+      AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, mi_size_wide[bsize] >> 1);
+  const int num_rows =
+      AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, mi_size_high[bsize] >> 1);
+  const int first_entry = (mi_row >> 1) * map_stride + (mi_col >> 1);
+
+  for (int r = 0; r < num_rows; ++r) {
+    for (int c = 0; c < num_cols; ++c) {
+      const int entry = first_entry + r * map_stride + c;
+      if (!near_zero_mv) {
+        cpi->consec_zero_mv[entry] = 0;
+      } else if (cpi->consec_zero_mv[entry] < 255) {
+        cpi->consec_zero_mv[entry]++;
+      }
+    }
+  }
+}
+
+// Encodes the current coding block of size |bsize| (position taken from
+// xd->mi_row / xd->mi_col of td->mb.e_mbd).
+// - Intra blocks: each plane is encoded with av1_encode_intra_block_plane(),
+//   palette color maps are tokenized (real run) or costed into |*rate|
+//   (dry_run == DRY_RUN_COSTCOEFFS), and the txb contexts are updated.
+// - Inter blocks: reference planes are set up, the inter (and, if selected,
+//   OBMC) predictor is built, then av1_encode_sb() and
+//   av1_tokenize_sb_vartx() are called.
+// Afterwards transform-size state is updated: when |dry_run| is 0 this
+// includes tx-size CDFs/counters and txb_split_count; in all cases the
+// transform contexts are refreshed. When |dry_run| is 0 and the conditions at
+// the end of the function hold, the consecutive-zero-mv map is also updated.
+// |t| is only passed on to av1_tokenize_color_map().
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO **mi_4x4 = xd->mi;
+ MB_MODE_INFO *mbmi = mi_4x4[0];
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_params.mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+
+ // Initialize tx_mode and tx_size_search_method
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ set_tx_size_search_method(
+ cm, &cpi->winner_mode_params, txfm_params,
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (!is_inter) {
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+ // skip_txfm starts at 1 for intra blocks before the planes are encoded.
+ mbmi->skip_txfm = 1;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ }
+
+ // If there is at least one lossless segment, force the skip flag of the
+ // intra block to 0, in order to keep the segment_id from being changed in
+ // write_segment_id().
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+ cpi->enc_seg.has_lossless_segment)
+ mbmi->skip_txfm = 0;
+
+ xd->cfl.store_y = 0;
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ // At most the first two planes are checked for a palette.
+ for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ if (!dry_run) {
+ av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+ PALETTE_MAP, tile_data->allow_update_cdf,
+ td->counts);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ *rate +=
+ av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+ }
+ }
+ }
+ }
+
+ av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize,
+ tile_data->allow_update_cdf);
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Set up the prediction source planes for one or two references.
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const YV12_BUFFER_CONFIG *cfg =
+ get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ xd->block_ref_scale_factors[ref], num_planes);
+ }
+ // Predicted sample of inter mode (for Luma plane) cannot be reused if
+ // nonrd_check_partition_split speed feature is enabled, Since in such cases
+ // the buffer may not contain the predicted sample of best mode.
+ // start_plane == 1 skips rebuilding the first plane's prediction.
+ const int start_plane =
+ (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
+ cm->seq_params->bit_depth == AOM_BITS_8)
+ ? 1
+ : 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ start_plane, av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ assert(cpi->oxcf.motion_mode_cfg.enable_obmc);
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+ pd->subsampling_x, pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
+ cm->current_frame.order_hint, plane, pixel_c,
+ pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+ }
+#else
+ (void)num_planes;
+#endif
+
+ av1_encode_sb(cpi, x, bsize, dry_run);
+ av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
+ tile_data->allow_update_cdf);
+ }
+
+ // Statistics / CDF updates that only happen on the real (non-dry) run.
+ if (!dry_run) {
+ if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 &&
+ !(is_inter && (mbmi->skip_txfm || seg_skip))) {
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, td->counts,
+ tile_data->allow_update_cdf);
+ } else {
+ if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ if (block_signals_txsize(bsize)) {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+
+ if (tile_data->allow_update_cdf)
+ update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+ }
+ }
+ assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+ } else {
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size =
+ tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ intra_tx_size = mbmi->tx_size;
+ }
+
+ // Write the chosen size to every mode-info entry the block covers,
+ // clipped to the frame boundary.
+ const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width);
+ const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height);
+ for (j = 0; j < rows; j++) {
+ for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+ }
+
+ if (intra_tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ }
+ }
+
+ // Refresh the transform contexts. For inter blocks with a signalled
+ // transform partition this is only done here on dry runs; on the real run
+ // tx_partition_count_update() above is the call that reaches this state.
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize) && is_inter &&
+ !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ tx_size = TX_4X4;
+ } else {
+ tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ }
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->width, xd->height,
+ (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd);
+ }
+
+ if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
+ cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
+ }
+ if (!dry_run) {
+ // Zero-mv tracking: one-pass encoding, temporal layer 0, temporal noise
+ // estimation enabled, and either no SVC or (with SVC) the top spatial
+ // layer of a non-key frame.
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 &&
+ cpi->sf.rt_sf.use_temporal_noise_estimate &&
+ (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+ update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
+ }
+}
+
+// Computes the rate-distortion multiplier for the block and stores it in
+// x->rdmult. Starting from cpi->rd.RDMULT, the value is adjusted in turn by:
+// the adaptive-quantization mode |aq_mode|, delta-q (non-realtime builds
+// only), the configured tuning metric, and the ALLINTRA modifier. The result
+// is forced to be at least 1.
+// Side effect: with VARIANCE_AQ and cpi->vaq_refresh set, mbmi->segment_id is
+// overwritten with the block energy.
+// |mbmi| must be non-NULL whenever |aq_mode| is not NO_AQ.
+static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+ x->rdmult = cpi->rd.RDMULT;
+
+ if (aq_mode != NO_AQ) {
+ assert(mbmi != NULL);
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ // Blocks up to 16x16 use the stored x->mb_energy; larger blocks
+ // compute the value with av1_log_block_var().
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
+ }
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // With delta-q present (and RD mode picking), this assignment replaces
+ // whatever the AQ branch above stored in x->rdmult.
+ if (cpi->common.delta_q_info.delta_q_present_flag &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Tuning-specific adjustment. The `else if` branches inside the
+ // preprocessor guards below all chain onto this first `if`, so at most one
+ // of them runs.
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
+ &x->rdmult);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) {
+ // Note: this branch passes the sequence superblock size, not |bsize|.
+ av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit,
+ cpi->common.seq_params->sb_size, mi_row,
+ mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
+ if (cpi->oxcf.mode == ALLINTRA) {
+ // Scale by intra_sb_rdmult_modifier / 128, using 64-bit arithmetic for
+ // the intermediate product.
+ x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7);
+ }
+
+ // Check to make sure that the adjustments above have not caused the
+ // rd multiplier to be truncated to 0.
+ x->rdmult = (x->rdmult > 0) ? x->rdmult : 1;
+}
+
+// Positions the macroblock state |x| (and its embedded MACROBLOCKD) on the
+// block of size |bsize| at (|mi_row|, |mi_col|) inside |tile|: mode-info
+// pointers, entropy and transform contexts, destination and source plane
+// pointers, motion vector limits, plane dimensions, and distance-to-edge
+// information are all set up. The segment id is not touched here.
+// |mi_row| and |mi_col| must be aligned to the block dimensions (asserted).
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+
+ set_entropy_context(xd, mi_row, mi_col, num_planes);
+ // Above context is indexed per tile row and column; left context wraps
+ // using MAX_MIB_MASK.
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+ // Set up limit values for MV components.
+ // Mv beyond the range do not produce new/different prediction block.
+ av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+ mi_width, cpi->oxcf.border_in_pixels);
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+// Positions |x| on the block at (|mi_row|, |mi_col|) via
+// av1_set_offsets_without_segment_id() and then initializes the block's
+// segment id. The id defaults to 0; with segmentation enabled and no variance
+// AQ refresh pending it is looked up in the applicable segmentation map. When
+// segmentation is enabled the plane quantizers are re-initialized for the
+// resulting segment. Debug builds also record the location of this call.
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                     MACROBLOCK *const x, int mi_row, int mi_col,
+                     BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+
+  av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+  // Read the mode info pointer only after the offsets have been set.
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  mbmi->segment_id = 0;
+
+  if (seg->enabled && !cpi->vaq_refresh) {
+    // Use the map being written this frame if it is updated, otherwise the
+    // previous frame's map.
+    const uint8_t *const seg_map =
+        seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+    if (seg_map) {
+      mbmi->segment_id =
+          get_segment_id(&cm->mi_params, seg_map, bsize, mi_row, mi_col);
+    } else {
+      mbmi->segment_id = 0;
+    }
+  }
+  if (seg->enabled) {
+    av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0);
+  }
+
+#ifndef NDEBUG
+  x->last_set_offsets_loc.mi_row = mi_row;
+  x->last_set_offsets_loc.mi_col = mi_col;
+  x->last_set_offsets_loc.bsize = bsize;
+#endif  // NDEBUG
+}
+
+/*!\brief Hybrid intra mode search.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This is top level function for mode search for intra frames in non-RD
+ * optimized case. Depending on speed feature and block size it calls
+ * either non-RD or RD optimized intra mode search.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+
+static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi,
+                                                MACROBLOCK *const x,
+                                                RD_STATS *rd_cost,
+                                                BLOCK_SIZE bsize,
+                                                PICK_MODE_CONTEXT *ctx) {
+  const int pickmode_level = cpi->sf.rt_sf.hybrid_intra_pickmode;
+
+  // The RD search is only considered for blocks smaller than 16x16, and only
+  // when the source variance reaches the threshold for the configured level.
+  if (pickmode_level && bsize < BLOCK_16X16) {
+    // Variance thresholds for levels 1, 2 and 3 respectively.
+    static const unsigned int kVarThresh[3] = { 0, 101, 201 };
+    assert(pickmode_level <= 3);
+    if (x->source_variance >= kVarThresh[pickmode_level - 1]) {
+      av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+      return;
+    }
+  }
+
+  // Everything else goes through the non-RD intra mode search.
+  av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+// For real time/allintra row-mt enabled multi-threaded encoding with cost
+// update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated
+// at superblock level. Thus, it is not required for the encoding of top-right
+// superblock be complete for updating tile ctxt. However, when encoding a block
+// whose right edge is also the superblock edge, intra and inter mode evaluation
+// (ref mv list population) require the encoding of the top-right superblock to
+// be complete. So, here, we delay the waiting of threads until the need for the
+// data from the top-right superblock region.
+static AOM_INLINE void wait_for_top_right_sb(
+    AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
+    TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
+    BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int sb_mi = mi_size_wide[sb_size];
+  const int in_sb_mask = sb_mi - 1;
+
+  // Only the top-right block of a superblock needs to wait: it must sit on
+  // the superblock's first row ...
+  if ((mi_row & in_sb_mask) != 0) return;
+  // ... and its right edge must reach the superblock's right edge.
+  if ((mi_col & in_sb_mask) + mi_size_wide[bsize] < sb_mi) return;
+
+  // Block until the top-right superblock has finished encoding.
+  const int sb_row = (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+  const int sb_col = (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+  enc_row_mt->sync_read_ptr(row_mt_sync, sb_row, sb_col);
+}
+
+/*!\brief Interface for AV1 mode search for an individual coding block
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level interface that
+ * directs the encoder to the proper mode search function, among these
+ * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] partition Partition mode of the parent block
+ * \param[in] bsize Current block size
+ * \param[in] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ * \param[in] best_rd Upper bound of rd cost of a valid partition
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, and the rate-distortion stats are
+ * stored in rd_cost. If no valid mode leads to rd_cost <= best_rd, this is
+ * signalled by rd_cost->rdcost being set to INT64_MAX.
+ */
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                          MACROBLOCK *const x, int mi_row, int mi_col,
+                          RD_STATS *rd_cost, PARTITION_TYPE partition,
+                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                          RD_STATS best_rd) {
+  // With rd-based pruning enabled, a negative budget is reported as an
+  // invalid result without running any mode search.
+  if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) {
+    ctx->rd_stats.rdcost = INT64_MAX;
+    ctx->rd_stats.skip_txfm = 0;
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+
+  av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+  // Hand back the stats already stored in ctx when reuse is enabled and ctx
+  // is flagged as ready.
+  if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab &&
+      ctx->rd_mode_is_ready) {
+    assert(ctx->mic.bsize == bsize);
+    assert(ctx->mic.partition == partition);
+    rd_cost->rate = ctx->rd_stats.rate;
+    rd_cost->dist = ctx->rd_stats.dist;
+    rd_cost->rdcost = ctx->rd_stats.rdcost;
+    return;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const enc_planes = x->plane;
+  struct macroblockd_plane *const dec_planes = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *const txfm_info = &x->txfm_search_info;
+
+  // Only required for row-mt multi-threaded encoding in real time/allintra
+  // mode when the cost update frequency is COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->bsize = bsize;
+  mbmi->partition = partition;
+
+#if CONFIG_RD_DEBUG
+  mbmi->mi_row = mi_row;
+  mbmi->mi_col = mi_col;
+#endif
+
+  // Point MACROBLOCKD at the tx_type_map buffer owned by the search info.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+
+  // Route the per-plane coefficient buffers to the ones held by ctx.
+  for (int plane_idx = 0; plane_idx < num_planes; ++plane_idx) {
+    struct macroblock_plane *const mp = &enc_planes[plane_idx];
+    mp->coeff = ctx->coeff[plane_idx];
+    mp->qcoeff = ctx->qcoeff[plane_idx];
+    mp->dqcoeff = ctx->dqcoeff[plane_idx];
+    mp->eobs = ctx->eobs[plane_idx];
+    mp->txb_entropy_ctx = ctx->txb_entropy_ctx[plane_idx];
+  }
+
+  for (int map_idx = 0; map_idx < 2; ++map_idx) {
+    dec_planes[map_idx].color_index_map = ctx->color_index_map[map_idx];
+  }
+
+  ctx->skippable = 0;
+  // Cleared so that stats of the previously encoded frame are not reused.
+  mbmi->skip_txfm = 0;
+  // Clear the skip mode flag as well.
+  mbmi->skip_mode = 0;
+
+  x->source_variance = av1_get_perpixel_variance_facade(
+      cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+  // Start from the default mode evaluation parameters.
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  // Remember the incoming rdmult; it is put back once the search is done.
+  const int saved_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Derive error per bit from the rdmult now in effect.
+  av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+  av1_rd_cost_update(x->rdmult, &best_rd);
+
+  // An INT64_MAX best_rd.rdcost stops the encoder from using any earlier
+  // rdcost information in the mode search below. Turning the feature off may
+  // give some coding gain at the price of a slower encoder.
+  if (!cpi->sf.part_sf.use_best_rd_for_pruning) {
+    av1_invalid_rd_stats(&best_rd);
+  }
+
+  // Pick the best coding mode and reconstruct the MB so that it can serve as
+  // a predictor for the MBs that follow in the SB.
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+    av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+    if (!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx,
+                             best_rd.rdcost);
+    } else {
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx, best_rd.rdcost);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+  }
+
+  // For AQ mode 2, use the resulting rate to make a segment choice.
+  if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+      bsize >= BLOCK_16X16) {
+    av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+  }
+
+  x->rdmult = saved_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handle.
+  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+  // Keep a copy of the final stats in ctx.
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { // Adapts the CDFs in xd->tile_ctx (and, under CONFIG_ENTROPY_STATS, the counters in td->counts) to the symbols of the block in xd->mi[0].
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+ if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
+ is_comp_ref_allowed(bsize)) {
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+ }
+
+ if (!mbmi->skip_mode && !seg_ref_active) {
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2);
+ }
+
+#if CONFIG_ENTROPY_STATS
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag &&
+ (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+ super_block_upper_left) {
+ const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
+ delta_q_info->delta_q_res;
+ const int absdq = abs(dq);
+ for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf_multi[lf_id][i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+ }
+ } else {
+ const int delta_lf =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf[abs_delta_lf][0]++;
+ }
+ }
+ }
+#endif // CONFIG_ENTROPY_STATS
+
+ if (!is_inter_block(mbmi)) {
+ av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+ frame_is_intra_only(cm));
+ }
+
+ if (av1_allow_intrabc(cm)) {
+ const int is_intrabc = is_intrabc_block(mbmi);
+ update_cdf(fc->intrabc_cdf, is_intrabc, 2);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intrabc[is_intrabc];
+#endif // CONFIG_ENTROPY_STATS
+ if (is_intrabc) {
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc,
+ MV_SUBPEL_NONE);
+ }
+ }
+
+ if (frame_is_intra_only(cm) || mbmi->skip_mode) return; // Nothing below is updated for intra-only frames or skip-mode blocks.
+
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
+
+ if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+ inter_block, 2);
+ // If the segment reference feature is enabled we have only a single
+ // reference frame allowed for the segment so exclude it from
+ // the reference frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->comp_inter[av1_get_reference_mode_context(xd)]
+ [has_second_ref(mbmi)]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+ COMP_REFERENCE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+ [comp_ref_type]++;
+#endif // CONFIG_ENTROPY_STATS
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = (ref0 == BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts
+ ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+ [bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit1) {
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+ ref1 == GOLDEN_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+ [ref1 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+ update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+ [ref0 == LAST2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref1 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+ ref1 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+ [ref1 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 >= BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit) {
+ assert(ref0 <= ALTREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref0 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+ ref0 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+ [ref0 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit1) {
+ update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ }
+
+ if (cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][1]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->interintra_mode_cdf[bsize_group],
+ mbmi->interintra_mode, INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->wedge_interintra_cdf[bsize],
+ mbmi->use_wedge_interintra, 2);
+ if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+ 16);
+ }
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][0]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+ }
+ }
+
+ const MOTION_MODE motion_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+ MOTION_MODES);
+ } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ if (masked_compound_used) {
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+ mbmi->comp_group_idx, 2);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+ 2);
+ } else {
+ assert(masked_compound_used);
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+ COMPOUND_WEDGE];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->compound_type_cdf[bsize],
+ mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ MASKED_COMPOUND_TYPES);
+ }
+ }
+ }
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interinter_comp.wedge_index, 16);
+ }
+ }
+ }
+ }
+
+ if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+ av1_is_interp_needed(xd)) {
+ update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter);
+ }
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+ } else {
+ av1_update_inter_mode_stats(fc, counts, mode, mode_ctx);
+ }
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+ if (new_mv) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 0; idx < 2; ++idx) { // One drl_cdf update per idx; the walk ends at the first idx equal to ref_mv_idx.
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif // CONFIG_ENTROPY_STATS
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 1; idx < 3; ++idx) { // Same walk as for new_mv, but idx starts at 1 and ref_mv_idx is compared against idx - 1.
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif // CONFIG_ENTROPY_STATS
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ const int allow_hp = cm->features.cur_frame_force_integer_mv // MV_SUBPEL_NONE is passed on when integer MVs are forced for the frame.
+ ? MV_SUBPEL_NONE
+ : cm->features.allow_high_precision_mv;
+ if (new_mv) {
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+ const int ref = 1; // Only mv[1] is fed to av1_update_mv_stats() for these modes.
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+ const int ref = 0; // Only mv[0] is fed to av1_update_mv_stats() for these modes.
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ }
+ }
+}
+
+/*!\brief Encodes and reconstructs one coding block
+ *
+ * \ingroup partition_search
+ * Applies the modes stored in ctx to reconstruct a single coding block. On a
+ * non-dry run the block-level state is finalized as well, and the mode counts
+ * and entropy models are updated when the tile allows CDF updates.
+ *
+ * \param[in]    cpi        Top-level encoder structure
+ * \param[in]    tile_data  Pointer to struct holding adaptive
+ *                          data/contexts/models for the tile during encoding
+ * \param[in]    td         Pointer to thread data
+ * \param[in]    tp         Pointer to the starting token
+ * \param[in]    mi_row     Row coordinate of the block in units of MI_SIZE
+ * \param[in]    mi_col     Column coordinate of the block in units of MI_SIZE
+ * \param[in]    dry_run    A code indicating whether this call is part of the
+ *                          final pass for reconstructing the superblock
+ * \param[in]    bsize      Current block size
+ * \param[in]    partition  Partition mode of the parent block
+ * \param[in]    ctx        Pointer to structure holding coding contexts and
+ *                          the chosen modes for the current block
+ * \param[in]    rate       Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. The reconstruction (w/o in-loop filters) is
+ * written to the pixel buffers in td->mb.e_mbd, and the chosen modes are
+ * stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
+ */
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                     ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
+                     int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int ss_x = cm->seq_params->subsampling_x;
+  const int ss_y = cm->seq_params->subsampling_y;
+
+  av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                     mi_col, bsize);
+  // x->rdmult is put back to this value before returning.
+  const int saved_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cm->seq_params->sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cm->seq_params->sb_size]) >>
+            (ss_x + ss_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, ss_x, ss_y);
+
+    // A superblock-sized block with skip_txfm set takes its loop filter
+    // deltas from xd.
+    if (bsize == cm->seq_params->sb_size && mbmi->skip_txfm == 1 &&
+        cm->delta_q_info.delta_lf_present_flag) {
+      const int frame_lf_count =
+          av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+      for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+        mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+      }
+      mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+    }
+
+    if (has_second_ref(mbmi)) {
+      const int is_group_zero =
+          mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE;
+      mbmi->comp_group_idx = is_group_zero ? 0 : 1;
+    }
+
+    // delta quant applies to both intra and inter
+    const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+    const int super_block_upper_left =
+        ((mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params->mib_size - 1)) == 0);
+    if (delta_q_info->delta_q_present_flag &&
+        (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+        super_block_upper_left) {
+      xd->current_base_qindex = mbmi->current_qindex;
+      if (delta_q_info->delta_lf_present_flag) {
+        if (!delta_q_info->delta_lf_multi) {
+          xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+        } else {
+          const int frame_lf_count =
+              av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+          for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+            xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+          }
+        }
+      }
+    }
+
+    RD_COUNTS *const rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+        assert(has_second_ref(mbmi));
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      // When the segment reference feature is active only one reference
+      // frame is allowed for the segment, so such blocks are left out of the
+      // reference frame counts used to work out probabilities.
+      if (!seg_ref_active && is_inter_block(mbmi)) {
+        av1_collect_neighbors_ref_counts(xd);
+        // This flag is also updated for 4x4 blocks
+        if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+            has_second_ref(mbmi)) {
+          rdc->compound_ref_used_flag = 1;
+        }
+        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      }
+    }
+
+    if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+    // Collect obmc and warped motion usage counts for the probability update.
+    const int collect_obmc = cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+                             cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX;
+    const int collect_warped = cm->features.allow_warped_motion &&
+                               cpi->sf.inter_sf.prune_warped_prob_thresh > 0;
+    if (collect_obmc || collect_warped) {
+      const int inter_block = is_inter_block(mbmi);
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active && inter_block) {
+        const MOTION_MODE motion_allowed =
+            cm->features.switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->features.allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+
+        if (mbmi->ref_frame[1] != INTRA_FRAME) {
+          if (motion_allowed >= OBMC_CAUSAL) {
+            rdc->obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+          }
+          if (motion_allowed == WARPED_CAUSAL) {
+            rdc->warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+          }
+        }
+      }
+    }
+  }
+  // TODO(Ravi/Remya): Move this copy function to a better logical place
+  // Copies the best mode information from block level (x->mbmi_ext) to frame
+  // level (cpi->mbmi_ext_info.frame_base). The frame level buffer
+  // (cpi->mbmi_ext_info.frame_base) is used during bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = saved_rdmult;
+}
+
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree. PARTITION_SPLIT recurses into this
+ * function; every other partition type calls encode_b() once per
+ * sub-block.
+ *
+ * \param[in]    cpi        Top-level encoder structure
+ * \param[in]    td         Pointer to thread data
+ * \param[in]    tile_data  Pointer to struct holding adaptive
+ *                          data/contexts/models for the tile during encoding
+ * \param[in]    tp         Pointer to the starting token
+ * \param[in]    mi_row     Row coordinate of the block in a step size of
+ *                          MI_SIZE
+ * \param[in]    mi_col     Column coordinate of the block in a step size of
+ *                          MI_SIZE
+ * \param[in]    dry_run    A code indicating whether it is part of the final
+ *                          pass for reconstructing the superblock
+ * \param[in]    bsize      Current block size
+ * \param[in]    pc_tree    Pointer to the PC_TREE node storing the picked
+ *                          partitions and mode info for the current block
+ * \param[in]    rate       Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd. The function returns
+ * early, doing nothing, when the block origin lies outside the frame or the
+ * partition yields BLOCK_INVALID.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+                      TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+                      int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                      PC_TREE *pc_tree, int *rate) {
+  // Guards the mi_size_wide[] lookups below.
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int is_partition_root = bsize >= BLOCK_8X8;
+  // -1 marks blocks below 8x8, for which no partition symbol is counted.
+  const int ctx = is_partition_root
+                      ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                      : -1;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+#if !CONFIG_REALTIME_ONLY
+  const int quarter_step = mi_size_wide[bsize] / 4;
+  const BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+  if (subsize == BLOCK_INVALID) return;
+
+  // The partition symbol is only accounted for when both the lower and the
+  // right half of the block start inside the frame.
+  if (!dry_run && ctx >= 0) {
+    const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+    const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+
+    if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+      td->counts->partition[ctx][partition]++;
+#endif
+
+      if (tile_data->allow_update_cdf) {
+        FRAME_CONTEXT *fc = xd->tile_ctx;
+        update_cdf(fc->partition_cdf[ctx], partition,
+                   partition_cdf_length(bsize));
+      }
+    }
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->none, rate);
+      break;
+    case PARTITION_VERT:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->vertical[0], rate);
+      // The right half is skipped when it starts outside the frame.
+      if (mi_col + hbs < mi_params->mi_cols) {
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                 partition, pc_tree->vertical[1], rate);
+      }
+      break;
+    case PARTITION_HORZ:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontal[0], rate);
+      // The lower half is skipped when it starts outside the frame.
+      if (mi_row + hbs < mi_params->mi_rows) {
+        encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal[1], rate);
+      }
+      break;
+    case PARTITION_SPLIT:
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+                pc_tree->split[0], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                pc_tree->split[1], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                pc_tree->split[2], rate);
+      encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                subsize, pc_tree->split[3], rate);
+      break;
+
+#if !CONFIG_REALTIME_ONLY
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontala[2], rate);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->horizontalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->horizontalb[2], rate);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, pc_tree->verticala[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, pc_tree->verticala[2], rate);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+               partition, pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, pc_tree->verticalb[1], rate);
+      encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+               bsize2, partition, pc_tree->verticalb[2], rate);
+      break;
+    case PARTITION_HORZ_4:
+      for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        // Stop at the first strip (after strip 0) starting outside the frame.
+        if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+        encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+                 partition, pc_tree->horizontal4[i], rate);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        // Stop at the first strip (after strip 0) starting outside the frame.
+        if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+        encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+                 partition, pc_tree->vertical4[i], rate);
+      }
+      break;
+#endif
+    default: assert(0 && "Invalid partition type."); break;
+  }
+
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Returns 1 when the rd-based adjustment of variance-based partitioning is
+// enabled for a block of size bsize, and 0 otherwise.
+static AOM_INLINE int is_adjust_var_based_part_enabled(
+    AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf,
+    BLOCK_SIZE bsize) {
+  // Off unless variance-based partitioning is in use and the adjustment
+  // level is neither 0 nor above 2.
+  if (part_sf->partition_search_type != VAR_BASED_PARTITION ||
+      part_sf->adjust_var_based_rd_partitioning == 0 ||
+      part_sf->adjust_var_based_rd_partitioning > 2) {
+    return 0;
+  }
+
+  // Blocks up to 32x32 qualify at every remaining level.
+  if (bsize <= BLOCK_32X32) return 1;
+
+  // Larger blocks qualify only at level 2.
+  if (part_sf->adjust_var_based_rd_partitioning != 2) return 0;
+
+  // At level 2 a 64x64 block also qualifies when the shorter frame side is
+  // at least 360 and the base qindex is above 190.
+  return AOMMIN(cm->width, cm->height) >= 360 &&
+         cm->quant_params.base_qindex > 190 && bsize == BLOCK_64X64;
+}
+
+/*!\brief AV1 block partition search (partition estimation and partial search).
+*
+* \ingroup partition_search
+* Encode the block by applying pre-calculated partition patterns that are
+* represented by coding block sizes stored in the mbmi array. Minor partition
+* adjustments are tested and applied if they lead to lower rd costs. The
+* partition types are limited to a basic set: none, horz, vert, and split.
+*
+* \param[in] cpi Top-level encoder structure
+* \param[in] td Pointer to thread data
+* \param[in] tile_data Pointer to struct holding adaptive
+* data/contexts/models for the tile during encoding
+* \param[in] mib Array representing MB_MODE_INFO pointers for mi
+* blocks starting from the first pixel of the current
+* block
+* \param[in] tp Pointer to the starting token
+* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+* \param[in] mi_col Column coordinate of the block in a step size of
+* MI_SIZE
+* \param[in] bsize Current block size
+* \param[in] rate Pointer to the final rate for encoding the current
+* block
+* \param[in] dist Pointer to the final distortion of the current block
+* \param[in] do_recon Whether the reconstruction function needs to be run,
+* either for finalizing a superblock or providing
+* reference for future sub-partitions
+* \param[in] pc_tree Pointer to the PC_TREE node holding the picked
+* partitions and mode info for the current block
+*
+* \remark Nothing is returned. The pc_tree struct is modified to store the
+* picked partition and modes. The rate and dist are also updated with those
+* corresponding to the best partition found.
+*/
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+ BLOCK_SIZE bs_type = mib[0]->bsize;
+ int use_partition_none = 0;
+ x->try_merge_partition = 0;
+
+ if (pc_tree->none == NULL) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ // In rt mode, currently the min partition size is BLOCK_8X8.
+ assert(bsize >= cpi->sf.part_sf.default_min_partition_size);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+ av1_invalid_rd_stats(&invalid_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ if (partition != PARTITION_NONE &&
+ is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) &&
+ (mi_row + hbs < mi_params->mi_rows &&
+ mi_col + hbs < mi_params->mi_cols)) {
+ assert(bsize > cpi->sf.part_sf.default_min_partition_size);
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ x->try_merge_partition = 1;
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE,
+ bsize, ctx_none, invalid_rdc);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ }
+
+ // Try to skip split partition evaluation based on none partition
+ // characteristics.
+ if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+ use_partition_none = 1;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ mib[0]->bsize = bs_type;
+ pc_tree->partitioning = partition;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ switch (partition) {
+ case PARTITION_NONE:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+ break;
+ case PARTITION_HORZ:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[0],
+ invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < mi_params->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[1],
+ invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < mi_params->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ av1_rd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += mode_costs->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < mi_params->mi_rows ||
+ mi_row + hbs == mi_params->mi_rows) &&
+ (mi_col + bs < mi_params->mi_cols ||
+ mi_col + hbs == mi_params->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ if (pc_tree->split[i]->none == NULL)
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+ invalid_rdc);
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+ // If last_part is better set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->bsize = bs_type;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+
+ chosen_rdc = last_part_rdc;
+ }
+ // If none was better set the partitioning to that.
+ if (none_rdc.rdcost < INT64_MAX &&
+ none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) {
+ mib[0]->bsize = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->seq_params->sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+ x->rdmult = orig_rdmult;
+}
+
+// Encodes one coding block on the non-RD path: applies the mode held in ctx
+// via av1_update_state(), runs encode_superblock(), and, when dry_run is 0,
+// also advances the coefficient-buffer offsets and updates the compound /
+// reference bookkeeping and per-thread counters. x->rdmult is restored to its
+// entry value before returning.
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                           ThreadData *td, TokenExtra **tp, int mi_row,
+                           int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *const ctx, int *rate) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                     mi_col, bsize);
+
+  // Remember rdmult so it can be put back after the block is coded.
+  const int saved_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  mbmi->partition = partition;
+  av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+  const int ss_x = cm->seq_params->subsampling_x;
+  const int ss_y = cm->seq_params->subsampling_y;
+  if (!dry_run) {
+    set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+                   x->cb_offset[PLANE_TYPE_UV]);
+    // Both offsets must stay within one superblock's worth of samples.
+    assert(x->cb_offset[PLANE_TYPE_Y] <
+           (1 << num_pels_log2_lookup[cm->seq_params->sb_size]));
+    assert(x->cb_offset[PLANE_TYPE_UV] <
+           ((1 << num_pels_log2_lookup[cm->seq_params->sb_size]) >>
+            (ss_x + ss_y)));
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+  if (!dry_run) {
+    update_cb_offsets(x, bsize, ss_x, ss_y);
+
+    if (has_second_ref(mbmi)) {
+      const int plain_average =
+          mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE;
+      mbmi->comp_group_idx = plain_average ? 0 : 1;
+      mbmi->compound_idx = 1;
+    }
+
+    RD_COUNTS *const counts = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      counts->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+          has_second_ref(mbmi)) {
+        counts->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else if (!segfeature_active(&cm->seg, mbmi->segment_id,
+                                  SEG_LVL_REF_FRAME) &&
+               is_inter_block(mbmi)) {
+      // When the segment reference feature is active only one reference
+      // frame is allowed for the segment, so such blocks are left out of the
+      // reference frame counts used to work out probabilities.
+      av1_collect_neighbors_ref_counts(xd);
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+          has_second_ref(mbmi)) {
+        // This flag is also updated for 4x4 blocks
+        counts->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    }
+
+    const int newmv_or_intra =
+        mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END;
+    if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+        newmv_or_intra) {
+      const int32_t num_mi = mi_size_high[bsize] * mi_size_wide[bsize];
+      counts->newmv_or_intra_blocks += num_mi;
+    }
+
+    if (tile_data->allow_update_cdf) update_stats(cm, td);
+  }
+
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) {
+    av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run);
+  }
+
+  // TODO(Ravi/Remya): Move this copy function to a better logical place.
+  // Copies the best mode information from block level (x->mbmi_ext) to frame
+  // level (cpi->mbmi_ext_info.frame_base); that frame level buffer is used
+  // during bitstream preparation.
+  av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+                                      av1_ref_frame_type(xd->mi[0]->ref_frame));
+  x->rdmult = saved_rdmult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+}
+
+// Returns the forced zero-MV skip flag for a block of size bsize.
+// A superblock-level value below 2 is returned unchanged. Otherwise the flag
+// is 1 only if, for every plane, the SAD between the source block and the
+// LAST_FRAME buffer set up by av1_setup_pred_block() is below that plane's
+// threshold; superblock-sized blocks always get 0.
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              BLOCK_SIZE bsize) {
+  // Values below 2 are the superblock-level decision; pass them through.
+  if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
+
+  // For a block as large as the superblock the decision was already taken at
+  // superblock level, so it is not repeated here.
+  const AV1_COMMON *const cm = &cpi->common;
+  if (bsize == cm->seq_params->sb_size) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane_count = av1_num_planes(cm);
+
+  // SAD limits: one for luma, and one derived from it for both chroma planes.
+  const unsigned int luma_thresh = cpi->zeromv_skip_thresh_exit_part[bsize];
+  const unsigned int chroma_thresh =
+      CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(luma_thresh);
+
+  // Per-plane LAST_FRAME buffers to compare the source against.
+  const YV12_BUFFER_CONFIG *const last_buf =
+      get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const struct scale_factors *const last_sf =
+      get_ref_scale_factors_const(cm, LAST_FRAME);
+  struct buf_2d ref_planes[MAX_MB_PLANE];
+  av1_setup_pred_block(xd, ref_planes, last_buf, last_sf, last_sf,
+                       plane_count);
+
+  for (int plane = 0; plane < plane_count; ++plane) {
+    const struct buf_2d *const src = &x->plane[plane].src;
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, xd->plane[plane].subsampling_x,
+                             xd->plane[plane].subsampling_y);
+    const unsigned int sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+        src->buf, src->stride, ref_planes[plane].buf,
+        ref_planes[plane].stride);
+    assert(plane < MAX_MB_PLANE);
+    const unsigned int limit = (plane == 0) ? luma_thresh : chroma_thresh;
+    // One plane at or above its limit is enough to reject the skip.
+    if (sad >= limit) return 0;
+  }
+  return 1;
+}
+
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode). Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in,out] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] bsize Current block size
+ * \param[in,out] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, the rate-distortion stats are stored in
+ * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be
+ * signalled by an INT64_MAX rd_cost->rdcost.
+ * NOTE(review): this function takes no best_rd argument, so the sentence
+ * above may be inherited from the RD variant - confirm.
+ * The rate, dist and rdcost of rd_cost are also copied into ctx->rd_stats,
+ * and x->rdmult is restored to its entry value before returning.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ // For nonrd mode, av1_set_offsets is already called at the superblock level
+ // in encode_nonrd_sb when we determine the partitioning.
+ if (bsize != cpi->common.seq_params->sb_size ||
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+ }
+ assert(x->last_set_offsets_loc.mi_row == mi_row &&  // Offsets must already describe this exact block.
+ x->last_set_offsets_loc.mi_col == mi_col &&
+ x->last_set_offsets_loc.bsize == bsize);
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+
+ // This is only needed for real time/allintra row-mt enabled multi-threaded
+ // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+ wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+ &tile_data->tile_info, cm->seq_params->sb_size,
+ cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+ // Sets up the tx_type_map buffer in MACROBLOCKD.
+ xd->tx_type_map = txfm_info->tx_type_map_;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+ for (i = 0; i < num_planes; ++i) {  // Point the per-plane coefficient buffers at the ones owned by ctx.
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];  // Only the first two plane entries get a color index map.
+
+ x->force_zeromv_skip_for_blk =
+ get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+ // Source variance may already be computed at superblock level, so no need
+ // to recompute, unless bsize < sb_size or source_variance is not yet set.
+ if (!x->force_zeromv_skip_for_blk &&
+ (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+ // Set error per bit for current rdmult
+ av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {  // Segment-level skip: use the RD seg-skip picker (see TODO below).
+ RD_STATS invalid_rd;
+ av1_invalid_rd_stats(&invalid_rd);
+ // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx,
+ invalid_rd.rdcost);
+ } else {
+ av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ }
+ if (cpi->sf.rt_sf.skip_cdef_sb) {
+ // cdef_strength is initialized to 1 which means skip_cdef, and is updated
+ // here. Check to see if skipping cdef is allowed.
+ const int allow_cdef_skipping =
+ cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+ !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+
+ // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+ // the block size.
+ const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+ const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+ MB_MODE_INFO **mi_sb =
+ cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+ // Do not skip if intra or new mv is picked, or color sensitivity is set.
+ // Never skip on slide/scene change.
+ if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {  // Level 2 and up: a zero source variance also permits skipping.
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength &&
+ (allow_cdef_skipping || x->source_variance == 0);
+ } else {  // Level 1: additionally requires that neither an intra mode nor NEWMV was picked.
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength && allow_cdef_skipping &&
+ !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+ }
+ // Store in the pickmode context.
+ ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
+ }
+ x->rdmult = orig_rdmult;
+ ctx->rd_stats.rate = rd_cost->rate;  // Mirror the block's RD stats into the pick-mode context.
+ ctx->rd_stats.dist = rd_cost->dist;
+ ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+// Compares coding the block as PARTITION_NONE against a one-level
+// PARTITION_SPLIT (each sub-block coded as PARTITION_NONE) using the non-RD
+// mode search, and reports whether the split is cheaper.
+//
+// Returns 1 if the split's rd cost is lower than that of PARTITION_NONE,
+// otherwise 0. 0 is also returned, without running any trial, when:
+//   - the block reaches or crosses the bottom/right frame boundary,
+//   - bsize <= BLOCK_8X8 or the frame is intra-only,
+//   - x->content_state_sb.source_sad_nonrd <= kLowSad, or
+//   - the summed plane SAD against LAST_FRAME, divided by the block's pixel
+//     count, is below the fixed threshold of 25.
+//
+// Side effects when a trial runs: pc_tree->none and pc_tree->split[i] (and
+// their ->none contexts) are allocated or reset, pc_tree->partitioning is
+// left at PARTITION_SPLIT, and the entropy/txfm contexts saved on entry are
+// restored before returning.
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+                               TileDataEnc *const tile_data,
+                               TileInfo *const tile_info, TokenExtra **tp,
+                               MACROBLOCK *const x, MACROBLOCKD *const xd,
+                               const CommonModeInfoParams *const mi_params,
+                               const int mi_row, const int mi_col,
+                               const BLOCK_SIZE bsize, const int pl,
+                               PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int hbs = mi_size_wide[bsize] / 2;
+  if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+      mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+    return 0;
+  if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+
+  // Do not try split partition when the source sad is small, or
+  // the prediction residual is small.
+  if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+  const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(cm, LAST_FRAME);
+  const int num_planes = av1_num_planes(cm);
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+  av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes);
+  int block_sad = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+        p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+    block_sad += plane_sad;
+  }
+  const int blk_pix = block_size_wide[bsize] * block_size_high[bsize];
+  const int block_avg_sad = block_sad / blk_pix;
+  // TODO(chengchen): find a proper threshold. It might change according to
+  // q as well.
+  const int threshold = 25;
+  if (block_avg_sad < threshold) return 0;
+
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS split_rdc, none_rdc;
+  av1_invalid_rd_stats(&split_rdc);
+  av1_invalid_rd_stats(&none_rdc);
+  // Save/restore exactly the planes the frame has (num_planes), matching the
+  // other partition-search routines in this file, instead of a hard-coded 3.
+  // For 3-plane content this is identical; for monochrome it avoids touching
+  // chroma contexts (presumably not set up in that case - TODO confirm).
+  av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  // Calculate rdcost for none partition
+  pc_tree->partitioning = PARTITION_NONE;
+  av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+  if (!pc_tree->none) {
+    pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+    if (!pc_tree->none)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
+  } else {
+    av1_reset_pmc(pc_tree->none);
+  }
+  pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+                      pc_tree->none);
+  none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+  none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // Calculate rdcost for split partition
+  pc_tree->partitioning = PARTITION_SPLIT;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  av1_init_rd_stats(&split_rdc);
+  split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+  if (subsize >= BLOCK_8X8) {
+    // Each of the four sub-blocks also pays for signalling PARTITION_NONE.
+    split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4);
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (!pc_tree->split[i]) {
+      pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+      if (!pc_tree->split[i])
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PC_TREE");
+    }
+    pc_tree->split[i]->index = i;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+    RD_STATS block_rdc;
+    av1_invalid_rd_stats(&block_rdc);
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= mi_params->mi_rows) ||
+        (mi_col + x_idx >= mi_params->mi_cols))
+      continue;
+    xd->above_txfm_context =
+        cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+    xd->left_txfm_context =
+        xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+    if (!pc_tree->split[i]->none) {
+      pc_tree->split[i]->none =
+          av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+      if (!pc_tree->split[i]->none)
+        aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
+    } else {
+      av1_reset_pmc(pc_tree->split[i]->none);
+    }
+    pc_tree->split[i]->partitioning = PARTITION_NONE;
+    pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, subsize, pc_tree->split[i]->none);
+    split_rdc.rate += block_rdc.rate;
+    split_rdc.dist += block_rdc.dist;
+    av1_rd_cost_update(x->rdmult, &split_rdc);
+    // Stop as soon as the partial split cost already exceeds PARTITION_NONE.
+    if (none_rdc.rdcost < split_rdc.rdcost) break;
+    // Dry-run encode every sub-block but the last before searching the next.
+    if (i != SUB_PARTITIONS_SPLIT - 1)
+      encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                     subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+  }
+  av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+  const int split = split_rdc.rdcost < none_rdc.rdcost;
+
+  return split;
+}
+
+// Decides whether the SPLIT partition should be evaluated once the
+// PARTITION_NONE result (none_rdc, pc_tree->none) is available.
+// Returns true to evaluate SPLIT and false to prune it.
+static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x,
+                               const PC_TREE *pc_tree, const RD_STATS *none_rdc,
+                               const CommonModeInfoParams *mi_params,
+                               int mi_row, int mi_col, int hbs,
+                               BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int high_qindex = cm->quant_params.base_qindex > 100;
+
+  // SPLIT is evaluated by default; merge mode 3 limits it by block size.
+  bool do_split = true;
+  if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3) {
+    do_split = bsize <= BLOCK_32X32 || (high_qindex && bsize <= BLOCK_64X64);
+  }
+
+  // No further pruning for screen content, merge modes below 2, boosted
+  // cyclic-refresh segments, or when PARTITION_NONE did not set skip_txfm.
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+      cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+      cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) ||
+      !none_rdc->skip_txfm) {
+    return do_split;
+  }
+
+  // Without model based skip (flag == 0) skip_txfm comes from the Hadamard
+  // transform and is the more reliable signal, so SPLIT evaluation is
+  // disabled at all quantizers for 8x8 and 16x16 blocks. With model based
+  // skip (flag == 1) skip_txfm may not be reliable, so SPLIT evaluation is
+  // disabled only at lower quantizers for blocks >= 32x32.
+  const int model_based_skip = get_model_rd_flag(cpi, xd, bsize);
+  if (!(model_based_skip && high_qindex)) return false;
+
+  // The residual-statistics pruning below applies to 32x32 NEWMV blocks only;
+  // larger block sizes are left alone to avoid visual artifacts.
+  if (pc_tree->none->mic.mode != NEWMV || bsize != BLOCK_32X32 || !do_split) {
+    return do_split;
+  }
+
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize < BLOCK_SIZES_ALL);
+  const struct buf_2d *const src_full = &x->plane[AOM_PLANE_Y].src;
+  const struct buf_2d *const pred_full = &xd->plane[AOM_PLANE_Y].dst;
+  double lowest_err = DBL_MAX;
+  double highest_err = 0.;
+  int blk;
+  for (blk = 0; blk < SUB_PARTITIONS_SPLIT; ++blk) {
+    const int col_off = (blk & 1) * hbs;
+    const int row_off = (blk >> 1) * hbs;
+    // Stop at the first sub-block that starts outside the frame.
+    if (mi_row + row_off >= mi_params->mi_rows ||
+        mi_col + col_off >= mi_params->mi_cols) {
+      break;
+    }
+
+    // Derive the sub-block's source and prediction buffers from the full
+    // block. Scale factors are passed as NULL because the block's base
+    // pointer has already been calculated appropriately.
+    struct buf_2d src_sub, pred_sub;
+    setup_pred_plane(&src_sub, subsize, src_full->buf, src_full->width,
+                     src_full->height, src_full->stride, row_off, col_off,
+                     NULL, 0, 0);
+    setup_pred_plane(&pred_sub, subsize, pred_full->buf, pred_full->width,
+                     pred_full->height, pred_full->stride, row_off, col_off,
+                     NULL, 0, 0);
+
+    unsigned int vf_aux_out;  // Second output of vf(); not used here.
+    const unsigned int sub_var = cpi->ppi->fn_ptr[subsize].vf(
+        src_sub.buf, src_sub.stride, pred_sub.buf, pred_sub.stride,
+        &vf_aux_out);
+    const double sub_err = sqrt((double)sub_var / block_size_wide[subsize] /
+                                block_size_high[subsize]);
+    if (sub_err < lowest_err) lowest_err = sub_err;
+    if (sub_err > highest_err) highest_err = sub_err;
+  }
+
+  // Prune only when all sub-blocks were measured and their per-pixel errors
+  // lie within 1.5 of each other.
+  if (blk == SUB_PARTITIONS_SPLIT && highest_err - lowest_err <= 1.5) {
+    do_split = false;
+  }
+  return do_split;
+}
+ /* Chooses between PARTITION_NONE and PARTITION_SPLIT for the square block at
+  * (mi_row, mi_col) and encodes it with the cheaper of the two.
+  *
+  * PARTITION_NONE is always evaluated with pick_sb_modes_nonrd(). The four
+  * split sub-blocks are evaluated only when the merge-mode speed feature and
+  * calc_do_split_flag() allow it; otherwise split_rdc keeps the value written
+  * by av1_invalid_rd_stats(). The final decision compares none_rdc.rdcost
+  * with split_rdc.rdcost.
+  *
+  * cpi, td, tile_data: encoder, thread and tile state handed on to the
+  *   mode-search and encode helpers.
+  * mib: mode-info pointer array for this block; mib[0]->bsize is overwritten
+  *   with the block size that was chosen.
+  * tp: token pointer handed to encode_b_nonrd().
+  * mi_row, mi_col, bsize: position and size of the block being decided.
+  * pc_tree: node whose none / split[i]->none contexts are allocated or reset
+  *   here and whose partitioning field records the decision.
+  * partition, subsize: the caller's partition type and the matching
+  *   sub-block size.
+  * pl: partition context index used to look up partition_cost[].
+  */
+ static void try_merge(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *const pc_tree,
+ const PARTITION_TYPE partition, const BLOCK_SIZE subsize,
+ const int pl) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ bool do_split = false;  // becomes true only if calc_do_split_flag() asks for SPLIT
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);  // snapshot restored after each trial
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ pc_tree->partitioning = PARTITION_NONE;
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ // Evaluate SPLIT unless merge mode >= 2 and NONE is a skip block with a non-NEWMV mode.
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+ do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row,
+ mi_col, hbs, bsize, partition);
+ if (do_split) {
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;  // sub-block starts beyond mi_rows / mi_cols
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+ // TODO(yunqingwang): The rate here did not include the cost of
+ // signaling PARTITION_NONE token in the sub-blocks.
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {  // SPLIT already costs more than NONE: stop early
+ break;
+ }
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)  // the last sub-block is not encoded during this trial
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+ 1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+ NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ }
+ }
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {  // NONE is strictly cheaper: code the block whole
+ /* Predicted samples can not be reused for PARTITION_NONE since same
+ * buffer is being used to store the reconstructed samples of
+ * PARTITION_SPLIT block. */
+ if (do_split) x->reuse_inter_pred = false;
+
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+ pc_tree->none, NULL);
+ } else {  // SPLIT is cheaper or equal: code the sub-blocks individually
+ mib[0]->bsize = subsize;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ /* Predicted samples can not be reused for PARTITION_SPLIT since same
+ * buffer is being used to write the reconstructed samples. */
+ // TODO(Cherma): Store and reuse predicted samples generated by
+ // encode_b_nonrd() in DRY_RUN_NORMAL mode.
+ x->reuse_inter_pred = false;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;  // sub-block starts beyond mi_rows / mi_cols
+
+ // Note: We don't reset pc_tree->split[i]->none here because it
+ // could contain results from the additional check. Instead, it is
+ // reset before we enter the nonrd_check_partition_merge_mode
+ // condition.
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ }
+ }
+
+ // Evaluate if the sub-partitions can be merged directly into a large partition
+ // without calculating the RD cost.
+ //
+ // The four sub-blocks of the split at (mi_row, mi_col) are merged into a
+ // single bsize block only when all of them are skip_txfm, single-reference
+ // inter blocks of size subsize or larger that share the same reference,
+ // mode (NEARESTMV or GLOBALMV), motion vector, SIMPLE_TRANSLATION motion
+ // mode, interpolation filters and segment id, and when NEARESTMV found for
+ // the merged block equals that shared motion vector. Any failed check
+ // returns with the split left in place.
+ //
+ // cpi, td, tile_data: encoder, thread and tile state.
+ // mib: mode-info pointer array whose first entry is the top-left sub-block;
+ //   on a successful merge every entry of the bs x bs area points to mib[0].
+ // mi_row, mi_col, bsize: position and size of the candidate merged block.
+ static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+                                      TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                      int mi_row, int mi_col, BLOCK_SIZE bsize) {
+   AV1_COMMON *const cm = &cpi->common;
+   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+   TileInfo *const tile_info = &tile_data->tile_info;
+   MACROBLOCK *const x = &td->mb;
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const int bs = mi_size_wide[bsize];
+   const int hbs = bs / 2;
+   const PARTITION_TYPE partition =
+       (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                            : PARTITION_NONE;
+   BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+   // Top-left, top-right, bottom-left and bottom-right sub-blocks.
+   MB_MODE_INFO **b0 = mib;
+   MB_MODE_INFO **b1 = mib + hbs;
+   MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+   MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+   // Check if the following conditions are met. This can be updated
+   // later with more support added.
+   const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+                             b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+   if (further_split) return;
+
+   const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+                       !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+   if (no_skip) return;
+
+   const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+                         b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+                         b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+                         b0[0]->ref_frame[1] > NONE_FRAME);
+   if (compound) return;
+
+   // Intra modes aren't considered here.
+   const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+                              b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+                              b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+                              b0[0]->ref_frame[0] <= INTRA_FRAME);
+   if (different_ref) return;
+
+   const int different_mode =
+       (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+        b0[0]->mode != b3[0]->mode);
+   if (different_mode) return;
+
+   const int unsupported_mode =
+       (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+   if (unsupported_mode) return;
+
+   const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+                             b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+                             b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+   if (different_mv) return;
+
+   const int unsupported_motion_mode =
+       (b0[0]->motion_mode != b1[0]->motion_mode ||
+        b0[0]->motion_mode != b2[0]->motion_mode ||
+        b0[0]->motion_mode != b3[0]->motion_mode ||
+        b0[0]->motion_mode != SIMPLE_TRANSLATION);
+   if (unsupported_motion_mode) return;
+
+   const int different_filter =
+       (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+        b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+        b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+   if (different_filter) return;
+
+   const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+                              b0[0]->segment_id != b2[0]->segment_id ||
+                              b0[0]->segment_id != b3[0]->segment_id);
+   if (different_seg) return;
+
+   // Evaluate the ref_mv.
+   MB_MODE_INFO **this_mi = mib;
+   BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+   const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+   // Tentatively describe the top-left mode info as the merged block so that
+   // the predictor search below runs at the merged size. bsize and partition
+   // are put back if the merge is rejected.
+   this_mi[0]->bsize = bsize;
+   this_mi[0]->partition = PARTITION_NONE;
+   this_mi[0]->skip_txfm = 1;
+
+   // TODO(yunqing): functions called below can be optimized by
+   // removing unrelated operations.
+   av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                      mi_col, bsize);
+
+   const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0];
+   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+   struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+   int force_skip_low_temp_var = 0;
+   int skip_pred_mv = 0;
+   bool use_scaled_ref;
+
+   for (int i = 0; i < MB_MODE_COUNT; ++i) {
+     for (int j = 0; j < REF_FRAMES; ++j) {
+       frame_mv[i][j].as_int = INVALID_MV;
+     }
+   }
+   av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+   skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 &&
+                   x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+                   x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+   find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize,
+                   force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+
+   // The merge is only valid if NEARESTMV of the merged block equals the
+   // motion vector shared by the four sub-blocks.
+   int continue_merging = 1;
+   if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
+       frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col)
+     continue_merging = 0;
+
+   if (!continue_merging) {
+     this_mi[0]->bsize = orig_bsize;
+     this_mi[0]->partition = orig_partition;
+
+     // TODO(yunqing): Store the results and restore here instead of
+     // calling find_predictors() again.
+     av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                        mi_col, this_mi[0]->bsize);
+     find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize,
+                     force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+   } else {
+     // Declared once for the whole branch; the original declared it twice,
+     // once in the nested block below and once further down in this block.
+     const int num_planes = av1_num_planes(cm);
+     struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+     const int is_scaled = av1_is_scaled(sf);
+     const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) ||
+                                (abs(this_mi[0]->mv[0].as_mv.col) % 8);
+     const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) ||
+                                 (abs(this_mi[0]->mv[0].as_mv.col) % 16);
+
+     if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) {
+       set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]);
+       const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+       av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col,
+                            xd->block_ref_scale_factors[0], num_planes);
+
+       if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) {
+         // Only the chroma MV is sub-pel here, so the predictor is built
+         // starting from plane 1.
+         assert(is_uv_subpel_mv == 1);
+         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1,
+                                       num_planes - 1);
+       } else {
+         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                       num_planes - 1);
+       }
+     }
+
+     // Copy out mbmi_ext information.
+     MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+     MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame;
+     av1_copy_mbmi_ext_to_mbmi_ext_frame(
+         mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame));
+
+     const BLOCK_SIZE this_subsize =
+         get_partition_subsize(bsize, this_mi[0]->partition);
+     // Update partition contexts.
+     update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize,
+                                  this_mi[0]->partition);
+
+     av1_reset_entropy_context(xd, bsize, num_planes);
+
+     // Note: use x->txfm_search_params.tx_mode_search_type instead of
+     // cm->features.tx_mode here.
+     TX_SIZE tx_size =
+         tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type);
+     if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4;
+     this_mi[0]->tx_size = tx_size;
+     memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size,
+            sizeof(this_mi[0]->inter_tx_size));
+
+     // Update txfm contexts.
+     xd->above_txfm_context =
+         cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+     xd->left_txfm_context =
+         xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+     set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height,
+                   this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd);
+
+     // Update mi for this partition block.
+     for (int y = 0; y < bs; y++) {
+       for (int x_idx = 0; x_idx < bs; x_idx++) {
+         this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0];
+       }
+     }
+   }
+ }
+
+ /*!\brief AV1 block partition application (minimal RD search).
+  *
+  * \ingroup partition_search
+  * \callgraph
+  * \callergraph
+  * Encode the block by applying pre-calculated partition patterns that are
+  * represented by coding block sizes stored in the mbmi array. The partition
+  * types are limited to a basic set: none, horz, vert, and split. This
+  * function is only used in the real-time mode.
+  *
+  * The pre-calculated pattern can be adjusted in three ways, each gated by a
+  * speed feature: a PARTITION_NONE block may be changed to PARTITION_SPLIT
+  * by try_split_partition() (nonrd_check_partition_split), leaf split nodes
+  * may be merged by try_merge() if that leads to a lower rd cost
+  * (nonrd_check_partition_merge_mode), and already-encoded split sub-blocks
+  * may be merged by direct_partition_merging() (partition_direct_merging).
+  *
+  * \param[in]    cpi        Top-level encoder structure
+  * \param[in]    td         Pointer to thread data
+  * \param[in]    tile_data  Pointer to struct holding adaptive
+  *                          data/contexts/models for the tile during encoding
+  * \param[in]    mib        Array representing MB_MODE_INFO pointers for mi
+  *                          blocks starting from the first pixel of the
+  *                          current block
+  * \param[in]    tp         Pointer to the starting token
+  * \param[in]    mi_row     Row coordinate of the block in a step size of
+  *                          MI_SIZE
+  * \param[in]    mi_col     Column coordinate of the block in a step size of
+  *                          MI_SIZE
+  * \param[in]    bsize      Current block size
+  * \param[in]    pc_tree    Pointer to the PC_TREE node holding the picked
+  *                          partitions and mode info for the current block
+  *
+  * \remark Nothing is returned. The pc_tree struct is modified to store the
+  * picked partition and modes.
+  */
+ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ PARTITION_TYPE partition = (bsize >= BLOCK_8X8)
+ ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+
+ RD_STATS dummy_cost;  // receives mode-search costs that this function never reads back
+ av1_invalid_rd_stats(&dummy_cost);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;  // block starts beyond mi_rows / mi_cols
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd;
+
+ int change_none_to_split = 0;  // set when try_split_partition() overrides NONE with SPLIT
+ if (partition == PARTITION_NONE &&
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ change_none_to_split =
+ try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params,
+ mi_row, mi_col, bsize, pl, pc_tree);
+ if (change_none_to_split) {
+ partition = PARTITION_SPLIT;
+ subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ }
+ }
+
+ pc_tree->partitioning = partition;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize,
+ pc_tree->none);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+ partition, pc_tree->none, NULL);
+ break;
+ case PARTITION_VERT:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->vertical[i]) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->vertical[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->vertical[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[0], NULL);
+ if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {  // second half at column mi_col + hbs
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &dummy_cost, subsize, pc_tree->vertical[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[1], NULL);
+ }
+ break;
+ case PARTITION_HORZ:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->horizontal[i]) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->horizontal[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->horizontal[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[0], NULL);
+
+ if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {  // second half at row mi_row + hbs
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &dummy_cost, subsize, pc_tree->horizontal[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[1], NULL);
+ }
+ break;
+ case PARTITION_SPLIT:
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ }
+ pc_tree->split[i]->index = i;
+ }
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+ av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+ !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {  // let try_merge() pick NONE or SPLIT and encode
+ try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree,
+ partition, subsize, pl);
+ } else {
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;  // sub-block starts beyond mi_rows / mi_cols
+ av1_nonrd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+ }
+
+ if (!change_none_to_split) {  // no direct merge when SPLIT came from try_split_partition()
+ // Note: Palette, cfl are not supported.
+ if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
+ cpi->sf.rt_sf.partition_direct_merging &&
+ mode_costs->partition_cost[pl][PARTITION_NONE] <
+ mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
+ (mi_row + bs <= mi_params->mi_rows) &&
+ (mi_col + bs <= mi_params->mi_cols)) {
+ direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
+ bsize);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");  // no break: falls through to default
+ default: assert(0); break;
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Runs the mode search for one sub-block of a partition and folds its rate
+ // and distortion into *sum_rdc. Returns 1 while the accumulated cost is still
+ // below best_rdcost, and 0 once it is not, so that the caller can stop
+ // searching the remaining sub-blocks.
+ static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+                            TileDataEnc *tile_data, TokenExtra **tp, int is_last,
+                            int mi_row, int mi_col, BLOCK_SIZE subsize,
+                            RD_STATS best_rdcost, RD_STATS *sum_rdc,
+                            PARTITION_TYPE partition,
+                            PICK_MODE_CONTEXT *this_ctx) {
+   MACROBLOCK *const x = &td->mb;
+   // rdmult is overridden for this sub-block and put back on every exit path.
+   const int saved_rdmult = x->rdmult;
+   int within_budget = 0;
+
+   setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
+
+   // best_rdcost was passed by value, so re-costing it here stays local.
+   av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+   RD_STATS budget_left;
+   av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &budget_left);
+
+   RD_STATS blk_rdc;
+   pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &blk_rdc, partition,
+                 subsize, this_ctx, budget_left);
+
+   if (blk_rdc.rate != INT_MAX) {
+     sum_rdc->rate += blk_rdc.rate;
+     sum_rdc->dist += blk_rdc.dist;
+     av1_rd_cost_update(x->rdmult, sum_rdc);
+   } else {
+     // The search produced no usable mode for this sub-block.
+     sum_rdc->rdcost = INT64_MAX;
+   }
+
+   if (sum_rdc->rdcost < best_rdcost.rdcost) {
+     within_budget = 1;
+     // Every sub-block except the last is committed with a dry-run encode.
+     if (!is_last) {
+       av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+       encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+     }
+   }
+
+   x->rdmult = saved_rdmult;
+   return within_budget;
+ }
+
+ // Evaluates one AB partition type by searching its sub-blocks in order.
+ // Returns true, and stores the new cost in *best_rdc and the partition type in
+ // pc_tree->partitioning, only when the total cost is below best_rdc->rdcost.
+ // *this_rdcost receives the total cost whenever all sub-block searches
+ // succeeded.
+ static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+                                TileDataEnc *tile_data, TokenExtra **tp,
+                                PC_TREE *pc_tree, RD_STATS *best_rdc,
+                                int64_t *this_rdcost,
+                                PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB],
+                                int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                PARTITION_TYPE partition,
+                                const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+                                const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+                                const MB_MODE_INFO **mode_cache) {
+   MACROBLOCK *const x = &td->mb;
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const int part_ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+   // Start the running total with the cost of signalling the partition type.
+   RD_STATS total_rdc;
+   av1_init_rd_stats(&total_rdc);
+   total_rdc.rate = x->mode_costs.partition_cost[part_ctx][partition];
+   total_rdc.rdcost = RDCOST(x->rdmult, total_rdc.rate, 0);
+
+   for (int blk = 0; blk < SUB_PARTITIONS_AB; ++blk) {
+     // Hand a cached mode to the search when the caller supplied one.
+     if (mode_cache != NULL && mode_cache[blk] != NULL) {
+       x->use_mb_mode_cache = 1;
+       x->mb_mode_cache = mode_cache[blk];
+     }
+     const int is_last_blk = (blk == SUB_PARTITIONS_AB - 1);
+     const int searched_ok = rd_try_subblock(
+         cpi, td, tile_data, tp, is_last_blk, ab_mi_pos[blk][0],
+         ab_mi_pos[blk][1], ab_subsize[blk], *best_rdc, &total_rdc, partition,
+         ctxs[blk]);
+     // The cache is always cleared again, whether or not it was set above.
+     x->use_mb_mode_cache = 0;
+     x->mb_mode_cache = NULL;
+     if (!searched_ok) return false;
+   }
+
+   av1_rd_cost_update(x->rdmult, &total_rdc);
+   *this_rdcost = total_rdc.rdcost;
+   if (!(total_rdc.rdcost < best_rdc->rdcost)) return false;
+
+   total_rdc.rdcost = RDCOST(x->rdmult, total_rdc.rate, total_rdc.dist);
+   *this_rdcost = total_rdc.rdcost;
+   if (!(total_rdc.rdcost < best_rdc->rdcost)) return false;
+
+   pc_tree->partitioning = partition;
+   *best_rdc = total_rdc;
+   return true;
+ }
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Clears every field of the per-block partition timing statistics.
+ static void init_partition_block_timing_stats(
+     PartitionTimingStats *part_timing_stats) {
+   memset(part_timing_stats, 0, sizeof(*part_timing_stats));
+ }
+
+ // Counts one more attempt of partition_type and starts the timer for it.
+ // The timer must not already be running.
+ static INLINE void start_partition_block_timer(
+     PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) {
+   PartitionTimingStats *const stats = part_timing_stats;
+   assert(!stats->timer_is_on);
+   ++stats->partition_attempts[partition_type];
+   aom_usec_timer_start(&stats->timer);
+   stats->timer_is_on = 1;
+ }
+
+ // Stops the timer if it is running, adds the elapsed time to the total for
+ // partition_type and records rdcost for that type. Does nothing when the
+ // timer is off.
+ static INLINE void end_partition_block_timer(
+     PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type,
+     int64_t rdcost) {
+   PartitionTimingStats *const stats = part_timing_stats;
+   if (!stats->timer_is_on) return;
+
+   aom_usec_timer_mark(&stats->timer);
+   const int64_t elapsed = aom_usec_timer_elapsed(&stats->timer);
+   stats->partition_times[partition_type] += elapsed;
+   stats->partition_rdcost[partition_type] = rdcost;
+   stats->timer_is_on = 0;
+ }
+ // Appends one comma-separated record for the current block to filename:
+ // block size, frame number, frame update type, position and best rate / dist /
+ // rdcost, followed by the per-partition-type decisions, attempts, times and
+ // rd costs. An rd cost equal to INT64_MAX is written as -1. Nothing is
+ // written when the file cannot be opened.
+ static INLINE void print_partition_timing_stats_with_rdcost(
+     const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col,
+     BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number,
+     const RD_STATS *best_rdc, const char *filename) {
+   FILE *f = fopen(filename, "a");
+   // fopen() returns NULL on failure; passing that to fprintf()/fclose() is
+   // undefined behavior, so skip the record instead.
+   if (f == NULL) return;
+   fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number,
+           frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist,
+           best_rdc->rdcost);
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+   }
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+   }
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+   }
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) {
+       fprintf(f, "%d,", -1);
+     } else {
+       fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]);
+     }
+   }
+   fprintf(f, "\n");
+   fclose(f);
+ }
+
+ // Appends one comma-separated record to filename: block size, show_frame and
+ // intra_only, followed by the per-partition-type decisions, attempts and
+ // times. Nothing is written when the file cannot be opened.
+ static INLINE void print_partition_timing_stats(
+     const PartitionTimingStats *part_timing_stats, int intra_only,
+     int show_frame, const BLOCK_SIZE bsize, const char *filename) {
+   FILE *f = fopen(filename, "a");
+   // fopen() returns NULL on failure; passing that to fprintf()/fclose() is
+   // undefined behavior, so skip the record instead.
+   if (f == NULL) return;
+   fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only);
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+   }
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+   }
+   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+     fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+   }
+   fprintf(f, "\n");
+   fclose(f);
+ }
+
+ // Adds the attempts, decisions and times of one block to the frame-level
+ // totals kept for that block's size class.
+ static INLINE void accumulate_partition_timing_stats(
+     FramePartitionTimingStats *fr_part_timing_stats,
+     const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) {
+   // Row of the frame-level tables that corresponds to this block size.
+   const int row = av1_get_bsize_idx_for_part_stats(bsize);
+   for (int type = 0; type < EXT_PARTITION_TYPES; ++type) {
+     fr_part_timing_stats->partition_attempts[row][type] +=
+         part_timing_stats->partition_attempts[type];
+     fr_part_timing_stats->partition_decisions[row][type] +=
+         part_timing_stats->partition_decisions[type];
+     fr_part_timing_stats->partition_times[row][type] +=
+         part_timing_stats->partition_times[type];
+   }
+ }
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Sets up the PartitionSearchState used by av1_rd_pick_partition() for the
+ // block of size bsize at (mi_row, mi_col): block geometry, frame-edge flags,
+ // partition context and cost table, rd bookkeeping, and the default set of
+ // partition types that may be searched.
+ static void init_partition_search_state_params(
+     MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+     int mi_row, int mi_col, BLOCK_SIZE bsize) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const AV1_COMMON *const cm = &cpi->common;
+   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+   PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+
+   // ---- Block geometry ----
+   const int half_mi = mi_size_wide[bsize] / 2;
+   const BLOCK_SIZE split_subsize =
+       get_partition_subsize(bsize, PARTITION_SPLIT);
+   const int at_least_8x8 = (bsize >= BLOCK_8X8);
+
+   blk_params->bsize = bsize;
+   blk_params->bsize_at_least_8x8 = at_least_8x8;
+   blk_params->width = block_size_wide[bsize];
+   blk_params->min_partition_size_1d =
+       block_size_wide[x->sb_enc.min_partition_size];
+   blk_params->subsize = split_subsize;
+   blk_params->split_bsize2 = split_subsize;
+   blk_params->mi_step = half_mi;
+   blk_params->mi_row = mi_row;
+   blk_params->mi_col = mi_col;
+   blk_params->mi_row_edge = mi_row + half_mi;
+   blk_params->mi_col_edge = mi_col + half_mi;
+
+   // ---- Frame-edge flags: is the block's mid-point inside the frame? ----
+   blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows);
+   blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols);
+
+   // ---- Intra partitioning info ----
+   part_search_state->intra_part_info = &x->part_search_info;
+   // Prepare for segmentation CNN-based partitioning for intra-frame.
+   if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+     x->part_search_info.quad_tree_idx = 0;
+     x->part_search_info.cnn_output_valid = 0;
+   }
+
+   // ---- Partition context and the cost table that goes with it ----
+   if (at_least_8x8) {
+     part_search_state->pl_ctx_idx =
+         partition_plane_context(xd, mi_row, mi_col, bsize);
+   } else {
+     part_search_state->pl_ctx_idx = 0;
+   }
+   part_search_state->partition_cost =
+       x->mode_costs.partition_cost[part_search_state->pl_ctx_idx];
+
+   // Every split sub-block starts with both rectangular "win" flags set.
+   for (int blk = 0; blk < SUB_PARTITIONS_SPLIT; ++blk) {
+     part_search_state->split_part_rect_win[blk].rect_part_win[HORZ] = true;
+     part_search_state->split_part_rect_win[blk].rect_part_win[VERT] = true;
+   }
+
+   // ---- RD bookkeeping ----
+   av1_init_rd_stats(&part_search_state->this_rdc);
+   part_search_state->none_rd = 0;
+   av1_zero(part_search_state->split_rd);
+   av1_zero(part_search_state->rect_part_rd);
+
+   // No SPLIT, HORZ or VERT context is ready yet.
+   av1_zero(part_search_state->is_split_ctx_is_ready);
+   av1_zero(part_search_state->is_rect_ctx_is_ready);
+
+   // Chroma subsampling, taken from plane 1.
+   part_search_state->ss_x = xd->plane[1].subsampling_x;
+   part_search_state->ss_y = xd->plane[1].subsampling_y;
+
+   // ---- Default search flags ----
+   part_search_state->terminate_partition_search = 0;
+   part_search_state->do_square_split = at_least_8x8;
+   part_search_state->do_rectangular_split =
+       cpi->oxcf.part_cfg.enable_rect_partitions && at_least_8x8;
+   av1_zero(part_search_state->prune_rect_part);
+
+   // ---- Partition types allowed for this block ----
+   part_search_state->partition_none_allowed =
+       av1_blk_has_rows_and_cols(blk_params);
+
+   const BLOCK_SIZE horz_subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+   part_search_state->partition_rect_allowed[HORZ] =
+       part_search_state->do_rectangular_split && blk_params->has_cols &&
+       get_plane_block_size(horz_subsize, part_search_state->ss_x,
+                            part_search_state->ss_y) != BLOCK_INVALID;
+
+   const BLOCK_SIZE vert_subsize = get_partition_subsize(bsize, PARTITION_VERT);
+   part_search_state->partition_rect_allowed[VERT] =
+       part_search_state->do_rectangular_split && blk_params->has_rows &&
+       get_plane_block_size(vert_subsize, part_search_state->ss_x,
+                            part_search_state->ss_y) != BLOCK_INVALID;
+
+   // No partition with an rdcost below the best_rdc bound has been found yet.
+   part_search_state->found_best_partition = false;
+
+ #if CONFIG_COLLECT_PARTITION_STATS
+   init_partition_block_timing_stats(&part_search_state->part_timing_stats);
+ #endif  // CONFIG_COLLECT_PARTITION_STATS
+ }
+
+ // Replaces the partition cost table with one built for a frame-edge block.
+ // Every partition type starts at av1_cost_symbol(0); the types that remain
+ // possible at this edge then get their costs from the gathered two-symbol
+ // CDF, or a cost of 0 for SPLIT when SPLIT is the only option.
+ static void set_partition_cost_for_edge_blk(
+     AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
+   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+   assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
+
+   const aom_cdf_prob *partition_cdf =
+       cm->fc->partition_cdf[part_search_state->pl_ctx_idx];
+
+   // Start from the maximum cost for every partition type.
+   const int worst_cost = av1_cost_symbol(0);
+   for (PARTITION_TYPE type = 0; type < PARTITION_TYPES; ++type) {
+     part_search_state->tmp_partition_cost[type] = worst_cost;
+   }
+
+   if (!blk_params.has_cols && !blk_params.has_rows) {
+     // At the bottom right, we always split.
+     part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0;
+   } else if (blk_params.has_cols) {
+     // At the bottom, the two possibilities are HORZ and SPLIT.
+     static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+     aom_cdf_prob bot_cdf[2];
+     partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize);
+     av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf,
+                              bot_inv_map);
+   } else {
+     // At the right, the two possibilities are VERT and SPLIT.
+     static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+     aom_cdf_prob rhs_cdf[2];
+     partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize);
+     av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf,
+                              rhs_inv_map);
+   }
+
+   // From here on the search reads costs from the temporary table.
+   part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+ }
+
+// Recomputes the partition search flags for the case where
+// must_find_valid_partition is equal to 1, overwriting whatever values the
+// flags held before. The flags are derived only from the block parameters,
+// the rectangular-partition config switch and the chroma subsampling.
+static AOM_INLINE void reset_part_limitations(
+    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+  const PartitionBlkParams *const params = &part_search_state->part_blk_params;
+
+  // Splitting of any kind needs an 8x8-or-larger block that is strictly wider
+  // than the minimum partition size.
+  const int can_split = params->bsize_at_least_8x8 &&
+                        (params->width > params->min_partition_size_1d);
+  const int can_split_rect =
+      can_split && cpi->oxcf.part_cfg.enable_rect_partitions;
+
+  // A rectangular type is additionally valid only if its sub-block maps to a
+  // valid plane block size for the current subsampling.
+  int horz_allowed = 0;
+  if (params->has_cols && can_split_rect) {
+    const BLOCK_SIZE horz_subsize =
+        get_partition_subsize(params->bsize, PARTITION_HORZ);
+    horz_allowed =
+        get_plane_block_size(horz_subsize, part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+  int vert_allowed = 0;
+  if (params->has_rows && can_split_rect) {
+    const BLOCK_SIZE vert_subsize =
+        get_partition_subsize(params->bsize, PARTITION_VERT);
+    vert_allowed =
+        get_plane_block_size(vert_subsize, part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+
+  part_search_state->do_square_split = can_split;
+  part_search_state->partition_none_allowed =
+      av1_blk_has_rows_and_cols(params) &&
+      (params->width >= params->min_partition_size_1d);
+  part_search_state->partition_rect_allowed[HORZ] = horz_allowed;
+  part_search_state->partition_rect_allowed[VERT] = vert_allowed;
+  part_search_state->terminate_partition_search = 0;
+}
+
+// Evaluates one sub-block (index idx) of a HORZ / VERT partition.
+//
+// The mode search for the sub-block is bounded by what is left of best_rdc
+// after subtracting the cost accumulated so far in sum_rdc. The sub-block
+// result is then folded into sum_rdc and its rd cost is recorded in
+// rect_part_rd[HORZ or VERT][idx].
+static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                   MACROBLOCK *x,
+                                   PICK_MODE_CONTEXT *cur_partition_ctx,
+                                   PartitionSearchState *part_search_state,
+                                   RD_STATS *best_rdc, const int idx,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                   PARTITION_TYPE partition_type) {
+  RD_STATS *const sub_rdc = &part_search_state->this_rdc;
+  RD_STATS *const total_rdc = &part_search_state->sum_rdc;
+
+  // Budget remaining for this sub-block: best_rdc minus the running total.
+  RD_STATS remaining_rdc;
+  av1_rd_stats_subtraction(x->rdmult, best_rdc, total_rdc, &remaining_rdc);
+
+  // Pick the best mode for the sub-block within that budget.
+  pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, sub_rdc, partition_type,
+                bsize, cur_partition_ctx, remaining_rdc);
+  av1_rd_cost_update(x->rdmult, sub_rdc);
+
+  // Accumulate into the partition total, or mark the total invalid when the
+  // sub-block search reported a rate of INT_MAX.
+  if (sub_rdc->rate != INT_MAX) {
+    total_rdc->rate += sub_rdc->rate;
+    total_rdc->dist += sub_rdc->dist;
+    av1_rd_cost_update(x->rdmult, total_rdc);
+  } else {
+    total_rdc->rdcost = INT64_MAX;
+  }
+
+  const RECT_PART_TYPE rect_type =
+      (partition_type == PARTITION_HORZ) ? HORZ : VERT;
+  part_search_state->rect_part_rd[rect_type][idx] = sub_rdc->rdcost;
+}
+
+typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+// Returns 1 if the HORZ / VERT partition identified by rect_part should be
+// searched, 0 otherwise.
+//
+// The search is rejected when it has been terminated, when the type is not
+// allowed, or when the type has been pruned. Otherwise it is accepted if
+// rectangular splits are enabled, or failing that, if the active-edge
+// callback for this type reports non-zero for mi_pos.
+static AOM_INLINE int is_rect_part_allowed(
+    const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+    const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+    const int mi_pos) {
+  if (part_search_state->terminate_partition_search) return 0;
+  if (!part_search_state->partition_rect_allowed[rect_part]) return 0;
+  if (part_search_state->prune_rect_part[rect_part]) return 0;
+  if (part_search_state->do_rectangular_split) return 1;
+
+  // The edge callback is consulted only as a last resort, as in a
+  // short-circuit evaluation.
+  const int mi_step = part_search_state->part_blk_params.mi_step;
+  return active_edge[rect_part](cpi, mi_pos, mi_step) != 0;
+}
+
+// Rate-distortion search over the rectangular partition types
+// (PARTITION_HORZ / PARTITION_VERT) in the range [start_type, end_type].
+//
+// For each type admitted by is_rect_part_allowed() this function:
+//  - allocates any missing PICK_MODE_CONTEXT for the two sub-blocks,
+//  - seeds sum_rdc with the cost of signalling the partition type,
+//  - evaluates the first sub-block, and the second one only while the running
+//    cost is below best_rdc and the block is not an edge block,
+//  - on improvement copies sum_rdc into *best_rdc, sets found_best_partition
+//    and records the type in pc_tree->partitioning; otherwise clears the
+//    type's win flag in rect_part_win_info (when it is non-NULL),
+//  - restores the context saved in x_ctx through av1_restore_context().
+static void rectangular_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+ const RECT_PART_TYPE end_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+ const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+ PARTITION_VERT };
+
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row position of
+ // HORZ and VERT partition types.
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col position of
+ // HORZ and VERT partition types.
+ const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col } },
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row, blk_params.mi_col_edge } }
+ };
+
+ // Initialize active edge_type function pointer
+ // for HORZ and VERT partition types.
+ active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+ av1_active_v_edge };
+
+ // Indicates edge blocks for HORZ and VERT partition types.
+ const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+ blk_params.has_cols };
+
+ // Initialize pc tree context for HORZ and VERT partition types.
+ PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+ { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+ { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+ };
+
+ // Loop over rectangular partition types.
+ for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state->partition_rect_allowed[i]));
+
+ // Check if the HORZ / VERT partition search is to be performed.
+ // The last index [i] picks mi_row for the first type and mi_col for the
+ // second (assumes HORZ == 0 and VERT == 1 -- confirm against the enum).
+ if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+ mi_pos_rect[i][0][i]))
+ continue;
+
+ // Sub-partition idx.
+ int sub_part_idx = 0;
+ PARTITION_TYPE partition_type = rect_partition_type[i];
+ blk_params.subsize =
+ get_partition_subsize(blk_params.bsize, partition_type);
+ assert(blk_params.subsize <= BLOCK_LARGEST);
+ av1_init_rd_stats(sum_rdc);
+ // Allocate the mode contexts of both sub-blocks if not present yet.
+ for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+ if (cur_ctx[i][j][0] == NULL) {
+ cur_ctx[i][j][0] =
+ av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+ if (!cur_ctx[i][j][0])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ }
+ // The running total starts at the cost of signalling the partition type.
+ sum_rdc->rate = part_search_state->partition_cost[partition_type];
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc->rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, partition_type);
+ }
+#endif
+
+ // First sub-partition evaluation in HORZ / VERT partition type.
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 0, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+
+ // Start of second sub-partition evaluation.
+ // Evaluate second sub-partition if the first sub-partition cost
+ // is less than the best cost and if it is not an edge block.
+ if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) {
+ const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[PLANE_TYPE_Y] == 0 &&
+ pmi->palette_size[PLANE_TYPE_UV] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_rect_ctx_is_ready[i] = 1;
+ }
+ // Apply the first sub-block's result (dry run) before the second
+ // sub-block is searched.
+ av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row,
+ blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+ blk_params.subsize, NULL);
+
+ // Second sub-partition evaluation in HORZ / VERT partition type.
+ sub_part_idx = 1;
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+ }
+ // Update HORZ / VERT best partition.
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
+ // Recompute rdcost from the accumulated rate and dist, then re-check.
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist);
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = partition_type;
+ }
+ } else {
+ // Update HORZ / VERT win flag.
+ if (rect_part_win_info != NULL)
+ rect_part_win_info->rect_part_win[i] = false;
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, partition_type,
+ sum_rdc->rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+ blk_params.bsize, av1_num_planes(cm));
+ }
+}
+
+// Evaluates a single AB partition type (part_type) for the current block.
+//
+// The three sub-blocks described by ab_subsize / ab_mi_pos are tested through
+// rd_test_partition3(), which also updates best_rdc. found_best_partition is
+// set when that test reports an improvement, and the context saved in x_ctx
+// is restored before returning.
+static void rd_pick_ab_part(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+    const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+    const MB_MODE_INFO **mode_cache) {
+  // Snapshot the block position and size before the search runs.
+  const int blk_row = part_search_state->part_blk_params.mi_row;
+  const int blk_col = part_search_state->part_blk_params.mi_col;
+  const BLOCK_SIZE blk_size = part_search_state->part_blk_params.bsize;
+  int64_t ab_rdcost = 0;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  {
+    // Start the timer only if the signalling cost alone does not already
+    // exceed the best rd cost.
+    RD_STATS signal_rdc;
+    av1_init_rd_stats(&signal_rdc);
+    signal_rdc.rate = part_search_state->partition_cost[part_type];
+    signal_rdc.rdcost = RDCOST(x->rdmult, signal_rdc.rate, 0);
+    if (best_rdc->rdcost - signal_rdc.rdcost >= 0) {
+      start_partition_block_timer(part_timing_stats, part_type);
+    }
+  }
+#endif
+
+  // Test this partition and update the best partition.
+  const bool ab_part_won = rd_test_partition3(
+      cpi, td, tile_data, tp, pc_tree, best_rdc, &ab_rdcost, dst_ctxs, blk_row,
+      blk_col, blk_size, part_type, ab_subsize, ab_mi_pos, mode_cache);
+  if (ab_part_won) part_search_state->found_best_partition = true;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    if (!ab_part_won) ab_rdcost = INT64_MAX;
+    end_partition_block_timer(part_timing_stats, part_type, ab_rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_row, blk_col, blk_size,
+                      av1_num_planes(&cpi->common));
+}
+
+// Points mode_srch_ctx at the PICK_MODE_CONTEXT slots of previously searched
+// partitions that the AB partition search may copy results from.
+//
+// Entries for HORZ_A / VERT_A refer to PARTITION_SPLIT children and are set
+// only when the matching is_ctx_ready flag is non-zero; all other entries not
+// written here are left untouched.
+static AOM_INLINE void set_mode_search_ctx(
+    PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
+    PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) {
+  // *_A types: first (and for HORZ_A also second) sub-block comes from the
+  // split children, when available.
+  if (is_ctx_ready[HORZ_A][0]) {
+    mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none;
+  }
+  if (is_ctx_ready[HORZ_A][1]) {
+    mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
+  }
+  if (is_ctx_ready[VERT_A][0]) {
+    mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none;
+  }
+
+  // *_B types: first sub-block comes from the rectangular partitions.
+  mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0];
+  mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0];
+}
+
+// Stores in *dst_mode a pointer to the mode info held by ctx, or NULL when
+// ctx is NULL or its rd_stats.rate is not below INT_MAX.
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+    const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) {
+  const int has_result = ctx != NULL && ctx->rd_stats.rate < INT_MAX;
+  *dst_mode = has_result ? &ctx->mic : NULL;
+}
+
+// Stores in *dst_mode the mode info of pc_tree's PARTITION_NONE context, or
+// NULL when pc_tree is NULL (the NONE context itself is validated by
+// copy_partition_mode_from_mode_context()).
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+    const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) {
+  if (pc_tree == NULL) {
+    *dst_mode = NULL;
+    return;
+  }
+  copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+}
+
+// Fills mode_cache[0..2] with the mode info of previously searched partitions
+// that cover the three sub-blocks of the given AB partition type. A slot is
+// set to NULL when no usable result exists for it.
+//
+// Split children supply the two square sub-blocks and the rectangular
+// contexts supply the remaining one; which slot gets which source depends on
+// ab_part_type.
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+    const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree,
+    AB_PART_TYPE ab_part_type) {
+  if (ab_part_type == HORZ_A) {
+    copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+    copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+    copy_partition_mode_from_mode_context(&mode_cache[2],
+                                          pc_tree->horizontal[1]);
+  } else if (ab_part_type == HORZ_B) {
+    copy_partition_mode_from_mode_context(&mode_cache[0],
+                                          pc_tree->horizontal[0]);
+    copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+    copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+  } else if (ab_part_type == VERT_A) {
+    copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+    copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+    copy_partition_mode_from_mode_context(&mode_cache[2],
+                                          pc_tree->vertical[1]);
+  } else if (ab_part_type == VERT_B) {
+    copy_partition_mode_from_mode_context(&mode_cache[0],
+                                          pc_tree->vertical[0]);
+    copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+    copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+  } else {
+    assert(0 && "Invalid ab partition type!\n");
+  }
+}
+
+// AB Partitions type search.
+//
+// Evaluates the AB partition types in the range [start_type, end_type] for
+// the current block. Returns immediately when the partition search has been
+// terminated. Types rejected by av1_prune_ab_partitions() are skipped.
+//
+// For each remaining type the three sub-block contexts are allocated, results
+// of earlier searches are optionally copied in (speed feature
+// reuse_prev_rd_results_for_part_ab) or offered as a mode cache (speed feature
+// reuse_best_prediction_for_part_ab), and the type is evaluated through
+// rd_pick_ab_part(), which updates best_rdc / found_best_partition.
+static void ab_partitions_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ int pb_source_variance, int ext_partition_allowed,
+ const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ if (part_search_state->terminate_partition_search) {
+ return;
+ }
+
+ int ab_partitions_allowed[NUM_AB_PARTS];
+ // Prune AB partitions
+ av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost,
+ rect_part_win_info, ext_partition_allowed,
+ part_search_state, ab_partitions_allowed);
+
+ // Flags to indicate whether the mode search is done.
+ // Rows are indexed by AB_PART_TYPE below; the row contents suggest the
+ // order HORZ_A, HORZ_B, VERT_A, VERT_B -- confirm against the enum.
+ const int is_ctx_ready[NUM_AB_PARTS][2] = {
+ { part_search_state->is_split_ctx_is_ready[0],
+ part_search_state->is_split_ctx_is_ready[1] },
+ { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+ { part_search_state->is_split_ctx_is_ready[0], 0 },
+ { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+ };
+
+ // Current partition context.
+ PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+ pc_tree->horizontalb,
+ pc_tree->verticala,
+ pc_tree->verticalb };
+
+ // Context of already evaluated partition types.
+ PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+ // Set context of already evaluated partition types.
+ // Not every entry is initialized; entries are only read further down when
+ // the corresponding is_ctx_ready flag is set.
+ set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+ // Array of sub-partition size of AB partition types.
+ const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_HORZ_A) },
+ { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 },
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_VERT_A) },
+ { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 }
+ };
+
+ // Array of mi_row, mi_col positions corresponds to each sub-partition in AB
+ // partition types.
+ const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, mi_col } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { mi_row, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+ };
+
+ // Loop over AB partition types.
+ for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+ ab_part_type++) {
+ // Relies on the AB PARTITION_TYPE values following PARTITION_HORZ_A in the
+ // same order as AB_PART_TYPE.
+ const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+ // Check if the AB partition search is to be performed.
+ if (!ab_partitions_allowed[ab_part_type]) {
+ continue;
+ }
+
+ blk_params.subsize = get_partition_subsize(bsize, part_type);
+ for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+ // Set AB partition context.
+ // NOTE(review): the slot is overwritten without checking for an existing
+ // context -- confirm the slots are NULL/freed before this search runs.
+ cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+ cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+ if (!cur_part_ctxs[ab_part_type][i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ // Set mode as not ready.
+ cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+ }
+
+ if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+ // We can copy directly the mode search results if we have already
+ // searched the current block and the contexts match.
+ if (is_ctx_ready[ab_part_type][0]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+ mode_srch_ctx[ab_part_type][0][0]);
+ cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+ // The second sub-block is reused only if the first one was reused too.
+ if (is_ctx_ready[ab_part_type][1]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+ mode_srch_ctx[ab_part_type][1][0]);
+ cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+ }
+ }
+ }
+
+ // Even if the contexts don't match, we can still speed up by reusing the
+ // previous prediction mode.
+ const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL };
+ if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) {
+ set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+ }
+
+ // Evaluation of AB partition type.
+ rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
+ cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
+ ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
+ part_type, mode_cache);
+ }
+}
+
+// Set mi positions for HORZ4 / VERT4 sub-block partitions.
+//
+// inc_step: per-type step between consecutive sub-blocks; inc_step[HORZ4] is
+//           applied to the row and inc_step[VERT4] to the column.
+// mi_pos:   output; mi_pos[i][0] / mi_pos[i][1] receive the mi_row / mi_col
+//           of sub-block i.
+// mi_row, mi_col: position of the first sub-block.
+static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES],
+                                  int mi_pos[SUB_PARTITIONS_PART4][2],
+                                  const int mi_row, const int mi_col) {
+  // The counter is a sub-block index, not a PART4_TYPES value, so it is a
+  // plain int; this also keeps the position arithmetic in signed int.
+  for (int i = 0; i < SUB_PARTITIONS_PART4; i++) {
+    mi_pos[i][0] = mi_row + i * inc_step[HORZ4];
+    mi_pos[i][1] = mi_col + i * inc_step[VERT4];
+  }
+}
+
+// Set context and RD cost for HORZ4 / VERT4 partition types.
+//
+// Resets part_search_state->sum_rdc so that it holds only the cost of
+// signalling partition_type, and allocates one PICK_MODE_CONTEXT per
+// sub-block into cur_part_ctx[]. An allocation failure is reported through
+// aom_internal_error() with AOM_CODEC_MEM_ERROR.
+static void set_4_part_ctx_and_rdcost(
+    MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td,
+    PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
+    BLOCK_SIZE bsize) {
+  // Initialize sum_rdc RD cost structure.
+  av1_init_rd_stats(&part_search_state->sum_rdc);
+  // Kept as BLOCK_SIZE, the type used for sub-block sizes elsewhere in this
+  // file, instead of a plain int.
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition_type);
+  part_search_state->sum_rdc.rate =
+      part_search_state->partition_cost[partition_type];
+  part_search_state->sum_rdc.rdcost =
+      RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
+  // The counter is a sub-block index, not a PART4_TYPES value.
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+    if (!cur_part_ctx[i])
+      aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
+  }
+}
+
+// Partition search of HORZ4 / VERT4 partition types.
+//
+// Allocates the sub-block contexts, seeds sum_rdc with the partition
+// signalling cost and evaluates the sub-blocks in order through
+// rd_try_subblock(). Evaluation stops at the first sub-block (other than the
+// first) that starts at or beyond the frame limit in the split direction, or
+// as soon as rd_try_subblock() fails, in which case sum_rdc is invalidated.
+// If the resulting cost beats best_rdc, best_rdc, found_best_partition and
+// pc_tree->partitioning are updated. The context saved in x_ctx is restored
+// before returning.
+//
+// inc_step: per-type step (in mi units) between consecutive sub-blocks.
+static void rd_pick_4partition(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  // mi positions needed for HORZ4 and VERT4 partition types.
+  int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows,
+                                        cm->mi_params.mi_cols };
+  // 0 for PARTITION_HORZ_4 (check rows), 1 otherwise (check columns).
+  const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4);
+  int mi_pos[SUB_PARTITIONS_PART4][2];
+
+  blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
+  // Set partition context and RD cost.
+  set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state,
+                            partition_type, blk_params.bsize);
+  // Set mi positions for sub-block sizes.
+  set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+#if CONFIG_COLLECT_PARTITION_STATS
+  PartitionTimingStats *part_timing_stats =
+      &part_search_state->part_timing_stats;
+  if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) {
+    start_partition_block_timer(part_timing_stats, partition_type);
+  }
+#endif
+  // Loop over sub-block partitions. The counter is a sub-block index, not a
+  // PART4_TYPES value, so it is a plain int.
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Sub-blocks starting outside the frame are not evaluated; the first
+    // sub-block is always evaluated.
+    if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break;
+
+    // Sub-block evaluation of Horz4 / Vert4 partition type.
+    cur_part_ctx[i]->rd_mode_is_ready = 0;
+    if (!rd_try_subblock(
+            cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+            mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+            &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+      av1_invalid_rd_stats(&part_search_state->sum_rdc);
+      break;
+    }
+  }
+
+  // Calculate the total cost and update the best partition.
+  av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = part_search_state->sum_rdc;
+    part_search_state->found_best_partition = true;
+    pc_tree->partitioning = partition_type;
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    end_partition_block_timer(part_timing_stats, partition_type,
+                              part_search_state->sum_rdc.rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                      blk_params.bsize, av1_num_planes(cm));
+}
+
+// Returns 1 when extended partitions should not be evaluated because the
+// PARTITION_NONE result is skippable, 0 otherwise.
+//
+// Pruning applies only when the speed feature level is at least 1, a
+// PARTITION_NONE context exists, a valid partition is not being forced and
+// the block is BLOCK_16X16 or larger.
+static INLINE int prune_ext_part_none_skippable(
+    PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+    int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+  if (skip_non_sq_part_based_on_none < 1) return 0;
+  if (part_none == NULL) return 0;
+  if (!part_none->skippable) return 0;
+  if (must_find_valid_partition) return 0;
+  return bsize >= BLOCK_16X16;
+}
+
+// Returns non-zero if the AB partition search should run for this block.
+//
+// The search is always allowed while no valid partition has been found
+// (best_rdcost == INT64_MAX). Otherwise the block must allow rectangular
+// splits, be larger than the block-size threshold, lie fully inside the
+// frame, and not have been pruned via prune_ext_part_state.
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+                                     PARTITION_SPEED_FEATURES *part_sf,
+                                     PARTITION_TYPE curr_best_part,
+                                     int must_find_valid_partition,
+                                     int prune_ext_part_state,
+                                     int64_t best_rdcost) {
+  const PartitionBlkParams *const params = &part_search_state->part_blk_params;
+
+  // Do not prune if there is no valid partition
+  if (best_rdcost == INT64_MAX) return 1;
+
+  // Determine bsize threshold to evaluate ab partitions. The threshold is
+  // raised to BLOCK_128X128 when the speed feature is on, a valid partition
+  // is not being forced and the current best is neither HORZ nor VERT.
+  const int best_is_rect =
+      curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT;
+  const int use_max_thresh = part_sf->ext_part_eval_based_on_cur_best &&
+                             !must_find_valid_partition && !best_is_rect;
+  const BLOCK_SIZE ab_bsize_thresh =
+      use_max_thresh ? BLOCK_128X128 : part_sf->ext_partition_eval_thresh;
+
+  // ab partitions are only allowed for square block sizes BLOCK_16X16 or
+  // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+  // BLOCK_8X8.
+  assert(ab_bsize_thresh >= BLOCK_8X8);
+
+  if (!part_search_state->do_rectangular_split) return 0;
+  if (params->bsize <= ab_bsize_thresh) return 0;
+  if (!av1_blk_has_rows_and_cols(params)) return 0;
+  return !prune_ext_part_state;
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+//
+// For each rectangular direction whose 4-way type is still allowed, counts
+// the split sub-blocks in which that direction's rectangular partition won
+// and disables the 4-way type when the count is below a qindex-dependent
+// threshold. Does nothing unless prune_ext_part_using_split_info is set.
+static void prune_4_partition_using_split_info(
+    AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  if (!cpi->sf.part_sf.prune_ext_part_using_split_info) return;
+
+  // 4-way type associated with each rectangular direction.
+  PART4_TYPES part4_of_rect[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+  // Conservative pruning for high quantizers: the threshold is capped at 3.
+  const int min_wins = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+
+  for (RECT_PART_TYPE rect = HORZ; rect < NUM_RECT_PARTS; rect++) {
+    const PART4_TYPES part4 = part4_of_rect[rect];
+    if (!part4_search_allowed[part4]) continue;
+
+    // Count the split sub-blocks where this rectangular direction won.
+    int child_wins = 0;
+    for (int child = 0; child < SUB_PARTITIONS_SPLIT; child++) {
+      if (part_search_state->split_part_rect_win[child].rect_part_win[rect]) {
+        ++child_wins;
+      }
+    }
+
+    if (child_wins < min_wins) part4_search_allowed[part4] = 0;
+  }
+}
+
+// Prune 4-way partition search.
+//
+// Decides which of PARTITION_HORZ_4 / PARTITION_VERT_4 are searched and
+// writes the decision to part4_search_allowed[]. When best_rdc->rdcost is
+// INT64_MAX the function returns before touching the flags, so the values
+// supplied by the caller stand in that case.
+static void prune_4_way_partition_search(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ int pb_source_variance, int prune_ext_part_state,
+ int part4_search_allowed[NUM_PART4_TYPES]) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdc->rdcost == INT64_MAX) return;
+
+ // Determine bsize threshold to evaluate 4-way partitions
+ BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+ if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+ !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+ part4_bsize_thresh = BLOCK_128X128;
+
+ // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+ // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+ // BLOCK_4X4 and BLOCK_8X8.
+ assert(part4_bsize_thresh >= BLOCK_8X8);
+
+ // General eligibility for extended partitions on this block.
+ bool partition4_allowed =
+ part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ // Disable 4-way partition search flags for width less than a multiple of the
+ // minimum partition width.
+ if (blk_params.width < (blk_params.min_partition_size_1d
+ << cpi->sf.part_sf.prune_part4_search)) {
+ part4_search_allowed[HORZ4] = 0;
+ part4_search_allowed[VERT4] = 0;
+ return;
+ }
+
+ PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+ PARTITION_VERT_4 };
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. It starts from the general eligibility
+ // computed above and additionally requires 1:4 partitions to be enabled;
+ // since 128x32 or 32x128 blocks are not allowed, bsize must also not be
+ // BLOCK_128X128.
+ partition4_allowed &=
+ part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
+
+ // partition_rect_allowed[] is indexed with a PART4_TYPES value here, which
+ // assumes HORZ4 / VERT4 line up with HORZ / VERT -- confirm against enums.
+ for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+ part4_search_allowed[i] =
+ partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+ get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ }
+ // Pruning: pruning out 4-way partitions based on the current best partition.
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+ part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ }
+
+ // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of
+ // sub-blocks from basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed &&
+ part_search_state->partition_rect_allowed[HORZ] &&
+ part_search_state->partition_rect_allowed[VERT]) {
+ av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost,
+ part_search_state, part4_search_allowed,
+ pb_source_variance);
+ }
+
+ // Pruning: pruning out 4-way partitions based on the number of horz/vert wins
+ // in the current block and sub-blocks in PARTITION_SPLIT.
+ prune_4_partition_using_split_info(cpi, x, part_search_state,
+ part4_search_allowed);
+}
+
+// Prepares the PARTITION_NONE search for the current block.
+//
+// Makes sure pc_tree->none holds a PICK_MODE_CONTEXT (reporting
+// AOM_CODEC_MEM_ERROR on allocation failure). When PARTITION_NONE is allowed,
+// it also sets *pt_cost for blocks of at least 8x8 and writes to
+// *best_remain_rdcost what is left of best_rdc after the cost of signalling
+// the partition type has been subtracted.
+static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
+                                      MACROBLOCK *x, PC_TREE *pc_tree,
+                                      PartitionSearchState *part_search_state,
+                                      RD_STATS *best_remain_rdcost,
+                                      RD_STATS *best_rdc, int *pt_cost) {
+  const PartitionBlkParams *const params = &part_search_state->part_blk_params;
+
+  // Set PARTITION_NONE context.
+  if (pc_tree->none == NULL) {
+    pc_tree->none = av1_alloc_pmc(cpi, params->bsize, &td->shared_coeff_buf);
+    if (pc_tree->none == NULL) {
+      aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PICK_MODE_CONTEXT");
+    }
+  }
+
+  if (!part_search_state->partition_none_allowed) return;
+
+  // Set PARTITION_NONE type cost; a cost of INT_MAX or more is treated as 0.
+  if (params->bsize_at_least_8x8) {
+    const int none_cost = part_search_state->partition_cost[PARTITION_NONE];
+    *pt_cost = (none_cost < INT_MAX) ? none_cost : 0;
+  }
+
+  // RD stats that carry only the partition signalling rate.
+  RD_STATS signal_rdc;
+  av1_init_rd_stats(&signal_rdc);
+  signal_rdc.rate = *pt_cost;
+  av1_rd_cost_update(x->rdmult, &signal_rdc);
+  av1_rd_stats_subtraction(x->rdmult, best_rdc, &signal_rdc,
+                           best_remain_rdcost);
+}
+
+// Skip other partitions based on PARTITION_NONE rd cost.
+//
+// Two independent checks are made after PARTITION_NONE has been evaluated:
+//  1. On non-intra-only frames, for a non-lossless segment whose
+//     PARTITION_NONE result is skippable, an optional ML breakout is run and
+//     then square and rectangular splits are disabled if best_rdc's dist and
+//     rate are both under size-scaled thresholds.
+//  2. If enabled and the block qualifies, a simple-motion-search based early
+//     termination decision is requested using this_rdc.
+// The ML helpers receive part_search_state and presumably update its flags;
+// their effect is not visible in this function.
+static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PICK_MODE_CONTEXT *ctx_none,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc,
+ unsigned int *pb_source_variance) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ // Only worth checking while at least one kind of split is still enabled.
+ if (!frame_is_intra_only(cm) &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
+ if (use_ml_based_breakout) {
+ av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd,
+ part_search_state);
+ }
+
+ // Adjust dist breakout threshold according to the partition size.
+ // The shift amount shrinks as the block gets larger, so larger blocks get
+ // a larger distortion threshold.
+ const int64_t dist_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc->dist < dist_breakout_thr &&
+ best_rdc->rate < rate_breakout_thr) {
+ part_search_state->do_square_split = 0;
+ part_search_state->do_rectangular_split = 0;
+ }
+ }
+
+ // Early termination: using simple_motion_search features and the
+ // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a
+ // decision on early terminating at PARTITION_NONE.
+ // Requires a shown inter frame, a block of at least 16x16 fully inside the
+ // frame, and valid non-negative rate / rdcost values in this_rdc.
+ if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
+ !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+ av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX &&
+ this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX &&
+ this_rdc->rate >= 0 &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split)) {
+ av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc,
+ part_search_state);
+ }
+}
+
+ // After PARTITION_NONE and PARTITION_SPLIT have been evaluated, either runs
+ // the ML early-termination model or the ML rectangular-partition pruning
+ // model, depending on which speed feature is enabled. Both models receive
+ // best_rdc->rdcost and the per-sub-block split rd costs stored in
+ // part_search_state.
+ static void prune_partitions_after_split(
+     AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+     PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+     int64_t part_none_rd, int64_t part_split_rd) {
+   const AV1_COMMON *const cm = &cpi->common;
+   PartitionBlkParams blk_params = part_search_state->part_blk_params;
+   const BLOCK_SIZE bsize = blk_params.bsize;
+   assert(bsize < BLOCK_SIZES_ALL);
+
+   // Early termination: using the rd costs of PARTITION_NONE and subblocks
+   // from PARTITION_SPLIT to determine an early breakout.
+   if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+       !frame_is_intra_only(cm)) {
+     const int rect_search_pending =
+         !part_search_state->terminate_partition_search &&
+         part_search_state->do_rectangular_split &&
+         (part_search_state->partition_rect_allowed[HORZ] ||
+          part_search_state->partition_rect_allowed[VERT]);
+     if (rect_search_pending) {
+       av1_ml_early_term_after_split(cpi, x, sms_tree, best_rdc->rdcost,
+                                     part_none_rd, part_split_rd,
+                                     part_search_state->split_rd,
+                                     part_search_state);
+     }
+   }
+
+   // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
+   // to prune out rectangular partitions in some directions.
+   if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+       cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm)) {
+     const int any_rect_allowed =
+         part_search_state->partition_rect_allowed[HORZ] ||
+         part_search_state->partition_rect_allowed[VERT];
+     const int any_rect_pruned = part_search_state->prune_rect_part[HORZ] ||
+                                 part_search_state->prune_rect_part[VERT];
+     if (any_rect_allowed && !any_rect_pruned &&
+         !part_search_state->terminate_partition_search) {
+       av1_setup_src_planes(x, cpi->source, blk_params.mi_row,
+                            blk_params.mi_col, av1_num_planes(cm), bsize);
+       av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost,
+                                   part_search_state->none_rd,
+                                   part_search_state->split_rd,
+                                   part_search_state);
+     }
+   }
+ }
+
+ // Compares the pixel area of the current block against its left and above
+ // neighbors. Returns true as soon as one available neighbor covers a
+ // strictly larger area than the current block; returns false when neither
+ // does (or neither neighbor is available).
+ static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk(
+     const MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+   const int own_area = block_size_high[bsize] * block_size_wide[bsize];
+
+   // A neighbor's mode info is only read when that neighbor is available.
+   const bool left_is_larger =
+       xd->left_available &&
+       (block_size_high[xd->left_mbmi->bsize] *
+            block_size_wide[xd->left_mbmi->bsize] >
+        own_area);
+   if (left_is_larger) return true;
+
+   return xd->up_available &&
+          (block_size_high[xd->above_mbmi->bsize] *
+               block_size_wide[xd->above_mbmi->bsize] >
+           own_area);
+ }
+
+ // Sets prune_rect_part[] flags in part_state according to the prediction
+ // mode that PARTITION_NONE selected for the block of size bsize.
+ static AOM_INLINE void prune_rect_part_using_none_pred_mode(
+     const MACROBLOCKD *xd, PartitionSearchState *part_state,
+     PREDICTION_MODE mode, BLOCK_SIZE bsize) {
+   switch (mode) {
+     case DC_PRED:
+     case SMOOTH_PRED:
+       // If the prediction mode of NONE partition is either DC_PRED or
+       // SMOOTH_PRED, it indicates that the current block has less
+       // variation. In this case, HORZ and VERT partitions are pruned if at
+       // least one of left and top neighbor blocks is larger than the
+       // current block.
+       if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) {
+         part_state->prune_rect_part[HORZ] = 1;
+         part_state->prune_rect_part[VERT] = 1;
+       }
+       break;
+     case D67_PRED:
+     case V_PRED:
+     case D113_PRED:
+       // If the prediction mode chosen by NONE partition is close to 90
+       // degrees, it implies a dominant vertical pattern, and the chance of
+       // choosing a vertical rectangular partition is high. Hence,
+       // horizontal partition is pruned in these cases.
+       part_state->prune_rect_part[HORZ] = 1;
+       break;
+     case D157_PRED:
+     case H_PRED:
+     case D203_PRED:
+       // If the prediction mode chosen by NONE partition is close to 180
+       // degrees, it implies a dominant horizontal pattern, and the chance
+       // of choosing a horizontal rectangular partition is high. Hence,
+       // vertical partition is pruned in these cases.
+       part_state->prune_rect_part[VERT] = 1;
+       break;
+     default:
+       // Every other mode leaves the pruning flags untouched.
+       break;
+   }
+ }
+
+ /* PARTITION_NONE search.
+  *
+  * Evaluates coding the whole block (blk_params.bsize) without any further
+  * partitioning and folds the result into the running partition search.
+  *
+  * Flow, as visible in this function:
+  *  - Returns immediately (without restoring the context) when the search
+  *    has been terminated or PARTITION_NONE is not allowed.
+  *  - set_none_partition_params() prepares the context, the partition
+  *    signalling cost (pt_cost) and the remaining rd budget.
+  *  - pick_sb_modes() fills part_search_state->this_rdc for PARTITION_NONE.
+  *  - *pb_source_variance receives x->source_variance.
+  *  - *none_rd (only if none_rd is non-NULL) and part_search_state->none_rd
+  *    receive the rd cost before pt_cost is added.
+  *  - When the returned rate is valid (!= INT_MAX): pt_cost is added for
+  *    blocks of at least 8x8, *part_none_rd receives the resulting rd cost,
+  *    and *best_rdc / pc_tree->partitioning are updated if this is the new
+  *    best. On improvement prune_partitions_after_none() is called.
+  *  - The saved context x_ctx is restored before returning.
+  *
+  * part_none_rd is dereferenced without a NULL check, so callers must pass
+  * a valid pointer; none_rd may be NULL.
+  */
+ static void none_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x,
+ PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->partition_none_allowed)
+ return;
+
+ int pt_cost = 0;
+ RD_STATS best_remain_rdcost;
+ av1_invalid_rd_stats(&best_remain_rdcost);
+
+ // Set PARTITION_NONE context and cost.
+ set_none_partition_params(cpi, td, x, pc_tree, part_search_state,
+ &best_remain_rdcost, best_rdc, &pt_cost);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer start for partition None.
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_remain_rdcost.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_NONE);
+ }
+#endif
+ // PARTITION_NONE evaluation and cost update.
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
+ bsize, pc_tree->none, best_remain_rdcost);
+
+ av1_rd_cost_update(x->rdmult, this_rdc);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer end for partition None.
+ if (part_timing_stats->timer_is_on) {
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ if (this_rdc->rate != INT_MAX) {
+ tmp_rdc.rate = this_rdc->rate;
+ tmp_rdc.dist = this_rdc->dist;
+ tmp_rdc.rdcost = this_rdc->rdcost;
+ if (blk_params.bsize_at_least_8x8) {
+ tmp_rdc.rate += pt_cost;
+ tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
+ }
+ }
+ end_partition_block_timer(part_timing_stats, PARTITION_NONE,
+ tmp_rdc.rdcost);
+ }
+#endif
+ *pb_source_variance = x->source_variance;
+ if (none_rd) *none_rd = this_rdc->rdcost;  // rd cost without pt_cost
+ part_search_state->none_rd = this_rdc->rdcost;
+ if (this_rdc->rate != INT_MAX) {
+ // Record picked ref frame to prune ref frames for other partition types.
+ if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
+ av1_update_picked_ref_frames_mask(
+ x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col);
+ }
+
+ // Calculate the total cost and update the best partition.
+ if (blk_params.bsize_at_least_8x8) {
+ this_rdc->rate += pt_cost;
+ this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
+ }
+ *part_none_rd = this_rdc->rdcost;  // rd cost including pt_cost (if added)
+ if (this_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *this_rdc;
+ part_search_state->found_best_partition = true;
+ if (blk_params.bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ }
+
+ // Disable split and rectangular partition search
+ // based on PARTITION_NONE cost.
+ prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none,
+ part_search_state, best_rdc,
+ pb_source_variance);
+ }
+
+ if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode)
+ prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state,
+ pc_tree->none->mic.mode, bsize);
+ }
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+ /* PARTITION_SPLIT search.
+  *
+  * Recursively searches the four square sub-blocks of the current block and
+  * compares their summed cost against the best partition found so far.
+  *
+  * Flow, as visible in this function:
+  *  - Returns immediately when the search has been terminated or
+  *    do_square_split is cleared.
+  *  - Allocates any missing pc_tree->split[] child node; an allocation
+  *    failure is reported through aom_internal_error().
+  *  - sum_rdc starts from the PARTITION_SPLIT signalling cost; sub-blocks
+  *    are searched in index order for as long as sum_rdc.rdcost stays below
+  *    best_rdc->rdcost. Sub-blocks starting outside the frame are skipped.
+  *  - Each sub-block is searched by av1_rd_pick_partition() with the rd
+  *    budget that remains (best_rdc minus sum_rdc). If that call returns
+  *    false, sum_rdc is invalidated and the loop stops.
+  *  - *part_split_rd always receives sum_rdc.rdcost after the loop.
+  *  - If all four indices were visited and the sum beats best_rdc, best_rdc
+  *    and pc_tree->partitioning are updated. Otherwise, depending on
+  *    less_rectangular_check_level, do_rectangular_split may be cleared
+  *    when PARTITION_NONE produced a valid, lower rd cost.
+  *  - The saved context x_ctx is restored only for the block sizes listed
+  *    in the comment at the end of the function.
+  */
+ static void split_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ RD_STATS sum_rdc = part_search_state->sum_rdc;  // local copy; re-initialized below
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ // Check if partition split is allowed.
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->do_square_split)
+ return;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (pc_tree->split[i] == NULL)
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ // Initialization of this partition RD stats.
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ int idx;  // declared outside the loop: its final value is inspected below
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_SPLIT);
+ }
+#endif
+ // Recursive partition search on 4 sub-blocks.
+ for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost;
+ ++idx) {
+ const int x_idx = (idx & 1) * blk_params.mi_step;
+ const int y_idx = (idx >> 1) * blk_params.mi_step;
+
+ if (mi_row + y_idx >= mi_params->mi_rows ||
+ mi_col + x_idx >= mi_params->mi_cols)
+ continue;
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &part_search_state->split_rd[idx];
+ RD_STATS best_remain_rdcost;
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
+ &best_remain_rdcost);
+
+ int curr_quad_tree_idx = 0;
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx;
+ part_search_state->intra_part_info->quad_tree_idx =
+ 4 * curr_quad_tree_idx + idx + 1;
+ }
+ // Split partition evaluation of corresponding idx.
+ // If the RD cost exceeds the best cost then do not
+ // evaluate other split sub-partitions.
+ SIMPLE_MOTION_DATA_TREE *const sms_tree_split =
+ (sms_tree == NULL) ? NULL : sms_tree->split[idx];
+ if (!av1_rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &part_search_state->this_rdc, best_remain_rdcost,
+ pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode,
+ &part_search_state->split_part_rect_win[idx])) {
+ av1_invalid_rd_stats(&sum_rdc);
+ break;
+ }
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx;  // restore the parent's index
+ }
+
+ sum_rdc.rate += part_search_state->this_rdc.rate;
+ sum_rdc.dist += part_search_state->this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &sum_rdc);
+
+ // Set split ctx as ready for use.
+ if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+ pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_split_ctx_is_ready[idx] = 1;
+ }
+ }
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, PARTITION_SPLIT,
+ sum_rdc.rdcost);
+ }
+#endif
+ const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT);
+
+ // Calculate the total cost and update the best partition.
+ *part_split_rd = sum_rdc.rdcost;
+ if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+ const int partition_none_valid = part_search_state->none_rd > 0;
+ const int partition_none_better =
+ part_search_state->none_rd < sum_rdc.rdcost;
+ part_search_state->do_rectangular_split &=
+ !(partition_none_valid && partition_none_better);
+ }
+ }
+ // Restore the context for the following cases:
+ // 1) Current block size not more than maximum partition size as dry run
+ // encode happens for these cases
+ // 2) Current block size same as superblock size as the final encode
+ // happens for this case
+ if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size)
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+ // The max number of nodes in the partition tree.
+ // The number of leaf nodes is (128x128) / (4x4) = 1024.
+ // Each split node has four children, so the number of all possible parent
+ // nodes is 1 + 4 + 16 + 64 + 256 = 341 (1365 nodes in total).
+#define NUM_NODES 2048
+
+ // Dumps the partition tree rooted at pc_tree to the text file
+ // "<partition_info_path>/partition_tree_sb<sb_counter>_c0".
+ //
+ // File layout (a single line):
+ //   <bsize>,<node count>,<num_configs = 1>,<partitioning of every node>
+ // Nodes are listed in breadth-first order; only PARTITION_SPLIT nodes
+ // contribute children. mi_row and mi_col are unused.
+ //
+ // If the file cannot be opened, the failure is reported through
+ // aom_internal_error() and nothing is written.
+ static void write_partition_tree(AV1_COMP *const cpi,
+                                  const PC_TREE *const pc_tree,
+                                  const BLOCK_SIZE bsize, const int mi_row,
+                                  const int mi_col) {
+   (void)mi_row;
+   (void)mi_col;
+   const char *path = cpi->oxcf.partition_info_path;
+   char filename[256];
+   snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+            cpi->sb_counter, 0);
+   FILE *pfile = fopen(filename, "w");
+   if (pfile == NULL) {
+     // fopen() fails for e.g. a missing directory; writing through a NULL
+     // stream would crash.
+     aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                        "Can't open output file: %s.", filename);
+     // Only reached if aom_internal_error() returns to the caller.
+     return;
+   }
+   fprintf(pfile, "%d", bsize);
+
+   // Write partition type with BFS order.
+   const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+   int q_idx = 0;
+   int last_idx = 1;
+   int num_nodes = 1;
+
+   // First traversal to count the queued nodes (root plus four per split).
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     const PC_TREE *node = tree_node_queue[q_idx];
+     // A NULL entry is skipped instead of being dereferenced.
+     if (node != NULL && node->partitioning == PARTITION_SPLIT) {
+       for (int i = 0; i < 4; ++i) {
+         tree_node_queue[last_idx] = node->split[i];
+         ++last_idx;
+       }
+       num_nodes += 4;
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+   // Despite the name, this is the number of nodes queued above.
+   const int num_leafs = last_idx;
+   fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+   // Write partitions for each node.
+   q_idx = 0;
+   last_idx = 1;
+   num_nodes = 1;
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     const PC_TREE *node = tree_node_queue[q_idx];
+     if (node != NULL) {
+       fprintf(pfile, ",%d", node->partitioning);
+       if (node->partitioning == PARTITION_SPLIT) {
+         for (int i = 0; i < 4; ++i) {
+           tree_node_queue[last_idx] = node->split[i];
+           ++last_idx;
+         }
+         num_nodes += 4;
+       }
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+   fprintf(pfile, "\n");
+
+   fclose(pfile);
+ }
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ // Dumps the partition tree rooted at pc_tree to the text file
+ // "<partition_info_path>/verify_partition_tree_sb<sb_counter>_c<config_id>".
+ //
+ // File layout (a single line):
+ //   <bsize>,<node count>,<num_configs = 1>,<partitioning of every node>
+ // Nodes are listed in breadth-first order; only PARTITION_SPLIT nodes
+ // contribute children and NULL entries are skipped. mi_row and mi_col are
+ // unused.
+ //
+ // If the file cannot be opened, the failure is reported through
+ // aom_internal_error() and nothing is written.
+ static void verify_write_partition_tree(const AV1_COMP *const cpi,
+                                         const PC_TREE *const pc_tree,
+                                         const BLOCK_SIZE bsize,
+                                         const int config_id, const int mi_row,
+                                         const int mi_col) {
+   (void)mi_row;
+   (void)mi_col;
+   const char *path = cpi->oxcf.partition_info_path;
+   char filename[256];
+   snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+            path, cpi->sb_counter, config_id);
+   FILE *pfile = fopen(filename, "w");
+   if (pfile == NULL) {
+     // fopen() fails for e.g. a missing directory; writing through a NULL
+     // stream would crash.
+     aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                        "Can't open output file: %s.", filename);
+     // Only reached if aom_internal_error() returns to the caller.
+     return;
+   }
+   fprintf(pfile, "%d", bsize);
+
+   // Write partition type with BFS order.
+   const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+   int q_idx = 0;
+   int last_idx = 1;
+   int num_nodes = 1;
+
+   // First traversal to count the queued nodes (root plus four per split).
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     const PC_TREE *node = tree_node_queue[q_idx];
+     if (node != NULL && node->partitioning == PARTITION_SPLIT) {
+       for (int i = 0; i < 4; ++i) {
+         tree_node_queue[last_idx] = node->split[i];
+         ++last_idx;
+       }
+       num_nodes += 4;
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+   // Despite the name, this is the number of nodes queued above.
+   const int num_leafs = last_idx;
+   fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+   // Write partitions for each node.
+   q_idx = 0;
+   last_idx = 1;
+   num_nodes = 1;
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     const PC_TREE *node = tree_node_queue[q_idx];
+     if (node != NULL) {  // suppress warning
+       fprintf(pfile, ",%d", node->partitioning);
+       if (node->partitioning == PARTITION_SPLIT) {
+         for (int i = 0; i < 4; ++i) {
+           tree_node_queue[last_idx] = node->split[i];
+           ++last_idx;
+         }
+         num_nodes += 4;
+       }
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+   fprintf(pfile, "\n");
+
+   fclose(pfile);
+ }
+
+ // Reads a partition tree from the text file
+ // "<partition_info_path>/partition_tree_sb<sb_counter>_c<config_id>" and
+ // rebuilds it below pc_tree.
+ //
+ // Expected layout (a single line):
+ //   <bsize>,<node count>,<num_configs>,<partitioning of every node>
+ // with the nodes in breadth-first order. For every PARTITION_SPLIT node
+ // four child nodes are allocated and queued.
+ //
+ // Returns the <num_configs> value read from the file. A missing file, a
+ // malformed or truncated file, an out-of-range partition type, a tree with
+ // more than NUM_NODES nodes, or an allocation failure is reported through
+ // aom_internal_error(); the file is closed first. Should that call return
+ // to this function, 0 is returned.
+ static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+                                struct aom_internal_error_info *error_info,
+                                const int config_id) {
+   const AV1_COMMON *const cm = &cpi->common;
+   const char *path = cpi->oxcf.partition_info_path;
+   char filename[256];
+   snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+            cpi->sb_counter, config_id);
+   FILE *pfile = fopen(filename, "r");
+   if (pfile == NULL) {
+     aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.",
+                        filename);
+     // Never fall through to fscanf() on a NULL stream.
+     return 0;
+   }
+
+   int read_bsize;
+   int num_nodes;
+   int num_configs;
+   // All three header fields must be present, and the node count must fit
+   // the BFS queue below (q_idx advances once per node).
+   if (fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs) != 3 ||
+       num_nodes > NUM_NODES) {
+     fclose(pfile);
+     aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                        "Invalid partition tree header in file: %s.", filename);
+     return 0;
+   }
+   assert(read_bsize == cpi->common.seq_params->sb_size);
+   BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+   assert(bsize == pc_tree->block_size);
+
+   PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+   int last_idx = 1;
+   int q_idx = 0;
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     int partitioning;
+     // Reject truncated input and partition types outside the valid range.
+     if (fscanf(pfile, ",%d", &partitioning) != 1 ||
+         partitioning < PARTITION_NONE ||
+         partitioning >= EXT_PARTITION_TYPES) {
+       fclose(pfile);
+       aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                          "Invalid partition type in file: %s.", filename);
+       return 0;
+     }
+     PC_TREE *node = tree_node_queue[q_idx];
+     if (node != NULL) {
+       node->partitioning = partitioning;
+       bsize = node->block_size;
+     }
+     if (partitioning == PARTITION_SPLIT) {
+       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+       for (int i = 0; i < 4; ++i) {
+         if (node != NULL) {  // Suppress warning
+           // The file must not describe more nodes than the queue can hold.
+           if (last_idx >= NUM_NODES) {
+             fclose(pfile);
+             aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                                "Too many partition tree nodes in file: %s.",
+                                filename);
+             return 0;
+           }
+           node->split[i] = av1_alloc_pc_tree_node(subsize);
+           if (!node->split[i]) {
+             fclose(pfile);
+             aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                                "Failed to allocate PC_TREE");
+             return 0;
+           }
+           node->split[i]->index = i;
+           tree_node_queue[last_idx] = node->split[i];
+           ++last_idx;
+         }
+       }
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+   fclose(pfile);
+
+   return num_configs;
+ }
+ /*
+  * Evaluates the rd cost of one fixed partition decision: the partition
+  * type already stored in pc_tree->partitioning is the only one searched
+  * for the block at (mi_row, mi_col) of size bsize.
+  *
+  *  - Every non-split partition type is dispatched to the matching search
+  *    routine, which fills best_rdc.
+  *  - PARTITION_SPLIT recurses into the four sub-blocks (those starting
+  *    outside the frame are skipped), sums their rate and dist, and adds
+  *    the PARTITION_SPLIT signalling cost.
+  *  - Any other value is reported through aom_internal_error().
+  *
+  * After the search the saved context is restored, a DRY_RUN_NORMAL
+  * encode_sb() is run for blocks smaller than the superblock, and x->rdmult
+  * is set back to the value it had on entry.
+  *
+  * Returns the rd stats of the evaluated partition.
+  */
+ static RD_STATS rd_search_for_fixed_partition(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ RD_STATS best_rdc;
+ av1_invalid_rd_stats(&best_rdc);
+ int sum_subblock_rate = 0;
+ int64_t sum_subblock_dist = 0;
+ PartitionSearchState part_search_state;
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ (void)orig_rdmult;  // NOTE(review): redundant, orig_rdmult is used at the end of this function
+
+ // Set the context.
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ assert(bsize < BLOCK_SIZES_ALL);
+ unsigned int pb_source_variance = UINT_MAX;
+ int64_t part_none_rd = INT64_MAX;
+ int64_t none_rd = INT64_MAX;
+ int inc_step[NUM_PART4_TYPES] = { 0 };
+ if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4;
+ if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ &none_rd, &part_none_rd);
+ break;
+ case PARTITION_HORZ:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, HORZ,
+ HORZ);
+ break;
+ case PARTITION_VERT:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, VERT,
+ VERT);
+ break;
+ case PARTITION_HORZ_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_A, HORZ_A);
+ break;
+ case PARTITION_HORZ_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_B, HORZ_B);
+ break;
+ case PARTITION_VERT_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_A, VERT_A);
+ break;
+ case PARTITION_VERT_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_B, VERT_B);
+ break;
+ case PARTITION_HORZ_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ break;
+ case PARTITION_VERT_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ break;
+ case PARTITION_SPLIT:
+ for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) {
+ const BLOCK_SIZE subsize =
+ get_partition_subsize(bsize, PARTITION_SPLIT);
+ assert(subsize < BLOCK_SIZES_ALL);
+ const int next_mi_row =
+ idx < 2 ? mi_row : mi_row + mi_size_high[subsize];
+ const int next_mi_col =
+ idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize];
+ if (next_mi_row >= cm->mi_params.mi_rows ||
+ next_mi_col >= cm->mi_params.mi_cols) {
+ continue;
+ }
+ const RD_STATS subblock_rdc = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row,  // NOTE(review): sms_tree is dereferenced without a NULL check
+ next_mi_col, subsize, pc_tree->split[idx]);
+ sum_subblock_rate += subblock_rdc.rate;
+ sum_subblock_dist += subblock_rdc.dist;
+ }
+ best_rdc.rate = sum_subblock_rate;
+ best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ best_rdc.dist = sum_subblock_dist;
+ best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist);
+ break;
+ default:
+ assert(0 && "invalid partition type.");
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type.");
+ }
+ // Note: it is necessary to restore context information.
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize != cm->seq_params->sb_size) {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ x->rdmult = orig_rdmult;
+
+ return best_rdc;
+}
+
+ // Fills *features for the block at (mi_row, mi_col) of size bsize before
+ // the partition search starts: motion-search features are collected first,
+ // then the TPL statistics.
+ static void prepare_sb_features_before_search(
+     AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+     int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+   // Step 1: motion-search based features.
+   av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                         bsize, features);
+
+   // Step 2: TPL statistics.
+   collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
+ }
+
+ // Copies the rd cost, distortion and rate of *this_rdcost into *stats.
+ static void update_partition_stats(const RD_STATS *const this_rdcost,
+                                    aom_partition_stats_t *stats) {
+   stats->rdcost = this_rdcost->rdcost;
+   stats->dist = this_rdcost->dist;
+   stats->rate = this_rdcost->rate;
+ }
+
+ // Rebuilds the partition tree below pc_tree from a partition decision.
+ //
+ // partition_decision->partition_decision[] lists the partition type of
+ // every node in breadth-first order and partition_decision->num_nodes gives
+ // the number of entries. For every PARTITION_SPLIT node four child nodes
+ // are allocated and queued.
+ //
+ // A decision with more than NUM_NODES nodes, or one whose splits would
+ // overflow the node queue, is reported through aom_internal_error() with
+ // AOM_CODEC_ERROR. An allocation failure is reported with
+ // AOM_CODEC_MEM_ERROR.
+ static void build_pc_tree_from_part_decision(
+     const aom_partition_decision_t *partition_decision,
+     const BLOCK_SIZE this_bsize, PC_TREE *pc_tree,
+     struct aom_internal_error_info *error_info) {
+   BLOCK_SIZE bsize = this_bsize;
+   int num_nodes = partition_decision->num_nodes;
+   // q_idx advances once per node, so the node count must fit the queue.
+   if (num_nodes > NUM_NODES) {
+     aom_internal_error(error_info, AOM_CODEC_ERROR,
+                        "Invalid partition decision: too many nodes");
+     // Only reached if aom_internal_error() returns to the caller.
+     return;
+   }
+   PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+   int last_idx = 1;
+   int q_idx = 0;
+   tree_node_queue[q_idx] = pc_tree;
+   while (num_nodes > 0) {
+     const int partitioning = partition_decision->partition_decision[q_idx];
+     assert(partitioning >= PARTITION_NONE &&
+            partitioning < EXT_PARTITION_TYPES);
+     PC_TREE *node = tree_node_queue[q_idx];
+     if (node != NULL) {
+       node->partitioning = partitioning;
+       bsize = node->block_size;
+     }
+     if (partitioning == PARTITION_SPLIT) {
+       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+       for (int i = 0; i < 4; ++i) {
+         if (node != NULL) {  // Suppress warning
+           // Never queue more children than tree_node_queue can hold.
+           if (last_idx >= NUM_NODES) {
+             aom_internal_error(error_info, AOM_CODEC_ERROR,
+                                "Invalid partition decision: too many nodes");
+             return;
+           }
+           node->split[i] = av1_alloc_pc_tree_node(subsize);
+           if (!node->split[i]) {
+             aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                                "Failed to allocate PC_TREE");
+             // Do not dereference the failed allocation below.
+             return;
+           }
+           node->split[i]->index = i;
+           tree_node_queue[last_idx] = node->split[i];
+           ++last_idx;
+         }
+       }
+     }
+     --num_nodes;
+     ++q_idx;
+   }
+ }
+
+ // Partition search driven by an external ML model that supplies the whole
+ // decision tree for the superblock.
+ //
+ // The superblock features are sent to the model once. Then, repeatedly, a
+ // partition decision is fetched, turned into a PC_TREE, evaluated with a
+ // dry-run rd search, and its rd stats are reported back. The loop stops at
+ // the decision flagged as final, which is then encoded for real.
+ //
+ // Returns false as soon as the model fails to deliver a valid decision;
+ // returns true after the superblock has been encoded.
+ static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+                                            TileDataEnc *tile_data,
+                                            TokenExtra **tp,
+                                            SIMPLE_MOTION_DATA_TREE *sms_root,
+                                            int mi_row, int mi_col,
+                                            const BLOCK_SIZE bsize) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCK *const x = &td->mb;
+   ExtPartController *const controller = &cpi->ext_part_controller;
+   struct aom_internal_error_info *error_info = x->e_mbd.error_info;
+
+   aom_partition_features_t features;
+   prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                     &features);
+   features.mi_row = mi_row;
+   features.mi_col = mi_col;
+   features.frame_width = cpi->frame_info.frame_width;
+   features.frame_height = cpi->frame_info.frame_height;
+   features.block_size = bsize;
+   av1_ext_part_send_features(controller, &features);
+
+   // rd mode search (dry run) for a valid partition decision from the ml model.
+   aom_partition_decision_t decision;
+   for (;;) {
+     if (!av1_ext_part_get_partition_decision(controller, &decision))
+       return false;
+
+     // First, let's take the easy approach.
+     // We require that the ml model has to provide partition decisions for the
+     // whole superblock.
+     td->pc_root = av1_alloc_pc_tree_node(bsize);
+     if (!td->pc_root)
+       aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                          "Failed to allocate PC_TREE");
+     build_pc_tree_from_part_decision(&decision, bsize, td->pc_root,
+                                      error_info);
+
+     const RD_STATS trial_rdcost = rd_search_for_fixed_partition(
+         cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+     aom_partition_stats_t stats;
+     update_partition_stats(&trial_rdcost, &stats);
+     av1_ext_part_send_partition_stats(controller, &stats);
+
+     // The tree of the final decision is kept for the real encode below.
+     if (decision.is_final_decision) break;
+
+     av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                                cpi->sf.part_sf.partition_search_type);
+     td->pc_root = NULL;
+   }
+
+   // Encode with the selected mode and partition.
+   set_cb_offsets(x->cb_offset, 0, 0);
+   encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+             td->pc_root, NULL);
+   av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                              cpi->sf.part_sf.partition_search_type);
+   td->pc_root = NULL;
+
+   return true;
+ }
+
+ // Use a bitmask to represent the valid partition types for the current
+ // block. "1" represents the corresponding partition type is valid.
+ // The least significant bit represents "PARTITION_NONE", the
+ // most significant used bit (bit 9) represents "PARTITION_VERT_4", follow
+ // the enum order for PARTITION_TYPE in "enums.h"
+ //
+ // Returns the bitmask. The AB and 4-way types additionally require that
+ // rectangular splits are enabled, that the block has both rows and columns
+ // inside the frame, and that the matching rectangular direction is allowed.
+ static int get_valid_partition_types(
+     const AV1_COMP *const cpi,
+     const PartitionSearchState *const part_search_state,
+     const BLOCK_SIZE bsize) {
+   const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+   const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+   int valid_types = 0;
+   // PARTITION_NONE
+   valid_types |= (part_search_state->partition_none_allowed << 0);
+   // PARTITION_HORZ
+   valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+   // PARTITION_VERT
+   valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+   // PARTITION_SPLIT
+   valid_types |= (part_search_state->do_square_split << 3);
+   // PARTITION_HORZ_A
+   const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+                                     av1_blk_has_rows_and_cols(&blk_params);
+   const int horzab_partition_allowed =
+       ext_partition_allowed && part_cfg->enable_ab_partitions &&
+       part_search_state->partition_rect_allowed[HORZ];
+   valid_types |= (horzab_partition_allowed << 4);
+   // PARTITION_HORZ_B
+   valid_types |= (horzab_partition_allowed << 5);
+   // PARTITION_VERT_A
+   const int vertab_partition_allowed =
+       ext_partition_allowed && part_cfg->enable_ab_partitions &&
+       part_search_state->partition_rect_allowed[VERT];
+   valid_types |= (vertab_partition_allowed << 6);
+   // PARTITION_VERT_B
+   valid_types |= (vertab_partition_allowed << 7);
+   // PARTITION_HORZ_4
+   const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                  ext_partition_allowed &&
+                                  bsize != BLOCK_128X128;
+   const int horz4_allowed =
+       partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+       get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+                            part_search_state->ss_x,
+                            part_search_state->ss_y) != BLOCK_INVALID;
+   valid_types |= (horz4_allowed << 8);
+   // PARTITION_VERT_4
+   // The vertical 4-way split depends on the VERT direction being allowed,
+   // mirroring the HORZ/VERT pairing used for the AB types above.
+   const int vert4_allowed =
+       partition4_allowed && part_search_state->partition_rect_allowed[VERT] &&
+       get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+                            part_search_state->ss_x,
+                            part_search_state->ss_y) != BLOCK_INVALID;
+   valid_types |= (vert4_allowed << 9);
+
+   return valid_types;
+ }
+
+ // Sums the TPL statistics covering the block at (mi_row, mi_col) of size
+ // bsize, clipped to the frame boundaries, and stores the totals in
+ // *intra_cost, *inter_cost and *mc_dep_cost.
+ //
+ // The three outputs are left untouched when the current frame is an
+ // (internal) overlay update, when the TPL data is not ready, or when
+ // max_layer_depth_allowed is 0.
+ static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+                                     const BLOCK_SIZE bsize, const int mi_row,
+                                     const int mi_col, int64_t *intra_cost,
+                                     int64_t *inter_cost, int64_t *mc_dep_cost) {
+   const AV1_COMMON *const cm = &cpi->common;
+   GF_GROUP *gf_group = &cpi->ppi->gf_group;
+   const int update_type = gf_group->update_type[cpi->gf_frame_index];
+   if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+     return;
+
+   TplParams *const tpl_data = &cpi->ppi->tpl_data;
+   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+   // If tpl stats is not established, early return
+   if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) return;
+
+   const int tpl_stride = tpl_frame->stride;
+   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+   // Clip the block to the part that lies inside the frame.
+   const int mi_width =
+       AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+   const int mi_height =
+       AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+   const int row_end = mi_row + mi_height;
+   const int col_end = mi_col + mi_width;
+
+   int64_t intra_total = 0;
+   int64_t inter_total = 0;
+   int64_t mc_dep_total = 0;
+   for (int r = mi_row; r < row_end; r += step) {
+     for (int c = mi_col; c < col_end; c += step) {
+       const int pos = av1_tpl_ptr_pos(r, c, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2);
+       TplDepStats *entry = &tpl_stats[pos];
+       intra_total += entry->intra_cost;
+       inter_total += entry->inter_cost;
+       mc_dep_total += RDCOST(tpl_frame->base_rdmult, entry->mc_dep_rate,
+                              entry->mc_dep_dist);
+     }
+   }
+
+   *intra_cost = intra_total;
+   *inter_cost = inter_total;
+   *mc_dep_cost = mc_dep_total;
+ }
+/* Builds the partition of one block under the control of the external
+ * partition controller (cpi->ext_part_controller).
+ *
+ * For the block at (mi_row, mi_col) of size bsize this function gathers block
+ * features, sends them to the controller, fetches a partition decision and
+ * evaluates it: PARTITION_SPLIT recurses into the four quadrants, any other
+ * decision is evaluated with rd_search_for_fixed_partition(). The resulting
+ * stats are reported back to the controller, and the whole step repeats until
+ * the controller marks its decision as final.
+ *
+ * The chosen partition type is stored in pc_tree->partitioning and the RD
+ * stats of the evaluated decision are written to this_rdcost.
+ *
+ * Returns false if the block origin is outside the frame or if no valid
+ * decision could be obtained from the controller; returns true otherwise.
+ */
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ PC_TREE *pc_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+ return false;  // Block origin lies outside the frame.
+ }
+ aom_partition_decision_t partition_decision;
+ do {
+ PartitionSearchState part_search_state;
+ // Initialization of state variables used in partition search.
+ // TODO(chengchen): check if there is hidden conditions that don't allow
+ // all possible partition types.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+ mi_col, bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+ const int orig_rdmult = x->rdmult;  // Restored below once rdmult is sampled.
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ const int valid_partition_types =
+ get_valid_partition_types(cpi, &part_search_state, bsize);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+ cm->quant_params.base_qindex);
+ // RD multiplier as left in x->rdmult by setup_block_rdmult() above; it is
+ // only reported as a feature here.
+ const int rdmult = x->rdmult;
+ // pyramid level
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ x->rdmult = orig_rdmult;
+ // Neighbor information. A missing neighbor is reported with BLOCK_INVALID
+ // as its size and -1 as its width/height.
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize =
+ has_above ? xd->above_mbmi->bsize : BLOCK_INVALID;
+ const BLOCK_SIZE left_bsize =
+ has_left ? xd->left_mbmi->bsize : BLOCK_INVALID;
+ const int above_block_width =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize];
+ const int above_block_height =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize];
+ const int left_block_width =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize];
+ const int left_block_height =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_high[left_bsize];
+ // Prepare simple motion search stats as features. The -1 initializers
+ // convert to UINT_MAX because the variables are unsigned.
+ unsigned int block_sse = -1;
+ unsigned int block_var = -1;
+ unsigned int sub_block_sse[4] = { -1, -1, -1, -1 };
+ unsigned int sub_block_var[4] = { -1, -1, -1, -1 };
+ unsigned int horz_block_sse[2] = { -1, -1 };
+ unsigned int horz_block_var[2] = { -1, -1 };
+ unsigned int vert_block_sse[2] = { -1, -1 };
+ unsigned int vert_block_var[2] = { -1, -1 };
+ av1_prepare_motion_search_features_block(
+ cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types,
+ &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse,
+ horz_block_var, vert_block_sse, vert_block_var);
+ // Prepare tpl stats for the current block as features. The values stay at
+ // -1 when prepare_tpl_stats_block() returns early without writing them.
+ int64_t tpl_intra_cost = -1;
+ int64_t tpl_inter_cost = -1;
+ int64_t tpl_mc_dep_cost = -1;
+ prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost,
+ &tpl_inter_cost, &tpl_mc_dep_cost);
+
+ aom_partition_features_t features;
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ features.valid_partition_types = valid_partition_types;
+ features.update_type = update_type;
+ features.qindex = qindex;
+ features.rdmult = rdmult;
+ features.pyramid_level = pyramid_level;
+ features.has_above_block = has_above;
+ features.above_block_width = above_block_width;
+ features.above_block_height = above_block_height;
+ features.has_left_block = has_left;
+ features.left_block_width = left_block_width;
+ features.left_block_height = left_block_height;
+ features.block_sse = block_sse;
+ features.block_var = block_var;
+ for (int i = 0; i < 4; ++i) {
+ features.sub_block_sse[i] = sub_block_sse[i];
+ features.sub_block_var[i] = sub_block_var[i];
+ }
+ for (int i = 0; i < 2; ++i) {
+ features.horz_block_sse[i] = horz_block_sse[i];
+ features.horz_block_var[i] = horz_block_var[i];
+ features.vert_block_sse[i] = vert_block_sse[i];
+ features.vert_block_var[i] = vert_block_var[i];
+ }
+ features.tpl_intra_cost = tpl_intra_cost;
+ features.tpl_inter_cost = tpl_inter_cost;
+ features.tpl_mc_dep_cost = tpl_mc_dep_cost;
+ av1_ext_part_send_features(ext_part_controller, &features);
+ const bool valid_decision = av1_ext_part_get_partition_decision(
+ ext_part_controller, &partition_decision);
+ if (!valid_decision) return false;  // No valid decision was obtained.
+ pc_tree->partitioning = partition_decision.current_decision;
+
+ av1_init_rd_stats(this_rdcost);
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ RD_STATS split_rdc[SUB_PARTITIONS_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ av1_init_rd_stats(&split_rdc[i]);
+ if (pc_tree->split[i] == NULL)  // Reuse a child node if one exists.
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ const int orig_rdmult_tmp = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ // TODO(chengchen): check boundary conditions. Note that the return values
+ // of the four recursive calls below are currently ignored.
+ // top-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
+ mi_row, mi_col, subsize, &split_rdc[0]);
+ // top-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1],
+ mi_row, mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[1]);
+ // bottom-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+ mi_row + mi_size_high[subsize], mi_col, subsize,
+ &split_rdc[2]);
+ // bottom-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+ mi_row + mi_size_high[subsize],
+ mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[3]);
+ this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ // Known issue: the rdmult used to combine the costs here is different from
+ // the rdmult used inside each sub-block.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ this_rdcost->rate += split_rdc[i].rate;
+ this_rdcost->dist += split_rdc[i].dist;
+ av1_rd_cost_update(x->rdmult, this_rdcost);
+ }
+ x->rdmult = orig_rdmult_tmp;
+ } else {
+ *this_rdcost = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+ }
+
+ aom_partition_stats_t stats;
+ update_partition_stats(this_rdcost, &stats);
+ av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+ if (!partition_decision.is_final_decision) {  // Another decision follows.
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {  // Discard the trial split subtree.
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0,
+ 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+ }
+ } while (!partition_decision.is_final_decision);
+
+ return true;
+}
+
+// Recursive decision mode: the ML model only needs to make a decision for the
+// current block each time it is queried. Superblock-level features are sent
+// first, then the partition tree is built block by block and, if that
+// succeeds, the superblock is encoded with the resulting tree.
+// Returns false when recursive_partition() does not yield a valid partition.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+                                        TileDataEnc *tile_data, TokenExtra **tp,
+                                        SIMPLE_MOTION_DATA_TREE *sms_root,
+                                        int mi_row, int mi_col,
+                                        const BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &td->mb;
+
+  // Send the superblock-level features before any decision is requested.
+  aom_partition_features_t sb_features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &sb_features);
+  sb_features.mi_row = mi_row;
+  sb_features.mi_col = mi_col;
+  sb_features.frame_width = cpi->frame_info.frame_width;
+  sb_features.frame_height = cpi->frame_info.frame_height;
+  sb_features.block_size = bsize;
+  av1_ext_part_send_features(&cpi->ext_part_controller, &sb_features);
+
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (td->pc_root == NULL)
+    aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+
+  RD_STATS sb_rdcost;
+  const bool tree_is_valid =
+      recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
+                          mi_col, bsize, &sb_rdcost);
+  if (tree_is_valid) {
+    // Encode with the selected mode and partition.
+    set_cb_offsets(x->cb_offset, 0, 0);
+    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+              td->pc_root, NULL);
+    av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(&cpi->common), 0, 0,
+                               cpi->sf.part_sf.partition_search_type);
+    td->pc_root = NULL;
+  }
+
+  return tree_is_valid;
+}
+
+// Superblock partition search driven either by the external partition
+// controller or by partition configurations read through
+// read_partition_tree().
+//
+// When the external controller is ready, the search is delegated to the
+// whole-tree or the recursive ML search according to its decision mode; an
+// invalid search raises an internal error. Otherwise every configuration is
+// evaluated with rd_search_for_fixed_partition(), the one with the smallest
+// rdcost is copied to best_rd_cost, and the superblock is encoded with it.
+// Returns false only for an unknown decision mode, true otherwise.
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+                             TileDataEnc *tile_data, TokenExtra **tp,
+                             SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+                             int mi_col, const BLOCK_SIZE bsize,
+                             RD_STATS *best_rd_cost) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (cpi->ext_part_controller.ready) {
+    bool search_ok = true;
+    switch (av1_get_ext_part_decision_mode(&cpi->ext_part_controller)) {
+      case AOM_EXT_PART_WHOLE_TREE:
+        search_ok = ml_partition_search_whole_tree(
+            cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+        break;
+      case AOM_EXT_PART_RECURSIVE:
+        search_ok = ml_partition_search_partial(
+            cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+        break;
+      default: assert(0 && "Unknown decision mode."); return false;
+    }
+    if (!search_ok) {
+      aom_internal_error(
+          cm->error, AOM_CODEC_ERROR,
+          "Invalid search from ML model, partition search failed");
+    }
+    return true;
+  }
+
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t lowest_rdcost = INT64_MAX;
+  int winner = 0;
+  int config_total;
+  int cfg = 0;
+  for (;;) {
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (td->pc_root == NULL)
+      aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+    config_total = read_partition_tree(cpi, td->pc_root, xd->error_info, cfg);
+    if (config_total <= 0) {
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                                 cpi->sf.part_sf.partition_search_type);
+      td->pc_root = NULL;
+      aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs.");
+    }
+    verify_write_partition_tree(cpi, td->pc_root, bsize, cfg, mi_row, mi_col);
+    // The per-configuration cost array is sized on the first pass.
+    if (cfg == 0) {
+      AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost,
+                          aom_calloc(config_total, sizeof(*x->rdcost)));
+    }
+    // Encode the block with the given partition tree. Get rdcost and encoding
+    // time.
+    x->rdcost[cfg] = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+
+    // Track the cheapest configuration seen so far.
+    if (x->rdcost[cfg].rdcost < lowest_rdcost) {
+      lowest_rdcost = x->rdcost[cfg].rdcost;
+      winner = cfg;
+      *best_rd_cost = x->rdcost[cfg];
+    }
+    av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                               cpi->sf.part_sf.partition_search_type);
+    td->pc_root = NULL;
+    ++cfg;
+    if (cfg >= config_total) break;
+  }
+
+  aom_free(x->rdcost);
+  x->rdcost = NULL;
+
+  // Encode with the partition configuration with the smallest rdcost.
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (td->pc_root == NULL)
+    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+  read_partition_tree(cpi, td->pc_root, xd->error_info, winner);
+  rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, bsize, td->pc_root);
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
+  ++cpi->sb_counter;
+
+  return true;
+}
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+
+// Decides whether the block at index curr_block_index within its parent
+// should be reconstructed with a dry-run encode.
+static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
+    BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
+    BLOCK_SIZE bsize) {
+  const BLOCK_SIZE sub_sb_size =
+      get_partition_subsize(sb_size, PARTITION_SPLIT);
+  const bool is_fourth_sub_block = (curr_block_index == 3);
+
+  // Blocks larger than the maximum partition size are never reconstructed.
+  if (bsize > max_partition_size) return false;
+
+  // The first three sub-blocks are always reconstructed.
+  if (!is_fourth_sub_block) return true;
+
+  // The 4th sub-block is reconstructed with dry-run only if its parent
+  // block's reconstruction with dry-run is skipped. If max_partition_size is
+  // the same as the immediate split of the superblock, reconstruction of the
+  // 4th sub-block is avoided, as this data is not consumed.
+  if (bsize != max_partition_size) return false;
+  return sub_sb_size != max_partition_size;
+}
+
+// Returns, through var_min and var_max, the minimum and maximum log
+// variances of the 4x4 sub-blocks of plane 0 of the source within the
+// current block. Each output is log1p(v / 16.0) of the smallest / largest
+// normalized 4x4 variance v found.
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                              double *var_min, double *var_max) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int src_stride = x->plane[0].src.stride;
+
+  // A negative edge distance means the block extends past that edge; the
+  // scanned area is reduced by that amount (edge distance >> 3).
+  int right_overflow = 0;
+  if (xd->mb_to_right_edge < 0) right_overflow = (-xd->mb_to_right_edge) >> 3;
+  int bottom_overflow = 0;
+  if (xd->mb_to_bottom_edge < 0)
+    bottom_overflow = (-xd->mb_to_bottom_edge) >> 3;
+  const int scan_w = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int scan_h = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  // Start the minimum at a large value and the maximum at 0.
+  double lowest = (double)INT_MAX;
+  double highest = 0.0;
+
+  for (int row = 0; row < scan_h; row += MI_SIZE) {
+    for (int col = 0; col < scan_w; col += MI_SIZE) {
+      // Variance of the 4x4 sub-block whose top-left sample is (row, col).
+      const int var_4x4 = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          x->plane[0].src.buf + (row * src_stride) + col, src_stride, is_hbd);
+
+      // Record min and max for the over-arching block.
+      if (!(lowest < var_4x4)) lowest = var_4x4;
+      if (!(highest > var_4x4)) highest = var_4x4;
+    }
+  }
+
+  *var_min = log1p(lowest / 16.0);
+  *var_max = log1p(highest / 16.0);
+}
+
+// Records the partition type in the simple motion data tree node; does
+// nothing when no node is supplied.
+static AOM_INLINE void set_sms_tree_partitioning(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+  if (sms_tree != NULL) {
+    sms_tree->partitioning = partition;
+  }
+}
+
+/*!\brief AV1 block partition search (full search).
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * Searches for the best partition pattern for a block based on the
+ * rate-distortion cost, and returns a bool value to indicate whether a valid
+ * partition pattern is found. The partition can recursively go down to the
+ * smallest block size.
+ *
+ * The search stages run in this order: PARTITION_NONE, PARTITION_SPLIT,
+ * rectangular partitions (HORZ/VERT), AB partitions, and finally the 4-way
+ * partitions (HORZ_4/VERT_4). If no valid partition is found for a
+ * superblock-sized block, the search is repeated from the PARTITION_NONE stage
+ * with x->must_find_valid_partition set to 1.
+ *
+ * When a valid partition is found for a superblock-sized block, the superblock
+ * is encoded (as a dry run if multi_pass_mode is SB_DRY_PASS) and the whole
+ * PC_TREE is freed, with td->pc_root reset to NULL. For smaller blocks most
+ * PC_TREE nodes are freed and only the nodes for the best partition and
+ * PARTITION_NONE are kept.
+ *
+ * \param[in]     cpi                Top-level encoder structure
+ * \param[in]     td                 Pointer to thread data
+ * \param[in]     tile_data          Pointer to struct holding adaptive
+ *                                   data/contexts/models for the tile during
+ *                                   encoding
+ * \param[in]     tp                 Pointer to the starting token
+ * \param[in]     mi_row             Row coordinate of the block in a step size
+ *                                   of MI_SIZE
+ * \param[in]     mi_col             Column coordinate of the block in a step
+ *                                   size of MI_SIZE
+ * \param[in]     bsize              Current block size
+ * \param[out]    rd_cost            Pointer to the final rd cost of the block
+ * \param[in]     best_rdc           Upper bound of rd cost of a valid
+ *                                   partition (passed by value)
+ * \param[in,out] pc_tree            Pointer to the PC_TREE node storing the
+ *                                   picked partitions and mode info for the
+ *                                   current block
+ * \param[in]     sms_tree           Pointer to struct holding simple motion
+ *                                   search data for the current block
+ * \param[out]    none_rd            Pointer to the rd cost in the case of not
+ *                                   splitting the current block; may be NULL,
+ *                                   otherwise it is reset to 0 on entry
+ * \param[in]     multi_pass_mode    SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+ * \param[in]     rect_part_win_info Pointer to struct storing whether
+ *                                   horz/vert partition outperforms previously
+ *                                   tested partitions
+ *
+ * \return A bool value is returned indicating if a valid partition is found.
+ * The pc_tree struct is modified to store the picked partition and modes.
+ * The rd_cost struct is also updated with the RD stats corresponding to the
+ * best partition found.
+ */
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TokenExtra *const tp_orig = *tp;
+ PartitionSearchState part_search_state;
+
+ // Initialization of state variables used in partition search.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+ set_sms_tree_partitioning(sms_tree, PARTITION_NONE);  // No-op if NULL.
+ if (best_rdc.rdcost < 0) {  // Negative rd bound: skip the search.
+ av1_invalid_rd_stats(rd_cost);
+ return part_search_state.found_best_partition;
+ }
+ if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
+
+ // Reset the optional PARTITION_NONE rd cost output before searching.
+ if (none_rd) *none_rd = 0;
+ (void)*tp_orig;  // tp_orig is otherwise only referenced by an assert below.
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Stats at the current quad tree
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state.part_timing_stats;
+ // Stats aggregated at frame level
+ FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ // Disable rectangular partitions for inner blocks when the current block is
+ // forced to only use square partitions.
+ if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+ part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+ part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block). Setting it to a fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->txfm_search_info.blk_skip, 0x77,
+ sizeof(x->txfm_search_info.blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ // Set buffers and offsets.
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ if (bsize == cm->seq_params->sb_size) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ x->intra_sb_rdmult_modifier = 128;  // Lowered below by at most 48.
+ if ((var_min < 2.0) && (var_max > 4.0)) {
+ if ((var_max - var_min) > 8.0) {
+ x->intra_sb_rdmult_modifier -= 48;
+ } else {
+ x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6);
+ }
+ }
+ }
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ // Apply simple motion search for the entire super block with fixed block
+ // size, e.g., 16x16, to collect features and write to files for the
+ // external ML model.
+ // TODO(chengchen): reduce motion search. This function is similar to
+ // av1_get_max_min_partition_features().
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+ bsize == cm->seq_params->sb_size) {
+ av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+ bsize, /*features=*/NULL);
+ collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+ }
+
+ // Update rd cost of the bound using the current multiplier.
+ av1_rd_cost_update(x->rdmult, &best_rdc);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ // Set the context.
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_prune_partitions_time);
+#endif
+ // Pruning: before searching any partition type, using source and simple
+ // motion search results to prune out unlikely partitions.
+ av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
+
+ // Pruning: eliminating partition types leading to coding block sizes outside
+ // the min and max bsize limitations set from the encoder.
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+ // Partition search. Execution jumps back to this label when a
+ // superblock-sized block ends the search without a valid partition.
+BEGIN_PARTITION_SEARCH:
+ // If a valid partition is required, usually when the first round cannot find
+ // a valid one under the cost limit after pruning, reset the limitations on
+ // partition types and intra cnn output.
+ if (x->must_find_valid_partition) {
+ reset_part_limitations(cpi, &part_search_state);
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+ // Invalidate intra cnn output for key frames.
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+ part_search_state.intra_part_info->quad_tree_idx = 0;
+ part_search_state.intra_part_info->cnn_output_valid = 0;
+ }
+ }
+ // Partition block source pixel variance. UINT_MAX marks it as not yet
+ // computed.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, none_partition_search_time);
+#endif
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16);
+ const bool prune_rect_part_using_4x4_var_deviation =
+ (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation &&
+ !x->must_find_valid_partition);
+
+ if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ // Further pruning or in some cases reverse pruning when allintra is set.
+ // This code helps visual and in some cases metrics quality where the
+ // current block comprises at least one very low variance sub-block and at
+ // least one where the variance is much higher.
+ //
+ // The idea is that in such cases there is danger of ringing and other
+ // visual artifacts from a high variance feature such as an edge into a
+ // very low variance region.
+ //
+ // The approach taken is to force break down / split to a smaller block
+ // size to try and separate out the low variance and well predicted blocks
+ // from the more complex ones and to prevent propagation of ringing over a
+ // large region.
+ if (bsize_at_least_16x16 && (var_min < 0.272) &&
+ ((var_max - var_min) > 3.0)) {
+ part_search_state.partition_none_allowed = 0;
+ part_search_state.terminate_partition_search = 0;
+ part_search_state.do_square_split = 1;
+ } else if (prune_rect_part_using_4x4_var_deviation &&
+ (var_max - var_min < 3.0)) {
+ // Prune rectangular partitions if the variance deviation of 4x4
+ // sub-blocks within the block is less than a threshold (derived
+ // empirically).
+ part_search_state.do_rectangular_split = 0;
+ }
+ }
+ }
+
+ // PARTITION_NONE search stage.
+ int64_t part_none_rd = INT64_MAX;
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ none_rd, &part_none_rd);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, split_partition_search_time);
+#endif
+ // PARTITION_SPLIT search stage.
+ int64_t part_split_rd = INT64_MAX;
+ split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, multi_pass_mode,
+ &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, split_partition_search_time);
+#endif
+ // Terminate partition search for child partition,
+ // when NONE and SPLIT partition rd_costs are INT64_MAX.
+ if (cpi->sf.part_sf.early_term_after_none_split &&
+ part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+ !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
+ part_search_state.terminate_partition_search = 1;
+ }
+
+ // Do not evaluate non-square partitions if NONE partition did not choose a
+ // newmv mode and is skippable.
+ if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
+ (pc_tree->none != NULL)) {
+ if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
+ !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
+ pc_tree->none->skippable && !x->must_find_valid_partition &&
+ bsize >= BLOCK_16X16)
+ part_search_state.do_rectangular_split = 0;
+ }
+
+ // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
+ prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
+ part_none_rd, part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rectangular_partition_search_time);
+#endif
+ // Rectangular partitions search stage.
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc,
+ rect_part_win_info, HORZ, VERT);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rectangular_partition_search_time);
+#endif
+
+ if (pb_source_variance == UINT_MAX) {  // Still at its initial value.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ pb_source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+ }
+
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state.do_rectangular_split));
+
+ const int prune_ext_part_state = prune_ext_part_none_skippable(
+ pc_tree->none, x->must_find_valid_partition,
+ cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize);
+
+ const int ab_partition_allowed = allow_ab_partition_search(
+ &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning,
+ x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, ab_partitions_search_time);
+#endif
+ // AB partitions search stage.
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, rect_part_win_info,
+ pb_source_variance, ab_partition_allowed, HORZ_A,
+ VERT_B);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, ab_partitions_search_time);
+#endif
+
+ // 4-way partitions search stage.
+ int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
+ // Prune 4-way partition search.
+ prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
+ pb_source_variance, prune_ext_part_state,
+ part4_search_allowed);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_4partition_time);
+#endif
+ // PARTITION_HORZ_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[HORZ4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[HORZ4]) {
+ const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
+ 0 };
+ // Evaluation of Horz4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ }
+
+ // PARTITION_VERT_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[VERT4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[VERT4] && blk_params.has_cols) {
+ const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
+ 4 };
+ // Evaluation of Vert4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+ if (bsize == cm->seq_params->sb_size &&
+ !part_search_state.found_best_partition) {
+ // Did not find a valid partition, go back and search again, with less
+ // constraint on which partition types to search.
+ x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+ fr_part_timing_stats->partition_redo += 1;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+ goto BEGIN_PARTITION_SEARCH;
+ }
+
+ // Store the final rd cost
+ *rd_cost = best_rdc;
+
+ // Also record the best partition in simple motion data tree because it is
+ // necessary for the related speed features.
+ set_sms_tree_partitioning(sms_tree, pc_tree->partitioning);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+ part_timing_stats->partition_decisions[pc_tree->partitioning] += 1;
+ }
+
+ // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+ // prediction block.
+ print_partition_timing_stats_with_rdcost(
+ part_timing_stats, mi_row, mi_col, bsize,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
+ const bool print_timing_stats = false;
+ if (print_timing_stats) {
+ print_partition_timing_stats(part_timing_stats, cm->show_frame,
+ frame_is_intra_only(cm), bsize,
+ "part_timing_data.csv");
+ }
+ // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+ // the whole clip. So we need to pass the information upstream to the encoder.
+ accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats,
+ bsize);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Reset the PC_TREE deallocation flag.
+ int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (part_search_state.found_best_partition) {
+ if (bsize == cm->seq_params->sb_size) {
+ // Encode the superblock.
+ const int emit_output = multi_pass_mode != SB_DRY_PASS;
+ const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+ // Write partition tree to file. Not used by default.
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+ write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+ ++cpi->sb_counter;
+ }
+
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+ pc_tree, NULL);
+ assert(pc_tree == td->pc_root);
+ // Dealloc the whole PC_TREE after a superblock is done.
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree = NULL;
+ td->pc_root = NULL;
+ pc_tree_dealloc = 1;
+ } else if (should_do_dry_run_encode_for_current_block(
+ cm->seq_params->sb_size, x->sb_enc.max_partition_size,
+ pc_tree->index, bsize)) {
+ // Encode the smaller blocks in DRY_RUN mode.
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ // If the tree still exists (non-superblock), dealloc most nodes, only keep
+ // nodes for the best partition and PARTITION_NONE.
+ if (pc_tree_dealloc == 0)
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1,
+ cpi->sf.part_sf.partition_search_type);
+
+ if (bsize == cm->seq_params->sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);  // Non-superblock searches must not move *tp.
+ }
+
+ // Restore the rd multiplier.
+ x->rdmult = orig_rdmult;
+ return part_search_state.found_best_partition;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef COLLECT_MOTION_SEARCH_FEATURE_SB
+
+#if CONFIG_RT_ML_PARTITIONING
+#define FEATURES 6
+#define LABELS 2
+// Runs the variance-based partition network for a square block and maps the
+// first output score to a partition decision.
+//
+// The six network inputs are: the log-scaled squared DC quantizer, the
+// log-scaled variance of the source against x->est_pred for the whole block,
+// and, for each of the four quadrants, the ratio of the quadrant variance to
+// the whole-block variance. Every input is normalized with the mean/variance
+// table that belongs to the selected block size.
+//
+// Returns PARTITION_SPLIT when the score is above the threshold,
+// PARTITION_NONE when it is below the negated threshold, and -1 when the score
+// falls in between or when bsize is not 64x64, 32x32 or 16x16.
+static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, int mi_row,
+                                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const NN_CONFIG *model = NULL;
+  const float *feat_mean = NULL;
+  const float *feat_var = NULL;
+
+  // Select the model and its normalization tables for this block size.
+  if (bsize == BLOCK_64X64) {
+    model = &av1_var_part_nnconfig_64;
+    feat_mean = av1_var_part_means_64;
+    feat_var = av1_var_part_vars_64;
+  } else if (bsize == BLOCK_32X32) {
+    model = &av1_var_part_nnconfig_32;
+    feat_mean = av1_var_part_means_32;
+    feat_var = av1_var_part_vars_32;
+  } else if (bsize == BLOCK_16X16) {
+    model = &av1_var_part_nnconfig_16;
+    feat_mean = av1_var_part_means_16;
+    feat_var = av1_var_part_vars_16;
+  } else {
+    assert(0 && "Unexpected block size.");
+    return -1;
+  }
+  if (model == NULL) return -1;
+
+  // Scores with magnitude at or below this threshold give no decision.
+  const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+  float features[FEATURES] = { 0.0f };
+  float score[LABELS];
+  int num_features = 0;
+
+  // Feature 0: quantizer strength.
+  const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                    cm->seq_params->bit_depth);
+  features[num_features] =
+      (log1pf((float)(dc_q * dc_q) / 256.0f) - feat_mean[num_features]) /
+      sqrtf(feat_var[num_features]);
+  num_features++;
+
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+
+  const int pred_stride = 64;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  // Locate the block inside the estimated-prediction buffer from the low four
+  // bits of its mi position.
+  const int sb_offset_row = 4 * (mi_row & 15);
+  const int sb_offset_col = 4 * (mi_col & 15);
+  const uint8_t *const pred =
+      x->est_pred + sb_offset_row * pred_stride + sb_offset_col;
+  const int half_width = block_size_wide[bsize] / 2;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  unsigned int sse;
+
+  // Feature 1: variance of the whole block.
+  const unsigned int var =
+      cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+  const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+  features[num_features] =
+      (log1pf((float)var) - feat_mean[num_features]) /
+      sqrtf(feat_var[num_features]);
+  num_features++;
+
+  // Features 2..5: quadrant variance relative to the whole-block variance,
+  // visited in raster order (top-left, top-right, bottom-left, bottom-right).
+  for (int quad = 0; quad < 4; ++quad) {
+    const int x_idx = (quad & 1) ? half_width : 0;
+    const int y_idx = (quad >> 1) ? half_width : 0;
+    const uint8_t *const quad_src = src + (y_idx * src_stride + x_idx);
+    const uint8_t *const quad_pred = pred + (y_idx * pred_stride + x_idx);
+    const unsigned int sub_var = cpi->ppi->fn_ptr[subsize].vf(
+        quad_src, src_stride, quad_pred, pred_stride, &sse);
+    const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+    features[num_features] =
+        (var_ratio - feat_mean[num_features]) / sqrtf(feat_var[num_features]);
+    num_features++;
+  }
+
+  assert(num_features == FEATURES);
+  av1_nn_predict(features, model, 1, score);
+
+  if (score[0] > thresh) return PARTITION_SPLIT;
+  if (score[0] < -thresh) return PARTITION_NONE;
+  return -1;
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+// Appends one training sample for the ML partition model to a per-block-size
+// text file ("data_64x64.txt", "data_32x32.txt", "data_16x16.txt" or
+// "data_8x8.txt"), opened in append mode.
+//
+// A sample is one comma-separated line holding: the log-scaled squared DC
+// quantizer, the log-scaled variance of the source against x->est_pred for the
+// whole block, the four quadrant/whole-block variance ratios, and the label
+// (0 when part is PARTITION_NONE, 1 otherwise).
+//
+// Always returns -1. Nothing is written when bsize is unsupported or when the
+// output file cannot be opened.
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int mi_row, int mi_col, PARTITION_TYPE part) {
+  AV1_COMMON *const cm = &cpi->common;
+  const char *fname = NULL;
+  switch (bsize) {
+    case BLOCK_64X64: fname = "data_64x64.txt"; break;
+    case BLOCK_32X32: fname = "data_32x32.txt"; break;
+    case BLOCK_16X16: fname = "data_16x16.txt"; break;
+    case BLOCK_8X8: fname = "data_8x8.txt"; break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  FILE *f = fopen(fname, "a");
+  // Data collection is best effort: drop the sample instead of passing a NULL
+  // stream to fprintf()/fclose() when the file cannot be opened.
+  if (f == NULL) return -1;
+
+  const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                    cm->seq_params->bit_depth);
+  const float log_q = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+
+  const int bs = block_size_wide[bsize];
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  const int sb_offset_row = 4 * (mi_row & 15);
+  const int sb_offset_col = 4 * (mi_col & 15);
+  const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+  const uint8_t *src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const int pred_stride = 64;
+  unsigned int sse;
+
+  // Variance of whole block. The variance function table is reached through
+  // cpi->ppi, the same way ml_predict_var_partitioning() reaches it.
+  const unsigned int var =
+      cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+  const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+  const float log_var = log1pf((float)var);
+
+  fprintf(f, "%f,%f,", log_q, log_var);
+  for (int i = 0; i < 4; ++i) {
+    const int x_idx = (i & 1) * bs / 2;
+    const int y_idx = (i >> 1) * bs / 2;
+    const int src_offset = y_idx * src_stride + x_idx;
+    const int pred_offset = y_idx * pred_stride + x_idx;
+    // Variance of quarter block.
+    const unsigned int sub_var = cpi->ppi->fn_ptr[subsize].vf(
+        src + src_offset, src_stride, pred + pred_offset, pred_stride, &sse);
+    const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+    fprintf(f, "%f,", var_ratio);
+  }
+
+  fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1);
+
+  fclose(f);
+  return -1;
+}
+#endif
+
+// Makes every mode-info slot covered by the block point at the block's
+// top-left mode info (xd->mi[0]). The covered area is clipped to the frame
+// dimensions stored in cm->mi_params.
+static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize) {
+  const int cols_left = cm->mi_params.mi_cols - mi_col;
+  const int rows_left = cm->mi_params.mi_rows - mi_row;
+  const int num_cols = AOMMIN(mi_size_wide[bsize], cols_left);
+  const int num_rows = AOMMIN(mi_size_high[bsize], rows_left);
+  MB_MODE_INFO *const shared_mi = xd->mi[0];
+
+  for (int row = 0; row < num_rows; ++row) {
+    const int row_start = row * xd->mi_stride;
+    for (int col = 0; col < num_cols; ++col) {
+      xd->mi[row_start + col] = shared_mi;
+    }
+  }
+}
+
+// Expands the compact per-block extension info in mbmi_ext_best back into the
+// ref_frame_type entries of mbmi_ext: the reference MV stack, its weights, the
+// mode context and the reference MV count. The global MV table is copied as a
+// whole.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+    MB_MODE_INFO_EXT *const mbmi_ext,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) {
+  // sizeof() does not evaluate its operand, so the index only selects the
+  // element type: these are the byte sizes of one entry of each table.
+  const size_t mv_stack_bytes = sizeof(mbmi_ext->ref_mv_stack[ref_frame_type]);
+  const size_t weight_bytes = sizeof(mbmi_ext->weight[ref_frame_type]);
+  const size_t global_mvs_bytes = sizeof(mbmi_ext->global_mvs);
+
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+         mv_stack_bytes);
+  memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+         weight_bytes);
+  mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+  mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+  memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, global_mvs_bytes);
+}
+
+// Walks the partition tree and writes the chosen mode info back into the
+// frame's mode-info array.
+//
+// For a PARTITION_NONE node the stored mode info and extension info of
+// pc_tree->none are copied to the block position and replicated over the
+// block. For a PARTITION_SPLIT node the four children are visited in raster
+// order. Any other partition type is left untouched, as are blocks whose
+// top-left corner lies outside the frame.
+static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int half_mi = mi_size_wide[bsize] >> 1;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  assert(bsize >= BLOCK_8X8);
+
+  const int outside_frame =
+      mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols;
+  if (outside_frame) return;
+
+  if (partition == PARTITION_NONE) {
+    PICK_MODE_CONTEXT *const none_ctx = pc_tree->none;
+    set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row,
+                          mi_col);
+    *(xd->mi[0]) = none_ctx->mic;
+    copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &none_ctx->mbmi_ext_best,
+                                    LAST_FRAME);
+    duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+  } else if (partition == PARTITION_SPLIT) {
+    for (int quad = 0; quad < 4; ++quad) {
+      const int row_off = (quad >> 1) ? half_mi : 0;
+      const int col_off = (quad & 1) ? half_mi : 0;
+      fill_mode_info_sb(cpi, x, mi_row + row_off, mi_col + col_off, subsize,
+                        pc_tree->split[quad]);
+    }
+  }
+}
+
+// Recursively chooses between PARTITION_NONE and PARTITION_SPLIT for the
+// square block at (mi_row, mi_col) using the non-RD mode search, optionally
+// guided by ml_predict_var_partitioning().
+//
+// rd_cost:  receives the stats of the best candidate found, or invalid stats
+//           when no candidate produced a valid rate.
+// do_recon: when non-zero, the chosen partition is encoded with encode_sb()
+//           (OUTPUT_ENABLED for a full superblock, DRY_RUN_NORMAL otherwise).
+// best_rd:  initial cost bound; a candidate is only accepted when its rdcost
+//           is lower.
+// pc_tree:  node for this block; its none/split children are allocated here
+//           and its partitioning field records the winner.
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Half of the block width, in mode-info units.
+ const int hbs = mi_size_wide[bsize] >> 1;
+ TokenExtra *tp_orig = *tp;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS this_rdc, best_rdc;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ // Splitting is only considered for blocks larger than 8x8.
+ int do_split = bsize > BLOCK_8X8;
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows);
+ const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols);
+
+ // PARTITION_NONE is only tried when the block lies fully inside the frame.
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only
+ assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far
+
+ (void)*tp_orig;
+
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+#ifndef _COLLECT_GROUND_TRUTH_
+ // Let the ML model prune one of the two candidates when both are possible.
+ // A return value of -1 leaves both enabled.
+ if (partition_none_allowed && do_split) {
+ const int ml_predicted_partition =
+ ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
+ if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+ if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+ }
+#endif
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ // Snapshot the context; it is restored after both candidates were evaluated.
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ PICK_MODE_CONTEXT *ctx = pc_tree->none;
+
+// Flip for RDO based pick mode
+#if 0
+ RD_STATS dummy;
+ av1_invalid_rd_stats(&dummy);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx, dummy);
+#else
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+ ctx);
+#endif
+ // A rate of INT_MAX marks an invalid result; otherwise add the cost of
+ // signalling PARTITION_NONE and compare against the current best.
+ if (this_rdc.rate != INT_MAX) {
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+ this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ }
+ }
+ }
+
+ // PARTITION_SPLIT
+ if (do_split) {
+ RD_STATS sum_rdc;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ av1_init_rd_stats(&sum_rdc);
+
+ // Allocate all four child nodes up front and record each child's index.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ // Visit the children in raster order; stop as soon as the accumulated cost
+ // is no longer below the best cost found so far.
+ for (int i = 0;
+ i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+
+ // Children that start outside the frame are skipped.
+ if (mi_row + y_idx >= cm->mi_params.mi_rows ||
+ mi_col + x_idx >= cm->mi_params.mi_cols)
+ continue;
+ // The child is given the remaining cost budget as its bound, and
+ // do_recon is set for every child except the last one (i < 3).
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, i < 3,
+ best_rdc.rdcost - sum_rdc.rdcost,
+ pc_tree->split[i]);
+
+ if (this_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(&sum_rdc);
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ }
+
+#ifdef _COLLECT_GROUND_TRUTH_
+ store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning);
+#endif
+
+ *rd_cost = best_rdc;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // No valid candidate: report invalid stats and leave the mode info alone.
+ if (best_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ // update mode info array
+ fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ // Debug-only checks: a reconstructed 64x64 block must have valid stats; in
+ // every other case the token pointer is expected to be unchanged.
+ if (bsize == BLOCK_64X64 && do_recon) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
diff --git a/third_party/aom/av1/encoder/partition_search.h b/third_party/aom/av1/encoder/partition_search.h
new file mode 100644
index 0000000000..1b5d71b7da
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree);
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree);
+#if CONFIG_RT_ML_PARTITIONING
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree);
+#endif
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *best_rd_cost);
+#endif
+
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info);
+
+// Stores the given luma and chroma values in the PLANE_TYPE_Y and
+// PLANE_TYPE_UV entries of cb_offset.
+static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset,
+                                      const uint16_t cb_offset_y,
+                                      const uint16_t cb_offset_uv) {
+  uint16_t *const luma_slot = &cb_offset[PLANE_TYPE_Y];
+  *luma_slot = cb_offset_y;
+
+  uint16_t *const chroma_slot = &cb_offset[PLANE_TYPE_UV];
+  *chroma_slot = cb_offset_uv;
+}
+
+// Advances x->cb_offset by the number of samples in the block just processed.
+// The luma entry always grows by width * height of bsize; the chroma entry
+// grows by the size of the subsampled plane block, and only when the block is
+// a chroma reference.
+static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
+                                         const int subsampling_x,
+                                         const int subsampling_y) {
+  const int luma_samples = block_size_wide[bsize] * block_size_high[bsize];
+  x->cb_offset[PLANE_TYPE_Y] += luma_samples;
+
+  if (!x->e_mbd.is_chroma_ref) return;
+
+  const BLOCK_SIZE uv_bsize =
+      get_plane_block_size(bsize, subsampling_x, subsampling_y);
+  assert(uv_bsize != BLOCK_INVALID);
+  const int chroma_samples =
+      block_size_wide[uv_bsize] * block_size_high[uv_bsize];
+  x->cb_offset[PLANE_TYPE_UV] += chroma_samples;
+}
+
+#endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/partition_strategy.c b/third_party/aom/av1/encoder/partition_strategy.c
new file mode 100644
index 0000000000..ce06313579
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.c
@@ -0,0 +1,2573 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconinter.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+#include "av1/encoder/encoder.h"
+
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rdopt.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+ int features_to_get);
+
+static bool ext_ml_model_decision_before_none(
+ AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+ int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split);
+
+static bool ext_ml_model_decision_before_none_part2(
+ AV1_COMP *cpi,
+ const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+ int *prune_horz, int *prune_vert);
+
+static bool ext_ml_model_decision_after_none(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_none, int *do_square_split,
+ int *do_rectangular_split);
+
+static bool ext_ml_model_decision_after_none_part2(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split_part2(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_prune, int *prune_rect_part_horz,
+ int *prune_rect_part_vert);
+
+static bool ext_ml_model_decision_after_rect(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_rect, int *horza_partition_allowed,
+ int *horzb_partition_allowed, int *verta_partition_allowed,
+ int *vertb_partition_allowed);
+
+static bool ext_ml_model_decision_after_part_ab(
+ AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed, unsigned int pb_source_variance,
+ int mi_row, int mi_col);
+
+// Maps a square block size to its index in the per-block-size model tables:
+// 128x128 -> 0, 64x64 -> 1, 32x32 -> 2, 16x16 -> 3, 8x8 -> 4.
+// Any other size trips an assertion and yields -1.
+static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_128X128) return 0;
+  if (bsize == BLOCK_64X64) return 1;
+  if (bsize == BLOCK_32X32) return 2;
+  if (bsize == BLOCK_16X16) return 3;
+  if (bsize == BLOCK_8X8) return 4;
+
+  assert(0 && "Invalid bsize");
+  return -1;
+}
+
+// Returns the base name of the feature dump file for the given feature set
+// id. The id is used directly as an index into the table of eight names and
+// is not range checked.
+static char *get_feature_file_name(int id) {
+  static char *names_by_id[] = {
+    "feature_before_partition_none",
+    "feature_before_partition_none_prune_rect",
+    "feature_after_partition_none_prune",
+    "feature_after_partition_none_terminate",
+    "feature_after_partition_split_terminate",
+    "feature_after_partition_split_prune_rect",
+    "feature_after_partition_rect",
+    "feature_after_partition_ab",
+  };
+  char *const name = names_by_id[id];
+
+  return name;
+}
+
+// Appends one feature vector to the file "<path>/<name>", where <name> comes
+// from get_feature_file_name(id).
+//
+// Nothing happens unless WRITE_FEATURE_TO_FILE is set or is_test_mode is true,
+// and a file that cannot be opened is skipped. Outside test mode a header line
+// "id,bsize,mi_row,mi_col,feature_size" precedes the values. The values are
+// written on one line, comma separated, with six decimals.
+static void write_features_to_file(const char *const path,
+                                   const bool is_test_mode,
+                                   const float *features,
+                                   const int feature_size, const int id,
+                                   const BLOCK_SIZE bsize, const int mi_row,
+                                   const int mi_col) {
+  const bool dump_enabled = WRITE_FEATURE_TO_FILE || is_test_mode;
+  if (!dump_enabled) return;
+
+  char out_path[256];
+  snprintf(out_path, sizeof(out_path), "%s/%s", path,
+           get_feature_file_name(id));
+  FILE *const out = fopen(out_path, "a");
+  if (out == NULL) return;
+
+  if (!is_test_mode) {
+    fprintf(out, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col,
+            feature_size);
+  }
+
+  // Emit the separator before every value except the first.
+  for (int i = 0; i < feature_size; ++i) {
+    if (i > 0) fputc(',', out);
+    fprintf(out, "%.6f", features[i]);
+  }
+  fputc('\n', out);
+
+  fclose(out);
+}
+
+// TODO(chiyotsai@google.com): This is very much a work in progress. We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+//
+// Uses a CNN followed by a per-block-size DNN to decide whether the current
+// square block should be split, and prunes the partition search in part_state
+// accordingly.
+//
+// The CNN is evaluated once, when the 64x64 block is visited, and its outputs
+// are cached in x->part_search_info for the smaller blocks. quad_tree_idx is
+// used to look up the position of the current block inside the cached outputs.
+// Nothing is done for 128x128 blocks or when no valid CNN output is cached.
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int quad_tree_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state) {
+ assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
+ "Invalid sb_size for intra_cnn!");
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+
+ // There is no model for 128x128 blocks.
+ if (bsize == BLOCK_128X128) {
+ return;
+ }
+
+ PartitionSearchInfo *part_info = &x->part_search_info;
+
+ // Precompute the CNN part and cache the result in MACROBLOCK
+ if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
+ const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+ // Prepare the output
+ const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+ const int num_outputs = 4;
+ const int output_dims[4] = { 1, 2, 4, 8 };
+ const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+ CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+ float *output_buffer[CNN_TOT_OUT_CH];
+
+ // Carve part_info->cnn_buffer into consecutive channel planes: each branch
+ // contributes out_chs[] planes of output_dims[] x output_dims[] floats.
+ float **cur_output_buf = output_buffer;
+ float *curr_buf_ptr = part_info->cnn_buffer;
+ for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+ const int num_chs = out_chs[output_idx];
+ const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+ for (int ch = 0; ch < num_chs; ch++) {
+ cur_output_buf[ch] = curr_buf_ptr;
+ curr_buf_ptr += ch_size;
+ }
+ cur_output_buf += num_chs;
+ }
+
+ CNN_MULTI_OUT output = {
+ .num_outputs = 4,
+ .output_channels = out_chs,
+ .output_strides = output_dims,
+ .output_buffer = output_buffer,
+ };
+
+ // Prepare the input
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bit_depth = xd->bd;
+ // The DC quantizer is shifted down by (bit_depth - 8) before being turned
+ // into the normalized log_q feature.
+ const int dc_q =
+ av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+ part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f);
+ part_info->log_q =
+ (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+ av1_intra_mode_cnn_partition_std[0];
+
+ // The CNN input is a 65x65 luma window that starts one row above and one
+ // column to the left of the block's source pointer.
+ const int width = 65, height = 65,
+ stride = x->plane[AOM_PLANE_Y].src.stride;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *image[1] = {
+ CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+ };
+
+ if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+ cnn_config, &thread_data,
+ bit_depth, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ } else {
+ uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+ if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
+ cnn_config, &thread_data, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ }
+
+ part_info->cnn_output_valid = 1;
+ }
+
+ if (!part_info->cnn_output_valid) {
+ return;
+ }
+
+ // Indexed by bsize_idx; entry 0 (128x128) has no model.
+ const NN_CONFIG *dnn_configs[5] = {
+ NULL,
+ &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+ };
+
+ const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+ float dnn_features[100];
+ float logits[4] = { 0.0f };
+
+ // The four branch outputs are stored back to back in cnn_buffer.
+ const float *branch_0 = part_info->cnn_buffer;
+ const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+ const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+ const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
+ // Assemble the DNN input for this block size. Every variant ends with the
+ // normalized log_q feature.
+ if (bsize == BLOCK_64X64) {
+ // All branch 0 channels, then all branch 1 channels at each of the 2x2
+ // positions.
+ int f_idx = 0;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_0[ch_idx];
+ }
+
+ const int spa_stride = 2 * 2;
+ for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+ }
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_32X32) {
+ // All branch 0 channels, then the branch 1 channels at this block's
+ // position.
+ int f_idx = 0;
+ for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+ dnn_features[f_idx++] = branch_0[idx];
+ }
+
+ const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+ const int spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_16X16) {
+ // Branch 1 channels at the parent's position, then branch 2 channels at
+ // this block's position.
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+ const int prev_spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+ const int spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_8X8) {
+ // Branch 2 channels at the parent's position, then branch 3 channels at
+ // this block's position.
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+ const int prev_spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+ const int spa_stride = 8 * 8;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else {
+ assert(0 && "Invalid bsize in intra_cnn partition");
+ }
+
+ // Make decision
+ av1_nn_predict(dnn_features, dnn_config, 1, logits);
+
+ // Pick the threshold tables by resolution class, based on the smaller frame
+ // dimension.
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+ if (is_720p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+ } else if (is_480p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+ } else {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+ }
+
+ // High score: force the square split and drop the rectangular partitions.
+ if (logits[0] > split_only_thresh) {
+ // As screen contents tend to choose larger partitions, do not prune
+ // PARTITION_NONE when intra_cnn_based_part_prune_level=1.
+ if (intra_cnn_based_part_prune_level != 1) {
+ part_state->partition_none_allowed = 0;
+ }
+ part_state->do_square_split = 1;
+ av1_disable_rect_partitions(part_state);
+ }
+
+ // Low score: do not try the square split.
+ if (logits[0] < no_split_thresh) {
+ av1_disable_square_split_partition(part_state);
+ }
+}
+
+// Translates a simple-motion-search prune level into the aggressiveness index
+// used to select pruning thresholds.
+//
+// Returns -1 for NO_PRUNING. The fixed levels below TOTAL_SIMPLE_AGG_LVLS map
+// through a small table. The remaining, qindex-based level yields 2 for
+// rectangular partitions when qindex <= 90 and 1 in every other case.
+static INLINE int get_simple_motion_search_prune_agg(int qindex,
+                                                     int prune_level,
+                                                     int is_rect_part) {
+  assert(prune_level < TOTAL_AGG_LVLS);
+  if (prune_level == NO_PRUNING) return -1;
+
+  // Fixed aggressiveness levels (everything except QIDX_BASED_AGG_LVL).
+  if (prune_level < TOTAL_SIMPLE_AGG_LVLS) {
+    const int simple_agg[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 };
+    return simple_agg[prune_level];
+  }
+
+  // QIDX_BASED_AGG_LVL: prune rectangular partitions more aggressively at
+  // lower quantizers.
+  const int qindex_based_agg[2] = { 1, 2 };
+  const int low_q_rect = is_rect_part && qindex <= 90;
+  return qindex_based_agg[low_q_rect];
+}
+
+// Uses simple-motion-search features and a small neural network to decide
+// whether the current block should only be split, should not be split, or
+// should skip rectangular splits, and records the decision in part_state.
+//
+// The raw features are first written to the feature file and offered to the
+// external partition model; if that model makes the decision, the built-in
+// network is not run. Nothing is done when the configured prune level maps to
+// a negative aggressiveness.
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+         "Invalid bsize in simple_motion_search_based_split");
+
+  // res_idx is 0 below 480p, 1 from 480p up to 720p, and 2 for 720p and above,
+  // judged by the smaller frame dimension.
+  const int min_dim = AOMMIN(cm->width, cm->height);
+  const int res_idx = (min_dim >= 480) + (min_dim >= 720);
+
+  const int agg = get_simple_motion_search_prune_agg(
+      x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0);
+  if (agg < 0) return;
+
+  float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_SPLIT_MODEL_FLAG);
+
+  // Dump the raw (not yet normalized) features.
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col);
+
+  // The external model intentionally receives the raw features as well, so
+  // that everything collected and passed to it stays consistent.
+  const bool decided_externally = ext_ml_model_decision_before_none(
+      cpi, features, &part_state->partition_none_allowed,
+      &part_state->partition_rect_allowed[HORZ],
+      &part_state->partition_rect_allowed[VERT],
+      &part_state->do_rectangular_split, &part_state->do_square_split);
+  if (decided_externally) return;
+
+  // Normalize the features for the built-in model.
+  const float *const ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+  const float *const ml_std = av1_simple_motion_search_split_std[bsize_idx];
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+    features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+  }
+
+  const NN_CONFIG *const nn_config =
+      av1_simple_motion_search_split_nn_config[bsize_idx];
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+
+  const float split_only_thresh =
+      av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+  if (score > split_only_thresh) {
+    av1_set_square_split_only(part_state);
+  }
+
+  if (cpi->sf.part_sf.simple_motion_search_split >= 2) {
+    const float no_split_thresh =
+        av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+    if (score < no_split_thresh) {
+      av1_disable_square_split_partition(part_state);
+    }
+  }
+
+  // If the score is very low, prune rectangular split since it is unlikely to
+  // occur.
+  const int rect_split_level = cpi->sf.part_sf.simple_motion_search_rect_split;
+  if (rect_split_level) {
+    const float scale = res_idx >= 2 ? 3.0f : 2.0f;
+    const float rect_split_thresh =
+        scale * av1_simple_motion_search_no_split_thresh[rect_split_level]
+                                                         [res_idx][bsize_idx];
+    if (score < rect_split_thresh) {
+      part_state->do_rectangular_split = 0;
+    }
+  }
+}
+
+// Runs simple_motion_search on the block at (mi_row, mi_col) against every
+// frame listed in refs[0 .. num_refs) and picks the one with the lowest sse.
+//
+// Returns the winning reference frame, or -1 when the block origin lies outside
+// the frame or none of the listed references is enabled in
+// cpi->ref_frame_flags. The winner's sse and variance are written to *best_sse
+// and *best_var. A block outside the frame reports 0 for both. If no listed
+// reference is enabled, *best_sse is left at INT_MAX and *best_var is not
+// written.
+//
+// With save_mv == 0 the start mvs cached in sms_tree are left untouched. With
+// save_mv != 0 the result of every searched reference (divided by 8) is stored
+// in sms_tree->start_mvs and, for blocks of at least 8x8, copied to the four
+// split children as well.
+static int simple_motion_search_get_best_ref(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+    int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
+    unsigned int *best_var) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int outside_frame =
+      mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows;
+
+  if (outside_frame) {
+    // Nothing to search: the whole block is outside of the image.
+    *best_var = 0;
+    *best_sse = 0;
+    return -1;
+  }
+
+  int winner = -1;
+  *best_sse = INT_MAX;
+
+  for (int i = 0; i < num_refs; ++i) {
+    const int ref = refs[i];
+
+    // Skip references that are not enabled for this frame.
+    if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref])) continue;
+
+    unsigned int this_sse = 0, this_var = 0;
+    const int_mv found_mv = av1_simple_motion_search_sse_var(
+        cpi, x, mi_row, mi_col, bsize, ref, sms_tree->start_mvs[ref],
+        /*num_planes=*/1, use_subpixel, &this_sse, &this_var);
+
+    if (this_sse < *best_sse) {
+      *best_sse = this_sse;
+      *best_var = this_var;
+      winner = ref;
+    }
+
+    if (!save_mv) continue;
+
+    // Remember the result as the starting point for later searches.
+    FULLPEL_MV *const cached_mv = &sms_tree->start_mvs[ref];
+    cached_mv->row = found_mv.as_mv.row / 8;
+    cached_mv->col = found_mv.as_mv.col / 8;
+
+    if (bsize < BLOCK_8X8) continue;
+
+    // Hand the refreshed starting point down to the four split children.
+    for (int child = 0; child < SUB_PARTITIONS_SPLIT; ++child) {
+      sms_tree->split[child]->start_mvs[ref] = *cached_mv;
+    }
+  }
+
+  return winner;
+}
+
+// Gathers simple_motion_search statistics (sse and var) for the square block
+// at (mi_row, mi_col) and caches them in sms_tree and its split children.
+// features_to_get selects which sub-blocks are searched: the whole block, the
+// four split quadrants and/or the two horizontal plus two vertical halves.
+// Cached results are reused whenever the corresponding valid flag is set.
+//
+// If features is NULL only the cache is updated. Otherwise log1p() of every
+// selected sse/var pair is written to features, followed by 7 more entries:
+// - log1p(dc_q ** 2 / 256)
+// - whether an above macroblock exists
+// - mi_size_wide_log2 of the above macroblock
+// - mi_size_high_log2 of the above macroblock
+// - whether a left macroblock exists
+// - mi_size_wide_log2 of the left macroblock
+// - mi_size_high_log2 of the left macroblock
+// When a neighbor is missing, the current bsize stands in for its size.
+static AOM_INLINE void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+    int features_to_get) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(bsize >= BLOCK_8X8);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  const int half_w_mi = mi_size_wide[bsize] / 2;
+  const int half_h_mi = mi_size_high[bsize] / 2;
+  const int want_none = (features_to_get & FEATURE_SMS_NONE_FLAG) != 0;
+  const int want_split = (features_to_get & FEATURE_SMS_SPLIT_FLAG) != 0;
+  const int want_rect = (features_to_get & FEATURE_SMS_RECT_FLAG) != 0;
+
+  // Motion search setup: a single reference, with sub-pixel refinement.
+  const int search_ref =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  const int *const ref_list = &search_ref;
+  const int num_refs = 1;
+  const int use_subpixel = 1;
+
+  // Whole block goes first so that its mv seeds the sub-block searches.
+  if (want_none && !sms_tree->sms_none_valid) {
+    simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
+                                      ref_list, num_refs, use_subpixel,
+                                      /*save_mv=*/1,
+                                      &sms_tree->sms_none_feat[0],
+                                      &sms_tree->sms_none_feat[1]);
+    sms_tree->sms_none_valid = 1;
+  }
+
+  // Four split quadrants, each cached in its own child node.
+  if (want_split) {
+    const BLOCK_SIZE quad_size = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+      SIMPLE_MOTION_DATA_TREE *const child = sms_tree->split[quad];
+      if (child->sms_none_valid) continue;
+
+      const int quad_mi_row = mi_row + (quad >> 1) * half_h_mi;
+      const int quad_mi_col = mi_col + (quad & 1) * half_w_mi;
+      simple_motion_search_get_best_ref(
+          cpi, x, child, quad_mi_row, quad_mi_col, quad_size, ref_list,
+          num_refs, use_subpixel, /*save_mv=*/1, &child->sms_none_feat[0],
+          &child->sms_none_feat[1]);
+      child->sms_none_valid = 1;
+    }
+  }
+
+  // Rectangular halves: slots 0..3 hold the horizontal pair, 4..7 the
+  // vertical pair. These searches do not update the cached mvs.
+  if (want_rect && !sms_tree->sms_rect_valid) {
+    const BLOCK_SIZE horz_size = get_partition_subsize(bsize, PARTITION_HORZ);
+    for (int half = 0; half < SUB_PARTITIONS_RECT; ++half) {
+      unsigned int *const slot = &sms_tree->sms_rect_feat[2 * half];
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_tree, mi_row + half * half_h_mi, mi_col, horz_size,
+          ref_list, num_refs, use_subpixel, /*save_mv=*/0, slot, slot + 1);
+    }
+
+    const BLOCK_SIZE vert_size = get_partition_subsize(bsize, PARTITION_VERT);
+    for (int half = 0; half < SUB_PARTITIONS_RECT; ++half) {
+      unsigned int *const slot = &sms_tree->sms_rect_feat[4 + 2 * half];
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_tree, mi_row, mi_col + half * half_w_mi, vert_size,
+          ref_list, num_refs, use_subpixel, /*save_mv=*/0, slot, slot + 1);
+    }
+    sms_tree->sms_rect_valid = 1;
+  }
+
+  // Cache-only call: nothing to export.
+  if (!features) return;
+
+  int out = 0;
+  if (want_none) {
+    features[out++] = log1pf((float)sms_tree->sms_none_feat[0]);
+    features[out++] = log1pf((float)sms_tree->sms_none_feat[1]);
+  }
+
+  if (want_split) {
+    for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+      const SIMPLE_MOTION_DATA_TREE *const child = sms_tree->split[quad];
+      for (int k = 0; k < 2; ++k) {
+        features[out++] = log1pf((float)child->sms_none_feat[k]);
+      }
+    }
+  }
+
+  if (want_rect) {
+    for (int k = 0; k < 8; ++k) {
+      features[out++] = log1pf((float)sms_tree->sms_rect_feat[k]);
+    }
+  }
+
+  // Refresh the block offsets before the neighbor pointers are read.
+  const MACROBLOCKD *xd = &x->e_mbd;
+  set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+  // Quantizer feature, with the dc quantizer shifted down to 8-bit range.
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  features[out++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+  // Neighbor context.
+  const int has_above = !!xd->above_mbmi;
+  const int has_left = !!xd->left_mbmi;
+  const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+  const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
+
+  features[out++] = (float)has_above;
+  features[out++] = (float)mi_size_wide_log2[above_bsize];
+  features[out++] = (float)mi_size_high_log2[above_bsize];
+  features[out++] = (float)has_left;
+  features[out++] = (float)mi_size_wide_log2[left_bsize];
+  features[out++] = (float)mi_size_high_log2[left_bsize];
+}
+
+// Uses simple_motion_search features and a per-block-size neural net to decide
+// whether the HORZ and/or VERT rectangular partitions can be pruned for the
+// block described by part_state. Sets part_state->prune_rect_part[] entries to
+// 1 when the predicted probability of that partition is at or below the
+// configured threshold. An external partition model, when it makes the
+// decision, takes precedence over the built-in one.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  // Resolution bucket: 0 below 480p, 1 from 480p up to 720p, 2 for 720p+.
+  const int short_side = AOMMIN(cm->width, cm->height);
+  const int res_idx = (short_side >= 480) + (short_side >= 720);
+
+  // Model parameters for this block size.
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const NN_CONFIG *const nn_config =
+      av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+  const float *const ml_mean =
+      av1_simple_motion_search_prune_rect_mean[bsize_idx];
+  const float *const ml_std =
+      av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+  const int agg = get_simple_motion_search_prune_agg(
+      x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1);
+  if (agg < 0) return;
+
+  const float prune_thresh =
+      av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+  // Without a model or a usable threshold there is nothing to decide.
+  if (!nn_config || prune_thresh == 0.0f) return;
+
+  float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // The features are deliberately left un-normalized at this point so that
+  // everything dumped to file or handed to the external model is consistent.
+  const int rect_allowed = part_state->partition_rect_allowed[HORZ] ||
+                           part_state->partition_rect_allowed[VERT];
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) && rect_allowed && bsize >= BLOCK_8X8 &&
+      !av1_superres_scaled(cm)) {
+    write_features_to_file(
+        cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode,
+        features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col);
+
+    // Stop here if the external model has made the decision.
+    if (ext_ml_model_decision_before_none_part2(
+            cpi, features, &part_state->prune_rect_part[HORZ],
+            &part_state->prune_rect_part[VERT])) {
+      return;
+    }
+  }
+
+  // Normalize for the built-in model.
+  for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) {
+    features[i] = (features[i] - ml_mean[i]) / ml_std[i];
+  }
+
+  // 128x128 and 8x8 blocks use the smaller PARTITION_TYPES class count.
+  const int reduced_classes = bsize == BLOCK_128X128 || bsize == BLOCK_8X8;
+  const int num_classes =
+      reduced_classes ? PARTITION_TYPES : EXT_PARTITION_TYPES;
+
+  float scores[EXT_PARTITION_TYPES] = { 0.0f };
+  float probs[EXT_PARTITION_TYPES] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, scores);
+  av1_nn_softmax(scores, probs, num_classes);
+
+  // Prune each rectangular direction whose probability is low enough.
+  if (probs[PARTITION_HORZ] <= prune_thresh) {
+    part_state->prune_rect_part[HORZ] = 1;
+  }
+  if (probs[PARTITION_VERT] <= prune_thresh) {
+    part_state->prune_rect_part[VERT] = 1;
+  }
+}
+
+// Decides whether the partition search can stop right after PARTITION_NONE,
+// based on simple_motion_search features plus the rate, distortion and rdcost
+// of PARTITION_NONE. Callers are expected to invoke this only when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+// On a positive decision part_state->terminate_partition_search is set to 1.
+void av1_simple_motion_search_early_term_none(
+    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    const RD_STATS *none_rdc, PartitionSearchState *part_state) {
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  // Motion search features first, then the three PARTITION_NONE RD stats.
+  float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+
+  int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
+  features[f_idx++] = log1pf((float)none_rdc->rate);
+  features[f_idx++] = log1pf((float)none_rdc->dist);
+  features[f_idx++] = log1pf((float)none_rdc->rdcost);
+  assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
+
+  // Pick the linear model that matches the block size.
+  const float *ml_mean = NULL;
+  const float *ml_std = NULL;
+  const float *ml_model = NULL;
+  switch (bsize) {
+    case BLOCK_128X128:
+      ml_mean = av1_simple_motion_search_term_none_mean_128;
+      ml_std = av1_simple_motion_search_term_none_std_128;
+      ml_model = av1_simple_motion_search_term_none_model_128;
+      break;
+    case BLOCK_64X64:
+      ml_mean = av1_simple_motion_search_term_none_mean_64;
+      ml_std = av1_simple_motion_search_term_none_std_64;
+      ml_model = av1_simple_motion_search_term_none_model_64;
+      break;
+    case BLOCK_32X32:
+      ml_mean = av1_simple_motion_search_term_none_mean_32;
+      ml_std = av1_simple_motion_search_term_none_std_32;
+      ml_model = av1_simple_motion_search_term_none_model_32;
+      break;
+    case BLOCK_16X16:
+      ml_mean = av1_simple_motion_search_term_none_mean_16;
+      ml_std = av1_simple_motion_search_term_none_std_16;
+      ml_model = av1_simple_motion_search_term_none_model_16;
+      break;
+    default:
+      assert(0 && "Unexpected block size in simple_motion_term_none");
+      break;
+  }
+
+  // Dump the raw features for offline use.
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col);
+
+  // An external model, when it makes the decision, overrides the built-in one.
+  if (ext_ml_model_decision_after_none_part2(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
+  if (!ml_model) return;
+
+  // Linear model over the normalized features; the bias is the entry that
+  // follows the last weight.
+  float score = 0.0f;
+  for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+    score += ml_model[i] * (features[i] - ml_mean[i]) / ml_std[i];
+  }
+  score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
+
+  if (score >= 0.0f) {
+    part_state->terminate_partition_search = 1;
+  }
+}
+
+// Builds the feature vector for the max/min partition size predictor of the
+// superblock at (mi_row, mi_col). A full-pixel, luma-only simple motion search
+// starting from the zero mv is run on every 16x16 block of the superblock; the
+// mean, variance, minimum and maximum of the resulting mv rows, mv columns and
+// log1p(sse) values are written to features together with a quantizer term.
+// features must have room for FEATURE_SIZE_MAX_MIN_PART_PRED entries.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  // Only 128X128 superblocks are handled for now; 64X64 may be added later.
+  assert(sb_size == BLOCK_128X128);
+
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+  // Running statistics over the 16x16 blocks of the superblock.
+  float sum_mv_row = 0, sum_mv_row_sq = 0;
+  float min_abs_mv_row = FLT_MAX, max_abs_mv_row = 0;
+  float sum_mv_col = 0, sum_mv_col_sq = 0;
+  float min_abs_mv_col = FLT_MAX, max_abs_mv_col = 0;
+  float sum_log_sse = 0, sum_log_sse_sq = 0;
+  float min_log_sse = FLT_MAX, max_log_sse = 0;
+
+  const BLOCK_SIZE mb_size = BLOCK_16X16;
+  const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size];
+  const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size];
+  const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size];
+  const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size];
+  const int blks = mb_rows * mb_cols;
+
+  // Visit the blocks in raster order.
+  for (int blk = 0; blk < blks; ++blk) {
+    const int mb_row = blk / mb_cols;
+    const int mb_col = blk % mb_cols;
+    const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2);
+    const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
+
+    const MV_REFERENCE_FRAME ref =
+        cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+    const FULLPEL_MV start_mv = kZeroFullMv;
+    unsigned int sse = 0;
+    unsigned int var = 0;
+    const int_mv best_mv = av1_simple_motion_search_sse_var(
+        cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse,
+        &var);
+
+    const float mv_row = (float)(best_mv.as_mv.row / 8);
+    const float mv_col = (float)(best_mv.as_mv.col / 8);
+    const float abs_mv_row = fabsf(mv_row);
+    const float abs_mv_col = fabsf(mv_col);
+    const float log_sse = log1pf((float)sse);
+
+    sum_mv_row_sq += mv_row * mv_row;
+    sum_mv_row += mv_row;
+    sum_mv_col_sq += mv_col * mv_col;
+    sum_mv_col += mv_col;
+    sum_log_sse_sq += log_sse * log_sse;
+    sum_log_sse += log_sse;
+
+    min_abs_mv_row = abs_mv_row < min_abs_mv_row ? abs_mv_row : min_abs_mv_row;
+    max_abs_mv_row = abs_mv_row > max_abs_mv_row ? abs_mv_row : max_abs_mv_row;
+    min_abs_mv_col = abs_mv_col < min_abs_mv_col ? abs_mv_col : min_abs_mv_col;
+    max_abs_mv_col = abs_mv_col > max_abs_mv_col ? abs_mv_col : max_abs_mv_col;
+    min_log_sse = log_sse < min_log_sse ? log_sse : min_log_sse;
+    max_log_sse = log_sse > max_log_sse ? log_sse : max_log_sse;
+  }
+
+  // Mean and variance (E[v^2] - E[v]^2) of each quantity.
+  const float num_blks = (float)blks;
+  const float avg_mv_row = sum_mv_row / num_blks;
+  const float var_mv_row = sum_mv_row_sq / num_blks - avg_mv_row * avg_mv_row;
+  const float avg_mv_col = sum_mv_col / num_blks;
+  const float var_mv_col = sum_mv_col_sq / num_blks - avg_mv_col * avg_mv_col;
+  const float avg_log_sse = sum_log_sse / num_blks;
+  const float var_log_sse =
+      sum_log_sse_sq / num_blks - avg_log_sse * avg_log_sse;
+
+  // Output order is fixed; presumably it matches what the model was trained
+  // with, so do not reorder.
+  const float ordered[] = {
+    avg_log_sse,    avg_mv_col,     avg_mv_row,  log_q_sq,    max_abs_mv_col,
+    max_abs_mv_row, max_log_sse,    min_abs_mv_col, min_abs_mv_row,
+    min_log_sse,    var_log_sse,    var_mv_col,  var_mv_row,
+  };
+  const int f_idx = (int)(sizeof(ordered) / sizeof(ordered[0]));
+  for (int i = 0; i < f_idx; ++i) {
+    features[i] = ordered[i];
+  }
+
+  assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
+}
+
+// Maps a class index produced by the max partition predictor to a square
+// block size:
+//   idx 0 -> BLOCK_16X16
+//   idx 1 -> BLOCK_32X32
+//   idx 2 -> BLOCK_64X64
+//   idx 3 -> BLOCK_128X128
+// The arithmetic assumes consecutive square sizes sit three enum values apart
+// in BLOCK_SIZE, as implied by the table above.
+static BLOCK_SIZE get_block_size(int idx) {
+  const int bsize_value = (idx + 2) * 3;
+  return (BLOCK_SIZE)bsize_value;
+}
+
+// Predicts the maximum partition block size for a superblock from the given
+// feature vector, according to the speed feature
+// auto_max_partition_based_on_simple_motion:
+// - DIRECT_PRED:  the class with the highest raw score wins.
+// - RELAXED_PRED: the largest class whose cumulative probability (summed from
+//                 the largest class downwards) exceeds 0.2 wins.
+// - ADAPT_PRED:   like RELAXED_PRED, but the cut-off is 0.05 or 0.1 depending
+//                 on the superblock source variance, and the largest class is
+//                 kept when that variance is 16 or less.
+// Any other mode keeps the largest class.
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const float *features) {
+  const int mode = cpi->sf.part_sf.auto_max_partition_based_on_simple_motion;
+  const int last_class = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
+
+  assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+         NOT_IN_USE);
+
+  float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+  av1_nn_predict(features, &av1_max_part_pred_nn_config, 1, scores);
+
+  if (mode == DIRECT_PRED) {
+    // Plain argmax over the raw scores; ties keep the lowest index.
+    int top = 0;
+    for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
+      if (scores[i] > scores[top]) top = i;
+    }
+    return get_block_size(top);
+  }
+
+  float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+  av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+  // Work out whether a cumulative-probability scan is needed and which
+  // cut-off applies. The cut-off is kept in double precision.
+  int do_scan = 0;
+  double min_prob = 0.0;
+  if (mode == RELAXED_PRED) {
+    do_scan = 1;
+    min_prob = 0.2;
+  } else if (mode == ADAPT_PRED) {
+    const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+    // TODO(debargha): x->source_variance is unavailable at this point,
+    // so compute. The redundant recomputation later can be removed.
+    const unsigned int source_variance = av1_get_perpixel_variance_facade(
+        cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y);
+    if (source_variance > 16) {
+      do_scan = 1;
+      min_prob = source_variance < 128 ? 0.05 : 0.1;
+    }
+  }
+
+  int result = last_class;
+  if (do_scan) {
+    // Accumulate probability from the largest class downwards and stop at the
+    // first class where the running total clears the cut-off.
+    while (result >= 0) {
+      if (result < last_class) probs[result] += probs[result + 1];
+      if (probs[result] > min_prob) break;
+      --result;
+    }
+  }
+
+  return get_block_size(result);
+}
+
+// Walks a SIMPLE_MOTION_DATA_TREE and lowers *min_bw / *min_bh to the smallest
+// partition block width / height (log2 scale, from mi_size_*_log2) found in
+// it. The outputs are only ever reduced, so the caller must seed them with an
+// upper bound. A NULL tree or a node with PARTITION_INVALID changes nothing.
+static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                     int *min_bw, int *min_bh) {
+  if (!sms_tree) return;
+
+  const BLOCK_SIZE bsize = sms_tree->block_size;
+  if (bsize == BLOCK_4X4) {
+    // Smallest possible block: nothing below it to look at.
+    *min_bw = 0;
+    *min_bh = 0;
+    return;
+  }
+
+  PARTITION_TYPE leaf_part = sms_tree->partitioning;
+  switch (leaf_part) {
+    case PARTITION_INVALID: return;
+    case PARTITION_SPLIT:
+      // Recurse into every quadrant.
+      for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+        get_min_bsize(sms_tree->split[quad], min_bw, min_bh);
+      }
+      return;
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+      // AB partitions are measured with the PARTITION_SPLIT sub-block size.
+      leaf_part = PARTITION_SPLIT;
+      break;
+    default: break;
+  }
+
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, leaf_part);
+  if (subsize == BLOCK_INVALID) return;
+
+  *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]);
+  *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]);
+}
+
+// Appends two entries to features at *feature_idx and advances the index by
+// two: a 0/1 flag telling whether rd is usable (strictly between 0 and
+// INT64_MAX), followed by rd / best_rd, or 1.0 when rd is not usable.
+static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features,
+                                  int *feature_idx) {
+  int usable = 0;
+  float ratio = 1.0f;
+  if (rd > 0 && rd < INT64_MAX) {
+    usable = 1;
+    ratio = (float)rd / best_rd;
+  }
+
+  const int base = *feature_idx;
+  features[base] = (float)usable;
+  features[base + 1] = ratio;
+  *feature_idx = base + 2;
+}
+
+#define FEATURES 31
+// Decides, after PARTITION_SPLIT has been evaluated, whether the rest of the
+// partition search for this block can be skipped. The decision uses RD cost
+// ratios, the minimum block sizes found under each split quadrant and simple
+// motion search variances. Sets part_state->terminate_partition_search to 1
+// when the model score drops below a block-size and resolution dependent
+// threshold. Block sizes without a model (128x128 and 4x4) return unchanged.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state) {
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  // Nothing to do without a usable best RD or once termination is decided.
+  if (part_state->terminate_partition_search || best_rd <= 0 ||
+      best_rd == INT64_MAX) {
+    return;
+  }
+
+  // Per block size: the model, plus one threshold for 480p and above and one
+  // for smaller frames.
+  const NN_CONFIG *nn_config = NULL;
+  float thresh_480p_up = -1e6;
+  float thresh_below_480p = -1e6;
+  switch (bsize) {
+    case BLOCK_128X128:
+    case BLOCK_4X4: break;
+    case BLOCK_64X64:
+      nn_config = &av1_early_term_after_split_nnconfig_64;
+      thresh_480p_up = -2.0f;
+      thresh_below_480p = -1.2f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_early_term_after_split_nnconfig_32;
+      thresh_480p_up = -2.6f;
+      thresh_below_480p = -2.3f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_early_term_after_split_nnconfig_16;
+      thresh_480p_up = -2.0f;
+      thresh_below_480p = -2.4f;
+      break;
+    case BLOCK_8X8:
+      nn_config = &av1_early_term_after_split_nnconfig_8;
+      thresh_480p_up = -1.0f;
+      thresh_below_480p = -1.4f;
+      break;
+    default:
+      assert(0 && "Invalid block size in av1_ml_early_term_after_split().");
+      break;
+  }
+  if (!nn_config) return;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  float thresh = is_480p_or_larger ? thresh_480p_up : thresh_below_480p;
+
+  // Level 1 is more conservative than level 2 and above.
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  const int bs = block_size_wide[bsize];
+
+  float features[FEATURES] = { 0.0f };
+  int f_idx = 0;
+
+  // Quantizer and per-pixel best RD.
+  features[f_idx++] = log1pf((float)dc_q / 4.0f);
+  features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f);
+
+  // RD ratios for NONE and SPLIT as a whole.
+  add_rd_feature(part_none_rd, best_rd, features, &f_idx);
+  add_rd_feature(part_split_rd, best_rd, features, &f_idx);
+
+  // Per quadrant: RD ratio and the smallest block size chosen beneath it.
+  for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+    int min_bw = MAX_SB_SIZE_LOG2;
+    int min_bh = MAX_SB_SIZE_LOG2;
+    add_rd_feature(split_block_rd[quad], best_rd, features, &f_idx);
+    get_min_bsize(sms_tree->split[quad], &min_bw, &min_bh);
+    features[f_idx++] = (float)min_bw;
+    features[f_idx++] = (float)min_bh;
+  }
+
+  // Make sure the motion search cache is filled; no feature output is
+  // requested here, the cached values are read directly below.
+  simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+                                           bsize, NULL,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Variance of the whole block, then of each split quadrant.
+  features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]);
+  for (int quad = 0; quad < 4; ++quad) {
+    features[f_idx++] = log1pf((float)sms_tree->split[quad]->sms_none_feat[1]);
+  }
+
+  // Variance of each rectangular half (the odd slots of sms_rect_feat).
+  for (int half = 0; half < 4; ++half) {
+    features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[2 * half + 1]);
+  }
+
+  assert(f_idx == FEATURES);
+
+  // Dump the features for offline use.
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         4, bsize, mi_row, mi_col);
+
+  // An external model, when it makes the decision, overrides the built-in one.
+  if (ext_ml_model_decision_after_split(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
+  // The score expresses confidence that the search should NOT terminate.
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+  if (score < thresh) {
+    part_state->terminate_partition_search = 1;
+  }
+}
+#undef FEATURES
+
+// Uses a neural net to decide whether the HORZ and/or VERT rectangular
+// partitions can be skipped for the block described by part_state. The nine
+// input features are five RD cost ratios (NONE and the four split quadrants
+// against best_rd) and four variance ratios (each quadrant against the whole
+// block). Sets part_state->prune_rect_part[] entries to 1 when the predicted
+// probability of that partition is at or below a per-block-size threshold.
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                                 int64_t best_rd, int64_t none_rd,
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state) {
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+  best_rd = AOMMAX(best_rd, 1);
+
+  // Model and pruning threshold for this block size.
+  const NN_CONFIG *nn_config = NULL;
+  float cur_thresh = 0.0f;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_rect_partition_nnconfig_8;
+      cur_thresh = 0.01f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_rect_partition_nnconfig_16;
+      cur_thresh = 0.01f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_rect_partition_nnconfig_32;
+      cur_thresh = 0.004f;
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_rect_partition_nnconfig_64;
+      cur_thresh = 0.002f;
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_rect_partition_nnconfig_128;
+      cur_thresh = 0.002f;
+      break;
+    default: assert(0 && "Unexpected bsize."); break;
+  }
+  if (!nn_config) return;
+
+  // 1. Compute input features
+  float features[9];
+
+  // RD cost ratios; 1.0 stands in for any RD value that is not usable.
+  features[0] = (none_rd > 0 && none_rd < 1000000000)
+                    ? (float)none_rd / (float)best_rd
+                    : 1.0f;
+  for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+    const int64_t rd = split_rd[quad];
+    features[1 + quad] =
+        (rd > 0 && rd < 1000000000) ? (float)rd / (float)best_rd : 1.0f;
+  }
+
+  // Variance ratios: each quadrant against the whole block (at least 1).
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  int whole_block_variance = av1_get_perpixel_variance_facade(
+      cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+  whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  const int half_bw = block_size_wide[bsize] / 2;
+  struct buf_2d quad_buf;
+  quad_buf.stride = x->plane[0].src.stride;
+
+  int split_variance[SUB_PARTITIONS_SPLIT];
+  for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+    // The block is square, so the same half size offsets rows and columns.
+    const int x_off = (quad & 1) * half_bw;
+    const int y_off = (quad >> 1) * half_bw;
+    quad_buf.buf = x->plane[0].src.buf + x_off + y_off * quad_buf.stride;
+    split_variance[quad] = av1_get_perpixel_variance_facade(
+        cpi, xd, &quad_buf, subsize, AOM_PLANE_Y);
+  }
+  for (int quad = 0; quad < SUB_PARTITIONS_SPLIT; ++quad) {
+    features[5 + quad] =
+        (float)split_variance[quad] / (float)whole_block_variance;
+  }
+
+  // Dump the features for offline use.
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         /*feature_size=*/9, 5, bsize, mi_row, mi_col);
+
+  // An external model, when it makes the decision, overrides the built-in one.
+  if (ext_ml_model_decision_after_split_part2(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &part_state->prune_rect_part[HORZ],
+          &part_state->prune_rect_part[VERT])) {
+    return;
+  }
+
+  // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+  float raw_scores[3] = { 0.0f };
+  float probs[3] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, raw_scores);
+  av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd; probs[1] and probs[2] drive HORZ and VERT.
+  if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+  if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
+}
+
+// Uses a neural net to decide which of the AB partitions (horz_a, horz_b,
+// vert_a, vert_b) are worth evaluating. The ten input features are part_ctx,
+// var_ctx and eight sub-block-to-whole-block RD ratios (two HORZ halves, two
+// VERT halves and four split quadrants). On return ab_partitions_allowed[]
+// holds 1 for every AB partition to search. 8x8 blocks have no model and
+// leave the array untouched.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed) {
+  const PartitionBlkParams *const blk = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk->bsize;
+  const int mi_row = blk->mi_row;
+  const int mi_col = blk->mi_col;
+
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+
+  const NN_CONFIG *nn_config = NULL;
+  switch (bsize) {
+    case BLOCK_8X8: break;
+    case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+    case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+    default: assert(0 && "Unexpected bsize."); break;
+  }
+  if (!nn_config) return;
+
+  // Collect the usable sub-block RD costs in HORZ, VERT, SPLIT order. Slots
+  // whose RD value is not usable stay at 0.
+  const int64_t *const rd_groups[3] = { part_state->rect_part_rd[HORZ],
+                                        part_state->rect_part_rd[VERT],
+                                        part_state->split_rd };
+  const int group_sizes[3] = { SUB_PARTITIONS_RECT, SUB_PARTITIONS_RECT,
+                               SUB_PARTITIONS_SPLIT };
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int g = 0; g < 3; ++g) {
+    for (int i = 0; i < group_sizes[g]; ++i, ++rd_index) {
+      const int64_t rd = rd_groups[g][i];
+      if (rd > 0 && rd < 1000000000) sub_block_rdcost[rd_index] = (int)rd;
+    }
+  }
+
+  // Generate features.
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  float features[10];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)var_ctx;
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD; 1.0 when the
+    // sub-block cost is missing or not below the whole-block cost.
+    const int sub_rd = sub_block_rdcost[i];
+    const int in_range = sub_rd > 0 && sub_rd < rdcost;
+    features[feature_index++] =
+        in_range ? (float)sub_rd / (float)rdcost : 1.0f;
+  }
+  assert(feature_index == 10);
+
+  // Features are only dumped for inter frames.
+  const int is_intra_frame = frame_is_intra_only(&cpi->common);
+  if (!is_intra_frame) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+  }
+
+  // An external model, when it makes the decision, overrides the built-in one.
+  if (ext_ml_model_decision_after_rect(
+          &cpi->ext_part_controller, is_intra_frame, features,
+          &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B],
+          &ab_partitions_allowed[VERT_A], &ab_partitions_allowed[VERT_B])) {
+    return;
+  }
+
+  // Calculate scores using the NN model, scaled by 100 and truncated to int.
+  float score[16] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, score);
+  int int_score[16];
+  int max_score = -1000;
+  for (int i = 0; i < 16; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Every class scoring within a block-size dependent margin of the best one
+  // contributes its partitions.
+  int thresh = max_score;
+  if (bsize == BLOCK_16X16) {
+    thresh -= 150;
+  } else if (bsize == BLOCK_32X32) {
+    thresh -= 100;
+  }
+
+  av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS);
+  for (int i = 0; i < 16; ++i) {
+    if (int_score[i] < thresh) continue;
+    // Bit k of the class index enables the k-th AB partition.
+    if (i & 1) ab_partitions_allowed[HORZ_A] = 1;
+    if (i & 2) ab_partitions_allowed[HORZ_B] = 1;
+    if (i & 4) ab_partitions_allowed[VERT_A] = 1;
+    if (i & 8) ab_partitions_allowed[VERT_B] = 1;
+  }
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use a ML model to predict if horz4 and vert4 should be considered.
+//
+// The model input is FEATURES floats: the partition context, the bit count of
+// the block's source variance, eight sub-block/whole-block RD ratios (horz,
+// vert, then split sub-blocks) and eight sub-block/whole-block variance ratios
+// (the HORZ_4 strips, then the VERT_4 strips). The model emits LABELS scores;
+// bit 0 of a label index stands for HORZ4 and bit 1 for VERT4.
+//
+// part4_allowed[] is overwritten with the decision. It is left untouched when
+// best_rd is 1000000000 or more or when bsize has no model, and it is whatever
+// ext_ml_model_decision_after_part_ab() produced when that hook returns
+// non-zero.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
+  int64_t *split_rd = part_state->split_rd;
+  // Give the external partition model the first chance to decide.
+  if (ext_ml_model_decision_after_part_ab(
+          cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd,
+          &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance,
+          mi_row, mi_col))
+    return;
+
+  if (best_rd >= 1000000000) return;
+  // rect_part_rd holds the RD costs of the plain HORZ/VERT rectangular
+  // partitions, so index it with HORZ/VERT (as prepare_features_after_part_ab()
+  // does) rather than with the 4-way partition enumerators.
+  int64_t *horz_rd = rect_part_rd[HORZ];
+  int64_t *vert_rd = rect_part_rd[VERT];
+  const NN_CONFIG *nn_config = NULL;
+  // 4-way partitions are only allowed for these three square block sizes.
+  switch (bsize) {
+    case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+    case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+    case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config) return;
+
+  // Generate features.
+  float features[FEATURES];
+  int feature_index = 0;
+  features[feature_index++] = (float)part_ctx;
+  features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
+
+  // Whole-block RD cost, clamped so that it fits in an int.
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  // Sub-block RD costs in the order horz, vert, split. An entry stays 0 when
+  // the corresponding cost is not in the open range (0, 1000000000).
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD. Defaults to 1
+    // when the sub-block cost is unusable or not below the whole-block cost.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features[feature_index++] = rd_ratio;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+    assert(horz_4_bs != BLOCK_INVALID);
+    assert(vert_4_bs != BLOCK_INVALID);
+
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common), bsize);
+    const int src_stride = x->plane[0].src.stride;
+    uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    struct buf_2d horz_4_src, vert_4_src;
+    horz_4_src.stride = src_stride;
+    vert_4_src.stride = src_stride;
+
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+      // The i-th horizontal strip starts i strip-heights further down; the
+      // i-th vertical strip starts i strip-widths further right.
+      horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+      vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+      horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+          cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+      vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+          cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+    }
+  }
+
+  // The +1 on numerator and denominator avoids a division by zero.
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance,
+    // clamped to [low_b, high_b].
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance,
+    // clamped to [low_b, high_b].
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[feature_index++] = var_ratio;
+  }
+  assert(feature_index == FEATURES);
+
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           FEATURES, 7, bsize, mi_row, mi_col);
+  }
+
+  // Calculate scores using the NN model.
+  float score[LABELS] = { 0.0f };
+  av1_nn_predict(features, nn_config, 1, score);
+  int int_score[LABELS];
+  int max_score = -1000;
+  for (int i = 0; i < LABELS; ++i) {
+    int_score[i] = (int)(100 * score[i]);
+    max_score = AOMMAX(int_score[i], max_score);
+  }
+
+  // Make decisions based on the model scores: every label whose score is
+  // within a block-size dependent margin of the best score contributes the
+  // partition types encoded in its index.
+  int thresh = max_score;
+  switch (bsize) {
+    case BLOCK_16X16: thresh -= 500; break;
+    case BLOCK_32X32: thresh -= 500; break;
+    case BLOCK_64X64: thresh -= 200; break;
+    default: break;
+  }
+  av1_zero_array(part4_allowed, NUM_PART4_TYPES);
+  for (int i = 0; i < LABELS; ++i) {
+    if (int_score[i] >= thresh) {
+      if ((i >> 0) & 1) part4_allowed[HORZ4] = 1;
+      if ((i >> 1) & 1) part4_allowed[VERT4] = 1;
+    }
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+// Uses an ML model, evaluated on the current block's rate, distortion, source
+// variance and DC quantizer, to decide whether the partition search can stop
+// splitting this block.
+//
+// When the scaled model score reaches the per-block-size threshold taken from
+// the speed features, both part_state->do_square_split and
+// part_state->do_rectangular_split are cleared. If
+// ext_ml_model_decision_after_none() returns non-zero, its decision is kept
+// and the built-in model is not run. Nothing is changed when the block size
+// has no model, the threshold is negative, or the breakout level is outside
+// the range covered by the scale table.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  const NN_CONFIG *nn_config = NULL;
+  int thresh = 0;
+  switch (bsize) {
+    case BLOCK_8X8:
+      nn_config = &av1_partition_breakout_nnconfig_8;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0];
+      break;
+    case BLOCK_16X16:
+      nn_config = &av1_partition_breakout_nnconfig_16;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &av1_partition_breakout_nnconfig_32;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &av1_partition_breakout_nnconfig_64;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3];
+      break;
+    case BLOCK_128X128:
+      nn_config = &av1_partition_breakout_nnconfig_128;
+      thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4];
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (!nn_config || thresh < 0) return;
+
+  // The scale table is indexed by (level - 1), so only levels 1..3 are valid.
+  // Reject anything else instead of reading outside the table.
+  // NOTE(review): level 0 presumably means "feature disabled" and callers are
+  // expected not to get here with it -- confirm against the call sites.
+  const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
+  const int breakout_level = cpi->sf.part_sf.ml_predict_breakout_level;
+  assert(breakout_level >= 1 && breakout_level <= 3);
+  if (breakout_level < 1 || breakout_level > 3) return;
+  thresh = (int)((float)thresh *
+                 ml_predict_breakout_thresh_scale[breakout_level - 1]);
+
+  // Generate feature values.
+  float features[FEATURES];
+  int feature_index = 0;
+
+  // Rate, weighted by rdmult and normalized by the number of pixels.
+  const int num_pels_log2 = num_pels_log2_lookup[bsize];
+  float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+  rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+           rate_f;
+  features[feature_index++] = rate_f;
+
+  // Distortion per pixel.
+  const float dist_f =
+      (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+  features[feature_index++] = dist_f;
+
+  features[feature_index++] = (float)pb_source_variance;
+
+  // DC dequantizer of plane 0, shifted down by (bit_depth - 8), then squared.
+  const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8);
+  features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+  assert(feature_index == FEATURES);
+
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         2, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_none(&cpi->ext_part_controller,
+                                       frame_is_intra_only(&cpi->common),
+                                       features, &part_state->do_square_split,
+                                       &part_state->do_rectangular_split)) {
+    return;
+  }
+
+  // Calculate score using the NN model.
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &score);
+
+  // Make decision.
+  if ((int)(score * 100) >= thresh) {
+    part_state->do_square_split = 0;
+    part_state->do_rectangular_split = 0;
+  }
+}
+#undef FEATURES
+// Pre-search pruning: applies third-pass hints, size/qindex rules and ML models to part_state.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (cpi->third_pass_ctx) {  // A third-pass context exists: use its partition info as a hint.
+ int mi_row = blk_params->mi_row;
+ int mi_col = blk_params->mi_col;
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width,
+ &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w);
+ BLOCK_SIZE third_pass_bsize =
+ av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w);
+ // Check the actual partition of this block in the second pass.
+ PARTITION_TYPE third_pass_part =
+ av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi);
+
+ int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) ||
+ (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols);  // Block reaches the last mi row/column.
+
+ if (!is_edge && block_size_wide[bsize] >= 16) {  // Hints apply only to non-edge blocks at least 16 wide.
+ // If in second pass we used rectangular partition, then do not search for
+ // rectangular partition in the different direction.
+ if (third_pass_part != PARTITION_NONE) {
+ if (third_pass_part == PARTITION_HORZ ||
+ third_pass_part == PARTITION_HORZ_4 ||
+ third_pass_part == PARTITION_HORZ_A ||
+ third_pass_part == PARTITION_HORZ_B) {
+ part_state->partition_rect_allowed[VERT] = 0;
+ } else if (third_pass_part == PARTITION_VERT ||
+ third_pass_part == PARTITION_VERT_4 ||
+ third_pass_part == PARTITION_VERT_A ||
+ third_pass_part == PARTITION_VERT_B) {
+ part_state->partition_rect_allowed[HORZ] = 0;
+ }
+ }
+
+ int minSize = AOMMIN(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);  // Shorter side of the earlier-pass block.
+ int maxSize = AOMMAX(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);  // Longer side of the earlier-pass block.
+ if (block_size_wide[bsize] < minSize / 4) {
+ // Current partition is too small, just terminate
+ part_state->terminate_partition_search = 1;
+ return;
+ } else if (block_size_wide[bsize] < minSize / 2) {
+ if (third_pass_part != PARTITION_NONE) {
+ // Current partition is very small, and in second pass we used
+ // rectangular partition. Terminate the search here then.
+ part_state->terminate_partition_search = 1;
+ return;
+ } else {
+ // Partition is small, but we still check this partition, only disable
+ // further splits.
+ // TODO(any): check why this is not covered by the termination for <
+ // minSize/4.
+ av1_disable_square_split_partition(part_state);
+ av1_disable_rect_partitions(part_state);
+ return;
+ }
+ } else if (block_size_wide[bsize] > maxSize) {
+ // Partition is larger than in the second pass. Only allow split.
+ av1_set_square_split_only(part_state);
+ return;
+ } else if (block_size_wide[bsize] >= minSize &&
+ block_size_wide[bsize] <= maxSize) {
+ // Partition is within a range where it is very likely to find a good
+ // choice, so do not prune anything.
+ return;
+ }
+ }
+ }
+
+ // Prune rectangular partitions for larger blocks.
+ if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) {
+ part_state->do_rectangular_split = 0;
+ part_state->partition_rect_allowed[HORZ] = 0;
+ part_state->partition_rect_allowed[VERT] = 0;
+ }
+
+ // Prune rectangular, AB and 4-way partition based on q index and block size
+ if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) {
+ if (bsize == BLOCK_8X8 && x->qindex < 35)
+ av1_disable_rect_partitions(part_state);
+
+ } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) {
+ // Enumeration difference between two consecutive square block sizes.
+ const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
+ int max_bsize =
+ BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step;
+ max_bsize = AOMMAX(max_bsize, BLOCK_4X4);
+ const BLOCK_SIZE max_prune_bsize =
+ (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32);  // Keep the limit within [BLOCK_4X4, BLOCK_32X32].
+
+ // Prune partition
+ // qidx 0 to 85: prune bsize below BLOCK_32X32
+ // qidx 86 to 170: prune bsize below BLOCK_16X16
+ // qidx 171 to 255: prune bsize below BLOCK_8X8
+ if (bsize < max_prune_bsize) {
+ av1_disable_rect_partitions(part_state);
+ }
+ }
+
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int prune_sub_8x8;
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) {
+ prune_sub_8x8 = 1;  // Level 2 prunes unconditionally.
+ } else {
+ assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1);
+ // Prune if both neighbors are available and either is > BLOCK_8X8
+ prune_sub_8x8 = xd->left_available && xd->up_available &&
+ (xd->left_mbmi->bsize > BLOCK_8X8 ||
+ xd->above_mbmi->bsize > BLOCK_8X8);
+ }
+ if (prune_sub_8x8) {
+ av1_disable_all_splits(part_state);
+ }
+ }
+
+ // A CNN-based speed feature pruning out either split or all non-split
+ // partition in INTRA frame coding.
+ const int try_intra_cnn_based_part_prune =
+ frame_is_intra_only(cm) &&
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level &&
+ cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+ blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params);
+
+ if (try_intra_cnn_based_part_prune) {
+ av1_intra_mode_cnn_partition(
+ &cpi->common, x, x->part_search_info.quad_tree_idx,
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state);
+ }
+
+ // Use simple motion search to prune out split or non-split partitions. This
+ // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a
+ // smaller blocksize.
+ const int try_split_only =
+ cpi->sf.part_sf.simple_motion_search_split &&
+ part_state->do_square_split && blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params) &&
+ !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+ if (try_split_only) {
+ av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state);
+ }
+
+ // Use simple motion search to prune out rectangular partition in some
+ // direction. The results are stored in prune_horz and prune_vert in order to
+ // bypass future related pruning checks if a pruning decision has been made.
+
+ // We want to search at least one partition mode, so don't prune if NONE and
+ // SPLIT are disabled.
+ const int non_rect_part_allowed =
+ part_state->do_square_split || part_state->partition_none_allowed;
+ // Only run the model if the partitions are not already pruned.
+ const int rect_part_allowed = part_state->do_rectangular_split &&
+ ((part_state->partition_rect_allowed[HORZ] &&
+ !part_state->prune_rect_part[HORZ]) ||
+ (part_state->partition_rect_allowed[VERT] &&
+ !part_state->prune_rect_part[VERT]));
+
+ const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+ !frame_is_intra_only(cm) &&
+ non_rect_part_allowed && rect_part_allowed &&
+ !av1_superres_scaled(cm);
+
+ if (try_prune_rect) {
+ av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
+ }
+}
+
+#ifndef NDEBUG
+// Debug-only helper for assertions: returns non-zero when the width and the
+// height of bsize are equal.
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  return width == height;
+}
+#endif  // NDEBUG
+
+// Restricts the partition search of the current (square) block according to
+// the superblock's maximum and minimum partition sizes:
+//  - wider than the maximum: only the square split is allowed;
+//  - no wider than the minimum: rectangular partitions are disabled and
+//    exactly one of PARTITION_NONE / square split stays enabled;
+//  - otherwise part_state is left unchanged.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state) {
+  assert(is_bsize_square(sb_enc->max_partition_size));
+  assert(is_bsize_square(sb_enc->min_partition_size));
+  assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  assert(is_bsize_square(bsize));
+
+  // All sizes involved are square, so comparing widths is sufficient.
+  const int max_width = block_size_wide[sb_enc->max_partition_size];
+  const int min_width = block_size_wide[sb_enc->min_partition_size];
+  const int cur_width = block_size_wide[bsize];
+  assert(min_width <= max_width);
+
+  if (cur_width > max_width) {
+    // Larger than the maximum: the block has to be split.
+    av1_set_square_split_only(part_state);
+    return;
+  }
+
+  // Strictly between the two limits: nothing to prune.
+  if (cur_width > min_width) return;
+
+  // At or below the minimum size.
+  av1_disable_rect_partitions(part_state);
+
+  // The square split is turned off only when av1_blk_has_rows_and_cols()
+  // holds (the block is not at the picture boundary); otherwise the flag set
+  // by earlier logic is kept.
+  if (av1_blk_has_rows_and_cols(blk_params)) {
+    part_state->do_square_split = 0;
+  }
+  // PARTITION_NONE is allowed exactly when the square split is not.
+  part_state->partition_none_allowed = part_state->do_square_split ? 0 : 1;
+}
+
+// Decides whether an AB partition derived from rect_part should be evaluated,
+// by counting "wins":
+//  - one if the rectangular partition itself won (taken from
+//    rect_part_win_info when provided, otherwise from pc_tree->partitioning);
+//  - one for each of the two split sub-blocks (split_idx1, split_idx2) that is
+//    either absent or was coded with PARTITION_NONE.
+// Returns 1 when the count reaches a qindex-dependent threshold, 0 otherwise.
+int evaluate_ab_partition_based_on_split(
+    const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  // Threshold for number of winners
+  // Conservative pruning for high quantizers
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+
+  int rect_won;
+  if (rect_part_win_info == NULL) {
+    rect_won = (pc_tree->partitioning == rect_part);
+  } else if (rect_part == PARTITION_HORZ) {
+    rect_won = rect_part_win_info->rect_part_win[HORZ];
+  } else {
+    rect_won = rect_part_win_info->rect_part_win[VERT];
+  }
+  int num_win = rect_won ? 1 : 0;
+
+  const int split_indices[2] = { split_idx1, split_idx2 };
+  for (int k = 0; k < 2; ++k) {
+    const int idx = split_indices[k];
+    // A missing sub-tree counts as a win, as does one coded as PARTITION_NONE.
+    if (!pc_tree->split[idx] ||
+        pc_tree->split[idx]->partitioning == PARTITION_NONE) {
+      ++num_win;
+    }
+  }
+
+  return (num_win < num_win_thresh) ? 0 : 1;
+}
+
+// Fills ab_partitions_allowed[HORZ_A / HORZ_B / VERT_A / VERT_B] with 0/1
+// flags telling the caller which AB partitions are still worth searching.
+//
+// The stages, in order:
+//  1. AB partitions start out allowed when extended partitions and the AB
+//     config option are enabled and the matching rectangular direction is
+//     allowed.
+//  2. With prune_ext_partition_types_search_level set, a direction is dropped
+//     depending on the current best partitioning (and, at level 1, the source
+//     variance), the stored sub-block RD costs equal to INT64_MAX are reset to
+//     0, and each AB type is dropped when its estimated RD cost is not below
+//     best_rdcost.
+//  3. Optional ML-based pruning (av1_ml_prune_ab_partition).
+//  4. Optional pruning based on wins in the split sub-blocks.
+// Note that stage 2 modifies part_state->rect_part_rd and
+// part_state->split_rd in place.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+                             const PC_TREE *pc_tree, int pb_source_variance,
+                             int64_t best_rdcost,
+                             const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+                             bool ext_partition_allowed,
+                             PartitionSearchState *part_state,
+                             int *ab_partitions_allowed) {
+  int64_t *const horz_rd = part_state->rect_part_rd[HORZ];
+  int64_t *const vert_rd = part_state->rect_part_rd[VERT];
+  int64_t *const split_rd = part_state->split_rd;
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  const int prune_level =
+      cpi->sf.part_sf.prune_ext_partition_types_search_level;
+
+  // The standard AB partitions are allowed initially if ext-partition-types are
+  // allowed.
+  const int ab_enabled =
+      ext_partition_allowed && part_cfg->enable_ab_partitions;
+  int horz_ab_ok = ab_enabled && part_state->partition_rect_allowed[HORZ];
+  int vert_ab_ok = ab_enabled && part_state->partition_rect_allowed[VERT];
+
+  // Pruning: pruning out AB partitions on one main direction based on the
+  // current best partition and source variance.
+  if (prune_level) {
+    const PARTITION_TYPE best_part = pc_tree->partitioning;
+    const int best_is_split = (best_part == PARTITION_SPLIT);
+    // Only level 1 keeps AB partitions for a low-variance PARTITION_NONE.
+    // TODO(debargha,huisu@google.com): may need to tune the threshold for
+    // pb_source_variance.
+    const int low_var_none = (prune_level == 1) &&
+                             (best_part == PARTITION_NONE) &&
+                             (pb_source_variance < 32);
+    horz_ab_ok &=
+        (best_part == PARTITION_HORZ || low_var_none || best_is_split);
+    vert_ab_ok &=
+        (best_part == PARTITION_VERT || low_var_none || best_is_split);
+
+    // Replace costs that are not below INT64_MAX with 0 so that the sums
+    // below stay meaningful.
+    for (int i = 0; i < 2; ++i) {
+      if (!(horz_rd[i] < INT64_MAX)) horz_rd[i] = 0;
+    }
+    for (int i = 0; i < 2; ++i) {
+      if (!(vert_rd[i] < INT64_MAX)) vert_rd[i] = 0;
+    }
+    for (int i = 0; i < 4; ++i) {
+      if (!(split_rd[i] < INT64_MAX)) split_rd[i] = 0;
+    }
+  }
+
+  ab_partitions_allowed[HORZ_A] = horz_ab_ok;
+  ab_partitions_allowed[HORZ_B] = horz_ab_ok;
+  ab_partitions_allowed[VERT_A] = vert_ab_ok;
+  ab_partitions_allowed[VERT_B] = vert_ab_ok;
+
+  // Pruning: pruning out an AB partition if the combined rdcost of its
+  // subblocks estimated from previous partitions is much higher than the best
+  // rd so far.
+  if (prune_level) {
+    // Level 1 compares 14/16 of the estimate, every other level 15/16.
+    const int numer = (prune_level == 1) ? 14 : 15;
+    const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+    const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+    const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+    const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+    ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * numer < best_rdcost);
+    ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * numer < best_rdcost);
+    ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * numer < best_rdcost);
+    ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * numer < best_rdcost);
+  }
+
+  // Pruning: pruning out some ab partitions using a DNN taking rd costs of
+  // sub-blocks from previous basic partition types.
+  if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed &&
+      part_state->partition_rect_allowed[HORZ] &&
+      part_state->partition_rect_allowed[VERT]) {
+    // TODO(huisu@google.com): x->source_variance may not be the current
+    // block's variance. The correct one to use is pb_source_variance. Need to
+    // re-train the model to fix it.
+    av1_ml_prune_ab_partition(cpi, pc_tree->partitioning,
+                              get_unsigned_bits(x->source_variance),
+                              best_rdcost, part_state, ab_partitions_allowed);
+  }
+
+  // Pruning: pruning AB partitions based on the number of horz/vert wins
+  // in the current block and sub-blocks in PARTITION_SPLIT.
+  if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2) {
+    // For each AB type: the rectangular partition it derives from and the two
+    // split sub-blocks that are inspected.
+    const int ab_type[4] = { HORZ_A, HORZ_B, VERT_A, VERT_B };
+    const PARTITION_TYPE rect_type[4] = { PARTITION_HORZ, PARTITION_HORZ,
+                                          PARTITION_VERT, PARTITION_VERT };
+    const int split_pair[4][2] = { { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 } };
+    for (int k = 0; k < 4; ++k) {
+      if (!ab_partitions_allowed[ab_type[k]]) continue;
+      ab_partitions_allowed[ab_type[k]] &=
+          evaluate_ab_partition_based_on_split(
+              pc_tree, rect_type[k], rect_part_win_info, x->qindex,
+              split_pair[k][0], split_pair[k][1]);
+    }
+  }
+}
+
+// Prepare features for the external model. Specifically, features after
+// ab partition is searched. Fills the 18 entries of features->after_part_ab.f.
+static void prepare_features_after_part_ab(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int part_ctx, int64_t best_rd,
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance,
+ int mi_row, int mi_col, aom_partition_features_t *const features) {
+ int64_t *horz_rd = rect_part_rd[HORZ];
+ int64_t *vert_rd = rect_part_rd[VERT];
+
+ // Generate features: partition context first, then get_unsigned_bits() of the block variance.
+ int feature_index = 0;
+ features->after_part_ab.f[feature_index++] = (float)part_ctx;
+ features->after_part_ab.f[feature_index++] =
+ (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);  // Whole-block RD cost, capped at INT_MAX.
+ int sub_block_rdcost[8] = { 0 };  // Order: horz, vert, split sub-blocks; 0 = no usable cost.
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD (1 when the cost is unusable).
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features->after_part_ab.f[feature_index++] = rd_ratio;
+ }
+
+ // 4-way partitions are only allowed for these three square block sizes.
+ assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common), bsize);
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ struct buf_2d horz_4_src, vert_4_src;
+ horz_4_src.stride = src_stride;
+ vert_4_src.stride = src_stride;
+
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;  // i strip-heights down.
+ vert_4_src.buf = src + i * block_size_wide[vert_4_bs];  // i strip-widths to the right.
+
+ horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+ vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);  // +1 avoids a division by zero.
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
+ assert(feature_index == 18);
+}
+
+// Asks the external partition model, before PARTITION_NONE is searched, for
+// the following decisions:
+//   partition_none_allowed
+//   partition_horz_allowed
+//   partition_vert_allowed
+//   do_rectangular_split
+//   do_square_split
+// Returns true when the outputs were written from a valid model decision.
+// Returns false, leaving the outputs untouched, when the controller is not
+// ready or no valid decision was obtained.
+static bool ext_ml_model_decision_before_none(
+    AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split) {
+  ExtPartController *const controller = &cpi->ext_part_controller;
+  if (!controller->ready) return false;
+
+  // Package the motion-search features for the model.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_BEFORE_NONE;
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; ++idx) {
+    model_input.before_part_none.f[idx] = features_from_motion[idx];
+  }
+  av1_ext_part_send_features(controller, &model_input);
+
+  // Fetch the model's answer.
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(controller, &model_output)) {
+    return false;
+  }
+
+  // Copy the decision to the caller.
+  *partition_none_allowed = model_output.partition_none_allowed;
+  *partition_horz_allowed = model_output.partition_rect_allowed[HORZ];
+  *partition_vert_allowed = model_output.partition_rect_allowed[VERT];
+  *do_rectangular_split = model_output.do_rectangular_split;
+  *do_square_split = model_output.do_square_split;
+  return true;
+}
+
+// Second query to the external partition model before PARTITION_NONE is
+// searched. It supplies the rectangular pruning decisions:
+//   prune_horz
+//   prune_vert
+// Returns true when the outputs were written from a valid model decision.
+// Returns false, leaving the outputs untouched, when the controller is not
+// ready or no valid decision was obtained.
+static bool ext_ml_model_decision_before_none_part2(
+    AV1_COMP *cpi,
+    const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+    int *prune_horz, int *prune_vert) {
+  ExtPartController *const controller = &cpi->ext_part_controller;
+  if (!controller->ready) return false;
+
+  // Package the motion-search features for the model.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2;
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_PRUNE_PART; ++idx) {
+    model_input.before_part_none.f_part2[idx] = features_from_motion[idx];
+  }
+  av1_ext_part_send_features(controller, &model_input);
+
+  // Fetch the model's answer.
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(controller, &model_output)) {
+    return false;
+  }
+
+  // Copy the decision to the caller.
+  *prune_horz = model_output.prune_rect_part[HORZ];
+  *prune_vert = model_output.prune_rect_part[VERT];
+  return true;
+}
+
+// Asks the external partition model, after PARTITION_NONE has been searched,
+// for the following decisions:
+//   do_square_split
+//   do_rectangular_split
+// Four feature values are read from features_after_none.
+// Returns true when the outputs were written from a valid model decision.
+// Returns false, leaving the outputs untouched, when the controller is not
+// ready, the frame is an intra frame, or no valid decision was obtained.
+bool ext_ml_model_decision_after_none(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_none, int *do_square_split,
+    int *do_rectangular_split) {
+  if (!ext_part_controller->ready) return false;
+  if (is_intra_frame) return false;
+
+  // Package the features for the model.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_NONE;
+  for (int idx = 0; idx < 4; ++idx) {
+    model_input.after_part_none.f[idx] = features_after_none[idx];
+  }
+  av1_ext_part_send_features(ext_part_controller, &model_input);
+
+  // Fetch the model's answer.
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(ext_part_controller,
+                                           &model_output)) {
+    return false;
+  }
+
+  // Copy the decision to the caller.
+  *do_square_split = model_output.do_square_split;
+  *do_rectangular_split = model_output.do_rectangular_split;
+  return true;
+}
+
+// Second query to the external partition model after PARTITION_NONE has been
+// searched. It supplies:
+//   terminate_partition_search
+// Returns true when the output was written from a valid model decision.
+// Returns false, leaving the output untouched, when the controller is not
+// ready, the frame is intra-only, or no valid decision was obtained.
+bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search) {
+  ExtPartController *const controller = &cpi->ext_part_controller;
+  if (!controller->ready) return false;
+
+  AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return false;
+
+  // Package the features for the model.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_TERM_NONE; ++idx) {
+    model_input.after_part_none.f_terminate[idx] = features_terminate[idx];
+  }
+  av1_ext_part_send_features(controller, &model_input);
+
+  // Fetch the model's answer.
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(controller, &model_output)) {
+    return false;
+  }
+
+  // Copy the decision to the caller.
+  *terminate_partition_search = model_output.terminate_partition_search;
+  return true;
+}
+
+// Consults the external partition model, when one is in use, once the
+// PARTITION_SPLIT search has finished. The model supplies this parameter:
+// terminate_partition_search
+// Returns true only when a valid decision was written to the output.
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+                                       const float *const features_terminate,
+                                       int *terminate_partition_search) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const controller = &cpi->ext_part_controller;
+  // The external model is not consulted for intra-only frames.
+  if (frame_is_intra_only(cm)) return false;
+  if (!controller->ready) return false;
+
+  // Pack the feature vector for this decision point.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+  for (int k = 0; k < 31; ++k) {
+    model_input.after_part_split.f_terminate[k] = features_terminate[k];
+  }
+
+  // Hand the features over, then ask the model for its verdict.
+  av1_ext_part_send_features(controller, &model_input);
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(controller, &model_output)) {
+    return false;
+  }
+
+  *terminate_partition_search = model_output.terminate_partition_search;
+  return true;
+}
+
+// Consults the external partition model, when one is in use, once the
+// PARTITION_SPLIT search has finished. The model supplies these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+// Returns true only when valid decisions were written to the outputs.
+bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert) {
+  // The external model is not consulted for intra frames.
+  if (is_intra_frame) return false;
+  if (!ext_part_controller->ready) return false;
+
+  // Pack the feature vector for this decision point.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+  for (int k = 0; k < 9; ++k) {
+    model_input.after_part_split.f_prune_rect[k] = features_prune[k];
+  }
+
+  // Hand the features over, then ask the model for its verdict.
+  av1_ext_part_send_features(ext_part_controller, &model_input);
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(ext_part_controller,
+                                           &model_output)) {
+    return false;
+  }
+
+  *prune_rect_part_horz = model_output.prune_rect_part[0];
+  *prune_rect_part_vert = model_output.prune_rect_part[1];
+  return true;
+}
+
+// Consults the external partition model, when one is in use, once the
+// rectangular partition search has finished. The model supplies these
+// parameters:
+// horza_partition_allowed
+// horzb_partition_allowed
+// verta_partition_allowed
+// vertb_partition_allowed
+// Returns true only when valid decisions were written to the outputs.
+static bool ext_ml_model_decision_after_rect(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_rect, int *horza_partition_allowed,
+    int *horzb_partition_allowed, int *verta_partition_allowed,
+    int *vertb_partition_allowed) {
+  // The external model is not consulted for intra frames.
+  if (is_intra_frame) return false;
+  if (!ext_part_controller->ready) return false;
+
+  // Pack the feature vector for this decision point.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_RECT;
+  for (int k = 0; k < 10; ++k) {
+    model_input.after_part_rect.f[k] = features_after_rect[k];
+  }
+
+  // Hand the features over, then ask the model for its verdict.
+  av1_ext_part_send_features(ext_part_controller, &model_input);
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(ext_part_controller,
+                                           &model_output)) {
+    return false;
+  }
+
+  *horza_partition_allowed = model_output.horza_partition_allowed;
+  *horzb_partition_allowed = model_output.horzb_partition_allowed;
+  *verta_partition_allowed = model_output.verta_partition_allowed;
+  *vertb_partition_allowed = model_output.vertb_partition_allowed;
+  return true;
+}
+
+// Consults the external partition model, when one is in use, once the AB
+// partition search has finished. The model supplies these parameters:
+// partition_vert4_allowed
+// partition_horz4_allowed
+// Returns true only when valid decisions were written to the outputs.
+static bool ext_ml_model_decision_after_part_ab(
+    AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+    int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const controller = &cpi->ext_part_controller;
+
+  // The external model is not consulted for intra-only frames.
+  if (frame_is_intra_only(cm) || !controller->ready) return false;
+
+  // Let the helper fill in the feature payload for this decision point.
+  aom_partition_features_t model_input;
+  model_input.id = AOM_EXT_PART_FEATURE_AFTER_AB;
+  prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd,
+                                 rect_part_rd, split_rd, pb_source_variance,
+                                 mi_row, mi_col, &model_input);
+
+  // Hand the features over, then ask the model for its verdict.
+  av1_ext_part_send_features(controller, &model_input);
+  aom_partition_decision_t model_output;
+  if (!av1_ext_part_get_partition_decision(controller, &model_output)) {
+    return false;
+  }
+
+  *partition_horz4_allowed = model_output.partition_horz4_allowed;
+  *partition_vert4_allowed = model_output.partition_vert4_allowed;
+  return true;
+}
+
+// Links the nodes of the flat array `sms_tree` into a quad-tree and returns a
+// pointer to its last node, which serves as the root. This function resembles
+// "av1_setup_sms_tree()" in context_tree.c with function signature change.
+static SIMPLE_MOTION_DATA_TREE *setup_sms_tree(
+    AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_stat_stage = is_stat_generation_stage(cpi);
+  const int sb_is_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int total_nodes = av1_get_pc_tree_nodes(sb_is_128, is_stat_stage);
+  // Root node for the largest superblock size.
+  SIMPLE_MOTION_DATA_TREE *const root = &sms_tree[total_nodes - 1];
+
+  if (is_stat_stage) {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    sms_tree[0].block_size = square[2];
+    return root;
+  }
+
+  // The array starts with all the leaf nodes.
+  const int leaf_count = sb_is_128 ? 1024 : 256;
+  int next_node = 0;
+  while (next_node < leaf_count) {
+    sms_tree[next_node++].block_size = square[0];
+  }
+
+  // Every following level holds a quarter as many nodes as the one below it;
+  // each new node adopts the next four not yet parented nodes as its children.
+  SIMPLE_MOTION_DATA_TREE *next_child = sms_tree;
+  int level = 1;
+  for (int level_nodes = leaf_count / 4; level_nodes > 0; level_nodes /= 4) {
+    for (int i = 0; i < level_nodes; ++i) {
+      SIMPLE_MOTION_DATA_TREE *const parent = &sms_tree[next_node++];
+      parent->block_size = square[level];
+      for (int j = 0; j < 4; ++j) parent->split[j] = next_child++;
+    }
+    ++level;
+  }
+
+  return root;
+}
+
+// Dumps the motion search statistics of one superblock to the text file
+// "<path>/motion_search_feature_sb<sb_counter>". Layout of the file:
+//   line 1: mi_row,mi_col,bsize,<width of fixed_block_size>,num_blocks
+//   line 2: block_sse[0 .. num_blocks), comma separated
+//   line 3: block_var[0 .. num_blocks), comma separated
+// Nothing is written when the file cannot be opened.
+static void write_motion_feature_to_file(
+    const char *const path, const int sb_counter, const unsigned int *block_sse,
+    const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize,
+    const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) {
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path,
+           sb_counter);
+  FILE *pfile = fopen(filename, "w");
+  // fopen() returns NULL on failure (e.g. missing directory or insufficient
+  // permissions); bail out instead of handing a NULL stream to fprintf().
+  if (pfile == NULL) return;
+  fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+          block_size_wide[fixed_block_size], num_blocks);
+  // The statistics are unsigned int, so they are printed with %u.
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%u", block_sse[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%u", block_var[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+// Runs a single-reference simple motion search on every 16x16 unit of the
+// block at (mi_row, mi_col), clipped to the frame, and collects the SSE and
+// variance reported for each unit. The results are copied into `features`
+// when it is non-NULL and are otherwise written to a text file under
+// cpi->oxcf.partition_info_path. Does nothing for intra-only frames.
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           const int mi_row, const int mi_col,
+                                           const BLOCK_SIZE bsize,
+                                           aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return;
+
+  MACROBLOCK *const x = &td->mb;
+  const BLOCK_SIZE fixed_block_size = BLOCK_16X16;
+  const int unit_mi_wide = mi_size_wide[fixed_block_size];
+  const int unit_mi_high = mi_size_high[fixed_block_size];
+
+  // Scratch simple motion search tree; released before returning.
+  const int is_stat_stage = is_stat_generation_stage(cpi);
+  const int sb_is_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int node_count = av1_get_pc_tree_nodes(sb_is_128, is_stat_stage);
+  SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+  CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(node_count, sizeof(*sms_tree)));
+  SIMPLE_MOTION_DATA_TREE *const sms_root = setup_sms_tree(cpi, sms_tree);
+
+  av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                     mi_col, bsize);
+  av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
+                                           mi_col);
+  av1_reset_simple_motion_tree_partition(sms_root, bsize);
+
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+
+  // Extent of the block that lies inside the frame, in mi units.
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  // A partially covered unit at the right/bottom border still counts as one.
+  const int col_steps =
+      (mi_width / unit_mi_wide) + ((mi_width % unit_mi_wide) > 0);
+  const int row_steps =
+      (mi_height / unit_mi_high) + ((mi_height % unit_mi_high) > 0);
+  const int num_blocks = col_steps * row_steps;
+
+  unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse));
+  unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var));
+  if (block_sse == NULL || block_var == NULL) {
+    aom_free(sms_tree);
+    aom_free(block_sse);
+    aom_free(block_var);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating block_sse & block_var");
+  }
+
+  // Visit the units in raster order; num_units counts the ones searched.
+  const int row_end =
+      AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+  const int col_end =
+      AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+  int num_units = 0;
+  for (int row = mi_row; row < row_end; row += unit_mi_high) {
+    for (int col = mi_col; col < col_end; col += unit_mi_wide) {
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_root, row, col, fixed_block_size, ref_list,
+          /*num_refs=*/1, /*use_subpixel=*/1,
+          /*save_mv=*/1, &block_sse[num_units], &block_var[num_units]);
+      ++num_units;
+    }
+  }
+
+  if (features != NULL) {
+    features->sb_features.motion_features.unit_length =
+        block_size_wide[fixed_block_size];
+    features->sb_features.motion_features.num_units = num_units;
+    for (int i = 0; i < num_units; ++i) {
+      features->sb_features.motion_features.block_sse[i] = block_sse[i];
+      features->sb_features.motion_features.block_var[i] = block_var[i];
+    }
+  } else {
+    write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter,
+                                 block_sse, block_var, num_units, bsize,
+                                 fixed_block_size, mi_row, mi_col);
+  }
+
+  aom_free(block_sse);
+  aom_free(block_var);
+  aom_free(sms_tree);
+}
+
+// Runs a single-reference simple motion search on the block at
+// (mi_row, mi_col) and, for each of PARTITION_SPLIT, PARTITION_HORZ and
+// PARTITION_VERT whose bit is set in `valid_partition_types`, on the
+// sub-blocks of that partition. The SSE and variance of each search are stored
+// in the matching output arrays. Does nothing for intra-only frames.
+void av1_prepare_motion_search_features_block(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+    const int valid_partition_types, unsigned int *block_sse,
+    unsigned int *block_var, unsigned int sub_block_sse[4],
+    unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+    unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+    unsigned int vert_block_var[2]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return;
+
+  MACROBLOCK *const x = &td->mb;
+
+  // Scratch simple motion search tree; released before returning.
+  const int is_stat_stage = is_stat_generation_stage(cpi);
+  const int sb_is_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int node_count = av1_get_pc_tree_nodes(sb_is_128, is_stat_stage);
+  SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+  CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(node_count, sizeof(*sms_tree)));
+  SIMPLE_MOTION_DATA_TREE *const sms_root = setup_sms_tree(cpi, sms_tree);
+
+  av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                     mi_col, bsize);
+  av1_reset_simple_motion_tree_partition(sms_root, bsize);
+
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int half_mi_wide = mi_size_wide[bsize] / 2;
+  // The vertical offset reuses the horizontal half size.
+  const int half_mi_high = half_mi_wide;
+
+  // The whole block.
+  simple_motion_search_get_best_ref(
+      cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1,
+      /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var);
+
+  // Four quadrants, in raster order.
+  if (valid_partition_types & (1 << PARTITION_SPLIT)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int i = 0; i < 4; ++i) {
+      const int row = mi_row + (i / 2) * half_mi_high;
+      const int col = mi_col + (i % 2) * half_mi_wide;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &sub_block_sse[i], &sub_block_var[i]);
+    }
+  }
+
+  // Top and bottom halves.
+  if (valid_partition_types & (1 << PARTITION_HORZ)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    for (int i = 0; i < 2; ++i) {
+      const int row = mi_row + i * half_mi_high;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, row, mi_col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &horz_block_sse[i], &horz_block_var[i]);
+    }
+  }
+
+  // Left and right halves.
+  if (valid_partition_types & (1 << PARTITION_VERT)) {
+    const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+    for (int i = 0; i < 2; ++i) {
+      const int col = mi_col + i * half_mi_wide;
+      simple_motion_search_get_best_ref(cpi, x, sms_root, mi_row, col, subsize,
+                                        ref_list, /*num_refs=*/1,
+                                        /*use_subpixel=*/1, /*save_mv=*/1,
+                                        &vert_block_sse[i], &vert_block_var[i]);
+    }
+  }
+
+  aom_free(sms_tree);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Copies `start_mvs` into `sms_tree`, clears its cached simple motion search
+// features and validity flags, and then does the same for the four children
+// of every node whose block size is at least BLOCK_8X8.
+static INLINE void init_simple_motion_search_mvs(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) {
+  memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs));
+  av1_zero(sms_tree->sms_none_feat);
+  av1_zero(sms_tree->sms_rect_feat);
+  av1_zero(sms_tree->sms_none_valid);
+  av1_zero(sms_tree->sms_rect_valid);
+
+  // Smaller nodes are not descended into.
+  if (sms_tree->block_size < BLOCK_8X8) return;
+
+  for (int child = 0; child < 4; ++child) {
+    init_simple_motion_search_mvs(sms_tree->split[child], start_mvs);
+  }
+}
+
+// Initializes the start mvs of the whole simple motion search tree rooted at
+// `sms_root`. The start mv is the first entry of the reference mv stack found
+// for the superblock (the NEARESTMV of the sb) or, when the stack is empty,
+// the global mv. Only the reference frame picked below (ALTREF_FRAME or
+// LAST_FRAME) gets a non-zero start mv.
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+                                              const TileInfo *tile_info,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_root,
+                                              int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  FULLPEL_MV start_mvs[REF_FRAMES];
+  av1_zero(start_mvs);
+
+  // If tile_info is NULL, assume that the offsets have already been set.
+  if (tile_info != NULL) {
+    av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+                                       sb_size);
+  }
+
+  int ref_frame = LAST_FRAME;
+  if (cpi->rc.is_src_frame_alt_ref) ref_frame = ALTREF_FRAME;
+
+  MB_MODE_INFO_EXT mbmi_ext;
+  av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+                   mbmi_ext.mode_context);
+
+  const int have_ref_mv = mbmi_ext.ref_mv_count[ref_frame] > 0;
+  if (have_ref_mv) {
+    start_mvs[ref_frame] =
+        get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+  } else {
+    start_mvs[ref_frame] =
+        get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+  }
+
+  init_simple_motion_search_mvs(sms_root, start_mvs);
+}
diff --git a/third_party/aom/av1/encoder/partition_strategy.h b/third_party/aom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000000..84683f5fd4
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int label_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ const RD_STATS *none_rdc,
+ PartitionSearchState *part_state);
+
+// Get the features for selecting the max and min partition size. Currently this
+// performs simple_motion_search on 16X16 subblocks of the current superblock,
+// and then extracts the statistics of sse and motion vectors as features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ int64_t best_rd, int64_t part_none_rd,
+ int64_t part_split_rd,
+ int64_t *split_block_rd,
+ PartitionSearchState *part_state);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ int64_t best_rd, int64_t none_rd,
+ const int64_t *split_rd,
+ PartitionSearchState *part_state);
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+ int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ int part_ctx, int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *part4_allowed,
+ unsigned int pb_source_variance);
+
+// ML-based partition search breakout after PARTITION_NONE.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance, int bit_depth,
+ PartitionSearchState *part_state);
+
+// The first round of partition pruning determined before any partition
+// has been tested. The decisions will be updated and passed back
+// to the partition search function.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current node
+// equals max_partition_size, only PARTITION_SPLIT is allowed if the current
+// node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+ PartitionSearchState *part_state);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, int pb_source_variance,
+ int64_t best_rdcost,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ bool ext_partition_allowed,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize,
+ aom_partition_features_t *features);
+void av1_prepare_motion_search_features_block(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+ const int valid_partition_types, unsigned int *block_sse,
+ unsigned int *block_var, unsigned int sub_block_sse[4],
+ unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+ unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+ unsigned int vert_block_var[2]);
+#endif // !CONFIG_REALTIME_ONLY
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search. It points the mode info, destination and source
+// buffers of `x` at the block of size `bsize` located at (mi_row, mi_col) and
+// refreshes the mv limits and the distances to the frame edges.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+                                                 MACROBLOCK *const x,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int num_planes = av1_num_planes(cm);
+  const int blk_mi_wide = mi_size_wide[bsize];
+  const int blk_mi_high = mi_size_high[bsize];
+
+  set_mode_info_offsets(mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // Mv beyond the range do not produce new/different prediction block.
+  av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, blk_mi_high,
+                    blk_mi_wide, cpi->oxcf.border_in_pixels);
+
+  set_plane_n4(xd, blk_mi_wide, blk_mi_high, num_planes);
+
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
+
+  // The block position must be a multiple of the block dimensions.
+  assert((mi_col & (blk_mi_wide - 1)) == 0);
+  assert((mi_row & (blk_mi_high - 1)) == 0);
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  const int mi_rows_below = mi_params->mi_rows - blk_mi_high - mi_row;
+  const int mi_cols_right = mi_params->mi_cols - blk_mi_wide - mi_col;
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+  xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows_below)*MI_SIZE);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+  xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols_right)*MI_SIZE);
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col);
+
+// Returns 1 when the superblock of size `sb_size` whose top-left corner is at
+// (mi_row, mi_col) fits within mi_params->mi_rows and mi_params->mi_cols, and
+// 0 otherwise.
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, BLOCK_SIZE sb_size) {
+  const int sb_bottom_mi = mi_row + mi_size_high[sb_size];
+  const int sb_right_mi = mi_col + mi_size_wide[sb_size];
+
+  if (sb_bottom_mi > mi_params->mi_rows) return 0;
+  return sb_right_mi <= mi_params->mi_cols;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Tells whether the automatic max partition size selection applies to the
+// superblock at (mi_row, mi_col).
+// This criterion is not used for screen content videos, since they can often
+// find good predictors and the largest block size is likely to be used.
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
+                                         BLOCK_SIZE sb_size, int mi_row,
+                                         int mi_col) {
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (frame_is_intra_only(cm)) return 0;
+  if (cpi->use_screen_content_tools) return 0;
+  if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+      NOT_IN_USE) {
+    return 0;
+  }
+  // Only complete 128x128 superblocks qualify.
+  if (sb_size != BLOCK_128X128) return 0;
+  if (!is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size)) return 0;
+
+  // Overlay frames are excluded.
+  const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  return update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE;
+}
+
+// Maps the side length `dim` of a square block, in pixels, to the matching
+// square BLOCK_SIZE. Only 4, 8, 16, 32, 64 and 128 are valid; any other value
+// trips the assert and, when asserts are compiled out, yields 0.
+// Declared INLINE like the other helpers defined in this header, so that
+// translation units which include the header without calling the function do
+// not end up with an unused static function.
+static INLINE BLOCK_SIZE dim_to_size(int dim) {
+  switch (dim) {
+    case 4: return BLOCK_4X4;
+    case 8: return BLOCK_8X8;
+    case 16: return BLOCK_16X16;
+    case 32: return BLOCK_32X32;
+    case 64: return BLOCK_64X64;
+    case 128: return BLOCK_128X128;
+    default: assert(0); return 0;
+  }
+}
+
+// Sets sb_enc->max_partition_size and sb_enc->min_partition_size for the
+// superblock at (mi_row, mi_col). The speed feature defaults are combined with
+// the sizes configured in cpi->oxcf.part_cfg, and both bounds are capped at
+// the sequence superblock size. When use_auto_max_partition() allows it, the
+// max size is further lowered to the predicted max size, but never below the
+// min size.
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+                                                  AV1_COMP *cpi, MACROBLOCK *x,
+                                                  const SPEED_FEATURES *sf,
+                                                  BLOCK_SIZE sb_size,
+                                                  int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  // Function results are kept in locals before being passed to AOMMIN and
+  // AOMMAX, so that each call is made exactly once.
+  // NOTE(review): this assumes AOMMIN/AOMMAX are function-like macros that may
+  // evaluate an argument more than once - confirm against their definition.
+  const BLOCK_SIZE cfg_max_bsize =
+      dim_to_size(cpi->oxcf.part_cfg.max_partition_size);
+  const BLOCK_SIZE cfg_min_bsize =
+      dim_to_size(cpi->oxcf.part_cfg.min_partition_size);
+
+  sb_enc->max_partition_size =
+      AOMMIN(sf->part_sf.default_max_partition_size, cfg_max_bsize);
+  sb_enc->min_partition_size =
+      AOMMAX(sf->part_sf.default_min_partition_size, cfg_min_bsize);
+  sb_enc->max_partition_size =
+      AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size);
+  sb_enc->min_partition_size =
+      AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size);
+
+  if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+    float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+    av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+    // Run the prediction once and clamp it into [min, max].
+    const BLOCK_SIZE predicted_max_bsize =
+        av1_predict_max_partition(cpi, x, features);
+    sb_enc->max_partition_size =
+        AOMMAX(AOMMIN(predicted_max_bsize, sb_enc->max_partition_size),
+               sb_enc->min_partition_size);
+  }
+}
+#endif // !CONFIG_REALTIME_ONLY
+#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pass2_strategy.c b/third_party/aom/av1/encoder/pass2_strategy.c
new file mode 100644
index 0000000000..a9442ffc1a
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.c
@@ -0,0 +1,4488 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup gf_group_algo Golden Frame Group
+ * \ingroup high_level_algo
+ * Algorithms regarding determining the length of GF groups and defining GF
+ * group structures.
+ * @{
+ */
+/*! @} - end defgroup gf_group_algo */
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_mem/aom_mem.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const FRAME_INFO *frame_info,
+                                    const FIRSTPASS_STATS *this_frame) {
+  // Discounted share of the frame: half of the intra-skip percentage plus
+  // twice the inactive zone rows expressed as a fraction of the MB rows.
+  const double inactive_pct =
+      (this_frame->intra_skip_pct / 2) +
+      ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows);
+
+  // The active share is limited to [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA].
+  return fclamp(1.0 - inactive_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+                                         const FIRSTPASS_STATS *total_stats,
+                                         const FIRSTPASS_STATS *this_stats,
+                                         int vbrbias, double modified_error_min,
+                                         double modified_error_max) {
+  // Without aggregate stats there is nothing to normalize against.
+  if (total_stats == NULL) return 0;
+
+  // Average weighted error per frame over the whole set of stats.
+  const double av_weight = total_stats->weight / total_stats->count;
+  const double av_err =
+      (total_stats->coded_error * av_weight) / total_stats->count;
+
+  // This frame's weighted error relative to the average, raised to the
+  // vbrbias exponent (vbrbias is a percentage).
+  const double relative_err = this_stats->coded_error * this_stats->weight /
+                              DOUBLE_DIVIDE_CHECK(av_err);
+  const double biased_err = av_err * pow(relative_err, vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  const double area_correction =
+      pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
+
+  return fclamp(biased_err * area_correction, modified_error_min,
+                modified_error_max);
+}
+
+// Convenience wrapper around calculate_modified_err_new() that takes the
+// total stats, the vbr bias and the modified error limits from the two-pass
+// state and the encoder configuration.
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  return calculate_modified_err_new(frame_info,
+                                    twopass->stats_buf_ctx->total_stats,
+                                    this_frame, oxcf->rc_cfg.vbrbias,
+                                    twopass->modified_error_min,
+                                    twopass->modified_error_max);
+}
+
+// Resets the first pass stats read position of the frame to the given
+// position.
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame,
+ const FIRSTPASS_STATS *position) {
+ // Only the read cursor is moved; no stats are copied or modified.
+ p_frame->stats_in = position;
+}
+
+// Copies the stats entry at the current read position into *fps and advances
+// the read position by one. Returns 1 on success, or EOF (leaving *fps and
+// the position untouched) when the read position is at or past the end.
+static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                       FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) {
+    return EOF;
+  }
+
+  *fps = *p_frame->stats_in++;
+  return 1;
+}
+
+// Look-ahead variant of input_stats(): copies the stats entry at the current
+// read position into *fps, then shifts the buffered stats down by one entry
+// and pulls the end marker back instead of advancing the read position.
+// Returns 1 on success, or EOF when the read position is at or past the end.
+static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                           FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) {
+    return EOF;
+  }
+
+  *fps = *p_frame->stats_in;
+
+  /* Move old stats[0] out to accommodate for next frame stats */
+  memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
+          (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) *
+              sizeof(FIRSTPASS_STATS));
+  --p->stats_buf_ctx->stats_in_end;
+  return 1;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p,
+                                               const TWO_PASS_FRAME *p_frame,
+                                               int offset) {
+  const FIRSTPASS_STATS *const wanted = p_frame->stats_in + offset;
+
+  if (offset >= 0) {
+    // Looking forward: the entry must lie before the end of the stats.
+    if (wanted >= p->stats_buf_ctx->stats_in_end) return NULL;
+  } else {
+    // Looking backward: the entry must not precede the start of the stats.
+    if (wanted < p->stats_buf_ctx->stats_in_start) return NULL;
+  }
+
+  return wanted;
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const AV1EncoderConfig *oxcf) {
+  // vbrmax_section is a percentage of the average frame bandwidth; the
+  // product is formed in 64 bits.
+  int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+                      (int64_t)oxcf->rc_cfg.vbrmax_section) /
+                     100;
+
+  // Limit the result to [0, rc->max_frame_bandwidth].
+  max_bits = (max_bits < 0) ? 0
+             : (max_bits > rc->max_frame_bandwidth)
+                 ? rc->max_frame_bandwidth
+                 : max_bits;
+
+  return (int)max_bits;
+}
+
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+ 0.80, 0.85, 0.90,
+ 0.95, 0.95, 0.95 };
+#define ERR_DIVISOR 96.0
+// Returns (err_per_mb / ERR_DIVISOR) raised to a qindex dependent power,
+// limited to the range [0.05, 5.0]. The power is linearly interpolated
+// between neighbouring q_pow_term entries, which are spaced 32 qindex steps
+// apart.
+static double calc_correction_factor(double err_per_mb, int q) {
+  const double error_term = err_per_mb / ERR_DIVISOR;
+  assert(error_term >= 0.0);
+
+  // Adjustment to power term based on qindex
+  const int slot = q >> 5;
+  const double slot_step = q_pow_term[slot + 1] - q_pow_term[slot];
+  const double power_term = q_pow_term[slot] + ((slot_step * (q % 32)) / 32.0);
+
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
+
+// Based on history adjust expectations of bits per macroblock.
+static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ // Based on recent history adjust expectations of bits per macroblock.
+ // damp_fac is at least 5.0; adj_limit is at least 0.2 and bounds the
+ // correction to the range [1 - adj_limit, 1 + adj_limit].
+ double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
+ double rate_err_factor = 1.0;
+ const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0);
+ const double min_fac = 1.0 - adj_limit;
+ const double max_fac = 1.0 + adj_limit;
+
+ // When third pass frame info is available, derive a candidate factor from
+ // the mean bpm_factor of those frames scaled by their ratio of actual to
+ // allocated bits.
+ if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) {
+ int64_t actual_bits = 0;
+ int64_t target_bits = 0;
+ double factor = 0.0;
+ int count = 0;
+ for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) {
+ actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits;
+ target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated;
+ factor += cpi->third_pass_ctx->frame_info[i].bpm_factor;
+ count++;
+ }
+
+ // NOTE(review): count equals frame_info_count, which is > 0 in this
+ // branch, so the count == 0 case looks purely defensive.
+ if (count == 0) {
+ factor = 1.0;
+ } else {
+ factor /= (double)count;
+ }
+
+ factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits);
+
+ // Only adopt the candidate when it moves bpm_factor further away from
+ // 1.0 on the side it is already on; the result is kept within
+ // [min_fac, max_fac].
+ if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) ||
+ (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) {
+ twopass->bpm_factor = factor;
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ }
+ }
+
+ // Snapshot of the rate control state used below.
+ int err_estimate = p_rc->rate_error_estimate;
+ int64_t bits_left = twopass->bits_left;
+ int64_t total_actual_bits = p_rc->total_actual_bits;
+ int64_t bits_off_target = p_rc->vbr_bits_off_target;
+ double rolling_arf_group_actual_bits =
+ (double)twopass->rolling_arf_group_actual_bits;
+ double rolling_arf_group_target_bits =
+ (double)twopass->rolling_arf_group_target_bits;
+
+#if CONFIG_FPMT_TEST
+ // In the parallel simulation test configuration, frames with a non-zero
+ // frame_parallel_level read the temp_* copies of the state instead.
+ const int is_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0;
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? is_parallel_frame
+ : 0;
+ total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits
+ : p_rc->total_actual_bits;
+ bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target
+ : p_rc->vbr_bits_off_target;
+ bits_left =
+ simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left;
+ rolling_arf_group_target_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_target_bits
+ : twopass->rolling_arf_group_target_bits);
+ rolling_arf_group_actual_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_actual_bits
+ : twopass->rolling_arf_group_actual_bits);
+ err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate
+ : p_rc->rate_error_estimate;
+#endif
+
+ // NOTE(review): this test reads p_rc->bits_off_target, not the local
+ // bits_off_target (vbr_bits_off_target) used in the else branch below -
+ // confirm that this is intended.
+ if (p_rc->bits_off_target && total_actual_bits > 0) {
+ if (cpi->ppi->lap_enabled) {
+ // Look-ahead mode: ratio of actual to target bits of the rolling ARF
+ // group.
+ rate_err_factor = rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits);
+ } else {
+ rate_err_factor = 1.0 - ((double)(bits_off_target) /
+ AOMMAX(total_actual_bits, bits_left));
+ }
+
+ // Adjustment is damped if this is 1 pass with look ahead processing
+ // (as there are only ever a few frames of data) and for all but the first
+ // GOP in normal two pass.
+ if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
+ rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
+ }
+ rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
+ }
+
+ // Is the rate control trending in the right direction. Only make
+ // an adjustment if things are getting worse.
+ if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
+ (rate_err_factor > 1.0 && err_estimate <= 0)) {
+ twopass->bpm_factor *= rate_err_factor;
+ // With a tolerance of 100 or more the factor stays within
+ // [min_fac, max_fac]; otherwise the wider range [0.1, 10.0] applies.
+ if (rate_err_tol >= 100) {
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ } else {
+ twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+ }
+ }
+}
+
+// Returns the enumerator used when estimating bits per MB from q. It grows
+// linearly from 1200000 at a tolerance of 25 (or less) to 1500000 at a
+// tolerance of 100 (or more).
+static int qbpm_enumerator(int rate_err_tol) {
+  const int tol_above_base = AOMMAX(rate_err_tol - 25, 0);
+  const int tol_steps = AOMMIN(75, tol_above_base);
+  return 1200000 + ((300000 * tol_steps) / 75);
+}
+
+// Similar to find_qindex_by_rate() function in ratectrl.c, but includes
+// calculation of a correction_factor.
+static int find_qindex_by_rate_with_correction(
+    int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb,
+    double group_weight_factor, int rate_err_tol, int best_qindex,
+    int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+
+  // The enumerator only depends on the tolerance, so compute it once.
+  const int enumerator = qbpm_enumerator(rate_err_tol);
+  int lo = best_qindex;
+  int hi = worst_qindex;
+
+  // Binary search for the lowest qindex in [best_qindex, worst_qindex] whose
+  // estimated bits per MB does not exceed the desired value. If no qindex
+  // qualifies the search ends at worst_qindex.
+  while (lo < hi) {
+    const int mid = (lo + hi) >> 1;
+    const double correction = calc_correction_factor(error_per_mb, mid);
+    const double q = av1_convert_qindex_to_q(mid, bit_depth);
+    const int est_bits_per_mb =
+        (int)((enumerator * correction * group_weight_factor) / q);
+
+    if (est_bits_per_mb > desired_bits_per_mb) {
+      // Still too many bits at this qindex: look at higher qindex values.
+      lo = mid + 1;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;
+}
+
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Initially it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] av_frame_err The average per frame coded error score
+ * for frames making up this section/group.
+ * \param[in] inactive_zone Used to mask off /ignore part of the
+ * frame. The most common use case is where
+ * a wide format video (e.g. 16:9) is
+ * letter-boxed into a more square format.
+ * Here we want to ignore the bands at the
+ * top and bottom.
+ * \param[in] av_target_bandwidth The target bits per frame
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
+                                     double inactive_zone,
+                                     int av_target_bandwidth) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  // Keep the inactive fraction below 1.0 so that (1.0 - inactive_zone) can
+  // safely be used as a divisor below.
+  inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
+
+  if (av_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  } else {
+    const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.mi_params.MBs;
+    const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+    const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
+    // Normalized target bits per active MB. The shift and the division are
+    // both carried out in 64 bits and only the final quotient is narrowed
+    // (saturating at INT_MAX). Narrowing the shifted value before dividing
+    // would corrupt the result for large per-frame targets.
+    const uint64_t norm_bits_per_mb =
+        ((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) /
+        (uint64_t)active_mbs;
+    const int target_norm_bits_per_mb =
+        (norm_bits_per_mb > (uint64_t)INT_MAX) ? INT_MAX
+                                               : (int)norm_bits_per_mb;
+    int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+
+    // Update bpm correction factor based on previous GOP rate error.
+    twopass_update_bpm_factor(cpi, rate_err_tol);
+
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    int q = find_qindex_by_rate_with_correction(
+        target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+        av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+        rc->best_quality, rc->worst_quality);
+
+    // Restriction on active max q for constrained quality mode.
+    if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
+    return q;
+  }
+}
+
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.01
+#define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
+
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitframe boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+  const double sr_diff = (frame->sr_coded_error - frame->coded_error);
+
+  // Discount "neutral" blocks from the inter percentage when the coded error
+  // is not negligible and the intra/inter error ratio is low.
+  double inter_pct = frame->pcnt_inter;
+  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)) {
+    inter_pct = frame->pcnt_inter - frame->pcnt_neutral;
+  }
+  const double intra_pct = 100 * (1.0 - inter_pct);
+
+  // The decay only drops below 1.0 when the second reference error is
+  // noticeably larger than the coded error.
+  double decay = 1.0;
+  if (sr_diff > LOW_SR_DIFF_TRHESH) {
+    const double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+    decay = 1.0 - sr_diff_part - (INTRA_PART * intra_pct);
+  }
+
+  // Never report a decay below DEFAULT_DECAY_LIMIT.
+  return AOMMAX(decay, DEFAULT_DECAY_LIMIT);
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
+  // The smaller of the second-reference decay rate and the share of inter
+  // coded blocks that have no motion.
+  const double decay_rate = get_sr_decay_rate(frame);
+  const double static_pct = frame->pcnt_inter - frame->pcnt_motion;
+  return AOMMIN(decay_rate, static_pct);
+}
+
+#define DEFAULT_ZM_FACTOR 0.5
+// Combines the second-reference decay rate with a scaled zero-motion factor:
+// the result is the decay rate moved towards 1.0 in proportion to the
+// zero-motion factor, and never less than the zero-motion factor itself.
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+  const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+  double zm_factor =
+      DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+  // Clamp value to range 0.0 to 1.0
+  // This should happen anyway if input values are sensibly clamped but checked
+  // here just in case.
+  zm_factor = (zm_factor > 1.0) ? 1.0 : (zm_factor < 0.0) ? 0.0 : zm_factor;
+
+  return AOMMAX(zm_factor,
+                (sr_decay_rate + ((1.0 - sr_decay_rate) * zm_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+                                      int next_stats_index,
+                                      const int min_gf_interval,
+                                      const int frame_interval,
+                                      const int still_interval,
+                                      const double loop_decay_rate,
+                                      const double last_decay_rate) {
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  const int still_after_motion = frame_interval > min_gf_interval &&
+                                 loop_decay_rate >= 0.999 &&
+                                 last_decay_rate < 0.9;
+  if (!still_after_motion) return 0;
+
+  // There must be enough future stats to cover the whole still interval.
+  const int stats_left =
+      av1_firstpass_info_future_count(firstpass_info, next_stats_index);
+  if (stats_left < still_interval) return 0;
+
+  // Look ahead a few frames to see if static condition persists...
+  int still_frames = 0;
+  while (still_frames < still_interval) {
+    const FIRSTPASS_STATS *stats = av1_firstpass_info_peek(
+        firstpass_info, next_stats_index + still_frames);
+    if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+    ++still_frames;
+  }
+
+  // Only if it does do we signal a transition to still.
+  return still_frames == still_interval;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass,
+                        const TWO_PASS_FRAME *twopass_frame, const int offset) {
+  const FIRSTPASS_STATS *const next_frame =
+      read_frame_stats(twopass, twopass_frame, offset);
+
+  // No stats at that offset: nothing to detect.
+  if (next_frame == NULL) return 0;
+
+  // What we are looking for here is a situation where there is a
+  // brief break in prediction (such as a flash) but subsequent frames
+  // are reasonably well predicted by an earlier (pre flash) frame.
+  // The recovery after a flash is indicated by a high pcnt_second_ref
+  // compared to pcnt_inter.
+  return next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+         next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion related elements to the GF arf boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          GF_GROUP_STATS *gf_stats, double f_w,
+                                          double f_h) {
+  const double motion_pct = stats->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats.
+  gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * motion_pct;
+  gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out;
+  gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out);
+
+  // The motion uniformity measure is only updated when more than 5% of the
+  // frame is in motion.
+  if (!(motion_pct > 0.05)) return;
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of abs(mv) / mv). Each ratio is capped at the absolute
+  // mv value scaled by the matching frame dimension.
+  const double mvr_ratio =
+      fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+  const double mvc_ratio =
+      fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+  const double mvr_cap = stats->mvr_abs * f_h;
+  const double mvc_cap = stats->mvc_abs * f_w;
+
+  gf_stats->mv_ratio_accumulator +=
+      motion_pct * (mvr_ratio < mvr_cap ? mvr_ratio : mvr_cap);
+  gf_stats->mv_ratio_accumulator +=
+      motion_pct * (mvc_ratio < mvc_cap ? mvc_ratio : mvc_cap);
+}
+
+// Adds the current frame's contribution to the running GF group totals: the
+// modified error, the raw coded error (when GROUP_ADAPTIVE_MAXQ is enabled),
+// the intra skip percentage and the inactive zone rows.
+static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
+                                        const double mod_frame_err,
+                                        GF_GROUP_STATS *gf_stats) {
+  gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
+  gf_stats->gf_group_skip_pct += stats->intra_skip_pct;
+#if GROUP_ADAPTIVE_MAXQ
+  gf_stats->gf_group_raw_error += stats->coded_error;
+#endif
+  gf_stats->gf_group_err += mod_frame_err;
+}
+
+// Folds the stats of the next candidate frame into the GF group statistics:
+// motion measures, the sums that are averaged later by average_gf_stats(),
+// and (unless a flash was detected) the prediction decay and zero-motion
+// accumulators.
+static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+                                        const int flash_detected,
+                                        const int frames_since_key,
+                                        const int cur_idx,
+                                        GF_GROUP_STATS *gf_stats, int f_w,
+                                        int f_h) {
+  accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
+
+  // sum up the metric values of current gf group
+  gf_stats->avg_sr_coded_error += stats->sr_coded_error;
+  gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
+  gf_stats->avg_new_mv_count += stats->new_mv_count;
+  gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
+
+  // Only frames with a (practically) non-zero stdev take part in the stdev
+  // average.
+  if (fabs(stats->raw_error_stdev) > 0.000001) {
+    gf_stats->non_zero_stdev_count++;
+    gf_stats->avg_raw_err_stdev += stats->raw_error_stdev;
+  }
+
+  // Flash frames do not contribute to the decay tracking below.
+  if (flash_detected) return;
+
+  // Accumulate the effect of prediction quality decay
+  gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
+  gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
+  gf_stats->decay_accumulator =
+      gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
+
+  // Monitor for static sections.
+  if ((frames_since_key + cur_idx - 1) > 1) {
+    gf_stats->zero_motion_accumulator = AOMMIN(
+        gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
+  }
+}
+
+// Turns the accumulated per-group sums into averages. The general metrics
+// are divided by total_frame; the raw error stdev is divided by the number
+// of frames that contributed a non-zero stdev. Zero divisors are skipped.
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
+  if (total_frame != 0) {
+    gf_stats->avg_sr_coded_error /= total_frame;
+    gf_stats->avg_pcnt_second_ref /= total_frame;
+    gf_stats->avg_new_mv_count /= total_frame;
+    gf_stats->avg_wavelet_energy /= total_frame;
+  }
+
+  if (gf_stats->non_zero_stdev_count != 0) {
+    gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
+  }
+}
+
+#define BOOST_FACTOR 12.5
+static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
+ unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
+
+ // Use a different error per mb factor for calculating boost for
+ // different formats.
+ if (screen_area <= 640 * 360) {
+ return 500.0;
+ } else {
+ return 1000.0;
+ }
+}
+
+// Computes the boost contribution of a single frame from the ratio of its
+// (baseline limited) intra error to its coded error, corrected for the
+// average inter frame q and for motion into / out of the frame. The result
+// never exceeds max_boost scaled by the q correction.
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+                               const FRAME_INFO *frame_info,
+                               const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out, double max_boost) {
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
+  const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+  const double active_area = calculate_active_area(frame_info, this_frame);
+
+  // Underlying boost factor is based on inter error ratio.
+  double boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+                        this_frame->intra_error * active_area) /
+                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  boost = boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data coming into frame (e.g. zoom out).
+  // Slightly reduce boost if there is a net balance of motion out of the frame
+  // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
+  // In the extreme case the boost is halved.
+  const double mv_adjust = (this_frame_mv_in_out > 0.0)
+                               ? (this_frame_mv_in_out * 2.0)
+                               : (this_frame_mv_in_out / 2.0);
+  boost += boost * mv_adjust;
+
+  return AOMMIN(boost, max_boost * boost_q_correction);
+}
+
+// Computes the key frame boost contribution of a single frame and updates
+// *sr_accumulator, the running (non-negative) sum of the differences between
+// second reference and coded error. The result never exceeds max_boost
+// scaled by the q correction.
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+                                  const FRAME_INFO *frame_info,
+                                  const FIRSTPASS_STATS *this_frame,
+                                  double *sr_accumulator, double max_boost) {
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
+  const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+  const double active_area = calculate_active_area(frame_info, this_frame);
+
+  // Underlying boost factor is based on inter error ratio.
+  double boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+                        this_frame->intra_error * active_area) /
+                 DOUBLE_DIVIDE_CHECK(
+                     (this_frame->coded_error + *sr_accumulator) * active_area);
+
+  // Update the accumulator for second ref error difference.
+  // This is intended to give an indication of how much the coded error is
+  // increasing over time.
+  const double updated_sr =
+      *sr_accumulator + (this_frame->sr_coded_error - this_frame->coded_error);
+  *sr_accumulator = AOMMAX(0.0, updated_sr);
+
+  // Q correction and scaling
+  // The 40.0 value here is an experimentally derived baseline minimum.
+  // This value is in line with the minimum per frame boost in the alt_ref
+  // boost calculation.
+  boost = ((boost + 40.0) * boost_q_correction);
+
+  return AOMMIN(boost, max_boost * boost_q_correction);
+}
+
+// Projects a GF/ARF boost that was computed from fewer stats than wanted
+// onto the wanted number of frames, by scaling it with the ratio of the
+// projection factors for the two frame counts.
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+                                   int gfu_boost, int frames_to_project,
+                                   int num_stats_used_for_gfu_boost) {
+  /*
+   * If the number of stats used covers frames_to_project, gfu_boost was
+   * calculated over frames_to_project to begin with (i.e. all stats
+   * required were available), hence return the original boost.
+   */
+  if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+  const double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
+  // Projection factor for the wanted number of frames.
+  const double wanted_factor = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+  // Projection factor for the number of stats that were actually used.
+  const double used_factor = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+
+  return (int)rint((wanted_factor * gfu_boost) / used_factor);
+}
+
+#define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
+#define MIN_DECAY_FACTOR 0.01
+// Calculates the boost for an ARF / GF frame located `offset` stats entries
+// from the current read position.
+//
+// Per-frame boosts (see calc_frame_boost()) are weighted by an accumulated
+// prediction decay and summed over up to f_frames entries looking forward
+// from the offset and up to b_frames entries looking backward. Either scan
+// stops early when no stats are available.
+//
+// If num_fpstats_used is non-NULL it receives the number of stats entries
+// consumed by both scans. If project_gfu_boost is non-zero, both
+// num_fpstats_used and num_fpstats_required must be non-NULL: the latter is
+// set to f_frames + b_frames and the boost is projected with
+// get_projected_gfu_boost().
+//
+// The returned boost is at least (b_frames + f_frames) * GF_MIN_BOOST.
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost) {
+ int i;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ // The forward scan starts from NORMAL_BOOST rather than zero.
+ double boost_score = (double)NORMAL_BOOST;
+ int arf_boost;
+ int flash_detected = 0;
+ if (num_fpstats_used) *num_fpstats_used = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ // The accumulator is floored at MIN_DECAY_FACTOR.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+
+ arf_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ init_gf_stats(&gf_stats);
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ // The accumulator is floored at MIN_DECAY_FACTOR.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+ arf_boost += (int)boost_score;
+
+ // Optionally project the boost onto the full number of requested frames
+ // when fewer stats than requested were available.
+ if (project_gfu_boost) {
+ assert(num_fpstats_required != NULL);
+ assert(num_fpstats_used != NULL);
+ *num_fpstats_required = f_frames + b_frames;
+ arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+ *num_fpstats_used);
+ }
+
+ // Enforce a minimum boost of GF_MIN_BOOST per requested frame.
+ if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+ arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  double intra_total = 0.0;
+  double coded_total = 0.0;
+  int visited = 0;
+
+  // Sum the errors over at most section_length entries, stopping at `end`.
+  for (const FIRSTPASS_STATS *s = begin; s < end && visited < section_length;
+       ++s, ++visited) {
+    intra_total += s->intra_error;
+    coded_total += s->coded_error;
+  }
+
+  // Ratio of summed intra error to summed coded error, truncated to int.
+  return (int)(intra_total / DOUBLE_DIVIDE_CHECK(coded_total));
+}
+
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] gf_group_err Cumulative coded error score for the
+ * frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+  // Calculate the bits to be allocated to the group as a whole: the group's
+  // share of the remaining key frame group bits, in proportion to its share
+  // of the remaining key frame group error.
+  int64_t group_bits = 0;
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    group_bits = (int64_t)(twopass->kf_group_bits *
+                           (gf_group_err / twopass->kf_group_error_left));
+  }
+
+  // Clamp odd edge cases.
+  if (group_bits < 0) {
+    group_bits = 0;
+  } else if (group_bits > twopass->kf_group_bits) {
+    group_bits = twopass->kf_group_bits;
+  }
+
+  // Clip based on user supplied data rate variability limit.
+  const int64_t vbr_limit = (int64_t)max_bits * p_rc->baseline_gf_interval;
+  if (group_bits > vbr_limit) group_bits = vbr_limit;
+
+  return group_bits;
+}
+
+// Calculate the number of bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+                                int64_t total_group_bits) {
+  // return 0 for invalid inputs (could arise e.g. through rounding errors)
+  if (boost == 0 || total_group_bits <= 0) return 0;
+
+  // With no regular frames, the boosted frame(s) get everything (limited to
+  // what fits in an int).
+  if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX));
+
+  // Each regular frame counts as 100 chunks; the boost adds its own chunks.
+  int chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow.
+  if (boost > 1023) {
+    const int divisor = boost >> 10;
+    boost /= divisor;
+    chunks /= divisor;
+  }
+
+  // Calculate the number of extra bits for use in the boosted frame or frames.
+  const int boost_bits = (int)(((int64_t)boost * total_group_bits) / chunks);
+  return AOMMAX(boost_bits, 0);
+}
+
+// Calculate the boost factor based on the number of bits assigned, i.e. the
+// inverse of calculate_boost_bits().
+static int calculate_boost_factor(int frame_count, int bits,
+                                  int64_t total_group_bits) {
+  // The regular frames share (total_group_bits - bits) at 100 chunks per
+  // frame; express `bits` in those same chunks.
+  const double boosted_chunks = 100.0 * frame_count * bits;
+  const int64_t remaining_bits = total_group_bits - bits;
+  return (int)(boosted_chunks / remaining_bits);
+}
+
+// Reduce the number of bits assigned to keyframe or arf if necessary, to
+// prevent bitrate spikes that may break level constraints.
+// frame_type: 0: keyframe; 1: arf.
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
+                                              RATE_CONTROL *const rc,
+                                              int bits_assigned,
+                                              int64_t group_bits,
+                                              int frame_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const int num_operating_points = seq_params->operating_points_cnt_minus_1 + 1;
+
+  for (int op = 0; op < num_operating_points; ++op) {
+    // Only operating points that contain the current layer are relevant.
+    if (!is_in_operating_point(seq_params->operating_point_idc[op],
+                               cm->temporal_layer_id, cm->spatial_layer_id)) {
+      continue;
+    }
+
+    // Target levels at or beyond SEQ_LEVELS impose no constraint here.
+    const AV1_LEVEL target_level =
+        cpi->ppi->level_params.target_seq_level_idx[op];
+    if (target_level >= SEQ_LEVELS) continue;
+
+    assert(is_valid_seq_level_idx(target_level));
+
+    const double level_bitrate_limit = av1_get_max_bitrate_for_level(
+        target_level, seq_params->tier[0], seq_params->profile);
+    const int target_bits_per_frame =
+        (int)(level_bitrate_limit / cpi->framerate);
+
+    switch (frame_type) {
+      case 0: {
+        // Maximum bits for keyframe is 8 times the target_bits_per_frame.
+        const int max_kf_bits = target_bits_per_frame * 8;
+        if (bits_assigned > max_kf_bits) {
+          // Lower the key frame boost so that the allocation meets the cap,
+          // then recompute the bits from the new boost.
+          const int frames = rc->frames_to_key - 1;
+          p_rc->kf_boost =
+              calculate_boost_factor(frames, max_kf_bits, group_bits);
+          bits_assigned =
+              calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
+        }
+        break;
+      }
+      case 1: {
+        // Maximum bits for arf is 4 times the target_bits_per_frame.
+        const int max_arf_bits = target_bits_per_frame * 4;
+        if (bits_assigned > max_arf_bits) {
+          // Same adjustment as for key frames, applied to the GF/ARF boost.
+          p_rc->gfu_boost = calculate_boost_factor(p_rc->baseline_gf_interval,
+                                                   max_arf_bits, group_bits);
+          bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+                                               p_rc->gfu_boost, group_bits);
+        }
+        break;
+      }
+      default: assert(0); break;
+    }
+  }
+
+  return bits_assigned;
+}
+
+// Allocate bits to each frame in a GF / ARF group
+//
+// layer_fraction[i] is the share of the *remaining* ARF boost bits handed to
+// ARF layer i; the deepest layer in use always takes everything left (1.0).
+double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60,
+ 0.60, 1.0, 1.0 };
+// Fills gf_group->bit_allocation[] for the frames of the current group.
+//   gf_group_bits: total bit budget of the group.
+//   gf_arf_bits:   extra (boost) bits set aside for ARF frames.
+//   key_frame:     non-zero if the group starts with a key frame; that frame's
+//                  allocation is then left untouched (index 0 is skipped).
+//   use_arf:       non-zero if gf_arf_bits must be removed from the baseline
+//                  budget before it is split evenly between frames.
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+ PRIMARY_RATE_CONTROL *const p_rc,
+ RATE_CONTROL *const rc,
+ int64_t gf_group_bits, int gf_arf_bits,
+ int key_frame, int use_arf) {
+ int64_t total_group_bits = gf_group_bits;
+ int base_frame_bits;
+ const int gf_group_size = gf_group->size;
+ int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ int frame_index = !!key_frame;
+
+ // Subtract the extra bits set aside for ARF frames from the Group Total
+ if (use_arf) total_group_bits -= gf_arf_bits;
+
+ // Even per-frame share of the baseline budget; the interval is shortened by
+ // one when rc->frames_since_key == 0, and never drops below 1.
+ int num_frames =
+ AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
+ base_frame_bits = (int)(total_group_bits / num_frames);
+
+ // Check the number of frames in each layer in case we have a
+ // non standard group length.
+ int max_arf_layer = gf_group->max_layer_depth - 1;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+ (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+ layer_frames[gf_group->layer_depth[idx]]++;
+ }
+ }
+
+ // Allocate extra bits to each ARF layer
+ // Each layer takes its fraction of what is left in gf_arf_bits, split evenly
+ // over the frames in that layer; the remainder rolls over to deeper layers.
+ int i;
+ int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+ assert(max_arf_layer <= MAX_ARF_LAYERS);
+ for (i = 1; i <= max_arf_layer; ++i) {
+ double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+ layer_extra_bits[i] =
+ (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+ gf_arf_bits -= (int)(gf_arf_bits * fraction);
+ }
+
+ // Now combine ARF layer and baseline bits to give total bits for each frame.
+ int arf_extra_bits;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ switch (gf_group->update_type[idx]) {
+ case ARF_UPDATE:
+ case INTNL_ARF_UPDATE:
+ arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+ gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+ break;
+ // Overlay frames get no bits from this group's budget.
+ case INTNL_OVERLAY_UPDATE:
+ case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+ default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+ }
+ }
+
+ // Set the frame following the current GOP to 0 bit allocation. For ARF
+ // groups, this next frame will be overlay frame, which is the first frame
+ // in the next GOP. For GF group, next GOP will overwrite the rate allocation.
+ // Setting this frame to use 0 bits (out of the current GOP budget) will
+ // simplify the logic in reference frame management.
+ if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH)
+ gf_group->bit_allocation[gf_group_size] = 0;
+}
+
+// Tells whether both the KF group and the GF group are almost entirely static.
+//   gf_zero_motion: zero-motion measure accumulated over the GF group.
+//   kf_zero_motion: zero-motion measure of the KF group.
+//   is_lap_enabled: non-zero when look-ahead processing (LAP) is enabled.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+                                   int is_lap_enabled) {
+  // With LAP enabled kf_zero_motion is not reliable, so rely solely on a
+  // stricter gf_zero_motion threshold.
+  if (is_lap_enabled) return (gf_zero_motion >= 0.999);
+
+  const int gf_is_static = (gf_zero_motion >= 0.995);
+  const int kf_is_static = (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+  return gf_is_static && kf_is_static;
+}
+
+// Threshold on the accumulated absolute in/out motion (zoom) above which a GF
+// group may be cut.
+#define ARF_ABS_ZOOM_THRESH 4.4
+// Decide whether the GF group that started at cur_start should be cut at
+// frame_index. Returns 1 to cut, 0 to keep extending the group.
+// A cut is signalled when any of the following holds:
+//  - no flash was detected and detect_transition_to_still() reports a
+//    transition to a still section;
+//  - the minimum interval has been reached, the cut is not too close to the
+//    next key frame, the group length is odd, no flash was detected, and the
+//    accumulated motion or zoom exceeds its threshold;
+//  - the group is longer than active_max_gf_interval and the content is not
+//    almost static.
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+ int flash_detected, int active_max_gf_interval,
+ int active_min_gf_interval,
+ GF_GROUP_STATS *gf_stats) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ // Motion breakout threshold for loop below depends on image size.
+ const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0;
+
+ if (!flash_detected) {
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+
+ // TODO(angiebird): This is a temporary change, we will avoid using
+ // twopass_frame.stats_in in the follow-up CL
+ int index = (int)(cpi->twopass_frame.stats_in -
+ twopass->stats_buf_ctx->stats_in_start);
+ if (detect_transition_to_still(&twopass->firstpass_info, index,
+ rc->min_gf_interval, frame_index - cur_start,
+ 5, gf_stats->loop_decay_rate,
+ gf_stats->last_loop_decay_rate)) {
+ return 1;
+ }
+ }
+
+ // Some conditions to breakout after min interval.
+ if (frame_index - cur_start >= active_min_gf_interval &&
+ // If possible don't break very close to a kf
+ (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+ ((frame_index - cur_start) & 0x01) && !flash_detected &&
+ (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+ gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+ return 1;
+ }
+
+ // If almost totally static, we will not use the max GF length later,
+ // so we can continue for more frames.
+ if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+ !is_almost_static(gf_stats->zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
+ return 1;
+ }
+ return 0;
+}
+
+// Decide, from TPL statistics, whether the GF interval should be shortened.
+// The amount of TPL work is selected by
+// cpi->sf.tpl_sf.gop_length_decision_method:
+//   2: decision from the GF boost plus av1_tpl_setup_stats(cpi, 3, ...);
+//   1: try av1_tpl_setup_stats(cpi, 2, ...) first and fall back to the
+//      complete stats only when it returns 2 (undecided);
+//   otherwise: always use the complete stats (av1_tpl_setup_stats(cpi, 1, ..)).
+// Side effect: may set cpi->skip_tpl_setup_stats so the complete stats are
+// reused later.
+// Returns non-zero if a shorter GF interval is preferred.
+static int is_shorter_gf_interval_better(
+ AV1_COMP *cpi, const EncodeFrameParams *frame_params) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+ // Assigned on every path below before it is returned.
+ int shorten_gf_interval;
+
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+ if (gop_length_decision_method == 2) {
+ // GF group length is decided based on GF boost and tpl stats of ARFs from
+ // base layer, (base+1) layer.
+ shorten_gf_interval =
+ (p_rc->gfu_boost <
+ p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+ !av1_tpl_setup_stats(cpi, 3, frame_params);
+ } else {
+ int do_complete_tpl = 1;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ int is_temporal_filter_enabled =
+ (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+ if (gop_length_decision_method == 1) {
+ // Check if tpl stats of ARFs from base layer, (base+1) layer,
+ // (base+2) layer can decide the GF group length.
+ int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params);
+
+ // A result other than 2 is conclusive: skip the complete TPL pass.
+ if (gop_length_eval != 2) {
+ do_complete_tpl = 0;
+ shorten_gf_interval = !gop_length_eval;
+ }
+ }
+
+ if (do_complete_tpl) {
+ // Decide GF group length based on complete tpl stats.
+ shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params);
+ // Tpl stats is reused when the ARF is temporally filtered and GF
+ // interval is not shortened.
+ if (is_temporal_filter_enabled && !shorten_gf_interval) {
+ cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group,
+ cpi->common.seq_params->bit_depth);
+#endif // CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ }
+ }
+ }
+ return shorten_gf_interval;
+}
+
+#define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap gaussian smooth filter
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+                                              0.242, 0.061, 0.006 };
+
+// Smooth filter intra_error and coded_error in firstpass stats.
+//
+// For every i in [start_idx, last_idx] the 7-tap filter is applied around i,
+// with neighbour indices clamped to [start_idx, last_idx]. Taps are dropped
+// and the remaining weights renormalised as follows:
+//  - intra error: a tap is dropped if stats[idx].is_flash is set;
+//  - coded error: a tap is dropped if stats[idx] or stats[idx - 1] is a flash,
+//    because coded error involves both idx and idx - 1.
+// If the surviving weight is not above 0.01, the unfiltered value of frame i
+// is written instead.
+//
+// The outputs filt_intra_err[i] and filt_coded_err[i] are fully overwritten;
+// the buffers do not need to be zero-initialised by the caller.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+                                int last_idx, double *filt_intra_err,
+                                double *filt_coded_err) {
+  for (int i = start_idx; i <= last_idx; i++) {
+    // Accumulate locally so the result never depends on stale buffer content.
+    double intra_sum = 0, intra_wt = 0;
+    double coded_sum = 0, coded_wt = 0;
+
+    for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      const int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+      const double tap = smooth_filt[j + HALF_FILT_LEN];
+
+      // A flash frame contributes to neither filter.
+      if (stats[idx].is_flash) continue;
+      intra_sum += tap * stats[idx].intra_error;
+      intra_wt += tap;
+
+      // Coded error involves idx and idx - 1.
+      if (idx > 0 && stats[idx - 1].is_flash) continue;
+      coded_sum += tap * stats[idx].coded_error;
+      coded_wt += tap;
+    }
+
+    filt_intra_err[i] =
+        (intra_wt > 0.01) ? intra_sum / intra_wt : stats[i].intra_error;
+    filt_coded_err[i] =
+        (coded_wt > 0.01) ? coded_sum / coded_wt : stats[i].coded_error;
+  }
+}
+
+// Compute a finite-difference gradient of values[start..last] into
+// grad[start..last]. Interior points use the central difference; the two end
+// points fall back to a one-sided difference because the neighbour index is
+// clamped to the range. A single-element range gets a gradient of 0.
+static void get_gradient(const double *values, int start, int last,
+                         double *grad) {
+  if (start == last) {
+    grad[start] = 0;
+    return;
+  }
+
+  int pos = start;
+  while (pos <= last) {
+    const int lo = AOMMAX(pos - 1, start);
+    const int hi = AOMMIN(pos + 1, last);
+    const double rise = values[hi] - values[lo];
+    grad[pos] = rise / (hi - lo);
+    ++pos;
+  }
+}
+
+// Find the first likely scenecut in stats_start[first..last] (inclusive).
+//
+// A frame is a candidate when its coded/intra error ratio clearly stands out
+// from the frames within HALF_WIN positions before and after it. Flash frames,
+// and frames immediately following a flash, are never candidates and are
+// excluded from both neighborhoods.
+//
+// Returns the index of the scenecut frame, or -1 if none is found (including
+// when the range holds a single frame).
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+                              int first, int last) {
+  // Identify unstable areas caused by scenecuts.
+  // If there is only one frame that yields a huge coded error relative to its
+  // neighborhood, it is likely a scenecut.
+  if (last - first == 0) return -1;
+
+  for (int i = first; i <= last; i++) {
+    if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+      continue;
+    double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+    const double this_ratio = stats_start[i].coded_error / temp_intra;
+
+    // Find the max ratio and max coded error in the preceding neighborhood.
+    double max_prev_ratio = 0;
+    double max_prev_coded = 0;
+    for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+      if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+        continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      const double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_prev_ratio) {
+        max_prev_ratio = temp_ratio;
+      }
+      if (stats_start[j].coded_error > max_prev_coded) {
+        max_prev_coded = stats_start[j].coded_error;
+      }
+    }
+
+    // Find the max ratio and max coded error in the following neighborhood.
+    double max_next_ratio = 0;
+    double max_next_coded = 0;
+    for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+      // Skip flash-affected neighbors. The test must look at frame j, not at
+      // the candidate frame i, which is already known to be flash-free.
+      if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+        continue;
+      temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+      const double temp_ratio = stats_start[j].coded_error / temp_intra;
+      if (temp_ratio > max_next_ratio) {
+        max_next_ratio = temp_ratio;
+      }
+      if (stats_start[j].coded_error > max_next_coded) {
+        max_next_coded = stats_start[j].coded_error;
+      }
+    }
+
+    if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+      // the ratios are very small, only check a small fixed threshold
+      if (this_ratio < 0.02) continue;
+    } else {
+      // check if this frame has a larger ratio than the neighborhood
+      double max_sr = stats_start[i].sr_coded_error;
+      if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+      const double max_sr_fr_ratio =
+          max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+      // Not a scenecut if the sr_coded_error is noticeably larger than this
+      // frame's coded error.
+      if (max_sr_fr_ratio > 1.2) continue;
+      // Not a scenecut unless the frame exceeds twice the neighborhood max in
+      // either ratio or raw coded error.
+      if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+          stats_start[i].coded_error <
+              2 * AOMMAX(max_prev_coded, max_next_coded)) {
+        continue;
+      }
+    }
+    return i;
+  }
+  return -1;
+}
+
+// Delete the region at index *next_region by folding it into a neighbour.
+//   merge: 0 = absorb into the previous region; 1 = absorb into the next
+//          region; 2 = fuse previous, current and next into the previous one.
+//          The first region is always merged forward (1) and the last region
+//          always backward (0), regardless of the value passed in.
+// When only one region exists, the list simply becomes empty.
+// On return *num_regions is reduced accordingly and *next_region holds the
+// index at which the caller should resume scanning.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+                          int *next_region) {
+  const int victim = *next_region;
+  assert(victim < *num_regions);
+
+  if (*num_regions == 1) {
+    *num_regions = 0;
+    return;
+  }
+
+  // Regions at either end have only one neighbour to merge with.
+  if (victim == 0) {
+    merge = 1;
+  } else if (victim == *num_regions - 1) {
+    merge = 0;
+  }
+
+  const int removed_count = (merge == 2) ? 2 : 1;
+  if (merge == 0) {
+    regions[victim - 1].last = regions[victim].last;
+    *next_region = victim;
+  } else if (merge == 1) {
+    regions[victim + 1].start = regions[victim].start;
+    *next_region = victim + 1;
+  } else if (merge == 2) {
+    regions[victim - 1].last = regions[victim + 1].last;
+    *next_region = victim;
+  } else {
+    assert(0);
+  }
+  *num_regions -= removed_count;
+
+  // Close the gap left by the removed entries.
+  int dst = *next_region - (merge == 1);
+  while (dst < *num_regions) {
+    regions[dst] = regions[dst + removed_count];
+    dst++;
+  }
+}
+
+// Insert a region of the given type, covering frames [start, last], into the
+// region at *cur_region_idx. Both start and last should lie inside that
+// region. The original region is split into up to three pieces (before, new,
+// after); pieces before and after keep the original type. After insertion,
+// *cur_region_idx points to the last region that was split from the original
+// region, and *num_regions has grown by the number of pieces added (0-2).
+// The caller must guarantee that |regions| has room for the added entries.
+static void insert_region(int start, int last, REGION_TYPES type,
+ REGIONS *regions, int *num_regions,
+ int *cur_region_idx) {
+ int k = *cur_region_idx;
+ REGION_TYPES this_region_type = regions[k].type;
+ int this_region_last = regions[k].last;
+ // One extra entry for a leading remainder, one for a trailing remainder.
+ int num_add = (start != regions[k].start) + (last != regions[k].last);
+ // move the following regions further to the back
+ for (int r = *num_regions - 1; r > k; r--) {
+ regions[r + num_add] = regions[r];
+ }
+ *num_regions += num_add;
+ // Leading remainder: shrink the original region and start the new one.
+ if (start > regions[k].start) {
+ regions[k].last = start - 1;
+ k++;
+ regions[k].start = start;
+ }
+ regions[k].type = type;
+ if (last < this_region_last) {
+ // Trailing remainder: close the new region and re-create the tail with the
+ // original type.
+ regions[k].last = last;
+ k++;
+ regions[k].start = last + 1;
+ regions[k].last = this_region_last;
+ regions[k].type = this_region_type;
+ } else {
+ regions[k].last = this_region_last;
+ }
+ *cur_region_idx = k;
+}
+
+// Compute the per-region averages of the first pass stats for regions[k].
+//
+// Fills avg_cor_coeff, avg_sr_fr_ratio, avg_intra_err, avg_coded_err and
+// avg_noise_var of regions[k] from stats[regions[k].start .. regions[k].last].
+// All five fields are reset first, so the function can safely be called
+// repeatedly on the same region (e.g. after its bounds changed).
+//
+// The sr/fr ratio of a frame needs the previous frame's coded error, so for
+// the very first region (k == 0) the first frame is left out of that average.
+static void analyze_region(const FIRSTPASS_STATS *stats, int k,
+                           REGIONS *regions) {
+  REGIONS *const region = &regions[k];
+  region->avg_cor_coeff = 0;
+  region->avg_sr_fr_ratio = 0;
+  region->avg_intra_err = 0;
+  region->avg_coded_err = 0;
+  // Must be cleared like the other averages; it is accumulated with += below
+  // and would otherwise grow on every repeated call.
+  region->avg_noise_var = 0;
+
+  const int check_first_sr = (k != 0);
+  // Number of frames in the region, and number of frames that contribute to
+  // the sr/fr ratio average.
+  const double num_frames = (double)(region->last - region->start + 1);
+  const double num_sr_frames =
+      (double)(region->last - region->start + check_first_sr);
+
+  for (int i = region->start; i <= region->last; i++) {
+    if (i > region->start || check_first_sr) {
+      const double max_coded_error =
+          AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+      const double this_ratio =
+          stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+      region->avg_sr_fr_ratio += this_ratio / num_sr_frames;
+    }
+
+    region->avg_intra_err += stats[i].intra_error / num_frames;
+    region->avg_coded_err += stats[i].coded_error / num_frames;
+
+    // Correlation and noise variance are floored at 0.001 before averaging.
+    region->avg_cor_coeff += AOMMAX(stats[i].cor_coeff, 0.001) / num_frames;
+    region->avg_noise_var += AOMMAX(stats[i].noise_var, 0.001) / num_frames;
+  }
+}
+
+// Refresh the averaged statistics of each of the first num_regions entries in
+// |regions| by running analyze_region() on them in order.
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
+                             int num_regions) {
+  int region_idx = 0;
+  while (region_idx < num_regions) {
+    analyze_region(stats, region_idx, regions);
+    ++region_idx;
+  }
+}
+
+// Find tentative stable regions
+//
+// Classifies every frame in [this_start, this_last] as STABLE_REGION or
+// HIGH_VAR_REGION from the intra/coded error statistics in a window of
+// HALF_WIN frames on each side, and groups consecutive frames of the same type
+// into entries of |regions| (written from index 0).
+// grad_coded holds the gradient of the filtered coded error per frame.
+// Returns the number of regions written.
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+ const double *grad_coded, int this_start,
+ int this_last, REGIONS *regions) {
+ int i, j, k = 0;
+ regions[k].start = this_start;
+ for (i = this_start; i <= this_last; i++) {
+ // Check mean and variance of stats in a window
+ // The sums start at 0.001 rather than 0, which keeps the divisions below
+ // away from zero.
+ double mean_intra = 0.001, var_intra = 0.001;
+ double mean_coded = 0.001, var_coded = 0.001;
+ int count = 0;
+ for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+ // Window indices are clamped to the analysed range.
+ int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+ // Flash frames and the frame right after a flash are left out.
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+ mean_intra += stats[idx].intra_error;
+ var_intra += stats[idx].intra_error * stats[idx].intra_error;
+ mean_coded += stats[idx].coded_error;
+ var_coded += stats[idx].coded_error * stats[idx].coded_error;
+ count++;
+ }
+
+ REGION_TYPES cur_type;
+ if (count > 0) {
+ // var_* hold the mean of squares here, so var / mean^2 is compared against
+ // thresholds slightly above 1.
+ mean_intra /= (double)count;
+ var_intra /= (double)count;
+ mean_coded /= (double)count;
+ var_coded /= (double)count;
+ int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+ int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+ fabs(grad_coded[i]) / mean_coded < 0.05) ||
+ mean_coded / mean_intra < 0.05;
+ int is_coded_small = mean_coded < 0.5 * mean_intra;
+ cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+ ? STABLE_REGION
+ : HIGH_VAR_REGION;
+ } else {
+ // Every frame in the window was flash-affected.
+ cur_type = HIGH_VAR_REGION;
+ }
+
+ // mark a new region if type changes
+ if (i == regions[k].start) {
+ // first frame in the region
+ regions[k].type = cur_type;
+ } else if (cur_type != regions[k].type) {
+ // Append a new region
+ regions[k].last = i - 1;
+ regions[k + 1].start = i;
+ regions[k + 1].type = cur_type;
+ k++;
+ }
+ }
+ regions[k].last = this_last;
+ return k + 1;
+}
+
+// Tidy the region list in place: drop empty regions (last < start) and fold
+// any region into its predecessor when both share the same type, except that
+// adjacent SCENECUT_REGION entries are kept separate.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+  for (int idx = 0; idx < *num_regions;) {
+    const int repeats_prev_type =
+        idx > 0 && regions[idx - 1].type == regions[idx].type &&
+        regions[idx].type != SCENECUT_REGION;
+    const int is_empty = regions[idx].last < regions[idx].start;
+
+    if (!repeats_prev_type && !is_empty) {
+      idx++;
+      continue;
+    }
+    // remove_region() updates idx to the position to examine next.
+    remove_region(0, regions, num_regions, &idx);
+  }
+}
+
+// Eliminate every region of the given |type| that spans fewer than |length|
+// frames by fusing it with its neighbouring regions, then tidy the list with
+// cleanup_regions(). A lone remaining region is never removed by the scan.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+                                 REGION_TYPES type, int length) {
+  int idx = 0;
+  while (idx < *num_regions && (*num_regions) > 1) {
+    const int span = regions[idx].last - regions[idx].start + 1;
+    const int is_short_target = (span < length) && (regions[idx].type == type);
+
+    if (is_short_target) {
+      // Fuse with both neighbours; idx is updated by the callee.
+      remove_region(2, regions, num_regions, &idx);
+    } else {
+      idx++;
+    }
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+// Refine the boundaries between stable and unstable regions.
+// Steps, all performed in place on |regions| / |*num_regions|:
+//  1. remove regions shorter than HALF_WIN frames;
+//  2. for each non-stable region, hand frames at either edge over to the
+//     neighbouring region when they look like that neighbour (similar intra
+//     error, small coded error, high correlation coefficient);
+//  3. merge short (< 2 * WINDOW_SIZE) regions whose averages contradict their
+//     type when compared with both neighbours;
+//  4. remove short regions once more.
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, j, k;
+ // Remove regions that are too short. Likely noise.
+ remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+ get_region_stats(stats, regions, *num_regions);
+
+ // Adjust region boundaries. The thresholds are empirically obtained, but
+ // overall the performance is not very sensitive to small changes to them.
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type == STABLE_REGION) continue;
+ if (k > 0) {
+ // Adjust previous boundary.
+ // First find the average intra/coded error in the previous
+ // neighborhood.
+ double avg_intra_err = 0;
+ const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+ regions[k - 1].start + 1);
+ const int lasti = regions[k - 1].last;
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ // count_coded starts at 0, so the first frame failing the coded/coeff
+ // test stops the extension. NOTE(review): count_grad is never modified,
+ // so the count_grad >= 0 test below is always true.
+ int count_coded = 0, count_grad = 0;
+ for (j = lasti + 1; j <= regions[k].last; j++) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the previous stable region
+ regions[k - 1].last = j;
+ regions[k].start = j + 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k > 0
+ if (k < *num_regions - 1) {
+ // Adjust next boundary.
+ // First find the average intra/coded error in the next neighborhood.
+ double avg_intra_err = 0;
+ const int starti = regions[k + 1].start;
+ const int lasti = AOMMIN(regions[k + 1].last - 1,
+ regions[k + 1].start + WINDOW_SIZE - 1);
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ // At the boundary, coded error is large, but still the frame is stable
+ // Starting count_coded at 1 tolerates one frame that fails the
+ // coded/coeff test. NOTE(review): count_grad is never modified here
+ // either.
+ int count_coded = 1, count_grad = 1;
+ for (j = starti - 1; j >= regions[k].start; j--) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small =
+ stats[j + 1].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the next stable region
+ regions[k + 1].start = j;
+ regions[k].last = j - 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k < *num_regions - 1
+ } // end of loop over all regions
+
+ // The boundary moves above may have emptied some regions; clean those up.
+ cleanup_regions(regions, num_regions);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+ get_region_stats(stats, regions, *num_regions);
+
+ // If a stable region has higher error than both neighboring high var
+ // regions, or a lower average correlation than both of them,
+ // then it should be merged with them. The mirrored test is applied to short
+ // high variance regions. A region must have both neighbors to qualify.
+ k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if (regions[k].type == STABLE_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ // The merged result lives at k - 1; refresh its averages.
+ analyze_region(stats, k - 1, regions);
+ } else if (regions[k].type == HIGH_VAR_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err <
+ regions[k - 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k - 1].avg_cor_coeff * 1.001)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err <
+ regions[k + 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k + 1].avg_cor_coeff * 1.001)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ // The merged result lives at k - 1; refresh its averages.
+ analyze_region(stats, k - 1, regions);
+ } else {
+ k++;
+ }
+ }
+
+ remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+//
+// Works in place on |regions| / |*num_regions| in three passes:
+//  1. inside every non-stable region, runs of frames whose intra error keeps
+//     changing by more than 5% in the same direction are split off as
+//     BLENDING_REGION;
+//  2. blending regions that are a single frame long, have an average
+//     correlation below 0.6, or exist while no stable region was seen, are
+//     demoted to HIGH_VAR_REGION;
+//  3. a falling blending region followed by a rising one (a "dip"), possibly
+//     with a short high variance region in between, is merged into a single
+//     blending region when the sr/fr ratios support it; otherwise the two are
+//     kept apart by a one-frame HIGH_VAR_REGION.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, k = 0;
+ // Blending regions will have large content change, therefore will have a
+ // large consistent change in intra error.
+ int count_stable = 0;
+ while (k < *num_regions) {
+ if (regions[k].type == STABLE_REGION) {
+ k++;
+ count_stable++;
+ continue;
+ }
+ // dir: direction of the current trend (1 rising, -1 falling, 0 none).
+ int dir = 0;
+ int start = 0, last;
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ // First mark the regions that have a consistent large change of intra
+ // error.
+ // The very first frame has no predecessor to compare with.
+ if (k == 0 && i == regions[k].start) continue;
+ if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
+ double grad = stats[i].intra_error - stats[i - 1].intra_error;
+ int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+ int this_dir = 0;
+ if (large_change) {
+ this_dir = (grad > 0) ? 1 : -1;
+ }
+ // the current trend continues
+ if (dir == this_dir) continue;
+ if (dir != 0) {
+ // Mark the end of a new large change group and add it
+ // insert_region() advances k to the last piece split from the region.
+ last = i - 1;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ dir = this_dir;
+ // For the second frame of the first region, include the skipped first
+ // frame in the new trend.
+ if (k == 0 && i == regions[k].start + 1) {
+ start = i - 1;
+ } else {
+ start = i;
+ }
+ }
+ // Close a trend that is still open at the end of the region.
+ if (dir != 0) {
+ last = regions[k].last;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ k++;
+ }
+
+ // If the blending region has very low correlation, mark it as high variance
+ // since we probably cannot benefit from it anyways.
+ get_region_stats(stats, regions, *num_regions);
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type != BLENDING_REGION) continue;
+ if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+ count_stable == 0)
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ get_region_stats(stats, regions, *num_regions);
+
+ // It is possible for blending to result in a "dip" in intra error (first
+ // decrease then increase). Therefore we need to find the dip and combine the
+ // two regions.
+ k = 1;
+ while (k < *num_regions) {
+ if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+ // Check if this short high variance region is actually in the middle of
+ // a blending region.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k + 1].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start < 3) {
+ // Direction of each neighbour, taken from its last two frames.
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k + 1].last].intra_error -
+ stats[regions[k + 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ if (prev_dir < 0 && next_dir > 0) {
+ // This is possibly a mid region of blending. Check the ratios
+ double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+ regions[k + 1].avg_sr_fr_ratio) *
+ 0.95;
+ if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+ regions[k].type = BLENDING_REGION;
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ }
+ }
+ }
+ }
+ // Check if we have a pair of consecutive blending regions.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k].type == BLENDING_REGION) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k].last].intra_error -
+ stats[regions[k].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+
+ // if both are too short, no need to check
+ int total_length = regions[k].last - regions[k - 1].start + 1;
+ if (total_length < 4) {
+ regions[k - 1].type = HIGH_VAR_REGION;
+ k++;
+ continue;
+ }
+
+ int to_merge = 0;
+ if (prev_dir < 0 && next_dir > 0) {
+ // In this case we check the last frame in the previous region.
+ double prev_length =
+ (double)(regions[k - 1].last - regions[k - 1].start + 1);
+ double last_ratio, ratio_thres;
+ if (prev_length < 2.01) {
+ // if the previous region is very short
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+ } else {
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ // Average ratio of the previous region with its last frame taken out.
+ double prev_ratio =
+ (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+ (prev_length - 1.0);
+ ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+ }
+ if (last_ratio > ratio_thres) {
+ to_merge = 1;
+ }
+ }
+
+ if (to_merge) {
+ // Fold region k into k - 1 and re-examine the same index.
+ remove_region(0, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ } else {
+ // These are possibly two separate blending regions. Mark the boundary
+ // frame as HIGH_VAR_REGION to separate the two.
+ int prev_k = k - 1;
+ insert_region(regions[prev_k].last, regions[prev_k].last,
+ HIGH_VAR_REGION, regions, num_regions, &prev_k);
+ analyze_region(stats, prev_k, regions);
+ k = prev_k + 1;
+ analyze_region(stats, k, regions);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+// Post-process the blending decisions.
+//  - Any blending region shorter than 5 frames is removed.
+//  - A high variance region shorter than 5 frames is removed when the number
+//    of neighbour kinds present (stable and/or blending) is at least the
+//    number of neighbours it has.
+// A removed region is folded into whichever neighbour has the closer average
+// correlation coefficient (ties go to the previous region). The list is tidied
+// with cleanup_regions() at the end.
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+  int idx = 0;
+  while (idx < *num_regions && *num_regions > 1) {
+    const int has_prev = (idx > 0);
+    const int has_next = (idx < *num_regions - 1);
+    const int is_short = regions[idx].last - regions[idx].start + 1 < 5;
+
+    const int short_blending =
+        regions[idx].type == BLENDING_REGION && is_short;
+    const int short_high_var =
+        regions[idx].type == HIGH_VAR_REGION && is_short;
+    const int stable_adjacent =
+        ((has_prev && regions[idx - 1].type == STABLE_REGION) ||
+         (has_next && regions[idx + 1].type == STABLE_REGION));
+    const int blend_adjacent =
+        ((has_prev && regions[idx - 1].type == BLENDING_REGION) ||
+         (has_next && regions[idx + 1].type == BLENDING_REGION));
+    const int neighbor_count = has_prev + has_next;
+
+    const int should_remove =
+        short_blending ||
+        (short_high_var && stable_adjacent + blend_adjacent >= neighbor_count);
+    if (!should_remove) {
+      idx++;
+      continue;
+    }
+
+    // Remove this region. Decide whether to combine it with the previous or
+    // the next region; a missing neighbour counts as a difference of 1.
+    double prev_diff = 1;
+    double next_diff = 1;
+    if (has_prev) {
+      prev_diff =
+          fabs(regions[idx].avg_cor_coeff - regions[idx - 1].avg_cor_coeff);
+    }
+    if (has_next) {
+      next_diff =
+          fabs(regions[idx].avg_cor_coeff - regions[idx + 1].avg_cor_coeff);
+    }
+    // 0 merges with the previous region, 1 merges with the next one.
+    const int merge_with_next = prev_diff > next_diff;
+    remove_region(merge_with_next, regions, num_regions, &idx);
+  }
+  cleanup_regions(regions, num_regions);
+}
+
+// Release the four scratch buffers used while identifying regions, by passing
+// each one to aom_free() in turn.
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+                                         double *filt_intra_err,
+                                         double *filt_coded_err,
+                                         double *grad_coded) {
+  void *const scratch[] = { temp_regions, filt_intra_err, filt_coded_err,
+                            grad_coded };
+  const int scratch_count = (int)(sizeof(scratch) / sizeof(scratch[0]));
+  for (int b = 0; b < scratch_count; ++b) {
+    aom_free(scratch[b]);
+  }
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// |regions| receives the resulting region list and |total_regions| its length;
+// the start/last indices of every region are shifted by |offset| before
+// returning.
+// NOTE(review): no bound on the number of entries written to |regions| is
+// visible in this function; presumably the caller sizes the array for the
+// worst case -- confirm against the caller.
+// Returns 0 on success, -1 on memory allocation failure.
+// When total_frames <= 1 the function returns 0 without writing any output.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+ int total_frames, int offset, REGIONS *regions,
+ int *total_regions) {
+ int k;
+ if (total_frames <= 1) return 0;
+
+ // store the initial decisions
+ REGIONS *temp_regions =
+ (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
+ // buffers for filtered stats
+ double *filt_intra_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
+ double *filt_coded_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
+ double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+ // If any allocation failed, release whatever was obtained and bail out.
+ if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return -1;
+ }
+ // temp_regions came from aom_malloc, so clear it explicitly.
+ av1_zero_array(temp_regions, total_frames);
+
+ // cur_region: number of entries already written to |regions|.
+ // [this_start, this_last]: frame range of the segment being analyzed.
+ int cur_region = 0, this_start = 0, this_last;
+
+ // Each iteration handles one segment that ends just before the next
+ // scenecut (or at the last frame when no further scenecut is found).
+ int next_scenecut = -1;
+ do {
+ // first get the obvious scenecuts
+ next_scenecut =
+ find_next_scenecut(stats_start, this_start, total_frames - 1);
+ this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
+
+ // low-pass filter the needed stats
+ smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err,
+ filt_coded_err);
+ get_gradient(filt_coded_err, this_start, this_last, grad_coded);
+
+ // find tentative stable regions and unstable regions
+ int num_regions = find_stable_regions(stats_start, grad_coded, this_start,
+ this_last, temp_regions);
+
+ adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions);
+
+ get_region_stats(stats_start, temp_regions, num_regions);
+
+ // Try to identify blending regions in the unstable regions
+ find_blending_regions(stats_start, temp_regions, &num_regions);
+ cleanup_blendings(temp_regions, &num_regions);
+
+ // The flash points should all be considered high variance points
+ k = 0;
+ while (k < num_regions) {
+ if (temp_regions[k].type != STABLE_REGION) {
+ k++;
+ continue;
+ }
+ // start/last are captured before the inner loop because insert_region()
+ // receives temp_regions, num_regions and k by reference.
+ int start = temp_regions[k].start;
+ int last = temp_regions[k].last;
+ for (int i = start; i <= last; i++) {
+ if (stats_start[i].is_flash) {
+ insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(temp_regions, &num_regions);
+
+ // copy the regions in the scenecut group
+ for (k = 0; k < num_regions; k++) {
+ // A trailing region whose last frame precedes its start is empty; drop it.
+ if (temp_regions[k].last < temp_regions[k].start &&
+ k == num_regions - 1) {
+ num_regions--;
+ break;
+ }
+ regions[k + cur_region] = temp_regions[k];
+ }
+ cur_region += num_regions;
+
+ // add the scenecut region
+ if (next_scenecut > -1) {
+ // add the scenecut region, and find the next scenecut
+ regions[cur_region].type = SCENECUT_REGION;
+ regions[cur_region].start = next_scenecut;
+ regions[cur_region].last = next_scenecut;
+ cur_region++;
+ this_start = next_scenecut + 1;
+ }
+ } while (next_scenecut >= 0);
+
+ *total_regions = cur_region;
+ get_region_stats(stats_start, regions, *total_regions);
+
+ for (k = 0; k < *total_regions; k++) {
+ // If scenecuts are very minor, mark them as high variance.
+ // A scenecut is kept as such when its score is below 0.8; at or above that
+ // threshold it is reclassified as HIGH_VAR_REGION.
+ if (regions[k].type != SCENECUT_REGION ||
+ regions[k].avg_cor_coeff *
+ (1 - stats_start[regions[k].start].noise_var /
+ regions[k].avg_intra_err) <
+ 0.8) {
+ continue;
+ }
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ // Region types may have changed above, so clean up and refresh the stats.
+ cleanup_regions(regions, total_regions);
+ get_region_stats(stats_start, regions, *total_regions);
+
+ // Shift the frame indices of all regions by |offset|.
+ for (k = 0; k < *total_regions; k++) {
+ regions[k].start += offset;
+ regions[k].last += offset;
+ }
+
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return 0;
+}
+
+// Returns the index of the first region whose inclusive [start, last] range
+// contains |frame_idx|, or -1 when none of the |num_regions| entries does.
+static int find_regions_index(const REGIONS *regions, int num_regions,
+                              int frame_idx) {
+  int idx = 0;
+  while (idx < num_regions) {
+    const REGIONS *const cur = &regions[idx];
+    if (frame_idx >= cur->start && frame_idx <= cur->last) return idx;
+    ++idx;
+  }
+  return -1;
+}
+
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group length of future frames in batch
+ *
+ * \param[in]    cpi              Top-level encoder structure
+ * \param[in]    max_gop_length   Maximum length of the GF group
+ * \param[in]    max_intervals    Maximum number of intervals to decide
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->p_rc.gf_intervals is
+ * changed to store the decided GF group lengths, p_rc->cur_gf_index is reset
+ * to 0 and rc->intervals_till_gf_calculate_due is set to the number of
+ * intervals decided. The first pass stats read position
+ * (cpi->twopass_frame.stats_in) is restored before returning.
+ */
+static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
+                                int max_intervals) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  // |stats| is indexed with the same frame index as |i|, |cur_start| and
+  // |cur_last|; it starts one entry earlier when we are at a key frame.
+  const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0);
+
+  const int f_w = cpi->common.width;
+  const int f_h = cpi->common.height;
+  int i;
+
+  int flash_detected;
+
+  av1_zero(next_frame);
+
+  // Without first pass stats, every interval gets the same fixed length.
+  if (has_no_stats_stage(cpi)) {
+    for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
+      p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+    }
+    p_rc->cur_gf_index = 0;
+    rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
+    return;
+  }
+
+  // TODO(urvang): Try logic to vary min and max interval based on q.
+  const int active_min_gf_interval = rc->min_gf_interval;
+  const int active_max_gf_interval =
+      AOMMIN(rc->max_gf_interval, max_gop_length);
+  const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
+
+  i = (rc->frames_since_key == 0);
+  max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals;
+  int count_cuts = 1;
+  // If cpi->ppi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or
+  // GF.
+  int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last;
+  int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
+  // 0: keep going, 1: cut here, 2: cut here because the key frame or the end
+  // of the stats was reached.
+  int cut_here;
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  while (count_cuts < max_intervals + 1) {
+    // reaches next key frame, break here
+    if (i >= rc->frames_to_key) {
+      cut_here = 2;
+    } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
+      // reached maximum len, but nothing special yet (almost static)
+      // let's look at the next interval
+      cut_here = 1;
+    } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) {
+      // reaches last frame, break
+      cut_here = 2;
+    } else {
+      // Test for the case where there is a brief flash but the prediction
+      // quality back to an earlier frame is then restored.
+      flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+      // TODO(bohanli): remove redundant accumulations here, or unify
+      // this and the ones in define_gf_group
+      accumulate_next_frame_stats(&next_frame, flash_detected,
+                                  rc->frames_since_key, i, &gf_stats, f_w, f_h);
+
+      cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+                               active_max_gf_interval, active_min_gf_interval,
+                               &gf_stats);
+    }
+    if (cut_here) {
+      cur_last = i - 1;  // the current last frame in the gf group
+      int ori_last = cur_last;
+      // The region frame idx does not start from the same frame as cur_start
+      // and cur_last. Need to offset them.
+      int offset = rc->frames_since_key - p_rc->regions_offset;
+      REGIONS *regions = p_rc->regions;
+      int num_regions = p_rc->num_regions;
+
+      int scenecut_idx = -1;
+      // only try shrinking if interval smaller than active_max_gf_interval
+      if (cur_last - cur_start <= active_max_gf_interval &&
+          cur_last > cur_start) {
+        // find the region indices of where the first and last frame belong.
+        int k_start =
+            find_regions_index(regions, num_regions, cur_start + offset);
+        int k_last =
+            find_regions_index(regions, num_regions, cur_last + offset);
+        if (cur_start + offset == 0) k_start = 0;
+
+        // See if we have a scenecut in between
+        for (int r = k_start + 1; r <= k_last; r++) {
+          if (regions[r].type == SCENECUT_REGION &&
+              regions[r].last - offset - cur_start > active_min_gf_interval) {
+            scenecut_idx = r;
+            break;
+          }
+        }
+
+        // if the found scenecut is very close to the end, ignore it.
+        // Only consult the region table when a scenecut was actually found;
+        // otherwise scenecut_idx is still -1 and must not be used as an index.
+        if (scenecut_idx >= 0 &&
+            regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+          scenecut_idx = -1;
+        }
+
+        if (scenecut_idx != -1) {
+          // If we have a scenecut, then stop at it.
+          // TODO(bohanli): add logic here to stop before the scenecut and for
+          // the next gop start from the scenecut with GF
+          int is_minor_sc =
+              (regions[scenecut_idx].avg_cor_coeff *
+                   (1 - stats[regions[scenecut_idx].start - offset].noise_var /
+                            regions[scenecut_idx].avg_intra_err) >
+               0.6);
+          cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+        } else {
+          int is_last_analysed = (k_last == num_regions - 1) &&
+                                 (cur_last + offset == regions[k_last].last);
+          // k_start is -1 when the group start lies outside every region; in
+          // that case there is no starting region to test for a scenecut.
+          int not_enough_regions =
+              k_last - k_start <=
+              1 + (k_start >= 0 && regions[k_start].type == SCENECUT_REGION);
+          // if we are very close to the end, then do not shrink since it may
+          // introduce intervals that are too short
+          if (!(is_last_analysed && not_enough_regions)) {
+            const double arf_length_factor = 0.1;
+            double best_score = 0;
+            int best_j = -1;
+            const int first_frame = regions[0].start - offset;
+            const int last_frame = regions[num_regions - 1].last - offset;
+            // score of how much the arf helps the whole GOP
+            double base_score = 0.0;
+            // Accumulate base_score over the frames that are too close to the
+            // group start to be considered as cut candidates.
+            for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
+            }
+            int met_blending = 0;   // Whether we have met blending areas before
+            int last_blending = 0;  // Whether the previous frame is blending
+            for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
+              int this_reg =
+                  find_regions_index(regions, num_regions, j + offset);
+              if (this_reg < 0) continue;
+              // A GOP should include at most 1 blending region.
+              if (regions[this_reg].type == BLENDING_REGION) {
+                last_blending = 1;
+                if (met_blending) {
+                  break;
+                } else {
+                  base_score = 0;
+                  continue;
+                }
+              } else {
+                if (last_blending) met_blending = 1;
+                last_blending = 0;
+              }
+
+              // Add the factor of how good the neighborhood is for this
+              // candidate arf.
+              double this_score = arf_length_factor * base_score;
+              double temp_accu_coeff = 1.0;
+              // following frames
+              int count_f = 0;
+              for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+                if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
+                this_score +=
+                    temp_accu_coeff *
+                    sqrt(AOMMAX(0.5,
+                                1 - stats[n].noise_var /
+                                        AOMMAX(stats[n].intra_error, 0.001)));
+                count_f++;
+              }
+              // preceding frames
+              temp_accu_coeff = 1.0;
+              for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+                if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
+                this_score +=
+                    temp_accu_coeff *
+                    sqrt(AOMMAX(0.5,
+                                1 - stats[n].noise_var /
+                                        AOMMAX(stats[n].intra_error, 0.001)));
+              }
+
+              if (this_score > best_score) {
+                best_score = this_score;
+                best_j = j;
+              }
+            }
+
+            // For blending areas, move one more frame in case we missed the
+            // first blending frame.
+            int best_reg =
+                find_regions_index(regions, num_regions, best_j + offset);
+            if (best_reg < num_regions - 1 && best_reg > 0) {
+              if (regions[best_reg - 1].type == BLENDING_REGION &&
+                  regions[best_reg + 1].type == BLENDING_REGION) {
+                if (best_j + offset == regions[best_reg].start &&
+                    best_j + offset < regions[best_reg].last) {
+                  best_j += 1;
+                } else if (best_j + offset == regions[best_reg].last &&
+                           best_j + offset > regions[best_reg].start) {
+                  best_j -= 1;
+                }
+              }
+            }
+
+            if (cur_last - best_j < 2) best_j = cur_last;
+            if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+            // if cannot find anything, just cut at the original place.
+          }
+        }
+      }
+      cut_pos[count_cuts] = cur_last;
+      count_cuts++;
+
+      // reset pointers to the shrunken location
+      cpi->twopass_frame.stats_in = start_pos + cur_last;
+      cur_start = cur_last;
+      int cur_region_idx =
+          find_regions_index(regions, num_regions, cur_start + 1 + offset);
+      if (cur_region_idx >= 0)
+        if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
+      i = cur_last;
+
+      // Stop once the key frame / end of stats was reached and the group was
+      // not shortened.
+      if (cut_here > 1 && cur_last == ori_last) break;
+
+      // reset accumulators
+      init_gf_stats(&gf_stats);
+    }
+    ++i;
+  }
+
+  // save intervals
+  rc->intervals_till_gf_calculate_due = count_cuts - 1;
+  for (int n = 1; n < count_cuts; n++) {
+    p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
+  }
+  p_rc->cur_gf_index = 0;
+  cpi->twopass_frame.stats_in = start_pos;
+}
+
+// Clamps cpi->rc.frames_to_key. If the lookahead depth is below the value
+// returned by av1_lookahead_pop_sz(), frames_to_key is capped at that depth.
+// Otherwise, if frames_left is positive, frames_to_key is capped at
+// frames_left.
+static void correct_frames_to_key(AV1_COMP *cpi) {
+  const int depth =
+      (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+  if (depth <
+      av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
+    assert(
+        IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+                depth == cpi->ppi->frames_left));
+    cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, depth);
+    return;
+  }
+
+  if (cpi->ppi->frames_left > 0) {
+    // Correct frames to key based on limit
+    cpi->rc.frames_to_key =
+        AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
+  }
+}
+
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are available.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group_pass0(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  if (oxcf->q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+    // Consume the next precomputed interval.
+    p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index];
+    rc->intervals_till_gf_calculate_due--;
+    p_rc->cur_gf_index++;
+  } else {
+    av1_cyclic_refresh_set_golden_update(cpi);
+  }
+
+  // correct frames_to_key when lookahead queue is flushing
+  correct_frames_to_key(cpi);
+
+  // The group may not extend past the next key frame.
+  if (p_rc->baseline_gf_interval > rc->frames_to_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
+
+  p_rc->gfu_boost = DEFAULT_GF_BOOST;
+  p_rc->constrained_gf_group =
+      p_rc->baseline_gf_interval >= rc->frames_to_key;
+
+  // Rare case when the look-ahead is less than the target GOP length, can't
+  // generate ARF frame.
+  const int arf_not_possible =
+      p_rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+      !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
+      p_rc->baseline_gf_interval < rc->min_gf_interval;
+  gf_group->max_layer_depth_allowed =
+      arf_not_possible ? 0 : gf_cfg->gf_max_pyr_height;
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi);
+
+  // Allocate bits to each of the frames in the GF group.
+  // TODO(sarahparker) Extend this to work with pyramid structure.
+  for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) {
+    const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx];
+    const int is_cbr = oxcf->rc_cfg.mode == AOM_CBR;
+    int target;
+    if (update_type == KF_UPDATE) {
+      target = is_cbr ? av1_calc_iframe_target_size_one_pass_cbr(cpi)
+                      : av1_calc_iframe_target_size_one_pass_vbr(cpi);
+    } else {
+      target = is_cbr
+                   ? av1_calc_pframe_target_size_one_pass_cbr(cpi, update_type)
+                   : av1_calc_pframe_target_size_one_pass_vbr(cpi, update_type);
+    }
+    gf_group->bit_allocation[frame_idx] = target;
+  }
+}
+
+// Stores |arf_position| as the baseline GF interval in |p_rc|.
+static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc,
+ int arf_position) {
+ p_rc->baseline_gf_interval = arf_position;
+}
+
+// Resets every accumulator in |gf_stats| to its starting value: the decay and
+// zero-motion terms start at 1.0, all other fields start at zero.
+static void init_gf_stats(GF_GROUP_STATS *gf_stats) {
+  // Terms that start at one.
+  gf_stats->decay_accumulator = 1.0;
+  gf_stats->zero_motion_accumulator = 1.0;
+  gf_stats->loop_decay_rate = 1.0;
+  gf_stats->last_loop_decay_rate = 1.0;
+
+  // Group error / skip totals.
+  gf_stats->gf_group_err = 0.0;
+  gf_stats->gf_group_raw_error = 0.0;
+  gf_stats->gf_group_skip_pct = 0.0;
+  gf_stats->gf_group_inactive_zone_rows = 0.0;
+
+  // Motion vector accumulators.
+  gf_stats->mv_ratio_accumulator = 0.0;
+  gf_stats->this_frame_mv_in_out = 0.0;
+  gf_stats->mv_in_out_accumulator = 0.0;
+  gf_stats->abs_mv_in_out_accumulator = 0.0;
+
+  // Per-group averages and their sample count.
+  gf_stats->avg_sr_coded_error = 0.0;
+  gf_stats->avg_pcnt_second_ref = 0.0;
+  gf_stats->avg_new_mv_count = 0.0;
+  gf_stats->avg_wavelet_energy = 0.0;
+  gf_stats->avg_raw_err_stdev = 0.0;
+  gf_stats->non_zero_stdev_count = 0;
+}
+
+// Accumulates first pass statistics over the GF group whose length is stored
+// in p_rc->gf_intervals[p_rc->cur_gf_index].
+// |gf_stats| is reset and then filled in two sweeps over the stats, both
+// starting at |start_pos|; |next_frame| is used as the read buffer.
+// On return |*idx| holds the group length taken from p_rc->gf_intervals.
+static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w,
+                                 int f_h, FIRSTPASS_STATS *next_frame,
+                                 const FIRSTPASS_STATS *start_pos,
+                                 GF_GROUP_STATS *gf_stats, int *idx) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  FRAME_INFO *frame_info = &cpi->frame_info;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  int frame;
+
+  init_gf_stats(gf_stats);
+  av1_zero(*next_frame);
+
+  // First sweep: per-frame error totals.
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for,
+  // so the count starts at |is_intra_only|.
+  for (frame = is_intra_only;
+       frame < p_rc->gf_intervals[p_rc->cur_gf_index]; ++frame) {
+    // read in the next frame
+    if (input_stats(twopass, &cpi->twopass_frame, next_frame) == EOF) break;
+    // Accumulate error score of frames in this gf group.
+    const double mod_frame_err =
+        calculate_modified_err(frame_info, twopass, oxcf, next_frame);
+    accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats);
+  }
+
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+  // Second sweep: "next frame" statistics. One extra entry is read up front so
+  // that each iteration looks one frame further ahead than the first sweep.
+  input_stats(twopass, &cpi->twopass_frame, next_frame);
+  for (frame = is_intra_only;
+       frame < p_rc->gf_intervals[p_rc->cur_gf_index]; ++frame) {
+    // read in the next frame
+    if (input_stats(twopass, &cpi->twopass_frame, next_frame) == EOF) break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    const int flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+
+    accumulate_next_frame_stats(next_frame, flash_detected,
+                                rc->frames_since_key, frame, gf_stats, f_w,
+                                f_h);
+  }
+
+  // Averages always use the full interval length, even if a sweep ended early.
+  const int group_len = p_rc->gf_intervals[p_rc->cur_gf_index];
+  average_gf_stats(group_len, gf_stats);
+
+  *idx = group_len;
+}
+
+// Records |idx| as the length of the current GF group.
+// On the final pass (|is_final_pass| non-zero) the precomputed interval is
+// also consumed: cur_gf_index advances and intervals_till_gf_calculate_due
+// drops by one.
+static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc,
+                              int idx, int is_final_pass) {
+  if (is_final_pass) {
+    p_rc->cur_gf_index++;
+    rc->intervals_till_gf_calculate_due--;
+  }
+
+  // Was the group length constrained by the requirement for a new KF?
+  p_rc->constrained_gf_group = idx >= rc->frames_to_key;
+
+  set_baseline_gf_interval(p_rc, idx);
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+}
+
+// Upper bound applied to the GF boost when no alt ref frame is used.
+#define MAX_GF_BOOST 5400
+// Length thresholds used when deciding whether to shorten a GF group.
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+// Number of frames by which a GF group is shortened.
+#define REDUCE_GF_LENGTH_BY 1
+// Computes the boost and the bit budget of the current GF group.
+// |i| is the group length, |use_alt_ref| tells whether the group uses an alt
+// ref frame, |alt_offset| is forwarded to av1_calc_arf_boost(), |start_pos| is
+// the first pass stats position of the group start and |gf_stats| holds the
+// statistics accumulated for the group.
+// Updates p_rc->gfu_boost, p_rc->arf_boost_factor and p_rc->gf_group_bits, then
+// calls av1_gop_bit_allocation(). State that persists across calls
+// (kf_group_error_left, arf_gf_boost_lst, the rolling ARF bit counters and
+// active_worst_quality) is only modified when |is_final_pass| is non-zero.
+// The stats read position is reset to |start_pos| several times along the way.
+static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
+ int is_final_pass, int use_alt_ref,
+ int alt_offset, const FIRSTPASS_STATS *start_pos,
+ GF_GROUP_STATS *gf_stats) {
+ // Shorthand pointers into the encoder state.
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *frame_info = &cpi->frame_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ // Group length without the leading intra-only frame, if any.
+ int ext_len = i - is_intra_only;
+ if (use_alt_ref) {
+ // Frames available after the group, capped at ext_len and floored at 0.
+ const int forward_frames = (rc->frames_to_key - i >= ext_len)
+ ? ext_len
+ : AOMMAX(0, rc->frames_to_key - i);
+
+ // Calculate the boost for alt ref.
+ p_rc->gfu_boost = av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset,
+ forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled);
+ } else {
+ // Without an alt ref the boost is computed from the group start and is
+ // capped at MAX_GF_BOOST.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ p_rc->gfu_boost = AOMMIN(
+ MAX_GF_BOOST,
+ av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len,
+ 0, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled));
+ }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ p_rc->arf_boost_factor = 1.0;
+ if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - ext_len == 0) {
+ p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ if (cpi->ppi->lap_enabled) {
+ // Since we don't have enough stats to know the actual error of the
+ // gf group, we assume error of each frame to be equal to 1 and set
+ // the error of the group as baseline_gf_interval.
+ gf_stats->gf_group_err = p_rc->baseline_gf_interval;
+ }
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ p_rc->gf_group_bits =
+ calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ // The baseline_gf_interval > 1 test also keeps the divisions below away
+ // from a zero divisor.
+ if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) &&
+ is_final_pass) {
+ const int vbr_group_bits_per_frame =
+ (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval);
+ const double group_av_err =
+ gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_stats->gf_group_inactive_zone_rows * 2) /
+ (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+ int tmp_q;
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame);
+ // Never drop below half of the previous active worst quality.
+ rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+ }
+#endif
+
+ // Adjust KF group bits and error remaining.
+ if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err;
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (rc->frames_since_key != 0) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_buf_ctx->stats_in_end,
+ p_rc->baseline_gf_interval);
+ }
+
+ av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
+ use_alt_ref, p_rc->gf_group_bits);
+
+ // TODO(jingning): Generalize this condition.
+ if (is_final_pass) {
+ cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref;
+
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+ }
+#if CONFIG_BITRATE_ACCURACY
+ if (is_final_pass) {
+ av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info,
+ p_rc->baseline_gf_interval);
+ }
+#endif
+}
+
+/*!\brief Define a GF group.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_params Structure with frame parameters
+ * \param[in] is_final_pass Non-zero if this is the final pass for the
+ * GF group, zero if it is a trial
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed, and
+ * frame_params->frame_type and frame_params->show_frame are set.
+ */
+static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ const int f_w = cm->width;
+ const int f_h = cm->height;
+ int i;
+ const int is_intra_only = rc->frames_since_key == 0;
+
+ cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(cpi->ppi->gf_group);
+ cpi->gf_frame_index = 0;
+ }
+
+ // No first pass stats available: use the dedicated one pass path.
+ if (has_no_stats_stage(cpi)) {
+ define_gf_group_pass0(cpi);
+ return;
+ }
+
+ // In the third pass, try to follow the previous bitstream first. If that
+ // fails, drop the third pass context and fall back to the logic below.
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass);
+ if (ret == 0) return;
+
+ av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+ cpi->third_pass_ctx = NULL;
+ }
+
+ // correct frames_to_key when lookahead queue is emptying
+ if (cpi->ppi->lap_enabled) {
+ correct_frames_to_key(cpi);
+ }
+
+ // Gather the group statistics; |i| receives the group length.
+ GF_GROUP_STATS gf_stats;
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+ &gf_stats, &i);
+
+ const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+ // Lower bound on the GF interval, used below when shortening the group.
+ const int active_min_gf_interval = rc->min_gf_interval;
+
+ // Disable internal ARFs for "still" gf groups.
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
+ if (can_disable_internal_arfs &&
+ gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+ gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR &&
+ gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+ cpi->ppi->internal_altref_allowed = 0;
+ }
+
+ // Decide whether this group uses an alt ref frame. When the ARF may be
+ // disabled, almost-static groups and groups shorter than MIN_GF_INTERVAL do
+ // not get one; otherwise only the lag and a minimum length of 3 are checked.
+ int use_alt_ref;
+ if (can_disable_arf) {
+ use_alt_ref =
+ !is_almost_static(gf_stats.zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) &&
+ p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+ (i >= MIN_GF_INTERVAL);
+ } else {
+ use_alt_ref = p_rc->use_arf_in_this_kf_group &&
+ (i < gf_cfg->lag_in_frames) && (i > 2);
+ }
+ if (use_alt_ref) {
+ gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+ } else {
+ gf_group->max_layer_depth_allowed = 0;
+ }
+
+ int alt_offset = 0;
+ // The length reduction strategy is tweaked for certain cases, and doesn't
+ // work well for certain other cases.
+ const int allow_gf_length_reduction =
+ ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
+ !cpi->ppi->internal_altref_allowed) &&
+ !is_lossless_requested(rc_cfg);
+
+ if (allow_gf_length_reduction && use_alt_ref) {
+ // adjust length of this gf group if one of the following condition met
+ // 1: only one overlay frame left and this gf is too long
+ // 2: next gf group is too short to have arf compared to the current gf
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+ // the next gf is probably going to have a ARF but it will be shorter than
+ // this gf
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ // Reduce length only if active_min_gf_interval will be respected later.
+ if (i - roll_back >= active_min_gf_interval + 1) {
+ alt_offset = -roll_back;
+ i -= roll_back;
+ if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+ // The stored interval is shortened as well, so the stats have to be
+ // accumulated again over the shorter group.
+ p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back;
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame,
+ start_pos, &gf_stats, &i);
+ }
+ }
+ }
+
+ update_gop_length(rc, p_rc, i, is_final_pass);
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref,
+ alt_offset, start_pos, &gf_stats);
+
+ frame_params->frame_type =
+ rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+ // ARF and internal ARF frames are not shown.
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+}
+
+/*!\brief Define a GF group for the third pass.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group for the third pass, along
+ * with various parameters regarding bit-allocation and quality setup based on
+ * the two-pass bitstream.
+ * Much of the function still uses the strategies used for the second pass and
+ * relies on first pass statistics. It is expected that over time these portions
+ * would be replaced with strategies specific to the third pass.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[in]    frame_params    Structure with frame parameters
+ * \param[in]    is_final_pass   Whether this is the final pass for the
+ *                               GF group, or a trial (non-zero)
+ *
+ * \return       0: Success;
+ *              -1: There are conflicts between the bitstream and current config
+ *               The values in cpi->ppi->gf_group are also changed.
+ */
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+                                 int is_final_pass) {
+  if (!cpi->third_pass_ctx) return -1;
+
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const GFConfig *const gf_cfg = &cpi->oxcf.gf_cfg;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  const int is_intra_only = rc->frames_since_key == 0;
+  FIRSTPASS_STATS next_frame;
+  GF_GROUP_STATS gf_stats;
+  int gop_len;
+
+  cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+  // Reset the GF group data structures unless this is a key
+  // frame in which case it will already have been done.
+  if (!is_intra_only) {
+    av1_zero(cpi->ppi->gf_group);
+    cpi->gf_frame_index = 0;
+  }
+
+  accumulate_gop_stats(cpi, is_intra_only, cpi->common.width,
+                       cpi->common.height, &next_frame, start_pos, &gf_stats,
+                       &gop_len);
+
+  // TODO(any): set cpi->ppi->internal_altref_allowed accordingly;
+
+  // The ARF decision comes from the third pass context. A group without ARF
+  // conflicts with a config that requires a minimum pyramid height.
+  const int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx);
+  if (!use_alt_ref && gf_cfg->gf_min_pyr_height) return -1;
+  gf_group->max_layer_depth_allowed =
+      use_alt_ref ? gf_cfg->gf_max_pyr_height : 0;
+
+  update_gop_length(rc, p_rc, gop_len, is_final_pass);
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi);
+
+  set_gop_bits_boost(cpi, gop_len, is_intra_only, is_final_pass, use_alt_ref,
+                     0, start_pos, &gf_stats);
+
+  // Frame type and visibility are taken from the first recorded frame.
+  frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type;
+  frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame;
+  return 0;
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+// Splits |gf_group_bits| among the frames of |gf_group|: first the extra bits
+// for the boosted frame(s) are derived, then adjusted for the target level,
+// and finally the per-frame allocation is performed.
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int boost_bits;
+
+  // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+  boost_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+  // The interval is shortened by one when the group starts at a key frame.
+  const int at_key_frame = (rc->frames_since_key == 0);
+  boost_bits = calculate_boost_bits(p_rc->baseline_gf_interval - at_key_frame,
+                                    p_rc->gfu_boost, gf_group_bits);
+#endif
+
+  boost_bits = adjust_boost_bits_for_target_level(cpi, rc, boost_bits,
+                                                  gf_group_bits, 1);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, boost_bits,
+                         is_key_frame, use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding.
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 1.9
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for case where there is very low error either side of the current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+                            const FIRSTPASS_STATS *last_frame,
+                            const FIRSTPASS_STATS *next_frame) {
+  // Returns 1 only when all three conditions below hold, 0 otherwise.
+
+  // Intra error must stay below VERY_LOW_II times the coded (inter) error.
+  if (!(this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II))) {
+    return 0;
+  }
+
+  // The coded error must spike relative to the previous frame...
+  if (!(this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE))) {
+    return 0;
+  }
+
+  // ...and relative to the following frame as well.
+  return this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE);
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on number of frames in this key-frame group so
+// far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+  // The threshold ramps linearly from base_thresh towards
+  // base_thresh + max_extra over the first ramp_frames frames, and is held at
+  // the maximum from ramp_frames onwards.
+  const int ramp_frames = 32;
+  const double base_thresh = 0.085;
+  const double max_extra = 0.035;
+  return base_thresh +
+         ((frame_count_so_far >= ramp_frames)
+              ? max_extra
+              : ((double)frame_count_so_far / (ramp_frames - 1)) * max_extra);
+}
+
+// Tests whether the frame at this_stats_index is a viable key frame.
+// Returns 1 when the frame passes the primary first-pass criteria and the
+// frames that follow it are predicted tolerably well; returns 0 otherwise,
+// including when the previous, current or next stats entry is unavailable.
+// frame_count_so_far adapts the second-reference usage threshold and, in
+// AOM_Q mode, must be at least 3 for a candidate to be considered.
+// num_mbs scales the intra error breakout test inside the prediction loop.
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+ int this_stats_index, int frame_count_so_far,
+ enum aom_rc_mode rc_mode, int scenecut_mode,
+ int num_mbs) {
+ const FIRSTPASS_STATS *last_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index);
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+ if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+ return 0;
+ }
+
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_stats->pcnt_inter;
+ double modified_pcnt_inter =
+ this_stats->pcnt_inter - this_stats->pcnt_neutral;
+ const double second_ref_usage_thresh =
+ get_second_ref_usage_thresh(frame_count_so_far);
+ int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
+ int count_for_tolerable_prediction = 3;
+
+ // We do "-1" because the candidate key is not counted.
+ int stats_after_this_stats =
+ av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
+
+ // Scenecut mode 1 uses a shorter test: it needs at least 3 stats after the
+ // candidate, tests only 3 frames and requires fewer of them to pass.
+ if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+ if (stats_after_this_stats < 3) {
+ return 0;
+ } else {
+ frames_to_test_after_candidate_key = 3;
+ count_for_tolerable_prediction = 1;
+ }
+ }
+ // Make sure we have enough stats after the candidate key.
+ frames_to_test_after_candidate_key =
+ AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats);
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+ (this_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ (next_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ slide_transition(this_stats, last_stats, next_stats) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_stats->coded_error - this_stats->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_stats->intra_error - this_stats->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
+ // Get the next frame details
+ const FIRSTPASS_STATS *local_next_frame =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + i);
+ double next_iiratio =
+ (BOOST_FACTOR * local_next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame->pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame->pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ // TODO(any): Test of intra error should be normalized to an MB.
+ if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame->intra_error < (200.0 / (double)num_mbs))) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then
+ // break out else discard this potential key frame and move on
+ // Note: i is the loop index at exit, so it reflects how many frames were
+ // examined before a breakout (or one past the test count if none fired).
+ if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
+ is_viable_kf = 1;
+ } else {
+ is_viable_kf = 0;
+ }
+ }
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
+#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval
+
+// Thin wrapper around is_forced_keyframe_pending(); returns its result.
+// Callers in this file treat a return value of -1 as "no forced key frame
+// pending".
+static int detect_app_forced_key(AV1_COMP *cpi) {
+  return is_forced_keyframe_pending(cpi->ppi->lookahead,
+                                    cpi->ppi->lookahead->max_sz,
+                                    cpi->compressor_stage);
+}
+
+// Returns the key frame boost, projected to the full key frame interval when
+// fewer stats than rc.frames_to_key were used to compute p_rc.kf_boost.
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const int frames_to_key = cpi->rc.frames_to_key;
+
+  // When at least frames_to_key stats were used, the stored boost already
+  // covers the whole interval and no projection is needed.
+  if (p_rc->num_stats_used_for_kf_boost >= frames_to_key) {
+    return p_rc->kf_boost;
+  }
+
+  // Projection factor for the full interval (frames_to_key frames).
+  const double full_factor = av1_get_kf_boost_projection_factor(frames_to_key);
+  // Projection factor for the number of stats that were actually used.
+  const double used_factor =
+      av1_get_kf_boost_projection_factor(p_rc->num_stats_used_for_kf_boost);
+
+  // Scale the stored boost by the ratio of the two factors.
+  return (int)rint((full_factor * p_rc->kf_boost) / used_factor);
+}
+
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] firstpass_info struct for firstpass info
+ * \param[in] num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in] search_start_idx the start index for searching key frame.
+ * Set it to one if we already know the
+ * current frame is key frame. Otherwise,
+ * set it to zero.
+ *
+ * \return Number of frames to the next key including the current frame.
+ */
+static int define_kf_interval(AV1_COMP *cpi,
+ const FIRSTPASS_INFO *firstpass_info,
+ int num_frames_to_detect_scenecut,
+ int search_start_idx) {
+ const TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ // Ring buffer of the most recent per-frame decay rates.
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ double decay_accumulator = 1.0;
+ int i = 0, j;
+ int frames_to_key = search_start_idx;
+ int frames_since_key = rc->frames_since_key + 1;
+ int scenecut_detected = 0;
+
+ // Distance to an application-forced key frame; compared against -1 below,
+ // which is treated as "none pending".
+ int num_frames_to_next_key = detect_app_forced_key(cpi);
+
+ // No scenecut search requested: return the forced key frame distance if
+ // there is one, otherwise the current rc->frames_to_key.
+ if (num_frames_to_detect_scenecut == 0) {
+ if (num_frames_to_next_key != -1)
+ return num_frames_to_next_key;
+ else
+ return rc->frames_to_key;
+ }
+
+ // Never search beyond a pending forced key frame.
+ if (num_frames_to_next_key != -1)
+ num_frames_to_detect_scenecut =
+ AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ i = 0;
+ const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const int future_stats_count =
+ av1_firstpass_info_future_count(firstpass_info, 0);
+ while (frames_to_key < future_stats_count &&
+ frames_to_key < num_frames_to_detect_scenecut) {
+ // Provided that we are not at the end of the file...
+ if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+ frames_to_key + 1 < future_stats_count) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = test_candidate_kf(
+ &twopass->firstpass_info, frames_to_key, frames_since_key,
+ oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+ num_mbs);
+ if (scenecut_detected) {
+ break;
+ }
+ }
+
+ // How fast is the prediction quality decaying?
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+ loop_decay_rate = get_prediction_decay_rate(next_stats);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = detect_transition_to_still(
+ firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+ kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+ if (scenecut_detected) {
+ // In the case of transition followed by a static scene, the key frame
+ // could be a good predictor for the following frames, therefore we
+ // do not use an arf.
+ p_rc->use_arf_in_this_kf_group = 0;
+ break;
+ }
+ }
+
+ // Step on to the next frame.
+ ++frames_to_key;
+ ++frames_since_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq_max intervals then break out of the loop.
+ if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+ break;
+ }
+ } else {
+ // Scenecut detection is off (or this is the last available stat): just
+ // advance the counters.
+ ++frames_to_key;
+ ++frames_since_key;
+ }
+ ++i;
+ }
+ // With LAP enabled and no scenecut found, the forced key frame distance is
+ // returned instead of the searched value. This may be -1 when no forced key
+ // frame is pending, so callers must handle that value.
+ if (cpi->ppi->lap_enabled && !scenecut_detected)
+ frames_to_key = num_frames_to_next_key;
+
+ return frames_to_key;
+}
+
+// Returns the mean coded_error over the key frame group, starting with
+// *first_frame and continuing with stats read from start_position.
+// The read position of twopass_frame is reset to start_position first and is
+// left wherever reading stopped.
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+                                     TWO_PASS_FRAME *twopass_frame,
+                                     const FIRSTPASS_STATS *first_frame,
+                                     const FIRSTPASS_STATS *start_position,
+                                     int frames_to_key) {
+  FIRSTPASS_STATS stats = *first_frame;
+  double error_sum = 0.0;
+  int frames_done = 0;
+
+  reset_fpf_position(twopass_frame, start_position);
+
+  while (frames_done < frames_to_key) {
+    error_sum += stats.coded_error;
+    // Stop when the stats run out; the frame just summed is still counted
+    // through the "+ 1" below.
+    if (input_stats(twopass, twopass_frame, &stats) == EOF) break;
+    ++frames_done;
+  }
+
+  // frames_done + 1 covers the early-exit case; clamp so that a loop which
+  // ran to completion is not over-counted.
+  const int frames_counted = AOMMIN(frames_done + 1, frames_to_key);
+  return error_sum / frames_counted;
+}
+
+// Returns the number of bits to assign to the key frame group.
+// Without LAP the share is proportional to kf_group_err relative to the
+// modified error still left. With LAP it is frames_to_key times the average
+// frame bandwidth, optionally scaled by the group's average error when
+// vbr_corpus_complexity_lap is set.
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+                                 double kf_group_avg_error) {
+  if (!cpi->ppi->lap_enabled) {
+    TWO_PASS *const twopass = &cpi->ppi->twopass;
+    return (int64_t)(twopass->bits_left *
+                     (kf_group_err / twopass->modified_error_left));
+  }
+
+  RATE_CONTROL *const rc = &cpi->rc;
+  int64_t group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+
+  if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+    // The configured value is divided by 10 before use.
+    const double corpus_complexity =
+        cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+    // Scale by the group's average error relative to the corpus complexity.
+    group_bits =
+        (int64_t)(group_bits * (kf_group_avg_error / corpus_complexity));
+  }
+
+  return group_bits;
+}
+
+// Accumulates up to (rc->frames_to_key - 1) stats entries into
+// *avg_frame_stat and, when at least two were read, divides the accumulated
+// fields by the number read. Returns the number of entries read.
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS stats;
+  av1_zero(stats);
+
+  // Accumulate the total using however many stats are available.
+  int frames_read = 0;
+  while (frames_read < (rc->frames_to_key - 1)) {
+    if (input_stats(twopass, &cpi->twopass_frame, &stats) == EOF) break;
+    av1_accumulate_stats(avg_frame_stat, &stats);
+    ++frames_read;
+  }
+
+  // With fewer than two entries the accumulated values are left undivided.
+  if (frames_read < 2) return frames_read;
+
+  // Turn the accumulated totals into per-frame averages.
+  avg_frame_stat->weight /= frames_read;
+  avg_frame_stat->intra_error /= frames_read;
+  avg_frame_stat->frame_avg_wavelet_energy /= frames_read;
+  avg_frame_stat->coded_error /= frames_read;
+  avg_frame_stat->sr_coded_error /= frames_read;
+  avg_frame_stat->pcnt_inter /= frames_read;
+  avg_frame_stat->pcnt_motion /= frames_read;
+  avg_frame_stat->pcnt_second_ref /= frames_read;
+  avg_frame_stat->pcnt_neutral /= frames_read;
+  avg_frame_stat->intra_skip_pct /= frames_read;
+  avg_frame_stat->inactive_zone_rows /= frames_read;
+  avg_frame_stat->inactive_zone_cols /= frames_read;
+  avg_frame_stat->MVr /= frames_read;
+  avg_frame_stat->mvr_abs /= frames_read;
+  avg_frame_stat->MVc /= frames_read;
+  avg_frame_stat->mvc_abs /= frames_read;
+  avg_frame_stat->MVrv /= frames_read;
+  avg_frame_stat->MVcv /= frames_read;
+  avg_frame_stat->mv_in_out_count /= frames_read;
+  avg_frame_stat->new_mv_count /= frames_read;
+  avg_frame_stat->count /= frames_read;
+  avg_frame_stat->duration /= frames_read;
+
+  return frames_read;
+}
+
+// Accumulates a boost score for the key frame over the frames of the kf
+// group and returns it.
+// kf_raw_err: intra error of the key frame; bounds how long frames keep
+// contributing (via *sr_accumulator).
+// zero_motion_accumulator, sr_accumulator: in/out accumulators updated as the
+// frames are scanned.
+// use_avg_stat: when non-zero, stats are first averaged by calc_avg_stats()
+// and that single averaged entry is reused for the remaining iterations; no
+// further stats are read inside the loop.
+static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
+ double *zero_motion_accumulator,
+ double *sr_accumulator, int use_avg_stat) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ FIRSTPASS_STATS frame_stat;
+ av1_zero(frame_stat);
+ int i = 0, num_stat_used = 0;
+ double boost_score = 0.0;
+ // In AOM_Q mode the per-frame boost cap scales with frames_to_key, clamped
+ // to [KF_MIN_FRAME_BOOST, KF_MAX_FRAME_BOOST]; otherwise the maximum is used.
+ const double kf_max_boost =
+ cpi->oxcf.rc_cfg.mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
+
+ // Calculate the average using available number of stats.
+ if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
+
+ // Start after the stats already consumed by the averaging step (if any).
+ for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
+ if (!use_avg_stat &&
+ EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat))
+ break;
+
+ // Monitor for static sections.
+ // For the first frame in kf group, the second ref indicator is invalid.
+ if (i > 0) {
+ *zero_motion_accumulator =
+ AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat));
+ } else {
+ *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
+ }
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((*sr_accumulator < (kf_raw_err * 1.50)) &&
+ (i <= rc->max_gf_interval * 2)) {
+ double frame_boost;
+ double zm_factor;
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
+
+ // The second-ref accumulator is cleared for the first two frames.
+ if (i < 2) *sr_accumulator = 0.0;
+ frame_boost =
+ calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat,
+ sr_accumulator, kf_max_boost);
+ boost_score += frame_boost * zm_factor;
+ }
+ }
+ return boost_score;
+}
+
+/*!\brief Interval(in seconds) to clip key-frame distance to in LAP.
+ */
+#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5
+
+/*!\brief Determine the next key frame group
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame, and
+ * calculates the bit allocation of the KF group and the keyframe itself.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] this_frame Pointer to first pass stats
+ */
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ // Snapshot of the key frame's own stats, taken before any stats are re-read.
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info;
+ av1_zero(next_frame);
+
+ rc->frames_since_key = 0;
+ // Use arfs if possible.
+ p_rc->use_arf_in_this_kf_group = is_altref_enabled(
+ oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+ cpi->gf_frame_index = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ // Without a stats stage there is nothing to analyse: pick the key frame
+ // distance from the forced key frame (if any) or key_freq_max, use the
+ // default boost and return early.
+ if (has_no_stats_stage(cpi)) {
+ int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+ p_rc->this_key_frame_forced =
+ current_frame->frame_number != 0 && rc->frames_to_key == 0;
+ if (num_frames_to_app_forced_key != -1)
+ rc->frames_to_key = num_frames_to_app_forced_key;
+ else
+ rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
+ correct_frames_to_key(cpi);
+ p_rc->kf_boost = DEFAULT_KF_BOOST;
+ gf_group->update_type[0] = KF_UPDATE;
+ return;
+ }
+ int i;
+ // Stats read position at entry; restored after each rescan below.
+ const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in;
+ int kf_bits = 0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_raw_err = 0.0;
+ double kf_mod_err = 0.0;
+ double sr_accumulator = 0.0;
+ double kf_group_avg_error = 0.0;
+ // The "clipped" values stay at their maxima (i.e. no clipping) unless LAP is
+ // enabled further down.
+ int frames_to_key, frames_to_key_clipped = INT_MAX;
+ int64_t kf_group_bits_clipped = INT64_MAX;
+
+ // Is this a forced key frame by interval.
+ p_rc->this_key_frame_forced = p_rc->next_key_frame_forced;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_raw_err = this_frame->intra_error;
+ kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+ // We assume the current frame is a key frame and we are looking for the next
+ // key frame. Therefore search_start_idx = 1
+ frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max,
+ /*search_start_idx=*/1);
+
+ // A result of -1 means no interval was determined; fall back to the
+ // configured maximum key frame distance.
+ if (frames_to_key != -1) {
+ rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
+ } else {
+ rc->frames_to_key = kf_cfg->key_freq_max;
+ }
+
+ if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break;
+ }
+ p_rc->next_key_frame_forced = 1;
+ } else if ((cpi->twopass_frame.stats_in ==
+ twopass->stats_buf_ctx->stats_in_end &&
+ is_stat_consumption_stage_twopass(cpi)) ||
+ rc->frames_to_key >= kf_cfg->key_freq_max) {
+ p_rc->next_key_frame_forced = 1;
+ } else {
+ p_rc->next_key_frame_forced = 0;
+ }
+
+ // Sum the modified error over the stats available for this kf group.
+ // NOTE(review): num_stats_used_for_kf_boost is only incremented here, not
+ // reset in this function — confirm it is reset elsewhere.
+ double kf_group_err = 0;
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(&twopass->firstpass_info, i);
+ if (this_stats != NULL) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err_new(
+ frame_info, &firstpass_info->total_stats, this_stats,
+ oxcf->rc_cfg.vbrbias, twopass->modified_error_min,
+ twopass->modified_error_max);
+ ++p_rc->num_stats_used_for_kf_boost;
+ }
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
+ (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
+ kf_group_avg_error =
+ get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame,
+ start_position, rc->frames_to_key);
+ }
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits =
+ get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error);
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ if (cpi->ppi->lap_enabled) {
+ // In the case of single pass based on LAP, frames to key may have an
+ // inaccurate value, and hence should be clipped to an appropriate
+ // interval.
+ frames_to_key_clipped =
+ (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate);
+
+ // This variable calculates the bits allocated to kf_group with a clipped
+ // frames_to_key.
+ if (rc->frames_to_key > frames_to_key_clipped) {
+ kf_group_bits_clipped =
+ (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped /
+ rc->frames_to_key);
+ }
+ }
+
+ // Reset the first pass file position.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
+ &sr_accumulator, 0);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
+
+ p_rc->kf_boost = (int)boost_score;
+
+ // With LAP only part of the group's stats may be available, so the boost
+ // is either projected (AOM_Q) or topped up using averaged stats.
+ if (cpi->ppi->lap_enabled) {
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ p_rc->kf_boost = get_projected_kf_boost(cpi);
+ } else {
+ // TODO(any): Explore using average frame stats for AOM_Q as well.
+ boost_score = get_kf_boost_score(
+ cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ p_rc->kf_boost += (int)boost_score;
+ }
+ }
+
+ // Special case for static / slide show content but don't apply
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+ (rc->frames_to_key > 8)) {
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST);
+ } else {
+ // Apply various clamps for min and max boost
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3));
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+ p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST);
+#endif
+ }
+
+ // Work out how many bits to allocate for the key frame itself.
+ // In case of LAP enabled for VBR, if the frames_to_key value is
+ // very high, we calculate the bits based on a clipped value of
+ // frames_to_key.
+ kf_bits = calculate_boost_bits(
+ AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost,
+ AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
+ // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n",
+ // p_rc->kf_boost,
+ // kf_bits, twopass->kf_zeromotion_pct);
+ kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+ twopass->kf_group_bits, 0);
+
+ // The remainder of the group budget is what is left for the other frames.
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ if (cpi->ppi->lap_enabled)
+ // As we don't have enough stats to know the actual error of the group,
+ // we assume the complexity of each frame to be equal to 1, and set the
+ // error as the number of frames in the group(minus the keyframe).
+ twopass->kf_group_error_left = (double)(rc->frames_to_key - 1);
+ else
+ twopass->kf_group_error_left = kf_group_err - kf_mod_err;
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Returns the per-frame target bandwidth for the section.
+// With LAP enabled this is rc->avg_frame_bandwidth. Otherwise it is the bits
+// left in the two-pass budget divided by the number of frames left, where
+// frames left is the total stats count minus the current frame number.
+// The result is capped at INT_MAX so that a large 64-bit quotient is not
+// truncated by the conversion to int.
+static int get_section_target_bandwidth(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  int64_t section_target_bandwidth;
+  const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                current_frame->frame_number);
+  if (cpi->ppi->lap_enabled) {
+    section_target_bandwidth = rc->avg_frame_bandwidth;
+  } else {
+    // Guard the integer division: treat a non-positive frame count as a
+    // single remaining frame rather than dividing by zero or flipping the
+    // sign of the result.
+    const int divisor = AOMMAX(frames_left, 1);
+    section_target_bandwidth = twopass->bits_left / divisor;
+  }
+  // Avoid an out-of-range conversion when narrowing to int.
+  section_target_bandwidth = AOMMIN(section_target_bandwidth, INT_MAX);
+  return (int)section_target_bandwidth;
+}
+
+// Derives per-frame two-pass parameters (energy measures and content type)
+// from a first pass stats entry and stores them in cpi->twopass_frame.
+// Does nothing when this_frame_ptr is NULL.
+static INLINE void set_twopass_params_based_on_fp_stats(
+    AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) {
+  if (!this_frame_ptr) return;
+
+  TWO_PASS_FRAME *const frame_params = &cpi->twopass_frame;
+
+  // The energy is stored as log1p() of the frame's intra error.
+  // NOTE(review): an earlier comment here referred to a multiplication by 256
+  // that reverses a (>> 8) scaling; no such multiplication is applied in this
+  // function.
+  frame_params->mb_av_energy = log1p(this_frame_ptr->intra_error);
+
+  // The wavelet (haar) energy is only updated when the total stats report it
+  // as valid.
+  if (!is_fp_wavelet_energy_invalid(
+          cpi->ppi->twopass.stats_buf_ctx->total_stats)) {
+    frame_params->frame_avg_haar_energy =
+        log1p(this_frame_ptr->frame_avg_wavelet_energy);
+  }
+
+  // Set the frame content type flag from the intra skip percentage.
+  frame_params->fr_content_type =
+      (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
+          ? FC_GRAPHICS_ANIMATION
+          : FC_NORMAL;
+}
+
+// Reads the next first pass stats entry into *this_frame (when one is
+// available) and updates the per-frame two-pass parameters from it.
+// For the very first frame in non-AOM_Q modes it also seeds the rate control
+// quality state from the section-level stats.
+static void process_first_pass_stats(AV1_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+
+ // Only for the first frame (frame_number == 0, gf_frame_index == 0), when
+ // not in AOM_Q mode and both total stats buffers exist.
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
+ cpi->gf_frame_index == 0 && total_stats &&
+ twopass->stats_buf_ctx->total_left_stats) {
+ if (cpi->ppi->lap_enabled) {
+ /*
+ * Accumulate total_stats using available limited number of stats,
+ * and assign it to total_left_stats.
+ */
+ *twopass->stats_buf_ctx->total_left_stats = *total_stats;
+ }
+ // Special case code for first frame.
+ const int section_target_bandwidth = get_section_target_bandwidth(cpi);
+ // Per-frame averages over the remaining section.
+ // NOTE(review): section_length is used as a divisor below without a zero
+ // check — confirm total_left_stats->count is non-zero here.
+ const double section_length =
+ twopass->stats_buf_ctx->total_left_stats->count;
+ const double section_error =
+ twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+ const double section_intra_skip =
+ twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+ section_length;
+ const double section_inactive_zone =
+ (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+ ((double)cm->mi_params.mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth);
+
+ // Seed the rate control state with the estimated q index.
+ rc->active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ p_rc->last_q[INTER_FRAME] = tmp_q;
+ p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+ p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ // Key frames start midway between the estimate and the best allowed q.
+ p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
+ }
+
+ // Consume one stats entry if any remain. Otherwise *this_frame is left
+ // unchanged and is still passed on below.
+ if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) {
+ *this_frame = *cpi->twopass_frame.stats_in;
+ ++cpi->twopass_frame.stats_in;
+ }
+ set_twopass_params_based_on_fp_stats(cpi, this_frame);
+}
+
+// Sets the base frame target from the bit allocation of the current frame
+// in the GF group (indexed by cpi->gf_frame_index).
+static void setup_target_rate(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int frame_bits = gf_group->bit_allocation[cpi->gf_frame_index];
+
+  // When there is no stats stage, the frame target is also set right away.
+  if (has_no_stats_stage(cpi)) {
+    av1_rc_set_frame_target(cpi, frame_bits, cpi->common.width,
+                            cpi->common.height);
+  }
+
+  cpi->rc.base_frame_target = frame_bits;
+}
+
+// Sets is_flash on every stats entry in [first_stats, last_stats).
+// An entry is marked as a flash when the entry after it uses the second
+// reference more than it uses inter prediction overall, and for at least
+// half of the frame.
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+                      FIRSTPASS_STATS *last_stats) {
+  for (FIRSTPASS_STATS *cur = first_stats; cur < last_stats - 1; ++cur) {
+    const FIRSTPASS_STATS *const following = cur + 1;
+    cur->is_flash = (following->pcnt_second_ref > following->pcnt_inter &&
+                     following->pcnt_second_ref >= 0.5);
+  }
+
+  // The last entry has no successor to compare with, so it is never treated
+  // as a flash.
+  if (last_stats - 1 >= first_stats) {
+    (last_stats - 1)->is_flash = 0;
+  }
+}
+
+// Smooth-out the noise variance so it is more stable.
+// Every entry in [first_stats, last_stats) gets its noise_var replaced by the
+// average noise_var over a window of +/-HALF_FILT_LEN entries around it.
+// Window indices are clamped to the valid range and entries marked as
+// flashes are left out of the average.
+// Returns 0 on success (including an empty range, where nothing is done),
+// -1 on memory allocation failure.
+// TODO(bohanli): Use a better low-pass filter than averaging
+static int smooth_filter_noise(FIRSTPASS_STATS *first_stats,
+                               FIRSTPASS_STATS *last_stats) {
+  const int len = (int)(last_stats - first_stats);
+  // Nothing to smooth. Returning here also avoids a zero-sized allocation
+  // (or a wrapped-around size for an inverted range) being reported as an
+  // allocation failure.
+  if (len <= 0) return 0;
+
+  // Results go to a scratch buffer first so that every window reads the
+  // original, unsmoothed values.
+  double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise));
+  if (!smooth_noise) return -1;
+
+  for (int i = 0; i < len; i++) {
+    double total_noise = 0;
+    double total_wt = 0;
+    for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+      // Clamp to the range: edge entries are counted repeatedly.
+      int idx = AOMMIN(AOMMAX(i + j, 0), len - 1);
+      if (first_stats[idx].is_flash) continue;
+
+      total_noise += first_stats[idx].noise_var;
+      total_wt += 1.0;
+    }
+    if (total_wt > 0.01) {
+      total_noise /= total_wt;
+    } else {
+      // Every entry in the window was a flash: keep the original value.
+      total_noise = first_stats[i].noise_var;
+    }
+    smooth_noise[i] = total_noise;
+  }
+
+  for (int i = 0; i < len; i++) {
+    first_stats[i].noise_var = smooth_noise[i];
+  }
+
+  aom_free(smooth_noise);
+  return 0;
+}
+
+// Returns nonzero if the entry at |stats| or either of the two entries before
+// it is flagged as a flash. |stats - 2| must be a valid entry.
+static int noise_window_has_flash(const FIRSTPASS_STATS *stats) {
+  return stats->is_flash || (stats - 1)->is_flash || (stats - 2)->is_flash;
+}
+
+// Overwrites |target|->noise_var with the value taken from a usable neighbor.
+// Later entries (up to, but excluding, |last_stats|) are searched first,
+// nearest first; if none qualifies, earlier entries are searched, nearest
+// first, down to and including |lowest|. A neighbor is usable when its
+// three-entry window contains no flash and, if |skip_low_noise| is nonzero,
+// its noise_var is at least 1.0. |target| is left unchanged when no neighbor
+// qualifies.
+static void copy_noise_from_neighbor(FIRSTPASS_STATS *target,
+                                     FIRSTPASS_STATS *lowest,
+                                     FIRSTPASS_STATS *last_stats,
+                                     int skip_low_noise) {
+  const FIRSTPASS_STATS *donor;
+
+  for (donor = target + 1; donor < last_stats; donor++) {
+    if (noise_window_has_flash(donor)) continue;
+    if (skip_low_noise && donor->noise_var < 1.0) continue;
+    target->noise_var = donor->noise_var;
+    return;
+  }
+
+  for (donor = target - 1; donor >= lowest; donor--) {
+    if (noise_window_has_flash(donor)) continue;
+    if (skip_low_noise && donor->noise_var < 1.0) continue;
+    target->noise_var = donor->noise_var;
+    return;
+  }
+}
+
+// Estimates the noise variance of each entry in [first_stats, last_stats)
+// from the first pass stats and stores it in noise_var.
+//
+// The estimate needs the two preceding entries, so the computation starts at
+// the third entry; the first two entries receive a copy of the third entry's
+// value. Entries whose estimate is unavailable or below 1.0, and entries whose
+// three-entry window contains a flash, borrow the value of a neighbor. The
+// result is finally low-pass filtered by smooth_filter_noise(); if that fails
+// to allocate memory, the failure is reported through |error_info|.
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+                        FIRSTPASS_STATS *last_stats,
+                        struct aom_internal_error_info *error_info) {
+  FIRSTPASS_STATS *const first_valid = first_stats + 2;
+  FIRSTPASS_STATS *cur;
+
+  // Step 1: raw estimate from the current entry and the two before it.
+  for (cur = first_valid; cur < last_stats; cur++) {
+    cur->noise_var = 0.0;
+    // flashes tend to have high correlation of innovations, so ignore them.
+    if (noise_window_has_flash(cur)) continue;
+
+    const FIRSTPASS_STATS *const prev1 = cur - 1;
+    const FIRSTPASS_STATS *const prev2 = cur - 2;
+    const double prod1 =
+        prev1->intra_error * (cur->intra_error - cur->coded_error);
+    const double prod2 =
+        prev2->intra_error * (prev1->intra_error - prev1->coded_error);
+    const double prod3 =
+        prev2->intra_error * (cur->intra_error - cur->sr_coded_error);
+    // The square roots below need strictly positive products.
+    if (prod1 <= 0 || prod2 <= 0 || prod3 <= 0) continue;
+
+    const double root1 = sqrt(prod1);
+    const double root2 = sqrt(prod2);
+    const double root3 = sqrt(prod3);
+    const double estimate = prev1->intra_error - root1 * root2 / root3;
+    cur->noise_var = AOMMAX(estimate, 0.01);
+  }
+
+  // Step 2: copy noise from a neighbor if the value is not trustworthy.
+  // TODO(bohanli): consider expanding to two directions at the same time
+  for (cur = first_valid; cur < last_stats; cur++) {
+    if (noise_window_has_flash(cur)) continue;
+    if (cur->noise_var < 1.0) {
+      copy_noise_from_neighbor(cur, first_valid, last_stats,
+                               /*skip_low_noise=*/1);
+    }
+  }
+
+  // Step 3: copy the noise from a neighbor if this entry is flash-affected.
+  for (cur = first_valid; cur < last_stats; cur++) {
+    if (noise_window_has_flash(cur)) {
+      copy_noise_from_neighbor(cur, first_valid, last_stats,
+                               /*skip_low_noise=*/0);
+    }
+  }
+
+  // Step 4: the first two entries have no estimate of their own; give them
+  // the value of the third entry, provided that entry exists.
+  if (first_valid < last_stats) {
+    first_stats[0].noise_var = first_valid->noise_var;
+    first_stats[1].noise_var = first_valid->noise_var;
+  }
+
+  if (smooth_filter_noise(first_stats, last_stats) == -1) {
+    aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers in smooth_filter_noise()");
+  }
+}
+
+// Estimates the correlation coefficient of each entry with the entry before
+// it and stores it in cor_coeff.
+//
+// The coefficient is derived from the intra and coded errors with the current
+// entry's noise_var subtracted from both intra errors; every intermediate
+// quantity is floored at 0.001 and the result is clipped to [0, 1]. The first
+// entry has no predecessor and is assigned a coefficient of 1.0.
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+                        FIRSTPASS_STATS *last_stats) {
+  for (FIRSTPASS_STATS *cur = first_stats + 1; cur < last_stats; ++cur) {
+    const FIRSTPASS_STATS *const prev = cur - 1;
+
+    const double prev_minus_noise =
+        AOMMAX(prev->intra_error - cur->noise_var, 0.001);
+    const double cur_minus_noise =
+        AOMMAX(cur->intra_error - cur->noise_var, 0.001);
+
+    const double C = sqrt(AOMMAX(
+        prev->intra_error * (cur->intra_error - cur->coded_error), 0.001));
+    const double cor_coeff = C / prev_minus_noise;
+
+    cur->cor_coeff = cor_coeff * sqrt(prev_minus_noise / cur_minus_noise);
+    // clip correlation coefficient.
+    cur->cor_coeff = AOMMIN(AOMMAX(cur->cor_coeff, 0), 1);
+  }
+  first_stats->cor_coeff = 1.0;
+}
+
+// Per-frame setup that runs before a frame is encoded.
+//
+// Fills frame_params->frame_type for the frame at cpi->gf_frame_index and sets
+// its target rate through setup_target_rate(). When the current GF group has
+// been used up (cpi->gf_frame_index == gf_group->size) it also defines the next
+// key-frame group, if rc->frames_to_key has reached zero, and the next GF/ARF
+// group, including the region analysis and GF length selection that feed it.
+//
+// cpi:          encoder instance; rate-control and GF-group state is updated.
+// frame_params: output; frame_type (and show_frame in the ducky-encode RCL
+//               path) is written.
+// frame_flags:  passed by value; only tested against FRAMEFLAGS_KEY.
+void av1_get_second_pass_params(AV1_COMP *cpi,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Ducky-encode RCL mode: the frame type and visibility are taken straight
+ // from the existing GF group and nothing else in this function runs.
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+ if (cpi->gf_frame_index == 0) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ }
+ return;
+ }
+
+ // Stats read position on entry. It is handed to reset_fpf_position() for ARF
+ // frames and stored as this frame's stats pointer near the end of the
+ // function.
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ int update_total_stats = 0;
+
+ if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
+
+ // Check forced key frames.
+ const int frames_to_next_forced_key = detect_app_forced_key(cpi);
+ if (frames_to_next_forced_key == 0) {
+ rc->frames_to_key = 0;
+ // NOTE(review): `&=` can only clear bits, so this never turns the KEY flag
+ // on; it just drops every other flag. Confirm whether `|=` was intended.
+ frame_flags &= FRAMEFLAGS_KEY;
+ } else if (frames_to_next_forced_key > 0 &&
+ frames_to_next_forced_key < rc->frames_to_key) {
+ rc->frames_to_key = frames_to_next_forced_key;
+ }
+
+ assert(cpi->twopass_frame.stats_in != NULL);
+ const int update_type = gf_group->update_type[cpi->gf_frame_index];
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+
+ // Still inside the current GF group and not a flagged key frame.
+ if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ setup_target_rate(cpi);
+
+ // If this is an arf frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ return;
+ }
+ }
+
+ if (oxcf->rc_cfg.mode == AOM_Q)
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+
+ // At a GF group boundary with look-ahead processing and scene-cut detection
+ // enabled, shorten frames_to_key if define_kf_interval() finds a closer key
+ // frame position (it returns -1 when it has nothing to report).
+ if (cpi->gf_frame_index == gf_group->size) {
+ if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+ const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+ const int frames_to_key = define_kf_interval(
+ cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+ /*search_start_idx=*/0);
+ if (frames_to_key != -1)
+ rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+ }
+ }
+
+ FIRSTPASS_STATS this_frame;
+ av1_zero(this_frame);
+ // Read this frame's first pass stats via process_first_pass_stats() when
+ // stats are being consumed; otherwise only set the worst quality level.
+ if (is_stat_consumption_stage(cpi)) {
+ if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
+ process_first_pass_stats(cpi, &this_frame);
+ update_total_stats = 1;
+ }
+ } else {
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+ }
+
+ // Keyframe and section processing.
+ // this_frame_copy preserves the stats so that this_frame can be restored
+ // after find_next_key_frame() and define_gf_group(), which receive it or run
+ // before it is needed again.
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ if (rc->frames_to_key <= 0) {
+ assert(rc->frames_to_key == 0);
+ // Define next KF group and assign bits to it.
+ frame_params->frame_type = KEY_FRAME;
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ }
+
+ if (rc->frames_to_fwd_kf <= 0)
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (cpi->gf_frame_index == gf_group->size) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif // CONFIG_BITRATE_ACCURACY
+ // Upper bound for the GF group length; narrowed further below.
+ int max_gop_length =
+ (oxcf->gf_cfg.lag_in_frames >= 32)
+ ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+ oxcf->algo_cfg.arnr_max_frames / 2)
+ : MAX_GF_LENGTH_LAP;
+
+ // Handle forward key frame when enabled.
+ if (oxcf->kf_cfg.fwd_kf_dist > 0)
+ max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length);
+
+ // Use the provided gop size in low delay setting
+ if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
+ // Limit the max gop length for the last gop in 1 pass setting.
+ max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key);
+
+ // Identify regions if needed.
+ // TODO(bohanli): identify regions for all stats available.
+ if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
+ (p_rc->frames_till_regions_update - rc->frames_since_key <
+ rc->frames_to_key &&
+ p_rc->frames_till_regions_update - rc->frames_since_key <
+ max_gop_length + 1)) {
+ // how many frames we can analyze from this frame
+ int rest_frames =
+ AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+ rest_frames =
+ AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end -
+ cpi->twopass_frame.stats_in +
+ (rc->frames_since_key == 0)));
+ p_rc->frames_till_regions_update = rest_frames;
+
+ int ret;
+ if (cpi->ppi->lap_enabled) {
+ // With look-ahead processing the flash / noise / correlation analysis
+ // is redone over the whole stats buffer before regions are identified.
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end,
+ cpi->common.error);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+ (rc->frames_since_key == 0), p_rc->regions,
+ &p_rc->num_regions);
+ } else {
+ ret = identify_regions(
+ cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
+ rest_frames, 0, p_rc->regions, &p_rc->num_regions);
+ }
+ if (ret == -1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in identify_regions");
+ }
+ }
+
+ int cur_region_idx =
+ find_regions_index(p_rc->regions, p_rc->num_regions,
+ rc->frames_since_key - p_rc->regions_offset);
+ if ((cur_region_idx >= 0 &&
+ p_rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+ rc->frames_since_key == 0) {
+ // If we start from a scenecut, then the last GOP's arf boost is not
+ // needed for this GOP.
+ cpi->ppi->gf_state.arf_gf_boost_lst = 0;
+ }
+
+ // Cleared when the GF length is taken from the second pass log instead of
+ // being computed here.
+ int need_gf_len = 1;
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ // set up bitstream to read
+ if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) {
+ cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output;
+ }
+ av1_open_second_pass_log(cpi, 1);
+ THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info,
+ cpi->common.error);
+#if CONFIG_BITRATE_ACCURACY
+ // The TPL info is read into a temporary buffer that is freed right away;
+ // nothing in this function uses its contents.
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info,
+ aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+#if CONFIG_THREE_PASS
+ // TODO(angiebird): Put this part into a func
+ cpi->vbr_rc_info.cur_gop_idx++;
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+ // Read in third_pass_info from the bitstream.
+ av1_set_gop_third_pass(cpi->third_pass_ctx);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(
+ cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info,
+ gop_info->num_frames, cpi->common.error);
+
+ p_rc->cur_gf_index = 0;
+ p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length;
+ need_gf_len = 0;
+ }
+
+ if (need_gf_len) {
+ // If we cannot obtain GF group length from second_pass_file
+ // TODO(jingning): Resolve the redundant calls here.
+ // NOTE: the `|| 1` makes this condition always true, so
+ // calculate_gf_length() runs for every new GF group.
+ if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+ calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+ }
+
+ if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+ oxcf->gf_cfg.lag_in_frames >= 32 &&
+ cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+ int this_idx = rc->frames_since_key +
+ p_rc->gf_intervals[p_rc->cur_gf_index] -
+ p_rc->regions_offset - 1;
+ int this_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+ int next_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+ // TODO(angiebird): Figure out why this_region and next_region are -1 in
+ // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+ int is_last_scenecut =
+ p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+ (this_region != -1 &&
+ p_rc->regions[this_region].type == SCENECUT_REGION) ||
+ (next_region != -1 &&
+ p_rc->regions[next_region].type == SCENECUT_REGION);
+
+ int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+ if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+ rc->min_gf_interval <= 16) {
+ // The calculate_gf_length function is previously used with
+ // max_gop_length = 32 with look-ahead gf intervals.
+ define_gf_group(cpi, frame_params, 0);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ this_frame = this_frame_copy;
+
+ if (is_shorter_gf_interval_better(cpi, frame_params)) {
+ // A shorter gf interval is better.
+ // TODO(jingning): Remove redundant computations here.
+ max_gop_length = 16;
+ calculate_gf_length(cpi, max_gop_length, 1);
+ // Keep the original interval when the group ends at a scene cut (or
+ // key frame) and the recomputed interval is less than 4 frames
+ // shorter.
+ if (is_last_scenecut &&
+ (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+ p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+ }
+ }
+ }
+ }
+ }
+
+ // define_gf_group() is called twice: with a last argument of 0 before this
+ // frame's stats are (conditionally) processed, and with 1 afterwards.
+ define_gf_group(cpi, frame_params, 0);
+
+ if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
+ rc->frames_since_key > 0)
+ process_first_pass_stats(cpi, &this_frame);
+
+ define_gf_group(cpi, frame_params, 1);
+
+ // write gop info if needed for third pass. Per-frame info is written after
+ // each frame is encoded.
+ av1_write_second_pass_gop_info(cpi);
+
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ assert(cpi->gf_frame_index == 0);
+#if ARF_STATS_OUTPUT
+ {
+ // NOTE(review): the result of fopen() is used without a NULL check.
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+ cpi->common.current_frame.frame_number,
+ rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+ p_rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ // ARF frames: hand the entry position back to reset_fpf_position() and
+ // take the stats selected by this frame's arf_src_offset instead.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ } else {
+ // Back up this frame's stats for updating total stats during post encode.
+ cpi->twopass_frame.this_frame = update_total_stats ? start_pos : NULL;
+ }
+
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ setup_target_rate(cpi);
+}
+
+// Initializes the two-pass state from the first pass statistics.
+//
+// Returns immediately when the stats buffer has no end pointer. Otherwise the
+// flash / noise / correlation analysis is run over the whole stats buffer, the
+// entry at stats_in_end is copied into total_stats and total_left_stats, the
+// frame rate and the overall bit budget are derived from it, the modified
+// error of every remaining stats entry is summed up, and the rate-control
+// trackers are reset to their starting values.
+void av1_init_second_pass(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+  // Per-frame analysis over the complete stats buffer.
+  av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+                   twopass->stats_buf_ctx->stats_in_end);
+  av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+                     twopass->stats_buf_ctx->stats_in_end, cpi->common.error);
+  av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+                     twopass->stats_buf_ctx->stats_in_end);
+
+  // The entry at stats_in_end seeds both the totals and the "left" totals.
+  FIRSTPASS_STATS *const total = twopass->stats_buf_ctx->total_stats;
+  *total = *twopass->stats_buf_ctx->stats_in_end;
+  *twopass->stats_buf_ctx->total_left_stats = *total;
+
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  const double frame_rate = 10000000.0 * total->count / total->duration;
+  av1_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(total->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
+
+#if CONFIG_BITRATE_ACCURACY
+  av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left,
+                  (int)round(total->count));
+#endif
+
+#if CONFIG_RATECTRL_LOG
+  rc_log_init(&cpi->rc_log);
+#endif
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
+  const double avg_error =
+      total->coded_error / DOUBLE_DIVIDE_CHECK(total->count);
+  twopass->modified_error_min =
+      (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
+  twopass->modified_error_max =
+      (avg_error * oxcf->rc_cfg.vbrmax_section) / 100;
+
+  double modified_error_total = 0.0;
+  for (const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in;
+       s < twopass->stats_buf_ctx->stats_in_end; ++s) {
+    modified_error_total +=
+        calculate_modified_err(frame_info, twopass, oxcf, s);
+  }
+  twopass->modified_error_left = modified_error_total;
+
+  // Reset the vbr bits off target counters
+  p_rc->vbr_bits_off_target = 0;
+  p_rc->vbr_bits_off_target_fast = 0;
+
+  p_rc->rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initialize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+}
+
+// Resets the two-pass bookkeeping for single pass encoding with look-ahead
+// processing. Nothing is touched when the stats buffer has no end pointer.
+// Unlike av1_init_second_pass(), no statistics are read: the bit budget and
+// the modified error bounds are simply zeroed.
+void av1_init_single_pass_lap(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // No first pass totals are available, so the budget and error bounds are 0.
+  twopass->bits_left = 0;
+  twopass->modified_error_min = 0.0;
+  twopass->modified_error_max = 0.0;
+  twopass->modified_error_left = 0.0;
+
+  // Reset the vbr bits off target counters
+  p_rc->vbr_bits_off_target = 0;
+  p_rc->vbr_bits_off_target_fast = 0;
+
+  p_rc->rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initialize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+}
+
+// Clamp limits for twopass->extend_minq (the _CQ variant is used in AOM_CQ
+// mode) and the divisor that turns the base frame target into the threshold
+// for the fast undershoot feedback below.
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+// Post-encode bookkeeping for the frame at cpi->gf_frame_index.
+//
+// Updates, in this order: the first pass stats read position, the VBR
+// off-target counters and remaining bit budget, the rolling ARF group bit
+// counters, the rate error estimate, the active best quality per pyramid
+// level, the remaining key-frame group bits, the min/max Q extension values,
+// and finally the frame probabilities and frame rate gathered from frames that
+// were encoded in parallel. Sections under CONFIG_FPMT_TEST additionally keep
+// "temp_" copies of that state for the parallel-encode simulation.
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+
+ // Increment the stats_in pointer.
+ // Skipped in ducky-encode RCL mode and when the GF group has been used up
+ // without a key frame being due.
+ if (is_stat_consumption_stage(cpi) &&
+ !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL) &&
+ (cpi->gf_frame_index < cpi->ppi->gf_group.size ||
+ rc->frames_to_key == 0)) {
+ const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
+ // Step back one entry and let input_stats*() consume it again; the local
+ // copy in this_frame is not used afterwards.
+ FIRSTPASS_STATS this_frame;
+ assert(cpi->twopass_frame.stats_in >
+ twopass->stats_buf_ctx->stats_in_start);
+ --cpi->twopass_frame.stats_in;
+ if (cpi->ppi->lap_enabled) {
+ input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
+ } else {
+ input_stats(twopass, &cpi->twopass_frame, &this_frame);
+ }
+ } else if (cpi->ppi->lap_enabled) {
+ // ARF frames with look-ahead processing: point back at the buffer start.
+ cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+
+ // VBR correction is done through p_rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0);
+
+ if (cpi->do_update_vbr_bits_off_target_fast) {
+ // Subtract current frame's fast_extra_bits.
+ p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+ rc->frame_level_fast_extra_bits = 0;
+ }
+
+ // Target vs actual bits for this arf group.
+ twopass->rolling_arf_group_target_bits += rc->base_frame_target;
+ twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
+ // Calculate the pct rc error.
+ // The estimate is clamped to [-100, 100] and is 0 while no bits have been
+ // recorded in total_actual_bits.
+ if (p_rc->total_actual_bits) {
+ p_rc->rate_error_estimate =
+ (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits);
+ p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100);
+ } else {
+ p_rc->rate_error_estimate = 0;
+ }
+
+#if CONFIG_FPMT_TEST
+ /* The variables temp_vbr_bits_off_target, temp_bits_left,
+ * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits
+ * temp_rate_error_estimate are introduced for quality simulation purpose,
+ * it retains the value previous to the parallel encode frames. The
+ * variables are updated based on the update flag.
+ *
+ * If there exist show_existing_frames between parallel frames, then to
+ * retain the temp state do not update it. */
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target;
+ cpi->ppi->p_rc.temp_bits_left = twopass->bits_left;
+ cpi->ppi->p_rc.temp_rolling_arf_group_target_bits =
+ twopass->rolling_arf_group_target_bits;
+ cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits =
+ twopass->rolling_arf_group_actual_bits;
+ cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate;
+ }
+#endif
+ // Update the active best quality pyramid.
+ // Every level from this frame's layer depth up to MAX_ARF_LAYERS receives
+ // the frame's base qindex (or the original qindex under VMAF tuning).
+ if (!rc->is_src_frame_alt_ref) {
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ int i;
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+ p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+#if CONFIG_TUNE_VMAF
+ if (cpi->vmaf_info.original_qindex != -1 &&
+ (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ }
+ }
+
+#if 0
+ {
+ // NOTE(review): this disabled block reads total_actual_bits and
+ // vbr_bits_off_target through rc, while the live code above reads both
+ // through p_rc; it would need updating before being re-enabled.
+ AV1_COMMON *cm = &cpi->common;
+ FILE *fpfile;
+ fpfile = fopen("details.stt", "a");
+ fprintf(fpfile,
+ "%10d %10d %10d %10" PRId64 " %10" PRId64
+ " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+ cm->current_frame.frame_number, rc->base_frame_target,
+ rc->projected_frame_size, rc->total_actual_bits,
+ rc->vbr_bits_off_target, p_rc->rate_error_estimate,
+ twopass->rolling_arf_group_target_bits,
+ twopass->rolling_arf_group_actual_bits,
+ (double)twopass->rolling_arf_group_actual_bits /
+ (double)twopass->rolling_arf_group_target_bits,
+ twopass->bpm_factor,
+ av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
+ cm->seq_params->bit_depth),
+ av1_convert_qindex_to_q(rc->active_worst_quality,
+ cm->seq_params->bit_depth));
+ fclose(fpfile);
+ }
+#endif
+
+ // Charge non-key frames against the key-frame group budget; the budget is
+ // never allowed to go negative.
+ if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= rc->base_frame_target;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+ int minq_adj_limit;
+ int maxq_adj_limit;
+ minq_adj_limit =
+ (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+ maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+
+ // Undershoot
+ if ((rc_cfg->under_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) {
+ int pct_error =
+ ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ if ((pct_error >= rc_cfg->under_shoot_pct) &&
+ (p_rc->rate_error_estimate > 0)) {
+ twopass->extend_minq += 1;
+ }
+ twopass->extend_maxq -= 1;
+ // Overshoot
+ } else if ((rc_cfg->over_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) {
+ // NOTE(review): this divides by rolling_target_bits, which this branch
+ // does not itself guarantee to be nonzero - confirm it is always
+ // positive here.
+ int pct_error =
+ ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ pct_error = clamp(pct_error, 0, 100);
+ if ((pct_error >= rc_cfg->over_shoot_pct) &&
+ (p_rc->rate_error_estimate < 0)) {
+ twopass->extend_maxq += 1;
+ }
+ twopass->extend_minq -= 1;
+ } else {
+ // Adjustment for extreme local overshoot.
+ // Only applies when normal adjustment above is not used (e.g.
+ // when threshold is set to 100).
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+ // Unwind extreme overshoot adjustment.
+ else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+ twopass->extend_minq =
+ clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ // The fast counter is capped at four times the average frame bandwidth.
+ p_rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+ (4 * rc->avg_frame_bandwidth));
+ }
+ }
+
+#if CONFIG_FPMT_TEST
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target_fast =
+ p_rc->vbr_bits_off_target_fast;
+ cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq;
+ cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq;
+ }
+#endif
+ }
+
+ // Update the frame probabilities obtained from parallel encode frames
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+#if CONFIG_FPMT_TEST
+ /* The variable temp_active_best_quality is introduced only for quality
+ * simulation purpose, it retains the value previous to the parallel
+ * encode frames. The variable is updated based on the update flag.
+ *
+ * If there exist show_existing_frames between parallel frames, then to
+ * retain the temp state do not update it. */
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ int i;
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ if (!rc->is_src_frame_alt_ref) {
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i)
+ cpi->ppi->p_rc.temp_active_best_quality[i] =
+ p_rc->active_best_quality[i];
+ }
+ }
+
+ // Update the frame probabilities obtained from parallel encode frames
+ // When not simulating, temp_frame_probs_simulation aliases frame_probs and
+ // temp_frame_probs is NULL (it is only dereferenced when simulating).
+ FrameProbInfo *const temp_frame_probs_simulation =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs_simulation
+ : frame_probs;
+ FrameProbInfo *const temp_frame_probs =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL;
+#endif
+ int i, j, loop;
+ // Sequentially do average on temp_frame_probs_simulation which holds
+ // probabilities of last frame before parallel encode
+ // Each enabled probability set is replaced by (old + new) >> 1, once per
+ // recode loop, and only for frames with a frame_parallel_level above 0.
+ for (loop = 0; loop <= cpi->num_frame_recode; loop++) {
+ // Sequentially update tx_type_probs
+ if (cpi->do_update_frame_probs_txtype[loop] &&
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ // `left` tracks what remains of 1024 after the averaged entries; the
+ // remainder is added to entry 0 so that each row sums to 1024.
+ int left = 1024;
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob;
+#else
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+
+ // Sequentially update obmc_probs
+ if (cpi->do_update_frame_probs_obmc[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].obmc_probs[update_type][i];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+#else
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#endif
+ }
+ }
+
+ // Sequentially update warped_probs
+ if (cpi->do_update_frame_probs_warp[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >>
+ 1;
+#else
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#endif
+ }
+
+ // Sequentially update switchable_interp_probs
+ if (cpi->do_update_frame_probs_interpfilter[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ // Same remainder scheme as for tx_type_probs, with a row total of 1536.
+ int left = 1536;
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ const int new_prob = cpi->frame_new_probs[loop]
+ .switchable_interp_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+#else
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+ }
+
+#if CONFIG_FPMT_TEST
+ // Copying temp_frame_probs_simulation to temp_frame_probs based on
+ // the flag
+ if (cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ simulate_parallel_frame) {
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ for (j = 0; j < TX_TYPES; j++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ }
+ }
+#endif
+ // Update framerate obtained from parallel encode frames
+ if (cpi->common.show_frame &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ cpi->framerate = cpi->new_framerate;
+#if CONFIG_FPMT_TEST
+ // SIMULATION PURPOSE
+ // Same condition as show_existing_between_parallel_frames above, recomputed.
+ int show_existing_between_parallel_frames_cndn =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+ if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn &&
+ cpi->do_frame_data_update && simulate_parallel_frame)
+ cpi->temp_framerate = cpi->framerate;
+#endif
+}
diff --git a/third_party/aom/av1/encoder/pass2_strategy.h b/third_party/aom/av1/encoder/pass2_strategy.h
new file mode 100644
index 0000000000..5987a78a23
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#include "av1/encoder/encoder.h"
+
+/*!
+ * \brief Accumulated statistics and features for one GF group.
+ */
+typedef struct {
+ /*!\cond */
+ double gf_group_err;
+ double gf_group_raw_error;
+ double gf_group_skip_pct;
+ double gf_group_inactive_zone_rows;
+
+ double mv_ratio_accumulator;
+ double decay_accumulator;
+ double zero_motion_accumulator;
+ double loop_decay_rate;
+ double last_loop_decay_rate;
+ double this_frame_mv_in_out;
+ double mv_in_out_accumulator;
+ double abs_mv_in_out_accumulator;
+
+ double avg_sr_coded_error;
+ double avg_pcnt_second_ref;
+ double avg_new_mv_count;
+ double avg_wavelet_energy;
+ double avg_raw_err_stdev;
+ int non_zero_stdev_count;
+ /*!\endcond */
+} GF_GROUP_STATS;
+
+/*!
+ * \brief Accumulated error statistics and features for a single frame.
+ */
+typedef struct {
+ /*!\cond */
+ double frame_err;
+ double frame_coded_error;
+ double frame_sr_coded_error;
+ /*!\endcond */
+} GF_FRAME_STATS;
+/*!\cond */
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_init_single_pass_lap(AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] frame_params Per frame encoding parameters
+ * \param[in] frame_flags Frame type and coding flags
+ *
+ * \remark No return but analyses first pass stats and assigns a target
+ * number of bits to the current frame and a target Q range.
+ */
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+ struct EncodeFrameParams *const frame_params,
+ unsigned int frame_flags);
+
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates various rate control
+ * related data structures that for example track overshoot and
+ * undershoot.
+ */
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in] use_arf Are ARF frames enabled or is this a GF only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost);
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats,
+ struct aom_internal_error_info *error_info);
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..232a2f9edb
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/mcomp.h"
+
+// Splits strength_idx into primary / secondary strength indices and, for the
+// fast search levels, maps them to actual strengths via the per-level tables
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+ int *pri_strength,
+ int *sec_strength,
+ int strength_idx) {
+ const int tot_sec_filter =
+ (pick_method == CDEF_FAST_SEARCH_LVL5)
+ ? REDUCED_SEC_STRENGTHS_LVL5
+ : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+ : CDEF_SEC_STRENGTHS);
+ const int pri_idx = strength_idx / tot_sec_filter;  // primary index
+ const int sec_idx = strength_idx % tot_sec_filter;  // secondary index
+ *pri_strength = pri_idx;
+ *sec_strength = sec_idx;
+ if (pick_method == CDEF_FULL_SEARCH) return;  // raw indices are the result
+
+ switch (pick_method) {
+ case CDEF_FAST_SEARCH_LVL1:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1);
+ *pri_strength = priconv_lvl1[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL2:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ *pri_strength = priconv_lvl2[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL3:  // same primary table as LVL2
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl2[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL4:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl4[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL5:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);  // NOTE(review): LVL4 bound; presumably priconv_lvl5 has that size - confirm
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5);
+ *pri_strength = priconv_lvl5[pri_idx];
+ *sec_strength = secconv_lvl5[sec_idx];
+ break;
+ default: assert(0 && "Invalid CDEF search method");
+ }
+}
+
+// Stores pri * CDEF_SEC_STRENGTHS + sec for the given strength index in
+// cdef_strength; uses pri_strength / sec_strength locals of the expansion site
+#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \
+ do { \
+ get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \
+ (strength_idx)); \
+ cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \
+ } while (0)
+
+/* Greedy step: given the nb_strengths options already held in lev[], finds the
+ one extra strength minimizing the summed per-block MSE; returns that sum. */
+static uint64_t search_one(int *lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS];  // total MSE if strength j were added
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;  // large sentinel (2^63)
+ int best_id = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ if (mse[i][lev[gi]] < best_mse) {
+ best_mse = mse[i][lev[gi]];
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ uint64_t best = best_mse;
+ if (mse[i][j] < best) best = mse[i][j];
+ tot_mse[j] += best;
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {  // first minimum wins on ties
+ if (tot_mse[j] < best_tot_mse) {
+ best_tot_mse = tot_mse[j];
+ best_id = j;
+ }
+ }
+ lev[nb_strengths] = best_id;  // caller must provide room for this slot
+ return best_tot_mse;
+}
+
+/* Greedy step over (luma, chroma) strength pairs: given nb_strengths pairs in
+ lev0[] / lev1[], finds the extra pair minimizing the summed MSE of mse[0] + mse[1]. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];  // indexed [lev0][lev1]
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;  // large sentinel (2^63)
+ int best_id0 = 0;
+ int best_id1 = 0;
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ uint64_t curr = mse[0][i][lev0[gi]];
+ curr += mse[1][i][lev1[gi]];
+ if (curr < best_mse) {
+ best_mse = curr;
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ uint64_t best = best_mse;
+ uint64_t curr = mse[0][i][j];
+ curr += mse[1][i][k];
+ if (curr < best) best = curr;
+ tot_mse[j][k] += best;
+ }
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {  // first minimum wins on ties
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ if (tot_mse[j][k] < best_tot_mse) {
+ best_tot_mse = tot_mse[j][k];
+ best_id0 = j;
+ best_id1 = k;
+ }
+ }
+ }
+ lev0[nb_strengths] = best_id0;  // caller must provide room for this slot
+ lev1[nb_strengths] = best_id1;
+ return best_tot_mse;
+}
+
+/* Fills best_lev[0..nb_strengths-1] with a strength set chosen to minimize total mse; returns that total. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+ pick_method <= CDEF_FAST_SEARCH_LVL5);
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;  // returned unchanged if nb_strengths is 0
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
+ }
+ /* Trying to refine the greedy search by reconsidering each
+ already-selected option. */
+ if (!fast) {  // refinement is skipped for the fast search levels
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];  // drop best_lev[0]
+ best_tot_mse =
+ search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
+ }
+ }
+ return best_tot_mse;
+}
+
+/* Fills best_lev0[] / best_lev1[] with nb_strengths luma+chroma pairs chosen to minimize total mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+ int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;  // returned unchanged if nb_strengths is 0
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse =
+ search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
+ }
+ /* Trying to refine the greedy search by reconsidering each
+ already-selected option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {  // unlike the single-plane search, not gated on pick_method
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) {  // drop the first pair, shift the rest down
+ best_lev0[j] = best_lev0[j + 1];
+ best_lev1[j] = best_lev1[j + 1];
+ }
+ best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+ sb_count, pick_method);
+ }
+ return best_tot_mse;
+}
+
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+ int *width_log2, int *height_log2,
+ BLOCK_SIZE bsize) {  // Outputs stride, size and log2 size (pixels) of one bsize unit.
+ *src_stride = block_size_wide[bsize];  // stride equals the unit width
+ *width = block_size_wide[bsize];
+ *height = block_size_high[bsize];
+ *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+ *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];  // height table: 4x8 / 8x4 units are not square
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Sums the squared error between dst (16-bit samples) and src over the blocks listed in dlist only. */
+static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col) {
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst);
+ uint16_t *dst_buff = &dst16[row * dstride + col];  // top-left of this filter block
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += aom_mse_wxh_16bit_highbd(
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
+ &src[bi << (height_log2 + width_log2)], src_stride, width, height);  // src: one unit per list entry, in list order
+ }
+ return sum >> 2 * coeff_shift;  // 2 * coeff_shift because the error is squared
+}
+#endif
+
+// Returns 1 when the entry at bi and the one blk_offset later (1 for width 8,
+// 3 for width 4) are in the same row and blk_offset columns apart; else 0.
+static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width,
+ int cdef_count, int bi, int iter) {
+ assert(width == 8 || width == 4);
+ const int blk_offset = (width == 8) ? 1 : 3;
+ if ((iter + blk_offset) >= cdef_count) return 0;  // not enough entries left
+
+ if (dlist[bi].by == dlist[bi + blk_offset].by &&
+ dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx)
+ return 1;
+
+ return 0;
+}
+
+static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift, int row,
+ int col) {  // 8-bit dst counterpart of compute_cdef_dist_highbd
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ int iter = 0;  // advanced by the same amount as bi on every iteration
+ int inc = 1;  // list entries consumed by the current iteration
+ uint8_t *dst8 = (uint8_t *)dst;
+ uint8_t *dst_buff = &dst8[row * dstride + col];  // top-left of this filter block
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+
+ const int num_blks = 16 / width;  // units covered by one 16-wide call
+ for (bi = 0; bi < cdef_count; bi += inc) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)];
+ uint8_t *dst_tmp =
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)];
+
+ if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) {
+ sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height);
+ iter += num_blks;
+ inc = num_blks;
+ } else {
+ sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width,
+ height);
+ iter += 1;
+ inc = 1;
+ }
+ }
+
+ return sum >> 2 * coeff_shift;  // 2 * coeff_shift because the error is squared
+}
+
+// Fills the border regions of inbuf (which starts at the top-left border
+// corner) with CDEF_VERY_LARGE on every side flagged as a frame boundary
+static INLINE void fill_borders_for_fbs_on_frame_boundary(
+ uint16_t *inbuf, int hfilt_size, int vfilt_size,
+ bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary,
+ bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) {
+ if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary &&
+ !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary)
+ return;  // interior block: nothing to fill
+ if (is_fb_on_frm_bottom_boundary) {
+ // Fill bottom region of the block
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER;
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) {  // a corner is filled if either adjacent side is a boundary
+ const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE;
+ // Fill bottom-left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER;
+ // Fill bottom-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary) {
+ // Fill top region of the block
+ fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) {
+ // Fill top-left region of the block
+ fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset = hfilt_size + CDEF_HBORDER;
+ // Fill top-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_left_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_right_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill right region of the block
+ fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE,
+ vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+}
+
+// Returns how many 8x8/4x4 filter units starting at dlist[bi] (4, 2 or 1) can
+// have their SSE calculated in a single function call after CDEF filtering
+static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units(
+ cdef_list *dlist, int cdef_count, int bi, int subsampling_x,
+ int subsampling_y) {
+ // TODO(Ranjit): Extend the optimization for 422
+ if (subsampling_x != subsampling_y) return 1;  // non-square units: no merging
+
+ // Combining more blocks seems to increase encode time due to increase in
+ // control code
+ if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by &&
+ dlist[bi].bx + 3 == dlist[bi + 3].bx) {
+ /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific
+ * logic if y co-ordinates match and x co-ordinates are
+ * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */
+ return 4;
+ }
+ if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by &&
+ dlist[bi].bx + 1 == dlist[bi + 1].bx) {
+ /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific
+ * logic if their y co-ordinates match and x co-ordinates are
+ * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. */
+ return 2;
+ }
+ return 1;
+}
+
+// Returns the error of one filter block against ref_buffer after CDEF filtering with the given strengths
+static INLINE uint64_t get_filt_error(
+ const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd,
+ cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer,
+ int ref_stride, int row, int col, int pri_strength, int sec_strength,
+ int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) {
+ uint64_t curr_sse = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y);
+ const int bw_log2 = 3 - pd->subsampling_x;  // log2 width of a filter unit
+ const int bh_log2 = 3 - pd->subsampling_y;  // log2 height of a filter unit
+
+ // TODO(Ranjit): Extend this optimization for HBD
+ if (!cdef_search_ctx->use_highbitdepth) {
+ // If all 8x8/4x4 blocks in CDEF block need to be filtered, calculate the
+ // error at CDEF block level
+ const int tot_blk_count =
+ (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >>
+ (bw_log2 + bh_log2);
+ if (cdef_count == tot_blk_count) {
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row, col };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ if (pri_strength == 0 && sec_strength == 0) {
+ // When CDEF strength is zero, filtering is not applied. Hence
+ // error is calculated between source and unfiltered pixels
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ block_size_wide[plane_bsize], block_size_high[plane_bsize]);
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),  // secondary strength 3 is passed on as 4
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8,
+ (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize],
+ block_size_high[plane_bsize]);
+ }
+ } else {
+ // If few 8x8/4x4 blocks in CDEF block need to be filtered, filtering
+ // functions produce 8-bit output and the error is calculated in 8-bit
+ // domain
+ if (pri_strength == 0 && sec_strength == 0) {
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ const int tmp_buf_offset =
+ get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2));
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset],
+ (1 << MAX_SB_SIZE_LOG2),
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ }
+ }
+ } else {  // high bitdepth: filter into a 16-bit buffer, then use the distortion callback
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+ cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+ dir, dirinit, var, pli, dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse = cdef_search_ctx->compute_cdef_dist_fn(
+ ref_buffer, ref_stride, tmp_dst, dlist, cdef_count,
+ cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+ }
+ return curr_sse;
+}
+
+// Calculates the MSE of every candidate CDEF strength for one filter block.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context; mse[] and sb_index[] are written at slot sb_count.
+// error_info: Currently unused (see TODO). sb_count: Output slot index.
+// fbr, fbc: Row and column index of the block in units of 64x64
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in future to handle error propagation.
+ (void)error_info;
+ const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
+ const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
+ const int coeff_shift = cdef_search_ctx->coeff_shift;
+ const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
+ const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
+
+ // Declare and initialize the temporary buffers.
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+ cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;  // first pixel inside the borders
+ int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);  // clipped to the frame width
+ int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);  // clipped to the frame height
+ int hb_step = 1, vb_step = 1;  // block extent in 64x64 units
+ BLOCK_SIZE bs;
+
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+
+ uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer,
+ ref->v_buffer };
+ int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride,
+ ref->uv_stride };
+
+ if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 ||
+ mbmi->bsize == BLOCK_64X128) {  // blocks larger than 64x64 are processed as one unit
+ bs = mbmi->bsize;
+ if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+ nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ hb_step = 2;
+ }
+ if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+ nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ vb_step = 2;
+ }
+ } else {
+ bs = BLOCK_64X64;
+ }
+ // Get number of 8x8 blocks which are not skip. Cdef processing happens for
+ // 8x8 blocks which are not skip.
+ const int cdef_count = av1_cdef_compute_sb_list(
+ mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+ const bool is_fb_on_frm_left_boundary = (fbc == 0);
+ const bool is_fb_on_frm_right_boundary =
+ (fbc + hb_step == cdef_search_ctx->nhfb);
+ const bool is_fb_on_frm_top_boundary = (fbr == 0);
+ const bool is_fb_on_frm_bottom_boundary =
+ (fbr + vb_step == cdef_search_ctx->nvfb);
+ const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary);  // 0 at the frame top: no rows above to copy
+ const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary);  // 0 at the frame left: no columns to copy
+ int dirinit = 0;
+ for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
+ /* We avoid filtering the pixels for which some of the pixels to
+ average are outside the frame. We could change the filter instead,
+ but it would add special cases for any future vectorization. */
+ const int hfilt_size = (nhb << mi_wide_l2[pli]);
+ const int vfilt_size = (nvb << mi_high_l2[pli]);
+ const int ysize =
+ vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff;
+ const int xsize =
+ hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff;
+ const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+ const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+ struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
+ cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+ pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
+ ysize, xsize);
+ fill_borders_for_fbs_on_frame_boundary(
+ inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary,
+ is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary,
+ is_fb_on_frm_bottom_boundary);
+ for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
+ int pri_strength, sec_strength;
+ get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
+ &sec_strength, gi);
+ const uint64_t curr_mse = get_filt_error(
+ cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli],
+ ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count,
+ pli, coeff_shift, bs);
+ if (pli < 2)
+ cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
+ else
+ cdef_search_ctx->mse[1][sb_count][gi] += curr_mse;  // plane 2 is accumulated into the plane 1 entry
+ }
+ }
+ cdef_search_ctx->sb_index[sb_count] =
+ MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;  // same mi_grid_base index as used for mbmi
+}
+
+// MSE calculation at frame level: visits every 64x64 block that is not skipped.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context. error_info: Forwarded to the block-level call.
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info) {
+ // Loop over each sb.
+ for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
+ for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
+ // Checks if cdef processing can be skipped for particular sb.
+ if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
+ // Calculate mse for each sb and store the relevant sb index.
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc,
+ cdef_search_ctx->sb_count);
+ cdef_search_ctx->sb_count++;  // counts only the blocks that were processed
+ }
+ }
+}
+
+// Allocates sb_index, mse[0] and mse[1] of CdefSearchCtx (nvfb * nhfb entries).
+// Inputs:
+// cm: Pointer to top level common structure, passed to CHECK_MEM_ERROR.
+// cdef_search_ctx: Pointer to the CDEF search context (nvfb / nhfb are read).
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) {
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+ CHECK_MEM_ERROR(
+ cm, cdef_search_ctx->sb_index,
+ aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0])));
+ cdef_search_ctx->sb_count = 0;
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+}
+
+// Frees mse[0], mse[1] and sb_index of CdefSearchCtx and resets them to NULL.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters
+// related to CDEF search context. A NULL pointer is accepted (no-op).
+// Returns:
+// Nothing will be returned.
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+ if (cdef_search_ctx) {
+ aom_free(cdef_search_ctx->mse[0]);
+ cdef_search_ctx->mse[0] = NULL;
+ aom_free(cdef_search_ctx->mse[1]);
+ cdef_search_ctx->mse[1] = NULL;
+ aom_free(cdef_search_ctx->sb_index);
+ cdef_search_ctx->sb_index = NULL;
+ }
+}
+
+// Initialize the parameters related to CDEF search context.
+// Inputs:
+// frame: Pointer to compressed frame buffer
+// ref: Pointer to the frame buffer holding the source frame
+// cm: Pointer to top level common structure
+// xd: Pointer to common current coding block structure
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// pick_method: Search method used to select CDEF parameters
+// Returns:
+// Nothing will be returned. cdef_search_ctx and xd->plane will be modified.
+static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd,
+ CdefSearchCtx *cdef_search_ctx,
+ CDEF_PICK_METHOD pick_method) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ cdef_search_ctx->mi_params = &cm->mi_params;
+ cdef_search_ctx->ref = ref;
+ cdef_search_ctx->nvfb =
+ (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;  // rows of 64x64 blocks, rounded up
+ cdef_search_ctx->nhfb =
+ (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;  // columns of 64x64 blocks, rounded up
+ cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+ cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
+ cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
+ cdef_search_ctx->num_planes = num_planes;
+ cdef_search_ctx->pick_method = pick_method;
+ cdef_search_ctx->sb_count = 0;
+ cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth;
+ av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+ num_planes);
+ // Initialize plane wise information.
+ for (int pli = 0; pli < num_planes; pli++) {
+ cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x;
+ cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y;
+ cdef_search_ctx->bsize[pli] =  // 8x8 unit halved in each subsampled direction
+ cdef_search_ctx->ydec[pli]
+ ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+ cdef_search_ctx->mi_wide_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ cdef_search_ctx->mi_high_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+ cdef_search_ctx->plane[pli] = xd->plane[pli];  // struct copy, taken after av1_setup_dst_planes
+ }
+ // Function pointer initialization.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
+ } else {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+ }
+#else
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+#endif
+}
+
+// Derives the frame-level CDEF strengths from the quantizer using fitted
+// quadratic models instead of running a search.
+// Inputs:
+//   cm: Pointer to top level common structure
+//   skip_cdef: When non-zero, two strengths are signaled with one bit per
+//              block (the second strength is zero) and the per-block
+//              cdef_strength values are left untouched.
+//   is_screen_content: When non-zero, the screen content model is used.
+// Returns:
+//   Nothing will be returned. cm->cdef_info is modified and, when skip_cdef
+//   is zero and the mode info grid exists, cdef_strength of the first mode
+//   info of every 64x64 block is reset to 0.
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content) {
+ const int bd = cm->seq_params->bit_depth;
+ // AC quantizer for the base qindex, shifted right by (bd - 8).
+ const int q =
+ av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ // Check the speed feature to avoid extra signaling.
+ if (skip_cdef) {
+ cdef_info->cdef_bits = 1;
+ cdef_info->nb_cdef_strengths = 2;
+ } else {
+ cdef_info->cdef_bits = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ }
+ cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
+
+ // In every model below the *_f1 value is clamped to [0, 15] and the *_f2
+ // value to [0, 3].
+ int predicted_y_f1 = 0;
+ int predicted_y_f2 = 0;
+ int predicted_uv_f1 = 0;
+ int predicted_uv_f2 = 0;
+ if (is_screen_content) {
+ // Screen content model: evaluated in double precision and truncated by the
+ // (int) cast before clamping.
+ predicted_y_f1 =
+ (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02);
+ predicted_y_f2 =
+ (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01);
+ predicted_uv_f1 =
+ (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01);
+ predicted_uv_f2 =
+ (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+0);
+ predicted_y_f1 = clamp(predicted_y_f1, 0, 15);
+ predicted_y_f2 = clamp(predicted_y_f2, 0, 3);
+ predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15);
+ predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3);
+ } else {
+ // Non screen content models: float coefficients, rounded with roundf().
+ // Separate coefficient sets are used for inter and intra-only frames.
+ if (!frame_is_intra_only(cm)) {
+ predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
+ q * 0.0068615186f + 0.02709886f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f +
+ q * 0.0013993345f + 0.03831067f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f +
+ q * 0.0034628846f + 0.00887099f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f +
+ q * 0.00028223585f + 0.05576307f),
+ 0, 3);
+ } else {
+ predicted_y_f1 = clamp(
+ (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f +
+ q * 0.0027798624f + 0.0079405f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f +
+ q * 0.012892405f - 0.00748388f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f +
+ q * 0.00035520183f + 0.00228092f),
+ 0, 3);
+ }
+ }
+ // Pack the two predicted values of each plane type into one strength entry.
+ cdef_info->cdef_strengths[0] =
+ predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2;
+ cdef_info->cdef_uv_strengths[0] =
+ predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
+
+ // mbmi->cdef_strength is already set in the encoding stage. We don't need to
+ // set it again here.
+ if (skip_cdef) {
+ cdef_info->cdef_strengths[1] = 0;
+ cdef_info->cdef_uv_strengths[1] = 0;
+ return;
+ }
+
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
+ // mbmi is NULL when real-time rate control library is used.
+ if (!mbmi) return;
+ // Only one strength is signaled, so every 64x64 block uses index 0.
+ for (int r = 0; r < nvfb; ++r) {
+ for (int c = 0; c < nhfb; ++c) {
+ MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
+ current_mbmi->cdef_strength = 0;
+ }
+ mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
+ }
+}
+
+// Selects the CDEF parameters for the current frame and stores them in
+// cm->cdef_info (cdef_bits, nb_cdef_strengths, cdef_strengths,
+// cdef_uv_strengths, cdef_damping). Unless one of the early-out paths is
+// taken, the strength index chosen for every filtered 64x64 block is written
+// to the mode info grid as well.
+// Inputs:
+//   cpi: Top level encoder structure
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_search(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control;
+
+  assert(cdef_control != CDEF_NONE);
+  if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) {
+    // CDEF is not applied to non reference frames in this mode: signal a
+    // single all-zero strength.
+    CdefInfo *const cdef_info = &cm->cdef_info;
+    cdef_info->nb_cdef_strengths = 1;
+    cdef_info->cdef_bits = 0;
+    cdef_info->cdef_strengths[0] = 0;
+    cdef_info->cdef_uv_strengths[0] = 0;
+    return;
+  }
+
+  // Indicate if external RC is used for testing
+  const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl;
+  if (rtc_ext_rc) {
+    av1_pick_cdef_from_qp(cm, 0, 0);
+    return;
+  }
+  CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method;
+  if (pick_method == CDEF_PICK_FROM_Q) {
+    const int use_screen_content_model =
+        cm->quant_params.base_qindex >
+            AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
+                   cpi->rc.best_quality + 5) &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+    av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb,
+                          use_screen_content_model);
+    return;
+  }
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int damping = 3 + (cm->quant_params.base_qindex >> 6);
+  const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+                    pick_method <= CDEF_FAST_SEARCH_LVL5);
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+  // Zero-initialize the context on first use so that its buffer pointers
+  // (mse[], sb_index) start out as NULL. They are only assigned later by
+  // cdef_alloc_data(); if an allocation bails out through CHECK_MEM_ERROR
+  // before all of them are set, av1_cdef_dealloc_data() must not be handed
+  // indeterminate pointers.
+  if (!cpi->cdef_search_ctx)
+    CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx,
+                    aom_calloc(1, sizeof(*cpi->cdef_search_ctx)));
+  CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx;
+
+  // Initialize parameters related to CDEF search context.
+  cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx,
+                   pick_method);
+  // Allocate CDEF search context buffers.
+  cdef_alloc_data(cm, cdef_search_ctx);
+  // Frame level mse calculation.
+  if (cpi->mt_info.num_workers > 1) {
+    av1_cdef_mse_calc_frame_mt(cpi);
+  } else {
+    cdef_mse_calc_frame(cdef_search_ctx, cm->error);
+  }
+
+  /* Search for different number of signaling bits. */
+  int nb_strength_bits = 0;
+  uint64_t best_rd = UINT64_MAX;
+  CdefInfo *const cdef_info = &cm->cdef_info;
+  int sb_count = cdef_search_ctx->sb_count;
+  uint64_t(*mse[2])[TOTAL_STRENGTHS];
+  mse[0] = cdef_search_ctx->mse[0];
+  mse[1] = cdef_search_ctx->mse[1];
+  /* Calculate the maximum number of bits required to signal CDEF strengths at
+   * block level */
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  const int joint_strengths =
+      num_planes > 1 ? total_strengths * total_strengths : total_strengths;
+  const int max_signaling_bits =
+      joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
+  int rdmult = cpi->td.mb.rdmult;
+  for (int i = 0; i <= 3; i++) {
+    if (i > max_signaling_bits) break;
+    int best_lev0[CDEF_MAX_STRENGTHS];
+    int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+    const int nb_strengths = 1 << i;
+    uint64_t tot_mse;
+    if (num_planes > 1) {
+      tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+                                           mse, sb_count, pick_method);
+    } else {
+      tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
+                                      pick_method);
+    }
+
+    // Rate: i bits per filtered block plus the frame-level strength table.
+    const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS *
+                                              (num_planes > 1 ? 2 : 1);
+    const int rate_cost = av1_cost_literal(total_bits);
+    const uint64_t dist = tot_mse * 16;
+    const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+    if (rd < best_rd) {
+      best_rd = rd;
+      nb_strength_bits = i;
+      memcpy(cdef_info->cdef_strengths, best_lev0,
+             nb_strengths * sizeof(best_lev0[0]));
+      if (num_planes > 1) {
+        memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+               nb_strengths * sizeof(best_lev1[0]));
+      }
+    }
+  }
+
+  cdef_info->cdef_bits = nb_strength_bits;
+  cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+  // For every filtered block pick the signaled strength pair with the lowest
+  // combined mse.
+  for (int i = 0; i < sb_count; i++) {
+    uint64_t best_mse = UINT64_MAX;
+    int best_gi = 0;
+    for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+      uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
+      if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+      if (curr < best_mse) {
+        best_gi = gi;
+        best_mse = curr;
+      }
+    }
+    mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength =
+        best_gi;
+  }
+  if (fast) {
+    // The fast search levels work on reduced strength indices; convert them
+    // in place with STORE_CDEF_FILTER_STRENGTH.
+    for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+      const int luma_strength = cdef_info->cdef_strengths[j];
+      const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+      int pri_strength, sec_strength;
+
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method,
+                                 luma_strength);
+      STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method,
+                                 chroma_strength);
+    }
+  }
+
+  cdef_info->cdef_damping = damping;
+  // Deallocate CDEF search context buffers.
+  av1_cdef_dealloc_data(cdef_search_ctx);
+}
diff --git a/third_party/aom/av1/encoder/pickcdef.h b/third_party/aom/av1/encoder/pickcdef.h
new file mode 100644
index 0000000000..192e734fb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKCDEF_H_
+#define AOM_AV1_ENCODER_PICKCDEF_H_
+
+#include "av1/common/cdef.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\enum CDEF_CONTROL
+ * \brief This enum controls to which frames CDEF is applied.
+ */
+typedef enum {
+ CDEF_NONE = 0, /*!< Disable CDEF on all frames. */
+ CDEF_ALL = 1, /*!< Enable CDEF for all frames. */
+ CDEF_REFERENCE = 2, /*!< Disable CDEF on non reference frames. */
+} CDEF_CONTROL;
+
+/*!\cond */
+struct MultiThreadInfo;
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+#define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
+#define REDUCED_PRI_STRENGTHS_LVL4 2
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+ (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL3 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL4 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3,
+ 5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
+static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
+static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+ TOTAL_STRENGTHS,
+ REDUCED_TOTAL_STRENGTHS_LVL1,
+ REDUCED_TOTAL_STRENGTHS_LVL2,
+ REDUCED_TOTAL_STRENGTHS_LVL3,
+ REDUCED_TOTAL_STRENGTHS_LVL4,
+ REDUCED_TOTAL_STRENGTHS_LVL5,
+ TOTAL_STRENGTHS
+};
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src,
+ int src_voffset, int src_hoffset, int sstride,
+ int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col);
+
+/*! \brief CDEF search context.
+ */
+typedef struct {
+ /*!
+ * Pointer to the frame buffer holding the source frame
+ */
+ const YV12_BUFFER_CONFIG *ref;
+ /*!
+ * Pointer to params related to MB_MODE_INFO arrays and related info
+ */
+ CommonModeInfoParams *mi_params;
+ /*!
+ * Info specific to each plane
+ */
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+ /*!
+ * Function pointer of copy_fn
+ */
+ copy_fn_t copy_fn;
+ /*!
+ * Function pointer of compute_cdef_dist_fn
+ */
+ compute_cdef_dist_t compute_cdef_dist_fn;
+ /*!
+ * Number of strengths evaluated in CDEF filter search
+ */
+ int total_strengths;
+ /*!
+ * Bit-depth dependent shift
+ */
+ int coeff_shift;
+ /*!
+ * CDEF damping factor
+ */
+ int damping;
+ /*!
+ * Search method used to select CDEF parameters
+ */
+ int pick_method;
+ /*!
+ * Number of planes
+ */
+ int num_planes;
+ /*!
+ * Log2 of width of the MI unit in pixels. mi_wide_l2[i]
+ * indicates the width of the MI unit in pixels for the ith plane
+ */
+ int mi_wide_l2[MAX_MB_PLANE];
+ /*!
+ * Log2 of height of the MI unit in pixels. mi_high_l2[i]
+ * indicates the height of the MI unit in pixels for the ith plane
+ */
+ int mi_high_l2[MAX_MB_PLANE];
+ /*!
+ * Subsampling in x direction. xdec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int xdec[MAX_MB_PLANE];
+ /*!
+ * Subsampling in y direction. ydec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int ydec[MAX_MB_PLANE];
+ /*!
+ * bsize[i] indicates the block size of ith plane
+ */
+ int bsize[MAX_MB_PLANE];
+ /*!
+ * Number of 64x64 blocks in vertical direction of a frame
+ */
+ int nvfb;
+ /*!
+ * Number of 64x64 blocks in horizontal direction of a frame
+ */
+ int nhfb;
+ /*!
+ * Pointer to the mean squared error between the CDEF filtered block and the
+ * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds
+ * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength
+ * index
+ */
+ uint64_t (*mse[2])[TOTAL_STRENGTHS];
+ /*!
+ * Holds the position (in units of mi's) of the cdef filtered
+ * block in raster scan order
+ */
+ int *sb_index;
+ /*!
+ * Holds the count of cdef filtered blocks
+ */
+ int sb_count;
+ /*!
+ * Indicates if 16bit frame buffers are to be used i.e., the content bit-depth
+ * is > 8-bit
+ */
+ bool use_highbitdepth;
+} CdefSearchCtx;
+
+// Returns 1 when every mode info unit of the 64x64 block starting at
+// (mi_row, mi_col) has skip_txfm set, and 0 otherwise. The scanned area is
+// clipped to mi_params->mi_rows / mi_params->mi_cols.
+static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
+                              int mi_row, int mi_col) {
+  const int rows = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+  const int cols = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+  MB_MODE_INFO **row_ptr =
+      mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col;
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      if (!row_ptr[c]->skip_txfm) return 0;
+    }
+    row_ptr += mi_params->mi_stride;
+  }
+  return 1;
+}
+
+// Checks if cdef processing can be skipped for particular sb.
+// Inputs:
+// mi_params: Pointer to params related to MB_MODE_INFO arrays and
+// related info.
+// fbr: Row index in units of 64x64 block
+// fbc: Column index in units of 64x64 block
+// Returns:
+// 1/0 will be returned to indicate skip/don't skip cdef processing of sb
+// respectively.
+static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
+ int fbr, int fbc) {
+ // Mode info of the top-left unit of this 64x64 block.
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+ // No filtering if the entire filter block is skipped.
+ if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+ return 1;
+ // Skip odd numbered 64x64 block rows(cols) when bsize is BLOCK_128X128,
+ // BLOCK_64X128(BLOCK_128X128, BLOCK_128X64) as for such blocks CDEF filtering
+ // is done at the corresponding block sizes.
+ if (((fbc & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) ||
+ ((fbr & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128)))
+ return 1;
+ return 0;
+}
+
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count);
+/*!\endcond */
+
+/*!\brief AV1 CDEF parameter search
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Searches for optimal CDEF parameters for frame
+ *
+ * \param[in,out] cpi Top level encoder structure
+ *
+ * \remark Nothing is returned. Instead, optimal CDEF parameters are stored
+ * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
+ * \arg \c cdef_bits: Bits of strength parameters
+ * \arg \c nb_cdef_strengths: Number of strength parameters
+ * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the luma plane.
+ * \arg \c cdef_uv_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the chroma planes.
+ * \arg \c cdef_damping: CDEF damping factor.
+ *
+ */
+void av1_cdef_search(struct AV1_COMP *cpi);
+
+/*!\brief AV1 CDEF level from QP
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode.
+ *
+ * \param[in,out] cm Pointer to top level common structure
+ * \param[in] skip_cdef Flag to skip CDEF filtering
+ * \param[in] is_screen_content Flag indicating screen content
+ *
+ */
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_PICKCDEF_H_
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 0000000000..9084d3f13a
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+// Copies one plane from src_bc to dst_bc: plane 0 uses aom_yv12_copy_y(),
+// plane 1 aom_yv12_copy_u() and plane 2 aom_yv12_copy_v(). Any other plane
+// index trips the assert and copies nothing.
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+                            YV12_BUFFER_CONFIG *dst_bc, int plane) {
+  if (plane == 0) {
+    aom_yv12_copy_y(src_bc, dst_bc);
+  } else if (plane == 1) {
+    aom_yv12_copy_u(src_bc, dst_bc);
+  } else if (plane == 2) {
+    aom_yv12_copy_v(src_bc, dst_bc);
+  } else {
+    assert(plane >= 0 && plane <= 2);
+  }
+}
+
+// Returns the upper bound for the loop filter level. The bound is reduced to
+// three quarters of MAX_LOOP_FILTER only in the two-pass stat consumption
+// stage when section_intra_rating is above 8.
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+  const int reduce_max_level =
+      is_stat_consumption_stage_twopass(cpi) &&
+      cpi->ppi->twopass.section_intra_rating > 8;
+  return reduce_max_level ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER;
+}
+
+// Loop filters one plane of the current frame with a trial level and returns
+// the SSE of the filtered plane against sd. The plane is restored from
+// cpi->last_frame_uf before returning; the trial level stays in cm->lf.
+// For plane 0, dir selects which level is replaced: 0 updates only
+// filter_level[0], 1 updates only filter_level[1], and any other value
+// updates both. dir is ignored for planes 1 and 2.
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                AV1_COMP *const cpi, int filt_level,
+                                int partial_frame, int plane, int dir) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  const int lpf_workers = mt_info->num_mod_workers[MOD_LPF];
+
+  assert(plane >= 0 && plane <= 2);
+
+  // set base filters for use of av1_get_filter_level when in DELTA_LF mode
+  if (plane == 0) {
+    if (dir != 1) lf->filter_level[0] = filt_level;
+    if (dir != 0) lf->filter_level[1] = filt_level;
+  } else if (plane == 1) {
+    lf->filter_level_u = filt_level;
+  } else if (plane == 2) {
+    lf->filter_level_v = filt_level;
+  }
+
+  // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+  const int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf);
+
+  av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
+                           plane + 1, partial_frame, mt_info->workers,
+                           lpf_workers, &mt_info->lf_row_sync, lpf_opt_level);
+
+  const int64_t filt_err = aom_get_sse_plane(
+      sd, &cm->cur_frame->buf, plane, cm->seq_params->use_highbitdepth);
+
+  // Re-instate the unfiltered frame
+  yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
+
+  return filt_err;
+}
+
+// Searches for the loop filter level of one plane by repeatedly trying
+// levels around the current candidate and shrinking the step size.
+// Inputs:
+//   sd: Frame buffer the filtered plane is compared against
+//   cpi: Top-level encoder structure
+//   partial_frame: Forwarded unchanged to try_filter_frame()
+//   last_frame_filter_level: Levels used to seed the search, indexed as
+//                            { Y filter_level[0], Y filter_level[1], U, V }
+//   plane: Plane to search (0, 1 or 2)
+//   dir: For plane 0, 0 or 1 seeds from and searches that filter_level
+//        entry, 2 seeds from the rounded average of both entries. Planes 1
+//        and 2 do not use dir for seeding.
+// Returns:
+//   The selected filter level, or 0 when plane or dir is out of range.
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame,
+ const int *last_frame_filter_level, int plane,
+ int dir) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int lvl;
+ switch (plane) {
+ case 0:
+ switch (dir) {
+ case 2:
+ lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
+ 1;
+ break;
+ case 0:
+ case 1: lvl = last_frame_filter_level[dir]; break;
+ default: assert(dir >= 0 && dir <= 2); return 0;
+ }
+ break;
+ case 1: lvl = last_frame_filter_level[2]; break;
+ case 2: lvl = last_frame_filter_level[3]; break;
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search;
+ assert(use_coarse_search <= 1);
+ static const int min_filter_step_lookup[2] = { 0, 2 };
+ // min_filter_step_thesh determines the stopping criteria for the search.
+ // The search is terminated once filter_step is no longer greater than
+ // min_filter_step_thesh.
+ const int min_filter_step_thesh = min_filter_step_lookup[use_coarse_search];
+
+ // Set each entry to -1; a negative entry marks a level not yet evaluated.
+ memset(ss_err, 0xFF, sizeof(ss_err));
+ // Keep a copy of the unfiltered plane so each trial can be undone.
+ yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > min_filter_step_thesh) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if ((is_stat_consumption_stage_twopass(cpi)) &&
+ (cpi->ppi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20;
+
+ // Bias less for large block size: halve the bias unless the frame is
+ // restricted to 4x4 transforms.
+ if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] =
+ try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
+ }
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] =
+ try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
+ }
+ // If value is significantly better than previous best, bias added against
+ // raising filter value
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last
+ // time; otherwise keep moving in the direction that improved.
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ return filt_best;
+}
+
+// Selects the loop filter levels for the current frame and stores them in
+// cm->lf (filter_level[0], filter_level[1], filter_level_u, filter_level_v).
+// Inputs:
+//   sd: Frame buffer the filtered reconstruction is compared against when a
+//       search based method is used
+//   cpi: Top-level encoder structure
+//   method: The method used to select filter levels
+// Returns:
+//   Nothing will be returned. When the loop filter is disabled for the frame
+//   or method is LPF_PICK_MINIMAL_LPF only the two luma levels are set (to 0).
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                           LPF_PICK_METHOD method) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  struct loopfilter *const lf = &cm->lf;
+  int disable_filter_rt_screen = 0;
+  (void)sd;
+
+  lf->sharpness_level = 0;
+
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->sf.rt_sf.skip_lf_screen)
+    disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi);
+
+  if (disable_filter_rt_screen ||
+      cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE ||
+      (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE &&
+       cpi->ppi->rtc_ref.non_reference_frame)) {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+    return;
+  }
+
+  if (method == LPF_PICK_MINIMAL_LPF) {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+  } else if (method >= LPF_PICK_FROM_Q) {
+    const int min_filter_level = 0;
+    const int max_filter_level = av1_get_max_filter_level(cpi);
+    const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+                                   seq_params->bit_depth);
+    // based on tests result for rtc test set
+    // 0.04590 boosted or 0.02295 non-boosted in 18-bit fixed point
+    const int strength_boost_q_threshold = 0;
+    int inter_frame_multiplier =
+        (q > strength_boost_q_threshold ||
+         (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+          cpi->common.width * cpi->common.height > 352 * 288))
+            ? 12034
+            : 6017;
+    // Increase strength on base TL0 for temporal layers, for low-resoln,
+    // based on frame source_sad.
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id == 0 &&
+        cpi->common.width * cpi->common.height <= 352 * 288 &&
+        cpi->sf.rt_sf.use_nonrd_pick_mode) {
+      if (cpi->rc.frame_source_sad > 100000)
+        inter_frame_multiplier = inter_frame_multiplier << 1;
+      else if (cpi->rc.frame_source_sad > 50000)
+        inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1);
+    }
+    // These values were determined by linear fitting the result of the
+    // searched level for 8 bit depth:
+    // Keyframes: filt_guess = q * 0.06699 - 1.60817
+    // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
+    //
+    // And high bit depth separately:
+    // filt_guess = q * 0.316206 + 3.87252
+    int filt_guess;
+    switch (seq_params->bit_depth) {
+      case AOM_BITS_8:
+        filt_guess =
+            (cm->current_frame.frame_type == KEY_FRAME)
+                ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+                : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
+        break;
+      case AOM_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case AOM_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 &&
+               "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+               "or AOM_BITS_12");
+        return;
+    }
+    if (seq_params->bit_depth != AOM_BITS_8 &&
+        cm->current_frame.frame_type == KEY_FRAME)
+      filt_guess -= 4;
+    // TODO(chengchen): retrain the model for Y, U, V filter levels
+    lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
+    lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
+    lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
+    lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
+    if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+        !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) {
+      if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+        lf->filter_level[0] = 0;
+        lf->filter_level[1] = 0;
+      } else {
+        const int num4x4 = (cm->width >> 2) * (cm->height >> 2);
+        const int newmv_thresh = 7;
+        const int distance_since_key_thresh = 5;
+        // num4x4 is 0 when either frame dimension is below 4 pixels. Skip the
+        // percentage test in that case instead of dividing by zero, and keep
+        // the level estimated above.
+        if (num4x4 > 0 &&
+            (cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) <
+                newmv_thresh &&
+            cpi->rc.frames_since_key > distance_since_key_thresh) {
+          lf->filter_level[0] = 0;
+          lf->filter_level[1] = 0;
+        }
+      }
+    }
+  } else {
+    // Search based methods: seed the search with the previous levels unless
+    // the frame is intra only.
+    int last_frame_filter_level[4] = { 0 };
+    if (!frame_is_intra_only(cm)) {
+      last_frame_filter_level[0] = cpi->ppi->filter_level[0];
+      last_frame_filter_level[1] = cpi->ppi->filter_level[1];
+      last_frame_filter_level[2] = cpi->ppi->filter_level_u;
+      last_frame_filter_level[3] = cpi->ppi->filter_level_v;
+    }
+    // The frame buffer last_frame_uf is used to store the non-loop filtered
+    // reconstructed frame in search_filter_level().
+    if (aom_realloc_frame_buffer(
+            &cpi->last_frame_uf, cm->width, cm->height,
+            seq_params->subsampling_x, seq_params->subsampling_y,
+            seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+            cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate last frame buffer");
+
+    // First search a single luma level shared by both directions, then
+    // (unless the non-dual method is selected) refine each direction.
+    lf->filter_level[0] = lf->filter_level[1] =
+        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                            last_frame_filter_level, 0, 2);
+    if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
+      lf->filter_level[0] =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, 0, 0);
+      lf->filter_level[1] =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, 0, 1);
+    }
+
+    if (num_planes > 1) {
+      lf->filter_level_u =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, 1, 0);
+      lf->filter_level_v =
+          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+                              last_frame_filter_level, 2, 0);
+    }
+  }
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 0000000000..f567937c32
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+/*!\brief Algorithm for AV1 loop filter level selection.
+ *
+ * \ingroup in_loop_filter
+ * This function determines proper filter levels used for in-loop filter
+ * (deblock filter).
+ *
+ * \param[in] sd The pointer of frame buffer
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] method The method used to select filter levels
+ *
+ * \par
+ * method includes:
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values.
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search
+ * with non-dual filter only.
+ * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with
+ * different values.
+ * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type
+ * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
+ * frame
+ *
+ * \remark Nothing is returned. Instead, filter levels below are stored in the
+ * "loopfilter" structure inside "cpi":
+ * \arg \c filter_level[0]: the vertical filter level for Y plane
+ * \arg \c filter_level[1]: the horizontal filter level for Y plane
+ * \arg \c filter_level_u: the filter level for U plane
+ * \arg \c filter_level_v: the filter level for V plane
+ *
+ * \n
+ * \b Overview
+ * \par
+ * The workflow of deblock filter is shown in Fig.1. \n
+ * Boundary pixels pass through a non-flatness check, followed by a step that
+ * determines smoothness and selects proper types of filters
+ * (4-, 6-, 8-, 14-tap filter). \n
+ * If the non-flatness criterion is not satisfied, the encoder will not apply
+ * deblock filtering on these boundary pixels.
+ * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70%
+ *
+ * \par
+ * The non-flatness is determined by the boundary pixels and thresholds as shown
+ * in Fig.2. \n
+ * Filtering is applied when \n
+ * \f$|p_0-p_1|<thr_1\f$ and \f$|q_0-q_1|<thr_1\f$ and
+ * \f$2*|p_0-q_0|+|p_1-q_1|/2<thr_2\f$ \n
+ * \image html filter_thr.png "Fig.2. Non-flatness of pixel boundary" height=40%
+ *
+ * \par
+ * Thresholds ("thr_1" and "thr_2") are determined by the filter level. \n
+ * In AV1, for each frame, we employ the four filter levels, based on these
+ * observations: \n
+ * Luma and chroma planes have different characteristics, including subsampling
+ * (different plane size), coding quality (chroma planes are better coded). \n
+ * Therefore chroma planes need less deblocking filtering than luma plane. \n
+ * In addition, content texture has different spatial characteristics: vertical
+ * and horizontal direction may need different level of filtering. \n
+ * The selection of these filter levels is described in the following section.
+ *
+ * \par
+ * \b Algorithm
+ * \par
+ * The encoder selects filter levels given the current frame buffer, and the
+ * method. \n
+ * By default, "LPF_PICK_FROM_FULL_IMAGE" is used, which should provide
+ * the most appropriate filter levels. \n
+ * For video on demand (VOD) mode, if speed setting is larger than 5,
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" is used. \n
+ * For real-time mode, if speed setting is larger than 5, "LPF_PICK_FROM_Q" is
+ * used.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE" method: determine filter levels sequentially
+ * by a filter level search procedure (function "search_filter_level"). \n
+ * The order is: \n
+ * First search and determine the filter level for Y plane.
+ * Let vertical filter level (filter_level[0]) and the horizontal filter level
+ * (filter_level[1]) be equal to it. \n
+ * Keep the horizontal filter level the same and search and determine the
+ * vertical filter level. \n
+ * Search and determine the horizontal filter level. \n
+ * Search and determine filter level for U plane. \n
+ * Search and determine filter level for V plane.
+ *
+ * \par
+ * Search and determine filter level is fulfilled by function
+ * "search_filter_level". \n
+ * It starts with a base filter level ("filt_mid") initialized by the
+ * corresponding last frame's filter level. \n
+ * A filter step ("filter_step") is determined as:
+ * filter_step = filt_mid < 16 ? 4 : filt_mid / 4. \n
+ * Then a modified binary search strategy is employed to find a proper
+ * filter level. \n
+ * In each iteration, set filt_low = filt_mid - filter_step,
+ * filt_high = filt_mid + filter_step. \n
+ * We now have three candidate levels, "filt_mid", "filt_low" and "filt_high".
+ * \n
+ * Deblock filtering is applied on the current frame with candidate filter
+ * levels and the sum of squared error (SSE) between source and filtered frame
+ * is computed. \n
+ * Set "filt_best" to the filter level of the smallest SSE. If "filt_best"
+ * equals "filt_mid", halve the filter_step. Otherwise, set filt_mid =
+ * filt_best. \n
+ * Go to the next iteration until "filter_step" is 0. \n
+ * Note that in the comparison of SSEs between SSE[filt_low] and SSE[filt_mid],
+ * a "bias" is introduced to slightly raise the filter level. \n
+ * It is based on the observation that low filter levels tend to yield a smaller
+ * SSE and produce a higher PSNR for the current frame, \n
+ * while oversmoothing it and degrading the quality of prediction for future
+ * frames and leading to a suboptimal performance overall. \n
+ * Function "try_filter_frame" is the reference for applying deblock filtering
+ * with a given filter level and computation of SSE.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" method: almost the same as
+ * "LPF_PICK_FROM_FULL_IMAGE", \n
+ * just without separately searching for appropriate filter levels for vertical
+ * and horizontal filters.
+ *
+ * \par
+ * "LPF_PICK_FROM_Q" method: filter levels are determined by the
+ * quantization factor (q). \n
+ * For 8 bit: \n
+ * Keyframes: filt_guess = q * 0.06699 - 1.60817 \n
+ * Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 \n
+ * inter_frame_multiplier = q > 700 ? 0.04590 : 0.02295 \n
+ * For 10 bit and 12 bit: \n
+ * filt_guess = q * 0.316206 + 3.87252 \n
+ * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v =
+ * clamp(filt_guess, min_filter_level, max_filter_level) \n
+ * Where min_filter_level = 0, max_filter_level = 64 \n
+ * The equations were determined by linear fitting using filter levels
+ * generated by "LPF_PICK_FROM_FULL_IMAGE" method.
+ *
+ */
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 0000000000..6429064175
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,2217 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 5
+
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
+
+// Working precision for Wiener filter coefficients
+#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16)
+
+// Index bounds and candidate counts for groups of self-guided filter
+// parameter-set indices (presumably the "ep" value that indexes
+// av1_sgr_params -- TODO confirm).
+// NOTE(review): the search code that consumes these tables lies outside this
+// chunk; confirm the exact group semantics there.
+#define SGRPROJ_EP_GRP1_START_IDX 0
+#define SGRPROJ_EP_GRP1_END_IDX 9
+#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4
+#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2
+// Four evenly spaced indices covering the group-1 range [0, 9].
+static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6,
+ 9 };
+// Two rows of 14 entries: the first holds indices 10..13 and is padded with
+// -1, the second holds indices 14..15.
+static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = {
+ { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 },
+ { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
+};
+
+// Debug-only storage, compiled in only when DEBUG_LR_COSTING is non-zero:
+// one RestorationUnitInfo slot per restoration type, per plane and per
+// restoration-unit position (MAX_LR_UNITS_W * MAX_LR_UNITS_H of them).
+#if DEBUG_LR_COSTING
+RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+// Function-pointer type taking two frame buffers and returning a signed
+// 64-bit value (an SSE, going by the name).
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+// Same, but restricted to the rectangle that starts at (hstart, vstart) and
+// spans width x height. Used for the sse_part_extractors[] table.
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ int hstart, int width, int vstart,
+ int height);
+// Single-buffer counterpart returning an unsigned 64-bit value for the same
+// kind of rectangle. Used for the var_part_extractors[] table.
+typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ int hstart, int width, int vstart,
+ int height);
+
+// Size of the extractor tables below: three planes (Y, U, V), doubled when
+// high-bitdepth support is compiled in.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_EXTRACTORS (3 * (1 + 1))
+#else
+#define NUM_EXTRACTORS 3
+#endif
+// Both tables are indexed with "3 * highbd + plane" by sse_restoration_unit()
+// and var_restoration_unit(): entries 0..2 are the low-bitdepth Y/U/V
+// routines, entries 3..5 (when present) the high-bitdepth ones.
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_sse_part, aom_get_u_sse_part,
+ aom_get_v_sse_part,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part,
+ aom_highbd_get_v_sse_part,
+#endif
+};
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_var, aom_get_u_var, aom_get_v_var,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+#endif
+};
+
+// Returns the value produced by the plane/bit-depth specific SSE extractor
+// for the rectangle described by `limits`, comparing `src` against `dst`.
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+                                    const YV12_BUFFER_CONFIG *src,
+                                    const YV12_BUFFER_CONFIG *dst, int plane,
+                                    int highbd) {
+  // Low-bitdepth extractors occupy slots 0..2, high-bitdepth ones follow.
+  const sse_part_extractor_type extract_sse =
+      sse_part_extractors[3 * highbd + plane];
+  return extract_sse(src, dst, limits->h_start,
+                     limits->h_end - limits->h_start, limits->v_start,
+                     limits->v_end - limits->v_start);
+}
+
+// Returns the value produced by the plane/bit-depth specific variance
+// extractor for the rectangle of `src` described by `limits`.
+static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
+                                     const YV12_BUFFER_CONFIG *src, int plane,
+                                     int highbd) {
+  // Low-bitdepth extractors occupy slots 0..2, high-bitdepth ones follow.
+  const var_part_extractor_type extract_var =
+      var_part_extractors[3 * highbd + plane];
+  return extract_var(src, limits->h_start, limits->h_end - limits->h_start,
+                     limits->v_start, limits->v_end - limits->v_start);
+}
+
+// Per-plane working state shared by the restoration search helpers in this
+// file. The pointer/geometry fields are filled in by init_rsc(), the running
+// totals are cleared by reset_rsc(), and the four reference parameter sets
+// are reset to defaults by rsc_on_tile().
+typedef struct {
+ // Source frame, and the frame that receives filtered output (written by
+ // try_restoration_unit() and then compared against `src`).
+ const YV12_BUFFER_CONFIG *src;
+ YV12_BUFFER_CONFIG *dst;
+
+ const AV1_COMMON *cm;
+ const MACROBLOCK *x;
+ // Plane being searched and its dimensions as reported by
+ // av1_get_upsampled_plane_size() in init_rsc().
+ int plane;
+ int plane_w;
+ int plane_h;
+ RestUnitSearchInfo *rusi;
+
+ // Speed features
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf;
+
+ // Plane buffer/stride of cm->cur_frame->buf (dgd) and of `src`, cached by
+ // init_rsc().
+ uint8_t *dgd_buffer;
+ int dgd_stride;
+ const uint8_t *src_buffer;
+ int src_stride;
+
+ // SSE values for each restoration mode for the current RU
+ // These are saved by each search function for use in search_switchable()
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+ // This flag will be set based on the speed feature
+ // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+ uint8_t skip_sgr_eval;
+
+ // Total rate and distortion so far for each restoration type
+ // These are initialised by reset_rsc in search_rest_type
+ int64_t total_sse[RESTORE_TYPES];
+ int64_t total_bits[RESTORE_TYPES];
+
+ // Reference parameters for delta-coding
+ //
+ // For each restoration type, we need to store the latest parameter set which
+ // has been used, so that we can properly cost up the next parameter set.
+ // Note that we have two sets of these - one for the single-restoration-mode
+ // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ)
+ // and one for the switchable mode. This is because these two cases can lead
+ // to different sets of parameters being signaled, but we don't know which
+ // we will pick for sure until the end of the search process.
+ WienerInfo ref_wiener;
+ SgrprojInfo ref_sgrproj;
+ WienerInfo switchable_ref_wiener;
+ SgrprojInfo switchable_ref_sgrproj;
+
+ // Buffers used to hold dgd-avg and src-avg data respectively during SIMD
+ // call of Wiener filter.
+ int16_t *dgd_avg;
+ int16_t *src_avg;
+} RestSearchCtxt;
+
+// Callback taking an opaque RestSearchCtxt pointer: puts all four
+// delta-coding reference parameter sets back to their default values.
+static AOM_INLINE void rsc_on_tile(void *priv) {
+  RestSearchCtxt *const ctxt = (RestSearchCtxt *)priv;
+
+  // References used by the single-restoration-type searches.
+  set_default_wiener(&ctxt->ref_wiener);
+  set_default_sgrproj(&ctxt->ref_sgrproj);
+
+  // References used by the switchable search.
+  set_default_wiener(&ctxt->switchable_ref_wiener);
+  set_default_sgrproj(&ctxt->switchable_ref_sgrproj);
+}
+
+// Clears the accumulated distortion and rate totals of every restoration
+// type.
+static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
+  for (int type = 0; type < RESTORE_TYPES; ++type) {
+    rsc->total_sse[type] = 0;
+    rsc->total_bits[type] = 0;
+  }
+}
+
+// Populates `rsc` for a search over `plane`: stores the caller-supplied
+// pointers, then caches the plane dimensions and the plane buffer/stride of
+// both the source frame and the current frame buffer (cm->cur_frame->buf).
+static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
+                                const AV1_COMMON *cm, const MACROBLOCK *x,
+                                const LOOP_FILTER_SPEED_FEATURES *lpf_sf,
+                                int plane, RestUnitSearchInfo *rusi,
+                                YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
+  // Caller-supplied context.
+  rsc->cm = cm;
+  rsc->x = x;
+  rsc->src = src;
+  rsc->dst = dst;
+  rsc->plane = plane;
+  rsc->rusi = rusi;
+  rsc->lpf_sf = lpf_sf;
+
+  const YV12_BUFFER_CONFIG *const cur_buf = &cm->cur_frame->buf;
+  const int chroma = (plane != AOM_PLANE_Y);
+
+  int width, height;
+  av1_get_upsampled_plane_size(cm, chroma, &width, &height);
+
+  // Source and current frame must both match the reported plane size.
+  assert(width == src->crop_widths[chroma]);
+  assert(height == src->crop_heights[chroma]);
+  assert(src->crop_widths[chroma] == cur_buf->crop_widths[chroma]);
+  assert(src->crop_heights[chroma] == cur_buf->crop_heights[chroma]);
+
+  rsc->plane_w = width;
+  rsc->plane_h = height;
+
+  // Buffers are indexed per plane, strides per luma/chroma class.
+  rsc->src_buffer = src->buffers[plane];
+  rsc->src_stride = src->strides[chroma];
+  rsc->dgd_buffer = cur_buf->buffers[plane];
+  rsc->dgd_stride = cur_buf->strides[chroma];
+}
+
+// Filters one restoration unit of the current frame buffer into rsc->dst
+// using the parameters in `rui`, then returns the result of
+// sse_restoration_unit() between rsc->src and rsc->dst over the same limits.
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+                                    const RestorationTileLimits *limits,
+                                    const RestorationUnitInfo *rui) {
+  const AV1_COMMON *const cm = rsc->cm;
+  const int plane = rsc->plane;
+  const int chroma = plane > 0;
+  const RestorationInfo *const plane_rst = &cm->rst_info[plane];
+  const YV12_BUFFER_CONFIG *const cur_buf = &cm->cur_frame->buf;
+  const int bit_depth = cm->seq_params->bit_depth;
+  const int highbd = cm->seq_params->use_highbitdepth;
+  // Subsampling only applies to the chroma planes.
+  const int ss_x = chroma && cm->seq_params->subsampling_x;
+  const int ss_y = chroma && cm->seq_params->subsampling_y;
+  RestorationLineBuffers line_bufs;
+
+  // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
+  // also used in encoder.
+  const int optimized_lr = 0;
+
+  av1_loop_restoration_filter_unit(
+      limits, rui, &plane_rst->boundaries, &line_bufs, rsc->plane_w,
+      rsc->plane_h, ss_x, ss_y, highbd, bit_depth, cur_buf->buffers[plane],
+      cur_buf->strides[chroma], rsc->dst->buffers[plane],
+      rsc->dst->strides[chroma], cm->rst_tmpbuf, optimized_lr, cm->error);
+
+  return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
+}
+
+// C implementation of the 8-bit self-guided projection error.
+//
+// For every pixel of the width x height block, the degraded sample dat8 is
+// combined with the filter outputs flt0/flt1 using the weights xq[0]/xq[1],
+// and the squared difference to the source sample src8 is accumulated.
+// Which filter outputs take part is selected by params->r[0] / params->r[1];
+// when both radii are zero the result is the plain sum of squared
+// differences between dat8 and src8.
+//
+// Returns the accumulated squared error as a 64-bit value.
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ // Both filters enabled: both correction terms contribute.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ // u is the degraded pixel scaled up by SGRPROJ_RST_BITS; v adds the
+ // weighted filter corrections at SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS
+ // of extra precision, which is rounded away before subtracting src.
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ // Only the first filter is enabled: flt1 is never read.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ // Only the second filter is enabled: flt0 is never read.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ // No filter enabled: plain squared difference between dat and src.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth C implementation of the self-guided projection error.
+//
+// src8 and dat8 are converted with CONVERT_TO_SHORTPTR and read as 16-bit
+// samples. For each pixel the weighted filter corrections are rounded (by
+// adding `half` before the right shift) and added to the degraded sample,
+// and the squared difference to the source sample is accumulated.
+// params->r[0] / params->r[1] select which of flt0 / flt1 contribute; with
+// both radii zero the result is the plain sum of squared differences.
+//
+// Returns the accumulated squared error as a 64-bit value.
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int i, j;
+ int64_t err = 0;
+ // Rounding offset for the final right shift.
+ const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ // Both filters enabled.
+ int xq0 = xq[0];
+ int xq1 = xq[1];
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v0 = flt0[j] - u;
+ int32_t v1 = flt1[j] - u;
+ int32_t v = half;
+ v += xq0 * v0;
+ v += xq1 * v1;
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ src += src_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ // Exactly one filter enabled: pick its weight, buffer and stride once,
+ // then run a single shared loop.
+ int exq;
+ int32_t *flt;
+ int flt_stride;
+ if (params->r[0] > 0) {
+ exq = xq[0];
+ flt = flt0;
+ flt_stride = flt0_stride;
+ } else {
+ exq = xq[1];
+ flt = flt1;
+ flt_stride = flt1_stride;
+ }
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v = half;
+ v += exq * (flt[j] - u);
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ }
+ } else {
+ // No filter enabled: plain squared difference between dat and src.
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t e = d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Decodes the coded projection pair `xqd` into filter weights with
+// av1_decode_xq() and returns the projection error for those weights, using
+// the high-bitdepth routine when it is compiled in and `use_highbitdepth`
+// is set, and the low-bitdepth routine otherwise.
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int use_highbitdepth,
+                                    int32_t *flt0, int flt0_stride,
+                                    int32_t *flt1, int flt1_stride, int *xqd,
+                                    const sgr_params_type *params) {
+  int weights[2];
+  av1_decode_xq(xqd, weights, params);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_highbitdepth) {
+    return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, flt1,
+                                       flt1_stride, weights, params);
+  }
+#else
+  (void)use_highbitdepth;
+#endif
+  return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                    dat_stride, flt0, flt0_stride, flt1,
+                                    flt1_stride, weights, params);
+}
+
+// Set to 0 to compile out the refinement loop below, in which case
+// finer_search_pixel_proj_error() only evaluates the starting point.
+#define USE_SGRPROJ_REFINEMENT_SEARCH 1
+// Refines the coded projection pair xqd[] in place by a step search and
+// returns the projection error of the final xqd[].
+//
+// Starting from step size `start_step` and halving down to 1, each enabled
+// component p (params->r[p] != 0) is first moved down by the step and, if
+// that did not help, up by the step. A move is kept when the resulting error
+// is not larger than the best so far and the new value stays inside
+// [tap_min[p], tap_max[p]]. At the largest step size a successful move is
+// repeated in the same direction until it stops helping or hits the bound.
+static int64_t finer_search_pixel_proj_error(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+ const sgr_params_type *params) {
+ // Error at the starting point.
+ int64_t err = get_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ // start_step is only read inside the #if block below.
+ (void)start_step;
+#if USE_SGRPROJ_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 };
+ int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = 0; p < 2; ++p) {
+ // A component whose filter is disabled is left untouched.
+ if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) {
+ continue;
+ }
+ int skip = 0;
+ // Try decreasing xqd[p]. The do/while(1) only loops via `continue`.
+ do {
+ if (xqd[p] - s >= tap_min[p]) {
+ xqd[p] -= s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ // Worse: undo the move.
+ xqd[p] += s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ // NOTE(review): this `break` leaves the loop over p, not just the
+ // upward search, so when p == 0 improved by decreasing, p == 1 is not
+ // visited at this step size. Confirm this is intended.
+ if (skip) break;
+ // Decreasing did not help: try increasing xqd[p] instead.
+ do {
+ if (xqd[p] + s <= tap_max[p]) {
+ xqd[p] += s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ // Worse: undo the move.
+ xqd[p] -= s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+#endif // USE_SGRPROJ_REFINEMENT_SEARCH
+ return err;
+}
+
+// Divides `dividend` by `divisor`, biasing the dividend by half the divisor
+// away from zero (according to the dividend's sign) before the truncating
+// integer division.
+static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) {
+  const int64_t half = divisor / 2;
+  const int64_t biased = (dividend < 0) ? dividend - half : dividend + half;
+  return biased / divisor;
+}
+
+// 8-bit kernel for the case where both self-guided filters are enabled.
+//
+// With u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
+// f1 = flt0 - u and f2 = flt1 - u, this adds the per-pixel products
+// f1*f1, f2*f2, f1*f2 into H[0][0], H[1][1], H[0][1] and f1*s, f2*s into
+// C[0], C[1], then divides each sum by the pixel count and mirrors H[0][1]
+// into H[1][0].
+//
+// The sums are accumulated with "+=", so H and C must hold the desired
+// starting values (zero in get_proj_subspace()) on entry.
+static AOM_INLINE void calc_proj_params_r0_r1_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ // Normalise by the number of pixels; H is symmetric.
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// High-bitdepth counterpart of calc_proj_params_r0_r1_c(): identical
+// arithmetic, but src8/dat8 are converted with CONVERT_TO_SHORTPTR and read
+// as 16-bit samples.
+//
+// Adds the per-pixel products into H[0][0], H[1][1], H[0][1], C[0], C[1]
+// (accumulating onto their entry values), divides each by the pixel count
+// and mirrors H[0][1] into H[1][0].
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ // Normalise by the number of pixels; H is symmetric.
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// 8-bit kernel for the case where only the first self-guided filter is
+// enabled. Only H[0][0] and C[0] are touched: the per-pixel products f1*f1
+// and f1*s (f1 = flt0 - u, s = scaled src - u, u = dat << SGRPROJ_RST_BITS)
+// are added onto their entry values and then divided by the pixel count.
+static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ // Normalise by the number of pixels.
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// High-bitdepth counterpart of calc_proj_params_r0_c(): src8/dat8 are
+// converted with CONVERT_TO_SHORTPTR and read as 16-bit samples. Only
+// H[0][0] and C[0] are accumulated (onto their entry values) and then
+// divided by the pixel count.
+static AOM_INLINE void calc_proj_params_r0_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ // Normalise by the number of pixels.
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// 8-bit kernel for the case where only the second self-guided filter is
+// enabled. Only H[1][1] and C[1] are touched: the per-pixel products f2*f2
+// and f2*s (f2 = flt1 - u, s = scaled src - u, u = dat << SGRPROJ_RST_BITS)
+// are added onto their entry values and then divided by the pixel count.
+static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ // Normalise by the number of pixels.
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// High-bitdepth counterpart of calc_proj_params_r1_c(): src8/dat8 are
+// converted with CONVERT_TO_SHORTPTR and read as 16-bit samples. Only
+// H[1][1] and C[1] are accumulated (onto their entry values) and then
+// divided by the pixel count.
+static AOM_INLINE void calc_proj_params_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ // Normalise by the number of pixels.
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// 8-bit entry point that picks one of three kernels from the filter radii:
+//   - r[0] > 0 and r[1] > 0: every element of H and C is computed;
+//   - only r[0] > 0: just H[0][0] and C[0] are computed;
+//   - only r[1] > 0: just H[1][1] and C[1] are computed.
+// When neither radius is positive, H and C are left untouched.
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
+                            int src_stride, const uint8_t *dat8, int dat_stride,
+                            int32_t *flt0, int flt0_stride, int32_t *flt1,
+                            int flt1_stride, int64_t H[2][2], int64_t C[2],
+                            const sgr_params_type *params) {
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  if (use_flt0 && use_flt1) {
+    calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, flt1, flt1_stride, H, C);
+    return;
+  }
+  if (use_flt0) {
+    calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride,
+                          flt0, flt0_stride, H, C);
+    return;
+  }
+  if (use_flt1) {
+    calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+                          flt1, flt1_stride, H, C);
+  }
+}
+
+// High-bitdepth entry point that picks one of three kernels from the filter
+// radii: both filters, first filter only, or second filter only. When
+// neither radius is positive, H and C are left untouched.
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int32_t *flt0,
+                                    int flt0_stride, int32_t *flt1,
+                                    int flt1_stride, int64_t H[2][2],
+                                    int64_t C[2],
+                                    const sgr_params_type *params) {
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  if (use_flt0 && use_flt1) {
+    calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, flt1,
+                                     flt1_stride, H, C);
+    return;
+  }
+  if (use_flt0) {
+    calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt0, flt0_stride, H, C);
+    return;
+  }
+  if (use_flt1) {
+    calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+// Computes the projection weights xq[0], xq[1] for the given filter outputs.
+//
+// The statistics H (2x2) and C (2) are gathered by av1_calc_proj_params*()
+// and the system H * xq = C is solved in fixed point, with the result scaled
+// by (1 << SGRPROJ_PRJ_BITS). When one filter radius is zero the system
+// degenerates to a single scalar division. If the determinant is zero the
+// defaults xq[0] = xq[1] = 0 are returned.
+static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int *xq,
+ const sgr_params_type *params) {
+ int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
+ int64_t C[2] = { 0, 0 };
+
+ // Default values to be returned if the problem becomes ill-posed
+ xq[0] = 0;
+ xq[1] = 0;
+
+ // The dispatched routine is only used when width is a multiple of 8;
+ // other widths go straight to the C implementation (presumably a
+ // requirement of the optimised versions -- TODO confirm).
+ if (!use_highbitdepth) {
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C,
+ params);
+ }
+ }
+ // Without CONFIG_AV1_HIGHBITDEPTH a high-bitdepth request leaves H and C at
+ // zero, so the Det == 0 checks below return the defaults.
+#if CONFIG_AV1_HIGHBITDEPTH
+ else { // NOLINT
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ }
+ }
+#endif
+
+ if (params->r[0] == 0) {
+ // H matrix is now only the scalar H[1][1]
+ // C vector is now only the scalar C[1]
+ const int64_t Det = H[1][1];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = 0;
+ xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det);
+ } else if (params->r[1] == 0) {
+ // H matrix is now only the scalar H[0][0]
+ // C vector is now only the scalar C[0]
+ const int64_t Det = H[0][0];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det);
+ xq[1] = 0;
+ } else {
+ // Full 2x2 case, solved with Cramer's rule.
+ const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+ if (Det == 0) return; // ill-posed, return default values
+
+ // NOTE(review): on the "scale down the divisor" paths below,
+ // Det / (1 << SGRPROJ_PRJ_BITS) truncates to 0 whenever
+ // |Det| < (1 << SGRPROJ_PRJ_BITS), which would make
+ // signed_rounded_divide() divide by zero. Confirm that a numerator this
+ // large cannot coincide with a determinant this small.
+ // If scaling up dividend would overflow, instead scale down the divisor
+ const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1];
+ if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) ||
+ (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1))
+ xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det);
+
+ const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0];
+ if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) ||
+ (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2))
+ xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det);
+ }
+}
+
+// Converts the projection weights xq[] into the clamped pair xqd[].
+//
+// xqd[0] is xq[0] clamped to [SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0], or 0 when
+// the first filter is disabled. xqd[1] is (1 << SGRPROJ_PRJ_BITS) minus the
+// weights in use, clamped to [SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1].
+static AOM_INLINE void encode_xq(int *xq, int *xqd,
+                                 const sgr_params_type *params) {
+  if (params->r[0] == 0) {
+    // First filter disabled: only the second weight is carried.
+    xqd[0] = 0;
+    xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
+                   SGRPROJ_PRJ_MAX1);
+    return;
+  }
+
+  xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+
+  int remainder;
+  if (params->r[1] == 0) {
+    // Second filter disabled: its weight does not enter the remainder.
+    remainder = (1 << SGRPROJ_PRJ_BITS) - xqd[0];
+  } else {
+    remainder = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1];
+  }
+  xqd[1] = clamp(remainder, SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+}
+
+// Runs av1_selfguided_restoration() over a whole restoration unit, one
+// processing unit (at most pu_width x pu_height samples) at a time, writing
+// the two filter outputs to flt0 and flt1. A non-zero return from the filter
+// is reported through aom_internal_error() as a memory error.
+static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
+                                 int width, int height, int dat_stride,
+                                 int use_highbd, int bit_depth, int pu_width,
+                                 int pu_height, int32_t *flt0, int32_t *flt1,
+                                 int flt_stride,
+                                 struct aom_internal_error_info *error_info) {
+  for (int row = 0; row < height; row += pu_height) {
+    // The last stripe may be shorter than a full processing unit.
+    const int blk_h = AOMMIN(pu_height, height - row);
+    const uint8_t *const src_row = dat8 + row * dat_stride;
+    int32_t *const out0_row = flt0 + row * flt_stride;
+    int32_t *const out1_row = flt1 + row * flt_stride;
+
+    // Walk across the stripe in blocks of width pu_width.
+    for (int col = 0; col < width; col += pu_width) {
+      const int blk_w = AOMMIN(pu_width, width - col);
+      const int status = av1_selfguided_restoration(
+          src_row + col, blk_w, blk_h, dat_stride, out0_row + col,
+          out1_row + col, flt_stride, sgr_params_idx, bit_depth, use_highbd);
+      if (status == 0) continue;
+      aom_internal_error(
+          error_info, AOM_CODEC_MEM_ERROR,
+          "Error allocating buffer in av1_selfguided_restoration");
+    }
+  }
+}
+
+// Evaluates self-guided parameter set 'ep' on one restoration unit: filters
+// the data, derives the projection coefficients, stores their encoded form in
+// exqd[] and writes the error returned by finer_search_pixel_proj_error() to
+// *err.
+static AOM_INLINE void compute_sgrproj_err(
+    const uint8_t *dat8, const int width, const int height,
+    const int dat_stride, const uint8_t *src8, const int src_stride,
+    const int use_highbitdepth, const int bit_depth, const int pu_width,
+    const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
+    const int flt_stride, int *exqd, int64_t *err,
+    struct aom_internal_error_info *error_info) {
+  const sgr_params_type *const params = &av1_sgr_params[ep];
+  int xq_raw[2];
+
+  // 1. Produce the two filtered planes.
+  apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+            pu_width, pu_height, flt0, flt1, flt_stride, error_info);
+
+  // 2. Solve for the projection coefficients and encode them.
+  get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+                    use_highbitdepth, flt0, flt_stride, flt1, flt_stride,
+                    xq_raw, params);
+  encode_xq(xq_raw, exqd, params);
+
+  // 3. Measure the error for the encoded coefficients.
+  *err = finer_search_pixel_proj_error(
+      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+      flt_stride, flt1, flt_stride, 2, exqd, params);
+}
+
+// Records (ep, err, exqd) as the new best candidate when no candidate has
+// been stored yet (*besterr == -1) or when 'err' is strictly lower than the
+// stored error; otherwise leaves the outputs untouched.
+static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
+                                      const int *exqd, int *bestxqd,
+                                      int *bestep, const int ep) {
+  const int have_best = (*besterr != -1);
+  if (have_best && !(err < *besterr)) return;
+
+  *bestep = ep;
+  *besterr = err;
+  bestxqd[0] = exqd[0];
+  bestxqd[1] = exqd[1];
+}
+
+// Searches the self-guided parameter sets for one restoration unit and
+// returns the set index and encoded coefficients with the lowest error.
+// 'rstbuf' supplies the two filter output planes (flt0 followed by flt1,
+// RESTORATION_UNITPELS_MAX entries apart). When 'enable_sgr_ep_pruning' is
+// non-zero only a subset of the parameter sets is evaluated.
+static SgrprojInfo search_selfguided_restoration(
+    const uint8_t *dat8, int width, int height, int dat_stride,
+    const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+    int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+    struct aom_internal_error_info *error_info) {
+  int32_t *const flt0 = rstbuf;
+  int32_t *const flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  // Width rounded up to a multiple of 8, plus 8 extra entries per row.
+  const int flt_stride = ((width + 7) & ~7) + 8;
+  int64_t besterr = -1;
+  int64_t err;
+  int bestep = 0;
+  int bestxqd[2] = { 0, 0 };
+  int exqd[2];
+
+  assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_width == RESTORATION_PROC_UNIT_SIZE);
+  assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_height == RESTORATION_PROC_UNIT_SIZE);
+
+  if (enable_sgr_ep_pruning) {
+    // Stage 1: the seed parameter sets of the first group.
+    for (int s = 0; s < SGRPROJ_EP_GRP1_SEARCH_COUNT; ++s) {
+      const int ep = sgproj_ep_grp1_seed[s];
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+
+    // Stage 2: the immediate left and right neighbours of the seed winner,
+    // as long as they stay inside the first group.
+    const int seed_winner = bestep;
+    for (int ep = seed_winner - 1; ep <= seed_winner + 1; ep += 2) {
+      if (ep >= SGRPROJ_EP_GRP1_START_IDX && ep <= SGRPROJ_EP_GRP1_END_IDX) {
+        compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                            use_highbitdepth, bit_depth, pu_width, pu_height,
+                            ep, flt0, flt1, flt_stride, exqd, &err, error_info);
+        get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+      }
+    }
+
+    // Stage 3: one candidate from each of the last two groups, selected by
+    // the best parameter set found so far.
+    for (int g = 0; g < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; ++g) {
+      const int ep = sgproj_ep_grp2_3[g][bestep];
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+  } else {
+    // Exhaustive search over every parameter set.
+    for (int ep = 0; ep < SGRPROJ_PARAMS; ++ep) {
+      compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+                          use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+                          flt0, flt1, flt_stride, exqd, &err, error_info);
+      get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+    }
+  }
+
+  SgrprojInfo ret;
+  ret.ep = bestep;
+  ret.xqd[0] = bestxqd[0];
+  ret.xqd[1] = bestxqd[1];
+  return ret;
+}
+
+// Returns the number of bits needed to signal 'sgrproj_info' relative to
+// 'ref_sgrproj_info': SGRPROJ_PARAMS_BITS for the parameter set index plus a
+// reference-coded coefficient for each filter whose radius is non-zero.
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+                              SgrprojInfo *ref_sgrproj_info) {
+  const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+  int total = SGRPROJ_PARAMS_BITS;
+
+  if (params->r[0] > 0) {
+    total += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+  }
+  if (params->r[1] > 0) {
+    total += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  }
+  return total;
+}
+
+// Evaluates the self-guided (SGRPROJ) filter for restoration unit
+// 'rest_unit_idx' and accumulates the outcome in the search context passed
+// through 'priv' (cast to RestSearchCtxt). 'tmpbuf' is handed on to
+// search_selfguided_restoration() as its working buffer; 'rlbs' is unused.
+// On return rusi->best_rtype[RESTORE_SGRPROJ - 1] holds either
+// RESTORE_SGRPROJ or RESTORE_NONE, and the running totals
+// total_sse / total_bits for RESTORE_SGRPROJ include this unit.
+static AOM_INLINE void search_sgrproj(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const AV1_COMMON *const cm = rsc->cm;
+ const int highbd = cm->seq_params->use_highbitdepth;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
+ // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
+ // In that case the unit is accounted for as RESTORE_NONE and the SGRPROJ
+ // error is marked as INT64_MAX.
+ if (rsc->skip_sgr_eval) {
+ rsc->total_bits[RESTORE_SGRPROJ] += bits_none;
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_SGRPROJ] = INT64_MAX;
+ return;
+ }
+
+ // Top-left sample of this unit in the degraded and the source buffers.
+ uint8_t *dgd_start =
+ rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
+ const uint8_t *src_start =
+ rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
+
+ // Subsampling only applies to planes other than plane 0; it halves the
+ // processing unit size in the subsampled direction.
+ const int is_uv = rsc->plane > 0;
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+ const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+
+ rusi->sgrproj = search_selfguided_restoration(
+ dgd_start, limits->h_end - limits->h_start,
+ limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
+ rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
+ tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info);
+
+ // Measure the error obtained with the chosen parameters.
+ // NOTE(review): only restoration_type and sgrproj_info of 'rui' are set
+ // here; presumably try_restoration_unit() reads nothing else -- confirm.
+ RestorationUnitInfo rui;
+ rui.restoration_type = RESTORE_SGRPROJ;
+ rui.sgrproj_info = rusi->sgrproj;
+
+ rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui);
+
+ // Signalling cost: the mode cost plus the coefficient bits, the latter
+ // counted relative to the current reference parameters.
+ const int64_t bits_sgr =
+ x->mode_costs.sgrproj_restore_cost[1] +
+ (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj)
+ << AV1_PROB_COST_SHIFT);
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth);
+ double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth);
+ // Parameter sets with index below 10 have their cost scaled up according to
+ // the dual_sgr_penalty_level speed feature.
+ // NOTE(review): presumably these are the sets that use both filters -- confirm
+ // against av1_sgr_params.
+ if (rusi->sgrproj.ep < 10)
+ cost_sgr *=
+ (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
+
+ // SGRPROJ is selected only when it is strictly cheaper than no restoration.
+ RestorationType rtype =
+ (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_SGRPROJ] +=
+ (cost_sgr < cost_none) ? bits_sgr : bits_none;
+ // The reference parameters used for bit counting only advance when SGRPROJ
+ // is actually chosen for this unit.
+ if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj;
+}
+
+// Accumulates the Wiener statistics of one row into M_int32 and the upper
+// triangle of H_int32. For every column in [h_start, h_end) the window of
+// degraded samples around (row 'count', column j) is gathered with 'avg'
+// removed, then correlated with the mean-removed source sample (M) and with
+// itself (H). 'src' must already point at the row being processed, whereas
+// 'dgd' is indexed with the absolute row number 'count'.
+static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src,
+                              int dgd_stride, int h_start, int h_end,
+                              uint8_t avg, const int wiener_halfwin,
+                              const int wiener_win2, int32_t *M_int32,
+                              int32_t *H_int32, int count) {
+  int16_t Y[WIENER_WIN2];
+
+  for (int col = h_start; col < h_end; ++col) {
+    const int16_t X = (int16_t)src[col] - (int16_t)avg;
+
+    // Gather the window: the outer index walks columns, the inner one rows.
+    int filled = 0;
+    for (int dc = -wiener_halfwin; dc <= wiener_halfwin; ++dc) {
+      for (int dr = -wiener_halfwin; dr <= wiener_halfwin; ++dr) {
+        Y[filled++] =
+            (int16_t)dgd[(count + dr) * dgd_stride + (col + dc)] - (int16_t)avg;
+      }
+    }
+    assert(filled == wiener_win2);
+
+    for (int k = 0; k < wiener_win2; ++k) {
+      const int32_t yk = (int32_t)Y[k];
+      int32_t *const H_k = H_int32 + k * wiener_win2;
+      M_int32[k] += yk * X;
+      // H is symmetric, so only entries with l >= k are accumulated here;
+      // the caller mirrors them into the lower triangle afterwards.
+      for (int l = k; l < wiener_win2; ++l) {
+        H_k[l] += yk * Y[l];
+      }
+    }
+  }
+}
+
+// C implementation of the Wiener statistics for 8-bit data. Fills M
+// (wiener_win^2 entries) and the full symmetric H (wiener_win^2 x wiener_win^2
+// entries) for rows [v_start, v_end) and columns [h_start, h_end). When
+// 'use_downsampled_wiener_stats' is non-zero only every
+// WIENER_STATS_DOWNSAMPLE_FACTOR-th row is visited and its contribution is
+// weighted by the number of rows it stands for. dgd_avg and src_avg are not
+// used by this implementation.
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+                         int16_t *dgd_avg, int16_t *src_avg, int h_start,
+                         int h_end, int v_start, int v_end, int dgd_stride,
+                         int src_stride, int64_t *M, int64_t *H,
+                         int use_downsampled_wiener_stats) {
+  (void)dgd_avg;
+  (void)src_avg;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const uint8_t avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+  int32_t M_row[WIENER_WIN2] = { 0 };
+  int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 };
+  int row_step =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  int row = v_start;
+  while (row < v_end) {
+    // Near the bottom edge a visited row stands for fewer rows than the full
+    // factor; shrink the weight (and the step) accordingly.
+    if (use_downsampled_wiener_stats &&
+        (v_end - row < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+      row_step = v_end - row;
+    }
+
+    // Per-row statistics are gathered in 32 bits, then widened below.
+    memset(M_row, 0, sizeof(M_row));
+    memset(H_row, 0, sizeof(H_row));
+    acc_stat_one_line(dgd, src + row * src_stride, dgd_stride, h_start, h_end,
+                      avg, wiener_halfwin, wiener_win2, M_row, H_row, row);
+
+    for (int k = 0; k < wiener_win2; ++k) {
+      const int base = k * wiener_win2;
+      // Scale the row statistics by the number of rows represented.
+      M[k] += ((int64_t)M_row[k] * row_step);
+      // Only the upper triangle of H is populated at this point.
+      for (int l = k; l < wiener_win2; ++l) {
+        H[base + l] += ((int64_t)H_row[base + l] * row_step);
+      }
+    }
+    row += row_step;
+  }
+
+  // Mirror the upper triangle into the lower triangle.
+  for (int k = 0; k < wiener_win2; ++k) {
+    for (int l = k + 1; l < wiener_win2; ++l) {
+      H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// C implementation of the Wiener statistics for high bit-depth data. Fills M
+// and the full symmetric H for rows [v_start, v_end) and columns
+// [h_start, h_end). The accumulated sums are divided by 4 for 10-bit and by
+// 16 for 12-bit input before being returned.
+void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
+                                const uint8_t *src8, int h_start, int h_end,
+                                int v_start, int v_end, int dgd_stride,
+                                int src_stride, int64_t *M, int64_t *H,
+                                aom_bit_depth_t bit_depth) {
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+  int32_t Y[WIENER_WIN2];
+
+  uint8_t bit_depth_divider;
+  switch (bit_depth) {
+    case AOM_BITS_12: bit_depth_divider = 16; break;
+    case AOM_BITS_10: bit_depth_divider = 4; break;
+    default: bit_depth_divider = 1; break;
+  }
+
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  for (int row = v_start; row < v_end; ++row) {
+    for (int col = h_start; col < h_end; ++col) {
+      const int32_t X = (int32_t)src[row * src_stride + col] - (int32_t)avg;
+
+      // Gather the mean-removed window around (row, col); the outer index
+      // walks columns, the inner one rows.
+      int filled = 0;
+      for (int dc = -wiener_halfwin; dc <= wiener_halfwin; ++dc) {
+        for (int dr = -wiener_halfwin; dr <= wiener_halfwin; ++dr) {
+          Y[filled++] =
+              (int32_t)dgd[(row + dr) * dgd_stride + (col + dc)] - (int32_t)avg;
+        }
+      }
+      assert(filled == wiener_win2);
+
+      for (int k = 0; k < wiener_win2; ++k) {
+        int64_t *const H_k = H + k * wiener_win2;
+        M[k] += (int64_t)Y[k] * X;
+        // H is symmetric: accumulate the upper triangle only, the lower
+        // triangle is filled in once all samples have been visited.
+        for (int l = k; l < wiener_win2; ++l) {
+          H_k[l] += (int64_t)Y[k] * Y[l];
+        }
+      }
+    }
+  }
+
+  // Apply the bit-depth scaling and mirror the upper triangle.
+  for (int k = 0; k < wiener_win2; ++k) {
+    M[k] /= bit_depth_divider;
+    H[k * wiener_win2 + k] /= bit_depth_divider;
+    for (int l = k + 1; l < wiener_win2; ++l) {
+      H[k * wiener_win2 + l] /= bit_depth_divider;
+      H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+    }
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Maps tap position i of a symmetric filter of length 'wiener_win' onto the
+// index of its unique coefficient: positions up to and including the centre
+// map to themselves, later positions map to their mirror image.
+static INLINE int wrap_index(int i, int wiener_win) {
+  const int unique_taps = (wiener_win >> 1) + 1;
+  if (i < unique_taps) return i;
+  return wiener_win - 1 - i;
+}
+
+// Solve linear equations to find Wiener filter tap values
+// Taps are output scaled by WIENER_TAP_SCALE_FACTOR
+//
+// Solves the n x n system A * x = b with integer Gaussian elimination. A is
+// stored row-major with row pitch 'stride'. Both A and b are modified in
+// place. Returns 1 on success and 0 as soon as a zero pivot is encountered;
+// in the latter case x[] may have been partially written.
+static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
+ int64_t *x) {
+ for (int k = 0; k < n - 1; k++) {
+ // Partial pivoting: bring the row with the largest pivot to the top
+ // (a single bottom-up pass of adjacent row swaps on column k).
+ for (int i = n - 1; i > k; i--) {
+ // If row i has a better (bigger) pivot than row (i-1), swap them
+ if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) {
+ for (int j = 0; j < n; j++) {
+ const int64_t c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ const int64_t c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+
+ // b/278065963: The multiplies
+ // c / 256 * A[k * stride + j] / cd * 256
+ // and
+ // c / 256 * b[k] / cd * 256
+ // within Gaussian elimination can cause a signed integer overflow. Rework
+ // the multiplies so that larger scaling is used without significantly
+ // impacting the overall precision.
+ //
+ // Precision guidance:
+ // scale_threshold: Pick as high as possible.
+ // For max_abs_akj >= scale_threshold scenario:
+ // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j].
+ // scaler_c: Pick as low as possible while maintaining scaler_c >=
+ // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1].
+ // The scalers are chosen per pivot row from its largest absolute entry.
+ int64_t max_abs_akj = 0;
+ for (int j = 0; j < n; j++) {
+ const int64_t abs_akj = llabs(A[k * stride + j]);
+ if (abs_akj > max_abs_akj) max_abs_akj = abs_akj;
+ }
+ const int scale_threshold = 1 << 22;
+ const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5);
+ const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7);
+ const int scaler = scaler_c * scaler_A;
+
+ // Forward elimination (convert A to row-echelon form)
+ // Row (i + 1) has (c / cd) times row k subtracted from it; c has already
+ // been divided by scaler_c, which the trailing multiplications undo.
+ for (int i = k; i < n - 1; i++) {
+ if (A[k * stride + k] == 0) return 0;
+ const int64_t c = A[(i + 1) * stride + k] / scaler_c;
+ const int64_t cd = A[k * stride + k];
+ for (int j = 0; j < n; j++) {
+ A[(i + 1) * stride + j] -=
+ A[k * stride + j] / scaler_A * c / cd * scaler;
+ }
+ b[i + 1] -= c * b[k] / cd * scaler_c;
+ }
+ }
+ // Back-substitution
+ // x[j] is held scaled by WIENER_TAP_SCALE_FACTOR, so each product with it
+ // is scaled back down before being subtracted from b[i].
+ for (int i = n - 1; i >= 0; i--) {
+ if (A[i * stride + i] == 0) return 0;
+ int64_t c = 0;
+ for (int j = i + 1; j <= n - 1; j++) {
+ c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ // Store filter taps x in scaled form.
+ x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
+
+// Fix vector b, update vector a
+//
+// Mc[i] and Hc[i * wiener_win + j] are pointers into the M and H statistics
+// (set up by the caller). a[] and b[] hold filter taps scaled by
+// WIENER_TAP_SCALE_FACTOR. If the linear system cannot be solved, a[] is left
+// unchanged.
+static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ // Right-hand side: M weighted by b, with mirrored tap positions (see
+ // wrap_index()) accumulated into the same entry.
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; ++j) {
+ const int jj = wrap_index(j, wiener_win);
+ A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/274668506: This is the dual branch for the issue in b/272139363. The fix
+ // is similar. See comments in update_b_sep_sym() below.
+ int32_t max_b_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_b_l = abs(b[l]);
+ if (abs_b_l > max_b_l) max_b_l = abs_b_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_b_l < scale_threshold ? 1 : 4;
+
+ // System matrix: H weighted by b[i] * b[j], folded onto the unique taps.
+ // With scaler == 4 the first division is 4x larger and the second 4x
+ // smaller, so the combined divisor is the same as with scaler == 1.
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ const int kk = wrap_index(k, wiener_win);
+ for (l = 0; l < wiener_win; ++l) {
+ const int ll = wrap_index(l, wiener_win);
+ B[ll * wiener_halfwin1 + kk] +=
+ Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ // (the last unique tap is eliminated, leaving wiener_halfwin1 - 1 unknowns).
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ // Rebuild the full filter: mirror the solved taps and set the centre tap
+ // so that all taps sum to WIENER_TAP_SCALE_FACTOR.
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ // Clip every tap to the signed WIENER_FILT_BITS-bit range.
+ for (i = 0; i < wiener_win; ++i) {
+ a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
+// Fix vector a, update vector b
+//
+// Mc[i] and Hc[i * wiener_win + j] are pointers into the M and H statistics
+// (set up by the caller). a[] and b[] hold filter taps scaled by
+// WIENER_TAP_SCALE_FACTOR. If the linear system cannot be solved, b[] is left
+// unchanged.
+static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ // Right-hand side: M weighted by a, with mirrored tap positions (see
+ // wrap_index()) accumulated into the same entry.
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/272139363: The computation,
+ // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR;
+ // may generate a signed-integer-overflow. Conditionally scale the terms to
+ // avoid a potential overflow.
+ //
+ // Hc contains accumulated correlation statistics and it is desired to leave
+ // as much room as possible for Hc. It was experimentally observed that the
+ // primary issue manifests itself with the second, a[l], multiply. For
+ // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not
+ // increase dynamic range and the second multiply should hence be safe.
+ // Thereafter a safe scale_threshold depends on the actual operational range
+ // of Hc. The largest scale_threshold is expected to depend on bit-depth
+ // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum
+ // restoration-unit size (256), leading up to 32-bit positive numbers in Hc.
+ // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...]
+ // to a range smaller than 16 bits, the scale_threshold is set as below for
+ // convenience.
+ int32_t max_a_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_a_l = abs(a[l]);
+ if (abs_a_l > max_a_l) max_a_l = abs_a_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_a_l < scale_threshold ? 1 : 4;
+
+ // System matrix: H weighted by a[k] * a[l], folded onto the unique taps.
+ // With scaler == 4 the first division is 4x larger and the second 4x
+ // smaller, so the combined divisor is the same as with scaler == 1.
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ const int jj = wrap_index(j, wiener_win);
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ B[jj * wiener_halfwin1 + ii] +=
+ Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ // (the last unique tap is eliminated, leaving wiener_halfwin1 - 1 unknowns).
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ // Rebuild the full filter: mirror the solved taps and set the centre tap
+ // so that all taps sum to WIENER_TAP_SCALE_FACTOR.
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ // Clip every tap to the signed WIENER_FILT_BITS-bit range.
+ for (i = 0; i < wiener_win; ++i) {
+ b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
+// Derives a separable, symmetric filter pair (a[], b[]) from the statistics M
+// and H. Both vectors start from the default taps, rescaled to
+// WIENER_TAP_SCALE_FACTOR precision, and are then refined alternately by
+// update_a_sep_sym() and update_b_sep_sym() for NUM_WIENER_ITERS - 1 rounds.
+static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
+                                     int32_t *a, int32_t *b) {
+  static const int32_t init_filt[WIENER_WIN] = {
+    WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
+    WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
+    WIENER_FILT_TAP0_MIDV,
+  };
+  const int wiener_win2 = wiener_win * wiener_win;
+  // Offset of a shorter window inside the full-length default filter.
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+  int64_t *Mc[WIENER_WIN];
+  int64_t *Hc[WIENER_WIN2];
+
+  for (int r = 0; r < wiener_win; ++r) {
+    const int32_t seed =
+        WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[r + plane_off];
+    b[r] = seed;
+    a[r] = seed;
+
+    // Row / block pointers into M and H used by the update routines.
+    Mc[r] = M + r * wiener_win;
+    for (int c = 0; c < wiener_win; ++c) {
+      Hc[r * wiener_win + c] =
+          H + r * wiener_win * wiener_win2 + c * wiener_win;
+    }
+  }
+
+  for (int iter = 1; iter < NUM_WIENER_ITERS; ++iter) {
+    update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+    update_b_sep_sym(wiener_win, Mc, Hc, a, b);
+  }
+}
+
+// Evaluates x'*H*x - 2*x'*M for the 2D filter x formed from the separable
+// taps (vfilt, hfilt) and returns that value minus the same expression
+// evaluated for the identity filter. A positive result therefore means the
+// learned filter scores worse than the identity.
+static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H,
+                             InterpKernel vfilt, InterpKernel hfilt) {
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+  const int wiener_win2 = wiener_win * wiener_win;
+  int16_t a[WIENER_WIN], b[WIENER_WIN];
+  int32_t ab[WIENER_WIN * WIENER_WIN];
+
+  // Expand the half filters into full symmetric ones. The centre tap is
+  // whatever remains of WIENER_FILT_STEP after the outer taps.
+  a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP;
+  for (int t = 0; t < WIENER_HALFWIN; ++t) {
+    a[t] = a[WIENER_WIN - t - 1] = vfilt[t];
+    b[t] = b[WIENER_WIN - t - 1] = hfilt[t];
+    a[WIENER_HALFWIN] -= 2 * a[t];
+    b[WIENER_HALFWIN] -= 2 * b[t];
+  }
+
+  // Outer product of the two 1D filters, restricted to the active window.
+  memset(ab, 0, sizeof(ab));
+  for (int r = 0; r < wiener_win; ++r) {
+    for (int c = 0; c < wiener_win; ++c) {
+      ab[r * wiener_win + c] = a[c + plane_off] * b[r + plane_off];
+    }
+  }
+
+  int64_t linear = 0;
+  int64_t quadratic = 0;
+  for (int k = 0; k < wiener_win2; ++k) {
+    linear += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP;
+    for (int l = 0; l < wiener_win2; ++l) {
+      quadratic += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP /
+                   WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP;
+    }
+  }
+  const int64_t learned_score = quadratic - 2 * linear;
+
+  // The identity filter only touches the centre entry of M and H.
+  const int centre = wiener_win2 >> 1;
+  const int64_t identity_linear = M[centre];
+  const int64_t identity_quadratic = H[centre * wiener_win2 + centre];
+  const int64_t identity_score = identity_quadratic - 2 * identity_linear;
+
+  return learned_score - identity_score;
+}
+
+// Converts the high-precision taps f[] (scaled by WIENER_TAP_SCALE_FACTOR)
+// into the final kernel fi[] (scaled by WIENER_FILT_STEP): rounds to nearest,
+// clips each outer tap to its allowed range, mirrors the taps and sets the
+// centre tap to minus twice their sum.
+static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f,
+                                           InterpKernel fi) {
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int64_t divisor = WIENER_TAP_SCALE_FACTOR;
+
+  for (int t = 0; t < wiener_halfwin; ++t) {
+    const int64_t dividend = (int64_t)f[t] * WIENER_FILT_STEP;
+    // Round half away from zero instead of truncating.
+    const int64_t bias = (dividend < 0) ? -(divisor / 2) : (divisor / 2);
+    fi[t] = (int16_t)((dividend + bias) / divisor);
+  }
+
+  if (wiener_win != WIENER_WIN) {
+    // Shorter window: shift the taps outwards by one position (highest index
+    // first, so nothing is overwritten early) and zero the outermost tap.
+    fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[0] = 0;
+  } else {
+    // Full-length filter: clip the taps in place.
+    fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+    fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+  }
+
+  // Satisfy filter constraints: mirror the outer taps.
+  fi[WIENER_WIN - 1] = fi[0];
+  fi[WIENER_WIN - 2] = fi[1];
+  fi[WIENER_WIN - 3] = fi[2];
+  // The central element has an implicit +WIENER_FILT_STEP
+  fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
+
+// Returns the number of bits needed to signal the filter in 'wiener_info'
+// relative to 'ref_wiener_info'. Taps 1 and 2 of both the vertical and the
+// horizontal filter are always counted; tap 0 is counted only for the
+// full-length window (wiener_win == WIENER_WIN).
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
+                             WienerInfo *ref_wiener_info) {
+  const int has_tap0 = (wiener_win == WIENER_WIN);
+  int total = 0;
+
+  // Vertical filter.
+  if (has_tap0) {
+    total += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+  }
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+      WIENER_FILT_TAP1_SUBEXP_K,
+      ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+      wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+      WIENER_FILT_TAP2_SUBEXP_K,
+      ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+      wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+
+  // Horizontal filter.
+  if (has_tap0) {
+    total += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+  }
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+      WIENER_FILT_TAP1_SUBEXP_K,
+      ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+      wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+      WIENER_FILT_TAP2_SUBEXP_K,
+      ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+      wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+
+  return total;
+}
+
+// Refines the Wiener taps held in rui->wiener_info by a coordinate search and
+// returns the lowest error found, as measured by try_restoration_unit().
+// The taps in 'rui' are updated in place. When the
+// disable_wiener_coeff_refine_search speed feature is set, only the error of
+// the incoming filter is returned.
+static int64_t finer_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ RestorationUnitInfo *rui, int wiener_win) {
+ // For a shorter window the outermost tap(s) are skipped by the search.
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ int64_t err = try_restoration_unit(rsc, limits, rui);
+
+ if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
+
+ // Refinement search around the wiener filter coefficients.
+ int64_t err2;
+ int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP2_MINV };
+ int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+ WIENER_FILT_TAP2_MAXV };
+
+ WienerInfo *plane_wiener = &rui->wiener_info;
+
+ // printf("err pre = %"PRId64"\n", err);
+ // Step sizes 4, 2, 1. Every move changes a tap and its mirror by the same
+ // amount and compensates on the centre tap so the tap sum is unchanged.
+ const int start_step = 4;
+ for (int s = start_step; s >= 1; s >>= 1) {
+ // Horizontal filter.
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ // Try decreasing tap p. 'continue' re-enters the do-while for another
+ // move in the same direction; otherwise the trailing 'break' ends it.
+ do {
+ if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ // Worse: undo the move.
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ // NOTE(review): this 'break' leaves the enclosing loop over p, so once a
+ // decrease has helped, the remaining horizontal taps are not visited at
+ // this step size -- confirm this is intended.
+ if (skip) break;
+ // Try increasing tap p.
+ do {
+ if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ // Worse: undo the move.
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ // Vertical filter: same procedure as for the horizontal filter above.
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+ // printf("err post = %"PRId64"\n", err);
+ return err;
+}
+
+ // Restoration-unit visitor that evaluates the Wiener filter for one unit.
+ //
+ // Reads rsc->sse[RESTORE_NONE] for this unit, so that value must already be
+ // populated when this visitor runs. On return:
+ // * rsc->sse[RESTORE_WIENER] holds the SSE returned by finer_search_wiener(),
+ // or INT64_MAX when the search was pruned or the filter was rejected.
+ // * rusi->best_rtype[RESTORE_WIENER - 1] is RESTORE_WIENER or RESTORE_NONE.
+ // * rsc->total_bits[RESTORE_WIENER] and rsc->total_sse[RESTORE_WIENER] are
+ // advanced by the rate and distortion of whichever of the two was chosen.
+ // * rsc->skip_sgr_eval may be updated, depending on
+ // lpf_sf->prune_sgr_based_on_wiener.
+ // tmpbuf, rlbs and error_info are unused.
+ static AOM_INLINE void search_wiener(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const int64_t bits_none = x->mode_costs.wiener_restore_cost[0];
+
+ // Skip Wiener search for low variance contents
+ if (rsc->lpf_sf->prune_wiener_based_on_src_var) {
+ // Indexed by the speed-feature level, which is non-zero inside this branch.
+ const int scale[3] = { 0, 1, 2 };
+ // Obtain the normalized Qscale
+ const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
+ rsc->cm->seq_params->bit_depth) >>
+ 3;
+ // Derive threshold as sqr(normalized Qscale) * scale / 16,
+ const uint64_t thresh =
+ (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
+ const int highbd = rsc->cm->seq_params->use_highbitdepth;
+ const uint64_t src_var =
+ var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+ // Do not perform Wiener search if source variance is lower than threshold
+ // or if the reconstruction error is zero
+ int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0);
+ if (prune_wiener) {
+ // Account for this unit as RESTORE_NONE and mark the Wiener SSE as
+ // unusable.
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+ }
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ // Only the luma window is ever reduced; chroma keeps WIENER_WIN_CHROMA
+ // either way.
+ int reduced_wiener_win = wiener_win;
+ if (rsc->lpf_sf->reduce_wiener_window_size) {
+ reduced_wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
+ }
+
+ // Statistics buffers are sized for the largest window, whatever window is
+ // actually used below.
+ int64_t M[WIENER_WIN2];
+ int64_t H[WIENER_WIN2 * WIENER_WIN2];
+ int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN];
+
+ #if CONFIG_AV1_HIGHBITDEPTH
+ const AV1_COMMON *const cm = rsc->cm;
+ if (cm->seq_params->use_highbitdepth) {
+ // TODO(any) : Add support for use_downsampled_wiener_stats SF in HBD
+ // functions. Optimize intrinsics of HBD design similar to LBD (i.e.,
+ // pre-calculate d and s buffers and avoid most of the C operations).
+ av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
+ rsc->src_buffer, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H, cm->seq_params->bit_depth);
+ } else {
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start,
+ limits->h_end, limits->v_start, limits->v_end,
+ rsc->dgd_stride, rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+ }
+ #else
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+ #endif
+
+ wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
+
+ // rui is zeroed first, so any field not written below stays zero.
+ RestorationUnitInfo rui;
+ memset(&rui, 0, sizeof(rui));
+ rui.restoration_type = RESTORE_WIENER;
+ finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter);
+ finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter);
+
+ // Filter score computes the value of the function x'*A*x - x'*b for the
+ // learned filter and compares it against identity filter. If there is no
+ // reduction in the function, the filter is reverted back to identity
+ if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
+ rui.wiener_info.hfilter) > 0) {
+ // Same bookkeeping as the variance-based prune above.
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+
+ rsc->sse[RESTORE_WIENER] =
+ finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
+ rusi->wiener = rui.wiener_info;
+
+ // With a reduced window the outermost taps are expected to be zero.
+ if (reduced_wiener_win != WIENER_WIN) {
+ assert(rui.wiener_info.vfilter[0] == 0 &&
+ rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+ assert(rui.wiener_info.hfilter[0] == 0 &&
+ rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+ }
+
+ // The rate is counted with the full window size (wiener_win), not
+ // reduced_wiener_win, and relative to the running reference rsc->ref_wiener.
+ const int64_t bits_wiener =
+ x->mode_costs.wiener_restore_cost[1] +
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
+ << AV1_PROB_COST_SHIFT);
+
+ // NOTE(review): the ">> 4" presumably rescales the bit counts to the units
+ // the RD cost macro expects - confirm against its definition.
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
+ rsc->cm->seq_params->bit_depth);
+ double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
+ rsc->cm->seq_params->bit_depth);
+
+ RestorationType rtype =
+ (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+ // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
+ // RESTORE_NONE or based on best_rtype
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
+ rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+ } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
+ rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+ }
+
+ #if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->ref_wiener;
+ #endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_WIENER] +=
+ (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ // The reference only advances when Wiener was actually chosen for this unit.
+ if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
+ }
+
+ // Restoration-unit visitor for the "no filter" case. Measures the SSE between
+ // the source and the current frame buffer over this unit, stores it in
+ // rsc->sse[RESTORE_NONE] and adds it to the running total for the plane.
+ // rest_unit_idx, tmpbuf, rlbs and error_info are unused.
+ static AOM_INLINE void search_norestore(
+     const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+     int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+     struct aom_internal_error_info *error_info) {
+   (void)error_info;
+   (void)rlbs;
+   (void)tmpbuf;
+   (void)rest_unit_idx;
+
+   RestSearchCtxt *const ctxt = (RestSearchCtxt *)priv;
+   const int is_highbd = ctxt->cm->seq_params->use_highbitdepth;
+
+   ctxt->sse[RESTORE_NONE] =
+       sse_restoration_unit(limits, ctxt->src, &ctxt->cm->cur_frame->buf,
+                            ctxt->plane, is_highbd);
+   ctxt->total_sse[RESTORE_NONE] += ctxt->sse[RESTORE_NONE];
+ }
+
+ // Restoration-unit visitor for RESTORE_SWITCHABLE. Using the per-type SSE
+ // values and filter parameters already stored for this unit, picks the
+ // cheapest of the switchable types in RD terms, records it in
+ // rusi->best_rtype[RESTORE_SWITCHABLE - 1] and updates the switchable totals
+ // and reference parameters. limits, tmpbuf, rlbs and error_info are unused.
+ static AOM_INLINE void search_switchable(
+     const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+     int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+     struct aom_internal_error_info *error_info) {
+   (void)limits;
+   (void)tmpbuf;
+   (void)rlbs;
+   (void)error_info;
+   RestSearchCtxt *const ctxt = (RestSearchCtxt *)priv;
+   RestUnitSearchInfo *const unit = &ctxt->rusi[rest_unit_idx];
+   const MACROBLOCK *const mb = ctxt->x;
+
+   const int win =
+       (ctxt->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+   RestorationType winner = RESTORE_NONE;
+   int64_t winner_bits = 0;
+   double winner_cost = 0;
+
+   for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+     // Pruning is done on SSE alone, not on what the single-type search chose:
+     // a filter that lost to RESTORE_NONE because of its rate there can still
+     // win here, where the rate model is different. A type that was skipped
+     // or rejected has a larger SSE than RESTORE_NONE and is dropped.
+     if (r > RESTORE_NONE && ctxt->sse[r] > ctxt->sse[RESTORE_NONE]) continue;
+
+     const int64_t dist = ctxt->sse[r];
+
+     // Cost of the filter parameters, relative to the switchable references.
+     int64_t param_cost = 0;
+     switch (r) {
+       case RESTORE_SGRPROJ:
+         param_cost = count_sgrproj_bits(&unit->sgrproj,
+                                         &ctxt->switchable_ref_sgrproj);
+         break;
+       case RESTORE_WIENER:
+         param_cost = count_wiener_bits(win, &unit->wiener,
+                                        &ctxt->switchable_ref_wiener);
+         break;
+       case RESTORE_NONE: param_cost = 0; break;
+       default: assert(0); break;
+     }
+
+     const int64_t param_bits = param_cost << AV1_PROB_COST_SHIFT;
+     const int64_t rate = mb->mode_costs.switchable_restore_cost[r] + param_bits;
+     double rd = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+         mb->rdmult, rate >> 4, dist, ctxt->cm->seq_params->bit_depth);
+     if (r == RESTORE_SGRPROJ && unit->sgrproj.ep < 10)
+       rd *= (1 + DUAL_SGR_PENALTY_MULT * ctxt->lpf_sf->dual_sgr_penalty_level);
+
+     // RESTORE_NONE (r == 0) always seeds the running best.
+     if (r == 0 || rd < winner_cost) {
+       winner = r;
+       winner_bits = rate;
+       winner_cost = rd;
+     }
+   }
+
+   unit->best_rtype[RESTORE_SWITCHABLE - 1] = winner;
+
+ #if DEBUG_LR_COSTING
+   // Store ref params for later checking
+   lr_ref_params[RESTORE_SWITCHABLE][ctxt->plane][rest_unit_idx].wiener_info =
+       ctxt->switchable_ref_wiener;
+   lr_ref_params[RESTORE_SWITCHABLE][ctxt->plane][rest_unit_idx].sgrproj_info =
+       ctxt->switchable_ref_sgrproj;
+ #endif  // DEBUG_LR_COSTING
+
+   ctxt->total_sse[RESTORE_SWITCHABLE] += ctxt->sse[winner];
+   ctxt->total_bits[RESTORE_SWITCHABLE] += winner_bits;
+   // Only the reference of the chosen type advances.
+   if (winner == RESTORE_WIENER) ctxt->switchable_ref_wiener = unit->wiener;
+   if (winner == RESTORE_SGRPROJ)
+     ctxt->switchable_ref_sgrproj = unit->sgrproj;
+ }
+
+ // Copies the search result for one unit into its RestorationUnitInfo.
+ // frame_rtype selects which per-frame-type decision to use and must not be
+ // RESTORE_NONE. The Wiener parameters are copied when the unit's type is
+ // RESTORE_WIENER; in every other case the self-guided parameters are copied.
+ static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
+                                       const RestUnitSearchInfo *rusi,
+                                       RestorationUnitInfo *rui) {
+   assert(frame_rtype > 0);
+   rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+   if (rui->restoration_type != RESTORE_WIENER) {
+     rui->sgrproj_info = rusi->sgrproj;
+   } else {
+     rui->wiener_info = rusi->wiener;
+   }
+ }
+
+ // Runs every enabled restoration search on every restoration unit of one
+ // plane, accumulating the per-type totals in rsc.
+ //
+ // cm: common state; supplies the tile layout, superblock size
+ // and rst_info[plane].
+ // plane: index of the plane being searched.
+ // rsc: search context; reset with reset_rsc() before the walk.
+ // disable_lr_filter: indexed by RestorationType; a true entry skips that
+ // search for every unit.
+ static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc,
+ bool *disable_lr_filter) {
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const CommonTileParams *tiles = &cm->tiles;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ const int ru_size = rsi->restoration_unit_size;
+ // When fewer than 1.5 * ru_size rows/columns remain, the unit is stretched
+ // to cover all of them (see the h / w computations below).
+ const int ext_size = ru_size * 3 / 2;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ // Visitors indexed by RestorationType.
+ static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
+ search_norestore, search_wiener, search_sgrproj, search_switchable
+ };
+
+ // NOTE(review): with a single unit in the plane the loop below stops at
+ // RESTORE_SWITCHABLE_TYPES, which presumably leaves out search_switchable -
+ // confirm against the RestorationType enum.
+ const int plane_num_units = rsi->num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+ reset_rsc(rsc);
+
+ // Iterate over restoration units in encoding order, so that each RU gets
+ // the correct reference parameters when we cost it up. This is effectively
+ // a nested iteration over:
+ // * Each tile, order does not matter
+ // * Each superblock within that tile, in raster order
+ // * Each LR unit which is coded within that superblock, in raster order
+ for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ int sb_row_start = tiles->row_start_sb[tile_row];
+ int sb_row_end = tiles->row_start_sb[tile_row + 1];
+ for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ int sb_col_start = tiles->col_start_sb[tile_col];
+ int sb_col_end = tiles->col_start_sb[tile_col + 1];
+
+ // Reset reference parameters for delta-coding at the start of each tile
+ rsc_on_tile(rsc);
+
+ for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+ int mi_row = sb_row << mib_size_log2;
+ for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+ int mi_col = sb_col << mib_size_log2;
+
+ // [rcol0, rcol1) x [rrow0, rrow1) is the range of units visited for
+ // this superblock.
+ int rcol0, rcol1, rrow0, rrow1;
+ int has_lr_info = av1_loop_restoration_corners_in_sb(
+ cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+ &rrow1);
+
+ if (!has_lr_info) continue;
+
+ RestorationTileLimits limits;
+ for (int rrow = rrow0; rrow < rrow1; rrow++) {
+ int y0 = rrow * ru_size;
+ int remaining_h = plane_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
+
+ for (int rcol = rcol0; rcol < rcol1; rcol++) {
+ int x0 = rcol * ru_size;
+ int remaining_w = plane_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+ limits.h_start = x0;
+ limits.h_end = x0 + w;
+ assert(limits.h_end <= plane_w);
+
+ const int unit_idx = rrow * rsi->horz_units + rcol;
+
+ // Cleared per unit; search_wiener may set it while visiting this
+ // unit.
+ rsc->skip_sgr_eval = 0;
+ for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+ if (disable_lr_filter[r]) continue;
+
+ funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+ cm->error);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Fills disable_lr_filter[] (indexed by RestorationType) from the
+ // 'disable_wiener_filter' and 'disable_sgr_filter' speed features:
+ // * Wiener and self-guided each follow their own flag.
+ // * RESTORE_NONE is disabled only when both filters are disabled.
+ // * RESTORE_SWITCHABLE is disabled as soon as either filter is disabled.
+ static INLINE void av1_derive_flags_for_lr_processing(
+     const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) {
+   const bool no_wiener = lpf_sf->disable_wiener_filter;
+   const bool no_sgr = lpf_sf->disable_sgr_filter;
+
+   disable_lr_filter[RESTORE_WIENER] = no_wiener;
+   disable_lr_filter[RESTORE_SGRPROJ] = no_sgr;
+   disable_lr_filter[RESTORE_NONE] = (no_wiener && no_sgr);
+   disable_lr_filter[RESTORE_SWITCHABLE] = (no_wiener || no_sgr);
+ }
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+ // Allocate both decoder-side and encoder-side info structs for a single plane.
+ // The unit size passed in should be the minimum size which we are going to
+ // search; before each search, set_restoration_unit_size() must be called to
+ // configure the actual size.
+ //
+ // cm: common state, used for the plane size and error reporting.
+ // rsi: restoration info of the plane; its unit_info array is
+ // freed and reallocated here.
+ // is_uv: non-zero for a chroma plane.
+ // min_luma_unit_size: smallest luma unit size that will be searched.
+ //
+ // Returns a zero-initialised array with one RestUnitSearchInfo per unit at the
+ // minimum unit size. Allocation failures are reported through
+ // CHECK_MEM_ERROR.
+ static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+                                                    RestorationInfo *rsi,
+                                                    int is_uv,
+                                                    int min_luma_unit_size) {
+ #if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+   // seq_params is a pointer and the plane selector is is_uv; the previous
+   // text of this disabled branch used '.' and an undeclared 'p'.
+   int sx = cm->seq_params->subsampling_x;
+   int sy = cm->seq_params->subsampling_y;
+   int s = is_uv ? AOMMIN(sx, sy) : 0;
+ #else
+   int s = 0;
+ #endif  // COUPLED_CHROMA_FROM_LUMA_RESTORATION
+   int min_unit_size = min_luma_unit_size >> s;
+
+   int plane_w, plane_h;
+   av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+   const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w);
+   const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h);
+   const int max_num_units = max_horz_units * max_vert_units;
+
+   aom_free(rsi->unit_info);
+   CHECK_MEM_ERROR(cm, rsi->unit_info,
+                   (RestorationUnitInfo *)aom_memalign(
+                       16, sizeof(*rsi->unit_info) * max_num_units));
+
+   RestUnitSearchInfo *rusi;
+   CHECK_MEM_ERROR(
+       cm, rusi,
+       (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units));
+
+   // If the restoration unit dimensions are not multiples of
+   // rsi->restoration_unit_size then some elements of the rusi array may be
+   // left uninitialised when we reach copy_unit_info(...). This is not a
+   // problem, as these elements are ignored later, but in order to quiet
+   // Valgrind's warnings we initialise the array below.
+   memset(rusi, 0, sizeof(*rusi) * max_num_units);
+
+   return rusi;
+ }
+
+ // Configures rsi for a given luma unit size: stores the unit size for this
+ // plane and the resulting horizontal, vertical and total unit counts, based
+ // on the upsampled plane dimensions.
+ //
+ // cm: common state, used for the plane size.
+ // rsi: restoration info of the plane to configure.
+ // is_uv: non-zero for a chroma plane.
+ // luma_unit_size: unit size expressed in luma samples.
+ static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi,
+                                       int is_uv, int luma_unit_size) {
+ #if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+   // seq_params is a pointer and the plane selector is is_uv; the previous
+   // text of this disabled branch used '.' and an undeclared 'p'.
+   int sx = cm->seq_params->subsampling_x;
+   int sy = cm->seq_params->subsampling_y;
+   int s = is_uv ? AOMMIN(sx, sy) : 0;
+ #else
+   int s = 0;
+ #endif  // COUPLED_CHROMA_FROM_LUMA_RESTORATION
+   int unit_size = luma_unit_size >> s;
+
+   int plane_w, plane_h;
+   av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+   const int horz_units = av1_lr_count_units(unit_size, plane_w);
+   const int vert_units = av1_lr_count_units(unit_size, plane_h);
+
+   rsi->restoration_unit_size = unit_size;
+   rsi->num_rest_units = horz_units * vert_units;
+   rsi->horz_units = horz_units;
+   rsi->vert_units = vert_units;
+ }
+
+ // Top-level loop-restoration search for one frame.
+ //
+ // src: source frame that the restored output is compared against.
+ // cpi: encoder instance; the results are written to cpi->common.rst_info[].
+ //
+ // For each candidate luma unit size, from lpf_sf.max_lr_unit_size down to the
+ // larger of lpf_sf.min_lr_unit_size and the superblock width, every searched
+ // plane is run through restoration_search() and the best frame-level
+ // restoration type is picked per plane by RD cost. The size with the lowest
+ // combined cost wins. The loop stops early once a size is no better than the
+ // previous one or yields RESTORE_NONE on every plane.
+ //
+ // Scratch buffers (per-plane rusi arrays and, on AVX2/NEON low-bitdepth
+ // builds, the dgd_avg/src_avg buffer) are allocated on entry and released
+ // before returning.
+ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCK *const x = &cpi->td.mb;
+   const SequenceHeader *const seq_params = cm->seq_params;
+   const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+   const int num_planes = av1_num_planes(cm);
+   const int highbd = cm->seq_params->use_highbitdepth;
+   assert(!cm->features.all_lossless);
+
+   av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
+   // Select unit size based on speed feature settings, and allocate
+   // rui structs based on this size
+   int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+   int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size;
+
+   // The minimum allowed unit size at a syntax level is 1 superblock.
+   // Apply this constraint here so that the speed features code which sets
+   // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size
+   min_lr_unit_size =
+       AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]);
+
+   for (int plane = 0; plane < num_planes; ++plane) {
+     cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs(
+         cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size);
+   }
+
+   x->rdmult = cpi->rd.RDMULT;
+
+   // Allocate the frame buffer trial_frame_rst, which is used to temporarily
+   // store the loop restored frame.
+   if (aom_realloc_frame_buffer(
+           &cpi->trial_frame_rst, cm->superres_upscaled_width,
+           cm->superres_upscaled_height, seq_params->subsampling_x,
+           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+           cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+     aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate trial restored frame buffer");
+
+   RestSearchCtxt rsc;
+
+   // The buffers 'src_avg' and 'dgd_avg' are used to compute H and M buffers.
+   // These buffers are only required for the AVX2 and NEON implementations of
+   // av1_compute_stats. The buffer size required is calculated based on maximum
+   // width and height of the LRU (i.e., from foreach_rest_unit_in_plane() 1.5
+   // times the RESTORATION_UNITSIZE_MAX) allowed for Wiener filtering. The width
+   // and height aligned to multiple of 16 is considered for intrinsic purpose.
+   rsc.dgd_avg = NULL;
+   rsc.src_avg = NULL;
+ #if HAVE_AVX2 || HAVE_NEON
+   // The buffers allocated below are used during Wiener filter processing of low
+   // bitdepth path. Hence, allocate the same when Wiener filter is enabled in
+   // low bitdepth path.
+   if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+     const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+                          RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+     CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+                     (int16_t *)aom_memalign(32, buf_size));
+
+     rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
+     // When LRU width isn't multiple of 16, the 256 bits load instruction used
+     // in AVX2 intrinsic can read data beyond valid LRU. Hence, in order to
+     // silence Valgrind warning this buffer is initialized with zero. Overhead
+     // due to this initialization is negligible since it is done at frame level.
+     memset(rsc.dgd_avg, 0, buf_size);
+     // src_avg is the second half of the same allocation.
+     rsc.src_avg =
+         rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+     // Asserts the starting address of src_avg is always 32-bytes aligned.
+     assert(!((intptr_t)rsc.src_avg % 32));
+   }
+ #endif
+
+   // Initialize all planes, so that any planes we skip searching will still have
+   // valid data
+   for (int plane = 0; plane < num_planes; plane++) {
+     cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+   }
+
+   // Decide which planes to search
+   int plane_start, plane_end;
+
+   if (lpf_sf->disable_loop_restoration_luma) {
+     plane_start = AOM_PLANE_U;
+   } else {
+     plane_start = AOM_PLANE_Y;
+   }
+
+   if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+     plane_end = AOM_PLANE_Y;
+   } else {
+     plane_end = AOM_PLANE_V;
+   }
+
+   // Derive the flags to enable/disable Loop restoration filters based on the
+   // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
+   bool disable_lr_filter[RESTORE_TYPES] = { false };
+   av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
+
+   // Extend the borders of every searched plane of the current frame.
+   for (int plane = plane_start; plane <= plane_end; plane++) {
+     const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+     const int is_uv = plane != AOM_PLANE_Y;
+     int plane_w, plane_h;
+     av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+     av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv],
+                      RESTORATION_BORDER, RESTORATION_BORDER, highbd);
+   }
+
+   double best_cost = DBL_MAX;
+   int best_luma_unit_size = max_lr_unit_size;
+   for (int luma_unit_size = max_lr_unit_size;
+        luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) {
+     int64_t bits_this_size = 0;
+     int64_t sse_this_size = 0;
+     RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE,
+                                                  RESTORE_NONE };
+     for (int plane = plane_start; plane <= plane_end; ++plane) {
+       set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+                                 luma_unit_size);
+       init_rsc(src, &cpi->common, x, lpf_sf, plane,
+                cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc);
+
+       restoration_search(cm, plane, &rsc, disable_lr_filter);
+
+       const int plane_num_units = cm->rst_info[plane].num_rest_units;
+       const RestorationType num_rtypes =
+           (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+       double best_cost_this_plane = DBL_MAX;
+       for (RestorationType r = 0; r < num_rtypes; ++r) {
+         // Disable Loop restoration filter based on the flags set using speed
+         // feature 'disable_wiener_filter' and 'disable_sgr_filter'.
+         if (disable_lr_filter[r]) continue;
+
+         double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+             x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r],
+             cm->seq_params->bit_depth);
+
+         if (cost_this_plane < best_cost_this_plane) {
+           best_cost_this_plane = cost_this_plane;
+           best_rtype[plane] = r;
+         }
+       }
+
+       bits_this_size += rsc.total_bits[best_rtype[plane]];
+       sse_this_size += rsc.total_sse[best_rtype[plane]];
+     }
+
+     double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+         x->rdmult, bits_this_size >> 4, sse_this_size,
+         cm->seq_params->bit_depth);
+
+     if (cost_this_size < best_cost) {
+       best_cost = cost_this_size;
+       best_luma_unit_size = luma_unit_size;
+       // Copy parameters out of rusi struct, before we overwrite it at
+       // the start of the next iteration
+       bool all_none = true;
+       for (int plane = plane_start; plane <= plane_end; ++plane) {
+         cm->rst_info[plane].frame_restoration_type = best_rtype[plane];
+         if (best_rtype[plane] != RESTORE_NONE) {
+           all_none = false;
+           const int plane_num_units = cm->rst_info[plane].num_rest_units;
+           for (int u = 0; u < plane_num_units; ++u) {
+             copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u],
+                            &cm->rst_info[plane].unit_info[u]);
+           }
+         }
+       }
+       // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we
+       // couldn't find any good filters at this size. So we likely won't find
+       // any good filters at a smaller size either, so skip
+       if (all_none) {
+         break;
+       }
+     } else {
+       // Heuristic: If this size is worse than the previous (larger) size, then
+       // the next size down will likely be even worse, so skip
+       break;
+     }
+   }
+
+   // Final fixup to set the correct unit size
+   // We set this for all planes, even ones we have skipped searching,
+   // so that other code does not need to care which planes were and weren't
+   // searched
+   for (int plane = 0; plane < num_planes; ++plane) {
+     set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+                               best_luma_unit_size);
+   }
+
+   // This guard must match the one around the allocation above; otherwise a
+   // build with AVX2 but without AVX would leak dgd_avg on every frame.
+ #if HAVE_AVX2 || HAVE_NEON
+   if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+     aom_free(cpi->pick_lr_ctxt.dgd_avg);
+     cpi->pick_lr_ctxt.dgd_avg = NULL;
+   }
+ #endif
+   for (int plane = 0; plane < num_planes; plane++) {
+     aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+     cpi->pick_lr_ctxt.rusi[plane] = NULL;
+   }
+ }
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..d1d0b0cec6
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+// so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+// we set the frame restoration mode to RESTORE_WIENER, and the search where
+// we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of
+// SGR params, and the easiest way to do this is to index by
+// frame_restoration_type
+extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+ // 16 byte indices forming the overlapping pairs (i, i + 1) for i = 0..7.
+ // NOTE(review): presumably a byte-shuffle control mask for the SIMD
+ // statistics kernels - confirm against its users.
+ static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ };
+
+ // Two identical 16-byte rows, each holding the groups
+ // (2i, 2i + 1, 2i + 2, 2i + 3) for i = 0..3.
+ // NOTE(review): presumably the 16-bit-sample counterpart of the table above,
+ // one row per 128-bit lane - confirm against its users.
+ static const uint8_t g_shuffle_stats_highbd_data[32] = {
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+ };
+
+ // Returns the truncated integer mean of the 8-bit samples in the rectangle
+ // [v_start, v_end) x [h_start, h_end) of src, where consecutive rows are
+ // 'stride' samples apart. The rectangle must be non-empty, since its area is
+ // used as the divisor.
+ static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end,
+                                    int v_start, int v_end, int stride) {
+   uint64_t total = 0;
+   int row = v_start;
+   while (row < v_end) {
+     for (int col = h_start; col < h_end; ++col) {
+       total += src[row * stride + col];
+     }
+     ++row;
+   }
+   const uint64_t mean = total / ((v_end - v_start) * (h_end - h_start));
+   return (uint8_t)mean;
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // 16-bit-sample variant of find_average(): returns the truncated integer
+ // mean of the samples in [v_start, v_end) x [h_start, h_end) of src, where
+ // consecutive rows are 'stride' samples apart. The rectangle must be
+ // non-empty, since its area is used as the divisor.
+ static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
+                                            int h_end, int v_start, int v_end,
+                                            int stride) {
+   uint64_t total = 0;
+   int row = v_start;
+   while (row < v_end) {
+     for (int col = h_start; col < h_end; ++col) {
+       total += src[row * stride + col];
+     }
+     ++row;
+   }
+   const uint64_t mean = total / ((v_end - v_start) * (h_end - h_start));
+   return (uint16_t)mean;
+ }
+#endif
+
+/*!\brief Algorithm for AV1 loop restoration search and estimation.
+ *
+ * \ingroup in_loop_restoration
+ * This function determines proper restoration filter types and
+ * associated parameters for each restoration unit in a frame.
+ *
+ * \param[in] sd Source frame buffer
+ * \param[in,out] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, chosen restoration filter
+ * types and parameters are stored per plane in the \c rst_info structure
+ * of type \ref RestorationInfo inside \c cpi->common:
+ * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
+ * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists
+ * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists
+ * \par
+ * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2
+ * are populated:
+ * \arg \c rst_info[ \c p ].\c frame_restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * for each \c u in 0, 1, ..., \c n( \c p ) - 1,
+ * where \c n( \c p ) is the number of restoration units in plane \c p.
+ * \par
+ * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type
+ * \ref RestorationUnitInfo are populated:
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR
+ * neither, depending on
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ *
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 0000000000..2e8710108b
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES_PUSTATS 8
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 12
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
+ };
+
+static const float
+ av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
+ };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+ 4.5103f,
+};
+
+ // Network description for the "rate" model: NUM_FEATURES_PUSTATS (8) inputs,
+ // two hidden layers of 12 and 10 nodes, and a single output. The kernel and
+ // bias pointers are listed in layer order: hidden layer 0, hidden layer 1,
+ // then the output (logits) layer.
+ static const NN_CONFIG av1_pustats_rate_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_rate_hiddenlayer_0_kernel,
+ av1_pustats_rate_hiddenlayer_1_kernel,
+ av1_pustats_rate_logits_kernel,
+ },
+ {
+ av1_pustats_rate_hiddenlayer_0_bias,
+ av1_pustats_rate_hiddenlayer_1_bias,
+ av1_pustats_rate_logits_bias,
+ },
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
+ };
+
+static const float
+ av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
+ };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+ 2.3371f,
+};
+
+ // Network description for the "dist" model: NUM_FEATURES_PUSTATS (8) inputs,
+ // two hidden layers of 12 and 10 nodes, and a single output. The kernel and
+ // bias pointers are listed in layer order: hidden layer 0, hidden layer 1,
+ // then the output (logits) layer.
+ static const NN_CONFIG av1_pustats_dist_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_dist_hiddenlayer_0_kernel,
+ av1_pustats_dist_hiddenlayer_1_kernel,
+ av1_pustats_dist_logits_kernel,
+ },
+ {
+ av1_pustats_dist_hiddenlayer_0_bias,
+ av1_pustats_dist_hiddenlayer_1_bias,
+ av1_pustats_dist_logits_bias,
+ },
+ };
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 0000000000..efe909b6db
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Steps the linear congruential generator once, stores the new state through
+// `state`, and returns that new 32-bit state as the output.
+// The low-order bits of the output are comparatively low-quality, so callers
+// should make sure the high-order bits are the ones that drive their results.
+static INLINE uint32_t lcg_next(uint32_t *state) {
+  // The multiply-add is carried out in 64 bits and then truncated to 32 bits.
+  const uint64_t advanced = *state * 1103515245ULL + 12345;
+  *state = (uint32_t)advanced;
+  return *state;
+}
+
+// Returns a random number in the range [0, 32768), taken from bits 16..30 of
+// the next generator output (the low 16 bits are discarded).
+static INLINE uint32_t lcg_rand16(uint32_t *state) {
+  const uint32_t raw = lcg_next(state);
+  return (raw >> 16) & 0x7fff;
+}
+
+// Returns a random number in the range [0, n).
+// The value is computed as (rand() * n) / <range of RNG> instead of
+// rand() % n: that form is faster and less biased, and when n is a power of 2
+// it draws on the higher-quality top bits of the RNG output rather than the
+// lower-quality bottom bits.
+static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) {
+  // 32x32 -> 64-bit product; the upper 32 bits are the scaled result.
+  const uint64_t scaled = (uint64_t)lcg_next(state) * n;
+  return (uint32_t)(scaled >> 32);
+}
+
+// Returns a random number in the half-open range [lo, hi).
+// Requires lo < hi (checked with an assert).
+static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo,
+                                     uint32_t hi) {
+  assert(lo < hi);
+  const uint32_t span = hi - lo;
+  const uint32_t offset = lcg_randint(state, span);
+  return lo + offset;
+}
+
+// Pick k distinct numbers from the set {0, ..., n-1} and store them in
+// out[0..k-1]. All possible sets of k numbers, and all possible orderings of
+// those numbers, are equally likely.
+//
+// Note: Repeated values are avoided by resampling. This works well as long as
+// n >> k, but can potentially lead to many resampling attempts if n is equal
+// to or only slightly larger than k.
+static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) {
+  assert(0 <= k && k <= n);
+  for (int i = 0; i < k; i++) {
+    int candidate;
+    int already_picked;
+
+    // Keep drawing until the candidate differs from every value accepted so
+    // far.
+    do {
+      candidate = (int)lcg_randint(seed, n);
+      already_picked = 0;
+      for (int j = 0; j < i; j++) {
+        if (candidate == out[j]) {
+          // Repeated value, draw again.
+          already_picked = 1;
+          break;
+        }
+      }
+    } while (already_picked);
+
+    // New value, accept it.
+    out[i] = candidate;
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 0000000000..df86380272
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,3587 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2
+#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0
+
+#define FRAME_OVERHEAD_BITS 200
+// Points `name` at the bit-depth specific variant of a table: name##_8,
+// name##_10 or name##_12 depending on `bit_depth`. For any other bit depth the
+// macro asserts and sets `name` to NULL. Wrapped in do { } while (0) so it can
+// be used as a single statement.
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+static int gf_high = 2400;
+static int gf_low = 300;
+#ifdef STRICT_RC
+static int kf_high = 3200;
+#else
+static int kf_high = 5000;
+#endif
+static int kf_low = 400;
+
+// Returns the ratio of the configured frame area (frm_dim_cfg->width *
+// frm_dim_cfg->height) to the area given by `width` x `height`, i.e. how many
+// times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+                                 int width, int height) {
+  // Promote to double before multiplying so that the area of a very large
+  // frame cannot overflow an int product.
+  const double cfg_area = (double)frm_dim_cfg->width * frm_dim_cfg->height;
+  const double cur_area = (double)width * height;
+  return cfg_area / cur_area;
+}
+
+// The active minq lookup table entries are computed with a formula so that the
+// Q tables are easy to adjust. The formula is a 3rd order polynomial best fit
+// to the original data (after plotting real maxq vs minq (not q index)).
+//
+// Evaluates x3 * maxq^3 + x2 * maxq^2 + x1 * maxq, caps the result at maxq,
+// and returns the matching q index for the given bit depth.
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          aom_bit_depth_t bit_depth) {
+  const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+  // Targets at or below q 2.0 are mapped straight to index 0: this handles the
+  // step from q2.0 down to lossless mode, which is represented by q 1.0.
+  return (minqtarget <= 2.0)
+             ? 0
+             : av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+// Fills the six minq lookup tables for one bit depth. Each table has
+// QINDEX_RANGE entries; entry `qindex` is the min-q index derived from the real
+// Q value of `qindex` using that table's polynomial coefficients.
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+                           int *arfgf_high, int *inter, int *rtc,
+                           aom_bit_depth_t bit_depth) {
+  for (int qindex = 0; qindex < QINDEX_RANGE; ++qindex) {
+    const double maxq = av1_convert_qindex_to_q(qindex, bit_depth);
+
+    // Key frame tables.
+    kf_low_m[qindex] =
+        get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    kf_high_m[qindex] =
+        get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+
+    // ARF / golden frame tables.
+    arfgf_low[qindex] =
+        get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[qindex] =
+        get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+
+    // Inter frame and RTC tables.
+    inter[qindex] =
+        get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[qindex] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+  }
+}
+
+// Populates all file-scope minq lookup tables by calling init_minq_luts() once
+// for each supported bit depth (8, 10 and 12 bits).
+static void rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+// Public entry point for initialising the minq lookup tables. The work is
+// routed through aom_once() -- presumably so the tables are built only once
+// however many times this is called (confirm against aom_once's contract).
+void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); }
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+//
+// Converts a q index to a real Q value: the AC quantizer for the index divided
+// by a bit-depth dependent scale (4, 16 or 64) so that it matches old Q values.
+// Returns -1.0 (after asserting) for an unsupported bit depth.
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+  double scale;
+  switch (bit_depth) {
+    case AOM_BITS_8: scale = 4.0; break;
+    case AOM_BITS_10: scale = 16.0; break;
+    case AOM_BITS_12: scale = 64.0; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1.0;
+  }
+  return av1_ac_quant_QTX(qindex, 0, bit_depth) / scale;
+}
+
+// Returns the baseline enumerator used for the bits-per-macroblock estimate.
+// Key frames get a larger value than other frames, and screen content uses
+// half the value of other content.
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+                            const int is_screen_content_type) {
+  const int is_key_frame = (frame_type == KEY_FRAME);
+
+  if (is_screen_content_type) return is_key_frame ? 1000000 : 750000;
+  return is_key_frame ? 2000000 : 1500000;
+}
+
+// Returns the initial bit-estimate ratio for a given sse value: 300000 / sse,
+// truncated to int.
+// NOTE(review): sse == 0 would divide by zero and the cast of the result is
+// not well defined; callers presumably pass a positive value -- confirm.
+static int get_init_ratio(double sse) { return (int)(300000 / sse); }
+
+// Estimates the bits per macroblock for a frame of `frame_type` at `qindex`,
+// computed as enumerator * correction_factor / q where q is the real Q value
+// for the index. The enumerator defaults to av1_get_bpmb_enumerator(); for
+// non-key frames in CBR mode with `accurate_estimate` set and a valid
+// cpi->rec_sse it is instead derived from the reconstruction SSE.
+// correction_factor must lie in [MIN_BPB_FACTOR, MAX_BPB_FACTOR] (asserted).
+int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
+ double correction_factor, int accurate_estimate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_screen_content_type = cpi->is_screen_content_type;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type);
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+ // UINT64_MAX in rec_sse is treated as "no SSE available".
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME &&
+ accurate_estimate && cpi->rec_sse != UINT64_MAX) {
+ const int mbs = cm->mi_params.MBs;
+ // Integer square root of the SSE, scaled up by BPER_MB_NORMBITS and
+ // normalised by the number of macroblocks.
+ const double sse_sqrt =
+ (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) /
+ (double)mbs;
+ // Use the stored ratio when one is set, otherwise seed it from sse_sqrt.
+ const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt)
+ : cpi->rc.bit_est_ratio;
+ // Clamp the enumerator to lower the q fluctuations.
+ enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000);
+ }
+
+ // q based adjustment to baseline enumerator
+ return (int)(enumerator * correction_factor / q);
+}
+
+// Estimates the number of bits the current frame will use at q index `q` with
+// the given rate correction factor. The per-macroblock estimate from
+// av1_rc_bits_per_mb() is multiplied by the macroblock count and normalised by
+// BPER_MB_NORMBITS. The result is never below FRAME_OVERHEAD_BITS.
+int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q,
+                           double correction_factor) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  const int mbs = cm->mi_params.MBs;
+  const int bpm =
+      (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor,
+                               cpi->sf.hl_sf.accurate_bit_estimate));
+  // Multiply and shift in 64 bits and only then narrow to int. A cast binds
+  // tighter than >>, so casting the product first would truncate it to int
+  // before the normalising shift and defeat the 64-bit multiply.
+  const int64_t frame_bits = ((int64_t)bpm * mbs) >> BPER_MB_NORMBITS;
+  // Saturate rather than wrap if the estimate does not fit the return type.
+  const int est_bits = (int)AOMMIN(frame_bits, (int64_t)INT_MAX);
+  return AOMMAX(FRAME_OVERHEAD_BITS, est_bits);
+}
+
+// Clamps the target size (in bits) of an inter frame.
+// The lower bound is the larger of rc->min_frame_bandwidth and 1/32 of the
+// average frame bandwidth; overlay frames are always given exactly that
+// minimum. The upper bound is rc->max_frame_bandwidth and, when
+// max_inter_bitrate_pct is non-zero, that percentage of the average frame
+// bandwidth. Returns the clamped target.
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+                                    FRAME_UPDATE_TYPE frame_update_type) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const int min_frame_target =
+      AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+  // Clip the frame target to the minimum setup value.
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE) {
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    target = min_frame_target;
+  } else if (target < min_frame_target) {
+    target = min_frame_target;
+  }
+
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+  if (oxcf->rc_cfg.max_inter_bitrate_pct) {
+    // Compute the percentage in 64 bits (as the intra-frame variant does) so
+    // that bandwidth * pct cannot overflow an int at high bitrates.
+    const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth *
+                             oxcf->rc_cfg.max_inter_bitrate_pct / 100;
+    // max_rate is only taken when it is below target, so it fits in an int.
+    if (max_rate < target) target = (int)max_rate;
+  }
+
+  return target;
+}
+
+// Clamps the target size (in bits) of an intra frame from above: first to
+// max_intra_bitrate_pct percent of the average frame bandwidth (when that
+// percentage is non-zero), then to rc->max_frame_bandwidth. Returns the
+// clamped target narrowed to int.
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+  int64_t clamped = target;
+
+  if (rc_cfg->max_intra_bitrate_pct) {
+    const int64_t pct_limit =
+        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
+    if (!(clamped < pct_limit)) clamped = pct_limit;
+  }
+  if (clamped > rc->max_frame_bandwidth) clamped = rc->max_frame_bandwidth;
+  return (int)clamped;
+}
+
+// Update the buffer level for higher temporal layers, given the encoded current
+// temporal layer.
+// For every temporal layer above svc->temporal_layer_id (within the current
+// spatial layer) the layer's per-frame budget is added and the encoded frame
+// size is subtracted; the level is then clipped to the layer's maximum buffer
+// size and, for screen content, also from below.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size,
+ bool is_screen) {
+ const int current_temporal_layer = svc->temporal_layer_id;
+ for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
+ ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ // Per-frame budget of the layer: target bandwidth divided by its framerate.
+ lp_rc->bits_off_target +=
+ (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+ // Clip buffer level to maximum buffer size for the layer.
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+
+ // For screen-content mode: don't let buffer level go below threshold,
+ // given here as -maximum_buffer_size, to allow buffer to come back
+ // up sooner after slide change with big overshoot.
+ if (is_screen) {
+ lp_rc->bits_off_target =
+ AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+ }
+ }
+}
+// Update the buffer level: leaky bucket model.
+// Shown frames add the average frame bandwidth and subtract the encoded size;
+// non-shown frames only subtract. The level is clipped to the maximum buffer
+// size (and from below for screen content), and the per-layer levels are
+// updated as well when SVC is in use.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ p_rc->bits_off_target -= encoded_frame_size;
+ else
+ p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+
+ // Clip the buffer level to the maximum specified buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ // For screen-content mode: don't let buffer level go below threshold,
+ // given here as -maximum_buffer_size, to allow buffer to come back
+ // up sooner after slide change with big overshoot.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+ p_rc->bits_off_target =
+ AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size);
+ p_rc->buffer_level = p_rc->bits_off_target;
+
+ if (cpi->ppi->use_svc)
+ update_layer_buffer_level(&cpi->svc, encoded_frame_size,
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+#if CONFIG_FPMT_TEST
+ /* The variable temp_buffer_level is introduced for quality
+ * simulation purpose, it retains the value previous to the parallel
+ * encode frames. The variable is updated based on the update flag.
+ *
+ * If there exist show_existing_frames between parallel frames, then to
+ * retain the temp state do not update it. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ p_rc->temp_buffer_level = p_rc->buffer_level;
+ }
+#endif
+}
+
+// Returns the default minimum golden-frame interval for a stream of the given
+// dimensions and framerate. The base value is framerate / 8 clamped to
+// [MIN_GF_INTERVAL, MAX_GF_INTERVAL]; streams whose pixel rate exceeds that of
+// 4K at 20 fps get at least MIN_GF_INTERVAL scaled by the pixel-rate ratio.
+int av1_rc_get_default_min_gf_interval(int width, int height,
+                                       double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  const double factor = (double)width * height * framerate;
+  const int default_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+  if (factor <= factor_safe) return default_interval;
+
+  // Note this logic makes:
+  // 4K24: 5
+  // 4K30: 6
+  // 4K60: 12
+  const int scaled_interval =
+      (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5);
+  return AOMMAX(default_interval, scaled_interval);
+}
+
+// Returns the default maximum golden-frame interval: never smaller than
+// min_gf_interval.
+// NOTE(review): the value derived from the framerate is first capped at
+// MAX_GF_INTERVAL and then raised to at least MAX_GF_INTERVAL by the AOMMAX
+// below, so the result is always >= MAX_GF_INTERVAL whatever the framerate.
+// Confirm that this is intended.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
+ interval = AOMMAX(MAX_GF_INTERVAL, interval);
+ return AOMMAX(interval, min_gf_interval);
+}
+
+// Initialises the primary rate control state `p_rc` from the encoder
+// configuration: baseline GF interval, forced key frame flags, running totals,
+// buffer levels, average/last q values, rate correction factors and the
+// rolling target/actual bit counters.
+void av1_primary_rc_init(const AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ int worst_allowed_q = rc_cfg->worst_allowed_q;
+
+ // A configured interval of 0 means "use the default".
+ int min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ int max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (min_gf_interval == 0)
+ min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (max_gf_interval == 0)
+ max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, min_gf_interval);
+ p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2;
+ p_rc->this_key_frame_forced = 0;
+ p_rc->next_key_frame_forced = 0;
+ p_rc->ni_frames = 0;
+
+ p_rc->tot_q = 0.0;
+ p_rc->total_actual_bits = 0;
+ p_rc->total_target_bits = 0;
+ p_rc->buffer_level = p_rc->starting_buffer_level;
+
+ // When a target sequence level is set, the local worst q is forced to 255.
+ if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) {
+ worst_allowed_q = 255;
+ }
+ // One-pass CBR starts the averages at the worst q; every other mode starts
+ // them midway between the best and worst allowed q.
+ if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) {
+ p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q;
+ p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q;
+ } else {
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[INTER_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ }
+ // Note: the values below use rc_cfg->worst_allowed_q directly, not the
+ // local worst_allowed_q that may have been overridden above.
+ p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+ oxcf->tool_cfg.bit_depth);
+ p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+ p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
+
+ // All correction factors start at 0.7 except the key frame one (1.0).
+ for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ p_rc->rate_correction_factors[i] = 0.7;
+ }
+ p_rc->rate_correction_factors[KF_STD] = 1.0;
+ p_rc->bits_off_target = p_rc->starting_buffer_level;
+
+ // Both rolling counters start at the per-frame bit budget.
+ p_rc->rolling_target_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+ p_rc->rolling_actual_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+}
+
+// Initialises the per-encoder rate control state `rc` from the encoder
+// configuration: key/golden frame counters, GF interval limits (falling back
+// to the defaults when configured as 0), and the resize and external rate
+// control fields.
+void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = rc_cfg->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+
+ // A configured interval of 0 means "use the default".
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, rc->min_gf_interval);
+ rc->avg_frame_low_motion = 0;
+
+ rc->resize_state = ORIG;
+ rc->resize_avg_qp = 0;
+ rc->resize_buffer_underflow = 0;
+ rc->resize_count = 0;
+ rc->rtc_external_ratectrl = 0;
+ rc->frame_level_fast_extra_bits = 0;
+ rc->use_external_qp_one_pass = 0;
+}
+
+// Reports whether the buffer is at or below the drop threshold.
+// Without SVC, with a single spatial layer, or in AOM_LAYER_DROP mode, only
+// `buffer_level` is compared against `drop_mark`. Otherwise (the
+// AOM_FULL_SUPERFRAME_DROP case) the current and all upper spatial layers are
+// checked against their own thresholds and any layer below its threshold makes
+// the result true.
+static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level,
+                                      int drop_mark) {
+  SVC *svc = &cpi->svc;
+  const bool check_current_only = !cpi->ppi->use_svc ||
+                                  svc->number_spatial_layers == 1 ||
+                                  svc->framedrop_mode == AOM_LAYER_DROP;
+  if (check_current_only) return buffer_level <= drop_mark;
+
+  for (int sl = svc->spatial_layer_id; sl < svc->number_spatial_layers; ++sl) {
+    const int layer_idx = LAYER_IDS_TO_IDX(sl, svc->temporal_layer_id,
+                                           svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer_idx];
+    PRIMARY_RATE_CONTROL *lrc = &lc->p_rc;
+
+    // Exclude check for layer whose bitrate is 0.
+    if (lc->target_bandwidth <= 0) continue;
+
+    // Per-layer threshold: the water mark percentage of the layer's optimal
+    // buffer level.
+    const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark;
+    const int drop_mark_layer =
+        (int)(drop_thresh * lrc->optimal_buffer_level / 100);
+    if (lrc->buffer_level <= drop_mark_layer) return true;
+  }
+  return false;
+}
+
+// Decides whether the current frame should be dropped, based on the buffer
+// level. Returns 1 to drop the frame and 0 to encode it.
+// Updates rc->drop_count_consec, rc->decimation_factor and
+// rc->decimation_count as a side effect.
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ // In parallel-encode simulation the saved temp_buffer_level is used instead
+ // of the live buffer level.
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t buffer_level =
+ simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level;
+#else
+ int64_t buffer_level = p_rc->buffer_level;
+#endif
+ // Never drop on key frame, or for frame whose base layer is key.
+ // Never drop when frame dropping is disabled (water mark of 0).
+ // If drop_count_consec hits or exceeds max_consec_drop then don't drop.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ !oxcf->rc_cfg.drop_frames_water_mark ||
+ (rc->max_consec_drop > 0 &&
+ rc->drop_count_consec >= rc->max_consec_drop)) {
+ return 0;
+ } else {
+ SVC *svc = &cpi->svc;
+ // In the full_superframe framedrop mode for svc, if the previous spatial
+ // layer was dropped, drop the current spatial layer.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1] &&
+ svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP)
+ return 1;
+ // -1 is passed here for drop_mark since we are checking if
+ // buffer goes below 0 (<= -1).
+ if (check_buffer_below_thresh(cpi, buffer_level, -1)) {
+ // Always drop if buffer is below 0.
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+ p_rc->optimal_buffer_level / 100);
+ const bool buffer_below_thresh =
+ check_buffer_below_thresh(cpi, buffer_level, drop_mark);
+ // decimation_factor is lowered while the buffer is above the mark and
+ // set to 1 when the buffer first falls to or below it.
+ if (!buffer_below_thresh && rc->decimation_factor > 0) {
+ --rc->decimation_factor;
+ } else if (buffer_below_thresh && rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ // decimation_count counts down the frames still to be dropped before
+ // the next frame is encoded.
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
+
+// Applies the CBR-specific adjustments to a candidate q index `q` and returns
+// the adjusted value. In order, the function: limits the change in q relative
+// to the previous frame (using the overshoot/undershoot history and source SAD
+// changes), adjusts q for temporal layers, pushes q towards
+// `active_worst_quality` after a resolution increase, applies the recovery
+// frame bias, and finally clamps the result to
+// [rc->best_quality, rc->worst_quality].
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
+ int width, int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ const SVC *const svc = &cpi->svc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ // Flag to indicate previous frame has overshoot, and buffer level
+ // for current frame is low (less than ~half of optimal). For such
+ // (inter) frames, if the source_sad is non-zero, relax the max_delta_up
+ // and clamp applied below.
+ const bool overshoot_buffer_low =
+ cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+ p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+ rc->frames_since_key > 4;
+ int max_delta_down;
+ int max_delta_up = overshoot_buffer_low ? 60 : 20;
+ // True when the average frame bandwidth changed by more than 10%.
+ const int change_avg_frame_bandwidth =
+ abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
+ 0.1 * (rc->avg_frame_bandwidth);
+
+ // Set the maximum adjustment down for Q for this frame.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh) {
+ // For static screen type content limit the Q drop till the start of the
+ // next refresh cycle.
+ if (cpi->is_screen_content_type &&
+ (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) {
+ max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32));
+ } else {
+ max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ if (!cpi->ppi->use_svc && cpi->is_screen_content_type) {
+ // Link max_delta_up to max_delta_down and buffer status.
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ max_delta_up = AOMMAX(4, max_delta_down);
+ } else {
+ max_delta_up = AOMMAX(8, max_delta_down);
+ }
+ }
+ } else {
+ max_delta_down = (cpi->is_screen_content_type)
+ ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16))
+ : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ // If resolution changes or avg_frame_bandwidth significantly changed,
+ // then set this flag to indicate change in target bits per macroblock.
+ const int change_target_bits_mb =
+ cm->prev_frame &&
+ (width != cm->prev_frame->width || height != cm->prev_frame->height ||
+ change_avg_frame_bandwidth);
+ // Apply some control/clamp to QP under certain conditions.
+ // Delay the use of the clamping for svc until after num_temporal_layers,
+ // to make sure they have been set for each temporal layer.
+ if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 &&
+ (!cpi->ppi->use_svc ||
+ svc->current_superframe > (unsigned int)svc->number_temporal_layers) &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+ !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
+ // If in the previous two frames we have seen both overshoot and undershoot
+ // clamp Q between the two. Check for rc->q_1/2_frame > 0 in case they have
+ // not been set due to dropped frames.
+ if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
+ rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
+ rc->q_2_frame > 0 && !overshoot_buffer_low) {
+ int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+ AOMMAX(rc->q_1_frame, rc->q_2_frame));
+ // If the previous frame had overshoot and the current q needs to
+ // increase above the clamped value, reduce the clamp for faster reaction
+ // to overshoot.
+ if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10)
+ q = (q + qclamp) >> 1;
+ else
+ q = qclamp;
+ }
+ // Adjust Q base on source content change from scene detection.
+ if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+ rc->frames_since_key > 10 && rc->frame_source_sad > 0 &&
+ !cpi->rc.rtc_external_ratectrl) {
+ const int bit_depth = cm->seq_params->bit_depth;
+ // Relative change of the average source SAD versus the previous average.
+ double delta =
+ (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+ // Push Q downwards if content change is decreasing and buffer level
+ // is stable (at least 1/4-optimal level), so not overshooting. Do so
+ // only for high Q to avoid excess overshoot.
+ // Else reduce decrease in Q from previous frame if content change is
+ // increasing and buffer is below max (so not undershooting).
+ if (delta < 0.0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ q > (rc->worst_quality >> 1)) {
+ double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+ double q_val = av1_convert_qindex_to_q(q, bit_depth);
+ q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+ p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+ p_rc->optimal_buffer_level << 1)) {
+ q = (3 * q + rc->q_1_frame) >> 2;
+ }
+ }
+ // Limit the decrease in Q from previous frame.
+ if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down;
+ // Limit the increase in Q from previous frame.
+ else if (q - rc->q_1_frame > max_delta_up)
+ q = rc->q_1_frame + max_delta_up;
+ }
+ // Adjustment for temporal layers.
+ if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) {
+ if (svc->temporal_layer_id > 0) {
+ // Constrain enhancement relative to the previous base TL0.
+ // Get base temporal layer TL0.
+ const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the
+ // last TL0 frame.
+ if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth &&
+ q < lc->p_rc.last_q[INTER_FRAME] - 4)
+ q = lc->p_rc.last_q[INTER_FRAME] - 4;
+ } else if (cpi->svc.temporal_layer_id == 0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ rc->frame_source_sad < 100000) {
+ // Push base TL0 Q down if buffer is stable and frame_source_sad
+ // is below threshold.
+ int delta = (svc->number_temporal_layers == 2) ? 4 : 10;
+ q = q - delta;
+ }
+ }
+ // For non-svc (single layer): if resolution has increased push q closer
+ // to the active_worst to avoid excess overshoot.
+ if (!cpi->ppi->use_svc && cm->prev_frame &&
+ (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height))
+ q = (q + active_worst_quality) >> 1;
+ // For single layer RPS: Bias Q based on distance of closest reference.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ const int min_dist = av1_svc_get_min_ref_dist(cpi);
+ q = q - AOMMIN(min_dist, 20);
+ }
+ // Final clamp to the allowed quality range.
+ return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
+
+// Maps each frame update type (the array index, named in the trailing
+// comments) to the rate factor level used for it.
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+ KF_STD, // KF_UPDATE
+ INTER_NORMAL, // LF_UPDATE
+ GF_ARF_STD, // GF_UPDATE
+ GF_ARF_STD, // ARF_UPDATE
+ INTER_NORMAL, // OVERLAY_UPDATE
+ INTER_NORMAL, // INTNL_OVERLAY_UPDATE
+ GF_ARF_LOW, // INTNL_ARF_UPDATE
+};
+
+// Returns the rate factor level for the frame at `gf_frame_index` of the GF
+// group, looked up from the frame's update type via rate_factor_levels[].
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group,
+                                               int gf_frame_index) {
+  const FRAME_UPDATE_TYPE frame_update = gf_group->update_type[gf_frame_index];
+  // The update type is used as a table index, so it must be in range.
+  assert(frame_update < FRAME_UPDATE_TYPES);
+  const RATE_FACTOR_LEVEL level = rate_factor_levels[frame_update];
+  return level;
+}
+
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame. The factor is picked by rate factor level, read from
+ * the frame-level copy when the frame is encoded in parallel and from the
+ * primary rate control state otherwise, scaled by the resize rate factor and
+ * clamped to [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+                                         int height) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  // Frames with a parallel level above 0 use the frame-level factors.
+  const int use_frame_level_factors =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0;
+  RATE_FACTOR_LEVEL rf_lvl;
+
+  // Step 1: decide which rate factor level applies to this frame.
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    rf_lvl = KF_STD;
+  } else if (is_stat_consumption_stage(cpi)) {
+    rf_lvl = get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  } else if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+             !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+             (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+              cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+    rf_lvl = GF_ARF_STD;
+  } else {
+    rf_lvl = INTER_NORMAL;
+  }
+
+  // Step 2: read the factor for that level from the selected source.
+  double rcf = use_frame_level_factors
+                   ? rc->frame_level_rate_correction_factors[rf_lvl]
+                   : p_rc->rate_correction_factors[rf_lvl];
+
+  // Step 3: account for the current scaling and clamp.
+  rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * Stores a new value of the correction factor that is used to dynamically
+ * adjust the relationship between Q and the expected number of bits for
+ * the frame. The factor is first divided by the resize rate factor for the
+ * given dimensions and clamped to [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi              Top level encoder instance structure
+ * \param[in]   is_encode_stage  Indicates if recode loop or post-encode
+ * \param[in]   factor           New correction factor
+ * \param[in]   width            Frame width
+ * \param[in]   height           Frame height
+ *
+ * \remark Updates the rate correction factor for the current frame type,
+ *         either in cpi->ppi->p_rc or, for frame parallel frames during the
+ *         encode stage, in the frame level copy held in cpi->rc.
+ */
+static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage,
+                                       double factor, int width, int height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  RATE_FACTOR_LEVEL level;
+  // Non-zero when this level may be redirected to the frame level table.
+  int frame_level_allowed;
+
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    level = KF_STD;
+    frame_level_allowed = 0;
+  } else if (is_stat_consumption_stage(cpi)) {
+    level = get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    frame_level_allowed = 1;
+  } else if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+             !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+             (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+              cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+    level = GF_ARF_STD;
+    frame_level_allowed = 0;
+  } else {
+    level = INTER_NORMAL;
+    frame_level_allowed = 1;
+  }
+
+  // During the encode stage of a frame parallel frame only the frame level
+  // factor is touched; in every other case the primary factor is updated.
+  if (frame_level_allowed && is_encode_stage &&
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+    rc->frame_level_rate_correction_factors[level] = factor;
+  } else {
+    p_rc->rate_correction_factors[level] = factor;
+  }
+}
+
+// Updates the rate vs Q correction factor from the outcome of the frame that
+// was just coded.
+//
+// The size the frame was expected to have at its base qindex (using the
+// current correction factor) is compared with cpi->rc.projected_frame_size.
+// The resulting ratio is dampened and folded into the stored factor through
+// set_rate_correction_factor(). As side effects the function also maintains
+// the two-frame q / over-undershoot history in cpi->rc (q_1_frame, q_2_frame,
+// rc_1_frame, rc_2_frame) and, when cyclic refresh is active, nudges the
+// cyclic refresh adjustment fields.
+//
+// cpi:             top level encoder instance.
+// is_encode_stage: forwarded to set_rate_correction_factor().
+// width, height:   frame dimensions used for the size-dependent scaling.
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ double correction_factor = 1.0;
+ double rate_correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ double adjustment_limit;
+ int projected_size_based_on_q = 0;
+ int cyclic_refresh_active =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Don't update rate correction factors here on scene changes as
+ // it is already reset in av1_encodedframe_overshoot_cbr(),
+ // but reset variables related to previous frame q and size.
+ // Note that the counter of frames since the last scene change
+ // is only valid when cyclic refresh mode is enabled and that
+ // this break out only applies to scene changes that are not
+ // recorded as INTRA only key frames.
+ if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) &&
+ (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) &&
+ !frame_is_intra_only(cm) && !cpi->ppi->use_svc) {
+ cpi->rc.q_2_frame = cm->quant_params.base_qindex;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ return;
+ }
+
+ // NOTE(review): an earlier comment here referred to clearing down MMX
+ // registers before the floating point code below, but no such call is
+ // present at this point; it looks like a leftover.
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cyclic_refresh_active) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi, cm->quant_params.base_qindex, rate_correction_factor);
+ }
+ // Work out a size correction factor.
+ // (Left at 1.0 when the projection does not exceed FRAME_OVERHEAD_BITS.)
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (double)cpi->rc.projected_frame_size /
+ (double)projected_size_based_on_q;
+
+ // Clamp correction factor to prevent anything too extreme
+ // (only a lower bound of 0.25 is applied here).
+ correction_factor = AOMMAX(correction_factor, 0.25);
+
+ // Shift the one-frame history into the two-frame slots, then record the
+ // current q and whether the projection was exceeded (-1), undershot (1) or
+ // roughly matched (0).
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 1.1)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 0.9)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Decide how heavily to dampen the adjustment
+ // The limit grows with |log10(correction_factor)|, capped at 0.5, and uses a
+ // smaller slope for screen content.
+ if (correction_factor > 0.0) {
+ if (cpi->is_screen_content_type) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ } else {
+ adjustment_limit =
+ 0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ }
+ } else {
+ // NOTE(review): correction_factor was raised to at least 0.25 above, so
+ // this branch does not appear to be reachable -- confirm.
+ adjustment_limit = 0.75;
+ }
+
+ // Adjustment to delta Q and number of blocks updated in cyclic refresh
+ // based on over or under shoot of target in current frame.
+ if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ if (correction_factor > 1.25) {
+ // Frame was larger than projected: step both adjustments down, bounded
+ // below by -5 and by zero respectively.
+ cr->percent_refresh_adjustment =
+ AOMMAX(cr->percent_refresh_adjustment - 1, -5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0);
+ } else if (correction_factor < 0.5) {
+ // Frame was much smaller than projected: step both adjustments up,
+ // bounded above by 5 and 0.25 respectively.
+ cr->percent_refresh_adjustment =
+ AOMMIN(cr->percent_refresh_adjustment + 1, 5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25);
+ }
+ }
+
+ if (correction_factor > 1.01) {
+ // We are not already at the worst allowable quality
+ // Scale the excess over 1.0 by the dampening limit before applying it.
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ rate_correction_factor = rate_correction_factor * correction_factor;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 0.99) {
+ // We are not already at the best allowable quality
+ // Dampen in the reciprocal domain so that shrinking mirrors growing.
+ correction_factor = 1.0 / correction_factor;
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ correction_factor = 1.0 / correction_factor;
+
+ rate_correction_factor = rate_correction_factor * correction_factor;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor,
+ width, height);
+}
+
+// Estimates the number of bits per macroblock at q index 'q' for the given
+// correction factor. The cyclic refresh rate model is used when
+// 'use_cyclic_refresh' is non-zero, the regular model otherwise.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+                           double correction_factor, int q) {
+  if (use_cyclic_refresh) {
+    return av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor);
+  }
+
+  const AV1_COMMON *const cm = &cpi->common;
+  return av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q,
+                            correction_factor,
+                            cpi->sf.hl_sf.accurate_bit_estimate);
+}
+
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in] desired_bits_per_mb Target bits per mb
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] correction_factor Current Q to rate correction factor
+ * \param[in] best_qindex Min allowed Q value.
+ * \param[in] worst_qindex Max allowed Q value.
+ *
+ * \return Returns the q index in [best_qindex, worst_qindex] whose estimated
+ * bits per mb is closest to \c desired_bits_per_mb
+ */
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+ const AV1_COMP *cpi,
+ double correction_factor,
+ int best_qindex, int worst_qindex) {
+ const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh;
+
+ // Find 'qindex' based on 'desired_bits_per_mb'.
+ // Binary search for the lowest q index whose estimated rate does not exceed
+ // the target (worst_qindex if none qualifies).
+ // NOTE(review): this relies on the estimated rate not increasing as q
+ // grows -- confirm against the rate model.
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+
+ // Calculate rate difference of this q index from the desired rate.
+ // INT_MAX marks the case where even this q index overshoots the target.
+ const int curr_q = low;
+ const int curr_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+ const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+ ? desired_bits_per_mb - curr_bits_per_mb
+ : INT_MAX;
+ assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+ curr_q == worst_qindex);
+
+ // Calculate rate difference for previous q index too.
+ // The previous index is not considered when the current one overshoots or
+ // when it would fall below best_qindex.
+ const int prev_q = curr_q - 1;
+ int prev_bit_diff;
+ if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+ prev_bit_diff = INT_MAX;
+ } else {
+ const int prev_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+ assert(prev_bits_per_mb > desired_bits_per_mb);
+ prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+ }
+
+ // Pick one of the two q indices, depending on which one has rate closer to
+ // the desired rate.
+ // Ties go to curr_q, the higher of the two indices.
+ return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q;
+}
+
+// Picks the q index in [active_best_quality, active_worst_quality] whose
+// estimated rate is closest to 'target_bits_per_frame' for a frame of the
+// given dimensions. In CBR mode without a stats stage the result is further
+// processed by adjust_q_cbr().
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height) {
+  const int num_mbs = av1_get_MBs(width, height);
+  const double rcf = get_rate_correction_factor(cpi, width, height);
+  // Scale the frame target in 64 bits before dividing by the MB count.
+  const uint64_t scaled_frame_target = (uint64_t)target_bits_per_frame
+                                       << BPER_MB_NORMBITS;
+  const int target_bits_per_mb = (int)(scaled_frame_target / num_mbs);
+
+  const int closest_q = find_closest_qindex_by_rate(
+      target_bits_per_mb, cpi, rcf, active_best_quality, active_worst_quality);
+
+  const int is_cbr_without_stats =
+      cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi);
+  if (!is_cbr_without_stats) return closest_q;
+
+  return adjust_q_cbr(cpi, closest_q, active_worst_quality, width, height);
+}
+
+// Returns a minimum q value for index 'q' by blending two lookup tables
+// according to where 'gfu_boost' falls relative to the [low, high] range:
+//   - above 'high' the low motion table entry is returned,
+//   - below 'low' the high motion table entry is returned,
+//   - in between, the result is interpolated linearly (with rounding) between
+//     the two entries.
+// A degenerate range (low == high == gfu_boost) returns the low motion entry
+// instead of dividing by a zero-width gap.
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  if (gfu_boost > high) {
+    return low_motion_minq[q];
+  } else if (gfu_boost < low) {
+    return high_motion_minq[q];
+  } else {
+    // Here low <= gfu_boost <= high, so gap >= 0.
+    const int gap = high - low;
+    // A zero gap would make the interpolation below divide by zero; the
+    // offset from 'high' is zero as well, so no adjustment is applied.
+    if (gap == 0) return low_motion_minq[q];
+    const int offset = high - gfu_boost;
+    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+    // (gap >> 1) rounds the integer division to the nearest value.
+    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+    return low_motion_minq[q] + adjustment;
+  }
+}
+
+// Returns the active quality for a key frame at q index 'q': the kf low and
+// high motion minq tables are blended by get_active_quality() using
+// p_rc->kf_boost against the kf_low / kf_high thresholds.
+static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+ aom_bit_depth_t bit_depth) {
+ // NOTE(review): ASSIGN_MINQ_TABLE is defined outside this view; it
+ // presumably points the named local at the table variant matching
+ // bit_depth, so the local names are likely significant -- confirm before
+ // renaming them.
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+// Returns the active quality for a golden / alt-ref frame at q index 'q':
+// the arfgf low and high motion minq tables are blended by
+// get_active_quality() using 'gfu_boost' against the gf_low / gf_high
+// thresholds. Takes the boost directly rather than a rate control struct.
+static int get_gf_active_quality_no_rc(int gfu_boost, int q,
+ aom_bit_depth_t bit_depth) {
+ // NOTE(review): ASSIGN_MINQ_TABLE is defined outside this view; it
+ // presumably points the named local at the table variant matching
+ // bit_depth, so the local names are likely significant -- confirm before
+ // renaming them.
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+// Same as get_gf_active_quality_no_rc(), with the boost taken from
+// p_rc->gfu_boost.
+static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+                                 aom_bit_depth_t bit_depth) {
+  const int gfu_boost = p_rc->gfu_boost;
+  return get_gf_active_quality_no_rc(gfu_boost, q, bit_depth);
+}
+
+// Returns the entry for q index 'q' from the arfgf high motion minq table
+// selected for 'bit_depth'.
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ // NOTE(review): ASSIGN_MINQ_TABLE is defined outside this view; the local
+ // name is likely significant to the macro -- confirm before renaming it.
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+
+// Derives the active worst quality for the no-stats VBR path from the q of
+// the last key / inter frame. The result never exceeds rc->worst_quality.
+static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const unsigned int frame_num = cpi->common.current_frame.frame_number;
+#if CONFIG_FPMT_TEST
+  // In parallel simulation mode the temporary copies of last_q are used for
+  // frames with a non-zero frame parallel level.
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  const int kf_q = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME]
+                                           : p_rc->last_q[KEY_FRAME];
+  const int inter_q = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME]
+                                              : p_rc->last_q[INTER_FRAME];
+#else
+  const int kf_q = p_rc->last_q[KEY_FRAME];
+  const int inter_q = p_rc->last_q[INTER_FRAME];
+#endif
+  int worst_q;
+
+  if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+    // The very first frame starts from the configured worst quality.
+    worst_q = (frame_num == 0) ? rc->worst_quality : kf_q * 2;
+  } else {
+    const int refreshes_boosted_ref =
+        !rc->is_src_frame_alt_ref &&
+        (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+         refresh_frame->alt_ref_frame);
+    if (refreshes_boosted_ref) {
+      worst_q = (frame_num == 1) ? kf_q * 5 / 4 : inter_q;
+    } else {
+      // Frame 1 has only the key frame q to go on.
+      const int reference_q = (frame_num == 1) ? kf_q : inter_q;
+      worst_q = reference_q * 2;
+    }
+  }
+  return AOMMIN(worst_q, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+// Returns rc->worst_quality for key frames; otherwise a value derived from an
+// "ambient" q (recent average qindex) that is lowered when the buffer is above
+// its optimal level and raised towards worst_quality as the buffer drops
+// towards the critical level.
+static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const SVC *const svc = &cpi->svc;
+ // Number of frames after frame 0 for which the key frame q is weighed in.
+ unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
+ // Buffer level below which we push active_worst to worst_quality.
+ // (One eighth of the optimal buffer level.)
+ int64_t critical_level = p_rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality;
+ // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+ // for the first few frames following key frame. These are both initialized
+ // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+ // So for first few frames following key, the qp of that key frame is weighted
+ // into the active_worst_quality setting. For SVC the key frame should
+ // correspond to layer (0, 0), so use that for layer context.
+ int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+ if (svc->number_temporal_layers > 1) {
+ int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ avg_qindex_key =
+ AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+ }
+ ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+ ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
+ : p_rc->avg_frame_qindex[INTER_FRAME];
+ ambient_qp = AOMMIN(rc->worst_quality, ambient_qp);
+
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ // Adjust down.
+ int max_adjustment_down; // Maximum adjustment down for Q
+
+ // Screen content with cyclic refresh (non-SVC) starts from ambient_qp and
+ // allows only a small reduction; other cases start from 5/4 of ambient_qp
+ // and allow a reduction of up to one third.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc &&
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ max_adjustment_down = AOMMIN(4, active_worst_quality / 16);
+ } else {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ max_adjustment_down = active_worst_quality / 3;
+ }
+
+ if (max_adjustment_down) {
+ // Buffer span that corresponds to one q step of reduction; the zero
+ // checks guard the divisions.
+ buff_lvl_step =
+ ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (p_rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ if (critical_level) {
+ buff_lvl_step = (p_rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ // Interpolate between ambient_qp and worst_quality in proportion to
+ // how far the buffer has dropped below the optimal level.
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (p_rc->optimal_buffer_level - p_rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality += adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
+
+// Calculate the active_best_quality level.
+// Starts from rc->best_quality and refines it depending on whether the frame
+// is intra only, a boosted golden / alt-ref refresh, or a regular frame.
+// 'active_worst_quality' caps the q index used for the table lookups;
+// 'width' and 'height' are only used for the small-format key frame tweak.
+static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi,
+ int active_worst_quality,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ // NOTE(review): ASSIGN_MINQ_TABLE is defined outside this view; the local
+ // name rtc_minq is likely significant to the macro -- confirm before
+ // renaming it.
+ int *rtc_minq;
+ const int bit_depth = cm->seq_params->bit_depth;
+ int active_best_quality = rc->best_quality;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (p_rc->this_key_frame_forced) {
+ int qindex = p_rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (current_frame->frame_number > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ // Otherwise (first frame, not forced) rc->best_quality is kept.
+ } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ int q = active_worst_quality;
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ // For frame numbers 0 and 1 the key frame average is used instead of the
+ // inter frame average.
+ FRAME_TYPE frame_type =
+ (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+ if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality)
+ active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ return active_best_quality;
+}
+
+#if RT_PASSIVE_STRATEGY
+// Averages the q values recorded in p_rc->q_history for up to
+// MAX_Q_HISTORY - 1 preceding frames. Returns that average when it lies
+// within 'threshold' of 'q_candidate'; otherwise (or when no history is
+// available) returns 'q_candidate' unchanged.
+static int get_q_passive_strategy(const AV1_COMP *const cpi,
+                                  const int q_candidate, const int threshold) {
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const CurrentFrame *const current_frame = &cpi->common.current_frame;
+  int q_total = 0;
+  int num_samples = 0;
+
+  for (int frames_back = 1; frames_back < MAX_Q_HISTORY; ++frames_back) {
+    const int frame_id = current_frame->frame_number - frames_back;
+    // Stop once the walk reaches the start of the sequence.
+    if (frame_id <= 0) break;
+    q_total += p_rc->q_history[frame_id % MAX_Q_HISTORY];
+    ++num_samples;
+  }
+
+  if (num_samples == 0) return q_candidate;
+
+  const int avg_q = q_total / num_samples;
+  return (abs(avg_q - q_candidate) <= threshold) ? avg_q : q_candidate;
+}
+#endif  // RT_PASSIVE_STRATEGY
+
+/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ *   NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       width        Coded frame width
+ * \param[in]       height       Coded frame height
+ * \param[out]      bottom_index Bottom bound for q index (best quality)
+ * \param[out]      top_index    Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
+                                             int height, int *bottom_index,
+                                             int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  int worst_q = calc_active_worst_quality_no_stats_cbr(cpi);
+  int best_q =
+      calc_active_best_quality_no_stats_cbr(cpi, worst_q, width, height);
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CBR);
+
+  // Keep best within the configured range, and worst no better than best.
+  best_q = clamp(best_q, rc->best_quality, rc->worst_quality);
+  worst_q = clamp(worst_q, best_q, rc->worst_quality);
+
+  *top_index = worst_q;
+  *bottom_index = best_q;
+
+  // Limit Q range for the adaptive loop: key frames that are neither forced
+  // nor the first frame get their top bound shifted by a rate-based delta.
+  const int is_unforced_later_key_frame =
+      current_frame->frame_type == KEY_FRAME &&
+      !p_rc->this_key_frame_forced && current_frame->frame_number != 0;
+  if (is_unforced_later_key_frame) {
+    const int qdelta = av1_compute_qdelta_by_rate(
+        cpi, current_frame->frame_type, worst_q, 2.0);
+    *top_index = worst_q + qdelta;
+    *top_index = AOMMAX(*top_index, *bottom_index);
+  }
+
+  int q = av1_rc_regulate_q(cpi, rc->this_frame_target, best_q, worst_q, width,
+                            height);
+#if RT_PASSIVE_STRATEGY
+  if (current_frame->frame_type != KEY_FRAME &&
+      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    q = get_q_passive_strategy(cpi, q, 50);
+  }
+#endif  // RT_PASSIVE_STRATEGY
+  if (q > *top_index) {
+    // Special case when we are targeting the max allowed rate: widen the
+    // bound to q. Otherwise pull q back to the bound.
+    const int targeting_max_rate =
+        rc->this_frame_target >= rc->max_frame_bandwidth;
+    if (targeting_max_rate) {
+      *top_index = q;
+    } else {
+      q = *top_index;
+    }
+  }
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+// Returns the layer depth stored in the GF group for the frame at 'gf_index'.
+static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) {
+  const int depth = gf_group->layer_depth[gf_index];
+  return depth;
+}
+
+// Computes the cq level to use for the current frame, starting from the
+// configured rc_cfg.cq_level:
+//  - In CQ / Q mode with QTHRESH or AUTO superres and a scaled frame
+//    (superres_denom != SCALE_NUMERATOR) the level is lowered in proportion
+//    to the scaling denominator, never going below 0.
+//  - In CQ mode, while the bits spent so far are below 10% of the target
+//    bits, the level is scaled down proportionally.
+static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const PRIMARY_RATE_CONTROL *p_rc,
+                               const AV1EncoderConfig *const oxcf,
+                               int intra_only, aom_superres_mode superres_mode,
+                               int superres_denom) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  static const double cq_adjust_threshold = 0.1;
+  int cq_level = rc_cfg->cq_level;
+
+  const int is_q_based_mode =
+      rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q;
+  const int is_superres_scaled =
+      (superres_mode == AOM_SUPERRES_QTHRESH ||
+       superres_mode == AOM_SUPERRES_AUTO) &&
+      superres_denom != SCALE_NUMERATOR;
+
+  if (is_q_based_mode && is_superres_scaled) {
+    // Per-denominator-step q reduction, chosen by frame kind.
+    int mult;
+    if (!intra_only) {
+      mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME;
+    } else if (rc->frames_to_key <= 1) {
+      // Intra only frame with at most one frame to the next key frame: no
+      // adjustment.
+      mult = 0;
+    } else {
+      mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME;
+    }
+    cq_level =
+        AOMMAX(cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
+  }
+
+  if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) {
+    const double bits_ratio =
+        (double)p_rc->total_actual_bits / p_rc->total_target_bits;
+    if (bits_ratio < cq_adjust_threshold) {
+      cq_level = (int)(cq_level * bits_ratio / cq_adjust_threshold);
+    }
+  }
+  return cq_level;
+}
+
+/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Any rate control other than constant bit-rate mode:
+ * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * The function first derives active best / worst quality bounds from the
+ * frame kind and rate control mode, clamps them to the configured range,
+ * widens the top bound for key and golden / alt-ref frames, and finally
+ * selects q inside those bounds.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+
+ assert(has_no_stats_stage(cpi));
+ assert(rc_mode == AOM_VBR ||
+ (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+ rc_mode == AOM_Q);
+
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
+ int q;
+ // NOTE(review): ASSIGN_MINQ_TABLE is defined outside this view; the local
+ // name inter_minq is likely significant to the macro -- confirm before
+ // renaming it.
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ // Step 1: choose active_best_quality for the frame kind and rc mode.
+ if (frame_is_intra_only(cm)) {
+ if (rc_mode == AOM_Q) {
+ // Q mode: derive the bound from cq_level via a q delta towards 0.25 * q.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (p_rc->this_key_frame_forced) {
+ // Forced key frame: base the bound on the last boosted qindex.
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int qindex = p_rc->last_boosted_qindex;
+#endif
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ // NOTE(review): when the condition fails, the key frame average qindex is
+ // used rather than active_worst_quality itself -- confirm this is
+ // intended.
+ q = (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? p_rc->avg_frame_qindex[INTER_FRAME]
+ : p_rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality dont allow Q less than the cq level
+ if (rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (rc_mode == AOM_Q) {
+ // Q mode: alt-ref refreshes use a stronger q reduction (0.40) than
+ // golden-only refreshes (0.50).
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (refresh_frame->alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ }
+ } else {
+ if (rc_mode == AOM_Q) {
+ // Q mode: the q scaling cycles through delta_rate with the frame number.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL],
+ bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality =
+ (current_frame->frame_number > 1)
+ ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Step 2: clamp both bounds and publish them.
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ // Unforced key frames after frame 0 and golden / alt-ref refreshes get the
+ // top bound shifted by a rate-based q delta; other frames keep qdelta == 0.
+ {
+ int qdelta = 0;
+ if (current_frame->frame_type == KEY_FRAME &&
+ !p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 1.75);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ // Step 3: select q.
+ if (rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((current_frame->frame_type == KEY_FRAME) &&
+ p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ q = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ q = p_rc->last_boosted_qindex;
+#endif
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+// Rate factors looked up by ARF layer depth; the values fall from 2.50 at
+// index 0 to 1.0 at the last initialised entry.
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = {
+  2.50, 2.00, 1.75, 1.50, 1.25, 1.15, 1.0
+};
+
+// Returns the qindex delta for the frame at cpi->gf_frame_index, relative to
+// |q|. Frames whose rate factor level is INTER_NORMAL use a rate factor of
+// 1.0; all other frames use the table entry for their layer depth.
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  double rate_factor = 1.0;
+
+  if (get_rate_factor_level(gf_group, cpi->gf_frame_index) != INTER_NORMAL) {
+    // Cap the layer depth at 6 before the table lookup.
+    const int arf_layer =
+        AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+    rate_factor = arf_layer_deltas[arf_layer];
+  }
+
+  return av1_compute_qdelta_by_rate(
+      cpi, gf_group->frame_type[cpi->gf_frame_index], q, rate_factor);
+}
+
+// This unrestricted Q selection on CQ mode is useful when testing new features,
+// but may lead to Q being out of range on current RC restrictions
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+// Picks q for CQ mode when there is no stats stage, without applying the
+// usual rate control bounds. The active cq level is converted with
+// av1_convert_qindex_to_q() and the truncated result is returned as q and
+// also stored in both *bottom_index and *top_index.
+// |width| and |height| are accepted for signature parity but unused.
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
+                                            int height, int *bottom_index,
+                                            int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Pass p_rc as well, matching the other get_active_cq_level() call sites
+  // in this file; the previous five-argument call no longer matched them.
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params->bit_depth;
+  const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+  (void)width;
+  (void)height;
+  assert(has_no_stats_stage(cpi));
+  assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
+  *top_index = q;
+  *bottom_index = q;
+
+  return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+// Computes the active best/worst qindex bounds for an intra frame.
+// *active_worst is read as the starting worst bound; both *active_best and
+// *active_worst are written on return. Three cases are handled:
+//  1. AOM_Q mode with rc->frames_to_key <= 1: both bounds become cq_level.
+//  2. Forced key frame: bounds are derived from the last boosted qindex.
+//  3. Otherwise: the best bound starts from get_kf_active_quality() and is
+//     then adjusted for screen content, static content, frame size and
+//     superres.
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int *active_best, int *active_worst,
+ int cq_level) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (p_rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int last_boosted_qindex = simulate_parallel_frame
+ ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ // Best bound is the lower of the last key frame and last boosted
+ // qindex; the worst bound is capped at the qindex delta that
+ // corresponds to 1.25x that q.
+ qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ // Only the best bound changes here: it is derived from half of the
+ // last boosted q and floored at rc->best_quality.
+ qindex = last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
+ if (cpi->is_screen_content_type) {
+ active_best_quality /= 2;
+ }
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 3;
+ }
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ if (is_stat_consumption_stage_twopass(cpi))
+ q_adj_factor +=
+ 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+ // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+ // will be used directly as 'q' later.
+ if (oxcf->rc_cfg.mode == AOM_Q &&
+ (cpi->superres_mode == AOM_SUPERRES_QTHRESH ||
+ cpi->superres_mode == AOM_SUPERRES_AUTO) &&
+ cm->superres_scale_denominator != SCALE_NUMERATOR) {
+ active_best_quality =
+ AOMMAX(active_best_quality -
+ ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+ SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+ 0);
+ }
+ }
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+// Applies the final adjustments to the active best/worst qindex bounds and
+// clamps them to [rc->best_quality, rc->worst_quality].
+// Both *active_worst and *active_best are read as inputs and overwritten
+// with the adjusted values. Note the parameter order: worst comes before
+// best.
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+ const int is_intrl_arf_boost,
+ int *active_worst,
+ int *active_best) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ int active_best_quality = *active_best;
+ int active_worst_quality = *active_worst;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq
+ : cpi->ppi->twopass.extend_minq;
+ int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq
+ : cpi->ppi->twopass.extend_maxq;
+#endif
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ // NOTE(review): the CONFIG_FPMT_TEST branches subtract extend_minq and
+ // extend_minq / 2, while the #else branches subtract extend_minq / 4 in
+ // both cases. Confirm whether the two builds are meant to differ.
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || is_intrl_arf_boost ||
+ refresh_frame->alt_ref_frame))) {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq;
+ active_worst_quality += (extend_maxq / 2);
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
+#endif
+ } else {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq / 2;
+ active_worst_quality += extend_maxq;
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += cpi->ppi->twopass.extend_maxq;
+#endif
+ }
+ }
+
+#ifndef STRICT_RC
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced ||
+ (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ // Shift the worst bound by the frame-type qdelta, but never below the
+ // current best bound.
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ active_best_quality, 2.0);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ // Clamp best first; worst is then clamped so it is never below best.
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+/*!\brief Gets a Q value to use for the current frame
+ *
+ *
+ * Selects a Q value from a permitted range that we estimate
+ * will result in approximately the target number of bits.
+ *
+ * The result may exceed \c active_worst_quality when the frame target is at
+ * or above \c rc->max_frame_bandwidth; the caller handles that case.
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi                   Top level encoder instance structure
+ * \param[in]   width                 Width of frame
+ * \param[in]   height                Height of frame
+ * \param[in]   active_worst_quality  Max Q allowed
+ * \param[in]   active_best_quality   Min Q allowed
+ *
+ * \return The suggested Q for this frame.
+ */
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+                 const int active_worst_quality,
+                 const int active_best_quality) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int q;
+#if CONFIG_FPMT_TEST
+  // Compare against PARALLEL_SIMULATION_ENCODE explicitly, as every other
+  // simulate_parallel_frame computation in this file does, rather than
+  // treating any non-zero configuration value as simulation.
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int last_boosted_qindex = simulate_parallel_frame
+                                ? p_rc->temp_last_boosted_qindex
+                                : p_rc->last_boosted_qindex;
+#else
+  int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
+      (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced &&
+       cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+       rc->frames_to_key > 1)) {
+    // Fixed-Q mode, or a non-forced intra frame at the head of a static
+    // key frame group: use the best bound directly.
+    q = active_best_quality;
+    // Special case code to try and match quality with forced key frames.
+  } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) {
+    // If static since last kf use better of last boosted and last kf q.
+    if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+    } else {
+      q = AOMMIN(last_boosted_qindex,
+                 (active_best_quality + active_worst_quality) / 2);
+    }
+    q = clamp(q, active_best_quality, active_worst_quality);
+  } else {
+    q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+                          active_worst_quality, width, height);
+    if (q > active_worst_quality) {
+      // Special case when we are targeting the max allowed rate: only pull
+      // q back to the worst bound when the target is below the maximum.
+      if (rc->this_frame_target < rc->max_frame_bandwidth) {
+        q = active_worst_quality;
+      }
+    }
+    q = AOMMAX(q, active_best_quality);
+  }
+  return q;
+}
+
+// Computes |active_best_quality| for an inter frame at |gf_index|.
+// The value depends on the rate control mode (VBR, Q, CQ, CBR) and on
+// whether the frame is a leaf/overlay frame or a boosted (GF/ARF/internal
+// ARF) frame. The caller may adjust the result further in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+                                   const int active_worst_quality,
+                                   const int cq_level, const int gf_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const enum aom_rc_mode rc_mode = cpi->oxcf.rc_cfg.mode;
+  const int bit_depth = cm->seq_params->bit_depth;
+  int *inter_minq;
+  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+  const int is_internal_arf =
+      gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+  // TODO(jingning): Consider to rework this hack that covers issues incurred
+  // in lightfield setting.
+  // With large-scale tiles the refresh flags decide whether the frame is
+  // boosted; otherwise the GF group update type does.
+  int is_boosted;
+  if (cm->tiles.large_scale) {
+    is_boosted = refresh_frame->golden_frame ||
+                 refresh_frame->alt_ref_frame || is_internal_arf;
+  } else {
+    is_boosted = gf_group->update_type[gf_index] == ARF_UPDATE ||
+                 gf_group->update_type[gf_index] == GF_UPDATE ||
+                 is_internal_arf;
+  }
+
+  // Leaf frames and overlay frames.
+  if (!is_boosted || rc->is_src_frame_alt_ref) {
+    if (rc_mode == AOM_Q) return cq_level;
+
+    const int minq = inter_minq[active_worst_quality];
+    // In constrained quality mode q must not fall below the cq level.
+    return (rc_mode == AOM_CQ && minq < cq_level) ? cq_level : minq;
+  }
+
+  // Boosted frames: start from the lower of active_worst_quality and the
+  // recent average inter Q, unless the last frame was a key frame.
+  int base_q = active_worst_quality;
+  if (rc->frames_since_key > 1) {
+    const int avg_inter_q = p_rc->avg_frame_qindex[INTER_FRAME];
+    if (avg_inter_q < active_worst_quality) base_q = avg_inter_q;
+  }
+  if (rc_mode == AOM_CQ && base_q < cq_level) base_q = cq_level;
+
+  int best_q = get_gf_active_quality(p_rc, base_q, bit_depth);
+  // Constrained quality uses a slightly lower active best.
+  if (rc_mode == AOM_CQ) best_q = best_q * 15 / 16;
+
+  // Scale the distance from the high-motion quality by the ARF boost factor.
+  const int high_motion_q = get_gf_high_motion_quality(base_q, bit_depth);
+  const int q_gap = high_motion_q - best_q;
+  best_q = high_motion_q - (int)(q_gap * p_rc->arf_boost_factor);
+  if (!is_internal_arf) return best_q;
+
+  // Internal ARFs: average towards the worst bound once per pyramid level
+  // above the first, starting from the stored ARF q in Q and CQ modes.
+  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) best_q = p_rc->arf_q;
+  for (int level = gf_group_pyramid_level(gf_group, gf_index); level > 1;
+       --level) {
+    best_q = (best_q + active_worst_quality + 1) / 2;
+  }
+  return best_q;
+}
+
+// Returns the q_index for a single frame in the GOP, assuming
+// rc_mode == AOM_Q:
+//  - leaf and overlay frames use |base_q_index|;
+//  - internal ARFs start at |arf_q| and are averaged (rounding up) with
+//    |base_q_index| once for every pyramid level above 1;
+//  - every other update type uses |arf_q|.
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q) {
+  switch (gf_update_type) {
+    case LF_UPDATE:
+    case OVERLAY_UPDATE:
+    case INTNL_OVERLAY_UPDATE: return base_q_index;
+    case INTNL_ARF_UPDATE: break;
+    default: return arf_q;
+  }
+
+  int q_index = arf_q;
+  for (int level = gf_pyramid_level; level > 1; --level) {
+    q_index = (q_index + base_q_index + 1) / 2;
+  }
+  return q_index;
+}
+
+// Returns the q_index for the ARF in the GOP: the gap between the
+// high-motion quality and the boost-derived quality for |base_q_index| is
+// scaled by |arf_boost_factor| and subtracted from the high-motion quality.
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor) {
+  const int boosted_q =
+      get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth);
+  const int high_motion_q = get_gf_high_motion_quality(base_q_index, bit_depth);
+  const int q_gap = high_motion_q - boosted_q;
+
+  return high_motion_q - (int)(q_gap * arf_boost_factor);
+}
+
+// Picks q and its bounds for AOM_Q mode. The best bound comes from
+// get_intra_q_and_bounds() for intra frames and get_active_best_quality()
+// otherwise; q is that best bound limited to
+// [rc->best_quality, rc->worst_quality]. *bottom_index and *top_index
+// receive the best and worst bounds limited to the same range.
+static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width,
+                                       int height, int gf_index,
+                                       int *bottom_index, int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  int worst_q = rc->active_worst_quality;
+  int best_q = 0;
+
+  if (!frame_is_intra_only(cm)) {
+    // Active best quality limited by previous layer.
+    best_q = get_active_best_quality(cpi, worst_q, cq_level, gf_index);
+  } else {
+    get_intra_q_and_bounds(cpi, width, height, &best_q, &worst_q, cq_level);
+  }
+
+  // A non-zero cq level keeps the best bound at 1 or above.
+  if (cq_level > 0 && best_q < 1) best_q = 1;
+
+  // Raise each value to at least best_quality, then cap at worst_quality.
+  *top_index = AOMMIN(AOMMAX(worst_q, rc->best_quality), rc->worst_quality);
+  *bottom_index = AOMMIN(AOMMAX(best_q, rc->best_quality), rc->worst_quality);
+
+  // In Q mode the chosen q is simply the limited best bound.
+  const int q = AOMMIN(AOMMAX(best_q, rc->best_quality), rc->worst_quality);
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+  return q;
+}
+
+/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
+ *
+ * Handles the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * AOM_Q mode is delegated to rc_pick_q_and_bounds_q_mode(). For the other
+ * modes the bounds are derived per frame type, adjusted by
+ * adjust_active_best_and_worst_quality(), and q is chosen by get_q().
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ // With no stats stage this function is only expected for AOM_Q mode on
+ // non-ARF frames.
+ assert(IMPLIES(has_no_stats_stage(cpi),
+ cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ gf_group->update_type[gf_index] != ARF_UPDATE));
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+
+ if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+ active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+ cq_level, gf_index);
+ } else {
+ // Start one step above the best quality recorded for the previous
+ // pyramid level.
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int local_active_best_quality =
+ simulate_parallel_frame
+ ? p_rc->temp_active_best_quality[pyramid_level - 1]
+ : p_rc->active_best_quality[pyramid_level - 1];
+ active_best_quality = local_active_best_quality + 1;
+#else
+ active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+ // Cap at the worst bound, then move part of the way towards it
+ // (1/16 of the gap under STRICT_RC, half of it otherwise).
+ active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+ active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+ active_best_quality += (active_worst_quality - active_best_quality) / 2;
+#endif
+ }
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+ // worst allowed quality as well. This insures that even on hard
+ // sections we dont clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
+ if (!(rc->is_src_frame_alt_ref) &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
+ is_intrl_arf_boost)) {
+ active_worst_quality =
+ (active_best_quality + (3 * active_worst_quality) + 2) / 4;
+ }
+ }
+
+ // Note the argument order: worst bound first, best bound second.
+ adjust_active_best_and_worst_quality(
+ cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+ q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+ q > active_worst_quality) {
+ active_worst_quality = q;
+ }
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+// Accumulates, over 64x64 blocks, the sse reported by the BLOCK_64X64
+// variance function between a 4x4-averaged copy of the unscaled source luma
+// and the LAST_FRAME reference luma, and stores the total in cpi->rec_sse
+// (forced to at least 1 when any block was processed).
+// Returns early, leaving cpi->rec_sse untouched, when the unscaled source is
+// NULL, when a scaled LAST_FRAME reference is in use, or when the first two
+// plane dimensions of source and reference differ.
+static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source;
+ if (unscaled_src == NULL) return;
+
+ const uint8_t *src_y = unscaled_src->y_buffer;
+ const int src_ystride = unscaled_src->y_stride;
+ // NOTE(review): yv12 is dereferenced without a NULL check; confirm that
+ // get_ref_frame_yv12_buf() cannot return NULL on this path.
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const uint8_t *pre_y = yv12->buffers[0];
+ const int pre_ystride = yv12->strides[0];
+
+ // TODO(yunqing): support scaled reference frames.
+ if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return;
+
+ // Source and reference must have matching dimensions in widths[]/heights[]
+ // entries 0 and 1.
+ for (int i = 0; i < 2; ++i) {
+ if (unscaled_src->widths[i] != yv12->widths[i] ||
+ unscaled_src->heights[i] != yv12->heights[i]) {
+ return;
+ }
+ }
+
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ int num_samples = 0;
+ // sse is computed on 64x64 blocks
+ // With 128x128 superblocks mib_size is halved so the step is still 64x64.
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+
+ uint64_t fsse = 0;
+ cpi->rec_sse = 0;
+
+ // NOTE(review): every iteration reads a full 64x64 region from both
+ // buffers, including for partial blocks at the right/bottom edge;
+ // presumably this relies on buffer border padding - confirm.
+ for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ unsigned int sse;
+ uint8_t src[64 * 64] = { 0 };
+ // Apply 4x4 block averaging/denoising on source frame.
+ for (int i = 0; i < 64; i += 4) {
+ for (int j = 0; j < 64; j += 4) {
+ const unsigned int avg =
+ aom_avg_4x4(src_y + i * src_ystride + j, src_ystride);
+
+ // Fill the 4x4 cell at (row i, column j) of the local copy with the
+ // average.
+ for (int m = 0; m < 4; ++m) {
+ for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg;
+ }
+ }
+ }
+
+ cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse);
+ fsse += sse;
+ num_samples++;
+ src_y += 64;
+ pre_y += 64;
+ }
+ // Advance 64 rows and undo the sb_cols * 64 horizontal steps taken above.
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ pre_y += (pre_ystride << 6) - (sb_cols << 6);
+ }
+ assert(num_samples > 0);
+ // Ensure rec_sse > 0
+ if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1;
+}
+
+// Top-level q picker: selects q for the frame at |gf_index| and writes the
+// allowed q range to *bottom_index and *top_index.
+// When there is no stats stage and either the mode is not AOM_Q or the frame
+// is an ARF update, one of the rc_pick_q_and_bounds_no_stats*() variants is
+// used; every other case goes through rc_pick_q_and_bounds().
+// Side effects: may update cpi->rec_sse and cpi->rc.active_worst_quality
+// (CBR path), and stores q in p_rc->arf_q for ARF updates.
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int q;
+ // TODO(sarahparker) merge no-stats vbr and altref q computation
+ // with rc_pick_q_and_bounds().
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ // TODO(yunqing): the results could be used for encoder optimization.
+ // rec_sse stays at UINT64_MAX unless the variance pass below runs.
+ cpi->rec_sse = UINT64_MAX;
+ if (cpi->sf.hl_sf.accurate_bit_estimate &&
+ cpi->common.current_frame.frame_type != KEY_FRAME)
+ rc_compute_variance_onepass_rt(cpi);
+
+ q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
+ top_index);
+ // preserve copy of active worst quality selected.
+ cpi->rc.active_worst_quality = *top_index;
+
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else {
+ q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+ top_index);
+ }
+ } else {
+ q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+ top_index);
+ }
+ // Record the q chosen for an ARF; get_active_best_quality() reads it back
+ // for internal ARFs in Q and CQ modes.
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
+
+ return q;
+}
+
+// Computes the acceptable encoded-size window around |frame_target|.
+// In AOM_Q mode the window is unbounded: [0, INT_MAX]. Otherwise the window
+// is frame_target +/- tolerance, where tolerance is recode_tolerance percent
+// of the target but at least 100; the lower limit is floored at 0 and the
+// upper limit is capped at cpi->rc.max_frame_bandwidth.
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit = INT_MAX;
+    return;
+  }
+
+  // For very small rate targets where the fractional adjustment
+  // may be tiny make sure there is at least a minimum range.
+  assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+  const int tolerance = (int)AOMMAX(
+      100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+  *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+  // Add in 64 bits: tolerance can be as large as frame_target, so the int
+  // sum could overflow for large targets before the cap is applied.
+  *frame_over_shoot_limit = (int)AOMMIN((int64_t)frame_target + tolerance,
+                                        cpi->rc.max_frame_bandwidth);
+}
+
+// Stores |target| as rc->this_frame_target and derives rc->sb64_target_rate
+// from it. When the frame is scaled and the mode is not AOM_CBR, the target
+// is first multiplied by resize_rate_factor() for |width| x |height|.
+// |width| and |height| must both be non-zero.
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  rc->this_frame_target = target;
+
+  // Modify frame size target when down-scaled.
+  if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
+    rc->this_frame_target =
+        (int)(rc->this_frame_target *
+              resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+  }
+
+  // Target rate per SB64 (including partial SB64s).
+  // The pixel count is formed in 64 bits so that large frame dimensions
+  // cannot overflow the int product.
+  rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target << 12) /
+                               ((int64_t)width * height));
+}
+
+// Resets the frames-since-golden counter after an alt-ref refresh: this
+// frame refreshes, so the following frames don't unless the user says so.
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  cpi->rc.frames_since_golden = 0;
+}
+
+// Maintains the golden frame usage count: it is reset when the golden frame
+// is refreshed or the source frame is an alt-ref, and otherwise incremented
+// for shown frames only.
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) {
+    rc->frames_since_golden = 0;
+    return;
+  }
+  if (cpi->common.show_frame) ++rc->frames_since_golden;
+}
+
+// Updates rate control state after a frame has been encoded using
+// |bytes_used| bytes. In order, it: records the projected frame size in
+// bits, updates the rate correction factors, the CBR bit estimation ratio,
+// the per-frame-type last/average qindex records, the last boosted and last
+// key frame qindex, the buffer level, the rolling and total bit counters,
+// the golden/alt-ref counters, and finally the previous-frame bookkeeping
+// fields.
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ const int is_intrnl_arf =
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ const int qindex = cm->quant_params.base_qindex;
+
+#if RT_PASSIVE_STRATEGY
+ const int frame_number = current_frame->frame_number % MAX_Q_HISTORY;
+ p_rc->q_history[frame_number] = qindex;
+#endif // RT_PASSIVE_STRATEGY
+
+ // Update rate control heuristics
+ // bytes -> bits.
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height);
+
+ // Update bit estimation ratio.
+ // The first sample is taken as-is; later samples are blended 7:1 with the
+ // running value.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.hl_sf.accurate_bit_estimate) {
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ const int this_bit_est_ratio =
+ (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse));
+ cpi->rc.bit_est_ratio =
+ cpi->rc.bit_est_ratio == 0
+ ? this_bit_est_ratio
+ : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8;
+ }
+
+ // Keep a record of last Q and ambient average Q.
+ if (current_frame->frame_type == KEY_FRAME) {
+ p_rc->last_q[KEY_FRAME] = qindex;
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ // Inter stats are updated for SVC in CBR mode, for external RTC rate
+ // control, or for frames that are neither overlays nor GF/ARF/internal
+ // ARF refreshes.
+ if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
+ cpi->rc.rtc_external_ratectrl ||
+ (!rc->is_src_frame_alt_ref &&
+ !(refresh_frame->golden_frame || is_intrnl_arf ||
+ refresh_frame->alt_ref_frame))) {
+ p_rc->last_q[INTER_FRAME] = qindex;
+ p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+ 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ p_rc->ni_frames++;
+ p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth);
+ p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames;
+ }
+ }
+ // Keep record of last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((qindex < p_rc->last_boosted_qindex) ||
+ (current_frame->frame_type == KEY_FRAME) ||
+ (!p_rc->constrained_gf_group &&
+ (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+ (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
+ p_rc->last_boosted_qindex = qindex;
+ }
+ if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+ rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+
+ // Rolling monitors of whether we are over or underspending used to help
+ // regulate min and Max Q in two pass.
+ // For scaled frames the target is first divided by resize_rate_factor().
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target = (int)(rc->this_frame_target /
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ }
+
+ // Actual bits spent
+ p_rc->total_actual_bits += rc->projected_frame_size;
+ p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+ if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+ cpi->oxcf.gf_cfg.enable_auto_arf) &&
+ refresh_frame->alt_ref_frame &&
+ (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+#if CONFIG_FPMT_TEST
+ /*The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q,
+ * temp_last_boosted_qindex are introduced only for quality simulation
+ * purpose, it retains the value previous to the parallel encode frames. The
+ * variables are updated based on the update flag.
+ *
+ * If there exist show_existing_frames between parallel frames, then to
+ * retain the temp state do not update it. */
+ // NOTE(review): frame_parallel_level is indexed with gf_frame_index + 1;
+ // confirm that index is always within the array.
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ for (int i = 0; i < FRAME_TYPES; i++) {
+ p_rc->temp_last_q[i] = p_rc->last_q[i];
+ }
+ p_rc->temp_avg_q = p_rc->avg_q;
+ p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex;
+ p_rc->temp_total_actual_bits = p_rc->total_actual_bits;
+ p_rc->temp_projected_frame_size = rc->projected_frame_size;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+ p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i];
+ }
+#endif
+ if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ if (cpi->refresh_frame.golden_frame)
+ rc->frame_num_last_gf_refresh = current_frame->frame_number;
+ rc->prev_coded_width = cm->width;
+ rc->prev_coded_height = cm->height;
+ rc->frame_number_encoded++;
+ rc->prev_frame_is_dropped = 0;
+ rc->drop_count_consec = 0;
+ // The block below is disabled code, kept as in the original.
+ // if (current_frame->frame_number == 1 && cm->show_frame)
+ /*
+ rc->this_frame_target =
+ (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ */
+}
+
+// Post-encode rate control bookkeeping for a frame that was dropped.
+//
+// cpi  Top level encoder structure; its rate control and SVC state are
+//      updated in place.
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+
+  // A dropped frame contributes zero bits to the buffer level.
+  update_buffer_level(cpi, 0);
+
+  // The key frame distance counters only move when the dropped frame is on
+  // the highest spatial layer.
+  if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+    rc->frames_since_key += 1;
+    rc->frames_to_key -= 1;
+  }
+
+  // Reset rc_1_frame / rc_2_frame.
+  rc->rc_1_frame = 0;
+  rc->rc_2_frame = 0;
+
+  // Record the bandwidth and coded size seen by this frame, and flag the drop.
+  rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+  rc->prev_coded_width = cpi->common.width;
+  rc->prev_coded_height = cpi->common.height;
+  rc->prev_frame_is_dropped = 1;
+
+  // On a scene/slide change for a dropped frame: reset avg_source_sad to 0,
+  // otherwise it can get too large and subsequent frames may miss the
+  // scene/slide detection.
+  if (rc->high_source_sad) rc->avg_source_sad = 0;
+
+  // With several spatial layers under SVC, mark the current layer as dropped.
+  if (cpi->ppi->use_svc && svc->number_spatial_layers > 1) {
+    const int sl = svc->spatial_layer_id;
+    svc->last_layer_dropped[sl] = true;
+    svc->drop_spatial_layer[sl] = true;
+  }
+}
+
+// Binary search for the smallest qindex in [best_qindex, worst_qindex] whose
+// q value (per av1_convert_qindex_to_q) is not below desired_q.
+// If no index in the range qualifies, worst_qindex is returned.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int lo = best_qindex;
+  int hi = worst_qindex;
+  while (lo < hi) {
+    const int probe = (lo + hi) >> 1;
+    const double probe_q = av1_convert_qindex_to_q(probe, bit_depth);
+    // Discard the lower half (including probe) only when probe is too small.
+    if (probe_q < desired_q)
+      lo = probe + 1;
+    else
+      hi = probe;
+  }
+  assert(lo == hi);
+  assert(av1_convert_qindex_to_q(lo, bit_depth) >= desired_q ||
+         lo == worst_qindex);
+  return lo;
+}
+
+// Returns the qindex delta between the indices that av1_find_qindex() maps
+// qtarget and qstart to, searching within [rc->best_quality,
+// rc->worst_quality].
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       aom_bit_depth_t bit_depth) {
+  const int best = rc->best_quality;
+  const int worst = rc->worst_quality;
+  const int from_index = av1_find_qindex(qstart, bit_depth, best, worst);
+  const int to_index = av1_find_qindex(qtarget, bit_depth, best, worst);
+  return to_index - from_index;
+}
+
+// Binary search, within [best_qindex, worst_qindex], for the smallest q index
+// whose bits per mb (computed with a correction factor of 1.0) does not
+// exceed desired_bits_per_mb.
+// Returns 'worst_qindex' when no q index in the range satisfies that.
+static int find_qindex_by_rate(const AV1_COMP *const cpi,
+                               int desired_bits_per_mb, FRAME_TYPE frame_type,
+                               int best_qindex, int worst_qindex) {
+  assert(best_qindex <= worst_qindex);
+  int lo = best_qindex;
+  int hi = worst_qindex;
+  while (lo < hi) {
+    const int probe = (lo + hi) >> 1;
+    const int probe_bits = av1_rc_bits_per_mb(cpi, frame_type, probe, 1.0, 0);
+    if (probe_bits <= desired_bits_per_mb) {
+      // probe already fits the budget; the answer is probe or lower.
+      hi = probe;
+    } else {
+      lo = probe + 1;
+    }
+  }
+  assert(lo == hi);
+  assert(av1_rc_bits_per_mb(cpi, frame_type, lo, 1.0, 0) <=
+             desired_bits_per_mb ||
+         lo == worst_qindex);
+  return lo;
+}
+
+// Returns the qindex delta (relative to 'qindex') that scales the projected
+// bits per mb at 'qindex' by 'rate_target_ratio', searching within
+// [rc->best_quality, rc->worst_quality].
+int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio) {
+  const RATE_CONTROL *rc = &cpi->rc;
+
+  // Projected bits per mb at the starting index.
+  const int bits_at_qindex =
+      av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0);
+
+  // Scale by the requested ratio to get the bits per mb being aimed for.
+  const int wanted_bits = (int)(rate_target_ratio * bits_at_qindex);
+
+  const int found_index = find_qindex_by_rate(
+      cpi, wanted_bits, frame_type, rc->best_quality, rc->worst_quality);
+  return found_index - qindex;
+}
+
+// Sets rc->min_gf_interval, rc->max_gf_interval and
+// rc->static_scene_max_gf_interval from the encoder configuration, falling
+// back to computed defaults where the configured value is 0.
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int fixed_q_no_stats =
+      (has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q);
+
+  // Every path starts from the configured limits.
+  rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+  rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+
+  // Special case code for 1 pass fixed Q mode tests: take the configured
+  // values as they are.
+  if (fixed_q_no_stats) {
+    rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
+    return;
+  }
+
+  // A configured value of 0 selects the default.
+  if (rc->min_gf_interval == 0) {
+    rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
+  }
+  if (rc->max_gf_interval == 0) {
+    rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
+  }
+
+  // Extended max interval for genuinely static scenes like slide shows.
+  // The number of stats available in the case of LAP is limited, hence the
+  // limit is tied to max_gf_interval there.
+  rc->static_scene_max_gf_interval = cpi->ppi->lap_enabled
+                                         ? rc->max_gf_interval + 1
+                                         : MAX_STATIC_GF_GROUP_LENGTH;
+
+  // max may not exceed the static scene limit, and min may not exceed max.
+  rc->max_gf_interval =
+      AOMMIN(rc->max_gf_interval, rc->static_scene_max_gf_interval);
+  rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+}
+
+// Recomputes the per-frame bandwidth limits (average, minimum and maximum)
+// from the configured target bandwidth and the current frame rate, then
+// refreshes the golden frame interval range.
+//
+// cpi     Top level encoder structure; cpi->rc is updated in place.
+// width   Frame width used to derive the macroblock count.
+// height  Frame height used to derive the macroblock count.
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int MBs = av1_get_MBs(width, height);
+
+  rc->avg_frame_bandwidth =
+      (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
+
+  // Scale in 64 bits and clamp before narrowing: at high bitrates the product
+  // avg_frame_bandwidth * section percentage does not fit in an int.
+  int64_t vbr_min_bits =
+      (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100;
+  vbr_min_bits = AOMMIN(vbr_min_bits, INT_MAX);
+  rc->min_frame_bandwidth = AOMMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  int64_t vbr_max_bits =
+      (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section / 100;
+  vbr_max_bits = AOMMIN(vbr_max_bits, INT_MAX);
+  rc->max_frame_bandwidth =
+      AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits);
+
+  av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjust the frame target based on the rate error accumulated over
+// previous frames.
+//
+// cpi                Top level encoder structure.
+// this_frame_target  In/out: the frame target in bits, adjusted in place.
+//
+// Also records rc->frame_level_fast_extra_bits and sets
+// cpi->do_update_vbr_bits_off_target_fast when the fast redistribution path
+// is taken.
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int64_t vbr_bits_off_target = simulate_parallel_frame
+                                    ? cpi->ppi->p_rc.temp_vbr_bits_off_target
+                                    : p_rc->vbr_bits_off_target;
+#else
+  int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
+  const int stats_count =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
+          : 0;
+  // Spread the correction over the frames that remain, at most 16.
+  const int frame_window = AOMMIN(
+      16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+  assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
+  if (frame_window > 0) {
+    // Magnitude of the per-frame share of the error. This is kept in 64 bits:
+    // narrowing to int before taking the magnitude would corrupt the value
+    // when the accumulated error is large.
+    int64_t per_frame_error = vbr_bits_off_target / frame_window;
+    if (per_frame_error < 0) per_frame_error = -per_frame_error;
+    // The second operand bounds the result by a fraction of the current
+    // target, so the minimum always fits in an int.
+    const int max_delta = (int)AOMMIN(
+        per_frame_error,
+        ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    // vbr_bits_off_target < 0 we are currently overshooting
+    *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+  }
+
+#if CONFIG_FPMT_TEST
+  int64_t vbr_bits_off_target_fast =
+      simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+                              : p_rc->vbr_bits_off_target_fast;
+#endif
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FPMT_TEST
+      vbr_bits_off_target_fast &&
+#else
+      p_rc->vbr_bits_off_target_fast &&
+#endif
+      !rc->is_src_frame_alt_ref) {
+    int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+#if CONFIG_FPMT_TEST
+    fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        (int)AOMMIN(fast_extra_bits,
+                    AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+    fast_extra_bits =
+        (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)AOMMIN(
+        fast_extra_bits,
+        AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+    if (fast_extra_bits > 0) {
+      // Update this_frame_target only if additional bits are available from
+      // local undershoot.
+      *this_frame_target += (int)fast_extra_bits;
+    }
+    // Store the fast_extra_bits of the frame and reduce it from
+    // vbr_bits_off_target_fast during postencode stage.
+    rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retaining the condition to update during postencode stage since
+    // fast_extra_bits are calculated based on vbr_bits_off_target_fast.
+    cpi->do_update_vbr_bits_off_target_fast = 1;
+  }
+}
+
+// Sets the frame target for the current frame, starting from
+// rc->base_frame_target and applying the VBR correction in VBR and CQ modes.
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+  int frame_target = cpi->rc.base_frame_target;
+
+  switch (cpi->oxcf.rc_cfg.mode) {
+    case AOM_VBR:
+    case AOM_CQ:
+      // Correction to rate target based on prior over or under shoot.
+      vbr_rate_correction(cpi, &frame_target);
+      break;
+    default: break;
+  }
+  av1_rc_set_frame_target(cpi, frame_target, width, height);
+}
+
+// Computes the target size for a non-key frame in one pass VBR mode and
+// returns it after clamping with av1_rc_clamp_pframe_target_size().
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+  // KF/GF/ARF updates get af_ratio times the share of an ordinary frame; the
+  // common denominator keeps the group total unchanged.
+  const int boosted = frame_update_type == KF_UPDATE ||
+                      frame_update_type == GF_UPDATE ||
+                      frame_update_type == ARF_UPDATE;
+  target = (int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval;
+  if (boosted) target *= af_ratio;
+  target /= (p_rc->baseline_gf_interval + af_ratio - 1);
+  target = AOMMIN(target, INT_MAX);
+#else
+  target = rc->avg_frame_bandwidth;
+#endif
+  return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+// Computes the key frame target size for one pass VBR mode: kf_ratio times
+// the average frame bandwidth, clamped by av1_rc_clamp_iframe_target_size().
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  const int64_t kf_target = (int64_t)cpi->rc.avg_frame_bandwidth * kf_ratio;
+  return av1_rc_clamp_iframe_target_size(cpi, kf_target);
+}
+
+// Computes the target size, in bits, for a non-key frame in one pass CBR
+// mode.
+//
+// cpi                Top level encoder structure.
+// frame_update_type  Update type of the frame; GF_UPDATE and OVERLAY_UPDATE
+//                    receive the golden frame boost when gf_cbr_boost_pct is
+//                    non-zero.
+//
+// Returns the target, never below max(avg_frame_size >> 4,
+// FRAME_OVERHEAD_BITS) and saturated at INT_MAX.
+//
+// All intermediate arithmetic is done in 64 bits: products such as
+// avg_frame_bandwidth * baseline_gf_interval * af_ratio_pct do not fit in an
+// int at high bitrates.
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
+  // Positive diff: the buffer is below its optimal level.
+  const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level;
+  const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100;
+  int min_frame_target =
+      AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int64_t target;
+
+  if (rc_cfg->gf_cbr_boost_pct) {
+    const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
+    if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+      target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+                af_ratio_pct) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    } else {
+      target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+                100) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+    }
+  } else {
+    target = rc->avg_frame_bandwidth;
+  }
+  if (cpi->ppi->use_svc) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    int layer =
+        LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+                         cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    target = lc->avg_frame_size;
+    min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low =
+        (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high =
+        (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (rc_cfg->max_inter_bitrate_pct) {
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+  // Saturate before narrowing back to the int return type.
+  if (target > INT_MAX) target = INT_MAX;
+  return AOMMAX(min_frame_target, (int)target);
+}
+
+// Computes the key frame target size, in bits, for one pass CBR mode and
+// returns it after clamping with av1_rc_clamp_iframe_target_size().
+//
+// The very first frame gets half of the starting buffer level (scaled up
+// when there are several temporal layers). Later key frames get the average
+// frame bandwidth scaled by (16 + kf_boost) / 16.
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  int64_t target;
+  if (cpi->common.current_frame.frame_number == 0) {
+    target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
+                 ? INT_MAX
+                 : (int)(p_rc->starting_buffer_level / 2);
+    if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+      target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+    }
+  } else {
+    int kf_boost = 32;
+    int framerate = (int)round(cpi->framerate);
+
+    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+    // Reduce the boost when the previous key frame is less than half a second
+    // (framerate / 2 frames) behind.
+    if (rc->frames_since_key < framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+    }
+    // Widen before multiplying: (16 + kf_boost) * avg_frame_bandwidth does
+    // not fit in an int at high bitrates.
+    target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+// Chooses p_rc->baseline_gf_interval for real-time mode.
+static void set_golden_update(AV1_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const int gf_length_mult[2] = { 8, 4 };
+  // With cyclic refresh AQ the divisor is the refresh percentage; otherwise
+  // a fixed value of 10 is used.
+  const int divisor = (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+                          ? cpi->cyclic_refresh->percent_refresh
+                          : 10;
+
+  // Set minimum gf_interval for GF update to a multiple of the refresh period,
+  // with some max limit. Depending on past encoding stats, GF flag may be
+  // reset and update may not occur until next baseline_gf_interval.
+  if (divisor <= 0) {
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+  } else {
+    const int scaled_period =
+        gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor);
+    p_rc->baseline_gf_interval = AOMMIN(scaled_period, MAX_GF_INTERVAL_RT);
+  }
+
+  // A non-zero avg_frame_low_motion below 40 overrides the interval with 16.
+  if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) {
+    p_rc->baseline_gf_interval = 16;
+  }
+}
+
+// Starts a new golden frame group for real-time mode: picks the baseline GF
+// interval, resets the related rate control fields and the GF group index,
+// and fills in the first entry of the GF group.
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int is_key = (frame_type == KEY_FRAME);
+
+  set_golden_update(cpi);
+
+  // With automatic key frames the group may not extend past the next key
+  // frame.
+  if (cpi->oxcf.kf_cfg.auto_key &&
+      p_rc->baseline_gf_interval > rc->frames_to_key) {
+    p_rc->baseline_gf_interval = rc->frames_to_key;
+  }
+  p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+  if (p_rc->baseline_gf_interval >= rc->frames_to_key &&
+      cpi->oxcf.kf_cfg.auto_key) {
+    p_rc->constrained_gf_group = 1;
+  } else {
+    p_rc->constrained_gf_group = 0;
+  }
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+  cpi->gf_frame_index = 0;
+
+  // SVC does not use GF as periodic boost.
+  // TODO(marpan): Find better way to disable this for SVC.
+  if (cpi->ppi->use_svc) {
+    SVC *const svc = &cpi->svc;
+    const int num_layers =
+        svc->number_spatial_layers * svc->number_temporal_layers;
+    p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+    p_rc->gfu_boost = 1;
+    p_rc->constrained_gf_group = 0;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    // Mirror the settings into every layer context.
+    for (int idx = 0; idx < num_layers; ++idx) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[idx];
+      lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+      lc->p_rc.gfu_boost = p_rc->gfu_boost;
+      lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+      lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+      lc->group_index = 0;
+    }
+  }
+
+  gf_group->size = p_rc->baseline_gf_interval;
+  gf_group->update_type[0] = is_key ? KF_UPDATE : GF_UPDATE;
+  gf_group->refbuf_state[cpi->gf_frame_index] =
+      is_key ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+// For 1 pass real-time mode: based on the frame QP relative to the running
+// average QP, either cancels a golden frame refresh that is due or forces an
+// early one. When the decision changes, the GF interval is restarted and the
+// refresh_frame_flags mask is rebuilt from rtc_ref.
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+
+  // Nothing to adjust on a pending resize or on a scene change.
+  if (is_frame_resize_pending(cpi) || rc->high_source_sad) return;
+
+  // Check if we should disable GF refresh (if period is up),
+  // or force a GF refresh update (if we are at least halfway through
+  // period) based on QP. Look into add info on segment deltaq.
+  PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+  const int allow_gf_update =
+      rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+  int thresh = 87;
+  // -1: keep the current decision, 0: disable refresh, 1: force refresh.
+  int new_gf_refresh = -1;
+
+  if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) <
+          FIXED_GF_INTERVAL_RT &&
+      rc->frames_till_gf_update_due == 1 &&
+      cm->quant_params.base_qindex > avg_qp) {
+    // Disable GF refresh since QP is above the running average QP.
+    new_gf_refresh = 0;
+  } else if (allow_gf_update &&
+             ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+              (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
+    // Force refresh since QP is well below average QP or this is a high
+    // motion frame.
+    new_gf_refresh = 1;
+  }
+  if (new_gf_refresh < 0) return;
+
+  rtc_ref->refresh[rtc_ref->gld_idx_1layer] = new_gf_refresh;
+  cpi->refresh_frame.golden_frame = new_gf_refresh;
+
+  set_baseline_gf_interval(cpi, INTER_FRAME);
+
+  // Rebuild the refresh mask: one bit per buffer slot referenced by ref_idx.
+  int refresh_mask = 0;
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+    const int slot = rtc_ref->ref_idx[i];
+    refresh_mask |= rtc_ref->refresh[slot] << slot;
+  }
+  cm->current_frame.refresh_frame_flags = refresh_mask;
+}
+
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * Current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF is always behind current by lag_alt frames, and GOLDEN is
+ * kept in a fixed buffer slot (slot 6) that is refreshed when \c gf_update
+ * is set on a non-key frame. LAST (and LAST2, when enabled by the speed
+ * feature) rotate through slots 0..5.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] gf_update Flag to indicate if GF is updated
+ *
+ * \remark Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi->ext_flags; and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->ppi->rtc_ref.ref_idx[] and
+ * \c cpi->ppi->rtc_ref.refresh[]. \c cpi->rt_reduce_num_ref_buffers is also
+ * updated.
+ */
+void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) {
+ AV1_COMMON *const cm = &cpi->common;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ RATE_CONTROL *const rc = &cpi->rc;
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &ext_flags->refresh_frame;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ // When frame dropping is configured, count encoded frames only; otherwise
+ // use the current frame number.
+ unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark)
+ ? rc->frame_number_encoded
+ : cm->current_frame.frame_number;
+ unsigned int lag_alt = 4;
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ int last2_idx = 0;
+ // Default refresh flags: refresh LAST only.
+ ext_refresh_frame_flags->update_pending = 1;
+ ext_flags->ref_frame_flags = 0;
+ ext_refresh_frame_flags->last_frame = 1;
+ ext_refresh_frame_flags->golden_frame = 0;
+ ext_refresh_frame_flags->alt_ref_frame = 0;
+ // Decide altref lag adaptively for rt
+ if (cpi->sf.rt_sf.sad_based_adp_altref_lag) {
+ lag_alt = 6;
+ // Thresholds on avg_source_sad, one row per speed feature setting; a higher
+ // average source SAD selects a shorter altref lag.
+ const uint64_t th_frame_sad[4][3] = {
+ { 18000, 18000, 18000 }, // HDRES CPU 9
+ { 25000, 25000, 25000 }, // MIDRES CPU 9
+ { 40000, 30000, 20000 }, // HDRES CPU10
+ { 30000, 25000, 20000 } // MIDRES CPU 10
+ };
+ int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1;
+ assert(th_idx < 4);
+ if (rc->avg_source_sad > th_frame_sad[th_idx][0])
+ lag_alt = 3;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][1])
+ lag_alt = 4;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
+ lag_alt = 5;
+ }
+ // This defines the reference structure for 1 layer (non-svc) RTC encoding.
+ // To avoid the internal/default reference structure for non-realtime
+ // overwriting this behavior, we use the "svc" ref parameters from the
+ // external control SET_SVC_REF_FRAME_CONFIG.
+ // TODO(marpan): rename that control and the related internal parameters
+ // to rtc_ref.
+ // Start with every reference pointing at slot 7 and no slot refreshed.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7;
+ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0;
+ // Set the reference frame flags.
+ ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+ if (!cpi->sf.rt_sf.force_only_last_ref) {
+ ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
+ }
+ // Number of rotating buffer slots shared by LAST, LAST2 and ALT_REF.
+ const int sh = 6;
+ // Moving index slot for last: 0 - (sh - 1).
+ if (frame_number > 1) last_idx = ((frame_number - 1) % sh);
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = (frame_number % sh);
+ // GOLDEN lives in a fixed slot outside the rotating range.
+ gld_idx = 6;
+
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ // Moving index for LAST2, lag behind LAST by 2 frames.
+ if (frame_number > 2) last2_idx = ((frame_number - 2) % sh);
+ }
+ rtc_ref->ref_idx[0] = last_idx; // LAST
+ rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last).
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ rtc_ref->ref_idx[1] = last2_idx; // LAST2
+ rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last).
+ }
+ rtc_ref->ref_idx[3] = gld_idx; // GOLDEN
+ rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF
+ // Refresh this slot, which will become LAST on next frame.
+ rtc_ref->refresh[last_idx_refresh] = 1;
+ // Update GOLDEN on period for fixed slot case.
+ if (gf_update && cm->current_frame.frame_type != KEY_FRAME) {
+ ext_refresh_frame_flags->golden_frame = 1;
+ rtc_ref->refresh[gld_idx] = 1;
+ }
+ rtc_ref->gld_idx_1layer = gld_idx;
+ // Set the flag to reduce the number of reference frame buffers used.
+ // This assumes that slot 7 is never used: the flag stays set only if every
+ // reference assigned above points at a slot below 7.
+ cpi->rt_reduce_num_ref_buffers = 1;
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7);
+}
+
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this to detect big
+ * changes in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_input Current and last input source frames
+ *
+ * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ * The per-block SAD values are also stored in \c cpi->src_sad_blk_64x64 when
+ * that buffer is allocated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
+ const EncodeFrameInput *frame_input) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source;
+ YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source;
+ uint8_t *src_y;
+ int src_ystride;
+ int src_width;
+ int src_height;
+ uint8_t *last_src_y;
+ int last_src_ystride;
+ int last_src_width;
+ int last_src_height;
+ int width = cm->width;
+ int height = cm->height;
+ // With several spatial layers, use the configured (full) frame dimensions.
+ if (cpi->svc.number_spatial_layers > 1) {
+ width = cpi->oxcf.frm_dim_cfg.width;
+ height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ // Release the per-block SAD buffer when the frame size differs from the
+ // render size or when either source frame is missing.
+ if (width != cm->render_width || height != cm->render_height ||
+ unscaled_src == NULL || unscaled_last_src == NULL) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ if (unscaled_src == NULL || unscaled_last_src == NULL) return;
+ src_y = unscaled_src->y_buffer;
+ src_ystride = unscaled_src->y_stride;
+ src_width = unscaled_src->y_width;
+ src_height = unscaled_src->y_height;
+ last_src_y = unscaled_last_src->y_buffer;
+ last_src_ystride = unscaled_last_src->y_stride;
+ last_src_width = unscaled_last_src->y_width;
+ last_src_height = unscaled_last_src->y_height;
+ // The two sources must have the same luma dimensions to be compared.
+ if (src_width != last_src_width || src_height != last_src_height) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ return;
+ }
+ // Reset the per-frame outputs and remember the previous average.
+ rc->high_source_sad = 0;
+ rc->percent_blocks_with_motion = 0;
+ rc->max_block_source_sad = 0;
+ rc->prev_avg_source_sad = rc->avg_source_sad;
+ int num_mi_cols = cm->mi_params.mi_cols;
+ int num_mi_rows = cm->mi_params.mi_rows;
+ if (cpi->svc.number_spatial_layers > 1) {
+ num_mi_cols = cpi->svc.mi_cols_full_resoln;
+ num_mi_rows = cpi->svc.mi_rows_full_resoln;
+ }
+ int num_zero_temp_sad = 0;
+ // Minimum avg_sad for a scene change; screen content keeps the lower value.
+ uint32_t min_thresh = 10000;
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+ ? 50000
+ : 100000;
+ }
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
+ uint64_t avg_sad = 0;
+ uint64_t tmp_sad = 0;
+ int num_samples = 0;
+ // Multiplier applied to the running average when testing for a scene change.
+ const int thresh =
+ cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
+ // SAD is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+ uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5
+ int num_low_var_high_sumdiff = 0;
+ int light_change = 0;
+ // Flag to check light change or not. It is fixed at 0 here, so the light
+ // change branches below do not run.
+ const int check_light_change = 0;
+ // TODO(marpan): There seems some difference along the bottom border when
+ // using the source_last_tl0 for last_source (used for temporal layers or
+ // when previous frame is dropped).
+ // Remove this border parameter when issue is resolved: difference is that
+ // non-zero sad exists along bottom border even though source is static.
+ const int border =
+ rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1;
+ // Store blkwise SAD for later use
+ if (width == cm->render_width && height == cm->render_height) {
+ if (cpi->src_sad_blk_64x64 == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64,
+ (uint64_t *)aom_calloc(sb_cols * sb_rows,
+ sizeof(*cpi->src_sad_blk_64x64)));
+ }
+ }
+ // Avoid bottom and right border.
+ for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ if (cpi->src_sad_blk_64x64 != NULL)
+ cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+ if (check_light_change) {
+ unsigned int sse, variance;
+ variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &sse);
+ // Note: sse - variance = ((sum * sum) >> 12)
+ // Detect large lighting change.
+ if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+ num_low_var_high_sumdiff++;
+ }
+ }
+ avg_sad += tmp_sad;
+ num_samples++;
+ if (tmp_sad == 0) num_zero_temp_sad++;
+ if (tmp_sad > rc->max_block_source_sad)
+ rc->max_block_source_sad = tmp_sad;
+
+ // Step to the next 64x64 block in this row.
+ src_y += 64;
+ last_src_y += 64;
+ }
+ // Move down 64 rows and back to the first block column.
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ }
+ if (check_light_change && num_samples > 0 &&
+ num_low_var_high_sumdiff > (num_samples >> 1))
+ light_change = 1;
+ if (num_samples > 0) avg_sad = avg_sad / num_samples;
+ // Set high_source_sad flag if we detect very high increase in avg_sad
+ // between current and previous frame value(s). Use minimum threshold
+ // for cases where there is small change from content that is completely
+ // static.
+ if (!light_change &&
+ avg_sad >
+ AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+ rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+ num_zero_temp_sad < 3 * (num_samples >> 2))
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ // Running average: 3/4 previous value, 1/4 this frame.
+ rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+ rc->frame_source_sad = avg_sad;
+ if (num_samples > 0)
+ rc->percent_blocks_with_motion =
+ ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+ // Scene detection is only on base SL0, and using full/original resolution.
+ // Pass the state to the upper spatial layers.
+ if (cpi->svc.number_spatial_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ int tl = svc->temporal_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->high_source_sad = rc->high_source_sad;
+ lrc->frame_source_sad = rc->frame_source_sad;
+ lrc->avg_source_sad = rc->avg_source_sad;
+ lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+ lrc->max_block_source_sad = rc->max_block_source_sad;
+ }
+ }
+}
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ * A new GF interval is started when a resize is pending, a scene change was
+ * detected, or the current interval has run out, and only on the base
+ * spatial and temporal layer.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type frame type
+ *
+ * \return Return GF update flag (1 if a new interval was started, 0
+ * otherwise), and update the \c cpi->rc with the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+                                             FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  const int on_base_layer =
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0;
+  // GF update based on frames_till_gf_update_due; an update is also forced on
+  // a resize pending frame or for a scene change.
+  const int update_wanted = resize_pending || rc->high_source_sad ||
+                            rc->frames_till_gf_update_due == 0;
+
+  if (!(update_wanted && on_base_layer)) return 0;
+
+  set_baseline_gf_interval(cpi, frame_type);
+  return 1;
+}
+
+// Resets rate control state after a dynamic resize in 1 pass CBR mode.
+//
+// cpi            Top level encoder structure.
+// resize_width   Frame width after the resize.
+// resize_height  Frame height after the resize.
+// prev_width     Frame width before the resize.
+// prev_height    Frame height before the resize.
+//
+// The buffer level is reset to optimal, the frame target is recomputed, and
+// the average qindex and the INTER_NORMAL rate correction factor are adjusted
+// according to the change in frame area. With several temporal layers the
+// same reset is copied to each temporal layer of the current spatial layer.
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+                            int prev_width, int prev_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SVC *const svc = &cpi->svc;
+  int target_bits_per_frame;
+  int active_worst_quality;
+  int qindex;
+  // Ratio of the new frame area to the previous one. The areas are formed in
+  // double so that large dimensions cannot overflow an int product.
+  double tot_scale_change = ((double)resize_width * (double)resize_height) /
+                            ((double)prev_width * (double)prev_height);
+  // Disable the skip mv search for svc on resize frame.
+  svc->skip_mvsearch_last = 0;
+  svc->skip_mvsearch_gf = 0;
+  svc->skip_mvsearch_altref = 0;
+  // Reset buffer level to optimal, update target size.
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  rc->this_frame_target =
+      av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+  target_bits_per_frame = rc->this_frame_target;
+  // When scaling up, pull the average qindex towards worst_quality.
+  if (tot_scale_change > 4.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+  else if (tot_scale_change > 1.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+  active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+                             active_worst_quality, resize_width, resize_height);
+  // If resize is down, check if projected q index is close to worst_quality,
+  // and if so, reduce the rate correction factor (since likely can afford
+  // lower q for resized frame).
+  if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
+    p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // If resize is back up: check if projected q index is too much above the
+  // previous index, and if so, reduce the rate correction factor
+  // (since prefer to keep q for resized frame at least close to previous q).
+  // Also check if projected qindex is close to previous qindex, if so
+  // increase correction factor (to push qindex higher and avoid overshoot).
+  if (tot_scale_change >= 1.0) {
+    if (tot_scale_change < 4.0 &&
+        qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+  }
+  if (svc->number_temporal_layers > 1) {
+    // Apply the same rate control reset to all temporal layers.
+    for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+      LAYER_CONTEXT *lc = NULL;
+      lc = &svc->layer_context[svc->spatial_layer_id *
+                                   svc->number_temporal_layers +
+                               tl];
+      lc->rc.resize_state = rc->resize_state;
+      lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+          p_rc->rate_correction_factors[INTER_NORMAL];
+      lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+          p_rc->avg_frame_qindex[INTER_FRAME];
+    }
+  }
+}
+
+/*!\brief Check for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on buffer underflow and average QP over the
+ * past x frames. Only allow for resize at most 1/2 scale down for now; the
+ * scaling factor for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates \c rc->resize_state (the caller derives
+ * \c cpi->resize_pending_params from it) and the resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  // Resize-up thresholds on the average QP, in percent of worst_quality.
+  const int avg_qp_thr1 = 70;
+  const int avg_qp_thr2 = 50;
+  // Don't allow for resized frame to go below 160x90, resize in steps of 3/4.
+  const int min_width = (160 * 4) / 3;
+  const int min_height = (90 * 4) / 3;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    rc->resize_avg_qp = 0;
+    rc->resize_count = 0;
+    rc->resize_buffer_underflow = 0;
+    return;
+  }
+  // No resizing down if frame size is below some limit. The areas are computed
+  // in 64 bits so that very large frame dimensions cannot overflow int.
+  const int down_size_on =
+      (int64_t)cm->width * cm->height >= (int64_t)min_width * min_height;
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (rc->frames_since_key > cpi->framerate) {
+    const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+    rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+    if (p_rc->buffer_level < 30 * p_rc->optimal_buffer_level / 100)
+      ++rc->resize_buffer_underflow;
+    ++rc->resize_count;
+    // Check for resize action every "window" frames.
+    if (rc->resize_count >= window) {
+      const int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
+      if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+          down_size_on) {
+        if (rc->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          rc->resize_state = ONE_HALF;
+        } else if (rc->resize_state == ORIG) {
+          resize_action = DOWN_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      } else if (rc->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * rc->worst_quality / 100) {
+        if (rc->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * rc->worst_quality / 100) {
+          resize_action = UP_ORIG;
+          rc->resize_state = ORIG;
+        } else if (rc->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      rc->resize_avg_qp = 0;
+      rc->resize_count = 0;
+      rc->resize_buffer_underflow = 0;
+    }
+  }
+  // If decision is to resize, reset some quantities, and check if we should
+  // reduce rate correction factor.
+  if (resize_action != NO_RESIZE) {
+    int resize_width = cpi->oxcf.frm_dim_cfg.width;
+    int resize_height = cpi->oxcf.frm_dim_cfg.height;
+    int resize_scale_num = 1;
+    int resize_scale_den = 1;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      resize_scale_num = 3;
+      resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      resize_scale_num = 1;
+      resize_scale_den = 2;
+    }
+    resize_width = resize_width * resize_scale_num / resize_scale_den;
+    resize_height = resize_height * resize_scale_num / resize_scale_den;
+    resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+  }
+}
+
+// Returns 1 if the current frame has to be coded as a key frame and 0
+// otherwise.
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const SVC *const svc = &cpi->svc;
+
+  // The very first frame, and any frame flagged by the caller, is a key frame.
+  if (cpi->common.current_frame.frame_number == 0 ||
+      (frame_flags & FRAMEFLAGS_KEY))
+    return 1;
+
+  // Every remaining case depends on automatic key frame placement.
+  if (!cpi->oxcf.kf_cfg.auto_key) return 0;
+
+  // Non-SVC: key frame once the countdown to the next key frame reaches 0.
+  if (!cpi->ppi->use_svc) return rc->frames_to_key == 0;
+
+  // SVC: only the base spatial layer starts a key frame, on every superframe
+  // when key_freq_max is 0, otherwise on multiples of key_freq_max.
+  if (svc->spatial_layer_id != 0) return 0;
+  return cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+         svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0;
+}
+
+// Returns true when the current frame is a "recovery" frame for 1 layer RPS,
+// i.e. a frame whose closest reference suddenly switched from the previous
+// frame to one much further away. The result tells the encoder whether to
+// apply some boost (QP, adjusted speed features, etc).
+// TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to
+// allow more control for applications.
+static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) {
+  // 1 layer RPS: user-set reference config, single spatial/temporal layer.
+  const int single_layer_rps = cpi->ppi->rtc_ref.set_ref_frame_config &&
+                               cpi->svc.number_temporal_layers == 1 &&
+                               cpi->svc.number_spatial_layers == 1;
+  if (!single_layer_rps || !cpi->ppi->rtc_ref.reference_was_previous_frame)
+    return false;
+  // Only consider boost for this frame if its closest reference is further
+  // than x frames away, using x = 4 for now. INT_MAX is excluded explicitly.
+  const int min_dist = av1_svc_get_min_ref_dist(cpi);
+  return min_dist != INT_MAX && min_dist > 4;
+}
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
+ const EncodeFrameInput *frame_input,
+ unsigned int frame_flags) {  // Per-frame 1-pass RT setup: frame type, resize, GF update, bit target.
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ SVC *const svc = &cpi->svc;
+ ResizePendingParams *const resize_pending_params =
+ &cpi->resize_pending_params;
+ int target;
+ const int layer =  // Index of the current (spatial, temporal) layer in svc->layer_context[].
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ if (cpi->ppi->use_svc) {
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+ }
+ cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi);
+ // Set frame type and the matching GF group entry for this frame.
+ if (set_key_frame(cpi, frame_flags)) {
+ *frame_type = KEY_FRAME;
+ p_rc->this_key_frame_forced =
+ cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
+ p_rc->kf_boost = DEFAULT_KF_BOOST_RT;
+ gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET;
+ if (cpi->ppi->use_svc) {
+ if (cm->current_frame.frame_number > 0)
+ av1_svc_reset_temporal_layers(cpi, 1);
+ svc->layer_context[layer].is_key_frame = 1;
+ }
+ rc->frame_number_encoded = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ } else {
+ *frame_type = INTER_FRAME;
+ gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
+ if (cpi->ppi->use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ lc->is_key_frame =
+ svc->spatial_layer_id == 0
+ ? 0
+ : svc->layer_context[svc->temporal_layer_id].is_key_frame;  // presumably the base spatial layer's context for this temporal layer - TODO confirm
+ // If the user is setting the reference structure with
+ // set_ref_frame_config and did not set any references, set the
+ // frame type to Intra-only.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int no_references_set = 1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ no_references_set = 0;
+ break;
+ }
+ }
+ // Set to intra_only_frame if no references are set.
+ // The stream can start decoding on INTRA_ONLY_FRAME so long as the
+ // layer with the intra_only_frame doesn't signal a reference to a slot
+ // that hasn't been set yet.
+ if (no_references_set) *frame_type = INTRA_ONLY_FRAME;
+ }
+ }
+ }
+ // Check for scene change: for SVC check on base spatial layer only.
+ if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) {
+ if (rc->prev_coded_width == cm->width &&
+ rc->prev_coded_height == cm->height) {
+ rc_scene_detection_onepass_rt(cpi, frame_input);
+ } else {
+ aom_free(cpi->src_sad_blk_64x64);  // Coded size changed: drop the per-block source SAD buffer.
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ }
+ // Check for dynamic resize, for single spatial layer for now.
+ // For temporal layers only check on base temporal layer.
+ if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+ if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+ dynamic_resize_one_pass_cbr(cpi);
+ if (rc->resize_state == THREE_QUARTER) {
+ resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+ resize_pending_params->height =
+ (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+ } else if (rc->resize_state == ONE_HALF) {
+ resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+ resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+ } else {
+ resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+ resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ } else if (is_frame_resize_pending(cpi)) {
+ resize_reset_rc(cpi, resize_pending_params->width,
+ resize_pending_params->height, cm->width, cm->height);
+ }
+ // Set the GF interval (skipped under external rate control); the returned update flag is unused here.
+ if (!rc->rtc_external_ratectrl)
+ set_gf_interval_update_onepass_rt(cpi, *frame_type);
+ // Set target size.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ } else {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_vbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q)  // AOM_Q mode: the configured cq_level becomes the active worst quality.
+ rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level;
+
+ av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
+ rc->base_frame_target = target;
+ cm->current_frame.frame_type = *frame_type;
+ // For fixed mode SVC: if KSVC is enabled remove inter layer
+ // prediction on spatial enhancement layer frames for frames
+ // whose base is not KEY frame.
+ if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+ svc->number_spatial_layers > 1 &&
+ !svc->layer_context[layer].is_key_frame) {
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;  // NOTE(review): XOR toggles the bit; this only "removes" it if the flag is set on entry - confirm.
+ }
+}
+
+// Resets rate control state once an encoded frame has overshot in CBR mode.
+// Raises *q towards worst_quality (all the way for screen content) and
+// re-centers the buffer level and INTER_NORMAL rate correction factor.
+// Always returns 1.
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SVC *const svc = &cpi->svc;
+  const int target_size = rc->avg_frame_bandwidth;
+  const int is_screen_content =
+      (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+  double rate_correction_factor = p_rc->rate_correction_factors[INTER_NORMAL];
+
+  // For screen content use the max-q set by the user to allow for less
+  // overshoot on slide changes; otherwise weight worst_quality 3:1 against *q.
+  *q = is_screen_content ? rc->worst_quality
+                         : (3 * rc->worst_quality + *q) >> 2;
+  cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+  // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+  // these parameters will affect QP selection for subsequent frames. If they
+  // have settled down to a very different (low QP) state, then not adjusting
+  // them may cause next frame to select low QP and overshoot again.
+  p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  // Reset rate under/over-shoot flags.
+  rc->rc_1_frame = 0;
+  rc->rc_2_frame = 0;
+  // Reset rate correction factor: for now base it on target_bits_per_mb
+  // and qp (==max_QP). This comes from the inverse computation of
+  // av1_rc_bits_per_mb().
+  const int target_bits_per_mb =
+      (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+  const double q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
+  const int enumerator =
+      av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content);
+  const double new_correction_factor =
+      (double)target_bits_per_mb * q2 / enumerator;
+  // Only ever raise the factor: average it with the new estimate and cap it.
+  if (new_correction_factor > rate_correction_factor) {
+    rate_correction_factor = AOMMIN(
+        (new_correction_factor + rate_correction_factor) / 2.0, MAX_BPB_FACTOR);
+    p_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+  }
+  // For temporal layers: reset the rate control parameters across all
+  // temporal layers.
+  if (svc->number_temporal_layers > 1) {
+    const int sl = svc->spatial_layer_id;
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      // Mirror the reset into this spatial layer's context for layer tl.
+      LAYER_CONTEXT *const lc = &svc->layer_context[LAYER_IDS_TO_IDX(
+          sl, tl, svc->number_temporal_layers)];
+      lc->p_rc.avg_frame_qindex[INTER_FRAME] = *q;
+      lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+      lc->rc.rc_1_frame = 0;
+      lc->rc.rc_2_frame = 0;
+      lc->p_rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+    }
+  }
+  return 1;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 0000000000..6802ad42d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+// Bits Per MB at different Q (Multiplied by 512, i.e. 1 << BPER_MB_NORMBITS)
+#define BPER_MB_NORMBITS 9
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+// Threshold used to define if a KF group is static (e.g. a slide show).
+// Essentially, this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+#define STATIC_KF_GROUP_FLOAT_THRESH 0.99
+
+// The maximum duration of a GF group that is static (e.g. a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 16
+#define MAX_GF_LENGTH_LAP 16
+
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
+#define MAX_NUM_GF_INTERVALS 15  // Size of PRIMARY_RATE_CONTROL::gf_intervals[].
+
+#define MAX_ARF_LAYERS 6  // active_best_quality[] holds MAX_ARF_LAYERS + 1 entries.
+// #define STRICT_RC
+
+#define DEFAULT_KF_BOOST_RT 2300  // kf_boost assigned to key frames by av1_get_one_pass_rt_params().
+#define DEFAULT_GF_BOOST_RT 2000
+
+// A passive rate control strategy for screen content type in real-time mode.
+// When it is turned on, the compression performance is improved by
+// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains
+// over 20% on metric.
+// The downside is that it does not guarantee frame size.
+// Since RT mode has a tight restriction on buffer overflow control, we
+// turn it off by default.
+#define RT_PASSIVE_STRATEGY 0
+#define MAX_Q_HISTORY 1000  // Length of PRIMARY_RATE_CONTROL::q_history[].
+
+typedef struct {  // Resized frame dimensions plus a superres denominator.
+ int resize_width;
+ int resize_height;
+ uint8_t superres_denom;
+} size_params_type;
+
+enum {  // Indices into the rate_correction_factors[] arrays.
+ INTER_NORMAL,
+ GF_ARF_LOW,
+ GF_ARF_STD,
+ KF_STD,
+ RATE_FACTOR_LEVELS  // Number of levels; sizes the rate_correction_factors[] arrays.
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+ KF_UPDATE,  // Assigned to key frames by av1_get_one_pass_rt_params().
+ LF_UPDATE,  // Assigned to non-key frames by av1_get_one_pass_rt_params().
+ GF_UPDATE,
+ ARF_UPDATE,
+ OVERLAY_UPDATE,
+ INTNL_OVERLAY_UPDATE, // Internal Overlay Frame
+ INTNL_ARF_UPDATE, // Internal Altref Frame
+ FRAME_UPDATE_TYPES  // Number of update types.
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+enum {
+ REFBUF_RESET, // Clear reference frame buffer
+ REFBUF_UPDATE, // Refresh reference frame buffer
+ REFBUF_STATES  // Number of states.
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {  // Positive values scale down, negative values scale back up.
+ NO_RESIZE = 0,  // Keep the current size.
+ DOWN_THREEFOUR = 1, // From orig to 3/4.
+ DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2.
+ UP_THREEFOUR = -1, // From 1/2 to 3/4.
+ UP_ORIG = -2, // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;  // Current scale relative to the configured frame size.
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150  // Size of PRIMARY_RATE_CONTROL::regions[].
+typedef enum region_types {  // Classification stored in REGIONS::type.
+ STABLE_REGION = 0,
+ HIGH_VAR_REGION = 1,
+ SCENECUT_REGION = 2,
+ BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {  // One analysed region; stored in PRIMARY_RATE_CONTROL::regions[].
+ int start;  // presumably the first frame index of the region - TODO confirm
+ int last;  // presumably the last frame index of the region - TODO confirm
+ double avg_noise_var;
+ double avg_cor_coeff;
+ double avg_sr_fr_ratio;
+ double avg_intra_err;
+ double avg_coded_err;
+ REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief Rate Control parameters and status
+ */
+typedef struct {
+ // Rate targeting variables
+
+ /*!
+ * Baseline target rate for frame before adjustment for previous under or
+ * over shoot.
+ */
+ int base_frame_target;
+ /*!
+ * Target rate for frame after adjustment for previous under or over shoot.
+ */
+ int this_frame_target; // Actual frame target after rc adjustment.
+
+ /*!
+ * Projected size for current frame
+ */
+ int projected_frame_size;
+
+ /*!
+ * Bit size of transform coefficient for current frame.
+ */
+ int coefficient_size;
+
+ /*!
+ * Super block rate target used with some adaptive quantization strategies.
+ */
+ int sb64_target_rate;
+
+ /*!
+ * Number of frames since the last ARF / GF.
+ */
+ int frames_since_golden;
+
+ /*!
+ * Number of frames till the next ARF / GF is due.
+ */
+ int frames_till_gf_update_due;
+
+ /*!
+ * Number of determined gf groups left
+ */
+ int intervals_till_gf_calculate_due;
+
+ /*!\cond */
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ /*!\endcond */
+ /*!
+ * Frames before the next key frame
+ */
+ int frames_to_key;
+ /*!\cond */
+ int frames_since_key;
+ int frames_to_fwd_kf;
+ int is_src_frame_alt_ref;
+ int sframe_due;
+
+ int high_source_sad;  // Nonzero forces a GF update in 1-pass RT (scene change).
+ uint64_t avg_source_sad;
+ uint64_t prev_avg_source_sad;
+ uint64_t frame_source_sad;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+ int prev_avg_frame_bandwidth;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+
+ int decimation_factor;
+ int decimation_count;
+ int prev_frame_is_dropped;
+ int drop_count_consec;
+ int max_consec_drop;
+
+ /*!
+ * Frame number for encoded frames (non-dropped).
+ * Use for setting the rtc reference structure.
+ */
+ unsigned int frame_number_encoded;
+
+ /*!\endcond */
+ /*!
+ * User specified maximum Q allowed for current frame
+ */
+ int worst_quality;
+ /*!
+ * User specified minimum Q allowed for current frame
+ */
+ int best_quality;
+
+ /*!\cond */
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: overshoot
+ // 1: undershoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ /*!\endcond */
+ /*!
+ * Proposed maximum allowed Q for current frame
+ */
+ int active_worst_quality;
+
+ /*!\cond */
+ // Track amount of low motion in scene
+ int avg_frame_low_motion;
+ int cnt_zeromv;
+
+ // signals if number of blocks with motion is high
+ int percent_blocks_with_motion;
+
+ // Maximum value of source sad across all blocks of frame.
+ uint64_t max_block_source_sad;
+
+ // For dynamic resize, 1 pass cbr.
+ RESIZE_STATE resize_state;  // Current dynamic-resize scale step.
+ int resize_avg_qp;  // Sum of last_q[INTER_FRAME] over the current window.
+ int resize_buffer_underflow;  // Frames in the window with buffer level below 30% of optimal.
+ int resize_count;  // Frames accumulated in the current window.
+
+ // Flag to disable content related qp adjustment.
+ int rtc_external_ratectrl;
+
+ // Stores fast_extra_bits of the current frame.
+ int frame_level_fast_extra_bits;
+
+ double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frame_num_last_gf_refresh;
+
+ int prev_coded_width;  // Compared with cm->width before running 1-pass RT scene detection.
+ int prev_coded_height;  // Compared with cm->height before running 1-pass RT scene detection.
+
+ // The ratio used for inter frames in bit estimation.
+ // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_
+ // pct > THR), consider to add bit_est_ratio_g for golden frames.
+ int bit_est_ratio;
+
+ // Whether to use a fixed qp for the frame, bypassing internal rate control.
+ // This flag will reset to 0 after every frame.
+ int use_external_qp_one_pass;
+ /*!\endcond */
+} RATE_CONTROL;
+
+/*!
+ * \brief Primary Rate Control parameters and status
+ */
+typedef struct {
+ // Sub-gop level Rate targeting variables
+
+ /*!
+ * Target bit budget for the current GF / ARF group of frame.
+ */
+ int64_t gf_group_bits;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to the key frame
+ */
+ int kf_boost;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+ */
+ int gfu_boost;
+
+ /*!
+ * Stores the determined gf group lengths for a set of gf groups
+ */
+ int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+ /*!
+ * The current group's index into gf_intervals[]
+ */
+ int cur_gf_index;
+
+ /*!\cond */
+ int num_regions;
+
+ REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+ int regions_offset; // offset of regions from the last keyframe
+ int frames_till_regions_update;
+
+ int baseline_gf_interval;
+
+ int constrained_gf_group;
+
+ int this_key_frame_forced;
+
+ int next_key_frame_forced;
+ /*!\endcond */
+
+ /*!
+ * Initial buffer level in ms for CBR / low delay encoding
+ */
+ int64_t starting_buffer_level;
+
+ /*!
+ * Optimum / target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t optimal_buffer_level;
+
+ /*!
+ * Maximum target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t maximum_buffer_size;
+
+ /*!
+ * Q index used for ALT frame
+ */
+ int arf_q;
+
+ /*!\cond */
+ float_t arf_boost_factor;
+
+ int base_layer_qp;
+
+ // Total number of stats used only for kf_boost calculation.
+ int num_stats_used_for_kf_boost;
+
+ // Total number of stats used only for gfu_boost calculation.
+ int num_stats_used_for_gfu_boost;
+
+ // Total number of stats required by gfu_boost calculation.
+ int num_stats_required_for_gfu_boost;
+
+ int enable_scenecut_detection;
+
+ int use_arf_in_this_kf_group;
+
+ int ni_frames;
+
+ double tot_q;
+ /*!\endcond */
+
+ /*!
+ * Q used for last boosted (non leaf) frame
+ */
+ int last_kf_qindex;
+
+ /*!
+ * Average of q index of previous encoded frames in a sequence.
+ */
+ int avg_frame_qindex[FRAME_TYPES];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * active_best_quality.
+ */
+ int temp_active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_boosted_qindex.
+ */
+ int temp_last_boosted_qindex;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * avg_q.
+ */
+ double temp_avg_q;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_q.
+ */
+ int temp_last_q[FRAME_TYPES];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * projected_frame_size.
+ */
+ int temp_projected_frame_size;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * total_actual_bits.
+ */
+ int64_t temp_total_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * buffer_level.
+ */
+ int64_t temp_buffer_level;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target.
+ */
+ int64_t temp_vbr_bits_off_target;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target_fast.
+ */
+ int64_t temp_vbr_bits_off_target_fast;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_correction_factors.
+ */
+ double temp_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_error_estimate.
+ */
+ int temp_rate_error_estimate;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rolling_arf_group_target_bits.
+ */
+ int temp_rolling_arf_group_target_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rolling_arf_group_actual_bits.
+ */
+ int temp_rolling_arf_group_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * bits_left.
+ */
+ int64_t temp_bits_left;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_minq.
+ */
+ int temp_extend_minq;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_maxq.
+ */
+ int temp_extend_maxq;
+
+#endif
+ /*!
+ * Proposed minimum allowed Q for different layers in a coding pyramid
+ */
+ int active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+ */
+ int last_boosted_qindex;
+
+ /*!
+ * Average Q value of previous inter frames
+ */
+ double avg_q;
+
+ /*!
+ * Q used on last encoded frame of the given type.
+ */
+ int last_q[FRAME_TYPES];
+
+ /*!
+ * Correction factors used to adjust the q estimate for a given target rate
+ * in the encode loop.
+ */
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Current total consumed bits.
+ */
+ int64_t total_actual_bits;
+
+ /*!
+ * Current total target bits.
+ */
+ int64_t total_target_bits;
+
+ /*!
+ * Current buffer level.
+ */
+ int64_t buffer_level;
+
+ /*!
+ * PCT rc error.
+ */
+ int rate_error_estimate;
+
+ /*!
+ * Error bits available from previously encoded frames.
+ */
+ int64_t vbr_bits_off_target;
+
+ /*!
+ * Error bits available from previously encoded frames undershoot.
+ */
+ int64_t vbr_bits_off_target_fast;
+
+ /*!
+ * Total bits deviated from the average frame target, from previously
+ * encoded frames.
+ */
+ int64_t bits_off_target;
+
+ /*!
+ * Rolling monitor target bits updated based on current frame target size.
+ */
+ int rolling_target_bits;
+
+ /*!
+ * Rolling monitor actual bits updated based on current frame final projected
+ * size.
+ */
+ int rolling_actual_bits;
+
+ /*!
+ * The history of qindex for each frame.
+ * Only used when RT_PASSIVE_STRATEGY = 1.
+ */
+ int q_history[MAX_Q_HISTORY];
+} PRIMARY_RATE_CONTROL;
+
+/*!\cond */
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct GF_GROUP;
+
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc);
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q,
+ double correction_factor);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// av1_get_one_pass_rt_params()
+// av1_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by:
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+struct EncodeFrameInput;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] is_encode_stage Indicates if recode loop or post-encode
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \remark Updates the relevant rate correction factor in cpi->rc
+ */
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+ int is_encode_stage, int width,
+ int height);
+/*!\cond */
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+/*!\endcond */
+
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c arf_q (a member of PRIMARY_RATE_CONTROL).
+ */
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index, int *top_index);
+
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] target_bits_per_frame Target number of bits for the frame
+ * \param[in] active_best_quality Min Q allowed
+ * \param[in] active_worst_quality Max Q allowed
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a q index value
+ */
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height);
+
+/*!\cond */
+// Gets the appropriate bpmb enumerator based on the frame and content type
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double correction_factor,
+ int accurate_estimate);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int64_t target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer, for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi,
+ FRAME_TYPE frame_type, int qindex,
+ double rate_target_ratio);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+ int height);
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
+void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi,
+ int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+/*!\brief Setup the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layer (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type Encoder frame type
+ * \param[in] frame_input Current and last input source frames
+ * \param[in] frame_flags Encoder frame flags
+ *
+ * \remark Nothing is returned. Instead the settings computed in this
+ * function are set in: \c frame_params, \c cpi->common, \c cpi->rc,
+ * \c cpi->svc.
+ */
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+ FRAME_TYPE *const frame_type,
+ const struct EncodeFrameInput *frame_input,
+ unsigned int frame_flags);
+
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ * Handles the case when encoder is expected to create a large frame:
+ * - q is increased to value closer to \c cpi->rc.worst_quality
+ * - avg_frame_qindex is reset
+ * - buffer levels are reset
+ * - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
+/*!\brief Compute the q_indices for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gf_update_type GOP update type
+ * \param[in] gf_pyramid_level GOP level of the current frame
+ * \param[in] arf_q ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+ int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_indices for the ARF of a GOP.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gfu_boost GFU boost
+ * \param[in] bit_depth Bit depth
+ * \param[in] arf_boost_factor ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+ double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_indices for the ARF of a GOP in Q mode.
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tpl_frame Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+ struct TplDepFrame *tpl_frame);
+#endif
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/rc_utils.h b/third_party/aom/av1/encoder/rc_utils.h
new file mode 100644
index 0000000000..fe22ee5afb
--- /dev/null
+++ b/third_party/aom/av1/encoder/rc_utils.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RC_UTILS_H_
+#define AOM_AV1_ENCODER_RC_UTILS_H_
+
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/psnr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Resets rate control state after a large change in frame bandwidth.
+ *
+ * Nothing is done until the frame number exceeds the number of spatial
+ * layers. With SVC enabled the decision is delegated to
+ * av1_svc_check_reset_layer_rc_flag(). Otherwise, when the average frame
+ * bandwidth is more than 1.5x or less than 0.5x its previous value, the
+ * rc_1_frame / rc_2_frame state is cleared and both buffer levels are reset
+ * to the optimal buffer level.
+ *
+ * \param[in,out] cpi Top-level encoder structure
+ */
+static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) {
+  if (cpi->common.current_frame.frame_number <=
+      (unsigned int)cpi->svc.number_spatial_layers) {
+    return;
+  }
+
+  if (cpi->ppi->use_svc) {
+    av1_svc_check_reset_layer_rc_flag(cpi);
+    return;
+  }
+
+  RATE_CONTROL *const rc_state = &cpi->rc;
+  const int bandwidth_is_stable =
+      rc_state->avg_frame_bandwidth <=
+          ((3 * rc_state->prev_avg_frame_bandwidth) >> 1) &&
+      rc_state->avg_frame_bandwidth >=
+          (rc_state->prev_avg_frame_bandwidth >> 1);
+  if (bandwidth_is_stable) return;
+
+  // Bandwidth moved outside [0.5x, 1.5x] of its previous value: start over.
+  PRIMARY_RATE_CONTROL *const primary_rc = &cpi->ppi->p_rc;
+  rc_state->rc_1_frame = 0;
+  rc_state->rc_2_frame = 0;
+  primary_rc->bits_off_target = primary_rc->optimal_buffer_level;
+  primary_rc->buffer_level = primary_rc->optimal_buffer_level;
+}
+
+/*!\brief Derives the primary rate control buffer sizes from the config.
+ *
+ * Levels configured in milliseconds are converted with
+ * level_ms * target_bandwidth / 1000. A configured value of 0 for the
+ * optimal level or for the maximum size selects target_bandwidth / 8.
+ * The current buffer levels are then clipped to the new maximum size.
+ *
+ * \param[in]     oxcf Encoder configuration (only rc_cfg is read)
+ * \param[in,out] ppi  Primary encoder structure whose p_rc is updated
+ */
+static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf,
+                                                   AV1_PRIMARY *ppi) {
+  const RateControlCfg *const cfg = &oxcf->rc_cfg;
+  PRIMARY_RATE_CONTROL *const primary_rc = &ppi->p_rc;
+
+  // Widen everything to 64 bits before multiplying.
+  const int64_t bw = cfg->target_bandwidth;
+  const int64_t start_ms = cfg->starting_buffer_level_ms;
+  const int64_t optimal_ms = cfg->optimal_buffer_level_ms;
+  const int64_t max_ms = cfg->maximum_buffer_size_ms;
+  const int64_t fallback_size = bw / 8;
+
+  primary_rc->starting_buffer_level = start_ms * bw / 1000;
+
+  if (optimal_ms == 0) {
+    primary_rc->optimal_buffer_level = fallback_size;
+  } else {
+    primary_rc->optimal_buffer_level = optimal_ms * bw / 1000;
+  }
+
+  if (max_ms == 0) {
+    primary_rc->maximum_buffer_size = fallback_size;
+  } else {
+    primary_rc->maximum_buffer_size = max_ms * bw / 1000;
+  }
+
+  // A configuration change may shrink maximum_buffer_size, so keep the
+  // current levels within the new limit.
+  primary_rc->bits_off_target =
+      AOMMIN(primary_rc->bits_off_target, primary_rc->maximum_buffer_size);
+  primary_rc->buffer_level =
+      AOMMIN(primary_rc->buffer_level, primary_rc->maximum_buffer_size);
+}
+
+/*!\brief Constrains the encoder configuration to a target level and tier.
+ *
+ * - Caps the target bandwidth at 70% of the level's maximum bitrate and
+ *   recomputes the two-pass bits_left when total first-pass stats exist.
+ * - Sets the over-shoot percentage to 0 and the worst allowed q to 255.
+ * - Reduces the log2 tile column / row counts until they fit the level's
+ *   tile limits.
+ * - Raises min_cr to at least the level's minimum compression ratio
+ *   (expressed in percent).
+ *
+ * \param[in,out] cpi          Top-level encoder structure
+ * \param[in]     target_level Level to conform to
+ * \param[in]     tier         Tier used for the bitrate and min-CR lookups
+ */
+static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
+                                           AV1_LEVEL target_level, int tier) {
+  SequenceHeader *const seq = cpi->common.seq_params;
+  AV1EncoderConfig *const cfg = &cpi->oxcf;
+  RateControlCfg *const rc_cfg = &cfg->rc_cfg;
+  TileConfig *const tiles = &cfg->tile_cfg;
+
+  // Target bitrate: at most 70% of what the level permits.
+  const BITSTREAM_PROFILE seq_profile = seq->profile;
+  const double level_max_bitrate =
+      av1_get_max_bitrate_for_level(target_level, tier, seq_profile);
+  const int64_t bitrate_cap = (int64_t)(level_max_bitrate * 0.70);
+  rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, bitrate_cap);
+
+  // The two-pass bit budget is derived from the bandwidth, so refresh it.
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+  if (total_stats != NULL) {
+    twopass->bits_left = (int64_t)(total_stats->duration *
+                                   rc_cfg->target_bandwidth / 10000000.0);
+  }
+
+  // Max over-shoot percentage and max quantizer.
+  rc_cfg->over_shoot_pct = 0;
+  rc_cfg->worst_allowed_q = 255;
+
+  // Tile columns first, then rows, until both fit under the level limits.
+  int level_max_tiles, level_max_tile_cols;
+  av1_get_max_tiles_for_level(target_level, &level_max_tiles,
+                              &level_max_tile_cols);
+  while (tiles->tile_columns > 0 &&
+         (1 << tiles->tile_columns) > level_max_tile_cols) {
+    --tiles->tile_columns;
+  }
+  const int num_tile_cols = (1 << tiles->tile_columns);
+  while (tiles->tile_rows > 0 &&
+         num_tile_cols * (1 << tiles->tile_rows) > level_max_tiles) {
+    --tiles->tile_rows;
+  }
+
+  // Minimum compression ratio, stored as a percentage.
+  const int is_still_picture = seq->still_picture;
+  const double level_min_cr =
+      av1_get_min_cr_for_level(target_level, tier, is_still_picture);
+  rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(level_min_cr * 100));
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Decides whether the frame that was just encoded should be recoded.
+ *
+ * A recode is only considered when the projected frame size has reached the
+ * maximum frame bandwidth, or when the recode_loop speed feature permits it
+ * for this frame. In that case a recode is requested when the projected size
+ * is above \c high_limit with q below \c maxq, or below \c low_limit with q
+ * above \c minq. Failing that, in AOM_CQ mode a recode is requested when q is
+ * above the cq level and the projected size is below 7/8 of the frame target.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] high_limit Upper rate threshold
+ * \param[in] low_limit Lower rate threshold
+ * \param[in] q Current q index
+ * \param[in] maxq Maximum allowed q index
+ * \param[in] minq Minimum allowed q index
+ *
+ * \return Indicates if a recode is required.
+ * \retval 1 Recode Required
+ * \retval 0 No Recode required
+ */
+static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit,
+                                       int low_limit, int q, int maxq,
+                                       int minq) {
+  const AV1EncoderConfig *const cfg = &cpi->oxcf;
+  const RATE_CONTROL *const rc_state = &cpi->rc;
+  const int is_kf_gf_arf = frame_is_kf_gf_arf(cpi);
+
+  const int recode_permitted =
+      (rc_state->projected_frame_size >= rc_state->max_frame_bandwidth) ||
+      (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+      (is_kf_gf_arf && (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF));
+  if (!recode_permitted) return 0;
+
+  // TODO(agrange) high_limit could be greater than the scale-down threshold.
+  const int too_big_and_q_can_rise =
+      rc_state->projected_frame_size > high_limit && q < maxq;
+  const int too_small_and_q_can_drop =
+      rc_state->projected_frame_size < low_limit && q > minq;
+  if (too_big_and_q_can_rise || too_small_and_q_can_drop) return 1;
+
+  if (cfg->rc_cfg.mode != AOM_CQ) return 0;
+
+  // Constrained quality: handle frame undershoot while q is still above
+  // the automatically set cq level.
+  return q > cfg->rc_cfg.cq_level &&
+         rc_state->projected_frame_size <
+             ((rc_state->this_frame_target * 7) >> 3);
+}
+
+/*!\brief Computes the projection factor used to derive the GF/ARF boost.
+ *
+ * Takes sqrt(frame_count), limits it to at most \c max_factor and then to at
+ * least \c min_factor, and returns 200 + 10 * that value.
+ *
+ * \param[in] min_factor  Lower limit applied to sqrt(frame_count)
+ * \param[in] max_factor  Upper limit applied to sqrt(frame_count)
+ * \param[in] frame_count Number of frames
+ *
+ * \return The projection factor.
+ */
+static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor,
+                                                             double max_factor,
+                                                             int frame_count) {
+  const double root = sqrt((double)frame_count);
+  const double capped = AOMMIN(root, max_factor);
+  const double bounded = AOMMAX(capped, min_factor);
+  return (200.0 + 10.0 * bounded);
+}
+
+/*!\brief Derives a GF/ARF boost from r0.
+ *
+ * The boost is the GFU boost projection factor for \c frames_to_key divided
+ * by \c r0, rounded to an integer with rint().
+ *
+ * \param[in] min_factor    Lower limit passed to the projection factor
+ * \param[in] max_factor    Upper limit passed to the projection factor
+ * \param[in] r0            Divisor applied to the projection factor
+ * \param[in] frames_to_key Frame count passed to the projection factor
+ *
+ * \return The boost value.
+ */
+static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor,
+                                                double max_factor, double r0,
+                                                int frames_to_key) {
+  const double projection = av1_get_gfu_boost_projection_factor(
+      min_factor, max_factor, frames_to_key);
+  return (int)rint(projection / r0);
+}
+
+/*!\brief Computes the projection factor used to derive the key frame boost.
+ *
+ * Takes sqrt(frame_count), limits it to at most 10 and then to at least 4,
+ * and returns 75 + 14 * that value.
+ *
+ * \param[in] frame_count Number of frames
+ *
+ * \return The projection factor.
+ */
+static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) {
+  const double root = sqrt((double)frame_count);
+  const double capped = AOMMIN(root, 10.0);
+  const double bounded = AOMMAX(capped, 4.0);
+  return (75.0 + 14.0 * bounded);
+}
+
+/*!\brief Re-estimates q after an overshoot.
+ *
+ * Updates the rate correction factors and asks av1_rc_regulate_q() for a new
+ * q between \c bottom_index and max(\c q_high, \c top_index). While the
+ * result is still below \c q_low this is repeated, for at most 10 retries
+ * after the first attempt.
+ *
+ * \param[in,out] cpi             Top-level encoder structure
+ * \param[in]     is_encode_stage Passed to the correction factor update
+ * \param[in]     q_low           Smallest acceptable q index
+ * \param[in]     q_high          Current high q index limit
+ * \param[in]     top_index       Top bound for q index
+ * \param[in]     bottom_index    Bottom bound for q index
+ *
+ * \return The last q index returned by av1_rc_regulate_q().
+ */
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi,
+                                                int is_encode_stage, int q_low,
+                                                int q_high, int top_index,
+                                                int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const int upper_bound = AOMMAX(q_high, top_index);
+  const int max_retries = 10;
+
+  int q_regulated;
+  for (int pass = 0;; ++pass) {
+    av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                          cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    upper_bound, cm->width, cm->height);
+    // Stop once q is acceptable or the retry budget is used up.
+    if (q_regulated >= q_low || pass == max_retries) break;
+  }
+  return q_regulated;
+}
+
+/*!\brief Re-estimates q after an undershoot.
+ *
+ * Updates the rate correction factors and asks av1_rc_regulate_q() for a new
+ * q between \c bottom_index and \c top_index. While the result is still
+ * above \c q_high this is repeated, for at most 10 retries after the first
+ * attempt.
+ *
+ * \param[in,out] cpi             Top-level encoder structure
+ * \param[in]     is_encode_stage Passed to the correction factor update
+ * \param[in]     q_high          Largest acceptable q index
+ * \param[in]     top_index       Top bound for q index
+ * \param[in]     bottom_index    Bottom bound for q index
+ *
+ * \return The last q index returned by av1_rc_regulate_q().
+ */
+static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+                                                 int is_encode_stage,
+                                                 int q_high, int top_index,
+                                                 int bottom_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const int max_retries = 10;
+
+  int q_regulated;
+  for (int pass = 0;; ++pass) {
+    av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+                                          cm->height);
+    q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+                                    top_index, cm->width, cm->height);
+    // Stop once q is acceptable or the retry budget is used up.
+    if (q_regulated <= q_high || pass == max_retries) break;
+  }
+  return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ * target and adjusts q as appropriate. It also decides whether or not
+ * we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out] loop Should we go around the recode loop again
+ * \param[in,out] q New q index value
+ * \param[in,out] q_low Low q index limit for this loop iteration
+ * \param[in,out] q_high High q index limit for this loop iteration
+ * \param[in] top_index Max permitted new value for q index
+ * \param[in] bottom_index Min permitted new value for q index
+ * \param[in,out] undershoot_seen Have we seen undershoot on this frame
+ * \param[in,out] overshoot_seen Have we seen overshoot on this frame
+ * \param[in,out] low_cr_seen Have we previously triggered recode
+ * because the compression ratio was less
+ * than a given minimum threshold.
+ * \param[in] loop_count Loop iterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+ AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+ int *const q_high, const int top_index, const int bottom_index,
+ int *const undershoot_seen, int *const overshoot_seen,
+ int *const low_cr_seen, const int loop_count) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ // Default to "no recode"; the paths below set it when q changes.
+ *loop = 0;
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ return;
+
+ // Enforce the minimum compression ratio (min_cr is a percentage), if set.
+ const int min_cr = rc_cfg->min_cr;
+ if (min_cr > 0) {
+ const double compression_ratio =
+ av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+ const double target_cr = min_cr / 100.0;
+ if (compression_ratio < target_cr) {
+ *low_cr_seen = 1;
+ if (*q < rc->worst_quality) {
+ // Raise q by at least 1 and at most 32, without exceeding
+ // worst_quality, and lift both q limits to the new q.
+ const double cr_ratio = target_cr / compression_ratio;
+ const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+ *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ }
+ }
+ // Once a low compression ratio has been seen (now or on an earlier call),
+ // the rate based adjustments below are skipped.
+ if (*low_cr_seen) return;
+ }
+
+ if (cpi->ppi->level_params.keep_level_stats &&
+ !is_stat_generation_stage(cpi)) {
+ // Initialize level info. at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ av1_init_level_info(cpi);
+ }
+ const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ // TODO(any): currently only checking operating point 0
+ const AV1LevelInfo *const level_info = level_params->level_info[0];
+ const DECODER_MODEL *const decoder_models = level_info->decoder_models;
+ const AV1_LEVEL target_level = level_params->target_seq_level_idx[0];
+
+ if (target_level < SEQ_LEVELS &&
+ decoder_models[target_level].status == DECODER_MODEL_OK) {
+ DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf(
+ cpi, rc->projected_frame_size, &decoder_models[target_level]);
+
+ // On a smoothing buffer underflow/overflow, raise q by 10 (bounded by
+ // worst_quality) and request a recode.
+ if ((status == SMOOTHING_BUFFER_UNDERFLOW ||
+ status == SMOOTHING_BUFFER_OVERFLOW) &&
+ *q < rc->worst_quality) {
+ *q = AOMMIN(*q + 10, rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ return;
+ }
+ }
+ }
+
+ // No rate driven q adjustment in AOM_Q mode.
+ if (rc_cfg->mode == AOM_Q) return;
+
+ const int last_q = *q;
+ int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ // Keep the over-shoot limit non-zero.
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+ // Forced key frame: steer q using the luma SSE of the coded frame relative
+ // to cpi->ambient_err instead of using the rate based test below.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ p_rc->this_key_frame_forced &&
+ rc->projected_frame_size < rc->max_frame_bandwidth) {
+ int64_t kf_err;
+ const int64_t high_err_target = cpi->ambient_err;
+ const int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ // Adjust Q
+ *q = (int)((*q * high_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ // Adjust Q
+ *q = (int)((*q * low_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ *loop = (*q != last_q);
+ return;
+ }
+
+ if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+ AOMMAX(*q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (*q == *q_high &&
+ rc->projected_frame_size >= rc->max_frame_bandwidth) {
+ const double q_val_high_current =
+ av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
+ const double q_val_high_new =
+ q_val_high_current *
+ ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+ *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
+ rc->best_quality, rc->worst_quality);
+ }
+
+ // Raise q_low to just above the current q, capped at q_high
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ if (*undershoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+
+ // Bisect between the limits, rounding up.
+ *q = (*q_high + *q_low + 1) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low + 1) / 2;
+ const int q_regulated = get_regulated_q_overshoot(
+ cpi, 1, *q_low, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated + 1) / 2;
+ } else {
+ *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index,
+ bottom_index);
+ }
+
+ *overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ if (*overshoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+ // Bisect between the limits, rounding down.
+ *q = (*q_high + *q_low) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low) / 2;
+ const int q_regulated = get_regulated_q_undershoot(
+ cpi, 1, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated) / 2;
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+ *q_low = *q;
+ }
+ } else {
+ *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index,
+ bottom_index);
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+ *q_low = *q;
+ }
+ }
+
+ *undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ }
+
+ // Recode only if q actually changed.
+ *loop = (*q != last_q);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..c2d76e7a9a
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+// There is one entry per block size (BLOCK_SIZES_ALL entries in total).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+// Flags indexed as [intra tx set][tx size]. av1_fill_mode_rates() only fills
+// the intra tx type costs for combinations whose flag is non-zero, and it
+// never reads row 0 (its loop over sets starts at 1).
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ };
+
+// Flags indexed as [inter tx set][tx size]. av1_fill_mode_rates() only fills
+// the inter tx type costs for combinations whose flag is non-zero, and it
+// never reads row 0 (its loop over sets starts at 1).
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 1, 1, 1 },
+ };
+
+// Maps a tx set index to a tx set type. Row 0 holds the intra mapping and
+// row 1 the inter mapping; av1_fill_mode_rates() uses the result to index
+// av1_ext_tx_inv.
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+ EXT_TX_SETS_INTER)] = {
+ {
+ // Intra
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ EXT_TX_SET_DTT4_IDTX,
+ },
+ {
+ // Inter
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX,
+ },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc) {
+ int i, j;
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+ fc->partition_cdf[i], NULL);
+
+ if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+ for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+ fc->skip_mode_cdfs[i], NULL);
+ }
+ }
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+ fc->skip_txfm_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+ for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+ fc->kf_y_cdf[i][j], NULL);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+ NULL);
+ for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
+ fc->uv_mode_cdf[i][j], NULL);
+
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+ fc->filter_intra_mode_cdf, NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_filter_intra_allowed_bsize(cm, i))
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
+ fc->filter_intra_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
+ fc->switchable_interp_cdf[i], NULL);
+
+ for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
+ fc->palette_y_size_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
+ fc->palette_uv_size_cdf[i], NULL);
+ for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j],
+ fc->palette_y_mode_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i],
+ fc->palette_uv_mode_cdf[i], NULL);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j],
+ fc->palette_y_color_index_cdf[i][j], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j],
+ fc->palette_uv_color_index_cdf[i][j], NULL);
+ }
+ }
+
+ int sign_cost[CFL_JOINT_SIGNS];
+ av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U];
+ int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V];
+ if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+ } else {
+ const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+ }
+ if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+ } else {
+ const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+ }
+ for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+ cost_u[u] += sign_cost[joint_sign];
+ }
+
+ for (i = 0; i < MAX_TX_CATS; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j],
+ fc->tx_size_cdf[i][j], NULL);
+
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i],
+ fc->txfm_partition_cdf[i], NULL);
+ }
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_cdf[s][i][j],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i],
+ fc->angle_delta_cdf[i], NULL);
+ }
+ av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL);
+
+ for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i],
+ fc->seg.spatial_pred_seg_cdf[i], NULL);
+ }
+
+ for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i],
+ NULL);
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i],
+ fc->comp_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < SINGLE_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j],
+ fc->single_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i],
+ fc->comp_ref_type_cdf[i], NULL);
+ }
+
+ for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+ for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j],
+ fc->uni_comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < FWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j],
+ fc->comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < BWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j],
+ fc->comp_bwdref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i],
+ fc->intra_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i],
+ fc->zeromv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i],
+ NULL);
+ }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i],
+ fc->inter_compound_mode_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i],
+ fc->compound_type_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_is_wedge_used(i)) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i],
+ fc->wedge_idx_cdf[i], NULL);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i],
+ fc->interintra_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i],
+ fc->interintra_mode_cdf[i], NULL);
+ }
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i],
+ fc->wedge_interintra_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i],
+ fc->motion_mode_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i],
+ fc->obmc_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i],
+ fc->compound_index_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i],
+ fc->comp_group_idx_cdf[i], NULL);
+ }
+ }
+}
+
+// Converts the three loop-restoration CDFs held in |fc|
+// (switchable_restore, wiener_restore and sgrproj_restore) into token costs
+// stored in the matching |mode_costs| arrays.
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) {
+  int *const switchable_cost = mode_costs->switchable_restore_cost;
+  int *const wiener_cost = mode_costs->wiener_restore_cost;
+  int *const sgrproj_cost = mode_costs->sgrproj_restore_cost;
+
+  av1_cost_tokens_from_cdf(switchable_cost, fc->switchable_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(wiener_cost, fc->wiener_restore_cdf, NULL);
+  av1_cost_tokens_from_cdf(sgrproj_cost, fc->sgrproj_restore_cdf, NULL);
+}
+
+// Values are now correlated to quantizer.
+// One lookup table per bit depth (8, 10 and 12 bits), each indexed by qindex.
+// The tables are filled by init_me_luts() and read by av1_set_sad_per_bit().
+static int sad_per_bit_lut_8[QINDEX_RANGE];
+static int sad_per_bit_lut_10[QINDEX_RANGE];
+static int sad_per_bit_lut_12[QINDEX_RANGE];
+
+// Fills the first |range| entries of |bit16lut|. Entry |qindex| is the
+// truncated value of 0.0418 * q + 2.4107, where q is what
+// av1_convert_qindex_to_q() returns for that qindex at |bit_depth|.
+static void init_me_luts_bd(int *bit16lut, int range,
+                            aom_bit_depth_t bit_depth) {
+  // The table is generated from a formula rather than hard-coded for now, so
+  // that the impact of experimental changes to the quantizer tables is easier
+  // to resolve.
+  for (int qindex = 0; qindex < range; ++qindex) {
+    const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+    const double sad_per_bit = 0.0418 * q + 2.4107;
+    bit16lut[qindex] = (int)sad_per_bit;
+  }
+}
+
+// Builds the three SAD-per-bit tables, one for each supported bit depth.
+static void init_me_luts(void) {
+  int *const luts[3] = { sad_per_bit_lut_8, sad_per_bit_lut_10,
+                         sad_per_bit_lut_12 };
+  const aom_bit_depth_t bit_depths[3] = { AOM_BITS_8, AOM_BITS_10,
+                                          AOM_BITS_12 };
+  for (int k = 0; k < 3; ++k) {
+    init_me_luts_bd(luts[k], QINDEX_RANGE, bit_depths[k]);
+  }
+}
+
+// Public entry point for building the SAD-per-bit tables. The work is routed
+// through aom_once() (presumably so the tables are built a single time -
+// confirm against aom_once's contract).
+void av1_init_me_luts(void) {
+  aom_once(init_me_luts);
+}
+
+// Extra RD-multiplier weight added in av1_compute_rd_mult(), indexed by
+// boost_index. Applied as a fraction of 128 (the product is shifted right
+// by 7).
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+
+// RD-multiplier scale applied in av1_compute_rd_mult(), indexed by
+// layer_depth. Applied as a fraction of 128 (the product is shifted right
+// by 7).
+static const int rd_layer_depth_factor[7] = {
+ 160, 160, 160, 160, 192, 208, 224
+};
+
+// Default rd multiplier for inter frames, as a linear function of |qindex|:
+// 3.2 + 0.0015 * qindex. This is a first pass estimate based on data from a
+// previous Vizer run.
+// NOTE(review): the call site in av1_compute_rd_mult_based_on_qindex() passes
+// the DC quantizer value rather than a qindex - confirm which is intended.
+static double def_inter_rd_multiplier(int qindex) {
+  const double base = 3.2;
+  const double slope = 0.0015;
+  return base + (slope * (double)qindex);
+}
+
+// Default rd multiplier for ARF/Golden frames, as a linear function of
+// |qindex|: 3.25 + 0.0015 * qindex. This is a first pass estimate based on
+// data from a previous Vizer run.
+// NOTE(review): the call site in av1_compute_rd_mult_based_on_qindex() passes
+// the DC quantizer value rather than a qindex - confirm which is intended.
+static double def_arf_rd_multiplier(int qindex) {
+  const double base = 3.25;
+  const double slope = 0.0015;
+  return base + (slope * (double)qindex);
+}
+
+// Default rd multiplier for key frames, as a linear function of |qindex|:
+// 3.3 + 0.0015 * qindex. This is a first pass estimate based on data from a
+// previous Vizer run.
+// NOTE(review): the call site in av1_compute_rd_mult_based_on_qindex() passes
+// the DC quantizer value rather than a qindex - confirm which is intended.
+static double def_kf_rd_multiplier(int qindex) {
+  const double base = 3.3;
+  const double slope = 0.0015;
+  return base + (slope * (double)qindex);
+}
+
+// Computes the base RD multiplier for |qindex|.
+//
+// The squared DC quantizer is scaled by a multiplier chosen from
+// |update_type| (key frame, GF/ARF, or everything else), then normalized for
+// bit depth: shifted right with rounding by 4 for 10-bit and by 8 for 12-bit.
+// Returns a value in [1, INT_MAX], or -1 (after asserting) when |bit_depth|
+// is not one of the three supported values.
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+                                        FRAME_UPDATE_TYPE update_type,
+                                        int qindex) {
+  const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+
+  // Pick the frame-type specific multiplier first, then apply it once.
+  double def_rd_q_mult;
+  if (update_type == KF_UPDATE) {
+    def_rd_q_mult = def_kf_rd_multiplier(q);
+  } else if (update_type == GF_UPDATE || update_type == ARF_UPDATE) {
+    def_rd_q_mult = def_arf_rd_multiplier(q);
+  } else {
+    def_rd_q_mult = def_inter_rd_multiplier(q);
+  }
+
+  int64_t rdmult = q * q;
+  rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+
+  if (bit_depth == AOM_BITS_10) {
+    rdmult = ROUND_POWER_OF_TWO(rdmult, 4);
+  } else if (bit_depth == AOM_BITS_12) {
+    rdmult = ROUND_POWER_OF_TWO(rdmult, 8);
+  } else if (bit_depth != AOM_BITS_8) {
+    assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+    return -1;
+  }
+
+  if (rdmult <= 0) return 1;
+  return (int)AOMMIN(rdmult, INT_MAX);
+}
+
+// Computes the RD multiplier for a frame.
+//
+// Starts from av1_compute_rd_mult_based_on_qindex(). When
+// |is_stat_consumption_stage| is set, |use_fixed_qp_offsets| is clear and the
+// frame is not a key frame, two further adjustments are applied in 1/128
+// fixed point:
+//   - the value is scaled by rd_layer_depth_factor[layer_depth] / 128;
+//   - rd_boost_factor[boost_index] / 128 of that result is then added on top.
+// |layer_depth| and |boost_index| must be valid indices into those tables
+// whenever the adjustments are applied.
+//
+// The result is saturated to INT_MAX before being narrowed to int, matching
+// the clamping done by av1_compute_rd_mult_based_on_qindex().
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+                        const FRAME_UPDATE_TYPE update_type,
+                        const int layer_depth, const int boost_index,
+                        const FRAME_TYPE frame_type,
+                        const int use_fixed_qp_offsets,
+                        const int is_stat_consumption_stage) {
+  int64_t rdmult =
+      av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+  if (is_stat_consumption_stage && !use_fixed_qp_offsets &&
+      (frame_type != KEY_FRAME)) {
+    assert(layer_depth >= 0 &&
+           layer_depth < (int)(sizeof(rd_layer_depth_factor) /
+                               sizeof(rd_layer_depth_factor[0])));
+    assert(boost_index >= 0 &&
+           boost_index <
+               (int)(sizeof(rd_boost_factor) / sizeof(rd_boost_factor[0])));
+    // Layer depth adjustment
+    rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
+    // ARF boost adjustment
+    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+  }
+  // The adjustments above can grow the value by up to about 2.6x, so saturate
+  // instead of truncating when converting back to int.
+  return (int)AOMMIN(rdmult, INT_MAX);
+}
+
+// Returns the qindex offset (relative to |qindex|) that brings the DC
+// quantizer close to q / sqrt(beta), where q is the DC quantizer at |qindex|.
+//
+// The search steps one qindex at a time in the required direction and stops
+// at the first index whose quantizer reaches or passes the target, or at the
+// range limit (0 when searching down, MAXQ when searching up). Returns 0 when
+// the rounded target equals the current quantizer. |beta| must be positive.
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
+  assert(beta > 0.0);
+  const int base_q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+  const int target_q = (int)rint(base_q / sqrt(beta));
+  if (target_q == base_q) return 0;
+
+  int idx = qindex;
+  if (target_q < base_q) {
+    // Need a smaller quantizer: walk towards qindex 0.
+    while (idx > 0) {
+      --idx;
+      if (av1_dc_quant_QTX(idx, 0, bit_depth) <= target_q) break;
+    }
+  } else {
+    // Need a larger quantizer: walk towards MAXQ.
+    while (idx < MAXQ) {
+      ++idx;
+      if (av1_dc_quant_QTX(idx, 0, bit_depth) >= target_q) break;
+    }
+  }
+  return idx - qindex;
+}
+
+// Snaps |curr_qindex| so that its distance from |prev_qindex| is a multiple
+// of |delta_q_res|.
+//
+// |curr_qindex| is first clamped to [delta_q_res, 256 - delta_q_res]. The
+// absolute difference to |prev_qindex| then has delta_q_res / 4 added and its
+// low bits masked off with ~(delta_q_res - 1) (this mask form assumes
+// delta_q_res is a power of two - TODO confirm with callers). The result is
+// never lower than MINQ + 1.
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+                                  int curr_qindex) {
+  curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res);
+
+  const int delta = curr_qindex - prev_qindex;
+  const int magnitude = abs(delta);
+  const int quantized = (magnitude + delta_q_res / 4) & ~(delta_q_res - 1);
+
+  int adjusted;
+  if (delta >= 0) {
+    adjusted = prev_qindex + quantized;
+  } else {
+    adjusted = prev_qindex - quantized;
+  }
+  return AOMMAX(adjusted, MINQ + 1);
+}
+
+// Returns the frame's RD multiplier (from av1_compute_rd_mult() at the base
+// qindex) divided by |beta| and truncated to int. |beta| must be positive.
+int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
+  assert(beta > 0.0);
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  // Both indices are capped to the last entry of the factor tables used by
+  // av1_compute_rd_mult().
+  const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+  const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+  const int frame_rdmult = av1_compute_rd_mult(
+      cm->quant_params.base_qindex, cm->seq_params->bit_depth,
+      gf_group->update_type[cpi->gf_frame_index], layer_depth, boost_index,
+      cm->current_frame.frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+      is_stat_consumption_stage(cpi));
+
+  return (int)(frame_rdmult / beta);
+}
+
+// Returns the RD threshold factor for |qindex|: the DC quantizer is divided
+// by 4, 16 or 64 (for 8-, 10- and 12-bit respectively), raised to
+// RD_THRESH_POW, scaled by 5.12 and truncated, with a floor of 8.
+// Returns -1 (after asserting) for an unsupported |bit_depth|.
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
+  double scale;
+  switch (bit_depth) {
+    case AOM_BITS_8: scale = 4.0; break;
+    case AOM_BITS_10: scale = 16.0; break;
+    case AOM_BITS_12: scale = 64.0; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+  const double q = av1_dc_quant_QTX(qindex, 0, bit_depth) / scale;
+  // TODO(debargha): Adjust the function below.
+  const int factor = (int)(pow(q, RD_THRESH_POW) * 5.12);
+  return AOMMAX(factor, 8);
+}
+
+// Writes to |*sadperbit| the entry for |qindex| from the SAD-per-bit table
+// that matches the sequence bit depth. For an unsupported bit depth the
+// function asserts and leaves |*sadperbit| unwritten.
+void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
+  const aom_bit_depth_t bit_depth = cpi->common.seq_params->bit_depth;
+  if (bit_depth == AOM_BITS_8) {
+    *sadperbit = sad_per_bit_lut_8[qindex];
+  } else if (bit_depth == AOM_BITS_10) {
+    *sadperbit = sad_per_bit_lut_10[qindex];
+  } else if (bit_depth == AOM_BITS_12) {
+    *sadperbit = sad_per_bit_lut_12[qindex];
+  } else {
+    assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+  }
+}
+
+// Fills rd->threshes[segment_id][bsize][mode] for every segment and block
+// size. Each threshold is rd->thresh_mult[mode] scaled by a per-segment
+// quantizer factor and a per-block-size factor, divided by 4, and saturated
+// to INT_MAX.
+//
+// When |use_nonrd_pick_mode| is non-zero only the mode indices gathered from
+// real_time_ref_combos are updated; otherwise all MAX_MODES entries are.
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd,
+ int use_nonrd_pick_mode) {
+ int i, bsize, segment_id;
+ THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 };
+ // In non-RD mode the list is built below, so the count starts at zero.
+ int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES;
+
+ if (use_nonrd_pick_mode) {
+ // Collect the THR_MODES index of every (reference, mode) pair: the inter
+ // mode list for non-intra references, the intra mode list otherwise.
+ for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ for (i = 0; i < RTC_INTER_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(inter_mode_list[i])];
+ } else {
+ for (i = 0; i < RTC_INTRA_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(intra_mode_list[i])];
+ }
+ }
+ }
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ // Segment qindex plus the luma DC delta, clamped to [0, MAXQ].
+ const int qindex = clamp(
+ av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
+ cm->quant_params.y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ // Multipliers at or above this bound saturate the threshold to INT_MAX
+ // instead of being multiplied by t.
+ const int thresh_max = INT_MAX / t;
+
+ for (i = 0; i < num_modes_count; ++i) {
+ const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i;
+ rd->threshes[segment_id][bsize][mode_index] =
+ rd->thresh_mult[mode_index] < thresh_max
+ ? rd->thresh_mult[mode_index] * t / 4
+ : INT_MAX;
+ }
+ }
+ }
+}
+
+// Builds the coefficient-coding cost tables in |coeff_costs| from the CDFs
+// in |fc|. At most PLANE_TYPES planes are processed, whatever the value of
+// |num_planes|.
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes) {
+ const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
+ // End-of-block flag costs: one table per eob_multi_size value (0..6), each
+ // backed by its own CDF array (eob_flag_cdf16 ... eob_flag_cdf1024).
+ for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane];
+
+ for (int ctx = 0; ctx < 2; ++ctx) {
+ aom_cdf_prob *pcdf;
+ switch (eob_multi_size) {
+ case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+ case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+ case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+ case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+ case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+ case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+ case 6:
+ default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+ }
+ av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+ }
+ }
+ }
+ // Per transform size and plane: skip, base-level, eob-extra, DC-sign and
+ // level-range costs.
+ for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane];
+
+ // Note: txb_skip_cdf is indexed by tx_size and ctx only, not by plane.
+ for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+ fc->txb_skip_cdf[tx_size][ctx], NULL);
+
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+ fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+ NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+ fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+
+ // Entries 4..7 are derived from entries 0..3: [4] is zero, [5] is
+ // cost[1] plus a one-bit literal minus cost[0], and [6], [7] are the
+ // differences between consecutive costs.
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ pcost->base_cost[ctx][4] = 0;
+ pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+ av1_cost_literal(1) -
+ pcost->base_cost[ctx][0];
+ pcost->base_cost[ctx][6] =
+ pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+ pcost->base_cost[ctx][7] =
+ pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+ }
+
+ for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+ fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
+
+ // Note: dc_sign_cdf is indexed by plane and ctx only, not by tx_size.
+ for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+ fc->dc_sign_cdf[plane][ctx], NULL);
+
+ // Cumulative level-range costs. Levels are handled in groups of
+ // BR_CDF_SIZE - 1; within a group the cost is the cost accumulated so
+ // far plus br_rate[j].
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ int br_rate[BR_CDF_SIZE];
+ int prev_cost = 0;
+ int i, j;
+ // The CDF index is capped at TX_32X32.
+ av1_cost_tokens_from_cdf(
+ br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx],
+ NULL);
+ // printf("br_rate: ");
+ // for(j = 0; j < BR_CDF_SIZE; j++)
+ // printf("%4d ", br_rate[j]);
+ // printf("\n");
+ for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {
+ for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+ pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
+ }
+ // Here j == BR_CDF_SIZE - 1, so the last br_rate entry is what gets
+ // carried into the next group.
+ prev_cost += br_rate[j];
+ }
+ // i is now the first loop value >= COEFF_BASE_RANGE.
+ pcost->lps_cost[ctx][i] = prev_cost;
+ // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx);
+ // for (i = 0; i <= COEFF_BASE_RANGE; i++)
+ // printf("%5d ", pcost->lps_cost[ctx][i]);
+ // printf("\n");
+ }
+ // Second half of lps_cost (offset COEFF_BASE_RANGE + 1): entry 0 is
+ // copied and the rest hold differences between consecutive cumulative
+ // costs.
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][0];
+ for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+ pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+ }
+ }
+ }
+ }
+}
+
+// Points the motion-vector cost tables in |mv_costs| at the middle of their
+// backing arrays (offset MV_MAX), selects the active cost stack and rebuilds
+// the cost tables from |nmvc|.
+//
+// With |integer_mv| set, the regular tables are used and built with
+// MV_SUBPEL_NONE. Otherwise |usehp| selects between the high-precision and
+// regular tables and is also passed on as the precision argument.
+// Does nothing when |mv_costs| is NULL.
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+                       MvCosts *mv_costs) {
+  // Avoid accessing 'mv_costs' when it is not allocated.
+  if (mv_costs == NULL) return;
+
+  for (int comp = 0; comp < 2; ++comp) {
+    mv_costs->nmv_cost[comp] = &mv_costs->nmv_cost_alloc[comp][MV_MAX];
+    mv_costs->nmv_cost_hp[comp] = &mv_costs->nmv_cost_hp_alloc[comp][MV_MAX];
+  }
+
+  if (!integer_mv) {
+    mv_costs->mv_cost_stack =
+        usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+    av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                             nmvc, usehp);
+    return;
+  }
+
+  mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+  av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+                           nmvc, MV_SUBPEL_NONE);
+}
+
+// Points both dv cost tables in |dv_costs| at the middle of their backing
+// arrays (offset MV_MAX) and rebuilds them from |ndvc| with MV_SUBPEL_NONE.
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) {
+  for (int comp = 0; comp < 2; ++comp) {
+    dv_costs->dv_costs[comp] = &dv_costs->dv_costs_alloc[comp][MV_MAX];
+  }
+  av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc,
+                           MV_SUBPEL_NONE);
+}
+
+// Merges the cost update frequencies requested through codec control
+// (COST_UPDATE_TYPE values in |cost_upd_freq|) into the speed features in
+// |sf| (INTERNAL_COST_UPDATE_TYPE values) by considering the least frequent
+// cost update: each level becomes the minimum of its current value and the
+// mapped codec-control value. The updated speed features are used for cost
+// updates in the encoder.
+// WARNING: Population of unified cost update frequency needs to be taken care
+// accordingly, in case of any modifications/additions to the enum
+// COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE.
+static INLINE void populate_unified_cost_update_freq(
+    const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) {
+  // Maps a COST_UPDATE_TYPE (array index) to the corresponding
+  // INTERNAL_COST_UPDATE_TYPE.
+  static const INTERNAL_COST_UPDATE_TYPE
+      map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = {
+        INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE,
+        INTERNAL_COST_UPD_OFF
+      };
+
+  const INTERNAL_COST_UPDATE_TYPE mv_level =
+      map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv];
+  const INTERNAL_COST_UPDATE_TYPE coeff_level =
+      map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff];
+  const INTERNAL_COST_UPDATE_TYPE mode_level =
+      map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode];
+  const INTERNAL_COST_UPDATE_TYPE dv_level =
+      map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv];
+
+  INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+  inter_sf->mv_cost_upd_level = AOMMIN(inter_sf->mv_cost_upd_level, mv_level);
+  inter_sf->coeff_cost_upd_level =
+      AOMMIN(inter_sf->coeff_cost_upd_level, coeff_level);
+  inter_sf->mode_cost_upd_level =
+      AOMMIN(inter_sf->mode_cost_upd_level, mode_level);
+  sf->intra_sf.dv_cost_upd_level =
+      AOMMIN(sf->intra_sf.dv_cost_upd_level, dv_level);
+}
+
+// Checks if entropy costs should be initialized/updated at frame level or not.
+// Returns 1 when any of the following holds, 0 otherwise:
+//   - |cost_upd_level| is INTERNAL_COST_UPD_TILE;
+//   - RD mode is in use and |cost_upd_level| is not INTERNAL_COST_UPD_OFF;
+//   - the frame is intra-only;
+//   - in non-RD mode, fewer than 2 frames have passed since the key frame;
+//   - in RD mode, the low three bits of the frame number equal 1.
+static INLINE int is_frame_level_cost_upd_freq_set(
+    const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level,
+    const int use_nonrd_pick_mode, const int frames_since_key) {
+  if (cost_upd_level == INTERNAL_COST_UPD_TILE) return 1;
+  if (!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) {
+    return 1;
+  }
+  if (frame_is_intra_only(cm)) return 1;
+
+  if (use_nonrd_pick_mode) return frames_since_key < 2;
+  return (cm->current_frame.frame_number & 0x07) == 1;
+}
+
+// Decide whether we want to update the mode entropy cost for the current frame.
+// The logic is currently inherited from selective_disable_cdf_rtc.
+//
+// Always returns false unless the frame_level_mode_cost_update speed feature
+// is on. With cdf_update_mode == 2 the update is forced when
+// frames_since_last_update is 1. With cdf_update_mode == 1 the decision
+// depends on the layer configuration (see the branches below). Any other
+// mode returns false.
+static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) {
+  if (!cpi->sf.rt_sf.frame_level_mode_cost_update) return false;
+
+  const int cdf_update_mode = cpi->oxcf.algo_cfg.cdf_update_mode;
+  if (cdf_update_mode == 2) return cpi->frames_since_last_update == 1;
+  if (cdf_update_mode != 1) return false;
+
+  const int num_spatial_layers = cpi->svc.number_spatial_layers;
+  const int num_temporal_layers = cpi->svc.number_temporal_layers;
+
+  if (num_spatial_layers == 1 && num_temporal_layers == 1) {
+    // Single-layer stream.
+    const AV1_COMMON *const cm = &cpi->common;
+    const RATE_CONTROL *const rc = &cpi->rc;
+
+    return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+           rc->high_source_sad || rc->frames_since_key < 10 ||
+           cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 ||
+           cm->current_frame.frame_number % 8 == 0;
+  }
+
+  if (num_temporal_layers > 1) {
+    // Update on every temporal layer except the last one.
+    return cpi->svc.temporal_layer_id != num_temporal_layers - 1;
+  }
+
+  return false;
+}
+
+// Initializes the per-frame rate-distortion constants in |cpi|: the RD
+// multiplier and error-per-bit, the block RD thresholds, and (depending on
+// the configured update levels) the mv, coefficient, mode and dv cost tables.
+void av1_initialize_rd_consts(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ RD_OPT *const rd = &cpi->rd;
+ int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ int frames_since_key = cpi->rc.frames_since_key;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Both indices are capped to the last entry of the factor tables used by
+ // av1_compute_rd_mult().
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Unlike av1_get_adaptive_rdmult(), the luma DC delta is included here.
+ const int qindex_rdmult =
+ cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q;
+ rd->RDMULT = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+#if CONFIG_RD_COMMAND
+ // In pass 2, an RD command may override the computed multiplier.
+ if (cpi->oxcf.pass == 2) {
+ const RD_COMMAND *rd_command = &cpi->rd_command;
+ if (rd_command->option_ls[rd_command->frame_index] ==
+ RD_OPTION_SET_Q_RDMULT) {
+ rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index];
+ }
+ }
+#endif // CONFIG_RD_COMMAND
+
+ av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
+
+ set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode);
+
+ // Fold the codec-control update frequencies into the speed features before
+ // the levels are consulted below.
+ populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf);
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+ // Frame level mv cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, x->mv_costs);
+
+ // Frame level coefficient cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm));
+
+ // Frame level mode cost update
+ if (should_force_mode_cost_update(cpi) ||
+ is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mode_rates(cm, &x->mode_costs, cm->fc);
+
+ // Frame level dv cost update
+ if (av1_need_dv_costs(cpi)) {
+ // The dv cost storage is allocated on first use and then kept.
+ if (cpi->td.dv_costs_alloc == NULL) {
+ CHECK_MEM_ERROR(
+ cm, cpi->td.dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc)));
+ cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc;
+ }
+ av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
+ }
+}
+
+// Looks up the normalized rate (*r_q10) and normalized distortion (*d_q10)
+// for |xsq_q10| by linear interpolation between two adjacent entries of the
+// tables below.
+//
+// NOTE(review): entries xq and xq + 1 are both read, so |xsq_q10| must be
+// non-negative and small enough to keep xq + 1 inside the tables; the caller
+// in this file clamps it to 245727 - confirm for any other caller.
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ // NOTE: The tables below must be of the same size.
+
+ // The functions described below are sampled at the four most significant
+ // bits of x^2 + 8 / 256.
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ // The xsq_q10 value at which each table entry above was sampled; used to
+ // measure how far the input lies past entry xq.
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ // Table index: k selects a group of 8 entries from the position of the
+ // most significant bit of tmp, and the next three bits of tmp select the
+ // entry within that group.
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ // a_q10 is the weight of entry xq + 1 and b_q10 the weight of entry xq;
+ // the two sum to one_q10.
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+// Models the rate and distortion for a Laplacian source with given variance
+// when quantized with a uniform quantizer with given stepsize. The closed
+// form expressions are in:
+// Hang and Chen, "Source Model for transform video coder and its
+// application - Part I: Fundamental Theory", IEEE Trans. Circ.
+// Sys. for Video Tech., April 1997.
+//
+// A zero |var| yields zero rate and zero distortion. Otherwise the ratio
+// qstep^2 * 2^(n_log2 + 10) / var is rounded, capped at 245727 and handed to
+// model_rd_norm(); the normalized results are then scaled back by 2^n_log2
+// (rate) and by |var| (distortion).
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist) {
+  if (var == 0) {
+    *rate = 0;
+    *dist = 0;
+    return;
+  }
+
+  static const uint32_t MAX_XSQ_Q10 = 245727;
+  const uint64_t qstep_sq = (uint64_t)qstep * qstep;
+  const uint64_t xsq_q10_64 =
+      ((qstep_sq << (n_log2 + 10)) + (var >> 1)) / var;
+  const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+
+  int r_q10, d_q10;
+  model_rd_norm(xsq_q10, &r_q10, &d_q10);
+
+  *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+  *dist = (var * (int64_t)d_q10 + 512) >> 10;
+}
+
+// Cubic interpolation through four consecutive samples p[0..3]. |x| is the
+// position between p[1] (x = 0) and p[2] (x = 1); the result equals p[1] at
+// x = 0 and p[2] at x = 1.
+static double interp_cubic(const double *p, double x) {
+  // Evaluated innermost term first (Horner form).
+  const double c3 = 3.0 * (p[1] - p[2]) + p[3] - p[0];
+  const double c2 = 2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * c3;
+  const double c1 = p[2] - p[0] + x * c2;
+  return p[1] + 0.5 * x * c1;
+}
+
+/*
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
+*/
+
+// Maps a BLOCK_SIZE to the row of interp_rgrid_curv that
+// av1_model_rd_curvfit() uses for its rate estimate.
+static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3
+};
+
+// Returns the distortion-curve category for |sse_norm|: 1 when it is
+// strictly greater than 16.0, 0 otherwise.
+static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
+  return sse_norm > 16.0 ? 1 : 0;
+}
+
+// Models distortion by sse using a logistic function on
+// l = log2(sse / q^2) as:
+// dbysse = 16 / (1 + k exp(l + c))
+static double get_dbysse_logistic(double l, double c, double k) {
+  const double numerator = 16.0;
+  const double denominator = 1 + k * exp(l + c);
+  return numerator / denominator;
+}
+
+// Models rate using a clamped linear function on
+// l = log2(sse / q^2) as:
+// rate = max(0, a + b * l)
+static double get_rate_clamplinear(double l, double a, double b) {
+  const double linear = a + b * l;
+  if (linear < 0) return 0;
+  return linear;
+}
+
+// Maps a BLOCK_SIZE to the row of surffit_rate_params used by
+// rate_surffit_model_params_lookup().
+static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4
+};
+
+static const double surffit_rate_params[9][4] = {
+ {
+ 638.390212,
+ 2.253108,
+ 166.585650,
+ -3.939401,
+ },
+ {
+ 5.256905,
+ 81.997240,
+ -1.321771,
+ 17.694216,
+ },
+ {
+ -74.193045,
+ 72.431868,
+ -19.033152,
+ 15.407276,
+ },
+ {
+ 416.770113,
+ 14.794188,
+ 167.686830,
+ -6.997756,
+ },
+ {
+ 378.511276,
+ 9.558376,
+ 154.658843,
+ -6.635663,
+ },
+ {
+ 277.818787,
+ 4.413180,
+ 150.317637,
+ -9.893038,
+ },
+ {
+ 142.212132,
+ 11.542038,
+ 94.393964,
+ -5.518517,
+ },
+ {
+ 219.100256,
+ 4.007421,
+ 108.932852,
+ -6.981310,
+ },
+ {
+ 222.261971,
+ 3.251049,
+ 95.972916,
+ -5.609789,
+ },
+};
+
+// Coefficients consumed by dist_surffit_model_params_lookup(): entries 0-3
+// parameterize the logistic expression that produces dpar[0], and entries
+// 4-6 the exponential expression that produces dpar[1].
+static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233,
+ -0.500994, 0.554585, 4.839478,
+ -0.695837 };
+
+// Computes the two rate-model parameters for |bsize| at |xm|. Each is a
+// linear function of |xm| whose coefficients come from the
+// surffit_rate_params row selected by the block-size category:
+//   rpar[0] = row[0] + row[1] * xm
+//   rpar[1] = row[2] + row[3] * xm
+static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+                                             double *rpar) {
+  const double *const row =
+      surffit_rate_params[bsize_surffit_model_cat_lookup[bsize]];
+  rpar[0] = row[0] + row[1] * xm;
+  rpar[1] = row[2] + row[3] * xm;
+}
+
+// Computes the two distortion-model parameters at |xm| from
+// surffit_dist_params. |bsize| is accepted but not used.
+//   dpar[0] = c[0] + c[1] / (1 + exp((xm + c[2]) * c[3]))
+//   dpar[1] = c[4] + c[5] * exp(c[6] * xm)
+static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+                                             double *dpar) {
+  (void)bsize;
+  const double *const c = surffit_dist_params;
+  const double logistic_term = exp((xm + c[2]) * c[3]);
+  dpar[0] = c[0] + c[1] / (1 + logistic_term);
+  const double decay_term = exp(c[6] * xm);
+  dpar[1] = c[4] + c[5] * decay_term;
+}
+
+// Surface-fit RD model. Writes the modelled rate to |*rate_f| (a clamped
+// linear function of |yl|) and the modelled distortion-by-sse to
+// |*distbysse_f| (a logistic function of |yl|). The parameters of both
+// functions depend on |bsize| and |xm|. |sse_norm| is accepted but not used.
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+                          double yl, double *rate_f, double *distbysse_f) {
+  (void)sse_norm;
+
+  double rate_params[2];
+  double dist_params[2];
+  rate_surffit_model_params_lookup(bsize, xm, rate_params);
+  dist_surffit_model_params_lookup(bsize, xm, dist_params);
+
+  const double rate = get_rate_clamplinear(yl, rate_params[0], rate_params[1]);
+  const double dbysse =
+      get_dbysse_logistic(yl, dist_params[0], dist_params[1]);
+  *rate_f = rate;
+  *distbysse_f = dbysse;
+}
+
+static const double interp_rgrid_curv[4][65] = {
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 118.257702, 120.210658, 121.434853, 122.100487,
+ 122.377758, 122.436865, 72.290102, 96.974289, 101.652727,
+ 126.830141, 140.417377, 157.644879, 184.315291, 215.823873,
+ 262.300169, 335.919859, 420.624173, 519.185032, 619.854243,
+ 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609,
+ 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052,
+ 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680,
+ 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011,
+ 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827,
+ 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773,
+ 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 13.087244, 15.919735, 25.930313, 24.412411,
+ 28.567417, 29.924194, 30.857010, 32.742979, 36.382570,
+ 39.210386, 42.265690, 47.378572, 57.014850, 82.740067,
+ 137.346562, 219.968084, 316.781856, 415.643773, 516.706538,
+ 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528,
+ 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641,
+ 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309,
+ 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824,
+ 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694,
+ 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660,
+ 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 4.656893, 5.123633, 5.594132, 6.162376,
+ 6.918433, 7.768444, 8.739415, 10.105862, 11.477328,
+ 13.236604, 15.421030, 19.093623, 25.801871, 46.724612,
+ 98.841054, 181.113466, 272.586364, 359.499769, 445.546343,
+ 525.944439, 605.188743, 681.793483, 756.668359, 838.486885,
+ 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992,
+ 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771,
+ 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872,
+ 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216,
+ 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436,
+ 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.337370, 0.391916, 0.468839, 0.566334,
+ 0.762564, 1.069225, 1.384361, 1.787581, 2.293948,
+ 3.251909, 4.412991, 8.050068, 11.606073, 27.668092,
+ 65.227758, 128.463938, 202.097653, 262.715851, 312.464873,
+ 355.601398, 400.609054, 447.201352, 495.761568, 552.871938,
+ 619.067625, 691.984883, 773.753288, 860.628503, 946.262808,
+ 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987,
+ 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823,
+ 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119,
+ 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754,
+ 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000,
+ },
+};
+
+// Sample grid used by av1_model_rd_curvfit() to produce *distbysse_f. The row
+// is selected by sse_norm_curvfit_model_cat_lookup() and a pointer into that
+// row is handed to interp_cubic().
+// NOTE(review): the array is declared with 3 rows but only 2 initializers are
+// given, so the last row is implicitly zero-filled. Confirm that the category
+// lookup can never select row 2.
+static const double interp_dgrid_curv[3][65] = {
+ {
+ 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+ 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+ 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+ 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+ 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064,
+ 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123,
+ 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, 0.000000,
+ },
+ {
+ 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+ 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+ 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+ 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519,
+ 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412,
+ 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825,
+ 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, -0.000000,
+ },
+};
+
+// Looks up modelled rate and distortion-by-SSE for the abscissa xqr.
+//
+// xqr is clamped to lie strictly inside the sample grid, converted to a grid
+// cell index plus a fractional offset, and both curve tables are then
+// evaluated through interp_cubic() starting one sample before that cell.
+//   bsize        selects the rate-curve row.
+//   sse_norm     selects the distortion-curve row.
+//   rate_f       receives the interpolated rate value.
+//   distbysse_f  receives the interpolated distortion/SSE value.
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+                          double *rate_f, double *distbysse_f) {
+  const double grid_first = -15.5;
+  const double grid_last = 16.5;
+  const double grid_pitch = 0.5;
+  const double guard = 1e-6;
+  const int rate_row = bsize_curvfit_model_cat_lookup[bsize];
+  const int dist_row = sse_norm_curvfit_model_cat_lookup(sse_norm);
+  (void)grid_last;
+
+  // Clamp one grid step (plus a small guard) away from either end of the grid.
+  const double lowest = grid_first + grid_pitch + guard;
+  const double highest = grid_last - grid_pitch - guard;
+  xqr = AOMMIN(AOMMAX(xqr, lowest), highest);
+
+  // Split the grid position into an integer cell and the offset within it.
+  const double pos = (xqr - grid_first) / grid_pitch;
+  const int cell = (int)floor(pos);
+  const double frac = pos - cell;
+
+  assert(cell > 0);
+
+  const int first_sample = cell - 1;
+  *rate_f = interp_cubic(&interp_rgrid_curv[rate_row][first_sample], frac);
+  *distbysse_f = interp_cubic(&interp_dgrid_curv[dist_row][first_sample], frac);
+}
+
+// Copies the plane's above/left entropy contexts into the caller's buffers.
+// The number of entries copied comes from mi_size_wide[] / mi_size_high[] for
+// plane_bsize.
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+                                       const struct macroblockd_plane *pd,
+                                       ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+                                       ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+  const size_t above_bytes =
+      sizeof(ENTROPY_CONTEXT) * mi_size_wide[plane_bsize];
+  const size_t left_bytes = sizeof(ENTROPY_CONTEXT) * mi_size_high[plane_bsize];
+
+  memcpy(t_above, pd->above_entropy_context, above_bytes);
+  memcpy(t_left, pd->left_entropy_context, left_bytes);
+}
+
+// Public entry point for snapshotting a plane's entropy contexts: checks (in
+// debug builds) that plane_bsize is a valid block size and then forwards to
+// get_entropy_contexts_plane(), which fills t_above and t_left.
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
+}
+
+// Special clamping used in the encoder when calculating a prediction
+//
+// Logically, all pixel fetches used for prediction are clamped against the
+// edges of the frame. But doing this directly is slow, so instead we allocate
+// a finite border around the frame and fill it with copies of the outermost
+// pixels.
+//
+// Since this border is finite, we need to clamp the motion vector before
+// prediction in order to avoid out-of-bounds reads. At the same time, this
+// clamp must not change the prediction result.
+//
+// We can balance both of these concerns by calculating how far we would have
+// to go in each direction before the extended prediction region (the current
+// block + AOM_INTERP_EXTEND many pixels around the block) would be mapped
+// so that it touches the frame only at one row or column. This is a special
+// point because any more extreme MV will always lead to the same prediction.
+// So it is safe to clamp at that point.
+//
+// In the worst case, this requires a border of
+// max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels
+// around the frame edges.
+static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                                MV *mv) {
+  // Block dimensions, scaled from mode-info units by MI_SIZE_LOG2.
+  const int blk_w = xd->width << MI_SIZE_LOG2;
+  const int blk_h = xd->height << MI_SIZE_LOG2;
+
+  // Distance from the block origin to each frame edge, in the same units.
+  const int dist_left = xd->mi_col << MI_SIZE_LOG2;
+  const int dist_top = xd->mi_row << MI_SIZE_LOG2;
+  const int dist_right = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2;
+  const int dist_bottom = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2;
+
+  // The block plus its interpolation margin may move fully past the top/left
+  // edge, and up to the margin past the bottom/right edge.
+  const SubpelMvLimits limits = {
+    .col_min = -GET_MV_SUBPEL(dist_left + blk_w + AOM_INTERP_EXTEND),
+    .col_max = GET_MV_SUBPEL(dist_right + AOM_INTERP_EXTEND),
+    .row_min = -GET_MV_SUBPEL(dist_top + blk_h + AOM_INTERP_EXTEND),
+    .row_max = GET_MV_SUBPEL(dist_bottom + AOM_INTERP_EXTEND)
+  };
+  clamp_mv(mv, &limits);
+}
+
+// Computes the SAD of the source luma block against the reference at up to
+// two candidate MVs taken from the reference MV stack for ref_frame, and
+// records the results in x:
+//   x->pred_mv0_sad[ref_frame] / x->pred_mv1_sad[ref_frame]  per-candidate SAD
+//   x->pred_mv_sad[ref_frame]                                 smallest SAD seen
+//   x->max_mv_context[ref_frame]                              largest |mv| >> 3
+// ref_y_buffer / ref_y_stride describe the reference luma plane; block_size
+// selects the SAD function from cpi->ppi->fn_ptr.
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ const int_mv ref_mv =
+ av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext);
+ const int_mv ref_mv1 =
+ av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext);
+ MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
+ int num_mv_refs = 0;
+ pred_mv[num_mv_refs++] = ref_mv.as_mv;
+ // Only add the second candidate when it differs from the first.
+ if (ref_mv.as_int != ref_mv1.as_int) {
+ pred_mv[num_mv_refs++] = ref_mv1.as_mv;
+ }
+
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ const uint8_t *const src_y_ptr = x->plane[0].src.buf;
+ int zero_seen = 0;
+ int best_sad = INT_MAX;
+ int max_mv = 0;
+ // Get the sad for each candidate reference mv.
+ for (int i = 0; i < num_mv_refs; ++i) {
+ MV *this_mv = &pred_mv[i];
+ // Clamp in place so the fetch below stays inside the frame border.
+ enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv);
+
+ // Round the MV components to the nearest multiple of 8 and drop the low
+ // three bits (presumably 1/8-pel -> full-pel; confirm MV units).
+ const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ // A second candidate that also rounds to (0, 0) would repeat the same SAD;
+ // skip it. Note that its pred_mvN_sad entry is then left unwritten.
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ const uint8_t *const ref_y_ptr =
+ &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ const int this_sad = cpi->ppi->fn_ptr[block_size].sdf(
+ src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ }
+ if (i == 0)
+ x->pred_mv0_sad[ref_frame] = this_sad;
+ else if (i == 1)
+ x->pred_mv1_sad[ref_frame] = this_sad;
+ }
+
+ // Record the largest candidate MV magnitude and the best SAD for this
+ // reference frame.
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+// Points dst[] at the planes of src and runs setup_pred_plane() on the first
+// num_planes of them for the block at (xd->mi_row, xd->mi_col). Plane 0 uses
+// the luma crop size and `scale`; the other planes use the chroma crop size
+// and `scale_uv`.
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv,
+                          const int num_planes) {
+  // Seed all three plane descriptors, regardless of num_planes.
+  dst[0].buf = src->y_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+
+  const int row = xd->mi_row;
+  const int col = xd->mi_col;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_chroma = (plane != 0);
+    struct buf_2d *const out = dst + plane;
+    setup_pred_plane(out, xd->mi[0]->bsize, out->buf,
+                     is_chroma ? src->uv_crop_width : src->y_crop_width,
+                     is_chroma ? src->uv_crop_height : src->y_crop_height,
+                     out->stride, row, col, is_chroma ? scale_uv : scale,
+                     xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+  }
+}
+
+// Returns the scaled copy of ref_frame's buffer, or NULL when there is no
+// scaled buffer or when it is the same buffer as the unscaled reference.
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+                                             int ref_frame) {
+  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+  RefCntBuffer *const scaled = cpi->scaled_ref_buf[ref_frame - 1];
+  const RefCntBuffer *const unscaled =
+      get_ref_frame_buf(&cpi->common, ref_frame);
+
+  if (scaled == NULL || scaled == unscaled) return NULL;
+  return &scaled->buf;
+}
+
+// Returns the cost of signalling the block's interpolation filter(s).
+// The cost is 0 unless interp_filter is SWITCHABLE. Otherwise it is the sum of
+// the per-direction costs (direction 1 only when dual_filter is set), scaled
+// by SWITCHABLE_INTERP_RATE_FACTOR.
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+                            InterpFilter interp_filter, int dual_filter) {
+  if (interp_filter != SWITCHABLE) return 0;
+
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_dirs = dual_filter ? 2 : 1;
+  int total_cost = 0;
+  for (int dir = 0; dir < num_dirs; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    const InterpFilter filter =
+        av1_extract_interp_filter(mbmi->interp_filters, dir);
+    total_cost += x->mode_costs.switchable_interp_costs[ctx][filter];
+  }
+  return SWITCHABLE_INTERP_RATE_FACTOR * total_cost;
+}
+
+// Fills cpi->rd.thresh_mult[] with the baseline per-mode threshold
+// multipliers. The table is first zeroed, so any THR_* entry not assigned
+// below stays 0. The numeric values are fixed constants; nothing here depends
+// on other encoder state.
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ RD_OPT *const rd = &cpi->rd;
+
+ // Set baseline threshold values.
+ av1_zero(rd->thresh_mult);
+
+ // Single-reference NEAREST modes.
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+ rd->thresh_mult[THR_NEARESTA2] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+
+ // Single-reference NEW modes.
+ rd->thresh_mult[THR_NEWMV] = 1000;
+ rd->thresh_mult[THR_NEWL2] = 1000;
+ rd->thresh_mult[THR_NEWL3] = 1000;
+ rd->thresh_mult[THR_NEWB] = 1000;
+ rd->thresh_mult[THR_NEWA2] = 1100;
+ rd->thresh_mult[THR_NEWA] = 1000;
+ rd->thresh_mult[THR_NEWG] = 1000;
+
+ // Single-reference NEAR modes.
+ rd->thresh_mult[THR_NEARMV] = 1000;
+ rd->thresh_mult[THR_NEARL2] = 1000;
+ rd->thresh_mult[THR_NEARL3] = 1000;
+ rd->thresh_mult[THR_NEARB] = 1000;
+ rd->thresh_mult[THR_NEARA2] = 1000;
+ rd->thresh_mult[THR_NEARA] = 1000;
+ rd->thresh_mult[THR_NEARG] = 1000;
+
+ // Single-reference GLOBAL modes.
+ rd->thresh_mult[THR_GLOBALMV] = 2200;
+ rd->thresh_mult[THR_GLOBALL2] = 2000;
+ rd->thresh_mult[THR_GLOBALL3] = 2000;
+ rd->thresh_mult[THR_GLOBALB] = 2400;
+ rd->thresh_mult[THR_GLOBALA2] = 2000;
+ rd->thresh_mult[THR_GLOBALG] = 2000;
+ rd->thresh_mult[THR_GLOBALA] = 2400;
+
+ // Compound NEAREST_NEAREST modes, one entry per reference pair.
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000;
+
+ // Remaining compound modes, grouped by reference pair (suffix LA, L2A, ...).
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200;
+
+ // Intra prediction modes.
+ rd->thresh_mult[THR_DC] = 1000;
+ rd->thresh_mult[THR_PAETH] = 1000;
+ rd->thresh_mult[THR_SMOOTH] = 2200;
+ rd->thresh_mult[THR_SMOOTH_V] = 2000;
+ rd->thresh_mult[THR_SMOOTH_H] = 2000;
+ rd->thresh_mult[THR_H_PRED] = 2000;
+ rd->thresh_mult[THR_V_PRED] = 1800;
+ rd->thresh_mult[THR_D135_PRED] = 2500;
+ rd->thresh_mult[THR_D203_PRED] = 2000;
+ rd->thresh_mult[THR_D157_PRED] = 2500;
+ rd->thresh_mult[THR_D67_PRED] = 2000;
+ rd->thresh_mult[THR_D113_PRED] = 2500;
+ rd->thresh_mult[THR_D45_PRED] = 2500;
+}
+
+// Adapts the threshold factors for modes in [mode_start, mode_end) and block
+// sizes in [min_size, max_size]: the winning mode's factor decays by
+// 1 / 2^RD_THRESH_LOG_DEC_FACTOR of itself, every other mode's factor grows by
+// RD_THRESH_INC up to max_rd_thresh_factor.
+static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES],
+                                   THR_MODES best_mode_index,
+                                   THR_MODES mode_start, THR_MODES mode_end,
+                                   BLOCK_SIZE min_size, BLOCK_SIZE max_size,
+                                   int max_rd_thresh_factor) {
+  for (THR_MODES mode = mode_start; mode < mode_end; ++mode) {
+    const int is_winner = (mode == best_mode_index);
+    for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
+      int updated = factor_buf[bs][mode];
+      if (is_winner) {
+        updated -= (updated >> RD_THRESH_LOG_DEC_FACTOR);
+      } else {
+        updated = AOMMIN(updated + RD_THRESH_INC, max_rd_thresh_factor);
+      }
+      factor_buf[bs][mode] = updated;
+    }
+  }
+}
+
+// Updates the adaptive threshold factors after a mode decision for a block of
+// size bsize. Both the inter range [inter_mode_start, inter_mode_end) and the
+// intra range [intra_mode_start, intra_mode_end) are updated, over block sizes
+// within two enum steps of bsize (bounded by BLOCK_4X4 and the superblock
+// size). Block sizes above the superblock size are updated alone.
+void av1_update_rd_thresh_fact(
+    const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+    int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+    THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+    THR_MODES intra_mode_start, THR_MODES intra_mode_end) {
+  assert(use_adaptive_rd_thresh > 0);
+
+  // Default: touch only bsize itself. This is the path for block sizes with
+  // 1:4 and 4:1 aspect ratios.
+  // TODO(any): Experiment with threshold update for parent/child blocks
+  BLOCK_SIZE first_bs = bsize;
+  BLOCK_SIZE last_bs = bsize;
+  if (!(bsize > cm->seq_params->sb_size)) {
+    first_bs = AOMMAX(bsize - 2, BLOCK_4X4);
+    last_bs = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size);
+  }
+
+  const int factor_cap = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+  update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+                  first_bs, last_bs, factor_cap);
+  update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+                  first_bs, last_bs, factor_cap);
+}
+
+// Returns the intra cost penalty derived from the DC quantizer for
+// (qindex, qdelta): 20*q at 8 bits, 5*q at 10 bits and (5*q) rounded-shifted
+// right by 2 at 12 bits. Any other bit depth asserts and returns -1.
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+                               aom_bit_depth_t bit_depth) {
+  const int dc_q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
+
+  if (bit_depth == AOM_BITS_8) return 20 * dc_q;
+  if (bit_depth == AOM_BITS_10) return 5 * dc_q;
+  if (bit_depth == AOM_BITS_12) return ROUND_POWER_OF_TWO(5 * dc_q, 2);
+
+  assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+  return -1;
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..b38d9ca542
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+ ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D) \
+ (((D) * (1 << RDDIV_BITS)) - \
+ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
+// Floating-point RD cost. The rate term is R * RM scaled down by
+// 2^AV1_PROB_COST_SHIFT; the distortion D is first shifted right by
+// 2 * (BD - 8) bits and then scaled by 2^RDDIV_BITS.
+// Every macro argument is parenthesized so that expression arguments
+// (e.g. a conditional passed as BD) expand with the intended precedence.
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD)               \
+  (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+   ((double)((D) >> (2 * ((BD) - 8))) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of rd_thresh factor is stored with 5 bits. The maximum
+// factor that we allow is two, which is stored as 2 ** (5+1) = 64
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// Macros for common video resolutions: width x height
+// For example, 720p represents video resolution of 1280x720 pixels.
+// Each value is the pixel count of the frame. The products are parenthesized
+// so the macros stay correct inside larger expressions such as
+// `pixels / RESOLUTION_720P`.
+#define RESOLUTION_288P (352 * 288)
+#define RESOLUTION_360P (640 * 360)
+#define RESOLUTION_480P (640 * 480)
+#define RESOLUTION_720P (1280 * 720)
+#define RESOLUTION_1080P (1920 * 1080)
+#define RESOLUTION_1440P (2560 * 1440)
+#define RESOLUTION_4K (3840 * 2160)
+
+// Table of RTC_REFS reference-frame pairs. Every pair has NONE_FRAME as its
+// second entry, i.e. each row names a single reference (LAST, ALTREF, GOLDEN
+// or INTRA).
+// NOTE(review): the name suggests this is the candidate list for the
+// real-time encoding path; confirm against the users of this table.
+#define RTC_REFS 4
+static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = {
+ { LAST_FRAME, NONE_FRAME },
+ { ALTREF_FRAME, NONE_FRAME },
+ { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }
+};
+
+// Maps a prediction mode to a compact index: modes at or above NEARESTMV map
+// through INTER_OFFSET(); DC/V/H/SMOOTH map to 0..3. Any other mode asserts
+// and yields -1.
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV) return INTER_OFFSET(mode);
+
+  if (mode == DC_PRED) return 0;
+  if (mode == V_PRED) return 1;
+  if (mode == H_PRED) return 2;
+  if (mode == SMOOTH_PRED) return 3;
+
+  assert(0);
+  return -1;
+}
+
+// Evaluation stage selector. The values are used as the first index into
+// per-stage parameter tables (see coeff_opt_threshold in
+// get_rd_opt_coeff_thresh()); MODE_EVAL_TYPES is the number of stages.
+enum {
+ // Default initialization when we are not using winner mode framework. e.g.
+ // intrabc
+ DEFAULT_EVAL = 0,
+ // Initialization for selecting winner mode
+ MODE_EVAL,
+ // Initialization for winner mode evaluation
+ WINNER_MODE_EVAL,
+ // All mode evaluation types
+ MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
+
+// Rate-distortion optimization parameters.
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+
+ // One threshold per (segment, block size, mode).
+ // NOTE(review): presumably derived from thresh_mult; confirm in rd.c.
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+ // NOTE(review): presumably the rate multiplier passed as RM to RDCOST();
+ // confirm against the callers.
+ int RDMULT;
+
+ // NOTE(review): meaning is not evident from this header; document at the
+ // point where it is computed.
+ double r0;
+} RD_OPT;
+
+// Resets rd_stats to the "nothing accumulated yet" state: all costs are zero
+// and skip_txfm is set, so that later &= accumulation works.
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->skip_txfm = 1;
+  rd_stats->zero_rate = 0;
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->sse = 0;
+  rd_stats->rdcost = 0;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    rd_stats->txb_coeff_cost[p] = 0;
+  }
+#endif
+}
+
+// Marks rd_stats as invalid: rate is INT_MAX and dist/rdcost/sse are
+// INT64_MAX, which the other helpers in this header test for. skip_txfm and
+// zero_rate are cleared.
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->skip_txfm = 0;
+  rd_stats->zero_rate = 0;
+  rd_stats->rate = INT_MAX;
+  rd_stats->dist = INT64_MAX;
+  rd_stats->sse = INT64_MAX;
+  rd_stats->rdcost = INT64_MAX;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    rd_stats->txb_coeff_cost[p] = INT_MAX;
+  }
+#endif
+}
+
+// Adds rd_stats_src into rd_stats_dst.
+// If either side has an invalid rate (INT_MAX), the destination is
+// invalidated instead. The summed rate saturates at INT_MAX; sse is only
+// summed when both sides are below INT64_MAX; zero_rate is taken from the
+// source only while the destination's is still 0.
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+                                      const RD_STATS *rd_stats_src) {
+  const int any_invalid =
+      rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX;
+  if (any_invalid) {
+    av1_invalid_rd_stats(rd_stats_dst);
+    return;
+  }
+
+  // Sum in 64 bits so the saturation check cannot itself overflow.
+  const int64_t rate_sum =
+      (int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate;
+  rd_stats_dst->rate = (int)AOMMIN(rate_sum, INT_MAX);
+
+  if (!rd_stats_dst->zero_rate) {
+    rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
+  }
+
+  rd_stats_dst->dist += rd_stats_src->dist;
+
+  const int both_sse_valid =
+      rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX;
+  if (both_sse_valid) rd_stats_dst->sse += rd_stats_src->sse;
+
+  rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    rd_stats_dst->txb_coeff_cost[p] += rd_stats_src->txb_coeff_cost[p];
+  }
+#endif
+}
+
+// Adds the given rate/dist/sse to rd_stats and ANDs in skip_txfm. zero_rate is
+// recorded only while the stored value is still 0. Neither the stored nor the
+// incoming rate may be INT_MAX (asserted).
+static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
+                                           int rate, int skip_txfm, int64_t sse,
+                                           int zero_rate) {
+  assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
+
+  if (!rd_stats->zero_rate) {
+    rd_stats->zero_rate = zero_rate;
+  }
+  rd_stats->rate += rate;
+  rd_stats->dist += dist;
+  rd_stats->sse += sse;
+  rd_stats->skip_txfm &= skip_txfm;
+}
+
+// Combines rate and distortion into one RD cost using multiplier mult (which
+// must be non-negative). A negative rate is handled by RDCOST_NEG_R, which
+// subtracts the rate term for |rate| instead of adding it.
+static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) {
+  assert(mult >= 0);
+  return (rate >= 0) ? RDCOST(mult, rate, dist)
+                     : RDCOST_NEG_R(mult, -rate, dist);
+}
+
+// Recomputes rd_cost->rdcost from its rate and dist. If any of rate, dist or
+// the current rdcost already holds its "invalid" maximum, the whole record is
+// invalidated instead.
+static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) {
+  const int is_invalid = rd_cost->rate >= INT_MAX ||
+                         rd_cost->dist >= INT64_MAX ||
+                         rd_cost->rdcost >= INT64_MAX;
+  if (is_invalid) {
+    av1_invalid_rd_stats(rd_cost);
+    return;
+  }
+  rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist);
+}
+
+// Stores left - right (rate and dist) in result and recomputes result->rdcost
+// with multiplier mult. If either operand is invalid in rate, dist or rdcost,
+// result is invalidated. Only rate, dist and rdcost of result are written on
+// the valid path.
+static INLINE void av1_rd_stats_subtraction(int mult,
+                                            const RD_STATS *const left,
+                                            const RD_STATS *const right,
+                                            RD_STATS *result) {
+  const int left_invalid = left->rate == INT_MAX || left->dist == INT64_MAX ||
+                           left->rdcost == INT64_MAX;
+  const int right_invalid = right->rate == INT_MAX ||
+                            right->dist == INT64_MAX ||
+                            right->rdcost == INT64_MAX;
+  if (left_invalid || right_invalid) {
+    av1_invalid_rd_stats(result);
+    return;
+  }
+
+  result->rate = left->rate - right->rate;
+  result->dist = left->dist - right->dist;
+  result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist);
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+/*!\brief Compute rdmult based on q index and frame update type
+ *
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type frame update type
+ * \param[in] qindex q index
+ *
+ * \return rdmult
+ */
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+ FRAME_UPDATE_TYPE update_type,
+ int qindex);
+
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+// Sets the multiplier to convert mv cost to l1 error during motion search.
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f);
+
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ InterpFilter interp_filter, int dual_filter);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
+
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+// Updates the adaptive RD threshold factors in factor_buf after a mode
+// decision for a block of size bsize. use_adaptive_rd_thresh must be > 0; it
+// scales the maximum factor (RD_THRESH_MAX_FACT). The factor of
+// best_mode_index is decreased, those of the other modes in the given inter
+// and intra ranges ([start, end)) are increased.
+// Parameter names match the definition in rd.c.
+void av1_update_rd_thresh_fact(
+    const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+    int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+    THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+    THR_MODES intra_mode_start, THR_MODES intra_mode_end);
+
+// Resets every entry of x->thresh_freq_fact (all block sizes, all modes) to
+// RD_THRESH_FAC_FRAC_VAL.
+static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
+  for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) {
+    int mode = 0;
+    while (mode < MAX_MODES) {
+      x->thresh_freq_fact[bs][mode] = RD_THRESH_FAC_FRAC_VAL;
+      ++mode;
+    }
+  }
+}
+
+// Returns nonzero when best_rd is below the scaled threshold, or when thresh
+// is INT_MAX. thresh_fact is a fixed-point factor with
+// RD_THRESH_FAC_FRAC_BITS fractional bits (RD_THRESH_FAC_FRAC_VAL == 1.0), so
+// the product is shifted back by that many bits. The named constant replaces
+// a literal 5 that had to be kept in sync by hand.
+static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh,
+                                      int thresh_fact) {
+  return best_rd < (thresh * thresh_fact >> RD_THRESH_FAC_FRAC_BITS) ||
+         thresh == INT_MAX;
+}
+
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+// Sets the multiplier to convert mv cost to l2 error during motion search.
+// The result is rdmult >> RD_EPB_SHIFT, but never less than 1.
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) {
+  const int scaled = rdmult >> RD_EPB_SHIFT;
+  *errorperbit = (scaled > 1) ? scaled : 1;
+}
+
+// Get the threshold for R-D optimization of coefficients depending upon mode
+// decision/winner mode processing.
+// Copies the two thresholds of one row of coeff_opt_threshold into
+// txfm_params->coeff_opt_thresholds. The row is DEFAULT_EVAL when the winner
+// mode framework is disabled for coefficient optimization, otherwise
+// WINNER_MODE_EVAL or MODE_EVAL depending on is_winner_mode.
+static INLINE void get_rd_opt_coeff_thresh(
+    const uint32_t (*const coeff_opt_threshold)[2],
+    TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt,
+    int is_winner_mode) {
+  // Default initialization of threshold
+  int eval_type = DEFAULT_EVAL;
+
+  // TODO(any): Experiment with coeff_opt_dist_threshold values when
+  // enable_winner_mode_for_coeff_opt is ON
+  // TODO(any): Skip the winner mode processing for blocks with lower residual
+  // energy as R-D optimization of coefficients would have been enabled during
+  // mode decision
+  if (enable_winner_mode_for_coeff_opt) {
+    // Use conservative threshold during mode decision and perform R-D
+    // optimization of coeffs always for winner modes
+    eval_type = is_winner_mode ? WINNER_MODE_EVAL : MODE_EVAL;
+  }
+
+  txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[eval_type][0];
+  txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[eval_type][1];
+}
+
+// Used to reset the state of mb rd hash information.
+// A null mb_rd_record is accepted and ignored.
+static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) {
+  if (mb_rd_record) {
+    // Reset the state for use_mb_rd_hash
+    mb_rd_record->index_start = 0;
+    mb_rd_record->num = 0;
+  }
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc);
+
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes);
+
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+ MvCosts *mv_costs);
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs);
+
+int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
+
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta);
+
+/*!\brief Adjust current superblock's q_index based on delta q resolution
+ *
+ * \param[in] delta_q_res delta q resolution
+ * \param[in] prev_qindex previous superblock's q index
+ * \param[in] curr_qindex current superblock's q index
+ *
+ * \return the current superblock's adjusted q_index
+ */
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+ int curr_qindex);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 0000000000..c17fbccf8c
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,6598 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/var_based_part.h"
+
+#define LAST_NEW_MV_INDEX 6
+
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and equation used to derive is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5)
+// The table has QINDEX_RANGE entries. The first entry (10240) is 2.5 in Q12
+// and the last entry (4096) is 1.0 in Q12, so the factor shrinks as the
+// index grows.
+// NOTE(review): presumably indexed by qindex, as the formula suggests;
+// confirm at the use site.
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+ 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+ 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734,
+ 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469,
+ 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204,
+ 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939,
+ 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674,
+ 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409,
+ 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144,
+ 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879,
+ 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614,
+ 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349,
+ 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084,
+ 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819,
+ 6795, 6770, 6746, 6722, 6698, 6674, 6650, 6626, 6602, 6578, 6554,
+ 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289,
+ 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024,
+ 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758,
+ 5734, 5710, 5686, 5662, 5638, 5614, 5590, 5566, 5542, 5518, 5493,
+ 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228,
+ 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963,
+ 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698,
+ 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433,
+ 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168,
+ 4144, 4120, 4096
+};
+
+// Default ordering of the THR_* mode indices. The table lists, in order: the
+// single-reference NEAREST, NEW, NEAR and GLOBAL entries; the compound
+// NEAREST_NEAREST entries; a group of seven compound entries (NEAR_NEAR,
+// NEW_NEW, NEW_NEAREST, NEAREST_NEW, NEW_NEAR, NEAR_NEW, GLOBAL_GLOBAL) for
+// each reference pair; and finally the intra entries.
+static const THR_MODES av1_default_mode_order[MAX_MODES] = {
+ // Single-reference NEAREST entries.
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ // Single-reference NEW entries.
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ // Single-reference NEAR entries.
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ // Single-reference GLOBAL entries.
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ // Compound NEAREST_NEAREST entries for every listed reference pair.
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ // Remaining compound entries, seven per reference pair.
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ // Intra entries.
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+};
+
+/*!\cond */
+// One recorded single-reference search result: an RD value, the reference
+// frame it belongs to, and a validity flag.
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
+// Working state for the inter mode search: the best result found so far plus
+// per-reference and per-mode caches.
+// NOTE(review): the field roles described here are inferred from the names
+// and types only; confirm against the code that fills them.
+typedef struct InterModeSearchState {
+ // Best result so far.
+ int64_t best_rd;
+ int64_t best_skip_rd[2];
+ MB_MODE_INFO best_mbmode;
+ int best_rate_y;
+ int best_rate_uv;
+ int best_mode_skippable;
+ int best_skip2;
+ THR_MODES best_mode_index;
+ // Per-reference-frame and per-mode bookkeeping.
+ int num_available_refs;
+ int64_t dist_refs[REF_FRAMES];
+ int dist_order_refs[REF_FRAMES];
+ int64_t mode_threshold[MAX_MODES];
+ int64_t best_intra_rd;
+ unsigned int best_pred_sse;
+
+ /*!
+ * \brief Keep track of best intra rd for use in compound mode.
+ */
+ int64_t best_pred_rd[REFERENCE_MODES];
+ // Save a set of single_newmv for each checked ref_mv.
+ int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES];
+ // Indexed by [prediction mode][ref_mv index][reference frame].
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ int64_t best_single_rd[REF_FRAMES];
+ PREDICTION_MODE best_single_mode[REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ IntraModeSearchState intra_search_state;
+ RD_STATS best_y_rdcost;
+} InterModeSearchState;
+/*!\endcond */
+
+// Puts every per-block-size inter mode RD model of the tile into its initial
+// state: marked not ready, with a zero sample count and zeroed running sums.
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
+  InterModeRdModel *const models = tile_data->inter_mode_rd_models;
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    InterModeRdModel *const model = &models[bsize];
+    model->ready = 0;
+    model->num = 0;
+    model->dist_sum = 0;
+    model->ld_sum = 0;
+    model->sse_sum = 0;
+    model->sse_sse_sum = 0;
+    model->sse_ld_sum = 0;
+  }
+}
+
+// Estimates the residue rate and the distortion for a block from its SSE using
+// the fitted model of the given block size.
+//
+// Returns 0 (outputs untouched) when the model is not ready yet, 1 otherwise.
+// When sse is below the model's mean distortion, or the estimated rate is not
+// positive, the estimate is "no residue": rate 0 and distortion equal to sse.
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                             int64_t sse, int *est_residue_cost,
+                             int64_t *est_dist) {
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  if (!md->ready) return 0;
+
+  if (sse < md->dist_mean) {
+    *est_residue_cost = 0;
+    *est_dist = sse;
+    return 1;
+  }
+
+  *est_dist = (int64_t)round(md->dist_mean);
+  const double est_ld = md->a * sse + md->b;
+  // Clamp estimated rate cost by INT_MAX / 2.
+  // TODO(angiebird@google.com): find better solution than clamping.
+  if (fabs(est_ld) < 1e-2) {
+    *est_residue_cost = INT_MAX / 2;
+  } else {
+    const double cost_dbl = ((sse - md->dist_mean) / est_ld);
+    if (cost_dbl < 0) {
+      *est_residue_cost = 0;
+    } else {
+      *est_residue_cost = (int)AOMMIN((int64_t)round(cost_dbl), INT_MAX / 2);
+    }
+  }
+
+  // A non-positive rate estimate falls back to the "no residue" estimate.
+  if (*est_residue_cost <= 0) {
+    *est_residue_cost = 0;
+    *est_dist = sse;
+  }
+  return 1;
+}
+
+// Refits the per-block-size linear model (md->a, md->b) from the statistics
+// accumulated since the previous fit, then clears the accumulators.
+//
+// A block size is skipped when inter_mode_data_block_idx() returns -1 for it,
+// or when too few samples were collected: fewer than 200 before the first fit,
+// fewer than 64 afterwards. The rdmult argument is unused.
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (block_idx == -1) continue;
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
+ continue;
+ } else {
+ if (md->ready == 0) {
+ // First fit: the means are the plain sample averages.
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ // Later fits: blend the previous mean (weight 3) with the new sample
+ // average (weight 1).
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ // Line fit of ld against sse: the slope is (mean(sse * ld) - mx * my)
+ // divided by (mean(sse^2) - mx^2), and the intercept follows from the
+ // means.
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ // NOTE(review): the denominator is zero when sse_sse_mean equals
+ // mx * mx, which makes a and b non-finite; confirm that readers of the
+ // model tolerate this.
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ // Start a fresh accumulation window.
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+ (void)rdmult;
+ }
+}
+
+// Adds one (sse, dist, residue_cost) sample to the running sums of the model
+// for bsize. The sample is dropped when residue_cost is zero, when sse equals
+// dist, when inter_mode_data_block_idx() returns -1 for bsize, or when the
+// model already holds INTER_MODE_RD_DATA_OVERALL_SIZE samples.
+static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data,
+                                            BLOCK_SIZE bsize, int64_t sse,
+                                            int64_t dist, int residue_cost) {
+  if (residue_cost == 0 || sse == dist) return;
+  if (inter_mode_data_block_idx(bsize) == -1) return;
+
+  InterModeRdModel *const model = &tile_data->inter_mode_rd_models[bsize];
+  if (!(model->num < INTER_MODE_RD_DATA_OVERALL_SIZE)) return;
+
+  // Distortion reduction obtained per unit of residue rate.
+  const double ld = (sse - dist) * 1. / residue_cost;
+  ++model->num;
+  model->dist_sum += dist;
+  model->ld_sum += ld;
+  model->sse_sum += sse;
+  model->sse_sse_sum += (double)sse * (double)sse;
+  model->sse_ld_sum += sse * ld;
+}
+
+// Appends one candidate (mode info, mode rate, sse, estimated rd and the three
+// RD_STATS records) to the parallel arrays of inter_modes_info and bumps its
+// entry count. The caller must leave room: the count has to be below
+// MAX_INTER_MODES (asserted).
+static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info,
+                                             int mode_rate, int64_t sse,
+                                             int64_t rd, RD_STATS *rd_cost,
+                                             RD_STATS *rd_cost_y,
+                                             RD_STATS *rd_cost_uv,
+                                             const MB_MODE_INFO *mbmi) {
+  const int slot = inter_modes_info->num;
+  assert(slot < MAX_INTER_MODES);
+
+  inter_modes_info->mbmi_arr[slot] = *mbmi;
+  inter_modes_info->mode_rate_arr[slot] = mode_rate;
+  inter_modes_info->sse_arr[slot] = sse;
+  inter_modes_info->est_rd_arr[slot] = rd;
+
+  inter_modes_info->rd_cost_arr[slot] = *rd_cost;
+  inter_modes_info->rd_cost_y_arr[slot] = *rd_cost_y;
+  inter_modes_info->rd_cost_uv_arr[slot] = *rd_cost_uv;
+
+  ++inter_modes_info->num;
+}
+
+// qsort() comparator for RdIdxPair: ascending by rd, then ascending by idx.
+static int compare_rd_idx_pair(const void *a, const void *b) {
+  const RdIdxPair *const lhs = (const RdIdxPair *)a;
+  const RdIdxPair *const rhs = (const RdIdxPair *)b;
+
+  if (lhs->rd != rhs->rd) return lhs->rd > rhs->rd ? 1 : -1;
+
+  // To avoid inconsistency in qsort() ordering when two elements are equal,
+  // using idx as tie breaker. Refer aomedia:2928
+  if (lhs->idx != rhs->idx) return lhs->idx > rhs->idx ? 1 : -1;
+  return 0;
+}
+
+// Fills rd_idx_pair_arr with one (index, estimated rd) pair per stored mode
+// and sorts the pairs with compare_rd_idx_pair. Nothing is written when no
+// modes are stored.
+static AOM_INLINE void inter_modes_info_sort(
+    const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) {
+  const int count = inter_modes_info->num;
+  if (count == 0) return;
+
+  for (int i = 0; i < count; ++i) {
+    RdIdxPair *const pair = &rd_idx_pair_arr[i];
+    pair->idx = i;
+    pair->rd = inter_modes_info->est_rd_arr[i];
+  }
+  qsort(rd_idx_pair_arr, count, sizeof(*rd_idx_pair_arr),
+        compare_rd_idx_pair);
+}
+
+// Similar to get_horver_correlation, but also takes into account first
+// row/column, when computing horizontal/vertical correlation.
+//
+// diff holds width x height samples with a row pitch of stride elements.
+// *hcorr receives the correlation between each sample and its left neighbor,
+// *vcorr the correlation between each sample and its top neighbor. Negative
+// correlations are clamped to 0. When either variance term of a direction is
+// not positive, that direction's output is set to 1.0.
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - left neighbor pixel
+ // z - top neighbor pixel
+ int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0;
+ int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0;
+ int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0;
+
+ // First, process horizontal correlation on just the first row
+ x_sum += diff[0];
+ x2_sum += diff[0] * diff[0];
+ x_firstrow += diff[0];
+ x2_firstrow += diff[0] * diff[0];
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[j];
+ const int16_t y = diff[j - 1];
+ x_sum += x;
+ x_firstrow += x;
+ x2_sum += x * x;
+ x2_firstrow += x * x;
+ xy_sum += x * y;
+ }
+
+ // Process vertical correlation in the first column
+ x_firstcol += diff[0];
+ x2_firstcol += diff[0] * diff[0];
+ for (int i = 1; i < height; ++i) {
+ const int16_t x = diff[i * stride];
+ const int16_t z = diff[(i - 1) * stride];
+ x_sum += x;
+ x_firstcol += x;
+ x2_sum += x * x;
+ x2_firstcol += x * x;
+ xz_sum += x * z;
+ }
+
+ // Now process horiz and vert correlation through the rest unit
+ for (int i = 1; i < height; ++i) {
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ x_sum += x;
+ x2_sum += x * x;
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+ }
+
+ // Sums over the last row and the last column, needed to derive the sums of
+ // the paired sample sets below.
+ for (int j = 0; j < width; ++j) {
+ x_finalrow += diff[(height - 1) * stride + j];
+ x2_finalrow +=
+ diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_finalcol += diff[i * stride + width - 1];
+ x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1];
+ }
+
+ // Derive the sums of each paired set by subtracting the boundary row or
+ // column from the totals: the last column / last row for the x side and the
+ // first column / first row for the neighbor side.
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ // Number of horizontal and vertical sample pairs.
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ // Variance and covariance terms, each scaled by the number of pairs.
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
+
+// Sums, over the planes of the current block, the sse reported by the
+// per-block-size vf function pointer for source versus destination buffer,
+// and returns that sum shifted left by 4. Chroma planes are left out when the
+// block is not a chroma reference. If sse_y is non-NULL it receives the
+// unshifted plane-0 sse.
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
+                       int64_t *sse_y) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(&cpi->common);
+  int64_t sse_sum = 0;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane > 0 && !xd->is_chroma_ref) break;
+
+    const struct macroblock_plane *const src_plane = &x->plane[plane];
+    const struct macroblockd_plane *const dst_plane = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        mbmi->bsize, dst_plane->subsampling_x, dst_plane->subsampling_y);
+
+    unsigned int plane_sse;
+    cpi->ppi->fn_ptr[plane_bsize].vf(src_plane->src.buf, src_plane->src.stride,
+                                     dst_plane->dst.buf, dst_plane->dst.stride,
+                                     &plane_sse);
+    sse_sum += plane_sse;
+    if (plane == 0 && sse_y) *sse_y = plane_sse;
+  }
+
+  sse_sum <<= 4;
+  return sse_sum;
+}
+
+// Returns the sum of squared differences between coeff and dqcoeff over
+// block_size entries, and stores the sum of squared coeff values in *ssz.
+//
+// All products are formed in 64-bit arithmetic. Squaring in plain int would
+// overflow (undefined behavior) as soon as a coefficient or a difference
+// exceeds 16 bits of magnitude, before the value ever reaches the 64-bit
+// accumulator.
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                          intptr_t block_size, int64_t *ssz) {
+  int64_t error = 0, sqcoeff = 0;
+
+  for (intptr_t i = 0; i < block_size; i++) {
+    // Widen before subtracting and multiplying.
+    const int64_t diff = (int64_t)coeff[i] - (int64_t)dqcoeff[i];
+    error += diff * diff;
+    sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+  }
+
+  *ssz = sqcoeff;
+  return error;
+}
+
+// Returns the sum of squared differences between the 16-bit arrays coeff and
+// dqcoeff over block_size entries.
+//
+// The difference of two int16_t values can reach +/-65535, and its square
+// does not fit in a 32-bit int, so the difference is held and squared as
+// int64_t.
+int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             intptr_t block_size) {
+  int64_t error = 0;
+
+  for (intptr_t i = 0; i < block_size; i++) {
+    const int64_t diff = (int64_t)coeff[i] - (int64_t)dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth block error. Accumulates the squared coeff/dqcoeff
+// differences and the squared coeff values in 64 bits, then scales both sums
+// down by 2 * (bd - 8) bits with rounding. The scaled squared-coefficient sum
+// goes to *ssz and the scaled error is returned.
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+                                 const tran_low_t *dqcoeff, intptr_t block_size,
+                                 int64_t *ssz, int bd) {
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  int64_t err_acc = 0;
+  int64_t coeff_acc = 0;
+
+  for (int idx = 0; idx < block_size; ++idx) {
+    const int64_t delta = coeff[idx] - dqcoeff[idx];
+    const int64_t value = (int64_t)coeff[idx];
+    err_acc += delta * delta;
+    coeff_acc += value * value;
+  }
+  assert(err_acc >= 0 && coeff_acc >= 0);
+
+  *ssz = (coeff_acc + rounding) >> shift;
+  return (err_acc + rounding) >> shift;
+}
+#endif
+
+// Returns 1 when the directional intra mode `mode` should be skipped given the
+// best intra mode found so far, 0 otherwise. Each of D113, D67, D203 and D157
+// is skipped unless the best mode is one of the two modes paired with it
+// below. Every other mode is never skipped.
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
+  switch (mode) {
+    case D113_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D135_PRED;
+    case D67_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D45_PRED;
+    case D203_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D45_PRED;
+    case D157_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D135_PRED;
+    default: return 0;
+  }
+}
+
+// Returns the cost of an inter prediction mode for the given mode context.
+// Compound modes are looked up directly in the compound cost table. Single
+// modes add up the decisions taken in order: NEWMV or not, then GLOBALMV or
+// not, then NEARESTMV versus the remaining mode.
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+                       int16_t mode_context) {
+  if (is_inter_compound_mode(mode)) {
+    return mode_costs
+        ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+  }
+
+  assert(is_inter_mode(mode));
+
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) return mode_costs->newmv_mode_cost[newmv_ctx][0];
+
+  int total = mode_costs->newmv_mode_cost[newmv_ctx][1];
+
+  const int16_t zeromv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) {
+    total += mode_costs->zeromv_mode_cost[zeromv_ctx][0];
+    return total;
+  }
+  total += mode_costs->zeromv_mode_cost[zeromv_ctx][1];
+
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  total += mode_costs->refmv_mode_cost[refmv_ctx][mode != NEARESTMV];
+  return total;
+}
+
+// Maps a compound mode to the single mode used by one of its references:
+// compound_ref0_mode() when ref_idx is 0, compound_ref1_mode() otherwise.
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+                                              int ref_idx) {
+  if (ref_idx) return compound_ref1_mode(this_mode);
+  return compound_ref0_mode(this_mode);
+}
+
+// Fills the reference frame signalling costs for the current block.
+//
+// ref_costs_single[f] receives the cost of single reference f, and
+// ref_costs_comp[f0][f1] the cost of the compound pair (f0, f1).
+//
+// When the segment has the SEG_LVL_REF_FRAME feature active, both tables are
+// set to zero. Otherwise the single costs are accumulated level by level down
+// the reference tree. The compound costs are computed from the mode costs when
+// the frame's reference mode is not SINGLE_REFERENCE, and set to a fixed 512
+// when it is. In this non-segment path only these compound pairs are written:
+// every forward reference (LAST..GOLDEN) with every backward reference
+// (BWDREF..ALTREF), LAST with LAST2/LAST3/GOLDEN, and BWDREF with ALTREF.
+static AOM_INLINE void estimate_ref_frame_costs(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
+ int segment_id, unsigned int *ref_costs_single,
+ unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ int ref_frame;
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ memset(ref_costs_comp[ref_frame], 0,
+ REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] =
+ mode_costs->intra_inter_cost[intra_inter_ctx][0];
+ // Cost of signalling "inter"; the starting point of every inter reference.
+ unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ ref_costs_single[i] = base_cost;
+
+ const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+ const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+ const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+ const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+ const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+ const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+ // Determine cost of a single ref frame, where frame types are represented
+ // by a tree:
+ // Level 0: add cost whether this ref is a forward or backward ref
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+
+ // Level 1: if this ref is forward ref,
+ // add cost whether it is last/last2 or last3/golden
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+
+ // Level 1: if this ref is backward ref
+ // then add cost whether this ref is altref or backward ref
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1];
+
+ // Level 2: further add cost whether this ref is last or last2
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1];
+
+ // Level 2: last3 or golden
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1];
+
+ // Level 2: bwdref or altref2
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p6][5][1];
+
+ if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
+ // Similar to single ref, determine cost of compound ref frames.
+ // cost_compound_refs = cost_first_ref + cost_second_ref
+ const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+ const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+ const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+ const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+ const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ // Per-frame share of a forward/backward pair cost. The forward entries
+ // carry base_cost plus comp_ref_type_cost[ctx][1]; the backward entries
+ // start at zero.
+ unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+ ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+ ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+ ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+ ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+ // cost of first ref frame
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+ // cost of second ref frame
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+ // cost: if one ref frame is forward ref, the other ref is backward ref
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+ ref_costs_comp[ref0][ref1] =
+ ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+ }
+ }
+
+ // cost: if both ref frames are the same side.
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+ const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+ const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+ } else {
+ // Single-reference frame: the same compound pairs get a fixed cost of
+ // 512.
+ // NOTE(review): the rationale for the value 512 is not visible here.
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+ ref_costs_comp[ref0][ref1] = 512;
+ }
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+ }
+ }
+}
+
+// Saves the current coding decision into ctx: the skip_txfm flag from the
+// transform search info, the skippable flag, a copy of the block's mode info
+// (xd->mi[0]), and the mbmi_ext data for the block's reference frame type.
+// The mode_index parameter exists only when CONFIG_INTERNAL_STATS is enabled,
+// in which case it is stored in ctx->best_mode_index.
+static AOM_INLINE void store_coding_context(
+#if CONFIG_INTERNAL_STATS
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index,
+#else
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+#endif // CONFIG_INTERNAL_STATS
+ int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm;
+ ctx->skippable = skippable;
+#if CONFIG_INTERNAL_STATS
+ ctx->best_mode_index = mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+// Prepares the prediction buffers and the candidate motion vectors for one
+// reference frame: sets up yv12_mb[ref_frame], collects the reference MV
+// candidates, copies the usable ones into x->mbmi_ext and runs av1_mv_pred().
+// When a scaled reference exists it is used only while av1_mv_pred() runs;
+// the buffers are pointed back at the unscaled reference before returning.
+static AOM_INLINE void setup_buffer_ref_mvs_inter(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref_frame);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(cm, ref_frame);
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+  struct buf_2d *const ref_planes = yv12_mb[ref_frame];
+  const int has_scaled_ref = scaled_ref_frame != NULL;
+  assert(yv12 != NULL);
+
+  if (!has_scaled_ref) {
+    av1_setup_pred_block(xd, ref_planes, yv12, sf, sf, num_planes);
+  } else {
+    // Setup pred block based on scaled reference, because av1_mv_pred() doesn't
+    // support scaling.
+    av1_setup_pred_block(xd, ref_planes, scaled_ref_frame, NULL, NULL,
+                         num_planes);
+  }
+
+  // Gets an initial list of candidate vectors from neighbours and orders them
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                   mbmi_ext->mode_context);
+  // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+  // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+
+  // Further refinement that is encode side only to test the top few candidates
+  // in full and choose the best as the center point for subsequent searches.
+  // The current implementation doesn't support scaling.
+  av1_mv_pred(cpi, x, ref_planes[0].buf, ref_planes[0].stride, ref_frame,
+              block_size);
+
+  // We had temporarily setup pred block based on scaled reference above. Go
+  // back to unscaled reference now, for subsequent use.
+  if (has_scaled_ref) {
+    av1_setup_pred_block(xd, ref_planes, yv12, sf, sf, num_planes);
+  }
+}
+
+// Margin (in 1/8 pel units) by which a motion vector may point outside the
+// frame: the frame border minus the interpolation filter extension.
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+
+// Clamps `mv` to the block-to-frame-edge distances of `xd`, widened by the
+// margins above.
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  const int left_bound = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+  const int right_bound = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+  const int top_bound = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+  const int bottom_bound = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+  const SubpelMvLimits bounds = { left_bound, right_bound, top_bound,
+                                  bottom_bound };
+  clamp_mv(mv, &bounds);
+}
+
+/* Returns 1 when `this_mode` (NEARMV or GLOBALMV, single reference only)
+ * resolves to the same motion vector as another mode that has already been
+ * searched and that is cheaper to signal; the modelled RD of that other mode
+ * is then copied to `this_mode`. Returns 0 otherwise. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+                            const MACROBLOCK *const x,
+                            PREDICTION_MODE this_mode,
+                            const MV_REFERENCE_FRAME ref_frames[2],
+                            InterModeSearchState *search_state) {
+  // Compound prediction is never pruned here.
+  if (ref_frames[1] > INTRA_FRAME) return 0;
+
+  const MV_REFERENCE_FRAME ref = ref_frames[0];
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const int ref_mv_count =
+      mbmi_ext->ref_mv_count[av1_ref_frame_type(ref_frames)];
+
+  // Find the mode (if any) that yields the same motion vector.
+  PREDICTION_MODE twin_mode = MB_MODE_COUNT;
+  switch (this_mode) {
+    case NEARMV:
+      if (ref_mv_count == 0) {
+        // NEARMV has the same motion vector as NEARESTMV
+        twin_mode = NEARESTMV;
+      } else if (ref_mv_count == 1 &&
+                 cm->global_motion[ref].wmtype <= TRANSLATION) {
+        // NEARMV has the same motion vector as GLOBALMV
+        twin_mode = GLOBALMV;
+      }
+      break;
+    case GLOBALMV:
+      if (ref_mv_count == 1) {
+        // GLOBALMV has the same motion vector as NEARMV
+        twin_mode = NEARMV;
+      } else if (ref_mv_count == 0 &&
+                 cm->global_motion[ref].wmtype <= TRANSLATION) {
+        // GLOBALMV has the same motion vector as NEARESTMV
+        twin_mode = NEARESTMV;
+      }
+      break;
+    default: break;
+  }
+  if (twin_mode == MB_MODE_COUNT) return 0;
+
+  // modelled_rd still at INT64_MAX means the twin mode was never searched.
+  if (search_state->modelled_rd[twin_mode][0][ref] == INT64_MAX) return 0;
+
+  const int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+  const int twin_cost = cost_mv_ref(&x->mode_costs, twin_mode, mode_ctx);
+  const int own_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx);
+
+  // Only skip when signalling this mode is strictly more expensive.
+  if (own_cost <= twin_cost) return 0;
+
+  search_state->modelled_rd[this_mode][0][ref] =
+      search_state->modelled_rd[twin_mode][0][ref];
+  return 1;
+}
+
+// Copies `in_mv` to `*out_mv`, lowers its precision according to the frame
+// features, clamps it with clamp_mv2() and returns whether the resulting
+// full-pel vector lies inside x->mv_limits.
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+                                     const AV1_COMMON *cm,
+                                     const MACROBLOCK *x) {
+  const FeatureFlags *const features = &cm->features;
+  MV *const mv = &out_mv->as_mv;
+
+  *out_mv = in_mv;
+  lower_mv_precision(mv, features->allow_high_precision_mv,
+                     features->cur_frame_force_integer_mv);
+  clamp_mv2(mv, &x->e_mbd);
+  return av1_is_fullmv_in_range(&x->mv_limits, get_fullmv_from_mv(mv));
+}
+
+// A single-reference NEWMV result that is reused directly for a compound mode
+// has to be clamped to the valid mv range around the reference mv of
+// `ref_idx`. Without this the encoder would generate out of range mvs, which
+// has been seen in 8k encoding.
+static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv,
+                                     int ref_idx) {
+  SubpelMvLimits subpel_limits;
+  const int_mv centre_mv = av1_get_ref_mv(x, ref_idx);
+
+  av1_set_subpel_mv_search_range(&subpel_limits, &x->mv_limits,
+                                 &centre_mv.as_mv);
+  clamp_mv(&mv->as_mv, &subpel_limits);
+}
+
+// Determines the motion vector(s) and the MV rate of a mode that contains a
+// NEWMV component.
+//
+// Compound modes reuse the single-reference NEWMV results cached in `args`
+// (when valid), clamp them into range and only compute the MV rate.
+// Single-reference modes run a motion search, optionally with a reduced
+// search range derived from earlier ref_mv_idx results, and cache the outcome
+// in `args`.
+//
+// Returns 0 on success and INT64_MAX when the mode should be skipped.
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, int_mv *cur_mv,
+                            int *const rate_mv, HandleInterModeArgs *const args,
+                            inter_mode_info *mode_info) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int ref_mv_idx = mbmi->ref_mv_idx;
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+
+  if (has_second_ref(mbmi)) {
+    // Which of the two references carries a NEWMV component.
+    int uses_newmv[2];
+    if (this_mode == NEW_NEWMV) {
+      uses_newmv[0] = 1;
+      uses_newmv[1] = 1;
+    } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+      uses_newmv[0] = 0;
+      uses_newmv[1] = 1;
+    } else {
+      assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+      uses_newmv[0] = 1;
+      uses_newmv[1] = 0;
+    }
+
+    int total_rate = 0;
+    for (int i = 0; i < 2; ++i) {
+      if (!uses_newmv[i]) continue;
+      if (args->single_newmv_valid[ref_mv_idx][refs[i]]) {
+        cur_mv[i].as_int = args->single_newmv[ref_mv_idx][refs[i]].as_int;
+        clamp_mv_in_range(x, &cur_mv[i], i);
+      }
+      const int_mv ref_mv = av1_get_ref_mv(x, i);
+      total_rate += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv,
+                                    x->mv_costs->nmv_joint_cost,
+                                    x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+    }
+    *rate_mv = total_rate;
+    return 0;
+  }
+
+  // Single ref case.
+  int search_range = INT_MAX;
+
+  if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) {
+    const MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
+    MV earlier_ref_mv[2] = { { 0 } };
+    int closest_diff = INT_MAX;
+    int closest_idx = -1;
+
+    // Find the earlier reference mv that is closest to the current one.
+    for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
+      earlier_ref_mv[idx] =
+          av1_get_ref_mv_from_stack(0, mbmi->ref_frame, idx, &x->mbmi_ext)
+              .as_mv;
+      const int row_diff = abs(ref_mv.row - earlier_ref_mv[idx].row);
+      const int col_diff = abs(ref_mv.col - earlier_ref_mv[idx].col);
+      const int diff = AOMMAX(row_diff, col_diff);
+      if (diff < closest_diff) {
+        closest_diff = diff;
+        closest_idx = idx;
+      }
+    }
+
+    if (closest_diff < (16 << 3) &&
+        args->single_newmv_valid[closest_idx][refs[0]]) {
+      const MV found_mv = args->single_newmv[closest_idx][refs[0]].as_mv;
+      const MV base_mv = earlier_ref_mv[closest_idx];
+      const int row_drift = abs(found_mv.row - base_mv.row);
+      const int col_drift = abs(found_mv.col - base_mv.col);
+      // Get full pixel search range.
+      search_range = (closest_diff + AOMMAX(row_drift, col_drift) + 4) >> 3;
+    }
+  }
+
+  int_mv best_mv;
+  av1_single_motion_search(cpi, x, bsize, 0, rate_mv, search_range, mode_info,
+                           &best_mv, args);
+  if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+  args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
+  args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+  args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+  cur_mv[0].as_int = best_mv.as_int;
+
+  // The skip decision is only honoured after single_newmv has been stored.
+  return mode_info[mbmi->ref_mv_idx].skip ? INT64_MAX : 0;
+}
+
+// Computes the first and last motion mode index that motion_mode_rd() should
+// visit, taking the motion_mode_for_winner_cand and extra_prune_warped speed
+// features into account.
+static INLINE void update_mode_start_end_index(
+    const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+    int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+    int interintra_allowed, int eval_motion_mode) {
+  int first_index = (int)SIMPLE_TRANSLATION;
+  int last_index = (int)last_motion_mode_allowed + interintra_allowed;
+
+  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    if (eval_motion_mode) {
+      // Start after simple translation so that only the other motion modes
+      // are processed.
+      first_index = 1;
+    } else {
+      last_index = (int)SIMPLE_TRANSLATION;
+    }
+  }
+  if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16) {
+    last_index = SIMPLE_TRANSLATION;
+  }
+
+  *mode_index_start = first_index;
+  *mode_index_end = last_index;
+}
+
+/*!\brief AV1 motion mode search
+ *
+ * \ingroup inter_mode_search
+ * Function to search over and determine the motion mode. It will update
+ * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or
+ * WARPED_CAUSAL and determine any necessary side information for the selected
+ * motion mode. It will also perform the full transform search, unless the
+ * input parameter do_tx_search indicates to do an estimation of the RD rather
+ * than an RD corresponding to a full transform search. It will return the
+ * RD for the final motion_mode.
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] ref_skip_rd A length 2 array, where ref_skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * ref_skip_rd[1] is the best RD for a skip mode
+ * so far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in] eval_motion_mode Boolean whether or not to evaluate
+ * motion modes other than SIMPLE_TRANSLATION.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+ int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+ int eval_motion_mode, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ // Rate on entry; every motion mode below starts from this value.
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip_txfm = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+ const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi) &&
+ mbmi->compound_idx;
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mbmi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ // Saved so that ref_frame[1] can be restored after the search.
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ av1_invalid_rd_stats(&best_rd_stats);
+ mbmi->num_proj_ref = 1; // assume num_proj_ref >=1
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ *yrd = INT64_MAX;
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mbmi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ // Collect projection samples used in least squares approximation of
+ // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+ // A negative sample count means the samples have not been collected yet
+ // for this reference frame.
+ if (warp_sample_info->num < 0) {
+ warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+ }
+ mbmi->num_proj_ref = warp_sample_info->num;
+ }
+ const int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {
+ // Do not search WARPED_CAUSAL if there are no samples to use to determine
+ // warped parameters.
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
+
+ // Snapshot of the mode info that every candidate motion mode starts from.
+ const MB_MODE_INFO base_mbmi = *mbmi;
+ MB_MODE_INFO best_mbmi;
+ const int interp_filter = features->interp_filter;
+ const int switchable_rate =
+ av1_is_interp_needed(xd)
+ ? av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter)
+ : 0;
+ int64_t best_rd = INT64_MAX;
+ int best_rate_mv = rate_mv0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int mode_index_start, mode_index_end;
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_MOTION_MODE, eval_motion_mode);
+
+ // Modify the start and end index according to speed features. For example,
+ // if SIMPLE_TRANSLATION has already been searched according to
+ // the motion_mode_for_winner_cand speed feature, update the mode_index_start
+ // to avoid searching it again.
+ update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
+ last_motion_mode_allowed, interintra_allowed,
+ eval_motion_mode);
+ // Main function loop. This loops over all of the possible motion modes and
+ // computes RD to determine the best one. This process includes computing
+ // any necessary side information for the motion mode and performing the
+ // transform search.
+ for (int mode_index = mode_index_start; mode_index <= mode_index_end;
+ mode_index++) {
+ // With skip_motion_mode set only mode index 0 is evaluated.
+ if (args->skip_motion_mode && mode_index) continue;
+ int tmp_rate2 = rate2_nocoeff;
+ // Indices beyond the last allowed motion mode denote the interintra
+ // candidate.
+ const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
+ int tmp_rate_mv = rate_mv0;
+
+ *mbmi = base_mbmi;
+ if (is_interintra_mode) {
+ // Only use SIMPLE_TRANSLATION for interintra
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ } else {
+ mbmi->motion_mode = (MOTION_MODE)mode_index;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ }
+
+ // Do not search OBMC if the probability of selecting it is below a
+ // predetermined threshold for this update_type and block size.
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) &&
+ mbmi->motion_mode == OBMC_CAUSAL)
+ continue;
+
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+ // SIMPLE_TRANSLATION mode: no need to recalculate.
+ // The prediction is calculated before motion_mode_rd() is called in
+ // handle_inter_mode()
+ } else if (mbmi->motion_mode == OBMC_CAUSAL) {
+ const uint32_t cur_mv = mbmi->mv[0].as_int;
+ // OBMC_CAUSAL not allowed for compound prediction
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
+ &mbmi->mv[0], NULL);
+ // Swap the original MV rate for the rate of the newly searched MV.
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
+ if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
+ // Build the predictor according to the current motion vector if it has
+ // not already been built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ 0, av1_num_planes(cm) - 1);
+ }
+ // Build the inter predictor by blending the predictor corresponding to
+ // this MV, and the neighboring blocks using the OBMC model
+ av1_build_obmc_inter_prediction(
+ cm, xd, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+#if !CONFIG_REALTIME_ONLY
+ } else if (mbmi->motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->motion_mode = WARPED_CAUSAL;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+ // Work on local copies of the samples; pts0/pts_inref0 are passed
+ // unchanged to av1_refine_warped_mv() below.
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(
+ &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
+ }
+
+ // Compute the warped motion parameters with a least squares fit
+ // using the collected samples
+ // A zero return from av1_find_projection() is treated as success here;
+ // otherwise this motion mode is skipped.
+ if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // Refine MV for NEWMV mode
+ const int_mv mv0 = mbmi->mv[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ const int num_proj_ref0 = mbmi->num_proj_ref;
+
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, NULL);
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+ total_samples, cpi->sf.mv_sf.warp_search_method,
+ cpi->sf.mv_sf.warp_search_iters);
+
+ if (mv0.as_int != mbmi->mv[0].as_int) {
+ // Keep the refined MV and WM parameters.
+ tmp_rate_mv = av1_mv_bit_cost(
+ &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mbmi->mv[0] = mv0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
+ }
+ }
+
+ // Build the warped predictor
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ } else {
+ continue;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ } else if (is_interintra_mode) {
+ const int ret =
+ av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
+ &tmp_rate_mv, &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
+ }
+
+ // If we are searching newmv and the mv is the same as refmv, skip the
+ // current mode
+ if (!av1_check_newmv_joint_nonzero(cm, x)) continue;
+
+ // Update rd_stats for the current motion mode
+ txfm_info->skip_txfm = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip_txfm = 1;
+ rd_stats->rate = tmp_rate2;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
+ if (interintra_allowed) {
+ rd_stats->rate +=
+ mode_costs->interintra_cost[size_group_lookup[bsize]]
+ [mbmi->ref_frame[1] == INTRA_FRAME];
+ }
+ if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+ (mbmi->ref_frame[1] != INTRA_FRAME)) {
+ // Two cost tables are used depending on whether WARPED_CAUSAL is among
+ // the allowed motion modes.
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost[bsize][mbmi->motion_mode];
+ } else {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode];
+ }
+ }
+
+ int64_t this_yrd = INT64_MAX;
+
+ if (!do_tx_search) {
+ // Avoid doing a transform search here to speed up the overall mode
+ // search. It will be done later in the mode search if the current
+ // motion mode seems promising.
+ int64_t curr_sse = -1;
+ int64_t sse_y = -1;
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ int64_t est_rd = 0;
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ curr_sse = get_sse(cpi, x, &sse_y);
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 ||
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+ cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist,
+ NULL, &curr_sse, NULL, NULL, NULL);
+ sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]];
+ }
+ est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
+ // Prune this motion mode when 80% of its estimated RD still exceeds the
+ // best estimate so far. ref_frame[1] is restored before moving on.
+ if (est_rd * 0.80 > *best_est_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ }
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = est_rd;
+ if (rd_stats->rdcost < *best_est_rd) {
+ *best_est_rd = rd_stats->rdcost;
+ assert(sse_y >= 0);
+ ref_skip_rd[1] = txfm_rd_gate_level
+ ? RDCOST(x->rdmult, mode_rate, (sse_y << 4))
+ : INT64_MAX;
+ }
+ // Queue the candidate for the later transform search. With
+ // SINGLE_REFERENCE frames only non-compound candidates are queued.
+ if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ } else {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ mbmi->skip_txfm = 0;
+ } else {
+ // Perform full transform search
+ int64_t skip_rd = INT64_MAX;
+ int64_t skip_rdy = INT64_MAX;
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t sse_y = INT64_MAX;
+ int64_t curr_sse = get_sse(cpi, x, &sse_y);
+ skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
+ skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
+ txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Do transform search
+ const int mode_rate = rd_stats->rate;
+ if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ rd_stats->rate, ref_best_rd)) {
+ // An invalid luma rate on mode index 0 aborts the whole search; any
+ // other failure only drops the current motion mode.
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
+ return INT64_MAX;
+ }
+ continue;
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int y_rate =
+ rd_stats->skip_txfm
+ ? x->mode_costs.skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist);
+
+ const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ // Tighten the reference thresholds used for the remaining motion modes.
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
+ ref_skip_rd[0] = skip_rd;
+ ref_skip_rd[1] = skip_rdy;
+ }
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+
+ if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ }
+ }
+
+ const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (mode_index == 0) {
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ }
+ if (mode_index == 0 || tmp_rd < best_rd) {
+ // Update best_rd data if this is the best motion mode so far
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
+ *yrd = this_yrd;
+ if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+ best_xskip_txfm = mbmi->skip_txfm;
+ }
+ }
+ // Update RD and mbmi stats for selected motion mode
+ mbmi->ref_frame[1] = ref_frame_1;
+ *rate_mv = best_rate_mv;
+ // No candidate survived (best_rd untouched) or the newmv check fails:
+ // report the search as invalid.
+ if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return INT64_MAX;
+ }
+ // Copy the winning candidate's state back to the caller-visible structures.
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+ txfm_info->skip_txfm = best_xskip_txfm;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+// Computes the rate, distortion and RD cost of coding the block in skip mode.
+// The prediction is built and measured one plane at a time so that the
+// remaining planes can be skipped as soon as the running RD cost exceeds
+// `best_rd`. The results are written to `rd_stats`; the function always
+// returns 0.
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+                            MACROBLOCK *const x, BLOCK_SIZE bsize,
+                            const BUFFER_SET *const orig_dst, int64_t best_rd) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  int64_t accumulated_sse = 0;
+  int64_t running_rd = INT64_MAX;
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1];
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Build the predictor for this plane only.
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+                                  plane, plane);
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    av1_subtract_plane(x, plane_bsize, plane);
+
+    int64_t plane_sse =
+        av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
+    if (is_cur_buf_hbd(xd)) {
+      plane_sse = ROUND_POWER_OF_TWO(plane_sse, (xd->bd - 8) * 2);
+    }
+    accumulated_sse += plane_sse << 4;
+
+    // Stop evaluating further planes once the cost is already worse than the
+    // best RD known so far.
+    running_rd = RDCOST(x->rdmult, rd_stats->rate, accumulated_sse);
+    if (running_rd > best_rd) break;
+  }
+
+  rd_stats->sse = accumulated_sse;
+  rd_stats->dist = accumulated_sse;
+  rd_stats->rdcost = running_rd;
+
+  restore_dst_buf(xd, *orig_dst, num_planes);
+  return 0;
+}
+
+// Returns 1 when the reference mv of `single_mode` (NEARESTMV, NEARMV or
+// GLOBALMV) for reference `ref_idx` duplicates the mv of another mode, so the
+// mode can be skipped; returns 0 otherwise.
+// Note(rachelbarker): This speed feature currently does not interact correctly
+// with global motion. The issue is that, when global motion is used, GLOBALMV
+// produces a different prediction to NEARESTMV/NEARMV even if the motion
+// vectors are the same. Thus GLOBALMV should not be pruned in this case.
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext,
+                                      int ref_idx,
+                                      const MV_REFERENCE_FRAME *ref_frame,
+                                      PREDICTION_MODE single_mode) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+  assert(single_mode != NEWMV);
+
+  // With fewer than two candidates NEARMV falls back to GLOBALMV.
+  if (single_mode == NEARMV) return ref_mv_count < 2;
+
+  // NEARESTMV (and anything that is not GLOBALMV) is never a repeat.
+  if (single_mode != GLOBALMV) return 0;
+
+  // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV
+  if (ref_mv_count == 0) return 1;
+  // when ref_mv_count == 1, NEARMV is same as GLOBALMV
+  if (ref_mv_count == 1) return 0;
+
+  // Otherwise GLOBALMV repeats if it matches any usable stack entry.
+  const int_mv global_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  const int usable_entries = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count);
+  for (int i = 0; i < usable_entries; i++) {
+    const int_mv stack_mv =
+        (ref_idx == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][i].this_mv
+                       : mbmi_ext->ref_mv_stack[ref_frame_type][i].comp_mv;
+    if (stack_mv.as_int == global_mv.as_int) return 1;
+  }
+  return 0;
+}
+
+// Fetches the non-new motion vector used by reference `ref_idx` of
+// `this_mode` into `*this_mv`:
+//  - NEWMV components yield INVALID_MV,
+//  - NEARESTMV / NEARMV take the matching ref mv stack entry when it exists,
+//  - GLOBALMV, and NEAR/NEAREST without a stack entry, take the global mv.
+// Returns 0 (leaving `*this_mv` untouched) when `skip_repeated_ref_mv` is set
+// and the global-mv fallback would repeat another mode; returns 1 otherwise.
+static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                              int ref_idx, int ref_mv_idx,
+                              int skip_repeated_ref_mv,
+                              const MV_REFERENCE_FRAME *ref_frame,
+                              const MB_MODE_INFO_EXT *mbmi_ext) {
+  const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+  assert(is_inter_singleref_mode(single_mode));
+
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+    return 1;
+  }
+
+  if (single_mode != GLOBALMV) {
+    assert(single_mode == NEARMV || single_mode == NEARESTMV);
+    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+    const int stack_pos = (single_mode == NEARESTMV) ? 0 : ref_mv_idx + 1;
+    if (stack_pos < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      assert(stack_pos >= 0);
+      *this_mv = (ref_idx == 0)
+                     ? mbmi_ext->ref_mv_stack[ref_frame_type][stack_pos].this_mv
+                     : mbmi_ext->ref_mv_stack[ref_frame_type][stack_pos].comp_mv;
+      return 1;
+    }
+  }
+
+  // Global mv path, shared by GLOBALMV and by NEAR/NEAREST without an entry.
+  if (skip_repeated_ref_mv &&
+      check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode)) {
+    return 0;
+  }
+  *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  return 1;
+}
+
+// Decides whether NEARESTMV / NEARMV can be skipped based on the ref mv
+// weights computed while populating the ref mv list. Returns 1 to skip the
+// mode and 0 to keep it.
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+    const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+    const int8_t ref_frame_type, PREDICTION_MODE best_mode) {
+  // Only NEARESTMV and NEARMV are candidates for pruning.
+  if (!(this_mode == NEARESTMV || this_mode == NEARMV)) return 0;
+  // Keep the mode while the block has not yet obtained a valid inter mode.
+  if (!is_inter_mode(best_mode)) return 0;
+
+  const MACROBLOCKD *xd = &x->e_mbd;
+  // Keep the mode unless both the left and the top neighbouring blocks are
+  // available.
+  if (!(xd->left_available && xd->up_available)) return 0;
+
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const uint16_t *const weights = mbmi_ext->weight[ref_frame_type];
+  const int num_ref_mvs =
+      AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+  if (num_ref_mvs == 0) return 0;
+
+  // NEARESTMV is kept when the first entry comes from a nearest candidate.
+  if (this_mode == NEARESTMV && weights[0] >= REF_CAT_LEVEL) return 0;
+
+  // Count the ref mvs that were populated from nearest candidates.
+  int num_from_nearest = 0;
+  for (int i = 0; i < num_ref_mvs; i++) {
+    num_from_nearest += (weights[i] >= REF_CAT_LEVEL) ? 1 : 0;
+  }
+
+  // A small count relative to the list size indicates little correlation
+  // with the spatial neighbours, which makes NEARESTMV / NEARMV unlikely to
+  // win: at least two nearest-derived entries are required when the list
+  // holds two or more mvs, otherwise one.
+  const int required = (num_ref_mvs >= 2) ? 2 : 1;
+  return num_from_nearest < required;
+}
+
+ // Fills cur_mv[] with the motion vector(s) of the current prediction mode.
+ //
+ // For each reference (one, or two for compound prediction) the candidate mv
+ // is fetched with get_this_mv(). Components whose single mode is NEWMV are
+ // seeded from the ref mv stack entry selected by mbmi->ref_mv_idx; all other
+ // components are passed through clamp_and_check_mv().
+ //
+ // Returns 0 as soon as get_this_mv() rejects a component, or if any
+ // clamp_and_check_mv() call failed; returns 1 otherwise.
+ static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+                                const AV1_COMMON *cm, const MACROBLOCK *x,
+                                int skip_repeated_ref_mv) {
+   const MACROBLOCKD *xd = &x->e_mbd;
+   const MB_MODE_INFO *mbmi = xd->mi[0];
+   const int is_comp_pred = has_second_ref(mbmi);
+
+   // Accumulated separately from the get_this_mv() result so that a failed
+   // check on the first mv of a compound mode is not overwritten while the
+   // second mv is processed.
+   int all_mvs_valid = 1;
+   for (int i = 0; i < is_comp_pred + 1; ++i) {
+     int_mv this_mv;
+     this_mv.as_int = INVALID_MV;
+     if (!get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+                      skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext)) {
+       return 0;
+     }
+     const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+     if (single_mode == NEWMV) {
+       const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+       cur_mv[i] =
+           (i == 0) ? x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                          .this_mv
+                    : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+                          .comp_mv;
+     } else {
+       all_mvs_valid &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
+     }
+   }
+   return all_mvs_valid;
+ }
+
+ // Returns the cost of signaling mbmi->ref_mv_idx for the block's mode.
+ // NEWMV / NEW_NEWMV examine ref mv list positions 0 and 1; modes containing
+ // a NEARMV component examine positions 1 and 2. Every other mode costs 0.
+ static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+                                const MB_MODE_INFO_EXT *mbmi_ext,
+                                const int (*const drl_mode_cost0)[2],
+                                int8_t ref_frame_type) {
+   // List position that corresponds to ref_mv_idx == 0.
+   int first_pos;
+   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+     first_pos = 0;
+   } else if (have_nearmv_in_inter_mode(mbmi->mode)) {
+     first_pos = 1;
+   } else {
+     return 0;
+   }
+
+   int total = 0;
+   for (int step = 0; step < 2; ++step) {
+     const int pos = first_pos + step;
+     // A symbol is only coded when a further candidate exists past 'pos'.
+     if (mbmi_ext->ref_mv_count[ref_frame_type] <= pos + 1) continue;
+     const uint8_t ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], pos);
+     total += drl_mode_cost0[ctx][mbmi->ref_mv_idx != step];
+     // Signaling ends at the selected index.
+     if (mbmi->ref_mv_idx == step) break;
+   }
+   return total;
+ }
+
+ // Returns 0 if any NEWMV component of this_mode has no valid single-reference
+ // NEWMV result recorded in args for (mbmi->ref_mv_idx, reference frame);
+ // returns 1 otherwise.
+ static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args,
+                                         const MB_MODE_INFO *const mbmi,
+                                         PREDICTION_MODE this_mode) {
+   for (int i = 0; i < 2; ++i) {
+     if (get_single_mode(this_mode, i) != NEWMV) continue;
+     const MV_REFERENCE_FRAME frame = mbmi->ref_frame[i];
+     if (args->single_newmv_valid[mbmi->ref_mv_idx][frame] == 0) return 0;
+   }
+   return 1;
+ }
+
+ // Returns how many ref_mv_idx values are to be examined for 'mode' with the
+ // given reference frame(s). The result is 1 when there is no choice of index
+ // for the mode, and is otherwise capped at MAX_REF_MV_SEARCH.
+ static int get_drl_refmv_count(const MACROBLOCK *const x,
+                                const MV_REFERENCE_FRAME *ref_frame,
+                                PREDICTION_MODE mode) {
+   const int8_t frame_type = av1_ref_frame_type(ref_frame);
+   const int num_cand = x->mbmi_ext.ref_mv_count[frame_type];
+   // Modes with a NEARMV component skip the first list entry.
+   const int near_offset = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+   const int is_new_only = (mode == NEWMV) || (mode == NEW_NEWMV);
+
+   if ((near_offset && num_cand > 2) || (is_new_only && num_cand > 1)) {
+     const int usable = num_cand - near_offset;
+     return AOMMIN(MAX_REF_MV_SEARCH, usable);
+   }
+   return 1;
+ }
+
+ // Tells whether the given ref_mv_idx qualifies for pruning.
+ // reduce_inter_modes >= 3 always answers 1. For reduce_inter_modes == 2 (the
+ // only other value allowed here) the answer depends on the q index:
+ //   band = qindex * 3 / QINDEX_RANGE, and the index is pruned when
+ //   ref_mv_idx > band.
+ // In the lowest third of the q index range every ref_mv_idx >= 1 is pruned,
+ // in the middle third only ref_mv_idx >= 2, and in the top third
+ // ref_mv_idx values up to 2 are never pruned.
+ static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes,
+                                          const int qindex,
+                                          const int ref_mv_idx) {
+   if (reduce_inter_modes >= 3) return 1;
+   // The q-index based rule exists only for reduce_inter_modes == 2.
+   assert(reduce_inter_modes == 2);
+   const int q_band = qindex * 3 / QINDEX_RANGE;
+   return ref_mv_idx > q_band;
+ }
+
+ // Whether this reference motion vector can be skipped, based on initial
+ // heuristics.
+ // Returns true if ref_mv_idx should be skipped for the current mode and
+ // false otherwise.
+ // Side effect: unless one of the weight-based checks below returns first,
+ // mbmi->ref_mv_idx is overwritten with ref_mv_idx.
+ static bool ref_mv_idx_early_breakout(
+ const SPEED_FEATURES *const sf,
+ const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x,
+ const HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int ref_mv_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int is_comp_pred = has_second_ref(mbmi);
+ // The weight-based pruning never applies to ref_mv_idx 0.
+ if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
+ // For LAST2/LAST3 references, skip candidates whose weight is below
+ // REF_CAT_LEVEL. Modes with a NEARMV component start one entry later in
+ // the list, hence the has_nearmv offset.
+ if (mbmi->ref_frame[0] == LAST2_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == LAST2_FRAME ||
+ mbmi->ref_frame[1] == LAST3_FRAME) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL) {
+ return true;
+ }
+ }
+ // TODO(any): Experiment with reduce_inter_modes for compound prediction
+ // Single-reference modes containing NEWMV: when the reference is neither
+ // the nearest past nor the nearest future frame, apply the q-index based
+ // rule on top of the same weight test.
+ if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
+ have_newmv_in_inter_mode(mbmi->mode)) {
+ if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref &&
+ mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int do_prune = prune_ref_mv_idx_using_qindex(
+ sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx);
+ if (do_prune &&
+ (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL)) {
+ return true;
+ }
+ }
+ }
+ }
+
+ // From here on the checks read mbmi->ref_mv_idx, so it is set first.
+ mbmi->ref_mv_idx = ref_mv_idx;
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
+ return true;
+ }
+ // Rate-only lower bound: reference frame cost + single/compound cost + DRL
+ // cost, evaluated with zero distortion.
+ size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type);
+ est_rd_rate += drl_cost;
+ // NEARESTMV and NEAREST_NEARESTMV are exempt from this rate-based skip.
+ if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ return true;
+ }
+ return false;
+ }
+
+ // Compute the estimated RD cost for the motion vector with simple translation.
+ //
+ // rd_stats is re-initialized and then accumulates only the rate terms
+ // (reference frame, single/compound, DRL and mode cost). mbmi is modified in
+ // place: it is forced to SIMPLE_TRANSLATION with COMPOUND_AVERAGE, default
+ // interpolation filters, the given ref_mv_idx and the mvs from build_cur_mv().
+ // Only the luma predictor is built, and the distortion/rate of the residual
+ // is estimated with the MODELRD_CURVFIT model.
+ //
+ // Returns INT64_MAX when build_cur_mv() fails or when the rate-only RD cost
+ // already exceeds ref_best_rd; otherwise returns the estimated RD cost.
+ static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *args,
+ int ref_mv_idx, int64_t ref_best_rd,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const AV1_COMMON *cm = &cpi->common;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const ModeCosts *mode_costs = &x->mode_costs;
+
+ // Snapshot of the current destination buffers; the predictor is built into
+ // these.
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ av1_init_rd_stats(rd_stats);
+
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ // An INTRA_FRAME second reference is replaced by NONE_FRAME for this
+ // estimate.
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ }
+ int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost =
+ get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+
+ int_mv cur_mv[2];
+ if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
+ return INT64_MAX;
+ }
+ // This estimate is only meant for modes that contain a NEARMV component.
+ assert(have_nearmv_in_inter_mode(mbmi->mode));
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
+ rd_stats->rate += ref_mv_cost;
+
+ // Early out: the rate alone (zero distortion) is already worse than the
+ // best RD so far.
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
+ return INT64_MAX;
+ }
+
+ // These fields were already set above; they are assigned again here before
+ // the predictor is built.
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->num_proj_ref = 0;
+ if (is_comp_pred) {
+ // Only compound_average
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ }
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // Build the predictor for the luma plane only.
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int est_rate;
+ int64_t est_dist;
+ // Model-based estimate over plane range [0, 0]; no transform search is run.
+ model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+ NULL, NULL, NULL, NULL, NULL);
+ return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist);
+ }
+
+ // A small set of non-negative integers is stored as the bits of an int:
+ // bit i set means integer i belongs to the set, bit i clear means it does
+ // not. Valid members are limited to bit positions that fit in an int.
+ //
+ // Adds 'index' to the set stored in *mask.
+ static INLINE void mask_set_bit(int *mask, int index) {
+   const int bit = 1 << index;
+   *mask = *mask | bit;
+ }
+
+ // Returns true if bit 'index' of 'mask' is set, false otherwise.
+ static INLINE bool mask_check_bit(int mask, int index) {
+   const int shifted = mask >> index;
+   return (shifted & 0x1) != 0;
+ }
+
+ // Before performing the full MV search in handle_inter_mode, do a simple
+ // translation search and see if we can eliminate any motion vectors.
+ // Returns a bit mask where, if the i-th bit is set, the i-th motion vector
+ // (ref_mv_idx == i) should be searched. Indices rejected by
+ // ref_mv_idx_early_breakout() are always cleared; the additional
+ // simple-translation pruning is only applied to modes with a NEARMV
+ // component.
+ // Note that rd_stats and the block's mode info are modified by the helper
+ // calls made here.
+ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *const args,
+ int64_t ref_best_rd, BLOCK_SIZE bsize,
+ const int ref_set) {
+ // If the number of ref mv count is equal to 1, do not prune the same. It
+ // is better to evaluate the same than to prune it.
+ // (The returned value 1 is the mask with only bit 0 set.)
+ if (ref_set == 1) return 1;
+ AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ // Only search indices if they have some chance of being good.
+ int good_indices = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args,
+ ref_best_rd, i)) {
+ continue;
+ }
+ mask_set_bit(&good_indices, i);
+ }
+
+ // Only prune in NEARMV mode, if the speed feature is set, and the block size
+ // is large enough. If these conditions are not met, return all good indices
+ // found so far.
+ if (!cpi->sf.inter_sf.prune_mode_search_simple_translation)
+ return good_indices;
+ if (!have_nearmv_in_inter_mode(this_mode)) return good_indices;
+ if (num_pels_log2_lookup[bsize] <= 6) return good_indices;
+ // Do not prune when there is internal resizing. TODO(elliottk) fix this
+ // so b/2384 can be resolved.
+ if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) ||
+ (mbmi->ref_frame[1] > 0 &&
+ av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) {
+ return good_indices;
+ }
+
+ // Calculate the RD cost for the motion vectors using simple translation.
+ // NOTE(review): the array has three initializers while the loops below are
+ // bounded by ref_set and MAX_REF_MV_SEARCH; this relies on both being at
+ // most 3 - confirm against the definition of MAX_REF_MV_SEARCH.
+ int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ // If this index is bad, ignore it.
+ if (!mask_check_bit(good_indices, ref_mv_idx)) {
+ continue;
+ }
+ idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
+ cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize);
+ }
+ // Find the index with the best RD cost.
+ // Entries that were never evaluated keep INT64_MAX and cannot win; ties
+ // keep the lowest index.
+ int best_idx = 0;
+ for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) {
+ if (idx_rdcost[i] < idx_rdcost[best_idx]) {
+ best_idx = i;
+ }
+ }
+ // Only include indices that are good and within a % of the best.
+ // The tolerance is 5% for compound prediction and 0.1% otherwise.
+ const double dth = has_second_ref(mbmi) ? 1.05 : 1.001;
+ // If the simple translation cost is not within this multiple of the
+ // best RD, skip it. Note that the cutoff is derived experimentally.
+ const double ref_dth = 5;
+ int result = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (mask_check_bit(good_indices, i) &&
+ (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth &&
+ (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) {
+ mask_set_bit(&result, i);
+ }
+ }
+ return result;
+ }
+
+ /*!\brief Motion mode information for inter mode search speedup.
+ *
+ * Used in a speed feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning candidates.
+ *
+ * Each instance holds a full copy of the block's mode info (not a pointer)
+ * together with the rate and RD figures recorded for that candidate.
+ */
+ typedef struct motion_mode_candidate {
+ /*!
+ * Mode info for the motion mode candidate (stored by value).
+ */
+ MB_MODE_INFO mbmi;
+ /*!
+ * Rate describing the cost of the motion vectors for this candidate.
+ */
+ int rate_mv;
+ /*!
+ * Rate before motion mode search and transform coding is applied.
+ */
+ int rate2_nocoeff;
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature for this
+ * candidate.
+ */
+ int skip_motion_mode;
+ /*!
+ * Total RD cost for this candidate.
+ */
+ int64_t rd_cost;
+ } motion_mode_candidate;
+
+/*!\cond */
+ // Fixed-capacity list of motion mode candidates.
+ typedef struct motion_mode_best_st_candidate {
+ // Candidate storage; capacity is MAX_WINNER_MOTION_MODES.
+ motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
+ // Presumably the number of entries of motion_mode_cand currently in use -
+ // confirm against the code that fills this struct.
+ int num_motion_mode_cand;
+ } motion_mode_best_st_candidate;
+
+ // Returns 1 if any reference frame used by the current block (one, or two
+ // for compound prediction) equals either reference frame of the given
+ // neighbouring (top/left) block, and 0 otherwise.
+ static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi,
+                                                    MB_MODE_INFO *nb_mbmi) {
+   const MV_REFERENCE_FRAME nb_ref0 = nb_mbmi->ref_frame[0];
+   const MV_REFERENCE_FRAME nb_ref1 = nb_mbmi->ref_frame[1];
+   const int num_cur_refs = has_second_ref(cur_mbmi) + 1;
+
+   for (int k = 0; k < num_cur_refs; ++k) {
+     const MV_REFERENCE_FRAME cur_ref = cur_mbmi->ref_frame[k];
+     if (cur_ref == nb_ref0 || cur_ref == nb_ref1) return 1;
+   }
+   return 0;
+ }
+
+ // Walks the blocks in the row directly above the current block and returns 1
+ // if an inter-coded one shares a reference frame with the current block.
+ // Also returns 1 when the above row is not available. Returns 0 otherwise.
+ static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
+                                                   MACROBLOCKD *xd) {
+   if (!xd->up_available) return 1;
+
+   // Entry of the mi array that sits directly above the current block's
+   // first column.
+   MB_MODE_INFO **const above_row = xd->mi - xd->mi_stride;
+   const int first_col = xd->mi_col;
+   const int last_col = AOMMIN(first_col + xd->width, total_mi_cols);
+
+   int col = first_col;
+   while (col < last_col) {
+     MB_MODE_INFO *const nb = above_row[col - first_col];
+     if (is_inter_block(nb) && ref_match_found_in_nb_blocks(xd->mi[0], nb)) {
+       return 1;
+     }
+     // Advance by the width (in mi units) of the neighbor just visited.
+     col += mi_size_wide[nb->bsize];
+   }
+   return 0;
+ }
+
+ // Walks the blocks in the column directly left of the current block and
+ // returns 1 if an inter-coded one shares a reference frame with the current
+ // block. Also returns 1 when the left column is not available. Returns 0
+ // otherwise.
+ static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
+                                                  MACROBLOCKD *xd) {
+   if (!xd->left_available) return 1;
+
+   // Entry of the mi array that sits directly left of the current block's
+   // first row.
+   MB_MODE_INFO **const left_col = xd->mi - 1;
+   const int first_row = xd->mi_row;
+   const int last_row = AOMMIN(first_row + xd->height, total_mi_rows);
+
+   int row = first_row;
+   while (row < last_row) {
+     MB_MODE_INFO *const nb = left_col[(row - first_row) * xd->mi_stride];
+     if (is_inter_block(nb) && ref_match_found_in_nb_blocks(xd->mi[0], nb)) {
+       return 1;
+     }
+     // Advance by the height (in mi units) of the neighbor just visited.
+     row += mi_size_high[nb->bsize];
+   }
+   return 0;
+ }
+/*!\endcond */
+
+ /*! \brief Struct used to hold TPL data to
+ * narrow down parts of the inter mode search.
+ */
+ typedef struct {
+ /*!
+ * The best inter cost out of all of the reference frames.
+ * INT64_MAX is used to signal that no usable cost is available.
+ */
+ int64_t best_inter_cost;
+ /*!
+ * The inter cost for each reference frame, indexed by
+ * (reference frame - 1).
+ */
+ int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
+ } PruneInfoFromTpl;
+
+#if !CONFIG_REALTIME_ONLY
+ // Collects block-level TPL statistics for the block at (mi_row, mi_col).
+ // For every reference frame the pred_error of all TPL units covered by the
+ // block is added to inter_cost_info_from_tpl->ref_inter_cost[], and
+ // best_inter_cost is set to the smallest non-zero sum among the references
+ // flagged in valid_refs (INT64_MAX if there is none).
+ // The sums are accumulated with +=, so the caller is expected to pass in
+ // zero-initialized ref_inter_cost[]. If the TPL stats are not ready the
+ // function returns without touching inter_cost_info_from_tpl.
+ // TODO(Remya): Check if get_tpl_stats_b() can be reused
+ static AOM_INLINE void get_block_level_tpl_stats(
+ AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs,
+ PruneInfoFromTpl *inter_cost_info_from_tpl) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return;
+ const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int tpl_stride = tpl_frame->stride;
+ // Size of one TPL stats unit, in mi units.
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ // Column positions are converted from coded to superres mi units; rows are
+ // used as they are.
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ // Visit every TPL unit inside the block, clipped to the frame boundaries.
+ for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
+ row += row_step) {
+ for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
+ col += col_step_sr) {
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ // Sums up the inter cost of corresponding ref frames
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx] +=
+ this_stats->pred_error[ref_idx];
+ }
+ }
+ }
+
+ // Computes the best inter cost (minimum inter_cost)
+ int64_t best_inter_cost = INT64_MAX;
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ const int64_t cur_inter_cost =
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx];
+ // For invalid ref frames, cur_inter_cost = 0 and has to be handled while
+ // calculating the minimum inter_cost
+ if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) &&
+ valid_refs[ref_idx])
+ best_inter_cost = cur_inter_cost;
+ }
+ inter_cost_info_from_tpl->best_inter_cost = best_inter_cost;
+ }
+#endif
+
+ // Decides from TPL inter costs whether a mode can be pruned.
+ // Returns 1 when the TPL cost of the mode's reference frame(s) exceeds
+ // (factor / 4) times the best TPL inter cost, and 0 otherwise. Modes that
+ // contain NEWMV are only considered when prune_mode_level >= 2, and nothing
+ // is pruned when no best cost is available (best_inter_cost == INT64_MAX).
+ static AOM_INLINE int prune_modes_based_on_tpl_stats(
+     PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
+     const PREDICTION_MODE this_mode, int prune_mode_level) {
+   // Multipliers (in quarter units) applied to the best inter cost. A lower
+   // value prunes more aggressively. Rows are selected by
+   // prune_mode_level - 1; columns 0, 1, 2 correspond to ref_mv_idx 0, 1, 2
+   // and the last column is used for GLOBALMV / GLOBAL_GLOBALMV.
+   static const int prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+     { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
+   };
+
+   if (prune_mode_level < 2 && have_newmv_in_inter_mode(this_mode)) return 0;
+
+   const int64_t tpl_best_cost = inter_cost_info_from_tpl->best_inter_cost;
+   if (tpl_best_cost == INT64_MAX) return 0;
+
+   // Cost of the mode: that of its single reference, or for compound
+   // prediction the larger of the two (which prunes more aggressively).
+   const int64_t *const ref_costs = inter_cost_info_from_tpl->ref_inter_cost;
+   int64_t mode_cost = ref_costs[refs[0] - 1];
+   if (refs[1] > INTRA_FRAME) {
+     const int64_t second_cost = ref_costs[refs[1] - 1];
+     mode_cost = AOMMAX(mode_cost, second_cost);
+   }
+
+   const int uses_global_mv =
+       (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV);
+   const int column = uses_global_mv ? MAX_REF_MV_SEARCH : ref_mv_idx;
+   const int64_t limit =
+       (prune_mul_factor[prune_mode_level - 1][column] * tpl_best_cost) >> 2;
+
+   return mode_cost > limit;
+ }
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] cur_mv Current motion vector.
+ * \param[in] bsize Current block size.
+ * \param[in,out] compmode_interinter_cost RD of the selected interinter
+ * compound mode.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
+ */
+ static int process_compound_inter_mode(
+ AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize,
+ int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers,
+ const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv,
+ RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const AV1_COMMON *cm = &cpi->common;
+ // Masked compound is considered only if the block size supports it and the
+ // sequence header enables it.
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ // All four compound types are enabled in the search mask.
+ int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+ (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+ const int num_planes = av1_num_planes(cm);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // Set by av1_compound_type_rd(); checked below before the luma build is
+ // skipped.
+ int is_luma_interp_done = 0;
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ int64_t best_rd_compound;
+ int64_t rd_thresh;
+ const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+ const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+ rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift),
+ comp_type_rd_scale);
+ // Select compound type and any parameters related to that type
+ // (for example, the mask parameters if it is a masked mode) and compute
+ // the RD
+ *compmode_interinter_cost = av1_compound_type_rd(
+ cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+ orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+ ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+ // Early termination: the scaled best compound RD is worse than the best RD
+ // so far. The destination buffers are restored before reporting the mode as
+ // not viable.
+ if (ref_best_rd < INT64_MAX &&
+ (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+ ref_best_rd) {
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 1;
+ }
+
+ // Build only uv predictor for COMPOUND_AVERAGE.
+ // Note there is no need to call av1_enc_build_inter_predictor
+ // for luma if COMPOUND_AVERAGE is selected because it is the first
+ // candidate in av1_compound_type_rd, which means it used the dst_buf
+ // rather than the tmp_buf.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) {
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ // Tell the caller that the predictor does not need to be built again.
+ *skip_build_pred = 1;
+ }
+ return 0;
+ }
+
+// Speed feature to prune out MVs that are similar to previous MVs if they
+// don't achieve the best RD advantage.
+ // Returns 1 if the evaluation of ref_mv_idx can be terminated because no
+ // best ref_mv_idx has been found yet (best_ref_mv_idx == -1) and the block's
+ // current mv(s) lie within a small distance of the mv(s) saved for an
+ // earlier index. Otherwise the current mv(s) are saved for later comparisons
+ // (when a slot exists for ref_mv_idx) and 0 is returned.
+ static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx,
+                                    int_mv save_mv[MAX_REF_MV_SEARCH - 1][2],
+                                    MB_MODE_INFO *mbmi, int pruning_factor) {
+   const int num_mvs = has_second_ref(mbmi) + 1;
+   // Largest summed absolute row/col difference still treated as a match.
+   const int max_diff = num_mvs << (pruning_factor + 1);
+
+   // A match can only terminate the evaluation while no best index exists.
+   if (best_ref_mv_idx == -1) {
+     for (int prev = 0; prev < ref_mv_idx; ++prev) {
+       // Slots holding INVALID_MV have nothing to compare against.
+       if (save_mv[prev][0].as_int == INVALID_MV) continue;
+
+       int diff = 0;
+       for (int k = 0; k < num_mvs; ++k) {
+         diff += abs(save_mv[prev][k].as_mv.row - mbmi->mv[k].as_mv.row);
+         diff += abs(save_mv[prev][k].as_mv.col - mbmi->mv[k].as_mv.col);
+       }
+       if (diff <= max_diff) return 1;
+     }
+   }
+
+   // Remember the current mv(s); the last index has no slot of its own.
+   if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+     for (int k = 0; k < num_mvs; ++k) {
+       save_mv[ref_mv_idx][k].as_int = mbmi->mv[k].as_int;
+     }
+   }
+   return 0;
+ }
+
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of zero mv and the best sse found in single new_mv. If the
+ * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped.
+ * Else returns 0.
+ *
+ * Note that the sse here comes from single_motion_search. So it is
+ * interpolated with the filter in motion search, not the actual interpolation
+ * filter used in encoding.
+ *
+ * \param[in] fn_ptr A table of function pointers to compute SSE.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize The current block_size.
+ * \param[in] args The args to handle_inter_mode, used to track
+ * the best SSE.
+ * \param[in] prune_zero_mv_with_sse The argument holds speed feature
+ * prune_zero_mv_with_sse value
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+ static AOM_INLINE int prune_zero_mv_with_sse(
+     const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+     const HandleInterModeArgs *args, int prune_zero_mv_with_sse) {
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   const MV_REFERENCE_FRAME *const refs = mbmi->ref_frame;
+   const int num_refs = has_second_ref(mbmi) + 1;
+
+   // Pass 1: make sure the comparison is meaningful for every reference.
+   for (int i = 0; i < num_refs; ++i) {
+     const MV_REFERENCE_FRAME ref = refs[i];
+     if (xd->global_motion[ref].wmtype != IDENTITY) {
+       // The pruning logic is only valid for IDENTITY models. In theory the
+       // same could be done for TRANSLATION models, but those are not coded
+       // due to a spec bug (see comments in gm_get_motion_vector() in
+       // av1/common/mv.h).
+       assert(xd->global_motion[ref].wmtype != TRANSLATION);
+       return 0;
+     }
+
+     assert(mbmi->mv[i].as_int == 0);
+     // No valid NEWMV sse recorded for this reference: do not prune.
+     if (args->best_single_sse_in_refs[ref] == INT32_MAX) return 0;
+   }
+
+   // Pass 2: accumulate the sse of the zero mv and of the best NEWMV.
+   const struct buf_2d *const src_buf = &x->plane[AOM_PLANE_Y].src;
+   unsigned int zero_mv_sse_total = 0;
+   unsigned int best_newmv_sse_total = 0;
+   for (int i = 0; i < num_refs; ++i) {
+     const struct buf_2d *const pre_buf = &xd->plane->pre[i];
+
+     unsigned int zero_mv_sse;
+     fn_ptr[bsize].vf(pre_buf->buf, pre_buf->stride, src_buf->buf,
+                      src_buf->stride, &zero_mv_sse);
+     zero_mv_sse_total += zero_mv_sse;
+
+     const unsigned int best_newmv_sse = args->best_single_sse_in_refs[refs[i]];
+     best_newmv_sse_total += best_newmv_sse;
+   }
+
+   // Speed feature levels above 1 prune without any margin; otherwise the
+   // zero mv sse has to exceed the NEWMV sse by more than 25%.
+   const double margin = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
+   return (double)zero_mv_sse_total > (margin * (double)best_newmv_sse_total);
+ }
+
+/*!\brief Searches for interpolation filter in realtime mode during winner eval
+ *
+ * \ingroup inter_mode_search
+ *
+ * Does a simple interpolation filter search during winner mode evaluation. This
+ * is currently only used by realtime mode as \ref
+ * av1_interpolation_filter_search is not called during realtime encoding.
+ *
+ * This function only searches over two possible filters. EIGHTTAP_REGULAR is
+ * always searched. For lowres clips (<= 240p), MULTITAP_SHARP is also
+ * searched. For higher res clips (>240p), EIGHTTAP_SMOOTH is also searched.
+ *
+ * \param[in] cpi Pointer to the compressor. Used for feature
+ * flags.
+ * \param[in,out] x Pointer to macroblock. This is primarily
+ * used to access the buffers.
+ * \param[in] mi_row The current row in mi unit (4X4 pixels).
+ * \param[in] mi_col The current col in mi unit (4X4 pixels).
+ * \param[in] bsize The current block_size.
+ * \return Returns true if a predictor is built in xd->dst, false otherwise.
+ */
+ static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ // Candidate filter pairs. Each iteration of the search loop below skips
+ // one of the last two entries depending on the frame resolution.
+ static const InterpFilters filters_ref_set[3] = {
+ { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ { MULTITAP_SHARP, MULTITAP_SHARP }
+ };
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+ // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best
+ const int num_planes = av1_num_planes(cm);
+ const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
+ // Only single-reference, simple-translation inter modes are expected here.
+ assert(is_inter_mode(mi->mode));
+ assert(mi->motion_mode == SIMPLE_TRANSLATION);
+ assert(!is_inter_compound_mode(mi->mode));
+
+ // Nothing to search (and no predictor is built) if interpolation is not
+ // needed for this block.
+ if (!av1_is_interp_needed(xd)) {
+ return false;
+ }
+
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ // Scratch buffers, laid out as three consecutive MAX_SB_SQUARE planes.
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+ const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };
+
+ for (int i = 0; i < 3; ++i) {
+ // Low resolution skips EIGHTTAP_SMOOTH; higher resolution skips
+ // MULTITAP_SHARP. Two filters are therefore evaluated in total.
+ if (is_240p_or_lesser) {
+ if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
+ continue;
+ }
+ } else {
+ if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
+ continue;
+ }
+ }
+ int64_t cost;
+ RD_STATS tmp_rd = { 0 };
+
+ // Build the luma predictor with the candidate filter and estimate its RD
+ // cost with a model; no transform search is performed here.
+ mi->interp_filters.as_filters = filters_ref_set[i];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
+ &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);
+
+ tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
+ if (cost < best_cost) {
+ best_filter_index = i;
+ best_cost = cost;
+ // Keep the predictor just built as the best one and direct the next
+ // candidate to the other buffer.
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ }
+ assert(best_filter_index >= 0);
+
+ mi->interp_filters.as_filters = filters_ref_set[best_filter_index];
+
+ const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];
+
+ if (is_best_pred_in_orig) {
+ // The best predictor already lives in the original buffer; swap once more
+ // so that xd points at it again.
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ } else {
+ // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if
+ // is_best_pred_in_orig is false, that means the current buffer is the
+ // original one.
+ assert(&orig_dst == dst_bufs[0]);
+ assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]);
+ // The best luma predictor is in the temporary buffer: copy it into the
+ // original destination.
+ const int width = block_size_wide[bsize];
+ const int height = block_size_high[bsize];
+#if CONFIG_AV1_HIGHBITDEPTH
+ const bool is_hbd = is_cur_buf_hbd(xd);
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]),
+ tmp_dst.stride[AOM_PLANE_Y],
+ CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]),
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ } else {
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y],
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ }
+#else
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y],
+ width, height);
+#endif
+ }
+
+ // Build the chroma (U and V) predictors; luma is already in place.
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_V);
+ }
+
+ return true;
+ }
+
+/*!\brief AV1 inter mode RD computation
+ *
+ * \ingroup inter_mode_search
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in] tmp_buf Temporary buffer used to hold predictors
+ * built in this search.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store
+ * motion mode information used in a speed
+ * feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning
+ * candidates.
+ * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to
+ * narrow down the search based on data
+ * collected in the TPL model.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ *
+ * \return The RD cost of the best candidate found for the mode being searched,
+ * or INT64_MAX if no candidate reference MV produced a valid result.
+ */
+static int64_t handle_inter_mode(
+ AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers,
+ int64_t *best_est_rd, const int do_tx_search,
+ InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand,
+ int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl,
+ int64_t *yrd) {
+ // Evaluates the inter mode already stored in xd->mi[0]->mode over every
+ // candidate reference-MV index and leaves the best candidate's mode info and
+ // RD stats in the output arguments.
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ // TPL-based pruning is compiled out for realtime-only builds.
+#if CONFIG_REALTIME_ONLY
+ const int prune_modes_based_on_tpl = 0;
+#else // CONFIG_REALTIME_ONLY
+ const TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const int prune_modes_based_on_tpl =
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
+ av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index);
+#endif // CONFIG_REALTIME_ONLY
+ int i;
+ // Reference frames for this mode
+ // A negative second reference is mapped to 0 so that refs[1] can always be
+ // used as an array index (see the modelled_rd lookup below).
+ const int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int rate_mv = 0;
+ int64_t rd = INT64_MAX;
+ // Do first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ // tmp_buf is split into three plane-sized regions of MAX_SB_SQUARE each.
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+
+ int64_t ret_val = INT64_MAX;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ // Snapshot of the best candidate seen so far; written back to the caller's
+ // structures after the main loop.
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ int64_t best_rd = INT64_MAX;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ int64_t best_yrd = INT64_MAX;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int best_xskip_txfm = 0;
+ int64_t newmv_ret_val = INT64_MAX;
+ inter_mode_info mode_info[MAX_REF_MV_SEARCH];
+
+ // Do not prune the mode based on inter cost from tpl if the current ref frame
+ // is the winner ref in neighbouring blocks.
+ int ref_match_found_in_above_nb = 0;
+ int ref_match_found_in_left_nb = 0;
+ if (prune_modes_based_on_tpl) {
+ ref_match_found_in_above_nb =
+ find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd);
+ ref_match_found_in_left_nb =
+ find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd);
+ }
+
+ // First, perform a simple translation search for each of the indices. If
+ // an index performs well, it will be fully searched in the main loop
+ // of this function.
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+ // Save MV results from first 2 ref_mv_idx.
+ int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
+ int best_ref_mv_idx = -1;
+ // Bitmask of the ref_mv_idx values selected for the full search; it is
+ // tested per index with mask_check_bit() in the main loop.
+ const int idx_mask =
+ ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set);
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ // Rate shared by every ref_mv_idx: reference frame, single/compound and
+ // mode signalling costs. The per-index DRL cost is added inside the loop.
+ const int base_rate =
+ args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
+
+ for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
+ save_mv[i][0].as_int = INVALID_MV;
+ save_mv[i][1].as_int = INVALID_MV;
+ }
+ args->start_mv_cnt = 0;
+
+ // Main loop of this function. This will iterate over all of the ref mvs
+ // in the dynamic reference list and do the following:
+ // 1.) Get the current MV. Create newmv MV if necessary
+ // 2.) Search compound type and parameters if applicable
+ // 3.) Do interpolation filter search
+ // 4.) Build the inter predictor
+ // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL,
+ // WARPED_CAUSAL)
+ // 6.) Update stats if best so far
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ // Initialise the per-index bookkeeping before any pruning so that the entry
+ // is well defined even when this index is skipped.
+ mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+ mode_info[ref_mv_idx].skip = 0;
+
+ if (!mask_check_bit(idx_mask, ref_mv_idx)) {
+ // MV did not perform well in simple translation search. Skip it.
+ continue;
+ }
+ if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
+ !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+ // Skip mode if TPL model indicates it will not be beneficial.
+ if (prune_modes_based_on_tpl_stats(
+ inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
+ continue;
+ }
+ av1_init_rd_stats(rd_stats);
+
+ // Initialize compound mode data
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ // Reset an INTRA_FRAME second reference to NONE_FRAME (presumably left over
+ // from an earlier inter-intra evaluation of this block; TODO confirm).
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ // Compute cost for signalling this DRL index
+ rd_stats->rate = base_rate;
+ rd_stats->rate += drl_cost;
+
+ int rs = 0;
+ int compmode_interinter_cost = 0;
+
+ int_mv cur_mv[2];
+
+ // TODO(Cherma): Extend this speed feature to support compound mode
+ int skip_repeated_ref_mv =
+ is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+ // Generate the current mv according to the prediction mode
+ if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+ continue;
+ }
+
+ // The above call to build_cur_mv does not handle NEWMV modes. Build
+ // the mv here if we have NEWMV for any predictors.
+ if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_newmv_time);
+#endif
+ newmv_ret_val =
+ handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_newmv_time);
+#endif
+
+ // A non-zero return value drops this candidate.
+ if (newmv_ret_val != 0) continue;
+
+ if (is_inter_singleref_mode(this_mode) &&
+ cur_mv[0].as_int != INVALID_MV) {
+ const MV_REFERENCE_FRAME ref = refs[0];
+ const unsigned int this_sse = x->pred_sse[ref];
+ // Track the lowest prediction SSE seen for this reference frame.
+ if (this_sse < args->best_single_sse_in_refs[ref]) {
+ args->best_single_sse_in_refs[ref] = this_sse;
+ }
+
+ if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) {
+ // scale_factor rows are selected by the speed-feature level minus one,
+ // columns by log2 of the block's pixel count minus 4.
+ const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1;
+ const int pix_idx = num_pels_log2_lookup[bsize] - 4;
+ const double scale_factor[3][11] = {
+ { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 },
+ { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 },
+ { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 }
+ };
+ assert(pix_idx >= 0);
+ assert(th_idx <= 2);
+ // Skip this NEWMV candidate when the best prediction SSE recorded in args
+ // is already below the scaled SSE of this candidate.
+ if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse)
+ continue;
+ }
+ }
+
+ rd_stats->rate += rate_mv;
+ }
+ // Copy the motion vector for this mode into mbmi struct
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+ // Early out when the rate alone (zero distortion) already costs more than
+ // the best RD so far; NEARESTMV and NEAREST_NEARESTMV are exempt.
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
+
+ // Skip the rest of the search if prune_ref_mv_idx_search speed feature
+ // is enabled, and the current MV is similar to a previous one.
+ // (Only applied to compound prediction.)
+ if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred &&
+ prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi,
+ cpi->sf.inter_sf.prune_ref_mv_idx_search))
+ continue;
+
+ if (cpi->sf.gm_sf.prune_zero_mv_with_sse &&
+ (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+ if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args,
+ cpi->sf.gm_sf.prune_zero_mv_with_sse)) {
+ continue;
+ }
+ }
+
+ // Set to 1 by the compound / interpolation-filter searches when they have
+ // already built the predictor, so it is not rebuilt below.
+ int skip_build_pred = 0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Handle a compound predictor, continue if it is determined this
+ // cannot be the best compound mode
+ if (is_comp_pred) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, compound_type_rd_time);
+#endif
+ const int not_best_mode = process_compound_inter_mode(
+ cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
+ rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
+ &skip_build_pred);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, compound_type_rd_time);
+#endif
+ if (not_best_mode) continue;
+ }
+
+ if (!args->skip_ifs) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interpolation_filter_search_time);
+#endif
+ // Determine the interpolation filter for this mode
+ ret_val = av1_interpolation_filter_search(
+ x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+ &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interpolation_filter_search_time);
+#endif
+ // Record the modelled RD of single-reference modes; the compound check
+ // below reads these entries back.
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ // Break out when roughly 3/8 of the modelled RD already exceeds the best
+ // RD so far.
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+
+ // Compute modelled RD if enabled
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ // Drop the compound candidate when roughly 3/4 of its modelled RD is
+ // still worse than the better of its two single-reference modelled RDs.
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
+ }
+
+ rd_stats->rate += compmode_interinter_cost;
+ if (skip_build_pred != 1) {
+ // Build this inter predictor if it has not been previously built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, motion_mode_rd_time);
+#endif
+ // Rate accumulated so far, captured before motion_mode_rd() updates
+ // rd_stats; stored in motion_mode_cand if this candidate becomes the best.
+ int rate2_nocoeff = rd_stats->rate;
+ // Determine the motion mode. This will be one of SIMPLE_TRANSLATION,
+ // OBMC_CAUSAL or WARPED_CAUSAL
+ int64_t this_yrd;
+ ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+ rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv,
+ &orig_dst, best_est_rd, do_tx_search,
+ inter_modes_info, 0, &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, motion_mode_rd_time);
+#endif
+ // motion_mode_rd() is expected to fail (INT64_MAX) whenever
+ // av1_check_newmv_joint_nonzero() does not hold.
+ assert(
+ IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX));
+
+ if (ret_val != INT64_MAX) {
+ int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_enum, NULL, bsize, tmp_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type,
+ do_tx_search);
+ if (tmp_rd < best_rd) {
+ best_yrd = this_yrd;
+ // Update the best rd stats if we found the best mode so far
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+ best_rd = tmp_rd;
+ best_mbmi = *mbmi;
+ best_xskip_txfm = txfm_info->skip_txfm;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map,
+ xd->height * xd->width);
+ motion_mode_cand->rate_mv = rate_mv;
+ motion_mode_cand->rate2_nocoeff = rate2_nocoeff;
+ }
+
+ // Tighten the threshold used by the remaining iterations and remember
+ // which ref_mv_idx achieved it.
+ if (tmp_rd < ref_best_rd) {
+ ref_best_rd = tmp_rd;
+ best_ref_mv_idx = ref_mv_idx;
+ }
+ }
+ // Point xd's destination buffers back at the original ones before the next
+ // candidate is evaluated.
+ restore_dst_buf(xd, orig_dst, num_planes);
+ }
+
+ // No candidate produced a valid RD cost.
+ if (best_rd == INT64_MAX) return INT64_MAX;
+
+ // re-instate status of the best choice
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+ *yrd = best_yrd;
+ *mbmi = best_mbmi;
+ txfm_info->skip_txfm = best_xskip_txfm;
+ assert(IMPLIES(mbmi->comp_group_idx == 1,
+ mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+ return rd_stats->rdcost;
+}
+
+/*!\brief Search for the best intrabc predictor
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function performs a motion search to find the best intrabc predictor.
+ * Two search regions are tried in turn: rows above the current superblock
+ * row, and columns to the left of the current superblock column. Each
+ * resulting displacement vector is validated and then evaluated with a
+ * transform search. On return the block's mode info and \c rd_stats hold the
+ * best intrabc candidate, or their incoming values if no candidate beats
+ * \c best_rd.
+ *
+ * \returns Returns the best overall rdcost (including the non-intrabc modes
+ * search before this function), or INT64_MAX if intrabc is disallowed or
+ * disabled, or non-RD mode picking is in use.
+ */
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
+ !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode)
+ return INT64_MAX;
+ const int num_planes = av1_num_planes(cm);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ // Superblock row/column containing this block.
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+
+ // Build the reference DV list; intrabc uses the INTRA_FRAME slot.
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ int_mv nearestmv, nearmv;
+ av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+ 0);
+
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
+ // Prefer nearestmv as the reference DV, fall back to nearmv, and finally to
+ // av1_find_ref_dv() when both are zero.
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0) {
+ av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row);
+ }
+ // Ref DV should not have sub-pel.
+ assert((dv_ref.as_mv.col & 7) == 0);
+ assert((dv_ref.as_mv.row & 7) == 0);
+ mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
+
+ // The prediction source is the current frame buffer itself.
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ enum IntrabcMotionDirection {
+ IBC_MOTION_ABOVE,
+ IBC_MOTION_LEFT,
+ IBC_MOTION_DIRECTIONS
+ };
+
+ // Best state so far starts as the incoming (non-intrabc) mode and stats.
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdstats = *rd_stats;
+ // NOTE(review): best_blk_skip starts zeroed (unlike best_tx_type_map, which
+ // is seeded from xd->tx_type_map) and is copied back unconditionally after
+ // the loop, so txfm_info->blk_skip is zeroed when no intrabc candidate wins.
+ // Confirm callers do not rely on the previous contents in that case.
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+
+ FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *lookahead_search_sites =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+ av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
+ &dv_ref.as_mv, start_mv,
+ lookahead_search_sites, search_method,
+ /*fine_search_interval=*/0);
+ const IntraBCMVCosts *const dv_costs = x->dv_costs;
+ av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
+
+ for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
+ dir < IBC_MOTION_DIRECTIONS; ++dir) {
+ // Limits are expressed relative to the current block position, in pixels.
+ switch (dir) {
+ case IBC_MOTION_ABOVE:
+ // Any column inside the tile, but the block must end at or above the top
+ // edge of the current superblock row.
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ fullms_params.mv_limits.row_max =
+ (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h;
+ break;
+ case IBC_MOTION_LEFT:
+ // The block must end at or left of the left edge of the current
+ // superblock column; rows may extend to the bottom of the current
+ // superblock row, capped at the tile's end row.
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
+ fullms_params.mv_limits.row_max =
+ (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+ // NOTE(review): each of the four asserts below compares a field with
+ // itself and is therefore always true. They look like they were meant to
+ // check the new limits against a previously saved set of limits; confirm
+ // the intended reference and fix or remove.
+ assert(fullms_params.mv_limits.col_min >= fullms_params.mv_limits.col_min);
+ assert(fullms_params.mv_limits.col_max <= fullms_params.mv_limits.col_max);
+ assert(fullms_params.mv_limits.row_min >= fullms_params.mv_limits.row_min);
+ assert(fullms_params.mv_limits.row_max <= fullms_params.mv_limits.row_max);
+
+ av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+
+ // Nothing to search in this direction if the range is empty.
+ if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+ fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+ continue;
+ }
+
+ const int step_param = cpi->mv_search_params.mv_step_param;
+ IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+ int_mv best_mv, best_hash_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ // Run both the full-pixel search and the hash-based search and keep
+ // whichever reports the lower error.
+ int bestsme =
+ av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+ const int hashsme = av1_intrabc_hash_search(
+ cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+ if (hashsme < bestsme) {
+ best_mv = best_hash_mv;
+ bestsme = hashsme;
+ }
+
+ // Neither search found a usable vector.
+ if (bestsme == INT_MAX) continue;
+ const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+ get_fullmv_from_mv(&dv)))
+ continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params->mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ // Configure the block as an intrabc block using this DV and build its
+ // predictor.
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip_txfm = 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
+ dv_costs->dv_costs, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->mode_costs.intrabc_cost[1];
+ RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
+ &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+ continue;
+ rd_stats_yuv.rdcost =
+ RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+ // Keep this candidate if it beats everything seen so far, including the
+ // caller-supplied best_rd.
+ if (rd_stats_yuv.rdcost < best_rd) {
+ best_rd = rd_stats_yuv.rdcost;
+ best_mbmi = *mbmi;
+ best_rdstats = rd_stats_yuv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+ }
+ }
+ // Restore the best state; this also undoes the intrabc fields written into
+ // *mbmi by candidates that did not win.
+ *mbmi = best_mbmi;
+ *rd_stats = best_rdstats;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = *rd_stats;
+#endif
+ return best_rd;
+}
+
+// TODO(chiyotsai@google.com): We are using struct $struct_name instead of their
+// typedef here because Doxygen doesn't know about the typedefs yet. So using
+// the typedef will prevent doxygen from finding this function and generating
+// the callgraph. Once documents for AV1_COMP and MACROBLOCK are added to
+// doxygen, we can revert back to using the typedefs.
+// Picks the best intra coding mode for a block by RD search.
+//
+// The luma intra modes are searched first. If their RD cost beats best_rd,
+// the chroma intra modes are searched too (only when the frame has more than
+// one plane) and the combined cost is written to rd_cost; otherwise
+// rd_cost->rate is set to INT_MAX. Intra block copy is then tried against the
+// best cost so far. Unless rd_cost->rate is still INT_MAX, the chosen mode
+// info, mbmi_ext data and tx type map are saved into ctx.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int luma_rate = 0, luma_rate_tokenonly = 0;
+  int chroma_rate = 0, chroma_rate_tokenonly = 0;
+  int64_t luma_dist = 0, chroma_dist = 0;
+  uint8_t luma_skip_txfm = 0, chroma_skip_txfm = 0;
+
+  // Start from a plain intra block: no second reference, no intrabc, zero MV
+  // and no skip mode.
+  ctx->rd_stats.skip_txfm = 0;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->use_intrabc = 0;
+  mbmi->mv[0].as_int = 0;
+  mbmi->skip_mode = 0;
+
+  const int64_t luma_rd = av1_rd_pick_intra_sby_mode(
+      cpi, x, &luma_rate, &luma_rate_tokenonly, &luma_dist, &luma_skip_txfm,
+      bsize, best_rd, ctx);
+
+  // Switch back to the default mode evaluation parameters.
+  set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+  if (luma_rd >= best_rd) {
+    // Luma alone is already no better than best_rd: mark the result invalid.
+    rd_cost->rate = INT_MAX;
+  } else {
+    if (num_planes > 1) {
+      // Chroma-from-luma may need to reproduce the luma prediction, so load
+      // the transform state saved in ctx before searching chroma.
+      const int needs_luma_tx_state =
+          xd->is_chroma_ref && store_cfl_required_rdo(cm, x);
+      if (needs_luma_tx_state) {
+        memcpy(txfm_info->blk_skip, ctx->blk_skip,
+               sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+        av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+      }
+      const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      av1_rd_pick_intra_sbuv_mode(cpi, x, &chroma_rate, &chroma_rate_tokenonly,
+                                  &chroma_dist, &chroma_skip_txfm, bsize,
+                                  max_uv_tx_size);
+    }
+
+    // An intra block is always coded as non-skip, so charge the cost of
+    // signalling skip_txfm = 0.
+    const int skip_ctx = av1_get_skip_txfm_context(xd);
+    rd_cost->rate = luma_rate + chroma_rate +
+                    x->mode_costs.skip_txfm_cost[skip_ctx][0];
+    rd_cost->dist = luma_dist + chroma_dist;
+    rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+    rd_cost->skip_txfm = 0;
+  }
+
+  // Tighten the threshold with the intra result before trying intrabc.
+  if (rd_cost->rate != INT_MAX) {
+    best_rd = AOMMIN(best_rd, rd_cost->rdcost);
+  }
+
+  const int64_t intrabc_rd =
+      rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd);
+  if (intrabc_rd < best_rd) {
+    // Intrabc won: record its skip flag and block-skip map in the context.
+    ctx->rd_stats.skip_txfm = mbmi->skip_txfm;
+    memcpy(ctx->blk_skip, txfm_info->blk_skip,
+           sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+    assert(rd_cost->rate != INT_MAX);
+  }
+
+  if (rd_cost->rate != INT_MAX) {
+    ctx->mic = *xd->mi[0];
+    av1_copy_mbmi_ext_to_mbmi_ext_frame(
+        &ctx->mbmi_ext_best, &x->mbmi_ext,
+        av1_ref_frame_type(xd->mi[0]->ref_frame));
+    av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+  }
+}
+
+// Forward declaration only: the prototype is given here so the function can be
+// referenced before its definition.
+// NOTE(review): judging by the parameter names it takes the above and left
+// predictor buffers together with their strides; confirm against the
+// definition.
+static AOM_INLINE void calc_target_weighted_pred(
+ const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+ const uint8_t *above, int above_stride, const uint8_t *left,
+ int left_stride);
+
+// Evaluates skip mode for the current block and adopts it when its RD cost is
+// no worse than the best intra/inter mode found so far.
+//
+// rd_cost      [in,out] Best RD stats so far. When valid, its rate is
+//                       increased by the cost of signalling "not skip mode".
+//                       It is overwritten with the skip-mode stats if skip
+//                       mode wins.
+// search_state [in,out] Best-mode bookkeeping; updated only if skip mode wins.
+// cpi, x, bsize         Encoder state, current macroblock and block size.
+// yv12_mb               Per-reference, per-plane prediction buffers, used to
+//                       set up xd->plane[].pre[] for both skip-mode references.
+//
+// Returns early, leaving rd_cost and search_state untouched, when the frame
+// has no valid skip-mode references, the reference pair has no valid mode
+// index, one-sided compound is disabled while all references are one-sided,
+// or the reference MVs cannot be built. x->compound_idx and several fields of
+// the current mbmi are modified regardless of the outcome.
+static AOM_INLINE void rd_pick_skip_mode(
+    RD_STATS *rd_cost, InterModeSearchState *search_state,
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  x->compound_idx = 1;  // COMPOUND_AVERAGE
+  RD_STATS skip_mode_rd_stats;
+  av1_invalid_rd_stats(&skip_mode_rd_stats);
+
+  if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX ||
+      skip_mode_info->ref_frame_idx_1 == INVALID_IDX) {
+    return;
+  }
+
+  // Skip mode always uses NEAREST_NEARESTMV with the two frame-level
+  // skip-mode references.
+  const MV_REFERENCE_FRAME ref_frame =
+      LAST_FRAME + skip_mode_info->ref_frame_idx_0;
+  const MV_REFERENCE_FRAME second_ref_frame =
+      LAST_FRAME + skip_mode_info->ref_frame_idx_1;
+  const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+  const THR_MODES mode_index =
+      get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+  if (mode_index == THR_INVALID) {
+    return;
+  }
+
+  if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      cpi->all_one_sided_refs) {
+    return;
+  }
+
+  mbmi->mode = this_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = ref_frame;
+  mbmi->ref_frame[1] = second_ref_frame;
+  const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  // A count of UINT8_MAX marks a reference MV list that has not been built.
+  // The compound list is built here on demand, but only if both
+  // single-reference lists are already available.
+  if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) {
+    MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext;
+    if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+        mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+      return;
+    }
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+                     xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                     mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
+  }
+
+  assert(this_mode == NEAREST_NEARESTMV);
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
+    return;
+  }
+
+  // Fill in the remaining mode info for a skip-mode block.
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = x->compound_idx;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = 0;
+  mbmi->skip_mode = mbmi->skip_txfm = 1;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+    xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+  }
+
+  BUFFER_SET orig_dst;
+  for (int i = 0; i < num_planes; i++) {
+    orig_dst.plane[i] = xd->plane[i].dst.buf;
+    orig_dst.stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // Compare the use of skip_mode with the best intra/inter mode obtained.
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  int64_t best_intra_inter_mode_cost = INT64_MAX;
+  if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+    const ModeCosts *mode_costs = &x->mode_costs;
+    best_intra_inter_mode_cost = RDCOST(
+        x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0],
+        rd_cost->dist);
+    // Account for non-skip mode rate in total rd stats
+    rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0];
+    av1_rd_cost_update(x->rdmult, rd_cost);
+  }
+
+  // Obtain the rdcost for skip_mode.
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst,
+               best_intra_inter_mode_cost);
+
+  // Adopt skip mode on a tie as well. In lossless segments it is only allowed
+  // when it introduces no distortion.
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+      (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
+    assert(mode_index != THR_INVALID);
+    // Copy the whole mode info first and set the flag afterwards; the
+    // original order made the flag assignment a dead store that the struct
+    // copy immediately overwrote.
+    search_state->best_mbmode = *mbmi;
+    search_state->best_mbmode.skip_mode = 1;
+    memset(search_state->best_mbmode.inter_tx_size,
+           search_state->best_mbmode.tx_size,
+           sizeof(search_state->best_mbmode.inter_tx_size));
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
+                  search_state->best_mbmode.skip_txfm && is_inter_block(mbmi),
+                  xd);
+    search_state->best_mode_index = mode_index;
+
+    // Update rd_cost
+    rd_cost->rate = skip_mode_rd_stats.rate;
+    rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+    rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+    search_state->best_rd = rd_cost->rdcost;
+    search_state->best_skip2 = 1;
+    search_state->best_mode_skippable = 1;
+
+    x->txfm_search_info.skip_txfm = 1;
+  }
+}
+
+// Returns the mode info of the winner candidate at 'mode_idx' and writes its
+// RD cost, luma/chroma rates and mode index through the winner_* outputs.
+// With multi-winner processing enabled the candidate is read from
+// x->winner_mode_stats[mode_idx]; otherwise the single best mode supplied by
+// the caller is handed back unchanged.
+static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats(
+    MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
+    int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
+    RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
+    THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type,
+    int mode_idx) {
+  if (!multi_winner_mode_type) {
+    // Single-winner path: forward the caller's best-mode values as they are.
+    *winner_rd_cost = best_rd_cost;
+    *winner_rate_y = best_rate_y;
+    *winner_rate_uv = best_rate_uv;
+    *winner_mode_index = *best_mode_index;
+    return best_mbmode;
+  }
+
+  // Multi-winner path: look the candidate up in the per-block winner list.
+  assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
+  WinnerModeStats *const candidate = &x->winner_mode_stats[mode_idx];
+  *winner_rd_cost = &candidate->rd_cost;
+  *winner_rate_y = candidate->rate_y;
+  *winner_rate_uv = candidate->rate_uv;
+  *winner_mode_index = candidate->mode_index;
+  return &candidate->mbmi;
+}
+
+// Speed feature: fast intra/inter transform type search. Used for speed >= 2.
+// When this speed feature is on, in rd mode search, only DCT is used.
+// After the mode is determined, this function is called to select transform
+// types and get an accurate rdcost for each winner candidate. A candidate
+// that beats the best rdcost so far updates *best_mbmode, *rd_cost and ctx.
+static AOM_INLINE void refine_winner_mode_tx(
+ const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index,
+ MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int64_t best_rd;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode,
+ rd_cost->skip_txfm))
+ return;  // Winner mode processing is off for the current best mode.
+
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // No best mode identified so far
+ if (*best_mode_index == THR_INVALID) return;
+
+ // Baseline to beat: RD cost of the best mode as passed in by the caller.
+ best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) {
+ RD_STATS *winner_rd_stats = NULL;
+ int winner_rate_y = 0, winner_rate_uv = 0;
+ THR_MODES winner_mode_index = 0;
+
+ // TODO(any): Combine best mode and multi-winner mode processing paths
+ // Get winner mode stats for current mode index
+ MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
+ x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
+ &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
+
+ // Only non-lossless, valid candidates with processing enabled are refined.
+ if (xd->lossless[winner_mbmi->segment_id] == 0 &&
+ winner_mode_index != THR_INVALID &&
+ is_winner_mode_processing_enabled(cpi, x, winner_mbmi,
+ rd_cost->skip_txfm)) {
+ RD_STATS rd_stats = *winner_rd_stats;
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+ *mbmi = *winner_mbmi;  // Make the candidate the block's current mode info.
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ bool is_predictor_built = false;
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ // Do interpolation filter search for realtime mode if applicable.
+ if (cpi->sf.winner_mode_sf.winner_mode_ifs &&
+ cpi->oxcf.mode == REALTIME &&
+ cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+ is_inter_mode(prediction_mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ !is_inter_compound_mode(prediction_mode)) {
+ is_predictor_built =
+ fast_interp_search(cpi, x, mi_row, mi_col, bsize);
+ }
+ if (!is_predictor_built) {  // Build the inter predictor for all planes.
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+
+ av1_subtract_plane(x, bsize, 0);  // Plane 0 (luma) residual.
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX);
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ // Uniform search: replicate the chosen tx size and skip flag block-wide.
+ memset(mbmi->inter_tx_size, mbmi->tx_size,
+ sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm);
+ }
+ } else {  // Non-inter candidate: uniform transform search on luma.
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ }
+
+ if (num_planes > 1) {
+ av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);  // No chroma planes to evaluate.
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // For inter modes, compare coding the residual against skipping it
+ // (skip cost uses the sse as distortion) and keep the cheaper option.
+ if (is_inter_mode(mbmi->mode) &&
+ RDCOST(x->rdmult,
+ mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate +
+ rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1],
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ }
+ // Swap the candidate's previous luma/chroma rates for the refined ones.
+ int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
+ winner_rate_y - winner_rate_uv;
+ int64_t this_rd =
+ RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist));
+ if (best_rd > this_rd) {  // Strictly better: adopt this candidate.
+ *best_mbmode = *mbmi;
+ *best_mode_index = winner_mode_index;
+ av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ rd_cost->rate = this_rate;
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ *best_skip2 = skip_blk;
+ }
+ }
+ }
+}
+
+/*!\cond */
+typedef struct {
+ // Per first-reference-frame bit mask: when bit (1 << mode) is set, that
+ // prediction mode must NOT be tried with the reference during search.
+ uint32_t pred_modes[REF_FRAMES];
+ // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of
+ // reference frames (i, j), where i is the first and j the second reference.
+ // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1
+ // (NONE_FRAME), which therefore maps to column 0.
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
+} mode_skip_mask_t;
+/*!\endcond */
+
+// Marks every pairing that has 'ref' as its first reference as disabled in
+// 'ref_combo': the single-reference case (second reference NONE_FRAME) as
+// well as all pairs with a second reference.
+static AOM_INLINE void disable_reference(
+    MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+  bool *const row = ref_combo[ref];
+  // Columns are indexed by 'second reference + 1', so the walk starts at the
+  // NONE_FRAME column and ends at the last reference frame's column.
+  for (int col = NONE_FRAME + 1; col < REF_FRAMES + 1; ++col) {
+    row[col] = true;
+  }
+}
+
+// Disables, in 'ref_combo', every inter reference other than ALTREF_FRAME as
+// a first reference.
+static AOM_INLINE void disable_inter_references_except_altref(
+    bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+  const MV_REFERENCE_FRAME non_altref_refs[] = {
+    LAST_FRAME,   LAST2_FRAME,  LAST3_FRAME,
+    GOLDEN_FRAME, BWDREF_FRAME, ALTREF2_FRAME,
+  };
+  const int num_refs =
+      (int)(sizeof(non_altref_refs) / sizeof(non_altref_refs[0]));
+
+  for (int idx = 0; idx < num_refs; ++idx) {
+    disable_reference(non_altref_refs[idx], ref_combo);
+  }
+}
+
+// { first reference, second reference } pairs that default_skip_mask() leaves
+// enabled when the reduced reference set (REF_SET_REDUCED) is selected. A
+// second entry of NONE_FRAME denotes use of the first reference on its own.
+static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
+  { LAST_FRAME, NONE_FRAME },
+  { ALTREF_FRAME, NONE_FRAME },
+  { LAST_FRAME, ALTREF_FRAME },
+  { GOLDEN_FRAME, NONE_FRAME },
+  { INTRA_FRAME, NONE_FRAME },
+  { GOLDEN_FRAME, ALTREF_FRAME },
+  { LAST_FRAME, GOLDEN_FRAME },
+  { LAST_FRAME, INTRA_FRAME },
+  { LAST_FRAME, BWDREF_FRAME },
+  { LAST_FRAME, LAST3_FRAME },
+  { GOLDEN_FRAME, BWDREF_FRAME },
+  { GOLDEN_FRAME, INTRA_FRAME },
+  { BWDREF_FRAME, NONE_FRAME },
+  { BWDREF_FRAME, ALTREF_FRAME },
+  { ALTREF_FRAME, INTRA_FRAME },
+  { BWDREF_FRAME, INTRA_FRAME },
+};
+
+// Selects which reference combinations default_skip_mask() enables.
+typedef enum {
+  REF_SET_FULL,     // Every mode and reference combination is available.
+  REF_SET_REDUCED,  // Only the pairs listed in reduced_ref_combos.
+  REF_SET_REALTIME  // Only the pairs listed in real_time_ref_combos.
+} REF_SET;
+
+// Resets 'mask' to the default state for the given reference set.
+//
+// REF_SET_FULL clears the whole mask, so nothing is skipped. For the other
+// sets all prediction modes stay available, every reference combination is
+// disabled first, and then only the combinations listed for that set
+// (reduced_ref_combos or real_time_ref_combos) are re-enabled.
+static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask,
+                                         REF_SET ref_set) {
+  if (ref_set == REF_SET_FULL) {
+    // Everything available by default.
+    memset(mask, 0, sizeof(*mask));
+    return;
+  }
+
+  // All modes available by default.
+  memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
+  // All references disabled first.
+  for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
+    for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+      mask->ref_combo[ref1][ref2 + 1] = true;
+    }
+  }
+
+  // Initialized up front so that an unexpected 'ref_set' leaves both in a
+  // well-defined state even when assertions are compiled out.
+  const MV_REFERENCE_FRAME(*ref_set_combos)[2] = NULL;
+  int num_ref_combos = 0;
+
+  // Then enable reduced set of references explicitly. The element count is
+  // computed in size_t and only the final quotient is narrowed to int.
+  switch (ref_set) {
+    case REF_SET_REDUCED:
+      ref_set_combos = reduced_ref_combos;
+      num_ref_combos =
+          (int)(sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]));
+      break;
+    case REF_SET_REALTIME:
+      ref_set_combos = real_time_ref_combos;
+      num_ref_combos =
+          (int)(sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]));
+      break;
+    default: assert(0); break;
+  }
+
+  for (int i = 0; i < num_ref_combos; ++i) {
+    const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
+    mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
+  }
+}
+/* Builds the mode/reference skip mask used while searching the current block.
+ * Starts from default_skip_mask() for the active reference set, then marks
+ * further references and prediction modes as "do not try" based on the
+ * available reference flags, segment features, pred_mv_sad based pruning
+ * speed features, the block size and the encoder configuration.
+ */
+static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+ REF_SET ref_set = REF_SET_FULL;
+
+ if (sf->rt_sf.use_real_time_ref_set)  // Real-time set takes precedence.
+ ref_set = REF_SET_REALTIME;
+ else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set)
+ ref_set = REF_SET_REDUCED;
+
+ default_skip_mask(mask, ref_set);
+
+ int min_pred_mv_sad = INT_MAX;  // Smallest pred_mv_sad among considered refs.
+ MV_REFERENCE_FRAME ref_frame;
+ if (ref_set == REF_SET_REALTIME) {
+ // For real-time encoding, we only look at a subset of ref frames. So the
+ // threshold for pruning should be computed from this subset as well.
+ const int num_rt_refs =
+ sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos);
+ for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]);
+ }
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+ // Skip checking missing reference in both single and compound reference
+ // modes.
+ disable_reference(ref_frame, mask->ref_combo);
+ } else {
+ // Skip fixed mv modes for poor references, i.e. those whose pred_mv_sad
+ // is still above the minimum after being divided by four.
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ // Reference not used for the segment.
+ disable_reference(ref_frame, mask->ref_combo);
+ }
+ }
+ // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature
+ // is disabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) {
+ disable_inter_references_except_altref(mask->ref_combo);
+
+ // Overwrites earlier bits: skip all but the nearest/near/zero-type modes.
+ mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames,
+ &x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+
+ // NEARMV/NEARESTMV are kept only when they equal the global MV.
+ if (near_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (inter_sf->alt_ref_search_fp &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
+ // Allow every mode on ALTREF but drop all other inter refs and intra.
+ mask->pred_modes[ALTREF_FRAME] = 0;
+ disable_inter_references_except_altref(mask->ref_combo);
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+ }
+
+ if (inter_sf->alt_ref_search_fp) {
+ if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
+ // Threshold: best pred_mv_sad of index 0 plus one eighth of it.
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
+ // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
+ // those are past frames
+ MV_REFERENCE_FRAME start_frame =
+ inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+ for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0) {
+ // Prune inter modes when relative dist of ALTREF2 and ALTREF is close
+ // to the relative dist of LAST_FRAME; at level 1, references more than
+ // 1.5x as far away as LAST_FRAME are left untouched.
+ if (inter_sf->alt_ref_search_fp == 1 &&
+ (abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[ref_frame - LAST_FRAME]) >
+ 1.5 * abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) {
+ continue;
+ }
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_ALL;
+ }
+ }
+ }
+ }
+
+ if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ if (x->best_pred_mv_sad[0] < INT_MAX) {
+ // Threshold: best pred_mv_sad of index 0 plus one half of it.
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1);
+ const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME };
+
+ // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references
+ for (int ref_idx = 0; ref_idx < 2; ref_idx++) {
+ ref_frame = prune_ref_list[ref_idx];
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ }
+
+ if (bsize > sf->part_sf.max_intra_bsize) {  // Block too large for intra.
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+
+ if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+ // Global motion is off: never try the global-MV modes on any reference.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+ mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+ }
+ }
+
+ // Skip every intra mode not set in the speed feature's intra_y_mode_mask.
+ mask->pred_modes[INTRA_FRAME] |=
+ ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+
+ // Prune reference frames which are not the closest to the current
+ // frame and with large pred_mv_sad.
+ if (inter_sf->prune_single_ref) {
+ assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3);
+ const double prune_threshes[2] = { 1.20, 1.05 };  // Indexed by level - 1.
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefFrameDistanceInfo *const ref_frame_dist_info =
+ &cpi->ref_frame_dist_info;
+ const int is_closest_ref =
+ (ref_frame == ref_frame_dist_info->nearest_past_ref) ||
+ (ref_frame == ref_frame_dist_info->nearest_future_ref);
+
+ if (!is_closest_ref) {
+ // dir 0: negative relative distance; dir 1: zero or positive.
+ const int dir =
+ (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+ ? 0
+ : 1;
+ if (x->best_pred_mv_sad[dir] < INT_MAX &&
+ x->pred_mv_sad[ref_frame] >
+ prune_threshes[inter_sf->prune_single_ref - 1] *
+ x->best_pred_mv_sad[dir])
+ mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL;
+ }
+ }
+ }
+}
+
+// Points the three above and three left prediction buffers of 'args' into the
+// storage held by 'obmc_buffer', at sample offsets 0, MAX_SB_SQUARE / 2 and
+// MAX_SB_SQUARE. When 'is_hbd' is set the offsets are scaled by
+// sizeof(uint16_t) and each address is wrapped with CONVERT_TO_BYTEPTR.
+static AOM_INLINE void init_neighbor_pred_buf(
+    const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
+    int is_hbd) {
+  const int sample_offsets[3] = { 0, MAX_SB_SQUARE >> 1, MAX_SB_SQUARE };
+
+  if (!is_hbd) {
+    for (int buf = 0; buf < 3; ++buf) {
+      args->above_pred_buf[buf] = obmc_buffer->above_pred + sample_offsets[buf];
+      args->left_pred_buf[buf] = obmc_buffer->left_pred + sample_offsets[buf];
+    }
+    return;
+  }
+
+  const int bytes_per_sample = sizeof(uint16_t);
+  for (int buf = 0; buf < 3; ++buf) {
+    const int byte_offset = sample_offsets[buf] * bytes_per_sample;
+    args->above_pred_buf[buf] =
+        CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + byte_offset);
+    args->left_pred_buf[buf] =
+        CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + byte_offset);
+  }
+}
+
+// Returns 1 when 'ref_frame' should be pruned from the search, 0 otherwise.
+// A reference is pruned when its bit is set in cpi->prune_ref_frame_mask, or
+// when prune_ref_by_selective_ref_frame() rejects the reference pair that
+// av1_set_ref_frame() derives from it.
+static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                      MV_REFERENCE_FRAME ref_frame) {
+  MV_REFERENCE_FRAME ref_pair[2];
+  av1_set_ref_frame(ref_pair, ref_frame);
+
+  const int masked_out = (cpi->prune_ref_frame_mask >> ref_frame) & 1;
+  if (masked_out) return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int selectively_pruned = prune_ref_by_selective_ref_frame(
+      cpi, x, ref_pair, cm->cur_frame->ref_display_order_hint);
+  return selectively_pruned ? 1 : 0;
+}
+
+// Returns 1 when 'ref_frame' is one of the two references of any compound
+// reference type whose bit is NOT set in 'skip_ref_frame_mask', 0 otherwise.
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+    int ref_frame, int skip_ref_frame_mask) {
+  for (int comp_type = ALTREF_FRAME + 1; comp_type < MODE_CTX_REF_FRAMES;
+       ++comp_type) {
+    // Compound types that are themselves skipped do not keep a ref alive.
+    if (skip_ref_frame_mask & (1 << comp_type)) continue;
+
+    const MV_REFERENCE_FRAME *const pair = ref_frame_map[comp_type - REF_FRAMES];
+    if (pair[0] == ref_frame || pair[1] == ref_frame) return 1;
+  }
+  return 0;
+}
+
+// Returns nonzero when the cached mode info 'mi_cache' uses 'ref_frame'.
+// For a single reference (ref_frame < REF_FRAMES) either cached reference
+// may match; otherwise 'ref_frame' is compared with the reference type that
+// av1_ref_frame_type() derives from the cached pair. Returns 0 when there is
+// no cache.
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+                                                 const MB_MODE_INFO *mi_cache) {
+  if (mi_cache == NULL) return 0;
+
+  if (ref_frame >= REF_FRAMES) {
+    // if we are here, then the current mode is compound.
+    const MV_REFERENCE_FRAME cached_ref_type =
+        av1_ref_frame_type(mi_cache->ref_frame);
+    return ref_frame == cached_ref_type;
+  }
+
+  const int matches_first = (ref_frame == mi_cache->ref_frame[0]);
+  return matches_first || ref_frame == mi_cache->ref_frame[1];
+}
+
+// Per-block setup ahead of the inter mode RD search. Please add/modify
+// parameter setting here, keeping it consistent and easy to read and maintain.
+static AOM_INLINE void set_params_rd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask,
+ unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES],
+ struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ unsigned char segment_id = mbmi->segment_id;
+
+ init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
+ av1_collect_neighbors_ref_counts(xd);
+ estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ x->best_pred_mv_sad[0] = INT_MAX;  // [0]: refs with negative relative dist.
+ x->best_pred_mv_sad[1] = INT_MAX;  // [1]: all remaining refs.
+
+ // Single references: reset per-ref state and set up buffers/ref MVs.
+ for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+ ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ // Skip the ref frame if the mask says skip and the ref is not used by
+ // compound ref or by the cached mode info.
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+ assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
+ }
+ if (cpi->sf.inter_sf.alt_ref_search_fp ||
+ cpi->sf.inter_sf.prune_single_ref ||
+ cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ // Store the best pred_mv_sad across all past frames
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0)
+ x->best_pred_mv_sad[0] =
+ AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]);
+ else
+ // Store the best pred_mv_sad across all future frames
+ x->best_pred_mv_sad[1] =
+ AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]);
+ }
+ }
+
+ if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) {
+ // No second reference on RT ref set, so no need to initialize
+ for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME;
+ ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ // Both references of the pair must be flagged in ref_frame_flags.
+ if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+ // Ref mv list population is not required, when compound references are
+ // pruned.
+ if (prune_ref_frame(cpi, x, ref_frame)) continue;
+
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ }
+ }
+
+ av1_count_overlappable_neighbors(cm, xd);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;  // Assigned by exactly one of the two branches below.
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ // Build the above/left neighbour predictions only when OBMC may be used.
+ if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) {
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf,
+ dst_width1, dst_height1,
+ args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf,
+ dst_width2, dst_height2,
+ args->left_pred_stride);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0],
+ args->left_pred_buf[0], args->left_pred_stride[0]);
+ }
+ }
+
+ init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ x->comp_rd_stats_idx = 0;
+
+ for (int idx = 0; idx < REF_FRAMES; idx++) {
+ args->best_single_sse_in_refs[idx] = INT32_MAX;
+ }
+}
+
+// Resets the single-reference bookkeeping of 'search_state': counters are
+// zeroed, the best RD per reference is set to INT64_MAX with an invalid mode,
+// and every per-direction/mode/reference entry is marked unused (NONE_FRAME,
+// INT64_MAX).
+static AOM_INLINE void init_single_inter_mode_search_state(
+    InterModeSearchState *search_state) {
+  av1_zero(search_state->single_state_cnt);
+  av1_zero(search_state->single_state_modelled_cnt);
+
+  for (int ref = 0; ref < REF_FRAMES; ++ref) {
+    search_state->best_single_rd[ref] = INT64_MAX;
+    search_state->best_single_mode[ref] = PRED_MODE_INVALID;
+  }
+
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (int ref = 0; ref < FWD_REFS; ++ref) {
+        SingleInterModeState *const actual =
+            &search_state->single_state[dir][mode][ref];
+        SingleInterModeState *const modelled =
+            &search_state->single_state_modelled[dir][mode][ref];
+
+        actual->ref_frame = NONE_FRAME;
+        actual->rd = INT64_MAX;
+        modelled->ref_frame = NONE_FRAME;
+        modelled->rd = INT64_MAX;
+        search_state->single_rd_order[dir][mode][ref] = NONE_FRAME;
+      }
+    }
+  }
+}
+
+// Resets 'search_state' before the inter mode search of one block.
+// 'best_rd_so_far' seeds the best RD cost. Per-mode thresholds are derived
+// from cpi->rd.threshes for the block's segment and size, scaled by
+// x->thresh_freq_fact. Compound thresholds, compound modelled/simple RD
+// entries and the single-mode bookkeeping are initialized only when the frame
+// is not restricted to SINGLE_REFERENCE.
+static AOM_INLINE void init_inter_mode_search_state(
+    InterModeSearchState *search_state, const AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const unsigned char segment_id = xd->mi[0]->segment_id;
+  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+  const int compound_allowed =
+      cpi->common.current_frame.reference_mode != SINGLE_REFERENCE;
+
+  init_intra_mode_search_state(&search_state->intra_search_state);
+  av1_invalid_rd_stats(&search_state->best_y_rdcost);
+  av1_zero(search_state->best_mbmode);
+
+  // Best-so-far trackers.
+  search_state->best_rd = best_rd_so_far;
+  search_state->best_skip_rd[0] = INT64_MAX;
+  search_state->best_skip_rd[1] = INT64_MAX;
+  search_state->best_rate_y = INT_MAX;
+  search_state->best_rate_uv = INT_MAX;
+  search_state->best_mode_skippable = 0;
+  search_state->best_skip2 = 0;
+  search_state->best_mode_index = THR_INVALID;
+  search_state->best_intra_rd = INT64_MAX;
+  search_state->best_pred_sse = UINT_MAX;
+  for (int ref_mode = 0; ref_mode < REFERENCE_MODES; ++ref_mode) {
+    search_state->best_pred_rd[ref_mode] = INT64_MAX;
+  }
+
+  // Reference distance bookkeeping.
+  search_state->num_available_refs = 0;
+  memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+  memset(search_state->dist_order_refs, -1,
+         sizeof(search_state->dist_order_refs));
+
+  // Mode thresholds: zero up to and including LAST_NEW_MV_INDEX, scaled RD
+  // thresholds for the remaining single-reference modes.
+  for (int thr = 0; thr <= LAST_NEW_MV_INDEX; ++thr) {
+    search_state->mode_threshold[thr] = 0;
+  }
+  for (int thr = LAST_NEW_MV_INDEX + 1; thr < SINGLE_REF_MODE_END; ++thr) {
+    search_state->mode_threshold[thr] =
+        ((int64_t)rd_threshes[thr] * x->thresh_freq_fact[bsize][thr]) >>
+        RD_THRESH_FAC_FRAC_BITS;
+  }
+  if (compound_allowed) {
+    for (int thr = SINGLE_REF_MODE_END; thr < THR_INTER_MODE_END; ++thr) {
+      search_state->mode_threshold[thr] =
+          ((int64_t)rd_threshes[thr] * x->thresh_freq_fact[bsize][thr]) >>
+          RD_THRESH_FAC_FRAC_BITS;
+    }
+  }
+
+  av1_zero(search_state->single_newmv);
+  av1_zero(search_state->single_newmv_rate);
+  av1_zero(search_state->single_newmv_valid);
+
+  // Modelled/simple RD tables: the single-mode range always, the compound
+  // range only when compound prediction is possible.
+  const int range_begin[2] = { SINGLE_INTER_MODE_START, COMP_INTER_MODE_START };
+  const int range_end[2] = { SINGLE_INTER_MODE_END, COMP_INTER_MODE_END };
+  const int num_ranges = compound_allowed ? 2 : 1;
+  for (int range = 0; range < num_ranges; ++range) {
+    for (int mode = range_begin[range]; mode < range_end[range]; ++mode) {
+      for (int mv_idx = 0; mv_idx < MAX_REF_MV_SEARCH; ++mv_idx) {
+        for (int ref = 0; ref < REF_FRAMES; ++ref) {
+          search_state->modelled_rd[mode][mv_idx][ref] = INT64_MAX;
+          search_state->simple_rd[mode][mv_idx][ref] = INT64_MAX;
+        }
+      }
+    }
+  }
+
+  if (compound_allowed) init_single_inter_mode_search_state(search_state);
+}
+
+// Returns true when 'mode_skip_mask' rules out trying 'this_mode' with the
+// reference pair 'ref_frame': either the mode's bit is set for the first
+// reference, or the (first, second) reference combination is disabled.
+static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+                           const MV_REFERENCE_FRAME *ref_frame,
+                           const PREDICTION_MODE this_mode) {
+  const uint32_t blocked_modes = mode_skip_mask->pred_modes[ref_frame[0]];
+  const bool mode_blocked = (blocked_modes & (1 << this_mode)) != 0;
+
+  return mode_blocked ||
+         mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1];
+}
+
+// Returns 1 when 'curr_mode' with the reference pair 'ref_frames' cannot be
+// used for this block, 0 otherwise.
+// A pair with a second inter reference is rejected when compound references
+// are not allowed for 'bsize', the second reference is not flagged in
+// cpi->ref_frame_flags, the frame is intra-only or single-reference, or the
+// segment has SEG_LVL_REF_FRAME active. A pair of an inter reference and
+// INTRA_FRAME is rejected when inter-intra is not allowed for the block size
+// or the mode.
+static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                      BLOCK_SIZE bsize,
+                                      PREDICTION_MODE curr_mode,
+                                      const MV_REFERENCE_FRAME *ref_frames) {
+  const MV_REFERENCE_FRAME first_ref = ref_frames[0];
+  const MV_REFERENCE_FRAME second_ref = ref_frames[1];
+
+  if (second_ref > INTRA_FRAME) {
+    const AV1_COMMON *const cm = &cpi->common;
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use as in this case there can only be one reference
+    // (last condition below).
+    if (!is_comp_ref_allowed(bsize) ||
+        !(cpi->ref_frame_flags & av1_ref_frame_flag_list[second_ref]) ||
+        frame_is_intra_only(cm) ||
+        cm->current_frame.reference_mode == SINGLE_REFERENCE ||
+        segfeature_active(&cm->seg, x->e_mbd.mi[0]->segment_id,
+                          SEG_LVL_REF_FRAME)) {
+      return 1;
+    }
+  }
+
+  if (first_ref > INTRA_FRAME && second_ref == INTRA_FRAME) {
+    // Mode must be compatible
+    if (!is_interintra_allowed_bsize(bsize) ||
+        !is_interintra_allowed_mode(curr_mode)) {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// ORs together the x->picked_ref_frames_mask entries of every mi unit covered
+// by the current block and returns the result. Positions are taken relative
+// to the superblock by masking with 'mib_size - 1'.
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int in_sb_mask = mib_size - 1;
+  const int row_begin = xd->mi_row & in_sb_mask;
+  const int col_begin = xd->mi_col & in_sb_mask;
+  const int row_end = row_begin + mi_size_high[bsize];
+  const int col_end = col_begin + mi_size_wide[bsize];
+
+  int combined_mask = 0;
+  for (int row = row_begin; row < row_end; ++row) {
+    // NOTE(review): the table is addressed with a fixed row stride of 32
+    // entries -- presumably the largest superblock width in mi units; confirm
+    // against the declaration of picked_ref_frames_mask.
+    const int row_base = row * 32;
+    for (int col = col_begin; col < col_end; ++col) {
+      combined_mask |= x->picked_ref_frames_mask[row_base + col];
+    }
+  }
+  return combined_mask;
+}
+
+// Returns 1 when both entries of 'ref_frames' equal the corresponding
+// reference frames stored in 'mbmi', 0 otherwise.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+                                       const MV_REFERENCE_FRAME *ref_frames) {
+  if (ref_frames[0] != mbmi->ref_frame[0]) return 0;
+  return ref_frames[1] == mbmi->ref_frame[1];
+}
+
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip compound only, but still try single motion modes
+static int inter_mode_search_order_independent_skip(
+ const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+ InterModeSearchState *search_state, int skip_ref_frame_mask,
+ PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+ // The checks below run in a fixed order and the first one that fires
+ // decides the result: 0 = search the mode, 1 = skip it, 2 = search it but
+ // let the caller flag its motion mode search as skipped.
+ if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+ return 1;
+ }
+
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ // Reference pruning is bypassed when the real-time reference set is in use.
+ if (!cpi->sf.rt_sf.use_real_time_ref_set)
+ if (prune_ref_frame(cpi, x, ref_type)) return 1;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+ ref_frame[0] == INTRA_FRAME)
+ return 1;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+ return 1;
+ }
+
+ // Reuse the prediction mode in cache
+ if (x->use_mb_mode_cache) {
+ const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+ const PREDICTION_MODE cached_mode = cached_mi->mode;
+ const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+ const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+ // If the cached mode is intra, then we just need to match the mode.
+ if (is_mode_intra(cached_mode) && mode != cached_mode) {
+ return 1;
+ }
+
+ // If the cached mode is single inter mode, then we match the mode and
+ // reference frame.
+ if (cached_mode_is_single) {
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+ return 1;
+ }
+ } else {
+ // If the cached mode is compound, then we need to consider several cases.
+ const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+ if (mode_is_single) {
+ // If the mode is single, we know the modes can't match. But we might
+ // still want to search it if compound mode depends on the current mode.
+ int skip_motion_mode_only = 0;
+ // Only the reference that carries the NEWMV half of the cached compound
+ // mode is compared against the current single reference.
+ if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+ } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+ } else if (cached_mode == NEW_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+ ref_frame[0] == cached_frame[1]);
+ }
+
+ // Yields 2 when the single mode feeds the cached compound mode, and 1
+ // (skip entirely) otherwise.
+ return 1 + skip_motion_mode_only;
+ } else {
+ // If both modes are compound, then everything must match.
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+ ref_frame[1] != cached_frame[1]) {
+ return 1;
+ }
+ }
+ }
+ }
+
+ const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ // If no valid mode has been found so far in PARTITION_NONE when finding a
+ // valid partition is required, do not skip mode.
+ // Note that this early return also bypasses every pruning check below.
+ if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE &&
+ x->must_find_valid_partition)
+ return 0;
+
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ // Prune NEARMV and NEAR_NEARMV based on q index and neighbor's reference
+ // frames
+ if (sf->inter_sf.prune_nearmv_using_neighbors &&
+ (mode == NEAR_NEARMV || mode == NEARMV)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ if (search_state->best_rd != INT64_MAX && xd->left_available &&
+ xd->up_available) {
+ // Rows are indexed by (speed feature level - 1); columns by the qindex
+ // sub-range computed below. Each entry is the minimum number of
+ // neighbors (0..2) whose reference pair must match to keep the mode.
+ const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+ { 1, 1, 0 },
+ { 2, 1, 0 } };
+ const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+ assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+ qindex_sub_range < 3);
+ const int num_ref_frame_pair_match_thresh =
+ thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+ [qindex_sub_range];
+
+ assert(num_ref_frame_pair_match_thresh <= 2 &&
+ num_ref_frame_pair_match_thresh >= 0);
+ int num_ref_frame_pair_match = 0;
+
+ num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+ num_ref_frame_pair_match +=
+ match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+ // Pruning based on ref frame pair match with neighbors.
+ if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+ }
+ }
+
+ int skip_motion_mode = 0;
+ // The skip_ref_frame_mask is only consulted for partitions other than
+ // PARTITION_NONE.
+ if (mbmi->partition != PARTITION_NONE) {
+ int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+ // Since the compound ref modes depends on the motion estimation result of
+ // two single ref modes (best mv of single ref modes as the start point),
+ // if current single ref mode is marked skip, we need to check if it will
+ // be used in compound ref modes.
+ if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+ // Found a not skipped compound ref mode which contains current
+ // single ref. So this single ref can't be skipped completely
+ // Just skip its motion mode search, still try its simple
+ // transition mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ }
+ }
+ // If we are reusing the prediction from cache, and the current frame is
+ // required by the cache, then we cannot prune it.
+ if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+ skip_ref = 0;
+ // If the cache only needs the current reference type for compound
+ // prediction, then we can skip motion mode search.
+ // This assignment replaces whatever the compound-ref check above set.
+ skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+ x->mb_mode_cache->ref_frame[1] > INTRA_FRAME);
+ }
+ if (skip_ref) return 1;
+ }
+
+ if (ref_frame[0] == INTRA_FRAME) {
+ if (mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ return 1;
+ }
+ }
+
+ // Nothing forced a full skip; report whether motion mode search can be
+ // dropped for this mode.
+ if (skip_motion_mode) return 2;
+
+ return 0;
+}
+
+// Resets |mbmi| to a baseline state for evaluating |curr_mode| with the
+// reference pair |ref_frames|: first ref-mv index, DC chroma mode, no palette,
+// no filter-intra, zero motion vectors, simple translation, and the frame's
+// default interpolation filters.
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode,
+                             const MV_REFERENCE_FRAME *ref_frames,
+                             const AV1_COMMON *cm) {
+  // Mode and references under evaluation.
+  mbmi->mode = curr_mode;
+  mbmi->ref_frame[0] = ref_frames[0];
+  mbmi->ref_frame[1] = ref_frames[1];
+  mbmi->ref_mv_idx = 0;
+
+  // Intra-side tools are switched off.
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+  // Inter-side state starts from zero motion and plain translation.
+  mbmi->mv[1].as_int = 0;
+  mbmi->mv[0].as_int = 0;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+// Records the single-reference result described by |mbmi| in |search_state|.
+// The best (lowest) simple rd and modelled rd across the ref-mv indices of the
+// mode are each inserted into the matching per-direction, per-mode list, which
+// is kept sorted by ascending rd.
+static AOM_INLINE void collect_single_states(MACROBLOCK *x,
+                                             InterModeSearchState *search_state,
+                                             const MB_MODE_INFO *const mbmi) {
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+  const int mode_offset = INTER_OFFSET(this_mode);
+  // Direction 0 covers references up to GOLDEN_FRAME, direction 1 the rest.
+  const int dir = (ref_frame > GOLDEN_FRAME) ? 1 : 0;
+  const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+  // Lowest simple and modelled rd over all ref-mv indices.
+  int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+  int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+  for (int idx = 1; idx < ref_set; ++idx) {
+    const int64_t rd_s = search_state->simple_rd[this_mode][idx][ref_frame];
+    const int64_t rd_m = search_state->modelled_rd[this_mode][idx][ref_frame];
+    if (rd_s < simple_rd) simple_rd = rd_s;
+    if (rd_m < modelled_rd) modelled_rd = rd_m;
+  }
+
+  // Sorted insertion into the simple-rd list.
+  const SingleInterModeState entry_s = { simple_rd, ref_frame, 1 };
+  SingleInterModeState *const list_s =
+      search_state->single_state[dir][mode_offset];
+  int pos = search_state->single_state_cnt[dir][mode_offset];
+  while (pos > 0 && list_s[pos - 1].rd > entry_s.rd) {
+    list_s[pos] = list_s[pos - 1];
+    --pos;
+  }
+  list_s[pos] = entry_s;
+  search_state->single_state_cnt[dir][mode_offset]++;
+
+  // Sorted insertion into the modelled-rd list.
+  const SingleInterModeState entry_m = { modelled_rd, ref_frame, 1 };
+  SingleInterModeState *const list_m =
+      search_state->single_state_modelled[dir][mode_offset];
+  pos = search_state->single_state_modelled_cnt[dir][mode_offset];
+  while (pos > 0 && list_m[pos - 1].rd > entry_m.rd) {
+    list_m[pos] = list_m[pos - 1];
+    --pos;
+  }
+  list_m[pos] = entry_m;
+  search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+// Post-processes the single-reference results gathered in |search_state|:
+// first marks unlikely reference frames as invalid in both the simple-rd and
+// modelled-rd lists, then fills single_rd_order[dir][mode] with the surviving
+// reference frames (simple-rd order first, then modelled-rd order).
+static AOM_INLINE void analyze_single_states(
+ const AV1_COMP *cpi, InterModeSearchState *search_state) {
+ const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result;
+ assert(prune_level >= 1);
+ int i, j, dir, mode;
+
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+ // An entry is pruned when (rd / 8) * prune_factor exceeds the best rd, so
+ // the larger factor used at level >= 2 prunes more entries.
+ const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+ // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+ // reference frames for all the modes (NEARESTMV and NEARMV may not
+ // have same motion vectors). Always keep the best of each mode
+ // because it might form the best possible combination with other mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ // Index 0 (the best entry of each mode) is never invalidated.
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ // Same pruning, applied to the modelled-rd lists.
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ // Pass 1: take valid entries in simple-rd order; the lists are sorted, so
+ // the first INT64_MAX entry ends the usable prefix.
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid) {
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ }
+ if (count >= max_candidates) continue;
+
+ // Pass 2: append references from the modelled-rd list that are not
+ // already present and were not invalidated in the simple-rd list.
+ for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (!state_m[i].valid) continue;
+ const int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+ // Check if existing already
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (match) continue;
+ // Check if this ref_frame is removed in simple rd
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; ++j) {
+ if (ref_frame == state_s[j].ref_frame) {
+ valid = state_s[j].valid;
+ break;
+ }
+ }
+ if (valid) {
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ }
+ }
+ }
+}
+
+// Returns how many leading entries of single_rd_order[dir][mode] are accepted
+// as compound candidates. The count starts at the number of recorded entries
+// and is tightened as prune_comp_search_by_single_result increases.
+static int compound_skip_get_candidates(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const int dir, const PREDICTION_MODE mode) {
+  const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result;
+  const int mode_offset = INTER_OFFSET(mode);
+  const SingleInterModeState *state =
+      search_state->single_state[dir][mode_offset];
+  const SingleInterModeState *state_modelled =
+      search_state->single_state_modelled[dir][mode_offset];
+
+  // Count entries up to the first NONE_FRAME terminator.
+  int max_candidates = 0;
+  while (max_candidates < FWD_REFS &&
+         search_state->single_rd_order[dir][mode_offset][max_candidates] !=
+             NONE_FRAME) {
+    ++max_candidates;
+  }
+
+  int candidates = max_candidates;
+  if (prune_level >= 2) candidates = AOMMIN(2, max_candidates);
+
+  if (prune_level >= 3) {
+    // Simple and modelled rd agree on the best reference frame.
+    const int same_top_ref =
+        state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+        state[0].ref_frame == state_modelled[0].ref_frame;
+    if (same_top_ref || mode == NEARMV || mode == GLOBALMV) candidates = 1;
+  }
+
+  // Limit the number of candidates to 1 in each direction for compound
+  // prediction
+  if (prune_level >= 4) candidates = AOMMIN(1, candidates);
+
+  return candidates;
+}
+
+// Decides whether the compound mode |this_mode| with references
+// (|ref_frame|, |second_ref_frame|) can be skipped based on the results of the
+// corresponding single-reference searches. Returns 1 to skip, 0 to search.
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ // Single-reference modes that make up each half of the compound mode.
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
+ // Step 1: find out whether each half was evaluated as a single mode with
+ // the same reference frame.
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ // Step 2: for NEARESTMV/NEARMV halves, the single-mode result is only
+ // trusted if the single and compound motion vectors agree for every
+ // ref-mv index.
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+ continue;
+ }
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+ &x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
+ if (single_mv.as_int != comp_mv.as_int) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+
+ // Step 3: skip the compound mode if a trusted half uses a reference frame
+ // that is not among the top candidates of its single-mode ordering.
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || !ref_mv_match[i]) continue;
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+
+ return 0;
+}
+
+// Accumulates into |is_ref_match| whether each entry of |ref_frames| equals
+// any reference frame used by the inter block |mbmi|. Index 0 tracks
+// ref_frames[0] and index 1 tracks ref_frames[1]; existing flags are only
+// ever OR-ed, never cleared. Non-inter blocks leave the flags untouched.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+                                   const MV_REFERENCE_FRAME *ref_frames,
+                                   int *const is_ref_match) {
+  if (!is_inter_block(mbmi)) return;
+
+  // Compare against the block's first reference.
+  is_ref_match[0] |= (ref_frames[0] == mbmi->ref_frame[0]);
+  is_ref_match[1] |= (ref_frames[1] == mbmi->ref_frame[0]);
+
+  if (!has_second_ref(mbmi)) return;
+
+  // Compare against the block's second reference.
+  is_ref_match[0] |= (ref_frames[0] == mbmi->ref_frame[1]);
+  is_ref_match[1] |= (ref_frames[1] == mbmi->ref_frame[1]);
+}
+
+// Decides whether an extended compound mode should be pruned from the
+// reference frames of the left and above neighbors. Returns 1 to prune and 0
+// to keep. NEAREST_NEARESTMV, NEAR_NEARMV, NEW_NEWMV and GLOBAL_GLOBALMV are
+// never pruned here.
+static INLINE int compound_skip_using_neighbor_refs(
+    MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+    const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
+  // Exclude non-extended compound modes from pruning
+  switch (this_mode) {
+    case NEAREST_NEARESTMV:
+    case NEAR_NEARMV:
+    case NEW_NEWMV:
+    case GLOBAL_GLOBALMV: return 0;
+    default: break;
+  }
+
+  // At level 3 and above every extended compound mode is pruned.
+  if (prune_ext_comp_using_neighbors >= 3) return 1;
+
+  // is_ref_match[0]: ref_frames[0] seen in a neighbor.
+  // is_ref_match[1]: ref_frames[1] seen in a neighbor.
+  int is_ref_match[2] = { 0 };
+  if (xd->left_available) {
+    match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+  }
+  if (xd->up_available) {
+    match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+  }
+
+  // Prune when fewer references matched than the level requires.
+  const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+  return track_ref_match < prune_ext_comp_using_neighbors;
+}
+
+// Remembers |this_mode| as the best single-reference mode for |ref_frame|
+// when |this_rd| is strictly lower than the best rd recorded so far for that
+// reference frame.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+                                           const PREDICTION_MODE this_mode,
+                                           const MV_REFERENCE_FRAME ref_frame,
+                                           int64_t this_rd) {
+  if (this_rd >= search_state->best_single_rd[ref_frame]) return;
+
+  search_state->best_single_mode[ref_frame] = this_mode;
+  search_state->best_single_rd[ref_frame] = this_rd;
+}
+
+// Decides whether an extended compound mode should be pruned based on the
+// best single mode found for the reference frame that carries its NEWMV half.
+// Returns 1 to prune and 0 to keep.
+static INLINE int skip_compound_using_best_single_mode_ref(
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+    const PREDICTION_MODE *best_single_mode,
+    int prune_comp_using_best_single_mode_ref) {
+  // Exclude non-extended compound modes from pruning
+  switch (this_mode) {
+    case NEAREST_NEARESTMV:
+    case NEAR_NEARMV:
+    case NEW_NEWMV:
+    case GLOBAL_GLOBALMV: return 0;
+    default: break;
+  }
+
+  assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+
+  // Index into |ref_frames| of the reference paired with NEWMV:
+  // 0 when the first half of the compound mode is NEWMV, 1 otherwise.
+  const int newmv_dir = (compound_ref0_mode(this_mode) == NEWMV) ? 0 : 1;
+  const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]];
+
+  // Keep the compound mode when NEWMV won the single-mode search on that
+  // reference, e.g. {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}} is kept when the
+  // best single mode for ALTREF_FRAME is NEWMV.
+  if (single_mode == NEWMV) return 0;
+
+  // At level 1, also keep it when no best single mode is available.
+  if (prune_comp_using_best_single_mode_ref == 1 &&
+      single_mode == MB_MODE_COUNT) {
+    return 0;
+  }
+
+  return 1;
+}
+
+// qsort()-compatible comparator for int64_t values, giving ascending order.
+// Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
+// The arguments are read through const-qualified pointers so the comparator
+// does not cast away the const that qsort() passes in.
+static int compare_int64(const void *a, const void *b) {
+  const int64_t a64 = *(const int64_t *)a;
+  const int64_t b64 = *(const int64_t *)b;
+  // Comparison-based result; subtracting the operands could overflow.
+  return (a64 > b64) - (a64 < b64);
+}
+
+// Commits the mode currently stored in the block's MB_MODE_INFO as the new
+// best candidate: copies the rd statistics into |search_state| and
+// |best_rd_stats_dst|, and saves the per-block skip flags and transform type
+// map into |ctx|.
+static INLINE void update_search_state(
+    InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+    PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+    const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+    THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  // The block-level skip flag only counts for non-intra best modes.
+  const int skip_txfm =
+      mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode);
+
+  // Winner identity and overall statistics.
+  search_state->best_mode_index = new_best_mode;
+  search_state->best_rd = new_best_rd_stats->rdcost;
+  *best_rd_stats_dst = *new_best_rd_stats;
+  search_state->best_mbmode = *mbmi;
+  search_state->best_skip2 = skip_txfm;
+  search_state->best_mode_skippable = new_best_rd_stats->skip_txfm;
+
+  // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and
+  // rate_uv because av1_txfm_search process is replaced by rd estimation.
+  // Therefore, we should avoid updating best_rate_y and best_rate_uv here.
+  // These two values will be updated when av1_txfm_search is called.
+  if (txfm_search_done) {
+    const int skip_flag = new_best_rd_stats->skip_txfm || skip_txfm;
+    search_state->best_rate_y =
+        new_best_rd_stats_y->rate +
+        x->mode_costs.skip_txfm_cost[skip_ctx][skip_flag];
+    search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+  }
+  search_state->best_y_rdcost = *new_best_rd_stats_y;
+
+  // Save the per-4x4 skip flags and transform types of the winner.
+  memcpy(ctx->blk_skip, txfm_info->blk_skip,
+         sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Finds the lowest RD among entries 1..REF_FRAMES-1 of |ref_frame_rd| and
+// stores 110% of it in entry 0 as the cut-off. If no entry holds a valid RD
+// (all are INT64_MAX), entry 0 stays INT64_MAX.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+  assert(ref_frame_rd[0] == INT64_MAX);
+
+  // Sort a copy of the per-reference RDs so the input order is preserved.
+  int64_t sorted_rd[REF_FRAMES - 1];
+  for (int i = 0; i < REF_FRAMES - 1; ++i) sorted_rd[i] = ref_frame_rd[i + 1];
+  qsort(sorted_rd, REF_FRAMES - 1, sizeof(sorted_rd[0]), compare_int64);
+
+  const int64_t best_rd = sorted_rd[0];
+  if (best_rd == INT64_MAX) {
+    ref_frame_rd[0] = INT64_MAX;
+    return;
+  }
+
+  // The cut-off is within 10% of the best.
+  assert(best_rd < INT64_MAX / 200);
+  ref_frame_rd[0] = (110 * best_rd) / 100;
+}
+
+// Returns true when the RD recorded for |frame1| or |frame2| does not exceed
+// the cut-off stored in ref_frame_rd[0].
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES],
+                                        MV_REFERENCE_FRAME frame1,
+                                        MV_REFERENCE_FRAME frame2) {
+  assert(frame2 > 0);
+  const int64_t cutoff = ref_frame_rd[0];
+  if (ref_frame_rd[frame1] <= cutoff) return true;
+  return ref_frame_rd[frame2] <= cutoff;
+}
+
+// Runs motion_mode_rd() on each stored winner candidate that is a
+// single-reference inter mode, records the result through
+// store_winner_mode_stats(), and updates |search_state|, |rd_cost|, |ctx| and
+// |*yrd| whenever a candidate beats the current best rd.
+static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost,
+ HandleInterModeArgs *const args, TileDataEnc *const tile_data,
+ PICK_MODE_CONTEXT *const ctx,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ const motion_mode_best_st_candidate *const best_motion_mode_cands,
+ int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
+ InterModeSearchState *const search_state, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *const inter_modes_info = x->inter_modes_info;
+ const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand;
+
+ for (int cand = 0; cand < num_best_cand; cand++) {
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ av1_init_rd_stats(&rd_stats_y);
+ av1_init_rd_stats(&rd_stats_uv);
+ int rate_mv;
+
+ // Restore the state saved for this candidate. Note that |*mbmi| and
+ // args->skip_motion_mode are overwritten even when the candidate is
+ // skipped just below.
+ rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
+ args->skip_motion_mode =
+ best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode;
+ *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi;
+ rd_stats.rate =
+ best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff;
+
+ // Continue if the best candidate is compound.
+ if (!is_inter_singleref_mode(mbmi->mode)) continue;
+
+ x->txfm_search_info.skip_txfm = 0;
+ struct macroblockd_plane *pd = xd->plane;
+ // Snapshot of the destination buffers, handed to motion_mode_rd().
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Initialize motion mode to simple translation
+ // Calculation of switchable rate depends on it.
+ mbmi->motion_mode = 0;
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ // Local copy of the best skip rd values; only element 0 is written back,
+ // and only when this candidate becomes the new best with tx search done.
+ int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+ search_state->best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+ int64_t ret_value = motion_mode_rd(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args,
+ search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd,
+ do_tx_search, inter_modes_info, 1, &this_yrd);
+
+ // INT64_MAX from motion_mode_rd() is treated as "no usable result".
+ if (ret_value != INT64_MAX) {
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_enum, NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
+ if (rd_stats.rdcost < search_state->best_rd) {
+ *yrd = this_yrd;
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
+ }
+ }
+ }
+}
+
+/*!\cond */
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+ // Output flag: skip_inter_mode() sets it to 1 when the order-independent
+ // check returns 2, and to 0 when that check returns 0.
+ int *skip_motion_mode;
+ // Skip mask forwarded to inter_mode_search_order_independent_skip().
+ mode_skip_mask_t *mode_skip_mask;
+ // Running state of the inter mode search (best rd, thresholds, ...).
+ InterModeSearchState *search_state;
+ // Bit mask of reference types, tested as (1 << ref_type).
+ int skip_ref_frame_mask;
+ // Set to 1 once analyze_single_states() has been run for the first
+ // compound mode.
+ int reach_first_comp_mode;
+ // Multiplier applied to the mode threshold when the best mode is skippable;
+ // the product is shifted right by MODE_THRESH_QBITS.
+ int mode_thresh_mul_fact;
+ // Compared against NUM_SINGLE_REF_MODES before find_top_ref() is run.
+ int num_single_modes_processed;
+ // Set to 1 once find_top_ref() has stored the cut-off in ref_frame_rd[0].
+ int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
+/*!\endcond */
+
+// Decides whether the inter mode at position |midx| of av1_default_mode_order
+// should be skipped. Returns 1 to skip and 0 to evaluate the mode. As side
+// effects it writes *(args->skip_motion_mode), may run
+// analyze_single_states() on the first compound mode, and may run
+// find_top_ref() on |ref_frame_rd|.
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int64_t *ref_frame_rd, int midx,
+ InterModeSFArgs *args, int is_low_temp_var) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ // Intra modes are always reported as skipped by this function.
+ if (ref_frame == INTRA_FRAME) return 1;
+
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE &&
+ comp_pred) {
+ return 1;
+ }
+
+ // This is for real time encoding.
+ if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME &&
+ this_mode != NEARESTMV)
+ return 1;
+
+ // Check if this mode should be skipped because it is incompatible with the
+ // current frame
+ if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+ return 1;
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, x, args->mode_skip_mask, args->search_state,
+ args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+ if (ret == 1) return 1;
+ // The flag is written here, before the remaining checks, so it is updated
+ // even if a later check returns 1.
+ *(args->skip_motion_mode) = (ret == 2);
+
+ // We've reached the first compound prediction mode, get stats from the
+ // single reference predictors to help with pruning.
+ // Disable this pruning logic if interpolation filter search was skipped for
+ // single prediction modes as it can result in aggressive pruning of compound
+ // prediction modes due to the absence of modelled_rd populated by
+ // av1_interpolation_filter_search().
+ // TODO(Remya): Check the impact of the sf
+ // 'prune_comp_search_by_single_result' if compound prediction modes are
+ // enabled in future for REALTIME encode.
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+ args->reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, args->search_state);
+ args->reach_first_comp_mode = 1;
+ }
+
+ // Prune aggressively when best mode is skippable.
+ // With the default factor of (1 << MODE_THRESH_QBITS) the threshold is
+ // used unscaled.
+ int mul_fact = args->search_state->best_mode_skippable
+ ? args->mode_thresh_mul_fact
+ : (1 << MODE_THRESH_QBITS);
+ int64_t mode_threshold =
+ (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+ MODE_THRESH_QBITS;
+
+ if (args->search_state->best_rd < mode_threshold) return 1;
+
+ // Skip this compound mode based on the RD results from the single prediction
+ // modes
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+ // Once all single reference modes have been processed, find_top_ref()
+ // stores a cut-off of 110% of the best single-reference RD in
+ // ref_frame_rd[0]. Only search compound modes that have at least one
+ // reference frame whose RD is within that cut-off.
+ if (!args->prune_cpd_using_sr_stats_ready &&
+ args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+ find_top_ref(ref_frame_rd);
+ args->prune_cpd_using_sr_stats_ready = 1;
+ }
+ if (args->prune_cpd_using_sr_stats_ready &&
+ !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
+ return 1;
+ }
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+ (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
+ if (compound_skip_using_neighbor_refs(
+ xd, this_mode, ref_frames,
+ sf->inter_sf.prune_ext_comp_using_neighbors))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+ if (skip_compound_using_best_single_mode_ref(
+ this_mode, ref_frames, args->search_state->best_single_mode,
+ sf->inter_sf.prune_comp_using_best_single_mode_ref))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ if (skip_nearest_near_mv_using_refmv_weight(
+ x, this_mode, ref_frame_type,
+ args->search_state->best_mbmode.mode)) {
+ // Ensure the mode is pruned only when the current block has obtained a
+ // valid inter mode.
+ assert(is_inter_mode(args->search_state->best_mbmode.mode));
+ return 1;
+ }
+ }
+
+ // Real-time pruning of single-reference GOLDEN_FRAME modes: applies once
+ // more than a quarter of the (capped) sub-GOP has elapsed since the last
+ // golden frame and the current best mode does not use GOLDEN_FRAME.
+ if (sf->rt_sf.prune_inter_modes_with_golden_ref &&
+ ref_frame == GOLDEN_FRAME && !comp_pred) {
+ const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL);
+ if (cpi->rc.frames_since_golden > (subgop_size >> 2) &&
+ args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) {
+ if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+// Updates search_state->best_pred_rd[] with the rd cost of the mode described
+// by |rd_stats|. The cost is evaluated both without and with |compmode_cost|:
+// the former updates the SINGLE_REFERENCE or COMPOUND_REFERENCE slot
+// (selected by |comp_pred|), the latter the REFERENCE_MODE_SELECT slot.
+// When |reference_mode| is REFERENCE_MODE_SELECT, rd_stats->rate is taken to
+// already include |compmode_cost|; otherwise it is taken to exclude it.
+static void record_best_compound(REFERENCE_MODE reference_mode,
+                                 RD_STATS *rd_stats, int comp_pred, int rdmult,
+                                 InterModeSearchState *search_state,
+                                 int compmode_cost) {
+  const int rate_has_compmode_cost = (reference_mode == REFERENCE_MODE_SELECT);
+  const int64_t single_rate = rate_has_compmode_cost
+                                  ? rd_stats->rate - compmode_cost
+                                  : rd_stats->rate;
+  const int64_t hybrid_rate = rate_has_compmode_cost
+                                  ? rd_stats->rate
+                                  : rd_stats->rate + compmode_cost;
+
+  const int64_t single_rd = RDCOST(rdmult, single_rate, rd_stats->dist);
+  const int64_t hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
+
+  // Slot for the fixed (non-select) reference mode this candidate belongs to.
+  const int fixed_slot = comp_pred ? COMPOUND_REFERENCE : SINGLE_REFERENCE;
+  if (single_rd < search_state->best_pred_rd[fixed_slot]) {
+    search_state->best_pred_rd[fixed_slot] = single_rd;
+  }
+  if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT]) {
+    search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+  }
+}
+
+// Does a transform search over a list of the best inter mode candidates.
+// This is called if the original mode search computed an RD estimate
+// for the transform search rather than doing a full search.
+static void tx_search_best_inter_candidates(
+ AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ int64_t best_rd_so_far, BLOCK_SIZE bsize,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col,
+ InterModeSearchState *search_state, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, int64_t *yrd) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_mode_index = THR_INVALID;
+ // Initialize best mode stats for winner mode processing
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
+ inter_modes_info->num =
+ inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+ ? inter_modes_info->num
+ : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+ const int64_t top_est_rd =
+ inter_modes_info->num > 0
+ ? inter_modes_info
+ ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+ : INT64_MAX;
+ *yrd = INT64_MAX;
+ int64_t best_rd_in_this_partition = INT64_MAX;
+ int num_inter_mode_cands = inter_modes_info->num;
+ int newmv_mode_evaled = 0;
+ int max_allowed_cands = INT_MAX;
+ if (cpi->sf.inter_sf.limit_inter_mode_cands) {
+ // The bound on the no. of inter mode candidates, beyond which the
+ // candidates are limited if a newmv mode got evaluated, is set as
+ // max_allowed_cands + 1.
+ const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 };
+ assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4);
+ max_allowed_cands =
+ num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands];
+ }
+
+ int num_mode_thresh = INT_MAX;
+ if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) {
+ // Bound the no. of transform searches per prediction mode beyond a
+ // threshold.
+ const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 };
+ assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3);
+ num_mode_thresh =
+ num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode];
+ }
+
+ int num_tx_cands = 0;
+ int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 };
+ // Iterate over best inter mode candidates and perform tx search
+ for (int j = 0; j < num_inter_mode_cands; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.80 > top_est_rd) break;
+
+ if (num_tx_cands > num_mode_thresh) {
+ if ((prediction_mode != NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) ||
+ (prediction_mode == NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2))
+ continue;
+ }
+
+ txfm_info->skip_txfm = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ bool is_predictor_built = false;
+
+ // Initialize RD stats
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+ int64_t skip_rd = INT64_MAX;
+ const int txfm_rd_gate_level = get_txfm_rd_gate_level(
+ cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT,
+ /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+ skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+ int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0],
+ skip_rd, txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Build the prediction for this mode
+ if (!is_predictor_built) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+ num_tx_cands++;
+ if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1;
+ num_tx_search_modes[prediction_mode - INTER_MODE_START]++;
+ int64_t this_yrd = INT64_MAX;
+ // Do the transform search
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_rate, search_state->best_rd)) {
+ continue;
+ } else {
+ const int y_rate =
+ rd_stats.skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ if (rd_stats.rdcost < best_rd_in_this_partition) {
+ best_rd_in_this_partition = rd_stats.rdcost;
+ *yrd = this_yrd;
+ }
+
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum,
+ NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+
+ if (rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, txfm_search_done);
+ search_state->best_skip_rd[0] = skip_rd;
+ // Limit the total number of modes to be evaluated if the first is valid
+ // and transform skip or compound
+ if (cpi->sf.inter_sf.inter_mode_txfm_breakout) {
+ if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) {
+ // Evaluate more candidates at high quantizers where occurrence of
+ // transform skip is high.
+ const int max_cands_cap[5] = { 2, 3, 5, 7, 9 };
+ const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands =
+ AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num);
+ } else if (!j && has_second_ref(&search_state->best_mbmode)) {
+ const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1;
+ // Evaluate more candidates at low quantizers where occurrence of
+ // single reference mode is high.
+ const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 },
+ { 10, 7, 5, 3 } };
+ const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands = AOMMIN(
+ max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num);
+ }
+ }
+ }
+ // If the number of candidates evaluated exceeds max_allowed_cands, break if
+ // a newmv mode was evaluated already.
+ if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break;
+ }
+}
+
+/* Maximum number of winner simple translation modes kept as candidates,
+ * indexed by the motion_mode_for_winner_cand speed feature level. */
+static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 };
+
+// Records the current mode in the winner candidate list used by the
+// motion_mode_for_winner_cand speed feature. Entries in this list have only
+// been searched with SIMPLE_TRANSLATION; the final list is used to search the
+// other motion modes after the initial RD search. The list is kept ordered by
+// increasing rd cost and never grows beyond max_winner_motion_mode_cand
+// entries. motion_mode_cand is filled in from mbmi, this_rd and
+// skip_motion_mode before being copied into the list.
+static void handle_winner_cand(
+    MB_MODE_INFO *const mbmi,
+    motion_mode_best_st_candidate *best_motion_mode_cands,
+    int max_winner_motion_mode_cand, int64_t this_rd,
+    motion_mode_candidate *motion_mode_cand, int skip_motion_mode) {
+  motion_mode_candidate *const cand_list =
+      best_motion_mode_cands->motion_mode_cand;
+  const int cand_count = best_motion_mode_cands->num_motion_mode_cand;
+
+  // Walk past every stored candidate that is at least as good as this one;
+  // if none is worse, the new candidate goes to the end of the list.
+  int insert_pos = 0;
+  while (insert_pos < cand_count &&
+         this_rd >= cand_list[insert_pos].rd_cost) {
+    ++insert_pos;
+  }
+
+  // Nothing to do when the candidate falls outside the allowed list size.
+  if (insert_pos >= max_winner_motion_mode_cand) return;
+
+  // Shift the worse candidates down by one slot, dropping the last one when
+  // the list is already full.
+  if (cand_count > 0 && insert_pos < max_winner_motion_mode_cand - 1) {
+    const int shift_end = AOMMIN(cand_count, max_winner_motion_mode_cand - 1);
+    memmove(&cand_list[insert_pos + 1], &cand_list[insert_pos],
+            (shift_end - insert_pos) * sizeof(cand_list[0]));
+  }
+
+  motion_mode_cand->mbmi = *mbmi;
+  motion_mode_cand->rd_cost = this_rd;
+  motion_mode_cand->skip_motion_mode = skip_motion_mode;
+  cand_list[insert_pos] = *motion_mode_cand;
+
+  best_motion_mode_cands->num_motion_mode_cand =
+      AOMMIN(max_winner_motion_mode_cand, cand_count + 1);
+}
+
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. This function however does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
+ *
+ * This function will first iterate through the luma mode candidates to find the
+ * best luma intra mode. Once the best luma mode is found, it will then search
+ * for the best chroma mode. Because palette mode is currently not handled
+ * here, a cache of uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * The function returns early, before the final rd cost is computed and
+ * compared against search_state->best_rd, if no luma mode gives a valid result
+ * with an rd cost below yrd_threshold, or if the chroma search does not return
+ * a valid mode.
+ *
+ * \param[in,out] search_state Struct that keeps track of the prediction mode
+ * search state in interframe.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to struct holding all the data for
+ * the current prediction block.
+ * \param[out] rd_cost Stores the best rd_cost among all the
+ * prediction modes searched.
+ * \param[in] bsize Current block size.
+ * \param[in,out] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays
+ * for only the Y plane.
+ * \param[in] sf_args Stores the list of intra mode candidates
+ * to be searched.
+ * \param[in] intra_ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] yrd_threshold The rdcost threshold for luma intra mode to
+ * terminate chroma intra mode search.
+ *
+ * \remark If a new best mode is found, search_state and rd_cost are updated
+ * correspondingly. While x is also modified, it is only used as a temporary
+ * buffer, and the final decisions are stored in search_state.
+ */
+static AOM_INLINE void search_intra_modes_in_interframe(
+ InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+ int64_t yrd_threshold) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ IntraModeSearchState *intra_search_state = &search_state->intra_search_state;
+
+ int is_best_y_mode_intra = 0;
+ RD_STATS best_intra_rd_stats_y;
+ int64_t best_rd_y = INT64_MAX;
+ int best_mode_cost_y = -1;
+ MB_MODE_INFO best_mbmi = *xd->mi[0];
+ THR_MODES best_mode_enum = THR_INVALID;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int num_4x4 = bsize_to_num_blk(bsize);
+
+ // Performs luma search, keeping the best mode with rd below yrd_threshold.
+ int64_t best_model_rd = INT64_MAX;
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+ for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) {
+ if (sf->intra_sf.skip_intra_in_interframe &&
+ search_state->intra_search_state.skip_intra_modes)
+ break;
+ set_y_mode_and_delta_angle(
+ mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra);
+ assert(mbmi->mode < INTRA_MODE_END);
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode))
+ continue;
+
+ const THR_MODES mode_enum =
+ get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME);
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+ continue;
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+ assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+ init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+ x->txfm_search_info.skip_txfm = 0;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state->best_mode_index != THR_INVALID &&
+ search_state->best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(
+ this_mode, search_state->intra_search_state.best_intra_mode))
+ continue;
+ }
+ }
+
+ RD_STATS intra_rd_stats_y;
+ int mode_cost_y;
+ int64_t intra_rd_y = INT64_MAX;
+ const int is_luma_result_valid = av1_handle_intra_y_mode(
+ intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y,
+ &best_model_rd, top_intra_model_rd);
+ if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
+ is_best_y_mode_intra = 1;
+ if (intra_rd_y < best_rd_y) {
+ best_intra_rd_stats_y = intra_rd_stats_y;
+ best_mode_cost_y = mode_cost_y;
+ best_rd_y = intra_rd_y;
+ best_mbmi = *mbmi;
+ best_mode_enum = mode_enum;
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4);
+ }
+ }
+ }
+
+ if (!is_best_y_mode_intra) {
+ return;
+ }
+
+ assert(best_rd_y < INT64_MAX);
+
+ // Restores the best luma mode together with its blk_skip and tx_type_map.
+ *mbmi = best_mbmi;
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4);
+
+ // Performs chroma search
+ RD_STATS intra_rd_stats, intra_rd_stats_uv;
+ av1_init_rd_stats(&intra_rd_stats);
+ av1_init_rd_stats(&intra_rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe(
+ intra_search_state, cpi, x, bsize, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd);
+
+ if (!intra_uv_mode_valid) {
+ return;
+ }
+ }
+
+ // Merge the luma and chroma rd stats
+ assert(best_mode_cost_y >= 0);
+ intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+ // in the tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const PREDICTION_MODE mode = mbmi->mode;
+ if (num_planes > 1 && xd->is_chroma_ref) {
+ const int uv_mode_cost =
+ mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+ intra_rd_stats.rate +=
+ intra_rd_stats_uv.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+ }
+
+ // Intra block is always coded as non-skip
+ intra_rd_stats.skip_txfm = 0;
+ intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist;
+ // Add in the cost of the no skip flag.
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ // Calculate the final RD estimate for this mode.
+ const int64_t this_rd =
+ RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist);
+ // Keep record of best intra rd
+ if (this_rd < search_state->best_intra_rd) {
+ search_state->best_intra_rd = this_rd;
+ intra_search_state->best_intra_mode = mode;
+ }
+
+ for (int i = 0; i < REFERENCE_MODES; ++i) {
+ search_state->best_pred_rd[i] =
+ AOMMIN(search_state->best_pred_rd[i], this_rd);
+ }
+
+ intra_rd_stats.rdcost = this_rd;
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y,
+ &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (intra_rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv,
+ best_mode_enum, x, txfm_search_done);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Derives inter_cost and intra_cost for the block at (mi_row, mi_col) from the
+// superblock's TPL stats. The two values are used as ML features in intra mode
+// pruning. Each output is the average of the TPL costs over the TPL units
+// covered by the block. The outputs are left untouched when TPL data is not
+// available for the full superblock or the block is smaller than one TPL unit.
+// NOTE(review): the sums are accumulated onto the values already stored in
+// *inter_cost / *intra_cost, so the caller's initial values take part in the
+// average -- confirm this is intended.
+static AOM_INLINE void calculate_cost_from_tpl_data(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+  const int tpl_units_per_sb = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                               (block_size_high[sb_size] / tpl_bsize_1d);
+  SuperBlockEnc *sb_enc = &x->sb_enc;
+
+  // Only consider full SB.
+  if (sb_enc->tpl_data_count != tpl_units_per_sb) return;
+
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+  const int tpl_mi_w = mi_size_wide[tpl_bsize];
+  const int tpl_mi_h = mi_size_high[tpl_bsize];
+  const int units_w = mi_size_wide[bsize] / tpl_mi_w;
+  const int units_h = mi_size_high[bsize] / tpl_mi_h;
+
+  // The block must cover at least one TPL unit in each direction.
+  if (units_w < 1 || units_h < 1) return;
+
+  // Position of the block's first TPL unit inside the superblock's TPL arrays.
+  const int stride = sb_enc->tpl_stride;
+  const int row_in_sb = mi_row % mi_size_high[sb_size];
+  const int col_in_sb = mi_col % mi_size_wide[sb_size];
+  const int first_unit = row_in_sb / tpl_mi_h * stride + col_in_sb / tpl_mi_w;
+
+  for (int r = 0; r < units_h; ++r) {
+    const int row_base = first_unit + r * stride;
+    for (int c = 0; c < units_w; ++c) {
+      *inter_cost += sb_enc->tpl_inter_cost[row_base + c];
+      *intra_cost += sb_enc->tpl_intra_cost[row_base + c];
+    }
+  }
+
+  const int unit_count = units_w * units_h;
+  *inter_cost /= unit_count;
+  *intra_cost /= unit_count;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Decides whether the intra mode search can be skipped for the current block
+// and, if so, sets search_state->intra_search_state.skip_intra_modes to 1.
+// When the speed feature skip_intra_in_interframe > 0 and both inter_cost and
+// intra_cost are non-negative, an ML model is used to prune the intra mode
+// search; cheaper checks on the best inter mode found so far run first.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+    AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+    InterModeSearchState *search_state, const SPEED_FEATURES *const sf,
+    int64_t inter_cost, int64_t intra_cost) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Prune when the best mode is a single reference mode with a small motion
+  // vector on a block larger than max_intra_bsize with enough source variance.
+  const int is_best_comp = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME;
+  if (sf->rt_sf.prune_intra_mode_based_on_mv_range && !is_best_comp &&
+      bsize > sf->part_sf.max_intra_bsize) {
+    const int mv_limit = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range;
+    const MV mv = search_state->best_mbmode.mv[0].as_mv;
+    const int is_small_mv = abs(mv.row) < mv_limit && abs(mv.col) < mv_limit;
+    if (is_small_mv && x->source_variance > 128) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+
+  // The remaining checks need the speed feature to be enabled and the source
+  // variance to exceed a minimum.
+  const int skip_level = sf->intra_sf.skip_intra_in_interframe;
+  const unsigned int min_src_var = 1;
+  if (!skip_level || x->source_variance <= min_src_var) return;
+
+  // Prune intra search based on best inter mode being transform skip.
+  if (skip_level >= 2 && search_state->best_mbmode.skip_txfm) {
+    const int max_qindex = (skip_level >= 3) ? MAXQ : 200;
+    const int no_newmv_within_q =
+        !have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+        x->qindex <= max_qindex;
+    const int costs_missing =
+        skip_level >= 4 && (inter_cost < 0 || intra_cost < 0);
+    if (no_newmv_within_q || costs_missing) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+
+  // Use ML model to prune intra search; it needs both costs to be valid.
+  if (inter_cost < 0 || intra_cost < 0) return;
+
+  const NN_CONFIG *nn_config = &av1_intrap_hd_nn_config;
+  if (AOMMIN(cm->width, cm->height) <= 480) nn_config = &av1_intrap_nn_config;
+
+  const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+
+  float nn_features[6];
+  nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+  nn_features[1] = (float)mi_size_wide_log2[bsize];
+  nn_features[2] = (float)mi_size_high_log2[bsize];
+  nn_features[3] = (float)intra_cost;
+  nn_features[4] = (float)inter_cost;
+  // Integer ratio of the quantizer steps, converted to float afterwards.
+  nn_features[5] = (float)(ac_q_max / ac_q);
+
+  float scores[2] = { 0.0f };
+  av1_nn_predict(nn_features, nn_config, 1, scores);
+
+  // For two parameters, the max prob returned from av1_nn_softmax equals
+  // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the
+  // calling of av1_nn_softmax.
+  const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+  assert(skip_level <= 5);
+  if (scores[1] > scores[0] + thresh[skip_level - 1]) {
+    search_state->intra_search_state.skip_intra_modes = 1;
+  }
+}
+
+// Returns true when the interpolation filter search should be skipped for the
+// current mode, based on the encoding mode (cpi->oxcf.mode) and the speed
+// features. Encoding modes other than GOOD and REALTIME never skip it.
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+                                                 int is_single_pred) {
+  const MODE mode = cpi->oxcf.mode;
+
+  if (mode == GOOD) {
+    // Skip interpolation filter search for single prediction modes.
+    return cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred;
+  }
+
+  if (mode != REALTIME) return false;
+
+  // REALTIME: only skip when the frame uses single reference prediction and
+  // one of the two speed features asks for it.
+  const int sf_requests_skip = cpi->sf.interp_sf.skip_interp_filter_search ||
+                               cpi->sf.winner_mode_sf.winner_mode_ifs;
+  return cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+         sf_requests_skip;
+}
+
+// Returns the low temporal variance flag for the current block, as reported by
+// av1_get_force_skip_low_temp_var_small_sb() for 64x64 superblocks and by
+// av1_get_force_skip_low_temp_var() otherwise. Returns 0 unless variance based
+// partitioning and both the short_circuit_low_temp_var and
+// prune_inter_modes_using_temp_var speed features are enabled.
+static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
+                                         const MACROBLOCK *x,
+                                         BLOCK_SIZE bsize) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int is_enabled =
+      sf->part_sf.partition_search_type == VAR_BASED_PARTITION &&
+      sf->rt_sf.short_circuit_low_temp_var &&
+      sf->rt_sf.prune_inter_modes_using_temp_var;
+  if (!is_enabled) return 0;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int mi_row = x->e_mbd.mi_row;
+  const int mi_col = x->e_mbd.mi_col;
+
+  if (cm->seq_params->sb_size == BLOCK_64X64) {
+    return av1_get_force_skip_low_temp_var_small_sb(
+        &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+  }
+  return av1_get_force_skip_low_temp_var(&x->part_search_info.variance_low[0],
+                                         mi_row, mi_col, bsize);
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = { { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL },
+ { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 },
+ NULL,
+ NULL,
+ NULL,
+ search_state.modelled_rd,
+ INT_MAX,
+ INT_MAX,
+ search_state.simple_rd,
+ 0,
+ false,
+ interintra_modes,
+ { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+ { { 0, 0 } },
+ { 0 },
+ 0,
+ 0,
+ -1,
+ -1,
+ -1,
+ { 0 },
+ { 0 },
+ UINT_MAX };
+ // Currently, is_low_temp_var is used in real time encoding.
+ const int is_low_temp_var = get_block_temp_var(cpi, x, bsize);
+
+ for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
+ // Indicates the appropriate number of simple translation winner modes for
+ // exhaustive motion mode evaluation
+ const int max_winner_motion_mode_cand =
+ num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand];
+ assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
+ motion_mode_candidate motion_mode_cand;
+ motion_mode_best_st_candidate best_motion_mode_cands;
+ // Initializing the number of motion mode candidates to zero.
+ best_motion_mode_cands.num_motion_mode_cand = 0;
+ for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+ best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ for (i = 0; i < REF_FRAMES; ++i) {
+ x->warp_sample_info[i].num = -1;
+ }
+
+ // Ref frames that are selected by square partition blocks.
+ int picked_ref_frames_mask = 0;
+ if (sf->inter_sf.prune_ref_frame_for_rect_partitions &&
+ mbmi->partition != PARTITION_NONE) {
+ // prune_ref_frame_for_rect_partitions = 1 implies prune only extended
+ // partition blocks. prune_ref_frame_for_rect_partitions >=2
+ // implies prune for vert, horiz and extended partition blocks.
+ if ((mbmi->partition != PARTITION_VERT &&
+ mbmi->partition != PARTITION_HORZ) ||
+ sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+ picked_ref_frames_mask =
+ fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+ // Skip ref frames that were never selected by square blocks.
+ const int skip_ref_frame_mask =
+ picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+ mode_skip_mask_t mode_skip_mask;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ // init params, set frame modes, speed features
+ set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+ skip_ref_frame_mask, ref_costs_single,
+ ref_costs_comp, yv12_mb);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+
+ int64_t best_est_rd = INT64_MAX;
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ // If do_tx_search is 0, only estimated RD should be computed.
+ // If do_tx_search is 1, all modes have TX search performed.
+ const int do_tx_search =
+ !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+ (sf->inter_sf.inter_mode_rd_model_estimation == 2 &&
+ num_pels_log2_lookup[bsize] > 8));
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info->num = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+
+ // The best RD found for the reference frame, among single reference modes.
+ // Note that the 0-th element will contain a cut-off that is later used
+ // to determine if we should skip a compound mode.
+ int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX };
+
+ // Prepared stats used later to check if we could skip intra mode eval.
+ int64_t inter_cost = -1;
+ int64_t intra_cost = -1;
+ // Need to tweak the threshold for hdres speed 0 & 1.
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Obtain the relevant tpl stats for pruning inter modes
+ PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+ if (sf->inter_sf.prune_inter_modes_based_on_tpl) {
+ // x->tpl_keep_ref_frame[id] = 1 => no pruning in
+ // prune_ref_by_selective_ref_frame()
+ // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in
+ // prune_ref_by_selective_ref_frame()
+ // Populating valid_refs[idx] = 1 ensures that
+ // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+ // pruned ref frame.
+ int valid_refs[INTER_REFS_PER_FRAME];
+ for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
+ valid_refs[frame - 1] =
+ x->tpl_keep_ref_frame[frame] ||
+ !prune_ref_by_selective_ref_frame(
+ cpi, x, refs, cm->cur_frame->ref_display_order_hint);
+ }
+ av1_zero(inter_cost_info_from_tpl);
+ get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs,
+ &inter_cost_info_from_tpl);
+ }
+
+ const int do_pruning =
+ (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
+ if (do_pruning && sf->intra_sf.skip_intra_in_interframe &&
+ cpi->oxcf.algo_cfg.enable_tpl_model)
+ calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost,
+ &intra_cost);
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Initialize best mode stats for winner mode processing.
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ sf->winner_mode_sf.multi_winner_mode_type, 0);
+
+ int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
+ if (sf->inter_sf.prune_inter_modes_if_skippable) {
+ // Higher multiplication factor values for lower quantizers.
+ mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex];
+ }
+
+ // Initialize arguments for mode loop speed features
+ InterModeSFArgs sf_args = { &args.skip_motion_mode,
+ &mode_skip_mask,
+ &search_state,
+ skip_ref_frame_mask,
+ 0,
+ mode_thresh_mul_fact,
+ 0,
+ 0 };
+ int64_t best_inter_yrd = INT64_MAX;
+
+ // This is the main loop of this function. It loops over all possible inter
+ // modes and calls handle_inter_mode() to compute the RD for each.
+ // Here midx is just an iterator index that should not be used by itself
+ // except to keep track of the number of modes searched. It should be used
+ // with av1_default_mode_order to get the enum that defines the mode, which
+ // can be used with av1_mode_defs to get the prediction mode and the ref
+ // frames.
+ // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings
+ // good speedup for real time case. If we decide to use compound mode in real
+ // time, maybe we can modify av1_default_mode_order table.
+ THR_MODES mode_start = THR_INTER_MODE_START;
+ THR_MODES mode_end = THR_INTER_MODE_END;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ if (current_frame->reference_mode == SINGLE_REFERENCE) {
+ mode_start = SINGLE_REF_MODE_START;
+ mode_end = SINGLE_REF_MODE_END;
+ }
+
+ for (THR_MODES midx = mode_start; midx < mode_end; ++midx) {
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int is_single_pred =
+ ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME;
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ init_mbmi(mbmi, this_mode, ref_frames, cm);
+
+ txfm_info->skip_txfm = 0;
+ sf_args.num_single_modes_processed += is_single_pred;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, skip_inter_mode_time);
+#endif
+ // Apply speed features to decide if this inter mode can be skipped
+ const int is_skip_inter_mode = skip_inter_mode(
+ cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, skip_inter_mode_time);
+#endif
+ if (is_skip_inter_mode) continue;
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+
+ const int64_t ref_best_rd = search_state.best_rd;
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+ ? compmode_cost
+ : 0;
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+ args.best_pred_sse = search_state.best_pred_sse;
+ args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred);
+
+ int64_t skip_rd[2] = { search_state.best_skip_rd[0],
+ search_state.best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
+ int64_t this_rd = handle_inter_mode(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args,
+ ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search,
+ inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl,
+ &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
+#endif
+ if (current_frame->reference_mode != SINGLE_REFERENCE) {
+ if (!args.skip_ifs &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 &&
+ is_inter_singleref_mode(this_mode))
+ update_best_single_mode(&search_state, this_mode, ref_frame, this_rd);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ if (mbmi->skip_txfm) {
+ rd_stats_y.rate = 0;
+ rd_stats_uv.rate = 0;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred &&
+ this_rd < ref_frame_rd[ref_frame]) {
+ ref_frame_rd[ref_frame] = this_rd;
+ }
+
+ // Did this mode help, i.e., is it the new best mode
+ if (this_rd < search_state.best_rd) {
+ assert(IMPLIES(comp_pred,
+ cm->current_frame.reference_mode != SINGLE_REFERENCE));
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ best_inter_yrd = this_yrd;
+ update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+ // skip_rd[0] is the best total rd for a skip mode so far.
+ // skip_rd[1] is the best total rd for a skip mode so far in luma.
+ // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated.
+ // When do_tx_search = 0, skip_rd[1] is updated.
+ search_state.best_skip_rd[1] = skip_rd[1];
+ }
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+ // Add this mode to motion mode candidate list for motion mode search
+ // if using motion_mode_for_winner_cand speed feature
+ handle_winner_cand(mbmi, &best_motion_mode_cands,
+ max_winner_motion_mode_cand, this_rd,
+ &motion_mode_cand, args.skip_motion_mode);
+ }
+
+ /* keep record of best compound/single-only prediction */
+ record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+ x->rdmult, &search_state, compmode_cost);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+ // For the single ref winner candidates, evaluate other motion modes (non
+ // simple translation).
+ evaluate_motion_mode_for_winner_candidates(
+ cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
+ &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
+ &search_state, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, do_tx_search_time);
+#endif
+ if (do_tx_search != 1) {
+ // A full tx search has not yet been done, do tx search for
+ // top mode candidates
+ tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+ yv12_mb, mi_row, mi_col, &search_state,
+ rd_cost, ctx, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, do_tx_search_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_intra_mode_time);
+#endif
+ // Gate intra mode evaluation if best of inter is skip except when source
+ // variance is extremely low and also based on max intra bsize.
+ skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost,
+ intra_cost);
+
+ const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+ search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+ &sf_args, intra_ref_frame_cost,
+ best_inter_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_intra_mode_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, refine_winner_mode_tx_time);
+#endif
+ int winner_mode_count =
+ sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
+ // In effect only when fast tx search speed features are enabled.
+ refine_winner_mode_tx(
+ cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
+ &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+ search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, refine_winner_mode_tx_time);
+#endif
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ // Only try palette mode when the best mode so far is an intra mode.
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) &&
+ !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX;
+ RD_STATS this_rd_cost;
+ int this_skippable = 0;
+ if (try_palette) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_search_palette_mode_time);
+#endif
+ this_skippable = av1_search_palette_mode(
+ &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+ ctx, &this_rd_cost, search_state.best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_search_palette_mode_time);
+#endif
+ if (this_rd_cost.rdcost < search_state.best_rd) {
+ search_state.best_mode_index = THR_DC;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = this_rd_cost.rate;
+ rd_cost->dist = this_rd_cost.dist;
+ rd_cost->rdcost = this_rd_cost.rdcost;
+ search_state.best_rd = rd_cost->rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = this_skippable;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+ is_comp_ref_allowed(bsize)) {
+ const struct segmentation *const seg = &cm->seg;
+ unsigned char segment_id = mbmi->segment_id;
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb);
+ }
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ if (search_state.best_mode_index == THR_INVALID ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.y_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.x_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(
+ cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize,
+ search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES);
+ }
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ txfm_info->skip_txfm |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ int_interpfilters filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ }
+ }
+
+ txfm_info->skip_txfm |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index != THR_INVALID);
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_mode_skippable);
+#else
+ store_coding_context(x, ctx, search_state.best_mode_skippable);
+#endif // CONFIG_INTERNAL_STATS
+
+ if (mbmi->palette_mode_info.palette_size[1] > 0) {
+ assert(try_palette);
+ av1_restore_uv_color_map(cpi, x);
+ }
+}
+
+/* Mode decision for a block whose segment has SEG_LVL_SKIP active (this is
+ * asserted below). No mode search is performed: the block is set up as a
+ * single-reference GLOBALMV block with skip_txfm = 1, and only a rate is
+ * accumulated (the distortion term is the constant 0).
+ *
+ * On return, rd_cost holds the computed rate/dist/rdcost, or rate = INT_MAX
+ * and rdcost = INT64_MAX when the computed cost is not below best_rd_so_far.
+ * In the successful case the result is also saved via store_coding_context().
+ *
+ * cpi            - encoder instance; read for speed features and cm.
+ * tile_data      - unused here (explicitly voided).
+ * x              - macroblock state; x->e_mbd.mi[0] is overwritten.
+ * mi_row, mi_col - block position, forwarded to gm_get_motion_vector().
+ * rd_cost        - output RD stats.
+ * bsize          - block size.
+ * ctx            - coding context passed to store_coding_context().
+ * best_rd_so_far - RD threshold the result must beat.
+ */
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ // Only single-reference prediction is considered (ref_frame[1] is set to
+ // NONE_FRAME below); comp_pred just selects the comp_inter_cost entry.
+ const int comp_pred = 0;
+ int i;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ // NOTE(review): mi_row and mi_col are also passed to gm_get_motion_vector()
+ // further down, so these two casts look redundant - confirm before removing.
+ (void)mi_row;
+ (void)mi_col;
+ (void)tile_data;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ // Reset the per-reference prediction statistics kept on x.
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+
+ // Mark the output as invalid until a real cost has been computed.
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+ // Fixed mode setup: no palette, no filter-intra, GLOBALMV with simple
+ // translation.
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mode = GLOBALMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = UV_DC_PRED;
+ // The reference comes from the segment data when SEG_LVL_REF_FRAME is
+ // active, otherwise LAST_FRAME is used.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+ mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ else
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ // The motion vector is derived from the global motion parameters of the
+ // chosen reference.
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ features->allow_high_precision_mv, bsize, mi_col,
+ mi_row, features->cur_frame_force_integer_mv)
+ .as_int;
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->txfm_search_info.skip_txfm = 1;
+
+ mbmi->ref_mv_idx = 0;
+
+ // NOTE(review): motion_mode was already set to SIMPLE_TRANSLATION above;
+ // this second assignment appears to be a duplicate.
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ av1_count_overlappable_neighbors(cm, xd);
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ // Only the resulting sample count is kept; pts/pts_inref are discarded
+ // when this scope ends.
+ mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ set_default_interp_filters(mbmi, interp_filter);
+
+ // Interpolation filter choice: a non-SWITCHABLE frame-level filter is used
+ // as is. Otherwise EIGHTTAP_REGULAR is the default and, when
+ // av1_is_interp_needed() says so, each switchable filter is tried and the
+ // one with the lowest signalling rate wins.
+ if (interp_filter != SWITCHABLE) {
+ best_filter = interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR;
+ if (av1_is_interp_needed(xd)) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(i);
+ rs = av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+ if (rs < best_rs) {
+ best_rs = rs;
+ best_filter = mbmi->interp_filters.as_filters.y_filter;
+ }
+ }
+ }
+ }
+ // Set the appropriate filter
+ mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+ rate2 += av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+
+ // The single/compound flag only costs bits when the frame allows both.
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += comp_inter_cost[comp_pred];
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ // NOTE(review): the LAST_FRAME cost is added even when ref_frame[0] was
+ // taken from the segment data - presumably estimate_ref_frame_costs() makes
+ // the two equivalent in that case; confirm.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ // Not better than what the caller already has: report an invalid result and
+ // skip the threshold update and context store.
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter == mbmi->interp_filters.as_filters.y_filter));
+
+ if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
+ cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
+ THR_GLOBALMV, THR_INTER_MODE_START,
+ THR_INTER_MODE_END, THR_DC, MAX_MODES);
+ }
+
+ // store_coding_context() takes an extra mode-index argument when
+ // CONFIG_INTERNAL_STATS is enabled.
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, THR_GLOBALMV, 0);
+#else
+ store_coding_context(x, ctx, 0);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+/*!\cond */
+// Context handed (as void *) to the calc_target_weighted_pred_above/left
+// callbacks.
+struct calc_target_weighted_pred_ctxt {
+ // Supplies the wsrc and mask accumulators the callbacks write into.
+ const OBMCBuffer *obmc_buffer;
+ // Neighbour prediction samples (above or left); reinterpreted through
+ // CONVERT_TO_SHORTPTR() by the callbacks when is_cur_buf_hbd(xd) is set.
+ const uint8_t *tmp;
+ // Distance, in samples, between consecutive rows of tmp.
+ int tmp_stride;
+ // Number of rows (above) or columns (left) that are blended; also the
+ // argument passed to av1_get_obmc_mask().
+ int overlap;
+};
+/*!\endcond */
+
+static INLINE void calc_target_weighted_pred_above(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+  // Per-neighbour callback for the above row: for each of the `overlap` rows
+  // it stores (MAX_ALPHA - m) * above_pred into wsrc and m into mask, where m
+  // is the vertical OBMC mask weight of that row.
+  // Arguments of the shared callback signature that are not needed here.
+  (void)rel_mi_row;
+  (void)dir;
+  (void)nb_mi;
+  (void)num_planes;
+
+  struct calc_target_weighted_pred_ctxt *const args =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int buf_stride = xd->width << MI_SIZE_LOG2;
+  const int num_rows = args->overlap;
+  const uint8_t *const weights = av1_get_obmc_mask(num_rows);
+
+  // Horizontal position of this neighbour inside the current block.
+  const int col_offset = rel_mi_col * MI_SIZE;
+  const int num_cols = op_mi_size * MI_SIZE;
+  int32_t *const wsrc_base = args->obmc_buffer->wsrc + col_offset;
+  int32_t *const mask_base = args->obmc_buffer->mask + col_offset;
+  const uint8_t *const pred8 = args->tmp + col_offset;
+
+  if (is_cur_buf_hbd(xd)) {
+    const uint16_t *const pred16 = CONVERT_TO_SHORTPTR(pred8);
+
+    for (int r = 0; r < num_rows; ++r) {
+      int32_t *const wsrc_row = wsrc_base + r * buf_stride;
+      int32_t *const mask_row = mask_base + r * buf_stride;
+      const uint16_t *const pred_row = pred16 + r * args->tmp_stride;
+      const uint8_t m0 = weights[r];
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int c = 0; c < num_cols; ++c) {
+        wsrc_row[c] = m1 * pred_row[c];
+        mask_row[c] = m0;
+      }
+    }
+  } else {
+    for (int r = 0; r < num_rows; ++r) {
+      int32_t *const wsrc_row = wsrc_base + r * buf_stride;
+      int32_t *const mask_row = mask_base + r * buf_stride;
+      const uint8_t *const pred_row = pred8 + r * args->tmp_stride;
+      const uint8_t m0 = weights[r];
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int c = 0; c < num_cols; ++c) {
+        wsrc_row[c] = m1 * pred_row[c];
+        mask_row[c] = m0;
+      }
+    }
+  }
+}
+
+static INLINE void calc_target_weighted_pred_left(
+    MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+    int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+  // Per-neighbour callback for the left column: folds the horizontal OBMC
+  // mask weight m into the values already held in wsrc/mask and adds the
+  // left prediction weighted by (MAX_ALPHA - m).
+  // Arguments of the shared callback signature that are not needed here.
+  (void)rel_mi_col;
+  (void)dir;
+  (void)nb_mi;
+  (void)num_planes;
+
+  struct calc_target_weighted_pred_ctxt *const args =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int buf_stride = xd->width << MI_SIZE_LOG2;
+  const int num_cols = args->overlap;
+  const uint8_t *const weights = av1_get_obmc_mask(num_cols);
+
+  // Vertical position of this neighbour inside the current block.
+  const int first_row = rel_mi_row * MI_SIZE;
+  const int num_rows = op_mi_size * MI_SIZE;
+  int32_t *const wsrc_base = args->obmc_buffer->wsrc + (first_row * buf_stride);
+  int32_t *const mask_base = args->obmc_buffer->mask + (first_row * buf_stride);
+  const uint8_t *const pred8 = args->tmp + (first_row * args->tmp_stride);
+
+  if (is_cur_buf_hbd(xd)) {
+    const uint16_t *const pred16 = CONVERT_TO_SHORTPTR(pred8);
+
+    for (int r = 0; r < num_rows; ++r) {
+      int32_t *const wsrc_row = wsrc_base + r * buf_stride;
+      int32_t *const mask_row = mask_base + r * buf_stride;
+      const uint16_t *const pred_row = pred16 + r * args->tmp_stride;
+      for (int c = 0; c < num_cols; ++c) {
+        const uint8_t m0 = weights[c];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        const int kept = (wsrc_row[c] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+        const int added = (pred_row[c] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        wsrc_row[c] = kept + added;
+        mask_row[c] = (mask_row[c] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+    }
+  } else {
+    for (int r = 0; r < num_rows; ++r) {
+      int32_t *const wsrc_row = wsrc_base + r * buf_stride;
+      int32_t *const mask_row = mask_base + r * buf_stride;
+      const uint8_t *const pred_row = pred8 + r * args->tmp_stride;
+      for (int c = 0; c < num_cols; ++c) {
+        const uint8_t m0 = weights[c];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        const int kept = (wsrc_row[c] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+        const int added = (pred_row[c] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        wsrc_row[c] = kept + added;
+        mask_row[c] = (mask_row[c] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+    }
+  }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+// Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+// Mh(x) * Cv(y) * Pabove(x,y) -
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+// error(x, y) =
+// (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+  const int blk_w = xd->width << MI_SIZE_LOG2;
+  const int blk_h = xd->height << MI_SIZE_LOG2;
+  const int num_px = blk_w * blk_h;
+  const OBMCBuffer *const bufs = &x->obmc_buffer;
+  int32_t *const mask = bufs->mask;
+  int32_t *const wsrc = bufs->wsrc;
+
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+  // The luma plane is expected to be at full resolution.
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  // Initial state: nothing accumulated in wsrc, full weight in mask.
+  av1_zero_array(wsrc, num_px);
+  for (int i = 0; i < num_px; ++i) mask[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+  // Pass 1: blend in the predictors of the above neighbours.
+  if (xd->up_available) {
+    const int overlap =
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt above_ctxt = {
+      .obmc_buffer = bufs,
+      .tmp = above,
+      .tmp_stride = above_stride,
+      .overlap = overlap
+    };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                  calc_target_weighted_pred_above, &above_ctxt);
+  }
+
+  // Bring both buffers to the MAX_ALPHA ** 2 scale before the left pass.
+  for (int i = 0; i < num_px; ++i) {
+    wsrc[i] *= AOM_BLEND_A64_MAX_ALPHA;
+    mask[i] *= AOM_BLEND_A64_MAX_ALPHA;
+  }
+
+  // Pass 2: blend in the predictors of the left neighbours.
+  if (xd->left_available) {
+    const int overlap =
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt left_ctxt = {
+      .obmc_buffer = bufs,
+      .tmp = left,
+      .tmp_stride = left_stride,
+      .overlap = overlap
+    };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
+                                 calc_target_weighted_pred_left, &left_ctxt);
+  }
+
+  // Final step: wsrc = scaled source minus the accumulated neighbour terms.
+  const int src_stride = x->plane[0].src.stride;
+  if (is_hbd) {
+    const uint16_t *const src16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (int r = 0; r < blk_h; ++r) {
+      int32_t *const wsrc_row = wsrc + r * blk_w;
+      const uint16_t *const src_row = src16 + r * src_stride;
+      for (int c = 0; c < blk_w; ++c) {
+        wsrc_row[c] = src_row[c] * src_scale - wsrc_row[c];
+      }
+    }
+  } else {
+    const uint8_t *const src8 = x->plane[0].src.buf;
+
+    for (int r = 0; r < blk_h; ++r) {
+      int32_t *const wsrc_row = wsrc + r * blk_w;
+      const uint8_t *const src_row = src8 + r * src_stride;
+      for (int c = 0; c < blk_w; ++c) {
+        wsrc_row[c] = src_row[c] * src_scale - wsrc_row[c];
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..efb797e5b5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock.
+ * \param[in] rd_cost Struct to keep track of the RD information.
+ * \param[in] bsize Current block size.
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ * \param[in] best_rd_so_far Best RD seen for this block so far
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This function will loop over a subset of intra modes and select the best one
+ * based on calculated modelled RD cost. Only 4 intra modes are checked as
+ * specified in \c intra_mode_list. When calculating RD cost, the Hadamard
+ * transform of the residual is used to calculate rate. Estimation of RD cost is
+ * performed in \c av1_estimate_block_intra, which is called from this function.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+  // Computes mi_col * denom / SCALE_NUMERATOR, with half of SCALE_NUMERATOR
+  // added to the product before the integer division.
+  const int scaled = mi_col * denom;
+  const int half_unit = SCALE_NUMERATOR / 2;
+  return (scaled + half_unit) / SCALE_NUMERATOR;
+}
+
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
+  // Signed difference a - b; both inputs must be non-negative (debug builds
+  // assert this).
+  assert(a >= 0 && b >= 0);
+  const int dist = a - b;
+  return dist;
+}
+
+// Returns the number of mode-info allocation units in one superblock: the
+// superblock width in mi units is divided (rounding up) by the width of
+// mi_alloc_bsize, and the result is squared because superblocks are square.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+  // The squaring below is only valid for square superblocks.
+  assert(mi_size_wide[cm->seq_params->sb_size] ==
+         mi_size_high[cm->seq_params->sb_size]);
+
+  const int alloc_unit_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+  const int sb_width_mi = mi_size_wide[cm->seq_params->sb_size];
+  const int units_1d = (sb_width_mi + alloc_unit_1d - 1) / alloc_unit_1d;
+
+  return units_1d * units_1d;
+}
+
+// Returns 1 when ref_frame[0] or ref_frame[1] matches an entry of
+// ref_frame_list (NONE_FRAME entries are ignored) and that entry's display
+// order hint minus frame_display_order_hint is negative; returns 0 otherwise.
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+                            const unsigned int *const ref_display_order_hint,
+                            const unsigned int frame_display_order_hint,
+                            const int *ref_frame_list) {
+  for (int idx = 0; idx < 2; idx++) {
+    const int candidate = ref_frame_list[idx];
+
+    // Slot disabled by the caller.
+    if (candidate == NONE_FRAME) continue;
+
+    // The mode under test does not use this candidate.
+    if (ref_frame[0] != candidate && ref_frame[1] != candidate) continue;
+
+    const int dist = av1_encoder_get_relative_dist(
+        ref_display_order_hint[candidate - LAST_FRAME],
+        frame_display_order_hint);
+    if (dist < 0) return 1;
+  }
+  return 0;
+}
+
+static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame,
+                                         int8_t closest_past_ref,
+                                         int8_t closest_future_ref) {
+  // True only when the pair contains both closest_past_ref and
+  // closest_future_ref, in either order.
+  if (ref_frame[0] != closest_past_ref && ref_frame[1] != closest_past_ref) {
+    return 0;
+  }
+  return (ref_frame[0] == closest_future_ref) ||
+         (ref_frame[1] == closest_future_ref);
+}
+
+static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame,
+                                       const MACROBLOCK *const x) {
+  // Both best SAD values (indices 0 and 1 of best_pred_mv_sad) must be below
+  // INT_MAX; otherwise the answer is always 0.
+  if (!(x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX)) {
+    return 0;
+  }
+
+  // One of the two references has to match best_pred_mv_sad[0] ...
+  if (x->pred_mv_sad[ref_frame[0]] != x->best_pred_mv_sad[0] &&
+      x->pred_mv_sad[ref_frame[1]] != x->best_pred_mv_sad[0]) {
+    return 0;
+  }
+
+  // ... and one of them has to match best_pred_mv_sad[1].
+  return (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) ||
+         (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]);
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const MV_REFERENCE_FRAME *const ref_frame,
+    const unsigned int *const ref_display_order_hint) {
+  // Returns 1 when the reference pair should be pruned, 0 otherwise. x may be
+  // NULL, in which case none of the per-block exemptions are applied.
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (!sf->inter_sf.selective_ref_frame) return 0;
+
+  const int is_compound = ref_frame[1] > INTRA_FRAME;
+
+  // Stage 1: LAST3/LAST2 are tested against GOLDEN's display order hint.
+  if (sf->inter_sf.selective_ref_frame >= 2 ||
+      (sf->inter_sf.selective_ref_frame == 1 && is_compound)) {
+    int candidates[2] = { LAST3_FRAME, LAST2_FRAME };
+
+    if (x != NULL) {
+      // A candidate is exempt when tpl asks to keep it or when its pred_mv
+      // SAD equals best_pred_mv_sad[0].
+      const int keep_last3 =
+          x->tpl_keep_ref_frame[LAST3_FRAME] ||
+          x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0];
+      const int keep_last2 =
+          x->tpl_keep_ref_frame[LAST2_FRAME] ||
+          x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0];
+      if (keep_last3) candidates[0] = NONE_FRAME;
+      if (keep_last2) candidates[1] = NONE_FRAME;
+    }
+
+    const unsigned int golden_hint =
+        ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME];
+    if (prune_ref(ref_frame, ref_display_order_hint, golden_hint, candidates))
+      return 1;
+  }
+
+  // Stage 2: ALTREF2/BWDREF are tested against LAST's display order hint.
+  if (sf->inter_sf.selective_ref_frame >= 3) {
+    int candidates[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+    if (x != NULL) {
+      // Same exemption rule as in stage 1.
+      const int keep_altref2 =
+          x->tpl_keep_ref_frame[ALTREF2_FRAME] ||
+          x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0];
+      const int keep_bwdref =
+          x->tpl_keep_ref_frame[BWDREF_FRAME] ||
+          x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0];
+      if (keep_altref2) candidates[0] = NONE_FRAME;
+      if (keep_bwdref) candidates[1] = NONE_FRAME;
+    }
+
+    const unsigned int last_hint =
+        ref_display_order_hint[LAST_FRAME - LAST_FRAME];
+    if (prune_ref(ref_frame, ref_display_order_hint, last_hint, candidates))
+      return 1;
+  }
+
+  // Stage 3: compound-only pruning, which needs per-block data.
+  if (x == NULL || !sf->inter_sf.prune_comp_ref_frames || !is_compound) {
+    return 0;
+  }
+
+  // Pairs made of the nearest past and nearest future reference are kept.
+  if (has_closest_ref_frames(ref_frame,
+                             cpi->ref_frame_dist_info.nearest_past_ref,
+                             cpi->ref_frame_dist_info.nearest_future_ref) != 0) {
+    return 0;
+  }
+
+  // Level 2 and above: prune every pair that is not the closest one.
+  if (sf->inter_sf.prune_comp_ref_frames >= 2) return 1;
+
+  // Level 1: prune only pairs that lack the minimum pred_mv_sad.
+  if (sf->inter_sf.prune_comp_ref_frames == 1 &&
+      has_best_pred_mv_sad(ref_frame, x) == 0) {
+    return 1;
+  }
+
+  return 0;
+}
+
+// Copies the entries of MB_MODE_INFO_EXT selected by ref_frame_type (MV
+// stack, weights, mode context, MV count) plus all global MVs into the
+// per-frame structure MB_MODE_INFO_EXT_FRAME.
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame(
+    MB_MODE_INFO_EXT_FRAME *mbmi_ext_best,
+    const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) {
+  // sizeof does not evaluate its operand, so the index used here only selects
+  // the row type; the result is the byte size of one row.
+  const size_t stack_row_bytes = sizeof(mbmi_ext->ref_mv_stack[ref_frame_type]);
+  const size_t weight_row_bytes = sizeof(mbmi_ext->weight[ref_frame_type]);
+
+  // Scalar fields.
+  mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type];
+  mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+
+  // Array fields.
+  memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type],
+         stack_row_bytes);
+  memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type],
+         weight_row_bytes);
+  memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs,
+         sizeof(mbmi_ext->global_mvs));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/rdopt_data_defs.h b/third_party/aom/av1/encoder/rdopt_data_defs.h
new file mode 100644
index 0000000000..ca7ef810f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_data_defs.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Maps an intra prediction mode to its THR_MODES entry. It is read by
+// get_prediction_mode_idx() as intra_to_mode_idx[mode - INTRA_MODE_START];
+// the trailing comment on each row names the mode that selects it.
+static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = {
+ THR_DC, // DC_PRED,
+ THR_V_PRED, // V_PRED,
+ THR_H_PRED, // H_PRED,
+ THR_D45_PRED, // D45_PRED,
+ THR_D135_PRED, // D135_PRED,
+ THR_D113_PRED, // D113_PRED,
+ THR_D157_PRED, // D157_PRED,
+ THR_D203_PRED, // D203_PRED,
+ THR_D67_PRED, // D67_PRED,
+ THR_SMOOTH, // SMOOTH_PRED,
+ THR_SMOOTH_V, // SMOOTH_V_PRED,
+ THR_SMOOTH_H, // SMOOTH_H_PRED,
+ THR_PAETH, // PAETH_PRED,
+};
+
+/* clang-format off */
+// Maps a single-reference inter mode and its reference frame to a THR_MODES
+// entry. It is read by get_prediction_mode_idx() as
+// single_inter_to_mode_idx[mode - SINGLE_INTER_MODE_START][ref_frame].
+// Column 0 is THR_INVALID in every row.
+// NOTE(review): judging by the THR_* suffixes, the columns follow the order
+// INTRA, LAST, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2, ALTREF - confirm against
+// the MV_REFERENCE_FRAME declaration.
+static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+ [REF_FRAMES] = {
+ // NEARESTMV,
+ { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
+ THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
+ // NEARMV,
+ { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3,
+ THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
+ // GLOBALMV,
+ { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
+ THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
+ // NEWMV,
+ { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3,
+ THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
+};
+/* clang-format on */
+
+/* clang-format off */
+// Maps a compound inter mode and its ordered pair of reference frames to a
+// THR_MODES entry. It is read by get_prediction_mode_idx() as
+// comp_inter_to_mode_idx[mode - COMP_INTER_MODE_START][ref_frame]
+// [second_ref_frame].
+// Each mode owns one REF_FRAMES x REF_FRAMES plane, introduced by a comment
+// naming the mode. Pairs with no THR_MODES entry hold THR_INVALID; rows 0, 6
+// and 7 of every plane are entirely THR_INVALID.
+// NOTE(review): judging by the THR_COMP_* suffixes, rows and columns follow
+// the order INTRA, LAST, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2, ALTREF -
+// confirm against the MV_REFERENCE_FRAME declaration.
+static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+ [REF_FRAMES] = {
+ // NEAREST_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAREST_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
+ THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
+ THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARGB,
+ THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // GLOBAL_GLOBALMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
+ THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
+ THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B,
+ THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B,
+ THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALGB,
+ THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+};
+/* clang-format on */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
diff --git a/third_party/aom/av1/encoder/rdopt_utils.h b/third_party/aom/av1/encoder/rdopt_utils.h
new file mode 100644
index 0000000000..b6bc4927e3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_utils.h
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+// Highest gating level accepted by check_txfm_eval(); its per-level lookup
+// tables hold MAX_TX_RD_GATE_LEVEL + 1 entries.
+#define MAX_TX_RD_GATE_LEVEL 5
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+// One row of av1_mode_defs[]: a prediction mode together with the pair of
+// reference frames it uses. The rows of av1_mode_defs[] put INTRA_FRAME in
+// ref_frame[0] for intra modes and NONE_FRAME in ref_frame[1] for intra and
+// single-reference modes.
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames.
+// Layout: single-reference inter modes first, then compound modes grouped by
+// reference pair, then intra modes.
+// NOTE(review): rows are positional, so their order presumably has to mirror
+// the THR_MODES enum declaration - confirm before reordering or inserting.
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+ { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ // TODO(zoeliu): May need to reconsider the order on the modes to check
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+ { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+// Number of winner modes allowed for different values of the speed feature
+// multi_winner_mode_type. The table has MULTI_WINNER_MODE_LEVELS entries; the
+// trailing comment on each entry names the level it belongs to.
+static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = {
+ 1, // MULTI_WINNER_MODE_OFF
+ 2, // MULTI_WINNER_MODE_FAST
+ 3 // MULTI_WINNER_MODE_DEFAULT
+};
+
+// Points the destination buffer and stride of the first `num_planes` planes
+// of `xd` at the values recorded in `dst`.
+static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
+                                       const int num_planes) {
+  int plane = 0;
+  while (plane < num_planes) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->dst.buf = dst.plane[plane];
+    pd->dst.stride = dst.stride[plane];
+    ++plane;
+  }
+}
+
+// Exchanges the two entries of `dst_bufs` and then makes the new first entry
+// the active destination of `xd` for `num_planes` planes.
+static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd,
+                                    const BUFFER_SET *dst_bufs[2],
+                                    int num_planes) {
+  const BUFFER_SET *const now_active = dst_bufs[1];
+  dst_bufs[1] = dst_bufs[0];
+  dst_bufs[0] = now_active;
+  restore_dst_buf(xd, *now_active, num_planes);
+}
+
+/* clang-format on */
+// Calculate rd threshold based on ref best rd and relevant scaling factors.
+// Returns (ref_best_rd / div_factor) * mul_factor, saturated to INT64_MAX when
+// the product would not fit in an int64_t.
+//   ref_best_rd - reference RD cost to scale.
+//   mul_factor  - numerator of the scale; 0 yields a threshold of 0.
+//   div_factor  - denominator of the scale; 0 disables scaling and returns
+//                 ref_best_rd unchanged.
+// Both factors are expected to be non-negative.
+static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd,
+                                                     int mul_factor,
+                                                     int div_factor) {
+  // A zero divisor means "no scaling requested".
+  if (div_factor == 0) return ref_best_rd;
+  // Multiplying by zero cannot overflow, and INT64_MAX / 0 must be avoided.
+  if (mul_factor == 0) return 0;
+
+  const int64_t quotient = ref_best_rd / div_factor;
+  // For positive factors, "quotient < INT64_MAX / mul_factor" is the same
+  // test as "ref_best_rd < div_factor * (INT64_MAX / mul_factor)", but it never
+  // forms the right-hand product, which overflows a signed 64-bit integer
+  // whenever div_factor > mul_factor.
+  if (quotient < INT64_MAX / mul_factor) return quotient * mul_factor;
+  return INT64_MAX;
+}
+
+// Translates a prediction mode plus its reference frame(s) into the matching
+// THR_MODES value using the intra / single-inter / compound lookup tables.
+// Intra modes expect (INTRA_FRAME, NONE_FRAME); inter modes expect reference
+// frames in (INTRA_FRAME, ALTREF_FRAME]. Any combination that matches none of
+// the three classes asserts in debug builds and returns THR_INVALID.
+static AOM_INLINE THR_MODES
+get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame,
+                        MV_REFERENCE_FRAME second_ref_frame) {
+  const int is_intra = this_mode < INTRA_MODE_END;
+  const int is_single_inter = this_mode >= SINGLE_INTER_MODE_START &&
+                              this_mode < SINGLE_INTER_MODE_END;
+  const int is_compound = this_mode >= COMP_INTER_MODE_START &&
+                          this_mode < COMP_INTER_MODE_END &&
+                          second_ref_frame != NONE_FRAME;
+
+  if (is_intra) {
+    assert(ref_frame == INTRA_FRAME);
+    assert(second_ref_frame == NONE_FRAME);
+    return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+  } else if (is_single_inter) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    const int mode_row = this_mode - SINGLE_INTER_MODE_START;
+    return single_inter_to_mode_idx[mode_row][ref_frame];
+  } else if (is_compound) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    assert((second_ref_frame > INTRA_FRAME) &&
+           (second_ref_frame <= ALTREF_FRAME));
+    const int mode_plane = this_mode - COMP_INTER_MODE_START;
+    return comp_inter_to_mode_idx[mode_plane][ref_frame][second_ref_frame];
+  }
+
+  assert(0);
+  return THR_INVALID;
+}
+
+// Returns -1 for the block sizes that have a 4-sample side (4x4, 4x8, 8x4,
+// 4x16, 16x4) and 1 for every other block size.
+static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+  const int has_4_sample_side = (bsize == BLOCK_4X4) || (bsize == BLOCK_4X8) ||
+                                (bsize == BLOCK_8X4) ||
+                                (bsize == BLOCK_4X16) || (bsize == BLOCK_16X4);
+  return has_4_sample_side ? -1 : 1;
+}
+
+// Reports the size of a transform block and how much of it is visible, i.e.
+// not cropped by the bottom/right edge as described by xd->mb_to_bottom_edge
+// and xd->mb_to_right_edge.
+//   width / height                 - optional (may be NULL); receive the full
+//                                    transform block dimensions.
+//   visible_width / visible_height - required; receive the dimensions clamped
+//                                    to [0, full size].
+static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+                                          BLOCK_SIZE plane_bsize, int blk_row,
+                                          int blk_col, BLOCK_SIZE tx_bsize,
+                                          int *width, int *height,
+                                          int *visible_width,
+                                          int *visible_height) {
+  assert(tx_bsize <= plane_bsize);
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int full_h = block_size_high[tx_bsize];
+  const int full_w = block_size_wide[tx_bsize];
+
+  // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+  // than the MI size
+  int shown_h = full_h;
+  if (xd->mb_to_bottom_edge < 0) {
+    // Rows of the plane block that lie above the bottom edge.
+    const int rows_inside =
+        (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+        block_size_high[plane_bsize];
+    shown_h = clamp(rows_inside - (blk_row << MI_SIZE_LOG2), 0, full_h);
+  }
+  *visible_height = shown_h;
+  if (height) *height = full_h;
+
+  int shown_w = full_w;
+  if (xd->mb_to_right_edge < 0) {
+    // Columns of the plane block that lie left of the right edge.
+    const int cols_inside =
+        (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+        block_size_wide[plane_bsize];
+    shown_w = clamp(cols_inside - (blk_col << MI_SIZE_LOG2), 0, full_w);
+  }
+  *visible_width = shown_w;
+  if (width) *width = full_w;
+}
+
+// Returns 2^(num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2).
+// NOTE(review): going by the table name this is the block's pixel count
+// divided by the area of one MI unit - confirm against the table definition.
+static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+  const int log2_blk_count = num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2;
+  return 1 << log2_blk_count;
+}
+
+// Gates the transform search for a candidate mode: compares the candidate's
+// skip RD cost against a scaled copy of the best skip RD cost and returns 1
+// when the transform search should be evaluated, 0 when it can be skipped
+// (skip_rd exceeds the derived threshold).
+// x - supplies qindex and source_variance used to scale the threshold
+// bsize - block size, used to index num_pels_log2_lookup
+// best_skip_rd - reference cost; INT64_MAX is used as the threshold as-is,
+// so the function then always returns 1
+// skip_rd - cost of the candidate being gated
+// level - gating level, asserted to be <= MAX_TX_RD_GATE_LEVEL
+// is_luma_only - non-zero selects the luma_mul[] multipliers and disables the
+// qindex-based scaling applied for levels <= 2
+// NOTE(review): entry 0 of scale[] and luma_mul[] is INT_MAX, so using it in
+// the int multiplications below would overflow; presumably callers never pass
+// level 0 (or a negative level, which the assert does not reject) - confirm.
+static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_skip_rd, int64_t skip_rd,
+ int level, int is_luma_only) {
+ int eval_txfm = 1;
+ // Derive aggressiveness factor for gating the transform search
+ // Lower value indicates more aggressiveness. Be more conservative (high
+ // value) for (i) low quantizers (ii) regions where prediction is poor
+ const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 };
+ const int qslope = 2 * (!is_luma_only);
+ const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0,
+ 80, 100, 140 };
+ int aggr_factor = 4;
+ assert(level <= MAX_TX_RD_GATE_LEVEL);
+ const int pred_qindex_thresh = level_to_qindex_map[level];
+ if (!is_luma_only && level <= 2) {
+ aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope,
+ QINDEX_BITS));
+ }
+ // Poor prediction (best_skip_rd large relative to the source variance scaled
+ // by block size) at or above the level's qindex threshold: scale up by the
+ // per-level factor.
+ if ((best_skip_rd >
+ (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) &&
+ (x->qindex >= pred_qindex_thresh))
+ aggr_factor *= scale[level];
+ // For level setting 1, be more conservative for non-luma-only case even when
+ // prediction is good.
+ else if ((level <= 1) && !is_luma_only)
+ aggr_factor = (aggr_factor >> 2) * 6;
+
+ // Be more conservative for luma only cases (called from compound type rd)
+ // since best_skip_rd is computed after and skip_rd is computed (with 8-bit
+ // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before
+ // interpolation filter search
+ const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = {
+ INT_MAX, 32, 29, 17, 17, 17
+ };
+ int mul_factor = is_luma_only ? luma_mul[level] : 16;
+ // Threshold = best_skip_rd * aggr_factor * mul_factor / 64, unless
+ // best_skip_rd is the INT64_MAX sentinel.
+ int64_t rd_thresh =
+ (best_skip_rd == INT64_MAX)
+ ? best_skip_rd
+ : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6);
+ if (skip_rd > rd_thresh) eval_txfm = 0;
+ return eval_txfm;
+}
+
+// Picks the TX_MODE implied by a transform size search method.
+// Returns ONLY_4X4 when cm->features.coded_lossless is set, TX_MODE_LARGEST
+// for USE_LARGESTALL, and TX_MODE_SELECT for USE_FULL_RD / USE_FAST_RD (any
+// other method trips the assert in debug builds).
+// Declared INLINE like the other helpers in this header so that translation
+// units which include the header without calling it do not get an
+// unused-function warning.
+static INLINE TX_MODE select_tx_mode(
+    const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+  if (cm->features.coded_lossless) return ONLY_4X4;
+  if (tx_size_search_method == USE_LARGESTALL) return TX_MODE_LARGEST;
+
+  assert(tx_size_search_method == USE_FULL_RD ||
+         tx_size_search_method == USE_FAST_RD);
+  return TX_MODE_SELECT;
+}
+
+// Checks the conditions to disable winner mode processing.
+// Returns 1 when winner mode processing should be bypassed for this block and
+// 0 otherwise, depending on sf->winner_mode_sf.prune_winner_mode_eval_level:
+//   1    : bypass blocks whose source variance is below a threshold that
+//          starts at 64 for qindex 0 and shrinks linearly as qindex grows.
+//   2    : bypass non-NEWMV modes whose transform turned out to be skip
+//          (actual_txfm_skip).
+//   3    : like level 2, but for qindex <= 127 the RD-based skip decision
+//          (use_txfm_skip) counts as well.
+//   >= 4 : bypass when either skip flag is set, except at qindex <= 70 when
+//          sf->rd_sf.perform_coeff_opt >= 5.
+// Any other level never bypasses.
+static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x,
+                                                const SPEED_FEATURES *sf,
+                                                int use_txfm_skip,
+                                                int actual_txfm_skip,
+                                                PREDICTION_MODE best_mode) {
+  const int level = sf->winner_mode_sf.prune_winner_mode_eval_level;
+
+  switch (level) {
+    case 1: {
+      // Low source variance: the pruning gets less aggressive as qindex rises.
+      const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1);
+      return x->source_variance < src_var_thresh;
+    }
+    case 2:
+      // Transform skip decided by eob alone; NEWMV modes are never bypassed.
+      return !have_newmv_in_inter_mode(best_mode) && actual_txfm_skip;
+    case 3: {
+      // High quantizers trust only the eob-based skip; low quantizers accept
+      // either the eob-based or the RD-based skip decision.
+      const int is_txfm_skip = (x->qindex > 127)
+                                   ? actual_txfm_skip
+                                   : (actual_txfm_skip || use_txfm_skip);
+      return !have_newmv_in_inter_mode(best_mode) && is_txfm_skip;
+    }
+    default: break;
+  }
+
+  if (level >= 4) {
+    // Keep winner mode evaluation at low quantizers if the normal mode's
+    // transform search was too aggressive.
+    if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0;
+    return use_txfm_skip || actual_txfm_skip;
+  }
+
+  return 0;
+}
+
+// Checks the conditions to enable winner mode processing.
+// Returns 0 when bypass_winner_mode_processing() asks for a bypass. Otherwise
+// returns 1 if any of the following holds, and 0 if none does:
+//   - the block used a fast transform type search (inter or intra flavour)
+//     that the encoder configuration does not already force;
+//   - winner mode coefficient optimization is enabled and the segment's
+//     trellis setting is neither NO_TRELLIS_OPT nor FINAL_PASS_TRELLIS_OPT;
+//   - winner mode transform size search is enabled.
+static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi,
+                                                    const MACROBLOCK *const x,
+                                                    MB_MODE_INFO *const mbmi,
+                                                    int actual_txfm_skip) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const PREDICTION_MODE best_mode = mbmi->mode;
+
+  if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip,
+                                    best_mode)) {
+    return 0;
+  }
+
+  // TODO(any): Move block independent condition checks to frame level
+  int fast_tx_type_search_used;
+  if (is_inter_block(mbmi)) {
+    fast_tx_type_search_used =
+        is_inter_mode(best_mode) &&
+        (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) &&
+        !cpi->oxcf.txfm_cfg.use_inter_dct_only;
+  } else {
+    fast_tx_type_search_used =
+        sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+        !cpi->oxcf.txfm_cfg.use_intra_default_tx_only &&
+        !cpi->oxcf.txfm_cfg.use_intra_dct_only;
+  }
+  if (fast_tx_type_search_used) return 1;
+
+  // Check speed feature related to winner mode processing
+  const int coeff_opt_wants_winner_mode =
+      sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT;
+
+  return coeff_opt_wants_winner_mode ||
+         sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch;
+}
+
+// Populates txfm_params->tx_size_search_method from the winner mode tables and
+// derives txfm_params->tx_mode_search_type from it via select_tx_mode().
+// The DEFAULT_EVAL entry is used unless enable_winner_mode_for_tx_size_srch is
+// set, in which case is_winner_mode chooses between the WINNER_MODE_EVAL and
+// MODE_EVAL entries.
+static INLINE void set_tx_size_search_method(
+    const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+    TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch,
+    int is_winner_mode) {
+  int eval_idx = DEFAULT_EVAL;
+  if (enable_winner_mode_for_tx_size_srch) {
+    eval_idx = is_winner_mode ? WINNER_MODE_EVAL : MODE_EVAL;
+  }
+
+  txfm_params->tx_size_search_method =
+      winner_mode_params->tx_size_search_methods[eval_idx];
+  txfm_params->tx_mode_search_type =
+      select_tx_mode(cm, txfm_params->tx_size_search_method);
+}
+
+// Populates txfm_params->prune_2d_txfm_mode.
+// With winner_mode_tx_type_pruning == 0 the speed feature default is used.
+// Otherwise the value comes from a 4x2 table indexed by
+// [winner_mode_tx_type_pruning - 1][is_winner_mode], so the pruning level must
+// be in 1..4 and is_winner_mode must be 0 or 1.
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf,
+                                     TxfmSearchParams *txfm_params,
+                                     int winner_mode_tx_type_pruning,
+                                     int is_winner_mode) {
+  if (winner_mode_tx_type_pruning) {
+    // Column 0: mode evaluation stage, column 1: winner mode stage.
+    const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 },
+                                   { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+                                   { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 },
+                                   { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } };
+    txfm_params->prune_2d_txfm_mode =
+        prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
+  } else {
+    txfm_params->prune_2d_txfm_mode =
+        sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
+  }
+}
+
+// Populates the transform domain distortion settings of txfm_params
+// (use_transform_domain_distortion and tx_domain_dist_threshold).
+// When txfm_params->use_qm_dist_metric is set, transform domain distortion is
+// forced on with a threshold of 0. Otherwise both values are read from
+// winner_mode_params at the DEFAULT_EVAL entry, or, if
+// enable_winner_mode_for_tx_domain_dist is set, at the WINNER_MODE_EVAL /
+// MODE_EVAL entry selected by is_winner_mode.
+static INLINE void set_tx_domain_dist_params(
+    const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
+    int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+  if (txfm_params->use_qm_dist_metric) {
+    // QM-weighted PSNR is computed in transform space, so we need to forcibly
+    // enable the use of tx domain distortion.
+    txfm_params->use_transform_domain_distortion = 1;
+    txfm_params->tx_domain_dist_threshold = 0;
+    return;
+  }
+
+  int eval_idx = DEFAULT_EVAL;
+  if (enable_winner_mode_for_tx_domain_dist) {
+    eval_idx = is_winner_mode ? WINNER_MODE_EVAL : MODE_EVAL;
+  }
+
+  txfm_params->use_transform_domain_distortion =
+      winner_mode_params->use_transform_domain_distortion[eval_idx];
+  txfm_params->tx_domain_dist_threshold =
+      winner_mode_params->tx_domain_dist_threshold[eval_idx];
+}
+
+// This function sets mode parameters for different mode evaluation stages
+// (DEFAULT_EVAL, MODE_EVAL, WINNER_MODE_EVAL). It fills x->txfm_search_params
+// from cpi's speed features and winner mode parameters, and resets the
+// macroblock rd record whenever the requested stage differs from the one
+// stored by the previous call.
+static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi,
+ MACROBLOCK *x,
+ MODE_EVAL_TYPE mode_eval_type) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+
+ // Must be written before the set_tx_domain_dist_params() calls below, which
+ // read this flag.
+ txfm_params->use_qm_dist_metric =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR;
+
+ switch (mode_eval_type) {
+ case DEFAULT_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[DEFAULT_EVAL];
+ // Set default transform domain distortion type
+ set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
+
+ // Get default threshold for R-D optimization of coefficients
+ get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds,
+ txfm_params, 0, 0);
+
+ // Set default transform size search method
+ set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
+ // Set default transform type prune
+ set_tx_type_prune(sf, txfm_params, 0, 0);
+ break;
+ case MODE_EVAL:
+ // Only this stage takes the fast tx type search settings from the speed
+ // features / encoder config; the other two stages reset them to
+ // 0 / INT_MAX.
+ txfm_params->use_default_intra_tx_type =
+ (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
+ cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
+ txfm_params->default_inter_tx_type_prob_thresh =
+ cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[MODE_EVAL];
+ // Set transform domain distortion type for mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
+
+ // Get threshold for R-D optimization of coefficients during mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
+ // Set the transform size search method for mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
+ // Set transform type prune for mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 0);
+ break;
+ case WINNER_MODE_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
+
+ // Set transform domain distortion type for winner mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
+
+ // Get threshold for R-D optimization of coefficients for winner mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
+ // Set the transform size search method for winner mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+ // Set default transform type prune mode for winner mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 1);
+ break;
+ // Any other value is a programming error.
+ default: assert(0);
+ }
+
+ // Rd record collected at a specific mode evaluation stage can not be used
+ // across other evaluation stages as the transform parameters are different.
+ // Hence, reset mb rd record whenever mode evaluation stage type changes.
+ if (txfm_params->mode_eval_type != mode_eval_type)
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+
+ // Remember the stage so the next call can detect a change.
+ txfm_params->mode_eval_type = mode_eval_type;
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+//
+// Returns:
+//  - CFL_DISALLOWED for monochrome sequences,
+//  - CFL_ALLOWED for blocks that are not chroma references,
+//  - is_cfl_allowed(xd) for chroma reference blocks.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  // Only the monochrome check belongs here. Non-chroma-reference blocks must
+  // fall through to the branch below rather than being rejected.
+  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // For chroma reference blocks, we should store data in the encoder iff we're
+  // allowed to try out CfL.
+  return is_cfl_allowed(xd);
+}
+
+// Resets the chroma prediction state of `mbmi`: the palette size at index 1
+// is cleared and the uv mode is set to UV_DC_PRED.
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  mbmi->uv_mode = UV_DC_PRED;
+}
+
+// Store best mode stats for winner mode processing.
+//
+// Inserts the candidate described by (mbmi, this_rd, mode_index) into
+// x->winner_mode_stats, which is kept sorted by ascending rd and holds at most
+// winner_mode_count_allowed[multi_winner_mode_type] entries. Nothing is stored
+// when multi winner mode processing is off, when this_rd is INT64_MAX, or for
+// palette modes in non-intra-only frames.
+//  - rd_cost / rd_cost_y / rd_cost_uv: optional; only used for non-intra-only
+//    frames and only when all three are non-NULL.
+//  - color_map: optional palette color index map, copied when non-NULL.
+//  - txfm_search_done: when non-zero, rate_y / rate_uv are also recorded.
+static INLINE void store_winner_mode_stats(
+ const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi,
+ RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+ THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+ int multi_winner_mode_type, int txfm_search_done) {
+ WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+ int mode_idx = 0;
+ int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+ // Mode stat is not required when multiwinner mode processing is disabled
+ if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return;
+ // Ignore mode with maximum rd
+ if (this_rd == INT64_MAX) return;
+ // TODO(any): Winner mode processing is currently not applicable for palette
+ // mode in Inter frames. Clean-up the following code, once support is added
+ if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+ int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type];
+ assert(x->winner_mode_count >= 0 &&
+ x->winner_mode_count <= max_winner_mode_count);
+
+ if (x->winner_mode_count) {
+ // Find the mode which has higher rd cost than this_rd
+ for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++)
+ if (winner_mode_stats[mode_idx].rd > this_rd) break;
+
+ // If the loop ran to completion, mode_idx == x->winner_mode_count: either
+ // the list is full (drop the candidate) or the candidate is appended.
+ if (mode_idx == max_winner_mode_count) {
+ // No mode has higher rd cost than this_rd
+ return;
+ } else if (mode_idx < max_winner_mode_count - 1) {
+ // Create a slot for current mode and move others to the next slot
+ // (the entry in the last slot, if any, is overwritten by the shift).
+ memmove(
+ &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx],
+ (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats));
+ }
+ // When mode_idx == max_winner_mode_count - 1 no shift is needed: the last
+ // slot is simply overwritten below.
+ }
+ // Add a mode stat for winner mode processing
+ winner_mode_stats[mode_idx].mbmi = *mbmi;
+ winner_mode_stats[mode_idx].rd = this_rd;
+ winner_mode_stats[mode_idx].mode_index = mode_index;
+
+ // Update rd stats required for inter frame
+ if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
+ const int skip_txfm = mbmi->skip_txfm && !is_intra_mode;
+
+ winner_mode_stats[mode_idx].rd_cost = *rd_cost;
+ // Luma/chroma rates are only recorded once the transform search has run.
+ if (txfm_search_done) {
+ // rate_y also includes the cost of signalling the skip_txfm flag.
+ winner_mode_stats[mode_idx].rate_y =
+ rd_cost_y->rate +
+ x->mode_costs
+ .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm];
+ winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
+ }
+ }
+
+ if (color_map) {
+ // Store color_index_map for palette mode
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int block_width, block_height;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+ memcpy(winner_mode_stats[mode_idx].color_index_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ }
+
+ // The count grows by one per stored mode and saturates at the maximum.
+ x->winner_mode_count =
+ AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
+}
+
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane,
+ int use_hbd);
+
+unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane);
+
+// Returns non-zero when `mode` compares below INTRA_MODE_END.
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+  const int precedes_intra_end = (mode < INTRA_MODE_END);
+  return precedes_intra_end;
+}
+
+// Copies the first USABLE_REF_MV_STACK_SIZE entries of xd's ref_mv_stack and
+// weight arrays for `ref_frame` into the corresponding arrays of mbmi_ext.
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+    MV_REFERENCE_FRAME ref_frame) {
+  const size_t weight_bytes =
+      USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]);
+  const size_t stack_bytes =
+      USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]);
+
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+         stack_bytes);
+  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame], weight_bytes);
+}
+
+// Returns the transform rd gate level to use for `tx_search_case`.
+// The case-specific level is returned only when its extra conditions hold;
+// in every other situation the TX_SEARCH_DEFAULT level is returned.
+static INLINE int get_txfm_rd_gate_level(
+    const int is_masked_compound_enabled,
+    const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
+    TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
+  assert(tx_search_case < TX_SEARCH_CASES);
+
+  int level_idx = TX_SEARCH_DEFAULT;
+  if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
+      num_pels_log2_lookup[bsize] > 8) {
+    level_idx = TX_SEARCH_MOTION_MODE;
+  } else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+             is_masked_compound_enabled) {
+    // Aggressive gating of the transform search is enabled only when masked
+    // compound type is enabled.
+    level_idx = TX_SEARCH_COMP_TYPE_MODE;
+  }
+
+  return txfm_rd_gate_level[level_idx];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 0000000000..9b964113a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// Encoder-side subpel parameter setup: initializes *subpel_params through
+// init_subpel_params() for the reference buffer held in inter_pred_params, and
+// returns through *pre / *src_stride the start address and stride of the
+// reference samples to read.
+static AOM_INLINE void enc_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
+ struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width,
+ pre_buf->height);
+ // Offset from buf0 by pos_y / pos_x with the low SCALE_SUBPEL_BITS bits
+ // shifted out; the row part is scaled by the buffer stride.
+ *pre = pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ *src_stride = pre_buf->stride;
+}
+
+#define IS_DEC 0
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
+// Externally visible wrapper that forwards its arguments unchanged to
+// build_one_inter_predictor().
+// NOTE(review): build_one_inter_predictor() is presumably supplied by the
+// reconinter_template.inc include compiled with IS_DEC == 0 — confirm.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params) {
+ build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params);
+}
+
+// Forwards to build_inter_predictors() with build_for_obmc fixed to 0; all
+// other arguments are passed through unchanged.
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int bw, int bh, int mi_x, int mi_y) {
+ build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x,
+ mi_y);
+}
+
+// Builds the luma (AOM_PLANE_Y) inter predictor for the current block into
+// xd->plane[AOM_PLANE_Y].dst, using mv[0] and the interp filters of xd->mi[0]
+// together with xd->block_ref_scale_factors[0].
+//  - mi_row / mi_col: block position in mode-info units; converted to pixel
+//    units by multiplying with MI_SIZE.
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ InterPredParams inter_pred_params;
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ const MV mv = xd->mi[0]->mv[0].as_mv;
+ const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+ // pd->pre decays to a pointer to the first reference buffer (pre[0]).
+ av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), false, sf, pd->pre,
+ xd->mi[0]->interp_filters);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+ // Explicitly disable distance-weighted compound averaging for this
+ // predictor.
+ inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+ av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+ &inter_pred_params);
+}
+
+// Builds the luma inter predictor into xd->plane[AOM_PLANE_Y].dst from
+// caller-provided inter prediction parameters and precomputed subpel
+// parameters. The function overwrites inter_pred_params->ref_frame_buf and
+// the interp filter parameters before calling av1_make_inter_predictor().
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *pre_buf = &pd->pre[0];
+ // Source address: buf0 offset by pos_y / pos_x with the low
+ // SCALE_SUBPEL_BITS bits shifted out.
+ const uint8_t *src =
+ pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ uint8_t *const dst = dst_buf->buf;
+ int src_stride = pre_buf->stride;
+ int dst_stride = dst_buf->stride;
+ inter_pred_params->ref_frame_buf = *pre_buf;
+
+ // Initialize interp filter for single reference mode.
+ init_interp_filter_params(inter_pred_params->interp_filter_params,
+ &mbmi->interp_filters.as_filters, pd->width,
+ pd->height, /*is_intrabc=*/0);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params,
+ subpel_params);
+}
+
+// Builds the inter prediction for planes plane_from..plane_to (inclusive) of
+// the current block, and additionally applies the inter-intra predictor on
+// each plane when xd->mi[0] is an inter-intra block.
+//  - mi_row / mi_col: block position in mode-info units.
+//  - ctx: buffers handed to av1_build_interintra_predictor(); may be NULL, in
+//    which case the destination buffers/strides of xd->plane[0..2] are used.
+//  - bsize: block size passed to av1_build_interintra_predictor().
+// Chroma planes are skipped when the block is not a chroma reference.
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+                                   int plane_from, int plane_to) {
+  // The fallback buffer set lives at function scope so that a pointer to it
+  // remains valid across loop iterations. Pointing `ctx` at an object declared
+  // inside the loop body would leave it dangling on the next iteration.
+  BUFFER_SET default_ctx;
+  const int use_default_ctx = (ctx == NULL);
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width,
+                               xd->plane[plane].height, mi_x, mi_y);
+
+    if (is_interintra_pred(xd->mi[0])) {
+      if (use_default_ctx) {
+        // Refresh the fallback from the current destination planes each time
+        // it is needed.
+        const BUFFER_SET dst_bufs = {
+          { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+          { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+            xd->plane[2].dst.stride }
+        };
+        default_ctx = dst_bufs;
+        ctx = &default_ctx;
+      }
+      av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
+                                     xd->plane[plane].dst.stride, ctx, plane,
+                                     bsize);
+    }
+  }
+}
+
+// Prepares xd for predicting from a neighboring block during OBMC:
+//  - points every plane's dst at the temporary buffers held in `ctxt`,
+//  - installs the scale factors of ref_mbmi's first reference frame in
+//    xd->block_ref_scale_factors[0],
+//  - sets up the pre planes from that reference at the neighbor's position
+//    (xd->mi_row + mi_row_offset, xd->mi_col + mi_col_offset).
+// Reports AOM_CODEC_UNSUP_BITSTREAM through aom_internal_error() when the
+// scale factors are invalid.
+static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+ int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ // Neighbor block sizes below BLOCK_8X8 are treated as BLOCK_8X8 here.
+ const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize);
+ const int ref_mi_row = xd->mi_row + mi_row_offset;
+ const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+ ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+ ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ // Only the neighbor's first reference frame is used.
+ const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+
+ xd->block_ref_scale_factors[0] = sf;
+ if (!av1_is_valid_scale(sf))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+
+ av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+ num_planes);
+}
+
+// Callback handed to foreach_overlappable_nb_above() /
+// foreach_overlappable_nb_left(): builds, for every plane, the prediction
+// obtained with one neighbor's first motion vector and interp filters, and
+// writes it to the temporary buffers referenced by `fun_ctxt`.
+//  - rel_mi_row / rel_mi_col: neighbor position relative to the current block.
+//  - op_mi_size: size of the overlapped edge in mode-info units.
+//  - dir: non-zero for a left neighbor, zero for an above neighbor.
+//  - above_mbmi: the neighbor's mode info (used for both directions despite
+//    the name).
+//  - fun_ctxt: a struct build_prediction_ctxt *.
+static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *above_mbmi,
+ void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+ num_planes);
+
+ // Pixel position of the neighbor block.
+ const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
+ const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+
+ InterPredParams inter_pred_params;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = 0, bh = 0;
+
+ // The dimension perpendicular to the shared edge is half the current
+ // block's (subsampled) size, clamped between 4 and half of a 64x64 block.
+ if (dir) {
+ // prepare left reference block size
+ bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
+ } else {
+ // prepare above reference block size
+ bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+ bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+ }
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue;
+
+ const struct buf_2d *const pre_buf = &pd->pre[0];
+ const MV mv = above_mbmi->mv[0].as_mv;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], pre_buf,
+ above_mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, j, xd->bd);
+
+ // pd->dst was redirected to the temporary buffer by
+ // setup_address_for_obmc() above.
+ av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv,
+ &inter_pred_params);
+ }
+}
+
+// Builds OBMC predictions from the overlappable neighbors above the current
+// block into the caller-provided per-plane temporary buffers. Does nothing
+// when xd->up_available is zero.
+//  - tmp_buf / tmp_width / tmp_height / tmp_stride: per-plane buffers and
+//    their geometry, stored in the build_prediction_ctxt given to the callback.
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+ // Positional initializer: the sixth field receives xd->mb_to_right_edge and
+ // the last one is NULL.
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ // The neighbor limit is looked up from the block's width (log2, in
+ // mode-info units).
+ foreach_overlappable_nb_above(cm, xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+// Builds OBMC predictions from the overlappable neighbors to the left of the
+// current block into the caller-provided per-plane temporary buffers. Does
+// nothing when xd->left_available is zero.
+//  - tmp_buf / tmp_width / tmp_height / tmp_stride: per-plane buffers and
+//    their geometry, stored in the build_prediction_ctxt given to the callback.
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+ // Positional initializer: the sixth field receives xd->mb_to_bottom_edge and
+ // the last one is NULL.
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ // The neighbor limit is looked up from the block's height (log2, in
+ // mode-info units).
+ foreach_overlappable_nb_left(cm, xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+// Builds the OBMC prediction for the current block: predictions from the
+// above neighbors go to one set of scratch buffers, predictions from the left
+// neighbors to another, and both are then combined by
+// av1_build_obmc_inter_prediction().
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ // Every scratch plane is described as MAX_SB_SIZE x MAX_SB_SIZE with a
+ // stride of MAX_SB_SIZE.
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
+ dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+ dst_stride2);
+ // The neighbor passes redirect xd->plane[].dst to the scratch buffers, so
+ // point dst back at the current frame before blending.
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+ dst_stride2);
+}
+
+// Builds, for planes plane_from..plane_to (inclusive), the single-reference
+// inter prediction of the current block for reference index `ref`, writing
+// each plane to the external buffer ext_dst[plane] with stride
+// ext_dst_stride[plane].
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ // Which kinds of warped prediction apply to this block and reference.
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ InterPredParams inter_pred_params;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[ref], &pd->pre[ref],
+ mi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ // get_buf_by_bd() yields the destination pointer form matching the
+ // current buffer's bit depth.
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]);
+ const MV mv = mi->mv[ref].as_mv;
+
+ av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv,
+ &inter_pred_params);
+ }
+}
+
+// Blends two 8-bit predictions (src0, src1) into dst with the compound mask
+// obtained from av1_get_compound_type_mask() for `sb_type`.
+//  - h / w: height and width of the region to blend.
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ // A dimension counts as subsampled when it equals
+ // 2 << mi_size_{high,wide}_log2[sb_type].
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ // The mask stride is the full (unsubsampled) block width.
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit depth counterpart of build_masked_compound(): blends src0_8 and
+// src1_8 into dst_8 through aom_highbd_blend_a64_mask() at bit depth `bd`.
+//  - h / w: height and width of the region to blend.
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ // A dimension counts as subsampled when it equals
+ // 2 << mi_size_{high,wide}_log2[sb_type].
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ // const uint8_t *mask =
+ // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ // The mask stride is the full (unsubsampled) block width.
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+#endif
+
+// Produces the final prediction for one plane from two already-built
+// single-reference predictions held in external buffers.
+//  - Masked compound block: ext_dst0 and ext_dst1 are blended into the plane's
+//    dst buffer; for the luma plane of a COMPOUND_DIFFWTD block the
+//    difference-weighted mask is built first.
+//  - Any other block: ext_dst0 is copied to the plane's dst buffer.
+//  - x / y: offset inside the dst buffer; w / h: region size.
+// Side effect: mbmi->interinter_comp.seg_mask is set to xd->seg_mask.
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ // The difference-weighted mask is derived only on plane 0; other planes
+ // blend with whatever seg_mask already holds.
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ } else {
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+#else
+ // is_hbd is unused in builds without high bit depth support.
+ (void)is_hbd;
+ av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type,
+ ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, h, w);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w, xd->bd);
+ } else {
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w);
+ }
+#else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h,
+ w);
+#endif
+ } else {
+ // Not a masked compound block: the first prediction is the result.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
+ } else {
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+ }
+#else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+#endif
+ }
+}
+
+// For every plane in plane_from..plane_to (inclusive), combines the two
+// externally buffered single-reference predictions into the plane's
+// destination buffer, covering the whole plane block (offset 0,0).
+//  - ext_dst0 / ext_dst1: per-plane prediction buffers.
+//  - ext_dst_stride0 / ext_dst_stride1: their per-plane strides.
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[],
+                                              int ext_dst_stride0[],
+                                              uint8_t *ext_dst1[],
+                                              int ext_dst_stride1[]) {
+  assert(bsize < BLOCK_SIZES_ALL);
+
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const int ss_x = xd->plane[p].subsampling_x;
+    const int ss_y = xd->plane[p].subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    const int plane_w = block_size_wide[plane_bsize];
+    const int plane_h = block_size_high[plane_bsize];
+
+    build_wedge_inter_predictor_from_buf(xd, p, 0, 0, plane_w, plane_h,
+                                         ext_dst0[p], ext_dst_stride0[p],
+                                         ext_dst1[p], ext_dst_stride1[p]);
+  }
+}
+
+// Get pred block from up-sampled reference.
+//
+// Writes a width x height 8-bit prediction to comp_pred (stride == width).
+//  - When xd is non-NULL and the reference is scaled, the block is produced by
+//    the regular inter predictor with EIGHTTAP_REGULAR filters, using mv and
+//    the block position (mi_row, mi_col).
+//  - Otherwise `ref` is filtered directly according to subpel_x_q3 /
+//    subpel_y_q3 with the filter selected by subpel_search.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ // Intra block copy always uses the identity scale factors.
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ // No fractional offset: plain row-by-row copy.
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ // Horizontal-only filtering.
+ // NOTE(review): the q3 offset is doubled before the kernel lookup,
+ // presumably because kernels are indexed in 1/16-pel steps — confirm.
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ // Vertical-only filtering.
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
+ } else {
+ // Both directions: filter horizontally into `temp`, then vertically from
+ // `temp` into comp_pred.
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ // Rows needed by the vertical pass: the output rows plus filter->taps.
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ // Start (taps / 2 - 1) rows above `ref` so the vertical filter has its
+ // leading context rows available.
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ // Skip the same number of context rows in `temp`.
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
+ }
+}
+
+// Builds the up-sampled prediction into comp_pred with aom_upsampled_pred_c()
+// and then replaces each sample by the rounded average of that sample and the
+// co-located sample of `pred`. Both comp_pred and pred are walked with a
+// stride of `width`.
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   int mi_row, int mi_col, const MV *const mv,
+                                   uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, int subpel_x_q3,
+                                   int subpel_y_q3, const uint8_t *ref,
+                                   int ref_stride, int subpel_search) {
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int sum = comp_pred[col] + pred[col];
+      comp_pred[col] = ROUND_POWER_OF_TWO(sum, 1);
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+// Masked compound prediction from an up-sampled reference.
+// When either sub-pixel offset is non-zero, the up-sampled prediction is first
+// built into comp_pred and then used (with stride `width`) as the reference
+// for aom_comp_mask_pred(); otherwise `ref` is passed through unchanged.
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  int mi_row, int mi_col, const MV *const mv,
+                                  uint8_t *comp_pred, const uint8_t *pred,
+                                  int width, int height, int subpel_x_q3,
+                                  int subpel_y_q3, const uint8_t *ref,
+                                  int ref_stride, const uint8_t *mask,
+                                  int mask_stride, int invert_mask,
+                                  int subpel_search) {
+  const uint8_t *blend_ref = ref;
+  int blend_ref_stride = ref_stride;
+
+  if (subpel_x_q3 != 0 || subpel_y_q3 != 0) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    blend_ref = comp_pred;
+    blend_ref_stride = width;
+  }
+
+  aom_comp_mask_pred(comp_pred, pred, width, height, blend_ref,
+                     blend_ref_stride, mask, mask_stride, invert_mask);
+}
+
+// Builds the up-sampled prediction into comp_pred with aom_upsampled_pred_c()
+// and then replaces each sample by the weighted combination
+//   pred * bck_offset + comp_pred * fwd_offset,
+// rounded by DIST_PRECISION_BITS. The weights are read from jcp_param. Both
+// comp_pred and pred are walked with a stride of `width`.
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  const int fwd_weight = jcp_param->fwd_offset;
+  const int bck_weight = jcp_param->bck_offset;
+
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted_sum =
+          pred[col] * bck_weight + comp_pred[col] * fwd_weight;
+      const int rounded =
+          ROUND_POWER_OF_TWO(weighted_sum, DIST_PRECISION_BITS);
+      comp_pred[col] = (uint8_t)rounded;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// C implementation of the high-bitdepth upsampled predictor.
+//
+// |comp_pred8| and |ref8| are byte-pointer aliases of uint16_t buffers (see
+// CONVERT_TO_SHORTPTR / CONVERT_TO_BYTEPTR). The output block is written with
+// a row pitch of |width|.
+//
+// If |xd| is non-NULL and the block's reference is scaled, the prediction is
+// built by av1_enc_build_one_inter_predictor() and the function returns.
+// Otherwise the filter selected by |subpel_search| is applied:
+//   - no sub-pel offset: rows are copied from |ref8|;
+//   - horizontal or vertical offset only: a single 1-D convolution;
+//   - both offsets: a horizontal pass into a stack scratch buffer followed by
+//     a vertical pass into |comp_pred8|.
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+                                 const struct AV1Common *const cm, int mi_row,
+                                 int mi_col, const MV *const mv,
+                                 uint8_t *comp_pred8, int width, int height,
+                                 int subpel_x_q3, int subpel_y_q3,
+                                 const uint8_t *ref8, int ref_stride, int bd,
+                                 int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    // Integer-pel position: plain row-by-row copy.
+    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    for (int i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+                                 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                                kernel, 16, width, height, bd);
+  } else {
+    // Scratch for the horizontal pass: ((MAX_SB_SIZE + 16) + 16) rows of
+    // MAX_SB_SIZE samples each.
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    // The bound must match the number of rows actually available in |temp|;
+    // a looser bound would let the horizontal pass write past the buffer
+    // without tripping the assert.
+    assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);
+    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
+                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                                 intermediate_height, bd);
+    aom_highbd_convolve8_vert_c(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+        bd);
+  }
+}
+
+// High-bitdepth compound average of |pred8| and the upsampled reference.
+//
+// aom_highbd_upsampled_pred() writes into |comp_pred8|; every sample is then
+// replaced by the rounded mean of itself and the co-located |pred8| sample.
+// Both buffers are accessed as uint16_t with a row pitch of |width|.
+void aom_highbd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *dst_row = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int sum = pred_row[col] + dst_row[col];
+      dst_row[col] = ROUND_POWER_OF_TWO(sum, 1);
+    }
+    dst_row += width;
+    pred_row += width;
+  }
+}
+
+// High-bitdepth distance-weighted compound average.
+//
+// aom_highbd_upsampled_pred_c() writes the upsampled reference into
+// |comp_pred8|; each uint16_t sample is then replaced by
+//   ROUND_POWER_OF_TWO(pred * bck_offset + comp_pred * fwd_offset,
+//                      DIST_PRECISION_BITS)
+// using the offsets stored in |jcp_param|. Row pitch of both buffers is
+// |width|.
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  const int weight_fwd = jcp_param->fwd_offset;
+  const int weight_bck = jcp_param->bck_offset;
+  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *dst_row = CONVERT_TO_SHORTPTR(comp_pred8);
+
+  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                              height, subpel_x_q3, subpel_y_q3, ref8,
+                              ref_stride, bd, subpel_search);
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int weighted =
+          pred_row[col] * weight_bck + dst_row[col] * weight_fwd;
+      dst_row[col] =
+          (uint16_t)ROUND_POWER_OF_TWO(weighted, DIST_PRECISION_BITS);
+    }
+    dst_row += width;
+    pred_row += width;
+  }
+}
+
+// High-bitdepth masked compound prediction.
+//
+// Unlike the 8-bit variant, aom_highbd_upsampled_pred() is always invoked
+// here; its output in |comp_pred8| (row pitch |width|) is then used both as
+// the reference input and as the destination of aom_highbd_comp_mask_pred().
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd, int subpel_search) {
+  uint8_t *const upsampled8 = comp_pred8;
+  const int upsampled_stride = width;
+
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, upsampled8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, upsampled8,
+                            upsampled_stride, mask, mask_stride, invert_mask);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 0000000000..16932f37a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Masked compound prediction written to |comp_pred|. If either sub-pel offset
+// is nonzero the reference is first passed through aom_upsampled_pred() into
+// |comp_pred|; the result (or |ref| itself otherwise) is then combined with
+// |pred| by aom_comp_mask_pred() using |mask|, |mask_stride| and
+// |invert_mask|.
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search);
+
+// High-bitdepth counterpart: always runs aom_highbd_upsampled_pred() into
+// |comp_pred8| and then aom_highbd_comp_mask_pred() on that result and
+// |pred8|. |bd| is forwarded to the upsampling step.
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search);
+
+// Build single or compound reference inter predictors for all planes.
+// Can build inter-intra predictors, masked predictors etc as well.
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to);
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col);
+
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+// Build one inter predictor. It is called for building predictor for single
+// reference case, or just the 1st or 2nd reference in compound reference case.
+// Can build both regular and masked predictors.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params);
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[],
+ int ext_dst_stride0[],
+ uint8_t *ext_dst1[],
+ int ext_dst_stride1[]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/saliency_map.c b/third_party/aom/av1/encoder/saliency_map.c
new file mode 100644
index 0000000000..30019bbec0
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.c
@@ -0,0 +1,1414 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/saliency_map.h"
+
+// The Gabor filters are generated with the following parameters:
+// ksize = 9
+// sigma = 1
+// theta = y * np.pi / 4, where y \in {0, 1, 2, 3}, i.e., 0, 45, 90 and 135
+// degrees
+// lambda1 = 1
+// gamma = 0.8
+// phi = 0
+static const double kGaborFilter[4][9][9] = { // [angle: 0, 45, 90, 135
+ // degree][ksize][ksize]
+ { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00,
+ 6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } },
+
+ { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 } },
+
+ { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00,
+ 7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } },
+
+ { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 } }
+};
+
+// Derives red, green, blue and intensity planes from an 8-bit YUV source.
+//
+// For every luma position the co-located chroma samples (located using
+// |subsampling_x| / |subsampling_y|) are combined with the luma sample to form
+// R, G and B, each clamped to [0, 255]. Intensity is (r + g + b) / 3. All four
+// outputs are finally divided by 256 and stored in tightly packed arrays of
+// crop_widths[0] x crop_heights[0] doubles.
+// TODO(linzhen): add high bitdepth support.
+static void get_color_intensity(const YV12_BUFFER_CONFIG *src,
+                                int subsampling_x, int subsampling_y,
+                                double *cr, double *cg, double *cb,
+                                double *intensity) {
+  const uint8_t *const y_plane = src->buffers[0];
+  const uint8_t *const u_plane = src->buffers[1];
+  const uint8_t *const v_plane = src->buffers[2];
+
+  const int rows = src->crop_heights[0];
+  const int cols = src->crop_widths[0];
+  const int luma_stride = src->strides[0];
+  const int chroma_stride = src->strides[1];
+
+  for (int row = 0; row < rows; ++row) {
+    const int chroma_row = (row >> subsampling_y) * chroma_stride;
+    for (int col = 0; col < cols; ++col) {
+      const int out = row * cols + col;
+      const int chroma_pos = chroma_row + (col >> subsampling_x);
+      const double luma = (double)y_plane[row * luma_stride + col];
+      // Chroma samples re-centered around zero.
+      const double u_centered = (double)(u_plane[chroma_pos] - 128);
+      const double v_centered = (double)(v_plane[chroma_pos] - 128);
+
+      cr[out] = fclamp(luma + 1.370 * v_centered, 0, 255);
+      cg[out] =
+          fclamp(luma - 0.698 * u_centered - 0.337 * v_centered, 0, 255);
+      cb[out] = fclamp(luma + 1.732 * u_centered, 0, 255);
+
+      intensity[out] = (cr[out] + cg[out] + cb[out]) / 3.0;
+      assert(intensity[out] >= 0 && intensity[out] <= 255);
+
+      // Scale everything down by 256.
+      intensity[out] /= 256;
+      cr[out] /= 256;
+      cg[out] /= 256;
+      cb[out] /= 256;
+    }
+  }
+}
+
+// Returns the sum of filter[i] * map[i] over the first |size| entries,
+// accumulated in index order.
+static INLINE double convolve_map(const double *filter, const double *map,
+                                  const int size) {
+  double acc = 0;
+  const double *tap = filter;
+  const double *sample = map;
+  for (int remaining = size; remaining > 0; --remaining) {
+    // The kernel is not flipped: a symmetric filter is used.
+    acc += *tap++ * *sample++;
+  }
+  return acc;
+}
+
+// Halves |map| in both dimensions. Each output sample is a 5x5 Gaussian-
+// weighted sum centered on an even (x, y) position of the input; window
+// coordinates falling outside the map are clamped to the nearest edge.
+// |stride| is the row pitch of |map|; the output is written with a row pitch
+// of width / 2.
+static INLINE void decimate_map(const double *map, int height, int width,
+                                int stride, double *downsampled_map) {
+  const int out_width = width / 2;
+  const int taps_per_side = 5;
+  const int radius = taps_per_side / 2;
+  const double kernel[25] = {
+    1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256,  //
+    1. / 64,  1. / 16,  3. / 32,  1. / 16, 1. / 64,   //
+    3. / 128, 3. / 32,  9. / 64,  3. / 32, 3. / 128,  //
+    1. / 64,  1. / 16,  3. / 32,  1. / 16, 1. / 64,   //
+    1. / 256, 1. / 64,  3. / 128, 1. / 64, 1. / 256
+  };
+
+  double window[25];
+  for (int src_y = 0; src_y < height - 1; src_y += 2) {
+    for (int src_x = 0; src_x < width - 1; src_x += 2) {
+      // Gather the clamped 5x5 neighborhood in row-major order.
+      double *slot = window;
+      for (int dy = -radius; dy <= radius; ++dy) {
+        const int row = clamp(src_y + dy, 0, height - 1);
+        for (int dx = -radius; dx <= radius; ++dx) {
+          const int col = clamp(src_x + dx, 0, width - 1);
+          *slot++ = map[row * stride + col];
+        }
+      }
+      downsampled_map[(src_y / 2) * out_width + (src_x / 2)] =
+          convolve_map(kernel, window, taps_per_side * taps_per_side);
+    }
+  }
+}
+
+// Upscales |input| from pyramid level |in_level| to level |out_level| (a
+// smaller index), one level at a time. Each step replicates samples: the
+// output sample at (i, j) is taken from position (i >> 1, j >> 1) of the
+// previous level, clamped to that level's bounds. |height| and |width| give
+// the dimensions of every level. Intermediate and final results are stored in
+// |output|. Nothing is written when |in_level| <= |out_level|.
+// Returns 1 on success, 0 if a temporary buffer cannot be allocated.
+static INLINE int upscale_map(const double *input, int in_level, int out_level,
+                              int height[9], int width[9], double *output) {
+  int level = in_level;
+  while (level > out_level) {
+    const int src_w = width[level];  // also the source row pitch
+    const int src_h = height[level];
+    // The first step reads the caller's input, later steps read the previous
+    // result held in |output|.
+    const double *src = (level == in_level) ? input : output;
+
+    assert(level > 0);
+
+    const int dst_h = height[level - 1];
+    const int dst_w = width[level - 1];
+
+    double *scratch = aom_malloc(dst_h * dst_w * sizeof(*scratch));
+    if (scratch == NULL) {
+      return 0;
+    }
+
+    for (int r = 0; r < dst_h; ++r) {
+      const int src_r = clamp((r >> 1), 0, src_h - 1);
+      for (int c = 0; c < dst_w; ++c) {
+        const int src_c = clamp((c >> 1), 0, src_w - 1);
+        scratch[c + r * dst_w] = src[src_c + src_r * src_w];
+      }
+    }
+    memcpy(output, scratch, dst_h * dst_w * sizeof(double));
+    aom_free(scratch);
+    --level;
+  }
+
+  return 1;
+}
+
+// Computes the six center-surround feature maps. For each fine scale c \in
+// {2, 3, 4} and each delta \in {3, 4}, the coarser level s = c + delta is
+// upscaled to the size of level c and the absolute difference with level c is
+// stored. Outputs are ordered (c=2,d=3), (c=2,d=4), (c=3,d=3), ...
+// Returns 1 on success, 0 if upscale_map() fails.
+static int center_surround_diff(const double *input[9], int height[9],
+                                int width[9], saliency_feature_map *output[6]) {
+  int out_idx = 0;
+  for (int center = 2; center < 5; ++center) {
+    const int rows = height[center];
+    const int cols = width[center];
+
+    for (int delta = 3; delta <= 4; ++delta, ++out_idx) {
+      const int surround = center + delta;
+      double *const fm = output[out_idx]->buf;
+
+      if (upscale_map(input[surround], surround, center, height, width, fm) ==
+          0) {
+        return 0;
+      }
+
+      const double *const fine = input[center];
+      for (int row = 0; row < rows; ++row) {
+        for (int col = 0; col < cols; ++col) {
+          const int pos = row * cols + col;
+          fm[pos] = fabs(fine[pos] - fm[pos]);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+// Center-surround differences for the color channels, based on "color
+// double-opponency": the fine scale c is read from |input_1| while the coarser
+// scale s = c + delta is read from |input_2| (e.g. R-G at scale c against G-R
+// at scale s). c \in {2, 3, 4}, delta \in {3, 4}; outputs are ordered as in
+// center_surround_diff(). Returns 1 on success, 0 if upscale_map() fails.
+static int center_surround_diff_rgb(const double *input_1[9],
+                                    const double *input_2[9], int height[9],
+                                    int width[9],
+                                    saliency_feature_map *output[6]) {
+  int out_idx = 0;
+  for (int center = 2; center < 5; ++center) {
+    const int rows = height[center];
+    const int cols = width[center];
+
+    for (int delta = 3; delta <= 4; ++delta, ++out_idx) {
+      const int surround = center + delta;
+      double *const fm = output[out_idx]->buf;
+
+      if (upscale_map(input_2[surround], surround, center, height, width,
+                      fm) == 0) {
+        return 0;
+      }
+
+      const double *const fine = input_1[center];
+      for (int row = 0; row < rows; ++row) {
+        for (int col = 0; col < cols; ++col) {
+          const int pos = row * cols + col;
+          fm[pos] = fabs(fine[pos] - fm[pos]);
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+// Builds a 9-level Gaussian pyramid (levels 0..8) from |src| and derives the
+// feature maps in |dst| from its center-surround differences. Level 0 is a
+// copy of |src|; each following level is produced by decimate_map() from the
+// previous one. All pyramid buffers are released before returning.
+// Returns 1 on success, 0 on allocation failure or if center_surround_diff()
+// fails.
+static int gaussian_pyramid(const double *src, int width[9], int height[9],
+                            saliency_feature_map *dst[6]) {
+  double *pyramid[9];  // scale = 9
+  int levels_built = 0;
+  int ok = 0;
+
+  pyramid[0] = (double *)aom_malloc(width[0] * height[0] * sizeof(*pyramid[0]));
+  if (!pyramid[0]) {
+    return 0;
+  }
+  memcpy(pyramid[0], src, width[0] * height[0] * sizeof(double));
+  levels_built = 1;
+
+  while (levels_built < 9) {
+    const int lvl = levels_built;
+    const int prev_stride = width[lvl - 1];
+    const int lvl_width = width[lvl];
+    const int lvl_height = height[lvl];
+
+    pyramid[lvl] =
+        (double *)aom_malloc(lvl_width * lvl_height * sizeof(*pyramid[lvl]));
+    if (!pyramid[lvl]) {
+      break;
+    }
+    memset(pyramid[lvl], 0, lvl_width * lvl_height * sizeof(double));
+    decimate_map(pyramid[lvl - 1], height[lvl - 1], width[lvl - 1], prev_stride,
+                 pyramid[lvl]);
+    ++levels_built;
+  }
+
+  if (levels_built == 9) {
+    ok = center_surround_diff((const double **)pyramid, height, width, dst) !=
+         0;
+  }
+
+  // Release exactly the levels that were allocated.
+  for (int lvl = 0; lvl < levels_built; ++lvl) {
+    aom_free(pyramid[lvl]);
+  }
+  return ok;
+}
+
+// Builds two 9-level Gaussian pyramids, one from |src_1| and one from
+// |src_2|, and derives the feature maps in |dst| through
+// center_surround_diff_rgb() (fine scales from the first pyramid, coarse
+// scales from the second). Level 0 of each pyramid is a copy of its source;
+// every further level is produced by decimate_map() from the previous one.
+//
+// Every pyramid buffer is released before returning, including on allocation
+// failure part-way through the build.
+// Returns 1 on success, 0 on allocation failure or if
+// center_surround_diff_rgb() fails.
+static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9],
+                                int height[9], saliency_feature_map *dst[6]) {
+  // NULL-initialized so that the single cleanup loop below can run safely no
+  // matter how far the build got (aom_free() is called on NULL elsewhere in
+  // this file as well).
+  double *gaussian_map[2][9] = { { NULL } };  // scale = 9
+  double *const src[2] = { src_1, src_2 };
+  int ok = 1;
+
+  for (int k = 0; k < 2 && ok; ++k) {
+    for (int i = 0; i < 9; ++i) {
+      const int cur_width = width[i];
+      const int cur_height = height[i];
+
+      gaussian_map[k][i] = (double *)aom_malloc(cur_width * cur_height *
+                                                sizeof(*gaussian_map[k][i]));
+      if (!gaussian_map[k][i]) {
+        ok = 0;
+        break;
+      }
+
+      if (i == 0) {
+        memcpy(gaussian_map[k][0], src[k],
+               cur_width * cur_height * sizeof(double));
+      } else {
+        const int stride = width[i - 1];
+        memset(gaussian_map[k][i], 0, cur_width * cur_height * sizeof(double));
+        decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1],
+                     stride, gaussian_map[k][i]);
+      }
+    }
+  }
+
+  if (ok) {
+    ok = center_surround_diff_rgb((const double **)gaussian_map[0],
+                                  (const double **)gaussian_map[1], height,
+                                  width, dst) != 0;
+  }
+
+  for (int l = 0; l < 2; ++l) {
+    for (int i = 0; i < 9; ++i) {
+      aom_free(gaussian_map[l][i]);
+    }
+  }
+  return ok;
+}
+
+// Computes the intensity feature maps |i_map| from |intensity| by delegating
+// to gaussian_pyramid(). Returns 1 on success, 0 on failure.
+static int get_feature_map_intensity(double *intensity, int width[9],
+                                     int height[9],
+                                     saliency_feature_map *i_map[6]) {
+  return gaussian_pyramid(intensity, width, height, i_map) != 0;
+}
+
+// Computes the color feature maps. From the per-pixel color planes |cr|, |cg|
+// and |cb| four non-negative channels r, g, b and y are derived, and the
+// opponent planes r-g, g-r, b-y and y-b are formed at the size of level 0.
+// |rg_map| is built from the (r-g, g-r) pair and |by_map| from the (b-y, y-b)
+// pair via gaussian_pyramid_rgb().
+// Returns 1 on success, 0 on allocation failure or if either pyramid fails.
+static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9],
+                               int height[9], saliency_feature_map *rg_map[6],
+                               saliency_feature_map *by_map[6]) {
+  const int rows = height[0];
+  const int cols = width[0];
+  double *rg_mat = aom_malloc(rows * cols * sizeof(*rg_mat));
+  double *by_mat = aom_malloc(rows * cols * sizeof(*by_mat));
+  double *gr_mat = aom_malloc(rows * cols * sizeof(*gr_mat));
+  double *yb_mat = aom_malloc(rows * cols * sizeof(*yb_mat));
+  int ok = 0;
+
+  if (rg_mat && by_mat && gr_mat && yb_mat) {
+    for (int row = 0; row < rows; ++row) {
+      for (int col = 0; col < cols; ++col) {
+        const int pos = row * cols + col;
+        const double red = cr[pos];
+        const double green = cg[pos];
+        const double blue = cb[pos];
+
+        // Each channel is floored at zero.
+        const double r = AOMMAX(0, red - (green + blue) / 2);
+        const double g = AOMMAX(0, green - (red + blue) / 2);
+        const double b = AOMMAX(0, blue - (red + green) / 2);
+        const double y =
+            AOMMAX(0, (red + green) / 2 - fabs(red - green) / 2 - blue);
+
+        rg_mat[pos] = r - g;
+        by_mat[pos] = b - y;
+        gr_mat[pos] = g - r;
+        yb_mat[pos] = y - b;
+      }
+    }
+
+    // The second pyramid is only attempted when the first one succeeded.
+    ok = gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) != 0 &&
+         gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) != 0;
+  }
+
+  aom_free(rg_mat);
+  aom_free(by_mat);
+  aom_free(gr_mat);
+  aom_free(yb_mat);
+  return ok;
+}
+
+// Applies the 9x9 |kernel| to |input| (width x height, row pitch |width|) and
+// writes the result to |output| with the same layout. Window positions that
+// fall outside the map are clamped to the nearest edge sample.
+static INLINE void filter2d(const double *input, const double kernel[9][9],
+                            int width, int height, double *output) {
+  const int ksize = 9;
+  const int radius = ksize / 2;
+  double window[81];
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      // Gather the clamped 9x9 neighborhood in row-major order.
+      int n = 0;
+      for (int dy = -radius; dy <= radius; ++dy) {
+        const int src_row = clamp(row + dy, 0, height - 1);
+        for (int dx = -radius; dx <= radius; ++dx) {
+          const int src_col = clamp(col + dx, 0, width - 1);
+          window[n++] = input[src_row * width + src_col];
+        }
+      }
+
+      // Accumulate kernel * window in the same row-major order.
+      double *const out = &output[row * width + col];
+      *out = 0;
+      for (int tap = 0; tap < ksize * ksize; ++tap) {
+        *out += kernel[tap / ksize][tap % ksize] * window[tap];
+      }
+    }
+  }
+}
+
+// Computes the 24 orientation feature maps in |dst| from |intensity|.
+//
+// A 9-level Gaussian pyramid is built from |intensity|. Levels 2..8 are each
+// filtered with the four kGaborFilter kernels (0, 45, 90 and 135 degrees), and
+// for every angle j the six center-surround maps are written to
+// dst[j * 6 + 0 .. j * 6 + 5].
+//
+// All temporary buffers are released before returning, on every path.
+// Returns 1 on success, 0 on allocation failure or if center_surround_diff()
+// fails.
+static int get_feature_map_orientation(const double *intensity, int width[9],
+                                       int height[9],
+                                       saliency_feature_map *dst[24]) {
+  // Both tables start out NULL so the cleanup loops never hand an
+  // indeterminate pointer to aom_free() when an allocation fails part-way
+  // through (aom_free() is called on NULL elsewhere in this file as well).
+  double *gaussian_map[9] = { NULL };
+  double *tempGaborOutput[4][9] = {
+    { NULL }
+  };  //[angle: 0, 45, 90, 135 degree][filter_size]
+  int ok = 1;
+
+  for (int i = 0; i < 9; ++i) {
+    const int cur_width = width[i];
+    const int cur_height = height[i];
+
+    gaussian_map[i] =
+        (double *)aom_malloc(cur_width * cur_height * sizeof(*gaussian_map[i]));
+    if (!gaussian_map[i]) {
+      ok = 0;
+      break;
+    }
+
+    if (i == 0) {
+      memcpy(gaussian_map[0], intensity,
+             cur_width * cur_height * sizeof(double));
+    } else {
+      const int stride = width[i - 1];
+      memset(gaussian_map[i], 0, cur_width * cur_height * sizeof(double));
+      decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride,
+                   gaussian_map[i]);
+    }
+  }
+
+  // Only levels 2..8 are consumed by center_surround_diff().
+  for (int i = 2; i < 9 && ok; ++i) {
+    const int cur_height = height[i];
+    const int cur_width = width[i];
+    for (int j = 0; j < 4; ++j) {
+      tempGaborOutput[j][i] = (double *)aom_malloc(
+          cur_height * cur_width * sizeof(*tempGaborOutput[j][i]));
+      if (!tempGaborOutput[j][i]) {
+        ok = 0;
+        break;
+      }
+      filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height,
+               tempGaborOutput[j][i]);
+    }
+  }
+
+  // The pyramid is no longer needed once the Gabor responses exist.
+  for (int i = 0; i < 9; ++i) {
+    aom_free(gaussian_map[i]);
+  }
+
+  if (ok) {
+    saliency_feature_map
+        *tmp[4][6];  //[angle: 0, 45, 90, 135 degree][filter_size]
+
+    for (int i = 0; i < 6; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        tmp[j][i] = dst[j * 6 + i];
+      }
+    }
+
+    for (int j = 0; j < 4 && ok; ++j) {
+      ok = center_surround_diff((const double **)tempGaborOutput[j], height,
+                                width, tmp[j]) != 0;
+    }
+  }
+
+  for (int j = 0; j < 4; ++j) {
+    for (int i = 2; i < 9; ++i) {
+      aom_free(tempGaborOutput[j][i]);
+    }
+  }
+
+  return ok;
+}
+
+// Scans |input| and stores its largest value in |*max_value| and its smallest
+// in |*min_value|. The maximum starts from 0.0 and the minimum from DBL_MAX;
+// all samples are asserted to be non-negative.
+static INLINE void find_min_max(const saliency_feature_map *input,
+                                double *max_value, double *min_value) {
+  assert(input && input->buf);
+  double lowest = DBL_MAX;
+  double highest = 0.0;
+
+  for (int row = 0; row < input->height; ++row) {
+    const double *const line = input->buf + row * input->width;
+    for (int col = 0; col < input->width; ++col) {
+      const double sample = line[col];
+      assert(sample >= 0.0);
+      lowest = fmin(sample, lowest);
+      highest = fmax(sample, highest);
+    }
+  }
+
+  *min_value = lowest;
+  *max_value = highest;
+}
+
+// Returns the mean of the local maxima of |input|, where the local maxima are
+// taken over |stepsize| x |stepsize| tiles starting at multiples of
+// |stepsize|. Only tiles whose origin satisfies y < height - stepsize and
+// x < width - stepsize are visited.
+//
+// Returns -1 if the temporary tile buffer cannot be allocated. Returns 0 when
+// no tile is visited (map not larger than |stepsize| in some dimension), so
+// that the caller never receives the NaN a 0 / 0 division would produce.
+static INLINE double average_local_max(const saliency_feature_map *input,
+                                       int stepsize) {
+  int numlocal = 0;
+  double lmaxmean = 0, lmax = 0, dummy = 0;
+  saliency_feature_map local_map;
+  local_map.height = stepsize;
+  local_map.width = stepsize;
+  local_map.buf =
+      (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf));
+
+  if (!local_map.buf) {
+    return -1;
+  }
+
+  for (int y = 0; y < input->height - stepsize; y += stepsize) {
+    for (int x = 0; x < input->width - stepsize; x += stepsize) {
+      // Copy the tile so find_min_max() can scan it as a standalone map.
+      for (int i = 0; i < stepsize; ++i) {
+        for (int j = 0; j < stepsize; ++j) {
+          local_map.buf[i * stepsize + j] =
+              input->buf[(y + i) * input->width + x + j];
+        }
+      }
+
+      find_min_max(&local_map, &lmax, &dummy);
+      lmaxmean += lmax;
+      numlocal++;
+    }
+  }
+
+  aom_free(local_map.buf);
+
+  if (numlocal == 0) {
+    // No tile fits: report a zero mean instead of dividing by zero.
+    return 0;
+  }
+
+  return lmaxmean / numlocal;
+}
+
+// Linearly rescales the values of |input| in place so that the minimum maps
+// to 0 and the maximum to 1. If all values are equal, the minimum is simply
+// subtracted from every sample.
+static void minmax_normalize(saliency_feature_map *input) {
+  double highest, lowest;
+  find_min_max(input, &highest, &lowest);
+  const int is_flat = (highest == lowest);
+
+  for (int row = 0; row < input->height; ++row) {
+    for (int col = 0; col < input->width; ++col) {
+      double *const cell = &input->buf[row * input->width + col];
+      if (is_flat) {
+        *cell -= lowest;
+      } else {
+        // Equivalent to (x - min) / (max - min), kept in the two-term form.
+        *cell = *cell / (highest - lowest) + lowest / (lowest - highest);
+      }
+    }
+  }
+}
+
+// Promotes meaningful "activation spots" in the map and suppresses
+// homogeneous areas: the map is min-max normalized and then multiplied by
+// (1 - m)^2, where m is the mean local maximum over |stepsize| x |stepsize|
+// tiles. Returns 1 on success, 0 if average_local_max() reports a failure
+// (negative result).
+static int nomalization_operator(saliency_feature_map *input, int stepsize) {
+  minmax_normalize(input);
+
+  const double mean_local_max = average_local_max(input, stepsize);
+  if (mean_local_max < 0) {
+    return 0;
+  }
+
+  const double one_minus_mean = 1 - mean_local_max;
+  const double scale = one_minus_mean * one_minus_mean;
+
+  for (int row = 0; row < input->height; ++row) {
+    double *const line = input->buf + row * input->width;
+    for (int col = 0; col < input->width; ++col) {
+      line[col] *= scale;
+    }
+  }
+
+  return 1;
+}
+
+// Runs the normalization operator on each of the first `num_fm` feature maps
+// of `input` (modifying them in place) and then upscales every map to the
+// original frame size, writing the result into the matching `output` map.
+// Returns 1 on success, or 0 as soon as one of the two steps fails.
+static int normalize_fm(saliency_feature_map *input[6], int width[9],
+                        int height[9], int num_fm,
+                        saliency_feature_map *output[6]) {
+  // Feature maps (FM) are generated by function "center_surround_diff()". The
+  // difference is between a fine scale c and a coarser scale s, where c \in {2,
+  // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale
+  // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3:
+  // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8.
+  for (int fm = 0; fm < num_fm; ++fm) {
+    // Scale c at which feature map `fm` is stored (see the table above).
+    const int fine_scale = (fm / 2) + 2;
+
+    if (!nomalization_operator(input[fm], 8)) return 0;
+
+    // Upscale FM to original frame size
+    if (!upscale_map(input[fm]->buf, fine_scale, 0, height, width,
+                     output[fm]->buf)) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+// Combine feature maps with the same category (intensity, color, or
+// orientation) into one conspicuity map.
+//
+// The six maps in `input` are normalized in place and upscaled to the frame
+// size (width[0] x height[0]); the upscaled maps are added to `output`, which
+// is then normalized itself. Because the maps are accumulated with "+=",
+// `output->buf` must be initialized (normally zeroed) by the caller.
+//
+// Returns 1 on success and 0 on failure (allocation failure or a failing
+// normalization / upscaling step). All temporary maps are released on every
+// path, including partial-allocation failures.
+static int normalized_map(saliency_feature_map *input[6], int width[9],
+                          int height[9], saliency_feature_map *output) {
+  const int num_fm = 6;
+  // Computed in size_t so that large frames cannot overflow int.
+  const size_t num_pixels = (size_t)width[0] * height[0];
+
+  // NULL-initialized so that the cleanup loop can run after a partial
+  // allocation.
+  saliency_feature_map *n_input[6] = { NULL };
+  int ok = 1;
+
+  for (int i = 0; i < num_fm; ++i) {
+    n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i]));
+    if (!n_input[i]) {
+      ok = 0;
+      break;
+    }
+    n_input[i]->buf =
+        (double *)aom_malloc(num_pixels * sizeof(*n_input[i]->buf));
+    if (!n_input[i]->buf) {
+      ok = 0;
+      break;
+    }
+    n_input[i]->height = height[0];
+    n_input[i]->width = width[0];
+  }
+
+  if (ok && normalize_fm(input, width, height, num_fm, n_input) == 0) {
+    ok = 0;
+  }
+
+  if (ok) {
+    // Add up all normalized feature maps with the same category into one map.
+    for (int i = 0; i < num_fm; ++i) {
+      for (int r = 0; r < height[0]; ++r) {
+        for (int c = 0; c < width[0]; ++c) {
+          output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c];
+        }
+      }
+    }
+  }
+
+  // aom_free() is called with possibly-NULL buffers, as elsewhere in this
+  // file.
+  for (int i = 0; i < num_fm; ++i) {
+    if (n_input[i]) {
+      aom_free(n_input[i]->buf);
+      aom_free(n_input[i]);
+    }
+  }
+
+  if (!ok) {
+    return 0;
+  }
+
+  // Report a failing final normalization instead of silently returning a
+  // half-normalized map.
+  return nomalization_operator(output, 8);
+}
+
+// Builds the color conspicuity map: the red/green and blue/yellow feature maps
+// are each combined by normalized_map() into a frame-sized map, the two
+// results are summed into `output`, and `output` is normalized.
+//
+// Returns 1 on success and 0 on failure. All temporary maps are released on
+// every path, including partial-allocation failures.
+static int normalized_map_rgb(saliency_feature_map *rg_map[6],
+                              saliency_feature_map *by_map[6], int width[9],
+                              int height[9], saliency_feature_map *output) {
+  // Computed in size_t so that large frames cannot overflow int.
+  const size_t num_pixels = (size_t)width[0] * height[0];
+
+  // 0: color_cm_rg, 1: color_cm_by. NULL-initialized so that cleanup is safe
+  // after a partial allocation.
+  saliency_feature_map *color_cm[2] = { NULL, NULL };
+  int ok = 1;
+
+  for (int i = 0; i < 2; ++i) {
+    color_cm[i] = aom_malloc(sizeof(*color_cm[i]));
+    if (!color_cm[i]) {
+      ok = 0;
+      break;
+    }
+    color_cm[i]->buf =
+        (double *)aom_malloc(num_pixels * sizeof(*color_cm[i]->buf));
+    if (!color_cm[i]->buf) {
+      ok = 0;
+      break;
+    }
+
+    color_cm[i]->width = width[0];
+    color_cm[i]->height = height[0];
+    // normalized_map() accumulates into its output, so start from zero.
+    memset(color_cm[i]->buf, 0, num_pixels * sizeof(*color_cm[i]->buf));
+  }
+
+  if (ok && (normalized_map(rg_map, width, height, color_cm[0]) == 0 ||
+             normalized_map(by_map, width, height, color_cm[1]) == 0)) {
+    ok = 0;
+  }
+
+  if (ok) {
+    for (int r = 0; r < height[0]; ++r) {
+      for (int c = 0; c < width[0]; ++c) {
+        output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] +
+                                        color_cm[1]->buf[r * width[0] + c];
+      }
+    }
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    if (color_cm[i]) {
+      aom_free(color_cm[i]->buf);
+      aom_free(color_cm[i]);
+    }
+  }
+
+  if (!ok) {
+    return 0;
+  }
+
+  // Report a failing final normalization instead of ignoring it.
+  return nomalization_operator(output, 8);
+}
+
+// Builds the orientation conspicuity map. `orientation_map` holds 24 feature
+// maps laid out as 4 consecutive groups of 6 (one group per angle). Each group
+// is combined by normalized_map() into a frame-sized map, the four results are
+// added to `output`, and `output` is normalized.
+//
+// The per-angle maps are accumulated with "+=", so `output->buf` must be
+// initialized (normally zeroed) by the caller.
+//
+// Returns 1 on success and 0 on failure.
+static int normalized_map_orientation(saliency_feature_map *orientation_map[24],
+                                      int width[9], int height[9],
+                                      saliency_feature_map *output) {
+  const int num_angles = 4;
+  const int num_fms_per_angle = 6;
+  // Computed in size_t so that large frames cannot overflow int.
+  const size_t num_pixels = (size_t)width[0] * height[0];
+
+  // Scratch conspicuity map, reused for each angle.
+  saliency_feature_map *nofm = aom_malloc(sizeof(*nofm));
+  if (!nofm) {
+    return 0;
+  }
+  nofm->buf = (double *)aom_malloc(num_pixels * sizeof(*nofm->buf));
+  if (!nofm->buf) {
+    aom_free(nofm);
+    return 0;
+  }
+  nofm->height = height[0];
+  nofm->width = width[0];
+
+  int ok = 1;
+  for (int angle = 0; angle < num_angles; ++angle) {
+    // normalized_map() accumulates into its output, so start from zero.
+    memset(nofm->buf, 0, num_pixels * sizeof(*nofm->buf));
+
+    // The six maps of this angle are contiguous in orientation_map.
+    if (normalized_map(&orientation_map[angle * num_fms_per_angle], width,
+                       height, nofm) == 0) {
+      ok = 0;
+      break;
+    }
+
+    for (int r = 0; r < height[0]; ++r) {
+      for (int c = 0; c < width[0]; ++c) {
+        output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c];
+      }
+    }
+  }
+
+  aom_free(nofm->buf);
+  aom_free(nofm);
+
+  if (!ok) {
+    return 0;
+  }
+
+  // Report a failing final normalization instead of ignoring it.
+  return nomalization_operator(output, 8);
+}
+
+// Releases a feature map together with its sample buffer. Accepts NULL.
+static void sm_free_feature_map(saliency_feature_map *map) {
+  if (!map) return;
+  aom_free(map->buf);
+  aom_free(map);
+}
+
+// Releases the first `count` entries of `maps` and resets them to NULL, so the
+// array can safely be released again later.
+static void sm_free_feature_maps(saliency_feature_map **maps, int count) {
+  for (int i = 0; i < count; ++i) {
+    sm_free_feature_map(maps[i]);
+    maps[i] = NULL;
+  }
+}
+
+// Allocates a width x height feature map, optionally zero-filled. Returns NULL
+// (with nothing left allocated) if either allocation fails.
+static saliency_feature_map *sm_alloc_feature_map(int width, int height,
+                                                  int zero_fill) {
+  saliency_feature_map *map = (saliency_feature_map *)aom_malloc(sizeof(*map));
+  if (!map) return NULL;
+
+  // Computed in size_t so that large frames cannot overflow int.
+  const size_t buf_size = (size_t)width * height * sizeof(*map->buf);
+  map->buf = (double *)aom_malloc(buf_size);
+  if (!map->buf) {
+    aom_free(map);
+    return NULL;
+  }
+  if (zero_fill) memset(map->buf, 0, buf_size);
+
+  map->width = width;
+  map->height = height;
+  return map;
+}
+
+// Set pixel level saliency mask based on Itti-Koch algorithm
+//
+// Computes intensity, color and orientation feature maps from cpi->source,
+// merges each category into a frame-sized conspicuity map, averages the three
+// conspicuity maps with equal weights and stores the result, scaled by 255 and
+// converted to uint8_t, in cpi->saliency_map (one entry per frame pixel).
+//
+// Returns 1 on success and 0 on failure (allocation failure or a failing
+// processing step). Every temporary buffer is released exactly once on all
+// paths through the single cleanup section at the end.
+int av1_set_saliency_map(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  const int frm_width = cm->width;
+  const int frm_height = cm->height;
+  // Computed in size_t so that large frames cannot overflow int.
+  const size_t num_pixels = (size_t)frm_width * frm_height;
+
+  // Dimensions of the 9 pyramid levels; each level halves the previous one.
+  int pyr_height[9];
+  int pyr_width[9];
+
+  pyr_height[0] = frm_height;
+  pyr_width[0] = frm_width;
+
+  for (int i = 1; i < 9; ++i) {
+    pyr_width[i] = pyr_width[i - 1] / 2;
+    pyr_height[i] = pyr_height[i - 1] / 2;
+  }
+
+  // Every resource starts out NULL so that the cleanup section can run no
+  // matter where a failure occurs.
+  double *cr = NULL;
+  double *cg = NULL;
+  double *cb = NULL;
+  double *intensity = NULL;
+  saliency_feature_map *i_map[6] = { NULL };
+  saliency_feature_map *rg_map[6] = { NULL };
+  saliency_feature_map *by_map[6] = { NULL };
+  saliency_feature_map *orientation_map[24] = { NULL };
+  // 0: intensity, 1: color, 2: orientation
+  saliency_feature_map *normalized_maps[3] = { NULL };
+  saliency_feature_map *combined_saliency_map = NULL;
+  int ok = 0;
+
+  cr = aom_malloc(num_pixels * sizeof(*cr));
+  cg = aom_malloc(num_pixels * sizeof(*cg));
+  cb = aom_malloc(num_pixels * sizeof(*cb));
+  intensity = aom_malloc(num_pixels * sizeof(*intensity));
+  if (!cr || !cg || !cb || !intensity) goto cleanup;
+
+  // Extract red / green / blue channels and intensity component
+  get_color_intensity(cpi->source, cm->seq_params->subsampling_x,
+                      cm->seq_params->subsampling_y, cr, cg, cb, intensity);
+
+  // Feature Map Extraction
+  // intensity map
+  for (int i = 0; i < 6; ++i) {
+    const int level = (i / 2) + 2;
+    i_map[i] = sm_alloc_feature_map(pyr_width[level], pyr_height[level], 0);
+    if (!i_map[i]) goto cleanup;
+  }
+
+  if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) {
+    goto cleanup;
+  }
+
+  // RGB map
+  for (int i = 0; i < 6; ++i) {
+    const int level = (i / 2) + 2;
+    rg_map[i] = sm_alloc_feature_map(pyr_width[level], pyr_height[level], 0);
+    by_map[i] = sm_alloc_feature_map(pyr_width[level], pyr_height[level], 0);
+    if (!rg_map[i] || !by_map[i]) goto cleanup;
+  }
+
+  if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) ==
+      0) {
+    goto cleanup;
+  }
+
+  // Orientation map
+  for (int i = 0; i < 24; ++i) {
+    const int level = ((i % 6) / 2) + 2;
+    orientation_map[i] =
+        sm_alloc_feature_map(pyr_width[level], pyr_height[level], 0);
+    if (!orientation_map[i]) goto cleanup;
+  }
+
+  if (get_feature_map_orientation(intensity, pyr_width, pyr_height,
+                                  orientation_map) == 0) {
+    goto cleanup;
+  }
+
+  // The color planes are no longer needed; release them early to lower the
+  // peak memory use. Resetting the pointers keeps the cleanup section safe.
+  aom_free(cr);
+  aom_free(cg);
+  aom_free(cb);
+  aom_free(intensity);
+  cr = cg = cb = intensity = NULL;
+
+  // The conspicuity maps are accumulated into, so they must start from zero.
+  for (int i = 0; i < 3; ++i) {
+    normalized_maps[i] = sm_alloc_feature_map(frm_width, frm_height, 1);
+    if (!normalized_maps[i]) goto cleanup;
+  }
+
+  // Conspicuity map generation
+  if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 ||
+      normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height,
+                         normalized_maps[1]) == 0 ||
+      normalized_map_orientation(orientation_map, pyr_width, pyr_height,
+                                 normalized_maps[2]) == 0) {
+    goto cleanup;
+  }
+
+  // The feature maps have been folded into the conspicuity maps.
+  sm_free_feature_maps(i_map, 6);
+  sm_free_feature_maps(rg_map, 6);
+  sm_free_feature_maps(by_map, 6);
+  sm_free_feature_maps(orientation_map, 24);
+
+  // Pixel level saliency map
+  combined_saliency_map = sm_alloc_feature_map(frm_width, frm_height, 0);
+  if (!combined_saliency_map) goto cleanup;
+
+  // The three categories contribute equally.
+  const double w_intensity = (double)1 / 3;
+  const double w_color = (double)1 / 3;
+  const double w_orient = (double)1 / 3;
+
+  for (int r = 0; r < frm_height; ++r) {
+    for (int c = 0; c < frm_width; ++c) {
+      const int index = r * frm_width + c;
+      combined_saliency_map->buf[index] =
+          (w_intensity * normalized_maps[0]->buf[index] +
+           w_color * normalized_maps[1]->buf[index] +
+           w_orient * normalized_maps[2]->buf[index]);
+      cpi->saliency_map[index] =
+          (uint8_t)(combined_saliency_map->buf[index] * 255);
+    }
+  }
+
+  ok = 1;
+
+cleanup:
+  // Everything that was already released above is NULL by now.
+  aom_free(cr);
+  aom_free(cg);
+  aom_free(cb);
+  aom_free(intensity);
+  sm_free_feature_maps(i_map, 6);
+  sm_free_feature_maps(rg_map, 6);
+  sm_free_feature_maps(by_map, 6);
+  sm_free_feature_maps(orientation_map, 24);
+  sm_free_feature_maps(normalized_maps, 3);
+  sm_free_feature_map(combined_saliency_map);
+
+  return ok;
+}
+
+// Set superblock level saliency mask for rdmult scaling
+//
+// Averages the pixel-level values of cpi->saliency_map over each superblock,
+// min-max normalizes the averages and derives one scaling factor per
+// superblock, stored in cpi->sm_scaling_factor (row-major, one entry per
+// superblock). Each factor is finally multiplied by `motion_ratio`.
+//
+// Returns 1 on success and 0 if a temporary buffer cannot be allocated.
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) {
+  AV1_COMMON *cm = &cpi->common;
+
+  saliency_feature_map *sb_map = aom_malloc(sizeof(saliency_feature_map));
+  if (!sb_map) return 0;
+
+  const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+  const int sb_mi_w = mi_size_wide[bsize];
+  const int sb_mi_h = mi_size_high[bsize];
+  const int sb_w = block_size_wide[bsize];
+  const int sb_h = block_size_high[bsize];
+  const int sb_cols = (cm->mi_params.mi_cols + sb_mi_w - 1) / sb_mi_w;
+  const int sb_rows = (cm->mi_params.mi_rows + sb_mi_h - 1) / sb_mi_h;
+  const int frame_w = cm->width;
+  const int frame_h = cm->height;
+
+  sb_map->height = sb_rows;
+  sb_map->width = sb_cols;
+  sb_map->buf = (double *)aom_malloc(sb_rows * sb_cols * sizeof(*sb_map->buf));
+  if (!sb_map->buf) {
+    aom_free(sb_map);
+    return 0;
+  }
+
+  for (int sb_row = 0; sb_row < sb_rows; ++sb_row) {
+    const int y0 = sb_row * sb_h;
+    const int on_row_border = (sb_row < 1 || sb_row > sb_rows - 2);
+
+    for (int sb_col = 0; sb_col < sb_cols; ++sb_col) {
+      const int x0 = sb_col * sb_w;
+      double *const score = &sb_map->buf[sb_row * sb_cols + sb_col];
+      double total_pixel = 0;
+      double total_weight = 0;
+
+      // Only pixels that lie inside the frame contribute to the average.
+      for (int dy = 0; dy < sb_h; dy++) {
+        const int y = y0 + dy;
+        for (int dx = 0; dx < sb_w; dx++) {
+          const int x = x0 + dx;
+          if (y < frame_h && x < frame_w) {
+            total_pixel++;
+            total_weight += cpi->saliency_map[y * frame_w + x];
+          }
+        }
+      }
+
+      assert(total_pixel > 0);
+
+      // Calculate the superblock level saliency map from pixel level saliency
+      // map
+      *score = total_weight / total_pixel;
+
+      // Further lower the superblock saliency score for boundary superblocks.
+      if (on_row_border || sb_col < 1 || sb_col > sb_cols - 2) {
+        *score /= 5;
+      }
+    }
+  }
+
+  // superblock level saliency map finalization
+  minmax_normalize(sb_map);
+
+  // Calculate the average superblock sm_scaling_factor for a frame, to be used
+  // for clamping later.
+  double factor_mean = 0.0;
+  int block_count = 0;
+  for (int sb_row = 0, idx = 0; sb_row < sb_rows; ++sb_row) {
+    for (int sb_col = 0; sb_col < sb_cols; ++sb_col, ++idx) {
+      cpi->sm_scaling_factor[idx] = 1 - sb_map->buf[idx];
+      factor_mean += cpi->sm_scaling_factor[idx];
+      block_count++;
+    }
+  }
+  assert(block_count > 0);
+  factor_mean /= block_count;
+
+  // Calculate the geometric mean of superblock sm_scaling_factor for a frame,
+  // to be used for normalization. The geometric mean uses the factors before
+  // they are clamped from below to 0.8 times the arithmetic mean.
+  double log_sum = 0.0;
+  for (int sb_row = 0, idx = 0; sb_row < sb_rows; ++sb_row) {
+    for (int sb_col = 0; sb_col < sb_cols; ++sb_col, ++idx) {
+      log_sum += log(fmax(cpi->sm_scaling_factor[idx], 0.001));
+      cpi->sm_scaling_factor[idx] =
+          fmax(cpi->sm_scaling_factor[idx], 0.8 * factor_mean);
+    }
+  }
+  const double geo_mean = exp(log_sum / block_count);
+  assert(geo_mean > 0);
+
+  // Normalize the sm_scaling_factor by geometric mean.
+  for (int sb_row = 0, idx = 0; sb_row < sb_rows; ++sb_row) {
+    for (int sb_col = 0; sb_col < sb_cols; ++sb_col, ++idx) {
+      cpi->sm_scaling_factor[idx] /= geo_mean;
+
+      // Modulate the sm_scaling_factor by frame basis motion factor
+      cpi->sm_scaling_factor[idx] = cpi->sm_scaling_factor[idx] * motion_ratio;
+    }
+  }
+
+  aom_free(sb_map->buf);
+  aom_free(sb_map);
+  return 1;
+}
+
+// av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0,
+// because the computations need to access the first pass stats which are
+// only available when CONFIG_REALTIME_ONLY is equal to 0.
+#if !CONFIG_REALTIME_ONLY
+// Set motion_ratio that reflects the motion quantities between two consecutive
+// frames. Motion_ratio will be used to set up saliency_map based rdmult scaling
+// factor, i.e., the less the motion quantities are, the more bits will be spent
+// on this frame, and vice versa.
+//
+// The result is 1 unless the standard deviation of the per-frame intra error,
+// relative to the average intra error, exceeds 0.1. In that case it is the
+// current frame's coded error divided by the average coded error, clamped to
+// the range [0.8, 1.5].
+double av1_setup_motion_ratio(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const FIRSTPASS_STATS *const total =
+      &cpi->ppi->twopass.firstpass_info.total_stats;
+
+  const int stats_index =
+      cm->current_frame.display_order_hint - cpi->rc.frames_since_key;
+  const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek(
+      &cpi->ppi->twopass.firstpass_info, stats_index);
+  assert(cur_stats != NULL);
+  assert(total->count > 0);
+
+  const double inter_error = cur_stats->coded_error;
+
+  // The accumulated log errors are averaged before exponentiation.
+  const double avg_intra_error = exp(total->log_intra_error / total->count);
+  const double avg_inter_error = exp(total->log_coded_error / total->count);
+  const double avg_error = total->intra_error / total->count;
+
+  // Standard deviation of the intra error over all first pass stats.
+  double sq_dev_sum = 0;
+  for (int i = 0; i < total->count; i++) {
+    const FIRSTPASS_STATS *stats =
+        &cpi->ppi->twopass.firstpass_info.stats_buf[i];
+    sq_dev_sum +=
+        (stats->intra_error - avg_error) * (stats->intra_error - avg_error);
+  }
+  const double error_stdev = sqrt(sq_dev_sum / total->count);
+
+  // Written as a negated ">" so that a NaN ratio also takes this path.
+  if (!(error_stdev / fmax(avg_intra_error, 1) > 0.1)) {
+    return 1;
+  }
+
+  double motion_ratio = inter_error / fmax(1, avg_inter_error);
+  motion_ratio = AOMMIN(motion_ratio, 1.5);
+  motion_ratio = AOMMAX(motion_ratio, 0.8);
+  return motion_ratio;
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/av1/encoder/saliency_map.h b/third_party/aom/av1/encoder/saliency_map.h
new file mode 100644
index 0000000000..0d27f83633
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#define AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#include "av1/encoder/encoder.h"
+
+// A two-dimensional map of double-precision samples kept in one flat array.
+typedef struct saliency_feature_map {
+ double *buf; // map samples in a 1D array; sample (row, col) is buf[row * width + col]
+ int height; // number of rows
+ int width; // number of columns (also the row stride of buf)
+} saliency_feature_map;
+
+// Computes the pixel level saliency map of the current source frame and stores
+// it in cpi->saliency_map. Returns 1 on success, 0 on failure.
+int av1_set_saliency_map(AV1_COMP *cpi);
+#if !CONFIG_REALTIME_ONLY
+// Returns the motion ratio of the current frame computed from the first pass
+// stats: either 1 or a value clamped to the range [0.8, 1.5].
+double av1_setup_motion_ratio(AV1_COMP *cpi);
+#endif
+// Derives the superblock level scaling factors from cpi->saliency_map, scales
+// them by motion_ratio and stores them in cpi->sm_scaling_factor. Returns 1 on
+// success, 0 on allocation failure.
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio);
+
+#endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 0000000000..4b4e78779c
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+// Sets the enabled, update_map and update_data flags of `seg` to 1 and clears
+// its temporal_update flag.
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->temporal_update = 0;
+
+  seg->update_data = 1;
+  seg->update_map = 1;
+  seg->enabled = 1;
+}
+
+// Clears the enabled, update_map, update_data and temporal_update flags of
+// `seg`.
+void av1_disable_segmentation(struct segmentation *seg) {
+  seg->temporal_update = 0;
+  seg->update_data = 0;
+  seg->update_map = 0;
+  seg->enabled = 0;
+}
+
+// Clears the bit of `feature_id` in the feature mask of segment `segment_id`;
+// the other bits of the mask are left untouched.
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  const unsigned int feature_bit = 1u << feature_id;
+  seg->feature_mask[segment_id] &= ~feature_bit;
+}
+
+// Resets the data value stored for feature `feature_id` of segment
+// `segment_id` to 0. The feature mask is not modified.
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;
+  return;
+}
+
+// Puts the segmentation state of `cm` back to its default: the enabled,
+// update_map and update_data flags are cleared and av1_clearall_segfeatures()
+// is called on the segmentation struct. temporal_update is not modified here.
+void av1_reset_segment_features(AV1_COMMON *cm) {
+  struct segmentation *const seg = &cm->seg;
+
+  // Set up default state for MB feature flags
+  seg->update_data = 0;
+  seg->update_map = 0;
+  seg->enabled = 0;
+
+  av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..1ad13d66a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Sets enabled, update_map and update_data to 1 and temporal_update to 0.
+void av1_enable_segmentation(struct segmentation *seg);
+// Sets enabled, update_map, update_data and temporal_update to 0.
+void av1_disable_segmentation(struct segmentation *seg);
+
+// Clears the bit of feature_id in the feature mask of segment segment_id.
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+// Sets the data value of feature_id for segment segment_id to 0.
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+// NOTE(review): no definition of this function is visible in segmentation.c as
+// added by this change; confirm where (or whether) it is implemented.
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// Clears enabled, update_map and update_data of cm->seg and calls
+// av1_clearall_segfeatures() on it.
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/sorting_network.h b/third_party/aom/av1/encoder/sorting_network.h
new file mode 100644
index 0000000000..54f4c19dcd
--- /dev/null
+++ b/third_party/aom/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * A sorting network is a (potentially branch-less) way to quickly sort small
+ * arrays of known size. For more details, see
+ * (https://en.wikipedia.org/wiki/Sorting_network).
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
+// Compare-exchange step used by the sorting networks below. After SWAP(i, j)
+// the larger of the keys k[i] / k[j] is in k[i] and the smaller in k[j]; the
+// values v[i] / v[j] are permuted the same way. Equal keys are left in place.
+// The macro expects arrays named `k` (float keys) and `v` (integer values) in
+// the scope where it is expanded, and uses only conditional selects.
+#define SWAP(i, j) \
+  do { \
+    const int keep_order = (k[i] >= k[j]); \
+    const float key_hi = keep_order ? k[i] : k[j]; \
+    const float key_lo = keep_order ? k[j] : k[i]; \
+    const int val_hi = keep_order ? v[i] : v[j]; \
+    const int val_lo = keep_order ? v[j] : v[i]; \
+    k[i] = key_hi; \
+    k[j] = key_lo; \
+    v[i] = val_hi; \
+    v[j] = val_lo; \
+  } while (0)
+
+/*!\brief Sorts two size-16 arrays of keys and values in descending order of
+ * keys.
+ *
+ * The arrays are sorted in place by a fixed sequence of compare-exchange
+ * steps (SWAP). The sequence is order-critical: do not reorder or remove
+ * individual steps.
+ *
+ * \param[in,out] k A length-16 array of floats that serves as the keys.
+ * \param[in,out] v A length-16 array of int32 that serves as the
+ * values.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(8, 9);
+ SWAP(10, 11);
+ SWAP(12, 13);
+ SWAP(14, 15);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(8, 10);
+ SWAP(9, 11);
+ SWAP(12, 14);
+ SWAP(13, 15);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(9, 10);
+ SWAP(13, 14);
+ SWAP(8, 12);
+ SWAP(11, 15);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(9, 13);
+ SWAP(10, 14);
+ SWAP(0, 8);
+ SWAP(7, 15);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(9, 12);
+ SWAP(11, 14);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(10, 12);
+ SWAP(11, 13);
+ SWAP(1, 9);
+ SWAP(6, 14);
+ SWAP(3, 4);
+ SWAP(11, 12);
+ SWAP(1, 8);
+ SWAP(2, 10);
+ SWAP(5, 13);
+ SWAP(7, 14);
+ SWAP(3, 11);
+ SWAP(2, 8);
+ SWAP(4, 12);
+ SWAP(7, 13);
+ SWAP(3, 10);
+ SWAP(5, 12);
+ SWAP(3, 9);
+ SWAP(6, 12);
+ SWAP(3, 8);
+ SWAP(7, 12);
+ SWAP(5, 9);
+ SWAP(6, 10);
+ SWAP(4, 8);
+ SWAP(7, 11);
+ SWAP(5, 8);
+ SWAP(7, 10);
+ SWAP(6, 8);
+ SWAP(7, 9);
+ SWAP(7, 8);
+}
+
+/*!\brief Sorts two size-8 arrays of keys and values in descending order of
+ * keys.
+ *
+ * The arrays are sorted in place by a fixed sequence of compare-exchange
+ * steps (SWAP). The sequence is order-critical: do not reorder or remove
+ * individual steps.
+ *
+ * \param[in,out] k A length-8 array of floats that serves as the keys.
+ * \param[in,out] v A length-8 array of int32 that serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(3, 4);
+}
+#undef SWAP
+#endif // AOM_AV1_ENCODER_SORTING_NETWORK_H_
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.c b/third_party/aom/av1/encoder/sparse_linear_solver.c
new file mode 100644
index 0000000000..e47c78e148
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "config/aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/alloccommon.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+/*
+ * Initializes a sparse matrix from (row, column, value) triplets. The three
+ * input arrays are copied into newly allocated storage owned by sm, which is
+ * released with av1_free_sparse_mtx_elems().
+ *
+ * Input:
+ * rows: array of row positions
+ * cols: array of column positions
+ * values: array of element values
+ * num_elem: total number of elements in the matrix (must not be negative)
+ * num_rows: number of rows in the matrix
+ * num_cols: number of columns in the matrix
+ *
+ * Output:
+ * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0 - success
+ * -1 - failed (negative num_elem or allocation failure); sm is then
+ * left empty with NULL element arrays
+ */
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm) {
+  sm->n_elem = num_elem;
+  sm->n_rows = num_rows;
+  sm->n_cols = num_cols;
+  sm->row_pos = NULL;
+  sm->col_pos = NULL;
+  sm->value = NULL;
+
+  // An empty matrix owns no storage.
+  if (num_elem == 0) return 0;
+
+  // A negative count would be converted to a huge unsigned size in the
+  // "num_elem * sizeof(...)" expressions below, so reject it up front.
+  if (num_elem < 0) {
+    sm->n_elem = 0;
+    sm->n_rows = 0;
+    sm->n_cols = 0;
+    return -1;
+  }
+
+  sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(num_elem, sizeof(*sm->value));
+
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    // Do not leave pointers to freed memory behind in the caller's struct.
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return -1;
+  }
+
+  memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
+  memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
+  memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+  return 0;
+}
+
+/*
+ * Builds a new sparse matrix that holds the elements of two existing ones.
+ * Storage for the result is newly allocated; the elements of sm1 come first,
+ * followed by the elements of sm2, each shifted by its own offsets.
+ *
+ * Input:
+ * sm1, sm2: matrices to be combined
+ * row_offset1, row_offset2: row offset of each matrix in the new matrix
+ * col_offset1, col_offset2: column offset of each matrix in the new matrix
+ * new_n_rows, new_n_cols: number of rows and columns in the new matrix
+ *
+ * Output:
+ * sm: the combined matrix
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols) {
+  sm->n_elem = sm1->n_elem + sm2->n_elem;
+  sm->n_cols = new_n_cols;
+  sm->n_rows = new_n_rows;
+
+  // Nothing to copy: the result is an empty matrix without storage.
+  if (sm->n_elem == 0) {
+    sm->row_pos = NULL;
+    sm->col_pos = NULL;
+    sm->value = NULL;
+    return 0;
+  }
+
+  sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
+  sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
+  sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
+
+  if (sm->row_pos == NULL || sm->col_pos == NULL || sm->value == NULL) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
+  const int count1 = sm1->n_elem;
+  const int count2 = sm2->n_elem;
+
+  // First part: elements of sm1, shifted by (row_offset1, col_offset1).
+  for (int e = 0; e < count1; ++e) {
+    sm->row_pos[e] = sm1->row_pos[e] + row_offset1;
+    sm->col_pos[e] = sm1->col_pos[e] + col_offset1;
+  }
+  memcpy(sm->value, sm1->value, count1 * sizeof(*sm1->value));
+
+  // Second part: elements of sm2, shifted by (row_offset2, col_offset2).
+  for (int e = 0; e < count2; ++e) {
+    const int dst = count1 + e;
+    sm->row_pos[dst] = sm2->row_pos[e] + row_offset2;
+    sm->col_pos[dst] = sm2->col_pos[e] + col_offset2;
+  }
+  memcpy(sm->value + count1, sm2->value, count2 * sizeof(*sm2->value));
+
+  return 0;
+}
+
+/*
+ * Releases the element arrays owned by a sparse matrix and resets it to an
+ * empty matrix (zero elements, rows and columns, NULL element arrays).
+ *
+ * The arrays are only freed when n_elem is non-zero, which matches the init
+ * functions in this file: they allocate nothing for an empty matrix.
+ *
+ * Input/Output:
+ * sm: the sparse matrix whose storage is released
+ */
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
+  sm->n_cols = 0;
+  sm->n_rows = 0;
+  if (sm->n_elem != 0) {
+    aom_free(sm->row_pos);
+    aom_free(sm->col_pos);
+    aom_free(sm->value);
+  }
+  // Clear the pointers so the struct never refers to freed memory.
+  sm->row_pos = NULL;
+  sm->col_pos = NULL;
+  sm->value = NULL;
+  sm->n_elem = 0;
+}
+
+/*
+ * Matrix-vector product with the vector on the right: dstv = A * b
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b that A is multiplied with
+ * dstl: number of entries of dstv that are cleared before accumulating
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+
+  const int *const row = sm->row_pos;
+  const int *const col = sm->col_pos;
+  const double *const val = sm->value;
+  // Every stored element contributes to the output entry of its row.
+  for (int e = 0; e < sm->n_elem; ++e) {
+    dstv[row[e]] += srcv[col[e]] * val[e];
+  }
+}
+/*
+ * Matrix-vector product with the vector on the left: dstv = b * A
+ *
+ * Input:
+ * sm: matrix A
+ * srcv: the vector b that is multiplied with A
+ * dstl: number of entries of dstv that are cleared before accumulating
+ *
+ * Output:
+ * dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl) {
+  memset(dstv, 0, sizeof(*dstv) * dstl);
+
+  const int *const row = sm->row_pos;
+  const int *const col = sm->col_pos;
+  const double *const val = sm->value;
+  // Every stored element contributes to the output entry of its column.
+  for (int e = 0; e < sm->n_elem; ++e) {
+    dstv[col[e]] += srcv[row[e]] * val[e];
+  }
+}
+
+/*
+ * Inner (dot) product of two vectors of equal length.
+ *
+ * Input:
+ * src1, src2: the vectors to be multiplied
+ * src1l: length of the vectors
+ *
+ * Output:
+ * the inner product (0 when src1l is not positive)
+ */
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
+  double dot = 0;
+  const double *lhs = src1;
+  const double *rhs = src2;
+  // Accumulate from the first entry to the last.
+  for (int remaining = src1l; remaining > 0; --remaining) {
+    dot += *lhs * *rhs;
+    ++lhs;
+    ++rhs;
+  }
+  return dot;
+}
+
+/*
+ * Scales the matrix sm in place: every stored element value is multiplied by
+ * the constant c.
+ */
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
+  int e = 0;
+  while (e < sm->n_elem) {
+    sm->value[e] *= c;
+    ++e;
+  }
+}
+
+// Passes each of up to seven solver work buffers to aom_free(), in argument
+// order. Callers in this file pass NULL for the slots they do not use.
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+                                         double *buf3, double *buf4,
+                                         double *buf5, double *buf6,
+                                         double *buf7) {
+  double *const bufs[] = { buf1, buf2, buf3, buf4, buf5, buf6, buf7 };
+  const int num_bufs = (int)(sizeof(bufs) / sizeof(bufs[0]));
+  for (int n = 0; n < num_bufs; ++n) {
+    aom_free(bufs[n]);
+  }
+}
+
+/*
+ * Solve for Ax = b with the bi-conjugate gradient iteration
+ * no requirement on A
+ *
+ * The iteration starts from x = 0 and runs for at most MAX_CG_SP_ITER steps.
+ * It stops early when p_hat' * A * p < 1e-10, when sqrt(r_hat' * r) < 1e-2,
+ * or when the previous value of r_hat' * r is < 1e-10.
+ * NOTE(review): the first test also stops on negative values, and 0 is
+ * returned whether or not the residual test was ever met - confirm that both
+ * are intended.
+ * NOTE(review): x_hat is updated in every iteration but never read in this
+ * function.
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x (its previous contents are overwritten)
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed (a work buffer could not be allocated)
+ */
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x) {
+ double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+ *p_hatA = NULL, *x_hat = NULL;
+ double alpha, beta, rtr, r_norm_2;
+ double denormtemp;
+
+ // initialize
+ r = aom_calloc(bl, sizeof(*r));
+ r_hat = aom_calloc(bl, sizeof(*r_hat));
+ p = aom_calloc(bl, sizeof(*p));
+ p_hat = aom_calloc(bl, sizeof(*p_hat));
+ Ap = aom_calloc(bl, sizeof(*Ap));
+ p_hatA = aom_calloc(bl, sizeof(*p_hatA));
+ x_hat = aom_calloc(bl, sizeof(*x_hat));
+ if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return -1;
+ }
+
+ int i;
+ for (i = 0; i < bl; i++) {
+ r[i] = b[i]; /* with x = 0 the initial residual b - A*x equals b */
+ r_hat[i] = b[i];
+ p[i] = r[i];
+ p_hat[i] = r_hat[i];
+ x[i] = 0;
+ x_hat[i] = 0;
+ }
+ r_norm_2 = av1_vect_vect_multi(r_hat, bl, r); /* r_hat' * r */
+ for (int k = 0; k < MAX_CG_SP_ITER; k++) {
+ rtr = r_norm_2; /* keep the previous r_hat' * r for alpha and beta */
+ av1_mtx_vect_multi_right(A, p, Ap, bl); /* Ap = A * p */
+ av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl); /* p_hatA = p_hat * A */
+
+ denormtemp = av1_vect_vect_multi(p_hat, bl, Ap); /* p_hat' * A * p */
+ if (denormtemp < 1e-10) break; /* avoids dividing by a tiny or negative value */
+ alpha = rtr / denormtemp;
+ r_norm_2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] += alpha * p[i];
+ x_hat[i] += alpha * p_hat[i];
+ r[i] -= alpha * Ap[i];
+ r_hat[i] -= alpha * p_hatA[i];
+ r_norm_2 += r_hat[i] * r[i];
+ }
+ if (sqrt(r_norm_2) < 1e-2) {
+ break;
+ }
+ if (rtr < 1e-10) break; /* guards the division in beta below */
+ beta = r_norm_2 / rtr;
+ for (i = 0; i < bl; i++) {
+ p[i] = r[i] + beta * p[i];
+ p_hat[i] = r_hat[i] + beta * p_hat[i];
+ }
+ }
+ // free
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return 0;
+}
+
+/*
+ * Conjugate gradient solver for Ax = b, for a symmetric and positive
+ * definite A.
+ *
+ * The iteration starts from x = 0 and runs for at most MAX_CG_SP_ITER steps.
+ * It stops early when p' * A * p < 1e-10, when the squared residual norm
+ * drops below 1e-8 * bl, or when the previous squared residual norm is below
+ * 1e-10.
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x (its previous contents are overwritten)
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed (a work buffer could not be allocated)
+ */
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x) {
+  double *resid = aom_calloc(bl, sizeof(*resid));
+  double *dir = aom_calloc(bl, sizeof(*dir));
+  double *A_dir = aom_calloc(bl, sizeof(*A_dir));
+  if (resid == NULL || dir == NULL || A_dir == NULL) {
+    free_solver_local_buf(resid, dir, A_dir, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  // With x = 0 the residual is b, and it is also the first search direction.
+  for (int n = 0; n < bl; ++n) {
+    resid[n] = b[n];
+    dir[n] = resid[n];
+    x[n] = 0;
+  }
+
+  double resid_sq = av1_vect_vect_multi(resid, bl, resid);
+  for (int iter = 0; iter < MAX_CG_SP_ITER; ++iter) {
+    const double prev_resid_sq = resid_sq;
+
+    av1_mtx_vect_multi_right(A, dir, A_dir, bl);
+    const double curvature = av1_vect_vect_multi(dir, bl, A_dir);
+    if (curvature < 1e-10) break;
+
+    // Step along the search direction and update the residual to match.
+    const double step = prev_resid_sq / curvature;
+    resid_sq = 0;
+    for (int n = 0; n < bl; ++n) {
+      x[n] += step * dir[n];
+      resid[n] -= step * A_dir[n];
+      resid_sq += resid[n] * resid[n];
+    }
+    if (resid_sq < 1e-8 * bl || prev_resid_sq < 1e-10) break;
+
+    // New search direction from the new residual and the old direction.
+    const double ratio = resid_sq / prev_resid_sq;
+    for (int n = 0; n < bl; ++n) {
+      dir[n] = resid[n] + ratio * dir[n];
+    }
+  }
+
+  free_solver_local_buf(resid, dir, A_dir, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using Jacobi method
+ *
+ * The iteration starts from x = 0 and runs for at most MAX_CG_SP_ITER steps;
+ * it stops early once the squared change between two successive iterates is
+ * <= 1e-10 * bl. The most recent iterate is always the one returned.
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x (its previous contents are overwritten on success)
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed (allocation failure, or one of the first bl diagonal
+ * entries of A is zero or missing, so the update cannot be
+ * computed); x is left untouched in that case
+ */
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+  double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+         *tempx = NULL;
+  double resi2;
+
+  diags = aom_calloc(bl, sizeof(*diags));
+  Rx = aom_calloc(bl, sizeof(*Rx));
+  x_last = aom_calloc(bl, sizeof(*x_last));
+  x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+  if (!diags || !Rx || !x_last || !x_cur) {
+    free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  memset(x_last, 0, sizeof(*x_last) * bl);
+  // get the diagonals of A
+  // NOTE(review): if A stores the same diagonal position more than once, the
+  // last value wins here, whereas the matrix-vector products in this file
+  // accumulate duplicates - confirm that inputs never contain duplicates.
+  memset(diags, 0, sizeof(*diags) * bl);
+  for (int c = 0; c < A->n_elem; c++) {
+    if (A->row_pos[c] != A->col_pos[c]) continue;
+    diags[A->row_pos[c]] = A->value[c];
+  }
+  // Every update divides by the diagonal entry, so a zero (or missing)
+  // diagonal would only produce inf/NaN results. Report failure instead.
+  for (i = 0; i < bl; i++) {
+    if (diags[i] == 0) {
+      free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+      return -1;
+    }
+  }
+
+  // Buffer holding the most recent iterate. The x_last / x_cur pointers are
+  // swapped at the end of each non-converged iteration, so x_cur alone does
+  // not identify the newest values once the loop has ended.
+  const double *x_latest = x_last;
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // R = A - diag(diags)
+    // get R*x_last
+    memset(Rx, 0, sizeof(*Rx) * bl);
+    for (int c = 0; c < A->n_elem; c++) {
+      if (A->row_pos[c] == A->col_pos[c]) continue;
+      Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
+    }
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      x_cur[i] = (b[i] - Rx[i]) / diags[i];
+      resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
+    }
+    x_latest = x_cur;
+    if (resi2 <= 1e-10 * bl) break;
+    // swap last & cur buffer ptrs
+    tempx = x_last;
+    x_last = x_cur;
+    x_cur = tempx;
+  }
+  for (i = 0; i < bl; i++) {
+    x[i] = x_latest[i];
+  }
+  free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+  return 0;
+}
+
+/*
+ * Solve for Ax = b using Steepest descent method
+ *
+ * The iteration starts from x = 0 and runs for at most MAX_CG_SP_ITER steps.
+ * It stops early when the mean squared residual is <= 1e-8, when it improves
+ * by less than 1e-8 in one step, or when d' * A * d is exactly zero for the
+ * current residual d (no step length can be computed then, e.g. when b is
+ * all zeros).
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x (its previous contents are overwritten)
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed (a work buffer could not be allocated)
+ */
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x) {
+  double *d = NULL, *Ad = NULL, *Ax = NULL;
+  double resi2, resi2_last;
+
+  d = aom_calloc(bl, sizeof(*d));
+  Ax = aom_calloc(bl, sizeof(*Ax));
+  Ad = aom_calloc(bl, sizeof(*Ad));
+
+  if (!d || !Ax || !Ad) {
+    free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
+  int i;
+  // initialize with 0s; the residual d = b - A*x is then simply b
+  resi2 = 0;
+  for (i = 0; i < bl; i++) {
+    x[i] = 0;
+    d[i] = b[i];
+    resi2 += d[i] * d[i] / bl;
+  }
+  int k;
+  for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // get A*d
+    av1_mtx_vect_multi_right(A, d, Ad, bl);
+    const double dAd = av1_vect_vect_multi(d, bl, Ad);
+    // A zero denominator would put inf/NaN into x (0/0 when the residual is
+    // already zero), so keep the current x and stop.
+    if (dAd == 0) break;
+    // resi2 * bl is d' * d, which makes this the step length d'd / d'Ad
+    const double step = resi2 * bl / dAd;
+    for (i = 0; i < bl; i++) {
+      x[i] += step * d[i];
+    }
+    // recompute the residual from scratch: d = b - A*x
+    av1_mtx_vect_multi_right(A, x, Ax, bl);
+    resi2_last = resi2;
+    resi2 = 0;
+    for (i = 0; i < bl; i++) {
+      d[i] = b[i] - Ax[i];
+      resi2 += d[i] * d[i] / bl;
+    }
+    if (resi2 <= 1e-8) break;
+    if (resi2_last - resi2 < 1e-8) {
+      break;
+    }
+  }
+  free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+  return 0;
+}
+
+#endif // CONFIG_OPTICAL_FLOW_API
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.h b/third_party/aom/av1/encoder/sparse_linear_solver.h
new file mode 100644
index 0000000000..f30fc0f5b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Maximum number of iterations used by the iterative linear solvers.
+#define MAX_CG_SP_ITER 100
+
+typedef struct {
+ int n_elem; // number of non-zero elements
+ int n_rows; // number of rows in the matrix
+ int n_cols; // number of columns in the matrix
+ // using arrays to represent non-zero elements: entry i of the three arrays
+ // below describes one element (its column, its row and its value).
+ int *col_pos;
+ int *row_pos; // starts with 0
+ double *value;
+} SPARSE_MTX;
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+ int num_elem, int num_rows, int num_cols,
+ SPARSE_MTX *sm); // copies num_elem (row, col, value) triplets into sm; 0 on success, -1 on failure
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+ SPARSE_MTX *sm, int row_offset1,
+ int col_offset1, int row_offset2,
+ int col_offset2, int new_n_rows,
+ int new_n_cols); // sm = elements of sm1 and sm2 shifted by their offsets; 0 on success, -1 on failure
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm); // frees the element arrays and resets the counts to 0
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl); // dstv = A * srcv
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl); // dstv = srcv * A
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2); // inner product of src1 and src2
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c); // multiplies every element value by c
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x); // solves Ax = b for symmetric positive definite A; 0 on success, -1 on failure
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x); // solves Ax = b, no requirement on A; 0 on success, -1 on failure
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x); // solves Ax = b with the Jacobi method; 0 on success, -1 on failure
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x); // solves Ax = b with steepest descent; 0 on success, -1 on failure
+
+#endif // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..a6c0971096
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,2715 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { // MAX_MESH_SPEED + 1 rows, presumably one per mesh speed 0..MAX_MESH_SPEED - TODO confirm
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { // same layout as good_quality_mesh_patterns; only the first two entries of each row are non-zero
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+
+// Threshold values to be used for pruning the txfm_domain_distortion
+// based on block MSE
+// Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc). Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation. Index 1 and 2 are applicable when
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON
+// TODO(any): Experiment the threshold logic based on variance metric
+static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = { // 4 rows; presumably selected by rd_sf.tx_domain_dist_thres_level - TODO confirm
+ { UINT_MAX, UINT_MAX, UINT_MAX },
+ { 22026, 22026, 22026 },
+ { 1377, 1377, 1377 },
+ { 0, 0, 0 }
+};
+
+// Number of different levels of aggressiveness in using transform domain
+// distortion during the R-D evaluation based on the speed feature
+// tx_domain_dist_level.
+#define TX_DOMAIN_DIST_LEVELS 4
+
+// Transform domain distortion type to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Index 1 and 2 are applicable when
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON
+static unsigned int
+ tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = {
+ { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 } // one { default, mode, winner } triple per level 0..3
+ };
+
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE / qstep^2.
+// TODO(any): Experiment the threshold logic based on variance metric.
+// Each entry holds two threshold values, index 0: dist, index 1: satd.
+// For each row, the indices are as follows.
+// Index 0: Default mode evaluation, Winner mode processing is not applicable
+// (Eg : IntraBc)
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
+// feature is ON
+// There are 9 levels (rows) with increasing speed, mapping to vertical indices.
+static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = {
+ { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } }
+};
+
+// Transform size search method to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Index 1 and 2 are applicable when
+// enable_winner_mode_for_tx_size_srch speed feature is ON
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = { // 4 rows of { default, mode, winner } search methods
+ { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
+};
+
+// Predict transform skip levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of skip flag prediction.
+// 0 : no early skip prediction
+// 1 : conservative early skip prediction using DCT_DCT
+// 2 : early skip prediction based on SSE
+static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, // 3 rows of { default, mode, winner } levels
+ { 1, 1, 1 },
+ { 1, 2, 1 } };
+
+// Predict skip or DC block level used during transform type search. It is
+// indexed using the following:
+// First index : Speed feature 'dc_blk_pred_level' (0 to 3)
+// Second index : Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and
+// WINNER_MODE_EVAL).
+//
+// The values of predict_dc_levels[][] indicate the aggressiveness of predicting
+// a block as transform skip or DC only.
+// Type 0 : No skip block or DC only block prediction
+// Type 1 : Prediction of skip block based on residual mean and variance
+// Type 2 : Prediction of skip block or DC only block based on residual mean and
+// variance
+static unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = {
+ { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 } // rows for dc_blk_pred_level 0, 1, 2, 3
+};
+
+#if !CONFIG_FPMT_TEST
+// This table holds the maximum number of reference frames for global motion.
+// The table is indexed as per the speed feature 'gm_search_type'.
+// 0 : All reference frames are allowed.
+// 1 : All reference frames except L2 and L3 are allowed.
+// 2 : All reference frames except L2, L3 and ARF2 are allowed.
+// 3 : No reference frame is allowed.
+static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = {
+ INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0 // entries for gm_search_type 0, 1, 2, 3
+};
+#endif
+
+// Qindex threshold levels used for selecting full-pel motion search.
+// ms_qindex_thresh[i][j][k] is the qindex boundary value for the 'k'th qindex
+// band, for resolution index 'j' and aggressiveness level 'i'.
+// Aggressiveness increases from i = 0 to 2.
+// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution.
+// Currently invoked only for speed 0, 1 and 2.
+static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } },
+ { { 170, 50 }, { MAXQ, 200 } },
+ { { 170, 40 }, { 200, 40 } } };
+
+// Full-pel search methods for aggressive search based on qindex.
+// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger
+// resolutions. Currently invoked only for speed 1 and 2.
+static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND };
+
+// Reports whether the current frame is one of the boosted frame kinds, as
+// decided by frame_is_kf_gf_arf(): intra only frames, golden frames (except
+// alt ref overlays) and alt ref frames tend to be coded at a higher than
+// ambient quality.
+static int frame_is_boosted(const AV1_COMP *cpi) {
+  const int is_boosted = frame_is_kf_gf_arf(cpi);
+  return is_boosted;
+}
+
+// Writes the same transform rd gate level into the slot of every transform
+// search case. The level must not exceed MAX_TX_RD_GATE_LEVEL.
+static AOM_INLINE void set_txfm_rd_gate_level(
+    int txfm_rd_gate_level[TX_SEARCH_CASES], int level) {
+  assert(level <= MAX_TX_RD_GATE_LEVEL);
+  int *const end = txfm_rd_gate_level + TX_SEARCH_CASES;
+  for (int *slot = txfm_rd_gate_level; slot != end; ++slot) {
+    *slot = level;
+  }
+}
+
+static void set_allintra_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { // writes the resolution-dependent speed features into sf; the speed tiers below are cumulative and run in ascending order, so a later tier overwrites values set by an earlier one
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; // all resolution classes compare the smaller frame dimension
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+
+ if (is_480p_or_larger) { // settings applied at every speed, including speed 0
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+ // current block's vertical texture instead of hardcoded with resolution
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) { // same as the speed 0 values except entry [3] (300 instead of 500)
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; // same value as the 480p branch above
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+
+ if (speed >= 4) {
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ }
+ }
+
+ if (speed >= 6) { // there is no speed 5 tier in this function
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }
+
+ if (is_1080p_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ }
+
+ if (speed >= 7) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ }
+
+ if (speed >= 8) {
+ if (!is_480p_or_larger) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ if (is_720p_or_larger) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ }
+
+ if (speed >= 9) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ if (!is_4k_or_larger) {
+ // In av1_select_sb_size(), superblock size is set to 64x64 only for
+ // resolutions less than 4k in speed>=9, to improve the multithread
+ // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF
+ // for resolutions >= 4k, the SB size setting can be modified for these
+ // resolutions as well.
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ }
+ }
+}
+
+static void set_allintra_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+ if (speed >= 1) {
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ // TODO(Venkat): Clean-up frame type dependency for
+ // simple_motion_search_split in partition search function and set the
+ // speed feature accordingly
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+ sf->part_sf.reuse_best_prediction_for_part_ab = 1;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 3;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = 2;
+ sf->rd_sf.tx_domain_dist_level = 1;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+ }
+
+ if (speed >= 2) {
+ sf->mv_sf.auto_mv_step_size = 1;
+
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.prune_filter_intra_level = 1;
+
+ sf->rd_sf.perform_coeff_opt = 3;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1;
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.search_method = DIAMOND;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+ // should run this with a bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true;
+
+ // TODO(any): evaluate if these lpf features can be moved to speed 2.
+ // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
+ // loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.disable_loop_restoration_chroma = 0;
+ sf->lpf_sf.reduce_wiener_window_size = 1;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.ml_predict_breakout_level = 3;
+
+ sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+
+ sf->rd_sf.perform_coeff_opt = 5;
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+
+ sf->mv_sf.reduce_search_range = 1;
+
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ }
+
+ if (speed >= 5) {
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ sf->lpf_sf.use_coarse_filter_level_search = 0;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST;
+ }
+
+ if (speed >= 6) {
+ sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1;
+ sf->intra_sf.prune_filter_intra_level = 2;
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.cfl_search_range = 1;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+ sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1;
+ sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1;
+
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.prune_rect_part_using_4x4_var_deviation = true;
+ sf->part_sf.prune_rect_part_using_none_pred_mode = true;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 0 : 1;
+ sf->part_sf.prune_part4_search = 3;
+ // TODO(jingning): This might not be a good trade off if the
+ // target image quality is very low.
+ sf->part_sf.default_max_partition_size = BLOCK_32X32;
+
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
+ sf->tx_sf.prune_intra_tx_depths_using_nn = true;
+
+ sf->rd_sf.perform_coeff_opt = 6;
+ sf->rd_sf.tx_domain_dist_level = 3;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = 1;
+ sf->winner_mode_sf.dc_blk_pred_level = 1;
+ }
+ // The following should make all-intra mode speed 7 approximately equal
+ // to real-time speed 6,
+ // all-intra speed 8 close to real-time speed 7, and all-intra speed 9
+ // close to real-time speed 8
+ if (speed >= 7) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+
+ if (speed >= 9) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.hybrid_intra_pickmode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
+ sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
+ sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true;
+ sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
+ }
+
+ // As the speed feature prune_chroma_modes_using_luma_winner already
+ // constrains the number of chroma directional mode evaluations to a maximum
+ // of 1, the HOG computation and the associated pruning logic does not seem to
+ // help speed-up the chroma mode evaluations. Hence disable the speed feature
+ // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is
+ // enabled.
+ if (sf->intra_sf.prune_chroma_modes_using_luma_winner)
+ sf->intra_sf.chroma_intra_pruning_with_hog = 0;
+}
+// Sets the resolution-dependent speed features in *sf; the speed >= N blocks run in order, so later ones override earlier ones.
+static void set_good_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;  // overlaps is_480p_or_lesser at exactly 480
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+ // Speed features applicable for temporal filtering and tpl modules may be
+ // changed based on frame type at places where the sf is applied (Example :
+ // use_downsampled_sad). This is because temporal filtering and tpl modules
+ // are called before this function (except for the first key frame).
+ // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal
+ // filtering and tpl modules, modify the sf initialization appropriately
+ // before calling the modules.
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted ||
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_lf_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ // Baseline for every speed level; resolution classes use the smaller frame dimension.
+ if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+ // current block's vertical texture instead of hardcoded with resolution
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (!is_720p_or_larger) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int rate_tolerance =
+ AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+ sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2);
+ }
+ // Each speed >= N block below also runs for every higher speed level.
+ if (speed >= 1) {
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1;
+ // Replaces the baseline square-partition-only threshold chosen above.
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+ // Same as the baseline thresholds except the BLOCK_64X64 entry (300 instead of 500).
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;  // all resolutions; replaces the baseline 1 set below 720p
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;  // same value as the else-if branch above
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_obmc_prob_thresh = 16;
+ } else {
+ sf->inter_sf.prune_obmc_prob_thresh = 8;
+ }
+
+ if (is_480p_or_larger) {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
+ } else {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+ // limit_inter_mode_cands is non-zero only when the current update type is LF_UPDATE.
+ if (is_720p_or_larger) {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0;
+ } else {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->mv_sf.disable_second_mv = 1;
+ sf->mv_sf.auto_mv_step_size = 2;
+ } else {
+ sf->mv_sf.disable_second_mv = boosted ? 0 : 2;
+ sf->mv_sf.auto_mv_step_size = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->hl_sf.recode_tolerance = 50;  // replaces the rate-tolerance-based baseline value
+ sf->inter_sf.disable_interinter_wedge_newmv_search =
+ is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;  // now for all resolutions (speed 2 set it only below 720p)
+ sf->inter_sf.skip_newmv_in_drl = 2;
+ sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0;
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+ // Set back to 0 for all resolutions, although speed 1 raised it to 2.
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;  // high bit depth only, any resolution
+
+ if (is_480p_or_larger) {
+ sf->part_sf.early_term_after_none_split = 1;
+ } else {
+ sf->part_sf.early_term_after_none_split = 0;
+ }
+ if (is_720p_or_larger) {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2;
+ } else {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1;
+ } else {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2;
+ }
+ // Unconditional: replaces the resolution-specific value chosen at speed 2.
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (speed >= 4) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->part_sf.early_term_after_none_split = 1;  // now also below 480p (speed 3 left it 0 there)
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ } else {
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+ }
+ // The next three assignments replace the per-resolution choices made at speeds 2 and 3.
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3;
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 0 : 1;
+ }  // below 480p the value is not modified here
+
+ if (is_720p_or_larger)
+ sf->hl_sf.recode_tolerance = 32;
+ else
+ sf->hl_sf.recode_tolerance = 55;
+
+ sf->intra_sf.skip_intra_in_interframe = 4;  // replaces the boosted/resolution-dependent value from speed 3
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+ }
+
+ if (speed >= 5) {
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 16;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ }  // below 480p the value is not modified here
+ if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40;  // replaces the 32 set at speed 4
+
+ sf->inter_sf.skip_newmv_in_drl = 4;  // all resolutions
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.use_downsampled_sad = 1;
+ }
+
+ if (!is_480p_or_larger) {  // strictly below 480 (excludes exactly 480)
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_480p_or_lesser) {  // includes exactly 480
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1;
+ } else {
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
+ }
+
+ if (is_720p_or_larger)
+ sf->part_sf.ext_part_eval_based_on_cur_best =
+ (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1;
+
+ if (is_480p_or_larger) {
+ sf->tpl_sf.reduce_num_frames = 1;
+ }
+ }
+
+ if (speed >= 6) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.prune_comp_ref_frames = 2;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }  // below 480p the baseline already selected DIRECT_PRED
+
+ if (is_480p_or_larger) {
+ sf->hl_sf.allow_sub_blk_me_in_tf = 1;
+ }
+
+ if (is_1080p_or_larger) {  // the baseline applied this only from 4k (>= 2160) up
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_masked_comp = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 28);
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_ref_mv_idx_search = 2;
+ } else {
+ sf->inter_sf.prune_ref_mv_idx_search = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ is_boosted_arf2_bwd_type ? 450 : 150;
+ }
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ // Unconditional: replaces the per-resolution recode tolerances from speeds 4 and 5.
+ sf->hl_sf.recode_tolerance = 55;
+ }
+}
+// Sets the speed features in *sf that do not read the frame dimensions; the speed >= N blocks run in order, so later ones override earlier ones.
+static void set_good_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_inter_frame =
+ gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+ if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) {  // not assigned here when large-scale tile is enabled
+ sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
+ }
+
+ // Speed 0 for all speed features that give neutral coding performance change.
+ sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2
+ : GM_SEARCH_CLOSEST_REFS_ONLY;
+ sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
+ sf->gm_sf.disable_gm_search_based_on_stats = 1;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0;
+
+ // TODO(debargha): Test, tweak and turn on either 1 or 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 1;
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 1;
+ sf->inter_sf.prune_mode_search_simple_translation = 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (boosted || (allow_screen_content_tools))
+ ? 0
+ : (is_boosted_arf2_bwd_type ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+ sf->inter_sf.selective_ref_frame = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->tpl_sf.search_method = NSTEP_8PT;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+ // Graphics/animation or screen content gets the smaller threshold (1 << 20 instead of 1 << 25).
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+ // Each speed >= N block below also runs for every higher speed level.
+ if (speed >= 1) {
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 1;
+
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ // TODO(Venkat): Clean-up frame type dependency for
+ // simple_motion_search_split in partition search function and set the
+ // speed feature accordingly
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;  // doubles the threshold chosen at speed 0
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
+ sf->mv_sf.disable_extensive_joint_motion_search = 1;
+
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 1;
+ sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
+ ? 0
+ : (boosted ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->inter_sf.skip_arf_compound = 1;
+
+ sf->interp_sf.use_interp_filter = 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;  // speed 0 set this to 1
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+
+ // TODO(any, yunqing): move this feature to speed 0.
+ sf->tpl_sf.skip_alike_starting_mv = 1;
+ }
+
+ if (speed >= 2) {
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->fp_sf.skip_motion_search_threshold = 25;
+
+ sf->gm_sf.num_refinement_steps = 2;
+ // Evaluates to 0 when frame_is_intra_only() returns non-zero, 1 otherwise.
+ sf->part_sf.reuse_best_prediction_for_part_ab =
+ !frame_is_intra_only(&cpi->common);
+
+ sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.reduce_search_range = 1;
+
+ // TODO(chiyotsai@google.com): We can get 10% speed up if we move
+ // adaptive_rd_thresh to speed 1. But currently it performs poorly on some
+ // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
+ // bit more closely to figure out why.
+ sf->inter_sf.adaptive_rd_thresh = 1;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.fast_interintra_wedge_search = 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 1;
+ sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.selective_ref_frame = 3;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.enable_fast_compound_mode_search = 1;
+ sf->inter_sf.reuse_mask_search_results = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1);  // NOTE(review): presumably fills every array entry with this level - confirm in the helper
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1;
+ sf->inter_sf.alt_ref_search_fp = 1;
+
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+ // Non-zero unless the frame is intra-only and rc.frames_to_key <= 1.
+ sf->intra_sf.disable_smooth_intra =
+ !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1);
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tpl_sf.prune_starting_mv = 1;
+ sf->tpl_sf.search_method = DIAMOND;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+ sf->rd_sf.use_mb_rd_hash = 1;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
+ sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1;
+
+ // TODO(any): Re-evaluate this feature set to 1 in speed 2.
+ sf->tpl_sf.allow_compound_pred = 0;
+ sf->tpl_sf.prune_ref_frames_in_tpl = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;  // unconditional, unlike the large-scale-tile-guarded assignment at the top
+
+ sf->gm_sf.prune_ref_frame_for_gm_search = 1;  // now unconditional (speed 0: boosted ? 0 : 1)
+ sf->gm_sf.prune_zero_mv_with_sse = 1;
+ sf->gm_sf.num_refinement_steps = 0;  // speed 2 used 2
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools
+ ? SIMPLE_AGG_LVL0
+ : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1);
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+ sf->part_sf.simple_motion_search_rect_split = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->mv_sf.search_method = DIAMOND;
+ sf->mv_sf.disable_second_mv = 2;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
+ sf->mv_sf.use_intrabc = 0;
+
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_onesided_comp = 1;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
+ // and clean-up the speed feature
+ sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
+ sf->inter_sf.selective_ref_frame = 5;
+ sf->inter_sf.reuse_compound_type_decision = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level,
+ boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2));
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2;
+
+ sf->interp_sf.adaptive_interp_filter_search = 2;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+ // should run this with a bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.skip_alike_starting_mv = 2;
+ sf->tpl_sf.prune_intra_modes = 1;
+ sf->tpl_sf.reduce_first_step_size = 6;
+ sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+ sf->tpl_sf.gop_length_decision_method = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+
+ // TODO(any): Refactor the code related to following winner mode speed
+ // features
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.motion_mode_for_winner_cand =
+ boosted ? 0
+ : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1
+ : 2;  // boosted: 0; INTNL_ARF_UPDATE: 1; otherwise 2
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4;
+
+ // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
+ // loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ sf->lpf_sf.use_coarse_filter_level_search =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+ sf->lpf_sf.use_downsampled_wiener_stats = 1;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->gm_sf.prune_zero_mv_with_sse = 2;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.ml_predict_breakout_level = 3;
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+ : 1;
+
+ sf->inter_sf.alt_ref_search_fp = 2;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3;  // from here the gate levels are written per entry, not via set_txfm_rd_gate_level()
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 2;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;  // speed 2 used 100
+
+ sf->interp_sf.cb_pred_filter_search = 1;
+ sf->interp_sf.skip_sharp_interp_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 2;
+
+ sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4.
+ // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_sf.skip_intra_in_interframe = 4;  // speed 2 used is_inter_frame ? 2 : 1
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+ sf->tpl_sf.use_sad_for_mode_decision = 1;
+
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT
+ : MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ }
+
+ if (speed >= 5) {
+ sf->hl_sf.weight_calc_level_in_tf = 1;
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 2;
+
+ sf->fp_sf.reduce_mv_step_param = 4;
+
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND;
+
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5;  // TX_SEARCH_MOTION_MODE keeps its speed-4 value
+ sf->inter_sf.enable_fast_compound_mode_search = 2;
+
+ sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
+ : MULTI_WINNER_MODE_OFF;
+
+ // Disable Self-guided Loop restoration filter.
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_coeff_refine_search = true;
+
+ sf->tpl_sf.prune_starting_mv = 3;
+ sf->tpl_sf.use_y_only_rate_distortion = 1;
+ sf->tpl_sf.subpel_force_stop = FULL_PEL;
+ sf->tpl_sf.gop_length_decision_method = 2;
+ sf->tpl_sf.use_sad_for_mode_decision = 2;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 2;  // now also for boosted frames (speed 4: boosted ? 0 : 2)
+
+ sf->fp_sf.disable_recon = 1;
+ }
+
+ if (speed >= 6) {
+ sf->hl_sf.disable_extra_sc_testing = 1;
+ sf->hl_sf.second_alt_ref_filtering = 0;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+ sf->inter_sf.selective_ref_frame = 6;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 3;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+ // Precedence note: the expression below parses as (boosted || allow_screen_content_tools) ? 0 : 2.
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ boosted || allow_screen_content_tools ? 0 : 2;
+
+ sf->part_sf.prune_part4_search = 3;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tpl_sf.gop_length_decision_method = 3;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;  // now OFF for intra-only frames as well
+
+ sf->fp_sf.skip_zeromv_motion_search = 1;
+ }
+}
+// Sets the real-time speed features in |sf| that depend on the frame size.
+static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
+ SPEED_FEATURES *const sf,
+ int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+ // The resolution classes above compare the smaller frame dimension.
+ if (!is_360p_or_larger) {
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ if (speed >= 6)
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2;
+ if (speed >= 7) {
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.use_rtc_tf = 2;
+ }
+ if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1;
+ if (speed >= 8) {
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.tx_size_level_based_on_qstep = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->rt_sf.skip_intra_pred = 1;
+ // Only turn on enable_ref_short_signaling for low resolution when only
+ // LAST and GOLDEN ref frames are used.
+ sf->rt_sf.enable_ref_short_signaling =
+ (!sf->rt_sf.use_nonrd_altref_frame &&
+ (!sf->rt_sf.use_comp_ref_nonrd ||
+ (!sf->rt_sf.ref_frame_comp_nonrd[1] &&
+ !sf->rt_sf.ref_frame_comp_nonrd[2])));
+
+// TODO(kyslov) Re-enable when AV1 models are trained
+#if 0
+#if CONFIG_RT_ML_PARTITIONING
+ if (!frame_is_intra_only(cm)) {
+ sf->part_sf.partition_search_type = ML_BASED_PARTITION;
+ sf->rt_sf.reuse_inter_pred_nonrd = 0;
+ }
+#endif
+#endif
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ }
+ if (speed >= 10) {
+ // TODO(yunqingwang@google.com): To be conservative, disable
+ // sf->rt_sf.estimate_motion_for_var_based_partition = 3 for speed 10/qvga
+ // for now. May enable it in the future.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ }
+ } else {  // Smaller frame dimension is at least 360.
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 2;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+ if (speed <= 5) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ boosted ? INT_MAX : 350;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ }
+ if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2;
+ if (speed == 7) {
+ sf->rt_sf.prefer_large_partition_blocks = 1;
+ // Enable this feature for [360p, 720p] resolution range initially.
+ // Only enable for low bitdepth to mitigate issue: b/303023614.
+ if (!cpi->rc.rtc_external_ratectrl &&
+ AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth)
+ sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ;
+ }
+ if (speed >= 7) {
+ sf->rt_sf.use_rtc_tf = 1;
+ }
+ if (speed == 8 && !cpi->ppi->use_svc) {
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+ sf->rt_sf.use_nonrd_altref_frame = 1;
+ }
+ if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2;
+ if (speed >= 9) {
+ sf->rt_sf.gf_length_lvl = 1;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.sad_based_adp_altref_lag = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->interp_sf.cb_pred_filter_search = 1;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.hybrid_intra_pickmode = 2;
+ sf->rt_sf.sad_based_adp_altref_lag = 4;
+ sf->rt_sf.tx_size_level_based_on_qstep = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ sf->interp_sf.cb_pred_filter_search = 2;
+ }
+ }
+ if (!is_480p_or_larger) {
+ if (speed == 7) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ }
+ if (!is_720p_or_larger) {
+ if (speed >= 9) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ } else {  // Smaller frame dimension is at least 720.
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0;
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sad_based_adp_altref_lag = 1;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sad_based_adp_altref_lag = 3;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ }
+ }
+ // TODO(Any): Check/Tune settings of other sfs for 1080p.
+ if (is_1080p_or_larger) {
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = 0;
+ }
+ if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0;
+ } else {  // Smaller frame dimension is below 1080.
+ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ // TODO(marpan): Tune settings for speed 11 video mode,
+ // for resolutions below 720p.
+ if (speed >= 11 && !is_720p_or_larger &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.force_only_last_ref = 1;
+ sf->rt_sf.selective_cdf_update = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ if (is_360p_or_larger) {
+ sf->part_sf.fixed_partition_size = BLOCK_32X32;
+ sf->rt_sf.use_fast_fixed_part = 1;
+ }
+ sf->rt_sf.increase_source_sad_thresh = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ }
+ }
+ // Setting for SVC, or when the ref_frame_config control is
+ // used to set the reference structure.
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ // For SVC: for 2 or more temporal layers, use better mv search on
+ // base temporal layers, and only on base spatial layer if highest
+ // resolution is above 640x360.
+ if (cpi->svc.number_temporal_layers >= 2 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ (cpi->svc.spatial_layer_id == 0 ||
+ cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+ 640 * 360)) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->rt_sf.fullpel_search_step_param = 10;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ if (cm->width * cm->height <= 352 * 288)
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.force_large_partition_blocks_intra = 0;
+ }
+ if (speed >= 8) {
+ if (cpi->svc.number_temporal_layers > 2)
+ sf->rt_sf.disable_cdf_update_non_reference_frame = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ if (rtc_ref->non_reference_frame) {
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ }
+ if (speed <= 9 && cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false;
+ else
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.frame_level_mode_cost_update = false;
+
+ // Compound mode enabling.
+ if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] ||
+ rtc_ref->ref_frame_comp[2]) {
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[0] =
+ rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[1] =
+ rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[2] =
+ rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1];
+ } else {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ }
+
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1)
+ sf->hl_sf.accurate_bit_estimate = 0;
+
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+
+ // For single layers RPS: bias/adjustment for recovery frame.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+ sf->rt_sf.fullpel_search_step_param = 8;
+ sf->rt_sf.nonrd_aggressive_skip = 0;
+ }
+ }
+ // Screen settings (applied when content is AOM_CONTENT_SCREEN).
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // TODO(marpan): Check settings for speed 7 and 8.
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.increase_color_thresh_palette = 1;
+ if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1;
+ }
+ if (speed >= 8) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = false;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.prune_idtx_nonrd = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.skip_lf_screen = 1;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ }
+ if (speed >= 10) {
+ if (cm->width * cm->height > 1920 * 1080)
+ sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
+ sf->rt_sf.part_early_exit_zeromv = 1;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ if (speed >= 11) {
+ sf->rt_sf.skip_lf_screen = 2;
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.prune_palette_nonrd = 1;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ sf->rt_sf.increase_color_thresh_palette = 0;
+ }
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ if (cpi->rc.high_source_sad == 1) {
+ sf->rt_sf.prefer_large_partition_blocks = 0;
+ sf->part_sf.max_intra_bsize = BLOCK_128X128;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ if (i > BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+ }
+ if (cpi->rc.max_block_source_sad > 20000 &&
+ cpi->rc.frame_source_sad > 100 && speed >= 6 &&
+ (cpi->rc.percent_blocks_with_motion > 1 ||
+ cpi->svc.last_layer_dropped[0])) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->rt_sf.fullpel_search_step_param = 2;
+ }
+ sf->rt_sf.partition_direct_merging = 0;
+ sf->hl_sf.accurate_bit_estimate = 0;
+ // This feature is for nonrd_pickmode.
+ if (sf->rt_sf.use_nonrd_pick_mode)
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+ else
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ sf->rt_sf.use_rtc_tf = 0;
+ // TODO(aomedia:3412): The setting accurate_bit_estimate = 0
+ // can be removed once it's fixed for lossless mode.
+ sf->hl_sf.accurate_bit_estimate = 0;
+ }
+ if (cpi->oxcf.use_highbitdepth) {
+ // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (cpi->oxcf.superres_cfg.enable_superres) {
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ }
+}
+
+// Sets the real-time speed features in |sf| that do not depend on frame size.
+// TODO(kyslov): now this is very similar to
+// set_good_speed_features_framesize_independent except it enables the non-rd
+// pick mode at speed 7 and above. May be modified with RT-specific features.
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+ // Currently, rt speed 0, 1, 2, 3, 4, 5 are the same.
+ // Following set of speed features are not impacting encoder's decisions as
+ // the relevant tools are disabled by default.
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 0;
+ sf->inter_sf.prune_comp_search_by_single_result = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.fast_wedge_sign_estimate = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->interp_sf.skip_interp_filter_search = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 2;
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ // End of set
+
+ // TODO(any, yunqing): tune these features for real-time use cases.
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
+ sf->hl_sf.frame_parameter_update = 0;
+
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+ // TODO(any): As per the experiments, this speed feature is doing redundant
+ // computation since the model rd based pruning logic is similar to model rd
+ // based gating when inter_mode_rd_model_estimation = 2. Enable this SF if
+ // either of the condition becomes true.
+ // (1) inter_mode_rd_model_estimation != 2
+ // (2) skip_interp_filter_search == 0
+ // (3) Motion mode or compound mode is enabled
+ sf->inter_sf.prune_mode_search_simple_translation = 0;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.selective_ref_frame = 4;
+ sf->inter_sf.alt_ref_search_fp = 2;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4);
+ sf->inter_sf.limit_txfm_eval_per_mode = 3;
+
+ sf->inter_sf.adaptive_rd_thresh = 4;
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.skip_newmv_in_drl = 4;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 1;
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+
+ sf->part_sf.default_max_partition_size = BLOCK_128X128;
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.max_intra_bsize = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_rate_thr = 500;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.adjust_var_based_rd_partitioning = 2;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+ sf->mv_sf.auto_mv_step_size = 1;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = EIGHTH_PEL;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->intra_sf.skip_intra_in_interframe = 5;
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.tx_size_search_lgr_block = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.refine_fast_tx_search_results = 0;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+ sf->rd_sf.simple_model_rd_from_var = 1;
+ sf->rd_sf.tx_domain_dist_level = 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 2;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 3;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ sf->winner_mode_sf.tx_size_search_level = 1;
+ sf->winner_mode_sf.winner_mode_ifs = 1;
+ // NOTE(review): the mode_search_skip_flags |= below is reassigned later.
+ sf->rt_sf.check_intra_pred_nonrd = 1;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[0] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[1] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 0;
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.num_inter_modes_for_tx_search = 5;
+ sf->rt_sf.prune_inter_modes_using_temp_var = 1;
+ sf->rt_sf.use_real_time_ref_set = 1;
+ sf->rt_sf.use_simple_rd_model = 1;
+ sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1;
+ // TODO(any): This sf could be removed.
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.check_scene_detection = 1;
+ if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0;
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->oxcf.rc_cfg.mode == AOM_CBR)
+ sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ;
+ // Enable noise estimation only for high resolutions for now.
+ //
+ // Since use_temporal_noise_estimate has no effect for all-intra frame
+ // encoding, it is disabled for this case.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480)
+ sf->rt_sf.use_temporal_noise_estimate = 1;
+ sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+ sf->rt_sf.skip_newmv_mode_based_on_sse = 1;
+ sf->rt_sf.mode_search_skip_flags =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->rt_sf.var_part_split_threshold_shift = 5;
+ if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1;
+ sf->rt_sf.use_fast_fixed_part = 0;
+ sf->rt_sf.increase_source_sad_thresh = 0;
+
+ if (speed >= 6) {
+ sf->mv_sf.use_fullpel_costlist = 1;
+
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
+ sf->inter_sf.limit_inter_mode_cands = 4;
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ sf->inter_sf.extra_prune_warped = 1;
+
+ sf->rt_sf.gf_refresh_based_on_qp = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ if (!frame_is_intra_only(&cpi->common))
+ sf->rt_sf.var_part_based_on_qidx = 2;
+
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3;
+ }
+
+ if (speed >= 7) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1;
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ // This sf is not applicable in non-rd path.
+ sf->inter_sf.skip_newmv_in_drl = 0;
+
+ sf->interp_sf.skip_interp_filter_search = 0;
+
+ // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
+ // good. May need more study.
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
+ }
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
+
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ // This is for rd path only.
+ sf->rt_sf.prune_inter_modes_using_temp_var = 0;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 0;
+#if !CONFIG_REALTIME_ONLY
+ sf->rt_sf.reuse_inter_pred_nonrd =
+ (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0);
+#else
+ sf->rt_sf.reuse_inter_pred_nonrd = 1;
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING  // NOTE(review): replaces the value above.
+ sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
+#endif
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+ // For spatial layers, only LAST and GOLDEN are currently used in the SVC
+ // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
+ // get_ref_frame_flags() for some patterns, so disable it here for
+ // spatial layers.
+ sf->rt_sf.use_nonrd_altref_frame =
+ (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.skip_intra_pred = 1;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 0;
+ sf->rt_sf.var_part_based_on_qidx = 3;
+ sf->rt_sf.prune_compoundmode_with_singlecompound_var = true;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = true;
+ sf->rt_sf.skip_compound_based_on_var = true;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2;
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ sf->rt_sf.var_part_based_on_qidx = 4;
+ sf->rt_sf.partition_direct_merging = 1;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = false;
+ sf->mv_sf.use_bsize_dependent_search_method = 2;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = true;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 3;
+ sf->rt_sf.prefer_large_partition_blocks = 3;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ sf->rt_sf.var_part_based_on_qidx = 0;
+ sf->rt_sf.frame_level_mode_cost_update = true;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ if (speed >= 11 && !frame_is_intra_only(cm) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ }
+}
+
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
+ // Best-quality defaults. Non-zero / enumerated settings come first.
+ hl_sf->recode_loop = ALLOW_RECODE;
+ hl_sf->recode_tolerance = 25;  // Recode loop tolerance, in percent.
+ hl_sf->high_precision_mv_usage = CURRENT_Q;
+ hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL;
+ hl_sf->frame_parameter_update = 1;
+ hl_sf->second_alt_ref_filtering = 1;
+
+ // Everything below starts out as 0.
+ hl_sf->accurate_bit_estimate = 0;
+ hl_sf->adjust_num_frames_for_arf_filtering = 0;
+ hl_sf->allow_sub_blk_me_in_tf = 0;
+ hl_sf->disable_extra_sc_testing = 0;
+ hl_sf->weight_calc_level_in_tf = 0;
+}
+
+static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
+ // Default values of the first-pass speed features.
+ // Features that start out as 0.
+ fp_sf->disable_recon = 0;
+ fp_sf->skip_motion_search_threshold = 0;
+ fp_sf->skip_zeromv_motion_search = 0;
+
+ // The only non-zero default in this group.
+ fp_sf->reduce_mv_step_param = 3;
+}
+
+static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+ // Default values of the TPL speed features.
+ // Motion search defaults.
+ tpl_sf->search_method = NSTEP;
+ tpl_sf->subpel_force_stop = EIGHTH_PEL;
+
+ // The only flag that defaults to 1.
+ tpl_sf->allow_compound_pred = 1;
+
+ // The remaining features start out as 0.
+ tpl_sf->gop_length_decision_method = 0;
+ tpl_sf->prune_intra_modes = 0;
+ tpl_sf->prune_ref_frames_in_tpl = 0;
+ tpl_sf->prune_starting_mv = 0;
+ tpl_sf->reduce_first_step_size = 0;
+ tpl_sf->reduce_num_frames = 0;
+ tpl_sf->skip_alike_starting_mv = 0;
+ tpl_sf->use_sad_for_mode_decision = 0;
+ tpl_sf->use_y_only_rate_distortion = 0;
+}
+
+static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
+ // Default values of the global-motion speed features.
+ // Search type and refinement step count.
+ gm_sf->gm_search_type = GM_FULL_SEARCH;
+ gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS;
+
+ // The remaining features start out as 0.
+ gm_sf->disable_gm_search_based_on_stats = 0;
+ gm_sf->prune_ref_frame_for_gm_search = 0;
+ gm_sf->prune_zero_mv_with_sse = 0;
+}
+
+static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+ // Default values of the partition speed features.
+ // Search type and block-size limits / thresholds.
+ part_sf->partition_search_type = SEARCH_PARTITION;
+ part_sf->default_max_partition_size = BLOCK_LARGEST;
+ part_sf->default_min_partition_size = BLOCK_4X4;
+ part_sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ part_sf->fixed_partition_size = BLOCK_16X16;
+ part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+ part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+ part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+
+ // Partition search breakout thresholds.
+ part_sf->partition_search_breakout_dist_thr = 0;
+ part_sf->partition_search_breakout_rate_thr = 0;
+
+ // ML- and CNN-named controls.
+ part_sf->ml_prune_partition = 0;
+ part_sf->ml_early_term_after_part_split_level = 0;
+ part_sf->ml_predict_breakout_level = 0;
+ part_sf->intra_cnn_based_part_prune_level = 0;
+ for (int idx = 0; idx < PARTITION_BLOCK_SIZES; ++idx) {
+ // -1 means not enabled.
+ part_sf->ml_partition_search_breakout_thresh[idx] = -1;
+ }
+
+ // Simple-motion-search controls.
+ part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+ part_sf->simple_motion_search_split = 0;
+ part_sf->simple_motion_search_prune_rect = 0;
+ part_sf->simple_motion_search_early_term_none = 0;
+ part_sf->simple_motion_search_reduce_search_steps = 0;
+ part_sf->simple_motion_search_rect_split = 0;
+
+ // Pruning / skipping controls.
+ part_sf->less_rectangular_check_level = 0;
+ part_sf->prune_ext_partition_types_search_level = 0;
+ part_sf->prune_part4_search = 0;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
+ part_sf->prune_ext_part_using_split_info = 0;
+ part_sf->prune_rectangular_split_based_on_qidx = 0;
+ part_sf->prune_rect_part_using_4x4_var_deviation = false;
+ part_sf->prune_rect_part_using_none_pred_mode = false;
+ part_sf->prune_sub_8x8_partition_level = 0;
+ part_sf->skip_non_sq_part_based_on_none = 0;
+ part_sf->disable_8x8_part_based_on_qidx = 0;
+ part_sf->use_best_rd_for_pruning = 0;
+
+ // Remaining controls.
+ part_sf->adjust_var_based_rd_partitioning = 0;
+ part_sf->early_term_after_none_split = 0;
+ part_sf->reuse_prev_rd_results_for_part_ab = 0;
+ part_sf->reuse_best_prediction_for_part_ab = 0;
+}
+
+static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
+ // Default values of the motion-vector speed features.
+ // Search method selection.
+ mv_sf->search_method = NSTEP;
+ mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED;
+ mv_sf->use_bsize_dependent_search_method = 0;
+
+ // Full-pel search controls.
+ mv_sf->full_pixel_search_level = 0;
+ mv_sf->obmc_full_pixel_search_level = 0;
+ mv_sf->exhaustive_searches_thresh = 0;
+ mv_sf->auto_mv_step_size = 0;
+ mv_sf->reduce_search_range = 0;
+ mv_sf->use_fullpel_costlist = 0;
+ mv_sf->use_downsampled_sad = 0;
+ mv_sf->skip_fullpel_search_using_startmv = 0;
+
+ // Sub-pel search controls.
+ mv_sf->subpel_search_method = SUBPEL_TREE;
+ mv_sf->subpel_iters_per_step = 2;
+ mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+ mv_sf->subpel_force_stop = EIGHTH_PEL;
+ mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL;
+
+ // Warp search controls.
+ mv_sf->warp_search_method = WARP_SEARCH_SQUARE;
+ mv_sf->warp_search_iters = 8;
+
+ // Remaining controls.
+ mv_sf->disable_extensive_joint_motion_search = 0;
+ mv_sf->disable_second_mv = 0;
+ mv_sf->use_intrabc = 1;
+}
+
+static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
+ // Default values of the inter-mode speed features.
+ // Defaults given as named constants rather than 0.
+ inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
+ inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF;
+
+ // Transform rd gating levels are set through the shared helper, with 0.
+ set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0);
+
+ // Every remaining feature starts out as 0.
+ inter_sf->adaptive_rd_thresh = 0;
+ inter_sf->model_based_post_interp_filter_breakout = 0;
+ inter_sf->reduce_inter_modes = 0;
+ inter_sf->alt_ref_search_fp = 0;
+ inter_sf->prune_single_ref = 0;
+ inter_sf->prune_comp_ref_frames = 0;
+ inter_sf->selective_ref_frame = 0;
+ inter_sf->prune_ref_frame_for_rect_partitions = 0;
+ inter_sf->fast_wedge_sign_estimate = 0;
+ inter_sf->reuse_inter_intra_mode = 0;
+ inter_sf->prune_inter_modes_based_on_tpl = 0;
+ inter_sf->prune_comp_search_by_single_result = 0;
+ inter_sf->skip_repeated_ref_mv = 0;
+ inter_sf->skip_newmv_in_drl = 0;
+ inter_sf->inter_mode_rd_model_estimation = 0;
+ inter_sf->prune_compound_using_single_ref = 0;
+ inter_sf->prune_ext_comp_using_neighbors = 0;
+ inter_sf->skip_ext_comp_nearmv_mode = 0;
+ inter_sf->prune_comp_using_best_single_mode_ref = 0;
+ inter_sf->prune_nearest_near_mv_using_refmv_weight = 0;
+ inter_sf->disable_onesided_comp = 0;
+ inter_sf->prune_mode_search_simple_translation = 0;
+ inter_sf->prune_comp_type_by_comp_avg = 0;
+ inter_sf->disable_interinter_wedge_newmv_search = 0;
+ inter_sf->fast_interintra_wedge_search = 0;
+ inter_sf->prune_comp_type_by_model_rd = 0;
+ inter_sf->perform_best_rd_based_gating_for_chroma = 0;
+ inter_sf->prune_obmc_prob_thresh = 0;
+ inter_sf->disable_interinter_wedge_var_thresh = 0;
+ inter_sf->disable_interintra_wedge_var_thresh = 0;
+ inter_sf->prune_ref_mv_idx_search = 0;
+ inter_sf->prune_warped_prob_thresh = 0;
+ inter_sf->reuse_compound_type_decision = 0;
+ inter_sf->prune_inter_modes_if_skippable = 0;
+ inter_sf->disable_masked_comp = 0;
+ inter_sf->enable_fast_compound_mode_search = 0;
+ inter_sf->reuse_mask_search_results = 0;
+ inter_sf->enable_fast_wedge_mask_search = 0;
+ inter_sf->inter_mode_txfm_breakout = 0;
+ inter_sf->limit_inter_mode_cands = 0;
+ inter_sf->limit_txfm_eval_per_mode = 0;
+ inter_sf->skip_arf_compound = 0;
+}
+
+// Sets the interpolation-filter speed features in |interp_sf| to their
+// baseline values; every field assigned here starts at 0.
+static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
+ interp_sf->adaptive_interp_filter_search = 0;
+ interp_sf->cb_pred_filter_search = 0;
+ interp_sf->disable_dual_filter = 0;
+ interp_sf->skip_sharp_interp_filter_search = 0;
+ interp_sf->use_fast_interpolation_filter_search = 0;
+ interp_sf->use_interp_filter = 0;
+ interp_sf->skip_interp_filter_search = 0;
+}
+
+// Sets the intra-mode speed features in |intra_sf| to their baseline values.
+// Most fields start at 0; the exceptions are dv_cost_upd_level,
+// skip_intra_in_interframe, the per-transform-size mode masks,
+// cfl_search_range and top_intra_model_count_allowed.
+static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+ intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ intra_sf->skip_intra_in_interframe = 1;
+ intra_sf->intra_pruning_with_hog = 0;
+ intra_sf->chroma_intra_pruning_with_hog = 0;
+ intra_sf->prune_palette_search_level = 0;
+ intra_sf->prune_luma_palette_size_search_level = 0;
+
+ // Start with every luma and chroma intra mode bit set for each of the
+ // TX_SIZES mask entries.
+ for (int i = 0; i < TX_SIZES; i++) {
+ intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
+ intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+ }
+ intra_sf->disable_smooth_intra = 0;
+ intra_sf->prune_smooth_intra_mode_for_chroma = 0;
+ intra_sf->prune_filter_intra_level = 0;
+ intra_sf->prune_chroma_modes_using_luma_winner = 0;
+ intra_sf->cfl_search_range = 3;
+ intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT;
+ intra_sf->adapt_top_model_rd_count_using_neighbors = 0;
+ intra_sf->early_term_chroma_palette_size_search = 0;
+ intra_sf->skip_filter_intra_in_inter_frames = 0;
+ intra_sf->prune_luma_odd_delta_angles_in_intra = 0;
+}
+
+// Sets the transform speed features in |tx_sf| to their baseline values.
+// Fields that do not start at 0/false: tx_type_search.prune_2d_txfm_mode
+// (TX_TYPE_PRUNE_1), tx_type_search.ml_tx_split_thresh (8500),
+// tx_type_search.use_skip_flag_prediction (1),
+// tx_type_search.fast_inter_tx_type_prob_thresh (INT_MAX), txb_split_cap (1)
+// and refine_fast_tx_search_results (1).
+static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
+ tx_sf->inter_tx_size_search_init_depth_sqr = 0;
+ tx_sf->inter_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_sqr = 0;
+ tx_sf->tx_size_search_lgr_block = 0;
+ tx_sf->model_based_prune_tx_search_level = 0;
+ tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1;
+ tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
+ // This value later selects a row of predict_skip_levels[] in
+ // av1_set_speed_features_framesize_independent(), which asserts 0..2.
+ tx_sf->tx_type_search.use_skip_flag_prediction = 1;
+ tx_sf->tx_type_search.use_reduced_intra_txset = 0;
+ tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
+ tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX;
+ tx_sf->tx_type_search.skip_tx_search = 0;
+ tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+ tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
+ tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
+ tx_sf->txb_split_cap = 1;
+ tx_sf->adaptive_txb_search_level = 0;
+ tx_sf->refine_fast_tx_search_results = 1;
+ tx_sf->prune_tx_size_level = 0;
+ tx_sf->prune_intra_tx_depths_using_nn = false;
+ tx_sf->use_rd_based_breakout_for_intra_tx_search = false;
+}
+
+// Sets the RD-calculation speed features in |rd_sf| to their baseline values.
+//
+// optimize_coefficients is derived from the user setting
+// oxcf->algo_cfg.disable_trellis_quant:
+//   0 -> FULL_TRELLIS_OPT
+//   1 -> NO_TRELLIS_OPT
+//   2 -> FINAL_PASS_TRELLIS_OPT
+//   3 -> NO_ESTIMATE_YRD_TRELLIS_OPT
+// For 0, 2 and 3, a lossless request (is_lossless_requested()) selects
+// NO_TRELLIS_OPT instead. Any other value is a programming error: it trips
+// the assert and, in builds where asserts are compiled out, falls back to
+// NO_TRELLIS_OPT so that the field is never left unwritten.
+// All remaining fields start at 0.
+static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
+                                  const AV1EncoderConfig *oxcf) {
+  const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant;
+
+  // Default covers disable_trellis_quant == 1, lossless requests and the
+  // out-of-range fallback; the cases below only upgrade it.
+  rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+  switch (disable_trellis_quant) {
+    case 0:
+      if (!is_lossless_requested(&oxcf->rc_cfg))
+        rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
+      break;
+    case 1: break;
+    case 2:
+      if (!is_lossless_requested(&oxcf->rc_cfg))
+        rd_sf->optimize_coefficients = FINAL_PASS_TRELLIS_OPT;
+      break;
+    case 3:
+      if (!is_lossless_requested(&oxcf->rc_cfg))
+        rd_sf->optimize_coefficients = NO_ESTIMATE_YRD_TRELLIS_OPT;
+      break;
+    default: assert(0 && "Invalid disable_trellis_quant value"); break;
+  }
+
+  rd_sf->use_mb_rd_hash = 0;
+  rd_sf->simple_model_rd_from_var = 0;
+  rd_sf->tx_domain_dist_level = 0;
+  rd_sf->tx_domain_dist_thres_level = 0;
+  rd_sf->perform_coeff_opt = 0;
+}
+
+// Sets the winner-mode speed features in |winner_mode_sf| to their baseline
+// values; every field assigned here starts at 0.
+static AOM_INLINE void init_winner_mode_sf(
+ WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
+ winner_mode_sf->motion_mode_for_winner_cand = 0;
+ // Set this at the appropriate speed levels
+ winner_mode_sf->tx_size_search_level = 0;
+ winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
+ winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
+ winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
+ winner_mode_sf->multi_winner_mode_type = 0;
+ // dc_blk_pred_level later selects a row of predict_dc_levels[] in
+ // av1_set_speed_features_framesize_independent().
+ winner_mode_sf->dc_blk_pred_level = 0;
+ winner_mode_sf->winner_mode_ifs = 0;
+ winner_mode_sf->prune_winner_mode_eval_level = 0;
+}
+
+// Sets the loop-filter / CDEF / loop-restoration speed features in |lpf_sf|
+// to their baseline values: full-image loop filter pick, full CDEF search,
+// the restoration unit size range [RESTORATION_PROC_UNIT_SIZE,
+// RESTORATION_UNITSIZE_MAX], and both restoration filters enabled.
+static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
+ lpf_sf->disable_loop_restoration_chroma = 0;
+ lpf_sf->disable_loop_restoration_luma = 0;
+ lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ lpf_sf->prune_wiener_based_on_src_var = 0;
+ lpf_sf->prune_sgr_based_on_wiener = 0;
+ lpf_sf->enable_sgr_ep_pruning = 0;
+ lpf_sf->reduce_wiener_window_size = 0;
+ lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ lpf_sf->use_coarse_filter_level_search = 0;
+ lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
+ // Set decoder side speed feature to use fewer dual sgr modes
+ lpf_sf->dual_sgr_penalty_level = 0;
+ // Enable Wiener and Self-guided Loop restoration filters by default.
+ lpf_sf->disable_wiener_filter = false;
+ lpf_sf->disable_sgr_filter = false;
+ lpf_sf->disable_wiener_coeff_refine_search = false;
+ lpf_sf->use_downsampled_wiener_stats = 0;
+}
+
+// Sets the real-time speed features in |rt_sf| to their baseline values.
+// Most fields start at 0/false. The exceptions are
+// num_inter_modes_for_tx_search (INT_MAX), intra_y_mode_bsize_mask_nrd[]
+// (INTRA_ALL for every block size), var_part_split_threshold_shift (7),
+// set_zeromv_skip_based_on_source_sad (1) and check_globalmv_on_single_ref
+// (true).
+static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
+ rt_sf->check_intra_pred_nonrd = 0;
+ rt_sf->skip_intra_pred = 0;
+ rt_sf->estimate_motion_for_var_based_partition = 0;
+ rt_sf->nonrd_check_partition_merge_mode = 0;
+ rt_sf->nonrd_check_partition_split = 0;
+ rt_sf->mode_search_skip_flags = 0;
+ rt_sf->nonrd_prune_ref_frame_search = 0;
+ rt_sf->use_nonrd_pick_mode = 0;
+ rt_sf->use_nonrd_altref_frame = 0;
+ rt_sf->use_comp_ref_nonrd = 0;
+ rt_sf->use_real_time_ref_set = 0;
+ rt_sf->short_circuit_low_temp_var = 0;
+ rt_sf->reuse_inter_pred_nonrd = 0;
+ rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+ rt_sf->use_nonrd_filter_search = 0;
+ rt_sf->use_simple_rd_model = 0;
+ rt_sf->hybrid_intra_pickmode = 0;
+ rt_sf->source_metrics_sb_nonrd = 0;
+ rt_sf->overshoot_detection_cbr = NO_DETECTION;
+ rt_sf->check_scene_detection = 0;
+ rt_sf->prefer_large_partition_blocks = 0;
+ rt_sf->use_temporal_noise_estimate = 0;
+ rt_sf->fullpel_search_step_param = 0;
+ // Every intra mode bit is set for each block size.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
+ rt_sf->prune_hv_pred_modes_using_src_sad = false;
+ rt_sf->nonrd_aggressive_skip = 0;
+ rt_sf->skip_cdef_sb = 0;
+ rt_sf->force_large_partition_blocks_intra = 0;
+ rt_sf->skip_tx_no_split_var_based_partition = 0;
+ rt_sf->skip_newmv_mode_based_on_sse = 0;
+ rt_sf->gf_length_lvl = 0;
+ rt_sf->prune_inter_modes_with_golden_ref = 0;
+ rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ rt_sf->prune_inter_modes_using_temp_var = 0;
+ rt_sf->reduce_mv_pel_precision_highmotion = 0;
+ rt_sf->reduce_mv_pel_precision_lowcomplex = 0;
+ rt_sf->prune_intra_mode_based_on_mv_range = 0;
+ rt_sf->var_part_split_threshold_shift = 7;
+ rt_sf->gf_refresh_based_on_qp = 0;
+ rt_sf->use_rtc_tf = 0;
+ rt_sf->prune_idtx_nonrd = 0;
+ rt_sf->prune_palette_nonrd = 0;
+ rt_sf->dct_only_palette_nonrd = 0;
+ rt_sf->part_early_exit_zeromv = 0;
+ rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
+ rt_sf->skip_lf_screen = 0;
+ rt_sf->sad_based_adp_altref_lag = 0;
+ rt_sf->partition_direct_merging = 0;
+ rt_sf->var_part_based_on_qidx = 0;
+ rt_sf->tx_size_level_based_on_qstep = 0;
+ rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
+ rt_sf->prune_compoundmode_with_singlecompound_var = false;
+ rt_sf->frame_level_mode_cost_update = false;
+ rt_sf->prune_h_pred_using_best_mode_so_far = false;
+ rt_sf->enable_intra_mode_pruning_using_neighbors = false;
+ rt_sf->prune_intra_mode_using_best_sad_so_far = false;
+ rt_sf->check_only_zero_zeromv_on_large_blocks = false;
+ rt_sf->disable_cdf_update_non_reference_frame = false;
+ rt_sf->prune_compoundmode_with_singlemode_var = false;
+ rt_sf->skip_compound_based_on_var = false;
+ rt_sf->set_zeromv_skip_based_on_source_sad = 1;
+ rt_sf->use_adaptive_subpel_search = false;
+ rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
+ rt_sf->enable_ref_short_signaling = false;
+ rt_sf->check_globalmv_on_single_ref = true;
+ rt_sf->increase_color_thresh_palette = false;
+ rt_sf->selective_cdf_update = 0;
+ rt_sf->force_only_last_ref = 0;
+}
+
+// Sub-pel search routines indexed by SUBPEL_SEARCH_METHOD. The entry order
+// must match the enumerator values noted on each line; the array has
+// SUBPEL_SEARCH_METHODS entries.
+static fractional_mv_step_fp
+ *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = {
+ av1_find_best_sub_pixel_tree, // SUBPEL_TREE = 0
+ av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1
+ av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2
+ };
+
+// Chooses the sub-pel motion search routine stored in
+// |mv_search_params->find_fractional_mv_step|.
+//
+// Normally this is the fractional_mv_search[] entry selected by
+// |subpel_search_method| (speed feature). The motion vector unit test
+// overrides that choice: 1 selects av1_return_max_sub_pixel_mv and 2 selects
+// av1_return_min_sub_pixel_mv.
+static void set_subpel_search_method(
+    MotionVectorSearchParams *mv_search_params,
+    unsigned int motion_vector_unit_test,
+    SUBPEL_SEARCH_METHOD subpel_search_method) {
+  assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE);
+
+  switch (motion_vector_unit_test) {
+    // The two overrides below are only used in the motion vector unit test.
+    case 1:
+      mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+      break;
+    case 2:
+      mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+      break;
+    default:
+      mv_search_params->find_fractional_mv_step =
+          fractional_mv_search[subpel_search_method];
+      break;
+  }
+}
+
+// Applies the speed features that depend on the frame size for the encoder
+// usage mode in cpi->oxcf.mode, then propagates the result to the sequence
+// parameters (only while they are not locked), the sub-pel search routine and
+// the MV cost update level.
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SPEED_FEATURES *const sf = &cpi->sf;
+
+  // Mode-specific settings; any other mode value leaves sf untouched here.
+  if (oxcf->mode == GOOD) {
+    set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+  } else if (oxcf->mode == ALLINTRA) {
+    set_allintra_speed_feature_framesize_dependent(cpi, sf, speed);
+  } else if (oxcf->mode == REALTIME) {
+    set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+  }
+
+  // Sequence-level tool flags may only be cleared, never set, and only
+  // before the sequence parameters are locked.
+  if (!cpi->ppi->seq_params_locked) {
+    cpi->common.seq_params->enable_masked_compound &=
+        !sf->inter_sf.disable_masked_comp;
+    cpi->common.seq_params->enable_interintra_compound &=
+        (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+  }
+
+  set_subpel_search_method(&cpi->mv_search_params,
+                           cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+                           sf->mv_sf.subpel_search_method);
+
+  // With row_mt enabled and more than one worker, updating the MV cost for a
+  // set of SB rows is not desirable, so INTERNAL_COST_UPD_SBROW_SET is
+  // replaced by the per-row level INTERNAL_COST_UPD_SBROW.
+  if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1) &&
+      sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+    sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+  }
+}
+
+// Sets the speed features that do not depend on the frame size.
+//
+// The function first resets every speed-feature group with the init_*_sf()
+// helpers and then applies the mode-specific (GOOD / ALLINTRA / REALTIME)
+// settings for |speed|. After that it applies configuration-dependent
+// overrides (tx size search, multi-threading, first pass / one pass), clears
+// sequence-level tool flags while the sequence parameters are not locked,
+// and copies the speed-dependent mesh patterns and winner-mode lookup tables
+// into cpi.
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // Reset all groups to their baseline before applying speed settings.
+ init_hl_sf(&sf->hl_sf);
+ init_fp_sf(&sf->fp_sf);
+ init_tpl_sf(&sf->tpl_sf);
+ init_gm_sf(&sf->gm_sf);
+ init_part_sf(&sf->part_sf);
+ init_mv_sf(&sf->mv_sf);
+ init_inter_sf(&sf->inter_sf);
+ init_interp_sf(&sf->interp_sf);
+ init_intra_sf(&sf->intra_sf);
+ init_tx_sf(&sf->tx_sf);
+ init_rd_sf(&sf->rd_sf, oxcf);
+ init_winner_mode_sf(&sf->winner_mode_sf);
+ init_lpf_sf(&sf->lpf_sf);
+ init_rt_sf(&sf->rt_sf);
+
+ switch (oxcf->mode) {
+ case GOOD:
+ set_good_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case ALLINTRA:
+ set_allintra_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case REALTIME:
+ set_rt_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ }
+
+ // Note: when use_nonrd_pick_mode is true, the transform size is the
+ // minimum of 16x16 and the largest possible size of the current block,
+ // which conflicts with the speed feature "enable_tx_size_search".
+ if (!oxcf->txfm_cfg.enable_tx_size_search &&
+ sf->rt_sf.use_nonrd_pick_mode == 0) {
+ sf->winner_mode_sf.tx_size_search_level = 3;
+ }
+
+ if (cpi->mt_info.num_workers > 1) {
+ // Loop restoration stage is conditionally disabled for speed 5, 6 when
+ // num_workers > 1. Since av1_pick_filter_restoration() is not
+ // multi-threaded, enabling the Loop restoration stage will cause an
+ // increase in encode time (3% to 7% increase depends on frame
+ // resolution).
+ // TODO(aomedia:3446): Implement multi-threading of
+ // av1_pick_filter_restoration() and enable Wiener filter for speed 5, 6
+ // similar to single thread encoding path.
+ if (speed >= 5) {
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_filter = true;
+ }
+ }
+
+ // Sequence-level flags are only ever cleared here (&=), and only while the
+ // sequence parameters are not locked.
+ if (!cpi->ppi->seq_params_locked) {
+ cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &=
+ (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+ cpi->common.seq_params->enable_dual_filter &=
+ !sf->interp_sf.disable_dual_filter;
+ // Set the flag 'enable_restoration', if one of the Loop restoration
+ // filters (i.e., Wiener or Self-guided) is enabled.
+ cpi->common.seq_params->enable_restoration &=
+ (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter);
+
+ cpi->common.seq_params->enable_interintra_compound &=
+ (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+ }
+
+ // Mesh pattern tables are indexed by speed, clamped to MAX_MESH_SPEED.
+ const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_speed][i].interval;
+ }
+
+ // Update the mesh pattern of exhaustive motion search for intraBC
+ // Though intraBC mesh pattern is populated for all frame types, it is used
+ // only for intra frames of screen contents
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.intrabc_mesh_patterns[i].range =
+ intrabc_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.intrabc_mesh_patterns[i].interval =
+ intrabc_mesh_patterns[mesh_speed][i].interval;
+ }
+
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (is_stat_generation_stage(cpi))
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+
+ // No recode for 1 pass.
+ if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
+ sf->hl_sf.recode_loop = DISALLOW_RECODE;
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+
+ // assert ensures that tx_domain_dist_thresholds is accessed correctly
+ assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_thres_level < 4);
+ memcpy(winner_mode_params->tx_domain_dist_threshold,
+ tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level],
+ sizeof(winner_mode_params->tx_domain_dist_threshold));
+
+ // assert ensures that tx_domain_dist_types is accessed correctly
+ assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS);
+ memcpy(winner_mode_params->use_transform_domain_distortion,
+ tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level],
+ sizeof(winner_mode_params->use_transform_domain_distortion));
+
+ // assert ensures that coeff_opt_thresholds is accessed correctly
+ assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
+ cpi->sf.rd_sf.perform_coeff_opt < 9);
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+
+ // assert ensures that predict_skip_levels is accessed correctly
+ assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
+ cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
+ memcpy(winner_mode_params->skip_txfm_level,
+ predict_skip_levels[cpi->sf.tx_sf.tx_type_search
+ .use_skip_flag_prediction],
+ sizeof(winner_mode_params->skip_txfm_level));
+
+ // assert ensures that tx_size_search_level is accessed correctly
+ assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
+ cpi->sf.winner_mode_sf.tx_size_search_level <= 3);
+ memcpy(winner_mode_params->tx_size_search_methods,
+ tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
+ sizeof(winner_mode_params->tx_size_search_methods));
+ // NOTE(review): unlike the lookups above, dc_blk_pred_level is not
+ // range-checked by an assert before it indexes predict_dc_levels.
+ memcpy(winner_mode_params->predict_dc_level,
+ predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
+ sizeof(winner_mode_params->predict_dc_level));
+
+ // Overrides that apply only with row_mt enabled and more than one worker.
+ if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) {
+ if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
+ // Revert to type 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ }
+
+#if !CONFIG_FPMT_TEST
+ // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve
+ // better parallelism when number of threads available are greater than or
+ // equal to maximum number of reference frames allowed for global motion.
+ if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
+ (cpi->mt_info.num_workers >=
+ gm_available_reference_frames[sf->gm_sf.gm_search_type]))
+ sf->gm_sf.prune_ref_frame_for_gm_search = 0;
+#endif
+ }
+
+ // This only applies to the real time mode. Adaptive gf refresh is disabled if
+ // gf_cbr_boost_pct that is set by the user is larger than 0.
+ if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0)
+ sf->rt_sf.gf_refresh_based_on_qp = 0;
+}
+
+// Override some speed features based on qindex
+//
+// Decisions use cm->quant_params.base_qindex together with |speed|, the frame
+// resolution class (derived from min(width, height)), whether the frame is
+// boosted, and whether it is an internal ARF (INTNL_ARF_UPDATE).
+// In REALTIME mode only part_sf.adjust_var_based_rd_partitioning is updated
+// (for speed >= 6) and the function returns early. For the other modes the
+// blocks below are applied in order, so later blocks can overwrite values set
+// by earlier ones (e.g. the loop restoration unit sizes).
+void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const int boosted = frame_is_boosted(cpi);
+ // Resolution classes use the smaller frame dimension. Note that exactly
+ // 480 satisfies both is_480p_or_lesser and is_480p_or_larger.
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440;
+ const int is_arf2_bwd_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ if (cpi->oxcf.mode == REALTIME) {
+ if (speed >= 6) {
+ const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+ sf->part_sf.adjust_var_based_rd_partitioning =
+ frame_is_intra_only(cm)
+ ? 0
+ : cm->quant_params.base_qindex > qindex_thresh;
+ }
+ return;
+ }
+
+ if (speed == 0) {
+ // qindex_thresh for resolution < 720p
+ const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
+ if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ }
+
+ if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) {
+ // perform_coeff_opt changes here, so the thresholds copied into
+ // winner_mode_params are refreshed to match.
+ sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger;
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+ if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ }
+ }
+ }
+
+ if (speed >= 2) {
+ // Disable extended partitions for lower quantizers
+ // aggr can reach 4, but the threshold arrays (4 entries) are only indexed
+ // in the branches where aggr <= 3.
+ const int aggr = AOMMIN(4, speed - 2);
+ const int qindex_thresh1[4] = { 50, 50, 80, 100 };
+ const int qindex_thresh2[4] = { 80, 100, 120, 160 };
+ int qindex_thresh;
+ if (aggr <= 1) {
+ const int qthresh2 =
+ (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
+ qindex_thresh = cm->features.allow_screen_content_tools
+ ? qindex_thresh1[aggr]
+ : qthresh2;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !boosted)
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 2) {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 3) {
+ if (!is_480p_or_larger) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (!is_720p_or_larger && !frame_is_intra_only(cm) &&
+ !cm->features.allow_screen_content_tools) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ } else {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable rectangular partitions for lower quantizers
+ const int aggr = AOMMIN(1, speed - 4);
+ const int qindex_thresh[2] = { 65, 80 };
+ int disable_rect_part;
+ disable_rect_part = !boosted;
+ if (cm->quant_params.base_qindex <= qindex_thresh[aggr] &&
+ disable_rect_part && is_480p_or_larger) {
+ sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8;
+ }
+ }
+
+ if (speed <= 2) {
+ if (!is_stat_generation_stage(cpi)) {
+ // Use faster full-pel motion search for high quantizers.
+ // Also use reduced total search range for low resolutions at high
+ // quantizers.
+ const int aggr = speed;
+ const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0];
+ const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1];
+ const SEARCH_METHODS search_method =
+ motion_search_method[is_720p_or_larger];
+ if (cm->quant_params.base_qindex > qindex_thresh1) {
+ sf->mv_sf.search_method = search_method;
+ sf->tpl_sf.search_method = search_method;
+ } else if (cm->quant_params.base_qindex > qindex_thresh2) {
+ sf->mv_sf.search_method = NSTEP_8PT;
+ }
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable LR search at low and high quantizers and enable only for
+ // mid-quantizer range.
+ if (!boosted && !is_arf2_bwd_type) {
+ // Index 0: below 720p, index 1: 720p or larger.
+ const int qindex_low[2] = { 100, 60 };
+ const int qindex_high[2] = { 180, 160 };
+ if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] ||
+ cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) {
+ sf->lpf_sf.disable_loop_restoration_luma = 1;
+ }
+ }
+ }
+
+ if (speed == 1) {
+ // Reuse interinter wedge mask search from first search for non-boosted
+ // non-internal-arf frames, except at very high quantizers.
+ if (cm->quant_params.base_qindex <= 200) {
+ if (!boosted && !is_arf2_bwd_type)
+ sf->inter_sf.reuse_mask_search_results = 1;
+ }
+ }
+
+ if (speed == 5) {
+ if (!(frame_is_intra_only(&cpi->common) ||
+ cm->features.allow_screen_content_tools)) {
+ // Index 0: below 480p, index 1: 480p or larger.
+ const int qindex[2] = { 256, 128 };
+ // Set the sf value as 3 for low resolution and
+ // for higher resolutions with low quantizers.
+ if (cm->quant_params.base_qindex < qindex[is_480p_or_larger])
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ }
+ }
+
+ if (speed >= 5) {
+ // Disable the sf for low quantizers in case of low resolution screen
+ // contents.
+ if (cm->features.allow_screen_content_tools &&
+ cm->quant_params.base_qindex < 128 && is_480p_or_lesser) {
+ sf->part_sf.prune_sub_8x8_partition_level = 0;
+ }
+ }
+
+ // Loop restoration size search
+ // At speed 0, always search all available sizes for the maximum possible gain
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+
+ if (speed >= 1) {
+ // For large frames, small restoration units are almost never useful,
+ // so prune them away
+ if (is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ } else if (is_720p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ }
+ }
+
+ if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) {
+ // At this speed, a full search is too expensive. Instead, pick a single
+ // size based on size and qindex. Note that, in general, higher quantizers
+ // (== lower quality) and larger frames generally want to use larger
+ // restoration units.
+ int qindex_thresh = 96;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ } else {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ }
+ }
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 0000000000..60c000e4f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,2025 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encodemb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @file */
+
+/*!\cond */
+// Number of entries in each mesh pattern array of the motion vector speed
+// features (mesh_patterns[] and intrabc_mesh_patterns[]).
+#define MAX_MESH_STEP 4
+
+// One step of a mesh pattern used by the exhaustive motion search.
+// NOTE(review): presumably |range| is the search extent and |interval| the
+// sampling step within it - confirm against the mesh search code.
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+// Global motion search types. The value is also used as an index into
+// gm_available_reference_frames[] in speed_features.c (for every type other
+// than GM_DISABLE_SEARCH), so the enumerator order must not change without
+// updating that table.
+enum {
+ GM_FULL_SEARCH,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
+
+ // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 but with extra filtering
+ // to keep at most two ref frames
+ GM_SEARCH_CLOSEST_REFS_ONLY,
+
+ GM_DISABLE_SEARCH
+} UENUM1BYTE(GM_SEARCH_TYPE);
+
+// Values for the inter speed feature use_dist_wtd_comp_flag.
+// DIST_WTD_COMP_DISABLED additionally clears the sequence-level
+// enable_dist_wtd_comp flag while the sequence parameters are not locked.
+enum {
+ DIST_WTD_COMP_ENABLED,
+ DIST_WTD_COMP_SKIP_MV_SEARCH,
+ DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
+
+// Bitmasks of intra prediction modes: bit (1 << MODE) is set when MODE is
+// part of the mask. INTRA_* masks are built from the luma mode enumerators,
+// UV_INTRA_* masks from the chroma (UV_*) mode enumerators.
+// INTRA_ALL / UV_INTRA_ALL contain every mode listed here; UV_INTRA_ALL
+// additionally contains UV_CFL_PRED. The masks named *_TM use the
+// PAETH_PRED / UV_PAETH_PRED bit.
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+ (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+ (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+ UV_INTRA_ALL =
+ (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+ (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+ (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+ (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC = (1 << UV_DC_PRED),
+ UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED),
+ UV_INTRA_DC_PAETH_CFL =
+ (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+ (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_CFL_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_H_V_SMOOTH =
+ (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED),
+ INTRA_DC_PAETH_H_V =
+ (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+// Bitmasks of inter prediction modes: bit (1 << MODE) is set when MODE is
+// part of the mask.
+//  INTER_ALL               - all single and compound modes listed here.
+//  INTER_NEAREST_NEAR_ZERO - INTER_ALL without NEWMV and NEW_NEWMV.
+//  INTER_SINGLE_ALL        - the four single-reference modes only.
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
+ (1 << NEAR_NEARMV),
+ INTER_SINGLE_ALL =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV),
+};
+
+// Bitmasks built from the THR_* enumerators (bit 1 << THR_x).
+// NOTE(review): judging by the names, a set bit disables split for that
+// reference type - confirm against the code that consumes these masks.
+// LAST_AND_INTRA_SPLIT_ONLY sets every inter bit except THR_LAST and does
+// not set THR_INTRA.
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+// Single-bit flags, one per group of speed features; the values are the
+// distinct powers of two from 1 to 128. The last two entries are reserved.
+enum {
+ TXFM_CODING_SF = 1,
+ INTER_PRED_SF = 2,
+ INTRA_PRED_SF = 4,
+ PARTITION_SF = 8,
+ LOOP_FILTER_SF = 16,
+ RD_SKIP_SF = 32,
+ RESERVE_2_SF = 64,
+ RESERVE_3_SF = 128,
+} UENUM1BYTE(DEV_SPEED_FEATURES);
+
+/* This enumeration defines when the rate control recode loop will be
+ * enabled. It is the value type of the high-level speed feature
+ * recode_loop; one-pass encoding without a stats stage always uses
+ * DISALLOW_RECODE (see av1_set_speed_features_framesize_independent()).
+ */
+enum {
+ /*
+ * No recodes allowed
+ */
+ DISALLOW_RECODE = 0,
+ /*
+ * Allow recode only for KF/ARF/GF frames
+ */
+ ALLOW_RECODE_KFARFGF = 1,
+ /*
+ * Allow recode for all frame types based on bitrate constraints.
+ */
+ ALLOW_RECODE = 2,
+} UENUM1BYTE(RECODE_LOOP_TYPE);
+
+// Sub-pel motion search methods. The values index the fractional_mv_search[]
+// function table in speed_features.c, so the explicit values and the table
+// order must stay in sync. SUBPEL_SEARCH_METHODS is the number of methods,
+// not a method itself.
+enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_SEARCH_METHODS
+} UENUM1BYTE(SUBPEL_SEARCH_METHOD);
+
+// Methods for picking the loop filter level (value type of the loop filter
+// speed feature lpf_pick, which defaults to LPF_PICK_FROM_FULL_IMAGE).
+enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try the full image filter search with non-dual filter only.
+ LPF_PICK_FROM_FULL_IMAGE_NON_DUAL,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
+
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+ CDEF_FULL_SEARCH, /**< Full search */
+ CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+ CDEF_FAST_SEARCH_LVL2, /**< Search a smaller subset of filters than Level 1. */
+ CDEF_FAST_SEARCH_LVL3, /**< Search a smaller subset of secondary filters than
+ Level 2. */
+ CDEF_FAST_SEARCH_LVL4, /**< Search a smaller subset of filters than Level 3. */
+ CDEF_FAST_SEARCH_LVL5, /**< Search a smaller subset of filters than Level 4. */
+ CDEF_PICK_FROM_Q, /**< Estimate filter strength based on quantizer. */
+ CDEF_PICK_METHODS /**< Number of pick methods; not a method itself. */
+} CDEF_PICK_METHOD;
+
+/*!\cond */
+// Single-bit flags controlling mode search skipping. Note that bit 2
+// (1 << 2) is not assigned to any flag.
+enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
+
+enum {
+ // No tx type pruning
+ TX_TYPE_PRUNE_0 = 0,
+ // Adaptively prunes the least promising tx types out of all 16
+ // (tuned to provide negligible quality loss)
+ TX_TYPE_PRUNE_1 = 1,
+ // Similar, but applies much more aggressive pruning to get better speed-up
+ TX_TYPE_PRUNE_2 = 2,
+ TX_TYPE_PRUNE_3 = 3,
+ // More aggressive pruning based on tx type score and allowed tx count
+ TX_TYPE_PRUNE_4 = 4,
+ TX_TYPE_PRUNE_5 = 5,
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
+
+enum {
+ // No reaction to rate control on a detected slide/scene change.
+ NO_DETECTION = 0,
+
+ // Set to larger Q based only on the detected slide/scene change and
+ // current/past Q.
+ FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+ // Turns off multi-winner mode. So we will do txfm search on either all modes
+ // if winner mode is off, or we will only do txfm search on a single winner
+ // mode.
+ MULTI_WINNER_MODE_OFF = 0,
+
+ // Limits the number of winner modes to at most 2
+ MULTI_WINNER_MODE_FAST = 1,
+
+ // Uses the default number of winner modes, which is 3 for intra mode, and 1
+ // for inter mode.
+ MULTI_WINNER_MODE_DEFAULT = 2,
+
+ // Number of multi-winner mode levels.
+ MULTI_WINNER_MODE_LEVELS,
+} UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
+
+enum {
+ PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning
+ PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85)
+ PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170)
+ PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170)
+ PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3,
+} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
+
+enum {
+ // Default transform search used in evaluation of best inter candidates
+ // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL
+ // stage).
+ TX_SEARCH_DEFAULT = 0,
+ // Transform search in motion mode rd during MODE_EVAL stage.
+ TX_SEARCH_MOTION_MODE,
+ // Transform search in compound type mode rd during MODE_EVAL stage.
+ TX_SEARCH_COMP_TYPE_MODE,
+ // Number of transform search cases (used to size per-case arrays)
+ TX_SEARCH_CASES
+} UENUM1BYTE(TX_SEARCH_CASE);
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
+ int fast_intra_tx_type_search;
+
+ // INT_MAX: Disable fast search.
+ // 1 - 1024: Probability threshold used for conditionally forcing tx type,
+ // during mode search.
+ // 0: Force tx type to be DCT_DCT unconditionally, during
+ // mode search.
+ int fast_inter_tx_type_prob_thresh;
+
+ // Prune less likely chosen transforms for each intra mode. The speed
+ // feature ranges from 0 to 2, for different speed / compression trade offs.
+ int use_reduced_intra_txset;
+
+ // Use a skip flag prediction model to detect blocks with skip = 1 early
+ // and avoid doing full TX type search for such blocks.
+ int use_skip_flag_prediction;
+
+ // Threshold used by the ML based method to predict TX block split decisions.
+ int ml_tx_split_thresh;
+
+ // skip remaining transform type search when we found the rdcost of skip is
+ // better than applying transform
+ int skip_tx_search;
+
+ // Prune tx type search using previous frame stats.
+ int prune_tx_type_using_stats;
+ // Prune tx type search using estimated RDcost
+ int prune_tx_type_est_rd;
+
+ // Flag used to control the winner mode processing for tx type pruning for
+ // inter blocks. It enables further tx type mode pruning based on ML model for
+ // mode evaluation and disables tx type mode pruning for winner mode
+ // processing.
+ int winner_mode_tx_type_pruning;
+} TX_TYPE_SEARCH;
+
+enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ // Partition using source variance
+ VAR_BASED_PARTITION,
+
+#if CONFIG_RT_ML_PARTITIONING
+ // Partition using ML model
+ ML_BASED_PARTITION
+#endif
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+ NOT_IN_USE,
+ DIRECT_PRED,
+ RELAXED_PRED,
+ ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
+
+enum {
+ LAST_MV_DATA,
+ CURRENT_Q,
+ QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+enum {
+ SUPERRES_AUTO_ALL, // Tries all possible superres ratios
+ SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios
+ SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio
+} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
+/*!\endcond */
+
+/*!\enum INTERNAL_COST_UPDATE_TYPE
+ * \brief This enum decides internally how often to update the entropy costs
+ *
+ * INTERNAL_COST_UPDATE_TYPE is similar to \ref COST_UPDATE_TYPE but has
+ * slightly more flexibility in update frequency. This enum is separate from
+ * \ref COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed,
+ * its values are public so it cannot be modified without breaking public API.
+ * Due to the use of AOMMIN() in populate_unified_cost_update_freq() to
+ * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and
+ * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in
+ * the order of increasing frequencies.
+ *
+ * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE,
+ * update the enum INTERNAL_COST_UPDATE_TYPE as well.
+ */
+typedef enum {
+ INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */
+ INTERNAL_COST_UPD_TILE, /*!< Update every tile. */
+ INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixels. */
+ INTERNAL_COST_UPD_SBROW, /*!< Update every sb row inside a tile. */
+ INTERNAL_COST_UPD_SB, /*!< Update every sb. */
+} INTERNAL_COST_UPDATE_TYPE;
+
+/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL
+ * \brief This enumeration defines a variety of simple motion search based
+ * partition prune levels
+ */
+typedef enum {
+ NO_PRUNING = -1,
+ SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */
+ SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */
+ SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */
+ SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */
+ QIDX_BASED_AGG_LVL1, /*!< Qindex based prune aggressiveness level, aggressive
+ level maps to simple agg level 1 or 2 based on qindex.
+ */
+ TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune
+ aggressiveness levels. */
+ TOTAL_QINDEX_BASED_AGG_LVLS =
+ QIDX_BASED_AGG_LVL1 -
+ SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune
+ aggressiveness levels. */
+ TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS +
+ TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */
+} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL;
+
+/*!\enum PRUNE_MESH_SEARCH_LEVEL
+ * \brief This enumeration defines a variety of mesh search prune levels.
+ */
+typedef enum {
+ PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Mesh search pruning disabled. */
+ PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */
+ PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */
+} PRUNE_MESH_SEARCH_LEVEL;
+
+/*!\enum INTER_SEARCH_EARLY_TERM_IDX
+ * \brief This enumeration defines inter search early termination index in
+ * non-rd path based on sse value.
+ */
+typedef enum {
+ EARLY_TERM_DISABLED =
+ 0, /*!< Early terminate inter mode search based on sse disabled. */
+ EARLY_TERM_IDX_1 =
+ 1, /*!< Early terminate inter mode search based on sse, index 1. */
+ EARLY_TERM_IDX_2 =
+ 2, /*!< Early terminate inter mode search based on sse, index 2. */
+ EARLY_TERM_IDX_3 =
+ 3, /*!< Early terminate inter mode search based on sse, index 3. */
+ EARLY_TERM_IDX_4 =
+ 4, /*!< Early terminate inter mode search based on sse, index 4. */
+ EARLY_TERM_INDICES, /*!< Total number of early terminate indices */
+} INTER_SEARCH_EARLY_TERM_IDX;
+
+/*!
+ * \brief Sequence/frame level speed vs quality features
+ */
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+ /*! Frame level coding parameter update. */
+ int frame_parameter_update;
+
+ /*!
+ * Cases and frame types for which the recode loop is enabled.
+ */
+ RECODE_LOOP_TYPE recode_loop;
+
+ /*!
+ * Controls the tolerance vs target rate used in deciding whether to
+ * recode a frame. It has no meaning if recode is disabled.
+ */
+ int recode_tolerance;
+
+ /*!
+ * Determine how motion vector precision is chosen. The possibilities are:
+ * LAST_MV_DATA: use the mv data from the last coded frame
+ * CURRENT_Q: use the current q as a threshold
+ * QTR_ONLY: use quarter pel precision only.
+ */
+ MV_PREC_LOGIC high_precision_mv_usage;
+
+ /*!
+ * Always set to 0. If on it enables 0 cost background transmission
+ * (except for the initial transmission of the segmentation). The feature is
+ * disabled because the addition of very large block sizes makes the
+ * backgrounds very cheap to encode, and the segmentation we have
+ * adds overhead.
+ */
+ int static_segmentation;
+
+ /*!
+ * Superres-auto mode search type:
+ */
+ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+
+ /*!
+ * Enable/disable extra screen content test by encoding key frame twice.
+ */
+ int disable_extra_sc_testing;
+
+ /*!
+ * Enable/disable second_alt_ref temporal filtering.
+ */
+ int second_alt_ref_filtering;
+
+ /*!
+ * The number of frames to be used during temporal filtering of an ARF frame
+ * is adjusted based on noise level of the current frame. The sf has three
+ * levels to decide number of frames to be considered for filtering:
+ * 0 : Use default number of frames
+ * 1 and 2 : Reduce the number of frames based on noise level with varied
+ * aggressiveness
+ */
+ int adjust_num_frames_for_arf_filtering;
+
+ /*!
+ * Decide the bit estimation approach used in qindex decision.
+ * 0: estimate bits based on a constant value;
+ * 1: estimate bits more accurately based on the frame complexity.
+ */
+ int accurate_bit_estimate;
+
+ /*!
+ * Decide the approach for weight calculation during temporal filtering.
+ * 0: Calculate weight using exp()
+ * 1: Calculate weight using a lookup table that approximates exp().
+ */
+ int weight_calc_level_in_tf;
+
+ /*!
+ * Decide whether to perform motion estimation at split block (i.e. 16x16)
+ * level or not.
+ * 0: Always allow motion estimation.
+ * 1: Conditionally allow motion estimation based on 4x4 sub-blocks variance.
+ */
+ int allow_sub_blk_me_in_tf;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+ /*!
+ * \brief Reduces the mv search window.
+ * By default, the initial search window is around
+ * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+ * Each step reduction decreases the window size by about a factor of 2.
+ */
+ int reduce_mv_step_param;
+
+ /*!
+ * \brief Skips the motion search when the zero mv has small sse.
+ */
+ int skip_motion_search_threshold;
+
+ /*!
+ * \brief Skips reconstruction by using source buffers for prediction
+ */
+ int disable_recon;
+
+ /*!
+ * \brief Skips the motion search centered on 0,0 mv.
+ */
+ int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
+typedef struct TPL_SPEED_FEATURES {
+ // GOP length adaptive decision.
+ // If set to 0, tpl model decides whether a shorter gf interval is better.
+ // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+ // (base+2) layer decide whether a shorter gf interval is better.
+ // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+ // decide whether a shorter gf interval is better.
+ // If set to 3, gop length adaptive decision is disabled.
+ int gop_length_decision_method;
+ // Prune the intra modes search by tpl.
+ // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+ // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+ int prune_intra_modes;
+ // This parameter controls which step in the n-step process we start at.
+ int reduce_first_step_size;
+ // Skip motion estimation based on the precision of center MVs and the
+ // difference between center MVs.
+ // If set to 0, motion estimation is skipped for duplicate center MVs
+ // (default). If set to 1, motion estimation is skipped for duplicate
+ // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+ // difference between center MVs is less than the threshold.
+ int skip_alike_starting_mv;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // Which search method to use.
+ SEARCH_METHODS search_method;
+
+ // Prune starting mvs in TPL based on sad scores.
+ int prune_starting_mv;
+
+ // Prune reference frames in TPL.
+ int prune_ref_frames_in_tpl;
+
+ // Support compound predictions.
+ int allow_compound_pred;
+
+ // Calculate rate and distortion based on Y plane only.
+ int use_y_only_rate_distortion;
+
+ // Use SAD instead of SATD during intra/inter mode search.
+ // If set to 0, use SATD always.
+ // If set to 1, use SAD during intra/inter mode search for frames in the
+ // higher temporal layers of the hierarchical prediction structure.
+ // If set to 2, use SAD during intra/inter mode search for all frames.
+ // This sf is disabled for the first GF group of the key-frame interval,
+ // i.e., SATD is used during intra/inter mode search of the first GF group.
+ int use_sad_for_mode_decision;
+
+ // Skip tpl processing for frames of type LF_UPDATE.
+ // This sf is disabled for the first GF group of the key-frame interval.
+ int reduce_num_frames;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+ GM_SEARCH_TYPE gm_search_type;
+
+ // During global motion estimation, prune remaining reference frames in a
+ // given direction(past/future), if the evaluated ref_frame in that direction
+ // yields gm_type as INVALID/TRANSLATION/IDENTITY
+ int prune_ref_frame_for_gm_search;
+
+ // When the current GM type is set to ZEROMV, prune ZEROMV if its performance
+ // is worse than NEWMV under SSE metric.
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_zero_mv_with_sse;
+
+ // Disable global motion estimation based on stats of previous frames in the
+ // GF group
+ int disable_gm_search_based_on_stats;
+
+ // Number of refinement steps to apply after initial model generation
+ int num_refinement_steps;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE fixed_partition_size;
+
+ // Prune extended partition types search based on the current best partition
+ // and the combined rdcost of the subblocks estimated from previous
+ // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2
+ // increasing aggressiveness of pruning in order.
+ int prune_ext_partition_types_search_level;
+
+ // Prune part4 based on block size
+ int prune_part4_search;
+
+ // Use a ML model to prune rectangular, ab and 4-way horz
+ // and vert partitions
+ int ml_prune_partition;
+
+ // Use a ML model to adaptively terminate partition search after trying
+ // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+ // 1 - 2 increasing aggressiveness in order.
+ int ml_early_term_after_part_split_level;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
+
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
+
+ // Sets max square partition levels for this superblock based on
+ // motion vector and prediction error distribution produced from 16x16
+ // simple motion search
+ MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+
+ // Min and max square partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Sets level of adjustment of variance-based partitioning during
+ // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions
+ // for small blocks and high QP, 2 - try to merge partitions, 3 - try to merge
+ // and split leaf partitions and 0 - 3 decreasing aggressiveness in order.
+ int adjust_var_based_rd_partitioning;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+ // Aggressiveness levels for pruning split and rectangular partitions based on
+ // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to
+ // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to
+ // qindex based and simple motion search based pruning.
+ int simple_motion_search_prune_agg;
+
+ // Perform simple_motion_search on each possible subblock and use it to prune
+ // PARTITION_HORZ and PARTITION_VERT.
+ int simple_motion_search_prune_rect;
+
+ // Perform simple motion search before none_partition to decide if we
+ // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+ // model is disabled. If set to 1, the model attempts to perform
+ // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+ // PARTITION_SPLIT.
+ int simple_motion_search_split;
+
+ // Use features from simple_motion_search to terminate prediction block
+ // partition after PARTITION_NONE
+ int simple_motion_search_early_term_none;
+
+ // Controls whether to reduce the number of motion search steps. If this is 0,
+ // then simple_motion_search has the same number of steps as
+ // single_motion_search (assuming no other speed features). Otherwise, reduce
+ // the number of steps by the value contained in this variable.
+ int simple_motion_search_reduce_search_steps;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // Use CNN with luma pixels on source frame on each of the 64x64 subblock to
+ // perform partition pruning in intra frames.
+ // 0: No Pruning
+ // 1: Prune split and rectangular partitions only
+ // 2: Prune none, split and rectangular partitions
+ int intra_cnn_based_part_prune_level;
+
+ // Disable extended partition search if the current bsize is greater than the
+ // threshold. Must be a square block size BLOCK_8X8 or higher.
+ BLOCK_SIZE ext_partition_eval_thresh;
+
+ // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+ int ext_part_eval_based_on_cur_best;
+
+ // Disable rectangular partitions for larger block sizes.
+ int rect_partition_eval_thresh;
+
+ // Prune extended partition search based on whether the split/rect partitions
+ // provided an improvement in the previous search.
+ // 0 : no pruning
+ // 1 : prune 1:4 partition search using winner info from split partitions
+ // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+ int prune_ext_part_using_split_info;
+
+ // Prune rectangular, AB and 4-way partition based on q index and block size
+ // 0 : no pruning
+ // 1 : prune sub_8x8 at very low quantizers
+ // 2 : prune all block sizes based on qindex
+ int prune_rectangular_split_based_on_qidx;
+
+ // Prune rectangular partitions based on 4x4 sub-block variance
+ // false : no pruning
+ // true : prune rectangular partitions based on 4x4 sub-block variance
+ // deviation
+ //
+ // For allintra encode, this speed feature reduces instruction count by 6.4%
+ // for speed=6 with coding performance change less than 0.24%. For AVIF image
+ // encode, this speed feature reduces encode time by 8.14% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.16%. This
+ // speed feature is not applicable to speed >= 7.
+ bool prune_rect_part_using_4x4_var_deviation;
+
+ // Prune rectangular partitions based on prediction mode chosen by NONE
+ // partition.
+ // false : no pruning
+ // true : prunes rectangular partition as described below
+ // If prediction mode chosen by NONE partition is
+ // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if
+ // at least one of the left and top neighbor blocks is larger than the
+ // current block.
+ // Directional Mode: Prunes either of the horizontal and vertical partition
+ // based on center angle of the prediction mode chosen by NONE partition. For
+ // example, vertical partition is pruned if center angle of the prediction
+ // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal
+ // direction) and vice versa.
+ // For allintra encode, this speed feature reduces instruction count by 5.1%
+ // for speed=6 with coding performance change less than 0.22%. For AVIF image
+ // encode, this speed feature reduces encode time by 4.44% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.15%.
+ // For speed >= 7, variance-based logic is used to determine the partition
+ // structure instead of recursive partition search. Therefore, this speed
+ // feature is not applicable in such cases.
+ bool prune_rect_part_using_none_pred_mode;
+
+ // Terminate partition search for child partition,
+ // when NONE and SPLIT partition rd_costs are INT64_MAX.
+ int early_term_after_none_split;
+
+ // Level used to adjust threshold for av1_ml_predict_breakout(). At lower
+ // levels, more conservative threshold is used, and value of 0 indicates
+ // av1_ml_predict_breakout() is disabled. Value of 3 corresponds to default
+ // case with no adjustment to lbd thresholds.
+ int ml_predict_breakout_level;
+
+ // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+ // 0 : no pruning
+ // 1 : pruning based on neighbour block information
+ // 2 : prune always
+ int prune_sub_8x8_partition_level;
+
+ // Prune rectangular split based on simple motion search split/no_split score.
+ // 0: disable pruning, 1: enable pruning
+ int simple_motion_search_rect_split;
+
+ // The current encoder adopts a DFS search for block partitions.
+ // Therefore the mode selection and associated rdcost is ready for smaller
+ // blocks before the mode selection for some partition types.
+ // AB partition could use previous rd information and skip mode search.
+ // An example is:
+ //
+ // current block
+ // +---+---+
+ // | |
+ // + +
+ // | |
+ // +-------+
+ //
+ // SPLIT partition has been searched first before trying HORZ_A
+ // +---+---+
+ // | R | R |
+ // +---+---+
+ // | R | R |
+ // +---+---+
+ //
+ // HORZ_A
+ // +---+---+
+ // | | |
+ // +---+---+
+ // | |
+ // +-------+
+ //
+ // With this speed feature, the top two sub blocks can directly use rdcost
+ // searched in split partition, and the mode info is also copied from
+ // saved info. Similarly, the bottom rectangular block can also use
+ // the available information from previous rectangular search.
+ int reuse_prev_rd_results_for_part_ab;
+
+ // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+ // when encoding PARTITION_AB.
+ int reuse_best_prediction_for_part_ab;
+
+ // The current partition search records the best rdcost so far and uses it
+ // in mode search and transform search to early skip when some criterion is
+ // met. For example, when the current rdcost is larger than the best rdcost,
+ // or the model rdcost is larger than the best rdcost times some thresholds.
+ // By default, this feature is turned on to speed up the encoder partition
+ // search.
+ // If disabling it, at speed 0, 30 frames, we could get
+ // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown.
+ int use_best_rd_for_pruning;
+
+ // Skip evaluation of non-square partitions based on the corresponding NONE
+ // partition.
+ // 0: no pruning
+ // 1: prune extended partitions if NONE is skippable
+ // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv
+ // mode and skippable
+ int skip_non_sq_part_based_on_none;
+
+ // Disables 8x8 and below partitions for low quantizers.
+ int disable_8x8_part_based_on_qidx;
+} PARTITION_SPEED_FEATURES;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // Enable the use of faster, less accurate mv search method
+ // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp
+ // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
+ // account.
+ int use_bsize_dependent_search_method;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // Subpel search method. SUBPEL_TREE does a subpixel logarithmic search
+ // that keeps stepping at 1/2 pixel units until you stop getting a gain,
+ // and then goes on to 1/4 and repeats the same process. Along the way it
+ // skips many diagonals. The *_PRUNED variants prune the 1/2-pel searches.
+ SUBPEL_SEARCH_METHOD subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // When to stop subpel search in simple motion search.
+ SUBPEL_FORCE_STOP simple_motion_subpel_force_stop;
+
+ // If true, sub-pixel search uses the exact convolve function used for final
+ // encoding and decoding; otherwise, it uses bilinear interpolation.
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Pattern to be used for any exhaustive mesh searches (except intraBC ME).
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ // Pattern to be used for exhaustive mesh searches of intraBC ME.
+ MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP];
+
+ // Reduce single motion search range based on MV result of prior ref_mv_idx.
+ int reduce_search_range;
+
+ // Prune mesh search.
+ PRUNE_MESH_SEARCH_LEVEL prune_mesh_search;
+
+ // Use the rd cost around the best FULLPEL_MV to speed up subpel search
+ int use_fullpel_costlist;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // Accurate full pixel motion search based on TPL stats.
+ int full_pixel_search_level;
+
+ // Allow intrabc motion search
+ int use_intrabc;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 16 rows. When this sf is
+ // active, if there is a large discrepancy in the SAD values for the final
+ // motion vector between skipping vs not skipping, motion search is redone
+ // with skip row features off.
+ // 0: Disabled (do not downsample rows)
+ // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and
+ // odd rows for the starting MV is small. Redo motion search with sf off
+ // when SAD deviation is high for the final motion vector.
+ // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the
+ // start MV and tested only for the final MV.
+ int use_downsampled_sad;
+
+ // Enable/disable extensive joint motion search.
+ int disable_extensive_joint_motion_search;
+
+ // Enable second best mv check in joint mv search.
+ // 0: allow second MV (use rd cost as the metric)
+ // 1: use var as the metric
+ // 2: disable second MV
+ int disable_second_mv;
+
+ // Skips full pixel search based on start mv of prior ref_mv_idx.
+ // 0: Disabled
+ // 1: Skips the full pixel search up to 4 neighbor full-pel MV positions.
+ // 2: Skips the full pixel search up to 8 neighbor full-pel MV positions.
+ int skip_fullpel_search_using_startmv;
+
+ // Method to use for refining WARPED_CAUSAL motion vectors
+ // TODO(rachelbarker): Can this be unified with OBMC in some way?
+ WARP_SEARCH_METHOD warp_search_method;
+
+ // Maximum number of iterations in WARPED_CAUSAL refinement search
+ int warp_search_iters;
+} MV_SPEED_FEATURES;
+
+typedef struct INTER_MODE_SPEED_FEATURES {
+ // 2-pass inter mode model estimation where the preliminary pass skips
+ // transform search and uses a model to estimate rd, while the final pass
+ // computes the full transform search. Two types of models are supported:
+ // 0: not used
+ // 1: used with online dynamic rd model
+ // 2: used with static rd model
+ int inter_mode_rd_model_estimation;
+
+ // Bypass transform search based on skip rd at following stages
+ // i. Compound type mode search
+ // ii. Motion mode search (mode evaluation and winner motion mode stage)
+ // iii. Transform search for best inter candidates
+ int txfm_rd_gate_level[TX_SEARCH_CASES];
+
+ // Limit the inter mode tested in the RD loop
+ int reduce_inter_modes;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Aggressively prune inter modes when best mode is skippable.
+ int prune_inter_modes_if_skippable;
+
+ // Drop less likely to be picked reference frames in the RD search.
+ // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6 where higher levels prune
+ // more aggressively than lower ones. (0 means no pruning).
+ int selective_ref_frame;
+
+ // Prune reference frames for rectangular partitions.
+ // 0 implies no pruning
+ // 1 implies prune for extended partition
+ // 2 implies prune horiz, vert and extended partition
+ int prune_ref_frame_for_rect_partitions;
+
+ // Prune inter modes w.r.t past reference frames
+ // 0 no pruning
+ // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames
+ // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
+ int alt_ref_search_fp;
+
+ // Prune reference frames for single prediction modes based on temporal
+ // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is
+ // disabled for 0. An increasing value indicates more aggressive pruning
+ // threshold.
+ int prune_single_ref;
+
+ // Prune compound reference frames
+ // 0 no pruning
+ // 1 prune compound references which do not satisfy the two conditions:
+ // a) The references are at a nearest distance from the current frame in
+ // both past and future direction.
+ // b) The references have minimum pred_mv_sad in both past and future
+ // direction.
+ // 2 prune compound references except the one with nearest distance from the
+ // current frame in both past and future direction.
+ int prune_comp_ref_frames;
+
+ // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc.
+ // This speed feature equaling 0 means no skipping.
+ // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode
+ // if we have already encountered ref_mv in the drl such that:
+ // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search
+ // process as the current mv.
+ // 2. The rate needed to encode the current mv is larger than that for the
+ // other ref_mv.
+ // The speed feature equaling 1 means using subpel mv in the comparison.
+ // The speed feature equaling 2 means using fullpel mv in the comparison.
+ // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on
+ // known full_mv bestsme and drl cost.
+ int skip_newmv_in_drl;
+
+ // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
+ // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
+ // TODO(any): Instead of skipping repeated ref mv, use the recalculated
+ // rd-cost based on mode rate and skip the mode evaluation
+ int skip_repeated_ref_mv;
+
+ // Flag used to control the ref_best_rd based gating for chroma
+ int perform_best_rd_based_gating_for_chroma;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average modeled rd
+ int prune_comp_type_by_model_rd;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average rd/ref_best_rd
+ int prune_comp_type_by_comp_avg;
+
+ // Skip some ref frames in compound motion search by single motion search
+ // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
+ // increasing aggressiveness of skipping in order.
+ // Note: The search order might affect the result. It assumes that the single
+ // reference modes are searched before compound modes. It is better to search
+ // same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Instead of performing a full MV search, do a simple translation first
+ // and only perform a full MV search on the motion vectors that performed
+ // well.
+ int prune_mode_search_simple_translation;
+
+ // Only search compound modes with at least one "good" reference frame.
+ // A reference frame is good if, after looking at its performance among
+ // the single reference modes, it is one of the two best performers.
+ int prune_compound_using_single_ref;
+
+ // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+ // NEW_NEARMV) using ref frames of above and left neighbor
+ // blocks.
+ // 0 : no pruning
+ // 1 : prune ext compound modes using neighbor blocks (less aggressiveness)
+ // 2 : prune ext compound modes using neighbor blocks (high aggressiveness)
+ // 3 : prune ext compound modes unconditionally (highest aggressiveness)
+ int prune_ext_comp_using_neighbors;
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ int skip_ext_comp_nearmv_mode;
+
+ // Skip extended compound mode when ref frame corresponding to NEWMV does not
+ // have NEWMV as single mode winner.
+ // 0 : no pruning
+ // 1 : prune extended compound mode (less aggressiveness)
+ // 2 : prune extended compound mode (high aggressiveness)
+ int prune_comp_using_best_single_mode_ref;
+
+ // Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+ //
+ // Pruning is enabled only when both the top and left neighbor blocks are
+ // available and when the current block already has a valid inter prediction.
+ int prune_nearest_near_mv_using_refmv_weight;
+
+ // Based on previous ref_mv_idx search result, prune the following search.
+ int prune_ref_mv_idx_search;
+
+ // Disable one sided compound modes.
+ int disable_onesided_comp;
+
+ // Prune obmc search using previous frame stats.
+ // INT_MAX : disable obmc search
+ int prune_obmc_prob_thresh;
+
+ // Prune warped motion search using previous frame stats.
+ int prune_warped_prob_thresh;
+
+ // Variance threshold to enable/disable Interintra wedge search
+ unsigned int disable_interintra_wedge_var_thresh;
+
+ // Variance threshold to enable/disable Interinter wedge search
+ unsigned int disable_interinter_wedge_var_thresh;
+
+ // De-couple wedge and mode search during interintra RDO.
+ int fast_interintra_wedge_search;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+
+ // Enable/disable ME for interinter wedge search.
+ int disable_interinter_wedge_newmv_search;
+
+ // Decide when and how to use joint_comp.
+ DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
+
+ // Clip the frequency of updating the mv cost.
+ INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level;
+
+ // Clip the frequency of updating the coeff cost.
+ INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level;
+
+ // Clip the frequency of updating the mode cost.
+ INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level;
+
+ // Prune inter modes based on tpl stats
+ // 0 : no pruning
+ // 1 - 3 indicate increasing aggressiveness in order.
+ int prune_inter_modes_based_on_tpl;
+
+ // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
+ // neighbor blocks and qindex.
+ PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors;
+
+ // Model based breakout after interpolation filter search
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_post_interp_filter_breakout;
+
+ // Reuse compound type rd decision when exact match is found
+ // 0: No reuse
+ // 1: Reuse the compound type decision
+ int reuse_compound_type_decision;
+
+ // Enable/disable masked compound.
+ int disable_masked_comp;
+
+ // Enable/disable MV refinement for compound modes corresponds to compound
+ // types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound type
+ // is disabled for speeds >= 2 using the sf 'use_dist_wtd_comp_flag') and
+ // COMPOUND_DIFFWTD based on the availability. Levels 0 to 3 indicate
+ // increasing order of aggressiveness to disable MV refinement.
+ // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of
+ // refinement in av1_joint_motion_search().
+ // 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for
+ // COMPOUND_AVERAGE & COMPOUND_DISTWTD.
+ // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for
+ // NEW_NEWMV mode with one iteration of refinement in
+ // av1_joint_motion_search() and MV Refinement is disabled for other compound
+ // type modes.
+ // 3: MV Refinement is disabled.
+ int enable_fast_compound_mode_search;
+
+ // Reuse masked compound type search results
+ int reuse_mask_search_results;
+
+ // Enable/disable fast search for wedge masks
+ int enable_fast_wedge_mask_search;
+
+ // Early breakout from transform search of inter modes
+ int inter_mode_txfm_breakout;
+
+ // Limit number of inter modes for txfm search if a newmv mode gets
+ // evaluated among the top modes.
+ // 0: no pruning
+ // 1 to 3 indicate increasing order of aggressiveness
+ int limit_inter_mode_cands;
+
+ // Cap the no. of txfm searches for a given prediction mode.
+ // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
+ int limit_txfm_eval_per_mode;
+
+ // Prune warped motion search based on block size.
+ int extra_prune_warped;
+
+ // Do not search compound modes for ARF.
+ // The intuition is that ARF is predicted by frames far away from it,
+ // whose temporal correlations with the ARF are likely low.
+ // It is therefore likely that compound modes do not work as well for ARF
+ // as other inter frames.
+ // Speed/quality impact:
+ // Speed 1: 12% faster, 0.1% psnr loss.
+ // Speed 2: 2% faster, 0.05% psnr loss.
+ // No change for speed 3 and up, because |disable_onesided_comp| is true.
+ int skip_arf_compound;
+} INTER_MODE_SPEED_FEATURES;
+
+typedef struct INTERP_FILTER_SPEED_FEATURES {  // Interpolation filter search speed features.
+ // Do limited interpolation filter search for dual filters, since best choice
+ // usually includes EIGHTTAP_REGULAR.
+ int use_fast_interpolation_filter_search;
+
+ // Disable dual filter.
+ int disable_dual_filter;
+
+ // Save results of av1_interpolation_filter_search for a block.
+ // Before a search, check mv and ref_frames: if they are very close to the
+ // previously saved results, the filter search can be skipped.
+ int use_interp_filter;
+
+ // Skip sharp_filter evaluation based on regular and smooth filter rd for
+ // the dual_filter=0 case.
+ int skip_sharp_interp_filter_search;
+
+ // Skip interpolation filter search for a block in chessboard pattern.
+ int cb_pred_filter_search;
+
+ // Adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation
+ // filter search.
+ int skip_interp_filter_search;
+} INTERP_FILTER_SPEED_FEATURES;
+
+typedef struct INTRA_MODE_SPEED_FEATURES {  // Intra mode search speed features.
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // Flag to allow skipping intra mode for inter frame prediction.
+ int skip_intra_in_interframe;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to luma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int intra_pruning_with_hog;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to chroma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int chroma_intra_pruning_with_hog;
+
+ // Enable/disable smooth intra modes.
+ int disable_smooth_intra;
+
+ // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance.
+ // false : No pruning
+ // true : Prune UV_SMOOTH_PRED mode based on chroma source variance
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance
+ // change less than 0.04%. For AVIF image encode, this speed feature reduces
+ // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.05%.
+ bool prune_smooth_intra_mode_for_chroma;
+
+ // Prune filter intra modes in intra frames.
+ // 0 : No pruning
+ // 1 : Evaluate applicable filter intra modes based on best intra mode so far
+ // 2 : Do not evaluate filter intra modes
+ int prune_filter_intra_level;
+
+ // Prune palette search.
+ // 0: No pruning
+ // 1: Perform coarse search to prune the palette colors. For winner colors,
+ // neighbors are also evaluated using a finer search.
+ // 2: Perform 2 way palette search from max colors to min colors (and min
+ // colors to remaining colors) and terminate the search if current number of
+ // palette colors is not the winner.
+ int prune_palette_search_level;
+
+ // Terminate early in luma palette_size search. Speed feature values indicate
+ // increasing level of pruning.
+ // 0: No early termination
+ // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than 2 * best_rd. This level of pruning is more
+ // conservative when compared to sf level 2 as the cases which will get pruned
+ // with sf level 1 are a subset of the cases which will get pruned with sf
+ // level 2.
+ // 2: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+ // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+ // 5, 6, 7 and 8 on screen content set with coding performance change less
+ // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+ // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+ // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+ // 7 and 8 on a typical image dataset with coding performance change less than
+ // 0.01%.
+ int prune_luma_palette_size_search_level;
+
+ // Prune chroma intra modes based on luma intra mode winner.
+ // 0: No pruning
+ // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
+ // UV_CFL_PRED and the mode that corresponds to luma intra mode winner.
+ int prune_chroma_modes_using_luma_winner;
+
+ // Clip the frequency of updating the mv cost for intrabc.
+ INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level;
+
+ // We use DCT_DCT transform followed by computing SATD (Sum of Absolute
+ // Transformed Differences) as an estimation of RD score to quickly find the
+ // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search
+ // near the best possible parameter. The search range is set here.
+ // The range of cfl_search_range should be [1, 33], and the following are the
+ // recommended values.
+ // 1: Fastest mode.
+ // 3: Default mode that provides good speedup without losing compression
+ // performance at speed 0.
+ // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
+ // be used for debugging purposes.
+ int cfl_search_range;
+
+ // TOP_INTRA_MODEL_COUNT is 4, which is the number of top model rd to store in
+ // intra mode decision. This speed feature reduces that number for
+ // higher speeds.
+ int top_intra_model_count_allowed;
+
+ // Adapt top_intra_model_count_allowed locally to prune luma intra modes using
+ // neighbor block and quantizer information.
+ int adapt_top_model_rd_count_using_neighbors;
+
+ // Prune the evaluation of odd delta angles of directional luma intra modes by
+ // using the rdcosts of neighbouring delta angles.
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video
+ // dataset with coding performance change less than 0.26%. For AVIF image
+ // encode, this speed feature reduces encode time by 2.849%, 2.471%,
+ // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.27%.
+ int prune_luma_odd_delta_angles_in_intra;
+
+ // Terminate early in chroma palette_size search.
+ // 0: No early termination
+ // 1: Terminate early for higher palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 0.45%,
+ // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+ // content set with coding performance change less than 0.01%.
+ // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+ // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+ // dataset with no quality drop.
+ int early_term_chroma_palette_size_search;
+
+ // Skips the evaluation of filter intra modes in inter frames if rd evaluation
+ // of luma intra dc mode results in invalid rd stats.
+ int skip_filter_intra_in_inter_frames;
+} INTRA_MODE_SPEED_FEATURES;
+
+typedef struct TX_SPEED_FEATURES {  // Transform search speed features.
+ // Init search depth for square and rectangular transform partitions.
+ // Values:
+ // 0: search full tree, 1: search 1 level, 2: search the highest level only
+ int inter_tx_size_search_init_depth_sqr;
+ int inter_tx_size_search_init_depth_rect;
+ int intra_tx_size_search_init_depth_sqr;
+ int intra_tx_size_search_init_depth_rect;
+
+ // If any dimension of a coding block size is above 64, always search the
+ // largest transform only, since the largest transform block size is 64x64.
+ int tx_size_search_lgr_block;
+
+ TX_TYPE_SEARCH tx_type_search;  // Presumably transform type search settings; see TX_TYPE_SEARCH.
+
+ // Skip split transform block partition when the collocated bigger block
+ // is selected as all zero coefficients.
+ int txb_split_cap;
+
+ // Shortcut the transform block partition and type search when the target
+ // rdcost is relatively lower.
+ // Values are 0 (not used), or 1 - 2 with progressively increasing
+ // aggressiveness.
+ int adaptive_txb_search_level;
+
+ // Prune level for tx_size_type search for inter based on rd model
+ // 0: no pruning
+ // 1-2: progressively increasing aggressiveness of pruning
+ int model_based_prune_tx_search_level;
+
+ // Refine TX type after fast TX search.
+ int refine_fast_tx_search_results;
+
+ // Prune transform split/no_split eval based on residual properties. A value
+ // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+ // increases from levels 1 to 3.
+ int prune_tx_size_level;
+
+ // Prune the evaluation of transform depths as decided by the NN model.
+ // false: No pruning.
+ // true : Avoid the evaluation of specific transform depths using NN model.
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance
+ // change less than 0.32%. For AVIF image encode, this speed feature reduces
+ // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.19%.
+ bool prune_intra_tx_depths_using_nn;
+
+ // Enable/disable early breakout during transform search of intra modes, by
+ // using the minimum rd cost possible. By using this approach, the rd
+ // evaluation of applicable transform blocks (in the current block) can be
+ // avoided as
+ // 1) best_rd evolves during the search in choose_tx_size_type_from_rd()
+ // 2) appropriate ref_best_rd is passed in intra_block_yrd()
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding
+ // performance change less than 0.02%. For AVIF image encode, this speed
+ // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73%
+ // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.004%.
+ bool use_rd_based_breakout_for_intra_tx_search;
+} TX_SPEED_FEATURES;
+
+typedef struct RD_CALC_SPEED_FEATURES {  // Rate-distortion calculation speed features.
+ // Fast approximation of av1_model_rd_from_var_lapndz.
+ int simple_model_rd_from_var;
+
+ // Perform faster distortion computation during the R-D evaluation by trying
+ // to approximate the prediction error with transform coefficients (faster but
+ // less accurate) rather than computing distortion in the pixel domain (slower
+ // but more accurate). The following methods are used for distortion
+ // computation:
+ // Method 0: Always compute distortion in the pixel domain
+ // Method 1: Based on block error, try using transform domain distortion for
+ // tx_type search and compute distortion in pixel domain for final RD_STATS
+ // Method 2: Based on block error, try to compute distortion in transform
+ // domain
+ // Methods 1 and 2 may fall back to computing distortion in the pixel domain in
+ // case the block error is less than the threshold, which is controlled by the
+ // speed feature tx_domain_dist_thres_level.
+ //
+ // The speed feature tx_domain_dist_level decides which of the above methods
+ // needs to be used across different mode evaluation stages as described
+ // below:
+ // Eval type:    Default     Mode        Winner
+ // Level 0  :    Method 0    Method 2    Method 0
+ // Level 1  :    Method 1    Method 2    Method 0
+ // Level 2  :    Method 2    Method 2    Method 0
+ // Level 3  :    Method 2    Method 2    Method 2
+ int tx_domain_dist_level;
+
+ // Transform domain distortion threshold level.
+ int tx_domain_dist_thres_level;
+
+ // Trellis (dynamic programming) optimization of quantized values.
+ TRELLIS_OPT_TYPE optimize_coefficients;
+
+ // Use hash table to store macroblock RD search results
+ // to avoid repeated search on the same residue signal.
+ int use_mb_rd_hash;
+
+ // Flag used to control the extent of coeff R-D optimization.
+ int perform_coeff_opt;
+} RD_CALC_SPEED_FEATURES;
+
+typedef struct WINNER_MODE_SPEED_FEATURES {  // Winner mode processing speed features.
+ // Flag used to control the winner mode processing for better R-D optimization
+ // of quantized coeffs.
+ int enable_winner_mode_for_coeff_opt;
+
+ // Flag used to control the winner mode processing for transform size
+ // search method.
+ int enable_winner_mode_for_tx_size_srch;
+
+ // Control transform size search level
+ // Eval type:    Default        Mode           Winner
+ // Level 0  :    FULL RD        LARGEST ALL    FULL RD
+ // Level 1  :    FAST RD        LARGEST ALL    FULL RD
+ // Level 2  :    LARGEST ALL    LARGEST ALL    FULL RD
+ // Level 3  :    LARGEST ALL    LARGEST ALL    LARGEST ALL
+ int tx_size_search_level;
+
+ // Flag used to control the winner mode processing for the use of transform
+ // domain distortion.
+ int enable_winner_mode_for_use_tx_domain_dist;
+
+ // Flag used to enable processing of multiple winner modes.
+ MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
+
+ // Motion mode for winner candidates:
+ // 0: speed feature OFF
+ // 1 / 2 : Use configured number of winner candidates
+ int motion_mode_for_winner_cand;
+
+ // Controls the prediction of transform skip block or DC only block.
+ //
+ // Different speed feature values (0 to 3) decide the aggressiveness of
+ // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used
+ // during different mode evaluation stages.
+ int dc_blk_pred_level;
+
+ // If on, disables interpolation filter search in handle_inter_mode loop, and
+ // performs it during winner mode processing by \ref
+ // tx_search_best_inter_candidates.
+ int winner_mode_ifs;
+
+ // Controls the disabling of winner mode processing. Speed feature levels
+ // are ordered in increasing aggressiveness of pruning. The method considered
+ // for disabling depends on the sf level value and is described below.
+ // 0: Do not disable
+ // 1: Disable for blocks with low source variance.
+ // 2: Disable for blocks which turn out to be transform skip (skipped based on
+ // eob) during MODE_EVAL stage except NEWMV mode.
+ // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage except NEWMV mode. For high quantizers, prune conservatively based on
+ // transform skip (skipped based on eob) except for NEWMV mode.
+ // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage.
+ int prune_winner_mode_eval_level;
+} WINNER_MODE_SPEED_FEATURES;
+
+typedef struct LOOP_FILTER_SPEED_FEATURES {  // Loop filter, CDEF and loop restoration speed features.
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // Skip some final iterations in the determination of the best loop filter
+ // level.
+ int use_coarse_filter_level_search;
+
+ // Control how the CDEF strength is determined.
+ CDEF_PICK_METHOD cdef_pick_method;
+
+ // Decoder side speed feature to add penalty for use of dual-sgr filters.
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level
+ // adding a penalty of 1%.
+ int dual_sgr_penalty_level;
+
+ // Prune sgr ep using a binary-search-like mechanism.
+ int enable_sgr_ep_pruning;
+
+ // Disable loop restoration for chroma plane.
+ int disable_loop_restoration_chroma;
+
+ // Disable loop restoration for luma plane.
+ int disable_loop_restoration_luma;
+
+ // Range of loop restoration unit sizes to search.
+ // The minimum size is clamped against the superblock size in
+ // av1_pick_filter_restoration, so that the code which sets this value does
+ // not need to know the superblock size ahead of time.
+ int min_lr_unit_size;
+ int max_lr_unit_size;
+
+ // Prune RESTORE_WIENER evaluation based on source variance
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_wiener_based_on_src_var;
+
+ // Prune self-guided loop restoration based on wiener search results
+ // 0 : no pruning
+ // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE
+ // 2 : pruning based on winner restoration type among RESTORE_WIENER and
+ // RESTORE_NONE
+ int prune_sgr_based_on_wiener;
+
+ // Reduce the wiener filter window size for luma.
+ int reduce_wiener_window_size;
+
+ // Flag to disable Wiener Loop restoration filter.
+ bool disable_wiener_filter;
+
+ // Flag to disable Self-guided Loop restoration filter.
+ bool disable_sgr_filter;
+
+ // Disable the refinement search around the wiener filter coefficients.
+ bool disable_wiener_coeff_refine_search;
+
+ // Whether to downsample the rows in computation of wiener stats.
+ int use_downsampled_wiener_stats;
+} LOOP_FILTER_SPEED_FEATURES;
+
+typedef struct REAL_TIME_SPEED_FEATURES {
+ // check intra prediction for non-RD mode.
+ int check_intra_pred_nonrd;
+
+ // Skip checking intra prediction.
+ // 0 - don't skip
+ // 1 - skip if TX is skipped and best mode is not NEWMV
+ // 2 - skip if TX is skipped
+ // Skipping aggressiveness increases from level 1 to 2.
+ int skip_intra_pred;
+
+ // Estimate motion before calculating variance in variance-based partition
+ // 0 - Only use zero MV
+ // 1 - perform coarse ME
+ // 2 - perform coarse ME, and also use neighbours' MVs
+ // 3 - use neighbours' MVs without performing coarse ME
+ int estimate_motion_for_var_based_partition;
+
+ // For nonrd_use_partition: mode of extra check of leaf partition
+ // 0 - don't check merge
+ // 1 - always check merge
+ // 2 - check merge and prune checking final split
+ // 3 - check merge and prune checking final split based on bsize and qindex
+ int nonrd_check_partition_merge_mode;
+
+ // For nonrd_use_partition: check of leaf partition extra split
+ int nonrd_check_partition_split;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+ unsigned int mode_search_skip_flags;
+
+ // For nonrd: Reduces ref frame search.
+ // 0 - low level of search prune in non last frames
+ // 1 - pruned search in non last frames
+ // 2 - more pruned search in non last frames
+ int nonrd_prune_ref_frame_search;
+
+ // This flag controls the use of non-RD mode decision.
+ int use_nonrd_pick_mode;
+
+ // Use ALTREF frame in non-RD mode decision.
+ int use_nonrd_altref_frame;
+
+ // Use compound reference for non-RD mode.
+ int use_comp_ref_nonrd;
+
+ // Reference frames for compound prediction for nonrd pickmode:
+ // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+ int ref_frame_comp_nonrd[3];
+
+ // use reduced ref set for real-time mode
+ int use_real_time_ref_set;
+
+ // Skip a number of expensive mode evaluations for blocks with very low
+ // temporal variance.
+ int short_circuit_low_temp_var;
+
+ // Reuse inter prediction in fast non-rd mode.
+ int reuse_inter_pred_nonrd;
+
+ // Number of best inter modes to search transform. INT_MAX - search all.
+ int num_inter_modes_for_tx_search;
+
+ // Use interpolation filter search in non-RD mode decision.
+ int use_nonrd_filter_search;
+
+ // Use simplified RD model for interpolation search and Intra
+ int use_simple_rd_model;
+
+ // For nonrd mode: use hybrid intra mode search for intra only frames based on
+ // block properties.
+ // 0 : use nonrd pick intra for all blocks
+ // 1 : use rd for bsize < 16x16, nonrd otherwise
+ // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
+ int hybrid_intra_pickmode;
+
+ // Compute variance/sse on source difference, prior to encoding superblock.
+ int source_metrics_sb_nonrd;
+
+ // Flag to indicate process for handling overshoot on slide/scene change,
+ // for real-time CBR mode.
+ OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+ // Check for scene/content change detection on every frame before encoding.
+ int check_scene_detection;
+
+ // For nonrd mode: Prefer larger partition blks in variance based partitioning
+ // 0: disabled, 1-3: increasing aggressiveness
+ int prefer_large_partition_blocks;
+
+ // uses results of temporal noise estimate
+ int use_temporal_noise_estimate;
+
+ // Parameter indicating initial search window to be used in full-pixel search
+ // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+ // indicates larger window. If set to 0, step_param is set based on internal
+ // logic in set_mv_search_params().
+ int fullpel_search_step_param;
+
+ // Bit mask to enable or disable intra modes for each prediction block size
+ // separately, for nonrd_pickmode. Currently, the sf is not respected when
+ // 'force_intra_check' is true in 'av1_estimate_intra_mode()' function. Also,
+ // H and V pred modes allowed through this sf can be further pruned when
+ // 'prune_hv_pred_modes_using_src_sad' sf is true.
+ int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+ // Prune H and V intra prediction modes evaluation in inter frame.
+ // The sf does not have any impact.
+ // i. when frame_source_sad is 1.1 times greater than avg_source_sad
+ // ii. when cyclic_refresh_segment_id_boosted is enabled
+ // iii. when SB level source sad is greater than kMedSad
+ // iv. when color sensitivity is non zero for both the chroma channels
+ bool prune_hv_pred_modes_using_src_sad;
+
+ // Skips mode checks more aggressively in nonRD mode
+ int nonrd_aggressive_skip;
+
+ // Skip cdef on 64x64 blocks.
+ // 0: disabled
+ // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off.
+ // When color sensitivity is on for a superblock, all 64x64 blocks within
+ // will not skip.
+ // 2: more aggressive mode where skip is done for all frames where
+ // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off.
+ int skip_cdef_sb;
+
+ // Force selective cdf update.
+ int selective_cdf_update;
+
+ // Force only single reference (LAST) for prediction.
+ int force_only_last_ref;
+
+ // Forces larger partition blocks in variance based partitioning for intra
+ // frames
+ int force_large_partition_blocks_intra;
+
+ // Use fixed partition for superblocks based on source_sad.
+ // 0: disabled
+ // 1: enabled
+ int use_fast_fixed_part;
+
+ // Increase source_sad thresholds in nonrd pickmode.
+ int increase_source_sad_thresh;
+
+ // Skip evaluation of no split in tx size selection for merge partition
+ int skip_tx_no_split_var_based_partition;
+
+ // Intermediate termination of newMV mode evaluation based on so far best mode
+ // sse
+ int skip_newmv_mode_based_on_sse;
+
+ // Define gf length multiplier.
+ // Level 0: use large multiplier, level 1: use medium multiplier.
+ int gf_length_lvl;
+
+ // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes
+ int prune_inter_modes_with_golden_ref;
+
+ // Prune inter modes w.r.t golden or alt-ref frame based on sad
+ int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+ // Prune inter mode search in rd path based on current block's temporal
+ // variance wrt LAST reference.
+ int prune_inter_modes_using_temp_var;
+
+ // Reduce MV precision to halfpel for higher int MV value & frame-level motion
+ // 0: disabled
+ // 1-2: Reduce precision to halfpel, fullpel based on conservative
+ // thresholds, aggressiveness increases with increase in level
+ // 3: Reduce precision to halfpel using more aggressive thresholds
+ int reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for low complexity blocks
+ // 0: disabled
+ // 1: Reduce the mv resolution for zero mv if the variance is low
+ // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+ // complexity.
+ int reduce_mv_pel_precision_lowcomplex;
+
+ // Prune intra mode evaluation in inter frames based on mv range.
+ BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+ // The number of times to left shift the splitting thresholds in variance
+ // based partitioning. The minimum value should be 7 to avoid left shifting
+ // by a negative number.
+ int var_part_split_threshold_shift;
+
+ // Qindex based variance partition threshold index, which determines
+ // the aggressiveness of partition pruning
+ // 0: disabled for speeds 9,10
+ // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+ // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+ int var_part_based_on_qidx;
+
+ // Enable GF refresh based on Q value.
+ int gf_refresh_based_on_qp;
+
+ // Temporal filtering
+ // The value can be 1 or 2, which indicates the threshold to use.
+ // Must be off for lossless mode.
+ int use_rtc_tf;
+
+ // Prune the use of the identity transform in nonrd_pickmode,
+ // used for screen content mode: only for smaller blocks
+ // and higher spatial variance, and when skip_txfm is not
+ // already set.
+ int prune_idtx_nonrd;
+
+ // Prune the use of palette mode in nonrd pickmode.
+ int prune_palette_nonrd;
+
+ // Force to only use dct for palette search in nonrd pickmode.
+ int dct_only_palette_nonrd;
+
+ // Skip loopfilter, for static content after slide change
+ // or key frame, once quality has ramped up.
+ // 0: disabled
+ // 1: skip only after quality is ramped up.
+ // 2: aggressive mode, where skip is done for all frames
+ // where rc->high_source_sad = 0 (no slide-changes).
+ int skip_lf_screen;
+
+ // For nonrd: early exit out of variance partition that sets the
+ // block size to superblock size, and sets mode to zeromv-last skip.
+ // 0: disabled
+ // 1: zeromv-skip is enabled at SB level only
+ // 2: zeromv-skip is enabled at SB level and coding block level
+ int part_early_exit_zeromv;
+
+ // Early terminate inter mode search based on sse in non-rd path.
+ INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search;
+
+ // SAD based adaptive altref selection
+ int sad_based_adp_altref_lag;
+
+ // Enable/disable partition direct merging.
+ int partition_direct_merging;
+
+ // Level of aggressiveness for obtaining tx size based on qstep
+ int tx_size_level_based_on_qstep;
+
+ // Avoid the partitioning of a 16x16 block in variance based partitioning
+ // (VBP) by making use of minimum and maximum sub-block variances.
+ // For allintra encode, this speed feature reduces instruction count by 5.39%
+ // for speed 9 on a typical video dataset with coding performance gain
+ // of 1.44%.
+ // For AVIF image encode, this speed feature reduces encode time
+ // by 8.44% for speed 9 on a typical image dataset with coding performance
+ // gain of 0.78%.
+ bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
+
+ // A qindex threshold that determines whether to use qindex based CDEF filter
+ // strength estimation for screen content types. The strength estimation model
+ // used for screen contents prefers to allow cdef filtering for more frames.
+ // This sf is used to limit the frames which go through cdef filtering and
+ // following explains the setting of the same.
+ // MAXQ (255): This disables the usage of this sf. Here, frame does not use a
+ // screen content model thus reduces the number of frames that go through cdef
+ // filtering.
+ // MINQ (0): Frames always use screen content model thus increasing the number
+ // of frames that go through cdef filtering.
+ // This speed feature has a substantial gain on coding metrics, with moderate
+ // increase encoding time. Select threshold based on speed vs quality
+ // trade-off.
+ int screen_content_cdef_filter_qindex_thresh;
+
+ // Prune compound mode if its variance is higher than the variance of single
+ // modes.
+ bool prune_compoundmode_with_singlecompound_var;
+
+ // Allow mode cost update at frame level every couple frames. This
+ // overrides the command line setting --mode-cost-upd-freq=3 (never update
+ // except on key frame and first delta).
+ bool frame_level_mode_cost_update;
+
+ // Prune H_PRED during intra mode evaluation in the nonrd path based on best
+ // mode so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 1.10%
+ // for speed 9 with coding performance change less than 0.04%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.03% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.08%.
+ bool prune_h_pred_using_best_mode_so_far;
+
+ // Enable pruning of intra mode evaluations in nonrd path based on source
+ // variance and best mode so far. The pruning logic is enabled only if the
+ // mode is not a winner mode of both the neighboring blocks (left/top).
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.96%
+ // for speed 9 with coding performance change less than 0.38%.
+ // For AVIF image encode, this speed feature reduces encode time by 3.46% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // -0.06%.
+ bool enable_intra_mode_pruning_using_neighbors;
+
+ // Prune intra mode evaluations in nonrd path based on best sad so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.05%
+ // for speed 9 with coding performance change less than 0.24%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.87% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.16%.
+ bool prune_intra_mode_using_best_sad_so_far;
+
+ // If compound is enabled, and the current block size is >= BLOCK_16X16,
+ // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
+ // base layer of svc.
+ bool check_only_zero_zeromv_on_large_blocks;
+
+ // Allow for disabling cdf update for non reference frames in svc mode.
+ bool disable_cdf_update_non_reference_frame;
+
+ // Prune compound modes if the single modes variances do not perform well.
+ bool prune_compoundmode_with_singlemode_var;
+
+ // Skip searching all compound mode if the variance of single_mode residue is
+ // sufficiently low.
+ bool skip_compound_based_on_var;
+
+ // Sets force_zeromv_skip based on the source sad available. Aggressiveness
+ // increases with increase in the level set for speed feature.
+ // 0: No setting
+ // 1: If source sad is kZeroSad
+ // 2: If source sad <= kVeryLowSad
+ int set_zeromv_skip_based_on_source_sad;
+
+ // Downgrades the block-level subpel motion search to
+ // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel
+ // search performed well, zeromv has low sad or low source_var
+ bool use_adaptive_subpel_search;
+
+ // A flag used in RTC case to control frame_refs_short_signaling. Note that
+ // the final decision is made in check_frame_refs_short_signaling(). The flag
+ // can only be turned on when res < 360p and speed >= 9, in which case only
+ // LAST and GOLDEN ref frames are used now.
+ bool enable_ref_short_signaling;
+
+ // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
+ // case.
+ bool check_globalmv_on_single_ref;
+
+ // Allows for increasing the color_threshold for palette prediction.
+ // This generally leads to better coding efficiency but with some speed loss.
+ // Only used for screen content and for nonrd_pickmode.
+ bool increase_color_thresh_palette;
+} REAL_TIME_SPEED_FEATURES;
+
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data structure.
+ */
+typedef struct SPEED_FEATURES {
+ /*!
+ * Sequence/frame level speed features:
+ */
+ HIGH_LEVEL_SPEED_FEATURES hl_sf;
+
+ /*!
+ * Speed features for the first pass.
+ */
+ FIRST_PASS_SPEED_FEATURES fp_sf;
+
+ /*!
+ * Speed features controlling how the tpl searches are done.
+ */
+ TPL_SPEED_FEATURES tpl_sf;
+
+ /*!
+ * Global motion speed features:
+ */
+ GLOBAL_MOTION_SPEED_FEATURES gm_sf;
+
+ /*!
+ * Partition search speed features:
+ */
+ PARTITION_SPEED_FEATURES part_sf;
+
+ /*!
+ * Motion search speed features:
+ */
+ MV_SPEED_FEATURES mv_sf;
+
+ /*!
+ * Inter mode search speed features:
+ */
+ INTER_MODE_SPEED_FEATURES inter_sf;
+
+ /*!
+ * Interpolation filter search speed features:
+ */
+ INTERP_FILTER_SPEED_FEATURES interp_sf;
+
+ /*!
+ * Intra mode search speed features:
+ */
+ INTRA_MODE_SPEED_FEATURES intra_sf;
+
+ /*!
+ * Transform size/type search speed features:
+ */
+ TX_SPEED_FEATURES tx_sf;
+
+ /*!
+ * RD calculation speed features:
+ */
+ RD_CALC_SPEED_FEATURES rd_sf;
+
+ /*!
+ * Two-pass mode evaluation features:
+ */
+ WINNER_MODE_SPEED_FEATURES winner_mode_sf;
+
+ /*!
+ * In-loop filter speed features:
+ */
+ LOOP_FILTER_SPEED_FEATURES lpf_sf;
+
+ /*!
+ * Real-time mode speed features:
+ */
+ REAL_TIME_SPEED_FEATURES rt_sf;
+} SPEED_FEATURES;
+/*!\cond */
+
+struct AV1_COMP;
+
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting. (Higher speed gives lower
+ * quality)
+ */
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+ int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and frame size. (Higher speed
+ * corresponds to lower quality)
+ */
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+ int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and current frame's Q index.
+ * (Higher speed corresponds to lower quality)
+ */
+void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/superres_scale.c b/third_party/aom/av1/encoder/superres_scale.c
new file mode 100644
index 0000000000..3b47909b15
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Measures how much energy each horizontal frequency band carries in the luma
+// plane of cpi->source, using 16x4 horizontal-DCT transforms over the frame.
+// On return energy[k], k = 1..15, is the per-transform average energy of band
+// k and all bands above it; energy[0] is never written. If the loops visit no
+// block (height <= 4 or width <= 16) every entry is set to 1e+20 instead.
+// The result is used to decide the superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+  const YV12_BUFFER_CONFIG *buf = cpi->source;
+  const int bd = cpi->td.mb.e_mbd.bd;
+  const int width = buf->y_crop_width;
+  const int height = buf->y_crop_height;
+  const int highbd = (buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  // High bitdepth sums are additionally scaled down by 2 * (bd - 8) bits.
+  const int shift = highbd ? 2 + 2 * (bd - 8) : 2;
+  // Only dereferenced on the high bitdepth path.
+  const int16_t *const plane16 =
+      highbd ? (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer) : NULL;
+  uint64_t band_sum[16] = { 0 };
+  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+  DECLARE_ALIGNED(16, int16_t, tile[16 * 4]);
+  int num_tiles = 0;
+
+  if (!highbd) assert(bd == 8);
+  for (int row = 0; row < height - 4; row += 4) {
+    for (int col = 0; col < width - 16; col += 16) {
+      if (highbd) {
+        // Transform straight out of the 16-bit frame buffer.
+        av1_fwd_txfm2d_16x4(plane16 + row * buf->y_stride + col, coeff,
+                            buf->y_stride, H_DCT, bd);
+      } else {
+        // Widen the 8-bit samples into a packed 16x4 scratch tile first.
+        for (int r = 0; r < 4; ++r) {
+          const uint8_t *const src_row =
+              &buf->y_buffer[(row + r) * buf->y_stride + col];
+          for (int c = 0; c < 16; ++c) tile[r * 16 + c] = src_row[c];
+        }
+        av1_fwd_txfm2d_16x4(tile, coeff, 16, H_DCT, bd);
+      }
+      // Index 0 is skipped; sum each band over the four output rows.
+      for (int k = 1; k < 16; ++k) {
+        int64_t sq_sum = 0;
+        for (int r = 0; r < 4; ++r) {
+          sq_sum += (int64_t)coeff[k + 16 * r] * coeff[k + 16 * r];
+        }
+        const uint64_t this_energy = sq_sum;
+        band_sum[k] += ROUND_POWER_OF_TWO(this_energy, shift);
+      }
+      ++num_tiles;
+    }
+  }
+
+  if (num_tiles == 0) {
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+    return;
+  }
+  for (int k = 1; k < 16; ++k) energy[k] = (double)band_sum[k] / num_tiles;
+  // Turn the per-band averages into a cumulative sum from the top band down.
+  for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+}
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Arbitrary initial state for the random-mode generator; kept across calls.
+  static unsigned int seed = 56789;
+  const ResizeCfg *const cfg = &cpi->oxcf.resize_cfg;
+  // No resizing during stats generation or with a reduced still-picture header.
+  if (is_stat_generation_stage(cpi) ||
+      cpi->common.seq_params->reduced_still_picture_hdr) {
+    return SCALE_NUMERATOR;
+  }
+
+  switch (cfg->resize_mode) {
+    case RESIZE_NONE: break;
+    case RESIZE_FIXED:
+      return (cpi->common.current_frame.frame_type == KEY_FRAME)
+                 ? cfg->resize_kf_scale_denominator
+                 : cfg->resize_scale_denominator;
+    case RESIZE_RANDOM: return lcg_rand16(&seed) % 9 + 8;  // 8..16
+    default: assert(0);
+  }
+  return SCALE_NUMERATOR;
+}
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  // Needs AUTO mode with a non-SOLO search type; skipped when frames_to_key
+  // <= 1 (image coding), where it was empirically found not to be beneficial.
+  if (cpi->oxcf.superres_cfg.superres_mode != AOM_SUPERRES_AUTO) return 0;
+  if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_SOLO) return 0;
+  return cpi->rc.frames_to_key > 1;
+}
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc,
+                                      int gf_frame_index) {
+  // Picks the energy / q^2 threshold for the frame at gf_frame_index. Only ARF
+  // and key frames are expected; anything else asserts and yields 0.
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
+  switch (gf_group->update_type[gf_frame_index]) {
+    case ARF_UPDATE: return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+    case KF_UPDATE:
+      return (rc->frames_to_key <= 1)
+                 ? SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO
+                 : SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+    default: assert(0); break;
+  }
+  return 0;
+}
+
+static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
+                                                     double threshq,
+                                                     double threshp) {
+  // The cut-off is the smaller of threshq * q^2 and threshp * energy[1].
+  const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
+  const double q2_limit = threshq * q * q;
+  const double ac_limit = threshp * energy[1];
+  const double cutoff = AOMMIN(q2_limit, ac_limit);
+  // Lower k while energy[k - 1] does not exceed the cut-off.
+  int k = SCALE_NUMERATOR * 2;
+  while (k > SCALE_NUMERATOR && !(energy[k - 1] > cutoff)) --k;
+  return 3 * SCALE_NUMERATOR - k;
+}
+
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+                                             int sr_kf, int sr_arf) {
+  // Use superres for Key-frames and Alt-ref frames only, and only when the
+  // matching flag (sr_kf / sr_arf) is set; all other frames keep the
+  // SCALE_NUMERATOR denominator.
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const int update_type = gf_group->update_type[cpi->gf_frame_index];
+  const int is_kf = (update_type == KF_UPDATE);
+  const int is_arf = (update_type == ARF_UPDATE);
+  // Nothing to do unless this is an enabled key frame or alt-ref frame.
+  if (!(is_kf && sr_kf) && !(is_arf && sr_arf)) return SCALE_NUMERATOR;
+
+  // Derive the denominator from the frame's horizontal frequency energies.
+  double energy[16];
+  analyze_hor_freq(cpi, energy);
+  const double energy_by_q2_thresh =
+      get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index);
+  int denom = get_superres_denom_from_qindex_energy(
+      qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
+
+  // Debugging aid: uncomment to dump the energies and the chosen denominator.
+  /*
+  printf("\nenergy = [");
+  for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+  printf("]\n");
+  printf("boost = %d\n",
+         (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE)
+             ? cpi->ppi->p_rc.kf_boost
+             : cpi->rc.gfu_boost);
+  printf("denom = %d\n", denom);
+  */
+
+  if (av1_superres_in_recode_allowed(cpi)) {
+    assert(cpi->superres_mode != AOM_SUPERRES_NONE);
+    // Force superres to be tried in the recode loop, as full-res is also going
+    // to be tried anyway.
+    denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+  }
+  return denom;
+}
+
+// Chooses the superres scale denominator for the next frame; a result of
+// SCALE_NUMERATOR means superres is not used.
+//  - NONE:    always SCALE_NUMERATOR.
+//  - FIXED:   the configured key-frame / non-key-frame denominator.
+//  - RANDOM:  a pseudo-random value in [8, 16].
+//  - QTHRESH: energy-based denominator once 'q' exceeds a configured threshold.
+//  - AUTO:    like QTHRESH, with thresholds and denominator source selected
+//             by the superres_auto_search_type speed feature.
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+  // Arbitrary initial state for the random-mode generator; kept across calls.
+  static unsigned int seed = 34567;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  // Superres is never applied while stats are being generated.
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+
+  // Make sure that superres mode of the frame is consistent with the
+  // sequence-level flag.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
+                 cpi->common.seq_params->enable_superres));
+  assert(IMPLIES(!cpi->common.seq_params->enable_superres,
+                 superres_cfg->superres_mode == AOM_SUPERRES_NONE));
+  // Make sure that superres mode for current encoding is consistent with user
+  // provided superres mode.
+  assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO,
+                 cpi->superres_mode == superres_cfg->superres_mode));
+
+  // Note: the mode that matters here is the one currently being tried in
+  // 'cpi', not the user given mode in 'oxcf'.
+  const aom_superres_mode mode = cpi->superres_mode;
+
+  // Modes whose denominator does not depend on the frame's 'q' are resolved
+  // right away; QTHRESH and AUTO fall through to the shared 'q' logic below.
+  switch (mode) {
+    case AOM_SUPERRES_NONE: return SCALE_NUMERATOR;
+    case AOM_SUPERRES_FIXED:
+      return (cpi->common.current_frame.frame_type == KEY_FRAME)
+                 ? superres_cfg->superres_kf_scale_denominator
+                 : superres_cfg->superres_scale_denominator;
+    case AOM_SUPERRES_RANDOM: return lcg_rand16(&seed) % 9 + 8;  // 8..16
+    case AOM_SUPERRES_QTHRESH:
+    case AOM_SUPERRES_AUTO: break;
+    default: assert(0); return SCALE_NUMERATOR;
+  }
+
+  // Do not use superres when screen content tools are used.
+  if (cpi->common.features.allow_screen_content_tools) return SCALE_NUMERATOR;
+
+  // For VBR and CQ rate control, refresh the target rate before picking 'q'.
+  if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+    av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+  // Now decide the use of superres based on 'q'.
+  int bottom_index, top_index;
+  const int q = av1_rc_pick_q_and_bounds(cpi, frm_dim_cfg->width,
+                                         frm_dim_cfg->height,
+                                         cpi->gf_frame_index, &bottom_index,
+                                         &top_index);
+
+  if (mode == AOM_SUPERRES_QTHRESH) {
+    // User-configured thresholds, with a separate one for intra-only frames.
+    const int qthresh = frame_is_intra_only(&cpi->common)
+                            ? superres_cfg->superres_kf_qthresh
+                            : superres_cfg->superres_qthresh;
+    if (q <= qthresh) return SCALE_NUMERATOR;  // Don't use superres.
+    return get_superres_denom_for_qindex(cpi, q, 1, 1);
+  }
+
+  // AOM_SUPERRES_AUTO: the threshold and the denominator source depend on the
+  // superres_auto_search_type speed feature.
+  const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
+      cpi->sf.hl_sf.superres_auto_search_type;
+  const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0;
+  if (q <= qthresh) return SCALE_NUMERATOR;  // Don't use superres.
+
+  if (sr_search_type == SUPERRES_AUTO_ALL) {
+    // Use the configured denominators directly.
+    return (cpi->common.current_frame.frame_type == KEY_FRAME)
+               ? superres_cfg->superres_kf_scale_denominator
+               : superres_cfg->superres_scale_denominator;
+  }
+  // Otherwise derive the denominator from 'q' and the frame's energy.
+  return get_superres_denom_for_qindex(cpi, q, 1, 1);
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+  return orig_dim * denom / 2 <= resized_dim * SCALE_NUMERATOR;  // <= 2x total
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+  const int resized_width = rsz->resize_width;
+  (void)oheight;  // Scaling is horizontal only, so the height is not checked.
+  return dimension_is_ok(owidth, resized_width, rsz->superres_denom);
+}
+
+// Resets rsz's resize dimensions to the original size scaled by 'denom'.
+static void rescale_from_original(int owidth, int oheight, int denom,
+                                  size_params_type *rsz) {
+  rsz->resize_width = owidth;
+  rsz->resize_height = oheight;
+  av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, denom);
+}
+
+// Ensures the resize and superres scales in 'rsz' pass dimensions_are_ok(),
+// adjusting whichever of them is in a random mode when they do not. Returns 1
+// if 'rsz' is acceptable on exit and 0 otherwise.
+static int validate_size_scales(RESIZE_MODE resize_mode,
+                                aom_superres_mode superres_mode, int owidth,
+                                int oheight, size_params_type *rsz) {
+  if (dimensions_are_ok(owidth, oheight, rsz)) return 1;  // Nothing to do.
+
+  const int can_alter_resize = (resize_mode == RESIZE_RANDOM);
+  const int can_alter_superres = (superres_mode == AOM_SUPERRES_RANDOM);
+  // Calculate current resize scale.
+  int resize_denom =
+      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+  // We are allowed to alter neither resize scale nor superres scale.
+  if (!can_alter_resize && !can_alter_superres) return 0;
+
+  if (!can_alter_resize) {
+    // Alter superres scale as needed to enforce conformity.
+    rsz->superres_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+    if (!dimensions_are_ok(owidth, oheight, rsz) &&
+        rsz->superres_denom > SCALE_NUMERATOR) {
+      --rsz->superres_denom;
+    }
+  } else if (!can_alter_superres) {
+    // Alter resize scale as needed to enforce conformity.
+    resize_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+    rescale_from_original(owidth, oheight, resize_denom, rsz);
+    if (!dimensions_are_ok(owidth, oheight, rsz) &&
+        resize_denom > SCALE_NUMERATOR) {
+      --resize_denom;
+      rescale_from_original(owidth, oheight, resize_denom, rsz);
+    }
+  } else {
+    // Alter both resize and superres scales as needed to enforce conformity:
+    // step down the larger denominator (superres on ties) each round.
+    do {
+      if (resize_denom > rsz->superres_denom)
+        --resize_denom;
+      else
+        --rsz->superres_denom;
+      rescale_from_original(owidth, oheight, resize_denom, rsz);
+    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+             (resize_denom > SCALE_NUMERATOR ||
+              rsz->superres_denom > SCALE_NUMERATOR));
+  }
+  return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Computes the resize dimensions and superres denominator for the next frame.
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+  ResizePendingParams *const pending = &cpi->resize_pending_params;
+  size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
+                           SCALE_NUMERATOR };
+  // SVC without a stats stage: keep the size already set in cpi->common.
+  if (has_no_stats_stage(cpi) && cpi->ppi->use_svc &&
+      (cpi->common.width != cpi->oxcf.frm_dim_cfg.width ||
+       cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) {
+    rsz.resize_width = cpi->common.width;
+    rsz.resize_height = cpi->common.height;
+    return rsz;
+  }
+  if (is_stat_generation_stage(cpi)) return rsz;
+  if (pending->width && pending->height) {
+    rsz.resize_width = pending->width;
+    rsz.resize_height = pending->height;
+    pending->width = pending->height = 0;
+    if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz;
+  } else {
+    const int denom = calculate_next_resize_scale(cpi);
+    rsz.resize_width = frm_dim_cfg->width;
+    rsz.resize_height = frm_dim_cfg->height;
+    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, denom);
+  }
+  rsz.superres_denom = calculate_next_superres_scale(cpi);
+  // validate_size_scales() may adjust rsz, so it is called outside the assert.
+  if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode,
+                            frm_dim_cfg->width, frm_dim_cfg->height, &rsz))
+    assert(0 && "Invalid scale parameters");
+  return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+                                         const size_params_type *rsz) {
+  AV1_COMMON *const cm = &cpi->common;
+  // The resized dimensions are recorded as the superres-upscaled frame size.
+  cm->superres_upscaled_width = rsz->resize_width;
+  cm->superres_upscaled_height = rsz->resize_height;
+  cm->superres_scale_denominator = rsz->superres_denom;
+  int coded_width = rsz->resize_width;
+  int coded_height = rsz->resize_height;
+  av1_calculate_scaled_superres_size(&coded_width, &coded_height,
+                                     rsz->superres_denom);
+  av1_set_frame_size(cpi, coded_width, coded_height);
+}
+
+void av1_setup_frame_size(AV1_COMP *cpi) {
+  // Reset superres params from the previous frame before computing and
+  // applying this frame's resize / superres parameters.
+  cpi->common.superres_scale_denominator = SCALE_NUMERATOR;
+  const size_params_type next_size = calculate_next_size_params(cpi);
+  setup_frame_size_from_params(cpi, &next_size);
+
+  assert(av1_is_min_tile_width_satisfied(&cpi->common));
+}
+
+void av1_superres_post_encode(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Only valid when superres is enabled and the encode is not lossless.
+  assert(cpi->oxcf.superres_cfg.enable_superres);
+  assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
+  assert(!cm->features.all_lossless);
+
+  av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels);
+
+  if (av1_resize_scaled(cm)) {
+    // Regular resizing is also in effect, so the source has to be downscaled
+    // to match the upscaled superres resolution. cm->(width|height) has
+    // already been updated by av1_superres_upscale.
+    assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+    assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+    cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width,
+                                           cm->superres_upscaled_height);
+    return;
+  }
+  // No resizing: go back to the original, unscaled source frames.
+  cpi->source = cpi->unscaled_source;
+  if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+}
diff --git a/third_party/aom/av1/encoder/superres_scale.h b/third_party/aom/av1/encoder/superres_scale.h
new file mode 100644
index 0000000000..450a4ed902
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi);
+void av1_superres_post_encode(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_
diff --git a/third_party/aom/av1/encoder/svc_layercontext.c b/third_party/aom/av1/encoder/svc_layercontext.c
new file mode 100644
index 0000000000..2c99cb89b8
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+
+// Exchanges the two pointer values stored at the locations `a` and `b`.
+// Each argument is the address of a pointer variable.
+static void swap_ptr(void *a, void *b) {
+ void **const first = (void **)a;
+ void **const second = (void **)b;
+ void *const saved = *first;
+ *first = *second;
+ *second = saved;
+}
+
+// Puts the SVC state and the rate control state of every (spatial, temporal)
+// layer context into its initial configuration.
+void av1_init_layer_context(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SVC *const svc = &cpi->svc;
+ const int map_rows = cm->mi_params.mi_rows;
+ const int map_cols = cm->mi_params.mi_cols;
+
+ // Stream-level SVC defaults.
+ svc->base_framerate = 30.0;
+ svc->current_superframe = 0;
+ svc->force_zero_mode_spatial_ref = 1;
+ svc->num_encoded_top_layer = 0;
+ svc->use_flexible_mode = 0;
+ svc->has_lower_quality_layer = 0;
+
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int idx = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[idx];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+
+ // Running statistics start from zero; the average q starts at the worst
+ // allowed value.
+ lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
+ lrc->ni_tot_qi = 0;
+ lp_rc->total_actual_bits = 0;
+ lp_rc->tot_q = 0.0;
+ lp_rc->avg_q = 0.0;
+ lp_rc->ni_frames = 0;
+ lrc->decimation_count = 0;
+ lrc->decimation_factor = 0;
+ lrc->rtc_external_ratectrl = 0;
+
+ // Per-layer quantizer range, converted to qindex.
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ for (int level = 0; level < RATE_FACTOR_LEVELS; ++level) {
+ lp_rc->rate_correction_factors[level] = 1.0;
+ }
+ lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+
+ // Bandwidth and the buffer state derived from it.
+ lc->target_bandwidth = lc->layer_target_bitrate;
+ lp_rc->buffer_level =
+ oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
+ lp_rc->bits_off_target = lp_rc->buffer_level;
+
+ // Initialize the cyclic refresh parameters. If spatial layers are used
+ // (i.e., ss_number_layers > 1), these need to be updated per spatial
+ // layer. Cyclic refresh is only applied on base temporal layer.
+ if (tl == 0 && svc->number_spatial_layers > 1) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(map_rows * map_cols, sizeof(*lc->map)));
+ }
+ }
+ // Per-spatial-layer defaults.
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8;
+ svc->last_layer_dropped[sl] = false;
+ svc->drop_spatial_layer[sl] = false;
+ }
+ // With exactly three spatial layers the lowest one uses the smooth filter.
+ if (svc->number_spatial_layers == 3) {
+ svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
+ }
+}
+
+// Makes sure svc->layer_context can hold `num_layers` entries. When the array
+// is missing or too small, the old array is freed and a new zero-initialized
+// one is allocated (previous contents are not carried over).
+// Returns false if the allocation fails, true otherwise.
+bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) {
+ SVC *const svc = &cpi->svc;
+ const bool large_enough =
+ svc->layer_context != NULL && svc->num_allocated_layers >= num_layers;
+ if (large_enough) return true;
+
+ assert(num_layers > 1);
+ aom_free(svc->layer_context);
+ svc->num_allocated_layers = 0;
+ svc->layer_context =
+ (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
+ if (svc->layer_context == NULL) return false;
+ svc->num_allocated_layers = num_layers;
+ return true;
+}
+
+// Update the layer context from a change_config() call.
+// For every layer this refreshes the target bandwidth from the layer's
+// configured bitrate, rescales the buffer levels by the layer's share of
+// `target_bandwidth`, recomputes framerate and frame bandwidth, and
+// (re)allocates the per-layer cyclic refresh map when required.
+void av1_update_layer_context_change_config(AV1_COMP *const cpi,
+ const int64_t target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ int layer = 0;
+ int64_t spatial_layer_target = 0;
+ float bitrate_alloc = 1.0;
+ const int mi_rows = cm->mi_params.mi_rows;
+ const int mi_cols = cm->mi_params.mi_cols;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ // First pass: copy the configured bitrate of each temporal layer into its
+ // target bandwidth.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate;
+ }
+ // `layer` still holds the index from the last iteration above, i.e. the
+ // highest temporal layer of spatial layer sl.
+ spatial_layer_target = svc->layer_context[layer].target_bandwidth;
+ // Second pass: derive the rate control settings of each temporal layer.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ LAYER_CONTEXT *const lc =
+ &svc->layer_context[sl * svc->number_temporal_layers + tl];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ lc->spatial_layer_target_bandwidth = spatial_layer_target;
+ // Share of the total bandwidth given to this layer. When
+ // target_bandwidth is 0 the previously computed value (initially 1.0)
+ // is reused.
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
+ lp_rc->starting_buffer_level =
+ (int64_t)(p_rc->starting_buffer_level * bitrate_alloc);
+ lp_rc->optimal_buffer_level =
+ (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc);
+ lp_rc->maximum_buffer_size =
+ (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc);
+ // Clamp the current buffer state to the new maximum buffer size.
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level =
+ AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size);
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth =
+ (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+ lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl;
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ // With external QP in one-pass mode the encoder-level quality range
+ // overrides the per-layer one.
+ if (rc->use_external_qp_one_pass) {
+ lrc->worst_quality = rc->worst_quality;
+ lrc->best_quality = rc->best_quality;
+ }
+ // Reset the cyclic refresh parameters, if needed (map is NULL),
+ // or number of spatial layers has changed.
+ // Cyclic refresh is only applied on base temporal layer.
+ if (svc->number_spatial_layers > 1 && tl == 0 &&
+ (lc->map == NULL ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers)) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+ }
+ }
+ }
+}
+
+/*!\brief Look up the layer context of the layer currently being encoded.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Pointer to the LAYER_CONTEXT selected by the current spatial and
+ * temporal layer ids.
+ */
+static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ return &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+}
+
+// Recomputes the framerate and the per-frame bandwidth figures of the current
+// layer context from cpi->framerate and the layer's framerate_factor.
+void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ RATE_CONTROL *const lrc = &lc->rc;
+ const int tl = svc->temporal_layer_id;
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+ // Update the average layer frame size (non-cumulative per-frame-bw).
+ if (tl == 0) {
+ lc->avg_frame_size = lrc->avg_frame_bandwidth;
+ } else {
+ // Context of the temporal layer directly below, same spatial layer.
+ int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id - 1;
+ LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer];
+ const double prev_layer_framerate =
+ cpi->framerate / lcprev->framerate_factor;
+ const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate;
+ // Only derive the incremental frame size when this layer really adds
+ // frames on top of the previous one. With an equal framerate the divisor
+ // is zero and converting the resulting infinity/NaN to int is undefined;
+ // in that case (and for a lower framerate) the value is left unchanged.
+ if (lc->framerate > prev_layer_framerate) {
+ lc->avg_frame_size =
+ (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+ (lc->framerate - prev_layer_framerate));
+ }
+ }
+}
+
+// Returns true when the buffer slot mapped to `ref_frame` was last refreshed
+// during the current superframe and by a lower spatial layer.
+static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame(
+ int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) {
+ const int slot = rtc_ref->ref_idx[ref_frame - 1];
+ if (rtc_ref->buffer_time_index[slot] != svc->current_superframe) {
+ return false;
+ }
+ return rtc_ref->buffer_spatial_layer[slot] <= svc->spatial_layer_id - 1;
+}
+
+// Loads the saved state of the current layer (rate control, target
+// bandwidth, motion search range and, where applicable, cyclic refresh data)
+// into the encoder, then recomputes the skip_mvsearch_* flags.
+void av1_restore_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ // Stream-level values that must survive the struct copy of cpi->rc below.
+ const int old_frame_since_key = cpi->rc.frames_since_key;
+ const int old_frame_to_key = cpi->rc.frames_to_key;
+ const int max_consec_drop = cpi->rc.max_consec_drop;
+ // Restore layer rate control.
+ cpi->rc = lc->rc;
+ cpi->ppi->p_rc = lc->p_rc;
+ cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
+ cpi->gf_frame_index = 0;
+ cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
+ // No magnitude stored for this layer yet: fall back to the larger frame
+ // dimension.
+ if (cpi->mv_search_params.max_mv_magnitude == 0)
+ cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
+ // Reset the frames_since_key and frames_to_key counters to their values
+ // before the layer restore. Keep these defined for the stream (not layer).
+ cpi->rc.frames_since_key = old_frame_since_key;
+ cpi->rc.frames_to_key = old_frame_to_key;
+ // Reset to value before the layer restore.
+ cpi->rc.max_consec_drop = max_consec_drop;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // The maps are exchanged rather than copied.
+ swap_ptr(&cr->map, &lc->map);
+ cr->sb_index = lc->sb_index;
+ cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+ cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+ cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
+ }
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
+ // For each reference (LAST/GOLDEN/ALTREF) set the matching skip_mvsearch_*
+ // flag.
+ // This is to skip searching mv for that reference if it was last
+ // refreshed (i.e., buffer slot holding that reference was refreshed) on the
+ // previous spatial layer(s) at the same time (current_superframe).
+ if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_last = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_gf = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_altref = 1;
+ }
+ }
+}
+
+// Records, for every reference buffer slot refreshed by the current frame,
+// the time index of the refresh (superframe count when SVC is used, frame
+// number otherwise) and the spatial layer id.
+void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const unsigned int time_index =
+ cpi->ppi->use_svc ? svc->current_superframe
+ : cpi->common.current_frame.frame_number;
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ // A key frame refreshes every slot.
+ for (unsigned int slot = 0; slot < REF_FRAMES; slot++) {
+ rtc_ref->buffer_time_index[slot] = time_index;
+ rtc_ref->buffer_spatial_layer[slot] = svc->spatial_layer_id;
+ }
+ return;
+ }
+ if (!rtc_ref->set_ref_frame_config) return;
+ // Otherwise only the slots flagged for refresh are updated.
+ for (unsigned int ref = 0; ref < INTER_REFS_PER_FRAME; ref++) {
+ const int slot = rtc_ref->ref_idx[ref];
+ if (!rtc_ref->refresh[slot]) continue;
+ rtc_ref->buffer_time_index[slot] = time_index;
+ rtc_ref->buffer_spatial_layer[slot] = svc->spatial_layer_id;
+ }
+}
+
+// Stores the encoder's current rate control state and related per-layer
+// values into the context of the current layer, then updates the SVC
+// bookkeeping: refreshed buffer slots, the layer ids per frame buffer, and,
+// after the top spatial layer, the superframe counter and drop flags.
+void av1_save_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *lc = get_layer_context(cpi);
+ lc->rc = cpi->rc;
+ lc->p_rc = cpi->ppi->p_rc;
+ // lc->target_bandwidth is int64_t: store the value directly instead of
+ // narrowing it through int first.
+ lc->target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth;
+ lc->group_index = cpi->gf_frame_index;
+ lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
+ if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Exchange the maps, mirroring av1_restore_layer_context().
+ swap_ptr(&cr->map, &lc->map);
+ lc->sb_index = cr->sb_index;
+ lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+ lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+ lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+ }
+ av1_svc_update_buffer_slot_refreshed(cpi);
+ // Remember which layer last wrote each frame buffer that this frame
+ // refreshes (all of them for intra-only frames).
+ for (unsigned int i = 0; i < REF_FRAMES; i++) {
+ if (frame_is_intra_only(cm) ||
+ cm->current_frame.refresh_frame_flags & (1 << i)) {
+ svc->spatial_layer_fb[i] = svc->spatial_layer_id;
+ svc->temporal_layer_fb[i] = svc->temporal_layer_id;
+ }
+ }
+ // The top spatial layer closes the superframe.
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->current_superframe++;
+ // Reset drop flag to false for next superframe.
+ for (int sl = 0; sl < svc->number_spatial_layers; sl++)
+ svc->drop_spatial_layer[sl] = false;
+ }
+}
+
+// Chooses the primary reference frame for the current frame. Returns the
+// reference's offset from LAST_FRAME, or PRIMARY_REF_NONE when no reference
+// qualifies.
+int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
+ const SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ if (svc->number_spatial_layers > 1 || svc->number_temporal_layers > 1) {
+ // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
+ // was last updated on a lower temporal layer (or base TL0) and for the
+ // same spatial layer. For RTC patterns this allows for continued decoding
+ // when set of enhancement layers are dropped (continued decoding starting
+ // at next base TL0), so error_resilience can be off/0 for all layers.
+ const int last_slot = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (svc->spatial_layer_fb[last_slot] == svc->spatial_layer_id &&
+ (svc->temporal_layer_fb[last_slot] < svc->temporal_layer_id ||
+ svc->temporal_layer_fb[last_slot] == 0)) {
+ return 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ }
+ return PRIMARY_REF_NONE;
+ }
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config) return PRIMARY_REF_NONE;
+
+ // Single layer with an explicit reference configuration: take the first
+ // enabled reference in the order LAST, GOLDEN, ALTREF.
+ const int enabled_refs = cpi->ext_flags.ref_frame_flags;
+ if (enabled_refs & AOM_LAST_FLAG) return 0; // LAST_FRAME - LAST_FRAME
+ if (enabled_refs & AOM_GOLD_FLAG) return GOLDEN_FRAME - LAST_FRAME;
+ if (enabled_refs & AOM_ALT_FLAG) return ALTREF_FRAME - LAST_FRAME;
+ return PRIMARY_REF_NONE;
+}
+
+// Frees the map of every layer context and clears the pointers.
+void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int idx = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const ctx = &svc->layer_context[idx];
+ aom_free(ctx->map);
+ ctx->map = NULL;
+ }
+ }
+}
+
+// When `is_key` is non-zero, zeroes frames_from_key_frame in every layer
+// context. In all cases the framerate data of the current layer is then
+// refreshed and its context restored.
+void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
+ SVC *const svc = &cpi->svc;
+ if (is_key) {
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ LAYER_CONTEXT *const ctx =
+ &svc->layer_context[sl * svc->number_temporal_layers + tl];
+ ctx->frames_from_key_frame = 0;
+ }
+ }
+ }
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+}
+
+// Computes the resolution of a layer as the original size scaled by num/den,
+// rounded up to even dimensions.
+//
+// width_org, height_org: original dimensions.
+// num, den: scaling factor numerator and denominator.
+// width_out, height_out: receive the result.
+//
+// Nothing is written when either output pointer is NULL or den is 0. A 1/1
+// factor returns the original dimensions unchanged (no even rounding).
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out) {
+ int w, h;
+ if (width_out == NULL || height_out == NULL || den == 0) return;
+ if (den == 1 && num == 1) {
+ *width_out = width_org;
+ *height_out = height_org;
+ return;
+ }
+ // Multiply in 64 bits so that dimension * num cannot overflow int.
+ w = (int)((int64_t)width_org * num / den);
+ h = (int)((int64_t)height_org * num / den);
+ // Make height and width even.
+ w += w % 2;
+ h += h % 2;
+ *width_out = w;
+ *height_out = h;
+}
+
+// Prepares the encoder for the current layer: sets the lower-quality-layer
+// flag, derives the layer resolution from its scaling factors and updates the
+// frame-size dependent state.
+void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *const lc =
+ &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ int layer_width = 0;
+ int layer_height = 0;
+
+ // Set the lower quality layer flag: raised when the spatial layer below
+ // (same temporal layer) is coded with a 1/1 scaling factor.
+ svc->has_lower_quality_layer = 0;
+ if (svc->spatial_layer_id > 0) {
+ const LAYER_CONTEXT *const below =
+ &svc->layer_context[(svc->spatial_layer_id - 1) *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if (below->scaling_factor_den == 1 && below->scaling_factor_num == 1) {
+ svc->has_lower_quality_layer = 1;
+ }
+ }
+
+ av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &layer_width, &layer_height);
+ // Use EIGHTTAP_SMOOTH for low resolutions.
+ if (layer_width * layer_height <= 320 * 240) {
+ svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+ }
+
+ cm->width = layer_width;
+ cm->height = layer_height;
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+ // Remember the mi dimensions of the top (full resolution) spatial layer.
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->mi_cols_full_resoln = cm->mi_params.mi_cols;
+ svc->mi_rows_full_resoln = cm->mi_params.mi_rows;
+ }
+}
+
+// Positions of the seven inter references inside the rtc_ref->ref_idx[] and
+// rtc_ref->reference[] arrays, as used by av1_set_svc_fixed_mode().
+enum {
+ SVC_LAST_FRAME = 0,
+ SVC_LAST2_FRAME,
+ SVC_LAST3_FRAME,
+ SVC_GOLDEN_FRAME,
+ SVC_BWDREF_FRAME,
+ SVC_ALTREF2_FRAME,
+ SVC_ALTREF_FRAME
+};
+
+// For fixed svc mode: fixed pattern is set based on the number of
+// spatial and temporal layers, and the ksvc_fixed_mode.
+// Fills rtc_ref->ref_idx[] (buffer slot per reference), rtc_ref->reference[]
+// (which references are used) and rtc_ref->refresh[] (which slots the frame
+// updates) for the current spatial/temporal layer and superframe count.
+void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int i;
+ assert(svc->use_flexible_mode == 0);
+ // Fixed SVC mode only supports at most 3 spatial or temporal layers.
+ assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
+ svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
+ rtc_ref->set_ref_frame_config = 1;
+ int superframe_cnt = svc->current_superframe;
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ // Defaults: reference i maps to slot i, nothing referenced or refreshed.
+ // These remain in effect when none of the branches below applies.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0;
+ // Always reference LAST, and reference GOLDEN on SL > 0.
+ // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
+ // when frame_type is set.
+ rtc_ref->reference[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1;
+ if (svc->temporal_layer_id == 0) {
+ // Base temporal layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set all buffer_idx to 0. Update slot 0 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->refresh[0] = 1;
+ } else if (svc->spatial_layer_id == 1) {
+ // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
+ // slot 0. Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ rtc_ref->refresh[1] = 1;
+ } else if (svc->spatial_layer_id == 2) {
+ // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
+ // slot 1. Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ rtc_ref->refresh[2] = 1;
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer (superframe count 1, 5, 9, ...).
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ // The GOLDEN/slot 3 part only applies below the top spatial layer.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and Update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ }
+ } else if (svc->temporal_layer_id == 1) {
+ // Middle temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 5 and update slot 5.
+ // The slot 5 update is skipped when this is both the top temporal and
+ // the top spatial layer.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ rtc_ref->refresh[5] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 5.
+ // Set LAST3 to slot 6 and update slot 6.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
+ rtc_ref->refresh[6] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 6.
+ // Set LAST3 to slot 7 and update slot 7.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7;
+ rtc_ref->refresh[7] = 1;
+ }
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer (superframe count 3, 7, 11, ...).
+ if (svc->spatial_layer_id == 0) {
+ // Set LAST to slot 5 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 5;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 6;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+ // GOLDEN to slot 4. No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 7;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ }
+ }
+}
+
+// For each spatial layer, compares the current and the previous average
+// frame bandwidth. When the current value is more than 1.5x or less than 0.5x
+// the previous one, the rate control state of every temporal layer of that
+// spatial layer is reset: rc_1_frame/rc_2_frame are cleared and the buffer
+// levels are set to the optimal buffer level.
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+ // If avg_frame_bandwidth for top temporal layer is not set
+ // (because enhancement layer was inactive), use the base TL0
+ int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ int avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) {
+ // Use base TL0.
+ layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers);
+ lc = &svc->layer_context[layer];
+ lrc = &lc->rc;
+ avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ }
+ // Widen before multiplying so that 3 * prev cannot overflow int.
+ const int64_t upper_limit = (3 * (int64_t)prev_avg_frame_bandwidth) >> 1;
+ if (avg_frame_bandwidth > upper_limit ||
+ avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+ RATE_CONTROL *lrc2 = &lc2->rc;
+ PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc;
+ lrc2->rc_1_frame = 0;
+ lrc2->rc_2_frame = 0;
+ lp_rc2->bits_off_target = lp_rc2->optimal_buffer_level;
+ lp_rc2->buffer_level = lp_rc2->optimal_buffer_level;
+ }
+ }
+ }
+}
+
+// Selects frame_input->last_source for the frame about to be encoded.
+// The default is `prev_source` (which may be NULL); in the cases below it is
+// replaced by the stored source of the last TL0 frame (svc.source_last_TL0),
+// or by NULL for a spatial enhancement layer of the very first superframe.
+void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source) {
+ frame_input->last_source = prev_source;
+ if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped &&
+ cpi->rc.frame_number_encoded > 0) {
+ // Non-SVC: the previous frame was dropped.
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ } else {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (cpi->svc.spatial_layer_id == 0) {
+ // For base spatial layer: if the LAST reference (index 0) is not
+ // the previous (super)frame set the last_source to the source
+ // corresponding to the last TL0, otherwise keep it at prev_source.
+ // Always use source_last_TL0 if previous base TL0 was dropped.
+ if (cpi->svc.current_superframe > 0) {
+ const int buffslot_last = rtc_ref->ref_idx[0];
+ // Check if previous frame was dropped on base TL0 layer.
+ const int layer =
+ LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->prev_frame_is_dropped ||
+ rtc_ref->buffer_time_index[buffslot_last] <
+ cpi->svc.current_superframe - 1) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ }
+ }
+ } else if (cpi->svc.spatial_layer_id > 0) {
+ // For spatial enhancement layers: the previous source (prev_source)
+ // corresponds to the lower spatial layer (which is the same source so
+ // we can't use that), so always set the last_source to the source of the
+ // last TL0.
+ if (cpi->svc.current_superframe > 0)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ else
+ frame_input->last_source = NULL;
+ }
+ }
+}
+
+// Returns the smallest distance between the current frame (superframe count
+// when SVC is used, frame number otherwise) and the time index of any buffer
+// slot backing a reference marked as used. Returns INT_MAX when no reference
+// is marked as used.
+int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const unsigned int now = cpi->ppi->use_svc
+ ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ int closest = INT_MAX;
+ for (unsigned int ref = 0; ref < INTER_REFS_PER_FRAME; ref++) {
+ if (!rtc_ref->reference[ref]) continue;
+ const int slot = rtc_ref->ref_idx[ref];
+ const int dist = now - rtc_ref->buffer_time_index[slot];
+ if (dist < closest) closest = dist;
+ }
+ return closest;
+}
+
+// Sets rtc_ref->reference_was_previous_frame: true when at least one used
+// reference points to a buffer slot refreshed on the previous
+// (super)frame. For the very first (super)frame the flag is set to true.
+void av1_svc_set_reference_was_previous(AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const unsigned int now = cpi->ppi->use_svc
+ ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ if (now == 0) {
+ rtc_ref->reference_was_previous_frame = true;
+ return;
+ }
+ // Check if the encoded frame had some reference that was the
+ // previous frame.
+ rtc_ref->reference_was_previous_frame = false;
+ for (unsigned int ref = 0; ref < INTER_REFS_PER_FRAME; ref++) {
+ if (!rtc_ref->reference[ref]) continue;
+ const int slot = rtc_ref->ref_idx[ref];
+ if (rtc_ref->buffer_time_index[slot] == now - 1) {
+ rtc_ref->reference_was_previous_frame = true;
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/svc_layercontext.h b/third_party/aom/av1/encoder/svc_layercontext.h
new file mode 100644
index 0000000000..93118be2d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The structure of quantities related to each spatial and temporal layer.
+ * \ingroup SVC
+ */
+typedef struct {
+ /*!\cond */
+ RATE_CONTROL rc;
+ PRIMARY_RATE_CONTROL p_rc;
+ int framerate_factor;
+ int64_t layer_target_bitrate; // In bits per second.
+ int scaling_factor_num;
+ int scaling_factor_den;
+ int64_t target_bandwidth;
+ int64_t spatial_layer_target_bandwidth;
+ double framerate;
+ int avg_frame_size;
+ int max_q;
+ int min_q;
+ int frames_from_key_frame;
+ /*!\endcond */
+
+ /*!
+ * Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+ */
+ int sb_index;
+ /*!
+ * Segmentation map
+ */
+ int8_t *map;
+ /*!
+ * Number of blocks on segment 1
+ */
+ int actual_num_seg1_blocks;
+
+ /*!
+ * Number of blocks on segment 2
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * Counter used to detect scene change.
+ */
+ int counter_encode_maxq_scene_change;
+
+ /*!
+ * Speed settings for each layer.
+ */
+ uint8_t speed;
+ /*!
+ * GF group index.
+ */
+ unsigned char group_index;
+ /*!
+ * If current layer is key frame.
+ */
+ int is_key_frame;
+ /*!
+ * Maximum motion magnitude of previous encoded layer.
+ */
+ int max_mv_magnitude;
+} LAYER_CONTEXT;
+
+/*!
+ * \brief The structure of SVC.
+ * \ingroup SVC
+ */
+typedef struct SVC {
+ /*!\cond */
+ int spatial_layer_id;
+ int temporal_layer_id;
+ int number_spatial_layers;
+ int number_temporal_layers;
+ int prev_number_spatial_layers;
+ int use_flexible_mode;
+ int ksvc_fixed_mode;
+ /*!\endcond */
+
+ /*!\cond */
+ double base_framerate;
+ unsigned int current_superframe;
+ int skip_mvsearch_last;
+ int skip_mvsearch_gf;
+ int skip_mvsearch_altref;
+ int spatial_layer_fb[REF_FRAMES];
+ int temporal_layer_fb[REF_FRAMES];
+ int num_encoded_top_layer;
+ int first_layer_denoise;
+ YV12_BUFFER_CONFIG source_last_TL0;
+ int mi_cols_full_resoln;
+ int mi_rows_full_resoln;
+ /*!\endcond */
+
+ /*!
+ * Layer context used for rate control in CBR mode.
+ * An array. The index for spatial layer `sl` and temporal layer `tl` is
+ * sl * number_temporal_layers + tl.
+ */
+ LAYER_CONTEXT *layer_context;
+
+ /*!
+ * Number of layers allocated for layer_context. If nonzero, must be greater
+ * than or equal to number_spatial_layers * number_temporal_layers.
+ */
+ int num_allocated_layers;
+
+ /*!
+ * EIGHTTAP_SMOOTH or BILINEAR
+ */
+ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+ * = 8 will center the target pixel and get a symmetric averaging filter.
+ */
+ int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Force zero-mv in mode search for the spatial/inter-layer reference.
+ */
+ int force_zero_mode_spatial_ref;
+
+ /*!
+ * Flag to indicate that current spatial layer has a lower quality layer
+ * (at the same timestamp) that can be used as a reference.
+ * Lower quality layer refers to the same resolution but encoded at
+ * different/lower bitrate.
+ */
+ int has_lower_quality_layer;
+
+ /*!
+ * Flag to indicate the frame drop mode for SVC: one of the two settings:
+ * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP.
+ */
+ AOM_SVC_FRAME_DROP_MODE framedrop_mode;
+
+ /*!
+ * Flag to indicate if frame was dropped for a given spatial_layer_id on
+ * previous superframe.
+ */
+ bool last_layer_dropped[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Flag to indicate if a previous spatial layer was dropped for the same
+ * superframe.
+ */
+ bool drop_spatial_layer[AOM_MAX_SS_LAYERS];
+} SVC;
+
+struct AV1_COMP;
+struct EncodeFrameInput;
+
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Set cpi->svc.
+ */
+void av1_init_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Allocate layer context data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] num_layers Number of layers to be allocated
+ *
+ * \remark Allocates memory for cpi->svc.layer_context.
+ * \return True on success, false on allocation failure.
+ */
+bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers);
+
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] target_bandwidth Total target bandwidth
+ *
+ * \remark Nothing returned. Buffer level for each layer is set.
+ */
+void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
+ const int64_t target_bandwidth);
+
+/*!\brief Prior to encoding the frame, update framerate-related quantities
+ for the current temporal layer.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Frame related quantities for current temporal
+ layer are updated.
+ */
+void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
+
+/*!\brief Prior to encoding the frame, set the layer context, for the current
+ layer to be encoded, to the cpi struct.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Layer context for current layer is set.
+ */
+void av1_restore_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Save the layer context after encoding the frame.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_save_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Free the memory used for cyclic refresh in layer context.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
+
+/*!\brief Reset on key frame: reset counters, references and buffer updates.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] is_key Whether current layer is key frame
+ */
+void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
+
+/*!\brief Before encoding, set resolutions and allocate compressor data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
+
+/*!\brief Get primary reference frame for current layer
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return The primary reference frame for current layer.
+ */
+int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
+
+/*!\brief Get resolution for current layer.
+ *
+ * \ingroup SVC
+ * \param[in] width_org Original width, unscaled
+ * \param[in] height_org Original height, unscaled
+ * \param[in] num Numerator for the scale ratio
+ * \param[in] den Denominator for the scale ratio
+ * \param[out] width_out Output width, scaled for current layer
+ * \param[out] height_out Output height, scaled for current layer
+ *
+ * \remark Nothing is returned. Instead the scaled width and height are set.
+ */
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
+void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
+
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
+void av1_svc_set_last_source(struct AV1_COMP *const cpi,
+ struct EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source);
+
+void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi);
+
+int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi);
+
+void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..7d4d25de6a
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+
+/*!\cond */
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses);
+
+// Scans the luma samples of one block in MI_SIZE steps, evaluates
+// av1_calc_normalized_variance() with the BLOCK_4X4 variance function at each
+// position, and reports log1p(v / 16.0) for the smallest and the largest value
+// seen through `blk_4x4_var_min` and `blk_4x4_var_max` respectively.
+static INLINE void get_log_var_4x4sub_blk(
+    AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+    int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+    double *blk_4x4_var_max, int is_hbd) {
+  const int blk_h = block_size_high[block_size];
+  const int blk_w = block_size_wide[block_size];
+  const int stride = frame_to_filter->y_stride;
+
+  // Top-left luma sample of the block inside the frame.
+  const int blk_offset = mb_row * blk_h * stride + mb_col * blk_w;
+  const uint8_t *const blk_start = frame_to_filter->y_buffer + blk_offset;
+
+  int lowest = INT_MAX;
+  int highest = 0;
+  for (int row = 0; row < blk_h; row += MI_SIZE) {
+    const uint8_t *const row_start = blk_start + (row * stride);
+    for (int col = 0; col < blk_w; col += MI_SIZE) {
+      const int sub_var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf, row_start + col, stride, is_hbd);
+
+      // Track the extremes over the whole block.
+      if (sub_var < lowest) lowest = sub_var;
+      if (sub_var > highest) highest = sub_var;
+    }
+  }
+
+  *blk_4x4_var_min = log1p(lowest / 16.0);
+  *blk_4x4_var_max = log1p(highest / 16.0);
+}
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ * the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame to find out the most similar block as that from the frame
+ * to be filtered. This found block will be further used for weighted
+ * averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ * also do motion search for each 1/4 sub-block to get more precise
+ * predictions. Then, this function will determine whether to use 4
+ * sub-blocks to replace the entire block. If we do need to split the
+ * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ * the searched motion vector and search error (MSE) w.r.t. each sub-block
+ * respectively. Otherwise, the 4 elements will be the same, all of which
+ * are assigned as the searched motion vector and search error (MSE) for
+ * the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] mb Pointer to macroblock
+ * \param[in] frame_to_filter Pointer to the frame to be filtered
+ * \param[in] ref_frame Pointer to the reference frame
+ * \param[in] block_size Block size used for motion search
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
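+ * \param[in,out] ref_mv Reference motion vector, which is commonly
+ * inherited from the motion search result of
+ * previous frame. It is overwritten with the
+ * refined block vector after the sub-pel search
+ * and reset to zero when the block error is too
+ * large.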
+ * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at
+ * 16x16 sub-block level is needed or not.
+ * \param[out] subblock_mvs Pointer to the motion vectors for
+ * 4 sub-blocks
+ * \param[out] subblock_mses Pointer to the search errors (MSE) for
+ * 4 sub-blocks
+ *
+ * \remark Nothing will be returned. Results are saved in subblock_mvs and
+ * subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+ const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, MV *ref_mv,
+ bool allow_me_for_sub_blks, MV *subblock_mvs,
+ int *subblock_mses) {
+ // Overview: run a full-pel search for the whole block, then either evaluate
+ // the variance at that integer position (force_integer_mv) or refine it with
+ // a sub-pel search and, optionally, repeat both searches for each of the 4
+ // sub-blocks. The source/reference buffer pointers of `mb` are redirected
+ // while searching and restored before returning.
+ // Frame information
+ const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int y_stride = frame_to_filter->y_stride;
+ const int src_width = frame_to_filter->y_width;
+ const int ref_width = ref_frame->y_width;
+ // The same y_offset is applied to both frames below, so they must share the
+ // same stride and width.
+ assert(y_stride == ref_frame->y_stride);
+ assert(src_width == ref_width);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ const int step_param = av1_init_search_range(
+ AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
+ const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
+ const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
+ // The motion vector cost model is picked from the smaller frame dimension.
+ const MV_COST_TYPE mv_cost_type =
+ min_frame_size >= 720
+ ? MV_COST_L1_HDRES
+ : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);
+
+ // Starting position for motion search.
+ FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mb->plane[0].src.width = src_width;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+ mbd->plane[0].pre[0].width = ref_width;
+
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ av1_get_search_site_config(cpi, mb, search_method);
+
+ // Unused intermediate results for motion search.
+ unsigned int sse, error;
+ int distortion;
+ int cost_list[5];
+
+ // Do motion search.
+ int_mv best_mv; // Searched motion vector.
+ FULLPEL_MV_STATS best_mv_stats;
+ int block_mse = INT_MAX;
+ MV block_mv = kZeroMv;
+ const int q = av1_get_q(cpi);
+
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, start_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+ &best_mv_stats, NULL);
+
+ if (force_integer_mv == 1) { // Only do full search on the entire block.
+ // NOTE(review): best_mv was written through `as_fullmv` above and is read
+ // through `as_mv` here; presumably int_mv is a union so both views alias
+ // the same row/col — confirm against its definition.
+ const int mv_row = best_mv.as_mv.row;
+ const int mv_col = best_mv.as_mv.col;
+ best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
+ best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
+ const int mv_offset = mv_row * y_stride + mv_col;
+ error = cpi->ppi->fn_ptr[block_size].vf(
+ ref_frame->y_buffer + y_offset + mv_offset, y_stride,
+ frame_to_filter->y_buffer + y_offset, y_stride, &sse);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ // NOTE(review): this path writes neither `*ref_mv` nor the sub-block
+ // arrays. If allow_me_for_sub_blks is true, the partition decision below
+ // reads whatever the caller left in `subblock_mses` — confirm callers
+ // pre-initialize both arrays.
+ } else { // Do fractional search on the entire block and all sub-blocks.
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we don't
+ // need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv.as_mv, &distortion, &sse, NULL);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ // Hand the refined vector back to the caller; it also seeds the sub-block
+ // searches below.
+ *ref_mv = best_mv.as_mv;
+
+ if (allow_me_for_sub_blks) {
+ // On 4 sub-blocks.
+ const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+ const int subblock_pels = subblock_height * subblock_width;
+ start_mv = get_fullmv_from_mv(ref_mv);
+
+ // Sub-blocks are visited row by row, so subblock_idx follows row-major
+ // order.
+ int subblock_idx = 0;
+ for (int i = 0; i < mb_height; i += subblock_height) {
+ for (int j = 0; j < mb_width; j += subblock_width) {
+ const int offset = i * y_stride + j;
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
+ search_site_cfg, search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we
+ // don't need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(
+ av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+ &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
+ subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+ subblock_mvs[subblock_idx] = best_mv.as_mv;
+ ++subblock_idx;
+ }
+ }
+ }
+ }
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+
+ // Make partition decision.
+ if (allow_me_for_sub_blks) {
+ tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+ subblock_mses);
+ } else {
+ // Copy 32X32 block mv and mse values to sub blocks
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+ // Do not pass down the reference motion vector if error is too large.
+ // The threshold is scaled up by one bit per bit of depth above 8.
+ const int thresh = (min_frame_size >= 720) ? 12 : 3;
+ if (block_mse > (thresh << (mbd->bd - 8))) {
+ *ref_mv = kZeroMv;
+ }
+}
+/*!\cond */
+
+// Decides between filtering the block as a whole or as 4 independent
+// sub-blocks by comparing the whole-block search error with the errors of the
+// sub-blocks.
+// Inputs:
+//   block_mv: Motion vector found for the whole block.
+//   block_mse: Search error (MSE) of the whole block.
+//   subblock_mvs: Motion vectors of the 4 sub-blocks (in/out).
+//   subblock_mses: Search errors (MSE) of the 4 sub-blocks (in/out).
+// Returns:
+//   Nothing. When the "no split" decision is taken, all 4 entries of
+//   `subblock_mvs` and `subblock_mses` are overwritten with the whole-block
+//   values; otherwise both arrays are left untouched.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses) {
+  int lowest = INT_MAX;
+  int highest = INT_MIN;
+  int64_t total = 0;
+  for (int k = 0; k < 4; ++k) {
+    const int mse = subblock_mses[k];
+    total += mse;
+    if (mse < lowest) lowest = mse;
+    if (mse > highest) highest = mse;
+  }
+
+  // TODO(any): The following magic numbers may be tuned to improve the
+  // performance OR find a way to get rid of these magic numbers.
+  const int64_t scaled_total = total * 4;
+  int keep_whole_block = 0;
+  if (block_mse * 15 < scaled_total) {
+    // Loose error-ratio test paired with the looser spread limit.
+    keep_whole_block = (highest - lowest < 48);
+  }
+  if (!keep_whole_block && block_mse * 14 < scaled_total) {
+    // Stricter error-ratio test paired with the tighter spread limit.
+    keep_whole_block = (highest - lowest < 24);
+  }
+  if (!keep_whole_block) return;
+
+  // No split: every sub-block inherits the whole-block result.
+  for (int k = 0; k < 4; ++k) {
+    subblock_mvs[k] = block_mv;
+    subblock_mses[k] = block_mse;
+  }
+}
+
+// Returns 1 when the frame buffer has YV12_FLAG_HIGHBITDEPTH set in its flags,
+// and 0 otherwise.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+}
+
+/*!\endcond */
+/*!\brief Builds predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (motion vector is
+ * set as 0 for the frame to be filtered), and will be further used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] ref_frame Pointer to the reference frame (or the frame
+ * to be filtered)
+ * \param[in] mbd Pointer to the block for filtering. Besides
+ * containing the subsampling information of all
+ * planes, this field also gives the searched
+ * motion vector for the entire block, i.e.,
+ * `mbd->mi[0]->mv[0]`. This vector should be 0
+ * if the `ref_frame` itself is the frame to be
+ * filtered.
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] scale Scaling factor
+ * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
+ * order)
+ * \param[out] pred Pointer to the predictor to be built
+ *
+ * \remark Nothing returned, But the contents of `pred` will be modified
+ */
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+                               const MACROBLOCKD *mbd,
+                               const BLOCK_SIZE block_size, const int mb_row,
+                               const int mb_col, const int num_planes,
+                               const struct scale_factors *scale,
+                               const MV *subblock_mvs, uint8_t *pred) {
+  // Geometry of the whole block, in luma samples.
+  const int blk_h = block_size_high[block_size];
+  const int blk_w = block_size_wide[block_size];
+  const int blk_top = blk_h * mb_row;
+  const int blk_left = blk_w * mb_col;
+
+  const int bd = mbd->bd;
+  const int use_hbd = is_frame_high_bitdepth(ref_frame);
+  const int intrabc = 0;  // Intra block copy is not used here.
+
+  // The same interpolation filter is used for every plane and sub-block.
+  const int_interpfilters filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP2);
+
+  // Planes are stored back to back in `pred`; this is the start of the
+  // current one.
+  int pred_base = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int ss_x = mbd->plane[plane].subsampling_x;
+    const int ss_y = mbd->plane[plane].subsampling_y;
+
+    // Block size and position within this plane.
+    const int pl_h = blk_h >> ss_y;
+    const int pl_w = blk_w >> ss_x;
+    const int pl_top = blk_top >> ss_y;
+    const int pl_left = blk_left >> ss_x;
+
+    // Each sub-block covers half of the block in both directions.
+    const int sub_h = pl_h >> 1;
+    const int sub_w = pl_w >> 1;
+
+    // Index 0 holds the luma dimensions; every other plane uses index 1.
+    const int dim_idx = (plane == 0) ? 0 : 1;
+    const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+                                    ref_frame->widths[dim_idx],
+                                    ref_frame->heights[dim_idx],
+                                    ref_frame->strides[dim_idx] };
+
+    // Sub-blocks are visited in row-major order, matching `subblock_mvs`.
+    int mv_idx = 0;
+    for (int row = 0; row < pl_h; row += sub_h) {
+      for (int col = 0; col < pl_w; col += sub_w) {
+        const MV mv = subblock_mvs[mv_idx];
+        ++mv_idx;
+        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+               mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+        // Build the predictor for this sub-block on the current plane.
+        InterPredParams params;
+        av1_init_inter_params(&params, sub_w, sub_h, pl_top + row,
+                              pl_left + col, ss_x, ss_y, bd, use_hbd, intrabc,
+                              scale, &ref_buf, filters);
+        params.conv_params = get_conv_params(0, plane, bd);
+
+        uint8_t *const dst = &pred[pred_base + row * pl_w + col];
+        av1_enc_build_one_inter_predictor(dst, pl_w, &mv, &params);
+      }
+    }
+    pred_base += pl_h * pl_w;
+  }
+}
+/*!\cond */
+
+// Adds the frame to be filtered itself to the temporal filter accumulators.
+// Every pixel of the block contributes with the same weight, TF_WEIGHT_SCALE.
+// Inputs:
+//   ref_frame: Frame whose pixels are accumulated.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes as well as the bit-depth.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the frame.
+//   mb_col: Column index of the block in the frame.
+//   num_planes: Number of planes in the frame.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `count`
+//   point will be modified.
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+                                   const MACROBLOCKD *mbd,
+                                   const BLOCK_SIZE block_size,
+                                   const int mb_row, const int mb_col,
+                                   const int num_planes, uint32_t *accum,
+                                   uint16_t *count) {
+  const int blk_h = block_size_high[block_size];
+  const int blk_w = block_size_wide[block_size];
+  const int use_hbd = is_cur_buf_hbd(mbd);
+
+  // Planes are stored back to back in `accum`/`count`; this is the start of
+  // the current one.
+  int plane_base = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int pl_h = blk_h >> mbd->plane[plane].subsampling_y;
+    const int pl_w = blk_w >> mbd->plane[plane].subsampling_x;
+
+    const int stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+    const uint8_t *src8 = ref_frame->buffers[plane];
+    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
+    // Position of the block's top-left pixel inside the plane.
+    const int blk_start = mb_row * pl_h * stride + mb_col * pl_w;
+
+    for (int row = 0; row < pl_h; ++row) {
+      const int src_row = blk_start + row * stride;
+      const int dst_row = plane_base + row * pl_w;
+      for (int col = 0; col < pl_w; ++col) {
+        int pixel;
+        if (use_hbd) {
+          pixel = src16[src_row + col];
+        } else {
+          pixel = src8[src_row + col];
+        }
+        accum[dst_row + col] += TF_WEIGHT_SCALE * pixel;
+        count[dst_row + col] += TF_WEIGHT_SCALE;
+      }
+    }
+    plane_base += pl_h * pl_w;
+  }
+}
+
+// Computes the per-pixel squared difference between two blocks.
+// Inputs:
+//   ref: Pointer to reference buffer.
+//   ref_offset: Start position of reference buffer for computation.
+//   ref_stride: Stride for reference buffer.
+//   tgt: Pointer to target buffer.
+//   tgt_offset: Start position of target buffer for computation.
+//   tgt_stride: Stride for target buffer.
+//   height: Height of block for computation.
+//   width: Width of block for computation.
+//   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences, stored densely
+//                (row stride equal to `width`).
+// Returns:
+//   Nothing will be returned. But the content to which `square_diff` points
+//   will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+                                       const int ref_stride, const uint8_t *tgt,
+                                       const int tgt_offset,
+                                       const int tgt_stride, const int height,
+                                       const int width,
+                                       const int is_high_bitdepth,
+                                       uint32_t *square_diff) {
+  const uint16_t *ref_hbd = CONVERT_TO_SHORTPTR(ref);
+  const uint16_t *tgt_hbd = CONVERT_TO_SHORTPTR(tgt);
+
+  for (int row = 0; row < height; ++row) {
+    const int ref_row = ref_offset + row * ref_stride;
+    const int tgt_row = tgt_offset + row * tgt_stride;
+    uint32_t *const out_row = square_diff + row * width;
+    for (int col = 0; col < width; ++col) {
+      uint16_t a;
+      uint16_t b;
+      if (is_high_bitdepth) {
+        a = ref_hbd[ref_row + col];
+        b = tgt_hbd[tgt_row + col];
+      } else {
+        a = ref[ref_row + col];
+        b = tgt[tgt_row + col];
+      }
+      // Take the absolute difference first so the square is computed on an
+      // unsigned 32-bit value.
+      const uint32_t gap = (a > b) ? (a - b) : (b - a);
+      out_row[col] = gap * gap;
+    }
+  }
+}
+
+// Adds, for every chroma position of a block, the luma squared differences of
+// the luma pixels that map onto it. The result is consumed while filtering the
+// chroma planes.
+// Inputs:
+//   square_diff: Pointer to squared differences from luma plane, stored with
+//                a row width of `block_width << ss_x_shift`.
+//   luma_sse_sum: Pointer to the per-chroma-pixel sums. Values are ADDED to
+//                 the existing content, so the buffer is not cleared here.
+//   block_height: Height of block for computation.
+//   block_width: Width of block for computation.
+//   ss_x_shift: Chroma subsampling shift in 'X' direction
+//   ss_y_shift: Chroma subsampling shift in 'Y' direction
+// Returns:
+//   Nothing will be returned. But the content to which `luma_sse_sum` points
+//   will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+                               int block_height, int block_width,
+                               int ss_x_shift, int ss_y_shift) {
+  const int luma_w = block_width << ss_x_shift;  // Width of Y-plane.
+  const int cover_h = 1 << ss_y_shift;  // Luma rows per chroma pixel.
+  const int cover_w = 1 << ss_x_shift;  // Luma columns per chroma pixel.
+
+  for (int row = 0; row < block_height; ++row) {
+    for (int col = 0; col < block_width; ++col) {
+      uint32_t *const dst = &luma_sse_sum[row * block_width + col];
+      // Top-left luma position covered by this chroma pixel.
+      const int luma_y = row << ss_y_shift;
+      const int luma_x = col << ss_x_shift;
+      for (int dy = 0; dy < cover_h; ++dy) {
+        for (int dx = 0; dx < cover_w; ++dx) {
+          *dst += square_diff[(luma_y + dy) * luma_w + (luma_x + dx)];
+        }
+      }
+    }
+  }
+}
+
+/*!\endcond */
+/*!\brief Applies temporal filtering. NOTE that there are various optimised
+ * versions of this function called where the appropriate instruction set is
+ * supported.
+ *
+ * For every pixel of every plane, a weight is derived from the squared
+ * difference between the frame to filter and the predictor (summed over a
+ * TF_WINDOW_LENGTH x TF_WINDOW_LENGTH window), the sub-block search error, the
+ * sub-block motion vector length, and per-plane decay factors. The weighted
+ * predictor value is added to `accum` and the weight to `count`.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
+ * used as reference to compute squared
+ * difference from the predictor.
+ * \param[in] mbd Pointer to the block for filtering, ONLY used
+ * to get subsampling information for the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] noise_levels Estimated noise levels for each plane
+ * in the frame (Y,U,V)
+ * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
+ * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
+ * sub-blocks
+ * \param[in] q_factor Quantization factor. This is actually the `q`
+ * defined in libaom, converted from `qindex`
+ * \param[in] filter_strength Filtering strength. This value lies in range
+ * [0, 6] where 6 is the maximum strength.
+ * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during
+ * temporal filtering: 0 uses exp(), any other
+ * value uses approx_exp() with rounding
+ * \param[in] pred Pointer to the well-built predictors (planes
+ * stored back to back, each with stride equal
+ * to its width); only read here
+ * \param[out] accum Pointer to the pixel-wise accumulator for
+ * filtering
+ * \param[out] count Pointer to the pixel-wise counter for
+ * filtering
+ *
+ * \remark Nothing returned, but the contents of `accum` and `count`
+ * will be modified (`pred` is only read)
+ */
+void av1_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Decay factors for non-local mean approach.
+ double decay_factor[MAX_MB_PLANE] = { 0 };
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ for (int plane = 0; plane < num_planes; plane++) {
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+ }
+ // Per-sub-block penalty for long motion vectors; never smaller than 1.
+ double d_factor[4] = { 0 };
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Allocate memory for pixel-wise squared differences. They,
+ // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+ uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+ if (!square_diff) {
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ // NOTE(review): `square_diff` is dereferenced unconditionally from here on,
+ // so this relies on aom_internal_error() not returning - confirm.
+ memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
+
+ // Allocate memory for accumulated luma squared error. This value will be
+ // consumed while filtering the chroma planes.
+ uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+ if (!luma_sse_sum) {
+ aom_free(square_diff);
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
+
+ // Get window size for pixel-wise filtering.
+ assert(TF_WINDOW_LENGTH % 2 == 1);
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ // Handle planes in sequence.
+ // `plane_offset` is the index of the current plane's first sample inside
+ // `pred`, `accum` and `count`, which store the planes back to back.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Locate pixel on reference frame.
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+ const int frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+ const uint8_t *ref = frame_to_filter->buffers[plane];
+ // Subsampling of this plane relative to the luma plane.
+ const int ss_y_shift =
+ subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int ss_x_shift =
+ subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ // Number of squared differences summed per pixel: the full window on this
+ // plane plus, for non-luma planes, the co-located luma samples.
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane will
+ // be more accurate. The luma sse sum is reused in both chroma planes.
+ // This must run before `square_diff` is overwritten with the current
+ // plane's differences just below.
+ if (plane == AOM_PLANE_U)
+ compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
+ ss_y_shift);
+ // Squared difference between the frame to filter and the predictor of this
+ // plane; the predictor is read with a stride of `w`.
+ compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
+ h, w, is_high_bitdepth, square_diff);
+
+ // Perform filtering.
+ int pred_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ // non-local mean approach
+ uint64_t sum_square_diff = 0;
+
+ // Window coordinates are clamped to the block, so border pixels reuse
+ // the nearest in-block samples.
+ for (int wi = -half_window; wi <= half_window; ++wi) {
+ for (int wj = -half_window; wj <= half_window; ++wj) {
+ const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane.
+ const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane.
+ sum_square_diff += square_diff[y * w + x];
+ }
+ }
+
+ // `luma_sse_sum` is zero-initialized and only filled when
+ // plane == AOM_PLANE_U, so it contributes nothing before that plane.
+ sum_square_diff += luma_sse_sum[i * w + j];
+
+ // Scale down the difference for high bit depth input.
+ if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
+
+ // Combine window error and block error, and normalize it.
+ const double window_error = sum_square_diff * inv_num_ref_pixels;
+ // Quadrant of the block containing (i, j): 0 = top-left, 1 = top-right,
+ // 2 = bottom-left, 3 = bottom-right.
+ const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor[plane];
+ scaled_error = AOMMIN(scaled_error, 7);
+ int weight;
+ if (tf_wgt_calc_lvl == 0) {
+ weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ } else {
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ weight = iroundpf(fweight);
+ }
+
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+ accum[idx] += weight * pred_value;
+ count[idx] += weight;
+
+ ++pred_idx;
+ }
+ }
+ plane_offset += h * w;
+ }
+
+ aom_free(square_diff);
+ aom_free(luma_sse_sum);
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth entry point of the C temporal filter. It has the same
+// signature as av1_apply_temporal_filter_c() and forwards every argument to
+// it unchanged; it adds no processing of its own.
+void av1_highbd_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+ num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Turns the accumulated weighted sums into the filtered block: every
+ * accumulator entry is divided by its weight count (with rounding) and the
+ * quotient is stored in the result frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   mbd            Pointer to the block for filtering, which is
+ *                             ONLY used to get subsampling information for
+ *                             all the planes
+ * \param[in]   block_size     Size of the block
+ * \param[in]   mb_row         Row index of the block in the frame
+ * \param[in]   mb_col         Column index of the block in the frame
+ * \param[in]   num_planes     Number of planes in the frame
+ * \param[in]   accum          Pointer to the pre-computed accumulator
+ * \param[in]   count          Pointer to the pre-computed count
+ * \param[out]  result_buffer  Pointer to result buffer
+ *
+ * \remark Nothing returned, but the pixels of `result_buffer` covered by this
+ *         block will be overwritten
+ */
+static void tf_normalize_filtered_frame(
+    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+    const int mb_col, const int num_planes, const uint32_t *accum,
+    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+  const int luma_w = block_size_wide[block_size];
+  const int luma_h = block_size_high[block_size];
+
+  // `accum` and `count` store the planes back to back; `block_base` is the
+  // index of the first entry belonging to the current plane.
+  int block_base = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int rows = luma_h >> mbd->plane[plane].subsampling_y;
+    const int cols = luma_w >> mbd->plane[plane].subsampling_x;
+    const int dst_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+    // Index of the block's top-left pixel inside the frame plane.
+    const int dst_origin = mb_row * rows * dst_stride + mb_col * cols;
+    uint8_t *const dst8 = result_buffer->buffers[plane];
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);
+
+    for (int r = 0; r < rows; ++r) {
+      const int src_row = block_base + r * cols;
+      const int dst_row = dst_origin + r * dst_stride;
+      for (int c = 0; c < cols; ++c) {
+        const int src = src_row + c;
+        const int dst = dst_row + c;
+        // Half of the divisor, added for round-to-nearest.
+        const uint16_t rounding = count[src] >> 1;
+        if (is_high_bitdepth) {
+          dst16[dst] = (uint16_t)OD_DIVU(accum[src] + rounding, count[src]);
+        } else {
+          dst8[dst] = (uint8_t)OD_DIVU(accum[src] + rounding, count[src]);
+        }
+      }
+    }
+    block_base += rows * cols;
+  }
+}
+
+// Returns the quantizer `q`, truncated to int, that av1_convert_qindex_to_q()
+// yields for the average frame qindex recorded for the frame type of the
+// GF-group entry at `cpi->gf_frame_index`, at the sequence bit depth.
+int av1_get_q(const AV1_COMP *cpi) {
+  const FRAME_TYPE frame_type =
+      cpi->ppi->gf_group.frame_type[cpi->gf_frame_index];
+  return (int)av1_convert_qindex_to_q(
+      cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+      cpi->common.seq_params->bit_depth);
+}
+
+// Performs temporal filtering for one row of blocks (`mb_row`).
+// For every block of the row it: clears the accumulators in `td->tf_data`,
+// walks all frames in `cpi->tf_ctx.frames` (motion search + weighted
+// accumulation for reference frames, self-accumulation for the frame being
+// filtered), writes the normalized result into `cpi->tf_ctx.output_frame`, and
+// - if `compute_frame_diff` is set in the context - adds the block's SSE
+// statistics to `td->tf_data.diff`.
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ const int num_frames = tf_ctx->num_frames;
+ const int filter_frame_idx = tf_ctx->filter_frame_idx;
+ const int compute_frame_diff = tf_ctx->compute_frame_diff;
+ const struct scale_factors *scale = &tf_ctx->sf;
+ const double *noise_levels = tf_ctx->noise_levels;
+ const int num_pels = tf_ctx->num_pels;
+ const int q_factor = tf_ctx->q_factor;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+ MACROBLOCK *const mb = &td->mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ TemporalFilterData *const tf_data = &td->tf_data;
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mi_h = mi_size_high_log2[block_size];
+ const int mi_w = mi_size_wide_log2[block_size];
+ const int num_planes = av1_num_planes(&cpi->common);
+ const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
+ uint32_t *accum = tf_data->accum;
+ uint16_t *count = tf_data->count;
+ uint8_t *pred = tf_data->pred;
+
+ // Factor to control the filtering strength.
+ const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
+
+ // Do filtering.
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ // Row limits depend only on `mb_row`, so they are set once for the whole row.
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ // Start every block with empty accumulators.
+ memset(accum, 0, num_pels * sizeof(accum[0]));
+ memset(count, 0, num_pels * sizeof(count[0]));
+ MV ref_mv = kZeroMv; // Reference motion vector passed down along frames.
+ // Perform temporal filtering frame by frame.
+
+ // Decide whether to perform motion search at 16x16 sub-block level or not
+ // based on 4x4 sub-blocks source variance. Allow motion search for split
+ // partition only if the difference between max and min source variance of
+ // 4x4 blocks is greater than a threshold (which is derived empirically).
+ bool allow_me_for_sub_blks = true;
+ if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+ const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double blk_4x4_var_min = DBL_MAX;
+ double blk_4x4_var_max = 0;
+ get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col,
+ TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max,
+ is_hbd);
+ // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the
+ // threshold for high bit depth.
+ if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0)
+ allow_me_for_sub_blks = false;
+ }
+
+ for (int frame = 0; frame < num_frames; frame++) {
+ if (frames[frame] == NULL) continue;
+
+ // Motion search.
+ MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+ int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ if (frame ==
+ filter_frame_idx) { // Frame to be filtered.
+ // Change ref_mv sign for following frames.
+ ref_mv.row *= -1;
+ ref_mv.col *= -1;
+ } else { // Other reference frames.
+ tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
+ mb_row, mb_col, &ref_mv, allow_me_for_sub_blks,
+ subblock_mvs, subblock_mses);
+ }
+
+ // Perform weighted averaging.
+ if (frame == filter_frame_idx) { // Frame to be filtered.
+ tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
+ mb_col, num_planes, accum, count);
+ } else { // Other reference frames.
+ tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+ num_planes, scale, subblock_mvs, pred);
+
+ // All variants of av1_apply_temporal_filter() contain floating point
+ // operations.
+ // NOTE(review): this comment used to continue "Hence, clear the system
+ // state", but no such call is made here - confirm whether one is still
+ // required.
+
+ // TODO(any): avx2/sse2 version should be changed to align with C
+ // function before using. In particular, current avx2/sse2 function
+ // only supports 32x32 block size and 5x5 filtering window.
+ // Dispatch: the run-time selected (possibly SIMD) function is used only
+ // for 32x32 blocks with a 5x5 window; everything else, including high
+ // bit-depth frames in builds without CONFIG_AV1_HIGHBITDEPTH, goes to
+ // the C implementation.
+ if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_highbd_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+#if CONFIG_AV1_HIGHBITDEPTH
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ // for 8-bit
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ }
+ }
+ }
+ }
+ // accum / count -> filtered pixels of this block in the output frame.
+ tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+ accum, count, tf_ctx->output_frame);
+
+ if (compute_frame_diff) {
+ // Luma-only comparison between the source block and the filtered block.
+ const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+ const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+ const int source_y_stride = frame_to_filter->y_stride;
+ const int filter_y_stride = tf_ctx->output_frame->y_stride;
+ const int source_offset =
+ mb_row * y_height * source_y_stride + mb_col * y_width;
+ const int filter_offset =
+ mb_row * y_height * filter_y_stride + mb_col * y_width;
+ unsigned int sse = 0;
+ // Only the `sse` output of the variance function is used; its return
+ // value is ignored.
+ cpi->ppi->fn_ptr[block_size].vf(
+ frame_to_filter->y_buffer + source_offset, source_y_stride,
+ tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+ &sse);
+ // `sum` accumulates the per-block SSE, `sse` accumulates its square
+ // (widened to 64 bits before multiplying).
+ diff->sum += sse;
+ diff->sse += sse * (int64_t)sse;
+ }
+ }
+}
+
+/*!\brief Runs temporal filtering over every block row of the frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in]   cpi   Top level encoder instance structure
+ *
+ * \remark Nothing will be returned. The rows are processed one after another
+ *         by av1_tf_do_filtering_row() using `cpi->td`; the macroblockd state
+ *         captured by tf_save_state() beforehand is handed back to
+ *         tf_restore_state() afterwards.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+  TemporalFilterCtx *const ctx = &cpi->tf_ctx;
+  ThreadData *const thread_data = &cpi->td;
+  MACROBLOCKD *const xd = &thread_data->mb.e_mbd;
+
+  const int num_planes = av1_num_planes(&cpi->common);
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Keep what tf_save_state() hands out so it can be given back at the end,
+  // then point the macroblockd at the temporal-filter data.
+  MB_MODE_INFO **saved_mode_info;
+  uint8_t *saved_buffers[MAX_MB_PLANE];
+  tf_save_state(xd, &saved_mode_info, saved_buffers, num_planes);
+  tf_setup_macroblockd(xd, &thread_data->tf_data, &ctx->sf);
+
+  // One call per block row, top to bottom.
+  int mb_row = 0;
+  while (mb_row < ctx->mb_rows) {
+    av1_tf_do_filtering_row(cpi, thread_data, mb_row);
+    ++mb_row;
+  }
+
+  tf_restore_state(xd, saved_mode_info, saved_buffers, num_planes);
+}
+
+/*!\brief Sets up the frame buffer for temporal filtering. This function
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] filter_frame_lookahead_idx The index of the to-filter frame
+ * in the lookahead buffer cpi->lookahead
+ * \param[in] gf_frame_index GOP index
+ *
+ * \remark Nothing will be returned. But the fields `frames`, `num_frames`,
+ * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ * The source planes and block planes of `cpi->td.mb` are also set up
+ * for the to-filter frame.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+ int filter_frame_lookahead_idx,
+ int gf_frame_index) {
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+ const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
+ const int is_forward_keyframe =
+ av1_gop_check_forward_keyframe(gf_group, gf_frame_index);
+
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
+ // temporal filtering.
+ int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+ int num_before = 0; // Number of filtering frames before the to-filter frame.
+ int num_after = 0; // Number of filtering frames after the to-filter frame.
+ const int lookahead_depth =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+ // Temporal filtering should not go beyond key frames
+ const int key_to_curframe =
+ AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
+ const int curframe_to_key =
+ AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);
+
+ // Number of buffered frames before the to-filter frame.
+ int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+
+ // Number of buffered frames after the to-filter frame.
+ int max_after =
+ AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
+
+ // Estimate noises for each plane.
+ const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
+ assert(to_filter_buf != NULL);
+ const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
+ const int num_planes = av1_num_planes(&cpi->common);
+ double *noise_levels = tf_ctx->noise_levels;
+ av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y,
+ num_planes - 1, cpi->common.seq_params->bit_depth,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ // Get quantization factor.
+ const int q = av1_get_q(cpi);
+ // Get correlation estimates from first-pass;
+ // The stats pointer is moved back by one entry when frames_since_key is 0.
+ const FIRSTPASS_STATS *stats =
+ cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
+ // Geometric means of the (floored at 0.001) correlation coefficients of the
+ // frames before (coeff0) and after (coeff1) the to-filter frame.
+ double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
+ for (int i = 1; i <= max_after; i++) {
+ // Shrink `max_after` if the stats buffer ends before it is reached.
+ if (stats + filter_frame_lookahead_idx + i >=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+ max_after = i - 1;
+ break;
+ }
+ accu_coeff1 *=
+ AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
+ }
+ if (max_after >= 1) {
+ accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
+ }
+ for (int i = 1; i <= max_before; i++) {
+ // Shrink `max_before` if the stats buffer starts after it is reached.
+ if (stats + filter_frame_lookahead_idx - i + 1 <=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
+ max_before = i - 1;
+ break;
+ }
+ accu_coeff0 *=
+ AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
+ }
+ if (max_before >= 1) {
+ accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
+ }
+
+ // Adjust number of filtering frames based on quantization factor. When the
+ // quantization factor is small enough (lossless compression), we will not
+ // change the number of frames for key frame filtering, which is to avoid
+ // visual quality drop.
+ int adjust_num = 6;
+ const int adjust_num_frames_for_arf_filtering =
+ cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering;
+ if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering.
+ adjust_num = 0;
+ } else if ((update_type == KF_UPDATE) && q <= 10) {
+ adjust_num = 0;
+ } else if (adjust_num_frames_for_arf_filtering > 0 &&
+ update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) {
+ // Since screen content detection happens after temporal filtering,
+ // 'frames_since_key' check is added to ensure the sf is disabled for the
+ // first alt-ref frame.
+ // Adjust number of frames to be considered for filtering based on noise
+ // level of the current frame. For low-noise frame, use more frames to
+ // filter such that the filtered frame can provide better predictions for
+ // subsequent frames and vice versa.
+ // Rows are selected by the speed feature value (1 or 2), columns by the
+ // luma noise band: < 0.5, < 1.0, otherwise.
+ // NOTE(review): the table has only two rows - confirm the speed feature
+ // never exceeds 2.
+ const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 },
+ { 4, 2, 0 } };
+ const uint8_t *adjust_num_frames =
+ av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1];
+
+ if (noise_levels[AOM_PLANE_Y] < 0.5)
+ adjust_num = adjust_num_frames[0];
+ else if (noise_levels[AOM_PLANE_Y] < 1.0)
+ adjust_num = adjust_num_frames[1];
+ else
+ adjust_num = adjust_num_frames[2];
+ }
+ num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
+
+ if (frame_type == KEY_FRAME) {
+ // Key frames: frames before the key frame are used only for forward key
+ // frames; the rest of the budget goes to the frames after it.
+ num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);
+ num_after = AOMMIN(num_frames - 1, max_after);
+ } else {
+ int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
+ &cpi->ppi->p_rc, &cpi->frame_info,
+ filter_frame_lookahead_idx, max_before,
+ max_after, NULL, NULL, 0);
+
+ num_frames = AOMMIN(num_frames, gfu_boost / 150);
+ num_frames += !(num_frames & 1); // Make the number odd.
+
+ // Only use 2 neighbours for the second ARF.
+ if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
+ if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+ // just use half half
+ num_before = num_frames / 2;
+ num_after = num_frames / 2;
+ } else {
+ // One side is short of frames: take everything available there and give
+ // the remaining budget to the other side.
+ if (max_after < num_frames / 2) {
+ num_after = max_after;
+ num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+ } else {
+ num_before = max_before;
+ num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+ }
+ // Adjust asymmetry based on frame-level correlation
+ // The longer side may exceed the shorter one by at most `insym` frames;
+ // `insym` grows as the accumulated correlation approaches 1.
+ if (max_after > 0 && max_before > 0) {
+ if (num_after < num_before) {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+ num_before = AOMMIN(num_before, num_after + insym);
+ } else {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+ num_after = AOMMIN(num_after, num_before + insym);
+ }
+ }
+ }
+ }
+ num_frames = num_before + 1 + num_after;
+
+ // Setup the frame buffer.
+ // frames[num_before] is the to-filter frame; earlier entries precede it in
+ // the lookahead buffer, later entries follow it.
+ for (int frame = 0; frame < num_frames; ++frame) {
+ const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
+ assert(buf != NULL);
+ frames[frame] = &buf->img;
+ }
+ tf_ctx->num_frames = num_frames;
+ tf_ctx->filter_frame_idx = num_before;
+ assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
+
+ av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+ cpi->common.seq_params->sb_size);
+ av1_setup_block_planes(&cpi->td.mb.e_mbd,
+ cpi->common.seq_params->subsampling_x,
+ cpi->common.seq_params->subsampling_y, num_planes);
+}
+
+/*!\cond */
+
+// Estimates the noise level of one 8-bit plane.
+// Every interior pixel whose Sobel gradient magnitude (|Gx| + |Gy|) is below
+// `edge_thresh` is treated as smooth; the absolute Laplacian response of the
+// smooth pixels is averaged and scaled to produce the estimate.
+// Inputs:
+//   src: Pointer to the top-left sample of the plane.
+//   height, width: Plane dimensions in samples.
+//   stride: Distance, in samples, between vertically adjacent samples.
+//   edge_thresh: Gradient magnitude at or above which a pixel is skipped.
+// Returns:
+//   The estimated noise level, or -1.0 when fewer than 16 smooth pixels were
+//   found (unreliable estimation).
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+                                              int width, int stride,
+                                              int edge_thresh) {
+  int64_t laplacian_sum = 0;
+  int smooth_count = 0;
+
+  // The outermost rows and columns are skipped so that every visited pixel has
+  // a complete 3x3 neighbourhood.
+  for (int row = 1; row < height - 1; ++row) {
+    for (int col = 1; col < width - 1; ++col) {
+      const int mid = row * stride + col;
+      const int up = mid - stride;
+      const int down = mid + stride;
+
+      // The 3x3 neighbourhood, named by compass direction.
+      const int nw = src[up - 1];
+      const int nn = src[up];
+      const int ne = src[up + 1];
+      const int ww = src[mid - 1];
+      const int cc = src[mid];
+      const int ee = src[mid + 1];
+      const int sw = src[down - 1];
+      const int ss = src[down];
+      const int se = src[down + 1];
+
+      // Sobel gradients.
+      const int Gx = (nw - ne) + (sw - se) + 2 * (ww - ee);
+      const int Gy = (nw - sw) + (ne - se) + 2 * (nn - ss);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+      if (Ga >= edge_thresh) continue;  // Only smooth pixels are counted.
+
+      // Laplacian response.
+      const int v = 4 * cc - 2 * (nn + ss + ww + ee) + (nw + ne + sw + se);
+      laplacian_sum += ROUND_POWER_OF_TWO(abs(v), 0);
+      ++smooth_count;
+    }
+  }
+
+  // Too few smooth pixels: the estimate is not reliable.
+  if (smooth_count < 16) return -1.0;
+  return (double)laplacian_sum / (6 * smooth_count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Estimates the noise level of one plane stored as 16-bit samples.
+// Every interior pixel whose Sobel gradient magnitude (|Gx| + |Gy|), rounded
+// down by (bit_depth - 8) bits, is below `edge_thresh` is treated as smooth;
+// the absolute Laplacian response of the smooth pixels (rounded down by the
+// same amount) is averaged and scaled to produce the estimate.
+// Inputs:
+//   src16: Pointer to the top-left sample of the plane.
+//   height, width: Plane dimensions in samples.
+//   stride: Distance, in samples, between vertically adjacent samples.
+//   bit_depth: Bit depth of the samples; used as the shift (bit_depth - 8).
+//   edge_thresh: Gradient magnitude at or above which a pixel is skipped.
+// Returns:
+//   The estimated noise level, or -1.0 when fewer than 16 smooth pixels were
+//   found (unreliable estimation).
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+                                                     int height, int width,
+                                                     const int stride,
+                                                     int bit_depth,
+                                                     int edge_thresh) {
+  int64_t laplacian_sum = 0;
+  int smooth_count = 0;
+
+  // The outermost rows and columns are skipped so that every visited pixel has
+  // a complete 3x3 neighbourhood.
+  for (int row = 1; row < height - 1; ++row) {
+    for (int col = 1; col < width - 1; ++col) {
+      const int mid = row * stride + col;
+      const int up = mid - stride;
+      const int down = mid + stride;
+
+      // The 3x3 neighbourhood, named by compass direction.
+      const int nw = src16[up - 1];
+      const int nn = src16[up];
+      const int ne = src16[up + 1];
+      const int ww = src16[mid - 1];
+      const int cc = src16[mid];
+      const int ee = src16[mid + 1];
+      const int sw = src16[down - 1];
+      const int ss = src16[down];
+      const int se = src16[down + 1];
+
+      // Sobel gradients.
+      const int Gx = (nw - ne) + (sw - se) + 2 * (ww - ee);
+      const int Gy = (nw - sw) + (ne - se) + 2 * (nn - ss);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+      if (Ga >= edge_thresh) continue;  // Only smooth pixels are counted.
+
+      // Laplacian response.
+      const int v = 4 * cc - 2 * (nn + ss + ww + ee) + (nw + ne + sw + se);
+      laplacian_sum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+      ++smooth_count;
+    }
+  }
+
+  // Too few smooth pixels: the estimate is not reliable.
+  if (smooth_count < 16) return -1.0;
+  return (double)laplacian_sum / (6 * smooth_count) * SQRT_PI_BY_2;
+}
+#endif
+
+// Estimates the noise level of planes `plane_from` .. `plane_to` (inclusive)
+// of `frame` and stores one value per plane in `noise_level[plane]`.
+// In builds with CONFIG_AV1_HIGHBITDEPTH, high bit-depth frames go through
+// av1_highbd_estimate_noise_from_single_plane(); in every other case the 8-bit
+// estimator is used and `bit_depth` is ignored.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+                              double *noise_level, int plane_from, int plane_to,
+                              int bit_depth, int edge_thresh) {
+  for (int plane = plane_from; plane <= plane_to; plane++) {
+    // Geometry is stored once for luma (index 0) and once for chroma (1).
+    const int geom = (plane != AOM_PLANE_Y) ? 1 : 0;
+    const uint8_t *const plane_buf = frame->buffers[plane];
+    const int plane_h = frame->crop_heights[geom];
+    const int plane_w = frame->crop_widths[geom];
+    const int plane_stride = frame->strides[geom];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_frame_high_bitdepth(frame)) {
+      noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+          CONVERT_TO_SHORTPTR(plane_buf), plane_h, plane_w, plane_stride,
+          bit_depth, edge_thresh);
+      continue;
+    }
+#else
+    (void)bit_depth;
+#endif
+    noise_level[plane] = av1_estimate_noise_from_single_plane(
+        plane_buf, plane_h, plane_w, plane_stride, edge_thresh);
+  }
+}
+
+// Initializes the members of TemporalFilterCtx
+// Inputs:
+//   cpi: Top level encoder instance structure
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
+//   gf_frame_index: GOP index, forwarded to tf_setup_filtering_buffer().
+//   compute_frame_diff: Stored in the context as is.
+//   output_frame: Stored in the context as is.
+// Returns:
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+                        int gf_frame_index, int compute_frame_diff,
+                        YV12_BUFFER_CONFIG *output_frame) {
+  TemporalFilterCtx *const ctx = &cpi->tf_ctx;
+  YV12_BUFFER_CONFIG **const frame_list = ctx->frames;
+
+  // Reset the context, then let tf_setup_filtering_buffer() fill in the frame
+  // list together with `num_frames` and `filter_frame_idx`.
+  ctx->num_frames = 0;
+  ctx->filter_frame_idx = -1;
+  ctx->output_frame = output_frame;
+  ctx->compute_frame_diff = compute_frame_diff;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
+  assert(ctx->num_frames > 0);
+  assert(ctx->filter_frame_idx < ctx->num_frames);
+
+  // Setup scaling factors. Scaling on each of the arnr frames is not
+  // supported.
+  // ARF is produced at the native frame size and resized when coded.
+  // Source and destination sizes are identical here.
+  const YV12_BUFFER_CONFIG *const first_frame = frame_list[0];
+  av1_setup_scale_factors_for_frame(
+      &ctx->sf, first_frame->y_crop_width, first_frame->y_crop_height,
+      first_frame->y_crop_width, first_frame->y_crop_height);
+
+  // Block grid of the frame to be filtered.
+  const YV12_BUFFER_CONFIG *const frame_to_filter =
+      frame_list[ctx->filter_frame_idx];
+  const int block_w = block_size_wide[TF_BLOCK_SIZE];
+  const int block_h = block_size_high[TF_BLOCK_SIZE];
+  const int block_rows = get_num_blocks(frame_to_filter->y_crop_height, block_h);
+  const int block_cols = get_num_blocks(frame_to_filter->y_crop_width, block_w);
+  const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+
+  // Total number of samples of one block over all planes, taking the chroma
+  // subsampling into account.
+  const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const int plane_count = av1_num_planes(&cpi->common);
+  const int luma_pels = block_w * block_h;
+  int total_pels = 0;
+  for (int plane = 0; plane < plane_count; plane++) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    total_pels += luma_pels >> (ss_x + ss_y);
+  }
+
+  ctx->num_pels = total_pels;
+  ctx->mb_rows = block_rows;
+  ctx->mb_cols = block_cols;
+  ctx->is_highbitdepth = is_highbitdepth;
+  ctx->q_factor = av1_get_q(cpi);
+}
+
+// Returns 1 when the source-vs-filtered difference statistics are small
+// relative to the AC quantizer step for `q_index`, and 0 otherwise.
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+                                  const FRAME_DIFF *frame_diff, int q_index,
+                                  aom_bit_depth_t bit_depth) {
+  // Block grid of TF_BLOCK_SIZE over the cropped luma dimensions; the count is
+  // clamped to at least 1 so the divisions below are always defined.
+  const int blk_rows =
+      get_num_blocks(frame->y_crop_height, block_size_high[TF_BLOCK_SIZE]);
+  const int blk_cols =
+      get_num_blocks(frame->y_crop_width, block_size_wide[TF_BLOCK_SIZE]);
+  const int blk_count = AOMMAX(1, blk_rows * blk_cols);
+
+  // Mean and standard deviation of the difference, normalized by block count.
+  const float diff_mean = (float)frame_diff->sum / blk_count;
+  const float diff_std =
+      (float)sqrt((float)frame_diff->sse / blk_count - diff_mean * diff_mean);
+
+  // The mean is compared against 0.7 * (AC quantizer step)^2.
+  const int q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
+  const float mean_limit = 0.7f * q_step * q_step;
+
+  return (diff_mean < mean_limit && diff_std < diff_mean * 1.2) ? 1 : 0;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                         int gf_frame_index, FRAME_DIFF *frame_diff,
+                         YV12_BUFFER_CONFIG *output_frame) {
+  // Basic information of the current frame.
+  TemporalFilterCtx *const ctx = &cpi->tf_ctx;
+  TemporalFilterData *const data = &cpi->td.tf_data;
+  // A non-NULL frame_diff means the caller wants the diff copied out.
+  const int want_diff = (frame_diff != NULL);
+
+  // TODO(anyone): Currently, we enforce the filtering strength on internal
+  // ARFs except the second ARF to be zero. We should investigate in which case
+  // it is more beneficial to use non-zero strength filtering.
+  // Only parallel level 0 frames go through temporal filtering.
+  assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);
+
+  // Fill in cpi->tf_ctx for this frame.
+  init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index, want_diff,
+              output_frame);
+
+  // Allocate and reset the working buffers; report failure through the
+  // encoder's error channel.
+  const int high_bd = ctx->is_highbitdepth;
+  const bool alloc_ok = tf_alloc_and_reset_data(data, ctx->num_pels, high_bd);
+  if (!alloc_ok) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating temporal filter data");
+  }
+
+  // Run the filter, multi-threaded when more than one worker is configured.
+  if (cpi->mt_info.num_workers > 1) {
+    av1_tf_do_filtering_mt(cpi);
+  } else {
+    tf_do_filtering(cpi);
+  }
+
+  if (want_diff) *frame_diff = data->diff;
+
+  // Release the working buffers.
+  tf_dealloc_data(data, high_bd);
+}
+
+int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) {
+  // Both conditions are required: a positive arnr_max_frames and a lookahead
+  // lag of more than one frame.
+  if (oxcf->algo_cfg.arnr_max_frames <= 0) return 0;
+  if (oxcf->gf_cfg.lag_in_frames <= 1) return 0;
+  return 1;
+}
+
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) {
+  const AV1EncoderConfig *const enc_cfg = &cpi->oxcf;
+  // Cache the on/off decision; nothing to allocate when filtering is off.
+  tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(enc_cfg);
+  if (!tf_info->is_temporal_filter_on) return true;
+
+  const AV1_COMMON *const common = &cpi->common;
+  const SequenceHeader *const seq = common->seq_params;
+  // (Re)allocate every tf_buf slot at the configured frame dimensions.
+  for (int slot = 0; slot < TF_INFO_BUF_COUNT; ++slot) {
+    const int alloc_err = aom_realloc_frame_buffer(
+        &tf_info->tf_buf[slot], enc_cfg->frm_dim_cfg.width,
+        enc_cfg->frm_dim_cfg.height, seq->subsampling_x, seq->subsampling_y,
+        seq->use_highbitdepth, cpi->oxcf.border_in_pixels,
+        common->features.byte_alignment, NULL, NULL, NULL,
+        cpi->image_pyramid_levels, 0);
+    if (alloc_err) return false;
+  }
+  return true;
+}
+
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
+  // Buffers are only released when the filter was recorded as enabled.
+  if (tf_info->is_temporal_filter_on != 0) {
+    for (int slot = 0; slot != TF_INFO_BUF_COUNT; slot++) {
+      aom_free_frame_buffer(&tf_info->tf_buf[slot]);
+    }
+    aom_free_frame_buffer(&tf_info->tf_buf_second_arf);
+  }
+}
+
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) {
+  // Clear the per-slot bookkeeping arrays; the frame buffers themselves are
+  // not touched here.
+  av1_zero(tf_info->tf_buf_display_index_offset);
+  av1_zero(tf_info->tf_buf_gf_index);
+  av1_zero(tf_info->tf_buf_valid);
+}
+
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi,
+                           const GF_GROUP *gf_group) {
+  if (!tf_info->is_temporal_filter_on) return;
+  const AV1_COMMON *const cm = &cpi->common;
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    // Only key frames and ARFs are filtered here.
+    const int update_type = gf_group->update_type[gf_index];
+    if (update_type != KF_UPDATE && update_type != ARF_UPDATE) continue;
+
+    // Slot 1 is used when the frame type is INTER_FRAME, slot 0 otherwise.
+    const int slot = (gf_group->frame_type[gf_index] == INTER_FRAME) ? 1 : 0;
+    const int lookahead_idx =
+        gf_group->arf_src_offset[gf_index] + gf_group->cur_frame_idx[gf_index];
+
+    // This function is designed to be called multiple times after
+    // av1_tf_info_reset(). It will only generate the filtered frame that does
+    // not exist yet.
+    const int already_filtered =
+        tf_info->tf_buf_valid[slot] != 0 &&
+        tf_info->tf_buf_display_index_offset[slot] == lookahead_idx;
+    if (already_filtered) continue;
+
+    YV12_BUFFER_CONFIG *const filtered = &tf_info->tf_buf[slot];
+    av1_temporal_filter(cpi, lookahead_idx, gf_index,
+                        &tf_info->frame_diff[slot], filtered);
+    aom_extend_frame_borders(filtered, av1_num_planes(cm));
+    tf_info->tf_buf_gf_index[slot] = gf_index;
+    tf_info->tf_buf_display_index_offset[slot] = lookahead_idx;
+    tf_info->tf_buf_valid[slot] = 1;
+  }
+}
+
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+                                                 int gf_index,
+                                                 FRAME_DIFF *frame_diff) {
+  if (!tf_info->is_temporal_filter_on) return NULL;
+
+  YV12_BUFFER_CONFIG *match = NULL;
+  // Scan every slot without an early exit: if several valid slots carry the
+  // same gf_index, the last one wins. *frame_diff is only written on a match.
+  for (int slot = 0; slot < TF_INFO_BUF_COUNT; ++slot) {
+    if (!tf_info->tf_buf_valid[slot]) continue;
+    if (tf_info->tf_buf_gf_index[slot] != gf_index) continue;
+    match = &tf_info->tf_buf[slot];
+    *frame_diff = tf_info->frame_diff[slot];
+  }
+  return match;
+}
+/*!\endcond */
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 0000000000..6504b91b66
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct ThreadData;
+// TODO(wtc): These two variables are only used in avx2, sse2, neon
+// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
+// This should be fixed to align with the c implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+// Hyper-parameters used to compute filtering weight. These hyper-parameters can
+// be tuned for a better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight from
+// `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+// and block error. The weight is for window error while the weight for block
+// error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+// using a small q (high bitrate), we would like to reduce the filtering
+// strength such that more detailed information can be preserved. Hence, when
+// q is smaller than this threshold, we will adjust the filtering weight
+// based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+// motion search error can be large and uncontrollable, we will simply
+// normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+// Concretely, users can use `arnr_strength` arguments to control the
+// strength of temporal filtering. When `arnr_strength` is small enough (
+// i.e., smaller than this threshold), we will adjust the filtering weight
+// based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering weight.
+// Concretely, larger motion search vector leads to a higher probability of
+// unreliable search. Hence, we would like to reduce the filtering strength
+// when the distance is large enough. Considering that the distance actually
+// relies on the frame size, this threshold is also a resolution-based
+// threshold. Taking 720p videos as an instance, if this field equals to 0.1,
+// then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
+// for 360p videos will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relative high range.
+// Above this cutoff q, a stronger filtering is applied.
+// For a high q, the quantization throws away more information, and thus a
+// stronger filtering is less likely to distort the encoded quality, while a
+// stronger filtering could reduce bit rates.
+// For a low q, more details are expected to be retained. Filtering is thus
+// more conservative.
+#define TF_QINDEX_CUTOFF 128
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+// Sum and SSE of the source vs filtered frame difference returned by the
+// temporal filter. av1_check_show_filtered_frame() consumes both fields to
+// derive a mean and a standard deviation.
+typedef struct {
+ // Sum term of the difference (presumably accumulated over filtering blocks;
+ // TODO confirm against the accumulation code).
+ int64_t sum;
+ // Squared-error term of the difference.
+ int64_t sse;
+} FRAME_DIFF;
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ *
+ * The fields are populated per filtered frame before the filtering process
+ * runs.
+ */
+typedef struct {
+ /*!
+ * Frame buffers used for temporal filtering.
+ */
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ /*!
+ * Number of frames in the frame buffer.
+ */
+ int num_frames;
+
+ /*!
+ * Output filtered frame.
+ */
+ YV12_BUFFER_CONFIG *output_frame;
+
+ /*!
+ * Index of the frame to be filtered, i.e. a position within \c frames.
+ */
+ int filter_frame_idx;
+ /*!
+ * Whether to accumulate diff for show existing condition check.
+ */
+ int compute_frame_diff;
+ /*!
+ * Frame scaling factor.
+ */
+ struct scale_factors sf;
+ /*!
+ * Estimated noise levels for each plane in the frame.
+ */
+ double noise_levels[MAX_MB_PLANE];
+ /*!
+ * Number of pixels in the temporal filtering block across all planes.
+ */
+ int num_pels;
+ /*!
+ * Number of temporal filtering block rows.
+ */
+ int mb_rows;
+ /*!
+ * Number of temporal filtering block columns.
+ */
+ int mb_cols;
+ /*!
+ * Whether the frame is high-bitdepth or not.
+ */
+ int is_highbitdepth;
+ /*!
+ * Quantization factor used in temporal filtering.
+ */
+ int q_factor;
+} TemporalFilterCtx;
+
+/*!
+ * buffer count in TEMPORAL_FILTER_INFO
+ * Currently we only apply filtering on KEY and ARF after
+ * define_gf_group(). Hence, the count is two.
+ */
+#define TF_INFO_BUF_COUNT 2
+
+/*!
+ * \brief Temporal filter info for a gop
+ */
+typedef struct TEMPORAL_FILTER_INFO {
+ /*!
+ * A flag indicating whether temporal filter should be applied.
+ * This flag stores the result of
+ * av1_is_temporal_filter_on()
+ */
+ int is_temporal_filter_on;
+ /*!
+ * buffers used for temporal filtering in a GOP
+ * index 0 for key frame and index 1 for ARF
+ */
+ YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT];
+
+ /*!
+ * buffers used for temporal filtering for
+ * INTNL_ARF_UPDATE
+ * Check av1_gop_is_second_arf() for the
+ * definition of second_arf in detail
+ */
+ YV12_BUFFER_CONFIG tf_buf_second_arf;
+ /*!
+ * sum/sse difference between the source and the filtered frame for each
+ * buffer; used to decide whether the buffer can be shown directly.
+ */
+ FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT];
+ /*!
+ * the corresponding gf_index for the buffer.
+ */
+ int tf_buf_gf_index[TF_INFO_BUF_COUNT];
+ /*!
+ * the display_index offset between next show frame and the frames in the GOP
+ */
+ int tf_buf_display_index_offset[TF_INFO_BUF_COUNT];
+ /*!
+ * whether the buf is valid or not.
+ */
+ int tf_buf_valid[TF_INFO_BUF_COUNT];
+} TEMPORAL_FILTER_INFO;
+
+/*!\brief Check whether we should apply temporal filter at all.
+ * \param[in] oxcf AV1 encoder config
+ *
+ * \return 1: temporal filter is on 0: temporal is off
+ */
+int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf);
+
+/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in] cpi Top level encoder instance structure
+ *
+ * \return True on success, false on memory allocation failure.
+ */
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info,
+ const struct AV1_COMP *cpi);
+
+/*!\brief Free buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Apply temporal filter for key frame and ARF in a gop
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in,out] cpi Top level encoder instance structure
+ * \param[in] gf_group GF/ARF group data structure
+ */
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi,
+ const GF_GROUP *gf_group);
+
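+/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in] gf_index gf_index for the target buffer
+ * \param[out] frame_diff sum/sse difference stored for the target buffer;
+ * only written when a matching buffer is found
+ *
+ * \return Pointer to the matching filtered buffer, or NULL if temporal
+ * filtering is off or no valid buffer matches gf_index.
+ */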
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+ int gf_index,
+ FRAME_DIFF *frame_diff);
+
+/*!\cond */
+
+// Data related to temporal filtering. The pointer members are allocated by
+// tf_alloc_and_reset_data() and released by tf_dealloc_data().
+typedef struct {
+ // Source vs filtered frame error.
+ FRAME_DIFF diff;
+ // Pointer to temporary block info used to store state in temporal filtering
+ // process.
+ MB_MODE_INFO *tmp_mbmi;
+ // Pointer to accumulator buffer used in temporal filtering process.
+ uint32_t *accum;
+ // Pointer to count buffer used in temporal filtering process.
+ uint16_t *count;
+ // Pointer to predictor used in temporal filtering process. For
+ // high-bitdepth frames this holds a CONVERT_TO_BYTEPTR()-wrapped pointer and
+ // must be unwrapped with CONVERT_TO_SHORTPTR() before being freed.
+ uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used for dispatching jobs. Only present in builds with
+ // CONFIG_MULTITHREAD enabled.
+ pthread_mutex_t *mutex_;
+#endif // CONFIG_MULTITHREAD
+ // Next temporal filter block row to be filtered.
+ int next_tf_row;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tf_mt_exit;
+} AV1TemporalFilterSync;
+
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+// frame: Pointer to the frame to estimate noise level from.
+// noise_level: Pointer to store the estimated noise.
+// plane_from: Index of the starting plane used for noise estimation.
+// Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+// plane_to: Index of the end plane used for noise estimation.
+// bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
+// edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh);
+/*!\endcond */
+
+/*!\brief Does temporal filter for a given macroblock row.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] td Pointer to thread data
+ * \param[in] mb_row Macroblock row to be filtered
+ *
+ * \remark Nothing will be returned, but the contents of td->diff will be
+ * modified.
+ */
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to filter
+ * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+ * Furthermore, negative number is used for key frame in one-pass mode, where key
+ * frame is filtered with the frames before it instead of after it. For example,
+ * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance
+ * structure
+ * \param[in] filter_frame_lookahead_idx The index of the
+ * to-filter frame in the lookahead
+ * buffer cpi->lookahead.
+ * \param[in] gf_frame_index Index of GOP
+ * \param[in,out] frame_diff structure of sse and sum of the
+ * filtered frame.
+ * \param[out] output_frame Output filtered frame.
+ */
+void av1_temporal_filter(struct AV1_COMP *cpi,
+ const int filter_frame_lookahead_idx,
+ int gf_frame_index, FRAME_DIFF *frame_diff,
+ YV12_BUFFER_CONFIG *output_frame);
+
+/*!\brief Check whether a filtered frame can be shown directly
+ *
+ * This function will use the filtered frame's sse and current q index
+ * to make decision.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame filtered frame's buffer
+ * \param[in] frame_diff structure of sse and sum of the
+ * filtered frame.
+ * \param[in] q_index q_index used for this frame
+ * \param[in] bit_depth bit depth
+ * \return return 1 if this frame can be shown directly, otherwise
+ * return 0
+ */
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+ const FRAME_DIFF *frame_diff, int q_index,
+ aom_bit_depth_t bit_depth);
+
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates the working buffers of TemporalFilterData and clears its diff.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   num_pels: Number of pixels in the block across all planes.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   True if allocation is successful and false otherwise.
+static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+                                               int num_pels,
+                                               int is_high_bitdepth) {
+  const size_t accum_bytes = num_pels * sizeof(*tf_data->accum);
+  const size_t count_bytes = num_pels * sizeof(*tf_data->count);
+  // High-bitdepth predictors need twice the storage per pixel.
+  const size_t pred_bytes = is_high_bitdepth
+                                ? num_pels * 2 * sizeof(*tf_data->pred)
+                                : num_pels * sizeof(*tf_data->pred);
+
+  tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
+  tf_data->accum = (uint32_t *)aom_memalign(16, accum_bytes);
+  tf_data->count = (uint16_t *)aom_memalign(16, count_bytes);
+  if (is_high_bitdepth) {
+    tf_data->pred = CONVERT_TO_BYTEPTR(aom_memalign(32, pred_bytes));
+  } else {
+    tf_data->pred = (uint8_t *)aom_memalign(32, pred_bytes);
+  }
+
+  // In case of an allocation failure, other successfully allocated buffers will
+  // be freed by the tf_dealloc_data() call in encoder_destroy().
+  if (!tf_data->tmp_mbmi || !tf_data->accum || !tf_data->count ||
+      !tf_data->pred) {
+    return false;
+  }
+  memset(&tf_data->diff, 0, sizeof(tf_data->diff));
+  return true;
+}
+
+// Points the macroblockd at the temporal filter's scratch block info and
+// scale factors.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   scale: Scaling factor.
+// Returns:
+//   Nothing will be returned. Contents of mbd will be modified.
+static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd,
+                                            TemporalFilterData *tf_data,
+                                            const struct scale_factors *scale) {
+  // Both reference slots share the same scale factors.
+  for (int ref = 0; ref < 2; ++ref) {
+    mbd->block_ref_scale_factors[ref] = scale;
+  }
+  // mbd->mi[0] now aliases tf_data->tmp_mbmi, so the motion mode can be set
+  // through either path.
+  mbd->mi = &tf_data->tmp_mbmi;
+  tf_data->tmp_mbmi->motion_mode = SIMPLE_TRANSLATION;
+}
+
+// Frees every buffer owned by TemporalFilterData and nulls the pointers.
+// Inputs:
+//   tf_data: Pointer to the structure containing temporal filter related data.
+//   is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+//   Nothing will be returned.
+static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
+                                       int is_high_bitdepth) {
+  // For high bitdepth the stored pointer is a CONVERT_TO_BYTEPTR() wrapper;
+  // unwrap it so that the address handed to aom_free() is the allocated one.
+  if (is_high_bitdepth) {
+    tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
+  }
+
+  aom_free(tf_data->tmp_mbmi);
+  aom_free(tf_data->accum);
+  aom_free(tf_data->count);
+  aom_free(tf_data->pred);
+
+  tf_data->tmp_mbmi = NULL;
+  tf_data->accum = NULL;
+  tf_data->count = NULL;
+  tf_data->pred = NULL;
+}
+
+// Backs up the macroblockd state that temporal filtering overwrites.
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info to save input state.
+//   input_buffer: Backup buffer pointer to save input state.
+//   num_planes: Number of planes.
+// Returns:
+//   Nothing will be returned. Contents of input_mbmi and input_buffer will be
+//   modified.
+static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi,
+                                 uint8_t **input_buffer, int num_planes) {
+  *input_mbmi = mbd->mi;
+  // Remember the first prediction buffer pointer of every plane.
+  for (int plane = num_planes - 1; plane >= 0; --plane) {
+    input_buffer[plane] = mbd->plane[plane].pre[0].buf;
+  }
+}
+
+// Puts back the macroblockd state captured by tf_save_state().
+// Inputs:
+//   mbd: Pointer to the block for filtering.
+//   input_mbmi: Backup block info from where input state is restored.
+//   input_buffer: Backup buffer pointer from where input state is restored.
+//   num_planes: Number of planes.
+// Returns:
+//   Nothing will be returned. Contents of mbd will be modified.
+static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi,
+                                    uint8_t **input_buffer, int num_planes) {
+  mbd->mi = input_mbmi;
+  // Restore the first prediction buffer pointer of every plane.
+  for (int plane = num_planes - 1; plane >= 0; --plane) {
+    mbd->plane[plane].pre[0].buf = input_buffer[plane];
+  }
+}
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/thirdpass.c b/third_party/aom/av1/encoder/thirdpass.c
new file mode 100644
index 0000000000..a25522fbc5
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.c
@@ -0,0 +1,877 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/encoder/thirdpass.h"
+
+#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+#include "aom/aom_codec.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/common/blockd.h"
+#include "common/ivfdec.h"
+
+// Opens `input_file_name` for binary reading, wraps it in a freshly allocated
+// and zeroed AvxInputContext, verifies that it is an IVF file and stores the
+// context in *input_ctx_ptr. Every failure is reported through
+// aom_internal_error() on `err_info`.
+// NOTE(review): the error paths rely on aom_internal_error() not returning to
+// this function. If it can return, execution continues with a NULL `infile`
+// or with an already freed context being stored in *input_ctx_ptr - confirm.
+static void setup_two_pass_stream_input(
+ struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
+ struct aom_internal_error_info *err_info) {
+ FILE *infile;
+ infile = fopen(input_file_name, "rb");
+ if (!infile) {
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Failed to open input file '%s'.", input_file_name);
+ }
+ struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx));
+ if (!aom_input_ctx) {
+ // Release the file handle before reporting the allocation failure.
+ fclose(infile);
+ aom_internal_error(err_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate memory for third-pass context.");
+ }
+ memset(aom_input_ctx, 0, sizeof(*aom_input_ctx));
+ // The context borrows the caller's file name string; it is not copied.
+ aom_input_ctx->filename = input_file_name;
+ aom_input_ctx->file = infile;
+
+ // IVF is the only container accepted here.
+ if (file_is_ivf(aom_input_ctx)) {
+ aom_input_ctx->file_type = FILE_TYPE_IVF;
+ } else {
+ // Undo both acquisitions before reporting the unsupported format.
+ fclose(infile);
+ aom_free(aom_input_ctx);
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Unrecognized input file type.");
+ }
+ *input_ctx_ptr = aom_input_ctx;
+}
+
+// Lazily prepares the third-pass context: opens the input stream if it is not
+// open yet and initializes the decoder if it has no interface attached.
+static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx->input_ctx == NULL) {
+    if (!ctx->input_file_name) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM,
+                         "No third pass input specified.");
+    }
+    setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name,
+                                ctx->err_info);
+  }
+
+  if (ctx->decoder.iface == NULL) {
+    // The decoder is created on the aom_codec_av1_inspect_algo interface.
+    const aom_codec_err_t init_status = aom_codec_dec_init(
+        &ctx->decoder, &aom_codec_av1_inspect_algo, NULL, 0);
+    if (init_status != AOM_CODEC_OK) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                         "Failed to initialize decoder.");
+    }
+  }
+}
+
+// Reads a new IVF frame when none is pending, then decodes from the current
+// position and advances past the consumed bytes.
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
+  const int is_ready = ctx->input_ctx && ctx->decoder.iface;
+  if (!is_ready) init_third_pass(ctx);
+
+  if (!ctx->have_frame) {
+    // No pending data: pull the next frame from the IVF stream.
+    const int read_err =
+        ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer,
+                       &ctx->buffer_size, NULL);
+    if (read_err != 0) {
+      return feof(ctx->input_ctx->file) ? 1 : -1;
+    }
+    ctx->frame = ctx->buf;
+    ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
+    ctx->have_frame = 1;
+  }
+
+  Av1DecodeReturn decode_ret;
+  const aom_codec_err_t status =
+      aom_codec_decode(&ctx->decoder, ctx->frame,
+                       (unsigned int)ctx->bytes_in_buffer, &decode_ret);
+  if (status != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to decode frame for third pass.");
+  }
+
+  // decode_ret.buf marks where decoding stopped; the distance from the old
+  // position is this frame's size in bytes, converted to bits.
+  ctx->this_frame_bits = (int)(decode_ret.buf - ctx->frame) << 3;
+  ctx->frame = decode_ret.buf;
+  ctx->bytes_in_buffer = ctx->end_frame - ctx->frame;
+  // Once the buffer is exhausted the next call must read a new IVF frame.
+  if (ctx->frame == ctx->end_frame) ctx->have_frame = 0;
+  return 0;
+}
+
+// Releases the mi_info array of `frame_info` (a NULL argument is ignored).
+static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) {
+  if (frame_info != NULL) {
+    aom_free(frame_info->mi_info);
+    frame_info->mi_info = NULL;
+  }
+}
+
+// This function gets the information needed from the recently decoded frame,
+// via various decoder APIs, and saves the info into the next free slot of
+// ctx->frame_info. On success ctx->frame_info_count is incremented.
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+  int ret = read_frame(ctx);
+  if (ret != 0) return ret;
+  int cur = ctx->frame_info_count;
+
+  // Validate the slot index before anything is written to the slot; writing
+  // first would touch memory past the end of ctx->frame_info when it is full.
+  if (cur >= MAX_THIRD_PASS_BUF) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Third pass frame info ran out of available slots.");
+    // Only reached if the error handler returns; never index out of bounds.
+    return -1;
+  }
+
+  ctx->frame_info[cur].actual_bits = ctx->this_frame_bits;
+
+  aom_codec_frame_flags_t frame_type_flags = 0;
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+                        &frame_type_flags) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame flags.");
+  }
+  if (frame_type_flags & AOM_FRAME_IS_KEY) {
+    ctx->frame_info[cur].frame_type = KEY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+    ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+    ctx->frame_info[cur].frame_type = S_FRAME;
+  } else {
+    ctx->frame_info[cur].frame_type = INTER_FRAME;
+  }
+
+  // Get frame width and height
+  int frame_size[2];
+  if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame size.");
+  }
+
+  // Check if we need to re-alloc the mi fields.
+  // One MI unit covers 4x4 pixels, so round the frame size up to 4.
+  const int mi_cols = (frame_size[0] + 3) >> 2;
+  const int mi_rows = (frame_size[1] + 3) >> 2;
+  ctx->frame_info[cur].mi_stride = mi_cols;
+  ctx->frame_info[cur].mi_rows = mi_rows;
+  ctx->frame_info[cur].mi_cols = mi_cols;
+
+  // The slot's old width/height are compared before they are overwritten
+  // below, so a size change (or a missing buffer) triggers a reallocation.
+  if (ctx->frame_info[cur].width != frame_size[0] ||
+      ctx->frame_info[cur].height != frame_size[1] ||
+      !ctx->frame_info[cur].mi_info) {
+    free_frame_info(&ctx->frame_info[cur]);
+
+    ctx->frame_info[cur].mi_info =
+        aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info));
+
+    if (!ctx->frame_info[cur].mi_info) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate mi buffer for the third pass.");
+    }
+  }
+
+  ctx->frame_info[cur].width = frame_size[0];
+  ctx->frame_info[cur].height = frame_size[1];
+
+  // Get frame base q idx
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+                        &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read base q index.");
+  }
+
+  // Get show existing frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_existing_frame) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show existing frame flag.");
+  }
+
+  // Get show frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show frame flag.");
+  }
+
+  // Get order hint
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+                        &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read order hint.");
+  }
+
+  // Clear MI info. BLOCK_INVALID marks a cell as "not filled in yet" for the
+  // pass below.
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize =
+          BLOCK_INVALID;
+    }
+  }
+
+  // Get relevant information regarding each 4x4 MI
+  MB_MODE_INFO cur_mi_info;
+  THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info;
+  for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+      const int offset = mi_row * mi_cols + mi_col;
+      // Cells already covered by a previously visited block are skipped.
+      if (this_mi[offset].bsize != BLOCK_INVALID) {
+        continue;
+      }
+      // Get info of this MI
+      if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col,
+                            &cur_mi_info) != AOM_CODEC_OK) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read mi info.");
+      }
+      const int blk_mi_rows = mi_size_high[cur_mi_info.bsize];
+      const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize];
+
+      // Replicate the block's info into every cell it covers, clipping the
+      // block at the frame boundary.
+      for (int h = 0; h < blk_mi_rows; h++) {
+        for (int w = 0; w < blk_mi_cols; w++) {
+          if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) {
+            continue;
+          }
+          const int this_offset = offset + h * mi_cols + w;
+          this_mi[this_offset].bsize = cur_mi_info.bsize;
+          this_mi[this_offset].partition = cur_mi_info.partition;
+          this_mi[this_offset].mi_row_start = mi_row;
+          this_mi[this_offset].mi_col_start = mi_col;
+          this_mi[this_offset].mv[0] = cur_mi_info.mv[0];
+          this_mi[this_offset].mv[1] = cur_mi_info.mv[1];
+          this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0];
+          this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1];
+          this_mi[this_offset].pred_mode = cur_mi_info.mode;
+        }
+      }
+    }
+  }
+
+  ctx->frame_info_count++;
+
+  return 0;
+}
+
+#define USE_SECOND_PASS_FILE 1
+
+#if !USE_SECOND_PASS_FILE
+// Parse the frames in the gop and determine the last frame of the current GOP.
+// Decode more frames if necessary. The variable max_num is the maximum static
+// GOP length if we detect an IPPP structure, and it is expected that max_num >=
+// MAX_GF_INTERVAL.
+// The index of the last frame of the GOP (within ctx->frame_info) is written
+// to *last_idx.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+ int *last_idx) {
+ assert(max_num >= MAX_GF_INTERVAL);
+ *last_idx = 0;
+ int cur_idx = 0;
+ // Order hint of the first not-shown frame (the ARF); -1 until one is seen.
+ int arf_order_hint = -1;
+ int num_show_frames = 0;
+ while (num_show_frames < max_num) {
+ assert(cur_idx < MAX_THIRD_PASS_BUF);
+ // Read in from bitstream if needed.
+ if (cur_idx >= ctx->frame_info_count) {
+ int ret = get_frame_info(ctx);
+ if (ret == 1) {
+ // At the end of the file, GOP ends in the prev frame.
+ // Reaching the end while still waiting for an ARF's overlay is an error.
+ if (arf_order_hint >= 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to derive GOP length.");
+ }
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ if (ret < 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame for third pass.");
+ }
+ }
+
+ // TODO(bohanli): verify that fwd_kf works here.
+ if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+ ctx->frame_info[cur_idx].is_show_frame) {
+ if (cur_idx != 0) {
+ // If this is a key frame and is not the first kf in this kf group, we
+ // have reached the next key frame. Stop here.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+ arf_order_hint == -1) {
+ // If this is an arf (the first no show)
+ if (num_show_frames <= 1) {
+ // This is an arf and we should end the GOP with its overlay.
+ arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+ } else {
+ // There are multiple show frames before this arf, so we treat the
+ // frames previous to this arf as a GOP.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+ (unsigned int)arf_order_hint) {
+ // If this is the overlay/show existing of the arf
+ assert(ctx->frame_info[cur_idx].is_show_frame);
+ *last_idx = cur_idx;
+ return;
+ } else {
+ // This frame is part of the GOP.
+ if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++;
+ }
+ cur_idx++;
+ }
+ // This is a long IPPP GOP and we will use a length of max_num here.
+ assert(arf_order_hint < 0);
+ *last_idx = max_num - 1;
+ return;
+}
+#endif
+
+// Make sure frame info is buffered for every frame of the current GOP
+// (ctx->gop_info.num_frames entries), reading further frames from the
+// bitstream whenever the buffer does not yet hold enough of them.
+static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) {
+  for (int idx = 0; idx < ctx->gop_info.num_frames; ++idx) {
+    assert(idx < MAX_THIRD_PASS_BUF);
+    // Frames that are already buffered need no further reading.
+    if (idx < ctx->frame_info_count) continue;
+    if (get_frame_info(ctx) != 0) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                         "Failed to read frame for third pass.");
+    }
+  }
+}
+
+// Buffer the frame info of the current GOP and verify that the number of shown
+// frames agrees with the GOP length stored in ctx->gop_info.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+  // Read in future frames in the current GOP.
+  read_gop_frames(ctx);
+
+  // Count the shown frames among the frames of this GOP.
+  int num_shown = 0;
+  for (int i = 0; i < ctx->gop_info.num_frames; i++) {
+    num_shown += ctx->frame_info[i].is_show_frame ? 1 : 0;
+  }
+
+  // The count must match the value read from second_pass_file.
+  if (num_shown == ctx->gop_info.gf_length) return;
+  aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                     "Mismatch in third pass GOP length!");
+}
+
+// Remove frame_info[0] from the buffer and shift the remaining entries down by
+// one, so that frame_info[0] refers to the next frame. Reports an error (and
+// leaves the buffer untouched) when no frame info is available.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx->frame_info_count <= 0) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "No available frame info for third pass.");
+    // Never fall through if the error call returns: decrementing an empty
+    // count would make the code below write to frame_info[-1].
+    return;
+  }
+  ctx->frame_info_count--;
+  free_frame_info(&ctx->frame_info[0]);
+  for (int i = 0; i < ctx->frame_info_count; i++) {
+    ctx->frame_info[i] = ctx->frame_info[i + 1];
+  }
+  // The last slot was copied down; clear its mi_info so that the pointer is
+  // owned by exactly one entry.
+  ctx->frame_info[ctx->frame_info_count].mi_info = NULL;
+}
+
+// (Re)create the third pass context in *ctx. Any existing context is freed
+// first; the new one is zero-initialized. The file name pointer is stored as
+// is (the string is not copied).
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+                            const char *file) {
+  av1_free_thirdpass_ctx(*ctx);
+  CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx)));
+  (*ctx)->input_file_name = file;
+  (*ctx)->prev_gop_end = -1;
+  (*ctx)->err_info = cm->error;
+}
+
+// Release everything owned by the third pass context, then the context itself.
+// Passing NULL is a no-op.
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
+  if (!ctx) return;
+
+  // Decoder instance, if one was created.
+  if (ctx->decoder.iface != NULL) aom_codec_destroy(&ctx->decoder);
+
+  // Input file and its context.
+  if (ctx->input_ctx != NULL && ctx->input_ctx->file != NULL) {
+    fclose(ctx->input_ctx->file);
+  }
+  aom_free(ctx->input_ctx);
+
+  // Frame read buffer; free(NULL) is a no-op.
+  free(ctx->buf);
+
+  // Per-frame info of every buffer slot.
+  for (int slot = 0; slot < MAX_THIRD_PASS_BUF; ++slot) {
+    free_frame_info(&ctx->frame_info[slot]);
+  }
+  aom_free(ctx);
+}
+
+// During the second pass, when a second pass log is configured, append the
+// information of the current GOP (frame count, ARF usage, GOP length) to the
+// log file. Does nothing otherwise.
+void av1_write_second_pass_gop_info(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  if (oxcf->pass != AOM_RC_SECOND_PASS || !oxcf->second_pass_log) return;
+
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+  // Make sure the log is open for writing.
+  av1_open_second_pass_log(cpi, 0);
+
+  THIRD_PASS_GOP_INFO gop_info;
+  gop_info.num_frames = gf_group->size;
+  gop_info.use_arf = (gf_group->arf_index >= 0);
+  gop_info.gf_length = p_rc->baseline_gf_interval;
+
+  // The struct is written to the log as one raw record.
+  if (fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream) <
+      1) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Could not write to second pass log file!");
+  }
+}
+
+// During the second pass, when a second pass log is configured, append the
+// per-frame record of the frame at gf_index to the log file. The record is
+// three raw values in this order: target bits (int), sse (uint64_t) and
+// bpm_factor (double). Does nothing otherwise.
+void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+ // write target bitrate
+ int bits = gf_group->bit_allocation[gf_index];
+ size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write sse
+ // The sse is taken from the first available source: the last output packet
+ // if it is a PSNR packet, else (CONFIG_INTERNAL_STATS builds with
+ // b_calculate_psnr set) the accumulated total_sq_error[0], else it is
+ // computed here from the source and the reconstructed frame.
+ uint64_t sse = 0;
+ int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1;
+ if (pkt_idx >= 0 &&
+ cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) {
+ sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0];
+#if CONFIG_INTERNAL_STATS
+ } else if (cpi->ppi->b_calculate_psnr) {
+ sse = cpi->ppi->total_sq_error[0];
+#endif
+ } else {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ sse = psnr.sse[0];
+ }
+
+ count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write bpm_factor
+ double factor = cpi->ppi->twopass.bpm_factor;
+ count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+ }
+}
+// Open the second pass log file named in the encoder configuration, for
+// reading when is_read is nonzero and for writing otherwise. An already open
+// stream is left as is.
+void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  if (oxcf->second_pass_log == NULL) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM,
+                       "No second pass log file specified for the third pass!");
+  }
+
+  // Nothing to do when the stream has been opened before.
+  if (cpi->second_pass_log_stream) return;
+
+  const char *const mode = is_read ? "rb" : "wb";
+  cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, mode);
+  if (!cpi->second_pass_log_stream) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Could not open second pass log file!");
+  }
+}
+
+// Close the second pass log stream, if one is open, and reset the stored
+// stream pointer. A failing fclose() is reported as an error.
+void av1_close_second_pass_log(AV1_COMP *cpi) {
+  FILE *const stream = cpi->second_pass_log_stream;
+  if (stream == NULL) return;
+
+  // Clear the stored pointer before reporting any failure: the stream must
+  // not be used again after fclose() whether or not it succeeded, and the
+  // error call below may not return here, which would otherwise leave a
+  // stale pointer behind for a later (double) close.
+  cpi->second_pass_log_stream = NULL;
+  if (fclose(stream) != 0) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Could not close second pass log file!");
+  }
+}
+
+// Read one raw THIRD_PASS_GOP_INFO record from the second pass log stream into
+// *gop_info, reporting an error through `error` when the read falls short.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+                                   THIRD_PASS_GOP_INFO *gop_info,
+                                   struct aom_internal_error_info *error) {
+  const size_t num_read =
+      fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream);
+  if (num_read >= 1) return;
+  aom_internal_error(error, AOM_CODEC_ERROR,
+                     "Could not read from second pass log file!");
+}
+
+// Read frame_info_count per-frame records from the second pass log stream and
+// store them in frame_info_arr. Each record consists of the target bits (int),
+// the sse (uint64_t) and the bpm factor (double), in that order.
+void av1_read_second_pass_per_frame_info(
+    FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+    int frame_info_count, struct aom_internal_error_info *error) {
+  for (int i = 0; i < frame_info_count; i++) {
+    THIRD_PASS_FRAME_INFO *const info = &frame_info_arr[i];
+
+    // Target bits.
+    int bits = 0;
+    if (fread(&bits, sizeof(bits), 1, second_pass_log_stream) < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    info->bits_allocated = bits;
+
+    // Distortion.
+    uint64_t sse;
+    if (fread(&sse, sizeof(sse), 1, second_pass_log_stream) < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    info->sse = sse;
+
+    // bpm factor.
+    double factor;
+    if (fread(&factor, sizeof(factor), 1, second_pass_log_stream) < 1) {
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Could not read from second pass log file!");
+    }
+    info->bpm_factor = factor;
+  }
+}
+
+// Determine whether the current GOP uses an ARF, i.e. whether one of its first
+// gop_info.gf_length buffered frames is a no-show frame with a nonzero order
+// hint, and check the result against the use_arf flag stored in ctx->gop_info.
+// Returns -1 when ctx is NULL, otherwise 1 if an ARF is used and 0 if not. A
+// disagreement with the stored flag is reported as an error.
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx == NULL) return -1;
+  int use_arf = 0;
+  for (int i = 0; i < ctx->gop_info.gf_length; i++) {
+    if (ctx->frame_info[i].order_hint != 0 &&
+        ctx->frame_info[i].is_show_frame == 0) {
+      use_arf = 1;
+      break;  // One ARF is enough to decide.
+    }
+  }
+  if (use_arf != ctx->gop_info.use_arf) {
+    // This check concerns the ARF flag, not the GOP length.
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Mismatch in third pass use_arf flag!");
+  }
+  return use_arf;
+}
+
+// Compute the ratios of the given frame dimensions (fheight, fwidth) over the
+// dimensions stored for buffered frame fidx, returning them in *ratio_h and
+// *ratio_w.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+                              int fwidth, double *ratio_h, double *ratio_w) {
+  assert(ctx);
+  assert(fidx < ctx->frame_info_count);
+  const int stored_height = ctx->frame_info[fidx].height;
+  const int stored_width = ctx->frame_info[fidx].width;
+  // The stored frame is expected to be no larger than the given one.
+  assert(stored_height <= fheight && stored_width <= fwidth);
+
+  *ratio_h = (double)fheight / stored_height;
+  *ratio_w = (double)fwidth / stored_width;
+}
+
+// Map the mi position (mi_row, mi_col) back through the given ratios to a
+// position in buffered frame fidx, clamp it to that frame's mi dimensions and
+// return the mi info stored there.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+                                          int mi_row, int mi_col,
+                                          double ratio_h, double ratio_w) {
+  assert(ctx);
+  assert(fidx < ctx->frame_info_count);
+
+  const THIRD_PASS_FRAME_INFO *const frame = &ctx->frame_info[fidx];
+
+  // Scale down, round to the nearest mi unit and keep inside the frame.
+  const int row = clamp((int)round(mi_row / ratio_h), 0, frame->mi_rows - 1);
+  const int col = clamp((int)round(mi_col / ratio_w), 0, frame->mi_cols - 1);
+
+  return &frame->mi_info[row * frame->mi_stride + col];
+}
+
+// Scale the block start position stored in third_pass_mi by the given ratios
+// and return the rounded result in *mi_row and *mi_col.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+                                    double ratio_h, double ratio_w, int *mi_row,
+                                    int *mi_col) {
+  const double scaled_row = third_pass_mi->mi_row_start * ratio_h;
+  *mi_row = (int)round(scaled_row);
+  const double scaled_col = third_pass_mi->mi_col_start * ratio_w;
+  *mi_col = (int)round(scaled_col);
+}
+
+// Return the motion vector of this_mi that refers to `frame`, scaled by the
+// given ratios. INVALID_MV is returned when `frame` lies outside
+// [LAST_FRAME, ALTREF_FRAME] or when this_mi does not use that reference.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+                                      double ratio_h, double ratio_w,
+                                      MV_REFERENCE_FRAME frame) {
+  assert(this_mi != NULL);
+  int_mv result;
+  result.as_int = INVALID_MV;
+
+  if (frame >= LAST_FRAME && frame <= ALTREF_FRAME) {
+    // Check both reference slots; a later match overrides an earlier one.
+    for (int ref = 0; ref < 2; ++ref) {
+      if (this_mi->ref_frame[ref] != frame) continue;
+      result.as_mv.row = (int16_t)round(this_mi->mv[ref].as_mv.row * ratio_h);
+      result.as_mv.col = (int16_t)round(this_mi->mv[ref].as_mv.col * ratio_w);
+    }
+  }
+
+  return result;
+}
+
+// Return the block size to use for this_mi after scaling its stored block
+// size by the given ratios: the first block size with the same width/height
+// aspect that is at least as large as the scaled block in both dimensions, or
+// BLOCK_128X128 when there is none.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+                                                double ratio_h,
+                                                double ratio_w) {
+  assert(this_mi != NULL);
+
+  const BLOCK_SIZE stored_bsize = this_mi->bsize;
+  assert(stored_bsize != BLOCK_INVALID);
+
+  const int stored_w = block_size_wide[stored_bsize];
+  const int stored_h = block_size_high[stored_bsize];
+
+  // Pick the partition type whose sub-blocks have the aspect of the stored
+  // block; it selects the row of subsize_lookup searched below.
+  int part_type;
+  if (stored_w == stored_h) {
+    part_type = PARTITION_NONE;
+  } else if (stored_w / stored_h == 2) {
+    part_type = PARTITION_HORZ;
+  } else if (stored_w / stored_h == 4) {
+    part_type = PARTITION_HORZ_4;
+  } else if (stored_h / stored_w == 2) {
+    part_type = PARTITION_VERT;
+  } else if (stored_h / stored_w == 4) {
+    part_type = PARTITION_VERT_4;
+  } else {
+    part_type = PARTITION_INVALID;
+  }
+  assert(part_type != PARTITION_INVALID);
+
+  // Dimensions of the block after scaling.
+  const int w = (int)(round(stored_w * ratio_w));
+  const int h = (int)(round(stored_h * ratio_h));
+
+  for (int i = 0; i < SQR_BLOCK_SIZES; i++) {
+    const BLOCK_SIZE candidate = subsize_lookup[part_type][i];
+    if (candidate == BLOCK_INVALID) continue;
+
+    // find the smallest block size that contains the mapped block
+    if (block_size_wide[candidate] >= w && block_size_high[candidate] >= h) {
+      return candidate;
+    }
+  }
+
+  // could not find a proper one, just use the largest then.
+  return BLOCK_128X128;
+}
+
+// Return the partition type stored at the start position (mi_row_start,
+// mi_col_start) of the block that this_mi belongs to, looked up in
+// ctx->frame_info[0].
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+                                               THIRD_PASS_MI_INFO *this_mi) {
+  const THIRD_PASS_FRAME_INFO *const cur_frame = &ctx->frame_info[0];
+  const int corner_offset =
+      this_mi->mi_row_start * cur_frame->mi_stride + this_mi->mi_col_start;
+  return cur_frame->mi_info[corner_offset].partition;
+}
+
+#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER)
+// Stub implementations used when the library is built without
+// CONFIG_THREE_PASS && CONFIG_AV1_DECODER. Apart from av1_init_thirdpass_ctx,
+// which reports an error, every stub ignores its arguments and either does
+// nothing or returns a fixed value.
+
+// Always reports an error: three-pass encoding is unavailable in this build.
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ (void)ctx;
+ (void)file;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "To utilize three-pass encoding, libaom must be built "
+ "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1.");
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) {
+ (void)cpi;
+ (void)is_read;
+}
+
+void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) {
+ (void)cpi;
+ (void)gf_index;
+}
+
+// No-op: *gop_info is left untouched.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)gop_info;
+ (void)error;
+}
+
+// No-op: frame_info_arr is left untouched.
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)frame_info_arr;
+ (void)frame_info_count;
+ (void)error;
+}
+
+// Always returns 1.
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ (void)ctx;
+ return 1;
+}
+
+// No-op: *ratio_h and *ratio_w are NOT written by this stub.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)fheight;
+ (void)fwidth;
+ (void)ratio_h;
+ (void)ratio_w;
+}
+
+// Always returns NULL.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)mi_row;
+ (void)mi_col;
+ (void)ratio_h;
+ (void)ratio_w;
+ return NULL;
+}
+
+// Always returns INVALID_MV.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)frame;
+ int_mv mv;
+ mv.as_int = INVALID_MV;
+ return mv;
+}
+
+// Always returns BLOCK_INVALID.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ return BLOCK_INVALID;
+}
+
+// No-op: *mi_row and *mi_col are NOT written by this stub.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ (void)third_pass_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)mi_row;
+ (void)mi_col;
+}
+
+// Always returns PARTITION_INVALID.
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ (void)ctx;
+ (void)this_mi;
+ return PARTITION_INVALID;
+}
+#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+
+#if CONFIG_BITRATE_ACCURACY
+static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb,
+ FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fwrite(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n");
+ }
+}
+
+static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fread(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n");
+ }
+}
+
+// Fill tpl_info from the TPL data and the GF group. The ready flag is always
+// copied; the length and the per-frame lists are only filled when the TPL data
+// is ready.
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+                       const TplParams *tpl_data) {
+  tpl_info->tpl_ready = tpl_data->ready;
+  if (!tpl_info->tpl_ready) return;
+
+  tpl_info->gf_length = gf_group->size;
+  for (int idx = 0; idx < tpl_info->gf_length; ++idx) {
+    tpl_info->txfm_stats_list[idx] = tpl_data->txfm_stats_list[idx];
+    tpl_info->qstep_ratio_ls[idx] = av1_tpl_get_qstep_ratio(tpl_data, idx);
+    tpl_info->update_type_list[idx] = gf_group->update_type[idx];
+  }
+}
+
+// Write tpl_info to log_stream as raw values: the ready flag first and, only
+// when it is set, the length followed by the first gf_length entries of each
+// of the three per-frame lists.
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+                        struct aom_internal_error_info *error) {
+  fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+                   log_stream, error);
+  if (!tpl_info->tpl_ready) return;
+
+  fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+                   log_stream, error);
+  assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+
+  const int num_entries = tpl_info->gf_length;
+  fwrite_and_check(tpl_info->txfm_stats_list,
+                   sizeof(tpl_info->txfm_stats_list[0]), num_entries,
+                   log_stream, error);
+  fwrite_and_check(tpl_info->qstep_ratio_ls,
+                   sizeof(tpl_info->qstep_ratio_ls[0]), num_entries,
+                   log_stream, error);
+  fwrite_and_check(tpl_info->update_type_list,
+                   sizeof(tpl_info->update_type_list[0]), num_entries,
+                   log_stream, error);
+}
+
+// Read a TPL_INFO record from log_stream into *tpl_info, which is zeroed
+// first. The layout is: the ready flag and, only when it is set, the length
+// followed by gf_length entries of each of the three per-frame lists.
+// The length comes from the file, so it is range-checked at run time before
+// it is used as an element count: a negative or oversized value is reported
+// as an error and *tpl_info is left zeroed.
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+                       struct aom_internal_error_info *error) {
+  av1_zero(*tpl_info);
+  fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+                  log_stream, error);
+  if (tpl_info->tpl_ready) {
+    fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+                    log_stream, error);
+    // The lists hold MAX_LENGTH_TPL_FRAME_STATS entries each; reading more
+    // would overflow them, and a negative count would wrap to a huge size_t.
+    if (tpl_info->gf_length < 0 ||
+        tpl_info->gf_length > MAX_LENGTH_TPL_FRAME_STATS) {
+      av1_zero(*tpl_info);
+      aom_internal_error(error, AOM_CODEC_ERROR,
+                         "Invalid gf_length in tpl info log\n");
+      // Do not continue with the bad length if the error call returns.
+      return;
+    }
+    fread_and_check(&tpl_info->txfm_stats_list,
+                    sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+                    log_stream, error);
+    fread_and_check(&tpl_info->qstep_ratio_ls,
+                    sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+                    log_stream, error);
+    fread_and_check(&tpl_info->update_type_list,
+                    sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+                    log_stream, error);
+  }
+}
+#endif // CONFIG_BITRATE_ACCURACY
diff --git a/third_party/aom/av1/encoder/thirdpass.h b/third_party/aom/av1/encoder/thirdpass.h
new file mode 100644
index 0000000000..8080c06cb6
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_THIRDPASS_H_
+#define AOM_AV1_ENCODER_THIRDPASS_H_
+
+#include "av1/common/enums.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+
+struct AV1_COMP;
+
+// TODO(bohanli): optimize this number
+#define MAX_THIRD_PASS_BUF \
+ (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH))
+
+// Struct to store useful information related to a GOP, in addition to what is
+// available in the bitstream
+// It is written to and read from the second pass log file as one raw record.
+typedef struct {
+ // GOP length; the third pass checks it against the number of shown frames
+ // among the frames of the GOP.
+ int gf_length;
+ // Total number of frames in the GOP (the size of the GF group).
+ int num_frames;
+ // Nonzero when the GOP uses an ARF.
+ int use_arf;
+} THIRD_PASS_GOP_INFO;
+
+#if CONFIG_BITRATE_ACCURACY
+// TPL information of one GF group, as packed by av1_pack_tpl_info and
+// exchanged through a log file by av1_write_tpl_info / av1_read_tpl_info.
+typedef struct TPL_INFO {
+ // Number of valid entries in each of the lists below.
+ int gf_length;
+ // Nonzero when the TPL data was ready; the other members are only filled in
+ // that case.
+ int tpl_ready;
+ TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS];
+ double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS];
+} TPL_INFO;
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Information stored for each MI unit of a frame. All MI units covered by one
+// block carry the same values.
+typedef struct {
+ // Size of the block covering this MI.
+ BLOCK_SIZE bsize;
+ PARTITION_TYPE partition;
+ // MI position at which the block covering this MI starts.
+ int mi_row_start;
+ int mi_col_start;
+ // Motion vectors and the reference frames they refer to (two slots).
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frame[2];
+ PREDICTION_MODE pred_mode;
+} THIRD_PASS_MI_INFO;
+
+// Struct to store useful information about a frame for the third pass.
+// The members are extracted from the decoder by function get_frame_info.
+// bits_allocated, sse and bpm_factor are additionally filled from the second
+// pass log file by av1_read_second_pass_per_frame_info.
+typedef struct {
+ int width;
+ int height;
+ int mi_stride;
+ int mi_rows;
+ int mi_cols;
+ int base_q_idx;
+ int is_show_existing_frame;
+ int is_show_frame;
+ // Target bits of this frame.
+ int bits_allocated;
+ int actual_bits;
+ // Distortion (sse) of this frame.
+ uint64_t sse;
+ double bpm_factor;
+ FRAME_TYPE frame_type;
+ unsigned int order_hint;
+ // Per-MI information of this frame; the accessor functions index it as
+ // row * mi_stride + col.
+ THIRD_PASS_MI_INFO *mi_info;
+} THIRD_PASS_FRAME_INFO;
+
+// Context of the third pass: the state needed to read frames from the input
+// file, plus the buffered per-frame and per-GOP information.
+typedef struct {
+ /* --- Input and decoding related members --- */
+ // the input file
+ // (the pointer is stored as passed to av1_init_thirdpass_ctx; the string is
+ // not copied)
+ const char *input_file_name;
+#if CONFIG_THREE_PASS
+ // input context
+ struct AvxInputContext *input_ctx;
+#endif
+ // decoder codec context
+ aom_codec_ctx_t decoder;
+ // start of the frame in buf
+ const unsigned char *frame;
+ // end of the frame(s) in buf
+ const unsigned char *end_frame;
+ // whether we still have following frames in buf
+ int have_frame;
+ // pointer to buffer for the read frames
+ // (released with free() in av1_free_thirdpass_ctx)
+ uint8_t *buf;
+ // size of data in buffer
+ size_t bytes_in_buffer;
+ // current buffer size
+ size_t buffer_size;
+ // error info pointer
+ struct aom_internal_error_info *err_info;
+
+ // NOTE(review): presumably the size in bits of the frame read last - confirm
+ // against get_frame_info.
+ int this_frame_bits;
+
+ /* --- Members for third pass encoding --- */
+ // Array to store info about each frame.
+ // frame_info[0] should point to the current frame.
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ // number of frames available in frame_info
+ int frame_info_count;
+ // the end of the previous GOP (order hint)
+ int prev_gop_end;
+ // information of the current GOP, checked against the buffered frames by
+ // av1_set_gop_third_pass and av1_check_use_arf
+ THIRD_PASS_GOP_INFO gop_info;
+} THIRD_PASS_DEC_CTX;
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file);
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx);
+
+// Set the GOP structure from the twopass bitstream.
+// TODO(bohanli): this is currently a skeleton: it only saves all frame
+// information in the array ctx->frame_info for this GOP and checks that the
+// number of shown frames matches the GOP length in ctx->gop_info.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx);
+
+// Pop one frame out of the array ctx->frame_info. This function is used to make
+// sure that frame_info[0] always corresponds to the current frame.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx);
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read);
+void av1_close_second_pass_log(struct AV1_COMP *cpi);
+
+// Write the current GOP information into the second pass log file.
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi);
+// Write the information of the frames in this GOP into the second pass log
+// file.
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index);
+
+// Read the next GOP information from the second pass log file.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error);
+// Read the information of the frames in the next GOP from the second pass log
+// file.
+void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream,
+ THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count,
+ struct aom_internal_error_info *error);
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx);
+
+// Calculate the ratio of third pass frame dimensions over second pass frame
+// dimensions. Return them in ratio_h and ratio_w.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w);
+
+// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi
+// location in the thirdpass frame.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted MVs of this_mi, associated with the reference frame. If no
+// MV is found with the reference frame, INVALID_MV is returned.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame);
+
+// Get the adjusted block size of this_mi.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted mi position in the third pass frame, of a given
+// third_pass_mi. Location is returned in mi_row and mi_col.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col);
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi);
+
+#if CONFIG_BITRATE_ACCURACY
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+ const TplParams *tpl_data);
+
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+#endif // CONFIG_BITRATE_ACCURACY
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_THIRDPASS_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 0000000000..ffac886e32
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+// Edge-case helper for av1_fast_palette_color_index_context: handles positions
+// on the first row or first column, where exactly one of the above/left
+// neighbors exists. Writes the index to code into *color_idx and always
+// returns context 0.
+static AOM_INLINE int av1_fast_palette_color_index_context_on_edge(
+    const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+  const bool left_exists = (c >= 1);
+  const bool above_exists = (r >= 1);
+  assert(r > 0 || c > 0);
+  assert(above_exists ^ left_exists);
+  assert(color_idx);
+  (void)left_exists;
+
+  // Pick the single available neighbor: the pixel above if there is one,
+  // otherwise the pixel to the left.
+  const int cur_pos = r * stride + c;
+  const int neighbor_pos = above_exists ? cur_pos - stride : cur_pos - 1;
+  const uint8_t neighbor = color_map[neighbor_pos];
+  const uint8_t current = color_map[cur_pos];
+
+  // A match with the neighbor codes as index 0. Otherwise the current color
+  // is used as-is, bumped by one when the neighbor color is larger.
+  if (neighbor == current) {
+    *color_idx = 0;
+  } else if (neighbor > current) {
+    *color_idx = current + 1;
+  } else {
+    *color_idx = current;
+  }
+
+  // With a single non-diagonal neighbor (weight 2) in rank 0 (multiplier 1),
+  // the context hash is a constant; the lookup is only used to assert that the
+  // resulting context is 0.
+  const uint8_t neighbor_score = 2;
+  const uint8_t rank0_multiplier = 1;
+  const uint8_t ctx_hash = neighbor_score * rank0_multiplier;
+  const int ctx = av1_palette_color_index_context_lookup[ctx_hash];
+  assert(ctx == 0);
+  (void)ctx;
+  return 0;
+}
+
+#define SWAP(i, j) \
+ do { \
+ const uint8_t tmp_score = score_rank[i]; \
+ const uint8_t tmp_color = color_rank[i]; \
+ score_rank[i] = score_rank[j]; \
+ color_rank[i] = color_rank[j]; \
+ score_rank[j] = tmp_score; \
+ color_rank[j] = tmp_color; \
+ } while (0)
+#define INVALID_COLOR_IDX (UINT8_MAX)
+
+// A faster version of av1_get_palette_color_index_context used by the encoder
+// exploiting the fact that the encoder does not need to maintain a color order.
+//
+// color_map is read as color_map[row * stride + col]; (r, c) is the position
+// being coded and must not be (0, 0). On return *color_idx holds the index to
+// code for the color at (r, c), expressed relative to the ranked neighbor
+// colors, and the return value is the color index context, which is asserted
+// to lie in [0, PALETTE_COLOR_INDEX_CONTEXTS).
+static AOM_INLINE int av1_fast_palette_color_index_context(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ assert(r > 0 || c > 0);
+
+ const bool has_above = (r - 1 >= 0);
+ const bool has_left = (c - 1 >= 0);
+ assert(has_above || has_left);
+ // Exactly one neighbor available (first row or first column): use the
+ // dedicated single-neighbor path.
+ if (has_above ^ has_left) {
+ return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c,
+ color_idx);
+ }
+
+ // This goes in the order of left, top, and top-left. This has the advantage
+ // that, as long as all entries are distinct and valid, they are already
+ // in sorted order. Furthermore, if either of the first two is
+ // invalid, we know the last one is also invalid.
+ uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)];
+ color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)];
+ color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)];
+
+ // Aggregate duplicated values.
+ // Since our array is so small, using a couple if statements is faster
+ // Initial weights: 2 for the left and top neighbors, 1 for the top-left.
+ uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+ uint8_t num_invalid_colors = 0;
+ if (color_neighbors[0] == color_neighbors[1]) {
+ scores[0] += scores[1];
+ color_neighbors[1] = INVALID_COLOR_IDX;
+ num_invalid_colors += 1;
+
+ if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ }
+ } else if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ } else if (color_neighbors[1] == color_neighbors[2]) {
+ scores[1] += scores[2];
+ num_invalid_colors += 1;
+ }
+
+ const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors;
+
+ // The SWAP macro operates on these two names; they alias the arrays above.
+ uint8_t *color_rank = color_neighbors;
+ uint8_t *score_rank = scores;
+
+ // Sort everything
+ if (num_valid_colors > 1) {
+ // Slot 1 was merged into slot 0: move the top-left entry down so the valid
+ // entries are contiguous at the front.
+ if (color_neighbors[1] == INVALID_COLOR_IDX) {
+ scores[1] = scores[2];
+ color_neighbors[1] = color_neighbors[2];
+ }
+
+ // We need to swap the first two elements if they have the same score but
+ // the color indices are not in the right order
+ if (score_rank[0] < score_rank[1] ||
+ (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+ SWAP(0, 1);
+ }
+ // NOTE(review): num_valid_colors > 2 means no merge happened, so the scores
+ // are still { 2, 2, 1 } here and these two swaps look like no-ops - confirm.
+ if (num_valid_colors > 2) {
+ if (score_rank[0] < score_rank[2]) {
+ SWAP(0, 2);
+ }
+ if (score_rank[1] < score_rank[2]) {
+ SWAP(1, 2);
+ }
+ }
+ }
+
+ // If any of the neighbor colors has higher index than current color index,
+ // then we move up by 1 unless the current color is the same as one of the
+ // neighbors.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ for (int idx = 0; idx < num_valid_colors; idx++) {
+ if (color_rank[idx] > current_color) {
+ (*color_idx)++;
+ } else if (color_rank[idx] == current_color) {
+ // The current color matches a ranked neighbor: its rank is the index.
+ *color_idx = idx;
+ break;
+ }
+ }
+
+ // Get hash value of context.
+ uint8_t color_index_ctx_hash = 0;
+ static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (int idx = 0; idx < num_valid_colors; ++idx) {
+ color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ // Closed-form replacement for the table lookup; the assert below checks it
+ // against av1_palette_color_index_context_lookup.
+ const int color_index_ctx = 9 - color_index_ctx_hash;
+ assert(color_index_ctx ==
+ av1_palette_color_index_context_lookup[color_index_ctx_hash]);
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef INVALID_COLOR_IDX
+#undef SWAP
+
+// Walks the color index map along anti-diagonals and, for every position
+// except (0, 0), either accumulates the cost of coding its color index
+// (calc_rate != 0) or writes a token into *t and advances *t (calc_rate == 0).
+// In the tokenizing mode the CDF is updated when allow_update_cdf is set, and
+// counts are updated only when CONFIG_ENTROPY_STATS is enabled.
+// Returns the accumulated rate when calc_rate is set, 0 otherwise.
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
+ int plane, int calc_rate, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ const uint8_t *const color_map = param->color_map;
+ MapCdf map_cdf = param->map_cdf;
+ ColorCost color_cost = param->color_cost;
+ const int plane_block_width = param->plane_width;
+ const int rows = param->rows;
+ const int cols = param->cols;
+ const int n = param->n_colors;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ int this_rate = 0;
+
+ // plane and counts are only used under CONFIG_ENTROPY_STATS.
+ (void)plane;
+ (void)counts;
+
+ // k = i + j selects an anti-diagonal; starting at k = 1 skips position (0, 0).
+ // Within a diagonal, j (column) runs from high to low, clamped to the map.
+ for (int k = 1; k < rows + cols - 1; ++k) {
+ for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ int i = k - j;
+ int color_new_idx;
+ const int color_ctx = av1_fast_palette_color_index_context(
+ color_map, plane_block_width, i, j, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (calc_rate) {
+ this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx];
+ } else {
+ // Emit one token and step the caller's token pointer forward.
+ (*t)->token = color_new_idx;
+ (*t)->color_ctx = color_ctx;
+ ++(*t);
+ if (allow_update_cdf)
+ update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+ if (plane) {
+ ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ } else {
+ ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ }
+#endif
+ }
+ }
+ }
+ if (calc_rate) return this_rate;
+ return 0;
+}
+
+// Fills params with the palette color map, CDF table, cost table, palette size
+// and block dimensions for the given plane of the current block. A non-zero
+// plane selects the UV tables; plane 0 selects the Y tables.
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mode_info = xd->mi[0];
+  const PALETTE_MODE_INFO *const palette_info = &mode_info->palette_mode_info;
+
+  params->color_map = xd->plane[plane].color_index_map;
+  if (plane) {
+    params->map_cdf = xd->tile_ctx->palette_uv_color_index_cdf;
+    params->color_cost = x->mode_costs.palette_uv_color_cost;
+  } else {
+    params->map_cdf = xd->tile_ctx->palette_y_color_index_cdf;
+    params->color_cost = x->mode_costs.palette_y_color_cost;
+  }
+  params->n_colors = palette_info->palette_size[plane];
+
+  // The plane height output is not needed here, hence the NULL.
+  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+                           &params->rows, &params->cols);
+}
+
+// TODO(any): Remove this function
+// Zeroes params and then fills it according to the color map type. Only
+// PALETTE_MAP is supported; any other type trips an assert and leaves params
+// zeroed. tx_size is unused.
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+                                 BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 COLOR_MAP_TYPE type,
+                                 Av1ColorMapParam *params) {
+  (void)tx_size;
+  memset(params, 0, sizeof(*params));
+  if (type == PALETTE_MAP) {
+    get_palette_params(x, plane, bsize, params);
+    return;
+  }
+  assert(0 && "Invalid color map type");
+}
+
+// Returns the rate of coding the color index map of the given plane (0 or 1)
+// without emitting any tokens or updating CDFs.
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &params);
+  // Rate-only mode: no token buffer and no counts are needed.
+  const int rate = cost_and_tokenize_map(&params, /*t=*/NULL, plane,
+                                         /*calc_rate=*/1,
+                                         /*allow_update_cdf=*/0,
+                                         /*counts=*/NULL);
+  return rate;
+}
+
+// Writes the tokens for the color index map of the given plane (0 or 1) into
+// the buffer at *t and advances *t past them. When allow_update_cdf is set the
+// color index CDFs are updated along the way.
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            FRAME_COUNTS *counts) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &params);
+
+  // The first color index does not use context or entropy.
+  TokenExtra *const first_token = *t;
+  first_token->token = params.color_map[0];
+  first_token->color_ctx = -1;
+  *t = first_token + 1;
+
+  // Tokenize the remaining positions; the return value is unused in this mode.
+  cost_and_tokenize_map(&params, t, plane, /*calc_rate=*/0, allow_update_cdf,
+                        counts);
+}
+
+// Records the transform block context for the transform block at
+// (blk_row, blk_col) of the given plane. For luma, when tx_size differs from
+// the size stored in mbmi->inter_tx_size for this position, the block is split
+// into sub-transform blocks (sub_tx_size_map) and the function recurses.
+// Chroma planes never recurse. arg must point to a struct tokenize_b_args;
+// its allow_update_cdf flag selects the CDF-updating variant of the recorder.
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+ int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ // Nothing to do for positions outside the limits reported by
+ // max_block_high / max_block_wide.
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ // Chroma uses the maximum UV transform size for the block; luma uses the
+ // per-position size stored in inter_tx_size.
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ struct tokenize_b_args *args = arg;
+ if (args->allow_update_cdf)
+ av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+ plane_bsize, tx_size, arg);
+ else
+ av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+
+ } else {
+ // Half the block size in transform block unit.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ // Number of transform units covered by one sub-block; used to advance
+ // the running block index.
+ const int step = bsw * bsh;
+ // Clamp the iteration range to the max_blocks_high / max_blocks_wide limits.
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+ arg);
+ block += step;
+ }
+ }
+ }
+}
+
+// Records transform block contexts for every plane of the current block by
+// calling tokenize_vartx on each maximum-size transform block.
+// Returns early, without touching *rate, when the block position lies outside
+// the frame or when the block is marked skip_txfm (in which case the entropy
+// context is reset). Otherwise, if rate is non-NULL, arg.this_rate is added to
+// *rate at the end.
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ // Field order: cpi, td, this_rate (starts at 0), allow_update_cdf, dry_run.
+ struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+
+ if (mbmi->skip_txfm) {
+ av1_reset_entropy_context(xd, bsize, num_planes);
+ return;
+ }
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Stop at the first chroma plane when this block is not a chroma reference.
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
+ // Process the block in units no larger than a (subsampled) 64x64 area.
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ // End (exclusive) of the current unit, clamped to the plane block.
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 0000000000..f675c489ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One palette token, packed into a single byte of bit-fields.
+//
+// The token and color_ctx members of the TokenExtra structure are used
+// to store the indices of color and color context of each pixel in
+// case of palette mode.
+// 1) token can take values in the range of [0, 7] as maximum number of possible
+// colors is 8 (PALETTE_COLORS). Hence token requires 3 bits (unsigned).
+// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the
+// most significant bits and token occupies the least significant bits of the
+// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is
+// defined as:
+// typedef struct {
+// int8_t color_ctx : 4;
+// uint8_t token : 3;
+// } TokenExtra;
+// then read of color_ctx requires an extra left shift to facilitate sign
+// extension and write of token requires an extra masking.
+// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e.,
+// from 0 to 4. As per the current implementation it can take values in the
+// range of [-1, 4]. Here -1 corresponds to invalid color index context and is
+// used for default initialization. Hence color_ctx requires 4 bits (signed).
+typedef struct {
+ // Color index of the pixel, in [0, 7].
+ uint8_t token : 3;
+ // Padding bit separating token from color_ctx (see note 2 above).
+ uint8_t reserved : 1;
+ // Color index context, in [-1, 4]; -1 means "no context".
+ int8_t color_ctx : 4;
+} TokenExtra;
+
+// A run of palette tokens: where it starts and how many tokens it holds.
+typedef struct {
+ // First token of the run.
+ TokenExtra *start;
+ // Number of tokens in the run.
+ unsigned int count;
+} TokenList;
+
+// Bookkeeping for the palette token buffers, indexed by tile row and column.
+typedef struct {
+ // Number of tile tokens for which memory is allocated.
+ unsigned int tokens_allocated;
+ // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith
+ // tile row, jth tile column.
+ TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of
+ // palette tokens for the kth superblock row of the ith tile row, jth tile
+ // column.
+ TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+} TokenInfo;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+// Run mode passed as dry_run to the tokenize functions.
+// NOTE(review): the names suggest OUTPUT_ENABLED is the real encode and the
+// DRY_RUN_* values are trial passes, with DRY_RUN_COSTCOEFFS additionally
+// accumulating coefficient cost - confirm against the callers.
+enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} UENUM1BYTE(RUN_TYPE);
+
+// Argument bundle handed (as void *) to the per-transform-block tokenize
+// callbacks.
+struct tokenize_b_args {
+ // Encoder instance.
+ const struct AV1_COMP *cpi;
+ // Per-thread data.
+ struct ThreadData *td;
+ // Rate accumulated by the callbacks; av1_tokenize_sb_vartx adds it to the
+ // caller's rate.
+ int this_rate;
+ // Non-zero selects the callback variant that also updates CDFs.
+ uint8_t allow_update_cdf;
+ // Run mode for this pass.
+ RUN_TYPE dry_run;
+};
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ struct FRAME_COUNTS *counts);
+
+// Returns the end-of-block limit for a transform of size tx_size: 0 when the
+// SEG_LVL_SKIP feature is active for the segment, otherwise the maximum eob
+// for that transform size.
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+                                 TX_SIZE tx_size) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    return 0;
+  }
+  return max_eob;
+}
+
+// Token buffer is only used for palette tokens.
+// Returns the number of tokens to allocate for a frame of mb_rows x mb_cols
+// macroblocks, given the superblock size (log2) and the number of planes.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Number of superblock rows/columns needed to cover the image, rounding up.
+  // The shift converts a count of 16-pixel units (1 << 4) to superblocks.
+  const int mb_to_sb_shift = sb_size_log2 - 4;
+  const int num_sb_rows = CEIL_POWER_OF_TWO(mb_rows, mb_to_sb_shift);
+  const int num_sb_cols = CEIL_POWER_OF_TWO(mb_cols, mb_to_sb_shift);
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_dim = 1 << sb_size_log2;
+  const int pixels_per_sb = sb_dim * sb_dim;
+  const int toks_per_sb = AOMMIN(2, num_planes) * pixels_per_sb;
+
+  return num_sb_rows * num_sb_cols * toks_per_sb;
+}
+
+// Allocate memory for token related info.
+// Records tokens_required in token_info, then allocates zero-initialized
+// storage for tokens_required tokens (stored in tile_tok[0][0]) and for the
+// token lists of every superblock row of every possible tile (stored in
+// tplist[0][0]). Allocation failures are reported through CHECK_MEM_ERROR.
+static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info,
+                                        unsigned int tokens_required) {
+  const int num_sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+
+  token_info->tokens_allocated = tokens_required;
+
+  // Token storage.
+  CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0],
+                  (TokenExtra *)aom_calloc(
+                      tokens_required, sizeof(*token_info->tile_tok[0][0])));
+
+  // One TokenList per superblock row for each of the
+  // MAX_TILE_ROWS * MAX_TILE_COLS tiles.
+  CHECK_MEM_ERROR(
+      cm, token_info->tplist[0][0],
+      (TokenList *)aom_calloc(num_sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                              sizeof(*token_info->tplist[0][0])));
+}
+
+// Check if memory allocation has been done for token related info.
+// True only when both the token buffer and the token list buffer are non-NULL.
+static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) {
+  const bool has_tokens = token_info->tile_tok[0][0] != NULL;
+  const bool has_lists = token_info->tplist[0][0] != NULL;
+  return has_tokens && has_lists;
+}
+
+// Free memory from token related variables.
+// Releases the token buffer and the token list buffer, clears both pointers
+// and resets the allocated-token count to 0.
+static AOM_INLINE void free_token_info(TokenInfo *token_info) {
+  TokenExtra **const tokens = &token_info->tile_tok[0][0];
+  TokenList **const lists = &token_info->tplist[0][0];
+
+  aom_free(*tokens);
+  *tokens = NULL;
+
+  aom_free(*lists);
+  *lists = NULL;
+
+  token_info->tokens_allocated = 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tpl_model.c b/third_party/aom/av1/encoder/tpl_model.c
new file mode 100644
index 0000000000..ca60e4981e
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.c
@@ -0,0 +1,2511 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
+// exp() with its argument range clamped: returns 0 for v < -700, DBL_MAX for
+// v > 700, and exp(v) otherwise.
+static INLINE double exp_bounded(double v) {
+  // When v > 700 or <-700, the exp function will be close to overflow
+  // For details, see the "Notes" in the following link.
+  // https://en.cppreference.com/w/c/numeric/math/exp
+  if (v < -700) return 0;
+  if (v > 700) return DBL_MAX;
+  return exp(v);
+}
+
+// Resets a TplTxfmStats: marks it not ready, sets the coefficient count to
+// 256, clears the block count and zeroes the first coeff_num entries of the
+// coefficient sum and mean arrays.
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) {
+  tpl_txfm_stats->ready = 0;
+  tpl_txfm_stats->coeff_num = 256;
+  tpl_txfm_stats->txfm_block_count = 0;
+
+  const size_t sum_bytes =
+      sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num;
+  const size_t mean_bytes =
+      sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num;
+  memset(tpl_txfm_stats->abs_coeff_sum, 0, sum_bytes);
+  memset(tpl_txfm_stats->abs_coeff_mean, 0, mean_bytes);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// Adds the block count and per-coefficient absolute sums of sub_stats into
+// accumulated_stats. The number of coefficients processed is taken from
+// accumulated_stats->coeff_num.
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats) {
+  accumulated_stats->txfm_block_count += sub_stats->txfm_block_count;
+
+  int coeff = 0;
+  while (coeff < accumulated_stats->coeff_num) {
+    accumulated_stats->abs_coeff_sum[coeff] += sub_stats->abs_coeff_sum[coeff];
+    ++coeff;
+  }
+}
+
+// Folds one transform block into the running statistics: adds
+// |coeff[k]| / LOSSLESS_Q_STEP to each of the first coeff_num sums and bumps
+// the block count.
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff) {
+  // For transform larger than 16x16, the scale of coeff need to be adjusted.
+  // It's not LOSSLESS_Q_STEP.
+  assert(tpl_txfm_stats->coeff_num <= 256);
+  for (int k = 0; k < tpl_txfm_stats->coeff_num; ++k) {
+    const double scaled_abs = abs(coeff[k]) / (double)LOSSLESS_Q_STEP;
+    tpl_txfm_stats->abs_coeff_sum[k] += scaled_abs;
+  }
+  tpl_txfm_stats->txfm_block_count += 1;
+}
+
+// Derives the per-coefficient mean from the accumulated sums. With no recorded
+// blocks the stats are marked not ready and the means are left untouched;
+// otherwise each mean is sum / block count and the stats are marked ready.
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) {
+  if (!(txfm_stats->txfm_block_count > 0)) {
+    txfm_stats->ready = 0;
+    return;
+  }
+
+  for (int k = 0; k < txfm_stats->coeff_num; ++k) {
+    txfm_stats->abs_coeff_mean[k] =
+        txfm_stats->abs_coeff_sum[k] / txfm_stats->txfm_block_count;
+  }
+  txfm_stats->ready = 1;
+}
+
+// Copies *tpl_txfm_stats by value into slot frame_index of
+// tpl_data->txfm_stats_list.
+static AOM_INLINE void av1_tpl_store_txfm_stats(
+    TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats,
+    const int frame_index) {
+  TplTxfmStats *const slot = &tpl_data->txfm_stats_list[frame_index];
+  *slot = *tpl_txfm_stats;
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Quantizes coeff with the FP quantizer (DCT_DCT scan order) and reports the
+// resulting end-of-block position in *eob, the reconstruction error between
+// coeff and dqcoeff in *recon_error, and the block error's sse output in *sse.
+// Both error values are right-shifted by `shift` (0 for TX_32X32, 2 otherwise)
+// and clamped to be at least 1.
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+ const tran_low_t *coeff,
+ tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, TX_SIZE tx_size,
+ uint16_t *eob, int64_t *recon_error,
+ int64_t *sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ // Number of coefficients (pixels) in a transform block of this size.
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+ QUANT_PARAM quant_param;
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Pick the high-bitdepth or regular quantizer/error pair based on the
+ // current buffer.
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+ scan_order, &quant_param);
+ *recon_error =
+ av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ }
+#else
+ (void)xd;
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // Keep both outputs strictly positive.
+ *recon_error = AOMMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = AOMMAX(*sse, 1);
+}
+
+// Writes the fixed TPL block-size configuration: *block_mis_log2 = 2 and
+// *tpl_bsize_1d = 16.
+static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2,
+                                                uint8_t *tpl_bsize_1d) {
+  // tpl stats bsize: 2 means 16x16
+  const uint8_t stats_block_mis_log2 = 2;
+  // Block size used in tpl motion estimation
+  const uint8_t motion_block_size_1d = 16;
+
+  *block_mis_log2 = stats_block_mis_log2;
+  *tpl_bsize_1d = motion_block_size_1d;
+  // MIN_TPL_BSIZE_1D = 16;
+  assert(*tpl_bsize_1d >= 16);
+}
+
+// Sets up the TPL state in ppi->tpl_data.
+// Always: fixes the stats block size, computes the border size, and fills in
+// the dimensions of all MAX_LENGTH_TPL_FRAME_STATS stats buffers.
+// Only when lag_in_frames > 1: allocates txfm_stats_list and, for each of the
+// first lag_in_frames frames, a stats pool and a reconstruction frame buffer.
+// A failed frame buffer allocation is reported via aom_internal_error with
+// AOM_CODEC_MEM_ERROR.
+void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ TplParams *const tpl_data = &ppi->tpl_data;
+ set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+ &tpl_data->tpl_bsize_1d);
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ // Border = TPL block size plus interpolation margin on both sides, rounded
+ // up to a multiple of 32 (1 << 5).
+ tpl_data->border_in_pixels =
+ ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
+
+ const int alloc_y_plane_only =
+ ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0;
+ for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame];
+ tpl_frame->is_valid = 0;
+ // Stats dimensions are the aligned mi dimensions in units of TPL blocks.
+ tpl_frame->width = mi_cols >> block_mis_log2;
+ tpl_frame->height = mi_rows >> block_mis_log2;
+ tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width;
+ tpl_frame->mi_rows = mi_params->mi_rows;
+ tpl_frame->mi_cols = mi_params->mi_cols;
+ }
+ tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+
+ // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory
+ // allocations are avoided for buffers in tpl_data.
+ if (lag_in_frames <= 1) return;
+
+ AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list,
+ aom_calloc(MAX_LENGTH_TPL_FRAME_STATS,
+ sizeof(*tpl_data->txfm_stats_list)));
+
+ for (int frame = 0; frame < lag_in_frames; ++frame) {
+ // One stats entry per TPL block of the frame.
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, tpl_data->tpl_stats_pool[frame],
+ aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+ tpl_data->tpl_stats_buffer[frame].height,
+ sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+
+ if (aom_alloc_frame_buffer(
+ &tpl_data->tpl_rec_pool[frame], width, height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, tpl_data->border_in_pixels,
+ byte_alignment, 0, alloc_y_plane_only))
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+}
+
+// Returns the SATD of the residual between src and dst for a bw x bh block:
+// the residual is written to src_diff, transformed into coeff (Hadamard
+// disabled), and aom_satd is taken over all bw * bh coefficients.
+static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info,
+                                            int16_t *src_diff, int diff_stride,
+                                            const uint8_t *src, int src_stride,
+                                            const uint8_t *dst, int dst_stride,
+                                            tran_low_t *coeff, int bw, int bh,
+                                            TX_SIZE tx_size) {
+  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+                     dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
+
+  const int num_coeffs = bw * bh;
+  return aom_satd(coeff, num_coeffs);
+}
+
+// Estimates the rate of the first eob quantized coefficients in DCT_DCT scan
+// order. Each coefficient contributes get_msb(|level| + 1) + 1, plus one more
+// when the level is non-zero; the total (starting from 1) is scaled by
+// AV1_PROB_COST_SHIFT.
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+
+  int bits = 1;
+  for (int pos = 0; pos < eob; ++pos) {
+    const unsigned int level = abs(qcoeff[scan_order->scan[pos]]);
+    bits += get_msb(level + 1) + 1;
+    if (level > 0) bits += 1;
+  }
+
+  return bits << AV1_PROB_COST_SHIFT;
+}
+
+// Computes rate and distortion for one block: forms the residual between src
+// and dst, transforms it, quantizes it (plane 0) and estimates the rate from
+// the quantized coefficients. Outputs go to *rate_cost, *recon_error and *sse.
+// When do_recon is non-zero the dequantized coefficients are inverse
+// transformed into dst.
+static AOM_INLINE void txfm_quant_rdcost(
+    const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+    int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+    int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) {
+  const MACROBLOCKD *const mbd = &x->e_mbd;
+  const BitDepthInfo depth_info = get_bit_depth_info(mbd);
+
+  // Residual -> transform coefficients.
+  av1_subtract_block(depth_info, bh, bw, src_diff, diff_stride, src,
+                     src_stride, dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, depth_info, src_diff, bw, coeff);
+
+  // Quantize and measure the error, then estimate the rate.
+  uint16_t end_of_block;
+  get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &end_of_block,
+                     recon_error, sse);
+  *rate_cost = rate_estimator(qcoeff, end_of_block, tx_size);
+
+  if (!do_recon) return;
+
+  av1_inverse_transform_block(mbd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+                              dst_stride, end_of_block, 0);
+}
+
+// Runs a motion search for one block of cur_frame_buf against ref_frame_buf,
+// starting from center_mv. A full-pixel search is always performed; a
+// sub-pixel refinement follows unless tpl_sf->subpel_force_stop is FULL_PEL.
+// The resulting motion vector is written to *best_mv and the value returned
+// by the last search stage is returned.
+// Side effect: overwrites the plane-0 source and prediction buffer pointers,
+// strides and widths in x / x->e_mbd.
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ int ref_stride, int width, int ref_width,
+ BLOCK_SIZE bsize, MV center_mv,
+ int_mv *best_mv) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int step_param;
+ uint32_t bestsme = UINT_MAX;
+ FULLPEL_MV_STATS best_mv_stats;
+ int distortion;
+ uint32_t sse;
+ int cost_list[5];
+ FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ x->plane[0].src.width = width;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = ref_stride;
+ xd->plane[0].pre[0].width = ref_width;
+
+ step_param = tpl_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ // Use the search site config whose stride matches the reference buffer:
+ // try SS_CFG_SRC first and fall back to SS_CFG_LOOKAHEAD.
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+ if (search_site_cfg->stride != ref_stride)
+ search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ assert(search_site_cfg->stride == ref_stride);
+
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, search_site_cfg,
+ tpl_sf->search_method,
+ /*fine_search_interval=*/0);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv->as_fullmv, &best_mv_stats, NULL);
+
+ // When sub-pel motion search is skipped, populate sub-pel precision MV and
+ // return.
+ if (tpl_sf->subpel_force_stop == FULL_PEL) {
+ best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ return bestsme;
+ }
+
+ // Sub-pixel refinement: default parameters with the stop precision taken
+ // from the speed features, a 2-tap subpel search type and no MV cost.
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+ cost_list);
+ ms_params.forced_stop = tpl_sf->subpel_force_stop;
+ ms_params.var_params.subpel_search_type = USE_2_TAPS;
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv,
+ &distortion, &sse, NULL);
+
+ return bestsme;
+}
+
+// A candidate motion vector paired with its SAD; compare_sad orders these by
+// ascending sad.
+typedef struct {
+ // Candidate motion vector.
+ int_mv mv;
+ // SAD associated with mv.
+ int sad;
+} center_mv_t;
+
+// qsort-style comparator ordering center_mv_t entries by ascending sad.
+// Returns -1, 0 or 1 when a's sad is less than, equal to or greater than b's.
+static int compare_sad(const void *a, const void *b) {
+  // Compare directly instead of subtracting: a signed difference can overflow
+  // for extreme values. The elements are only read, so keep them const.
+  const int sad_a = ((const center_mv_t *)a)->sad;
+  const int sad_b = ((const center_mv_t *)b)->sad;
+  return (sad_a > sad_b) - (sad_a < sad_b);
+}
+
+// Returns 1 when candidate_mv is within the similarity threshold of any of the
+// first center_mvs_count entries of center_mvs in both row and column,
+// otherwise 0. skip_alike_starting_mv (0, 1 or 2) selects the threshold.
+static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs,
+                       int center_mvs_count, int skip_alike_starting_mv) {
+  // MV difference threshold is in 1/8 precision.
+  const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+  const int threshold = mv_diff_thr[skip_alike_starting_mv];
+
+  for (int k = 0; k < center_mvs_count; ++k) {
+    const center_mv_t *const center = &center_mvs[k];
+    if (abs(center->mv.as_mv.col - candidate_mv.as_mv.col) >= threshold)
+      continue;
+    if (abs(center->mv.as_mv.row - candidate_mv.as_mv.row) >= threshold)
+      continue;
+    return 1;
+  }
+
+  return 0;
+}
+
+ static void get_rate_distortion( // Predict each plane of one block with best_mode, run txfm_quant_rdcost(), and sum rate/errors.
+ int *rate_cost, int64_t *recon_error, int64_t *pred_error, // outputs: summed rate, reconstruction error, prediction error (sse)
+ int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, // scratch buffers handed to txfm_quant_rdcost()
+ tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], // reference frames (only read for inter modes); per-plane destination buffers
+ const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
+ int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon, // do_recon is forwarded unchanged to txfm_quant_rdcost()
+ TplTxfmStats *tpl_txfm_stats) { // may be NULL; only consulted under CONFIG_BITRATE_ACCURACY
+ const SequenceHeader *seq_params = cm->seq_params;
+ *rate_cost = 0; // outputs are reset here: rate to 0, both errors to 1
+ *recon_error = 1;
+ *pred_error = 1;
+
+ (void)tpl_txfm_stats; // silences the unused-parameter warning when CONFIG_BITRATE_ACCURACY is off
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ int is_compound = (best_mode == NEW_NEWMV); // NEW_NEWMV is the only two-reference mode handled here
+ int num_planes = use_y_only_rate_distortion ? 1 : MAX_MB_PLANE; // luma only, or every plane
+
+ uint8_t *src_buffer_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_buffer,
+ xd->cur_buf->u_buffer,
+ xd->cur_buf->v_buffer,
+ };
+ const int src_stride_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_stride,
+ xd->cur_buf->uv_stride,
+ xd->cur_buf->uv_stride,
+ };
+
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ BLOCK_SIZE bsize_plane = // block size of this plane after applying its chroma subsampling
+ av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
+ [pd->subsampling_y];
+
+ int dst_buffer_stride = rec_stride_pool[plane];
+ int dst_mb_offset = // offset of the block origin inside this plane's destination buffer
+ ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) +
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+ uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset;
+ for (int ref = 0; ref < 1 + is_compound; ++ref) { // one pass for intra/single reference, two for compound
+ if (!is_inter_mode(best_mode)) {
+ av1_predict_intra_block( // dst_buffer is passed both as the reference and as the output buffer
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize_plane], block_size_high[bsize_plane],
+ max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0,
+ FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer,
+ dst_buffer_stride, 0, 0, plane);
+ } else {
+ int_mv best_mv = xd->mi[0]->mv[ref]; // MV is read from the current mode info, not passed as a parameter
+ uint8_t *ref_buffer_pool[MAX_MB_PLANE] = {
+ ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->u_buffer,
+ ref_frame_ptr[ref]->v_buffer,
+ };
+ InterPredParams inter_pred_params;
+ struct buf_2d ref_buf = {
+ NULL, ref_buffer_pool[plane],
+ plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width,
+ plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height,
+ plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride
+ };
+ av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane],
+ (mi_row * MI_SIZE) >> pd->subsampling_y,
+ (mi_col * MI_SIZE) >> pd->subsampling_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], &ref_buf, kernel); // NOTE(review): scale factors of reference 0 are used for both refs — confirm intended
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
+ &best_mv.as_mv, &inter_pred_params);
+ }
+ }
+
+ int src_stride = src_stride_pool[plane];
+ int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) + // same block origin, in the source frame
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+
+ int this_rate = 1;
+ int64_t this_recon_error = 1;
+ int64_t sse;
+ txfm_quant_rdcost( // fills this_rate, this_recon_error and sse for this plane
+ x, src_diff, block_size_wide[bsize_plane],
+ src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer,
+ dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
+ do_recon, &this_rate, &this_recon_error, &sse);
+
+ #if CONFIG_BITRATE_ACCURACY
+ if (plane == 0 && tpl_txfm_stats) {
+ // We only collect Y plane's transform coefficient
+ av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+ }
+ #endif // CONFIG_BITRATE_ACCURACY
+
+ *recon_error += this_recon_error; // accumulate this plane on top of the initial value of 1
+ *pred_error += sse;
+ *rate_cost += this_rate;
+ }
+ }
+
+ static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, // Returns the SAD or SATD cost of predicting the luma block from reference rf_idx at *rfidx_mv.
+ const uint8_t *src_mb_buffer, // source pixels of the block
+ int src_stride,
+ TplBuffers *tpl_tmp_buffers, // supplies the predictor8, src_diff and coeff scratch buffers
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int mi_row, int mi_col, int rf_idx, // rf_idx indexes tpl_data->src_ref_frame[]
+ MV *rfidx_mv, int use_pred_sad) { // use_pred_sad != 0 selects SAD, otherwise SATD
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ const YV12_BUFFER_CONFIG *const ref_frame_ptr =
+ tpl_data->src_ref_frame[rf_idx];
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ int32_t inter_cost;
+
+ if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) { // sub-pel MVs possible: build a predictor into the scratch buffer first
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+ ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+ ref_frame_ptr->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &tpl_data->sf, &ref_buf, kernel);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv, // the predictor is written with a stride of bw
+ &inter_pred_params);
+
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride,
+ predictor, bw);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+ } else {
+ int ref_mb_offset = // co-located block origin in the reference luma plane
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ int ref_stride = ref_frame_ptr->y_stride;
+ const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv);
+ // Since sub-pel motion search is not performed, use the prediction pixels
+ // directly from the reference block ref_mb
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col],
+ ref_stride, coeff, bw, bh, tx_size);
+ }
+ }
+ return inter_cost;
+ }
+
+ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, // Searches intra, single-reference and compound modes for one block and fills *tpl_stats.
+ TplTxfmStats *tpl_txfm_stats, // forwarded only to the final get_rate_distortion() call
+ TplBuffers *tpl_tmp_buffers, // scratch buffers: predictor8, src_diff, coeff, qcoeff, dqcoeff
+ MACROBLOCK *x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ TplDepStats *tpl_stats) { // output; zeroed below before being filled
+ AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+
+ (void)gf_group;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+
+ int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index; // used as the frame argument for every third-pass lookup below
+
+ int32_t best_intra_cost = INT32_MAX;
+ int32_t intra_cost;
+ PREDICTION_MODE best_mode = DC_PRED;
+
+ const int mb_y_offset = // block origin inside the source luma plane
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+ const int src_stride = xd->cur_buf->y_stride;
+ const int src_width = xd->cur_buf->y_width;
+
+ int dst_mb_offset = // block origin inside the reconstructed luma plane
+ mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+ uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+ int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+ int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion;
+
+ uint8_t *rec_buffer_pool[3] = {
+ tpl_frame->rec_picture->y_buffer,
+ tpl_frame->rec_picture->u_buffer,
+ tpl_frame->rec_picture->v_buffer,
+ };
+
+ const int rec_stride_pool[3] = {
+ tpl_frame->rec_picture->y_stride,
+ tpl_frame->rec_picture->uv_stride,
+ tpl_frame->rec_picture->uv_stride,
+ };
+
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { // copy the source frame's chroma subsampling into the chroma plane descriptors
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ pd->subsampling_x = xd->cur_buf->subsampling_x;
+ pd->subsampling_y = xd->cur_buf->subsampling_y;
+ }
+
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff;
+ tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ int64_t recon_error = 1;
+ int64_t pred_error = 1;
+
+ memset(tpl_stats, 0, sizeof(*tpl_stats));
+ tpl_stats->ref_frame_index[0] = -1; // -1 means "no reference chosen"; overwritten at the end for inter modes
+ tpl_stats->ref_frame_index[1] = -1;
+
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+ // Intra prediction search
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+ // Pre-load the bottom left line.
+ if (xd->left_available &&
+ mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+ for (int i = 0; i < bw; ++i) // replicate the last left-column pixel of the block into the bw rows below it
+ dst[(bw + i) * dst_buffer_stride - 1] =
+ dst[(bw - 1) * dst_buffer_stride - 1];
+ } else {
+ for (int i = 0; i < bw; ++i)
+ dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+ dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+ }
+ }
+
+ // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+ // H_PRED, and V_PRED
+ const PREDICTION_MODE last_intra_mode =
+ tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+ const SequenceHeader *seq_params = cm->seq_params;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
+ ++mode) {
+ av1_predict_intra_block(xd, seq_params->sb_size,
+ seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize],
+ tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ if (tpl_frame->use_pred_sad) {
+ intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, predictor, bw);
+ } else {
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ if (intra_cost < best_intra_cost) { // strict '<' keeps the earliest mode on ties
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+ // Calculate SATD of the best intra mode if SAD was used for mode decision
+ // as best_intra_cost is used in ML model to skip intra mode evaluation.
+ if (tpl_frame->use_pred_sad) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0,
+ 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0,
+ 0, 0);
+ best_intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ int rate_cost = 1;
+
+ if (cpi->use_ducky_encode) { // the intra_* statistics are only produced on this path
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+
+ tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_rate = rate_cost;
+ }
+
+ if (cpi->third_pass_ctx && // third-pass info exists for this frame: also try its intra mode if the loop above skipped it
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ PREDICTION_MODE third_pass_mode = this_mi->pred_mode;
+
+ if (third_pass_mode >= last_intra_mode && // only intra modes not already covered by the search loop
+ third_pass_mode < INTRA_MODE_END) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size,
+ third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = third_pass_mode;
+ }
+ }
+ }
+
+ // Motion compensated prediction
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+ xd->mi[0]->compound_idx = 1;
+
+ int best_rf_idx = -1;
+ int_mv best_mv[2];
+ int32_t inter_cost;
+ int32_t best_inter_cost = INT32_MAX;
+ int rf_idx;
+ int_mv single_mv[INTER_REFS_PER_FRAME]; // best single-reference MV per reference; seeds the compound search below
+
+ best_mv[0].as_int = INVALID_MV;
+ best_mv[1].as_int = INVALID_MV;
+
+ for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
+ single_mv[rf_idx].as_int = INVALID_MV;
+ if (tpl_data->ref_frame[rf_idx] == NULL || // skip references that are not available
+ tpl_data->src_ref_frame[rf_idx] == NULL) {
+ tpl_stats->mv[rf_idx].as_int = INVALID_MV;
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
+ const int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ const int ref_stride = ref_frame_ptr->y_stride;
+ const int ref_width = ref_frame_ptr->y_width;
+
+ int_mv best_rfidx_mv = { 0 };
+ uint32_t bestsme = UINT32_MAX;
+
+ center_mv_t center_mvs[4] = { { { 0 }, INT_MAX }, // slot 0 starts as the zero MV; up to three neighbour MVs may follow
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX } };
+ int refmv_count = 1;
+ int idx;
+
+ if (xd->up_available) { // candidate from the block above
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->left_available) { // candidate from the block to the left
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { // candidate from the block above-right, when inside the tile
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col + mi_width, tpl_frame->stride,
+ block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w,
+ rf_idx + LAST_FRAME);
+ if (tp_mv.as_int != INVALID_MV &&
+ !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1, // compared against the neighbour candidates only (slot 0 excluded)
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[0].mv = tp_mv; // the third-pass MV replaces the zero-MV candidate in slot 0
+ }
+ }
+
+ // Prune starting mvs
+ if (tpl_sf->prune_starting_mv && refmv_count > 1) {
+ // Get each center mv's sad.
+ for (idx = 0; idx < refmv_count; ++idx) {
+ FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
+ clamp_fullmv(&mv, &x->mv_limits);
+ center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
+ ref_stride);
+ }
+
+ // Rank center_mv using sad.
+ qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+
+ refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count); // keep at most 4 - prune_starting_mv of the best candidates
+ // Further reduce number of refmv based on sad difference.
+ if (refmv_count > 1) {
+ int last_sad = center_mvs[refmv_count - 1].sad;
+ int second_to_last_sad = center_mvs[refmv_count - 2].sad;
+ if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad) // i.e. the last SAD is more than 20% above the one before it
+ refmv_count--;
+ }
+ }
+
+ for (idx = 0; idx < refmv_count; ++idx) { // run motion search from every surviving start point, keep the lowest result
+ int_mv this_mv;
+ uint32_t thissme = motion_estimation(
+ cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width,
+ ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_rfidx_mv = this_mv;
+ }
+ }
+
+ tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
+ single_mv[rf_idx] = best_rfidx_mv;
+
+ inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad);
+ // Store inter cost for each ref frame. This is used to prune inter modes.
+ tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
+
+ if (inter_cost < best_inter_cost) {
+ best_rf_idx = rf_idx;
+
+ best_inter_cost = inter_cost;
+ best_mv[0].as_int = best_rfidx_mv.as_int;
+ }
+ }
+ // Calculate SATD of the best inter mode if SAD was used for mode decision
+ // as best_inter_cost is used in ML model to skip intra mode evaluation.
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) {
+ assert(best_rf_idx != -1);
+ best_inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */);
+ }
+
+ if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) { // single-reference inter beats intra
+ best_mode = NEWMV;
+ xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME;
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ }
+
+ // Start compound prediction search.
+ int comp_ref_frames[3][2] = { // candidate reference pairs, as indices into tpl_data->ref_frame[]
+ { 0, 4 },
+ { 0, 6 },
+ { 3, 6 },
+ };
+
+ int start_rf = 0;
+ int end_rf = 3;
+ if (!tpl_sf->allow_compound_pred) end_rf = 0; // empty range: the compound loop below is skipped
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ if (this_mi->ref_frame[0] >= LAST_FRAME &&
+ this_mi->ref_frame[1] >= LAST_FRAME) {
+ int found = 0;
+ for (int i = 0; i < 3; i++) {
+ if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] &&
+ comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found || !tpl_sf->allow_compound_pred) {
+ comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME; // slot 2 is overwritten with the third-pass reference pair
+ comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME;
+ if (!tpl_sf->allow_compound_pred) { // compound otherwise disabled: search only the third-pass pair
+ start_rf = 2;
+ end_rf = 3;
+ }
+ }
+ }
+ }
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+ int best_cmp_rf_idx = -1;
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
+ int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
+ int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
+
+ if (tpl_data->ref_frame[rf_idx0] == NULL || // both references of the pair must be available
+ tpl_data->src_ref_frame[rf_idx0] == NULL ||
+ tpl_data->ref_frame[rf_idx1] == NULL ||
+ tpl_data->src_ref_frame[rf_idx1] == NULL) {
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ tpl_data->src_ref_frame[rf_idx0],
+ tpl_data->src_ref_frame[rf_idx1],
+ };
+
+ xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME;
+ xd->mi[0]->mode = NEW_NEWMV;
+ const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame);
+ // Set up ref_mv for av1_joint_motion_search().
+ CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1];
+
+ struct buf_2d yv12_mb[2][MAX_MB_PLANE];
+ for (int i = 0; i < 2; ++i) {
+ av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i],
+ xd->block_ref_scale_factors[i],
+ xd->block_ref_scale_factors[i], MAX_MB_PLANE);
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ xd->plane[plane].pre[i] = yv12_mb[i][plane];
+ }
+ }
+
+ int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] }; // joint search starts from the two single-reference results
+ int rate_mv;
+ av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv,
+ !cpi->sf.mv_sf.disable_second_mv,
+ NUM_JOINT_ME_REFINE_ITER);
+
+ for (int ref = 0; ref < 2; ++ref) { // build the compound predictor, one reference at a time, into predictor
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->y_width,
+ ref_frame_ptr[ref]->y_height,
+ ref_frame_ptr[ref]->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
+ 0, &tpl_data->sf, &ref_buf, kernel);
+ av1_init_comp_mode(&inter_pred_params);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv,
+ &inter_pred_params);
+ }
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ if (inter_cost < best_inter_cost) { // compound must beat the best single-reference cost to be kept
+ best_cmp_rf_idx = cmp_rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv[0] = tmp_mv[0];
+ best_mv[1] = tmp_mv[1];
+ }
+ }
+
+ if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) {
+ best_mode = NEW_NEWMV;
+ const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0];
+ const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1];
+ xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
+ }
+
+ if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) { // inter mode won: measure rate/distortion against the src_ref_frame references
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : tpl_data->src_ref_frame[best_rf_idx],
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL,
+ };
+ rate_cost = 1;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 0 /*do_recon*/, NULL);
+ tpl_stats->srcrf_rate = rate_cost;
+ }
+
+ best_intra_cost = AOMMAX(best_intra_cost, 1);
+ best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); // the stored inter cost never exceeds the intra cost
+ tpl_stats->inter_cost = best_inter_cost;
+ tpl_stats->intra_cost = best_intra_cost;
+
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+
+ // Final encode
+ rate_cost = 0;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2];
+
+ ref_frame_ptr[0] = // this pass reads tpl_data->ref_frame[], not src_ref_frame[]
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx]
+ : NULL;
+ ref_frame_ptr[1] =
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/,
+ tpl_txfm_stats);
+
+ tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_rate = rate_cost;
+
+ if (!is_inter_mode(best_mode)) { // intra block: the srcrf_* statistics simply mirror the final-encode results
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_rate = rate_cost;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ }
+
+ tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); // recrf_* is kept at or above srcrf_*
+ tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
+
+ if (best_mode == NEW_NEWMV) { // compound: two extra passes, each taking one reference from ref_frame[] and the other from src_ref_frame[]
+ ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[0] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[0] = // clamp into the range [srcrf, recrf]
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ tpl_stats->cmp_recrf_dist[0] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ rate_cost = 0;
+ ref_frame_ptr[0] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[1] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]);
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]);
+ }
+
+ if (best_mode == NEWMV) { // record which reference(s) and MV(s) the winning inter mode used
+ tpl_stats->mv[best_rf_idx] = best_mv[0];
+ tpl_stats->ref_frame_index[0] = best_rf_idx;
+ tpl_stats->ref_frame_index[1] = NONE_FRAME;
+ } else if (best_mode == NEW_NEWMV) {
+ tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0];
+ tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1];
+ tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0];
+ tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1];
+ }
+
+ for (int idy = 0; idy < mi_height; ++idy) { // point every mi unit of the block that passes the edge checks at xd->mi[0]
+ for (int idx = 0; idx < mi_width; ++idx) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) {
+ xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0];
+ }
+ }
+ }
+ }
+
+ // Divides ref_pos by bsize_pix rounding toward negative infinity (for a
+ // positive bsize_pix), instead of C's default truncation toward zero.
+ static int round_floor(int ref_pos, int bsize_pix) {
+   if (ref_pos >= 0) return ref_pos / bsize_pix;
+
+   // Negative positions: shift by one so exact multiples land on their own
+   // quotient and everything else rounds down.
+   return -(1 + (-ref_pos - 1) / bsize_pix);
+ }
+
+ // Returns the area shared by two width x height rectangles whose top-left
+ // corners are (row_a, col_a) and (row_b, col_b); 0 when they do not overlap.
+ int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                          int height) {
+   const int top = AOMMAX(row_a, row_b);
+   const int bottom = AOMMIN(row_a + height, row_b + height);
+   const int left = AOMMAX(col_a, col_b);
+   const int right = AOMMIN(col_a + width, col_b + width);
+
+   // An empty intersection in either dimension means no shared pixels.
+   if (top >= bottom || left >= right) return 0;
+
+   return (bottom - top) * (right - left);
+ }
+
+ // Maps an (mi_row, mi_col) position to a linear index: both coordinates are
+ // right-shifted by right_shift, then combined row-major using stride.
+ int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
+   const int stats_row = mi_row >> right_shift;
+   const int stats_col = mi_col >> right_shift;
+   return stats_row * stride + stats_col;
+ }
+
+ // Converts delta_rate into a rate cost using beta = srcrf_dist / recrf_dist
+ // and the block's pixel count pix_num.
+ // Returns delta_rate unchanged when srcrf_dist is at most 128; otherwise
+ // returns a log2-based cost scaled back up by
+ // (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT) bits.
+ int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                             int64_t srcrf_dist, int pix_num) {
+   // Small source-reference distortion: pass the rate through untouched.
+   if (srcrf_dist <= 128) return delta_rate;
+
+   const double beta = (double)srcrf_dist / recrf_dist;
+   const double dr =
+       (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) /
+       pix_num;
+   const double log_den = log(beta) / log(2.0) + 2.0 * dr;
+   int64_t scaled_cost;
+
+   if (log_den > log(10.0) / log(2.0)) {
+     // Above log2(10) the cost is computed from beta alone.
+     scaled_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0);
+   } else {
+     const double num = pow(2.0, log_den);
+     const double den = num * beta + (1 - beta) * beta;
+     scaled_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0);
+   }
+
+   scaled_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+   return scaled_cost;
+ }
+
+ // Propagates the dependency cost of one TPL stats block back to the blocks
+ // it predicts from, through reference slot `ref` (0 or 1) of its chosen mode.
+ //
+ //   tpl_data        TPL state holding the per-frame stats arrays.
+ //   mi_row, mi_col  position of the block in mi units within frame frame_idx.
+ //   bsize           block size of one stats unit.
+ //   frame_idx       index of the current frame in tpl_data->tpl_frame[].
+ //   ref             which of the block's two reference slots to follow.
+ //
+ // Returns without doing anything when the block has no reference in slot
+ // `ref`, or when that reference does not map to a TPL frame.
+ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
+                                           int mi_col, const BLOCK_SIZE bsize,
+                                           int frame_idx, int ref) {
+   TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
+   TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
+   TplDepFrame *tpl_frame = tpl_data->tpl_frame;
+   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+   // NOTE(review): the stride is read from tpl_frame[0], not from frame_idx;
+   // presumably every frame shares the same stride — confirm.
+   TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos(
+       mi_row, mi_col, tpl_frame->stride, block_mis_log2)];
+
+   int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0;
+
+   if (tpl_stats_ptr->ref_frame_index[ref] < 0) return;
+   const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref];
+
+   // Validate the mapped frame index BEFORE using it: indexing tpl_frame[]
+   // with a negative value and reading ->tpl_stats_ptr from the result would
+   // be an out-of-bounds access.
+   const int ref_map_index =
+       tpl_frame[frame_idx].ref_map_index[ref_frame_index];
+   if (ref_map_index < 0) return;
+
+   TplDepFrame *ref_tpl_frame = &tpl_frame[ref_map_index];
+   TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr;
+
+   const FULLPEL_MV full_mv =
+       get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv);
+   // Top-left corner, in pixels, of the area this block predicts from.
+   const int ref_pos_row = mi_row * MI_SIZE + full_mv.row;
+   const int ref_pos_col = mi_col * MI_SIZE + full_mv.col;
+
+   const int bw = 4 << mi_size_wide_log2[bsize];
+   const int bh = 4 << mi_size_high_log2[bsize];
+   const int mi_height = mi_size_high[bsize];
+   const int mi_width = mi_size_wide[bsize];
+   const int pix_num = bw * bh;
+
+   // top-left on grid block location in pixel
+   int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+   int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+   int block;
+
+   // For compound blocks the "source" side is the stat of the other reference.
+   int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
+                                    : tpl_stats_ptr->srcrf_dist;
+   int64_t srcrf_rate =
+       is_compound
+           ? (tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2)
+           : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+
+   int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
+   int64_t mc_dep_dist =
+       (int64_t)(tpl_stats_ptr->mc_dep_dist *
+                 ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
+                  tpl_stats_ptr->recrf_dist));
+   int64_t delta_rate =
+       (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;
+   int64_t mc_dep_rate =
+       av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+                           srcrf_dist, pix_num);
+
+   // The referenced area can straddle up to four grid-aligned blocks (2x2);
+   // each receives a share proportional to its overlap with that area.
+   for (block = 0; block < 4; ++block) {
+     int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+     int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+     if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+         grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+       int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col,
+                                               ref_pos_row, ref_pos_col, bw, bh);
+       int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+       int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+       assert((1 << block_mis_log2) == mi_height);
+       assert((1 << block_mis_log2) == mi_width);
+       TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+           ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)];
+       des_stats->mc_dep_dist +=
+           ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+       des_stats->mc_dep_rate +=
+           ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
+     }
+   }
+ }
+
+ // Runs tpl_model_update_b() for the block at (mi_row, mi_col) of frame
+ // 'frame_idx', once with the trailing argument set to 0 and once with it set
+ // to 1. The block size is derived from tpl_stats_block_mis_log2.
+ static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row,
+                                         int mi_col, int frame_idx) {
+   const int block_len = MI_SIZE << tpl_data->tpl_stats_block_mis_log2;
+   const BLOCK_SIZE stats_bsize = convert_length_to_bsize(block_len);
+
+   for (int ref = 0; ref < 2; ++ref) {
+     tpl_model_update_b(tpl_data, mi_row, mi_col, stats_bsize, frame_idx, ref);
+   }
+ }
+
+ // Copies 'src_stats' into the entry of 'tpl_stats_ptr' addressed by
+ // (mi_row, mi_col) and then raises every cost, distortion and rate field
+ // listed below to at least 1.
+ static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row,
+                                        int mi_col, int stride,
+                                        const TplDepStats *src_stats,
+                                        uint8_t block_mis_log2) {
+   TplDepStats *const dst = &tpl_stats_ptr[av1_tpl_ptr_pos(
+       mi_row, mi_col, stride, block_mis_log2)];
+
+   *dst = *src_stats;
+
+   // Single-reference statistics.
+   dst->intra_cost = AOMMAX(1, dst->intra_cost);
+   dst->inter_cost = AOMMAX(1, dst->inter_cost);
+   dst->srcrf_dist = AOMMAX(1, dst->srcrf_dist);
+   dst->srcrf_sse = AOMMAX(1, dst->srcrf_sse);
+   dst->recrf_dist = AOMMAX(1, dst->recrf_dist);
+   dst->srcrf_rate = AOMMAX(1, dst->srcrf_rate);
+   dst->recrf_rate = AOMMAX(1, dst->recrf_rate);
+
+   // Compound statistics, entries 0 and 1.
+   for (int ref = 0; ref < 2; ++ref) {
+     dst->cmp_recrf_dist[ref] = AOMMAX(1, dst->cmp_recrf_dist[ref]);
+   }
+   for (int ref = 0; ref < 2; ++ref) {
+     dst->cmp_recrf_rate[ref] = AOMMAX(1, dst->cmp_recrf_rate[ref]);
+   }
+ }
+
+ // Clear every entry of tpl_data->ref_frame[] and tpl_data->src_ref_frame[]
+ // (INTER_REFS_PER_FRAME entries each) to NULL.
+ static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) {
+   int ref_idx = 0;
+   while (ref_idx < INTER_REFS_PER_FRAME) {
+     tpl_data->src_ref_frame[ref_idx] = NULL;
+     tpl_data->ref_frame[ref_idx] = NULL;
+     ++ref_idx;
+   }
+ }
+
+ // Returns the GF group size, capped at MAX_TPL_FRAME_IDX - 1.
+ static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) {
+   return AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1);
+ }
+
+ /* Initialize the mc_flow parameters used in computing tpl data.
+  *
+  * Prepares the encoder state for processing TPL frame 'frame_idx':
+  *  - records frame_idx in tpl_data and resets the ref/source frame pointers,
+  *  - fills tpl_data->ref_frame[] / src_ref_frame[] from the frames named by
+  *    tpl_frame->ref_map_index[], then sets ref_frame[] entries to NULL when
+  *    their bit is absent from ref_frame_flags or when
+  *    prune_ref_by_selective_ref_frame() reports them as pruned,
+  *  - derives the rd multiplier, error-per-bit and sad-per-bit values and
+  *    initializes the frame quantizer for 'base_qindex',
+  *  - stores base_rdmult and use_pred_sad on the TPL frame and marks it valid.
+  *
+  * 'pframe_qindex' is used as the base qindex unless cpi->use_ducky_encode is
+  * set, in which case gf_group->q_val[frame_idx] is used instead.
+  */
+ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+ int pframe_qindex) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
+ const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+ uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame,
+ tpl_sf->prune_ref_frames_in_tpl, frame_idx);
+ int gop_length = get_gop_length(gf_group);
+ int ref_frame_flags;
+ AV1_COMMON *cm = &cpi->common;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+ tpl_data->frame_idx = frame_idx;
+ tpl_reset_src_ref_frames(tpl_data);
+ av1_tile_init(&xd->tile, cm, 0, 0);
+
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));  // capped at 15
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);  // capped at 6
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Setup scaling factor. The same crop dimensions are passed twice to
+ // av1_setup_scale_factors_for_frame().
+ av1_setup_scale_factors_for_frame(
+ &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+
+ xd->cur_buf = this_frame;
+
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ TplDepFrame *tpl_ref_frame =
+ &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]];
+ tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture;
+ tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture;
+ ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index;
+ }
+
+ // Store the reference frames based on priority order
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ ref_frames_ordered[i] =
+ tpl_data->ref_frame[ref_frame_priority_order[i] - 1];  // priority entries are offset by one relative to ref_frame[] indices
+ }
+
+ // Work out which reference frame slots may be used.
+ ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi),
+ ref_frames_ordered, cpi->ext_flags.ref_frame_flags);
+
+ enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
+ tpl_frame->frame_display_index);
+
+ // Prune reference frames: bit 'idx' of ref_frame_flags gates ref_frame[idx].
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((ref_frame_flags & (1 << idx)) == 0) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+
+ // Skip motion estimation w.r.t. reference frames which are not
+ // considered in RD search, using "selective_ref_frame" speed feature.
+ // The reference frame pruning is not enabled for frames beyond the gop
+ // length, as there are fewer reference frames and the reference frames
+ // differ from the frames considered during RD search.
+ if (ref_pruning_enabled && (frame_idx < gop_length)) {
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME };
+ if (prune_ref_by_selective_ref_frame(cpi, NULL, refs,
+ ref_frame_display_indices)) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+ }
+
+ // Make a temporary mbmi for tpl model
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;  // points at a local; reset to NULL before returning
+
+ xd->block_ref_scale_factors[0] = &tpl_data->sf;
+ xd->block_ref_scale_factors[1] = &tpl_data->sf;
+
+ const int base_qindex =
+ cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex;
+ // Get rd multiplier set up.
+ rdmult = (int)av1_compute_rd_mult(
+ base_qindex, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ if (rdmult < 1) rdmult = 1;  // keep the multiplier strictly positive
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->quant_params.base_qindex = base_qindex;
+ av1_frame_init_quantizer(cpi);
+
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+ tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
+ bd_info.bit_depth, update_type, base_qindex) /
+ 6;
+
+ if (cpi->use_ducky_encode)
+ tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx];  // overrides the value computed above
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ // Initialize x->mbmi_ext when compound predictions are enabled.
+ if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext);
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ assert(xd->mi == &mbmi_ptr);
+ xd->mi = NULL;
+
+ // Tpl module is called before the setting of speed features at frame level.
+ // Thus, turning off this speed feature for key frame is done here and not
+ // integrated into the speed feature setting itself.
+ const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0;
+ tpl_frame->use_pred_sad =
+ tpl_sf->use_sad_for_mode_decision &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->layer_depth[frame_idx] >= layer_depth_th;
+ }
+
+ // This function stores the motion estimation dependencies of all the blocks in
+ // a row.
+ //
+ // For every TPL block of size 'bsize' in the row starting at 'mi_row' it:
+ //   1. waits on the row-sync read hook,
+ //   2. (multi-threaded builds only) returns early if tpl_mt_exit is set,
+ //   3. sets the horizontal motion-vector limits and block-to-edge distances,
+ //   4. runs mode_estimation() and stores the resulting stats in the current
+ //      TPL frame,
+ //   5. signals completion through the row-sync write hook.
+ //
+ // The current TPL frame is selected by tpl_data->frame_idx.
+ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
+                                TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+                                int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+   AV1_COMMON *const cm = &cpi->common;
+   MultiThreadInfo *const mt_info = &cpi->mt_info;
+   AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+   const CommonModeInfoParams *const mi_params = &cm->mi_params;
+   const int mi_width = mi_size_wide[bsize];
+   TplParams *const tpl_data = &cpi->ppi->tpl_data;
+   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+   MACROBLOCKD *xd = &x->e_mbd;
+
+   const int tplb_cols_in_tile =
+       ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+   const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+   assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+   assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+   for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+        mi_col += mi_width, tplb_col_in_tile++) {
+     (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  tplb_col_in_tile);
+
+ #if CONFIG_MULTITHREAD
+     if (mt_info->num_workers > 1) {
+       pthread_mutex_lock(tpl_row_mt->mutex_);
+       const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit;
+       pthread_mutex_unlock(tpl_row_mt->mutex_);
+       // Exit in case any worker has encountered an error.
+       if (tpl_mt_exit) return;
+     }
+ #endif
+
+     TplDepStats tpl_stats;
+
+     // Motion estimation column boundary
+     av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+                           tpl_data->border_in_pixels);
+     xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+     // The distance to the right edge is converted from mode-info units with
+     // MI_SIZE before the sub-pel conversion, exactly like the left edge above
+     // and the top/bottom edges set in mc_flow_dispenser(). The conversion was
+     // previously missing here.
+     xd->mb_to_right_edge =
+         GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
+     mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col,
+                     bsize, tx_size, &tpl_stats);
+
+     // Motion flow dependency dispenser.
+     tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
+                     &tpl_stats, tpl_data->tpl_stats_block_mis_log2);
+     (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                   tplb_col_in_tile, tplb_cols_in_tile);
+   }
+ }
+
+ // Single-threaded driver: walks the frame one row of TPL blocks at a time,
+ // sets the vertical motion-vector limits and block-to-edge distances for that
+ // row, and hands the row to av1_mc_flow_dispenser_row().
+ static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
+   ThreadData *const thread_data = &cpi->td;
+   MACROBLOCK *const mb = &thread_data->mb;
+   MACROBLOCKD *const mbd = &mb->e_mbd;
+   const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+   const BLOCK_SIZE tpl_bsize =
+       convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+   const TX_SIZE tpl_tx_size = max_txsize_lookup[tpl_bsize];
+   const int row_step = mi_size_high[tpl_bsize];
+
+   int mi_row = 0;
+   while (mi_row < mi_params->mi_rows) {
+     // Motion estimation row boundary
+     av1_set_mv_row_limits(mi_params, &mb->mv_limits, mi_row, row_step,
+                           cpi->ppi->tpl_data.border_in_pixels);
+
+     const int mis_below = mi_params->mi_rows - row_step - mi_row;
+     mbd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+     mbd->mb_to_bottom_edge = GET_MV_SUBPEL(mis_below * MI_SIZE);
+
+     av1_mc_flow_dispenser_row(cpi, &thread_data->tpl_txfm_stats,
+                               &thread_data->tpl_tmp_buffers, mb, mi_row,
+                               tpl_bsize, tpl_tx_size);
+     mi_row += row_step;
+   }
+ }
+
+ // Calls tpl_model_update() for every TPL block of frame 'frame_idx', scanning
+ // the mi_rows x mi_cols grid in raster order. Frame index 0 is not processed.
+ static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows,
+                                 int mi_cols) {
+   if (frame_idx == 0) return;
+
+   const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+   const int row_step = mi_size_high[bsize];
+   const int col_step = mi_size_wide[bsize];
+   assert(row_step == (1 << tpl_data->tpl_stats_block_mis_log2));
+   assert(col_step == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+   int mi_row = 0;
+   while (mi_row < mi_rows) {
+     int mi_col = 0;
+     while (mi_col < mi_cols) {
+       tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
+       mi_col += col_step;
+     }
+     mi_row += row_step;
+   }
+ }
+
+ static AOM_INLINE void init_gop_frames_for_tpl(
+ AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+ GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) {
+ AV1_COMMON *cm = &cpi->common;
+ assert(cpi->gf_frame_index == 0);
+ *pframe_qindex = 0;
+ /* Overview: fills tpl_data->tpl_frame[] for one TPL group.
+  *  - Entries at negative indices (-1 .. -REF_FRAMES) are seeded from
+  *    cm->ref_frame_map[], or cleared when the initial frame is a key frame.
+  *  - The first loop covers the frames of the GF group (up to
+  *    get_gop_length()).
+  *  - The second loop appends extra LF_UPDATE frames from the lookahead, up to
+  *    MAX_TPL_FRAME_IDX entries in total.
+  * Outputs: *tpl_group_frames receives the number of frames set up and
+  * *pframe_qindex the q_val of the last LF_UPDATE frame seen in the first loop
+  * (0 if none). gf_group->update_type[] and q_val[] are written for the
+  * appended frames.
+  */
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+
+ int remapped_ref_idx[REF_FRAMES];
+
+ EncodeFrameParams frame_params = *init_frame_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+
+ int ref_picture_map[REF_FRAMES];  // reference slot -> index into tpl_frame[] (may be negative)
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (frame_params.frame_type == KEY_FRAME) {
+ tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+ } else {
+ tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].frame_display_index =
+ cm->ref_frame_map[i]->display_order_hint;
+ }
+
+ ref_picture_map[i] = -i - 1;
+ }
+
+ *tpl_group_frames = 0;
+
+ int gf_index;
+ int process_frame_count = 0;  // next free entry in tpl_rec_pool / tpl_stats_pool
+ const int gop_length = get_gop_length(gf_group);
+
+ for (gf_index = 0; gf_index < gop_length; ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+ int lookahead_index =
+ gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index];
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE ||
+ frame_update_type == OVERLAY_UPDATE;
+ frame_params.frame_type = gf_group->frame_type[gf_index];
+
+ if (frame_update_type == LF_UPDATE)
+ *pframe_qindex = gf_group->q_val[gf_index];
+
+ const struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+ if (buf == NULL) break;  // no lookahead entry at this index: stop here
+ tpl_frame->gf_picture = &buf->img;
+
+ // Use filtered frame buffer if available. This will make tpl stats more
+ // precise.
+ FRAME_DIFF frame_diff;
+ const YV12_BUFFER_CONFIG *tf_buf =
+ av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff);
+ if (tf_buf != NULL) {
+ tpl_frame->gf_picture = tf_buf;
+ }
+
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'lookahead_index' is frame offset within the gf group.
+ // 'lookahead_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ lookahead_index + cm->current_frame.frame_number;
+ assert(buf->display_idx ==
+ cpi->frame_index_set.show_frame_count + lookahead_index);
+
+ // Overlay frames do not take a reconstruction buffer or stats buffer.
+ if (frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ ++process_frame_count;
+ }
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+
+ // Make the frames marked as is_frame_non_ref to non-reference frames.
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0;
+
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ // Translate each reference (LAST_FRAME..ALTREF_FRAME) into a tpl_frame[]
+ // index. This uses ref_picture_map as it was before this frame's refresh.
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ }
+
+ const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL;
+ int extend_frame_count = 0;
+ int extend_frame_length = AOMMIN(
+ tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval);
+
+ int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
+ gf_group->arf_src_offset[gop_length - 1] + 1;
+
+ // Append up to 'extend_frame_length' further frames, each treated as LF_UPDATE.
+ for (;
+ gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length;
+ ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE;
+ frame_params.frame_type = INTER_FRAME;
+
+ int lookahead_index = frame_display_index;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+
+ if (buf == NULL) break;
+
+ tpl_frame->gf_picture = &buf->img;
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'frame_display_index' is frame offset within the gf group.
+ // 'frame_display_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ frame_display_index + cm->current_frame.frame_number;
+
+ ++process_frame_count;
+
+ gf_group->update_type[gf_index] = LF_UPDATE;  // note: writes into the caller's gf_group
+
+ #if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *pframe_qindex = cpi->oxcf.rc_cfg.cq_level;
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+ // TODO(angiebird): Find a more adaptive method to decide pframe_qindex
+ // override the pframe_qindex in the second pass when bitrate accuracy
+ // is on. We found that setting this pframe_qindex make the tpl stats
+ // more stable.
+ *pframe_qindex = 128;
+ }
+ }
+ #endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ gf_group->q_val[gf_index] = *pframe_qindex;
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ // For appended frames these four references are forced to index -1.
+ tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1;
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ ++frame_display_index;
+ }
+ }
+
+ // Resets the TPL state: clears the 'ready' flag, re-derives the stats block
+ // size, marks every entry of tpl_stats_buffer as invalid and zero-fills every
+ // allocated stats pool.
+ void av1_init_tpl_stats(TplParams *const tpl_data) {
+   int idx;
+
+   tpl_data->ready = 0;
+   set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+                            &tpl_data->tpl_bsize_1d);
+
+   idx = 0;
+   while (idx < MAX_LENGTH_TPL_FRAME_STATS) {
+     tpl_data->tpl_stats_buffer[idx].is_valid = 0;
+     ++idx;
+   }
+
+   for (idx = 0; idx < MAX_LAG_BUFFERS; ++idx) {
+     // Pools that were never allocated are left alone.
+     if (tpl_data->tpl_stats_pool[idx] != NULL) {
+       TplDepFrame *const frame = &tpl_data->tpl_stats_buffer[idx];
+       memset(tpl_data->tpl_stats_pool[idx], 0,
+              frame->height * frame->width * sizeof(*frame->tpl_stats_ptr));
+     }
+   }
+ }
+
+ // Returns the is_valid flag of TPL frame 'gf_frame_index', or 0 when the TPL
+ // data is not ready or the index is outside the TPL buffer.
+ int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
+   // A sub-GOP longer than the TPL buffer capacity disables the TPL related
+   // functions for the frames that do not fit.
+   const int stats_available =
+       tpl_data->ready != 0 && gf_frame_index < MAX_TPL_FRAME_IDX;
+
+   if (!stats_available) return 0;
+   return tpl_data->tpl_frame[gf_frame_index].is_valid;
+ }
+
+ // Maps the two dependency factors in 'beta' to a GOP-length decision for the
+ // given 'gop_eval' mode. Unknown modes return 2.
+ static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+   if (gop_eval == 1) {
+     // Allow larger GOP size if the base layer ARF has higher dependency
+     // factor than the intermediate ARF and both ARFs have reasonably high
+     // dependency factors.
+     return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+   }
+
+   if (gop_eval == 2) {
+     // Don't shorten the gf interval
+     if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) return 1;
+     // Shorten the gf interval
+     if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) return 0;
+     // Cannot decide the gf interval, so redo the tpl stats calculation.
+     return 2;
+   }
+
+   if (gop_eval == 3) return beta[0] > 1.1;
+
+   return 2;
+ }
+
+ // TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down
+ // the scope of input arguments.
+ //
+ // Fills gf_group->q_val[] for every frame from cpi->gf_frame_index to the end
+ // of the GF group using av1_rc_pick_q_and_bounds(). Before each call it sets
+ // cm->current_frame.frame_type and cm->show_frame for that frame. Does nothing
+ // when cpi->use_ducky_encode is set.
+ void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
+                                  const EncodeFrameParams *const frame_params) {
+   if (cpi->use_ducky_encode) return;
+
+   AV1_COMMON *const cm = &cpi->common;
+   GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+   int q_low, q_high;  // bounds reported by the picker; not used here
+
+   cm->current_frame.frame_type = frame_params->frame_type;
+
+   int gf_index = cpi->gf_frame_index;
+   while (gf_index < gf_group->size) {
+     const int is_arf = gf_group->update_type[gf_index] == ARF_UPDATE ||
+                        gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+     cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+     cm->show_frame = !is_arf;
+     gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds(
+         cpi, cm->width, cm->height, gf_index, &q_low, &q_high);
+     ++gf_index;
+   }
+ }
+
+ // Returns 1 when TPL stats calculation should be skipped for 'frame_idx' and
+ // 0 otherwise.
+ static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group,
+                                          int frame_idx, int gop_eval,
+                                          int approx_gop_eval,
+                                          int reduce_num_frames) {
+   const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx];
+   const int past_gop_end = frame_idx >= get_gop_length(gf_group);
+
+   // Overlay frames are always skipped.
+   switch (update_type) {
+     case INTNL_OVERLAY_UPDATE:
+     case OVERLAY_UPDATE: return 1;
+     default: break;
+   }
+
+   if (approx_gop_eval) {
+     // When gop_eval is set to 2, tpl stats calculation is done for ARFs from
+     // base layer, (base+1) layer and (base+2) layer. When gop_eval is set to
+     // 3, tpl stats calculation is limited to ARFs from base layer and
+     // (base+1) layer.
+     const int max_arf_layers = (gop_eval == 2) ? 3 : 2;
+
+     // Skip higher layer frames and frames beyond gop length.
+     if (past_gop_end || gf_group->layer_depth[frame_idx] > max_arf_layers)
+       return 1;
+   }
+
+   // With 'reduce_num_frames', LF_UPDATE frames inside the gop are skipped.
+   if (reduce_num_frames && update_type == LF_UPDATE && !past_gop_end) return 1;
+
+   return 0;
+ }
+
+ // Builds the TPL statistics for the current GF group.
+ //
+ // Steps:
+ //   1. Set up the list of TPL frames (init_gop_frames_for_tpl()).
+ //   2. Forward pass: run motion estimation / mode estimation per frame and
+ //      store the per-block stats (single- or multi-threaded).
+ //   3. Backward pass: propagate dependencies with mc_flow_synthesizer().
+ //   4. Restore the frame-level state for the first frame of the group.
+ //
+ // Must be called with cpi->gf_frame_index == 0.
+ //
+ // Return value:
+ //   - 0 when superres is enabled (stats are only reset), for large-scale
+ //     tiles, or when 'gop_eval' is 0;
+ //   - 1 when gf_group->max_layer_depth_allowed is 0;
+ //   - otherwise the decision of eval_gop_length() for 'gop_eval'.
+ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+                         const EncodeFrameParams *const frame_params) {
+ #if CONFIG_COLLECT_COMPONENT_TIMING
+   start_timing(cpi, av1_tpl_setup_stats_time);
+ #endif
+   assert(cpi->gf_frame_index == 0);
+   AV1_COMMON *cm = &cpi->common;
+   MultiThreadInfo *const mt_info = &cpi->mt_info;
+   AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+   GF_GROUP *gf_group = &cpi->ppi->gf_group;
+   EncodeFrameParams this_frame_params = *frame_params;
+   TplParams *const tpl_data = &cpi->ppi->tpl_data;
+   int approx_gop_eval = (gop_eval > 1);
+
+   if (cpi->superres_mode != AOM_SUPERRES_NONE) {
+     assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
+     av1_init_tpl_stats(tpl_data);
+ #if CONFIG_COLLECT_COMPONENT_TIMING
+     // Close the timing interval on this return path as well.
+     end_timing(cpi, av1_tpl_setup_stats_time);
+ #endif
+     return 0;
+   }
+
+   cm->current_frame.frame_type = frame_params->frame_type;
+   for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+        ++gf_index) {
+     cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+     av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                                  gf_group->update_type[gf_index],
+                                  gf_group->refbuf_state[gf_index], 0);
+
+     memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
+            sizeof(cpi->refresh_frame));
+   }
+
+   int pframe_qindex;
+   int tpl_gf_group_frames;
+   init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames,
+                           &pframe_qindex);
+
+   cpi->ppi->p_rc.base_layer_qp = pframe_qindex;
+
+   av1_init_tpl_stats(tpl_data);
+
+   TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
+   if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
+     aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+                        "Error allocating tpl data");
+   }
+
+   // Default to the no-op sync hooks; the multi-threaded path below installs
+   // the real ones.
+   tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
+   tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
+
+   av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+                                     cm->width, cm->height);
+
+   if (frame_params->frame_type == KEY_FRAME) {
+     av1_init_mv_probs(cm);
+   }
+   av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+                     cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
+
+   const int num_planes =
+       cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
+   // As tpl module is called before the setting of speed features at frame
+   // level, turning off this speed feature for the first GF group of the
+   // key-frame interval is done here.
+   int reduce_num_frames =
+       cpi->sf.tpl_sf.reduce_num_frames &&
+       gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+       gf_group->max_layer_depth > 2;
+   // TPL processing is skipped for frames of type LF_UPDATE when
+   // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a factor
+   // to adjust r0 is used. The value of 1.6 corresponds to using ~60% of the
+   // frames in the gf group on an average.
+   tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0;
+
+   // Forward pass: gather per-block stats for every frame that is not skipped.
+   for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
+        ++frame_idx) {
+     if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+                            reduce_num_frames))
+       continue;
+
+     init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+     if (mt_info->num_workers > 1) {
+       tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
+       tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write;
+       av1_mc_flow_dispenser_mt(cpi);
+     } else {
+       mc_flow_dispenser(cpi);
+     }
+ #if CONFIG_BITRATE_ACCURACY
+     av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats);
+     av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx);
+ #endif  // CONFIG_BITRATE_ACCURACY
+ #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+     if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+       int frame_coding_idx =
+           av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx);
+       rc_log_frame_stats(&cpi->rc_log, frame_coding_idx,
+                          &cpi->td.tpl_txfm_stats);
+     }
+ #endif  // CONFIG_RATECTRL_LOG
+
+     aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
+                              num_planes);
+   }
+
+   // Backward propagation from the last TPL frame down to the first.
+   for (int frame_idx = tpl_gf_group_frames - 1;
+        frame_idx >= cpi->gf_frame_index; --frame_idx) {
+     if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+                            reduce_num_frames))
+       continue;
+
+     mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
+                         cm->mi_params.mi_cols);
+   }
+
+   // Restore the buffer-update configuration of the first frame in the group.
+   // The fourth argument is the reference-buffer state, matching the call in
+   // the loop above; the update type used to be passed here by mistake.
+   av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+                                gf_group->update_type[cpi->gf_frame_index],
+                                gf_group->refbuf_state[cpi->gf_frame_index], 0);
+   cm->current_frame.frame_type = frame_params->frame_type;
+   cm->show_frame = frame_params->show_frame;
+
+ #if CONFIG_COLLECT_COMPONENT_TIMING
+   // Record the time if the function returns.
+   if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 ||
+       !gop_eval)
+     end_timing(cpi, av1_tpl_setup_stats_time);
+ #endif
+
+   tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+
+   if (!approx_gop_eval) {
+     tpl_data->ready = 1;
+   }
+   if (cpi->common.tiles.large_scale) return 0;
+   if (gf_group->max_layer_depth_allowed == 0) return 1;
+   if (!gop_eval) return 0;
+   assert(gf_group->arf_index >= 0);
+
+   double beta[2] = { 0.0 };
+   const int frame_idx_0 = gf_group->arf_index;
+   const int frame_idx_1 =
+       AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
+   beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0);
+   beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1);
+ #if CONFIG_COLLECT_COMPONENT_TIMING
+   end_timing(cpi, av1_tpl_setup_stats_time);
+ #endif
+   return eval_gop_length(beta, gop_eval);
+ }
+
+ // Computes one rdmult scaling factor per 16x16 block of the current frame
+ // from its TPL stats and stores them in cpi->tpl_rdmult_scaling_factors[].
+ //
+ // For each 16x16 block, 'intra_cost' sums recrf_dist (scaled by RDDIV_BITS)
+ // over the TPL blocks inside the frame, and 'mc_dep_cost' adds the RD cost of
+ // the propagated rate/distortion on top of it. The factor is
+ //   (intra_cost / mc_dep_cost) / cpi->rd.r0 + c, with c = 1.2.
+ //
+ // Returns without touching the factors when the frame index is outside the
+ // TPL buffer or the TPL frame is not valid.
+ void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
+   const AV1_COMMON *const cm = &cpi->common;
+   const int tpl_idx = cpi->gf_frame_index;
+
+   assert(
+       IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size));
+
+   // The TPL frame array cannot be indexed at or beyond MAX_TPL_FRAME_IDX; the
+   // same bound is enforced by av1_tpl_stats_ready() and
+   // av1_tpl_rdmult_setup_sb().
+   if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+
+   TplParams *const tpl_data = &cpi->ppi->tpl_data;
+   const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+
+   if (!tpl_frame->is_valid) return;
+
+   const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
+   const int tpl_stride = tpl_frame->stride;
+   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+   const int block_size = BLOCK_16X16;
+   const int num_mi_w = mi_size_wide[block_size];
+   const int num_mi_h = mi_size_high[block_size];
+   const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+   const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+   const double c = 1.2;
+   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+   // Loop through each 'block_size' X 'block_size' block.
+   for (int row = 0; row < num_rows; row++) {
+     for (int col = 0; col < num_cols; col++) {
+       double intra_cost = 0.0, mc_dep_cost = 0.0;
+       // Loop through each mi block.
+       for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h;
+            mi_row += step) {
+         for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w;
+              mi_col += step) {
+           // Positions outside the frame contribute nothing.
+           if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue;
+           const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+               mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+           int64_t mc_dep_delta =
+               RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                      this_stats->mc_dep_dist);
+           intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS);
+           mc_dep_cost +=
+               (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+         }
+       }
+       const double rk = intra_cost / mc_dep_cost;
+       const int index = row * num_cols + col;
+       cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
+     }
+   }
+ }
+
+ // Derives the per-16x16 rdmult scaling factors for one superblock.
+ //
+ // The factors of the 16x16 blocks covered by the superblock at
+ // (mi_row, mi_col) are read from cpi->tpl_rdmult_scaling_factors[]. Each is
+ // multiplied by a common adjustment and written to
+ // cpi->ppi->tpl_sb_rdmult_scaling_factors[]. The adjustment is
+ //   exp_bounded(log(new_rdmult / orig_rdmult) - mean(log(factor)))
+ // where new_rdmult includes x->rdmult_delta_qindex and orig_rdmult does not.
+ //
+ // Returns early (no output written) when the frame index is outside the TPL
+ // buffer, the TPL frame is invalid, the frame is not TPL eligible, or an AQ
+ // mode is active.
+ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   GF_GROUP *gf_group = &cpi->ppi->gf_group;
+   assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                  cpi->gf_frame_index < cpi->ppi->gf_group.size));
+   const int tpl_idx = cpi->gf_frame_index;
+
+   const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+   const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+   const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+   if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+   TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
+   if (!tpl_frame->is_valid) return;
+   if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+   if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+   const int mi_col_sr =
+       coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+   const int sb_mi_width_sr = coded_to_superres_mi(
+       mi_size_wide[sb_size], cm->superres_scale_denominator);
+
+   const int bsize_base = BLOCK_16X16;
+   const int num_mi_w = mi_size_wide[bsize_base];
+   const int num_mi_h = mi_size_high[bsize_base];
+   const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+   const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+   const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w;
+   const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
+
+   // Range of base blocks covered by this superblock, clipped to the frame.
+   // Rows are measured in units of the base block height and columns in units
+   // of its width. The divisors used to be swapped, which only worked because
+   // the base block is square.
+   const int row_start = mi_row / num_mi_h;
+   const int col_start = mi_col_sr / num_mi_w;
+   const int row_end = AOMMIN(num_rows, row_start + num_brows);
+   const int col_end = AOMMIN(num_cols, col_start + num_bcols);
+   int row, col;
+
+   double base_block_count = 0.0;
+   double log_sum = 0.0;
+
+   for (row = row_start; row < row_end; ++row) {
+     for (col = col_start; col < col_end; ++col) {
+       const int index = row * num_cols + col;
+       log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
+       base_block_count += 1.0;
+     }
+   }
+
+   const CommonQuantParams *quant_params = &cm->quant_params;
+
+   const int orig_qindex_rdmult =
+       quant_params->base_qindex + quant_params->y_dc_delta_q;
+   const int orig_rdmult = av1_compute_rd_mult(
+       orig_qindex_rdmult, cm->seq_params->bit_depth,
+       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+       is_stat_consumption_stage(cpi));
+
+   const int new_qindex_rdmult = quant_params->base_qindex +
+                                 x->rdmult_delta_qindex +
+                                 quant_params->y_dc_delta_q;
+   const int new_rdmult = av1_compute_rd_mult(
+       new_qindex_rdmult, cm->seq_params->bit_depth,
+       cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+       boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+       is_stat_consumption_stage(cpi));
+
+   const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
+
+   // Subtracting the mean log factor makes the output factors' geometric mean
+   // equal to 'scaling_factor', unless exp_bounded() clamps the result.
+   double scale_adj = log(scaling_factor) - log_sum / base_block_count;
+   scale_adj = exp_bounded(scale_adj);
+
+   for (row = row_start; row < row_end; ++row) {
+     for (col = col_start; col < col_end; ++col) {
+       const int index = row * num_cols + col;
+       cpi->ppi->tpl_sb_rdmult_scaling_factors[index] =
+           scale_adj * cpi->tpl_rdmult_scaling_factors[index];
+     }
+   }
+ }
+
+// Entropy (log2 based) of an exponential distribution with parameter b after
+// uniform quantization with step size q_step.
+double av1_exponential_entropy(double q_step, double b) {
+  // b is used as a divisor, so it is floored at TPL_EPSILON.
+  const double scale = AOMMAX(b, TPL_EPSILON);
+  // z is floored at TPL_EPSILON as well so that log2(z) stays finite.
+  const double z = fmax(exp_bounded(-q_step / scale), TPL_EPSILON);
+  const double complement = 1 - z;
+  return -log2(complement) - z * log2(z) / complement;
+}
+
+// Entropy (log2 based) of a Laplace distribution with parameter b after
+// non-uniform quantization: the zero bin is zero_bin_ratio * q_step wide,
+// every other bin is q_step wide.
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) {
+  // b is used as a divisor, so it is floored at TPL_EPSILON.
+  const double scale = AOMMAX(b, TPL_EPSILON);
+  // z is derived from half of the zero bin width, floored at TPL_EPSILON.
+  const double z =
+      fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / scale), TPL_EPSILON);
+  // Entropy contribution of the uniformly quantized (non-zero) bins.
+  const double h = av1_exponential_entropy(q_step, scale);
+  return -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1);
+}
+
+// Estimates the rate of a frame from per-position coefficient statistics:
+// the Laplace entropy of every coefficient position of one transform block is
+// summed and the sum is multiplied by block_count.
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+                                       const double *abs_coeff_mean,
+                                       int coeff_num) {
+  const double zero_bin_ratio = 2;
+  // Quantizer steps are looked up at 8-bit depth and divided by 4.
+  const double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  const double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+
+  double est_rate = 0;
+  // Position 0 is the DC coefficient and uses the DC step.
+  est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio);
+  // Every remaining position is an AC coefficient.
+  int pos = 1;
+  while (pos < coeff_num) {
+    est_rate +=
+        av1_laplace_entropy(ac_q_step, abs_coeff_mean[pos], zero_bin_ratio);
+    ++pos;
+  }
+  return est_rate * block_count;
+}
+
+// Estimates the cost (log2 based) of one quantized coefficient value under the
+// same quantizer layout as av1_laplace_entropy(): a zero bin that is
+// zero_bin_ratio * q_step wide and non-zero bins that are q_step wide.
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff) {
+  // b is used as a divisor, so it is floored at TPL_EPSILON.
+  const double scale = AOMMAX(b, TPL_EPSILON);
+  const int level = abs(qcoeff);
+  const double z0 =
+      fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / scale), TPL_EPSILON);
+
+  // A zero coefficient only pays for landing in the zero bin.
+  if (level == 0) return -log2(1 - z0);
+
+  // Non-zero coefficient: the leading constant 1, the cost of leaving the
+  // zero bin, and a term that grows with (level - 1).
+  const double z = fmax(exp_bounded(-q_step / scale), TPL_EPSILON);
+  return 1 - log2(z0) - log2(1 - z) - (level - 1) * log2(z);
+}
+
+// Estimates the cost of one transform block by summing
+// av1_estimate_coeff_entropy() over its coeff_num quantized coefficients.
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num) {
+  const double zero_bin_ratio = 2;
+  // Quantizer steps are looked up at 8-bit depth and divided by 4.
+  const double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  const double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+
+  double est_rate = 0;
+  // Position 0 is the DC coefficient and uses the DC step.
+  est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0],
+                                         zero_bin_ratio, qcoeff_arr[0]);
+  // Every remaining position is an AC coefficient.
+  int pos = 1;
+  while (pos < coeff_num) {
+    est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[pos],
+                                           zero_bin_ratio, qcoeff_arr[pos]);
+    ++pos;
+  }
+  return est_rate;
+}
+
+#if CONFIG_RD_COMMAND
+// Reads an RD command list from the text file at filepath into rd_command.
+//
+// File layout, as consumed below: a frame count, followed by one record per
+// frame. A record is an option value, followed by a q index when the option
+// is RD_OPTION_SET_Q, or by a q index and an rdmult when the option is
+// RD_OPTION_SET_Q_RDMULT.
+//
+// On return rd_command->frame_index is 0 and rd_command->frame_count is the
+// number of records that were read completely. That is 0 when the file cannot
+// be opened or the header is malformed, and it never exceeds
+// MAX_LENGTH_TPL_FRAME_STATS, the capacity of the per-frame arrays.
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) {
+  // Start from an empty command list so every early exit leaves rd_command in
+  // a consistent state.
+  rd_command->frame_count = 0;
+  rd_command->frame_index = 0;
+
+  FILE *fptr = fopen(filepath, "r");
+  if (fptr == NULL) return;
+
+  int frame_count = 0;
+  if (fscanf(fptr, "%d", &frame_count) != 1 || frame_count < 0) {
+    fclose(fptr);
+    return;
+  }
+  // Never index past the end of option_ls / q_index_ls / rdmult_ls.
+  frame_count = AOMMIN(frame_count, MAX_LENGTH_TPL_FRAME_STATS);
+
+  int parsed = 0;
+  for (; parsed < frame_count; ++parsed) {
+    int option;
+    // Stop at the first truncated or malformed record; the records before it
+    // stay valid.
+    if (fscanf(fptr, "%d", &option) != 1) break;
+    if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+      if (fscanf(fptr, "%d", &rd_command->q_index_ls[parsed]) != 1) break;
+    }
+    if (option == RD_OPTION_SET_Q_RDMULT) {
+      if (fscanf(fptr, "%d", &rd_command->rdmult_ls[parsed]) != 1) break;
+    }
+    rd_command->option_ls[parsed] = (RD_OPTION)option;
+  }
+  rd_command->frame_count = parsed;
+  fclose(fptr);
+}
+#endif // CONFIG_RD_COMMAND
+
+// Computes a frame-level importance value from the tpl stats of frame
+// gf_frame_index: the exponential of the weighted mean difference between
+// log(distortion + propagated cost) and log(distortion), where every block is
+// weighted by its srcrf_dist.
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+                                    int gf_frame_index) {
+  const TplDepFrame *const frame = &tpl_data->tpl_frame[gf_frame_index];
+  const TplDepStats *const stats_buf = frame->tpl_stats_ptr;
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  const int mi_step = 1 << block_mis_log2;
+
+  double weighted_log_dist = 0;
+  double weighted_log_dist_with_dep = 0;
+  // The weight total starts at 1, not 0.
+  double weight_total = 1;
+
+  for (int mi_row = 0; mi_row < frame->mi_rows; mi_row += mi_step) {
+    for (int mi_col = 0; mi_col < frame->mi_cols; mi_col += mi_step) {
+      const TplDepStats *const blk = &stats_buf[av1_tpl_ptr_pos(
+          mi_row, mi_col, frame->stride, block_mis_log2)];
+      const double weight = (double)blk->srcrf_dist;
+      // Rate-distortion cost propagated to this block.
+      const int64_t dep_cost =
+          RDCOST(frame->base_rdmult, blk->mc_dep_rate, blk->mc_dep_dist);
+      // Distortion scaled by RDDIV_BITS, floored at 1 so log() is defined.
+      double dist = (double)(blk->recrf_dist << RDDIV_BITS);
+      dist = AOMMAX(dist, 1);
+      weighted_log_dist += log(dist) * weight;
+      weighted_log_dist_with_dep += log(dist + dep_cost) * weight;
+      weight_total += weight;
+    }
+  }
+  return exp((weighted_log_dist_with_dep - weighted_log_dist) / weight_total);
+}
+
+// Returns the quantizer step ratio for frame gf_frame_index:
+// sqrt(1 / frame importance) when tpl stats are ready for that frame,
+// otherwise the neutral ratio 1.
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+  if (av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+    const double importance =
+        av1_tpl_get_frame_importance(tpl_data, gf_frame_index);
+    return sqrt(1 / importance);
+  }
+  return 1;
+}
+
+// Maps a quantizer step ratio to a q index.
+//
+// The target step is the DC quantizer step at leaf_qindex multiplied by
+// qstep_ratio. For a ratio below 1 the search walks down from leaf_qindex and
+// returns the first index whose step is <= the target (or 0 if none is). For
+// any other ratio it walks up and returns the first index whose step is >= the
+// target (or MAXQ if none is).
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth) {
+  const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+  const double target_qstep = leaf_qstep * qstep_ratio;
+  int qindex = leaf_qindex;
+  if (qstep_ratio < 1.0) {
+    for (qindex = leaf_qindex; qindex > 0; --qindex) {
+      const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (qstep <= target_qstep) break;
+    }
+  } else {
+    // The bound is `< MAXQ` rather than `<= MAXQ`: with the inclusive bound a
+    // target that no index reaches left the loop with qindex == MAXQ + 1,
+    // which is not a valid q index. Results for reachable targets are
+    // unchanged, because falling out of the loop now yields MAXQ itself.
+    for (qindex = leaf_qindex; qindex < MAXQ; ++qindex) {
+      const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (qstep >= target_qstep) break;
+    }
+  }
+  return qindex;
+}
+
+// Convenience wrapper: derives the qstep ratio of frame gf_frame_index from
+// the tpl stats and converts it to a q index relative to leaf_qindex.
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth) {
+  return av1_get_q_index_from_qstep_ratio(
+      leaf_qindex, av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index),
+      bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// Resets *vbr_rc_info and stores the sequence-level budget, the number of
+// shown frames, and the default per-update-type scale factors.
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+                     int show_frame_count) {
+  // Default multipliers applied to the estimated frame bitrate, indexed by
+  // FRAME_UPDATE_TYPE.
+  const double default_scale[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1,
+                                                     0.94559, 1,       1,
+                                                     0.94559 };
+
+  // TODO(angiebird): Based on the previous code, only the scale factor 0.94559
+  // will be used in most of the cases with --limit=17. Figure out if the
+  // following scale factors works better.
+  // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+  //                                                    1.10199, 1, 1,
+  //                                                    0.16393 };
+
+  // Default multipliers applied to the estimated motion vector bits, indexed
+  // by FRAME_UPDATE_TYPE.
+  const double default_mv_scale[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
+
+  av1_zero(*vbr_rc_info);
+  vbr_rc_info->ready = 0;
+  vbr_rc_info->total_bit_budget = total_bit_budget;
+  vbr_rc_info->show_frame_count = show_frame_count;
+
+  for (int type = 0; type < FRAME_UPDATE_TYPES; ++type) {
+    vbr_rc_info->scale_factors[type] = default_scale[type];
+    vbr_rc_info->mv_scale_factors[type] = default_mv_scale[type];
+  }
+
+  vbr_rc_reset_gop_data(vbr_rc_info);
+#if CONFIG_THREE_PASS
+  // TODO(angiebird): Explain why we use -1 here
+  vbr_rc_info->cur_gop_idx = -1;
+  vbr_rc_info->gop_count = 0;
+  vbr_rc_info->total_frame_count = 0;
+#endif  // CONFIG_THREE_PASS
+}
+
+#if CONFIG_THREE_PASS
+// Converts a frame index inside the current GOP into an index into the
+// sequence-wide per-frame lists: start index of the current GOP plus
+// gf_frame_index.
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+                                int gf_frame_index) {
+  const int cur_gop = vbr_rc_info->cur_gop_idx;
+  return vbr_rc_info->gop_start_idx_list[cur_gop] + gf_frame_index;
+}
+
+// Appends the per-frame data of one GOP (tpl_info) to the sequence-wide lists
+// in vbr_rc_info and records where that GOP starts and how long it is.
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+                                const TPL_INFO *tpl_info) {
+  const int gop_idx = vbr_rc_info->gop_count;
+  // The new GOP starts right after the frames that are already stored.
+  const int first = vbr_rc_info->total_frame_count;
+
+  vbr_rc_info->gop_start_idx_list[gop_idx] = first;
+  vbr_rc_info->gop_length_list[gop_idx] = tpl_info->gf_length;
+  assert(first + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES);
+
+  for (int src = 0, dst = first; src < tpl_info->gf_length; ++src, ++dst) {
+    vbr_rc_info->txfm_stats_list[dst] = tpl_info->txfm_stats_list[src];
+    vbr_rc_info->qstep_ratio_list[dst] = tpl_info->qstep_ratio_ls[src];
+    vbr_rc_info->update_type_list[dst] = tpl_info->update_type_list[src];
+  }
+
+  vbr_rc_info->total_frame_count += tpl_info->gf_length;
+  vbr_rc_info->gop_count++;
+}
+#endif // CONFIG_THREE_PASS
+
+// Stores the number of shown frames of the current GOP and gives the GOP a
+// share of the total bit budget proportional to that number.
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+                                   int gop_showframe_count) {
+  const double gop_bits = vbr_rc_info->total_bit_budget * gop_showframe_count /
+                          vbr_rc_info->show_frame_count;
+  vbr_rc_info->gop_showframe_count = gop_showframe_count;
+  vbr_rc_info->gop_bit_budget = gop_bits;
+}
+
+// Fills q_index_list[0 .. frame_count - 1]: each entry is the q index obtained
+// from base_q_index and the qstep ratio of the corresponding frame.
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+                                  const double *qstep_ratio_list,
+                                  aom_bit_depth_t bit_depth,
+                                  int *q_index_list) {
+  int frame = 0;
+  while (frame < frame_count) {
+    const double ratio = qstep_ratio_list[frame];
+    q_index_list[frame] =
+        av1_get_q_index_from_qstep_ratio(base_q_index, ratio, bit_depth);
+    ++frame;
+  }
+}
+
+// Estimates the bitrate of a GOP coded with base_q_index.
+//
+// q_index_list is (re)computed from base_q_index and qstep_ratio_list. Each
+// frame whose stats are ready gets a Laplace-model rate estimate; frames whose
+// stats are not ready count as 0. The return value is the sum of the per-frame
+// estimates, each multiplied by the scale factor of the frame's update type.
+// When estimated_bitrate_byframe is not NULL it receives the unscaled
+// per-frame estimates.
+double av1_vbr_rc_info_estimate_gop_bitrate(
+    int base_q_index, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe) {
+  av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list,
+                               bit_depth, q_index_list);
+
+  double gop_bits = 0;
+  for (int idx = 0; idx < frame_count; ++idx) {
+    const TplTxfmStats *const stats = &stats_list[idx];
+    const double frame_bitrate =
+        stats->ready ? av1_laplace_estimate_frame_rate(
+                           q_index_list[idx], stats->txfm_block_count,
+                           stats->abs_coeff_mean, stats->coeff_num)
+                     : 0;
+    const FRAME_UPDATE_TYPE update_type = update_type_list[idx];
+    gop_bits += frame_bitrate * update_type_scale_factors[update_type];
+    if (estimated_bitrate_byframe != NULL) {
+      estimated_bitrate_byframe[idx] = frame_bitrate;
+    }
+  }
+  return gop_bits;
+}
+
+// Bisects the base q index range [0, 255] for the value whose estimated GOP
+// bitrate is closest to bit_budget.
+//
+// q_index_list and estimated_bitrate_byframe are overwritten by every probe;
+// on return they hold the values for the returned base q index.
+int av1_vbr_rc_info_estimate_base_q(
+    double bit_budget, aom_bit_depth_t bit_depth,
+    const double *update_type_scale_factors, int frame_count,
+    const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+    const TplTxfmStats *stats_list, int *q_index_list,
+    double *estimated_bitrate_byframe) {
+  int q_lo = 0;    // Minimum q value.
+  int q_hi = 255;  // Maximum q value.
+
+  // Estimates at both ends of the interval (upper end probed first).
+  double rate_at_q_hi = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_hi, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+  double rate_at_q_lo = av1_vbr_rc_info_estimate_gop_bitrate(
+      q_lo, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+
+  // Shrink the interval until its ends are adjacent.
+  while (q_lo + 1 < q_hi) {
+    const int q_mid = (q_hi + q_lo) / 2;
+    const double rate_at_q_mid = av1_vbr_rc_info_estimate_gop_bitrate(
+        q_mid, bit_depth, update_type_scale_factors, frame_count,
+        update_type_list, qstep_ratio_list, stats_list, q_index_list,
+        estimated_bitrate_byframe);
+    if (rate_at_q_mid > bit_budget) {
+      // Over budget at q_mid: raise the lower end.
+      q_lo = q_mid;
+      rate_at_q_lo = rate_at_q_mid;
+    } else {
+      // Within budget at q_mid: lower the upper end.
+      q_hi = q_mid;
+      rate_at_q_hi = rate_at_q_mid;
+    }
+  }
+
+  // Pick the end whose estimate lands closest to the budget; the lower end
+  // wins ties.
+  const int best_q =
+      fabs(rate_at_q_hi - bit_budget) < fabs(rate_at_q_lo - bit_budget) ? q_hi
+                                                                        : q_lo;
+
+  // Probe once more so the output arrays correspond to best_q.
+  av1_vbr_rc_info_estimate_gop_bitrate(
+      best_q, bit_depth, update_type_scale_factors, frame_count,
+      update_type_list, qstep_ratio_list, stats_list, q_index_list,
+      estimated_bitrate_byframe);
+  return best_q;
+}
+// Derives the base q index and the per-frame q indices of the current GOP
+// from the tpl stats: the estimated motion vector bits (capped at 60% of the
+// GOP budget) are subtracted from the GOP budget, and the remainder is handed
+// to av1_vbr_rc_info_estimate_base_q().
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const GF_GROUP *gf_group,
+                                    aom_bit_depth_t bit_depth) {
+  vbr_rc_info->q_index_list_ready = 1;
+
+  double mv_bits = 0;
+  for (int idx = 0; idx < gf_group->size; ++idx) {
+    vbr_rc_info->qstep_ratio_list[idx] =
+        av1_tpl_get_qstep_ratio(tpl_data, idx);
+
+    // Frames without ready tpl stats contribute no motion vector bits.
+    if (!av1_tpl_stats_ready(tpl_data, idx)) continue;
+    const TplDepFrame *const frame = &tpl_data->tpl_frame[idx];
+    const double frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+        frame, tpl_data->tpl_stats_block_mis_log2);
+    const FRAME_UPDATE_TYPE update_type = gf_group->update_type[idx];
+    mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[update_type];
+  }
+
+  // Motion vectors may consume at most 60% of the GOP budget.
+  double coeff_bit_budget = vbr_rc_info->gop_bit_budget;
+  mv_bits = AOMMIN(mv_bits, 0.6 * coeff_bit_budget);
+  coeff_bit_budget -= mv_bits;
+
+  vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+      coeff_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size,
+      gf_group->update_type, vbr_rc_info->qstep_ratio_list,
+      tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+}
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Uses the block above and the block to the left as reference MVs.
+//
+// Returns whichever of these has the smallest |row| + |col|: the difference
+// between the current MV and the above MV, the difference between the current
+// MV and the left MV, or the current MV itself. A neighbour difference is only
+// returned when it is strictly smaller than both alternatives, so ties fall
+// back to the current MV. A neighbour that lies outside the frame
+// (row - step < 0 or col - step < 0) is never chosen.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift) {
+  const TplDepStats *const stats_buf = tpl_frame->tpl_stats_ptr;
+
+  const TplDepStats *blk =
+      &stats_buf[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+  const int_mv cur_mv = blk->mv[blk->ref_frame_index[0]];
+  const int cur_cost = abs(cur_mv.as_mv.row) + abs(cur_mv.as_mv.col);
+
+  // Above neighbour. INT_MAX marks "not available".
+  int above_cost = INT_MAX;
+  int_mv above_diff;
+  if (row - step >= 0) {
+    blk = &stats_buf[av1_tpl_ptr_pos(row - step, col, tpl_stride, right_shift)];
+    above_diff = blk->mv[blk->ref_frame_index[0]];
+    above_diff.as_mv.row = cur_mv.as_mv.row - above_diff.as_mv.row;
+    above_diff.as_mv.col = cur_mv.as_mv.col - above_diff.as_mv.col;
+    above_cost = abs(above_diff.as_mv.row) + abs(above_diff.as_mv.col);
+  }
+
+  // Left neighbour. INT_MAX marks "not available".
+  int left_cost = INT_MAX;
+  int_mv left_diff;
+  if (col - step >= 0) {
+    blk = &stats_buf[av1_tpl_ptr_pos(row, col - step, tpl_stride, right_shift)];
+    left_diff = blk->mv[blk->ref_frame_index[0]];
+    left_diff.as_mv.row = cur_mv.as_mv.row - left_diff.as_mv.row;
+    left_diff.as_mv.col = cur_mv.as_mv.col - left_diff.as_mv.col;
+    left_cost = abs(left_diff.as_mv.row) + abs(left_diff.as_mv.col);
+  }
+
+  // Strict comparisons: an unavailable neighbour (INT_MAX) can never win.
+  if (above_cost < left_cost && above_cost < cur_cost) return above_diff;
+  if (left_cost < above_cost && left_cost < cur_cost) return left_diff;
+  return cur_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame.
+ *
+ * For every tpl block the value returned by av1_compute_mv_difference() is
+ * histogrammed separately for its row and column components. Components are
+ * clamped into [0, kMvBins - 1], so all negative values share bin 0. The
+ * result is the sum over both histograms of count * -log2(count / n), where n
+ * is the number of blocks visited.
+ *
+ * Returns 0 when the frame's tpl stats are not valid.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift) {
+  if (!tpl_frame->is_valid) {
+    return 0;
+  }
+
+  enum { kMvBins = 500 };  // number of histogram bins per component
+  int count_row[kMvBins] = { 0 };
+  int count_col[kMvBins] = { 0 };
+  int n = 0;  // number of MVs to process
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << right_shift;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+                                            tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, kMvBins - 1)] += 1;
+      // The column histogram must be indexed by the column component. It was
+      // indexed by the row component, which made rate_col a copy of rate_row.
+      count_col[clamp(mv.as_mv.col, 0, kMvBins - 1)] += 1;
+      n += 1;
+    }
+  }
+
+  // Estimate the bits used using the entropy formula. Empty bins are skipped,
+  // so n == 0 never reaches the division.
+  double rate_row = 0;
+  double rate_col = 0;
+  for (int i = 0; i < kMvBins; i++) {
+    if (count_row[i] != 0) {
+      double p = count_row[i] / (double)n;
+      rate_row += count_row[i] * -log2(p);
+    }
+    if (count_col[i] != 0) {
+      double p = count_col[i] / (double)n;
+      rate_col += count_col[i] * -log2(p);
+    }
+  }
+
+  return rate_row + rate_col;
+}
diff --git a/third_party/aom/av1/encoder/tpl_model.h b/third_party/aom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000000..bcd58216c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.h
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1_SEQ_CODING_TOOLS;
+struct EncodeFrameParams;
+struct EncodeFrameInput;
+struct GF_GROUP;
+struct ThreadData;
+struct TPL_INFO;
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+// Maps the side length of a square tpl block (64, 32, 16, 8 or 4) to the
+// matching BLOCK_SIZE. Any other length trips an assert and, when asserts are
+// compiled out, falls back to BLOCK_16X16.
+static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
+  if (length == 64) return BLOCK_64X64;
+  if (length == 32) return BLOCK_32X32;
+  if (length == 16) return BLOCK_16X16;
+  if (length == 8) return BLOCK_8X8;
+  if (length == 4) return BLOCK_4X4;
+  assert(0 && "Invalid block size for tpl model");
+  return BLOCK_16X16;
+}
+
+typedef struct AV1TplRowMultiThreadSync {
+#if CONFIG_MULTITHREAD
+ // Synchronization objects for top-right dependency.
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // Buffer to store the macroblock whose encoding is complete.
+ // num_finished_cols[i] stores the number of macroblocks which finished
+ // encoding in the ith macroblock row.
+ int *num_finished_cols;
+ // Number of extra macroblocks of the top row to be complete for encoding
+ // of the current macroblock to start. A value of 1 indicates top-right
+ // dependency.
+ int sync_range;
+ // Number of macroblock rows.
+ int rows;
+ // Number of threads processing the current tile.
+ int num_threads_working;
+} AV1TplRowMultiThreadSync;
+
+typedef struct AV1TplRowMultiThreadInfo {
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tpl_mt_exit;
+#if CONFIG_MULTITHREAD
+ // Mutex lock object used for error handling.
+ pthread_mutex_t *mutex_;
+#endif
+ // Row synchronization related function pointers.
+ void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
+ void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
+ int cols);
+} AV1TplRowMultiThreadInfo;
+
+// TODO(jingning): This needs to be cleaned up next.
+
+// TPL stats buffers are prepared for every frame in the GOP,
+// including (internal) overlays and (internal) arfs.
+// In addition, frames in the lookahead that are outside of the GOP
+// are also used.
+// Thus it should use
+// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) =
+// MAX_LAG_BUFFERS + (# overlays)
+// 2 * MAX_LAG_BUFFERS is therefore a safe estimate.
+// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER
+#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS)
+// The first REF_FRAMES + 1 buffers are reserved.
+// tpl_data->tpl_frame starts after REF_FRAMES + 1
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+#define TPL_EPSILON 0.0000001
+
+// Per-frame transform coefficient statistics.
+// When `ready` is set, av1_vbr_rc_info_estimate_gop_bitrate() passes
+// txfm_block_count, abs_coeff_mean and coeff_num to
+// av1_laplace_estimate_frame_rate() to estimate the frame's bitrate.
+typedef struct TplTxfmStats {
+ int ready; // Whether abs_coeff_mean is ready
+ // Presumably the running per-position sum from which abs_coeff_mean is
+ // derived -- TODO confirm against the code that fills it.
+ double abs_coeff_sum[256]; // Assume we are using 16x16 transform block
+ // Mean absolute deviation of the coefficient at each position of the
+ // transform block.
+ double abs_coeff_mean[256];
+ // Number of transform blocks.
+ int txfm_block_count;
+ // Number of coefficients per transform block.
+ int coeff_num;
+} TplTxfmStats;
+
+// Temporary buffers used in tpl mode estimation. They are allocated
+// (32-byte aligned) by tpl_alloc_temp_buffers() and released by
+// tpl_dealloc_temp_buffers(). With tpl_block_pels = tpl_bsize_1d^2:
+typedef struct {
+ // 2 * tpl_block_pels bytes.
+ uint8_t *predictor8;
+ // tpl_block_pels elements.
+ int16_t *src_diff;
+ // tpl_block_pels elements.
+ tran_low_t *coeff;
+ // tpl_block_pels elements.
+ tran_low_t *qcoeff;
+ // tpl_block_pels elements.
+ tran_low_t *dqcoeff;
+} TplBuffers;
+
+typedef struct TplDepStats {
+ int64_t srcrf_sse;
+ int64_t srcrf_dist;
+ int64_t recrf_sse;
+ int64_t recrf_dist;
+ int64_t intra_sse;
+ int64_t intra_dist;
+ int64_t cmp_recrf_dist[2];
+ int64_t mc_dep_rate;
+ int64_t mc_dep_dist;
+ int64_t pred_error[INTER_REFS_PER_FRAME];
+ int32_t intra_cost;
+ int32_t inter_cost;
+ int32_t srcrf_rate;
+ int32_t recrf_rate;
+ int32_t intra_rate;
+ int32_t cmp_recrf_rate[2];
+ int_mv mv[INTER_REFS_PER_FRAME];
+ int8_t ref_frame_index[2];
+} TplDepStats;
+
+// Frame-level tpl information for one frame of a gf group.
+typedef struct TplDepFrame {
+ // Non-zero when the stats of this frame can be used; consumers such as
+ // av1_tpl_compute_frame_mv_entropy() bail out when it is zero.
+ uint8_t is_valid;
+ // Block-level stats of this frame, indexed with
+ // av1_tpl_ptr_pos(mi_row, mi_col, stride, tpl_stats_block_mis_log2).
+ TplDepStats *tpl_stats_ptr;
+ // Presumably the source picture of this frame -- TODO confirm.
+ const YV12_BUFFER_CONFIG *gf_picture;
+ // Presumably the tpl reconstruction of this frame -- TODO confirm.
+ YV12_BUFFER_CONFIG *rec_picture;
+ int ref_map_index[REF_FRAMES];
+ // Stride passed to av1_tpl_ptr_pos() when indexing tpl_stats_ptr.
+ int stride;
+ int width;
+ int height;
+ // Frame dimensions in mi units; they bound the row/col loops over the stats.
+ int mi_rows;
+ int mi_cols;
+ // rdmult used with RDCOST() when combining mc_dep_rate and mc_dep_dist.
+ int base_rdmult;
+ uint32_t frame_display_index;
+ // When set, SAD metric is used for intra and inter mode decision.
+ int use_pred_sad;
+} TplDepFrame;
+
+/*!\endcond */
+/*!
+ * \brief Params related to temporal dependency model.
+ */
+typedef struct TplParams {
+ /*!
+ * Whether the tpl stats are ready.
+ */
+ int ready;
+
+ /*!
+ * Block granularity of tpl score storage.
+ */
+ uint8_t tpl_stats_block_mis_log2;
+
+ /*!
+ * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16.
+ */
+ uint8_t tpl_bsize_1d;
+
+ /*!
+ * Buffer to store the frame level tpl information for each frame in a gf
+ * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+ * group
+ */
+ TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+ /*!
+ * Buffer to store tpl stats at block granularity.
+ * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+ * group.
+ */
+ TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to the buffer which stores tpl transform stats per frame.
+ * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group.
+ * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when
+ * tpl is enabled.
+ */
+ TplTxfmStats *txfm_stats_list;
+
+ /*!
+ * Buffer to store tpl reconstructed frame.
+ * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+ */
+ YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to tpl_stats_buffer.
+ */
+ TplDepFrame *tpl_frame;
+
+ /*!
+ * Scale factors for the current frame.
+ */
+ struct scale_factors sf;
+
+ /*!
+ * GF group index of the current frame.
+ */
+ int frame_idx;
+
+ /*!
+ * Array of pointers to the frame buffers holding the source frame.
+ * src_ref_frame[i] stores the pointer to the source frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Array of pointers to the frame buffers holding the tpl reconstructed frame.
+ * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Parameters related to synchronization for top-right dependency in row based
+ * multi-threading of tpl
+ */
+ AV1TplRowMultiThreadSync tpl_mt_sync;
+
+ /*!
+ * Frame border for tpl frame.
+ */
+ int border_in_pixels;
+
+ /*!
+ * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+ */
+ double r0_adjust_factor;
+} TplParams;
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+#define VBR_RC_INFO_MAX_FRAMES 500
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+
+#if CONFIG_BITRATE_ACCURACY
+
+/*!
+ * \brief This structure stores information needed for bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+ int ready;
+ double total_bit_budget; // The total bit budget of the entire video
+ int show_frame_count; // Number of show frames in the entire video
+
+ int gop_showframe_count; // The number of show frames in the current gop
+ double gop_bit_budget; // The bitbudget for the current gop
+ double scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve the
+ // budget estimation
+ double mv_scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve
+ // MV entropy estimation
+
+ // === Below this line are GOP related data that will be updated per GOP ===
+ int base_q_index; // Stores the base q index.
+ int q_index_list_ready;
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES]; // q indices for the current
+ // GOP
+
+ // Array to store qstep_ratio for each frame in a GOP
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+
+#if CONFIG_THREE_PASS
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_length_list[VBR_RC_INFO_MAX_FRAMES];
+ int cur_gop_idx;
+ int total_frame_count;
+ int gop_count;
+#endif // CONFIG_THREE_PASS
+} VBR_RATECTRL_INFO;
+
+// Clears the per-GOP q index state: zeroes q_index_list and marks it as not
+// ready.
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+  av1_zero(vbr_rc_info->q_index_list);
+  vbr_rc_info->q_index_list_ready = 0;
+}
+
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+ int show_frame_count);
+
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+ int gf_frame_index);
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+ const struct TPL_INFO *tpl_info);
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+ int gop_showframe_count);
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+ const double *qstep_ratio_list,
+ aom_bit_depth_t bit_depth, int *q_index_list);
+
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out] vbr_rc_info Rate control info for BITRATE_ACCURACY
+ * experiment
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_group GOP struct
+ * \param[in] bit_depth bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TplParams *tpl_data,
+ const struct GF_GROUP *gf_group,
+ aom_bit_depth_t bit_depth);
+/*!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in] base_q_index base layer q_index
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in] frame_count size of update_type_list,
+ * qstep_ratio_list stats_list,
+ * q_index_list and
+ * estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per
+ * frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe array to keep track of frame
+ * bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_vbr_rc_info_estimate_gop_bitrate(
+ int base_q_index, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function uses a binary search to find base layer q index to
+ * achieve the specified bit budget.
+ *
+ * \param[in] bit_budget target bit budget
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in] frame_count size of update_type_list, qstep_ratio_list
+ * stats_list, q_index_list and
+ * estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe Array to keep track of frame
+ * bitrate
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_vbr_rc_info_estimate_base_q(
+ double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+// Per-frame command kind read by av1_read_rd_command(). The kind determines
+// which extra values follow the option in the command file.
+typedef enum {
+ // No extra value follows.
+ RD_OPTION_NONE,
+ // A q index follows.
+ RD_OPTION_SET_Q,
+ // A q index and an rdmult follow.
+ RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+// Command list filled by av1_read_rd_command(). Entry i of each array belongs
+// to the i-th record of the command file.
+typedef struct RD_COMMAND {
+ // Command kind of each record.
+ RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ // Written only for RD_OPTION_SET_Q and RD_OPTION_SET_Q_RDMULT records.
+ int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ // Written only for RD_OPTION_SET_Q_RDMULT records.
+ int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ // Number of records, taken from the first value in the file.
+ int frame_count;
+ // Set to 0 by av1_read_rd_command(); presumably the cursor of the next
+ // record to consume -- TODO confirm against the callers.
+ int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif // CONFIG_RD_COMMAND
+
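+/*!\brief Allocate buffers used by tpl model
+ *
+ * \param[in,out] ppi             Top-level encode/decode structure. The tpl
+ *                                data structure (tpl_data) that receives the
+ *                                buffers is a member of it.
+ * \param[in]     mi_params       Mode info parameters (presumably the source
+ *                                of the mi_rows / mi_cols dimensions; confirm
+ *                                against the definition)
+ * \param[in]     width           Frame width
+ * \param[in]     height          Frame height
+ * \param[in]     byte_alignment  Byte alignment (presumably for the allocated
+ *                                frame buffers; confirm against the
+ *                                definition)
+ * \param[in]     lag_in_frames   number of lookahead frames
+ */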
+
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames);
+
+// Releases every buffer owned by *tpl_tmp_buffers and resets the pointers to
+// NULL.
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+  TplBuffers *const bufs = tpl_tmp_buffers;
+
+  aom_free(bufs->predictor8);
+  aom_free(bufs->src_diff);
+  aom_free(bufs->coeff);
+  aom_free(bufs->qcoeff);
+  aom_free(bufs->dqcoeff);
+
+  bufs->predictor8 = NULL;
+  bufs->src_diff = NULL;
+  bufs->coeff = NULL;
+  bufs->qcoeff = NULL;
+  bufs->dqcoeff = NULL;
+}
+
+// Allocates the temporary buffers used in mode estimation for tpl blocks of
+// tpl_bsize_1d x tpl_bsize_1d pixels, each 32-byte aligned.
+//
+// Returns true on success. If any allocation fails, every buffer is released
+// again (all pointers end up NULL) and false is returned.
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+                                              uint8_t tpl_bsize_1d) {
+  TplBuffers *const bufs = tpl_tmp_buffers;
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+  // predictor8 gets room for twice the number of pixels.
+  bufs->predictor8 = (uint8_t *)aom_memalign(
+      32, tpl_block_pels * 2 * sizeof(*bufs->predictor8));
+  bufs->src_diff =
+      (int16_t *)aom_memalign(32, tpl_block_pels * sizeof(*bufs->src_diff));
+  bufs->coeff =
+      (tran_low_t *)aom_memalign(32, tpl_block_pels * sizeof(*bufs->coeff));
+  bufs->qcoeff =
+      (tran_low_t *)aom_memalign(32, tpl_block_pels * sizeof(*bufs->qcoeff));
+  bufs->dqcoeff =
+      (tran_low_t *)aom_memalign(32, tpl_block_pels * sizeof(*bufs->dqcoeff));
+
+  const bool all_allocated = bufs->predictor8 && bufs->src_diff &&
+                             bufs->coeff && bufs->qcoeff && bufs->dqcoeff;
+  if (all_allocated) return true;
+
+  // Partial failure: release whatever was obtained.
+  tpl_dealloc_temp_buffers(bufs);
+  return false;
+}
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] gop_eval Flag if it is in the GOP length decision stage
+ * \param[in] frame_params Per frame encoding parameters
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+ const struct EncodeFrameParams *const frame_params);
+
+/*!\cond */
+
+void av1_tpl_preload_rc_estimate(
+ struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
+
+void av1_init_tpl_stats(TplParams *const tpl_data);
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE sb_size, int mi_row, int mi_col);
+
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+/*!\brief Compute the entropy of an exponential probability distribution
+ * function (pdf) subjected to uniform quantization.
+ *
+ * pdf(x) = b*exp(-b*x)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size
+ * \param[in] b parameter of exponential distribution
+ *
+ * \return entropy cost
+ */
+double av1_exponential_entropy(double q_step, double b);
+
+/*!\brief Compute the entropy of a Laplace probability distribution
+ * function (pdf) subjected to non-uniform quantization.
+ *
+ * pdf(x) = 0.5*b*exp(-0.5*b*|x|)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size for non-zero bins
+ * \param[in] b parameter of Laplace distribution
+ * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ *
+ * \return entropy cost
+ */
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio);
+
+/*!\brief Compute the frame rate using transform block stats
+ *
+ * Assume each position i in the transform block is of Laplace distribution
+ * with mean absolute deviation abs_coeff_mean[i]
+ *
+ * Then we can use av1_laplace_entropy() to compute the expected frame
+ * rate.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] block_count number of transform blocks
+ * \param[in] abs_coeff_mean array of mean absolute deviation
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return expected frame rate
+ */
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+ const double *abs_coeff_mean,
+ int coeff_num);
+
+/*
+ *!\brief Init TplTxfmStats
+ *
+ * \param[in] tpl_txfm_stats a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+#if CONFIG_BITRATE_ACCURACY
+/*
+ *!\brief Accumulate TplTxfmStats
+ *
+ * \param[in] sub_stats a structure for storing sub transform stats
+ * \param[out] accumulated_stats a structure for storing accumulated
+ *transform stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+ TplTxfmStats *accumulated_stats);
+
+/*
+ *!\brief Record a transform block into TplTxfmStats
+ *
+ * \param[in,out] tpl_txfm_stats A structure for storing transform stats
+ * \param[in] coeff An array of transform coefficients. Its size
+ * should be equal to tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+ const tran_low_t *coeff);
+
+/*
+ *!\brief Update abs_coeff_mean and ready of txfm_stats
+ * If txfm_block_count > 0, this function will use abs_coeff_sum and
+ * txfm_block_count to compute abs_coeff_mean. Moreover, the ready flag
+ * will be set to one.
+ *
+ * \param[in] txfm_stats A structure for storing transform stats
+ */
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats);
+#endif // CONFIG_BITRATE_ACCURACY
+
+/*!\brief Estimate coefficient entropy using Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob()
+ *is defined in tpl_model_test.cc
+ *
+ * \param[in] q_step quantizer step size without any scaling
+ * \param[in] b mean absolute deviation of Laplace distribution
+ * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ * \param[in] qcoeff quantized coefficient
+ *
+ * \return estimated coefficient entropy
+ *
+ */
+double av1_estimate_coeff_entropy(double q_step, double b,
+ double zero_bin_ratio, int qcoeff);
+
+/*!\brief Estimate entropy of a transform block using Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] abs_coeff_mean array of mean absolute deviations
+ * \param[in] qcoeff_arr array of quantized coefficients
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+ const double *abs_coeff_mean,
+ int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+ int64_t srcrf_dist, int pix_num);
+
+/*!\brief Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
+ *
+ * \param[in] row_a row position of the first block
+ * \param[in] col_a column position of the first block
+ * \param[in] row_b row position of the second block
+ * \param[in] col_b column position of the second block
+ * \param[in] width width shared by the two blocks
+ * \param[in] height height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+ int height);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in] bit_depth bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+ int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the frame importance from TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return frame_importance
+ */
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+ int gf_frame_index);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on
+ * TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in] qstep_ratio step ratio between target q index and leaf q index
+ * \param[in] bit_depth bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+ aom_bit_depth_t bit_depth);
+
+/*!\brief Improve the motion vector estimation by taking neighbors into
+ * account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in] tpl_frame Tpl frame struct
+ * \param[in] row Current row
+ * \param[in] col Current column
+ * \param[in] step Step parameter for av1_tpl_ptr_pos
+ * \param[in] tpl_stride Stride parameter for av1_tpl_ptr_pos
+ * \param[in] right_shift Right shift parameter for
+ * av1_tpl_ptr_pos
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+ int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in] tpl_frame TPL frame struct
+ * \param[in] right_shift right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+ uint8_t right_shift);
+
+#if CONFIG_RATECTRL_LOG
+// Rate control debug log, only compiled when CONFIG_RATECTRL_LOG is enabled.
+// Every *_list array holds VBR_RC_INFO_MAX_FRAMES entries and is indexed by
+// the coding index passed to the rc_log_*() helpers below.
+typedef struct {
+ // Number of entries printed by rc_log_show().
+ int coding_frame_count;
+ // Base q index recorded by rc_log_record_chunk_info().
+ int base_q_index;
+
+ // Encode decision
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES];
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Frame stats
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Estimated encode results
+ // Filled from av1_laplace_estimate_frame_rate() when the matching
+ // txfm_stats_list entry is ready, otherwise 0.
+ double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Actual encode results
+ double act_rate_list[VBR_RC_INFO_MAX_FRAMES];
+ double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+} RATECTRL_LOG;
+
+// Resets the whole log structure with av1_zero().
+static INLINE void rc_log_init(RATECTRL_LOG *rc_log) {
+  av1_zero(*rc_log);
+}
+
+// Stores a copy of *txfm_stats in the log slot of frame `coding_index`.
+// No range check is performed on `coding_index`.
+static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index,
+                                      const TplTxfmStats *txfm_stats) {
+  TplTxfmStats *const slot = &rc_log->txfm_stats_list[coding_index];
+  *slot = *txfm_stats;
+}
+
+// Records the encode decision (q step ratio, q index, update type) of frame
+// `coding_index` and derives its estimated coefficient rate. The estimate is
+// taken from the transform stats previously logged for the same frame and
+// stays at 0 when those stats are not flagged as ready.
+static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log,
+                                             int coding_index,
+                                             double qstep_ratio, int q_index,
+                                             FRAME_UPDATE_TYPE update_type) {
+  rc_log->qstep_ratio_list[coding_index] = qstep_ratio;
+  rc_log->q_index_list[coding_index] = q_index;
+  rc_log->update_type_list[coding_index] = update_type;
+
+  const TplTxfmStats *const stats = &rc_log->txfm_stats_list[coding_index];
+  double est_coeff_rate = 0;
+  if (stats->ready) {
+    est_coeff_rate = av1_laplace_estimate_frame_rate(
+        q_index, stats->txfm_block_count, stats->abs_coeff_mean,
+        stats->coeff_num);
+  }
+  rc_log->est_coeff_rate_list[coding_index] = est_coeff_rate;
+}
+
+// Records the actual total rate and actual coefficient rate of frame
+// `coding_index`.
+static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index,
+                                        double act_rate,
+                                        double act_coeff_rate) {
+  double *const rate_slot = &rc_log->act_rate_list[coding_index];
+  double *const coeff_rate_slot = &rc_log->act_coeff_rate_list[coding_index];
+  *rate_slot = act_rate;
+  *coeff_rate_slot = act_coeff_rate;
+}
+
+// Records the chunk-level values: the base q index and the number of coding
+// frames that rc_log_show() will later print.
+static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log,
+                                            int base_q_index,
+                                            int coding_frame_count) {
+  rc_log->coding_frame_count = coding_frame_count;
+  rc_log->base_q_index = base_q_index;
+}
+
+// Prints the log to stdout: a chunk header, the frame count, then one line
+// per logged coding frame with its decision, estimated and actual rates.
+static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) {
+  const int frame_count = rc_log->coding_frame_count;
+
+  printf("= chunk 1\n");
+  printf("coding_frame_count %d base_q_index %d\n", frame_count,
+         rc_log->base_q_index);
+  printf("= frame %d\n", frame_count);
+
+  int coding_idx = 0;
+  while (coding_idx < frame_count) {
+    printf(
+        "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f "
+        "act_coeff_rate %f act_rate %f\n",
+        coding_idx, rc_log->update_type_list[coding_idx],
+        rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx],
+        rc_log->est_coeff_rate_list[coding_idx],
+        rc_log->act_coeff_rate_list[coding_idx],
+        rc_log->act_rate_list[coding_idx]);
+    ++coding_idx;
+  }
+}
+#endif // CONFIG_RATECTRL_LOG
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TPL_MODEL_H_
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.c b/third_party/aom/av1/encoder/tune_butteraugli.c
new file mode 100644
index 0000000000..92fc4b2a92
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+// Divisor applied to the frame width and height when allocating and filling
+// the reduced-size source/reconstruction buffers used in this file.
+static const int resize_factor = 2;
+
+// Fills cpi->butteraugli_info.rdmult_scaling_factors with one weight per
+// block of the (resized) frame.
+//
+// For every block the Butteraugli distance (12-norm of the per-pixel diffmap)
+// and the mean squared error over the Y, U and V planes are computed between
+// `source` and `recon`. The raw weight is min(mse / butteraugli, 5.0) + K.
+// Blocks whose Butteraugli distance or MSE is below a small epsilon are
+// marked with -1 and end up with a neutral weight of 1.0. All other weights
+// are divided by their geometric mean, and every weight is finally clamped
+// to [0.4, 2.5].
+//
+// The sample loops read the planes through uint8_t pointers.
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *source,
+                                              const YV12_BUFFER_CONFIG *recon,
+                                              const double K) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const aom_color_range_t color_range =
+      seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  float *diffmap;
+  // Compute the byte count in size_t so that large frames cannot overflow int.
+  CHECK_MEM_ERROR(cm, diffmap,
+                  aom_malloc((size_t)width * height * sizeof(*diffmap)));
+  if (!aom_calc_butteraugli(source, recon, bit_depth,
+                            seq_params->matrix_coefficients, color_range,
+                            diffmap)) {
+    // Release the map before reporting: the error handler may not return to
+    // this frame, which would otherwise leak the allocation. If it does
+    // return, bail out instead of reading a freed/invalid map.
+    aom_free(diffmap);
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to calculate Butteraugli distances.");
+    return;
+  }
+
+  // Block geometry is expressed for the resized frame, hence the division by
+  // resize_factor.
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+  const int num_cols =
+      (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+  const int num_rows =
+      (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+  const int block_w = num_mi_w << 2;
+  const int block_h = num_mi_h << 2;
+  double log_sum = 0.0;
+  double blk_count = 0.0;
+
+  // Loop through each block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int y_start = row * block_h;
+      const int x_start = col * block_w;
+      float dbutteraugli = 0.0f;
+      float dmse = 0.0f;
+      float px_count = 0.0f;
+
+      // Loop through each luma pixel of the block (clipped to the frame).
+      for (int y = y_start; y < y_start + block_h && y < height; y++) {
+        for (int x = x_start; x < x_start + block_w && x < width; x++) {
+          dbutteraugli += powf(diffmap[(size_t)y * width + x], 12.0f);
+          float px_diff = source->y_buffer[y * source->y_stride + x] -
+                          recon->y_buffer[y * recon->y_stride + x];
+          dmse += px_diff * px_diff;
+          px_count += 1.0f;
+        }
+      }
+      // Accumulate the chroma error over the co-located subsampled area.
+      const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+                               (height + ss_y) >> ss_y);
+      for (int y = y_start >> ss_y; y < y_end; y++) {
+        const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+                                 (width + ss_x) >> ss_x);
+        for (int x = x_start >> ss_x; x < x_end; x++) {
+          const int src_px_index = y * source->uv_stride + x;
+          const int recon_px_index = y * recon->uv_stride + x;
+          const float px_diff_u = (float)(source->u_buffer[src_px_index] -
+                                          recon->u_buffer[recon_px_index]);
+          const float px_diff_v = (float)(source->v_buffer[src_px_index] -
+                                          recon->v_buffer[recon_px_index]);
+          dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+          px_count += 2.0f;
+        }
+      }
+
+      dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+      dmse = dmse / px_count;
+      const float eps = 0.01f;
+      double weight;
+      if (dbutteraugli < eps || dmse < eps) {
+        // Sentinel: excluded from the geometric mean, replaced by 1.0 below.
+        weight = -1.0;
+      } else {
+        blk_count += 1.0;
+        weight = dmse / dbutteraugli;
+        weight = AOMMIN(weight, 5.0);
+        weight += K;
+        log_sum += log(weight);
+      }
+      cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
+    }
+  }
+  // Geometric average of the weights. When no block contributed this is not
+  // a meaningful number, but it is then never used: all weights are
+  // sentinels.
+  log_sum = exp(log_sum / blk_count);
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index];
+      if (*weight <= 0.0) {
+        *weight = 1.0;
+      } else {
+        *weight /= log_sum;
+      }
+      *weight = AOMMIN(*weight, 2.5);
+      *weight = AOMMAX(*weight, 0.4);
+    }
+  }
+
+  aom_free(diffmap);
+}
+
+// Scales *rdmult for the block of size `bsize` at (mi_row, mi_col) by the
+// geometric mean of the Butteraugli scaling factors of all
+// butteraugli_rdo_bsize blocks it covers, then refreshes x->errorperbit.
+// Does nothing until the scaling factors have been computed (recon_set).
+void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                int *rdmult) {
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI);
+  if (!cpi->butteraugli_info.recon_set) {
+    return;
+  }
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize];
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // Rows are measured in units of the block height and columns in units of
+  // the block width. (The two are equal for the square block size used
+  // today, but must not be swapped if that ever changes.)
+  const int row_start = mi_row / num_mi_h;
+  const int col_start = mi_col / num_mi_w;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  for (int row = row_start; row < num_rows && row < row_start + num_brows;
+       ++row) {
+    for (int col = col_start; col < num_cols && col < col_start + num_bcols;
+         ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale +=
+          log(cpi->butteraugli_info.rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  // Round to nearest and keep the multiplier non-negative.
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
+
+// Copies `h` rows of `w` bytes each from `src` to `dst`. Consecutive rows are
+// `src_stride` / `dst_stride` bytes apart; bytes between `w` and the stride
+// are left untouched.
+static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
+                       int dst_stride, int w, int h) {
+  const uint8_t *src_row = src;
+  uint8_t *dst_row = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    memcpy(dst_row, src_row, w);
+    src_row += src_stride;
+    dst_row += dst_stride;
+  }
+}
+
+// Copies the top-left `width` x `height` luma area of `src` into `dst`,
+// together with the matching chroma area (dimensions rounded up according to
+// the chroma subsampling of `src`). Planes are copied byte-wise.
+static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     int width, int height) {
+  const int ss_x = src->subsampling_x;
+  const int ss_y = src->subsampling_y;
+  const int chroma_w = (width + ss_x) >> ss_x;
+  const int chroma_h = (height + ss_y) >> ss_y;
+
+  copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
+             height);
+  copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+             chroma_w, chroma_h);
+  copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+             chroma_w, chroma_h);
+}
+
+// Clears `h` rows of a plane, writing `dst_stride` zero bytes per row (the
+// full stride, not only the visible width).
+static void zero_plane(uint8_t *dst, int dst_stride, int h) {
+  uint8_t *row_ptr = dst;
+  for (int remaining = h; remaining > 0; --remaining) {
+    memset(row_ptr, 0, dst_stride);
+    row_ptr += dst_stride;
+  }
+}
+
+// Zeroes the Y, U and V planes of `dst` over their full stride and height.
+static void zero_img(YV12_BUFFER_CONFIG *dst) {
+  const int chroma_stride = dst->uv_stride;
+  const int chroma_rows = dst->uv_height;
+
+  zero_plane(dst->y_buffer, dst->y_stride, dst->y_height);
+  zero_plane(dst->u_buffer, chroma_stride, chroma_rows);
+  zero_plane(dst->v_buffer, chroma_stride, chroma_rows);
+}
+
+// Prepares the source frame for the reduced-resolution Butteraugli pass:
+//  1. backs up cpi->source into butteraugli_info.source,
+//  2. builds a downscaled copy (1 / resize_factor per dimension) in
+//     butteraugli_info.resized_source,
+//  3. clears cpi->source and writes the downscaled picture into its top-left
+//     corner.
+// Both helper buffers are allocated lazily on first use. Allocation failures
+// are reported through aom_internal_error().
+void av1_setup_butteraugli_source(AV1_COMP *cpi) {
+  YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source;
+  AV1_COMMON *const cm = &cpi->common;
+  const int width = cpi->source->y_crop_width;
+  const int height = cpi->source->y_crop_height;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
+  if (dst->buffer_alloc_sz == 0) {
+    // A non-zero return means the buffer could not be allocated; writing to
+    // it below would dereference invalid plane pointers.
+    if (aom_alloc_frame_buffer(
+            dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+            cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) {
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate Butteraugli source buffer");
+      return;  // Only reached if the error handler returns.
+    }
+  }
+  av1_copy_and_extend_frame(cpi->source, dst);
+
+  YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source;
+  if (resized_dst->buffer_alloc_sz == 0) {
+    if (aom_alloc_frame_buffer(
+            resized_dst, width / resize_factor, height / resize_factor, ss_x,
+            ss_y, cm->seq_params->use_highbitdepth,
+            cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) {
+      aom_internal_error(
+          cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to allocate Butteraugli resized source buffer");
+      return;  // Only reached if the error handler returns.
+    }
+  }
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers during resize");
+  }
+
+  // Replace the encoder input with the downscaled picture (rest is zero).
+  zero_img(cpi->source);
+  copy_img(resized_dst, cpi->source, width / resize_factor,
+           height / resize_factor);
+}
+
+// Restores cpi->source from the backup taken by
+// av1_setup_butteraugli_source(), then derives the rdmult scaling factors by
+// comparing the resized source against the top-left (1 / resize_factor)
+// region of the current reconstructed frame. `K` is forwarded to the weight
+// computation. Sets butteraugli_info.recon_set on success.
+void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
+  av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source);
+  AV1_COMMON *const cm = &cpi->common;
+  const int width = cpi->source->y_crop_width;
+  const int height = cpi->source->y_crop_height;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
+
+  // Temporary buffer holding the reduced-size reconstruction.
+  YV12_BUFFER_CONFIG resized_recon;
+  memset(&resized_recon, 0, sizeof(resized_recon));
+  // A non-zero return means allocation failed; copying into the buffer would
+  // then write through invalid plane pointers.
+  if (aom_alloc_frame_buffer(
+          &resized_recon, width / resize_factor, height / resize_factor, ss_x,
+          ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+          cm->features.byte_alignment, 0, 0)) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate Butteraugli resized recon buffer");
+    return;  // Only reached if the error handler returns.
+  }
+  copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+           height / resize_factor);
+
+  set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source,
+                                    &resized_recon, K);
+  cpi->butteraugli_info.recon_set = true;
+  aom_free_frame_buffer(&resized_recon);
+}
+
+// Runs a preliminary encode of the current frame at a fixed q index on the
+// downscaled source, then uses the resulting reconstruction to compute the
+// Butteraugli rdmult scaling factors and restores the original source.
+// The partition-search speed features are saved before the pass and put back
+// afterwards.
+void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ // Fixed quantizer index used for this preliminary pass.
+ const int q_index = 96;
+
+ // Setup necessary params for encoding, including frame source, etc.
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+ av1_set_frame_size(cpi, cm->superres_upscaled_width,
+ cm->superres_upscaled_height);
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ // Back up cpi->source and replace it with the downscaled picture.
+ av1_setup_butteraugli_source(cpi);
+ av1_setup_frame(cpi);
+
+ // Segmentation: reuse the previous frame's data when it is not being
+ // updated, otherwise recompute it; clear it when segmentation is disabled.
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ // Saved so they can be restored at the end of this function.
+ const PARTITION_SEARCH_TYPE partition_search_type =
+ cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size;
+ // Enable a quicker pass by uncommenting the following lines:
+ // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_index, 0);
+ av1_encode_frame(cpi);
+
+ // Compute the scaling factors (K = 0.3) and restore the original source.
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3);
+ cpi->sf.part_sf.partition_search_type = partition_search_type;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_size;
+}
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.h b/third_party/aom/av1/encoder/tune_butteraugli.h
new file mode 100644
index 0000000000..bae5d2a882
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+// Encoder state used when tuning for Butteraugli.
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for Butteraugli.
+ // rdmult_scaling_factors[row * num_cols + col] stores the scaling factor for
+ // the butteraugli_rdo_bsize block at (row, col), where num_cols is the
+ // number of such blocks per frame row.
+ double *rdmult_scaling_factors;
+ // source: backup of the original source frame.
+ // resized_source: downscaled copy of the source used for the comparison.
+ YV12_BUFFER_CONFIG source, resized_source;
+ // True once rdmult_scaling_factors has been computed from a reconstruction.
+ bool recon_set;
+} TuneButteraugliInfo;
+
+struct AV1_COMP;
+static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16;
+
+void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *rdmult);
+
+void av1_setup_butteraugli_source(struct AV1_COMP *cpi);
+
+// 'K' is used to balance the rate-distortion distribution between PSNR
+// and Butteraugli.
+void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi,
+ double K);
+
+void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c
new file mode 100644
index 0000000000..4e5ffa387c
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.c
@@ -0,0 +1,1112 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/tune_vmaf.h"
+
+#include "aom_dsp/psnr.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rdopt.h"
+#include "config/aom_scale_rtcd.h"
+
+static const double kBaselineVmaf = 97.42773;
+
+// Returns array[layer] if it is non-negative; otherwise walks down towards
+// index 0 until a non-negative entry is found. If the entry finally selected
+// is still negative, 0.0 is returned instead.
+static double get_layer_value(const double *array, int layer) {
+  int idx = layer;
+  for (; idx > 0; --idx) {
+    if (!(array[idx] < 0.0)) break;
+  }
+  return AOMMAX(array[idx], 0.0);
+}
+
+// Runs a full-pixel NSTEP motion search on the luma plane for the block of
+// size `block_size` located at block coordinates (mb_row, mb_col), matching
+// `src` against `ref`. *ref_mv is passed as the starting vector and is also
+// handed to av1_full_pixel_search() as its output argument, so it presumably
+// holds the best vector on return (confirm against av1_full_pixel_search).
+// The luma src/pre buffers of cpi->td.mb are temporarily redirected and
+// restored before returning.
+static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, FULLPEL_MV *ref_mv) {
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int y_stride = src->y_stride;
+ assert(y_stride == ref->y_stride);
+ // Offset of the block's top-left sample; valid for both frames because the
+ // strides are asserted equal.
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCK *const mb = &cpi->td.mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+ const int step_param =
+ av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height));
+
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = src->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+
+ // Unused intermediate results for motion search.
+ int cost_list[5];
+
+ // Do motion search.
+ // Only do full search on the entire block.
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, *ref_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+ NULL);
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+// Returns the variance between the luma block of `src` at block coordinates
+// (mb_row, mb_col) and the block of `ref` displaced from the same position by
+// the full-pixel vector `ref_mv`. The variance function of `block_size` also
+// writes the sum of squared errors to *sse.
+static unsigned int residual_variance(const AV1_COMP *cpi,
+                                      const YV12_BUFFER_CONFIG *src,
+                                      const YV12_BUFFER_CONFIG *ref,
+                                      const BLOCK_SIZE block_size,
+                                      const int mb_row, const int mb_col,
+                                      FULLPEL_MV ref_mv, unsigned int *sse) {
+  const int blk_h = block_size_high[block_size];
+  const int blk_w = block_size_wide[block_size];
+  const int y_stride = src->y_stride;
+  assert(y_stride == ref->y_stride);
+  const int y_offset = mb_row * blk_h * y_stride + mb_col * blk_w;
+  const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+
+  const uint8_t *const ref_blk = ref->y_buffer + y_offset + mv_offset;
+  const uint8_t *const src_blk = src->y_buffer + y_offset;
+  return cpi->ppi->fn_ptr[block_size].vf(ref_blk, y_stride, src_blk, y_stride,
+                                         sse);
+}
+
+// Returns the mean per-pixel variance of the luma plane, averaged over all
+// complete 64x64 blocks of `frame` (partial blocks at the right and bottom
+// edges are ignored).
+// NOTE(review): when the frame is smaller than 64 samples in either
+// dimension no block is visited and the result is 0.0 / 0.0.
+static double frame_average_variance(const AV1_COMP *const cpi,
+                                     const YV12_BUFFER_CONFIG *const frame) {
+  const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const BLOCK_SIZE block_size = BLOCK_64X64;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_block_rows = frame->y_height / block_h;
+  const int num_block_cols = frame->y_width / block_w;
+  const int y_stride = frame->y_stride;
+  const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH;
+  double var_sum = 0.0;
+  double block_count = 0.0;
+
+  // Visit each complete block.
+  for (int row = 0; row < num_block_rows; ++row) {
+    for (int col = 0; col < num_block_cols; ++col) {
+      struct buf_2d buf;
+      buf.buf = (uint8_t *)frame->y_buffer + (row * block_h) * y_stride +
+                (col * block_w);
+      buf.stride = y_stride;
+
+      var_sum += av1_get_perpixel_variance(cpi, xd, &buf, block_size,
+                                           AOM_PLANE_Y, use_hbd);
+      block_count += 1.0;
+    }
+  }
+  return var_sum / block_count;
+}
+
+// Returns the average motion-compensated residual variance of `src` against
+// `ref`, computed per 16x16 luma block.
+//
+// If `ref` is NULL the plain frame variance of `src` is returned instead.
+// If `mvs` is non-NULL it must hold one full-pixel vector per block (row
+// major, mb_rows * mb_cols entries) and those vectors are used as-is.
+// If `mvs` is NULL a temporary vector array is allocated, filled by motion
+// search, and released before returning.
+static double residual_frame_average_variance(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *src,
+                                              const YV12_BUFFER_CONFIG *ref,
+                                              FULLPEL_MV *mvs) {
+  if (ref == NULL) return frame_average_variance(cpi, src);
+  const BLOCK_SIZE block_size = BLOCK_16X16;
+  const int frame_height = src->y_height;
+  const int frame_width = src->y_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+  const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  bool do_motion_search = false;
+  if (mvs == NULL) {
+    do_motion_search = true;
+    CHECK_MEM_ERROR(&cpi->common, mvs,
+                    (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs)));
+  }
+
+  // 64-bit accumulator: the sum of per-block variances over a large frame
+  // can exceed the range of unsigned int.
+  uint64_t variance = 0;
+  // Accumulate the residual variance block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+      if (do_motion_search) {
+        motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+      }
+      unsigned int mv_sse;  // Required by the variance call; not used.
+      const unsigned int blk_var = residual_variance(
+          cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+      variance += blk_var;
+    }
+  }
+
+  // Restore input state
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+
+  // The vector array allocated above is private to this call (`mvs` is passed
+  // by value, so the caller never sees it); release it to avoid a leak.
+  if (do_motion_search) aom_free(mvs);
+
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
+// TODO(sdeng): Add the SIMD implementation.
+// Unsharp mask for 16-bit samples over a w x h rectangle:
+//   dst = clamp(round(source + amount * (source - blurred)), 0, max_value)
+// where max_value is (1 << bit_depth) - 1. Strides are in samples.
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+                                           int source_stride,
+                                           const uint16_t *blurred,
+                                           int blurred_stride, uint16_t *dst,
+                                           int dst_stride, int w, int h,
+                                           double amount, int bit_depth) {
+  const int max_value = (1 << bit_depth) - 1;
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const double px = (double)source[col];
+      const double detail = px - (double)blurred[col];
+      const double sharpened = px + amount * detail;
+      dst[col] = (uint16_t)clamp((int)(sharpened + 0.5), 0, max_value);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+// Unsharp mask for 8-bit samples over a w x h rectangle:
+//   dst = clamp(round(source + amount * (source - blurred)), 0, 255)
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const double px = (double)source[col];
+      const double detail = px - (double)blurred[col];
+      const double sharpened = px + amount * detail;
+      dst[col] = (uint8_t)clamp((int)(sharpened + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+// Applies the unsharp mask to the luma plane only, writing into `dst`. The
+// 16-bit or 8-bit routine is chosen from the sequence's use_highbitdepth
+// flag; the processed area is source->y_width x source->y_height.
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *blurred,
+                               const YV12_BUFFER_CONFIG *dst, double amount) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  if (!cpi->common.seq_params->use_highbitdepth) {
+    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+                 blurred->y_stride, dst->y_buffer, dst->y_stride,
+                 source->y_width, source->y_height, amount);
+    return;
+  }
+
+  assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+  assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+  assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
+  highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                      CONVERT_TO_SHORTPTR(blurred->y_buffer),
+                      blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+                      dst->y_stride, source->y_width, source->y_height,
+                      amount, bit_depth);
+}
+
+// 8-tap Gaussian convolution kernel with sigma = 1.0. The taps sum to 128 and
+// every coefficient must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0,  8, 30, 52,
+                                                               30, 8, 0,  0 };
+
+// Blurs the luma plane of |source| into the luma plane of |dst| by running the
+// separable gauss_filter kernel (same kernel horizontally and vertically)
+// through the 2D single-reference convolve functions, one 128x128 tile at a
+// time. Tiles are always processed at full size, including the ones that
+// overlap the right and bottom frame edges.
+static AOM_INLINE void gaussian_blur(const int bit_depth,
+                                     const YV12_BUFFER_CONFIG *source,
+                                     const YV12_BUFFER_CONFIG *dst) {
+  const int block_size = BLOCK_128X128;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
+
+  ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
+  InterpFilterParams filter = { .filter_ptr = gauss_filter,
+                                .taps = 8,
+                                .interp_filter = EIGHTTAP_REGULAR };
+
+  for (int tile_row = 0; tile_row < num_rows; ++tile_row) {
+    const int row_offset_y = tile_row * block_h;
+    for (int tile_col = 0; tile_col < num_cols; ++tile_col) {
+      const int col_offset_y = tile_col * block_w;
+
+      uint8_t *const src_buf =
+          source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+      uint8_t *const dst_buf =
+          dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
+
+      if (!(source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+        av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
+                           block_w, block_h, &filter, &filter, 0, 0,
+                           &conv_params);
+        continue;
+      }
+      av1_highbd_convolve_2d_sr(
+          CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
+          CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+          &filter, &filter, 0, 0, &conv_params, bit_depth);
+    }
+  }
+}
+
+// Scores |sharpened| against |source| with VMAF and returns the gain over
+// kBaselineVmaf, weighted by source_variance / variance(sharpened) so that
+// sharpening which inflates the frame variance is penalised.
+static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi,
+                                         double source_variance,
+                                         YV12_BUFFER_CONFIG *const source,
+                                         YV12_BUFFER_CONFIG *const sharpened) {
+  const bool use_neg_model =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+
+  double vmaf_score;
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth,
+                use_neg_model, &vmaf_score);
+
+  const double sharpened_variance = frame_average_variance(cpi, sharpened);
+  const double variance_ratio = source_variance / sharpened_variance;
+  return variance_ratio * (vmaf_score - kBaselineVmaf);
+}
+
+// Hill-climbs the unsharp amount. Starting at |unsharp_amount_start|, the
+// amount is moved by |step_size| (which may be negative) for as long as the
+// approximate VMAF keeps improving, for at most |max_loop_count| evaluations.
+// The search also stops as soon as the next candidate would fall outside
+// [0.0, max_amount]. |best_vmaf| is the score at the starting amount.
+// |sharpened| is scratch space and is overwritten on every evaluation.
+// Returns the selected amount, clamped to [0.0, max_amount].
+static double find_best_frame_unsharp_amount_loop(
+ const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
+ double best_vmaf, const double baseline_variance,
+ const double unsharp_amount_start, const double step_size,
+ const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_vmaf = best_vmaf;
+ double unsharp_amount = unsharp_amount_start;
+ do {
+ best_vmaf = approx_vmaf;
+ unsharp_amount += step_size;
+ // Out-of-range candidate: leave with approx_vmaf == best_vmaf so the step
+ // is undone below.
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, source, blurred, sharpened, unsharp_amount);
+ approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+
+ loop_count++;
+ } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+ // Keep the last step only if it improved the score; otherwise step back.
+ unsharp_amount =
+ approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+// Searches for the unsharp amount that maximises the approximate VMAF of
+// |source| sharpened against |blurred|.
+//
+// When |unsharp_amount_start| <= |step_size| the search simply climbs upward
+// from 0.0. Otherwise the scores at (start - step) and at start are compared:
+// if they differ by less than 0.01 the lower amount is returned directly,
+// else the search continues in the direction of the better score.
+// A temporary frame buffer is allocated for the sharpened image and released
+// before returning.
+// NOTE(review): the return value of aom_alloc_frame_buffer() is not checked.
+static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred,
+ const double unsharp_amount_start,
+ const double step_size,
+ const int max_loop_count,
+ const double max_filter_amount) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ YV12_BUFFER_CONFIG sharpened;
+ memset(&sharpened, 0, sizeof(sharpened));
+ aom_alloc_frame_buffer(
+ &sharpened, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ const double baseline_variance = frame_average_variance(cpi, source);
+ double unsharp_amount;
+ if (unsharp_amount_start <= step_size) {
+ // No room to probe below the start: climb upward from zero.
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
+ step_size, max_loop_count, max_filter_amount);
+ } else {
+ // Probe one step below the start and the start itself to pick a direction.
+ double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
+ double v0, v1;
+ unsharp(cpi, source, blurred, &sharpened, a0);
+ v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ unsharp(cpi, source, blurred, &sharpened, a1);
+ v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ if (fabs(v0 - v1) < 0.01) {
+ // Scores are effectively equal: settle for the smaller amount.
+ unsharp_amount = a0;
+ } else if (v0 > v1) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
+ -step_size, max_loop_count, max_filter_amount);
+ } else {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
+ step_size, max_loop_count, max_filter_amount);
+ }
+ }
+
+ aom_free_frame_buffer(&sharpened);
+ return unsharp_amount;
+}
+
+// Sharpens the luma plane of |source| in place, using the unsharp amount that
+// get_layer_value() yields from vmaf_info.last_frame_unsharp_amount for the
+// current frame's layer depth. Returns without touching |source| when that
+// amount is <= 0.
+// The blur is computed directly from |source|; no border-extended copy is
+// made here.
+// NOTE(review): the return value of aom_alloc_frame_buffer() is not checked.
+void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Layer depths beyond the table size share the last slot.
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double best_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ if (best_frame_unsharp_amount <= 0.0) return;
+
+ YV12_BUFFER_CONFIG blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, source, &blurred);
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+// Frame-level VMAF preprocessing: searches for the unsharp amount that gives
+// the best approximate VMAF for |source| (step 0.05, at most 20 iterations,
+// amount capped at 1.01), stores it in vmaf_info.last_frame_unsharp_amount for
+// the current layer depth, and sharpens the luma plane of |source| in place.
+// The search is seeded with the value get_layer_value() yields for this layer.
+// NOTE(review): the return values of aom_alloc_frame_buffer() are not checked.
+void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&source_extended, 0, sizeof(source_extended));
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &source_extended, width, height, source->subsampling_x,
+ source->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ // Blur a border-extended copy of the source; the copy is only needed for
+ // the blur and is released right after.
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Layer depths beyond the table size share the last slot.
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+// Block-level VMAF preprocessing. A frame-level unsharp amount is searched
+// first (and stored for the current layer depth). It then seeds a short
+// per-block search (step 0.1, at most 3 iterations, amount capped at 1.5) on
+// every 64x64 luma block. Finally each block of |source| is sharpened in place
+// with its own amount.
+// NOTE(review): the return values of aom_alloc_frame_buffer() are not checked.
+void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ memset(&source_extended, 0, sizeof(source_extended));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ // Blur a border-extended copy of the source; the copy is only needed for
+ // the blur and is released right after.
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Layer depths beyond the table size share the last slot.
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ const int block_size = BLOCK_64X64;
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ const int num_cols = (source->y_width + block_w - 1) / block_w;
+ const int num_rows = (source->y_height + block_h - 1) / block_h;
+ // One unsharp amount per block, stored in raster order.
+ double *best_unsharp_amounts =
+ aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts));
+ if (!best_unsharp_amounts) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating vmaf data");
+ }
+
+ // Block-sized scratch frames that hold a copy of the current block of the
+ // source and of the blurred frame for the per-block search.
+ YV12_BUFFER_CONFIG source_block, blurred_block;
+ memset(&source_block, 0, sizeof(source_block));
+ memset(&blurred_block, 0, sizeof(blurred_block));
+ aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ // Visible part of the block; smaller than block_w/block_h at the right
+ // and bottom frame edges.
+ const int block_width = AOMMIN(width - col_offset_y, block_w);
+ const int block_height = AOMMIN(height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride +
+ col_offset_y;
+ uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride +
+ col_offset_y;
+ uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
+ uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
+
+ // Copy block from source frame.
+ // Samples outside the visible part of the block are zero-filled.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ } else {
+ uint8_t *frame_src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *frame_blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ uint8_t *blurred_dst = blurred_block.y_buffer;
+ uint8_t *src_dst = source_block.y_buffer;
+
+ // Copy block from source frame.
+ // Samples outside the visible part of the block are zero-filled.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ }
+
+ // Short local search seeded with the frame-level amount.
+ best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
+ cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3,
+ 1.5);
+ }
+ }
+
+ // Apply best blur amounts
+ // Only the visible part of each block is sharpened, in place in |source|.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+ const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride + col_offset_y;
+ uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride + col_offset_y;
+ highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
+ blurred.y_stride, src_buf, source->y_stride,
+ block_width, block_height,
+ best_unsharp_amounts[index], bit_depth);
+ } else {
+ uint8_t *src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+ src_buf, source->y_stride, block_width, block_height,
+ best_unsharp_amounts[index]);
+ }
+ }
+ }
+
+ aom_free_frame_buffer(&source_block);
+ aom_free_frame_buffer(&blurred_block);
+ aom_free_frame_buffer(&blurred);
+ aom_free(best_unsharp_amounts);
+}
+
+// Computes the per-block rdmult scaling factors used when tuning for VMAF and
+// stores them in cpi->vmaf_info.rdmult_scaling_factors (raster order).
+//
+// The source is downscaled by a factor of 2 and split into 32x32 blocks. For
+// each block, the luma SSE between the downscaled source and its
+// Gaussian-blurred version is measured, and a picture pair is queued in the
+// VMAF context in which only that block of the reconstruction is replaced by
+// the blurred version. After flushing the context, the ratio between the
+// block's MSE and its VMAF drop is mapped through a fitted model to obtain
+// the block's weight.
+// NOTE(review): the return values of aom_alloc_frame_buffer() are not checked.
+void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int y_width = cpi->source->y_width;
+  const int y_height = cpi->source->y_height;
+  const int resized_block_size = BLOCK_32X32;
+  const int resize_factor = 2;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int ss_x = cpi->source->subsampling_x;
+  const int ss_y = cpi->source->subsampling_y;
+
+  YV12_BUFFER_CONFIG resized_source;
+  memset(&resized_source, 0, sizeof(resized_source));
+  aom_alloc_frame_buffer(
+      &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
+      ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment, 0, 0);
+  if (!av1_resize_and_extend_frame_nonnormative(
+          cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating buffers during resize");
+  }
+
+  const int resized_y_width = resized_source.y_width;
+  const int resized_y_height = resized_source.y_height;
+  const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+  const int resized_block_h = mi_size_high[resized_block_size] * 4;
+  const int num_cols =
+      (resized_y_width + resized_block_w - 1) / resized_block_w;
+  const int num_rows =
+      (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x,
+                         ss_y, cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0, 0);
+  gaussian_blur(bit_depth, &resized_source, &blurred);
+
+  // |recon| starts as a copy of the downscaled source; one block at a time is
+  // temporarily swapped for its blurred version below.
+  YV12_BUFFER_CONFIG recon;
+  memset(&recon, 0, sizeof(recon));
+  aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y,
+                         cm->seq_params->use_highbitdepth,
+                         cpi->oxcf.border_in_pixels,
+                         cm->features.byte_alignment, 0, 0);
+  aom_yv12_copy_frame(&resized_source, &recon, 1);
+
+  VmafContext *vmaf_context;
+  const bool cal_vmaf_neg =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg);
+  unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses));
+  if (!sses) {
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Error allocating vmaf data");
+  }
+
+  // Loop through each 'block_size' block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int row_offset_y = row * resized_block_h;
+      const int col_offset_y = col * resized_block_w;
+
+      uint8_t *const orig_buf = resized_source.y_buffer +
+                                row_offset_y * resized_source.y_stride +
+                                col_offset_y;
+      uint8_t *const blurred_buf =
+          blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+
+      // The variance function also reports the block SSE through its last
+      // argument; only that SSE is used here.
+      cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                              blurred_buf, blurred.y_stride,
+                                              &sses[index]);
+
+      uint8_t *const recon_buf =
+          recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
+      // Set recon buf
+      // (unsharp with amount 0.0 is a plain copy of the blurred block).
+      if (cpi->common.seq_params->use_highbitdepth) {
+        highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+                            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+                            resized_block_w, resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
+                     blurred.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+
+      aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth,
+                          index);
+
+      // Restore recon buf
+      // (copy the original block back so the next picture differs in one
+      // block only).
+      if (cpi->common.seq_params->use_highbitdepth) {
+        highbd_unsharp_rect(
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+            CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+            resized_block_h, 0.0, bit_depth);
+      } else {
+        unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
+                     resized_source.y_stride, recon_buf, recon.y_stride,
+                     resized_block_w, resized_block_h, 0.0);
+      }
+    }
+  }
+  aom_flush_vmaf_context(vmaf_context);
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const double vmaf = aom_calc_vmaf_at_index(
+          vmaf_context, cpi->vmaf_info.vmaf_model, index);
+      const double dvmaf = kBaselineVmaf - vmaf;
+
+      const double mse =
+          (double)sses[index] / (double)(resized_y_width * resized_y_height);
+      double weight;
+      const double eps = 0.01 / (num_rows * num_cols);
+      // Fall back to a neutral ratio when either term is too small for the
+      // quotient to be meaningful.
+      if (dvmaf < eps || mse < eps) {
+        weight = 1.0;
+      } else {
+        weight = mse / dvmaf;
+      }
+
+      // Normalize it with a data fitted model.
+      weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
+      cpi->vmaf_info.rdmult_scaling_factors[index] = weight;
+    }
+  }
+
+  aom_free_frame_buffer(&resized_source);
+  aom_free_frame_buffer(&blurred);
+  // |recon| was allocated above and must be released as well; it used to be
+  // leaked on every call.
+  aom_free_frame_buffer(&recon);
+  aom_close_vmaf_context(vmaf_context);
+  aom_free(sses);
+}
+
+// Scales |*rdmult| by the geometric mean of the VMAF rdmult scaling factors of
+// all 64x64 grid cells covered by the block of size |bsize| at
+// (mi_row, mi_col), rounds and clamps the result to be non-negative, and
+// updates x->errorperbit to match.
+//
+// If the block covers no grid cell, |*rdmult| is not scaled (it is still
+// clamped), instead of being derived from an undefined 0/0 mean.
+void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                         const BLOCK_SIZE bsize, const int mi_row,
+                         const int mi_col, int *const rdmult) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_64X64;
+  const int num_mi_w = mi_size_wide[bsize_base];
+  const int num_mi_h = mi_size_high[bsize_base];
+  const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+  const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+  const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // Rows are measured in units of the base block height and columns in units
+  // of its width.
+  const int row_start = mi_row / num_mi_h;
+  const int col_start = mi_col / num_mi_w;
+  double num_of_mi = 0.0;
+  double log_sum_of_scale = 0.0;
+
+  for (int row = row_start; row < num_rows && row < row_start + num_brows;
+       ++row) {
+    for (int col = col_start; col < num_cols && col < col_start + num_bcols;
+         ++col) {
+      const int index = row * num_cols + col;
+      log_sum_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+
+  if (num_of_mi > 0.0) {
+    // Geometric mean = exp(mean of logs).
+    const double geom_mean_of_scale = exp(log_sum_of_scale / num_of_mi);
+    *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+  }
+  *rdmult = AOMMAX(*rdmult, 0);
+  av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
+
+// TODO(sdeng): replace them with the SIMD versions.
+// Returns the mean absolute difference between two w x h planes of 16-bit
+// samples, i.e. the sum of |src - ref| divided by (h * w).
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int w, int h) {
+  double abs_diff_sum = 0.0;
+
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const double src_px = src[y * src_stride + x];
+      const double ref_px = ref[y * ref_stride + x];
+      abs_diff_sum += fabs(src_px - ref_px);
+    }
+  }
+
+  return abs_diff_sum / (double)(h * w);
+}
+
+// Returns the mean absolute difference between two w x h planes of 8-bit
+// samples, i.e. the sum of |src - ref| divided by (h * w).
+static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride, int w,
+                                     int h) {
+  double abs_diff_sum = 0.0;
+
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const double src_px = src[y * src_stride + x];
+      const double ref_px = ref[y * ref_stride + x];
+      abs_diff_sum += fabs(src_px - ref_px);
+    }
+  }
+
+  return abs_diff_sum / (double)(h * w);
+}
+
+// Computes a motion score for |cur|: the mean absolute luma difference between
+// the Gaussian-blurred current frame and the blurred |last| frame and, when
+// |next| is non-NULL, the blurred |next| frame; the smaller of the two is
+// returned. For high bit-depth input the scores are scaled down by
+// 1 << (bit_depth - 8).
+// NOTE(review): the return values of aom_alloc_frame_buffer() are not checked.
+static double calc_vmaf_motion_score(const AV1_COMP *const cpi,
+ const AV1_COMMON *const cm,
+ const YV12_BUFFER_CONFIG *const cur,
+ const YV12_BUFFER_CONFIG *const last,
+ const YV12_BUFFER_CONFIG *const next) {
+ const int y_width = cur->y_width;
+ const int y_height = cur->y_height;
+ YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cur->subsampling_x;
+ const int ss_y = cur->subsampling_y;
+
+ memset(&blurred_cur, 0, sizeof(blurred_cur));
+ memset(&blurred_last, 0, sizeof(blurred_last));
+ memset(&blurred_next, 0, sizeof(blurred_next));
+
+ aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ // Allocated even when |next| is NULL; it is simply left unused in that case.
+ aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, cur, &blurred_cur);
+ gaussian_blur(bit_depth, last, &blurred_last);
+ if (next) gaussian_blur(bit_depth, next, &blurred_next);
+
+ // motion2 starts at a large sentinel so that motion1 wins the final minimum
+ // when there is no next frame.
+ double motion1, motion2 = 65536.0;
+ if (cm->seq_params->use_highbitdepth) {
+ assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
+ const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+ motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
+ blurred_last.y_stride, y_width, y_height) *
+ scale_factor;
+ if (next) {
+ assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
+ motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
+ blurred_next.y_stride, y_width, y_height) *
+ scale_factor;
+ }
+ } else {
+ motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_last.y_buffer, blurred_last.y_stride, y_width,
+ y_height);
+ if (next) {
+ motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_next.y_buffer, blurred_next.y_stride,
+ y_width, y_height);
+ }
+ }
+
+ aom_free_frame_buffer(&blurred_cur);
+ aom_free_frame_buffer(&blurred_last);
+ aom_free_frame_buffer(&blurred_next);
+
+ return AOMMIN(motion1, motion2);
+}
+
+// Fetches the source frames adjacent to the frame being encoded.
+//
+// The current frame's lookahead index is 0 for a shown frame and the ARF
+// source offset otherwise. |*next| receives the image at index + 1. |*last|
+// receives cpi->last_source for a shown frame and the image at index - 1
+// otherwise. Either output is set to NULL when the corresponding lookahead
+// entry does not exist, so callers can test the pointers directly.
+static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi,
+                                           YV12_BUFFER_CONFIG **last,
+                                           YV12_BUFFER_CONFIG **next) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  const int src_index =
+      cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index];
+  struct lookahead_entry *last_entry = av1_lookahead_peek(
+      cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage);
+  struct lookahead_entry *next_entry = av1_lookahead_peek(
+      cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage);
+
+  // Do not form &entry->img from a NULL entry: that only yields NULL if img
+  // happens to be the first member, and is not valid C either way.
+  *next = next_entry != NULL ? &next_entry->img : NULL;
+  if (cm->show_frame) {
+    *last = cpi->last_source;
+  } else {
+    *last = last_entry != NULL ? &last_entry->img : NULL;
+  }
+}
+
+// Calculates the new qindex from the VMAF motion score. This is based on the
+// observation: when the motion score becomes higher, the VMAF score of the
+// same source and distorted frames would become higher.
+//
+// Returns |current_qindex| unchanged for the first frame, for pass 1, and when
+// the luma SSE or VMAF drop recorded for this layer is below its threshold.
+// Otherwise returns |current_qindex| plus a delta-q offset, clamped to
+// [MINQ, MAXQ].
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
+ return current_qindex;
+ }
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Layer depths beyond the table size share the last slot.
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_ysse =
+ get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth);
+ const double last_frame_vmaf =
+ get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth);
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ // Bring the SSE back to an 8-bit scale: the error scales with the square of
+ // 1 << (bit_depth - 8).
+ const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) *
+ (1 << (bit_depth - 8)));
+ const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf;
+ const double sse_threshold =
+ 0.01 * cpi->source->y_width * cpi->source->y_height;
+ const double vmaf_threshold = 0.01;
+ if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
+ return current_qindex;
+ }
+ YV12_BUFFER_CONFIG *cur_buf = cpi->source;
+ // For a frame that is not shown, take the current image from the lookahead
+ // at the ARF source offset instead of cpi->source.
+ if (cm->show_frame == 0) {
+ const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ struct lookahead_entry *cur_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index, cpi->compressor_stage);
+ cur_buf = &cur_entry->img;
+ }
+ assert(cur_buf);
+
+ YV12_BUFFER_CONFIG *next_buf, *last_buf;
+ get_neighbor_frames(cpi, &last_buf, &next_buf);
+ assert(last_buf);
+
+ const double motion =
+ calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf);
+
+ // Get dVMAF through a data fitted model.
+ const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));
+ const double dsse = dvmaf * approx_sse / approx_dvmaf;
+
+ // Clamping beta to address VQ issue (aomedia:3170).
+ const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5);
+ const int offset =
+ av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta);
+ int qindex = current_qindex + offset;
+
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ return qindex;
+}
+
+// Scores |recon_sharpened| against |src| with VMAF and returns the gain over
+// |src_score|, weighted by src_variance / new_variance.
+static AOM_INLINE double cal_approx_score(
+    AV1_COMP *const cpi, double src_variance, double new_variance,
+    double src_score, YV12_BUFFER_CONFIG *const src,
+    YV12_BUFFER_CONFIG *const recon_sharpened) {
+  const bool use_neg_model =
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+  double vmaf_score;
+  aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth,
+                use_neg_model, &vmaf_score);
+
+  const double variance_ratio = src_variance / new_variance;
+  return variance_ratio * (vmaf_score - src_score);
+}
+
+// Hill-climbing search used for the VMAF NEG tuning mode. Starting at
+// |unsharp_amount_start|, the amount is moved by |step_size| (which may be
+// negative) while the approximate score keeps improving, for at most
+// |max_loop_count| evaluations, and stops as soon as the next candidate would
+// leave [0.0, max_amount]. At every step both |recon| and |src| are sharpened
+// by the candidate amount; the score combines the VMAF of the sharpened recon
+// with the residual variance of the sharpened source against |ref|.
+// |src_sharpened| and |recon_sharpened| are scratch and are overwritten.
+// Returns the selected amount, clamped to [0.0, max_amount].
+static double find_best_frame_unsharp_amount_loop_neg(
+ AV1_COMP *const cpi, double src_variance, double base_score,
+ YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon,
+ YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred,
+ YV12_BUFFER_CONFIG *const recon_blurred,
+ YV12_BUFFER_CONFIG *const src_sharpened,
+ YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs,
+ double best_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_score = best_score;
+ double unsharp_amount = unsharp_amount_start;
+
+ do {
+ best_score = approx_score;
+ unsharp_amount += step_size;
+ // Out-of-range candidate: leave with approx_score == best_score so the step
+ // is undone below.
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount);
+ unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount);
+ const double new_variance =
+ residual_frame_average_variance(cpi, src_sharpened, ref, mvs);
+ approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score,
+ src, recon_sharpened);
+
+ loop_count++;
+ } while (approx_score > best_score && loop_count < max_loop_count);
+ // Keep the last step only if it improved the score; otherwise step back.
+ unsharp_amount =
+ approx_score > best_score ? unsharp_amount : unsharp_amount - step_size;
+
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+// Finds the unsharp amount for the VMAF NEG tuning mode. The scores at
+// |unsharp_amount_start| and at (start + step) are compared, then the search
+// continues upward from (start + step) if the latter is better, or downward
+// from the start otherwise. Four temporary frame buffers (blurred and
+// sharpened versions of |src| and |recon|) are allocated and released here.
+// NOTE(review): |mvs| stays NULL for the whole function and is passed as such
+// to residual_frame_average_variance(); presumably that helper handles a NULL
+// motion-vector array itself - confirm against its definition.
+// NOTE(review): the return values of aom_alloc_frame_buffer() are not checked.
+static double find_best_frame_unsharp_amount_neg(
+ AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref,
+ double base_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count,
+ const double max_filter_amount) {
+ FULLPEL_MV *mvs = NULL;
+ const double src_variance =
+ residual_frame_average_variance(cpi, src, ref, mvs);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = recon->y_width;
+ const int height = recon->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = recon->subsampling_x;
+ const int ss_y = recon->subsampling_y;
+
+ YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened;
+ memset(&recon_sharpened, 0, sizeof(recon_sharpened));
+ memset(&src_sharpened, 0, sizeof(src_sharpened));
+ memset(&recon_blurred, 0, sizeof(recon_blurred));
+ memset(&src_blurred, 0, sizeof(src_blurred));
+ aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, recon, &recon_blurred);
+ gaussian_blur(bit_depth, src, &src_blurred);
+
+ // Score at the starting amount.
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start);
+ const double variance_start =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_start = cal_approx_score(
+ cpi, src_variance, variance_start, base_score, src, &recon_sharpened);
+
+ // Score one step above the starting amount.
+ const double unsharp_amount_next = unsharp_amount_start + step_size;
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next);
+ const double variance_next =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_next = cal_approx_score(cpi, src_variance, variance_next,
+ base_score, src, &recon_sharpened);
+
+ double unsharp_amount;
+ if (score_next > score_start) {
+ // Sharpening more helps: keep climbing from the higher amount.
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next,
+ unsharp_amount_next, step_size, max_loop_count, max_filter_amount);
+ } else {
+ // Otherwise search downward from the starting amount.
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start,
+ unsharp_amount_start, -step_size, max_loop_count, max_filter_amount);
+ }
+
+ aom_free_frame_buffer(&recon_sharpened);
+ aom_free_frame_buffer(&src_sharpened);
+ aom_free_frame_buffer(&recon_blurred);
+ aom_free_frame_buffer(&src_blurred);
+ aom_free(mvs);
+ return unsharp_amount;
+}
+
+// Records per-layer statistics after a frame has been encoded: the VMAF score
+// and the luma SSE between cpi->source and the reconstructed frame are stored
+// in vmaf_info for the current layer depth. When tuning for VMAF NEG max gain,
+// the unsharp amount for this layer is additionally re-estimated (step 0.025,
+// at most 5 iterations, amount capped at 1.01), using the previous neighbor
+// frame as reference.
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+ YV12_BUFFER_CONFIG *source = cpi->source;
+ YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ // Layer depths beyond the table size share the last slot.
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ double base_score;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
+ cal_vmaf_neg, &base_score);
+ cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_highbd_get_y_sse(source, recon);
+ } else {
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_get_y_sse(source, recon);
+ }
+
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ // Only |last| is used below; |next| is fetched but not needed here.
+ YV12_BUFFER_CONFIG *last, *next;
+ get_neighbor_frames(cpi, &last, &next);
+ double best_unsharp_amount_start =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+ const int max_loop_count = 5;
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score,
+ best_unsharp_amount_start, 0.025,
+ max_loop_count, 1.01);
+ }
+}
diff --git a/third_party/aom/av1/encoder/tune_vmaf.h b/third_party/aom/av1/encoder/tune_vmaf.h
new file mode 100644
index 0000000000..a04a29e6fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
+#define AOM_AV1_ENCODER_TUNE_VMAF_H_
+
+#include "aom_dsp/vmaf.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for VMAF.
+ // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+ // 64x64 block at (row, col).
+ double *rdmult_scaling_factors;
+
+ // Stores the luma SSE of the last frame, indexed by layer depth.
+ double last_frame_ysse[MAX_ARF_LAYERS];
+
+ // Stores the VMAF score of the last frame, indexed by layer depth.
+ double last_frame_vmaf[MAX_ARF_LAYERS];
+
+ // Stores the unsharp filter strength of the last frame, per layer depth.
+ double last_frame_unsharp_amount[MAX_ARF_LAYERS];
+
+ // Stores the original qindex before scaling.
+ int original_qindex;
+
+ // VMAF model used in VMAF calculations.
+ VmafModel *vmaf_model;
+} TuneVMAFInfo;
+
+struct AV1_COMP;
+
+void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi);
+
+void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult);
+
+int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex);
+
+void av1_update_vmaf_curve(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 0000000000..aab5e1398d
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,3422 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Contains the details of the ML models used for pruning transform size. This
+ * file is only included by av1/encoder/tx_search.c.
+ */
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+/***************************CONFIG_NN_V2 (New)********************************/
+#if CONFIG_NN_V2
+// Tx type model for 4x4 block.
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer1_weights, // weights
+ av1_tx_type_nn_4x4_hor_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_4x4_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer1_weights, // weights
+ av1_tx_type_nn_4x4_ver_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_4x4_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_4x8_hor_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_4x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_4x8_ver_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_4x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x4_hor_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x4_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x4_ver_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x4_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x16_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // layer 1: num_inputs (same as num_outputs of layer 0)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer1_bias, // bias
+ NONE, // activation
+ av1_tx_type_nn_8x16_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_out, // logits (same buffer as layer 1 output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { // 8 -> 16 -> 4 network; the "16x8 transform" entry of av1_tx_type_nnconfig_map_hor
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer0_weights, // weights (128 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x8_hor_layer0_bias, // bias (16 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_16x8_hor_layer0_out, // output (static 16-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_weights, // weights (64 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x8_hor_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_16x8_hor_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { // 8 -> 16 -> 4 network; the "16x8 transform" entry of av1_tx_type_nnconfig_map_ver
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer0_weights, // weights (128 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x8_ver_layer0_bias, // bias (16 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_16x8_ver_layer0_out, // output (static 16-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_weights, // weights (64 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x8_ver_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_16x8_ver_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static float av1_tx_type_nn_16x16_layer0_weights[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_bias[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_weights[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_bias[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { // 8 -> 16 -> 4 network; the "16x16 transform" entry of both av1_tx_type_nnconfig_map_hor and av1_tx_type_nnconfig_map_ver
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x16_layer0_weights, // weights (128 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x16_layer0_bias, // bias (16 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_16x16_layer0_out, // output (static 16-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_weights, // weights (64 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x16_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_16x16_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { // 4 -> 8 -> 4 network; the "4x16 transform" entry of av1_tx_type_nnconfig_map_hor
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer0_weights, // weights (32 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_4x16_hor_layer0_bias, // bias (8 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_4x16_hor_layer0_out, // output (static 8-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_weights, // weights (32 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_4x16_hor_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_4x16_hor_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { // 8 -> 16 -> 4 network; the "4x16 transform" entry of av1_tx_type_nnconfig_map_ver
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer0_weights, // weights (128 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_4x16_ver_layer0_bias, // bias (16 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_4x16_ver_layer0_out, // output (static 16-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_weights, // weights (64 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_4x16_ver_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_4x16_ver_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { // 8 -> 16 -> 4 network; the "16x4 transform" entry of av1_tx_type_nnconfig_map_hor
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer0_weights, // weights (128 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x4_hor_layer0_bias, // bias (16 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_16x4_hor_layer0_out, // output (static 16-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_weights, // weights (64 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x4_hor_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_16x4_hor_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { // 4 -> 8 -> 4 network; the "16x4 transform" entry of av1_tx_type_nnconfig_map_ver
+ 1, // num_hidden_layers
+ {
+ // fc layer setting (positional initializers; the trailing comments name each field)
+ {
+ // layer 0 (the single hidden layer)
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer0_weights, // weights (32 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x4_ver_layer0_bias, // bias (8 floats, matching num_outputs)
+ RELU, // activation
+ av1_tx_type_nn_16x4_ver_layer0_out, // output (static 8-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ { // layer 1 (produces the logits)
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_weights, // weights (32 floats = num_inputs * num_outputs)
+ av1_tx_type_nn_16x4_ver_layer1_bias, // bias (4 floats, matching num_outputs)
+ NONE, // activation
+ av1_tx_type_nn_16x4_ver_layer1_out, // output (static 4-float buffer)
+ NULL, // the next three fields are explicitly NULL
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_out, // logits (must be the same buffer as the last layer's output)
+ SOFTMAX_CROSS_ENTROPY, // NOTE(review): presumably the loss type; confirm against the NN_CONFIG_V2 declaration
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction; a NULL entry means no model is defined for that size.
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { // "hor" models, one slot per transform size in the order listed below
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform (same config as in av1_tx_type_nnconfig_map_ver)
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { // "ver" counterpart of av1_tx_type_nnconfig_map_hor: same entry order, NULL where no model is defined
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform (same config as in av1_tx_type_nnconfig_map_hor)
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+#else
+/******************************CONFIG_NN***************************************/
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { // "hor" tx type model for 4x4 blocks: 4 inputs -> 8 hidden nodes -> 4 outputs
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8, // node count of the single hidden layer
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0, // weight tables, one per layer: 32 floats (4 * 8) here,
+ av1_tx_type_nn_weights_4x4_hor_layer1 }, // and 32 floats (8 * 4) here
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } // bias tables, one per layer: 8 and 4 floats
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { // "ver" tx type model for 4x4 blocks: 4 inputs -> 8 hidden nodes -> 4 outputs
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8, // node count of the single hidden layer
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0, // weight tables, one per layer: 32 floats (4 * 8) here,
+ av1_tx_type_nn_weights_4x4_ver_layer1 }, // and 32 floats (8 * 4) here
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } // bias tables, one per layer: 8 and 4 floats
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { // "hor" tx type model for 4x8 blocks: 4 inputs -> 8 hidden nodes -> 4 outputs
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8, // node count of the single hidden layer
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_hor_layer0, // weight tables, one per layer: 32 floats (4 * 8) here,
+ av1_tx_type_nn_weights_4x8_hor_layer1 }, // and 32 floats (8 * 4) here
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } // bias tables, one per layer: 8 and 4 floats
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { // "ver" tx type model for 4x8 blocks: 8 inputs -> 16 hidden nodes -> 4 outputs
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // node count of the single hidden layer
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_ver_layer0, // weight tables, one per layer: 128 floats (8 * 16) here,
+ av1_tx_type_nn_weights_4x8_ver_layer1 }, // and 64 floats (16 * 4) here
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } // bias tables, one per layer: 16 and 4 floats
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_type_nn_weights_16x16_layer0,
+ av1_tx_type_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_type_nn_bias_16x16_layer0,
+ av1_tx_type_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+#endif // CONFIG_NN_V2
+
+// Tx split model for 4x8 block.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+ 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f,
+ -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f,
+ 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f,
+ -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f,
+ -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f,
+ 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f,
+ 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+ 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f,
+ 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f,
+ 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f,
+ -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+ 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f,
+ -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+ -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f,
+ 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f,
+ -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f,
+ -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f,
+ 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+ -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f,
+ -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f,
+ 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f,
+ -0.792429f, -0.385862f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+ 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f,
+ -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f,
+ 0.262171f, -1.598153f, -1.427340f, -1.602306f,
+};
+
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+ -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+ -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f,
+ 0.085082f, 0.614986f, 0.847904f, 0.637578f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+ 0.20586078f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x8_layer0,
+ av1_tx_split_nn_weights_4x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x8_layer0,
+ av1_tx_split_nn_bias_4x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x8 block.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+ 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+ -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f,
+ -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+ -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+ 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f,
+ 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+ 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+ -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+ 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f,
+ 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f,
+ 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f,
+ 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f,
+ -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+ -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+ 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f,
+ -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+ 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f,
+ 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f,
+ -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+ -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+ 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f,
+ -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f,
+ -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f,
+ 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+ 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f,
+ 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+ 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+ -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.156294f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 12,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x8_layer0,
+ av1_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x8_layer0,
+ av1_tx_split_nn_bias_8x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+ 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f,
+ 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f,
+ -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f,
+ -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f,
+ -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f,
+ -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f,
+ 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f,
+ 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f,
+ -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+ -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f,
+ -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f,
+ -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f,
+ 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f,
+ 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f,
+ -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+ 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f,
+ 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+ 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f,
+ 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f,
+ -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+ 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f,
+ 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f,
+ 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f,
+ -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f,
+ -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f,
+ 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f,
+ -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f,
+ 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+ 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f,
+ 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f,
+ 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+ 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f,
+ -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f,
+ -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+ 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f,
+ 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f,
+ -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f,
+ -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f,
+ 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f,
+ 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f,
+ 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f,
+ 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+ -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f,
+ -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+ 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+ -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f,
+ 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f,
+ -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+ -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f,
+ 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f,
+ 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f,
+ -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+ 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+ -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+ -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f,
+ 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f,
+ 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f,
+ 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+ -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f,
+ -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+ -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f,
+ 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+ -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f,
+ 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f,
+ -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+ -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f,
+ -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f,
+ -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+ 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f,
+ 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f,
+ 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f,
+ -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+ -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f,
+ -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f,
+ 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f,
+ 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f,
+ 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f,
+ -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f,
+ 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f,
+ -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f,
+ 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f,
+ 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+ 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f,
+ -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f,
+ -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+ -0.408768f, 0.184693f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+ -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f,
+ -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f,
+ 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f,
+ 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f,
+ 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f,
+ 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f,
+ -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f,
+ -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f,
+ 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f,
+ -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f,
+ -0.255844f, -0.078400f, 0.476752f, 0.643001f,
+};
+
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+ -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f,
+ 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f,
+ 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+ -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f,
+ 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f,
+ 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+ 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f,
+ -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f,
+ 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f,
+ -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f,
+ -0.256734f, 0.177370f, 0.213522f, -0.530158f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+ 0.14910713f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x16_layer0,
+ av1_tx_split_nn_weights_8x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x16_layer0,
+ av1_tx_split_nn_bias_8x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+ -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f,
+ 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f,
+ 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+ -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f,
+ 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f,
+ -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f,
+ -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+ -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f,
+ 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f,
+ -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f,
+ 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f,
+ -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f,
+ -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f,
+ 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f,
+ 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+ -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f,
+ -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f,
+ -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+ -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f,
+ -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+ -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f,
+ -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f,
+ 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+ -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f,
+ -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f,
+ -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f,
+ 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f,
+ -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+ 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f,
+ -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f,
+ -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f,
+ 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f,
+ 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+ 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+ -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f,
+ -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f,
+ 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f,
+ 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f,
+ -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+ -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f,
+ 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f,
+ 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f,
+ 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f,
+ -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f,
+ -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f,
+ -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f,
+ 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f,
+ 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+ -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f,
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f,
+ -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f,
+ -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f,
+};
+
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+ -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f,
+ -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f,
+ -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f,
+ -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+ 0.184803f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x16_layer0,
+ av1_tx_split_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x16_layer0,
+ av1_tx_split_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+ -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+ -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f,
+ 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f,
+ -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+ -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f,
+ 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f,
+ -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+ -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f,
+ -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+ 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+ -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f,
+ 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f,
+ -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f,
+ -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f,
+ -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f,
+ 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+ -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f,
+ 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f,
+ -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+ -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f,
+ 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+ -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+ 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f,
+ -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f,
+ 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f,
+ -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+ -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+ -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f,
+ 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+ -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f,
+ 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f,
+ 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f,
+ 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f,
+ 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f,
+ -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f,
+ 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f,
+ 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f,
+ 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f,
+ 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f,
+ 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f,
+ -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f,
+ 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+ 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f,
+ -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f,
+ -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f,
+ -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f,
+ -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f,
+ -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f,
+ -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f,
+ 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f,
+ -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f,
+ -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f,
+ 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+ 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+ -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f,
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f,
+ -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f,
+ 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f,
+ -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f,
+ -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f,
+ 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f,
+ 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f,
+ 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f,
+ 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+ 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f,
+ -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f,
+ -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f,
+ -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f,
+ 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f,
+ 0.254942f, -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+ -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f,
+ -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f,
+ 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f,
+ -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f,
+ 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f,
+ -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+ 0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x32_layer0,
+ av1_tx_split_nn_weights_32x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x32_layer0,
+ av1_tx_split_nn_bias_32x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+ -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+ 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f,
+ 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f,
+ 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f,
+ -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+ -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f,
+ 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f,
+ -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f,
+ -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f,
+ 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f,
+ -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f,
+ 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f,
+ 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f,
+ -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f,
+ -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f,
+ 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f,
+ 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f,
+ 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f,
+ -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f,
+ -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f,
+ 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+ -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f,
+ 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f,
+ 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f,
+ 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f,
+ 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f,
+ 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f,
+ -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f,
+ 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f,
+ 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+ 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f,
+ 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f,
+ 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f,
+ 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f,
+ -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+ -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f,
+ 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+ -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+ -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f,
+ -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f,
+ -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f,
+ -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f,
+ -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f,
+ -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f,
+ 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f,
+ 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f,
+ 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f,
+ -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+ -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f,
+ 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f,
+ 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f,
+ 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f,
+ -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f,
+ -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f,
+ -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f,
+ 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f,
+ -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f,
+ -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f,
+ -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+ -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f,
+ 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f,
+ -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f,
+ 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f,
+ 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+ 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f,
+ -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f,
+ 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+ 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f,
+ -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f,
+ -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+ 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f,
+ -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f,
+ -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f,
+ -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f,
+ 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f,
+ 0.207812f, 0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+// Network description for the 64x64 tx-split model: 12 inputs, a single
+// hidden layer of 32 nodes and 1 output. The layer-0 bias array has 32
+// entries, the layer-1 weight array 32 entries and the layer-1 bias 1 entry.
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_64x64_layer0,
+ av1_tx_split_nn_weights_64x64_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_64x64_layer0,
+ av1_tx_split_nn_bias_64x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+ -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+ -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f,
+ -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f,
+ -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+ -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f,
+ -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+ -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f,
+ 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f,
+ 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f,
+ 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f,
+ 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+ -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f,
+ -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+ -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f,
+ -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f,
+ -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f,
+ -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+ -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f,
+ 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f,
+ 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f,
+ 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f,
+ -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+ 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f,
+ -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+ -1.191704f, -3.800073f, 4.121552f, -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+ -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f,
+ -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f,
+ 0.462109f, 0.343315f, 1.092593f, 0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+ 0.8205083f,
+};
+
+// Network description for the 4x16 tx-split model: 8 inputs, a single hidden
+// layer of 16 nodes and 1 output. Array sizes follow from that: layer-0
+// weights 8 * 16, layer-0 bias 16, layer-1 weights 16, layer-1 bias 1.
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_4x16_layer0,
+ av1_tx_split_nn_weights_4x16_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_4x16_layer0,
+ av1_tx_split_nn_bias_4x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+ 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f,
+ 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f,
+ 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f,
+ 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f,
+ -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f,
+ 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f,
+ -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f,
+ -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f,
+ 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f,
+ -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f,
+ 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+ -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f,
+ -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f,
+ 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f,
+ 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f,
+ 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f,
+ -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f,
+ 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f,
+ 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f,
+ 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f,
+ -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+ -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f,
+ -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+ -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f,
+ -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f,
+ -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f,
+ -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f,
+ 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f,
+ 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f,
+ -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f,
+ 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+ -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f,
+ -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f,
+ 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f,
+ -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f,
+ 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f,
+ 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f,
+ -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f,
+ -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f,
+ 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f,
+ 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f,
+ 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f,
+ -0.129147f, 0.045916f, -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+ 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f,
+ 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f,
+ 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f,
+ 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f,
+ 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f,
+ -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+ 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f,
+ -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f,
+ -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f,
+ -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f,
+ -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f,
+ 0.418904f, 1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+ -0.29233751f,
+};
+
+// Network description for the 16x32 tx-split model: 8 inputs, a single hidden
+// layer of 32 nodes and 1 output. Array sizes follow from that: layer-0
+// weights 8 * 32, layer-0 bias 32, layer-1 weights 32, layer-1 bias 1.
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_16x32_layer0,
+ av1_tx_split_nn_weights_16x32_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_16x32_layer0,
+ av1_tx_split_nn_bias_16x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+ 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f,
+ -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f,
+ 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f,
+ 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f,
+ 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f,
+ 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f,
+ -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f,
+ 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f,
+ 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f,
+ -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f,
+ -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f,
+ 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f,
+ -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f,
+ -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f,
+ 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+ -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f,
+ -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f,
+ -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f,
+ 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f,
+ 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+ 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f,
+ 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+ -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+ 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f,
+ 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f,
+ -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f,
+ -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f,
+ 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f,
+ -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+ -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f,
+ -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f,
+ -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f,
+ -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+ -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f,
+ 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f,
+ 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f,
+ 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f,
+ -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f,
+ 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f,
+ -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f,
+ -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f,
+ 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f,
+ 0.440626f, -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+ 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f,
+ -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f,
+ -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f,
+ 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f,
+ 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f,
+ 0.552712f, 0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+ 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f,
+ 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f,
+ -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f,
+ -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f,
+ 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f,
+ 0.352981f, 0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+ -0.18160765f,
+};
+
+// Network description for the 32x64 tx-split model: 8 inputs, a single hidden
+// layer of 32 nodes and 1 output. Array sizes follow from that: layer-0
+// weights 8 * 32, layer-0 bias 32, layer-1 weights 32, layer-1 bias 1.
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_32x64_layer0,
+ av1_tx_split_nn_weights_32x64_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_32x64_layer0,
+ av1_tx_split_nn_bias_32x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+ -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f,
+ -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f,
+ 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f,
+ 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f,
+ -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f,
+ 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f,
+ 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f,
+ 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f,
+ 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f,
+ 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f,
+ 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f,
+ 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f,
+ 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f,
+ 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f,
+ 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f,
+ 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f,
+ 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f,
+ 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f,
+ -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f,
+ 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f,
+ 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f,
+ -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f,
+ 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f,
+ -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f,
+ 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f,
+ 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f,
+ 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f,
+ 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f,
+ 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f,
+ 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+ -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f,
+ 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+ -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+ -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f,
+ -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f,
+ 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+ 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f,
+ -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f,
+ -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f,
+ 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+ 0.13435879f,
+};
+
+// Network description for the 8x32 tx-split model: 8 inputs, a single hidden
+// layer of 24 nodes and 1 output. Array sizes follow from that: layer-0
+// weights 8 * 24, layer-0 bias 24, layer-1 weights 24, layer-1 bias 1.
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_8x32_layer0,
+ av1_tx_split_nn_weights_8x32_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_8x32_layer0,
+ av1_tx_split_nn_bias_8x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+// Network description for the 16x64 tx-split model: 8 inputs, a single hidden
+// layer of 16 nodes and 1 output. Array sizes follow from that: layer-0
+// weights 8 * 16, layer-0 bias 16, layer-1 weights 16, layer-1 bias 1.
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map block size to its corresponding neural net model for tx split prediction.
+// The table is indexed by transform size; the trailing comment on each row
+// names the size that row serves. TX_4X4 has no model (NULL). Each rectangular
+// size shares one model with its transpose (e.g. TX_4X8 and TX_8X4 both point
+// at the 4x8 model).
+static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+ NULL, // TX_4X4,
+ &av1_tx_split_nnconfig_8x8, // TX_8X8,
+ &av1_tx_split_nnconfig_16x16, // TX_16X16,
+ &av1_tx_split_nnconfig_32x32, // TX_32X32,
+ &av1_tx_split_nnconfig_64x64, // TX_64X64,
+ &av1_tx_split_nnconfig_4x8, // TX_4X8,
+ &av1_tx_split_nnconfig_4x8, // TX_8X4,
+ &av1_tx_split_nnconfig_8x16, // TX_8X16,
+ &av1_tx_split_nnconfig_8x16, // TX_16X8,
+ &av1_tx_split_nnconfig_16x32, // TX_16X32,
+ &av1_tx_split_nnconfig_16x32, // TX_32X16,
+ &av1_tx_split_nnconfig_32x64, // TX_32X64,
+ &av1_tx_split_nnconfig_32x64, // TX_64X32,
+ &av1_tx_split_nnconfig_4x16, // TX_4X16,
+ &av1_tx_split_nnconfig_4x16, // TX_16X4,
+ &av1_tx_split_nnconfig_8x32, // TX_8X32,
+ &av1_tx_split_nnconfig_8x32, // TX_32X8,
+ &av1_tx_split_nnconfig_16x64, // TX_16X64,
+ &av1_tx_split_nnconfig_16x64, // TX_64X16,
+};
+
+#if !CONFIG_REALTIME_ONLY
+#define NUM_INTRA_TX_SPLIT_FEATURES 14
+#define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1
+#define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16
+// Model to prune intra transform depth for intra 8x8 block.
+static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f,
+ 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f,
+ 9.702971f, 14.300809f, 6.018646f, 3.682534f,
+};
+
+static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f,
+ 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f,
+ 8.625048f, 10.456774f, 1.185447f, 1.810423f,
+};
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer0
+ [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f,
+ 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f,
+ 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f,
+ 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f,
+ -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f,
+ -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f,
+ -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f,
+ -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f,
+ -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f,
+ -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f,
+ 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f,
+ -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f,
+ 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f,
+ -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f,
+ 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f,
+ -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f,
+ 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f,
+ -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f,
+ 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f,
+ 0.086606f, 0.853527f, -0.180971f, -1.255744f, -0.152979f, -1.022198f,
+ -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f,
+ -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f,
+ -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f,
+ 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f,
+ -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f,
+ -0.246309f, -0.355575f, -0.048809f, 0.217113f, 0.078385f, 0.720341f,
+ 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f,
+ -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f,
+ 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f,
+ 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f,
+ -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f,
+ 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f,
+ 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f,
+ -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f,
+ -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f,
+ 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f,
+ -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f,
+ -0.054142f, -0.102266f,
+ };
+
+static const float
+ av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f,
+ -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f,
+ 0.434197f, -0.746518f, 0.123085f, -0.549836f,
+ };
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer1
+ [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f,
+ -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f,
+ -0.491619f, -1.482014f, 0.524625f, -0.533590f,
+ };
+
+static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.488888f,
+};
+
+// Network description for the intra 8x8 transform-depth pruning model:
+// NUM_INTRA_TX_SPLIT_FEATURES (14) inputs, NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS (1)
+// hidden layer of NUM_INTRA_TX_SPLIT_HIDDEN_NODES (16) nodes, and 1 output.
+static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = {
+ NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs
+ 1, // num_outputs
+ NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers
+ {
+ NUM_INTRA_TX_SPLIT_HIDDEN_NODES,
+ }, // num_hidden_nodes
+ // Weight arrays, listed layer by layer.
+ {
+ av1_intra_tx_split_nn_weights_8x8_layer0,
+ av1_intra_tx_split_nn_weights_8x8_layer1,
+ },
+ // Bias arrays, listed layer by layer.
+ {
+ av1_intra_tx_split_nn_bias_8x8_layer0,
+ av1_intra_tx_split_nn_bias_8x8_layer1,
+ },
+};
+
+// Lower/upper decision thresholds for the intra 8x8 pruning model, symmetric
+// about zero.
+// NOTE(review): 0.405465 equals ln(1.5), i.e. the logit of probability 0.6;
+// presumably the raw network output is compared in the logit domain - confirm
+// against the code that consumes this table.
+static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f,
+ 0.405465f };
+#endif // !CONFIG_REALTIME_ONLY
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/tx_search.c b/third_party/aom/av1/encoder/tx_search.c
new file mode 100644
index 0000000000..7292c01191
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.c
@@ -0,0 +1,3830 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/common/idct.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/sorting_network.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/txb_rdopt.h"
+
+#define PROB_THRESH_OFFSET_TX_TYPE 100
+
+// Bundle of state for per-block rate-distortion costing.
+// NOTE(review): the code that reads and writes these fields is outside this
+// view; the per-field notes below are inferred from names and types only and
+// should be confirmed against the users of this struct.
+struct rdcost_block_args {
+ const AV1_COMP *cpi; // encoder instance (read-only)
+ MACROBLOCK *x; // macroblock being coded
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; // presumably above entropy contexts
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; // presumably left entropy contexts
+ RD_STATS rd_stats; // presumably accumulated RD statistics
+ int64_t current_rd; // presumably RD cost accumulated so far
+ int64_t best_rd; // presumably best RD cost to beat
+ int exit_early; // presumably set when costing stops early
+ int incomplete_exit; // presumably set when not all blocks were costed
+ FAST_TX_SEARCH_MODE ftxs_mode; // fast transform search mode
+ int skip_trellis; // presumably non-zero to bypass trellis optimisation
+};
+
+// Record describing one transform candidate.
+// NOTE(review): usage is outside this view; field meanings are inferred from
+// names and types - confirm against the search code.
+typedef struct {
+ int64_t rd; // presumably the candidate's RD cost
+ int txb_entropy_ctx; // presumably the entropy context produced by it
+ TX_TYPE tx_type; // transform type of the candidate
+} TxCandidateInfo;
+
+// Quantized-coefficient thresholds consulted by predict_skip_txfm().
+// First index is the bit-depth index (0 for 8-bit, 1 for 10-bit, 2 otherwise),
+// second index is the block size. Each entry is multiplied by the DC or AC
+// quantizer and compared against |coefficient| << 7, hence the scaling below.
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+ {
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ {
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ {
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+};
+
+// lookup table for predict_skip_txfm
+// Indexed by block size; gives the transform size used to tile the block when
+// predicting skip. No entry exceeds 16 samples in either dimension. The table
+// is the precomputed result of:
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+};
+
+// look-up table for sqrt of number of pixels in a transform block
+// rounded up to the nearest integer.
+// Indexed by transform size (TX_SIZES_ALL entries).
+// NOTE(review): the entries for sizes with a 64-sample dimension equal the
+// value obtained when that dimension is counted as 32 (e.g. 32 rather than 64
+// for 64x64) - presumably because only a 32-sample extent carries
+// coefficients; confirm.
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6,
+ 12, 12, 23, 23, 32, 32, 8,
+ 8, 16, 16, 23, 23 };
+
+// Builds a 32-bit lookup key for the luma residual of a block: the CRC32C of
+// the residual bytes, shifted left by 5, with the block size added in.
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int16_t *const residual = x->plane[0].src_diff;
+  const int height = block_size_high[bsize];
+  const int width = block_size_wide[bsize];
+  // Each residual sample is an int16_t, i.e. two bytes.
+  const int num_bytes = 2 * height * width;
+  const uint32_t crc = av1_get_crc32c_value(
+      &x->txfm_search_info.mb_rd_record->crc_calculator, (uint8_t *)residual,
+      num_bytes);
+  return (crc << 5) + bsize;
+}
+
+// Searches the circular buffer of cached RD results for `hash`. Returns the
+// buffer slot of the first matching entry, or -1 when nothing matches or when
+// ref_best_rd is INT64_MAX (no lookup is attempted in that case).
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+                                      const int64_t ref_best_rd,
+                                      const uint32_t hash) {
+  if (ref_best_rd == INT64_MAX) return -1;
+
+  // Visit the valid entries in order, starting at index_start and wrapping
+  // around the end of the buffer.
+  for (int offset = 0; offset < mb_rd_record->num; ++offset) {
+    const int slot =
+        (mb_rd_record->index_start + offset) % RD_RECORD_BUFFER_LEN;
+    if (mb_rd_record->mb_rd_info[slot].hash_value == hash) return slot;
+  }
+  return -1;
+}
+
+// Restores a cached transform-search result into the current block state: the
+// tx size, the first n4 blk_skip entries, the inter tx size table and the
+// first n4 tx_type_map entries. The cached RD stats are returned through
+// rd_stats.
+static AOM_INLINE void fetch_mb_rd_info(int n4,
+                                        const MB_RD_INFO *const mb_rd_info,
+                                        RD_STATS *const rd_stats,
+                                        MACROBLOCK *const x) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+  MB_MODE_INFO *const mode_info = mbd->mi[0];
+  const size_t blk_skip_bytes = sizeof(mb_rd_info->blk_skip[0]) * n4;
+
+  mode_info->tx_size = mb_rd_info->tx_size;
+  memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip, blk_skip_bytes);
+  av1_copy(mode_info->inter_tx_size, mb_rd_info->inter_tx_size);
+  av1_copy_array(mbd->tx_type_map, mb_rd_info->tx_type_map, n4);
+  *rd_stats = mb_rd_info->rd_stats;
+}
+
+// Returns the sum of squared residuals over the visible part of the transform
+// block at (blk_row, blk_col) in the given plane. When block_mse_q8 is
+// non-NULL it also receives SSE * 256 / visible pixel count, or UINT_MAX when
+// no pixel is visible.
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+                            int blk_col, const BLOCK_SIZE plane_bsize,
+                            const BLOCK_SIZE tx_bsize,
+                            unsigned int *block_mse_q8) {
+  int vis_w, vis_h;
+  get_txb_dimensions(&x->e_mbd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+                     NULL, NULL, &vis_w, &vis_h);
+
+  // The residual buffer is laid out with the plane block width as stride;
+  // block coordinates are scaled by MI_SIZE_LOG2 to get a sample offset.
+  const int stride = block_size_wide[plane_bsize];
+  const int16_t *const residual =
+      x->plane[plane].src_diff + ((blk_row * stride + blk_col) << MI_SIZE_LOG2);
+  const uint64_t sse = aom_sum_squares_2d_i16(residual, stride, vis_w, vis_h);
+
+  if (block_mse_q8 == NULL) return sse;
+
+  const int has_pixels = vis_w > 0 && vis_h > 0;
+  *block_mse_q8 =
+      has_pixels ? (unsigned int)((256 * sse) / (vis_w * vis_h)) : UINT_MAX;
+  return sse;
+}
+
+// Computes residual statistics over the visible part of a transform block and
+// returns the sum of squared residuals (SSE).
+// When the visible area is non-empty the outputs are:
+// *block_mse_q8 - SSE * 256 / pixel count,
+// *per_px_mean - signed mean residual: the magnitude is truncated to an
+// integer, scaled by 128 (<< 7), then the sign is restored,
+// *block_var - SSE minus sum^2 / pixel count.
+// When the visible area is empty only *block_mse_q8 is written (UINT_MAX);
+// *per_px_mean and *block_var are left untouched, so they keep whatever the
+// caller stored in them. Unlike av1_pixel_diff_dist(), block_mse_q8 is
+// dereferenced unconditionally and must not be NULL.
+static INLINE int64_t pixel_diff_stats(
+ MACROBLOCK *x, int plane, int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ // The residual buffer uses the plane block width as its stride.
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ // Step to the first sample of this transform block.
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+ uint64_t sse = 0;
+ int sum = 0;
+ // One call yields both the SSE (return value) and the plain sum (via &sum).
+ sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+ if (visible_cols > 0 && visible_rows > 0) {
+ double norm_factor = 1.0 / (visible_cols * visible_rows);
+ // A zero sum yields -1 here, which is harmless: the magnitude is 0 too.
+ int sign_sum = sum > 0 ? 1 : -1;
+ // Conversion to transform domain
+ // The cast binds tighter than the shift: truncate first, then scale by 128.
+ *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+ *per_px_mean = sign_sum * (*per_px_mean);
+ *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+ *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+ } else {
+ *block_mse_q8 = UINT_MAX;
+ }
+ return sse;
+}
+
+// Uses simple features on top of DCT coefficients to quickly predict
+// whether optimal RD decision is to skip encoding the residual.
+// The sse value is stored in dist.
+// Returns 1 when the luma residual of the block is predicted to be skippable
+// and 0 otherwise. *dist is always written, whichever path returns.
+static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ // SSE of the whole luma block (plane 0, block origin, no MSE output wanted).
+ *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+ const int64_t mse = *dist / bw / bh;
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t normalized_dc_q = dc_q >> 3;
+ const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+ // For faster early skip decision, use dist to compare against threshold so
+ // that quality risk is less for the skip=1 decision. Otherwise, use mse
+ // since the fwd_txfm coeff checks will take care of quality
+ // TODO(any): Use dist to return 0 when skip_txfm_level is 1
+ int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse;
+ // Predict not to skip when error is larger than threshold.
+ if (pred_err > mse_thresh) return 0;
+ // Return as skip otherwise for aggressive early skip
+ else if (txfm_params->skip_txfm_level >= 2)
+ return 1;
+
+ // Slower path (skip_txfm_level below 2): tile the block with transforms of
+ // at most 16x16, run a forward DCT on each tile and require every
+ // coefficient to stay below its quantizer-scaled threshold.
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = is_cur_buf_hbd(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+ // Threshold row: 0 for 8-bit, 1 for 10-bit, 2 for any other bit depth.
+ const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *src_diff = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+ for (int row = 0; row < bh; row += tx_h) {
+ for (int col = 0; col < bw; col += tx_w) {
+ // Transform the tile starting at column `col` of the current tile row;
+ // the residual stride is the block width.
+ av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+ // Operating on TX domain, not pixels; we want the QTX quantizers
+ // Coefficient magnitudes are scaled by 128 (<< 7) to match the scaling of
+ // the threshold table.
+ const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+ if (dc_coef >= dc_thresh) return 0;
+ for (int i = 1; i < n_coeff; ++i) {
+ const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+ if (ac_coef >= ac_thresh) return 0;
+ }
+ }
+ // Advance to the next row of tiles.
+ src_diff += tx_h * bw;
+ }
+ // Every coefficient of every tile was below its threshold.
+ return 1;
+}
+
+ // Fills in the transform and RD state needed when the search terminates
+ // early with skip = 1.
+ static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
+                                      BLOCK_SIZE bsize, int64_t dist) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+   const int num_blks = bsize_to_num_blk(bsize);
+
+   // Every transform block gets DCT_DCT, the largest rectangular tx size for
+   // this block size, and its skip flag set.
+   mbmi->tx_size = tx_size;
+   memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+   memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * num_blks);
+   for (int blk = 0; blk < num_blks; ++blk) {
+     set_blk_skip(x->txfm_search_info.blk_skip, 0, blk, 1);
+   }
+
+   rd_stats->skip_txfm = 1;
+   if (is_cur_buf_hbd(xd)) {
+     dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+   }
+   rd_stats->sse = (dist << 4);
+   rd_stats->dist = rd_stats->sse;
+
+   // The skip decision here uses luma statistics only, so the block may still
+   // become non-skip after chroma RD. The caller's intermediate non-skip costs
+   // would also be wrong if the rate were left at zero, i.e. without
+   // zero_blk_rate. So an intermediate rate is stored that codes every luma tx
+   // block as skip, and the caller sets the final rate once the skip vs
+   // non-skip decision is made. The stored rate covers all tx blocks of the
+   // largest possible tx size in the block, each coded with zero_blk_rate.
+   // Eg: for a 128*128 block the rate is 4 * zero_blk_rate, where zero_blk_rate
+   // is the cost of coding one 64x64 tx block as 'all zeros'.
+   ENTROPY_CONTEXT above_ctx[MAX_MIB_SIZE];
+   ENTROPY_CONTEXT left_ctx[MAX_MIB_SIZE];
+   av1_get_entropy_contexts(bsize, &xd->plane[0], above_ctx, left_ctx);
+
+   TXB_CTX txb_ctx;
+   get_txb_ctx(bsize, tx_size, 0, above_ctx, left_ctx, &txb_ctx);
+
+   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+   const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+   const int tx_blks_wide = block_size_wide[bsize] >> tx_size_wide_log2[tx_size];
+   const int tx_blks_high = block_size_high[bsize] >> tx_size_high_log2[tx_size];
+   rd_stats->rate = zero_blk_rate * tx_blks_wide * tx_blks_high;
+ }
+
+ // Stores the current block's transform-search result in the circular record
+ // buffer under the given hash. Once the buffer is full, the entry at
+ // index_start is overwritten and index_start advances by one.
+ static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash,
+                                        const MACROBLOCK *const x,
+                                        const RD_STATS *const rd_stats,
+                                        MB_RD_RECORD *mb_rd_record) {
+   const int buffer_full = mb_rd_record->num >= RD_RECORD_BUFFER_LEN;
+   int slot;
+   if (buffer_full) {
+     // Reuse the slot at index_start and move the start forward.
+     slot = mb_rd_record->index_start;
+     mb_rd_record->index_start =
+         (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+   } else {
+     // Append after the last stored entry.
+     slot =
+         (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN;
+     ++mb_rd_record->num;
+   }
+
+   const MACROBLOCKD *const xd = &x->e_mbd;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   MB_RD_INFO *const entry = &mb_rd_record->mb_rd_info[slot];
+
+   entry->hash_value = hash;
+   entry->rd_stats = *rd_stats;
+   entry->tx_size = mbmi->tx_size;
+   av1_copy(entry->inter_tx_size, mbmi->inter_tx_size);
+   av1_copy_array(entry->tx_type_map, xd->tx_type_map, n4);
+   memcpy(entry->blk_skip, x->txfm_search_info.blk_skip,
+          sizeof(entry->blk_skip[0]) * n4);
+ }
+
+ // Returns the starting depth for the transform-size search.
+ // MAX_VARTX_DEPTH is returned for USE_LARGESTALL, and for blocks wider or
+ // taller than 64x64 when tx_size_search_lgr_block is set. Otherwise the depth
+ // comes from the speed-feature entry matching inter/intra and square/rect.
+ static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                  const SPEED_FEATURES *sf,
+                                  int tx_size_search_method) {
+   if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+   if (sf->tx_sf.tx_size_search_lgr_block &&
+       (mi_width > mi_size_wide[BLOCK_64X64] ||
+        mi_height > mi_size_high[BLOCK_64X64])) {
+     return MAX_VARTX_DEPTH;
+   }
+
+   const int is_rect = (mi_height != mi_width);
+   if (is_inter) {
+     if (is_rect) return sf->tx_sf.inter_tx_size_search_init_depth_rect;
+     return sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+   }
+   if (is_rect) return sf->tx_sf.intra_tx_size_search_init_depth_rect;
+   return sf->tx_sf.intra_tx_size_search_init_depth_sqr;
+ }
+
+ static AOM_INLINE void select_tx_block(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);  // Forward declaration (prototype only; no body at this point).
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+ static AOM_INLINE void get_energy_distribution_fine(  // Splits the src/dst error energy of a block over a 4x4 grid and reports each grid column's (hordist) and grid row's (verdist) share of the total.
+ const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int need_4th, double *hordist,
+ double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };  // esq[4 * r + c]: energy of grid cell (grid row r, grid column c).
+
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;  // log2 of the grid-cell width in pixels.
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;  // log2 of the grid-cell height in pixels.
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);  // The bottom-right pixel must land in cell 15.
+ if (cpi->common.seq_params->use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);  // Grid cell containing pixel (row i, column j).
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);  // Grid cell containing pixel (row i, column j).
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+ }
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;  // Block size of one grid cell; the asserts below require it to be 1/4 of bsize in each dimension.
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);  // Grid row 0; vf's return value is ignored, only what it stores through the last argument is kept (presumably the cell SSE -- confirm against the vf contract).
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[1]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[2]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;  // Step both pointers down to grid row 1.
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[5]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[6]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;  // Step both pointers down to grid row 2.
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[9]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[10]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;  // Step both pointers down to grid row 3.
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[13]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[14]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;  // hordist[c]: share of the total energy in grid column c.
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ if (need_4th) {  // The 4th entry is written only on request.
+ hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+ }
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;  // verdist[r]: share of the total energy in grid row r.
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ if (need_4th) {
+ verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+ }
+ } else {  // Zero total energy: report a uniform distribution.
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ if (need_4th) {
+ hordist[3] = verdist[3] = 0.25;
+ }
+ }
+ }
+
+ // Mean of the squared residual values over a w x h window of 'diff'.
+ static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+   double acc = 0.0;
+   for (int r = 0; r < h; ++r) {
+     const int16_t *const line = diff + r * stride;
+     for (int c = 0; c < w; ++c) {
+       const int v = line[c];
+       acc += v * v;
+     }
+   }
+   assert(w > 0 && h > 0);
+   return acc / (w * h);
+ }
+
+ // Mean of the absolute residual values over a w x h window of 'diff'.
+ static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+   double acc = 0.0;
+   for (int r = 0; r < h; ++r) {
+     const int16_t *const line = diff + r * stride;
+     for (int c = 0; c < w; ++c) {
+       acc += abs(line[c]);
+     }
+   }
+   assert(w > 0 && h > 0);
+   return acc / (w * h);
+ }
+
+ // Computes the per-sample SSE and/or SAD of the four quadrants of a block.
+ // Results are stored in raster order (top-left, top-right, bottom-left,
+ // bottom-right). Either output array may be NULL to skip that statistic.
+ static AOM_INLINE void get_2x2_normalized_sses_and_sads(
+     const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+     int src_stride, const uint8_t *const dst, int dst_stride,
+     const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+     double *const sad_norm_arr) {
+   const BLOCK_SIZE quad_bsize =
+       get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+
+   if (quad_bsize == BLOCK_INVALID) {
+     // No block size describes a quadrant, so work from the residual buffer.
+     const int quad_w = block_size_wide[tx_bsize] / 2;
+     const int quad_h = block_size_high[tx_bsize] / 2;
+     for (int q = 0; q < 4; ++q) {
+       const int row = q >> 1;
+       const int col = q & 1;
+       const int16_t *const quad_diff =
+           src_diff + row * quad_h * diff_stride + col * quad_w;
+       if (sse_norm_arr) {
+         sse_norm_arr[q] = get_sse_norm(quad_diff, diff_stride, quad_w, quad_h);
+       }
+       if (sad_norm_arr) {
+         sad_norm_arr[q] = get_sad_norm(quad_diff, diff_stride, quad_w, quad_h);
+       }
+     }
+     return;
+   }
+
+   // A quadrant block size exists, so use the per-block-size function pointers
+   // on the src/dst pixels.
+   const int quad_w = block_size_wide[quad_bsize];
+   const int quad_h = block_size_high[quad_bsize];
+   const int quad_samples = quad_w * quad_h;
+   for (int q = 0; q < 4; ++q) {
+     const int row = q >> 1;
+     const int col = q & 1;
+     const uint8_t *const quad_src =
+         src + row * quad_h * src_stride + col * quad_w;
+     const uint8_t *const quad_dst =
+         dst + row * quad_h * dst_stride + col * quad_w;
+
+     if (sse_norm_arr) {
+       unsigned int quad_sse;
+       cpi->ppi->fn_ptr[quad_bsize].vf(quad_src, src_stride, quad_dst,
+                                       dst_stride, &quad_sse);
+       sse_norm_arr[q] = (double)quad_sse / quad_samples;
+     }
+
+     if (sad_norm_arr) {
+       const unsigned int quad_sad = cpi->ppi->fn_ptr[quad_bsize].sdf(
+           quad_src, src_stride, quad_dst, dst_stride);
+       sad_norm_arr[q] = (double)quad_sad / quad_samples;
+     }
+   }
+ }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+ // Mean of the residual values over a w x h window of 'diff'.
+ static double get_mean(const int16_t *diff, int stride, int w, int h) {
+   double acc = 0.0;
+   for (int r = 0; r < h; ++r) {
+     const int16_t *const line = diff + r * stride;
+     for (int c = 0; c < w; ++c) {
+       acc += line[c];
+     }
+   }
+   assert(w > 0 && h > 0);
+   return acc / (w * h);
+ }
+ static AOM_INLINE void PrintTransformUnitStats(  // Appends one space-separated line of statistics for a sampled transform unit to tu_stats.txt.
+ const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ TX_TYPE tx_type, int64_t rd) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;  // Skip when rate or dist holds its maximum (sentinel) value.
+
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;  // static: the generator state carries over between calls.
+ if (lcg_rand16(&seed) % 256 > 0) return;  // Keep only calls whose draw is a multiple of 256.
+
+ const char output_file[] = "tu_stats.txt";
+ FILE *fout = fopen(output_file, "a");  // Append mode: each sampled call adds one line.
+ if (!fout) return;  // Silently give up if the file cannot be opened.
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;  // Only plane 0 is logged.
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide[tx_size];
+ const int txh = tx_size_high[tx_size];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int num_samples = txw * txh;  // Divisor for every "_norm" value below.
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+
+ fprintf(fout, "%g %g", rate_norm, dist_norm);  // Columns: rate, dist (per sample).
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ unsigned int sse;
+ cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);  // Return value ignored; only the value stored in 'sse' is used.
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm = (double)sad / num_samples;
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);  // Columns: sse, sad (per sample).
+
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *const src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ for (int i = 0; i < 4; ++i) {  // Columns: the four quadrant sse values.
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {  // Columns: the four quadrant sad values.
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+ fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+ tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);  // Columns: modelled rate, modelled dist (per sample).
+
+ const double mean = get_mean(src_diff, diff_stride, txw, txh);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+ 1, hdist, vdist);  // need_4th = 1, so all four entries of each array are filled.
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+ fprintf(fout, "\n");  // End of this record.
+ fclose(fout);
+ }
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
+ // Returns the plane-0 sse of the current block between source and dst,
+ // shifted left by 4. The other planes are visited but add nothing.
+ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+   const MACROBLOCKD *xd = &x->e_mbd;
+   const MB_MODE_INFO *mbmi = xd->mi[0];
+   const int num_planes = av1_num_planes(&cpi->common);
+   int64_t sse_sum = 0;
+
+   for (int plane = 0; plane < num_planes; ++plane) {
+     const struct macroblock_plane *const p = &x->plane[plane];
+     const struct macroblockd_plane *const pd = &xd->plane[plane];
+     const BLOCK_SIZE bs =
+         get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+     // Only plane 0 is measured.
+     if (plane == 0) {
+       unsigned int plane_sse;
+       cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                               pd->dst.stride, &plane_sse);
+       sse_sum += plane_sse;
+     }
+   }
+   return sse_sum << 4;
+ }
+
+ // Estimates the residue cost and distortion for 'sse' from the per-block-size
+ // inter-mode RD model. Returns 0 and leaves the outputs untouched when the
+ // model is not ready. Otherwise writes both outputs and returns 1.
+ static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                              int64_t sse, int *est_residue_cost,
+                              int64_t *est_dist) {
+   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+   if (!md->ready) return 0;
+
+   int cost = 0;
+   int64_t dist = sse;
+   if (!(sse < md->dist_mean)) {
+     dist = (int64_t)round(md->dist_mean);
+     const double est_ld = md->a * sse + md->b;
+     // Clamp estimated rate cost by INT_MAX / 2.
+     // TODO(angiebird@google.com): find better solution than clamping.
+     if (fabs(est_ld) < 1e-2) {
+       cost = INT_MAX / 2;
+     } else {
+       const double cost_dbl = ((sse - md->dist_mean) / est_ld);
+       cost = (cost_dbl < 0)
+                  ? 0
+                  : (int)AOMMIN((int64_t)round(cost_dbl), INT_MAX / 2);
+     }
+     // With no positive cost, fall back to the zero-cost estimate.
+     if (cost <= 0) {
+       cost = 0;
+       dist = sse;
+     }
+   }
+
+   *est_residue_cost = cost;
+   *est_dist = dist;
+   return 1;
+ }
+
+ // Mean of (src - dst) over a w x h window. Both buffers are accessed as
+ // 16-bit samples through CONVERT_TO_SHORTPTR.
+ static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+                                    const uint8_t *dst8, int dst_stride, int w,
+                                    int h) {
+   const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
+   const uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);
+   double acc = 0.0;
+   for (int r = 0; r < h; ++r) {
+     const uint16_t *const src_line = src16 + r * src_stride;
+     const uint16_t *const dst_line = dst16 + r * dst_stride;
+     for (int c = 0; c < w; ++c) {
+       acc += src_line[c] - dst_line[c];
+     }
+   }
+   assert(w > 0 && h > 0);
+   return acc / (w * h);
+ }
+
+ // Mean of (src - dst) over a w x h window of 8-bit samples.
+ static double get_diff_mean(const uint8_t *src, int src_stride,
+                             const uint8_t *dst, int dst_stride, int w, int h) {
+   double acc = 0.0;
+   for (int r = 0; r < h; ++r) {
+     const uint8_t *const src_line = src + r * src_stride;
+     const uint8_t *const dst_line = dst + r * dst_stride;
+     for (int c = 0; c < w; ++c) {
+       acc += src_line[c] - dst_line[c];
+     }
+   }
+   assert(w > 0 && h > 0);
+   return acc / (w * h);
+ }
+
+ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,  // Appends one space-separated line of statistics for a sampled prediction unit to pu_stats.txt.
+ const TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ const RD_STATS *const rd_stats,
+ BLOCK_SIZE plane_bsize) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;  // Skip when rate or dist holds its maximum (sentinel) value.
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ (tile_data == NULL ||
+ !tile_data->inter_mode_rd_models[plane_bsize].ready))
+ return;  // Model estimation is on but no ready model exists for this block size.
+ (void)tile_data;
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 95014;  // static: the generator state carries over between calls.
+
+ if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=  // NOTE(review): when num_pels_log2_lookup[plane_bsize] is 14 the modulus is 1, the remainder is always 0, and such blocks are never sampled -- confirm this is intended.
+ 1)
+ return;
+
+ const char output_file[] = "pu_stats.txt";
+ FILE *fout = fopen(output_file, "a");  // Append mode: each sampled call adds one line.
+ if (!fout) return;  // Silently give up if the file cannot be opened.
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;  // Only plane 0 is logged.
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;  // Divisor for most "_norm" values below (sad_norm uses its own).
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int shift = (xd->bd - 8);  // Bits above 8-bit depth; used below to rescale the quadrant stats and the mean.
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);  // Columns: rate, dist, rdcost (per sample).
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+
+ int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);  // Divides by the table's sample count for plane_bsize, not by num_samples.
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);  // Columns: sse, sad (normalized).
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));  // Squared quantity: rescale by 2 * shift bits.
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
+ for (int i = 0; i < 4; ++i) {  // Columns: the four quadrant sse values.
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {  // Columns: the four quadrant sad values.
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
+
+ double mean;
+ if (is_cur_buf_hbd(xd)) {
+ mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ }
+ mean /= (1 << shift);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, 1, hdist, vdist);  // need_4th = 1, so all four entries of each array are filled.
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {  // Three extra columns are written only in this mode.
+ assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+ const int64_t overall_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+ &est_dist);
+ const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+ const double est_dist_norm = (double)est_dist / num_samples;
+ const double est_rdcost_norm =
+ (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+ fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+ est_rdcost_norm);
+ }
+
+ fprintf(fout, "\n");  // End of this record.
+ fclose(fout);
+ }
+#endif // CONFIG_COLLECT_RD_STATS >= 2
+#endif // CONFIG_COLLECT_RD_STATS
+
+ // Runs av1_inverse_transform_block() on one transform block's dequantized
+ // coefficients, targeting that block's position in the plane's dst buffer.
+ // Does nothing when eob is 0.
+ static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x,
+                                                       int plane, int block,
+                                                       int blk_row, int blk_col,
+                                                       int eob,
+                                                       int reduced_tx_set) {
+   if (eob == 0) return;
+
+   MACROBLOCKD *const xd = &x->e_mbd;
+   tran_low_t *const block_dqcoeff =
+       x->plane[plane].dqcoeff + BLOCK_OFFSET(block);
+
+   const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+   const TX_TYPE tx_type = av1_get_tx_type(
+       xd, get_plane_type(plane), blk_row, blk_col, tx_size, reduced_tx_set);
+
+   // Locate the block inside the destination plane.
+   struct macroblockd_plane *const pd = &xd->plane[plane];
+   const int dst_stride = pd->dst.stride;
+   const int dst_offset = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
+   uint8_t *const block_dst = &pd->dst.buf[dst_offset];
+
+   av1_inverse_transform_block(xd, block_dqcoeff, plane, tx_type, tx_size,
+                               block_dst, dst_stride, eob, reduced_tx_set);
+ }
+
+ // Reconstructs an intra transform block with its best tx_type. It acts only
+ // when the block is intra, best_eob is non-zero, and the block is not the
+ // bottom-right transform block of the plane block. When do_quant is set the
+ // transform and quantization are redone first.
+ static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                                int block, int blk_row, int blk_col,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                const TXB_CTX *const txb_ctx, int skip_trellis,
+                                TX_TYPE best_tx_type, int do_quant,
+                                int *rate_cost, uint16_t best_eob) {
+   const AV1_COMMON *cm = &cpi->common;
+   MACROBLOCKD *xd = &x->e_mbd;
+   MB_MODE_INFO *mbmi = xd->mi[0];
+
+   if (is_inter_block(mbmi) || !best_eob) return;
+
+   const int has_rows_below =
+       blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize];
+   const int has_cols_right =
+       blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize];
+   if (!has_rows_below && !has_cols_right) return;
+
+   // When the quantized coefficients are already in the dqcoeff buffer
+   // (do_quant == 0), the transform and quantization are not repeated.
+   if (do_quant) {
+     TxfmParam txfm_param_intra;
+     QUANT_PARAM quant_param_intra;
+     av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra);
+     av1_setup_quant(tx_size, !skip_trellis,
+                     (skip_trellis && USE_B_QUANT_NO_TRELLIS)
+                         ? AV1_XFORM_QUANT_B
+                         : AV1_XFORM_QUANT_FP,
+                     cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
+     av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
+                       &quant_param_intra);
+     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                     &txfm_param_intra, &quant_param_intra);
+     if (quant_param_intra.use_optimize_b) {
+       av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+                      rate_cost);
+     }
+   }
+
+   inverse_transform_block_facade(x, plane, block, blk_row, blk_col,
+                                  x->plane[plane].eobs[block],
+                                  cm->features.reduced_tx_set_used);
+
+   // A hash collision can leave a non-zero eob in the hash table while the
+   // real eob is zero. In that case tx_type must be forced to DCT_DCT.
+   const int real_eob_is_zero = (x->plane[plane].eobs[block] == 0);
+   if (plane == 0 && real_eob_is_zero && best_tx_type != DCT_DCT) {
+     update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+   }
+ }
+
+ // Returns the sse between src and dst over the visible part of a transform
+ // block. A fully visible block uses the per-block-size function pointer.
+ // A partially visible block uses the odd-size sse helpers.
+ static unsigned pixel_dist_visible_only(
+     const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+     const int src_stride, const uint8_t *dst, const int dst_stride,
+     const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+     int visible_cols) {
+   const int fully_visible =
+       (txb_rows == visible_rows) && (txb_cols == visible_cols);
+   if (fully_visible) {
+     unsigned full_sse;
+     cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &full_sse);
+     return full_sse;
+   }
+
+ #if CONFIG_AV1_HIGHBITDEPTH
+   const MACROBLOCKD *xd = &x->e_mbd;
+   if (is_cur_buf_hbd(xd)) {
+     const uint64_t hbd_sse = aom_highbd_sse_odd_size(
+         src, src_stride, dst, dst_stride, visible_cols, visible_rows);
+     return (unsigned int)ROUND_POWER_OF_TWO(hbd_sse, (xd->bd - 8) * 2);
+   }
+ #else
+   (void)x;
+ #endif
+   const unsigned partial_sse = aom_sse_odd_size(
+       src, src_stride, dst, dst_stride, visible_cols, visible_rows);
+   return partial_sse;
+ }
+
+ // Compute the pixel domain distortion from src and dst on all visible 4x4s
+ // in the transform block.
+ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+                            int plane, const uint8_t *src, const int src_stride,
+                            const uint8_t *dst, const int dst_stride,
+                            int blk_row, int blk_col,
+                            const BLOCK_SIZE plane_bsize,
+                            const BLOCK_SIZE tx_bsize) {
+   int full_cols, full_rows, shown_cols, shown_rows;
+
+   get_txb_dimensions(&x->e_mbd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+                      &full_cols, &full_rows, &shown_cols, &shown_rows);
+   assert(shown_rows > 0);
+   assert(shown_cols > 0);
+
+   return pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+                                  tx_bsize, full_rows, full_cols, shown_rows,
+                                  shown_cols);
+ }
+
+ // Pixel-domain distortion of one transform block. The current prediction is
+ // copied from dst into a scratch buffer and av1_inverse_transform_block() is
+ // run on that copy with the block's dequantized coefficients. The result is
+ // 16 times the pixel_dist() of the copy against src.
+ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+                                            int plane, BLOCK_SIZE plane_bsize,
+                                            int block, int blk_row, int blk_col,
+                                            TX_SIZE tx_size) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const struct macroblock_plane *const p = &x->plane[plane];
+   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+   const int tx_w = block_size_wide[tx_bsize];
+   const int tx_h = block_size_high[tx_bsize];
+   const uint16_t eob = p->eobs[block];
+   const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+   // Convert the transform block position into pixel offsets.
+   const int src_stride = x->plane[plane].src.stride;
+   const int dst_stride = xd->plane[plane].dst.stride;
+   const uint8_t *src =
+       &x->plane[plane].src.buf[(blk_row * src_stride + blk_col)
+                                << MI_SIZE_LOG2];
+   const uint8_t *dst =
+       &xd->plane[plane].dst.buf[(blk_row * dst_stride + blk_col)
+                                 << MI_SIZE_LOG2];
+
+   assert(cpi != NULL);
+   assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+   // Scratch buffer sized for 16-bit samples; the 8-bit path reuses the same
+   // storage through a byte pointer.
+   DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+   uint8_t *recon;
+
+ #if CONFIG_AV1_HIGHBITDEPTH
+   if (is_cur_buf_hbd(xd)) {
+     recon = CONVERT_TO_BYTEPTR(recon16);
+     aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                              CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, tx_w,
+                              tx_h);
+   } else {
+     recon = (uint8_t *)recon16;
+     aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, tx_w, tx_h);
+   }
+ #else
+   recon = (uint8_t *)recon16;
+   aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, tx_w, tx_h);
+ #endif
+
+   const int reduced_tx_set = cpi->common.features.reduced_tx_set_used;
+   TX_TYPE tx_type = av1_get_tx_type(xd, get_plane_type(plane), blk_row,
+                                     blk_col, tx_size, reduced_tx_set);
+   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+                               MAX_TX_SIZE, eob, reduced_tx_set);
+
+   const unsigned block_sse =
+       pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, blk_row,
+                  blk_col, plane_bsize, tx_bsize);
+   return 16 * block_sse;
+ }
+
+ // Pruning thresholds for prune_txk_type and prune_txk_type_separ.
+ static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 (presumably thousandths, e.g. 200 -> 0.2; confirm at the use site)
+ static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 (presumably hundredths, e.g. 80 -> 0.8; confirm at the use site)
+
+ // Sorts rds[] into ascending order and applies the same permutation to
+ // txk[]. Entries with equal cost keep their relative order.
+ static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
+   for (int cur = 1; cur < len; ++cur) {
+     // Find the first earlier entry that is strictly larger than rds[cur].
+     int pos = 0;
+     while (pos < cur && !(rds[pos] > rds[cur])) ++pos;
+     if (pos == cur) continue;  // Already in place.
+
+     const int64_t moved_rd = rds[cur];
+     const int moved_txk = txk[cur];
+
+     // Shift [pos, cur) one slot to the right and drop the entry in at pos.
+     for (int k = cur; k > pos; --k) {
+       rds[k] = rds[k - 1];
+       txk[k] = txk[k - 1];
+     }
+     rds[pos] = moved_rd;
+     txk[pos] = moved_txk;
+   }
+ }
+
+static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ const qm_val_t *qmatrix,
+ const int16_t *scan, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ int64_t weight = qmatrix[scan[i]];
+ int64_t dd = coeff[i] - dqcoeff[i];
+ dd *= weight;
+ int64_t cc = coeff[i];
+ cc *= weight;
+ // The ranges of coeff and dqcoeff are
+ // bd8 : 18 bits (including sign)
+ // bd10: 20 bits (including sign)
+ // bd12: 22 bits (including sign)
+ // As AOM_QM_BITS is 5, the intermediate quantities in the calculation
+ // below should fit in 54 bits, thus no overflow should happen.
+ error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+// Computes the distortion and the source energy of one transform block
+// directly from its coefficients (coeff vs. dqcoeff of 'plane' at 'block').
+// This avoids an inverse transform, at the price of being less accurate than
+// a pixel-domain measurement.
+// 'qmatrix'/'scan' are only consulted on the low-bitdepth path, and only when
+// a quant matrix is present and use_qm_dist_metric is enabled.
+// Both outputs are shifted down according to the transform scale so that they
+// are comparable with pixel-domain values.
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+                                        TX_SIZE tx_size,
+                                        const qm_val_t *qmatrix,
+                                        const int16_t *scan, int64_t *out_dist,
+                                        int64_t *out_sse) {
+  const struct macroblock_plane *const mb_plane = &x->plane[plane];
+  const int coeff_offset = BLOCK_OFFSET(block);
+  tran_low_t *const src_coeff = mb_plane->coeff + coeff_offset;
+  tran_low_t *const deq_coeff = mb_plane->dqcoeff + coeff_offset;
+  const int num_coeffs = av1_get_max_eob(tx_size);
+  // TX-domain results need to shift down to Q2/D10 to match pixel
+  // domain distortion values which are in Q2^2
+  const int scale_shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  int64_t raw_dist;
+  int64_t raw_sse;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  MACROBLOCKD *const xd = &x->e_mbd;
+  if (is_cur_buf_hbd(xd)) {
+    // TODO(veluca): handle use_qm_dist_metric for HBD too.
+    raw_dist = av1_highbd_block_error(src_coeff, deq_coeff, num_coeffs,
+                                      &raw_sse, xd->bd);
+  } else {
+#endif
+    if (qmatrix != NULL && x->txfm_search_params.use_qm_dist_metric) {
+      raw_dist = av1_block_error_qm(src_coeff, deq_coeff, num_coeffs, qmatrix,
+                                    scan, &raw_sse);
+    } else {
+      raw_dist = av1_block_error(src_coeff, deq_coeff, num_coeffs, &raw_sse);
+    }
+#if CONFIG_AV1_HIGHBITDEPTH
+  }
+#endif
+
+  *out_dist = RIGHT_SIGNED_SHIFT(raw_dist, scale_shift);
+  *out_sse = RIGHT_SIGNED_SHIFT(raw_sse, scale_shift);
+}
+
+// Prunes 2D transform types with a separable search: the four horizontal
+// kernels are evaluated first (combined with a vertical DCT), then the
+// remaining vertical kernels are evaluated with the best horizontal kernel.
+// The 1D estimates are combined into a score for each of the 16 2D types.
+//
+// txk_map          Output. Accepted candidates are written to the front,
+//                  sorted by ascending estimated cost; rejected types are
+//                  written from the back.
+// allowed_tx_mask  Bit i set means tx type i may be considered.
+// prune_factor     A ranked candidate is kept while
+//                  1800 * (rd - best_rd) / best_rd stays below this value.
+// ref_best_rd      1D kernels whose estimate satisfies
+//                  rd - (rd >> 2) > ref_best_rd are skipped.
+// num_sel          Upper bound on how many ranked candidates are examined.
+//
+// Returns a mask in which a set bit means "prune this tx type" (callers apply
+// it as allowed_tx_mask &= ~prune). 0xFFFF is returned when nothing is worth
+// searching: either the best horizontal kernel is already skipped, or no 2D
+// candidate survives the mask/skip filters.
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                              int block, TX_SIZE tx_size, int blk_row,
+                              int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+                              int16_t allowed_tx_mask, int prune_factor,
+                              const TXB_CTX *const txb_ctx,
+                              int reduced_tx_set_used, int64_t ref_best_rd,
+                              int num_sel) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int idx;
+
+  int64_t rds_v[4];
+  int64_t rds_h[4];
+  int idx_v[4] = { 0, 1, 2, 3 };
+  int idx_h[4] = { 0, 1, 2, 3 };
+  int skip_v[4] = { 0 };
+  int skip_h[4] = { 0 };
+  // 4x4 grid of 2D types: the row is the vertical kernel index and the column
+  // is the horizontal kernel index (looked up as idx_map[v * 4 + h]).
+  const int idx_map[16] = {
+    DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
+    ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
+    FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+    H_DCT,        H_ADST,        H_FLIPADST,        IDTX
+  };
+
+  // Order in which (vertical rank, horizontal rank) pairs are visited when
+  // the 2D candidates are assembled.
+  const int sel_pattern_v[16] = {
+    0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3
+  };
+  const int sel_pattern_h[16] = {
+    0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3
+  };
+
+  QUANT_PARAM quant_param;
+  TxfmParam txfm_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+                  &quant_param);
+  int tx_type;
+  // to ensure we can try ones even outside of ext_tx_set of current block
+  // this function should only be called for size < 16
+  assert(txsize_sqr_up_map[tx_size] <= TX_16X16);
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  int rate_cost = 0;
+  int64_t dist = 0, sse = 0;
+  // evaluate horizontal with vertical DCT
+  for (idx = 0; idx < 4; ++idx) {
+    tx_type = idx_map[idx];
+    txfm_param.tx_type = tx_type;
+
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
+
+    rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+                                              txb_ctx, reduced_tx_set_used, 0);
+
+    rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+    if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) {
+      skip_h[idx] = 1;
+    }
+  }
+  sort_rd(rds_h, idx_h, 4);
+  // Also skip horizontal kernels that are more than 20% worse than the best.
+  for (idx = 1; idx < 4; idx++) {
+    if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
+  }
+
+  if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
+
+  // evaluate vertical with the best horizontal chosen
+  rds_v[0] = rds_h[0];
+  int start_v = 1, end_v = 4;
+  const int *idx_map_v = idx_map + idx_h[0];
+
+  for (idx = start_v; idx < end_v; ++idx) {
+    tx_type = idx_map_v[idx_v[idx] * 4];
+    txfm_param.tx_type = tx_type;
+
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
+
+    rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+                                              txb_ctx, reduced_tx_set_used, 0);
+
+    rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+    if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) {
+      skip_v[idx] = 1;
+    }
+  }
+  sort_rd(rds_v, idx_v, 4);
+  // Also skip vertical kernels that are more than 20% worse than the best.
+  for (idx = 1; idx < 4; idx++) {
+    if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1;
+  }
+
+  // combine rd_h and rd_v to prune tx candidates
+  int i_v, i_h;
+  int64_t rds[16];
+  int num_cand = 0, last = TX_TYPES - 1;
+
+  for (int i = 0; i < 16; i++) {
+    i_v = sel_pattern_v[i];
+    i_h = sel_pattern_h[i];
+    tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]];
+    if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] ||
+        skip_v[idx_v[i_v]]) {
+      txk_map[last] = tx_type;
+      last--;
+    } else {
+      txk_map[num_cand] = tx_type;
+      rds[num_cand] = rds_v[i_v] + rds_h[i_h];
+      // Keep the cost non-zero: rds[0] is used as a divisor below.
+      if (rds[num_cand] == 0) rds[num_cand] = 1;
+      num_cand++;
+    }
+  }
+
+  // Every 2D type was rejected. txk_map[0] then holds a rejected type, so it
+  // must not be treated as the best candidate; report everything as pruned,
+  // in line with the early exit above and with prune_txk_type().
+  if (num_cand == 0) return (uint16_t)0xFFFF;
+
+  sort_rd(rds, txk_map, num_cand);
+
+  // The best candidate is always kept.
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+  num_sel = AOMMIN(num_sel, num_cand);
+
+  for (int i = 1; i < num_sel; i++) {
+    int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]);
+    if (factor < (int64_t)prune_factor)
+      prune &= ~(1 << txk_map[i]);
+    else
+      break;
+  }
+  return prune;
+}
+
+// Prunes transform types by estimating the RD cost of every allowed type
+// (forward transform + quantization, Laplacian rate estimate and
+// transform-domain distortion).
+//
+// txk_map is rewritten: evaluated types are placed at the front and sorted by
+// ascending estimated cost, types outside allowed_tx_mask are placed from the
+// back. The returned mask has a bit set for every type that should be pruned;
+// 0xFFFF means no type was evaluated at all.
+uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                        int block, TX_SIZE tx_size, int blk_row, int blk_col,
+                        BLOCK_SIZE plane_bsize, int *txk_map,
+                        uint16_t allowed_tx_mask, int prune_factor,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int64_t est_rd[TX_TYPES];
+  int kept = 0;                      // next free slot at the front of txk_map
+  int rejected_slot = TX_TYPES - 1;  // next free slot at the back of txk_map
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+                  &quant_param);
+
+  for (int type = 0; type < TX_TYPES; type++) {
+    if ((allowed_tx_mask & (1 << type)) == 0) {
+      txk_map[rejected_slot--] = type;
+      continue;
+    }
+
+    // Forward transform and quantization with this type.
+    txfm_param.tx_type = type;
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, type,
+                      &quant_param);
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Rate estimate.
+    const int rate = av1_cost_coeffs_txb_laplacian(
+        x, plane, block, tx_size, type, txb_ctx, reduced_tx_set_used, 0);
+
+    // Transform-domain distortion.
+    int64_t dist = 0, sse = 0;
+    const SCAN_ORDER *const scan_order =
+        get_scan(txfm_param.tx_size, txfm_param.tx_type);
+    dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+                         scan_order->scan, &dist, &sse);
+
+    txk_map[kept] = type;
+    est_rd[kept] = RDCOST(x->rdmult, rate, dist);
+    // Keep the cost non-zero: est_rd[0] is used as a divisor below.
+    if (est_rd[kept] == 0) est_rd[kept] = 1;
+    kept++;
+  }
+
+  if (kept == 0) return (uint16_t)0xFFFF;
+
+  sort_rd(est_rd, txk_map, kept);
+
+  // The cheapest type always survives.
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+  // 0 < prune_factor <= 1000 controls aggressiveness
+  for (int rank = 1; rank < kept; rank++) {
+    const int64_t rel_excess = 1000 * (est_rd[rank] - est_rd[0]) / est_rd[0];
+    if (rel_excess >= (int64_t)prune_factor) break;
+    prune &= ~(1 << txk_map[rank]);
+  }
+  return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e. selecting a threshold with index i
+// will lead to pruning i+1 TX types on average.
+// The outer index is the TX_SIZE (one entry per size, in the order given by
+// the per-entry comments below); the inner index is the pruning
+// aggressiveness chosen in get_adaptive_thresholds().
+// NULL entries have no threshold row and must not be dereferenced; note that
+// the rows do not all have the same length (TX_16X16 has 10 values, the
+// others 14).
+static const float *prune_2D_adaptive_thresholds[] = {
+ // TX_4X4
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
+ // TX_8X8
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
+ // TX_16X16
+ (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+ 0.06897f, 0.07629f, 0.08875f, 0.11169f },
+ // TX_32X32
+ NULL,
+ // TX_64X64
+ NULL,
+ // TX_4X8
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
+ // TX_8X4
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
+ // TX_8X16
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
+ // TX_16X8
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
+ // TX_16X32
+ NULL,
+ // TX_32X16
+ NULL,
+ // TX_32X64
+ NULL,
+ // TX_64X32
+ NULL,
+ // TX_4X16
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
+ // TX_16X4
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
+ // TX_8X32
+ NULL,
+ // TX_32X8
+ NULL,
+ // TX_16X64
+ NULL,
+ // TX_64X16
+ NULL,
+};
+
+// Looks up the softmax score threshold used by prune_tx_2D().
+// The pruning mode selects a row of the aggressiveness table and the
+// transform set selects the column (0: EXT_TX_SET_ALL16,
+// 1: EXT_TX_SET_DTT9_IDTX_1DDCT); any other set uses aggressiveness 0.
+// The caller must make sure prune_2D_adaptive_thresholds has a non-NULL row
+// for 'tx_size'.
+static INLINE float get_adaptive_thresholds(
+    TX_SIZE tx_size, TxSetType tx_set_type,
+    TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
+  const int aggr_by_mode[5][2] = {
+    { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 }
+  };
+  const int mode_row = prune_2d_txfm_mode - TX_TYPE_PRUNE_1;
+  int aggressiveness = 0;
+
+  if (tx_set_type == EXT_TX_SET_ALL16) {
+    aggressiveness = aggr_by_mode[mode_row][0];
+  } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
+    aggressiveness = aggr_by_mode[mode_row][1];
+  }
+
+  const float *const size_thresholds = prune_2D_adaptive_thresholds[tx_size];
+  return size_thresholds[aggressiveness];
+}
+
+// Computes normalized horizontal and vertical energy profiles of a residual
+// block.
+// diff/stride   Residual samples and their row stride.
+// bw, bh        Block width and height in samples.
+// hordist       Output: per-column share of the total energy; only the first
+//               esq_w - 1 entries are written.
+// verdist       Output: per-row share of the total energy; only the first
+//               esq_h - 1 entries are written.
+// A dimension larger than 8 is downscaled by 2 before the projection, so
+// esq_w/esq_h equal bw/bh for sizes <= 8 and half of them otherwise.
+static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff,
+ int stride, int bw, int bh,
+ float *hordist,
+ float *verdist) {
+ // First compute downscaled block energy values (esq); downscale factors
+ // are defined by w_shift and h_shift.
+ unsigned int esq[256];
+ const int w_shift = bw <= 8 ? 0 : 1;
+ const int h_shift = bh <= 8 ? 0 : 1;
+ const int esq_w = bw >> w_shift;
+ const int esq_h = bh >> h_shift;
+ const int esq_sz = esq_w * esq_h;
+ int i, j;
+ memset(esq, 0, esq_sz * sizeof(esq[0]));
+ if (w_shift) {
+ // Horizontal downscale: each esq cell sums two neighbouring squared samples
+ // (rows are merged as well when h_shift is set).
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j += 2) {
+ cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+ cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+ }
+ }
+ } else {
+ // No horizontal downscale: one esq column per sample column.
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j++) {
+ cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+ }
+ }
+ }
+
+ uint64_t total = 0;
+ for (i = 0; i < esq_sz; i++) total += esq[i];
+
+ // Output hordist and verdist arrays are normalized 1D projections of esq
+ if (total == 0) {
+ // All-zero residual: report a uniform distribution instead of dividing by
+ // zero.
+ float hor_val = 1.0f / esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+ float ver_val = 1.0f / esq_h;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+ return;
+ }
+
+ const float e_recip = 1.0f / (float)total;
+ memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+ memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+ const unsigned int *cur_esq_row;
+ for (i = 0; i < esq_h - 1; i++) {
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) {
+ hordist[j] += (float)cur_esq_row[j];
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ // j == esq_w - 1 here: the last column only contributes to the row sum.
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ // i == esq_h - 1 here: the last row only contributes to the column sums.
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+ for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+// Returns true when bit 'val' of 'mask' is set.
+static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) {
+  const int bit = 1 << val;
+  return (mask & bit) != 0;
+}
+
+// Sets bit 'val' in '*mask'; the other bits are left untouched.
+static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) {
+  const int bit = 1 << val;
+  *mask = (uint16_t)(*mask | bit);
+}
+
+// Clears bit 'val' in '*mask'; the other bits are left untouched.
+static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) {
+  const int bit = 1 << val;
+  *mask = (uint16_t)(*mask & ~bit);
+}
+
+// Prunes 2D transform types with two small neural networks, one fed with
+// horizontal and one with vertical features of the plane-0 residual.
+//
+// x, bsize, blk_row, blk_col  Locate the residual: plane 0 src_diff with a
+//                             stride of block_size_wide[bsize], offset by
+//                             4 * blk_row rows and 4 * blk_col columns.
+// tx_set_type                 Only EXT_TX_SET_ALL16 and
+//                             EXT_TX_SET_DTT9_IDTX_1DDCT are handled; any
+//                             other set returns without changes.
+// prune_2d_txfm_mode          Selects the score threshold and, from
+//                             TX_TYPE_PRUNE_4 on, an extra cumulative-score
+//                             pruning pass.
+// txk_map                     Output: overwritten with 16 tx types (see the
+//                             two memcpy() calls below).
+// allowed_tx_mask             In/out: on entry the types that may be used, on
+//                             exit the subset that survived.
+// The function also returns without changes when no model exists for
+// 'tx_size'.
+static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int blk_row, int blk_col, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
+ uint16_t *allowed_tx_mask) {
+ // This table is used because the search order is different from the enum
+ // order.
+ static const int tx_type_table_2D[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+ if (tx_set_type != EXT_TX_SET_ALL16 &&
+ tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+ return;
+#if CONFIG_NN_V2
+ NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#else
+ const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#endif
+ if (!nn_config_hor || !nn_config_ver) return; // Model not established yet.
+
+ float hfeatures[16], vfeatures[16];
+ float hscores[4], vscores[4];
+ float scores_2D_raw[16];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ // Feature counts: one per sample for sizes <= 8, one per two samples above.
+ const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+ const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+ assert(hfeatures_num <= 16);
+ assert(vfeatures_num <= 16);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ // Fills the first (features_num - 1) entries of each feature vector.
+ get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+ vfeatures);
+
+ // The last feature of each vector is the correlation value written here.
+ av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+ &hfeatures[hfeatures_num - 1],
+ &vfeatures[vfeatures_num - 1]);
+
+#if CONFIG_NN_V2
+ av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+ av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
+#else
+ av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+ av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
+#endif
+
+ // 2D score = vertical score * horizontal score, stored row-major with the
+ // vertical index as the row, i.e. in the same layout as tx_type_table_2D.
+ for (int i = 0; i < 4; i++) {
+ float *cur_scores_2D = scores_2D_raw + i * 4;
+ cur_scores_2D[0] = vscores[i] * hscores[0];
+ cur_scores_2D[1] = vscores[i] * hscores[1];
+ cur_scores_2D[2] = vscores[i] * hscores[2];
+ cur_scores_2D[3] = vscores[i] * hscores[3];
+ }
+
+ assert(TX_TYPES == 16);
+ // This version of the function only works when there are at most 16 classes.
+ // So we will need to change the optimization or use av1_nn_softmax instead if
+ // this ever gets changed.
+ av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
+
+ const float score_thresh =
+ get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
+
+ // Always keep the TX type with the highest score, prune all others with
+ // score below score_thresh.
+ int max_score_i = 0;
+ float max_score = 0.0f;
+ uint16_t allow_bitmask = 0;
+ float sum_score = 0.0;
+ // Calculate sum of allowed tx type score and Populate allow bit mask based
+ // on score_thresh and allowed_tx_mask
+ int allow_count = 0;
+ // tx_type_allowed[k] / scores_2D[k] describe the k-th surviving type; unused
+ // slots keep TX_TYPE_INVALID / -1.
+ int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID };
+ float scores_2D[16] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ };
+ for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+ const int allow_tx_type =
+ check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]);
+ if (!allow_tx_type) {
+ continue;
+ }
+ // Track the best-scoring type among those the caller allows.
+ if (scores_2D_raw[tx_idx] > max_score) {
+ max_score = scores_2D_raw[tx_idx];
+ max_score_i = tx_idx;
+ }
+ if (scores_2D_raw[tx_idx] >= score_thresh) {
+ // Set allow mask based on score_thresh
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
+
+ // Accumulate score of allowed tx type
+ sum_score += scores_2D_raw[tx_idx];
+
+ scores_2D[allow_count] = scores_2D_raw[tx_idx];
+ tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
+ allow_count += 1;
+ }
+ }
+ if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) {
+ // If even the tx_type with max score is pruned, this means that no other
+ // tx_type is feasible. When this happens, we force enable max_score_i and
+ // end the search.
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
+ memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+ return;
+ }
+
+ // Sort tx type probability of all types
+ // NOTE(review): the cumulative pass below presumably relies on these helpers
+ // ordering the scores from highest to lowest - confirm against their
+ // definitions.
+ if (allow_count <= 8) {
+ av1_sort_fi32_8(scores_2D, tx_type_allowed);
+ } else {
+ av1_sort_fi32_16(scores_2D, tx_type_allowed);
+ }
+
+ // Enable more pruning based on tx type probability and number of allowed tx
+ // types
+ if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) {
+ float temp_score = 0.0;
+ float score_ratio = 0.0;
+ int tx_idx, tx_count = 0;
+ // Converts an accumulated score into a percentage of sum_score.
+ const float inv_sum_score = 100 / sum_score;
+ // Get allowed tx types based on sorted probability score and tx count
+ for (tx_idx = 0; tx_idx < allow_count; tx_idx++) {
+ // Skip the tx type which has more than 30% of cumulative
+ // probability and allowed tx type count is more than 2
+ if (score_ratio > 30.0 && tx_count >= 2) break;
+
+ assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx]));
+ // Calculate cumulative probability
+ temp_score += scores_2D[tx_idx];
+
+ // Calculate percentage of cumulative probability of allowed tx type
+ score_ratio = temp_score * inv_sum_score;
+ tx_count++;
+ }
+ // Set remaining tx types as pruned
+ for (; tx_idx < allow_count; tx_idx++)
+ unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
+ }
+
+ // Both arrays hold 16 ints, so the size of tx_type_table_2D also covers
+ // tx_type_allowed.
+ memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+}
+
+// Returns the standard deviation of 'num' samples given their mean and the
+// sum of their squares (x2_sum): sqrt(E[x^2] - mean^2). A non-positive
+// variance yields 0.
+static float get_dev(float mean, double x2_sum, int num) {
+  const float mean_of_squares = (float)(x2_sum / num);
+  const float variance = mean_of_squares - mean * mean;
+  if (variance > 0) return sqrtf(variance);
+  return 0;
+}
+
+// Writes the features used by the ML model that predicts tx split.
+// Layout of 'features':
+//   [0], [1]  mean and standard deviation of the whole bw x bh block
+//   then      (mean, standard deviation) for every sub-block, row by row
+//   then      deviation of the sub-block means
+//   last      mean of the sub-block deviations
+// The block is halved along its longer side (along both sides when square),
+// which gives 2 or 4 sub-blocks, so at most 12 values are written and
+// 'features' must have room for at least 12 elements.
+// Returns the number of elements written.
+static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride,
+                                            int bw, int bh, float *features) {
+  const int sub_h = (bh >= bw) ? (bh >> 1) : bh;
+  const int sub_w = (bw >= bh) ? (bw >> 1) : bw;
+  const int blk_pixels = bw * bh;
+  const int sub_pixels = sub_w * sub_h;
+  int blk_sum = 0;
+  int64_t blk_sq_sum = 0;
+  int sub_count = 0;
+  double sum_sq_means = 0.0f;
+  float sum_devs = 0.0f;
+  // Slots 0 and 1 are filled once the whole-block sums are known.
+  float *out = features + 2;
+
+  for (int r = 0; r < bh; r += sub_h) {
+    const int16_t *const row_base = data + r * stride;
+    for (int c = 0; c < bw; c += sub_w) {
+      int sub_sum;
+      int64_t sub_sq_sum;
+      // TODO(any): Write a SIMD version. Clear registers.
+      aom_get_blk_sse_sum(row_base + c, stride, sub_w, sub_h, &sub_sum,
+                          &sub_sq_sum);
+      blk_sum += sub_sum;
+      blk_sq_sum += sub_sq_sum;
+
+      const float sub_mean = (float)sub_sum / sub_pixels;
+      const float sub_dev = get_dev(sub_mean, (double)sub_sq_sum, sub_pixels);
+      *out++ = sub_mean;
+      *out++ = sub_dev;
+      sum_sq_means += (double)(sub_mean * sub_mean);
+      sum_devs += sub_dev;
+      sub_count++;
+    }
+  }
+
+  const float blk_mean = (float)blk_sum / blk_pixels;
+  features[0] = blk_mean;
+  features[1] = get_dev(blk_mean, (double)blk_sq_sum, blk_pixels);
+
+  // Deviation of means.
+  *out++ = get_dev(blk_mean, sum_sq_means, sub_count);
+  // Mean of deviations.
+  *out++ = sum_devs / sub_count;
+
+  return (int)(out - features);
+}
+
+// Runs the tx-split model for one transform block of the plane-0 residual.
+// Returns -1 when no model exists for 'tx_size'; otherwise the model output
+// scaled by 10000 and clamped to [-80000, 80000].
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+                               int blk_col, TX_SIZE tx_size) {
+  const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+  if (nn_config == NULL) return -1;
+
+  // Residual of the transform block: blk_row/blk_col are in units of 4
+  // samples inside a buffer whose stride is the width of 'bsize'.
+  const int resid_stride = block_size_wide[bsize];
+  const int16_t *const resid =
+      x->plane[0].src_diff + 4 * blk_row * resid_stride + 4 * blk_col;
+
+  float features[64] = { 0.0f };
+  get_mean_dev_features(resid, resid_stride, tx_size_wide[tx_size],
+                        tx_size_high[tx_size], features);
+
+  float raw_score = 0.0f;
+  av1_nn_predict(features, nn_config, 1, &raw_score);
+
+  const int scaled_score = (int)(raw_score * 10000);
+  return clamp(scaled_score, -80000, 80000);
+}
+
+// Decides which transform types are searched for one transform block.
+//
+// Returns a 16-bit mask in which bit i set means tx type i is searched.
+// *allowed_txk_types receives either one specific tx type (only that type is
+// allowed) or TX_TYPES (more than one type may be allowed).
+// txk_map may be rewritten by the pruning helpers (prune_txk_type,
+// prune_txk_type_separ, prune_tx_2D) called below.
+// ref_best_rd is only forwarded to prune_txk_type_separ.
+static INLINE uint16_t
+get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int is_inter = is_inter_block(mbmi);
+ const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+ // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed <
+ // TX_TYPES, only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+
+ // Pick the per-update-type, per-tx-size table of tx type probabilities.
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ const int *tx_type_probs;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ tx_type_probs =
+ (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size];
+ }
+#endif
+ if (use_actual_frame_probs) {
+ tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size];
+ }
+
+ // Step 1: try to settle on a single tx type from the speed-feature settings
+ // and the frame-level probabilities.
+ if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
+ (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) {
+ txk_allowed =
+ get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
+ } else if (is_inter &&
+ txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) {
+ if (tx_type_probs[DEFAULT_INTER_TX_TYPE] >
+ txfm_params->default_inter_tx_type_prob_thresh) {
+ txk_allowed = DEFAULT_INTER_TX_TYPE;
+ } else {
+ int force_tx_type = 0;
+ int max_prob = 0;
+ const int tx_type_prob_threshold =
+ txfm_params->default_inter_tx_type_prob_thresh +
+ PROB_THRESH_OFFSET_TX_TYPE;
+ for (int i = 1; i < TX_TYPES; i++) { // find maximum probability.
+ if (tx_type_probs[i] > max_prob) {
+ max_prob = tx_type_probs[i];
+ force_tx_type = i;
+ }
+ }
+ if (max_prob > tx_type_prob_threshold) // force tx type with max prob.
+ txk_allowed = force_tx_type;
+ else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+ }
+ } else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter, cm->features.reduced_tx_set_used);
+
+ // Step 2: for chroma planes the type is taken from av1_get_tx_type() and
+ // overrides whatever step 1 selected.
+ TX_TYPE uv_tx_type = DCT_DCT;
+ if (plane) {
+ // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+ uv_tx_type = txk_allowed =
+ av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ }
+ // Step 3: build the set of types the transform set (and the optional
+ // reduced-intra tables) permits.
+ PREDICTION_MODE intra_dir =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
+ : mbmi->mode;
+ uint16_t ext_tx_used_flag =
+ cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 &&
+ tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
+ ? av1_reduced_intra_tx_used_flag[intra_dir]
+ : av1_ext_tx_used_flag[tx_set_type];
+
+ if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2)
+ ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir];
+
+ // Cases in which only DCT_DCT is used. 0x0001 is the mask that contains
+ // bit 0 only (DCT_DCT).
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+ ext_tx_used_flag == 0x0001 ||
+ (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||
+ (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) {
+ txk_allowed = DCT_DCT;
+ }
+
+ if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
+ ext_tx_used_flag &= DCT_ADST_TX_MASK;
+
+ // Step 4: turn the decision into a mask and, when several types remain,
+ // prune them.
+ uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
+ if (txk_allowed < TX_TYPES) {
+ allowed_tx_mask = 1 << txk_allowed;
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else if (fast_tx_search) {
+ allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else {
+ assert(plane == 0);
+ allowed_tx_mask = ext_tx_used_flag;
+ int num_allowed = 0;
+ int i;
+
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ // Prune types whose frame-level probability is below a threshold that
+ // depends on the speed-feature level and the frame update type; the most
+ // probable allowed type is always kept.
+ static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 },
+ { 10, 17, 17, 10, 17, 17, 17 } };
+ const int thresh =
+ thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1]
+ [update_type];
+ uint16_t prune = 0;
+ int max_prob = -1;
+ int max_idx = 0;
+ for (i = 0; i < TX_TYPES; i++) {
+ if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) {
+ max_prob = tx_type_probs[i];
+ max_idx = i;
+ }
+ if (tx_type_probs[i] < thresh) prune |= (1 << i);
+ }
+ if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx);
+ allowed_tx_mask &= (~prune);
+ }
+ for (i = 0; i < TX_TYPES; i++) {
+ if (allowed_tx_mask & (1 << i)) num_allowed++;
+ }
+ assert(num_allowed > 0);
+
+ if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+ // Estimated-RD pruning: exhaustive over the candidates when there are
+ // few of them, separable (horizontal then vertical) otherwise.
+ int pf = prune_factors[txfm_params->prune_2d_txfm_mode];
+ int mf = mul_factors[txfm_params->prune_2d_txfm_mode];
+ if (num_allowed <= 7) {
+ const uint16_t prune =
+ prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
+ plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ allowed_tx_mask &= (~prune);
+ } else {
+ // mf is a percentage; adding 50 rounds to the nearest integer.
+ const int num_sel = (num_allowed * mf + 50) / 100;
+ const uint16_t prune = prune_txk_type_separ(
+ cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+ txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used, ref_best_rd, num_sel);
+
+ allowed_tx_mask &= (~prune);
+ }
+ } else {
+ assert(num_allowed > 0);
+ // Neural-network pruning, used for inter blocks only and only when more
+ // than allowed_tx_count types remain.
+ int allowed_tx_count =
+ (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5;
+ // !fast_tx_search && txk_end != txk_start && plane == 0
+ if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
+ num_allowed > allowed_tx_count) {
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
+ }
+ }
+ }
+
+ // Need to have at least one transform type allowed.
+ if (allowed_tx_mask == 0) {
+ txk_allowed = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_allowed);
+ }
+
+ assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
+ *allowed_txk_types = txk_allowed;
+ return allowed_tx_mask;
+}
+
+#if CONFIG_RD_DEBUG
+// Debug bookkeeping: adds 'txb_coeff_cost' to the per-plane coefficient cost
+// total kept in 'rd_stats'.
+static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+                                         int txb_coeff_cost) {
+  rd_stats->txb_coeff_cost[plane] =
+      rd_stats->txb_coeff_cost[plane] + txb_coeff_cost;
+}
+#endif
+
+// Returns the rate of the coefficients of one transform block as computed by
+// av1_cost_coeffs_txb(); all arguments are forwarded unchanged.
+// When TXCOEFF_COST_TIMER is enabled the call is additionally timed and the
+// elapsed time and a call counter are accumulated.
+// NOTE(review): the timer branch dereferences 'cpi', which is neither a
+// parameter nor a local of this function, so it looks like this cannot
+// compile with TXCOEFF_COST_TIMER enabled - confirm before turning it on.
+static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used) {
+#if TXCOEFF_COST_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used);
+#if TXCOEFF_COST_TIMER
+ AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ tmp_cm->txcoeff_cost_timer += elapsed_time;
+ ++tmp_cm->txcoeff_cost_count;
+#endif
+ return cost;
+}
+
+// Decides whether trellis optimization is skipped for one transform block,
+// based on the SATD of its transform coefficients.
+// If trellis is already disabled (skip_trellis != 0) or the threshold is
+// UINT_MAX, 'skip_trellis' is returned and 'quant_param' is left untouched.
+// Otherwise the SATD (only |DC| for a DC-only block), normalized by the
+// transform scale and the bit depth, is compared against
+// coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size];
+// 'quant_param' is set up to match the outcome and 1 is returned when
+// trellis is skipped, 0 otherwise.
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+                                          QUANT_PARAM *quant_param, int plane,
+                                          int block, TX_SIZE tx_size,
+                                          int quant_b_adapt, int qstep,
+                                          unsigned int coeff_opt_satd_threshold,
+                                          int skip_trellis, int dc_only_blk) {
+  if (skip_trellis) return skip_trellis;
+  if (coeff_opt_satd_threshold == UINT_MAX) return skip_trellis;
+
+  const struct macroblock_plane *const mb_plane = &x->plane[plane];
+  tran_low_t *const coeffs = mb_plane->coeff + BLOCK_OFFSET(block);
+  const int num_coeffs = av1_get_max_eob(tx_size);
+  const int scale_shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+
+  int satd;
+  if (dc_only_blk) {
+    satd = abs(coeffs[0]);
+  } else {
+    satd = aom_satd(coeffs, num_coeffs);
+  }
+  satd = RIGHT_SIGNED_SHIFT(satd, scale_shift);
+  satd >>= (x->e_mbd.bd - 8);
+
+  const int skip_block_trellis =
+      ((uint64_t)satd >
+       (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+  // Without trellis the B quantizer is used when USE_B_QUANT_NO_TRELLIS is
+  // set; in every other case the FP quantizer is used.
+  int quant_idx = AV1_XFORM_QUANT_FP;
+  if (skip_block_trellis && USE_B_QUANT_NO_TRELLIS) {
+    quant_idx = AV1_XFORM_QUANT_B;
+  }
+  av1_setup_quant(tx_size, !skip_block_trellis, quant_idx, quant_b_adapt,
+                  quant_param);
+
+  return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold. For such blocks, transform type search is bypassed.
+//
+// Outputs:
+//   block_sse, block_mse_q8, per_px_mean  Residual statistics obtained from
+//                                         pixel_diff_stats().
+//   best_rd_stats                         Filled in (skip_txfm, dist, sse,
+//                                         rate, rdcost) only when the block
+//                                         is predicted to be a skip block.
+//   dc_only_blk                           Set to 1 when the block is
+//                                         predicted to be DC only; otherwise
+//                                         left unchanged.
+// When a skip block is predicted, the eob and entropy context of 'block' in
+// x->plane[plane] are also reset to 0.
+static INLINE void predict_dc_only_block(
+ MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+ int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+ int *dc_only_blk) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+ uint64_t block_var = UINT64_MAX;
+ // NOTE(review): the DC step always uses a shift of 3, whereas the AC step
+ // above uses the bit-depth dependent dequant_shift - confirm this is
+ // intended for high bit depth.
+ const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+ *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], block_mse_q8,
+ per_px_mean, &block_var);
+ assert((*block_mse_q8) != UINT_MAX);
+ uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+ // Bring the high-bitdepth variance back to the 8-bit scale before comparing.
+ if (is_cur_buf_hbd(xd))
+ block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+
+ // Variance too high: neither a skip nor a DC-only prediction is made.
+ if (block_var >= var_threshold) return;
+ const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level;
+ assert(predict_dc_level != 0);
+
+ // Prediction of skip block if residual mean and variance are less
+ // than qstep based threshold
+ if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) {
+ // If the normalized mean of residual block is less than the dc qstep and
+ // the normalized block variance is less than ac qstep, then the block is
+ // assumed to be a skip block and its rdcost is updated accordingly.
+ best_rd_stats->skip_txfm = 1;
+
+ x->plane[plane].eobs[block] = 0;
+
+ if (is_cur_buf_hbd(xd))
+ *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+
+ // With no coefficients coded, the distortion equals the source SSE
+ // (scaled by 16).
+ best_rd_stats->dist = (*block_sse) << 4;
+ best_rd_stats->sse = best_rd_stats->dist;
+
+ // The rate is the cost of signalling an all-zero block in the entropy
+ // context derived from the above/left contexts.
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx_tmp;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+ .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+ best_rd_stats->rate = zero_blk_rate;
+
+ best_rd_stats->rdcost =
+ RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+ x->plane[plane].txb_entropy_ctx[block] = 0;
+ } else if (predict_dc_level > 1) {
+ // Predict DC only blocks based on residual variance.
+ // For chroma plane, this prediction is disabled for intra blocks.
+ if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
+ }
+}
+
+// Search for the best transform type for a given transform block.
+// This function can be used for both inter and intra, both luma and chroma.
+//
+// Every candidate transform type permitted by the tx mask is transformed,
+// quantized (optionally with trellis optimization) and costed; the candidate
+// with the lowest RDCOST is kept. On return:
+//  - best_rd_stats holds the winner's rate/dist/sse and its skip_txfm flag,
+//  - the block's eobs[] and txb_entropy_ctx[] entries hold the winner's values,
+//  - for luma (plane == 0) the tx type map is updated with the winning type,
+//  - recon_intra() has been invoked with the winning type.
+// ref_best_rd is only used for pruning (tx mask selection and the early
+// termination test inside the candidate loop).
+static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
+ int64_t ref_best_rd, RD_STATS *best_rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ int64_t best_rd = INT64_MAX;
+ uint16_t best_eob = 0;
+ TX_TYPE best_tx_type = DCT_DCT;
+ int rate_cost = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ // Two dqcoeff buffers are used: p->dqcoeff receives the current candidate,
+ // best_dqcoeff keeps the best one so far. They are swapped whenever a new
+ // best candidate is found, and p->dqcoeff is restored before returning.
+ tran_low_t *orig_dqcoeff = p->dqcoeff;
+ tran_low_t *best_dqcoeff = x->dqcoeff_buf;
+ // All writes to tx_type_map in this function are guarded by plane == 0, so
+ // the index is only meaningful for luma; chroma uses 0 as a placeholder.
+ const int tx_type_map_idx =
+ plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
+ av1_invalid_rd_stats(best_rd_stats);
+
+ // Trellis is also skipped when is_trellis_used() reports it as disabled for
+ // this segment's optimize setting.
+ skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
+ DRY_RUN_NORMAL);
+
+ uint8_t best_txb_ctx = 0;
+ // txk_allowed = TX_TYPES: >1 tx types are allowed
+ // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+ int txk_map[TX_TYPES] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ int64_t block_sse;
+ unsigned int block_mse_q8;
+ int dc_only_blk = 0;
+ // Skip / DC-only prediction is not attempted when either transform dimension
+ // is 64.
+ const bool predict_dc_block =
+ txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64;
+ int64_t per_px_mean = INT64_MAX;
+ if (predict_dc_block) {
+ predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+ blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+ &per_px_mean, &dc_only_blk);
+ // The block was predicted as skip: best_rd_stats is already filled in, so
+ // record DCT_DCT for luma and return without searching.
+ if (best_rd_stats->skip_txfm == 1) {
+ const TX_TYPE tx_type = DCT_DCT;
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ return;
+ }
+ } else {
+ block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
+ }
+
+ // Bit mask to indicate which transform types are allowed in the RD search.
+ uint16_t tx_mask;
+
+ // Use DCT_DCT transform for DC only block.
+ if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
+ tx_mask = 1 << DCT_DCT;
+ else
+ tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+ &txk_allowed, txk_map);
+ const uint16_t allowed_tx_mask = tx_mask;
+
+ // For high bit depth buffers, scale sse and mse down by 2 * (bd - 8) bits.
+ if (is_cur_buf_hbd(xd)) {
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+ }
+ // Same x16 scaling as the `<< 4` applied to the sse in
+ // predict_dc_only_block().
+ block_sse *= 16;
+ // Use mse / qstep^2 based threshold logic to take decision of R-D
+ // optimization of coeffs. For smaller residuals, coeff optimization
+ // would be helpful. For larger residuals, R-D optimization may not be
+ // effective.
+ // TODO(any): Experiment with variance and mean based thresholds
+ const int perform_block_coeff_opt =
+ ((uint64_t)block_mse_q8 <=
+ (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
+ skip_trellis |= !perform_block_coeff_opt;
+
+ // Flag to indicate if distortion should be calculated in transform domain or
+ // not during iterating through transform type candidates.
+ // Transform domain distortion is accurate for higher residuals.
+ // TODO(any): Experiment with variance and mean based thresholds
+ int use_transform_domain_distortion =
+ (txfm_params->use_transform_domain_distortion > 0) &&
+ (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
+ // Any 64-pt transforms only preserves half the coefficients.
+ // Therefore transform domain distortion is not valid for these
+ // transform sizes.
+ (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ // Use pixel domain distortion for DC only blocks
+ !dc_only_blk;
+ // Flag to indicate if an extra calculation of distortion in the pixel domain
+ // should be performed at the end, after the best transform type has been
+ // decided.
+ int calc_pixel_domain_distortion_final =
+ txfm_params->use_transform_domain_distortion == 1 &&
+ use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+ // When only one candidate can be evaluated (a single allowed type, or a mask
+ // containing only bit 0), switch to pixel domain distortion inside the loop
+ // and drop the separate final pass.
+ if (calc_pixel_domain_distortion_final &&
+ (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+ calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+ const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ // Per tx type result of skip_trellis_opt_based_on_satd(); the entry of the
+ // winning type is passed on to recon_intra() below.
+ int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, !skip_trellis,
+ skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+
+ // Iterate through all transform type candidates.
+ for (int idx = 0; idx < TX_TYPES; ++idx) {
+ const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+ if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type))
+ continue;
+ txfm_param.tx_type = tx_type;
+ if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ }
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+
+ // DC-only blocks use the dedicated DC transform driven by per_px_mean.
+ if (!dc_only_blk)
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+ else
+ av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
+
+ skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+ x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
+ qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk);
+
+ av1_quant(x, plane, block, &txfm_param, &quant_param);
+
+ // Calculate rate cost of quantized coefficients.
+ if (quant_param.use_optimize_b) {
+ // TODO(aomedia:3209): update Trellis quantization to take into account
+ // quantization matrices.
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ &rate_cost);
+ } else {
+ rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ }
+
+ // If rd cost based on coeff rate alone is already more than best_rd,
+ // terminate early.
+ if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+ // Calculate distortion.
+ if (eobs_ptr[block] == 0) {
+ // When eob is 0, pixel domain distortion is more efficient and accurate.
+ this_rd_stats.dist = this_rd_stats.sse = block_sse;
+ } else if (dc_only_blk) {
+ this_rd_stats.sse = block_sse;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ } else if (use_transform_domain_distortion) {
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ } else {
+ int64_t sse_diff = INT64_MAX;
+ // high_energy threshold assumes that every pixel within a txfm block
+ // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+ // for 8 bit.
+ const int64_t high_energy_thresh =
+ ((int64_t)128 * 128 * tx_size_2d[tx_size]);
+ const int is_high_energy = (block_sse >= high_energy_thresh);
+ if (tx_size == TX_64X64 || is_high_energy) {
+ // Because 3 out 4 quadrants of transform coefficients are forced to
+ // zero, the inverse transform has a tendency to overflow. sse_diff
+ // is effectively the energy of those 3 quadrants, here we use it
+ // to decide if we should do pixel domain distortion. If the energy
+ // is mostly in first quadrant, then it is unlikely that we have
+ // overflow issue in inverse transform.
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ sse_diff = block_sse - this_rd_stats.sse;
+ }
+ // Pixel domain distortion is used unless this is a high energy TX_64X64
+ // block whose sse_diff is at least half of the transform domain sse; in
+ // that case the transform domain distortion plus sse_diff is kept.
+ if (tx_size != TX_64X64 || !is_high_energy ||
+ (sse_diff * 2) < this_rd_stats.sse) {
+ const int64_t tx_domain_dist = this_rd_stats.dist;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ // For high energy blocks, occasionally, the pixel domain distortion
+ // can be artificially low due to clamping at reconstruction stage
+ // even when inverse transform output is hugely different from the
+ // actual residue.
+ if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+ this_rd_stats.dist = tx_domain_dist;
+ } else {
+ assert(sse_diff < INT64_MAX);
+ this_rd_stats.dist += sse_diff;
+ }
+ this_rd_stats.sse = block_sse;
+ }
+
+ this_rd_stats.rate = rate_cost;
+
+ const int64_t rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+ // New best candidate: remember its stats, entropy context and eob, and
+ // keep its dequantized coefficients.
+ if (rd < best_rd) {
+ best_rd = rd;
+ *best_rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+ best_eob = x->plane[plane].eobs[block];
+ // Swap dqcoeff buffers
+ tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+ best_dqcoeff = p->dqcoeff;
+ p->dqcoeff = tmp_dqcoeff;
+ }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+ if (plane == 0) {
+ PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+ plane_bsize, tx_size, tx_type, rd);
+ }
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if COLLECT_TX_SIZE_DATA
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 200 == 0) {
+ FILE *fp = NULL;
+
+ // NOTE(review): within_border is not declared anywhere in this function;
+ // this section presumably only builds if it is provided elsewhere -
+ // confirm before enabling COLLECT_TX_SIZE_DATA.
+ if (within_border) {
+ fp = fopen(av1_tx_size_data_output_file, "a");
+ }
+
+ if (fp) {
+ // Transform info and RD
+ const int txb_w = tx_size_wide[tx_size];
+ const int txb_h = tx_size_high[tx_size];
+
+ // Residue signal.
+ const int diff_stride = block_size_wide[plane_bsize];
+ struct macroblock_plane *const p = &x->plane[plane];
+ const int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
+
+ for (int r = 0; r < txb_h; ++r) {
+ for (int c = 0; c < txb_w; ++c) {
+ fprintf(fp, "%d,", src_diff[c]);
+ }
+ src_diff += diff_stride;
+ }
+
+ fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd);
+ fprintf(fp, "\n");
+ fclose(fp);
+ }
+ }
+#endif // COLLECT_TX_SIZE_DATA
+
+ // If the current best RD cost is much worse than the reference RD cost,
+ // terminate early.
+ if (cpi->sf.tx_sf.adaptive_txb_search_level) {
+ if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
+ ref_best_rd) {
+ break;
+ }
+ }
+
+ // Terminate transform type search if the block has been quantized to
+ // all zero.
+ if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break;
+ }
+
+ assert(best_rd != INT64_MAX);
+
+ // Commit the winning candidate's state for this block.
+ best_rd_stats->skip_txfm = best_eob == 0;
+ if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+ x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+ x->plane[plane].eobs[block] = best_eob;
+ // Use the trellis decision that was recorded for the winning tx type.
+ skip_trellis = skip_trellis_based_on_satd[best_tx_type];
+
+ // Point dqcoeff to the quantized coefficients corresponding to the best
+ // transform type, then we can skip transform and quantization, e.g. in the
+ // final pixel domain distortion calculation and recon_intra().
+ p->dqcoeff = best_dqcoeff;
+
+ if (calc_pixel_domain_distortion_final && best_eob) {
+ best_rd_stats->dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ best_rd_stats->sse = block_sse;
+ }
+
+ // Intra mode needs decoded pixels such that the next transform block
+ // can use them for prediction.
+ recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+ // Restore the plane's original dqcoeff pointer.
+ p->dqcoeff = orig_dqcoeff;
+}
+
+// Runs the transform type search for one luma transform block of size tx_size
+// and merges the resulting RD stats into rd_stats. Only valid for
+// inter-predicted blocks (asserted).
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  TX_SIZE tx_size, int blk_row, int blk_col,
+                                  int block, int plane_bsize, TXB_CTX *txb_ctx,
+                                  RD_STATS *rd_stats,
+                                  FAST_TX_SEARCH_MODE ftxs_mode,
+                                  int64_t ref_rdcost) {
+  assert(is_inter_block(x->e_mbd.mi[0]));
+
+  // Search on plane 0 (luma) without forcing trellis off.
+  RD_STATS txb_rd_stats;
+  search_tx_type(cpi, x, /*plane=*/0, block, blk_row, blk_col, plane_bsize,
+                 tx_size, txb_ctx, ftxs_mode, /*skip_trellis=*/0, ref_rdcost,
+                 &txb_rd_stats);
+
+  av1_merge_rd_stats(rd_stats, &txb_rd_stats);
+}
+
+// Evaluates coding the luma transform block at (blk_row, blk_col) as a single
+// transform of size tx_size, i.e. without splitting it further.
+//
+// rd_stats receives the accumulated rate/dist/sse (including the "no split"
+// partition signalling cost when a split would have been possible) and the
+// skip decision; no_split receives the RD cost, the entropy context and the
+// transform type chosen for this candidate.
+static AOM_INLINE void try_tx_block_no_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+    const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+    int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblock_plane *const luma = &x->plane[0];
+
+  // Cost of signalling an all-zero transform block in this context.
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, 0, ta + blk_col, tl + blk_row, &txb_ctx);
+  const int all_zero_rate =
+      x->coeff_costs.coeff_costs[get_txsize_entropy_ctx(tx_size)][PLANE_TYPE_Y]
+          .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->zero_rate = all_zero_rate;
+
+  mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, blk_col)] =
+      tx_size;
+  tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+             rd_stats, ftxs_mode, ref_best_rd);
+  assert(rd_stats->rate < INT_MAX);
+
+  // Outside lossless mode, code the block as skipped when the search already
+  // marked it as skip, or when coding the coefficients is not cheaper (in RD
+  // terms) than signalling an all-zero block.
+  int pick_skip_txfm = 0;
+  if (!xd->lossless[mbmi->segment_id]) {
+    if (rd_stats->skip_txfm == 1) {
+      pick_skip_txfm = 1;
+    } else {
+      const int64_t coded_rd =
+          RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+      const int64_t all_zero_rd =
+          RDCOST(x->rdmult, all_zero_rate, rd_stats->sse);
+      pick_skip_txfm = coded_rd >= all_zero_rd;
+    }
+  }
+
+  if (pick_skip_txfm) {
+#if CONFIG_RD_DEBUG
+    update_txb_coeff_cost(rd_stats, 0, all_zero_rate - rd_stats->rate);
+#endif  // CONFIG_RD_DEBUG
+    rd_stats->rate = all_zero_rate;
+    rd_stats->dist = rd_stats->sse;
+    luma->eobs[block] = 0;
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+  rd_stats->skip_txfm = pick_skip_txfm;
+
+  const int mi_cols = mi_size_wide[plane_bsize];
+  set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * mi_cols + blk_col,
+               pick_skip_txfm);
+
+  // A "do not split" flag is only paid for when a split was possible.
+  const int split_was_possible = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+  if (split_was_possible) {
+    rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0];
+  }
+
+  no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+  no_split->txb_entropy_ctx = luma->txb_entropy_ctx[block];
+  no_split->tx_type =
+      xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
+}
+
+// Evaluates splitting the luma transform block at (blk_row, blk_col) into
+// sub-blocks of sub_tx_size_map[tx_size], recursing through select_tx_block()
+// for every sub-block that lies inside the visible block area.
+//
+// split_rd_stats accumulates the split signalling cost plus the stats of all
+// sub-blocks. Its rdcost is set to INT64_MAX as soon as a sub-block search
+// is invalid or the running cost exceeds ref_best_rd.
+static AOM_INLINE void try_tx_block_split(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+    FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
+  assert(tx_size < TX_SIZES_ALL);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int row_limit = max_block_high(xd, plane_bsize, 0);
+  const int col_limit = max_block_wide(xd, plane_bsize, 0);
+
+  // Geometry of the parent block and of its children.
+  const int parent_w = tx_size_wide_unit[tx_size];
+  const int parent_h = tx_size_high_unit[tx_size];
+  const TX_SIZE child_tx = sub_tx_size_map[tx_size];
+  const int child_w = tx_size_wide_unit[child_tx];
+  const int child_h = tx_size_high_unit[child_tx];
+  const int child_units = child_w * child_h;
+  const int num_children = (parent_h / child_h) * (parent_w / child_w);
+  assert(num_children > 0);
+
+  // Start from the cost of signalling the split itself.
+  av1_init_rd_stats(split_rd_stats);
+  split_rd_stats->rate =
+      x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1];
+
+  int child_idx = 0;
+  for (int dr = 0; dr < parent_h; dr += child_h) {
+    const int row = blk_row + dr;
+    if (row >= row_limit) break;
+
+    for (int dc = 0; dc < parent_w; dc += child_w, ++child_idx) {
+      assert(child_idx < 4);
+      const int col = blk_col + dc;
+      if (col >= col_limit) continue;
+
+      RD_STATS child_stats;
+      int child_valid = 1;
+      const int64_t remaining_rd = ref_best_rd - split_rd_stats->rdcost;
+      select_tx_block(cpi, x, row, col, block, child_tx, depth + 1,
+                      plane_bsize, ta, tl, tx_above, tx_left, &child_stats,
+                      no_split_rd / num_children, remaining_rd, &child_valid,
+                      ftxs_mode);
+
+      if (child_valid) {
+        av1_merge_rd_stats(split_rd_stats, &child_stats);
+        split_rd_stats->rdcost =
+            RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+      }
+      // Give up on the split when a child failed or the budget is exceeded.
+      if (!child_valid || split_rd_stats->rdcost > ref_best_rd) {
+        split_rd_stats->rdcost = INT64_MAX;
+        return;
+      }
+      block += child_units;
+    }
+  }
+}
+
+// Returns the variance of `num` samples, given their mean and the sum of their
+// squares: E[x^2] - mean^2.
+static float get_var(float mean, double x2_sum, int num) {
+  const float mean_of_squares = (float)(x2_sum / num);
+  return mean_of_squares - mean * mean;
+}
+
+// Computes statistics over a residual block and its sub-blocks.
+//
+// The bw x bh block at `data` (row pitch `stride`) is divided into sub-blocks:
+// four quadrants when the block is square, otherwise two halves along the
+// longer dimension. The mean and the variance are computed for every sub-block
+// and for the whole block, giving (number of sub-blocks + 1) samples.
+//
+// Outputs (written only when more than one sub-block was visited, so callers
+// must initialize them):
+//   *dev_of_mean - get_dev() over the sample means.
+//   *var_of_vars - variance of the sample variances.
+static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw,
+                                       int bh, float *dev_of_mean,
+                                       float *var_of_vars) {
+  const int16_t *const data_ptr = &data[0];
+  // Halve whichever dimension is not the shorter one (both when square).
+  const int subh = (bh >= bw) ? (bh >> 1) : bh;
+  const int subw = (bw >= bh) ? (bw >> 1) : bw;
+  const int num = bw * bh;
+  const int sub_num = subw * subh;
+  int total_x_sum = 0;
+  int64_t total_x2_sum = 0;
+  int blk_idx = 0;  // Number of sub-blocks visited so far.
+  float var_sum = 0.0f;
+  float mean_sum = 0.0f;
+  double var2_sum = 0.0;
+  double mean2_sum = 0.0;
+
+  for (int row = 0; row < bh; row += subh) {
+    for (int col = 0; col < bw; col += subw) {
+      int x_sum;
+      int64_t x2_sum;
+      aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+                          &x_sum, &x2_sum);
+      total_x_sum += x_sum;
+      total_x2_sum += x2_sum;
+
+      const float mean = (float)x_sum / sub_num;
+      const float var = get_var(mean, (double)x2_sum, sub_num);
+      mean_sum += mean;
+      mean2_sum += (double)(mean * mean);
+      var_sum += var;
+      var2_sum += var * var;
+      blk_idx++;
+    }
+  }
+
+  // The whole block contributes one more sample.
+  const float lvl0_mean = (float)total_x_sum / num;
+  const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+  mean_sum += lvl0_mean;
+  mean2_sum += (double)(lvl0_mean * lvl0_mean);
+  var_sum += block_var;
+  var2_sum += block_var * block_var;
+
+  if (blk_idx > 1) {
+    // Average over the actual number of samples. Square blocks produce 5
+    // samples, but rectangular blocks produce only 3 (two halves + the whole
+    // block), so a fixed divisor of 5 would bias the mean for them and be
+    // inconsistent with the sample count passed to get_dev()/get_var() below.
+    const int num_samples = blk_idx + 1;
+    const float av_mean = mean_sum / num_samples;
+    // Deviation of means.
+    *dev_of_mean = get_dev(av_mean, mean2_sum, num_samples);
+    // Variance of variances.
+    const float mean_var = var_sum / num_samples;
+    *var_of_vars = get_var(mean_var, var2_sum, num_samples);
+  }
+}
+
+// Decides, from residual statistics, whether the "split" and/or "no split"
+// transform candidates can be skipped for the luma block at
+// (blk_row, blk_col).
+//
+// The deviation of means and the variance of variances of the block and its
+// sub-blocks are compared against thresholds derived from the DC and AC
+// quantizer steps, scaled according to pruning_level (used as an index into
+// 4-entry tables). *try_split and *try_no_split are only ever cleared here.
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+                                    int blk_row, int blk_col, TX_SIZE tx_size,
+                                    int *try_no_split, int *try_split,
+                                    int pruning_level) {
+  // Threshold scale factors, indexed by pruning level.
+  static const int kNoSplitThreshScales[4] = { 0, 24, 8, 8 };
+  static const int kSplitThreshScales[4] = { 0, 24, 10, 8 };
+
+  const int residual_stride = block_size_wide[bsize];
+  const int16_t *const residual = x->plane[0].src_diff +
+                                  4 * blk_row * residual_stride + 4 * blk_col;
+
+  // Statistics of the block together with its sub-blocks.
+  float mean_dev = 0.0f;
+  float variance_var = 0.0f;
+  get_blk_var_dev(residual, residual_stride, tx_size_wide[tx_size],
+                  tx_size_high[tx_size], &mean_dev, &variance_var);
+
+  const int dc_step = x->plane[0].dequant_QTX[0] >> 3;
+  const int ac_step = x->plane[0].dequant_QTX[1] >> 3;
+  const int no_split_scale = kNoSplitThreshScales[pruning_level];
+  const int split_scale = kSplitThreshScales[pruning_level];
+
+  // Statistics at or below the quantizer-derived limits: skip the split.
+  const int skip_split = (mean_dev <= dc_step) &&
+                         (split_scale * variance_var <= ac_step * ac_step);
+  if (skip_split) *try_split = 0;
+
+  // Statistics above the scaled limits: skip the single large transform.
+  const int skip_no_split =
+      (mean_dev > no_split_scale * dc_step) &&
+      (variance_var > no_split_scale * ac_step * ac_step);
+  if (skip_no_split) *try_no_split = 0;
+}
+
+// Recursively searches the best transform partition and transform type for an
+// inter-predicted luma block. The "no split" candidate (one transform of
+// tx_size) is compared against the "split" candidate (recursion into
+// sub-blocks); the winner's selection is stored in xd->mi[0] and its RD stats
+// in rd_stats. *is_cost_valid is cleared when no candidate fits within
+// ref_best_rd.
+static AOM_INLINE void select_tx_block(
+    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+    RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
+  assert(tx_size < TX_SIZES_ALL);
+  av1_init_rd_stats(rd_stats);
+  if (ref_best_rd < 0) {
+    *is_cost_valid = 0;
+    return;
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(blk_row < max_block_high(xd, plane_bsize, 0) &&
+         blk_col < max_block_wide(xd, plane_bsize, 0));
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblock_plane *const luma = &x->plane[0];
+  TXFM_CONTEXT *const above_ctx = tx_above + blk_col;
+  TXFM_CONTEXT *const left_ctx = tx_left + blk_row;
+  const int ctx =
+      txfm_partition_context(above_ctx, left_ctx, mbmi->bsize, tx_size);
+
+  // The unsplit transform is only a candidate if its size class and shape are
+  // enabled by the encoder configuration.
+  const int size_enabled = cpi->oxcf.txfm_cfg.enable_tx64 ||
+                           txsize_sqr_up_map[tx_size] != TX_64X64;
+  const int shape_enabled = cpi->oxcf.txfm_cfg.enable_rect_tx ||
+                            tx_size_wide[tx_size] == tx_size_high[tx_size];
+  int try_no_split = size_enabled && shape_enabled;
+  int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+  TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+  // Prune split / no-split based on sub-block residual statistics.
+  if (cpi->sf.tx_sf.prune_tx_size_level > 0 && try_split && try_no_split &&
+      tx_size != TX_4X4) {
+    prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
+                            &try_no_split, &try_split,
+                            cpi->sf.tx_sf.prune_tx_size_level);
+  }
+
+  if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition &&
+      x->try_merge_partition && try_split && luma->eobs[block]) {
+    try_no_split = 0;
+  }
+
+  // Candidate 1: the current block as one transform.
+  if (try_no_split) {
+    try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                          plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+                          ftxs_mode, &no_split);
+
+    // Speed features for early termination.
+    const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
+    if (search_level) {
+      const int64_t abort_rd =
+          no_split.rd - (no_split.rd >> (1 + search_level));
+      if (abort_rd > ref_best_rd) {
+        *is_cost_valid = 0;
+        return;
+      }
+      const int64_t no_split_gate_rd =
+          no_split.rd - (no_split.rd >> (2 + search_level));
+      if (no_split_gate_rd > prev_level_rd) try_split = 0;
+    }
+    if (cpi->sf.tx_sf.txb_split_cap && luma->eobs[block] == 0) try_split = 0;
+  }
+
+  // ML based speed feature to skip the split search. Only used for 8-bit
+  // content and when at least one of the two RD references is known.
+  const int has_rd_reference =
+      !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX);
+  if (x->e_mbd.bd == 8 && try_split && has_rd_reference) {
+    const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh;
+    if (threshold >= 0 &&
+        ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size) <
+            -threshold) {
+      try_split = 0;
+    }
+  }
+
+  // Candidate 2: split into smaller transform blocks.
+  RD_STATS split_rd_stats;
+  split_rd_stats.rdcost = INT64_MAX;
+  if (try_split) {
+    try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+                       plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+                       AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+                       &split_rd_stats);
+  }
+
+  // The split wins ties; its state was already committed by the recursion.
+  if (no_split.rd >= split_rd_stats.rdcost) {
+    *rd_stats = split_rd_stats;
+    if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
+    return;
+  }
+
+  // The unsplit transform wins: commit its contexts, sizes and tx type.
+  luma->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+  av1_set_txb_context(x, 0, block, tx_size, ta + blk_col, tl + blk_row);
+  txfm_partition_update(above_ctx, left_ctx, tx_size, tx_size);
+
+  const int units_high = tx_size_high_unit[tx_size];
+  const int units_wide = tx_size_wide_unit[tx_size];
+  for (int dy = 0; dy < units_high; ++dy) {
+    for (int dx = 0; dx < units_wide; ++dx) {
+      mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row + dy,
+                                                 blk_col + dx)] = tx_size;
+    }
+  }
+  mbmi->tx_size = tx_size;
+  update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
+
+  const int mi_cols = mi_size_wide[plane_bsize];
+  set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * mi_cols + blk_col,
+               rd_stats->skip_txfm);
+}
+
+// Uses the largest transform size permitted for block size bs (as given by the
+// current tx mode, then restricted by the enable_tx64 / enable_rect_tx encoder
+// settings) and computes the luma RD stats for it into rd_stats.
+static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
+                                              MACROBLOCK *x, RD_STATS *rd_stats,
+                                              int64_t ref_best_rd,
+                                              BLOCK_SIZE bs) {
+  // Remapping when 64-point transforms are disabled (rectangular allowed).
+  static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
+    TX_4X4,    // 4x4 transform
+    TX_8X8,    // 8x8 transform
+    TX_16X16,  // 16x16 transform
+    TX_32X32,  // 32x32 transform
+    TX_32X32,  // 64x64 transform
+    TX_4X8,    // 4x8 transform
+    TX_8X4,    // 8x4 transform
+    TX_8X16,   // 8x16 transform
+    TX_16X8,   // 16x8 transform
+    TX_16X32,  // 16x32 transform
+    TX_32X16,  // 32x16 transform
+    TX_32X32,  // 32x64 transform
+    TX_32X32,  // 64x32 transform
+    TX_4X16,   // 4x16 transform
+    TX_16X4,   // 16x4 transform
+    TX_8X32,   // 8x32 transform
+    TX_32X8,   // 32x8 transform
+    TX_16X32,  // 16x64 transform
+    TX_32X16,  // 64x16 transform
+  };
+  // Remapping when rectangular transforms are disabled (64-point allowed).
+  static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = {
+    TX_4X4,    // 4x4 transform
+    TX_8X8,    // 8x8 transform
+    TX_16X16,  // 16x16 transform
+    TX_32X32,  // 32x32 transform
+    TX_64X64,  // 64x64 transform
+    TX_4X4,    // 4x8 transform
+    TX_4X4,    // 8x4 transform
+    TX_8X8,    // 8x16 transform
+    TX_8X8,    // 16x8 transform
+    TX_16X16,  // 16x32 transform
+    TX_16X16,  // 32x16 transform
+    TX_32X32,  // 32x64 transform
+    TX_32X32,  // 64x32 transform
+    TX_4X4,    // 4x16 transform
+    TX_4X4,    // 16x4 transform
+    TX_8X8,    // 8x32 transform
+    TX_8X8,    // 32x8 transform
+    TX_16X16,  // 16x64 transform
+    TX_16X16,  // 64x16 transform
+  };
+  // Remapping when both 64-point and rectangular transforms are disabled.
+  static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = {
+    TX_4X4,    // 4x4 transform
+    TX_8X8,    // 8x8 transform
+    TX_16X16,  // 16x16 transform
+    TX_32X32,  // 32x32 transform
+    TX_32X32,  // 64x64 transform
+    TX_4X4,    // 4x8 transform
+    TX_4X4,    // 8x4 transform
+    TX_8X8,    // 8x16 transform
+    TX_8X8,    // 16x8 transform
+    TX_16X16,  // 16x32 transform
+    TX_16X16,  // 32x16 transform
+    TX_32X32,  // 32x64 transform
+    TX_32X32,  // 64x32 transform
+    TX_4X4,    // 4x16 transform
+    TX_4X4,    // 16x4 transform
+    TX_8X8,    // 8x32 transform
+    TX_8X8,    // 32x8 transform
+    TX_16X16,  // 16x64 transform
+    TX_16X16,  // 64x16 transform
+  };
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+
+  // If a transform class is disabled, go down to the next available size.
+  const int tx64_on = cpi->oxcf.txfm_cfg.enable_tx64;
+  const int rect_on = cpi->oxcf.txfm_cfg.enable_rect_tx;
+  const TX_SIZE *remap = NULL;
+  if (!tx64_on && rect_on) {
+    remap = tx_size_max_32;
+  } else if (tx64_on && !rect_on) {
+    remap = tx_size_max_square;
+  } else if (!tx64_on && !rect_on) {
+    remap = tx_size_max_32_square;
+  }
+  if (remap != NULL) mbmi->tx_size = remap[mbmi->tx_size];
+
+  // Lower bound on the RD cost contributed by the skip flag. The skip RD cost
+  // is only considered for inter blocks.
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int64_t no_skip_txfm_rd =
+      RDCOST(x->rdmult, x->mode_costs.skip_txfm_cost[skip_ctx][0], 0);
+  int64_t skip_txfm_rd = INT64_MAX;
+  if (is_inter_block(mbmi)) {
+    skip_txfm_rd =
+        RDCOST(x->rdmult, x->mode_costs.skip_txfm_cost[skip_ctx][1], 0);
+  }
+
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+                       AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+                       mbmi->tx_size, FTXS_NONE, /*skip_trellis=*/0);
+}
+
+// Forces the transform size of the current block to TX_4X4 and computes the
+// RD stats for plane 0 into rd_stats.
+static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
+                                               MACROBLOCK *x,
+                                               RD_STATS *rd_stats,
+                                               int64_t ref_best_rd,
+                                               BLOCK_SIZE bs) {
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  mbmi->tx_size = TX_4X4;
+
+  // TODO(any) : Pass this_rd based on skip/non-skip cost
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, TX_4X4,
+                       FTXS_NONE, /*skip_trellis=*/0);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Runs the 8x8 intra transform-split neural network on the luma residual of
+// the current block and, depending on the score, records in
+// x->txfm_search_params.nn_prune_depths_for_intra_tx whether the split depths
+// (TX_PRUNE_SPLIT) or the largest transform (TX_PRUNE_LARGEST) should be
+// pruned. Scores between the two thresholds leave the field untouched.
+static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row,
+                                            int blk_col, BLOCK_SIZE bsize,
+                                            TX_SIZE tx_size) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // The NN based pruning is not applied in the following cases:
+  // 1) Lossless coding, as only the 4x4 transform is evaluated there.
+  if (xd->lossless[mbmi->segment_id]) return;
+  // 2) Transform and block sizes differ, as the features are obtained over
+  //    the current block.
+  if (txsize_to_bsize[tx_size] != bsize) return;
+  // 3) Bit depth other than 8, as the input features are not scaled
+  //    according to bit depth.
+  if (xd->bd != 8) return;
+  // 4) Largest transform size other than 8x8, the only size with a model.
+  if (tx_size != TX_8X8) return;
+
+  // The model is a sequential neural net trained with an SGD optimizer.
+  // Possible future improvements in speed/quality:
+  // 1) Train with balanced data for different learning rates and optimizers.
+  // 2) Add features related to the statistics of top and left pixels (to
+  //    capture the accuracy of reconstructed neighbouring pixels for the 4x4
+  //    blocks numbered 1, 2, 3 in the 8x8 block), source variance of 4x4
+  //    sub-blocks, etc.
+  // 3) Generate models for transform blocks other than 8x8.
+  float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f };
+
+  // Residual based features.
+  const int residual_stride = block_size_wide[bsize];
+  const int16_t *const residual = x->plane[0].src_diff +
+                                  MI_SIZE * blk_row * residual_stride +
+                                  MI_SIZE * blk_col;
+  int num_features =
+      get_mean_dev_features(residual, residual_stride, tx_size_wide[tx_size],
+                            tx_size_high[tx_size], features);
+
+  // Source variance feature.
+  features[num_features++] = log1pf((float)x->source_variance);
+
+  // Quantizer feature.
+  const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+  features[num_features++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+  assert(num_features == NUM_INTRA_TX_SPLIT_FEATURES);
+
+  // Normalize every feature with the model's mean and standard deviation.
+  for (int f = 0; f < NUM_INTRA_TX_SPLIT_FEATURES; f++) {
+    features[f] = (features[f] - av1_intra_tx_split_8x8_mean[f]) /
+                  av1_intra_tx_split_8x8_std[f];
+  }
+
+  float score;
+  av1_nn_predict(features, &av1_intra_tx_split_nnconfig_8x8, 1, &score);
+
+  const float *const prune_thresh = av1_intra_tx_prune_nn_thresh_8x8;
+  TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+  if (score <= prune_thresh[0]) {
+    txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT;
+  } else if (score > prune_thresh[1]) {
+    txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST;
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Search for the best uniform transform size and type for current coding block.
+//
+// Evaluates a chain of transform sizes (start_tx, then successively
+// sub_tx_size_map[...]) for depths init_depth..MAX_TX_DEPTH using
+// av1_uniform_txfm_yrd() and keeps the size with the lowest RD cost. When a
+// candidate is found, mbmi->tx_size, xd->tx_type_map and txfm_info->blk_skip
+// are set to the winner and *rd_stats holds its stats; otherwise *rd_stats is
+// left as initialized by av1_invalid_rd_stats().
+static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ av1_invalid_rd_stats(rd_stats);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
+ int start_tx;
+ // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls
+ // how many times of splitting is allowed during the RD search.
+ int init_depth;
+
+ if (tx_select) {
+ start_tx = max_rect_tx_size;
+ init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+ is_inter_block(mbmi), &cpi->sf,
+ txfm_params->tx_size_search_method);
+ // Only a single size will be searched (init_depth == MAX_TX_DEPTH). If that
+ // size is a disabled 64-point transform, step down one size so the loop
+ // below still has a candidate to evaluate.
+ if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[start_tx] == TX_64X64) {
+ start_tx = sub_tx_size_map[start_tx];
+ }
+ } else {
+ // Transform size is not selectable: evaluate only the size implied by the
+ // current tx mode.
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+ start_tx = chosen_tx_size;
+ init_depth = MAX_TX_DEPTH;
+ }
+
+ const int skip_trellis = 0;
+ // Snapshots of the per-block tx type map and skip flags of the best size so
+ // far; later candidates overwrite the live copies during their search.
+ uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_SIZE best_tx_size = max_rect_tx_size;
+ int64_t best_rd = INT64_MAX;
+ const int num_blks = bsize_to_num_blk(bs);
+ x->rd_model = FULL_TXFM_RD;
+ // RD cost obtained at each depth; INT64_MAX marks "not evaluated/invalid".
+ int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
+ depth++, tx_size = sub_tx_size_map[tx_size]) {
+ // Skip sizes disabled by the encoder configuration. Note that the skipped
+ // size still consumes one depth level.
+ if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[tx_size] == TX_64X64) ||
+ (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+ tx_size_wide[tx_size] != tx_size_high[tx_size])) {
+ continue;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // The NN classifier (run while evaluating an earlier size) decided that
+ // smaller transform sizes are not worth searching.
+ if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break;
+
+ // Set the flag to enable the evaluation of NN classifier to prune transform
+ // depths. As the features are based on intra residual information of
+ // largest transform, the evaluation of NN model is enabled only for this
+ // case.
+ txfm_params->enable_nn_prune_intra_tx_depths =
+ (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx);
+#endif
+
+ RD_STATS this_rd_stats;
+ // When the speed feature use_rd_based_breakout_for_intra_tx_search is
+ // enabled, use the known minimum best_rd for early termination.
+ const int64_t rd_thresh =
+ cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+ ? AOMMIN(ref_best_rd, best_rd)
+ : ref_best_rd;
+ rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs,
+ tx_size, FTXS_NONE, skip_trellis);
+ // New best: remember the size, its stats and the per-block side info.
+ if (rd[depth] < best_rd) {
+ av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
+ av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
+ best_tx_size = tx_size;
+ best_rd = rd[depth];
+ *rd_stats = this_rd_stats;
+ }
+ // TX_4X4 cannot be split any further.
+ if (tx_size == TX_4X4) break;
+ // If we are searching three depths, prune the smallest size depending
+ // on rd results for the first two depths for low contrast blocks.
+ if (depth > init_depth && depth != MAX_TX_DEPTH &&
+ x->source_variance < 256) {
+ if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+ }
+ }
+
+ // Restore the winner's transform size and per-block side info, which may
+ // have been overwritten while evaluating later (worse) candidates.
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
+ av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Reset the flags to avoid any unintentional evaluation of NN model and
+ // consumption of prune depths.
+ txfm_params->enable_nn_prune_intra_tx_depths = false;
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE;
+#endif
+}
+
+// Per-transform-block callback used while walking a plane: searches the best
+// transform type for the block at (blk_row, blk_col), then folds its rate,
+// distortion and RD cost into the rdcost_block_args passed through `arg`.
+// Once the running cost exceeds the budget, exit_early is raised; any later
+// invocation only records incomplete_exit and returns.
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
+                                     int blk_col, BLOCK_SIZE plane_bsize,
+                                     TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *const ctx = arg;
+  if (ctx->exit_early) {
+    // A previous block already exceeded the budget; this one is not searched.
+    ctx->incomplete_exit = 1;
+    return;
+  }
+
+  const AV1_COMP *const cpi = ctx->cpi;
+  MACROBLOCK *const x = ctx->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int inter = is_inter_block(xd->mi[0]);
+  ENTROPY_CONTEXT *const above = ctx->t_above + blk_col;
+  ENTROPY_CONTEXT *const left = ctx->t_left + blk_row;
+  RD_STATS blk_stats;
+  av1_init_rd_stats(&blk_stats);
+
+  if (!inter) {
+    // Intra blocks are predicted and differenced one transform block at a
+    // time.
+    av1_predict_intra_block_facade(&cpi->common, xd, plane, blk_col, blk_row,
+                                   tx_size);
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+#if !CONFIG_REALTIME_ONLY
+    const TxfmSearchParams *const search_params = &x->txfm_search_params;
+    if (search_params->enable_nn_prune_intra_tx_depths) {
+      ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize,
+                                      tx_size);
+      // The classifier rejected the largest transform size: abandon it.
+      if (search_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) {
+        av1_invalid_rd_stats(&ctx->rd_stats);
+        ctx->exit_early = 1;
+        return;
+      }
+    }
+#endif
+  }
+
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, above, left, &txb_ctx);
+  const int64_t remaining_rd = ctx->best_rd - ctx->current_rd;
+  search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                 &txb_ctx, ctx->ftxs_mode, ctx->skip_trellis, remaining_rd,
+                 &blk_stats);
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+    assert(!is_inter || plane_bsize < BLOCK_8X8);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+  }
+
+#if CONFIG_RD_DEBUG
+  update_txb_coeff_cost(&blk_stats, plane, blk_stats.rate);
+#endif  // CONFIG_RD_DEBUG
+  av1_set_txb_context(x, plane, block, tx_size, above, left);
+
+  // Record the per-block skip flag; only luma blocks with no coefficients
+  // are flagged as skipped.
+  const int mi_cols = block_size_wide[plane_bsize] >> MI_SIZE_LOG2;
+  const int blk_idx = blk_row * mi_cols + blk_col;
+  const int blk_is_skipped =
+      (plane == 0) ? (x->plane[plane].eobs[block] == 0) : 0;
+  set_blk_skip(x->txfm_search_info.blk_skip, plane, blk_idx, blk_is_skipped);
+
+  int64_t blk_rd = RDCOST(x->rdmult, blk_stats.rate, blk_stats.dist);
+  if (inter) {
+    // Inter blocks may be cheaper when coded as skipped.
+    const int64_t skipped_rd = RDCOST(x->rdmult, 0, blk_stats.sse);
+    if (skipped_rd < blk_rd) blk_rd = skipped_rd;
+    blk_stats.skip_txfm &= !x->plane[plane].eobs[block];
+  } else {
+    // Intra blocks are always signalled as non-skip.
+    blk_stats.skip_txfm = 0;
+  }
+
+  av1_merge_rd_stats(&ctx->rd_stats, &blk_stats);
+
+  ctx->current_rd += blk_rd;
+  if (ctx->current_rd > ctx->best_rd) ctx->exit_early = 1;
+}
+
+// Estimates the luma RD cost of coding block `bs` with the single transform
+// size `tx_size`. Unlike av1_uniform_txfm_yrd(), no transform type search is
+// performed: every transform block uses DCT_DCT and its distortion is measured
+// by dist_block_tx_domain(). Sets mbmi->tx_size, fills *rd_stats and returns
+// the RD cost, or INT64_MAX (with *rd_stats invalidated) when the search was
+// abandoned because the running cost exceeded ref_best_rd.
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize);
+ // Rate for signalling the transform size; zero when it is not signalled.
+ int tx_size_rate = 0;
+ if (tx_select) {
+ const int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+ tx_size_rate = mode_costs->txfm_partition_cost[ctx][0];
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ // RD cost of the header bits alone for the skip and non-skip alternatives.
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0);
+ const int64_t no_this_rd =
+ RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+ mbmi->tx_size = tx_size;
+
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ // Increment of the block index `i` per transform block.
+ const int step = txw_unit * txh_unit;
+ const int max_blocks_wide = max_block_wide(xd, bs, 0);
+ const int max_blocks_high = max_block_high(xd, bs, 0);
+
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ // Start the running cost from the cheaper of the two header alternatives.
+ args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd);
+ av1_init_rd_stats(&args.rd_stats);
+ av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left);
+ int i = 0;
+ for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit;
+ blk_row += txh_unit) {
+ for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ // The budget was exceeded on an earlier block: mark the search as
+ // incomplete so the accumulated stats get invalidated below.
+ // NOTE(review): if exit_early is raised while processing the final row,
+ // no further iteration observes it, incomplete_exit stays 0 and the
+ // partially accumulated stats are returned as valid - confirm intended.
+ if (args.exit_early) {
+ args.incomplete_exit = 1;
+ break;
+ }
+
+ ENTROPY_CONTEXT *a = args.t_above + blk_col;
+ ENTROPY_CONTEXT *l = args.t_left + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx);
+
+ // Forward transform and quantize with a fixed DCT_DCT transform type.
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
+
+ av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
+ av1_quant(x, 0, i, &txfm_param, &quant_param);
+
+ this_rd_stats.rate =
+ cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+
+ const int64_t no_skip_txfm_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+ this_rd_stats.skip_txfm &= !x->plane[0].eobs[i];
+
+ av1_merge_rd_stats(&args.rd_stats, &this_rd_stats);
+ args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd);
+
+ if (args.current_rd > ref_best_rd) {
+ args.exit_early = 1;
+ break;
+ }
+
+ av1_set_txb_context(x, 0, i, tx_size, a, l);
+ i += step;
+ }
+ }
+
+ if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
+
+ *rd_stats = args.rd_stats;
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ int64_t rd;
+ // rdstats->rate should include all the rate except skip/non-skip cost as the
+ // same is accounted in the caller functions after rd evaluation of all
+ // planes. However the decisions should be done after considering the
+ // skip/non-skip header cost
+ if (rd_stats->skip_txfm && is_inter) {
+ rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ } else {
+ // Intra blocks are always signalled as non-skip
+ rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+ rd_stats->dist);
+ rd_stats->rate += tx_size_rate;
+ }
+ // Check if forcing the block to skip transform leads to smaller RD cost.
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ int64_t temp_skip_txfm_rd =
+ RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ if (temp_skip_txfm_rd <= rd) {
+ rd = temp_skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+ }
+
+ return rd;
+}
+
+// Computes the luma RD cost of coding block `bs` with the single transform
+// size `tx_size`, searching transform types per transform block through
+// av1_txfm_rd_in_plane(). Sets mbmi->tx_size and fills *rd_stats. Returns the
+// RD cost including the skip/non-skip and transform size header bits, or
+// INT64_MAX when no valid result was produced within ref_best_rd.
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, int64_t ref_best_rd,
+                             BLOCK_SIZE bs, TX_SIZE tx_size,
+                             FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const ModeCosts *const costs = &x->mode_costs;
+  const int inter = is_inter_block(mbmi);
+
+  // Rate for signalling the transform size; stays zero when not signalled.
+  int txs_rate = 0;
+  if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
+      block_signals_txsize(mbmi->bsize)) {
+    const int ctx = txfm_partition_context(
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+    if (inter) {
+      txs_rate = costs->txfm_partition_cost[ctx][0];
+    } else {
+      txs_rate = tx_size_cost(x, bs, tx_size);
+    }
+  }
+
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int rate_no_skip = costs->skip_txfm_cost[skip_ctx][0];
+  const int rate_skip = costs->skip_txfm_cost[skip_ctx][1];
+
+  // Smallest possible header cost: the skip alternative only exists for
+  // inter blocks.
+  int64_t header_rd = RDCOST(x->rdmult, rate_no_skip + txs_rate, 0);
+  if (inter) {
+    const int64_t skip_header_rd = RDCOST(x->rdmult, rate_skip, 0);
+    header_rd = AOMMIN(header_rd, skip_header_rd);
+  }
+
+  mbmi->tx_size = tx_size;
+  av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, header_rd, AOM_PLANE_Y,
+                       bs, tx_size, ftxs_mode, skip_trellis);
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  // rd_stats->rate must exclude the skip/non-skip flag cost (the caller adds
+  // it after all planes are evaluated), but the decision made here has to
+  // take that header cost into account.
+  if (inter && rd_stats->skip_txfm) {
+    return RDCOST(x->rdmult, rate_skip, rd_stats->sse);
+  }
+
+  // Non-skip path; intra blocks are always signalled as non-skip.
+  int64_t rd = RDCOST(x->rdmult, rd_stats->rate + rate_no_skip + txs_rate,
+                      rd_stats->dist);
+  rd_stats->rate += txs_rate;
+
+  // For lossy inter blocks, check whether forcing skip is at least as cheap.
+  if (inter && !xd->lossless[mbmi->segment_id]) {
+    const int64_t forced_skip_rd =
+        RDCOST(x->rdmult, rate_skip, rd_stats->sse);
+    if (forced_skip_rd <= rd) {
+      rd = forced_skip_rd;
+      rd_stats->rate = 0;
+      rd_stats->dist = rd_stats->sse;
+      rd_stats->skip_txfm = 1;
+    }
+  }
+
+  return rd;
+}
+
+// Search for the best transform type for a luma inter-predicted block, given
+// the transform block partitions.
+// This function is used only when some speed features are enabled.
+//
+// Recursively walks the transform partition stored in mbmi->inter_tx_size:
+// when tx_size matches the stored size for (blk_row, blk_col) the transform
+// type is searched with tx_type_rd(); otherwise the block is split into
+// sub_tx_size_map[tx_size] pieces and each piece is visited. Results are
+// accumulated in *rd_stats, which is invalidated if any sub-block fails.
+// above_ctx/left_ctx and tx_above/tx_left are updated as leaves are coded.
+static AOM_INLINE void tx_block_yrd(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth,
+ ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd,
+ RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(is_inter_block(mbmi));
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ // Nothing to do for positions outside the limits returned by
+ // max_block_high()/max_block_wide().
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ // Transform size already decided for this position.
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->bsize, tx_size);
+
+ av1_init_rd_stats(rd_stats);
+ if (tx_size == plane_tx_size) {
+ // Leaf of the partition tree: search the transform type for this block.
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+ // Rate of signalling this transform block as all-zero.
+ const int zero_blk_rate =
+ x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd);
+ const int mi_width = mi_size_wide[plane_bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ // Code the block as all-zero when that is no worse in RD terms, or when
+ // the search already reported it as skipped.
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip_txfm == 1) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1);
+ x->plane[0].eobs[block] = 0;
+ x->plane[0].txb_entropy_ctx[block] = 0;
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ } else {
+ rd_stats->skip_txfm = 0;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0);
+ }
+ // Cost of signalling "no further split" ([ctx][0]) where a split flag is
+ // coded.
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0];
+ av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ } else {
+ // Interior node: recurse into each sub-block of the next smaller size.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int txb_width = tx_size_wide_unit[sub_txs];
+ const int txb_height = tx_size_high_unit[sub_txs];
+ const int step = txb_height * txb_width;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ RD_STATS pn_rd_stats;
+ // RD cost accumulated over the sub-blocks visited so far; it shrinks the
+ // budget handed to the remaining sub-blocks.
+ int64_t this_rd = 0;
+ assert(txb_width > 0 && txb_height > 0);
+
+ for (int row = 0; row < row_end; row += txb_height) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += txb_width) {
+ const int offsetc = blk_col + col;
+
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+ depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ // A failed sub-block invalidates the whole block.
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+ block += step;
+ }
+ }
+
+ // Cost of signalling the split ([ctx][1]) where a split flag is coded.
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
+ }
+}
+
+// Search for tx type with tx sizes already decided for an inter-predicted luma
+// partition block. It's used only when some speed features are enabled.
+//
+// The transform partition stored in the mode info is walked with
+// tx_block_yrd() using `ftxs_mode`, and the accumulated stats are written to
+// *rd_stats. If coding the whole block as skipped is cheaper, *rd_stats is
+// rewritten to describe the skip decision.
+//
+// Return value 0: early termination triggered, no valid rd cost available
+//                 (*rd_stats is invalidated);
+//              1: rd cost values are valid, i.e. the final RD cost does not
+//                 exceed ref_best_rd.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  if (ref_best_rd < 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int init_depth = get_search_init_depth(
+      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+  // Work on local copies of the contexts so the search does not disturb the
+  // ones stored in xd.
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  // Running RD cost, used to shrink the budget of the remaining sub-blocks.
+  int64_t this_rd = 0;
+  for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+                   ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+                   &pn_rd_stats, ftxs_mode);
+      if (pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return 0;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      this_rd +=
+          AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                 RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+      block += step;
+    }
+  }
+
+  // Decide between coding the residue and skipping the whole block, now that
+  // the skip flag cost can be taken into account.
+  const int skip_ctx = av1_get_skip_txfm_context(xd);
+  const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+  const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+  const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+  this_rd =
+      RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+  if (skip_txfm_rd < this_rd) {
+    this_rd = skip_txfm_rd;
+    rd_stats->rate = 0;
+    rd_stats->dist = rd_stats->sse;
+    rd_stats->skip_txfm = 1;
+  }
+
+  // The result is usable only if it stays within the reference budget.
+  // (The comparison used to be inverted, which reported over-budget results
+  // as valid and discarded every result that met the budget.)
+  const int is_cost_valid = this_rd <= ref_best_rd;
+  if (!is_cost_valid) {
+    // reset cost value
+    av1_invalid_rd_stats(rd_stats);
+  }
+  return is_cost_valid;
+}
+
+// Search for the best transform size and type for current inter-predicted
+// luma block with recursive transform block partitioning. The obtained
+// transform selection will be saved in xd->mi[0], the corresponding RD stats
+// will be saved in rd_stats. The returned value is the corresponding RD cost.
+// INT64_MAX is returned (with rd_stats invalidated) when ref_best_rd is 0 or
+// when no candidate within the RD threshold is found.
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ assert(is_inter_block(xd->mi[0]));
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
+ int64_t rd_thresh = ref_best_rd;
+ if (rd_thresh == 0) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ // With the fast search, relax the threshold by 1/8 (unless that would
+ // overflow int64_t).
+ if (fast_tx_search && rd_thresh < INT64_MAX) {
+ if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+ }
+ assert(rd_thresh > 0);
+ const FAST_TX_SEARCH_MODE ftxs_mode =
+ fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ // Local copies of the entropy and transform partition contexts, updated by
+ // select_tx_block() as the search proceeds.
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int init_depth = get_search_init_depth(
+ mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ // Running RD costs of the skip and non-skip alternatives; initially only the
+ // flag costs, refreshed after each sub-block is merged.
+ int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+ int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
+ int block = 0;
+
+ av1_init_rd_stats(rd_stats);
+ for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
+ for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
+ // Budget left for this sub-block: the threshold minus the cheaper of the
+ // two running alternatives.
+ const int64_t best_rd_sofar =
+ (rd_thresh == INT64_MAX)
+ ? INT64_MAX
+ : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
+ int is_cost_valid = 1;
+ RD_STATS pn_rd_stats;
+ // Search for the best transform block size and type for the sub-block.
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
+ ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
+ best_rd_sofar, &is_cost_valid, ftxs_mode);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ no_skip_txfm_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ block += step;
+ }
+ }
+
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
+
+ // If fast_tx_search is true, only DCT and 1D DCT were tested in
+ // select_tx_block() above. Do a better search for tx type with
+ // tx sizes already decided.
+ if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) {
+ if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+ return INT64_MAX;
+ }
+
+ int64_t final_rd;
+ if (rd_stats->skip_txfm) {
+ final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ } else {
+ final_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ // Skipping the residue is only an option when the segment is not lossless.
+ if (!xd->lossless[xd->mi[0]->segment_id]) {
+ final_rd =
+ AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse));
+ }
+ }
+
+ return final_rd;
+}
+
+// Return 1 to terminate transform search early. The decision is made based on
+// the comparison with the reference RD cost and the model-estimated RD cost.
+//
+// The speed feature level (1 or 2) selects the scaling factor, in units of
+// 1/8, applied to the modelled RD cost before it is compared to ref_best_rd.
+// Returns 0 (do not prune) when the model predicts a skip block or when the
+// level is outside the supported range.
+static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
+                                                  MACROBLOCK *x,
+                                                  BLOCK_SIZE bsize,
+                                                  int64_t ref_best_rd) {
+  static const int prune_factor_by8[] = { 3, 5 };
+  const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level;
+  // The table is indexed with level - 1, so level 0 (feature disabled) is not
+  // a valid input here; callers are expected to check it beforehand.
+  assert(level >= 1 && level <= 2);
+  // Never read outside the table, even when asserts are compiled out.
+  if (level < 1 || level > 2) return 0;
+
+  int model_rate;
+  int64_t model_dist;
+  uint8_t model_skip;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+      cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL,
+      NULL, NULL, NULL);
+  if (model_skip) return 0;
+  const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+  // TODO(debargha, urvang): Improve the model and make the check below
+  // tighter.
+  const int factor = prune_factor_by8[level - 1];
+  return ((model_rd * factor) >> 3) > ref_best_rd;
+}
+
+// Picks the transform sizes (with recursive partitioning) and types for the
+// luma plane of an inter block and stores the resulting stats in *rd_stats.
+// *rd_stats is left invalid when the search is pruned or finds no candidate
+// within ref_best_rd. Several shortcuts are tried before the full search:
+// model-based pruning, reuse of hashed results, and skip prediction.
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                         RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                         int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+
+  av1_invalid_rd_stats(rd_stats);
+
+  // Shortcut 1: give up if the modelled RD cost is far above the best so far.
+  if (cpi->sf.tx_sf.model_based_prune_tx_search_level &&
+      ref_best_rd != INT64_MAX &&
+      model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) {
+    return;
+  }
+
+  // Shortcut 2: residue-hash lookup. Only used for blocks lying strictly
+  // inside the tile limits checked below.
+  const int row_end = xd->mi_row + mi_size_high[bsize];
+  const int col_end = xd->mi_col + mi_size_wide[bsize];
+  const int inside_tile = xd->mi_row >= xd->tile.mi_row_start &&
+                          row_end < xd->tile.mi_row_end &&
+                          xd->mi_col >= xd->tile.mi_col_start &&
+                          col_end < xd->tile.mi_col_end;
+  const int use_hash = (inside_tile && cpi->sf.rd_sf.use_mb_rd_hash);
+  const int num_blks = bsize_to_num_blk(bsize);
+  uint32_t residue_hash = 0;
+  MB_RD_RECORD *record = NULL;
+  if (use_hash) {
+    residue_hash = get_block_residue_hash(x, bsize);
+    record = x->txfm_search_info.mb_rd_record;
+    const int hit = find_mb_rd_info(record, ref_best_rd, residue_hash);
+    if (hit != -1) {
+      // Reuse the results of an earlier search on the same residue.
+      MB_RD_INFO *const cached = &record->mb_rd_info[hit];
+      fetch_mb_rd_info(num_blks, cached, rd_stats, x);
+      return;
+    }
+  }
+
+  // Shortcut 3: if skip is predicted to be the best RD decision, set up the
+  // skip result, remember it in the hash record and stop.
+  int64_t skip_dist;
+  if (x->txfm_search_params.skip_txfm_level &&
+      predict_skip_txfm(x, bsize, &skip_dist,
+                        cpi->common.features.reduced_tx_set_used)) {
+    set_skip_txfm(x, rd_stats, bsize, skip_dist);
+    if (use_hash) {
+      save_mb_rd_info(num_blks, residue_hash, x, rd_stats, record);
+    }
+    return;
+  }
+#if CONFIG_SPEED_STATS
+  ++x->txfm_search_info.tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  // Full recursive search.
+  if (select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd) ==
+      INT64_MAX) {
+    // A candidate is always found unless a finite ref_best_rd ruled all of
+    // them out.
+    assert(ref_best_rd != INT64_MAX);
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  // Store the search results for possible reuse.
+  if (use_hash) {
+    assert(mb_rd_record != NULL);
+    save_mb_rd_info(num_blks, residue_hash, x, rd_stats, record);
+  }
+}
+
+// Picks one transform size (applied uniformly to the whole block) plus the
+// transform types for the luma plane and stores the resulting stats in
+// *rd_stats. For inter blocks, hashed results of an identical residue are
+// reused when available and skip prediction may end the search early.
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       RD_STATS *rd_stats, BLOCK_SIZE bs,
+                                       int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(bs == mbmi->bsize);
+  const int inter = is_inter_block(mbmi);
+  const int num_blks = bsize_to_num_blk(bs);
+
+  av1_init_rd_stats(rd_stats);
+
+  // Residue-hash lookup: inter blocks only, and only for blocks lying
+  // strictly inside the tile limits checked below. `record` stays NULL
+  // whenever hashing is not in use for this block.
+  const int row_end = xd->mi_row + mi_size_high[bs];
+  const int col_end = xd->mi_col + mi_size_wide[bs];
+  const int inside_tile = xd->mi_row >= xd->tile.mi_row_start &&
+                          row_end < xd->tile.mi_row_end &&
+                          xd->mi_col >= xd->tile.mi_col_start &&
+                          col_end < xd->tile.mi_col_end;
+  uint32_t residue_hash = 0;
+  MB_RD_RECORD *record = NULL;
+  if (inter && cpi->sf.rd_sf.use_mb_rd_hash && inside_tile) {
+    residue_hash = get_block_residue_hash(x, bs);
+    record = x->txfm_search_info.mb_rd_record;
+    const int hit = find_mb_rd_info(record, ref_best_rd, residue_hash);
+    if (hit != -1) {
+      // Reuse the results of an earlier search on the same residue.
+      MB_RD_INFO *const cached = &record->mb_rd_info[hit];
+      fetch_mb_rd_info(num_blks, cached, rd_stats, x);
+      return;
+    }
+  }
+
+  // For lossy inter blocks: if skip is predicted to be the best RD decision,
+  // populate the stats accordingly and stop.
+  int64_t skip_dist;
+  if (x->txfm_search_params.skip_txfm_level && inter &&
+      !xd->lossless[mbmi->segment_id] &&
+      predict_skip_txfm(x, bs, &skip_dist,
+                        cpi->common.features.reduced_tx_set_used)) {
+    set_skip_txfm(x, rd_stats, bs, skip_dist);
+    if (record != NULL) {
+      save_mb_rd_info(num_blks, residue_hash, x, rd_stats, record);
+    }
+    return;
+  }
+
+  if (xd->lossless[mbmi->segment_id]) {
+    // Lossless mode can only pick the smallest (4x4) transform size.
+    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else if (x->txfm_search_params.tx_size_search_method == USE_LARGESTALL) {
+    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+  } else {
+    choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+  }
+
+  // Store the search results for possible reuse in future.
+  if (record != NULL) {
+    save_mb_rd_info(num_blks, residue_hash, x, rd_stats, record);
+  }
+}
+
+// Computes the RD stats of both chroma planes using the transform size
+// returned by av1_get_tx_size() for the U plane. Returns 1 with *rd_stats
+// filled in on success (including the case where the block carries no chroma
+// information), and 0 when ref_best_rd is negative or the chroma cost exceeds
+// it; on failure after the search has started *rd_stats is invalidated.
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+                  BLOCK_SIZE bsize, int64_t ref_best_rd) {
+  av1_init_rd_stats(rd_stats);
+  if (ref_best_rd < 0) return 0;
+  if (!x->e_mbd.is_chroma_ref) return 1;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+  const int inter = is_inter_block(mbmi);
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+  // Inter residues are computed for the whole plane up front.
+  if (inter) {
+    for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+      av1_subtract_plane(x, plane_bsize, plane);
+    }
+  }
+
+  // For inter blocks, a refined ref_best_rd is used for early exit.
+  // For intra blocks, even though the current rd crosses ref_best_rd, early
+  // exit is not recommended as the current rd is used for gating subsequent
+  // modes as well (say, for angular modes).
+  // TODO(any): Extend the early exit mechanism for intra modes as well
+  const int refine_budget =
+      cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && inter &&
+      ref_best_rd != INT64_MAX;
+
+  const int skip_trellis = 0;
+  const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+  // Running costs of coding the residue vs. skipping it, over the planes
+  // processed so far.
+  int64_t coded_rd = 0;
+  int64_t skipped_rd = 0;
+  for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    int64_t plane_budget = ref_best_rd;
+    if (refine_budget) {
+      plane_budget = ref_best_rd - AOMMIN(coded_rd, skipped_rd);
+    }
+
+    RD_STATS plane_stats;
+    av1_txfm_rd_in_plane(x, cpi, &plane_stats, plane_budget, 0, plane,
+                         plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis);
+    if (plane_stats.rate == INT_MAX) {
+      // reset cost value
+      av1_invalid_rd_stats(rd_stats);
+      return 0;
+    }
+
+    av1_merge_rd_stats(rd_stats, &plane_stats);
+    coded_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+    skipped_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+    if (AOMMIN(coded_rd, skipped_rd) > ref_best_rd) {
+      // reset cost value
+      av1_invalid_rd_stats(rd_stats);
+      return 0;
+    }
+  }
+
+  return 1;
+}
+
+// Computes the RD stats of one plane for the given transform size by running
+// block_rd_txfm() on every transform block of the plane. `current_rd` is the
+// RD cost already spent before this plane and `ref_best_rd` the budget.
+// *rd_stats is invalidated when the transform size is disabled, the budget is
+// already exceeded, or the per-block search terminated early.
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                          RD_STATS *rd_stats, int64_t ref_best_rd,
+                          int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+                          int skip_trellis) {
+  assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
+
+  // Reject 64-point transforms when disabled, and searches that start out
+  // over budget.
+  const int tx64_disabled = !cpi->oxcf.txfm_cfg.enable_tx64 &&
+                            txsize_sqr_up_map[tx_size] == TX_64X64;
+  if (tx64_disabled || current_rd > ref_best_rd) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct rdcost_block_args search;
+  av1_zero(search);
+  search.x = x;
+  search.cpi = cpi;
+  search.best_rd = ref_best_rd;
+  search.current_rd = current_rd;
+  search.ftxs_mode = ftxs_mode;
+  search.skip_trellis = skip_trellis;
+  av1_init_rd_stats(&search.rd_stats);
+
+  av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], search.t_above,
+                           search.t_left);
+  av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm,
+                                         &search);
+
+  // Inter blocks are only rejected when some transform block was left
+  // unsearched; intra blocks are rejected as soon as the early-exit flag is
+  // raised.
+  const int aborted = is_inter_block(xd->mi[0]) ? search.incomplete_exit
+                                                : search.exit_early;
+  if (aborted) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+  *rd_stats = search.rd_stats;
+}
+
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0],
+ x->mode_costs.skip_txfm_cost[skip_ctx][1] };  // [0]: rate of signalling non-skip, [1]: rate of signalling skip
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]);
+ // Account for minimum skip and non_skip rd.
+ // Eventually either one of them will be added to mode_rate
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);  // only rd_stats_y is invalidated on this early exit
+ return 0;
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;  // luma budget after paying mode_rate; INT64_MAX is passed through unchanged
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ rd_stats->rate = mode_rate;  // the combined stats start from the mode rate
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);  // NOTE(review): tile_data is neither a parameter nor a local of this function; confirm this configuration builds
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));  // uniform search: every inter_tx_size entry takes the chosen tx_size
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm);  // plane 0 entries all mirror the luma skip_txfm result
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;  // luma search produced no valid result
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ const int64_t non_skip_txfm_rdcosty =
+ RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist);
+ const int64_t skip_txfm_rdcosty =
+ RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse);
+ const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty);
+ if (min_rdcosty > ref_best_rd) return 0;  // even the cheaper luma alternative is over budget
+
+ av1_init_rd_stats(rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ int64_t ref_best_chroma_rd = ref_best_rd;
+ // Calculate best rd cost possible for chroma
+ if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
+ (ref_best_chroma_rd != INT64_MAX)) {
+ ref_best_chroma_rd = (ref_best_chroma_rd -
+ AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));  // chroma budget = total budget minus the cheaper luma cost
+ }
+ const int is_cost_valid_uv =
+ av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+ if (!is_cost_valid_uv) return 0;
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ }
+
+ int choose_skip_txfm = rd_stats->skip_txfm;
+ if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ const int64_t rdcost_no_skip_txfm = RDCOST(
+ x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0],
+ rd_stats->dist);
+ const int64_t rdcost_skip_txfm =
+ RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse);
+ if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1;  // ties go to skip
+ }
+ if (choose_skip_txfm) {
+ rd_stats_y->rate = 0;  // skipped residual: coefficient rates become 0 and distortion becomes the sse
+ rd_stats_uv->rate = 0;
+ rd_stats->rate = mode_rate + skip_txfm_cost[1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->dist = rd_stats_y->sse;
+ rd_stats_uv->dist = rd_stats_uv->sse;
+ mbmi->skip_txfm = 1;
+ if (rd_stats->skip_txfm) {  // budget is re-checked only when the merged stats were already flagged skip
+ const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) return 0;
+ }
+ } else {
+ rd_stats->rate += skip_txfm_cost[0];  // add the cost of signalling non-skip
+ mbmi->skip_txfm = 0;
+ }
+
+ return 1;
+}
diff --git a/third_party/aom/av1/encoder/tx_search.h b/third_party/aom/av1/encoder/tx_search.h
new file mode 100644
index 0000000000..ed95c1cd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set this macro as 1 to collect data about tx size selection.
+#define COLLECT_TX_SIZE_DATA 0
+
+#if COLLECT_TX_SIZE_DATA
+static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
+#endif
+
+enum {  // values are distinct bits (1 << n); FTXS_NONE is the empty set
+ FTXS_NONE = 0,
+ FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+ FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+ FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
+
+static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ TX_SIZE tx_size) {
+ assert(bsize == x->e_mbd.mi[0]->bsize);  // bsize must be the size of the current block
+ if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT ||
+ !block_signals_txsize(bsize))
+ return 0;  // cost is 0 unless the mode is TX_MODE_SELECT and block_signals_txsize(bsize) holds
+
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];  // table lookup by category, context and depth
+}
+
+/*!\brief Compute the pixel domain distortion.
+ *
+ * \ingroup transform_search
+ * Compute the pixel domain distortion from diff on all visible 4x4s in the
+ * transform block.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane Plane index
+ * \param[in] blk_row Block row index
+ * \param[in] blk_col Block col index
+ * \param[in] plane_bsize Current plane block size
+ * \param[in] tx_bsize Transform size
+ * \param[in] block_mse_q8 Block mse
+ * \return An int64_t value that is the block sse.
+ */
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8);
+
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size);
+
+/*!\brief Transform type search for luma macroblock with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and return the transform coefficients RD
+ * cost of current luma macroblock with the given uniform transform size.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] bs Size of the current macroblock
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ * \return An int64_t value that is the best RD cost found.
+ */
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for best transform size and type for luma inter blocks. The transform
+ * block partitioning can be recursive resulting in non-uniform transform sizes.
+ * The best transform size and type, if found, will be saved in the MB_MODE_INFO
+ * structure, and the corresponding RD stats will be saved in rd_stats.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd);
+
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for the current macroblock,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bs Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd);
+
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock.
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \return An integer value is returned. 0: early termination triggered,
+ no valid rd cost available; 1: rd cost values are valid.
+ */
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t ref_best_rd);
+
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] current_rd Current RD cost for this block so far
+ * \param[in] plane Plane index
+ * \param[in] plane_bsize Size of the current macroblock considering
+ sub-sampling
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ *
+ * \remark Nothing is returned. The RD results will be saved in rd_stats.
+ */
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis);
+
+/*!\brief Combined luma and chroma transform search.
+ *
+ * \ingroup transform_search
+ * This function combines y and uv planes' transform search processes together
+ * for inter-predicted blocks (including IntraBC), when the prediction is
+ * already generated. It first does subtraction to obtain the prediction error.
+ * Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] bsize Current macroblock size
+ * \param[in] rd_stats Pointer to struct to keep track of the overall RD
+ stats
+ * \param[in] rd_stats_y Pointer to struct to keep track of the RD
+ stats for the luma plane
+ * \param[in] rd_stats_uv Pointer to struct to keep track of the RD
+ stats for the chroma planes
+ * \param[in] mode_rate Rate cost to encode the prediction mode info. of
+ the current macroblock
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ *
+ * \return An integer value is returned indicating if a valid transform
+ candidate is found (1) or not (0).
+ */
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt.c b/third_party/aom/av1/encoder/txb_rdopt.c
new file mode 100644
index 0000000000..e551e8aa12
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/txb_rdopt_utils.h"
+
+#include "av1/common/idct.h"
+
+static INLINE void update_coeff_general(  // RD-compares the coefficient at scan index si against the same coefficient one level lower
+ int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift,
+ int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ const int ci = scan[si];  // coefficient index for scan position si
+ const tran_low_t qc = qcoeff[ci];
+ const int is_last = si == (eob - 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ is_last, si, bhl, width, levels, ci, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];  // zero coefficient: only the level-0 base cost is added
+ } else {
+ const int sign = (qc < 0) ? 1 : 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);  // distortion if this coefficient were reconstructed as 0
+ const int rate =
+ get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+ if (abs_qc == 1) {
+ abs_qc_low = qc_low = dqc_low = 0;  // lowering a magnitude-1 coefficient zeroes it
+ dist_low = dist0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
+ rate_low =
+ get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ }
+
+ rd_low = RDCOST(rdmult, rate_low, dist_low);
+ if (rd_low < rd) {  // the lower level is taken only when strictly cheaper
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ *accu_dist += dist_low - dist0;  // distortion is accumulated relative to dist0
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist - dist0;
+ }
+ }
+}
+
+static AOM_FORCE_INLINE void update_coeff_simple(  // rate-only variant: there is no accu_dist parameter, so distortion is not accumulated
+ int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ int bhl, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+ const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ (void)eob;  // eob is only read by the assert below
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(si != eob - 1);
+ assert(si > 0);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t abs_tqc = abs(tcoeff[ci]);
+ const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+ int rate_low = 0;
+ const int rate = get_two_coeff_cost_simple(
+ ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low);  // returns the rate for abs_qc and writes the lower-level rate to rate_low
+ if (abs_dqc < abs_tqc) {  // reconstruction is already below the source magnitude: the lower level is not tried
+ *accu_rate += rate;
+ return;
+ }
+
+ const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;  // dequantized magnitude of the lowered level
+ const int64_t dist_low =
+ get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+ if (rd_low < rd) {
+ const int sign = (qc < 0) ? 1 : 0;
+ qcoeff[ci] = (-sign ^ abs_qc_low) + sign;  // restores the sign: negates the magnitude when sign == 1, no-op when sign == 0
+ dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ } else {
+ *accu_rate += rate;
+ }
+ }
+}
+
+static AOM_FORCE_INLINE void update_coeff_eob(  // tries lowering the coefficient at si and, when sharpness == 0, making it the new last coefficient
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ assert(si != *eob - 1);  // the current last coefficient is not handled by this function
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ int lower_level = 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int sign = (qc < 0) ? 1 : 0;
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;  // distortion relative to reconstructing this coefficient as 0
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bhl, tx_class, levels);
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);  // candidate A: keep the level, on top of the running totals
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+
+ if (abs_qc == 1) {
+ abs_qc_low = 0;
+ dqc_low = qc_low = 0;
+ dist_low = 0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
+ rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);  // candidate B: one level lower, on top of the running totals
+ }
+
+ int lower_level_new_eob = 0;
+ const int new_eob = si + 1;  // eob value if this coefficient became the last one
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si);
+ const int new_eob_cost =
+ get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+ int rate_coeff_eob =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bhl,
+ tx_class);
+ int64_t dist_new_eob = dist;
+ int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);  // candidate C: new eob; the running totals are not included
+
+ if (abs_qc_low > 0) {
+ const int rate_coeff_eob_low =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const int64_t dist_new_eob_low = dist_low;
+ const int64_t rd_new_eob_low =
+ RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+ if (rd_new_eob_low < rd_new_eob) {
+ lower_level_new_eob = 1;
+ rd_new_eob = rd_new_eob_low;
+ rate_coeff_eob = rate_coeff_eob_low;
+ dist_new_eob = dist_new_eob_low;
+ }
+ }
+
+ if (sharpness == 0 || abs_qc > 1) {  // with sharpness != 0 a magnitude-1 coefficient is never lowered to 0 here
+ if (rd_low < rd) {
+ lower_level = 1;
+ rd = rd_low;
+ rate = rate_low;
+ dist = dist_low;
+ }
+ }
+
+ if (sharpness == 0 && rd_new_eob < rd) {  // moving eob is only considered when sharpness == 0
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ int last_ci = nz_ci[ni];
+ levels[get_padded_idx(last_ci, bhl)] = 0;  // clear every nonzero coefficient recorded so far
+ qcoeff[last_ci] = 0;
+ dqcoeff[last_ci] = 0;
+ }
+ *eob = new_eob;
+ *nz_num = 0;
+ *accu_rate = rate_coeff_eob;  // totals restart from this coefficient
+ *accu_dist = dist_new_eob;
+ lower_level = lower_level_new_eob;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist;
+ }
+
+ if (lower_level) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ }
+ if (qcoeff[ci]) {
+ nz_ci[*nz_num] = ci;  // record the surviving nonzero coefficient
+ ++*nz_num;
+ }
+ }
+}
+
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,  // zeroes the listed coefficients when signalling skip is strictly cheaper
+ int nz_num, int *nz_ci, int64_t rdmult,
+ int skip_cost, int non_skip_cost,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+ const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+ const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);  // skip alternative: skip_cost rate with a distortion term of 0
+ if (rd_new_eob < rd) {
+ for (int i = 0; i < nz_num; ++i) {
+ const int ci = nz_ci[i];
+ qcoeff[ci] = 0;
+ dqcoeff[ci] = 0;
+ // no need to set up levels because this is the last step
+ // levels[get_padded_idx(ci, bhl)] = 0;
+ }
+ *accu_rate = 0;  // the caller adds skip_cost itself once it sees eob == 0
+ *eob = 0;
+ }
+}
+
+// TODO(angiebird): use this function whenever it's possible
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+ int reduced_tx_set_used) {
+ if (plane > 0) return 0;  // only plane 0 is charged a tx type cost
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->segment_id]) {  // a cost applies only when more than one tx type is available and the segment is not lossless
+ const int ext_tx_set =
+ get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ return x->mode_costs
+ .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+ } else {
+ if (ext_tx_set > 0) {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];  // filter-intra blocks index the table through their mapped direction
+ else
+ intra_dir = mbmi->mode;
+ return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size]
+ [intra_dir][tx_type];
+ }
+ }
+ }
+ return 0;  // reached when no cost applies, including ext_tx_set == 0
+}
+
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int shift = av1_get_tx_scale(tx_size);
+ int eob = p->eobs[block];
+ const int16_t *dequant = p->dequant_QTX;
+ const qm_val_t *iqmatrix =
+ av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+ const qm_val_t *qmatrix =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR
+ ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size,
+ tx_type)
+ : NULL;  // qmatrix is fetched only for the QM_PSNR distortion metric, otherwise NULL
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ tran_low_t *dqcoeff = p->dqcoeff + block_offset;
+ const tran_low_t *tcoeff = p->coeff + block_offset;
+ const CoeffCosts *coeff_costs = &x->coeff_costs;
+
+ // This function is not called if eob = 0.
+ assert(eob > 0);
+
+ const AV1_COMMON *cm = &cpi->common;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ assert(height == (1 << bhl));  // bhl is log2 of the block height
+ const int is_inter = is_inter_block(mbmi);
+ const LV_MAP_COEFF_COST *txb_costs =
+ &coeff_costs->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *txb_eob_costs =
+ &coeff_costs->eob_costs[eob_multi_size][plane_type];
+
+ const int rshift = 2;
+
+ const int64_t rdmult =
+ (((int64_t)x->rdmult *
+ (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+ 2) >>
+ rshift;  // x->rdmult scaled per plane type and bit depth, rounded (+2) before the >> 2
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+
+ if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);  // levels are left uninitialised when eob == 1
+
+ // TODO(angiebird): check iqmatrix
+
+ const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+ const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int si = eob - 1;  // processing runs from the last coefficient back towards DC
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const tran_low_t abs_qc = abs(qc);
+ const int sign = qc < 0;
+ const int max_nz_num = 2;
+ int nz_num = 1;
+ int nz_ci[3] = { ci, 0, 0 };  // max_nz_num + 1 slots: the eob loop below stops once nz_num exceeds max_nz_num
+ if (abs_qc >= 2) {  // last coefficient: lowering is tried only when its magnitude is at least 2
+ update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ --si;
+ } else {
+ assert(abs_qc == 1);
+ const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si);
+ accu_rate +=
+ get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ accu_dist += dist - dist0;  // a magnitude-1 last coefficient is kept as is; only its cost is accumulated
+ --si;
+ }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
+ tx_size, tx_class_literal, bhl, width, \
+ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {  // each case passes tx_class as a literal to the force-inlined helper
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+ default: assert(false);
+ }
+
+ if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {  // si == -1: the eob loop consumed every position, DC included
+ update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+ non_skip_cost, qcoeff, dqcoeff);
+ }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 1; --si) { \
+ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
+ rdmult, shift, dequant, scan, txb_costs, tcoeff, \
+ qcoeff, dqcoeff, levels, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {  // remaining non-DC positions use the rate-only update
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+ default: assert(false);
+ }
+
+ // DC position
+ if (si == 0) {
+ // no need to update accu_dist because it's not used after this point
+ int64_t dummy_dist = 0;
+ update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ }
+
+ const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+ cm->features.reduced_tx_set_used);
+ if (eob == 0)
+ accu_rate += skip_cost;  // eob is 0 only if update_skip zeroed the block
+ else
+ accu_rate += non_skip_cost + tx_type_cost;
+
+ p->eobs[block] = eob;  // write back the possibly reduced eob
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+ *rate_cost = accu_rate;
+ return eob;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb(  // sums the rate of the quantized coefficients of one transform block
+ const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+ const struct macroblock_plane *p, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+ int reduced_tx_set_used) {
+ const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];  // start from the txb_skip_cost entry with index 0
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+ coeff_costs->lps_cost;
+ int c = eob - 1;  // first handle the last coefficient, which uses base_eob_cost
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;  // |v|, assuming AOMSIGN yields 0 or -1 - TODO confirm
+ const int coeff_ctx = coeff_contexts[pos];
+ cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+ if (v) {
+ // extra level cost (when level > NUM_BASE_LEVELS), then sign bit cost
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx_eob(pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ if (c) {
+ cost += av1_cost_literal(1);  // non-DC sign
+ } else {
+ const int sign01 = (sign ^ sign) - sign;  // x ^ x is 0, so this is simply -sign
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ return cost;  // c == 0: the last coefficient is DC, nothing else to cost
+ }
+ }
+ }
+ const int(*base_cost)[8] = coeff_costs->base_cost;
+ for (c = eob - 2; c >= 1; --c) {  // middle coefficients: everything between the last one and DC
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const int level = abs(v);
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+ if (v) {
+ // sign bit cost
+ cost += av1_cost_literal(1);
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ // c == 0 after previous loop
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int coeff_ctx = coeff_contexts[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+ if (v) {
+ // sign bit cost
+ const int sign01 = (sign ^ sign) - sign;  // equals -sign, as above
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];  // the DC sign is costed by context rather than as a literal bit
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ return cost;
+}
+
+// Estimates the cost of the coefficients of a plane-0 transform block from
+// their quantized magnitudes alone; the costs of signalling eob and tx_type
+// are not part of the result.
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+                                 const int block, const TX_SIZE tx_size,
+                                 const TX_TYPE tx_type) {
+  assert(plane == 0);
+
+  // Quantized coefficients, scan table and end-of-block position of the
+  // transform block being priced.
+  const struct macroblock_plane *const pl = &x->plane[plane];
+  const int16_t *const scan_tbl = get_scan(tx_size, tx_type)->scan;
+  const tran_low_t *const qc = pl->qcoeff + BLOCK_OFFSET(block);
+  const int num_coeffs = pl->eobs[block];
+
+  // The coefficient at the last scan position is priced directly from its
+  // magnitude minus one instead of going through costLUT.
+  const int last_pos = scan_tbl[num_coeffs - 1];
+  const tran_low_t last_level = abs(qc[last_pos]) - 1;
+  int total = (last_level << (AV1_PROB_COST_SHIFT + 2));
+
+  // Every earlier scan position is priced through costLUT, whose final entry
+  // is shared by all magnitudes of 14 and above.
+  int scan_idx = num_coeffs - 2;
+  while (scan_idx >= 0) {
+    const int pos = scan_tbl[scan_idx];
+    const tran_low_t level = abs(qc[pos]);
+    total += costLUT[AOMMIN(level, 14)];
+    --scan_idx;
+  }
+
+  // Fixed per-coefficient terms, applied (eob - 1) times.
+  total += (const_term + loge_par) * (num_coeffs - 1);
+
+  return total;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+    const MACROBLOCK *x, const int plane, const int block,
+    const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+    const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+    const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+    int reduced_tx_set_used) {
+  // EOB cost table selected by transform size and plane type.
+  const LV_MAP_EOB_COST *const eob_tbl =
+      &x->coeff_costs.eob_costs[txsize_log2_minus4[tx_size]][plane_type];
+
+  // Exact parts: skip flag (not skipped), transform type and eob position.
+  const int skip_cost = coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+  const int tx_type_cost =
+      get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+  const int eob_pos_cost = get_eob_cost(eob, eob_tbl, coeff_costs, tx_class);
+
+  // The coefficients themselves are only estimated.
+  return skip_cost + tx_type_cost + eob_pos_cost +
+         av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+                        const TX_SIZE tx_size, const TX_TYPE tx_type,
+                        const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+  const struct macroblock_plane *const mb_plane = &x->plane[plane];
+  const int num_coeffs = mb_plane->eobs[block];
+  const TX_SIZE size_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE ptype = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const cost_tbl =
+      &x->coeff_costs.coeff_costs[size_ctx][ptype];
+
+  // A block without coefficients costs only the "skipped" flag.
+  if (num_coeffs == 0)
+    return cost_tbl->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+
+  // Otherwise delegate to the exact per-coefficient costing routine.
+  return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, mb_plane,
+                                  num_coeffs, ptype, cost_tbl, &x->e_mbd,
+                                  tx_type, tx_type_to_class[tx_type],
+                                  reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ int eob = p->eobs[block];
+
+ if (adjust_eob) {  // trim trailing coefficients before estimating the cost
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+ tcoeff, qcoeff, dqcoeff);  // may zero qcoeff/dqcoeff entries and lower eob
+ p->eobs[block] = eob;  // side effect: stored eob is updated although x is const
+ }
+
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {  // no coefficients left: cost of the skip flag alone
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ // Exact skip-flag, tx_type and eob costs plus an estimated coefficient cost.
+ return warehouse_efficients_txb_laplacian(
+ x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+ tx_type, tx_class, reduced_tx_set_used);
+}
diff --git a/third_party/aom/av1/encoder/txb_rdopt.h b/third_party/aom/av1/encoder/txb_rdopt.h
new file mode 100644
index 0000000000..70b322a2e1
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily chooses whether to
+ * lower the coefficient magnitude by 1, based on the RD score.
+ *
+ * The coefficients are processed in reverse scan order.
+ *
+ * Note that, the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the macroblock
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out] rate_cost The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in] sharpness When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
+ */
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness);
+
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] x Pointer to structure holding the data for
+ the current encoding macroblock.
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block
+ in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block.
+ * \param[in] tx_size The transform size.
+ * \param[in] tx_type The transform type.
+ * \param[in] txb_ctx Context info for entropy coding transform
+ block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ */
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function computes the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * It then uses \ref av1_cost_coeffs_txb_estimate to estimate the entropy costs
+ * of coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in] adjust_eob Whether to adjust the end of block position
+ (eob)
+ * or not.
+ * \return int Estimated entropy cost of coding the transform
+ block.
+ */
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then returns the sum of the estimated entropy cost for all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy cost of end of block (eob) and transform type (tx_type)
+ * are not included.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by number of 4x4 units that have been coded before
+ * the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \return int Estimated entropy cost of coefficients in the
+ * transform block.
+ */
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt_utils.h b/third_party/aom/av1/encoder/txb_rdopt_utils.h
new file mode 100644
index 0000000000..b9f08aacf0
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt_utils.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+
+#include "av1/encoder/encodetxb.h"
+
+static const int golomb_bits_cost[32] = {  // [r] = 512 * (2 * bit_length(r) - 1); [0] = 0
+ 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+ 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+// golomb_cost_diff[r] equals golomb_bits_cost[r] - golomb_bits_cost[r - 1].
+static const int golomb_cost_diff[32] = {
+ 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+ 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// Lookup table of a coefficient's estimated cost by quantization level (capped
+// at 14), derived from a Laplacian distribution conditioned on estimated context.
+static const int costLUT[15] = { -1143, 53, 545, 825, 1031,
+ 1209, 1393, 1577, 1763, 1947,
+ 2132, 2317, 2501, 2686, 2871 };
+
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);  // 1 scaled by AV1_PROB_COST_SHIFT bits
+
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;  // round(1.4427 * 2^AV1_PROB_COST_SHIFT); 1.4427 ~ log2(e)
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  // dequant[0] applies to coefficient 0, dequant[1] to every other one.
+  const int step = dequant[coeff_idx != 0];
+  if (iqmatrix == NULL) return step;
+  const int weighted = iqmatrix[coeff_idx] * step;  // inverse-QM scaling
+  return (weighted + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+                                     int shift, const qm_val_t *qmatrix,
+                                     int coeff_idx) {
+  // Reconstruction error scaled up by `shift` bits.
+  const int64_t err = (tcoeff - dqcoeff) * (1 << shift);
+  // Without a quant matrix the distortion is the plain squared error.
+  if (qmatrix == NULL) return err * err;
+
+  // With a quant matrix, weight the error before squaring and then remove the
+  // 2 * AOM_QM_BITS of weight scaling with rounding. This mirrors
+  // av1_block_error_qm (used when AOM_DIST_METRIC_QM_PSNR is enabled).
+  // `shift` is at most 2, `tcoeff`/`dqcoeff` at most 22 bits and AOM_QM_BITS
+  // is 5, so the weighted error fits in 29 bits and its square cannot overflow.
+  const int64_t weighted = err * qmatrix[coeff_idx];
+  const int64_t rounding = 1 << (2 * AOM_QM_BITS - 1);
+  return (weighted * weighted + rounding) >> (2 * AOM_QM_BITS);
+}
+
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+                        const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+  // av1_get_eob_pos_token yields a position token and writes an extra value.
+  int extra_val;
+  const int token = av1_get_eob_pos_token(eob, &extra_val);
+  const int num_extra = av1_eob_offset_bits[token];
+  const int class_ctx = (tx_class != TX_CLASS_2D);
+  const int token_cost = txb_eob_costs->eob_cost[class_ctx][token - 1];
+
+  // Tokens without extra bits are fully priced by the token itself.
+  if (num_extra <= 0) return token_cost;
+
+  // Top extra bit uses a table cost; the rest are priced as literals.
+  const int msb = (extra_val & (1 << (num_extra - 1))) != 0;
+  const int msb_cost = txb_costs->eob_extra_cost[token - 3][msb];
+  return token_cost + msb_cost +
+         (num_extra > 1 ? av1_cost_literal(num_extra - 1) : 0);
+}
+
+static INLINE int get_golomb_cost(int abs_qc) {
+  // Levels up to this value carry no Golomb-coded remainder.
+  const int golomb_start = NUM_BASE_LEVELS + COEFF_BASE_RANGE;
+  if (abs_qc <= golomb_start) return 0;
+  // A remainder with n significant bits is priced as 2n - 1 literal bits.
+  const int num_bits = get_msb(abs_qc - golomb_start) + 1;
+  return av1_cost_literal(2 * num_bits - 1);
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+  const int excess = level - 1 - NUM_BASE_LEVELS;  // amount above the base levels
+  return get_golomb_cost(level) + coeff_lps[AOMMIN(excess, COEFF_BASE_RANGE)];
+}
+
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+ int *diff) {  // returns the cost for `level`; accumulates a cost delta in *diff
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ int golomb_bits = 0;
+ if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+ *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];  // upper half of coeff_lps
+ // NOTE(review): the delta presumably is cost(level) - cost(level - 1); confirm.
+ if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ if (r < 32) {  // small remainders use the precomputed tables
+ golomb_bits = golomb_bits_cost[r];
+ *diff += golomb_cost_diff[r];
+ } else {
+ golomb_bits = get_golomb_cost(level);
+ *diff += (r & (r - 1)) == 0 ? 1024 : 0;  // 1024 only when r is a power of two
+ }
+ }
+
+ return coeff_lps[base_range] + golomb_bits;
+}
+
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
+ int ci, tran_low_t abs_qc, int coeff_ctx,
+ const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class,
+ const uint8_t *levels, int *cost_low) {
+ // Returns the cost of a coefficient with level abs_qc and stores in *cost_low
+ // that cost minus a delta. Assumes it is neither DC nor the last coefficient.
+ assert(ci > 0);
+ int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ int diff = 0;
+ if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];  // entries 4..7 hold the delta
+ if (abs_qc) {
+ cost += av1_cost_literal(1);  // sign bit
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ int brcost_diff = 0;
+ cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+ &brcost_diff);
+ diff += brcost_diff;
+ }
+ }
+ *cost_low = cost - diff;  // NOTE(review): presumably the cost of level abs_qc - 1; confirm
+
+ return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+                                     int coeff_ctx, int dc_sign_ctx,
+                                     const LV_MAP_COEFF_COST *txb_costs,
+                                     int bhl, TX_CLASS tx_class) {
+  // Base level of the last coefficient; the eob table is indexed by
+  // level - 1.
+  const int base = txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+  if (abs_qc == 0) return base;
+
+  // Sign: coefficient 0 has a context-based cost, others one literal bit.
+  const int sign_cost = (ci == 0) ? txb_costs->dc_sign_cost[dc_sign_ctx][sign]
+                                  : av1_cost_literal(1);
+  if (abs_qc <= NUM_BASE_LEVELS) return base + sign_cost;
+
+  // Higher levels add the range/Golomb part; the context for the last
+  // coefficient comes from get_br_ctx_eob, which does not read `levels`.
+  const int br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+  const int range_cost = get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+  return base + sign_cost + range_cost;
+}
+
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+ int sign, int coeff_ctx,
+ int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bhl, TX_CLASS tx_class,
+ const uint8_t *levels) {  // cost of one coefficient at any scan position
+ int cost = 0;
+ if (is_last) {  // the last coefficient uses the eob table, indexed by level - 1
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ } else {
+ cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ }
+ if (abs_qc != 0) {
+ if (ci == 0) {  // coefficient 0 prices its sign from the DC sign context
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {  // every other sign costs one literal bit
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {  // add the range/Golomb part of the level
+ int br_ctx;
+ if (is_last)
+ br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+ else
+ br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+ }
+ }
+ return cost;
+}
+
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+ int shift, tran_low_t *qc_low,
+ tran_low_t *dqc_low) {  // outputs the signed level one step lower and its dequantized value
+ tran_low_t abs_qc_low = abs_qc - 1;
+ *qc_low = (-sign ^ abs_qc_low) + sign;  // branch-free: negates when sign is 1, no-op when 0
+ assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+ tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;  // dequantize the lowered magnitude
+ *dqc_low = (-sign ^ abs_dqc_low) + sign;  // same sign application as above
+ assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+ const int16_t *dequant_ptr,
+ const int16_t *scan,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ // TODO(sarahparker) make this work for aomqm
+ int eob_out = *eob;
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),  // step * (1 + 70/128)
+ dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };  // [0]: coefficient 0, [1]: the rest
+ // Walk back from the last coefficient, zeroing entries until one survives.
+ for (int i = *eob - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+
+ if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {  // below threshold or already zero
+ eob_out--;
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ } else {
+ break;  // first coefficient that is kept ends the trimming
+ }
+ }
+
+ *eob = eob_out;
+}
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000000..f664795153
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.c
@@ -0,0 +1,1914 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Possible values for the force_split variable while evaluating variance based
+// partitioning (consumed by set_vt_partitioning).
+enum {
+ // Evaluate all partition types
+ PART_EVAL_ALL = 0,
+ // Force PARTITION_SPLIT: set_vt_partitioning returns 0 without choosing a size
+ PART_EVAL_ONLY_SPLIT = 1,
+ // Force PARTITION_NONE when the block fits inside the tile
+ PART_EVAL_ONLY_NONE = 2
+} UENUM1BYTE(PART_EVAL_STATUS);
+
+typedef struct {
+ VPVariance *part_variances;  // none/horz/vert statistics of this node
+ VPartVar *split[4];  // statistics of the node's four children
+} variance_node;
+
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+ variance_node *node) {  // points *node at the statistics inside the tree node `data` of size bsize
+ node->part_variances = NULL;
+ switch (bsize) {
+ case BLOCK_128X128: {
+ VP128x128 *vt = (VP128x128 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;  // child's whole-block stats
+ break;
+ }
+ case BLOCK_64X64: {
+ VP64x64 *vt = (VP64x64 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_32X32: {
+ VP32x32 *vt = (VP32x32 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_16X16: {
+ VP16x16 *vt = (VP16x16 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_8X8: {
+ VP8x8 *vt = (VP8x8 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ default: {  // BLOCK_4X4: the children are VPartVar entries themselves, not nested nodes
+ VP4x4 *vt = (VP4x4 *)data;
+ assert(bsize == BLOCK_4X4);
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx];
+ break;
+ }
+ }
+}
+
+// Record the raw statistics from which get_variance() derives a variance.
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c,
+                                     VPartVar *v) {
+  v->log2_count = c;         // log2 of the sample count
+  v->sum_error = s;          // sum of errors
+  v->sum_square_error = s2;  // sum of squared errors
+}
+
+static AOM_INLINE void get_variance(VPartVar *v) {  // variance = 256 * (sse - sum^2 / n) / n, n = 1 << log2_count
+ v->variance =
+ (int)(256 * (v->sum_square_error -  // NOTE(review): scaling looks 32-bit unsigned; confirm it cannot wrap
+ (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>  // sum^2 formed in 64 bits
+ v->log2_count)) >>
+ v->log2_count);
+}
+
+static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b,
+                                       VPartVar *r) {
+  assert(b->log2_count == a->log2_count);  // inputs must hold equal sample counts
+  fill_variance(b->sum_square_error + a->sum_square_error,  // combined SSE
+                b->sum_error + a->sum_error, 1 + a->log2_count, r);
+}
+
+static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+  variance_node nd;
+  memset(&nd, 0, sizeof(nd));
+  tree_to_node(data, bsize, &nd);
+  VPVariance *const pv = nd.part_variances;  // destination for the merged stats
+  sum_2_variances(nd.split[0], nd.split[1], &pv->horz[0]);  // children 0 + 1
+  sum_2_variances(nd.split[2], nd.split[3], &pv->horz[1]);  // children 2 + 3
+  sum_2_variances(nd.split[0], nd.split[2], &pv->vert[0]);  // children 0 + 2
+  sum_2_variances(nd.split[1], nd.split[3], &pv->vert[1]);  // children 1 + 3
+  sum_2_variances(&pv->vert[0], &pv->vert[1], &pv->none);   // whole block
+}
+
+static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row,
+                                      int mi_col, BLOCK_SIZE bsize) {
+  CommonModeInfoParams *const params = &cpi->common.mi_params;
+  if (params->mi_cols <= mi_col || params->mi_rows <= mi_row) return;
+
+  // Point the grid entry at its allocated mode info, then record the size.
+  const int grid_idx = get_mi_grid_idx(params, mi_row, mi_col);
+  const int alloc_idx = get_alloc_mi_idx(params, mi_row, mi_col);
+  MB_MODE_INFO *const mbmi = &params->mi_alloc[alloc_idx];
+  params->mi_grid_base[grid_idx] = mbmi;
+  mbmi->bsize = bsize;
+}
+
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, void *data,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int64_t threshold, BLOCK_SIZE bsize_min,
+ PART_EVAL_STATUS force_split) {  // returns 1 when a block size was set for this node, 0 otherwise
+ AV1_COMMON *const cm = &cpi->common;
+ variance_node vt;
+ const int block_width = mi_size_wide[bsize];
+ const int block_height = mi_size_high[bsize];
+ int bs_width_check = block_width;
+ int bs_height_check = block_height;
+ int bs_width_vert_check = block_width >> 1;
+ int bs_height_horiz_check = block_height >> 1;
+ // On the right and bottom boundary we only need to check
+ // if half the bsize fits, because boundary is extended
+ // up to 64. So do this check only for sb_size = 64X64.
+ if (cm->seq_params->sb_size == BLOCK_64X64) {
+ if (tile->mi_col_end == cm->mi_params.mi_cols) {  // tile touches the right frame edge
+ bs_width_check = (block_width >> 1) + 1;
+ bs_width_vert_check = (block_width >> 2) + 1;
+ }
+ if (tile->mi_row_end == cm->mi_params.mi_rows) {  // tile touches the bottom frame edge
+ bs_height_check = (block_height >> 1) + 1;
+ bs_height_horiz_check = (block_height >> 2) + 1;
+ }
+ }
+
+ assert(block_height == block_width);
+ tree_to_node(data, bsize, &vt);
+
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ force_split == PART_EVAL_ONLY_NONE) {  // forced "no split", provided the block fits
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ if (force_split == PART_EVAL_ONLY_SPLIT) return 0;
+
+ // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+ // variance is below threshold, otherwise split will be selected.
+ // No check for vert/horiz split as too few samples for variance.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (frame_is_intra_only(cm) &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {  // "very high" = above 16x the threshold
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ // Check vertical split.
+ if (mi_row + bs_height_check <= tile->mi_row_end &&
+ mi_col + bs_width_vert_check <= tile->mi_col_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {  // the U-plane block size for this subsize must be valid
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_horiz_check <= tile->mi_row_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+ return 0;
+ }
+ return 0;  // bsize below bsize_min: nothing is selected here
+}
+
+static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide,
+                                      int pixels_high) {
+  // Returns 1 only if the origin of each of the four sub-blocks is in range.
+  for (int quad = 0; quad < 4; ++quad) {
+    if ((x16_idx + GET_BLK_IDX_X(quad, 3)) >= pixels_wide) return 0;
+    if ((y16_idx + GET_BLK_IDX_Y(quad, 3)) >= pixels_high) return 0;
+  }
+  return 1;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd
+static AOM_INLINE void fill_variance_8x8avg_highbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ for (int idx = 0; idx < 4; idx++) {  // one pass per 8x8 sub-block of the 16x16 block
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ unsigned int sse = 0;  // sub-blocks outside the frame keep zero statistics
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx,
+ src_stride);
+ int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx,
+ dst_stride);
+ // The sub-block contributes one sample: the difference of the two averages.
+ sum = src_avg - dst_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none);  // log2_count 0: one sample
+ }
+}
+#endif
+
+static AOM_INLINE void fill_variance_8x8avg_lowbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ unsigned int sse[4] = { 0 };  // sub-blocks outside the frame keep zero statistics
+ int sum[4] = { 0 };
+ // Fast path: when all four sub-blocks are in range, average them in one call.
+ if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) {
+ int src_avg[4];
+ int dst_avg[4];
+ aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg);
+ aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg);
+ for (int idx = 0; idx < 4; idx++) {
+ sum[idx] = src_avg[idx] - dst_avg[idx];
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ } else {  // otherwise average only the sub-blocks that are in range, one by one
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg =
+ aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride);
+ int dst_avg =
+ aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride);
+ sum[idx] = src_avg - dst_avg;
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ }
+ }
+
+ for (int idx = 0; idx < 4; idx++) {
+ fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none);  // log2_count 0: one sample
+ }
+}
+
+// Obtain parameters required to calculate variance (such as sum, sse, etc.)
+// at 8x8 sub-block level for a given 16x16 block.
+// The function can be called only when is_key_frame is false since sum is
+// computed between source and reference frames.
+static AOM_INLINE void fill_variance_8x8avg(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag,
+ int pixels_wide, int pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag) {  // high-bitdepth buffers take the dedicated path
+ fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, pixels_wide,
+ pixels_high);
+ return;
+ }
+#else
+ (void)highbd_flag;  // unused when high-bitdepth support is compiled out
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx,
+ y16_idx, vst, pixels_wide, pixels_high);
+}
+
+ // For the 16x16 block at (x16_idx, y16_idx), queries the (min, max) pair
+ // reported by aom_minmax_8x8 / aom_highbd_minmax_8x8 for every 8x8 sub-block
+ // that starts inside the pixels_wide x pixels_high area, and returns the
+ // spread between the largest and the smallest (max - min) range seen.
+ // The running extremes start at 0 and 255, so when no sub-block is inside
+ // the area the result is -255.
+ static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride,
+                               const uint8_t *dst_buf, int dst_stride,
+                               int x16_idx, int y16_idx,
+ #if CONFIG_AV1_HIGHBITDEPTH
+                               int highbd_flag,
+ #endif
+                               int pixels_wide, int pixels_high) {
+   int widest_range = 0;
+   int narrowest_range = 255;
+
+   for (int blk = 0; blk < 4; ++blk) {
+     const int col = x16_idx + GET_BLK_IDX_X(blk, 3);
+     const int row = y16_idx + GET_BLK_IDX_Y(blk, 3);
+     if (col >= pixels_wide || row >= pixels_high) continue;
+
+     const uint8_t *const src = src_buf + row * src_stride + col;
+     const uint8_t *const ref = dst_buf + row * dst_stride + col;
+     int lo = 0;
+     int hi = 0;
+ #if CONFIG_AV1_HIGHBITDEPTH
+     if (highbd_flag & YV12_FLAG_HIGHBITDEPTH)
+       aom_highbd_minmax_8x8(src, src_stride, ref, dst_stride, &lo, &hi);
+     else
+ #endif
+       aom_minmax_8x8(src, src_stride, ref, dst_stride, &lo, &hi);
+
+     const int range = hi - lo;
+     if (range > widest_range) widest_range = range;
+     if (range < narrowest_range) narrowest_range = range;
+   }
+   return widest_range - narrowest_range;
+ }
+
+ // Fills the statistics of the four 4x4 sub-blocks of the 8x8 block at
+ // (x8_idx, y8_idx). For each sub-block the stored sum is the 4x4 source
+ // average minus the constant 128, and the stored sse is that value squared.
+ // Must only be called when is_key_frame is true: only the source frame is
+ // used. Sub-blocks starting at or beyond (pixels_wide - border_offset_4x4) /
+ // (pixels_high - border_offset_4x4) are stored with sum = 0 and sse = 0.
+ static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf,
+                                             int src_stride, int x8_idx,
+                                             int y8_idx, VP8x8 *vst,
+ #if CONFIG_AV1_HIGHBITDEPTH
+                                             int highbd_flag,
+ #endif
+                                             int pixels_wide, int pixels_high,
+                                             int border_offset_4x4) {
+   const int col_limit = pixels_wide - border_offset_4x4;
+   const int row_limit = pixels_high - border_offset_4x4;
+
+   for (int blk = 0; blk < 4; ++blk) {
+     const int col = x8_idx + GET_BLK_IDX_X(blk, 2);
+     const int row = y8_idx + GET_BLK_IDX_Y(blk, 2);
+     int avg_diff = 0;
+     unsigned int avg_diff_sq = 0;
+
+     if (col < col_limit && row < row_limit) {
+       const uint8_t *const src = src_buf + row * src_stride + col;
+       const int fixed_ref_avg = 128;
+       int src_avg;
+ #if CONFIG_AV1_HIGHBITDEPTH
+       src_avg = (highbd_flag & YV12_FLAG_HIGHBITDEPTH)
+                     ? aom_highbd_avg_4x4(src, src_stride)
+                     : aom_avg_4x4(src, src_stride);
+ #else
+       src_avg = aom_avg_4x4(src, src_stride);
+ #endif
+       avg_diff = src_avg - fixed_ref_avg;
+       avg_diff_sq = avg_diff * avg_diff;
+     }
+     // The leaf is written even for skipped sub-blocks (with zeros).
+     fill_variance(avg_diff_sq, avg_diff, 0,
+                   &vst->split[blk].part_variances.none);
+   }
+ }
+
+ // TODO(kyslov) Bring back threshold adjustment based on content state
+ // Scales the base partition threshold: x1.5 for non-reference frames, then a
+ // further x1.25 when speed >= 8 (both with truncating shifts). width and
+ // height are currently unused.
+ static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
+                                          int width, int height,
+                                          int non_reference_frame) {
+   const int64_t scaled =
+       non_reference_frame ? ((3 * threshold_base) >> 1) : threshold_base;
+   (void)width;
+   (void)height;
+   return (speed < 8) ? scaled : ((5 * scaled) >> 2);
+ }
+
+ // Returns the blend weight in [0.0, 1.0] used to interpolate thresholds
+ // around QINDEX_LARGE_BLOCK_THR: 1.0 below (THR - win), 0.0 above
+ // (THR + win), and a linear ramp in between.
+ // The division is done in floating point on purpose: with integer operands
+ // the quotient truncates to 0 for the whole window (and 1 only at the upper
+ // end), which turns the ramp into a step.
+ static AOM_INLINE double qindex_blend_weight(int current_qindex, int win) {
+   if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) return 1.0;
+   if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) return 0.0;
+   return 1.0 - (double)(current_qindex - QINDEX_LARGE_BLOCK_THR + win) /
+                    (double)(2 * win);
+ }
+
+ // Blends `thresh` with `thresh << shift`: weight 1.0 keeps `thresh`,
+ // weight 0.0 yields the shifted value. The result is converted back to
+ // int64_t (the element type of the thresholds array) rather than int, so the
+ // value is not narrowed.
+ static AOM_INLINE int64_t blend_shifted_thresh(int64_t thresh, int shift,
+                                                double weight) {
+   return (int64_t)((1 - weight) * (double)(thresh << shift) +
+                    weight * (double)thresh);
+ }
+
+ // Tune thresholds less or more aggressively to prefer larger partitions.
+ //
+ // cpi:                   encoder instance; speed features, rate-control and
+ //                        SVC state are read from it.
+ // thresholds:            in/out array; entries [0]..[3] may be modified.
+ // block_sad:             SAD of the current block, used for the <= 288P
+ //                        "moving boundary" exception.
+ // current_qindex:        qindex compared against QINDEX_LARGE_BLOCK_THR.
+ // num_pixels:            frame width * height.
+ // is_segment_id_boosted: true when the block's segment is boosted.
+ // source_sad_nonrd:      source SAD level (compared to kLowSad / kHighSad).
+ // lighting_change:       non-zero disables the <= 288P exception.
+ //
+ // The adjustment is selected by
+ // cpi->sf.rt_sf.prefer_large_partition_blocks (>= 3, >= 2, >= 1). Finally,
+ // thresholds[3] is set to INT64_MAX when
+ // cpi->sf.part_sf.disable_8x8_part_based_on_qidx is set and
+ // current_qindex < 128.
+ static AOM_INLINE void tune_thresh_based_on_qindex(
+     AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex,
+     int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd,
+     int lighting_change) {
+   if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) {
+     const double weight = qindex_blend_weight(current_qindex, 20);
+     if (num_pixels > RESOLUTION_480P) {
+       for (int i = 0; i < 4; i++) {
+         thresholds[i] <<= 1;
+       }
+     }
+     if (num_pixels <= RESOLUTION_288P) {
+       thresholds[3] = INT64_MAX;
+       if (is_segment_id_boosted == false) {
+         thresholds[1] <<= 2;
+         thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4;
+       } else {
+         thresholds[1] <<= 1;
+         thresholds[2] <<= 3;
+       }
+       // Allow for split to 8x8 for superblocks where part of it has
+       // moving boundary. So allow for sb with source_sad above threshold,
+       // and avoid very large source_sad or high source content, to avoid
+       // too many 8x8 within superblock.
+       uint64_t avg_source_sad_thresh = 25000;
+       uint64_t block_sad_low = 25000;
+       uint64_t block_sad_high = 50000;
+       if (cpi->svc.temporal_layer_id == 0 &&
+           cpi->svc.number_temporal_layers > 1) {
+         // Increase the sad thresholds for base TL0, as reference/LAST is
+         // 2/4 frames behind (for 2/3 #TL).
+         avg_source_sad_thresh = 40000;
+         block_sad_high = 70000;
+       }
+       if (is_segment_id_boosted == false &&
+           cpi->rc.avg_source_sad < avg_source_sad_thresh &&
+           block_sad > block_sad_low && block_sad < block_sad_high &&
+           !lighting_change) {
+         thresholds[2] = (3 * thresholds[2]) >> 2;
+         thresholds[3] = thresholds[2] << 3;
+       }
+       // Condition the increase of partition thresholds on the segment
+       // and the content. Avoid the increase for superblocks which have
+       // high source sad, unless the whole frame has very high motion
+       // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks
+       // have high source sad).
+     } else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false &&
+                (source_sad_nonrd != kHighSad ||
+                 cpi->rc.avg_source_sad > 50000)) {
+       thresholds[0] = (3 * thresholds[0]) >> 1;
+       thresholds[3] = INT64_MAX;
+       if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
+         thresholds[1] = blend_shifted_thresh(thresholds[1], 1, weight);
+         thresholds[2] = blend_shifted_thresh(thresholds[2], 1, weight);
+       }
+     } else if (current_qindex > QINDEX_LARGE_BLOCK_THR &&
+                is_segment_id_boosted == false &&
+                (source_sad_nonrd != kHighSad ||
+                 cpi->rc.avg_source_sad > 50000)) {
+       thresholds[1] = blend_shifted_thresh(thresholds[1], 2, weight);
+       thresholds[2] = blend_shifted_thresh(thresholds[2], 4, weight);
+       thresholds[3] = INT64_MAX;
+     }
+   } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) {
+     thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0;
+     thresholds[2] =
+         (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2];
+   } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) {
+     const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1;
+     const double weight = qindex_blend_weight(current_qindex, 45);
+     thresholds[1] = blend_shifted_thresh(thresholds[1], 1, weight);
+     thresholds[2] = blend_shifted_thresh(thresholds[2], 1, weight);
+     thresholds[3] = blend_shifted_thresh(thresholds[3], fac, weight);
+   }
+   if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128))
+     thresholds[3] = INT64_MAX;
+ }
+
+ // Fills thresholds[0..4] for an intra-only frame from threshold_base.
+ // When cpi->sf.rt_sf.force_large_partition_blocks_intra is set, the base is
+ // first left-shifted by (threshold_left_shift - 7) in ALLINTRA mode or
+ // (threshold_left_shift - 8) otherwise; that shift is asserted to be
+ // non-negative. Entries [2] and [3] depend on whether the frame is below
+ // RESOLUTION_720P.
+ static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[],
+                                          int64_t threshold_base,
+                                          int threshold_left_shift,
+                                          int num_pixels) {
+   const int force_large = cpi->sf.rt_sf.force_large_partition_blocks_intra;
+   int64_t base = threshold_base;
+
+   if (force_large) {
+     const int mode_offset = (cpi->oxcf.mode == ALLINTRA) ? 7 : 8;
+     const int extra_shift = threshold_left_shift - mode_offset;
+     assert(extra_shift >= 0);
+     base <<= extra_shift;
+   }
+
+   thresholds[0] = base;
+   thresholds[1] = base;
+   if (num_pixels >= RESOLUTION_720P) {
+     // No down-scaling of the two smaller-block thresholds when large intra
+     // partitions are being forced.
+     const int down_shift = force_large ? 0 : 2;
+     thresholds[2] = base >> down_shift;
+     thresholds[3] = base >> down_shift;
+   } else {
+     thresholds[2] = base / 3;
+     thresholds[3] = base >> 1;
+   }
+   thresholds[4] = base << 2;
+ }
+
+ // Adjusts the partition thresholds according to the frame resolution
+ // (num_pixels) and, for frames of at most RESOLUTION_288P, also according to
+ // current_qindex.
+ //   - >= RESOLUTION_720P: thresholds[3] is doubled first.
+ //   - <= RESOLUTION_288P: thresholds[1..3] are recomputed from threshold_base
+ //     using a (low, high) qindex pair picked from qindex_thr[]; between the
+ //     two qindex bounds the values are linearly interpolated.
+ //   - otherwise only thresholds[2] is set, as a multiple of threshold_base
+ //     that depends on resolution, content type and speed.
+ // threshold_base is a by-value parameter; the reassignments below are local.
+ static AOM_INLINE void tune_thresh_based_on_resolution(
+ AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base,
+ int current_qindex, int source_sad_rd, int num_pixels) {
+ if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1;
+ if (num_pixels <= RESOLUTION_288P) {
+ // Each row is { qindex_low_thr, qindex_high_thr }.
+ const int qindex_thr[5][2] = {
+ { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 },
+ };
+ // Row selection: row 0 by default; with var_part_based_on_qidx >= 1 the
+ // speed-feature value is used as row index only for low source SAD; with
+ // var_part_based_on_qidx >= 3 it is used unconditionally.
+ // NOTE(review): this assumes var_part_based_on_qidx <= 4 (the table has
+ // five rows) -- confirm against the speed-feature setup.
+ int th_idx = 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1)
+ th_idx =
+ (source_sad_rd <= kLowSad) ? cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+ th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+ const int qindex_low_thr = qindex_thr[th_idx][0];
+ const int qindex_high_thr = qindex_thr[th_idx][1];
+ if (current_qindex >= qindex_high_thr) {
+ // At or above the high bound: base scaled by 2.5.
+ threshold_base = (5 * threshold_base) >> 1;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base << 2;
+ thresholds[3] = threshold_base << 5;
+ } else if (current_qindex < qindex_low_thr) {
+ // Below the low bound: unscaled base with smaller multipliers.
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base >> 1;
+ thresholds[3] = threshold_base << 3;
+ } else {
+ // In between: weight the "high" values by the distance from the low
+ // bound and the "low" values by the distance from the high bound.
+ int64_t qi_diff_low = current_qindex - qindex_low_thr;
+ int64_t qi_diff_high = qindex_high_thr - current_qindex;
+ int64_t threshold_diff = qindex_high_thr - qindex_low_thr;
+ int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+ // Guard against a zero divisor.
+ threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+ // threshold_base is replaced by its interpolated value here, so the
+ // thresholds[1..3] computed below all use the interpolated base.
+ threshold_base =
+ (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) /
+ threshold_diff;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = ((qi_diff_low * threshold_base) +
+ qi_diff_high * (threshold_base >> 1)) /
+ threshold_diff;
+ thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+ qi_diff_high * (threshold_base << 3)) /
+ threshold_diff;
+ }
+ } else if (num_pixels < RESOLUTION_720P) {
+ thresholds[2] = (5 * threshold_base) >> 2;
+ } else if (num_pixels < RESOLUTION_1080P) {
+ thresholds[2] = threshold_base << 1;
+ } else {
+ // num_pixels >= RESOLUTION_1080P
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (num_pixels < RESOLUTION_1440P) {
+ thresholds[2] = (5 * threshold_base) >> 1;
+ } else {
+ thresholds[2] = (7 * threshold_base) >> 1;
+ }
+ } else {
+ if (cpi->oxcf.speed > 7) {
+ thresholds[2] = 6 * threshold_base;
+ } else {
+ thresholds[2] = 3 * threshold_base;
+ }
+ }
+ }
+ }
+
+ // Raises the base partition threshold for noisy content and returns the
+ // result. The noise boost is only applied to superblocks with low sumdiff,
+ // on the assumption that a superblock whose only change is noise has a low
+ // sumdiff (the noise averages out over a large block). It additionally
+ // requires the noise estimator to be enabled, more than RESOLUTION_480P
+ // pixels, and a frame number above 60. The value is then always passed
+ // through scale_part_thresh_content().
+ static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
+                                                     int64_t threshold_base,
+                                                     int content_lowsumdiff,
+                                                     int num_pixels) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int use_noise_level =
+       cpi->noise_estimate.enabled && content_lowsumdiff &&
+       num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60;
+   int64_t updated_thresh_base = threshold_base;
+
+   if (use_noise_level) {
+     switch (av1_noise_estimate_extract_level(&cpi->noise_estimate)) {
+       case kHigh:
+         updated_thresh_base = (5 * updated_thresh_base) >> 1;
+         break;
+       case kMedium:
+         // The medium-noise boost is skipped whenever large partitions are
+         // already being preferred.
+         if (!cpi->sf.rt_sf.prefer_large_partition_blocks)
+           updated_thresh_base = (5 * updated_thresh_base) >> 2;
+         break;
+       default: break;
+     }
+   }
+   // TODO(kyslov) Enable var based partition adjustment on temporal denoising
+ #if 0  // CONFIG_AV1_TEMPORAL_DENOISING
+   if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+       cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+       updated_thresh_base =
+           av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
+                                 content_state, cpi->svc.temporal_layer_id);
+   else
+     threshold_base =
+         scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
+                                   cm->height, cpi->ppi->rtc_ref.non_reference_frame);
+ #else
+   // Increase base variance threshold based on content_state/sum_diff level.
+   updated_thresh_base = scale_part_thresh_content(
+       updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height,
+       cpi->ppi->rtc_ref.non_reference_frame);
+ #endif
+   return updated_thresh_base;
+ }
+
+ // Computes the variance-based-partition thresholds for the current frame.
+ // The base threshold is the AC quantizer step for `qindex`, multiplied by
+ // 120 on intra-only frames. Intra-only frames are handled entirely by
+ // set_vbp_thresholds_key_frame(); for other frames the base is tuned for
+ // noisy content, thresholds[0], [1] and [3] are seeded from it, and the
+ // resolution- and qindex-based tuning steps then refine the array (the
+ // qindex used by both is cm->quant_params.base_qindex, not `qindex`).
+ static AOM_INLINE void set_vbp_thresholds(
+     AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex,
+     int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd,
+     bool is_segment_id_boosted, int lighting_change) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int intra_only = frame_is_intra_only(cm);
+   const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth);
+   const int split_shift = cpi->sf.rt_sf.var_part_split_threshold_shift;
+   const int num_pixels = cm->width * cm->height;
+   int64_t base = (int64_t)((intra_only ? 120 : 1) * ac_q);
+
+   if (!intra_only) {
+     const int frame_qindex = cm->quant_params.base_qindex;
+
+     base = tune_thresh_noisy_content(cpi, base, content_lowsumdiff, num_pixels);
+     thresholds[0] = base >> 1;
+     thresholds[1] = base;
+     thresholds[3] = base << split_shift;
+
+     tune_thresh_based_on_resolution(cpi, thresholds, base, frame_qindex,
+                                     source_sad_rd, num_pixels);
+     tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, frame_qindex,
+                                 num_pixels, is_segment_id_boosted,
+                                 source_sad_nonrd, lighting_change);
+     return;
+   }
+
+   set_vbp_thresholds_key_frame(cpi, thresholds, base, split_shift, num_pixels);
+ }
+
+ // Sets the "temporal variance is low" flags for a 64x64 superblock.
+ // Slots of part_info->variance_low written here (flags are only ever set to
+ // 1, never cleared):
+ //   [0]      whole 64x64 block
+ //   [1..2]   the two 64x32 partitions
+ //   [3..4]   the two 32x64 partitions
+ //   [5..8]   the four 32x32 quadrants
+ //   [9..24]  the sixteen 16x16 blocks (four per quadrant)
+ // so only the first 25 entries of the array are used in this case.
+ static AOM_INLINE void set_low_temp_var_flag_64x64(
+     CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+     MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col,
+     int mi_row) {
+   switch (xd->mi[0]->bsize) {
+     case BLOCK_64X64:
+       if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+         part_info->variance_low[0] = 1;
+       return;
+     case BLOCK_64X32:
+       for (int half = 0; half < 2; ++half) {
+         if (vt->part_variances.horz[half].variance < (thresholds[0] >> 2))
+           part_info->variance_low[1 + half] = 1;
+       }
+       return;
+     case BLOCK_32X64:
+       for (int half = 0; half < 2; ++half) {
+         if (vt->part_variances.vert[half].variance < (thresholds[0] >> 2))
+           part_info->variance_low[3 + half] = 1;
+       }
+       return;
+     default: break;
+   }
+
+   // Superblock is split further: visit the four 32x32 quadrants in raster
+   // order. Quadrant offsets are 0 or 8 in mi units.
+   for (int quad = 0; quad < 4; ++quad) {
+     const int quad_row = mi_row + ((quad >> 1) << 3);
+     const int quad_col = mi_col + ((quad & 1) << 3);
+
+     // Skip quadrants that start outside the frame.
+     if (quad_col >= mi_params->mi_cols || quad_row >= mi_params->mi_rows)
+       continue;
+
+     MB_MODE_INFO *const quad_mi =
+         mi_params->mi_grid_base[mi_params->mi_stride * quad_row + quad_col];
+     if (quad_mi == NULL) continue;
+
+     if (quad_mi->bsize == BLOCK_32X32) {
+       const int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
+       if (vt->split[quad].part_variances.none.variance < threshold_32x32)
+         part_info->variance_low[5 + quad] = 1;
+       continue;
+     }
+
+     // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+     // inside.
+     if (quad_mi->bsize != BLOCK_16X16 && quad_mi->bsize != BLOCK_32X16 &&
+         quad_mi->bsize != BLOCK_16X32)
+       continue;
+
+     for (int sub = 0; sub < 4; ++sub) {
+       if (vt->split[quad].split[sub].part_variances.none.variance <
+           (thresholds[2] >> 8))
+         part_info->variance_low[9 + (quad << 2) + sub] = 1;
+     }
+   }
+ }
+
+ // Sets the "temporal variance is low" flags for a 128x128 superblock.
+ // Slots of part_info->variance_low written here (flags are only ever set to
+ // 1, never cleared):
+ //   [0]        whole 128x128 block
+ //   [1..2]     the two 128x64 partitions
+ //   [3..4]     the two 64x128 partitions
+ //   [5..8]     the four 64x64 quadrants
+ //   [9..16]    64x32 partitions (two per 64x64 quadrant)
+ //   [17..24]   32x64 partitions (two per 64x64 quadrant)
+ //   [25..40]   32x32 blocks (four per 64x64 quadrant)
+ //   [41..104]  16x16 blocks (four per 32x32 block)
+ // Sub-blocks that start outside the frame, or whose mode-info grid entry is
+ // NULL, are skipped. The frame-bounds test is done before the grid entry is
+ // read, so entries for out-of-frame positions are never dereferenced (this
+ // matches the order used by set_low_temp_var_flag_64x64).
+ static AOM_INLINE void set_low_temp_var_flag_128x128(
+     CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+     MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col,
+     int mi_row) {
+   if (xd->mi[0]->bsize == BLOCK_128X128) {
+     if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+       part_info->variance_low[0] = 1;
+   } else if (xd->mi[0]->bsize == BLOCK_128X64) {
+     for (int part_idx = 0; part_idx < 2; part_idx++) {
+       if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2))
+         part_info->variance_low[part_idx + 1] = 1;
+     }
+   } else if (xd->mi[0]->bsize == BLOCK_64X128) {
+     for (int part_idx = 0; part_idx < 2; part_idx++) {
+       if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2))
+         part_info->variance_low[part_idx + 3] = 1;
+     }
+   } else {
+     // { row offset, column offset } of each quadrant, in mi units.
+     static const int idx64[4][2] = {
+       { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 }
+     };
+     static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+     for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+       const int row64 = mi_row + idx64[lvl1_idx][0];
+       const int col64 = mi_col + idx64[lvl1_idx][1];
+       // Bounds check first: do not touch grid entries outside the frame.
+       if (mi_params->mi_cols <= col64 || mi_params->mi_rows <= row64) continue;
+       const int idx_str = mi_params->mi_stride * row64 + col64;
+       MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str;
+       if (*mi_64 == NULL) continue;
+       const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+       if ((*mi_64)->bsize == BLOCK_64X64) {
+         if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64)
+           part_info->variance_low[5 + lvl1_idx] = 1;
+       } else if ((*mi_64)->bsize == BLOCK_64X32) {
+         for (int part_idx = 0; part_idx < 2; part_idx++)
+           if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance <
+               (threshold_64x64 >> 1))
+             part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1;
+       } else if ((*mi_64)->bsize == BLOCK_32X64) {
+         for (int part_idx = 0; part_idx < 2; part_idx++)
+           if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance <
+               (threshold_64x64 >> 1))
+             part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1;
+       } else {
+         for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+           const int row32 = row64 + idx32[lvl2_idx][0];
+           const int col32 = col64 + idx32[lvl2_idx][1];
+           // Same ordering as above: bounds check before dereference.
+           if (mi_params->mi_cols <= col32 || mi_params->mi_rows <= row32)
+             continue;
+           const int idx_str1 =
+               mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1];
+           MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1;
+           if (*mi_32 == NULL) continue;
+
+           const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
+           if ((*mi_32)->bsize == BLOCK_32X32) {
+             if (vt->split[lvl1_idx]
+                     .split[lvl2_idx]
+                     .part_variances.none.variance < threshold_32x32)
+               part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1;
+           } else {
+             // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+             // inside.
+             if ((*mi_32)->bsize == BLOCK_16X16 ||
+                 (*mi_32)->bsize == BLOCK_32X16 ||
+                 (*mi_32)->bsize == BLOCK_16X32) {
+               for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+                 VPartVar *none_var = &vt->split[lvl1_idx]
+                                           .split[lvl2_idx]
+                                           .split[lvl3_idx]
+                                           .part_variances.none;
+                 if (none_var->variance < (thresholds[3] >> 8))
+                   part_info->variance_low[41 + (lvl1_idx << 4) +
+                                           (lvl2_idx << 2) + lvl3_idx] = 1;
+               }
+             }
+           }
+         }
+       }
+     }
+   }
+ }
+
+ // Checks the temporal variance for bsize >= 16x16 and sets the variance_low
+ // flag of each block whose temporal variance is small. This is only done
+ // when LAST_FRAME was selected as the partition reference; for any other
+ // reference the function does nothing. The variance thresholds can be
+ // adjusted: the higher, the more aggressive.
+ // For a small (64x64) superblock only the first 64x64 sub-tree of vt is used.
+ static AOM_INLINE void set_low_temp_var_flag(
+     AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+     VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+     int mi_col, int mi_row, const bool is_small_sb) {
+   if (ref_frame_partition != LAST_FRAME) return;
+
+   AV1_COMMON *const cm = &cpi->common;
+   if (!is_small_sb) {
+     set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+                                   thresholds, mi_col, mi_row);
+     return;
+   }
+   set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd, &(vt->split[0]),
+                               thresholds, mi_col, mi_row);
+ }
+
+ // Maps the (row, column) position of a 16x16 block inside a 64x64
+ // superblock to its slot in the variance_low array (slots 9..24).
+ static const int pos_shift_16x16[4][4] = {
+   { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+ };
+
+ // Looks up the variance_low flag that applies to a block of size `bsize` at
+ // (mi_row, mi_col) inside a 64x64 superblock. Returns 0 for block sizes or
+ // positions that have no flag.
+ int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize) {
+   // Position relative to the superblock origin (low four bits of the mi
+   // coordinates).
+   const int row_off = mi_row & 0xF;
+   const int col_off = mi_col & 0xF;
+
+   switch (bsize) {
+     case BLOCK_64X64: return variance_low[0];
+     case BLOCK_64X32:
+       // Slot 1 is the block at row offset 0, slot 2 the one below it; a
+       // non-zero column offset has no flag.
+       if (col_off) return 0;
+       return variance_low[row_off ? 2 : 1];
+     case BLOCK_32X64:
+       // Slot 3 is the block at column offset 0, slot 4 the one to its right;
+       // a non-zero row offset has no flag.
+       if (row_off) return 0;
+       return variance_low[col_off ? 4 : 3];
+     case BLOCK_32X32:
+       // Slots 5..8: +1 for a non-zero column offset, +2 for a non-zero row
+       // offset.
+       return variance_low[5 + (col_off ? 1 : 0) + (row_off ? 2 : 0)];
+     case BLOCK_32X16:
+     case BLOCK_16X32:
+     case BLOCK_16X16:
+       // Index by the position of the containing 16x16 block.
+       return variance_low[pos_shift_16x16[row_off >> 2][col_off >> 2]];
+     default: return 0;
+   }
+ }
+
+ // Looks up the variance_low flag that applies to a block of size `bsize` at
+ // (mi_row, mi_col) inside a 128x128 superblock. Returns 0 for block sizes
+ // that have no flag.
+ int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+                                     int mi_col, BLOCK_SIZE bsize) {
+   // Position relative to the superblock origin (low five bits of the mi
+   // coordinates).
+   const int row_in_sb = mi_row & 0x1F;
+   const int col_in_sb = mi_col & 0x1F;
+   // Raster quadrant index (2 * row_bit + col_bit) at each tree level:
+   // bit 4 selects the 64x64 quadrant, bit 3 the 32x32 one, bit 2 the 16x16.
+   const int idx64 = ((row_in_sb >> 4) << 1) + (col_in_sb >> 4);
+   const int idx32 = (((row_in_sb >> 3) & 1) << 1) + ((col_in_sb >> 3) & 1);
+   const int idx16 = (((row_in_sb >> 2) & 1) << 1) + ((col_in_sb >> 2) & 1);
+
+   // Pick the flag based on the block size and block offset.
+   switch (bsize) {
+     case BLOCK_128X128: return variance_low[0];
+     case BLOCK_128X64:
+       assert((mi_col & 0x1F) == 0);
+       return variance_low[1 + (row_in_sb != 0)];
+     case BLOCK_64X128:
+       assert((mi_row & 0x1F) == 0);
+       return variance_low[3 + (col_in_sb != 0)];
+     case BLOCK_64X64:
+       // Location of this 64x64 block inside the 128x128 superblock
+       return variance_low[5 + idx64];
+     case BLOCK_64X32: {
+       const int x = col_in_sb >> 4;  // 64-wide column: 0..1
+       const int y = row_in_sb >> 3;  // 32-high row: 0..3
+       /*
+         .---------------.---------------.
+         | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+         :---------------+---------------:
+         | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+         :---------------+---------------:
+         | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+         :---------------+---------------:
+         | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+         '---------------'---------------'
+       */
+       return variance_low[9 + ((y >> 1) << 2) + (x << 1) + (y & 1)];
+     }
+     case BLOCK_32X64: {
+       const int x = col_in_sb >> 3;  // 32-wide column: 0..3
+       const int y = row_in_sb >> 4;  // 64-high row: 0..1
+       return variance_low[17 + (y << 2) + x];
+     }
+     case BLOCK_32X32: return variance_low[25 + (idx64 << 2) + idx32];
+     case BLOCK_32X16:
+     case BLOCK_16X32:
+     case BLOCK_16X16:
+       return variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+     default: return 0;
+   }
+ }
+
+ // Initializes cpi->vbp_info for variance-based partitioning: fills
+ // vbp_info.thresholds for `qindex` (with zero block SAD, zero source-SAD
+ // levels, no segment boost and no lighting change) and derives
+ // vbp_info.threshold_minmax from `qindex`.
+ // Does nothing unless the partition search type is VAR_BASED_PARTITION.
+ void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex,
+                                            int content_lowsumdiff) {
+   if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+     set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex,
+                        content_lowsumdiff, 0, 0, 0, 0);
+     // The threshold below is not changed locally.
+     cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3);
+   }
+ }
+
+ // Compares the chroma (U and V) SAD of the superblock against its luma SAD
+ // and records the outcome in x->color_sensitivity_sb[] (relative to the
+ // prediction selected for LAST), x->color_sensitivity_sb_g[] (GOLDEN) and
+ // x->color_sensitivity_sb_alt[] (ALTREF).
+ //
+ //   y_sad, y_sad_g, y_sad_alt: luma SADs; y_sad_g / y_sad_alt equal to
+ //                              UINT_MAX mean that reference is not evaluated.
+ //   zero_motion:               selects which buffer the LAST chroma SAD is
+ //                              measured against (see the loop body).
+ //   uv_sad:                    output, indexed by (plane - 1).
+ //
+ // Returns without touching anything on key frames or when monochrome
+ // encoding is enabled.
+ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int y_sad,
+ unsigned int y_sad_g,
+ unsigned int y_sad_alt, bool is_key_frame,
+ bool zero_motion, unsigned int *uv_sad) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ // uv_sad > (y_sad >> shift_upper_limit)  => flag 1.
+ // uv_sad < (y_sad >> shift_lower_limit)  => flag 0.
+ // Anything in between                    => flag 2 (borderline).
+ int shift_upper_limit = 1;
+ int shift_lower_limit = 3;
+ // Divisor applied to the luma SAD for the golden/altref comparison.
+ int fac_uv = 6;
+ if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
+
+ // Use lower threshold (more conservative in setting color flag) for
+ // higher resolutions non-screen, which tend to have more camera noise.
+ // Since this may be used to skip compound mode in nonrd pickmode, which
+ // is generally more effective for higher resolutions, better to be more
+ // conservative.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P)
+ fac_uv = 3;
+ else
+ fac_uv = 5;
+ }
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->rc.high_source_sad) {
+ shift_lower_limit = 7;
+ } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
+ cpi->common.width * cpi->common.height >= 640 * 360) {
+ shift_upper_limit = 2;
+ shift_lower_limit = source_sad_nonrd > kMedSad ? 5 : 4;
+ }
+
+ MB_MODE_INFO *mi = xd->mi[0];
+ const AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ // NOTE(review): these are LAST_FRAME's scale factors, yet they are also
+ // passed to setup_pred_plane() for the golden and altref buffers below --
+ // confirm that all three references share the same scaling.
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ struct buf_2d dst;
+ // Declared outside the loop: when the golden/altref SAD is not recomputed
+ // for a plane, the value from the previous iteration (or the initial 0) is
+ // what gets compared at the end of the loop body.
+ unsigned int uv_sad_g = 0;
+ unsigned int uv_sad_alt = 0;
+
+ for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ // NOTE(review): when bs == BLOCK_INVALID nothing is written to
+ // uv_sad[plane - 1] here, so the comparisons after this block use whatever
+ // the caller stored there -- confirm the caller initializes uv_sad.
+ if (bs != BLOCK_INVALID) {
+ // For last:
+ if (zero_motion) {
+ if (mi->ref_frame[0] == LAST_FRAME) {
+ // The pre[] buffer is used directly.
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ } else {
+ // Current reference is not LAST: point a temporary buffer at the
+ // LAST frame's chroma plane for this block position.
+ uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width,
+ yv12->uv_crop_height, yv12->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, dst.buf, dst.stride);
+ }
+ } else {
+ // Non-zero motion: measure against the plane's dst buffer.
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+ }
+
+ // For golden:
+ if (y_sad_g != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width,
+ yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf,
+ dst.stride);
+ }
+
+ // For altref:
+ if (y_sad_alt != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width,
+ yv12_alt->uv_crop_height, yv12_alt->uv_stride,
+ xd->mi_row, xd->mi_col, sf,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+ dst.buf, dst.stride);
+ }
+ }
+
+ // Three-way classification against the shifted luma SAD.
+ if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1;
+ else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0;
+ // Borderline case: to be refined at coding block level in nonrd_pickmode,
+ // for coding block size < sb_size.
+ else
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2;
+
+ // Golden/altref use a simple two-way test against y_sad / fac_uv.
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] =
+ uv_sad_g > y_sad_g / fac_uv;
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] =
+ uv_sad_alt > y_sad_alt / fac_uv;
+ }
+ }
+
+ // Fills the leaves of the variance tree `vt` for one superblock and
+ // initializes the matching force_split[] entries.
+ //
+ // force_split[] layout as indexed below:
+ //   [0]       whole superblock (only written here when a split is forced)
+ //   [1..4]    64x64 blocks
+ //   [5..20]   32x32 blocks
+ //   [21..84]  16x16 blocks
+ //
+ // Key frames: every 8x8 block is filled from 4x4 source averages; no
+ // variance is computed and no split is forced in this function.
+ // Other frames: every 16x16 block is filled from 8x8 source/reference
+ // average differences, its variance is computed, accumulated into
+ // avg_16x16 / minvar_16x16 / maxvar_16x16 (per 32x32 block), and a split
+ // down to 8x8 is forced when the variance exceeds thresholds[3].
+ // For a small superblock only the first 64x64 sub-tree is processed.
+ static void fill_variance_tree_leaves(
+ AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split,
+ int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4],
+ int64_t *thresholds, const uint8_t *src_buf, int src_stride,
+ const uint8_t *dst_buf, int dst_stride, bool is_key_frame,
+ const bool is_small_sb) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+ // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+ // While this constant is 0 the compute_minmax_8x8() branch below is never
+ // taken.
+ const int compute_minmax_variance = 0;
+ const int segment_id = xd->mi[0]->segment_id;
+ int pixels_wide = 128, pixels_high = 128;
+ int border_offset_4x4 = 0;
+ int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
+ // dst_buf pointer is not used for is_key_frame, so it should be NULL.
+ assert(IMPLIES(is_key_frame, dst_buf == NULL));
+ if (is_small_sb) {
+ pixels_wide = 64;
+ pixels_high = 64;
+ }
+ // Shrink the processed area when the superblock extends past the frame
+ // edge (the edge distances are negative in that case and are scaled down
+ // by 8 to pixels).
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+ #if CONFIG_AV1_TEMPORAL_DENOISING
+ temporal_denoising |= cpi->oxcf.noise_sensitivity;
+ #endif
+ // For temporal filtering or temporal denoiser enabled: since the source
+ // is modified we need to avoid 4x4 avg along superblock boundary, since
+ // simd code will load 8 pixels for 4x4 avg and so can access source
+ // data outside superblock (while its being modified by temporal filter).
+ // Temporal filtering is never done on key frames.
+ // NOTE(review): border_offset_4x4 is only consumed by the key-frame branch
+ // below (fill_variance_4x4avg), but it is only made non-zero when
+ // !is_key_frame, so as written it is always 0 where it is used -- confirm
+ // this is intended.
+ if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) {
+ const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6);
+ const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6);
+ const int blk64_scale_idx = blk64_idx << 2;
+ force_split[blk64_idx + 1] = PART_EVAL_ALL;
+
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5);
+ const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5);
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL;
+ // Per-32x32 accumulators over its four 16x16 blocks. avg_16x16 holds the
+ // running sum of the 16x16 variances; no division is done here.
+ avg_16x16[blk64_idx][lvl1_idx] = 0;
+ maxvar_16x16[blk64_idx][lvl1_idx] = 0;
+ minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX;
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4);
+ const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4);
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ force_split[split_index] = PART_EVAL_ALL;
+ if (is_key_frame) {
+ // Go down to 4x4 down-sampling for variance.
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3);
+ VP8x8 *vst2 = &vst->split[lvl3_idx];
+ fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2,
+ #if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+ #endif
+ pixels_wide, pixels_high, border_offset_4x4);
+ }
+ } else {
+ fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, is_cur_buf_hbd(xd),
+ pixels_wide, pixels_high);
+
+ // Combine the four 8x8 leaves into the 16x16 node, then compute its
+ // variance. none_var refers to the same node as vst->part_variances.none.
+ fill_variance_tree(vst, BLOCK_16X16);
+ VPartVar *none_var = &vt->split[blk64_idx]
+ .split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none;
+ get_variance(none_var);
+ const int val_none_var = none_var->variance;
+ avg_16x16[blk64_idx][lvl1_idx] += val_none_var;
+ minvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ maxvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ if (val_none_var > thresholds[3]) {
+ // 16X16 variance is above threshold for split, so force split to
+ // 8x8 for this 16x16 block (this also forces splits for upper
+ // levels).
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ } else if (!cyclic_refresh_segment_id_boosted(segment_id) &&
+ compute_minmax_variance && val_none_var > thresholds[2]) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above
+ // threshold, force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf,
+ dst_stride, x16_idx, y16_idx,
+ #if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+ #endif
+ pixels_wide, pixels_high);
+ const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax;
+ if (minmax > thresh_minmax) {
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] =
+ PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+// Chooses the reference frame used for variance-based partitioning.
+//
+// GOLDEN (or ALTREF) replaces LAST only when its superblock SAD is below 0.9
+// times the current *y_sad and strictly below the SAD of the other candidate.
+// When one of them wins, the prediction planes are re-pointed at that
+// reference, the block MV is reset to zero, *y_sad takes the winning SAD, and
+// both x->nonrd_prune_ref_frame_search and x->sb_me_partition are cleared.
+// Otherwise *ref_frame_partition is LAST_FRAME and the reference pruning
+// speed feature value is copied into x.
+//
+// NOTE(review): the scale factors of the winning reference are always passed
+// to av1_setup_pre_planes() here, whereas setup_planes() passes NULL when it
+// substituted a buffer from av1_get_scaled_ref_frame() -- confirm intended.
+static AOM_INLINE void set_ref_frame_for_partition(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+    MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi,
+    unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt,
+    const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt,
+    int mi_row, int mi_col, int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const bool golden_wins = *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt;
+  const bool altref_wins = *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g;
+
+  // Neither candidate beats the biased LAST SAD: stay on LAST.
+  if (!golden_wins && !altref_wins) {
+    *ref_frame_partition = LAST_FRAME;
+    x->nonrd_prune_ref_frame_search =
+        cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+    return;
+  }
+
+  // The strict '<' between the two candidate SADs means at most one of the
+  // flags is set, so golden_wins alone identifies the winner from here on.
+  const MV_REFERENCE_FRAME winner = golden_wins ? GOLDEN_FRAME : ALTREF_FRAME;
+  const YV12_BUFFER_CONFIG *const winner_buf = golden_wins ? yv12_g : yv12_alt;
+  const unsigned int winner_sad = golden_wins ? *y_sad_g : *y_sad_alt;
+
+  av1_setup_pre_planes(xd, 0, winner_buf, mi_row, mi_col,
+                       get_ref_scale_factors(cm, winner), num_planes);
+  mi->ref_frame[0] = winner;
+  mi->mv[0].as_int = 0;
+  *y_sad = winner_sad;
+  *ref_frame_partition = winner;
+  x->nonrd_prune_ref_frame_search = 0;
+  x->sb_me_partition = 0;
+}
+
+// Returns the sum of the absolute row and column differences between two
+// full-pel motion vectors (zero only when both components match).
+static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0,
+                                        const FULLPEL_MV *mv1) {
+  const int row_delta = mv0->row - mv1->row;
+  const int col_delta = mv0->col - mv1->col;
+  return abs(row_delta) + abs(col_delta);
+}
+
+// Tests whether the motion vector of the block above or to the left of the
+// superblock gives a lower SAD than the MV currently stored in xd->mi[0].
+//
+// Only inter neighbours whose first reference is LAST_FRAME are considered.
+// Each candidate MV is clamped with the limits produced by
+// av1_set_subpel_mv_search_range(), converted to full-pel, and its SAD is
+// computed only if it differs from the current MV (and, for the left
+// candidate, from the above candidate). A candidate replaces the current MV
+// and *y_sad when its SAD is the smaller of the two candidates and is below
+// multi/8 of *y_sad.
+static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
+                                              unsigned int *y_sad,
+                                              bool is_small_sb,
+                                              int est_motion) {
+  const int sb_sad_level = x->content_state_sb.source_sad_nonrd;
+  const bool est_motion_above_2 = est_motion > 2;
+  // TODO(yunqingwang@google.com): test if this condition works with other
+  // speeds.
+  if (est_motion_above_2 && sb_sad_level > kMedSad) return;
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE sb_bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+  // Clamping limits for the candidate MVs, derived from x->mv_limits.
+  const MV zero_mv = { 0, 0 };
+  SubpelMvLimits clamp_limits;
+  av1_set_subpel_mv_search_range(&clamp_limits, &x->mv_limits, &zero_mv);
+
+  // MV currently stored for the superblock, in full-pel units.
+  const FULLPEL_MV cur_mv = get_fullmv_from_mv(&mi->mv[0].as_mv);
+  // A neighbour must beat multi/8 of the current SAD (7/8 or 8/8).
+  const int multi = (est_motion_above_2 && sb_sad_level > kLowSad) ? 7 : 8;
+
+  // Candidate from the block above.
+  FULLPEL_MV mv_above = kZeroFullMv;
+  unsigned int sad_above = UINT_MAX;
+  if (xd->up_available && xd->above_mbmi->mode >= INTRA_MODE_END &&
+      xd->above_mbmi->ref_frame[0] == LAST_FRAME) {
+    MV clamped = xd->above_mbmi->mv[0].as_mv;
+    clamp_mv(&clamped, &clamp_limits);
+    mv_above = get_fullmv_from_mv(&clamped);
+
+    if (mv_distance(&cur_mv, &mv_above) > 0) {
+      const uint8_t *const ref =
+          get_buf_from_fullmv(&xd->plane[0].pre[0], &mv_above);
+      sad_above = cpi->ppi->fn_ptr[sb_bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, ref,
+          xd->plane[0].pre[0].stride);
+    }
+  }
+
+  // Candidate from the block to the left. It is skipped when it equals the
+  // current MV or the above candidate, since that SAD is already known.
+  FULLPEL_MV mv_left = kZeroFullMv;
+  unsigned int sad_left = UINT_MAX;
+  if (xd->left_available && xd->left_mbmi->mode >= INTRA_MODE_END &&
+      xd->left_mbmi->ref_frame[0] == LAST_FRAME) {
+    MV clamped = xd->left_mbmi->mv[0].as_mv;
+    clamp_mv(&clamped, &clamp_limits);
+    mv_left = get_fullmv_from_mv(&clamped);
+
+    const bool differs_from_cur = mv_distance(&cur_mv, &mv_left) > 0;
+    const bool differs_from_above = mv_distance(&mv_above, &mv_left) > 0;
+    if (differs_from_cur && differs_from_above) {
+      const uint8_t *const ref =
+          get_buf_from_fullmv(&xd->plane[0].pre[0], &mv_left);
+      sad_left = cpi->ppi->fn_ptr[sb_bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, ref,
+          xd->plane[0].pre[0].stride);
+    }
+  }
+
+  // Adopt the better candidate if it is sufficiently below the current SAD.
+  if (sad_above < sad_left && sad_above < ((multi * *y_sad) >> 3)) {
+    *y_sad = sad_above;
+    mi->mv[0].as_mv = get_mv_from_fullmv(&mv_above);
+    clamp_mv(&mi->mv[0].as_mv, &clamp_limits);
+  }
+  if (sad_left < sad_above && sad_left < ((multi * *y_sad) >> 3)) {
+    *y_sad = sad_left;
+    mi->mv[0].as_mv = get_mv_from_fullmv(&mv_left);
+    clamp_mv(&mi->mv[0].as_mv, &clamp_limits);
+  }
+}
+
+// Prepares the prediction planes used by variance-based partitioning of an
+// inter superblock and reports the luma SAD against each candidate reference.
+//
+// GOLDEN and ALTREF are checked first (single spatial layer only); their
+// zero-motion SADs are written to *y_sad_g / *y_sad_alt. LAST is set up next,
+// optionally refined by superblock motion estimation and by the neighbours'
+// MVs, and its SAD is written to *y_sad and *y_sad_last.
+// set_ref_frame_for_partition() then selects the reference (reported through
+// *ref_frame_partition), and the inter predictor is built only when the
+// resulting MV is non-zero. sf_no_scale is installed as the block scale
+// factors when LAST is a scaled reference (scaled_ref_last).
+static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
+                         unsigned int *y_sad_g, unsigned int *y_sad_alt,
+                         unsigned int *y_sad_last,
+                         MV_REFERENCE_FRAME *ref_frame_partition,
+                         struct scale_factors *sf_no_scale, int mi_row,
+                         int mi_col, bool is_small_sb, bool scaled_ref_last) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const BLOCK_SIZE sad_bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+  const int sb_source_sad = x->content_state_sb.source_sad_nonrd;
+
+  const YV12_BUFFER_CONFIG *const last_buf =
+      scaled_ref_last ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+                      : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  assert(last_buf != NULL);
+
+  // Check if LAST is a reference. For spatial layers always use it as
+  // reference scaling.
+  const int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) ||
+                           cpi->svc.number_spatial_layers > 1;
+  const int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG;
+  const int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config ||
+                          cpi->sf.rt_sf.use_nonrd_altref_frame ||
+                          (cpi->sf.rt_sf.use_comp_ref_nonrd &&
+                           cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1);
+
+  // GOLDEN/ALTREF are extra temporal references only with one spatial layer,
+  // and are only checked when the superblock is not static or LAST is unused.
+  const bool check_other_refs =
+      cpi->svc.number_spatial_layers == 1 &&
+      (sb_source_sad != kZeroSad || !use_last_ref);
+
+  // GOLDEN: zero-motion SAD, skipped if it is the same buffer as LAST.
+  const YV12_BUFFER_CONFIG *yv12_g = NULL;
+  if (check_other_refs && use_golden_ref) {
+    bool golden_is_scaled = false;
+    yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+    if (yv12_g != NULL && (yv12_g->y_crop_height != cm->height ||
+                           yv12_g->y_crop_width != cm->width)) {
+      yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      golden_is_scaled = true;
+    }
+    if (yv12_g != NULL && yv12_g != last_buf) {
+      av1_setup_pre_planes(
+          xd, 0, yv12_g, mi_row, mi_col,
+          golden_is_scaled ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME),
+          num_planes);
+      *y_sad_g = cpi->ppi->fn_ptr[sad_bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+  }
+
+  // ALTREF: same procedure, additionally gated on the ALTREF reference flag.
+  const YV12_BUFFER_CONFIG *yv12_alt = NULL;
+  if (check_other_refs && use_alt_ref &&
+      (cpi->ref_frame_flags & AOM_ALT_FLAG)) {
+    bool altref_is_scaled = false;
+    yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+    if (yv12_alt != NULL && (yv12_alt->y_crop_height != cm->height ||
+                             yv12_alt->y_crop_width != cm->width)) {
+      yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+      altref_is_scaled = true;
+    }
+    if (yv12_alt != NULL && yv12_alt != last_buf) {
+      av1_setup_pre_planes(
+          xd, 0, yv12_alt, mi_row, mi_col,
+          altref_is_scaled ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME),
+          num_planes);
+      *y_sad_alt = cpi->ppi->fn_ptr[sad_bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+  }
+
+  if (use_last_ref) {
+    av1_setup_pre_planes(
+        xd, 0, last_buf, mi_row, mi_col,
+        scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME),
+        num_planes);
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE_FRAME;
+    mi->bsize = sb_size;
+    mi->mv[0].as_int = 0;
+    mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+    int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition;
+    // TODO(b/290596301): Look into adjusting this condition.
+    // There is regression on color content when
+    // estimate_motion_for_var_based_partition = 3 and high motion,
+    // so for now force it to 2 based on superblock sad.
+    if (est_motion > 2 && sb_source_sad > kMedSad) est_motion = 2;
+
+    // Run int_pro_motion only when the right/bottom edge offsets are
+    // non-negative, and only for spatial variance above threshold and motion
+    // level above LowSad.
+    const bool within_right_bottom_edges =
+        xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0;
+    const bool textured_and_moving =
+        x->source_variance > 100 && sb_source_sad > kLowSad;
+    if ((est_motion == 1 || est_motion == 2) && within_right_bottom_edges &&
+        textured_and_moving) {
+      const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+      const int me_search_size_col =
+          is_screen ? 96 : (block_size_wide[sb_size] >> 1);
+      // For screen use larger search size row motion to capture
+      // vertical scroll, which can be larger motion.
+      const int me_search_size_row =
+          is_screen ? 192 : (block_size_high[sb_size] >> 1);
+      unsigned int y_sad_zero;
+      *y_sad = av1_int_pro_motion_estimation(
+          cpi, x, sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero,
+          me_search_size_col, me_search_size_row);
+      // The logic below selects whether the motion estimated in the
+      // int_pro_motion() will be used in nonrd_pickmode. Only do this
+      // for screen for now.
+      if (is_screen) {
+        const unsigned int thresh_sad =
+            (sb_size == BLOCK_128X128) ? 50000 : 20000;
+        const bool keep_estimated_mv =
+            *y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad;
+        if (keep_estimated_mv) {
+          x->sb_me_partition = 1;
+          x->sb_me_mv.as_int = mi->mv[0].as_int;
+        } else {
+          // Fall back to using zero motion.
+          x->sb_me_partition = 0;
+          *y_sad = y_sad_zero;
+          mi->mv[0].as_int = 0;
+        }
+      }
+    }
+
+    // No SAD has been computed so far: use the zero-motion SAD against LAST.
+    if (*y_sad == UINT_MAX) {
+      *y_sad = cpi->ppi->fn_ptr[sad_bsize].sdf(
+          x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+          xd->plane[AOM_PLANE_Y].pre[0].buf,
+          xd->plane[AOM_PLANE_Y].pre[0].stride);
+    }
+
+    // Evaluate if neighbours' MVs give better predictions. Zero MV is tested
+    // already, so only non-zero MVs are tested here. Here the neighbour blocks
+    // are the first block above or left to this superblock.
+    if (est_motion >= 2 && (xd->up_available || xd->left_available))
+      evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion);
+
+    *y_sad_last = *y_sad;
+  }
+
+  // Pick the ref frame for partitioning, use golden or altref frame only if
+  // it has lower sad, bias to LAST with factor 0.9.
+  set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad,
+                              y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row,
+                              mi_col, num_planes);
+
+  // Only calculate the predictor for non-zero MV.
+  if (mi->mv[0].as_int == 0) return;
+
+  if (scaled_ref_last) {
+    xd->block_ref_scale_factors[0] = sf_no_scale;
+    xd->block_ref_scale_factors[1] = sf_no_scale;
+  } else {
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+  }
+  av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, sb_size,
+                                AOM_PLANE_Y, num_planes - 1);
+}
+
+// Chooses between PARTITION_SPLIT and PARTITION_NONE for a 16x16 block in
+// variance based partitioning by looking at the spread of the variances of
+// its four 8x8 sub-blocks.
+//
+// Returns PART_EVAL_ONLY_SPLIT when (max - min) of the 8x8 variances exceeds
+// four times threshold16, and PART_EVAL_ONLY_NONE otherwise.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+    VP16x16 *var_16x16_info, int64_t threshold16) {
+  int largest_var = 0;
+  int smallest_var = INT_MAX;
+  for (int i = 0; i < 4; ++i) {
+    // get_variance() is called before the variance field is read.
+    get_variance(&var_16x16_info->split[i].part_variances.none);
+    const int sub_blk_var = var_16x16_info->split[i].part_variances.none.variance;
+    if (sub_blk_var > largest_var) largest_var = sub_blk_var;
+    if (sub_blk_var < smallest_var) smallest_var = sub_blk_var;
+  }
+
+  // The shift factor applied to threshold16 has been derived empirically.
+  const int var_spread = largest_var - smallest_var;
+  if (var_spread > (threshold16 << 2)) return PART_EVAL_ONLY_SPLIT;
+  return PART_EVAL_ONLY_NONE;
+}
+
+// Maps the set_zeromv_skip_based_on_source_sad speed-feature level to the
+// largest superblock source SAD class for which zero-MV skip may be forced:
+//   level <= 0 : never
+//   level == 1 : source SAD equal to kZeroSad
+//   level == 2 : source SAD up to kVeryLowSad
+//   level >= 3 : source SAD up to kLowSad
+static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad(
+    int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) {
+  const int level = set_zeromv_skip_based_on_source_sad;
+
+  if (level <= 0) return false;
+  if (level == 1) return source_sad_nonrd == kZeroSad;
+  if (level == 2) return source_sad_nonrd <= kVeryLowSad;
+  return source_sad_nonrd <= kLowSad;
+}
+
+// Decides whether the whole superblock can be coded as a single zero-MV skip
+// block.
+//
+// When the speed feature allows it for the current source SAD class, the
+// superblock lies fully inside the tile, and the luma and both chroma SADs
+// are below their exit thresholds, the partition is set to bsize,
+// x->force_zeromv_skip_for_sb is set to 1, vt is freed and true is returned;
+// the caller must then exit without touching vt. In every other case false
+// is returned and vt is left alone; x->force_zeromv_skip_for_sb is set to 2
+// if the source SAD is kZeroSad and part_early_exit_zeromv >= 2.
+static AOM_INLINE bool set_force_zeromv_skip_for_sb(
+    AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt,
+    unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad,
+    BLOCK_SIZE bsize) {
+  if (!is_set_force_zeromv_skip_based_on_src_sad(
+          cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
+          x->content_state_sb.source_sad_nonrd))
+    return false;
+
+  AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int sb_mi_wide = mi_size_wide[sb_size];
+  const int sb_mi_high = mi_size_high[sb_size];
+
+  const int thresh_shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 1 : 0;
+  const unsigned int y_limit = cpi->zeromv_skip_thresh_exit_part[bsize]
+                               << thresh_shift;
+  unsigned int uv_limit = CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(y_limit)
+                          << thresh_shift;
+  // Be more aggressive in UV threshold if source_sad >= VeryLowSad
+  // to suppress visual artifact caused by the speed feature:
+  // set_zeromv_skip_based_on_source_sad = 2. For now only for
+  // part_early_exit_zeromv = 1.
+  if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad &&
+      cpi->sf.rt_sf.part_early_exit_zeromv == 1)
+    uv_limit >>= 3;
+
+  const bool sb_inside_tile = mi_col + sb_mi_wide <= tile->mi_col_end &&
+                              mi_row + sb_mi_high <= tile->mi_row_end;
+  const bool sads_below_limits =
+      y_sad < y_limit && uv_sad[0] < uv_limit && uv_sad[1] < uv_limit;
+
+  if (sb_inside_tile && sads_below_limits) {
+    set_block_size(cpi, mi_row, mi_col, bsize);
+    x->force_zeromv_skip_for_sb = 1;
+    aom_free(vt);
+    // Partition shape is set here at SB level.
+    // Exit needs to happen from av1_choose_var_based_partitioning().
+    return true;
+  }
+
+  if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+      cpi->sf.rt_sf.part_early_exit_zeromv >= 2)
+    x->force_zeromv_skip_for_sb = 2;
+  return false;
+}
+
+// Variance based partition selection for one superblock.
+//
+// Builds a tree of variances for the superblock at (mi_row, mi_col) and
+// walks it top-down, setting the block size at the first level whose
+// variance is accepted by set_vt_partitioning() (or 8x8 at the bottom).
+// For non key frames the variance is taken between the source and the
+// prediction set up by setup_planes(); for key frames only the source is
+// used.
+//
+//   cpi     Top level encoder structure.
+//   tile    Tile that contains the superblock.
+//   td      Thread data; td->vt64x64 provides storage for the 64x64 nodes.
+//   x       Macroblock state of the superblock.
+//   mi_row  Row of the superblock in MI units.
+//   mi_col  Column of the superblock in MI units.
+//
+// Always returns 0. Side effects visible here include the block sizes set
+// via set_block_size()/set_vt_partitioning(), x->source_variance,
+// x->force_zeromv_skip_for_sb, x->part_search_info.variance_low and the
+// color sensitivity set by chroma_check().
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, choose_var_based_partitioning_time);
+#endif
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
+  PART_EVAL_STATUS force_split[85];
+  int avg_64x64;
+  int max_var_32x32[4];
+  int min_var_32x32[4];
+  int var_32x32;
+  int var_64x64;
+  int min_var_64x64 = INT_MAX;
+  int max_var_64x64 = 0;
+  int avg_16x16[4][4];
+  int maxvar_16x16[4][4];
+  int minvar_16x16[4][4];
+  const uint8_t *src_buf;
+  const uint8_t *dst_buf;
+  int dst_stride;
+  unsigned int uv_sad[MAX_MB_PLANE - 1];
+  NOISE_LEVEL noise_level = kLow;
+  bool is_zero_motion = true;
+  bool scaled_ref_last = false;
+  struct scale_factors sf_no_scale;
+  av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+                                    cm->width, cm->height);
+
+  bool is_key_frame =
+      (frame_is_intra_only(cm) ||
+       (cpi->ppi->use_svc &&
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+  assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+         cm->seq_params->sb_size == BLOCK_128X128);
+  const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+  const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+  unsigned int y_sad = UINT_MAX;
+  unsigned int y_sad_g = UINT_MAX;
+  unsigned int y_sad_alt = UINT_MAX;
+  unsigned int y_sad_last = UINT_MAX;
+  BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+  // Local copy of the split thresholds; set_vbp_thresholds() adjusts it for
+  // this superblock.
+  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+                            vbp_thresholds[2], vbp_thresholds[3],
+                            vbp_thresholds[4] };
+
+  const int segment_id = xd->mi[0]->segment_id;
+  uint64_t blk_sad = 0;
+  if (cpi->src_sad_blk_64x64 != NULL &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                  ? (cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
+    const int sb_cols =
+        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sbi_col = mi_col / sb_size_by_mb;
+    const int sbi_row = mi_row / sb_size_by_mb;
+    blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+  }
+
+  const bool is_segment_id_boosted =
+      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id);
+  const int qindex =
+      is_segment_id_boosted
+          ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex)
+          : cm->quant_params.base_qindex;
+  set_vbp_thresholds(
+      cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff,
+      x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd,
+      is_segment_id_boosted, x->content_state_sb.lighting_change);
+
+  src_buf = x->plane[AOM_PLANE_Y].src.buf;
+  int src_stride = x->plane[AOM_PLANE_Y].src.stride;
+
+  // Index for force_split: 0 for the whole superblock (128x128 level),
+  // 1-4 for the 64x64 blocks, 5-20 for the 32x32 blocks and 21-84 for the
+  // 16x16 blocks.
+  force_split[0] = PART_EVAL_ALL;
+  memset(x->part_search_info.variance_low, 0,
+         sizeof(x->part_search_info.variance_low));
+
+  // Check if LAST frame is NULL, and if so, treat this frame
+  // as a key frame, for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dynamically and the only reference is the spatial(GOLDEN).
+  // If LAST frame has a different resolution: set the scaled_ref_last flag
+  // and check if ref_scaled is NULL.
+  if (!frame_is_intra_only(cm)) {
+    const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+    if (ref == NULL) {
+      is_key_frame = true;
+    } else if (ref->y_crop_height != cm->height ||
+               ref->y_crop_width != cm->width) {
+      scaled_ref_last = true;
+      const YV12_BUFFER_CONFIG *ref_scaled =
+          av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (ref_scaled == NULL) is_key_frame = true;
+    }
+  }
+
+  x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance, only for superblocks with
+  // some motion for now. This input can then be used to bias the partitioning
+  // or the chroma_check.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+      x->content_state_sb.source_sad_nonrd > kLowSad)
+    x->source_variance = av1_get_perpixel_variance_facade(
+        cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
+  if (!is_key_frame) {
+    setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
+                 &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+                 is_small_sb, scaled_ref_last);
+
+    MB_MODE_INFO *mi = xd->mi[0];
+    // Use reference SB directly for zero mv.
+    if (mi->mv[0].as_int != 0) {
+      dst_buf = xd->plane[AOM_PLANE_Y].dst.buf;
+      dst_stride = xd->plane[AOM_PLANE_Y].dst.stride;
+      is_zero_motion = false;
+    } else {
+      dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf;
+      dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride;
+    }
+  } else {
+    dst_buf = NULL;
+    dst_stride = 0;
+  }
+
+  // check and set the color sensitivity of sb.
+  av1_zero(uv_sad);
+  chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame,
+               is_zero_motion, uv_sad);
+
+  x->force_zeromv_skip_for_sb = 0;
+
+  VP128x128 *vt;
+  AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
+  vt->split = td->vt64x64;
+
+  // If the superblock is completely static (zero source sad) and
+  // the y_sad (relative to LAST ref) is very small, take the sb_size partition
+  // and exit, and force zeromv_last skip mode for nonrd_pickmode.
+  // Only do this on the base segment (so the QP-boosted segment, if applied,
+  // can still continue cleaning/ramping up the quality).
+  // Condition on color uv_sad is also added.
+  if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
+      cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE &&
+      ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) {
+    // Exit here, if zero mv skip flag is set at SB level. The helper has
+    // already freed vt in that case, so only the timer needs closing.
+    if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col,
+                                     y_sad, bsize)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, choose_var_based_partitioning_time);
+#endif
+      return 0;
+    }
+  }
+
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+
+  // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames)
+  // variances for splits.
+  fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
+                            minvar_16x16, thresholds, src_buf, src_stride,
+                            dst_buf, dst_stride, is_key_frame, is_small_sb);
+
+  // Frame-level inputs of the 32x32 split decision; they do not depend on
+  // the block being visited, so compute them once.
+  const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P;
+  uint64_t frame_sad_thresh = 20000;
+  if (cpi->svc.number_temporal_layers > 2 && cpi->svc.temporal_layer_id == 0)
+    frame_sad_thresh = frame_sad_thresh << 1;
+
+  avg_64x64 = 0;
+  for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+    max_var_32x32[blk64_idx] = 0;
+    min_var_32x32[blk64_idx] = INT_MAX;
+    const int blk64_scale_idx = blk64_idx << 2;
+    for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+      const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+      // On key frames the leaves are 4x4 averages, so the 8x8 and 16x16
+      // levels still have to be accumulated here.
+      if (is_key_frame) {
+        for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+          VP16x16 *vtemp =
+              &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+          for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++)
+            fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8);
+          fill_variance_tree(vtemp, BLOCK_16X16);
+          // If variance of this 16x16 block is above the threshold, force
+          // block to split. This also forces a split on the upper levels.
+          get_variance(&vtemp->part_variances.none);
+          if (vtemp->part_variances.none.variance > thresholds[3]) {
+            const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+            force_split[split_index] =
+                cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+                    ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+                    : PART_EVAL_ONLY_SPLIT;
+            force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+            force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+            force_split[0] = PART_EVAL_ONLY_SPLIT;
+          }
+        }
+      }
+      fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32);
+      // If variance of this 32x32 block is above the threshold, or if it is
+      // above (some threshold of) the average variance over the sub-16x16
+      // blocks, then force this block to split. This also forces a split on
+      // the upper (64x64) level.
+      if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) {
+        get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none);
+        var_32x32 =
+            vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance;
+        max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]);
+        min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]);
+        const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] -
+                                            minvar_16x16[blk64_idx][lvl1_idx]);
+
+        if (var_32x32 > thresholds[2] ||
+            (!is_key_frame && var_32x32 > (thresholds[2] >> 1) &&
+             var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) {
+          force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+          force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        } else if (!is_key_frame && is_360p_or_smaller &&
+                   ((max_min_var_16X16_diff > (thresholds[2] >> 1) &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) ||
+                    (cpi->sf.rt_sf.prefer_large_partition_blocks &&
+                     x->content_state_sb.source_sad_nonrd > kLowSad &&
+                     cpi->rc.frame_source_sad < frame_sad_thresh &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) &&
+                     maxvar_16x16[blk64_idx][lvl1_idx] >
+                         (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) {
+          force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+          force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+          force_split[0] = PART_EVAL_ONLY_SPLIT;
+        }
+      }
+    }
+    if (force_split[1 + blk64_idx] == PART_EVAL_ALL) {
+      fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64);
+      get_variance(&vt->split[blk64_idx].part_variances.none);
+      var_64x64 = vt->split[blk64_idx].part_variances.none.variance;
+      max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+      min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If both the max-min spread of the 32x32 sub-block variances and the
+      // max sub-block variance are above their thresholds, force this block
+      // to split. Only checked for noise level >= medium, when the encoder
+      // is in SVC mode, or when large partition blocks are preferred.
+      const int max_min_var_32x32_diff =
+          max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx];
+      const int check_max_var =
+          max_var_32x32[blk64_idx] > (thresholds[1] >> 1);
+      const bool check_noise_lvl = noise_level >= kMedium ||
+                                   cpi->ppi->use_svc ||
+                                   cpi->sf.rt_sf.prefer_large_partition_blocks;
+      const int64_t set_threshold = 3 * (thresholds[1] >> 3);
+
+      if (!is_key_frame && max_min_var_32x32_diff > set_threshold &&
+          check_max_var && check_noise_lvl) {
+        force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT;
+        force_split[0] = PART_EVAL_ONLY_SPLIT;
+      }
+      avg_64x64 += var_64x64;
+    }
+    // A 64x64 superblock has no 128x128 level: always descend from index 0.
+    if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT;
+  }
+
+  if (force_split[0] == PART_EVAL_ALL) {
+    fill_variance_tree(vt, BLOCK_128X128);
+    get_variance(&vt->part_variances.none);
+    const int set_avg_64x64 = (9 * avg_64x64) >> 5;
+    if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64)
+      force_split[0] = PART_EVAL_ONLY_SPLIT;
+
+    if (!is_key_frame &&
+        (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+        max_var_64x64 > (thresholds[0] >> 1))
+      force_split[0] = PART_EVAL_ONLY_SPLIT;
+  }
+
+  if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
+      !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
+    for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+      const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4);
+      const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4);
+      const int blk64_scale_idx = blk64_idx << 2;
+
+      // Now go through the entire structure, splitting every block size until
+      // we get to one that's got a variance lower than our threshold.
+      if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64,
+                              mi_row + y64_idx, mi_col + x64_idx, thresholds[1],
+                              BLOCK_16X16, force_split[1 + blk64_idx]))
+        continue;
+      for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) {
+        const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3);
+        const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3);
+        const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+        if (set_vt_partitioning(
+                cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx],
+                BLOCK_32X32, (mi_row + y64_idx + y32_idx),
+                (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16,
+                force_split[5 + blk64_scale_idx + lvl1_idx]))
+          continue;
+        for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) {
+          const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2);
+          const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2);
+          const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+          VP16x16 *vtemp =
+              &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+          if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16,
+                                  mi_row + y64_idx + y32_idx + y16_idx,
+                                  mi_col + x64_idx + x32_idx + x16_idx,
+                                  thresholds[3], BLOCK_8X8,
+                                  force_split[split_index]))
+            continue;
+          // Nothing accepted down to 16x16: fall back to four 8x8 blocks.
+          for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) {
+            const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1);
+            const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1);
+            set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+                           (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+                           BLOCK_8X8);
+          }
+        }
+      }
+    }
+  }
+
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
+    set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds,
+                          ref_frame_partition, mi_col, mi_row, is_small_sb);
+  }
+
+  aom_free(vt);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, choose_var_based_partitioning_time);
+#endif
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/var_based_part.h b/third_party/aom/av1/encoder/var_based_part.h
new file mode 100644
index 0000000000..f912458307
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+// Calculate block index x and y from split level and index
+#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
+#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LARGE_BLOCK_THR \
+ 100 // Use increased thresholds for midres for speed 9 when qindex is above
+ // this threshold
+
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+ ((3 * (thresh_exit_part)) >> 2)
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for the following block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is
+ * currently only used on key frame. The thresholds are based on Q, resolution,
+ * noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q q index
+ * \param[in] content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+ int content_lowsumdiff);
+
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * residual generated as the difference between the source and prediction.
+ * The prediction is the reconstructed LAST or reconstructed GOLDEN, whichever
+ * has lower y sad. For LAST, option exists (speed feature) to use motion
+ * compensation based on superblock motion via int_pro_motion_estimation. For
+ * key frames reference is fixed 128 level, so variance is the source variance.
+ * The variance is computed for downsampled inputs (8x8 or 4x4 downsampled),
+ * and selection is done top-down via a set of partition thresholds, defined
+ * for each block level, and set based on Q, resolution, noise level, and
+ * content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tile Pointer to TileInfo
+ * \param[in] td Pointer to ThreadData
+ * \param[in] x Pointer to MACROBLOCK
+ * \param[in] mi_row Row coordinate of the superblock in a step
+ size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the super block in a step
+ size of MI_SIZE
+ *
+ * \return Always returns 0. The selected partition is recorded per block via
+ * set_block_size(). Also sets the low temporal variance flag and the color
+ * sensitivity flag (both used in nonrd_pickmode).
+ */
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col);
+
+// Read out the block's temporal variance for 64x64 SB case.
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+// Read out the block's temporal variance for 128x128 SB case.
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..40670178d7
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0), and r1 is (source - p1), which in turn
+ * is equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+                                        const uint8_t *m, int N) {
+  // Running sum of squared per-pixel terms; the scale is removed on return.
+  uint64_t sum_sq = 0;
+
+  for (int k = 0; k < N; ++k) {
+    const int32_t blended = MAX_MASK_VALUE * r1[k] + m[k] * d[k];
+    const int32_t term = clamp(blended, INT16_MIN, INT16_MAX);
+    sum_sq += term * term;
+  }
+  return ROUND_POWER_OF_TWO(sum_sq, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored on 16 bits. Real input residuals
+ * being small, this should not cause a noticeable issue.
+ */
+int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m,
+                                       int N, int64_t limit) {
+  int64_t acc = 0;
+
+  // Scalar product of ds and m. The loop tests N before the first read, so
+  // N <= 0 touches no memory and compares an empty sum (0) against limit.
+  for (int i = 0; i < N; ++i) acc += ds[i] * m[i];
+
+  return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+                                       const int16_t *b, int N) {
+  for (int k = 0; k < N; ++k) {
+    const int sq_diff = a[k] * a[k] - b[k] * b[k];
+    d[k] = clamp(sq_diff, INT16_MIN, INT16_MAX);  // saturate to int16 range
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000000..494b0fdf15
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride) {  // 32-point butterfly network (forward DCT, per the name)
+ __m128i buf0[32];  // buf0/buf1 alternate: each stage reads one array and
+ __m128i buf1[32];  // writes the other
+ const int32_t *cospi;
+
+ int startidx = 0 * stride;  // index of element 0; advances by stride
+ int endidx = 31 * stride;  // index of element 31; retreats by stride
+ // stage 0: no arithmetic here; stage 1 reads input[] directly
+ // stage 1: sum/difference of mirrored inputs i and 31 - i (i = 0..15)
+ buf1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2: fold 0..15 (i with 15 - i); cospi[32] butterflies on 20..27
+ cospi = cospi_arr(cos_bit);  // weight table selected by cos_bit
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3: fold 0..7, 16..23 and 24..31; cospi[32] butterflies on 10..13
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4: fold 0..3, 8..11, 12..15; butterflies on (5,6) and 18..21/26..29
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5: butterflies on (0,1), (2,3), (9,14), (10,13); add/sub elsewhere
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6: 0..3 pass through; butterflies on 4..7 and within 17..30
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7: 0..7 pass through; butterflies on 8..15; add/sub pairs in 16..31
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8: 0..15 pass through; butterflies on pairs (16,31) .. (23,24)
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ startidx = 0 * stride;
+ endidx = 31 * stride;
+ // stage 9: output element k receives buf0[5-bit bit-reversal of k]
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
+
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range) {
+  // Four 32-bit lanes fit in one __m128i, so a 4-point transform occupies
+  // 4 / 4 = 1 vector column.
+  const int lanes_per_vec = 4;
+  const int num_cols = 4 / lanes_per_vec;
+  const int32_t *const cospi = cospi_arr(cos_bit);
+  const __m128i zero = _mm_setzero_si128();
+
+  // Ping-pong scratch rows: every stage reads one array and writes the other.
+  __m128i ping[4];
+  __m128i pong[4];
+  // stage_range is accepted but never read here.
+  (void)stage_range;
+
+  for (int c = 0; c < num_cols; ++c) {
+    // stage 0: load the four rows of column c.
+    for (int r = 0; r < 4; ++r) {
+      ping[r] = input[r * num_cols + c];
+    }
+
+    // stage 1: rotate the rows by one position (row 3 moves to the front).
+    pong[0] = ping[3];
+    pong[1] = ping[0];
+    pong[2] = ping[1];
+    pong[3] = ping[2];
+
+    // stage 2: weighted butterflies on row pairs (0,1) and (2,3).
+    btf_32_sse4_1_type0(cospi[8], cospi[56], pong[0], pong[1], ping[0], ping[1],
+                        cos_bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], pong[2], pong[3], ping[2],
+                        ping[3], cos_bit);
+
+    // stage 3: sums and differences of rows two apart.
+    pong[0] = _mm_add_epi32(ping[0], ping[2]);
+    pong[1] = _mm_add_epi32(ping[1], ping[3]);
+    pong[2] = _mm_sub_epi32(ping[0], ping[2]);
+    pong[3] = _mm_sub_epi32(ping[1], ping[3]);
+
+    // stage 4: rows 0 and 1 pass through; rows 2 and 3 share a cospi[32]
+    // butterfly.
+    ping[0] = pong[0];
+    ping[1] = pong[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], pong[2], pong[3], ping[2],
+                        ping[3], cos_bit);
+
+    // stage 5: reorder the rows, negating (0 - x) the two odd outputs, and
+    // store them to column c of the destination.
+    output[0 * num_cols + c] = ping[0];
+    output[1 * num_cols + c] = _mm_sub_epi32(zero, ping[2]);
+    output[2 * num_cols + c] = ping[3];
+    output[3 * num_cols + c] = _mm_sub_epi32(zero, ping[1]);
+  }
+}
+
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+ // stage 11
+ output[startidx] = x10[0];
+ output[endidx] = x10[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[32];
+ output[endidx] = x10[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[16];
+ output[endidx] = x10[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[48];
+ output[endidx] = x10[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[8];
+ output[endidx] = x10[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[40];
+ output[endidx] = x10[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[24];
+ output[endidx] = x10[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[56];
+ output[endidx] = x10[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[4];
+ output[endidx] = x10[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[36];
+ output[endidx] = x10[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[20];
+ output[endidx] = x10[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[52];
+ output[endidx] = x10[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[12];
+ output[endidx] = x10[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[44];
+ output[endidx] = x10[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[28];
+ output[endidx] = x10[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[60];
+ output[endidx] = x10[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[2];
+ output[endidx] = x10[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[34];
+ output[endidx] = x10[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[18];
+ output[endidx] = x10[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[50];
+ output[endidx] = x10[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[10];
+ output[endidx] = x10[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[42];
+ output[endidx] = x10[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[26];
+ output[endidx] = x10[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[58];
+ output[endidx] = x10[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[6];
+ output[endidx] = x10[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[38];
+ output[endidx] = x10[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[22];
+ output[endidx] = x10[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[54];
+ output[endidx] = x10[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[14];
+ output[endidx] = x10[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[46];
+ output[endidx] = x10[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[30];
+ output[endidx] = x10[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[62];
+ output[endidx] = x10[1];
+}
+
+// Scales 32 vectors by 4: every 32-bit lane of each visited element is
+// shifted left by 2 bits. Visited elements sit col_num entries apart,
+// starting at index 0, and each result is stored at the same index of
+// output. cos_bit is accepted but not used.
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int col_num) {
+  (void)cos_bit;
+  int row = 0;
+  while (row < 32) {
+    // Same offset is used for the load and the store.
+    const int offset = row * col_num;
+    const __m128i src = input[offset];
+    output[offset] = _mm_slli_epi32(src, 2);
+    ++row;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 0000000000..b143df3523
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,3010 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,  // forward-DCT butterfly network over 16 vectors: reads input[0..15], writes output[0..15]
+ int8_t cos_bit) {  // cos_bit selects the cospi table and sets the rounding term _r
+ const int32_t *cospi = cospi_arr(cos_bit);  // constant table; presumably fixed-point cosines at cos_bit precision -- TODO confirm
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 2^(cos_bit-1) in each 32-bit lane; presumably the rounding offset applied inside btf_16_w16_avx2
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);  // naming: cospi_<sign><a>_<sign><b> packs (+-cospi[a], +-cospi[b]); p = plus, m = minus
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1: pair input[i] with input[15 - i]; results land in x1[i] and x1[15 - i]
+ __m256i x1[16];  // working set; every later stage updates it in place, so statement order matters
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);  // presumably sum -> first output, difference -> second (per helper name) -- confirm
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2: in-place butterflies pairing x1[i] with x1[7 - i]; weighted butterflies on (10,13) and (11,12)
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3: butterflies pairing x1[i] with x1[3 - i]; weighted butterfly on (5,6); butterflies inside x1[8..15]
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);  // note the higher index is passed first here and on the next line
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4: last update of x1[0..3]; butterflies on x1[4..7]; weighted butterflies on (9,14) and (10,13)
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5: last update of x1[4..7]; butterflies on adjacent pairs inside x1[8..15]
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6: last update of x1[8..15], pairing x1[8 + i] with x1[15 - i]
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7: store in bit-reversed order: output[k] = x1[k with its 4 index bits reversed]
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output,  // forward-DCT butterfly network over 32 vectors: reads input[0..31], writes output[0..31]
+ int8_t cos_bit) {  // cos_bit selects the cospi table and sets the rounding term _r
+ const int32_t *cospi = cospi_arr(cos_bit);  // constant table; presumably fixed-point cosines at cos_bit precision -- TODO confirm
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 2^(cos_bit-1) in each 32-bit lane; presumably the rounding offset applied inside btf_16_w16_avx2
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);  // naming: cospi_<sign><a>_<sign><b> packs (+-cospi[a], +-cospi[b]); p = plus, m = minus
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1: pair input[i] with input[31 - i]; results land in x1[i] and x1[31 - i]
+ __m256i x1[32];  // working set; every later stage updates it in place, so statement order matters
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);  // presumably sum -> first output, difference -> second (per helper name) -- confirm
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: in-place butterflies pairing x1[i] with x1[15 - i]; weighted butterflies on (20,27) .. (23,24)
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: butterflies pairing x1[i] with x1[7 - i]; weighted butterflies on (10,13), (11,12); butterflies inside x1[16..23] and x1[24..31]
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);  // note the higher index is passed first for the x1[24..31] group
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4: butterflies pairing x1[i] with x1[3 - i]; weighted butterfly on (5,6); butterflies inside x1[8..15]; weighted butterflies on (18,29) .. (21,26)
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: last update of x1[0..3]; butterflies on x1[4..7]; weighted butterflies on (9,14), (10,13); butterflies inside x1[16..31]
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6: last update of x1[4..7]; butterflies on adjacent pairs inside x1[8..15]; weighted butterflies on (17,30), (18,29), (21,26), (22,25)
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: last update of x1[8..15]; butterflies on adjacent pairs inside x1[16..31]
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8: last update of x1[16..31], pairing x1[16 + i] with x1[31 - i]
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: store in bit-reversed order: output[k] = x1[k with its 5 index bits reversed]
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void fdct32_avx2(const __m256i *input, __m256i *output,  // Staged butterfly network over 32 __m256i vectors; presumably the 32-point forward DCT the name suggests (the btf_* helpers are defined elsewhere -- confirm there).
+ int8_t cos_bit) {  // input[0..31] is read only in stage 1; output[0..31] is written only in stage 9; cos_bit is passed to cospi_arr() and fixes the constant _r.
+ __m256i x1[32];  // working set: filled by stage 1, then handed element-by-element (by address) to the btf_* helpers in stages 2-8
+ const int32_t *cospi = cospi_arr(cos_bit);  // weight table; this function reads only even indices in 2..62
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 1 << (cos_bit - 1) in every 32-bit lane (needs cos_bit >= 1); presumably the rounding offset the btf_32_avx2_type* helpers apply before shifting by cos_bit -- confirm in their definitions
+ // stage 0 (no statements: stage 1 reads input[] directly)
+ // stage 1: btf_32_add_sub_out_avx2 on input pairs (i, 31 - i), i = 0..15, with results stored to x1[i] and x1[31 - i] (presumably sum and difference, per the helper name)
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: add_sub on pairs (i, 15 - i), i = 0..7; type0 butterflies with weights (-cospi[32], cospi[32]) on (20,27) (21,26) (22,25) (23,24); x1[16..19] and x1[28..31] are not touched here
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: add_sub on (i, 7 - i), i = 0..3; type0 with (-cospi[32], cospi[32]) on (10,13) (11,12); add_sub on (16 + j, 23 - j) and on (31 - j, 24 + j), j = 0..3
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);  // from here the higher index is passed first; the argument order is deliberate in the source, keep it
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4: add_sub on (0,3) (1,2) (8,11) (9,10) (15,12) (14,13); type0 on (5,6) with (-cospi[32], cospi[32]), on (18,29) (19,28) with (-cospi[16], cospi[48]), on (20,27) (21,26) with (-cospi[48], -cospi[16])
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: first use of the type1 helper; x1[0..3] receive their last update here; add_sub on (4,5) (7,6); type0 on (9,14) (10,13); add_sub within each group of four in x1[16..31]
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6: type1 on (4,7) (5,6) -- last update of x1[4..7]; add_sub on adjacent pairs in x1[8..15]; type0 on (17,30) (18,29) (21,26) (22,25)
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: type1 on (i, 23 - i), i = 8..11 -- last update of x1[8..15]; add_sub on adjacent pairs in x1[16..31]
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8: type1 on (i, 47 - i), i = 16..23 -- last update of x1[16..31]
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: pure reordering, no arithmetic -- output[k] takes x1[r], where r is k with its 5 index bits reversed (e.g. 1 -> 16, 3 -> 24, 19 -> 25)
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output,  // Staged butterfly network over 64 __m256i vectors; presumably the 64-point forward DCT the name suggests (the btf_* helpers are defined elsewhere -- confirm there).
+ int8_t cos_bit) {  // input[0..63] is read only in stage 1; output[0..63] is written only in stage 11; cos_bit is passed to cospi_arr() and fixes the constant _r.
+ const int32_t *cospi = cospi_arr(cos_bit);  // weight table; read only while building the broadcast constants below (indices 1..63)
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 1 << (cos_bit - 1) in every 32-bit lane (needs cos_bit >= 1); presumably the rounding offset the btf_32_avx2_type*_new helpers apply before shifting by cos_bit -- confirm in their definitions
+ // Weights are broadcast once up front: cospi_pNN holds cospi[NN] and cospi_mNN holds -cospi[NN] in every 32-bit lane. The *_new helpers take these vectors instead of scalar table entries.
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);  // odd-index entries from here on are used only by stage 10
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1: btf_32_add_sub_out_avx2 on input pairs (i, 63 - i), i = 0..31, with results stored to x1[i] and x1[63 - i] (presumably sum and difference, per the helper name)
+ __m256i x1[64];  // working set: filled by stage 1, then handed element-by-element (by address) to the btf_* helpers in stages 2-10
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2: add_sub on pairs (i, 31 - i), i = 0..15; type0_new with weights (cospi_m32, cospi_p32) on (40 + j, 55 - j), j = 0..7; x1[32..39] and x1[56..63] are not touched here
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3: add_sub on (i, 15 - i), i = 0..7; type0_new with (cospi_m32, cospi_p32) on (20,27) (21,26) (22,25) (23,24); add_sub on (32 + j, 47 - j) and on (63 - j, 48 + j), j = 0..7
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);  // from here the higher index is passed first; the argument order is deliberate in the source, keep it
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4: add_sub on (i, 7 - i), i = 0..3, on (16 + j, 23 - j) and (31 - j, 24 + j), j = 0..3; type0_new on (10,13) (11,12) with (m32, p32), on (36 + j, 59 - j) with (m16, p48) and on (40 + j, 55 - j) with (m48, m16), j = 0..3
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5: add_sub on (0,3) (1,2) (8,11) (9,10) (15,12) (14,13); type0_new on (5,6), (18,29) (19,28), (20,27) (21,26); add_sub within each group of eight in x1[32..63]
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6: first use of the type1_new helper; x1[0..3] receive their last update here; add_sub on (4,5) (7,6) and within each group of four in x1[16..31]; type0_new on (9,14) (10,13) and on eight pairs in x1[34..61]
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7: type1_new on (4,7) (5,6) -- last update of x1[4..7]; add_sub on adjacent pairs in x1[8..15]; type0_new on (17,30) (18,29) (21,26) (22,25); add_sub within each group of four in x1[32..63]
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8: type1_new on (i, 23 - i), i = 8..11 -- last update of x1[8..15]; add_sub on adjacent pairs in x1[16..31]; type0_new on (33,62) (34,61) (37,58) (38,57) (41,54) (42,53) (45,50) (46,49)
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9: type1_new on (16 + j, 31 - j), j = 0..7 -- last update of x1[16..31]; add_sub on adjacent pairs in x1[32..63]
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10: type1_new on (32 + j, 63 - j), j = 0..15, using the odd-index weights -- last update of x1[32..63]
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11: pure reordering, no arithmetic -- output[k] takes x1[r], where r is k with its 6 index bits reversed (e.g. 1 -> 32, 3 -> 48, 35 -> 49)
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
+// 16-point identity kernel, applied independently to each of the 16 input
+// vectors (cos_bit is unused). Every 16-bit sample is paired with the
+// constant 1 and passed through scale_round_avx2() with a factor of
+// 2 * NewSqrt2; the 32-bit results are packed back to 16 bits with signed
+// saturation.
+// NOTE(review): the exact rounding/shift is defined by scale_round_avx2(),
+// which is not visible here.
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i ones = _mm256_set1_epi16(1);
+
+  for (int row = 0; row < 16; ++row) {
+    // Interleave (sample, 1) pairs. The unpacks operate per 128-bit lane and
+    // the in-lane pack below puts the elements back in their original order.
+    const __m256i pairs_lo = _mm256_unpacklo_epi16(input[row], ones);
+    const __m256i pairs_hi = _mm256_unpackhi_epi16(input[row], ones);
+    const __m256i scaled_lo = scale_round_avx2(pairs_lo, 2 * NewSqrt2);
+    const __m256i scaled_hi = scale_round_avx2(pairs_hi, 2 * NewSqrt2);
+    output[row] = _mm256_packs_epi32(scaled_lo, scaled_hi);
+  }
+}
+
+// 32-point identity kernel: each of the 32 input vectors is left-shifted by
+// two bits per 16-bit element (i.e. multiplied by 4). cos_bit is accepted only
+// so the function fits the transform_1d_avx2 signature.
+static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output,
+                                       int8_t cos_bit) {
+  (void)cos_bit;
+  int row = 0;
+  while (row < 32) {
+    output[row] = _mm256_slli_epi16(input[row], 2);
+    ++row;
+  }
+}
+
+// Writes out_size rows of sixteen int32 values. For row r, in1[r] supplies
+// elements 0..7 and in2[r] elements 8..15 of the row starting at
+// out + stride * r. Aligned stores are used, so every destination address
+// must be 32-byte aligned.
+static INLINE void store_output_32bit_w16(int32_t *const out,
+                                          const __m256i *const in1,
+                                          const __m256i *const in2,
+                                          const int stride,
+                                          const int out_size) {
+  for (int r = 0; r < out_size; ++r) {
+    int32_t *const row = out + stride * r;
+    _mm256_store_si256((__m256i *)row, in1[r]);
+    _mm256_store_si256((__m256i *)(row + 8), in2[r]);
+  }
+}
+
+// Stores out_size rows of 16 values each. Every row is one __m256i holding
+// sixteen 16-bit values; they are sign-extended to 32 bits and written to
+// rows of `out` that are `stride` elements apart, using aligned 256-bit
+// stores.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+                                                        int32_t *out,
+                                                        const int stride,
+                                                        const int out_size) {
+  for (int r = 0; r < out_size; ++r) {
+    int32_t *const row = out + r * stride;
+    // Low 128 bits -> elements 0..7, high 128 bits -> elements 8..15.
+    const __m128i lo16 = _mm256_castsi256_si128(in[r]);
+    _mm256_store_si256((__m256i *)row, _mm256_cvtepi16_epi32(lo16));
+    const __m128i hi16 = _mm256_extracti128_si256(in[r], 1);
+    _mm256_store_si256((__m256i *)(row + 8), _mm256_cvtepi16_epi32(hi16));
+  }
+}
+
+/* Widens the sixteen 16-bit values in `a` to 32 bits, passing each through
+ * scale_round_avx2() with NewSqrt2, and writes the results to b[0..15] with
+ * two aligned 256-bit stores.
+ * NOTE(review): the rounding/shift applied, and the assumption that the helper
+ * works per 32-bit element, come from scale_round_avx2(), which is not visible
+ * in this chunk - confirm there.
+ */static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8);  // 64-bit quarters reordered to 0,2,1,3 so the in-lane unpacks below see elements 0-7 / 8-15
+ const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one);  // (value, 1) pairs for elements 0..7
+ const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one);  // (value, 1) pairs for elements 8..15
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ _mm256_store_si256((__m256i *)b, b_lo);  // aligned store: b must be 32-byte aligned
+ _mm256_store_si256((__m256i *)(b + 8), b_hi);
+}
+
+// Row-by-row wrapper around store_rect_16bit_to_32bit_avx2(): for each r in
+// [0, out_size), in[r] is converted and written to out + r * stride.
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    store_rect_16bit_to_32bit_avx2(in[row], out + row * stride);
+    ++row;
+  }
+}
+
+/* Common signature of the 1-D AVX2 transform kernels stored in the
+ * *_txfm16x32_arr / *_txfm16x16_arr dispatch tables: the kernel reads `input`,
+ * writes `output` (the callers in this file pass the same buffer for both)
+ * and receives the cosine precision as `cos_bit`.
+ */typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+/* Column-pass kernel per TX_TYPE for the 32-point, 16-lanes-wide kernel
+ * family (indexed by tx_type in the 32x32, 16x32 and 64x32 entry points of
+ * this file). Only DCT_DCT, IDTX, V_DCT and H_DCT have a kernel; every other
+ * slot is NULL, so callers must only request those four types.
+ */static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fdct16x32_avx2, // V_DCT
+ fidentity16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+/* Row-pass kernel per TX_TYPE for the 32-point, 16-lanes-wide kernel family
+ * (indexed by tx_type in the 32x32 and 32x16 entry points of this file).
+ * Mirrors col_txfm16x32_arr with the V_DCT / H_DCT roles swapped; slots for
+ * ADST/FLIPADST types are NULL and must not be requested.
+ */static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fidentity16x32_avx2, // V_DCT
+ fdct16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+/* Column-pass 16-point kernel per TX_TYPE (indexed by tx_type in the 16x16
+ * and 32x16 entry points of this file). Every slot is populated. The FLIPADST
+ * types map to the same ADST kernel as the ADST types; the flip itself is
+ * applied by the callers (via get_flip_cfg) when loading/reordering buffers.
+ */static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+/* Row-pass 16-point kernel per TX_TYPE (indexed by tx_type in the 16x16 and
+ * 16x32 entry points of this file). Every slot is populated; compared with
+ * col_txfm16x16_arr the vertical/horizontal roles are swapped (e.g. V_DCT uses
+ * the identity kernel here and the DCT kernel in the column table).
+ */static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
+/* Column-pass 8-point SSE2 kernel per TX_TYPE, used by
+ * av1_lowbd_fwd_txfm2d_8x8_avx2(). Every slot is populated; FLIPADST types
+ * share the ADST kernel (the caller applies the flip when loading).
+ */static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+/* Row-pass 8-point SSE2 kernel per TX_TYPE, used by
+ * av1_lowbd_fwd_txfm2d_8x8_avx2(). Every slot is populated; compared with
+ * col_txfm8x8_arr the vertical/horizontal roles are swapped.
+ */static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+// Loads eight rows of eight int16 samples (aligned 128-bit loads, rows
+// `stride` elements apart) into out[0..7], then left-shifts every sample by
+// `bit`. Despite the name, no rounding term is added: the shift is a plain
+// 16-bit left shift.
+static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride,
+                                               __m128i *out, int bit) {
+  // All rows are loaded before any is shifted, as in the unrolled form.
+  for (int row = 0; row < 8; ++row) {
+    out[row] = _mm_load_si128((const __m128i *)(in + row * stride));
+  }
+  for (int row = 0; row < 8; ++row) {
+    out[row] = _mm_slli_epi16(out[row], bit);
+  }
+}
+
+// Vertically flipped variant of load_buffer_and_round_shift(): input row r
+// (r = 0..7, rows `stride` elements apart) is loaded into out[7 - r], then
+// every sample is left-shifted by `bit`. No rounding term is added.
+static INLINE void load_buffer_and_flip_round_shift(const int16_t *in,
+                                                    int stride, __m128i *out,
+                                                    int bit) {
+  for (int row = 0; row < 8; ++row) {
+    out[7 - row] = load_16bit_to_16bit(in + row * stride);
+  }
+  for (int idx = 7; idx >= 0; --idx) {
+    out[idx] = _mm_slli_epi16(out[idx], bit);
+  }
+}
+
+#define TRANSPOSE_8X8_AVX2() \
+ { /* Transposes an 8x8 int16 block. Reads b0..b3 (bN = row N | row N+4) and writes c0..c3; all eight names must exist at the expansion site. */ \
+ /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \
+ /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \
+ /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \
+ /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \
+ const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \
+ const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \
+ const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \
+ const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \
+ /* Unpack 32 bit elements resulting in: */ \
+ /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
+ /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
+ /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
+ /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/ \
+ const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
+ const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
+ const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
+ const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
+ /* c0 (0xd8 swaps the middle 64-bit quarters): 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/ \
+ /* c1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/ \
+ /* c2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/ \
+ /* c3: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/ \
+ c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
+ c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
+ c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
+ c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
+ }
+
+/* Round-shifts an 8x8 block of int16 values right by -bit, transposes it and
+ * writes the transposed rows in reversed order (transposed row k goes to
+ * out[7 - k]).
+ *   in  : eight __m128i rows of the source block.
+ *   out : eight __m128i rows of the flipped, transposed result.
+ *   bit : must be negative; it is negated and used as the right-shift count.
+ *         A value of 0 would make the rounding term 1 << -1 (undefined).
+ */static INLINE void transpose_round_shift_flip_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;  // from here on `bit` is the positive right-shift amount
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));  // half of the shift step, for round-to-nearest
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);  // in[0] in the low half, in[4] in the high half
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);  // saturating add of the rounding term
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+
+ // Split each 256-bit result into its two 128-bit halves, stored in reverse
+ // (flipped) order:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[7] = _mm256_castsi256_si128(c0);
+ out[6] = _mm256_extractf128_si256(c0, 1);
+ out[5] = _mm256_castsi256_si128(c1);
+ out[4] = _mm256_extractf128_si256(c1, 1);
+ out[3] = _mm256_castsi256_si128(c2);
+ out[2] = _mm256_extractf128_si256(c2, 1);
+ out[1] = _mm256_castsi256_si128(c3);
+ out[0] = _mm256_extractf128_si256(c3, 1);
+}
+
+/* Round-shifts an 8x8 block of int16 values right by -bit and transposes it
+ * (transposed row k goes to out[k]).
+ *   in  : eight __m128i rows of the source block.
+ *   out : eight __m128i rows of the transposed result.
+ *   bit : must be negative; it is negated and used as the right-shift count.
+ *         A value of 0 would make the rounding term 1 << -1 (undefined).
+ */static INLINE void transpose_round_shift_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;  // from here on `bit` is the positive right-shift amount
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));  // half of the shift step, for round-to-nearest
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);  // in[0] in the low half, in[4] in the high half
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);  // saturating add of the rounding term
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+ // Split each 256-bit result into its two 128-bit halves, in natural order:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm256_castsi256_si128(c0);
+ out[1] = _mm256_extractf128_si256(c0, 1);
+ out[2] = _mm256_castsi256_si128(c1);
+ out[3] = _mm256_extractf128_si256(c1, 1);
+ out[4] = _mm256_castsi256_si128(c2);
+ out[5] = _mm256_extractf128_si256(c2, 1);
+ out[6] = _mm256_castsi256_si128(c3);
+ out[7] = _mm256_extractf128_si256(c3, 1);
+}
+
+// Sign-extends out_size rows of eight int16 values (one __m128i per row) to
+// int32 and writes row r to out + r * stride with an aligned 256-bit store.
+static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  for (int r = 0; r < out_size; ++r) {
+    const __m256i widened = _mm256_cvtepi16_epi32(in[r]);
+    _mm256_store_si256((__m256i *)(out + r * stride), widened);
+  }
+}
+
+/* 8x8 low-bitdepth forward 2-D transform.
+ * Pipeline: load 8 rows (optionally flipped vertically) with a left shift of
+ * shift[0] -> column kernel from col_txfm8x8_arr -> round-shift by shift[1]
+ * and transpose (optionally storing rows in reverse for a left/right flip) ->
+ * row kernel from row_txfm8x8_arr -> sign-extend to int32 and store as 8 rows
+ * of 8 with an output stride of 8.
+ *   input   : source int16 samples, rows `stride` elements apart; loaded with
+ *             aligned 128-bit loads.
+ *   output  : 64 int32 coefficients, written with aligned 256-bit stores.
+ *   tx_type : selects the kernels and the flip configuration.
+ *   bd      : unused.
+ */static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // Condition to check shift bit is avoided while round shifting, by assuming
+ // that shift[0] will always be positive.
+ assert(shift[0] > 0);
+ if (ud_flip)
+ load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
+ else
+ load_buffer_and_round_shift(input, stride, buf0, shift[0]);
+
+ col_txfm(buf0, buf0, cos_bit_col);  // column pass, in place
+ // Condition to check shift bit is avoided while round shifting, by assuming
+ // that shift[1] will always be negative.
+ assert(shift[1] < 0);
+
+ if (lr_flip) {
+ transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
+ } else {
+ transpose_round_shift_8x8(buf0, buf1, shift[1]);
+ }
+
+ buf = buf1;
+ row_txfm(buf, buf, cos_bit_row);  // row pass, in place on the transposed block
+
+ // Round and shift operation is avoided here as the shift bit is assumed to be
+ // zero always.
+ assert(shift[2] == 0);
+ store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);  // output stride 8, 8 rows
+}
+
+// 16x16 low-bitdepth forward 2-D transform.
+// Column pass: load 16 rows (flipped vertically when ud_flip is set),
+// round-shift by shift[0], run the column kernel, round-shift by shift[1] and
+// transpose. Row pass: optionally reverse the row order (lr_flip), run the row
+// kernel, round-shift by shift[2] and store as sign-extended int32.
+// `bd` is unused.
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X16;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  __m256i col_buf[16], row_buf[16];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass (the block is a single 16-wide strip).
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input, stride, col_buf, height);
+  }
+  round_shift_16bit_w16_avx2(col_buf, height, shift[0]);
+  col_txfm(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit_w16_avx2(col_buf, height, shift[1]);
+  transpose_16bit_16x16_avx2(col_buf, row_buf);
+
+  // Row pass. With lr_flip the reordered rows land in col_buf, which is free
+  // again after the transpose.
+  __m256i *rows = row_buf;
+  if (lr_flip) {
+    rows = col_buf;
+    flip_buf_avx2(row_buf, rows, width);
+  }
+  row_txfm(rows, rows, cos_bit_row);
+  round_shift_16bit_w16_avx2(rows, width, shift[2]);
+  store_buffer_16bit_to_32bit_w16_avx2(rows, output, height, width);
+}
+
+// 32x32 low-bitdepth forward 2-D transform.
+// The block is processed as two 16-column strips: each strip is loaded
+// (flipped vertically when ud_flip is set), round-shifted by shift[0], run
+// through the 32-point column kernel, round-shifted by shift[1] and transposed
+// into buf1 as two 16x16 tiles. The row pass then handles two groups of 16
+// rows: optional left/right reordering, 32-point row kernel, round-shift by
+// shift[2] and a sign-extending store to `output`.
+// Only tx types with a non-NULL entry in both 16x32 dispatch tables are
+// supported (DCT_DCT, IDTX, V_DCT, H_DCT); this is checked in debug builds.
+// `bd` is unused.
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_32X32;
+  __m256i buf0[32], buf1[128];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+  // The 16x32 tables contain NULL for every ADST/FLIPADST type; calling
+  // through such an entry would jump to address 0.
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass, one 16-column strip per iteration.
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+                                           height);
+    } else {
+      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    }
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+    // Upper and lower 16x16 tiles of the strip go to row groups 0 and 1.
+    transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+    transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+  }
+
+  // Row pass, one group of 16 rows per iteration.
+  for (int i = 0; i < 2; i++) {
+    __m256i *buf;
+    if (lr_flip) {
+      // buf0 is free after the column pass and holds the reordered rows.
+      buf = buf0;
+      flip_buf_avx2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);
+    store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
+  }
+}
+
+/* 64x64 low-bitdepth forward 2-D transform (DCT_DCT only).
+ * Column pass: four 16-column strips, each run through the 64-point 16-bit
+ * column kernel; only the first 32 outputs of every column are transposed
+ * into buf1. Row pass: two groups of 16 rows are widened to int32, run
+ * through the 64-point 32-bit row kernel, and only the first 32 outputs of
+ * each row are round-shifted and stored. The function therefore writes a
+ * 32x32 region of `output` (row stride 32).
+ * `bd` is unused; `tx_type` is only checked by the assert.
+ */static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;  // only read by the assert below; silences unused warnings when asserts compile out
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {  // one 16-column strip per iteration
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {  // keep only the first two 16-row tiles (32 column outputs)
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {  // one group of 16 rows per iteration
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);  // view each 256-bit vector as two 128-bit halves
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);  // low 8 lanes, sign-extended to int32
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // high 8 lanes, sign-extended to int32
+ }
+ fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);  // only the first 32 row outputs are kept
+ round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);  // 32 rows, stride 32, 16 values each at column offset 16 * i
+ }
+}
+
+// 16x32 (16 wide, 32 high) low-bitdepth forward 2-D transform.
+// Column pass: load 32 rows (flipped vertically when ud_flip is set),
+// round-shift by shift[0], run the 32-point column kernel, round-shift by
+// shift[1] and transpose as two 16x16 tiles. Row pass: for each tile,
+// optionally reverse the row order (lr_flip), run the 16-point row kernel,
+// round-shift by shift[2] and store through the rectangular (NewSqrt2-scaled)
+// store helper.
+// The column kernel comes from col_txfm16x32_arr, which is NULL for every
+// ADST/FLIPADST type; this is checked in debug builds. `bd` is unused.
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X32;
+  __m256i buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  // Guard against tx types that have no 32-point column kernel.
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+  }
+  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+  transpose_16bit_16x16_avx2(buf0, buf1);
+  transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+
+  // Row pass, one transposed 16x16 tile per iteration.
+  for (int i = 0; i < 2; i++) {
+    __m256i *buf;
+    if (lr_flip) {
+      // buf0 is free after the column pass and holds the reordered rows.
+      buf = buf0;
+      flip_buf_avx2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit_w16_avx2(buf, width, shift[2]);
+    store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
+                                              width);
+  }
+}
+
+// 32x16 (32 wide, 16 high) low-bitdepth forward 2-D transform.
+// Column pass: two 16-column strips, each loaded (flipped vertically when
+// ud_flip is set), round-shifted by shift[0], run through the 16-point column
+// kernel, round-shifted by shift[1] and transposed into buf1. Row pass:
+// optional left/right reordering, 32-point row kernel, round-shift by shift[2]
+// and store through the rectangular (NewSqrt2-scaled) store helper.
+// The row kernel comes from row_txfm16x32_arr, which is NULL for every
+// ADST/FLIPADST type; this is checked in debug builds. `bd` is unused.
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m256i buf0[32], buf1[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+  // Guard against tx types that have no 32-point row kernel.
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass, one 16-column strip per iteration.
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+                                           height);
+    } else {
+      load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    }
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+    transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+  }
+
+  // Row pass over the single group of 16 rows.
+  __m256i *buf;
+  if (lr_flip) {
+    // buf0 is free after the column pass and holds the reordered rows.
+    buf = buf0;
+    flip_buf_avx2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit_w16_avx2(buf, width, shift[2]);
+  store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
+}
+
+// 64x32 (64 wide, 32 high) low-bitdepth forward 2-D transform (DCT_DCT only).
+// Column pass: four 16-column strips, each run through the 32-point column
+// kernel and transposed into buf1 as two 16x16 tiles. Row pass: two groups of
+// 16 rows are widened to int32, run through the 64-point 32-bit row kernel,
+// and only the first 32 outputs of each row are round-shifted (with the
+// NewSqrt2 rectangular scaling) and stored with a row stride of 32.
+// `bd` is unused.
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // Checked before the table lookup is used: col_txfm16x32_arr is NULL for
+  // most other types, and the row pass below is hard-wired to the DCT.
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X32;
+  __m256i buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const int width_div16 = (width >> 4);
+  const int height_div16 = (height >> 4);
+
+  // Column pass, one 16-column strip per iteration.
+  for (int i = 0; i < width_div16; i++) {
+    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+    }
+  }
+
+  // Row pass, one group of 16 rows per iteration.
+  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+    __m256i bufA[64];
+    __m256i bufB[64];
+    // View each 256-bit vector as two 128-bit halves: bufA receives the low
+    // 8 lanes and bufB the high 8 lanes, sign-extended to int32.
+    __m128i *buf = (__m128i *)(buf1 + width * i);
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+    }
+    fdct64_new_avx2(bufA, bufA, cos_bit_row);
+    fdct64_new_avx2(bufB, bufB, cos_bit_row);
+    // Only the first 32 row outputs are kept.
+    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+    store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+  }
+}
+
+/* 32x64 (32 wide, 64 high) low-bitdepth forward 2-D transform (DCT_DCT only).
+ * Column pass: two 16-column strips, each run through the 64-point 16-bit
+ * column kernel; only the first 32 outputs of every column are transposed
+ * into buf1. Row pass: two groups of 16 rows are widened to int32, run
+ * through the 32-point 32-bit row kernel, round-shifted with the NewSqrt2
+ * rectangular scaling and stored with a row stride of 32.
+ * `bd` is unused; `tx_type` is only checked by the assert.
+ */static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;  // only read by the assert below; silences unused warnings when asserts compile out
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {  // one 16-column strip per iteration
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {  // keep only the first two 16-row tiles (32 column outputs)
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {  // one group of 16 rows per iteration
+ __m256i bufA[32];
+ __m256i bufB[32];
+ __m128i *buf = (__m128i *)(buf1 + width * i);  // view each 256-bit vector as two 128-bit halves
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);  // low 8 lanes, sign-extended to int32
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // high 8 lanes, sign-extended to int32
+ }
+ fdct32_avx2(bufA, bufA, cos_bit_row);
+ fdct32_avx2(bufB, bufB, cos_bit_row);
+ round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);  // 32 rows, stride 32, 16 values each at column offset 16 * i
+ }
+}
+
+/* 16x64 (16 wide, 64 high) low-bitdepth forward 2-D transform (DCT_DCT only).
+ * Column pass: the single 16-column strip is run through the 64-point column
+ * kernel and transposed into buf1 as four 16x16 tiles. Row pass: only the
+ * first two tiles (the first 32 column outputs) are run through the 16-point
+ * row kernel, round-shifted by shift[2] and stored as sign-extended int32
+ * with a row stride of 32.
+ * `bd` is unused; `tx_type` is only checked by the assert.
+ */static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;  // only read by the assert below; silences unused warnings when asserts compile out
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {  // one 16-column strip per iteration
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {  // all 16-row tiles are transposed, although only two are used below
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {  // only the first two tiles are transformed and stored
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);  // 16 rows, stride 32, at column offset 16 * i
+ }
+}
+
+/* 64x16 (64 wide, 16 high) low-bitdepth forward 2-D transform (DCT_DCT only).
+ * Column pass: four 16-column strips, each run through the 16-point column
+ * kernel and transposed into buf1. Row pass: the single group of 16 rows is
+ * run through the 64-point row kernel and round-shifted by shift[2]; only the
+ * first 32 row outputs are stored (sign-extended int32, row stride 16) and
+ * the following 16 * 32 output elements are zeroed.
+ * `bd` is unused; `tx_type` is only checked by the assert.
+ */static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;  // only read by the assert below; silences unused warnings when asserts compile out
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {  // one 16-column strip per iteration
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div16; i++) {  // one group of 16 rows per iteration
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);  // 32 rows of 16 values, stride 16
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+/* Butterfly on two vectors of sixteen int16 values, producing four __m128i
+ * results.
+ * The inputs are interleaved as (in0, in1) pairs; for each pair the two
+ * weighted sums pair . w0 and pair . w1 are formed with 16x16->32-bit
+ * multiply-add, rounded by adding *__rounding, arithmetic-shifted right by
+ * *cos_bit and packed back to int16 with signed saturation.
+ *   w0, w1     : weight vectors (16-bit weight pairs per 32-bit element).
+ *   in0, in1   : input vectors.
+ *   out0, out2 : w0 result for the low / high 128-bit half of the inputs.
+ *   out1, out3 : w1 result for the low / high 128-bit half of the inputs.
+ *   __rounding : 32-bit rounding term added before the shift.
+ *   cos_bit    : pointer to the right-shift amount.
+ */static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
+ __m256i *in1, __m128i *out0, __m128i *out1,
+ __m128i *out2, __m128i *out3,
+ const __m256i *__rounding, int8_t *cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);  // (in0, in1) pairs for elements 0..3 of each 128-bit lane
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);  // (in0, in1) pairs for elements 4..7 of each 128-bit lane
+ __m256i u0 = _mm256_madd_epi16(t0, *w0);
+ __m256i u1 = _mm256_madd_epi16(t1, *w0);
+ __m256i v0 = _mm256_madd_epi16(t0, *w1);
+ __m256i v1 = _mm256_madd_epi16(t1, *w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, *__rounding);
+ __m256i a1 = _mm256_add_epi32(u1, *__rounding);
+ __m256i b0 = _mm256_add_epi32(v0, *__rounding);
+ __m256i b1 = _mm256_add_epi32(v1, *__rounding);
+
+ __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
+
+ __m256i temp0 = _mm256_packs_epi32(c0, c1);  // in-lane pack restores element order 0..7 within each lane
+ __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+ *out0 = _mm256_castsi256_si128(temp0);
+ *out1 = _mm256_castsi256_si128(temp1);
+ *out2 = _mm256_extracti128_si256(temp0, 0x01);
+ *out3 = _mm256_extracti128_si256(temp1, 0x01);
+}
+
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {  // 8-point forward DCT over 8 registers of 16-bit lanes; input is fully read in stage 1, output only written in stage 5
+ const int32_t *cospi = cospi_arr(cos_bit);  // cosine table selected by cos_bit
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));  // half of 1 << cos_bit, handed to the butterflies
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);  // names encode sign (m/p) and table index of both weights
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+ // stage 1: saturating sum/difference of mirrored inputs (k and 7 - k)
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(input[0], input[7]);
+ x1[7] = _mm256_subs_epi16(input[0], input[7]);
+ x1[1] = _mm256_adds_epi16(input[1], input[6]);
+ x1[6] = _mm256_subs_epi16(input[1], input[6]);
+ x1[2] = _mm256_adds_epi16(input[2], input[5]);
+ x1[5] = _mm256_subs_epi16(input[2], input[5]);
+ x1[3] = _mm256_adds_epi16(input[3], input[4]);
+ x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+ // stage 2: sum/difference on x1[0..3]; weighted butterfly on x1[5], x1[6]
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+ cos_bit);  // NOTE(review): presumably rewrites x1[5] and x1[6] in place; they are copied right after
+ x2[5] = x1[5];
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3: weighted butterflies on x2[0..3]; sum/difference on x2[4..7]
+ __m256i x3[8];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+ cos_bit);
+ x3[0] = x2[0];
+ x3[1] = x2[1];
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+ cos_bit);
+ x3[2] = x2[2];
+ x3[3] = x2[3];
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+ // stage 4: x3[0..3] pass through; weighted butterflies on (4, 7) and (5, 6)
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[7] = x3[7];
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+ cos_bit);
+ x4[5] = x3[5];
+ x4[6] = x3[6];
+ // stage 5: permute the stage-4 values into the final output order
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {  // 8-point forward ADST over 8 registers of 16-bit lanes; input is fully read in stage 1, output only written in stage 7
+ const int32_t *cospi = cospi_arr(cos_bit);  // cosine table selected by cos_bit
+ const __m256i __zero = _mm256_setzero_si256();  // used to negate via 0 - x
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));  // half of 1 << cos_bit, handed to the butterflies
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);  // names encode sign (m/p) and table index of both weights
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+ // stage 1: permute the inputs; rows 7, 3, 1 and 5 are negated with saturation
+ __m256i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[7]);
+ x1[2] = _mm256_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm256_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+ // stage 2: weighted butterflies on (2, 3) and (6, 7); the rest pass through
+ __m256i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+ cos_bit);  // NOTE(review): presumably rewrites x1[2] and x1[3] in place; they are copied right after
+ x2[2] = x1[2];
+ x2[3] = x1[3];
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+ cos_bit);
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3: saturating sum/difference of elements two apart within each half
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+ // stage 4: x3[0..3] pass through; weighted butterflies on (4, 5) and (6, 7)
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+ cos_bit);
+ x4[6] = x3[6];
+ x4[7] = x3[7];
+
+ // stage 5: saturating sum/difference of elements four apart (k and k + 4)
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+ // stage 6: weighted butterfly on every adjacent pair, each with its own weights
+ __m256i x6[8];
+ btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+ cos_bit);
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+ cos_bit);
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+ cos_bit);
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+ cos_bit);
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+
+ // stage 7: permute the stage-6 values into the final output order
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ // Identity kernel for length 8: every register is doubled with a saturating
+ // 16-bit add, so lanes clamp instead of wrapping on overflow.
+ // Registers are handled one by one in ascending order, exactly like the
+ // unrolled form, so in-place use (input == output) behaves the same.
+ for (int row = 0; row < 8; ++row) {
+ const __m256i v = input[row];
+ output[row] = _mm256_adds_epi16(v, v);
+ }
+}
+
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {  // 16-point forward DCT over 16 __m128i rows, computed two rows at a time in 256-bit registers
+ const int32_t *cospi = cospi_arr(cos_bit);  // cosine table selected by cos_bit
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));  // rounding term passed to btf_16_avx2
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));  // NOTE(review): not referenced by name below; presumably picked up by btf_16_sse2 - confirm
+ __m128i temp0, temp1, temp2, temp3;  // 128-bit butterfly results awaiting re-packing
+ __m256i in0, in1;  // re-packed butterfly operands
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);  // names encode sign (m/p) and table index of both weights
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ __m256i cospi_arr[12];  // local array; hides the cospi_arr() function called above from here on
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+ cospi_m32_p32, 0x1);  // each entry: first operand in the low 128 bits, second in the high 128 bits
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p48_p16, 0x1);  // from here on the two halves carry different weights
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_m16_p48, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+ cospi_m48_m16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+ cospi_m16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+ cospi_p24_p40, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+ cospi_m40_p24, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+ cospi_p28_p36, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+ cospi_m36_p28, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+ cospi_p12_p52, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+ cospi_m52_p12, 0x1);
+
+ __m256i x[8];  // all 16 input rows are read here, before any output row is written
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);  // low: row 0, high: row 1
+ x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+ 0x1);  // low: row 15, high: row 14 (mirror partners of x[0])
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);  // low: row 2, high: row 3
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+ 0x1);  // low: row 13, high: row 12
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);  // low: row 5, high: row 4
+ x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+ 0x1);  // low: row 10, high: row 11
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);  // low: row 7, high: row 6
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);  // low: row 8, high: row 9
+
+ // stage 1: saturating sum/difference of each row with its mirror (k and 15 - k)
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(x[0], x[1]);
+ x1[7] = _mm256_subs_epi16(x[0], x[1]);
+ x1[1] = _mm256_adds_epi16(x[2], x[3]);
+ x1[6] = _mm256_subs_epi16(x[2], x[3]);
+ x1[2] = _mm256_adds_epi16(x[4], x[5]);
+ x1[5] = _mm256_subs_epi16(x[4], x[5]);
+ x1[3] = _mm256_adds_epi16(x[6], x[7]);
+ x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+ // stage 2: sum/difference on the stage-1 sums; weighted butterfly on x1[5], x1[6]
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[2] = x1[4];
+ x2[3] = x1[7];
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);  // low: temp2, high: temp0
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);  // low: temp3, high: temp1
+
+ // stage 3: x2[1] is half-swapped first so the add/sub pairs the right rows
+ __m256i x3[8];
+ x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);  // 0x4e swaps the two 128-bit halves
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);  // low half from x2[7], high half from x2[6]
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+ _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);  // 128-bit butterfly on low half of x2[6] and high half of x2[7]; results land in temp0/temp1
+ x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);  // low: temp1, high: temp0
+ x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+ x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+ x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+ // stage 4: first final outputs (rows 0, 8, 4, 12) are produced here
+ __m256i x4[8];
+ x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);  // low half of x3[0], high half of x3[1]
+ x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);  // high half of x3[0], low half of x3[1]
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
+ &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
+ x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+ x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+ x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);  // low halves of x3[3] and x3[4]
+ x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);  // low halves of x3[6] and x3[5]
+ in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);  // high halves of x3[3] and x3[4]
+ in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);  // high halves of x3[5] and x3[6]
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);  // low: temp0, high: temp2
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);  // low: temp3, high: temp1
+
+ // stage 5: final outputs for rows 2, 14, 10, 6; sum/difference for the rest
+ __m256i x5[4];
+ in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);  // high halves of x4[2] and x4[3]
+ in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);  // low halves of x4[2] and x4[3]
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
+ &output[10], &output[6], &__rounding_256, &cos_bit);
+ x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6: the remaining eight (odd-numbered) output rows
+ in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);  // low halves of x5[0] and x5[1]
+ in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);  // high halves of x5[2] and x5[3]
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
+ &output[9], &output[7], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);  // high halves of x5[1] and x5[0]
+ in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);  // low halves of x5[3] and x5[2]
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
+ &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {  // 16-point forward ADST over 16 __m128i rows, computed two rows at a time in 256-bit registers
+ const int32_t *cospi = cospi_arr(cos_bit);  // cosine table selected by cos_bit
+ const __m256i __zero = _mm256_setzero_si256();  // used to negate via 0 - x
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));  // rounding term passed to btf_16_avx2
+ __m256i in0, in1;  // re-packed butterfly operands
+ __m128i temp0, temp1, temp2, temp3;  // 128-bit butterfly results awaiting re-packing
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);  // names encode sign (m/p) and table index of both weights
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ __m256i cospi_arr[20];  // local array; hides the cospi_arr() function called above from here on
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);  // each entry: first operand in the low 128 bits, second in the high 128 bits
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);  // same contents as cospi_arr[0]
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);  // same contents as cospi_arr[1]
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);  // same contents as cospi_arr[4]
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);  // same contents as cospi_arr[5]
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+ cospi_p24_m40, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+ cospi_m24_p40, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);  // same contents as cospi_arr[8]
+ cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+ cospi_p10_p54, 0x1);
+ cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+ cospi_p54_m10, 0x1);
+ cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+ cospi_p26_p38, 0x1);
+ cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+ cospi_p38_m26, 0x1);
+ cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+ cospi_p42_p22, 0x1);
+ cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+ cospi_p22_m42, 0x1);
+ cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+ cospi_p58_p06, 0x1);
+ cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+ cospi_p06_m58, 0x1);
+
+ __m256i x[8];  // all 16 input rows are read here, before any output row is written
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);  // low: row 0, high: row 4
+ x[1] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);  // low: row 2, high: row 6
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);  // low: row 8, high: row 12
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+ 0x1);  // low: row 10, high: row 14
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);  // low: row 1, high: row 9
+ x[5] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);  // low: row 3, high: row 11
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);  // low: row 5, high: row 13
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);  // low: row 7, high: row 15
+
+ // stage 1: reorder the packed rows; the odd-row registers x[4..7] are negated with saturation
+ __m256i x1[8];
+ x1[0] = x[0];
+ x1[1] = _mm256_subs_epi16(__zero, x[7]);
+ x1[2] = x[2];
+ x1[3] = _mm256_subs_epi16(__zero, x[5]);
+ x1[4] = _mm256_subs_epi16(__zero, x[4]);
+ x1[5] = x[3];
+ x1[6] = _mm256_subs_epi16(__zero, x[6]);
+ x1[7] = x[1];
+
+ // stage 2: half of the lanes pass through (blends); the other half go through weighted butterflies
+ __m256i x2[8];
+ x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);  // 0xf0: low 128 bits from the first operand, high 128 bits from the second
+ x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+ x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+ x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+ in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);  // the halves not taken by x2[0]
+ in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);  // the halves not taken by x2[3]
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);  // low: temp0, high: temp1
+ x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);  // low: temp2, high: temp3
+ in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);  // high half of x1[7], low half of x1[6]
+ in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);  // high half of x1[4], low half of x1[5]
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 3: saturating sum/difference of neighbouring registers
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+ x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+ // stage 4: registers 0, 1, 4, 5 pass through; (2, 3) and (6, 7) go through weighted butterflies
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);  // low halves of both operands
+ in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);  // high halves of both operands
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 5: saturating sum/difference of registers two apart within each half
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+ x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+ x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+ x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+ x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6: x5[0..3] pass through (1 and 2 swap places); x5[4..7] go through weighted butterflies
+ __m256i x6[8];
+ x6[0] = x5[0];
+ x6[1] = x5[2];
+ x6[2] = x5[1];
+ x6[3] = x5[3];
+ in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);  // low halves of both operands
+ in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);  // high halves of both operands
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 7: saturating sum/difference of register k with register k + 4
+ __m256i x7[8];
+ x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+ x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+ x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+ x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+ x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+ x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+ x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+ x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+ // stage 8: four butterflies, each writing four final output rows directly
+ in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);  // low halves of both operands
+ in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);  // high halves of both operands
+ btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
+ &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+ btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
+ &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+ btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
+ &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+ btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
+ &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i ones = _mm256_set1_epi16(1);
+ for (int row = 0; row < 16; row += 2) {  // two 128-bit rows per iteration
+ const __m256i lo_half = _mm256_castsi128_si256(input[row]);
+ const __m256i rows = _mm256_insertf128_si256(lo_half, input[row + 1], 0x1);
+ // Pair every sample with a constant 1, then scale by 2 * NewSqrt2.
+ const __m256i scaled_lo =
+ scale_round_avx2(_mm256_unpacklo_epi16(rows, ones), 2 * NewSqrt2);
+ const __m256i scaled_hi =
+ scale_round_avx2(_mm256_unpackhi_epi16(rows, ones), 2 * NewSqrt2);
+ const __m256i packed = _mm256_packs_epi32(scaled_lo, scaled_hi);
+ output[row] = _mm256_castsi256_si128(packed);
+ output[row + 1] = _mm256_extractf128_si256(packed, 0x1);
+ }
+}
+
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {  // row-pass kernel (length 8) for 8x16 blocks, indexed by tx_type
+ fdct8x8_new_avx2, // DCT_DCT
+ fdct8x8_new_avx2, // ADST_DCT
+ fadst8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fdct8x8_new_avx2, // FLIPADST_DCT
+ fadst8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fidentity8x8_new_avx2, // V_DCT
+ fdct8x8_new_avx2, // H_DCT
+ fidentity8x8_new_avx2, // V_ADST
+ fadst8x8_new_avx2, // H_ADST
+ fidentity8x8_new_avx2, // V_FLIPADST
+ fadst8x8_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {  // column-pass kernel (length 16) for 8x16 blocks, indexed by tx_type
+ fdct8x16_new_avx2, // DCT_DCT
+ fadst8x16_new_avx2, // ADST_DCT
+ fdct8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fadst8x16_new_avx2, // FLIPADST_DCT
+ fdct8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fdct8x16_new_avx2, // V_DCT
+ fidentity8x16_new_avx2, // H_DCT
+ fadst8x16_new_avx2, // V_ADST
+ fidentity8x16_new_avx2, // H_ADST
+ fadst8x16_new_avx2, // V_FLIPADST
+ fidentity8x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {  // column-pass kernel (length 8) for 16x8 blocks, indexed by tx_type
+ fdct8x8_new_avx2, // DCT_DCT
+ fadst8x8_new_avx2, // ADST_DCT
+ fdct8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fadst8x8_new_avx2, // FLIPADST_DCT
+ fdct8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fdct8x8_new_avx2, // V_DCT
+ fidentity8x8_new_avx2, // H_DCT
+ fadst8x8_new_avx2, // V_ADST
+ fidentity8x8_new_avx2, // H_ADST
+ fadst8x8_new_avx2, // V_FLIPADST
+ fidentity8x8_new_avx2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {  // row-pass kernel (length 16) for 16x8 blocks, indexed by tx_type
+ fdct8x16_new_avx2, // DCT_DCT
+ fdct8x16_new_avx2, // ADST_DCT
+ fadst8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fdct8x16_new_avx2, // FLIPADST_DCT
+ fadst8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fidentity8x16_new_avx2, // V_DCT
+ fdct8x16_new_avx2, // H_DCT
+ fidentity8x16_new_avx2, // V_ADST
+ fadst8x16_new_avx2, // H_ADST
+ fidentity8x16_new_avx2, // V_FLIPADST
+ fadst8x16_new_avx2 // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {  // 2-D forward transform of an 8-wide, 16-high residual block
+ (void)bd;  // not used by this implementation
+ __m128i buf0[16], buf1[16];  // buf0: column-pass data (reused as flip target), buf1: transposed data
+ __m256i buf2[8];  // row-pass data, 16 lanes per register
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];  // [0] before columns, [1] after columns, [2] after rows
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];  // length-16 kernel on __m128i rows
+ const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];  // length-8 kernel on __m256i rows
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // flip flags are derived from tx_type
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);  // NOTE(review): presumably loads the rows in reverse order - confirm
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);  // column pass, in place
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);  // top 8x8 tile
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);  // bottom 8x8 tile
+
+ __m128i *bufl, *bufu;  // the two 8-register halves handed to pack_reg
+ if (lr_flip) {
+ bufl = buf0;  // buf0 is free again here; its contents were already transposed into buf1
+ bufu = buf0 + 8;
+ flip_buf_sse2(buf1 + width * 0, bufl, width);
+ flip_buf_sse2(buf1 + width * 1, bufu, width);
+ } else {
+ bufl = buf1 + width * 0;
+ bufu = buf1 + width * 1;
+ }
+ pack_reg(bufl, bufu, buf2);  // NOTE(review): presumably joins bufl[k] and bufu[k] into one 256-bit register - confirm
+ row_txfm(buf2, buf2, cos_bit_row);  // row pass, in place
+ round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);  // widens to 32 bit; any extra scaling is inside the helper (not visible here)
+}
+
+static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {  // 2-D forward transform of a 16-wide, 8-high residual block
+ (void)bd;  // not used by this implementation
+ __m128i buf0[16], buf1[16];  // buf0: loaded halves (reused as flip target), buf1: data for the row pass
+ __m256i buf2[8];  // column-pass data, 16 lanes per register
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];  // [0] before columns, [1] after columns, [2] after rows
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];  // length-8 kernel on __m256i rows
+ const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];  // length-16 kernel on __m128i rows
+ __m128i *buf;  // buffer the row pass runs on
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // flip flags are derived from tx_type
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);  // left 8 columns
+ load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);  // right 8 columns
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);  // left 8 columns
+ load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);  // right 8 columns
+ }
+ pack_reg(buf0, &buf0[8], buf2);  // NOTE(review): presumably joins the left and right halves into 256-bit rows - confirm
+ round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+ col_txfm(buf2, buf2, cos_bit_col);  // column pass, in place
+ round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+ transpose_16bit_16x8_avx2(buf2, buf2);  // called with the same buffer as source and destination
+ extract_reg(buf2, buf1);  // NOTE(review): presumably splits the 256-bit rows back into 16 __m128i rows - confirm
+
+ if (lr_flip) {
+ buf = buf0;  // buf0 is free again here; its contents were already packed into buf2
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);  // row pass, in place
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);  // widens to 32 bit; any extra scaling is inside the helper (not visible here)
+}
+
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {  // read-only dispatch table indexed by tx_size; const like the other lookup tables in this file
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ // Lossless 4x4 blocks are routed to the C implementation.
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ return;
+ }
+ const FwdTxfm2dFunc txfm2d = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ txfm2d(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000000..825da8d7b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// Copies a txfm1d_size x txfm1d_size block of int16 samples, whose rows are
+// `stride` elements apart, into a densely packed int32 array (row pitch equal
+// to txfm1d_size), widening every sample to 32 bits.
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  for (int row = 0; row < txfm1d_size; ++row) {
+    const int16_t *const src_row = input + row * stride;
+    int32_t *const dst_row = output + row * txfm1d_size;
+    for (int col = 0; col < txfm1d_size; ++col) {
+      dst_row[col] = (int32_t)src_row[col];
+    }
+  }
+}
+
+// Writes out_size rows of eight 32-bit values each. Row i receives in1[i] in
+// its first four lanes and in2[i] in the next four; consecutive rows start
+// `stride` int32 elements apart. The stores are the aligned variant
+// (_mm_store_si128), so each destination address has to be 16-byte aligned.
+static INLINE void store_output_32bit_w8(int32_t *const out,
+                                         const __m128i *const in1,
+                                         const __m128i *const in2,
+                                         const int stride, const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    __m128i *const dst = (__m128i *)(out + stride * row);
+    _mm_store_si128(dst, in1[row]);
+    _mm_store_si128(dst + 1, in2[row]);  // dst + 1 == out + stride * row + 4
+  }
+}
+
+// Common signature of the 1D transform wrappers selected by
+// fwd_txfm_type_to_func(): an input and an output array of __m128i vectors,
+// the cos_bit value forwarded to the underlying kernel, and a stage_range
+// table (the wrappers defined in this file ignore stage_range).
+typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+// TxfmFuncSSE2 wrapper around av1_fdct32_sse4_1. With 32 coefficients per row
+// and four int32 lanes per __m128i there are 32 / 4 = 8 vector columns; the
+// kernel is invoked once per column, with the column count as its last
+// argument. stage_range exists only to satisfy the signature and is unused.
+static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit,
+                          const int8_t *stage_range) {
+  (void)stage_range;
+  const int num_cols = 32 / 4;
+  for (int c = 0; c < num_cols; ++c) {
+    av1_fdct32_sse4_1(&input[c], &output[c], cos_bit, num_cols);
+  }
+}
+
+// TxfmFuncSSE2 wrapper around av1_fdct64_sse4_1. The 64 / 4 = 16 vector
+// columns are transformed one call at a time; the column count is passed for
+// both of the kernel's trailing arguments. stage_range is unused.
+static void fdct64_new_sse4_1(__m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int num_cols = 64 / 4;  // 64 coefficients, 4 int32 lanes per __m128i
+  int col = 0;
+  while (col < num_cols) {
+    av1_fdct64_sse4_1(input + col, output + col, cos_bit, num_cols, num_cols);
+    ++col;
+  }
+}
+// TxfmFuncSSE2 wrapper around av1_idtx32_sse4_1. The buffers are handled as
+// eight consecutive chunks of 32 vectors; each chunk is passed to the kernel
+// with a last argument of 1. stage_range is unused.
+static void idtx32x32_sse4_1(__m128i *input, __m128i *output,
+                             const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+
+  const int vectors_per_call = 32;
+  const int num_calls = 8;
+  for (int n = 0; n < num_calls; ++n) {
+    const int offset = n * vectors_per_call;
+    av1_idtx32_sse4_1(input + offset, output + offset, cos_bit, 1);
+  }
+}
+
+// Maps a 1D transform type to its SSE4.1 wrapper. Only DCT32, DCT64 and
+// IDENTITY32 are handled; any other type trips assert(0) in debug builds and
+// yields NULL otherwise.
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  if (txfm_type == TXFM_TYPE_DCT32) return fdct32_sse4_1;
+  if (txfm_type == TXFM_TYPE_DCT64) return fdct64_new_sse4_1;
+  if (txfm_type == TXFM_TYPE_IDENTITY32) return idtx32x32_sse4_1;
+  assert(0);
+  return NULL;
+}
+
+// Square 2D forward transform driven by `cfg`: widen the residual to int32,
+// round-shift, column pass, round-shift, transpose, row pass, round-shift.
+// `txfm_buf` is scratch space of the same size as `output`; the two buffers
+// are used alternately as source and destination of each step, and the final
+// coefficients end up in `output`.
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+                                     const int stride,
+                                     const TXFM_2D_FLIP_CFG *cfg,
+                                     int32_t *txfm_buf) {
+  // TODO(sarahparker) Rectangular transforms are not supported here: a single
+  // txfm_size is used for both dimensions, so this will break unless it is
+  // split into row and column sizes. Rectangular transforms currently use C
+  // code only, so this is fine for now; it is to be corrected once there are
+  // sse implementations for rectangular transforms.
+  assert(cfg->tx_size < TX_SIZES);
+  const int size = tx_size_wide[cfg->tx_size];
+  const int8_t *const shifts = cfg->shift;
+  const int8_t *const range_col = cfg->stage_range_col;
+  const int8_t *const range_row = cfg->stage_range_row;
+  const int8_t bit_col = cfg->cos_bit_col;
+  const int8_t bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 col_pass = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncSSE2 row_pass = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  // Both buffers are addressed four int32 coefficients (one __m128i) at a time.
+  __m128i *const scratch = (__m128i *)txfm_buf;
+  __m128i *const result = (__m128i *)output;
+  const int num_vectors = size * size / 4;
+
+  // Widen the residual into the scratch buffer, dropping the source stride.
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        size);
+  av1_round_shift_array_32_sse4_1(scratch, result, num_vectors, -shifts[0]);
+
+  // Column pass, then the intermediate shift.
+  col_pass(result, scratch, bit_col, range_col);
+  av1_round_shift_array_32_sse4_1(scratch, result, num_vectors, -shifts[1]);
+
+  // Transpose so that the row pass can reuse the column-oriented kernel.
+  transpose_32(size, result, scratch);
+  row_pass(scratch, result, bit_row, range_row);
+  av1_round_shift_array_32_sse4_1(result, result, num_vectors, -shifts[2]);
+}
+
+// 2D forward transform used for the 64x64 size. Compared with
+// fwd_txfm2d_sse4_1() it differs in three ways that are visible below:
+//  - the widened residual is written straight into `output` and no
+//    round-shift by shift[0] is applied before the column pass
+//    (NOTE(review): presumably shift[0] is 0 for this size - confirm);
+//  - the row pass always calls av1_fdct64_sse4_1 directly instead of a
+//    function chosen from cfg->txfm_type_row;
+//  - the row pass and the final round-shift cover only half of the vector
+//    columns and half of the rows.
+// `txfm_buf` is scratch space; the result is left in `output`.
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ // Four int32 coefficients per __m128i.
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ // Widen the residual directly into the output buffer (viewed as out_128).
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+ /*col wise transform*/
+ // out_128 -> buf_128, shifted back into out_128, then transposed into
+ // buf_128 for the row pass.
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+ /*row wise transform*/
+ // Only the first col_num / 2 vector columns are transformed; the kernel is
+ // given col_num as its fourth argument and col_num / 2 as its fifth.
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+ (col_num >> 1));
+ }
+
+ // Final shift over (col_num / 2) * (txfm_size / 2) vectors, i.e. a quarter
+ // of the full block.
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]);
+}
+
+// 32x32 forward 2D transform (SSE4.1). Fetches the configuration for
+// (tx_type, TX_32X32) and runs fwd_txfm2d_sse4_1() with a 16-byte aligned
+// scratch buffer of 32 * 32 int32 values. `bd` is unused.
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  TXFM_2D_FLIP_CFG cfg;
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[32 * 32]);
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+  fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+// 64x64 forward 2D transform (SSE4.1). Fetches the configuration for
+// (tx_type, TX_64X64) and runs fwd_txfm2d_64x64_sse4_1() with a 16-byte
+// aligned scratch buffer of 64 * 64 int32 values. `bd` is unused.
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  TXFM_2D_FLIP_CFG cfg;
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[64 * 64]);
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+  fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+// Low-bitdepth 64x64 forward transform; DCT_DCT only (asserted). `bd` is
+// unused.
+//
+// The column pass works on 16-bit data with av1_fdct8x64_new_sse2, eight
+// columns per iteration. The row pass widens to 32 bits and uses
+// av1_fdct64_sse4_1. Only the first 32 rows of the column-pass result are
+// carried into the row pass, and only the first 32 outputs of each row
+// transform are shifted and stored, so 32x32 coefficients are written to
+// `output` with a row pitch of 32.
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ // buf0: one 8-column strip (64 rows of eight int16 values).
+ // buf1: transposed column-pass results, `width` vectors per group of 8 rows.
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ // Column pass, one strip of 8 columns at a time.
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ // Transpose at most the first four 8x8 tiles of the strip (rows 0..31)
+ // into buf1; the remaining rows of the column output are not used.
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ // Row pass over the retained groups of 8 rows.
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ // Each 16-bit vector holds eight values: sign-extend the low four into
+ // bufA and the high four into bufB.
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ // Only the first 32 output vectors are shifted and stored.
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ // 32 rows of 8 values each, written at column offset i * 8, pitch 32.
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+// Low-bitdepth 64x32 (TX_64X32) forward transform; DCT_DCT only. `bd` is
+// unused.
+//
+// The column pass works on 16-bit data, eight columns per iteration, using
+// the kernel selected from col_txfm8x32_arr. The row pass widens to 32 bits
+// and uses av1_fdct64_sse4_1; only the first 32 outputs of each row transform
+// are scaled (av1_round_shift_rect_array_32_sse4_1 with NewSqrt2) and stored,
+// so `output` receives rows of 32 coefficients with a row pitch of 32.
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+                                          int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // Check the precondition before tx_type is used to pick (and run) the
+  // column kernel, as the 64x64 and 32x64 variants in this file do.
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X32;
+  // buf0: one 8-column strip; buf1: transposed column-pass results, `width`
+  // vectors per group of 8 rows.
+  __m128i buf0[64], buf1[256];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const int width_div8 = (width >> 3);
+  const int height_div8 = (height >> 3);
+
+  // Column pass, one strip of 8 columns at a time.
+  for (int i = 0; i < width_div8; i++) {
+    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    // Transpose each 8x8 tile of the strip into its place in buf1.
+    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+    }
+  }
+  // Row pass over each group of 8 rows.
+  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+    __m128i bufA[64];
+    __m128i bufB[64];
+    __m128i *buf = buf1 + width * i;
+    // Each 16-bit vector holds eight values: sign-extend the low four into
+    // bufA and the high four into bufB.
+    for (int j = 0; j < width; ++j) {
+      bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+      bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+    }
+    av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+    av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+    // Only the first 32 output vectors are scaled and stored.
+    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+    // 32 rows of 8 values each, written at column offset i * 8, pitch 32.
+    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+  }
+}
+
+// Low-bitdepth 32x64 (TX_32X64) forward transform; DCT_DCT only (asserted).
+// `bd` is unused.
+//
+// The column pass works on 16-bit data with av1_fdct8x64_new_sse2, eight
+// columns per iteration; only the first 32 rows of its output are carried
+// into the row pass. The row pass widens to 32 bits, applies
+// av1_fdct32_sse4_1, scales with av1_round_shift_rect_array_32_sse4_1
+// (NewSqrt2) and stores rows of 32 coefficients with a row pitch of 32.
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ // buf0: one 8-column strip (64 rows of eight int16 values).
+ // buf1: transposed column-pass results, `width` vectors per group of 8 rows.
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ // Column pass, one strip of 8 columns at a time.
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ // Transpose at most the first four 8x8 tiles of the strip (rows 0..31)
+ // into buf1; the remaining rows of the column output are not used.
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ // Row pass over the retained groups of 8 rows.
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ // Each 16-bit vector holds eight values: sign-extend the low four into
+ // bufA and the high four into bufB.
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1);
+ av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ // 32 rows of 8 values each, written at column offset i * 8, pitch 32.
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+// Low-bitdepth forward 2D transform kernels used by
+// av1_lowbd_fwd_txfm_sse4_1(), which indexes this table with
+// txfm_param->tx_size. The per-entry comments name the transform size each
+// slot serves; the entries are therefore expected to be in TX_SIZE
+// enumeration order. Only the 64x64, 32x64 and 64x32 slots use the SSE4.1
+// kernels defined in this file; all other slots use SSE2 kernels.
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+// Low-bitdepth forward transform entry point for the SSE4.1 build.
+// Every block except a lossless 4x4 one is handled by the kernel registered
+// for its tx_size in fwd_txfm2d_func_ls; a lossless 4x4 block is handed to
+// av1_lowbd_fwd_txfm_c().
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+                               int diff_stride, TxfmParam *txfm_param) {
+  const int is_lossless_4x4 =
+      txfm_param->lossless && txfm_param->tx_size == TX_4X4;
+  if (!is_lossless_4x4) {
+    const FwdTxfm2dFunc kernel = fwd_txfm2d_func_ls[txfm_param->tx_size];
+    kernel(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
+    return;
+  }
+  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 0000000000..aaad76e5ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// Butterfly on eight packed 32-bit lanes, updating *in0 and *in1 in place:
+//   out0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   out1 = (in0 * w1 - in1 * w0 + _r) >> cos_bit
+// The scalar weights are broadcast to every lane. Products keep their low
+// 32 bits (_mm256_mullo_epi32) and the shift is arithmetic.
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+  const __m256i vw0 = _mm256_set1_epi32(w0);
+  const __m256i vw1 = _mm256_set1_epi32(w1);
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, vw0),
+                                       _mm256_mullo_epi32(b, vw1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, vw1),
+                                        _mm256_mullo_epi32(b, vw0));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Butterfly on eight packed 32-bit lanes, updating *in0 and *in1 in place:
+//   out0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   out1 = (in1 * w0 - in0 * w1 + _r) >> cos_bit
+// Same as btf_32_avx2_type0 except that the difference for out1 is taken in
+// the opposite order. The scalar weights are broadcast to every lane.
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+  const __m256i vw0 = _mm256_set1_epi32(w0);
+  const __m256i vw1 = _mm256_set1_epi32(w1);
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, vw0),
+                                       _mm256_mullo_epi32(b, vw1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, vw0),
+                                        _mm256_mullo_epi32(a, vw1));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Variant of btf_32_avx2_type0 that takes the weights already broadcast into
+// vectors (ww0, ww1). Updates *in0 and *in1 in place:
+//   out0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   out1 = (in0 * w1 - in1 * w0 + _r) >> cos_bit
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, ww1),
+                                        _mm256_mullo_epi32(b, ww0));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Variant of btf_32_avx2_type1 that takes the weights already broadcast into
+// vectors (ww0, ww1). Updates *in0 and *in1 in place:
+//   out0 = (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   out1 = (in1 * w0 - in0 * w1 + _r) >> cos_bit
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, ww0),
+                                        _mm256_mullo_epi32(a, ww1));
+
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 0000000000..a4def754b0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2673 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+// 4-point forward DCT applied to the low four 16-bit lanes of input[0..3].
+// Result k is left in the low four lanes of output[k]. Intermediate products
+// are accumulated in 32 bits, rounded by 1 << (cos_bit - 1), shifted right by
+// cos_bit and packed back to 16 bits with signed saturation.
+// NOTE(review): pair_set_epi16(a, b) is assumed to repeat the 16-bit pair
+// (a, b) across the vector so that _mm_madd_epi16 forms x*a + y*b for each
+// interleaved (x, y) pair - confirm against its definition.
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i u[4], v[4];
+
+ // Interleave the low lanes of inputs 0/1 and of inputs 3/2 so that one
+ // add/sub handles both butterfly pairs.
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+ // v[0] lanes alternate (in0 + in3, in1 + in2);
+ // v[1] lanes alternate (in0 - in3, in1 - in2).
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ // Multiply each interleaved pair by its weight pair and add, giving 32-bit
+ // sums. The trailing numbers are the output index each product feeds.
+ u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0
+ u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2
+ u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1
+ u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3
+
+ // Round and shift back down by cos_bit.
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[3], __rounding);
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ // After packing, output[0] holds results 0 (low half) and 2 (high half) and
+ // output[1] holds results 1 and 3; shifting right by 8 bytes moves results
+ // 2 and 3 into the low half of output[2] and output[3].
+ output[0] = _mm_packs_epi32(u[0], u[1]);
+ output[1] = _mm_packs_epi32(u[2], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+// 4-point forward DCT over input[0..3], operating on all eight 16-bit lanes
+// of each vector. Results are written to output[0..3].
+// NOTE(review): __rounding and cos_bit are never passed to btf_16_sse2
+// explicitly; presumably it is a macro that picks them up from this scope -
+// confirm before renaming or removing either.
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1
+ // Saturating add/sub butterflies between mirrored inputs (0,3) and (1,2).
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2
+ // Weighted butterflies on the sum pair and on the difference pair.
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3
+ // Output permutation: output[i] = x2[bit-reversed i] (0, 2, 1, 3).
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+// 8-point forward DCT over input[0..7], written to output[0..7]. The weighted
+// butterflies use btf_16_w4_sse2, which receives the rounding constant and
+// cos_bit explicitly; plain add/sub steps use saturating 16-bit arithmetic.
+// NOTE(review): the "4" in the name and the _w4 helper suggest only the low
+// four lanes of each vector carry data - confirm against the callers.
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ // Add/sub butterflies between mirrored inputs (k, 7 - k).
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ // Sums (0..3) get another add/sub level; of the differences, only the
+ // middle pair (5, 6) goes through a weighted butterfly.
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+ &x1[6], &x2[5], &x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ // Weighted butterflies finish terms 0..3; add/sub on terms 4..7.
+ __m128i x3[8];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+ &x2[1], &x3[0], &x3[1]);
+ btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+ &x2[3], &x3[2], &x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ // Terms 0..3 pass through; weighted butterflies on pairs (4, 7) and (5, 6).
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+ &x3[7], &x4[4], &x4[7]);
+ btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+ &x3[6], &x4[5], &x4[6]);
+
+ // stage 5
+ // Output permutation: output[i] = x4[3-bit bit-reversal of i].
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+// 16-point forward DCT over input[0..15], operating on all eight 16-bit lanes
+// of each vector; results are written to output[0..15]. Plain add/sub steps
+// use saturating 16-bit arithmetic; weighted steps go through btf_16_sse2.
+// NOTE(review): __rounding and cos_bit are never passed to btf_16_sse2
+// explicitly; presumably it is a macro that picks them up from this scope -
+// confirm before renaming or removing either.
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ // Add/sub butterflies between mirrored inputs (k, 15 - k).
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2
+ // Terms 0..7: add/sub between mirrored pairs (k, 7 - k). Terms 8..15: the
+ // pairs (10, 13) and (11, 12) go through a weighted butterfly, the rest
+ // pass through.
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3
+ // Terms 0..3: add/sub; pair (5, 6): weighted butterfly; terms 8..15:
+ // add/sub between the stage-2 results.
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4
+ // Weighted butterflies finish terms 0..3; add/sub on terms 4..7; weighted
+ // butterflies on pairs (9, 14) and (10, 13); terms 8, 11, 12, 15 pass
+ // through.
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5
+ // Terms 0..3 pass through; weighted butterflies finish terms 4..7; add/sub
+ // on terms 8..15.
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6
+ // Terms 0..7 pass through; weighted butterflies finish terms 8..15 using
+ // the mirrored pairs (8, 15), (9, 14), (10, 13), (11, 12).
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7
+ // Output permutation: output[i] = x6[4-bit bit-reversal of i].
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  // 4-point forward ADST kernel working on the low four 16-bit lanes of
+  // input[0..3]. All input rows are copied into locals before any output row
+  // is stored, so the call is safe with input == output.
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i k_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i k_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i k_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i k_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i k_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  const __m128i row0 = input[0];
+  const __m128i row1 = input[1];
+  const __m128i row2 = input[2];
+  const __m128i row3 = input[3];
+
+  // Interleave 16-bit lanes so that each madd below produces one 32-bit
+  // weighted sum per column.
+  const __m128i pair01 = _mm_unpacklo_epi16(row0, row1);
+  const __m128i pair23 = _mm_unpacklo_epi16(row2, row3);
+  const __m128i sum01_z =
+      _mm_unpacklo_epi16(_mm_add_epi16(row0, row1), zero);
+  const __m128i row2_z = _mm_unpacklo_epi16(row2, zero);
+  const __m128i row3_z = _mm_unpacklo_epi16(row3, zero);
+
+  const __m128i t0 = _mm_madd_epi16(pair01, k_p01_p02);
+  const __m128i t1 = _mm_madd_epi16(pair23, k_p03_p04);
+  const __m128i t2 = _mm_madd_epi16(sum01_z, k_p03_p03);  // (row0+row1)*sinpi[3]
+  const __m128i t3 = _mm_madd_epi16(pair01, k_p04_m01);
+  const __m128i t4 = _mm_madd_epi16(pair23, k_m03_p02);
+  const __m128i t5 = _mm_madd_epi16(row2_z, k_p03_p03);  // row2*sinpi[3]
+  const __m128i t6 = _mm_madd_epi16(row3_z, k_p03_p03);  // row3*sinpi[3]
+
+  const __m128i acc0 = _mm_add_epi32(t0, t1);
+  const __m128i acc1 = _mm_sub_epi32(t2, t6);
+  const __m128i acc2 = _mm_add_epi32(t3, t4);
+  const __m128i t5_x3 = _mm_sub_epi32(_mm_slli_epi32(t5, 2), t5);  // 4*t5 - t5
+  const __m128i acc3 = _mm_add_epi32(_mm_sub_epi32(acc2, acc0), t5_x3);
+
+  // Round to nearest and drop the cos_bit fractional bits.
+  const __m128i q0 = _mm_srai_epi32(_mm_add_epi32(acc0, rounding), cos_bit);
+  const __m128i q1 = _mm_srai_epi32(_mm_add_epi32(acc1, rounding), cos_bit);
+  const __m128i q2 = _mm_srai_epi32(_mm_add_epi32(acc2, rounding), cos_bit);
+  const __m128i q3 = _mm_srai_epi32(_mm_add_epi32(acc3, rounding), cos_bit);
+
+  // Rows 0/2 and 1/3 share a packed register; the upper 64 bits are shifted
+  // down to form output rows 2 and 3.
+  const __m128i packed02 = _mm_packs_epi32(q0, q2);
+  const __m128i packed13 = _mm_packs_epi32(q1, q3);
+  output[0] = packed02;
+  output[1] = packed13;
+  output[2] = _mm_srli_si128(packed02, 8);
+  output[3] = _mm_srli_si128(packed13, 8);
+}
+
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  // 8-point forward ADST butterfly network over input[0..7]. Two scratch
+  // arrays are used alternately: every stage reads the array the previous
+  // stage wrote and fills the other one. All inputs are loaded in stage 1
+  // before any output row is stored in stage 7.
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  __m128i w_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i w_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i w_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  __m128i w_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  __m128i w_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+
+  // Weight pairs for the four stage-6 butterflies, in butterfly order.
+  __m128i w6_first[4], w6_second[4];
+  w6_first[0] = pair_set_epi16(cospi[4], cospi[60]);
+  w6_second[0] = pair_set_epi16(cospi[60], -cospi[4]);
+  w6_first[1] = pair_set_epi16(cospi[20], cospi[44]);
+  w6_second[1] = pair_set_epi16(cospi[44], -cospi[20]);
+  w6_first[2] = pair_set_epi16(cospi[36], cospi[28]);
+  w6_second[2] = pair_set_epi16(cospi[28], -cospi[36]);
+  w6_first[3] = pair_set_epi16(cospi[52], cospi[12]);
+  w6_second[3] = pair_set_epi16(cospi[12], -cospi[52]);
+
+  __m128i a[8], b[8];
+
+  // stage 1: input permutation with saturating negation of four rows
+  a[0] = input[0];
+  a[1] = _mm_subs_epi16(zero, input[7]);
+  a[2] = _mm_subs_epi16(zero, input[3]);
+  a[3] = input[4];
+  a[4] = _mm_subs_epi16(zero, input[1]);
+  a[5] = input[6];
+  a[6] = input[2];
+  a[7] = _mm_subs_epi16(zero, input[5]);
+
+  // stage 2: in each group of four, pass the first pair through and rotate
+  // the second pair
+  for (int g = 0; g < 8; g += 4) {
+    b[g] = a[g];
+    b[g + 1] = a[g + 1];
+    btf_16_w4_sse2(&w_p32_p32, &w_p32_m32, rounding, cos_bit, &a[g + 2],
+                   &a[g + 3], &b[g + 2], &b[g + 3]);
+  }
+
+  // stage 3: saturating add/sub butterflies at distance 2
+  for (int g = 0; g < 8; g += 4) {
+    for (int j = g; j < g + 2; ++j) {
+      a[j] = _mm_adds_epi16(b[j], b[j + 2]);
+      a[j + 2] = _mm_subs_epi16(b[j], b[j + 2]);
+    }
+  }
+
+  // stage 4: lower half passes through, upper half is rotated
+  for (int j = 0; j < 4; ++j) {
+    b[j] = a[j];
+  }
+  btf_16_w4_sse2(&w_p16_p48, &w_p48_m16, rounding, cos_bit, &a[4], &a[5],
+                 &b[4], &b[5]);
+  btf_16_w4_sse2(&w_m48_p16, &w_p16_p48, rounding, cos_bit, &a[6], &a[7],
+                 &b[6], &b[7]);
+
+  // stage 5: saturating add/sub butterflies at distance 4
+  for (int j = 0; j < 4; ++j) {
+    a[j] = _mm_adds_epi16(b[j], b[j + 4]);
+    a[j + 4] = _mm_subs_epi16(b[j], b[j + 4]);
+  }
+
+  // stage 6: one rotation per adjacent pair
+  for (int k = 0; k < 4; ++k) {
+    btf_16_w4_sse2(&w6_first[k], &w6_second[k], rounding, cos_bit, &a[2 * k],
+                   &a[2 * k + 1], &b[2 * k], &b[2 * k + 1]);
+  }
+
+  // stage 7: output permutation (odd results ascending into even slots,
+  // even results descending into odd slots)
+  for (int k = 0; k < 4; ++k) {
+    output[2 * k] = b[2 * k + 1];
+    output[7 - 2 * k] = b[2 * k];
+  }
+}
+
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  // 4-point forward ADST kernel applied to all eight 16-bit lanes of
+  // input[0..3]. The lanes are widened to 32 bits in two halves (index 0 =
+  // lanes 0-3, index 1 = lanes 4-7); both halves run identical arithmetic and
+  // are packed back together at the end. All input rows are copied into
+  // locals before any output row is stored.
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i k_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i k_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i k_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i k_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i k_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  const __m128i row0 = input[0];
+  const __m128i row1 = input[1];
+  const __m128i row2 = input[2];
+  const __m128i row3 = input[3];
+  const __m128i sum01 = _mm_add_epi16(row0, row1);
+
+  __m128i pair01[2], pair23[2], sum01_z[2], row2_z[2], row3_z[2];
+  __m128i result[4][2];
+
+  pair01[0] = _mm_unpacklo_epi16(row0, row1);
+  pair01[1] = _mm_unpackhi_epi16(row0, row1);
+  pair23[0] = _mm_unpacklo_epi16(row2, row3);
+  pair23[1] = _mm_unpackhi_epi16(row2, row3);
+  sum01_z[0] = _mm_unpacklo_epi16(sum01, zero);
+  sum01_z[1] = _mm_unpackhi_epi16(sum01, zero);
+  row2_z[0] = _mm_unpacklo_epi16(row2, zero);
+  row2_z[1] = _mm_unpackhi_epi16(row2, zero);
+  row3_z[0] = _mm_unpacklo_epi16(row3, zero);
+  row3_z[1] = _mm_unpackhi_epi16(row3, zero);
+
+  for (int h = 0; h < 2; ++h) {
+    const __m128i t0 = _mm_madd_epi16(pair01[h], k_p01_p02);
+    const __m128i t1 = _mm_madd_epi16(pair23[h], k_p03_p04);
+    const __m128i t2 = _mm_madd_epi16(sum01_z[h], k_p03_p03);
+    const __m128i t3 = _mm_madd_epi16(pair01[h], k_p04_m01);
+    const __m128i t4 = _mm_madd_epi16(pair23[h], k_m03_p02);
+    const __m128i t5 = _mm_madd_epi16(row2_z[h], k_p03_p03);
+    const __m128i t6 = _mm_madd_epi16(row3_z[h], k_p03_p03);
+
+    const __m128i acc0 = _mm_add_epi32(t0, t1);
+    const __m128i acc1 = _mm_sub_epi32(t2, t6);
+    const __m128i acc2 = _mm_add_epi32(t3, t4);
+    const __m128i t5_x3 = _mm_sub_epi32(_mm_slli_epi32(t5, 2), t5);  // 3 * t5
+    const __m128i acc3 = _mm_add_epi32(_mm_sub_epi32(acc2, acc0), t5_x3);
+
+    // Round to nearest and drop the cos_bit fractional bits.
+    result[0][h] = _mm_srai_epi32(_mm_add_epi32(acc0, rounding), cos_bit);
+    result[1][h] = _mm_srai_epi32(_mm_add_epi32(acc1, rounding), cos_bit);
+    result[2][h] = _mm_srai_epi32(_mm_add_epi32(acc2, rounding), cos_bit);
+    result[3][h] = _mm_srai_epi32(_mm_add_epi32(acc3, rounding), cos_bit);
+  }
+
+  // Saturating pack of the two 32-bit halves back into eight 16-bit lanes.
+  for (int row = 0; row < 4; ++row) {
+    output[row] = _mm_packs_epi32(result[row][0], result[row][1]);
+  }
+}
+
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { /* 16-point forward ADST kernel (per its name): consumes
+ * input[0..15] and produces output[0..15], each a vector of 16-bit lanes.
+ * Stage 1 reads every input row before stage 9 stores any output row, so
+ * this function's own loads and stores tolerate input == output.
+ * NOTE(review): btf_16_sse2 is defined outside this chunk; cos_bit and
+ * __rounding are never passed to it explicitly, so it presumably picks them
+ * up by name from this scope -- confirm before renaming either local. */
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); // 0.5 in cos_bit fixed point
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1: reorder the inputs; half of them are negated with a saturating 0 - x
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2: cospi[32] butterflies on pairs (2,3), (6,7), (10,11), (14,15); the rest pass through
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3: saturating add/sub butterflies between elements 2 apart
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4: cospi[16]/cospi[48] butterflies on elements 4..7 and 12..15; the rest pass through
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5: saturating add/sub butterflies between elements 4 apart
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6: elements 0..7 pass through; elements 8..15 get cospi[8]/[56] and cospi[40]/[24] butterflies
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7: saturating add/sub butterflies between elements 8 apart
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8: one butterfly per adjacent pair, each with its own cospi weight pair
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9: output permutation, output[2k] = x8[2k + 1] and output[15 - 2k] = x8[2k]
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { /* Column-pass
+ * (first pass) 1-D kernels used by av1_lowbd_fwd_txfm2d_4x4_sse2, indexed
+ * by tx_type. The initializer is positional: each slot's trailing comment
+ * names the TX_TYPE it is meant for, so the order must track that enum.
+ * Identity kernels fill IDTX and the H_* slots. */
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { /* Row-pass
+ * (second pass, after the transpose) 1-D kernels used by
+ * av1_lowbd_fwd_txfm2d_4x4_sse2, indexed by tx_type. The initializer is
+ * positional: each slot's trailing comment names the TX_TYPE it is meant
+ * for. Identity kernels fill IDTX and the V_* slots. */
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { /* Column-pass
+ * 1-D kernels used by av1_lowbd_fwd_txfm2d_4x8_sse2, indexed by tx_type.
+ * Positional initializer: each slot's trailing comment names the TX_TYPE it
+ * is meant for. Identity kernels fill IDTX and the H_* slots.
+ * NOTE(review): the identity slots use fidentity8x8_new_sse2 rather than a
+ * 4x8-named kernel -- presumably intentional; confirm. */
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { /* Row-pass
+ * 1-D kernels indexed by tx_type; within this chunk they are selected by
+ * av1_lowbd_fwd_txfm2d_4x8_sse2 and av1_lowbd_fwd_txfm2d_4x16_sse2.
+ * Positional initializer: each slot's trailing comment names the TX_TYPE it
+ * is meant for. Identity kernels fill IDTX and the V_* slots. */
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { /* Column-pass
+ * 1-D kernels indexed by tx_type. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels fill
+ * IDTX and the H_* slots.
+ * NOTE(review): no user of this table is visible in this chunk. */
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { /* Row-pass
+ * 1-D kernels indexed by tx_type. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels
+ * (fidentity8x8_new_sse2) fill IDTX and the V_* slots.
+ * NOTE(review): no user of this table is visible in this chunk. */
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { /* Column-pass
+ * 1-D kernels indexed by tx_type. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels fill
+ * IDTX and the H_* slots.
+ * NOTE(review): no user of this table is visible in this chunk. */
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { /* Row-pass
+ * 1-D kernels indexed by tx_type. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels fill
+ * IDTX and the V_* slots.
+ * NOTE(review): no user of this table is visible in this chunk. */
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { /* Column-pass
+ * 1-D kernels indexed by tx_type; within this chunk they are selected by
+ * av1_lowbd_fwd_txfm2d_4x16_sse2. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels fill
+ * IDTX and the H_* slots. */
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { /* Row-pass
+ * 1-D kernels indexed by tx_type. Positional initializer: each slot's
+ * trailing comment names the TX_TYPE it is meant for. Identity kernels fill
+ * IDTX and the V_* slots.
+ * NOTE(review): no user of this table is visible in this chunk. */
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { /* Row-pass
+ * 1-D kernels indexed by tx_type. Only the DCT_DCT, IDTX, V_DCT and H_DCT
+ * slots hold a kernel; every ADST/FLIPADST slot is NULL, so calling through
+ * one of those entries would dereference a null function pointer.
+ * NOTE(review): no user of this table is visible in this chunk; presumably
+ * callers never pass those tx types or check for NULL first -- confirm. */
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ av1_fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  // 2-D forward transform of a 4x4 block: 16-bit samples are read from
+  // `input` (rows `stride` apart) and 32-bit results are written to `output`.
+  // The column kernel runs first, the block is transposed, then the row
+  // kernel runs. `bd` is accepted for signature compatibility and ignored.
+  (void)bd;
+  const int cols = 4;
+  const int rows = 4;
+  const int8_t *shifts = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int w_idx = get_txw_idx(TX_4X4);
+  const int h_idx = get_txh_idx(TX_4X4);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_kernel = col_txfm4x4_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm4x4_arr[tx_type];
+  __m128i col_buf[4], row_buf[4];
+  __m128i *row_in = row_buf;
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass (an up/down flip is folded into the load).
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit_w4(input, stride, col_buf, rows);
+  } else {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, col_buf, rows);
+  }
+  round_shift_16bit(col_buf, rows, shifts[0]);
+  col_kernel(col_buf, col_buf, col_cos_bit);
+  round_shift_16bit(col_buf, rows, shifts[1]);
+  transpose_16bit_4x4(col_buf, row_buf);
+
+  // Row pass. A left/right flip copies the transposed rows back into the
+  // column buffer in flipped order and continues from there.
+  if (flip_lr) {
+    row_in = col_buf;
+    flip_buf_sse2(row_buf, row_in, cols);
+  }
+  row_kernel(row_in, row_in, row_cos_bit);
+  round_shift_16bit(row_in, cols, shifts[2]);
+  store_buffer_16bit_to_32bit_w4(row_in, output, rows, cols);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  // 2-D forward transform of a 4-wide, 8-tall block: 16-bit samples are read
+  // from `input` (rows `stride` apart) and 32-bit results are written to
+  // `output` through the rectangular store helper. The column kernel runs
+  // first, the block is transposed, then the row kernel runs.
+  //
+  // Only `bd` is unused. `stride` is consumed by the load helpers below, so
+  // it must not be marked as unused.
+  (void)bd;
+  __m128i buf0[8], buf1[8], *buf;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass (an up/down flip is folded into the load).
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_4x8(buf0, buf1);
+
+  // Row pass. A left/right flip copies the transposed rows back into buf0 in
+  // flipped order and continues from there.
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
+// Low-bitdepth forward 2D transform for a TX_4X16 block (width 4, height 16).
+//
+// The column kernel runs once over all 16 loaded rows; the result is
+// transposed as two independent 8-row halves, and the row kernel is then run
+// on each half and stored at output + 8 * half.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_4X16;
+  const int num_cols = 4;
+  const int num_rows = 16;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x4_arr[tx_type];
+  __m128i col_buf[16], row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, in place on col_buf.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit_w4(input, stride, col_buf, num_rows);
+  } else {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, col_buf, num_rows);
+  }
+  round_shift_16bit(col_buf, num_rows, shifts[0]);
+  col_fn(col_buf, col_buf, col_cos_bit);
+  round_shift_16bit(col_buf, num_rows, shifts[1]);
+
+  // Transpose each 8-row half separately.
+  for (int half = 0; half < 2; ++half) {
+    transpose_16bit_4x8(col_buf + 8 * half, row_buf + 8 * half);
+  }
+
+  // Row pass, one half at a time.
+  for (int half = 0; half < 2; ++half) {
+    __m128i *const group = row_buf + 8 * half;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the transposes; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_buffer_16bit_to_32bit_w8(cur, output + 8 * half, num_rows,
+                                   num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_8X4 block (width 8, height 4).
+//
+// Column pass over the 4 loaded rows, 8x8 transpose, optional left/right
+// flip, row pass, then store through store_rect_buffer_16bit_to_32bit_w4.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_8X4;
+  const int num_cols = 8;
+  const int num_rows = 4;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm4x8_arr[tx_type];
+  __m128i col_buf[8], row_buf[8];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, in place on col_buf.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, num_rows);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, num_rows);
+  }
+  round_shift_16bit(col_buf, num_rows, shifts[0]);
+  col_fn(col_buf, col_buf, col_cos_bit);
+  round_shift_16bit(col_buf, num_rows, shifts[1]);
+  transpose_16bit_8x8(col_buf, row_buf);
+
+  // Row pass; col_buf doubles as the flip destination when needed.
+  __m128i *cur = row_buf;
+  if (flip_lr) {
+    cur = col_buf;
+    flip_buf_sse2(row_buf, cur, num_cols);
+  }
+  row_fn(cur, cur, row_cos_bit);
+  round_shift_16bit(cur, num_cols, shifts[2]);
+  store_rect_buffer_16bit_to_32bit_w4(cur, output, num_rows, num_cols);
+}
+
+// Low-bitdepth forward 2D transform for a TX_8X8 block.
+//
+// Column pass over 8 loaded rows, 8x8 transpose, optional left/right flip,
+// row pass, then store as 32-bit values.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_8X8;
+  const int num_cols = 8;
+  const int num_rows = 8;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x8_arr[tx_type];
+  __m128i col_buf[8], row_buf[8];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, in place on col_buf.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, num_rows);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, num_rows);
+  }
+  round_shift_16bit(col_buf, num_rows, shifts[0]);
+  col_fn(col_buf, col_buf, col_cos_bit);
+  round_shift_16bit(col_buf, num_rows, shifts[1]);
+  transpose_16bit_8x8(col_buf, row_buf);
+
+  // Row pass; col_buf doubles as the flip destination when needed.
+  __m128i *cur = row_buf;
+  if (flip_lr) {
+    cur = col_buf;
+    flip_buf_sse2(row_buf, cur, num_cols);
+  }
+  row_fn(cur, cur, row_cos_bit);
+  round_shift_16bit(cur, num_cols, shifts[2]);
+  store_buffer_16bit_to_32bit_w8(cur, output, num_rows, num_cols);
+}
+
+// Low-bitdepth forward 2D transform for a TX_8X16 block (width 8, height 16).
+//
+// One column pass over 16 rows, two 8x8 transposes, then a row pass per
+// 8-vector group; each group is stored at output + 8 * group through
+// store_rect_buffer_16bit_to_32bit_w8.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_8X16;
+  const int num_cols = 8;
+  const int num_rows = 16;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x8_arr[tx_type];
+  __m128i col_buf[16], row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, in place on col_buf.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, num_rows);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, num_rows);
+  }
+  round_shift_16bit(col_buf, num_rows, shifts[0]);
+  col_fn(col_buf, col_buf, col_cos_bit);
+  round_shift_16bit(col_buf, num_rows, shifts[1]);
+
+  for (int blk = 0; blk < 2; ++blk) {
+    transpose_16bit_8x8(col_buf + 8 * blk, row_buf + 8 * blk);
+  }
+
+  // Row pass, one 8-vector group at a time.
+  for (int grp = 0; grp < 2; ++grp) {
+    __m128i *const group = row_buf + num_cols * grp;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the transposes; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_rect_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, num_rows,
+                                        num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_8X32 block (width 8, height 32).
+//
+// One column pass over 32 rows, four 8x8 transposes, then a row pass per
+// 8-vector group; each group is stored at output + 8 * group.
+//
+// col_txfm8x32_arr has NULL entries for the tx types it does not implement.
+// As in the other 32-point wrappers in this file, such types are routed to
+// the C implementation instead of calling through a NULL pointer.
+// NOTE(review): av1_fwd_txfm2d_8x32_c is assumed to be declared alongside the
+// av1_fwd_txfm2d_16x32_c / 32x16_c / 32x32_c fallbacks used by the siblings.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - only forwarded to the C fallback.
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 8;
+  const int height = 32;
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  // No SSE2 kernel for this tx_type: use the C reference path.
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_8x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass, in place on buf0.
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+  transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+  // Row pass, one 8-vector group at a time. buf0 is reused as the flip
+  // destination once the transposes have consumed it.
+  for (int i = 0; i < 4; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_16X4 block (width 16, height 4).
+//
+// The block is processed as two 8-column strips: each strip gets its own
+// column pass and an 8x4 transpose into row_buf + 8 * strip. A single row
+// pass over the 16 resulting vectors follows.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X4;
+  const int num_cols = 16;
+  const int num_rows = 4;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16], row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    transpose_16bit_8x4(col_buf, row_buf + 8 * strip);
+  }
+
+  // Row pass; col_buf doubles as the flip destination when needed.
+  __m128i *cur = row_buf;
+  if (flip_lr) {
+    cur = col_buf;
+    flip_buf_sse2(row_buf, cur, num_cols);
+  }
+  row_fn(cur, cur, row_cos_bit);
+  round_shift_16bit(cur, num_cols, shifts[2]);
+  store_buffer_16bit_to_32bit_w4(cur, output, num_rows, num_cols);
+}
+
+// Low-bitdepth forward 2D transform for a TX_16X8 block (width 16, height 8).
+//
+// The block is processed as two 8-column strips: each strip gets its own
+// column pass and an 8x8 transpose into row_buf + 8 * strip. A single row
+// pass over the 16 resulting vectors follows, stored through
+// store_rect_buffer_16bit_to_32bit_w8.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X8;
+  const int num_cols = 16;
+  const int num_rows = 8;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16], row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    transpose_16bit_8x8(col_buf, row_buf + 8 * strip);
+  }
+
+  // Row pass; col_buf doubles as the flip destination when needed.
+  __m128i *cur = row_buf;
+  if (flip_lr) {
+    cur = col_buf;
+    flip_buf_sse2(row_buf, cur, num_cols);
+  }
+  row_fn(cur, cur, row_cos_bit);
+  round_shift_16bit(cur, num_cols, shifts[2]);
+  store_rect_buffer_16bit_to_32bit_w8(cur, output, num_rows, num_cols);
+}
+
+// Low-bitdepth forward 2D transform for a TX_16X16 block.
+//
+// Column pass per 8-column strip (16 rows each); every strip is transposed
+// as two 8x8 tiles into row_buf + tile * width + 8 * strip. The row pass then
+// handles two 16-vector groups, each stored at output + 8 * group.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X16;
+  const int num_cols = 16;
+  const int num_rows = 16;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16], row_buf[32];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < 2; ++tile) {
+      transpose_16bit_8x8(col_buf + 8 * tile,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass per 16-vector group.
+  for (int grp = 0; grp < 2; ++grp) {
+    __m128i *const group = row_buf + num_cols * grp;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the column pass; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, num_rows, num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_16X32 block (width 16,
+// height 32).
+//
+// If either 1D kernel table has no entry for tx_type, the call is forwarded
+// to av1_fwd_txfm2d_16x32_c. Otherwise: column pass per 8-column strip (32
+// rows each), four 8x8 transposes per strip, then a row pass over four
+// 16-vector groups stored through store_rect_buffer_16bit_to_32bit_w8.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - only forwarded to the C fallback.
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X32;
+  const int num_cols = 16;
+  const int num_rows = 32;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[32], row_buf[64];
+
+  // No SSE2 kernel for this tx_type: use the C path.
+  if (col_fn == NULL || row_fn == NULL) {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < 4; ++tile) {
+      transpose_16bit_8x8(col_buf + tile * 8,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass per 16-vector group.
+  for (int grp = 0; grp < 4; ++grp) {
+    __m128i *const group = row_buf + num_cols * grp;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the column pass; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_rect_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, num_rows,
+                                        num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_32X8 block (width 32, height 8).
+//
+// If either 1D kernel table has no entry for tx_type, the call is forwarded
+// to the C implementation of the *same* block size. (The fallback used to
+// call the 32x16 routine, which operates on a block twice as tall as the one
+// this function is given.)
+// NOTE(review): av1_fwd_txfm2d_32x8_c is assumed to be declared alongside the
+// other av1_fwd_txfm2d_*_c fallbacks used in this file.
+//
+// Otherwise: column pass per 8-column strip (8 rows each) with one 8x8
+// transpose per strip, followed by a single 32-point row pass.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - only forwarded to the C fallback.
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  if (col_txfm != NULL && row_txfm != NULL) {
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+    // Column pass per 8-column strip.
+    for (int i = 0; i < 4; i++) {
+      if (ud_flip) {
+        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+      } else {
+        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+      }
+      round_shift_16bit(buf0, height, shift[0]);
+      col_txfm(buf0, buf0, cos_bit_col);
+      round_shift_16bit(buf0, height, shift[1]);
+      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    }
+
+    // Row pass. With height 8 there is exactly one group; the loop form is
+    // kept to mirror the taller variants.
+    for (int i = 0; i < 1; i++) {
+      __m128i *buf;
+      if (lr_flip) {
+        buf = buf0;
+        flip_buf_sse2(buf1 + width * i, buf, width);
+      } else {
+        buf = buf1 + width * i;
+      }
+      row_txfm(buf, buf, cos_bit_row);
+      round_shift_16bit(buf, width, shift[2]);
+      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+    }
+  } else {
+    // Fall back to the C path for this block size (32x8, not 32x16).
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_32X16 block (width 32,
+// height 16).
+//
+// If either 1D kernel table has no entry for tx_type, the call is forwarded
+// to av1_fwd_txfm2d_32x16_c. Otherwise: column pass per 8-column strip (16
+// rows each), two 8x8 transposes per strip, then a row pass over two
+// 32-vector groups stored through store_rect_buffer_16bit_to_32bit_w8.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - only forwarded to the C fallback.
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_32X16;
+  const int num_cols = 32;
+  const int num_rows = 16;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x32_arr[tx_type];
+  __m128i col_buf[32], row_buf[64];
+
+  // No SSE2 kernel for this tx_type: use the C path.
+  if (col_fn == NULL || row_fn == NULL) {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 4; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < 2; ++tile) {
+      transpose_16bit_8x8(col_buf + 8 * tile,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass per 32-vector group.
+  for (int grp = 0; grp < 2; ++grp) {
+    __m128i *const group = row_buf + num_cols * grp;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the column pass; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_rect_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, num_rows,
+                                        num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_32X32 block.
+//
+// If either 1D kernel table has no entry for tx_type, the call is forwarded
+// to av1_fwd_txfm2d_32x32_c. Otherwise: column pass per 8-column strip (32
+// rows each), four 8x8 transposes per strip, then a row pass over four
+// 32-vector groups, each stored at output + 8 * group.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - indexes the 1D kernel tables and selects the flip configuration.
+//   bd      - only forwarded to the C fallback.
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_32X32;
+  const int num_cols = 32;
+  const int num_rows = 32;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_fn = row_txfm8x32_arr[tx_type];
+  __m128i col_buf[32], row_buf[128];
+
+  // No SSE2 kernel for this tx_type: use the C path.
+  if (col_fn == NULL || row_fn == NULL) {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < 4; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, num_rows);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, num_rows);
+    }
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < 4; ++tile) {
+      transpose_16bit_8x8(col_buf + tile * 8,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass per 32-vector group.
+  for (int grp = 0; grp < 4; ++grp) {
+    __m128i *const group = row_buf + num_cols * grp;
+    __m128i *cur = group;
+    if (flip_lr) {
+      cur = col_buf;  // col_buf is free after the column pass; reuse it.
+      flip_buf_sse2(group, cur, num_cols);
+    }
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, num_rows, num_cols);
+  }
+}
+
+// Low-bitdepth forward 2D transform for a TX_64X16 block; DCT_DCT only
+// (asserted).
+//
+// Column pass (16-point DCT) per 8-column strip, 8x8 transposes into the row
+// buffer, then a 64-point row DCT per 8-row group. Each group is stored with
+// store_buffer_16bit_to_32bit_w8(..., 16, 32), and the 16 * 32 output
+// entries starting at output + 16 * 32 are cleared afterwards.
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - must be DCT_DCT; otherwise unused.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  const int num_cols = tx_size_wide[tx_size];
+  const int num_rows = tx_size_high[tx_size];
+  const int col_strips = num_cols >> 3;
+  const int row_groups = num_rows >> 3;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = fdct8x16_new_sse2;
+  const transform_1d_sse2 row_fn = av1_fdct8x64_new_sse2;
+  __m128i col_buf[64], row_buf[128];
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < col_strips; ++strip) {
+    load_buffer_16bit_to_16bit(input + 8 * strip, stride, col_buf, num_rows);
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < row_groups; ++tile) {
+      transpose_16bit_8x8(col_buf + tile * 8,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass per 8-row group.
+  for (int grp = 0; grp < row_groups; ++grp) {
+    __m128i *const cur = row_buf + num_cols * grp;
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, 16, 32);
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+// Low-bitdepth forward 2D transform for a TX_16X64 block; DCT_DCT only
+// (asserted).
+//
+// Column pass (64-point DCT) per 8-column strip, 8x8 transposes into the row
+// buffer, then a 16-point row DCT on at most the first four 8-row groups;
+// the remaining groups are never row-transformed or stored. Each processed
+// group is stored with store_buffer_16bit_to_32bit_w8(..., 32, 16).
+//
+//   input   - 16-bit source samples; `stride` is forwarded to the loader.
+//   output  - destination for the 32-bit coefficients.
+//   tx_type - must be DCT_DCT; otherwise unused.
+//   bd      - ignored by this implementation.
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  const int num_cols = tx_size_wide[tx_size];
+  const int num_rows = tx_size_high[tx_size];
+  const int col_strips = num_cols >> 3;
+  const int row_tiles = num_rows >> 3;
+  const int kept_groups = AOMMIN(4, row_tiles);
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[tx_size];
+  const int w_idx = get_txw_idx(tx_size);
+  const int h_idx = get_txh_idx(tx_size);
+  const int col_cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_cos_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const transform_1d_sse2 col_fn = av1_fdct8x64_new_sse2;
+  const transform_1d_sse2 row_fn = fdct8x16_new_sse2;
+  __m128i col_buf[64], row_buf[128];
+
+  // Column pass per 8-column strip.
+  for (int strip = 0; strip < col_strips; ++strip) {
+    load_buffer_16bit_to_16bit(input + 8 * strip, stride, col_buf, num_rows);
+    round_shift_16bit(col_buf, num_rows, shifts[0]);
+    col_fn(col_buf, col_buf, col_cos_bit);
+    round_shift_16bit(col_buf, num_rows, shifts[1]);
+    for (int tile = 0; tile < row_tiles; ++tile) {
+      transpose_16bit_8x8(col_buf + tile * 8,
+                          row_buf + tile * num_cols + 8 * strip);
+    }
+  }
+
+  // Row pass on the retained groups only.
+  for (int grp = 0; grp < kept_groups; ++grp) {
+    __m128i *const cur = row_buf + num_cols * grp;
+    row_fn(cur, cur, row_cos_bit);
+    round_shift_16bit(cur, num_cols, shifts[2]);
+    store_buffer_16bit_to_32bit_w8(cur, output + 8 * grp, 32, 16);
+  }
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { /* SSE2 forward
+ * transform entry points, one slot per transform size as named by the
+ * per-row comments. The table is indexed with txfm_param->tx_size by
+ * av1_lowbd_fwd_txfm_sse2(), which routes NULL slots (64x64, 32x64, 64x32)
+ * to av1_lowbd_fwd_txfm_c(). Entries are positional, so their order must
+ * not change. */
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ NULL, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+// Top-level low-bitdepth forward transform dispatcher for SSE2.
+//
+// Looks up the SSE2 routine for txfm_param->tx_size and calls it with the
+// tx_type and bd taken from txfm_param. The C implementation
+// (av1_lowbd_fwd_txfm_c) is used instead when the table has no SSE2 routine
+// for that size, or when the block is a lossless TX_4X4 block.
+//
+//   src_diff    - 16-bit source samples.
+//   coeff       - destination for the transform coefficients.
+//   diff_stride - forwarded unchanged as the stride argument.
+//   txfm_param  - supplies tx_size, tx_type, bd and the lossless flag.
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  const FwdTxfm2dFunc simd_fn = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  const int lossless_4x4 =
+      txfm_param->lossless && txfm_param->tx_size == TX_4X4;
+
+  if (simd_fn != NULL && !lossless_4x4) {
+    simd_fn(src_diff, coeff, diff_stride, txfm_param->tx_type,
+            txfm_param->bd);
+    return;
+  }
+  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 0000000000..3cb869a8fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+// 4-point identity kernel over four vectors, using only the low four 16-bit
+// lanes of each input vector.
+//
+// Each low lane is interleaved with the constant 1, passed through
+// scale_round_sse2 with NewSqrt2, and the 32-bit results are packed back to
+// 16 bits with signed saturation (the same four values fill both halves of
+// the output vector). cos_bit is unused.
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i ones = _mm_set1_epi16(1);
+
+  int row = 0;
+  while (row < 4) {
+    const __m128i paired = _mm_unpacklo_epi16(input[row], ones);
+    const __m128i scaled = scale_round_sse2(paired, NewSqrt2);
+    output[row] = _mm_packs_epi32(scaled, scaled);
+    ++row;
+  }
+}
+
+// 4-point identity kernel over four vectors, covering all eight 16-bit lanes
+// of each input vector.
+//
+// The low and high lane halves are each interleaved with the constant 1,
+// passed through scale_round_sse2 with NewSqrt2, and packed back to 16 bits
+// with signed saturation. cos_bit is unused.
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i ones = _mm_set1_epi16(1);
+
+  int row = 0;
+  while (row < 4) {
+    const __m128i src = input[row];
+    const __m128i lo = scale_round_sse2(_mm_unpacklo_epi16(src, ones),
+                                        NewSqrt2);
+    const __m128i hi = scale_round_sse2(_mm_unpackhi_epi16(src, ones),
+                                        NewSqrt2);
+    output[row] = _mm_packs_epi32(lo, hi);
+    ++row;
+  }
+}
+
+// 8-point identity kernel: every 16-bit lane of input[0..7] is doubled with
+// signed saturation (x + x via _mm_adds_epi16) and written to the matching
+// output vector. cos_bit is unused.
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  for (int row = 0; row < 8; ++row) {
+    const __m128i v = input[row];
+    output[row] = _mm_adds_epi16(v, v);
+  }
+}
+
+static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { /* 8-point forward DCT kernel, applied lane-wise to the
+ * 16-bit lanes of input[0..7]; results go to output[0..7].
+ *
+ * All eight input vectors are read in stage 1 and output[] is only written
+ * in the final stage, so output may alias input.
+ *
+ * cos_bit selects the cosine table via cospi_arr() and sets the rounding
+ * constant 1 << (cos_bit - 1).
+ *
+ * NOTE(review): __rounding is never referenced by name below; it is
+ * presumably consumed (together with cos_bit) inside the btf_16_sse2 macro,
+ * so neither local may be renamed or removed without checking the macro. */
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1: saturating sum / difference of mirrored inputs (i, 7 - i)
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2: sum / difference on x1[0..3]; x1[5], x1[6] go through
+ // btf_16_sse2; x1[4] and x1[7] pass through unchanged
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3: btf_16_sse2 on the pairs (x2[0], x2[1]) and (x2[2], x2[3]);
+ // sum / difference on x2[4..7]
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4 and 5: x3[0..3] are written straight to their (permuted) output
+ // slots; x3[4..7] go through btf_16_sse2 into the odd output slots
+ output[0] = x3[0];
+ output[4] = x3[1];
+ output[2] = x3[2];
+ output[6] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]);
+}
+
+static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { /* 8-point forward ADST kernel, applied lane-wise to the
+ * 16-bit lanes of input[0..7]; results go to output[0..7].
+ *
+ * All eight input vectors are read in stage 1 and output[] is first written
+ * in the last stage group, so output may alias input.
+ *
+ * cos_bit selects the cosine table via cospi_arr() and sets the rounding
+ * constant 1 << (cos_bit - 1).
+ *
+ * NOTE(review): __rounding is never referenced by name below; it is
+ * presumably consumed (together with cos_bit) inside the btf_16_sse2 macro,
+ * so neither local may be renamed or removed without checking the macro. */
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1: permute the inputs; inputs 7, 3, 1 and 5 are negated with a
+ // saturating (0 - x)
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2: btf_16_sse2 on the pairs (x1[2], x1[3]) and (x1[6], x1[7]);
+ // the other four values pass through unchanged
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3: saturating sum / difference between elements two apart
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4: x3[0..3] pass through; btf_16_sse2 on (x3[4], x3[5]) and
+ // (x3[6], x3[7])
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5, 6 and 7: sum / difference between elements four apart, written
+ // directly into their (permuted) output slots, ...
+ output[7] = _mm_adds_epi16(x4[0], x4[4]);
+ output[3] = _mm_subs_epi16(x4[0], x4[4]);
+ output[0] = _mm_adds_epi16(x4[1], x4[5]);
+ output[4] = _mm_subs_epi16(x4[1], x4[5]);
+ output[5] = _mm_adds_epi16(x4[2], x4[6]);
+ output[1] = _mm_subs_epi16(x4[2], x4[6]);
+ output[2] = _mm_adds_epi16(x4[3], x4[7]);
+ output[6] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // ... then btf_16_sse2 applied in place to output pairs. NOTE(review):
+ // the same elements are passed as inputs and outputs, which relies on the
+ // macro reading both inputs before it writes either output - confirm.
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7],
+ output[0]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5],
+ output[2]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3],
+ output[4]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1],
+ output[6]);
+}
+
+// 16-point identity kernel over sixteen vectors of eight 16-bit lanes.
+//
+// The low and high lane halves are each interleaved with the constant 1,
+// passed through scale_round_sse2 with 2 * NewSqrt2, and packed back to
+// 16 bits with signed saturation. cos_bit is unused.
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i ones = _mm_set1_epi16(1);
+
+  int row = 0;
+  while (row < 16) {
+    const __m128i src = input[row];
+    const __m128i lo = scale_round_sse2(_mm_unpacklo_epi16(src, ones),
+                                        2 * NewSqrt2);
+    const __m128i hi = scale_round_sse2(_mm_unpackhi_epi16(src, ones),
+                                        2 * NewSqrt2);
+    output[row] = _mm_packs_epi32(lo, hi);
+    ++row;
+  }
+}
+
+// 32-point identity kernel: every 16-bit lane of input[0..31] is shifted
+// left by two bits (a non-saturating multiply by 4) and written to the
+// matching output vector. cos_bit is unused.
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  int row = 0;
+  while (row < 32) {
+    output[row] = _mm_slli_epi16(input[row], 2);
+    ++row;
+  }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { /* 32-point
+ * column kernels, one slot per transform type as named by the per-row
+ * comments. Only the DCT and identity kernels are provided; every other
+ * slot is NULL, so a caller must check the looked-up pointer before calling
+ * through it. Entries are positional, so their order must not change. */
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ av1_fdct8x32_new_sse2, // V_DCT
+ fidentity8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 0000000000..b58911fcb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Zero-extends the eight 16-bit values at *p to 32 bits and stores them in
+// *qp: elements 0-3 go to the low 128-bit half, elements 4-7 to the high half.
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i first_four = _mm_unpacklo_epi16(*p, zeros);
+  const __m128i last_four = _mm_unpackhi_epi16(*p, zeros);
+  __m256i widened = _mm256_castsi128_si256(first_four);
+  widened = _mm256_insertf128_si256(widened, last_four, 1);
+  *qp = widened;
+}
+
+// Replaces each of qp[0..2] with its own upper 128-bit half duplicated into
+// both halves, i.e. the half that init_one_qp() fills from elements 4-7.
+static INLINE void update_qp(__m256i *qp) {
+  for (int i = 0; i < 3; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+// Loads eight 16-bit round, quant and dequant values and expands them to
+// 32-bit lanes in qp[0], qp[1] and qp[2] respectively. When log_scale is
+// non-zero the round values are first divided by 2^log_scale with rounding
+// (a rounding high multiply by 2^(15 - log_scale)).
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round_v = _mm_loadu_si128((const __m128i *)round_ptr);
+  if (log_scale != 0) {
+    round_v =
+        _mm_mulhrs_epi16(round_v, _mm_set1_epi16(1 << (15 - log_scale)));
+  }
+  const __m128i quant_v = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant_v = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round_v, qp + 0);
+  init_one_qp(&quant_v, qp + 1);
+  init_one_qp(&dequant_v, qp + 2);
+}
+
+// Quantizes the eight 32-bit coefficients held in *c and updates the running
+// end-of-block vector.
+//   qp[0], qp[1], qp[2]: per-lane round, quant and dequant values (32-bit).
+//   iscan_ptr:           eight 16-bit scan positions for these coefficients.
+//   qcoeff, dqcoeff:     each receives 256 bits (eight 32-bit results).
+//   eob:                 per-lane running maximum of (scan position + 1) over
+//                        lanes whose dequantized value is non-zero.
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+ // _mm256_mul_epi32 multiplies only the even 32-bit lanes (giving 64-bit
+ // products), so the odd lanes are shifted down and multiplied separately.
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ // Move the odd-lane results back to the upper 32 bits and merge.
+ q_hi = _mm256_slli_epi64(q_hi, 32);
+ q = _mm256_or_si256(q_lo, q_hi);
+ // Zero every lane where (|coeff| << (1 + log_scale)) is below dequant.
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
+
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ // Re-apply the sign of the input coefficient to both results.
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ // Zero-extend the eight 16-bit scan positions to 32-bit lanes.
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+ // nz is all-ones (-1) where dq != 0, so iscan - nz is iscan + 1 there; the
+ // AND clears the lanes whose dq is zero before taking the running maximum.
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+// Quantizes n_coeffs 32-bit coefficients eight at a time, writing qcoeff_ptr
+// and dqcoeff_ptr and storing the end-of-block position in *eob_ptr.
+// zbin_ptr, quant_shift_ptr and scan are unused. The first group of eight is
+// always processed, whatever the value of n_coeffs.
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+  const intptr_t group = 8;
+  __m256i qp[3];
+  __m256i eob = _mm256_setzero_si256();
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+
+  // First group: uses the constants exactly as init_qp() laid them out.
+  __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  // Remaining groups: both halves of every qp vector now hold the upper half.
+  update_qp(qp);
+  for (intptr_t pos = group; pos < n_coeffs; pos += group) {
+    coeff = _mm256_loadu_si256((const __m256i *)(coeff_ptr + pos));
+    quantize(qp, &coeff, iscan + pos, log_scale, qcoeff_ptr + pos,
+             dqcoeff_ptr + pos, &eob);
+  }
+
+  // Horizontal maximum: fold within each 128-bit half, then across halves.
+  eob = _mm256_max_epi16(eob, _mm256_shuffle_epi32(eob, 0xe));
+  eob = _mm256_max_epi16(eob, _mm256_shufflelo_epi16(eob, 0xe));
+  eob = _mm256_max_epi16(eob, _mm256_shufflelo_epi16(eob, 1));
+  const __m128i eob_lo = _mm256_castsi256_si128(eob);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+  *eob_ptr = _mm_extract_epi16(_mm_max_epi16(eob_lo, eob_hi), 0);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..40b3b460b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0] : rounding, param[1] : quant, param[2] : dequant (64-bit lanes),
+// param[3] : dequant per 32-bit lane, used for the zeroing threshold.
+// On return: *coeff holds the absolute values, *sign holds +1 / -1 per lane,
+// qcoeff[0] / dquan[0] hold the 64-bit results for elements 0 and 1,
+// qcoeff[1] holds the rounded (not yet multiplied) elements 2 and 3, and
+// qcoeff[2] is a mask of the lanes that must be forced to zero.
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ // The compare gives -1 for negative lanes; OR-ing in 1 turns 0 into +1 and
+ // leaves -1 unchanged.
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ // Add rounding, then spread the four 32-bit values over two vectors so each
+ // one sits in the low half of a 64-bit lane, as _mm_mul_epi32 requires.
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ // Mask of lanes where (|coeff| << (1 + scale)) is below dequant.
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+// Finishes elements 2 and 3 (left in qcoeff[1] by phase 1), merges them with
+// elements 0 and 1, restores the signs, applies the zeroing mask in qcoeff[2]
+// and stores four quantized values to qAddr and four dequantized values to
+// dqAddr.
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ // _mm_set_epi32 lists lanes high to low: mask0L keeps 32-bit lanes 2-3,
+ // mask0H keeps lanes 0-1.
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ // 0xd8 moves the low words of both 64-bit lanes into 32-bit lanes 0-1;
+ // 0x8d moves them into lanes 2-3. The masks drop the remaining lanes.
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ // *sign is +1 / -1 per lane, so this restores the sign of the input.
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ // Clear the lanes that phase 1 flagged as below the threshold.
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+// Folds eight quantized coefficients into the running end-of-block vector.
+// The coefficients are read as two groups of four 32-bit values; for every
+// non-zero one, (iscan + 1) competes in the per-lane 16-bit maximum in *eob.
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+                            __m128i *eob) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i qc_a = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+  const __m128i qc_b = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+
+  // Comparing against zero twice yields all-ones exactly where q != 0.
+  const __m128i nz_a = _mm_cmpeq_epi32(_mm_cmpeq_epi32(qc_a, zeros), zeros);
+  const __m128i nz_b = _mm_cmpeq_epi32(_mm_cmpeq_epi32(qc_b, zeros), zeros);
+  const __m128i nz16 = _mm_packs_epi32(nz_a, nz_b);
+
+  // Subtracting the -1 mask adds one; the AND clears the zero lanes.
+  const __m128i scan_pos = _mm_loadu_si128((__m128i const *)iscan);
+  const __m128i candidate =
+      _mm_and_si128(_mm_sub_epi16(scan_pos, nz16), nz16);
+  *eob = _mm_max_epi16(*eob, candidate);
+}
+
+// Reduces the eight 16-bit lanes of *eob to their signed maximum and returns
+// it. *eob is left holding the fully folded vector.
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+  __m128i acc = *eob;
+  // 8 lanes -> 4 -> 2 -> 1; lane 0 ends up with the overall maximum.
+  acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));
+  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));
+  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));
+  *eob = acc;
+  return (uint16_t)_mm_extract_epi16(acc, 0);
+}
+
+// Quantizes `count` 32-bit coefficients, eight per iteration (two groups of
+// four), writing qcoeff_ptr / dqcoeff_ptr and storing the end-of-block
+// position in *eob_ptr. zbin_ptr, quant_shift_ptr and scan are unused.
+// Element 0 of round_ptr / quant_ptr / dequant_ptr is applied to the first
+// coefficient only; element 1 is applied to every other coefficient.
+// NOTE(review): the first eight coefficients are processed unconditionally and
+// the loop steps by eight, so count is presumably a positive multiple of 8 -
+// confirm against the callers.
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ // Parameters for the first group: lane 0 carries the element-0 (DC) values,
+ // the other lanes carry the element-1 (AC) values.
+ // NOTE(review): presumably xx_set_64_from_32i(hi, lo) places `lo` in the low
+ // 64-bit lane - confirm in synonyms.h.
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ // Phase 2 of the first group only multiplies elements 2 and 3, so the
+ // AC-only constants can already be installed before it runs.
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ // The eob update reads back the eight values that were just stored.
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ // Advance all cursors by eight elements (two groups of four).
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000000..52ddc66437
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Returns the sum of the four 64-bit lanes of `a`.
+static int64_t k_means_horizontal_sum_avx2(__m256i a) {
+  // Fold four lanes into two, then two into one.
+  const __m128i pair = _mm_add_epi64(_mm256_castsi256_si128(a),
+                                     _mm256_extracti128_si256(a, 1));
+  const __m128i total = _mm_add_epi64(pair, _mm_unpackhi_epi64(pair, pair));
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, total);
+  return result;
+}
+
+// Assigns each of n 1-D samples to its nearest centroid (smallest absolute
+// difference) and optionally accumulates the total squared distance.
+//   data:       16-bit samples, consumed 16 per iteration.
+//   centroids:  k 16-bit centroid values; k must not exceed PALETTE_MAX_SIZE,
+//               the size of the local centroid table.
+//   indices:    receives one 8-bit centroid index per sample.
+//   total_dist: if non-NULL, receives the sum of squared distances.
+// Whole 16-sample groups are always loaded and stored, so data and indices
+// must cover n rounded up to a multiple of 16. On a tie the lower-numbered
+// centroid wins because the comparison is strict.
+void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m256i v_zero = _mm256_setzero_si256();
+  __m256i sum = _mm256_setzero_si256();
+  __m256i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    cents[j] = _mm256_set1_epi16(centroids[j]);
+  }
+
+  for (int i = 0; i < n; i += 16) {
+    // Keep the const qualifier on the source pointer when loading.
+    const __m256i in = _mm256_loadu_si256((const __m256i *)data);
+    __m256i ind = _mm256_setzero_si256();
+    // Compute the distance to the first centroid.
+    __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+    __m256i dist_min = _mm256_abs_epi16(d1);
+
+    for (int j = 1; j < k; ++j) {
+      // Compute the distance to the centroid.
+      d1 = _mm256_sub_epi16(in, cents[j]);
+      const __m256i dist = _mm256_abs_epi16(d1);
+      // Compare to the minimal one.
+      const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist);
+      dist_min = _mm256_min_epi16(dist_min, dist);
+      // Select j in the lanes where this centroid is strictly closer.
+      const __m256i ind1 = _mm256_set1_epi16(j);
+      ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+                            _mm256_and_si256(cmp, ind1));
+    }
+
+    // The pack works per 128-bit half, leaving eight index bytes at the bottom
+    // of each half; the permute gathers those two 64-bit groups into the low
+    // 128 bits so the 16 indices are stored in order.
+    const __m256i p1 = _mm256_packus_epi16(ind, v_zero);
+    const __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+    const __m128i d2 = _mm256_extracti128_si256(px, 0);
+
+    _mm_storeu_si128((__m128i *)indices, d2);
+
+    if (total_dist) {
+      // Square, convert to 32 bit and add together.
+      dist_min = _mm256_madd_epi16(dist_min, dist_min);
+      // Convert to 64 bit and add to sum.
+      const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+      const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+      sum = _mm256_add_epi64(sum, dist1);
+      sum = _mm256_add_epi64(sum, dist2);
+    }
+
+    indices += 16;
+    data += 16;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_avx2(sum);
+  }
+}
+
+// Assigns each of n 2-D points to its nearest centroid (smallest squared
+// Euclidean distance) and optionally accumulates the total squared distance.
+//   data:       interleaved (x, y) 16-bit pairs; 16 points per outer
+//               iteration, loaded as two groups of eight.
+//   centroids:  k interleaved (x, y) pairs; k must not exceed
+//               PALETTE_MAX_SIZE, the size of the local centroid table.
+//   indices:    receives one 8-bit centroid index per point.
+//   total_dist: if non-NULL, receives the sum of squared distances.
+// Whole 16-point groups are always loaded and stored, so the buffers must
+// cover n rounded up to a multiple of 16. On a tie the lower-numbered
+// centroid wins because the comparison is strict.
+void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m256i v_zero = _mm256_setzero_si256();
+  // Gathers 32-bit elements 0, 4, 1, 5 of the packed index vector into the
+  // low 128 bits (see the store below).
+  const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+  __m256i sum = _mm256_setzero_si256();
+  __m256i ind[2];
+  __m256i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+    cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx,
+                                cy, cx, cy, cx);
+  }
+
+  for (int i = 0; i < n; i += 16) {
+    for (int l = 0; l < 2; ++l) {
+      // Keep the const qualifier on the source pointer when loading.
+      const __m256i in = _mm256_loadu_si256((const __m256i *)data);
+      ind[l] = _mm256_setzero_si256();
+      // Compute the distance to the first centroid.
+      // madd yields dx * dx + dy * dy as one 32-bit value per point.
+      __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+      __m256i dist_min = _mm256_madd_epi16(d1, d1);
+
+      for (int j = 1; j < k; ++j) {
+        // Compute the distance to the centroid.
+        d1 = _mm256_sub_epi16(in, cents[j]);
+        const __m256i dist = _mm256_madd_epi16(d1, d1);
+        // Compare to the minimal one.
+        const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist);
+        dist_min = _mm256_min_epi32(dist_min, dist);
+        const __m256i ind1 = _mm256_set1_epi32(j);
+        ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]),
+                                 _mm256_and_si256(cmp, ind1));
+      }
+      if (total_dist) {
+        // Convert to 64 bit and add to sum.
+        const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+        const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+        sum = _mm256_add_epi64(sum, dist1);
+        sum = _mm256_add_epi64(sum, dist2);
+      }
+      data += 16;
+    }
+    // Cast to 8 bit and store.
+    // Both packs work per 128-bit half, so the index bytes end up split
+    // between the halves; the permute puts the 16 of them back in point order.
+    const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]);
+    const __m256i d3 = _mm256_packus_epi16(d2, v_zero);
+    const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute);
+    const __m128i d5 = _mm256_extracti128_si256(d4, 0);
+    _mm_storeu_si128((__m128i *)indices, d5);
+    indices += 16;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_avx2(sum);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
new file mode 100644
index 0000000000..6c75822350
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Returns the sum of the two 64-bit lanes of `a`.
+static int64_t k_means_horizontal_sum_sse2(__m128i a) {
+  const __m128i total = _mm_add_epi64(a, _mm_unpackhi_epi64(a, a));
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, total);
+  return result;
+}
+
+// Assigns each of n 1-D samples to its nearest centroid (smallest absolute
+// difference) and optionally accumulates the total squared distance.
+//   data:       16-bit samples, consumed 8 per iteration.
+//   centroids:  k 16-bit centroid values; k must not exceed PALETTE_MAX_SIZE,
+//               the size of the local centroid table.
+//   indices:    receives one 8-bit centroid index per sample.
+//   total_dist: if non-NULL, receives the sum of squared distances.
+// Whole 8-sample groups are always loaded and stored, so data and indices
+// must cover n rounded up to a multiple of 8. On a tie the lower-numbered
+// centroid wins because the comparison is strict.
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  __m128i sum = _mm_setzero_si128();
+  __m128i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    cents[j] = _mm_set1_epi16(centroids[j]);
+  }
+
+  for (int i = 0; i < n; i += 8) {
+    // Keep the const qualifier on the source pointer when loading.
+    const __m128i in = _mm_loadu_si128((const __m128i *)data);
+    __m128i ind = _mm_setzero_si128();
+    // Compute the distance to the first centroid.
+    // The absolute value is formed as max(a - b, b - a).
+    __m128i d1 = _mm_sub_epi16(in, cents[0]);
+    __m128i d2 = _mm_sub_epi16(cents[0], in);
+    __m128i dist_min = _mm_max_epi16(d1, d2);
+
+    for (int j = 1; j < k; ++j) {
+      // Compute the distance to the centroid.
+      d1 = _mm_sub_epi16(in, cents[j]);
+      d2 = _mm_sub_epi16(cents[j], in);
+      const __m128i dist = _mm_max_epi16(d1, d2);
+      // Compare to the minimal one.
+      const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist);
+      dist_min = _mm_min_epi16(dist_min, dist);
+      // Select j in the lanes where this centroid is strictly closer.
+      const __m128i ind1 = _mm_set1_epi16(j);
+      ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1));
+    }
+    if (total_dist) {
+      // Square, convert to 32 bit and add together.
+      dist_min = _mm_madd_epi16(dist_min, dist_min);
+      // Convert to 64 bit and add to sum.
+      const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+      const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+      sum = _mm_add_epi64(sum, dist1);
+      sum = _mm_add_epi64(sum, dist2);
+    }
+    // Narrow the eight 16-bit indices to bytes and store the low 8 bytes.
+    __m128i p2 = _mm_packus_epi16(ind, v_zero);
+    _mm_storel_epi64((__m128i *)indices, p2);
+    indices += 8;
+    data += 8;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_sse2(sum);
+  }
+}
+
+// Assigns each of n 2-D points to its nearest centroid (smallest squared
+// Euclidean distance) and optionally accumulates the total squared distance.
+//   data:       interleaved (x, y) 16-bit pairs; 8 points per outer iteration,
+//               loaded as two groups of four.
+//   centroids:  k interleaved (x, y) pairs; k must not exceed
+//               PALETTE_MAX_SIZE, the size of the local centroid table.
+//   indices:    receives one 8-bit centroid index per point.
+//   total_dist: if non-NULL, receives the sum of squared distances.
+// Whole 8-point groups are always loaded and stored, so the buffers must
+// cover n rounded up to a multiple of 8. On a tie the lower-numbered centroid
+// wins because the comparison is strict.
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids,
+                                uint8_t *indices, int64_t *total_dist, int n,
+                                int k) {
+  const __m128i v_zero = _mm_setzero_si128();
+  __m128i sum = _mm_setzero_si128();
+  __m128i ind[2];
+  __m128i cents[PALETTE_MAX_SIZE];
+  for (int j = 0; j < k; ++j) {
+    const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+    cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx);
+  }
+
+  for (int i = 0; i < n; i += 8) {
+    for (int l = 0; l < 2; ++l) {
+      // Keep the const qualifier on the source pointer when loading.
+      const __m128i in = _mm_loadu_si128((const __m128i *)data);
+      ind[l] = _mm_setzero_si128();
+      // Compute the distance to the first centroid.
+      // madd yields dx * dx + dy * dy as one 32-bit value per point.
+      __m128i d1 = _mm_sub_epi16(in, cents[0]);
+      __m128i dist_min = _mm_madd_epi16(d1, d1);
+
+      for (int j = 1; j < k; ++j) {
+        // Compute the distance to the centroid.
+        d1 = _mm_sub_epi16(in, cents[j]);
+        const __m128i dist = _mm_madd_epi16(d1, d1);
+        // Compare to the minimal one.
+        // The 32-bit minimum is built from and/andnot/or on the compare mask.
+        const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist);
+        const __m128i dist1 = _mm_andnot_si128(cmp, dist_min);
+        const __m128i dist2 = _mm_and_si128(cmp, dist);
+        dist_min = _mm_or_si128(dist1, dist2);
+        const __m128i ind1 = _mm_set1_epi32(j);
+        ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]),
+                              _mm_and_si128(cmp, ind1));
+      }
+      if (total_dist) {
+        // Convert to 64 bit and add to sum.
+        const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+        const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+        sum = _mm_add_epi64(sum, dist1);
+        sum = _mm_add_epi64(sum, dist2);
+      }
+      data += 8;
+    }
+    // Cast to 8 bit and store.
+    // The indices are 32-bit lanes, so each looks like a 16-bit pair
+    // (index, 0). The first 16-bit pack turns every pair into two bytes, which
+    // leaves eight 16-bit indices; the second pack narrows those to bytes.
+    const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]);
+    const __m128i d3 = _mm_packus_epi16(d2, v_zero);
+    _mm_storel_epi64((__m128i *)indices, d3);
+    indices += 8;
+  }
+  if (total_dist) {
+    *total_dist = k_means_horizontal_sum_sse2(sum);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 0000000000..75c5172f85
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Clears 64 bytes starting at qcoeff using two unaligned 256-bit stores.
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  __m256i *dst = (__m256i *)qcoeff;
+  const __m256i zeros = _mm256_setzero_si256();
+  _mm256_storeu_si256(dst, zeros);
+  _mm256_storeu_si256(dst + 1, zeros);
+}
+
+// Builds a 256-bit parameter vector from *p: the low half is *p unchanged, the
+// high half is the upper 64 bits of *p repeated twice.
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i src = *p;
+  const __m128i upper_dup = _mm_unpackhi_epi64(src, src);
+  __m256i packed = _mm256_castsi128_si256(src);
+  packed = _mm256_insertf128_si256(packed, upper_dup, 1);
+  *qp = packed;
+}
+
+// Prepares the quantizer constants for the 16-bit fast path.
+//   qp[0]: round, divided by 2^log_scale with rounding when log_scale > 0.
+//   qp[1]: quant, doubled when log_scale == 1.
+//   qp[2]: dequant.
+//   *thr:  (dequant >> (1 + log_scale)) - 1, the zero-bin threshold.
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  const __m128i quant_v = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant_v = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  __m128i round_v = _mm_loadu_si128((const __m128i *)round_ptr);
+
+  if (log_scale > 0) {
+    // (round + 2^(log_scale - 1)) >> log_scale
+    const __m128i half = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round_v = _mm_srai_epi16(_mm_add_epi16(round_v, half), log_scale);
+  }
+
+  init_one_qp(&round_v, &qp[0]);
+  init_one_qp(&quant_v, &qp[1]);
+  init_one_qp(&dequant_v, &qp[2]);
+
+  if (log_scale == 1) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  // Subtracting 1 from the threshold lets the zbin mask be computed with a
+  // single greater-than compare, without a separate _mm256_cmpeq_epi16().
+  const __m256i half_dequant = _mm256_srai_epi16(qp[2], 1 + log_scale);
+  *thr = _mm256_sub_epi16(half_dequant, _mm256_set1_epi16(1));
+}
+
+// Copies the upper 128-bit half of *thr and of each qp[0..2] over its lower
+// half, so both halves hold the values that init_one_qp() placed in the
+// upper half.
+static INLINE void update_qp(__m256i *thr, __m256i *qp) {
+  for (int i = 0; i < 3; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+  *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11);
+}
+
+// Loads sixteen 32-bit coefficients and narrows them to 16 bits with signed
+// saturation. Both loads are aligned loads, so coeff_ptr must be 32-byte
+// aligned. The pack works per 128-bit half, so the result holds the
+// coefficients in the order 0-3, 8-11, 4-7, 12-15.
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  // Keep the const qualifier on the source pointer when loading.
+  const __m256i coeff1 = _mm256_load_si256((const __m256i *)coeff_ptr);
+  const __m256i coeff2 = _mm256_load_si256((const __m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+// Sign-extends sixteen 16-bit values to 32 bits and writes them with two
+// aligned 256-bit stores at coeff_ptr and coeff_ptr + 8. The unpacks work per
+// 128-bit half, so 16-bit lanes 0-3 and 8-11 go to the first store and lanes
+// 4-7 and 12-15 to the second.
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  // An arithmetic shift by 15 replicates the sign bit across each lane.
+  const __m256i sign_bits = _mm256_srai_epi16(coeff_vals, 15);
+  const __m256i first_eight = _mm256_unpacklo_epi16(coeff_vals, sign_bits);
+  const __m256i second_eight = _mm256_unpackhi_epi16(coeff_vals, sign_bits);
+  _mm256_store_si256((__m256i *)coeff_ptr, first_eight);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), second_eight);
+}
+
+// Returns the maximum of the sixteen 16-bit lanes of eob.
+// There is no horizontal-max instruction, so the lanes are turned into
+// INT16_MAX - lane (saturating at zero), the horizontal minimum of those is
+// taken with _mm_minpos_epu16, and the result is turned back.
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ // Fold the two 128-bit halves first.
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ // _mm_minpos_epu16 leaves the minimum value in lane 0.
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
+// Returns the signed maximum of the sixteen 16-bit lanes of eob256, using a
+// shuffle-and-max reduction (16 lanes -> 8 -> 4 -> 2 -> 1).
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ // Bring the upper 64 bits down and fold.
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ // Bring 16-bit lanes 2-3 down to lanes 0-1 and fold.
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ // Swap lanes 0 and 1 and fold; both lanes now hold the overall maximum.
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ // Lane 1 carries the same value as lane 0 after the final fold.
+ return _mm_extract_epi16(eob, 1);
+}
+
+// Quantizes the sixteen 16-bit coefficients at coeff_ptr, stores the quantized
+// and dequantized values at qcoeff_ptr / dqcoeff_ptr, and folds (iscan + 1)
+// of every non-zero result into the per-lane maximum in *eob.
+static AOM_FORCE_INLINE void quantize_lp_16_first(
+    const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+    int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+    __m256i *dequant256, __m256i *eob) {
+  const __m256i src = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i mag = _mm256_abs_epi16(src);
+
+  // |q| = high 16 bits of (|c| + round) * quant, with a saturating add.
+  const __m256i q_mag =
+      _mm256_mulhi_epi16(_mm256_adds_epi16(mag, *round256), *quant256);
+  const __m256i q = _mm256_sign_epi16(q_mag, src);
+  const __m256i dq = _mm256_mullo_epi16(q, *dequant256);
+  const __m256i is_nz = _mm256_cmpgt_epi16(q_mag, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  // is_nz is -1 on non-zero lanes, so the subtraction adds one there and the
+  // AND clears every other lane.
+  const __m256i scan_pos = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+  const __m256i candidate =
+      _mm256_and_si256(_mm256_sub_epi16(scan_pos, is_nz), is_nz);
+  *eob = _mm256_max_epi16(*eob, candidate);
+}
+
+// Same computation as quantize_lp_16_first(), applied to the sixteen
+// coefficients that start n_coeffs elements into coeff_ptr, iscan_ptr,
+// qcoeff_ptr and dqcoeff_ptr.
+static AOM_FORCE_INLINE void quantize_lp_16(
+    const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
+    int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
+    __m256i *quant256, __m256i *dequant256, __m256i *eob) {
+  const __m256i src =
+      _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
+  const __m256i mag = _mm256_abs_epi16(src);
+
+  // |q| = high 16 bits of (|c| + round) * quant, with a saturating add.
+  const __m256i q_mag =
+      _mm256_mulhi_epi16(_mm256_adds_epi16(mag, *round256), *quant256);
+  const __m256i q = _mm256_sign_epi16(q_mag, src);
+  const __m256i dq = _mm256_mullo_epi16(q, *dequant256);
+  const __m256i is_nz = _mm256_cmpgt_epi16(q_mag, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), q);
+  _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dq);
+
+  // is_nz is -1 on non-zero lanes, so the subtraction adds one there and the
+  // AND clears every other lane.
+  const __m256i scan_pos =
+      _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs));
+  const __m256i candidate =
+      _mm256_and_si256(_mm256_sub_epi16(scan_pos, is_nz), is_nz);
+  *eob = _mm256_max_epi16(*eob, candidate);
+}
+
+// Quantizes n_coeffs 16-bit coefficients sixteen at a time, writing qcoeff_ptr
+// and dqcoeff_ptr and storing the end-of-block position in *eob_ptr. scan is
+// unused. round_ptr, quant_ptr and dequant_ptr are read with aligned 128-bit
+// loads, so they must be 16-byte aligned. The first sixteen coefficients are
+// always processed, whatever the value of n_coeffs.
+void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ __m256i eob256 = _mm256_setzero_si256();
+
+ // Setup global values.
+ // The cast leaves the upper 128 bits undefined; they are filled in below.
+ __m256i round256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ __m256i quant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ __m256i dequant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+
+ // Populate upper AC values.
+ // 0x54 keeps the low 128 bits and fills the upper half with two copies of
+ // 64-bit element 1 (source elements 4-7).
+ round256 = _mm256_permute4x64_epi64(round256, 0x54);
+ quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+ dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+
+ // Process DC and the first 15 AC coeffs.
+ quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+
+ if (n_coeffs > 16) {
+ // Overwrite the DC constants with AC constants
+ // 0x31 selects the upper 128-bit half for both halves of the result.
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+ // AC only loop.
+ for (int idx = 16; idx < n_coeffs; idx += 16) {
+ quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+ }
+ }
+
+ *eob_ptr = accumulate_eob256(eob256);
+}
+
+// Returns the per-lane maximum of v_eobmax and (iscan + 1) on the lanes that
+// v_mask marks with all-ones. The sixteen scan positions are first reordered
+// with a 64-bit permute (0xD8: elements 0-3, 8-11, 4-7, 12-15), which matches
+// the order that load_coefficients_avx2() produces.
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
+  const __m256i scan_pos = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+  // v_mask is -1 on selected lanes, so the subtraction adds one there and the
+  // AND clears every other lane.
+  const __m256i candidate =
+      _mm256_and_si256(_mm256_sub_epi16(scan_pos, v_mask), v_mask);
+  return _mm256_max_epi16(v_eobmax, candidate);
+}
+
+// Quantizes sixteen coefficients with the 16-bit fast path.
+//   thr: per-lane threshold; if no |coeff| exceeds it, the group is written
+//        out as zeros and *eob is left untouched.
+//   qp:  qp[0] round, qp[1] quant, qp[2] dequant (16-bit lanes).
+// Outputs are sign-extended back to 32 bits by store_coefficients_avx2().
+static AOM_FORCE_INLINE void quantize_fp_16(
+ const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ // nzflag is non-zero when at least one lane is above the threshold.
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ // |q| = high 16 bits of (|c| + round) * quant (signed multiply), with a
+ // saturating add.
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]);
+ const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+// Quantizes n_coeffs coefficients sixteen at a time with log_scale fixed at 0,
+// writing qcoeff_ptr and dqcoeff_ptr and storing the end-of-block position in
+// *eob_ptr. zbin_ptr, quant_shift_ptr and scan_ptr are unused. The first
+// sixteen coefficients are always processed, whatever the value of n_coeffs.
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan_ptr;
+
+  const int log_scale = 0;
+  const intptr_t group = 16;
+  __m256i thr;
+  __m256i qp[3];
+  __m256i eob = _mm256_setzero_si256();
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  // First group: uses the constants exactly as init_qp() laid them out.
+  quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  // Remaining groups: both halves of each constant now hold the upper half.
+  update_qp(&thr, qp);
+  for (intptr_t pos = group; pos < n_coeffs; pos += group) {
+    quantize_fp_16(&thr, qp, coeff_ptr + pos, iscan_ptr + pos,
+                   qcoeff_ptr + pos, dqcoeff_ptr + pos, &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
+
+// Quantizes one group of 16 coefficients for the log_scale 1 ("32x32")
+// variant of the fp quantizer. Same contract as the log_scale 0 helper,
+// except that the quantizer multiply is unsigned and the dequantized
+// magnitude is halved before the sign is re-applied.
+static AOM_FORCE_INLINE void quantize_fp_32x32(
+    const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+    const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    __m256i *eob) {
+  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i v_abs = _mm256_abs_epi16(v_coeff);
+  const __m256i v_above_thr = _mm256_cmpgt_epi16(v_abs, *thr);
+
+  // No lane exceeds the threshold: the whole group quantizes to zero.
+  if (_mm256_movemask_epi8(v_above_thr) == 0) {
+    write_zero(qcoeff_ptr);
+    write_zero(dqcoeff_ptr);
+    return;
+  }
+
+  // |q| = high 16 bits of the unsigned product (|coeff| + qp[0]) * qp[1].
+  const __m256i v_rounded = _mm256_adds_epi16(v_abs, qp[0]);
+  const __m256i v_abs_q = _mm256_mulhi_epu16(v_rounded, qp[1]);
+  const __m256i v_q = _mm256_sign_epi16(v_abs_q, v_coeff);
+  // |dq| = (|q| * qp[2]) >> 1, computed on the low 16 bits of the product.
+  const __m256i v_abs_dq =
+      _mm256_srli_epi16(_mm256_mullo_epi16(v_abs_q, qp[2]), 1);
+  const __m256i v_nonzero =
+      _mm256_cmpgt_epi16(v_abs_q, _mm256_setzero_si256());
+  const __m256i v_dq = _mm256_sign_epi16(v_abs_dq, v_coeff);
+
+  store_coefficients_avx2(v_q, qcoeff_ptr);
+  store_coefficients_avx2(v_dq, dqcoeff_ptr);
+
+  *eob = get_max_lane_eob(iscan_ptr, *eob, v_nonzero);
+}
+
+// AVX2 "fp" quantizer, log_scale 1. Processes n_coeffs coefficients in
+// groups of 16 and writes the resulting end-of-block position to *eob_ptr.
+// zbin_ptr, quant_shift_ptr and scan_ptr are accepted for signature
+// compatibility only and are not read.
+void av1_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  const int log_scale = 1;
+  const unsigned int step = 16;
+  __m256i qp[3], thr;
+  __m256i eob = _mm256_setzero_si256();
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  // The first group uses the parameters exactly as init_qp() produced them;
+  // update_qp() is applied once before all remaining groups.
+  quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                    &eob);
+  update_qp(&thr, qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+
+    quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                      &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
+
+// Quantizes one group of 16 coefficients for the log_scale 2 ("64x64")
+// variant of the fp quantizer. A group in which no |coeff| exceeds *thr is
+// written out as zeros; otherwise q/dq are stored and *eob is updated with
+// the per-lane end-of-block maximum. qp[] is presumably round / quant /
+// dequant as prepared by init_qp() -- verify against that helper.
+static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ // Rounded magnitude, forced to zero in lanes that did not pass the
+ // threshold test.
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask);
+ // abs_q = low 16 bits of ((tmp_rnd * qp[1]) >> 14): the high half of the
+ // 32-bit product shifted left by 2, OR-ed with the top two bits of the low
+ // half.
+ const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2);
+ const __m256i ql =
+ _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14);
+ const __m256i abs_q = _mm256_or_si256(qh, ql);
+ // abs_dq = low 16 bits of ((abs_q * qp[2]) >> 2), spliced together from
+ // the high and low product halves in the same way.
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2);
+ const __m256i abs_dq = _mm256_or_si256(dqh, dql);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi16(abs_dq, coeff);
+ // Check the signed q/dq value here instead of the absolute value. When
+ // dequant equals 4, the dequant threshold (*thr) becomes 0 after being
+ // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the
+ // abs_coeff is 0, the nzflag will be set. As a result, the eob will be
+ // incorrectly calculated. The psign instruction corrects the error by
+ // zeroing out q/dq if coeff is zero.
+ // nz_mask is all-ones in every lane whose dequantized value is non-zero.
+ const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256());
+ const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+// AVX2 "fp" quantizer, log_scale 2. Processes n_coeffs coefficients in
+// groups of 16 and writes the resulting end-of-block position to *eob_ptr.
+// zbin_ptr, quant_shift_ptr and scan_ptr are accepted for signature
+// compatibility only and are not read.
+void av1_quantize_fp_64x64_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  const int log_scale = 2;
+  const unsigned int step = 16;
+  __m256i qp[3], thr;
+  __m256i eob = _mm256_setzero_si256();
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  // The first group uses the parameters exactly as init_qp() produced them;
+  // update_qp() is applied once before all remaining groups.
+  quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                    &eob);
+  update_qp(&thr, qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+
+    quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                      &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 0000000000..b533894015
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+// Loads the 16 coefficients at coeff[offset] into two vectors of eight
+// 16-bit lanes each. When tran_low_t is 32 bits wide the values are narrowed
+// with signed saturation. The address must be 16-byte aligned.
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m128i *c0, __m128i *c1) {
+  const __m128i *src = (const __m128i *)(coeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    // Already 16-bit: two straight loads.
+    *c0 = _mm_load_si128(src);
+    *c1 = _mm_load_si128(src + 1);
+    return;
+  }
+
+  // 32-bit storage: four loads, packed pairwise down to 16 bits.
+  const __m128i w0 = _mm_load_si128(src);
+  const __m128i w1 = _mm_load_si128(src + 1);
+  const __m128i w2 = _mm_load_si128(src + 2);
+  const __m128i w3 = _mm_load_si128(src + 3);
+  *c0 = _mm_packs_epi32(w0, w1);
+  *c1 = _mm_packs_epi32(w2, w3);
+}
+
+// Stores the 16 16-bit values held in *qc0 / *qc1 to qcoeff[offset]. When
+// tran_low_t is 32 bits wide each value is sign-extended first. The address
+// must be 16-byte aligned.
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+                                tran_low_t *qcoeff, intptr_t offset) {
+  __m128i *dst = (__m128i *)(qcoeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    _mm_store_si128(dst, *qc0);
+    _mm_store_si128(dst + 1, *qc1);
+    return;
+  }
+
+  // Sign-extend by interleaving each word with its sign mask (0 or 0xffff).
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i sign0 = _mm_cmplt_epi16(*qc0, zero);
+  const __m128i lo0 = _mm_unpacklo_epi16(*qc0, sign0);
+  const __m128i hi0 = _mm_unpackhi_epi16(*qc0, sign0);
+  _mm_store_si128(dst, lo0);
+  _mm_store_si128(dst + 1, hi0);
+
+  const __m128i sign1 = _mm_cmplt_epi16(*qc1, zero);
+  const __m128i lo1 = _mm_unpacklo_epi16(*qc1, sign1);
+  const __m128i hi1 = _mm_unpackhi_epi16(*qc1, sign1);
+  _mm_store_si128(dst + 2, lo1);
+  _mm_store_si128(dst + 3, hi1);
+}
+
+// Zeroes the 16 coefficients at qcoeff[offset]. The address must be 16-byte
+// aligned.
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+  __m128i *dst = (__m128i *)(qcoeff + offset);
+  const __m128i zero = _mm_setzero_si128();
+  // 16 coefficients span four vectors with 32-bit storage, two otherwise.
+  const int num_vectors = (sizeof(tran_low_t) == 4) ? 4 : 2;
+  for (int i = 0; i < num_vectors; ++i) {
+    _mm_store_si128(dst + i, zero);
+  }
+}
+
+// Quantizes the 16 coefficients at index n_coeffs (relative to the supplied
+// base pointers; the caller passes end-biased pointers and a negative index).
+// The *0 parameters apply to the first eight lanes and the *1 parameters to
+// the last eight. If no |coeff| reaches its threshold the group is written
+// out as zeros; otherwise qcoeff/dqcoeff are stored and *eob is raised to the
+// largest (iscan + 1) among lanes whose dequantized value is non-zero.
+static INLINE void quantize(const int16_t *iscan_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ const __m128i *thr0, const __m128i *thr1,
+ __m128i *eob) {
+ __m128i coeff0, coeff1;
+ // Load this group of 16 coefficients (the DC and first 15 AC on the
+ // caller's first invocation, AC only afterwards).
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ // abs(x) = (x ^ sign) - sign, where sign is 0 or -1 per lane.
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ // mask = (|coeff| >= thr), built from greater-than OR equal.
+ const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+ _mm_cmpeq_epi16(qcoeff0, *thr0));
+ const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+ _mm_cmpeq_epi16(qcoeff1, *thr1));
+ const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+ if (nzflag) {
+ // |q| = high 16 bits of (|coeff| + round) * quant; the add saturates.
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ // Dequantize: coeff0/coeff1 are reused to hold q * dequant (low 16 bits).
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ // nzero_coeff* is all-ones in lanes whose dequantized value is non-zero.
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ // (subtracting the -1 mask adds one in exactly the non-zero lanes).
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
+ }
+}
+
+// SSE2 "fp" quantizer. Processes n_coeffs coefficients in groups of 16 and
+// writes the end-of-block position to *eob_ptr. zbin_ptr, quant_shift_ptr
+// and scan_ptr are accepted for signature compatibility only and are not
+// read. round_ptr, quant_ptr and dequant_ptr are read with aligned 128-bit
+// loads.
+void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  // Bias every pointer to the end of its buffer and walk a negative index
+  // up towards zero.
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // The "0" vectors hold the parameters exactly as loaded; the "1" vectors
+  // replicate the upper 64 bits of the load into both halves.
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  // Zeroing threshold: half of the dequant value.
+  const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+  const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
+  __m128i eob = _mm_setzero_si128();
+
+  // First group: lanes 0-7 use the "0" parameters, lanes 8-15 the "1" ones.
+  quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+           &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+  // Remaining groups: "1" parameters for all sixteen lanes.
+  for (n_coeffs += 8 * 2; n_coeffs < 0; n_coeffs += 8 * 2) {
+    quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+             &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+             &eob);
+  }
+
+  // Horizontal maximum across the eight lanes: fold 8 -> 4 -> 2 -> 1.
+  __m128i eob_folded = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_folded);
+  eob_folded = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_folded);
+  eob_folded = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_folded);
+  *eob_ptr = _mm_extract_epi16(eob, 1);
+}
+
+// Low-precision (16-bit storage) counterpart of the fp quantize step:
+// quantizes the 16 int16_t coefficients at index n_coeffs (relative to the
+// supplied base pointers; the caller passes end-biased pointers and a
+// negative index). There is no threshold test here: every group is
+// quantized, stored, and folded into *eob. The *0 parameters apply to the
+// first eight lanes and the *1 parameters to the last eight. All loads and
+// stores are aligned 128-bit accesses.
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+ const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ __m128i *eob) {
+ const int16_t *read = coeff_ptr + n_coeffs;
+ __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+ // Poor man's sign extract
+ // abs(x) = (x ^ sign) - sign, where sign is 0 or -1 per lane.
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ // |q| = high 16 bits of (|coeff| + round) * quant; the add saturates.
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ int16_t *addr = qcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, qcoeff0);
+ _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+ // Dequantize: coeff0/coeff1 are reused to hold q * dequant (low 16 bits).
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ addr = dqcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, coeff0);
+ _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ // nzero_coeff* is all-ones in lanes whose dequantized value is non-zero.
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+ // Add one to convert from indices to counts
+ // (subtracting the -1 mask adds one in exactly the non-zero lanes).
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+}
+
+// SSE2 low-precision quantizer operating on int16_t coefficient buffers.
+// Processes n_coeffs coefficients in groups of 16 and writes the
+// end-of-block position to *eob_ptr. `scan` is unused. round_ptr, quant_ptr
+// and dequant_ptr are read with aligned 128-bit loads.
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+
+  // Bias every pointer to the end of its buffer and walk a negative index
+  // up towards zero.
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // The "0" vectors hold the parameters exactly as loaded; the "1" vectors
+  // replicate the upper 64 bits of the load into both halves.
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  __m128i eob = _mm_setzero_si128();
+
+  // First group: lanes 0-7 use the "0" parameters, lanes 8-15 the "1" ones.
+  quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+              &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+
+  // Remaining groups: "1" parameters for all sixteen lanes.
+  for (n_coeffs += 8 * 2; n_coeffs < 0; n_coeffs += 8 * 2) {
+    quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+                &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+  }
+
+  // Reduce the per-lane maxima to a single value.
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ad4ae274e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; QUANTIZE_FP name, num_gprs
+;   Emits quantize_<name> (with the x86inc private prefix) for the active
+;   INIT_XMM instruction set. <name> is "fp" or "fp_32x32"; the latter halves
+;   the rounding value (rounding up), doubles quant, and halves the
+;   dequantized magnitude.
+;
+;   Register roles once set up:
+;     coeffq/r3q/r4q/r5q  end-biased coeff / qcoeff / dqcoeff / iscan pointers
+;     ncoeffq             negative coefficient index, counts up to zero
+;     m0  skip threshold for the AC loop (AC dequant >> 1, or >> 2 for 32x32)
+;     m1  round   m2  quant   m3  dequant   m5  zero   m8  running eob maxima
+;
+;   Coefficient buffers are accessed as 16-bit words with aligned (mova)
+;   loads and stores, 16 coefficients per iteration.
+;
+;   NOTE(review): the named-argument list contains `skip`, which does not
+;   appear in the C prototypes of the av1_quantize_fp_* functions elsewhere in
+;   this change; confirm the argument layout before wiring this up.
+;-----------------------------------------------------------------------------
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp         dword skipm, 0
+  jne         .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn   coeffq, coeffmp
+  movifnidn   ncoeffq, ncoeffmp
+  mov         r2, dequantmp
+  movifnidn   zbinq, zbinmp
+  movifnidn   roundq, roundmp
+  movifnidn   quantq, quantmp
+  mova        m1, [roundq]                ; m1 = round
+  mova        m2, [quantq]                ; m2 = quant
+%ifidn %1, fp_32x32
+  pcmpeqw     m5, m5
+  psrlw       m5, 15                      ; m5 = 1 in every word
+  paddw       m1, m5
+  psrlw       m1, 1                       ; m1 = (m1 + 1) / 2
+%endif
+  mova        m3, [r2q]                   ; m3 = dequant
+  mov         r3, qcoeffmp
+  mov         r4, dqcoeffmp
+  mov         r5, iscanmp
+%ifidn %1, fp_32x32
+  psllw       m2, 1
+%endif
+  pxor        m5, m5                      ; m5 = dedicated zero
+
+  ; bias all pointers to the end of their buffers; ncoeffq becomes a
+  ; negative index that counts up to zero
+  lea         coeffq, [coeffq+ncoeffq*2]
+  lea         r5q, [r5q+ncoeffq*2]
+  lea         r3q, [r3q+ncoeffq*2]
+  lea         r4q, [r4q+ncoeffq*2]
+  neg         ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9 = c[i]
+  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i]
+  pabsw       m6, m9                      ; m6 = abs(m9)
+  pabsw       m11, m10                    ; m11 = abs(m10)
+  pcmpeqw     m7, m7                      ; m7 = -1 in every word
+
+  paddsw      m6, m1                      ; m6 += round
+  punpckhqdq  m1, m1                      ; keep only the upper-half values
+  paddsw      m11, m1                     ; m11 += round
+  pmulhw      m8, m6, m2                  ; m8 = m6*q>>16
+  punpckhqdq  m2, m2
+  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
+  psignw      m8, m9                      ; m8 = reinsert sign
+  psignw      m13, m10                    ; m13 = reinsert sign
+  mova        [r3q+ncoeffq*2+ 0], m8
+  mova        [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+  pabsw       m8, m8
+  pabsw       m13, m13
+%endif
+  pmullw      m8, m3                      ; r4[i] = r3[i] * q
+  punpckhqdq  m3, m3
+  pmullw      m13, m3                     ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+  psrlw       m8, 1
+  psrlw       m13, 1
+  psignw      m8, m9
+  psignw      m13, m10
+  psrlw       m0, m3, 2                   ; m0 = AC skip threshold
+%else
+  psrlw       m0, m3, 1                   ; m0 = AC skip threshold
+%endif
+  mova        [r4q+ncoeffq*2+ 0], m8
+  mova        [r4q+ncoeffq*2+16], m13
+  pcmpeqw     m8, m5                      ; m8 = c[i] == 0
+  pcmpeqw     m13, m5                     ; m13 = c[i] == 0
+  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6 = scan[i]
+  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = scan[i]
+  psubw       m6, m7                      ; m6 = scan[i] + 1
+  psubw       m11, m7                     ; m11 = scan[i] + 1
+  pandn       m8, m6                      ; m8 = max(eob)
+  pandn       m13, m11                    ; m13 = max(eob)
+  pmaxsw      m8, m13
+  add         ncoeffq, mmsize
+  jz          .accumulate_eob
+
+.ac_only_loop:
+  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9 = c[i]
+  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i]
+  pabsw       m6, m9                      ; m6 = abs(m9)
+  pabsw       m11, m10                    ; m11 = abs(m10)
+
+  ; skip the whole group when no magnitude exceeds the threshold in m0
+  pcmpgtw     m7, m6, m0
+  pcmpgtw     m12, m11, m0
+  pmovmskb    r6d, m7
+  pmovmskb    r2d, m12
+
+  or          r6, r2
+  jz          .skip_iter
+
+  pcmpeqw     m7, m7                      ; m7 = -1 in every word
+
+  paddsw      m6, m1                      ; m6 += round
+  paddsw      m11, m1                     ; m11 += round
+  pmulhw      m14, m6, m2                 ; m14 = m6*q>>16
+  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
+  psignw      m14, m9                     ; m14 = reinsert sign
+  psignw      m13, m10                    ; m13 = reinsert sign
+  mova        [r3q+ncoeffq*2+ 0], m14
+  mova        [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+  pabsw       m14, m14
+  pabsw       m13, m13
+%endif
+  pmullw      m14, m3                     ; r4[i] = r3[i] * q
+  pmullw      m13, m3                     ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+  psrlw       m14, 1
+  psrlw       m13, 1
+  psignw      m14, m9
+  psignw      m13, m10
+%endif
+  mova        [r4q+ncoeffq*2+ 0], m14
+  mova        [r4q+ncoeffq*2+16], m13
+  pcmpeqw     m14, m5                     ; m14 = c[i] == 0
+  pcmpeqw     m13, m5                     ; m13 = c[i] == 0
+  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6 = scan[i]
+  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = scan[i]
+  psubw       m6, m7                      ; m6 = scan[i] + 1
+  psubw       m11, m7                     ; m11 = scan[i] + 1
+  pandn       m14, m6                     ; m14 = max(eob)
+  pandn       m13, m11                    ; m13 = max(eob)
+  pmaxsw      m8, m14
+  pmaxsw      m8, m13
+  add         ncoeffq, mmsize
+  jl          .ac_only_loop
+
+  jmp         .accumulate_eob
+.skip_iter:
+  mova        [r3q+ncoeffq*2+ 0], m5
+  mova        [r3q+ncoeffq*2+16], m5
+  mova        [r4q+ncoeffq*2+ 0], m5
+  mova        [r4q+ncoeffq*2+16], m5
+  add         ncoeffq, mmsize
+  jl          .ac_only_loop
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov         r2, eobmp
+  pshufd      m7, m8, 0xe
+  pmaxsw      m8, m7
+  pshuflw     m7, m8, 0xe
+  pmaxsw      m8, m7
+  pshuflw     m7, m8, 0x1
+  pmaxsw      m8, m7
+  pextrw      r6, m8, 0
+  ; eob is a 16-bit value (the .blank path stores a word as well), so store
+  ; exactly one word; a full-register store would write past it.
+  mov         [r2], r6w
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov         r0, dqcoeffmp
+  movifnidn   ncoeffq, ncoeffmp
+  mov         r2, qcoeffmp
+  mov         r3, eobmp
+
+  lea         r0q, [r0q+ncoeffq*2]
+  lea         r2q, [r2q+ncoeffq*2]
+  neg         ncoeffq
+  pxor        m7, m7
+.blank_loop:
+  mova        [r0q+ncoeffq*2+ 0], m7
+  mova        [r0q+ncoeffq*2+16], m7
+  mova        [r2q+ncoeffq*2+ 0], m7
+  mova        [r2q+ncoeffq*2+16], m7
+  add         ncoeffq, mmsize
+  jl          .blank_loop
+  mov         word [r3q], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..618758105a
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+;   In:     xmm3 = eight source words (s), xmm4 = eight reference words (r)
+;   Accum:  xmm15 += s (saturating words), xmm14 += r (saturating words),
+;           xmm13 += s*s, xmm12 += r*r, xmm11 += s*r (pairwise dword sums)
+;   Clobb:  xmm1, xmm2, xmm3
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1: the four dwords are widened to qwords (by
+; interleaving with xmm0) and added together; the total ends up in the low
+; qword of %1.
+;   Requires xmm0 == 0. Clobbers xmm2.
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words: the eight words are widened
+; to dwords (by interleaving with xmm0), added pairwise, and then reduced with
+; SUM_ACROSS_Q.
+;   Requires xmm0 == 0. Clobbers xmm1 and xmm2.
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void av1_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; Accumulates, over a 16x16 block, the sums of s, r, s*s, r*r and s*r and
+; writes each one through the corresponding output pointer. s and r are read
+; with unaligned loads; sp and rp are the row strides in bytes.
+;
+; NOTE(review): each result is written with movd, i.e. only 32 bits are stored
+; per output; confirm that the C prototype declares 32-bit outputs.
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(av1_ssim_parms_16x16_sse2)
+sym(av1_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; The strides are C ints: only the low 32 bits of their argument slots
+    ; are defined, so sign-extend them instead of loading all 64 bits.
+    mov         rsi, arg(0)             ;s
+    movsxd      rcx, dword arg(1)       ;sp
+    mov         rdi, arg(2)             ;r
+    movsxd      rax, dword arg(3)       ;rp
+
+    pxor        xmm0, xmm0              ;zero, used for unpacking
+    pxor        xmm15,xmm15             ;sum_s
+    pxor        xmm14,xmm14             ;sum_r
+    pxor        xmm13,xmm13             ;sum_sq_s
+    pxor        xmm12,xmm12             ;sum_sq_r
+    pxor        xmm11,xmm11             ;sum_sxr
+
+    mov         rdx, 16                 ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu      xmm5, [rsi]
+    movdqu      xmm6, [rdi]
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm3, xmm0              ; high_s
+    punpckhbw   xmm4, xmm0              ; high_r
+
+    TABULATE_SSIM
+
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpcklbw   xmm3, xmm0              ; low_s
+    punpcklbw   xmm4, xmm0              ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx                ; next s row
+    add         rdi, rax                ; next r row
+
+    dec         rdx                     ; counter
+    jnz         .NextRow
+
+    ; reduce each accumulator to a single value in its low lane
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void av1_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; Accumulates, over an 8x8 block, the sums of s, r, s*s, r*r and s*r and
+; writes each one through the corresponding output pointer. Eight bytes are
+; read from s and r per row; sp and rp are the row strides in bytes.
+;
+; NOTE(review): each result is written with movd, i.e. only 32 bits are stored
+; per output; confirm that the C prototype declares 32-bit outputs.
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+globalsym(av1_ssim_parms_8x8_sse2)
+sym(av1_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; The strides are C ints: only the low 32 bits of their argument slots
+    ; are defined, so sign-extend them instead of loading all 64 bits.
+    mov         rsi, arg(0)             ;s
+    movsxd      rcx, dword arg(1)       ;sp
+    mov         rdi, arg(2)             ;r
+    movsxd      rax, dword arg(3)       ;rp
+
+    pxor        xmm0, xmm0              ;zero, used for unpacking
+    pxor        xmm15,xmm15             ;sum_s
+    pxor        xmm14,xmm14             ;sum_r
+    pxor        xmm13,xmm13             ;sum_sq_s
+    pxor        xmm12,xmm12             ;sum_sq_r
+    pxor        xmm11,xmm11             ;sum_sxr
+
+    mov         rdx, 8                  ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq        xmm3, [rsi]
+    movq        xmm4, [rdi]
+    punpcklbw   xmm3, xmm0              ; low_s
+    punpcklbw   xmm4, xmm0              ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx                ; next s row
+    add         rdi, rax                ; next r row
+
+    dec         rdx                     ; counter
+    jnz         .NextRow
+
+    ; reduce each accumulator to a single value in its low lane
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
new file mode 100644
index 0000000000..830f40ecb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Horizontally add the 16 signed 8-bit lanes of acc_diff and return the total
+// as a 32-bit integer.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+  // Duplicating every byte into both halves of a 16-bit lane and shifting
+  // right arithmetically by 8 sign-extends each int8 to int16.
+  const __m128i dup_lo = _mm_unpacklo_epi8(acc_diff, acc_diff);
+  const __m128i dup_hi = _mm_unpackhi_epi8(acc_diff, acc_diff);
+  const __m128i words_lo = _mm_srai_epi16(dup_lo, 8);
+  const __m128i words_hi = _mm_srai_epi16(dup_hi, 8);
+
+  // 8 x int16: lane i holds byte i + byte (i + 8).
+  const __m128i sum8 = _mm_add_epi16(words_lo, words_hi);
+
+  // Multiply-add against all-ones folds adjacent int16 pairs into 4 x int32.
+  const __m128i sum4 = _mm_madd_epi16(sum8, _mm_set1_epi16(1));
+
+  // Fold the upper 64 bits onto the lower 64, then the upper 32 onto the
+  // lowest 32-bit lane.
+  const __m128i sum2 = _mm_add_epi32(sum4, _mm_srli_si128(sum4, 8));
+  const __m128i sum1 = _mm_add_epi32(sum2, _mm_srli_si128(sum2, 4));
+
+  return _mm_cvtsi128_si32(sum1);
+}
+
+// Denoise a 16x1 vector.
+//
+// Reads 16 bytes from sig and from mc_running_avg_y, writes 16 filtered bytes
+// to running_avg_y, and returns acc_diff updated with the signed per-byte
+// adjustment that was applied (saturating int8 arithmetic).
+//
+// The constants are passed by pointer: k_0, k_4, k_8 and k_16 are the
+// thresholds compared against the clamped absolute difference, l3 is the
+// largest adjustment, l32 and l21 are the steps subtracted from l3 for the
+// two smaller levels. With d = min(|mc_running_avg_y - sig|, *k_16):
+//   d >= *k_16          -> adj = l3
+//   *k_8 <= d < *k_16   -> adj = l3 - l32
+//   *k_4 <= d < *k_8    -> adj = l3 - l32 - l21
+//   d < *k_4            -> adj = d
+// The output is sig moved by adj towards mc_running_avg_y.
+static INLINE __m128i av1_denoiser_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+ const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+ const __m128i *l21, __m128i acc_diff) {
+ // Calculate differences
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ // Unsigned saturating subtraction: in every lane at most one of pdiff and
+ // ndiff is non-zero, so their OR below is the absolute difference.
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF in lanes where pdiff is 0, i.e. where
+ // mc_running_avg_y <= sig.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+ // Clamp absolute difference to 16 to be used to get mask. Doing this
+ // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+ // Get masks for l2 l1 and l0 adjustments.
+ const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+ // Get adjustments for l2, l1, and l0.
+ __m128i adj2 = _mm_and_si128(mask2, *l32);
+ const __m128i adj1 = _mm_and_si128(mask1, *l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ // Combine the adjustments and get absolute adjustments.
+ // Lanes selected by mask0 take the raw difference (adj0) instead of a level.
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(*l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ // Restore the sign and get positive and negative adjustments.
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ // Calculate filtered value.
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Adjustments <=7, and each element in acc_diff can fit in signed
+ // char.
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter.
+//
+// Unlike av1_denoiser_16x1_sse2 this starts from the 16 bytes already stored
+// in running_avg_y and moves them by min(|mc_running_avg_y - sig|, k_delta)
+// in the direction of sig (subtracting where mc_running_avg_y > sig, adding
+// otherwise). The result is written back to running_avg_y. acc_diff is
+// updated with the same signed correction and returned.
+static INLINE __m128i av1_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+ __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+ // Calculate differences.
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF in lanes where pdiff is 0, i.e. where
+ // mc_running_avg_y <= sig.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ // Clamp absolute difference to delta to get the adjustment.
+ const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ // Restore the sign and get positive and negative adjustments.
+ __m128i padj, nadj;
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+ // Calculate filtered value.
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+ v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Accumulate the adjustments.
+ acc_diff = _mm_subs_epi8(acc_diff, padj);
+ acc_diff = _mm_adds_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+//
+// Two picture rows of `width` bytes are packed side by side into one 16-byte
+// scratch row so that each call of the 16x1 kernel filters a pair of rows.
+// The filtered pair is then copied back out to running_avg_y.
+// Returns FILTER_BLOCK when the accumulated adjustment stays within
+// total_adj_strong_thresh(bs, increase_denoising) (possibly after a second,
+// weaker pass), otherwise COPY_BLOCK.
+// NOTE(review): the scratch buffers are [8][16], so this relies on
+// width <= 8 and block_size_high[bs] <= 16 -- confirm every caller.
+static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ // Number of row pairs: each loop iteration below consumes two rows.
+ const int b_height = block_size_high[bs] >> 1;
+
+ for (r = 0; r < b_height; ++r) {
+ // Pack row 2r into bytes [0, width) and row 2r+1 into [width, 2*width).
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+ running_buffer[r], &k_0, &k_4, &k_8,
+ &k_16, &l3, &l32, &l21, acc_diff);
+ // Unpack the filtered pair back into the two destination rows.
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = sum_diff_16x1(acc_diff);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ // Before returning to copy the block (i.e., apply no denoising),
+ // check if we can still apply some (weaker) temporal filtering to
+ // this block, that would otherwise not be denoised at all. Simplest
+ // is to apply an additional adjustment to running_avg_y to bring it
+ // closer to sig. The adjustment is capped by a maximum delta, and
+ // chosen such that in most cases the resulting sum_diff will be
+ // within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ // Rewind the destination to the first row of the block; the packed
+ // scratch rows from the first pass are reused as the inputs.
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ acc_diff = av1_denoiser_adj_16x1_sse2(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+ k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+ width);
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = sum_diff_16x1(acc_diff);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+// Denoise 16x16 to 128x128 blocks.
+//
+// The block is processed row by row in 16-byte columns. acc_diff[c][g] holds
+// the saturating int8 accumulator for 16-byte column c and row group g
+// (g = r >> 4, i.e. 16 rows per group). After the last row of each group the
+// accumulators of that group are reduced with sum_diff_16x1() and added to
+// sum_diff. BLOCK_16X8 has only 8 rows, so its single group is reduced at
+// r == 7.
+// Returns FILTER_BLOCK when |sum_diff| stays within
+// total_adj_strong_thresh(bs, increase_denoising) (possibly after a second,
+// weaker pass), otherwise COPY_BLOCK.
+static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ // Indexed as [16-byte column][16-row group]; 8 x 8 covers up to 128x128.
+ __m128i acc_diff[8][8];
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4;
+
+ // Here r indexes the row group (second dimension), not a picture row.
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r] = _mm_setzero_si128();
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2(
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+ &l32, &l21, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ // End of a 16-row group (or of the 8-row BLOCK_16X8): reduce the group.
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ // The inner loop advanced each pointer by b_width; step to the next row.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ // The delta is the excess of |sum_diff| over the threshold, scaled down
+ // by num_pels_log2_lookup[bs], plus one.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ // Rewind all three pointers to the first row of the block.
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ // acc_diff keeps its first-pass contents; only the scalar total is
+ // recomputed from the corrected accumulators.
+ sum_diff = 0;
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] =
+ av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+ k_0, k_delta, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+// SSE2 entry point of the temporal denoiser. Picks the kernel matching the
+// block size; sizes handled by neither kernel are reported as COPY_BLOCK.
+int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Tested in the original order, which ranks block types by frequency.
+  const int use_big_kernel =
+      bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32;
+  if (use_big_kernel) {
+    return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+                                     avg, avg_stride, increase_denoising, bs,
+                                     motion_magnitude);
+  }
+
+  // 8-pixel-wide blocks go through the row-pairing kernel.
+  if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+                                       avg, avg_stride, increase_denoising, bs,
+                                       motion_magnitude, 8);
+  }
+
+  return COPY_BLOCK;
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000000..7a0f32898b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int col_num);
+
+// Transpose one 4x4 tile of 32-bit values. The four tile rows are read from
+// input[0], input[stride], input[2 * stride], input[3 * stride] and the four
+// transposed rows are written to the same offsets of output.
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+                                    __m128i *output) {
+  // Read every row before any store.
+  const __m128i row0 = input[0 * stride];
+  const __m128i row1 = input[1 * stride];
+  const __m128i row2 = input[2 * stride];
+  const __m128i row3 = input[3 * stride];
+
+  // First interleave: rows 0/2 and rows 1/3.
+  const __m128i lo_02 = _mm_unpacklo_epi32(row0, row2);
+  const __m128i hi_02 = _mm_unpackhi_epi32(row0, row2);
+  const __m128i lo_13 = _mm_unpacklo_epi32(row1, row3);
+  const __m128i hi_13 = _mm_unpackhi_epi32(row1, row3);
+
+  // Second interleave yields the columns of the original tile.
+  output[0 * stride] = _mm_unpacklo_epi32(lo_02, lo_13);
+  output[1 * stride] = _mm_unpackhi_epi32(lo_02, lo_13);
+  output[2 * stride] = _mm_unpacklo_epi32(hi_02, hi_13);
+  output[3 * stride] = _mm_unpackhi_epi32(hi_02, hi_13);
+}
+
+// Transpose a txfm_size x txfm_size block of 32-bit values.
+// The block is viewed as a grid of 4x4 tiles, each tile being four vertically
+// adjacent __m128i. Every tile is transposed internally and written to the
+// mirrored grid position, which transposes the whole block in a single pass.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+                                __m128i *output) {
+  // Number of __m128i per row, which is also the number of tiles per row.
+  const int vecs_per_row = txfm_size / 4;
+
+  for (int tile_row = 0; tile_row * 4 < txfm_size; ++tile_row) {
+    const __m128i *const src_row = input + tile_row * 4 * vecs_per_row;
+    for (int tile_col = 0; tile_col < vecs_per_row; ++tile_col) {
+      // Tile (tile_row, tile_col) lands at (tile_col, tile_row).
+      __m128i *const dst = output + tile_col * 4 * vecs_per_row + tile_row;
+      transpose_32_4x4(vecs_per_row, src_row + tile_col, dst);
+    }
+  }
+}
+
+// Butterfly on 32-bit lanes with scalar weights.
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+// w0 and w1 are broadcast with _mm_set1_epi32; each result is then passed
+// through av1_round_shift_32_sse4_1(x, bit).
+// out0 is assigned before in0 and in1 are read again for out1, so out0 must
+// not name the same variable as in0 or in1.
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+// Implemented by calling btf_32_sse4_1_type0 with the two weights and the two
+// inputs swapped, which flips the sign of out1 and leaves out0 unchanged.
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// Butterfly on 32-bit lanes with weights already broadcast into vectors.
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+// ww0 and ww1 are __m128i weight vectors. r is added to each sum before the
+// arithmetic right shift by bit (presumably the rounding offset for that
+// shift -- confirm at the call sites).
+// out0 is assigned before in0 and in1 are read again for out1, so out0 must
+// not name the same variable as in0 or in1.
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+// Implemented by calling btf_32_type0_sse4_1_new with the two weights and the
+// two inputs swapped, which flips the sign of out1 and leaves out0 unchanged.
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c
new file mode 100644
index 0000000000..ee93b3d5a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// This mask rearranges source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks which need
+// source pixels in the following order.
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded like mentioned below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying the masks, the source pixels will be in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+// consists of the 5 pixels needed for the 1st 5x5 block and
+// the first 3 pixels needed for the 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+// consists of the last 2 pixels needed for the 2nd 5x5 block and
+// the 5 pixels needed for the 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+ { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearranges the weights to match the shuffled source pixel order.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+ { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2.
+// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and
+// filter_height are equal to 2. So rearranging the weights in the
+// order shown below to match source pixels. Basically this mask replicates
+// the weights across the width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer_1_and_2[2][8]) = {
+ { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be jumbled. In order to store register into
+// output buffer in a proper way, the following mask is applied on output
+// register.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+//
+// layer_config_weights: weight array; 25 values are read starting at index
+//                       off, spaced cstep apart, in row-major 5x5 order.
+// weight:               scratch 5x8 array; columns 0..4 of every row are
+//                       overwritten, columns 5..7 are left as the caller
+//                       set them.
+// shuffle_weight:       receives 10 vectors: [0..4] are the five rows
+//                       permuted by weight_mask_0, [5..9] the same rows
+//                       permuted by weight_mask_1.
+static INLINE void prepare_weights_for_5x5_convolve(
+ const float *layer_config_weights, int off, float weight[5][8],
+ const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0,
+ const __m256i weight_mask_1) {
+ for (int row = 0; row < 5; ++row) {
+ for (int col = 0; col < 5; ++col) {
+ weight[row][col] = layer_config_weights[off];
+ off += cstep;
+ }
+ }
+ shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
+ shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
+ shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
+ shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
+ shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
+
+ shuffle_weight[0] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0);
+ shuffle_weight[1] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0);
+ shuffle_weight[2] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0);
+ shuffle_weight[3] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0);
+ shuffle_weight[4] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0);
+ // The second set is permuted from the already-permuted first set. This
+ // gives the same result as permuting the raw rows only if weight_mask_0
+ // keeps lanes 0..4 in place and weight_mask_1 selects only lanes 0..4.
+ shuffle_weight[5] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1);
+ shuffle_weight[6] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1);
+ shuffle_weight[7] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1);
+ shuffle_weight[8] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1);
+ shuffle_weight[9] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1);
+}
+
+// For each row, loads source pixels 0 to 7(load_src_0), 7 to 14(load_src_1) and
+// arranges them appropriately to process 3 blocks.
+//
+// Takes no arguments: it uses the following names from the expanding scope:
+// input_ptr (advanced by in_stride per row, 5 rows in total), in_stride,
+// load_src_0/1, block0_1, block1_2, shuffle_weight[0..9] and the running
+// sums accum_src_0/1.
+// NOTE(review): the second load reads floats 7..14 of each row; confirm that
+// two floats beyond index 12 are always readable in the input rows.
+#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \
+ do { \
+ for (int row = 0; row < 5; row++) { \
+ load_src_0 = _mm256_loadu_ps(input_ptr); \
+ load_src_1 = _mm256_loadu_ps(input_ptr + 7); \
+ load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \
+ load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \
+ load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \
+ load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \
+ accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \
+ accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \
+ input_ptr += in_stride; \
+ } \
+ } while (0)
+
+// Fetch the constant permutation tables used by the 2x2 convolve kernels:
+// one mask that reorders the accumulated output (*output_mask) and two masks
+// that replicate the weights (weight_mask[0] and weight_mask[1]).
+static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask,
+                                                       __m256i *weight_mask) {
+  // Output reordering mask.
+  const __m256i *const out_tbl = (const __m256i *)shuffle_output_layer_1_and_2;
+  *output_mask = _mm256_load_si256(out_tbl);
+
+  // One weight mask per table row.
+  for (int idx = 0; idx < 2; ++idx) {
+    const __m256i *const w_tbl =
+        (const __m256i *)shuffle_weight_layer_1_and_2[idx];
+    weight_mask[idx] = _mm256_load_si256(w_tbl);
+  }
+}
+
+// Gather the four weights of a 2x2 filter (read from layer_config_weights
+// starting at index off, spaced cstep apart) and spread them across two
+// vectors using the permutation masks weight_mask[0] and weight_mask[1].
+static INLINE void prepare_weights_for_2x2_convolve(
+    const float *layer_config_weights, int off, const int cstep,
+    __m256 *shuffle_weight, __m256i *weight_mask) {
+  // Contiguous copy of the strided 2x2 weights.
+  float taps[4] = { 0 };
+  const float *src = layer_config_weights + off;
+  for (int idx = 0; idx < 4; ++idx, src += cstep) {
+    taps[idx] = *src;
+  }
+
+  // Only the low 128 bits of the widened vector are defined.
+  const __m128 taps_128 = _mm_loadu_ps(taps);
+  const __m256 taps_256 = _mm256_castps128_ps256(taps_128);
+  for (int idx = 0; idx < 2; ++idx) {
+    shuffle_weight[idx] = _mm256_permutevar8x32_ps(taps_256, weight_mask[idx]);
+  }
+}
+
+// Do convolution of one 5x5 block.
+//
+// w:         array of __m256 weight rows; only the low 128 bits (columns
+//            0..3) of w[0..4] are used.
+// accum0:    __m128 running sum; the weighted columns 0..3 of all five rows
+//            are added into it lane by lane.
+// in_stride: distance in floats between consecutive input rows.
+// Besides its arguments the macro uses these names from the expanding scope:
+// input_ptr (left pointing at the fifth row), last_column_sum (receives the
+// weighted fifth column, which does not fit the 4-wide vector) and the
+// scalar array weight[5][8].
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \
+ do { \
+ __m128 load_src[5]; \
+ load_src[0] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[0][4]; \
+ input_ptr += in_stride; \
+ load_src[1] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[1][4]; \
+ input_ptr += in_stride; \
+ load_src[2] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[2][4]; \
+ input_ptr += in_stride; \
+ load_src[3] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[3][4]; \
+ input_ptr += in_stride; \
+ load_src[4] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[4][4]; \
+ \
+ load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \
+ load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \
+ load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \
+ load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \
+ load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \
+ \
+ accum0 = _mm_add_ps(load_src[0], accum0); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \
+ load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \
+ accum0 = _mm_add_ps(accum0, load_src[1]); \
+ } while (0)
+
+// Do convolution on 8 horizontal 2x2 blocks.
+//
+// Reads 16 consecutive floats from the row at input_ptr and 16 from the row
+// at input_ptr + in_stride. weight[0] multiplies the first row and weight[1]
+// the second row. The two rows are added and adjacent lanes are summed
+// pairwise, giving one value per 2x2 block. The 8 values are permuted with
+// shuffle_output_mask and added into *out_accum.
+static INLINE void perform_convolve_for_8h_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + 8);
+ load_src[2] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8);
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[0]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[1]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ // _mm256_hadd_ps works per 128-bit half, so the pair sums of the two
+ // operands come out interleaved; the permute below restores the order.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[2]);
+ load_src[1] = _mm256_add_ps(load_src[1], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks.
+//
+// Reads 8 consecutive floats from each of four rows starting at input_ptr,
+// spaced in_stride apart. Rows 0 and 1 form the upper four 2x2 blocks and
+// rows 2 and 3 the lower four. weight[0] multiplies the first row of each
+// pair and weight[1] the second. The 8 block sums are permuted with
+// shuffle_output_mask and added into *out_accum.
+static INLINE void perform_convolve_for_4hx2v_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2));
+ load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3));
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[1]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[0]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ // _mm256_hadd_ps works per 128-bit half, so the pair sums of the upper and
+ // lower block rows come out interleaved; the permute below separates them.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[1]);
+ load_src[2] = _mm256_add_ps(load_src[2], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when
+// filter_width and filter_height are equal to 5.
+// CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config.
+// Based on the configuration set for each layer, the current encoder
+// always chooses the case of no_maxpool_padding_valid.
+// And also for layer 0 convolution happens at 5x5 level as the
+// filter_width and filter_height are set as 5.
+//
+// input:        one plane per in-channel, in_stride floats per row.
+// output:       one plane per out-channel, out_stride floats per row.
+// start_idx,
+// channel_step: out-channels start_idx, start_idx + channel_step, ... are
+//               processed.
+// cstep:        distance between consecutive filter taps in
+//               layer_config->weights.
+// The filter is applied every 4 pixels horizontally and vertically (asserted
+// against layer_config->skip_width / skip_height).
+// NOTE(review): output[i][...] is assigned, not accumulated, inside the
+// in-channel loop, so with in_channels > 1 only the last in-channel's
+// contribution would remain -- confirm that in_channels is 1 for every
+// caller.
+static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ const int kFilterWidth = 5;
+ const int kFilterHeight = 5;
+ const int kSkipWidth = 4;
+ const int kSkipHeight = 4;
+ assert(layer_config->filter_width == kFilterWidth &&
+ layer_config->filter_height == kFilterHeight);
+ assert(layer_config->skip_width == kSkipWidth &&
+ layer_config->skip_height == kSkipHeight);
+
+ // Load shuffle buffers needed for source.
+ const __m256i block0_1 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]);
+ const __m256i block1_2 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]);
+
+ // Load shuffle buffers needed for weight.
+ const __m256i weight_mask_0 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]);
+ const __m256i weight_mask_1 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]);
+
+ // Width needs to be moved to go to next iteration of processing 3 5x5 blocks.
+ const int kSkipWidthForNextIter = kSkipWidth * 3;
+
+ // Minimum width required to process 3 5x5 blocks at a time.
+ // min width (for processing 3 5x5 block) = 2*skip_width + filter_width
+ // Here, skip_width specifies how much width we should move while processing
+ // next block convolution and filter_width specifies for how many pixels
+ // filter needs to be applied.
+ const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ const float out_ch_bias = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[10];
+
+ // Weights needed are 5x5, for SIMD purpose made this array as 5x8.
+ float weight[5][8] = { { 0 } };
+ int off = k * layer_config->out_channels + i;
+
+ // In layer 0, the convolution process happens at 5x5.
+ // The weights needed for 5x5 block are same across the in-channels,
+ // which is why the load of weights happens once for each in-channel.
+ prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
+ cstep, shuffle_weight, weight_mask_0,
+ weight_mask_1);
+
+ // h is the top input row of the current block row, u the output row.
+ for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
+ h += kSkipHeight, ++u) {
+ const int out_h = u * out_stride;
+ // v is the output column, w the left input column of the next block.
+ int v = 0;
+ int w = 0;
+ int rem_width = in_width;
+ // Processing 3 5x5 blocks at a time, if sufficient width is present.
+ while (rem_width >= kMinWidthFor3_5x5Blocks) {
+ __m256 load_src_0, load_src_1;
+ __m256 accum_src_0 = _mm256_setzero_ps();
+ __m256 accum_src_1 = _mm256_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
+
+ // Accumulate across column.
+ // With a = accum_src_0 and b = accum_src_1 (lanes 0..7):
+ // accum_l = { a0+a1, a2+a3, b0+b1, b2+b3 }
+ // accum_h = { a4+a5, a6+a7, b4+b5, b6+b7 }
+ __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1);
+ __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1);
+ __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1);
+
+ __m128 accum_l = _mm256_castps256_ps128(accum);
+ __m128 accum_h = _mm256_extractf128_ps(accum, 1);
+
+ // Lane 0 of tmp_reg_2 = a0+a1+a4, lane 1 of tmp_reg_3 = a5+a6+a7,
+ // lane 2 of tmp_reg_4 = b4+b5+b6.
+ __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0);
+ __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h);
+ __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h);
+
+ // 1st 5x5 block output.
+ output[i][out_h + v] =
+ out_ch_bias + _mm_cvtss_f32(tmp_reg_2) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1));
+
+ // 2nd 5x5 block output.
+ output[i][out_h + v + 1] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2));
+
+ // 3rd 5x5 block output.
+ output[i][out_h + v + 2] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3));
+
+ v += 3;
+ w += kSkipWidthForNextIter;
+ rem_width -= kSkipWidthForNextIter;
+ }
+
+ // Process remaining blocks as single 5x5 block at a time.
+ while (rem_width >= kFilterWidth) {
+ float last_column_sum = 0;
+ __m128 accum = _mm_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride);
+
+ // Accumulate across column.
+ // After the hadd, lanes 0 and 1 hold the sums of columns 0+1 and
+ // 2+3; the fifth column was summed separately in last_column_sum.
+ accum = _mm_hadd_ps(accum, accum);
+ output[i][out_h + v] = out_ch_bias + last_column_sum +
+ _mm_cvtss_f32(accum) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1));
+
+ v += 1;
+ w += kSkipWidth;
+ rem_width -= kSkipWidth;
+ }
+ }
+ }
+ }
+}
+
+// Layer 1 (AVX2): 2x2 filter applied with a step of 2 over 16 input rows.
+// Every output channel owns eight accumulators, one per output row; each
+// accumulator starts at the channel bias and collects the contribution of all
+// input channels before being written out.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  enum {
+    kInHeight = 16,
+    kFilterHeight = 2,
+    kSkipHeight = 2,
+    // Number of vertical filter positions, i.e. number of accumulators.
+    kNumOutRows = (kInHeight - kFilterHeight) / kSkipHeight + 1
+  };
+
+  __m256i shuffle_output_mask;
+  __m256i weight_mask[2];
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  for (int out_ch = start_idx; out_ch < layer_config->out_channels;
+       out_ch += channel_step) {
+    // Seed every output row with the bias of this output channel.
+    const __m256 bias = _mm256_set1_ps(layer_config->bias[out_ch]);
+    __m256 row_accum[kNumOutRows];
+    for (int row = 0; row < kNumOutRows; ++row) row_accum[row] = bias;
+
+    for (int in_ch = 0; in_ch < layer_config->in_channels; ++in_ch) {
+      // The 2x2 weights depend only on the (in_ch, out_ch) pair, so they are
+      // prepared once and reused for every row of this input plane.
+      __m256 shuffle_weight[2];
+      const int weight_off = in_ch * layer_config->out_channels + out_ch;
+      prepare_weights_for_2x2_convolve(layer_config->weights, weight_off,
+                                       cstep, shuffle_weight, weight_mask);
+
+      for (int row = 0; row < kNumOutRows; ++row) {
+        const float *const src_row =
+            &input[in_ch][row * kSkipHeight * in_stride];
+        perform_convolve_for_8h_2x2_blocks(src_row, in_stride, shuffle_weight,
+                                           &row_accum[row],
+                                           shuffle_output_mask);
+      }
+    }
+
+    // Write the accumulated rows of this output channel.
+    for (int row = 0; row < kNumOutRows; ++row) {
+      _mm256_storeu_ps(&output[out_ch][row * out_stride], row_accum[row]);
+    }
+  }
+}
+
+// Layer 2 (AVX2): 2x2 filter applied with a step of 2 over 8 input rows.
+// One helper call covers two vertical filter positions (4 horizontal x 2
+// vertical blocks), so two accumulators per output channel are sufficient.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+    const float **input, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  enum {
+    kInHeight = 8,
+    kFilterHeight = 2,
+    kSkipHeight = 2,
+    // Input rows consumed by one helper call (two vertical 2x2 blocks).
+    kSkipHeightForNextIter = kSkipHeight * 2,
+    // Number of helper calls per input plane, i.e. number of accumulators.
+    kNumAccum = (kInHeight - kFilterHeight) / kSkipHeightForNextIter + 1
+  };
+
+  __m256i shuffle_output_mask;
+  __m256i weight_mask[2];
+  load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+  for (int out_ch = start_idx; out_ch < layer_config->out_channels;
+       out_ch += channel_step) {
+    // Both accumulators start from the bias of this output channel.
+    const __m256 bias = _mm256_set1_ps(layer_config->bias[out_ch]);
+    __m256 pair_accum[kNumAccum];
+    for (int idx = 0; idx < kNumAccum; ++idx) pair_accum[idx] = bias;
+
+    for (int in_ch = 0; in_ch < layer_config->in_channels; ++in_ch) {
+      // Weights depend only on (in_ch, out_ch); prepare them once per plane.
+      __m256 shuffle_weight[2];
+      const int weight_off = in_ch * layer_config->out_channels + out_ch;
+      prepare_weights_for_2x2_convolve(layer_config->weights, weight_off,
+                                       cstep, shuffle_weight, weight_mask);
+
+      for (int idx = 0; idx < kNumAccum; ++idx) {
+        const float *const src_row =
+            &input[in_ch][idx * kSkipHeightForNextIter * in_stride];
+        perform_convolve_for_4hx2v_2x2_blocks(src_row, in_stride,
+                                              shuffle_weight, &pair_accum[idx],
+                                              shuffle_output_mask);
+      }
+    }
+
+    // Each accumulator is written two output rows apart.
+    for (int idx = 0; idx < kNumAccum; ++idx) {
+      _mm256_storeu_ps(&output[out_ch][idx * out_stride * 2], pair_accum[idx]);
+    }
+  }
+}
+
+// 2x2 / step-2 flavour of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// Picks a kernel from the input dimensions:
+//   16x16 input -> layer 1 AVX2 kernel (8x8 output),
+//    8x8  input -> layer 2 AVX2 kernel (4x4 output),
+//   anything else -> C implementation; per the config set by
+//   av1_intra_mode_cnn_partition_cnn_config these are the 4x4 and 2x2 inputs
+//   of layers 3 and 4, for which SIMD might not be optimal.
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+    int start_idx, const int cstep, const int channel_step) {
+  assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
+  assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
+
+  const int is_layer1_input = (in_width == 16) && (in_height == 16);
+  if (is_layer1_input) {
+    cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+    return;
+  }
+
+  const int is_layer2_input = (in_width == 8) && (in_height == 8);
+  if (is_layer2_input) {
+    cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+        input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+        channel_step);
+    return;
+  }
+
+  // Remaining sizes take the C path.
+  av1_cnn_convolve_no_maxpool_padding_valid_c(
+      input, in_width, in_height, in_stride, layer_config, output, out_stride,
+      start_idx, cstep, channel_step);
+}
+
+// AVX2 entry point matching av1_cnn_convolve_no_maxpool_padding_valid_c().
+// The encoder currently calls av1_cnn_convolve() for 64x64 blocks with the
+// layer configuration from av1_intra_mode_cnn_partition_cnn_config:
+//   Layer  in_size  out_size  filter_wd  filter_ht  skip_wd  skip_ht
+//     0     64x64    16x16        5          5         4        4
+//     1     16x16     8x8         2          2         2        2
+//     2      8x8      4x4         2          2         2        2
+//     3      4x4      2x2         2          2         2        2
+//     4      2x2      1x1         2          2         2        2
+// (filter_wd/ht = filter_width/height, skip_wd/ht = skip_width/height.)
+// A 5x5 filter with step 4 and a 2x2 filter with step 2 have dedicated
+// kernels; every other combination is forwarded to the C implementation.
+void av1_cnn_convolve_no_maxpool_padding_valid_avx2(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step) {
+  const int filter_is_5x5 =
+      layer_config->filter_width == 5 && layer_config->filter_height == 5;
+  const int step_is_4x4 =
+      layer_config->skip_width == 4 && layer_config->skip_height == 4;
+  if (filter_is_5x5 && step_is_4x4) {
+    cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+    return;
+  }
+
+  const int filter_is_2x2 =
+      layer_config->filter_width == 2 && layer_config->filter_height == 2;
+  const int step_is_2x2 =
+      layer_config->skip_width == 2 && layer_config->skip_height == 2;
+  if (filter_is_2x2 && step_is_2x2) {
+    cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+        input, in_width, in_height, in_stride, layer_config, output, out_stride,
+        start_idx, cstep, channel_step);
+    return;
+  }
+
+  av1_cnn_convolve_no_maxpool_padding_valid_c(
+      input, in_width, in_height, in_stride, layer_config, output, out_stride,
+      start_idx, cstep, channel_step);
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..b185548184
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,82 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; TRANSFORM_COLS: one 4-point Walsh-Hadamard butterfly on packed 16-bit words.
+; In:    m0 = a1, m1 = b1, m2 = c1, m3 = d1
+; Out:   m0 = a1', m1 = c1', m2 = d1', m3 = b1' (register names after the SWAPs)
+; Clobb: m4, m5
+; Only the low 64 bits carry the intermediate e1 (movq copies just the low
+; quadword), so only the low four words of each output are meaningful.
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ ; The SWAPs below only rename registers so the results end up in m0..m3.
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+; TRANSPOSE_4X4: transpose a 4x4 block of 16-bit words.
+; In:    low 64 bits of m0..m3 = rows 0..3
+; Out:   m0 = column 0 (low half) | column 1 (high half)
+;        m1 = column 2 (low half) | column 3 (high half)
+; Clobb: m2 (left holding rows 2 and 3 interleaved); m3 is only read.
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+;-----------------------------------------------------------------------
+; fwht4x4: 4x4 forward Walsh-Hadamard transform (SSE2, x86inc ABI glue).
+; In:    input  = pointer to 16-bit samples, four rows of four
+;        output = pointer to sixteen 32-bit results; written with mova, so
+;                 it must be 16-byte aligned
+;        stride = distance between input rows, in 16-bit elements
+; Clobb: r3, m0-m5
+; NOTE(review): stride is presumably declared as a C `int`; the ABI does not
+; define the upper half of the 64-bit register in that case, so it is
+; sign-extended before being used in addressing. Only m0-m5 are touched,
+; hence 6 XMM registers are declared (avoids saving xmm6/xmm7 on Win64).
+;-----------------------------------------------------------------------
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 6, input, output, stride
+  movsxdifnidn    strideq, strided
+  lea             r3q, [inputq + strideq*4]  ; r3 = address of row 2
+  movq            m0, [inputq]               ;a1
+  movq            m1, [inputq + strideq*2]   ;b1
+  movq            m2, [r3q]                  ;c1
+  movq            m3, [r3q + strideq*2]      ;d1
+
+  ; First pass, then transpose: m0 = col0|col1, m1 = col2|col3.
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  ; Split the packed columns back into one column per register (low halves).
+  SWAP            1, 2
+  psrldq          m1, m0, 8
+  psrldq          m3, m2, 8
+  ; Second pass over the other dimension.
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  ; Scale the 16 results by 4.
+  psllw           m0, 2
+  psllw           m1, 2
+
+  ; sign extension: duplicate every word, then arithmetic shift right by 16
+  mova            m2, m0
+  mova            m3, m1
+  punpcklwd       m0, m0
+  punpcklwd       m1, m1
+  punpckhwd       m2, m2
+  punpckhwd       m3, m3
+  psrad           m0, 16
+  psrad           m1, 16
+  psrad           m2, 16
+  psrad           m3, 16
+  mova            [outputq], m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+
+  RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 0000000000..9627f75930
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Fills the padded `levels` buffer from `coeff` (AVX2).
+// The buffer is laid out as `width` lines of `height` levels, each line being
+// `height + TX_PAD_HOR` bytes apart, followed by TX_PAD_BOTTOM all-zero lines.
+// A level is the absolute value of the coefficient after saturating packs
+// (32 -> 16 bit, then signed 16 -> 8 bit). The padding bytes after each line
+// are written as zero.
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ // Zero the bottom padding with whole 32-byte stores. The start address is
+ // moved back so the last store ends exactly at bottom_buf_end; the stores
+ // can therefore reach into the final level lines, which is why this must
+ // run before the level stores below.
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ // Four lines (16 coefficients) per iteration. After the shuffles every
+ // group of 4 levels is followed by 4 zero bytes, so a single 32-byte store
+ // writes four padded lines.
+ // NOTE(review): this relies on stride == 8, i.e. TX_PAD_HOR == 4 - confirm.
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < width);
+ } else if (height == 8) {
+ // Four lines (32 coefficients) per iteration. The permute + shuffle undo
+ // the per-lane interleaving of the pack instructions so `res` holds the 32
+ // levels in coefficient order; each 8-byte line is stored separately and
+ // followed by a 4-byte zero store for its padding.
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + height) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + height + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + height + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + height + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < width);
+ } else if (height == 16) {
+ // Two lines (32 coefficients) per iteration: the low 128 bits hold the
+ // first line, the high 128 bits the second; 4 zero bytes follow each.
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ *(int32_t *)(ls + stride + height) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < width);
+ } else {
+ // One line of 32 levels per iteration, followed by 4 zero bytes.
+ // NOTE(review): presumably only reached with height == 32 - confirm.
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000000..d23a688747
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+// Loads the five neighbour tiles of a 4x4 group of levels into level[0..4].
+// Neighbour positions relative to `src`: +1, +stride, then offsets[0..2].
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  const ptrdiff_t neighbor[5] = { 1, stride, offsets[0], offsets[1],
+                                  offsets[2] };
+  for (int n = 0; n < 5; ++n) {
+    level[n] = load_8bit_4x4_to_1_reg_sse2(src + neighbor[n], stride);
+  }
+}
+
+// Loads the five neighbour tiles of an 8x2 group of levels into level[0..4].
+// Neighbour positions relative to `src`: +1, +stride, then offsets[0..2].
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  const ptrdiff_t neighbor[5] = { 1, stride, offsets[0], offsets[1],
+                                  offsets[2] };
+  for (int n = 0; n < 5; ++n) {
+    level[n] = load_8bit_8x2_to_1_reg_sse2(src + neighbor[n], stride);
+  }
+}
+
+// Loads the five neighbour vectors of 16 consecutive levels into level[0..4]
+// using unaligned loads. Neighbour positions relative to `src`: +1, +stride,
+// then offsets[0..2].
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+                                           const int stride,
+                                           const ptrdiff_t *const offsets,
+                                           __m128i *const level) {
+  const uint8_t *const neighbor[5] = { src + 1, src + stride,
+                                       src + offsets[0], src + offsets[1],
+                                       src + offsets[2] };
+  for (int n = 0; n < 5; ++n) {
+    level[n] = _mm_loadu_si128((const __m128i *)neighbor[n]);
+  }
+}
+
+// Per byte lane: clamps each of the five neighbour levels to 3, sums them,
+// halves the sum with rounding up and caps the result at 4.
+// level[1..4] are overwritten with their clamped values; level[0] is left
+// untouched.
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+  const __m128i level_cap = _mm_set1_epi8(3);
+  const __m128i count_cap = _mm_set1_epi8(4);
+
+  __m128i sum = _mm_min_epu8(level[0], level_cap);
+  for (int n = 1; n < 5; ++n) {
+    level[n] = _mm_min_epu8(level[n], level_cap);
+    sum = _mm_add_epi8(sum, level[n]);
+  }
+
+  // Averaging with zero yields (sum + 1) >> 1.
+  const __m128i half = _mm_avg_epu8(sum, _mm_setzero_si128());
+  return _mm_min_epu8(half, count_cap);
+}
+
+// 2D-class contexts for blocks whose level lines are 4 entries long.
+// Four lines (16 contexts) are produced per iteration. The first group uses a
+// position table chosen by `width`; every later group adds 21. The context of
+// position 0 is forced to 0 at the end.
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *const coeff_contexts) {
+  const int level_stride = 4 + TX_PAD_HOR;
+  const __m128i later_groups = _mm_set1_epi8(21);
+  __m128i group_offset;
+  if (width == 4) {
+    group_offset =
+        _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21);
+  } else {
+    group_offset = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21,
+                                 6, 21, 21, 21);
+  }
+  __m128i neighbors[5];
+  int8_t *dst = coeff_contexts;
+  int lines_left = width;
+
+  assert(!(width % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx = _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors),
+                                     group_offset);
+    _mm_store_si128((__m128i *)dst, ctx);
+    group_offset = later_groups;
+    levels += 4 * level_stride;
+    dst += 16;
+    lines_left -= 4;
+  } while (lines_left);
+
+  coeff_contexts[0] = 0;
+}
+
+// Vertical-class contexts for blocks whose level lines are 4 entries long.
+// The position offset depends only on the index inside a line (0, 5, 10, 10
+// on top of SIG_COEF_CONTEXTS_2D), so one table serves every group of four
+// lines.
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int width,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int level_stride = 4 + TX_PAD_HOR;
+  const int p0 = SIG_COEF_CONTEXTS_2D + 0;
+  const int p1 = SIG_COEF_CONTEXTS_2D + 5;
+  const int p2 = SIG_COEF_CONTEXTS_2D + 10;
+  const __m128i line_offsets =
+      _mm_setr_epi8(p0, p1, p2, p2, p0, p1, p2, p2, p0, p1, p2, p2, p0, p1, p2,
+                    p2);
+  __m128i neighbors[5];
+  int lines_left = width;
+
+  assert(!(width % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx = _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors),
+                                     line_offsets);
+    _mm_store_si128((__m128i *)coeff_contexts, ctx);
+    levels += 4 * level_stride;
+    coeff_contexts += 16;
+    lines_left -= 4;
+  } while (lines_left);
+}
+
+// Horizontal-class contexts for blocks whose level lines are 4 entries long.
+// The position offset depends on the line index: line 0 adds
+// SIG_COEF_CONTEXTS_2D + 0, line 1 adds + 5 and every later line adds + 10.
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int width,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int level_stride = 4 + TX_PAD_HOR;
+  const int p0 = SIG_COEF_CONTEXTS_2D + 0;
+  const int p1 = SIG_COEF_CONTEXTS_2D + 5;
+  const int p2 = SIG_COEF_CONTEXTS_2D + 10;
+  const __m128i later_groups = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  // First group of four lines: lines 0, 1, 2 and 3.
+  __m128i group_offset = _mm_setr_epi8(p0, p0, p0, p0, p1, p1, p1, p1, p2, p2,
+                                       p2, p2, p2, p2, p2, p2);
+  __m128i neighbors[5];
+  int lines_left = width;
+
+  assert(!(width % 4));
+
+  do {
+    load_levels_4x4x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx = _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors),
+                                     group_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, ctx);
+    group_offset = later_groups;
+    levels += 4 * level_stride;
+    coeff_contexts += 16;
+    lines_left -= 4;
+  } while (lines_left);
+}
+
+// 2D-class contexts for blocks whose level lines are 8 entries long.
+// Two lines (16 contexts) are produced per iteration. The tables for the
+// first and second pair of lines depend on how `width` compares to 8; every
+// later pair adds 21. The context of position 0 is forced to 0 at the end.
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int width,
+                                           const ptrdiff_t *const offsets,
+                                           int8_t *coeff_contexts) {
+  const int level_stride = 8 + TX_PAD_HOR;
+  const __m128i later_pairs = _mm_set1_epi8(21);
+  __m128i this_pair;  // Offsets applied to the pair being processed.
+  __m128i next_pair;  // Offsets queued for the following pair.
+  __m128i neighbors[5];
+  int8_t *dst = coeff_contexts;
+  int lines_left = width;
+
+  assert(!(width % 2));
+
+  if (width < 8) {
+    this_pair = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21,
+                              21, 21, 21);
+    next_pair = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21,
+                              21, 21, 21, 21);
+  } else if (width == 8) {
+    this_pair =
+        _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+    next_pair = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21,
+                              21, 21, 21);
+  } else {
+    this_pair = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                              16, 16, 16, 16);
+    next_pair = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21,
+                              21, 21, 21);
+  }
+
+  do {
+    load_levels_8x2x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx =
+        _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors), this_pair);
+    _mm_store_si128((__m128i *)dst, ctx);
+    // Advance the two-entry queue of offset tables.
+    this_pair = next_pair;
+    next_pair = later_pairs;
+    levels += 2 * level_stride;
+    dst += 16;
+    lines_left -= 2;
+  } while (lines_left);
+
+  coeff_contexts[0] = 0;
+}
+
+// Vertical-class contexts for blocks whose level lines are 8 entries long.
+// The position offset depends only on the index inside a line (0, 5, then 10
+// on top of SIG_COEF_CONTEXTS_2D), so one table serves every pair of lines.
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *coeff_contexts) {
+  const int level_stride = 8 + TX_PAD_HOR;
+  const int p0 = SIG_COEF_CONTEXTS_2D + 0;
+  const int p1 = SIG_COEF_CONTEXTS_2D + 5;
+  const int p2 = SIG_COEF_CONTEXTS_2D + 10;
+  const __m128i line_offsets =
+      _mm_setr_epi8(p0, p1, p2, p2, p2, p2, p2, p2, p0, p1, p2, p2, p2, p2, p2,
+                    p2);
+  __m128i neighbors[5];
+  int lines_left = width;
+
+  assert(!(width % 2));
+
+  do {
+    load_levels_8x2x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx = _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors),
+                                     line_offsets);
+    _mm_store_si128((__m128i *)coeff_contexts, ctx);
+    levels += 2 * level_stride;
+    coeff_contexts += 16;
+    lines_left -= 2;
+  } while (lines_left);
+}
+
+// Horizontal-class contexts for blocks whose level lines are 8 entries long.
+// The position offset depends on the line index: line 0 adds
+// SIG_COEF_CONTEXTS_2D + 0, line 1 adds + 5 and every later line adds + 10.
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int width,
+                                            const ptrdiff_t *const offsets,
+                                            int8_t *coeff_contexts) {
+  const int level_stride = 8 + TX_PAD_HOR;
+  const int p0 = SIG_COEF_CONTEXTS_2D + 0;
+  const int p1 = SIG_COEF_CONTEXTS_2D + 5;
+  const __m128i later_pairs = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  // First pair of lines: line 0 followed by line 1.
+  __m128i pair_offset = _mm_setr_epi8(p0, p0, p0, p0, p0, p0, p0, p0, p1, p1,
+                                      p1, p1, p1, p1, p1, p1);
+  __m128i neighbors[5];
+  int lines_left = width;
+
+  assert(!(width % 2));
+
+  do {
+    load_levels_8x2x5_sse2(levels, level_stride, offsets, neighbors);
+    const __m128i ctx = _mm_add_epi8(get_coeff_contexts_kernel_sse2(neighbors),
+                                     pair_offset);
+    _mm_store_si128((__m128i *)coeff_contexts, ctx);
+    pair_offset = later_pairs;
+    levels += 2 * level_stride;
+    coeff_contexts += 16;
+    lines_left -= 2;
+  } while (lines_left);
+}
+
+// 2D-class contexts for blocks whose level lines are a multiple of 16 entries
+// long. Each line is processed in 16-entry segments. The offset table for the
+// first segment of a line depends on the line index (lines 0..3 have their
+// own table, line 4 and later share one); the table for the remaining
+// segments depends on the line index as well (lines 0 and 1 have their own,
+// line 2 and later share one). Which set of tables is used depends on how
+// real_width compares to real_height. The context of position 0 is forced to
+// 0 at the end.
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             int8_t *coeff_contexts) {
+  const int level_stride = height + TX_PAD_HOR;
+  const __m128i all_21 = _mm_set1_epi8(21);
+  __m128i first_seg[5];  // Indexed by min(line, 4).
+  __m128i later_seg[3];  // Indexed by min(line, 2).
+  __m128i neighbors[5];
+  int8_t *dst = coeff_contexts;
+
+  assert(!(height % 16));
+
+  later_seg[2] = all_21;
+  if (real_width == real_height) {
+    first_seg[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[4] = all_21;
+    later_seg[0] = all_21;
+    later_seg[1] = all_21;
+  } else if (real_width < real_height) {
+    const __m128i from_line2 = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21,
+                                             21, 21, 21, 21, 21, 21, 21, 21);
+    first_seg[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[2] = from_line2;
+    first_seg[3] = from_line2;
+    first_seg[4] = from_line2;
+    later_seg[0] = all_21;
+    later_seg[1] = all_21;
+  } else {  // real_width > real_height
+    const __m128i all_16 = _mm_set1_epi8(16);
+    first_seg[0] = _mm_setr_epi8(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+                                 16, 16, 16, 16, 16);
+    first_seg[1] = first_seg[0];
+    first_seg[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+                                 21, 21, 21, 21);
+    first_seg[4] = all_21;
+    later_seg[0] = all_16;
+    later_seg[1] = all_16;
+  }
+
+  int line = 0;
+  do {
+    __m128i seg_offset = first_seg[line < 4 ? line : 4];
+    const __m128i rest_offset = later_seg[line < 2 ? line : 2];
+    int entries_left = height;
+
+    do {
+      load_levels_16x1x5_sse2(levels, level_stride, offsets, neighbors);
+      const __m128i ctx = _mm_add_epi8(
+          get_coeff_contexts_kernel_sse2(neighbors), seg_offset);
+      _mm_store_si128((__m128i *)dst, ctx);
+      levels += 16;
+      dst += 16;
+      entries_left -= 16;
+      seg_offset = rest_offset;
+    } while (entries_left);
+
+    // Skip the padding that follows every line of levels.
+    levels += TX_PAD_HOR;
+    ++line;
+  } while (line != width);
+
+  coeff_contexts[0] = 0;
+}
+
+// Vertical-class contexts for blocks whose level lines are a multiple of 16
+// entries long. Within each line the first segment adds 0, 5, then 10 (on top
+// of SIG_COEF_CONTEXTS_2D) by position; all following segments add 10.
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int level_stride = height + TX_PAD_HOR;
+  const int p0 = SIG_COEF_CONTEXTS_2D + 0;
+  const int p1 = SIG_COEF_CONTEXTS_2D + 5;
+  const int p2 = SIG_COEF_CONTEXTS_2D + 10;
+  const __m128i line_start = _mm_setr_epi8(p0, p1, p2, p2, p2, p2, p2, p2, p2,
+                                           p2, p2, p2, p2, p2, p2, p2);
+  const __m128i line_rest = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+  __m128i neighbors[5];
+  int lines_left = width;
+
+  assert(!(height % 16));
+
+  do {
+    __m128i seg_offset = line_start;
+    int entries_left = height;
+
+    do {
+      load_levels_16x1x5_sse2(levels, level_stride, offsets, neighbors);
+      const __m128i ctx = _mm_add_epi8(
+          get_coeff_contexts_kernel_sse2(neighbors), seg_offset);
+      _mm_store_si128((__m128i *)coeff_contexts, ctx);
+      seg_offset = line_rest;
+      levels += 16;
+      coeff_contexts += 16;
+      entries_left -= 16;
+    } while (entries_left);
+
+    // Skip the padding that follows every line of levels.
+    levels += TX_PAD_HOR;
+  } while (--lines_left);
+}
+
+// Horizontal-class contexts for blocks whose level lines are a multiple of 16
+// entries long. A whole line shares one offset: line 0 adds
+// SIG_COEF_CONTEXTS_2D + 0, line 1 adds + 5 and every later line adds + 10.
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              int8_t *coeff_contexts) {
+  const int level_stride = height + TX_PAD_HOR;
+  __m128i line_offset[3];  // Indexed by min(line, 2).
+  __m128i neighbors[5];
+
+  assert(!(height % 16));
+
+  line_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+  line_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+  line_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+  int line = 0;
+  do {
+    const __m128i offset_now = line_offset[line < 2 ? line : 2];
+    int entries_left = height;
+
+    do {
+      load_levels_16x1x5_sse2(levels, level_stride, offsets, neighbors);
+      const __m128i ctx = _mm_add_epi8(
+          get_coeff_contexts_kernel_sse2(neighbors), offset_now);
+      _mm_store_si128((__m128i *)coeff_contexts, ctx);
+      levels += 16;
+      coeff_contexts += 16;
+      entries_left -= 16;
+    } while (entries_left);
+
+    // Skip the padding that follows every line of levels.
+    levels += TX_PAD_HOR;
+    ++line;
+  } while (line != width);
+}
+
+// Computes the coefficient contexts of a transform block (SSE2).
+// Note: levels[] must be in the range [0, 127], inclusive.
+// coeff_contexts must be 16-byte aligned. With a single coded coefficient
+// (eob == 1) only coeff_contexts[0] is written. Otherwise the whole context
+// map is produced by the helper matching tx_class and the level line length,
+// and the entry of the last coded coefficient (scan[eob - 1]) is then
+// replaced by 1, 2 or 3 depending on how far into the block it lies.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (last_idx == 0) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = height + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    switch (height) {
+      case 4:
+        get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts);
+        break;
+      case 8:
+        get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts);
+        break;
+      default:
+        // Line lengths of 16 and above share the same routine.
+        get_16n_coeff_contexts_2d(levels, real_width, real_height, width,
+                                  height, offsets, coeff_contexts);
+        break;
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+
+    switch (height) {
+      case 4:
+        get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts);
+        break;
+      case 8:
+        get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts);
+        break;
+      default:
+        get_16n_coeff_contexts_hor(levels, width, height, offsets,
+                                   coeff_contexts);
+        break;
+    }
+  } else {  // TX_CLASS_VERT
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+
+    switch (height) {
+      case 4:
+        get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts);
+        break;
+      case 8:
+        get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts);
+        break;
+      default:
+        get_16n_coeff_contexts_ver(levels, width, height, offsets,
+                                   coeff_contexts);
+        break;
+    }
+  }
+
+  // Context of the last coded coefficient: 1 within the first eighth of
+  // (width << bhl), 2 within the first quarter, 3 beyond that.
+  const int bhl = get_txb_bhl(tx_size);
+  const int pos = scan[last_idx];
+  const int scaled_size = width << bhl;
+  int8_t last_ctx = 3;
+  if (last_idx <= scaled_size / 8) {
+    last_ctx = 1;
+  } else if (last_idx <= scaled_size / 4) {
+    last_ctx = 2;
+  }
+  coeff_contexts[pos] = last_ctx;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 0000000000..72bd8e3411
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Fills the padded `levels` buffer from the transform coefficients.
+//
+// The input is consumed as `width` consecutive lines of `height` coefficients.
+// Each coefficient is narrowed 32 -> 16 bits with signed saturation, replaced
+// by its 16-bit absolute value, then narrowed again to 8 bits with signed
+// saturation. Output lines are `height + TX_PAD_HOR` bytes apart; the padding
+// bytes after each line and the TX_PAD_BOTTOM lines after the last one are
+// zeroed.
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const __m128i zero = _mm_setzero_si128();
+  const int stride = height + TX_PAD_HOR;
+
+  // Clear the bottom padding. Whole 16-byte vectors are written, so the last
+  // store may extend beyond pad_end.
+  const int32_t pad_bytes = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *pad = levels + stride * width;
+  uint8_t *const pad_end = pad + pad_bytes;
+  do {
+    _mm_storeu_si128((__m128i *)pad, zero);
+    pad += 16;
+  } while (pad < pad_end);
+
+  const tran_low_t *src = coeff;
+  uint8_t *dst = levels;
+  int line = 0;
+
+  if (height == 4) {
+    // Two 4-coefficient lines per iteration. Interleaving the packed bytes
+    // with zero dwords yields "line, 4 zero bytes, line, 4 zero bytes".
+    do {
+      const __m128i c16 =
+          _mm_packs_epi32(xx_loadu_128(src), xx_loadu_128(src + 4));
+      const __m128i abs8 = _mm_packs_epi16(_mm_abs_epi16(c16), zero);
+      xx_storeu_128(dst, _mm_unpacklo_epi32(abs8, zero));
+      dst += 2 * stride;
+      src += 2 * height;
+      line += 2;
+    } while (line < width);
+  } else if (height == 8) {
+    // One 8-coefficient line per iteration; the upper 8 stored bytes are zero
+    // and cover the horizontal padding.
+    do {
+      const __m128i c16 =
+          _mm_packs_epi32(xx_loadu_128(src), xx_loadu_128(src + 4));
+      const __m128i abs8 = _mm_packs_epi16(_mm_abs_epi16(c16), zero);
+      xx_storeu_128(dst, abs8);
+      dst += stride;
+      src += height;
+      line += 1;
+    } while (line < width);
+  } else {
+    // Remaining heights are handled 16 coefficients at a time.
+    do {
+      int col = 0;
+      do {
+        const __m128i lo16 =
+            _mm_packs_epi32(xx_loadu_128(src), xx_loadu_128(src + 4));
+        const __m128i hi16 =
+            _mm_packs_epi32(xx_loadu_128(src + 8), xx_loadu_128(src + 12));
+        const __m128i abs8 =
+            _mm_packs_epi16(_mm_abs_epi16(lo16), _mm_abs_epi16(hi16));
+        xx_storeu_128(dst + col, abs8);
+        src += 16;
+        col += 16;
+      } while (col < height);
+      // Zero the four padding bytes that follow the line.
+      *(int32_t *)(dst + height) = 0;
+      dst += stride;
+      line += 1;
+    } while (line < width);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..57725d1795
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Loads 16 coefficients starting at coeff[offset] into *c as sixteen 16-bit
+// lanes. When tran_low_t is 4 bytes wide the values are narrowed with signed
+// saturation; otherwise 32 bytes are loaded unchanged.
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const __m256i *const src = (const __m256i *)(coeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    *c = _mm256_loadu_si256(src);
+    return;
+  }
+
+  const __m256i first8 = _mm256_loadu_si256(src);
+  const __m256i next8 = _mm256_loadu_si256(src + 1);
+  // The pack works per 128-bit lane, leaving the 64-bit groups in the order
+  // 0, 2, 1, 3; the 0xD8 permute puts them back in memory order.
+  *c = _mm256_permute4x64_epi64(_mm256_packs_epi32(first8, next8), 0xD8);
+}
+
+static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+ // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7
+ const __m256i error_hi = _mm256_hadd_epi32(error, error);
+ // r0+r1 | r2+r3 | r4+r5 | r6+r7
+ *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256());
+}
+
+static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow.
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0);
+}
+
+static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256,
+ intptr_t num_coeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int i = 0; i < num_coeff; i += 64) {
+ // Load 64 elements for coeff and dqcoeff.
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+ const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32));
+ const __m256i _dqcoeff_2 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 32));
+ const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48));
+ const __m256i _dqcoeff_3 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 48));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+ const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2);
+ const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2);
+ const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3);
+ // r00 r01 r02 r03 r04 r05 r06 r07
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+ // r10 r11 r12 r13 r14 r15 r16 r17
+ const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow. r00 r01 r04 r05
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ // r02 r03 r06 r07
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ // r10 r11 r14 r15
+ const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero);
+ // r12 r13 r16 r17
+ const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero);
+
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo);
+ const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0);
+ *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp);
+ coeff += 64;
+ dqcoeff += 64;
+ }
+}
+
+// Returns the sum of squared differences between `coeff` and `dqcoeff`, both
+// arrays of num_coeff int16 values. num_coeff must be a multiple of 16.
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t num_coeff) {
+  assert(num_coeff % 16 == 0);
+  __m256i sse_256 = _mm256_setzero_si256();
+
+  // Dedicated kernels for the two smallest sizes; everything else goes
+  // through the 64-at-a-time kernel.
+  switch (num_coeff) {
+    case 16:
+      av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256);
+      break;
+    case 32:
+      av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256);
+      break;
+    default:
+      av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff);
+      break;
+  }
+
+  // Horizontal reduction of the four 64-bit partial sums: first fold the
+  // upper half of each 128-bit lane onto its lower half, then add the lanes.
+  const __m256i folded =
+      _mm256_add_epi64(sse_256, _mm256_srli_si256(sse_256, 8));
+  const __m128i total = _mm_add_epi64(_mm256_castsi256_si128(folded),
+                                      _mm256_extractf128_si256(folded, 1));
+
+  int64_t sse;
+  _mm_storel_epi64((__m128i *)&sse, total);
+  return sse;
+}
+
+// Returns the sum of squared differences between `dqcoeff` and `coeff` over
+// block_size coefficients (processed 16 at a time) and stores the sum of the
+// squared `coeff` values in *ssz. Coefficients are handled as 16-bit values
+// as delivered by read_coeff().
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Four 64-bit partial sums each: squared error and squared coefficients.
+  __m256i err_acc = zero;
+  __m256i sq_acc = zero;
+
+  for (int idx = 0; idx < block_size; idx += 16) {
+    __m256i c16, dq16;
+    read_coeff(coeff, idx, &c16);
+    read_coeff(dqcoeff, idx, &dq16);
+
+    // Sums of two adjacent squares, as 32-bit values.
+    const __m256i delta = _mm256_sub_epi16(dq16, c16);
+    const __m256i err32 = _mm256_madd_epi16(delta, delta);
+    const __m256i sq32 = _mm256_madd_epi16(c16, c16);
+
+    // Zero-extend every 32-bit sum to 64 bits and accumulate.
+    err_acc = _mm256_add_epi64(err_acc, _mm256_unpacklo_epi32(err32, zero));
+    sq_acc = _mm256_add_epi64(sq_acc, _mm256_unpacklo_epi32(sq32, zero));
+    err_acc = _mm256_add_epi64(err_acc, _mm256_unpackhi_epi32(err32, zero));
+    sq_acc = _mm256_add_epi64(sq_acc, _mm256_unpackhi_epi32(sq32, zero));
+  }
+
+  // Fold the upper 64 bits of each 128-bit lane onto the lower 64 bits.
+  err_acc = _mm256_add_epi64(err_acc, _mm256_srli_si256(err_acc, 8));
+  sq_acc = _mm256_add_epi64(sq_acc, _mm256_srli_si256(sq_acc, 8));
+
+  // Add the two 128-bit lanes together.
+  const __m128i err128 = _mm_add_epi64(_mm256_castsi256_si128(err_acc),
+                                       _mm256_extractf128_si256(err_acc, 1));
+  const __m128i sq128 = _mm_add_epi64(_mm256_castsi256_si128(sq_acc),
+                                      _mm256_extractf128_si256(sq_acc, 1));
+
+  // Store the results.
+  int64_t sse;
+  _mm_storel_epi64((__m128i *)(&sse), err128);
+  _mm_storel_epi64((__m128i *)(ssz), sq128);
+  _mm256_zeroupper();
+  return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000000..61f65c623f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Adds the upper 64-bit element of `reg` to its lower one. The sum of both
+// elements ends up in the low 64 bits of the result; the high 64 bits keep
+// the original upper element.
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+  return _mm_add_epi64(reg, _mm_srli_si128(reg, 8));
+}
+
+// Returns the sum of squared differences between `coeff` and `dqcoeff`, both
+// arrays of block_size int16 values. block_size must be a positive multiple
+// of 16.
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  assert(block_size % 16 == 0);
+  assert(block_size >= 16);
+
+  const __m128i zero = _mm_setzero_si128();
+  // Separate 64-bit accumulators for the first and second 8 coefficients of
+  // every group of 16.
+  __m128i sum_a = zero;
+  __m128i sum_b = zero;
+
+  for (int pos = 0; pos < block_size; pos += 16) {
+    const int16_t *const src = coeff + pos;
+    const int16_t *const rec = dqcoeff + pos;
+
+    // Two vectors of 8 coefficients from each input.
+    const __m128i src_a = _mm_loadu_si128((const __m128i *)src);
+    const __m128i src_b = _mm_loadu_si128((const __m128i *)(src + 8));
+    const __m128i rec_a = _mm_loadu_si128((const __m128i *)rec);
+    const __m128i rec_b = _mm_loadu_si128((const __m128i *)(rec + 8));
+
+    // Differences, then sums of two adjacent squares (32 bits each).
+    const __m128i delta_a = _mm_sub_epi16(rec_a, src_a);
+    const __m128i delta_b = _mm_sub_epi16(rec_b, src_b);
+    const __m128i sq_a = _mm_madd_epi16(delta_a, delta_a);
+    const __m128i sq_b = _mm_madd_epi16(delta_b, delta_b);
+
+    // Zero-extend to 64 bits and accumulate.
+    sum_a = _mm_add_epi64(sum_a, _mm_unpacklo_epi32(sq_a, zero));
+    sum_b = _mm_add_epi64(sum_b, _mm_unpacklo_epi32(sq_b, zero));
+    sum_a = _mm_add_epi64(sum_a, _mm_unpackhi_epi32(sq_a, zero));
+    sum_b = _mm_add_epi64(sum_b, _mm_unpackhi_epi32(sq_b, zero));
+  }
+
+  // Merge both accumulators and fold the result into the low 64 bits.
+  const __m128i total = reduce_sum_epi64(_mm_add_epi64(sum_a, sum_b));
+
+  // Store the results.
+#if AOM_ARCH_X86_64
+  return _mm_cvtsi128_si64(total);
+#else
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, total);
+  return result;
+#endif  // AOM_ARCH_X86_64
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..6407c106ab
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+; Advance pointer %1 by %2 coefficients; each coefficient occupies 4 bytes.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+ lea %1, [%1 + %2 * 4] ; %1 += %2 * 4 bytes
+%endmacro
+
+; Load the 8 coefficients starting at element offset %3 from pointer %2 into m%1.
+; %3 is the offset in elements, not bytes.
+; Coefficients are read as 32-bit values: four are loaded directly and the
+; next four come from the following 16 bytes. packssdw then narrows all eight
+; to 16 bits with signed saturation, so m%1 ends up holding 8 words.
+%macro LOAD_TRAN_LOW 3
+ mova m%1, [%2 + (%3) * 4] ; first 4 dwords
+ packssdw m%1, [%2 + (%3) * 4 + 16] ; pack with the next 4 dwords -> 8 words
+%endmacro
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(tran_low_t *coeff, tran_low_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz) -- coefficients are read as 32-bit values and packed to 16 bits
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz ; returns sum((dqc - uqc)^2), stores sum(uqc^2) to *ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+.loop: ; one iteration handles 16 coefficients
+ LOAD_TRAN_LOW 2, uqcq, 0 ; m2 = uqc[0..7]
+ LOAD_TRAN_LOW 0, dqcq, 0 ; m0 = dqc[0..7]
+ LOAD_TRAN_LOW 3, uqcq, 8 ; m3 = uqc[8..15]
+ LOAD_TRAN_LOW 1, dqcq, 8 ; m1 = dqc[8..15]
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16 ; sets the flags tested by 'jg .loop' below
+ psubw m0, m2 ; m0 = dqc - uqc (first 8)
+ psubw m1, m3 ; m1 = dqc - uqc (last 8)
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0 ; squared errors, summed in pairs
+ pmaddwd m1, m1
+ pmaddwd m2, m2 ; squared coefficients, summed in pairs
+ pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5 ; low two error dwords zero-extended to qwords
+ punpckhdq m0, m5 ; high two error dwords zero-extended to qwords
+ paddq m4, m7
+ punpckldq m7, m2, m5 ; same widening for the squared coefficients
+ paddq m4, m0
+ punpckhdq m2, m5
+ paddq m6, m7
+ paddq m6, m2
+ jg .loop ; loop while size > 0; no instruction since 'sub sizeq, 16' writes the flags
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4 ; m5 low qword = m4 high qword (zero register no longer needed)
+ movhlps m7, m6
+ paddq m4, m5 ; low qword of m4 = total sse
+ paddq m6, m7 ; low qword of m6 = total ssz
+%if AOM_ARCH_X86_64
+ movq rax, m4 ; return value
+ movq [sszq], m6 ; NOTE(review): ssz is not one of the 3 loaded args; presumably sszq is the register carrying argument 4 -- confirm against x86inc
+%else
+ mov eax, sszm ; eax = ssz pointer
+ pshufd m5, m4, 0x1 ; dword 0 of m5 = upper 32 bits of the sse total
+ movq [eax], m6 ; *ssz = ssz total
+ movd eax, m4 ; 64-bit result is returned in edx:eax
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000000..ebe75310e9
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) /* fold buf into crc, sizeof(type) bytes per step */ \
+ while ((len) >= sizeof(type)) { /* stops once fewer than sizeof(type) bytes remain */ \
+ (crc) = op((crc), *(type *)(buf)); /* reads buf through a 'type' pointer */ \
+ (len) -= sizeof(type); /* updates the caller's len ... */ \
+ buf += sizeof(type); /* ... and the caller's buf in place */ \
+ }
+
+/**
+ * Computes the 32-bit CRC of `len` bytes starting at `p` with the SSE4.2
+ * CRC32 instructions (polynomial 0x11EDC6F41).
+ * The CRC starts from 0xFFFFFFFF and the final value is bit-inverted.
+ * `crc_calculator` is unused.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Consume single bytes until buf sits on an ALIGN_SIZE boundary (or the
+  // input runs out).
+  while (len > 0 && ((intptr_t)buf & ALIGN_MASK) != 0) {
+    crc = _mm_crc32_u8(crc, *buf);
+    ++buf;
+    --len;
+  }
+
+  // Work through the rest with the widest step first; every CALC_CRC leaves
+  // fewer than sizeof(type) bytes for the next, narrower one.
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len)
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len)
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len)
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len)
+  return ~crc;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000000..340307cb3e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i;
+ int64_t temp1[8];
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 16) {
+ __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+ __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+ __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+ __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+ __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+ __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+ __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+ __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+ __m256i res = _mm256_mul_epi32(diff1, diff1);
+ __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+ __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+ __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+ __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+ __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+ res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+ res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+ res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+ res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+ __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ _mm256_storeu_si256((__m256i *)temp1, res_diff);
+ _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+ error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+ sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..b0b2757568
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+ // Check if any values require more than 15 bit
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32((int)0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+ if (!test) {
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
new file mode 100644
index 0000000000..9cdf21fc7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -0,0 +1,3132 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+// Loads an 8x8 block of int16_t samples into eight vectors of 32-bit lanes.
+//
+// input  : first sample of the block; consecutive rows are `stride` elements
+//          apart. Rows are fetched with the aligned _mm_load_si128, so each
+//          row start has to satisfy that intrinsic's 16-byte alignment.
+// out    : receives 8 vectors, out[r] holding row r after any flips.
+// flipud : non-zero stores source row r in slot 7 - r (vertical mirror).
+// fliplr : non-zero passes each row through mm_reverse_epi16 before widening
+//          (presumably a reversal of the eight 16-bit samples; the helper is
+//          defined elsewhere).
+// shift  : left-shift applied to every widened 32-bit lane.
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                        int stride, int flipud, int fliplr,
+                                        int shift) {
+  __m128i rows[8];
+
+  for (int r = 0; r < 8; ++r) {
+    const int slot = flipud ? 7 - r : r;
+    rows[slot] = _mm_load_si128((const __m128i *)(input + r * stride));
+  }
+
+  for (int r = 0; r < 8; ++r) {
+    const __m128i row16 = fliplr ? mm_reverse_epi16(rows[r]) : rows[r];
+    // Sign-extend 8 x int16 -> 8 x int32, then apply the pre-scale.
+    out[r] = _mm256_slli_epi32(_mm256_cvtepi16_epi32(row16), shift);
+  }
+}
+// Rounding arithmetic right shift applied in place to the eight vectors
+// in[0..7]: each 32-bit lane becomes (lane + (1 << (shift - 1))) >> shift.
+// NOTE(review): the rounding constant is built from `shift - 1`, so callers
+// are expected to pass shift >= 1 - confirm against call sites.
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+  const __m256i half = _mm256_set1_epi32(1 << (shift - 1));
+
+  for (int i = 0; i < 8; ++i) {
+    in[i] = _mm256_srai_epi32(_mm256_add_epi32(in[i], half), shift);
+  }
+}
+// Loads an 8-wide, 16-tall block as two stacked 8x8 loads into out[0..15].
+// With flipud set, the lower 8x8 half is loaded first (into out[0..7]) and the
+// upper half second (into out[8..15]); each half is additionally flipped by
+// load_buffer_8x8_avx2, which yields a full 16-row vertical mirror.
+// stride, fliplr and shift are forwarded unchanged to load_buffer_8x8_avx2.
+static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+                                         int stride, int flipud, int fliplr,
+                                         int shift) {
+  const int16_t *const upper = input;
+  const int16_t *const lower = input + 8 * stride;
+  const int16_t *const first = flipud ? lower : upper;
+  const int16_t *const second = flipud ? upper : lower;
+
+  load_buffer_8x8_avx2(first, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8_avx2(second, out + 8, stride, flipud, fliplr, shift);
+}
+// Loads `height` rows of 16 int16_t samples and widens them to 32-bit lanes.
+//
+// input     : first sample; rows are `stride` elements apart and are read with
+//             the unaligned _mm256_loadu_si256.
+// out       : output row i occupies out[i * outstride] (first 8 samples) and
+//             out[i * outstride + 1] (last 8 samples).
+// flipud    : non-zero makes output row i come from source row
+//             height - 1 - i.
+// fliplr    : non-zero swaps the two 8-sample halves and passes each through
+//             mm_reverse_epi16 (presumably a 16-bit lane reversal; defined
+//             elsewhere), mirroring the row horizontally.
+//
+// Rows are converted as they are loaded. The previous implementation staged
+// them in a fixed 64-entry stack array indexed by `height` with no bound
+// check; streaming removes both the scratch buffer and that hidden limit.
+// Unlike load_buffer_8x8_avx2, no pre-scaling shift is applied here.
+static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+                                         int stride, int height, int outstride,
+                                         int flipud, int fliplr) {
+  for (int i = 0; i < height; i++) {
+    const int src_row = flipud ? (height - 1) - i : i;
+    const __m256i row16 =
+        _mm256_loadu_si256((const __m256i *)(input + src_row * stride));
+    const __m128i lo = _mm256_castsi256_si128(row16);
+    const __m128i hi = _mm256_extractf128_si256(row16, 1);
+
+    if (!fliplr) {
+      out[i * outstride] = _mm256_cvtepi16_epi32(lo);
+      out[i * outstride + 1] = _mm256_cvtepi16_epi32(hi);
+    } else {
+      out[i * outstride + 1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(lo));
+      out[i * outstride + 0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(hi));
+    }
+  }
+}
+
+// Transposes an 8x8 matrix of 32-bit values held one row per __m256i.
+// Input row r is read from in[r * instride]; output row r is written to
+// out[r * outstride]. All eight input rows are read before the first output
+// row is written.
+static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+                                        const int instride,
+                                        const int outstride) {
+  // t[2p] / t[2p+1]: low / high 32-bit interleave of rows 2p and 2p+1.
+  __m256i t[8];
+  for (int p = 0; p < 4; ++p) {
+    const __m256i even_row = in[(2 * p) * instride];
+    const __m256i odd_row = in[(2 * p + 1) * instride];
+    t[2 * p] = _mm256_unpacklo_epi32(even_row, odd_row);
+    t[2 * p + 1] = _mm256_unpackhi_epi32(even_row, odd_row);
+  }
+
+  // h = 0 consumes the low interleaves (t[0], t[2], t[4], t[6]) and produces
+  // output rows 0, 4, 1, 5; h = 1 consumes the high interleaves and produces
+  // rows 2, 6, 3, 7.
+  for (int h = 0; h < 2; ++h) {
+    const __m256i lo_top = _mm256_unpacklo_epi64(t[h], t[h + 2]);
+    const __m256i lo_bot = _mm256_unpacklo_epi64(t[h + 4], t[h + 6]);
+    out[(2 * h) * outstride] = _mm256_permute2f128_si256(lo_top, lo_bot, 0x20);
+    out[(2 * h + 4) * outstride] =
+        _mm256_permute2f128_si256(lo_top, lo_bot, 0x31);
+
+    const __m256i hi_top = _mm256_unpackhi_epi64(t[h], t[h + 2]);
+    const __m256i hi_bot = _mm256_unpackhi_epi64(t[h + 4], t[h + 6]);
+    out[(2 * h + 1) * outstride] =
+        _mm256_permute2f128_si256(hi_top, hi_bot, 0x20);
+    out[(2 * h + 5) * outstride] =
+        _mm256_permute2f128_si256(hi_top, hi_bot, 0x31);
+  }
+}
+// Applies a signed shift in place to `size` vectors spaced `stride` apart
+// (in[0], in[stride], in[2 * stride], ...):
+//   bit > 0 : every 32-bit lane is shifted left by `bit`;
+//   bit < 0 : every lane gets a rounding arithmetic right shift by -bit,
+//             i.e. (lane + (1 << (-bit - 1))) >> -bit;
+//   bit == 0: the data is left untouched.
+static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+                                           int stride) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
+    }
+    return;
+  }
+
+  const int rshift = -bit;
+  const __m256i half = _mm256_set1_epi32(1 << (rshift - 1));
+  for (int i = 0; i < size; ++i) {
+    const __m256i biased = _mm256_add_epi32(in[stride * i], half);
+    in[stride * i] = _mm256_srai_epi32(biased, rshift);
+  }
+}
+// Writes in[0 .. out_size - 1] to memory, vector i going to out + i * stride
+// (stride counted in int32_t elements). The aligned _mm256_store_si256 is
+// used, so every destination address has to meet that intrinsic's 32-byte
+// alignment requirement.
+static INLINE void store_buffer_avx2(const __m256i *const in, int32_t *out,
+                                     const int stride, const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    _mm256_store_si256((__m256i *)(out + i * stride), in[i]);
+  }
+}
+// Transposes a 16x16 matrix of 32-bit values stored two vectors per row
+// (row r is in[2 * r] and in[2 * r + 1]; same layout for out). The matrix is
+// handled as four 8x8 tiles: the tile at tile-row tr, tile-column tc of the
+// input is transposed into tile-row tc, tile-column tr of the output.
+static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+                                                 __m256i *out) {
+  for (int tr = 0; tr < 2; ++tr) {
+    for (int tc = 0; tc < 2; ++tc) {
+      // 16 vectors == 8 rows of 2 vectors; stride 2 walks down one column of
+      // vectors within a tile.
+      fwd_txfm_transpose_8x8_avx2(&in[16 * tr + tc], &out[16 * tc + tr], 2, 2);
+    }
+  }
+}
+
+// Half butterfly on 32-bit lanes:
+//   result = ((*w0) * (*n0) + (*w1) * (*n1) + *rounding) >> bit
+// Products keep only their low 32 bits (_mm256_mullo_epi32) and the final
+// shift is arithmetic. The caller supplies the rounding vector.
+static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
+                                        const __m256i *w1, const __m256i *n1,
+                                        const __m256i *rounding, int bit) {
+  const __m256i prod0 = _mm256_mullo_epi32(*w0, *n0);
+  const __m256i prod1 = _mm256_mullo_epi32(*w1, *n1);
+  const __m256i biased =
+      _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), *rounding);
+  return _mm256_srai_epi32(biased, bit);
+}
+// Full butterfly on 32-bit lanes with scalar integer weights w0 / w1, which
+// are broadcast with _mm256_set1_epi32:
+//   out0 = in0 * w0 + in1 * w1
+//   out1 = in0 * w1 - in1 * w0
+// Each result is then passed to round_shift_32_8xn_avx2 with the negated
+// `bit`, i.e. its rounding right-shift path when bit > 0.
+// out0 is assigned before in0 / in1 are read again for out1, so passing the
+// same lvalue as out0 and as one of the inputs changes the out1 result.
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m256i ww0 = _mm256_set1_epi32(w0); \
+ const __m256i ww1 = _mm256_set1_epi32(w1); \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \
+ } while (0)
+
+// Variant of the full butterfly that takes the weights ww0 / ww1 and the
+// rounding term r as ready-made __m256i vectors:
+//   out0 = (in0 * ww0 + in1 * ww1 + r) >> bit
+//   out1 = (in0 * ww1 - in1 * ww0 + r) >> bit
+// The shift is arithmetic (_mm256_srai_epi32). As out0 is assigned before
+// in0 / in1 are read again for out1, out0 should not name the same lvalue as
+// either input.
+#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm256_add_epi32(out0, r); \
+ out0 = _mm256_srai_epi32(out0, bit); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm256_add_epi32(out1, r); \
+ out1 = _mm256_srai_epi32(out1, bit); \
+ } while (0)
+
+// Pointer type matching the 1-D transform kernels defined below (fdct8_avx2,
+// fadst8_avx2, idtx8_avx2, ...): input vectors, output vectors, cosine-table
+// bit precision and two stride arguments. The kernels name the fourth
+// parameter `col_num` rather than `instride`.
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
+ const int8_t cos_bit, int instride,
+ int outstride);
+// 8-point forward DCT over 32-bit lanes, eight independent transforms per
+// vector column.
+//
+// in        : input element k of column `col` is in[k * col_num + col].
+// out       : output element k of column `col` is out[k * outstride + col].
+// bit       : selects the cosine table (cospi_arr(bit)) and is the rounding
+//             right-shift applied after every multiply stage.
+// col_num   : number of vector columns to process (also the input stride).
+//
+// All multiplies keep the low 32 bits, so "a * c - b * d" and
+// "a * c + b * (-d)" are interchangeable; the rotations below are written in
+// the add form expected by av1_half_btf_avx2.
+static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                       const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+
+  for (int col = 0; col < col_num; ++col) {
+    const __m256i *const src = in + col;
+    __m256i *const dst = out + col;
+
+    // stage 1: mirror sums and differences (k paired with 7 - k)
+    const __m256i sum07 = _mm256_add_epi32(src[0 * col_num], src[7 * col_num]);
+    const __m256i dif07 = _mm256_sub_epi32(src[0 * col_num], src[7 * col_num]);
+    const __m256i sum16 = _mm256_add_epi32(src[1 * col_num], src[6 * col_num]);
+    const __m256i dif16 = _mm256_sub_epi32(src[1 * col_num], src[6 * col_num]);
+    const __m256i sum25 = _mm256_add_epi32(src[2 * col_num], src[5 * col_num]);
+    const __m256i dif25 = _mm256_sub_epi32(src[2 * col_num], src[5 * col_num]);
+    const __m256i sum34 = _mm256_add_epi32(src[3 * col_num], src[4 * col_num]);
+    const __m256i dif34 = _mm256_sub_epi32(src[3 * col_num], src[4 * col_num]);
+
+    // stage 2: even half butterflies, odd half cospi[32] rotation
+    const __m256i even0 = _mm256_add_epi32(sum07, sum34);
+    const __m256i even3 = _mm256_sub_epi32(sum07, sum34);
+    const __m256i even1 = _mm256_add_epi32(sum16, sum25);
+    const __m256i even2 = _mm256_sub_epi32(sum16, sum25);
+    const __m256i rot5 =
+        av1_half_btf_avx2(&cospim32, &dif25, &cospi32, &dif16, &rnding, bit);
+    const __m256i rot6 =
+        av1_half_btf_avx2(&cospi32, &dif25, &cospi32, &dif16, &rnding, bit);
+
+    // stage 3, type 0: the two products are shared by the sum and difference
+    const __m256i prod0 = _mm256_mullo_epi32(even0, cospi32);
+    const __m256i prod1 = _mm256_mullo_epi32(even1, cospi32);
+    const __m256i buf0 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(prod0, prod1), rnding), bit);
+    const __m256i buf1 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(prod0, prod1), rnding), bit);
+
+    // stage 3, type 1
+    const __m256i buf2 =
+        av1_half_btf_avx2(&cospi48, &even2, &cospi16, &even3, &rnding, bit);
+    const __m256i buf3 =
+        av1_half_btf_avx2(&cospi48, &even3, &cospim16, &even2, &rnding, bit);
+
+    // stage 3, odd half
+    const __m256i odd4 = _mm256_add_epi32(dif34, rot5);
+    const __m256i odd5 = _mm256_sub_epi32(dif34, rot5);
+    const __m256i odd6 = _mm256_sub_epi32(dif07, rot6);
+    const __m256i odd7 = _mm256_add_epi32(dif07, rot6);
+
+    // stage 4 / stage 5: final odd rotations
+    const __m256i buf4 =
+        av1_half_btf_avx2(&cospi56, &odd4, &cospi8, &odd7, &rnding, bit);
+    const __m256i buf7 =
+        av1_half_btf_avx2(&cospi56, &odd7, &cospim8, &odd4, &rnding, bit);
+    const __m256i buf5 =
+        av1_half_btf_avx2(&cospi24, &odd5, &cospi40, &odd6, &rnding, bit);
+    const __m256i buf6 =
+        av1_half_btf_avx2(&cospi24, &odd6, &cospim40, &odd5, &rnding, bit);
+
+    // Output permutation: bufN names the pre-permutation index.
+    dst[1 * outstride] = buf4;
+    dst[7 * outstride] = buf7;
+    dst[5 * outstride] = buf5;
+    dst[3 * outstride] = buf6;
+    dst[0 * outstride] = buf0;
+    dst[4 * outstride] = buf1;
+    dst[2 * outstride] = buf2;
+    dst[6 * outstride] = buf3;
+  }
+}
+// 8-point forward ADST over 32-bit lanes, eight independent transforms per
+// vector column.
+//
+// in        : input element k of column `col` is in[k * col_num + col].
+// out       : output element k of column `col` is out[k * outstride + col].
+// bit       : selects the cosine table (cospi_arr(bit)) and is the rounding
+//             right-shift applied after every multiply stage.
+// col_num   : number of vector columns to process (also the input stride).
+//
+// Changes from the previous revision: the stray "(void)col_num;" was removed
+// (the parameter is used), the misspelled parameter name was corrected, and
+// the hand-expanded multiply/add/round/shift groups now go through
+// av1_half_btf_avx2, which performs the same operations in the same order.
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m256i x, y;
+
+  for (int col = 0; col < col_num; ++col) {
+    // stage 1: input permutation with sign flips
+    u0 = in[0 * col_num + col];
+    u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
+    u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
+    u3 = in[4 * col_num + col];
+    u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
+    u5 = in[6 * col_num + col];
+    u6 = in[2 * col_num + col];
+    u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);
+
+    // stage 2: cospi[32] rotations; the products x / y are shared between the
+    // sum and the difference, so they are kept explicit.
+    v0 = u0;
+    v1 = u1;
+
+    x = _mm256_mullo_epi32(u2, cospi32);
+    y = _mm256_mullo_epi32(u3, cospi32);
+    v2 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(x, y), rnding), bit);
+    v3 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(x, y), rnding), bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = _mm256_mullo_epi32(u6, cospi32);
+    y = _mm256_mullo_epi32(u7, cospi32);
+    v6 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_add_epi32(x, y), rnding), bit);
+    v7 = _mm256_srai_epi32(
+        _mm256_add_epi32(_mm256_sub_epi32(x, y), rnding), bit);
+
+    // stage 3
+    u0 = _mm256_add_epi32(v0, v2);
+    u1 = _mm256_add_epi32(v1, v3);
+    u2 = _mm256_sub_epi32(v0, v2);
+    u3 = _mm256_sub_epi32(v1, v3);
+    u4 = _mm256_add_epi32(v4, v6);
+    u5 = _mm256_add_epi32(v5, v7);
+    u6 = _mm256_sub_epi32(v4, v6);
+    u7 = _mm256_sub_epi32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+    v4 = av1_half_btf_avx2(&cospi16, &u4, &cospi48, &u5, &rnding, bit);
+    v5 = av1_half_btf_avx2(&cospi48, &u4, &cospim16, &u5, &rnding, bit);
+    v6 = av1_half_btf_avx2(&cospim48, &u6, &cospi16, &u7, &rnding, bit);
+    v7 = av1_half_btf_avx2(&cospi16, &u6, &cospi48, &u7, &rnding, bit);
+
+    // stage 5
+    u0 = _mm256_add_epi32(v0, v4);
+    u1 = _mm256_add_epi32(v1, v5);
+    u2 = _mm256_add_epi32(v2, v6);
+    u3 = _mm256_add_epi32(v3, v7);
+    u4 = _mm256_sub_epi32(v0, v4);
+    u5 = _mm256_sub_epi32(v1, v5);
+    u6 = _mm256_sub_epi32(v2, v6);
+    u7 = _mm256_sub_epi32(v3, v7);
+
+    // stage 6
+    v0 = av1_half_btf_avx2(&cospi4, &u0, &cospi60, &u1, &rnding, bit);
+    v1 = av1_half_btf_avx2(&cospi60, &u0, &cospim4, &u1, &rnding, bit);
+    v2 = av1_half_btf_avx2(&cospi20, &u2, &cospi44, &u3, &rnding, bit);
+    v3 = av1_half_btf_avx2(&cospi44, &u2, &cospim20, &u3, &rnding, bit);
+    v4 = av1_half_btf_avx2(&cospi36, &u4, &cospi28, &u5, &rnding, bit);
+    v5 = av1_half_btf_avx2(&cospi28, &u4, &cospim36, &u5, &rnding, bit);
+    v6 = av1_half_btf_avx2(&cospi52, &u6, &cospi12, &u7, &rnding, bit);
+    v7 = av1_half_btf_avx2(&cospi12, &u6, &cospim52, &u7, &rnding, bit);
+
+    // stage 7: output permutation
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+  }
+}
+// 8-point identity transform: every 32-bit lane is doubled (x + x).
+// Exactly 8 * col_num vectors are processed, and out[i] is produced from
+// in[i], so the output uses the same layout as the input. `bit` and
+// `outstride` are accepted only so the signature matches the other 1-D
+// kernels; neither is used.
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+                       int outstride) {
+  (void)bit;
+  (void)outstride;
+  const int total = 8 * col_num;
+  for (int i = 0; i < total; ++i) {
+    out[i] = _mm256_add_epi32(in[i], in[i]);
+  }
+}
+// 2-D forward transform of one 8x8 residual block.
+//
+// input   : 8x8 int16_t samples, rows `stride` elements apart.
+// coeff   : receives 64 int32_t coefficients, 8 per row (written with
+//           aligned 256-bit stores by store_buffer_avx2).
+// tx_type : selects the column kernel, the row kernel and the input flips.
+// bd      : unused.
+//
+// Pipeline, identical for every tx_type:
+//   load (+flip, << shift[0]) -> column kernel -> rounding >> -shift[1]
+//   -> transpose -> row kernel -> store.
+//
+// The switch only picks the kernels and flip flags; the pipeline itself is
+// written once. The row pass takes its cosine bit from av1_fwd_cos_bit_row.
+// Several cases of the previous revision passed av1_fwd_cos_bit_col to the
+// row kernel instead.
+// NOTE(review): presumably both tables hold the same value for TX_8X8, which
+// would make this a pure consistency fix - confirm before relying on
+// bit-exact output against older builds.
+// An unknown tx_type asserts in debug builds and otherwise leaves `coeff`
+// untouched, as before.
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  __m256i in[8], out[8];
+  const TX_SIZE tx_size = TX_8X8;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int width_div8 = (width >> 3);
+  const int8_t col_bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int8_t row_bit = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  transform_1d_avx2 col_txfm;
+  transform_1d_avx2 row_txfm;
+  int flipud = 0;
+  int fliplr = 0;
+  (void)bd;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      col_txfm = fdct8_avx2;
+      row_txfm = fdct8_avx2;
+      break;
+    case ADST_DCT:
+      col_txfm = fadst8_avx2;
+      row_txfm = fdct8_avx2;
+      break;
+    case DCT_ADST:
+      col_txfm = fdct8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case ADST_ADST:
+      col_txfm = fadst8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case FLIPADST_DCT:
+      flipud = 1;
+      col_txfm = fadst8_avx2;
+      row_txfm = fdct8_avx2;
+      break;
+    case DCT_FLIPADST:
+      fliplr = 1;
+      col_txfm = fdct8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case FLIPADST_FLIPADST:
+      flipud = 1;
+      fliplr = 1;
+      col_txfm = fadst8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case ADST_FLIPADST:
+      fliplr = 1;
+      col_txfm = fadst8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case FLIPADST_ADST:
+      flipud = 1;
+      col_txfm = fadst8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case IDTX:
+      col_txfm = idtx8_avx2;
+      row_txfm = idtx8_avx2;
+      break;
+    case V_DCT:
+      col_txfm = fdct8_avx2;
+      row_txfm = idtx8_avx2;
+      break;
+    case H_DCT:
+      col_txfm = idtx8_avx2;
+      row_txfm = fdct8_avx2;
+      break;
+    case V_ADST:
+      col_txfm = fadst8_avx2;
+      row_txfm = idtx8_avx2;
+      break;
+    case H_ADST:
+      col_txfm = idtx8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    case V_FLIPADST:
+      flipud = 1;
+      col_txfm = fadst8_avx2;
+      row_txfm = idtx8_avx2;
+      break;
+    case H_FLIPADST:
+      fliplr = 1;
+      col_txfm = idtx8_avx2;
+      row_txfm = fadst8_avx2;
+      break;
+    default:
+      assert(0);
+      return;
+  }
+
+  load_buffer_8x8_avx2(input, in, stride, flipud, fliplr, shift[0]);
+  col_txfm(in, out, col_bit, width_div8, width_div8);
+  // shift[1] is stored negated relative to the right-shift amount expected by
+  // col_txfm_8x8_rounding.
+  col_txfm_8x8_rounding(out, -shift[1]);
+  fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+  row_txfm(in, out, row_bit, width_div8, width_div8);
+  store_buffer_avx2(out, coeff, 8, 8);
+}
+
+static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[16], v[16], x;
+ int col;
+
+ // Calculate the column 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm256_add_epi32(u[0], u[7]);
+ v[7] = _mm256_sub_epi32(u[0], u[7]);
+ v[1] = _mm256_add_epi32(u[1], u[6]);
+ v[6] = _mm256_sub_epi32(u[1], u[6]);
+ v[2] = _mm256_add_epi32(u[2], u[5]);
+ v[5] = _mm256_sub_epi32(u[2], u[5]);
+ v[3] = _mm256_add_epi32(u[3], u[4]);
+ v[4] = _mm256_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim32);
+ x = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[13], cospim32);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospim32);
+ x = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi32);
+ x = _mm256_mullo_epi32(u[12], cospim32);
+ v[12] = _mm256_sub_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[3]);
+ u[3] = _mm256_sub_epi32(v[0], v[3]);
+ u[1] = _mm256_add_epi32(v[1], v[2]);
+ u[2] = _mm256_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm256_mullo_epi32(v[5], cospim32);
+ x = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi32);
+ x = _mm256_mullo_epi32(v[6], cospim32);
+ u[6] = _mm256_sub_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm256_add_epi32(v[8], v[11]);
+ u[11] = _mm256_sub_epi32(v[8], v[11]);
+ u[9] = _mm256_add_epi32(v[9], v[10]);
+ u[10] = _mm256_sub_epi32(v[9], v[10]);
+ u[12] = _mm256_sub_epi32(v[15], v[12]);
+ u[15] = _mm256_add_epi32(v[15], v[12]);
+ u[13] = _mm256_sub_epi32(v[14], v[13]);
+ u[14] = _mm256_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm256_mullo_epi32(u[0], cospi32);
+ u[1] = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(u[0], u[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(u[0], u[1]);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(u[2], cospi48);
+ x = _mm256_mullo_epi32(u[3], cospi16);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(u[2], cospi16);
+ x = _mm256_mullo_epi32(u[3], cospi48);
+ v[3] = _mm256_sub_epi32(x, v[3]);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_add_epi32(u[4], u[5]);
+ v[5] = _mm256_sub_epi32(u[4], u[5]);
+ v[6] = _mm256_sub_epi32(u[7], u[6]);
+ v[7] = _mm256_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm256_mullo_epi32(u[9], cospim16);
+ x = _mm256_mullo_epi32(u[14], cospi48);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi48);
+ x = _mm256_mullo_epi32(u[14], cospim16);
+ v[14] = _mm256_sub_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim48);
+ x = _mm256_mullo_epi32(u[13], cospim16);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospim16);
+ x = _mm256_mullo_epi32(u[13], cospim48);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi56);
+ x = _mm256_mullo_epi32(v[7], cospi8);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[7] = _mm256_mullo_epi32(v[4], cospi8);
+ x = _mm256_mullo_epi32(v[7], cospi56);
+ u[7] = _mm256_sub_epi32(x, u[7]);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[5] = _mm256_mullo_epi32(v[5], cospi24);
+ x = _mm256_mullo_epi32(v[6], cospi40);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi40);
+ x = _mm256_mullo_epi32(v[6], cospi24);
+ u[6] = _mm256_sub_epi32(x, u[6]);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[8] = _mm256_add_epi32(v[8], v[9]);
+ u[9] = _mm256_sub_epi32(v[8], v[9]);
+ u[10] = _mm256_sub_epi32(v[11], v[10]);
+ u[11] = _mm256_add_epi32(v[11], v[10]);
+ u[12] = _mm256_add_epi32(v[12], v[13]);
+ u[13] = _mm256_sub_epi32(v[12], v[13]);
+ u[14] = _mm256_sub_epi32(v[15], v[14]);
+ u[15] = _mm256_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi60);
+ x = _mm256_mullo_epi32(u[15], cospi4);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[15] = _mm256_mullo_epi32(u[8], cospi4);
+ x = _mm256_mullo_epi32(u[15], cospi60);
+ v[15] = _mm256_sub_epi32(x, v[15]);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ v[9] = _mm256_mullo_epi32(u[9], cospi28);
+ x = _mm256_mullo_epi32(u[14], cospi36);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi36);
+ x = _mm256_mullo_epi32(u[14], cospi28);
+ v[14] = _mm256_sub_epi32(x, v[14]);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi44);
+ x = _mm256_mullo_epi32(u[13], cospi20);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi20);
+ x = _mm256_mullo_epi32(u[13], cospi44);
+ v[13] = _mm256_sub_epi32(x, v[13]);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospi12);
+ x = _mm256_mullo_epi32(u[12], cospi52);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi52);
+ x = _mm256_mullo_epi32(u[12], cospi12);
+ v[12] = _mm256_sub_epi32(x, v[12]);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ out[0 * outstride + col] = v[0];
+ out[1 * outstride + col] = v[8];
+ out[2 * outstride + col] = v[4];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[2];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[6];
+ out[7 * outstride + col] = v[14];
+ out[8 * outstride + col] = v[1];
+ out[9 * outstride + col] = v[9];
+ out[10 * outstride + col] = v[5];
+ out[11 * outstride + col] = v[13];
+ out[12 * outstride + col] = v[3];
+ out[13 * outstride + col] = v[11];
+ out[14 * outstride + col] = v[7];
+ out[15 * outstride + col] = v[15];
+ }
+}
+static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int num_cols, const int outstride) { /*
+ * 16-point forward ADST (per the function name), run independently on each
+ * of num_cols vector columns; every __m256i carries eight 32-bit lanes.
+ *   in        : row r of column col is read from in[r * num_cols + col]
+ *   out       : row r of column col is written to out[r * outstride + col]
+ *   bit       : selects the cosine table through cospi_arr(bit) and is the
+ *               arithmetic right shift applied after the multiply stages;
+ *               the rounding term added before the shift is 1 << (bit - 1)
+ *   num_cols  : number of vector columns processed, also the input row stride
+ *   outstride : output row stride, in __m256i units
+ * Products are formed with _mm256_mullo_epi32, so only the low 32 bits of
+ * each lane product are kept.
+ * NOTE(review): av1_half_btf_avx2 is defined elsewhere; presumably it returns
+ * (w0 * n0 + w1 * n1 + rnding) >> bit for its (w0, n0, w1, n1) arguments --
+ * confirm against its definition.
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+
+ __m256i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1: gather the 16 input rows in permuted order; eight of them are
+ // negated by subtracting from zero
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2: pairs (2,3), (6,7), (10,11), (14,15) become the rounded,
+ // shifted sum and difference of their cospi[32] products; the other
+ // eight elements pass through unchanged
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm256_mullo_epi32(u[2], cospi32);
+ y = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(x, y);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(x, y);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm256_mullo_epi32(u[6], cospi32);
+ y = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(x, y);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(x, y);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(x, y);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(x, y);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm256_mullo_epi32(u[14], cospi32);
+ y = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(x, y);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(x, y);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3: add/sub butterflies between elements two apart inside each
+ // group of four (no multiplies, no shift)
+ u[0] = _mm256_add_epi32(v[0], v[2]);
+ u[1] = _mm256_add_epi32(v[1], v[3]);
+ u[2] = _mm256_sub_epi32(v[0], v[2]);
+ u[3] = _mm256_sub_epi32(v[1], v[3]);
+ u[4] = _mm256_add_epi32(v[4], v[6]);
+ u[5] = _mm256_add_epi32(v[5], v[7]);
+ u[6] = _mm256_sub_epi32(v[4], v[6]);
+ u[7] = _mm256_sub_epi32(v[5], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[10]);
+ u[9] = _mm256_add_epi32(v[9], v[11]);
+ u[10] = _mm256_sub_epi32(v[8], v[10]);
+ u[11] = _mm256_sub_epi32(v[9], v[11]);
+ u[12] = _mm256_add_epi32(v[12], v[14]);
+ u[13] = _mm256_add_epi32(v[13], v[15]);
+ u[14] = _mm256_sub_epi32(v[12], v[14]);
+ u[15] = _mm256_sub_epi32(v[13], v[15]);
+
+ // stage 4: elements 4-7 and 12-15 are recombined pairwise through
+ // av1_half_btf_avx2 with +/-cospi[16] and +/-cospi[48] weights; elements
+ // 0-3 and 8-11 pass through
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5: add/sub butterflies between elements four apart inside each
+ // group of eight
+ u[0] = _mm256_add_epi32(v[0], v[4]);
+ u[1] = _mm256_add_epi32(v[1], v[5]);
+ u[2] = _mm256_add_epi32(v[2], v[6]);
+ u[3] = _mm256_add_epi32(v[3], v[7]);
+ u[4] = _mm256_sub_epi32(v[0], v[4]);
+ u[5] = _mm256_sub_epi32(v[1], v[5]);
+ u[6] = _mm256_sub_epi32(v[2], v[6]);
+ u[7] = _mm256_sub_epi32(v[3], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[12]);
+ u[9] = _mm256_add_epi32(v[9], v[13]);
+ u[10] = _mm256_add_epi32(v[10], v[14]);
+ u[11] = _mm256_add_epi32(v[11], v[15]);
+ u[12] = _mm256_sub_epi32(v[8], v[12]);
+ u[13] = _mm256_sub_epi32(v[9], v[13]);
+ u[14] = _mm256_sub_epi32(v[10], v[14]);
+ u[15] = _mm256_sub_epi32(v[11], v[15]);
+
+ // stage 6: elements 8-15 are recombined pairwise through
+ // av1_half_btf_avx2 with cospi[8]/cospi[56] and cospi[24]/cospi[40]
+ // weights; elements 0-7 pass through
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7: add/sub butterflies between elements eight apart
+ u[0] = _mm256_add_epi32(v[0], v[8]);
+ u[1] = _mm256_add_epi32(v[1], v[9]);
+ u[2] = _mm256_add_epi32(v[2], v[10]);
+ u[3] = _mm256_add_epi32(v[3], v[11]);
+ u[4] = _mm256_add_epi32(v[4], v[12]);
+ u[5] = _mm256_add_epi32(v[5], v[13]);
+ u[6] = _mm256_add_epi32(v[6], v[14]);
+ u[7] = _mm256_add_epi32(v[7], v[15]);
+ u[8] = _mm256_sub_epi32(v[0], v[8]);
+ u[9] = _mm256_sub_epi32(v[1], v[9]);
+ u[10] = _mm256_sub_epi32(v[2], v[10]);
+ u[11] = _mm256_sub_epi32(v[3], v[11]);
+ u[12] = _mm256_sub_epi32(v[4], v[12]);
+ u[13] = _mm256_sub_epi32(v[5], v[13]);
+ u[14] = _mm256_sub_epi32(v[6], v[14]);
+ u[15] = _mm256_sub_epi32(v[7], v[15]);
+
+ // stage 8: every adjacent pair (2k, 2k+1) is recombined through
+ // av1_half_btf_avx2, with weight pairs running from cospi[2]/cospi[62]
+ // up to cospi[58]/cospi[6]
+ v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9: store the 16 results in permuted order, outstride apart
+ out[0 * outstride + col] = v[1];
+ out[1 * outstride + col] = v[14];
+ out[2 * outstride + col] = v[3];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[5];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[7];
+ out[7 * outstride + col] = v[8];
+ out[8 * outstride + col] = v[9];
+ out[9 * outstride + col] = v[6];
+ out[10 * outstride + col] = v[11];
+ out[11 * outstride + col] = v[4];
+ out[12 * outstride + col] = v[13];
+ out[13 * outstride + col] = v[2];
+ out[14 * outstride + col] = v[15];
+ out[15 * outstride + col] = v[0];
+ }
+}
+static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        int col_num, const int outstride) {
+  // 16-point identity kernel. Each of the 16 * col_num input vectors is
+  // multiplied lane-wise by 2 * NewSqrt2, a half-step rounding offset is
+  // added, and the result is shifted right arithmetically by NewSqrt2Bits.
+  // `bit` and `outstride` are accepted only so the function fits the shared
+  // transform_1d_avx2 signature; the output uses the same linear layout as
+  // the input.
+  const __m256i scale = _mm256_set1_epi32(2 * NewSqrt2);
+  const __m256i round_half = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
+  const __m256i *src = in;
+  __m256i *dst = out;
+  (void)bit;
+  (void)outstride;
+
+  for (int remaining = 16 * col_num; remaining > 0; --remaining) {
+    const __m256i scaled = _mm256_mullo_epi32(*src++, scale);
+    *dst++ = _mm256_srai_epi32(_mm256_add_epi32(scaled, round_half),
+                               NewSqrt2Bits);
+  }
+}
+static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { /*
+ * 16-point 1-D kernels for the column pass, indexed by tx_type
+ * (av1_fwd_txfm2d_8x16_avx2 looks up its col_txfm here). The trailing
+ * comment on each entry names the TX_TYPE that slot is written for; the
+ * table relies on that matching the TX_TYPE enumerator order, which is
+ * declared elsewhere.
+ */
+ fdct16_avx2, // DCT_DCT
+ fadst16_avx2, // ADST_DCT
+ fdct16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fadst16_avx2, // FLIPADST_DCT
+ fdct16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ fdct16_avx2, // V_DCT
+ idtx16_avx2, // H_DCT
+ fadst16_avx2, // V_ADST
+ idtx16_avx2, // H_ADST
+ fadst16_avx2, // V_FLIPADST
+ idtx16_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { /*
+ * 8-point 1-D kernels for the row pass, indexed by tx_type
+ * (av1_fwd_txfm2d_8x16_avx2 looks up its row_txfm here). The trailing
+ * comment on each entry names the TX_TYPE that slot is written for; the
+ * table relies on that matching the TX_TYPE enumerator order, which is
+ * declared elsewhere.
+ */
+ fdct8_avx2, // DCT_DCT
+ fdct8_avx2, // ADST_DCT
+ fadst8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fdct8_avx2, // FLIPADST_DCT
+ fadst8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ idtx8_avx2, // V_DCT
+ fdct8_avx2, // H_DCT
+ idtx8_avx2, // V_ADST
+ fadst8_avx2, // H_ADST
+ idtx8_avx2, // V_FLIPADST
+ fadst8_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  // Forward 2-D transform for TX_8X16: load (with the flips implied by
+  // tx_type), 16-point column kernel, rounding of each 8-vector half,
+  // transpose, 8-point row kernel, rectangular rescale by NewSqrt2, store.
+  // `bd` is not used.
+  const int8_t *const stage_shift = av1_fwd_txfm_shift_ls[TX_8X16];
+  const transform_1d_avx2 vertical_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const transform_1d_avx2 horizontal_txfm = row_highbd_txfm8x8_arr[tx_type];
+  // One cos-bit value, taken from the column table, drives both passes.
+  const int8_t cos_bit =
+      av1_fwd_cos_bit_col[get_txw_idx(TX_8X16)][get_txh_idx(TX_8X16)];
+  __m256i buf_a[16], buf_b[16];
+  int flip_ud, flip_lr;
+  (void)bd;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+  load_buffer_8x16_avx2(input, buf_a, stride, flip_ud, flip_lr,
+                        stage_shift[0]);
+
+  // Column pass over a single vector column, then round both halves.
+  vertical_txfm(buf_a, buf_b, cos_bit, 1, 1);
+  col_txfm_8x8_rounding(buf_b, -stage_shift[1]);
+  col_txfm_8x8_rounding(buf_b + 8, -stage_shift[1]);
+
+  // Transpose the two 8-vector halves into interleaved positions of buf_a
+  // (second half starts at buf_a + 1).
+  fwd_txfm_transpose_8x8_avx2(buf_b, buf_a, 1, 2);
+  fwd_txfm_transpose_8x8_avx2(buf_b + 8, buf_a + 1, 1, 2);
+
+  // Row pass over two vector columns, rescale, and write out.
+  horizontal_txfm(buf_a, buf_b, cos_bit, 2, 2);
+  round_shift_rect_array_32_avx2(buf_b, buf_a, 16, -stage_shift[2], NewSqrt2);
+  store_buffer_avx2(buf_a, coeff, 8, 16);
+}
+static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { /*
+ * 8-point 1-D kernels for the column pass, indexed by tx_type
+ * (av1_fwd_txfm2d_16x8_avx2 looks up its col_txfm here). The trailing
+ * comment on each entry names the TX_TYPE that slot is written for; the
+ * table relies on that matching the TX_TYPE enumerator order, which is
+ * declared elsewhere.
+ */
+ fdct8_avx2, // DCT_DCT
+ fadst8_avx2, // ADST_DCT
+ fdct8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fadst8_avx2, // FLIPADST_DCT
+ fdct8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ fdct8_avx2, // V_DCT
+ idtx8_avx2, // H_DCT
+ fadst8_avx2, // V_ADST
+ idtx8_avx2, // H_ADST
+ fadst8_avx2, // V_FLIPADST
+ idtx8_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { /*
+ * 16-point 1-D kernels for the row pass, indexed by tx_type
+ * (av1_fwd_txfm2d_16x8_avx2 looks up its row_txfm here). The trailing
+ * comment on each entry names the TX_TYPE that slot is written for; the
+ * table relies on that matching the TX_TYPE enumerator order, which is
+ * declared elsewhere.
+ */
+ fdct16_avx2, // DCT_DCT
+ fdct16_avx2, // ADST_DCT
+ fadst16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fdct16_avx2, // FLIPADST_DCT
+ fadst16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ idtx16_avx2, // V_DCT
+ fdct16_avx2, // H_DCT
+ idtx16_avx2, // V_ADST
+ fadst16_avx2, // H_ADST
+ idtx16_avx2, // V_FLIPADST
+ fadst16_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                              TX_TYPE tx_type, int bd) {
+  // Forward 2-D transform for TX_16X8: load (with the flips implied by
+  // tx_type), first-stage shift, 8-point column kernel over two vector
+  // columns, second-stage shift, transpose, 16-point row kernel, in-place
+  // rectangular rescale by NewSqrt2, store. `bd` is not used.
+  const int8_t *const stage_shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const transform_1d_avx2 vertical_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const transform_1d_avx2 horizontal_txfm = row_highbd_txfm8x16_arr[tx_type];
+  // One cos-bit value, taken from the column table, drives both passes.
+  const int8_t cos_bit =
+      av1_fwd_cos_bit_col[get_txw_idx(TX_16X8)][get_txh_idx(TX_16X8)];
+  __m256i buf_a[16], buf_b[16];
+  int flip_ud, flip_lr;
+  (void)bd;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+  load_buffer_16xn_avx2(input, buf_a, stride, 8, 2, flip_ud, flip_lr);
+  round_shift_32_8xn_avx2(buf_a, 16, stage_shift[0], 1);
+
+  // Column pass, followed by the second-stage shift.
+  vertical_txfm(buf_a, buf_b, cos_bit, 2, 2);
+  round_shift_32_8xn_avx2(buf_b, 16, stage_shift[1], 1);
+
+  // Transpose the two interleaved vector columns of buf_b into the two
+  // contiguous 8-vector halves of buf_a.
+  fwd_txfm_transpose_8x8_avx2(buf_b, buf_a, 2, 1);
+  fwd_txfm_transpose_8x8_avx2(buf_b + 1, buf_a + 8, 2, 1);
+
+  // Row pass over a single vector column, rescale in place, and write out.
+  horizontal_txfm(buf_a, buf_b, cos_bit, 1, 1);
+  round_shift_rect_array_32_avx2(buf_b, buf_b, 16, -stage_shift[2], NewSqrt2);
+  store_buffer_avx2(buf_b, coeff, 8, 16);
+}
+void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  // Forward 2-D transform for TX_16X16.
+  //
+  // Every transform type runs the same pipeline: load (optionally flipped),
+  // first-stage shift, 16-point column kernel, second-stage shift, 16x16
+  // transpose, 16-point row kernel, store. Only the two kernels and the flip
+  // flags depend on tx_type, so they are looked up in the 16-point kernel
+  // tables and via get_flip_cfg(), the same way the 8x16 and 16x8 entry
+  // points do, instead of repeating the pipeline once per type.
+  //
+  //   input   : source samples, `stride` elements between rows
+  //   coeff   : destination for the transform coefficients
+  //   tx_type : selects the column/row kernels and the flips; a value outside
+  //             [0, TX_TYPES) trips the assert and, in builds without
+  //             asserts, leaves coeff untouched
+  //   bd      : unused
+  __m256i in[32], out[32];
+  const TX_SIZE tx_size = TX_16X16;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int width_div8 = (width >> 3);
+  const int width_div16 = (width >> 4);
+  const int size = (height << 1);
+  int ud_flip, lr_flip;
+  (void)bd;
+
+  // Reject unknown types before they are used as a table index.
+  if ((unsigned int)tx_type >= (unsigned int)TX_TYPES) {
+    assert(0);
+    return;
+  }
+
+  const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  load_buffer_16xn_avx2(input, in, stride, height, width_div8, ud_flip,
+                        lr_flip);
+  round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+  // Column pass uses the column cos-bit table ...
+  col_txfm(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+           width_div8);
+  round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+  fwd_txfm_transpose_16x16_avx2(out, in);
+  // ... and the row pass uses the row cos-bit table.
+  row_txfm(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+           width_div8);
+  store_buffer_avx2(out, coeff, 8, 32);
+}
+static INLINE void fdct32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, const int instride,
+ const int outstride) {
+ __m256i buf0[32];
+ __m256i buf1[32];
+ const int32_t *cospi;
+ int startidx = 0 * instride;
+ int endidx = 31 * instride;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+ cos_bit);
+ btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+ cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 31 * outstride;
+ // stage 9
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
+ // Identity "transform" over 32 vectors: every 32-bit lane of
+ // input[k * instride] is shifted left by 2 (i.e. multiplied by 4) and the
+ // result is stored to output[k * outstride], for k = 0..31 in ascending
+ // order. cos_bit is accepted only so the signature matches the other 1-D
+ // kernels stored in the transform tables; it is not used.
+ static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+                                   const int8_t cos_bit, int instride,
+                                   int outstride) {
+   (void)cos_bit;
+   for (int k = 0; k < 32; ++k) {
+     output[k * outstride] = _mm256_slli_epi32(input[k * instride], 2);
+   }
+ }
+ static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {  // column-pass 1-D kernels, looked up by tx_type in av1_fwd_txfm2d_32x32_avx2
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT  (NULL = no kernel; the lookup site calls the entry without a NULL check)
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+ static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {  // row-pass 1-D kernels, looked up by tx_type in av1_fwd_txfm2d_32x32_avx2
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT  (NULL = no kernel; the lookup site calls the entry without a NULL check)
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+ // 2-D forward transform for TX_32X32 blocks.
+ //
+ //   input   : 16-bit source samples, read with row pitch `stride`
+ //   output  : 32-bit destination, written by store_buffer_avx2
+ //   tx_type : selects the column/row kernels from col_txfm8x32_arr and
+ //             row_txfm8x32_arr. Entries that are NULL in those tables are
+ //             called without a check, so only types with a registered
+ //             kernel may be passed.
+ //   bd      : unused
+ //
+ // Pipeline: load + round-shift by shift[0] -> column kernel -> round-shift
+ // by shift[1] -> 8x8 block transpose into a second buffer -> row kernel ->
+ // round-shift by shift[2] -> store.
+ void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+   (void)bd;
+   // Two working buffers of 128 vectors each (32 * 32 values, 8 per vector).
+   __m256i col_buf[128], row_buf[128];
+   const int tx_size = TX_32X32;
+   const int8_t *const shift = av1_fwd_txfm_shift_ls[tx_size];
+   const int txw_idx = get_txw_idx(tx_size);
+   const int txh_idx = get_txh_idx(tx_size);
+   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+   const int cols = tx_size_wide[tx_size];
+   const int rows = tx_size_high[tx_size];
+   const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+   const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+   const int num_pairs = cols >> 4;  // groups of 16 columns (2 vectors each)
+   const int vec_pitch = cols >> 3;  // vectors per buffer row
+
+   // Column pass: each iteration handles two adjacent 8-lane columns.
+   for (int pair = 0; pair < num_pairs; ++pair) {
+     __m256i *const first = &col_buf[pair << 1];
+     __m256i *const second = &col_buf[(pair << 1) + 1];
+
+     load_buffer_16xn_avx2(input + (pair << 4), first, stride, rows, vec_pitch,
+                           0, 0);
+     round_shift_32_8xn_avx2(first, rows, shift[0], vec_pitch);
+     round_shift_32_8xn_avx2(second, rows, shift[0], vec_pitch);
+     col_txfm(first, first, cos_bit_col, vec_pitch, vec_pitch);
+     col_txfm(second, second, cos_bit_col, vec_pitch, vec_pitch);
+     round_shift_32_8xn_avx2(first, rows, shift[1], vec_pitch);
+     round_shift_32_8xn_avx2(second, rows, shift[1], vec_pitch);
+   }
+
+   // Transpose in 8x8 tiles: tile (blk_row, blk_col) of col_buf lands at tile
+   // (blk_col, blk_row) of row_buf.
+   for (int blk_row = 0; (blk_row << 3) < rows; ++blk_row) {
+     for (int blk_col = 0; blk_col < vec_pitch; ++blk_col) {
+       fwd_txfm_transpose_8x8_avx2(
+           &col_buf[(blk_row << 3) * vec_pitch + blk_col],
+           &row_buf[blk_col * 8 * vec_pitch + blk_row], vec_pitch, vec_pitch);
+     }
+   }
+
+   // Row pass on the transposed data, followed by the final rounding shift.
+   for (int pair = 0; pair < num_pairs; ++pair) {
+     __m256i *const first = &row_buf[pair << 1];
+     __m256i *const second = &row_buf[(pair << 1) + 1];
+
+     row_txfm(first, first, cos_bit_row, vec_pitch, vec_pitch);
+     row_txfm(second, second, cos_bit_row, vec_pitch, vec_pitch);
+     round_shift_32_8xn_avx2(first, rows, shift[2], vec_pitch);
+     round_shift_32_8xn_avx2(second, rows, shift[2], vec_pitch);
+   }
+
+   store_buffer_avx2(row_buf, output, 8, 128);
+ }
+ // Stage 2 of the fdct64 pipeline: reads x1[0..63] and fills x2[0..63].
+ //   x2[0..31]  : x2[j] = x1[j] + x1[31 - j], x2[31 - j] = x1[j] - x1[31 - j]
+ //   x2[32..39] : copied from x1
+ //   x2[40..55] : btf_32_type0_avx2_new on the pairs (x1[j], x1[95 - j]),
+ //                j = 40..47, with weights (*cospi_m32, *cospi_p32)
+ //   x2[56..63] : copied from x1
+ // *__rounding and cos_bit are passed straight through to the butterfly.
+ static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+                                       __m256i *cospi_m32, __m256i *cospi_p32,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+   for (int j = 0; j < 16; ++j) {
+     x2[j] = _mm256_add_epi32(x1[j], x1[31 - j]);
+     x2[31 - j] = _mm256_sub_epi32(x1[j], x1[31 - j]);
+   }
+   for (int j = 32; j < 40; ++j) {
+     x2[j] = x1[j];
+   }
+   for (int j = 40; j < 48; ++j) {
+     btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[j], x1[95 - j], x2[j],
+                           x2[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 56; j < 64; ++j) {
+     x2[j] = x1[j];
+   }
+ }
+ // Stage 3 of the fdct64 pipeline: reads x2[0..63] and fills x3[0..63].
+ //   x3[0..15]  : sum at j / difference at 15 - j of (x2[j], x2[15 - j])
+ //   x3[16..19] : copied
+ //   x3[20..27] : btf_32_type0_avx2_new on (x2[j], x2[47 - j]), j = 20..23,
+ //                with weights (*cospi_m32, *cospi_p32)
+ //   x3[28..31] : copied
+ //   x3[32..47] : sum at j / difference at 79 - j of (x2[j], x2[79 - j])
+ //   x3[48..63] : difference at j / sum at 111 - j of (x2[111 - j], x2[j])
+ // *__rounding and cos_bit are passed straight through to the butterfly.
+ static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+                                       __m256i *cospi_m32, __m256i *cospi_p32,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+   for (int j = 0; j < 8; ++j) {
+     x3[j] = _mm256_add_epi32(x2[j], x2[15 - j]);
+     x3[15 - j] = _mm256_sub_epi32(x2[j], x2[15 - j]);
+   }
+   for (int j = 16; j < 20; ++j) {
+     x3[j] = x2[j];
+   }
+   for (int j = 20; j < 24; ++j) {
+     btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[j], x2[47 - j], x3[j],
+                           x3[47 - j], *__rounding, cos_bit);
+   }
+   for (int j = 28; j < 32; ++j) {
+     x3[j] = x2[j];
+   }
+   for (int j = 32; j < 40; ++j) {
+     x3[j] = _mm256_add_epi32(x2[j], x2[79 - j]);
+     x3[79 - j] = _mm256_sub_epi32(x2[j], x2[79 - j]);
+   }
+   for (int j = 48; j < 56; ++j) {
+     x3[j] = _mm256_sub_epi32(x2[111 - j], x2[j]);
+     x3[111 - j] = _mm256_add_epi32(x2[111 - j], x2[j]);
+   }
+ }
+ // Stage 4 of the fdct64 pipeline: reads x3[0..63] and fills x4[0..63].
+ //   x4[0..7]   : sum at j / difference at 7 - j of (x3[j], x3[7 - j])
+ //   x4[8..9], x4[14..15] : copied
+ //   x4[10..13] : butterfly on (x3[j], x3[23 - j]), weights (m32, p32)
+ //   x4[16..23] : sum at j / difference at 39 - j of (x3[j], x3[39 - j])
+ //   x4[24..31] : difference at j / sum at 55 - j of (x3[55 - j], x3[j])
+ //   x4[32..35], x4[44..51], x4[60..63] : copied
+ //   x4[36..39] with x4[56..59] : butterfly, weights (m16, p48)
+ //   x4[40..43] with x4[52..55] : butterfly, weights (m48, m16)
+ // "butterfly" is btf_32_type0_avx2_new; *__rounding and cos_bit are
+ // forwarded to it unchanged.
+ static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
+                                       __m256i *cospi_m32, __m256i *cospi_p32,
+                                       __m256i *cospi_m16, __m256i *cospi_p48,
+                                       __m256i *cospi_m48,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+   for (int j = 0; j < 4; ++j) {
+     x4[j] = _mm256_add_epi32(x3[j], x3[7 - j]);
+     x4[7 - j] = _mm256_sub_epi32(x3[j], x3[7 - j]);
+   }
+   x4[8] = x3[8];
+   x4[9] = x3[9];
+   for (int j = 10; j < 12; ++j) {
+     btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[j], x3[23 - j], x4[j],
+                           x4[23 - j], *__rounding, cos_bit);
+   }
+   x4[14] = x3[14];
+   x4[15] = x3[15];
+   for (int j = 16; j < 20; ++j) {
+     x4[j] = _mm256_add_epi32(x3[j], x3[39 - j]);
+     x4[39 - j] = _mm256_sub_epi32(x3[j], x3[39 - j]);
+   }
+   for (int j = 24; j < 28; ++j) {
+     x4[j] = _mm256_sub_epi32(x3[55 - j], x3[j]);
+     x4[55 - j] = _mm256_add_epi32(x3[55 - j], x3[j]);
+   }
+   for (int j = 32; j < 36; ++j) {
+     x4[j] = x3[j];
+   }
+   for (int j = 36; j < 40; ++j) {
+     btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[j], x3[95 - j], x4[j],
+                           x4[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 40; j < 44; ++j) {
+     btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[j], x3[95 - j], x4[j],
+                           x4[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 44; j < 52; ++j) {
+     x4[j] = x3[j];
+   }
+   for (int j = 60; j < 64; ++j) {
+     x4[j] = x3[j];
+   }
+ }
+ // Stage 5 of the fdct64 pipeline: reads x4[0..63] and fills x5[0..63].
+ //   x5[0..3]   : sum at j / difference at 3 - j of (x4[j], x4[3 - j])
+ //   x5[4], x5[7] : copied;  x5[5..6] : butterfly, weights (m32, p32)
+ //   x5[8..11]  : sum at j / difference at 19 - j
+ //   x5[12..15] : difference at j / sum at 27 - j of (x4[27 - j], x4[j])
+ //   x5[16..17], x5[22..25], x5[30..31] : copied
+ //   x5[18..19] with x5[28..29] : butterfly, weights (m16, p48)
+ //   x5[20..21] with x5[26..27] : butterfly, weights (m48, m16)
+ //   x5[32..63] : four groups of eight, alternating "sum low / diff high"
+ //                and "diff low / sum high" add-sub pairs
+ // "butterfly" is btf_32_type0_avx2_new; *__rounding and cos_bit are
+ // forwarded to it unchanged.
+ static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
+                                       __m256i *cospi_m32, __m256i *cospi_p32,
+                                       __m256i *cospi_m16, __m256i *cospi_p48,
+                                       __m256i *cospi_m48,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+   for (int j = 0; j < 2; ++j) {
+     x5[j] = _mm256_add_epi32(x4[j], x4[3 - j]);
+     x5[3 - j] = _mm256_sub_epi32(x4[j], x4[3 - j]);
+   }
+   x5[4] = x4[4];
+   btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
+                         *__rounding, cos_bit);
+   x5[7] = x4[7];
+   for (int j = 8; j < 10; ++j) {
+     x5[j] = _mm256_add_epi32(x4[j], x4[19 - j]);
+     x5[19 - j] = _mm256_sub_epi32(x4[j], x4[19 - j]);
+   }
+   for (int j = 12; j < 14; ++j) {
+     x5[j] = _mm256_sub_epi32(x4[27 - j], x4[j]);
+     x5[27 - j] = _mm256_add_epi32(x4[27 - j], x4[j]);
+   }
+   x5[16] = x4[16];
+   x5[17] = x4[17];
+   for (int j = 18; j < 20; ++j) {
+     btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[j], x4[47 - j], x5[j],
+                           x5[47 - j], *__rounding, cos_bit);
+   }
+   for (int j = 20; j < 22; ++j) {
+     btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[j], x4[47 - j], x5[j],
+                           x5[47 - j], *__rounding, cos_bit);
+   }
+   for (int j = 22; j < 26; ++j) {
+     x5[j] = x4[j];
+   }
+   x5[30] = x4[30];
+   x5[31] = x4[31];
+   for (int j = 32; j < 36; ++j) {
+     x5[j] = _mm256_add_epi32(x4[j], x4[71 - j]);
+     x5[71 - j] = _mm256_sub_epi32(x4[j], x4[71 - j]);
+   }
+   for (int j = 40; j < 44; ++j) {
+     x5[j] = _mm256_sub_epi32(x4[87 - j], x4[j]);
+     x5[87 - j] = _mm256_add_epi32(x4[87 - j], x4[j]);
+   }
+   for (int j = 48; j < 52; ++j) {
+     x5[j] = _mm256_add_epi32(x4[j], x4[103 - j]);
+     x5[103 - j] = _mm256_sub_epi32(x4[j], x4[103 - j]);
+   }
+   for (int j = 56; j < 60; ++j) {
+     x5[j] = _mm256_sub_epi32(x4[119 - j], x4[j]);
+     x5[119 - j] = _mm256_add_epi32(x4[119 - j], x4[j]);
+   }
+ }
+ // Stage 6 of the fdct64 pipeline: reads x5[0..63] and fills x6[0..63].
+ //   x6[0..3]   : two butterflies, weights (p32, p32) and (p16, p48)
+ //   x6[4..7]   : add-sub pairs (4,5) and (7,6)
+ //   x6[8], x6[11], x6[12], x6[15] : copied
+ //   x6[9]/x6[14], x6[10]/x6[13]   : butterflies, (m16, p48) and (m48, m16)
+ //   x6[16..31] : two groups of eight add-sub pairs
+ //   x6[32..63] : butterflies on (x5[j], x5[95 - j]) for j = 34..37 and
+ //                j = 42..45; every other index in 32..63 is copied
+ // "butterfly" is btf_32_type0_avx2_new; *__rounding and cos_bit are
+ // forwarded to it unchanged.
+ static INLINE void fdct64_stage6_avx2(
+     __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
+     __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+     __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
+     __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
+     const __m256i *__rounding, int8_t cos_bit) {
+   btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
+                         *__rounding, cos_bit);
+   btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
+                         *__rounding, cos_bit);
+   x6[4] = _mm256_add_epi32(x5[4], x5[5]);
+   x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
+   x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
+   x6[7] = _mm256_add_epi32(x5[7], x5[6]);
+   x6[8] = x5[8];
+   btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
+                         *__rounding, cos_bit);
+   btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10],
+                         x6[13], *__rounding, cos_bit);
+   x6[11] = x5[11];
+   x6[12] = x5[12];
+   x6[15] = x5[15];
+
+   // Each group of eight: first half is "sum low / diff high", second half
+   // is "diff low / sum high".
+   for (int base = 16; base < 32; base += 8) {
+     for (int j = 0; j < 2; ++j) {
+       const int lo = base + j;
+       const int hi = base + 3 - j;
+       x6[lo] = _mm256_add_epi32(x5[lo], x5[hi]);
+       x6[hi] = _mm256_sub_epi32(x5[lo], x5[hi]);
+     }
+     for (int j = 0; j < 2; ++j) {
+       const int lo = base + 4 + j;
+       const int hi = base + 7 - j;
+       x6[lo] = _mm256_sub_epi32(x5[hi], x5[lo]);
+       x6[hi] = _mm256_add_epi32(x5[hi], x5[lo]);
+     }
+   }
+
+   x6[32] = x5[32];
+   x6[33] = x5[33];
+   for (int j = 34; j < 36; ++j) {
+     btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[j], x5[95 - j], x6[j],
+                           x6[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 36; j < 38; ++j) {
+     btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[j], x5[95 - j], x6[j],
+                           x6[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 38; j < 42; ++j) {
+     x6[j] = x5[j];
+   }
+   for (int j = 42; j < 44; ++j) {
+     btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[j], x5[95 - j], x6[j],
+                           x6[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 44; j < 46; ++j) {
+     btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[j], x5[95 - j], x6[j],
+                           x6[95 - j], *__rounding, cos_bit);
+   }
+   for (int j = 46; j < 50; ++j) {
+     x6[j] = x5[j];
+   }
+   for (int j = 54; j < 58; ++j) {
+     x6[j] = x5[j];
+   }
+   x6[62] = x5[62];
+   x6[63] = x5[63];
+ }
+ // Stage 7 of the fdct64 pipeline: reads x6[0..63] and fills x7[0..63].
+ //   x7[0..3]   : copied
+ //   x7[4]/x7[7], x7[5]/x7[6] : butterflies, (p08, p56) and (p40, p24)
+ //   x7[8..15]  : two groups of four add-sub pairs
+ //   x7[16..31] : butterflies on (17,30), (18,29), (21,26), (22,25); the
+ //                remaining indices are copied
+ //   x7[32..63] : four groups of eight add-sub pairs
+ // "butterfly" is btf_32_type0_avx2_new; *__rounding and cos_bit are
+ // forwarded to it unchanged.
+ static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
+                                       __m256i *cospi_p08, __m256i *cospi_p56,
+                                       __m256i *cospi_p40, __m256i *cospi_p24,
+                                       __m256i *cospi_m08, __m256i *cospi_m56,
+                                       __m256i *cospi_m40, __m256i *cospi_m24,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+   for (int j = 0; j < 4; ++j) {
+     x7[j] = x6[j];
+   }
+   btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
+                         *__rounding, cos_bit);
+   btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
+                         *__rounding, cos_bit);
+
+   // Groups of four: (b, b+1) is sum/diff, (b+3, b+2) is diff/sum.
+   for (int base = 8; base < 16; base += 4) {
+     x7[base] = _mm256_add_epi32(x6[base], x6[base + 1]);
+     x7[base + 1] = _mm256_sub_epi32(x6[base], x6[base + 1]);
+     x7[base + 2] = _mm256_sub_epi32(x6[base + 3], x6[base + 2]);
+     x7[base + 3] = _mm256_add_epi32(x6[base + 3], x6[base + 2]);
+   }
+
+   x7[16] = x6[16];
+   btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17],
+                         x7[30], *__rounding, cos_bit);
+   btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18],
+                         x7[29], *__rounding, cos_bit);
+   x7[19] = x6[19];
+   x7[20] = x6[20];
+   btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21],
+                         x7[26], *__rounding, cos_bit);
+   btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22],
+                         x7[25], *__rounding, cos_bit);
+   x7[23] = x6[23];
+   x7[24] = x6[24];
+   x7[27] = x6[27];
+   x7[28] = x6[28];
+   x7[31] = x6[31];
+
+   // Groups of eight: first half is "sum low / diff high", second half is
+   // "diff low / sum high".
+   for (int base = 32; base < 64; base += 8) {
+     for (int j = 0; j < 2; ++j) {
+       const int lo = base + j;
+       const int hi = base + 3 - j;
+       x7[lo] = _mm256_add_epi32(x6[lo], x6[hi]);
+       x7[hi] = _mm256_sub_epi32(x6[lo], x6[hi]);
+     }
+     for (int j = 0; j < 2; ++j) {
+       const int lo = base + 4 + j;
+       const int hi = base + 7 - j;
+       x7[lo] = _mm256_sub_epi32(x6[hi], x6[lo]);
+       x7[hi] = _mm256_add_epi32(x6[hi], x6[lo]);
+     }
+   }
+ }
+// Stage 8 of the 64-point forward DCT (AVX2).
+//   x7         - 64 input vectors (output of stage 7).
+//   x8         - 64 output vectors written by this stage.
+//   cospi      - cosine table; entries 4, 12, 20, 28, 36, 44, 52 and 60 are
+//                read, both as-is and negated.
+//   __rounding - vector forwarded to every btf_32_type0_avx2_new() call.
+//   cos_bit    - forwarded to every btf_32_type0_avx2_new() call.
+static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+                                      const int32_t *cospi,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
+  // Positive weights.
+  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+  // Negated weights.
+  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+
+  // Lanes 0..7 are copied through unchanged.
+  for (int i = 0; i < 8; ++i) {
+    x8[i] = x7[i];
+  }
+
+  // Lanes 8..15: weighted butterflies pairing lane k with lane 23 - k.
+  btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
+                        *__rounding, cos_bit);
+
+  // Lanes 16..31: plain add/sub butterflies, four lanes per group.
+  for (int i = 16; i < 32; i += 4) {
+    x8[i] = _mm256_add_epi32(x7[i], x7[i + 1]);
+    x8[i + 1] = _mm256_sub_epi32(x7[i], x7[i + 1]);
+    x8[i + 2] = _mm256_sub_epi32(x7[i + 3], x7[i + 2]);
+    x8[i + 3] = _mm256_add_epi32(x7[i + 3], x7[i + 2]);
+  }
+
+  // Lanes 32..63: in every group of four, the outer two lanes are copied
+  // through and the inner two are produced by the butterflies below.
+  for (int i = 32; i < 64; i += 4) {
+    x8[i] = x7[i];
+    x8[i + 3] = x7[i + 3];
+  }
+  btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+                        *__rounding, cos_bit);
+}
+// Stage 9 of the 64-point forward DCT (AVX2).
+//   x8         - 64 input vectors (output of stage 8).
+//   x9         - 64 output vectors written by this stage.
+//   cospi      - cosine table; the even entries 2, 6, ..., 62 that appear in
+//                the weight list below are read.
+//   __rounding - vector forwarded to every btf_32_type0_avx2_new() call.
+//   cos_bit    - forwarded to every btf_32_type0_avx2_new() call.
+static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+                                      const int32_t *cospi,
+                                      const __m256i *__rounding,
+                                      int8_t cos_bit) {
+  // Weights, listed in the pairs in which the butterflies consume them.
+  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+
+  // Lanes 0..15 are copied through unchanged.
+  for (int i = 0; i < 16; ++i) {
+    x9[i] = x8[i];
+  }
+
+  // Lanes 16..31: weighted butterflies pairing lane k with lane 47 - k.
+  btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
+                        *__rounding, cos_bit);
+
+  // Lanes 32..63: plain add/sub butterflies, four lanes per group.
+  for (int i = 32; i < 64; i += 4) {
+    x9[i] = _mm256_add_epi32(x8[i], x8[i + 1]);
+    x9[i + 1] = _mm256_sub_epi32(x8[i], x8[i + 1]);
+    x9[i + 2] = _mm256_sub_epi32(x8[i + 3], x8[i + 2]);
+    x9[i + 3] = _mm256_add_epi32(x8[i + 3], x8[i + 2]);
+  }
+}
+// Stage 10 of the 64-point forward DCT (AVX2).
+//   x9         - 64 input vectors (output of stage 9).
+//   x10        - 64 output vectors written by this stage.
+//   cospi      - cosine table; every odd entry 1, 3, ..., 63 is read.
+//   __rounding - vector forwarded to every btf_32_type0_avx2_new() call.
+//   cos_bit    - forwarded to every btf_32_type0_avx2_new() call.
+static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+                                       const int32_t *cospi,
+                                       const __m256i *__rounding,
+                                       int8_t cos_bit) {
+  // Weights, listed in the pairs in which the butterflies consume them; the
+  // two table indices of each pair add up to 64.
+  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+
+  // Lanes 0..31 are copied through unchanged.
+  for (int i = 0; i < 32; ++i) {
+    x10[i] = x9[i];
+  }
+
+  // Lanes 32..63: weighted butterflies pairing lane k with lane 95 - k.
+  btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
+                        *__rounding, cos_bit);
+  btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
+                        *__rounding, cos_bit);
+}
+// 64-point forward DCT over __m256i vectors (AVX2).
+//   input     - logical element k is read from input[k * instride], k = 0..63.
+//   output    - logical element k is written to output[k * outstride].
+//   cos_bit   - selects the cosine table via cospi_arr() and sets the rounding
+//               constant 1 << (cos_bit - 1) shared by all stages.
+// Every input element is read into a local array before any output element is
+// written, so the visible in-place call (input == output, equal strides) is
+// handled.
+static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+                        const int instride, const int outstride) {
+  // Stage-11 permutation: the k-th output slot counted from the front takes
+  // x2[kStage11Src[k]], the k-th slot counted from the back takes
+  // x2[63 - kStage11Src[k]]. Overall output slot j receives the lane whose
+  // index is the 6-bit bit reversal of j.
+  static const int kStage11Src[32] = {
+    0, 32, 16, 48, 8,  40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+    2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62
+  };
+
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+
+  // stage 1: x1[i] = in[i] + in[63 - i], x1[63 - i] = in[i] - in[63 - i].
+  __m256i x1[64];
+  for (int i = 0; i < 32; ++i) {
+    const __m256i head = input[i * instride];
+    const __m256i tail = input[(63 - i) * instride];
+    x1[i] = _mm256_add_epi32(head, tail);
+    x1[63 - i] = _mm256_sub_epi32(head, tail);
+  }
+
+  // stages 2..10 ping-pong between the two scratch arrays x1 and x2.
+  __m256i x2[64];
+  // stage 2
+  fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  // stage 3
+  fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+  // stage 4
+  fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &__rounding, cos_bit);
+  // stage 5
+  fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &__rounding, cos_bit);
+  // stage 6
+  fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+                     &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
+                     &cospi_p24, &cospi_m24, &__rounding, cos_bit);
+  // stage 7
+  fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+                     &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+                     &__rounding, cos_bit);
+  // stage 8
+  fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+  // stage 9
+  fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+  // stage 10
+  fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+
+  // stage 11: scatter x2 to the output, one front slot and one back slot per
+  // iteration, walking towards the middle.
+  for (int i = 0; i < 32; ++i) {
+    const int src = kStage11Src[i];
+    output[i * outstride] = x2[src];
+    output[(63 - i) * outstride] = x2[63 - src];
+  }
+}
+// 64x64 forward 2-D transform (AVX2). Only DCT_DCT is accepted (asserted);
+// bd is unused.
+//   input  - int16_t source block, read with the given row stride.
+//   output - int32_t destination, written by store_buffer_avx2().
+void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+
+  const TX_SIZE tx_size = TX_64X64;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int width_div16 = (width >> 4);
+  const int width_div8 = (width >> 3);
+  const transform_1d_avx2 col_txfm = fdct64_avx2;
+  const transform_1d_avx2 row_txfm = fdct64_avx2;
+  __m256i buf0[512], buf1[512];
+
+  // Column pass: each iteration loads 16 source columns into two adjacent
+  // vector columns of buf0, then rounds, transforms in place and rounds again.
+  for (int i = 0; i < width_div16; i++) {
+    __m256i *const col_even = &buf0[i << 1];
+    __m256i *const col_odd = &buf0[(i << 1) + 1];
+
+    load_buffer_16xn_avx2(input + (i << 4), col_even, stride, height,
+                          width_div8, 0, 0);
+    round_shift_32_8xn_avx2(col_even, height, shift[0], width_div8);
+    round_shift_32_8xn_avx2(col_odd, height, shift[0], width_div8);
+    col_txfm(col_even, col_even, cos_bit_col, width_div8, width_div8);
+    col_txfm(col_odd, col_odd, cos_bit_col, width_div8, width_div8);
+    round_shift_32_8xn_avx2(col_even, height, shift[1], width_div8);
+    round_shift_32_8xn_avx2(col_odd, height, shift[1], width_div8);
+  }
+
+  // Transpose buf0 into buf1, one 8x8 tile at a time.
+  for (int r = 0; r < height; r += 8) {
+    for (int c = 0; c < width_div8; c++) {
+      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                  &buf1[c * 8 * width_div8 + (r >> 3)],
+                                  width_div8, width_div8);
+    }
+  }
+
+  // Row pass: only vector columns 0..3 of buf1 are transformed; results go
+  // back to buf0 with an output stride of width_div16.
+  for (int i = 0; i < 2; i++) {
+    __m256i *const dst_even = &buf0[i << 1];
+    __m256i *const dst_odd = &buf0[(i << 1) + 1];
+
+    row_txfm(&buf1[i << 1], dst_even, cos_bit_row, width_div8, width_div16);
+    row_txfm(&buf1[(i << 1) + 1], dst_odd, cos_bit_row, width_div8,
+             width_div16);
+    round_shift_32_8xn_avx2(dst_even, (height >> 1), shift[2], width_div16);
+    round_shift_32_8xn_avx2(dst_odd, (height >> 1), shift[2], width_div16);
+  }
+
+  store_buffer_avx2(buf0, output, 8, 128);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..158b4ae439
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Stores out_size vectors from in[] to out, one per row; row r starts at
+// out + r * stride. Uses the aligned store _mm_store_si128().
+static INLINE void store_output_w4(int32_t *const out, const __m128i *const in,
+                                   const int stride, const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    __m128i *const dst = (__m128i *)(out + row * stride);
+    _mm_store_si128(dst, in[row]);
+  }
+}
+
+// 4x4 forward Walsh-Hadamard transform (SSE4.1).
+//   input  - 4 rows of 4 int16_t samples, `stride` elements apart.
+//   output - 16 coefficients written contiguously with unaligned stores.
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+  // Load each row (4 x int16_t) and sign-extend it to 4 x int32_t.
+  __m128i op[4];
+  for (int r = 0; r < 4; ++r) {
+    const __m128i row16 =
+        _mm_loadl_epi64((const __m128i *)(input + r * stride));
+    op[r] = _mm_cvtepi16_epi32(row16);
+  }
+
+  // Two identical 1-D passes with a transpose in between.
+  for (int pass = 0; pass < 2; ++pass) {
+    __m128i a = op[0];
+    __m128i b = op[1];
+    __m128i c = op[2];
+    __m128i d = op[3];
+
+    a = _mm_add_epi32(a, b);  // a += b
+    d = _mm_sub_epi32(d, c);  // d -= c
+    // half = (a - d) >> 1 (arithmetic shift)
+    const __m128i half = _mm_srai_epi32(_mm_sub_epi32(a, d), 1);
+    b = _mm_sub_epi32(half, b);  // b = half - b
+    c = _mm_sub_epi32(half, c);  // c = half - c
+    a = _mm_sub_epi32(a, c);     // a -= c
+    d = _mm_add_epi32(d, b);     // d += b
+
+    op[0] = a;
+    op[1] = c;
+    op[2] = d;
+    op[3] = b;
+
+    if (pass == 0) {
+      transpose_32bit_4x4(op, op);
+    }
+  }
+
+  // Scale by 1 << UNIT_QUANT_SHIFT and write the four rows back to back.
+  for (int r = 0; r < 4; ++r) {
+    op[r] = _mm_slli_epi32(op[r], UNIT_QUANT_SHIFT);
+    _mm_storeu_si128((__m128i *)(output + 4 * r), op[r]);
+  }
+}
+
+// Loads a 4x4 block of int16_t samples into four 4 x int32_t vectors.
+//   flipud - when non-zero, rows are loaded in reverse order.
+//   fliplr - when non-zero, the four samples of every row are reversed.
+//   shift  - left shift applied to every sample after widening.
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  // Fetch the rows, top-down or bottom-up.
+  for (int r = 0; r < 4; ++r) {
+    const int src_row = flipud ? 3 - r : r;
+    in[r] = _mm_loadl_epi64((const __m128i *)(input + src_row * stride));
+  }
+
+  for (int r = 0; r < 4; ++r) {
+    __m128i v = in[r];
+    if (fliplr) {
+      // 0x1b reverses the order of the four low 16-bit words.
+      v = _mm_shufflelo_epi16(v, 0x1b);
+    }
+    v = _mm_cvtepi16_epi32(v);
+    in[r] = _mm_slli_epi32(v, shift);
+  }
+}
+
+// 4-point forward DCT on four vectors; the same `bit` is used for both the
+// cosine table and the rounding shift. Per the original notes on shift usage
+// in the 4x4 path:
+//   shift[0] is used in load_buffer_4x4()
+//   shift[1] is used in txfm_func_col()
+//   shift[2] is used in txfm_func_row()
+// Input k is read from in[k * num_col]; outputs go to out[0..3]. All inputs
+// are read before any output is written, so in == out is handled.
+static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                           const int num_col) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+
+  // Stage 1: outer pair (0, 3) and inner pair (1, 2).
+  const __m128i sum03 = _mm_add_epi32(in[0], in[3 * num_col]);
+  const __m128i dif03 = _mm_sub_epi32(in[0], in[3 * num_col]);
+  const __m128i sum12 = _mm_add_epi32(in[num_col], in[2 * num_col]);
+  const __m128i dif12 = _mm_sub_epi32(in[num_col], in[2 * num_col]);
+
+  // Even outputs: (sum03 +/- sum12) * cospi32, rounded and shifted.
+  const __m128i e0 = _mm_mullo_epi32(sum03, cospi32);
+  const __m128i e1 = _mm_mullo_epi32(sum12, cospi32);
+  const __m128i even_add = _mm_add_epi32(_mm_add_epi32(e0, e1), rnding);
+  const __m128i even_sub = _mm_add_epi32(_mm_sub_epi32(e0, e1), rnding);
+  const __m128i res0 = _mm_srai_epi32(even_add, bit);
+  const __m128i res2 = _mm_srai_epi32(even_sub, bit);
+
+  // Odd outputs:
+  //   res1 = dif12 * cospi48 + dif03 * cospi16
+  //   res3 = dif03 * cospi48 - dif12 * cospi16
+  const __m128i odd_add = _mm_add_epi32(_mm_mullo_epi32(dif12, cospi48),
+                                        _mm_mullo_epi32(dif03, cospi16));
+  const __m128i res1 = _mm_srai_epi32(_mm_add_epi32(odd_add, rnding), bit);
+  const __m128i odd_sub = _mm_sub_epi32(_mm_mullo_epi32(dif03, cospi48),
+                                        _mm_mullo_epi32(dif12, cospi16));
+  const __m128i res3 = _mm_srai_epi32(_mm_add_epi32(odd_sub, rnding), bit);
+
+  // Note: shift[1] and shift[2] are zeros
+
+  out[0] = res0;
+  out[1] = res1;
+  out[2] = res2;
+  out[3] = res3;
+}
+
+// Writes four vectors (4 x int32_t each) to 16 consecutive output elements
+// using the aligned store _mm_store_si128().
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+  for (int row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(output + row * 4), res[row]);
+  }
+}
+
+// 4-point forward ADST on four vectors, using sinpi_arr(bit)[1..4] as weights.
+// Input k is read from in[k * num_col]; outputs go to out[0..3], each rounded
+// with 1 << (bit - 1) and arithmetically shifted right by `bit`. All inputs
+// are read before any output is written, so in == out is handled.
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int num_col) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+
+  const __m128i i0 = in[0 * num_col];
+  const __m128i i1 = in[1 * num_col];
+  const __m128i i2 = in[2 * num_col];
+  const __m128i i3 = in[3 * num_col];
+
+  // Weighted inputs.
+  const __m128i i0_s1 = _mm_mullo_epi32(i0, sinpi1);
+  const __m128i i0_s4 = _mm_mullo_epi32(i0, sinpi4);
+  const __m128i i1_s2 = _mm_mullo_epi32(i1, sinpi2);
+  const __m128i i1_s1 = _mm_mullo_epi32(i1, sinpi1);
+  const __m128i i2_s3 = _mm_mullo_epi32(i2, sinpi3);
+  const __m128i i3_s4 = _mm_mullo_epi32(i3, sinpi4);
+  const __m128i i3_s2 = _mm_mullo_epi32(i3, sinpi2);
+  // i0 + i1 - i3
+  const __m128i mix = _mm_sub_epi32(_mm_add_epi32(i0, i1), i3);
+
+  // Intermediate terms.
+  const __m128i x0 = _mm_add_epi32(_mm_add_epi32(i0_s1, i1_s2), i3_s4);
+  const __m128i x1 = _mm_mullo_epi32(mix, sinpi3);
+  const __m128i x2 = _mm_add_epi32(_mm_sub_epi32(i0_s4, i1_s1), i3_s2);
+  const __m128i x3 = i2_s3;
+
+  // Unrounded outputs.
+  const __m128i y0 = _mm_add_epi32(x0, x3);
+  const __m128i y1 = x1;
+  const __m128i y2 = _mm_sub_epi32(x2, x3);
+  const __m128i y3 = _mm_add_epi32(_mm_sub_epi32(x2, x0), x3);
+
+  out[0] = _mm_srai_epi32(_mm_add_epi32(y0, rnding), bit);
+  out[1] = _mm_srai_epi32(_mm_add_epi32(y1, rnding), bit);
+  out[2] = _mm_srai_epi32(_mm_add_epi32(y2, rnding), bit);
+  out[3] = _mm_srai_epi32(_mm_add_epi32(y3, rnding), bit);
+}
+// 4-point identity transform: every input vector is multiplied by NewSqrt2,
+// rounded with 1 << (NewSqrt2Bits - 1) and arithmetically shifted right by
+// NewSqrt2Bits. Input k is read from in[k * col_num]; `bit` is unused.
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  const __m128i scale = _mm_set1_epi32(NewSqrt2);
+  const __m128i half = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+
+  for (int row = 0; row < 4; ++row) {
+    const __m128i scaled = _mm_mullo_epi32(in[row * col_num], scale);
+    out[row] = _mm_srai_epi32(_mm_add_epi32(scaled, half), NewSqrt2Bits);
+  }
+}
+// Forward 2-D transform of one 4x4 block.
+//
+// Every transform type runs the same pipeline: load the block (optionally
+// flipped up/down and/or left/right, left-shifted by shift[0]), run the first
+// 1-D kernel, transpose, run the second 1-D kernel, then store the result
+// with write_buffer_4x4().
+//
+//   input         source samples, row pitch input_stride.
+//   coeff         destination coefficients.
+//   tx_type       selects the two 1-D kernels and the flips.
+//   bd            unused.
+//
+// The first pass always receives the av1_fwd_cos_bit_col entry and the second
+// pass the av1_fwd_cos_bit_row entry. The previous code passed the row entry
+// to the first pass of V_FLIPADST and swapped the two tables for H_DCT, H_ADST
+// and H_FLIPADST.
+// NOTE(review): for TX_4X4 both tables are expected to hold the same value,
+// so this should not change any output -- confirm against the table
+// definitions.
+//
+// An unrecognised tx_type trips assert(0); when asserts are compiled out the
+// function returns without touching coeff, as before.
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                               int input_stride, TX_TYPE tx_type, int bd) {
+  __m128i in[4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int col_bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int row_bit = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip = 0;
+  int lr_flip = 0;
+  (void)bd;
+
+  // Flip selection: a FLIPADST in the first position flips up/down, one in
+  // the second position flips left/right.
+  switch (tx_type) {
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: ud_flip = 1; break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST: lr_flip = 1; break;
+    case FLIPADST_FLIPADST:
+      ud_flip = 1;
+      lr_flip = 1;
+      break;
+    default: break;
+  }
+
+  // Kernel selection: types that differ only in their flips share one body.
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fdct4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fdct4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case ADST_DCT:
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fadst4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fdct4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case DCT_ADST:
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fdct4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fadst4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case ADST_ADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fadst4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fadst4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case IDTX:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      idtx4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      idtx4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fdct4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      idtx4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      idtx4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fdct4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case V_ADST:
+    case V_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      fadst4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      idtx4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    case H_ADST:
+    case H_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, ud_flip, lr_flip, shift[0]);
+      idtx4x4_sse4_1(in, in, col_bit, 1);
+      transpose_32bit_4x4(in, in);
+      fadst4x4_sse4_1(in, in, row_bit, 1);
+      break;
+    default: assert(0); return;
+  }
+
+  write_buffer_4x4(in, coeff);
+}
+
+// Loads an 8x8 block of 16-bit samples and expands it to sixteen vectors of
+// 32-bit lanes: row r ends up in in[2 * r] (its first four samples) and
+// in[2 * r + 1] (its last four), each left-shifted by `shift`.
+//
+//   input   first sample of the block; rows are `stride` samples apart and
+//           are read with aligned 128-bit loads.
+//   in      receives 16 vectors (also used as scratch for the 16-bit rows).
+//   flipud  non-zero: rows are taken bottom-to-top.
+//   fliplr  non-zero: each row is reversed with mm_reverse_epi16().
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    const int src_row = flipud ? 7 - i : i;
+    in[i] = _mm_load_si128((const __m128i *)(input + src_row * stride));
+  }
+
+  if (fliplr) {
+    for (i = 0; i < 8; ++i) in[i] = mm_reverse_epi16(in[i]);
+  }
+
+  // Widen in place from the last row down to the first: row i is written to
+  // slots 2i and 2i+1, which never overlap a lower row still waiting to be
+  // read. The row is copied out first because slot 0 is its own destination.
+  for (i = 7; i >= 0; --i) {
+    const __m128i row16 = in[i];
+    const __m128i high16 = _mm_unpackhi_epi64(row16, row16);
+    in[2 * i] = _mm_slli_epi32(_mm_cvtepi16_epi32(row16), shift);
+    in[2 * i + 1] = _mm_slli_epi32(_mm_cvtepi16_epi32(high16), shift);
+  }
+}
+
+// Rounding right shift of all 16 vectors of an 8x8 block, in place:
+// in[i] = (in[i] + (1 << (shift - 1))) >> shift, arithmetic, per 32-bit lane.
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
+
+  for (int i = 0; i < 16; ++i) {
+    in[i] = _mm_srai_epi32(_mm_add_epi32(in[i], half), shift);
+  }
+}
+
+// Rounding right shift of the first 8 vectors of `in`, in place:
+// in[i] = (in[i] + (1 << (shift - 1))) >> shift, arithmetic, per 32-bit lane.
+static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) {
+  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
+
+  for (int i = 0; i < 8; ++i) {
+    in[i] = _mm_srai_epi32(_mm_add_epi32(in[i], half), shift);
+  }
+}
+
+// Stores the 16 result vectors back to back (64 int32 values) starting at
+// `output`. Aligned stores are used, so every 16-byte slot of `output` must
+// be 16-byte aligned.
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
+  for (int i = 0; i < 16; ++i) {
+    _mm_store_si128((__m128i *)(output + i * 4), res[i]);
+  }
+}
+
+// Stores 16 result vectors as 8 rows of 8 int32 values: row r is taken from
+// res[2 * r] and res[2 * r + 1] and written at output + r * stride.
+// Unaligned stores are used, so `output` and `stride` carry no alignment
+// requirement.
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+                                     const int stride) {
+  for (int row = 0; row < 8; ++row) {
+    int32_t *const dst = output + stride * row;
+    _mm_storeu_si128((__m128i *)dst, res[2 * row]);
+    _mm_storeu_si128((__m128i *)(dst + 4), res[2 * row + 1]);
+  }
+}
+
+static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) { /*
+ * 8-point forward-DCT butterfly (going by the function name) evaluated on
+ * the four 32-bit lanes of each vector at once.
+ *
+ * in, out: the eight transform rows are the vectors at indices
+ *          0, col_num, 2 * col_num, ..., 7 * col_num; no other
+ *          element is read or written.
+ * bit:     selects the cosine table through cospi_arr(bit) and is the
+ *          right shift applied after every multiply-accumulate, with
+ *          rounding constant 1 << (bit - 1).
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ int startidx = 0 * col_num;  // walks forward from row 0
+ int endidx = 7 * col_num;  // walks backward from row 7
+ // Even 8 points 0, 2, ..., 14 (the indices touched when col_num == 2)
+ // stage 0
+ // stage 1: add/subtract the mirrored row pairs (0,7), (1,6), (2,5), (3,4)
+ u[0] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7]: stored directly, stage 2 leaves it unchanged
+ startidx += col_num;
+ endidx -= col_num;
+ u[1] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[2] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[3] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4]: stored directly, stage 2 leaves it unchanged
+
+ // stage 2: butterflies on the sums; cospi32 rotation of the (5, 6) pair
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);  // v[5] = (u[6] - u[5]) * cospi32, rounded
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);  // u[0] is free here (already folded into v[0]/v[3]); reused as scratch
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);  // v[6] = (u[5] + u[6]) * cospi32, rounded
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0: (v[0] +/- v[1]) * cospi32; v[0] and v[1] are overwritten by their products
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1: rotation of (v[2], v[3]) by the cospi48 / cospi16 pair; v[0] and v[1] are scratch from here on
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);  // butterflies of the odd half: (4, 5) and (7, 6)
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5: final rotations of (u[4], u[7]) and (u[5], u[6]); results go to the odd output rows
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * col_num] = u[0]; // buf0[0]: the even half lands on output rows 0, 4, 2, 6
+ out[4 * col_num] = u[1]; // buf0[1]
+ out[2 * col_num] = u[2]; // buf0[2]
+ out[6 * col_num] = u[3]; // buf0[3]
+}
+
+// Runs fdct4x8_sse4_1() twice, once starting at vector 0 and once at vector 1,
+// which covers both four-lane halves of each row when col_num is 2.
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+                           const int col_num) {
+  for (int half = 0; half < 2; ++half) {
+    fdct4x8_sse4_1(in + half, out + half, bit, col_num);
+  }
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) { /*
+ * 8-point forward ADST (going by the function name) evaluated on the four
+ * 32-bit lanes of each vector, repeated for col_num vector columns.
+ *
+ * in, out: row r of vector column c is element [col_num * r + c];
+ *          rows 0..7 of every column c < col_num are read and written.
+ * bit:     selects the cosine table through cospi_arr(bit) and is the
+ *          right shift applied after every multiply-accumulate, with
+ *          rounding constant 1 << (bit - 1).
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();  // used to negate via 0 - x
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;  // scratch products for each rotation
+ int col;
+
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column constructs one row (8 coeffs)
+ // total we have 8 rows (8x8).
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1: input permutation 0, 7, 3, 4, 1, 6, 2, 5 with rows 7, 3, 1, 5 negated
+ u0 = in[col_num * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
+ u3 = in[col_num * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
+ u5 = in[col_num * 6 + col];
+ u6 = in[col_num * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
+
+ // stage 2: pairs (2, 3) and (6, 7) become (sum, difference) * cospi32; the rest pass through
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3: add/subtract butterflies on the pairs (0, 2), (1, 3), (4, 6), (5, 7)
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4: 0..3 pass through; (4, 5) and (6, 7) are rotated with the cospi16 / cospi48 pair
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5: add/subtract butterflies on the pairs (0, 4), (1, 5), (2, 6), (3, 7)
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6: four rotations, using the table pairs (4, 60), (20, 44), (36, 28), (52, 12)
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7: output permutation, rows 0..7 receive v1, v6, v3, v4, v5, v2, v7, v0
+ out[col_num * 0 + col] = v1;
+ out[col_num * 1 + col] = v6;
+ out[col_num * 2 + col] = v3;
+ out[col_num * 3 + col] = v4;
+ out[col_num * 4 + col] = v5;
+ out[col_num * 5 + col] = v2;
+ out[col_num * 6 + col] = v7;
+ out[col_num * 7 + col] = v0;
+ }
+}
+// 8-point identity kernel: doubles every 32-bit lane of the first
+// 8 * col_num vectors (out[k] = in[k] + in[k]). `bit` is ignored.
+static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+
+  for (int group = 0; group < col_num; ++group) {
+    const __m128i *src = in + 8 * group;
+    __m128i *dst = out + 8 * group;
+    for (int k = 0; k < 8; ++k) {
+      dst[k] = _mm_add_epi32(src[k], src[k]);
+    }
+  }
+}
+#if !CONFIG_REALTIME_ONLY
+// Doubles the 32-bit lanes of vectors j + 8 * r for j in {0, 1} and
+// r in 0..7 (out[k] = in[k] + in[k]); every other vector is left alone.
+// Both `bit` and `col_num` are ignored.
+static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  (void)col_num;
+  for (int j = 0; j < 2; ++j) {
+    for (int r = 0; r < 8; ++r) {
+      const int k = j + 8 * r;
+      out[k] = _mm_add_epi32(in[k], in[k]);
+    }
+  }
+}
+#endif
+// Forward 2-D transform of one 8x8 block.
+//
+// Every transform type runs the same pipeline: load the block (optionally
+// flipped up/down and/or left/right, left-shifted by shift[0]), run the first
+// 1-D kernel, round by -shift[1], transpose, run the second 1-D kernel, then
+// store the 16 result vectors with write_buffer_8x8().
+//
+//   input    source samples, row pitch `stride`.
+//   coeff    destination coefficients (64 int32 values).
+//   tx_type  selects the two 1-D kernels and the flips.
+//   bd       unused.
+//
+// The first pass always receives the av1_fwd_cos_bit_col entry and the second
+// pass the av1_fwd_cos_bit_row entry. The previous code passed the col entry
+// to the second pass of IDTX and of every V_* / H_* type.
+// NOTE(review): for TX_8X8 both tables are expected to hold the same value,
+// so this should not change any output -- confirm against the table
+// definitions.
+//
+// An unrecognised tx_type trips assert(0); when asserts are compiled out the
+// function returns without touching coeff, as before.
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  __m128i in[16], out[16];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int col_bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  const int row_bit = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+  int ud_flip = 0;
+  int lr_flip = 0;
+  (void)bd;
+
+  // Flip selection: a FLIPADST in the first position flips up/down, one in
+  // the second position flips left/right.
+  switch (tx_type) {
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: ud_flip = 1; break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST: lr_flip = 1; break;
+    case FLIPADST_FLIPADST:
+      ud_flip = 1;
+      lr_flip = 1;
+      break;
+    default: break;
+  }
+
+  // Kernel selection: types that differ only in their flips share one body.
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fdct8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case ADST_DCT:
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fadst8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case DCT_ADST:
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fdct8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case ADST_ADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fadst8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      idtx8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fdct8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      idtx8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case V_ADST:
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      fadst8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    case H_ADST:
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+      idtx8x8_sse4_1(in, out, col_bit, 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, row_bit, 2);
+      break;
+    default: assert(0); return;
+  }
+
+  write_buffer_8x8(out, coeff);
+}
+
+// Hybrid Transform 16x16
+
+// Interleaves four 16-vector blocks into 16 rows of 4 vectors each.
+// The blocks sit at in[0] and in[16] (left and right halves of rows 0-7) and
+// at in[32] and in[48] (left and right halves of rows 8-15). Output row r
+// occupies out[4 * r .. 4 * r + 3]: two vectors from the left block followed
+// by two from the right block.
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+  for (int row = 0; row < 16; ++row) {
+    const int block_base = (row < 8) ? 0 : 32;
+    const __m128i *left = in + block_base + 2 * (row & 7);
+    const __m128i *right = left + 16;
+    __m128i *dst = out + 4 * row;
+
+    dst[0] = left[0];
+    dst[1] = left[1];
+    dst[2] = right[0];
+    dst[3] = right[1];
+  }
+}
+
+// Loads a 16x16 block as four 8x8 quadrants and interleaves them into `out`
+// with convert_8x8_to_16x16(). flipud swaps the top and bottom quadrants and
+// fliplr swaps the left and right ones; both flags are also forwarded to
+// load_buffer_8x8() so each quadrant is flipped internally as well.
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  __m128i in[64];
+  // Sample offsets of the quadrants that land in the top rows / left columns
+  // of the result, and of their opposites.
+  const int top_off = flipud ? 8 * stride : 0;
+  const int bot_off = flipud ? 0 : 8 * stride;
+  const int left_off = fliplr ? 8 : 0;
+  const int right_off = fliplr ? 0 : 8;
+
+  // Left half of the result: rows 0-7 into in[0], rows 8-15 into in[32].
+  load_buffer_8x8(input + top_off + left_off, &in[0], stride, flipud, fliplr,
+                  shift);
+  load_buffer_8x8(input + bot_off + left_off, &in[32], stride, flipud, fliplr,
+                  shift);
+
+  // Right half of the result: rows 0-7 into in[16], rows 8-15 into in[48].
+  load_buffer_8x8(input + top_off + right_off, &in[16], stride, flipud, fliplr,
+                  shift);
+  load_buffer_8x8(input + bot_off + right_off, &in[48], stride, flipud, fliplr,
+                  shift);
+
+  convert_8x8_to_16x16(in, out);
+}
+
+// Loads an 8-wide, 16-tall block as two stacked 8x8 halves: the first goes to
+// out[0..15], the second to out[16..31]. With flipud set the lower half
+// (starting 8 rows down) is loaded first; both flags are forwarded to
+// load_buffer_8x8().
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *const lower = input + 8 * stride;
+  const int16_t *const first = flipud ? lower : input;
+  const int16_t *const second = flipud ? input : lower;
+
+  load_buffer_8x8(first, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8(second, out + 16, stride, flipud, fliplr, shift);
+}
+
+// Loads an 8-wide, 4-tall block as two side-by-side 4x4 halves: the first
+// goes to out[0..], the second to out[4..]. With fliplr set the right half
+// (starting 4 samples in) is loaded first; both flags are forwarded to
+// load_buffer_4x4().
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  const int16_t *const right = input + 4;
+  const int16_t *const first = fliplr ? right : input;
+  const int16_t *const second = fliplr ? input : right;
+
+  load_buffer_4x4(first, out, stride, flipud, fliplr, shift);
+  load_buffer_4x4(second, out + 4, stride, flipud, fliplr, shift);
+}
+
+// Loads a 16-wide, 4-tall block as two side-by-side 8x4 halves: the first
+// goes to out[0..], the second to out[8..]. With fliplr set the right half
+// (starting 8 samples in) is loaded first; both flags are forwarded to
+// load_buffer_8x4().
+static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *const right = input + 8;
+  const int16_t *const first = fliplr ? right : input;
+  const int16_t *const second = fliplr ? input : right;
+
+  load_buffer_8x4(first, out, stride, flipud, fliplr, shift);
+  load_buffer_8x4(second, out + 8, stride, flipud, fliplr, shift);
+}
+
+// Loads a 4-wide, 8-tall block as two stacked 4x4 halves: the first goes to
+// out[0..], the second to out[4..]. With flipud set the lower half (starting
+// 4 rows down) is loaded first; both flags are forwarded to
+// load_buffer_4x4().
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  const int16_t *const lower = input + 4 * stride;
+  const int16_t *const first = flipud ? lower : input;
+  const int16_t *const second = flipud ? input : lower;
+
+  load_buffer_4x4(first, out, stride, flipud, fliplr, shift);
+  load_buffer_4x4(second, out + 4, stride, flipud, fliplr, shift);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Loads a 4-wide, 16-tall block as two stacked 4x8 halves: the first goes to
+// out[0..], the second to out[8..]. With flipud set the lower half (starting
+// 8 rows down) is loaded first; both flags are forwarded to
+// load_buffer_4x8().
+static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
+                                    const int stride, const int flipud,
+                                    const int fliplr, const int shift) {
+  const int16_t *const lower = input + 8 * stride;
+  const int16_t *const first = flipud ? lower : input;
+  const int16_t *const second = flipud ? input : lower;
+
+  load_buffer_4x8(first, out, stride, flipud, fliplr, shift);
+  load_buffer_4x8(second, out + 8, stride, flipud, fliplr, shift);
+}
+#endif
+
+// Loads `height` rows, 8 output vectors per row. For each row two
+// load_buffer_4x4() calls are issued with a pitch of 4 samples, one starting
+// at the row's first sample and one 16 samples in, filling out[8 * row ..]
+// and out[8 * row + 4 ..] respectively.
+// NOTE(review): presumably each call picks up 16 contiguous samples, so the
+// pair covers a 32-sample row -- confirm against load_buffer_4x4().
+static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift, const int height) {
+  for (int row = 0; row < height; ++row) {
+    const int16_t *const src = input + row * stride;
+    __m128i *const dst = out + row * 8;
+
+    load_buffer_4x4(src, dst, 4, flipud, fliplr, shift);
+    load_buffer_4x4(src + 16, dst + 4, 4, flipud, fliplr, shift);
+  }
+}
+/* Forward 16-point DCT (per the function name) over col_num columns of
+ * 32-bit lanes. Row k of column `col` is read from in[k * col_num + col];
+ * results go to out[] with the same indexing, in the permuted order of the
+ * final store block. Each rotation adds 1 << (bit - 1) and shifts right
+ * arithmetically by `bit`; the cosine constants come from cospi_arr(bit). */
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding offset added before each >> bit
+ __m128i u[16], v[16], x;
+ int col;
+
+ // One pass per column; with col_num == 4 this covers columns 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ /* stage 1: mirrored butterflies,
+  * u[k] = in[k] + in[15 - k] and u[15 - k] = in[k] - in[15 - k] */
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ /* stage 2: mirrored butterflies on u[0..7]; cospi32 rotations on the
+  * (10, 13) and (11, 12) pairs; u[8], u[9], u[14], u[15] pass through */
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);  // subtracting the negated product adds u[13] * cospi32
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ /* stage 3: butterflies on v[0..3] and on v[8..15]; cospi32 rotation on
+  * the (5, 6) pair; v[4] and v[7] pass through */
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ /* stage 4: cospi32 sum/difference on (0, 1); cospi48/cospi16 rotations on
+  * (2, 3), (9, 14) and (10, 13); butterflies on u[4..7] */
+ u[0] = _mm_mullo_epi32(u[0], cospi32);  // u[0] is overwritten with its product
+ u[1] = _mm_mullo_epi32(u[1], cospi32);  // u[1] is overwritten with its product
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ /* stage 5: v[0..3] pass through; cospi56/cospi8 rotation on (4, 7) and
+  * cospi24/cospi40 rotation on (5, 6); butterflies on v[8..15] */
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ /* stage 6: u[0..7] pass through; rotations on the (8, 15), (9, 14),
+  * (10, 13) and (11, 12) pairs */
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ out[0 * col_num + col] = v[0];  // results are stored in permuted order from here on
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+/* Forward 16-point ADST (per the function name) over num_cols columns of
+ * 32-bit lanes. Row k of column `col` is read from in[k * num_cols + col];
+ * results go to out[] with the same indexing, in the permuted order of
+ * stage 9. The stage-2 rotations round with 1 << (bit - 1) and shift right
+ * by `bit` inline; later rotations go through half_btf_sse4_1(), which is
+ * handed the same rounding vector and bit (presumably computing
+ * (w0 * n0 + w1 * n1 + rnding) >> bit; confirm against its definition). */
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding offset for every >> bit
+ const __m128i zero = _mm_setzero_si128();  // used as 0 - x to negate inputs in stage 1
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ /* stage 1: gather the rows in the order 0, 15, 7, 8, 3, 12, 4, 11, 1, 14,
+  * 6, 9, 2, 13, 5, 10, negating the ones loaded through (zero - in[...]) */
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ /* stage 2: the (2, 3), (6, 7), (10, 11) and (14, 15) pairs become the
+  * rounded, shifted sum and difference of their cospi32 products; all
+  * other entries pass through */
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3: add/subtract butterflies between entries 2 apart, per group of 4
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ /* stage 4: cospi16/cospi48 rotations on the (4, 5), (6, 7), (12, 13) and
+  * (14, 15) pairs; u[0..3] and u[8..11] pass through */
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5: add/subtract butterflies between entries 4 apart, per group of 8
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ /* stage 6: u[0..7] pass through; cospi8/cospi56 and cospi24/cospi40
+  * rotations on the adjacent pairs of u[8..15] */
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7: add/subtract butterflies between entries 8 apart
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8: one rotation per adjacent pair, each with its own cospi constants
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9: store the results in permuted order
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
+ }
+}
+
+// Applies the post-column-pass rounding shift to all 64 vectors of a 16x16
+// block.
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // Note:
+  // The buffer is rounded as 4 consecutive 8x8 sections of 16 vectors each,
+  // rather than column by column.
+  for (int section = 0; section < 4; ++section) {
+    col_txfm_8x8_rounding(&in[section * 16], shift);
+  }
+}
+
+// Applies the rounding shift to 32 vectors, handled as two consecutive
+// 16-vector sections passed to col_txfm_8x8_rounding().
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+  for (int section = 0; section < 2; ++section) {
+    col_txfm_8x8_rounding(in + section * 16, shift);
+  }
+}
+
+// Stores 64 vectors to `output` as four write_buffer_8x8() calls. Each call
+// consumes 16 vectors of `in` and advances the destination by 16 * 4
+// int32_t coefficients.
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
+  const int size_8x8 = 16 * 4;
+
+  for (int section = 0; section < 4; ++section) {
+    write_buffer_8x8(in + section * 16, output + section * size_8x8);
+  }
+}
+// Identity "transform" for 16-point columns: every one of the 16 * col_num
+// input vectors is multiplied by 2 * NewSqrt2, rounded with
+// 1 << (NewSqrt2Bits - 1) and shifted right arithmetically by NewSqrt2Bits.
+// `bit` is accepted only to match the signature of the other 1-D kernels
+// and is ignored.
+static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  const __m128i scale = _mm_set1_epi32(2 * NewSqrt2);
+  const __m128i round = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  const int count = 16 * col_num;
+  (void)bit;
+
+  for (int i = 0; i < count; ++i) {
+    const __m128i scaled = _mm_mullo_epi32(in[i], scale);
+    out[i] = _mm_srai_epi32(_mm_add_epi32(scaled, round), NewSqrt2Bits);
+  }
+}
+// 2-D forward transform of a 16x16 block of int16_t samples into int32_t
+// coefficients.
+//
+// tx_type selects the 1-D kernel for the column pass, the 1-D kernel for the
+// row pass, and whether the input is flipped vertically and/or horizontally
+// while it is loaded. Every supported type then runs the same pipeline:
+//   load (with shift[0]) -> column kernel -> round by -shift[1]
+//   -> transpose -> row kernel -> store.
+// An unsupported tx_type trips assert(0) and writes nothing. `bd` is unused.
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  typedef void (*txfm16_fn)(__m128i *, __m128i *, int, int);
+
+  __m128i in[64], out[64];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int col_num = 4;
+  txfm16_fn col_txfm = NULL;
+  txfm16_fn row_txfm = NULL;
+  int flipud = 0;
+  int fliplr = 0;
+  (void)bd;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      col_txfm = fdct16x16_sse4_1;
+      row_txfm = fdct16x16_sse4_1;
+      break;
+    case ADST_DCT:
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fdct16x16_sse4_1;
+      break;
+    case DCT_ADST:
+      col_txfm = fdct16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case ADST_ADST:
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case FLIPADST_DCT:
+      flipud = 1;
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fdct16x16_sse4_1;
+      break;
+    case DCT_FLIPADST:
+      fliplr = 1;
+      col_txfm = fdct16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case FLIPADST_FLIPADST:
+      flipud = 1;
+      fliplr = 1;
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case ADST_FLIPADST:
+      fliplr = 1;
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case FLIPADST_ADST:
+      flipud = 1;
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case IDTX:
+      col_txfm = idtx16x16_sse4_1;
+      row_txfm = idtx16x16_sse4_1;
+      break;
+    case V_DCT:
+      col_txfm = fdct16x16_sse4_1;
+      row_txfm = idtx16x16_sse4_1;
+      break;
+    case H_DCT:
+      col_txfm = idtx16x16_sse4_1;
+      row_txfm = fdct16x16_sse4_1;
+      break;
+    case V_ADST:
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = idtx16x16_sse4_1;
+      break;
+    case H_ADST:
+      col_txfm = idtx16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    case V_FLIPADST:
+      flipud = 1;
+      col_txfm = fadst16x16_sse4_1;
+      row_txfm = idtx16x16_sse4_1;
+      break;
+    case H_FLIPADST:
+      fliplr = 1;
+      col_txfm = idtx16x16_sse4_1;
+      row_txfm = fadst16x16_sse4_1;
+      break;
+    default:
+      // Unknown transform type: nothing is loaded, transformed or stored.
+      assert(0);
+      return;
+  }
+
+  load_buffer_16x16(input, in, stride, flipud, fliplr, shift[0]);
+  col_txfm(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+  col_txfm_16x16_rounding(out, -shift[1]);
+  transpose_16x16(out, in);
+  row_txfm(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+  write_buffer_16x16(out, coeff);
+}
+
+// Copies `size` vectors from `out` into `in` with the order of the
+// two-vector pairs reversed: the pair (out[2p], out[2p + 1]) lands in
+// (in[size - 2 - 2p], in[size - 1 - 2p]), so the order inside each pair is
+// kept. Note the direction: despite the parameter names, `out` is the source
+// and `in` is the destination.
+//
+// `size` is expected to be even. The even-index destination used to be the
+// literal 30 - i, which is only correct for size == 32; it is now derived
+// from `size`, which yields the same indices for the size == 32 caller.
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+  for (int i = 0; i < size; i += 2) in[size - 2 - i] = out[i];
+  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+/* Column-pass 1-D kernels (8x8 variants), one entry per TX_TYPE in the order
+ * named by the per-entry comments. av1_fwd_txfm2d_16x8_sse4_1() indexes this
+ * table with tx_type to pick its column transform. */
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct8x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {  // one slot per TX_TYPE
+ fdct8x8_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT (no kernel listed; NOTE(review): presumably never dispatched, confirm in callers)
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x8_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL, // H_FLIPADST
+};
+#endif
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {  // one slot per TX_TYPE
+ fdct4x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct4x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct4x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct4x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+/* Row-pass 1-D kernels (16-point variants), one entry per TX_TYPE in the
+ * order named by the per-entry comments. av1_fwd_txfm2d_16x8_sse4_1()
+ * indexes this table with tx_type to pick its row transform. */
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ idtx16x16_sse4_1, // V_DCT
+ fdct16x16_sse4_1, // H_DCT
+ idtx16x16_sse4_1, // V_ADST
+ fadst16x16_sse4_1, // H_ADST
+ idtx16x16_sse4_1, // V_FLIPADST
+ fadst16x16_sse4_1 // H_FLIPADST
+};
+/* Column-pass 1-D kernels (16-point variants), one entry per TX_TYPE in the
+ * order named by the per-entry comments. av1_fwd_txfm2d_8x16_sse4_1()
+ * indexes this table with tx_type to pick its column transform. */
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ fdct16x16_sse4_1, // V_DCT
+ idtx16x16_sse4_1, // H_DCT
+ fadst16x16_sse4_1, // V_ADST
+ idtx16x16_sse4_1, // H_ADST
+ fadst16x16_sse4_1, // V_FLIPADST
+ idtx16x16_sse4_1 // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {  // row kernels of av1_fwd_txfm2d_8x16_sse4_1()
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct8x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+/* Row-pass 1-D kernels, one entry per TX_TYPE in the order named by the
+ * per-entry comments. DCT slots use fdct4x8_sse4_1 while ADST and identity
+ * slots use the 8x8 kernels. No user of this table is visible in this part
+ * of the file. */
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_sse4_1, // DCT_DCT
+ fdct4x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct4x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct4x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+/* Row-pass 1-D kernels (4x4 variants), one entry per TX_TYPE in the order
+ * named by the per-entry comments. No user of this table is visible in this
+ * part of the file. */
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fdct4x4_sse4_1, // ADST_DCT
+ fadst4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fdct4x4_sse4_1, // FLIPADST_DCT
+ fadst4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ idtx4x4_sse4_1, // V_DCT
+ fdct4x4_sse4_1, // H_DCT
+ idtx4x4_sse4_1, // V_ADST
+ fadst4x4_sse4_1, // H_ADST
+ idtx4x4_sse4_1, // V_FLIPADST
+ fadst4x4_sse4_1 // H_FLIPADST
+};
+/* Column-pass 1-D kernels (4x4 variants), one entry per TX_TYPE in the order
+ * named by the per-entry comments. No user of this table is visible in this
+ * part of the file. */
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fadst4x4_sse4_1, // ADST_DCT
+ fdct4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fadst4x4_sse4_1, // FLIPADST_DCT
+ fdct4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ fdct4x4_sse4_1, // V_DCT
+ idtx4x4_sse4_1, // H_DCT
+ fadst4x4_sse4_1, // V_ADST
+ idtx4x4_sse4_1, // H_ADST
+ fadst4x4_sse4_1, // V_FLIPADST
+ idtx4x4_sse4_1 // H_FLIPADST
+};
+/* Column-pass 1-D kernels (32-point variants), one slot per TX_TYPE. Only
+ * DCT_DCT and IDTX have a kernel; every other slot is NULL.
+ * NOTE(review): callers presumably never dispatch the NULL types; confirm,
+ * since no user of this table is visible in this part of the file. */
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ av1_idtx32_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+/* Row-pass 1-D kernels (16-point variants), one slot per TX_TYPE. Only
+ * DCT_DCT and IDTX have a kernel; every other slot is NULL.
+ * NOTE(review): callers presumably never dispatch the NULL types; confirm,
+ * since no user of this table is visible in this part of the file. */
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// 2-D forward transform for the TX_16X8 size: int16_t samples in, int32_t
+// coefficients out. `bd` is unused.
+//
+// The column pass runs separately on the two 8-column halves of the input
+// (vertical flip applied while loading), and each transposed half is stored
+// into its own 16-vector section of the row buffer. A horizontal flip is
+// applied by reordering that buffer with flip_buf_sse4_1() before the row
+// pass. Both passes use the cos-bit looked up in av1_fwd_cos_bit_col. The
+// result is rescaled with av1_round_shift_rect_array_32_sse4_1() and written
+// out in two 64-coefficient sections.
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[32], out[32];
+  const int8_t *const shift = av1_fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  const int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  (void)bd;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass, one 8-column half at a time (transformed in place).
+  for (int half = 0; half < 2; ++half) {
+    load_buffer_8x8(input + half * 8, in, stride, ud_flip, 0, shift[0]);
+    col_txfm(in, in, bit, 2);
+    col_txfm_8x8_rounding(in, -shift[1]);
+    transpose_8x8(in, out + half * 16);
+  }
+
+  // Row pass; the flipped variant reads the reordered copy held in `in`.
+  if (!lr_flip) {
+    row_txfm(out, out, bit, 2);
+  } else {
+    flip_buf_sse4_1(in, out, 32);
+    row_txfm(in, out, bit, 2);
+  }
+
+  // Rectangular rescale and store, 16 vectors per section.
+  for (int half = 0; half < 2; ++half) {
+    av1_round_shift_rect_array_32_sse4_1(out + half * 16, in, 16, -shift[2],
+                                         NewSqrt2);
+    write_buffer_8x8(in, coeff + half * 64);
+  }
+}
+
+// Forward 2-D transform for a TX_8X16 residual block (SSE4.1).
+//
+// One column pass covers the whole block in place, each 8x8 half is then
+// transposed, and the row kernel is run once per half. Every row-pass result
+// is rounded with the rectangular NewSqrt2 scaling and written to `coeff`.
+// Both passes use the column cos-bit table entry. `bd` is accepted but
+// unused.
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_8X16];
+  const int w_idx = get_txw_idx(TX_8X16);
+  const int h_idx = get_txh_idx(TX_8X16);
+  const fwd_transform_1d_sse4_1 vert_fn = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = row_highbd_txfm8x8_arr[tx_type];
+  const int cos_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  __m128i work[32], xposed[32];
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass (in place), then transpose the two 8x8 halves separately.
+  load_buffer_8x16(input, work, stride, flip_ud, flip_lr, shifts[0]);
+  vert_fn(work, work, cos_bit, 2);
+  col_txfm_8x16_rounding(work, -shifts[1]);
+  for (int half = 0; half < 2; ++half) {
+    transpose_8x8(work + half * 16, xposed + half * 16);
+  }
+
+  // Row pass per half. The result always lands at the start of `xposed`;
+  // the second half is still intact when it is read on the second iteration.
+  for (int half = 0; half < 2; ++half) {
+    horz_fn(xposed + half * 16, xposed, cos_bit, 2);
+    av1_round_shift_rect_array_32_sse4_1(xposed, xposed, 16, -shifts[2],
+                                         NewSqrt2);
+    write_buffer_16x8(xposed, coeff + half * 8, 16);
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2-D transform for a TX_4X16 residual block (SSE4.1).
+//
+// The column pass writes into `coeff` (used as scratch), is rounded there,
+// and is transposed back into a local buffer. The row pass then runs four
+// times, each result going through store_output_w4. `bd` is accepted but
+// unused.
+void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_4X16];
+  const int w_idx = get_txw_idx(TX_4X16);
+  const int h_idx = get_txh_idx(TX_4X16);
+  const int width = tx_size_wide[TX_4X16];
+  const int height = tx_size_high[TX_4X16];
+  const int col_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const fwd_transform_1d_sse4_1 vert_fn = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = row_highbd_txfm4x4_arr[tx_type];
+  __m128i *const coeff_vec = (__m128i *)coeff;
+  __m128i work[16];
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass: local buffer -> coeff scratch -> transposed local buffer.
+  load_buffer_4x16(input, work, stride, flip_ud, flip_lr, shifts[0]);
+  vert_fn(work, coeff_vec, col_bit, 1);
+  col_txfm_8x8_rounding(coeff_vec, -shifts[1]);
+  transpose_8nx8n(coeff_vec, work, width, height);
+
+  // Row pass: four kernel invocations, each stored separately.
+  for (int blk = 0; blk < 4; ++blk) {
+    __m128i row_out[4];
+    horz_fn(work + blk, row_out, row_bit, height >> 2);
+    store_output_w4(coeff + blk * 4, row_out, height, width);
+  }
+}
+#endif
+
+// Forward 2-D transform for a TX_16X4 residual block (SSE4.1).
+//
+// The column kernel and a 4x4 transpose are applied in place to each group
+// of `tx_size_high` vectors, the whole buffer is rounded, and one row-kernel
+// call writes the result straight into `coeff`. `bd` is accepted but unused.
+void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_16X4];
+  const int w_idx = get_txw_idx(TX_16X4);
+  const int h_idx = get_txh_idx(TX_16X4);
+  const int width = tx_size_wide[TX_16X4];
+  const int height = tx_size_high[TX_16X4];
+  const int col_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const fwd_transform_1d_sse4_1 vert_fn = col_highbd_txfm4x4_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = row_highbd_txfm8x16_arr[tx_type];
+  __m128i *const coeff_vec = (__m128i *)coeff;
+  __m128i work[16];
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, one group of `height` vectors at a time.
+  load_buffer_16x4(input, work, stride, flip_ud, flip_lr, shifts[0]);
+  const int num_groups = width >> 2;
+  for (int grp = 0; grp < num_groups; ++grp) {
+    __m128i *const tile = work + grp * height;
+    vert_fn(tile, tile, col_bit, 1);
+    transpose_32bit_4x4(tile, tile);
+  }
+  col_txfm_8x8_rounding(work, -shifts[1]);
+
+  // Row pass writes directly to the output coefficients.
+  horz_fn(work, coeff_vec, row_bit, 1);
+}
+
+// Forward 2-D transform for a TX_16X32 residual block (SSE4.1).
+//
+// Kernels come from col_highbd_txfm8x32_arr / row_highbd_txfm8x32_arr, which
+// are populated only for DCT_DCT and IDTX; any other tx_type selects a NULL
+// pointer that is called unchecked. No flipping is applied on load. `bd` is
+// accepted but unused.
+void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_16X32];
+  const int w_idx = get_txw_idx(TX_16X32);
+  const int h_idx = get_txh_idx(TX_16X32);
+  const fwd_transform_1d_sse4_1 vert_fn = col_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = row_highbd_txfm8x32_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  __m128i *const coeff_vec = (__m128i *)coeff;
+  __m128i work[128];
+
+  // Load the upper and lower 16x16 halves (64 vectors each).
+  for (int half = 0; half < 2; ++half) {
+    load_buffer_16x16(input + half * 16 * stride, work + half * 64, stride, 0,
+                      0, shifts[0]);
+  }
+
+  // Column pass: four in-place kernel calls, each starting one vector
+  // further into the buffer.
+  for (int c = 0; c < 4; ++c) {
+    __m128i *const column = work + c;
+    vert_fn(column, column, col_bit, 4);
+  }
+  for (int half = 0; half < 2; ++half) {
+    col_txfm_16x16_rounding(work + half * 64, -shifts[1]);
+  }
+  transpose_8nx8n(work, coeff_vec, 16, 32);
+
+  // Row pass back into the local buffer, then rectangular rounding into
+  // the output.
+  horz_fn(coeff_vec, work, row_bit, 8);
+  av1_round_shift_rect_array_32_sse4_1(work, coeff_vec, 128, -shifts[2],
+                                       NewSqrt2);
+}
+
+// Forward 2-D transform for a TX_32X64 residual block (SSE4.1).
+//
+// tx_type and bd are ignored: the column pass always uses av1_fdct64_sse4_1
+// and the row pass always uses av1_fdct32_sse4_1. No flipping is applied on
+// load. `coeff` is used as scratch for the transposed intermediate before
+// the final coefficients are written to it.
+void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
+ const int txw_idx = get_txw_idx(TX_32X64);
+ const int txh_idx = get_txh_idx(TX_32X64);
+ const int txfm_size_col = tx_size_wide[TX_32X64];
+ const int txfm_size_row = tx_size_high[TX_32X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ // Counts in units of 4 samples (one __m128i holds four 32-bit values).
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
+ // One in-place 64-point DCT per group of four columns.
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
+ }
+ for (int i = 0; i < num_col; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
+ }
+ // Rectangular (NewSqrt2) rounding. Only the first 8 of every 16 vectors in
+ // `in` are rounded and they are packed contiguously into the output, so
+ // half of the row-pass result is dropped here.
+ // NOTE(review): presumably this keeps only the retained low-frequency part
+ // of the 64-point dimension - confirm against the C reference.
+ for (int i = 0; i < txfm_size_col; i++) {
+ av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8,
+ -shift[2], NewSqrt2);
+ }
+ (void)bd;
+}
+
+// Forward 2-D transform for a TX_64X32 residual block (SSE4.1).
+//
+// tx_type and bd are ignored: the column pass always uses av1_fdct32_sse4_1
+// and the row pass always uses av1_fdct64_sse4_1. No flipping is applied on
+// load. `coeff` is used as scratch for the transposed intermediate before
+// the final coefficients are written to it.
+void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
+ const int txw_idx = get_txw_idx(TX_64X32);
+ const int txh_idx = get_txh_idx(TX_64X32);
+ const int txfm_size_col = tx_size_wide[TX_64X32];
+ const int txfm_size_row = tx_size_high[TX_64X32];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ // Counts in units of 4 samples (one __m128i holds four 32-bit values).
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ // Each of the 32 input rows is loaded as four calls covering 16 samples
+ // each (sample offsets 0, 16, 32, 48), filling 16 vectors per row. The
+ // helper is called with a stride argument of 4.
+ for (int i = 0; i < 32; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
+ shift[0]);
+ }
+
+ // One in-place 32-point DCT per group of four columns.
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
+ }
+
+ for (int i = 0; i < num_row; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+ }
+ // Rectangular (NewSqrt2) rounding of all 512 vectors into the output.
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+// Forward 2-D transform for a TX_32X16 residual block (SSE4.1).
+//
+// The table roles are swapped relative to the 16x32 variant: the column
+// kernel is taken from row_highbd_txfm8x32_arr and the row kernel from
+// col_highbd_txfm8x32_arr. Both tables are populated only for DCT_DCT and
+// IDTX; any other tx_type selects a NULL pointer that is called unchecked.
+// No flipping is applied on load. `bd` is accepted but unused.
+void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_32X16];
+  const int w_idx = get_txw_idx(TX_32X16);
+  const int h_idx = get_txh_idx(TX_32X16);
+  const fwd_transform_1d_sse4_1 vert_fn = row_highbd_txfm8x32_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = col_highbd_txfm8x32_arr[tx_type];
+  const int col_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  __m128i *const coeff_vec = (__m128i *)coeff;
+  __m128i work[128];
+
+  // Column pass in place, rounded in two 64-vector halves, then transposed
+  // into the output buffer (used as scratch).
+  load_buffer_32x8n(input, work, stride, 0, 0, shifts[0], 16);
+  vert_fn(work, work, col_bit, 8);
+  for (int half = 0; half < 2; ++half) {
+    col_txfm_16x16_rounding(work + half * 64, -shifts[1]);
+  }
+  transpose_8nx8n(work, coeff_vec, 32, 16);
+
+  // Row pass: four kernel calls, each starting one vector further in.
+  for (int r = 0; r < 4; ++r) {
+    horz_fn(coeff_vec + r, work + r, row_bit, 4);
+  }
+  av1_round_shift_rect_array_32_sse4_1(work, coeff_vec, 128, -shifts[2],
+                                       NewSqrt2);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2-D transform for a TX_8X32 residual block (SSE4.1).
+//
+// The column kernel comes from col_highbd_txfm8x32_arr, which is populated
+// only for DCT_DCT and IDTX; other tx_type values select a NULL pointer that
+// is called unchecked. No flipping is applied on load, no shift[2] rounding
+// stage is applied, and `bd` is unused.
+void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_8X32];
+ const int txfm_size_row = tx_size_high[TX_8X32];
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ // The block is loaded as two 8x16 halves; the second half starts
+ // txfm_size_row / 2 input rows down and txfm_size_row vectors into `in`.
+ load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
+ load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
+ stride, 0, 0, shift[0]);
+
+ for (int i = 0; i < num_col; i++) {
+ col_txfm((in + i), (in + i), bitcol, num_col);
+ }
+ col_txfm_16x16_rounding(in, -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ // Runs in place on the output buffer, advancing two vectors per call.
+ for (int i = 0; i < txfm_size_col; i += 2) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
+ }
+ (void)bd;
+}
+
+// Forward 2-D transform for a TX_32X8 residual block (SSE4.1).
+//
+// Table roles are swapped relative to the 8x32 variant: the column kernel is
+// taken from row_highbd_txfm32x8_arr and the row kernel from
+// col_highbd_txfm8x32_arr. The latter is populated only for DCT_DCT and
+// IDTX; other tx_type values select a NULL pointer that is called unchecked.
+// No flipping is applied on load, no shift[2] rounding stage is applied, and
+// `bd` is unused.
+void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_32X8];
+ const int txfm_size_row = tx_size_high[TX_32X8];
+ // Note: despite its name this is derived from the block height
+ // (txfm_size_row >> 2), not the width.
+ const int num_col = txfm_size_row >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
+ // In-place column kernel, advancing two vectors per call.
+ for (int i = 0; i < txfm_size_row; i += 2) {
+ col_txfm((in + i), (in + i), bitcol, txfm_size_row);
+ }
+
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ // Runs in place on the output buffer.
+ for (int i = 0; i < num_col; i++) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
+ }
+ (void)bd;
+}
+#endif
+
+// Forward 2-D transform for a TX_4X8 residual block (SSE4.1).
+//
+// A single column pass runs in place over all eight vectors. Each group of
+// four vectors is then transposed, row-transformed, rounded with the
+// rectangular NewSqrt2 scaling and stored. `bd` is accepted but unused.
+void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int8_t *const shifts = av1_fwd_txfm_shift_ls[TX_4X8];
+  const int w_idx = get_txw_idx(TX_4X8);
+  const int h_idx = get_txh_idx(TX_4X8);
+  const int width = tx_size_wide[TX_4X8];
+  const int height = tx_size_high[TX_4X8];
+  const int col_bit = av1_fwd_cos_bit_col[w_idx][h_idx];
+  const int row_bit = av1_fwd_cos_bit_row[w_idx][h_idx];
+  const fwd_transform_1d_sse4_1 vert_fn = col_highbd_txfm4x8_arr[tx_type];
+  const fwd_transform_1d_sse4_1 horz_fn = row_highbd_txfm4x4_arr[tx_type];
+  __m128i work[8];
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  load_buffer_4x8(input, work, stride, flip_ud, flip_lr, shifts[0]);
+  vert_fn(work, work, col_bit, 1);
+  col_txfm_4x8_rounding(work, -shifts[1]);
+
+  // Row pass on each group of four vectors, entirely in place.
+  for (int half = 0; half < 2; ++half) {
+    __m128i *const tile = work + half * 4;
+    transpose_32bit_4x4(tile, tile);
+    horz_fn(tile, tile, row_bit, 1);
+    av1_round_shift_rect_array_32_sse4_1(tile, tile, width, -shifts[2],
+                                         NewSqrt2);
+    store_output_w4(coeff + half * 4, tile, height, 4);
+  }
+}
+
+// Forward 2-D transform for a TX_8X4 residual block (SSE4.1).
+//
+// The column kernel and a 4x4 transpose run in place on each group of
+// txfm_size_row vectors; one row-kernel call then writes into `coeff`, which
+// is finally rounded in place with the rectangular NewSqrt2 scaling. `bd` is
+// unused.
+void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int txfm_size_col = tx_size_wide[TX_8X4];
+ const int txfm_size_row = tx_size_high[TX_8X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col,
+ -shift[2], NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Forward 2-D transform for a TX_16X64 residual block (SSE4.1).
+//
+// tx_type is used only to derive the flip flags passed to the loader; the
+// kernels are fixed (av1_fdct64_sse4_1 for columns, fdct16x16_sse4_1 for
+// rows). Only 32 of the 64 rows are passed to the transpose, so the row pass
+// runs on that part alone. No shift[2] rounding stage is applied and `bd` is
+// unused.
+void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+ const int txw_idx = get_txw_idx(TX_16X64);
+ const int txh_idx = get_txh_idx(TX_16X64);
+ const int txfm_size_col = tx_size_wide[TX_16X64];
+ const int txfm_size_row = tx_size_high[TX_16X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int num_col = txfm_size_col >> 2;
+ // col transform
+ // Four input rows are loaded per iteration, one load_buffer_4x4 call per
+ // row; each call is given num_col as its stride argument and fills
+ // num_col vectors of `in`.
+ for (int i = 0; i < txfm_size_row; i += num_col) {
+ load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ // Column pass writes into `coeff`, which serves as scratch here.
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+ }
+
+ // Round all 256 vectors, 64 at a time.
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ // Transpose with a row count of 32 rather than txfm_size_row, then run
+ // the 16-point row DCT into the output.
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+ fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
+ (void)bd;
+}
+
+// Forward 2-D transform for a TX_64X16 residual block (SSE4.1).
+//
+// tx_type is used only to derive the flip flags passed to the loader; the
+// kernels are fixed (fdct16x16_sse4_1 for columns, av1_fdct64_sse4_1 for
+// rows). After the row pass, a range of the output starting at
+// coeff + txfm_size_row * 32 is zeroed. No shift[2] rounding stage is
+// applied and `bd` is unused.
+void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+ const int txw_idx = get_txw_idx(TX_64X16);
+ const int txh_idx = get_txh_idx(TX_64X16);
+ const int txfm_size_col = tx_size_wide[TX_64X16];
+ const int txfm_size_row = tx_size_high[TX_64X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ // Each input row is loaded as four calls covering 16 samples each (sample
+ // offsets 0, 16, 32, 48); a row occupies txfm_size_row vectors of `in`.
+ for (int i = 0; i < txfm_size_row; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ // Column pass writes into `coeff`, which serves as scratch here.
+ fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
+ // Round all 256 vectors, 64 at a time.
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+ for (int i = 0; i < 4; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
+ }
+ // Zero txfm_size_row * 32 coefficients starting at offset
+ // txfm_size_row * 32.
+ // NOTE(review): presumably this clears the discarded high-frequency half
+ // of the 64-point dimension - confirm against the C reference.
+ memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
+ (void)bd;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..ca448ca37b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
+// Lane masks used by xx_mask_and_hadd(). Row i keeps the five consecutive
+// 32-bit lanes i..i+4 of a 256-bit vector and clears the other three.
+// The 32-byte alignment matters: the mask is read through a plain
+// __m256i pointer dereference.
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+// Writes the per-pixel squared difference between two 16-sample-wide
+// high-bitdepth blocks into `frame_sse` as 32-bit values.
+//
+// Output for each row starts two elements into the row (frame_sse + 2) and
+// rows are `sse_stride` elements apart. `block_width` is ignored; exactly 16
+// samples are processed per row for `block_height` rows.
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint32_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint16_t *row_a = frame1;
+  const uint16_t *row_b = frame2;
+  uint32_t *out = frame_sse + 2;
+
+  for (int r = 0; r < block_height; ++r) {
+    const __m256i a = _mm256_loadu_si256((__m256i *)row_a);
+    const __m256i b = _mm256_loadu_si256((__m256i *)row_b);
+    const __m256i delta = _mm256_sub_epi16(a, b);
+
+    // 16x16 -> 32-bit squares: low and high halves of each product are
+    // computed separately and interleaved back together.
+    const __m256i sq_lo16 = _mm256_mullo_epi16(delta, delta);
+    const __m256i sq_hi16 = _mm256_mulhi_epi16(delta, delta);
+    const __m256i inter_lo = _mm256_unpacklo_epi16(sq_lo16, sq_hi16);
+    const __m256i inter_hi = _mm256_unpackhi_epi16(sq_lo16, sq_hi16);
+
+    // The unpacks work per 128-bit lane; swap the middle lanes so the
+    // results are in pixel order 0..7 and 8..15.
+    const __m256i first8 = _mm256_inserti128_si256(
+        inter_lo, _mm256_extracti128_si256(inter_hi, 0), 1);
+    const __m256i last8 = _mm256_inserti128_si256(
+        inter_hi, _mm256_extracti128_si256(inter_lo, 1), 0);
+
+    uint32_t *const out_hi = out + 8;
+    _mm256_storeu_si256((__m256i *)out, first8);
+    _mm256_storeu_si256((__m256i *)out_hi, last8);
+
+    row_a += stride;
+    row_b += stride2;
+    out = out_hi + (sse_stride - 8);
+  }
+}
+
+// Writes the per-pixel squared difference between two 32-sample-wide
+// high-bitdepth blocks into `frame_sse` as 32-bit values.
+//
+// Output for each row starts two elements into the row (frame_sse + 2) and
+// rows are `sse_stride` elements apart. `block_width` is ignored; exactly 32
+// samples are processed per row (as two groups of 16) for `block_height`
+// rows.
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
+ for (int i = 0; i < block_height; i++) {
+ // Samples 0..15 of the row.
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ // Low and high 16 bits of each 32-bit square, interleaved below.
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ // The unpacks operate per 128-bit lane; swap the middle lanes so that
+ // diff_lo holds pixels 0..7 and diff_hi holds pixels 8..15.
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+ // Samples 16..31 of the row, same sequence.
+ v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+ v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+ v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+// Loads eight 32-bit values from `src` and overwrites lanes 0 and 1 with the
+// value of lane 2, i.e. pads the left edge by repeating the first real
+// element twice. Lanes 2..7 are returned as loaded.
+static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src,
+                                                  __m256i *v256tmp) {
+  const __m256i loaded = _mm256_loadu_si256((__m256i *)src);
+  // 0xEA selects elements {2, 2, 2, 3} within each 128-bit lane.
+  const __m256i replicated = _mm256_shuffle_epi32(loaded, 0xEA);
+  // Only the low 128 bits take the shuffled values.
+  *v256tmp = _mm256_inserti128_si256(
+      loaded, _mm256_extracti128_si256(replicated, 0), 0);
+}
+
+// Loads eight 32-bit values from `src` and overwrites lanes 6 and 7 with the
+// value of lane 5, i.e. pads the right edge by repeating the last real
+// element twice. Lanes 0..5 are returned as loaded.
+static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src,
+                                                   __m256i *v256tmp) {
+  const __m256i loaded = _mm256_loadu_si256((__m256i *)src);
+  // 0x54 selects elements {0, 1, 1, 1} within each 128-bit lane.
+  const __m256i replicated = _mm256_shuffle_epi32(loaded, 0x54);
+  // Only the high 128 bits take the shuffled values.
+  *v256tmp = _mm256_inserti128_si256(
+      loaded, _mm256_extracti128_si256(replicated, 1), 1);
+}
+
+// Returns the sum of the five 32-bit lanes i..i+4 of `vsum`; the remaining
+// lanes are zeroed by sse_bytemask[i] first. `i` must be in 0..3.
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  const __m256i kept = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+
+  // Fold 8 lanes -> 4: add the upper 128-bit half onto the lower one.
+  __m128i acc = _mm_add_epi32(_mm256_castsi256_si128(kept),
+                              _mm256_extracti128_si256(kept, 1));
+  // Fold 4 -> 2: add lanes 2 and 3 onto lanes 0 and 1.
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
+  // Fold 2 -> 1: add lane 1 onto lane 0, which now holds the total.
+  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
+  return _mm_extract_epi32(acc, 0);
+}
+
+// Applies the temporal filter to one plane of a block.
+//
+// Steps:
+//  1. Fill `frame_sse` with per-pixel squared differences between frame1
+//     and frame2 (data starts 2 elements into each SSE_STRIDE-wide row).
+//  2. Build acc_5x5_sse[row][col], the sum of squared differences over a
+//     5x5 window centred on each pixel. Edges are padded by repeating the
+//     nearest row/column. Columns are processed four at a time: the first
+//     four with left padding, the last four with right padding, the middle
+//     ones unpadded.
+//  3. For every pixel, combine the window error (plus luma_sse_sum) with the
+//     sub-block MSE, turn it into a weight via exp() or approx_exp()
+//     depending on tf_wgt_calc_lvl, and accumulate weight and
+//     weight * pixel into `count` / `accumulator`.
+//
+// block_width and block_height must each be 16 or 32 (asserted).
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ if (block_width == 32) {
+ get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ } else {
+ get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ }
+
+ // Sliding window of five consecutive rows; vsrc[2] is the row currently
+ // being produced.
+ __m256i vsrc[5];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ int col;
+ uint32_t *src = frame_sse;
+ // Left-most four columns: load rows 0..2 with left padding.
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_left(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ // Vertical sum of the five rows in the window.
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ // Slide the window down one row and load the next row.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_left(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ // Horizontal 5-lane sums give the four output columns.
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ // Last three rows: no further rows are loaded, so vsrc[4] keeps the final
+ // row and is thereby repeated as bottom padding.
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ // Interior column groups: same procedure, no horizontal padding needed.
+ for (col = 4; col < block_width - 4; col += 4) {
+ src = frame_sse + col;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = _mm256_loadu_si256((__m256i *)src);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ vsrc[4] = _mm256_loadu_si256((__m256i *)src);
+
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ // Bottom three rows of this column group (bottom row repeated).
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ }
+
+ // Right-most four columns: `col` equals block_width - 4 after the loop
+ // above; rows are loaded with right padding.
+ src = frame_sse + col;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_right(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_right(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ // Bottom three rows of the right-most column group.
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+
+ // Pre-scale the per-sub-block terms once; the block is split into four
+ // quadrants indexed in raster order.
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ // Weight computed with exp().
+ for (int i = 0, k = 0; i < block_height; i++) {
+ // 0 for the top half of the block, 2 for the bottom half.
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ // Cap the exponent argument at 7.
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ // Same computation, but the weight uses approx_exp() in single precision
+ // and is rounded with iroundpf() instead of truncated.
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+// AVX2 high-bitdepth temporal filter entry point.
+//
+// For each plane of the block at (mb_row, mb_col), compares the frame being
+// filtered against the prediction `pred` and accumulates per-pixel weights
+// into `count` and weighted pixel values into `accum`. Planes are stored
+// back to back in `pred`, `accum` and `count`, each taking
+// plane_h * plane_w entries.
+//
+// Only BLOCK_32X32 with TF_WINDOW_LENGTH == 5 is supported (asserted).
+// The overall filtering strength is derived from q_factor, filter_strength,
+// the per-plane noise level and the sub-block motion vector lengths.
+void av1_highbd_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  // The messages name this implementation (avx2) so that a failing assert
+  // points at the right file.
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  // Both buffers start zeroed. frame_sse is rewritten for every plane;
+  // luma_sse_sum is filled once (while handling the U plane) and reused.
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    // Chroma planes additionally count the co-located luma pixels.
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      // At this point frame_sse still holds the luma squared errors written
+      // while processing plane 0 (offset by 2 within each row).
+      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++, k++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor, tf_wgt_calc_lvl);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
new file mode 100644
index 0000000000..2032847083
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)
+
+// Lane-select masks indexed as [i][half][lane]. For a given i, entry [i][0]
+// keeps 32-bit lanes i..3 of a first vector and entry [i][1] keeps lanes 0..i
+// of a second vector, so the two masked vectors together retain exactly five
+// consecutive lanes. xx_mask_and_hadd() uses them to isolate a 5-wide window.
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+// Writes the per-sample squared difference of two 16-bit blocks into
+// frame_sse as 32-bit values.
+//
+// frame1/stride and frame2/stride2 describe the two inputs; block_width is
+// walked in steps of 8 samples and block_height in single rows. Output row r
+// begins at frame_sse + r * dst_stride, and sample x of that row is stored at
+// column x + 2 (the two slots to its left are not written here).
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+                              const uint16_t *frame2,
+                              const unsigned int stride2, const int block_width,
+                              const int block_height, uint32_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const uint16_t *row_a = frame1;
+  const uint16_t *row_b = frame2;
+  uint32_t *row_out = frame_sse;
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; x += 8) {
+      const __m128i a = _mm_loadu_si128((__m128i *)(row_a + x));
+      const __m128i b = _mm_loadu_si128((__m128i *)(row_b + x));
+      const __m128i delta = _mm_sub_epi16(a, b);
+
+      // Low and high 16 bits of each signed 16x16 product.
+      const __m128i sq_lo16 = _mm_mullo_epi16(delta, delta);
+      const __m128i sq_hi16 = _mm_mulhi_epi16(delta, delta);
+
+      // Interleave the halves to rebuild the 32-bit squares: samples 0..3
+      // first, then samples 4..7.
+      const __m128i sq_first4 = _mm_unpacklo_epi16(sq_lo16, sq_hi16);
+      const __m128i sq_last4 = _mm_unpackhi_epi16(sq_lo16, sq_hi16);
+
+      _mm_storeu_si128((__m128i *)(row_out + x + 2), sq_first4);
+      _mm_storeu_si128((__m128i *)(row_out + x + 6), sq_last4);
+    }
+
+    row_a += stride;
+    row_b += stride2;
+    row_out += dst_stride;
+  }
+}
+
+// Loads eight consecutive 32-bit values from src into dstvec[0] (first four)
+// and dstvec[1] (last four), applying edge replication where needed:
+//  - when col is 0, lanes 0 and 1 of dstvec[0] are overwritten with lane 2,
+//    i.e. the first real sample is repeated twice to the left;
+//  - when col >= block_width - 4, lanes 2 and 3 of dstvec[1] are overwritten
+//    with lane 1, i.e. the last real sample is repeated twice to the right.
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  const __m128i first4 = _mm_loadu_si128((__m128i *)src);
+  const __m128i last4 = _mm_loadu_si128((__m128i *)(src + 4));
+
+  if (col) {
+    dstvec[0] = first4;
+  } else {
+    dstvec[0] = _mm_shuffle_epi32(first4, 0xEA);  // lanes -> [2, 2, 2, 3]
+  }
+
+  if (col < block_width - 4) {
+    dstvec[1] = last4;
+  } else {
+    dstvec[1] = _mm_shuffle_epi32(last4, 0x54);  // lanes -> [0, 1, 1, 1]
+  }
+}
+
+// Returns the sum of the five 32-bit lanes selected by sse_bytemask_2x4[i]
+// from the lane pair (vsum1, vsum2): lanes i..3 of vsum1 plus lanes 0..i of
+// vsum2.
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  const __m128i keep_first = *(__m128i *)sse_bytemask_2x4[i][0];
+  const __m128i keep_second = *(__m128i *)sse_bytemask_2x4[i][1];
+
+  // Zero the lanes outside the window, then add the two vectors lane-wise:
+  // [A0+B0, A1+B1, A2+B2, A3+B3].
+  const __m128i lanes = _mm_add_epi32(_mm_and_si128(vsum1, keep_first),
+                                      _mm_and_si128(vsum2, keep_second));
+  // Fold the upper 64 bits onto the lower 64 bits.
+  const __m128i halves = _mm_add_epi32(lanes, _mm_srli_si128(lanes, 8));
+  // Fold lane 1 onto lane 0; lane 0 now holds the full sum.
+  const __m128i total = _mm_add_epi32(halves, _mm_srli_si128(halves, 4));
+
+  return _mm_cvtsi128_si32(total);
+}
+
+// Filters one plane of a block and accumulates the weighted prediction.
+//
+// Steps performed here:
+//  1. get_squared_error() writes the per-pixel squared difference between
+//     frame1 and frame2 into frame_sse (row stride SSE_STRIDE).
+//  2. For every pixel, a 5x5 window of those squared errors is summed into
+//     acc_5x5_sse, replicating edge samples at the block borders.
+//  3. For every pixel, the window sum plus luma_sse_sum is scaled down by bd,
+//     combined with the sub-block MSE, turned into a weight, and added to
+//     count[] / accumulator[] together with the frame2 pixel value.
+//
+// tf_wgt_calc_lvl == 0 computes the weight with exp(); any other value uses
+// approx_exp() followed by iroundpf().
+// subblock_mses and d_factor are read at indices 0..3 (one per quadrant).
+// count and accumulator are indexed row-major with a row length of
+// block_width.
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+ frame_sse, SSE_STRIDE);
+
+ // vsrc[r][0..1] holds 8 consecutive squared errors of window row r; the five
+ // rows form a sliding vertical window that is shifted up by one per output
+ // row.
+ __m128i vsrc[5][2];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint32_t *src = frame_sse + col;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad(src, vsrc[i], col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Padding for top 2 rows
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ // Rows for which a new bottom row is still available to load.
+ for (int row = 0; row < block_height - 3; row++) {
+ // Vertical sum of the five window rows, kept as two 4-lane vectors.
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ // Slide the window up by one row.
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ // Load next row
+ xx_load_and_pad(src, vsrc[4], col, block_width);
+ src += SSE_STRIDE;
+
+ // Horizontal 5-lane sums for the four output columns of this strip.
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
+ // Last three rows: no further row is loaded, so vsrc[4] keeps the final
+ // loaded row and is reused as the window slides.
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
+ }
+
+ // Pre-multiply the per-quadrant terms so the pixel loops only multiply/add.
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ // Weight computed with exp().
+ for (int i = 0, k = 0; i < block_height; i++) {
+ // 0 for the top half of the block, 2 for the bottom half.
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ // Quadrant index 0..3 in raster order (left/right within top/bottom).
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ // Cap the exponent argument at 7 before exponentiating.
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ // Same computation, but the weight uses approx_exp() and is rounded with
+ // iroundpf() instead of being truncated.
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+// SSE2 entry point for high bit-depth temporal filtering of one block.
+//
+// Derives the decay terms from q_factor, filter_strength, the per-plane noise
+// level and the four sub-block motion vectors, then calls
+// highbd_apply_temporal_filter() once per plane. pred, accum and count are
+// treated as plane-after-plane buffers: the offset advances by
+// plane_h * plane_w after each plane.
+//
+// Asserted preconditions: block_size is BLOCK_32X32, TF_WINDOW_LENGTH is 5
+// and 1 <= num_planes <= MAX_MB_PLANE.
+void av1_highbd_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+  uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint16_t *ref =
+        CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    // At this point frame_sse still holds the squared errors written while
+    // filtering the Y plane; the "+ 2" skips the left padding of each row.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    highbd_apply_temporal_filter(
+        ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+        subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+        luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+        weight_factor, d_factor, tf_wgt_calc_lvl);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_avx2.c b/third_party/aom/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 0000000000..6432708416
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+// Multiplies one 8-float input vector by the matching 8 weights of two
+// consecutive output rows (rows 2*i and 2*i+1 relative to weight_idx) and
+// stores the pairwise horizontal add of both products in hadd[i].
+// The expansion site must provide: weights, weight_idx, i, tot_num_inputs,
+// inputs256 and hadd. It declares block-scope locals, so it has to be expanded
+// inside its own braces.
+#define CALC_OUTPUT_FOR_2ROWS \
+ const int index = weight_idx + (2 * i * tot_num_inputs); \
+ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \
+ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \
+ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \
+ hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+// Computes one output node per iteration: the dot product of the first
+// num_inputs_to_process inputs (taken 8 at a time) with that node's weight
+// row, plus the node's bias. Weight rows are tot_num_inputs floats apart.
+// When is_clip_required is non-zero the result is clamped below at 0.
+static INLINE void nn_propagate_8to1(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  for (int node = 0; node < num_outputs; node++) {
+    const float *const weight_row = &weights[node * tot_num_inputs];
+    float node_value = bias[node];
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int k = 0; k < num_inputs_to_process; k += 8) {
+      const __m256 x = _mm256_loadu_ps(&inputs[k]);
+      const __m256 w = _mm256_loadu_ps(&weight_row[k]);
+      const __m256 prod = _mm256_mul_ps(x, w);
+      acc = _mm256_add_ps(acc, prod);
+    }
+
+    // Reduce the 8 partial sums to a single float in lane 0.
+    const __m128 acc_lo = _mm256_castps256_ps128(acc);
+    const __m128 acc_hi = _mm256_extractf128_ps(acc, 1);
+    const __m128 sum4 = _mm_add_ps(acc_lo, acc_hi);
+    const __m128 sum2 = _mm_hadd_ps(sum4, sum4);
+    const __m128 sum1 = _mm_add_ps(_mm_shuffle_ps(sum2, sum2, 0x99), sum2);
+
+    node_value += _mm_cvtss_f32(sum1);
+    if (is_clip_required) node_value = AOMMAX(node_value, 0);
+    output_nodes[node] = node_value;
+  }
+}
+
+// Computes four output nodes per outer iteration. For every group of 8
+// inputs, the products with four consecutive weight rows are reduced with
+// horizontal adds to one partial sum per node and accumulated; the bias is
+// added at the end. Weight rows are tot_num_inputs floats apart.
+// When is_clip_required is non-zero the results are clamped below at 0.
+static INLINE void nn_propagate_8to4(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  __m256 pair_sums[2];
+  for (int node = 0; node < num_outputs; node += 4) {
+    const __m128 node_bias = _mm_loadu_ps(&bias[node]);
+    __m128 acc = _mm_setzero_ps();
+
+    for (int k = 0; k < num_inputs_to_process; k += 8) {
+      const __m256 x = _mm256_loadu_ps(&inputs[k]);
+      const float *const w_base = &weights[k + (node * tot_num_inputs)];
+
+      // Process two output rows at a time.
+      for (int pair = 0; pair < 2; pair++) {
+        const float *const w_even = w_base + (2 * pair * tot_num_inputs);
+        const __m256 w0 = _mm256_loadu_ps(w_even);
+        const __m256 w1 = _mm256_loadu_ps(w_even + tot_num_inputs);
+        const __m256 prod0 = _mm256_mul_ps(x, w0);
+        const __m256 prod1 = _mm256_mul_ps(x, w1);
+        pair_sums[pair] = _mm256_hadd_ps(prod0, prod1);
+      }
+
+      // One partial sum per node in each 128-bit half; fold the halves.
+      const __m256 per_half = _mm256_hadd_ps(pair_sums[0], pair_sums[1]);
+      const __m128 half_lo = _mm256_castps256_ps128(per_half);
+      const __m128 half_hi = _mm256_extractf128_ps(per_half, 1);
+      const __m128 folded = _mm_add_ps(half_lo, half_hi);
+
+      acc = _mm_add_ps(acc, folded);
+    }
+
+    acc = _mm_add_ps(acc, node_bias);
+    if (is_clip_required) acc = _mm_max_ps(acc, _mm_setzero_ps());
+    _mm_storeu_ps(&output_nodes[node], acc);
+  }
+}
+
+// Computes eight output nodes per outer iteration. For every group of 8
+// inputs, the products with eight consecutive weight rows are reduced with
+// horizontal adds and a 128-bit lane permute to one partial sum per node and
+// accumulated; the bias is added at the end. Weight rows are tot_num_inputs
+// floats apart. When is_clip_required is non-zero the results are clamped
+// below at 0.
+static INLINE void nn_propagate_8to8(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    int num_outputs, float *const output_nodes, int is_clip_required) {
+  __m256 pair_sums[4];
+  for (int node = 0; node < num_outputs; node += 8) {
+    const __m256 node_bias = _mm256_loadu_ps(&bias[node]);
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int k = 0; k < num_inputs_to_process; k += 8) {
+      const __m256 x = _mm256_loadu_ps(&inputs[k]);
+      const float *const w_base = &weights[k + (node * tot_num_inputs)];
+
+      // Process two output rows at a time.
+      for (int pair = 0; pair < 4; pair++) {
+        const float *const w_even = w_base + (2 * pair * tot_num_inputs);
+        const __m256 w0 = _mm256_loadu_ps(w_even);
+        const __m256 w1 = _mm256_loadu_ps(w_even + tot_num_inputs);
+        const __m256 prod0 = _mm256_mul_ps(x, w0);
+        const __m256 prod1 = _mm256_mul_ps(x, w1);
+        pair_sums[pair] = _mm256_hadd_ps(prod0, prod1);
+      }
+
+      const __m256 nodes_0_3 = _mm256_hadd_ps(pair_sums[0], pair_sums[1]);
+      const __m256 nodes_4_7 = _mm256_hadd_ps(pair_sums[2], pair_sums[3]);
+
+      // Gather the low halves and the high halves of both vectors so that
+      // adding them yields the eight per-node sums in order.
+      const __m256 low_halves =
+          _mm256_permute2f128_ps(nodes_0_3, nodes_4_7, 0x20);
+      const __m256 high_halves =
+          _mm256_permute2f128_ps(nodes_0_3, nodes_4_7, 0x31);
+
+      const __m256 folded = _mm256_add_ps(low_halves, high_halves);
+      acc = _mm256_add_ps(acc, folded);
+    }
+
+    acc = _mm256_add_ps(acc, node_bias);
+    if (is_clip_required) acc = _mm256_max_ps(acc, _mm256_setzero_ps());
+    _mm256_storeu_ps(&output_nodes[node], acc);
+  }
+}
+
+// Dispatches to the 8-input kernel that matches num_outputs: 8 nodes at a
+// time when num_outputs is a multiple of 8, 4 at a time when it is a multiple
+// of 4, otherwise one node at a time.
+//
+// The outputs are clamped below at 0 only when this is not the output layer
+// and this call consumes every input of the layer
+// (num_inputs_to_process == tot_num_inputs).
+static INLINE void nn_propagate_input_multiple_of_8(
+    const float *const inputs, const float *const weights,
+    const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+    bool is_output_layer, int num_outputs, float *const output_nodes) {
+  int is_clip_required = 0;
+  if (!is_output_layer && num_inputs_to_process == tot_num_inputs) {
+    is_clip_required = 1;
+  }
+
+  if (num_outputs % 8 == 0) {
+    nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+                      tot_num_inputs, num_outputs, output_nodes,
+                      is_clip_required);
+    return;
+  }
+
+  if (num_outputs % 4 == 0) {
+    nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+                      tot_num_inputs, num_outputs, output_nodes,
+                      is_clip_required);
+    return;
+  }
+
+  nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+                    tot_num_inputs, num_outputs, output_nodes,
+                    is_clip_required);
+}
+
+// Runs the fully connected network described by nn_config on input_nodes and
+// writes the final layer to output.
+//
+// Layers are evaluated in order; hidden-layer results are stored alternately
+// in the two halves of buf and become the input of the next layer. Hidden
+// layers are clamped below at 0, the output layer is not. When reduce_prec is
+// non-zero, av1_nn_output_prec_reduce() is applied to the final output.
+void av1_nn_predict_avx2(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+ assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+ // The iteration with layer == num_hidden_layers is the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool is_output_layer = layer == nn_config->num_hidden_layers;
+ float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = is_output_layer
+ ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+ assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+ // Process input multiple of 8 using AVX2 intrinsic.
+ if (num_inputs % 8 == 0) {
+ nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+ num_inputs, num_inputs, is_output_layer,
+ num_outputs, output_nodes);
+ } else {
+ // When number of inputs is not multiple of 8, use hybrid approach of AVX2
+ // and SSE3 based on the need.
+ const int in_mul_8 = num_inputs / 8;
+ const int num_inputs_to_process = in_mul_8 * 8;
+ int bias_is_considered = 0;
+ if (in_mul_8) {
+ // AVX2 pass over the leading multiple-of-8 inputs. Because fewer than
+ // num_inputs inputs are processed, no clamping is applied in this call.
+ nn_propagate_input_multiple_of_8(
+ input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+ num_inputs, is_output_layer, num_outputs, output_nodes);
+ bias_is_considered = 1;
+ }
+
+ // Starting value for the tail pass: the partial sums (bias already
+ // included) when the AVX2 pass ran, otherwise the bias itself.
+ const float *out_temp = bias_is_considered ? output_nodes : layer_bias;
+ const int input_remaining = num_inputs % 8;
+ // Tail of 4 inputs, 8 outputs at a time.
+ if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!is_output_layer) {
+ const __m128 zero = _mm_setzero_ps();
+ out_h = _mm_max_ps(out_h, zero);
+ out_l = _mm_max_ps(out_l, zero);
+ }
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ // Tail of 4 inputs, 4 outputs at a time.
+ } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ // Tail of 4 inputs, one output at a time.
+ } else if (input_remaining % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency
+ // of swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ }
+ }
+ // Before processing the next layer, treat the output of current layer as
+ // input to next layer.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.c b/third_party/aom/av1/encoder/x86/ml_sse3.c
new file mode 100644
index 0000000000..4748a68d38
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+// Adds the dot product of 8 inputs and 8 weights to every lane of *output.
+// The scalar result is deliberately kept broadcast in a 128-bit register so
+// the caller never has to bounce between SIMD and FPU operations.
+static void nn_propagate_8to1(const float *const inputs,
+                              const float *const weights,
+                              __m128 *const output) {
+  const __m128 in_hi = _mm_loadu_ps(&inputs[4]);
+  const __m128 in_lo = _mm_loadu_ps(inputs);
+
+  const __m128 w_hi = _mm_loadu_ps(&weights[4]);
+  const __m128 w_lo = _mm_loadu_ps(weights);
+
+  // Element-wise products for indices [7 6 5 4] and [3 2 1 0].
+  const __m128 prod_hi = _mm_mul_ps(in_hi, w_hi);
+  const __m128 prod_lo = _mm_mul_ps(in_lo, w_lo);
+
+  // [7+3 6+2 5+1 4+0]
+  const __m128 folded = _mm_add_ps(prod_lo, prod_hi);
+  // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0]
+  const __m128 pairs = _mm_hadd_ps(folded, folded);
+  // Full 8-term sum replicated in all four lanes.
+  const __m128 total = _mm_hadd_ps(pairs, pairs);
+
+  *output = _mm_add_ps(*output, total);
+}
+
+// Adds the dot product of 4 inputs and 4 weights to every lane of *output.
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const output) {
+  const __m128 x = _mm_loadu_ps(inputs);
+  const __m128 w = _mm_loadu_ps(weights);
+
+  // Element-wise products for indices [3 2 1 0].
+  const __m128 prod = _mm_mul_ps(x, w);
+
+  // [3+2 1+0 3+2 1+0]
+  const __m128 pairs = _mm_hadd_ps(prod, prod);
+  // Full 4-term sum replicated in all four lanes.
+  const __m128 total = _mm_hadd_ps(pairs, pairs);
+
+  *output = _mm_add_ps(*output, total);
+}
+
+// Accumulates, into the four lanes of *outputs, the dot products of 4 inputs
+// with 4 consecutive weight rows. Row r starts at weights + r * num_inputs and
+// its result lands in lane r.
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+                                const float *const weights,
+                                __m128 *const outputs, const int num_inputs) {
+  const __m128 x = _mm_loadu_ps(inputs);
+
+  __m128 pair_sums[2];
+  for (int pair = 0; pair < 2; pair++) {  // Two output rows per iteration.
+    const __m128 w_even = _mm_loadu_ps(&weights[2 * pair * num_inputs]);
+    const __m128 prod_even = _mm_mul_ps(w_even, x);
+    const __m128 w_odd = _mm_loadu_ps(&weights[(2 * pair + 1) * num_inputs]);
+    const __m128 prod_odd = _mm_mul_ps(w_odd, x);
+    // Low two lanes: pair sums of the even row; high two lanes: odd row.
+    pair_sums[pair] = _mm_hadd_ps(prod_even, prod_odd);
+  }
+
+  // One complete 4-term sum per row, in row order.
+  const __m128 row_sums = _mm_hadd_ps(pair_sums[0], pair_sums[1]);
+
+  *outputs = _mm_add_ps(*outputs, row_sums);
+}
+
+// Accumulates the dot products of 4 inputs with 8 consecutive weight rows.
+// Row r starts at weights + r * num_inputs. Rows 0..3 are added to the lanes
+// of *out_l and rows 4..7 to the lanes of *out_h.
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+                                const float *const weights, __m128 *const out_h,
+                                __m128 *const out_l, const int num_inputs) {
+  const __m128 x = _mm_loadu_ps(inputs);
+
+  __m128 pair_sums[4];
+  for (int pair = 0; pair < 4; pair++) {  // Two output rows per iteration.
+    const __m128 w_even = _mm_loadu_ps(&weights[2 * pair * num_inputs]);
+    const __m128 w_odd = _mm_loadu_ps(&weights[(2 * pair + 1) * num_inputs]);
+    const __m128 prod_even = _mm_mul_ps(x, w_even);
+    const __m128 prod_odd = _mm_mul_ps(x, w_odd);
+    // Low two lanes: pair sums of the even row; high two lanes: odd row.
+    pair_sums[pair] = _mm_hadd_ps(prod_even, prod_odd);
+  }
+
+  // Complete 4-term sums for rows 0..3 and rows 4..7.
+  const __m128 rows_0_3 = _mm_hadd_ps(pair_sums[0], pair_sums[1]);
+  const __m128 rows_4_7 = _mm_hadd_ps(pair_sums[2], pair_sums[3]);
+
+  *out_h = _mm_add_ps(*out_h, rows_4_7);
+  *out_l = _mm_add_ps(*out_l, rows_0_3);
+}
+
+// Accumulates, into the four lanes of *outputs, the dot products of 8 inputs
+// with 4 consecutive weight rows. Row r starts at weights + r * num_inputs and
+// its result lands in lane r.
+static void nn_propagate_8to4(const float *const inputs,
+                              const float *const weights, __m128 *const outputs,
+                              const int num_inputs) {
+  // Input indices [7 6 5 4] and [3 2 1 0].
+  const __m128 in_hi = _mm_loadu_ps(inputs + 4);
+  const __m128 in_lo = _mm_loadu_ps(inputs);
+
+  __m128 folded[4];
+  for (int row = 0; row < 4; row++) {  // One output row per iteration.
+    const __m128 w_hi = _mm_loadu_ps(&weights[row * num_inputs + 4]);
+    const __m128 w_lo = _mm_loadu_ps(&weights[row * num_inputs]);
+    const __m128 prod_hi = _mm_mul_ps(in_hi, w_hi);
+    const __m128 prod_lo = _mm_mul_ps(in_lo, w_lo);
+    // Fold the 8 products of this row into 4 lanes: [7+3 6+2 5+1 4+0].
+    folded[row] = _mm_add_ps(prod_lo, prod_hi);
+  }
+
+  // Pair sums for rows 2 and 3, then for rows 0 and 1.
+  const __m128 pairs_rows_2_3 = _mm_hadd_ps(folded[2], folded[3]);
+  const __m128 pairs_rows_0_1 = _mm_hadd_ps(folded[0], folded[1]);
+
+  // One complete 8-term sum per row, in row order.
+  const __m128 row_sums = _mm_hadd_ps(pairs_rows_0_1, pairs_rows_2_3);
+
+  *outputs = _mm_add_ps(*outputs, row_sums);
+}
+
+// Clamps every lane of both vectors below at 0 (in place).
+static void nn_activate8(__m128 *out_h, __m128 *out_l) {
+  const __m128 floor_zero = _mm_setzero_ps();
+  const __m128 clamped_h = _mm_max_ps(*out_h, floor_zero);
+  const __m128 clamped_l = _mm_max_ps(*out_l, floor_zero);
+  *out_h = clamped_h;
+  *out_l = clamped_l;
+}
+
+// Clamps every lane of *x below at 0 (in place).
+static void nn_activate4(__m128 *x) {
+  const __m128 floor_zero = _mm_setzero_ps();
+  *x = _mm_max_ps(*x, floor_zero);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+//
+// Each layer picks the widest kernel allowed by the divisibility of num_inputs
+// and num_outputs (4 or 8 inputs by 8, 4 or 1 outputs), falling back to a
+// one-input-at-a-time loop. Every accumulator starts from the layer bias.
+// Hidden layers are clamped below at 0; the output layer is not. When
+// reduce_prec is non-zero, av1_nn_output_prec_reduce() is applied to output.
+void av1_nn_predict_sse3(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ // Hidden-layer results alternate between the two halves of buf.
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+
+ // Hidden layers, except the final iteration is the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool output_layer = (layer == nn_config->num_hidden_layers);
+ float *const output_nodes = output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = output_layer ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+
+ // 4 inputs x 8 outputs per kernel call.
+ if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!output_layer) nn_activate8(&out_h, &out_l);
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ // 8 inputs x 4 outputs per kernel call.
+ } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to4(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &outputs,
+ num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ // 4 inputs x 4 outputs per kernel call.
+ } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ // 8 inputs x 1 output per kernel call; the bias is broadcast to all lanes
+ // and lane 0 is the value that gets stored.
+ } else if (num_inputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to1(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ // 4 inputs x 1 output per kernel call.
+ } else if (num_inputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(
+ &input_nodes[in], &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency of
+ // swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in_node = 0; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ total = _mm_add_ps(total, _mm_mul_ps(input, weight));
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ }
+ // The output of this layer is the input of the next one.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+// Lane-wise fast approximation of exp(y).
+// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential
+// Function. Neural Computation, 11(4):853–862, 1999.
+// The input is scaled by 2^23 / ln(2), converted to integer, offset by the
+// IEEE-754 exponent bias (minus a tuning constant) and reinterpreted as float.
+static AOM_INLINE __m128 approx_exp(__m128 y) {
+  // (1 << 23) / ln(2)
+  const float scale = (1 << 23) / 0.69314718056f;
+  // Offset for the exponent according to IEEE floating point standard.
+  const int exponent_bias = 127;
+  // Magic number controls the accuracy of approximation.
+  const int accuracy_tweak = 60801;
+
+  const __m128 multiplier = _mm_set1_ps(scale);
+  const __m128i offset =
+      _mm_set1_epi32(exponent_bias * (1 << 23) - accuracy_tweak);
+
+  const __m128 scaled = _mm_mul_ps(y, multiplier);
+  const __m128i as_int = _mm_add_epi32(_mm_cvtps_epi32(scaled), offset);
+  return _mm_castsi128_ps(as_int);
+}
+
+// Returns a vector whose four lanes all hold the maximum of reg's lanes.
+static AOM_INLINE __m128 reduce_max(__m128 reg) {
+  // Swap the 64-bit halves (lane order 2 3 0 1) and keep the larger lanes.
+  const __m128 halves_swapped = _mm_shuffle_ps(reg, reg, 0x4e);
+  const __m128 max_of_halves = _mm_max_ps(reg, halves_swapped);
+
+  // Swap neighbours inside each half (lane order 1 0 3 2) and repeat.
+  const __m128 pairs_swapped =
+      _mm_shuffle_ps(max_of_halves, max_of_halves, 0xb1);
+  return _mm_max_ps(max_of_halves, pairs_swapped);
+}
+
+// Returns a vector whose four lanes all hold the sum of reg's lanes.
+static AOM_INLINE __m128 reduce_sum(__m128 reg) {
+  // Swap the 64-bit halves (lane order 2 3 0 1) and add.
+  const __m128 halves_swapped = _mm_shuffle_ps(reg, reg, 0x4e);
+  const __m128 sum_of_halves = _mm_add_ps(reg, halves_swapped);
+
+  // Swap neighbours inside each half (lane order 1 0 3 2) and add again.
+  const __m128 pairs_swapped =
+      _mm_shuffle_ps(sum_of_halves, sum_of_halves, 0xb1);
+  return _mm_add_ps(sum_of_halves, pairs_swapped);
+}
+
+// Approximate softmax over exactly 16 floats.
+// The maximum is subtracted from every input, the differences are clipped at
+// -10, exponentiated with approx_exp() and divided by their sum. All 16 inputs
+// are read before any output is written.
+void av1_nn_fast_softmax_16_sse3(const float *input, float *output) {
+  // Clips at -10 to avoid underflowing
+  const __m128 lower_clip = _mm_set1_ps(-10.0f);
+
+  // Load in 16 values
+  __m128 v[4];
+  for (int q = 0; q < 4; q++) v[q] = _mm_loadu_ps(&input[4 * q]);
+
+  // Get the max, broadcast to every lane
+  const __m128 max_01 = _mm_max_ps(v[0], v[1]);
+  const __m128 max_23 = _mm_max_ps(v[2], v[3]);
+  const __m128 peak = reduce_max(_mm_max_ps(max_01, max_23));
+
+  // Subtract the max off and clip
+  for (int q = 0; q < 4; q++) {
+    v[q] = _mm_max_ps(_mm_sub_ps(v[q], peak), lower_clip);
+  }
+
+  // Exponentiate and compute the denominator
+  v[0] = approx_exp(v[0]);
+  __m128 denom = v[0];
+  for (int q = 1; q < 4; q++) {
+    v[q] = approx_exp(v[q]);
+    denom = _mm_add_ps(denom, v[q]);
+  }
+  denom = reduce_sum(denom);
+
+  // Divide to get the probability
+  for (int q = 0; q < 4; q++) v[q] = _mm_div_ps(v[q], denom);
+
+  for (int q = 0; q < 4; q++) _mm_storeu_ps(&output[4 * q], v[q]);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.h b/third_party/aom/av1/encoder/x86/ml_sse3.h
new file mode 100644
index 0000000000..f41a2474af
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_
+#define AOM_AV1_ENCODER_X86_ML_SSE3_H_
+
+#include <pmmintrin.h>
+
+// Prototypes for SSE3 helpers whose definitions are not part of this header.
+// All three take float input and weight pointers and deliver their results
+// through caller-provided __m128 registers rather than plain float arrays.
+// NOTE(review): the "4toN" suffix presumably means four inputs are consumed
+// per step to update N outputs - confirm against the definitions.
+
+// Single __m128 result written through `output`.
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output);
+
+// Results written through `outputs`; `num_inputs` is taken by value.
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs);
+
+// Results are split across two registers, `out_h` and `out_l`.
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs);
+
+#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 0000000000..6658ed39a8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,2348 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Adds eight multiply-accumulate results into dst[0..7] (int64 each).
+//
+// `dgd_ijkl` holds one pair of u16 pixels replicated into every 32-bit slot.
+// Twelve consecutive u16 values are read from `dgd`, arranged by `shuffle`
+// into the overlapping pairs (0,1) (1,2) ... (7,8), and each pair is
+// multiplied element-wise with the replicated pair and summed (madd). The
+// eight 32-bit sums are widened to 64 bits and added onto dst.
+static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
+                                        const __m256i *shuffle,
+                                        const __m256i *dgd_ijkl) {
+  // Low lane <- dgd[0..7], high lane <- dgd[4..11]. The overlap lets the byte
+  // shuffle below stay inside each 128-bit lane.
+  const __m128i px_lo = _mm_loadu_si128((__m128i *)dgd);
+  const __m128i px_hi = _mm_loadu_si128((__m128i *)(dgd + 4));
+  const __m256i px =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(px_lo), px_hi, 1);
+
+  // Build adjacent pixel pairs. This is a byte shuffle because AVX2 has no
+  // 16-bit element shuffle; per lane the u16 selection is 0 1 1 2 2 3 3 4.
+  const __m256i px_pairs = _mm256_shuffle_epi8(px, *shuffle);
+
+  // 16 x (u16 * u16) products, horizontally added in pairs -> 8 x 32-bit.
+  const __m256i sums32 = _mm256_madd_epi16(*dgd_ijkl, px_pairs);
+
+  // Widen the low four sums (zero extension) and accumulate into dst[0..3].
+  const __m256i sums64_lo =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sums32));
+  yy_store_256(dst, _mm256_add_epi64(sums64_lo, yy_load_256(dst)));
+
+  // Same for the high four sums, into dst[4..7].
+  const __m256i sums64_hi =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sums32, 1));
+  yy_store_256(dst + 4, _mm256_add_epi64(sums64_hi, yy_load_256(dst + 4)));
+}
+
+// Accumulates the 7x7-window statistics for one row of high-bitdepth pixels,
+// columns [h_start, h_end). Updates *sumX (sum of src), sumY (per-tap sum of
+// dgd), M_int (per-tap dgd*src products) and H_int (dgd*dgd products, eight
+// int64 slots per window row) in place.
+static INLINE void acc_stat_highbd_win7_one_line_avx2(
+    const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+    int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+    int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = WIENER_WIN;
+  // The main loop consumes two pixels per iteration. h_start is expected to be
+  // even (tile edges and restoration units are 64-pixel aligned), whereas
+  // h_end may be odd at the picture edge, leaving one trailing pixel.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+
+  int col = h_start;
+  for (; col < h_end_even; col += 2) {
+    const uint16_t x_a = src[col];
+    const uint16_t x_b = src[col + 1];
+    *sumX += x_a + x_b;
+    const uint16_t *const win = dgd + col;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint16_t *const win_row = win + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int64_t *const h_base = &H_int[(l * wiener_win + k)][0];
+        const uint16_t d_a = win_row[l];
+        const uint16_t d_b = win_row[l + 1];
+        sumY[k][l] += d_a + d_b;
+        M_int[k][l] += d_a * x_a + d_b * x_b;
+
+        // Read the two neighbouring u16 pixels as one 32-bit value and
+        // replicate it into all eight 32-bit slots.
+        const __m256i pix_pair = _mm256_set1_epi32(loadu_int32(win_row + l));
+
+        // One accumulation per window row, each into its own 8-wide slot.
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_avx2(h_base + r * 8, win + r * dgd_stride, shuffle,
+                               &pix_pair);
+        }
+      }
+    }
+  }
+
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t x_a = src[col];
+    *sumX += x_a;
+    const uint16_t *const win = dgd + col;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint16_t *const win_row = win + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int64_t *const h_base = &H_int[(l * wiener_win + k)][0];
+        const uint16_t d_a = win_row[l];
+        sumY[k][l] += d_a;
+        M_int[k][l] += d_a * x_a;
+
+        // acc_stat_highbd_avx2() expects an interleaved pixel pair. Only one
+        // pixel exists here, so the second slot is left at zero; it then
+        // contributes nothing to the multiply-accumulate.
+        const __m256i pix_pair = _mm256_set1_epi32((int)d_a);
+
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_avx2(h_base + r * 8, win + r * dgd_stride, shuffle,
+                               &pix_pair);
+        }
+      }
+    }
+  }
+}
+
+// High-bitdepth Wiener statistics for the 7x7 window. Accumulates raw sums
+// row by row over [v_start, v_end) x [h_start, h_end), then removes the block
+// average and scales by a bit-depth dependent divider while writing M
+// (WIENER_WIN^2 entries) and H (WIENER_WIN^2 x WIENER_WIN^2 entries).
+static INLINE void compute_stats_highbd_win7_opt_avx2(
+    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+    int64_t *H, aom_bit_depth_t bit_depth) {
+  const int wiener_win = WIENER_WIN;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+  // Accumulate every row of the region, top to bottom.
+  for (int row = v_start; row < v_end; row++) {
+    acc_stat_highbd_win7_one_line_avx2(
+        dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+        dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+  }
+
+  // 12-bit input is divided by 16, 10-bit by 4, anything else is unscaled.
+  const uint8_t bit_depth_divider =
+      (bit_depth == AOM_BITS_12) ? 16 : (bit_depth == AOM_BITS_10) ? 4 : 1;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      // Average-removal correction for the cross-correlation term.
+      const int64_t m_corr =
+          avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
+      M[idx0] = (M_int[k][l] + m_corr) / bit_depth_divider;
+
+      int64_t *const h_out = H + idx0 * wiener_win2;
+      const int64_t *const h_acc = &H_int[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          // H_int stores eight slots per window row, hence the stride of 8.
+          const int64_t h_corr =
+              avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+          h_out[m * wiener_win + n] =
+              (h_acc[n * 8 + m] + h_corr) / bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+// Accumulates the 5x5-window statistics for one row of high-bitdepth pixels,
+// columns [h_start, h_end). Updates *sumX (sum of src), sumY (per-tap sum of
+// dgd), M_int (per-tap dgd*src products) and H_int (dgd*dgd products, eight
+// int64 slots per window row) in place.
+static INLINE void acc_stat_highbd_win5_one_line_avx2(
+    const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+    int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  // The main loop consumes two pixels per iteration. h_start is expected to be
+  // even (tile edges and restoration units are 64-pixel aligned), whereas
+  // h_end may be odd at the picture edge, leaving one trailing pixel.
+  assert(h_start % 2 == 0);
+  const int h_end_even = h_end & ~1;
+  const int has_odd_pixel = h_end & 1;
+
+  int col = h_start;
+  for (; col < h_end_even; col += 2) {
+    const uint16_t x_a = src[col];
+    const uint16_t x_b = src[col + 1];
+    *sumX += x_a + x_b;
+    const uint16_t *const win = dgd + col;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint16_t *const win_row = win + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int64_t *const h_base = &H_int[(l * wiener_win + k)][0];
+        const uint16_t d_a = win_row[l];
+        const uint16_t d_b = win_row[l + 1];
+        sumY[k][l] += d_a + d_b;
+        M_int[k][l] += d_a * x_a + d_b * x_b;
+
+        // Read the two neighbouring u16 pixels as one 32-bit value and
+        // replicate it into all eight 32-bit slots.
+        const __m256i pix_pair = _mm256_set1_epi32(loadu_int32(win_row + l));
+
+        // One accumulation per window row, each into its own 8-wide slot.
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_avx2(h_base + r * 8, win + r * dgd_stride, shuffle,
+                               &pix_pair);
+        }
+      }
+    }
+  }
+
+  // If the width is odd, add in the final pixel
+  if (has_odd_pixel) {
+    const uint16_t x_a = src[col];
+    *sumX += x_a;
+    const uint16_t *const win = dgd + col;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint16_t *const win_row = win + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int64_t *const h_base = &H_int[(l * wiener_win + k)][0];
+        const uint16_t d_a = win_row[l];
+        sumY[k][l] += d_a;
+        M_int[k][l] += d_a * x_a;
+
+        // acc_stat_highbd_avx2() expects an interleaved pixel pair. Only one
+        // pixel exists here, so the second slot is left at zero; it then
+        // contributes nothing to the multiply-accumulate.
+        const __m256i pix_pair = _mm256_set1_epi32((int)d_a);
+
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_avx2(h_base + r * 8, win + r * dgd_stride, shuffle,
+                               &pix_pair);
+        }
+      }
+    }
+  }
+}
+
+// High-bitdepth Wiener statistics for the 5x5 (chroma) window. Accumulates
+// raw sums row by row over [v_start, v_end) x [h_start, h_end), then removes
+// the block average and scales by a bit-depth dependent divider while writing
+// M (WIENER_WIN_CHROMA^2 entries) and H (that size squared).
+static INLINE void compute_stats_highbd_win5_opt_avx2(
+    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+    int64_t *H, aom_bit_depth_t bit_depth) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  DECLARE_ALIGNED(
+      32, int64_t,
+      H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+  // Accumulate every row of the region, top to bottom.
+  for (int row = v_start; row < v_end; row++) {
+    acc_stat_highbd_win5_one_line_avx2(
+        dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+        dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64);
+  }
+
+  // 12-bit input is divided by 16, 10-bit by 4, anything else is unscaled.
+  const uint8_t bit_depth_divider =
+      (bit_depth == AOM_BITS_12) ? 16 : (bit_depth == AOM_BITS_10) ? 4 : 1;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      // Average-removal correction for the cross-correlation term.
+      const int64_t m_corr =
+          avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
+      M[idx0] = (M_int64[k][l] + m_corr) / bit_depth_divider;
+
+      int64_t *const h_out = H + idx0 * wiener_win2;
+      const int64_t *const h_acc = &H_int64[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          // H_int64 stores eight slots per window row, hence the stride of 8.
+          const int64_t h_corr =
+              avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+          h_out[m * wiener_win + n] =
+              (h_acc[n * 8 + m] + h_corr) / bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+// AVX2 entry point for high-bitdepth Wiener statistics. Window sizes
+// WIENER_WIN and WIENER_WIN_CHROMA have dedicated AVX2 kernels; any other
+// size is forwarded to the C implementation.
+void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8,
+                                   const uint8_t *src8, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth) {
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+                                       v_end, dgd_stride, src_stride, M, H,
+                                       bit_depth);
+    return;
+  }
+
+  if (wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+                                       v_end, dgd_stride, src_stride, M, H,
+                                       bit_depth);
+    return;
+  }
+
+  // No vector kernel for this window size.
+  av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+                             v_end, dgd_stride, src_stride, M, H, bit_depth);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// *sum += madd(src, dgd): multiplies the sixteen int16 pairs, adds adjacent
+// products into eight int32 values, and accumulates them lane-wise.
+static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) {
+  const __m256i pair_products = _mm256_madd_epi16(src, dgd);
+  *sum = _mm256_add_epi32(*sum, pair_products);
+}
+
+// Sign-extends the eight int32 lanes of `src` to int64 and adds the lower
+// four to the upper four, giving four int64 sums (lane i = src[i] + src[i+4]).
+static INLINE __m256i convert_and_add_avx2(__m256i src) {
+  const __m128i lower_half = _mm256_castsi256_si128(src);
+  const __m128i upper_half = _mm256_extracti128_si256(src, 1);
+  return _mm256_add_epi64(_mm256_cvtepi32_epi64(lower_half),
+                          _mm256_cvtepi32_epi64(upper_half));
+}
+
+// Reduces four registers of eight int32 each to one register of four int64:
+// result lane r is the sum of all eight lanes of input r (src0, src1, *src2,
+// *src3). Two rounds of 32-bit horizontal adds run first, so the partial sums
+// must fit in 32 bits; the final widening add is done in 64 bits.
+static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1,
+                                              __m256i *src2, __m256i *src3) {
+  // Per 128-bit lane: pair sums of src0 followed by pair sums of src1.
+  const __m256i pairs_01 = _mm256_hadd_epi32(src0, src1);
+  // Per 128-bit lane: pair sums of *src2 followed by pair sums of *src3.
+  const __m256i pairs_23 = _mm256_hadd_epi32(*src2, *src3);
+  // Per 128-bit lane: one four-element sum for each of the four inputs.
+  const __m256i quads = _mm256_hadd_epi32(pairs_01, pairs_23);
+  // Fold the two 128-bit lanes together while widening to 64 bits.
+  return convert_and_add_avx2(quads);
+}
+
+// Horizontally sums the four int64 lanes of each input. Returns a 128-bit
+// register with sum(src0) in the low 64 bits and sum(src1) in the high 64.
+static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) {
+  // Even lanes of both inputs: 00 10 | 02 12
+  const __m256i even_lanes = _mm256_unpacklo_epi64(src0, src1);
+  // Odd lanes of both inputs:  01 11 | 03 13
+  const __m256i odd_lanes = _mm256_unpackhi_epi64(src0, src1);
+  // 00+01 10+11 | 02+03 12+13
+  const __m256i pair_sums = _mm256_add_epi64(even_lanes, odd_lanes);
+  // Add the upper 128-bit lane onto the lower one.
+  return _mm_add_epi64(_mm256_castsi256_si128(pair_sums),
+                       _mm256_extracti128_si256(pair_sums, 1));
+}
+
+// Sums all eight int32 lanes of each input using 64-bit arithmetic. Returns
+// sum(src0) in the low 64 bits and sum(src1) in the high 64 bits.
+static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) {
+  // Each call widens to int64 and folds eight lanes into four.
+  const __m256i widened0 = convert_and_add_avx2(src0);
+  const __m256i widened1 = convert_and_add_avx2(src1);
+  // Fold the remaining four lanes of each into one.
+  return add_64bit_lvl_avx2(widened0, widened1);
+}
+
+// Returns the (wrapping) 32-bit sum of the eight int32 lanes of `src`.
+static INLINE int32_t calc_sum_of_register(__m256i src) {
+  // 8 lanes -> 4 lanes: add the upper 128 bits onto the lower 128 bits.
+  const __m128i quad = _mm_add_epi32(_mm256_castsi256_si128(src),
+                                     _mm256_extracti128_si256(src, 1));
+  // 4 lanes -> 2 lanes: shift down by 8 bytes and add.
+  const __m128i pair = _mm_add_epi32(quad, _mm_srli_si128(quad, 8));
+  // 2 lanes -> 1 lane: shift down by 4 bytes and add.
+  const __m128i total = _mm_add_epi32(pair, _mm_srli_si128(pair, 4));
+  return _mm_cvtsi128_si32(total);
+}
+
+// Transposes a 4x4 matrix of 64-bit elements. src[r] holds row r as four
+// int64 lanes; on return dst[c] holds column c.
+static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src,
+                                            __m256i *const dst) {
+  // Interleave 64-bit elements of neighbouring rows within each 128-bit lane:
+  //   even_01: 00 10 | 02 12      odd_01: 01 11 | 03 13
+  //   even_23: 20 30 | 22 32      odd_23: 21 31 | 23 33
+  const __m256i even_01 = _mm256_unpacklo_epi64(src[0], src[1]);
+  const __m256i even_23 = _mm256_unpacklo_epi64(src[2], src[3]);
+  const __m256i odd_01 = _mm256_unpackhi_epi64(src[0], src[1]);
+  const __m256i odd_23 = _mm256_unpackhi_epi64(src[2], src[3]);
+
+  // Recombine 128-bit lanes. Selector 0x20 takes the low lane of each
+  // operand and 0x31 takes the high lane of each operand:
+  //   dst[0]: 00 10 20 30
+  //   dst[1]: 01 11 21 31
+  //   dst[2]: 02 12 22 32
+  //   dst[3]: 03 13 23 33
+  dst[0] = _mm256_permute2x128_si256(even_01, even_23, 0x20);
+  dst[1] = _mm256_permute2x128_si256(odd_01, odd_23, 0x20);
+  dst[2] = _mm256_permute2x128_si256(even_01, even_23, 0x31);
+  dst[3] = _mm256_permute2x128_si256(odd_01, odd_23, 0x31);
+}
+
+// When we load 32 values of int8_t type and need fewer than 32 values for
+// processing, the mask below is used to zero the extra values. The table is
+// 16 bytes of all-ones followed by 16 bytes of zero; a 16-byte load starting
+// at index (16 - n) therefore yields n leading 0xFF bytes and zeros after.
+static const int8_t mask_8bit[32] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes
+};
+
+// When we load 16 values of int16_t type and need fewer than 16 values for
+// processing, the mask below is used to zero the extra values. Same sliding
+// window layout as mask_8bit, but with 16-bit elements: a 256-bit load
+// starting at index (16 - n) yields n leading all-ones elements.
+static const int16_t mask_16bit[32] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 elements (32 bytes)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 elements (32 bytes)
+};
+
+// Returns the integer average of the 8-bit pixels in the rectangle
+// [v_start, v_end) x [h_start, h_end) of `src`. Rows are consumed 32 bytes at
+// a time; for a width that is not a multiple of 32 the last load of each row
+// still reads a full 32 bytes and the surplus is masked to zero.
+static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start,
+                                            int32_t h_end, int32_t v_start,
+                                            int32_t v_end, int32_t stride) {
+  const int32_t width = h_end - h_start;
+  const int32_t height = v_end - v_start;
+  const int32_t tail_px = width & 31;         // columns past the last 32-block
+  const int32_t body_px = width - tail_px;    // columns covered by full loads
+  const __m256i zero = _mm256_setzero_si256();
+
+  // Build the 32-byte tail mask with exactly `tail_px` leading 0xFF bytes.
+  const int tail_reaches_high_lane = (tail_px >= 16);
+  const __m128i mask_low =
+      tail_reaches_high_lane
+          ? _mm_set1_epi8(-1)
+          : _mm_loadu_si128((const __m128i *)(&mask_8bit[16 - tail_px]));
+  const __m128i mask_high =
+      tail_reaches_high_lane
+          ? _mm_loadu_si128((const __m128i *)(&mask_8bit[32 - tail_px]))
+          : _mm_setzero_si128();
+  const __m256i mask =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1);
+
+  const uint8_t *row = src + v_start * stride + h_start;
+  __m256i acc = zero;
+  int32_t rows_done = 0;
+  do {
+    // Full 32-byte blocks. SAD against zero sums the bytes in groups of 8.
+    int32_t x;
+    for (x = 0; x < body_px; x += 32) {
+      const __m256i px = _mm256_loadu_si256((const __m256i *)(row + x));
+      acc = _mm256_add_epi32(acc, _mm256_sad_epu8(px, zero));
+    }
+
+    // Partial block at the end of the row.
+    if (tail_px) {
+      const __m256i px = _mm256_loadu_si256((const __m256i *)(row + x));
+      const __m256i kept = _mm256_and_si256(px, mask);
+      acc = _mm256_add_epi32(acc, _mm256_sad_epu8(kept, zero));
+    }
+
+    row += stride;
+    rows_done++;
+  } while (rows_done < height);
+
+  const uint32_t sum = calc_sum_of_register(acc);
+  const uint8_t avg = sum / (width * height);
+  return avg;
+}
+
+// Writes (pixel - avg) as int16 into `dst` for a width x height block of
+// 8-bit pixels. Each step converts 16 pixels, so when n = (width % 16) is not
+// 0 the row overruns by (16 - n) elements on both the read and write side.
+// With use_downsampled_wiener_stats set, only every
+// WIENER_STATS_DOWNSAMPLE_FACTOR-th row is produced.
+static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride,
+                                      uint8_t avg, int32_t width,
+                                      int32_t height, int16_t *dst,
+                                      int32_t dst_stride,
+                                      int use_downsampled_wiener_stats) {
+  const __m256i avg_reg = _mm256_set1_epi16(avg);
+
+  int32_t row = 0;
+  do {
+    // Row step: 1 normally; the downsample factor when downsampling, clamped
+    // to the number of rows that are left.
+    int row_step = 1;
+    if (use_downsampled_wiener_stats) {
+      const int rows_left = height - row;
+      row_step = (rows_left < WIENER_STATS_DOWNSAMPLE_FACTOR)
+                     ? rows_left
+                     : WIENER_STATS_DOWNSAMPLE_FACTOR;
+    }
+
+    for (int32_t x = 0; x < width; x += 16) {
+      const __m128i px8 = _mm_loadu_si128((__m128i *)(src + x));
+      const __m256i px16 = _mm256_cvtepu8_epi16(px8);
+      _mm256_storeu_si256((__m256i *)(dst + x),
+                          _mm256_sub_epi16(px16, avg_reg));
+    }
+
+    src += row_step * src_stride;
+    dst += row_step * dst_stride;
+    row += row_step;
+  } while (row < height);
+}
+
+// Mirrors the upper triangle of the square matrix H (wiener_win2 x
+// wiener_win2, row-major int64) into its lower triangle, working on 4x4
+// tiles that are transposed in registers.
+static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2,
+                                                  int64_t *const H) {
+  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
+    __m256i rows[4], cols[4];
+    int64_t *const diag = H + i * wiener_win2 + i;  // &H[i][i]
+
+    // Tile touching the diagonal: source is H[i..i+3][i+1..i+4].
+    for (int r = 0; r < 4; r++) {
+      rows[r] =
+          _mm256_loadu_si256((const __m256i *)(diag + r * wiener_win2 + 1));
+    }
+
+    transpose_64bit_4x4_avx2(rows, cols);
+
+    // Destination rows i+1..i+4 starting at column i. The first two rows get
+    // narrower stores (1 and 2 elements) than the last two (4 elements).
+    _mm_storel_epi64((__m128i *)(diag + 1 * wiener_win2),
+                     _mm256_castsi256_si128(cols[0]));
+    _mm_storeu_si128((__m128i *)(diag + 2 * wiener_win2),
+                     _mm256_castsi256_si128(cols[1]));
+    _mm256_storeu_si256((__m256i *)(diag + 3 * wiener_win2), cols[2]);
+    _mm256_storeu_si256((__m256i *)(diag + 4 * wiener_win2), cols[3]);
+
+    // Remaining tiles of this row band: H[i..i+3][j..j+3] -> H[j..j+3][i..i+3].
+    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
+      const int64_t *const upper = H + i * wiener_win2 + j;
+      int64_t *const lower = H + j * wiener_win2 + i;
+
+      for (int r = 0; r < 4; r++) {
+        rows[r] =
+            _mm256_loadu_si256((const __m256i *)(upper + r * wiener_win2));
+      }
+
+      transpose_64bit_4x4_avx2(rows, cols);
+
+      for (int r = 0; r < 4; r++) {
+        _mm256_storeu_si256((__m256i *)(lower + r * wiener_win2), cols[r]);
+      }
+    }
+  }
+}
+
+// Fill H buffer based on loop_count.
+// For g in [0, loop_count): loads 16 int16 values from (d) + g * d_stride and
+// madd-accumulates them against dgd_mul_df into sum_h[g].
+// Expects d_stride, dgd_mul_df and sum_h to be in scope at the expansion site.
+#define INIT_H_VALUES(d, loop_count) \
+ for (int g = 0; g < (loop_count); g++) { \
+ const __m256i dgd0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \
+ }
+
+// Fill M & H buffer.
+// For g in [0, wiener_win): loads 16 int16 values from (d) + g * d_stride and
+// madd-accumulates them against src_mul_df into sum_m[g] and against
+// dgd_mul_df into sum_h[g].
+// Expects wiener_win, d_stride, src_mul_df, dgd_mul_df, sum_m and sum_h to be
+// in scope at the expansion site.
+#define INIT_MH_VALUES(d) \
+ for (int g = 0; g < wiener_win; g++) { \
+ const __m256i dgds_0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \
+ madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \
+ }
+
+// Update the dgd pointers appropriately.
+// Assigns j = i / wiener_window_sz and resets downsample_factor (both must
+// already be declared), and DECLARES d_window, d_current_row, proc_ht and a
+// zeroed sum_h[wiener_window_sz] in the enclosing scope - so it can be
+// expanded only once per scope.
+// Reads i, d, d_stride, v_start and use_downsampled_wiener_stats.
+#define INITIALIZATION(wiener_window_sz) \
+ j = i / (wiener_window_sz); \
+ const int16_t *d_window = d + j; \
+ const int16_t *d_current_row = \
+ d + j + ((i % (wiener_window_sz)) * d_stride); \
+ int proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ __m256i sum_h[wiener_window_sz]; \
+ memset(sum_h, 0, sizeof(sum_h));
+
+// Update the downsample factor appropriately.
+// When downsampling is enabled and fewer than WIENER_STATS_DOWNSAMPLE_FACTOR
+// rows remain (v_end - proc_ht), downsample_factor is reduced to the number
+// of remaining rows. Also DECLARES proc_wd (= 0) and df_reg (downsample_factor
+// broadcast to 16-bit lanes) in the enclosing scope, so it is meant to be
+// expanded once at the top of each row-loop body.
+#define UPDATE_DOWNSAMPLE_FACTOR \
+ int proc_wd = 0; \
+ if (use_downsampled_wiener_stats && \
+ ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \
+ downsample_factor = v_end - proc_ht; \
+ } \
+ const __m256i df_reg = _mm256_set1_epi16(downsample_factor);
+
+// Fills the rest of H row `i` for the 5x5 window, one group of five values
+// per remaining `j` (loops while j < wiener_win, incrementing j).
+// For each j: rewinds d_window / d_current_row, zeroes sum_h[0..4], walks the
+// rows from v_start to v_end in steps of downsample_factor accumulating with
+// INIT_H_VALUES(..., 5), then stores four 64-bit sums at
+// H + i * wiener_win2 + wiener_win * j and the fifth at offset +4.
+// Expects d, i, j, d_stride, d_window, d_current_row, proc_ht, sum_h, mask,
+// wd_mul16, wd_beyond_mul16, v_start, v_end, downsample_factor, H,
+// wiener_win and wiener_win2 to be in scope.
+#define CALCULATE_REMAINING_H_WIN5 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h0 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h0); \
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \
+ _mm_storel_epi64( \
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \
+ j++; \
+ }
+
+// Fills the rest of H row `i` for the 7x7 window, one group of seven values
+// per remaining `j` (loops while j < wiener_win, incrementing j).
+// For each j: rewinds d_window / d_current_row, zeroes sum_h[0..6], walks the
+// rows from v_start to v_end in steps of downsample_factor accumulating with
+// INIT_H_VALUES(..., 7), then stores four 64-bit sums at
+// H + i * wiener_win2 + wiener_win * j and four more at offset +4.
+// NOTE(review): the second store passes sum_h[6] twice, so its fourth lane
+// (offset +7) duplicates the seventh value and lands one element past this
+// group; presumably it is overwritten by the next group or by the later
+// lower-triangle fill, and H has room for it - confirm.
+// Expects d, i, j, d_stride, d_window, d_current_row, proc_ht, sum_h, mask,
+// wd_mul16, wd_beyond_mul16, v_start, v_end, downsample_factor, H,
+// wiener_win and wiener_win2 to be in scope.
+#define CALCULATE_REMAINING_H_WIN7 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ sum_h[5] = zero; \
+ sum_h[6] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h1 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h1); \
+ const __m256i s_h2 = \
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \
+ _mm256_storeu_si256( \
+ (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \
+ j++; \
+ }
+
+// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate
+// the filter tap values required for wiener filtering. Here, the buffer H is of
+// size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size
+// (wiener_window_size*wiener_window_size). H is a symmetric matrix where the
+// value above the diagonal (upper triangle) are equal to the values below the
+// diagonal (lower triangle). The calculation of elements/stats of H(upper
+// triangle) and M is done in steps as described below where each step fills
+// specific values of H and M.
+// Once the upper triangular elements of H matrix are derived, the same will be
+// copied to lower triangular using the function
+// fill_lower_triag_elements_avx2().
+// Example: Wiener window size = WIENER_WIN_CHROMA (5)
+// M buffer = [M0 M1 M2 ---- M23 M24]
+// H buffer = Hxy (x-row, y-column)
+//   [H00  H01  H02  ---- H023  H024]
+//   [H10  H11  H12  ---- H123  H124]
+//   [H20  H21  H22  ---- H223  H224]
+//   [H30  H31  H32  ---- H323  H324]
+//   [H40  H41  H42  ---- H423  H424]
+//   [H50  H51  H52  ---- H523  H524]
+//   [H60  H61  H62  ---- H623  H624]
+//                 ||
+//                 ||
+//   [H230 H231 H232 ---- H2323 H2324]
+//   [H240 H241 H242 ---- H2423 H2424]
+// In Step 1, whole M buffers (i.e., M0 to M24) and the first row of H (i.e.,
+// H00 to H024) is filled. The remaining rows of H buffer are filled through
+// steps 2 to 6.
+static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN_CHROMA;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ // Amount of width which is beyond multiple of 16. This case is handled
+ // appropriately to process only the required width towards the end.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+ // Step 1: Full M (i.e., M0 to M24) and first row H (i.e., H00 to H024)
+ // values are filled here. Here, the loop over 'j' is executed for values 0
+ // to 4 (wiener_win-1). When the loop executed for a specific 'j', 5 values of
+ // M and H are filled as shown below.
+ // j=0: M0-M4 and H00-H04, j=1: M5-M9 and H05-H09 are filled etc,.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_m =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m);
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h);
+ _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h);
+ } while (++j < wiener_win);
+
+ // The below steps are designed to fill remaining rows of H buffer. Here, aim
+ // is to fill only upper triangle elements correspond to each row and lower
+ // triangle elements are copied from upper-triangle elements. Also, as
+ // mentioned in Step 1, the core function is designed to fill 5
+ // elements/stats/values of H buffer.
+ //
+ // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill
+ // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6,etc,
+ // are need not be filled. As the core function process 5 values, in first
+ // iteration of 'j' only 4 values to be filled i.e., H11-H14 from row1,H66-H69
+ // from row6, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+ // Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill
+ // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from
+ // row7, etc, are need not be filled. As the core function process 5 values,
+ // in first iteration of 'j' only 3 values to be filled i.e., H22-H24 from
+ // row2, H77-H79 from row7, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+ // Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill
+ // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from
+ // row8, etc, are need not be filled. As the core function process 5 values,
+ // in first iteration of 'j' only 2 values to be filled i.e., H33-H34 from
+ // row3, H88-89 from row8, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+ // Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill
+ // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from
+ // row9, etc, are need not be filled. As the core function process 5 values,
+ // in first iteration of 'j' only 1 values to be filled i.e., H44 from row4,
+ // H99 from row9, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+ // Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill only
+ // upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 from
+ // row10,etc, are need not be filled. The first iteration of 'j' fills H55-H59
+ // from row5 and H1010-H1014 from row10, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / wiener_win);
+ const int16_t *d_current_row =
+ d + (i / wiener_win) + ((i % wiener_win) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0);
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
+
+// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate
+// the filter tap values required for wiener filtering. Here, the buffer H is of
+// size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size
+// (wiener_window_size*wiener_window_size). H is a symmetric matrix where the
+// value above the diagonal (upper triangle) are equal to the values below the
+// diagonal (lower triangle). The calculation of elements/stats of H(upper
+// triangle) and M is done in steps as described below where each step fills
+// specific values of H and M.
+// Example:
+// Wiener window size = WIENER_WIN (7)
+// M buffer = [M0 M1 M2 ---- M47 M48]
+// H buffer = Hxy (x-row, y-column)
+// [H00 H01 H02 ---- H047 H048]
+// [H10 H11 H12 ---- H147 H148]
+// [H20 H21 H22 ---- H247 H248]
+// [H30 H31 H32 ---- H347 H348]
+// [H40 H41 H42 ---- H447 H448]
+// [H50 H51 H52 ---- H547 H548]
+// [H60 H61 H62 ---- H647 H648]
+// ||
+// ||
+// [H470 H471 H472 ---- H4747 H4748]
+// [H480 H481 H482 ---- H4847 H4848]
+// In Step 1, whole M buffers (i.e., M0 to M48) and the first row of H (i.e.,
+// H00 to H048) is filled. The remaining rows of H buffer are filled through
+// steps 2 to 8.
+// Fills M (WIENER_WIN^2 entries) and H (WIENER_WIN^2 x WIENER_WIN^2 entries)
+// for the 7x7 Wiener window.
+//   d, d_stride : degraded-frame samples and their row stride in int16_t
+//                 elements (the caller passes the avg-subtracted dgd buffer).
+//   s, s_stride : source samples and their row stride in int16_t elements
+//                 (the caller passes the avg-subtracted src buffer).
+//   width       : number of columns processed per row.
+//   v_start, v_end : row counter range; rows are visited while the counter,
+//                 starting at v_start, is below v_end.
+//   M, H        : output buffers, written as int64_t values.
+//   use_downsampled_wiener_stats : when non-zero the row step starts at
+//                 WIENER_STATS_DOWNSAMPLE_FACTOR instead of 1.
+// NOTE(review): the helper macros used below (UPDATE_DOWNSAMPLE_FACTOR,
+// INIT_MH_VALUES, INIT_H_VALUES, INITIALIZATION, CALCULATE_REMAINING_H_WIN7)
+// are defined outside this function and refer to locals by name, so local
+// names and statement order here must not be changed without checking them.
+static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ // Amount of width which is beyond multiple of 16. This case is handled
+ // appropriately to process only the required width towards the end.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ // Lane mask ANDed with the last, partial group of 16 samples in each row.
+ // NOTE(review): mask_16bit is defined elsewhere; presumably the load offset
+ // (16 - wd_beyond_mul16) yields all-ones in the first wd_beyond_mul16 lanes
+ // and zeros in the rest - confirm against the table definition.
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+ // Step 1: Full M (i.e., M0 to M48) and first row H (i.e., H00 to H048)
+ // values are filled here. Here, the loop over 'j' is executed for values 0
+ // to 6. When the loop executed for a specific 'j', 7 values of M and H are
+ // filled as shown below.
+ // j=0: M0-M6 and H00-H06, j=1: M7-M13 and H07-H013 are filled etc,.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ // One accumulator per output value produced in this 'j' iteration.
+ __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ // NOTE(review): 'proc_wd' and 'df_reg' used below are not declared in this
+ // function, so they are presumably introduced by this macro - confirm.
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here. The loads still read 16 samples; the
+ // mask is applied before the multiply so that, presumably, lanes past the
+ // valid width add nothing to the sums.
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ // Reduce the seven accumulators and store seven 64-bit values of M:
+ // four from s_m0, two from the low 128 bits of s_m1 and one from the low
+ // 64 bits of the high 128 bits of s_m1. sum_m[6] is passed twice only to
+ // fill the fourth argument; the duplicate result is never stored.
+ const __m256i s_m0 =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m256i s_m1 =
+ hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0);
+ _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
+ _mm256_castsi256_si128(s_m1));
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
+ _mm256_extracti128_si256(s_m1, 1));
+
+ // Same reduction and 4 + 2 + 1 store pattern for the first row of H.
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0);
+ _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ } while (++j < wiener_win);
+
+ // The below steps are designed to fill remaining rows of H buffer. Here, aim
+ // is to fill only upper triangle elements correspond to each row and lower
+ // triangle elements are copied from upper-triangle elements. Also, as
+ // mentioned in Step 1, the core function is designed to fill 7
+ // elements/stats/values of H buffer.
+ //
+ // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need
+ // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from
+ // row8, etc. are need not be filled. As the core function process 7 values,
+ // in first iteration of 'j' only 6 values to be filled i.e., H11-H16 from
+ // row1 and H88-H813 from row8, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ // NOTE(review): 'sum_h', 'proc_ht', 'd_window' and 'd_current_row' used
+ // below are not declared in this loop body, so they are presumably set up
+ // by INITIALIZATION (defined elsewhere) - confirm.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // Six values starting at the diagonal element H[i][i]: four from s_h and
+ // two from s_h0.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need
+ // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and
+ // H97-H98 from row9, etc. are need not be filled. As the core function
+ // process 7 values, in first iteration of 'j' only 5 values to be filled
+ // i.e., H22-H26 from row2 and H99-H913 from row9, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // Five values starting at H[i][i]: four from s_h, then a single 64-bit
+ // value derived from sum_h[4] stored at offset +4.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we need
+ // to fill only upper-triangle elements, H30-H32 from row3, H100-H106 and
+ // H107-H109 from row10, etc. are need not be filled. As the core function
+ // process 7 values, in first iteration of 'j' only 4 values to be filled
+ // i.e., H33-H36 from row3 and H1010-H1013 from row10, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // Exactly four values are needed here, matching one 256-bit store.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As we need
+ // to fill only upper-triangle elements, H40-H43 from row4, H110-H116 and
+ // H117-H1110 from row11, etc. are need not be filled. As the core function
+ // process 7 values, in first iteration of 'j' only 3 values to be filled
+ // i.e., H44-H46 from row4 and H1111-H1113 from row11, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // NOTE(review): the 256-bit store writes four entries although only three
+ // belong to this iteration; the extra entry is presumably overwritten
+ // later (by CALCULATE_REMAINING_H_WIN7 or the lower-triangle fill) -
+ // confirm.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As we need
+ // to fill only upper-triangle elements, H50-H54 from row5, H120-H126 and
+ // H127-H1211 from row12, etc. are need not be filled. As the core function
+ // process 7 values, in first iteration of 'j' only 2 values to be filled
+ // i.e., H55-H56 from row5 and H1212-H1213 from row12, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // NOTE(review): as in Step 5, four entries are written although only two
+ // belong to this iteration; the extras are presumably overwritten later -
+ // confirm.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As we need
+ // to fill only upper-triangle elements, H60-H65 from row6, H130-H136 and
+ // H137-H1312 from row13, etc. are need not be filled. As the core function
+ // process 7 values, in first iteration of 'j' only 1 value to be filled
+ // i.e., H66 from row6 and H1313 from row13, etc.
+ for (int i = 6; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ // Only the diagonal element is needed, so just the low 64 bits of the
+ // reduced result are stored.
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+ // Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As we need
+ // to fill only upper-triangle elements, H70-H76 from row7, H140-H146 and
+ // H147-H1413 from row14, etc. are need not be filled. The first iteration of
+ // 'j' fills H77-H713 from row7 and H1414-H1420 from row14, etc.
+ for (int i = 7; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ // Extra column offset added to d_window; it grows by one for every 'j'
+ // iteration of this row.
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / WIENER_WIN);
+ // 'i' is a multiple of WIENER_WIN in this loop, so (i % WIENER_WIN) is 0
+ // and both pointers start at the same position.
+ const int16_t *d_current_row =
+ d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ // Store all seven values of this 'j' iteration using the same 4 + 2 + 1
+ // pattern as in Step 1.
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ sh_0);
+ _mm_storeu_si128(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ // Only upper-triangle entries were computed above; this call is expected to
+ // mirror them into the lower triangle (helper defined elsewhere).
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
+
+// AVX2 entry point for Wiener statistics. Builds the avg-subtracted source and
+// degraded buffers in src_avg / dgd_avg and hands them to the window-specific
+// kernel that fills M and H. Window sizes other than WIENER_WIN(7) and
+// WIENER_WIN_CHROMA(5) are forwarded unchanged to av1_compute_stats_c().
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ const int use_win7 = (wiener_win == WIENER_WIN);
+ const int use_win5 = (wiener_win == WIENER_WIN_CHROMA);
+
+ // No SIMD kernel exists for any other window size: fall back to the C
+ // implementation with the arguments passed through as-is.
+ if (!use_win7 && !use_win5) {
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ return;
+ }
+
+ const int32_t half_win = wiener_win >> 1;
+ const uint8_t avg =
+ calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ // Block size, and the same block grown by half a window on every side.
+ const int32_t blk_w = h_end - h_start;
+ const int32_t blk_h = v_end - v_start;
+ const int32_t padded_w = blk_w + 2 * half_win;
+ const int32_t padded_h = blk_h + 2 * half_win;
+
+ // Strides of the intermediate buffers are rounded up to a multiple of 16.
+ const int32_t d_stride = (padded_w + 15) & ~15;
+ const int32_t s_stride = (blk_w + 15) & ~15;
+
+ // Top-left corners of the regions read from the 8-bit frames.
+ const uint8_t *const src_blk = src + v_start * src_stride + h_start;
+ const uint8_t *const dgd_blk =
+ dgd + (v_start - half_win) * dgd_stride + h_start - half_win;
+
+ // Source buffer: the downsampling flag is forwarded to the helper.
+ sub_avg_block_avx2(src_blk, src_stride, avg, blk_w, blk_h, src_avg, s_stride,
+ use_downsampled_wiener_stats);
+
+ // Degraded (dgd - avg) buffer used to fill H: always processed with the
+ // flag set to 0.
+ sub_avg_block_avx2(dgd_blk, dgd_stride, avg, padded_w, padded_h, dgd_avg,
+ d_stride, 0);
+
+ // Past the guard above, anything that is not the 7x7 window is the 5x5 one.
+ if (use_win7) {
+ compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, blk_w,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ } else {
+ compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, blk_w,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ }
+}
+
+// Broadcasts one 32-bit word to every 32-bit lane; the word holds the low 16
+// bits of 'a' in its lower half and 'b' shifted into its upper half.
+static INLINE __m256i pair_set_epi16(int a, int b) {
+ const uint32_t lower_half = (uint16_t)(a);
+ const uint32_t upper_half = ((uint32_t)(b)) << 16;
+ const int32_t packed_pair = (int32_t)(lower_half | upper_half);
+ return _mm256_set1_epi32(packed_pair);
+}
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_coeff =
+ pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt + j),
+ yy_loadu_256(flt + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
+
+// Used when params->r[0] > 0 and params->r[1] > 0, so every element of C and H
+// has to be computed. Per pixel, with u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u, f1 = flt0 - u and f2 = flt1 - u, the sums
+// of f1 * f1, f1 * f2, f2 * f2, f1 * s and f2 * s are gathered in 64-bit lanes
+// and finally divided by width * height.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on the caller passing a width that is a multiple of 8 (TODO confirm
+// at the call site).
+static AOM_INLINE void calc_proj_params_r0_r1_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int num_pixels = width * height;
+  const __m256i zeros = _mm256_setzero_si256();
+  // Four 64-bit partial sums per statistic.
+  __m256i acc_h00 = zeros, acc_h01 = zeros, acc_h11 = zeros;
+  __m256i acc_c0 = zeros, acc_c1 = zeros;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 8) {
+      const __m128i dat_u8 =
+          _mm_loadl_epi64((const __m128i *)(dat8 + row * dat_stride + col));
+      const __m128i src_u8 =
+          _mm_loadl_epi64((const __m128i *)(src8 + row * src_stride + col));
+      const __m256i flt0_px = _mm256_loadu_si256(
+          (const __m256i *)(flt0 + row * flt0_stride + col));
+      const __m256i flt1_px = _mm256_loadu_si256(
+          (const __m256i *)(flt1 + row * flt1_stride + col));
+
+      // Widen the 8-bit pixels to 32 bits and scale by 1 << SGRPROJ_RST_BITS.
+      const __m256i u =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(dat_u8), SGRPROJ_RST_BITS);
+      const __m256i src_up =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(src_u8), SGRPROJ_RST_BITS);
+
+      // Everything is expressed relative to the scaled degraded pixel.
+      const __m256i s = _mm256_sub_epi32(src_up, u);
+      const __m256i f1 = _mm256_sub_epi32(flt0_px, u);
+      const __m256i f2 = _mm256_sub_epi32(flt1_px, u);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f1_odd = _mm256_srli_epi64(f1, 32);
+      const __m256i f2_odd = _mm256_srli_epi64(f2, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      acc_h00 = _mm256_add_epi64(acc_h00, _mm256_mul_epi32(f1, f1));
+      acc_h00 = _mm256_add_epi64(acc_h00, _mm256_mul_epi32(f1_odd, f1_odd));
+
+      acc_h01 = _mm256_add_epi64(acc_h01, _mm256_mul_epi32(f1, f2));
+      acc_h01 = _mm256_add_epi64(acc_h01, _mm256_mul_epi32(f1_odd, f2_odd));
+
+      acc_h11 = _mm256_add_epi64(acc_h11, _mm256_mul_epi32(f2, f2));
+      acc_h11 = _mm256_add_epi64(acc_h11, _mm256_mul_epi32(f2_odd, f2_odd));
+
+      acc_c0 = _mm256_add_epi64(acc_c0, _mm256_mul_epi32(f1, s));
+      acc_c0 = _mm256_add_epi64(acc_c0, _mm256_mul_epi32(f1_odd, s_odd));
+
+      acc_c1 = _mm256_add_epi64(acc_c1, _mm256_mul_epi32(f2, s));
+      acc_c1 = _mm256_add_epi64(acc_c1, _mm256_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Interleave two accumulators so one horizontal reduction produces the pair
+  // {sum(first), sum(second)} in a single 128-bit register.
+  const __m256i c_pairs =
+      _mm256_add_epi64(_mm256_unpacklo_epi64(acc_c0, acc_c1),
+                       _mm256_unpackhi_epi64(acc_c0, acc_c1));
+  const __m128i c_sums = _mm_add_epi64(_mm256_extracti128_si256(c_pairs, 1),
+                                       _mm256_castsi256_si128(c_pairs));
+
+  const __m256i h0_pairs =
+      _mm256_add_epi64(_mm256_unpacklo_epi64(acc_h00, acc_h01),
+                       _mm256_unpackhi_epi64(acc_h00, acc_h01));
+  const __m128i h0_sums = _mm_add_epi64(_mm256_extracti128_si256(h0_pairs, 1),
+                                        _mm256_castsi256_si128(h0_pairs));
+
+  // H is symmetric, so H[1][0] is not accumulated: the row is reduced as
+  // {0, sum(h11)} and H[1][0] is copied from H[0][1] below.
+  const __m256i h1_pairs =
+      _mm256_add_epi64(_mm256_unpacklo_epi64(zeros, acc_h11),
+                       _mm256_unpackhi_epi64(zeros, acc_h11));
+  const __m128i h1_sums = _mm_add_epi64(_mm256_extracti128_si256(h1_pairs, 1),
+                                        _mm256_castsi256_si128(h1_pairs));
+
+  xx_storeu_128(C, c_sums);
+  xx_storeu_128(H[0], h0_sums);
+  xx_storeu_128(H[1], h1_sums);
+
+  // Normalise by the number of pixels.
+  H[0][0] /= num_pixels;
+  H[0][1] /= num_pixels;
+  H[1][1] /= num_pixels;
+  H[1][0] = H[0][1];
+  C[0] /= num_pixels;
+  C[1] /= num_pixels;
+}
+
+// Used when only params->r[0] > 0, where H[0][0] and C[0] are the only
+// statistics that need computing. With u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f1 = flt0 - u, it accumulates f1 * f1
+// and f1 * s, stores {sum, 0} into H[0] and C, and divides H[0][0] and C[0] by
+// width * height. H[1] is not written.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on width being a multiple of 8 (TODO confirm at the call site).
+static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
+                                                int height, int src_stride,
+                                                const uint8_t *dat8,
+                                                int dat_stride, int32_t *flt0,
+                                                int flt0_stride,
+                                                int64_t H[2][2], int64_t C[2]) {
+  const int num_pixels = width * height;
+  const __m256i zeros = _mm256_setzero_si256();
+  __m256i acc_h00 = zeros;
+  __m256i acc_c0 = zeros;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 8) {
+      const __m128i dat_u8 =
+          _mm_loadl_epi64((const __m128i *)(dat8 + row * dat_stride + col));
+      const __m128i src_u8 =
+          _mm_loadl_epi64((const __m128i *)(src8 + row * src_stride + col));
+      const __m256i flt0_px = _mm256_loadu_si256(
+          (const __m256i *)(flt0 + row * flt0_stride + col));
+
+      // Widen the 8-bit pixels to 32 bits and scale by 1 << SGRPROJ_RST_BITS.
+      const __m256i u =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(dat_u8), SGRPROJ_RST_BITS);
+      const __m256i src_up =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(src_u8), SGRPROJ_RST_BITS);
+      const __m256i s = _mm256_sub_epi32(src_up, u);
+      const __m256i f1 = _mm256_sub_epi32(flt0_px, u);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f1_odd = _mm256_srli_epi64(f1, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      acc_h00 = _mm256_add_epi64(acc_h00, _mm256_mul_epi32(f1, f1));
+      acc_h00 = _mm256_add_epi64(acc_h00, _mm256_mul_epi32(f1_odd, f1_odd));
+
+      acc_c0 = _mm256_add_epi64(acc_c0, _mm256_mul_epi32(f1, s));
+      acc_c0 = _mm256_add_epi64(acc_c0, _mm256_mul_epi32(f1_odd, s_odd));
+    }
+  }
+
+  // Fold 4 lanes -> 2 lanes -> 1 lane; the total ends up in the low 64 bits.
+  const __m128i h00_halves = _mm_add_epi64(
+      _mm256_extracti128_si256(acc_h00, 1), _mm256_castsi256_si128(acc_h00));
+  const __m128i h00_total =
+      _mm_add_epi64(h00_halves, _mm_srli_si128(h00_halves, 8));
+
+  const __m128i c0_halves = _mm_add_epi64(_mm256_extracti128_si256(acc_c0, 1),
+                                          _mm256_castsi256_si128(acc_c0));
+  const __m128i c0_total =
+      _mm_add_epi64(c0_halves, _mm_srli_si128(c0_halves, 8));
+
+  // Pair each total with a zero in the upper element: {total, 0}.
+  const __m128i zero128 = _mm256_castsi256_si128(zeros);
+  xx_storeu_128(C, _mm_unpacklo_epi64(c0_total, zero128));
+  xx_storeu_128(H[0], _mm_unpacklo_epi64(h00_total, zero128));
+
+  // Normalise by the number of pixels.
+  H[0][0] /= num_pixels;
+  C[0] /= num_pixels;
+}
+
+// Used when only params->r[1] > 0, where H[1][1] and C[1] are the only
+// statistics that need computing. With u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f2 = flt1 - u, it accumulates f2 * f2
+// and f2 * s, stores {0, sum} into H[1] and C, and divides H[1][1] and C[1] by
+// width * height. H[0] is not written.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on width being a multiple of 8 (TODO confirm at the call site).
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
+                                                int height, int src_stride,
+                                                const uint8_t *dat8,
+                                                int dat_stride, int32_t *flt1,
+                                                int flt1_stride,
+                                                int64_t H[2][2], int64_t C[2]) {
+  const int num_pixels = width * height;
+  const __m256i zeros = _mm256_setzero_si256();
+  __m256i acc_h11 = zeros;
+  __m256i acc_c1 = zeros;
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; col += 8) {
+      const __m128i dat_u8 =
+          _mm_loadl_epi64((const __m128i *)(dat8 + row * dat_stride + col));
+      const __m128i src_u8 =
+          _mm_loadl_epi64((const __m128i *)(src8 + row * src_stride + col));
+      const __m256i flt1_px = _mm256_loadu_si256(
+          (const __m256i *)(flt1 + row * flt1_stride + col));
+
+      // Widen the 8-bit pixels to 32 bits and scale by 1 << SGRPROJ_RST_BITS.
+      const __m256i u =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(dat_u8), SGRPROJ_RST_BITS);
+      const __m256i src_up =
+          _mm256_slli_epi32(_mm256_cvtepu8_epi32(src_u8), SGRPROJ_RST_BITS);
+      const __m256i s = _mm256_sub_epi32(src_up, u);
+      const __m256i f2 = _mm256_sub_epi32(flt1_px, u);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f2_odd = _mm256_srli_epi64(f2, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      acc_h11 = _mm256_add_epi64(acc_h11, _mm256_mul_epi32(f2, f2));
+      acc_h11 = _mm256_add_epi64(acc_h11, _mm256_mul_epi32(f2_odd, f2_odd));
+
+      acc_c1 = _mm256_add_epi64(acc_c1, _mm256_mul_epi32(f2, s));
+      acc_c1 = _mm256_add_epi64(acc_c1, _mm256_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Fold 4 lanes -> 2 lanes -> 1 lane; the total ends up in the low 64 bits.
+  const __m128i h11_halves = _mm_add_epi64(
+      _mm256_extracti128_si256(acc_h11, 1), _mm256_castsi256_si128(acc_h11));
+  const __m128i h11_total =
+      _mm_add_epi64(h11_halves, _mm_srli_si128(h11_halves, 8));
+
+  const __m128i c1_halves = _mm_add_epi64(_mm256_extracti128_si256(acc_c1, 1),
+                                          _mm256_castsi256_si128(acc_c1));
+  const __m128i c1_total =
+      _mm_add_epi64(c1_halves, _mm_srli_si128(c1_halves, 8));
+
+  // Pair each total with a zero in the lower element: {0, total}.
+  const __m128i zero128 = _mm256_castsi256_si128(zeros);
+  xx_storeu_128(C, _mm_unpacklo_epi64(zero128, c1_total));
+  xx_storeu_128(H[1], _mm_unpacklo_epi64(zero128, h11_total));
+
+  // Normalise by the number of pixels.
+  H[1][1] /= num_pixels;
+  C[1] /= num_pixels;
+}
+
+// AVX2 variant of av1_calc_proj_params_c. Selects the kernel that matches
+// which of params->r[0] and params->r[1] are positive; when neither is, H and
+// C are left untouched.
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
+                               int src_stride, const uint8_t *dat8,
+                               int dat_stride, int32_t *flt0, int flt0_stride,
+                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
+                               int64_t C[2], const sgr_params_type *params) {
+  const int r0_enabled = params->r[0] > 0;
+  const int r1_enabled = params->r[1] > 0;
+
+  if (r0_enabled && r1_enabled) {
+    calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
+                                dat_stride, flt0, flt0_stride, flt1,
+                                flt1_stride, H, C);
+    return;
+  }
+  if (r0_enabled) {
+    calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
+                             flt0, flt0_stride, H, C);
+    return;
+  }
+  if (r1_enabled) {
+    calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
+                             flt1, flt1_stride, H, C);
+  }
+}
+
+// 16-bit-pixel version of the kernel used when params->r[0] > 0 and
+// params->r[1] > 0: src8 and dat8 are passed through CONVERT_TO_SHORTPTR and
+// read as uint16_t. Per pixel, with u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u, f1 = flt0 - u and f2 = flt1 - u, the sums
+// of f1 * f1, f1 * f2, f2 * f2, f1 * s and f2 * s are gathered in 64-bit lanes
+// and finally divided by width * height.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on width being a multiple of 8 (TODO confirm at the call site).
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  const __m256i zero = _mm256_setzero_si256();
+  // Four 64-bit partial sums per statistic.
+  __m256i h00 = zero, h01 = zero, h11 = zero;
+  __m256i c0 = zero, c1 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: nothing here guarantees that a pixel row (base
+      // pointer plus i * stride) starts on a 16-byte boundary, which the
+      // aligned _mm_load_si128 would require.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      const __m256i flt0_px =
+          _mm256_loadu_si256((const __m256i *)(flt0 + i * flt0_stride + j));
+      const __m256i flt1_px =
+          _mm256_loadu_si256((const __m256i *)(flt1 + i * flt1_stride + j));
+
+      // Scale the pixels by 1 << SGRPROJ_RST_BITS and express everything
+      // relative to the scaled degraded pixel.
+      const __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      const __m256i s =
+          _mm256_sub_epi32(_mm256_slli_epi32(s_load, SGRPROJ_RST_BITS), d);
+      const __m256i f1 = _mm256_sub_epi32(flt0_px, d);
+      const __m256i f2 = _mm256_sub_epi32(flt1_px, d);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f1_odd = _mm256_srli_epi64(f1, 32);
+      const __m256i f2_odd = _mm256_srli_epi64(f2, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      h00 = _mm256_add_epi64(h00, _mm256_mul_epi32(f1, f1));
+      h00 = _mm256_add_epi64(h00, _mm256_mul_epi32(f1_odd, f1_odd));
+
+      h01 = _mm256_add_epi64(h01, _mm256_mul_epi32(f1, f2));
+      h01 = _mm256_add_epi64(h01, _mm256_mul_epi32(f1_odd, f2_odd));
+
+      h11 = _mm256_add_epi64(h11, _mm256_mul_epi32(f2, f2));
+      h11 = _mm256_add_epi64(h11, _mm256_mul_epi32(f2_odd, f2_odd));
+
+      c0 = _mm256_add_epi64(c0, _mm256_mul_epi32(f1, s));
+      c0 = _mm256_add_epi64(c0, _mm256_mul_epi32(f1_odd, s_odd));
+
+      c1 = _mm256_add_epi64(c1, _mm256_mul_epi32(f2, s));
+      c1 = _mm256_add_epi64(c1, _mm256_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Interleave two accumulators so one horizontal reduction produces the pair
+  // {sum(first), sum(second)} in a single 128-bit register.
+  __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+  const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+  c_low = _mm256_add_epi64(c_low, c_high);
+  const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+                                         _mm256_castsi256_si128(c_low));
+
+  __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+  const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+  h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+  const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+                                           _mm256_castsi256_si128(h0x_low));
+
+  // Using the symmetric properties of H, calculations of H[1][0] are not
+  // needed: the row is reduced as {0, sum(h11)}.
+  __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+  const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+  h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+  const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+                                           _mm256_castsi256_si128(h1x_low));
+
+  xx_storeu_128(C, c_128bit);
+  xx_storeu_128(H[0], h0x_128bit);
+  xx_storeu_128(H[1], h1x_128bit);
+
+  // Normalise by the number of pixels.
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// 16-bit-pixel version of the kernel used when only params->r[0] > 0: src8
+// and dat8 are passed through CONVERT_TO_SHORTPTR and read as uint16_t. With
+// u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u and
+// f1 = flt0 - u, it accumulates f1 * f1 and f1 * s, stores {sum, 0} into H[0]
+// and C, and divides H[0][0] and C[0] by width * height. H[1] is not written.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on width being a multiple of 8 (TODO confirm at the call site).
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i h00 = zero;
+  __m256i c0 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: nothing here guarantees that a pixel row (base
+      // pointer plus i * stride) starts on a 16-byte boundary, which the
+      // aligned _mm_load_si128 would require.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      const __m256i flt0_px =
+          _mm256_loadu_si256((const __m256i *)(flt0 + i * flt0_stride + j));
+
+      // Scale the pixels by 1 << SGRPROJ_RST_BITS and express everything
+      // relative to the scaled degraded pixel.
+      const __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      const __m256i s =
+          _mm256_sub_epi32(_mm256_slli_epi32(s_load, SGRPROJ_RST_BITS), d);
+      const __m256i f1 = _mm256_sub_epi32(flt0_px, d);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f1_odd = _mm256_srli_epi64(f1, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      h00 = _mm256_add_epi64(h00, _mm256_mul_epi32(f1, f1));
+      h00 = _mm256_add_epi64(h00, _mm256_mul_epi32(f1_odd, f1_odd));
+
+      c0 = _mm256_add_epi64(c0, _mm256_mul_epi32(f1, s));
+      c0 = _mm256_add_epi64(c0, _mm256_mul_epi32(f1_odd, s_odd));
+    }
+  }
+
+  // Fold 4 lanes -> 2 lanes -> 1 lane; the total ends up in the low 64 bits.
+  const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+                                           _mm256_castsi256_si128(h00));
+  const __m128i h00_val =
+      _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+  const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+                                          _mm256_castsi256_si128(c0));
+  const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+  // Pair each total with a zero in the upper element: {total, 0}.
+  const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  // Normalise by the number of pixels.
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// 16-bit-pixel version of the kernel used when only params->r[1] > 0: src8
+// and dat8 are passed through CONVERT_TO_SHORTPTR and read as uint16_t. With
+// u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u and
+// f2 = flt1 - u, it accumulates f2 * f2 and f2 * s, stores {0, sum} into H[1]
+// and C, and divides H[1][1] and C[1] by width * height. H[0] is not written.
+// Pixels are consumed eight at a time with no scalar tail, so this presumably
+// relies on width being a multiple of 8 (TODO confirm at the call site).
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i h11 = zero;
+  __m256i c1 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 8) {
+      // Unaligned loads: nothing here guarantees that a pixel row (base
+      // pointer plus i * stride) starts on a 16-byte boundary, which the
+      // aligned _mm_load_si128 would require.
+      const __m256i u_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(dat + i * dat_stride + j)));
+      const __m256i s_load = _mm256_cvtepu16_epi32(
+          _mm_loadu_si128((const __m128i *)(src + i * src_stride + j)));
+      const __m256i flt1_px =
+          _mm256_loadu_si256((const __m256i *)(flt1 + i * flt1_stride + j));
+
+      // Scale the pixels by 1 << SGRPROJ_RST_BITS and express everything
+      // relative to the scaled degraded pixel.
+      const __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      const __m256i s =
+          _mm256_sub_epi32(_mm256_slli_epi32(s_load, SGRPROJ_RST_BITS), d);
+      const __m256i f2 = _mm256_sub_epi32(flt1_px, d);
+
+      // _mm256_mul_epi32 multiplies only the low 32 bits of each 64-bit lane,
+      // so the odd-indexed elements are shifted down and handled separately.
+      const __m256i f2_odd = _mm256_srli_epi64(f2, 32);
+      const __m256i s_odd = _mm256_srli_epi64(s, 32);
+
+      h11 = _mm256_add_epi64(h11, _mm256_mul_epi32(f2, f2));
+      h11 = _mm256_add_epi64(h11, _mm256_mul_epi32(f2_odd, f2_odd));
+
+      c1 = _mm256_add_epi64(c1, _mm256_mul_epi32(f2, s));
+      c1 = _mm256_add_epi64(c1, _mm256_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Fold 4 lanes -> 2 lanes -> 1 lane; the total ends up in the low 64 bits.
+  const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+                                           _mm256_castsi256_si128(h11));
+  const __m128i h11_val =
+      _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+  const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+                                          _mm256_castsi256_si128(c1));
+  const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+  // Pair each total with a zero in the lower element: {0, total}.
+  const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  // Normalise by the number of pixels.
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c. Selects the high-bitdepth
+// kernel that matches which of params->r[0] and params->r[1] are positive;
+// when neither is, H and C are left untouched.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params) {
+  const int r0_enabled = params->r[0] > 0;
+  const int r1_enabled = params->r[1] > 0;
+
+  if (r0_enabled && r1_enabled) {
+    calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                        dat_stride, flt0, flt0_stride, flt1,
+                                        flt1_stride, H, C);
+    return;
+  }
+  if (r0_enabled) {
+    calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt0, flt0_stride, H, C);
+    return;
+  }
+  if (r1_enabled) {
+    calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+                                     dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_pixel_proj_error_avx2(  // returns the sum of e * e, where e = projected dat pixel - src pixel
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));  // added before the arithmetic right shift
+ __m256i sum64 = _mm256_setzero_si256();  // 64-bit SIMD accumulator of squared errors
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t err = 0;  // scalar accumulator for the per-row tail pixels
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ const __m256i xq1 = _mm256_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time
+ // Load 16 pixels each from source image and corrupted image
+ const __m256i s0 = yy_loadu_256(src + j);
+ const __m256i d0 = yy_loadu_256(dat + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices)
+
+ // Shift-up each pixel to match filtered image scaling
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0));
+ const __m256i u0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1));
+ // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
+
+ // Load 16 pixels from each filtered image
+ const __m256i flt0l = yy_loadu_256(flt0 + j);
+ const __m256i flt0h = yy_loadu_256(flt0 + j + 8);
+ const __m256i flt1l = yy_loadu_256(flt1 + j);
+ const __m256i flt1h = yy_loadu_256(flt1 + j + 8);
+ // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Subtract shifted corrupt image from each filtered image
+ const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l);
+ const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h);
+ const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l);
+ const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h);
+
+ // Multiply basis vectors by appropriate coefficients
+ const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0);
+ const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0);
+ const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1);
+ const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contributions from the two basis vectors
+ const __m256i vl = _mm256_add_epi32(v0l, v1l);
+ const __m256i vh = _mm256_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0]
+
+ // Saturate each i32 to an i16 then combine both halves
+ // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0]
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0]
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ // Zero-extend this row's 32-bit partial sums to 64 bits and accumulate
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_active = _mm256_set1_epi32(xq_on);
+ const __m256i xq_inactive =  // weight for the unshifted dat pixel: xq_on * (flt - (dat << SGRPROJ_RST_BITS)) == flt * xq_on + dat * this
+ _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 16 pixels from source image
+ const __m256i s0 = yy_loadu_256(src + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+
+ // Load 16 pixels from corrupted image and pad each u16 to i32
+ const __m256i d0 = yy_loadu_256(dat + j);
+ const __m256i d0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1));
+ const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0));
+ // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+ // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Load 16 pixels from the filtered image
+ const __m256i flth = yy_loadu_256(flt + j + 8);
+ const __m256i fltl = yy_loadu_256(flt + j);
+ // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active);
+ const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active);
+ const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive);
+ const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive);
+
+ const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq);
+ const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq);
+
+ // Shift this down with appropriate rounding
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Saturate each i32 to an i16 then combine both halves
+ // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as i16
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as i16
+
+ // Add the filter correction to the corrupt image, then subtract source
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ // Zero-extend this row's 32-bit partial sums to 64 bits and accumulate
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 32; j += 32) {
+ // Load 2x16 u16 from source image
+ const __m256i s0l = yy_loadu_256(src + j);
+ const __m256i s0h = yy_loadu_256(src + j + 16);
+
+ // Load 2x16 u16 from corrupted image
+ const __m256i d0l = yy_loadu_256(dat + j);
+ const __m256i d0h = yy_loadu_256(dat + j + 16);
+
+ // Subtract source image from corrupted image
+ const __m256i diffl = _mm256_sub_epi16(d0l, s0l);
+ const __m256i diffh = _mm256_sub_epi16(d0h, s0h);
+
+ // Square error and add adjacent values
+ const __m256i err0l = _mm256_madd_epi16(diffl, diffl);
+ const __m256i err0h = _mm256_madd_epi16(diffh, diffh);
+
+ sum32 = _mm256_add_epi32(sum32, err0l);
+ sum32 = _mm256_add_epi32(sum32, err0h);
+ }
+ // Zero-extend this row's 32-bit partial sums to 64 bits and accumulate
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 32)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ // Add the four 64-bit lanes of sum64 to the scalar tail total in err
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..50db305802
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+// Adds eight multiply-accumulate results into dst[0..7].
+// Sixteen bytes are loaded from src, permuted through *shuffle and
+// zero-extended to 16 bits (low and high 8-byte halves separately). Each half
+// is combined with *kl via _mm_madd_epi16, giving four 32-bit sums per half
+// that are added to dst[0..3] and dst[4..7] respectively.
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+                                  const __m128i *shuffle, const __m128i *kl) {
+  const __m128i shuffled = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+  const __m128i lo16 = _mm_cvtepu8_epi16(shuffled);
+  const __m128i hi16 = _mm_cvtepu8_epi16(_mm_srli_si128(shuffled, 8));
+  const __m128i acc_lo =
+      _mm_add_epi32(xx_loadu_128(dst), _mm_madd_epi16(*kl, lo16));
+  const __m128i acc_hi =
+      _mm_add_epi32(xx_loadu_128(dst + 4), _mm_madd_epi16(*kl, hi16));
+  xx_storeu_128(dst, acc_lo);
+  xx_storeu_128(dst + 4, acc_hi);
+}
+
+// Accumulates the 7x7 Wiener statistics contributed by one row of 8-bit
+// pixels, for columns [h_start, h_end).
+//   *sumX        += source pixels
+//   sumY[k][l]   += degraded pixels at window offset (row k, column l)
+//   M_int[k][l]  += degraded pixel * source pixel
+//   H_int[l*7+k] += products accumulated by acc_stat_sse41(), one group of
+//                   eight 32-bit slots per window row
+static INLINE void acc_stat_win7_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = 7;
+  int x;
+  // Columns are consumed in pairs. h_start is expected to be even because it
+  // is aligned to a tile edge plus a whole number of restoration units, both
+  // of which are 64-pixel aligned. h_end may be odd at the image edge, in
+  // which case the last column is handled on its own below.
+  assert(h_start % 2 == 0);
+  const int paired_end = h_end & ~1;
+  const int odd_tail = h_end & 1;
+
+  for (x = h_start; x < paired_end; x += 2) {
+    const uint8_t *win = dgd + x;
+    const uint8_t src_a = src[x];
+    const uint8_t src_b = src[x + 1];
+    *sumX += src_a + src_b;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint8_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int32_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint8_t dgd_a = win_row[wx];
+        const uint8_t dgd_b = win_row[wx + 1];
+        sumY[wy][wx] += dgd_a + dgd_b;
+        M_int[wy][wx] += dgd_a * src_a + dgd_b * src_b;
+
+        // Broadcast the two neighbouring pixels as interleaved 16-bit lanes.
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(win_row + wx)));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_sse41(h_row + r * 8, win + r * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+
+  // Odd width: fold in the final column.
+  if (odd_tail) {
+    const uint8_t *win = dgd + x;
+    const uint8_t src_a = src[x];
+    *sumX += src_a;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint8_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int32_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint8_t dgd_a = win_row[wx];
+        sumY[wy][wx] += dgd_a;
+        M_int[wy][wx] += dgd_a * src_a;
+
+        // acc_stat_sse41() expects two interleaved pixels but only one is
+        // available. The pixels feed a multiply-accumulate, so leaving the
+        // second slot at zero makes it contribute nothing.
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)dgd_a));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_sse41(h_row + r * 8, win + r * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener filter statistics M (cross-correlation) and H
+// (auto-covariance) for the 7x7 window on 8-bit input.
+//
+// Rows [v_start, v_end) are walked in strips of at most 64 rows. Within a
+// strip, each visited row is accumulated into 32-bit per-row buffers, scaled
+// by the current downsampling factor and added to 32-bit strip accumulators;
+// at the end of the strip those are flushed into 64-bit totals and cleared.
+// The average returned by find_average() is then removed from the totals when
+// M and H are written out.
+static INLINE void compute_stats_win7_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
+  const int wiener_win = WIENER_WIN;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint8_t avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  // Per-row scratch buffers.
+  int32_t row_sumX = 0;
+  int32_t row_sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t M_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t H_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  // 32-bit accumulators covering one strip.
+  int32_t M_strip[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t H_strip[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  // 64-bit totals covering the whole region.
+  int64_t M_total[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int64_t H_total[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (int strip = v_start; strip < v_end; strip += 64) {
+    const int strip_end = AOMMIN(64, v_end - strip) + strip;
+    for (int row = strip; row < strip_end; row += downsample_factor) {
+      // Fewer rows left in the strip than the nominal factor: weight this row
+      // by the number of rows actually remaining.
+      if (use_downsampled_wiener_stats &&
+          (strip_end - row < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = strip_end - row;
+      }
+      row_sumX = 0;
+      memset(row_sumY, 0, sizeof(row_sumY));
+      memset(M_row, 0, sizeof(M_row));
+      memset(H_row, 0, sizeof(H_row));
+      acc_stat_win7_one_line_sse4_1(
+          dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &row_sumX, row_sumY, M_row, H_row);
+
+      // Weight the row by the downsampling factor and add it to the strip.
+      sumX += row_sumX * downsample_factor;
+      for (int k = 0; k < wiener_win; ++k) {
+        for (int l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (row_sumY[k][l] * downsample_factor);
+          M_strip[k][l] += (M_row[k][l] * downsample_factor);
+        }
+      }
+      for (int k = 0; k < WIENER_WIN2; ++k) {
+        for (int l = 0; l < WIENER_WIN * 8; ++l) {
+          H_strip[k][l] += (H_row[k][l] * downsample_factor);
+        }
+      }
+    }
+
+    // Flush the strip accumulators into the 64-bit totals.
+    for (int k = 0; k < wiener_win; ++k) {
+      for (int l = 0; l < wiener_win; ++l) {
+        M_total[k][l] += M_strip[k][l];
+        M_strip[k][l] = 0;
+      }
+    }
+    for (int k = 0; k < WIENER_WIN2; ++k) {
+      for (int l = 0; l < WIENER_WIN * 8; ++l) {
+        H_total[k][l] += H_strip[k][l];
+        H_strip[k][l] = 0;
+      }
+    }
+  }
+
+  // Remove the average and write the results in the layout expected by the
+  // caller (index l * wiener_win + k for window row k, column l).
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] =
+          M_total[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+      int64_t *H_out = H + idx0 * wiener_win2;
+      const int64_t *H_in = &H_total[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          H_out[m * wiener_win + n] = H_in[n * 8 + m] + avg_square_sum -
+                                      (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High-bitdepth counterpart of acc_stat_sse41(): adds eight multiply-add
+// results, widened to 64 bits, into dst[0..7].
+static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
+                                         const __m128i *shuffle,
+                                         const __m128i *dgd_ijkl) {
+  // Two overlapping 128-bit loads covering dgd[0..11]:
+  //   lo_in = [7 6 5 4 3 2 1 0]     as u16 values (dgd indices)
+  //   hi_in = [11 10 9 8 7 6 5 4]   as u16 values (dgd indices)
+  // The overlap allows the same shuffle control to be used for both.
+  const __m128i lo_in = xx_loadu_128(dgd);
+  const __m128i hi_in = xx_loadu_128(dgd + 4);
+
+  // Rearrange the u16 values (with a byte shuffle) into neighbouring pairs:
+  //   lo_pairs = [4 3 3 2 2 1 1 0]
+  //   hi_pairs = [8 7 7 6 6 5 5 4]
+  const __m128i lo_pairs = _mm_shuffle_epi8(lo_in, *shuffle);
+  const __m128i hi_pairs = _mm_shuffle_epi8(hi_in, *shuffle);
+
+  // Multiply the 16-bit lanes by *dgd_ijkl and add adjacent products,
+  // leaving four 32-bit sums per register.
+  const __m128i lo_mac = _mm_madd_epi16(*dgd_ijkl, lo_pairs);
+  const __m128i hi_mac = _mm_madd_epi16(*dgd_ijkl, hi_pairs);
+
+  // Zero-extend the 32-bit sums to 64 bits, two at a time.
+  const __m128i mac01 = _mm_cvtepu32_epi64(lo_mac);
+  const __m128i mac23 = _mm_cvtepu32_epi64(_mm_srli_si128(lo_mac, 8));
+  const __m128i mac45 = _mm_cvtepu32_epi64(hi_mac);
+  const __m128i mac67 = _mm_cvtepu32_epi64(_mm_srli_si128(hi_mac, 8));
+
+  // Accumulate into the eight 64-bit destination slots.
+  xx_storeu_128(dst, _mm_add_epi64(xx_loadu_128(dst), mac01));
+  xx_storeu_128(dst + 2, _mm_add_epi64(xx_loadu_128(dst + 2), mac23));
+  xx_storeu_128(dst + 4, _mm_add_epi64(xx_loadu_128(dst + 4), mac45));
+  xx_storeu_128(dst + 6, _mm_add_epi64(xx_loadu_128(dst + 6), mac67));
+}
+
+// Accumulates the 7x7 Wiener statistics contributed by one row of
+// high-bitdepth pixels, for columns [h_start, h_end). sumX/sumY collect pixel
+// sums, M_int the degraded*source products and H_int the degraded*degraded
+// products produced by acc_stat_highbd_sse41().
+static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
+    const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+    int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = WIENER_WIN;
+  int x;
+  // Columns are consumed in pairs. h_start is expected to be even because it
+  // is aligned to a tile edge plus a whole number of restoration units, both
+  // of which are 64-pixel aligned. h_end may be odd at the image edge, in
+  // which case the last column is handled on its own below.
+  assert(h_start % 2 == 0);
+  const int paired_end = h_end & ~1;
+  const int odd_tail = h_end & 1;
+
+  for (x = h_start; x < paired_end; x += 2) {
+    const uint16_t src_a = src[x];
+    const uint16_t src_b = src[x + 1];
+    *sumX += src_a + src_b;
+    const uint16_t *win = dgd + x;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint16_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int64_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint16_t dgd_a = win_row[wx];
+        const uint16_t dgd_b = win_row[wx + 1];
+        sumY[wy][wx] += dgd_a + dgd_b;
+        M_int[wy][wx] += dgd_a * src_a + dgd_b * src_b;
+
+        // Read two neighbouring u16 pixels as one u32 and broadcast it to all
+        // four 32-bit slots: [b a b a b a b a] as u16.
+        const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(win_row + wx));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_sse41(h_row + r * 8, win + r * dgd_stride, shuffle,
+                                &dgd_ijkl);
+        }
+      }
+    }
+  }
+
+  // Odd width: fold in the final column.
+  if (odd_tail) {
+    const uint16_t src_a = src[x];
+    *sumX += src_a;
+    const uint16_t *win = dgd + x;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint16_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int64_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint16_t dgd_a = win_row[wx];
+        sumY[wy][wx] += dgd_a;
+        M_int[wy][wx] += dgd_a * src_a;
+
+        // acc_stat_highbd_sse41() expects two interleaved pixels but only one
+        // is available. The pixels feed a multiply-accumulate, so leaving the
+        // second slot at zero makes it contribute nothing.
+        const __m128i dgd_ijkl = _mm_set1_epi32((int)dgd_a);
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_sse41(h_row + r * 8, win + r * dgd_stride, shuffle,
+                                &dgd_ijkl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener filter statistics M and H for the 7x7 window on
+// high-bitdepth input. Every row in [v_start, v_end) is accumulated directly
+// into 64-bit buffers; the average returned by find_average_highbd() is then
+// removed and the results are divided by a bit-depth dependent factor
+// (16 for 12-bit, 4 for 10-bit, 1 otherwise).
+static INLINE void compute_stats_highbd_win7_opt_sse4_1(
+    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+    int64_t *H, aom_bit_depth_t bit_depth) {
+  const int wiener_win = WIENER_WIN;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  // Only the first 128 bits of the shuffle control table are needed here.
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+  for (int row = v_start; row < v_end; row++) {
+    acc_stat_highbd_win7_one_line_sse4_1(
+        dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+        dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+  }
+
+  const uint8_t bit_depth_divider =
+      (bit_depth == AOM_BITS_12) ? 16 : (bit_depth == AOM_BITS_10) ? 4 : 1;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      const int64_t m_centered =
+          M_int[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+      M[idx0] = m_centered / bit_depth_divider;
+      int64_t *H_out = H + idx0 * wiener_win2;
+      const int64_t *H_in = &H_int[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          const int64_t h_centered =
+              H_in[n * 8 + m] +
+              (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]));
+          H_out[m * wiener_win + n] = h_centered / bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+// Accumulates the chroma-window (WIENER_WIN_CHROMA) Wiener statistics
+// contributed by one row of high-bitdepth pixels, for columns
+// [h_start, h_end). sumX/sumY collect pixel sums, M_int the degraded*source
+// products and H_int the degraded*degraded products produced by
+// acc_stat_highbd_sse41().
+static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
+    const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  int x;
+  // Columns are consumed in pairs. h_start is expected to be even because it
+  // is aligned to a tile edge plus a whole number of restoration units, both
+  // of which are 64-pixel aligned. h_end may be odd at the image edge, in
+  // which case the last column is handled on its own below.
+  assert(h_start % 2 == 0);
+  const int paired_end = h_end & ~1;
+  const int odd_tail = h_end & 1;
+
+  for (x = h_start; x < paired_end; x += 2) {
+    const uint16_t src_a = src[x];
+    const uint16_t src_b = src[x + 1];
+    *sumX += src_a + src_b;
+    const uint16_t *win = dgd + x;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint16_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int64_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint16_t dgd_a = win_row[wx];
+        const uint16_t dgd_b = win_row[wx + 1];
+        sumY[wy][wx] += dgd_a + dgd_b;
+        M_int[wy][wx] += dgd_a * src_a + dgd_b * src_b;
+
+        // Read two neighbouring u16 pixels as one u32 and broadcast it to all
+        // four 32-bit slots: [b a b a b a b a] as u16.
+        const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(win_row + wx));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_sse41(h_row + r * 8, win + r * dgd_stride, shuffle,
+                                &dgd_ijkl);
+        }
+      }
+    }
+  }
+
+  // Odd width: fold in the final column.
+  if (odd_tail) {
+    const uint16_t src_a = src[x];
+    *sumX += src_a;
+    const uint16_t *win = dgd + x;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint16_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int64_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint16_t dgd_a = win_row[wx];
+        sumY[wy][wx] += dgd_a;
+        M_int[wy][wx] += dgd_a * src_a;
+
+        // acc_stat_highbd_sse41() expects two interleaved pixels but only one
+        // is available. The pixels feed a multiply-accumulate, so leaving the
+        // second slot at zero makes it contribute nothing.
+        const __m128i dgd_ijkl = _mm_set1_epi32((int)dgd_a);
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_highbd_sse41(h_row + r * 8, win + r * dgd_stride, shuffle,
+                                &dgd_ijkl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener filter statistics M and H for the chroma window
+// (WIENER_WIN_CHROMA) on high-bitdepth input. Every row in [v_start, v_end)
+// is accumulated directly into 64-bit buffers; the average returned by
+// find_average_highbd() is then removed and the results are divided by a
+// bit-depth dependent factor (16 for 12-bit, 4 for 10-bit, 1 otherwise).
+static INLINE void compute_stats_highbd_win5_opt_sse4_1(
+    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+    int64_t *H, aom_bit_depth_t bit_depth) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  // Only the first 128 bits of the shuffle control table are needed here.
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+  for (int row = v_start; row < v_end; row++) {
+    acc_stat_highbd_win5_one_line_sse4_1(
+        dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+        dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+  }
+
+  const uint8_t bit_depth_divider =
+      (bit_depth == AOM_BITS_12) ? 16 : (bit_depth == AOM_BITS_10) ? 4 : 1;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      const int64_t m_centered =
+          M_int[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+      M[idx0] = m_centered / bit_depth_divider;
+      int64_t *H_out = H + idx0 * wiener_win2;
+      const int64_t *H_in = &H_int[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          const int64_t h_centered =
+              H_in[n * 8 + m] +
+              (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]));
+          H_out[m * wiener_win + n] = h_centered / bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+// SSE4.1 entry point for high-bitdepth Wiener statistics. Window sizes
+// WIENER_WIN and WIENER_WIN_CHROMA are served by the vectorised kernels in
+// this file; any other window size is forwarded to the C implementation.
+void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8,
+                                     const uint8_t *src8, int h_start,
+                                     int h_end, int v_start, int v_end,
+                                     int dgd_stride, int src_stride, int64_t *M,
+                                     int64_t *H, aom_bit_depth_t bit_depth) {
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+                                         v_end, dgd_stride, src_stride, M, H,
+                                         bit_depth);
+    return;
+  }
+  if (wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+                                         v_end, dgd_stride, src_stride, M, H,
+                                         bit_depth);
+    return;
+  }
+  av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+                             v_end, dgd_stride, src_stride, M, H, bit_depth);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Accumulates the chroma-window (WIENER_WIN_CHROMA) Wiener statistics
+// contributed by one row of 8-bit pixels, for columns [h_start, h_end).
+// sumX/sumY collect pixel sums, M_int the degraded*source products and H_int
+// the degraded*degraded products produced by acc_stat_sse41().
+static INLINE void acc_stat_win5_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  int x;
+  // Columns are consumed in pairs. h_start is expected to be even because it
+  // is aligned to a tile edge plus a whole number of restoration units, both
+  // of which are 64-pixel aligned. h_end may be odd at the image edge, in
+  // which case the last column is handled on its own below.
+  assert(h_start % 2 == 0);
+  const int paired_end = h_end & ~1;
+  const int odd_tail = h_end & 1;
+
+  for (x = h_start; x < paired_end; x += 2) {
+    const uint8_t *win = dgd + x;
+    const uint8_t src_a = src[x];
+    const uint8_t src_b = src[x + 1];
+    *sumX += src_a + src_b;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint8_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int32_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint8_t dgd_a = win_row[wx];
+        const uint8_t dgd_b = win_row[wx + 1];
+        sumY[wy][wx] += dgd_a + dgd_b;
+        M_int[wy][wx] += dgd_a * src_a + dgd_b * src_b;
+
+        // Broadcast the two neighbouring pixels as interleaved 16-bit lanes.
+        const __m128i kl =
+            _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(win_row + wx)));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_sse41(h_row + r * 8, win + r * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+
+  // Odd width: fold in the final column.
+  if (odd_tail) {
+    const uint8_t *win = dgd + x;
+    const uint8_t src_a = src[x];
+    *sumX += src_a;
+    for (int wy = 0; wy < wiener_win; wy++) {
+      const uint8_t *win_row = win + wy * dgd_stride;
+      for (int wx = 0; wx < wiener_win; wx++) {
+        int32_t *h_row = &H_int[(wx * wiener_win + wy)][0];
+        const uint8_t dgd_a = win_row[wx];
+        sumY[wy][wx] += dgd_a;
+        M_int[wy][wx] += dgd_a * src_a;
+
+        // acc_stat_sse41() expects two interleaved pixels but only one is
+        // available. The pixels feed a multiply-accumulate, so leaving the
+        // second slot at zero makes it contribute nothing.
+        const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)dgd_a));
+        for (int r = 0; r < wiener_win; r++) {
+          acc_stat_sse41(h_row + r * 8, win + r * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener filter statistics M (cross-correlation) and H
+// (auto-covariance) for the chroma window (WIENER_WIN_CHROMA) on 8-bit input.
+//
+// Rows [v_start, v_end) are walked in strips of at most 64 rows. Within a
+// strip, each visited row is accumulated into 32-bit per-row buffers, scaled
+// by the current downsampling factor and added to 32-bit strip accumulators;
+// at the end of the strip those are flushed into 64-bit totals and cleared.
+// The average returned by find_average() is then removed from the totals when
+// M and H are written out.
+static INLINE void compute_stats_win5_opt_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+    int use_downsampled_wiener_stats) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const uint8_t avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  // Per-row scratch buffers.
+  int32_t row_sumX = 0;
+  int32_t row_sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t M_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t H_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  // 32-bit accumulators covering one strip.
+  int32_t M_strip[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t H_strip[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  // 64-bit totals covering the whole region.
+  int64_t M_total[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int64_t H_total[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  int downsample_factor =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+  for (int strip = v_start; strip < v_end; strip += 64) {
+    const int strip_end = AOMMIN(64, v_end - strip) + strip;
+    for (int row = strip; row < strip_end; row += downsample_factor) {
+      // Fewer rows left in the strip than the nominal factor: weight this row
+      // by the number of rows actually remaining.
+      if (use_downsampled_wiener_stats &&
+          (strip_end - row < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+        downsample_factor = strip_end - row;
+      }
+      row_sumX = 0;
+      memset(row_sumY, 0, sizeof(row_sumY));
+      memset(M_row, 0, sizeof(M_row));
+      memset(H_row, 0, sizeof(H_row));
+      acc_stat_win5_one_line_sse4_1(
+          dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &row_sumX, row_sumY, M_row, H_row);
+
+      // Weight the row by the downsampling factor and add it to the strip.
+      sumX += row_sumX * downsample_factor;
+      for (int k = 0; k < wiener_win; ++k) {
+        for (int l = 0; l < wiener_win; ++l) {
+          sumY[k][l] += (row_sumY[k][l] * downsample_factor);
+          M_strip[k][l] += (M_row[k][l] * downsample_factor);
+        }
+      }
+      for (int k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+        for (int l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+          H_strip[k][l] += (H_row[k][l] * downsample_factor);
+        }
+      }
+    }
+
+    // Flush the strip accumulators into the 64-bit totals.
+    for (int k = 0; k < wiener_win; ++k) {
+      for (int l = 0; l < wiener_win; ++l) {
+        M_total[k][l] += M_strip[k][l];
+        M_strip[k][l] = 0;
+      }
+    }
+    for (int k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+      for (int l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+        H_total[k][l] += H_strip[k][l];
+        H_strip[k][l] = 0;
+      }
+    }
+  }
+
+  // Remove the average and write the results in the layout expected by the
+  // caller (index l * wiener_win + k for window row k, column l).
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      M[idx0] =
+          M_total[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+      int64_t *H_out = H + idx0 * wiener_win2;
+      const int64_t *H_in = &H_total[idx0][0];
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          H_out[m * wiener_win + n] = H_in[n * 8 + m] + avg_square_sum -
+                                      (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+// SSE4.1 entry point for 8-bit Wiener statistics. Window sizes WIENER_WIN and
+// WIENER_WIN_CHROMA are served by the vectorised kernels in this file, which
+// do not use dgd_avg/src_avg; any other window size is forwarded, with all
+// arguments, to the C implementation.
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+                              const uint8_t *src, int16_t *dgd_avg,
+                              int16_t *src_avg, int h_start, int h_end,
+                              int v_start, int v_end, int dgd_stride,
+                              int src_stride, int64_t *M, int64_t *H,
+                              int use_downsampled_wiener_stats) {
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H,
+                                  use_downsampled_wiener_stats);
+    return;
+  }
+  if (wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                  dgd_stride, src_stride, M, H,
+                                  use_downsampled_wiener_stats);
+    return;
+  }
+  av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+                      v_start, v_end, dgd_stride, src_stride, M, H,
+                      use_downsampled_wiener_stats);
+}
+
+// Builds a vector whose 32-bit lanes each hold the low 16 bits of `a` in the
+// lower half and the low 16 bits of `b` in the upper half, i.e. the 16-bit
+// lanes read a, b, a, b, ... from the lowest lane upwards.
+static INLINE __m128i pair_set_epi16(int a, int b) {
+  const uint32_t low_half = (uint16_t)a;
+  const uint32_t high_half = ((uint32_t)b) << 16;
+  return _mm_set1_epi32((int32_t)(low_half | high_half));
+}
+
+// Returns the sum of squared errors between the 8-bit source `src8` and the
+// self-guided projection of the 8-bit degraded image `dat8`, over a
+// width x height region.
+//
+// Three cases are distinguished by params->r[]:
+//  - both radii > 0: both filter outputs (flt0, flt1) contribute, weighted by
+//    xq[0] and xq[1];
+//  - exactly one radius > 0: only the corresponding filter output and weight
+//    are used;
+//  - neither radius > 0: the result is the plain squared difference between
+//    dat8 and src8.
+// Each row is processed 8 pixels (16 in the last case) at a time with SIMD;
+// leftover columns are handled by scalar code that adds directly into `err`.
+//
+// In every case the 32-bit SIMD partial sums are widened into the 64-bit
+// accumulator once per row, so a 32-bit lane never holds more than one row's
+// worth of squared error. Previously the "neither" case carried its 32-bit
+// lanes across all rows, which could wrap for large, high-error regions.
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+  __m128i sum64 = _mm_setzero_si128();
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  int64_t err = 0;
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    // 16-bit lanes alternate xq[0], xq[1] to match the interleaved operands.
+    __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j <= width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt0_16b =
+            _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+        const __m128i flt1_16b =
+            _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+        const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+        const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+        const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+        // v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+        const __m128i v0 = _mm_madd_epi16(
+            xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m128i v1 = _mm_madd_epi16(
+            xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      // Scalar tail for the remaining (width % 8) columns.
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      // Widen this row's four 32-bit sums to 64 bits and accumulate.
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    // Pairing flt with d lets a single madd compute
+    // xq * flt - xq * (d << SGRPROJ_RST_BITS).
+    const __m128i xq_coeff =
+        pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+    const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j <= width - 8; j += 8) {
+        const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+        const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+        const __m128i flt_16b =
+            _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
+        const __m128i v0 =
+            _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
+        const __m128i v1 =
+            _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
+        const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+        const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+        const __m128i e0 =
+            _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+        const __m128i err0 = _mm_madd_epi16(e0, e0);
+        sum32 = _mm_add_epi32(sum32, err0);
+      }
+      // Scalar tail for the remaining (width % 8) columns.
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq_active * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      // Widen this row's four 32-bit sums to 64 bits and accumulate.
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  } else {
+    for (i = 0; i < height; ++i) {
+      // Reset per row so the 32-bit lanes cannot wrap across a tall region.
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m128i d = xx_loadu_128(dat + j);
+        const __m128i s = xx_loadu_128(src + j);
+        const __m128i d0 = _mm_cvtepu8_epi16(d);
+        const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+        const __m128i s0 = _mm_cvtepu8_epi16(s);
+        const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+        const __m128i diff0 = _mm_sub_epi16(d0, s0);
+        const __m128i diff1 = _mm_sub_epi16(d1, s1);
+        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+        sum32 = _mm_add_epi32(sum32, err0);
+        sum32 = _mm_add_epi32(sum32, err1);
+      }
+      // Scalar tail for the remaining (width % 16) columns.
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+      // Widen this row's four 32-bit sums to 64 bits and accumulate.
+      const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+      const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum64_0);
+      sum64 = _mm_add_epi64(sum64, sum64_1);
+    }
+  }
+  // Fold the two 64-bit SIMD lanes into the scalar total.
+  int64_t sum[2];
+  xx_storeu_128(sum, sum64);
+  err += sum[0] + sum[1];
+  return err;
+}
+
+// Low bit-depth projection statistics for the case where both filters are
+// enabled (params->r[0] > 0 and params->r[1] > 0), so every element of C and
+// H has to be computed.
+//
+// With u = dat << SGRPROJ_RST_BITS, f1 = flt0 - u, f2 = flt1 - u and
+// s = (src << SGRPROJ_RST_BITS) - u, the sums taken over the block are
+//   H[0][0] = sum(f1 * f1)   H[0][1] = H[1][0] = sum(f1 * f2)
+//   H[1][1] = sum(f2 * f2)   C[0] = sum(f1 * s)   C[1] = sum(f2 * s)
+// and every entry is finally divided by width * height.
+//
+// NOTE(review): pixels are consumed four at a time with no scalar tail, so
+// width is presumably a multiple of 4 -- confirm against the callers.
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, h01, h11, c0, c1;
+  const __m128i zero = _mm_setzero_si128();
+  h01 = h11 = c0 = c1 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      // Fetch four 8-bit pixels through the 32-bit load helper instead of
+      // dereferencing the byte pointer as an int: the address carries no
+      // 4-byte alignment guarantee and the int access breaks aliasing rules.
+      const __m128i u_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(dat + i * dat_stride + j));
+      const __m128i s_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(src + i * src_stride + j));
+      __m128i f1 = xx_loadu_128(flt0 + i * flt0_stride + j);
+      __m128i f2 = xx_loadu_128(flt1 + i * flt1_stride + j);
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2 (the "even"
+      // lanes) into 64-bit products; shifting each 64-bit half right by 32
+      // moves lanes 1 and 3 (the "odd" lanes) into position for a second
+      // multiply.
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i h01_even = _mm_mul_epi32(f1, f2);
+      const __m128i h01_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+      h01 = _mm_add_epi64(h01, h01_even);
+      h01 = _mm_add_epi64(h01, h01_odd);
+
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  // Each accumulator holds two partial sums. Interleaving a pair of
+  // accumulators lets a single add produce both totals, already laid out as
+  // { C[0], C[1] } and { H[0][0], H[0][1] }.
+  __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+  const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+  c_low = _mm_add_epi64(c_low, c_high);
+
+  __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+  const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+  h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Using the symmetric properties of H, calculations of H[1][0] are not
+  // needed: it is stored as zero here and copied from H[0][1] below.
+  __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+  const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+  h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+  xx_storeu_128(C, c_low);
+  xx_storeu_128(H[0], h0x_low);
+  xx_storeu_128(H[1], h1x_low);
+
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+
+  // Since H is a symmetric matrix
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+}
+
+// Low bit-depth projection statistics when only params->r[0] > 0. In this
+// case only H[0][0] and C[0] are non-zero and need to be computed.
+//
+// With u = dat << SGRPROJ_RST_BITS, f1 = flt0 - u and
+// s = (src << SGRPROJ_RST_BITS) - u:
+//   H[0][0] = sum(f1 * f1) / (width * height)
+//   C[0]    = sum(f1 * s)  / (width * height)
+// The 128-bit stores also write zero to H[0][1] and C[1]; H[1][0] and H[1][1]
+// are not written by this function.
+//
+// NOTE(review): pixels are consumed four at a time with no scalar tail, so
+// width is presumably a multiple of 4 -- confirm against the callers.
+static AOM_INLINE void calc_proj_params_r0_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h00, c0;
+  const __m128i zero = _mm_setzero_si128();
+  c0 = h00 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      // Fetch four 8-bit pixels through the 32-bit load helper instead of
+      // dereferencing the byte pointer as an int: the address carries no
+      // 4-byte alignment guarantee and the int access breaks aliasing rules.
+      const __m128i u_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(dat + i * dat_stride + j));
+      const __m128i s_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(src + i * src_stride + j));
+      __m128i f1 = xx_loadu_128(flt0 + i * flt0_stride + j);
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f1 = _mm_sub_epi32(f1, d);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2; the 32-bit right
+      // shift of each 64-bit half exposes lanes 1 and 3 for a second multiply.
+      const __m128i h00_even = _mm_mul_epi32(f1, f1);
+      const __m128i h00_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+      h00 = _mm_add_epi64(h00, h00_even);
+      h00 = _mm_add_epi64(h00, h00_odd);
+
+      const __m128i c0_even = _mm_mul_epi32(f1, s);
+      const __m128i c0_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+      c0 = _mm_add_epi64(c0, c0_even);
+      c0 = _mm_add_epi64(c0, c0_odd);
+    }
+  }
+  // Fold the upper 64-bit partial sum onto the lower one.
+  const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+  const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+  // Pair each total with zero so the stores produce { total, 0 }.
+  const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+  const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[0], h0x);
+
+  H[0][0] /= size;
+  C[0] /= size;
+}
+
+// Low bit-depth projection statistics when only params->r[1] > 0. In this
+// case only H[1][1] and C[1] are non-zero and need to be computed.
+//
+// With u = dat << SGRPROJ_RST_BITS, f2 = flt1 - u and
+// s = (src << SGRPROJ_RST_BITS) - u:
+//   H[1][1] = sum(f2 * f2) / (width * height)
+//   C[1]    = sum(f2 * s)  / (width * height)
+// The 128-bit stores also write zero to H[1][0] and C[0]; H[0][0] and H[0][1]
+// are not written by this function.
+//
+// NOTE(review): pixels are consumed four at a time with no scalar tail, so
+// width is presumably a multiple of 4 -- confirm against the callers.
+static AOM_INLINE void calc_proj_params_r1_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  __m128i h11, c1;
+  const __m128i zero = _mm_setzero_si128();
+  c1 = h11 = zero;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      // Fetch four 8-bit pixels through the 32-bit load helper instead of
+      // dereferencing the byte pointer as an int: the address carries no
+      // 4-byte alignment guarantee and the int access breaks aliasing rules.
+      const __m128i u_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(dat + i * dat_stride + j));
+      const __m128i s_load =
+          _mm_cvtepu8_epi32(xx_loadl_32(src + i * src_stride + j));
+      __m128i f2 = xx_loadu_128(flt1 + i * flt1_stride + j);
+      __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+      __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+      s = _mm_sub_epi32(s, d);
+      f2 = _mm_sub_epi32(f2, d);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2; the 32-bit right
+      // shift of each 64-bit half exposes lanes 1 and 3 for a second multiply.
+      const __m128i h11_even = _mm_mul_epi32(f2, f2);
+      const __m128i h11_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+      h11 = _mm_add_epi64(h11, h11_even);
+      h11 = _mm_add_epi64(h11, h11_odd);
+
+      const __m128i c1_even = _mm_mul_epi32(f2, s);
+      const __m128i c1_odd =
+          _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+      c1 = _mm_add_epi64(c1, c1_even);
+      c1 = _mm_add_epi64(c1, c1_odd);
+    }
+  }
+
+  // Fold the upper 64-bit partial sum onto the lower one.
+  const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+  const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+  // Pair zero with each total so the stores produce { 0, total }.
+  const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+  const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+  xx_storeu_128(C, c);
+  xx_storeu_128(H[1], h1x);
+
+  H[1][1] /= size;
+  C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_c.
+// Picks the specialised kernel according to which entries of params->r are
+// positive: both filters, only the first, or only the second. If neither is
+// positive, H and C are left untouched.
+void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height,
+                                 int src_stride, const uint8_t *dat8,
+                                 int dat_stride, int32_t *flt0, int flt0_stride,
+                                 int32_t *flt1, int flt1_stride,
+                                 int64_t H[2][2], int64_t C[2],
+                                 const sgr_params_type *params) {
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  if (use_flt0 && use_flt1) {
+    calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8,
+                                  dat_stride, flt0, flt0_stride, flt1,
+                                  flt1_stride, H, C);
+    return;
+  }
+  if (use_flt0) {
+    calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt0, flt0_stride, H, C);
+    return;
+  }
+  if (use_flt1) {
+    calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8,
+                               dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+// High bit-depth projection statistics for the case where both filters are
+// enabled (params->r[0] > 0 and params->r[1] > 0). src8/dat8 are converted
+// with CONVERT_TO_SHORTPTR() and read as 16-bit pixels.
+//
+// With u = dat << SGRPROJ_RST_BITS, f1 = flt0 - u, f2 = flt1 - u and
+// s = (src << SGRPROJ_RST_BITS) - u, the sums taken over the block are
+//   H[0][0] = sum(f1 * f1)   H[0][1] = H[1][0] = sum(f1 * f2)
+//   H[1][1] = sum(f2 * f2)   C[0] = sum(f1 * s)   C[1] = sum(f2 * s)
+// and every entry is finally divided by width * height.
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *const dat = CONVERT_TO_SHORTPTR(dat8);
+  const int num_pixels = width * height;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_f1f1 = zero;  // becomes H[0][0]
+  __m128i sum_f1f2 = zero;  // becomes H[0][1] and H[1][0]
+  __m128i sum_f2f2 = zero;  // becomes H[1][1]
+  __m128i sum_f1s = zero;   // becomes C[0]
+  __m128i sum_f2s = zero;   // becomes C[1]
+
+  for (int row = 0; row < height; ++row) {
+    const uint16_t *const dat_row = dat + row * dat_stride;
+    const uint16_t *const src_row = src + row * src_stride;
+    const int32_t *const flt0_row = flt0 + row * flt0_stride;
+    const int32_t *const flt1_row = flt1 + row * flt1_stride;
+
+    for (int col = 0; col < width; col += 4) {
+      // Four pixels each, widened to 32 bits and scaled by SGRPROJ_RST_BITS.
+      const __m128i u = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(dat_row + col))),
+          SGRPROJ_RST_BITS);
+      const __m128i src_scaled = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(src_row + col))),
+          SGRPROJ_RST_BITS);
+
+      const __m128i s = _mm_sub_epi32(src_scaled, u);
+      const __m128i f1 = _mm_sub_epi32(
+          _mm_loadu_si128((const __m128i *)(flt0_row + col)), u);
+      const __m128i f2 = _mm_sub_epi32(
+          _mm_loadu_si128((const __m128i *)(flt1_row + col)), u);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2, so keep a copy of
+      // each vector with lanes 1 and 3 shifted down into those positions.
+      const __m128i s_odd = _mm_srli_epi64(s, 32);
+      const __m128i f1_odd = _mm_srli_epi64(f1, 32);
+      const __m128i f2_odd = _mm_srli_epi64(f2, 32);
+
+      sum_f1f1 = _mm_add_epi64(sum_f1f1, _mm_mul_epi32(f1, f1));
+      sum_f1f1 = _mm_add_epi64(sum_f1f1, _mm_mul_epi32(f1_odd, f1_odd));
+
+      sum_f1f2 = _mm_add_epi64(sum_f1f2, _mm_mul_epi32(f1, f2));
+      sum_f1f2 = _mm_add_epi64(sum_f1f2, _mm_mul_epi32(f1_odd, f2_odd));
+
+      sum_f2f2 = _mm_add_epi64(sum_f2f2, _mm_mul_epi32(f2, f2));
+      sum_f2f2 = _mm_add_epi64(sum_f2f2, _mm_mul_epi32(f2_odd, f2_odd));
+
+      sum_f1s = _mm_add_epi64(sum_f1s, _mm_mul_epi32(f1, s));
+      sum_f1s = _mm_add_epi64(sum_f1s, _mm_mul_epi32(f1_odd, s_odd));
+
+      sum_f2s = _mm_add_epi64(sum_f2s, _mm_mul_epi32(f2, s));
+      sum_f2s = _mm_add_epi64(sum_f2s, _mm_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Each accumulator carries two partial sums. Interleaving two accumulators
+  // and adding the low/high interleavings yields both totals side by side.
+  const __m128i c_totals =
+      _mm_add_epi64(_mm_unpacklo_epi64(sum_f1s, sum_f2s),
+                    _mm_unpackhi_epi64(sum_f1s, sum_f2s));
+  const __m128i h0_totals =
+      _mm_add_epi64(_mm_unpacklo_epi64(sum_f1f1, sum_f1f2),
+                    _mm_unpackhi_epi64(sum_f1f1, sum_f1f2));
+  // H is symmetric, so H[1][0] is stored as zero here and copied from
+  // H[0][1] after the division below.
+  const __m128i h1_totals =
+      _mm_add_epi64(_mm_unpacklo_epi64(zero, sum_f2f2),
+                    _mm_unpackhi_epi64(zero, sum_f2f2));
+
+  xx_storeu_128(C, c_totals);
+  xx_storeu_128(H[0], h0_totals);
+  xx_storeu_128(H[1], h1_totals);
+
+  H[0][0] /= num_pixels;
+  H[0][1] /= num_pixels;
+  H[1][1] /= num_pixels;
+
+  // Mirror the off-diagonal entry.
+  H[1][0] = H[0][1];
+  C[0] /= num_pixels;
+  C[1] /= num_pixels;
+}
+
+// High bit-depth projection statistics when only params->r[0] > 0: only
+// H[0][0] and C[0] are non-zero and need to be computed.
+//
+// With u = dat << SGRPROJ_RST_BITS, f1 = flt0 - u and
+// s = (src << SGRPROJ_RST_BITS) - u:
+//   H[0][0] = sum(f1 * f1) / (width * height)
+//   C[0]    = sum(f1 * s)  / (width * height)
+// The 128-bit stores also write zero to H[0][1] and C[1]; the second row of
+// H is not written.
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *const dat = CONVERT_TO_SHORTPTR(dat8);
+  const int num_pixels = width * height;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_f1f1 = zero;  // becomes H[0][0]
+  __m128i sum_f1s = zero;   // becomes C[0]
+
+  for (int row = 0; row < height; ++row) {
+    const uint16_t *const dat_row = dat + row * dat_stride;
+    const uint16_t *const src_row = src + row * src_stride;
+    const int32_t *const flt0_row = flt0 + row * flt0_stride;
+
+    for (int col = 0; col < width; col += 4) {
+      // Four pixels each, widened to 32 bits and scaled by SGRPROJ_RST_BITS.
+      const __m128i u = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(dat_row + col))),
+          SGRPROJ_RST_BITS);
+      const __m128i src_scaled = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(src_row + col))),
+          SGRPROJ_RST_BITS);
+
+      const __m128i s = _mm_sub_epi32(src_scaled, u);
+      const __m128i f1 = _mm_sub_epi32(
+          _mm_loadu_si128((const __m128i *)(flt0_row + col)), u);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2, so keep copies
+      // with lanes 1 and 3 shifted down into those positions.
+      const __m128i s_odd = _mm_srli_epi64(s, 32);
+      const __m128i f1_odd = _mm_srli_epi64(f1, 32);
+
+      sum_f1f1 = _mm_add_epi64(sum_f1f1, _mm_mul_epi32(f1, f1));
+      sum_f1f1 = _mm_add_epi64(sum_f1f1, _mm_mul_epi32(f1_odd, f1_odd));
+
+      sum_f1s = _mm_add_epi64(sum_f1s, _mm_mul_epi32(f1, s));
+      sum_f1s = _mm_add_epi64(sum_f1s, _mm_mul_epi32(f1_odd, s_odd));
+    }
+  }
+
+  // Fold the upper partial sum onto the lower one, then pair the total with
+  // zero so that each store writes { total, 0 }.
+  const __m128i h00_total =
+      _mm_add_epi64(sum_f1f1, _mm_srli_si128(sum_f1f1, 8));
+  const __m128i c0_total = _mm_add_epi64(sum_f1s, _mm_srli_si128(sum_f1s, 8));
+
+  xx_storeu_128(C, _mm_unpacklo_epi64(c0_total, zero));
+  xx_storeu_128(H[0], _mm_unpacklo_epi64(h00_total, zero));
+
+  H[0][0] /= num_pixels;
+  C[0] /= num_pixels;
+}
+
+// High bit-depth projection statistics when only params->r[1] > 0: only
+// H[1][1] and C[1] are non-zero and need to be computed.
+//
+// With u = dat << SGRPROJ_RST_BITS, f2 = flt1 - u and
+// s = (src << SGRPROJ_RST_BITS) - u:
+//   H[1][1] = sum(f2 * f2) / (width * height)
+//   C[1]    = sum(f2 * s)  / (width * height)
+// The 128-bit stores also write zero to H[1][0] and C[0]; the first row of
+// H is not written.
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+    int64_t H[2][2], int64_t C[2]) {
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *const dat = CONVERT_TO_SHORTPTR(dat8);
+  const int num_pixels = width * height;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_f2f2 = zero;  // becomes H[1][1]
+  __m128i sum_f2s = zero;   // becomes C[1]
+
+  for (int row = 0; row < height; ++row) {
+    const uint16_t *const dat_row = dat + row * dat_stride;
+    const uint16_t *const src_row = src + row * src_stride;
+    const int32_t *const flt1_row = flt1 + row * flt1_stride;
+
+    for (int col = 0; col < width; col += 4) {
+      // Four pixels each, widened to 32 bits and scaled by SGRPROJ_RST_BITS.
+      const __m128i u = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(dat_row + col))),
+          SGRPROJ_RST_BITS);
+      const __m128i src_scaled = _mm_slli_epi32(
+          _mm_cvtepu16_epi32(
+              _mm_loadl_epi64((const __m128i *)(src_row + col))),
+          SGRPROJ_RST_BITS);
+
+      const __m128i s = _mm_sub_epi32(src_scaled, u);
+      const __m128i f2 = _mm_sub_epi32(
+          _mm_loadu_si128((const __m128i *)(flt1_row + col)), u);
+
+      // _mm_mul_epi32 multiplies only 32-bit lanes 0 and 2, so keep copies
+      // with lanes 1 and 3 shifted down into those positions.
+      const __m128i s_odd = _mm_srli_epi64(s, 32);
+      const __m128i f2_odd = _mm_srli_epi64(f2, 32);
+
+      sum_f2f2 = _mm_add_epi64(sum_f2f2, _mm_mul_epi32(f2, f2));
+      sum_f2f2 = _mm_add_epi64(sum_f2f2, _mm_mul_epi32(f2_odd, f2_odd));
+
+      sum_f2s = _mm_add_epi64(sum_f2s, _mm_mul_epi32(f2, s));
+      sum_f2s = _mm_add_epi64(sum_f2s, _mm_mul_epi32(f2_odd, s_odd));
+    }
+  }
+
+  // Fold the upper partial sum onto the lower one, then pair zero with the
+  // total so that each store writes { 0, total }.
+  const __m128i h11_total =
+      _mm_add_epi64(sum_f2f2, _mm_srli_si128(sum_f2f2, 8));
+  const __m128i c1_total = _mm_add_epi64(sum_f2s, _mm_srli_si128(sum_f2s, 8));
+
+  xx_storeu_128(C, _mm_unpacklo_epi64(zero, c1_total));
+  xx_storeu_128(H[1], _mm_unpacklo_epi64(zero, h11_total));
+
+  H[1][1] /= num_pixels;
+  C[1] /= num_pixels;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_high_bd_c.
+// Picks the specialised high bit-depth kernel according to which entries of
+// params->r are positive: both filters, only the first, or only the second.
+// If neither is positive, H and C are left untouched.
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width,
+                                         int height, int src_stride,
+                                         const uint8_t *dat8, int dat_stride,
+                                         int32_t *flt0, int flt0_stride,
+                                         int32_t *flt1, int flt1_stride,
+                                         int64_t H[2][2], int64_t C[2],
+                                         const sgr_params_type *params) {
+  const int use_flt0 = params->r[0] > 0;
+  const int use_flt1 = params->r[1] > 0;
+
+  if (use_flt0 && use_flt1) {
+    calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride,
+                                          dat8, dat_stride, flt0, flt0_stride,
+                                          flt1, flt1_stride, H, C);
+    return;
+  }
+  if (use_flt0) {
+    calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt0, flt0_stride, H, C);
+    return;
+  }
+  if (use_flt1) {
+    calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+                                       dat_stride, flt1, flt1_stride, H, C);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Returns the sum of squared errors between the high bit-depth source image
+// and the projected restoration of the corrupted image `dat`.
+//
+// src8 and dat8 are converted with CONVERT_TO_SHORTPTR() and read as uint16_t
+// pixels. For each pixel, with u = dat << SGRPROJ_RST_BITS and
+// shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS:
+//   both filters enabled (r[0] > 0 and r[1] > 0):
+//     v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+//   exactly one filter enabled:
+//     v = xq_on * (flt - u), using xq[0]/flt0 if r[0] > 0, else xq[1]/flt1
+//   in both cases e = ROUND_POWER_OF_TWO(v, shift) + dat - src;
+//   neither filter enabled: e = dat - src.
+// The result is the sum of e * e over the width x height block. Each row is
+// handled 8 pixels at a time (16 when no filter is enabled) with SSE4.1,
+// followed by a scalar loop for the pixels left over at the end of the row.
+int64_t av1_highbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ // Bias of half the divisor, added before the arithmetic right shift
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ // Two 64-bit lanes of accumulated squared error from the vector loops
+ __m128i sum64 = _mm_setzero_si128();
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ // Squared error accumulated by the scalar tail loops
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m128i xq0 = _mm_set1_epi32(xq[0]);
+ const __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+ for (i = 0; i < height; ++i) {
+ // Per-row 32-bit accumulator, widened into sum64 after the vector loop
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[])
+
+ // Load 8x pixels from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[])
+
+ // Shift each pixel value up by SGRPROJ_RST_BITS
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m128i u0l = _mm_cvtepu16_epi32(u0);
+ const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8));
+ // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices
+
+ // Load 8 pixels from first and second filtered images
+ const __m128i flt0l = xx_loadu_128(flt0 + j);
+ const __m128i flt0h = xx_loadu_128(flt0 + j + 4);
+ const __m128i flt1l = xx_loadu_128(flt1 + j);
+ const __m128i flt1h = xx_loadu_128(flt1 + j + 4);
+ // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j)
+ // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j)
+
+ // Subtract shifted corrupt image from each filtered image
+ // This gives our two basis vectors for the projection
+ const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l);
+ const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h);
+ const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l);
+ const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h);
+ // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32
+ // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32
+
+ // Multiply each basis vector by the corresponding coefficient
+ const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0);
+ const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0);
+ const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1);
+ const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contribution from each scaled basis vector
+ const __m128i vl = _mm_add_epi32(v0l, v1l);
+ const __m128i vh = _mm_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+
+ // Saturate each i32 value to i16 and combine lower and upper halves
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ // Zero-extend the four 32-bit row sums to 64 bits and accumulate them
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ // Coefficient and filtered image of whichever filter is enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m128i xq_active = _mm_set1_epi32(xq_on);
+ // Multiplier applied to the unshifted pixels: folds the shift by
+ // SGRPROJ_RST_BITS and the subtraction of u into one multiply
+ const __m128i xq_inactive =
+ _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ // Per-row 32-bit accumulator, widened into sum64 after the vector loop
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[])
+
+ // Load 8x pixels from corrupted image and pad each u16 to i32
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8));
+ const __m128i d0l = _mm_cvtepu16_epi32(d0);
+ // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[])
+
+ // Load 8 pixels from the filtered image
+ const __m128i flth = xx_loadu_128(flt + j + 4);
+ const __m128i fltl = xx_loadu_128(flt + j);
+ // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j)
+
+ // Scale the filtered image by xq_on and the corrupted image by
+ // -xq_on * (1 << SGRPROJ_RST_BITS)
+ const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active);
+ const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active);
+ const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive);
+ const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive);
+
+ const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq);
+ const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq);
+ // With u[k] = d[k] << SGRPROJ_RST_BITS and xq = xq_on:
+ // vh = [ xq(f[7]-u[7]) xq(f[6]-u[6]) xq(f[5]-u[5]) xq(f[4]-u[4]) ]
+ // vl = [ xq(f[3]-u[3]) xq(f[2]-u[2]) xq(f[1]-u[1]) xq(f[0]-u[0]) ]
+
+ // Shift this down with appropriate rounding
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+
+ // Saturate vrl and vrh from i32 to i16 then pack together
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Add the filter correction to the corrupted image, then subtract the
+ // source image to get the error
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ // Zero-extend the four 32-bit row sums to 64 bits and accumulate them
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ // Per-row 32-bit accumulator, widened into sum64 after the vector loop
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 2x8 u16 from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ const __m128i s1 = xx_loadu_128(src + j + 8);
+ // Load 2x8 u16 from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d1 = xx_loadu_128(dat + j + 8);
+
+ // Subtract source image from corrupted image
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+
+ // Square error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+
+ // Zero-extend the four 32-bit row sums to 64 bits and accumulate them
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ // Add the two 64-bit lanes of sum64 to the scalar-tail total in err
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/rdopt_avx2.c b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
new file mode 100644
index 0000000000..a0ab3940c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Accumulates horizontal and vertical correlation terms for one 4x4 window
+// of diff into the four 32-bit accumulators.
+// Only the top-left 3x3 pixels of the window act as the "current" pixel; the
+// fourth row and column supply their right/below neighbours. The function
+// must therefore be called with a 1x1 overlap, moving the window along/down
+// by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          __m256i *xy_sum_32,
+                                          __m256i *xz_sum_32, __m256i *x_sum_32,
+                                          __m256i *x2_sum_32) {
+  // The pixels of the window are named   [ a b c d ]
+  //                                      [ e f g h ]
+  //                                      [ i j k l ]
+  //                                      [ m n o p ]
+  // and vectors are written most-significant element first.
+  const __m256i ones_16 = _mm256_set1_epi16(1);
+
+  const __m256i rows = _mm256_set_epi64x(
+      loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+      loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
+  // rows = [d c b a h g f e] [l k j i p o n m] as i16
+
+  // Shift every row up by one pixel: the last column drops out and a zero
+  // enters at the bottom of each 64-bit row.
+  const __m256i rows_x = _mm256_slli_epi64(rows, 16);
+  // rows_x = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16
+
+  // Move each row one 64-bit slot towards the top so that every row lines up
+  // with the row below it.
+  // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90
+  const __m256i rows_below = _mm256_permute4x64_epi64(rows_x, 0x90);
+  // rows_below = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16
+
+  // Products of each pixel with its right neighbour.
+  const __m256i prod_xy = _mm256_madd_epi16(rows, rows_x);
+  // prod_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32
+
+  // Products of each pixel with the pixel below it.
+  const __m256i prod_xz = _mm256_madd_epi16(rows_x, rows_below);
+  // prod_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32
+
+  // Plain sums of the first three pixels of every row ...
+  const __m256i sum_x = _mm256_madd_epi16(rows_x, ones_16);
+  // sum_x = [c+b a g+f e] [k+j i o+n m] as i32
+
+  // ... and the sums of their squares.
+  const __m256i sum_x2 = _mm256_madd_epi16(rows_x, rows_x);
+  // sum_x2 = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32
+
+  *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, prod_xy);
+  *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, prod_xz);
+  *x_sum_32 = _mm256_add_epi32(*x_sum_32, sum_x);
+  *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, sum_x2);
+}
+
+// AVX2 routine that measures, over the width x height block `diff` (rows
+// `stride` elements apart), how strongly each pixel correlates with its right
+// neighbour (*hcorr) and with the neighbour below it (*vcorr).
+//
+// The body of the block is covered by 4x4 windows stepped 3 pixels at a time
+// (see horver_correlation_4x4); the remaining 1-2 rows and 1-2 columns are
+// handled by scalar loops. Each output is computed from the accumulated sums,
+// clamped so that it is never negative, and is set to 1.0 when either of the
+// two variance terms involved is not positive.
+void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks. This excludes the final row and column and possibly one extra
+ // column depending how 3 divides into width and height
+ int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 };
+ __m256i xy_sum_32 = _mm256_setzero_si256();
+ __m256i xz_sum_32 = _mm256_setzero_si256();
+ __m256i x_sum_32 = _mm256_setzero_si256();
+ __m256i x2_sum_32 = _mm256_setzero_si256();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32);
+ // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh]
+ // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32
+ yy_storeu_256(xy_xz_tmp, hadd_xy_xz);
+ // Only the elements belonging to the top three rows of the window are
+ // used; elements 0 and 2 (fourth-row terms) are discarded.
+ xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1];
+ xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3];
+
+ const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32);
+ // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g]
+ // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32
+ yy_storeu_256(x_x2_tmp, hadd_x_x2);
+ x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1];
+ x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3];
+
+ // Restart the 32-bit accumulators for the next band of rows; the totals
+ // are carried in the 64-bit scalars above.
+ xy_sum_32 = _mm256_setzero_si256();
+ xz_sum_32 = _mm256_setzero_si256();
+ x_sum_32 = _mm256_setzero_si256();
+ x2_sum_32 = _mm256_setzero_si256();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+ // rows remaining, otherwise the final horizontal and vertical correlation
+ // get erroneously processed twice
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ // Derive the sums over each pixel role from the whole-block sums by
+ // removing the row or column that cannot play that role: the final
+ // column/row has no right/below neighbour, and the first column/row is
+ // never a right/below neighbour.
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ // Number of horizontal and vertical neighbour pairs in the block
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ // Variance and covariance terms, each left scaled by its pair count
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ // Covariance over the geometric mean of the two variances, clamped at 0
+ // from below; falls back to 1.0 when a variance term is not positive.
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/rdopt_sse4.c b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
new file mode 100644
index 0000000000..12ac146195
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Accumulates correlation terms for one 4x4 window of 16-bit residuals, where
+// row r (0..3) is the four int16_t starting at diff[r * stride]. Only the
+// top-left 3x3 pixels are used as the "current" pixel, so call this with 1x1
+// overlap, moving the window along/down by 3 pixels at a time. Results are
+// added lane-wise to the four caller-owned accumulators of 32-bit lanes.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+                                          __m128i *xy_sum_32,
+                                          __m128i *xz_sum_32, __m128i *x_sum_32,
+                                          __m128i *x2_sum_32) {
+  // Pixels in this 4x4   [ a b c d ]
+  // are referred to as:  [ e f g h ]
+  //                      [ i j k l ]
+  //                      [ m n o p ]
+  // Load each row with _mm_loadl_epi64, which accepts unaligned addresses;
+  // window origins advance by 3 int16_t, so rows are rarely 8-byte aligned.
+  const __m128i row0 = _mm_loadl_epi64((const __m128i *)&diff[0 * stride]);
+  const __m128i row1 = _mm_loadl_epi64((const __m128i *)&diff[1 * stride]);
+  const __m128i row2 = _mm_loadl_epi64((const __m128i *)&diff[2 * stride]);
+  const __m128i row3 = _mm_loadl_epi64((const __m128i *)&diff[3 * stride]);
+  const __m128i pixelsa = _mm_unpacklo_epi64(row2, row0);
+  const __m128i pixelsb = _mm_unpacklo_epi64(row3, row1);
+  // pixelsa = [d c b a l k j i] as i16
+  // pixelsb = [h g f e p o n m] as i16
+
+  const __m128i slli_a = _mm_slli_epi64(pixelsa, 16);
+  const __m128i slli_b = _mm_slli_epi64(pixelsb, 16);
+  // slli_a = [c b a 0 k j i 0] as i16
+  // slli_b = [g f e 0 o n m 0] as i16
+
+  const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a);
+  const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b);
+  // xy_madd_a = [bc+cd ab jk+kl ij] as i32
+  // xy_madd_b = [fg+gh ef no+op mn] as i32
+  const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a);
+  // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32
+  *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32);
+
+  const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b);
+  // xz_madd_a = [bf+cg ae jn+ko im] i32
+  const __m128i swap_b = _mm_srli_si128(slli_b, 8);
+  // swap_b = [0 0 0 0 g f e 0] as i16
+  const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b);
+  // xz_madd_b = [0 0 gk+fj ei] i32
+  const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a);
+  // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32
+  *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32);
+
+  // Straight sums: x_sum += a+b+c+e+f+g+i+j+k (all of slli_a and swap_b)
+  const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
+  const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
+  // sum_slli_a32 = [c+b a k+j i] as i32
+  const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
+  // swap_b32 = [g f e 0] as i32
+  *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
+  *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
+  // sum = [c+b+g a+f k+j+e i] as i32
+
+  // Also sum their squares
+  const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a);
+  const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b);
+  // slli_a_2 = [c2+b2 a2 k2+j2 i2]
+  // swap_b_2 = [0 0 g2+f2 e2]
+  const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2);
+  // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2]
+  *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2);
+}
+
+void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride,  // diff: width x height block of int16_t, rows `stride` elements apart
+ int width, int height, float *hcorr,  // hcorr: output, horizontal result
+ float *vcorr) {  // vcorr: output, vertical result
+ // Correlates every pixel with its right (-> *hcorr) and below (-> *vcorr) neighbour. Notation used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks (stepping by 3). This excludes the final row and column, and
+ // possibly one extra row and/or column, depending on width % 3 and height % 3.
+ int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 };
+ int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 };
+ __m128i xy_sum_32 = _mm_setzero_si128();
+ __m128i xz_sum_32 = _mm_setzero_si128();
+ __m128i x_sum_32 = _mm_setzero_si128();
+ __m128i x2_sum_32 = _mm_setzero_si128();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ xx_storeu_128(xy_tmp, xy_sum_32);
+ xx_storeu_128(xz_tmp, xz_sum_32);
+ xx_storeu_128(x_tmp, x_sum_32);
+ xx_storeu_128(x2_tmp, x2_sum_32);
+ xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1];  // lane 0 holds the window's 4th-row products (mn+no+op): skipped
+ xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0];  // lane 1 is always 0 in the helper's xz32
+ x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0];
+ x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0];  // lane 3 is always 0 in the helper's sum2
+ xy_sum_32 = _mm_setzero_si128();  // restart the 32-bit lanes per row of windows (presumably to avoid overflow)
+ xz_sum_32 = _mm_setzero_si128();
+ x_sum_32 = _mm_setzero_si128();
+ x2_sum_32 = _mm_setzero_si128();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+ // rows remaining, otherwise the final horizontal and vertical correlation
+ // get erroneously processed twice
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;  // x over pixels with a right neighbour (all but the final column)
+ int64_t xver_sum = x_sum - x_finalrow;  // x over pixels with a below neighbour (all but the final row)
+ int64_t y_sum = x_sum - x_firstcol;  // right neighbours: all but the first column
+ int64_t z_sum = x_sum - x_firstrow;  // below neighbours: all but the first row
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));  // number of horizontal (x, y) pairs
+ const float num_ver = (float)((height - 1) * width);  // number of vertical (x, z) pairs
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;  // "_var_n" = n * variance (n * covariance for xy/xz)
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {  // covariance / sqrt(var * var); negative results are replaced by 0
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {  // a non-positive variance term: report 1.0
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {  // same rule for the vertical direction
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000000..a492483721
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// Writes a width x height 8-bit prediction, packed with row pitch `width`,
+// to comp_pred. A scaled reference (only checked when xd != NULL) is handed
+// to av1_enc_build_one_inter_predictor(); otherwise ref is copied when both
+// sub-pel offsets are 0, or filtered horizontally and/or vertically.
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
+                             int subpel_x_q3, int subpel_y_q3,
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
+  // 2-tap yet.
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j += 16) {
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
+          comp_pred += 16;
+          ref += 16;
+        }
+        ref += ref_stride - width;
+      }
+    } else if (width >= 8) {
+      assert(!(width & 7));
+      assert(!(height & 1));
+      /*Read 8 pixels two rows at a time.*/
+      for (int i = 0; i < height; i += 2) {
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+        comp_pred += 16;
+        ref += 2 * ref_stride;
+      }
+    } else {
+      assert(!(width & 3));
+      assert(!(height & 3));
+      /*Read 4 pixels four rows at a time; i advances by the rows consumed.*/
+      for (int i = 0; i < height; i += 4) {
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
+        comp_pred += 16;
+        ref += 4 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                    : temp;
+    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+                        kernel_x, 16, NULL, -1, width, intermediate_height);
+    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+                       kernel_y, 16, width, height);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Per 16-bit lane: (low 16 bits of p0 * w0) + (low 16 bits of p1 * w1) + r,
+// added with unsigned saturation, then >> DIST_PRECISION_BITS into *result.
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+                                                    const __m128i *w0,
+                                                    const __m128i *w1,
+                                                    const __m128i *r,
+                                                    void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  const __m128i weighted0 = _mm_mullo_epi16(*p0, *w0);
+  const __m128i weighted1 = _mm_mullo_epi16(*p1, *w1);
+  const __m128i biased =
+      _mm_adds_epu16(_mm_adds_epu16(weighted0, weighted1), *r);
+  xx_storeu_128(result, _mm_srli_epi16(biased, DIST_PRECISION_BITS));
+}
+
+// Writes a width x height prediction of 16-bit pixels (row pitch `width`)
+// through comp_pred8: a scaled reference goes to the generic inter predictor,
+// otherwise ref8 is copied (no sub-pel offset) or convolved with bit depth bd.
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred8, int width, int height,
+                                    int subpel_x_q3, int subpel_y_q3,
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    if (width >= 8) {
+      assert(!(width & 7));
+      /*Read 8 pixels one row at a time.*/
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          comp_pred += 8;
+          ref += 8;
+        }
+        ref += ref_stride - width;
+      }
+    } else {
+      assert(!(width & 3));
+      /*Read 4 pixels two rows at a time.*/
+      for (int i = 0; i < height; i += 2) {
+        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        comp_pred += 8;
+        ref += 2 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+                               NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                              kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                     : temp;
+    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);  // rows in temp
+    aom_highbd_convolve8_horiz(
+        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
+                              height, bd);
+  }
+}
+
+// Builds the upsampled prediction in comp_pred8, then replaces it with the
+// per-pixel _mm_avg_epu16 average of itself and pred8, 8 pixels at a time.
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  const uint16_t *other = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(comp_pred8);
+  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+  assert(!(width * height & 7));
+  const int num_vecs = width * height >> 3;
+  for (int v = 0; v < num_vecs; ++v, dst += 8, other += 8) {
+    const __m128i cur = _mm_loadu_si128((const __m128i *)dst);
+    const __m128i oth = _mm_loadu_si128((const __m128i *)other);
+    _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(cur, oth));
+  }
+}
+
+// Builds the upsampled prediction in comp_pred8, then blends it in place with
+// pred8, 8 pixels per iteration: each output is
+// (comp * fwd_offset + pred * bck_offset + rounding) >> DIST_PRECISION_BITS,
+// with the weights taken from jcp_param.
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  assert(!(width * height & 7));
+  const int num_vecs = width * height >> 3;
+
+  const __m128i w0 = _mm_set1_epi16((int16_t)jcp_param->fwd_offset);
+  const __m128i w1 = _mm_set1_epi16((int16_t)jcp_param->bck_offset);
+  // Half of the divisor, so the final shift rounds to nearest.
+  const __m128i r = _mm_set1_epi16((int16_t)((1 << DIST_PRECISION_BITS) >> 1));
+
+  const uint16_t *other = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(comp_pred8);
+  for (int v = 0; v < num_vecs; ++v) {
+    __m128i p0 = xx_loadu_128(dst);
+    __m128i p1 = xx_loadu_128(other);
+
+    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, dst);
+
+    dst += 8;
+    other += 8;
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Builds the upsampled prediction in comp_pred, then averages it in place
+// with pred using _mm_avg_epu8, 16 pixels per iteration.
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, int subpel_search) {
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  const int num_vecs = width * height >> 4;
+  for (int v = 0; v < num_vecs; ++v) {
+    const __m128i cur = xx_loadu_128(comp_pred);
+    const __m128i oth = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(cur, oth));
+    comp_pred += 16;
+    pred += 16;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
new file mode 100644
index 0000000000..df7aa95855
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+// Interleaves the bytes of *p0 and *p1, applies the pairwise weights in *w
+// with _mm_maddubs_epi16, adds *r, shifts right by DIST_PRECISION_BITS and
+// stores the 16 results, packed back to bytes with unsigned saturation.
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  const __m128i pairs_lo = _mm_unpacklo_epi8(*p0, *p1);
+  const __m128i pairs_hi = _mm_unpackhi_epi8(*p0, *p1);
+  const __m128i acc_lo = _mm_add_epi16(_mm_maddubs_epi16(pairs_lo, *w), *r);
+  const __m128i acc_hi = _mm_add_epi16(_mm_maddubs_epi16(pairs_hi, *w), *r);
+  const __m128i out_lo = _mm_srai_epi16(acc_lo, DIST_PRECISION_BITS);
+  const __m128i out_hi = _mm_srai_epi16(acc_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(out_lo, out_hi));
+}
+
+// Builds the upsampled prediction in comp_pred, then blends it in place with
+// pred, 16 pixels per iteration, weighting comp_pred by fwd_offset and pred
+// by bck_offset (both taken from jcp_param).
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  const int num_vecs = width * height >> 4;
+
+  // Byte pairs (fwd, bck) line up with the interleaved (comp_pred, pred) bytes.
+  const int8_t fwd = (int8_t)jcp_param->fwd_offset;
+  const int8_t bck = (int8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(bck, fwd, bck, fwd, bck, fwd, bck, fwd, bck,
+                                 fwd, bck, fwd, bck, fwd, bck, fwd);
+  const __m128i r = _mm_set1_epi16((int16_t)((1 << DIST_PRECISION_BITS) >> 1));
+
+  for (int v = 0; v < num_vecs; ++v) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+    comp_pred += 16;
+    pred += 16;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000000..752d6f3f0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 2)  // Elements per frame_sse row: BW plus 2 extra.
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },  // keeps 32-bit lanes 0..4
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },  // keeps 32-bit lanes 1..5
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },  // keeps 32-bit lanes 2..6
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }  // keeps 32-bit lanes 3..7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },  // 16-bit elements out: e0 e0 e0 e1 e2 e3 e4 e5
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }  // 16-bit elements out: e0 e1 e2 e3 e4 e5 e5 e5
+};
+
+#define CALC_X_GRADIENT(AC, GI, DF, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1)));
+
+#define CALC_Y_GRADIENT(AC, GI, BH, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1)));
+
+// Estimates the noise level of a single 8-bit plane (AVX2 path).
+//
+// Every interior pixel (the one-pixel frame border is skipped) is classified
+// with a 3x3 Sobel operator. Pixels whose gradient magnitude |g_x| + |g_y| is
+// below edge_thresh are treated as smooth, and the absolute value of their
+// 3x3 Laplacian is accumulated.
+//
+//   src          top-left sample of the plane
+//   height/width plane dimensions in pixels
+//   stride       distance in bytes between vertically adjacent samples
+//   edge_thresh  gradient magnitude below which a pixel counts as smooth
+//
+// Returns -1.0 when fewer than 16 smooth pixels were found, otherwise
+// accum / (6 * count) * SQRT_PI_BY_2.
+double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
+                                                 int width, int stride,
+                                                 int edge_thresh) {
+  int count = 0;
+  int64_t accum = 0;
+  // Number of interior columns (1 .. width - 2) covered by the 32-wide vector
+  // loop, rounded down to a multiple of 32. Rounding width - 2 rather than
+  // width - 1 keeps the vector loop off the right border column, and off
+  // src[row + width], when width - 1 is a multiple of 32 (e.g. width == 33).
+  // The clamp keeps w32 at 0 for degenerate widths.
+  const int w32 = (width > 2) ? ((width - 2) & ~0x1f) : 0;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh);
+  __m256i num_accumulator = zero;
+  __m256i sum_accumulator = zero;
+
+  //  A | B | C
+  //  D | E | F
+  //  G | H | I
+  // g_x = (A - C) + (G - I) + 2*(D - F)
+  // g_y = (A + C) - (G + I) + 2*(B - H)
+  // v   = 4*E - 2*(D+F+B+H) + (A+C+G+I)
+
+  // Process the width multiple of 32 here.
+  for (int w = 1; w < w32; w += 32) {
+    int h = 1;
+    const int start_idx = h * stride + w;
+    const int stride_0 = start_idx - stride;
+
+    // 16-bit per-lane counter of smooth pixels for this column strip.
+    __m256i num_accum_row_lvl = zero;
+    const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1]));
+    const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1]));
+    const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1]));
+    const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1]));
+    __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0]));
+    __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx]));
+
+    const __m256i A_lo = _mm256_unpacklo_epi8(A, zero);
+    const __m256i A_hi = _mm256_unpackhi_epi8(A, zero);
+    const __m256i C_lo = _mm256_unpacklo_epi8(C, zero);
+    const __m256i C_hi = _mm256_unpackhi_epi8(C, zero);
+    const __m256i D_lo = _mm256_unpacklo_epi8(D, zero);
+    const __m256i D_hi = _mm256_unpackhi_epi8(D, zero);
+    const __m256i F_lo = _mm256_unpacklo_epi8(F, zero);
+    const __m256i F_hi = _mm256_unpackhi_epi8(F, zero);
+
+    __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo);
+    __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi);
+    __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo);
+    __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi);
+    __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo);
+    __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi);
+    __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo);
+    __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi);
+
+    for (; h < height - 1; h++) {
+      __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi,
+          gy_hi;
+      const int k = h * stride + w;
+      // Only the row below is loaded; the two rows above are carried over
+      // from the previous iteration.
+      const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1]));
+      const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride]));
+      const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1]));
+
+      const __m256i B_lo = _mm256_unpacklo_epi8(B, zero);
+      const __m256i B_hi = _mm256_unpackhi_epi8(B, zero);
+      const __m256i G_lo = _mm256_unpacklo_epi8(G, zero);
+      const __m256i G_hi = _mm256_unpackhi_epi8(G, zero);
+      const __m256i I_lo = _mm256_unpacklo_epi8(I, zero);
+      const __m256i I_hi = _mm256_unpackhi_epi8(I, zero);
+      const __m256i H_lo = _mm256_unpacklo_epi8(H, zero);
+      const __m256i H_hi = _mm256_unpackhi_epi8(H, zero);
+
+      sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo);
+      sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi);
+      sum_GI_lo = _mm256_add_epi16(G_lo, I_lo);
+      sum_GI_hi = _mm256_add_epi16(G_hi, I_hi);
+      const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo);
+      const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi);
+
+      CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo)
+      CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo)
+
+      const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo);
+
+      CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi)
+      CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi)
+
+      const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi);
+
+      // All-ones in every 16-bit lane whose gradient is below the threshold.
+      __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo);
+      __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi);
+      const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi);
+
+      // v = 4*E -2*(D+F+B+H) + (A+C+G+I)
+      // The Laplacian is skipped entirely when no lane is below the threshold.
+      if (_mm256_movemask_epi8(comp_reg) != 0) {
+        const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo);
+        const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi);
+
+        // 2*(D+F+B+H)
+        const __m256i sum_DFBH_lo =
+            _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1);
+        // (A+C+G+I)
+        const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo);
+        const __m256i sum_DFBH_hi =
+            _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1);
+        const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi);
+
+        // Convert E register values from 8bit to 16bit
+        const __m256i E_lo = _mm256_unpacklo_epi8(E, zero);
+        const __m256i E_hi = _mm256_unpackhi_epi8(E, zero);
+
+        // 4*E - 2*(D+F+B+H)+ (A+C+G+I)
+        const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16(
+            _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo),
+            sum_ACGI_lo));
+        const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16(
+            _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi),
+            sum_ACGI_hi));
+        // Turn the all-ones masks into 0/1 so they act both as a multiplier
+        // for the Laplacian and as a per-lane pixel count.
+        cmp_lo = _mm256_srli_epi16(cmp_lo, 15);
+        cmp_hi = _mm256_srli_epi16(cmp_hi, 15);
+        const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo);
+        const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi);
+
+        num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo);
+        num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi);
+
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpacklo_epi16(var_lo, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpackhi_epi16(var_lo, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpacklo_epi16(var_hi, zero));
+        sum_accumulator = _mm256_add_epi32(sum_accumulator,
+                                           _mm256_unpackhi_epi16(var_hi, zero));
+      }
+      // Slide the 3-row window down by one row.
+      sub_AC_lo = sub_DF_lo;
+      sub_AC_hi = sub_DF_hi;
+      sub_DF_lo = sub_GI_lo;
+      sub_DF_hi = sub_GI_hi;
+      sum_AC_lo = sum_DF_lo;
+      sum_AC_hi = sum_DF_hi;
+      sum_DF_lo = sum_GI_lo;
+      sum_DF_hi = sum_GI_hi;
+      B = E;
+      E = H;
+    }
+    // Widen the 16-bit strip counters to 32 bits before accumulating.
+    const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero);
+    const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero);
+    num_accumulator =
+        _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1));
+  }
+
+  // Process the remaining width here.
+  for (int h = 1; h < height - 1; ++h) {
+    for (int w = w32 + 1; w < width - 1; ++w) {
+      const int k = h * stride + w;
+
+      // Compute sobel gradients
+      const int g_x = (src[k - stride - 1] - src[k - stride + 1]) +
+                      (src[k + stride - 1] - src[k + stride + 1]) +
+                      2 * (src[k - 1] - src[k + 1]);
+      const int g_y = (src[k - stride - 1] - src[k + stride - 1]) +
+                      (src[k - stride + 1] - src[k + stride + 1]) +
+                      2 * (src[k - stride] - src[k + stride]);
+      const int ga = abs(g_x) + abs(g_y);
+
+      if (ga < edge_thresh) {
+        // Find Laplacian
+        const int v =
+            4 * src[k] -
+            2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
+            (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
+             src[k + stride + 1]);
+        accum += abs(v);
+        ++count;
+      }
+    }
+  }
+
+  // Reduce the vector accumulators and merge them with the scalar totals.
+  // s0 s1 n0 n1 s2 s3 n2 n3
+  __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator);
+  __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx);
+  __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1);
+  // s0+s2 s1+s3 n0+n2 n1+n3
+  __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi);
+  // s0+s2+s1+s3 n0+n2+n1+n3
+  __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1);
+
+  accum += _mm_cvtsi128_si32(result);
+  count += _mm_extract_epi32(result, 2);
+
+  // If very few smooth pels, return -1 since the estimate is unreliable.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+// Writes the per-pixel squared difference of two 16-pixel-wide blocks into
+// frame_sse as 16-bit values, one row of sse_stride elements per block row.
+// The two elements following each row's 16 results are cleared.
+// block_width is unused.
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint16_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint8_t *row1 = frame1;
+  const uint8_t *row2 = frame2;
+  uint16_t *out = frame_sse;
+
+  for (int rows_left = block_height; rows_left > 0; --rows_left) {
+    // Widen 16 pixels of each source to 16 bits.
+    const __m256i a =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(row1)));
+    const __m256i b =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(row2)));
+    const __m256i diff = _mm256_sub_epi16(a, b);
+
+    _mm256_storeu_si256((__m256i *)(out), _mm256_mullo_epi16(diff, diff));
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(int *)(out + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    row1 += stride;
+    row2 += stride2;
+    out += sse_stride;
+  }
+}
+
+// Writes the per-pixel squared difference of two 32-pixel-wide blocks into
+// frame_sse as 16-bit values, one row of sse_stride elements per block row.
+// The two elements following each row's 32 results are cleared.
+// block_width is unused.
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    uint16_t *frame_sse, const unsigned int sse_stride) {
+  (void)block_width;
+  const uint8_t *row1 = frame1;
+  const uint8_t *row2 = frame2;
+  uint16_t *out = frame_sse;
+
+  for (int rows_left = block_height; rows_left > 0; --rows_left) {
+    const __m256i a = _mm256_loadu_si256((__m256i *)row1);
+    const __m256i b = _mm256_loadu_si256((__m256i *)row2);
+
+    // |a - b| computed in 8 bits as max(a, b) - min(a, b).
+    const __m256i abs_diff =
+        _mm256_subs_epu8(_mm256_max_epu8(a, b), _mm256_min_epu8(a, b));
+
+    // Widen each 16-byte half to 16 bits and square it.
+    const __m256i d_lo =
+        _mm256_cvtepu8_epi16(_mm256_castsi256_si128(abs_diff));
+    const __m256i d_hi =
+        _mm256_cvtepu8_epi16(_mm256_extracti128_si256(abs_diff, 1));
+
+    _mm256_storeu_si256((__m256i *)(out), _mm256_mullo_epi16(d_lo, d_lo));
+    _mm256_storeu_si256((__m256i *)(out + 16), _mm256_mullo_epi16(d_hi, d_hi));
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(int *)(out + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    row1 += stride;
+    row2 += stride2;
+    out += sse_stride;
+  }
+}
+
+// Loads eight 16-bit values from src and widens them to 32 bits. When col is
+// the first column group the first value is replicated twice to the left;
+// when col is the last column group (block_width - 4) the sixth value is
+// replicated twice to the right.
+static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
+                                                int block_width) {
+  const int is_first_col = (col == 0);
+  const int is_last_col = (col == block_width - 4);
+  __m128i samples = _mm_loadu_si128((__m128i *)(src));
+
+  if (is_first_col) {
+    // e0 e0 e0 e1 e2 e3 e4 e5
+    samples = _mm_shuffle_epi8(samples, *(__m128i *)shufflemask_16b[0]);
+  }
+  if (is_last_col) {
+    // e0 e1 e2 e3 e4 e5 e5 e5
+    samples = _mm_shuffle_epi8(samples, *(__m128i *)shufflemask_16b[1]);
+  }
+  return _mm256_cvtepu16_epi32(samples);
+}
+
+// Returns the sum of the five 32-bit lanes of vsum selected by
+// sse_bytemask[i] (lanes i .. i + 4); all other lanes are ignored.
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+  // Zero every lane outside the 5-wide window.
+  const __m256i window = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+
+  // Fold 8 lanes -> 4 lanes: [w0+w4, w1+w5, w2+w6, w3+w7]
+  const __m128i fold4 = _mm_add_epi32(_mm256_castsi256_si128(window),
+                                      _mm256_extracti128_si256(window, 1));
+  // Fold 4 lanes -> 2 lanes (upper lanes are don't-care).
+  const __m128i fold2 = _mm_add_epi32(fold4, _mm_srli_si128(fold4, 8));
+  // Fold 2 lanes -> 1 lane; the total ends up in lane 0.
+  const __m128i total = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
+
+  return _mm_extract_epi32(total, 0);
+}
+
+// AVX2 implementation of approx_exp(): for each of the eight floats in y the
+// value is scaled, truncated to an integer, offset, and the resulting bit
+// pattern is reinterpreted as a float.
+static AOM_INLINE __m256 approx_exp_avx2(__m256 y) {
+  // (1 << 23) / ln(2)
+  const float scale = (1 << 23) / 0.69314718056f;
+  // Offset for the exponent according to IEEE floating point standard.
+  const int exponent_bias = 127;
+  // Magic number controls the accuracy of approximation
+  const int accuracy_adjust = 60801;
+
+  const __m256i offset =
+      _mm256_set1_epi32(exponent_bias * (1 << 23) - accuracy_adjust);
+  const __m256 scaled = _mm256_mul_ps(y, _mm256_set1_ps(scale));
+
+  return _mm256_castsi256_ps(
+      _mm256_add_epi32(_mm256_cvttps_epi32(scaled), offset));
+}
+
+// Applies the temporal filter to one plane of a block (AVX2 path).
+//
+//   frame1/stride      reference block and its row pitch
+//   frame2/stride2     predicted block and its row pitch
+//   block_width/height plane block size; each must be 16 or 32
+//   subblock_mses      four per-subblock errors, raster order
+//   accumulator/count  outputs, stored densely with a row pitch of
+//                      block_width; weight * pixel and weight are added in
+//   frame_sse          scratch, SSE_STRIDE elements per row; overwritten with
+//                      the squared error of this plane
+//   luma_sse_sum       per-pixel term added to the 5x5 window sum, row pitch
+//                      BW
+//   d_factor           four per-subblock factors, multiplied by decay_factor
+//   tf_wgt_calc_lvl    0 selects exp() per pixel; any other value selects the
+//                      vectorised approx_exp_avx2() path
+//
+// For every pixel the squared error is summed over a 5x5 window (edges are
+// replicated), combined with the subblock error, scaled, clamped to 7 and
+// turned into a weight.
+static void apply_temporal_filter(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor,
+    int tf_wgt_calc_lvl) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  if (block_width == 32) {
+    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  } else {
+    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+                                 block_height, frame_sse, SSE_STRIDE);
+  }
+
+  __m256i vsrc[5];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    // Each load spans columns col - 2 .. col + 5, except for col == 0 where
+    // the load starts at column 0 and the left edge is padded by shuffling.
+    uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int i = 2; i < 5; i++) {
+      vsrc[i] = xx_load_and_pad(src, col, block_width);
+      src += SSE_STRIDE;
+    }
+
+    // Copy first row to first 2 vectors
+    vsrc[0] = vsrc[2];
+    vsrc[1] = vsrc[2];
+
+    for (int row = 0; row < block_height; row++) {
+      __m256i vsum = _mm256_setzero_si256();
+
+      // Add 5 consecutive rows
+      for (int i = 0; i < 5; i++) {
+        vsum = _mm256_add_epi32(vsum, vsrc[i]);
+      }
+
+      // Push all elements by one element to the top
+      for (int i = 0; i < 4; i++) {
+        vsrc[i] = vsrc[i + 1];
+      }
+
+      // Load next row to the last element
+      if (row <= block_height - 4) {
+        vsrc[4] = xx_load_and_pad(src, col, block_width);
+        src += SSE_STRIDE;
+      } else {
+        // Past the bottom edge: replicate the last available row.
+        vsrc[4] = vsrc[3];
+      }
+
+      // Accumulate the sum horizontally
+      for (int i = 0; i < 4; i++) {
+        acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
+      }
+    }
+  }
+
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
+  if (tf_wgt_calc_lvl == 0) {
+    for (int i = 0, k = 0; i < block_height; i++) {
+      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      for (int j = 0; j < block_width; j++, k++) {
+        const int pixel_value = frame2[i * stride2 + j];
+        uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+        const double window_error = diff_sse * inv_num_ref_pixels;
+        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        const double combined_error =
+            weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+        double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+        scaled_error = AOMMIN(scaled_error, 7);
+        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+        count[k] += weight;
+        accumulator[k] += weight * pixel_value;
+      }
+    }
+  } else {
+    __m256d subblock_mses_reg[4];
+    __m256d d_factor_mul_n_decay_qr_invs[4];
+    const __m256 zero = _mm256_set1_ps(0.0f);
+    const __m256 point_five = _mm256_set1_ps(0.5f);
+    const __m256 seven = _mm256_set1_ps(7.0f);
+    const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels);
+    const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor);
+    const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE);
+    // Maintain registers to hold mse and d_factor at subblock level.
+    subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]);
+    subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]);
+    subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]);
+    subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]);
+    d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]);
+    d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]);
+    d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]);
+    d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]);
+
+    for (int i = 0; i < block_height; i++) {
+      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW;
+      // Eight pixels per iteration; they never straddle the subblock boundary
+      // because block_width / 2 is a multiple of 8.
+      for (int j = 0; j < block_width; j += 8) {
+        const __m256i acc_sse =
+            _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j));
+        const __m256i luma_sse =
+            _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j)));
+
+        // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+        const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse);
+
+        const __m256d diff_sse_pd_1 =
+            _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse));
+        const __m256d diff_sse_pd_2 =
+            _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1));
+
+        // const double window_error = diff_sse * inv_num_ref_pixels;
+        const __m256d window_error_1 =
+            _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit);
+        const __m256d window_error_2 =
+            _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit);
+
+        // const int subblock_idx = y_blk_raster_offset + (j >= block_width /
+        // 2);
+        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        const __m256d blk_error = subblock_mses_reg[subblock_idx];
+
+        // const double combined_error =
+        // weight_factor *window_error + subblock_mses_scaled[subblock_idx];
+        const __m256d combined_error_1 = _mm256_add_pd(
+            _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error);
+
+        const __m256d combined_error_2 = _mm256_add_pd(
+            _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error);
+
+        // d_factor_decayed[subblock_idx]
+        const __m256d d_fact_mul_n_decay =
+            d_factor_mul_n_decay_qr_invs[subblock_idx];
+
+        // double scaled_error = combined_error *
+        // d_factor_decayed[subblock_idx];
+        const __m256d scaled_error_1 =
+            _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay);
+        const __m256d scaled_error_2 =
+            _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay);
+
+        const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1);
+        const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2);
+
+        const __m256 scaled_error_ps = _mm256_insertf128_ps(
+            _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1);
+
+        // scaled_error = AOMMIN(scaled_error, 7);
+        const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven);
+        const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps);
+        // const int weight =
+        //(int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f);
+        const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps);
+        const __m256 scale_weight_exp_result =
+            _mm256_mul_ps(exp_result, tf_weight_scale);
+        const __m256 round_result =
+            _mm256_add_ps(scale_weight_exp_result, point_five);
+        __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result);
+
+        __m128i weights_in_16bit =
+            _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit),
+                             _mm256_extractf128_si256(weights_in_32bit, 0x1));
+
+        // count[k] += weight;
+        // accumulator[k] += weight * pixel_value;
+        // frame2 is addressed with its own pitch (stride2). count and
+        // accumulator are dense with a pitch of block_width, exactly as the
+        // running index k in the scalar branch above.
+        const int stride_idx = i * stride2 + j;
+        const int k = i * block_width + j;
+        const __m128i count_array = _mm_loadu_si128((__m128i *)(count + k));
+        _mm_storeu_si128((__m128i *)(count + k),
+                         _mm_add_epi16(count_array, weights_in_16bit));
+
+        const __m256i accumulator_array =
+            _mm256_loadu_si256((__m256i *)(accumulator + k));
+        const __m128i pred_values =
+            _mm_loadl_epi64((__m128i *)(frame2 + stride_idx));
+
+        const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values);
+        const __m256i mull_frame2_weight_u32 =
+            _mm256_mullo_epi32(pred_values_u32, weights_in_32bit);
+        _mm256_storeu_si256(
+            (__m256i *)(accumulator + k),
+            _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32));
+      }
+    }
+  }
+}
+
+// AVX2 entry point of the temporal filter for one 32x32 low-bit-depth block.
+//
+// Derives the decay factors from q_factor, filter_strength, the per-plane
+// noise level and the subblock motion vectors, then filters each plane in
+// turn. The weighted pixel sums are added into accum and the weights into
+// count; both are laid out plane after plane.
+void av1_apply_temporal_filter_avx2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+
+  // Constants that fold the window/block balance into two multipliers.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+
+  // Quantizer-dependent decay: a larger q gives stronger filtering.
+  double q_decay;
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  } else {
+    q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+    q_decay = CLIP(q_decay, 1e-5, 1);
+  }
+
+  // Strength-dependent decay: a smaller strength gives weaker filtering.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+  // Motion-dependent factor per subblock: a longer motion vector gives a
+  // smaller filtering weight. The threshold depends only on the frame size.
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int sb = 0; sb < 4; sb++) {
+    const MV mv = subblock_mvs[sb];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    const double ratio = distance / distance_threshold;
+    d_factor[sb] = AOMMAX(ratio, 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+
+    // Subsampling of this plane relative to luma.
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      const int luma_rows = 1 << ss_y_shift;
+      const int luma_cols = 1 << ss_x_shift;
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          for (int ii = 0; ii < luma_rows; ++ii) {
+            for (int jj = 0; jj < luma_cols; ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor, tf_wgt_calc_lvl);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
new file mode 100644
index 0000000000..842d3b13c8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding for 4 samples
+#define SSE_STRIDE (BW + 4)  // Elements per frame_sse row in this file.
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },  // [0]: lanes 0..3 of the first vector
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },  //      lane 0 of the second vector
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },  // [1]: lanes 1..3 of the first vector
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },  //      lanes 0..1 of the second vector
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },  // [2]: lanes 2..3 of the first vector
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },  //      lanes 0..2 of the second vector
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },  // [3]: lane 3 of the first vector
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }  //      lanes 0..3 of the second vector
+};
+
+// Writes the per-pixel squared difference between two 8-bit blocks into
+// frame_sse as 16-bit values. Each output row starts with two zeroed
+// elements, followed by block_width results, followed by two more zeroed
+// elements; rows are dst_stride elements apart. block_width is consumed in
+// steps of 16 pixels.
+static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
+                              const uint8_t *frame2, const unsigned int stride2,
+                              const int block_width, const int block_height,
+                              uint16_t *frame_sse,
+                              const unsigned int dst_stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const uint8_t *row1 = frame1;
+  const uint8_t *row2 = frame2;
+  uint16_t *out = frame_sse;
+
+  for (int rows_left = block_height; rows_left > 0; --rows_left) {
+    for (int x = 0; x < block_width; x += 16) {
+      // Set zero to uninitialized memory to avoid uninitialized loads later
+      *(int *)(out) = _mm_cvtsi128_si32(zero);
+
+      const __m128i a = _mm_loadu_si128((__m128i *)(row1 + x));
+      const __m128i b = _mm_loadu_si128((__m128i *)(row2 + x));
+
+      // |a - b| per byte: one of the two saturating differences is zero.
+      const __m128i abs_diff =
+          _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+
+      // Widen to 16 bits and square.
+      const __m128i d_lo = _mm_unpacklo_epi8(abs_diff, zero);
+      const __m128i d_hi = _mm_unpackhi_epi8(abs_diff, zero);
+
+      _mm_storeu_si128((__m128i *)(out + x + 2), _mm_mullo_epi16(d_lo, d_lo));
+      _mm_storeu_si128((__m128i *)(out + x + 10), _mm_mullo_epi16(d_hi, d_hi));
+    }
+
+    // Set zero to uninitialized memory to avoid uninitialized loads later
+    *(int *)(out + block_width + 2) = _mm_cvtsi128_si32(zero);
+
+    row1 += stride;
+    row2 += stride2;
+    out += dst_stride;
+  }
+}
+
+// Loads eight 16-bit values from src and widens them to 32 bits:
+// dstvec[0] receives the first four, dstvec[1] the last four. For col == 0
+// the third value is replicated over the two values to its left; for
+// col >= block_width - 4 the sixth value is replicated over the two values
+// to its right.
+static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col,
+                            int block_width) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i packed = _mm_loadu_si128((__m128i *)src);
+  const __m128i left = _mm_unpacklo_epi16(packed, zero);
+  const __m128i right = _mm_unpackhi_epi16(packed, zero);
+
+  if (col == 0) {
+    // For the first column, replicate the first element twice to the left
+    dstvec[0] = _mm_shuffle_epi32(left, 0xEA);
+  } else {
+    dstvec[0] = left;
+  }
+
+  if (col < block_width - 4) {
+    dstvec[1] = right;
+  } else {
+    // For the last column, replicate the last element twice to the right
+    dstvec[1] = _mm_shuffle_epi32(right, 0x54);
+  }
+}
+
+// Treats vsum1:vsum2 as eight consecutive 32-bit lanes and returns the sum of
+// the five lanes selected by sse_bytemask_2x4[i] (lanes i .. i + 4).
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+  // Mask and obtain the required 5 values inside the vector
+  const __m128i kept_lo =
+      _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+  const __m128i kept_hi =
+      _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+
+  // Fold 8 lanes -> 4 lanes.
+  const __m128i fold4 = _mm_add_epi32(kept_lo, kept_hi);
+  // Fold 4 lanes -> 2 lanes (upper lanes are don't-care).
+  const __m128i fold2 = _mm_add_epi32(fold4, _mm_srli_si128(fold4, 8));
+  // Fold 2 lanes -> 1 lane; the total ends up in lane 0.
+  const __m128i total = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
+
+  return _mm_cvtsi128_si32(total);
+}
+
+// Applies the temporal filter to one plane of a block (SSE2 path).
+//
+// The squared error between frame1 and frame2 is written to frame_sse and
+// summed over a 5x5 window around every pixel (edges are replicated). The
+// window sum plus luma_sse_sum is combined with the subblock error, scaled,
+// clamped to 7 and turned into a weight: exp() is used when tf_wgt_calc_lvl
+// is 0, approx_exp() with rounding otherwise. The weight is added to count
+// and weight * pixel to accumulator; both are stored densely, one entry per
+// pixel in raster order.
+static void apply_temporal_filter(
+    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+    const unsigned int stride2, const int block_width, const int block_height,
+    const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+    uint16_t *frame_sse, uint32_t *luma_sse_sum,
+    const double inv_num_ref_pixels, const double decay_factor,
+    const double inv_factor, const double weight_factor, double *d_factor,
+    int tf_wgt_calc_lvl) {
+  assert(((block_width == 16) || (block_width == 32)) &&
+         ((block_height == 16) || (block_height == 32)));
+
+  uint32_t acc_5x5_sse[BH][BW];
+
+  get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+                    frame_sse, SSE_STRIDE);
+
+  // Sliding window of five rows, each held as two vectors of four lanes.
+  __m128i window[5][2];
+
+  // Traverse 4 columns at a time
+  // First and last columns will require padding
+  for (int col = 0; col < block_width; col += 4) {
+    uint16_t *next_row = frame_sse + col;
+
+    // Load and pad(for first and last col) 3 rows from the top
+    for (int r = 2; r < 5; r++) {
+      xx_load_and_pad(next_row, window[r], col, block_width);
+      next_row += SSE_STRIDE;
+    }
+
+    // Padding for top 2 rows
+    for (int r = 0; r < 2; r++) {
+      window[r][0] = window[2][0];
+      window[r][1] = window[2][1];
+    }
+
+    for (int row = 0; row < block_height; row++) {
+      // Add 5 consecutive rows
+      __m128i sum_lo = window[0][0];
+      __m128i sum_hi = window[0][1];
+      for (int r = 1; r < 5; r++) {
+        sum_lo = _mm_add_epi32(sum_lo, window[r][0]);
+        sum_hi = _mm_add_epi32(sum_hi, window[r][1]);
+      }
+
+      // Push all elements by one element to the top
+      for (int r = 0; r < 4; r++) {
+        window[r][0] = window[r + 1][0];
+        window[r][1] = window[r + 1][1];
+      }
+
+      if (row > block_height - 4) {
+        // Padding for bottom 2 rows
+        window[4][0] = window[3][0];
+        window[4][1] = window[3][1];
+      } else {
+        // Load next row
+        xx_load_and_pad(next_row, window[4], col, block_width);
+        next_row += SSE_STRIDE;
+      }
+
+      // Accumulate the sum horizontally
+      for (int lane = 0; lane < 4; lane++) {
+        acc_5x5_sse[row][col + lane] = xx_mask_and_hadd(sum_lo, sum_hi, lane);
+      }
+    }
+  }
+
+  double subblock_mses_scaled[4];
+  double d_factor_decayed[4];
+  for (int idx = 0; idx < 4; idx++) {
+    subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+    d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+  }
+
+  for (int i = 0, k = 0; i < block_height; i++) {
+    const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+      uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+      const double window_error = diff_sse * inv_num_ref_pixels;
+      const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+      const double combined_error =
+          weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+      double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+      scaled_error = AOMMIN(scaled_error, 7);
+
+      // The only difference between the two weight levels is how the
+      // exponential is evaluated and rounded.
+      int weight;
+      if (tf_wgt_calc_lvl == 0) {
+        weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+      } else {
+        const float fweight =
+            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+        weight = iroundpf(fweight);
+      }
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+// SSE2 temporal filter entry point for one block.
+//
+// For every plane, derives the decay factors that control the filter weight
+// and hands the plane to apply_temporal_filter(), which updates `accum` and
+// `count`. Planes are laid out back to back in `pred`, `accum` and `count`,
+// plane_h * plane_w entries each.
+//
+// Restrictions (asserted): block_size must be BLOCK_32X32, TF_WINDOW_LENGTH
+// must be 5, the frame must be low bit-depth, and num_planes must be in
+// [1, MAX_MB_PLANE].
+//
+//   frame_to_filter  frame being filtered (source of the reference pixels)
+//   mbd              supplies the per-plane chroma subsampling
+//   mb_row, mb_col   block position, in units of blocks
+//   noise_levels     one noise estimate per plane
+//   subblock_mvs     motion vectors of the 4 sub-blocks
+//   subblock_mses    motion-search errors of the 4 sub-blocks
+//   q_factor, filter_strength, tf_wgt_calc_lvl
+//                    tuning inputs; tf_wgt_calc_lvl is forwarded unchanged
+//   pred             motion-compensated prediction of the block
+//   accum, count     output accumulators, updated in place
+void av1_apply_temporal_filter_sse2(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+    const int *subblock_mses, const int q_factor, const int filter_strength,
+    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+  assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  (void)is_high_bitdepth;
+
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int min_frame_size = AOMMIN(frame_height, frame_width);
+  // Variables to simplify combined error calculation.
+  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+                                   TF_SEARCH_ERROR_NORM_WEIGHT);
+  const double weight_factor =
+      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
+  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+  q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
+  // Smaller strength -> smaller filtering weight.
+  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+  s_decay = CLIP(s_decay, 1e-5, 1);
+  double d_factor[4] = { 0 };
+  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+  uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+  // The distance threshold depends only on the frame size, so it is computed
+  // once here rather than once per sub-block.
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+
+  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
+  // Handle planes in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+    const int ss_x_shift =
+        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+    const int ss_y_shift =
+        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+    // Chroma planes additionally count the co-located luma pixels.
+    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+    // Larger noise -> larger filtering weight.
+    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
+    const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter U-plane and V-plane using Y-plane. This is because motion
+    // search is only done on Y-plane, so the information from Y-plane
+    // will be more accurate. The luma sse sum is reused in both chroma
+    // planes.
+    if (plane == AOM_PLANE_U) {
+      for (unsigned int i = 0; i < plane_h; i++) {
+        for (unsigned int j = 0; j < plane_w; j++) {
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              // The "+ 2" column offset matches how the luma squared errors
+              // were stored in frame_sse by the previous (luma) pass.
+              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+            }
+          }
+        }
+      }
+    }
+
+    apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+                          plane_w, plane_h, subblock_mses, accum + plane_offset,
+                          count + plane_offset, frame_sse, luma_sse_sum,
+                          inv_num_ref_pixels, decay_factor, inv_factor,
+                          weight_factor, d_factor, tf_wgt_calc_lvl);
+    plane_offset += plane_h * plane_w;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 0000000000..9cde860534
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ *
+ * Accumulates, over N elements, the square of
+ *   t = r1[i] * MAX_MASK_VALUE + d[i] * m[i]
+ * where t is saturated to the signed 16-bit range before squaring, and
+ * returns ROUND_POWER_OF_TWO(sum, 2 * WEDGE_WEIGHT_BITS).
+ *
+ * N must be a multiple of 64 (asserted). Each loop iteration consumes 16
+ * elements. NOTE(review): the do/while loop always runs at least once, so
+ * this looks like it also requires N > 0 -- confirm against callers.
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ // The pointers are moved to the end of the arrays and indexed with a
+ // negative offset that counts up to zero, so the loop test is just `n`.
+ int n = -N;
+
+ uint64_t csse;
+
+ const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+ // Mask used below to isolate the even 32-bit lanes when widening to 64 bits.
+ // NOTE(review): presumably 0x00000000ffffffff in every 64-bit lane -- confirm
+ // against yy_set1_64_from_32i().
+ const __m256i v_zext_q = yy_set1_64_from_32i(~0);
+
+ __m256i v_acc0_q = _mm256_setzero_si256();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ // 16 residuals, 16 deltas and 16 mask bytes (unaligned loads).
+ const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+ const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+ // Interleave (d, r1) word pairs; the mask bytes are zero-extended to words
+ // and interleaved with MAX_MASK_VALUE the same way, so that a single madd
+ // per 32-bit lane yields d * m + r1 * MAX_MASK_VALUE.
+ const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+ const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+ const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+ // Narrow t back to 16 bits with signed saturation.
+ const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+ // t * t, summed over adjacent pairs into 32-bit lanes.
+ const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+ // Widen the 32-bit pair sums to 64 bits without sign extension (even lanes
+ // through the mask, odd lanes through a logical right shift) and add them.
+ const __m256i v_sum0_q = _mm256_add_epi64(
+ _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+ n += 16;
+ } while (n);
+
+ // Horizontal reduction: fold the two 64-bit halves of each 128-bit lane,
+ // then add the upper 128-bit lane onto the lower one.
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+ // The 64-bit extract is only used on x86-64 builds; other builds go through
+ // xx_storel_64 (presumably a store of the low 64 bits -- see synonyms.h).
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ *
+ * Computes the sum over N elements of ds[i] * m[i] and returns whether that
+ * sum is strictly greater than `limit`.
+ *
+ * N must be a multiple of 64 and smaller than 8192 (both asserted); 64
+ * elements are consumed per loop iteration. NOTE(review): the do/while loop
+ * always runs at least once, so this looks like it also requires N > 0 --
+ * confirm against callers.
+ */
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+ __m256i v_acc0_d = _mm256_setzero_si256();
+
+ // Input size limited to 8192 by the use of 32 bit accumulators and m
+ // being between [0, 64]. Overflow might happen at larger sizes,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ // 64 mask bytes and 64 16-bit residual deltas (unaligned loads).
+ const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+ const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+ const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+ const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+ const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+
+ // Zero-extend each group of 16 mask bytes to 16-bit words so it lines up
+ // with the matching vector of ds.
+ const __m256i v_m0_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+ const __m256i v_m1_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+ const __m256i v_m2_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+ const __m256i v_m3_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+ // ds * m, summed over adjacent pairs into 32-bit lanes.
+ const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+ const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+ const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+ const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+ // Pairwise tree of 32-bit additions, then fold into the accumulator.
+ const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+ const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+ const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+ v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ // Sign-extend the 32-bit partial sums to 64 bits: the arithmetic shift
+ // yields 0 or -1 per lane, and interleaving it as the upper half of each
+ // value produces the 64-bit two's-complement form.
+ __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+ v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ // Horizontal reduction: fold the two 64-bit halves of each 128-bit lane,
+ // then add the upper 128-bit lane onto the lower one.
+ __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+ // The 64-bit extract is only used on x86-64 builds; other builds go through
+ // xx_storel_64 (presumably a store of the low 64 bits -- see synonyms.h).
+#if AOM_ARCH_X86_64
+ acc = _mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+ return acc > limit;
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ *
+ * Writes d[i] = a[i] * a[i] - b[i] * b[i], saturated to the signed 16-bit
+ * range, for N elements.
+ *
+ * N must be a multiple of 64 (asserted); 64 elements are consumed per loop
+ * iteration. `a` and `b` are read with unaligned loads, but `d` is written
+ * with aligned 256-bit stores and therefore has to be 32-byte aligned.
+ * NOTE(review): the do/while loop always runs at least once, so this looks
+ * like it also requires N > 0 -- confirm against callers.
+ */
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ // Each 32-bit lane holds the word pair (low = +1, high = -1). Used as the
+ // sign operand below it keeps the low word and negates the high word.
+ const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
+ const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
+ const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
+ const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
+ const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
+ const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
+ const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
+ const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
+
+ // Interleave into (a, b) word pairs: a in the low word of every 32-bit
+ // lane, b in the high word.
+ const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
+ const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
+ const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
+ const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
+ const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
+ const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
+ const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
+ const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
+
+ // (a, b) . (a, -b) = a * a - b * b, one 32-bit result per element.
+ const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ // Narrow to 16 bits with signed saturation.
+ const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
+
+ // Aligned stores: d must be 32-byte aligned.
+ _mm256_store_si256((__m256i *)(d), v_r0_w);
+ _mm256_store_si256((__m256i *)(d + 16), v_r1_w);
+ _mm256_store_si256((__m256i *)(d + 32), v_r2_w);
+ _mm256_store_si256((__m256i *)(d + 48), v_r3_w);
+
+ a += 64;
+ b += 64;
+ d += 64;
+ N -= 64;
+ } while (N);
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000000..d7ac2223f2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ *
+ * Accumulates, over N elements, the square of
+ *   t = r1[i] * MAX_MASK_VALUE + d[i] * m[i]
+ * where t is saturated to the signed 16-bit range before squaring, and
+ * returns ROUND_POWER_OF_TWO(sum, 2 * WEDGE_WEIGHT_BITS).
+ *
+ * N must be a multiple of 64 (asserted). Each loop iteration consumes 16
+ * elements as two groups of 8. NOTE(review): the do/while loop always runs
+ * at least once, so this looks like it also requires N > 0 -- confirm
+ * against callers.
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ // The pointers are moved to the end of the arrays and indexed with negative
+ // offsets that count up to zero; n8 addresses the second group of 8 words.
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ // Mask used below to isolate the even 32-bit lanes when widening to 64 bits.
+ // NOTE(review): presumably 0x00000000ffffffff in every 64-bit lane -- confirm
+ // against xx_set1_64_from_32i().
+ const __m128i v_zext_q = xx_set1_64_from_32i(~0);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ // 2 x 8 residuals, 2 x 8 deltas and 16 mask bytes.
+ // NOTE(review): xx_load_128 is defined in synonyms.h; its alignment
+ // requirement is not visible here.
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ // Interleave (d, r1) word pairs; the mask bytes are zero-extended to words
+ // and interleaved with MAX_MASK_VALUE the same way, so that a single madd
+ // per 32-bit lane yields d * m + r1 * MAX_MASK_VALUE.
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ // Narrow t back to 16 bits with signed saturation.
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ // t * t, summed over adjacent pairs into 32-bit lanes.
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ // Widen the 32-bit pair sums to 64 bits without sign extension (even lanes
+ // through the mask, odd lanes through a logical right shift) and add them.
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ // Fold the upper 64-bit half onto the lower one.
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+ // The 64-bit register move is only used on x86-64 builds; other builds go
+ // through xx_storel_64 (presumably a store of the low 64 bits).
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ *
+ * Computes the sum over N elements of ds[i] * m[i] and returns whether that
+ * sum is strictly greater than `limit`.
+ *
+ * N must be a multiple of 64 and smaller than 8192 (both asserted); 64
+ * elements are consumed per loop iteration. NOTE(review): the do/while loop
+ * always runs at least once, so this looks like it also requires N > 0 --
+ * confirm against callers.
+ */
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ // Two independent 32-bit accumulators: one for the first 32 elements of
+ // each 64-element group, one for the last 32.
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+ // Input size limited to 8192 by the use of 32 bit accumulators and m
+ // being between [0, 64]. Overflow might happen at larger sizes,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ // 64 mask bytes and 64 16-bit residual deltas.
+ // NOTE(review): xx_load_128 is defined in synonyms.h; its alignment
+ // requirement is not visible here.
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ // Zero-extend the mask bytes to 16-bit words by interleaving with zero.
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ // ds * m, summed over adjacent pairs into 32-bit lanes.
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ // Pairwise tree of 32-bit additions, then fold into the accumulators.
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ // Sign-extend the 32-bit partial sums to 64 bits: the compare yields an
+ // all-ones word for negative lanes, and interleaving it as the upper half
+ // of each value produces the 64-bit two's-complement form.
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ // Fold the upper 64-bit half onto the lower one.
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+ // The 64-bit register move is only used on x86-64 builds; other builds go
+ // through xx_storel_64 (presumably a store of the low 64 bits).
+#if AOM_ARCH_X86_64
+ acc = _mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Conditional negation of 16-bit words. A word of v_v_w is negated where the
+// matching word of v_mask_w is all ones and passed through where it is zero:
+// (v ^ m) - m is two's-complement negation for m == -1, identity for m == 0.
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+  const __m128i v_flipped_w = _mm_xor_si128(v_v_w, v_mask_w);
+  return _mm_sub_epi16(v_flipped_w, v_mask_w);
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ *
+ * Writes d[i] = a[i] * a[i] - b[i] * b[i], saturated to the signed 16-bit
+ * range, for N elements.
+ *
+ * N must be a multiple of 64 (asserted); the loop itself consumes 32
+ * elements per iteration. NOTE(review): the do/while loop always runs at
+ * least once, so this looks like it also requires N > 0 -- confirm against
+ * callers.
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ // Mask with all ones in the upper word of every 32-bit lane and zero in the
+ // lower word; negm_epi16() negates only the words selected by it.
+ const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
+ (short)0xffff, 0, (short)0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ // NOTE(review): xx_load_128 is defined in synonyms.h; its alignment
+ // requirement is not visible here.
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ // Interleave into (a, b) word pairs: a in the low word of every 32-bit
+ // lane, b in the high word.
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ // (a, b) . (a, -b) = a * a - b * b, one 32-bit result per element.
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ // Narrow to 16 bits with signed saturation.
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ // NOTE(review): xx_store_128 is presumably an aligned 128-bit store, which
+ // would require d to be 16-byte aligned -- confirm in synonyms.h.
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 0000000000..5c8e0e09d1
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
new file mode 100644
index 0000000000..daabf6766d
--- /dev/null
+++ b/third_party/aom/av1/exports_dec
@@ -0,0 +1,3 @@
+data aom_codec_av1_dx_algo
+text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_enc b/third_party/aom/av1/exports_enc
new file mode 100644
index 0000000000..dc4a9eae79
--- /dev/null
+++ b/third_party/aom/av1/exports_enc
@@ -0,0 +1,2 @@
+data aom_codec_av1_cx_algo
+text aom_codec_av1_cx
diff --git a/third_party/aom/av1/exports_ident b/third_party/aom/av1/exports_ident
new file mode 100644
index 0000000000..b523a679d5
--- /dev/null
+++ b/third_party/aom/av1/exports_ident
@@ -0,0 +1,2 @@
+text ifd_init
+text ifd_inspect
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 0000000000..dab3775750
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd
diff --git a/third_party/aom/av1/ratectrl_rtc.cc b/third_party/aom/av1/ratectrl_rtc.cc
new file mode 100644
index 0000000000..83e88ba480
--- /dev/null
+++ b/third_party/aom/av1/ratectrl_rtc.cc
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/ratectrl_rtc.h"
+
+#include <memory>
+#include <new>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/svc_layercontext.h"
+
+namespace aom {
+
+// Fills the configuration with its defaults: a 1280x720 stream at 30 fps,
+// quantizer range [2, 63], one spatial and one temporal layer, and adaptive
+// quantization disabled. The per-layer arrays are cleared and only their
+// entry for layer 0 is populated.
+AV1RateControlRtcConfig::AV1RateControlRtcConfig() {
+  width = 1280;
+  height = 720;
+  max_quantizer = 63;
+  min_quantizer = 2;
+  target_bandwidth = 1000;
+  buf_initial_sz = 600;
+  buf_optimal_sz = 600;
+  buf_sz = 1000;
+  undershoot_pct = overshoot_pct = 50;
+  max_intra_bitrate_pct = 50;
+  max_inter_bitrate_pct = 0;
+  frame_drop_thresh = 0;
+  max_consec_drop = 0;
+  framerate = 30.0;
+  ss_number_layers = 1;
+  ts_number_layers = 1;
+  aq_mode = 0;
+
+  // Clear every per-layer array first, then write the layer-0 defaults.
+  // The order matters: a layer-0 value assigned before the matching
+  // av1_zero() call would be wiped out again.
+  av1_zero(max_quantizers);
+  av1_zero(min_quantizers);
+  av1_zero(scaling_factor_den);
+  av1_zero(scaling_factor_num);
+  av1_zero(layer_target_bitrate);
+  av1_zero(ts_rate_decimator);
+  layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+  ts_rate_decimator[0] = 1;
+  scaling_factor_num[0] = 1;
+  scaling_factor_den[0] = 1;
+  max_quantizers[0] = max_quantizer;
+  min_quantizers[0] = min_quantizer;
+}
+
+// Factory for a rate controller configured from `cfg`.
+//
+// Returns nullptr if any allocation fails or if InitRateControl() rejects the
+// configuration. Whatever has been allocated up to that point is already
+// attached to `rc_api`, so it is released by the destructor when the
+// unique_ptr goes out of scope on an early return.
+std::unique_ptr<AV1RateControlRTC> AV1RateControlRTC::Create(
+    const AV1RateControlRtcConfig &cfg) {
+  std::unique_ptr<AV1RateControlRTC> rc_api(new (std::nothrow)
+                                                AV1RateControlRTC());
+  if (rc_api == nullptr) return nullptr;
+
+  // Encoder context: 32-byte aligned and zero-filled.
+  AV1_COMP *const comp =
+      static_cast<AV1_COMP *>(aom_memalign(32, sizeof(*comp)));
+  rc_api->cpi_ = comp;
+  if (comp == nullptr) return nullptr;
+  av1_zero(*comp);
+
+  // Primary encoder context, allocated and cleared the same way.
+  AV1_PRIMARY *const primary =
+      static_cast<AV1_PRIMARY *>(aom_memalign(32, sizeof(AV1_PRIMARY)));
+  comp->ppi = primary;
+  if (primary == nullptr) return nullptr;
+  av1_zero(*primary);
+
+  // The sequence parameters live inside the primary context.
+  comp->common.seq_params = &primary->seq_params;
+  av1_zero(*comp->common.seq_params);
+
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+
+  if (cfg.aq_mode) {
+    // Adaptive quantization needs a per-mode-info segmentation map and the
+    // cyclic refresh state, both sized from the dimensions set up above.
+    const auto mi_rows = comp->common.mi_params.mi_rows;
+    const auto mi_cols = comp->common.mi_params.mi_cols;
+    comp->enc_seg.map = static_cast<uint8_t *>(
+        aom_calloc(mi_rows * mi_cols, sizeof(*comp->enc_seg.map)));
+    if (comp->enc_seg.map == nullptr) return nullptr;
+    comp->cyclic_refresh = av1_cyclic_refresh_alloc(mi_rows, mi_cols);
+    if (comp->cyclic_refresh == nullptr) return nullptr;
+  }
+  return rc_api;
+}
+
+// Frees the encoder context and everything hanging off it: the per-layer
+// maps (only when more than one layer is configured), the layer context
+// array, the cyclic-refresh state (only in CYCLIC_REFRESH_AQ mode), the
+// primary context and finally the context itself. Does nothing if the
+// context was never allocated.
+AV1RateControlRTC::~AV1RateControlRTC() {
+  if (!cpi_) return;
+
+  auto &svc = cpi_->svc;
+  const bool has_layers =
+      svc.number_spatial_layers > 1 || svc.number_temporal_layers > 1;
+  if (has_layers) {
+    for (int sl = 0; sl < svc.number_spatial_layers; sl++) {
+      for (int tl = 0; tl < svc.number_temporal_layers; tl++) {
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, svc.number_temporal_layers);
+        aom_free(svc.layer_context[layer].map);
+      }
+    }
+  }
+  aom_free(svc.layer_context);
+  svc.layer_context = nullptr;
+
+  if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+    aom_free(cpi_->enc_seg.map);
+    cpi_->enc_seg.map = nullptr;
+    av1_cyclic_refresh_free(cpi_->cyclic_refresh);
+  }
+
+  aom_free(cpi_->ppi);
+  aom_free(cpi_);
+}
+
+// One-time setup of the encoder context for external rate control.
+//
+// Applies the fixed settings (profile 0, 8-bit, realtime CBR, one pass),
+// then the caller's configuration through UpdateRateControl(), and finally
+// initializes the rate-control state. Returns false if UpdateRateControl()
+// rejects `rc_cfg`, true otherwise.
+bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) {
+  AV1_COMMON &cm = cpi_->common;
+  AV1EncoderConfig &oxcf = cpi_->oxcf;
+  RATE_CONTROL &rc = cpi_->rc;
+
+  // Sequence-level settings.
+  cm.seq_params->profile = PROFILE_0;
+  cm.seq_params->bit_depth = AOM_BITS_8;
+  cm.show_frame = 1;
+
+  // Encoder configuration that does not depend on rc_cfg, plus the AQ mode
+  // and frame-drop settings that do.
+  oxcf.profile = cm.seq_params->profile;
+  oxcf.mode = REALTIME;
+  oxcf.rc_cfg.mode = AOM_CBR;
+  oxcf.pass = AOM_RC_ONE_PASS;
+  oxcf.q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
+  oxcf.tune_cfg.content = AOM_CONTENT_DEFAULT;
+  oxcf.rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  rc.max_consec_drop = rc_cfg.max_consec_drop;
+  cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP;
+  oxcf.tool_cfg.bit_depth = AOM_BITS_8;
+  oxcf.tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
+  oxcf.algo_cfg.loopfilter_control = LOOPFILTER_ALL;
+  cm.current_frame.frame_number = 0;
+  cpi_->ppi->p_rc.kf_boost = DEFAULT_KF_BOOST_RT;
+
+  // Every target level index is set to SEQ_LEVEL_MAX and mirrored into the
+  // primary context.
+  for (auto &level_index : oxcf.target_seq_level_idx) {
+    level_index = SEQ_LEVEL_MAX;
+  }
+  memcpy(cpi_->ppi->level_params.target_seq_level_idx,
+         oxcf.target_seq_level_idx, sizeof(oxcf.target_seq_level_idx));
+
+  // Apply the caller-supplied configuration; bail out if it is rejected.
+  if (!UpdateRateControl(rc_cfg)) return false;
+
+  // The superblock size and SVC flag depend on the dimensions and layer
+  // counts that UpdateRateControl() has just stored.
+  set_sb_size(cm.seq_params,
+              av1_select_sb_size(&oxcf, cm.width, cm.height,
+                                 cpi_->svc.number_spatial_layers));
+  cpi_->ppi->use_svc = cpi_->svc.number_spatial_layers > 1 ||
+                       cpi_->svc.number_temporal_layers > 1;
+
+  av1_primary_rc_init(&oxcf, &cpi_->ppi->p_rc);
+  rc.rc_1_frame = 0;
+  rc.rc_2_frame = 0;
+  av1_rc_init_minq_luts();
+  av1_rc_init(&oxcf, &rc);
+
+  // Enable external rate control.
+  rc.rtc_external_ratectrl = 1;
+  cpi_->sf.rt_sf.use_nonrd_pick_mode = 1;
+  return true;
+}
+
+// Applies the settings in |rc_cfg| to the encoder instance driven by this
+// controller.
+//
+// Returns false when the spatial or temporal layer count lies outside
+// [1, AOM_MAX_SS_LAYERS] / [1, AOM_MAX_TS_LAYERS], or when
+// av1_alloc_layer_context() reports failure for a multi-layer configuration.
+// Returns true once every setting has been applied.
+bool AV1RateControlRTC::UpdateRateControl(
+    const AV1RateControlRtcConfig &rc_cfg) {
+  if (rc_cfg.ss_number_layers < 1 ||
+      rc_cfg.ss_number_layers > AOM_MAX_SS_LAYERS ||
+      rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > AOM_MAX_TS_LAYERS) {
+    return false;
+  }
+  const int num_layers = rc_cfg.ss_number_layers * rc_cfg.ts_number_layers;
+  if (num_layers > 1 && !av1_alloc_layer_context(cpi_, num_layers)) {
+    return false;
+  }
+  AV1_COMMON *cm = &cpi_->common;
+  AV1EncoderConfig *oxcf = &cpi_->oxcf;
+  RATE_CONTROL *const rc = &cpi_->rc;
+  // Remember the configured size; ComputeQP() derives per-layer resolutions
+  // from it.
+  initial_width_ = rc_cfg.width;
+  initial_height_ = rc_cfg.height;
+  cm->width = rc_cfg.width;
+  cm->height = rc_cfg.height;
+  oxcf->frm_dim_cfg.width = rc_cfg.width;
+  oxcf->frm_dim_cfg.height = rc_cfg.height;
+  oxcf->rc_cfg.worst_allowed_q = av1_quantizer_to_qindex(rc_cfg.max_quantizer);
+  oxcf->rc_cfg.best_allowed_q = av1_quantizer_to_qindex(rc_cfg.min_quantizer);
+  rc->worst_quality = oxcf->rc_cfg.worst_allowed_q;
+  rc->best_quality = oxcf->rc_cfg.best_allowed_q;
+  oxcf->input_cfg.init_framerate = rc_cfg.framerate;
+  // The configured bandwidth is scaled by 1000; saturate instead of
+  // overflowing int64_t.
+  oxcf->rc_cfg.target_bandwidth = rc_cfg.target_bandwidth > INT64_MAX / 1000
+                                      ? INT64_MAX
+                                      : 1000 * rc_cfg.target_bandwidth;
+  oxcf->rc_cfg.starting_buffer_level_ms = rc_cfg.buf_initial_sz;
+  oxcf->rc_cfg.optimal_buffer_level_ms = rc_cfg.buf_optimal_sz;
+  oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz;
+  oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  rc->max_consec_drop = rc_cfg.max_consec_drop;
+  oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  if (rc_cfg.is_screen) {
+    cpi_->oxcf.tune_cfg.content = AOM_CONTENT_SCREEN;
+    cpi_->is_screen_content_type = 1;
+  }
+  cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
+  cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
+  set_primary_rc_buffer_sizes(oxcf, cpi_->ppi);
+  enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8);
+  av1_new_framerate(cpi_, cpi_->framerate);
+  if (cpi_->svc.number_temporal_layers > 1 ||
+      cpi_->svc.number_spatial_layers > 1) {
+    int64_t target_bandwidth_svc = 0;
+    for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
+      for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers);
+        LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+        RATE_CONTROL *const lrc = &lc->rc;
+        // The per-layer bitrate is an int scaled by 1000. Saturate at INT_MAX
+        // rather than overflowing int arithmetic, mirroring the clamp applied
+        // to target_bandwidth above.
+        const int layer_bitrate = rc_cfg.layer_target_bitrate[layer];
+        lc->layer_target_bitrate =
+            layer_bitrate > INT_MAX / 1000 ? INT_MAX : 1000 * layer_bitrate;
+        lc->max_q = rc_cfg.max_quantizers[layer];
+        lc->min_q = rc_cfg.min_quantizers[layer];
+        lrc->worst_quality =
+            av1_quantizer_to_qindex(rc_cfg.max_quantizers[layer]);
+        lrc->best_quality =
+            av1_quantizer_to_qindex(rc_cfg.min_quantizers[layer]);
+        lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl];
+        lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl];
+        lc->framerate_factor = rc_cfg.ts_rate_decimator[tl];
+        // Only the highest temporal layer of each spatial layer contributes
+        // to the total handed to av1_update_layer_context_change_config().
+        if (tl == cpi_->svc.number_temporal_layers - 1)
+          target_bandwidth_svc += lc->layer_target_bitrate;
+      }
+    }
+
+    if (cm->current_frame.frame_number == 0) av1_init_layer_context(cpi_);
+    // This is needed to initialize external RC flag in layer context structure.
+    cpi_->rc.rtc_external_ratectrl = 1;
+    av1_update_layer_context_change_config(cpi_, target_bandwidth_svc);
+  }
+  check_reset_rc_flag(cpi_);
+  return true;
+}
+
+// Runs the rate-control decision for the frame described by |frame_params|.
+//
+// Returns FrameDropDecision::kDrop when the frame is to be dropped (the
+// drop-frame post-encode update has then already been applied here), or
+// FrameDropDecision::kOk after a base q index has been stored in
+// cpi_->common.quant_params.base_qindex (the value GetQP() returns).
+FrameDropDecision AV1RateControlRTC::ComputeQP(
+ const AV1FrameParamsRTC &frame_params) {
+ AV1_COMMON *const cm = &cpi_->common;
+ // Only written (and read) in the multi-spatial-layer branch below.
+ int width, height;
+ GF_GROUP *const gf_group = &cpi_->ppi->gf_group;
+ cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+ cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+ // With several spatial layers, the frame size is derived from the configured
+ // size and the current layer's scaling factors.
+ if (cpi_->svc.number_spatial_layers > 1) {
+ const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+ cpi_->svc.temporal_layer_id,
+ cpi_->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+ av1_get_layer_resolution(initial_width_, initial_height_,
+ lc->scaling_factor_num, lc->scaling_factor_den,
+ &width, &height);
+ cm->width = width;
+ cm->height = height;
+ }
+ enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8);
+ cm->current_frame.frame_type = frame_params.frame_type;
+ // The golden-frame refresh flag is set for key frames only.
+ cpi_->refresh_frame.golden_frame =
+ (cm->current_frame.frame_type == KEY_FRAME) ? 1 : 0;
+ cpi_->sf.rt_sf.use_nonrd_pick_mode = 1;
+
+ // Describe the frame in the GF group entry at the current gf_frame_index and
+ // flag the layer context as key / non-key when SVC is in use.
+ if (frame_params.frame_type == kKeyFrame) {
+ gf_group->update_type[cpi_->gf_frame_index] = KF_UPDATE;
+ gf_group->frame_type[cpi_->gf_frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_RESET;
+ if (cpi_->ppi->use_svc) {
+ const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+ cpi_->svc.temporal_layer_id,
+ cpi_->svc.number_temporal_layers);
+ if (cm->current_frame.frame_number > 0)
+ av1_svc_reset_temporal_layers(cpi_, 1);
+ cpi_->svc.layer_context[layer].is_key_frame = 1;
+ }
+ } else {
+ gf_group->update_type[cpi_->gf_frame_index] = LF_UPDATE;
+ gf_group->frame_type[cpi_->gf_frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_UPDATE;
+ if (cpi_->ppi->use_svc) {
+ const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+ cpi_->svc.temporal_layer_id,
+ cpi_->svc.number_temporal_layers);
+ cpi_->svc.layer_context[layer].is_key_frame = 0;
+ }
+ }
+ // frames_since_key advances only on the highest spatial layer.
+ if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+ cpi_->rc.frames_since_key++;
+ if (cpi_->svc.number_spatial_layers > 1 ||
+ cpi_->svc.number_temporal_layers > 1) {
+ av1_update_temporal_layer_framerate(cpi_);
+ av1_restore_layer_context(cpi_);
+ }
+ // A frame target is computed only in CBR mode; otherwise it stays 0.
+ int target = 0;
+ if (cpi_->oxcf.rc_cfg.mode == AOM_CBR) {
+ if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi_);
+ if (frame_is_intra_only(cm)) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi_);
+ // Intra-only frames restart the frame counter.
+ cpi_->common.current_frame.frame_number = 0;
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(
+ cpi_, gf_group->update_type[cpi_->gf_frame_index]);
+ }
+ }
+ av1_rc_set_frame_target(cpi_, target, cm->width, cm->height);
+ // Always drop for spatial enhancement layer if layer bandwidth is 0.
+ // Otherwise check for frame-dropping based on buffer level in
+ // av1_rc_drop_frame().
+ if ((cpi_->svc.spatial_layer_id > 0 &&
+ cpi_->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi_)) {
+ cpi_->is_dropped_frame = true;
+ av1_rc_postencode_update_drop_frame(cpi_);
+ cpi_->frame_index_set.show_frame_count++;
+ cpi_->common.current_frame.frame_number++;
+ return FrameDropDecision::kDrop;
+ }
+ // Frame is kept: pick the base q index (bottom/top bounds are not used
+ // further here).
+ int bottom_index = 0, top_index = 0;
+ cpi_->common.quant_params.base_qindex =
+ av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height,
+ cpi_->gf_frame_index, &bottom_index, &top_index);
+ if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_setup(cpi_);
+ return FrameDropDecision::kOk;
+}
+
+// Returns the base q index currently stored in the encoder's quantization
+// parameters (ComputeQP() writes it when a frame is not dropped).
+int AV1RateControlRTC::GetQP() const {
+  const auto &quant_params = cpi_->common.quant_params;
+  return quant_params.base_qindex;
+}
+
+// Picks loop filter levels with LPF_PICK_FROM_Q and returns a copy of the
+// resulting luma (two entries), U and V levels.
+AV1LoopfilterLevel AV1RateControlRTC::GetLoopfilterLevel() const {
+  av1_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q);
+
+  const auto &lf = cpi_->common.lf;
+  AV1LoopfilterLevel levels;
+  for (int i = 0; i < 2; ++i) {
+    levels.filter_level[i] = lf.filter_level[i];
+  }
+  levels.filter_level_u = lf.filter_level_u;
+  levels.filter_level_v = lf.filter_level_v;
+  return levels;
+}
+
+// Runs av1_pick_cdef_from_qp() and returns a copy of the first luma strength,
+// the first chroma strength and the damping value it leaves in cdef_info.
+AV1CdefInfo AV1RateControlRTC::GetCdefInfo() const {
+  av1_pick_cdef_from_qp(&cpi_->common, 0, 0);
+
+  const auto &cdef = cpi_->common.cdef_info;
+  AV1CdefInfo info;
+  info.cdef_strength_y = cdef.cdef_strengths[0];
+  info.cdef_strength_uv = cdef.cdef_uv_strengths[0];
+  info.damping = cdef.cdef_damping;
+  return info;
+}
+
+// Fills |segmentation_data| with pointers into the encoder's segmentation
+// state. The pointers reference memory owned by the encoder instance; nothing
+// is copied.
+//
+// Returns false, without touching |segmentation_data|, when the output
+// pointer is null, when aq_mode is 0, or when no cyclic refresh state is
+// available. Returns true otherwise.
+bool AV1RateControlRTC::GetSegmentationData(
+    AV1SegmentationData *segmentation_data) const {
+  // Guard the output pointer: the function already has a "no data" result.
+  if (segmentation_data == nullptr) {
+    return false;
+  }
+  if (cpi_->oxcf.q_cfg.aq_mode == 0) {
+    return false;
+  }
+  // qindex_delta is read through this pointer below.
+  if (cpi_->cyclic_refresh == nullptr) {
+    return false;
+  }
+  segmentation_data->segmentation_map = cpi_->enc_seg.map;
+  segmentation_data->segmentation_map_size =
+      cpi_->common.mi_params.mi_rows * cpi_->common.mi_params.mi_cols;
+  segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+  // NOTE(review): hard-coded entry count; presumably the length of
+  // qindex_delta - confirm against its declaration.
+  segmentation_data->delta_q_size = 3u;
+  return true;
+}
+
+// Feeds the size of the frame that was just encoded back into rate control
+// and advances the frame counter. Layer context is saved afterwards when more
+// than one spatial or temporal layer is configured.
+void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  auto &svc = cpi_->svc;
+
+  ++cpi_->common.current_frame.frame_number;
+
+  const bool is_top_spatial_layer =
+      svc.spatial_layer_id == svc.number_spatial_layers - 1;
+  if (is_top_spatial_layer) {
+    svc.prev_number_spatial_layers = svc.number_spatial_layers;
+  }
+
+  av1_rc_postencode_update(cpi_, encoded_frame_size);
+
+  if (svc.number_spatial_layers > 1 || svc.number_temporal_layers > 1) {
+    av1_save_layer_context(cpi_);
+  }
+}
+
+} // namespace aom
diff --git a/third_party/aom/av1/ratectrl_rtc.h b/third_party/aom/av1/ratectrl_rtc.h
new file mode 100644
index 0000000000..1894469dd1
--- /dev/null
+++ b/third_party/aom/av1/ratectrl_rtc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_RATECTRL_RTC_H_
+#define AOM_AV1_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+struct AV1_COMP;
+
+namespace aom {
+
+// These constants come from AV1 spec.
+static constexpr size_t kAV1MaxLayers = 32;
+static constexpr size_t kAV1MaxTemporalLayers = 8;
+static constexpr size_t kAV1MaxSpatialLayers = 4;
+
+// Frame type passed in through AV1FrameParamsRTC. ComputeQP() treats
+// kKeyFrame as a key-frame update and any other value as an inter frame.
+enum FrameType { kKeyFrame, kInterFrame };
+
+// Settings consumed by AV1RateControlRTC::Create() and UpdateRateControl().
+struct AV1RateControlRtcConfig {
+ public:
+ AV1RateControlRtcConfig();
+
+ // Frame dimensions; also used as the base size for per-layer resolutions.
+ int width;
+ int height;
+ // Flag indicating if the content is screen or not.
+ bool is_screen = false;
+ // 0-63
+ int max_quantizer;
+ int min_quantizer;
+ // Multiplied by 1000 (saturating at INT64_MAX) before being handed to the
+ // encoder - presumably kilobits per second; confirm with the constructor.
+ int64_t target_bandwidth;
+ // Buffer levels; stored into the encoder's *_ms (millisecond) settings.
+ int64_t buf_initial_sz;
+ int64_t buf_optimal_sz;
+ int64_t buf_sz;
+ int undershoot_pct;
+ int overshoot_pct;
+ int max_intra_bitrate_pct;
+ int max_inter_bitrate_pct;
+ // Stored as the encoder's drop_frames_water_mark.
+ int frame_drop_thresh;
+ int max_consec_drop;
+ double framerate;
+ // Indexed by layer (spatial/temporal pair); multiplied by 1000 when applied.
+ int layer_target_bitrate[kAV1MaxLayers];
+ // Indexed by temporal layer; becomes the layer's framerate_factor.
+ int ts_rate_decimator[kAV1MaxTemporalLayers];
+ int aq_mode;
+ // Number of spatial layers
+ int ss_number_layers;
+ // Number of temporal layers
+ int ts_number_layers;
+ // Per-layer quantizer bounds, indexed like layer_target_bitrate.
+ int max_quantizers[kAV1MaxLayers];
+ int min_quantizers[kAV1MaxLayers];
+ // Indexed by spatial layer.
+ int scaling_factor_num[kAV1MaxSpatialLayers];
+ int scaling_factor_den[kAV1MaxSpatialLayers];
+};
+
+// Per-frame input to AV1RateControlRTC::ComputeQP(): the frame type and the
+// spatial/temporal layer the frame belongs to.
+struct AV1FrameParamsRTC {
+ FrameType frame_type;
+ int spatial_layer_id;
+ int temporal_layer_id;
+};
+
+// Loop filter levels returned by AV1RateControlRTC::GetLoopfilterLevel():
+// two luma levels plus one level each for the U and V planes.
+struct AV1LoopfilterLevel {
+ int filter_level[2];
+ int filter_level_u;
+ int filter_level_v;
+};
+
+// CDEF parameters returned by AV1RateControlRTC::GetCdefInfo(): the first
+// luma strength, the first chroma strength and the damping value.
+struct AV1CdefInfo {
+ int cdef_strength_y;
+ int cdef_strength_uv;
+ int damping;
+};
+
+// Output of AV1RateControlRTC::GetSegmentationData(). Both pointers refer to
+// memory held by the rate controller's encoder state; they are not copies.
+struct AV1SegmentationData {
+ const uint8_t *segmentation_map;
+ // Set to mi_rows * mi_cols of the current frame.
+ size_t segmentation_map_size;
+ const int *delta_q;
+ // Number of entries in delta_q (GetSegmentationData() sets it to 3).
+ size_t delta_q_size;
+};
+
+// Result of AV1RateControlRTC::ComputeQP().
+enum class FrameDropDecision {
+ kOk, // Frame is encoded.
+ kDrop, // Frame is dropped.
+};
+
+// Stand-alone real-time rate controller. Instances are obtained through
+// Create(); the constructor is private.
+class AV1RateControlRTC {
+ public:
+  static std::unique_ptr<AV1RateControlRTC> Create(
+      const AV1RateControlRtcConfig &cfg);
+  ~AV1RateControlRTC();
+
+  // Applies a new configuration. Returns false if the layer counts in
+  // |rc_cfg| are out of range or layer context allocation fails.
+  bool UpdateRateControl(const AV1RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // GetLoopfilterLevel() needs to be called after ComputeQP()
+  AV1LoopfilterLevel GetLoopfilterLevel() const;
+  // GetCdefInfo() needs to be called after ComputeQP()
+  AV1CdefInfo GetCdefInfo() const;
+  // Returns the segmentation map used for cyclic refresh, based on 4x4 blocks.
+  // Returns false (no data) when aq_mode is 0.
+  bool GetSegmentationData(AV1SegmentationData *segmentation_data) const;
+  // ComputeQP computes the QP for the frame and returns kOk if the frame is
+  // not dropped; the QP is then available through GetQP(). Otherwise it
+  // returns kDrop and subsequent GetQP and PostEncodeUpdate are not to be
+  // called (av1_rc_postencode_update_drop_frame is already called via
+  // ComputeQP if drop is decided).
+  FrameDropDecision ComputeQP(const AV1FrameParamsRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  AV1RateControlRTC() = default;
+  bool InitRateControl(const AV1RateControlRtcConfig &cfg);
+  // Default member initializers keep the object in a defined state even when
+  // it is default-initialized rather than value-initialized.
+  AV1_COMP *cpi_ = nullptr;
+  int initial_width_ = 0;
+  int initial_height_ = 0;
+};
+
+} // namespace aom
+
+#endif // AOM_AV1_RATECTRL_RTC_H_
diff --git a/third_party/aom/build/cmake/aom_config.c.template b/third_party/aom/build/cmake/aom_config.c.template
new file mode 100644
index 0000000000..93a6d8f1ad
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_config.c.template
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) @year@, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_codec.h"
+/* ${AOM_CMAKE_CONFIG} is a placeholder for the cmake command line assembled
+ * in aom_configure.cmake; presumably substituted when this template is
+ * configured. */
+static const char* const cfg = "${AOM_CMAKE_CONFIG}";
+/* Returns the build configuration string stored above. */
+const char *aom_codec_build_config(void) {return cfg;}
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
new file mode 100644
index 0000000000..da7de4b0f4
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -0,0 +1,235 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# This file sets default values for libaom configuration variables. All libaom
+# config variables are added to the CMake variable cache via the macros provided
+# in util.cmake.
+
+#
+# The variables in this section of the file are detected at configuration time,
+# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined
+# in this file.
+#
+
+set_aom_detect_var(INLINE "" "Sets INLINE value for current target.")
+
+# CPUs.
+set_aom_detect_var(AOM_ARCH_AARCH64 0 "Enables AArch64 architecture.")
+set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.")
+set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.")
+set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.")
+set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.")
+
+# Arm/AArch64 feature flags.
+set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.")
+set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.")
+set_aom_detect_var(HAVE_NEON_DOTPROD 0
+ "Enables Armv8.2-A Neon dotprod intrinsics optimizations.")
+set_aom_detect_var(HAVE_NEON_I8MM 0
+ "Enables Armv8.2-A Neon i8mm intrinsics optimizations.")
+set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.")
+
+# PPC feature flags.
+set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.")
+
+# x86/x86_64 feature flags.
+set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. ")
+set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.")
+set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.")
+set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.")
+set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.")
+set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.")
+set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.")
+set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.")
+set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.")
+
+# Flags describing the build environment.
+# aom_configure.cmake overwrites HAVE_FEXCEPT, HAVE_PTHREAD_H and HAVE_UNISTD_H
+# with the results of its fenv.h, threads and unistd.h checks.
+set_aom_detect_var(HAVE_FEXCEPT 0
+ "Internal flag, GNU fenv.h present for target.")
+set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.")
+set_aom_detect_var(HAVE_UNISTD_H 0
+ "Internal flag, unistd.h present for target.")
+set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.")
+
+#
+# Variables in this section can be set from the CMake command line or from
+# within the CMake GUI. The variables control libaom features.
+#
+
+# Build configuration flags.
+set_aom_config_var(AOM_RTCD_FLAGS ""
+ "Arguments to pass to rtcd.pl. Separate with ';'")
+set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.")
+set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.")
+set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.")
+set_aom_config_var(CONFIG_FPMT_TEST 0 "Enable FPMT testing.")
+# aom_configure.cmake sets CONFIG_GCC to 1 for non-MSVC builds whose C compiler
+# ID matches GNU or Clang.
+set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).")
+set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.")
+set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.")
+set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.")
+
+set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1
+ "Build with high bitdepth support.")
+set_aom_config_var(CONFIG_AV1_TEMPORAL_DENOISING 0
+ "Build with temporal denoising support.")
+set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.")
+# aom_configure.cmake sets CONFIG_OS_SUPPORT to 1 when the target system is
+# Darwin, Linux, Windows or Android.
+set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.")
+# aom_configure.cmake turns CONFIG_PIC (and CONFIG_SHARED) on when
+# BUILD_SHARED_LIBS is set, and CONFIG_PIC alone for PIE builds.
+set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.")
+set_aom_config_var(CONFIG_REALTIME_ONLY 0
+ "Build for RTC-only. See aomcx.h for all disabled features.")
+set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.")
+set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.")
+set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.")
+
+# Debugging flags.
+# aom_configure.cmake sets CONFIG_DEBUG to 1 when CMAKE_BUILD_TYPE is "debug"
+# (compared case-insensitively).
+set_aom_config_var(CONFIG_DEBUG 0 "Enable debug-only code.")
+set_aom_config_var(CONFIG_EXCLUDE_SIMD_MISMATCH 0
+ "Exclude mismatch in SIMD functions for testing/debugging.")
+set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.")
+
+# AV1 feature flags.
+set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.")
+# CONFIG_ANALYZER makes aom_configure.cmake require the wxWidgets package.
+set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.")
+set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0
+ "Coefficient range check.")
+set_aom_config_var(CONFIG_DENOISE 1
+ "Denoise/noise modeling support in encoder.")
+set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.")
+set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.")
+# Setting FORCE_HIGHBITDEPTH_DECODING makes aom_configure.cmake force
+# CONFIG_AV1_HIGHBITDEPTH to 1 (with a warning) if it was disabled.
+set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0
+ "Force high bitdepth decoding pipeline on 8-bit input.")
+mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING)
+set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2
+ "Max profile to support decoding.")
+set_aom_config_var(
+ CONFIG_NORMAL_TILE_MODE 0
+ "Only enables general decoding (disables large scale tile decoding).")
+# CONFIG_SIZE_LIMIT requires both DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT;
+# setting either limit makes aom_configure.cmake switch CONFIG_SIZE_LIMIT on.
+set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.")
+set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.")
+set_aom_config_var(CONFIG_TUNE_BUTTERAUGLI 0
+ "Enable encoding tuning for Butteraugli.")
+set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.")
+set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.")
+set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.")
+set_aom_config_var(STATIC_LINK_JXL 0 "Statically link the JPEG-XL library.")
+
+# AV1 experiment flags.
+set_aom_config_var(CONFIG_BITRATE_ACCURACY 0
+ "AV1 experiment: Improve bitrate accuracy.")
+set_aom_config_var(
+ CONFIG_BITRATE_ACCURACY_BL 0
+ "AV1 experiment: Baseline of improve bitrate accuracy experiment.")
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0
+ "AV1 experiment: Bitstream debugging.")
+set_aom_config_var(
+ CONFIG_COLLECT_COMPONENT_TIMING 0
+ "AV1 experiment: Collect encoding component timing information.")
+set_aom_config_var(
+ CONFIG_COLLECT_PARTITION_STATS 0
+ "AV1 experiment: Collect partition timing stats. Can be 1 or 2.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment.")
+set_aom_config_var(
+ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
+ "AV1 experiment: Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_NN_V2 0
+ "AV1 experiment: Fully-connected neural nets ver.2.")
+set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0
+ "AV1 experiment: for optical flow API.")
+set_aom_config_var(CONFIG_PARTITION_SEARCH_ORDER 0
+ "AV1 experiment: Use alternative partition search order.")
+set_aom_config_var(CONFIG_RATECTRL_LOG 0
+ "AV1 experiment: Log rate control decision.")
+set_aom_config_var(CONFIG_RD_COMMAND 0
+ "AV1 experiment: Use external rdmult and q_index.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment.")
+set_aom_config_var(
+ CONFIG_RT_ML_PARTITIONING 0
+ "AV1 experiment: Build with ML-based partitioning for Real Time.")
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_TFLITE 0
+ "AV1 experiment: Enable tensorflow lite library.")
+set_aom_config_var(CONFIG_THREE_PASS 0
+ "AV1 experiment: Enable three-pass encoding.")
+set_aom_config_var(CONFIG_OUTPUT_FRAME_SIZE 0
+ "AV1 experiment: Output frame size information.")
+set_aom_config_var(
+ CONFIG_SALIENCY_MAP 0
+ "AV1 experiment: Enable saliency map based encoding tuning for VMAF.")
+set_aom_config_var(CONFIG_CWG_C013 0
+ "AV1 experiment: Support for 7.x and 8.x levels.")
+# Makes the PSNR reported by aomenc consistent with libvmaf results.
+set_aom_config_var(CONFIG_LIBVMAF_PSNR_PEAK 1
+ "Use libvmaf PSNR peak for 10- and 12-bit")
+
+#
+# Variables in this section control optional features of the build system.
+#
+# ENABLE_CCACHE, ENABLE_DISTCC and ENABLE_GOMA each make aom_configure.cmake
+# install the matching compiler launcher (ccache, distcc, gomacc).
+set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF)
+set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF)
+set_aom_option_var(ENABLE_DOCS
+ "Enable documentation generation (doxygen required)." ON)
+set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON)
+set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF)
+set_aom_option_var(
+ ENABLE_IDE_TEST_HOSTING
+ "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
+# On x86 targets aom_configure.cmake searches for yasm first; nasm is searched
+# for when yasm is missing or ENABLE_NASM is set.
+set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets."
+ ON)
+set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON)
+set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory."
+ ON)
+set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
+ OFF)
+
+# Arm/AArch64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_NEON
+ "Enables Neon optimizations on Arm/AArch64 targets." ON)
+set_aom_option_var(ENABLE_ARM_CRC32 "Enables Arm CRC32 optimizations." ON)
+set_aom_option_var(
+ ENABLE_NEON_DOTPROD
+ "Enables Armv8.2-A Neon dotprod optimizations on AArch64 targets." ON)
+set_aom_option_var(
+ ENABLE_NEON_I8MM
+ "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON)
+set_aom_option_var(ENABLE_SVE
+ "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON)
+
+# VSX intrinsics flags.
+set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
+ ON)
+
+# x86/x86_64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets."
+ ON)
+set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets."
+ ON)
+set_aom_option_var(ENABLE_SSE2
+ "Enables SSE2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE3
+ "Enables SSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSSE3
+ "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_1
+ "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_2
+ "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets."
+ ON)
+set_aom_option_var(ENABLE_AVX2
+ "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
new file mode 100644
index 0000000000..917e7cac5d
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -0,0 +1,489 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1)
+
+include(FindThreads)
+
+include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake")
+include("${AOM_ROOT}/build/cmake/aom_optimization.cmake")
+include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
+include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+if(DEFINED CONFIG_LOWBITDEPTH)
+ message(WARNING "CONFIG_LOWBITDEPTH has been removed. \
+ Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \
+ and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.")
+ if(NOT CONFIG_LOWBITDEPTH)
+ set(FORCE_HIGHBITDEPTH_DECODING
+ 1
+ CACHE STRING "${cmake_cmdline_helpstring}" FORCE)
+ endif()
+endif()
+
+if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH)
+ change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1
+ "FORCE_HIGHBITDEPTH_DECODING")
+endif()
+
+if(CONFIG_THREE_PASS AND NOT CONFIG_AV1_DECODER)
+ change_config_and_warn(CONFIG_THREE_PASS 0 "CONFIG_AV1_DECODER=0")
+endif()
+
+# Generate the user config settings.
+list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
+foreach(cache_var ${aom_build_vars})
+ get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
+ if(cache_var_helpstring STREQUAL cmake_cmdline_helpstring)
+ set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
+ endif()
+endforeach()
+string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
+
+# Detect target CPU.
+# Runs only when AOM_TARGET_CPU was not supplied. CMAKE_SYSTEM_PROCESSOR is
+# lower-cased and mapped to x86, x86_64, an arm* name, arm64, ppc or generic.
+if(NOT AOM_TARGET_CPU)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+ if(cpu_lowercase STREQUAL "amd64" OR cpu_lowercase STREQUAL "x86_64")
+ # A 64-bit processor name with 4-byte pointers is treated as an x86 target.
+ if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+ set(AOM_TARGET_CPU "x86")
+ elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ set(AOM_TARGET_CPU "x86_64")
+ else()
+ message(
+ FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
+ " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
+ " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
+ " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
+ endif()
+ elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86")
+ set(AOM_TARGET_CPU "x86")
+ # Names starting with "arm" are passed through unchanged.
+ elseif(cpu_lowercase MATCHES "^arm")
+ set(AOM_TARGET_CPU "${cpu_lowercase}")
+ elseif(cpu_lowercase MATCHES "aarch64")
+ set(AOM_TARGET_CPU "arm64")
+ elseif(cpu_lowercase MATCHES "^ppc")
+ set(AOM_TARGET_CPU "ppc")
+ else()
+ message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
+ "supported, falling back to the generic target")
+ set(AOM_TARGET_CPU "generic")
+ endif()
+endif()
+
+# Finish AOM_CMAKE_CONFIG, the reproduction of the cmake command line: prepend
+# the toolchain file (or, without one, the detected CPU), then the generator,
+# then "cmake <path to source>".
+if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string.
+ # Absolute toolchain paths are recorded relative to AOM_CONFIG_DIR.
+ if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}")
+ file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}"
+ "${CMAKE_TOOLCHAIN_FILE}")
+ else()
+ set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}")
+ endif()
+ set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"")
+ set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}")
+else()
+
+ # Add detected CPU to the config string.
+ set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}")
+endif()
+set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}")
+file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}")
+set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}")
+string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
+
+message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
+set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME})
+
+string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+if(build_type_lowercase STREQUAL "debug")
+ set(CONFIG_DEBUG 1)
+endif()
+
+# Shared library builds always enable PIC. Otherwise, unless the user set
+# CONFIG_PIC on the command line, enable it when the compiler is already
+# producing position independent executables.
+if(BUILD_SHARED_LIBS)
+ set(CONFIG_PIC 1)
+ set(CONFIG_SHARED 1)
+elseif(NOT CONFIG_PIC)
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line. This allows the user to
+ # force CONFIG_PIC=0.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE CONFIG_PIC PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ # The test source only compiles when __pie__ or __PIE__ is defined.
+ aom_check_c_compiles("pie_check" "
+ #if !(__pie__ || __PIE__)
+ #error Neither __pie__ or __PIE__ are set
+ #endif
+ extern void unused(void);
+ void unused(void) {}" HAVE_PIE)
+
+ if(HAVE_PIE)
+ # If -fpie or -fPIE are used ensure the assembly code has PIC enabled to
+ # avoid DT_TEXTRELs: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE
+ set(CONFIG_PIC 1)
+ message(
+ "CONFIG_PIC enabled for position independent executable (PIE) build")
+ endif()
+ endif()
+ unset(cache_helpstring)
+endif()
+
+if(NOT MSVC)
+ if(CONFIG_PIC)
+
+ # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
+ # work.
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ if(AOM_TARGET_SYSTEM STREQUAL "Linux"
+ AND AOM_TARGET_CPU MATCHES "^armv[78]")
+ set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
+ else()
+ set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
+ endif()
+ endif()
+endif()
+
+# Locate and enable the assembler for the target CPU.
+#
+# x86/x86_64: look for yasm first; fall back to nasm when yasm is missing or
+# ENABLE_NASM is set. Configuration fails if neither is found.
+# Arm targets: default CMAKE_ASM_COMPILER per target system unless the user
+# already supplied one, then enable the ASM language.
+if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64")
+  find_program(CMAKE_ASM_NASM_COMPILER yasm $ENV{YASM_PATH})
+  if(NOT CMAKE_ASM_NASM_COMPILER OR ENABLE_NASM)
+    unset(CMAKE_ASM_NASM_COMPILER CACHE)
+    find_program(CMAKE_ASM_NASM_COMPILER nasm $ENV{NASM_PATH})
+  endif()
+
+  include(CheckLanguage)
+  check_language(ASM_NASM)
+  if(CMAKE_ASM_NASM_COMPILER)
+    get_asm_obj_format("objformat")
+    unset(CMAKE_ASM_NASM_OBJECT_FORMAT)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT ${objformat})
+    enable_language(ASM_NASM)
+    if(CMAKE_ASM_NASM_COMPILER_ID STREQUAL "NASM")
+      test_nasm()
+    endif()
+    # Xcode requires building the objects manually, so pass the object format
+    # flag.
+    if(XCODE)
+      set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
+    endif()
+  else()
+    message(
+      FATAL_ERROR
+        "Unable to find assembler. Install 'yasm' or 'nasm.' "
+        "To build without optimizations, add -DAOM_TARGET_CPU=generic to "
+        "your cmake command line.")
+  endif()
+  string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
+elseif(AOM_TARGET_CPU MATCHES "arm")
+  if(AOM_TARGET_SYSTEM STREQUAL "Darwin")
+    if(NOT CMAKE_ASM_COMPILER)
+      set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+    endif()
+    set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
+  elseif(AOM_TARGET_SYSTEM STREQUAL "Windows")
+    if(NOT CMAKE_ASM_COMPILER)
+      set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
+    endif()
+  else()
+    if(NOT CMAKE_ASM_COMPILER)
+      set(CMAKE_ASM_COMPILER as)
+    endif()
+  endif()
+  include(CheckLanguage)
+  check_language(ASM)
+  if(NOT CMAKE_ASM_COMPILER)
+    # message() concatenates its arguments without a separator, so every
+    # fragment that is followed by another one ends with a space.
+    # NOTE(review): ${CMAKE_ASM_COMPILER} is empty whenever this branch runs,
+    # so "Searched for" prints no name - confirm the intended variable.
+    message(
+      FATAL_ERROR
+        "Unable to find assembler and optimizations are enabled. "
+        "Searched for ${CMAKE_ASM_COMPILER}. Install it, add it to your path, "
+        "or set the assembler directly by adding "
+        "-DCMAKE_ASM_COMPILER=<assembler path> to your CMake command line. "
+        "To build without optimizations, add -DAOM_TARGET_CPU=generic to your "
+        "cmake command line.")
+  endif()
+  enable_language(ASM)
+  string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
+endif()
+
+if(CONFIG_ANALYZER) # wxWidgets (adv, base, core) is REQUIRED in this mode.
+ find_package(wxWidgets REQUIRED adv base core)
+ include(${wxWidgets_USE_FILE})
+endif()
+
+if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang")
+ set(CONFIG_GCC 1) # Set for GNU or Clang C compilers outside MSVC builds.
+endif()
+
+if(CONFIG_GCOV) # Coverage flags are required, not merely probed.
+ message("--- Testing for CONFIG_GCOV support.")
+ require_linker_flag("-fprofile-arcs -ftest-coverage")
+ require_compiler_flag("-fprofile-arcs -ftest-coverage" YES)
+endif()
+
+if(CONFIG_GPROF) # Likewise -pg is required when CONFIG_GPROF is set.
+ message("--- Testing for CONFIG_GPROF support.")
+ require_compiler_flag("-pg" YES)
+endif()
+
+if(AOM_TARGET_SYSTEM MATCHES "Darwin\|Linux\|Windows\|Android")
+ set(CONFIG_OS_SUPPORT 1) # Only set for the four systems named above.
+endif()
+
+if(AOM_TARGET_SYSTEM STREQUAL "Windows")
+ # The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set
+ # it to 0x0601 (Windows 7).
+ add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601")
+ # Quiet warnings related to fopen, printf, etc.
+ add_compiler_flag_if_supported("-D_CRT_SECURE_NO_WARNINGS")
+endif()
+
+# Resolve conflicts between experiments first.
+# Fix CONFIG_* dependencies. This must be done before including cpu.cmake to
+# ensure RTCD_CONFIG_* are properly set.
+fix_experiment_configs()
+
+# Test compiler support.
+aom_get_inline("INLINE")
+
+# Don't just check for pthread.h, but use the result of the full pthreads
+# including a linking check in FindThreads above.
+set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT})
+aom_check_source_compiles("unistd_check" "#include <unistd.h>" HAVE_UNISTD_H)
+
+if(NOT WIN32) # Probe for feenableexcept() with "m" in the required libraries.
+ aom_push_var(CMAKE_REQUIRED_LIBRARIES "m")
+ aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE
+ #include <fenv.h>
+ void unused(void) {
+ (void)unused;
+ (void)feenableexcept(FE_DIVBYZERO | FE_INVALID);
+ }" HAVE_FEXCEPT)
+ aom_pop_var(CMAKE_REQUIRED_LIBRARIES) # Undo the push made for the probe.
+endif()
+
+include("${AOM_ROOT}/build/cmake/cpu.cmake") # After fix_experiment_configs().
+
+if(ENABLE_CCACHE) # Compiler launchers are opt-in through ENABLE_* options.
+ set_compiler_launcher(ENABLE_CCACHE ccache)
+endif()
+
+if(ENABLE_DISTCC)
+ set_compiler_launcher(ENABLE_DISTCC distcc)
+endif()
+
+if(ENABLE_GOMA)
+ set_compiler_launcher(ENABLE_GOMA gomacc)
+endif()
+
+if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER)
+ message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.")
+endif()
+
+if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT) # Either limit implies the config.
+ change_config_and_warn(CONFIG_SIZE_LIMIT 1
+ "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT")
+endif()
+
+if(CONFIG_SIZE_LIMIT) # Once enabled, both limits must be provided.
+ if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT)
+ message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT "
+ "and DECODE_WIDTH_LIMIT must be set.")
+ endif()
+endif()
+
+# Test compiler flags: MSVC switches in one branch, GNU-style in the other.
+if(MSVC)
+ # It isn't possible to specify C99 conformance for MSVC. MSVC doesn't support
+ # C++ standards modes earlier than C++14.
+ add_cxx_flag_if_supported("/std:c++14")
+ add_compiler_flag_if_supported("/W3")
+
+ # Disable MSVC warnings that suggest making code non-portable.
+ add_compiler_flag_if_supported("/wd4996")
+ if(ENABLE_WERROR)
+ add_compiler_flag_if_supported("/WX")
+ endif()
+else()
+ require_c_flag("-std=c99" YES) # Required: generation stops without it.
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"
+ AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU"
+ AND CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
+ # Microsoft's C++ Standard Library requires C++14 as it's MSVC's default and
+ # minimum supported C++ version. If Clang is using this Standard Library
+ # implementation, it cannot target C++11.
+ require_cxx_flag_nomsvc("-std=c++14" YES)
+ else()
+ require_cxx_flag_nomsvc("-std=c++11" YES)
+ endif()
+ add_compiler_flag_if_supported("-Wall")
+ add_compiler_flag_if_supported("-Wdisabled-optimization")
+ add_compiler_flag_if_supported("-Wextra")
+ # Prior to version 3.19.0 cmake would fail to parse the warning emitted by gcc
+ # with this flag. Note the order of this check and -Wextra-semi-stmt is
+ # important due to is_flag_present() matching substrings with string(FIND
+ # ...).
+ if(CMAKE_VERSION VERSION_LESS "3.19"
+ AND CMAKE_C_COMPILER_ID STREQUAL "GNU"
+ AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 10)
+ add_cxx_flag_if_supported("-Wextra-semi") # C++ only in this combination.
+ else()
+ add_compiler_flag_if_supported("-Wextra-semi")
+ endif()
+ add_compiler_flag_if_supported("-Wextra-semi-stmt")
+ add_compiler_flag_if_supported("-Wfloat-conversion")
+ add_compiler_flag_if_supported("-Wformat=2")
+ add_c_flag_if_supported("-Wimplicit-function-declaration")
+ add_compiler_flag_if_supported("-Wlogical-op")
+ add_compiler_flag_if_supported("-Wpointer-arith")
+ add_compiler_flag_if_supported("-Wshadow")
+ add_compiler_flag_if_supported("-Wshorten-64-to-32")
+ add_compiler_flag_if_supported("-Wsign-compare")
+ add_compiler_flag_if_supported("-Wstring-conversion")
+ add_compiler_flag_if_supported("-Wtype-limits")
+ add_compiler_flag_if_supported("-Wuninitialized")
+ add_compiler_flag_if_supported("-Wunreachable-code-aggressive")
+ add_compiler_flag_if_supported("-Wunused")
+ add_compiler_flag_if_supported("-Wvla")
+ add_cxx_flag_if_supported("-Wc++14-extensions")
+ add_cxx_flag_if_supported("-Wc++17-extensions")
+ add_cxx_flag_if_supported("-Wc++20-extensions")
+
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address|undefined")
+
+ # This combination has more stack overhead, so we account for it by
+ # providing higher stack limit than usual.
+ add_c_flag_if_supported("-Wstack-usage=285000")
+ add_cxx_flag_if_supported("-Wstack-usage=270000")
+ elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected.
+ add_c_flag_if_supported("-Wstack-usage=135000")
+ add_cxx_flag_if_supported("-Wstack-usage=240000")
+ else()
+ add_c_flag_if_supported("-Wstack-usage=100000")
+ add_cxx_flag_if_supported("-Wstack-usage=240000")
+ endif()
+
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address")
+ # GNU address-sanitizer builds: counter the -Wdisabled-optimization above.
+ add_compiler_flag_if_supported("-Wno-disabled-optimization")
+ endif()
+
+ # Add -Wundef only for C files to avoid massive gtest warning spam.
+ add_c_flag_if_supported("-Wundef")
+
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ if(AOM_TARGET_CPU MATCHES "arm")
+ add_cxx_flag_if_supported("-Wno-psabi")
+ endif()
+
+ if(ENABLE_WERROR)
+ add_compiler_flag_if_supported("-Werror")
+ endif()
+
+ if(build_type_lowercase MATCHES "rel") # Any build type containing "rel".
+ add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0")
+ endif()
+ add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE")
+ add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64")
+endif()
+
+# Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set,
+# android.toolchain.cmake would set normal (non-cache) versions of variables
+# like CMAKE_C_FLAGS_RELEASE which would mask the ones added to the cache
+# variable in add_compiler_flag_if_supported(), etc. As a workaround we add
+# everything accumulated in AOM_C/CXX_FLAGS to the normal versions. This could
+# also be addressed by reworking the flag tests and adding the results directly
+# to target_compile_options() as in e.g., libgav1, but that's a larger task.
+# https://github.com/android/ndk/wiki/Changelog-r23#changes
+if(ANDROID
+ AND ("${ANDROID_NDK_MAJOR}" LESS 23 OR ANDROID_USE_LEGACY_TOOLCHAIN_FILE))
+ foreach(lang C;CXX)
+ string(STRIP "${AOM_${lang}_FLAGS}" AOM_${lang}_FLAGS)
+ if(AOM_${lang}_FLAGS) # Skip languages that accumulated no flags.
+ foreach(config ${AOM_${lang}_CONFIGS})
+ set(${config} "${${config}} ${AOM_${lang}_FLAGS}") # Normal variable.
+ endforeach()
+ endif()
+ endforeach()
+endif()
+
+set(AOM_LIB_LINK_TYPE PUBLIC) # Default; cleared again for Emscripten below.
+if(EMSCRIPTEN)
+
+ # Avoid CMake generation time errors resulting from collisions with the form
+ # of target_link_libraries() used by Emscripten.cmake.
+ unset(AOM_LIB_LINK_TYPE)
+endif()
+
+# Generate aom_config templates by running a helper script in CMake -P mode.
+set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
+execute_process(
+ COMMAND ${CMAKE_COMMAND}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P
+ "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
+
+# Generate aom_config.{asm,h} from the two templates named above.
+configure_file("${aom_config_asm_template}"
+ "${AOM_CONFIG_DIR}/config/aom_config.asm")
+configure_file("${aom_config_h_template}"
+ "${AOM_CONFIG_DIR}/config/aom_config.h")
+
+# Read the current git hash.
+find_package(Git)
+if(NOT GIT_FOUND) # Git is optional; only a notice is printed without it.
+ message("--- Git missing, version will be read from CHANGELOG.")
+endif()
+
+string(TIMESTAMP year "%Y") # Four-digit year, set before the template runs.
+configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template"
+ "${AOM_CONFIG_DIR}/config/aom_config.c")
+
+# Find Perl and generate the RTCD sources.
+find_package(Perl)
+if(NOT PERL_FOUND) # Unlike Git, Perl is mandatory.
+ message(FATAL_ERROR "Perl is required to build libaom.")
+endif()
+
+set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
+set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h")
+set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c")
+set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd)
+list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT)
+math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1")
+
+foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) # Inclusive: 0..count-1.
+ list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE)
+ list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE)
+ list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) # Unused below.
+ list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
+ execute_process(
+ COMMAND
+ ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
+ --arch=${AOM_TARGET_CPU}
+ --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
+ --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE}
+ OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
+endforeach()
+
+# Generate aom_version.h by running version.cmake in CMake -P mode.
+execute_process(COMMAND ${CMAKE_COMMAND}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake")
diff --git a/third_party/aom/build/cmake/aom_experiment_deps.cmake b/third_party/aom/build/cmake/aom_experiment_deps.cmake
new file mode 100644
index 0000000000..3bbeb0c874
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_experiment_deps.cmake
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1)
+
+# Adjusts CONFIG_* CMake variables to address conflicts between active AV1
+# experiments. The only rule at present is keyed on CONFIG_ANALYZER.
+macro(fix_experiment_configs)
+
+ if(CONFIG_ANALYZER) # Presumably forces CONFIG_INSPECTION to 1 with a warning.
+ change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
+ endif()
+
+endmacro()
diff --git a/third_party/aom/build/cmake/aom_install.cmake b/third_party/aom/build/cmake/aom_install.cmake
new file mode 100644
index 0000000000..2c263e96b9
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_install.cmake
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h")
+
+if(CONFIG_AV1_DECODER) # Headers installed only when the decoder is built.
+ list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aomdx.h")
+endif()
+
+if(CONFIG_AV1_ENCODER) # Headers installed only when the encoder is built.
+ list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_external_partition.h")
+endif()
+
+# Generate aom.pc and setup dependencies to ensure it is created when necessary.
+# Note: aom.pc generation uses GNUInstallDirs:
+# https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html
+macro(setup_aom_install_targets)
+ if(NOT XCODE) # The whole macro is a no-op for the Xcode generator.
+ include("GNUInstallDirs")
+ set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc")
+
+ # Create a library target for creating aom.pc.
+ create_no_op_source_file(aom_pc c AOM_PKG_CONFIG_SOURCES)
+ add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES})
+
+ # Setup a rule to generate aom.pc.
+ add_custom_command(
+ OUTPUT "${AOM_PKG_CONFIG_FILE}"
+ COMMAND ${CMAKE_COMMAND} ARGS
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+ -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+ -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+ -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+ -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+ -DCMAKE_THREAD_LIBS_INIT=${CMAKE_THREAD_LIBS_INIT}
+ -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+ -DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF}
+ -DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI}
+ -DCONFIG_SALIENCY_MAP=${CONFIG_SALIENCY_MAP}
+ -DCONFIG_TFLITE=${CONFIG_TFLITE}
+ -P
+ "${AOM_ROOT}/build/cmake/pkg_config.cmake"
+ COMMENT "Writing aom.pc"
+ VERBATIM)
+
+ # Explicitly add a dependency on the pkg-config file to ensure it's built.
+ get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES)
+ set_source_files_properties(${aom_pc_sources} OBJECT_DEPENDS
+ "${AOM_PKG_CONFIG_FILE}")
+
+ # Our pkg-config file carries version information: add a dependency on the
+ # version rule.
+ add_dependencies(aom_pc aom_version)
+
+ if(CONFIG_AV1_DECODER) # aomdec is installed only if examples are enabled.
+ if(ENABLE_EXAMPLES)
+ list(APPEND AOM_INSTALL_BINS aomdec)
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER) # Same rule for aomenc.
+ if(ENABLE_EXAMPLES)
+ list(APPEND AOM_INSTALL_BINS aomenc)
+ endif()
+ endif()
+
+ if(BUILD_SHARED_LIBS) # Shared builds install aom and aom_static.
+ set(AOM_INSTALL_LIBS aom aom_static)
+ else()
+ set(AOM_INSTALL_LIBS aom)
+ endif()
+
+ # Setup the install rules. install() will automatically prepend
+ # CMAKE_INSTALL_PREFIX to relative paths
+ install(FILES ${AOM_INSTALL_INCS}
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/aom")
+ install(FILES "${AOM_PKG_CONFIG_FILE}"
+ DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+ install(TARGETS ${AOM_INSTALL_LIBS};${AOM_INSTALL_BINS}
+ RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+ LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+ ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+ endif()
+endmacro()
diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake
new file mode 100644
index 0000000000..0f93228eef
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_optimization.cmake
@@ -0,0 +1,279 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1)
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# Maps the GNU-style intrinsics flag $flag to its MSVC spelling and stores it in
+# the caller's variable named by $translated_flag; unsets it for other flags.
+function(get_msvc_intrinsic_flag flag translated_flag)
+ if(NOT "${flag}" STREQUAL "-mavx" AND NOT "${flag}" STREQUAL "-mavx2")
+
+ # MSVC does not need flags for intrinsics flavors other than AVX/AVX2.
+ unset(${translated_flag} PARENT_SCOPE)
+ elseif("${flag}" STREQUAL "-mavx2")
+ set(${translated_flag} "/arch:AVX2" PARENT_SCOPE)
+ else()
+ set(${translated_flag} "/arch:AVX" PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Adds an OBJECT library target for the sources in the list named by $sources
+# and compiles it with $flag, the intrinsics flag required by the current
+# compiler. Returns without doing anything when the source list is empty.
+# $target_to_update and $opt_name are only used to name the new target, which
+# is also appended to AOM_LIB_TARGETS in the caller's scope.
+#
+# Note: this function always updates the aom, and aom_static targets because
+# OBJECT libraries have rules that disallow the direct addition of .o files to
+# them as dependencies. Static and shared libraries do not have this limitation.
+function(add_intrinsics_object_library flag opt_name target_to_update sources)
+ if("${${sources}}" STREQUAL "")
+ return()
+ endif()
+ set(target_name ${target_to_update}_${opt_name}_intrinsics)
+ add_library(${target_name} OBJECT ${${sources}})
+ set_property(TARGET ${target_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
+
+ # MSVC does not need flags for intrinsics flavors other than AVX/AVX2.
+ # However, for clang-cl, the default is SSE2, and the MSVC frontend does not
+ # provide any flags to enable SSE3 up to SSE4.1. So we need to restrict the
+ # usage of MSVC-style flags to only the real MSVC.
+ if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
+ get_msvc_intrinsic_flag("${flag}" "flag")
+ endif()
+
+ if("${flag}" STREQUAL "-mavx2")
+ # check_c_compiler_flag() caches its result variable and skips the test when
+ # that variable already exists, so each flag gets its own result variable.
+ check_c_compiler_flag("-mno-avx256-split-unaligned-load" AOM_NO_SPLIT_LOAD_OK)
+ if(AOM_NO_SPLIT_LOAD_OK)
+ set(flag "${flag} -mno-avx256-split-unaligned-load")
+ endif()
+
+ check_c_compiler_flag("-mno-avx256-split-unaligned-store" AOM_NO_SPLIT_STORE_OK)
+ if(AOM_NO_SPLIT_STORE_OK)
+ set(flag "${flag} -mno-avx256-split-unaligned-store")
+ endif()
+ endif()
+
+ if(flag) # Empty when MSVC needs no flag for this intrinsics flavor.
+ separate_arguments(flag)
+ target_compile_options(${target_name} PUBLIC ${flag})
+ endif()
+
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:${target_name}>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:${target_name}>)
+ endif()
+
+ # Add the new lib target to the global list of aom library targets.
+ list(APPEND AOM_LIB_TARGETS ${target_name})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
+
+# Adds sources in list named by $sources to $target and appends $flag (after
+# MSVC translation) to each source's space separated COMPILE_FLAGS string.
+function(add_intrinsics_source_to_target flag target sources)
+ target_sources(${target} PRIVATE ${${sources}})
+ if(MSVC)
+ get_msvc_intrinsic_flag("${flag}" "flag")
+ endif()
+ if(flag) # Unset when MSVC needs no flag for this intrinsics flavor.
+ foreach(source ${${sources}})
+ set_property(SOURCE ${source} APPEND_STRING PROPERTY COMPILE_FLAGS " ${flag}")
+ endforeach()
+ endif()
+endfunction()
+
+# Stores the assembler object format for the current target in the variable
+# named by $out_format. Generation stops with a fatal error when
+# AOM_TARGET_CPU is neither x86_64 nor x86.
+function(get_asm_obj_format out_format)
+ # The target CPU fixes the word-size suffix of the format name.
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ set(obj_bits "64")
+ elseif("${AOM_TARGET_CPU}" STREQUAL "x86")
+ set(obj_bits "32")
+ else()
+ message(
+ FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+ endif()
+
+ # The target system fixes the format family: macho on Darwin, win on MSYS,
+ # CYGWIN and Windows, and elf everywhere else.
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(obj_family "macho")
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS"
+ OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN"
+ OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+ set(obj_family "win")
+ else()
+ set(obj_family "elf")
+ endif()
+
+ # Join the two halves (for example elf and 64 give elf64) and hand the
+ # result back to the caller.
+ set(objformat "${obj_family}${obj_bits}")
+ set(${out_format} ${objformat} PARENT_SCOPE)
+endfunction()
+
+# Adds library target named $lib_name for ASM files in variable named by
+# $asm_sources. Builds an output directory path from $lib_name. Links $lib_name
+# into the aom library target(s). Generates a C file with an unused no-op
+# function to ensure that all cmake generators can determine the linker
+# language, and that build tools don't complain that an object exposes no
+# symbols.
+#
+# In Xcode-based builds every step described above happens twice, and
+# directory/target/object names are updated to include _shared and _static
+# suffixes.
+function(add_asm_library lib_name asm_sources)
+ if("${${asm_sources}}" STREQUAL "") # Nothing to do for an empty source list.
+ return()
+ endif()
+
+ if(XCODE)
+ # CMake's generator does not output a build rule for Nasm files. Moreover,
+ # it makes Xcode believe Nasm files are of type "sourcecode" instead of
+ # "sourcecode.nasm", which prevents even the default rule from applying.
+ # This default rule is broken, though, because it doesn't apply any of the
+ # flags specified for ASM_NASM. See https://discourse.cmake.org/t/building-
+ # nasm-files-with-xcode/7934
+ list(APPEND asm_configs "static")
+ if(BUILD_SHARED_LIBS)
+ list(APPEND asm_configs "shared")
+ endif()
+
+ set(as_executable "${CMAKE_ASM_NASM_COMPILER}")
+ if(NOT as_executable) # Fall back to the generic assembler setting.
+ set(as_executable "${CMAKE_ASM_COMPILER}")
+ endif()
+
+ foreach(asm_config ${asm_configs})
+ set(asm_lib_name ${lib_name}_${asm_config})
+ set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${asm_lib_name}")
+ if(NOT EXISTS "${asm_lib_obj_dir}")
+ file(MAKE_DIRECTORY "${asm_lib_obj_dir}")
+ endif()
+
+ foreach(asm_source ${${asm_sources}}) # One custom rule per source file.
+ get_filename_component(asm_source_name "${asm_source}" NAME)
+ set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o")
+ add_custom_command(OUTPUT "${asm_object}"
+ COMMAND ${as_executable} ARGS ${AOM_AS_FLAGS}
+ -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o
+ "${asm_object}" "${asm_source}"
+ DEPENDS "${asm_source}"
+ COMMENT "Building ASM object ${asm_object}"
+ WORKING_DIRECTORY "${AOM_CONFIG_DIR}"
+ VERBATIM)
+ if(BUILD_SHARED_LIBS AND "${asm_config}" STREQUAL "static")
+ target_sources(aom_static PRIVATE "${asm_object}")
+ else() # "shared" objects, or "static" ones in a non-shared build.
+ target_sources(aom PRIVATE "${asm_object}")
+ endif()
+ endforeach()
+ endforeach()
+ else()
+ # For non-Xcode generators, CMake does not need extra help. The language
+ # support takes care of it.
+ set(asm_lib_name ${lib_name})
+
+ add_library(${asm_lib_name} OBJECT ${${asm_sources}})
+ target_include_directories(${asm_lib_name}
+ PRIVATE ${AOM_ROOT} ${AOM_CONFIG_DIR})
+ target_compile_options(${asm_lib_name} PRIVATE ${AOM_AS_FLAGS})
+ set_property(TARGET ${asm_lib_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE "$<TARGET_OBJECTS:${asm_lib_name}>")
+ endif()
+ target_sources(aom PRIVATE "$<TARGET_OBJECTS:${asm_lib_name}>")
+
+ # Add the new lib target to the global list of aom library targets.
+ list(APPEND AOM_LIB_TARGETS ${asm_lib_name})
+ endif()
+
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) # Publish to the caller.
+endfunction()
+
+# Terminates generation if the nasm in CMAKE_ASM_NASM_COMPILER lacks -Ox
+# (multipass optimization) or the object format get_asm_obj_format() selects
+# for the target system; any AOM_TARGET_CPU other than x86 is checked as 64-bit.
+function(test_nasm)
+ execute_process(COMMAND ${CMAKE_ASM_NASM_COMPILER} -hf
+ OUTPUT_VARIABLE nasm_helptext)
+
+ if(NOT "${nasm_helptext}" MATCHES "-Ox")
+ message(
+ FATAL_ERROR "Unsupported nasm: multipass optimization not supported.")
+ endif()
+
+ if("${AOM_TARGET_CPU}" STREQUAL "x86")
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ if(NOT "${nasm_helptext}" MATCHES "macho32")
+ message(
+ FATAL_ERROR "Unsupported nasm: macho32 object format not supported.")
+ endif()
+ elseif("${AOM_TARGET_SYSTEM}" MATCHES "^(MSYS|CYGWIN|Windows)$")
+ # Same systems for which get_asm_obj_format() selects win32.
+ if(NOT "${nasm_helptext}" MATCHES "win32")
+ message(
+ FATAL_ERROR "Unsupported nasm: win32 object format not supported.")
+ endif()
+ else()
+ if(NOT "${nasm_helptext}" MATCHES "elf32")
+ message(
+ FATAL_ERROR "Unsupported nasm: elf32 object format not supported.")
+ endif()
+ endif()
+ else()
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ if(NOT "${nasm_helptext}" MATCHES "macho64")
+ message(
+ FATAL_ERROR "Unsupported nasm: macho64 object format not supported.")
+ endif()
+ elseif("${AOM_TARGET_SYSTEM}" MATCHES "^(MSYS|CYGWIN|Windows)$")
+ # Same systems for which get_asm_obj_format() selects win64.
+ if(NOT "${nasm_helptext}" MATCHES "win64")
+ message(
+ FATAL_ERROR "Unsupported nasm: win64 object format not supported.")
+ endif()
+ else()
+ if(NOT "${nasm_helptext}" MATCHES "elf64")
+ message(
+ FATAL_ERROR "Unsupported nasm: elf64 object format not supported.")
+ endif()
+ endif()
+ endif()
+endfunction()
+
+# Adds build command for generation of rtcd C source files using
+# build/cmake/rtcd.pl. $config is the input perl file, $output is the output C
+# include file, $source is the C source file, and $symbol is used for the symbol
+# argument passed to rtcd.pl.
+function(add_rtcd_build_step config output source symbol)
+ add_custom_command(
+ OUTPUT ${output}
+ COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl"
+ --arch=${AOM_TARGET_CPU}
+ --sym=${symbol} ${AOM_RTCD_FLAGS}
+ --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output}
+ DEPENDS "${AOM_ROOT}/build/cmake/rtcd.pl" ${config}
+ COMMENT "Generating ${output}"
+ WORKING_DIRECTORY ${AOM_CONFIG_DIR}
+ VERBATIM)
+ set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) # Build order.
+ set_property(SOURCE ${output} PROPERTY GENERATED TRUE) # Absent at configure.
+endfunction()
diff --git a/third_party/aom/build/cmake/compiler_flags.cmake b/third_party/aom/build/cmake/compiler_flags.cmake
new file mode 100644
index 0000000000..f008b964f5
--- /dev/null
+++ b/third_party/aom/build/cmake/compiler_flags.cmake
@@ -0,0 +1,385 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_
+set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
+
+# Strings used to cache flags. set() with no value unsets the normal variable.
+set(AOM_C_FLAGS)
+set(AOM_CXX_FLAGS)
+set(AOM_EXE_LINKER_FLAGS)
+set(AOM_FAILED_C_FLAGS)
+set(AOM_FAILED_CXX_FLAGS)
+
+# Reports, through the caller's variable named by $out_is_present, whether
+# $flag occurs as a substring of the string variable named by $flag_cache: YES
+# when it does, NO when it does not.
+function(is_flag_present flag_cache flag out_is_present)
+ string(FIND "${${flag_cache}}" "${flag}" match_index)
+ if(match_index GREATER -1)
+ set(${out_is_present} YES PARENT_SCOPE)
+ else()
+ set(${out_is_present} NO PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Appends $flag to the cache string $flags (forced) unless already a substring.
+function(append_flag flags flag)
+ string(FIND "${${flags}}" "${flag}" existing_index)
+ if(existing_index LESS 0)
+ set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE)
+ endif()
+endfunction()
+
+# Checks C compiler for support of $c_flag. Adds $c_flag to all
+# $CMAKE_C_FLAGS_<CONFIG>s stored in AOM_C_CONFIGS when the compile test passes.
+# Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test
+# outcome.
+function(add_c_flag_if_supported c_flag)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) # Debug switch: skip every flag test.
+ return()
+ endif()
+
+ is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
+ is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed)
+ if(${flag_ok} OR ${flag_failed}) # Already decided earlier: do not re-test.
+ return()
+ endif()
+
+ # Between 3.17.0 and 3.18.2 check_c_compiler_flag() sets a normal variable at
+ # parent scope while check_cxx_source_compiles() continues to set an internal
+ # cache variable, so we unset both to avoid the failure / success state
+ # persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(C_FLAG_SUPPORTED)
+ unset(C_FLAG_SUPPORTED CACHE)
+ message("Checking C compiler flag support for: " ${c_flag})
+ check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED)
+
+ if(${C_FLAG_SUPPORTED})
+ append_flag(AOM_C_FLAGS "${c_flag}")
+ foreach(config ${AOM_C_CONFIGS})
+ unset(C_FLAG_FOUND) # NOTE(review): C_FLAG_FOUND is never read here.
+ append_flag("${config}" "${c_flag}")
+ endforeach()
+ else()
+ append_flag(AOM_FAILED_C_FLAGS "${c_flag}") # Remember the failure too.
+ endif()
+endfunction()
+
+# Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all
+# $CMAKE_CXX_FLAGS_<CONFIG>s stored in AOM_CXX_CONFIGS when the compile test
+# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending
+# on test outcome.
+function(add_cxx_flag_if_supported cxx_flag)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) # Debug switch: skip every flag test.
+ return()
+ endif()
+
+ is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
+ is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed)
+ if(${flag_ok} OR ${flag_failed}) # Already decided earlier: do not re-test.
+ return()
+ endif()
+
+ # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal variable
+ # at parent scope while check_cxx_source_compiles() continues to set an
+ # internal cache variable, so we unset both to avoid the failure / success
+ # state persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(CXX_FLAG_SUPPORTED)
+ unset(CXX_FLAG_SUPPORTED CACHE)
+ message("Checking C++ compiler flag support for: " ${cxx_flag})
+ check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED)
+
+ if(${CXX_FLAG_SUPPORTED})
+ append_flag(AOM_CXX_FLAGS "${cxx_flag}")
+ foreach(config ${AOM_CXX_CONFIGS})
+ unset(CXX_FLAG_FOUND) # NOTE(review): CXX_FLAG_FOUND is never read here.
+ append_flag("${config}" "${cxx_flag}")
+ endforeach()
+ else()
+ append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}") # Remember the failure too.
+ endif()
+endfunction()
+
+# Convenience wrapper: runs the optional-flag test for $flag on the C compiler
+# first and then on the C++ compiler; each keeps the flag only if it passes.
+function(add_compiler_flag_if_supported flag)
+ add_c_flag_if_supported(${flag})
+ add_cxx_flag_if_supported(${flag})
+endfunction()
+
+# Terminates generation unless the C compiler supports $c_flag. A truthy
+# $update_c_flags also appends the flag to every variable in AOM_C_CONFIGS.
+function(require_c_flag c_flag update_c_flags)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) # Debug switch: skip every flag test.
+ return()
+ endif()
+
+ is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
+ if(${flag_ok}) # Already accepted earlier.
+ return()
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") # Test with our linker flags.
+ aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
+ endif()
+
+ unset(HAVE_C_FLAG CACHE) # Drop any cached result so the test runs again.
+ message("Checking C compiler flag support for: " ${c_flag})
+ check_c_compiler_flag("${c_flag}" HAVE_C_FLAG)
+ if(NOT HAVE_C_FLAG)
+ message(
+ FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") # Matches the push above.
+ aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+
+ append_flag(AOM_C_FLAGS "${c_flag}")
+ if(update_c_flags)
+ foreach(config ${AOM_C_CONFIGS})
+ set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE)
+ endforeach()
+ endif()
+endfunction()
+
+# Terminates generation unless the CXX compiler supports $cxx_flag. A truthy
+# $update_cxx_flags also appends the flag to every variable in AOM_CXX_CONFIGS.
+function(require_cxx_flag cxx_flag update_cxx_flags)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) # Debug switch: skip every flag test.
+ return()
+ endif()
+
+ is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
+ if(${flag_ok}) # Already accepted earlier.
+ return()
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") # Test with our linker flags.
+ aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
+ endif()
+
+ unset(HAVE_CXX_FLAG CACHE) # Drop any cached result so the test runs again.
+ message("Checking C++ compiler flag support for: " ${cxx_flag})
+ check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG)
+ if(NOT HAVE_CXX_FLAG)
+ message(
+ FATAL_ERROR "${PROJECT_NAME} requires support for C++ flag: ${cxx_flag}.")
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") # Matches the push above.
+ aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+
+ append_flag(AOM_CXX_FLAGS "${cxx_flag}")
+ if(update_cxx_flags)
+ foreach(config ${AOM_CXX_CONFIGS})
+ set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE)
+ endforeach()
+ endif()
+endfunction()
+
+# Checks for support of $flag by both the C and CXX compilers. Terminates
+# generation when support is not present in both compilers.
+function(require_compiler_flag flag update_cmake_flags)
+ # The C compiler is checked first, then the CXX compiler; $update_cmake_flags
+ # is forwarded unchanged to both checks.
+ require_c_flag(${flag} ${update_cmake_flags})
+ require_cxx_flag(${flag} ${update_cmake_flags})
+endfunction()
+
+# Checks only non-MSVC targets for support of $c_flag and terminates generation
+# when support is not present.
+function(require_c_flag_nomsvc c_flag update_c_flags)
+ # When MSVC is set this function does nothing at all.
+ if(NOT MSVC)
+ require_c_flag(${c_flag} ${update_c_flags})
+ endif()
+endfunction()
+
+# Checks only non-MSVC targets for support of $cxx_flag and terminates
+# generation when support is not present.
+function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags)
+ # When MSVC is set this function does nothing at all.
+ if(NOT MSVC)
+ require_cxx_flag(${cxx_flag} ${update_cxx_flags})
+ endif()
+endfunction()
+
+# Checks only non-MSVC targets for support of $flag by both the C and CXX
+# compilers. Terminates generation when support is not present in both
+# compilers.
+function(require_compiler_flag_nomsvc flag update_cmake_flags)
+ # Each wrapper performs its own MSVC test, so no check is needed here.
+ require_c_flag_nomsvc(${flag} ${update_cmake_flags})
+ require_cxx_flag_nomsvc(${flag} ${update_cmake_flags})
+endfunction()
+
+# Adds $preproc_def to C compiler command line (as -D$preproc_def) if not
+# already present.
+function(add_c_preproc_definition preproc_def)
+  # From here on $preproc_def holds the command line form (-D<name>).
+  set(preproc_def "-D${preproc_def}")
+
+  # AOM_C_FLAGS is the record consulted to decide whether the definition has
+  # been seen before.
+  is_flag_present(AOM_C_FLAGS "${preproc_def}" already_defined)
+  if(${already_defined})
+    return()
+  endif()
+
+  # Append to every flag variable named in AOM_C_CONFIGS, forcing the new
+  # value into the cache.
+  foreach(flags_var ${AOM_C_CONFIGS})
+    set(${flags_var} "${${flags_var}} ${preproc_def}" CACHE STRING "" FORCE)
+  endforeach()
+endfunction()
+
+# Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not
+# already present.
+function(add_cxx_preproc_definition preproc_def)
+  # From here on $preproc_def holds the command line form (-D<name>).
+  set(preproc_def "-D${preproc_def}")
+
+  # AOM_CXX_FLAGS is the record consulted to decide whether the definition has
+  # been seen before.
+  is_flag_present(AOM_CXX_FLAGS "${preproc_def}" already_defined)
+  if(${already_defined})
+    return()
+  endif()
+
+  # Append to every flag variable named in AOM_CXX_CONFIGS, forcing the new
+  # value into the cache.
+  foreach(flags_var ${AOM_CXX_CONFIGS})
+    set(${flags_var} "${${flags_var}} ${preproc_def}" CACHE STRING "" FORCE)
+  endforeach()
+endfunction()
+
+# Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if
+# not already present.
+function(add_preproc_definition preproc_def)
+ # $preproc_def is the bare name; each callee adds the -D prefix itself.
+ add_c_preproc_definition(${preproc_def})
+ add_cxx_preproc_definition(${preproc_def})
+endfunction()
+
+# Adds $flag to assembler command line.
+function(append_as_flag flag)
+ # AOM_AS_FLAGS is both the duplicate check and the destination.
+ is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+ append_flag(AOM_AS_FLAGS "${flag}")
+endfunction()
+
+# Adds $flag to the C compiler command line.
+function(append_c_flag flag)
+  # Flags already recorded in AOM_C_FLAGS are not added again.
+  is_flag_present(AOM_C_FLAGS "${flag}" already_present)
+  if(${already_present})
+    return()
+  endif()
+
+  # The flag goes into each flag variable named in AOM_C_CONFIGS.
+  foreach(flags_var ${AOM_C_CONFIGS})
+    append_flag(${flags_var} "${flag}")
+  endforeach()
+endfunction()
+
+# Adds $flag to the CXX compiler command line.
+function(append_cxx_flag flag)
+  # Flags already recorded in AOM_CXX_FLAGS are not added again.
+  is_flag_present(AOM_CXX_FLAGS "${flag}" already_present)
+  if(${already_present})
+    return()
+  endif()
+
+  # The flag goes into each flag variable named in AOM_CXX_CONFIGS.
+  foreach(flags_var ${AOM_CXX_CONFIGS})
+    append_flag(${flags_var} "${flag}")
+  endforeach()
+endfunction()
+
+# Adds $flag to the C and CXX compiler command lines.
+function(append_compiler_flag flag)
+ # No compiler support check is made here; the flag is simply appended.
+ append_c_flag(${flag})
+ append_cxx_flag(${flag})
+endfunction()
+
+# Adds $flag to the executable linker command line when not present.
+function(append_exe_linker_flag flag)
+  # AOM_EXE_LINKER_FLAGS is the record of linker flags added so far.
+  is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" already_present)
+  if(${already_present})
+    return()
+  endif()
+
+  # Update the record first, then every variable named in
+  # AOM_EXE_LINKER_CONFIGS.
+  append_flag(AOM_EXE_LINKER_FLAGS "${flag}")
+  foreach(flags_var ${AOM_EXE_LINKER_CONFIGS})
+    append_flag(${flags_var} "${flag}")
+  endforeach()
+endfunction()
+
+# Adds $flag to the link flags for $target.
+function(append_link_flag_to_target target flag)
+  unset(target_link_flags)
+  get_target_property(target_link_flags ${target} LINK_FLAGS)
+
+  if(NOT target_link_flags)
+    # Property not set (or empty): $flag becomes the whole value.
+    set(target_link_flags "${flag}")
+  else()
+    # Leave the target untouched when the flag is already part of the
+    # property value.
+    is_flag_present(target_link_flags "${flag}" already_present)
+    if(${already_present})
+      return()
+    endif()
+    set(target_link_flags "${target_link_flags} ${flag}")
+  endif()
+
+  set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags})
+endfunction()
+
+# Adds $flag to executable linker flags, and makes sure C/CXX builds still work.
+function(require_linker_flag flag)
+  if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+    return()
+  endif()
+
+  # The flag is added before testing so the compile tests below run with it.
+  append_exe_linker_flag(${flag})
+
+  # Both tests use an empty source body; only the shared main() is built.
+  unset(c_passed)
+  unset(cxx_passed)
+  aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed)
+  aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed)
+
+  if(c_passed AND cxx_passed)
+    return()
+  endif()
+  message(FATAL_ERROR "Linker flag test for ${flag} failed.")
+endfunction()
+
+# Appends flags in $AOM_EXTRA_<TYPE>_FLAGS variables to the flags used at build
+# time.
+function(set_user_flags)
+
+ # Linker flags are handled first because some C/CXX flags require that a
+ # linker flag is present at link time.
+ # Each AOM_EXTRA_<TYPE>_FLAGS value is treated as a single flag string: it is
+ # looked up, tested and appended as one unit.
+ if(AOM_EXTRA_EXE_LINKER_FLAGS)
+ is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}"
+ extra_present)
+ if(NOT ${extra_present})
+ require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}")
+ endif()
+ endif()
+ if(AOM_EXTRA_AS_FLAGS)
+
+ # TODO(tomfinegan): assembler flag testing would be a good thing to have.
+ is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}")
+ endif()
+ endif()
+ # C and CXX extras must be accepted by the compiler (generation stops
+ # otherwise); YES asks for the per-configuration flag variables to be
+ # updated as well.
+ if(AOM_EXTRA_C_FLAGS)
+ is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ require_c_flag("${AOM_EXTRA_C_FLAGS}" YES)
+ endif()
+ endif()
+ if(AOM_EXTRA_CXX_FLAGS)
+ is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES)
+ endif()
+ endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/compiler_tests.cmake b/third_party/aom/build/cmake/compiler_tests.cmake
new file mode 100644
index 0000000000..0402832253
--- /dev/null
+++ b/third_party/aom/build/cmake/compiler_tests.cmake
@@ -0,0 +1,179 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_
+set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1)
+
+include(CheckCSourceCompiles)
+include(CheckCXXSourceCompiles)
+
+# CMake passes command line flags like this:
+#
+# * $compiler $lang_flags $lang_flags_config ...
+#
+# To ensure the flags tested here and elsewhere are obeyed a list of active
+# build configuration types is built, and flags are applied to the flag strings
+# for each configuration currently active for C and CXX builds as determined by
+# reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When
+# $CMAKE_CONFIGURATION_TYPES is non-empty a multi-configuration generator is in
+# use: currently this includes MSVC and Xcode. For other generators
+# $CMAKE_BUILD_TYPE is used. For both cases AOM_<LANG>_CONFIGS is populated with
+# CMake string variable names that contain flags for the currently available
+# configuration(s).
+unset(AOM_C_CONFIGS)
+unset(AOM_CXX_CONFIGS)
+# The linker list is reset too: the multi-configuration branch below appends
+# to it, so a pre-existing value would otherwise be kept and extended.
+unset(AOM_EXE_LINKER_CONFIGS)
+list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs)
+if(${num_configs} GREATER 0)
+  # Multi-configuration generator: one flag variable per configuration.
+  foreach(config ${CMAKE_CONFIGURATION_TYPES})
+    string(TOUPPER ${config} config)
+    list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
+    list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
+    list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
+  endforeach()
+else()
+  # Single-configuration generator: only $CMAKE_BUILD_TYPE is active.
+  string(TOUPPER ${CMAKE_BUILD_TYPE} config)
+  set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
+  set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
+  set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
+endif()
+
+# The basic main() function used in all compile tests.
+set(AOM_C_MAIN "\nint main(void) { return 0; }")
+set(AOM_CXX_MAIN "\nint main() { return 0; }")
+
+# Strings containing the names of passed and failed tests.
+set(AOM_C_PASSED_TESTS)
+set(AOM_C_FAILED_TESTS)
+set(AOM_CXX_PASSED_TESTS)
+set(AOM_CXX_FAILED_TESTS)
+
+# Saves the current value of the variable named $var in SAVED_<var> and appends
+# $new_value (separated by a space) to it. Both writes go to the caller's scope.
+function(aom_push_var var new_value)
+ set(SAVED_${var} ${${var}} PARENT_SCOPE)
+ set(${var} "${${var}} ${new_value}" PARENT_SCOPE)
+endfunction()
+
+# Restores the variable named $var in the caller's scope from the SAVED_<var>
+# copy made by aom_push_var(), then removes that copy.
+function(aom_pop_var var)
+  # The target must be ${var}: plain "var" would assign to a variable that is
+  # literally called "var" and leave the pushed value in place.
+  set(${var} "${SAVED_${var}}" PARENT_SCOPE)
+  unset(SAVED_${var} PARENT_SCOPE)
+endfunction()
+
+# Confirms $test_source compiles and stores $test_name in one of
+# $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on outcome. When the
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
+function(aom_check_c_compiles test_name test_source result_var)
+ # With compiler tests disabled $result_var is left untouched.
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ # Look the test name up in the cached pass/fail strings. string(FIND) is a
+ # substring search, so a name contained in a previously recorded name also
+ # counts as already run.
+ unset(C_TEST_PASSED CACHE)
+ unset(C_TEST_FAILED CACHE)
+ string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED)
+ string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED)
+ if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1)
+ # First time this test is seen: compile $test_source followed by the shared
+ # main() and record the outcome in the cache.
+ unset(C_TEST_COMPILED CACHE)
+ message("Running C compiler test: ${test_name}")
+ check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED)
+ set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE)
+
+ if(C_TEST_COMPILED)
+ set(AOM_C_PASSED_TESTS
+ "${AOM_C_PASSED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ else()
+ set(AOM_C_FAILED_TESTS
+ "${AOM_C_FAILED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ message("C Compiler test ${test_name} failed.")
+ endif()
+ elseif(NOT ${C_TEST_PASSED} EQUAL -1)
+ # Recorded as passed earlier: report success without compiling again.
+ set(${result_var} 1 PARENT_SCOPE)
+ else() # ${C_TEST_FAILED} NOT EQUAL -1
+ unset(${result_var} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Confirms $test_source compiles and stores $test_name in one of
+# $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on outcome. When the
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
+function(aom_check_cxx_compiles test_name test_source result_var)
+ # With compiler tests disabled $result_var is left untouched.
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ # Look the test name up in the cached pass/fail strings. string(FIND) is a
+ # substring search, so a name contained in a previously recorded name also
+ # counts as already run.
+ unset(CXX_TEST_PASSED CACHE)
+ unset(CXX_TEST_FAILED CACHE)
+ string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED)
+ string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED)
+ if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1)
+ # First time this test is seen: compile $test_source followed by the shared
+ # main() and record the outcome in the cache.
+ unset(CXX_TEST_COMPILED CACHE)
+ message("Running CXX compiler test: ${test_name}")
+ check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}"
+ CXX_TEST_COMPILED)
+ set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE)
+
+ if(CXX_TEST_COMPILED)
+ set(AOM_CXX_PASSED_TESTS
+ "${AOM_CXX_PASSED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ else()
+ set(AOM_CXX_FAILED_TESTS
+ "${AOM_CXX_FAILED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ message("CXX Compiler test ${test_name} failed.")
+ endif()
+ elseif(NOT ${CXX_TEST_PASSED} EQUAL -1)
+ # Recorded as passed earlier: report success without compiling again.
+ set(${result_var} 1 PARENT_SCOPE)
+ else() # ${CXX_TEST_FAILED} NOT EQUAL -1
+ unset(${result_var} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Convenience function that confirms $test_source compiles as C and C++.
+# $result_var is set to 1 when both tests are successful, and 0 when one or both
+# tests fail. Note: This function is intended to be used to write to result
+# variables that are expanded via configure_file(). $result_var is set to 1 or 0
+# to allow direct usage of the value in generated source files.
+function(aom_check_source_compiles test_name test_source result_var)
+  unset(C_PASSED)
+  unset(CXX_PASSED)
+
+  # The arguments are forwarded quoted: C/C++ source normally contains ';',
+  # which CMake would otherwise treat as a list separator and split into extra
+  # arguments, and an empty source would vanish from the call altogether.
+  aom_check_c_compiles("${test_name}" "${test_source}" C_PASSED)
+  aom_check_cxx_compiles("${test_name}" "${test_source}" CXX_PASSED)
+
+  # Collapse the two results into a literal 1 or 0 for the caller.
+  if(C_PASSED AND CXX_PASSED)
+    set(${result_var} 1 PARENT_SCOPE)
+  else()
+    set(${result_var} 0 PARENT_SCOPE)
+  endif()
+endfunction()
+
+# When inline support is detected for the current compiler the supported
+# inlining keyword is written to $result in caller scope.
+function(aom_get_inline result)
+  # Candidate keywords in order of preference. Candidate N is tested as
+  # "inline_check_N" and its outcome lands in HAVE_INLINE_N.
+  set(inline_keywords "inline" "__inline")
+  set(check_index 0)
+
+  foreach(inline_keyword ${inline_keywords})
+    math(EXPR check_index "${check_index} + 1")
+    aom_check_source_compiles("inline_check_${check_index}"
+                              "static ${inline_keyword} void function(void) {}"
+                              HAVE_INLINE_${check_index})
+    # The first keyword accepted by the compilers wins; $result is left
+    # untouched when none of them is.
+    if(HAVE_INLINE_${check_index} EQUAL 1)
+      set(${result} "${inline_keyword}" PARENT_SCOPE)
+      return()
+    endif()
+  endforeach()
+endfunction()
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
new file mode 100644
index 0000000000..a9b7a67070
--- /dev/null
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -0,0 +1,108 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if("${AOM_TARGET_CPU}" STREQUAL "arm64")
+  set(AOM_ARCH_ARM 1)
+  set(AOM_ARCH_AARCH64 1)
+  set(RTCD_ARCH_ARM "yes")
+
+  set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE")
+  set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc")
+  set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod")
+  set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm")
+  set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve")
+
+  # Check that the compiler flag to enable each flavor is supported by the
+  # compiler. This may not be the case for new architecture features on old
+  # compiler versions.
+  foreach(flavor ${ARM64_FLAVORS})
+    if(ENABLE_${flavor} AND NOT DEFINED AOM_${flavor}_FLAG)
+      set(AOM_${flavor}_FLAG "${AOM_${flavor}_DEFAULT_FLAG}")
+      # check_c_compiler_flag() keeps its result in the cache and does not
+      # re-run while the result variable is defined, so the cache entry has to
+      # be cleared for every flavor; otherwise the first result is reused.
+      unset(FLAG_SUPPORTED)
+      unset(FLAG_SUPPORTED CACHE)
+      check_c_compiler_flag("${AOM_${flavor}_FLAG}" FLAG_SUPPORTED)
+      # Test the variable by name: a failed check can leave it empty, and an
+      # expanded empty value would reduce the condition to a bare "NOT".
+      if(NOT FLAG_SUPPORTED)
+        set(ENABLE_${flavor} 0)
+      endif()
+    endif()
+  endforeach()
+
+  # SVE requires that the Neon-SVE bridge header is also available.
+  if(ENABLE_SVE)
+    set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}")
+    aom_check_source_compiles("arm_neon_sve_bridge_available" "
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS})
+    if(HAVE_SVE_HEADERS EQUAL 0)
+      set(ENABLE_SVE 0)
+    endif()
+  endif()
+
+  # Publish the outcome: HAVE_<flavor> for the config header, and either
+  # RTCD_HAVE_<flavor> or a --disable-<flavor> option for rtcd.pl.
+  foreach(flavor ${ARM64_FLAVORS})
+    if(ENABLE_${flavor})
+      set(HAVE_${flavor} 1)
+      set(RTCD_HAVE_${flavor} "yes")
+    else()
+      set(HAVE_${flavor} 0)
+      string(TOLOWER ${flavor} flavor)
+      set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+    endif()
+  endforeach()
+
+elseif("${AOM_TARGET_CPU}" MATCHES "^arm")
+  set(AOM_ARCH_ARM 1)
+  set(RTCD_ARCH_ARM "yes")
+
+  if(ENABLE_NEON)
+    set(HAVE_NEON 1)
+    set(RTCD_HAVE_NEON "yes")
+  else()
+    set(HAVE_NEON 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
+  endif()
+
+elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+  set(AOM_ARCH_PPC 1)
+  set(RTCD_ARCH_PPC "yes")
+
+  if(ENABLE_VSX)
+    set(HAVE_VSX 1)
+    set(RTCD_HAVE_VSX "yes")
+  else()
+    set(HAVE_VSX 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx)
+  endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
+  if("${AOM_TARGET_CPU}" STREQUAL "x86")
+    set(AOM_ARCH_X86 1)
+    set(RTCD_ARCH_X86 "yes")
+  elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    set(AOM_ARCH_X86_64 1)
+    set(RTCD_ARCH_X86_64 "yes")
+  endif()
+
+  # The flavors are listed in order; once one is not enabled, every later one
+  # in the list is disabled as well.
+  set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2")
+  foreach(flavor ${X86_FLAVORS})
+    if(ENABLE_${flavor} AND NOT disable_remaining_flavors)
+      set(HAVE_${flavor} 1)
+      set(RTCD_HAVE_${flavor} "yes")
+    else()
+      set(disable_remaining_flavors 1)
+      set(HAVE_${flavor} 0)
+      string(TOLOWER ${flavor} flavor)
+      set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+    endif()
+  endforeach()
+endif()
diff --git a/third_party/aom/build/cmake/dist.cmake b/third_party/aom/build/cmake/dist.cmake
new file mode 100644
index 0000000000..5b9fc95d41
--- /dev/null
+++ b/third_party/aom/build/cmake/dist.cmake
@@ -0,0 +1,64 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+# Converts spaces in $in_string to semicolons and writes the output to
+# $out_string. In CMake's eyes this converts the input string to a list.
+function(listify_string in_string out_string)
+  # $in_string is quoted so that it reaches string(REPLACE) as one argument:
+  # unquoted, separators already present would be dropped when the pieces are
+  # concatenated, and an empty input would make the call fail.
+  string(REPLACE " " ";" listified "${in_string}")
+  set(${out_string} "${listified}" PARENT_SCOPE)
+endfunction()
+
+# Variables that must be non-empty when this script runs; an empty one stops
+# the script with a fatal error.
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES"
+ "AOM_DIST_LIBS" "ENABLE_DOCS")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+if(ENABLE_DOCS)
+ file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}")
+endif()
+
+# Examples, tools and apps are optional. Entries matching aomdec or aomenc are
+# left out of bin/examples.
+if(AOM_DIST_EXAMPLES)
+ listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES")
+ foreach(example ${AOM_DIST_EXAMPLES})
+ if(NOT "${example}" MATCHES "aomdec\|aomenc")
+ file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples")
+ endif()
+ endforeach()
+endif()
+
+if(AOM_DIST_TOOLS)
+ listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS")
+ foreach(tool ${AOM_DIST_TOOLS})
+ file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools")
+ endforeach()
+endif()
+
+if(AOM_DIST_APPS)
+ listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS")
+ foreach(app ${AOM_DIST_APPS})
+ file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin")
+ endforeach()
+endif()
+
+# Headers and libraries are always installed (both lists are required above).
+listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES")
+foreach(inc ${AOM_DIST_INCLUDES})
+ file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom")
+endforeach()
+
+listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS")
+foreach(lib ${AOM_DIST_LIBS})
+ file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib")
+endforeach()
diff --git a/third_party/aom/build/cmake/exports.cmake b/third_party/aom/build/cmake/exports.cmake
new file mode 100644
index 0000000000..1cea2b52ab
--- /dev/null
+++ b/third_party/aom/build/cmake/exports.cmake
@@ -0,0 +1,76 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1)
+
+include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
+
+# Creates the custom target which handles generation of the symbol export lists.
+function(setup_exports_target)
+  # The extension of the generated symbol file depends on the platform.
+  if(APPLE)
+    set(symbol_file_ext "syms")
+  elseif(WIN32)
+    set(symbol_file_ext "def")
+  else()
+    set(symbol_file_ext "ver")
+  endif()
+
+  set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
+
+  # The symbol file is produced at build time by running
+  # generate_exports.cmake in script mode with the settings passed below.
+  add_custom_target(
+    generate_exports
+    COMMAND ${CMAKE_COMMAND}
+            -DAOM_ROOT="${AOM_ROOT}"
+            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+            -DAOM_SYM_FILE="${aom_sym_file}"
+            -DAOM_MSVC=${MSVC}
+            -DAOM_XCODE=${XCODE}
+            -DCMAKE_SHARED_LIBRARY_PREFIX="${CMAKE_SHARED_LIBRARY_PREFIX}"
+            -DCONFIG_NAME=$<CONFIG>
+            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+            -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
+            -DENABLE_TESTS=${ENABLE_TESTS}
+            -P
+            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+    SOURCES ${AOM_EXPORTS_SOURCES}
+    DEPENDS ${AOM_EXPORTS_SOURCES} BYPRODUCTS ${aom_sym_file})
+
+  # Make libaom depend on the exports file, and set flags to pick it up when
+  # creating the dylib.
+  add_dependencies(aom generate_exports)
+
+  # APPEND_STRING concatenates without a separator, so each value starts with
+  # a space to keep it apart from link flags already set on the target.
+  if(APPLE)
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS " -exported_symbols_list ${aom_sym_file}")
+  elseif(WIN32)
+    if(MSVC)
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS " /DEF:${aom_sym_file}")
+    else()
+      # For MinGW and MSYS compilers, you can use either version scripts or
+      # module definition files. If the latter, it must be supplied as an
+      # "object".
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS " ${aom_sym_file}")
+    endif()
+  else()
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS " -Wl,--version-script,${aom_sym_file}")
+  endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/exports_sources.cmake b/third_party/aom/build/cmake/exports_sources.cmake
new file mode 100644
index 0000000000..46bf001d86
--- /dev/null
+++ b/third_party/aom/build/cmake/exports_sources.cmake
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
+
+# AOM_EXPORTS_SOURCES collects the export list files to use. The common lists
+# are always included; the others depend on the build configuration.
+list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com"
+ "${AOM_ROOT}/av1/exports_com")
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec"
+ "${AOM_ROOT}/av1/exports_dec")
+ # exports_ident is only added for decoder builds with inspection enabled.
+ if(CONFIG_INSPECTION)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/av1/exports_ident")
+ endif()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc"
+ "${AOM_ROOT}/av1/exports_enc")
+endif()
+
+if(ENABLE_TESTS)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test"
+ "${AOM_ROOT}/av1/exports_test")
+endif()
diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
new file mode 100644
index 0000000000..529daaf02a
--- /dev/null
+++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+string(TIMESTAMP year "%Y")
+set(asm_file_header_block "\;
+\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+\;
+\; This source code is subject to the terms of the BSD 2 Clause License and
+\; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+\; was not distributed with this source code in the LICENSE file, you can
+\; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+\; Media Patent License 1.0 was not distributed with this source code in the
+\; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+\;
+")
+set(h_file_header_block "/*
+ * Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+\#ifndef AOM_CONFIG_H_
+\#define AOM_CONFIG_H_
+")
+set(cmake_file_header_block "##
+## Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+")
+
+# Terminates cmake execution when $var_name is an empty string, or the variable
+# name it contains does not expand to an existing directory.
+function(check_directory_var var_name)
+  # An empty variable name is a caller error.
+  if("${var_name}" STREQUAL "")
+    message(FATAL_ERROR "The CMake variable ${var_name} must be defined.")
+  endif()
+
+  # An undefined or empty variable is reported as such, rather than through
+  # the "missing" message below with an empty path.
+  if("${${var_name}}" STREQUAL "")
+    message(FATAL_ERROR "The CMake variable ${var_name} must be defined.")
+  endif()
+
+  # The variable's value must name something that exists on disk.
+  if(NOT EXISTS "${${var_name}}")
+    message(FATAL_ERROR "${${var_name}} (${var_name}) missing.")
+  endif()
+endfunction()
+
+check_directory_var(AOM_CONFIG_DIR)
+check_directory_var(AOM_ROOT)
+
+set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+if(NOT EXISTS "${AOM_DEFAULTS}")
+ message(
+ FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.")
+endif()
+
+# The defaults file supplies AOM_DETECT_VARS and AOM_CONFIG_VARS; together
+# (sorted) they are the variables written to the templates.
+include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS})
+list(SORT aom_build_vars)
+
+# C header template: one "#define NAME ${NAME}" line per variable, with the
+# ${NAME} reference written literally. AOM_RTCD_FLAGS is skipped.
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
+file(WRITE "${aom_config_h_template}" ${h_file_header_block})
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
+ file(APPEND "${aom_config_h_template}"
+ "\#define ${aom_var} \${${aom_var}}\n")
+ endif()
+endforeach()
+file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_")
+
+# Assembly template: one "NAME equ ${NAME}" line per variable. INLINE and
+# AOM_RTCD_FLAGS are skipped.
+set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
+file(WRITE "${aom_asm_config_template}" ${asm_file_header_block})
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "INLINE"
+ AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
+ file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n")
+ endif()
+endforeach()
diff --git a/third_party/aom/build/cmake/generate_exports.cmake b/third_party/aom/build/cmake/generate_exports.cmake
new file mode 100644
index 0000000000..3a5f67cea6
--- /dev/null
+++ b/third_party/aom/build/cmake/generate_exports.cmake
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+# CMAKE_SHARED_LIBRARY_PREFIX can be empty
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE"
+ "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
+
+# Per-platform decoration of each symbol line: "_" prefix on Darwin, ";"
+# suffix for the remaining (non-Windows/MSYS) targets.
+# NOTE(review): the Windows/MSYS branch here writes the same header that is
+# written again a few lines below; the first write looks redundant.
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(symbol_prefix "_")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+ file(WRITE "${AOM_SYM_FILE}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n"
+ "EXPORTS\n")
+else()
+ set(symbol_suffix ";")
+endif()
+
+set(aom_sym_file "${AOM_SYM_FILE}")
+
+# Start the output file afresh with the platform specific header (Darwin has
+# none, so the old file is just removed).
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ file(REMOVE "${aom_sym_file}")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+ file(WRITE "${aom_sym_file}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n"
+ "EXPORTS\n")
+else()
+ file(WRITE "${aom_sym_file}" "{\nglobal:\n")
+endif()
+
+# Read every export list file and gather its lines into one list.
+foreach(export_file ${AOM_EXPORTS_SOURCES})
+ file(STRINGS "${export_file}" exported_file_data)
+ set(exported_symbols "${exported_symbols} ${exported_file_data};")
+ string(STRIP "${exported_symbols}" exported_symbols)
+endforeach()
+
+# Each entry starts with "text " or "data "; that tag is removed before the
+# symbol is written. On Windows/MSYS entries tagged "data" get a " DATA"
+# suffix.
+foreach(exported_symbol ${exported_symbols})
+ string(STRIP "${exported_symbol}" exported_symbol)
+ if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS")
+ string(SUBSTRING ${exported_symbol} 0 4 export_type)
+ string(COMPARE EQUAL "${export_type}" "data" is_data)
+ if(is_data)
+ set(symbol_suffix " DATA")
+ else()
+ set(symbol_suffix "")
+ endif()
+ endif()
+ string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}")
+ set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}")
+ file(APPEND "${aom_sym_file}" "${exported_symbol}\n")
+endforeach()
+
+# Files ending in "ver" are closed with a local: section.
+if("${aom_sym_file}" MATCHES "ver$")
+ file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};")
+endif()
diff --git a/third_party/aom/build/cmake/pkg_config.cmake b/third_party/aom/build/cmake/pkg_config.cmake
new file mode 100644
index 0000000000..c4f94808a5
--- /dev/null
+++ b/third_party/aom/build/cmake/pkg_config.cmake
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+# Variables that must be non-empty when this script runs.
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
+ "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR"
+ "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME"
+ "CONFIG_MULTITHREAD")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version)
+
+# Create a version string suitable for comparison using the RPM version compare
+# algorithm: strip out everything after the number.
+# Concretely, the version is cut at the first "-" when one is present.
+string(FIND "${aom_version}" "-" dash_pos)
+if(${dash_pos} EQUAL -1)
+ set(package_version "${aom_version}")
+else()
+ string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version)
+endif()
+
+# Write pkg-config info.
+# "\${...}" sequences are written literally so that pkg-config, not CMake,
+# expands them.
+set(prefix "${CMAKE_INSTALL_PREFIX}")
+set(bindir "${CMAKE_INSTALL_BINDIR}")
+set(includedir "${CMAKE_INSTALL_INCLUDEDIR}")
+set(libdir "${CMAKE_INSTALL_LIBDIR}")
+set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc")
+string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name)
+file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n")
+file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n")
+file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}\n")
+file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/${includedir}\n")
+file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n")
+file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
+file(
+ APPEND "${pkgconfig_file}"
+ "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
+file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
+# The Requires: line is built up piece by piece and terminated by the newline
+# at the start of the Conflicts: write.
+file(APPEND "${pkgconfig_file}" "Requires:")
+if(CONFIG_TUNE_VMAF)
+ file(APPEND "${pkgconfig_file}" " libvmaf")
+endif()
+if(CONFIG_TUNE_BUTTERAUGLI)
+ file(APPEND "${pkgconfig_file}" " libjxl")
+endif()
+file(APPEND "${pkgconfig_file}" "\nConflicts:\n")
+file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
+if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT)
+ file(APPEND "${pkgconfig_file}"
+ "Libs.private: -lm ${CMAKE_THREAD_LIBS_INIT}\n")
+else()
+ file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
+endif()
+file(APPEND "${pkgconfig_file}" "Cflags: -I\${includedir}\n")
diff --git a/third_party/aom/build/cmake/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl
new file mode 100755
index 0000000000..1cf52f076c
--- /dev/null
+++ b/third_party/aom/build/cmake/rtcd.pl
@@ -0,0 +1,430 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+ $disabled{$1} = 1, next if /--disable-(.*)/;
+ $required{$1} = 1, next if /--require-(.*)/;
+ push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+ \%opts,
+ 'arch=s',
+ 'sym=s',
+ 'config=s',
+);
+
+foreach my $opt (qw/arch config sym/) {  # --sym too: common_top() prints it into the header.
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);  # Print the usage text and exit with status 1.
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+ if (!-f $defs_file) {
+ warn "$defs_file: $!\n";
+ Getopt::Long::HelpMessage('-exit' => 1);
+ }
+}
+
+open CONFIG_FILE, '<', $opts{config} or  # 3-arg open: the path is never parsed as a mode or pipe.
+  die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();  # Flag name => "yes" when defined to 1, "" otherwise.
+while (<CONFIG_FILE>) {
+  next if !/^#define\s+(?:CONFIG_|HAVE_)/;
+  chomp;
+  my @line_components = split /\s+/;  # Split on whitespace runs, matching the \s+ accepted above.
+  scalar @line_components > 2 or
+    die "Invalid input passed to rtcd.pl via $opts{config}.";
+  # $line_components[0] = #define
+  # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+  # $line_components[2] = flag value (0 or 1)
+  $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub aom_config($) { # Look up a flag parsed from the --config file.
+ return (defined $config{$_[0]}) ? $config{$_[0]} : ""; # "" when the flag was never seen.
+}
+
+sub specialize { # specialize(FN, OPT...): register OPT-specific variants of FN.
+ if (@_ <= 1) { # Need the function name plus at least one OPT.
+ die "'specialize' must be called with a function name and at least one ",
+ "architecture ('C' is implied): \n@_\n";
+ }
+ my $fn=$_[0];
+ shift; # Drop FN so that @_ holds only the OPT names.
+ foreach my $opt (@_) {
+ eval "\$${fn}_${opt}=${fn}_${opt}"; # Assigns the bareword FN_OPT to the package variable $FN_OPT.
+ }
+}
+
+sub add_proto { # Record a function prototype in %ALL_FUNCS and register its "c" variant.
+ my $fn = splice(@_, -2, 1); # The function name is the second-to-last argument.
+ my @proto = @_; # Everything else; the last element is later read as the argument list.
+ foreach (@proto) { tr/\t/ / } # Replace tabs with spaces.
+ $ALL_FUNCS{$fn} = \@proto;
+ specialize $fn, "c";
+}
+
+sub require { # require(OPT...): make the OPT variant, where one exists, each function's default.
+ foreach my $fn (keys %ALL_FUNCS) {
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn; # No OPT variant was registered for this function.
+
+ # if we already have a default, then we can disable it, as we know
+ # we can do better.
+ my $best = eval "\$${fn}_default";
+ if ($best) {
+ my $best_ofn = eval "\$${best}";
+ if ($best_ofn && "$best_ofn" ne "$ofn") {
+ eval "\$${best}_link = 'false'"; # Flag the superseded default as not linked.
+ }
+ }
+ eval "\$${fn}_default=${fn}_${opt}"; # A later OPT in the list replaces an earlier one.
+ eval "\$${fn}_${opt}_link='true'";
+ }
+ }
+}
+
+sub forward_decls { # Queue the given callbacks; process_forward_decls() invokes them.
+ push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) { # What is left in @ARGV after option parsing: the definition files.
+ open FILE, "<", $f or die "cannot open $f: $!\n";
+ my $contents = join('', <FILE>); # Slurp the whole file into one string.
+ close FILE;
+ eval $contents or warn "eval failed: $@\n"; # Runs the file as Perl in this process; a false result only warns.
+}
+
+#
+# Process the directives according to the command line
+#
+# Invoke, in registration order, every callback queued through
+# forward_decls(); each one is called with no arguments.
+sub process_forward_decls() {
+  $_->() for @ALL_FORWARD_DECLS;
+}
+
+sub determine_indirection {
+  # determine_indirection(OPT...): for every function, set $FN_indirect to
+  # 'false' when exactly one variant among OPT is specialized and has not
+  # been marked unlinked, and to 'true' otherwise.
+  # With runtime CPU detection off, every enabled extension is required first.
+  aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";  # Gains one "x" per usable variant.
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    # A single candidate is emitted as a #define; anything else as a pointer.
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers { # Print prototypes plus a #define or a pointer declaration per function.
+ foreach my $fn (sort keys %ALL_FUNCS) {
+ my @val = @{$ALL_FUNCS{$fn}};
+ my $args = pop @val; # The last stored element is the argument list.
+ my $rtyp = "@val"; # The remaining elements, space-joined, form the return type.
+ my $dfn = eval "\$${fn}_default";
+ $dfn = eval "\$${dfn}"; # Resolve the default variant to the name stored for it.
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+ print "$rtyp ${ofn}($args);\n"; # One prototype per registered variant.
+ }
+ if (eval "\$${fn}_indirect" eq "false") {
+ print "#define ${fn} ${dfn}\n"; # No indirection: alias the name to the default.
+ } else {
+ print "RTCD_EXTERN $rtyp (*${fn})($args);\n"; # Otherwise declare a function pointer.
+ }
+ print "\n";
+ }
+}
+
+sub set_function_pointers { # Print the pointer assignments for every indirect function.
+ foreach my $fn (sort keys %ALL_FUNCS) {
+ my @val = @{$ALL_FUNCS{$fn}};
+ my $args = pop @val;
+ my $rtyp = "@val";
+ my $dfn = eval "\$${fn}_default";
+ $dfn = eval "\$${dfn}";
+ if (eval "\$${fn}_indirect" eq "true") {
+ print " $fn = $dfn;\n"; # Start from the default implementation.
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+ next if "$ofn" eq "$dfn"; # Same as the default assigned above.
+ my $link = eval "\$${fn}_${opt}_link";
+ next if $link && $link eq "false"; # Variant was marked unlinked by require().
+ my $cond = eval "\$have_${opt}"; # Condition text stored by the calling arch routine.
+ print " if (${cond}) $fn = $ofn;\n"
+ }
+ }
+ }
+}
+
+# filter(EXT...): return EXT in its original order, minus every extension
+# that was turned off on the command line with --disable-EXT.
+sub filter {
+  return grep { !$disabled{$_} } @_;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() { # Print the header's opening part: guard, RTCD_EXTERN, declarations, init prototype.
+ my $include_guard = uc($opts{sym})."_H_"; # Guard name is the upper-cased --sym value plus "_H_".
+ print <<EOF;
+// This file is generated. Do not edit.
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+EOF
+
+process_forward_decls(); # Run the callbacks queued via forward_decls().
+print <<EOF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+declare_function_pointers("c", @ALL_ARCHS); # Prototypes and #define/pointer declarations.
+
+print <<EOF;
+void $opts{sym}(void);
+
+EOF
+}
+
+sub common_bottom() { # Print the closing part: end of the extern "C" block and the final #endif.
+ print <<EOF;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
+EOF
+}
+
+sub x86() { # Print the complete RTCD header for x86 targets.
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\""; # $have_OPT holds the C condition text "flags & HAS_OPT".
+ }
+
+ common_top;
+ print <<EOF;
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS); # Body of setup_rtcd_internal(): the pointer assignments.
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub arm() { # Print the complete RTCD header for Arm targets.
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\""; # $have_OPT holds the C condition text "flags & HAS_OPT".
+ }
+
+ common_top;
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = aom_arm_cpu_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS); # Body of setup_rtcd_internal(): the pointer assignments.
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub ppc() { # Print the complete RTCD header for PowerPC targets.
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\""; # $have_OPT holds the C condition text "flags & HAS_OPT".
+ }
+
+ common_top;
+
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = ppc_simd_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS); # Body of setup_rtcd_internal(): the pointer assignments.
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub unoptimized() { # Print the RTCD header considering only the "c" variants.
+ determine_indirection "c";
+ common_top;
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+ set_function_pointers "c"; # Emits assignments only for functions marked indirect.
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+#
+# Main Driver
+#
+
+&require("c"); # Make the "c" variant the starting default for every function.
+&require(keys %required); # Extensions named with --require-EXT.
+if ($opts{arch} eq 'x86') {
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
+ x86;
+} elsif ($opts{arch} eq 'x86_64') {
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
+ @REQUIRES = filter(qw/mmx sse sse2/); # Required for this arch unless disabled with --disable-EXT.
+ &require(@REQUIRES);
+ x86;
+} elsif ($opts{arch} =~ /armv[78]\w?/) {
+ @ALL_ARCHS = filter(qw/neon/);
+ arm;
+} elsif ($opts{arch} eq 'arm64' ) {
+ @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve/);
+ @REQUIRES = filter(qw/neon/); # Required for this arch unless disabled with --disable-EXT.
+ &require(@REQUIRES);
+ arm;
+} elsif ($opts{arch} eq 'ppc') {
+ @ALL_ARCHS = filter(qw/vsx/);
+ ppc;
+} else {
+ unoptimized; # Any other --arch value gets the C-only header.
+}
+
+__END__
+
+=head1 NAME
+
+rtcd - generate a Run Time CPU Detection (RTCD) C header
+
+=head1 SYNOPSIS
+
+Usage: rtcd.pl [options] FILE
+
+See 'perldoc rtcd.pl' for more details.
+
+=head1 DESCRIPTION
+
+Reads the Run Time CPU Detection definitions from FILE and generates a
+C header file on stdout.
+
+=head1 OPTIONS
+
+Options:
+ --arch=ARCH Architecture to generate defs for (required)
+ --disable-EXT Disable support for EXT extensions
+ --require-EXT Require support for EXT extensions
+ --sym=SYMBOL Unique symbol to use for RTCD initialization function
+ --config=FILE Path to file containing C preprocessor directives to parse (required)
diff --git a/third_party/aom/build/cmake/sanitizers.cmake b/third_party/aom/build/cmake/sanitizers.cmake
new file mode 100644
index 0000000000..bcb600ce4c
--- /dev/null
+++ b/third_party/aom/build/cmake/sanitizers.cmake
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_SANITIZERS_CMAKE_
+set(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_ 1)
+
+if(MSVC OR NOT SANITIZE)
+ return()
+endif()
+
+include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
+
+string(TOLOWER ${SANITIZE} SANITIZE)
+
+# Require the sanitizer requested. cfi sanitizer requires all the flags in order
+# for the compiler to accept it.
+if("${SANITIZE}" MATCHES "cfi" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
+ require_linker_flag("-fsanitize=${SANITIZE} -flto -fno-sanitize-trap=cfi \
+ -fuse-ld=gold" YES)
+ require_compiler_flag("-fsanitize=${SANITIZE} -flto -fvisibility=hidden \
+ -fno-sanitize-trap=cfi" YES)
+else()
+ require_linker_flag("-fsanitize=${SANITIZE}")
+ require_compiler_flag("-fsanitize=${SANITIZE}" YES)
+endif()
+
+# Make callstacks accurate.
+require_compiler_flag("-fno-omit-frame-pointer -fno-optimize-sibling-calls" YES)
+
+# Fix link errors due to missing rt compiler lib in 32-bit builds.
+# http://llvm.org/bugs/show_bug.cgi?id=17693
+if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+ AND "${SANITIZE}" MATCHES "integer|undefined")
+ require_linker_flag("--rtlib=compiler-rt -lgcc_s")
+ endif()
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/android.cmake b/third_party/aom/build/cmake/toolchains/android.cmake
new file mode 100644
index 0000000000..fb086856a7
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/android.cmake
@@ -0,0 +1,53 @@
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ANDROID_CMAKE_ 1)
+
+if(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-24)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(AOM_ANDROID_NDK_PATH)
+ set(ENV{_AOM_ANDROID_NDK_PATH} "${AOM_ANDROID_NDK_PATH}")
+else()
+ set(AOM_ANDROID_NDK_PATH "$ENV{_AOM_ANDROID_NDK_PATH}")
+endif()
+
+if("${AOM_ANDROID_NDK_PATH}" STREQUAL "")
+ message(FATAL_ERROR "AOM_ANDROID_NDK_PATH not set.")
+ return()
+endif()
+
+include("${AOM_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
+
+if(ANDROID_ABI MATCHES "^armeabi")
+ set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
+endif()
+
+if(ANDROID_ABI MATCHES "^arm")
+ set(CMAKE_ASM_COMPILER as)
+elseif(ANDROID_ABI MATCHES "^x86")
+ set(CMAKE_ASM_NASM_COMPILER yasm)
+endif()
+
+set(CMAKE_SYSTEM_NAME "Android")
diff --git a/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
new file mode 100644
index 0000000000..2c433befd9
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_ 1) # Same name as tested above.
+
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_SYSROOT iphoneos)
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+
+# TODO(tomfinegan): Handle bit code embedding.
diff --git a/third_party/aom/build/cmake/toolchains/arm64-ios.cmake b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
new file mode 100644
index 0000000000..6feb1090f2
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_ 1)
+
+if(XCODE) # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_OSX_ARCHITECTURES "arm64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-linux-clang.cmake b/third_party/aom/build/cmake/toolchains/arm64-linux-clang.cmake
new file mode 100644
index 0000000000..b4645cc09e
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-linux-clang.cmake
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_CLANG_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+set(TRIPLE aarch64-linux-gnu)
+
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_ASM_COMPILER clang)
+set(CMAKE_ASM_COMPILER_TARGET ${TRIPLE})
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
new file mode 100644
index 0000000000..3d0dff0252
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by Debian and other package
+ # management systems.
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
+set(CMAKE_C_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(AOM_AS_FLAGS "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+
+# No intrinsics flag required for arm64-linux-gcc.
+set(AOM_NEON_INTRIN_FLAG "")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-macos.cmake b/third_party/aom/build/cmake/toolchains/arm64-macos.cmake
new file mode 100644
index 0000000000..99f8d16e16
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-macos.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "arm64")
+set(CMAKE_C_FLAGS_INIT "-arch arm64")
+set(CMAKE_CXX_FLAGS_INIT "-arch arm64")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch arm64")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
new file mode 100644
index 0000000000..95b26d3ceb
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by MSYS2.
+ set(CROSS aarch64-w64-mingw32-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/armv7-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
new file mode 100644
index 0000000000..11f7e160df
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(CMAKE_OSX_ARCHITECTURES "armv7")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
+
+# No intrinsics flag required for armv7-ios.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for armv7-ios.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
new file mode 100644
index 0000000000..aa0550574d
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by Debian and other package
+ # management systems.
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+if(NOT ${CROSS} MATCHES hf-$)
+ set(AOM_EXTRA_TOOLCHAIN_FLAGS "-mfloat-abi=softfp")
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
+ ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -mfpu=vfpv3 \
+ ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
+ ${AOM_EXTRA_TOOLCHAIN_FLAGS})
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+
+set(AOM_NEON_INTRIN_FLAG "-mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
diff --git a/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
new file mode 100644
index 0000000000..93f8c065c9
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by MSYS2.
+ set(CROSS armv7-w64-mingw32-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
+
+# No runtime cpu detect for armv7-mingw-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
new file mode 100644
index 0000000000..faa2933cf0
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7s")
+set(CMAKE_OSX_ARCHITECTURES "armv7s")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
+
+# No intrinsics flag required for armv7s-ios.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for armv7s-ios.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/third_party/aom/build/cmake/toolchains/i686-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/i686-linux-gcc.cmake
new file mode 100644
index 0000000000..c4f6ab9465
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/i686-linux-gcc.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2023, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_I686_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by Debian and other package
+ # management systems.
+ set(CROSS i686-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
new file mode 100644
index 0000000000..173c423c3d
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_ 1) # Same name as tested above.
+
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_SYSROOT iphonesimulator)
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+
+# TODO(tomfinegan): Handle bit code embedding.
diff --git a/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
new file mode 100644
index 0000000000..3aa265254e
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to one used by Debian and other package
+ # management systems.
+ set(CROSS powerpc64le-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc)
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++)
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as)
+endif()
+set(CMAKE_SYSTEM_PROCESSOR "ppc")
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "")
diff --git a/third_party/aom/build/cmake/toolchains/riscv-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/riscv-linux-gcc.cmake
new file mode 100644
index 0000000000..4133be68b3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/riscv-linux-gcc.cmake
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_RISCV_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # No prefix was supplied: default the cross compiler prefix to one used by
+ # Debian and other package management systems.
+ set(CROSS riscv64-linux-gnu-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc) # Only when no C compiler was chosen.
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++) # Only when no C++ compiler was chosen.
+endif()
+if(NOT CMAKE_ASM_COMPILER)
+ set(CMAKE_ASM_COMPILER ${CROSS}as) # Only when no assembler was chosen.
+endif()
+set(CMAKE_SYSTEM_PROCESSOR "riscv")
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE STRING "") # Cache default: 0.
diff --git a/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
new file mode 100644
index 0000000000..caacb8c38b
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle ios sim builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "i386")
+set(CMAKE_OSX_ARCHITECTURES "i386")
+
+# Default CONFIG_PIC to 1 in the cache to avoid noisy PIC/PIE warnings.
+set(CONFIG_PIC 1 CACHE STRING "")
+
+include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/x86-linux.cmake b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
new file mode 100644
index 0000000000..a9c4f8c6b4
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
@@ -0,0 +1,20 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_C_FLAGS_INIT "-m32") # The same -m32 flag is given to C, C++ and
+set(CMAKE_CXX_FLAGS_INIT "-m32") # the executable linker.
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-m32")
diff --git a/third_party/aom/build/cmake/toolchains/x86-macos.cmake b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
new file mode 100644
index 0000000000..68e1bb07ff
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
@@ -0,0 +1,19 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "i386")
+set(CMAKE_C_FLAGS_INIT "-arch i386") # The same -arch flag is given to C,
+set(CMAKE_CXX_FLAGS_INIT "-arch i386") # C++ and the executable linker.
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch i386")
+
+# Apple tools always complain in 32 bit mode without PIC, so default it on.
+set(CONFIG_PIC 1 CACHE STRING "")
diff --git a/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
new file mode 100644
index 0000000000..2208333f37
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Windows")
+set(CMAKE_C_FLAGS_INIT "-m32")
+set(CMAKE_CXX_FLAGS_INIT "-m32")
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-m32")
+
+if("${CROSS}" STREQUAL "")
+
+ # No prefix was supplied: default the cross compiler prefix to one used by
+ # Debian and other package management systems.
+ set(CROSS i686-w64-mingw32-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc) # Only when no C compiler was chosen.
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++) # Only when no C++ compiler was chosen.
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
new file mode 100644
index 0000000000..d4b40ed098
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
@@ -0,0 +1,25 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle ios sim builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_OSX_ARCHITECTURES "x86_64") # Same architecture as above.
+
+include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-macos.cmake b/third_party/aom/build/cmake/toolchains/x86_64-macos.cmake
new file mode 100644
index 0000000000..899df6f353
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86_64-macos.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright (c) 2022, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "x86_64")
+set(CMAKE_C_FLAGS_INIT "-arch x86_64") # The same -arch flag is given to C,
+set(CMAKE_CXX_FLAGS_INIT "-arch x86_64") # C++ and the executable linker.
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-arch x86_64")
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
new file mode 100644
index 0000000000..978146a4f2
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -0,0 +1,37 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_)
+ return() # Include guard: this toolchain file was already processed.
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+
+ # No prefix was supplied: default the cross compiler prefix to one used by
+ # Debian and other package management systems.
+ set(CROSS x86_64-w64-mingw32-)
+endif()
+
+if(NOT CMAKE_C_COMPILER)
+ set(CMAKE_C_COMPILER ${CROSS}gcc) # Only when no C compiler was chosen.
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+ set(CMAKE_CXX_COMPILER ${CROSS}g++) # Only when no C++ compiler was chosen.
+endif()
+if(NOT CMAKE_AR)
+ set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+endif()
+if(NOT CMAKE_RANLIB)
+ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+endif()
diff --git a/third_party/aom/build/cmake/util.cmake b/third_party/aom/build/cmake/util.cmake
new file mode 100644
index 0000000000..31de2e1702
--- /dev/null
+++ b/third_party/aom/build/cmake/util.cmake
@@ -0,0 +1,173 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_UTIL_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_UTIL_CMAKE_
+set(AOM_BUILD_CMAKE_UTIL_CMAKE_ 1)
+
+# Directory where generated sources will be written.
+set(AOM_GEN_SRC_DIR "${AOM_CONFIG_DIR}/gen_src")
+
+# Creates a no-op source file named ${basename}_no_op.${extension} in
+# $AOM_GEN_SRC_DIR and appends its full path to the list variable named by
+# $out_file_list_var. The generated symbol is derived from $basename alone.
+macro(create_no_op_source_file basename extension out_file_list_var)
+  set(no_op_source_file "${AOM_GEN_SRC_DIR}/${basename}_no_op.${extension}")
+  file(WRITE "${no_op_source_file}"
+       "// Generated file. DO NOT EDIT!\n"
+       "// ${basename} needs a ${extension} file to force link language, \n"
+       "// or to silence a harmless CMake warning: Ignore me.\n"
+       "void aom_${basename}_no_op_function(void);\n"
+       "void aom_${basename}_no_op_function(void) {}\n")
+  list(APPEND "${out_file_list_var}" "${no_op_source_file}")
+endmacro()
+
+# Generates a no-op source file with file extension $extension for
+# $target_name via create_no_op_source_file() and adds it to that target.
+function(add_no_op_source_file_to_target target_name extension)
+  create_no_op_source_file("${target_name}" "${extension}"
+                           "generated_no_op_sources")
+  target_sources(${target_name} PRIVATE ${generated_no_op_sources})
+endfunction()
+
+# Assigns $value to the variable named by $feature in the caller's scope and
+# reports the change through message(WARNING ...). $cause names the
+# configuration variable that forced the change. Returns without doing
+# anything when $feature already compares EQUAL to $value.
+function(change_config_and_warn feature value cause)
+  if(${feature} EQUAL ${value})
+    return()
+  endif()
+  set(${feature} ${value} PARENT_SCOPE)
+  if(NOT ${value} EQUAL 1)
+    set(action "Disabled")
+    set(relation "incompatible with")
+  else()
+    set(action "Enabled")
+    set(relation "required for")
+  endif()
+  # Tell the user which setting was overridden and why.
+  message(WARNING "--- ${action} ${feature}, ${relation} ${cause}.")
+endfunction()
+
+# Reads the VERSION_STRING_NOSP definition from $version_file and hands the
+# bare version back through $version_string_out_var. The matching line is
+# reduced to the contents of the string literal by removing the #define
+# prefix, the double quotes and any spaces; a leading 'v', when present, is
+# dropped as well.
+function(extract_version_string version_file version_string_out_var)
+  file(STRINGS "${version_file}" ver REGEX "VERSION_STRING_NOSP")
+  string(REPLACE "#define VERSION_STRING_NOSP " "" ver "${ver}")
+  string(REPLACE "\"" "" ver "${ver}")
+  string(REPLACE " " "" ver "${ver}")
+  string(FIND "${ver}" "v" v_index)
+  if(v_index EQUAL 0)
+    string(SUBSTRING "${ver}" 1 -1 ver)
+  endif()
+  set("${version_string_out_var}" "${ver}" PARENT_SCOPE)
+endfunction()
+
+# Sets the CMake C and C++ compiler launchers to $launcher_name when it is found
+# in $PATH; otherwise warns that build flag $launcher_flag is ignored. The
+# find_program() result is cached per launcher, so each call finds its own tool.
+function(set_compiler_launcher launcher_flag launcher_name)
+  set(path_var "${launcher_name}_launcher_path")
+  find_program(${path_var} "${launcher_name}")
+  if(${path_var})
+    set(CMAKE_C_COMPILER_LAUNCHER "${${path_var}}" PARENT_SCOPE)
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${${path_var}}" PARENT_SCOPE)
+    message("--- Using ${launcher_name} as compiler launcher.")
+  else()
+    message(WARNING "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
+  endif()
+endfunction()
+
+# Sentinel value used to detect when a variable has been set via the -D argument
+# passed to CMake on the command line.
+set(cmake_cmdline_helpstring "No help, variable specified on the command line.")
+
+# Wrapper macro for set() that also records the variable for build logging and
+# diagnostics.
+#
+# Appends $name to $AOM_DETECT_VARS if it is not already listed, then calls
+# set(... CACHE STRING ...) with $value and $helpstring and marks the entry as
+# advanced. When the cache entry for $name already carries CMake's command
+# line help string (see cmake_cmdline_helpstring), set() is skipped and a
+# warning is printed instead.
+macro(set_aom_detect_var name value helpstring)
+ unset(list_index)
+ list(FIND AOM_DETECT_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_DETECT_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE STRING "${helpstring}")
+ mark_as_advanced(${name})
+ else()
+ message(
+ WARNING
+ "${name} has been set by CMake, but it may be overridden by the build "
+ "system during environment detection")
+ endif()
+endmacro()
+
+# Wrapper macro for set() that also records the variable for build logging and
+# diagnostics.
+#
+# Appends $name to $AOM_CONFIG_VARS if it is not already listed, then calls
+# set(... CACHE STRING ...) with $value and $helpstring. When the cache entry
+# for $name already carries CMake's command line help string (see
+# cmake_cmdline_helpstring), set() is skipped; unlike set_aom_detect_var(),
+# no warning is printed in that case.
+macro(set_aom_config_var name value helpstring)
+ unset(list_index)
+ list(FIND AOM_CONFIG_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_CONFIG_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE STRING "${helpstring}")
+ endif()
+endmacro()
+
+# Wrapper macro for option() that also records the variable for build logging
+# and diagnostics. Note the argument order: $helpstring comes before $value,
+# following option() rather than set_aom_config_var()/set_aom_detect_var().
+#
+# Appends $name to $AOM_OPTION_VARS if it is not already listed, then calls
+# option() with $helpstring and $value. The option() call is skipped when the
+# cache entry for $name already carries CMake's command line help string (see
+# cmake_cmdline_helpstring).
+macro(set_aom_option_var name helpstring value)
+ unset(list_index)
+ list(FIND AOM_OPTION_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_OPTION_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ option(${name} "${helpstring}" ${value})
+ endif()
+endmacro()
diff --git a/third_party/aom/build/cmake/version.cmake b/third_party/aom/build/cmake/version.cmake
new file mode 100644
index 0000000000..f4377a13e1
--- /dev/null
+++ b/third_party/aom/build/cmake/version.cmake
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "GIT_EXECUTABLE"
+ "PERL_EXECUTABLE")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# Generate the version string for this run, preferring `git describe` output.
+unset(aom_version)
+if(EXISTS "${GIT_EXECUTABLE}")
+ execute_process(COMMAND ${GIT_EXECUTABLE}
+ --git-dir=${AOM_ROOT}/.git describe
+ --match=v[0-9]*
+ OUTPUT_VARIABLE aom_version
+ ERROR_QUIET
+ RESULT_VARIABLE version_check_result)
+
+ if(${version_check_result} EQUAL 0)
+ string(STRIP "${aom_version}" aom_version)
+
+ # Remove the leading 'v' from the version string.
+ string(FIND "${aom_version}" "v" v_pos)
+ if(${v_pos} EQUAL 0)
+ string(SUBSTRING "${aom_version}" 1 -1 aom_version)
+ endif()
+ else()
+ set(aom_version "") # git describe returned a non-zero result.
+ endif()
+endif()
+
+if("${aom_version}" STREQUAL "")
+ set(aom_version "${AOM_ROOT}/CHANGELOG") # Fall back to the CHANGELOG path.
+endif()
+
+unset(last_aom_version)
+set(version_file "${AOM_CONFIG_DIR}/config/aom_version.h")
+if(EXISTS "${version_file}")
+ extract_version_string("${version_file}" last_aom_version)
+ if("${aom_version}" MATCHES "CHANGELOG$")
+ set(aom_version "${last_aom_version}") # Keeps the existing header as is.
+ endif()
+endif()
+
+if(NOT "${aom_version}" STREQUAL "${last_aom_version}")
+ # TODO(tomfinegan): Perl dependency is unnecessary. CMake can do everything
+ # that is done by version.pl on its own (if a bit more verbosely...).
+ execute_process(COMMAND ${PERL_EXECUTABLE}
+ "${AOM_ROOT}/build/cmake/version.pl"
+ --version_data=${aom_version}
+ --version_filename=${version_file} VERBATIM)
+endif()
diff --git a/third_party/aom/build/cmake/version.pl b/third_party/aom/build/cmake/version.pl
new file mode 100755
index 0000000000..392815f81d
--- /dev/null
+++ b/third_party/aom/build/cmake/version.pl
@@ -0,0 +1,114 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+use strict;
+use warnings;
+use 5.010;
+use Getopt::Long;
+
+my $git_desc = '';
+my $version_data;
+my $version_filename;
+GetOptions('version_data=s' => \$version_data,
+ 'version_filename=s' => \$version_filename) or
+ die("Invalid arg(s): $!");
+
+if (!defined $version_data || length($version_data) == 0 ||
+ !defined $version_filename || length($version_filename) == 0) {
+ die("--version_data and --version_filename are required.");
+}
+
+# Determine if $version_data is a filename or a git tag/description.
+my $version_string;
+chomp($version_data);
+if (-r $version_data) {
+ # $version_data is the path to the CHANGELOG. Parse the most recent version.
+ my $changelog_filename = $version_data;
+ open(my $changelog_file, '<', $changelog_filename) or
+ die("Unable to open CHANGELOG @ $changelog_filename: $!.");
+
+ while (my $line = <$changelog_file>) {
+ my @split_line = split(" ", $line, 3);
+ next if @split_line < 2;
+ $version_string = $split_line[1];
+ last if substr($version_string, 0, 1) eq "v";
+ }
+ close($changelog_file);
+} else {
+ # $version_data is either a tag name or a full git description, one of:
+ # tagName OR tagName-commitsSinceTag-shortCommitHash
+ # In either case we want the first element of the array returned by split.
+ $version_string = (split("-", $version_data))[0];
+ $git_desc = $version_data;
+}
+
+if (substr($version_string, 0, 1) eq "v") {
+ $version_string = substr($version_string, 1);
+}
+
+my @version_components = split('\.', $version_string, 4);
+my $version_major = $version_components[0] || 0;  # A missing or empty
+my $version_minor = $version_components[1] || 0;  # component becomes 0 so the
+my $version_patch = $version_components[2] || 0;  # #define lines stay valid.
+
+my $version_extra = "";
+if (length($git_desc) > 0) {
+ my @git_desc_components = split('-', $git_desc, 2);
+ if (@git_desc_components > 1) {
+ $version_extra = $git_desc_components[1];
+ }
+}
+
+open(my $version_file, '>', $version_filename) or
+ die("Cannot open $version_filename: $!");
+
+my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))";
+my $year = (localtime)[5] + 1900;
+my $lic_block = << "EOF";
+/*
+ * Copyright (c) $year, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+EOF
+
+select $version_file;
+if (length($git_desc)) {
+ print << "EOF";
+$lic_block
+#define VERSION_MAJOR $version_major
+#define VERSION_MINOR $version_minor
+#define VERSION_PATCH $version_patch
+#define VERSION_EXTRA \"$version_extra\"
+#define VERSION_PACKED \\
+ $version_packed
+#define VERSION_STRING_NOSP \"$git_desc\"
+#define VERSION_STRING \" $git_desc\"
+EOF
+} else {
+ print << "EOF";
+$lic_block
+#define VERSION_MAJOR $version_major
+#define VERSION_MINOR $version_minor
+#define VERSION_PATCH $version_patch
+#define VERSION_EXTRA \"$version_extra\"
+#define VERSION_PACKED \\
+ $version_packed
+#define VERSION_STRING_NOSP \"v$version_string\"
+#define VERSION_STRING \" v$version_string\"
+EOF
+}
+close($version_file);
diff --git a/third_party/aom/codereview.settings b/third_party/aom/codereview.settings
new file mode 100644
index 0000000000..185e9344cf
--- /dev/null
+++ b/third_party/aom/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: aomedia-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/third_party/aom/common/args.c b/third_party/aom/common/args.c
new file mode 100644
index 0000000000..b5ede193b5
--- /dev/null
+++ b/third_party/aom/common/args.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/args.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "aom/aom_codec.h"
+#include "common/tools_common.h"
+
+static const char kSbSizeWarningString[] =
+ "super_block_size has to be 64 or 128.";
+static const char kMinpartWarningString[] =
+ "min_partition_size has to be smaller or equal to max_partition_size.";
+static const char kMaxpartWarningString[] =
+ "max_partition_size has to be smaller or equal to super_block_size.";
+
+static char *ignore_front_spaces(const char *str) {
+  for (; *str == ' ' || *str == '\t'; str++) continue;  // skip blanks and tabs
+  return (char *)str;
+}
+
+// Strips trailing spaces, tabs, '\n' and '\r' from str in place. Never writes
+// past the original terminator, so an empty string is handled safely.
+static void ignore_end_spaces(char *str) {
+  char *end = str + strlen(str);  // one past the last character
+  while (end > str && strchr(" \t\n\r", end[-1]) != NULL) --end;
+  *end = '\0';
+}
+
+int parse_cfg(const char *file, cfg_options_t *config) {
+ char line[1024 * 10];
+ FILE *f = fopen(file, "r");
+ if (!f) return 1;  // 1 means the file could not be opened
+
+#define GET_PARAMS(field) \
+ if (strcmp(left, #field) == 0) { \
+ config->field = atoi(right); \
+ continue; \
+ }
+
+ while (fgets(line, sizeof(line) - 1, f)) {
+ char *actual_line = ignore_front_spaces(line);
+ char *left, *right, *comment;
+ size_t length = strlen(actual_line);
+
+ if (length == 0 || actual_line[0] == '#') continue;  // blank or comment line
+ right = strchr(actual_line, '=');
+ if (right == NULL) continue;  // lines without '=' are ignored
+ right[0] = '\0';  // split the line into key (left) and value (right)
+
+ left = ignore_front_spaces(actual_line);
+ right = ignore_front_spaces(right + 1);
+
+ comment = strchr(right, '#');
+ if (comment != NULL) comment[0] = '\0';  // drop a trailing '#' comment
+
+ ignore_end_spaces(left);
+ ignore_end_spaces(right);
+
+ GET_PARAMS(super_block_size)
+ GET_PARAMS(max_partition_size)
+ GET_PARAMS(min_partition_size)
+ GET_PARAMS(disable_ab_partition_type)
+ GET_PARAMS(disable_rect_partition_type)
+ GET_PARAMS(disable_1to4_partition_type)
+ GET_PARAMS(disable_flip_idtx)
+ GET_PARAMS(disable_cdef)
+ GET_PARAMS(disable_lr)
+ GET_PARAMS(disable_obmc)
+ GET_PARAMS(disable_warp_motion)
+ GET_PARAMS(disable_global_motion)
+ GET_PARAMS(disable_dist_wtd_comp)
+ GET_PARAMS(disable_diff_wtd_comp)
+ GET_PARAMS(disable_inter_intra_comp)
+ GET_PARAMS(disable_masked_comp)
+ GET_PARAMS(disable_one_sided_comp)
+ GET_PARAMS(disable_palette)
+ GET_PARAMS(disable_intrabc)
+ GET_PARAMS(disable_cfl)
+ GET_PARAMS(disable_smooth_intra)
+ GET_PARAMS(disable_filter_intra)
+ GET_PARAMS(disable_dual_filter)
+ GET_PARAMS(disable_intra_angle_delta)
+ GET_PARAMS(disable_intra_edge_filter)
+ GET_PARAMS(disable_tx_64x64)
+ GET_PARAMS(disable_smooth_inter_intra)
+ GET_PARAMS(disable_inter_inter_wedge)
+ GET_PARAMS(disable_inter_intra_wedge)
+ GET_PARAMS(disable_paeth_intra)
+ GET_PARAMS(disable_trellis_quant)
+ GET_PARAMS(disable_ref_frame_mv)
+ GET_PARAMS(reduced_reference_set)
+ GET_PARAMS(reduced_tx_type_set)
+
+ fprintf(stderr, "\nInvalid parameter: %s", left);  // no GET_PARAMS matched
+ exit(-1);
+ }
+
+ if (config->super_block_size != 128 && config->super_block_size != 64) {
+ fprintf(stderr, "\n%s", kSbSizeWarningString);
+ exit(-1);
+ }
+ if (config->min_partition_size > config->max_partition_size) {
+ fprintf(stderr, "\n%s", kMinpartWarningString);
+ exit(-1);
+ }
+ if (config->max_partition_size > config->super_block_size) {
+ fprintf(stderr, "\n%s", kMaxpartWarningString);
+ exit(-1);
+ }
+
+ fclose(f);
+ config->init_by_cfg_file = 1;  // records that config came from a file
+
+ return 0;  // success
+}
+
+// Forwards to arg_match_helper() and dies with its message when one is set.
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const int matched = arg_match_helper(arg_, def, argv, message);
+  if (message[0] == '\0') return matched;
+  die("%s", message);
+  return matched;
+}
+
+// Steps arg->argv forward by argv_step unless it already points at NULL.
+const char *arg_next(struct arg *arg) {
+  if (*arg->argv != NULL) arg->argv = arg->argv + arg->argv_step;
+  return arg->argv[0];
+}
+
+char **argv_dup(int argc, const char **argv) {
+  // Room for argc pointers plus a terminating NULL; the strings are shared.
+  char **copy = malloc((argc + 1) * sizeof(*argv));
+  if (copy != NULL) {
+    memcpy(copy, argv, argc * sizeof(*argv));
+    copy[argc] = NULL;
+  }
+  return copy;
+}
+
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
+ for (; *defs; defs++) {  // defs is walked until its NULL entry
+ const struct arg_def *def = *defs;
+ char *short_val = def->has_val ? " <arg>" : "";
+ char *long_val = def->has_val ? "=<arg>" : "";
+ int n = 0;  // number of characters printed for the option column
+
+ // Short options are indented with two spaces. Long options are indented
+ // with 12 spaces.
+ if (def->short_name && def->long_name) {
+ char *comma = def->has_val ? "," : ", ";
+
+ n = fprintf(fp, " -%s%s%s --%s%s", def->short_name, short_val, comma,
+ def->long_name, long_val);
+ } else if (def->short_name)
+ n = fprintf(fp, " -%s%s", def->short_name, short_val);
+ else if (def->long_name)
+ n = fprintf(fp, " --%s%s", def->long_name, long_val);
+
+ // Descriptions are indented with 40 spaces. If an option is 40 characters
+ // or longer, its description starts on the next line.
+ if (n < 40)
+ for (int i = 0; i < 40 - n; i++) fputc(' ', fp);
+ else
+ fputs("\n ", fp);
+ fprintf(fp, "%s\n", def->desc);
+
+ if (def->enums) {
+ const struct arg_enum_list *listptr;
+
+ fprintf(fp, " %-37s\t ", "");
+
+ // Print the enum names comma separated; the last one ends the line.
+ for (listptr = def->enums; listptr->name; listptr++)
+ fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n");
+ }
+ }
+}
+
+// Forwards to arg_parse_uint_helper() and dies with its message when one is set.
+unsigned int arg_parse_uint(const struct arg *arg) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const unsigned int value = arg_parse_uint_helper(arg, message);
+  if (message[0] == '\0') return value;
+  die("%s", message);
+  return value;
+}
+
+// Forwards to arg_parse_int_helper() and dies with its message when one is set.
+int arg_parse_int(const struct arg *arg) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const int value = arg_parse_int_helper(arg, message);
+  if (message[0] == '\0') return value;
+  die("%s", message);
+  return value;
+}
+
+// Forwards to arg_parse_rational_helper(); dies with its message when set.
+struct aom_rational arg_parse_rational(const struct arg *arg) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const struct aom_rational value = arg_parse_rational_helper(arg, message);
+  if (message[0] == '\0') return value;
+  die("%s", message);
+  return value;
+}
+
+// Forwards to arg_parse_enum_helper() and dies with its message when one is set.
+int arg_parse_enum(const struct arg *arg) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const int value = arg_parse_enum_helper(arg, message);
+  if (message[0] == '\0') return value;
+  die("%s", message);
+  return value;
+}
+
+// Forwards to arg_parse_enum_or_int_helper(); dies with its message when set.
+int arg_parse_enum_or_int(const struct arg *arg) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+  const int value = arg_parse_enum_or_int_helper(arg, message);
+  if (message[0] == '\0') return value;
+  die("%s", message);
+  return value;
+}
+
+// Parses a comma separated list of at most n integers into list and returns
+// the number of elements in the list; dies when the helper sets a message.
+int arg_parse_list(const struct arg *arg, int *list, int n) {
+  char message[ARG_ERR_MSG_MAX_LEN];
+
+  const int count = arg_parse_list_helper(arg, list, n, message);
+  if (message[0] == '\0') return count;
+  die("%s", message);
+  return count;
+}
diff --git a/third_party/aom/common/args.h b/third_party/aom/common/args.h
new file mode 100644
index 0000000000..1c5c437632
--- /dev/null
+++ b/third_party/aom/common/args.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
+#include <stdio.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "common/args_helper.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+int parse_cfg(const char *file, cfg_options_t *config);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+// The arg_parse_*() functions below wrap the matching arg_parse_*_helper()
+// functions declared in common/args_helper.h. When the helper reports an
+// error message, the wrapper passes that message to die(); otherwise the
+// parsed value is returned.
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct aom_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
+// Parses a comma separated list of at most n integers into 'list' and returns
+// the number of elements in the list.
+int arg_parse_list(const struct arg *arg, int *list, int n);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_ARGS_H_
diff --git a/third_party/aom/common/args_helper.c b/third_party/aom/common/args_helper.c
new file mode 100644
index 0000000000..2201868335
--- /dev/null
+++ b/third_party/aom/common/args_helper.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "common/args_helper.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#define SET_ERR_STRING(...) \
+ if (err_msg) snprintf(err_msg, ARG_ERR_MSG_MAX_LEN, __VA_ARGS__)
+
+// Returns a fresh 'struct arg' positioned at argv: no name, value or
+// definition yet, and an argv step of 1.
+struct arg arg_init(char **argv) {
+  struct arg fresh;
+
+  fresh.def = NULL;
+  fresh.val = NULL;
+  fresh.name = NULL;
+  fresh.argv_step = 1;
+  fresh.argv = argv;
+  return fresh;
+}
+
+// Tests whether argv[0] is the option described by 'def', either in short
+// form ("-s", value taken from argv[1]) or long form ("--long" or
+// "--long=value").
+//
+// On a match, *arg_ is filled in and 1 is returned. Returns 0 when argv[0]
+// is not this option, or when a value is present/absent contrary to
+// def->has_val (in which case a message is written to err_msg if non-NULL).
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+                     char *err_msg) {
+  struct arg match;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  assert(def->has_val == 0 || def->has_val == 1 || def->has_val == -1);
+
+  // Only tokens that start with '-' can be options.
+  if (argv[0] == NULL || argv[0][0] != '-') return 0;
+
+  match = arg_init(argv);
+
+  if (def->short_name != NULL && strcmp(argv[0] + 1, def->short_name) == 0) {
+    // Short form: the value, if any, is the next argv entry.
+    match.name = argv[0] + 1;
+    if (def->has_val) {
+      match.val = argv[1];
+      match.argv_step = 2;
+    } else {
+      match.val = NULL;
+      match.argv_step = 1;
+    }
+  } else if (def->long_name != NULL) {
+    const size_t len = strlen(def->long_name);
+
+    if (argv[0][1] == '-' && strncmp(argv[0] + 2, def->long_name, len) == 0) {
+      // The name must be followed by '=' (inline value) or end the token.
+      const char terminator = argv[0][len + 2];
+
+      if (terminator == '=' || terminator == '\0') {
+        match.name = argv[0] + 2;
+        match.val = (terminator == '=') ? match.name + len + 1 : NULL;
+        match.argv_step = 1;
+      }
+    }
+  }
+
+  if (match.name == NULL) return 0;
+
+  // has_val == -1 means the value is optional, so nothing to validate.
+  if (def->has_val != -1) {
+    if (match.val == NULL && def->has_val) {
+      SET_ERR_STRING("Error: option %s requires argument.\n", match.name);
+      return 0;
+    }
+
+    if (match.val != NULL && !def->has_val) {
+      SET_ERR_STRING("Error: option %s requires no argument.\n", match.name);
+      return 0;
+    }
+  }
+
+  match.def = def;
+  *arg_ = match;
+  return 1;
+}
+
+// Parses arg->val as a base-10 unsigned int.
+//
+// Returns the parsed value on success and leaves err_msg (if non-NULL) empty.
+// Returns 0 and writes a message to err_msg when the string is empty, has
+// trailing characters, is negative, or does not fit in an unsigned int.
+unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg) {
+  char *endptr;
+  const unsigned long rawval = strtoul(arg->val, &endptr, 10);  // NOLINT
+
+  if (err_msg) err_msg[0] = '\0';
+
+  if (arg->val[0] != '\0' && endptr[0] == '\0') {
+    // strtoul() accepts a leading '-' and returns the negated value converted
+    // to unsigned long, which would pass the range check below when
+    // unsigned long is no wider than unsigned int. Since the whole string was
+    // consumed, a '-' can only be that sign. "-0" is still accepted as 0.
+    if (rawval != 0 && strchr(arg->val, '-') != NULL) {
+      SET_ERR_STRING("Option %s: Value %s out of range for unsigned int\n",
+                     arg->name, arg->val);
+      return 0;
+    }
+    if (rawval <= UINT_MAX) return (unsigned int)rawval;
+    SET_ERR_STRING("Option %s: Value %lu out of range for unsigned int\n",
+                   arg->name, rawval);
+    return 0;
+  }
+  SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+  return 0;
+}
+
+// Parses arg->val as a base-10 signed int.
+//
+// Returns the parsed value on success and leaves err_msg (if non-NULL) empty.
+// Returns 0 and writes a message to err_msg when the string is empty, has
+// trailing characters, or the value does not fit in an int.
+int arg_parse_int_helper(const struct arg *arg, char *err_msg) {
+  char *tail;
+  const long parsed = strtol(arg->val, &tail, 10);  // NOLINT
+  int fully_consumed;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  fully_consumed = arg->val[0] != '\0' && *tail == '\0';
+  if (!fully_consumed) {
+    SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *tail);
+    return 0;
+  }
+
+  if (parsed < INT_MIN || parsed > INT_MAX) {
+    SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                   arg->name, parsed);
+    return 0;
+  }
+  return (int)parsed;
+}
+
+// Parses arg->val as a rational of the form "<num>/<den>", both base-10
+// signed ints.
+//
+// On success returns the rational and leaves err_msg (if non-NULL) empty. On
+// failure a message is written to err_msg and the partially filled rational
+// is returned (it starts out as 0/1). A missing numerator ("/5") or missing
+// denominator ("30/") is reported as an error instead of being read as 0.
+struct aom_rational arg_parse_rational_helper(const struct arg *arg,
+                                              char *err_msg) {
+  long rawval;  // NOLINT
+  char *endptr;
+  const char *den_str;
+  struct aom_rational rat = { 0, 1 };
+
+  if (err_msg) err_msg[0] = '\0';
+
+  /* parse numerator */
+  rawval = strtol(arg->val, &endptr, 10);
+
+  if (arg->val[0] != '\0' && endptr[0] == '/') {
+    // strtol() leaves endptr at the start when it found no digits.
+    if (endptr == arg->val) {
+      SET_ERR_STRING("Option %s: Missing numerator before '/'\n", arg->name);
+      return rat;
+    }
+    if (rawval >= INT_MIN && rawval <= INT_MAX) {
+      rat.num = (int)rawval;
+    } else {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, rawval);
+      return rat;
+    }
+  } else {
+    SET_ERR_STRING("Option %s: Expected / at '%c'\n", arg->name, *endptr);
+    return rat;
+  }
+
+  /* parse denominator */
+  den_str = endptr + 1;
+  rawval = strtol(den_str, &endptr, 10);
+
+  if (endptr[0] == '\0') {
+    if (endptr == den_str) {
+      SET_ERR_STRING("Option %s: Missing denominator after '/'\n", arg->name);
+      return rat;
+    }
+    if (rawval >= INT_MIN && rawval <= INT_MAX) {
+      rat.den = (int)rawval;
+    } else {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, rawval);
+      return rat;
+    }
+  } else {
+    SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+    return rat;
+  }
+
+  return rat;
+}
+
+// Resolves arg->val against the enum list of arg->def. A value that parses
+// completely as a base-10 number is accepted if it equals some entry's val;
+// otherwise the string is compared against the entry names.
+//
+// Returns the matching enum value, or 0 with a message in err_msg (if
+// non-NULL) when nothing matches.
+int arg_parse_enum_helper(const struct arg *arg, char *err_msg) {
+  const struct arg_enum_list *entry;
+  char *tail;
+  long numeric;  // NOLINT
+
+  if (err_msg) err_msg[0] = '\0';
+
+  /* Numeric form first: the number must be one of the listed values. */
+  numeric = strtol(arg->val, &tail, 10);
+  if (arg->val[0] != '\0' && *tail == '\0') {
+    for (entry = arg->def->enums; entry->name != NULL; ++entry) {
+      if (entry->val == numeric) return (int)numeric;
+    }
+  }
+
+  /* Then the symbolic form. */
+  for (entry = arg->def->enums; entry->name != NULL; ++entry) {
+    if (strcmp(arg->val, entry->name) == 0) return entry->val;
+  }
+
+  SET_ERR_STRING("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+  return 0;
+}
+
+// Dispatches to the enum parser when the option's definition carries an enum
+// list, and to the plain int parser otherwise.
+int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg) {
+  return arg->def->enums != NULL ? arg_parse_enum_helper(arg, err_msg)
+                                 : arg_parse_int_helper(arg, err_msg);
+}
+
+// Parses arg->val as a comma separated list of at most n base-10 integers
+// and stores them in 'list'.
+//
+// Returns the number of elements stored. On error (value out of int range,
+// more than n entries, or an unexpected separator) returns 0 and writes a
+// message to err_msg if it is non-NULL.
+int arg_parse_list_helper(const struct arg *arg, int *list, int n,
+                          char *err_msg) {
+  const char *cursor = arg->val;
+  int count = 0;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  while (*cursor != '\0') {
+    char *stop;
+    const long value = strtol(cursor, &stop, 10);  // NOLINT
+
+    if (value < INT_MIN || value > INT_MAX) {
+      SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n",
+                     arg->name, value);
+      return 0;
+    }
+    if (count >= n) {
+      SET_ERR_STRING("Option %s: List has more than %d entries\n", arg->name,
+                     n);
+      return 0;
+    }
+
+    // After the number we expect either a ',' (skipped) or the end of input.
+    if (*stop == ',') {
+      ++stop;
+    } else if (*stop != '\0') {
+      SET_ERR_STRING("Option %s: Bad list separator '%c'\n", arg->name,
+                     *stop);
+      return 0;
+    }
+
+    list[count++] = (int)value;
+    cursor = stop;
+  }
+  return count;
+}
diff --git a/third_party/aom/common/args_helper.h b/third_party/aom/common/args_helper.h
new file mode 100644
index 0000000000..c86a6128d3
--- /dev/null
+++ b/third_party/aom/common/args_helper.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_HELPER_H_
+#define AOM_COMMON_ARGS_HELPER_H_
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Maximum length of the error messages for the helper functions.
+#define ARG_ERR_MSG_MAX_LEN 200
+
+// State describing one command-line option as matched by arg_match_helper().
+struct arg {
+ char **argv; // Position in the argument vector that was examined.
+ const char *name; // Points into argv[0] just past the leading dash(es).
+ const char *val; // Value string for the option, or NULL when there is none.
+ unsigned int argv_step; // 2 when a short option took argv[1] as its value, else 1.
+ const struct arg_def *def; // Definition that matched; NULL after arg_init().
+};
+
+// One entry of an enum table: a symbolic name and the integer it stands for.
+// Tables are arrays terminated by an entry whose name is NULL.
+struct arg_enum_list {
+ const char *name;
+ int val;
+};
+// Zero-initialized terminator entry for an arg_enum_list table.
+#define ARG_ENUM_LIST_END \
+ { 0 }
+
+// Static description of one command-line option.
+typedef struct arg_def {
+ const char *short_name; // Matched against "-<short_name>"; may be NULL.
+ const char *long_name; // Matched against "--<long_name>[=value]"; may be NULL.
+ int has_val; // 0: The argument must not have a value.
+ // 1: The argument must have a value.
+ // -1: The argument may or may not have a value.
+ const char *desc; // Description text for the option.
+ const struct arg_enum_list *enums; // Allowed enum values, or NULL.
+} arg_def_t;
+// Initializer for an option without an enum list.
+#define ARG_DEF(s, l, v, d) \
+ { s, l, v, d, NULL }
+// Initializer for an option whose value is checked against enum list 'e'.
+#define ARG_DEF_ENUM(s, l, v, d, e) \
+ { s, l, v, d, e }
+// Zero-initialized entry; presumably terminates arrays of arg_def -- confirm
+// against callers.
+#define ARG_DEF_LIST_END \
+ { 0 }
+
+// Returns a 'struct arg' pointing at argv with argv_step set to 1 and the
+// name, val and def fields set to NULL.
+struct arg arg_init(char **argv);
+
+/*
+ * The helper functions below all take an optional parameter err_msg for
+ * error reporting. When err_msg is not NULL (must point to a buffer
+ * which is at least ARG_ERR_MSG_MAX_LEN bytes long), a related error message is
+ * stored in it if an error occurs. It will be set to an empty string if no
+ * error occurs.
+ */
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+ char *err_msg);
+unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg);
+int arg_parse_int_helper(const struct arg *arg, char *err_msg);
+struct aom_rational arg_parse_rational_helper(const struct arg *arg,
+ char *err_msg);
+int arg_parse_enum_helper(const struct arg *arg, char *err_msg);
+int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg);
+int arg_parse_list_helper(const struct arg *arg, int *list, int n,
+ char *err_msg);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_ARGS_HELPER_H_
diff --git a/third_party/aom/common/av1_config.c b/third_party/aom/common/av1_config.c
new file mode 100644
index 0000000000..9f5b02015b
--- /dev/null
+++ b/third_party/aom/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
+//
+// Both read macros declare a new local 'int field' at the point of use, so
+// each field name can be used only once per scope. They also expect two names
+// to be in scope where they are expanded: 'reader' (a pointer to the
+// aom_read_bit_buffer being read) and 'result' (an int that is tested against
+// -1 after the read). On error they print a message to stderr and make the
+// enclosing function return -1.
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_bit(reader); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_literal(reader, (length)); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Could not read bits for " #field \
+ ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+// Helper macros for setting/restoring the error handler data in
+// aom_read_bit_buffer.
+//
+// PUSH declares a local 'original_error_handler_data', saves the current
+// reader->error_handler_data in it and points the reader at 'new_data'. POP
+// restores the saved pointer and therefore must be used in the same scope as
+// the matching PUSH. Both expect 'reader' to be in scope.
+#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \
+ void *original_error_handler_data = NULL; \
+ do { \
+ original_error_handler_data = reader->error_handler_data; \
+ reader->error_handler_data = &new_data; \
+ } while (0)
+
+#define AV1C_POP_ERROR_HANDLER_DATA() \
+ do { \
+ reader->error_handler_data = original_error_handler_data; \
+ } while (0)
+
+// Minimum buffer size, in bytes, accepted by read_av1config() and
+// write_av1config(); the fields they read/write add up to 32 bits.
+static const size_t kAv1cSize = 4;
+
+static void bitreader_error_handler(void *data) {
+ int *error_val = (int *)data;
+ *error_val = -1;
+}
+
+// Parse the AV1 timing_info() structure:
+// timing_info( ) {
+// num_units_in_display_tick f(32)
+// time_scale f(32)
+// equal_picture_interval f(1)
+// if (equal_picture_interval)
+// num_ticks_per_picture_minus_1 uvlc()
+// }
+//
+// The values are read only to advance the reader; none of them is stored.
+// Returns 0 on success and -1 when a read fails.
+static int parse_timing_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ // Route read errors from 'reader' into this function's 'result'.
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval);
+ if (equal_picture_interval) {
+ uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader);
+ if (result == -1) {
+ fprintf(stderr,
+ "av1c: Could not read bits for "
+ "num_ticks_per_picture_minus_1, value=%u.\n",
+ num_ticks_per_picture_minus_1);
+ return result;
+ }
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 decoder_model_info() structure:
+// decoder_model_info( ) {
+// buffer_delay_length_minus_1 f(5)
+// num_units_in_decoding_tick f(32)
+// buffer_removal_time_length_minus_1 f(5)
+// frame_presentation_time_length_minus_1 f(5)
+// }
+//
+// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1.
+// Only buffer_delay_length_minus_1 is used; the other fields are read just to
+// advance the reader.
+static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ // Route read errors from 'reader' into this function's 'result'.
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return buffer_delay_length_minus_1 + 1;
+}
+
+// Parse the AV1 operating_parameters_info() structure:
+// operating_parameters_info( op ) {
+// n = buffer_delay_length_minus_1 + 1
+// decoder_buffer_delay[ op ] f(n)
+// encoder_buffer_delay[ op ] f(n)
+// low_delay_mode_flag[ op ] f(1)
+// }
+//
+// 'buffer_delay_length_minus_1' is the raw "minus 1" value: this function
+// adds 1 itself to obtain the field width n, so callers must not pass an
+// already incremented length. The values read are not stored.
+// Returns 0 on success and -1 when a read fails.
+static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader,
+ int buffer_delay_length_minus_1) {
+ int result = 0;
+ // Route read errors from 'reader' into this function's 'result'.
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ const int buffer_delay_length = buffer_delay_length_minus_1 + 1;
+ AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 color_config() structure. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44
+//
+// Reads config->seq_profile (which must already be set) and fills in the
+// high_bitdepth, twelve_bit, monochrome, chroma_subsampling_x/y and
+// chroma_sample_position fields of 'config'. Fields that are not present in
+// the bitstream are left at whatever value 'config' held on entry.
+// Returns 0 on success and -1 when a read fails.
+static int parse_color_config(struct aom_read_bit_buffer *reader,
+ Av1Config *config) {
+ int result = 0;
+ // Route read errors from 'reader' into this function's 'result'.
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ // The twelve_bit flag is only coded for profile 2 with high bitdepth.
+ int bit_depth = 0;
+ if (config->seq_profile == 2 && config->high_bitdepth) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+ bit_depth = config->twelve_bit ? 12 : 10;
+ } else {
+ bit_depth = config->high_bitdepth ? 10 : 8;
+ }
+
+ // mono_chrome is not coded for profile 1.
+ if (config->seq_profile != 1) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome);
+ config->monochrome = mono_chrome;
+ }
+
+ int color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag);
+ if (color_description_present_flag) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8);
+ color_primaries = color_primaries_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8);
+ transfer_characteristics = transfer_characteristics_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8);
+ matrix_coefficients = matrix_coefficients_val;
+ }
+
+ if (config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (color_primaries == AOM_CICP_CP_BT_709 &&
+ transfer_characteristics == AOM_CICP_TC_SRGB &&
+ matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ // BT.709 primaries + sRGB transfer + identity matrix: no subsampling and
+ // no color_range bit is read.
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ if (config->seq_profile == 0) {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (config->seq_profile == 1) {
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ // Remaining profiles: subsampling is coded only at 12 bits.
+ if (bit_depth == 12) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x);
+ config->chroma_subsampling_x = subsampling_x;
+ if (subsampling_x) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y);
+ config->chroma_subsampling_y = subsampling_y;
+ } else {
+ config->chroma_subsampling_y = 0;
+ }
+ } else {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 0;
+ }
+ }
+
+ if (config->chroma_subsampling_x && config->chroma_subsampling_y) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+ }
+ }
+
+ // separate_uv_delta_q is read only to advance the reader.
+ if (!config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q);
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse AV1 Sequence Header OBU. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41
+//
+// 'buffer'/'length' describe the sequence header payload (after the OBU
+// header). Fills in seq_profile, seq_level_idx_0 and seq_tier_0 plus the
+// fields set by parse_color_config(). The initial presentation delay fields
+// of 'config' are always left at 0 because this function has no access to
+// the data needed to set them properly. Most other syntax elements are read
+// only to keep the reader positioned correctly.
+//
+// Returns 0 on success and -1 when a read or a nested parse fails.
+static int parse_sequence_header(const uint8_t *const buffer, size_t length,
+                                 Av1Config *config) {
+  int result = 0;
+  // The reader instance is local to this function, but a pointer to the
+  // reader instance is used within this function and throughout this file to
+  // allow use of the helper macros that reduce parse error checking verbosity.
+  struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0,
+                                                 &result,
+                                                 bitreader_error_handler };
+  struct aom_read_bit_buffer *reader = &reader_instance;
+
+  AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+  config->seq_profile = seq_profile;
+  AV1C_READ_BIT_OR_RETURN_ERROR(still_picture);
+  AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header);
+  if (reduced_still_picture_header) {
+    config->initial_presentation_delay_present = 0;
+    AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+    config->seq_level_idx_0 = seq_level_idx_0;
+    config->seq_tier_0 = 0;
+  } else {
+    int has_decoder_model = 0;
+    int buffer_delay_length = 0;
+
+    AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag);
+    if (timing_info_present_flag) {
+      if (parse_timing_info(reader) != 0) return -1;
+
+      AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag);
+      if (decoder_model_info_present_flag &&
+          (buffer_delay_length = parse_decoder_model_info(reader)) == -1) {
+        return -1;
+      }
+      // The per-operating-point decoder model syntax is present only when
+      // decoder_model_info() itself was signalled, not merely because timing
+      // info is present.
+      has_decoder_model = decoder_model_info_present_flag;
+    }
+
+    AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+    config->initial_presentation_delay_present =
+        initial_presentation_delay_present;
+
+    AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5);
+    const int num_operating_points = operating_points_cnt_minus_1 + 1;
+
+    for (int op_index = 0; op_index < num_operating_points; ++op_index) {
+      AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12);
+      AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5);
+
+      int seq_tier = 0;
+      if (seq_level_idx > 7) {
+        AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op);
+        seq_tier = seq_tier_this_op;
+      }
+
+      if (has_decoder_model) {
+        AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op);
+        if (decoder_model_present_for_op) {
+          // parse_decoder_model_info() returned buffer_delay_length_minus_1
+          // + 1, while parse_operating_parameters_info() expects the
+          // "minus 1" value and adds 1 itself.
+          if (parse_operating_parameters_info(reader,
+                                              buffer_delay_length - 1) == -1) {
+            return -1;
+          }
+        }
+      }
+
+      // Test the flag read from the bitstream rather than the config field:
+      // the config field is cleared while handling operating point 0, but the
+      // per-op delay bits are still present for the remaining points.
+      if (initial_presentation_delay_present) {
+        // Skip the initial presentation delay bits if present since this
+        // function has no access to the data required to properly set the
+        // field.
+        AV1C_READ_BIT_OR_RETURN_ERROR(
+            initial_presentation_delay_present_for_this_op);
+        if (initial_presentation_delay_present_for_this_op) {
+          AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4);
+        }
+      }
+
+      if (op_index == 0) {
+        // Av1Config needs only the values from the first operating point.
+        config->seq_level_idx_0 = seq_level_idx;
+        config->seq_tier_0 = seq_tier;
+        config->initial_presentation_delay_present = 0;
+        config->initial_presentation_delay_minus_one = 0;
+      }
+    }
+  }
+
+  AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4);
+  AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4);
+  AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1,
+                                 frame_width_bits_minus_1 + 1);
+  AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1,
+                                 frame_height_bits_minus_1 + 1);
+
+  uint8_t frame_id_numbers_present = 0;
+  if (!reduced_still_picture_header) {
+    AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag);
+    frame_id_numbers_present = frame_id_numbers_present_flag;
+  }
+
+  if (frame_id_numbers_present) {
+    AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4);
+    AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3);
+  }
+
+  AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock);
+  AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra);
+  AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter);
+
+  if (!reduced_still_picture_header) {
+    AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound);
+    AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound);
+    AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion);
+    AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter);
+
+    AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint);
+    if (enable_order_hint) {
+      AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp);
+      AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs);
+    }
+
+    const int SELECT_SCREEN_CONTENT_TOOLS = 2;
+    int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS;
+    AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools);
+    if (!seq_choose_screen_content_tools) {
+      AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val);
+      seq_force_screen_content_tools = seq_force_screen_content_tools_val;
+    }
+
+    if (seq_force_screen_content_tools > 0) {
+      AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv);
+
+      if (!seq_choose_integer_mv) {
+        AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv);
+      }
+    }
+
+    if (enable_order_hint) {
+      AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3);
+    }
+  }
+
+  AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres);
+  AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef);
+  AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration);
+
+  if (parse_color_config(reader, config) != 0) {
+    fprintf(stderr, "av1c: color_config() parse failed.\n");
+    return -1;
+  }
+
+  AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present);
+  return 0;
+}
+
+// Parses the Sequence Header OBU at the start of 'buffer' and fills in
+// 'config' (marker and version are set to 1). Returns 0 on success and -1
+// for NULL/empty input, when the first OBU cannot be read or is not a
+// sequence header, when its size exceeds 'length', or when parsing fails.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+                           Av1Config *config) {
+  if (buffer == NULL || length == 0 || config == NULL) return -1;
+
+  ObuHeader header;
+  size_t payload_size = 0;
+  size_t header_size = 0;
+  memset(&header, 0, sizeof(header));
+
+  if (aom_read_obu_header_and_size(buffer, length, is_annexb, &header,
+                                   &payload_size,
+                                   &header_size) != AOM_CODEC_OK) {
+    return -1;
+  }
+  if (header.type != OBU_SEQUENCE_HEADER) return -1;
+  // The whole OBU (header + payload) must fit in the caller's buffer.
+  if (payload_size + header_size > length) return -1;
+
+  memset(config, 0, sizeof(*config));
+  config->marker = 1;
+  config->version = 1;
+  return parse_sequence_header(buffer + header_size, payload_size, config);
+}
+
+// Reads the fixed 4-byte AV1 config from 'buffer' into 'config' and stores
+// the number of bytes consumed in '*bytes_read'. Returns 0 on success and -1
+// for NULL arguments, a buffer shorter than kAv1cSize, or a read error (in
+// which case '*bytes_read' stays 0 and 'config' may be partially filled).
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config) {
+ if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1;
+
+ *bytes_read = 0;
+
+ // 'result' is set to -1 by bitreader_error_handler when a read fails; the
+ // AV1C_READ_* macros below check it after every read.
+ int result = 0;
+ struct aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length,
+ 0, &result,
+ bitreader_error_handler };
+ struct aom_read_bit_buffer *reader = &reader_instance;
+
+ memset(config, 0, sizeof(*config));
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(marker);
+ config->marker = marker;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(version, 7);
+ config->version = version;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+ config->seq_profile = seq_profile;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+ config->seq_level_idx_0 = seq_level_idx_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0);
+ config->seq_tier_0 = seq_tier_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(monochrome);
+ config->monochrome = monochrome;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x);
+ config->chroma_subsampling_x = chroma_subsampling_x;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y);
+ config->chroma_subsampling_y = chroma_subsampling_y;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+
+ // Reserved bits are consumed but their value is not checked.
+ AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+ config->initial_presentation_delay_present =
+ initial_presentation_delay_present;
+
+ // Stored as read, even when initial_presentation_delay_present is 0.
+ AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4);
+ config->initial_presentation_delay_minus_one =
+ initial_presentation_delay_minus_one;
+
+ *bytes_read = aom_rb_bytes_read(reader);
+
+ return 0;
+}
+
+// Serializes 'config' into the first kAv1cSize bytes of 'buffer' and stores
+// the number of bytes produced in '*bytes_written'. Returns 0 on success and
+// -1 for NULL arguments or when 'capacity' is smaller than kAv1cSize.
+int write_av1config(const Av1Config *config, size_t capacity,
+                    size_t *bytes_written, uint8_t *buffer) {
+  if (config == NULL || buffer == NULL || bytes_written == NULL ||
+      capacity < kAv1cSize) {
+    return -1;
+  }
+
+  *bytes_written = 0;
+  memset(buffer, 0, kAv1cSize);
+
+  struct aom_write_bit_buffer wb = { buffer, 0 };
+
+  aom_wb_write_bit(&wb, config->marker);
+  aom_wb_write_literal(&wb, config->version, 7);
+  aom_wb_write_literal(&wb, config->seq_profile, 3);
+  aom_wb_write_literal(&wb, config->seq_level_idx_0, 5);
+  aom_wb_write_bit(&wb, config->seq_tier_0);
+  aom_wb_write_bit(&wb, config->high_bitdepth);
+  aom_wb_write_bit(&wb, config->twelve_bit);
+  aom_wb_write_bit(&wb, config->monochrome);
+  aom_wb_write_bit(&wb, config->chroma_subsampling_x);
+  aom_wb_write_bit(&wb, config->chroma_subsampling_y);
+  aom_wb_write_literal(&wb, config->chroma_sample_position, 2);
+  aom_wb_write_literal(&wb, 0, 3);  // reserved
+  aom_wb_write_bit(&wb, config->initial_presentation_delay_present);
+
+  // The last four bits carry the delay when it is present and are reserved
+  // (zero) otherwise.
+  const int delay_bits = config->initial_presentation_delay_present
+                             ? config->initial_presentation_delay_minus_one
+                             : 0;
+  aom_wb_write_literal(&wb, delay_bits, 4);
+
+  *bytes_written = aom_wb_bytes_written(&wb);
+  return 0;
+}
+
+#undef AV1C_READ_BIT_OR_RETURN_ERROR
+#undef AV1C_READ_BITS_OR_RETURN_ERROR
+#undef AV1C_PUSH_ERROR_HANDLER_DATA
+#undef AV1C_POP_ERROR_HANDLER_DATA
diff --git a/third_party/aom/common/av1_config.h b/third_party/aom/common/av1_config.h
new file mode 100644
index 0000000000..a15bedb305
--- /dev/null
+++ b/third_party/aom/common/av1_config.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_AV1_CONFIG_H_
+#define AOM_COMMON_AV1_CONFIG_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Struct representing ISOBMFF/Matroska AV1 config. See:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax
+//
+// The AV1 config has the following format:
+//
+// unsigned int (1) marker = 1;
+// unsigned int (7) version = 1;
+// unsigned int (3) seq_profile;
+// unsigned int (5) seq_level_idx_0;
+// unsigned int (1) seq_tier_0;
+// unsigned int (1) high_bitdepth;
+// unsigned int (1) twelve_bit;
+// unsigned int (1) monochrome;
+// unsigned int (1) chroma_subsampling_x;
+// unsigned int (1) chroma_subsampling_y;
+// unsigned int (2) chroma_sample_position;
+// unsigned int (3) reserved = 0;
+//
+// unsigned int (1) initial_presentation_delay_present;
+// if (initial_presentation_delay_present) {
+// unsigned int (4) initial_presentation_delay_minus_one;
+// } else {
+// unsigned int (4) reserved = 0;
+// }
+//
+// unsigned int (8)[] configOBUs;
+//
+// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so
+// the field is omitted.
+// Each member holds one field of the AV1 config in its own byte; the widths
+// noted below are the number of bits the field occupies when serialized by
+// write_av1config().
+typedef struct _Av1Config {
+ uint8_t marker; // 1 bit
+ uint8_t version; // 7 bits
+ uint8_t seq_profile; // 3 bits
+ uint8_t seq_level_idx_0; // 5 bits
+ uint8_t seq_tier_0; // 1 bit
+ uint8_t high_bitdepth; // 1 bit
+ uint8_t twelve_bit; // 1 bit
+ uint8_t monochrome; // 1 bit
+ uint8_t chroma_subsampling_x; // 1 bit
+ uint8_t chroma_subsampling_y; // 1 bit
+ uint8_t chroma_sample_position; // 2 bits
+ uint8_t initial_presentation_delay_present; // 1 bit
+ uint8_t initial_presentation_delay_minus_one; // 4 bits
+} Av1Config;
+
+// Attempts to parse a Sequence Header OBU and set the parameters of 'config'.
+// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple
+// OBUs, but the Sequence Header OBU must be the first OBU within the buffer.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+ Av1Config *config);
+
+// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success.
+// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or
+// when parsing of 'buffer' fails.
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config);
+
+// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'.
+// Returns -1 when passed NULL pointers or when 'capacity' insufficient.
+int write_av1config(const Av1Config *config, size_t capacity,
+ size_t *bytes_written, uint8_t *buffer);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_AV1_CONFIG_H_
diff --git a/third_party/aom/common/ivf_dec.cmake b/third_party/aom/common/ivf_dec.cmake
new file mode 100644
index 0000000000..fedeea7940
--- /dev/null
+++ b/third_party/aom/common/ivf_dec.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_COMMON_IVF_DEC_CMAKE_)
+ return()
+endif() # AOM_COMMON_IVF_DEC_CMAKE_
+set(AOM_COMMON_IVF_DEC_CMAKE_ 1)
+
+list(APPEND IVF_DEC_SOURCES "${AOM_ROOT}/common/ivfdec.c"
+ "${AOM_ROOT}/common/ivfdec.h")
+
+# Creates the ivf_dec object library target from IVF_DEC_SOURCES, appends it to
+# AOM_LIB_TARGETS in the parent scope, and adds its objects to the aom target
+# (and to aom_static when BUILD_SHARED_LIBS is set). The aom target must exist
+# before this function is called.
+function(setup_ivf_dec_targets)
+ add_library(ivf_dec OBJECT ${IVF_DEC_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} ivf_dec PARENT_SCOPE)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+ endif()
+endfunction()
diff --git a/third_party/aom/common/ivfdec.c b/third_party/aom/common/ivfdec.c
new file mode 100644
index 0000000000..6e714d1cfe
--- /dev/null
+++ b/third_party/aom/common/ivfdec.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/sanitizer.h"
+#include "tools_common.h"
+
+static const char *IVF_SIGNATURE = "DKIF";
+
+// Replaces an implausible frame rate with the 30/1 default. The rate is kept
+// only when 0 < *num < 1000 and 0 < *den < 1000000000.
+static void fix_framerate(int *num, int *den) {
+  const int num_ok = *num > 0 && *num < 1000;
+  const int den_ok = *den > 0 && *den < 1000000000;
+
+  if (num_ok && den_ok) return;
+
+  // framerate seems to be invalid, just default to 30fps.
+  *num = 30;
+  *den = 1;
+}
+
+// Probes 'input_ctx' for a 32-byte IVF file header. When the "DKIF" signature
+// is present, stores the fourcc, frame dimensions and (sanitized) frame rate in
+// 'input_ctx' and returns 1. Otherwise rewinds the detection buffer and
+// returns 0.
+int file_is_ivf(struct AvxInputContext *input_ctx) {
+  unsigned char raw_hdr[32];
+
+  if (buffer_input(input_ctx, 32, raw_hdr, /*buffered=*/true) != 32 ||
+      memcmp(IVF_SIGNATURE, raw_hdr, 4) != 0) {
+    rewind_detect(input_ctx);
+    return 0;
+  }
+
+  // A non-zero version is reported, but parsing carries on regardless.
+  if (mem_get_le16(raw_hdr + 4) != 0) {
+    fprintf(stderr,
+            "Error: Unrecognized IVF version! This file may not"
+            " decode properly.\n");
+  }
+
+  input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+  input_ctx->width = mem_get_le16(raw_hdr + 12);
+  input_ctx->height = mem_get_le16(raw_hdr + 14);
+  input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+  input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+  fix_framerate(&input_ctx->framerate.numerator,
+                &input_ctx->framerate.denominator);
+  return 1;
+}
+
+// Reads the next IVF frame from 'input_ctx' into '*buffer', growing the buffer
+// with realloc() (to twice the frame size) when the frame does not fit in
+// '*buffer_size' bytes. On success returns 0 and stores the frame length in
+// '*bytes_read'; when 'pts' is non-NULL it receives the 64-bit timestamp taken
+// from the frame header. Returns 1 at end of input, or when the frame header
+// or the frame payload cannot be read.
+int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+                   size_t *bytes_read, size_t *buffer_size,
+                   aom_codec_pts_t *pts) {
+  unsigned char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
+  size_t frame_size = 0;
+
+  if (read_from_input(input_ctx, IVF_FRAME_HDR_SZ, raw_header) !=
+      IVF_FRAME_HDR_SZ) {
+    if (!input_eof(input_ctx))
+      fprintf(stderr, "Warning: Failed to read frame size\n");
+    // Without a complete header there is no frame to hand back. Falling
+    // through here would report a read error as a successfully read,
+    // zero-length frame.
+    return 1;
+  }
+
+  frame_size = mem_get_le32(raw_header);
+
+  // Oversized frames are treated as empty rather than as a hard error.
+  if (frame_size > 256 * 1024 * 1024) {
+    fprintf(stderr, "Warning: Read invalid frame size (%u)\n",
+            (unsigned int)frame_size);
+    frame_size = 0;
+  }
+
+  if (frame_size > *buffer_size) {
+    uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size);
+
+    if (new_buffer) {
+      *buffer = new_buffer;
+      *buffer_size = 2 * frame_size;
+    } else {
+      // The old buffer is still valid; read nothing into it.
+      fprintf(stderr, "Warning: Failed to allocate compressed data buffer\n");
+      frame_size = 0;
+    }
+  }
+
+  if (pts) {
+    // Timestamp is stored as two 32-bit halves, low half first.
+    *pts = mem_get_le32(&raw_header[4]);
+    *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32);
+  }
+
+  if (input_eof(input_ctx)) return 1;
+
+  ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
+  if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) {
+    fprintf(stderr, "Warning: Failed to read full frame\n");
+    return 1;
+  }
+
+  // Poison the unused tail so reads past the frame are caught under ASan.
+  ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size);
+  *bytes_read = frame_size;
+  return 0;
+}
diff --git a/third_party/aom/common/ivfdec.h b/third_party/aom/common/ivfdec.h
new file mode 100644
index 0000000000..e8fe8d0c53
--- /dev/null
+++ b/third_party/aom/common/ivfdec.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_IVFDEC_H_
+#define AOM_COMMON_IVFDEC_H_
+
+#include "aom/aom_codec.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int file_is_ivf(struct AvxInputContext *input);
+int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size,
+ aom_codec_pts_t *pts);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_IVFDEC_H_
diff --git a/third_party/aom/common/ivfenc.c b/third_party/aom/common/ivfenc.c
new file mode 100644
index 0000000000..64715f4d74
--- /dev/null
+++ b/third_party/aom/common/ivfenc.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfenc.h"
+
+#include "aom/aom_encoder.h"
+#include "aom_ports/mem_ops.h"
+
+// Writes the 32-byte IVF file header to 'outfile': the "DKIF" signature,
+// version 0, the header size, 'fourcc', the frame dimensions and timebase
+// taken from 'cfg', and 'frame_cnt'. Multi-byte fields are stored with the
+// mem_put_le16/mem_put_le32 helpers.
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+                           unsigned int fourcc, int frame_cnt) {
+  static const char kSignature[4] = { 'D', 'K', 'I', 'F' };
+  char header[32];
+  int i;
+
+  for (i = 0; i < 4; ++i) header[i] = kSignature[i];
+
+  mem_put_le16(header + 4, 0);                     // version
+  mem_put_le16(header + 6, 32);                    // header size
+  mem_put_le32(header + 8, fourcc);                // fourcc
+  mem_put_le16(header + 12, cfg->g_w);             // width
+  mem_put_le16(header + 14, cfg->g_h);             // height
+  mem_put_le32(header + 16, cfg->g_timebase.den);  // rate
+  mem_put_le32(header + 20, cfg->g_timebase.num);  // scale
+  mem_put_le32(header + 24, frame_cnt);            // length
+  mem_put_le32(header + 28, 0);                    // unused
+
+  fwrite(header, 1, sizeof(header), outfile);
+}
+
+// Writes the 12-byte IVF frame header to 'outfile': the frame size followed by
+// the low and then the high 32 bits of 'pts'.
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
+  char header[12];
+  const int pts_low = (int)(pts & 0xFFFFFFFF);
+  const int pts_high = (int)(pts >> 32);
+
+  mem_put_le32(header, (int)frame_size);
+  mem_put_le32(header + 4, pts_low);
+  mem_put_le32(header + 8, pts_high);
+  fwrite(header, 1, sizeof(header), outfile);
+}
+
+// Writes only the 4-byte frame-size field of an IVF frame header to 'outfile'.
+void ivf_write_frame_size(FILE *outfile, size_t frame_size) {
+  char size_field[4];
+
+  mem_put_le32(size_field, (int)frame_size);
+  fwrite(size_field, 1, sizeof(size_field), outfile);
+}
diff --git a/third_party/aom/common/ivfenc.h b/third_party/aom/common/ivfenc.h
new file mode 100644
index 0000000000..8f6d947d47
--- /dev/null
+++ b/third_party/aom/common/ivfenc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_IVFENC_H_
+#define AOM_COMMON_IVFENC_H_
+
+#include "common/tools_common.h"
+
+struct aom_codec_enc_cfg;
+struct aom_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+ uint32_t fourcc, int frame_cnt);
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_IVFENC_H_
diff --git a/third_party/aom/common/md5_utils.c b/third_party/aom/common/md5_utils.c
new file mode 100644
index 0000000000..c69aa57a3b
--- /dev/null
+++ b/third_party/aom/common/md5_utils.c
@@ -0,0 +1,257 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include <string.h> /* for memcpy() */
+
+#include "common/md5_utils.h"
+
+/*
+ * Rewrites 'words' 32-bit values at 'buf' so that each holds the value its
+ * four bytes spell in little-endian order. Does nothing on hosts where the
+ * first byte of the int 1 is 1. 'words' must be non-zero.
+ */
+static void byteSwap(UWORD32 *buf, unsigned words) {
+  const int probe = 1;
+  md5byte *src = (md5byte *)buf;
+
+  /* Only swap bytes for big endian machines */
+  if (*(const char *)&probe == 1) return;
+
+  do {
+    const UWORD32 low = (unsigned)src[1] << 8 | src[0];
+    const UWORD32 high = (unsigned)src[3] << 8 | src[2];
+
+    *buf++ = high << 16 | low;
+    src += 4;
+  } while (--words);
+}
+
+/*
+ * Begin a new MD5 computation: load the four state words with the MD5
+ * initialization constants and reset the 64-bit byte counter.
+ */
+void MD5Init(struct MD5Context *ctx) {
+  static const UWORD32 kInitialState[4] = { 0x67452301, 0xefcdab89,
+                                            0x98badcfe, 0x10325476 };
+  int i;
+
+  for (i = 0; i < 4; ++i) ctx->buf[i] = kInitialState[i];
+
+  ctx->bytes[1] = 0;
+  ctx->bytes[0] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ *
+ * ctx: hashing state previously set up by MD5Init.
+ * buf: the bytes to append to the message.
+ * len: number of bytes available at buf.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+ UWORD32 t;
+
+ /* Update byte count */
+
+ t = ctx->bytes[0];
+
+ if ((ctx->bytes[0] = t + len) < t)
+ ctx->bytes[1]++; /* Carry from low to high */
+
+ t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+ /* Not enough new data to complete a block: stash it and wait for more. */
+ if (t > len) {
+ memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+ return;
+ }
+
+ /* First chunk is an odd size: top up the pending block and hash it. */
+ memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += t;
+ len -= t;
+
+ /* Process data in 64-byte chunks */
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += 64;
+ len -= 64;
+ }
+
+ /* Handle any remaining bytes of data (fewer than 64 are left here). */
+ memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ *
+ * digest: receives the 16-byte result.
+ * ctx: hashing state; it is zeroed before returning, so it must be passed to
+ * MD5Init again before reuse.
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+ int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+ md5byte *p = (md5byte *)ctx->in + count;
+
+ /* Set the first char of padding to 0x80. There is always room. */
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 56 bytes (-8..55) */
+ count = 56 - 1 - count;
+
+ if (count < 0) { /* Padding forces an extra block */
+ /* Zero the rest of this block, hash it, then start a fresh one. */
+ memset(p, 0, count + 8);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ p = (md5byte *)ctx->in;
+ count = 56;
+ }
+
+ memset(p, 0, count);
+ /* Only the first 14 words hold byte data; words 14-15 are set below. */
+ byteSwap(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ctx->in[14] = ctx->bytes[0] << 3;
+ ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+ MD5Transform(ctx->buf, ctx->in);
+
+ byteSwap(ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+/*
+ * MD5 relies on unsigned wrap-around and on shifting bits out of unsigned
+ * values. These macros expand to clang no_sanitize attributes when available
+ * (and to nothing otherwise) so MD5Transform is exempt from those checks.
+ */
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#if __clang_major__ >= 12
+/* Must carry the AOM_ prefix: that is the name MD5Transform is tagged with. */
+#define AOM_NO_UNSIGNED_SHIFT_CHECK \
+  __attribute__((no_sanitize("unsigned-shift-base")))
+#endif  // __clang__ >= 12
+#endif  // __clang__
+
+#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#define AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+#ifndef AOM_NO_UNSIGNED_SHIFT_CHECK
+#define AOM_NO_UNSIGNED_SHIFT_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ *
+ * buf: the four running hash words; updated in place.
+ * in: the 16-word block to mix in; not modified.
+ *
+ * The body is four groups of 16 MD5STEP operations, one group per core
+ * function (F1..F4).
+ */
+AOM_NO_UNSIGNED_OVERFLOW_CHECK AOM_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+ UWORD32 buf[4], UWORD32 const in[16]) {
+ register UWORD32 a, b, c, d;
+
+ /* Work on copies; the results are added back into buf at the end. */
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ /* F1 steps */
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ /* F2 steps */
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ /* F3 steps */
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ /* F4 steps */
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ /* Fold this block's result into the running hash. */
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+#undef AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#undef AOM_NO_UNSIGNED_SHIFT_CHECK
+
+#endif
diff --git a/third_party/aom/common/md5_utils.h b/third_party/aom/common/md5_utils.h
new file mode 100644
index 0000000000..144fa3ad28
--- /dev/null
+++ b/third_party/aom/common/md5_utils.h
@@ -0,0 +1,49 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef AOM_COMMON_MD5_UTILS_H_
+#define AOM_COMMON_MD5_UTILS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+/* State of an in-progress MD5 computation; set up with MD5Init. */
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+ UWORD32 buf[4]; /* running hash words; become the digest in MD5Final */
+ UWORD32 bytes[2]; /* bytes hashed so far: [0] low word, [1] high word */
+ UWORD32 in[16]; /* 64-byte block being accumulated by MD5Update */
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_MD5_UTILS_H_
diff --git a/third_party/aom/common/obudec.c b/third_party/aom/common/obudec.c
new file mode 100644
index 0000000000..8b7bd39a60
--- /dev/null
+++ b/third_party/aom/common/obudec.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "common/obudec.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+#include "tools_common.h"
+
+#define OBU_BUFFER_SIZE (500 * 1024)
+
+#define OBU_HEADER_SIZE 1
+#define OBU_EXTENSION_SIZE 1
+#define OBU_MAX_LENGTH_FIELD_SIZE 8
+
+#define OBU_MAX_HEADER_SIZE \
+ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 2 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+#define OBU_DETECTION_SIZE \
+ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 4 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
+// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
+// and decoded value in 'value'. If 'buffered' is true, it is buffered in the
+// detect buffer first.
+// 'value_buffer' must have room for OBU_MAX_LENGTH_FIELD_SIZE bytes. Hitting
+// end of input before the first byte is not an error: the function returns 0
+// with '*value_length' set to 0 and '*value' left untouched. Returns -1 on
+// NULL arguments or when input ends part-way through the value.
+static int obudec_read_leb128(struct AvxInputContext *input_ctx,
+ uint8_t *value_buffer, size_t *value_length,
+ uint64_t *value, bool buffered) {
+ if (!input_ctx || !value_buffer || !value_length || !value) return -1;
+ size_t len;
+ // Pull one byte at a time until a byte with a clear top bit ends the value.
+ for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) {
+ const size_t num_read =
+ buffer_input(input_ctx, 1, &value_buffer[len], buffered);
+ if (num_read == 0) {
+ if (len == 0 && input_eof(input_ctx)) {
+ *value_length = 0;
+ return 0;
+ }
+ // Ran out of data before completing read of value.
+ return -1;
+ }
+ if ((value_buffer[len] >> 7) == 0) {
+ ++len;
+ *value_length = len;
+ break;
+ }
+ }
+
+ // NOTE(review): if no terminating byte was seen, '*value_length' has not
+ // been written; the outcome then depends on aom_uleb_decode rejecting the
+ // input - confirm.
+ return aom_uleb_decode(value_buffer, len, value, NULL);
+}
+
+// Reads OBU header from 'input_ctx'. The 'buffer_capacity' passed in must be
+// large enough to store an OBU header with extension (2 bytes). Raw OBU data is
+// written to 'obu_data', parsed OBU header values are written to 'obu_header',
+// and total bytes read from file are written to 'bytes_read'. Returns 0 for
+// success, and non-zero on failure. When end of file is reached, the return
+// value is 0 and the 'bytes_read' value is set to 0. If 'buffered' is true, it
+// is buffered in the detect buffer first.
+static int obudec_read_obu_header(struct AvxInputContext *input_ctx,
+                                  size_t buffer_capacity, int is_annexb,
+                                  uint8_t *obu_data, ObuHeader *obu_header,
+                                  size_t *bytes_read, bool buffered) {
+  if (!input_ctx || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) ||
+      !obu_data || !obu_header || !bytes_read) {
+    return -1;
+  }
+  *bytes_read = buffer_input(input_ctx, 1, obu_data, buffered);
+
+  if (input_eof(input_ctx) && *bytes_read == 0) {
+    return 0;
+  } else if (*bytes_read != 1) {
+    fprintf(stderr, "obudec: Failure reading OBU header.\n");
+    return -1;
+  }
+
+  // Bit 2 of the first header byte says whether an extension byte follows.
+  const int has_extension = (obu_data[0] >> 2) & 0x1;
+  if (has_extension) {
+    if (buffer_input(input_ctx, 1, &obu_data[1], buffered) != 1) {
+      // Newline-terminated like the other diagnostics in this file, so the
+      // next message on stderr starts on its own line.
+      fprintf(stderr, "obudec: Failure reading OBU extension.\n");
+      return -1;
+    }
+    ++*bytes_read;
+  }
+
+  // The parser must consume exactly the bytes that were read above.
+  size_t obu_bytes_parsed = 0;
+  const aom_codec_err_t parse_result = aom_read_obu_header(
+      obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb);
+  if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) {
+    fprintf(stderr, "obudec: Error parsing OBU header.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+// Reads exactly 'payload_length' payload bytes from 'input_ctx' into
+// 'obu_data'. On success returns 0 and adds 'payload_length' to '*bytes_read'.
+// Returns -1 on NULL arguments, a zero 'payload_length', or a short read. If
+// 'buffered' is true, the data is buffered in the detect buffer first.
+static int obudec_read_obu_payload(struct AvxInputContext *input_ctx,
+                                   size_t payload_length, uint8_t *obu_data,
+                                   size_t *bytes_read, bool buffered) {
+  if (!input_ctx || payload_length == 0 || !obu_data || !bytes_read) return -1;
+
+  const size_t num_read =
+      buffer_input(input_ctx, payload_length, obu_data, buffered);
+  if (num_read == payload_length) {
+    *bytes_read += payload_length;
+    return 0;
+  }
+
+  fprintf(stderr, "obudec: Failure reading OBU payload.\n");
+  return -1;
+}
+
+// Reads everything that precedes an OBU's payload into 'buffer': for Annex B
+// input the leading obu_size field, then the OBU header (and extension), then
+// the payload size field when the header says one is present.
+// 'buffer_capacity' must be at least OBU_MAX_HEADER_SIZE. On success returns 0,
+// stores the number of bytes consumed in 'bytes_read' and the length of the
+// payload that follows in 'payload_length'. When end of input is hit before
+// any OBU data, returns 0 with 'payload_length' set to 0 and 'bytes_read'
+// left unmodified. Returns -1 on invalid arguments, read errors, or sizes
+// above UINT32_MAX. If 'buffered' is true, reads go through the detect buffer.
+static int obudec_read_obu_header_and_size(
+ struct AvxInputContext *input_ctx, size_t buffer_capacity, int is_annexb,
+ uint8_t *buffer, size_t *bytes_read, size_t *payload_length,
+ ObuHeader *obu_header, bool buffered) {
+ const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE;
+ if (!input_ctx || !buffer || !bytes_read || !payload_length || !obu_header ||
+ buffer_capacity < kMinimumBufferSize) {
+ return -1;
+ }
+
+ size_t leb128_length_obu = 0;
+ size_t leb128_length_payload = 0;
+ uint64_t obu_size = 0;
+ if (is_annexb) {
+ // Annex B: a size field comes before the OBU header.
+ if (obudec_read_leb128(input_ctx, &buffer[0], &leb128_length_obu, &obu_size,
+ buffered) != 0) {
+ fprintf(stderr, "obudec: Failure reading OBU size length.\n");
+ return -1;
+ } else if (leb128_length_obu == 0) {
+ // End of input before any size byte.
+ *payload_length = 0;
+ return 0;
+ }
+ if (obu_size > UINT32_MAX) {
+ fprintf(stderr, "obudec: OBU payload length too large.\n");
+ return -1;
+ }
+ }
+
+ size_t header_size = 0;
+ if (obudec_read_obu_header(input_ctx, buffer_capacity - leb128_length_obu,
+ is_annexb, buffer + leb128_length_obu, obu_header,
+ &header_size, buffered) != 0) {
+ return -1;
+ } else if (header_size == 0) {
+ // End of input before the header byte.
+ *payload_length = 0;
+ return 0;
+ }
+
+ if (!obu_header->has_size_field) {
+ // No size field in the OBU itself: the Annex B obu_size covers header plus
+ // payload, so subtract the header bytes.
+ assert(is_annexb);
+ if (obu_size < header_size) {
+ fprintf(stderr, "obudec: OBU size is too small.\n");
+ return -1;
+ }
+ *payload_length = (size_t)obu_size - header_size;
+ } else {
+ uint64_t u64_payload_length = 0;
+ if (obudec_read_leb128(input_ctx, &buffer[leb128_length_obu + header_size],
+ &leb128_length_payload, &u64_payload_length,
+ buffered) != 0) {
+ fprintf(stderr, "obudec: Failure reading OBU payload length.\n");
+ return -1;
+ }
+ if (u64_payload_length > UINT32_MAX) {
+ fprintf(stderr, "obudec: OBU payload length too large.\n");
+ return -1;
+ }
+
+ *payload_length = (size_t)u64_payload_length;
+ }
+
+ *bytes_read = leb128_length_obu + header_size + leb128_length_payload;
+ return 0;
+}
+
+// Enlarges the heap buffer '*obu_buffer' by 'growth_amount' bytes with
+// realloc(). On success returns 0 and updates '*obu_buffer' and
+// '*obu_buffer_capacity'. Returns -1, leaving both untouched, when an argument
+// is NULL, '*obu_buffer' is NULL, 'growth_amount' is 0, the new capacity would
+// overflow size_t or exceed AOM_MAX_ALLOCABLE_MEMORY (when defined), or the
+// allocation fails.
+static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer,
+                              size_t *obu_buffer_capacity) {
+  // Check 'obu_buffer' itself before dereferencing it.
+  if (!obu_buffer || !*obu_buffer || !obu_buffer_capacity ||
+      growth_amount == 0) {
+    return -1;
+  }
+
+  const size_t capacity = *obu_buffer_capacity;
+  if (SIZE_MAX - growth_amount < capacity) {
+    fprintf(stderr, "obudec: cannot grow buffer, capacity will roll over.\n");
+    return -1;
+  }
+
+  const size_t new_capacity = capacity + growth_amount;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+  if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) {
+    fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n");
+    return -1;
+  }
+#endif
+
+  // On failure realloc() leaves the old block valid, so nothing is lost.
+  uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity);
+  if (!new_buffer) {
+    fprintf(stderr, "obudec: Failed to allocate compressed data buffer.\n");
+    return -1;
+  }
+
+  *obu_buffer = new_buffer;
+  *obu_buffer_capacity = new_capacity;
+  return 0;
+}
+
+// Reads one complete OBU (optional Annex B size, header, optional size field
+// and payload) from 'input_ctx' and stores it in '*obu_buffer' starting at
+// offset 'obu_bytes_buffered', growing the buffer via obudec_grow_buffer()
+// when it is too small. On success returns 0 with the total number of bytes
+// stored in '*obu_length' and the parsed header in 'obu_header';
+// '*obu_length' is 0 when end of input was reached before any OBU data.
+// Returns a negative value on invalid arguments, read or parse errors,
+// payloads larger than 256 MiB, or allocation failure. If 'buffered' is true,
+// reads go through the detect buffer.
+static int obudec_read_one_obu(struct AvxInputContext *input_ctx,
+                               uint8_t **obu_buffer, size_t obu_bytes_buffered,
+                               size_t *obu_buffer_capacity, size_t *obu_length,
+                               ObuHeader *obu_header, int is_annexb,
+                               bool buffered) {
+  if (!input_ctx || !obu_buffer || !(*obu_buffer) || !obu_buffer_capacity ||
+      !obu_length || !obu_header) {
+    return -1;
+  }
+
+  size_t bytes_read = 0;
+  size_t obu_payload_length = 0;
+  size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered;
+
+  // Make sure the largest possible header fits before reading it.
+  if (available_buffer_capacity < OBU_MAX_HEADER_SIZE) {
+    if (obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE),
+                           obu_buffer, obu_buffer_capacity) != 0) {
+      *obu_length = bytes_read;
+      return -1;
+    }
+    // '*obu_buffer_capacity' now holds the grown total, so derive the free
+    // space from it. Adding the new total to the old free space would
+    // overstate it by the previous capacity and let the payload read below
+    // run past the end of the allocation.
+    available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered;
+  }
+
+  const int status = obudec_read_obu_header_and_size(
+      input_ctx, available_buffer_capacity, is_annexb,
+      *obu_buffer + obu_bytes_buffered, &bytes_read, &obu_payload_length,
+      obu_header, buffered);
+  if (status < 0) return status;
+
+  // Guard the additions below against size_t wrap-around.
+  if (obu_payload_length > SIZE_MAX - bytes_read) return -1;
+
+  if (obu_payload_length > 256 * 1024 * 1024) {
+    fprintf(stderr, "obudec: Read invalid OBU size (%u)\n",
+            (unsigned int)obu_payload_length);
+    *obu_length = bytes_read + obu_payload_length;
+    return -1;
+  }
+
+  // Grow by at least the payload length when header + payload do not fit.
+  if (bytes_read + obu_payload_length > available_buffer_capacity &&
+      obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, obu_payload_length),
+                         obu_buffer, obu_buffer_capacity) != 0) {
+    *obu_length = bytes_read + obu_payload_length;
+    return -1;
+  }
+
+  if (obu_payload_length > 0 &&
+      obudec_read_obu_payload(input_ctx, obu_payload_length,
+                              *obu_buffer + obu_bytes_buffered + bytes_read,
+                              &bytes_read, buffered) != 0) {
+    return -1;
+  }
+
+  *obu_length = bytes_read;
+  return 0;
+}
+
+// Probes the input behind 'obu_ctx' for an OBU stream. For Annex B input the
+// temporal-unit and frame-unit size fields are read first. The input is
+// accepted when the first OBU is a temporal delimiter or a sequence header;
+// a temporal delimiter that carries a size field must have a zero-length
+// payload, and OBUs without a size field are accepted for Annex B only. On
+// acceptance, allocates 'obu_ctx->buffer', stores the bytes consumed so far
+// (plus the first OBU's payload, if any) in it and returns 1. Otherwise
+// rewinds the detect buffer and returns 0.
+int file_is_obu(struct ObuDecInputContext *obu_ctx) {
+  if (!obu_ctx || !obu_ctx->avx_ctx) return 0;
+
+  struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx;
+  uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 };
+  const int is_annexb = obu_ctx->is_annexb;
+  size_t payload_length = 0;
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  size_t length_of_unit_size = 0;
+  size_t annexb_header_length = 0;
+  uint64_t unit_size = 0;
+
+  if (is_annexb) {
+    // read the size of first temporal unit
+    if (obudec_read_leb128(avx_ctx, &detect_buf[0], &length_of_unit_size,
+                           &unit_size, /*buffered=*/true) != 0) {
+      fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+      rewind_detect(avx_ctx);
+      return 0;
+    }
+
+    // read the size of first frame unit
+    if (obudec_read_leb128(avx_ctx, &detect_buf[length_of_unit_size],
+                           &annexb_header_length, &unit_size,
+                           /*buffered=*/true) != 0) {
+      fprintf(stderr, "obudec: Failure reading frame unit header\n");
+      rewind_detect(avx_ctx);
+      return 0;
+    }
+    annexb_header_length += length_of_unit_size;
+  }
+
+  size_t bytes_read = 0;
+  if (obudec_read_obu_header_and_size(
+          avx_ctx, OBU_DETECTION_SIZE - annexb_header_length, is_annexb,
+          &detect_buf[annexb_header_length], &bytes_read, &payload_length,
+          &obu_header, /*buffered=*/true) != 0) {
+    fprintf(stderr, "obudec: Failure reading first OBU.\n");
+    rewind_detect(avx_ctx);
+    return 0;
+  }
+
+  if (is_annexb) {
+    bytes_read += annexb_header_length;
+  }
+
+  if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+      obu_header.type != OBU_SEQUENCE_HEADER) {
+    rewind_detect(avx_ctx);
+    return 0;
+  }
+
+  if (obu_header.has_size_field) {
+    if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) {
+      // Newline-terminated like the other diagnostics in this function.
+      fprintf(stderr,
+              "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length "
+              "(non-zero).\n");
+      rewind_detect(avx_ctx);
+      return 0;
+    }
+  } else if (!is_annexb) {
+    fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n");
+    rewind_detect(avx_ctx);
+    return 0;
+  }
+
+  // Appears that input is valid Section 5 AV1 stream.
+  obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE);
+  if (!obu_ctx->buffer) {
+    fprintf(stderr, "Out of memory.\n");
+    rewind_detect(avx_ctx);
+    return 0;
+  }
+  obu_ctx->buffer_capacity = OBU_BUFFER_SIZE;
+
+  // Keep the bytes consumed during detection.
+  memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read);
+  obu_ctx->bytes_buffered = bytes_read;
+  // If the first OBU is a SEQUENCE_HEADER, then it will have a payload.
+  // We need to read this in so that our buffer only contains complete OBUs.
+  if (payload_length > 0) {
+    if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) {
+      fprintf(stderr, "obudec: First OBU's payload is too large\n");
+      rewind_detect(avx_ctx);
+      obudec_free(obu_ctx);
+      return 0;
+    }
+
+    size_t payload_bytes = 0;
+    const int status = obudec_read_obu_payload(
+        avx_ctx, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes,
+        /*buffered=*/false);
+    if (status < 0) {
+      rewind_detect(avx_ctx);
+      obudec_free(obu_ctx);
+      return 0;
+    }
+    obu_ctx->bytes_buffered += payload_bytes;
+  }
+  return 1;
+}
+
+// Reads one temporal unit into '*buffer', which is resized with realloc() to
+// exactly the temporal unit's size. Returns 0 on success with that size stored
+// in both '*bytes_read' and '*buffer_size', 1 at end of input, and -1 on error.
+// For Annex B input the size comes from the temporal unit's leading size
+// field. Otherwise OBUs are accumulated in 'obu_ctx->buffer' until the next
+// temporal delimiter (or end of input); that delimiter stays buffered for the
+// following call.
+// If resizing '*buffer' fails, the old block is freed and '*buffer' is set to
+// NULL so the caller cannot free it a second time.
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+                              uint8_t **buffer, size_t *bytes_read,
+                              size_t *buffer_size) {
+  FILE *f = obu_ctx->avx_ctx->file;
+  if (!f) return -1;
+
+  *buffer_size = 0;
+  *bytes_read = 0;
+
+  if (input_eof(obu_ctx->avx_ctx)) {
+    return 1;
+  }
+
+  size_t tu_size;
+  size_t obu_size = 0;
+  size_t length_of_temporal_unit_size = 0;
+  uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 };
+
+  if (obu_ctx->is_annexb) {
+    uint64_t size = 0;
+
+    if (obu_ctx->bytes_buffered == 0) {
+      if (obudec_read_leb128(obu_ctx->avx_ctx, &tuheader[0],
+                             &length_of_temporal_unit_size, &size,
+                             /*buffered=*/false) != 0) {
+        fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+        return -1;
+      }
+      if (size == 0 && input_eof(obu_ctx->avx_ctx)) {
+        return 1;
+      }
+    } else {
+      // temporal unit size was already stored in buffer
+      if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size,
+                          &length_of_temporal_unit_size) != 0) {
+        fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+        return -1;
+      }
+    }
+
+    if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) {
+      fprintf(stderr, "obudec: TU too large.\n");
+      return -1;
+    }
+
+    // The returned unit includes its own size field.
+    size += length_of_temporal_unit_size;
+    tu_size = (size_t)size;
+  } else {
+    while (1) {
+      ObuHeader obu_header;
+      memset(&obu_header, 0, sizeof(obu_header));
+
+      if (obudec_read_one_obu(obu_ctx->avx_ctx, &obu_ctx->buffer,
+                              obu_ctx->bytes_buffered,
+                              &obu_ctx->buffer_capacity, &obu_size, &obu_header,
+                              0, /*buffered=*/false) != 0) {
+        fprintf(stderr, "obudec: read_one_obu failed in TU loop\n");
+        return -1;
+      }
+
+      // A temporal delimiter (or running out of input) ends the current unit;
+      // the OBU just read is not counted in it.
+      if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) {
+        tu_size = obu_ctx->bytes_buffered;
+        break;
+      } else {
+        obu_ctx->bytes_buffered += obu_size;
+      }
+    }
+  }
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+  if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) {
+    fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n");
+    return -1;
+  }
+#endif
+  if (tu_size > 0) {
+    uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size);
+    if (!new_buffer) {
+      free(*buffer);
+      // Do not leave the caller holding a pointer to freed memory.
+      *buffer = NULL;
+      fprintf(stderr, "obudec: Out of memory.\n");
+      return -1;
+    }
+    *buffer = new_buffer;
+  }
+  *bytes_read = tu_size;
+  *buffer_size = tu_size;
+
+  if (!obu_ctx->is_annexb) {
+    memcpy(*buffer, obu_ctx->buffer, tu_size);
+
+    // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size)
+    // points to the end of the buffer.
+    memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered,
+            obu_size);
+    obu_ctx->bytes_buffered = obu_size;
+  } else {
+    if (!input_eof(obu_ctx->avx_ctx)) {
+      size_t data_size;
+      size_t offset;
+      if (!obu_ctx->bytes_buffered) {
+        // Size field was read into 'tuheader' above; copy it, then read the
+        // rest of the unit from the input.
+        data_size = tu_size - length_of_temporal_unit_size;
+        memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size);
+        offset = length_of_temporal_unit_size;
+      } else {
+        // Start with what is already buffered, then read the remainder.
+        const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size);
+        memcpy(*buffer, obu_ctx->buffer, copy_size);
+        offset = copy_size;
+        data_size = tu_size - copy_size;
+        obu_ctx->bytes_buffered -= copy_size;
+      }
+
+      if (read_from_input(obu_ctx->avx_ctx, data_size, *buffer + offset) !=
+          data_size) {
+        fprintf(stderr, "obudec: Failed to read full temporal unit\n");
+        return -1;
+      }
+    }
+  }
+  return 0;
+}
+
+// Releases the buffer owned by 'obu_ctx' and resets its bookkeeping fields to
+// the empty state.
+void obudec_free(struct ObuDecInputContext *obu_ctx) {
+  uint8_t *const stale = obu_ctx->buffer;
+
+  obu_ctx->buffer = NULL;
+  obu_ctx->bytes_buffered = 0;
+  obu_ctx->buffer_capacity = 0;
+  free(stale);
+}
diff --git a/third_party/aom/common/obudec.h b/third_party/aom/common/obudec.h
new file mode 100644
index 0000000000..b2adb1e3d7
--- /dev/null
+++ b/third_party/aom/common/obudec.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_OBUDEC_H_
+#define AOM_COMMON_OBUDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// State carried by the OBU reader between calls.
+struct ObuDecInputContext {
+ // Underlying input that the OBU reader pulls bytes from.
+ struct AvxInputContext *avx_ctx;
+ // Heap buffer owned by this context; released by obudec_free().
+ uint8_t *buffer;
+ // NOTE(review): presumably the allocated size of 'buffer' in bytes --
+ // confirm against obudec.c.
+ size_t buffer_capacity;
+ // Number of bytes currently held in 'buffer'.
+ size_t bytes_buffered;
+ // Non-zero when the input is handled as an Annex B stream.
+ int is_annexb;
+};
+
+// Returns 1 when file data starts (if Annex B stream, after reading the
+// size of the OBU) with what appears to be a Temporal Delimiter
+// OBU as defined by Section 5 of the AV1 bitstream specification.
+int file_is_obu(struct ObuDecInputContext *obu_ctx);
+
+// Reads one Temporal Unit from the input file. Returns 0 when a TU is
+// successfully read, 1 when end of file is reached, and less than 0 when an
+// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size,
+// returns buffer capacity via 'buffer_size', and returns size of buffered data
+// via 'bytes_read'.
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+ uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size);
+
+void obudec_free(struct ObuDecInputContext *obu_ctx);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_OBUDEC_H_
diff --git a/third_party/aom/common/rawenc.c b/third_party/aom/common/rawenc.c
new file mode 100644
index 0000000000..aa80d2cae3
--- /dev/null
+++ b/third_party/aom/common/rawenc.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include "common/rawenc.h"
+
+// Number of bytes to write per batch in write_greyscale.
+#define BATCH_SIZE 8
+
+// Interface to writing to either a file or MD5Context. Takes a pointer to
+// either the file or MD5Context, the buffer, the size of each element, and
+// number of elements to write. Note that size and nmemb (last two args) must
+// be unsigned int, as the interface to MD5Update requires that.
+typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int);
+
+// WRITER backend for stdio: 'fp' is a FILE*, and 'nmemb' elements of 'size'
+// bytes are written from 'buffer'. The fwrite result is not inspected.
+static void write_file(void *fp, const uint8_t *buffer, unsigned int size,
+                       unsigned int nmemb) {
+  FILE *const stream = (FILE *)fp;
+  fwrite(buffer, size, nmemb, stream);
+}
+
+// WRITER backend for hashing: 'md5' is an MD5Context*, which is updated with
+// the size * nmemb bytes found at 'buffer'.
+static void write_md5(void *md5, const uint8_t *buffer, unsigned int size,
+                      unsigned int nmemb) {
+  MD5Context *const ctx = (MD5Context *)md5;
+  const unsigned int total_bytes = size * nmemb;
+  MD5Update(ctx, buffer, total_bytes);
+}
+
+// Emits n neutral chroma samples (used for greyscale output) through
+// 'writer_func'. Samples are 0x80 for low bit-depth images and
+// 1 << (bit_depth - 1) for high bit-depth images.
+static void write_greyscale(const aom_image_t *img, int n, WRITER writer_func,
+                            void *file_or_md5) {
+  // One batch is BATCH_SIZE bytes: 8 samples at low bit-depth, 4 at high.
+  union {
+    uint8_t u8[BATCH_SIZE];
+    uint16_t u16[BATCH_SIZE / 2];
+  } batched;
+  const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  if (bytes_per_sample == 2) {
+    for (int k = 0; k < BATCH_SIZE / 2; ++k) {
+      batched.u16[k] = 1 << (img->bit_depth - 1);
+    }
+  } else {
+    for (int k = 0; k < BATCH_SIZE; ++k) {
+      batched.u8[k] = 0x80;
+    }
+  }
+
+  // Full batches first, then the leftover samples one at a time.
+  const int samples_per_batch = BATCH_SIZE / bytes_per_sample;
+  int full_batches = n / samples_per_batch;
+  while (full_batches-- > 0) {
+    writer_func(file_or_md5, batched.u8, sizeof(uint8_t), BATCH_SIZE);
+  }
+  int leftover = n % samples_per_batch;
+  while (leftover-- > 0) {
+    writer_func(file_or_md5, batched.u8, sizeof(uint8_t), bytes_per_sample);
+  }
+}
+
+// Shared implementation for dumping raw image data: walks the requested
+// planes and hands every row to 'writer_func', which targets either a file
+// or an MD5 context.
+static void raw_write_image_file_or_md5(const aom_image_t *img,
+                                        const int *planes, const int num_planes,
+                                        void *file_or_md5, WRITER writer_func) {
+  const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  for (int idx = 0; idx < num_planes; ++idx) {
+    const int plane = planes[idx];
+    const int width = aom_img_plane_width(img, plane);
+    const int height = aom_img_plane_height(img, plane);
+    if (!img->monochrome || plane == AOM_PLANE_Y) {
+      // Regular plane: emit it row by row, honouring the plane stride.
+      const unsigned char *row = img->planes[plane];
+      const int stride = img->stride[plane];
+      for (int rows_left = height; rows_left > 0; --rows_left) {
+        writer_func(file_or_md5, row, bytes_per_sample, width);
+        row += stride;
+      }
+    } else {
+      // Monochrome image and a non-Y plane: substitute neutral grey samples.
+      write_greyscale(img, width * height, writer_func, file_or_md5);
+    }
+  }
+}
+
+// Writes the listed planes of 'img' as raw samples to 'file'.
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+                          const int num_planes, FILE *file) {
+  const WRITER sink = write_file;
+  raw_write_image_file_or_md5(img, planes, num_planes, file, sink);
+}
+
+// Feeds the listed planes of 'img' as raw samples into the MD5 context.
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+                          const int num_planes, MD5Context *md5) {
+  const WRITER sink = write_md5;
+  raw_write_image_file_or_md5(img, planes, num_planes, md5, sink);
+}
diff --git a/third_party/aom/common/rawenc.h b/third_party/aom/common/rawenc.h
new file mode 100644
index 0000000000..cf5e00e6fd
--- /dev/null
+++ b/third_party/aom/common/rawenc.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_RAWENC_H_
+#define AOM_COMMON_RAWENC_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file);
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_RAWENC_H_
diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c
new file mode 100644
index 0000000000..4d77a1b427
--- /dev/null
+++ b/third_party/aom/common/tools_common.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/tools_common.h"
+
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#if defined(_WIN32)
+#include <io.h>
+#include <fcntl.h>
+#endif
+
+// Prints a printf-style message to stderr, prefixed with "<label>: " when
+// 'label' is non-NULL and always followed by a newline. The macro starts its
+// va_list from a parameter named 'fmt', so it may only be expanded inside a
+// variadic function whose last named parameter is 'fmt'.
+#define LOG_ERROR(label) \
+ do { \
+ const char *l = label; \
+ va_list ap; \
+ va_start(ap, fmt); \
+ if (l) fprintf(stderr, "%s: ", l); \
+ vfprintf(stderr, fmt, ap); \
+ fprintf(stderr, "\n"); \
+ va_end(ap); \
+ } while (0)
+
+// Switches 'stream' to binary mode on Windows builds; on every other
+// platform this is a no-op. Returns 'stream' in both cases.
+FILE *set_binary_mode(FILE *stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#else
+  (void)stream;
+#endif
+  return stream;
+}
+
+// Prints the printf-style message (no prefix) to stderr, then hands control
+// to usage_exit().
+void die(const char *fmt, ...) {
+ LOG_ERROR(NULL);
+ usage_exit();
+}
+
+// Prints the printf-style message to stderr with a "Fatal: " prefix and
+// terminates the process with EXIT_FAILURE.
+void fatal(const char *fmt, ...) {
+ LOG_ERROR("Fatal");
+ exit(EXIT_FAILURE);
+}
+
+// Prints the printf-style message to stderr with a "Warning: " prefix.
+void aom_tools_warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
+
+void die_codec(aom_codec_ctx_t *ctx, const char *s) {
+ const char *detail = aom_codec_error_detail(ctx);
+
+ fprintf(stderr, "%s: %s\n", s, aom_codec_error(ctx));
+ if (detail) fprintf(stderr, " %s\n", detail);
+ exit(EXIT_FAILURE);
+}
+
+// Maps an image format to a short printable name; formats that are not in
+// the table are reported as "Other".
+const char *image_format_to_string(aom_img_fmt_t fmt) {
+  static const struct {
+    aom_img_fmt_t fmt;
+    const char *name;
+  } kFormatNames[] = {
+    { AOM_IMG_FMT_I420, "I420" },     { AOM_IMG_FMT_I422, "I422" },
+    { AOM_IMG_FMT_I444, "I444" },     { AOM_IMG_FMT_YV12, "YV12" },
+    { AOM_IMG_FMT_NV12, "NV12" },     { AOM_IMG_FMT_YV1216, "YV1216" },
+    { AOM_IMG_FMT_I42016, "I42016" }, { AOM_IMG_FMT_I42216, "I42216" },
+    { AOM_IMG_FMT_I44416, "I44416" },
+  };
+  const size_t count = sizeof(kFormatNames) / sizeof(kFormatNames[0]);
+  for (size_t k = 0; k < count; ++k) {
+    if (kFormatNames[k].fmt == fmt) return kFormatNames[k].name;
+  }
+  return "Other";
+}
+
+// Fills the planes of 'yuv_frame' with raw sample data, row by row. Bytes
+// still pending in the file-type detection buffer are consumed first, and the
+// remainder of each row comes from input_ctx->file. Returns non-zero if any
+// fread delivered fewer bytes than requested, and 0 otherwise.
+int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) {
+ FILE *f = input_ctx->file;
+ struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
+ int plane = 0;
+ int shortread = 0;
+ // Two bytes per sample for high bit-depth formats, one otherwise.
+ const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+ for (plane = 0; plane < 3; ++plane) {
+ uint8_t *ptr;
+ int w = aom_img_plane_width(yuv_frame, plane);
+ const int h = aom_img_plane_height(yuv_frame, plane);
+ int r;
+ // Assuming that for nv12 we read all chroma data at one time
+ if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break;
+ // For NV12 the second pass reads rows twice as wide, covering both chroma
+ // components.
+ if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2;
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane) {
+ case 1:
+ ptr =
+ yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_V
+ : AOM_PLANE_U];
+ break;
+ case 2:
+ ptr =
+ yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U
+ : AOM_PLANE_V];
+ break;
+ default: ptr = yuv_frame->planes[plane];
+ }
+
+ for (r = 0; r < h; ++r) {
+ size_t needed = w * bytespp;
+ size_t buf_position = 0;
+ // Bytes that were read during file-type detection but not yet consumed.
+ const size_t left = detect->buf_read - detect->position;
+ if (left > 0) {
+ const size_t more = (left < needed) ? left : needed;
+ memcpy(ptr, detect->buf + detect->position, more);
+ buf_position = more;
+ needed -= more;
+ detect->position += more;
+ }
+ // Whatever the detection buffer could not supply is read from the file.
+ if (needed > 0) {
+ shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+ }
+
+ // NOTE(review): the stride is indexed by the loop counter, not by the
+ // (possibly swapped) plane chosen above -- equivalent only when the U and
+ // V strides match; confirm.
+ ptr += yuv_frame->stride[plane];
+ }
+ }
+
+ return shortread;
+}
+
+// One entry of the encoder/decoder lookup tables in this file.
+struct CodecInfo {
+ // Pointer to a function of zero arguments that returns an aom_codec_iface_t.
+ aom_codec_iface_t *(*interface)(void);
+ // Name matched by the *_by_short_name lookups (compared with strcmp).
+ const char *short_name;
+ // FourCC code associated with the codec.
+ uint32_t fourcc;
+};
+
+#if CONFIG_AV1_ENCODER
+// Encoders known to the tools.
+static const struct CodecInfo aom_encoders[] = {
+  { &aom_codec_av1_cx, "av1", AV1_FOURCC },
+};
+
+// Returns the number of entries in aom_encoders.
+int get_aom_encoder_count(void) {
+  const size_t entries = sizeof(aom_encoders) / sizeof(aom_encoders[0]);
+  return (int)entries;
+}
+
+// Returns the interface of the i-th encoder. 'i' must be a valid table index
+// (checked with assert only).
+aom_codec_iface_t *get_aom_encoder_by_index(int i) {
+  assert(!(i < 0 || i >= get_aom_encoder_count()));
+  const struct CodecInfo *const entry = &aom_encoders[i];
+  return entry->interface();
+}
+
+// Returns the interface of the encoder whose short name equals 'name', or
+// NULL when no encoder matches.
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name) {
+  const int count = get_aom_encoder_count();
+  int idx = 0;
+  while (idx < count && strcmp(aom_encoders[idx].short_name, name) != 0) {
+    ++idx;
+  }
+  return idx < count ? aom_encoders[idx].interface() : NULL;
+}
+
+// Returns the FourCC of the encoder whose interface is 'iface', or 0 when the
+// interface is not in the encoder table.
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface) {
+  const int count = get_aom_encoder_count();
+  int idx = 0;
+  while (idx < count && aom_encoders[idx].interface() != iface) {
+    ++idx;
+  }
+  return idx < count ? aom_encoders[idx].fourcc : 0;
+}
+
+// Returns the short name of the encoder whose interface is 'iface', or NULL
+// when the interface is not in the encoder table.
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *iface) {
+  const int count = get_aom_encoder_count();
+  int idx = 0;
+  while (idx < count && aom_encoders[idx].interface() != iface) {
+    ++idx;
+  }
+  return idx < count ? aom_encoders[idx].short_name : NULL;
+}
+
+#endif // CONFIG_AV1_ENCODER
+
+#if CONFIG_AV1_DECODER
+// Decoders known to the tools.
+static const struct CodecInfo aom_decoders[] = {
+  { &aom_codec_av1_dx, "av1", AV1_FOURCC },
+};
+
+// Returns the number of entries in aom_decoders.
+int get_aom_decoder_count(void) {
+  const size_t entries = sizeof(aom_decoders) / sizeof(aom_decoders[0]);
+  return (int)entries;
+}
+
+// Returns the interface of the i-th decoder. 'i' must be a valid table index
+// (checked with assert only).
+aom_codec_iface_t *get_aom_decoder_by_index(int i) {
+  assert(!(i < 0 || i >= get_aom_decoder_count()));
+  const struct CodecInfo *const entry = &aom_decoders[i];
+  return entry->interface();
+}
+
+// Returns the interface of the decoder whose short name equals 'name', or
+// NULL when no decoder matches.
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name) {
+  const int count = get_aom_decoder_count();
+  int idx = 0;
+  while (idx < count && strcmp(aom_decoders[idx].short_name, name) != 0) {
+    ++idx;
+  }
+  return idx < count ? aom_decoders[idx].interface() : NULL;
+}
+
+// Returns the interface of the decoder registered for 'fourcc', or NULL when
+// no decoder uses that FourCC.
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc) {
+  const int count = get_aom_decoder_count();
+  int idx = 0;
+  while (idx < count && aom_decoders[idx].fourcc != fourcc) {
+    ++idx;
+  }
+  return idx < count ? aom_decoders[idx].interface() : NULL;
+}
+
+// Returns the short name of the decoder whose interface is 'iface', or NULL
+// when the interface is not in the decoder table.
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *iface) {
+  const int count = get_aom_decoder_count();
+  int idx = 0;
+  while (idx < count && aom_decoders[idx].interface() != iface) {
+    ++idx;
+  }
+  return idx < count ? aom_decoders[idx].short_name : NULL;
+}
+
+// Returns the FourCC of the decoder whose interface is 'iface', or 0 when the
+// interface is not in the decoder table.
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) {
+  const int count = get_aom_decoder_count();
+  int idx = 0;
+  while (idx < count && aom_decoders[idx].interface() != iface) {
+    ++idx;
+  }
+  return idx < count ? aom_decoders[idx].fourcc : 0;
+}
+
+#endif // CONFIG_AV1_DECODER
+
+// Writes the three planes of 'img' to 'file' in plane order 0, 1, 2, one row
+// at a time and without any stride padding. Write errors are not reported.
+void aom_img_write(const aom_image_t *img, FILE *file) {
+  const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+  for (int plane = 0; plane < 3; ++plane) {
+    const unsigned char *row = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int row_bytes = aom_img_plane_width(img, plane) * bytes_per_sample;
+    int rows_left = aom_img_plane_height(img, plane);
+
+    while (rows_left-- > 0) {
+      fwrite(row, 1, row_bytes, file);
+      row += stride;
+    }
+  }
+}
+
+// Reads the three planes of 'img' from 'file' in plane order 0, 1, 2, one row
+// at a time. Returns false as soon as a row cannot be read in full, and true
+// once every row has been read.
+bool aom_img_read(aom_image_t *img, FILE *file) {
+  const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+  for (int plane = 0; plane < 3; ++plane) {
+    unsigned char *row = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int row_bytes = aom_img_plane_width(img, plane) * bytes_per_sample;
+    int rows_left = aom_img_plane_height(img, plane);
+
+    while (rows_left-- > 0) {
+      const size_t got = fread(row, 1, row_bytes, file);
+      if (got != (size_t)row_bytes) return false;
+      row += stride;
+    }
+  }
+
+  return true;
+}
+
+// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
+// Converts a sum of squared errors into PSNR in dB, computed as
+// 10 * log10(samples * peak^2 / sse) and capped at 100. An 'sse' that is not
+// greater than zero yields the cap.
+double sse_to_psnr(double samples, double peak, double sse) {
+  static const double kMaxPSNR = 100.0;
+
+  if (!(sse > 0.0)) return kMaxPSNR;
+
+  const double psnr = 10.0 * log10(samples * peak * peak / sse);
+  if (psnr > kMaxPSNR) return kMaxPSNR;
+  return psnr;
+}
+
+// TODO(debargha): Consolidate the functions below into a separate file.
+// Copies a 16-bit image into another 16-bit image with the same dimensions,
+// chroma subsampling and format, shifting every sample left by 'input_shift'
+// bits and adding a fixed offset. Calls fatal() when the images do not match,
+// the shift is negative, or the format is not I42016/I42216/I44416.
+static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                               int input_shift) {
+  const int geometry_matches =
+      dst->d_w == src->d_w && dst->d_h == src->d_h &&
+      dst->x_chroma_shift == src->x_chroma_shift &&
+      dst->y_chroma_shift == src->y_chroma_shift;
+  if (!geometry_matches || dst->fmt != src->fmt || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  if (src->fmt != AOM_IMG_FMT_I42016 && src->fmt != AOM_IMG_FMT_I42216 &&
+      src->fmt != AOM_IMG_FMT_I44416) {
+    fatal("Unsupported image conversion");
+  }
+
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  for (int plane = 0; plane < 3; ++plane) {
+    int width = src->d_w;
+    int height = src->d_h;
+    if (plane != 0) {
+      // Chroma planes: apply the subsampling shifts, rounding up.
+      width = (width + src->x_chroma_shift) >> src->x_chroma_shift;
+      height = (height + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (int row = 0; row < height; ++row) {
+      const uint16_t *in =
+          (const uint16_t *)(src->planes[plane] + row * src->stride[plane]);
+      uint16_t *out =
+          (uint16_t *)(dst->planes[plane] + row * dst->stride[plane]);
+      for (int col = 0; col < width; ++col) {
+        out[col] = (in[col] << input_shift) + offset;
+      }
+    }
+  }
+}
+
+// Expands an 8-bit image into the matching 16-bit image, shifting every
+// sample left by 'input_shift' bits and adding a fixed offset. Calls fatal()
+// when the geometry differs, 'dst' is not the high bit-depth counterpart of
+// 'src', the shift is negative, or the source format is not
+// YV12/I420/I422/I444.
+static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                              int input_shift) {
+  const int geometry_matches =
+      dst->d_w == src->d_w && dst->d_h == src->d_h &&
+      dst->x_chroma_shift == src->x_chroma_shift &&
+      dst->y_chroma_shift == src->y_chroma_shift;
+  if (!geometry_matches ||
+      dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  if (src->fmt != AOM_IMG_FMT_YV12 && src->fmt != AOM_IMG_FMT_I420 &&
+      src->fmt != AOM_IMG_FMT_I422 && src->fmt != AOM_IMG_FMT_I444) {
+    fatal("Unsupported image conversion");
+  }
+
+  // Note the offset is 1 less than half.
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+  for (int plane = 0; plane < 3; ++plane) {
+    int width = src->d_w;
+    int height = src->d_h;
+    if (plane != 0) {
+      // Chroma planes: apply the subsampling shifts, rounding up.
+      width = (width + src->x_chroma_shift) >> src->x_chroma_shift;
+      height = (height + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (int row = 0; row < height; ++row) {
+      const uint8_t *in = src->planes[plane] + row * src->stride[plane];
+      uint16_t *out =
+          (uint16_t *)(dst->planes[plane] + row * dst->stride[plane]);
+      for (int col = 0; col < width; ++col) {
+        out[col] = (in[col] << input_shift) + offset;
+      }
+    }
+  }
+}
+
+// Upshifts 'src' into 'dst', choosing the 16-bit or the 8-bit source path
+// from the high bit-depth flag of the source format.
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src,
+                     int input_shift) {
+  if (!(src->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    lowbd_img_upshift(dst, src, input_shift);
+    return;
+  }
+  highbd_img_upshift(dst, src, input_shift);
+}
+
+// Converts a 16-bit image into its 8-bit counterpart by keeping only the low
+// byte of every sample. Calls fatal() when 'src' is not the high bit-depth
+// counterpart of 'dst', the geometry differs, or the destination format is
+// not I420/I422/I444.
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) {
+  const int geometry_matches =
+      dst->d_w == src->d_w && dst->d_h == src->d_h &&
+      dst->x_chroma_shift == src->x_chroma_shift &&
+      dst->y_chroma_shift == src->y_chroma_shift;
+  if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || !geometry_matches) {
+    fatal("Unsupported image conversion");
+  }
+  if (dst->fmt != AOM_IMG_FMT_I420 && dst->fmt != AOM_IMG_FMT_I422 &&
+      dst->fmt != AOM_IMG_FMT_I444) {
+    fatal("Unsupported image conversion");
+  }
+
+  for (int plane = 0; plane < 3; ++plane) {
+    int width = src->d_w;
+    int height = src->d_h;
+    if (plane != 0) {
+      // Chroma planes: apply the subsampling shifts, rounding up.
+      width = (width + src->x_chroma_shift) >> src->x_chroma_shift;
+      height = (height + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (int row = 0; row < height; ++row) {
+      const uint16_t *in =
+          (const uint16_t *)(src->planes[plane] + row * src->stride[plane]);
+      uint8_t *out = dst->planes[plane] + row * dst->stride[plane];
+      for (int col = 0; col < width; ++col) {
+        out[col] = (uint8_t)(in[col]);
+      }
+    }
+  }
+}
+
+// Copies a 16-bit image into another 16-bit image with the same dimensions,
+// chroma subsampling and format, shifting every sample right by 'down_shift'
+// bits. Calls fatal() when the images do not match, the shift is negative, or
+// the format is not I42016/I42216/I44416.
+static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
+                                 int down_shift) {
+  const int geometry_matches =
+      dst->d_w == src->d_w && dst->d_h == src->d_h &&
+      dst->x_chroma_shift == src->x_chroma_shift &&
+      dst->y_chroma_shift == src->y_chroma_shift;
+  if (!geometry_matches || dst->fmt != src->fmt || down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  if (src->fmt != AOM_IMG_FMT_I42016 && src->fmt != AOM_IMG_FMT_I42216 &&
+      src->fmt != AOM_IMG_FMT_I44416) {
+    fatal("Unsupported image conversion");
+  }
+
+  for (int plane = 0; plane < 3; ++plane) {
+    int width = src->d_w;
+    int height = src->d_h;
+    if (plane != 0) {
+      // Chroma planes: apply the subsampling shifts, rounding up.
+      width = (width + src->x_chroma_shift) >> src->x_chroma_shift;
+      height = (height + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (int row = 0; row < height; ++row) {
+      const uint16_t *in =
+          (const uint16_t *)(src->planes[plane] + row * src->stride[plane]);
+      uint16_t *out =
+          (uint16_t *)(dst->planes[plane] + row * dst->stride[plane]);
+      for (int col = 0; col < width; ++col) {
+        out[col] = in[col] >> down_shift;
+      }
+    }
+  }
+}
+
+// Reduces a 16-bit image to its 8-bit counterpart, shifting every sample
+// right by 'down_shift' bits before storing it as a byte. Calls fatal() when
+// the geometry differs, 'src' is not the high bit-depth counterpart of 'dst',
+// the shift is negative, or the destination format is not I420/I422/I444.
+static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
+                                int down_shift) {
+  const int geometry_matches =
+      dst->d_w == src->d_w && dst->d_h == src->d_h &&
+      dst->x_chroma_shift == src->x_chroma_shift &&
+      dst->y_chroma_shift == src->y_chroma_shift;
+  if (!geometry_matches ||
+      src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || down_shift < 0) {
+    fatal("Unsupported image conversion");
+  }
+  if (dst->fmt != AOM_IMG_FMT_I420 && dst->fmt != AOM_IMG_FMT_I422 &&
+      dst->fmt != AOM_IMG_FMT_I444) {
+    fatal("Unsupported image conversion");
+  }
+
+  for (int plane = 0; plane < 3; ++plane) {
+    int width = src->d_w;
+    int height = src->d_h;
+    if (plane != 0) {
+      // Chroma planes: apply the subsampling shifts, rounding up.
+      width = (width + src->x_chroma_shift) >> src->x_chroma_shift;
+      height = (height + src->y_chroma_shift) >> src->y_chroma_shift;
+    }
+    for (int row = 0; row < height; ++row) {
+      const uint16_t *in =
+          (const uint16_t *)(src->planes[plane] + row * src->stride[plane]);
+      uint8_t *out = dst->planes[plane] + row * dst->stride[plane];
+      for (int col = 0; col < width; ++col) {
+        out[col] = in[col] >> down_shift;
+      }
+    }
+  }
+}
+
+// Downshifts 'src' into 'dst', choosing the 16-bit or the 8-bit destination
+// path from the high bit-depth flag of the destination format.
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+                       int down_shift) {
+  if (!(dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    lowbd_img_downshift(dst, src, down_shift);
+    return;
+  }
+  highbd_img_downshift(dst, src, down_shift);
+}
+
+// Returns 1 when 'shifted' cannot be reused for 'img' because its display
+// size or its format differs from what is required, and 0 otherwise.
+static int img_shifted_realloc_required(const aom_image_t *img,
+                                        const aom_image_t *shifted,
+                                        aom_img_fmt_t required_fmt) {
+  const int reusable = img->d_w == shifted->d_w && img->d_h == shifted->d_h &&
+                       required_fmt == shifted->fmt;
+  return !reusable;
+}
+
+// Converts *img_ptr to 'output_bit_depth' when its format or bit depth does
+// not already match. The converted picture lives in *img_shifted_ptr, which
+// is reused when its size and format fit and (re)allocated otherwise; on
+// conversion *img_ptr is redirected to it. Returns false only when the
+// allocation fails (in which case *img_shifted_ptr is set to NULL), and true
+// otherwise, including when no conversion was needed.
+bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
+ aom_image_t **img_shifted_ptr) {
+ aom_image_t *img = *img_ptr;
+ aom_image_t *img_shifted = *img_shifted_ptr;
+
+ // An 8-bit output uses the low bit-depth variant of the input format; any
+ // other depth uses the high bit-depth variant.
+ const aom_img_fmt_t shifted_fmt = output_bit_depth == 8
+ ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH
+ : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
+
+ if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
+ // Drop a previously allocated image that no longer fits.
+ if (img_shifted &&
+ img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+ aom_img_free(img_shifted);
+ img_shifted = NULL;
+ }
+ // A reused image only has its monochrome flag refreshed.
+ if (img_shifted) {
+ img_shifted->monochrome = img->monochrome;
+ }
+ if (!img_shifted) {
+ img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+ if (!img_shifted) {
+ *img_shifted_ptr = NULL;
+ return false;
+ }
+ img_shifted->bit_depth = output_bit_depth;
+ img_shifted->monochrome = img->monochrome;
+ img_shifted->csp = img->csp;
+ }
+ if (output_bit_depth > img->bit_depth) {
+ aom_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth);
+ } else {
+ aom_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth);
+ }
+ // Publish the converted image through both output pointers.
+ *img_shifted_ptr = img_shifted;
+ *img_ptr = img_shifted;
+ }
+
+ return true;
+}
+
+// Writes 'img' to 'file' in NV12 layout: the Y plane row by row, followed by
+// a single chroma plane in which the U and V samples of planes 1 and 2
+// alternate. Write errors are not reported.
+void aom_img_write_nv12(const aom_image_t *img, FILE *file) {
+  // Y plane
+  const unsigned char *luma_row = img->planes[0];
+  const int luma_stride = img->stride[0];
+  const int luma_row_bytes = aom_img_plane_width(img, 0) *
+                             ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+  const int luma_rows = aom_img_plane_height(img, 0);
+  for (int y = 0; y < luma_rows; ++y) {
+    fwrite(luma_row, 1, luma_row_bytes, file);
+    luma_row += luma_stride;
+  }
+
+  // Interleaved U and V plane
+  const unsigned char *u_row = img->planes[1];
+  const unsigned char *v_row = img->planes[2];
+  const size_t sample_bytes = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  const int chroma_stride = img->stride[1];
+  const int chroma_w = aom_img_plane_width(img, 1);
+  const int chroma_h = aom_img_plane_height(img, 1);
+  for (int y = 0; y < chroma_h; ++y) {
+    const unsigned char *u = u_row;
+    const unsigned char *v = v_row;
+    for (int x = 0; x < chroma_w; ++x) {
+      fwrite(u, sample_bytes, 1, file);
+      fwrite(v, sample_bytes, 1, file);
+      u += sample_bytes;
+      v += sample_bytes;
+    }
+    // Both chroma pointers advance by the stride of plane 1.
+    u_row += chroma_stride;
+    v_row += chroma_stride;
+  }
+}
+
+// Reads up to 'n' bytes into 'buf'. Bytes still pending in the detect buffer
+// are consumed first (advancing its position); anything beyond that is read
+// from the input file. Returns the number of bytes actually delivered.
+size_t read_from_input(struct AvxInputContext *input_ctx, size_t n,
+                       unsigned char *buf) {
+  struct FileTypeDetectionBuffer *const detect = &input_ctx->detect;
+  const size_t pending = detect->buf_read - detect->position;
+  const size_t from_detect = n < pending ? n : pending;
+
+  if (from_detect > 0) {
+    memcpy(buf, detect->buf + detect->position, from_detect);
+    detect->position += from_detect;
+  }
+
+  size_t total = from_detect;
+  if (pending == 0 || n > pending) {
+    // The detect buffer was empty or could not satisfy the whole request.
+    total += fread(buf + from_detect, 1, n - from_detect, input_ctx->file);
+  }
+  return total;
+}
+
+// Makes sure that up to 'n' bytes starting at the current detect position are
+// present in the detect buffer, reading from the input file only the bytes
+// that are not buffered yet. The detect position is not advanced. Returns the
+// number of bytes available from the current position (at most 'n'); a value
+// below 'n' means the file ran short. Calls die() when the request does not
+// fit in the detect buffer.
+size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n) {
+  struct FileTypeDetectionBuffer *const detect = &input_ctx->detect;
+  // Compare against the remaining room instead of computing n + position:
+  // that sum can wrap around for a very large n and slip past the bounds
+  // check, letting fread() run beyond the end of the detect buffer.
+  if (detect->position > DETECT_BUF_SZ ||
+      n > DETECT_BUF_SZ - detect->position) {
+    die("Failed to store in the detect buffer, maximum size exceeded.");
+  }
+  const size_t buffered_bytes = detect->buf_read - detect->position;
+  if (n <= buffered_bytes) {
+    // Everything requested is already in the detect buffer.
+    return n;
+  }
+  // Append only the missing tail after the data that is already buffered.
+  const size_t fresh = fread(detect->buf + detect->buf_read, 1,
+                             n - buffered_bytes, input_ctx->file);
+  detect->buf_read += fresh;
+  return buffered_bytes + fresh;
+}
+
+// Reads 'n' bytes into 'buf'. With 'buffered' set, the bytes are first staged
+// in the detect buffer; if fewer than 'n' could be staged, that count is
+// returned without reading into 'buf'. Otherwise the bytes are delivered
+// through read_from_input().
+size_t buffer_input(struct AvxInputContext *input_ctx, size_t n,
+                    unsigned char *buf, bool buffered) {
+  if (buffered) {
+    const size_t staged = input_to_detect_buf(input_ctx, n);
+    if (staged < n) {
+      return staged;
+    }
+  }
+  return read_from_input(input_ctx, n, buf);
+}
+
+// Moves the detect buffer read position back to its start, so that the bytes
+// already buffered are handed out again.
+void rewind_detect(struct AvxInputContext *input_ctx) {
+  struct FileTypeDetectionBuffer *const detect = &input_ctx->detect;
+  detect->position = 0;
+}
+
+// Returns true when the input file has reached end-of-file and every byte of
+// the detect buffer has been consumed.
+bool input_eof(struct AvxInputContext *input_ctx) {
+  if (!feof(input_ctx->file)) {
+    return false;
+  }
+  return input_ctx->detect.position == input_ctx->detect.buf_read;
+}
diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h
new file mode 100644
index 0000000000..b31371c670
--- /dev/null
+++ b/third_party/aom/common/tools_common.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_TOOLS_COMMON_H_
+#define AOM_COMMON_TOOLS_COMMON_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/msvc.h"
+
+#if CONFIG_AV1_ENCODER
+#include "common/y4minput.h"
+#endif
+
+#if defined(_MSC_VER)
+/* MSVS uses _f{seek,tell}i64. */
+#define fseeko _fseeki64
+#define ftello _ftelli64
+typedef int64_t FileOffset;
+#elif defined(_WIN32)
+#include <sys/types.h> /* NOLINT*/
+/* MinGW uses f{seek,tell}o64 for large files. */
+#define fseeko fseeko64
+#define ftello ftello64
+typedef off64_t FileOffset;
+#elif CONFIG_OS_SUPPORT
+#include <sys/types.h> /* NOLINT*/
+typedef off_t FileOffset;
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#else
+#define fseeko fseek
+#define ftello ftell
+typedef long FileOffset; /* NOLINT */
+#endif /* CONFIG_OS_SUPPORT */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h> /* NOLINT */
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h> /* NOLINT */
+#endif /* _MSC_VER */
+#endif /* CONFIG_OS_SUPPORT */
+
+/* Builds a 64-bit constant from two 32-bit halves. Both arguments are
+ * parenthesized so that compound expressions are combined as a whole. */
+#define LITERALU64(hi, lo) ((((uint64_t)(hi)) << 32) | (lo))
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+#define OBU_DETECTION_SZ 34 // See common/obudec.c
+
+#define DETECT_BUF_SZ 34 // Max of the above header sizes
+
+#define AV1_FOURCC 0x31305641
+
+// Kinds of input file; stored in AvxInputContext::file_type.
+enum VideoFileType {
+ FILE_TYPE_OBU,
+ FILE_TYPE_RAW,
+ FILE_TYPE_IVF,
+ FILE_TYPE_Y4M,
+ FILE_TYPE_WEBM
+};
+
+// The fourcc for large_scale_tile encoding is "LSTC".
+#define LST_FOURCC 0x4354534c
+
+// Small buffer holding the first bytes of an input so that they can be
+// inspected and later handed out again by the read helpers.
+struct FileTypeDetectionBuffer {
+ // Storage for the buffered bytes.
+ char buf[DETECT_BUF_SZ];
+ // Number of bytes of 'buf' that have been filled from the file.
+ size_t buf_read;
+ // Offset of the next unconsumed byte; buf_read - position bytes are pending.
+ size_t position;
+};
+
+// A rational number stored as an integer numerator/denominator pair.
+struct AvxRational {
+ int numerator;
+ int denominator;
+};
+
+// Describes one input stream used by the tools, together with the state of
+// its file-type detection buffer.
+struct AvxInputContext {
+ const char *filename;
+ // Stream that the read helpers pull data from.
+ FILE *file;
+ // NOTE(review): presumably the input length in bytes -- confirm with users.
+ int64_t length;
+ // Bytes read ahead for file-type detection; consumed before 'file'.
+ struct FileTypeDetectionBuffer detect;
+ enum VideoFileType file_type;
+ uint32_t width;
+ uint32_t height;
+ struct AvxRational pixel_aspect_ratio;
+ aom_img_fmt_t fmt;
+ aom_bit_depth_t bit_depth;
+ int only_i420;
+ uint32_t fourcc;
+ struct AvxRational framerate;
+ // Y4M reader state; only present in encoder builds.
+#if CONFIG_AV1_ENCODER
+ y4m_input y4m;
+#endif
+ aom_color_range_t color_range;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+#define AOM_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define AOM_NO_RETURN __declspec(noreturn)
+#else
+#define AOM_NO_RETURN
+#endif
+
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef AOM_TOOLS_FORMAT_PRINTF
+#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
+/* Sets a stdio stream into binary mode */
+FILE *set_binary_mode(FILE *stream);
+
+AOM_NO_RETURN void die(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
+AOM_NO_RETURN void fatal(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
+void aom_tools_warn(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2);
+
+AOM_NO_RETURN void die_codec(aom_codec_ctx_t *ctx, const char *s);
+
+/* The tool including this file must define usage_exit() */
+AOM_NO_RETURN void usage_exit(void);
+
+#undef AOM_NO_RETURN
+
+// The AOM library can support different encoders / decoders. These
+// functions provide different ways to lookup / iterate through them.
+// The return result may be NULL to indicate no codec was found.
+int get_aom_encoder_count(void);
+aom_codec_iface_t *get_aom_encoder_by_index(int i);
+aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name);
+// If the interface is unknown, returns NULL.
+const char *get_short_name_by_aom_encoder(aom_codec_iface_t *encoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface);
+
+int get_aom_decoder_count(void);
+aom_codec_iface_t *get_aom_decoder_by_index(int i);
+aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name);
+aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc);
+const char *get_short_name_by_aom_decoder(aom_codec_iface_t *decoder);
+// If the interface is unknown, returns 0.
+uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface);
+
+const char *image_format_to_string(aom_img_fmt_t fmt);
+
+int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
+
+void aom_img_write(const aom_image_t *img, FILE *file);
+// Returns true on success, false on failure.
+bool aom_img_read(aom_image_t *img, FILE *file);
+
+// Converts a sum of squared errors ('sse') over 'samples' samples with peak
+// value 'peak' into PSNR in dB; the result is capped at 100.
+double sse_to_psnr(double samples, double peak, double sse);
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+ int down_shift);
+// Returns true on success, false on failure.
+bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr,
+ aom_image_t **img_shifted_ptr);
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
+
+// Output in NV12 format.
+void aom_img_write_nv12(const aom_image_t *img, FILE *file);
+
+size_t read_from_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf);
+size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n);
+size_t buffer_input(struct AvxInputContext *input_ctx, size_t n,
+ unsigned char *buf, bool buffered);
+void rewind_detect(struct AvxInputContext *input_ctx);
+bool input_eof(struct AvxInputContext *input_ctx);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_TOOLS_COMMON_H_
diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h
new file mode 100644
index 0000000000..bf95031be6
--- /dev/null
+++ b/third_party/aom/common/video_common.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_COMMON_H_
+#define AOM_COMMON_VIDEO_COMMON_H_
+
+#include "common/tools_common.h"
+
+// Stream-level properties shared by the video reader and video writer helpers.
+typedef struct {
+ // FourCC code identifying the codec of the stream.
+ uint32_t codec_fourcc;
+ // Frame dimensions as reported by the container header.
+ int frame_width;
+ int frame_height;
+ // Time base of the stream; the writer copies it into the IVF file header.
+ struct AvxRational time_base;
+ // Copied from the OBU input context when the input is detected as OBU.
+ unsigned int is_annexb;
+} AvxVideoInfo;
+
+#endif // AOM_COMMON_VIDEO_COMMON_H_
diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c
new file mode 100644
index 0000000000..27f69a9672
--- /dev/null
+++ b/third_party/aom/common/video_reader.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+#include "common/webmdec.h"
+
+// Private state behind the opaque AvxVideoReader handle.
+struct AvxVideoReaderStruct {
+ // Stream properties filled in by aom_video_reader_open().
+ AvxVideoInfo info;
+ // Generic input state (file handle, detected file type, header fields).
+ struct AvxInputContext input_ctx;
+ // Container-specific state; only the one matching input_ctx.file_type is used.
+ struct ObuDecInputContext obu_ctx;
+ struct WebmInputContext webm_ctx;
+ // Frame buffer managed by the container-specific read functions.
+ uint8_t *buffer;
+ // Allocated size of |buffer| in bytes.
+ size_t buffer_size;
+ // Size in bytes of the frame most recently read into |buffer|.
+ size_t frame_size;
+ // Presentation timestamp; in this file only ivf_read_frame() is given it.
+ aom_codec_pts_t pts;
+};
+
+// Opens |filename| ("-" selects stdin in binary mode), probes the container
+// type and returns a newly allocated reader, or NULL when the input cannot be
+// opened, memory cannot be allocated, or the container type is not recognized.
+AvxVideoReader *aom_video_reader_open(const char *filename) {
+ AvxVideoReader *reader = NULL;
+ const bool using_file = strcmp(filename, "-") != 0;
+ FILE *const file =
+ using_file ? fopen(filename, "rb") : set_binary_mode(stdin);
+ if (!file) return NULL; // Can't open file
+
+ // calloc() zero-initializes every context, including the buffer pointers.
+ reader = (AvxVideoReader *)calloc(1, sizeof(*reader));
+ if (!reader) {
+ fclose(file);
+ return NULL; // Can't allocate AvxVideoReader
+ }
+
+ // Note: only the pointer is stored; the caller's string is not copied.
+ reader->input_ctx.filename = filename;
+ reader->input_ctx.file = file;
+ reader->obu_ctx.avx_ctx = &reader->input_ctx;
+ reader->obu_ctx.is_annexb = 1;
+
+ // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+ // from stdin yet, and file_is_webm is not using the detect buffer when
+ // determining the type. Therefore it should only be checked when using a file
+ // and needs to be checked prior to other types.
+ // The always-false leading branch lets the WebM probe be compiled out while
+ // keeping the remaining probes a plain "else if" chain.
+ if (false) {
+#if CONFIG_WEBM_IO
+ } else if (using_file &&
+ file_is_webm(&reader->webm_ctx, &reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_WEBM;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
+#endif
+ } else if (file_is_ivf(&reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_IVF;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
+ } else if (file_is_obu(&reader->obu_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_OBU;
+ // assume AV1
+ reader->info.codec_fourcc = AV1_FOURCC;
+ reader->info.is_annexb = reader->obu_ctx.is_annexb;
+ } else {
+ fclose(file);
+ free(reader);
+ return NULL; // Unknown file type
+ }
+
+ return reader;
+}
+
+void aom_video_reader_close(AvxVideoReader *reader) {
+ if (reader) {
+ fclose(reader->input_ctx.file);
+ if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
+ obudec_free(&reader->obu_ctx);
+ }
+ free(reader->buffer);
+ free(reader);
+ }
+}
+
+// Reads the next frame into the reader's internal buffer by dispatching on the
+// detected container type. The container reader's status is logically negated,
+// so the result is 1 when that reader returned 0 and 0 otherwise.
+int aom_video_reader_read_frame(AvxVideoReader *reader) {
+  switch (reader->input_ctx.file_type) {
+    case FILE_TYPE_IVF:
+      return !ivf_read_frame(&reader->input_ctx, &reader->buffer,
+                             &reader->frame_size, &reader->buffer_size,
+                             &reader->pts);
+    case FILE_TYPE_OBU:
+      return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer,
+                                        &reader->frame_size,
+                                        &reader->buffer_size);
+#if CONFIG_WEBM_IO
+    case FILE_TYPE_WEBM:
+      return !webm_read_frame(&reader->webm_ctx, &reader->buffer,
+                              &reader->frame_size, &reader->buffer_size);
+#endif
+    default:
+      // aom_video_reader_open() never produces any other file type.
+      assert(0);
+      return 0;
+  }
+}
+
+// Returns the internal frame buffer. When |size| is non-NULL it receives the
+// byte count of the most recently read frame.
+const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader,
+                                          size_t *size) {
+  if (size != NULL) {
+    *size = reader->frame_size;
+  }
+  return reader->buffer;
+}
+
+// Returns the presentation timestamp stored in the reader, widened to int64_t.
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) {
+  const aom_codec_pts_t pts = reader->pts;
+  return (int64_t)pts;
+}
+
+// Exposes the FILE handle the reader reads from; ownership stays with the
+// reader.
+FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
+  FILE *const input_file = reader->input_ctx.file;
+  return input_file;
+}
+
+// Returns a read-only view of the stream properties stored inside |reader|.
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
+  const AvxVideoInfo *const stream_info = &reader->info;
+  return stream_info;
+}
+
+// Overrides the codec FourCC recorded for the opened stream.
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) {
+  AvxVideoInfo *const stream_info = &reader->info;
+  stream_info->codec_fourcc = fourcc;
+}
diff --git a/third_party/aom/common/video_reader.h b/third_party/aom/common/video_reader.h
new file mode 100644
index 0000000000..9ab439e8af
--- /dev/null
+++ b/third_party/aom/common/video_reader.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_READER_H_
+#define AOM_COMMON_VIDEO_READER_H_
+
+#include "common/video_common.h"
+
+// The following code is work in progress. It supports transparent reading of
+// input files: video_reader.c recognizes IVF, OBU and (when built with
+// CONFIG_WEBM_IO) WebM inputs. The main goal of the API is to be simple and
+// easy to use in example code and in aomenc/aomdec later. All low-level details
+// like memory buffer management are hidden from API users.
+struct AvxVideoReaderStruct;
+typedef struct AvxVideoReaderStruct AvxVideoReader;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opens the input file for reading and inspects it to determine file type.
+// Returns an opaque AvxVideoReader* upon success, or NULL upon failure.
+// IVF and OBU inputs are recognized, plus WebM when built with CONFIG_WEBM_IO.
+AvxVideoReader *aom_video_reader_open(const char *filename);
+
+// Frees all resources associated with AvxVideoReader* returned from
+// aom_video_reader_open() call.
+void aom_video_reader_close(AvxVideoReader *reader);
+
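+// Reads the next frame from the file and stores it in the internal buffer.
+// Returns 1 if the underlying container reader returned 0 (success), and 0
+// otherwise.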
+int aom_video_reader_read_frame(AvxVideoReader *reader);
+
+// Returns the pointer to memory buffer with frame data read by last call to
+// aom_video_reader_read_frame().
+const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size);
+
+// Returns the pts of the frame.
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader);
+// Return the reader file.
+FILE *aom_video_reader_get_file(AvxVideoReader *reader);
+
+// Returns a pointer to the AvxVideoInfo describing the opened video file. The
+// pointed-to data is owned by the reader.
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
+
+// Set fourcc.
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_VIDEO_READER_H_
diff --git a/third_party/aom/common/video_writer.c b/third_party/aom/common/video_writer.c
new file mode 100644
index 0000000000..1d4328ae1e
--- /dev/null
+++ b/third_party/aom/common/video_writer.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "common/video_writer.h"
+
+#include <stdlib.h>
+
+#include "aom/aom_encoder.h"
+#include "common/ivfenc.h"
+
+// Private state behind the opaque AvxVideoWriter handle.
+struct AvxVideoWriterStruct {
+ // Copy of the stream properties supplied to aom_video_writer_open().
+ AvxVideoInfo info;
+ // Output file, opened in "wb" mode.
+ FILE *file;
+ // Number of frames written so far; stored in the header on close.
+ int frame_count;
+};
+
+// Writes an IVF file header for |info| at the current position of |file|.
+// Only the dimension and time base fields of the temporary encoder config are
+// populated before it is handed to ivf_write_file_header().
+static void write_header(FILE *file, const AvxVideoInfo *info,
+                         int frame_count) {
+  struct aom_codec_enc_cfg enc_cfg;
+
+  enc_cfg.g_timebase.num = info->time_base.numerator;
+  enc_cfg.g_timebase.den = info->time_base.denominator;
+  enc_cfg.g_w = info->frame_width;
+  enc_cfg.g_h = info->frame_height;
+
+  ivf_write_file_header(file, &enc_cfg, info->codec_fourcc, frame_count);
+}
+
+// Creates |filename| and writes a provisional IVF header with a frame count of
+// zero. Returns NULL for any container other than kContainerIVF, when the file
+// cannot be opened, or when the writer cannot be allocated.
+AvxVideoWriter *aom_video_writer_open(const char *filename,
+                                      AvxContainer container,
+                                      const AvxVideoInfo *info) {
+  if (container != kContainerIVF) return NULL;
+
+  FILE *const out_file = fopen(filename, "wb");
+  if (out_file == NULL) return NULL;
+
+  AvxVideoWriter *const new_writer = malloc(sizeof(*new_writer));
+  if (new_writer == NULL) {
+    fclose(out_file);
+    return NULL;
+  }
+
+  new_writer->file = out_file;
+  new_writer->info = *info;
+  new_writer->frame_count = 0;
+
+  // The real frame count is patched in by aom_video_writer_close().
+  write_header(new_writer->file, info, 0);
+
+  return new_writer;
+}
+
+// Finalizes the file by rewriting the header with the real frame count, then
+// closes the file and frees the writer. A NULL |writer| is ignored.
+void aom_video_writer_close(AvxVideoWriter *writer) {
+  if (writer == NULL) return;
+
+  // Seek back to the start so the provisional header is overwritten in place.
+  rewind(writer->file);
+  write_header(writer->file, &writer->info, writer->frame_count);
+
+  fclose(writer->file);
+  free(writer);
+}
+
+// Appends one frame (IVF frame header followed by |size| payload bytes).
+// Returns 1 on success, or 0 when the payload could not be written completely;
+// the frame counter is only advanced on success.
+int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
+                                 size_t size, int64_t pts) {
+  ivf_write_frame_header(writer->file, pts, size);
+
+  const size_t bytes_written = fwrite(buffer, 1, size, writer->file);
+  if (bytes_written != size) {
+    return 0;
+  }
+
+  writer->frame_count++;
+  return 1;
+}
+
+// Overrides the codec FourCC that will be stored in the file header when the
+// writer is closed.
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) {
+  AvxVideoInfo *const stream_info = &writer->info;
+  stream_info->codec_fourcc = fourcc;
+}
diff --git a/third_party/aom/common/video_writer.h b/third_party/aom/common/video_writer.h
new file mode 100644
index 0000000000..8712d47a58
--- /dev/null
+++ b/third_party/aom/common/video_writer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_WRITER_H_
+#define AOM_COMMON_VIDEO_WRITER_H_
+
+#include "common/video_common.h"
+
+// Container formats accepted by aom_video_writer_open(). IVF is the only one.
+enum { kContainerIVF } UENUM1BYTE(AvxContainer);
+
+struct AvxVideoWriterStruct;
+typedef struct AvxVideoWriterStruct AvxVideoWriter;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Finds and opens writer for specified container format.
+// Returns an opaque AvxVideoWriter* upon success, or NULL upon failure.
+// Right now only IVF format is supported.
+AvxVideoWriter *aom_video_writer_open(const char *filename,
+ AvxContainer container,
+ const AvxVideoInfo *info);
+
+// Frees all resources associated with AvxVideoWriter* returned from
+// aom_video_writer_open() call.
+void aom_video_writer_close(AvxVideoWriter *writer);
+
+// Writes frame bytes to the file. Returns 1 on success, or 0 if the frame
+// payload could not be written completely.
+int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
+ size_t size, int64_t pts);
+// Set fourcc.
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_VIDEO_WRITER_H_
diff --git a/third_party/aom/common/warnings.c b/third_party/aom/common/warnings.c
new file mode 100644
index 0000000000..a20531cb8b
--- /dev/null
+++ b/third_party/aom/common/warnings.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/warnings.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "apps/aomenc.h"
+#include "common/tools_common.h"
+
+// Message queued by check_quantizer() when the min/max quantizers are too
+// close to each other.
+static const char quantizer_warning_string[] =
+ "Bad quantizer values. Quantizer values should not be equal, and should "
+ "differ by at least 8.";
+
+// One entry of the singly linked list of pending warnings.
+struct WarningListNode {
+ // Message text; the node stores the pointer only and never frees it.
+ const char *warning_string;
+ // Next entry, or NULL at the tail of the list.
+ struct WarningListNode *next_warning;
+};
+
+// Head of the warning list; an empty list has a NULL head.
+struct WarningList {
+ struct WarningListNode *warning_node;
+};
+
+// Appends |warning_string| to the tail of |warning_list|. The string pointer is
+// stored as-is. Allocation failure is reported through fatal().
+static void add_warning(const char *warning_string,
+                        struct WarningList *warning_list) {
+  struct WarningListNode *const entry = malloc(sizeof(*entry));
+  if (!entry) {
+    fatal("Unable to allocate warning node.");
+  }
+
+  entry->next_warning = NULL;
+  entry->warning_string = warning_string;
+
+  // Walk the chain of "next" links until the terminating NULL slot is found.
+  struct WarningListNode **tail_link = &warning_list->warning_node;
+  while (*tail_link) {
+    tail_link = &(*tail_link)->next_warning;
+  }
+  *tail_link = entry;
+}
+
+// Frees every node of |warning_list| and leaves the list empty. The warning
+// strings themselves are not freed.
+static void free_warning_list(struct WarningList *warning_list) {
+  struct WarningListNode *current = warning_list->warning_node;
+  while (current != NULL) {
+    struct WarningListNode *const following = current->next_warning;
+    free(current);
+    current = following;
+  }
+  warning_list->warning_node = NULL;
+}
+
+// Asks on stderr whether to continue despite |num_warnings| warnings and reads
+// a single character from stdin. Returns nonzero only for a lowercase 'y'.
+static int continue_prompt(int num_warnings) {
+  fprintf(stderr,
+          "%d encoder configuration warning(s). Continue? (y to continue) ",
+          num_warnings);
+  const int answer = getchar();
+  return answer == 'y';
+}
+
+// Queues the quantizer warning when |min_q| and |max_q| are fewer than 8 apart.
+// The lossless configuration (both zero) is exempt.
+static void check_quantizer(int min_q, int max_q,
+                            struct WarningList *warning_list) {
+  if (min_q == 0 && max_q == 0) return;  // Lossless: nothing to check.
+
+  // Equal quantizers have a spread of 0, so this single test also covers the
+  // "must not be equal" requirement.
+  const int spread = abs(max_q - min_q);
+  if (spread < 8) {
+    add_warning(quantizer_warning_string, warning_list);
+  }
+}
+
+// Runs the configuration checks on |stream_config|, prints each resulting
+// warning, and, unless |disable_prompt| is set, asks the user whether to
+// continue. The process exits with EXIT_FAILURE when the user declines.
+// |global_config| is currently unused.
+void check_encoder_config(int disable_prompt,
+                          const struct AvxEncoderConfig *global_config,
+                          const struct aom_codec_enc_cfg *stream_config) {
+  struct WarningList warning_list = { 0 };
+  int num_warnings = 0;
+  (void)global_config;
+
+  check_quantizer(stream_config->rc_min_quantizer,
+                  stream_config->rc_max_quantizer, &warning_list);
+
+  /* Count and print warnings. */
+  const struct WarningListNode *warning = warning_list.warning_node;
+  while (warning != NULL) {
+    aom_tools_warn("%s", warning->warning_string);
+    ++num_warnings;
+    warning = warning->next_warning;
+  }
+
+  free_warning_list(&warning_list);
+
+  if (num_warnings != 0 && !disable_prompt &&
+      !continue_prompt(num_warnings)) {
+    exit(EXIT_FAILURE);
+  }
+}
diff --git a/third_party/aom/common/warnings.h b/third_party/aom/common/warnings.h
new file mode 100644
index 0000000000..36f1fe0706
--- /dev/null
+++ b/third_party/aom/common/warnings.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WARNINGS_H_
+#define AOM_COMMON_WARNINGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_codec_enc_cfg;
+struct AvxEncoderConfig;
+
+/*
+ * Checks config for improperly used settings. Warns user upon encountering
+ * settings that will lead to poor output quality. Prompts user to continue
+ * when warnings are issued.
+ */
+void check_encoder_config(int disable_prompt,
+ const struct AvxEncoderConfig *global_config,
+ const struct aom_codec_enc_cfg *stream_config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WARNINGS_H_
diff --git a/third_party/aom/common/webmdec.cc b/third_party/aom/common/webmdec.cc
new file mode 100644
index 0000000000..33bda59021
--- /dev/null
+++ b/third_party/aom/common/webmdec.cc
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmdec.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <new>
+
+#include "third_party/libwebm/mkvparser/mkvparser.h"
+#include "third_party/libwebm/mkvparser/mkvreader.h"
+
+namespace {
+
+// Destroys the parser objects and the frame buffer owned by |webm_ctx| and
+// returns the iteration state to its initial values. |reached_eos| is left
+// untouched.
+void reset(struct WebmInputContext *const webm_ctx) {
+  // delete and delete[] are no-ops on null pointers, so no guards are needed.
+  delete static_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+  delete static_cast<mkvparser::Segment *>(webm_ctx->segment);
+  delete[] webm_ctx->buffer;
+
+  // Owned objects.
+  webm_ctx->reader = NULL;
+  webm_ctx->segment = NULL;
+  webm_ctx->buffer = NULL;
+
+  // Cursor into the segment.
+  webm_ctx->cluster = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block = NULL;
+  webm_ctx->block_frame_index = 0;
+
+  // Per-stream and per-frame metadata.
+  webm_ctx->video_track_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->is_key_frame = false;
+}
+
+// Positions the context's cluster cursor on the first cluster of the segment.
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+  mkvparser::Segment *const seg =
+      static_cast<mkvparser::Segment *>(webm_ctx->segment);
+  webm_ctx->cluster = seg->GetFirst();
+}
+
+// Failure-path helper for file_is_webm(): rewinds the input file to its start
+// and then releases everything stored in |webm_ctx|.
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+ struct AvxInputContext *const aom_ctx) {
+ rewind(aom_ctx->file);
+ reset(webm_ctx);
+}
+
+} // namespace
+
+// Probes |aom_ctx->file| for a WebM stream carrying an AV1 video track.
+// On success returns 1 with |webm_ctx| holding the parser objects and pointing
+// at the first cluster, and with the fourcc, width and height of |aom_ctx|
+// filled in. On any failure the file is rewound, |webm_ctx| is reset and 0 is
+// returned.
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx) {
+ // Stored in the context right away so rewind_and_reset() can delete it.
+ mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file);
+ webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
+
+ mkvparser::EBMLHeader header;
+ long long pos = 0;
+ if (header.Parse(reader, pos) < 0) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ mkvparser::Segment *segment;
+ if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+ webm_ctx->segment = segment;
+ if (segment->Load() < 0) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ // Use the first track of type video; any further video tracks are ignored.
+ const mkvparser::Tracks *const tracks = segment->GetTracks();
+ const mkvparser::VideoTrack *video_track = NULL;
+ for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+ const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
+ if (track->GetType() == mkvparser::Track::kVideo) {
+ video_track = static_cast<const mkvparser::VideoTrack *>(track);
+ webm_ctx->video_track_index = static_cast<int>(track->GetNumber());
+ break;
+ }
+ }
+
+ if (video_track == NULL || video_track->GetCodecId() == NULL) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ // Only codec ids starting with "V_AV1" are accepted.
+ if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) {
+ aom_ctx->fourcc = AV1_FOURCC;
+ } else {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ // The frame rate is left as 0/0 here; webm_guess_framerate() can fill it in.
+ aom_ctx->framerate.denominator = 0;
+ aom_ctx->framerate.numerator = 0;
+ aom_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+ aom_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+
+ get_first_cluster(webm_ctx);
+
+ return 1;
+}
+
+// Reads the next frame of the selected video track into |*buffer|, growing the
+// buffer when the frame does not fit. |*buffer| must equal webm_ctx->buffer on
+// entry (both NULL before the first call).
+// Returns 0 on success, 1 at end of stream (with |*bytes_read| set to 0 the
+// first time the end is reached) and -1 on a parse, read or allocation error.
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+                    size_t *bytes_read, size_t *buffer_size) {
+  assert(webm_ctx->buffer == *buffer);
+  // This check is needed for frame parallel decoding, in which case this
+  // function could be called even after it has reached end of input stream.
+  if (webm_ctx->reached_eos) {
+    return 1;
+  }
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+  const mkvparser::Cluster *cluster =
+      reinterpret_cast<const mkvparser::Cluster *>(webm_ctx->cluster);
+  const mkvparser::Block *block =
+      reinterpret_cast<const mkvparser::Block *>(webm_ctx->block);
+  const mkvparser::BlockEntry *block_entry =
+      reinterpret_cast<const mkvparser::BlockEntry *>(webm_ctx->block_entry);
+  bool block_entry_eos = false;
+  // Advance the (cluster, block entry, block) cursor until it rests on a block
+  // of the video track that still has an unread frame.
+  do {
+    long status = 0;
+    bool get_new_block = false;
+    if (block_entry == NULL && !block_entry_eos) {
+      // First call for this cluster: start at its first block entry.
+      status = cluster->GetFirst(block_entry);
+      get_new_block = true;
+    } else if (block_entry_eos || block_entry->EOS()) {
+      // Current cluster exhausted: move on to the next one, if any.
+      cluster = segment->GetNext(cluster);
+      if (cluster == NULL || cluster->EOS()) {
+        *bytes_read = 0;
+        webm_ctx->reached_eos = 1;
+        return 1;
+      }
+      status = cluster->GetFirst(block_entry);
+      block_entry_eos = false;
+      get_new_block = true;
+    } else if (block == NULL ||
+               webm_ctx->block_frame_index == block->GetFrameCount() ||
+               block->GetTrackNumber() != webm_ctx->video_track_index) {
+      // Current block is used up or belongs to another track: next entry.
+      status = cluster->GetNext(block_entry, block_entry);
+      if (block_entry == NULL || block_entry->EOS()) {
+        block_entry_eos = true;
+        continue;
+      }
+      get_new_block = true;
+    }
+    if (status || block_entry == NULL) {
+      return -1;
+    }
+    if (get_new_block) {
+      block = block_entry->GetBlock();
+      if (block == NULL) return -1;
+      webm_ctx->block_frame_index = 0;
+    }
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
+
+  webm_ctx->cluster = cluster;
+  webm_ctx->block_entry = block_entry;
+  webm_ctx->block = block;
+
+  const mkvparser::Block::Frame &frame =
+      block->GetFrame(webm_ctx->block_frame_index);
+  ++webm_ctx->block_frame_index;
+  if (frame.len > static_cast<long>(*buffer_size)) {
+    delete[] * buffer;
+    // Plain new[] reports failure by throwing, which would leave |*buffer| and
+    // webm_ctx->buffer dangling after the delete[] above and make the NULL
+    // check below dead code. The nothrow form makes the check effective.
+    *buffer = new (std::nothrow) uint8_t[frame.len];
+    webm_ctx->buffer = *buffer;
+    if (*buffer == NULL) {
+      // Keep the recorded capacity consistent with the now-empty buffer.
+      *buffer_size = 0;
+      return -1;
+    }
+    *buffer_size = frame.len;
+  }
+  *bytes_read = frame.len;
+  webm_ctx->timestamp_ns = block->GetTime(cluster);
+  webm_ctx->is_key_frame = block->IsKey();
+
+  mkvparser::MkvReader *const reader =
+      reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+  return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+// Returns the greatest common divisor of |a| and |b| using Euclid's algorithm.
+// The loop only runs while the divisor is positive, so |a| is returned
+// unchanged when |b| is zero or negative.
+static int gcd(int a, int b) {
+  for (; b > 0;) {
+    const int rest = a % b;
+    a = b;
+    b = rest;
+  }
+  return a;
+}
+
+// Estimates the frame rate by reading frames until about one second of
+// timestamps has been covered or 50 frames have been read, then stores the
+// reduced fraction in |aom_ctx->framerate| and rewinds |webm_ctx| to the first
+// cluster. Must be called before any frame has been read (the context buffer
+// must still be NULL). Always returns 0.
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct AvxInputContext *aom_ctx) {
+  uint32_t i = 0;
+  uint8_t *buffer = NULL;
+  size_t buffer_size = 0;
+  size_t bytes_read = 0;
+  assert(webm_ctx->buffer == NULL);
+  while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
+    if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) {
+      break;
+    }
+    ++i;
+  }
+  // |i| is unsigned: when not a single frame could be read, (i - 1) would wrap
+  // around and yield a garbage numerator. Report an unknown rate (0) instead.
+  aom_ctx->framerate.numerator =
+      (i > 0) ? static_cast<int>((i - 1) * 1000000) : 0;
+  aom_ctx->framerate.denominator =
+      static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  // Fraction might be represented in large numbers, like 49000000/980000
+  // for 50fps. Simplify as much as possible.
+  int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator);
+  if (g != 0) {
+    aom_ctx->framerate.numerator /= g;
+    aom_ctx->framerate.denominator /= g;
+  }
+
+  // The probe buffer is local to this function; drop it and restore the
+  // context to the state file_is_webm() left it in.
+  delete[] buffer;
+  webm_ctx->buffer = NULL;
+
+  get_first_cluster(webm_ctx);
+  webm_ctx->block = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->reached_eos = 0;
+
+  return 0;
+}
+
+// Public entry point for releasing everything held by |webm_ctx|.
+void webm_free(struct WebmInputContext *webm_ctx) {
+  reset(webm_ctx);
+}
diff --git a/third_party/aom/common/webmdec.h b/third_party/aom/common/webmdec.h
new file mode 100644
index 0000000000..fcbdeffe4d
--- /dev/null
+++ b/third_party/aom/common/webmdec.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMDEC_H_
+#define AOM_COMMON_WEBMDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AvxInputContext;
+
+// State of the WebM demuxer. The void pointers hide the C++ mkvparser types
+// from C callers.
+struct WebmInputContext {
+ // mkvparser::MkvReader and mkvparser::Segment, owned by this context.
+ void *reader;
+ void *segment;
+ // Frame buffer allocated with new[] by webm_read_frame().
+ uint8_t *buffer;
+ // Read cursor: current cluster, block entry and block within the segment.
+ const void *cluster;
+ const void *block_entry;
+ const void *block;
+ // Index of the next frame to read inside the current block.
+ int block_frame_index;
+ // Track number of the video track selected by file_is_webm().
+ int video_track_index;
+ // Timestamp and key-frame flag of the most recently read frame.
+ int64_t timestamp_ns;
+ int is_key_frame;
+ // Set to 1 once webm_read_frame() has run out of clusters.
+ int reached_eos;
+};
+
+// Checks if the input is a WebM file. If so, initializes WebMInputContext so
+// that webm_read_frame can be called to retrieve a video frame.
+// Returns 1 on success and 0 on failure or input is not WebM file.
+// TODO(vigneshv): Refactor this function into two smaller functions specific
+// to their task.
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx);
+
+// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed
+// by this function. For the first call, |*buffer| should be NULL and
+// |*buffer_size| should be 0. Once all the frames are read and used,
+// webm_free() should be called, otherwise there will be a leak.
+// Parameters:
+// webm_ctx - WebmInputContext object
+// buffer - pointer where the frame data will be filled.
+// bytes_read - pointer to bytes read.
+// buffer_size - pointer to buffer size.
+// Return values:
+// 0 - Success
+// 1 - End of Stream
+// -1 - Error
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size);
+
+// Guesses the frame rate of the input file based on the container timestamps.
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx);
+
+// Resets the WebMInputContext.
+void webm_free(struct WebmInputContext *webm_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WEBMDEC_H_
diff --git a/third_party/aom/common/webmenc.cc b/third_party/aom/common/webmenc.cc
new file mode 100644
index 0000000000..bb754e8119
--- /dev/null
+++ b/third_party/aom/common/webmenc.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmenc.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <memory>
+#include <new>
+#include <string>
+
+#include "common/av1_config.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxer.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+#include "third_party/libwebm/mkvmuxer/mkvwriter.h"
+
+namespace {
+const uint64_t kDebugTrackUid = 0xDEADBEEF;
+const int kVideoTrackNumber = 1;
+
+// Classifies a command-line argument for extract_encoder_settings().
+// Returns how many argv entries to skip: 1 for the input file name or an
+// "--output=<file>" flag, 2 for "-o"/"--output" (the flag plus its value), and
+// 0 for any other argument.
+int skip_input_output_arg(const char *arg, const char *input_fname) {
+  static const char kOutputWithValue[] = "--output=";
+
+  if (!strcmp(arg, input_fname)) return 1;
+  if (!strcmp(arg, "-o") || !strcmp(arg, "--output")) return 2;
+
+  const bool has_output_prefix =
+      strncmp(arg, kOutputWithValue, sizeof(kOutputWithValue) - 1) == 0;
+  return has_output_prefix ? 1 : 0;
+}
+
+} // namespace
+
+// Appends |prefix| followed by |text| at |*cur|, which has |*remaining| bytes
+// available, and advances both. Returns false if formatting fails or the
+// output does not fit.
+static bool append_setting(char **cur, size_t *remaining, const char *prefix,
+                           const char *text) {
+  const int written = snprintf(*cur, *remaining, "%s%s", prefix, text);
+  if (written < 0 || static_cast<size_t>(written) >= *remaining) return false;
+  *cur += written;
+  *remaining -= static_cast<size_t>(written);
+  return true;
+}
+
+// Builds "version:<version> <arg> <arg> ..." from argv[1..argc-1], leaving out
+// the input file name and the output flags (see skip_input_output_arg()).
+// The caller must free() the returned string. Returns nullptr when memory is
+// exhausted or the string cannot be formatted.
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+                               const char *input_fname) {
+  // + 9 for "version:" prefix and for null terminator.
+  size_t total_size = strlen(version) + 9;
+  int i = 1;
+  while (i < argc) {
+    const int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      total_size += strlen(argv[i]) + 1;  // + 1 is for space separator.
+      ++i;
+    }
+  }
+  char *result = static_cast<char *>(malloc(total_size));
+  if (result == nullptr) {
+    return nullptr;
+  }
+
+  // Every write is bounded by the space that is actually left, not by the
+  // size of the whole allocation.
+  char *cur = result;
+  size_t remaining = total_size;
+  if (!append_setting(&cur, &remaining, "version:", version)) {
+    free(result);
+    return nullptr;
+  }
+  i = 1;
+  while (i < argc) {
+    const int num_skip = skip_input_output_arg(argv[i], input_fname);
+    i += num_skip;
+    if (num_skip == 0) {
+      if (!append_setting(&cur, &remaining, " ", argv[i])) {
+        free(result);
+        return nullptr;
+      }
+      ++i;
+    }
+  }
+  return result;
+}
+
+// Creates the mkvmuxer writer and segment for |webm_ctx->stream|, adds the
+// single AV1 video track described by |cfg| and stores the codec private data
+// taken from the encoder's global headers. On success the writer and segment
+// are handed over to |webm_ctx| and 0 is returned. On any failure -1 is
+// returned and the unique_ptrs destroy whatever was created.
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+ aom_codec_ctx_t *encoder_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ stereo_format_t stereo_fmt, unsigned int fourcc,
+ const struct AvxRational *par,
+ const char *encoder_settings) {
+ std::unique_ptr<mkvmuxer::MkvWriter> writer(
+ new (std::nothrow) mkvmuxer::MkvWriter(webm_ctx->stream));
+ std::unique_ptr<mkvmuxer::Segment> segment(new (std::nothrow)
+ mkvmuxer::Segment());
+ if (writer == nullptr || segment == nullptr) {
+ fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n");
+ return -1;
+ }
+
+ bool ok = segment->Init(writer.get());
+ if (!ok) {
+ fprintf(stderr, "webmenc> mkvmuxer Init failed.\n");
+ return -1;
+ }
+
+ segment->set_mode(mkvmuxer::Segment::kFile);
+ segment->OutputCues(true);
+
+ mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo();
+ if (!info) {
+ fprintf(stderr, "webmenc> Cannot retrieve Segment Info.\n");
+ return -1;
+ }
+
+ const uint64_t kTimecodeScale = 1000000;
+ info->set_timecode_scale(kTimecodeScale);
+ // In debug mode the library version is left out of the writing-app string.
+ std::string version = "aomenc";
+ if (!webm_ctx->debug) {
+ version.append(std::string(" ") + aom_codec_version_str());
+ }
+ info->set_writing_app(version.c_str());
+
+ const uint64_t video_track_id =
+ segment->AddVideoTrack(static_cast<int>(cfg->g_w),
+ static_cast<int>(cfg->g_h), kVideoTrackNumber);
+ mkvmuxer::VideoTrack *const video_track = static_cast<mkvmuxer::VideoTrack *>(
+ segment->GetTrackByNumber(video_track_id));
+
+ if (!video_track) {
+ fprintf(stderr, "webmenc> Video track creation failed.\n");
+ return -1;
+ }
+
+ // Derive the codec private data from the encoder's sequence header OBU.
+ // |ok| stays false unless every step of the chain succeeds.
+ ok = false;
+ aom_fixed_buf_t *obu_sequence_header =
+ aom_codec_get_global_headers(encoder_ctx);
+ if (obu_sequence_header) {
+ Av1Config av1_config;
+ if (get_av1config_from_obu(
+ reinterpret_cast<const uint8_t *>(obu_sequence_header->buf),
+ obu_sequence_header->sz, false, &av1_config) == 0) {
+ uint8_t av1_config_buffer[4] = { 0 };
+ size_t bytes_written = 0;
+ if (write_av1config(&av1_config, sizeof(av1_config_buffer),
+ &bytes_written, av1_config_buffer) == 0) {
+ ok = video_track->SetCodecPrivate(av1_config_buffer,
+ sizeof(av1_config_buffer));
+ }
+ }
+ // The header buffer and its descriptor are released here with free().
+ free(obu_sequence_header->buf);
+ free(obu_sequence_header);
+ }
+ if (!ok) {
+ fprintf(stderr, "webmenc> Unable to set AV1 config.\n");
+ return -1;
+ }
+
+ ok = video_track->SetStereoMode(stereo_fmt);
+ if (!ok) {
+ fprintf(stderr, "webmenc> Unable to set stereo mode.\n");
+ return -1;
+ }
+
+ if (fourcc != AV1_FOURCC) {
+ fprintf(stderr, "webmenc> Unsupported codec (unknown 4 CC).\n");
+ return -1;
+ }
+ video_track->set_codec_id("V_AV1");
+
+ // A pixel aspect ratio other than 1:1 is expressed by scaling the display
+ // width (rounded to nearest) while keeping the coded height.
+ if (par->numerator > 1 || par->denominator > 1) {
+ // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+ // to WebM format.
+ const uint64_t display_width = static_cast<uint64_t>(
+ ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5);
+ video_track->set_display_width(display_width);
+ video_track->set_display_height(cfg->g_h);
+ }
+
+ if (encoder_settings != nullptr) {
+ mkvmuxer::Tag *tag = segment->AddTag();
+ if (tag == nullptr) {
+ fprintf(stderr,
+ "webmenc> Unable to allocate memory for encoder settings tag.\n");
+ return -1;
+ }
+ ok = tag->add_simple_tag("ENCODER_SETTINGS", encoder_settings);
+ if (!ok) {
+ fprintf(stderr,
+ "webmenc> Unable to allocate memory for encoder settings tag.\n");
+ return -1;
+ }
+ }
+
+ // Debug mode uses a fixed track UID.
+ if (webm_ctx->debug) {
+ video_track->set_uid(kDebugTrackUid);
+ }
+
+ // Ownership moves to the context; write_webm_file_footer() deletes both.
+ webm_ctx->writer = writer.release();
+ webm_ctx->segment = segment.release();
+ return 0;
+}
+
+// Muxes one compressed frame packet into the segment. The packet pts is
+// converted to nanoseconds using the encoder time base; a timestamp that does
+// not advance past the previous one is bumped to one millisecond after it.
+// Returns 0 on success and -1 when the segment is missing or AddFrame fails.
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+                     const aom_codec_enc_cfg_t *cfg,
+                     const aom_codec_cx_pkt_t *pkt) {
+  if (webm_ctx->segment == NULL) {
+    fprintf(stderr, "webmenc> segment is NULL.\n");
+    return -1;
+  }
+  mkvmuxer::Segment *const muxer_segment =
+      static_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+
+  int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num /
+                   cfg->g_timebase.den;
+  if (pts_ns <= webm_ctx->last_pts_ns) {
+    pts_ns = webm_ctx->last_pts_ns + 1000000;
+  }
+  webm_ctx->last_pts_ns = pts_ns;
+
+  const bool frame_added = muxer_segment->AddFrame(
+      static_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz,
+      kVideoTrackNumber, pts_ns, pkt->data.frame.flags & AOM_FRAME_IS_KEY);
+  if (frame_added) return 0;
+
+  fprintf(stderr, "webmenc> AddFrame failed.\n");
+  return -1;
+}
+
+// Finalizes the segment and destroys the muxer objects created by
+// write_webm_file_header(). The objects are deleted and the context pointers
+// cleared whether or not finalization succeeds. Returns 0 on success, or -1
+// when the context is incomplete or Segment::Finalize() fails.
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
+  if (webm_ctx->writer == NULL || webm_ctx->segment == NULL) {
+    fprintf(stderr, "webmenc> segment or writer NULL.\n");
+    return -1;
+  }
+
+  mkvmuxer::Segment *const muxer_segment =
+      static_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+  mkvmuxer::MkvWriter *const muxer_writer =
+      static_cast<mkvmuxer::MkvWriter *>(webm_ctx->writer);
+
+  const bool finalized = muxer_segment->Finalize();
+
+  // The segment refers to the writer, so it is destroyed first.
+  delete muxer_segment;
+  delete muxer_writer;
+  webm_ctx->segment = NULL;
+  webm_ctx->writer = NULL;
+
+  if (finalized) return 0;
+
+  fprintf(stderr, "webmenc> Segment::Finalize failed.\n");
+  return -1;
+}
diff --git a/third_party/aom/common/webmenc.h b/third_party/aom/common/webmenc.h
new file mode 100644
index 0000000000..c912208b45
--- /dev/null
+++ b/third_party/aom/common/webmenc.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMENC_H_
+#define AOM_COMMON_WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tools_common.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WebmOutputContext {
+ int debug; /* Nonzero: the header writer gives the video track a fixed debug UID. */
+ FILE *stream; /* NOTE(review): presumably the output file the muxer writes to; its use is not visible here -- confirm. */
+ int64_t last_pts_ns; /* Timestamp, in ns, of the last block passed to write_webm_block(). */
+ void *writer; /* Opaque mkvmuxer::MkvWriter *; deleted by write_webm_file_footer(). */
+ void *segment; /* Opaque mkvmuxer::Segment *; deleted by write_webm_file_footer(). */
+};
+
+/* Stereo 3D packed frame format; this is the type of the stereo_fmt argument
+ * of write_webm_file_header().
+ * NOTE(review): the numeric values presumably mirror the Matroska StereoMode
+ * codes expected by the muxer -- confirm before renumbering. */
+enum {
+ STEREO_FORMAT_MONO = 0,
+ STEREO_FORMAT_LEFT_RIGHT = 1,
+ STEREO_FORMAT_BOTTOM_TOP = 2,
+ STEREO_FORMAT_TOP_BOTTOM = 3,
+ STEREO_FORMAT_RIGHT_LEFT = 11
+} UENUM1BYTE(stereo_format_t);
+
+// Simplistic mechanism to extract encoder settings, without having
+// to re-invoke the entire flag-parsing logic. It lists the codec version
+// and then copies the arguments as-is from argv, but skips the binary name,
+// any arguments that match the input filename, and the output flags "-o"
+// and "--output" (and the following argument for those flags). The caller
+// is responsible for free-ing the returned string. If there is insufficient
+// memory, it returns nullptr.
+char *extract_encoder_settings(const char *version, const char **argv, int argc,
+ const char *input_fname);
+
+// The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
+// success, or -1 upon failure.
+
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+ aom_codec_ctx_t *encoder_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ stereo_format_t stereo_fmt, unsigned int fourcc,
+ const struct AvxRational *par,
+ const char *encoder_settings);
+
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt);
+
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WEBMENC_H_
diff --git a/third_party/aom/common/y4menc.c b/third_party/aom/common/y4menc.c
new file mode 100644
index 0000000000..25086a91d0
--- /dev/null
+++ b/third_party/aom/common/y4menc.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+// Maps a monochrome bit depth (8, 9, 10, 12 or 16) to its Y4M colorspace
+// tag. Any other depth trips an assert and, with asserts compiled out,
+// yields NULL.
+static const char *monochrome_colorspace(unsigned int bit_depth) {
+  if (bit_depth == 8) return "Cmono";
+  if (bit_depth == 9) return "Cmono9";
+  if (bit_depth == 10) return "Cmono10";
+  if (bit_depth == 12) return "Cmono12";
+  if (bit_depth == 16) return "Cmono16";
+  assert(0);
+  return NULL;
+}
+
+// Picks the Y4M colorspace tag for 8-bit video. 4:4:4 and 4:2:2 map
+// directly from the image format; every other format is treated as 4:2:0,
+// where the chroma sample position selects the variant.
+static const char *colorspace8(aom_chroma_sample_position_t csp,
+                               aom_img_fmt_t fmt) {
+  if (fmt == AOM_IMG_FMT_I444) return "C444";
+  if (fmt == AOM_IMG_FMT_I422) return "C422";
+
+  switch (csp) {
+    case AOM_CSP_VERTICAL: return "C420mpeg2 XYSCSS=420MPEG2";
+    case AOM_CSP_COLOCATED:
+      // Y4M has no dedicated header for colocated chroma, and FFMPEG reads
+      // plain C420 as C420jpeg.
+      return "C420";
+    default: return "C420jpeg";
+  }
+}
+
+// Picks the Y4M colorspace tag for the given bit depth, chroma sample
+// position and image format. 8-bit input is delegated to colorspace8();
+// depths 9, 10, 12, 14 and 16 choose between the 4:4:4, 4:2:2 and 4:2:0 tags
+// by format. Any other depth trips an assert and, with asserts compiled out,
+// yields NULL.
+static const char *colorspace(unsigned int bit_depth,
+                              aom_chroma_sample_position_t csp,
+                              aom_img_fmt_t fmt) {
+  // Each table is ordered: I44416, I42216, everything else (4:2:0).
+  static const char *const kTags9[3] = { "C444p9 XYSCSS=444P9",
+                                         "C422p9 XYSCSS=422P9",
+                                         "C420p9 XYSCSS=420P9" };
+  static const char *const kTags10[3] = { "C444p10 XYSCSS=444P10",
+                                          "C422p10 XYSCSS=422P10",
+                                          "C420p10 XYSCSS=420P10" };
+  static const char *const kTags12[3] = { "C444p12 XYSCSS=444P12",
+                                          "C422p12 XYSCSS=422P12",
+                                          "C420p12 XYSCSS=420P12" };
+  static const char *const kTags14[3] = { "C444p14 XYSCSS=444P14",
+                                          "C422p14 XYSCSS=422P14",
+                                          "C420p14 XYSCSS=420P14" };
+  static const char *const kTags16[3] = { "C444p16 XYSCSS=444P16",
+                                          "C422p16 XYSCSS=422P16",
+                                          "C420p16 XYSCSS=420P16" };
+  const char *const *tags;
+
+  if (bit_depth == 8) return colorspace8(csp, fmt);
+
+  if (bit_depth == 9) {
+    tags = kTags9;
+  } else if (bit_depth == 10) {
+    tags = kTags10;
+  } else if (bit_depth == 12) {
+    tags = kTags12;
+  } else if (bit_depth == 14) {
+    tags = kTags14;
+  } else if (bit_depth == 16) {
+    tags = kTags16;
+  } else {
+    assert(0);
+    return NULL;
+  }
+
+  if (fmt == AOM_IMG_FMT_I44416) return tags[0];
+  if (fmt == AOM_IMG_FMT_I42216) return tags[1];
+  return tags[2];
+}
+
+// Formats the Y4M stream header line into |buf| (capacity |len|) and returns
+// snprintf()'s result. The header always declares progressive scan ("Ip");
+// the colorspace tag comes from the monochrome flag, bit depth, chroma
+// sample position and image format, and a color range tag is appended only
+// for full-range video.
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+                          const struct AvxRational *framerate, int monochrome,
+                          aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+                          unsigned int bit_depth, aom_color_range_t range) {
+  // Studio range is the assumed default, so it gets no explicit tag.
+  const char *const range_tag =
+      (range == AOM_CR_FULL_RANGE) ? " XCOLORRANGE=FULL" : "";
+  const char *chroma_tag;
+
+  if (monochrome) {
+    chroma_tag = monochrome_colorspace(bit_depth);
+  } else {
+    chroma_tag = colorspace(bit_depth, csp, fmt);
+  }
+
+  return snprintf(buf, len, "YUV4MPEG2 W%d H%d F%d:%d Ip %s%s\n", width, height,
+                  framerate->numerator, framerate->denominator, chroma_tag,
+                  range_tag);
+}
+
+// Writes the per-frame Y4M marker line into |buf| (capacity |len|) and
+// returns snprintf()'s result, i.e. the marker's length even when truncated.
+int y4m_write_frame_header(char *buf, size_t len) {
+  static const char kFrameMarker[] = "FRAME\n";
+  return snprintf(buf, len, "%s", kFrameMarker);
+}
+
+// Writes the image's planes to |file| via raw_write_image_file(): one plane
+// for monochrome images, three otherwise.
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+                          FILE *file) {
+  if (img->monochrome) {
+    raw_write_image_file(img, planes, 1, file);
+  } else {
+    raw_write_image_file(img, planes, 3, file);
+  }
+}
+
+// Feeds the image's planes into |md5| via raw_update_image_md5(): one plane
+// for monochrome images, three otherwise.
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+                          MD5Context *md5) {
+  if (img->monochrome) {
+    raw_update_image_md5(img, planes, 1, md5);
+  } else {
+    raw_update_image_md5(img, planes, 3, md5);
+  }
+}
diff --git a/third_party/aom/common/y4menc.h b/third_party/aom/common/y4menc.h
new file mode 100644
index 0000000000..6484efcc50
--- /dev/null
+++ b/third_party/aom/common/y4menc.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_Y4MENC_H_
+#define AOM_COMMON_Y4MENC_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define Y4M_BUFFER_SIZE 256
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+ const struct AvxRational *framerate, int monochrome,
+ aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+ unsigned int bit_depth, aom_color_range_t range);
+int y4m_write_frame_header(char *buf, size_t len);
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+ FILE *file);
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+ MD5Context *md5);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_Y4MENC_H_
diff --git a/third_party/aom/common/y4minput.c b/third_party/aom/common/y4minput.c
new file mode 100644
index 0000000000..1974d76f1f
--- /dev/null
+++ b/third_party/aom/common/y4minput.c
@@ -0,0 +1,1222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "y4minput.h"
+
+// Fills 'buf' with exactly 'size' bytes from 'file', issuing up to five
+// fread() calls. A read error with errno EINTR or EAGAIN is cleared and
+// retried; any other read error is reported on stderr and fails at once.
+// Returns 1 when all 'size' bytes were read, 0 otherwise. Hitting end of
+// file early fails silently.
+static int file_read(void *buf, size_t size, FILE *file) {
+  enum { kAttemptLimit = 5 };
+  uint8_t *const out = (uint8_t *)buf;
+  size_t got = 0;
+  int attempts = 0;
+  int last_error = 0;
+
+  for (;;) {
+    if (feof(file) || got >= size || attempts >= kAttemptLimit) break;
+
+    got += fread(out + got, 1, size - got, file);
+    attempts++;
+
+    last_error = ferror(file);
+    if (!last_error) continue;
+
+    if (errno != EINTR && errno != EAGAIN) {
+      fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n",
+              (uint32_t)got, (uint32_t)size, errno, strerror(errno));
+      return 0;
+    }
+    // Transient failure: reset the stream's error flag and try again.
+    clearerr(file);
+  }
+
+  // A short read that was not caused by end of file is worth reporting.
+  if (got != size && !feof(file)) {
+    fprintf(stderr,
+            "Error reading file: %u of %u bytes read,"
+            " error: %d, tries: %d, %d: %s\n",
+            (uint32_t)got, (uint32_t)size, last_error, attempts, errno,
+            strerror(errno));
+  }
+  return got == size;
+}
+
+// Stores the color range named by 'buf' ("LIMITED" or "FULL") in 'y4m_ctx',
+// returning 1 if successfully parsed, 0 otherwise. An unrecognized value is
+// reported on stderr and leaves 'y4m_ctx' untouched.
+static int parse_color_range(y4m_input *y4m_ctx, const char *buf) {
+  if (strcmp(buf, "LIMITED") == 0) {
+    // Studio range is also the header default, but store it explicitly so
+    // the result does not depend on what an earlier COLORRANGE tag in the
+    // same header (or the caller) left in the context.
+    y4m_ctx->color_range = AOM_CR_STUDIO_RANGE;
+    return 1;
+  }
+  if (strcmp(buf, "FULL") == 0) {
+    y4m_ctx->color_range = AOM_CR_FULL_RANGE;
+    return 1;
+  }
+  fprintf(stderr, "Unknown color range value: %s\n", buf);
+  return 0;
+}
+
+// Handles the payload of an 'X' (metadata) header tag. Only "COLORRANGE="
+// is understood; its value is handed to parse_color_range(). Every other
+// key is accepted and ignored. Returns 1 on success, 0 on a bad color range.
+static int parse_metadata(y4m_input *y4m_ctx, const char *buf) {
+  static const char kColorRangeKey[] = "COLORRANGE=";
+  const size_t key_len = sizeof(kColorRangeKey) - 1;
+
+  if (strncmp(buf, kColorRangeKey, key_len) != 0) {
+    return 1;  // No support for other metadata, just ignore them.
+  }
+  return parse_color_range(y4m_ctx, buf + key_len);
+}
+
+/*Parses the space-separated header tags in the NUL-terminated string _tags
+   and stores the recognized ones in _y4m.
+  Each tag is a single identifying letter followed directly by its value.
+  Returns 0 on success, -1 if a recognized tag carries a malformed value.*/
+static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
+  char *cur = _tags;
+  for (;;) {
+    char *value;
+    size_t value_len;
+    /*Skip any leading spaces; an exhausted string means we are done.*/
+    cur += strspn(cur, " ");
+    if (*cur == '\0') return 0;
+    /*The value runs up to the next space or the end of the string.*/
+    value = cur + 1;
+    value_len = strcspn(value, " ");
+    switch (*cur) {
+      case 'W':
+        if (sscanf(value, "%d", &_y4m->pic_w) != 1) return -1;
+        break;
+      case 'H':
+        if (sscanf(value, "%d", &_y4m->pic_h) != 1) return -1;
+        break;
+      case 'F':
+        if (sscanf(value, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
+          return -1;
+        }
+        break;
+      case 'I': _y4m->interlace = value[0]; break;
+      case 'A':
+        if (sscanf(value, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
+          return -1;
+        }
+        break;
+      case 'C':
+        /*Values longer than 15 characters are rejected; presumably
+           chroma_type holds 16 bytes -- confirm against y4minput.h.*/
+        if (value_len > 15) return -1;
+        memcpy(_y4m->chroma_type, value, value_len);
+        _y4m->chroma_type[value_len] = '\0';
+        break;
+      case 'X':
+        if (!parse_metadata(_y4m, value)) return -1;
+        break;
+      default: break; /*Ignore unknown tags.*/
+    }
+    /*Continue with whatever follows this tag.*/
+    cur = value + value_len;
+  }
+}
+
+// Reads the next header tag from 'file' into 'buf' as a NUL-terminated
+// string, skipping leading spaces, and stores the character that ended it
+// (' ' or '\n') in '*end_tag'. A newline met before any tag character
+// produces the empty tag. Returns 1 on success; returns 0 on a read failure
+// or when the tag does not fit in 'buf_len' bytes.
+static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) {
+  size_t pos = 1;
+  assert(buf_len >= 1);
+
+  // Read one byte at a time until something other than a space shows up.
+  for (;;) {
+    if (!file_read(buf, 1, file)) {
+      return 0;
+    }
+    if (buf[0] != ' ') break;
+  }
+
+  // If we hit the newline, treat this as the "empty" tag.
+  if (buf[0] == '\n') {
+    buf[0] = '\0';
+    *end_tag = '\n';
+    return 1;
+  }
+
+  // Keep appending characters until a terminator arrives or room runs out.
+  while (pos < buf_len) {
+    if (!file_read(&buf[pos], 1, file)) {
+      return 0;
+    }
+    if (buf[pos] == ' ' || buf[pos] == '\n') {
+      *end_tag = buf[pos];
+      buf[pos] = '\0';
+      return 1;
+    }
+    ++pos;
+  }
+
+  // The buffer filled up before the tag ended.
+  fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n",
+          (unsigned long)pos);
+  return 0;
+}
+
+// Reads the header tags from 'file' one at a time, up to the newline that
+// ends the header, and stores them in 'y4m_ctx'. Width, height and frame
+// rate are mandatory; the other fields keep the defaults set below when
+// their tag is absent.
+// Returns 1 if tags were parsed successfully, 0 otherwise.
+static int parse_tags(y4m_input *y4m_ctx, FILE *file) {
+  char tag[256];
+  char end;  // Character denoting the end of the tag, ' ' or '\n'.
+  // Set Y4M tags to defaults, updating them as processing occurs. Mandatory
+  // fields are marked with -1 and will be checked after the tags are parsed.
+  y4m_ctx->pic_w = -1;
+  y4m_ctx->pic_h = -1;
+  y4m_ctx->fps_n = -1;  // Also serves as marker for fps_d
+  y4m_ctx->par_n = 0;
+  y4m_ctx->par_d = 0;
+  y4m_ctx->interlace = '?';
+  y4m_ctx->color_range = AOM_CR_STUDIO_RANGE;
+  snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420");
+
+  // Find one tag at a time.
+  do {
+    if (!copy_tag(tag, sizeof(tag), &end, file)) {
+      return 0;
+    }
+    // y4m_parse_tags returns 0 on success.
+    if (y4m_parse_tags(y4m_ctx, tag)) {
+      return 0;
+    }
+  } while (end != '\n');
+
+  // Check the mandatory fields.
+  if (y4m_ctx->pic_w == -1) {
+    fprintf(stderr, "Width field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->pic_h == -1) {
+    fprintf(stderr, "Height field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->fps_n == -1) {
+    fprintf(stderr, "FPS field missing\n");
+    return 0;
+  }
+  // The header comes from an untrusted file: a zero or negative dimension
+  // would otherwise flow into the plane-size arithmetic of the conversion
+  // routines. The frame rate is deliberately not range-checked here.
+  if (y4m_ctx->pic_w <= 0 || y4m_ctx->pic_h <= 0) {
+    fprintf(stderr, "Invalid picture dimensions: %dx%d\n", y4m_ctx->pic_w,
+            y4m_ctx->pic_h);
+    return 0;
+  }
+  return 1;
+}
+
+/*All anti-aliasing filters in the following conversion functions are based on
+ one of two window functions:
+ The 6-tap Lanczos window (for down-sampling and shifts):
+ sinc(\pi*t)*sinc(\pi*t/3), |t|<3 (sinc(t)==sin(t)/t)
+ 0, |t|>=3
+ The 4-tap Mitchell window (for up-sampling):
+ 7|t|^3-12|t|^2+16/3, |t|<1
+ -(7/3)|t|^3+12|t|^2-20|t|+32/3, |t|<2
+ 0, |t|>=2
+ The number of taps is intentionally kept small to reduce computational
+ overhead and limit ringing.
+
+ The taps from these filters are scaled so that their sum is 1, and the
+ result is scaled by 128 and rounded to integers to create a filter whose
+ intermediate values fit inside 16 bits.
+ Coefficients are rounded in such a way as to ensure their sum is still 128,
+ which is usually equivalent to normal rounding.
+
+ Conversions which require both horizontal and vertical filtering could
+ have these steps pipelined, for less memory consumption and better cache
+ performance, but we do them separately for simplicity.*/
+#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a))
+#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a))
+#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
+
+/*420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ 420mpeg2 chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ BR | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ BR | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ We use a resampling filter to shift the site locations one quarter pixel (at
+ the chroma plane's resolution) to the right.
+ The 4:2:2 modes look exactly the same, except there are twice as many chroma
+ lines, and they are vertically co-sited with the luma samples in both the
+ mpeg2 and jpeg cases (thus requiring no vertical resampling).*/
+static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
+ const unsigned char *_src, int _c_w,
+ int _c_h) { /*Filters one _c_w x _c_h plane from _src into _dst, one row
+ at a time, with the horizontal 6-tap filter below.
+ Taps that would fall outside a row reuse that row's first or last
+ sample, and every result is clamped to [0,255].*/
+ int y;
+ int x;
+ for (y = 0; y < _c_h; y++) {
+ /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
+ window.
+ The +64 before the shift by 7 rounds the sum to nearest.*/
+ for (x = 0; x < OC_MINI(_c_w, 2); x++) { /*Left edge: the x-2 tap always reads _src[0].*/
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+ 64) >>
+ 7,
+ 255);
+ }
+ for (; x < _c_w - 3; x++) { /*Interior: all six taps lie inside the row.*/
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < _c_w; x++) { /*Right edge: the x+3 tap always reads the last sample.*/
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+ 7,
+ 255);
+ }
+ _dst += _c_w; /*Advance both pointers to the next row.*/
+ _src += _c_w;
+ }
+}
+
+/*This format is only used for interlaced content, but is included for
+ completeness.
+
+ 420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ 420paldv chroma samples are sited like:
+ YR------Y-------YR------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YB------Y-------YB------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YR------Y-------YR------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YB------Y-------YB------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ We use a resampling filter to shift the site locations one quarter pixel (at
+ the chroma plane's resolution) to the right.
+ Then we use another filter to move the C_r location down one quarter pixel,
+ and the C_b location up one quarter pixel.*/
+static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) { /*Converts the two chroma planes read from _aux
+ and writes them into _dst after the luma plane.
+ Each plane is filtered horizontally into a scratch plane, then filtered
+ vertically, one column at a time, into _dst.*/
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int pli;
+ int y;
+ int x;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.
+ NOTE(review): c_h is derived from dst_c_dec_h although it is a height;
+ the sibling converters use dst_c_dec_v for heights -- confirm.*/
+ c_w = (_y4m->pic_w + 1) / 2;
+ c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ c_sz = c_w * c_h;
+ tmp = _aux + 2 * c_sz; /*Scratch plane placed after the two source planes; assumes the caller sized _aux for it.*/
+ for (pli = 1; pli < 3; pli++) {
+ /*First do the horizontal re-sampling.
+ This is the same as the mpeg2 case, except that after the horizontal
+ case, we need to apply a second vertical filter.*/
+ y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h);
+ _aux += c_sz;
+ switch (pli) {
+ case 1: {
+ /*Slide C_b up a quarter-pel.
+ This is the same filter used above, but in the other order.*/
+ for (x = 0; x < c_w; x++) { /*One column per pass; _dst and tmp step one sample right at the end.*/
+ for (y = 0; y < OC_MINI(c_h, 3); y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+ 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h - 2; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[(c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ _dst++;
+ tmp++;
+ }
+ _dst += c_sz - c_w; /*The column loop moved _dst by c_w; this lands on the next output plane.*/
+ tmp -= c_w; /*Rewind to the start of the scratch plane for the next pass.*/
+ } break;
+ case 2: {
+ /*Slide C_r down a quarter-pel.
+ This is the same as the horizontal filter.*/
+ for (x = 0; x < c_w; x++) { /*One column per pass, as for the first plane.*/
+ for (y = 0; y < OC_MINI(c_h, 2); y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] -
+ 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] +
+ tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h - 3; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+ 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] -
+ 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] +
+ 64) >>
+ 7,
+ 255);
+ }
+ _dst++;
+ tmp++;
+ }
+ } break; /*No pointer rewind here: this is the last plane.*/
+ }
+ /*For actual interlaced material, this would have to be done separately on
+ each field, and the shift amounts would be different.
+ C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8,
+ C_b up 1/8 in the bottom field.
+ The corresponding filters would be:
+ Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
+ Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
+ }
+}
+
+/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0.
+ This is used as a helper by several conversion routines.
+ _src is a plane _c_w samples wide and _c_h rows tall; _dst receives a plane
+ of the same width with one output row for every two source rows.
+ The plane is processed one column at a time; rows above the top or below
+ the bottom reuse the first or last row, and results are clamped to
+ [0,255].*/
+static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
+ const unsigned char *_src, int _c_w,
+ int _c_h) {
+ int y;
+ int x;
+ /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+ for (x = 0; x < _c_w; x++) {
+ for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { /*Top edge: runs once (y == 0); 64 = 3 - 17 + 78, the three taps that land on row 0.*/
+ _dst[(y >> 1) * _c_w] =
+ OC_CLAMPI(0,
+ (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+ 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+ 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < _c_h - 3; y += 2) { /*Interior: all six source rows exist.*/
+ _dst[(y >> 1) * _c_w] =
+ OC_CLAMPI(0,
+ (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+ 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+ 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < _c_h; y += 2) { /*Bottom edge: rows past the end read the last row.*/
+ _dst[(y >> 1) * _c_w] = OC_CLAMPI(
+ 0,
+ (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) -
+ 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) +
+ 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) +
+ 64) >>
+ 7,
+ 255);
+ }
+ _src++; /*Step both pointers to the next column.*/
+ _dst++;
+ }
+}
+
+/*420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ 422jpeg chroma samples are sited like:
+ Y---BR--Y-------Y---BR--Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y---BR--Y-------Y---BR--Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y---BR--Y-------Y---BR--Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y---BR--Y-------Y---BR--Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ We use a resampling filter to decimate the chroma planes by two in the
+ vertical direction.*/
+static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                        unsigned char *_aux) {
+  /*Source chroma planes are full height; only the width is subsampled.*/
+  const int src_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  const int src_h = _y4m->pic_h;
+  const int out_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  const int out_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  const int src_plane_sz = src_w * src_h;
+  const int out_plane_sz = out_w * out_h;
+  const unsigned char *in = _aux;
+  /*Output chroma starts right after the luma plane.*/
+  unsigned char *out = _dst + _y4m->pic_w * _y4m->pic_h;
+  int plane;
+  /*Vertically decimate each of the two chroma planes in turn.*/
+  for (plane = 0; plane < 2; ++plane) {
+    y4m_422jpeg_420jpeg_helper(out, in, src_w, src_h);
+    in += src_plane_sz;
+    out += out_plane_sz;
+  }
+}
+
+/*420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ 422 chroma samples are sited like:
+ YBR-----Y-------YBR-----Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------YBR-----Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------YBR-----Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------YBR-----Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ We use a resampling filter to shift the original site locations one quarter
+ pixel (at the original chroma resolution) to the right.
+ Then we use a second resampling filter to decimate the chroma planes by two
+ in the vertical direction.*/
+static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                    unsigned char *_aux) {
+  /*Source chroma planes are full height; only the width is subsampled.*/
+  const int src_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  const int src_h = _y4m->pic_h;
+  const int out_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  const int src_plane_sz = src_w * src_h;
+  const int out_plane_sz = src_w * out_h;
+  /*Scratch plane sits after the two source chroma planes in _aux.*/
+  unsigned char *const scratch = _aux + 2 * src_plane_sz;
+  const unsigned char *in = _aux;
+  /*Output chroma starts right after the luma plane.*/
+  unsigned char *out = _dst + _y4m->pic_w * _y4m->pic_h;
+  int plane;
+  for (plane = 0; plane < 2; ++plane) {
+    /*The two passes could be pipelined to save memory and help the cache,
+       but they are kept separate for simplicity.*/
+    /*Horizontal pass: shift the chroma sites (convert to 422jpeg).*/
+    y4m_42xmpeg2_42xjpeg_helper(scratch, in, src_w, src_h);
+    /*Vertical pass: decimate by two.*/
+    y4m_422jpeg_420jpeg_helper(out, scratch, src_w, src_h);
+    in += src_plane_sz;
+    out += out_plane_sz;
+  }
+}
+
+/*420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | BR | | BR |
+ | | | |
+ Y-------Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ 411 chroma samples are sited like:
+ YBR-----Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+ YBR-----Y-------Y-------Y-------
+ | | | |
+ | | | |
+ | | | |
+
+ We use a filter to resample at site locations one eighth pixel (at the source
+ chroma plane's horizontal resolution) and five eighths of a pixel to the
+ right.
+ Then we use another filter to decimate the planes by 2 in the vertical
+ direction.*/
+static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                    unsigned char *_aux) {
+  /*Converts the two chroma planes read from _aux and writes them into _dst
+     after the luma plane.
+    Each source plane (c_w x c_h) is first up-sampled horizontally into a
+     scratch plane of dst_c_w x c_h, which is then decimated vertically by
+     y4m_422jpeg_420jpeg_helper().*/
+  unsigned char *tmp;
+  int c_w;
+  int c_h;
+  int c_sz;
+  int dst_c_w;
+  int dst_c_h;
+  int dst_c_sz;
+  int tmp_sz;
+  int pli;
+  int y;
+  int x;
+  /*Skip past the luma data.*/
+  _dst += _y4m->pic_w * _y4m->pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+  c_h = _y4m->pic_h;
+  dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  dst_c_sz = dst_c_w * dst_c_h;
+  tmp_sz = dst_c_w * c_h;
+  /*The scratch plane sits after the two source chroma planes in _aux.*/
+  tmp = _aux + 2 * c_sz;
+  for (pli = 1; pli < 3; pli++) {
+    /*In reality, the horizontal and vertical steps could be pipelined, for
+       less memory consumption and better cache performance, but we do them
+       separately for simplicity.*/
+    /*First do horizontal filtering (convert to 422jpeg)*/
+    for (y = 0; y < c_h; y++) {
+      /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
+         4-tap Mitchell window.*/
+      /*Left edge: the x-1 tap folds into the x tap (111 = 1 + 110,
+         47 = -3 + 50).*/
+      for (x = 0; x < OC_MINI(c_w, 1); x++) {
+        tmp[x << 1] = (unsigned char)OC_CLAMPI(
+            0,
+            (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+             _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
+            255);
+        /*When the output row is a single sample wide, the odd sample does
+           not exist.
+          Writing it would spill into the next row and, on the last row,
+           past the tmp_sz bytes of scratch, so guard it exactly as the
+           right-edge loop below does.*/
+        if ((x << 1 | 1) < dst_c_w) {
+          tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+              0,
+              (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+               5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                  7,
+              255);
+        }
+      }
+      /*Interior: all four taps lie inside the row.*/
+      for (; x < c_w - 2; x++) {
+        tmp[x << 1] =
+            (unsigned char)OC_CLAMPI(0,
+                                     (_aux[x - 1] + 110 * _aux[x] +
+                                      18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+                                         7,
+                                     255);
+        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+            0,
+            (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+             5 * _aux[x + 2] + 64) >>
+                7,
+            255);
+      }
+      /*Right edge: taps past the end read the last sample, and the odd
+         output sample is written only if it fits in the output row.*/
+      for (; x < c_w; x++) {
+        tmp[x << 1] = (unsigned char)OC_CLAMPI(
+            0,
+            (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+             _aux[c_w - 1] + 64) >>
+                7,
+            255);
+        if ((x << 1 | 1) < dst_c_w) {
+          tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+              0,
+              (-3 * _aux[x - 1] + 50 * _aux[x] +
+               86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >>
+                  7,
+              255);
+        }
+      }
+      tmp += dst_c_w;
+      _aux += c_w;
+    }
+    /*Rewind to the start of the scratch plane.*/
+    tmp -= tmp_sz;
+    /*Now do the vertical filtering.*/
+    y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+    _dst += dst_c_sz;
+  }
+}
+
+/*Convert 444 to 420jpeg.
+ Each of the two chroma planes in _aux is first decimated by two
+ horizontally into a scratch plane, which is then decimated by two
+ vertically into _dst by y4m_422jpeg_420jpeg_helper().*/
+static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int dst_c_w;
+ int dst_c_h;
+ int dst_c_sz;
+ int tmp_sz;
+ int pli;
+ int y;
+ int x;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+ c_h = _y4m->pic_h;
+ dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ dst_c_sz = dst_c_w * dst_c_h;
+ tmp_sz = dst_c_w * c_h; /*Scratch plane: output width, source height.*/
+ tmp = _aux + 2 * c_sz; /*Scratch starts after the two source chroma planes; assumes the caller sized _aux for it.*/
+ for (pli = 1; pli < 3; pli++) {
+ /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+ for (y = 0; y < c_h; y++) {
+ for (x = 0; x < OC_MINI(c_w, 2); x += 2) { /*Left edge: runs once (x == 0); 64 = 3 - 17 + 78, the three taps that land on sample 0.*/
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+ 17 * _aux[OC_MINI(2, c_w - 1)] +
+ 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w - 3; x += 2) { /*Interior: all six taps lie inside the row.*/
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[x + 3]) -
+ 17 * (_aux[x - 1] + _aux[x + 2]) +
+ 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w; x += 2) { /*Right edge: taps past the end read the last sample.*/
+ tmp[x >> 1] =
+ OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+ 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+ 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+ 7,
+ 255);
+ }
+ tmp += dst_c_w; /*Next scratch row.*/
+ _aux += c_w; /*Next source row; after c_h rows this reaches the next source plane.*/
+ }
+ tmp -= tmp_sz; /*Rewind to the start of the scratch plane.*/
+ /*Now do the vertical filtering.*/
+ y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+ _dst += dst_c_sz;
+ }
+}
+
+/*Monochrome input has no chroma, so both 4:2:0 chroma planes that follow
+   the luma plane in _dst are filled with the constant 128.*/
+static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+                                     unsigned char *_aux) {
+  const int chroma_w =
+      (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  const int chroma_h =
+      (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  /*Chroma starts right after the luma plane.*/
+  unsigned char *const chroma = _dst + _y4m->pic_w * _y4m->pic_h;
+  (void)_aux;
+  memset(chroma, 128, chroma_w * chroma_h * 2);
+}
+
+/*Placeholder converter for input that needs no conversion: it touches none
+   of its arguments.*/
+static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
+                             unsigned char *_aux) {
+  (void)_aux;
+  (void)_dst;
+  (void)_y4m;
+}
+
+static const char TAG[] = "YUV4MPEG2";
+
+/*Opens a Y4M stream: checks the "YUV4MPEG2" signature, parses the header
+   tags, selects the output image format and chroma conversion for the
+   declared chroma type, and allocates the frame buffers.
+  y4m_ctx:     Context to fill in.
+  file:        Stream the rest of the header is read from.
+  skip_buffer: Bytes the caller already consumed from |file| for input-type
+                detection; they are taken as the start of the signature.
+  num_skip:    Number of valid bytes in |skip_buffer|; must be in [0, 8].
+  csp:         Requested chroma sample position. AOM_CSP_VERTICAL is only
+                accepted for "420mpeg2" input; AOM_CSP_COLOCATED is ignored
+                with a warning.
+  only_420:    If non-zero, "422", "444" and "444alpha" input is converted to
+                4:2:0 and every high bit depth chroma type is rejected.
+  Return: 0 on success, -1 on failure.
+          On failure no buffer stays allocated and both buffer pointers are
+           NULL, so a subsequent y4m_input_close() is harmless.*/
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, aom_chroma_sample_position_t csp,
+                   int only_420) {
+  // File must start with |TAG|.
+  char tag_buffer[9];  // 9 == strlen(TAG)
+  /*Clear the buffer pointers before the first early return so that a failed
+     open never leaves them uninitialized.*/
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  // Read as much as possible from |skip_buffer|, which were characters
+  // that were previously read from the file to do input-type detection.
+  assert(num_skip >= 0 && num_skip <= 8);
+  /*assert() is compiled out under NDEBUG; reject out-of-range values
+     explicitly so that |tag_buffer| cannot be overrun.*/
+  if (num_skip < 0 || num_skip > 8) return -1;
+  if (num_skip > 0) {
+    memcpy(tag_buffer, skip_buffer, num_skip);
+  }
+  // Start reading from the file now that the |skip_buffer| is depleted.
+  if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) {
+    return -1;
+  }
+  if (memcmp(TAG, tag_buffer, 9) != 0) {
+    fprintf(stderr, "Error parsing header: must start with %s\n", TAG);
+    return -1;
+  }
+  // Next character must be a space.
+  if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') {
+    fprintf(stderr, "Error parsing header: space must follow %s\n", TAG);
+    return -1;
+  }
+  if (!parse_tags(y4m_ctx, file)) {
+    fprintf(stderr, "Error parsing %s header.\n", TAG);
+    return -1;
+  }
+  if (y4m_ctx->interlace == '?') {
+    fprintf(stderr,
+            "Warning: Input video interlacing format unknown; "
+            "assuming progressive scan.\n");
+  } else if (y4m_ctx->interlace != 'p') {
+    fprintf(stderr,
+            "Input video is interlaced; "
+            "Only progressive scan handled.\n");
+    return -1;
+  }
+  /* Only support vertical chroma sample position if the input format is
+   * already 420mpeg2. Colocated is not supported in Y4M.
+   */
+  if (csp == AOM_CSP_VERTICAL &&
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") != 0) {
+    fprintf(stderr,
+            "Vertical chroma sample position only supported "
+            "for 420mpeg2 input\n");
+    return -1;
+  }
+  if (csp == AOM_CSP_COLOCATED) {
+    // TODO(any): check the right way to handle this in y4m
+    fprintf(stderr,
+            "Ignoring colocated chroma sample position for reading in Y4M\n");
+  }
+  /*Defaults for 8-bit 4:2:0 output; the branches below override them as
+     needed.*/
+  y4m_ctx->aom_fmt = AOM_IMG_FMT_I420;
+  y4m_ctx->bps = 12;
+  y4m_ctx->bit_depth = 8;
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  if (strcmp(y4m_ctx->chroma_type, "420") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        y4m_ctx->pic_w * y4m_ctx->pic_h +
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    /* Natively supported: no conversion required. */
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->bps = 15;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
+    /* Natively supported: no conversion required. */
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->bps = 18;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+    /*Chroma filter required: read into the aux buf first.
+      We need to make two filter passes, so we need some extra space in the
+       aux buffer.*/
+    y4m_ctx->aux_buf_sz =
+        3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+    /*Chroma filter required: read into the aux buf first.*/
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    if (only_420) {
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      y4m_ctx->aux_buf_read_sz =
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_422_420jpeg;
+    } else {
+      y4m_ctx->aom_fmt = AOM_IMG_FMT_I422;
+      y4m_ctx->bps = 16;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz =
+          y4m_ctx->pic_w * y4m_ctx->pic_h +
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      /*Natively supported: no conversion required.*/
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216;
+    y4m_ctx->bps = 20;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216;
+    y4m_ctx->bps = 24;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) {
+    y4m_ctx->src_c_dec_h = 4;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+    /*Chroma filter required: read into the aux buf first.
+      We need to make two filter passes, so we need some extra space in the
+       aux buffer.*/
+    y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz =
+        y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_411_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    if (only_420) {
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
+    } else {
+      y4m_ctx->aom_fmt = AOM_IMG_FMT_I444;
+      y4m_ctx->bps = 24;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      /*Natively supported: no conversion required.*/
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416;
+    y4m_ctx->bps = 30;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416;
+    y4m_ctx->bps = 36;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    if (only_420) {
+      fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "444alpha") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    if (only_420) {
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+          3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
+    } else {
+      fprintf(stderr, "Unsupported format: 444A\n");
+      return -1;
+    }
+  } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
+    /*No extra space required, but we need to clear the chroma planes.*/
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_mono_420jpeg;
+  } else {
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type);
+    return -1;
+  }
+  /*The size of the final frame buffers is always computed from the
+     destination chroma decimation type.*/
+  y4m_ctx->dst_buf_sz =
+      y4m_ctx->pic_w * y4m_ctx->pic_h +
+      2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) *
+          ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v);
+  /*dst_buf_sz counts samples; high bit depth samples take two bytes each.*/
+  if (y4m_ctx->bit_depth == 8)
+    y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
+  else
+    y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
+  if (!y4m_ctx->dst_buf) return -1;
+
+  if (y4m_ctx->aux_buf_sz > 0) {
+    y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
+    if (!y4m_ctx->aux_buf) {
+      free(y4m_ctx->dst_buf);
+      /*Do not leave a dangling pointer for y4m_input_close() to free again.*/
+      y4m_ctx->dst_buf = NULL;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+/*Releases the frame buffers owned by |_y4m|.
+  Each pointer is cleared after it is freed, so calling this function again
+   on the same context is harmless (free(NULL) is a no-op).*/
+void y4m_input_close(y4m_input *_y4m) {
+  free(_y4m->dst_buf);
+  _y4m->dst_buf = NULL;
+  free(_y4m->aux_buf);
+  _y4m->aux_buf = NULL;
+}
+
+/*Reads the next frame from |_fin| into the context's buffers, runs the chroma
+   conversion selected at open time and describes the result in |_img|.
+  Return: 1 if a frame was read, 0 if the 6-byte frame marker could not be
+           read, -1 on a malformed frame header or short frame data.*/
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) {
+  char marker[6];
+  const int sample_bytes = _y4m->bit_depth > 8 ? 2 : 1;
+  int luma_bytes;
+  int chroma_stride;
+  int chroma_rows;
+  int chroma_bytes;
+  unsigned char *chroma;
+  /*Consume the frame header.*/
+  if (!file_read(marker, 6, _fin)) return 0;
+  if (memcmp(marker, "FRAME", 5) != 0) {
+    fprintf(stderr, "Loss of framing in Y4M input data\n");
+    return -1;
+  }
+  if (marker[5] != '\n') {
+    /*Something follows the marker: discard up to 79 more bytes while looking
+       for the newline that ends the header line.*/
+    char ch;
+    int skipped = 0;
+    while (skipped < 79 && file_read(&ch, 1, _fin) && ch != '\n') skipped++;
+    if (skipped == 79) {
+      fprintf(stderr, "Error parsing Y4M frame header\n");
+      return -1;
+    }
+  }
+  /*Data that needs no conversion is read straight into the frame buffer;
+     data that does need it is staged in the auxiliary buffer.
+    The second read is only attempted if the first one succeeded.*/
+  if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin) ||
+      !file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) {
+    fprintf(stderr, "Error reading Y4M frame data.\n");
+    return -1;
+  }
+  /*Convert what was just read.*/
+  _y4m->convert(_y4m, _y4m->dst_buf, _y4m->aux_buf);
+  /*Describe the frame by hand rather than through aom_img_wrap(): that
+     helper forces padding for odd picture sizes, which would mean one fread
+     call per row.*/
+  memset(_img, 0, sizeof(*_img));
+  /*Y4M stores the planes in Y'CbCr order, which libaom calls Y, U, and V.*/
+  _img->bps = _y4m->bps;
+  _img->bit_depth = _y4m->bit_depth;
+  _img->fmt = _y4m->aom_fmt;
+  _img->h = _img->d_h = _y4m->pic_h;
+  _img->w = _img->d_w = _y4m->pic_w;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+
+  /*Plane geometry, in bytes; chroma dimensions are rounded up.*/
+  luma_bytes = _y4m->pic_w * _y4m->pic_h * sample_bytes;
+  chroma_stride =
+      ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
+      sample_bytes;
+  chroma_rows = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  chroma_bytes = chroma_stride * chroma_rows;
+  /*The three planes are packed back to back in the frame buffer.*/
+  chroma = _y4m->dst_buf + luma_bytes;
+  _img->planes[AOM_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[AOM_PLANE_U] = chroma;
+  _img->planes[AOM_PLANE_V] = chroma + chroma_bytes;
+  _img->stride[AOM_PLANE_Y] = _y4m->pic_w * sample_bytes;
+  _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = chroma_stride;
+  return 1;
+}
diff --git a/third_party/aom/common/y4minput.h b/third_party/aom/common/y4minput.h
new file mode 100644
index 0000000000..2472007b67
--- /dev/null
+++ b/third_party/aom/common/y4minput.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
+
+#include <stdio.h>
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct y4m_input y4m_input;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_src);
+
+struct y4m_input {
+ int pic_w; /*Picture width, in luma samples.*/
+ int pic_h; /*Picture height, in luma rows.*/
+ int fps_n; /*NOTE(review): presumably the frame rate numerator; confirm.*/
+ int fps_d; /*NOTE(review): presumably the frame rate denominator; confirm.*/
+ int par_n; /*NOTE(review): presumably the pixel aspect numerator; confirm.*/
+ int par_d; /*NOTE(review): presumably the pixel aspect denominator; confirm.*/
+ char interlace; /*'p' = progressive, '?' = unknown; others are rejected.*/
+ int src_c_dec_h; /*Horizontal chroma decimation of the input.*/
+ int src_c_dec_v; /*Vertical chroma decimation of the input.*/
+ int dst_c_dec_h; /*Horizontal chroma decimation of the output frame.*/
+ int dst_c_dec_v; /*Vertical chroma decimation of the output frame.*/
+ char chroma_type[16]; /*Chroma type name from the header, e.g. "420jpeg".*/
+ /*The size of each converted frame buffer.
+ This counts samples: y4m_input_open() allocates twice as many bytes when
+ bit_depth is not 8.*/
+ size_t dst_buf_sz;
+ /*The amount to read directly into the converted frame buffer.*/
+ size_t dst_buf_read_sz;
+ /*The size of the auxiliary buffer.*/
+ size_t aux_buf_sz;
+ /*The amount to read into the auxiliary buffer.*/
+ size_t aux_buf_read_sz;
+ y4m_convert_func convert; /*Called once per frame after both reads.*/
+ unsigned char *dst_buf; /*Output frame buffer; freed by y4m_input_close().*/
+ unsigned char *aux_buf; /*Conversion staging buffer; NULL if unused.*/
+ enum aom_img_fmt aom_fmt; /*Copied to aom_image_t::fmt for every frame.*/
+ int bps; /*Copied to aom_image_t::bps for every frame.*/
+ unsigned int bit_depth; /*8, 10 or 12; above 8 means two bytes per sample.*/
+ aom_color_range_t color_range;
+};
+
+/**
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. Note that |csp| should only be set for 420 input, and the input
+ * chroma is shifted if necessary. The code does not support the conversion
+ * from co-located to vertical. The |skip_buffer| indicates bytes that were
+ * previously read from |file|, to do input-type detection; this buffer will
+ * be read before the |file| is read. It is of size |num_skip|, which *must*
+ * be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+ int num_skip, aom_chroma_sample_position_t csp,
+ int only_420);
+void y4m_input_close(y4m_input *_y4m);
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_Y4MINPUT_H_
diff --git a/third_party/aom/doc/AlgorithmDescription.md b/third_party/aom/doc/AlgorithmDescription.md
new file mode 100644
index 0000000000..bfd64dad67
--- /dev/null
+++ b/third_party/aom/doc/AlgorithmDescription.md
@@ -0,0 +1,799 @@
+<div style="font-size:3em; text-align:center;"> Algorithm Description </div>
+
+# Abstract
+This document describes technical aspects of coding tools included in
+the associated codec. This document is not a specification of the associated
+codec. Instead, it summarizes the highlighted features of coding tools for new
+developers. This document should be updated when significant new normative
+changes have been integrated into the associated codec.
+
+# Table of Contents
+
+[Abbreviations](#Abbreviations)
+
+[Algorithm description](#Algorithm-Description)
+
+- [Block Partitioning](#Block-Partitioning)
+ - [Coding block partition](#Coding-block-partition)
+ - [Transform block partition](#Transform-block-partition)
+- [Intra Prediction](#Intra-Prediction)
+ - [Directional intra prediction modes](#Directional-intra-prediction-modes)
+ - [Non-directional intra prediction modes](#Non-directional-intra-prediction-modes)
+ - [Recursive filtering modes](#Recursive-filtering-modes)
+ - [Chroma from Luma mode](#Chroma-from-Luma-mode)
+- [Inter Prediction](#Inter-Prediction)
+ - [Motion vector prediction](#Motion-vector-prediction)
+ - [Motion vector coding](#Motion-vector-coding)
+ - [Interpolation filter for motion compensation](#Interpolation-filter-for-motion-compensation)
+ - [Warped motion compensation](#Warped-motion-compensation)
+ - [Overlapped block motion compensation](#Overlapped-block-motion-compensation)
+ - [Reference frames](#Reference-frames)
+ - [Compound Prediction](#Compound-Prediction)
+- [Transform](#Transform)
+- [Quantization](#Quantization)
+- [Entropy Coding](#Entropy-Coding)
+- [Loop filtering and post-processing](#Loop-filtering-and-post-processing)
+ - [Deblocking](#Deblocking)
+ - [Constrained directional enhancement](#Constrained-directional-enhancement)
+ - [Loop Restoration filter](#Loop-Restoration-filter)
+ - [Frame super-resolution](#Frame-super-resolution)
+ - [Film grain synthesis](#Film-grain-synthesis)
+- [Screen content coding](#Screen-content-coding)
+ - [Intra block copy](#Intra-block-copy)
+ - [Palette mode](#Palette-mode)
+
+[References](#References)
+
+# Abbreviations
+
+CfL: Chroma from Luma\
+IntraBC: Intra block copy\
+LCU: Largest coding unit\
+OBMC: Overlapped Block Motion Compensation\
+CDEF: Constrained Directional Enhancement Filter
+
+# Algorithm Description
+
+## Block Partitioning
+
+### Coding block partition
+
+The largest coding block unit (LCU) applied in this codec is 128×128. In
+addition to no split mode `PARTITION_NONE`, the partition tree supports 9
+different partitioning patterns, as shown in the figure below.
+
+<figure class="image"> <center><img src="img\partition_codingblock.svg"
+alt="Partition" width="360" /> <figcaption>Figure 1: Supported coding block
+partitions</figcaption> </figure>
+
+According to the number of sub-partitions, the 9 partition modes are summarized
+as follows: 1. Four partitions: `PARTITION_SPLIT`, `PARTITION_VERT_4`,
+`PARTITION_HORZ_4` 2. Three partitions (T-Shape): `PARTITION_HORZ_A`,
+`PARTITION_HORZ_B`, `PARTITION_VERT_A`, `PARTITION_VERT_B` 3. Two partitions:
+`PARTITION_HORZ`, `PARTITION_VERT`
+
+Among all the 9 partitioning patterns, only `PARTITION_SPLIT` mode supports
+recursive partitioning, i.e., its sub-partitions can be further split; the other
+partitioning modes cannot be split further. In particular, for 8x8 and 128x128,
+`PARTITION_VERT_4`, `PARTITION_HORZ_4` are not used, and for 8x8, T-Shape
+partitions are not used either.
+
+### Transform block partition
+
+For both intra and inter coded blocks, the coding block can be further
+partitioned into multiple transform units with the partitioning depth up to 2
+levels. The mapping from the transform size of the current depth to the
+transform size of the next depth is shown in the following Table 1.
+
+<figure class="image"> <center><figcaption>Table 1: Transform partition size
+setting</figcaption> <img src="img\tx_partition.svg" alt="Partition" width="220"
+/> </figure>
+
+Furthermore, for intra coded blocks, the transform partition is done in a way
+that all the transform blocks have the same size, and the transform blocks are
+coded in a raster scan order. An example of the transform block partitioning for
+intra coded block is shown in the Figure 2.
+
+<figure class="image"> <center><img src="img\intra_tx_partition.svg"
+alt="Partition" width="600" /> <figcaption>Figure 2: Example of transform
+partitioning for intra coded block</figcaption> </figure>
+
+For inter coded blocks, the transform unit partitioning can be done in a
+recursive manner with the partitioning depth up to 2 levels. The transform
+partitioning supports 1:1 (square), 1:2/2:1, and 1:4/4:1 transform unit sizes
+ranging from 4×4 to 64×64. If the coding block is smaller than or equal to
+64x64, the transform block partitioning can only apply to luma component, for
+chroma blocks, the transform block size is identical to the coding block size.
+Otherwise, if the coding block width or height is greater than 64, then both the
+luma and chroma coding blocks will implicitly split into multiples of min(W,
+64)x min(H, 64) and min(W, 32)x min(H, 32) transform blocks, respectively.
+
+<figure class="image"> <center><img src="img\inter_tx_partition.svg"
+alt="Partition" width="400" /> <figcaption>Figure 3: Example of transform
+partitioning for inter coded block</figcaption> </figure>
+
+## Intra Prediction
+
+### Directional intra prediction modes
+
+Directional intra prediction modes are applied in intra prediction, which models
+local textures using a given direction pattern. Directional intra prediction
+modes are represented by nominal modes and angle delta. The nominal modes are
+a set of 8 intra prediction angles similar to those used in VP9. The index
+value of angle delta ranges from -3 to +3, and a zero angle delta
+indicates a nominal mode. The prediction angle is represented by a nominal intra
+angle plus an angle delta. In total, there are 56 directional intra prediction
+modes, as shown in the following figure. In the below figure, solid arrows
+indicate directional intra prediction modes and dotted arrows represent non-zero
+angle delta.
+
+<figure class="image"> <center><img src="img\intra_directional.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 4: Directional intra
+prediction modes</figcaption> </figure>
+
+The nominal mode index and angle delta index are signalled separately, and the
+nominal mode index is signalled before the associated angle delta index. It is
+noted that for small block sizes, where the coding gain from extending intra
+prediction angles may saturate, only the nominal modes are used and angle delta
+index is not signalled.
+
+### Non-directional intra prediction modes
+
+In addition to directional intra prediction modes, four non-directional intra
+modes which simulate smooth textures are also included. The four non-directional
+intra modes include `SMOOTH_V`, `SMOOTH_H`, `SMOOTH` and `PAETH predictor`.
+
+In `SMOOTH_V`, `SMOOTH_H` and `SMOOTH` modes, the prediction values are
+generated using quadratic interpolation along vertical, horizontal directions,
+or the average thereof. The samples used in the quadratic interpolation include
+reconstructed samples from the top and left neighboring blocks and samples from
+the right and bottom boundaries which are approximated by top reconstructed
+samples and the left reconstructed samples.
+
+In `PAETH predictor` mode, the prediction for each sample is assigned as one
+from the top (T), left (L) and top-left (TL) reference samples, which has the
+value closest to the Paeth predictor value, i.e., T + L -TL. The samples used in
+`PAETH predictor` are illustrated in below figure.
+
+<figure class="image"> <center><img src="img\intra_paeth.svg" alt="Directional
+intra" width="300" /> <figcaption>Figure 5: Paeth predictor</figcaption>
+</figure>
+
+### Recursive filtering modes
+
+Five filtering intra modes are defined, and each mode specifies a set of eight
+7-tap filters. Given the selected filtering mode index (0~4), the current block
+is divided into 4x2 sub-blocks. For one 4×2 sub-block, each sample is predicted
+by 7-tap interpolation using the 7 top and left neighboring samples as inputs.
+Different filters are applied for samples located at different coordinates
+within a 4×2 sub-block. The prediction process can be done recursively in unit
+4x2 sub-block, which means that prediction samples generated for one 4x2
+prediction block can be used to predict another 4x2 sub-block.
+
+<figure class="image"> <center><img src="img\intra_recursive.svg"
+alt="Directional intra" width="300" /> <figcaption>Figure 6: Recursive filtering
+modes</figcaption> </figure>
+
+### Chroma from Luma mode
+
+Chroma from Luma (CfL) is a chroma intra prediction mode, which models chroma
+samples as a linear function of co-located reconstructed luma samples. To align
+the resolution between luma and chroma samples for different chroma sampling
+format, e.g., 4:2:0 and 4:2:2, reconstructed luma pixels may need to be
+sub-sampled before being used in CfL mode. In addition, the DC component is
+removed to form the AC contribution. In CfL mode, the model parameters which
+specify the linear function between two color components are optimized by the
+encoder and signalled in the bitstream.
+
+<figure class="image"> <center><img src="img\intra_cfl.svg" alt="Directional
+intra" width="700" /> <figcaption>Figure 7: CfL prediction</figcaption>
+</figure>
+
+## Inter Prediction
+
+### Motion vector prediction
+
+Motion vectors are predicted by neighboring blocks which can be either spatial
+neighboring blocks, or temporal neighboring blocks located in a reference frame.
+A set of MV predictors will be identified by checking all these blocks and
+utilized to encode the motion vector information.
+
+**Spatial motion vector prediction**
+
+There are two sets of spatial neighboring blocks that can be utilized for
+finding spatial MV predictors, including the adjacent spatial neighbors which
+are direct top and left neighbors of the current block, and second outer spatial
+neighbors which are close but not directly adjacent to the current block. The
+two sets of spatial neighboring blocks are illustrated in an example shown in
+Figure 8.
+
+<figure class="image"> <center><img src="img\inter_spatial_mvp.svg"
+alt="Directional intra" width="350" /><figcaption>Figure 8: Spatial neighboring
+blocks used for MV prediction</figcaption></figure>
+
+For each set of spatial neighbors, the top row will be checked from left to
+right and then the left column will be checked from top to down. For the
+adjacent spatial neighbors, an additional top-right block will be also checked
+after checking the left column neighboring blocks. For the non-adjacent spatial
+neighbors, the top-left block located at (-1, -1) position will be checked
+first, then the top row and left column in a similar manner as the adjacent
+neighbors. The adjacent neighbors will be checked first, then the temporal MV
+predictor that will be described in the next subsection will be checked second,
+after that, the non-adjacent spatial neighboring blocks will be checked.
+
+For compound prediction which utilizes a pair of reference frames, the
+non-adjacent spatial neighbors are not used for deriving the MV predictor.
+
+**Temporal motion vector prediction**
+
+In addition to spatial neighboring blocks, MV predictor can be also derived
+using co-located blocks of reference pictures, namely temporal MV predictor. To
+generate temporal MV predictor, the MVs of reference frames are first stored
+together with reference indices associated with the reference frame. Then for
+each 8x8 block of the current frame, the MVs of a reference frame which pass the
+8x8 block are identified and stored together with the reference frame index in a
+temporal MV buffer. In an example shown in Figure 9, the MV of reference frame 1
+(R1) pointing from R1 to a reference frame of R1 is identified, i.e., MVref,
+which passes a 8x8 block (shaded in blue dots) of current frame. Then this MVref
+is stored in the temporal MV buffer associated with this 8x8 block. <figure
+class="image"> <center><img src="img\inter_motion_field.svg" alt="Directional
+intra" width="800" /><figcaption>Figure 9: Motion field estimation by linear
+projection</figcaption></figure> Finally, given a couple of pre-defined block
+coordinates, the associated MVs stored in the temporal MV buffer are identified
+and projected accordingly to derive a temporal MV predictor which points from
+the current block to its reference frame, e.g., MV0 in Figure 9. In Figure 10,
+the pre-defined block positions for deriving temporal MV predictors of a 16x16
+block are shown and up to 7 blocks will be checked to find valid temporal MV
+predictors.<figure class="image"> <center><img
+src="img\inter_tmvp_positions.svg" alt="Directional intra" width="300"
+/><figcaption>Figure 10: Block positions for deriving temporal MV
+predictors</figcaption></figure> The temporal MV predictors are checked after
+the nearest spatial MV predictors but before the non-adjacent spatial MV
+predictors.
+
+All the spatial and temporal MV candidates will be put together in a pool, with
+each predictor associated with a weighting determined during the scanning of the
+spatial and temporal neighboring blocks. Based on the associated weightings, the
+candidates are sorted and ranked, and up to four candidates will be used as the
+MV predictor list.
+
+### Motion vector coding
+
+### Interpolation filter for motion compensation
+
+<mark>[Ed.: to be added]</mark>
+
+### Warped motion compensation
+
+**Global warped motion**
+
+The global motion information is signalled at each inter frame, wherein the
+global motion type and motion parameters are included. The global motion types
+and the number of the associated parameters are listed in the following table.
+
+
+| Global motion type | Number of parameters |
+|:------------------:|:--------------------:|
+| Identity (zero motion)| 0 |
+| Translation | 2 |
+| Rotzoom | 4 |
+| General affine | 6 |
+
+For an inter coded block, after the reference frame index is
+transmitted, if the motion of current block is indicated as global motion, the
+global motion type and the associated parameters of the given reference will be
+used for current block.
+
+**Local warped motion**
+
+For an inter coded block, local warped motion is allowed when the following
+conditions are all satisfied:
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the immediate neighbors uses the same reference frame as the current block
+
+If the local warped motion is used for current block, instead of signalling the
+affine parameters, they are estimated by using mean square minimization of the
+distance between the reference projection and modeled projection based on the
+motion vectors of current block and its immediate neighbors. To estimate the
+parameters of local warped motion, the projection sample pair of the center
+pixel in neighboring block and its corresponding pixel in the reference frame
+are collected if the neighboring block uses the same reference frame with
+current block. After that, 3 extra samples are created by shifting the center
+position by a quarter sample in one or two dimensions, and these samples are
+also considered as projection sample pairs to ensure the stability of the model
+parameter estimation process.
+
+
+### Overlapped block motion compensation
+
+For an inter-coded block, overlapped block motion compensation (OBMC) is allowed
+when the following conditions are all satisfied.
+
+* Current block is single prediction
+* Width or height is greater than or equal to 8 samples
+* At least one of the neighboring blocks is an inter-coded block
+
+When OBMC is applied to the current block, the initial inter prediction samples
+are first generated using the assigned motion vector of the current block; then
+the inter predicted samples for the current block and the inter predicted samples
+based on motion vectors from the above and left blocks are blended to generate
+the final prediction samples. The maximum number of neighboring motion vectors is
+limited based on the size of current block, and up to 4 motion vectors from each
+of upper and left blocks can be involved in the OBMC process of current block.
+
+One example of the processing order of neighboring blocks is shown in the
+following picture, wherein the values marked in each block indicate the
+processing order of the motion vectors of current block and neighboring blocks.
+To be specific, the motion vector of current block is firstly applied to
+generate inter prediction samples p0(x,y). Then motion vector of block 1 is
+applied to generate the prediction samples p1(x,y). After that, the prediction
+samples in the overlapping area between block 0 and block 1 are a weighted
+average of p0(x,y) and p1(x,y). The overlapping area of block 1 and block 0 is
+marked in grey in the following picture. The motion vectors of block 2, 3, 4 are
+further applied and blended in the same way.
+
+<figure class="image"> <center><img src="img\inter_obmc.svg" alt="OBMC
+neighboring blocks" width="300" /><figcaption>Figure 11: neighboring blocks for OBMC
+process</figcaption></figure>
+
+### Reference frames
+
+<mark>[Ed.: to be added]</mark>
+
+### Compound Prediction
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound wedge prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Difference-modulated masked prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Frame distance-based compound prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+**Compound inter-intra prediction**
+
+<mark>[Ed.: to be added]</mark>
+
+## Transform
+
+The separable 2D transform process is applied on prediction residuals. For the
+forward transform, a 1-D vertical transform is performed first on each column of
+the input residual block, then a horizontal transform is performed on each row
+of the vertical transform output. For the backward transform, a 1-D horizontal
+transform is performed first on each row of the input de-quantized coefficient
+block, then a vertical transform is performed on each column of the horizontal
+transform output. The primary 1-D transforms include four different types of
+transform: a) 4-point, 8-point, 16-point, 32-point, 64-point DCT-2; b) 4-point,
+8-point, 16-point asymmetric DST’s (DST-4, DST-7) and c) their flipped
+versions; d) 4-point, 8-point, 16-point, 32-point identity transforms. When
+transform size is 4-point, ADST refers to DST-7, otherwise, when transform size
+is greater than 4-point, ADST refers to DST-4.
+
+<figure class="image"> <center><figcaption>Table 2: Transform basis functions
+(DCT-2, DST-4 and DST-7) for N-point input.</figcaption> <img src=
+"img\tx_basis.svg" alt="Partition" width="450" /> </figure>
+
+For luma component, each transform block can select one pair of horizontal and
+vertical transform combination given a pre-defined set of transform type
+candidates, and the selection is explicitly signalled into the bitstream.
+However, the selection is not signalled when Max(width,height) is 64. When
+the maximum of transform block width and height is greater than or equal to 32,
+the set of transform type candidates depends on the prediction mode, as described
+in Table 3. Otherwise, when the maximum of transform block width and height is
+smaller than 32, the set of transform type candidates depends on the prediction
+mode, as described in Table 4.
+
+<figure class="image"> <center><figcaption>Table 3: Transform type candidates
+for luma component when max(width, height) is greater than or equal to 32.
+</figcaption> <img src="img\tx_cands_large.svg" alt="Partition" width="370" />
+</figure>
+
+<figure class="image"> <center><figcaption>Table 4: Transform type candidates
+for luma component when max(width, height) is smaller than 32. </figcaption>
+<img src="img\tx_cands_small.svg" alt="Partition" width="440" /> </figure>
+
+The set of transform type candidates (namely transform set) is defined in Table
+5.
+
+<figure class="image"> <center><figcaption>Table 5: Definition of transform set.
+</figcaption> <img src="img\tx_set.svg" alt="Partition" width="450" /> </figure>
+
+For chroma component, the transform type selection is done in an implicit way.
+For intra prediction residuals, the transform type is selected according to the
+intra prediction mode, as specified in Table 6. For inter prediction residuals,
+the transform type is selected according to the transform type selection of the
+co-located luma block. Therefore, for chroma component, there is no transform
+type signalling in the bitstream.
+
+<figure class="image"> <center><figcaption>Table 6: Transform type selection for
+chroma component intra prediction residuals.</figcaption> <img src=
+"img\tx_chroma.svg" alt="Partition" width="500" /> </figure>
+
+The computational cost of large size (e.g., 64-point) transforms is further
+reduced by zeroing out all the coefficients except the following two cases:
+
+1. The top-left 32×32 quadrant for 64×64/64×32/32×64 DCT_DCT hybrid transforms.
+2. The left 32×16 area for 64×16 and top 16×32 for 16×64 DCT_DCT hybrid transforms.
+
+Both the DCT-2 and ADST (DST-4, DST-7) are implemented using a butterfly structure
+[1], which includes multiple stages of butterfly operations. The butterfly
+operations within a stage can be calculated in parallel, and the different stages
+are cascaded in a sequential order.
+
+## Quantization
+Quantization of transform coefficients may apply different quantization step
+sizes for DC and AC transform coefficients, and different quantization step sizes
+for luma and chroma transform coefficients. To specify the quantization step
+size, in the frame header, a _**base_q_idx**_ syntax element is first signalled,
+which is an 8-bit fixed length code specifying the quantization step size for
+luma AC coefficients. The valid range of _**base_q_idx**_ is [0, 255].
+
+After that, the delta value relative to base_q_idx for Luma DC coefficients,
+indicated as DeltaQYDc, is further signalled. Furthermore, if there is more than
+one color plane, then a flag _**diff_uv_delta**_ is signaled to indicate whether
+Cb and Cr color components apply different quantization index values. If
+_**diff_uv_delta**_ is signalled as 0, then only the delta values relative to
+base_q_idx for chroma DC coefficients (indicated as DeltaQUDc) and AC
+coefficients (indicated as DeltaQUAc) are signalled. Otherwise, the delta values
+relative to base_q_idx for both the Cb and Cr DC coefficients (indicated as
+DeltaQUDc and DeltaQVDc) and AC coefficients (indicated as DeltaQUAc and
+DeltaQVAc) are signalled.
+
+The above decoded DeltaQYDc, DeltaQUAc, DeltaQUDc, DeltaQVAc and DeltaQVDc are
+added to _base_q_idx_ to derive the quantization indices. Then these
+quantization indices are further mapped to quantization step size according to
+two tables. For DC coefficients, the mapping from quantization index to
+quantization step size for 8-bit, 10-bit and 12-bit internal bit depth is
+specified by a lookup table Dc_Qlookup[3][256], and for AC coefficients, the
+mapping from quantization index to quantization step size for 8-bit, 10-bit and
+12-bit internal bit depth is specified by a lookup table Ac_Qlookup[3][256].
+
+<figure class="image"> <center><img src="img\quant_dc.svg" alt="quant_dc"
+width="800" /><figcaption>Figure 11: Quantization step size of DC coefficients
+for different internal bit-depth</figcaption></figure>
+
+<figure class="image"> <center><img src="img\quant_ac.svg" alt="quant_ac"
+width="800" /><figcaption>Figure 12: Quantization step size of AC coefficients
+for different internal bit-depth</figcaption></figure>
+
+Given the quantization step size, indicated as _Q<sub>step</sub>_, the input quantized
+coefficients are further de-quantized using the following formula:
+
+_F_ = sign * ( (_f_ * _Q<sub>step</sub>_) & 0xFFFFFF ) / _deNorm_
+
+, where _f_ is the input quantized coefficient, _F_ is the output dequantized
+coefficient, _deNorm_ is a constant value derived from the transform block area
+size, as indicated by the following table:
+
+| _deNorm_ | Tx block area size |
+|----------|:--------------------------|
+| 1| Less than 512 samples |
+| 2 | 512 or 1024 samples |
+| 4 | Greater than 1024 samples |
+
+When the quantization index is 0, the quantization is performed using a
+quantization step size equal to 1, which is lossless coding mode.
+
+## Entropy Coding
+
+**Entropy coding engine**
+
+<mark>[Ed.: to be added]</mark>
+
+**Coefficient coding**
+
+For each transform unit, the coefficient coding starts with coding a skip sign,
+which is followed by the signaling of primary transform kernel type and the
+end-of-block (EOB) position in case the transform coding is not skipped. After
+that, the coefficient values are coded in a multiple level map manner plus sign
+values. The level maps are coded as three level planes, namely lower-level,
+middle-level and higher-level planes, and the sign is coded as another separate
+plane. The lower-level, middle-level and higher-level planes correspond to
+different ranges of coefficient magnitudes. The lower level plane
+corresponds to the range of 0–2, the middle level plane takes care of the
+range of 3–14, and the higher-level plane covers the range of 15 and above.
+
+The three level planes are coded as follows. After the EOB position is coded,
+the lower-level and middle-level planes are coded together in backward scan
+order, and the scan order refers to zig-zag scan applied on the entire transform
+unit basis. Then the sign plane and higher-level plane are coded together in
+forward scan order. After that, the remainder (coefficient level minus 14) is
+entropy coded using Exp-Golomb code.
+
+The context model applied to the lower level plane depends on the primary
+transform directions, including: bi-directional, horizontal, and vertical, as
+well as transform size, and up to five neighbor (in frequency domain)
+coefficients are used to derive the context. The middle level plane uses a
+similar context model, but the number of context neighbor coefficients is
+reduced from 5 to 2. The higher-level plane is coded by Exp-Golomb code without
+using context model. For the sign plane, except the DC sign that is coded using
+the DC signs from its neighboring transform units, sign values of other
+coefficients are coded directly without using context model.
+
+## Loop filtering and post-processing
+
+### Deblocking
+
+There are four methods when picking deblocking filter level, which are listed
+below:
+
+* LPF_PICK_FROM_FULL_IMAGE: search the full image with different values
+* LPF_PICK_FROM_Q: estimate the filter level based on quantizer and frame type
+* LPF_PICK_FROM_SUBIMAGE: estimate the level from a portion of image
+* LPF_PICK_MINIMAL_LPF: set the filter level to 0 and disable the deblocking
+
+When estimating the filter level from the full image or sub-image, the search
+starts from the previous frame's filter level and ends when the filter step is less
+than or equal to zero. In addition to filter level, there are some other parameters
+which control the deblocking filter such as sharpness level, mode deltas, and
+reference deltas.
+
+Deblocking is performed at 128x128 super block level, and the vertical and
+horizontal edges are filtered respectively. For a 128x128 super block, the
+vertical/horizontal edges aligned with each 8x8 block are filtered first. If
+the 4x4 transform is used, the internal edge aligned with a 4x4 block will be
+further filtered. The filter length is switchable from 4-tap, 6-tap, 8-tap,
+14-tap, and 0-tap (no filtering). The locations of filter taps are identified
+based on the number of filter taps in order to compute the filter mask. When
+finally performing the filtering, outer taps are added if there is high edge
+variance.
+
+### Constrained directional enhancement filter
+
+**Edge Direction Estimation**\
+In CDEF, edge direction search is performed at 8x8 block-level. There are
+eight edge directions in total, as illustrated in Figure 13.
+<figure class="image"> <center><img src="img\edge_direction.svg"
+alt="Edge direction" width="700" /> <figcaption>Figure 13: Line number
+k for pixels following direction d=0:7 in an 8x8 block.</figcaption> </figure>
+
+The optimal edge direction d_opt is found by maximizing the following
+term [3]:
+
+<figure class="image"> <center><img src="img\equ_edge_direction.svg"
+alt="Equation edge direction" width="250" /> </figure>
+<!-- $$d_{opt}=\max_{d} s_d$$
+$$s_d = \sum_{k}\frac{1}{N_{d,k}}(\sum_{p\in P_{d,k}}x_p)^2,$$ -->
+
+where x_p is the value of pixel p, P_{d,k} is the set of pixels in
+line k following direction d, N_{d,k} is the cardinality of P_{d,k}.
+
+**Directional filter**\
+CDEF consists of two filter taps: the primary tap and the secondary tap.
+The primary tap works along the edge direction (as shown in Figure 14),
+while the secondary tap is oriented 45 degrees off the edge direction
+ (as shown in Figure 15).
+
+<figure class="image"> <center><img src="img\primary_tap.svg"
+alt="Primary tap" width="700" /> <figcaption>Figure 14: Primary filter
+taps following edge direction. For even strengths a = 2 and b = 4, for
+odd strengths a = 3 and b = 3. The filtered pixel is shown in the
+highlighted center.</figcaption> </figure>
+
+<figure class="image"> <center><img src="img\secondary_tap.svg"
+alt="Secondary tap" width="700" /> <figcaption>Figure 15: Secondary
+filter taps. The filtered pixel is shown in the highlighted center.
+</figcaption> </figure>
+
+CDEF can be described by the following equation:
+
+<figure class="image"> <center><img src="img\equ_dir_search.svg"
+alt="Equation direction search" width="720" /> </figure>
+
+<!-- $$y(i,j)=x(i,j)+round(\sum_{m,n}w^{(p)}_{d,m,n}f(x(m,n)-x(i,j),S^{(p)},
+D)+\sum_{m,n}w^{(s)}_{d,m,n}f(x(m,n)-x(i,j),S^{(s)},D)),$$ -->
+
+where x(i,j) and y(i,j) are the input and output reconstructed values
+of CDEF. p denotes primary tap, and s denotes secondary tap, w is
+the weight between primary and secondary tap. f(d,S,D) is a non-linear
+filtering function, S denotes filter strength, D is a damping parameter.
+For 8-bit content, S^p ranges from 0 to 15, and S^s can be
+0, 1, 2, or 4. D ranges from 3 to 6 for luma, and 2 to 4 for chroma.
+
+**Non linear filter**\
+CDEF uses a non-linear filtering function to prevent excessive blurring
+when applied across an edge. It is achieved by ignoring pixels that are
+too different from the current pixel to be filtered. When the difference d
+between the current pixel and its neighboring pixel is within a threshold,
+f(d,S,D) = d, otherwise f(d,S,D) = 0. Specifically, the strength S
+determines the maximum difference allowed and damping D determines the
+point to ignore the filter tap.
+
+### Loop Restoration filter
+
+**Separable symmetric wiener filter**
+
+Let F be the w x w 2D filter taps around the pixel to be filtered, denoted as
+a w^2 x 1 column vector. When compared with traditional Wiener Filter,
+Separable Symmetric Wiener Filter has the following three constraints in order
+to save signaling bits and reduce complexity [4]:
+
+1) The w x w filter window is separated into horizontal and vertical w-tap
+convolutions.
+
+2) The horizontal and vertical filters are constrained to be symmetric.
+
+3) It is assumed that the summation of horizontal/vertical filter coefficients
+is 1.
+
+As a result, F can be written as F = column_vectorize[ab^T], subject to a(i)
+= a(w - 1 - i), b(i) = b(w - 1 - i), for i = [0, r - 1], and sum(a(i)) =
+sum(b(i)) = 1, where a is the vertical filters and b is the horizontal filters.
+The derivation of the filters a and b starts from an initial guess of
+horizontal and vertical filters, optimizing one of the two while holding the
+other fixed. In the implementation w = 7, thus, 3 taps need to be sent for
+filters a and b, respectively. When signaling the filter coefficients, 4, 5 and
+6 bits are used for the first three filter taps, and the remaining ones are
+obtained from the normalization and symmetry constraints. 30 bits in total are
+transmitted for both vertical and horizontal filters.
+
+
+**Dual self-guided filter**
+
+Dual self-guided filter is designed to firstly obtain two coarse restorations
+X1 and X2 of the degraded frame X, and the final restoration Xr is obtained as
+a combination of the degraded samples, and the difference between the degraded
+samples and the coarse restorations [4]:
+
+<figure class="image"> <center><img src="img\equ_dual_self_guided.svg"
+alt="Equation dual self guided filter" width="300" /> </figure>
+<!-- $$X_r = X + \alpha (X_1 - X) + \beta (X_2 - X)$$ -->
+
+At encoder side, alpha and beta are computed using:
+
+<figure class="image"> <center><img src="img\equ_dual_self_para.svg"
+alt="Equation dual self guided filter parameter" width="220" /> </figure>
+<!-- $${\alpha, \beta}^T = (A^T A) ^{-1} A^T b,$$ -->
+
+where A = {X1 - X, X2 - X}, b = Y - X, and Y is the original source.
+
+X1 and X2 are obtained using guided filtering, and the filtering is controlled
+by a radius r and a noise parameter e, where a higher r implies a higher
+spatial variance and a higher e implies a higher range variance [4]. X1 and X2
+can be described by {r1, e1} and {r2, e2}, respectively.
+
+The encoder sends a 6-tuple {r1, e1, r2, e2, alpha, beta} to the decoder. In
+the implementation, {r1, e1, r2, e2} uses a 3-bit codebook, and {alpha, beta}
+use 7 bits each due to much higher precision, resulting in a total of 17 bits.
+r is always less than or equal to 3 [4].
+
+Guided filtering can be described by a local linear model:
+
+<figure class="image"> <center><img src="img\equ_guided_filter.svg"
+alt="Equation guided filter" width="155" /> </figure>
+<!-- $$y=Fx+G,$$ -->
+
+where x and y are the input and output samples, F and G are determined by the
+statistics in the neighboring of the pixel to be filtered. It is called
+self-guided filtering when the guidance image is the same as the degraded
+image[4].
+
+Following are three steps when deriving F and G of the self-guided filtering:
+
+1) Compute mean u and variance d of pixels in a (2r + 1) x (2r + 1) window
+around the pixel to be filtered.
+
+2) For each pixel, compute f = d / (d + e); g = (1 - f)u.
+
+3) Compute F and G for each pixel as averages of f and g values in a 3 x 3
+window around the pixel for use in step 2.
+
+### Frame super-resolution
+
+In order to improve the perceptual quality of decoded pictures, a
+super-resolution process is applied at low bit-rates [5]. First, at encoder
+side, the source video is downscaled as a non-normative procedure. Second,
+the downscaled video is encoded, followed by deblocking and CDEF process.
+Third, a linear upscaling process is applied as a normative procedure to bring
+the encoded video back to its original spatial resolution. Lastly, the loop
+restoration is applied to recover part of the lost high frequencies. The last
+two steps together are called super-resolving process [5]. Similarly, decoding,
+deblocking and CDEF processes are applied at lower spatial resolution at
+decoder side. Then, the frames go through the super-resolving process.
+In order to reduce overheads in line-buffers with respect to hardware
+implementation, the upscaling and downscaling processes are applied to
+the horizontal dimension only.
+
+### Film grain synthesis
+
+At encoder side, film grain is removed from the input video as a denoising
+process. Then, the structure and intensity of the input video are analyzed
+by canny edge detector, and smooth areas are used to estimate the strength
+of film grain. Once the strength is estimated, the denoised video and film
+grain parameters are sent to decoder side. Those parameters are used to
+synthesize the grain and add it back to the decoded video, producing the final
+output video.
+
+In order to reconstruct the film grain, the following parameters are sent to
+decoder side: lag value, autoregressive coefficients, values for precomputed
+look-up table index of chroma components, and a set of points for a piece-wise
+linear scaling function [6]. Those parameters are signaled as quantized
+integers including 64 bytes for scaling function and 74 bytes for
+autoregressive coefficients. Once the parameters are received, an
+autoregressive process is applied in a raster scan order to generate one 64x64
+luma and two 32x32 chroma film grain templates [6]. Those templates are used
+to generate the grain for the remaining part of a picture.
+
+## Screen content coding
+
+To improve the coding performance of screen content coding, the associated video
+codec incorporates several coding tools, for example, intra block copy
+(IntraBC) is employed to handle the repeated patterns in a screen picture, and
+palette mode is used to handle the screen blocks with a limited number of
+different colors.
+
+### Intra block copy
+
+Intra Block Copy (IntraBC) [2] is a coding tool similar to inter-picture
+prediction. The main difference is that in IntraBC, a predictor block is
+formed from the reconstructed samples (before application of in-loop filtering)
+of the current picture. Therefore, IntraBC can be considered as "motion
+compensation" within current picture.
+
+A block vector (BV) is coded to specify the location of the predictor block.
+The BV precision is integer. The BV will be signalled in the bitstream since the
+decoder needs it to locate the predictor. For current block, the flag use
+IntraBC indicating whether current block is IntraBC mode is first transmitted in
+bit stream. Then, if the current block is IntraBC mode, the BV difference diff
+is obtained by subtracting the reference BV from the current BV, and then diff
+is classified into four types according to the diff values of horizontal and
+vertical component. Type information needs to be transmitted into the bitstream,
+after that, diff values of two components may be signalled based on the type
+info.
+
+IntraBC is very effective for screen content coding, but it also brings a lot of
+difficulties to hardware design. To facilitate the hardware design, the
+following modifications are adopted.
+
+1) when IntraBC is allowed, the loop filters are disabled, which are de-blocking
+filter, the CDEF (Constrained Directional Enhancement Filter), and the Loop
+Restoration. By doing this, picture buffer of reconstructed samples can be
+shared between IntraBC and inter prediction.
+
+2) To facilitate parallel decoding, the prediction cannot exceed the restricted
+areas. For one super block, if the coordinate of its top-left position is (x0,
+y0), the prediction at position (x, y) can be accessed by IntraBC, if y < y0 and
+x < x0 + 2 * (y0 - y)
+
+3) To allow hardware writing back delay, immediate reconstructed areas cannot be
+accessed by IntraBC prediction. The restricted immediate reconstructed area can
+be 1 ∼ n super blocks. So on top of modification 2, if the coordinate of one
+super block's top-left position is (x0, y0), the prediction at position (x, y)
+can be accessed by IntraBC, if y < y0 and x < x0 + 2 * (y0 - y) - D, where D
+denotes the restricted immediate reconstructed area. When D is one super block,
+the prediction area is shown in below figure.
+
+<figure class="image"> <center><img src="img\SCC_IntraBC.svg" alt="Intra block
+copy" width="600" /> <figcaption>Figure 13: the prediction area for IntraBC mode
+in one super block prediction</figcaption> </figure>
+
+### Palette mode
+
+# References
+
+[1] J. Han, Y. Xu and D. Mukherjee, "A butterfly structured design of the hybrid
+transform coding scheme," 2013 Picture Coding Symposium (PCS), San Jose, CA,
+2013, pp. 17-20.\
+[2] J. Li, H. Su, A. Converse, B. Li, R. Zhou, B. Lin, J. Xu, Y. Lu, and R.
+Xiong, "Intra Block Copy for Screen Content in the Emerging AV1 Video Codec,"
+2018 Data Compression Conference, Snowbird, Utah, USA.\
+[3] S. Midtskogen and J.M. Valin. "The AV1 constrained directional enhancement
+ filter (CDEF)." In 2018 IEEE International Conference on Acoustics, Speech
+ and Signal Processing (ICASSP), pp. 1193-1197. IEEE, 2018.\
+[4] D. Mukherjee, S. Li, Y. Chen, A. Anis, S. Parker, and
+J. Bankoski. "A switchable loop-restoration with side-information framework
+for the emerging AV1 video codec." In 2017 IEEE International Conference on
+Image Processing (ICIP), pp. 265-269. IEEE, 2017.\
+[5] Y. Chen, D. Mukherjee, J. Han, A. Grange, Y. Xu, Z. Liu,... & C.H.Chiang,
+(2018, June). "An overview of core coding tools in the AV1 video codec."
+In 2018 Picture Coding Symposium (PCS) (pp. 41-45). IEEE.\
+[6] A. Norkin, & N. Birkbeck, (2018, March). "Film grain synthesis for AV1
+video codec." In 2018 Data Compression Conference (pp. 3-12). IEEE.
diff --git a/third_party/aom/doc/dev_guide/av1_decoder.dox b/third_party/aom/doc/dev_guide/av1_decoder.dox
new file mode 100644
index 0000000000..f65ddb51ca
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/av1_decoder.dox
@@ -0,0 +1,11 @@
+/*!\page decoder_guide AV1 DECODER GUIDE
+
+ Describe AV1 decoding techniques here.
+
+ \cond
+ \if av1_md_support
+ [AV1 Algorithm Description](\ref LALGORITHMDESCRIPTION)
+ \endif
+ \endcond
+
+*/
diff --git a/third_party/aom/doc/dev_guide/av1_encoder.dox b/third_party/aom/doc/dev_guide/av1_encoder.dox
new file mode 100644
index 0000000000..0f7e8f87e2
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/av1_encoder.dox
@@ -0,0 +1,1617 @@
+/*!\page encoder_guide AV1 ENCODER GUIDE
+
+\tableofcontents
+
+\section architecture_introduction Introduction
+
+This document provides an architectural overview of the libaom AV1 encoder.
+
+It is intended as a high level starting point for anyone wishing to contribute
+to the project, that will help them to more quickly understand the structure
+of the encoder and find their way around the codebase.
+
+It stands above and will where necessary link to more detailed function
+level documents.
+
+\subsection architecture_gencodecs Generic Block Transform Based Codecs
+
+Most modern video encoders including VP8, H.264, VP9, HEVC and AV1
+(in increasing order of complexity) share a common basic paradigm. This
+comprises separating a stream of raw video frames into a series of discrete
+blocks (of one or more sizes), then computing a prediction signal and a
+quantized, transform coded, residual error signal. The prediction and residual
+error signal, along with any side information needed by the decoder, are then
+entropy coded and packed to form the encoded bitstream. See Figure 1 below,
+where the blue blocks are, to all intents and purposes, the lossless parts of
+the encoder and the red block is the lossy part.
+
+This is of course a gross oversimplification, even in regard to the simplest
+of the above codecs. For example, all of them allow for block based
+prediction at multiple different scales (i.e. different block sizes) and may
+use previously coded pixels in the current frame for prediction or pixels from
+one or more previously encoded frames. Further, they may support multiple
+different transforms and transform sizes and quality optimization tools like
+loop filtering.
+
+\image html genericcodecflow.png "" width=70%
+
+\subsection architecture_av1_structure AV1 Structure and Complexity
+
+As previously stated, AV1 adopts the same underlying paradigm as other block
+transform based codecs. However, it is much more complicated than previous
+generation codecs and supports many more block partitioning, prediction and
+transform options.
+
+AV1 supports block partitions of various sizes from 128x128 pixels down to 4x4
+pixels using a multi-layer recursive tree structure as illustrated in figure 2
+below.
+
+\image html av1partitions.png "" width=70%
+
+AV1 also provides 71 basic intra prediction modes, 56 single frame inter prediction
+modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion
+compensation)), 12768 compound inter prediction modes (that combine inter
+predictors from two reference frames) and 36708 compound inter / intra
+prediction modes. Furthermore, in addition to simple inter motion estimation,
+AV1 also supports warped motion prediction using affine transforms.
+
+In terms of transform coding, it has 16 separable 2-D transform kernels
+\f$(DCT, ADST, fADST, IDTX)^2\f$ that can be applied at up to 19 different
+scales from 64x64 down to 4x4 pixels.
+
+When combined together, this means that for any one 8x8 pixel block in a
+source frame, there are approximately 45,000,000 different ways that it can
+be encoded.
+
+Consequently, AV1 requires complex control processes. While not necessarily
+a normative part of the bitstream, these are the algorithms that turn a set
+of compression tools and a bitstream format specification, into a coherent
+and useful codec implementation. These may include but are not limited to
+things like :-
+
+- Rate distortion optimization (The process of trying to choose the most
+ efficient combination of block size, prediction mode, transform type
+ etc.)
+- Rate control (regulation of the output bitrate)
+- Encoder speed vs quality trade offs.
+- Features such as two pass encoding or optimization for low delay
+ encoding.
+
+For a more detailed overview of AV1's encoding tools and a discussion of some
+of the design considerations and hardware constraints that had to be
+accommodated, please refer to <a href="https://arxiv.org/abs/2008.06091">
+A Technical Overview of AV1</a>.
+
+Figure 3 provides a slightly expanded but still simplistic view of the
+AV1 encoder architecture with blocks that relate to some of the subsequent
+sections of this document. In this diagram, the raw uncompressed frame buffers
+are shown in dark green and the reconstructed frame buffers used for
+prediction in light green. Red indicates those parts of the codec that are
+(or may be) lossy, where fidelity can be traded off against compression
+efficiency, whilst light blue shows algorithms or coding tools that are
+lossless. The yellow blocks represent non-bitstream normative configuration
+and control algorithms.
+
+\image html av1encoderflow.png "" width=70%
+
+\section architecture_command_line The Libaom Command Line Interface
+
+ Add details or links here: TODO ? elliotk@
+
+\section architecture_enc_data_structures Main Encoder Data Structures
+
+The following are the main high level data structures used by the libaom AV1
+encoder and referenced elsewhere in this overview document:
+
+- \ref AV1_PRIMARY
+ - \ref AV1_PRIMARY.gf_group (\ref GF_GROUP)
+ - \ref AV1_PRIMARY.lap_enabled
+ - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+ - \ref AV1_PRIMARY.p_rc (\ref PRIMARY_RATE_CONTROL)
+ - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO)
+
+- \ref AV1_COMP
+ - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+ - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+ - \ref AV1_COMP.speed
+ - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+ - \ref AV1EncoderConfig.pass
+ - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+ - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+ - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+ - \ref AlgoCfg.arnr_max_frames
+ - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+ - \ref KeyFrameCfg.enable_keyframe_filtering
+
+- \ref RateControlCfg (Rate control configuration)
+ - \ref RateControlCfg.mode
+ - \ref RateControlCfg.target_bandwidth
+ - \ref RateControlCfg.best_allowed_q
+ - \ref RateControlCfg.worst_allowed_q
+ - \ref RateControlCfg.cq_level
+ - \ref RateControlCfg.under_shoot_pct
+ - \ref RateControlCfg.over_shoot_pct
+ - \ref RateControlCfg.maximum_buffer_size_ms
+ - \ref RateControlCfg.starting_buffer_level_ms
+ - \ref RateControlCfg.optimal_buffer_level_ms
+ - \ref RateControlCfg.vbrbias
+ - \ref RateControlCfg.vbrmin_section
+ - \ref RateControlCfg.vbrmax_section
+
+- \ref PRIMARY_RATE_CONTROL (Primary Rate control status)
+ - \ref PRIMARY_RATE_CONTROL.gf_intervals[]
+ - \ref PRIMARY_RATE_CONTROL.cur_gf_index
+
+- \ref RATE_CONTROL (Rate control status)
+ - \ref RATE_CONTROL.intervals_till_gf_calculate_due
+ - \ref RATE_CONTROL.frames_till_gf_update_due
+ - \ref RATE_CONTROL.frames_to_key
+
+- \ref TWO_PASS (Two pass status and control data)
+
+- \ref GF_GROUP (Data related to the current GF/ARF group)
+
+- \ref FIRSTPASS_STATS (Defines entries in the first pass stats buffer)
+ - \ref FIRSTPASS_STATS.coded_error
+
+- \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+ - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+- \ref HIGH_LEVEL_SPEED_FEATURES
+ - \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop
+ - \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance
+
+- \ref TplParams
+
+\section architecture_enc_use_cases Encoder Use Cases
+
+The libaom AV1 encoder is configurable to support a number of different use
+cases and rate control strategies.
+
+The principal use cases for which it is optimised are as follows:
+
+ - <b>Video on Demand / Streaming</b>
+ - <b>Low Delay or Live Streaming</b>
+ - <b>Video Conferencing / Real Time Coding (RTC)</b>
+ - <b>Fixed Quality / Testing</b>
+
+Other examples of use cases for which the encoder could be configured but for
+which there is less by way of specific optimizations include:
+
+ - <b>Download and Play</b>
+ - <b>Disk Playback</b>
+ - <b>Storage</b>
+ - <b>Editing</b>
+ - <b>Broadcast video</b>
+
+Specific use cases may have particular requirements or constraints. For
+example:
+
+<b>Video Conferencing:</b> In a video conference we need to encode the video
+in real time and to avoid any coding tools that could increase latency, such
+as frame look ahead.
+
+<b>Live Streams:</b> In cases such as live streaming of games or events, it
+may be possible to allow some limited buffering of the video and use of
+lookahead coding tools to improve encoding quality. However, whilst a lag of
+a second or two may be fine given the one way nature of this type of video,
+it is clearly not possible to use tools such as two pass coding.
+
+<b>Broadcast:</b> Broadcast video (e.g. digital TV over satellite) may have
+specific requirements such as frequent and regular key frames (e.g. once per
+second or more) as these are important as entry points to users when switching
+channels. There may also be strict upper limits on bandwidth over a short
+window of time.
+
+<b>Download and Play:</b> Download and play applications may have less strict
+requirements in terms of local frame by frame rate control but there may be a
+requirement to accurately hit a file size target for the video clip as a
+whole. Similar considerations may apply to playback from mass storage devices
+such as DVD or disk drives.
+
+<b>Editing:</b> In certain special use cases such as offline editing, it may
+be desirable to have very high quality and data rate but also very frequent
+key frames or indeed to encode the video exclusively as key frames. Lossless
+video encoding may also be required in this use case.
+
+<b>VOD / Streaming:</b> One of the most important and common use cases for AV1
+is video on demand or streaming, for services such as YouTube and Netflix. In
+this use case it is possible to do two or even multi-pass encoding to improve
+compression efficiency. Streaming services will often store many encoded
+copies of a video at different resolutions and data rates to support users
+with different types of playback device and bandwidth limitations.
+Furthermore, these services support dynamic switching between multiple
+streams, so that they can respond to changing network conditions.
+
+Exact rate control when encoding for a specific format (e.g. 360P or 1080P on
+YouTube) may not be critical, provided that the video bandwidth remains within
+allowed limits. Whilst a format may have a nominal target data rate, this can
+be considered more as the desired average egress rate over the video corpus
+rather than a strict requirement for any individual clip. Indeed, in order
+to maintain optimal quality of experience for the end user, it may be
+desirable to encode some easier videos or sections of video at a lower data
+rate and harder videos or sections at a higher rate.
+
+VOD / streaming does not usually require very frequent key frames (as in the
+broadcast case) but key frames are important in trick play (scanning back and
+forth to different points in a video) and for adaptive stream switching. As
+such, in a use case like YouTube, there is normally an upper limit on the
+maximum time between key frames of a few seconds, but within certain limits
+the encoder can try to align key frames with real scene cuts.
+
+Whilst encoder speed may not seem to be as critical in this use case, for
+services such as YouTube, where millions of new videos have to be encoded
+every day, encoder speed is still important, so libaom allows command line
+control of the encode speed vs quality trade off.
+
+<b>Fixed Quality / Testing Mode:</b> Libaom also has a fixed quality encoder
+pathway designed for testing under highly constrained conditions.
+
+\section architecture_enc_speed_quality Speed vs Quality Trade Off
+
+In any modern video encoder there are trade offs that can be made in regard to
+the amount of time spent encoding a video or video frame vs the quality of the
+final encode.
+
+These trade offs typically limit the scope of the search for an optimal
+prediction / transform combination with faster encode modes doing fewer
+partition, reference frame, prediction mode and transform searches at the cost
+of some reduction in coding efficiency.
+
+The pruning of the size of the search tree is typically based on assumptions
+about the likelihood of different search modes being selected based on what
+has gone before and features such as the dimensions of the video frames and
+the Q value selected for encoding the frame. For example certain intra modes
+are less likely to be chosen at high Q but may be more likely if similar
+modes were used for the previously coded blocks above and to the left of the
+current block.
+
+The speed settings depend both on the use case (e.g. Real Time encoding) and
+an explicit speed control passed in on the command line as <b>--cpu-used</b>
+and stored in the \ref AV1_COMP.speed field of the main compressor instance
+data structure (<b>cpi</b>).
+
+The control flags for the speed trade off are stored in the \ref AV1_COMP.sf
+field of the compressor instance and are set in the following functions:-
+
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+- \ref av1_set_speed_features_qindex_dependent()
+
+A second factor impacting the speed of encode is rate distortion optimisation
+(<b>rd vs non-rd</b> encoding).
+
+When rate distortion optimization is enabled each candidate combination of
+a prediction mode and transform coding strategy is fully encoded and the
+resulting error (or distortion) as compared to the original source and the
+number of bits used, are passed to a rate distortion function. This function
+converts the distortion and cost in bits to a single <b>RD</b> value (where
+lower is better). This <b>RD</b> value is used to decide between different
+encoding strategies for the current block where, for example, one may
+result in a lower distortion but a larger number of bits.
+
+The calculation of this <b>RD</b> value is broadly speaking as follows:
+
+\f[
+ RD = (\lambda * Rate) + Distortion
+\f]
+
+This assumes a linear relationship between the number of bits used and
+distortion (represented by the rate multiplier value <b>&lambda;</b>) which is
+not actually valid across a broad range of rate and distortion values.
+Typically, where distortion is high, expending a small number of extra bits
+will result in a large change in distortion. However, at lower values of
+distortion the cost in bits of each incremental improvement is large.
+
+To deal with this we scale the value of <b>&lambda;</b> based on the quantizer
+value chosen for the frame. This is assumed to be a proxy for our approximate
+position on the true rate distortion curve and it is further assumed that over
+a limited range of distortion values, a linear relationship between distortion
+and rate is a valid approximation.
+
+Doing a rate distortion test on each candidate prediction / transform
+combination is expensive in terms of cpu cycles. Hence, for cases where encode
+speed is critical, libaom implements a non-rd pathway where the <b>RD</b>
+value is estimated based on the prediction error and quantizer setting.
+
+\section architecture_enc_src_proc Source Frame Processing
+
+\subsection architecture_enc_frame_proc_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+ - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO)
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+ - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+
+- \ref AV1EncoderConfig (Encoder configuration parameters)
+ - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
+ - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg)
+
+- \ref AlgoCfg (Algorithm related configuration parameters)
+ - \ref AlgoCfg.arnr_max_frames
+ - \ref AlgoCfg.arnr_strength
+
+- \ref KeyFrameCfg (Keyframe coding configuration parameters)
+ - \ref KeyFrameCfg.enable_keyframe_filtering
+
+\subsection architecture_enc_frame_proc_ingest Frame Ingest / Coding Pipeline
+
+ To encode a frame, first call \ref av1_receive_raw_frame() to obtain the raw
+ frame data. Then call \ref av1_get_compressed_data() to encode raw frame data
+ into compressed frame data. The main body of \ref av1_get_compressed_data()
+ is \ref av1_encode_strategy(), which determines high-level encode strategy
+ (frame type, frame placement, etc.) and then encodes the frame by calling
+ \ref av1_encode(). In \ref av1_encode(), \ref av1_first_pass() will execute
+ the first_pass of two-pass encoding, while \ref encode_frame_to_data_rate()
+ will perform the final pass for either one-pass or two-pass encoding.
+
+ The main body of \ref encode_frame_to_data_rate() is
+ \ref encode_with_recode_loop_and_filter(), which handles encoding before
+ in-loop filters (with recode loops \ref encode_with_recode_loop(), or
+ without any recode loop \ref encode_without_recode()), followed by in-loop
+ filters (deblocking filters \ref loopfilter_frame(), CDEF filters and
+ restoration filters \ref cdef_restoration_frame()).
+
+ Except for rate/quality control, both \ref encode_with_recode_loop() and
+ \ref encode_without_recode() call \ref av1_encode_frame() to manage the
+ reference frame buffers and \ref encode_frame_internal() to perform the
+ rest of encoding that does not require access to external frames.
+ \ref encode_frame_internal() is the starting point for the partition search
+ (see \ref architecture_enc_partitions).
+
+\subsection architecture_enc_frame_proc_tf Temporal Filtering
+
+\subsubsection architecture_enc_frame_proc_tf_overview Overview
+
+Video codecs exploit the spatial and temporal correlations in video signals to
+achieve compression efficiency. The noise factor in the source signal
+attenuates such correlation and impedes the codec performance. Denoising the
+video signal is potentially a promising solution.
+
+One strategy for denoising a source is motion compensated temporal filtering.
+Unlike image denoising, where only the spatial information is available,
+video denoising can leverage a combination of the spatial and temporal
+information. Specifically, in the temporal domain, similar pixels can often be
+tracked along the motion trajectory of moving objects. Motion estimation is
+applied to neighboring frames to find similar patches or blocks of pixels that
+can be combined to create a temporally filtered output.
+
+AV1, in common with VP8 and VP9, uses an in-loop motion compensated temporal
+filter to generate what are referred to as alternate reference frames (or ARF
+frames). These can be encoded in the bitstream and stored as frame buffers for
+use in the prediction of subsequent frames, but are not usually directly
+displayed (hence they are sometimes referred to as non-display frames).
+
+The following command line parameters set the strength of the filter, the
+number of frames used and determine whether filtering is allowed for key
+frames.
+
+- <b>--arnr-strength</b> (\ref AlgoCfg.arnr_strength)
+- <b>--arnr-maxframes</b> (\ref AlgoCfg.arnr_max_frames)
+- <b>--enable-keyframe-filtering</b>
+ (\ref KeyFrameCfg.enable_keyframe_filtering)
+
+Note that in AV1, the temporal filtering scheme is designed around the
+hierarchical ARF based pyramid coding structure. We typically apply denoising
+only on key frame and ARF frames at the highest (and sometimes the second
+highest) layer in the hierarchical coding structure.
+
+\subsubsection architecture_enc_frame_proc_tf_algo Temporal Filtering Algorithm
+
+Our method divides the current frame into "MxM" blocks. For each block, a
+motion search is applied on frames before and after the current frame. Only
+the best matching patch with the smallest mean square error (MSE) is kept as a
+candidate patch for a neighbour frame. The current block is also a candidate
+patch. A total of N candidate patches are combined to generate the filtered
+output.
+
+Let f(i) represent the filtered sample value and \f$p_{j}(i)\f$ the sample
+value of the j-th patch. The filtering process is:
+
+\f[
+ f(i) = \frac{p_{0}(i) + \sum_{j=1}^{N} \omega_{j}(i) \cdot p_{j}(i)}
+ {1 + \sum_{j=1}^{N} \omega_{j}(i)}
+\f]
+
+where \f$ \omega_{j}(i) \f$ is the weight of the j-th patch from a total of
+N patches. The weight is determined by the patch difference as:
+
+\f[
+ \omega_{j}(i) = \exp(-\frac{D_{j}(i)}{h^2})
+\f]
+
+where \f$ D_{j}(i) \f$ is the sum of squared differences between the current
+block and the j-th candidate patch:
+
+\f[
+ D_{j}(i) = \sum_{k\in\Omega_{i}}||p_{0}(k) - p_{j}(k)||_{2}^{2}
+\f]
+
+where:
+- \f$p_{0}\f$ refers to the current frame.
+- \f$\Omega_{i}\f$ is the patch window, an "LxL" pixel square.
+- h is a critical parameter that controls the decay of the weights measured by
+ the Euclidean distance. It is derived from an estimate of noise amplitude in
+ the source. This allows the filter coefficients to adapt for videos with
+ different noise characteristics.
+- Usually, M = 32, N = 7, and L = 5, but they can be adjusted.
+
+It is recommended that the reader refers to the code for more details.
+
+\subsubsection architecture_enc_frame_proc_tf_funcs Temporal Filter Functions
+
+The main entry point for temporal filtering is \ref av1_temporal_filter().
+This function returns 1 if temporal filtering is successful, otherwise 0.
+When temporal filtering is applied, the filtered frame will be held in
+the output_frame, which is the frame to be
+encoded in the following encoding process.
+
+Almost all temporal filter related code is in av1/encoder/temporal_filter.c
+and av1/encoder/temporal_filter.h.
+
+Inside \ref av1_temporal_filter(), the reader's attention is directed to
+\ref tf_setup_filtering_buffer() and \ref tf_do_filtering().
+
+- \ref tf_setup_filtering_buffer(): sets up the frame buffer for
+ temporal filtering, determines the number of frames to be used, and
+ calculates the noise level of each frame.
+
+- \ref tf_do_filtering(): the main function for the temporal
+ filtering algorithm. It breaks each frame into "MxM" blocks. For each
+ block a motion search \ref tf_motion_search() is applied to find
+ the motion vector from one neighboring frame. tf_build_predictor() is then
+ called to build the matching patch and \ref av1_apply_temporal_filter_c() (see
+ also optimised SIMD versions) to apply temporal filtering. The weighted
+ average over each pixel is accumulated and finally normalized in
+ \ref tf_normalize_filtered_frame() to generate the final filtered frame.
+
+- \ref av1_apply_temporal_filter_c(): the core function of our temporal
+ filtering algorithm (see also optimised SIMD versions).
+
+\subsection architecture_enc_frame_proc_film Film Grain Modelling
+
+ Add details here.
+
+\section architecture_enc_rate_ctrl Rate Control
+
+\subsection architecture_enc_rate_ctrl_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+ - \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+ - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+
+ - \ref AV1_COMP cpi (the main compressor instance data structure)
+ - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
+ - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+ - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
+
+ - \ref AV1EncoderConfig (Encoder configuration parameters)
+ - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg)
+
+ - \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first
+ pass stats)
+
+ - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters)
+ - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES)
+
+\subsection architecture_enc_rate_ctrl_options Supported Rate Control Options
+
+Different use cases (\ref architecture_enc_use_cases) may have different
+requirements in terms of data rate control.
+
+The broad rate control strategy is selected using the <b>--end-usage</b>
+parameter on the command line, which maps onto the field
+\ref aom_codec_enc_cfg_t.rc_end_usage in \ref aom_encoder.h.
+
+The four supported options are:-
+
+- <b>VBR</b> (Variable Bitrate)
+- <b>CBR</b> (Constant Bitrate)
+- <b>CQ</b> (Constrained Quality mode ; A constrained variant of VBR)
+- <b>Fixed Q</b> (Constant quality or Q mode)
+
+The value of \ref aom_codec_enc_cfg_t.rc_end_usage is in turn copied over
+into the encoder rate control configuration data structure as
+\ref RateControlCfg.mode.
+
+With regard to the most important use cases above, Video on Demand uses either
+VBR or CQ mode. CBR is the preferred rate control model for RTC and Live
+streaming and Fixed Q is only used in testing.
+
+The behaviour of each of these modes is regulated by a series of secondary
+command line rate control options but also depends somewhat on the selected
+use case, whether 2-pass coding is enabled and the selected encode speed vs
+quality trade offs (\ref AV1_COMP.speed and \ref AV1_COMP.sf).
+
+The list below gives the names of the main rate control command line
+options together with the names of the corresponding fields in the rate
+control configuration data structures.
+
+- <b>--target-bitrate</b> (\ref RateControlCfg.target_bandwidth)
+- <b>--min-q</b> (\ref RateControlCfg.best_allowed_q)
+- <b>--max-q</b> (\ref RateControlCfg.worst_allowed_q)
+- <b>--cq-level</b> (\ref RateControlCfg.cq_level)
+- <b>--undershoot-pct</b> (\ref RateControlCfg.under_shoot_pct)
+- <b>--overshoot-pct</b> (\ref RateControlCfg.over_shoot_pct)
+
+The following control aspects of vbr encoding
+
+- <b>--bias-pct</b> (\ref RateControlCfg.vbrbias)
+- <b>--minsection-pct</b> (\ref RateControlCfg.vbrmin_section)
+- <b>--maxsection-pct</b> (\ref RateControlCfg.vbrmax_section)
+
+The following relate to buffer and delay management in one pass low delay and
+real time coding
+
+- <b>--buf-sz</b> (\ref RateControlCfg.maximum_buffer_size_ms)
+- <b>--buf-initial-sz</b> (\ref RateControlCfg.starting_buffer_level_ms)
+- <b>--buf-optimal-sz</b> (\ref RateControlCfg.optimal_buffer_level_ms)
+
+\subsection architecture_enc_vbr Variable Bitrate (VBR) Encoding
+
+For streamed VOD content the most common rate control strategy is Variable
+Bitrate (VBR) encoding. The CQ mode mentioned above is a variant of this
+where additional quantizer and quality constraints are applied. VBR
+encoding may in theory be used in conjunction with either 1-pass or 2-pass
+encoding.
+
+VBR encoding varies the number of bits given to each frame or group of frames
+according to the difficulty of that frame or group of frames, such that easier
+frames are allocated fewer bits and harder frames are allocated more bits. The
+intent here is to even out the quality between frames. This contrasts with
+Constant Bitrate (CBR) encoding where each frame is allocated the same number
+of bits.
+
+Whilst for any given frame or group of frames the data rate may vary, the VBR
+algorithm attempts to deliver a given average bitrate over a wider time
+interval. In standard VBR encoding, the time interval over which the data rate
+is averaged is usually the duration of the video clip. An alternative
+approach is to target an average VBR bitrate over the entire video corpus for
+a particular video format (corpus VBR).
+
+\subsubsection architecture_enc_1pass_vbr 1 Pass VBR Encoding
+
+The command line for libaom does allow 1 Pass VBR, but this has not been
+properly optimised and behaves much like 1 pass CBR in most regards, with bits
+allocated to frames by the following functions:
+
+- \ref av1_calc_iframe_target_size_one_pass_vbr()
+- \ref av1_calc_pframe_target_size_one_pass_vbr()
+
+\subsubsection architecture_enc_2pass_vbr 2 Pass VBR Encoding
+
+The main focus here will be on 2-pass VBR encoding (and the related CQ mode)
+as these are the modes most commonly used for VOD content.
+
+2-pass encoding is selected on the command line by setting --passes=2
+(or -p 2).
+
+Generally speaking, in 2-pass encoding, an encoder will first encode a video
+using a default set of parameters and assumptions. Depending on the outcome
+of that first encode, the baseline assumptions and parameters will be adjusted
+to optimize the output during the second pass. In essence the first pass is a
+fact finding mission to establish the complexity and variability of the video,
+in order to allow a better allocation of bits in the second pass.
+
+The libaom 2-pass algorithm is unusual in that the first pass is not a full
+encode of the video. Rather it uses a limited set of prediction and transform
+options and a fixed quantizer, to generate statistics about each frame. No
+output bitstream is created and the per frame first pass statistics are stored
+entirely in volatile memory. This has some disadvantages when compared to a
+full first pass encode, but avoids the need for file I/O and improves speed.
+
+For two pass encoding, the function \ref av1_encode() will first be called
+for each frame in the video with the value \ref AV1EncoderConfig.pass = 1.
+This will result in calls to \ref av1_first_pass().
+
+Statistics for each frame are stored in \ref FIRSTPASS_STATS frame_stats_buf.
+
+After completion of the first pass, \ref av1_encode() will be called again for
+each frame with \ref AV1EncoderConfig.pass = 2. The frames are then encoded in
+accordance with the statistics gathered during the first pass by calls to
+\ref encode_frame_to_data_rate() which in turn calls
+ \ref av1_get_second_pass_params().
+
+In summary the second pass code :-
+
+- Searches for scene cuts (if auto key frame detection is enabled).
+- Defines the length of and hierarchical structure to be used in each
+ ARF/GF group.
+- Allocates bits based on the relative complexity of each frame, the quality
+ of frame to frame prediction and the type of frame (e.g. key frame, ARF
+ frame, golden frame or normal leaf frame).
+- Suggests a maximum Q (quantizer value) for each ARF/GF group, based on
+ estimated complexity and recent rate control compliance
+ (\ref RATE_CONTROL.active_worst_quality)
+- Tracks adherence to the overall rate control objectives and adjusts
+ heuristics.
+
+The main two pass functions in regard to the above include:-
+
+- \ref find_next_key_frame()
+- \ref define_gf_group()
+- \ref calculate_total_gf_group_bits()
+- \ref get_twopass_worst_quality()
+- \ref av1_gop_setup_structure()
+- \ref av1_gop_bit_allocation()
+- \ref av1_twopass_postencode_update()
+
+For each frame, the two pass algorithm defines a target number of bits
+\ref RATE_CONTROL.base_frame_target, which is then adjusted if necessary to
+reflect any undershoot or overshoot on previous frames to give
+\ref RATE_CONTROL.this_frame_target.
+
+As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also
+maintains a record of the actual Q value used to encode previous frames
+at each level in the current pyramid hierarchy
+(\ref PRIMARY_RATE_CONTROL.active_best_quality). The function
+\ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range
+for each frame.
+
+\subsubsection architecture_enc_1pass_lagged 1 Pass Lagged VBR Encoding
+
+1 pass lagged encode falls between simple 1 pass encoding and full two pass
+encoding and is used for cases where it is not possible to do a full first
+pass through the entire video clip, but where some delay is permissible. For
+example near live streaming where there is a delay of up to a few seconds. In
+this case the first pass and second pass are in effect combined such that the
+first pass starts encoding the clip and the second pass lags behind it by a
+few frames. When using this method, full sequence level statistics are not
+available, but it is possible to collect and use frame or group of frame level
+data to help in the allocation of bits and in defining ARF/GF coding
+hierarchies. The reader is referred to the \ref AV1_PRIMARY.lap_enabled field
+in the main compressor instance (where <b>lap</b> stands for
+<b>look ahead processing</b>). This encoding mode for the most part uses the
+same rate control pathways as two pass VBR encoding.
+
+\subsection architecture_enc_rc_loop The Main Rate Control Loop
+
+Having established a target rate for a given frame and an allowed range of Q
+values, the encoder then tries to encode the frame at a rate that is as close
+as possible to the target value, given the Q range constraints.
+
+There are two main mechanisms by which this is achieved.
+
+The first selects a frame level Q, using an adaptive estimate of the number of
+bits that will be generated when the frame is encoded at any given Q.
+Fundamentally this mechanism is common to VBR, CBR and to use cases such as
+RTC with small adjustments.
+
+As the Q value mainly adjusts the precision of the residual signal, it is not
+actually a reliable basis for accurately predicting the number of bits that
+will be generated across all clips. A well predicted clip, for example, may
+have a much smaller error residual after prediction. The algorithm copes with
+this by adapting its predictions on the fly using a feedback loop based on how
+well it did the previous time around.
+
+The main functions responsible for the prediction of Q and the adaptation over
+time, for the two pass encoding pipeline are:
+
+- \ref rc_pick_q_and_bounds()
+ - \ref get_q()
+ - \ref av1_rc_regulate_q()
+ - \ref get_rate_correction_factor()
+ - \ref set_rate_correction_factor()
+ - \ref find_closest_qindex_by_rate()
+- \ref av1_twopass_postencode_update()
+ - \ref av1_rc_update_rate_correction_factors()
+
+A second mechanism for control comes into play if there is a large rate miss
+for the current frame (much too big or too small). This is a recode mechanism
+which allows the current frame to be re-encoded one or more times with a
+revised Q value. This obviously has significant implications for encode speed
+and, in the case of RTC, latency (hence it is not used for the RTC pathway).
+
+Whether or not a recode is allowed for a given frame depends on the selected
+encode speed vs quality trade off. This is set on the command line using the
+--cpu-used parameter which maps onto the \ref AV1_COMP.speed field in the main
+compressor instance data structure.
+
+The value of \ref AV1_COMP.speed, combined with the use case, is used to
+populate the speed features data structure AV1_COMP.sf. In particular
+\ref HIGH_LEVEL_SPEED_FEATURES.recode_loop determines the types of frames that
+may be recoded and \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance is a rate
+error trigger threshold.
+
+For more information the reader is directed to the following functions:
+
+- \ref encode_with_recode_loop()
+- \ref encode_without_recode()
+- \ref recode_loop_update_q()
+- \ref recode_loop_test()
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+
+\subsection architecture_enc_fixed_q Fixed Q Mode
+
+There are two main fixed Q cases:
+-# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level
+ in a given video, but these offsets are adaptive based on video content.
+-# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for
+ each pyramid level.
+
+The reader is also referred to the following functions:
+- \ref av1_rc_pick_q_and_bounds()
+- \ref rc_pick_q_and_bounds_no_stats_cbr()
+- \ref rc_pick_q_and_bounds_no_stats()
+- \ref rc_pick_q_and_bounds()
+
+\section architecture_enc_frame_groups GF/ ARF Frame Groups & Hierarchical Coding
+
+\subsection architecture_enc_frame_groups_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+ - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+
+- \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass
+stats)
+
+\subsection architecture_enc_frame_groups_groups Frame Groups
+
+To process a sequence/stream of video frames, the encoder divides the frames
+into groups and encodes them sequentially (possibly dependent on previous
+groups). In AV1 such a group is usually referred to as a golden frame group
+(GF group) or sometimes an Alt-Ref (ARF) group or a group of pictures (GOP).
+A GF group determines and stores the coding structure of the frames (for
+example, frame type, usage of the hierarchical structure, usage of overlay
+frames, etc.) and can be considered as the base unit to process the frames,
+therefore playing an important role in the encoder.
+
+The length of a specific GF group is arguably the most important aspect when
+determining a GF group. This is because most GF group level decisions are
+based on the frame characteristics, if not on the length itself directly.
+Note that the GF group is always a group of consecutive frames, which means
+the start and end of the group (so again, the length of it) determines which
+frames are included in it and hence determines the characteristics of the GF
+group. Therefore, in this document we will first discuss the GF group length
+decision in Libaom, followed by frame structure decisions when defining a GF
+group with a certain length.
+
+\subsection architecture_enc_gf_length GF / ARF Group Length Determination
+
+The basic intuition of determining the GF group length is that it is usually
+desirable to group together frames that are similar. Hence, we may choose
+longer groups when consecutive frames are very alike and shorter ones when
+they are very different.
+
+The determination of the GF group length is done in function \ref
+calculate_gf_length(). The following encoder use cases are supported:
+
+<ul>
+ <li><b>Single pass with look-ahead disabled (\ref has_no_stats_stage()):
+ </b> in this case there is no information available on the following stream
+ of frames, therefore the function will set the GF group length for the
+ current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS
+ groups) to be the maximum value allowed.</li>
+
+ <li><b>Single pass with look-ahead enabled (\ref AV1_PRIMARY.lap_enabled):</b>
+ look-ahead processing is enabled for single pass, therefore there is a
+ limited amount of information available regarding future frames. In this
+ case the function will determine the length based on \ref FIRSTPASS_STATS
+ (which is generated when processing the look-ahead buffer) for only the
+ current GF group.</li>
+
+ <li><b>Two pass:</b> the first pass in two-pass encoding collects the stats
+ and will not call the function. In the second pass, the function tries to
+ determine the GF group length of the current and the following GF groups (a
+ total number of MAX_NUM_GF_INTERVALS groups) based on the first-pass
+ statistics. Note that as we will be discussing later, such decisions may not
+ be accurate and can be changed later.</li>
+</ul>
+
+Except for the first trivial case where there is no prior knowledge of the
+following frames, the function \ref calculate_gf_length() tries to determine the
+GF group length based on the first pass statistics. The determination is divided
+into two parts:
+
+<ol>
+ <li>Baseline decision based on accumulated statistics: this part of the function
+ iterates through the firstpass statistics of the following frames and
+ accumulates the statistics with function accumulate_next_frame_stats.
+ The accumulated statistics are then used to determine whether the
+ correlation in the GF group has dropped too much in function detect_gf_cut.
+ If detect_gf_cut returns non-zero, or if we've reached the end of
+ first-pass statistics, the baseline decision is set at the current point.</li>
+
+ <li>If we are not at the end of the first-pass statistics, the next part will
+ try to refine the baseline decision. This algorithm is based on the analysis
+ of firstpass stats. It tries to cut the groups in stable regions or
+ relatively stable points. Also it tries to avoid cutting in a blending
+ region.</li>
+</ol>
+
+As mentioned, for two-pass encoding, the function \ref
+calculate_gf_length() tries to determine the length of as many as
+MAX_NUM_GF_INTERVALS groups. The decisions are stored in
+\ref PRIMARY_RATE_CONTROL.gf_intervals[]. The variables
+\ref RATE_CONTROL.intervals_till_gf_calculate_due and
+\ref PRIMARY_RATE_CONTROL.gf_intervals[] help with managing and updating the stored
+decisions. In the function \ref define_gf_group(), the corresponding
+stored length decision will be used to define the current GF group.
+
+When the maximum GF group length is larger than or equal to 32, the encoder will
+enforce an extra layer to determine whether to use maximum GF length of 32
+or 16 for every GF group. In such a case, \ref calculate_gf_length() is
+first called with the original maximum length (>=32). Afterwards,
+\ref av1_tpl_setup_stats() is called to analyze the determined GF group
+and compare the reference to the last frame and the middle frame. If it is
+decided that we should use a maximum GF length of 16, the function
+\ref calculate_gf_length() is called again with the updated maximum
+length, and it only sets the length for a single GF group
+(\ref RATE_CONTROL.intervals_till_gf_calculate_due is set to 1). This process
+is shown below.
+
+\image html tplgfgroupdiagram.png "" width=40%
+
+Before encoding each frame, the encoder checks
+\ref RATE_CONTROL.frames_till_gf_update_due. If it is zero, indicating
+processing of the current GF group is done, the encoder will check whether
+\ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as
+discussed above, \ref calculate_gf_length() is called with original
+maximum length. If it is not zero, then the GF group length value stored
+in \ref PRIMARY_RATE_CONTROL.gf_intervals[\ref PRIMARY_RATE_CONTROL.cur_gf_index] is used
+(subject to change as discussed above).
+
+\subsection architecture_enc_gf_structure Defining a GF Group's Structure
+
+The function \ref define_gf_group() defines the frame structure as well
+as other GF group level parameters (e.g. bit allocation) once the length of
+the current GF group is determined.
+
+The function first iterates through the first pass statistics in the GF group to
+accumulate various stats, using accumulate_this_frame_stats() and
+accumulate_next_frame_stats(). The accumulated statistics are then used to
+determine the use of the ALTREF frame along with other properties of the
+GF group. The values of \ref PRIMARY_RATE_CONTROL.cur_gf_index, \ref
+RATE_CONTROL.intervals_till_gf_calculate_due and \ref
+RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.
+
+The function \ref av1_gop_setup_structure() is called at the end to determine
+the frame layers and reference maps in the GF group, where the
+construct_multi_layer_gf_structure() function sets the frame update types for
+each frame and the group structure.
+
+- If ALTREF frames are allowed for the GF group: the first frame is set to
+ KF_UPDATE, GF_UPDATE or ARF_UPDATE. The last frame of the GF group is set to
+ OVERLAY_UPDATE. Then in set_multi_layer_params(), frame update
+ types are determined recursively in a binary tree fashion, and assigned to
+ give the final IBBB structure for the group.
+    - If the current branch has more than 2 frames and we have not reached
+      maximum layer depth, then the middle frame is set as INTNL_ARF_UPDATE,
+      and the left and right branches are processed recursively.
+    - If the current branch has fewer than 3 frames, or we have reached
+      maximum layer depth, then every frame in the branch is set to
+      LF_UPDATE.
+
+- If ALTREF frame is not allowed for the GF group: the frames are set
+ as LF_UPDATE. This basically forms an IPPP GF group structure.
+
+As mentioned, the encoder may use Temporal dependency modelling (TPL - see \ref
+architecture_enc_tpl) to determine whether we should use a maximum length of 32
+or 16 for the current GF group. This requires calls to \ref define_gf_group()
+but should not change other settings (since it is in essence a trial). This
+special case is indicated by setting the parameter <b>is_final_pass</b> to
+zero.
+
+For single pass encodes where look-ahead processing is disabled
+(\ref AV1_PRIMARY.lap_enabled = 0), \ref define_gf_group_pass0() is used
+instead of \ref define_gf_group().
+
+\subsection architecture_enc_kf_groups Key Frame Groups
+
+A special constraint for GF group length is the location of the next keyframe
+(KF). The frames between two KFs are referred to as a KF group. Each KF group
+can be encoded and decoded independently. Because of this, a GF group cannot
+span beyond a KF and the location of the next KF is set as a hard boundary
+for GF group length.
+
+<ul>
+ <li>For two-pass encoding \ref RATE_CONTROL.frames_to_key controls when to
+ encode a key frame. When it is zero, the current frame is a keyframe and
+ the function \ref find_next_key_frame() is called. This in turn calls
+ \ref define_kf_interval() to work out where the next key frame should
+ be placed.</li>
+
+ <li>For single-pass with look-ahead enabled, \ref define_kf_interval()
+ is called whenever a GF group update is needed (when
+ \ref RATE_CONTROL.frames_till_gf_update_due is zero). This is because
+ generally KFs are more widely spaced and the look-ahead buffer is usually
+ not long enough.</li>
+
+ <li>For single-pass with look-ahead disabled, the KFs are placed according
+ to the command line parameter <b>--kf-max-dist</b> (The above two cases are
+ also subject to this constraint).</li>
+</ul>
+
+The function \ref define_kf_interval() tries to detect a scenecut.
+If a scenecut within kf-max-dist is detected, then it is set as the next
+keyframe. Otherwise the given maximum value is used.
+
+\section architecture_enc_tpl Temporal Dependency Modelling
+
+The temporal dependency model runs at the beginning of each GOP. It builds the
+motion trajectory within the GOP in units of 16x16 blocks. The temporal
+dependency of a 16x16 block is evaluated as the predictive coding gains it
+contributes to its trailing motion trajectory. This temporal dependency model
+reflects how important a coding block is for the coding efficiency of the
+overall GOP. It is hence used to scale the Lagrangian multiplier used in the
+rate-distortion optimization framework.
+
+\subsection architecture_enc_tpl_config Configurations
+
+The temporal dependency model and its applications are by default turned on in
+libaom encoder for the VoD use case. To disable it, use --tpl-model=0 in the
+aomenc configuration.
+
+\subsection architecture_enc_tpl_algoritms Algorithms
+
+The scheme works in the reverse frame processing order over the source frames,
+propagating information from future frames back to the current frame. For each
+frame, a propagation step is run for each MB. It operates as follows:
+
+<ul>
+ <li> Estimate the intra prediction cost in terms of sum of absolute Hadamard
+ transform difference (SATD) noted as intra_cost. It also loads the motion
+ information available from the first-pass encode and estimates the inter
+ prediction cost as inter_cost. Due to the use of hybrid inter/intra
+ prediction mode, the inter_cost value is further upper bounded by
+ intra_cost. A propagation cost variable is used to collect all the
+ information flowed back from future processing frames. It is initialized as
+ 0 for all the blocks in the last processing frame in a group of pictures
+ (GOP).</li>
+
+ <li> The fraction of information from a current block to be propagated towards
+ its reference block is estimated as:
+\f[
+ propagation\_fraction = (1 - inter\_cost/intra\_cost)
+\f]
+ It reflects how much the motion compensated reference would reduce the
+ prediction error in percentage.</li>
+
+ <li> The total amount of information the current block contributes to the GOP
+ is estimated as intra_cost + propagation_cost. The information that it
+ propagates towards its reference block is captured by:
+
+\f[
+ propagation\_amount =
+ (intra\_cost + propagation\_cost) * propagation\_fraction
+\f]</li>
+
+ <li> Note that the reference block may not necessarily sit on the grid of
+ 16x16 blocks. The propagation amount is hence dispensed to all the blocks
+ that overlap with the reference block. The corresponding block in the
+ reference frame accumulates its own propagation cost as it receives back
+ propagation.
+
+\f[
+ propagation\_cost = propagation\_cost +
+ (\frac{overlap\_area}{(16*16)} * propagation\_amount)
+\f]</li>
+
+ <li> In the final encoding stage, the distortion propagation factor of a block
+ is evaluated as \f$(1 + \frac{propagation\_cost}{intra\_cost})\f$, where the second term
+ captures its impact on later frames in a GOP.</li>
+
+ <li> The Lagrangian multiplier is adapted at the 64x64 block level. For every
+ 64x64 block in a frame, we have a distortion propagation factor:
+
+\f[
+ dist\_prop[i] = 1 + \frac{propagation\_cost[i]}{intra\_cost[i]}
+\f]
+
+ where i denotes the block index in the frame. We also have the frame level
+ distortion propagation factor:
+
+\f[
+ dist\_prop = 1 +
+ \frac{\sum_{i}propagation\_cost[i]}{\sum_{i}intra\_cost[i]}
+\f]
+
+ which is used to normalize the propagation factor at the 64x64 block level. The
+ Lagrangian multiplier is hence adapted as:
+
+\f[
+ \lambda[i] = \lambda[0] * \frac{dist\_prop}{dist\_prop[i]}
+\f]
+
+ where &lambda;[0] is the multiplier associated with the frame level QP. The
+ 64x64 block level QP is scaled according to the Lagrangian multiplier.</li>
+</ul>
+
+\subsection architecture_enc_tpl_keyfun Key Functions and data structures
+
+The reader is also referred to the following functions and data structures:
+
+- \ref TplParams
+- \ref av1_tpl_setup_stats() builds the TPL model.
+- \ref setup_delta_q() Assign different quantization parameters to each super
+ block based on its TPL weight.
+
+\section architecture_enc_partitions Block Partition Search
+
+ A frame is first split into tiles in \ref encode_tiles(), with each tile
+ compressed by av1_encode_tile(). Then a tile is processed in superblock rows
+ via \ref av1_encode_sb_row() and then \ref encode_sb_row().
+
+ The partition search processes superblocks sequentially in \ref
+ encode_sb_row(). Two search modes are supported, depending upon the encoding
+ configuration, \ref encode_nonrd_sb() is for 1-pass and real-time modes,
+ while \ref encode_rd_sb() performs more exhaustive rate distortion based
+ searches.
+
+ Partition search over the recursive quad-tree space is implemented by
+ recursive calls to \ref av1_nonrd_use_partition(),
+ \ref av1_rd_use_partition(), or av1_rd_pick_partition() and returning best
+ options for sub-trees to their parent partitions.
+
+ In libaom, the partition search lays on top of the mode search (predictor,
+ transform, etc.), instead of being a separate module. The interface of mode
+ search is \ref pick_sb_modes(), which connects the partition_search with
+ \ref architecture_enc_inter_modes and \ref architecture_enc_intra_modes. To
+ make good decisions, reconstruction is also required in order to build
+ references and contexts. This is implemented by \ref encode_sb() at the
+ sub-tree level and \ref encode_b() at coding block level.
+
+ See also \ref partition_search
+
+\section architecture_enc_intra_modes Intra Mode Search
+
+AV1 also provides 71 different intra prediction modes, i.e. modes that predict
+only based upon information in the current frame with no dependency on
+previous or future frames. For key frames, where this independence from any
+other frame is a defining requirement and for other cases where intra only
+frames are required, the encoder need only consider these modes in the rate
+distortion loop.
+
+Even so, in most use cases, searching all possible intra prediction modes for
+every block and partition size is not practical and some pruning of the search
+tree is necessary.
+
+For the Rate distortion optimized case, the main top level function
+responsible for selecting the intra prediction mode for a given block is
+\ref av1_rd_pick_intra_mode_sb(). The reader's attention is also drawn to the
+functions \ref hybrid_intra_mode_search() and \ref av1_nonrd_pick_intra_mode()
+which may be used where encode speed is critical. The choice between the
+rd path and the non rd or hybrid paths depends on the encoder use case and the
+\ref AV1_COMP.speed parameter. Further fine control of the speed vs quality
+trade off is provided by means of fields in \ref AV1_COMP.sf (which has type
+\ref SPEED_FEATURES).
+
+Note that some intra modes are only considered for specific use cases or
+types of video. For example the palette based prediction modes are often
+valuable for graphics or screen share content but not for natural video.
+(See \ref av1_search_palette_mode())
+
+See also \ref intra_mode_search for more details.
+
+\section architecture_enc_inter_modes Inter Prediction Mode Search
+
+For inter frames, where we also allow prediction using one or more previously
+coded frames (which may chronologically speaking be past or future frames or
+non-display reference buffers such as ARF frames), the size of the search tree
+that needs to be traversed, to select a prediction mode, is considerably more
+massive.
+
+In addition to the 71 possible intra modes we also need to consider 56 single
+frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC
+(overlapped block motion compensation)), 12768 compound inter prediction modes
+(these are modes that combine inter predictors from two reference frames) and
+36708 compound inter / intra prediction modes.
+
+As with the intra mode search, libaom supports an RD based pathway and a non
+rd pathway for speed critical use cases. The entry points for these two cases
+are \ref av1_rd_pick_inter_mode() and \ref av1_nonrd_pick_inter_mode_sb()
+respectively.
+
+Various heuristics and predictive strategies are used to prune the search tree
+with fine control provided through the speed features parameter in the main
+compressor instance data structure \ref AV1_COMP.sf.
+
+It is worth noting that some prediction modes incur a much larger rate cost
+than others (ignoring for now the cost of coding the error residual). For
+example, a compound mode that requires the encoder to specify two reference
+frames and two new motion vectors will almost inevitably have a higher rate
+cost than a simple inter prediction mode that uses a predicted or 0,0 motion
+vector. As such, if we have already found a mode for the current block that
+has a low RD cost, we can skip a large number of the possible modes on the
+basis that even if the error residual is 0 the inherent rate cost of the
+mode itself will guarantee that it is not chosen.
+
+See also \ref inter_mode_search for more details.
+
+\section architecture_enc_tx_search Transform Search
+
+AV1 implements the transform stage using 4 separable 1-d transforms (DCT,
+ADST, FLIPADST and IDTX, where FLIPADST is the reversed version of ADST
+and IDTX is the identity transform) which can be combined to give 16 2-d
+combinations.
+
+These combinations can be applied at 19 different scales from 64x64 pixels
+down to 4x4 pixels.
+
+This gives rise to a large number of possible candidate transform options
+for coding the residual error after prediction. An exhaustive rate-distortion
+based evaluation of all candidates would not be practical from a speed
+perspective in a production encoder implementation. Hence libaom adopts a
+number of strategies to prune the selection of both the transform size and
+transform type.
+
+There are a number of strategies that have been tested and implemented in
+libaom including:
+
+- A statistics based approach that looks at the frequency with which certain
+ combinations are used in a given context and prunes out very unlikely
+ candidates. It is worth noting here that some size candidates can be pruned
+ out immediately based on the size of the prediction partition. For example it
+ does not make sense to use a transform size that is larger than the
+ prediction partition size but also a very large prediction partition size is
+ unlikely to be optimally paired with small transforms.
+
+- A Machine learning based model
+
+- A method that initially tests candidates using a fast algorithm that skips
+ entropy encoding and uses an estimated cost model to choose a reduced subset
+ for full RD analysis. This subject is covered more fully in a paper authored
+ by Bohan Li, Jingning Han, and Yaowu Xu titled: <b>Fast Transform Type
+ Selection Using Conditional Laplace Distribution Based Rate Estimation</b>
+
+<b>TODO Add link to paper when available</b>
+
+See also \ref transform_search for more details.
+
+\section architecture_post_enc_filt Post Encode Loop Filtering
+
+AV1 supports three types of post encode <b>in loop</b> filtering to improve
+the quality of the reconstructed video.
+
+- <b>Deblocking Filter</b> The first of these is a fairly traditional boundary
+ deblocking filter that attempts to smooth discontinuities that may occur at
+ the boundaries between blocks. See also \ref in_loop_filter.
+
+- <b>CDEF Filter</b> The constrained directional enhancement filter (CDEF)
+ allows the codec to apply a non-linear deringing filter along certain
+ (potentially oblique) directions. A primary filter is applied along the
+ selected direction, whilst a secondary filter is applied at 45 degrees to
+ the primary direction. (See also \ref in_loop_cdef and
+ <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.)
+
+- <b>Loop Restoration Filter</b> The loop restoration filter is applied after
+ any prior post filtering stages. It acts on units of either 64 x 64,
+ 128 x 128, or 256 x 256 pixel blocks, referred to as loop restoration units.
+ Each unit can independently select either to bypass filtering, use a Wiener
+ filter, or use a self-guided filter. (See also \ref in_loop_restoration and
+ <a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.)
+
+\section architecture_entropy Entropy Coding
+
+\subsection architecture_entropy_aritmetic Arithmetic Coder
+
+VP9 used a binary arithmetic coder to encode symbols, where the probability
+of a 1 or 0 at each decision node was based on a context model that took
+into account recently coded values (for example previously coded coefficients
+in the current block). A mechanism existed to update the context model each
+frame, either explicitly in the bitstream, or implicitly at both the encoder
+and decoder based on the observed frequency of different outcomes in the
+previous frame. VP9 also supported separate context models for different types
+of frame (e.g. inter coded frames and key frames).
+
+In contrast, AV1 uses an M-ary symbol arithmetic coder to compress the syntax
+elements, where integer \f$M\in[2, 14]\f$. This approach is based upon the entropy
+coding strategy used in the Daala video codec and allows for some bit-level
+parallelism in its implementation. AV1 also has an extended context model and
+allows for updates to the probabilities on a per symbol basis as opposed to
+the per frame strategy in VP9.
+
+To improve the performance / throughput of the arithmetic encoder, especially
+in hardware implementations, the probability model is updated and maintained
+at 15-bit precision, but the arithmetic encoder only uses the most significant
+9 bits when encoding a symbol. A more detailed discussion of the algorithm
+and design constraints can be found in
+<a href="https://arxiv.org/abs/2008.06091"> A Technical Overview of AV1</a>.
+
+TODO add references to key functions / files.
+
+As with VP9, a mechanism exists in AV1 to encode some elements into the
+bitstream as uncompressed bits or literal values, without using the arithmetic
+coder. For example, some frame and sequence header values, where it is
+beneficial to be able to read the values directly.
+
+TODO add references to key functions / files.
+
+\subsection architecture_entropy_coef Transform Coefficient Coding and Optimization
+\image html coeff_coding.png "" width=70%
+
+\subsubsection architecture_entropy_coef_what Transform coefficient coding
+Transform coefficient coding is where the encoder compresses a quantized version
+of prediction residue into the bitstream.
+
+\paragraph architecture_entropy_coef_prepare Preparation - transform and quantize
+Before the entropy coding stage, the encoder decouples the pixel-to-pixel
+correlation of the prediction residue by transforming the residue from the
+spatial domain to the frequency domain. Then the encoder quantizes the transform
+coefficients to make the coefficients ready for entropy coding.
+
+\paragraph architecture_entropy_coef_coding The coding process
+The encoder uses \ref av1_write_coeffs_txb() to write the coefficients of
+a transform block into the bitstream.
+The coding process has three stages.
+1. The encoder will code transform block skip flag (txb_skip). If the skip flag is
+off, then the encoder will code the end of block position (eob) which is the scan
+index of the last non-zero coefficient plus one.
+2. Second, the encoder will code lower magnitude levels of each coefficient in
+reverse scan order.
+3. Finally, the encoder will code the sign and higher magnitude levels for each
+coefficient if they are available.
+
+Related functions:
+- \ref av1_write_coeffs_txb()
+- write_inter_txb_coeff()
+- \ref av1_write_intra_coeffs_mb()
+
+\paragraph architecture_entropy_coef_context Context information
+To improve the compression efficiency, the encoder uses several context models
+tailored for transform coefficients to capture the correlations between coding
+symbols. Most of the context models are built to capture the correlations
+between the coefficients within the same transform block. However, transform
+block skip flag (txb_skip) and the sign of dc coefficient (dc_sign) require
+context info from neighboring transform blocks.
+
+Here is how context info spreads between transform blocks. Before coding a
+transform block, the encoder will use get_txb_ctx() to collect the context
+information from neighboring transform blocks. Then the context information
+will be used for coding transform block skip flag (txb_skip) and the sign of
+dc coefficient (dc_sign). After the transform block is coded, the encoder will
+extract the context info from the current block using
+\ref av1_get_txb_entropy_context(). Then the encoder will store the context info
+into a byte (uint8_t) using av1_set_entropy_contexts(). The encoder will use
+the context info to code other transform blocks.
+
+Related functions:
+- \ref av1_get_txb_entropy_context()
+- av1_set_entropy_contexts()
+- get_txb_ctx()
+- \ref av1_update_intra_mb_txb_context()
+
+\subsubsection architecture_entropy_coef_rd RD optimization
+Besides the actual entropy coding, the encoder uses several utility functions
+to make optimal RD decisions.
+
+\paragraph architecture_entropy_coef_cost Entropy cost
+The encoder uses \ref av1_cost_coeffs_txb() or \ref av1_cost_coeffs_txb_laplacian()
+to estimate the entropy cost of a transform block. Note that
+\ref av1_cost_coeffs_txb() is slower but accurate whereas
+\ref av1_cost_coeffs_txb_laplacian() is faster but less accurate.
+
+Related functions:
+- \ref av1_cost_coeffs_txb()
+- \ref av1_cost_coeffs_txb_laplacian()
+- \ref av1_cost_coeffs_txb_estimate()
+
+\paragraph architecture_entropy_coef_opt Quantized level optimization
+Besides computing entropy cost, the encoder also uses \ref av1_optimize_txb()
+to adjust the coefficient’s quantized levels to achieve optimal RD trade-off.
+In \ref av1_optimize_txb(), the encoder goes through each quantized
+coefficient and lowers the quantized coefficient level by one if the action
+yields a better RD score.
+
+Related functions:
+- \ref av1_optimize_txb()
+
+All the related functions are listed in \ref coefficient_coding.
+
+*/
+
+/*!\defgroup encoder_algo Encoder Algorithm
+ *
+ * The encoder algorithm describes how a sequence is encoded, including
+ * high-level decisions as well as the algorithms used at every encoding stage.
+ */
+
+/*!\defgroup high_level_algo High-level Algorithm
+ * \ingroup encoder_algo
+ * This module describes sequence level/frame level algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+
+/*!\defgroup speed_features Speed vs Quality Trade Off
+ * \ingroup high_level_algo
+ * This module describes the encode speed vs quality tradeoff
+ * @{
+ */
+/*! @} - end defgroup speed_features */
+
+/*!\defgroup src_frame_proc Source Frame Processing
+ * \ingroup high_level_algo
+ * This module describes algorithms in AV1 associated with the
+ * pre-processing of source frames. See also \ref architecture_enc_src_proc
+ *
+ * @{
+ */
+/*! @} - end defgroup src_frame_proc */
+
+/*!\defgroup rate_control Rate Control
+ * \ingroup high_level_algo
+ * This module describes rate control algorithm in AV1.
+ * See also \ref architecture_enc_rate_ctrl
+ * @{
+ */
+/*! @} - end defgroup rate_control */
+
+/*!\defgroup tpl_modelling Temporal Dependency Modelling
+ * \ingroup high_level_algo
+ * This module includes algorithms to implement temporal dependency modelling.
+ * See also \ref architecture_enc_tpl
+ * @{
+ */
+/*! @} - end defgroup tpl_modelling */
+
+/*!\defgroup two_pass_algo Two Pass Mode
+ \ingroup high_level_algo
+
+ In two pass mode, the input file is passed into the encoder for a quick
+ first pass, where statistics are gathered. These statistics and the input
+ file are then passed back into the encoder for a second pass. The statistics
+ help the encoder reach the desired bitrate without as much overshooting or
+ undershooting.
+
+ During the first pass, the codec will return "stats" packets that contain
+ information useful for the second pass. The caller should concatenate these
+ packets as they are received. In the second pass, the concatenated packets
+ are passed in, along with the frames to encode. During the second pass,
+ "frame" packets are returned that represent the compressed video.
+
+ A complete example can be found in `examples/twopass_encoder.c`. Pseudocode
+ is provided below to illustrate the core parts.
+
+ During the first pass, the uncompressed frames are passed in and stats
+ information is appended to a byte array.
+
+~~~~~~~~~~~~~~~{.c}
+// For simplicity, assume that there is enough memory in the stats buffer.
+// Actual code will want to use a resizable array. stats_len represents
+// the length of data already present in the buffer.
+void get_stats_data(aom_codec_ctx_t *encoder, char *stats,
+ size_t *stats_len, bool *got_data) {
+ const aom_codec_cx_pkt_t *pkt;
+ aom_codec_iter_t iter = NULL;
+ while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+ *got_data = true;
+ if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+ memcpy(stats + *stats_len, pkt->data.twopass_stats.buf,
+ pkt->data.twopass_stats.sz);
+ *stats_len += pkt->data.twopass_stats.sz;
+ }
+}
+
+void first_pass(char *stats, size_t *stats_len) {
+ struct aom_codec_enc_cfg first_pass_cfg;
+ ... // Initialize the config as needed.
+ first_pass_cfg.g_pass = AOM_RC_FIRST_PASS;
+ aom_codec_ctx_t first_pass_encoder;
+ ... // Initialize the encoder.
+
+ bool got_data;
+ while (frame_available) {
+ // Read in the uncompressed frame, update frame_available
+ aom_image_t *frame_to_encode = ...;
+ aom_codec_encode(&first_pass_encoder, frame_to_encode, pts, duration, flags);
+ get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+ }
+ // After all frames have been processed, call aom_codec_encode with
+ // a NULL ptr repeatedly, until no more data is returned. The NULL
+ // ptr tells the encoder that no more frames are available.
+ do {
+ got_data = false;
+ aom_codec_encode(&first_pass_encoder, NULL, pts, duration, flags);
+ get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+ } while (got_data);
+
+ aom_codec_destroy(&first_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+
+ During the second pass, the uncompressed frames and the stats are
+ passed into the encoder.
+
+~~~~~~~~~~~~~~~{.c}
+// Write out each encoded frame to the file.
+void get_cx_data(aom_codec_ctx_t *encoder, FILE *file,
+ bool *got_data) {
+ const aom_codec_cx_pkt_t *pkt;
+ aom_codec_iter_t iter = NULL;
+ while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+ *got_data = true;
+ if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+ fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, file);
+ }
+}
+
+void second_pass(char *stats, size_t stats_len) {
+ struct aom_codec_enc_cfg second_pass_cfg;
+ ... // Initialize the config as needed.
+ second_pass_cfg.g_pass = AOM_RC_LAST_PASS;
+ second_pass_cfg.rc_twopass_stats_in.buf = stats;
+ second_pass_cfg.rc_twopass_stats_in.sz = stats_len;
+ aom_codec_ctx_t second_pass_encoder;
+ ... // Initialize the encoder from the config.
+
+ FILE *output = fopen("output.obu", "wb");
+ bool got_data;
+ while (frame_available) {
+ // Read in the uncompressed frame, update frame_available
+ aom_image_t *frame_to_encode = ...;
+ aom_codec_encode(&second_pass_encoder, frame_to_encode, pts, duration, flags);
+ get_cx_data(&second_pass_encoder, output, &got_data);
+ }
+ // Pass in NULL to flush the encoder.
+ do {
+ got_data = false;
+ aom_codec_encode(&second_pass_encoder, NULL, pts, duration, flags);
+ get_cx_data(&second_pass_encoder, output, &got_data);
+ } while (got_data);
+
+ aom_codec_destroy(&second_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+ */
+
+ /*!\defgroup look_ahead_buffer The Look-Ahead Buffer
+ \ingroup high_level_algo
+
+ A program should call \ref aom_codec_encode() for each frame that needs
+ processing. These frames are internally copied and stored in a fixed-size
+ circular buffer, known as the look-ahead buffer. Other parts of the code
+ will use future frame information to inform current frame decisions;
+ examples include the first-pass algorithm, TPL model, and temporal filter.
+ Note that this buffer also keeps a reference to the last source frame.
+
+ The look-ahead buffer is defined in \ref av1/encoder/lookahead.h. It acts as an
+ opaque structure, with an interface to create and free memory associated with
+ it. It supports pushing and popping frames onto the structure in a FIFO
+ fashion. It also allows look-ahead when using the \ref av1_lookahead_peek()
+ function with a non-negative number, and look-behind when -1 is passed in (for
+ the last source frame; e.g., firstpass will use this for motion estimation).
+ The \ref av1_lookahead_depth() function returns the current number of frames
+ stored in it. Note that \ref av1_lookahead_pop() is a bit of a misnomer - it
+ only pops if either the "flush" variable is set, or the buffer is at maximum
+ capacity.
+
+ The buffer is stored in the \ref AV1_PRIMARY::lookahead field.
+ It is initialized in the first call to \ref aom_codec_encode(), in the
+ \ref av1_receive_raw_frame() sub-routine. The buffer size is defined by
+ the \ref aom_codec_enc_cfg_t::g_lag_in_frames field of the encoder
+ configuration passed in when the encoder is initialized.
+ This can be modified manually but should only be set once. On the command
+ line, the flag "--lag-in-frames" controls it. The default size is 19 for
+ non-realtime usage and 1 for realtime. Note that a maximum value of 35 is
+ enforced.
+
+ A frame will stay in the buffer as long as possible. As mentioned above,
+ the \ref av1_lookahead_pop() only removes a frame when either flush is set,
+ or the buffer is full. Note that each call to \ref aom_codec_encode() inserts
+ another frame into the buffer, and pop is called by the sub-function
+ \ref av1_encode_strategy(). The buffer is told to flush when
+ \ref aom_codec_encode() is passed a NULL image pointer. Note that the caller
+ must repeatedly call \ref aom_codec_encode() with a NULL image pointer, until
+ no more packets are available, in order to fully flush the buffer.
+
+ */
+
+/*! @} - end defgroup high_level_algo */
+
+/*!\defgroup partition_search Partition Search
+ * \ingroup encoder_algo
+ * For an overview of the partition search see \ref architecture_enc_partitions
+ * @{
+ */
+
+/*! @} - end defgroup partition_search */
+
+/*!\defgroup intra_mode_search Intra Mode Search
+ * \ingroup encoder_algo
+ * This module describes intra mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup intra_mode_search */
+
+/*!\defgroup inter_mode_search Inter Mode Search
+ * \ingroup encoder_algo
+ * This module describes inter mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup inter_mode_search */
+
+/*!\defgroup palette_mode_search Palette Mode Search
+ * \ingroup intra_mode_search
+ * This module describes palette mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup palette_mode_search */
+
+/*!\defgroup transform_search Transform Search
+ * \ingroup encoder_algo
+ * This module describes transform search algorithm in AV1.
+ * @{
+ */
+/*! @} - end defgroup transform_search */
+
+/*!\defgroup coefficient_coding Transform Coefficient Coding and Optimization
+ * \ingroup encoder_algo
+ * This module describes the algorithms of transform coefficient coding and optimization in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup coefficient_coding */
+
+/*!\defgroup in_loop_filter In-loop Filter
+ * \ingroup encoder_algo
+ * This module describes in-loop filter algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_filter */
+
+/*!\defgroup in_loop_cdef CDEF
+ * \ingroup encoder_algo
+ * This module describes the CDEF parameter search algorithm
+ * in AV1. More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_cdef */
+
+/*!\defgroup in_loop_restoration Loop Restoration
+ * \ingroup encoder_algo
+ * This module describes the loop restoration search
+ * and estimation algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup cyclic_refresh Cyclic Refresh
+ * \ingroup encoder_algo
+ * This module describes the cyclic refresh (aq-mode=3) in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup cyclic_refresh */
+
+/*!\defgroup SVC Scalable Video Coding
+ * \ingroup encoder_algo
+ * This module describes scalable video coding algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup SVC */
+/*!\defgroup variance_partition Variance Partition
+ * \ingroup encoder_algo
+ * This module describes variance partition algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup variance_partition */
+/*!\defgroup nonrd_mode_search NonRD Optimized Mode Search
+ * \ingroup encoder_algo
+ * This module describes NonRD Optimized Mode Search used in Real-Time mode.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup nonrd_mode_search */
diff --git a/third_party/aom/doc/dev_guide/av1encoderflow.png b/third_party/aom/doc/dev_guide/av1encoderflow.png
new file mode 100644
index 0000000000..5e69fce39c
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/av1encoderflow.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/av1partitions.png b/third_party/aom/doc/dev_guide/av1partitions.png
new file mode 100644
index 0000000000..125439f5cb
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/av1partitions.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/coeff_coding.png b/third_party/aom/doc/dev_guide/coeff_coding.png
new file mode 100644
index 0000000000..cba97dd712
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/coeff_coding.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/filter_flow.png b/third_party/aom/doc/dev_guide/filter_flow.png
new file mode 100644
index 0000000000..82849a0666
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/filter_flow.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/filter_thr.png b/third_party/aom/doc/dev_guide/filter_thr.png
new file mode 100644
index 0000000000..b833e941f6
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/filter_thr.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/genericcodecflow.png b/third_party/aom/doc/dev_guide/genericcodecflow.png
new file mode 100644
index 0000000000..65a6b2f19e
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/genericcodecflow.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/gf_group.png b/third_party/aom/doc/dev_guide/gf_group.png
new file mode 100644
index 0000000000..1cd47d2490
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/gf_group.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/partition.png b/third_party/aom/doc/dev_guide/partition.png
new file mode 100644
index 0000000000..914d6c2fd0
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/partition.png
Binary files differ
diff --git a/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png b/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png
new file mode 100644
index 0000000000..fa5b0671c2
--- /dev/null
+++ b/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png
Binary files differ
diff --git a/third_party/aom/doc/img/edge_direction.svg b/third_party/aom/doc/img/edge_direction.svg
new file mode 100644
index 0000000000..343a2b9f60
--- /dev/null
+++ b/third_party/aom/doc/img/edge_direction.svg
@@ -0,0 +1,6319 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="9.25333in" height="8.04538in"
+ viewBox="0 0 666.24 579.267" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:#000000;font-family:Calibri;font-size:0.75em}
+ .st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+ .st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+ .st7 {font-size:1em;font-style:normal}
+ .st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <v:layer v:name="Connector" v:index="0"/>
+ <g id="shape111-1" v:mID="111" v:groupContext="shape" transform="translate(18.12,-468.375)">
+ <title>Square</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape113-4" v:mID="113" v:groupContext="shape" transform="translate(36.12,-468.375)">
+ <title>Square.113</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape114-7" v:mID="114" v:groupContext="shape" transform="translate(54.12,-468.375)">
+ <title>Square.114</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape115-10" v:mID="115" v:groupContext="shape" transform="translate(72.12,-468.375)">
+ <title>Square.115</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape116-13" v:mID="116" v:groupContext="shape" transform="translate(18.12,-450.375)">
+ <title>Square.116</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape117-16" v:mID="117" v:groupContext="shape" transform="translate(36.12,-450.375)">
+ <title>Square.117</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape118-19" v:mID="118" v:groupContext="shape" transform="translate(54.12,-450.375)">
+ <title>Square.118</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape119-22" v:mID="119" v:groupContext="shape" transform="translate(72.12,-450.375)">
+ <title>Square.119</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape124-25" v:mID="124" v:groupContext="shape" transform="translate(18.12,-432.375)">
+ <title>Square.124</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape125-28" v:mID="125" v:groupContext="shape" transform="translate(36.12,-432.375)">
+ <title>Square.125</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape126-31" v:mID="126" v:groupContext="shape" transform="translate(54.12,-432.375)">
+ <title>Square.126</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape127-34" v:mID="127" v:groupContext="shape" transform="translate(72.12,-432.375)">
+ <title>Square.127</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape128-37" v:mID="128" v:groupContext="shape" transform="translate(18.12,-414.375)">
+ <title>Square.128</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape129-40" v:mID="129" v:groupContext="shape" transform="translate(36.12,-414.375)">
+ <title>Square.129</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape130-43" v:mID="130" v:groupContext="shape" transform="translate(54.12,-414.375)">
+ <title>Square.130</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape131-46" v:mID="131" v:groupContext="shape" transform="translate(72.12,-414.375)">
+ <title>Square.131</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape132-49" v:mID="132" v:groupContext="shape" transform="translate(18.12,-396.375)">
+ <title>Square.132</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape133-52" v:mID="133" v:groupContext="shape" transform="translate(36.12,-396.375)">
+ <title>Square.133</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape134-55" v:mID="134" v:groupContext="shape" transform="translate(54.12,-396.375)">
+ <title>Square.134</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape135-58" v:mID="135" v:groupContext="shape" transform="translate(72.12,-396.375)">
+ <title>Square.135</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape136-61" v:mID="136" v:groupContext="shape" transform="translate(18.12,-378.375)">
+ <title>Square.136</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape137-64" v:mID="137" v:groupContext="shape" transform="translate(36.12,-378.375)">
+ <title>Square.137</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape138-67" v:mID="138" v:groupContext="shape" transform="translate(54.12,-378.375)">
+ <title>Square.138</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape139-70" v:mID="139" v:groupContext="shape" transform="translate(72.12,-378.375)">
+ <title>Square.139</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape140-73" v:mID="140" v:groupContext="shape" transform="translate(18.12,-360.375)">
+ <title>Square.140</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape141-76" v:mID="141" v:groupContext="shape" transform="translate(36.12,-360.375)">
+ <title>Square.141</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape142-79" v:mID="142" v:groupContext="shape" transform="translate(54.12,-360.375)">
+ <title>Square.142</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape143-82" v:mID="143" v:groupContext="shape" transform="translate(72.12,-360.375)">
+ <title>Square.143</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape144-85" v:mID="144" v:groupContext="shape" transform="translate(18.12,-342.375)">
+ <title>Square.144</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape145-88" v:mID="145" v:groupContext="shape" transform="translate(36.12,-342.375)">
+ <title>Square.145</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape146-91" v:mID="146" v:groupContext="shape" transform="translate(54.12,-342.375)">
+ <title>Square.146</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape147-94" v:mID="147" v:groupContext="shape" transform="translate(72.12,-342.375)">
+ <title>Square.147</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape148-97" v:mID="148" v:groupContext="shape" transform="translate(90.12,-468.375)">
+ <title>Square.148</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape149-100" v:mID="149" v:groupContext="shape" transform="translate(108.12,-468.375)">
+ <title>Square.149</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape150-103" v:mID="150" v:groupContext="shape" transform="translate(126.12,-468.375)">
+ <title>Square.150</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape151-106" v:mID="151" v:groupContext="shape" transform="translate(144.12,-468.375)">
+ <title>Square.151</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape152-109" v:mID="152" v:groupContext="shape" transform="translate(90.12,-450.375)">
+ <title>Square.152</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape153-112" v:mID="153" v:groupContext="shape" transform="translate(108.12,-450.375)">
+ <title>Square.153</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape154-115" v:mID="154" v:groupContext="shape" transform="translate(126.12,-450.375)">
+ <title>Square.154</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape155-118" v:mID="155" v:groupContext="shape" transform="translate(144.12,-450.375)">
+ <title>Square.155</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape156-121" v:mID="156" v:groupContext="shape" transform="translate(90.12,-432.375)">
+ <title>Square.156</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape157-124" v:mID="157" v:groupContext="shape" transform="translate(108.12,-432.375)">
+ <title>Square.157</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape158-127" v:mID="158" v:groupContext="shape" transform="translate(126.12,-432.375)">
+ <title>Square.158</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape159-130" v:mID="159" v:groupContext="shape" transform="translate(144.12,-432.375)">
+ <title>Square.159</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape160-133" v:mID="160" v:groupContext="shape" transform="translate(90.12,-414.375)">
+ <title>Square.160</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape161-136" v:mID="161" v:groupContext="shape" transform="translate(108.12,-414.375)">
+ <title>Square.161</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape162-139" v:mID="162" v:groupContext="shape" transform="translate(126.12,-414.375)">
+ <title>Square.162</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape163-142" v:mID="163" v:groupContext="shape" transform="translate(144.12,-414.375)">
+ <title>Square.163</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape164-145" v:mID="164" v:groupContext="shape" transform="translate(90.12,-396.375)">
+ <title>Square.164</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape165-148" v:mID="165" v:groupContext="shape" transform="translate(108.12,-396.375)">
+ <title>Square.165</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape166-151" v:mID="166" v:groupContext="shape" transform="translate(126.12,-396.375)">
+ <title>Square.166</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape167-154" v:mID="167" v:groupContext="shape" transform="translate(144.12,-396.375)">
+ <title>Square.167</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape168-157" v:mID="168" v:groupContext="shape" transform="translate(90.12,-378.375)">
+ <title>Square.168</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape169-160" v:mID="169" v:groupContext="shape" transform="translate(108.12,-378.375)">
+ <title>Square.169</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape170-163" v:mID="170" v:groupContext="shape" transform="translate(126.12,-378.375)">
+ <title>Square.170</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape171-166" v:mID="171" v:groupContext="shape" transform="translate(144.12,-378.375)">
+ <title>Square.171</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape172-169" v:mID="172" v:groupContext="shape" transform="translate(90.12,-360.375)">
+ <title>Square.172</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape173-172" v:mID="173" v:groupContext="shape" transform="translate(108.12,-360.375)">
+ <title>Square.173</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape174-175" v:mID="174" v:groupContext="shape" transform="translate(126.12,-360.375)">
+ <title>Square.174</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape175-178" v:mID="175" v:groupContext="shape" transform="translate(144.12,-360.375)">
+ <title>Square.175</title>
+ <desc>13</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text> </g>
+ <g id="shape176-181" v:mID="176" v:groupContext="shape" transform="translate(90.12,-342.375)">
+ <title>Square.176</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape177-184" v:mID="177" v:groupContext="shape" transform="translate(108.12,-342.375)">
+ <title>Square.177</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape178-187" v:mID="178" v:groupContext="shape" transform="translate(126.12,-342.375)">
+ <title>Square.178</title>
+ <desc>13</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text> </g>
+ <g id="shape179-190" v:mID="179" v:groupContext="shape" transform="translate(144.12,-342.375)">
+ <title>Square.179</title>
+ <desc>14</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text> </g>
+ <g id="shape180-193" v:mID="180" v:groupContext="shape" transform="translate(180.12,-468.375)">
+ <title>Square.180</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape181-196" v:mID="181" v:groupContext="shape" transform="translate(198.12,-468.375)">
+ <title>Square.181</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape182-199" v:mID="182" v:groupContext="shape" transform="translate(216.12,-468.375)">
+ <title>Square.182</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape183-202" v:mID="183" v:groupContext="shape" transform="translate(234.12,-468.375)">
+ <title>Square.183</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape184-205" v:mID="184" v:groupContext="shape" transform="translate(180.12,-450.375)">
+ <title>Square.184</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape185-208" v:mID="185" v:groupContext="shape" transform="translate(198.12,-450.375)">
+ <title>Square.185</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape186-211" v:mID="186" v:groupContext="shape" transform="translate(216.12,-450.375)">
+ <title>Square.186</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape187-214" v:mID="187" v:groupContext="shape" transform="translate(234.12,-450.375)">
+ <title>Square.187</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape188-217" v:mID="188" v:groupContext="shape" transform="translate(180.12,-432.375)">
+ <title>Square.188</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape189-220" v:mID="189" v:groupContext="shape" transform="translate(198.12,-432.375)">
+ <title>Square.189</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape190-223" v:mID="190" v:groupContext="shape" transform="translate(216.12,-432.375)">
+ <title>Square.190</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape191-226" v:mID="191" v:groupContext="shape" transform="translate(234.12,-432.375)">
+ <title>Square.191</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape192-229" v:mID="192" v:groupContext="shape" transform="translate(180.12,-414.375)">
+ <title>Square.192</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape193-232" v:mID="193" v:groupContext="shape" transform="translate(198.12,-414.375)">
+ <title>Square.193</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape194-235" v:mID="194" v:groupContext="shape" transform="translate(216.12,-414.375)">
+ <title>Square.194</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape195-238" v:mID="195" v:groupContext="shape" transform="translate(234.12,-414.375)">
+ <title>Square.195</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape196-241" v:mID="196" v:groupContext="shape" transform="translate(180.12,-396.375)">
+ <title>Square.196</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape197-244" v:mID="197" v:groupContext="shape" transform="translate(198.12,-396.375)">
+ <title>Square.197</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape198-247" v:mID="198" v:groupContext="shape" transform="translate(216.12,-396.375)">
+ <title>Square.198</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape199-250" v:mID="199" v:groupContext="shape" transform="translate(234.12,-396.375)">
+ <title>Square.199</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape200-253" v:mID="200" v:groupContext="shape" transform="translate(180.12,-378.375)">
+ <title>Square.200</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape201-256" v:mID="201" v:groupContext="shape" transform="translate(198.12,-378.375)">
+ <title>Square.201</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape202-259" v:mID="202" v:groupContext="shape" transform="translate(216.12,-378.375)">
+ <title>Square.202</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape203-262" v:mID="203" v:groupContext="shape" transform="translate(234.12,-378.375)">
+ <title>Square.203</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape204-265" v:mID="204" v:groupContext="shape" transform="translate(180.12,-360.375)">
+ <title>Square.204</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape205-268" v:mID="205" v:groupContext="shape" transform="translate(198.12,-360.375)">
+ <title>Square.205</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape206-271" v:mID="206" v:groupContext="shape" transform="translate(216.12,-360.375)">
+ <title>Square.206</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape207-274" v:mID="207" v:groupContext="shape" transform="translate(234.12,-360.375)">
+ <title>Square.207</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape208-277" v:mID="208" v:groupContext="shape" transform="translate(180.12,-342.375)">
+ <title>Square.208</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape209-280" v:mID="209" v:groupContext="shape" transform="translate(198.12,-342.375)">
+ <title>Square.209</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape210-283" v:mID="210" v:groupContext="shape" transform="translate(216.12,-342.375)">
+ <title>Square.210</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape211-286" v:mID="211" v:groupContext="shape" transform="translate(234.12,-342.375)">
+ <title>Square.211</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape212-289" v:mID="212" v:groupContext="shape" transform="translate(252.12,-468.375)">
+ <title>Square.212</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape213-292" v:mID="213" v:groupContext="shape" transform="translate(270.12,-468.375)">
+ <title>Square.213</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape214-295" v:mID="214" v:groupContext="shape" transform="translate(288.12,-468.375)">
+ <title>Square.214</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape215-298" v:mID="215" v:groupContext="shape" transform="translate(306.12,-468.375)">
+ <title>Square.215</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape216-301" v:mID="216" v:groupContext="shape" transform="translate(252.12,-450.375)">
+ <title>Square.216</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape217-304" v:mID="217" v:groupContext="shape" transform="translate(270.12,-450.375)">
+ <title>Square.217</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape218-307" v:mID="218" v:groupContext="shape" transform="translate(288.12,-450.375)">
+ <title>Square.218</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape219-310" v:mID="219" v:groupContext="shape" transform="translate(306.12,-450.375)">
+ <title>Square.219</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape220-313" v:mID="220" v:groupContext="shape" transform="translate(252.12,-432.375)">
+ <title>Square.220</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape221-316" v:mID="221" v:groupContext="shape" transform="translate(270.12,-432.375)">
+ <title>Square.221</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape222-319" v:mID="222" v:groupContext="shape" transform="translate(288.12,-432.375)">
+ <title>Square.222</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape223-322" v:mID="223" v:groupContext="shape" transform="translate(306.12,-432.375)">
+ <title>Square.223</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape224-325" v:mID="224" v:groupContext="shape" transform="translate(252.12,-414.375)">
+ <title>Square.224</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape225-328" v:mID="225" v:groupContext="shape" transform="translate(270.12,-414.375)">
+ <title>Square.225</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape226-331" v:mID="226" v:groupContext="shape" transform="translate(288.12,-414.375)">
+ <title>Square.226</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape227-334" v:mID="227" v:groupContext="shape" transform="translate(306.12,-414.375)">
+ <title>Square.227</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape228-337" v:mID="228" v:groupContext="shape" transform="translate(252.12,-396.375)">
+ <title>Square.228</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape229-340" v:mID="229" v:groupContext="shape" transform="translate(270.12,-396.375)">
+ <title>Square.229</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape230-343" v:mID="230" v:groupContext="shape" transform="translate(288.12,-396.375)">
+ <title>Square.230</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape231-346" v:mID="231" v:groupContext="shape" transform="translate(306.12,-396.375)">
+ <title>Square.231</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape232-349" v:mID="232" v:groupContext="shape" transform="translate(252.12,-378.375)">
+ <title>Square.232</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape233-352" v:mID="233" v:groupContext="shape" transform="translate(270.12,-378.375)">
+ <title>Square.233</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape234-355" v:mID="234" v:groupContext="shape" transform="translate(288.12,-378.375)">
+ <title>Square.234</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape235-358" v:mID="235" v:groupContext="shape" transform="translate(306.12,-378.375)">
+ <title>Square.235</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape236-361" v:mID="236" v:groupContext="shape" transform="translate(252.12,-360.375)">
+ <title>Square.236</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape237-364" v:mID="237" v:groupContext="shape" transform="translate(270.12,-360.375)">
+ <title>Square.237</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape238-367" v:mID="238" v:groupContext="shape" transform="translate(288.12,-360.375)">
+ <title>Square.238</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape239-370" v:mID="239" v:groupContext="shape" transform="translate(306.12,-360.375)">
+ <title>Square.239</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape240-373" v:mID="240" v:groupContext="shape" transform="translate(252.12,-342.375)">
+ <title>Square.240</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape241-376" v:mID="241" v:groupContext="shape" transform="translate(270.12,-342.375)">
+ <title>Square.241</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape242-379" v:mID="242" v:groupContext="shape" transform="translate(288.12,-342.375)">
+ <title>Square.242</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape243-382" v:mID="243" v:groupContext="shape" transform="translate(306.12,-342.375)">
+ <title>Square.243</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape244-385" v:mID="244" v:groupContext="shape" transform="translate(342.12,-468.375)">
+ <title>Square.244</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape245-388" v:mID="245" v:groupContext="shape" transform="translate(360.12,-468.375)">
+ <title>Square.245</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape246-391" v:mID="246" v:groupContext="shape" transform="translate(378.12,-468.375)">
+ <title>Square.246</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape247-394" v:mID="247" v:groupContext="shape" transform="translate(396.12,-468.375)">
+ <title>Square.247</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape248-397" v:mID="248" v:groupContext="shape" transform="translate(342.12,-450.375)">
+ <title>Square.248</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape249-400" v:mID="249" v:groupContext="shape" transform="translate(360.12,-450.375)">
+ <title>Square.249</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape250-403" v:mID="250" v:groupContext="shape" transform="translate(378.12,-450.375)">
+ <title>Square.250</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape251-406" v:mID="251" v:groupContext="shape" transform="translate(396.12,-450.375)">
+ <title>Square.251</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape252-409" v:mID="252" v:groupContext="shape" transform="translate(342.12,-432.375)">
+ <title>Square.252</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape253-412" v:mID="253" v:groupContext="shape" transform="translate(360.12,-432.375)">
+ <title>Square.253</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape254-415" v:mID="254" v:groupContext="shape" transform="translate(378.12,-432.375)">
+ <title>Square.254</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape255-418" v:mID="255" v:groupContext="shape" transform="translate(396.12,-432.375)">
+ <title>Square.255</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape256-421" v:mID="256" v:groupContext="shape" transform="translate(342.12,-414.375)">
+ <title>Square.256</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape257-424" v:mID="257" v:groupContext="shape" transform="translate(360.12,-414.375)">
+ <title>Square.257</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape258-427" v:mID="258" v:groupContext="shape" transform="translate(378.12,-414.375)">
+ <title>Square.258</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape259-430" v:mID="259" v:groupContext="shape" transform="translate(396.12,-414.375)">
+ <title>Square.259</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape260-433" v:mID="260" v:groupContext="shape" transform="translate(342.12,-396.375)">
+ <title>Square.260</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape261-436" v:mID="261" v:groupContext="shape" transform="translate(360.12,-396.375)">
+ <title>Square.261</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape262-439" v:mID="262" v:groupContext="shape" transform="translate(378.12,-396.375)">
+ <title>Square.262</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape263-442" v:mID="263" v:groupContext="shape" transform="translate(396.12,-396.375)">
+ <title>Square.263</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape264-445" v:mID="264" v:groupContext="shape" transform="translate(342.12,-378.375)">
+ <title>Square.264</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape265-448" v:mID="265" v:groupContext="shape" transform="translate(360.12,-378.375)">
+ <title>Square.265</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape266-451" v:mID="266" v:groupContext="shape" transform="translate(378.12,-378.375)">
+ <title>Square.266</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape267-454" v:mID="267" v:groupContext="shape" transform="translate(396.12,-378.375)">
+ <title>Square.267</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape268-457" v:mID="268" v:groupContext="shape" transform="translate(342.12,-360.375)">
+ <title>Square.268</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape269-460" v:mID="269" v:groupContext="shape" transform="translate(360.12,-360.375)">
+ <title>Square.269</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape270-463" v:mID="270" v:groupContext="shape" transform="translate(378.12,-360.375)">
+ <title>Square.270</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape271-466" v:mID="271" v:groupContext="shape" transform="translate(396.12,-360.375)">
+ <title>Square.271</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape272-469" v:mID="272" v:groupContext="shape" transform="translate(342.12,-342.375)">
+ <title>Square.272</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape273-472" v:mID="273" v:groupContext="shape" transform="translate(360.12,-342.375)">
+ <title>Square.273</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape274-475" v:mID="274" v:groupContext="shape" transform="translate(378.12,-342.375)">
+ <title>Square.274</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape275-478" v:mID="275" v:groupContext="shape" transform="translate(396.12,-342.375)">
+ <title>Square.275</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape276-481" v:mID="276" v:groupContext="shape" transform="translate(414.12,-468.375)">
+ <title>Square.276</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape277-484" v:mID="277" v:groupContext="shape" transform="translate(432.12,-468.375)">
+ <title>Square.277</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape278-487" v:mID="278" v:groupContext="shape" transform="translate(450.12,-468.375)">
+ <title>Square.278</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape279-490" v:mID="279" v:groupContext="shape" transform="translate(468.12,-468.375)">
+ <title>Square.279</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape280-493" v:mID="280" v:groupContext="shape" transform="translate(414.12,-450.375)">
+ <title>Square.280</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape281-496" v:mID="281" v:groupContext="shape" transform="translate(432.12,-450.375)">
+ <title>Square.281</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape282-499" v:mID="282" v:groupContext="shape" transform="translate(450.12,-450.375)">
+ <title>Square.282</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape283-502" v:mID="283" v:groupContext="shape" transform="translate(468.12,-450.375)">
+ <title>Square.283</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape284-505" v:mID="284" v:groupContext="shape" transform="translate(414.12,-432.375)">
+ <title>Square.284</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape285-508" v:mID="285" v:groupContext="shape" transform="translate(432.12,-432.375)">
+ <title>Square.285</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape286-511" v:mID="286" v:groupContext="shape" transform="translate(450.12,-432.375)">
+ <title>Square.286</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape287-514" v:mID="287" v:groupContext="shape" transform="translate(468.12,-432.375)">
+ <title>Square.287</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape288-517" v:mID="288" v:groupContext="shape" transform="translate(414.12,-414.375)">
+ <title>Square.288</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape289-520" v:mID="289" v:groupContext="shape" transform="translate(432.12,-414.375)">
+ <title>Square.289</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape290-523" v:mID="290" v:groupContext="shape" transform="translate(450.12,-414.375)">
+ <title>Square.290</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape291-526" v:mID="291" v:groupContext="shape" transform="translate(468.12,-414.375)">
+ <title>Square.291</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape292-529" v:mID="292" v:groupContext="shape" transform="translate(414.12,-396.375)">
+ <title>Square.292</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape293-532" v:mID="293" v:groupContext="shape" transform="translate(432.12,-396.375)">
+ <title>Square.293</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape294-535" v:mID="294" v:groupContext="shape" transform="translate(450.12,-396.375)">
+ <title>Square.294</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape295-538" v:mID="295" v:groupContext="shape" transform="translate(468.12,-396.375)">
+ <title>Square.295</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape296-541" v:mID="296" v:groupContext="shape" transform="translate(414.12,-378.375)">
+ <title>Square.296</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape297-544" v:mID="297" v:groupContext="shape" transform="translate(432.12,-378.375)">
+ <title>Square.297</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape298-547" v:mID="298" v:groupContext="shape" transform="translate(450.12,-378.375)">
+ <title>Square.298</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape299-550" v:mID="299" v:groupContext="shape" transform="translate(468.12,-378.375)">
+ <title>Square.299</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape300-553" v:mID="300" v:groupContext="shape" transform="translate(414.12,-360.375)">
+ <title>Square.300</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape301-556" v:mID="301" v:groupContext="shape" transform="translate(432.12,-360.375)">
+ <title>Square.301</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape302-559" v:mID="302" v:groupContext="shape" transform="translate(450.12,-360.375)">
+ <title>Square.302</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape303-562" v:mID="303" v:groupContext="shape" transform="translate(468.12,-360.375)">
+ <title>Square.303</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape304-565" v:mID="304" v:groupContext="shape" transform="translate(414.12,-342.375)">
+ <title>Square.304</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape305-568" v:mID="305" v:groupContext="shape" transform="translate(432.12,-342.375)">
+ <title>Square.305</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape306-571" v:mID="306" v:groupContext="shape" transform="translate(450.12,-342.375)">
+ <title>Square.306</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape307-574" v:mID="307" v:groupContext="shape" transform="translate(468.12,-342.375)">
+ <title>Square.307</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape308-577" v:mID="308" v:groupContext="shape" transform="translate(504.12,-468.375)">
+ <title>Square.308</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape309-580" v:mID="309" v:groupContext="shape" transform="translate(522.12,-468.375)">
+ <title>Square.309</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape310-583" v:mID="310" v:groupContext="shape" transform="translate(540.12,-468.375)">
+ <title>Square.310</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape311-586" v:mID="311" v:groupContext="shape" transform="translate(558.12,-468.375)">
+ <title>Square.311</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape312-589" v:mID="312" v:groupContext="shape" transform="translate(504.12,-450.375)">
+ <title>Square.312</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape313-592" v:mID="313" v:groupContext="shape" transform="translate(522.12,-450.375)">
+ <title>Square.313</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape314-595" v:mID="314" v:groupContext="shape" transform="translate(540.12,-450.375)">
+ <title>Square.314</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape315-598" v:mID="315" v:groupContext="shape" transform="translate(558.12,-450.375)">
+ <title>Square.315</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape316-601" v:mID="316" v:groupContext="shape" transform="translate(504.12,-432.375)">
+ <title>Square.316</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape317-604" v:mID="317" v:groupContext="shape" transform="translate(522.12,-432.375)">
+ <title>Square.317</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape318-607" v:mID="318" v:groupContext="shape" transform="translate(540.12,-432.375)">
+ <title>Square.318</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape319-610" v:mID="319" v:groupContext="shape" transform="translate(558.12,-432.375)">
+ <title>Square.319</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape320-613" v:mID="320" v:groupContext="shape" transform="translate(504.12,-414.375)">
+ <title>Square.320</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape321-616" v:mID="321" v:groupContext="shape" transform="translate(522.12,-414.375)">
+ <title>Square.321</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape322-619" v:mID="322" v:groupContext="shape" transform="translate(540.12,-414.375)">
+ <title>Square.322</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape323-622" v:mID="323" v:groupContext="shape" transform="translate(558.12,-414.375)">
+ <title>Square.323</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape324-625" v:mID="324" v:groupContext="shape" transform="translate(504.12,-396.375)">
+ <title>Square.324</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape325-628" v:mID="325" v:groupContext="shape" transform="translate(522.12,-396.375)">
+ <title>Square.325</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape326-631" v:mID="326" v:groupContext="shape" transform="translate(540.12,-396.375)">
+ <title>Square.326</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape327-634" v:mID="327" v:groupContext="shape" transform="translate(558.12,-396.375)">
+ <title>Square.327</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape328-637" v:mID="328" v:groupContext="shape" transform="translate(504.12,-378.375)">
+ <title>Square.328</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape329-640" v:mID="329" v:groupContext="shape" transform="translate(522.12,-378.375)">
+ <title>Square.329</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape330-643" v:mID="330" v:groupContext="shape" transform="translate(540.12,-378.375)">
+ <title>Square.330</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape331-646" v:mID="331" v:groupContext="shape" transform="translate(558.12,-378.375)">
+ <title>Square.331</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape332-649" v:mID="332" v:groupContext="shape" transform="translate(504.12,-360.375)">
+ <title>Square.332</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape333-652" v:mID="333" v:groupContext="shape" transform="translate(522.12,-360.375)">
+ <title>Square.333</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape334-655" v:mID="334" v:groupContext="shape" transform="translate(540.12,-360.375)">
+ <title>Square.334</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape335-658" v:mID="335" v:groupContext="shape" transform="translate(558.12,-360.375)">
+ <title>Square.335</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape336-661" v:mID="336" v:groupContext="shape" transform="translate(504.12,-342.375)">
+ <title>Square.336</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape337-664" v:mID="337" v:groupContext="shape" transform="translate(522.12,-342.375)">
+ <title>Square.337</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape338-667" v:mID="338" v:groupContext="shape" transform="translate(540.12,-342.375)">
+ <title>Square.338</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape339-670" v:mID="339" v:groupContext="shape" transform="translate(558.12,-342.375)">
+ <title>Square.339</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape340-673" v:mID="340" v:groupContext="shape" transform="translate(576.12,-468.375)">
+ <title>Square.340</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape341-676" v:mID="341" v:groupContext="shape" transform="translate(594.12,-468.375)">
+ <title>Square.341</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape342-679" v:mID="342" v:groupContext="shape" transform="translate(612.12,-468.375)">
+ <title>Square.342</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape343-682" v:mID="343" v:groupContext="shape" transform="translate(630.12,-468.375)">
+ <title>Square.343</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape344-685" v:mID="344" v:groupContext="shape" transform="translate(576.12,-450.375)">
+ <title>Square.344</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape345-688" v:mID="345" v:groupContext="shape" transform="translate(594.12,-450.375)">
+ <title>Square.345</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape346-691" v:mID="346" v:groupContext="shape" transform="translate(612.12,-450.375)">
+ <title>Square.346</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape347-694" v:mID="347" v:groupContext="shape" transform="translate(630.12,-450.375)">
+ <title>Square.347</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape348-697" v:mID="348" v:groupContext="shape" transform="translate(576.12,-432.375)">
+ <title>Square.348</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape349-700" v:mID="349" v:groupContext="shape" transform="translate(594.12,-432.375)">
+ <title>Square.349</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape350-703" v:mID="350" v:groupContext="shape" transform="translate(612.12,-432.375)">
+ <title>Square.350</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape351-706" v:mID="351" v:groupContext="shape" transform="translate(630.12,-432.375)">
+ <title>Square.351</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape352-709" v:mID="352" v:groupContext="shape" transform="translate(576.12,-414.375)">
+ <title>Square.352</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape353-712" v:mID="353" v:groupContext="shape" transform="translate(594.12,-414.375)">
+ <title>Square.353</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape354-715" v:mID="354" v:groupContext="shape" transform="translate(612.12,-414.375)">
+ <title>Square.354</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape355-718" v:mID="355" v:groupContext="shape" transform="translate(630.12,-414.375)">
+ <title>Square.355</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape356-721" v:mID="356" v:groupContext="shape" transform="translate(576.12,-396.375)">
+ <title>Square.356</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape357-724" v:mID="357" v:groupContext="shape" transform="translate(594.12,-396.375)">
+ <title>Square.357</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape358-727" v:mID="358" v:groupContext="shape" transform="translate(612.12,-396.375)">
+ <title>Square.358</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape359-730" v:mID="359" v:groupContext="shape" transform="translate(630.12,-396.375)">
+ <title>Square.359</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape360-733" v:mID="360" v:groupContext="shape" transform="translate(576.12,-378.375)">
+ <title>Square.360</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape361-736" v:mID="361" v:groupContext="shape" transform="translate(594.12,-378.375)">
+ <title>Square.361</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape362-739" v:mID="362" v:groupContext="shape" transform="translate(612.12,-378.375)">
+ <title>Square.362</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape363-742" v:mID="363" v:groupContext="shape" transform="translate(630.12,-378.375)">
+ <title>Square.363</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape364-745" v:mID="364" v:groupContext="shape" transform="translate(576.12,-360.375)">
+ <title>Square.364</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape365-748" v:mID="365" v:groupContext="shape" transform="translate(594.12,-360.375)">
+ <title>Square.365</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape366-751" v:mID="366" v:groupContext="shape" transform="translate(612.12,-360.375)">
+ <title>Square.366</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape367-754" v:mID="367" v:groupContext="shape" transform="translate(630.12,-360.375)">
+ <title>Square.367</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape368-757" v:mID="368" v:groupContext="shape" transform="translate(576.12,-342.375)">
+ <title>Square.368</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape369-760" v:mID="369" v:groupContext="shape" transform="translate(594.12,-342.375)">
+ <title>Square.369</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape370-763" v:mID="370" v:groupContext="shape" transform="translate(612.12,-342.375)">
+ <title>Square.370</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape371-766" v:mID="371" v:groupContext="shape" transform="translate(630.12,-342.375)">
+ <title>Square.371</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape372-769" v:mID="372" v:groupContext="shape" transform="translate(18.12,-180.375)">
+ <title>Square.372</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape373-772" v:mID="373" v:groupContext="shape" transform="translate(36.12,-180.375)">
+ <title>Square.373</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape374-775" v:mID="374" v:groupContext="shape" transform="translate(54.12,-180.375)">
+ <title>Square.374</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape375-778" v:mID="375" v:groupContext="shape" transform="translate(72.12,-180.375)">
+ <title>Square.375</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape376-781" v:mID="376" v:groupContext="shape" transform="translate(18.12,-162.375)">
+ <title>Square.376</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape377-784" v:mID="377" v:groupContext="shape" transform="translate(36.12,-162.375)">
+ <title>Square.377</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape378-787" v:mID="378" v:groupContext="shape" transform="translate(54.12,-162.375)">
+ <title>Square.378</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape379-790" v:mID="379" v:groupContext="shape" transform="translate(72.12,-162.375)">
+ <title>Square.379</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape380-793" v:mID="380" v:groupContext="shape" transform="translate(18.12,-144.375)">
+ <title>Square.380</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape381-796" v:mID="381" v:groupContext="shape" transform="translate(36.12,-144.375)">
+ <title>Square.381</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape382-799" v:mID="382" v:groupContext="shape" transform="translate(54.12,-144.375)">
+ <title>Square.382</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape383-802" v:mID="383" v:groupContext="shape" transform="translate(72.12,-144.375)">
+ <title>Square.383</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape384-805" v:mID="384" v:groupContext="shape" transform="translate(18.12,-126.375)">
+ <title>Square.384</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape385-808" v:mID="385" v:groupContext="shape" transform="translate(36.12,-126.375)">
+ <title>Square.385</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape386-811" v:mID="386" v:groupContext="shape" transform="translate(54.12,-126.375)">
+ <title>Square.386</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape387-814" v:mID="387" v:groupContext="shape" transform="translate(72.12,-126.375)">
+ <title>Square.387</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape388-817" v:mID="388" v:groupContext="shape" transform="translate(18.12,-108.375)">
+ <title>Square.388</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape389-820" v:mID="389" v:groupContext="shape" transform="translate(36.12,-108.375)">
+ <title>Square.389</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape390-823" v:mID="390" v:groupContext="shape" transform="translate(54.12,-108.375)">
+ <title>Square.390</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape391-826" v:mID="391" v:groupContext="shape" transform="translate(72.12,-108.375)">
+ <title>Square.391</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape392-829" v:mID="392" v:groupContext="shape" transform="translate(18.12,-90.375)">
+ <title>Square.392</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape393-832" v:mID="393" v:groupContext="shape" transform="translate(36.12,-90.375)">
+ <title>Square.393</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape394-835" v:mID="394" v:groupContext="shape" transform="translate(54.12,-90.375)">
+ <title>Square.394</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape395-838" v:mID="395" v:groupContext="shape" transform="translate(72.12,-90.375)">
+ <title>Square.395</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape396-841" v:mID="396" v:groupContext="shape" transform="translate(18.12,-72.375)">
+ <title>Square.396</title>
+ <desc>13</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text> </g>
+ <g id="shape397-844" v:mID="397" v:groupContext="shape" transform="translate(36.12,-72.375)">
+ <title>Square.397</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape398-847" v:mID="398" v:groupContext="shape" transform="translate(54.12,-72.375)">
+ <title>Square.398</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape399-850" v:mID="399" v:groupContext="shape" transform="translate(72.12,-72.375)">
+ <title>Square.399</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape400-853" v:mID="400" v:groupContext="shape" transform="translate(18.12,-54.375)">
+ <title>Square.400</title>
+ <desc>14</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>14</text> </g>
+ <g id="shape401-856" v:mID="401" v:groupContext="shape" transform="translate(36.12,-54.375)">
+ <title>Square.401</title>
+ <desc>13</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>13</text> </g>
+ <g id="shape402-859" v:mID="402" v:groupContext="shape" transform="translate(54.12,-54.375)">
+ <title>Square.402</title>
+ <desc>12</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>12</text> </g>
+ <g id="shape403-862" v:mID="403" v:groupContext="shape" transform="translate(72.12,-54.375)">
+ <title>Square.403</title>
+ <desc>11</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>11</text> </g>
+ <g id="shape404-865" v:mID="404" v:groupContext="shape" transform="translate(90.12,-180.375)">
+ <title>Square.404</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape405-868" v:mID="405" v:groupContext="shape" transform="translate(108.12,-180.375)">
+ <title>Square.405</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape406-871" v:mID="406" v:groupContext="shape" transform="translate(126.12,-180.375)">
+ <title>Square.406</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape407-874" v:mID="407" v:groupContext="shape" transform="translate(144.12,-180.375)">
+ <title>Square.407</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape408-877" v:mID="408" v:groupContext="shape" transform="translate(90.12,-162.375)">
+ <title>Square.408</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape409-880" v:mID="409" v:groupContext="shape" transform="translate(108.12,-162.375)">
+ <title>Square.409</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape410-883" v:mID="410" v:groupContext="shape" transform="translate(126.12,-162.375)">
+ <title>Square.410</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape411-886" v:mID="411" v:groupContext="shape" transform="translate(144.12,-162.375)">
+ <title>Square.411</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape412-889" v:mID="412" v:groupContext="shape" transform="translate(90.12,-144.375)">
+ <title>Square.412</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape413-892" v:mID="413" v:groupContext="shape" transform="translate(108.12,-144.375)">
+ <title>Square.413</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape414-895" v:mID="414" v:groupContext="shape" transform="translate(126.12,-144.375)">
+ <title>Square.414</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape415-898" v:mID="415" v:groupContext="shape" transform="translate(144.12,-144.375)">
+ <title>Square.415</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape416-901" v:mID="416" v:groupContext="shape" transform="translate(90.12,-126.375)">
+ <title>Square.416</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape417-904" v:mID="417" v:groupContext="shape" transform="translate(108.12,-126.375)">
+ <title>Square.417</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape418-907" v:mID="418" v:groupContext="shape" transform="translate(126.12,-126.375)">
+ <title>Square.418</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape419-910" v:mID="419" v:groupContext="shape" transform="translate(144.12,-126.375)">
+ <title>Square.419</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape420-913" v:mID="420" v:groupContext="shape" transform="translate(90.12,-108.375)">
+ <title>Square.420</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape421-916" v:mID="421" v:groupContext="shape" transform="translate(108.12,-108.375)">
+ <title>Square.421</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape422-919" v:mID="422" v:groupContext="shape" transform="translate(126.12,-108.375)">
+ <title>Square.422</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape423-922" v:mID="423" v:groupContext="shape" transform="translate(144.12,-108.375)">
+ <title>Square.423</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape424-925" v:mID="424" v:groupContext="shape" transform="translate(90.12,-90.375)">
+ <title>Square.424</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape425-928" v:mID="425" v:groupContext="shape" transform="translate(108.12,-90.375)">
+ <title>Square.425</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape426-931" v:mID="426" v:groupContext="shape" transform="translate(126.12,-90.375)">
+ <title>Square.426</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape427-934" v:mID="427" v:groupContext="shape" transform="translate(144.12,-90.375)">
+ <title>Square.427</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape428-937" v:mID="428" v:groupContext="shape" transform="translate(90.12,-72.375)">
+ <title>Square.428</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape429-940" v:mID="429" v:groupContext="shape" transform="translate(108.12,-72.375)">
+ <title>Square.429</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape430-943" v:mID="430" v:groupContext="shape" transform="translate(126.12,-72.375)">
+ <title>Square.430</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape431-946" v:mID="431" v:groupContext="shape" transform="translate(144.12,-72.375)">
+ <title>Square.431</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape432-949" v:mID="432" v:groupContext="shape" transform="translate(90.12,-54.375)">
+ <title>Square.432</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape433-952" v:mID="433" v:groupContext="shape" transform="translate(108.12,-54.375)">
+ <title>Square.433</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape434-955" v:mID="434" v:groupContext="shape" transform="translate(126.12,-54.375)">
+ <title>Square.434</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape435-958" v:mID="435" v:groupContext="shape" transform="translate(144.12,-54.375)">
+ <title>Square.435</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape436-961" v:mID="436" v:groupContext="shape" transform="translate(180.12,-180.375)">
+ <title>Square.436</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape437-964" v:mID="437" v:groupContext="shape" transform="translate(198.12,-180.375)">
+ <title>Square.437</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape438-967" v:mID="438" v:groupContext="shape" transform="translate(216.12,-180.375)">
+ <title>Square.438</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape439-970" v:mID="439" v:groupContext="shape" transform="translate(234.12,-180.375)">
+ <title>Square.439</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape440-973" v:mID="440" v:groupContext="shape" transform="translate(180.12,-162.375)">
+ <title>Square.440</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape441-976" v:mID="441" v:groupContext="shape" transform="translate(198.12,-162.375)">
+ <title>Square.441</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape442-979" v:mID="442" v:groupContext="shape" transform="translate(216.12,-162.375)">
+ <title>Square.442</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape443-982" v:mID="443" v:groupContext="shape" transform="translate(234.12,-162.375)">
+ <title>Square.443</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape444-985" v:mID="444" v:groupContext="shape" transform="translate(180.12,-144.375)">
+ <title>Square.444</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape445-988" v:mID="445" v:groupContext="shape" transform="translate(198.12,-144.375)">
+ <title>Square.445</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape446-991" v:mID="446" v:groupContext="shape" transform="translate(216.12,-144.375)">
+ <title>Square.446</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape447-994" v:mID="447" v:groupContext="shape" transform="translate(234.12,-144.375)">
+ <title>Square.447</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape448-997" v:mID="448" v:groupContext="shape" transform="translate(180.12,-126.375)">
+ <title>Square.448</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape449-1000" v:mID="449" v:groupContext="shape" transform="translate(198.12,-126.375)">
+ <title>Square.449</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape450-1003" v:mID="450" v:groupContext="shape" transform="translate(216.12,-126.375)">
+ <title>Square.450</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape451-1006" v:mID="451" v:groupContext="shape" transform="translate(234.12,-126.375)">
+ <title>Square.451</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape452-1009" v:mID="452" v:groupContext="shape" transform="translate(180.12,-108.375)">
+ <title>Square.452</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape453-1012" v:mID="453" v:groupContext="shape" transform="translate(198.12,-108.375)">
+ <title>Square.453</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape454-1015" v:mID="454" v:groupContext="shape" transform="translate(216.12,-108.375)">
+ <title>Square.454</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape455-1018" v:mID="455" v:groupContext="shape" transform="translate(234.12,-108.375)">
+ <title>Square.455</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape456-1021" v:mID="456" v:groupContext="shape" transform="translate(180.12,-90.375)">
+ <title>Square.456</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape457-1024" v:mID="457" v:groupContext="shape" transform="translate(198.12,-90.375)">
+ <title>Square.457</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape458-1027" v:mID="458" v:groupContext="shape" transform="translate(216.12,-90.375)">
+ <title>Square.458</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape459-1030" v:mID="459" v:groupContext="shape" transform="translate(234.12,-90.375)">
+ <title>Square.459</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape460-1033" v:mID="460" v:groupContext="shape" transform="translate(180.12,-72.375)">
+ <title>Square.460</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape461-1036" v:mID="461" v:groupContext="shape" transform="translate(198.12,-72.375)">
+ <title>Square.461</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape462-1039" v:mID="462" v:groupContext="shape" transform="translate(216.12,-72.375)">
+ <title>Square.462</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape463-1042" v:mID="463" v:groupContext="shape" transform="translate(234.12,-72.375)">
+ <title>Square.463</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape464-1045" v:mID="464" v:groupContext="shape" transform="translate(180.12,-54.375)">
+ <title>Square.464</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape465-1048" v:mID="465" v:groupContext="shape" transform="translate(198.12,-54.375)">
+ <title>Square.465</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape466-1051" v:mID="466" v:groupContext="shape" transform="translate(216.12,-54.375)">
+ <title>Square.466</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape467-1054" v:mID="467" v:groupContext="shape" transform="translate(234.12,-54.375)">
+ <title>Square.467</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape468-1057" v:mID="468" v:groupContext="shape" transform="translate(252.12,-180.375)">
+ <title>Square.468</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape469-1060" v:mID="469" v:groupContext="shape" transform="translate(270.12,-180.375)">
+ <title>Square.469</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape470-1063" v:mID="470" v:groupContext="shape" transform="translate(288.12,-180.375)">
+ <title>Square.470</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape471-1066" v:mID="471" v:groupContext="shape" transform="translate(306.12,-180.375)">
+ <title>Square.471</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape472-1069" v:mID="472" v:groupContext="shape" transform="translate(252.12,-162.375)">
+ <title>Square.472</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape473-1072" v:mID="473" v:groupContext="shape" transform="translate(270.12,-162.375)">
+ <title>Square.473</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape474-1075" v:mID="474" v:groupContext="shape" transform="translate(288.12,-162.375)">
+ <title>Square.474</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape475-1078" v:mID="475" v:groupContext="shape" transform="translate(306.12,-162.375)">
+ <title>Square.475</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape476-1081" v:mID="476" v:groupContext="shape" transform="translate(252.12,-144.375)">
+ <title>Square.476</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape477-1084" v:mID="477" v:groupContext="shape" transform="translate(270.12,-144.375)">
+ <title>Square.477</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape478-1087" v:mID="478" v:groupContext="shape" transform="translate(288.12,-144.375)">
+ <title>Square.478</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape479-1090" v:mID="479" v:groupContext="shape" transform="translate(306.12,-144.375)">
+ <title>Square.479</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape480-1093" v:mID="480" v:groupContext="shape" transform="translate(252.12,-126.375)">
+ <title>Square.480</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape481-1096" v:mID="481" v:groupContext="shape" transform="translate(270.12,-126.375)">
+ <title>Square.481</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape482-1099" v:mID="482" v:groupContext="shape" transform="translate(288.12,-126.375)">
+ <title>Square.482</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape483-1102" v:mID="483" v:groupContext="shape" transform="translate(306.12,-126.375)">
+ <title>Square.483</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape484-1105" v:mID="484" v:groupContext="shape" transform="translate(252.12,-108.375)">
+ <title>Square.484</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape485-1108" v:mID="485" v:groupContext="shape" transform="translate(270.12,-108.375)">
+ <title>Square.485</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape486-1111" v:mID="486" v:groupContext="shape" transform="translate(288.12,-108.375)">
+ <title>Square.486</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape487-1114" v:mID="487" v:groupContext="shape" transform="translate(306.12,-108.375)">
+ <title>Square.487</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape488-1117" v:mID="488" v:groupContext="shape" transform="translate(252.12,-90.375)">
+ <title>Square.488</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape489-1120" v:mID="489" v:groupContext="shape" transform="translate(270.12,-90.375)">
+ <title>Square.489</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape490-1123" v:mID="490" v:groupContext="shape" transform="translate(288.12,-90.375)">
+ <title>Square.490</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape491-1126" v:mID="491" v:groupContext="shape" transform="translate(306.12,-90.375)">
+ <title>Square.491</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape492-1129" v:mID="492" v:groupContext="shape" transform="translate(252.12,-72.375)">
+ <title>Square.492</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape493-1132" v:mID="493" v:groupContext="shape" transform="translate(270.12,-72.375)">
+ <title>Square.493</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape494-1135" v:mID="494" v:groupContext="shape" transform="translate(288.12,-72.375)">
+ <title>Square.494</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape495-1138" v:mID="495" v:groupContext="shape" transform="translate(306.12,-72.375)">
+ <title>Square.495</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape496-1141" v:mID="496" v:groupContext="shape" transform="translate(252.12,-54.375)">
+ <title>Square.496</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape497-1144" v:mID="497" v:groupContext="shape" transform="translate(270.12,-54.375)">
+ <title>Square.497</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape498-1147" v:mID="498" v:groupContext="shape" transform="translate(288.12,-54.375)">
+ <title>Square.498</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape499-1150" v:mID="499" v:groupContext="shape" transform="translate(306.12,-54.375)">
+ <title>Square.499</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape500-1153" v:mID="500" v:groupContext="shape" transform="translate(342.12,-180.375)">
+ <title>Square.500</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape501-1156" v:mID="501" v:groupContext="shape" transform="translate(360.12,-180.375)">
+ <title>Square.501</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape502-1159" v:mID="502" v:groupContext="shape" transform="translate(378.12,-180.375)">
+ <title>Square.502</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape503-1162" v:mID="503" v:groupContext="shape" transform="translate(396.12,-180.375)">
+ <title>Square.503</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape504-1165" v:mID="504" v:groupContext="shape" transform="translate(342.12,-162.375)">
+ <title>Square.504</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape505-1168" v:mID="505" v:groupContext="shape" transform="translate(360.12,-162.375)">
+ <title>Square.505</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape506-1171" v:mID="506" v:groupContext="shape" transform="translate(378.12,-162.375)">
+ <title>Square.506</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape507-1174" v:mID="507" v:groupContext="shape" transform="translate(396.12,-162.375)">
+ <title>Square.507</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape508-1177" v:mID="508" v:groupContext="shape" transform="translate(342.12,-144.375)">
+ <title>Square.508</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape509-1180" v:mID="509" v:groupContext="shape" transform="translate(360.12,-144.375)">
+ <title>Square.509</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape510-1183" v:mID="510" v:groupContext="shape" transform="translate(378.12,-144.375)">
+ <title>Square.510</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape511-1186" v:mID="511" v:groupContext="shape" transform="translate(396.12,-144.375)">
+ <title>Square.511</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape512-1189" v:mID="512" v:groupContext="shape" transform="translate(342.12,-126.375)">
+ <title>Square.512</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape513-1192" v:mID="513" v:groupContext="shape" transform="translate(360.12,-126.375)">
+ <title>Square.513</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape514-1195" v:mID="514" v:groupContext="shape" transform="translate(378.12,-126.375)">
+ <title>Square.514</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape515-1198" v:mID="515" v:groupContext="shape" transform="translate(396.12,-126.375)">
+ <title>Square.515</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape516-1201" v:mID="516" v:groupContext="shape" transform="translate(342.12,-108.375)">
+ <title>Square.516</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape517-1204" v:mID="517" v:groupContext="shape" transform="translate(360.12,-108.375)">
+ <title>Square.517</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape518-1207" v:mID="518" v:groupContext="shape" transform="translate(378.12,-108.375)">
+ <title>Square.518</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape519-1210" v:mID="519" v:groupContext="shape" transform="translate(396.12,-108.375)">
+ <title>Square.519</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape520-1213" v:mID="520" v:groupContext="shape" transform="translate(342.12,-90.375)">
+ <title>Square.520</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape521-1216" v:mID="521" v:groupContext="shape" transform="translate(360.12,-90.375)">
+ <title>Square.521</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape522-1219" v:mID="522" v:groupContext="shape" transform="translate(378.12,-90.375)">
+ <title>Square.522</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape523-1222" v:mID="523" v:groupContext="shape" transform="translate(396.12,-90.375)">
+ <title>Square.523</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape524-1225" v:mID="524" v:groupContext="shape" transform="translate(342.12,-72.375)">
+ <title>Square.524</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape525-1228" v:mID="525" v:groupContext="shape" transform="translate(360.12,-72.375)">
+ <title>Square.525</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape526-1231" v:mID="526" v:groupContext="shape" transform="translate(378.12,-72.375)">
+ <title>Square.526</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape527-1234" v:mID="527" v:groupContext="shape" transform="translate(396.12,-72.375)">
+ <title>Square.527</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape528-1237" v:mID="528" v:groupContext="shape" transform="translate(342.12,-54.375)">
+ <title>Square.528</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape529-1240" v:mID="529" v:groupContext="shape" transform="translate(360.12,-54.375)">
+ <title>Square.529</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape530-1243" v:mID="530" v:groupContext="shape" transform="translate(378.12,-54.375)">
+ <title>Square.530</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape531-1246" v:mID="531" v:groupContext="shape" transform="translate(396.12,-54.375)">
+ <title>Square.531</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape532-1249" v:mID="532" v:groupContext="shape" transform="translate(414.12,-180.375)">
+ <title>Square.532</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape533-1252" v:mID="533" v:groupContext="shape" transform="translate(432.12,-180.375)">
+ <title>Square.533</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape534-1255" v:mID="534" v:groupContext="shape" transform="translate(450.12,-180.375)">
+ <title>Square.534</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape535-1258" v:mID="535" v:groupContext="shape" transform="translate(468.12,-180.375)">
+ <title>Square.535</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape536-1261" v:mID="536" v:groupContext="shape" transform="translate(414.12,-162.375)">
+ <title>Square.536</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape537-1264" v:mID="537" v:groupContext="shape" transform="translate(432.12,-162.375)">
+ <title>Square.537</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape538-1267" v:mID="538" v:groupContext="shape" transform="translate(450.12,-162.375)">
+ <title>Square.538</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape539-1270" v:mID="539" v:groupContext="shape" transform="translate(468.12,-162.375)">
+ <title>Square.539</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape540-1273" v:mID="540" v:groupContext="shape" transform="translate(414.12,-144.375)">
+ <title>Square.540</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape541-1276" v:mID="541" v:groupContext="shape" transform="translate(432.12,-144.375)">
+ <title>Square.541</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape542-1279" v:mID="542" v:groupContext="shape" transform="translate(450.12,-144.375)">
+ <title>Square.542</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape543-1282" v:mID="543" v:groupContext="shape" transform="translate(468.12,-144.375)">
+ <title>Square.543</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape544-1285" v:mID="544" v:groupContext="shape" transform="translate(414.12,-126.375)">
+ <title>Square.544</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape545-1288" v:mID="545" v:groupContext="shape" transform="translate(432.12,-126.375)">
+ <title>Square.545</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape546-1291" v:mID="546" v:groupContext="shape" transform="translate(450.12,-126.375)">
+ <title>Square.546</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape547-1294" v:mID="547" v:groupContext="shape" transform="translate(468.12,-126.375)">
+ <title>Square.547</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape548-1297" v:mID="548" v:groupContext="shape" transform="translate(414.12,-108.375)">
+ <title>Square.548</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape549-1300" v:mID="549" v:groupContext="shape" transform="translate(432.12,-108.375)">
+ <title>Square.549</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape550-1303" v:mID="550" v:groupContext="shape" transform="translate(450.12,-108.375)">
+ <title>Square.550</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape551-1306" v:mID="551" v:groupContext="shape" transform="translate(468.12,-108.375)">
+ <title>Square.551</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape552-1309" v:mID="552" v:groupContext="shape" transform="translate(414.12,-90.375)">
+ <title>Square.552</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape553-1312" v:mID="553" v:groupContext="shape" transform="translate(432.12,-90.375)">
+ <title>Square.553</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape554-1315" v:mID="554" v:groupContext="shape" transform="translate(450.12,-90.375)">
+ <title>Square.554</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape555-1318" v:mID="555" v:groupContext="shape" transform="translate(468.12,-90.375)">
+ <title>Square.555</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape556-1321" v:mID="556" v:groupContext="shape" transform="translate(414.12,-72.375)">
+ <title>Square.556</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape557-1324" v:mID="557" v:groupContext="shape" transform="translate(432.12,-72.375)">
+ <title>Square.557</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape558-1327" v:mID="558" v:groupContext="shape" transform="translate(450.12,-72.375)">
+ <title>Square.558</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape559-1330" v:mID="559" v:groupContext="shape" transform="translate(468.12,-72.375)">
+ <title>Square.559</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape560-1333" v:mID="560" v:groupContext="shape" transform="translate(414.12,-54.375)">
+ <title>Square.560</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape561-1336" v:mID="561" v:groupContext="shape" transform="translate(432.12,-54.375)">
+ <title>Square.561</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape562-1339" v:mID="562" v:groupContext="shape" transform="translate(450.12,-54.375)">
+ <title>Square.562</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape563-1342" v:mID="563" v:groupContext="shape" transform="translate(468.12,-54.375)">
+ <title>Square.563</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape564-1345" v:mID="564" v:groupContext="shape" transform="translate(504.12,-180.375)">
+ <title>Square.564</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape565-1348" v:mID="565" v:groupContext="shape" transform="translate(522.12,-180.375)">
+ <title>Square.565</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape566-1351" v:mID="566" v:groupContext="shape" transform="translate(540.12,-180.375)">
+ <title>Square.566</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape567-1354" v:mID="567" v:groupContext="shape" transform="translate(558.12,-180.375)">
+ <title>Square.567</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape568-1357" v:mID="568" v:groupContext="shape" transform="translate(504.12,-162.375)">
+ <title>Square.568</title>
+ <desc>0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape569-1360" v:mID="569" v:groupContext="shape" transform="translate(522.12,-162.375)">
+ <title>Square.569</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape570-1363" v:mID="570" v:groupContext="shape" transform="translate(540.12,-162.375)">
+ <title>Square.570</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape571-1366" v:mID="571" v:groupContext="shape" transform="translate(558.12,-162.375)">
+ <title>Square.571</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape572-1369" v:mID="572" v:groupContext="shape" transform="translate(504.12,-144.375)">
+ <title>Square.572</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape573-1372" v:mID="573" v:groupContext="shape" transform="translate(522.12,-144.375)">
+ <title>Square.573</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape574-1375" v:mID="574" v:groupContext="shape" transform="translate(540.12,-144.375)">
+ <title>Square.574</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape575-1378" v:mID="575" v:groupContext="shape" transform="translate(558.12,-144.375)">
+ <title>Square.575</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape576-1381" v:mID="576" v:groupContext="shape" transform="translate(504.12,-126.375)">
+ <title>Square.576</title>
+ <desc>1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape577-1384" v:mID="577" v:groupContext="shape" transform="translate(522.12,-126.375)">
+ <title>Square.577</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape578-1387" v:mID="578" v:groupContext="shape" transform="translate(540.12,-126.375)">
+ <title>Square.578</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape579-1390" v:mID="579" v:groupContext="shape" transform="translate(558.12,-126.375)">
+ <title>Square.579</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape580-1393" v:mID="580" v:groupContext="shape" transform="translate(504.12,-108.375)">
+ <title>Square.580</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape581-1396" v:mID="581" v:groupContext="shape" transform="translate(522.12,-108.375)">
+ <title>Square.581</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape582-1399" v:mID="582" v:groupContext="shape" transform="translate(540.12,-108.375)">
+ <title>Square.582</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape583-1402" v:mID="583" v:groupContext="shape" transform="translate(558.12,-108.375)">
+ <title>Square.583</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape584-1405" v:mID="584" v:groupContext="shape" transform="translate(504.12,-90.375)">
+ <title>Square.584</title>
+ <desc>2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape585-1408" v:mID="585" v:groupContext="shape" transform="translate(522.12,-90.375)">
+ <title>Square.585</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape586-1411" v:mID="586" v:groupContext="shape" transform="translate(540.12,-90.375)">
+ <title>Square.586</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape587-1414" v:mID="587" v:groupContext="shape" transform="translate(558.12,-90.375)">
+ <title>Square.587</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape588-1417" v:mID="588" v:groupContext="shape" transform="translate(504.12,-72.375)">
+ <title>Square.588</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape589-1420" v:mID="589" v:groupContext="shape" transform="translate(522.12,-72.375)">
+ <title>Square.589</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape590-1423" v:mID="590" v:groupContext="shape" transform="translate(540.12,-72.375)">
+ <title>Square.590</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape591-1426" v:mID="591" v:groupContext="shape" transform="translate(558.12,-72.375)">
+ <title>Square.591</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape592-1429" v:mID="592" v:groupContext="shape" transform="translate(504.12,-54.375)">
+ <title>Square.592</title>
+ <desc>3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape593-1432" v:mID="593" v:groupContext="shape" transform="translate(522.12,-54.375)">
+ <title>Square.593</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape594-1435" v:mID="594" v:groupContext="shape" transform="translate(540.12,-54.375)">
+ <title>Square.594</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape595-1438" v:mID="595" v:groupContext="shape" transform="translate(558.12,-54.375)">
+ <title>Square.595</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape596-1441" v:mID="596" v:groupContext="shape" transform="translate(576.12,-180.375)">
+ <title>Square.596</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape597-1444" v:mID="597" v:groupContext="shape" transform="translate(594.12,-180.375)">
+ <title>Square.597</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape598-1447" v:mID="598" v:groupContext="shape" transform="translate(612.12,-180.375)">
+ <title>Square.598</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape599-1450" v:mID="599" v:groupContext="shape" transform="translate(630.12,-180.375)">
+ <title>Square.599</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape600-1453" v:mID="600" v:groupContext="shape" transform="translate(576.12,-162.375)">
+ <title>Square.600</title>
+ <desc>4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape601-1456" v:mID="601" v:groupContext="shape" transform="translate(594.12,-162.375)">
+ <title>Square.601</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape602-1459" v:mID="602" v:groupContext="shape" transform="translate(612.12,-162.375)">
+ <title>Square.602</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape603-1462" v:mID="603" v:groupContext="shape" transform="translate(630.12,-162.375)">
+ <title>Square.603</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape604-1465" v:mID="604" v:groupContext="shape" transform="translate(576.12,-144.375)">
+ <title>Square.604</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape605-1468" v:mID="605" v:groupContext="shape" transform="translate(594.12,-144.375)">
+ <title>Square.605</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape606-1471" v:mID="606" v:groupContext="shape" transform="translate(612.12,-144.375)">
+ <title>Square.606</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape607-1474" v:mID="607" v:groupContext="shape" transform="translate(630.12,-144.375)">
+ <title>Square.607</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape608-1477" v:mID="608" v:groupContext="shape" transform="translate(576.12,-126.375)">
+ <title>Square.608</title>
+ <desc>5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>5</text> </g>
+ <g id="shape609-1480" v:mID="609" v:groupContext="shape" transform="translate(594.12,-126.375)">
+ <title>Square.609</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape610-1483" v:mID="610" v:groupContext="shape" transform="translate(612.12,-126.375)">
+ <title>Square.610</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape611-1486" v:mID="611" v:groupContext="shape" transform="translate(630.12,-126.375)">
+ <title>Square.611</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape612-1489" v:mID="612" v:groupContext="shape" transform="translate(576.12,-108.375)">
+ <title>Square.612</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape613-1492" v:mID="613" v:groupContext="shape" transform="translate(594.12,-108.375)">
+ <title>Square.613</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape614-1495" v:mID="614" v:groupContext="shape" transform="translate(612.12,-108.375)">
+ <title>Square.614</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape615-1498" v:mID="615" v:groupContext="shape" transform="translate(630.12,-108.375)">
+ <title>Square.615</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape616-1501" v:mID="616" v:groupContext="shape" transform="translate(576.12,-90.375)">
+ <title>Square.616</title>
+ <desc>6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>6</text> </g>
+ <g id="shape617-1504" v:mID="617" v:groupContext="shape" transform="translate(594.12,-90.375)">
+ <title>Square.617</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape618-1507" v:mID="618" v:groupContext="shape" transform="translate(612.12,-90.375)">
+ <title>Square.618</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape619-1510" v:mID="619" v:groupContext="shape" transform="translate(630.12,-90.375)">
+ <title>Square.619</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.47" y="573.27" class="st4" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape620-1513" v:mID="620" v:groupContext="shape" transform="translate(576.12,-72.375)">
+ <title>Square.620</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape621-1516" v:mID="621" v:groupContext="shape" transform="translate(594.12,-72.375)">
+ <title>Square.621</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape622-1519" v:mID="622" v:groupContext="shape" transform="translate(612.12,-72.375)">
+ <title>Square.622</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape623-1522" v:mID="623" v:groupContext="shape" transform="translate(630.12,-72.375)">
+ <title>Square.623</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape624-1525" v:mID="624" v:groupContext="shape" transform="translate(576.12,-54.375)">
+ <title>Square.624</title>
+ <desc>7</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>7</text> </g>
+ <g id="shape625-1528" v:mID="625" v:groupContext="shape" transform="translate(594.12,-54.375)">
+ <title>Square.625</title>
+ <desc>8</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>8</text> </g>
+ <g id="shape626-1531" v:mID="626" v:groupContext="shape" transform="translate(612.12,-54.375)">
+ <title>Square.626</title>
+ <desc>9</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st3"/>
+ <text x="6.72" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>9</text> </g>
+ <g id="shape627-1534" v:mID="627" v:groupContext="shape" transform="translate(630.12,-54.375)">
+ <title>Square.627</title>
+ <desc>10</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="570.267" width="18.01" height="18"/>
+ <rect x="0" y="561.267" width="18" height="18" class="st1"/>
+ <text x="4.44" y="572.97" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>10</text> </g>
+ <g id="shape630-1537" v:mID="630" v:groupContext="shape" transform="translate(472.189,-335.711) rotate(45)">
+ <title>Sheet.630</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid1">
+ <rect width="89.024" height="228.01" id="mfid2"/>
+ </clipPath>
+ <g clip-path="url(#mfid1)">
+ <mask id="mfid3">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid4" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid3)">
+ <use xlink:href="#mfid2"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid5" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid4)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid6">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid5" clip-path="url(#mfid6)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape631-1540" v:mID="631" v:groupContext="shape" transform="translate(773.187,-98.8741) rotate(75)">
+ <title>Sheet.631</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid7">
+ <rect width="89.024" height="228.01" id="mfid8"/>
+ </clipPath>
+ <g clip-path="url(#mfid7)">
+ <mask id="mfid9">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid10" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid9)">
+ <use xlink:href="#mfid8"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid11" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid10)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid12">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid11" clip-path="url(#mfid12)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape632-1543" v:mID="632" v:groupContext="shape" transform="translate(950.873,41.6775) rotate(90)">
+ <title>Sheet.632</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid13">
+ <rect width="89.024" height="228.01" id="mfid14"/>
+ </clipPath>
+ <g clip-path="url(#mfid13)">
+ <mask id="mfid15">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid16" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid15)">
+ <use xlink:href="#mfid14"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid17" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid16)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid18">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid17" clip-path="url(#mfid18)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape633-1546" v:mID="633" v:groupContext="shape" transform="translate(1104.93,181.961) rotate(105)">
+ <title>Sheet.633</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid19">
+ <rect width="89.024" height="228.01" id="mfid20"/>
+ </clipPath>
+ <g clip-path="url(#mfid19)">
+ <mask id="mfid21">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid22" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid21)">
+ <use xlink:href="#mfid20"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid23" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid22)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid24">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid23" clip-path="url(#mfid24)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape634-1549" v:mID="634" v:groupContext="shape" transform="translate(570.995,596.312) rotate(120)">
+ <title>Sheet.634</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid25">
+ <rect width="89.024" height="228.01" id="mfid26"/>
+ </clipPath>
+ <g clip-path="url(#mfid25)">
+ <mask id="mfid27">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid28" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid27)">
+ <use xlink:href="#mfid26"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid29" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid28)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid30">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid29" clip-path="url(#mfid30)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape635-1552" v:mID="635" v:groupContext="shape" transform="translate(538.497,799.539) rotate(150)">
+ <title>Sheet.635</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid31">
+ <rect width="89.024" height="228.01" id="mfid32"/>
+ </clipPath>
+ <g clip-path="url(#mfid31)">
+ <mask id="mfid33">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid34" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid33)">
+ <use xlink:href="#mfid32"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid35" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid34)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid36">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid35" clip-path="url(#mfid36)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape636-1555" v:mID="636" v:groupContext="shape" transform="translate(398.905,-202.875)">
+ <title>Sheet.636</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid37">
+ <rect width="89.024" height="228.01" id="mfid38"/>
+ </clipPath>
+ <g clip-path="url(#mfid37)">
+ <mask id="mfid39">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid40" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid39)">
+ <use xlink:href="#mfid38"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid41" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid40)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid42">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid41" clip-path="url(#mfid42)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape637-1558" v:mID="637" v:groupContext="shape" transform="translate(838.754,-138.135) rotate(30)">
+ <title>Sheet.637</title>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ <switch>
+ <foreignObject x="0" y="500.753" width="30.4299" height="78.514"
+ requiredExtensions="http://schemas.microsoft.com/visio/2003/SVGExtensions/">
+ <v:foreignData v:orgSize="29760" v:data="data:metafile;base64,eNrtnXlcE8f7xzeESznUqoilCHi2VbwwUPAgQL1qRaoocq
+ ktNlCIFyCIyiEoalEBbT0iINRWRTkbxSoK+rVKrCbSVpEIaK0IqIgX1gM1v2dysUCOYbS/v5qX4z57vefZz3x2dja8dsOgKI
+ pLtX68odTS5pfoUZSeKUVZT5w+iaIYFDuconpSKGr36UZRSw0paq4ORXm2W3lqSBfKeqQeBQBqGBRrKIAbymAzqA8g7g5Fp/
+ upKrTbAXlB21bDivLusm1HsnWk2+lKiXHOH7MpaiBE5vL51tpa49FsiqHgt34Ou1LU1244+w9iU7Q6KcqKzaSM5XF/tq4y7s
+ fuoowH0pbbsA2UMb2+a/vYjoo66LEhm9JbCzqu6NaqR/sPWrbRw32ySdf3u0JoMnXKpzNh6gPltqE+/P/nmL9+QNuFzZzsSu
+ WJPriDDiDQZboLRRUmG736EtqT6rJ0incYRfWtQIVh/vtv9ynKtGnqpy6eK+bdr57K95/d99yyRNPX+i+7nNoo4nIimAdH6+
+ 5e4+RZGhd3couhv2ugYUrKlsHNNro6qWxrE4ZleYLJSR8zvqfjJyUPTt475t0y/tK1Vbk1acUPip8dXGnw+ZS7XpKVojVjdn
+ CPPFtnNe/JsQvDb7x58TD5icXfuTZLqmJq/1h0NWNPocFK9nbfBmEl84T5i6FeTX6rn04u5jSW7Jn45tbGGX2qZsTaBzx7Yf
+ 54z3WfYRHn/R0Cnj25m/K0aWdhsXPSq/Vx3KzIC/75lc4ZhUlWbiX+bNvRmZGlJ+dWjwzYMT7pVUvgiP6zglxK7Ko320Vcn2
+ bjG+Rrl/lga/E35/JijV8UOVxN3c1hClZGXqq8E5gaOznFdsyhvuWraiMM/7nKaUwasS4tPm+3yUT95nnHLAM5jUfKUpaYCO
+ I4LCbvygnGa9HL+2lORRPfLGLrh+8QU83jby7O+Dhi06uv47lZ3sJGt5EZJuwHRpEXLhf0fWzLELsOCXSzDguZU/v65uLxY3
+ s0DqoNrEhlCIP1UZ3LhFMbIvskMUOPRpVxBA6ly3pECcQeQRMrxn4YWezV6LTQCDYMv7M+M9c79alp1K9FgtIuArPjyeIpVy
+ 5mPoie8eZ1/QTf1TVMe7frU2VHkVNnEuSyiAfZHzbiJr26ZxjFD2SWP1tXdWR24SyBGe9Kmp3FYc5JQ8f9HPaNEZFlF3yNd9
+ s8MBSkc9ysd3MCsjnHPIVxcxaeNHhTtK7A9nH+lhEDdlzUB/Co0PPcAh4jeu7iZ5VA+ihojl3mjuCKnhezOG6pjNzI+rmFfR
+ 9Pz88fUC/d3C3G8ByrrLzbbzXLYxZWmr3YwYDcOGw+b1rg3ubisBou5f0gqzTZcLfNDXOb+svGQXPCKriGwXv4jIUn199uus
+ 6VHXy/W9xfWUyfJckLYgPsCmYJlrMi0018+PPWXgyGJoIU1rya8sv4/Bzz4vjVZ50em/QIzefbpOTUpYkXoOa6yBmQGNNs9T
+ ggt2ux6bF8WAi7zwlrQrUXzl7gWxui8/M6acMaN/zEe6rLvwDHWHVxknFu5CWPSuMXL69WpFaJ9A85Btp/FtLkZGH24tHSln
+ n3+1b1nz9doXeaeEPIlqxbvJbnTcYNnsKQconfsC2Cp4npru4u9RXVUfa6Rx5Axb5BcJAnX/uDSssVWoRx9l7q5gIB+GExz8
+ TauUdT3FeSH/WKJpZ8bM1cOZvvElKRlOItXM4/HbzYzmJU8+/PfBfY1d7vgZKW6ZxTlwcrRbs27+NBI4zsU9Vf0t81IzDBzc
+ qkfEbi2MtrLQd0W8a1nyv0D0xAbd9YX102u2D9pIoLPJM1w/K8h88Og3bdEzbfq5kh/uemwTShvtSXg4AKFnVMNHW+n/tsZn
+ kAz6TmaweUdvJNt3JWeBM3/HVT9RiLpkxvIZwaS9hGwbuR5k6TMnSbzTlnBo05FGXnrPu02dmjmo1OgaSQ7dOuVHPsq13hcC
+ 2v+o22kErA2CzJv9l1ZQ7/5vsrh1nFf5sjeSKwBHrO38V2wRF6Qk4CnHO2BxbBFDzCD2koFCaODbKXefgoR6bdP0a2KWdZF3
+ 4KhiZ6OfTDxSinLSHDdyGDQFjEKl7f7wL3l4R+t9Zbxzy6Mj78G6HbxcRpQXOe6B/OLFJspv/zG3sdh/KADJOj5WrwYfZ+bm
+ Us2QkB6V+9klO67HfL5p0cJqs4fnjGZkOxq3QP/TG53u6xf/4wreEP3voMk8jycHCBEJ0/0hNfzIJdzvSGjUa4inLlZpjFv2
+ l2vAidf27CxDeiaK/WVuhbfoxVnNjvT+4va4crNmdMT7391UKv8p9Y+aH2r1zl8iMjewoXZ5WBEIF6NS7J992WmryMnlx+BO
+ XfL9g+bpxtkMsROcLfj5/h0FAoSnRuiKv2tJM1JZPXHzaN0Bt/7oHo0Y3P/zLw8ahSHPD+4Dz2eh5gl9k/fAaCDs+WZ8PnuC
+ 31dx1x5QNguAn1U/br2R6AzsYY9RJJyMTJUQn97kH+7rJlUNM+QWg2r8a1jJW8CEzhIJRXUcULC7xVaSDKLBttwY4wAac4WC
+ iq8BBzx93usjK3jhIGN62yHxb6nk/KaxejYHeUd6BwEjvvck5g6knXMpDteo74oZ4ldP1zwkRcj4Uga7Z4abeaCPtvQyr4gl
+ B2zLWe0H2DZNcq/B7XiG3D76BTX6SfMsQg3TesXtz98j7x0i7If2BaqMnJunjt4oPHnnB1az8TrjSL6lou62WeOVqU7af8nS
+ x6eYv6D8lihuq9dFtP03fidr6gKXManH3crC/Cqh6IQgQr52VJld4hmtQUGXi50VRqf2nXnM0bWh7vM9ri0oq1855yM+uOi5
+ dKzwFF/yiC7rB4HRx7XNpaU8G2NRxkkPBlkJC3qaB30Ol03fAqMVsYLUocqpCZdYEFp5Blyr3gmIqxzR8HMgXb4jgKp/QOcv
+ GPNxV8z3vav/Y8u9soiwOOkfZ+MhOLeCYuwl2bPYV5mS0ZJtbo2CzHpNS4Kg8MjtWHPYQBmno88eAvcxVJe8TDyMLefJud2T
+ x/V/Bm3LiP4HyVm8coeI5wZXyXWnOf1BpXkK6ozp6/7O8BgeWoi4KdDjjy1/D6jgmOMKnhyI7R+Hgexwvp2ecqtO3tOL81Zh
+ cHNzivWg/9/+VxtZGBR9HKP3m3Z4IDEtl1F88NlKZb7zr052lIg1ksr0w91iHO+NO6ocaLAs/0Dt/D/zJk8Vhwl5HsIL8uz4
+ Y6D4ktw3eLKdTSdg3HeQEN9ePcaHVuBTf4O1gciJp9I93flS/ORCfUPNDuJ1aVeKlRzTMu5djnWkWjKU846WBd18OZnCs5Q3
+ ayt2SJuZSYBe6ijpn5WFddXj1TZAL9aoi9LivQ/trqyAzj8FkwCoi7mHkPfJw4tuFv3tNhP3+QAj4QR9hzglxsd5lYl5U78m
+ ++Jz0xIeE1jaMthjhH6MrqPJ5XV3/StY24TkCHo5HXWSw6NyaTGRoghPOtWJQ4rWEfb1s5Y5cL0zFxeLa4d3iE0j2Ne+u+gZ
+ 7kPHdgCFzYP2joc8LFK0UurP2VtLEWJVnM0QmLDzr2cToKjSI1K282P2OBjmOiiLcZWhosdaRcR1zPfb7OVLBZ3me6RyW4X5
+ G25fuCUeFVvGhXA+k5pKy0a/OQWWvMLm8VT6kAzRyh48vLnMUv3HUj/Qu+M5Jmd7Bo3fDsKJ+21flle89Imd+sX+MZZnuYbc
+ qyKFEcZkuaCZxW19D5ODpBP9S45jjnZFzj+05PbSoj7O+ia+0N6Wl2Hl3zTY9ncyLLi+pyxN0F+3hJgW4LAhp2nHBzuJJm5D
+ /zyeFW86SzLBjHZvIzXCL00g8iZb1GWUzI0oN2nPOoJor58s9stsFCfZNa8+PL0dE1NR/MHFG8uqXPiJqI+6uen078xB7swy
+ 8Zu7fOfOWGnlWSbx/tmBMPPt/KuteFYwk2OBSfZ33rbOWNV3euny25Otzzjeu258kF37+Jqf9BkZvHLH5W5ODjfIlPadjNeR
+ Utz75aNqF66M15ry+9fD2oev5tUfTj0rPd8iakTTrHr2zR801d/2bAOTZSz7GLZUpAAx8SE/W3a+AuHJcUbb2gimc7Jvj65f
+ l5XVEfMCy0q48HPxi660vb+9/ybkzInx803iIALvRI4b90jrNG9zuTEbi3a67gxoP4TX0CYqcVdOsSYQxq6NgemFYuVQe1Do
+ efOlvRQ3hM4+x1QLXlLziabnL5RPaN9NuhIe7Cc7tKUh8N+yd6SU6Al6zd/EM45dNhKGFQe453e64wwebW7ILYoshlpQuuSu
+ 2zHLrcEui/4Gz5wG3PixULFo5LTM/xvtT/AgPq3+kPY/rW/gydQ4zolbMFK8xaGGeR01wsL0CPWsQvDEXZPBcaHZHW9YdU1j
+ DOgPriJ/tsbo1q7BV+sM6vYSs6SY6ZXqs4cUew4iOuk4Xqg9uRWZaIhPAtsOrFnfNkzsumHMENi56jr7/3j2+f7ZnFFci81+
+ OkJ8Nms76XvnLuVRAGF4NqdO7qvf6jEmlwJh4uc9DHDOczaqHnGO4iOz9e7LWN31RWsDt4S1YLz+QyXB5CBF6LbCD714Kqe5
+ WDWH0uJWbWXdQP1m9O8u2N2B9XDiofxJK6PmINSmvIguTbxXwLtEvl2HjbnYtCmgLsq9FRpvH6wnGC52tnzUM53IqXXlPjcu
+ r7AclqY8E0aY/iLtw/Ic0Qmh2GVkUco9f/u5wCG/f8FkZE0FtP54/yXnhiUGPkiF7SFlr57erb3yHa73HpMo0eP7ELfQqWNq
+ ksqvtDvDO1J2TZt2AW/4tnNVGTFq8wLg6359ReDCxM7mKPurzgrBkiwdewTe+Ce8ETXpzeVH0XPFCyJf1g1KSm58P++ZMxNj
+ Fm8aSmeu/GW/HjB3Rr3l5wY1t+DAz+N1VL+JV2j+NGnMtINYg5lZHq9bDXqns280PPSxhMJ4fUXez+q9BN9dSJ7p/muS6Il9
+ 6DMwwV9+mKe3DddvfgFnAvD/+oMfL7cR3Z4lJZkX2/wJQtG0dRxeOtRz6YgqZofsXgnVNtUD1QvoJiLGcolvVTfich+0hv7K
+ H0lm+r2y5G35P0ku/r6e7hoQ+wL1BFRuq/02nP9oAyEn3zAeVEV9m6RKpJuf5D2rYMStW3Eq0fiUQi/zKDtpAu4ARabEXfMa
+ 41jj2lDJmSv1q/+5A8bP26SfJcGRtIXiljZ8mbVrwiF1leygpiJZJTSrxEWQED4odKvERZgQHEr5R4ibICwCsrQKGiglgUn1
+ LiFRUwpPFDJV5RgYE0fqXEKyqQ4uUVyEJZBbGy+JQSL6uAIY8fKvGyCgzk8SslXlaBHC+tQBGiCmIV8SklHlXAUMYPlXhUgY
+ EyfqXEowqUeKigNUQVMDpo8l/8X/xf/F/8X/xf/P8Rx9KuR/TrlBXt+kW/rhnQrnf06yCDdn2kXzdb47i211kr2vWXfl02oF
+ 2v6ddxBu36Tr/uK+K49uMEK9r4gT6uMKCNN+jjELo+9HGLLI7rOM6xoo1/6OMiA9p4iT6OYtDGV/RxF4rjVI3TrGjjN/q4zo
+ A23qOPAxm08SF93EjFxqkeZ/ZTMy6Vf9DYuXJ258bO42CwvwnK5zNl685RG9SMnQ07cM62GztvmTmcSp45gtrua0+lB4yn7p
+ 7eQD36LZmSVKVBSYc4BaYZsHw9TPfAfCqVvnA8xLspSTWa30rdPZUIcSbE22CaBdPvKMk1HsQ/wLp1MN0Ly76H/SZA/CP16M
+ L3sH4XJalB8XbYZi3EP0G8A/JgQbwP4p2wDZSa/dSji7uou6WwzfX9UA7APLDFOyDOhhjyuH4Q1idQkhuHqEfCNKjHGdYDtz
+ QeluXAsnSY5lKPRBmwHOr+K4+6W7IGpvmwDI5BlElJKiHnvwqoR5eyYF0cJblZCMvgeG7+DMt+gLIXuGyY51OPyvdS230gz8
+ qtlOTvwzD/I0yPwH6xEP9ESa6mwnwR9ej3fbAshpLcKoJyFOYh/6ug561fID5ApX/NhvXRlKT2GCxPhmXZML8a1sN87XGYPw
+ jTYkpyG5UTUE5CKaEkdaiUQjkF5TQlqUflf1DOQPmVkjSgchbKOShllOQOKgIo5ynJXVR+o5JnjKB+XDSZen51H8xfoCT3Lk
+ IRQhFRkkZULkEppyT3f4fyB5Q/qZY7Qio/Zh7sA8dx9yJMD8A2v8NyiCuzYZ0/MKdAfBDKIVh+gXouzqF+XAzLxLlUS8MFKL
+ 9BnEc9v5YPy6fCtAD286OeVxVAKYT152H6M9VSf57Kj4bl1XyIBTA9DNt/Rj2vOQLzZTAtgnKUaqkrg3IO4l+o59d/gX18YX
+ oMlp2F7adBfBxKMcz/CtMTVMvtX6kfl6DlJyE+A9MS6vmNEtjPh2J04hPXmY3hFrZTm+v8W+w4HSZT59/Km1n6a2fg+GzIuv
+ TM6c7AO5E3s/R/q1afLsKH47JR1qeiVnUKjp03s/RkFGKvLsGG47Eh62MnV0RFrQR2NDYcM2/mseIVK1bIEo8+gQnHYaOsj0
+ dErlAkHv0LHhwrb0Avj4iQsRE8Bg+unY2yProc2JFKUaJjseAYeTNzj4YvBzhNlFgsuDY2ZJ17JDw8fLlcFFniMbGxfO1wrX
+ kzcw+HhYW3Ji4XJTaBn60NrpmNsi4MDQ2jJS4TJQYHriVvQC9bRmOvaBUlIeFnLXBNbJR1wTLEpouiaM2EhIR8zXCNeTNz85
+ YuBXY7UZSJr9UMV89GWectQexWUdq2ZkLK2kOa4BryZubmLAa2IvEOoqDENcLVsSHrrJxFi5a0S7yNxVHi6zTA1ebNzDrIpb
+ FVWhzYmuCq2Sjr/Vxg00VRYXEQZd26A+rgavIGdDCXy1Ulyop2oqxLPLBXNVwVG2X9U1BwiCJxTRZHiSfuUw1XmTcz68egoO
+ BgjaKsbhUlMTFDJbwjG2X9QwBih6gSpUNrIvZ6lXAVeTN3Z84PQPCQ9omrtjjA12/I+L4jvD0bsgb0/MAAhSgYFofEN6iCd8
+ gb0H4ydlA7UdRZfJ00cRXwtmyU9R5fP4AHBGhtzWh6a27YsLEDvF3ezN1pc6VshSiYFgd2RzidjbJO85w7F7HbtaZWi4MoGz
+ emt4W3yZu5nTfT03Muriir24myceuuNvBWNmS9fedMxJ7bKkonLI7g37aB0/Jmbt/h7i5n+83vvMVR4t9uo8EVbMh6zXefAV
+ spSuctvhESp8OVeQN6ymefubcVpZMWR+ykbVsUcBkbZZ06EbERfK621oxUZXG5KEmtcHnezDXJEydOmfIuRElK2iyHIzbKOt
+ 7FBbHpotBaU4sotMQRe5McLs2bqfKDa3HVe+sgNrrbVc3Gsrg6NsDVX+eZqiyu6rxnMjs9hmC2dYqK1pSP3sjYeBYnYas979
+ tZnJCNZXFiNobFidiYvTgpG8fipGwci5OwO3SGaixOwsbtxYnYvn5YFidna7c4EVsuijaLk7BxBypvwdY6eiNjz8WyOBFbhS
+ iqLE7IxrI4OVu7xUnYuAMVUjaOxYnYmGPxt2NrtjgJ213V6E2FxcnYM7EsTs7WbnEittYblPC3YWNZnITd9gZFvcXJ2O5YFn
+ 9rtgaLE7E/c1drcbooxGz1FleKQsLWfrv5dmyce3AiNubtJiEby+IkbNx7cDL2FCyLk7JxLE7EVoqi2eKE7I4WV3GDQsJ2aS
+ uKWouTsfG+USFna7c4EdtlIpbFidkYFidhO7u4YFmclI1jcSK2swuWxQnZWBYnYY9zdsGyOBnbGcvipGwcixOxNYrSmjghG8
+ viJGyndqKoG72RscdhWZycrd3iRGyncVgWJ2arEYVucRI2y8kJy+KkbByLE7FZTlgWJ2RjWZyEbcdywrI4GZuFZXFSNo7Fid
+ iaRVFanJCtbM0pqiwuF4WEPQpTFDJ2+9ZU/fcwcrZ2ixOxlaK0bU3fdhYnYQ/vKIrK856MPaqNKFPUiELKxrE4EXv4KBUW9+
+ xgcRK27fBRWBYnZitFcVE7eiNhf9xOFHUWJ2KrEEWVxYnYH8tE0WZxEvYQJMoo7aKQsW2xLE7EHqJszXGaLE7IxrI4CXsghi
+ gITsYegmVxIvZAPFFI2TgWJ2HbYIpCxh6IZXEidpvE1Y/eSNj9FGwtrUnCtsATJYCI3Q+vNYnYtMQ1ifIWbK0WJ2H3xRSFjI
+ 3XmkTsvhZYFidkY1mchG2GKQoZG681Sdk4FidiK0XRbHFCNpYoJOxeZnitScrGsTgRu5cZlsUJ2ViikLB79MJoTRi9kbF7YV
+ mclI1jcRK2CZYonqRsrNYkYRuZ9MKyOBm7B5bFydnaRSFiG+G1JjEbw+Ik7C5GRlgWJ2RjiULE7oLXmoRsLIuTsA27GGFZnI
+ yNJwopG6c1idh0UTRYnJCNZXESdndMUcjYeK1JztZucSJ2d0MsixOzMUQhYZt3x2tNUjaOxTWx1TwPwjQ3745lcc3Pk6hlY4
+ mi4TkYKduDs3DePC+vL6ZO/fTTCRM+GTnyo4+src1xW7P9NyrffNn6/A7QPb5aiNhfyNifyNjmuBZv+zXT11+2Pncky/wrSP
+ wLWeKfyBI3p7WmmSqLq/5jwYIv2z4vhTL3eTeiKNE0NsC9ZInTRWnbmjaqWrPt38P8p3d8Pk2auQZRFBbX8k2q9/SOz9XJMp
+ /dvjW7q7K4rapeXCYKHd3uOUaP2a2ijFQlipax+Jzpqp9jlGU+WVVrGqlqTRV/D5s1Xf3zlyjzyaot3qOtxVWO3tqhO7CZgy
+ e3ioJvccSeMV3z865ABziRxT+frvk5XUXmqs57LRaf5qD9+WJp5hos3k+1xSc5aH8uWpY5u6PFNYniMlEVWs1z6IPZnbO4mw
+ Pec+jyzBWiWGNc2lwd8J+fR5k7qrd4+0vbeAf85/5lmTvSRTHUIIo6tIbrPMCxLD7WoXPvWZBlPgbn0mbv0Pn3Q6DMx2i3uA
+ a0RjZz8DBtooy2JHsfB8p8mGaLa0RrYQNczaVNmvgIS/L3n6DMB3S0uGL0NtSS/L0tsswHqBPlQ8u3e98M0HUHqG7NQZZv95
+ 4caea676uyeH/Lt3+/D8r8/Y4Wt7J8+/cSyTNvL8oHlu/mfUoo855tW7NPt3fzHihZ5j3pouChsd+7BXClxXt3e3fv3ZJlbq
+ wQ5b1u7/Z9YShzY1lrmnZ7t+85k2Wuj9hd9d79+9lQ5gDvBLpTbIAb6P0778NDmev9O+/xk2b+771/8F98b2JnPx1/J8EAlv
+ WnZL+jgN5vi9bvhLKJkv3e5EA2pfK3G9X9ViTONoPZbX9P0pCto2suzwH91oT0FwNgkzFsHWXdn0NZSKl/H63i3bIfUQcpVX
+ W2jVV/2uug+M1MpIMOTQdTuYaUfD2K/w8OpCyT"/>
+ </foreignObject>
+ <svg viewBox="0 0 89.024 228.01" height="78.514" preserveAspectRatio="none" width="30.4299" x="0" y="500.753">
+ <clipPath id="mfid43">
+ <rect width="89.024" height="228.01" id="mfid44"/>
+ </clipPath>
+ <g clip-path="url(#mfid43)">
+ <mask id="mfid45">
+ <rect width="90" height="229" fill="white" stroke="none"/>
+ </mask>
+ <mask id="mfid46" fill="white" stroke="none">
+ <g>
+ <g mask="url(#mfid45)">
+ <use xlink:href="#mfid44"/>
+ </g>
+ </g>
+ </mask>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetPixelOffsetMode -->
+ <defs>
+ <image id="mfid47" width="90" height="229" xlink:href=""/>
+ </defs>
+ <!-- Unsupported Record: EmfPlusRecordTypeSetObject Obj_ImageAttributes -->
+ <g mask="url(#mfid46)">
+ <g transform="matrix(0.00018373, 0, 0, 0.00018373, 0, 0)">
+ <clipPath id="mfid48">
+ <rect x="-0.5" y="-0.5" width="90" height="229"/>
+ </clipPath>
+ <use xlink:href="#mfid47" clip-path="url(#mfid48)"
+ transform="matrix(5442.9, 0, 0, 5442.9, 2721.4, 2721.5)"/>
+ </g>
+ </g>
+ </g>
+ </svg>
+ </switch>
+ <rect v:rectContext="foreign" x="0" y="500.753" width="30.4299" height="78.514" class="st5"/>
+ </g>
+ <g id="shape638-1561" v:mID="638" v:groupContext="shape" transform="translate(36.12,-306.375)">
+ <title>Sheet.638</title>
+ <desc>d = 0</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 0</tspan></text> </g>
+ <g id="shape639-1565" v:mID="639" v:groupContext="shape" transform="translate(198.12,-306.375)">
+ <title>Sheet.639</title>
+ <desc>d = 1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 1</tspan></text> </g>
+ <g id="shape640-1569" v:mID="640" v:groupContext="shape" transform="translate(360.12,-306.375)">
+ <title>Sheet.640</title>
+ <desc>d = 2</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 2</tspan></text> </g>
+ <g id="shape641-1573" v:mID="641" v:groupContext="shape" transform="translate(522.12,-306.375)">
+ <title>Sheet.641</title>
+ <desc>d = 3</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 3</tspan></text> </g>
+ <g id="shape642-1577" v:mID="642" v:groupContext="shape" transform="translate(36.12,-18.375)">
+ <title>Sheet.642</title>
+ <desc>d = 4</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 4</tspan></text> </g>
+ <g id="shape643-1581" v:mID="643" v:groupContext="shape" transform="translate(198.12,-18.375)">
+ <title>Sheet.643</title>
+ <desc>d = 5</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 5</tspan></text> </g>
+ <g id="shape644-1585" v:mID="644" v:groupContext="shape" transform="translate(360.12,-18.375)">
+ <title>Sheet.644</title>
+ <desc>d = 6</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 6</tspan></text> </g>
+ <g id="shape645-1589" v:mID="645" v:groupContext="shape" transform="translate(522.12,-18.375)">
+ <title>Sheet.645</title>
+ <desc>d = 7</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="561.267" width="108" height="36"/>
+ <rect x="0" y="543.267" width="108" height="36" class="st5"/>
+ <text x="36.26" y="566.67" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d <tspan class="st7">= 7</tspan></text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_dir_search.svg b/third_party/aom/doc/img/equ_dir_search.svg
new file mode 100644
index 0000000000..3f14e3d95c
--- /dev/null
+++ b/third_party/aom/doc/img/equ_dir_search.svg
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dir_search.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.17726in" height="0.950904in"
+ viewBox="0 0 516.763 68.4651" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+ <image x="0" y="36.75" width="480.013" height="31.7151" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAABGAAAABKCAYAAAD0diLqAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AADUmSURBVHhe7Z3pr2VF9b/9B3zjCxJfEBITYwwxhhiiaUKAQIB8MQoEBYLMo8woyCwoo8zIPDc2NINAg0wKCkIQZLJFQFBQ
+ hm6QSQRknvYvz/au+6uurl279lD77HP78yQn3ffcc/fZVbVqrVVrrar9mUIIIYQQQgghhBBCZEUBGCGEEEIIIYQQQojMKAAjhBBCCCGE
+ EEIIkRkFYIQQQgghhBBCCCEyowCMEEIIIYQQQgghRGYUgBFCCCGEEEIIIYTIjAIwQgghhBBCCCGEEJlRAEYIIYQQQgghhBAiMwrACCFE
+ Ap9++mlx2223FbfffvvMO/UsWrSouPvuu2d+EkIIIfLw0UcfFVdeeWWxePHimXfi3HvvvcW1115b2jYhhBDDoQCMEEIk8OCDDxY/+9nP
+ Sic3FT7L3/z1r3+deUcIIYTon+uvv75YsGBBckCFz82fP7/4/e9/P/OOEEKIIVAARgghanjzzTeLfffdt/jnP/858046Tz75ZHHAAQcU
+ 77777sw7QgghRH8sXbq02HPPPYvXX3995p00Xn755WK//fYr/xVCCDEMCsAIIRrz/PPPF1tttVWx/vrrD/raaKONiocffnjmLobjt7/9
+ bfHjH/+4+Pjjj2feSeeDDz4ofvjDH5bXEEKsePz73/8uzjzzzOI///nPzDtxXnnlleLUU08t3n777Zl3RFMeeeSR0l6E7EjO1xZbbFE8
+ ++yzM3cxHBdeeGH5asMpp5zS+m+FENNN0+31sk/9oACMEKIxZNm+853vFJ/5zGdmX+uss05x4403FnfeeWejF4GJs846qzj++OPLF9Ui
+ XOtzn/vcMte31+GHH94qENIWAihkCDFQbbnhhhuKgw8+eND7FkJMnvfff7848sgjG29DJIBw+umnN9ryKP4/zzzzTLHGGmssYzu++93v
+ louMkB2KvdD9BCnMRu2+++7FmmuuWXz2s59d5vr2GjqY8cYbbxTbbLNNKTNteOihh4oddtiheOutt2beEUKsKLTZXi/71B0FYIQQrWBB
+ 8fWvf33W6cQZvfzyy3s70O+TTz4pM4lnn3126eza9/CdbbYCtYVoP1nNp556auad5mCscP6blocLIaab3/zmN+XivaleJFh74oknFnfd
+ ddfMO6IpnG2yyiqrzNoO/t/neScsPh5//PFy8fKVr3xl9ns22WSTQXX9E088UVaktv3OF154odhss8062TghxPTRdnu97FN3FIARQrSC
+ BQUBFzcLiINLNL1vcHTJQq6++url9wyZYSR4summmxavvfbazDv/g/LLgw46qJg3b16ZJb3uuuuKb3/728X//d//Fffdd9/Mp/4HDi7X
+ 4DwYIcSKATpijz32KB577LGZd5pBZcLOO++sUu+WYDeoWDH7xIuqGKpj+oYzvjgA90tf+lJpE4fccopt3HXXXSvPGcMeYZfWW2+9Yrfd
+ diuTCi5Uvmy99dbFPffcM/OOEGJFoMv2etmnbigAI4RoDeX1nG/iOrgEIV566aWZT/QLZymg8IfMMOKU7rTTTsU777wz887/IOjywAMP
+ FJdddlnpdPMzQan777+/PHvgxRdfnPlkUQZvNtxww+UCM0KIuUvXrR1dt5aI8HZZthDlWjRQtYn+33vvvUv7OATYoCOOOCJYZUWwaccd
+ dyz+9a9/lQGaH/3oR+Wjql14nwCO/74QYu7SdXu97FM3BgnA/OMf/xjFAFFqRTnwUEZxLNBeFodjiFJSHfH3v/995icxLcRkiGALQRfX
+ wT3kkEOy7Q3lIEuUPnN5CAjA+NlFtkdxbg1t52wX1/mlHHy11VZbJthCAGaDDTaY6gxjTj0+Jh3VhRXNxsi2x7n00kuLY489duan5pCV
+ RL+McWGM7mPOogvHjr9dltf555/feFtYKs8991yx8cYbDzY3CMD89Kc/nflpWTh/bMsttyxeffXVcp5QEeQfEmwBGK4zreS2IWPRdV2Z
+ pnnbFdqIz+VXfE2CMfZ71+31Y7VP9DF9TdA5xqT9huwBmHvvvbfYdtttS+GbNLZlgsXhtDv6qdBO2tvn2RxdQA6QB+RCTAcpMuTvte/7
+ PBgfHCFK+4eYx6EAjEEGgL3zOLkG2YQvfvGLyzhrYwzA0Hd33313WYLqb6/yyanHx6ajurAi2ZgV2bbzHQRYmffMIXuEL/fBvCdIbM6p
+ v6glMM0TJAgILFy4sNQJVGjwBJ1FixYtNwfOPffcysX1JKEdnH1yxhlnZAu294XJh79dts/zYHxw7NuW9jclFoBZvHjxrG1ed911g1WY
+ Yw3AYJewT8yx2LzObUPGpOu6Mk3ztgu0jTY2PVw2F0P2OwEIgqw33XRT6Y/++c9/nv1OqsgfffTR8v/YqtD2eoN5xQHy6A38V7bZT4t9
+ IujOuVixhL/ZhUn5a1kDMHQApZix0/8RBrYwHHXUUcEFTluqrosQ2kn2Y5iUObG20gdVbUUAf/3rX5eLyD7P7ohdF3mgYiImFyI/RL2/
+ 9rWvzTqkKFhfEafIEDDeZBTtWrxYYOQaY76Psv4h5jALpNAWJMCAcUAwVS8GB26y3YiMo0G/jmkLEmfRcN/cK+fYcDaAu2XKJUWPtyVV
+ vqYJa9NctjEpMpHLttOnv/jFL8ozK8hKG0P0O+1gzqA3TzvttDLLNn/+/GKttdYqnWsygTh9BGZtUesfUsh5MDfffHOpVz7/+c+X54bg
+ MHMIIjrYPy+GRTFbZt57772Zd7rx4Ycflk61G4yoevEkuu2337504NG5PjitBMJzLXz7BJnA0Xbbl3O7LGOKHAyR8UZGqrYg8R7zlEUS
+ BwUzb/3tuyarY8pkM2848+3iiy8uK16322674CLJ5n0uG5LT/k2KaZq3baBNtA2Zji2sH3744bICxE2g9QHBD/TmBRdcsEwAdoh+J+D6
+ rW99q5wzFoDZZ599irXXXrsMZGI3rb0x35b7w8Zhn/j/H//4x2LzzTdfTnf0bZ+WLl1azjdXT1e92Pp/3HHHVVY4EWDnnmM6fgi/oYps
+ ARgEjbMaGMCYoKHw6UgeO9unIbTrhhaVDDALjuuvv37mnbnJ7373uzJyGTtwjslkwt7nwabudf2sCvLAYp2snz+ZxfBce+215TixF5Q9
+ oS4pMmQw51HEXMtec2GMY1kCDBlBRhxtoK2cT+OXt4/pEF7GCWeW++ax4SzEvvzlLwcPCk3V421pIl/TxFy2MZO27W7g2K8oy9nvpt9o
+ O8ElFyvl5p7scfO2qPXv8Yorrij1AJlD1/nlGty7vwjGfnKdPoNYBvfAPeOU++fUMLbMy1122aXUEVWZW/QjwfYch6/3DXLob5clSDjt
+ WwarDuG9+uqrS7tjNpixCj2Nb2yH8JIMYC7gk1x00UXlOFXpkZw2JLf9myTTNG+bQrCMw7ZjbSNgYP7qoYce2muglG2nXDc0J3P2OwGH
+ r371q8Utt9yyjLzyf6pX0ONudTbzvcq2uNXd9A2VaARj/Iq+XPbJdBL9iJ3ywRaRAGGcaXNoVwWfIehep+Mn5a9lC8DQEBpEw2IgAESx
+ cEr6VHB33HFHsfLKK5cDFyoBJdhAlNBKh+caGFgWv3X7nJk0e+65Z/nZPp1kM1zf/OY3l8lSGmZg/eBMW2gjSoXSb9EMMxb+WKTKkAvl
+ fhgXrmevSUSW+6TqEZ3oFRZbOIboOYwUBiq0NQrZJCNhgZpJwsGgZN7JmJItYTF78sknB8coVY+3oY18TRNz1cZM2rZjp6gm42DRUFA0
+ R79z/yzCbK6HYBGMvrPsIrYVxzS0qLXDD6mmMdAROMdcxyWXg2v3wD3HzqmxwBPOe8hBRQ8efvjhvd4jOjfXWXGMx5DbZYcA2fEDK1Q6
+ ceAulSEsPmgfAZlQMIH+JoDonw0zKZhDjA1nKN16661l5c5VV1213H3ntiE57d+kyTFvxwCyzoKbhXfM76T9J5xwQrmA7zsxRlXkqquu
+ OmsLXHL1OwFI2lI1Fyyg4SYMYwEY7AP6g3mIvqRq0vdrIZd9QhfNmzev1M+xynGCbaw5qp5uR2IRH7fucdmT8NeyBGBQimSBUfw5lGIf
+ UO7LoPVZ9TEmqGogKjjWx94iFyz6MG5VWx+awOQ/4IADRtvesWJKmQU5C3OXtjJERorrmYPL/3lvWqk6Kd4yBGw9oA85wwHnNmSkMMRj
+ 0YeW9ca5jZFbj49dR3VlLtqYFdW22zWrtnkAcsxWJMsuYpNwTEOOHwteSsLd36EjQvMBB7fPEm+DLZIEstAFvm7zwVHHEXaddxee/IaT
+ 3tejl2mznxDoC8aPQLnZJ170+zRvMWFMQk8jIYj1/e9/v9x+hA2jiim0UML2k4jrexHVBktsMC51i6acNmQadF1X+p63Y8DaFAp8j4W+
+ +x35ZD1FoNLdDu+DHrAKTaCP3CpMH3xZgrYc4o3+D21RzGWfzOZgJ7GXVdB25ij6wk1oGATkeCJd1RZGYxL+WpYADB230korjXoC2KKK
+ hZNfejvtmOOXIyrZJ5aJr3P+UqCdCsA0x8r5/TNLusgQWQeqXlwHF8U2zU+/IhPmH6iIs0u7fKfXxzIJGN1JY3qvLqsAOfX4tOioLsxF
+ G7Oi2nbLyIccPAPbQ1WOBShsIRkKJGD7qKaxigNzEkNZWwKmOQ45RG9RcYP+r3sKBm1joesfLm5YMNp17ruQMwADVtXj2igW22N4Wkob
+ WIQQ/G+7eECuCWaMAZOlOrnMbUOmQdd1pe95O2lsMe77s2Oj736361ExEqtiw464VTno8tD2ettS+8tf/rL82WxZKACTyz5ZojAluIOt
+ 4LNVNp82V9kuYxL+Wu8BGJsAdVGrMUAGuG5QphFzlhDgMWN73vtQQhhgBWCaY2XzKB4UkNFVhqw02JxbU6SxCPSYoT277bZbGSU3UOqh
+ Aw196EtkcwyBBu6Ve64z1Ln1+LToqK7MJRuzItt226YZO7SVrba/+tWvyn4ykO/Q9h7e59wl7o/PUyEYOizQHF/Xae4LO6MnxeG0+crn
+ QwtS2kB1UF+ykTsAA1ayb/aJ1zRvl6U92Kg6e+RDyT32fyxbJS0pxIKShWUVOW3INOm6LvQ9byeNVfWNPaDUd7/bnCGxVrWdki2JnAPj
+ fh//Z575gU62plPN+Ze//KX8mQANgU5/i08u++Sez5Myvy0AEzrzFagKojqo7lpD+2vRAAyDQFaAQd1rr72WO3gOBx4BogTfwJBj0GNl
+ TSxEiNZTgsWTOHA+QgLTlOeff748eZr75bp1GV5+z2dzODd9gCNAVoIDxrhPSkypIuDZ5pyvwnuMj19ZYIvqWPmmjS1POSDr5o9tG7hf
+ 9vsj6FwXBRNbcNskqzO0KfQVgKEvObfG7h95Rj6RJ/p0hx12CGbJOP+DQ6CQP/6WzzJuOOTu4V7unOJkcv/MGmR44403LkuFDfoVxWHz
+ BbllvvBkCvs+fsdBYlXjyOcpscbR4rO8uCZzl3v1nd0UGarD9mZyHXsNed6Hq2eQSZ7K5X73m2++WWasGQeLsHMwGvfM3mDfEWfhQX+h
+ 14j4c74F12ZeVjmvXIO/mWR5e91TT9Al/jxN0eOQS0dx3bPPPruU7SqZ52W6iznGuV/MHb6TcaHfu2YyuDan7NtYc0101R/+8IfyvriH
+ mJ4bq41Bz5guYG5w0CXzhbHkZ95HP7hzYJK23b1u6N58+u53C1bwol3IJrYmdg9ApQs2w5VDy7RRFYdfxdZFzgQI2RVkDR3V1a75mONM
+ e1ho1o1RXQAG6KPQVtY2DBGAAQ6tRJ5sbJEZHh89FK7PyhkuflCchQALKzcTjewzR0O6E/+L7VWpc47PcV5Fzsdxp8D8QK/YOPivn/zk
+ J8stqFN9lFy6zvcVsQG80A3m97g6kPlttoTf8bdm17rQpn0ufc7bvqBPkEmz59ZX6EN0pesXuDBfWEDHtlfbnKO6iX/5uQ/Mt2dsQz6P
+ T5/9bok1vpu+4RxC9FhdMNZsEXPJBVnBV6cqE/vA9kTa55PLPhEYIrbA2KdUoNUFYKx/aKubaPYZ2l+rDMAgPHQsh0/RATQOgXHhJnnf
+ Lcu1jkMhhRQLA4sTi5FAWRFVS8ki18ECh8c/EulCYaI4684XMacidgDdpLB+oqSU8mR+ZiKQsWE/Hs9xf/rpp8u+80u0zEBX7QWkr+rG
+ til2vz//+c/L+8Uoct26klgWsikl0HVgcLoGYLgGExQlbvePLKPMWGCzwMOw+sqdRRpjgCyhyFgMIvsYXowBwRR+Tun30JziXsjMoeyQ
+ a0rXTzzxxOLoo48ugwjwpz/9qfwujJPvqDA2OFkYfh5Lx/3hYLA3nO8KRXzrZCgF2kw0HoXG9/DCGRjC2aMPTjrppKiewfBhAF2lbJlu
+ dIe/IKI9jO/tt98+8049ixYtKh/9NxbMaa3Sz0adHoecOgqZ57tdmWcOcH1f5g888MBycc542++Ye4wt+6Jj7YxBNQLz9YEHHiivQVCJ
+ tuBcccgf98bWNO6hSn+O0cagh7bddtvZw9H5mXv83ve+V/Yx8+Wcc84pD7F3HdxJ2XauxRxlLzq6C/2LTontn++730PVErxwdgk0hg7/
+ A9qOk+8+Ycz60Xd6Q6CjcjylB9kl8UEbUu7D7CGvKoe4L18ChgrAILOh7bLMidxgX9Bt6MkqecUP4J5sQWALCd4LzUPaQ/+HFkshSBqx
+ UG+rI3OAbaB9sUU0pPgouXRdyFfkszwxjMf1oqcYC4KsVLpdc8015e9I8PA7vhe9gay5VbVNads+lz7nbR/Q3/iNdugrP2NTaBdV1dh2
+ /HFss29fzL9hTEKYTY/NuTbg0zKX8RGY1/iPdWeO9Nnv9BE+ENfzX8jYwoULK21IaHt9Krnsk/nmzEHmYgzaztyjrVUBGOZrij8ytL8W
+ DMAwECyMTYhDCpHPWAbFFXZrQNVeaRwR/o4Bs3Kxrvv1EPJ99913dhFp0Wvug/upwiZKarkan2EBYILd5kXGK0VYiV4ec8wxpRI36H+u
+ waKCa2AA+NldQAJjN69ii4GNrZ1HYZHDrs4OSpFJYPdripDvikGbQgGApvQRgGHSWwDDAiEofCYsfUmf8p6rMJE9C774TpsFAlGABJhC
+ c8rt99CcYlzJ8tM/Jtf8HuPqyobNO5QMfWGYMWPB4lefIAMouZBSislQE7hH7p97tlesjL8v0DPMC/rCnClfz4T0GvfF/VUp8mknJHch
+ 6vQ45NJR/B1BlVSZ53cEflPmQypcC32GXjNsPjNnmDuuTqjKmDS1MbkhQEU2y9VVdo+2GDDnhxf/N+pkIodtN+fb1V0pMpyj31nArL76
+ 6uV3+y/6rirjiYNLn9n906cEc+qSDtw3gfaqxUQXmFvYXcaTca3D+jyWLGEhzIKYz3aFse3qk6TCnKByzx1PbHps8dQHBK6tjeYv4V+a
+ nJju84MMVrVT51tNI64PVBXoM+p8lJy6Dv1vvqKNHdf0fUDukd9RrUEywsV+V9fOKrq0z6XPedsH6CbWc+78QwfSV1RkYF8YF372q+eZ
+ T7G2MucIOAL9jl/szrk2EHTBz7AnZZm/WedD9t3v9Ndhhx22TMLTfVHVHbKF+P7+9voUctonszcp/pvrI/qJPoPxxZ+rC+gM7a8FAzCU
+ ZNEQBJtBJZJHFNfN4lgGxVeAFtWrMp5UA5izSgYLYQll7ZvAddwInjnGddEuJgeTpK2TngsEiGCC29+0DaGgv0xh8y+KyC2hox20p2ry
+ p4xtU7gOVSLuNUxB1kV3kRM+19YIGbS7awDGZJPJaoE2k1WuT4YdhWOGgc9Z1DmUabex4Pfsvazrd2QVmXXnlDteJtehR6WZQ+0rIHMk
+ UCp+NZg5Dr6yqZOhplhQg++yV+iwyT6p0zMEAwgKhIw1f8dCu6r0eFpxdUhVNsyo0+ND6agUmWfxZJUvBu3jPvzATyosMn0n0A/k4Qgu
+ WLCgfNGmEGOzMfRL6DBp+tHukW2R5513Xrl90p2jdTKRw7ajs8hYmu6yeVsnw7n6nf54/PHHi1NPPbXcdufqtKrFGt+PvmOhdMEFFxSr
+ rbZaWTmDTMfsFeNy+umnZ9GTjBP3nLL9l/unH/l8LKtrtqaPwAAyViVnOWBsWLS645lzuyz6Cp8J/cb8YJ4g025Vl+k+f4xsDriJg7mC
+ LabqqqJNJmM+ylC6znxd/7OuPfS3tWE/sXGxYEEdXdrn0ue87QMSLe48AFuQWx8zX+l3O6PEoA1VQeWUOdcGgjpupb8FvXxf3CdXv7OO
+ oNKdCncCtabP0CW0PQTyTvVqE1uTyz6ZfuOeq2yqi/Ujn3fHwScmG8bQ/lrtIbwmTP4N2URHWaI0jTrFZeC8Es3k2n0/HYR7YGL5C0uf
+ sTnHMSyjWBfBox20J2aYDKuAsKhyX9g9IB/ISQzkBHlhzOpgocNn274IAsQCcoY5AXWTlYgxTltVO93IrNu+ujlVtfgPZckM60dXYZmR
+ 4f3QQsgcB3MkjCYylArtd40B89P2tefEDbS4eqaqKgbo5zFtGekLnHic+ZQtf6l63CWHjmoq84bvsPVBql1xaWpjqKLAAW3zYj6l3peL
+ 9WNdJi5VJnLa9lQZG8q2k/W04HLsu9imeuaZZy535lcVZOII8lQFO7rAPLKSbf6tCzKYnePzlj0O0WRB4TrNbV4EsPqWLc5PYX7bd2Cv
+ YkG+vrBAC1sbXf+kSvcxxwnoDXFvQ2N9UZc8beuj9K3rYr6i6arQwjfmf3QhtX0uTeYtIHch+5PyahPssLFOWU+kLLLBdJo/5/ogdQya
+ 9nsb2O7Gusn0GnIdAv2CvkndXp/TPlkVCvebUl2DTeKzVvVVxVQGYGyR5isiKht431+opCoumwB+AKcPqu7ZZ+jO7oIt2usyuqmGiQlH
+ 1Qb9xMTrEzOiKcbFlFWVYkiFdnetgDFMMdbJps2Bqiwii1wWu77hMPn0nVnLSoYWlMBcC42Xu+BxsykYFhyZ0N+Y4xAyaqky1ATkzT0P
+ ZohtSFDl3MbmE1VNKY4CjkidQzAmrC9Sst6petwlh46ySjRfflmE+NU2RlWFWVds3voByxhNbQzjwplBd955Z+MX+sZdqKXgBihd3RFi
+ DLbdZKyuP/uy7cyZW2+9deanMKa36zKeY8ECsamyTFaRz6JDcY6r6HNBgYw10T19QCaXRSJttfHMscDwMfnxq2ir9A3jx8HxsfMNgXvn
+ 3KRpkEmjbxvikkPXxXxFS6iFdBD+Bbara4WgS5P2uTSdt/htIfuT8rJtOk1o4rekBmBs0d4kUJWCjUHIL/HpQ18iO1Q3sQ2qCtP33NM0
+ BG1NBzDmjH0M8/UYy6otVsbUBWBskebftDm/NNp3jK3z6sojzajHSobaYIo5JVpqnT0NjhOGIEVhmAIgghhzljDefCa0NaUrKB4me0qm
+ mHb1oRgY974CMMg0fV1XBcGE5nNVWURzrNyy7TZzCqr+DmzB4xsoPsfnQwfVWXAoZNRSZagpZH951Cr36u+VzoWNpZ9FtLNKfD1FX7Cv
+ OpZdp7zzG9/4RpKBGBPMsdR5marHXfrWUTGZj2UXCboQfIltl2hKE7viMnYbk1pRApO27ZAqY331O99XFRA3bLFW97mxYGcPpMiyHTyd
+ cni62Zs+qgfpd15D89xzz5U6ZagEgdl9f8Fm502FbDfzMLZwx4Zx4CtjNi0yaVhSqy8b4pJD11X5F2C6yh8Dxq2v7S8uTdrn0ue8zUGT
+ 9QR9HZozLrZo7ztBA03GoI9+5/toS8wPNd9lWvxV881DQU0fe9hCir5mvVZX/T20vxYNwCBACJK/SLPMeqgxJlSxqB5/TzbFSoaYVOxl
+ 7CM6h4AhaEwChDOGLUJTI5Dcp2Vk275SDuElE3PxxReXE4vTzc3YIGgYBgOFz5Nw/EhrSqTPIsCWdUEREcDoQ+hQglw7xYFC+dTdawp9
+ BWDoi6qsuw/3zudCRtqqUjAc7v7fqjnF/3nPlCQ/IyumVGxehZSSv+BBRihRZkz5GxQKisXFHAcLHuEIUPlhpMhQE/gOKmBw+od4CpJh
+ TpAvi7SP9/25Q1k7v6sz9PRfioEYE6nOLZi8VenGIXRUTObNQQ5lKnz9w3fzGFPKcVPBqecJFugCrm92xZ+32BiMdZWtaWpjcsNYsbDj
+ LBJk3PqRsWMMDfoefer2bZ1MQE7bbjLmL1ZD9NHv3DtOf9184fc489NSDWeJgVDw3cUqQuhvdDdzOYYFeFPsfh1co4/rNMHaO2SCwBZH
+ vi60hYBvuxkD9Fvdwt2C13U+zNgwnyrlvutsSG5dx1hU+YoxXeWvUfheniTTVOa6tM+lz3nbFfqUBNcWW2xR8MQoMHvuV4KFdAQ/89mY
+ ffCPXiABzSG0boV0W2J+iU8f/Y7N4RHdsWCPrTlyn7vYBzZvGEPGPQZrIwIvKfqadS1+Wmgt5DK0vxYNwJiiYJK7Z1JY1CnkGJvh8JWA
+ i68oeIQjhyPZBEBIKCfiMzg3KJdUmHgIdUq0FOFlQdgkyzsEdl+0g0lqFQ6+sUHoQoqDCU3f0c8hTMjtMyg9niLC0xoMosSUufLoYl/x
+ xTCHIiW7Zg5uH5UWfG8fARgLhISCiz5WRuorCvrzlltuKX+HQ+cqvao5ZUbB5hTjwgLQ5pA5zX603Prbovn0qe21tbb4Sofr2yOouS5/
+ g8Jx99TXyVBTCLogv0MGX8AMsmvkLDjmG2reJzOVspBiHPyxGDPIpJ37kLKvtk6P59ZRUCXzYOPqL45N5i0AwNyj3SbbqXqN63J9O4vA
+ AtauY0WfsjBFd/L/EGOyMaZvaQfOCO2wbaiuw0GfcSCfaw+gD9sOyB/9z+G6MWfIh8VKanYx1u+p32/fZ456CKsQ8fX8UDDXeKISTjgL
+ sjpcPRDzkWgvMkAfIvtV8u1iQf0+norB/HZ1dm5sLlPZOFTwBcx++3rT9Klvu5lPLNb9Q8d98F2QyzofZkzEqn5CxGzIELou5ivGdJXZ
+ NUt+oqu4V9MxKfqpa/tc+py3XbFgAfdDwgi7QUCfn11fjcU3VV7+4//5DJ+N2XY/oEPfuDa87doHqvySEFX93uT7bQ64vrsLbVqwYEFS
+ kKJvkL2m63jWgawH6ZdYEJatbDyhdv311698+qCLBaT9dZdPld/Qpi0pRAMwKASUPYNHIxnMBx54oPyZDgpFqCzSFMuumJN20UUXlR2D
+ sLkLM1N+fAeLVRatqdjkYvLWwQDbAmJMIAQYIQ45og9x7lBCLLItKoyzte222wYnlUVWqyYvBgXDwlMccCAJqPEzE9/AGcApoC9jY+lj
+ AQZbuMSwSYG8dK28wQD1EYCxdoeCiz58J9tVmA+PPvro7HsoPJTn/Pnzl3PKQ3OKMd14443Lv0FB2JibYuYzVZkWm28oLZ4S448lBwtS
+ imyP012yZEn5HTyBw66HDLHIcMe/ToaacO+995ZP/kjJovYNfUxf02bG0w4lQ0bpF5wVxoQXFUQ86apOuZrcxgzE2DDn1nf0q6jT47l1
+ VEzmTX+FnG/uhXvGfuAEEwgl82PObapew8giN3fccUc5Z2gHc2zLLbcs9RpyRKbO15s+Y7Ix5rSjmxk/dNamm25azoX9999/mXkQeiJC
+ H7YdmHP0P68m+sWunWIvYv2e+v32fbwYa1930T4C2dxPTAZyYg4/r7qMIbgLmpCPxNzC7hDQ4bV48eKZ39TDnElJXKRAu3gNAeNKkAm9
+ MHSCAOzwX/wF9ApyhUyxEEX2OBiTe2Rxgr8Ry+4byH+KDzMmbPHlB52qiNmQIXRdzFc03YF98AM43C/zD/nmcG7G1GwmpOinru1z6XPe
+ dsWt1sB+46tR5YCvZjrY5kFortIG2hLThfzOEpb0G4Eud/tKqo/gY35Jqr2v6vfU77fv43NUIiNLLvQVfv+k9FqbdbwloqvkEftF5fWq
+ q65aHHjggcu1uQqTq7qkaZXf0CUmEaP2EF4aeOihh5YTgJPnycrz2E9uxHeMDSY9g17l6KMMED6uyfPxERLXueH3RHNXXnnlYvXVV0/e
+ p2eLDCJYKKU6GAwUft9noHTF7R8W5QsXLiwX9hhqFj2Mw84777xc9New811YwLj96sLCFEeMa6HsfEHGofzBD35Q3gOR/Lr9dQbOG7KB
+ cahbyJqyRF660lcABoeTPufwuhT43nPOOWc2KMnf0u/sJa/Cn1P0P0aFSc984L2zzz571miiaJl3nJ/C53yQdYJpobHkGsiN3R9jThsZ
+ XzKh/A2Oth9FTpGhFFh8892TyhADjyrcfvvty7byot04TBglAl8oXJtnoXvEKcYJJuJOPx922GFl31Rl6bgGgQky01dddVUpCzzSGAOA
+ sSdQxoF0LOYZ7+OOO650lqqou15IJnzMADVxKGJ6nHvKqaNiMo9zzMJ+q622CgZ53XmEjue+jFS9xndy/8iGLUSZV8wvrkub2bJXt/Ae
+ m40xvY8uwI7zyG9ezA/aylxlLKvksattBw4ZRkaw0010P1kp7pvr1xHr99TvZ4GEDDD3kQHmP0Fk5i6LZRwxnHlXvoaGcUAnfeELXyir
+ 90Lb7JhLyC19V/dCrgnGs8gP6cIqbMHKQqCPYNSQARjsALIwiQQB0M8EgOyx5sjmokWLSn/W9S1iATH0FVv90GnMZeQWO1fVHsbooIMO
+ KtZaa61yPnBdKgqwMdg3zmvDB2JRz3sshGP+HHKILUWnELC2v2WOXHfddUn9ih/DOISCFiHqbEhuXRfzFbFB/I4DUn1c34sxf/jhh2d+
+ 8z9S9VPX9kHf87YrjCOybz4E+pV5gB3BntAu9DB9Fhpz2kBbYluAzLavtNJKZV/RZy5co83ah+sij/hZftWTT6zfU7+fABXjz9MP6Sf6
+ jK3WrMt5EaQi0VqlM3KDXktZx9P/pvvqXugrAor4lSk6xWA+Mt51Wzer/IbUtjQlGoBB4fpK15ReLArEzRFhxKh0gQ4+8cQTKxWgj1Vf
+ pCwy+D2f6/MU8rFAe2gXk7OuCqUOlB/GIiUjASgCJkpKZpPPWCS6KyjbPgIw4n/0IUMYDrIXLJbHYNzbgA7CMcdgE7RBEbOYiekYsl/I
+ NsFIIvXMCZwgCxATVODJKlwbecWoxALGZAW4HtUYoeulLFQsI9ckoNaXHg/Rp45qQ1O91oa5aGP6lAnmR+oi27J9yDCyHCO13+u+n8Ct
+ 6S0CG/gXBF+YuwQo6raBDAnBYBzESWG+V5OAWoyhAjAkCNh2RHl5k4DTmMDOsii1ShlsCXM05oNRTYBdIahJdSqBc2ScRSSZXgL7Ns+Q
+ 9diiEltEgsAW/wRerCKYOZZa0cL94jumjntuG5LT/qXQRD+2pe95OwZoC8EntvF1oamPEKt68knp97rvJ0j6t7/9bdafw1axQ4XgC4EG
+ fMtQQH5ouL8m6/gc4DNXJWSMFL+h77ZUBmAsc03U3RUAlBFKMvbIJxQy2yy6RlXpEK5DpC8Ei24UrwkZgse9pSwyMD5UX6RUykwjlDQS
+ Qa+L+NWBomDBGYom0+/0v/2OBSrlmLGsgYGMkGnj2n04Pow3Tug0ldyOnS4yxLwnws9rWoMvgH5AT7h7bFHmsQqvBQsWlPOGTD2ZKVuo
+ WcCEhZLpJ2SW6H9szzsBIKpeqq5X5aSxgORe2QrEdgOyR03Gsi89XkVfOqoNMb3WF3PRxvQpE1QQVW2nwCZgW8yRtQouMoJV/oCR2u+x
+ 7582WLwOEbCoAr9wnXXWafWY2RAEFZijObEEwSSrM7tiCQH3nCL6jaBHlfx/+OGHZYUtfj12zPXBLIvvBh0IBPAevwtB4AWbxxxFBtyH
+ DsQCMNhAqj04f4Nrk31umpDLaUNy2786htBPfc/bMUB1BFVbXYNKdT7Cf//733KNar4c/llqADGl34fwUYagbh2fG/wIgrTnn39+NC6Q
+ 4jf03ZbKAIwJk3uWB1tGEOyURz7ZwgUF3Bb+lv2QoYUO348zxqKC8iKElLIzyrbc/ZQhTLG6hmeuYYY5VoqXAsqkSpFRhk3/s8impI7x
+ IgJM35ozUAVjRIBvLi1O5hptZQhnhb9L0RNjBmVNMNetdqkLehhWYkoAxMCgYnTdDD4Lp1g1jZF6PQMDgX5Eh+Mck0khS2q6PJU+9HgV
+ femoNsT0Wh/MZRvTh0ywDQ95DmWksPeUUiO7LMqA8eJnFiUxJyq132PfP20wd8i69lFJ2gZ0CrqlzsEdEyz4SRYhA9OcICDRRcLLrXZh
+ XqYEKhk3/Hv3b7El2BRsi4HNScnq44ezPkj9W7t35jXJU+xqiu/oktuG5LR/MYbQT9M4b1OgLbSJ7Tf0Y1tiPsLjjz9eyi6ywbqY7+H7
+ kP+qbddGar/n9lGGgrlTtY4fAg5ZJoAcC3al+g19t6UyAINSpjTRSgnJoqKsEZyUaCmChePvZmybwMKNMkj/bAoDZW/7HLlH9oURCSey
+ WKdM2E9IRIwDaOcytA+l4GYkmkAVFEYxlPmwBSHGk2ooDBXKB6emKlNi4PCwpQP5mEuKfy7SVIZQXmQUmZfITy6GcFCsoovghclpLOjh
+ Yhl713Hj/24pN0qcDGRKxV7oerHgDfc5b968soKRv0F3tzmIraser6OrjmpDTK/1xVy2MV1lAh2BvWbbQwhbGGLfOcOAbT+cp4BzVLdY
+ Tun3uu+fJhgLKuRSDg/PgckC9nxaAhncJ7Yjd4KAAIN/uH3fELjg7Ae3gjI1YILPxrka7t+yJck9VNZsYF3CAUjaoldJUgB2CfvkBnhc
+ bMsGfj4VLDzRpI3PkNOG5LZ/IYbQT9M4b5tAm2hb2+BSnY9ARTFrH2zN008/Xey1115lQIYHTsRI7fchfJQhqFvH5wbdwBjhF8RI8Rty
+ tKUyAEM0+fTTTy8dHxwhDthi73OT6DTXOPLII1uVeN54441leWIVCC8LI0oWMUAYnLrII9DBHN5YN1HmChg5HI2mho3tRZw2HdtKRBCM
+ 8yvof7ZRXHPNNbXyYQooxZkW4yBVhhhbFgOcU+IGCvrGZKjJeSZtMAfSdT5pF9F0AkA4q+bA43S6+22rMok4/gQvwYI5lD6SvWZfPnAd
+ c2IN/3rMM5x7rsk8uuSSS8rScoN+4fBD9DeHhmFg2vZVFz2eQlsd1YYUvdaVFcHGdJEJkjk4xlV/h5xyECPZRWwLvgfnH7nzK0Rqv9d9
+ /zTB1g78tEnZUuYsZ1pNysFuCmOOzJIsyqkD+B7sU0rgogsEN9wtPlahiW3iDAw7IDZkU/wAPgEb/GirOgN0Mz4e12IOcsYE0D6zY2B/
+ iz0ysGtms7Cb/mGgXIOtUBy2GjtgOIWcNiS3/fMZQj9N27xtA3LHeURNE08pPgKBEYIo2CdePEghJViS0u9D+ChDUbeOzwnz55hjjinO
+ OOOM6FxK9RtytCV6CG8foLyOPvropMdC54YJwmMjUdYrEpwYvttuu40imopDgGOCXIjpoU6GWDTxyF+yADkfeWffQ5AHBy8nZJQ5kMvO
+ bKHyDyePcmf+T3kon8FxZOujuz0CB9jNBiLv/B1ZQoP93QRzqG4566yzZgPIGAx/K6V/PRxuHG+uQRaUV05y6/Ex6agurEg2RrZ9xYbS
+ +x122GGZIPOYwQlHZ3Pobs5gr33PvHnzOh8EWgeBe+wA38PCjcQECVPsws033zy7Lc23KdgpEphudadtW3VtCQEk7A4JBw6fpDqNYAtP
+ qHO3WzD/sGVu4oW/xeaRpOBvc1eQ5LQhY9J1XZm2edsF2oj8jiHQtCL1+6TBLycYTCArFnyZtN+QPQAjhJj7DPEoT6o+zjzzzDJjlnJu
+ Sh+goMl0kMnbY489yqAP301W3jIUGFZKualuscAi2TLOSDJwXPk7N8vHtTngDwfajawTpMRZZvuF4V8Po8LWP5xggjoxIyOEECsy2CRs
+ EzYqZ4KALTtU9xIESdkG1BXaRaUj9oltezySluAPW4B5MpHZBd+mUL3CtgmqWgzsGDaKA98N3uO6VBKYvWNxQ1KC7UOWNeZzZJHdLV18
+ nqAM78/lSgshhGiDAjBCiE6QTcThy1WiS1CD7TkEOdh3y2tsh5Ph0HJPFoDpCg5t1d55IYQQ6eROEFDdQYCc7QjYJwIwk3i6W4y+bQpb
+ 0FeUrfxCCNE3CsAIIVpDxottOfvss0/rcwgI2lAhwrYaXgR0cF55PCWHflPxYoEXXk0fVTkES5YsWaZCpSuUgWs7hRBCdAN7wrYjKgbb
+ JgioZDH7xItKRnQ0h6Xa0zhdG0Xlx9iesNWnTbGKIm2nEEKIdigAI4RoBeXW7J90Hc8hXhw+2+Qw8Nzg1FP9kvJ0uBQIamlbkRBCdMPO
+ 7QrZkZwvOzdsLPRtUwhq8ZSeMbVRCCGmCQVghBCNodrFHkM+9ItHzY+J0BMmukC2NffZAUIIMZex6syQDcn54myU3AfEN6Vvm8KWWyUI
+ hBCiPQrACCEag/Plbhsa6sV3yvETQggRw982NNQLG0VQXgghhKhCARghhBBCCCGEEEKIzCgAI4QQQgghhBBCCJEZBWCEEEIIIYQQQggh
+ MqMAjBBCCCGEEEIIIURmFIARQgghhBBCCCGEyIwCMEIIIYQQQgghhBCZUQBGCCGEEEIIIYQQIjMKwAghhBBCCCGEEEJkRgEYIUQn7r77
+ 7mL99dcv9t9//+LVV18tzjrrrGKjjTYq1lprreK+++4rnn/++WLfffct1ltvvWKrrbYqlixZMvOXy/PRRx8Vp556arHGGmsUV111VfHc
+ c8+Vf7v22msXO+64Y/HKK6/MfFIIIYSIU2dTXn755eLOO+8sttxyy2LNNdcsjjvuuOL999+f+evlefvtt4uDDjqotG/YvsWLFxe77LJL
+ ef3DDjus/L0QQggRQwEYIURr3njjjeK0004rgyoEXbbbbrti6dKl5e8uvfTS0kk96aSTinfffbd87brrrsW5555b/j7E/fffX9xwww3F
+ HXfcUay66qrFKaecUjrD77zzTrHTTjsVl1122cwnhRBCiDgPPfRQaVPuuuuuoE3ZfPPNi1tvvbX49NNPiyeffLK0WY888sjMXy8P18JO
+ XXTRRcVqq61WXHHFFcUnn3xSJgdIMtxzzz0znxRCCCHCKAAjhGgN2b9bbrmldFjnzZu3jONKAGaTTTYpXn/99fJnc3h5v4rLL7+8zFAS
+ pNlmm22KN998s3xfARghhBBNWbBgQfHss89W2pSTTz65DL7AE088Uay77rrlvyE+/PDD8jrYtIMPPrg45JBDygobUABGCCFEKgrACCE6
+ c+WVVxabbbZZWREDH3zwQbHffvuV2UYDJ3idddYpM5Ix3nvvvWL33Xdv9bdCCCGES6pNobrFtWNVEICh4pPPG1yH63FdIYQQIoYCMEKI
+ Tnz88cdlNvCII46YzSRaNvC2224rfwb+v+GGG5bnxMR44YUXyv35lIwbqY6xEEII4RKyKfyf9/gdmB076qijZu1YFVR6cl6MWylDcIek
+ A8kHIYQQIoYCMEKIThAUITjiZwM5lPCpp54qf7aKGMq933rrreK8886bPeiQUnD20Bt+JpHP7b333qWDywGHl1xySVkKLoQQQtQRqk7B
+ nlAVQ3UMWEUM57s89thjxU033VS+j23CRrn4CQEqYthuy/svvvhiWREqhBBCVKEAjBCiE6F985zVsvXWW5fBFnjttdeKDTbYoMw68sQJ
+ c245Q2aVVVYpD/K1rCNnxLA335xe+1v21lNF41bVCCGEEDF8mxI6EB77QtUmFTE8ye+ZZ54p3z/jjDNKG/Xggw+WP2OnqPZ0Kz45vJcn
+ AfIvZ87EDvEVQgghFIARQnSCR03zGE738ZvHHnts6bgalHcTZKFsm0eC2sGFVMjwHplInGI4/vjji/nz55f/Bz57wgknlBU1OMz2t0II
+ IUQdvk2hYoUKFhIABttmeYofwX4eL21cffXVZQBm4cKF5c9UzOy1117lk/oMbB+PtiYRsWjRototTEIIIVZsFIARQkwUHNoLL7xwNgAj
+ hBBCjAUSBe4WWyGEEKILCsAIISbKkiVLlslOCiGEEGOBba/u05KEEEKILigAI4SYGGwnovpl6dKlM+8IIYQQ4+Cll17S1lchhBC9ogCM
+ EGJihJ4wIYQQQowBnuCnR0sLIYToEwVghBBCCCGEEEIIITKjAIwQQgghhBBCCCFEZhSAEUIIIYQQQgghhMiMAjBCCCGEEEIIIYQQmVEA
+ RgghhBBCCCGEECIzCsAIIYQQQgghhBBCZEYBGCGEEEIIIYQQQoisFMX/A2IP+9+ZsJeHAAAAAElFTkSuQmCC"/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="480.013" height="31.7151" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_dual_self_guided.svg b/third_party/aom/doc/img/equ_dual_self_guided.svg
new file mode 100644
index 0000000000..c936f46f46
--- /dev/null
+++ b/third_party/aom/doc/img/equ_dual_self_guided.svg
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_guided.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.89143in" height="0.748518in"
+ viewBox="0 0 208.183 53.8933" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+ <image x="0" y="36.75" width="171.433" height="17.1433" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAAAZAAAAAoCAYAAADQUaxgAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AAAv7SURBVHhe7Z35yw5dGMf9A37xm5+UkiRJkiJFhJAtW9lly5Z935UQsv0gITsRRWSJ3pdEsmXf9zU72bfz9jnvnMc8Y+Z+
+ 7plz7u25r09NmGee+54z5zrne53rus6ooARBEAQhASIggiAIQiJEQARBEIREiIAIgiAIiRABEQRBEBIhAiIIgiAkQgREEARBSIQIiCAI
+ gpAIERDBmq9fv6p//vlHffz40TsjuOL06dPq5s2b3r/KP2JLmeP58+f62f769cs7Y48IiGAFA33y5Mlq8+bN6vfv395ZwRUM+t69e6sT
+ J054Z8ovYkuZ5cePH2r+/Plq+fLl+u8uiC0g3ECFChVCj23btulrvn//riZNmhR6zerVq/U1hcidO3dU/fr1Q9vVunVr9ebNG33dhQsX
+ VLVq1f66plmzZurp06f6mvIARjhv3jw1Z86clAZZbDaDHbRr1y60LXXr1lW3bt3S1z169Eg1btz4r2tq1KihLl++rK+Bq1ev6s/jz0KB
+ lcSWLVtKxkv16tXV0qVL1efPn70rSiO2FI5rW0Kkhw4d6kykE69A7t69W2IcEydO1J3mh5tD6TCcjRs3RhpOIULbFi1aVNJB586d837y
+ hwcPHqgGDRqoXr16aeEpjx7VkSNHVNOmTdW9e/e8M6kpRpvZuXNnyWA+cODAX3bw9u1b1bVrV9WmTRsdrgoLL/A7q1atUp07dy5xUvKZ
+ Fy9eqEGDBpXqw2fPnqlu3bqpMWPGaHEJIrZUNi5sCXBweXZcY0tiAfn586eaNm2abkyYZ228pjNnznhnyheoOuJB+4NejfGmZs6c6dRw
+ nzx5kjfxcCYyJjQmtqAhR1GMNkMbaSttpu08AwPPDU9w4MCB6vXr197ZcMznbNq0yTtjB9/NRMKk4xI83FGjRql///3XO/OH2bNna6fq
+ /v373pn/EVtKD1e2ZJ4dIm87P1nlQE6dOqUqV66sG4Q6GojbooRhRlRewIsaMWKEbnuHDh1KPEPTkSwTXScCmTxcTSC20N+1a9dW169f
+ 986kR7HZDPZgVqt4fXjOBtpKm2l7WZjPCZssk8DEMW7cuNj9VxbYJx5/UAj494wZM0qFXQxiS+nhypaAZ1elShV1+PBh70wyrATk3bt3
+ qlOnTrpBffr00ROm6xhbPrN7927d9ooVK5Z0RNyOjEO+CAiTD95LEg+mGG0GL9hMdGa1miSvYT7n0KFD3pnkZEJAEDacqrAwG+dwtPi5
+ P4QlthQPV7Zknh25Iv9KJi5WAgI0gsYQzjl//rwO3XCkSoSVF4JLykuXLsXuyDjki4Aw6eAxrly50jsTj2KzGSY1Jjgz0d2+fTuRh0xu
+ AXuzHfSQCQHBoQqGc4FY/Lp167TXHBwbYkvxcGVLZkVI4p3QeFKsBcSfzKpatWpGQjf5in9JWalSJVWrVq2MLptdC8irV690otH0H0va
+ KVOm6IkqFXjAXH/06FHvTDyK0WYIsdBeVqtU6CXxkL98+aKGDBmiPUc8SBtcCwiCNmvWLJ1XQUhM/5LEbtSokWrbtq26du2ad/UfxJbi
+ 48KWgKo1VjM2eSJrAfHnAqiiyEToJp/xx2HXrFnjnc0MrgQEY2NDEYMbL+T9+/f6PMJBpQyrKNOP/Izkpz8sgbeIWIZNCOlQjDbjn+im
+ Tp2aeDMXfRGWR4iLawF5+fKlrrBC2Pbv36/at2+vmjdvrr3kvn376raHrczFluLjypaOHz+uP8OUPyfBWkBg+/bt+kb8uYBMgrdDvTjf
+ mfQYPnx4qVhsUqh4YMLlMxksmfR+XAkIqyRWG2E19/Qf/cjKCqFBIMePH6++ffvmXRFdTROHJDZDf+3atUstWLDAO1M4cO/Dhg3TbbZJ
+ hK9fv157nXj6NrgWELxY7CnMEzb5j7AS3lzY0sOHD9XIkSN1PzRp0kR79IUU8nJlS4g24p00fAjWAoJXwRLVeOEu4rOFAkZHvNVsGnQx
+ sFPhQkAoA8Z74cCTCWJi0myMZEVC+/yDkomHhGeLFi10CCwJcW2GPTUMmB49eqiGDRvqSaeQYFIlzIB9MMnR5qSJcPqf38d7tMG1gHBf
+ hK7CwPmgtDcYb8+FLWHTOERmpYIzZRMGyjYubcmMdZvxZCUg3ECrVq10J0SVl5VXEA88YTwZszGH9hvPPQmmQ/mcJAd5GFYMUXBfpp+i
+ BhoDmQFNmGTPnj164jYhLrAd9DY2Y747icGfPHmyJMEa97BZVfPMt27dqkODTHZ44rQ5WI2ULnEEhHJark16sLIOq6gKgkBMnz49UoxM
+ v2Hb/mtyYUuInN8JM/fgf5NEWeCEhdlJOseOHTsSO9iubcnMNzkREBTcn/1PtbGuvGG8AJOwwyBSbWpyBYbvN/64MEAYKNxnlLdoBAQP
+ hxg2O4T92Ax6W5uxERC+m7xPkuPx48fep8SHtvrLuv1VQ/5XTKRLHAFJBc/S1QqEVcXYsWPVhw8fvDOlMSWjwVBVLmwJ22HSvHLlinfm
+ z7l0nwXtOXbsWKitlHWQu0rqYLq2pZwJCJNmsNYaBTTJrJ49e0YaEzdNgg0VpRO4liUoccxCIdiR4E+mu6jTD8NWQIzBpEpaGgGhHXhM
+ wdiwCUcglGVVa/mxsRmDjYDkArzEYFm3PwGaJPZM/xO6YEVlg0sB4V7If0TBCh2HhL7jew25sCWeH/k/Sn7B3EPLli11IUC+kglbMvPB
+ 3LlzvTPxiS0gdBRfGDa5mPIyJtKwUAqdtWzZMv3ir/79++sHwkMgDBQ0rlTg8ecqiR6118MkCvnspEvKsrAVELwfQlOpBqwRkFQeWVyP
+ zcZm/BSSgLBq8XvIBv9Eh72kGzYx8BzjPPsoXAoIk1dUnzDBE15C9A4ePOid/UO2bYn7oRzawPcyJvxilG9kypYQfvrFZk6JJSB0GB1H
+ B4ZNkMGNdcFYH0vdDRs26D+pfiC2jCFTihYVUsknjBeAiIRhu6QsC1sBwXOjUiyq6oX+ZSDhoaVapXAPDNJ06sdtbcZPoQiICa8Q/gub
+ lLB7Bi5HnPwKz4bcVVyPPQxXAmL6JKoCkUkPewqb8CFXtgR8Hq+Pj7q3fCBTtgS2e3AgbQGhcyZMmKDq1KkTObHQYXQIN5UqmYXyISA2
+ OyCzCcbFw65Xr57eNxHlqbBUN3FYm2R6FLYCAsTOGdD+QcjKECPq0qWLWrt2rVqxYoVuA20+e/as9nL8k4PxXMoSfZc2A/kuINTj84ZT
+ 3obKCvvTp0/eT0rDczGv3o6zWiUsQ3iGzYR+LzoJrgQER2TAgAF6rwcOlGkLn8+KAEcEe4pqY65sibHJ6+YXLlyYl+KRaVsCVo62e4pS
+ CgjVNxgsNxc89u7d6131P9TnUwUUvA7j4C2bfrjxOCGrXLFv377QNuER+CuTWDryNtHgdRx4RDZJWD8uBAQuXryoBzxto3/YgMWOdJPI
+ RCxGjx6tf4YBB98AjNHSrrC6/0zZDOSrgBBPZ1NmsB2UHLPnwMCkT7I5eB1HzZo1db+kwoQgUyWJ08WVgOCQLFmyRIsb/1eHeQ5M9NgH
+ JdipyJUt4TAxDxnxoG/yQUiyZUv8Po6I7d61REl0G0zSavHixd4ZIV1cCYgteHqECJLEXW3IVwHJFnjprsKjrgSEcWxTNJILW6LNFO0Y
+ weAeEEHbsGAhwcqRULatM5J1AaGT8Dhs4m7FCvFQmx27LmF5TWVN3LirDcUsIIQmCFEQonHhKePtEwoqq/ItFfwuBTC2NplNW2IMdezY
+ UYfQKVPn4O/YVVSYqDxCeJF220ZHsi4gLMO7d+9eqgRWKDyYxJjMshGKxDMlDozBE6Ygh4MTQvGFbS6gUGCSJa6fyTcdxAXhIC9hO/Fm
+ 05bI/YWFfYrpDRom5B7nP/CKIusCIpQf+G9KectqWHmm4A5i1CSq2VluO+BdQujKVShabCk7YD/YEfZkk/swiIAIVlB+GbYvRnCDGfB4
+ 6C4GvEtu3LjhNJIgtpR5eLYUAbn6r7FFQARrKPUdPHhwUSUhswXJXiqUMh3ayRfEljIH6YN+/fo5zaOKgAiCIAiJEAERBEEQEiECIgiC
+ ICRCBEQQBEFIhAiIIAiCkAgREEEQBCERIiCCIAhCApT6DyBwOP/MSHc/AAAAAElFTkSuQmCC"/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="171.433" height="17.1433" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_dual_self_para.svg b/third_party/aom/doc/img/equ_dual_self_para.svg
new file mode 100644
index 0000000000..d294bcae25
--- /dev/null
+++ b/third_party/aom/doc/img/equ_dual_self_para.svg
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_dual_self_para.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.99855in" height="0.813996in"
+ viewBox="0 0 143.896 58.6077" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+ <image x="0" y="36.75" width="107.146" height="21.8577" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAAAPoAAAAzCAYAAAC+CxVBAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AAAsuSURBVHhe7Z3pyw1tHMfvf8AbL5RXXkmSJC+IFBGKkKzJfqPsW1kiWbKFhJCkbKHsb+xZEonIEiL7viRk366nz/XM777n
+ zJmZc86ca44zZ65PTY8z59xnZq75fX/bdc15qpTFYql4rNAtlhRghW6xpAArdIslBVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQ
+ LZYUkGih37x5UzVu3Fg1b95cderUSXXs2FE1aNBAb/ybfa1atVL169dXly9fdv7KYkkfiRb65s2b1bZt29SfP3/06+fPn6t27dqpBQsW
+ qL9//+p9Hz58UBMmTFCPHj3Sr9PKpk2bVJ06dVTbtm21A+S/vBYnydaoUSPVu3dvPWZJ5PPnz2r9+vVq586dzp74SFqQSazQv379qtas
+ WaM+fvzo7FF6QBnYY8eOOXv+/9zChQvV+/fvnT3pAwFMnjxZ3bhxw9mj9Bgh9HPnzjl7lLpy5YqaNWuW+vHjh7MnGVy/fl0NHjxYDR06
+ VItv+/btzjvxkbQgk1ih37lzJ8tzb9myRTVr1ky/J3z69Ek7hG/fvjl70gfjsWPHjhoDhJUrV2rDxEAFPrd69WrnVfJ49+6d6ty5c+xC
+ T2KQSazQiTruyPP79281c+bMrNQTj/vlyxfnVXnDuRJh37x54+wxw969e3WqKTAe1dXVavTo0doYhWvXrqn9+/c7r8zy6tUrderUqZoI
+ GAelEnoSg0yia3Q3eM2uXbuquXPnZkSupPDr1y9tFEuXLtX/jgJiOnDgQIZ4AWG7BSZpJlHdjdd5FgrnfebMmYwSQeA9ro1rjHp9uTAt
+ dMbx8OHDWal3EoNMWQv94cOHauTIkapu3bq6nuzTp4+6evWq824mRKOGDRuqgwcPOnuSA46J1JoISz0dBYySv8fQMfgwLly4oMcTUZqE
+ 7yN9DRIa1zZ27NisMsIUpoXO91RVVWX0MfwoJMjgFHB27oZoKWy2LIXOYJFuIvL79+/r13jHPXv2aDFfunTJ+WQtu3bt0u8h+KRx69Yt
+ 1aZNG9/ryhciOUbTokULde/ePWevPxs2bFCtW7c22iTC2HHECGPRokXO3my4Py1btizqWoMwKXSCDPeE68G2wigkyGDLZDyHDh3S49C0
+ aVN1+/Zt5934KEuhnz59WneJvdGN+oc6yJtySuqEV01ad/379+9q6tSputsdNaV9+fKlvnaM0lsneqFeHDNmjBo0aJCuIU2A8dKB5vhs
+ 8+fPd97Jhns1Z86crP6AsG7duprpvrBt69atzl/UYkro3Ae653I9ub4vSpARWy7VdGbZCf3169dq0qRJ2ni94PnwgN6IwUAxYIgdQ0oS
+ Fy9e1HOvudLDIBDZqlWr9HwuaTNRndQ8CKnPw6JuoRD9EB+GizBwJGENKLnmEydOOHvMYEroZBuMZ5MmTfT1hI1V1CBDd57vdk/HxUnZ
+ CZ2FHaTtfrAfY8ZQ3IgDoPOZJLjB3OguXbqot2/fOnsLg7QfIyNtFKGFOQ2/aaBikOi3bNkyHak5flC0FuJyzCaEThbJ+WOH/JfrCctQ
+ olyL3He+u1Q9pbISOoM2ceJE9eLFC2dPLU+fPtUGvWTJEm1c3BBSXiIJK5AYNFmllJS5YMSNyKMavKT9pM1kQqzIYhzCDB1naLIuJPph
+ 6GRgfDfHR2yILggMncaVdx6/WEwInV4HAqes4Ry5njDHFSXIiHPIVWaZJKfQaYKdP39eDRs2THe/ufAOHTqo48ePG58TpcaZN2+eNlrS
+ P+m2051kMNeuXauNu1KQJk7UTOTo0aOqX79+OmUUI/cTOlG/Z8+e2gmyzFXGlNfFLM9kwQir0RAHcNx8hA7UtWQWJpaH0rDt27dvTReb
+ a4zi8J89e6aDiTQKieR+Qi82yEh9PmDAAJ2loifOO2xWqVhChY5XGz9+vO4OUvchbLzxvn37VL169TKmSY4cOVJzw6OCoZDKYCTTp0/X
+ A8fAM6gMjETzSkHqtCjTXIh74MCB6uTJk/q1TK/xfWGppkkk+knTlJKB4+cTqeSzuTrapQI7ptdBSi02VojjKgRsnO/F0S1fvlyPH9pi
+ NgSt4ZhNEyh0Dk5U5aZ5D8x7eHJO6sGDBzUpZDHemQUI1HhBKSWejwZOHNMybtzd4yhbjx498m7KYEhRohpGuXHjRj3mkuG4hV6KRUPe
+ 6Aci3nxKA0l5Me5ygOyK66GxKIjQKYlMrVbkvkhJ4A1cMn5xNOh8hU69SHOFg65YscL3oJLW4JHx3izeF88eBepVDJf6xQ9pIpWLYZiA
+ MYxSp929e1cbpffvglJN04h90I129xYkJeUcwhqCIJ8tVfYRBmM1btw4LWw3hWQo+SL1ud86BjleHPfPV+hEaaJ12NygeDtuFJ6JerEY
+ EDLfE+TJZBDKwTBMEUXoRADm3Ino3rESoVdXV8e69BKboOYnqrtxCz1XOVJOQsd2yVDdD6mA2JzJ5qX0ZfzukWiqZEInSnNAPE9QhJWT
+ ogHCvHcx0RyI1GFTPlLXeBfLJJkoQpdlpoxF0Ga6pnTjLhHCNm909FIuQqfx2717d99rcG+5MpR8CbJjMiNmX3jP9LQj+ApdIkNYrSdC
+ N9E8YHHFjBkzAg1eBoHOpKkBLwe42YVEC2p/VrQF9SnEiExPW7mhATdt2jTf2Q9JSzmHXCWWCN3kwp1CwbZpwLH52TmpNSk212Nq3YFo
+ y5vxyFQrNm56IRH4Cp3B52TCpn3cqXuxYJS9evUKFLqUEvy4gDe9Mk2pm3H8TT7OC0Pk3KiNg7y9pJqFZgn5wlz5kCFDdI/AD3e0z2UX
+ 8mBNrsgfJwQopo2DGm1hU5ZRYVz87g/3jvGIy8Z9hY5H4aBBafKTJ0/0HCADEBT18VjMZ44YMSJnGslFMl3n58mk+x/XtMO/RIRJJM4F
+ 4mLO3FsXuxHxmJqfdoNzwcn49QYEWUfPNfGsQtgjr8VMLZoAp8TUcVhvCYdN05PzNFUy4jC8vS9snAe4vDbOfhaQoaN8bCQMX6EzCHQh
+ ObA8W8zNxaMz79e/f3919uxZPe1AKomQaRDxwwKCpCj5GDKDOHv2bB3VWfwAHI+pjlGjRqlu3boZa4aUEzxlxtNmuYyIVYGMM7MSP3/+
+ dPZmw72iFDAtIBqAu3fv1pEozIEwF8x95Pi5GoKk9vk8aRcH2DfpOv2lx48fO3uzYR3J8OHD9fWYmrKUp+Jkao1jULaiNe/YSnnD8cP6
+ ZfngK3RgMPihPU6AA+FVuHmskpMVcYgdw2IFG08duecE5T28F+uGg+A4OAkGgIhEN5eoxHeyUogH/ytpNZwbPDbd3qAuK+PG2Ls3v5kQ
+ KQH8Nu5hVFiKLMtq3Rv9ErfDkfrS+zk2+gU4KjcS+bn2Ypu4hSA/6Og9R+8YuQXm3VikVOxTf2RnpOjYObpavHixb/nA2EyZMkWvH2F8
+ +WGRqAQK3RRElrD6hoYHXftiBy+pIGacKX2ItCBNrrAAYKmFzIiMopiZlNiFTooU1myiVjNV/yQRSeXSZPSscmzfvn1ov8FSC46RrNcv
+ 68uXWIUuD6f4PY0mIPJ/1ZApB6j7aHAxl8t4VTo0uCjJwpp6lkzIiIsNBLEJnXqdHwMM+zF90nXSdu9SwLQhXddKN36ujSlCrrWUtXmS
+ oQtPb6zYtfaxCZ3uOYbrbtB54WbTWAr7TFrA2THrwM9oVSoYLVOEQfPwlkxoevM/ijCxJiL2Gt2SP4gd712JQmAajamqtGdv/wordIsl
+ BVihWywpwArdYkkBVugWSwqwQrdYUoAVusWSAqzQLZYUYIVusaQAK3SLJQVYoVssKcAK3WJJAVboFkvFo9R/WJivBGHVOxsAAAAASUVO
+ RK5CYII="/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="107.146" height="21.8577" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_edge_direction.svg b/third_party/aom/doc/img/equ_edge_direction.svg
new file mode 100644
index 0000000000..d36634db1b
--- /dev/null
+++ b/third_party/aom/doc/img/equ_edge_direction.svg
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_edge_direction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.2307in" height="1.4152in"
+ viewBox="0 0 160.61 101.895" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+ <image x="0" y="36.75" width="123.86" height="65.1446" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAAASEAAACYCAYAAACrr18SAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AABsPSURBVHhe7Z3nzxRVG4f5B/jCBxI/GBMTQ/xgCDEGgzFANGKE2CIIQRELIh1sAQyCEKkiKHaNitIjVUikWbECooCCImBB
+ RFFUxAKW8+Y6zL3v7LJ9Z3Zm9/ldyQSe2dkyM+f8zt3OmVZOCCESRCIkhEgUiZAQIlEkQkKIRJEICSESRSIkhEgUiZAQIlEkQkKIRJEI
+ CSESRSIkhEgUiZAQIlEkQjXw559/usGDB7tWrVpltk2bNgWvCiHKQSIUAfv27XMdO3Z0nTp1cl9++WWwVwhRDhKhCHj33Xdd69at3cCB
+ A90ff/wR7BVClINEKAKeeOIJ74o9+OCDwR4hRLlIhGrE4kJYQooHCVE5EqEa+fbbb12XLl1afDzo77//dv/++2/wlxDlIxGqgB9//NHN
+ mTPHB6GxfAYMGODWrFnjTjvttBYXD+JcOWfLCnbr1s398MMP7tVXX3VXXnmlvz5t2rRxN954o9u7d687duyYe+yxx/y14/h27dq5cePG
+ uZ9++in4xJP89ddf7qWXXnKXXnqp/wyO5T28l88IM3HixMz328bv4D7NmzfvlNfYVw6cB7+N38j7LrroIrdx40b333//BUeIKJEIlcnW
+ rVvdBRdc4MaPH+87A6P+iy++mOkoxIVaIt9884278MIL/TZ69Gh/TRCo8PW55JJL3GWXXeYWL16csZief/55f93o7P/880/wac4tXbrU
+ 77/++uvdr7/+6jv+559/7rp37+6uuOIKd+jQoeDI//Ppp596oTr//PPd/v37/b7Dhw+7Pn36+O/evHlz2VYan3/VVVe5Z555xgsi8H7O
+ b+fOnf5vES0SoTKwRt6vXz/fMYwjR474BktHI0OWFnbv3u3at2/vO3M1GxbM+++/H3xacbA6sD543+zZs7OsBXNVee2pp57Kes1+o1ku
+ xurVq/317Nmzp7++xrp16/zn5IoW8Lnz58/375s6daoXuu+++87dcMMN/t6VC58zc+bMrN/0yy+/uGuuucZbu1u2bPH7CsH7165dmxFC
+ UR4SoRKY20ED37BhQ7D3JNbJ2Ph/1NCoP/zww1g+OypMhM466yz38ccfB3tPYq+dc845bteuXcHekxQSIcDSREjC2PGF3F7eQ4LgjDPO
+ 8C7ysGHD3IoVK4JXy8PuNd/zzjvveOuJ34EFt2rVqlN+UxjcxQ4dOnjBQrhE+UiESoDwIEAXX3yxO3jwYLD3JIyMjJCjRo1yx48fD/ZG
+ B7EJvveNN94I9qQPExo6LkIRpthrxUSIzk6mccyYMd4N4vqblVYs9mYWK8eNHTu2qGjkg3vIvbTvQtBuvvlm99FHH2VZcYXAJec351pq
+ ojgSoRJMnjzZN8h8jcvqg+KKByFyxKH27NkT7Ekf1YqQCWyuCJFhxMXFJZw+fbq3ArFITLSKiRBC8dxzz/l7cscdd2RiOpVA/KlHjx4Z
+ IWJDjEqVX1ipxqJFi4I9olwkQkUw85yGOHfu3GDvSazRlRMrqBa+kwDt0aNHgz3po1oRstfCIkS8jbgblg8xnrD1UY4I8X4ylgSo+YxK
+ 3TFzAxE9RJL4DrEp7j+ZuGIglohXrksqSiMRKkJYhHJHQosHMZrTYHHVRowYkYkH0KCJE5DV4Tg+h+PgzTff9GnfkSNHeitn+PDh/nMm
+ TJiQeR9/MwITZyDDs3DhQv/ecrAOayN5pVs1gekoRMjcW7umYejcxJ24jqT1CUCT+jcQLALj7N++fbt3y9iwbMphyZIl/rzvvvvuLNea
+ WBYxLSziMFhZCxYs8FYb9weR4lxyf7cojUSoBEzFoGOGRYiR0tLPFg8idsSxdAbSvNdee61P8zKysg9hwUWgA5EpQsSohRk6dKi3dGjU
+ iBHvARoznZGsUJqJQ4TyBfotdY8Icfxdd92V9Zmvv/66T8mbCCDa3B+sVYS9GOHBBhcw7HYjxtQLhUWZe4rYITzcN7Pg4ooNNjsSoRIw
+ kjKiTpo0yTc+GiwjLjUrWCrEirB+sGqoI+EYju3fv39W4ye4jKi89dZbvkMxshN0tawRQkUNEp0GV4/Xu3btmup4EJhYYi3s2LEj2HuS
+ 8Gu52TGzMMJWD9cWUUYM7r//fv83go/lSGyM643l8cUXX3jBxvrkelMewesIvblwCEPv3r29EOHWclwhzIoaMmRIxvVln8WHEKbw+3HT
+ qJBn9QSwgHZLrRWrFYlQGWzbts3HBmjQdBxG2RMnTrjly5f7vxkply1b5hsuozMjP0ITho4QHuFXrlyZlc6l8RP/sQA4Ac40p3vD1kN4
+ 49yxGrFwcl9j32effVbwNSwcc0cty8W1nTJlihcqOj9/I94IT27FdNjiyldNze/ld+eD/Q899JB3fzkW14x7TgU4QmjYeYetHn4bv79U
+ 8FrkRyIUMVg8uTUzNlJaJ0BkEBssHxu5yQoxujK1IN/rIh1YLDCcqMCNxGrVWlLVIRGKGHO7zMUAzHYEhpEcsG6wcrCGDCwnGvKBAwcy
+ r1u6l3lLbCJ5bLAIWz0I0i233OIrvIn3lYpBiWwkQhFDnIIJnFQ6A24WcZ5wXMEyPTaVAeEhkE1wFSxoS0MnyM171bDTAfeBeJ+528TB
+ cB1x/xhsyLKJypAIxQCTXQmgkobPF1fAArr88st9hoX0LrEg3mOuF/8Sd6LehQBs2KpqaXDuzKpPEwSssVSxeCmrYLDo3Lmzr67ON8FW
+ FEciVGcs3mMBaJEfJoEi4ASIw26raD4kQnUmN94jCkMQWFXIzY9EqI7QmXDRSPXjalVSBd0SIQ2P2xpe0kM0HxIhkVo0K71lIBESqYE4
+ EFXLZAYHDRrkK57D9TiiOZEIiVTAWkBM9mXyKVBTxTSNuFYoEOlBIiQSxyaAsrSqlSkgPlhEuRNZRfMhERKJQ51NrtWDG1ZsrpdoHiRC
+ InEIQIenuthcOz3RtmUgERKJg9iErR7ECFFiHh7r+KR5jW1ROxIhkTgsCEcBJ/VALBLGKoa2LMcLL7xwygMGRHMhERKJw8Re1hBinSAm
+ /7JiABNCWaiMCaFazqS5kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohE
+ kQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkBAiUSRCQohEkQgJIRJFIiSESBSJkDgF
+ Hru8detW9+WXXwZ7hIgPiZDIwDPhv/jiC3ffffe5Nm3auE2bNgWvCBEfEiHhdu/e7dq3b+86dOjg+vbt6y644ALXqlUriZCoCxIhkcUf
+ f/zhBg4cKBESdUMiJLKQCIl6IxESWUiERL2RCIksJEKi3kiERBYSIVFvJEIiC4mQqDcSoTx8/PHHrnv37u6SSy6p69a7d+/ECwQbUYSO
+ Hz/uJk6cmPeaxr098cQTvrhTVI9EKA/79+/P1MrY1qtXL7d+/Xr32muvVbStW7fOPfjgg27q1Kl+Gzx4sLvwwgtd69atsz7ftmeeeSb4
+ FcnQiCKECMyePTvrOrZt29YLRL57UmqbP39+5n4hbldeeaVr165d1ufbdvHFF7uDBw8Gv0RUg0SoAK+//ro744wzMo2N/7MvKqhO/uST
+ T9y0adPcOeeck/meq666yh05ciQ4qv40qjt27NgxL/B2Hdn4m/1RwX1ZvXq169OnT9YgsnTp0uCIxoY2uWTJEnf11Ve7jh07uhEjRriv
+ v/46eDU+JEIF4IYwEoYbNdYRVlLU0PFffPFFP9rSuDds2BC8Un8aOSb06aef+s4TvmdYSFG7S3ze3r17Xb9+/fx39O/fP1KxS4pFixa5
+ BQsW+PMzUb/iiivcoUOHgiPiQSJUBEa+nj17ZjXqqEfXMMSDiEUNHz7c/fXXX8He+tLIIgQrVqzIslKitmDDMFDhPvMd77//frC3Mfnl
+ l1/cHXfc4Q4fPhzscf7+cy0RpziRCJUg3+j61FNPxRaM/Oqrr3wMguB4EqRFhBBh4jOVCn4+C7Zr167uwIEDwRHR8u+///rY09ixY/13
+ p5Fff/3VrV27tujAZvMHmbz8zz//ZO0jLhYnEqESIDYEKus1ugIN5t577800hnpy9OhRd9NNN/nzJP4Rl9gWA+GhU3Pdq/l+3AfcCLtf
+ bHFasHTyQYMGuZ07dwZ70oW1Ya5poWvw/fffu8svv9zdfffdPtsIW7ZscaeddpqbN2+e/zsuJEJlwAjHDQw36jh9ZUZXzGP+rQc//vij
+ 69atW9b55W5xN0TDLJlJkybVZFlgSZ599tmZ388gUq2olQMWJFtasevKVui6njhxIvMaA+DkyZMVE0oT+UZXfOikYjfNysaNG91FF11U
+ cwKgkAW7efPm4IiWBy4pJQXEzUqBiFMHRTgibiRCFUCMhIZsjTru0bWlYYmAqGJuWCZDhw7N3C+2eozsaYZAOm4X7lchuD633nqrjwnV
+ A4lQBdAxSKWHGzWBu3qMFi0B6m24nlE2/u+++85nHMP3rFZXr5HZt2+fT7QUKoolZsT1+fzzz/3fhAR+//13//+4kAhVCDcptyiOAsMf
+ fvghOEJUg2Xl2KKOreQWnmLBLly4MHi1ZUHQedSoUe7666/3SYgwCPPcuXOzpg7hlilFn0LyTesoFvATpbF0MOnuqMGCzZ3WgTXQUi1Y
+ hOass87KKgOxGNq5556bNTeOJX/feOON4Kh4qFmEMNeYU3XppZf6EYYR55577nE//fRTcERzkm90JbUuqoM5dlzHuBp8PguWimfS62kA
+ EaBNUSNGW+rRo4d79913fZZ03Lhx/sEDzDlkX63wGXzHypUrgz3O7dmzxwtQ+Pqw5YpVHNQkQqaeXDDK2AFTmnQ2Fy6JOpd6gdWTWxTX
+ kkfXWsECYg7drl27gj3Rk6/wNA0WrPWjCRMmeLHkb2KPWIYE6skYEki++eabfXyr1rmFZnWSgk8DNYmQBbnCNSSoKzcXv9OKngpBHOWl
+ l14K/ipNblC40o3MSJSTQxlFbf6QbXEWxTUzVOV26tQp9qVMsFaxAux+Yc3GWXhaDlgaI0eOzGo3WIT8PpvCw0oM/H3NNdd466gW6Hek
+ 6seMGZMKQ6EmESJlzYXB8jGzFovo4YcfLjr7lpgKCo+JGTYJG5F8o2uc0zqaEQtKUzBJ4WSc5LNg45zWUQ78ntxJy1iG/DYb4GlnCNH2
+ 7dv937VgxalxJAGqoSYRsrJuu5mdO3d2jz32WFmWwLfffuvduLj9zXpApiV3dI3Cd28p1FOEAEuAjKbdLzaWrfjzzz+DI5LFrkdc8Zim
+ EiFGFUQHiyZ8QykQK3VydFKKpqJ0j5Iid3Stl0sWvuaNsuWj3iIEYQs2DS5ZGFxSXNMoXK98NI0I0fGso3EiZi5yQ8spOMPcTItPGgXM
+ fqfhtPSK3Gqw2hXiFPWqt8JdnjNnjrdg01b1bktoxNU/TIQYLNNg/VUlQjSUa6+91gvOtm3bgr0nb+z48eN9qo+UXxjiQEOGDPEnz4xj
+ 1lOmXqES0haYNswSUnaseghMR10tXQyzhJLOjtFnWK2A/mDz2iwInRsvJT5kMSJ+86xZs/w5sBAZwkWcldqe5cuXFxVVS8fHvURHuVQl
+ QhaQ5gJYeTcQxSean7soFzec1KIF1chQIGDElBodbjYj6fnnny8BqgE6F/HFerQJm4yctAABsdEuXbr4/kRlMoOkxavoZ4bN57KJvSwb
+ smbNGn8M140Bmpo9MtYM9MWWFSHORLypUiPA4JpNnz7dfy8eTa3WWlUixIXgRFl0yhQXl4ysEAVV4dHM0tgzZ87MHEtD4/3cgEaG87H5
+ TmmKKTQi+Qro4gArnikLaSmlMBEiw8yqhogJAskgbes58ZuJs4bbGMkQ+hkicMstt2Tmd3Esbm2xqRYUhnKtq02emDuHUBKCqLWsouqY
+ EG4YC35bUJry7ilTppzi06PUuVYPCpyWoFgt0CgYUdIWU2hEeGIFnYfJk3FdS0QH8UlT3I5zxX2iUJO+hCuGoFCgaE9lwc3aunXrKdfF
+ Ymm8xzArB6EpBEWKXOtqnxKCJYRRcfrpp7vzzjuv5sXcqhahckGpwwHHfBeuEcH1wgXDLE3apA9j8YTwVqh6HcuVRh4+lr/JeNYbfh+/
+ M66njXCPcL+Y81evuFPcmBUVnuqCJVkstka2jaxbFDMaEMUZM2bUfD1jFyE6RdjqMXORC8fi4HFPjouDNMUUCmGV62zE7ogV5IM4Alki
+ zocMX5IQmGUUj/ppI3SWZozb4V1QaGnukMVkce0KtUv6HEHpKOqPELTcxfGrIXYRokGhvIxuXCRMQVPqF154oeEeHIeIpimmkA86HW4N
+ 7jGCjxCFY3K5YK2yJQ0dhw4UpavOOTdr3I57xhK2CArniQtH1rqQq0n/QzSKiVQlYECQYUskMF0JnCzmPf4tM4S5UPxwzGIetFaoY6SR
+ esUUEGhGtGpFjnViyKTQOBEfRKhQDIA6EYKeaanwZhEyilijWpGgHnE7OiFWcT2fF2dhDRamHzZsmE/N42LlxmTD0Pdwd7nGtUL7Z0Jt
+ ODteLbGLULOAmNYjpsD3YMWEJwVXCnUg1113nTeTCRragu/5nhRKXAHLLk2ZStwMhL5W16lecTu+hyRNrW5JJVg8qFgAOgzC07dvX/fO
+ O+8Ee2rj5Zdfdm+++WbwV21IhMqABsxymHHHFOx7SHsWiuGUAw3TVjGwOAEilO9JoVhAWEJpmTdlkA267bbbio7sxSBOQm1a3HE7+55i
+ 7m4cINQMiLlFwfngGlIgHM5QpwmJUAloWJjymPRxxhRwofDVyU6ZgFQLyYBwIRpuAp9LcVnuk0LTEg+KEkscxBm3o128/fbbPkVdryJL
+ 4+mnn/YlMaT0cV3jtMzrgUSoBHHHFCjmfP75532dCNYKYlFLbIEak9xRL1yFG07Npi0eFAVxx+3IJu7YscPdeOONmfKGZnkWfVJIhIoQ
+ RUwBi4YKU9so8sRdmjZtWmZJXBqybbUUkQHuAfGgXDcGN4/PJz5kxWWVxIMQYILFNm0gF86NJ7cyQtdqyVUL96jWuB3nSerZ7tc333zj
+ Yx8MQohbu3btsu4XW6EnV4jykAgVgEWuGE1zG1zcW62xBSrUScPmioCtghn+Diwg4i6l0uFkNxGXYktLYCEgZkzRScK94/uZ8pAr6nFv
+ xWqwRHlIhPJgMYV8jS7OLV/MplIKxXisIpnvMWurkngQx5VaWgIrjKB6vQtQEVQslXoLEFvuZG1RORKhPOS6UPXaiN0wolcLFg2WTaEY
+ DwKH0NF5sBqIB4VnaheC2BGuSKnnT/G9WAblZGyiJNeFqufW6PMf04BEqIkoFeOhw1CNjAhRPEpmpZx4EMeUsxQv1lJcqwGK5kUi1ESU
+ U/Nj6XqEqND0CNwLFsoio0YlLhXuLN1QrGYH65GANIvaYZngmlG1TQ1NFFW1onmRCDUR5cR4wun6fMdahgnhQYxsPahSGS8EilgTE2fJ
+ vvHZ1NGQTSrH5RMtF4lQk8CjlrA6SBeXiitxDNZQPnEgDR+u2DYLp5S44apRuMfjnp599lkvZsyKv/POO33sRIhCSIQaHNLn+bJCjz/+
+ eHDEqSAwvXr1OiUeZDGjsNWDhYMrVsqaIWjN97K0BNaQivdEuUiEIoSnydIJzzzzzNTO0ykGosSkyPCUD84jvGZNPogBEQsihY/7xvQT
+ VRGLcpEIRQiWBOsC05HLyTqlDavzCVs9CBJrGBNLYiVGhAV3z9Y0BrJhZMVs5j91QlhPuGGvvPJK1hNZ0gZuLFYhsSvmYiG4BOPZKDcY
+ MGBA0acJi9qRCEUMk0fDC483EggMFowt+cEcKToiQWpcONZ/AlZiZN1we0QNdUFMlTDrD3cMty4sXGmGAkysuNzyAuJaWHWs8cy5iHiQ
+ CEWIFfWxemSjQjqdzkima8KECd4q4vHeLGBlE0IRI0SIND5QGsCKftZROY6MGrVFhQon44ZpNwTgyfTZxu/Nl+EzS87KC8JgCRJzS+o8
+ WgISoQixmAqWQLOD9ZPG86Ss4JFHHnEPPPCAn3xarKzA2LVrl1/FILci3Cykej6UsSUiEYoQ3BHcFxo1pjxTI4iN4KIVm3PViLASQD2D
+ 74gAywPzpFKsNVaf5NqGFz7jGs+ePbvidZ8QU5Zrya0IZ34dFiETgjU/LD4kQhGC6Y5Z//333/u6GlYHHD16dMHK5EYFd4vzQ2jrAeKC
+ a4WLxbUkRsP/cZ1YEcDWSKJIEuHPdamKwbEIGsuqhJdnxbXk3jENJlwpTiAb1zOf60Yg/vbbb09sKZNGRSIUEWa6k03BFWC0RniIDzWb
+ e0YHq2cnM9FDeHKfs4XwU6SJaKxatcpboZXAipYIDY/BYZoJIseCZawZvWzZsrwWEL+l0GRe7nW4xEGURiIUEXQCOkPbtm39yIz7UMmI
+ LEqTW7Nkwo+IICY8GhnxQEjybbNmzTpFVIhtIUClKsINkg+4Z/km85pVVU83tRmQCEUEIzAz02mc7733nm/Y9XwETEsAoQiXP1hWi46P
+ AFRjCRHbKpX9oi6Kz2ZKyr333lvwyRr8nhEjRvgEBU+joLQhzTVSaUEiFBGY4RZXoJPQWSjeo4KYznPixIngSFENuH/EWgjyG7lPE60m
+ JsTnlSou5XldDz30kP9cXMFCMR+s37vuustbZMSOcO8kQqWRCEUEsR9rnDZCU2PzwQcfuNdeey04SlSLlT/Y00Nxv6jJCj+AgP3MpSvX
+ HbJ4EJ9TaPkTCi2J89lnEgsq5LphVeGO85uUTSsfiVAE0IBZx2fFihX+bzrF4sWLfRUxJrnmUNWOxYPIhhF7I12/fv36U1YMIBlAYoAn
+ mBSqcuYYxAz3GVeMKRtYsR9++GFwxP8h/mQPDiAGxft4gGA+yxarigGHe09VOaIoC7g0EiHREGB9WAC6FAwCuEaPPvqot3JsLhgDQqVZ
+ PcQHS4j0O08x5f9YtzyB4+eff/ZTXBhkcMFZtoRANxYw38W0F9wyURyJkEg9Fg9KYjoMgobQsEbSk08+6WuHEERif1999ZW3zghqU0ZA
+ pg5Bwnri/8QJ61VL1chIhESqYQY7xYHMcOexQzNmzPBuUVrA4vroo4+Cv0Q1SISEqBKsJOKAhWJPojwkQkJUCa7Wb7/9FvwlqkUiJIRI
+ FImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJRJEJCiESRCAkhEkUiJIRIFImQECJBnPsfh6LP/cPu
+ K/UAAAAASUVORK5CYII="/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="123.86" height="65.1446" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_guided_filter.svg b/third_party/aom/doc/img/equ_guided_filter.svg
new file mode 100644
index 0000000000..021c194d7a
--- /dev/null
+++ b/third_party/aom/doc/img/equ_guided_filter.svg
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_guided_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.42115in" height="0.772328in"
+ viewBox="0 0 102.323 55.6076" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+ <image x="0" y="36.75" width="65.5731" height="18.8576" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAAAJkAAAAsCAYAAAB2Wxp8AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AAAZ7SURBVHhe7ZvvK15vHMf9A57sgdoDeSTtgSRJ8URtZUWhaPPAj7C2FpOE2APlx2JCnnkg+bGhlZUUTbS2VsuoJUIYkl9h
+ MjHMj8+398e59r3dzn3f577Pdcx2X6866fy4z65znff1vj6fz3XmQwqFxSiRKSxHiUxhOUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXl
+ KJEpLEeJTGE5SmReyM+fP+nr1680NDREg4OD9OXLFz4mODo6ovHxcTo9PdWOmMMrRPb582fy9fUlHx8ft7ZXr15pd/j7OT8/p6mpKcrL
+ yyN/f39KTU2l169f0/v376m9vZ2SkpJoYGCABfbixQveZOFVTra0tESRkZEsIIxgPX78+EFlZWV8TW9vr3b072Z/f5/Ky8vJz8+PiouL
+ aXNzUzvzP3t7e3wuPT3daf94gleJ7MOHD9yBYWFhNDc3px29ihDjp0+ftCPXx+rqKs3Ozmp75oGg4FoBAQH05s0bOjs7085cZWRkhG7f
+ vk0hISE0MzOjHTWPV4msvr6eRYZOx8h1xPb2Nj18+FBqRxsFU7SsaXpjY4OSk5M5VMDUiCnTGRDkvXv3XPaPu3iNyBDYPn78mEVWUVHh
+ tMMhsszMTHaV60aWyE5OTqi0tJSfNycn51Jg74iDgwPKzs7mwSgTrxEZBBMdHc2dbh9r4RxexO7uLu/DAXJzc3/vXyeyRPbu3Tt2MEyT
+ o6Oj2lHnwL3S0tKkhwmGRIZ0NzExkRuNzv/+/bt25gLEMHiBNTU12pGbh8gwg4ODaXp6Wjt6AVJ5jHpnKTucoaenh+7evcv3wctA7LS+
+ vk6PHj3iY+gjs/GUDJHt7OzwNIkB9ezZM84YjQC36+rqYieXiUuRodOysrJoZWWFFY6Gd3d3a2cvgDPguGyblUlTUxO3Eam6rUN9+/aN
+ YmNjnWaSEBgGUHNzM78w7GPKjYqKopSUFJqYmKD5+Xm+z9OnT+nw8FD7pfvIEBkGDUSP58XA+NM4FRlGNlJfZGVAvKi2tjbeB7impKSE
+ j4vrHIFr8XJwracbnNToyBQcHx9Tfn6+7v2wBQYGcvHREZh6qqqqWFwC9AF+W1dXx+0R98df/HueIkNkqHGhLXqu/SdwKrLl5WUemRj5
+ qLVkZGTQnTt3aHJyUruC+BzcASk/ps2biKN4DOl8a2srOxCmGD3gSoWFhZeeWQwsuIWIX/AXTo4+M4NZkdkmOM6e6zoxHPiPjY1xDQUP
+ YJupwAHgBLLTXpmItuuNbIgDgnEWj9mztbVF9+/fZ+G6m4GiLII6FETgyXbr1i2uZzlCmAGutX9XtiDksb+32MxO+fYYFpmoMdmPMtFY
+ mcsQshFTm308BhBn2ceYrhCiNTs16iHTybBy4ao2BhAS4XpnojSDIZHBoeBU9pVg23hM5jKETGzjMXcdyxEQAe6HGFU2ZkVm+06MOJJ4
+ t7jeqsTNkMhETGPvBJjvMe+7WqYRoAOuO/AXUxt+68laJIL9lpYWnoKQiQrRwsngaAI4RmVlpekak1mRAZHto6TiKiYzsp5rFkMiEw1B
+ NRhVYcHw8DB39t8Qj7nKIB0hYk4E+ai1LSwsUERExBVXxxcOT548MR1oyxAZ2iCWk5AZO6O/v58F5sgoMMhevnzJfQjn9mQmMCQyOEdB
+ QQF3LupmGLX4Bgn7aOBNrY+hnR0dHdxGrMnpfX3gCogMCUNDQwO7eHV1NTsEsmxRSYfD4esFCM0sMkQGMABQx8M7snVcgf071ItXAQqz
+ MTExfI2nFQTDgT+q/M+fP+dlCmQ4RUVFXPXGP37T4jGsUAQFBXHb9LYHDx7wJz1GwEjGCMZz456dnZ0cHKMyDvGhL1DxX1xc1H5hDlki
+ AxhU+HwHLoSg/u3bt/z9GP6iiAxhQYAYiJjq9ZIEPD9qgfgGLTw8/FIpxyiGRAaLtLfJtbU1dgdP1a3QR6bIBJg+ESvCDLAaAAezdS2I
+ 69evX9qePrimtrbWoy9TXIoMUwAsNS4u7tKaFpYr4AqYrz2ZpxX6YHH+Jg5aiBIhExIpd3EpMrGUZFs9RoCI+T4+Pp47RfHvgyVD1NMs
+ CfyRDoeGhvIiMBALyshesGiu+PeBkeAjCU+/MHEpMgS5jY2NHPQiJU5ISPj9Hw4U3kFfXx99/PhR23MfQ4G/QmEGJTKF5SiRKSxHiUxh
+ OUpkCstRIlNYjhKZwnKUyBSWo0SmsBwlMoXFEP0Ht6gu9OfTLrAAAAAASUVORK5CYII="/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="65.5731" height="18.8576" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/equ_wiener_filter.svg b/third_party/aom/doc/img/equ_wiener_filter.svg
new file mode 100644
index 0000000000..fcea1c8391
--- /dev/null
+++ b/third_party/aom/doc/img/equ_wiener_filter.svg
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export equ_wiener_filter.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="1.45687in" height="0.790186in"
+ viewBox="0 0 104.895 56.8934" xml:space="preserve" color-interpolation-filters="sRGB" class="st2">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.375,-18.375)">
+ <title>Sheet.1</title>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+ <image x="0" y="36.75" width="68.1446" height="20.1434" preserveAspectRatio="none" xlink:href="data:image/png;base64,
+ iVBORw0KGgoAAAANSUhEUgAAAJ8AAAAvCAYAAAD90RiVAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAGdYAABnW
+ ARjRyu0AAAWxSURBVHhe7ZrLK25fGMf9AyZmRkpJBjIwIFKKHMXAwG1CopBbyUAUQilCuRQDA3KdGCgpYqCUASUnRSSSW0jI/bZ+fddZ
+ y9ney7bfd2/v2s7v+dTqeNde+z1r7fV9nvVd691+jCAUQeIjlEHiI5RB4iOUQeIjlEHiI5RB4iOUQeKzKbe3t6yvr4+Nj4+Lmn8PEp/N
+ +P37N8vNzWV5eXksLCyMjYyMiCv/HiQ+m3JxccF+/fpF4iN8D4mPUAaJzwDLy8vM39+f+fn5eVTs+lBXV1dZYGCgyz4HBQWxtbU13q6n
+ p4cFBAS4bJeens6urq54O2/xVHyHh4csMTHRqS9zc3OihT7r6+vcY2rvjY+PZ1tbW6KF9ViW+fb391lMTIzugK+vr1lDQwNvMzU1JWrt
+ iXY8k5OTotYZjBVtIiIiLJ0obzPf/f09Ky4uZtHR0bxfQ0ND4op7sLMuKSlhoaGh/J6ZmRlx5XuxTHyLi4u845GRkWxnZ0fUOiMndWlp
+ SdTYE5nRw8PD2ebmpqh1pr+/n487JyeH3dzciNo/TE9Ps6SkpC9La2sre35+Fnf9wVvxHR0dsezsbFZfX8/71dTUJK64B8c5tbW1LC4u
+ 7sv5sxLLxNfZ2el2ErTgoeLhfGc6twIpqpSUFHZ5eSlqP/P09MQqKyt5u+bmZvb+/i6umMdb8cE2lJWVfWRkZLSHhwdx1Zm9vT1WWlrK
+ 28NufDV/VmKJ+JDqi4qKDE0CHmp+fj6PULuiFRX+xWdXnJ+fs+TkZN7OqLcyirfiwzKLRAAPFxISous/X15eWGNjI1+FZLC1tLSIq9+P
+ JeKDkGBO0XlHL4driET5AE5PT1l5eblpQ/6dnJ2dfZh3Pc8kJ9hqvwe8EZ8MGlggaW8wL+4CfWFhgQsVyUMGm9VBpIcl4tPzR/Pz89xP
+ vL6+ihr7I3e8KPjbHRMTE3zCrNjdSnZ3d1lmZib3X3im2ATAF3Z3d4sW7kEmxi8jEJ4Ur7vAQIBVVVWxk5OTj+TxHUGkhyXikynbcRLw
+ IOGZ7L6zdQTZBuPRyxqwFnLnbrXf8xZkYqwqd3d3n6yQ4+YOfYWYZ2dn+WcZbFYGkRFMi0/rj1wVLEt4KJ6CTIlJdfWdRgsm4vHxUXyj
+ MfD/1tTUuPw+d8UuwYVMjGUUaMeBei2Yj7q6uo9nI5OHr4PItPjc+b23tzc2ODiou1u0I9pNhJ7fktniq6MYXyHFpvVs8gQC4pLgTK+6
+ upptb2/zz9rk4esgMi0+vUlAuscD+Ul+T24ivvJ7cmn29VLlDvQBh8vwexLZR9gDZDSUsbExXmSGk8lDRRCZFh92g+4mYWBgwCnl2x05
+ Ych+yIKu0GYLuwQXgqawsPDTGR2CH32E94MHRLZD1kP2k8jNooogMiU+O06CGbQ+CePC+FyhXZrtElxYMh3P6GQWh/U5Pj7mPs/Rf0u/
+ p2L+TIlPOwlW+wU8CF9vODz1e95upqxGBo3jGZ0860Pp7e3lK5F2Q4FfPvALCMarIohMic9uk2AWmSmM+j29pdmXYLksKChwOqPDRg9Z
+ D33NyMjgn7VIv2dk/vDdsbGxLDU1lR+hWYHX4kMEDQ8P84Hh1wAcWv5kjI4HmbSiooK301uafQVOFbDqoM/azQaQZ314FWxlZUXU/gWe
+ EH7PyPzJgEORxzlm8Vh8eJ9NvnrjqmRlZfFXp34KGA8mx9VYtO/v4dgIE+WqnYrjJIglLS3NqS9RUVGfMhN8YFtb24efOzg4YAkJCU73
+ oWjH6wgyH+4LDg7m3hGiN4upZZf4/4HjmI6ODvHJHCQ+wiOwxOttxjyBxEcYRr79srGxIWrMQeIjDIEN2ejoKGtvb7fsPJDERxgCr2h1
+ dXV9+nXELCQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhkkPkIZJD5CGSQ+QhGM/Qd0+F8Wgj3WpQAAAABJRU5E
+ rkJggg=="/>
+ <rect v:rectContext="foreign" x="0" y="36.75" width="68.1446" height="20.1434" class="st1"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/inter_motion_field.svg b/third_party/aom/doc/img/inter_motion_field.svg
new file mode 100644
index 0000000000..091ae11f35
--- /dev/null
+++ b/third_party/aom/doc/img/inter_motion_field.svg
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_motion_field.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.60417in" height="1.72563in"
+ viewBox="0 0 403.5 124.245" xml:space="preserve" color-interpolation-filters="sRGB" class="st21">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:url(#ptrn11-12_10);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st3 {marker-start:url(#mrkr5-20);stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st4 {fill:#923931;fill-opacity:1;stroke:#923931;stroke-opacity:1;stroke-width:0.29411764705882}
+ .st5 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+ .st6 {fill:#923931;font-family:Arial;font-size:0.666664em}
+ .st7 {baseline-shift:-32.4939%;font-size:0.649878em}
+ .st8 {stroke:#923931;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st9 {marker-start:url(#mrkr10-32);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st10 {fill:url(#ptrn17-38_36);shape-rendering:crispEdges;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st11 {marker-end:url(#mrkr10-44);stroke:#923931;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st12 {marker-end:url(#mrkr10-56);stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st13 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.29411764705882}
+ .st14 {marker-start:url(#mrkr5-62);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st15 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+ .st16 {fill:#0070c0;font-family:Arial;font-size:0.666664em}
+ .st17 {marker-end:url(#mrkr10-70);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st18 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+ .st19 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st20 {fill:#000000;font-family:Arial;font-size:0.499992em}
+ .st21 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Patterns_And_Gradients">
+ <pattern id="ptrn11-12" v:fillPattern="11" v:foreground="#002060" v:background="#ffffff" patternUnits="userSpaceOnUse"
+ width="6" height="6" viewBox="0 0 64 64">
+ <image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+ xlink:href=""/>
+ </pattern>
+ <pattern id="ptrn17-38" v:fillPattern="17" v:foreground="#923931" v:foregroundOpacity="0.47" v:background="#ffffff"
+ v:backgroundOpacity="0.47" patternUnits="userSpaceOnUse" width="6" height="6" viewBox="0 0 64 64">
+ <image x="0" y="0" width="64" height="64" image-rendering="optimizeSpeed"
+ xlink:href=""/>
+ </pattern>
+ </defs>
+ <defs id="Markers">
+ <g id="lend5">
+ <path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr5-20" class="st4" v:arrowType="5" v:arrowSize="1" v:setback="5.47" refX="5.47" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(3.4) "/>
+ </marker>
+ <g id="lend10">
+ <path
+ d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+ style="stroke:none"/>
+ </g>
+ <marker id="mrkr10-32" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.07" refX="2.07" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(3.4) "/>
+ </marker>
+ <marker id="mrkr10-44" class="st4" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+ </marker>
+ <marker id="mrkr10-56" class="st13" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+ </marker>
+ <marker id="mrkr5-62" class="st15" v:arrowType="5" v:arrowSize="0" v:setback="4.63" refX="4.63" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(2.92) "/>
+ </marker>
+ <marker id="mrkr10-70" class="st18" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(47.13,0.12) rotate(-90) scale(-1,1)">
+ <title>Parallelogram</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(155.13,0.12) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.2</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(-60.87,0.12) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.3</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(26.88,31.62) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.4</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(134.88,49.62) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.5</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <pattern id="ptrn11-12_10" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+ xlink:href="#ptrn11-12"/>
+ <path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st2"/>
+ </g>
+ <g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(-81.12,13.62) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.6</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(56.8008,-95.4345) rotate(9.46232)">
+ <title>Sheet.7</title>
+ <desc>MVref</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="41.0586" cy="124.245" width="82.12" height="0"/>
+ <path d="M4.1 124.25 L4.46 124.25 L82.12 124.25" class="st3"/>
+ <rect v:rectContext="textBkgnd" x="32.0251" y="118.245" width="18.067" height="12.0287" class="st5"/>
+ <text x="32.03" y="127.25" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+ dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text> </g>
+ <g id="shape9-24" v:mID="9" v:groupContext="shape" transform="translate(164.801,-77.4345) rotate(9.4623)">
+ <title>Sheet.9</title>
+ <path d="M0 124.25 L82.12 124.25" class="st8"/>
+ </g>
+ <g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(123.949,167.675) rotate(-170.538)">
+ <title>Sheet.12</title>
+ <path d="M1.55 124.25 L1.91 124.25 L27.37 124.25" class="st9"/>
+ </g>
+ <g id="shape13-33" v:mID="13" v:groupContext="shape" transform="translate(263.13,0.12) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.13</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 124.25 L97.2 124.25 L108 70.25 L10.8 70.25 L0 124.25 Z" class="st1"/>
+ </g>
+ <g id="shape14-35" v:mID="14" v:groupContext="shape" transform="translate(242.88,67.62) rotate(-90) scale(-1,1)">
+ <title>Parallelogram.14</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <pattern id="ptrn17-38_36" patternUnits="userSpaceOnUse" patternTransform="rotate(-90) scale(-1,1)"
+ xlink:href="#ptrn17-38"/>
+ <path d="M0 124.25 L24.3 124.25 L27 110.75 L2.7 110.75 L0 124.25 Z" class="st10"/>
+ </g>
+ <g id="shape8-39" v:mID="8" v:groupContext="shape" transform="translate(353.801,-45.9345) rotate(9.46229)">
+ <title>Sheet.8</title>
+ <path d="M0 124.25 L25.71 124.25" class="st11"/>
+ </g>
+ <g id="shape15-45" v:mID="15" v:groupContext="shape" transform="translate(272.557,-59.475) rotate(9.46231)">
+ <title>Sheet.15</title>
+ <desc>MVref</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="41.1819" cy="124.245" width="82.37" height="0"/>
+ <path d="M0 124.25 L82.36 124.25" class="st8"/>
+ <rect v:rectContext="textBkgnd" x="32.1485" y="119.445" width="18.067" height="10.0769" class="st5"/>
+ <text x="32.15" y="126.64" class="st6" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV<tspan
+ dy="-0.287em" class="st7" v:baseFontSize="8">ref</tspan></text> </g>
+ <g id="shape16-51" v:mID="16" v:groupContext="shape" transform="translate(245.314,-64.0156) rotate(9.46229)">
+ <title>Sheet.16</title>
+ <path d="M0 124.25 L25.71 124.25" class="st12"/>
+ </g>
+ <g id="shape17-57" v:mID="17" v:groupContext="shape" transform="translate(163.726,-75.3635) rotate(9.46229)">
+ <title>Sheet.17</title>
+ <desc>MV0</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="41.6032" cy="124.245" width="83.21" height="0"/>
+ <path d="M3.47 124.25 L3.83 124.25 L83.21 124.25" class="st14"/>
+ <rect v:rectContext="textBkgnd" x="33.3787" y="119.445" width="16.449" height="9.59985" class="st5"/>
+ <text x="33.38" y="126.64" class="st16" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>MV0</text> </g>
+ <g id="shape19-65" v:mID="19" v:groupContext="shape" transform="translate(245.326,-61.7636) rotate(9.46229)">
+ <title>Sheet.19</title>
+ <path d="M0 124.25 L25.71 124.25" class="st17"/>
+ </g>
+ <g id="shape21-71" v:mID="21" v:groupContext="shape" transform="translate(225.375,-0.375)">
+ <title>Sheet.21</title>
+ <desc>Current frame</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+ <rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+ <text x="10.74" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Current frame</text> </g>
+ <g id="shape22-74" v:mID="22" v:groupContext="shape" transform="translate(331.125,-0.375)">
+ <title>Sheet.22</title>
+ <desc>Reference frame 1 (R1)</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+ <rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+ <text x="4.49" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 1 (R1)</text> </g>
+ <g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(119.625,-0.375)">
+ <title>Sheet.23</title>
+ <desc>Reference frame 0</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="29.25" cy="117.495" width="58.5" height="13.5"/>
+ <rect x="0" y="110.745" width="58.5" height="13.5" class="st19"/>
+ <text x="4.41" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame 0</text> </g>
+ <g id="shape24-80" v:mID="24" v:groupContext="shape" transform="translate(0.375,-0.375)">
+ <title>Sheet.24</title>
+ <desc>Reference frame of R1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="36" cy="117.495" width="72.01" height="13.5"/>
+ <rect x="0" y="110.745" width="72" height="13.5" class="st19"/>
+ <text x="5.65" y="119" class="st20" v:langID="2052"><v:paragraph v:spLine="-1" v:horizAlign="1"/><v:tabList/>Reference frame of R1</text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/inter_obmc.svg b/third_party/aom/doc/img/inter_obmc.svg
new file mode 100644
index 0000000000..a69084b08e
--- /dev/null
+++ b/third_party/aom/doc/img/inter_obmc.svg
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
+<!-- 由 Microsoft Visio 11.0, SVG Export, v1.0 生成 inter_obmc.svg 页-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.98609in"
+ height="2.98609in" viewBox="0 0 214.998 214.998" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+ <v:documentProperties v:langID="2052" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+ .st2 {fill:#000000;font-family:Times New Roman;font-size:1.16666em}
+ .st3 {fill:#8c8c8c;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.2}
+ .st4 {fill:none;fill-rule:evenodd;font-size:12;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>页-1</title>
+ <v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+ v:shadowOffsetY="-4.25197"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(1.2,-1.2)">
+ <title>工作表.1</title>
+ <desc>4</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="35.4331" cy="144.132" width="70.87" height="141.732"/>
+ <rect x="0" y="73.2661" width="70.8661" height="141.732" class="st1"/>
+ <text x="31.93" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>4</text> </g>
+ <g id="shape2-4" v:mID="2" v:groupContext="shape" transform="translate(72.0661,-1.2)">
+ <title>工作表.2</title>
+ <desc>0</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="70.8661" cy="144.132" width="141.74" height="141.732"/>
+ <rect x="0" y="73.2661" width="141.732" height="141.732" class="st1"/>
+ <text x="67.37" y="148.33" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>0</text> </g>
+ <g id="shape3-7" v:mID="3" v:groupContext="shape" transform="translate(107.499,-142.932)">
+ <title>工作表.3</title>
+ <desc>2</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="35.4331" cy="179.565" width="70.87" height="70.8661"/>
+ <rect x="0" y="144.132" width="70.8661" height="70.8661" class="st1"/>
+ <text x="31.93" y="183.77" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>2</text> </g>
+ <g id="shape4-10" v:mID="4" v:groupContext="shape" transform="translate(178.365,-142.932)">
+ <title>工作表.4</title>
+ <desc>3</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+ <rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+ <text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>3</text> </g>
+ <g id="shape5-13" v:mID="5" v:groupContext="shape" transform="translate(72.0661,-142.932)">
+ <title>工作表.5</title>
+ <desc>1</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="17.7165" cy="197.282" width="35.44" height="35.4331"/>
+ <rect x="0" y="179.565" width="35.4331" height="35.4331" class="st1"/>
+ <text x="14.22" y="201.48" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>1</text> </g>
+ <g id="shape6-16" v:mID="6" v:groupContext="shape" transform="translate(72.0661,-72.0661)">
+ <title>工作表.6</title>
+ <rect x="0" y="144.132" width="35.4331" height="70.8661" class="st3"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/inter_spatial_mvp.svg b/third_party/aom/doc/img/inter_spatial_mvp.svg
new file mode 100644
index 0000000000..aa2e88afe8
--- /dev/null
+++ b/third_party/aom/doc/img/inter_spatial_mvp.svg
@@ -0,0 +1,215 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_spatial_mvp.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="3.50333in" height="3.01208in"
+ viewBox="0 0 252.24 216.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st10">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st3 {marker-end:url(#mrkr5-45);marker-start:url(#mrkr10-43);stroke:#ea700d;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+ .st4 {fill:#ea700d;fill-opacity:1;stroke:#ea700d;stroke-opacity:1;stroke-width:0.3315649867374}
+ .st5 {marker-end:url(#mrkr5-54);marker-start:url(#mrkr10-52);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+ .st6 {fill:#f59d56;fill-opacity:1;stroke:#f59d56;stroke-opacity:1;stroke-width:0.3315649867374}
+ .st7 {marker-end:url(#mrkr5-54);stroke:#f59d56;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+ .st8 {marker-end:url(#mrkr5-70);marker-start:url(#mrkr10-68);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.25}
+ .st9 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.3315649867374}
+ .st10 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend10">
+ <path
+ d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+ style="stroke:none"/>
+ </g>
+ <marker id="mrkr10-43" class="st4" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(3.016) "/>
+ </marker>
+ <g id="lend5">
+ <path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr5-45" class="st4" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+ </marker>
+ <marker id="mrkr10-52" class="st6" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(3.016) "/>
+ </marker>
+ <marker id="mrkr5-54" class="st6" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+ </marker>
+ <marker id="mrkr10-68" class="st9" v:arrowType="10" v:arrowSize="2" v:setback="1.974" refX="1.974" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(3.016) "/>
+ </marker>
+ <marker id="mrkr5-70" class="st9" v:arrowType="5" v:arrowSize="2" v:setback="5.278" refX="-5.278" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-3.016,-3.016) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape33-1" v:mID="33" v:groupContext="shape" transform="translate(72.12,-0.75)">
+ <title>Square.33</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="72.87" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape38-3" v:mID="38" v:groupContext="shape" transform="translate(72.12,-144.75)">
+ <title>Square.38</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape39-5" v:mID="39" v:groupContext="shape" transform="translate(108.12,-144.75)">
+ <title>Square.39</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape40-7" v:mID="40" v:groupContext="shape" transform="translate(144.12,-144.75)">
+ <title>Square.40</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape41-9" v:mID="41" v:groupContext="shape" transform="translate(180.12,-144.75)">
+ <title>Square.41</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape42-11" v:mID="42" v:groupContext="shape" transform="translate(36.12,-108.75)">
+ <title>Square.42</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape43-13" v:mID="43" v:groupContext="shape" transform="translate(36.12,-72.75)">
+ <title>Square.43</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape44-15" v:mID="44" v:groupContext="shape" transform="translate(36.12,-36.75)">
+ <title>Square.44</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape45-17" v:mID="45" v:groupContext="shape" transform="translate(36.12,-0.75)">
+ <title>Square.45</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape46-19" v:mID="46" v:groupContext="shape" transform="translate(0.12,-108.75)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape47-21" v:mID="47" v:groupContext="shape" transform="translate(0.12,-72.75)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape48-23" v:mID="48" v:groupContext="shape" transform="translate(0.120005,-36.75)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape49-25" v:mID="49" v:groupContext="shape" transform="translate(0.120005,-0.75)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape50-27" v:mID="50" v:groupContext="shape" transform="translate(72.12,-180.75)">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape51-29" v:mID="51" v:groupContext="shape" transform="translate(108.12,-180.75)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape52-31" v:mID="52" v:groupContext="shape" transform="translate(144.12,-180.75)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape53-33" v:mID="53" v:groupContext="shape" transform="translate(180.12,-180.75)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape54-35" v:mID="54" v:groupContext="shape" transform="translate(36.12,-144.75)">
+ <title>Square.54</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape55-37" v:mID="55" v:groupContext="shape" transform="translate(90.12,-162.75)">
+ <title>Sheet.55</title>
+ <path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st3"/>
+ </g>
+ <g id="shape56-46" v:mID="56" v:groupContext="shape" transform="translate(270.99,90.12) rotate(90)">
+ <title>Sheet.56</title>
+ <path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st5"/>
+ </g>
+ <g id="shape58-55" v:mID="58" v:groupContext="shape" transform="translate(-81.3576,28.773) rotate(-38.6598)">
+ <title>Sheet.58</title>
+ <path d="M0 216.87 L223.91 216.87" class="st7"/>
+ </g>
+ <g id="shape59-60" v:mID="59" v:groupContext="shape" transform="translate(216.12,-144.75)">
+ <title>Square.59</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="180.87" width="36" height="36" class="st2"/>
+ </g>
+ <g id="shape60-62" v:mID="60" v:groupContext="shape" transform="translate(54.12,-162.75)">
+ <title>Sheet.60</title>
+ <path d="M1.74 215.13 L2 214.87 L36 180.87 L137.4 180.87" class="st8"/>
+ </g>
+ <g id="shape61-71" v:mID="61" v:groupContext="shape" transform="translate(234.99,90.12) rotate(90)">
+ <title>Sheet.61</title>
+ <path d="M2.47 216.87 L2.83 216.87 L101.4 216.87" class="st8"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/inter_tmvp_positions.svg b/third_party/aom/doc/img/inter_tmvp_positions.svg
new file mode 100644
index 0000000000..87f8dfa80f
--- /dev/null
+++ b/third_party/aom/doc/img/inter_tmvp_positions.svg
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tmvp_positions.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.00333in" height="1.51208in"
+ viewBox="0 0 144.24 108.87" xml:space="preserve" color-interpolation-filters="sRGB" class="st4">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+ .st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st4 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape31-1" v:mID="31" v:groupContext="shape" transform="translate(0.12,-0.12)">
+ <title>Square.31</title>
+ <desc>B4</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B4</text> </g>
+ <g id="shape30-4" v:mID="30" v:groupContext="shape" transform="translate(108.12,-36.12)">
+ <title>Square.30</title>
+ <desc>B6</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B6</text> </g>
+ <g id="shape32-7" v:mID="32" v:groupContext="shape" transform="translate(108.12,-0.12)">
+ <title>Square.32</title>
+ <desc>B5</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B5</text> </g>
+ <g id="shape25-10" v:mID="25" v:groupContext="shape" transform="translate(36.12,-36.12)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="36.87" width="72" height="72" class="st3"/>
+ </g>
+ <g id="shape26-12" v:mID="26" v:groupContext="shape" transform="translate(36.12,-72.12)">
+ <title>Square.26</title>
+ <desc>B0</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B0</text> </g>
+ <g id="shape27-15" v:mID="27" v:groupContext="shape" transform="translate(72.12,-72.12)">
+ <title>Square.27</title>
+ <desc>B1</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B1</text> </g>
+ <g id="shape28-18" v:mID="28" v:groupContext="shape" transform="translate(36.12,-36.12)">
+ <title>Square.28</title>
+ <desc>B2</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B2</text> </g>
+ <g id="shape29-21" v:mID="29" v:groupContext="shape" transform="translate(72.12,-36.12)">
+ <title>Square.29</title>
+ <desc>B3</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="90.87" width="36.01" height="36"/>
+ <rect x="0" y="72.87" width="36" height="36" class="st1"/>
+ <text x="12.75" y="93.87" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>B3</text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/inter_tx_partition.svg b/third_party/aom/doc/img/inter_tx_partition.svg
new file mode 100644
index 0000000000..6f853c65d3
--- /dev/null
+++ b/third_party/aom/doc/img/inter_tx_partition.svg
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export inter_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52083in" height="2.02083in"
+ viewBox="0 0 325.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st4 {marker-end:url(#mrkr5-22);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+ .st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend5">
+ <path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr5-22" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(0.75,-0.75)">
+ <title>Square.24</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="1.5" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape25-3" v:mID="25" v:groupContext="shape" transform="translate(180.75,-0.75)">
+ <title>Square.25</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="1.5" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape26-5" v:mID="26" v:groupContext="shape" transform="translate(180.75,-72.75)">
+ <title>Sheet.26</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape27-8" v:mID="27" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+ <title>Sheet.27</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape28-11" v:mID="28" v:groupContext="shape" transform="translate(252.75,-108.75)">
+ <title>Sheet.28</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape29-14" v:mID="29" v:groupContext="shape" transform="translate(434.25,0.750007) rotate(90)">
+ <title>Sheet.29</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape30-17" v:mID="30" v:groupContext="shape" transform="translate(170.739,-101.283) rotate(-18.4349)">
+ <title>Sheet.30</title>
+ <path d="M0 145.5 L51.2 145.5" class="st4"/>
+ </g>
+ <g id="shape31-23" v:mID="31" v:groupContext="shape" transform="translate(270.75,-126.75)">
+ <title>Sheet.31</title>
+ <path d="M0 145.5 L30.28 145.5" class="st4"/>
+ </g>
+ <g id="shape32-28" v:mID="32" v:groupContext="shape" transform="translate(409.634,121.634) rotate(135)">
+ <title>Sheet.32</title>
+ <path d="M0 145.5 L45.06 145.5" class="st4"/>
+ </g>
+ <g id="shape33-33" v:mID="33" v:groupContext="shape" transform="translate(270.844,-90.8438)">
+ <title>Sheet.33</title>
+ <path d="M0 145.5 L30.18 145.5" class="st4"/>
+ </g>
+ <g id="shape34-38" v:mID="34" v:groupContext="shape" transform="translate(381.705,179.364) rotate(148.992)">
+ <title>Sheet.34</title>
+ <path d="M0 145.5 L99.28 145.5" class="st4"/>
+ </g>
+ <g id="shape35-43" v:mID="35" v:groupContext="shape" transform="translate(216.75,-36.75)">
+ <title>Sheet.35</title>
+ <path d="M0 145.5 L66.28 145.5" class="st4"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/intra_cfl.svg b/third_party/aom/doc/img/intra_cfl.svg
new file mode 100644
index 0000000000..1153a2845e
--- /dev/null
+++ b/third_party/aom/doc/img/intra_cfl.svg
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export CfL_prediction.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="6.52269in" height="1.90714in"
+ viewBox="0 0 469.634 137.314" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#ff00ff;fill-opacity:0;stroke:#000000;stroke-opacity:0;stroke-width:0.75}
+ .st2 {fill:#ffffff;stroke:#000000;stroke-width:0.75}
+ .st3 {fill:#000000;font-family:Calibri;font-size:0.75em}
+ .st4 {marker-end:url(#mrkr4-22);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+ .st5 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+ .st6 {fill:none;stroke:#000000;stroke-width:0.75}
+ .st7 {fill:#000000;font-family:Calibri;font-size:1.99999em}
+ .st8 {fill:#000000;font-family:Calibri;font-size:1.5em}
+ .st9 {fill:none;stroke:none;stroke-width:0.25}
+ .st10 {font-size:1em}
+ .st11 {fill:#000000;font-family:SimSun;font-size:0.75em}
+ .st12 {font-family:Calibri;font-size:1em}
+ .st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend4">
+ <path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr4-22" class="st5" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+ v:shadowOffsetY="-4.25197"/>
+ <v:layer v:name="Flowchart" v:index="0"/>
+ <v:layer v:name="Connector" v:index="1"/>
+ <g id="group5-1" transform="translate(111.581,-86.9232)" v:mID="5" v:groupContext="group" v:layerMember="0">
+ <v:custProps>
+ <v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+ <v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+ <v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+ </v:custProps>
+ <v:userDefs>
+ <v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <title>Tagged process</title>
+ <g id="shape6-2" v:mID="6" v:groupContext="shape" transform="translate(0.566929,0)">
+ <title>Sheet.6</title>
+ <path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+ </g>
+ <g id="shape7-4" v:mID="7" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+ <title>Sheet.7</title>
+ </g>
+ <g id="shape8-6" v:mID="8" v:groupContext="shape" v:layerMember="0">
+ <title>Sheet.8</title>
+ <desc>Sub-Sample</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+ <path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+ <text x="13.81" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Sub-Sample</text> </g>
+ </g>
+ <g id="group9-9" transform="translate(224.967,-86.9232)" v:mID="9" v:groupContext="group" v:layerMember="0">
+ <v:custProps>
+ <v:cp v:nameU="Cost" v:lbl="Cost" v:type="7" v:format="@" v:langID="1033"/>
+ <v:cp v:nameU="Duration" v:lbl="Duration" v:type="2" v:langID="1033"/>
+ <v:cp v:nameU="Resources" v:lbl="Resources" v:langID="1033"/>
+ </v:custProps>
+ <v:userDefs>
+ <v:ud v:nameU="ScaleFactor" v:val="VT0(1):26"/>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <title>Tagged process.9</title>
+ <g id="shape10-10" v:mID="10" v:groupContext="shape" transform="translate(0.566929,0)">
+ <title>Sheet.10</title>
+ <path d="M53.15 137.31 L70.87 137.31 L70.87 128.46 L70.87 116.05 L0 116.05 L0 137.31 L53.15 137.31 Z" class="st1"/>
+ </g>
+ <g id="shape11-12" v:mID="11" v:groupContext="shape" v:layerMember="0" transform="translate(54.9213,0)">
+ <title>Sheet.11</title>
+ </g>
+ <g id="shape12-14" v:mID="12" v:groupContext="shape" v:layerMember="0">
+ <title>Sheet.12</title>
+ <desc>Average</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="35.4331" cy="126.684" width="70.87" height="21.2598"/>
+ <path d="M0 137.31 L70.87 137.31 L70.87 121.37 L70.87 116.05 L0 116.05 L0 137.31 Z" class="st2"/>
+ <text x="20.48" y="129.38" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Average</text> </g>
+ </g>
+ <g id="shape27-17" v:mID="27" v:groupContext="shape" transform="translate(182.447,-97.5531)">
+ <title>Sheet.27</title>
+ <path d="M0 137.31 L35.48 137.31" class="st4"/>
+ </g>
+ <g id="shape28-23" v:mID="28" v:groupContext="shape" transform="translate(295.833,-97.5531)">
+ <title>Sheet.28</title>
+ <path d="M0 137.31 L35.48 137.31" class="st4"/>
+ </g>
+ <g id="shape29-28" v:mID="29" v:groupContext="shape" transform="translate(341.47,-86.9232)">
+ <title>Sheet.29</title>
+ <desc>-</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+ <ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+ <text x="7.52" y="133.32" class="st7" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>- </text> </g>
+ <g id="shape34-31" v:mID="34" v:groupContext="shape" v:layerMember="1" transform="translate(147.014,-101.663)">
+ <title>Dynamic connector</title>
+ <path d="M0 130.79 L0 109.53 L205.65 109.53 L205.65 122.62" class="st4"/>
+ </g>
+ <g id="shape35-36" v:mID="35" v:groupContext="shape" transform="translate(34.2657,-97.5531)">
+ <title>Sheet.35</title>
+ <path d="M0 137.31 L70.27 137.31" class="st4"/>
+ </g>
+ <g id="shape36-41" v:mID="36" v:groupContext="shape" transform="translate(341.329,-43.2697)">
+ <title>Sheet.36</title>
+ <desc>×</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+ <ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+ <text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>× </text> </g>
+ <g id="shape37-44" v:mID="37" v:groupContext="shape" transform="translate(34.2657,-53.5676)">
+ <title>Sheet.37</title>
+ <path d="M0 137.31 L300.06 137.31" class="st4"/>
+ </g>
+ <g id="shape38-49" v:mID="38" v:groupContext="shape" transform="translate(489.499,50.3067) rotate(89.9693)">
+ <title>Sheet.38</title>
+ <path d="M0 137.31 L14.24 137.31" class="st4"/>
+ </g>
+ <g id="shape39-54" v:mID="39" v:groupContext="shape" transform="translate(341.329,-0.75)">
+ <title>Sheet.39</title>
+ <desc>+</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="11.1968" cy="126.117" width="22.4" height="22.3937"/>
+ <ellipse cx="11.1968" cy="126.117" rx="11.1968" ry="11.1968" class="st6"/>
+ <text x="6.71" y="131.52" class="st8" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+ </text> </g>
+ <g id="shape40-57" v:mID="40" v:groupContext="shape" transform="translate(34.2657,-11.9539)">
+ <title>Sheet.40</title>
+ <path d="M0 137.31 L300.02 137.31" class="st4"/>
+ </g>
+ <g id="shape41-62" v:mID="41" v:groupContext="shape" v:layerMember="1" transform="translate(345.51,-86.9234)">
+ <title>Dynamic connector.41</title>
+ <path d="M7.09 137.31 L7.09 151.53" class="st4"/>
+ </g>
+ <g id="shape74-67" v:mID="74" v:groupContext="shape" v:layerMember="1" transform="translate(345.439,-43.2697)">
+ <title>Dynamic connector.74</title>
+ <path d="M7.09 137.31 L7.09 150.4" class="st4"/>
+ </g>
+ <g id="shape75-72" v:mID="75" v:groupContext="shape" transform="translate(363.722,-11.9551)">
+ <title>Sheet.75</title>
+ <path d="M0 137.31 L35.48 137.31" class="st4"/>
+ </g>
+ <g id="shape78-77" v:mID="78" v:groupContext="shape" transform="translate(3.08465,-17.2788)">
+ <title>Sheet.78</title>
+ <desc>Chroma DC Prediction</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+ <rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+ <text x="30.02" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Chroma DC Prediction</text> </g>
+ <g id="shape82-80" v:mID="82" v:groupContext="shape" transform="translate(0.25,-60.75)">
+ <title>Sheet.82</title>
+ <desc>Scaling parameter α</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="70.3916" cy="131.314" width="140.79" height="12"/>
+ <rect x="0" y="125.314" width="140.783" height="12" class="st9"/>
+ <text x="33.74" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Scaling parameter α </text> </g>
+ <g id="shape83-83" v:mID="83" v:groupContext="shape" transform="translate(30.0138,-102.514)">
+ <title>Sheet.83</title>
+ <desc>Luma reconstructed samples</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="43.937" cy="131.314" width="87.88" height="12"/>
+ <rect x="0" y="125.314" width="87.874" height="12" class="st9"/>
+ <text x="7.25" y="128.61" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Luma reconstructed <tspan
+ x="29.03" dy="1.2em" class="st10">samples</tspan></text> </g>
+ <g id="shape84-87" v:mID="84" v:groupContext="shape" transform="translate(398.518,-5.47437)">
+ <title>Sheet.84</title>
+ <desc>CfL Prediction</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="35.4331" cy="131.314" width="70.87" height="12"/>
+ <rect x="0" y="125.314" width="70.8661" height="12" class="st9"/>
+ <text x="10.04" y="134.01" class="st3" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>CfL Prediction</text> </g>
+ <g id="shape85-90" v:mID="85" v:groupContext="shape" transform="translate(354.581,-72.75)">
+ <title>Sheet.85</title>
+ <desc>“AC” contribution</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="40.3937" cy="131.314" width="80.79" height="12"/>
+ <rect x="0" y="125.314" width="80.7874" height="12" class="st9"/>
+ <text x="2.62" y="134.31" class="st11" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>“<tspan class="st12">AC</tspan>”<tspan
+ class="st12"> </tspan><tspan class="st12">contribution</tspan></text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/intra_directional.svg b/third_party/aom/doc/img/intra_directional.svg
new file mode 100644
index 0000000000..3a08007a95
--- /dev/null
+++ b/third_party/aom/doc/img/intra_directional.svg
@@ -0,0 +1,192 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_directional.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.24969in" height="4.20313in"
+ viewBox="0 0 305.978 302.625" xml:space="preserve" color-interpolation-filters="sRGB" class="st13">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st2 {marker-start:url(#mrkr5-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.37313432835821}
+ .st4 {fill:#ffffff;stroke:none;stroke-linecap:butt;stroke-width:7.2}
+ .st5 {fill:#2f4f4f;font-family:Consolas;font-size:0.791656em}
+ .st6 {font-size:1em}
+ .st7 {fill:#ffffff;stroke:none;stroke-linecap:butt}
+ .st8 {marker-end:url(#mrkr5-49);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st9 {marker-end:url(#mrkr5-65);stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st10 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.22935779816514}
+ .st11 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st12 {fill:#000000;font-family:Calibri;font-size:0.666664em}
+ .st13 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend5">
+ <path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr5-8" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.45" refX="4.45" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(2.68) "/>
+ </marker>
+ <marker id="mrkr5-49" class="st3" v:arrowType="5" v:arrowSize="2" v:setback="4.69" refX="-4.69" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-2.68,-2.68) "/>
+ </marker>
+ <marker id="mrkr5-65" class="st10" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(8.98899,-0.75)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="14.625" width="288" height="288" class="st1"/>
+ </g>
+ <g id="shape5-3" v:mID="5" v:groupContext="shape" transform="translate(222.977,-200.113) rotate(45)">
+ <title>Sheet.5</title>
+ <desc>D135_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+ <path d="M6.68 302.62 L7.03 302.62 L203.65 302.62" class="st2"/>
+ <rect v:rectContext="textBkgnd" x="78.3191" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+ <text x="78.32" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+ v:langID="2052">13</tspan>5_PRED</text> </g>
+ <g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(8.98899,-144.75)">
+ <title>Sheet.6</title>
+ <desc>H_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="302.625" width="144" height="0"/>
+ <path d="M6.67 302.62 L7.03 302.62 L144 302.62" class="st2"/>
+ <rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st4"/>
+ <text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>H<tspan class="st6"
+ v:langID="1033">_PRED</tspan></text> </g>
+ <g id="shape8-20" v:mID="8" v:groupContext="shape" transform="translate(367.241,-107.423) rotate(66.3706)">
+ <title>Sheet.8</title>
+ <desc>D113_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+ <path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+ <rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st7"/>
+ <text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+ v:langID="2052">113</tspan>_PRED</text> </g>
+ <g id="shape9-28" v:mID="9" v:groupContext="shape" transform="translate(130.287,-182.377) rotate(23.6294)">
+ <title>Sheet.9</title>
+ <desc>D157_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+ <path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+ <rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+ <text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+ v:langID="2052">157</tspan>_PRED</text> </g>
+ <g id="shape10-36" v:mID="10" v:groupContext="shape" transform="translate(-112.309,-56.3771) rotate(-23.6294)">
+ <title>Sheet.10</title>
+ <desc>D203_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+ <path d="M6.67 302.62 L7.03 302.62 L157.18 302.62" class="st2"/>
+ <rect v:rectContext="textBkgnd" x="55.0849" y="295.425" width="47.0086" height="14.4001" class="st4"/>
+ <text x="55.08" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+ v:langID="2052">203</tspan>_PRED</text> </g>
+ <g id="shape11-44" v:mID="11" v:groupContext="shape" transform="translate(-60.9992,-56.1132) rotate(-45)">
+ <title>Sheet.11</title>
+ <desc>D45_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="101.823" cy="302.625" width="203.65" height="0"/>
+ <path d="M0 302.62 L196.61 302.62" class="st8"/>
+ <rect v:rectContext="textBkgnd" x="80.9308" y="295.425" width="41.7854" height="14.4001" class="st7"/>
+ <text x="80.93" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D45_PRED</text> </g>
+ <g id="shape12-52" v:mID="12" v:groupContext="shape" transform="translate(-149.636,157.875) rotate(-90)">
+ <title>Sheet.12</title>
+ <desc>V_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="302.625" width="144" height="0"/>
+ <path d="M0 302.62 L136.96 302.62" class="st8"/>
+ <rect v:rectContext="textBkgnd" x="56.3305" y="295.425" width="31.3391" height="14.4001" class="st7"/>
+ <text x="56.33" y="306.23" class="st5" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>V<tspan class="st6"
+ v:langID="1033">_PRED</tspan></text> </g>
+ <g id="shape13-60" v:mID="13" v:groupContext="shape" transform="translate(-117.687,22.537) rotate(-63.4349)">
+ <title>Sheet.13</title>
+ <path d="M0 302.62 L155.27 302.62" class="st9"/>
+ </g>
+ <g id="shape14-66" v:mID="14" v:groupContext="shape" transform="translate(-110.772,9.50969) rotate(-60.6422)">
+ <title>Sheet.14</title>
+ <path d="M0 302.62 L159.5 302.62" class="st9"/>
+ </g>
+ <g id="shape15-71" v:mID="15" v:groupContext="shape" transform="translate(-103.636,-2.51593) rotate(-57.9946)">
+ <title>Sheet.15</title>
+ <path d="M0 302.62 L164.09 302.62" class="st9"/>
+ </g>
+ <g id="shape16-76" v:mID="16" v:groupContext="shape" transform="translate(-130.368,51.6163) rotate(-69.444)">
+ <title>Sheet.16</title>
+ <path d="M0 302.62 L148.07 302.62" class="st9"/>
+ </g>
+ <g id="shape17-81" v:mID="17" v:groupContext="shape" transform="translate(-135.861,67.6095) rotate(-72.646)">
+ <title>Sheet.17</title>
+ <path d="M0 302.62 L145.14 302.62" class="st9"/>
+ </g>
+ <g id="shape18-86" v:mID="18" v:groupContext="shape" transform="translate(-140.6,84.4777) rotate(-75.9638)">
+ <title>Sheet.18</title>
+ <path d="M0 302.62 L142.71 302.62" class="st9"/>
+ </g>
+ <g id="shape30-91" v:mID="30" v:groupContext="shape" transform="translate(-124.263,36.5772) rotate(-66.3706)">
+ <title>Sheet.30</title>
+ <desc>D67_PRED</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="78.5891" cy="302.625" width="157.18" height="0"/>
+ <path d="M0 302.62 L150.14 302.62" class="st8"/>
+ <rect v:rectContext="textBkgnd" x="57.6964" y="295.425" width="41.7854" height="14.4001" class="st4"/>
+ <text x="57.7" y="306.23" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>D<tspan class="st6"
+ v:langID="2052">67</tspan>_PRED</text> </g>
+ <g id="shape31-99" v:mID="31" v:groupContext="shape" transform="translate(214.864,-288.75)">
+ <title>Sheet.31</title>
+ <desc>+1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+1</text> </g>
+ <g id="shape32-102" v:mID="32" v:groupContext="shape" transform="translate(224.989,-288.75)">
+ <title>Sheet.32</title>
+ <desc>+2</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+2</text> </g>
+ <g id="shape33-105" v:mID="33" v:groupContext="shape" transform="translate(238.489,-288.75)">
+ <title>Sheet.33</title>
+ <desc>+3</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="4.98" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>+3</text> </g>
+ <g id="shape34-108" v:mID="34" v:groupContext="shape" transform="translate(197.989,-288.75)">
+ <title>Sheet.34</title>
+ <desc>-1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-1</text> </g>
+ <g id="shape35-111" v:mID="35" v:groupContext="shape" transform="translate(188.989,-288.75)">
+ <title>Sheet.35</title>
+ <desc>-2</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-2</text> </g>
+ <g id="shape36-114" v:mID="36" v:groupContext="shape" transform="translate(177.739,-288.75)">
+ <title>Sheet.36</title>
+ <desc>-3</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="9" cy="295.875" width="18" height="13.5"/>
+ <rect x="0" y="289.125" width="18" height="13.5" class="st11"/>
+ <text x="5.75" y="298.27" class="st12" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>-3</text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/intra_paeth.svg b/third_party/aom/doc/img/intra_paeth.svg
new file mode 100644
index 0000000000..f7a831febb
--- /dev/null
+++ b/third_party/aom/doc/img/intra_paeth.svg
@@ -0,0 +1,181 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_paeth.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="2.52083in" height="2.52083in"
+ viewBox="0 0 181.5 181.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-dasharray:2.25,2.25;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+ .st3 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st4 {fill:#000000;font-family:Calibri;font-size:0.833336em}
+ .st5 {font-size:1em}
+ .st6 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:2.25}
+ .st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape211-1" v:mID="211" v:groupContext="shape" transform="translate(0.375,-73.125)">
+ <title>Square.211</title>
+ <desc>L</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="163.5" width="36" height="36"/>
+ <path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+ <text x="15.48" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>L</text> </g>
+ <g id="shape212-4" v:mID="212" v:groupContext="shape" transform="translate(108.375,-145.125)">
+ <title>Square.212</title>
+ <desc>T</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="163.5" width="36" height="36"/>
+ <path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+ <text x="15.08" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>T</text> </g>
+ <g id="shape213-7" v:mID="213" v:groupContext="shape" transform="translate(0.375007,-145.125)">
+ <title>Square.213</title>
+ <desc>TL</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="163.5" width="36" height="36"/>
+ <path d="M0 181.5 L36 181.5 L36 145.5 L0 145.5 L0 181.5 Z" class="st1"/>
+ <text x="12.55" y="167.1" class="st2" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>TL</text> </g>
+ <g id="group214-10" transform="translate(36.375,-1.12501)" v:mID="214" v:groupContext="group">
+ <title>Sheet.214</title>
+ <g id="shape183-11" v:mID="183" v:groupContext="shape" transform="translate(6.86646E-06,-108)">
+ <title>Square.183</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape184-13" v:mID="184" v:groupContext="shape" transform="translate(36,-108)">
+ <title>Square.184</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape185-15" v:mID="185" v:groupContext="shape" transform="translate(72,-108)">
+ <title>Square.185</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape186-17" v:mID="186" v:groupContext="shape" transform="translate(108,-108)">
+ <title>Square.186</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape199-19" v:mID="199" v:groupContext="shape" transform="translate(1.37329E-05,-72)">
+ <title>Square.199</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape200-21" v:mID="200" v:groupContext="shape" transform="translate(36,-72)">
+ <title>Square.200</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape201-23" v:mID="201" v:groupContext="shape" transform="translate(72,-72)">
+ <title>Square.201</title>
+ <desc>Current Pixel</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(0,0,0,0)"/>
+ <v:textRect cx="18" cy="163.5" width="36" height="36"/>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ <text x="2.43" y="160.5" class="st4" v:langID="2052"><v:paragraph v:horizAlign="1"/><v:tabList/>Current <tspan
+ x="8.47" dy="1.2em" class="st5">Pixel</tspan></text> </g>
+ <g id="shape202-27" v:mID="202" v:groupContext="shape" transform="translate(108,-72)">
+ <title>Square.202</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape203-29" v:mID="203" v:groupContext="shape" transform="translate(0,-36)">
+ <title>Square.203</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape204-31" v:mID="204" v:groupContext="shape" transform="translate(36,-36)">
+ <title>Square.204</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape205-33" v:mID="205" v:groupContext="shape" transform="translate(72,-36)">
+ <title>Square.205</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape206-35" v:mID="206" v:groupContext="shape" transform="translate(108,-36)">
+ <title>Square.206</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape207-37" v:mID="207" v:groupContext="shape" transform="translate(6.86646E-06,0)">
+ <title>Square.207</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape208-39" v:mID="208" v:groupContext="shape" transform="translate(36,0)">
+ <title>Square.208</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape209-41" v:mID="209" v:groupContext="shape" transform="translate(72,0)">
+ <title>Square.209</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape210-43" v:mID="210" v:groupContext="shape" transform="translate(108,0)">
+ <title>Square.210</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="145.5" width="36" height="36" class="st3"/>
+ </g>
+ </g>
+ <g id="shape215-45" v:mID="215" v:groupContext="shape" transform="translate(36.375,-1.125)">
+ <title>Square.215</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="37.5" width="144" height="144" class="st6"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/intra_recursive.svg b/third_party/aom/doc/img/intra_recursive.svg
new file mode 100644
index 0000000000..adc4193169
--- /dev/null
+++ b/third_party/aom/doc/img/intra_recursive.svg
@@ -0,0 +1,710 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_recursive.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="4.52015in" height="4.46693in"
+ viewBox="0 0 325.45 321.619" xml:space="preserve" color-interpolation-filters="sRGB" class="st9">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st3 {marker-end:url(#mrkr10-184);marker-start:url(#mrkr10-182);stroke:#0070c0;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st4 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.34246575342466}
+ .st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.29411764705882}
+ .st6 {marker-end:url(#mrkr10-235);marker-start:url(#mrkr10-233);stroke:#bf9000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st7 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.34246575342466}
+ .st8 {fill:#bf9000;fill-opacity:1;stroke:#bf9000;stroke-opacity:1;stroke-width:0.29411764705882}
+ .st9 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend10">
+ <path
+ d="M 0 0.75 C -0.414214 0.75 -0.75 0.414214 -0.75 0 -0.75 -0.414214 -0.414214 -0.75 0 -0.75 0.414214 -0.75 0.75 -0.414214 0.75 0 0.75 0.414214 0.414214 0.75 0 0.75 Z "
+ style="stroke:none"/>
+ </g>
+ <marker id="mrkr10-182" class="st4" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(2.92) "/>
+ </marker>
+ <marker id="mrkr10-184" class="st5" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+ </marker>
+ <marker id="mrkr10-233" class="st7" v:arrowType="10" v:arrowSize="0" v:setback="1.71" refX="1.71" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(2.92) "/>
+ </marker>
+ <marker id="mrkr10-235" class="st8" v:arrowType="10" v:arrowSize="1" v:setback="2.55" refX="-2.55" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend10" transform="scale(-3.4,-3.4) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="group149-1" transform="translate(0.12,-214.583)" v:mID="149" v:groupContext="group">
+ <title>Sheet.149</title>
+ <g id="shape142-2" v:mID="142" v:groupContext="shape" transform="translate(0,-71.2776)">
+ <title>Square.142</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape143-4" v:mID="143" v:groupContext="shape" transform="translate(36.0645,-71.2776)">
+ <title>Square.143</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape144-6" v:mID="144" v:groupContext="shape" transform="translate(72.129,-71.2776)">
+ <title>Square.144</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape145-8" v:mID="145" v:groupContext="shape" transform="translate(108.193,-71.2776)">
+ <title>Square.145</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape146-10" v:mID="146" v:groupContext="shape" transform="translate(144.258,-71.2776)">
+ <title>Square.146</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape147-12" v:mID="147" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.147</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape148-14" v:mID="148" v:groupContext="shape">
+ <title>Square.148</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ <g id="group64-16" transform="translate(36.1845,-214.583)" v:mID="64" v:groupContext="group">
+ <title>Sheet.64</title>
+ <g id="shape38-17" v:mID="38" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group63-19" v:mID="63" v:groupContext="group">
+ <title>Sheet.63</title>
+ <g id="shape46-20" v:mID="46" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape47-22" v:mID="47" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape48-24" v:mID="48" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape49-26" v:mID="49" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape50-28" v:mID="50" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape51-30" v:mID="51" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape52-32" v:mID="52" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape53-34" v:mID="53" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group65-36" transform="translate(180.442,-214.583)" v:mID="65" v:groupContext="group">
+ <title>Sheet.65</title>
+ <g id="shape66-37" v:mID="66" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group67-39" v:mID="67" v:groupContext="group">
+ <title>Sheet.67</title>
+ <g id="shape68-40" v:mID="68" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape69-42" v:mID="69" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape70-44" v:mID="70" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape71-46" v:mID="71" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape72-48" v:mID="72" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape73-50" v:mID="73" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape74-52" v:mID="74" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape75-54" v:mID="75" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group76-56" transform="translate(36.1845,-143.305)" v:mID="76" v:groupContext="group">
+ <title>Sheet.76</title>
+ <g id="shape77-57" v:mID="77" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group78-59" v:mID="78" v:groupContext="group">
+ <title>Sheet.78</title>
+ <g id="shape79-60" v:mID="79" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape80-62" v:mID="80" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape81-64" v:mID="81" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape82-66" v:mID="82" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape83-68" v:mID="83" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape84-70" v:mID="84" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape85-72" v:mID="85" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape86-74" v:mID="86" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group87-76" transform="translate(180.442,-143.305)" v:mID="87" v:groupContext="group">
+ <title>Sheet.87</title>
+ <g id="shape88-77" v:mID="88" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group89-79" v:mID="89" v:groupContext="group">
+ <title>Sheet.89</title>
+ <g id="shape90-80" v:mID="90" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape91-82" v:mID="91" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape92-84" v:mID="92" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape93-86" v:mID="93" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape94-88" v:mID="94" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape95-90" v:mID="95" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape96-92" v:mID="96" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape97-94" v:mID="97" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group98-96" transform="translate(36.1845,-72.0276)" v:mID="98" v:groupContext="group">
+ <title>Sheet.98</title>
+ <g id="shape99-97" v:mID="99" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group100-99" v:mID="100" v:groupContext="group">
+ <title>Sheet.100</title>
+ <g id="shape101-100" v:mID="101" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape102-102" v:mID="102" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape103-104" v:mID="103" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape104-106" v:mID="104" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape105-108" v:mID="105" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape106-110" v:mID="106" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape107-112" v:mID="107" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape108-114" v:mID="108" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group109-116" transform="translate(180.442,-72.0276)" v:mID="109" v:groupContext="group">
+ <title>Sheet.109</title>
+ <g id="shape110-117" v:mID="110" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group111-119" v:mID="111" v:groupContext="group">
+ <title>Sheet.111</title>
+ <g id="shape112-120" v:mID="112" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape113-122" v:mID="113" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape114-124" v:mID="114" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape115-126" v:mID="115" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape116-128" v:mID="116" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape117-130" v:mID="117" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape118-132" v:mID="118" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape119-134" v:mID="119" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group120-136" transform="translate(36.1845,-0.75)" v:mID="120" v:groupContext="group">
+ <title>Sheet.120</title>
+ <g id="shape121-137" v:mID="121" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group122-139" v:mID="122" v:groupContext="group">
+ <title>Sheet.122</title>
+ <g id="shape123-140" v:mID="123" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape124-142" v:mID="124" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape125-144" v:mID="125" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape126-146" v:mID="126" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape127-148" v:mID="127" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape128-150" v:mID="128" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape129-152" v:mID="129" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape130-154" v:mID="130" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="group131-156" transform="translate(180.442,-0.75)" v:mID="131" v:groupContext="group">
+ <title>Sheet.131</title>
+ <g id="shape132-157" v:mID="132" v:groupContext="shape">
+ <title>Rectangle</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="250.341" width="144.258" height="71.2776" class="st2"/>
+ </g>
+ <g id="group133-159" v:mID="133" v:groupContext="group">
+ <title>Sheet.133</title>
+ <g id="shape134-160" v:mID="134" v:groupContext="shape" transform="translate(0,-35.6388)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape135-162" v:mID="135" v:groupContext="shape" transform="translate(36.0645,-35.6388)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape136-164" v:mID="136" v:groupContext="shape" transform="translate(72.129,-35.6388)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape137-166" v:mID="137" v:groupContext="shape" transform="translate(108.193,-35.6388)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape138-168" v:mID="138" v:groupContext="shape">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape139-170" v:mID="139" v:groupContext="shape" transform="translate(36.0645,0)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape140-172" v:mID="140" v:groupContext="shape" transform="translate(72.129,0)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ <g id="shape141-174" v:mID="141" v:groupContext="shape" transform="translate(108.193,0)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="285.98" width="36.0645" height="35.6388" class="st1"/>
+ </g>
+ </g>
+ </g>
+ <g id="shape150-176" v:mID="150" v:groupContext="shape" transform="translate(244.217,-210.826) rotate(44.6598)">
+ <title>Sheet.150</title>
+ <path d="M1.28 321.62 L1.64 321.62 L49.49 321.62" class="st3"/>
+ </g>
+ <g id="shape151-185" v:mID="151" v:groupContext="shape" transform="translate(-266.901,54.0731) rotate(-90)">
+ <title>Sheet.151</title>
+ <path d="M1.28 321.62 L1.64 321.62 L34.22 321.62" class="st3"/>
+ </g>
+ <g id="shape152-192" v:mID="152" v:groupContext="shape" transform="translate(319.501,243.543) rotate(134.544)">
+ <title>Sheet.152</title>
+ <path d="M1.28 321.62 L1.64 321.62 L48.79 321.62" class="st3"/>
+ </g>
+ <g id="shape153-199" v:mID="153" v:groupContext="shape" transform="translate(271.203,305.09) rotate(153.231)">
+ <title>Sheet.153</title>
+ <path d="M1.28 321.62 L1.64 321.62 L78.31 321.62" class="st3"/>
+ </g>
+ <g id="shape154-206" v:mID="154" v:groupContext="shape" transform="translate(264.717,322.853) rotate(161.452)">
+ <title>Sheet.154</title>
+ <path d="M1.28 321.62 L1.64 321.62 L111.68 321.62" class="st3"/>
+ </g>
+ <g id="shape155-213" v:mID="155" v:groupContext="shape" transform="translate(18.1522,-267.546)">
+ <title>Sheet.155</title>
+ <path d="M1.28 321.62 L1.64 321.62 L34.65 321.62" class="st3"/>
+ </g>
+ <g id="shape156-220" v:mID="156" v:groupContext="shape" transform="translate(-204.714,-142.665) rotate(-43.8643)">
+ <title>Sheet.156</title>
+ <path d="M1.28 321.62 L1.64 321.62 L48.8 321.62" class="st3"/>
+ </g>
+ <g id="shape157-227" v:mID="157" v:groupContext="shape" transform="translate(388.475,-68.2707) rotate(44.6598)">
+ <title>Sheet.157</title>
+ <path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+ </g>
+ <g id="shape158-236" v:mID="158" v:groupContext="shape" transform="translate(-53.2468,375.362) rotate(-116.517)">
+ <title>Sheet.158</title>
+ <path d="M1.28 321.62 L1.64 321.62 L77.74 321.62" class="st6"/>
+ </g>
+ <g id="shape159-243" v:mID="159" v:groupContext="shape" transform="translate(556.158,160.495) rotate(90)">
+ <title>Sheet.159</title>
+ <path d="M1.28 321.62 L1.64 321.62 L69.37 321.62" class="st6"/>
+ </g>
+ <g id="shape160-250" v:mID="160" v:groupContext="shape" transform="translate(557.58,305.696) rotate(116.838)">
+ <title>Sheet.160</title>
+ <path d="M1.28 321.62 L1.64 321.62 L77.97 321.62" class="st6"/>
+ </g>
+ <g id="shape161-257" v:mID="161" v:groupContext="shape" transform="translate(532.733,389.26) rotate(135.34)">
+ <title>Sheet.161</title>
+ <path d="M1.28 321.62 L1.64 321.62 L99.49 321.62" class="st6"/>
+ </g>
+ <g id="shape162-264" v:mID="162" v:groupContext="shape" transform="translate(303.283,-92.4976) rotate(25.977)">
+ <title>Sheet.162</title>
+ <path d="M1.28 321.62 L1.64 321.62 L78.32 321.62" class="st6"/>
+ </g>
+ <g id="shape163-271" v:mID="163" v:groupContext="shape" transform="translate(162.41,-89.8469)">
+ <title>Sheet.163</title>
+ <path d="M1.28 321.62 L1.64 321.62 L70.22 321.62" class="st6"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/intra_tx_partition.svg b/third_party/aom/doc/img/intra_tx_partition.svg
new file mode 100644
index 0000000000..69575d4cd7
--- /dev/null
+++ b/third_party/aom/doc/img/intra_tx_partition.svg
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export intra_tx_partition.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="7.02083in" height="2.02083in"
+ viewBox="0 0 505.5 145.5" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st2 {stroke:#000000;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st3 {stroke:#000000;stroke-dasharray:0.75,1.5;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st4 {marker-end:url(#mrkr5-36);stroke:#0070c0;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st5 {fill:#0070c0;fill-opacity:1;stroke:#0070c0;stroke-opacity:1;stroke-width:0.22935779816514}
+ .st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend5">
+ <path d="M 2 1 L 0 0 L 1.98117 -0.993387 C 1.67173 -0.364515 1.67301 0.372641 1.98465 1.00043 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr5-36" class="st5" v:arrowType="5" v:arrowSize="2" v:setback="7.63" refX="-7.63" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend5" transform="scale(-4.36,-4.36) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-0.75)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="1.5" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape4-3" v:mID="4" v:groupContext="shape" transform="translate(180.75,-0.75)">
+ <title>Square.4</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="1.5" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape5-5" v:mID="5" v:groupContext="shape" transform="translate(398.25,0.75) rotate(90)">
+ <title>Sheet.5</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape6-8" v:mID="6" v:groupContext="shape" transform="translate(180.75,-72.75)">
+ <title>Sheet.6</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(360.75,-0.75)">
+ <title>Square.7</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="1.5" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(578.25,0.75) rotate(90)">
+ <title>Sheet.8</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape9-16" v:mID="9" v:groupContext="shape" transform="translate(432,-108.5)">
+ <title>Sheet.9</title>
+ <path d="M0 145.5 L72.75 145.5" class="st3"/>
+ </g>
+ <g id="shape10-19" v:mID="10" v:groupContext="shape" transform="translate(360.75,-72.75)">
+ <title>Sheet.10</title>
+ <path d="M0 145.5 L144 145.5" class="st2"/>
+ </g>
+ <g id="shape11-22" v:mID="11" v:groupContext="shape" transform="translate(360.75,-36.75)">
+ <title>Sheet.11</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape12-25" v:mID="12" v:groupContext="shape" transform="translate(542.25,0.750007) rotate(90)">
+ <title>Sheet.12</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape13-28" v:mID="13" v:groupContext="shape" transform="translate(614.25,0.75) rotate(90)">
+ <title>Sheet.13</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(216.75,-108.75)">
+ <title>Sheet.14</title>
+ <path d="M0 145.5 L66.28 145.5" class="st4"/>
+ </g>
+ <g id="shape15-37" v:mID="15" v:groupContext="shape" transform="translate(391.634,139.634) rotate(135)">
+ <title>Sheet.15</title>
+ <path d="M0 145.5 L96.1 145.5" class="st4"/>
+ </g>
+ <g id="shape16-42" v:mID="16" v:groupContext="shape" transform="translate(216.75,-36.75)">
+ <title>Sheet.16</title>
+ <path d="M0 145.5 L66.28 145.5" class="st4"/>
+ </g>
+ <g id="shape17-47" v:mID="17" v:groupContext="shape" transform="translate(378.75,-126.75)">
+ <title>Sheet.17</title>
+ <path d="M0 145.5 L102.28 145.5" class="st4"/>
+ </g>
+ <g id="shape18-52" v:mID="18" v:groupContext="shape" transform="translate(378.75,-90.75)">
+ <title>Sheet.18</title>
+ <path d="M0 145.5 L102.28 145.5" class="st4"/>
+ </g>
+ <g id="shape19-57" v:mID="19" v:groupContext="shape" transform="translate(378.75,-54.75)">
+ <title>Sheet.19</title>
+ <path d="M0 145.5 L102.28 145.5" class="st4"/>
+ </g>
+ <g id="shape20-62" v:mID="20" v:groupContext="shape" transform="translate(378.75,-18.75)">
+ <title>Sheet.20</title>
+ <path d="M0 145.5 L102.28 145.5" class="st4"/>
+ </g>
+ <g id="shape21-67" v:mID="21" v:groupContext="shape" transform="translate(532.761,156.783) rotate(161.565)">
+ <title>Sheet.21</title>
+ <path d="M0 145.5 L108.12 145.5" class="st4"/>
+ </g>
+ <g id="shape22-72" v:mID="22" v:groupContext="shape" transform="translate(532.761,192.783) rotate(161.565)">
+ <title>Sheet.22</title>
+ <path d="M0 145.5 L108.12 145.5" class="st4"/>
+ </g>
+ <g id="shape23-77" v:mID="23" v:groupContext="shape" transform="translate(532.761,228.783) rotate(161.565)">
+ <title>Sheet.23</title>
+ <path d="M0 145.5 L108.12 145.5" class="st4"/>
+ </g>
+ <g id="shape36-82" v:mID="36" v:groupContext="shape" transform="translate(360.75,-108.5)">
+ <title>Sheet.36</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape37-85" v:mID="37" v:groupContext="shape" transform="translate(432.75,-36.75)">
+ <title>Sheet.37</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape38-88" v:mID="38" v:groupContext="shape" transform="translate(542.25,72.75) rotate(90)">
+ <title>Sheet.38</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ <g id="shape39-91" v:mID="39" v:groupContext="shape" transform="translate(614.25,72.75) rotate(90)">
+ <title>Sheet.39</title>
+ <path d="M0 145.5 L72 145.5" class="st3"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/loop_restoration.svg b/third_party/aom/doc/img/loop_restoration.svg
new file mode 100644
index 0000000000..cdeb76a871
--- /dev/null
+++ b/third_party/aom/doc/img/loop_restoration.svg
@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export loop_restoration.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.47917in" height="2.49905in"
+ viewBox="0 0 394.5 179.932" xml:space="preserve" color-interpolation-filters="sRGB" class="st11">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#bfbfbf;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {marker-end:url(#mrkr4-8);stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+ .st3 {fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;stroke-width:0.28409090909091}
+ .st4 {stroke:#000000;stroke-dasharray:0,3.75;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st5 {marker-end:url(#mrkr4-27);stroke:#4bacc6;stroke-linecap:round;stroke-linejoin:round;stroke-width:1}
+ .st6 {fill:#4bacc6;fill-opacity:1;stroke:#4bacc6;stroke-opacity:1;stroke-width:0.28409090909091}
+ .st7 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st8 {fill:#000000;font-family:Times New Roman;font-size:1.00001em}
+ .st9 {baseline-shift:-32.4941%;font-size:0.649882em}
+ .st10 {font-size:1em}
+ .st11 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <defs id="Markers">
+ <g id="lend4">
+ <path d="M 2 1 L 0 0 L 2 -1 L 2 1 " style="stroke:none"/>
+ </g>
+ <marker id="mrkr4-8" class="st3" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+ </marker>
+ <marker id="mrkr4-27" class="st6" v:arrowType="4" v:arrowSize="2" v:setback="7.04" refX="-7.04" orient="auto"
+ markerUnits="strokeWidth" overflow="visible">
+ <use xlink:href="#lend4" transform="scale(-3.52,-3.52) "/>
+ </marker>
+ </defs>
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <v:layer v:name="Connector" v:index="0"/>
+ <g id="shape24-1" v:mID="24" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+ <title>Parallelogram</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:prompt="" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <path d="M0 179.93 L222.58 179.93 L288.29 110.74 L65.71 110.74 L0 179.93 Z" class="st1"/>
+ </g>
+ <g id="shape28-3" v:mID="28" v:groupContext="shape" transform="translate(-95.504,15.1931) rotate(-46.4754)">
+ <title>Sheet.28</title>
+ <path d="M0 179.93 L40.67 179.93" class="st2"/>
+ </g>
+ <g id="shape29-9" v:mID="29" v:groupContext="shape" transform="translate(34.9607,-40.8257)">
+ <title>Sheet.29</title>
+ <path d="M0 179.93 L48.37 179.93" class="st2"/>
+ </g>
+ <g id="shape33-14" v:mID="33" v:groupContext="shape" transform="translate(-10.6429,-34.9507) rotate(-14.6817)">
+ <title>Sheet.33</title>
+ <path d="M0 179.93 L180.5 179.93" class="st2"/>
+ </g>
+ <g id="shape36-19" v:mID="36" v:groupContext="shape" transform="translate(36.2288,91.5749) rotate(-90)">
+ <title>Sheet.36</title>
+ <path d="M0 179.93 L57.25 179.93" class="st4"/>
+ </g>
+ <g id="shape37-22" v:mID="37" v:groupContext="shape" transform="translate(-55.1147,-16.6562) rotate(-30.0403)">
+ <title>Sheet.37</title>
+ <path d="M0 179.93 L202.28 179.93" class="st5"/>
+ </g>
+ <g id="shape38-28" v:mID="38" v:groupContext="shape" transform="translate(18.375,-33.5132)">
+ <title>Sheet.38</title>
+ <desc>X</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="5.34375" cy="174.026" width="10.69" height="11.8125"/>
+ <rect x="0" y="168.119" width="10.6875" height="11.8125" class="st7"/>
+ <text x="4" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X</text> </g>
+ <g id="shape43-31" v:mID="43" v:groupContext="shape" transform="translate(31.875,-69.5132)">
+ <title>Sheet.43</title>
+ <desc>X1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+ <rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+ <text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+ class="st9" v:baseFontSize="12">1</tspan></text> </g>
+ <g id="shape52-35" v:mID="52" v:groupContext="shape" transform="translate(72.375,-20.0132)">
+ <title>Sheet.52</title>
+ <desc>X2</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="14.3438" cy="174.026" width="28.69" height="11.8125"/>
+ <rect x="0" y="168.119" width="28.6875" height="11.8125" class="st7"/>
+ <text x="8.06" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+ class="st9" v:baseFontSize="12">2</tspan></text> </g>
+ <g id="shape53-39" v:mID="53" v:groupContext="shape" transform="translate(205.688,-148.826)">
+ <title>Sheet.53</title>
+ <desc>Y</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="10.6875" cy="174.026" width="21.38" height="11.8125"/>
+ <rect x="0" y="168.119" width="21.375" height="11.8125" class="st7"/>
+ <text x="6.35" y="177.63" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Y</text> </g>
+ <g id="shape54-42" v:mID="54" v:groupContext="shape" transform="translate(200.625,-60.1114)">
+ <title>Sheet.54</title>
+ <desc>Xr = X + α(X1 – X) + β(X2 – X)</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="87.75" cy="170.932" width="175.5" height="18"/>
+ <rect x="0" y="161.932" width="175.5" height="18" class="st7"/>
+ <text x="12.79" y="174.53" class="st8" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>X<tspan dy="-0.279em"
+ class="st9" v:baseFontSize="12">r </tspan><tspan dy="0.181em" class="st10">= X + </tspan>α(X<tspan
+ dy="-0.279em" class="st9" v:baseFontSize="12">1 </tspan><tspan dy="0.181em" class="st10">–</tspan> X) + β(X<tspan
+ dy="-0.279em" class="st9" v:baseFontSize="12">2 </tspan><tspan dy="0.181em" class="st10">–</tspan> X) </text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/partition_codingblock.svg b/third_party/aom/doc/img/partition_codingblock.svg
new file mode 100644
index 0000000000..872692dbd7
--- /dev/null
+++ b/third_party/aom/doc/img/partition_codingblock.svg
@@ -0,0 +1,225 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export partition_codingblock.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="8.02083in" height="8.51563in"
+ viewBox="0 0 577.5 613.125" xml:space="preserve" color-interpolation-filters="sRGB" class="st6">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:none;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st2 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st3 {fill:#000000;font-family:Consolas;font-size:1.16666em}
+ .st4 {font-size:1em}
+ .st5 {stroke:#0070c0;stroke-dasharray:1.5,3;stroke-linecap:round;stroke-linejoin:round;stroke-width:1.5}
+ .st6 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.75,-468.375)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape6-3" v:mID="6" v:groupContext="shape" transform="translate(216.75,-468.375)">
+ <title>Square.6</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape15-5" v:mID="15" v:groupContext="shape" transform="translate(432.75,-468.375)">
+ <title>Square.15</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape24-7" v:mID="24" v:groupContext="shape" transform="translate(0.75,-252.375)">
+ <title>Square.24</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape30-9" v:mID="30" v:groupContext="shape" transform="translate(216.75,-252.375)">
+ <title>Square.30</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape34-11" v:mID="34" v:groupContext="shape" transform="translate(432.75,-252.375)">
+ <title>Square.34</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape38-13" v:mID="38" v:groupContext="shape" transform="translate(0.75,-36.375)">
+ <title>Square.38</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape42-15" v:mID="42" v:groupContext="shape" transform="translate(216.75,-36.375)">
+ <title>Square.42</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape47-17" v:mID="47" v:groupContext="shape" transform="translate(432.75,-36.375)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="469.125" width="144" height="144" class="st1"/>
+ </g>
+ <g id="shape50-19" v:mID="50" v:groupContext="shape" transform="translate(0.75,-436.875)">
+ <title>Sheet.50</title>
+ <desc>PARTITION_SPLIT</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="14.27" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_SPLIT</text> </g>
+ <g id="shape51-22" v:mID="51" v:groupContext="shape" transform="translate(216.75,-436.875)">
+ <title>Sheet.51</title>
+ <desc>PARTITION_VERT_4</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_4</text> </g>
+ <g id="shape52-25" v:mID="52" v:groupContext="shape" transform="translate(432.75,-436.875)">
+ <title>Sheet.52</title>
+ <desc>PARTITION_HORZ_4</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="610.43" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_4</text> </g>
+ <g id="shape60-28" v:mID="60" v:groupContext="shape" transform="translate(0.75,-220.875)">
+ <title>Sheet.60</title>
+ <desc>PARTITION_HORZ_B</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_<tspan
+ class="st4" v:langID="2052">B</tspan></text> </g>
+ <g id="shape61-32" v:mID="61" v:groupContext="shape" transform="translate(216.75,-220.875)">
+ <title>Sheet.61</title>
+ <desc>PARTITION_VERT_A</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_A</text> </g>
+ <g id="shape62-35" v:mID="62" v:groupContext="shape" transform="translate(432.75,-220.875)">
+ <title>Sheet.62</title>
+ <desc>PARTITION_HORZ_A</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ_A</text> </g>
+ <g id="shape63-38" v:mID="63" v:groupContext="shape" transform="translate(0.75,-0.375)">
+ <title>Sheet.63</title>
+ <desc>PARTITION_VERT_B</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="10.42" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_VERT_<tspan
+ class="st4" v:langID="2052">B</tspan></text> </g>
+ <g id="shape64-42" v:mID="64" v:groupContext="shape" transform="translate(216.75,-0.375)">
+ <title>Sheet.64</title>
+ <desc>PARTITION_HORZ</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_HORZ</text> </g>
+ <g id="shape65-45" v:mID="65" v:groupContext="shape" transform="translate(432.75,-0.375)">
+ <title>Sheet.65</title>
+ <desc>PARTITION_VERT</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="72" cy="599.625" width="144.01" height="27"/>
+ <rect x="0" y="586.125" width="144" height="27" class="st2"/>
+ <text x="18.12" y="604.32" class="st3" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>PARTITION_<tspan
+ class="st4" v:langID="2052">VERT</tspan></text> </g>
+ <g id="shape66-49" v:mID="66" v:groupContext="shape" transform="translate(685.875,0.75) rotate(90)">
+ <title>Sheet.66</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape67-52" v:mID="67" v:groupContext="shape" transform="translate(0.75,-540.375)">
+ <title>Sheet.67</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape68-55" v:mID="68" v:groupContext="shape" transform="translate(865.875,0.750007) rotate(90)">
+ <title>Sheet.68</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape69-58" v:mID="69" v:groupContext="shape" transform="translate(901.875,0.750007) rotate(90)">
+ <title>Sheet.69</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape70-61" v:mID="70" v:groupContext="shape" transform="translate(937.875,0.750007) rotate(90)">
+ <title>Sheet.70</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape71-64" v:mID="71" v:groupContext="shape" transform="translate(432.75,-504.375)">
+ <title>Sheet.71</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape72-67" v:mID="72" v:groupContext="shape" transform="translate(432.75,-540.375)">
+ <title>Sheet.72</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape73-70" v:mID="73" v:groupContext="shape" transform="translate(432.75,-576.375)">
+ <title>Sheet.73</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape74-73" v:mID="74" v:groupContext="shape" transform="translate(0.75,-324.375)">
+ <title>Sheet.74</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape75-76" v:mID="75" v:groupContext="shape" transform="translate(685.875,288.75) rotate(90)">
+ <title>Sheet.75</title>
+ <path d="M0 613.13 L72 613.13" class="st5"/>
+ </g>
+ <g id="shape76-79" v:mID="76" v:groupContext="shape" transform="translate(901.875,216.75) rotate(90)">
+ <title>Sheet.76</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape77-82" v:mID="77" v:groupContext="shape" transform="translate(216.75,-324.375)">
+ <title>Sheet.77</title>
+ <path d="M0 613.13 L72 613.13" class="st5"/>
+ </g>
+ <g id="shape78-85" v:mID="78" v:groupContext="shape" transform="translate(432.75,-324.375)">
+ <title>Sheet.78</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape79-88" v:mID="79" v:groupContext="shape" transform="translate(1117.88,216.75) rotate(90)">
+ <title>Sheet.79</title>
+ <path d="M0 613.13 L72 613.13" class="st5"/>
+ </g>
+ <g id="shape80-91" v:mID="80" v:groupContext="shape" transform="translate(685.875,432.75) rotate(90)">
+ <title>Sheet.80</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape81-94" v:mID="81" v:groupContext="shape" transform="translate(72.75,-108.375)">
+ <title>Sheet.81</title>
+ <path d="M0 613.13 L72 613.13" class="st5"/>
+ </g>
+ <g id="shape82-97" v:mID="82" v:groupContext="shape" transform="translate(216.75,-108.375)">
+ <title>Sheet.82</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ <g id="shape83-100" v:mID="83" v:groupContext="shape" transform="translate(1117.88,432.75) rotate(90)">
+ <title>Sheet.83</title>
+ <path d="M0 613.13 L144 613.13" class="st5"/>
+ </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/primary_tap.svg b/third_party/aom/doc/img/primary_tap.svg
new file mode 100644
index 0000000000..8cd2a18134
--- /dev/null
+++ b/third_party/aom/doc/img/primary_tap.svg
@@ -0,0 +1,1589 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export primary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="6.63188in"
+ viewBox="0 0 810.24 477.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:#000000;font-family:Calibri;font-size:1.00001em;font-style:italic}
+ .st3 {font-size:1em;font-style:normal}
+ .st4 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st5 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st6 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+ .st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-423.375)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-423.375)">
+ <title>Square.2</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-423.375)">
+ <title>Square.3</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape4-7" v:mID="4" v:groupContext="shape" transform="translate(126.12,-423.375)">
+ <title>Square.4</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape5-9" v:mID="5" v:groupContext="shape" transform="translate(162.12,-423.375)">
+ <title>Square.5</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape6-13" v:mID="6" v:groupContext="shape" transform="translate(18.12,-387.375)">
+ <title>Square.6</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape7-15" v:mID="7" v:groupContext="shape" transform="translate(54.12,-387.375)">
+ <title>Square.7</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape8-17" v:mID="8" v:groupContext="shape" transform="translate(90.12,-387.375)">
+ <title>Square.8</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-387.375)">
+ <title>Square.9</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape10-23" v:mID="10" v:groupContext="shape" transform="translate(162.12,-387.375)">
+ <title>Square.10</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape11-25" v:mID="11" v:groupContext="shape" transform="translate(18.12,-351.375)">
+ <title>Square.11</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape12-27" v:mID="12" v:groupContext="shape" transform="translate(54.12,-351.375)">
+ <title>Square.12</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-351.375)">
+ <title>Square.13</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-351.375)">
+ <title>Square.14</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape15-33" v:mID="15" v:groupContext="shape" transform="translate(162.12,-351.375)">
+ <title>Square.15</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape16-35" v:mID="16" v:groupContext="shape" transform="translate(18.12,-315.375)">
+ <title>Square.16</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape17-37" v:mID="17" v:groupContext="shape" transform="translate(54.12,-315.375)">
+ <title>Square.17</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-315.375)">
+ <title>Square.18</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape19-43" v:mID="19" v:groupContext="shape" transform="translate(126.12,-315.375)">
+ <title>Square.19</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape20-45" v:mID="20" v:groupContext="shape" transform="translate(162.12,-315.375)">
+ <title>Square.20</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape21-47" v:mID="21" v:groupContext="shape" transform="translate(18.12,-279.375)">
+ <title>Square.21</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape22-51" v:mID="22" v:groupContext="shape" transform="translate(54.12,-279.375)">
+ <title>Square.22</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape23-53" v:mID="23" v:groupContext="shape" transform="translate(90.12,-279.375)">
+ <title>Square.23</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-279.375)">
+ <title>Square.24</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-279.375)">
+ <title>Square.25</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-423.375)">
+ <title>Square.30</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-423.375)">
+ <title>Square.31</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-423.375)">
+ <title>Square.32</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-423.375)">
+ <title>Square.33</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape34-67" v:mID="34" v:groupContext="shape" transform="translate(360.12,-423.375)">
+ <title>Square.34</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape35-69" v:mID="35" v:groupContext="shape" transform="translate(216.12,-387.375)">
+ <title>Square.35</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape36-71" v:mID="36" v:groupContext="shape" transform="translate(252.12,-387.375)">
+ <title>Square.36</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape37-73" v:mID="37" v:groupContext="shape" transform="translate(288.12,-387.375)">
+ <title>Square.37</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape38-75" v:mID="38" v:groupContext="shape" transform="translate(324.12,-387.375)">
+ <title>Square.38</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape39-77" v:mID="39" v:groupContext="shape" transform="translate(360.12,-387.375)">
+ <title>Square.39</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape40-81" v:mID="40" v:groupContext="shape" transform="translate(216.12,-351.375)">
+ <title>Square.40</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape41-83" v:mID="41" v:groupContext="shape" transform="translate(252.12,-351.375)">
+ <title>Square.41</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-351.375)">
+ <title>Square.42</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-351.375)">
+ <title>Square.43</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape44-93" v:mID="44" v:groupContext="shape" transform="translate(360.12,-351.375)">
+ <title>Square.44</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape45-95" v:mID="45" v:groupContext="shape" transform="translate(216.12,-315.375)">
+ <title>Square.45</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape46-99" v:mID="46" v:groupContext="shape" transform="translate(252.12,-315.375)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape47-101" v:mID="47" v:groupContext="shape" transform="translate(288.12,-315.375)">
+ <title>Square.47</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape48-103" v:mID="48" v:groupContext="shape" transform="translate(324.12,-315.375)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape49-105" v:mID="49" v:groupContext="shape" transform="translate(360.12,-315.375)">
+ <title>Square.49</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape50-107" v:mID="50" v:groupContext="shape" transform="translate(216.12,-279.375)">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape51-109" v:mID="51" v:groupContext="shape" transform="translate(252.12,-279.375)">
+ <title>Square.51</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-279.375)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-279.375)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-279.375)">
+ <title>Square.54</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-423.375)">
+ <title>Square.55</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape56-119" v:mID="56" v:groupContext="shape" transform="translate(450.12,-423.375)">
+ <title>Square.56</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape57-121" v:mID="57" v:groupContext="shape" transform="translate(486.12,-423.375)">
+ <title>Square.57</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape58-123" v:mID="58" v:groupContext="shape" transform="translate(522.12,-423.375)">
+ <title>Square.58</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape59-125" v:mID="59" v:groupContext="shape" transform="translate(558.12,-423.375)">
+ <title>Square.59</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape60-127" v:mID="60" v:groupContext="shape" transform="translate(414.12,-387.375)">
+ <title>Square.60</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape61-129" v:mID="61" v:groupContext="shape" transform="translate(450.12,-387.375)">
+ <title>Square.61</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape62-131" v:mID="62" v:groupContext="shape" transform="translate(486.12,-387.375)">
+ <title>Square.62</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape63-133" v:mID="63" v:groupContext="shape" transform="translate(522.12,-387.375)">
+ <title>Square.63</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape64-135" v:mID="64" v:groupContext="shape" transform="translate(558.12,-387.375)">
+ <title>Square.64</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape65-137" v:mID="65" v:groupContext="shape" transform="translate(414.12,-351.375)">
+ <title>Square.65</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape66-141" v:mID="66" v:groupContext="shape" transform="translate(450.12,-351.375)">
+ <title>Square.66</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-351.375)">
+ <title>Square.67</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-351.375)">
+ <title>Square.68</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape69-151" v:mID="69" v:groupContext="shape" transform="translate(558.12,-351.375)">
+ <title>Square.69</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape70-155" v:mID="70" v:groupContext="shape" transform="translate(414.12,-315.375)">
+ <title>Square.70</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape71-157" v:mID="71" v:groupContext="shape" transform="translate(450.12,-315.375)">
+ <title>Square.71</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape72-159" v:mID="72" v:groupContext="shape" transform="translate(486.12,-315.375)">
+ <title>Square.72</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape73-161" v:mID="73" v:groupContext="shape" transform="translate(522.12,-315.375)">
+ <title>Square.73</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape74-163" v:mID="74" v:groupContext="shape" transform="translate(558.12,-315.375)">
+ <title>Square.74</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape75-165" v:mID="75" v:groupContext="shape" transform="translate(414.12,-279.375)">
+ <title>Square.75</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape76-167" v:mID="76" v:groupContext="shape" transform="translate(450.12,-279.375)">
+ <title>Square.76</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape77-169" v:mID="77" v:groupContext="shape" transform="translate(486.12,-279.375)">
+ <title>Square.77</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape78-171" v:mID="78" v:groupContext="shape" transform="translate(522.12,-279.375)">
+ <title>Square.78</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape79-173" v:mID="79" v:groupContext="shape" transform="translate(558.12,-279.375)">
+ <title>Square.79</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-423.375)">
+ <title>Square.80</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-423.375)">
+ <title>Square.81</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape82-179" v:mID="82" v:groupContext="shape" transform="translate(684.12,-423.375)">
+ <title>Square.82</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape83-181" v:mID="83" v:groupContext="shape" transform="translate(720.12,-423.375)">
+ <title>Square.83</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape84-183" v:mID="84" v:groupContext="shape" transform="translate(756.12,-423.375)">
+ <title>Square.84</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape85-185" v:mID="85" v:groupContext="shape" transform="translate(612.12,-387.375)">
+ <title>Square.85</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape86-189" v:mID="86" v:groupContext="shape" transform="translate(648.12,-387.375)">
+ <title>Square.86</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape87-191" v:mID="87" v:groupContext="shape" transform="translate(684.12,-387.375)">
+ <title>Square.87</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-387.375)">
+ <title>Square.88</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-387.375)">
+ <title>Square.89</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape90-197" v:mID="90" v:groupContext="shape" transform="translate(612.12,-351.375)">
+ <title>Square.90</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape91-199" v:mID="91" v:groupContext="shape" transform="translate(648.12,-351.375)">
+ <title>Square.91</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-351.375)">
+ <title>Square.92</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-351.375)">
+ <title>Square.93</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape94-209" v:mID="94" v:groupContext="shape" transform="translate(756.12,-351.375)">
+ <title>Square.94</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape95-211" v:mID="95" v:groupContext="shape" transform="translate(612.12,-315.375)">
+ <title>Square.95</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-315.375)">
+ <title>Square.96</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-315.375)">
+ <title>Square.97</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape98-217" v:mID="98" v:groupContext="shape" transform="translate(720.12,-315.375)">
+ <title>Square.98</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape99-219" v:mID="99" v:groupContext="shape" transform="translate(756.12,-315.375)">
+ <title>Square.99</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape100-223" v:mID="100" v:groupContext="shape" transform="translate(612.12,-279.375)">
+ <title>Square.100</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape101-225" v:mID="101" v:groupContext="shape" transform="translate(648.12,-279.375)">
+ <title>Square.101</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape102-227" v:mID="102" v:groupContext="shape" transform="translate(684.12,-279.375)">
+ <title>Square.102</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape103-229" v:mID="103" v:groupContext="shape" transform="translate(720.12,-279.375)">
+ <title>Square.103</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-279.375)">
+ <title>Square.104</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape115-233" v:mID="115" v:groupContext="shape" transform="translate(18.12,-189.375)">
+ <title>Square.115</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape116-237" v:mID="116" v:groupContext="shape" transform="translate(54.12,-189.375)">
+ <title>Square.116</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape117-239" v:mID="117" v:groupContext="shape" transform="translate(90.12,-189.375)">
+ <title>Square.117</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape118-241" v:mID="118" v:groupContext="shape" transform="translate(126.12,-189.375)">
+ <title>Square.118</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape119-243" v:mID="119" v:groupContext="shape" transform="translate(162.12,-189.375)">
+ <title>Square.119</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape120-245" v:mID="120" v:groupContext="shape" transform="translate(18.12,-153.375)">
+ <title>Square.120</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape121-247" v:mID="121" v:groupContext="shape" transform="translate(54.12,-153.375)">
+ <title>Square.121</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape122-251" v:mID="122" v:groupContext="shape" transform="translate(90.12,-153.375)">
+ <title>Square.122</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape123-253" v:mID="123" v:groupContext="shape" transform="translate(126.12,-153.375)">
+ <title>Square.123</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape124-255" v:mID="124" v:groupContext="shape" transform="translate(162.12,-153.375)">
+ <title>Square.124</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape125-257" v:mID="125" v:groupContext="shape" transform="translate(18.12,-117.375)">
+ <title>Square.125</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape126-259" v:mID="126" v:groupContext="shape" transform="translate(54.12,-117.375)">
+ <title>Square.126</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape127-261" v:mID="127" v:groupContext="shape" transform="translate(90.12,-117.375)">
+ <title>Square.127</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape128-263" v:mID="128" v:groupContext="shape" transform="translate(126.12,-117.375)">
+ <title>Square.128</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape129-265" v:mID="129" v:groupContext="shape" transform="translate(162.12,-117.375)">
+ <title>Square.129</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape130-267" v:mID="130" v:groupContext="shape" transform="translate(18.12,-81.375)">
+ <title>Square.130</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape131-269" v:mID="131" v:groupContext="shape" transform="translate(54.12,-81.375)">
+ <title>Square.131</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape132-271" v:mID="132" v:groupContext="shape" transform="translate(90.12,-81.3749)">
+ <title>Square.132</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape133-273" v:mID="133" v:groupContext="shape" transform="translate(126.12,-81.3749)">
+ <title>Square.133</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape134-277" v:mID="134" v:groupContext="shape" transform="translate(162.12,-81.3749)">
+ <title>Square.134</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape135-279" v:mID="135" v:groupContext="shape" transform="translate(18.12,-45.375)">
+ <title>Square.135</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape136-281" v:mID="136" v:groupContext="shape" transform="translate(54.12,-45.375)">
+ <title>Square.136</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape137-283" v:mID="137" v:groupContext="shape" transform="translate(90.12,-45.375)">
+ <title>Square.137</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape138-285" v:mID="138" v:groupContext="shape" transform="translate(126.12,-45.375)">
+ <title>Square.138</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape139-287" v:mID="139" v:groupContext="shape" transform="translate(162.12,-45.375)">
+ <title>Square.139</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape140-291" v:mID="140" v:groupContext="shape" transform="translate(216.12,-189.375)">
+ <title>Square.140</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape141-293" v:mID="141" v:groupContext="shape" transform="translate(252.12,-189.375)">
+ <title>Square.141</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape142-297" v:mID="142" v:groupContext="shape" transform="translate(288.12,-189.375)">
+ <title>Square.142</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape143-299" v:mID="143" v:groupContext="shape" transform="translate(324.12,-189.375)">
+ <title>Square.143</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape144-301" v:mID="144" v:groupContext="shape" transform="translate(360.12,-189.375)">
+ <title>Square.144</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape145-303" v:mID="145" v:groupContext="shape" transform="translate(216.12,-153.375)">
+ <title>Square.145</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape146-305" v:mID="146" v:groupContext="shape" transform="translate(252.12,-153.375)">
+ <title>Square.146</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape147-307" v:mID="147" v:groupContext="shape" transform="translate(288.12,-153.375)">
+ <title>Square.147</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape148-311" v:mID="148" v:groupContext="shape" transform="translate(324.12,-153.375)">
+ <title>Square.148</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape149-313" v:mID="149" v:groupContext="shape" transform="translate(360.12,-153.375)">
+ <title>Square.149</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape150-315" v:mID="150" v:groupContext="shape" transform="translate(216.12,-117.375)">
+ <title>Square.150</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape151-317" v:mID="151" v:groupContext="shape" transform="translate(252.12,-117.375)">
+ <title>Square.151</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape152-319" v:mID="152" v:groupContext="shape" transform="translate(288.12,-117.375)">
+ <title>Square.152</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape153-321" v:mID="153" v:groupContext="shape" transform="translate(324.12,-117.375)">
+ <title>Square.153</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape154-323" v:mID="154" v:groupContext="shape" transform="translate(360.12,-117.375)">
+ <title>Square.154</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape155-325" v:mID="155" v:groupContext="shape" transform="translate(216.12,-81.3749)">
+ <title>Square.155</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape156-327" v:mID="156" v:groupContext="shape" transform="translate(252.12,-81.3749)">
+ <title>Square.156</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape157-329" v:mID="157" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+ <title>Square.157</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape158-333" v:mID="158" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+ <title>Square.158</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape159-335" v:mID="159" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+ <title>Square.159</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape160-337" v:mID="160" v:groupContext="shape" transform="translate(216.12,-45.3749)">
+ <title>Square.160</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape161-339" v:mID="161" v:groupContext="shape" transform="translate(252.12,-45.3749)">
+ <title>Square.161</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape162-341" v:mID="162" v:groupContext="shape" transform="translate(288.12,-45.3749)">
+ <title>Square.162</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape163-343" v:mID="163" v:groupContext="shape" transform="translate(324.12,-45.3749)">
+ <title>Square.163</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape164-347" v:mID="164" v:groupContext="shape" transform="translate(360.12,-45.3749)">
+ <title>Square.164</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape165-349" v:mID="165" v:groupContext="shape" transform="translate(414.12,-189.375)">
+ <title>Square.165</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape166-351" v:mID="166" v:groupContext="shape" transform="translate(450.12,-189.375)">
+ <title>Square.166</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape167-353" v:mID="167" v:groupContext="shape" transform="translate(486.12,-189.375)">
+ <title>Square.167</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape168-357" v:mID="168" v:groupContext="shape" transform="translate(522.12,-189.375)">
+ <title>Square.168</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape169-359" v:mID="169" v:groupContext="shape" transform="translate(558.12,-189.375)">
+ <title>Square.169</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape170-361" v:mID="170" v:groupContext="shape" transform="translate(414.12,-153.375)">
+ <title>Square.170</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape171-363" v:mID="171" v:groupContext="shape" transform="translate(450.12,-153.375)">
+ <title>Square.171</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape172-365" v:mID="172" v:groupContext="shape" transform="translate(486.12,-153.375)">
+ <title>Square.172</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape173-369" v:mID="173" v:groupContext="shape" transform="translate(522.12,-153.375)">
+ <title>Square.173</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape174-371" v:mID="174" v:groupContext="shape" transform="translate(558.12,-153.375)">
+ <title>Square.174</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape175-373" v:mID="175" v:groupContext="shape" transform="translate(414.12,-117.375)">
+ <title>Square.175</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape176-375" v:mID="176" v:groupContext="shape" transform="translate(450.12,-117.375)">
+ <title>Square.176</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape177-377" v:mID="177" v:groupContext="shape" transform="translate(486.12,-117.375)">
+ <title>Square.177</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape178-379" v:mID="178" v:groupContext="shape" transform="translate(522.12,-117.375)">
+ <title>Square.178</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape179-381" v:mID="179" v:groupContext="shape" transform="translate(558.12,-117.375)">
+ <title>Square.179</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape180-383" v:mID="180" v:groupContext="shape" transform="translate(414.12,-81.3749)">
+ <title>Square.180</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape181-385" v:mID="181" v:groupContext="shape" transform="translate(450.12,-81.3749)">
+ <title>Square.181</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape182-387" v:mID="182" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+ <title>Square.182</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape183-391" v:mID="183" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+ <title>Square.183</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape184-393" v:mID="184" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+ <title>Square.184</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape185-395" v:mID="185" v:groupContext="shape" transform="translate(414.12,-45.3749)">
+ <title>Square.185</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape186-397" v:mID="186" v:groupContext="shape" transform="translate(450.12,-45.3749)">
+ <title>Square.186</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape187-399" v:mID="187" v:groupContext="shape" transform="translate(486.12,-45.3749)">
+ <title>Square.187</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape188-403" v:mID="188" v:groupContext="shape" transform="translate(522.12,-45.3749)">
+ <title>Square.188</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape189-405" v:mID="189" v:groupContext="shape" transform="translate(558.12,-45.3749)">
+ <title>Square.189</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape190-407" v:mID="190" v:groupContext="shape" transform="translate(612.12,-189.375)">
+ <title>Square.190</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape191-409" v:mID="191" v:groupContext="shape" transform="translate(648.12,-189.375)">
+ <title>Square.191</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape192-411" v:mID="192" v:groupContext="shape" transform="translate(684.12,-189.375)">
+ <title>Square.192</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape193-413" v:mID="193" v:groupContext="shape" transform="translate(720.12,-189.375)">
+ <title>Square.193</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape194-417" v:mID="194" v:groupContext="shape" transform="translate(756.12,-189.375)">
+ <title>Square.194</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape195-419" v:mID="195" v:groupContext="shape" transform="translate(612.12,-153.375)">
+ <title>Square.195</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape196-421" v:mID="196" v:groupContext="shape" transform="translate(648.12,-153.375)">
+ <title>Square.196</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape197-423" v:mID="197" v:groupContext="shape" transform="translate(684.12,-153.375)">
+ <title>Square.197</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape198-427" v:mID="198" v:groupContext="shape" transform="translate(720.12,-153.375)">
+ <title>Square.198</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape199-429" v:mID="199" v:groupContext="shape" transform="translate(756.12,-153.375)">
+ <title>Square.199</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape200-431" v:mID="200" v:groupContext="shape" transform="translate(612.12,-117.375)">
+ <title>Square.200</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape201-433" v:mID="201" v:groupContext="shape" transform="translate(648.12,-117.375)">
+ <title>Square.201</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape202-435" v:mID="202" v:groupContext="shape" transform="translate(684.12,-117.375)">
+ <title>Square.202</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st4"/>
+ </g>
+ <g id="shape203-437" v:mID="203" v:groupContext="shape" transform="translate(720.12,-117.375)">
+ <title>Square.203</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape204-439" v:mID="204" v:groupContext="shape" transform="translate(756.12,-117.375)">
+ <title>Square.204</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape205-441" v:mID="205" v:groupContext="shape" transform="translate(612.12,-81.3749)">
+ <title>Square.205</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape206-443" v:mID="206" v:groupContext="shape" transform="translate(648.12,-81.3749)">
+ <title>Square.206</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape207-445" v:mID="207" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+ <title>Square.207</title>
+ <desc>b/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>b<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape208-449" v:mID="208" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+ <title>Square.208</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape209-451" v:mID="209" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+ <title>Square.209</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape210-453" v:mID="210" v:groupContext="shape" transform="translate(612.12,-45.3749)">
+ <title>Square.210</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape211-455" v:mID="211" v:groupContext="shape" transform="translate(648.12,-45.3749)">
+ <title>Square.211</title>
+ <desc>a/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="459.495" width="36.01" height="36"/>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ <text x="6.52" y="463.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>a<tspan class="st3">/16</tspan></text> </g>
+ <g id="shape212-459" v:mID="212" v:groupContext="shape" transform="translate(684.12,-45.3749)">
+ <title>Square.212</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape213-461" v:mID="213" v:groupContext="shape" transform="translate(720.12,-45.3749)">
+ <title>Square.213</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape214-463" v:mID="214" v:groupContext="shape" transform="translate(756.12,-45.3749)">
+ <title>Square.214</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="441.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape236-465" v:mID="236" v:groupContext="shape" transform="translate(54.12,-252.375)">
+ <title>Sheet.236</title>
+ <desc>d = 0</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 0</tspan></text> </g>
+ <g id="shape237-470" v:mID="237" v:groupContext="shape" transform="translate(252.12,-252.375)">
+ <title>Sheet.237</title>
+ <desc>d = 1</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 1</tspan></text> </g>
+ <g id="shape238-475" v:mID="238" v:groupContext="shape" transform="translate(450.12,-252.375)">
+ <title>Sheet.238</title>
+ <desc>d = 2</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 2</tspan></text> </g>
+ <g id="shape239-480" v:mID="239" v:groupContext="shape" transform="translate(648.12,-252.375)">
+ <title>Sheet.239</title>
+ <desc>d = 3</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 3</tspan></text> </g>
+ <g id="shape240-485" v:mID="240" v:groupContext="shape" transform="translate(54.12,-18.375)">
+ <title>Sheet.240</title>
+ <desc>d = 4</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 4</tspan></text> </g>
+ <g id="shape241-490" v:mID="241" v:groupContext="shape" transform="translate(252.12,-18.375)">
+ <title>Sheet.241</title>
+ <desc>d = 5</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 5</tspan></text> </g>
+ <g id="shape242-495" v:mID="242" v:groupContext="shape" transform="translate(450.12,-18.375)">
+ <title>Sheet.242</title>
+ <desc>d = 6</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 6</tspan></text> </g>
+ <g id="shape243-500" v:mID="243" v:groupContext="shape" transform="translate(648.12,-18.375)">
+ <title>Sheet.243</title>
+ <desc>d = 7</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="463.995" width="108" height="27"/>
+ <rect x="0" y="450.495" width="108" height="27" class="st5"/>
+ <text x="36.26" y="469.4" class="st6" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st3"> </tspan><tspan
+ class="st3">= 7</tspan></text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/quant_ac.svg b/third_party/aom/doc/img/quant_ac.svg
new file mode 100644
index 0000000000..3f589c8be6
--- /dev/null
+++ b/third_party/aom/doc/img/quant_ac.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-2,.cls-20,.cls-26,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-2,.cls-20,.cls-26,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-20,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-26,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-21{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-21{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-21{font-size:14.04px;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0.01em;}.cls-25{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.78" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables3Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.81,9H640.53M53.81,59.65H640.53M53.81,110.18H640.53M53.81,160.82H640.53M53.81,211.46H640.53M53.81,262.1H640.53M53.81,312.74H640.53"/><path class="cls-2" d="M626.78,9V363.3M512.18,9V363.3M397.57,9V363.3M283,9V363.3M168.38,9V363.3M53.81,9V363.3"/><line class="cls-2" x1="53.81" y1="363.3" 
x2="640.53" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.25 57.26 363.25 59.53 363.25 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.13 82.45 363.13 84.73 363.13 87.02 363.13 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 363.01 103.09 363.01 105.38 363.01 107.66 363.01 109.94 363.01 112.22 363.01 114.5 363.01 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.89 128.29 362.89 130.57 362.89 132.85 362.89 135.13 362.89 137.41 362.89 139.69 362.89 142.09 362.89 144.38 362.77 146.66 362.77 148.94 362.77 151.22 362.77 153.5 362.77 155.78 362.77 158.06 362.77 160.34 362.77 162.62 362.77 165.01 362.77 167.29 362.77 169.57 362.77 171.85 362.65 174.13 362.65 176.41 362.65 178.69 362.65 180.97 362.65 183.25 362.65 185.53 362.65 187.94 362.65 190.22 362.65 192.5 362.65 194.78 362.65 197.06 362.65 199.34 362.54 201.62 362.54 203.9 362.54 206.18 362.54 208.46 362.54 210.85 362.54 213.13 362.54 215.41 362.54 217.69 362.54 219.97 362.54 222.25 362.54 224.53 362.42 226.81 362.42 229.09 362.42 231.38 362.42 233.78 362.42 236.06 362.42 238.34 362.42 240.62 362.42 242.9 362.42 245.18 362.42 247.46 362.42 249.74 362.42 252.01 362.3 254.29 362.3 256.69 362.3 258.98 362.3 261.25 362.3 263.54 362.3 265.81 362.3 268.1 362.3 270.38 362.3 272.65 362.3 274.94 362.3 277.21 362.18 279.62 362.18 281.89 362.18 284.18 362.18 286.45 362.18 288.74 362.18 291.01 362.06 293.3 362.06 295.57 362.06 297.86 362.06 300.13 362.06 302.42 362.06 304.81 361.94 307.1 361.94 309.38 361.94 311.65 361.94 313.94 361.94 316.21 361.94 318.5 361.81 320.77 361.81 323.06 361.81 325.33 361.81 327.74 361.81 330.01 361.81 332.3 361.69 334.57 361.69 336.86 361.69 339.13 361.57 341.42 361.57 343.69 361.57 345.98 361.57 348.25 361.45 350.65 361.45 352.94 361.45 355.21 361.45 357.5 361.33 359.77 361.33 362.06 361.33 364.33 361.33 366.62 361.21 368.89 361.21 371.18 361.21 373.57 361.21 375.86 
361.1 378.13 361.1 380.42 361.1 382.69 360.98 384.98 360.98 387.25 360.98 389.54 360.86 391.81 360.86 394.1 360.74 396.5 360.74 398.77 360.74 401.06 360.62 403.33 360.62 405.62 360.62 407.89 360.5 410.18 360.5 412.45 360.38 414.74 360.38 417.01 360.25 419.42 360.25 421.69 360.25 423.98 360.13 426.25 360.13 428.54 360.01 430.81 360.01 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.54 446.89 359.54 449.18 359.42 451.45 359.42 453.74 359.3 456.01 359.18 458.3 359.18 460.57 359.06 462.86 359.06 465.25 358.94 467.54 358.81 469.81 358.81 472.1 358.69 474.38 358.57 476.65 358.45 478.94 358.45 481.21 358.33 483.5 358.21 485.77 358.1 488.06 357.98 490.45 357.98 492.74 357.86 495.01 357.74 497.3 357.62 499.57 357.5 501.86 357.38 504.13 357.25 506.42 357.13 508.69 357.01 510.98 356.89 513.38 356.77 515.65 356.65 517.93 356.54 520.22 356.42 522.5 356.3 524.77 356.18 527.05 356.06 529.34 355.94 531.62 355.81 533.89 355.57 536.29 355.45 538.58 355.33 540.86 355.21 543.13 354.98 545.41 354.86 547.7 354.74 549.98 354.5 552.25 354.38 554.53 354.25 556.82 354.01 559.22 353.89 561.5 353.65 563.77 353.54 566.05 353.3 568.34 353.06 570.62 352.94 572.89 352.69 575.17 352.45 577.46 352.33 579.74 352.1 582.13 351.86 584.41 351.62 586.7 351.38 588.98 351.13 591.25 350.89 593.53 350.65 595.82 350.42 598.1 350.18 600.38 349.94 602.65 349.69 605.05 349.45 607.34 349.21 609.62 348.86 611.89 348.62 614.17 348.38 616.46 348.01 618.74 347.77 621.01 347.42 623.29 347.18 625.58 346.81 627.98 346.45 630.25 346.21 632.53 345.86 634.82 345.5 637.1 345.13 639.38 344.79"/></g><circle class="cls-5" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-5" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.2" r="1.98"/><circle class="cls-5" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-6" 
cx="61.76" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98"/><circle class="cls-6" cx="71" cy="363.08" r="1.98"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98"/><circle class="cls-5" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.83" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-6" cx="80.12" cy="363.08" r="1.98"/><circle class="cls-5" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-6" cx="82.4" cy="363.08" r="1.98"/><circle class="cls-5" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-6" cx="84.67" cy="363.08" r="1.98"/><circle class="cls-5" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-6" cx="86.95" cy="363.08" r="1.98"/><circle class="cls-5" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-6" cx="89.24" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -280.75, 415.99)"/><circle class="cls-5" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-6" cx="91.52" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -278.69, 418.26)"/><circle class="cls-5" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-6" cx="93.8" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.64, 420.53)"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle 
class="cls-5" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.47" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.96" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.96" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.96" r="1.98"/><circle class="cls-5" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-6" cx="107.6" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -264.2, 434.26)"/><circle class="cls-5" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-6" cx="109.88" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -262.14, 436.53)"/><circle class="cls-5" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-6" cx="112.16" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -260.09, 438.8)"/><circle class="cls-5" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-6" cx="114.44" cy="362.96" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -258.03, 441.07)"/><circle class="cls-5" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.72" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.84" r="1.98"/><circle class="cls-5" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-6" cx="128.24" cy="362.84" r="1.98"/><circle class="cls-5" 
cx="130.52" cy="362.84" r="1.98"/><circle class="cls-6" cx="130.52" cy="362.84" r="1.98"/><circle class="cls-5" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-6" cx="132.8" cy="362.84" r="1.98"/><circle class="cls-5" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-6" cx="135.08" cy="362.84" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-6" cx="137.36" cy="362.84" r="1.98"/><circle class="cls-5" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-6" cx="139.64" cy="362.84" r="1.98"/><circle class="cls-5" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-6" cx="146.6" cy="362.72" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.72" r="1.98"/><circle class="cls-5" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-6" cx="151.16" cy="362.72" r="1.98"/><circle class="cls-5" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-6" cx="153.44" cy="362.72" r="1.98"/><circle class="cls-5" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-6" cx="155.72" cy="362.72" r="1.98"/><circle class="cls-5" cx="158" cy="362.72" r="1.98"/><circle class="cls-6" cx="158" cy="362.72" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.72" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.72" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-6" cx="167.24" cy="362.72" r="1.98"/><circle class="cls-5" cx="169.52" cy="362.72" r="1.98"/><circle class="cls-6" cx="169.52" cy="362.72" r="1.98"/><circle 
class="cls-5" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-6" cx="171.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -205.96, 497.82)"/><circle class="cls-5" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-6" cx="174.08" cy="362.6" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.6" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.6" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.6" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.6" r="1.98"/><circle class="cls-5" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-6" cx="185.47" cy="362.6" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-6" cx="187.88" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -191.47, 513.83)"/><circle class="cls-5" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-6" cx="190.16" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -189.41, 516.1)"/><circle class="cls-5" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-6" cx="192.44" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -187.36, 518.36)"/><circle class="cls-5" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-6" cx="194.72" cy="362.6" r="1.98"/><circle class="cls-5" cx="197" cy="362.6" r="1.98"/><circle class="cls-6" cx="197" cy="362.6" r="1.98"/><circle class="cls-5" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-6" cx="199.28" cy="362.48" r="1.98"/><circle class="cls-5" cx="201.56" cy="362.48" r="1.98" transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-6" cx="201.56" cy="362.48" r="1.98" 
transform="translate(-34.74 21.62) rotate(-5.65)"/><circle class="cls-5" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-6" cx="203.83" cy="362.48" r="1.98" transform="translate(-34.73 21.85) rotate(-5.65)"/><circle class="cls-5" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><circle class="cls-6" cx="206.11" cy="362.48" r="1.98" transform="translate(-34.71 22.07) rotate(-5.65)"/><path class="cls-5" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-6" d="M210.38,362.48a2,2,0,1,1-2-2A2,2,0,0,1,210.38,362.48Z"/><path class="cls-5" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-6" d="M212.78,362.48a2,2,0,1,1-2-2A2,2,0,0,1,212.78,362.48Z"/><path class="cls-5" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-6" d="M215.06,362.48a2,2,0,1,1-2-2A2,2,0,0,1,215.06,362.48Z"/><path class="cls-5" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-6" d="M217.33,362.48a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.48Z"/><path class="cls-5" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-6" d="M219.61,362.48a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.48Z"/><path class="cls-5" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-6" d="M221.89,362.48a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.48Z"/><path class="cls-5" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-6" d="M224.17,362.48a2,2,0,1,1-2-2A2,2,0,0,1,224.17,362.48Z"/><path class="cls-5" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-6" d="M226.45,362.36a2,2,0,1,1-2-2A2,2,0,0,1,226.45,362.36Z"/><path class="cls-5" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-6" d="M228.73,362.36a2,2,0,1,1-2-2A2,2,0,0,1,228.73,362.36Z"/><path class="cls-5" d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-6" 
d="M231,362.36a2,2,0,1,1-2-2A2,2,0,0,1,231,362.36Z"/><path class="cls-5" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><path class="cls-6" d="M233.29,362.36a2,2,0,1,1-2-2A2,2,0,0,1,233.29,362.36Z"/><circle class="cls-5" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-6" cx="233.72" cy="362.36" r="1.98" transform="translate(-144.3 569.79) rotate(-85.93)"/><circle class="cls-5" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-6" cx="236" cy="362.36" r="1.98" transform="translate(-142.18 572.07) rotate(-85.93)"/><circle class="cls-5" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-6" cx="238.28" cy="362.36" r="1.98" transform="translate(-140.06 574.34) rotate(-85.93)"/><circle class="cls-5" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-6" cx="240.56" cy="362.36" r="1.98" transform="translate(-137.94 576.62) rotate(-85.93)"/><circle class="cls-5" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-6" cx="242.83" cy="362.36" r="1.98" transform="translate(-135.82 578.89) rotate(-85.93)"/><circle class="cls-5" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><circle class="cls-6" cx="245.11" cy="362.36" r="1.98" transform="translate(-133.7 581.17) rotate(-85.93)"/><path class="cls-5" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-6" d="M249.38,362.36a2,2,0,1,1-2-2A2,2,0,0,1,249.38,362.36Z"/><path class="cls-5" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-6" d="M251.66,362.36a2,2,0,1,1-2-2A2,2,0,0,1,251.66,362.36Z"/><path class="cls-5" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path class="cls-6" d="M253.94,362.24a2,2,0,1,1-2-2A2,2,0,0,1,253.94,362.24Z"/><path 
class="cls-5" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-6" d="M256.22,362.24a2,2,0,1,1-2-2A2,2,0,0,1,256.22,362.24Z"/><path class="cls-5" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-6" d="M258.61,362.24a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.24Z"/><path class="cls-5" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-6" d="M260.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.24Z"/><path class="cls-5" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-6" d="M263.17,362.24a2,2,0,1,1-2-2A2,2,0,0,1,263.17,362.24Z"/><path class="cls-5" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-6" d="M265.45,362.24a2,2,0,1,1-2-2A2,2,0,0,1,265.45,362.24Z"/><path class="cls-5" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-6" d="M267.73,362.24a2,2,0,1,1-2-2A2,2,0,0,1,267.73,362.24Z"/><path class="cls-5" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-6" d="M270,362.24a2,2,0,1,1-2-2A2,2,0,0,1,270,362.24Z"/><path class="cls-5" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-6" d="M272.29,362.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,362.24Z"/><path class="cls-5" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-6" d="M274.57,362.24a2,2,0,1,1-2-2A2,2,0,0,1,274.57,362.24Z"/><path class="cls-5" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-6" d="M276.85,362.24a2,2,0,1,1-2-2A2,2,0,0,1,276.85,362.24Z"/><path class="cls-5" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><path class="cls-6" d="M279.13,362.12a2,2,0,1,1-2-2A2,2,0,0,1,279.13,362.12Z"/><circle class="cls-5" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-6" cx="279.56" cy="362.12" r="1.98" transform="translate(-34.32 29.31) rotate(-5.65)"/><circle class="cls-5" cx="281.83" cy="362.12" r="1.98" 
transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-6" cx="281.83" cy="362.12" r="1.98" transform="translate(-34.31 29.53) rotate(-5.65)"/><circle class="cls-5" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><circle class="cls-6" cx="284.11" cy="362.12" r="1.98" transform="translate(-34.3 29.76) rotate(-5.65)"/><path class="cls-5" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-6" d="M288.38,362.12a2,2,0,1,1-2-2A2,2,0,0,1,288.38,362.12Z"/><path class="cls-5" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-6" d="M290.66,362.12a2,2,0,1,1-2-2A2,2,0,0,1,290.66,362.12Z"/><path class="cls-5" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-6" d="M292.94,362a2,2,0,1,1-2-2A2,2,0,0,1,292.94,362Z"/><path class="cls-5" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-6" d="M295.22,362a2,2,0,1,1-2-2A2,2,0,0,1,295.22,362Z"/><path class="cls-5" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-6" d="M297.5,362a2,2,0,1,1-2-2A2,2,0,0,1,297.5,362Z"/><path class="cls-5" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-6" d="M299.78,362a2,2,0,1,1-2-2A2,2,0,0,1,299.78,362Z"/><path class="cls-5" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-6" d="M302.06,362a2,2,0,1,1-2-2A2,2,0,0,1,302.06,362Z"/><path class="cls-5" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-6" d="M304.33,362a2,2,0,1,1-2-2A2,2,0,0,1,304.33,362Z"/><path class="cls-5" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-6" d="M306.73,361.88a2,2,0,1,1-2-2A2,2,0,0,1,306.73,361.88Z"/><path class="cls-5" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-6" d="M309,361.88a2,2,0,1,1-2-2A2,2,0,0,1,309,361.88Z"/><path class="cls-5" d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-6" 
d="M311.29,361.88a2,2,0,1,1-2-2A2,2,0,0,1,311.29,361.88Z"/><path class="cls-5" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-6" d="M313.57,361.88a2,2,0,1,1-2-2A2,2,0,0,1,313.57,361.88Z"/><path class="cls-5" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-6" d="M315.85,361.88a2,2,0,1,1-2-2A2,2,0,0,1,315.85,361.88Z"/><path class="cls-5" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><path class="cls-6" d="M318.13,361.88a2,2,0,1,1-2-2A2,2,0,0,1,318.13,361.88Z"/><circle class="cls-5" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-6" cx="318.44" cy="361.76" r="1.98" transform="translate(-34.1 33.14) rotate(-5.65)"/><circle class="cls-5" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-6" cx="320.72" cy="361.76" r="1.98" transform="translate(-34.09 33.36) rotate(-5.65)"/><circle class="cls-5" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-6" cx="323" cy="361.76" r="1.98" transform="translate(-34.07 33.59) rotate(-5.65)"/><circle class="cls-5" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><circle class="cls-6" cx="325.28" cy="361.76" r="1.98" transform="translate(-34.06 33.81) rotate(-5.65)"/><path class="cls-5" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-6" d="M329.66,361.76a2,2,0,1,1-2-2A2,2,0,0,1,329.66,361.76Z"/><path class="cls-5" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-6" d="M331.94,361.76a2,2,0,1,1-2-2A2,2,0,0,1,331.94,361.76Z"/><path class="cls-5" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-6" d="M334.22,361.64a2,2,0,1,1-2-2A2,2,0,0,1,334.22,361.64Z"/><path class="cls-5" d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-6" 
d="M336.5,361.64a2,2,0,1,1-2-2A2,2,0,0,1,336.5,361.64Z"/><path class="cls-5" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-6" d="M338.78,361.64a2,2,0,1,1-2-2A2,2,0,0,1,338.78,361.64Z"/><path class="cls-5" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-6" d="M341.06,361.52a2,2,0,1,1-2-2A2,2,0,0,1,341.06,361.52Z"/><path class="cls-5" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-6" d="M343.33,361.52a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.52Z"/><path class="cls-5" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-6" d="M345.61,361.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.52Z"/><path class="cls-5" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-6" d="M347.89,361.52a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.52Z"/><path class="cls-5" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-6" d="M350.17,361.4a2,2,0,1,1-2-2A2,2,0,0,1,350.17,361.4Z"/><path class="cls-5" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-6" d="M352.57,361.4a2,2,0,1,1-2-2A2,2,0,0,1,352.57,361.4Z"/><path class="cls-5" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-6" d="M354.85,361.4a2,2,0,1,1-2-2A2,2,0,0,1,354.85,361.4Z"/><path class="cls-5" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><path class="cls-6" d="M357.13,361.4a2,2,0,1,1-2-2A2,2,0,0,1,357.13,361.4Z"/><circle class="cls-5" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-6" cx="357.44" cy="361.28" r="1.98" transform="translate(-28.27 692.2) rotate(-85.93)"/><circle class="cls-5" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-6" cx="359.72" cy="361.28" r="1.98" transform="translate(-26.15 694.47) rotate(-85.93)"/><circle class="cls-5" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) 
rotate(-85.93)"/><circle class="cls-6" cx="362" cy="361.28" r="1.98" transform="translate(-24.03 696.75) rotate(-85.93)"/><circle class="cls-5" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-6" cx="364.28" cy="361.28" r="1.98" transform="translate(-21.91 699.02) rotate(-85.93)"/><circle class="cls-5" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-6" cx="366.56" cy="361.16" r="1.98" transform="translate(-33.8 37.88) rotate(-5.65)"/><circle class="cls-5" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-6" cx="368.83" cy="361.16" r="1.98" transform="translate(-33.79 38.1) rotate(-5.65)"/><circle class="cls-5" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><circle class="cls-6" cx="371.11" cy="361.16" r="1.98" transform="translate(-33.78 38.33) rotate(-5.65)"/><path class="cls-5" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-6" d="M375.5,361.16a2,2,0,1,1-2-2A2,2,0,0,1,375.5,361.16Z"/><path class="cls-5" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-6" d="M377.78,361a2,2,0,1,1-2-2A2,2,0,0,1,377.78,361Z"/><path class="cls-5" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-6" d="M380.06,361a2,2,0,1,1-2-2A2,2,0,0,1,380.06,361Z"/><path class="cls-5" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-6" d="M382.33,361a2,2,0,1,1-2-2A2,2,0,0,1,382.33,361Z"/><path class="cls-5" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-6" d="M384.61,360.92a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.92Z"/><path class="cls-5" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-6" d="M386.89,360.92a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.92Z"/><path class="cls-5" d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-6" 
d="M389.17,360.92a2,2,0,1,1-2-2A2,2,0,0,1,389.17,360.92Z"/><path class="cls-5" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-6" d="M391.45,360.8a2,2,0,1,1-2-2A2,2,0,0,1,391.45,360.8Z"/><path class="cls-5" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-6" d="M393.73,360.8a2,2,0,1,1-2-2A2,2,0,0,1,393.73,360.8Z"/><path class="cls-5" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><path class="cls-6" d="M396,360.68a2,2,0,1,1-2-2A2,2,0,0,1,396,360.68Z"/><circle class="cls-5" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-6" cx="396.44" cy="360.68" r="1.98" transform="translate(8.56 730.54) rotate(-85.93)"/><circle class="cls-5" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-6" cx="398.72" cy="360.68" r="1.98" transform="translate(10.68 732.82) rotate(-85.93)"/><circle class="cls-5" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-6" cx="401" cy="360.56" r="1.98" transform="translate(12.92 734.98) rotate(-85.93)"/><circle class="cls-5" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-6" cx="403.28" cy="360.56" r="1.98" transform="translate(15.04 737.26) rotate(-85.93)"/><circle class="cls-5" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-6" cx="405.56" cy="360.56" r="1.98" transform="translate(17.16 739.53) rotate(-85.93)"/><circle class="cls-5" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-6" cx="407.83" cy="360.44" r="1.98" transform="translate(-33.53 41.94) rotate(-5.65)"/><circle class="cls-5" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) rotate(-5.65)"/><circle class="cls-6" cx="410.11" cy="360.44" r="1.98" transform="translate(-33.52 42.17) 
rotate(-5.65)"/><path class="cls-5" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-6" d="M414.38,360.32a2,2,0,1,1-2-2A2,2,0,0,1,414.38,360.32Z"/><path class="cls-5" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-6" d="M416.66,360.32a2,2,0,1,1-2-2A2,2,0,0,1,416.66,360.32Z"/><path class="cls-5" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-6" d="M418.94,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.94,360.2Z"/><path class="cls-5" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-6" d="M421.33,360.2a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.2Z"/><path class="cls-5" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-6" d="M423.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.2Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-5" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-6" d="M428.17,360.08a2,2,0,1,1-2-2A2,2,0,0,1,428.17,360.08Z"/><path class="cls-5" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-6" d="M430.45,360a2,2,0,1,1-2-2A2,2,0,0,1,430.45,360Z"/><path class="cls-5" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-6" d="M432.73,360a2,2,0,1,1-2-2A2,2,0,0,1,432.73,360Z"/><path class="cls-5" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-6" d="M435,359.84a2,2,0,1,1-2-2A2,2,0,0,1,435,359.84Z"/><path class="cls-5" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-6" d="M437.29,359.84a2,2,0,1,1-2-2A2,2,0,0,1,437.29,359.84Z"/><path class="cls-5" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-6" d="M439.57,359.72a2,2,0,1,1-2-2A2,2,0,0,1,439.57,359.72Z"/><path class="cls-5" d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><path class="cls-6" 
d="M441.85,359.72a2,2,0,1,1-2-2A2,2,0,0,1,441.85,359.72Z"/><circle class="cls-5" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-6" cx="442.28" cy="359.6" r="1.98" transform="translate(52.23 775.27) rotate(-85.93)"/><circle class="cls-5" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-6" cx="444.56" cy="359.48" r="1.98" transform="translate(-33.26 45.55) rotate(-5.65)"/><circle class="cls-5" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-6" cx="446.83" cy="359.48" r="1.98" transform="translate(-33.25 45.78) rotate(-5.65)"/><circle class="cls-5" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><circle class="cls-6" cx="449.11" cy="359.36" r="1.98" transform="translate(58.83 781.87) rotate(-85.93)"/><path class="cls-5" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-6" d="M453.38,359.36a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.36Z"/><path class="cls-5" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><path class="cls-6" d="M455.66,359.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,359.24Z"/><circle class="cls-5" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><circle class="cls-6" cx="455.95" cy="359.12" r="1.98" transform="translate(76.55 798.33) rotate(-87.4)"/><path class="cls-5" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><path class="cls-6" d="M460.22,359.12a2,2,0,1,1-2-2A2,2,0,0,1,460.22,359.12Z"/><circle class="cls-5" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><circle class="cls-6" cx="460.51" cy="359" r="1.98" transform="translate(-19.15 26.02) rotate(-3.17)"/><path class="cls-5" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-6" d="M464.78,359a2,2,0,1,1-2-2A2,2,0,0,1,464.78,359Z"/><path class="cls-5" 
d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-6" d="M467.18,358.88a2,2,0,1,1-2-2A2,2,0,0,1,467.18,358.88Z"/><path class="cls-5" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-6" d="M469.45,358.76a2,2,0,1,1-2-2A2,2,0,0,1,469.45,358.76Z"/><path class="cls-5" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><path class="cls-6" d="M471.74,358.76a2,2,0,1,1-2-2A2,2,0,0,1,471.74,358.76Z"/><circle class="cls-5" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><circle class="cls-6" cx="472.04" cy="358.64" r="1.98" transform="translate(-19.11 26.65) rotate(-3.17)"/><path class="cls-5" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><path class="cls-6" d="M476.3,358.52a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.52Z"/><circle class="cls-5" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><circle class="cls-6" cx="476.6" cy="358.4" r="1.98" transform="translate(96.97 818.26) rotate(-87.4)"/><path class="cls-5" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><path class="cls-6" d="M480.86,358.4a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.4Z"/><circle class="cls-5" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><circle class="cls-6" cx="481.16" cy="358.28" r="1.98" transform="translate(-19.08 27.16) rotate(-3.17)"/><path class="cls-5" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><path class="cls-6" d="M485.42,358.16a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.16Z"/><circle class="cls-5" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><circle class="cls-6" cx="485.72" cy="358.04" r="1.98" transform="translate(106.04 827.03) rotate(-87.4)"/><path class="cls-5" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><path class="cls-6" d="M490,357.92a2,2,0,1,1-2-2A2,2,0,0,1,490,357.92Z"/><circle class="cls-5" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 
27.67) rotate(-3.17)"/><circle class="cls-6" cx="490.39" cy="357.92" r="1.98" transform="translate(-19.04 27.67) rotate(-3.17)"/><path class="cls-5" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><path class="cls-6" d="M494.66,357.8a2,2,0,1,1-2-2A2,2,0,0,1,494.66,357.8Z"/><circle class="cls-5" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><circle class="cls-6" cx="494.95" cy="357.68" r="1.98" transform="translate(-19.02 27.92) rotate(-3.17)"/><path class="cls-5" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><path class="cls-6" d="M499.22,357.56a2,2,0,1,1-2-2A2,2,0,0,1,499.22,357.56Z"/><circle class="cls-5" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><circle class="cls-6" cx="499.51" cy="357.44" r="1.98" transform="translate(119.81 840.24) rotate(-87.4)"/><path class="cls-5" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-6" d="M503.78,357.32a2,2,0,1,1-2-2A2,2,0,0,1,503.78,357.32Z"/><path class="cls-5" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-6" d="M506.06,357.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,357.2Z"/><path class="cls-5" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-6" d="M508.33,357.08a2,2,0,1,1-2-2A2,2,0,0,1,508.33,357.08Z"/><path class="cls-5" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-6" d="M510.62,357a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357Z"/><path class="cls-5" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-6" d="M512.89,356.84a2,2,0,1,1-2-2A2,2,0,0,1,512.89,356.84Z"/><path class="cls-5" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><path class="cls-6" d="M515.3,356.72a2,2,0,1,1-2-2A2,2,0,0,1,515.3,356.72Z"/><circle class="cls-5" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.6" cy="356.6" r="1.98" transform="translate(-18.93 29.06) 
rotate(-3.17)"/><path class="cls-5" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><path class="cls-6" d="M519.86,356.48a2,2,0,1,1-2-2A2,2,0,0,1,519.86,356.48Z"/><circle class="cls-5" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.16" cy="356.36" r="1.98" transform="translate(-18.91 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><path class="cls-6" d="M524.42,356.24a2,2,0,1,1-2-2A2,2,0,0,1,524.42,356.24Z"/><circle class="cls-5" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><circle class="cls-6" cx="524.72" cy="356.12" r="1.98" transform="translate(145.19 864.15) rotate(-87.4)"/><path class="cls-5" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><path class="cls-6" d="M529,356a2,2,0,1,1-2-2A2,2,0,0,1,529,356Z"/><circle class="cls-5" cx="529.28" cy="355.88" r="1.98"/><circle class="cls-6" cx="529.28" cy="355.88" r="1.98"/><path class="cls-5" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><path class="cls-6" d="M533.54,355.76a2,2,0,1,1-2-2A2,2,0,0,1,533.54,355.76Z"/><circle class="cls-5" cx="533.83" cy="355.52" r="1.98"/><circle class="cls-6" cx="533.83" cy="355.52" r="1.98"/><path class="cls-5" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><path class="cls-6" d="M538.22,355.4a2,2,0,1,1-2-2A2,2,0,0,1,538.22,355.4Z"/><circle class="cls-5" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><circle class="cls-6" cx="538.51" cy="355.28" r="1.98" transform="translate(-18.82 30.32) rotate(-3.17)"/><path class="cls-5" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-6" d="M542.78,355.16a2,2,0,1,1-2-2A2,2,0,0,1,542.78,355.16Z"/><path class="cls-5" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-6" d="M545.06,354.92a2,2,0,1,1-2-2A2,2,0,0,1,545.06,354.92Z"/><path class="cls-5" 
d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-6" d="M547.33,354.8a2,2,0,1,1-2-2A2,2,0,0,1,547.33,354.8Z"/><path class="cls-5" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-6" d="M549.62,354.68a2,2,0,1,1-2-2A2,2,0,0,1,549.62,354.68Z"/><path class="cls-5" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-6" d="M551.89,354.44a2,2,0,1,1-2-2A2,2,0,0,1,551.89,354.44Z"/><path class="cls-5" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-6" d="M554.18,354.32a2,2,0,1,1-2-2A2,2,0,0,1,554.18,354.32Z"/><path class="cls-5" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-6" d="M556.45,354.2a2,2,0,1,1-2-2A2,2,0,0,1,556.45,354.2Z"/><path class="cls-5" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><path class="cls-6" d="M558.74,354a2,2,0,1,1-2-2A2,2,0,0,1,558.74,354Z"/><circle class="cls-5" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><circle class="cls-6" cx="559.16" cy="353.84" r="1.98" transform="translate(180.35 896.38) rotate(-87.4)"/><path class="cls-5" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><path class="cls-6" d="M563.42,353.6a2,2,0,1,1-2-2A2,2,0,0,1,563.42,353.6Z"/><circle class="cls-5" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><circle class="cls-6" cx="563.72" cy="353.48" r="1.98" transform="translate(185.06 900.59) rotate(-87.4)"/><path class="cls-5" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><path class="cls-6" d="M568,353.24a2,2,0,1,1-2-2A2,2,0,0,1,568,353.24Z"/><circle class="cls-5" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><circle class="cls-6" cx="568.28" cy="353" r="1.98" transform="translate(-18.65 31.97) rotate(-3.17)"/><path class="cls-5" d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><path class="cls-6" 
d="M572.54,352.88a2,2,0,1,1-2-2A2,2,0,0,1,572.54,352.88Z"/><circle class="cls-5" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><circle class="cls-6" cx="572.83" cy="352.64" r="1.98" transform="translate(-18.62 32.22) rotate(-3.17)"/><path class="cls-5" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><path class="cls-6" d="M577.1,352.4a2,2,0,1,1-2-2A2,2,0,0,1,577.1,352.4Z"/><circle class="cls-5" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><circle class="cls-6" cx="577.39" cy="352.28" r="1.98" transform="translate(-18.6 32.47) rotate(-3.17)"/><path class="cls-5" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-6" d="M581.66,352a2,2,0,1,1-2-2A2,2,0,0,1,581.66,352Z"/><path class="cls-5" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-6" d="M584.06,351.8a2,2,0,1,1-2-2A2,2,0,0,1,584.06,351.8Z"/><path class="cls-5" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-6" d="M586.33,351.56a2,2,0,1,1-2-2A2,2,0,0,1,586.33,351.56Z"/><path class="cls-5" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-6" d="M588.62,351.32a2,2,0,1,1-2-2A2,2,0,0,1,588.62,351.32Z"/><path class="cls-5" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-6" d="M590.89,351.08a2,2,0,1,1-2-2A2,2,0,0,1,590.89,351.08Z"/><path class="cls-5" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-6" d="M593.18,350.84a2,2,0,1,1-2-2A2,2,0,0,1,593.18,350.84Z"/><path class="cls-5" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-6" d="M595.45,350.6a2,2,0,1,1-2-2A2,2,0,0,1,595.45,350.6Z"/><path class="cls-5" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><path class="cls-6" d="M597.74,350.36a2,2,0,1,1-2-2A2,2,0,0,1,597.74,350.36Z"/><circle class="cls-5" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><circle 
class="cls-6" cx="598.04" cy="350.12" r="1.98" transform="translate(221.18 931.67) rotate(-87.4)"/><path class="cls-5" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><path class="cls-6" d="M602.3,349.88a2,2,0,1,1-2-2A2,2,0,0,1,602.3,349.88Z"/><circle class="cls-5" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><circle class="cls-6" cx="602.6" cy="349.64" r="1.98" transform="translate(-18.41 33.86) rotate(-3.17)"/><path class="cls-5" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><path class="cls-6" d="M607,349.4a2,2,0,1,1-2-2A2,2,0,0,1,607,349.4Z"/><circle class="cls-5" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><circle class="cls-6" cx="607.28" cy="349.16" r="1.98" transform="translate(230.96 939.98) rotate(-87.4)"/><path class="cls-5" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><path class="cls-6" d="M611.54,348.8a2,2,0,1,1-2-2A2,2,0,0,1,611.54,348.8Z"/><circle class="cls-5" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><circle class="cls-6" cx="611.83" cy="348.56" r="1.98" transform="translate(-18.34 34.37) rotate(-3.17)"/><path class="cls-5" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><path class="cls-6" d="M616.1,348.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,348.32Z"/><circle class="cls-5" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><circle class="cls-6" cx="616.39" cy="347.96" r="1.98" transform="translate(-18.3 34.62) rotate(-3.17)"/><path class="cls-5" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><path class="cls-6" d="M620.66,347.72a2,2,0,1,1-2-2A2,2,0,0,1,620.66,347.72Z"/><circle class="cls-5" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><circle class="cls-6" cx="620.95" cy="347.36" r="1.98" transform="translate(-18.26 34.87) rotate(-3.17)"/><path class="cls-5" 
d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><path class="cls-6" d="M625.22,347.12a2,2,0,1,1-2-2A2,2,0,0,1,625.22,347.12Z"/><circle class="cls-5" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><circle class="cls-6" cx="625.51" cy="346.76" r="1.98" transform="translate(250.77 955.91) rotate(-87.4)"/><path class="cls-5" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-6" d="M629.89,346.4a2,2,0,1,1-2-2A2,2,0,0,1,629.89,346.4Z"/><path class="cls-5" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-6" d="M632.18,346.16a2,2,0,1,1-2-2A2,2,0,0,1,632.18,346.16Z"/><path class="cls-5" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-6" d="M634.45,345.8a2,2,0,1,1-2-2A2,2,0,0,1,634.45,345.8Z"/><path class="cls-5" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><path class="cls-6" d="M636.74,345.44a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.44Z"/><circle class="cls-5" cx="637.04" cy="345.08" r="1.98"/><circle class="cls-6" cx="637.04" cy="345.08" r="1.98"/><path class="cls-5" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><path class="cls-6" d="M641.3,344.72a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.72Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.25 57.26 363.25 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.01 73.33 363.01 75.61 363.01 77.89 363.01 80.17 362.89 82.45 362.89 84.73 362.89 87.02 362.77 89.3 362.77 91.58 362.77 93.86 362.65 96.25 362.65 98.53 362.65 100.81 362.54 103.09 362.54 105.38 362.54 107.66 362.42 109.94 362.42 112.22 362.42 114.5 362.3 116.78 362.3 119.17 362.18 121.45 362.18 123.73 362.18 126.02 362.06 128.29 362.06 130.57 362.06 132.85 361.94 135.13 361.94 137.41 361.94 139.69 361.81 142.09 361.81 144.38 361.69 146.66 361.69 148.94 361.69 151.22 361.57 153.5 361.57 155.78 361.45 158.06 361.45 160.34 361.45 162.62 361.33 165.01 361.33 167.29 361.33 169.57 361.21 171.85 361.21 
174.13 361.1 176.41 361.1 178.69 361.1 180.97 360.98 183.25 360.98 185.53 360.98 187.94 360.86 190.22 360.86 192.5 360.74 194.78 360.74 197.06 360.74 199.34 360.62 201.62 360.62 203.9 360.5 206.18 360.5 208.46 360.5 210.85 360.38 213.13 360.38 215.41 360.38 217.69 360.25 219.97 360.25 222.25 360.25 224.53 360.13 226.81 360.13 229.09 360.01 231.38 360.01 233.78 360.01 236.06 359.89 238.34 359.89 240.62 359.89 242.9 359.77 245.18 359.77 247.46 359.65 249.74 359.65 252.01 359.65 254.29 359.54 256.69 359.54 258.98 359.54 261.25 359.42 263.54 359.42 265.81 359.42 268.1 359.3 270.38 359.3 272.65 359.18 274.94 359.18 277.21 359.06 279.62 358.94 281.89 358.94 284.18 358.81 286.45 358.69 288.74 358.69 291.01 358.57 293.3 358.45 295.57 358.45 297.86 358.33 300.13 358.21 302.42 358.21 304.81 358.1 307.1 357.98 309.38 357.98 311.65 357.86 313.94 357.74 316.21 357.74 318.5 357.62 320.77 357.5 323.06 357.38 325.33 357.38 327.74 357.25 330.01 357.13 332.3 357.01 334.57 356.89 336.86 356.77 339.13 356.65 341.42 356.54 343.69 356.42 345.98 356.3 348.25 356.18 350.65 356.06 352.94 355.94 355.21 355.81 357.5 355.69 359.77 355.57 362.06 355.45 364.33 355.33 366.62 355.21 368.89 355.1 371.18 354.98 373.57 354.74 375.86 354.62 378.13 354.5 380.42 354.25 382.69 354.13 384.98 354.01 387.25 353.77 389.54 353.65 391.81 353.54 394.1 353.3 396.5 353.18 398.77 352.94 401.06 352.81 403.33 352.57 405.62 352.33 407.89 352.21 410.18 351.98 412.45 351.74 414.74 351.62 417.01 351.38 419.42 351.13 421.69 351.01 423.98 350.77 426.25 350.54 428.54 350.3 430.81 349.94 433.1 349.69 435.38 349.45 437.65 349.21 439.94 348.98 442.33 348.74 444.62 348.5 446.89 348.25 449.18 347.89 451.45 347.65 453.74 347.3 456.01 347.06 458.3 346.81 460.57 346.45 462.86 346.1 465.25 345.86 467.54 345.5 469.81 345.13 472.1 344.89 474.38 344.54 476.65 344.06 478.94 343.69 481.21 343.33 483.5 342.98 485.77 342.62 488.06 342.25 490.45 341.89 492.74 341.42 495.01 341.06 497.3 340.57 499.57 340.21 501.86 339.74 504.13 339.25 
506.42 338.89 508.69 338.42 510.98 337.94 513.38 337.45 515.65 336.98 517.93 336.5 520.22 335.89 522.5 335.42 524.77 334.94 527.05 334.33 529.34 333.74 531.62 333.25 533.89 332.65 536.29 332.06 538.58 331.45 540.86 330.86 543.13 330.25 545.41 329.65 547.7 328.94 549.98 328.33 552.25 327.62 554.53 326.89 556.82 326.3 559.22 325.57 561.5 324.86 563.77 324.01 566.05 323.3 568.34 322.57 570.62 321.74 572.89 320.89 575.17 320.18 577.46 319.33 579.74 318.38 582.13 317.54 584.41 316.69 586.7 315.86 588.98 314.89 591.25 313.94 593.53 312.98 595.82 312.01 598.1 310.94 600.38 309.98 602.65 308.89 605.05 307.81 607.34 306.74 609.62 305.65 611.89 304.57 614.17 303.38 616.46 302.18 618.74 301.1 621.01 299.89 623.29 298.57 625.58 297.38 627.98 296.06 630.25 294.74 632.53 293.42 634.82 292.1 637.1 290.65 639.38 289.27"/></g><circle class="cls-8" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.92" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.2" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.76" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.04" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-9" cx="66.32" cy="363.08" r="1.98"/><circle class="cls-8" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-9" cx="68.6" cy="363.08" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.96" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.96" r="1.98"/><circle class="cls-8" cx="77.83" cy="362.96" r="1.98"/><circle class="cls-9" cx="77.83" 
cy="362.96" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.84" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.84" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.84" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.72" r="1.98"/><circle class="cls-8" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-9" cx="89.24" cy="362.72" r="1.98"/><circle class="cls-8" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-9" cx="91.52" cy="362.72" r="1.98"/><circle class="cls-8" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-9" cx="93.8" cy="362.6" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -276.28, 420.2)"/><circle class="cls-8" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.6" r="1.98"/><circle class="cls-8" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-9" cx="98.47" cy="362.6" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.48" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.48" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.48" r="1.98"/><circle class="cls-8" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-9" cx="107.6" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -263.6, 433.72)"/><circle class="cls-8" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-9" cx="109.88" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -261.54, 435.99)"/><circle class="cls-8" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle 
class="cls-9" cx="112.16" cy="362.36" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -259.49, 438.26)"/><circle class="cls-8" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-9" cx="114.44" cy="362.24" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -257.31, 440.42)"/><circle class="cls-8" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-9" cx="116.72" cy="362.24" r="1.98"/><circle class="cls-8" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-9" cx="119.12" cy="362.12" r="1.98"/><circle class="cls-8" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-9" cx="121.4" cy="362.12" r="1.98"/><circle class="cls-8" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-9" cx="123.67" cy="362.12" r="1.98"/><circle class="cls-8" cx="125.95" cy="362" r="1.98"/><circle class="cls-9" cx="125.95" cy="362" r="1.98"/><circle class="cls-8" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-9" cx="128.24" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -244.63, 453.94)"/><circle class="cls-8" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-9" cx="130.52" cy="362" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -242.58, 456.2)"/><circle class="cls-8" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-9" cx="132.8" cy="361.88" r="1.98"/><circle class="cls-8" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-9" cx="135.08" cy="361.88" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.88" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-9" cx="139.64" cy="361.76" r="1.98"/><circle class="cls-8" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.76" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.64" r="1.98"/><circle class="cls-8" cx="146.6" 
cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-9" cx="146.6" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -227.73, 471.88)"/><circle class="cls-8" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-9" cx="148.88" cy="361.64" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -225.67, 474.15)"/><circle class="cls-8" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-9" cx="151.16" cy="361.52" r="1.98"/><circle class="cls-8" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-9" cx="153.44" cy="361.52" r="1.98"/><circle class="cls-8" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-9" cx="155.72" cy="361.4" r="1.98"/><circle class="cls-8" cx="158" cy="361.4" r="1.98"/><circle class="cls-9" cx="158" cy="361.4" r="1.98"/><circle class="cls-8" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-9" cx="160.28" cy="361.4" r="1.98"/><circle class="cls-8" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-9" cx="162.56" cy="361.28" r="1.98"/><circle class="cls-8" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-9" cx="164.95" cy="361.28" r="1.98"/><circle class="cls-8" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-9" cx="167.24" cy="361.28" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -208.76, 492.1)"/><circle class="cls-8" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-9" cx="169.52" cy="361.16" r="1.98"/><circle class="cls-8" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-9" cx="171.8" cy="361.16" r="1.98"/><circle class="cls-8" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-9" cx="174.08" cy="361.04" r="1.98"/><circle class="cls-8" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-9" cx="176.36" cy="361.04" r="1.98"/><circle class="cls-8" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-9" cx="178.64" cy="361.04" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.92" 
r="1.98"/><circle class="cls-9" cx="180.92" cy="360.92" r="1.98"/><circle class="cls-8" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-9" cx="183.19" cy="360.92" r="1.98"/><circle class="cls-8" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-9" cx="185.47" cy="360.92" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-9" cx="187.88" cy="360.8" r="1.98"/><circle class="cls-8" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-9" cx="190.16" cy="360.8" r="1.98"/><circle class="cls-8" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-9" cx="192.44" cy="360.68" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -185.45, 516.63)"/><circle class="cls-8" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-9" cx="194.72" cy="360.68" r="1.98"/><circle class="cls-8" cx="197" cy="360.68" r="1.98"/><circle class="cls-9" cx="197" cy="360.68" r="1.98"/><circle class="cls-8" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-9" cx="199.28" cy="360.56" r="1.98"/><circle class="cls-8" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-9" cx="201.56" cy="360.56" r="1.98" transform="translate(-172.38 536.04) rotate(-85.93)"/><circle class="cls-8" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-9" cx="203.83" cy="360.44" r="1.98" transform="translate(-34.52 21.84) rotate(-5.65)"/><circle class="cls-8" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><circle class="cls-9" cx="206.11" cy="360.44" r="1.98" transform="translate(-34.51 22.06) rotate(-5.65)"/><path class="cls-8" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-9" d="M210.38,360.44a2,2,0,1,1-2-2A2,2,0,0,1,210.38,360.44Z"/><path class="cls-8" d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-9" 
d="M212.78,360.32a2,2,0,1,1-2-2A2,2,0,0,1,212.78,360.32Z"/><path class="cls-8" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-9" d="M215.06,360.32a2,2,0,1,1-2-2A2,2,0,0,1,215.06,360.32Z"/><path class="cls-8" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-9" d="M217.33,360.32a2,2,0,1,1-2-2A2,2,0,0,1,217.33,360.32Z"/><path class="cls-8" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-9" d="M219.61,360.2a2,2,0,1,1-2-2A2,2,0,0,1,219.61,360.2Z"/><path class="cls-8" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-9" d="M221.89,360.2a2,2,0,1,1-2-2A2,2,0,0,1,221.89,360.2Z"/><path class="cls-8" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-9" d="M224.17,360.2a2,2,0,1,1-2-2A2,2,0,0,1,224.17,360.2Z"/><path class="cls-8" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-9" d="M226.45,360.08a2,2,0,1,1-2-2A2,2,0,0,1,226.45,360.08Z"/><path class="cls-8" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-9" d="M228.73,360.08a2,2,0,1,1-2-2A2,2,0,0,1,228.73,360.08Z"/><path class="cls-8" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-9" d="M231,360a2,2,0,1,1-2-2A2,2,0,0,1,231,360Z"/><path class="cls-8" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><path class="cls-9" d="M233.29,360a2,2,0,1,1-2-2A2,2,0,0,1,233.29,360Z"/><circle class="cls-8" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-9" cx="233.72" cy="359.96" r="1.98" transform="translate(-141.9 567.57) rotate(-85.93)"/><circle class="cls-8" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-9" cx="236" cy="359.84" r="1.98" transform="translate(-34.31 25.01) rotate(-5.65)"/><circle class="cls-8" cx="238.28" cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-9" cx="238.28" 
cy="359.84" r="1.98" transform="translate(-34.3 25.23) rotate(-5.65)"/><circle class="cls-8" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-9" cx="240.56" cy="359.84" r="1.98" transform="translate(-34.29 25.45) rotate(-5.65)"/><circle class="cls-8" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-9" cx="242.83" cy="359.72" r="1.98" transform="translate(-34.26 25.68) rotate(-5.65)"/><circle class="cls-8" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><circle class="cls-9" cx="245.11" cy="359.72" r="1.98" transform="translate(-34.25 25.9) rotate(-5.65)"/><path class="cls-8" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-9" d="M249.38,359.6a2,2,0,1,1-2-2A2,2,0,0,1,249.38,359.6Z"/><path class="cls-8" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-9" d="M251.66,359.6a2,2,0,1,1-2-2A2,2,0,0,1,251.66,359.6Z"/><path class="cls-8" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-9" d="M253.94,359.6a2,2,0,1,1-2-2A2,2,0,0,1,253.94,359.6Z"/><path class="cls-8" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-9" d="M256.22,359.48a2,2,0,1,1-2-2A2,2,0,0,1,256.22,359.48Z"/><path class="cls-8" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-9" d="M258.61,359.48a2,2,0,1,1-2-2A2,2,0,0,1,258.61,359.48Z"/><path class="cls-8" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-9" d="M260.89,359.48a2,2,0,1,1-2-2A2,2,0,0,1,260.89,359.48Z"/><path class="cls-8" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-9" d="M263.17,359.36a2,2,0,1,1-2-2A2,2,0,0,1,263.17,359.36Z"/><path class="cls-8" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-9" d="M265.45,359.36a2,2,0,1,1-2-2A2,2,0,0,1,265.45,359.36Z"/><path class="cls-8" 
d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-9" d="M267.73,359.36a2,2,0,1,1-2-2A2,2,0,0,1,267.73,359.36Z"/><path class="cls-8" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-9" d="M270,359.24a2,2,0,1,1-2-2A2,2,0,0,1,270,359.24Z"/><path class="cls-8" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-9" d="M272.29,359.24a2,2,0,1,1-2-2A2,2,0,0,1,272.29,359.24Z"/><path class="cls-8" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-9" d="M274.57,359.12a2,2,0,1,1-2-2A2,2,0,0,1,274.57,359.12Z"/><path class="cls-8" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-9" d="M276.85,359.12a2,2,0,1,1-2-2A2,2,0,0,1,276.85,359.12Z"/><path class="cls-8" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><path class="cls-9" d="M279.13,359a2,2,0,1,1-2-2A2,2,0,0,1,279.13,359Z"/><circle class="cls-8" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-9" cx="279.56" cy="358.88" r="1.98"/><circle class="cls-8" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-9" cx="281.83" cy="358.88" r="1.98"/><circle class="cls-8" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><circle class="cls-9" cx="284.11" cy="358.76" r="1.98" transform="translate(-33.97 29.74) rotate(-5.65)"/><path class="cls-8" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-9" d="M288.38,358.64a2,2,0,1,1-2-2A2,2,0,0,1,288.38,358.64Z"/><path class="cls-8" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-9" d="M290.66,358.64a2,2,0,1,1-2-2A2,2,0,0,1,290.66,358.64Z"/><path class="cls-8" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-9" d="M292.94,358.52a2,2,0,1,1-2-2A2,2,0,0,1,292.94,358.52Z"/><path class="cls-8" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-9" d="M295.22,358.4a2,2,0,1,1-2-2A2,2,0,0,1,295.22,358.4Z"/><path class="cls-8" 
d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-9" d="M297.5,358.4a2,2,0,1,1-2-2A2,2,0,0,1,297.5,358.4Z"/><path class="cls-8" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-9" d="M299.78,358.28a2,2,0,1,1-2-2A2,2,0,0,1,299.78,358.28Z"/><path class="cls-8" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-9" d="M302.06,358.16a2,2,0,1,1-2-2A2,2,0,0,1,302.06,358.16Z"/><path class="cls-8" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-9" d="M304.33,358.16a2,2,0,1,1-2-2A2,2,0,0,1,304.33,358.16Z"/><path class="cls-8" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-9" d="M306.73,358a2,2,0,1,1-2-2A2,2,0,0,1,306.73,358Z"/><path class="cls-8" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-9" d="M309,357.92a2,2,0,1,1-2-2A2,2,0,0,1,309,357.92Z"/><path class="cls-8" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-9" d="M311.29,357.92a2,2,0,1,1-2-2A2,2,0,0,1,311.29,357.92Z"/><path class="cls-8" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-9" d="M313.57,357.8a2,2,0,1,1-2-2A2,2,0,0,1,313.57,357.8Z"/><path class="cls-8" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-9" d="M315.85,357.68a2,2,0,1,1-2-2A2,2,0,0,1,315.85,357.68Z"/><path class="cls-8" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><path class="cls-9" d="M318.13,357.68a2,2,0,1,1-2-2A2,2,0,0,1,318.13,357.68Z"/><circle class="cls-8" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-9" cx="318.44" cy="357.56" r="1.98" transform="translate(-60.79 649.84) rotate(-85.93)"/><circle class="cls-8" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-9" cx="320.72" cy="357.44" r="1.98" transform="translate(-33.66 33.34) rotate(-5.65)"/><circle class="cls-8" cx="323" cy="357.32" 
r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-9" cx="323" cy="357.32" r="1.98" transform="translate(-56.32 654.17) rotate(-85.93)"/><circle class="cls-8" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><circle class="cls-9" cx="325.28" cy="357.32" r="1.98" transform="translate(-54.2 656.44) rotate(-85.93)"/><path class="cls-8" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-9" d="M329.66,357.2a2,2,0,1,1-2-2A2,2,0,0,1,329.66,357.2Z"/><path class="cls-8" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-9" d="M331.94,357.08a2,2,0,1,1-2-2A2,2,0,0,1,331.94,357.08Z"/><path class="cls-8" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-9" d="M334.22,357a2,2,0,1,1-2-2A2,2,0,0,1,334.22,357Z"/><path class="cls-8" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-9" d="M336.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,336.5,356.84Z"/><path class="cls-8" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-9" d="M338.78,356.72a2,2,0,1,1-2-2A2,2,0,0,1,338.78,356.72Z"/><path class="cls-8" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-9" d="M341.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,341.06,356.6Z"/><path class="cls-8" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-9" d="M343.33,356.48a2,2,0,1,1-2-2A2,2,0,0,1,343.33,356.48Z"/><path class="cls-8" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-9" d="M345.61,356.36a2,2,0,1,1-2-2A2,2,0,0,1,345.61,356.36Z"/><path class="cls-8" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-9" d="M347.89,356.24a2,2,0,1,1-2-2A2,2,0,0,1,347.89,356.24Z"/><path class="cls-8" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-9" d="M350.17,356.12a2,2,0,1,1-2-2A2,2,0,0,1,350.17,356.12Z"/><path class="cls-8" 
d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-9" d="M352.57,356a2,2,0,1,1-2-2A2,2,0,0,1,352.57,356Z"/><path class="cls-8" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-9" d="M354.85,355.88a2,2,0,1,1-2-2A2,2,0,0,1,354.85,355.88Z"/><path class="cls-8" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><path class="cls-9" d="M357.13,355.76a2,2,0,1,1-2-2A2,2,0,0,1,357.13,355.76Z"/><circle class="cls-8" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-9" cx="357.44" cy="355.64" r="1.98" transform="translate(-22.64 686.96) rotate(-85.93)"/><circle class="cls-8" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-9" cx="359.72" cy="355.52" r="1.98"/><circle class="cls-8" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-9" cx="362" cy="355.4" r="1.98" transform="translate(-33.26 37.4) rotate(-5.65)"/><circle class="cls-8" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-9" cx="364.28" cy="355.28" r="1.98" transform="translate(-15.93 693.45) rotate(-85.93)"/><circle class="cls-8" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-9" cx="366.56" cy="355.16" r="1.98" transform="translate(-33.21 37.85) rotate(-5.65)"/><circle class="cls-8" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-9" cx="368.83" cy="355.04" r="1.98" transform="translate(-33.19 38.07) rotate(-5.65)"/><circle class="cls-8" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><circle class="cls-9" cx="371.11" cy="354.92" r="1.98" transform="translate(-9.22 699.94) rotate(-85.93)"/><path class="cls-8" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-9" d="M375.5,354.68a2,2,0,1,1-2-2A2,2,0,0,1,375.5,354.68Z"/><path class="cls-8" 
d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-9" d="M377.78,354.56a2,2,0,1,1-2-2A2,2,0,0,1,377.78,354.56Z"/><path class="cls-8" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-9" d="M380.06,354.44a2,2,0,1,1-2-2A2,2,0,0,1,380.06,354.44Z"/><path class="cls-8" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-9" d="M382.33,354.2a2,2,0,1,1-2-2A2,2,0,0,1,382.33,354.2Z"/><path class="cls-8" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-9" d="M384.61,354.08a2,2,0,1,1-2-2A2,2,0,0,1,384.61,354.08Z"/><path class="cls-8" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-9" d="M386.89,354a2,2,0,1,1-2-2A2,2,0,0,1,386.89,354Z"/><path class="cls-8" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-9" d="M389.17,353.72a2,2,0,1,1-2-2A2,2,0,0,1,389.17,353.72Z"/><path class="cls-8" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-9" d="M391.45,353.6a2,2,0,1,1-2-2A2,2,0,0,1,391.45,353.6Z"/><path class="cls-8" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-9" d="M393.73,353.48a2,2,0,1,1-2-2A2,2,0,0,1,393.73,353.48Z"/><path class="cls-8" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><path class="cls-9" d="M396,353.24a2,2,0,1,1-2-2A2,2,0,0,1,396,353.24Z"/><circle class="cls-8" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-9" cx="396.44" cy="353.12" r="1.98" transform="translate(-32.87 40.78) rotate(-5.65)"/><circle class="cls-8" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-9" cx="398.72" cy="352.88" r="1.98"/><circle class="cls-8" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-9" cx="401" cy="352.76" r="1.98" transform="translate(-32.81 41.23) rotate(-5.65)"/><circle class="cls-8" cx="403.28" cy="352.52" r="1.98"/><circle class="cls-9" cx="403.28" 
cy="352.52" r="1.98"/><circle class="cls-8" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-9" cx="405.56" cy="352.28" r="1.98" transform="translate(25.42 731.84) rotate(-85.93)"/><circle class="cls-8" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-9" cx="407.83" cy="352.16" r="1.98" transform="translate(-32.72 41.9) rotate(-5.65)"/><circle class="cls-8" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><circle class="cls-9" cx="410.11" cy="351.92" r="1.98" transform="translate(30.01 736.05) rotate(-85.93)"/><path class="cls-8" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-9" d="M414.38,351.68a2,2,0,1,1-2-2A2,2,0,0,1,414.38,351.68Z"/><path class="cls-8" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-9" d="M416.66,351.56a2,2,0,1,1-2-2A2,2,0,0,1,416.66,351.56Z"/><path class="cls-8" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-9" d="M418.94,351.32a2,2,0,1,1-2-2A2,2,0,0,1,418.94,351.32Z"/><path class="cls-8" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-9" d="M421.33,351.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,351.08Z"/><path class="cls-8" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-9" d="M423.61,351a2,2,0,1,1-2-2A2,2,0,0,1,423.61,351Z"/><path class="cls-8" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-9" d="M425.89,350.72a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.72Z"/><path class="cls-8" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-9" d="M428.17,350.48a2,2,0,1,1-2-2A2,2,0,0,1,428.17,350.48Z"/><path class="cls-8" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-9" d="M430.45,350.24a2,2,0,1,1-2-2A2,2,0,0,1,430.45,350.24Z"/><path class="cls-8" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path 
class="cls-9" d="M432.73,349.88a2,2,0,1,1-2-2A2,2,0,0,1,432.73,349.88Z"/><path class="cls-8" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-9" d="M435,349.64a2,2,0,1,1-2-2A2,2,0,0,1,435,349.64Z"/><path class="cls-8" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-9" d="M437.29,349.4a2,2,0,1,1-2-2A2,2,0,0,1,437.29,349.4Z"/><path class="cls-8" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-9" d="M439.57,349.16a2,2,0,1,1-2-2A2,2,0,0,1,439.57,349.16Z"/><path class="cls-8" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><path class="cls-9" d="M441.85,348.92a2,2,0,1,1-2-2A2,2,0,0,1,441.85,348.92Z"/><circle class="cls-8" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-9" cx="442.28" cy="348.68" r="1.98" transform="translate(63.12 765.12) rotate(-85.93)"/><circle class="cls-8" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-9" cx="444.56" cy="348.44" r="1.98" transform="translate(-32.17 45.5) rotate(-5.65)"/><circle class="cls-8" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-9" cx="446.83" cy="348.2" r="1.98"/><circle class="cls-8" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><circle class="cls-9" cx="449.11" cy="347.84" r="1.98" transform="translate(-32.09 45.95) rotate(-5.65)"/><path class="cls-8" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-9" d="M453.38,347.6a2,2,0,1,1-2-2A2,2,0,0,1,453.38,347.6Z"/><path class="cls-8" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><path class="cls-9" d="M455.66,347.24a2,2,0,1,1-2-2A2,2,0,0,1,455.66,347.24Z"/><circle class="cls-8" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><circle class="cls-9" cx="455.95" cy="347" r="1.98" transform="translate(-18.49 25.75) rotate(-3.17)"/><path class="cls-8" 
d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><path class="cls-9" d="M460.22,346.76a2,2,0,1,1-2-2A2,2,0,0,1,460.22,346.76Z"/><circle class="cls-8" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><circle class="cls-9" cx="460.51" cy="346.4" r="1.98" transform="translate(93.61 790.74) rotate(-87.4)"/><path class="cls-8" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-9" d="M464.78,346a2,2,0,1,1-2-2A2,2,0,0,1,464.78,346Z"/><path class="cls-8" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-9" d="M467.18,345.8a2,2,0,1,1-2-2A2,2,0,0,1,467.18,345.8Z"/><path class="cls-8" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-9" d="M469.45,345.44a2,2,0,1,1-2-2A2,2,0,0,1,469.45,345.44Z"/><path class="cls-8" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><path class="cls-9" d="M471.74,345.08a2,2,0,1,1-2-2A2,2,0,0,1,471.74,345.08Z"/><circle class="cls-8" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><circle class="cls-9" cx="472.04" cy="344.84" r="1.98" transform="translate(106.16 800.76) rotate(-87.4)"/><path class="cls-8" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><path class="cls-9" d="M476.3,344.48a2,2,0,1,1-2-2A2,2,0,0,1,476.3,344.48Z"/><circle class="cls-8" cx="476.6" cy="344" r="1.98"/><circle class="cls-9" cx="476.6" cy="344" r="1.98"/><path class="cls-8" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><path class="cls-9" d="M480.86,343.64a2,2,0,1,1-2-2A2,2,0,0,1,480.86,343.64Z"/><circle class="cls-8" cx="481.16" cy="343.28" r="1.98"/><circle class="cls-9" cx="481.16" cy="343.28" r="1.98"/><path class="cls-8" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><path class="cls-9" d="M485.42,342.92a2,2,0,1,1-2-2A2,2,0,0,1,485.42,342.92Z"/><circle class="cls-8" cx="485.72" cy="342.56" r="1.98"/><circle class="cls-9" cx="485.72" cy="342.56" r="1.98"/><path class="cls-8" 
d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><path class="cls-9" d="M490,342.2a2,2,0,1,1-2-2A2,2,0,0,1,490,342.2Z"/><circle class="cls-8" cx="490.39" cy="341.84" r="1.98"/><circle class="cls-9" cx="490.39" cy="341.84" r="1.98"/><path class="cls-8" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><path class="cls-9" d="M494.66,341.36a2,2,0,1,1-2-2A2,2,0,0,1,494.66,341.36Z"/><circle class="cls-8" cx="494.95" cy="341" r="1.98"/><circle class="cls-9" cx="494.95" cy="341" r="1.98"/><path class="cls-8" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><path class="cls-9" d="M499.22,340.52a2,2,0,1,1-2-2A2,2,0,0,1,499.22,340.52Z"/><circle class="cls-8" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><circle class="cls-9" cx="499.51" cy="340.16" r="1.98" transform="translate(-31.09 50.88) rotate(-5.65)"/><path class="cls-8" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-9" d="M503.78,339.68a2,2,0,1,1-2-2A2,2,0,0,1,503.78,339.68Z"/><path class="cls-8" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-9" d="M506.06,339.2a2,2,0,1,1-2-2A2,2,0,0,1,506.06,339.2Z"/><path class="cls-8" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-9" d="M508.33,338.84a2,2,0,1,1-2-2A2,2,0,0,1,508.33,338.84Z"/><path class="cls-8" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-9" d="M510.62,338.36a2,2,0,1,1-2-2A2,2,0,0,1,510.62,338.36Z"/><path class="cls-8" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-9" d="M512.89,337.88a2,2,0,1,1-2-2A2,2,0,0,1,512.89,337.88Z"/><path class="cls-8" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><path class="cls-9" d="M515.3,337.4a2,2,0,1,1-2-2A2,2,0,0,1,515.3,337.4Z"/><circle class="cls-8" cx="515.6" cy="336.92" r="1.98"/><circle class="cls-9" cx="515.6" cy="336.92" r="1.98"/><path class="cls-8" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><path 
class="cls-9" d="M519.86,336.44a2,2,0,1,1-2-2A2,2,0,0,1,519.86,336.44Z"/><circle class="cls-8" cx="520.16" cy="335.84" r="1.98"/><circle class="cls-9" cx="520.16" cy="335.84" r="1.98"/><path class="cls-8" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><path class="cls-9" d="M524.42,335.36a2,2,0,1,1-2-2A2,2,0,0,1,524.42,335.36Z"/><circle class="cls-8" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><circle class="cls-9" cx="524.72" cy="334.88" r="1.98" transform="translate(153.49 834.53) rotate(-85.93)"/><path class="cls-8" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><path class="cls-9" d="M529,334.28a2,2,0,1,1-2-2A2,2,0,0,1,529,334.28Z"/><circle class="cls-8" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><circle class="cls-9" cx="529.28" cy="333.68" r="1.98" transform="translate(158.92 837.96) rotate(-85.93)"/><path class="cls-8" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><path class="cls-9" d="M533.54,333.2a2,2,0,1,1-2-2A2,2,0,0,1,533.54,333.2Z"/><circle class="cls-8" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><circle class="cls-9" cx="533.83" cy="332.6" r="1.98" transform="translate(164.23 841.51) rotate(-85.93)"/><path class="cls-8" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><path class="cls-9" d="M538.22,332a2,2,0,1,1-2-2A2,2,0,0,1,538.22,332Z"/><circle class="cls-8" cx="538.51" cy="331.4" r="1.98"/><circle class="cls-9" cx="538.51" cy="331.4" r="1.98"/><path class="cls-8" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-9" d="M542.78,330.8a2,2,0,1,1-2-2A2,2,0,0,1,542.78,330.8Z"/><path class="cls-8" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-9" d="M545.06,330.2a2,2,0,1,1-2-2A2,2,0,0,1,545.06,330.2Z"/><path class="cls-8" d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-9" 
d="M547.33,329.6a2,2,0,1,1-2-2A2,2,0,0,1,547.33,329.6Z"/><path class="cls-8" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-9" d="M549.62,328.88a2,2,0,1,1-2-2A2,2,0,0,1,549.62,328.88Z"/><path class="cls-8" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-9" d="M551.89,328.28a2,2,0,1,1-2-2A2,2,0,0,1,551.89,328.28Z"/><path class="cls-8" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-9" d="M554.18,327.56a2,2,0,1,1-2-2A2,2,0,0,1,554.18,327.56Z"/><path class="cls-8" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-9" d="M556.45,326.84a2,2,0,1,1-2-2A2,2,0,0,1,556.45,326.84Z"/><path class="cls-8" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><path class="cls-9" d="M558.74,326.24a2,2,0,1,1-2-2A2,2,0,0,1,558.74,326.24Z"/><circle class="cls-8" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><circle class="cls-9" cx="559.16" cy="325.52" r="1.98" transform="translate(-29.35 56.68) rotate(-5.65)"/><path class="cls-8" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><path class="cls-9" d="M563.42,324.8a2,2,0,1,1-2-2A2,2,0,0,1,563.42,324.8Z"/><circle class="cls-8" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><circle class="cls-9" cx="563.72" cy="323.96" r="1.98" transform="translate(200.61 863.29) rotate(-85.93)"/><path class="cls-8" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><path class="cls-9" d="M568,323.24a2,2,0,1,1-2-2A2,2,0,0,1,568,323.24Z"/><circle class="cls-8" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><circle class="cls-9" cx="568.28" cy="322.52" r="1.98" transform="translate(-29.01 57.57) rotate(-5.65)"/><path class="cls-8" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><path class="cls-9" d="M572.54,321.68a2,2,0,1,1-2-2A2,2,0,0,1,572.54,321.68Z"/><circle class="cls-8" cx="572.83" cy="320.84" 
r="1.98"/><circle class="cls-9" cx="572.83" cy="320.84" r="1.98"/><path class="cls-8" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><path class="cls-9" d="M577.1,320.12a2,2,0,1,1-2-2A2,2,0,0,1,577.1,320.12Z"/><circle class="cls-8" cx="577.39" cy="319.28" r="1.98"/><circle class="cls-9" cx="577.39" cy="319.28" r="1.98"/><path class="cls-8" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-9" d="M581.66,318.32a2,2,0,1,1-2-2A2,2,0,0,1,581.66,318.32Z"/><path class="cls-8" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-9" d="M584.06,317.48a2,2,0,1,1-2-2A2,2,0,0,1,584.06,317.48Z"/><path class="cls-8" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-9" d="M586.33,316.64a2,2,0,1,1-2-2A2,2,0,0,1,586.33,316.64Z"/><path class="cls-8" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-9" d="M588.62,315.8a2,2,0,1,1-2-2A2,2,0,0,1,588.62,315.8Z"/><path class="cls-8" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-9" d="M590.89,314.84a2,2,0,1,1-2-2A2,2,0,0,1,590.89,314.84Z"/><path class="cls-8" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-9" d="M593.18,313.88a2,2,0,1,1-2-2A2,2,0,0,1,593.18,313.88Z"/><path class="cls-8" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-9" d="M595.45,312.92a2,2,0,1,1-2-2A2,2,0,0,1,595.45,312.92Z"/><path class="cls-8" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><path class="cls-9" d="M597.74,312a2,2,0,1,1-2-2A2,2,0,0,1,597.74,312Z"/><circle class="cls-8" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><circle class="cls-9" cx="598.04" cy="310.88" r="1.98" transform="translate(245.55 885.37) rotate(-85.93)"/><path class="cls-8" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><path class="cls-9" d="M602.3,309.92a2,2,0,1,1-2-2A2,2,0,0,1,602.3,309.92Z"/><circle class="cls-8" cx="602.6" cy="308.84" 
r="1.98"/><circle class="cls-9" cx="602.6" cy="308.84" r="1.98"/><path class="cls-8" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><path class="cls-9" d="M607,307.76a2,2,0,1,1-2-2A2,2,0,0,1,607,307.76Z"/><circle class="cls-8" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><circle class="cls-9" cx="607.28" cy="306.68" r="1.98" transform="translate(258.32 890.68) rotate(-85.93)"/><path class="cls-8" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><path class="cls-9" d="M611.54,305.6a2,2,0,1,1-2-2A2,2,0,0,1,611.54,305.6Z"/><circle class="cls-8" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><circle class="cls-9" cx="611.83" cy="304.52" r="1.98" transform="translate(-27.03 61.77) rotate(-5.65)"/><path class="cls-8" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><path class="cls-9" d="M616.1,303.32a2,2,0,1,1-2-2A2,2,0,0,1,616.1,303.32Z"/><circle class="cls-8" cx="616.39" cy="302.12" r="1.98"/><circle class="cls-9" cx="616.39" cy="302.12" r="1.98"/><path class="cls-8" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><path class="cls-9" d="M620.66,301a2,2,0,1,1-2-2A2,2,0,0,1,620.66,301Z"/><circle class="cls-8" cx="620.95" cy="299.84" r="1.98"/><circle class="cls-9" cx="620.95" cy="299.84" r="1.98"/><path class="cls-8" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><path class="cls-9" d="M625.22,298.52a2,2,0,1,1-2-2A2,2,0,0,1,625.22,298.52Z"/><circle class="cls-8" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><circle class="cls-9" cx="625.51" cy="297.32" r="1.98" transform="translate(284.61 900.18) rotate(-85.93)"/><path class="cls-8" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-9" d="M629.89,296a2,2,0,1,1-2-2A2,2,0,0,1,629.89,296Z"/><path class="cls-8" d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-9" 
d="M632.18,294.68a2,2,0,1,1-2-2A2,2,0,0,1,632.18,294.68Z"/><path class="cls-8" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-9" d="M634.45,293.36a2,2,0,1,1-2-2A2,2,0,0,1,634.45,293.36Z"/><path class="cls-8" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><path class="cls-9" d="M636.74,292a2,2,0,1,1-2-2A2,2,0,0,1,636.74,292Z"/><circle class="cls-8" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><circle class="cls-9" cx="637.04" cy="290.6" r="1.98" transform="translate(302.01 905.43) rotate(-85.93)"/><path class="cls-8" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><path class="cls-9" d="M641.3,289.28a2,2,0,1,1-2-2A2,2,0,0,1,641.3,289.28Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.25 57.26 363.13 59.53 363.13 61.81 363.01 64.09 362.89 66.38 362.89 68.66 362.77 71.06 362.65 73.33 362.54 75.61 362.42 77.89 362.3 80.17 362.18 82.45 362.06 84.73 361.94 87.02 361.69 89.3 361.57 91.58 361.45 93.86 361.33 96.25 361.1 98.53 360.98 100.81 360.74 103.09 360.62 105.38 360.5 107.66 360.25 109.94 360.13 112.22 359.89 114.5 359.77 116.78 359.54 119.17 359.42 121.45 359.18 123.73 359.06 126.02 358.81 128.29 358.69 130.57 358.45 132.85 358.33 135.13 358.1 137.41 357.98 139.69 357.74 142.09 357.5 144.38 357.38 146.66 357.13 148.94 357.01 151.22 356.77 153.5 356.65 155.78 356.42 158.06 356.18 160.34 356.06 162.62 355.81 165.01 355.69 167.29 355.45 169.57 355.33 171.85 355.1 174.13 354.86 176.41 354.74 178.69 354.5 180.97 354.38 183.25 354.13 185.53 354.01 187.94 353.77 190.22 353.65 192.5 353.42 194.78 353.18 197.06 353.06 199.34 352.81 201.62 352.69 203.9 352.45 206.18 352.33 208.46 352.1 210.85 351.98 213.13 351.74 215.41 351.62 217.69 351.38 219.97 351.25 222.25 351.01 224.53 350.89 226.81 350.65 229.09 350.54 231.38 350.3 233.78 350.18 236.06 349.94 238.34 349.81 240.62 349.57 242.9 349.45 245.18 349.21 247.46 349.1 249.74 348.86 252.01 348.74 254.29 348.5 256.69 348.38 258.98 
348.25 261.25 348.01 263.54 347.89 265.81 347.65 268.1 347.54 270.38 347.3 272.65 347.18 274.94 346.81 277.21 346.45 279.62 346.21 281.89 345.86 284.18 345.5 286.45 345.13 288.74 344.77 291.01 344.54 293.3 344.18 295.57 343.81 297.86 343.45 300.13 343.21 302.42 342.86 304.81 342.5 307.1 342.13 309.38 341.89 311.65 341.54 313.94 341.18 316.21 340.81 318.5 340.57 320.77 340.21 323.06 339.86 325.33 339.5 327.74 339.25 330.01 338.89 332.3 338.42 334.57 337.94 336.86 337.45 339.13 336.98 341.42 336.5 343.69 335.89 345.98 335.42 348.25 334.94 350.65 334.45 352.94 333.98 355.21 333.5 357.5 333.01 359.77 332.54 362.06 332.06 364.33 331.57 366.62 331.1 368.89 330.62 371.18 329.89 373.57 329.3 375.86 328.57 378.13 327.98 380.42 327.25 382.69 326.65 384.98 326.06 387.25 325.33 389.54 324.74 391.81 324.01 394.1 323.42 396.5 322.69 398.77 322.1 401.06 321.25 403.33 320.54 405.62 319.69 407.89 318.86 410.18 318.01 412.45 317.3 414.74 316.45 417.01 315.62 419.42 314.77 421.69 313.94 423.98 312.98 426.25 312.01 428.54 311.06 430.81 310.1 433.1 309.13 435.38 308.18 437.65 307.21 439.94 306.13 442.33 305.18 444.62 304.1 446.89 302.89 449.18 301.81 451.45 300.74 453.74 299.54 456.01 298.45 458.3 297.25 460.57 295.94 462.86 294.74 465.25 293.42 467.54 292.1 469.81 290.77 472.1 289.45 474.38 288.01 476.65 286.57 478.94 285.13 481.21 283.69 483.5 282.25 485.77 280.69 488.06 279.13 490.45 277.45 492.74 275.89 495.01 274.21 497.3 272.65 499.57 270.86 501.86 269.06 504.13 267.25 506.42 265.45 508.69 263.77 510.98 261.74 513.38 259.81 515.65 257.89 517.93 255.97 520.22 253.81 522.5 251.78 524.77 249.62 527.05 247.57 529.34 245.29 531.62 243.01 533.89 240.74 536.29 238.46 538.58 236.06 540.86 233.53 543.13 231.13 545.41 228.62 547.7 225.97 549.98 223.34 552.25 220.69 554.53 217.94 556.82 215.18 559.22 212.18 561.5 209.29 563.77 206.41 566.05 203.29 568.34 200.18 570.62 197.18 572.89 193.94 575.17 190.69 577.46 187.22 579.74 183.85 582.13 180.5 584.41 176.9 586.7 173.29 588.98 169.57 591.25 
165.85 593.53 162.01 595.82 158.06 598.1 154.09 600.38 150.01 602.65 145.81 605.05 141.62 607.34 137.18 609.62 132.85 611.89 128.29 614.17 123.73 616.46 119.06 618.74 114.38 621.01 109.58 623.29 104.66 625.58 99.61 627.98 94.45 630.25 89.3 632.53 83.89 634.82 78.38 637.1 72.86 639.38 67.25"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.26" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-12" cx="59.54" cy="363.14" r="2.52"/><circle class="cls-11" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-12" cx="61.82" cy="363.02" r="2.52"/><circle class="cls-11" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-12" cx="64.1" cy="362.9" r="2.52"/><circle class="cls-11" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-12" cx="66.38" cy="362.9" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.78" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.66" r="2.52"/><circle class="cls-11" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-12" cx="73.33" cy="362.54" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.42" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-12" cx="77.9" cy="362.3" r="2.52" transform="translate(-290.31 404.11) rotate(-84.35)"/><circle class="cls-11" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-12" cx="80.17" cy="362.18" r="2.52"/><circle class="cls-11" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-12" cx="82.45" cy="362.06" r="2.52"/><circle class="cls-11" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-12" cx="84.74" cy="361.94" r="2.52"/><circle class="cls-11" cx="87.02" 
cy="361.7" r="2.52"/><circle class="cls-12" cx="87.02" cy="361.7" r="2.52"/><circle class="cls-11" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-12" cx="89.3" cy="361.58" r="2.52"/><circle class="cls-11" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-12" cx="91.58" cy="361.46" r="2.52"/><circle class="cls-11" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-12" cx="93.86" cy="361.34" r="2.52"/><circle class="cls-11" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-12" cx="96.26" cy="361.1" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-12" cx="98.53" cy="360.98" r="2.52" transform="translate(-270.39 423.46) rotate(-84.35)"/><circle class="cls-11" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.74" r="2.52"/><circle class="cls-11" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-12" cx="103.1" cy="360.62" r="2.52"/><circle class="cls-11" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-12" cx="105.38" cy="360.5" r="2.52"/><circle class="cls-11" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-12" cx="107.66" cy="360.26" r="2.52"/><circle class="cls-11" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-12" cx="109.94" cy="360.14" r="2.52"/><circle class="cls-11" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-12" cx="112.22" cy="359.9" r="2.52"/><circle class="cls-11" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-12" cx="114.5" cy="359.78" r="2.52"/><circle class="cls-11" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-12" cx="116.78" cy="359.54" r="2.52"/><circle class="cls-11" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-12" cx="119.17" cy="359.42" r="2.52"/><circle class="cls-11" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-12" cx="121.45" cy="359.18" r="2.52"/><circle class="cls-11" cx="123.74" cy="359.06" r="2.52"/><circle class="cls-12" cx="123.74" cy="359.06" r="2.52"/><circle 
class="cls-11" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-12" cx="126.02" cy="358.82" r="2.52"/><circle class="cls-11" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-12" cx="128.3" cy="358.7" r="2.52"/><circle class="cls-11" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-12" cx="130.58" cy="358.46" r="2.52"/><circle class="cls-11" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-12" cx="132.86" cy="358.34" r="2.52"/><circle class="cls-11" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-12" cx="135.14" cy="358.1" r="2.52"/><circle class="cls-11" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-12" cx="137.42" cy="357.98" r="2.52"/><circle class="cls-11" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-12" cx="139.69" cy="357.74" r="2.52"/><circle class="cls-11" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-12" cx="142.1" cy="357.5" r="2.52"/><circle class="cls-11" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-12" cx="144.38" cy="357.38" r="2.52"/><circle class="cls-11" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-12" cx="146.66" cy="357.14" r="2.52"/><circle class="cls-11" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-12" cx="148.94" cy="357.02" r="2.52"/><circle class="cls-11" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-12" cx="151.22" cy="356.78" r="2.52"/><circle class="cls-11" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-12" cx="153.5" cy="356.66" r="2.52"/><circle class="cls-11" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-12" cx="155.78" cy="356.42" r="2.52"/><circle class="cls-11" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-12" cx="158.06" cy="356.18" r="2.52"/><circle class="cls-11" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-12" cx="160.33" cy="356.06" r="2.52"/><circle class="cls-11" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-12" cx="162.62" cy="355.82" r="2.52"/><circle class="cls-11" cx="165.02" cy="355.7" r="2.52"/><circle class="cls-12" 
cx="165.02" cy="355.7" r="2.52"/><circle class="cls-11" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-12" cx="167.3" cy="355.46" r="2.52"/><circle class="cls-11" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-12" cx="169.58" cy="355.34" r="2.52"/><circle class="cls-11" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-12" cx="171.86" cy="355.1" r="2.52"/><circle class="cls-11" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-12" cx="174.14" cy="354.86" r="2.52"/><circle class="cls-11" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-12" cx="176.42" cy="354.74" r="2.52"/><circle class="cls-11" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-12" cx="178.69" cy="354.5" r="2.52"/><circle class="cls-11" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-12" cx="180.97" cy="354.38" r="2.52"/><circle class="cls-11" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-12" cx="183.26" cy="354.14" r="2.52"/><circle class="cls-11" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-12" cx="185.53" cy="354.02" r="2.52"/><circle class="cls-11" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-12" cx="187.94" cy="353.78" r="2.52"/><circle class="cls-11" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-12" cx="190.22" cy="353.66" r="2.52"/><circle class="cls-11" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-12" cx="192.5" cy="353.42" r="2.52"/><circle class="cls-11" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-12" cx="194.78" cy="353.18" r="2.52"/><circle class="cls-11" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-12" cx="197.05" cy="353.06" r="2.52"/><circle class="cls-11" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-12" cx="199.33" cy="352.82" r="2.52"/><circle class="cls-11" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-12" cx="201.61" cy="352.7" r="2.52"/><circle class="cls-11" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-12" cx="203.9" cy="352.46" r="2.52"/><circle class="cls-11" cx="206.18" 
cy="352.34" r="2.52"/><circle class="cls-12" cx="206.18" cy="352.34" r="2.52"/><circle class="cls-11" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-12" cx="208.46" cy="352.1" r="2.52"/><circle class="cls-11" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-12" cx="210.85" cy="351.98" r="2.52"/><circle class="cls-11" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-12" cx="213.13" cy="351.74" r="2.52"/><circle class="cls-11" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-12" cx="215.42" cy="351.62" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.6, 22.94)"/><circle class="cls-11" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-12" cx="217.7" cy="351.38" r="2.52"/><circle class="cls-11" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-12" cx="219.98" cy="351.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.54, 23.39)"/><circle class="cls-11" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-12" cx="222.26" cy="351.02" r="2.52"/><circle class="cls-11" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-12" cx="224.54" cy="350.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.48, 23.83)"/><circle class="cls-11" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-12" cx="226.82" cy="350.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.45, 24.06)"/><circle class="cls-11" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-12" cx="229.1" cy="350.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.43, 24.28)"/><circle class="cls-11" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-12" cx="231.38" cy="350.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.39, 24.5)"/><circle class="cls-11" cx="233.78" cy="350.18" r="2.52" 
transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-12" cx="233.78" cy="350.18" r="2.52" transform="translate(-33.37 24.74) rotate(-5.65)"/><circle class="cls-11" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-12" cx="236.06" cy="349.94" r="2.52" transform="translate(-129.73 560.59) rotate(-85.93)"/><circle class="cls-11" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-12" cx="238.33" cy="349.82" r="2.52" transform="translate(-33.31 25.19) rotate(-5.65)"/><circle class="cls-11" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-12" cx="240.61" cy="349.58" r="2.52" transform="translate(-125.14 564.8) rotate(-85.93)"/><circle class="cls-11" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-12" cx="242.9" cy="349.46" r="2.52"/><circle class="cls-11" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-12" cx="245.18" cy="349.22" r="2.52"/><circle class="cls-11" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-12" cx="247.46" cy="349.1" r="2.52"/><circle class="cls-11" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-12" cx="249.74" cy="348.86" r="2.52"/><circle class="cls-11" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-12" cx="252.01" cy="348.74" r="2.52"/><circle class="cls-11" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-12" cx="254.29" cy="348.5" r="2.52"/><circle class="cls-11" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-12" cx="256.7" cy="348.38" r="2.52"/><circle class="cls-11" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-12" cx="258.98" cy="348.26" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33.06, 27.21)"/><circle class="cls-11" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-12" cx="261.26" cy="348.02" r="2.52"/><circle class="cls-11" cx="263.54" cy="347.9" r="2.52" 
transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-12" cx="263.54" cy="347.9" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -33, 27.66)"/><circle class="cls-11" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-12" cx="265.82" cy="347.66" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.96, 27.88)"/><circle class="cls-11" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-12" cx="268.1" cy="347.54" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.94, 28.11)"/><circle class="cls-11" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-12" cx="270.38" cy="347.3" r="2.52" transform="matrix(1, -0.1, 0.1, 1, -32.91, 28.33)"/><circle class="cls-11" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-12" cx="272.65" cy="347.18" r="2.52"/><circle class="cls-11" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-12" cx="274.94" cy="346.82" r="2.52"/><circle class="cls-11" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-12" cx="277.22" cy="346.46" r="2.52"/><circle class="cls-11" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-12" cx="279.61" cy="346.22" r="2.52" transform="translate(-85.55 600.58) rotate(-85.93)"/><circle class="cls-11" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-12" cx="281.9" cy="345.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-12" cx="284.18" cy="345.5" r="2.52"/><circle class="cls-11" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-12" cx="286.46" cy="345.14" r="2.52"/><circle class="cls-11" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-12" cx="288.74" cy="344.78" r="2.52"/><circle class="cls-11" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-12" cx="291.01" cy="344.54" r="2.52"/><circle class="cls-11" cx="293.29" cy="344.18" r="2.52"/><circle 
class="cls-12" cx="293.29" cy="344.18" r="2.52"/><circle class="cls-11" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-12" cx="295.57" cy="343.82" r="2.52"/><circle class="cls-11" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-12" cx="297.85" cy="343.46" r="2.52"/><circle class="cls-11" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-12" cx="300.13" cy="343.22" r="2.52"/><circle class="cls-11" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-12" cx="302.42" cy="342.86" r="2.52" transform="translate(-61.02 620.2) rotate(-85.93)"/><circle class="cls-11" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-12" cx="304.82" cy="342.5" r="2.52" transform="translate(-58.43 622.26) rotate(-85.93)"/><circle class="cls-11" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-12" cx="307.1" cy="342.14" r="2.52"/><circle class="cls-11" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-12" cx="309.38" cy="341.9" r="2.52"/><circle class="cls-11" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-12" cx="311.65" cy="341.54" r="2.52" transform="translate(-50.68 54.33) rotate(-9.22)"/><circle class="cls-11" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-12" cx="313.94" cy="341.18" r="2.52" transform="translate(-73.12 596.41) rotate(-80.78)"/><circle class="cls-11" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-12" cx="316.22" cy="340.82" r="2.52" transform="translate(-70.85 598.35) rotate(-80.78)"/><circle class="cls-11" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-12" cx="318.5" cy="340.58" r="2.52"/><circle class="cls-11" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-12" cx="320.78" cy="340.22" r="2.52"/><circle class="cls-11" cx="323.06" cy="339.86" r="2.52"/><circle 
class="cls-12" cx="323.06" cy="339.86" r="2.52"/><circle class="cls-11" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-12" cx="325.33" cy="339.5" r="2.52"/><circle class="cls-11" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-12" cx="327.74" cy="339.26" r="2.52"/><circle class="cls-11" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-12" cx="330.01" cy="338.9" r="2.52"/><circle class="cls-11" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-12" cx="332.29" cy="338.42" r="2.52"/><circle class="cls-11" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-12" cx="334.57" cy="337.94" r="2.52"/><circle class="cls-11" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-12" cx="336.85" cy="337.46" r="2.52"/><circle class="cls-11" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-12" cx="339.13" cy="336.98" r="2.52"/><circle class="cls-11" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-12" cx="341.42" cy="336.5" r="2.52"/><circle class="cls-11" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-12" cx="343.7" cy="335.9" r="2.52"/><circle class="cls-11" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-12" cx="345.98" cy="335.42" r="2.52"/><circle class="cls-11" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-12" cx="348.26" cy="334.94" r="2.52" transform="translate(-19.37 648.49) rotate(-84.34)"/><circle class="cls-11" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-12" cx="350.65" cy="334.46" r="2.52" transform="translate(-49.05 60.49) rotate(-9.22)"/><circle class="cls-11" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-12" cx="352.94" cy="333.98" r="2.52"/><circle class="cls-11" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-12" cx="355.22" cy="333.5" r="2.52"/><circle class="cls-11" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-12" cx="357.5" cy="333.02" r="2.52"/><circle class="cls-11" cx="359.78" cy="332.54" 
r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-12" cx="359.78" cy="332.54" r="2.52" transform="translate(-26.1 634.4) rotate(-80.78)"/><circle class="cls-11" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-12" cx="362.06" cy="332.06" r="2.52"/><circle class="cls-11" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-12" cx="364.33" cy="331.58" r="2.52"/><circle class="cls-11" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-12" cx="366.61" cy="331.1" r="2.52" transform="translate(-18.93 639.94) rotate(-80.78)"/><circle class="cls-11" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-12" cx="368.9" cy="330.62" r="2.52"/><circle class="cls-11" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-12" cx="371.18" cy="329.9" r="2.52"/><circle class="cls-11" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-12" cx="373.57" cy="329.3" r="2.52"/><circle class="cls-11" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-12" cx="375.85" cy="328.58" r="2.52"/><circle class="cls-11" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-12" cx="378.13" cy="327.98" r="2.52"/><circle class="cls-11" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-12" cx="380.42" cy="327.26" r="2.52"/><circle class="cls-11" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-12" cx="382.7" cy="326.66" r="2.52" transform="translate(19.92 675.3) rotate(-84.34)"/><circle class="cls-11" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-12" cx="384.98" cy="326.06" r="2.52"/><circle class="cls-11" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-12" cx="387.26" cy="325.34" r="2.52"/><circle class="cls-11" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-12" cx="389.54" cy="324.74" r="2.52" transform="translate(-46.99 66.59) rotate(-9.22)"/><circle class="cls-11" cx="391.82" cy="324.02" r="2.52" 
transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-12" cx="391.82" cy="324.02" r="2.52" transform="translate(30.77 682) rotate(-84.34)"/><circle class="cls-11" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-12" cx="394.1" cy="323.42" r="2.52"/><circle class="cls-11" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-12" cx="396.5" cy="322.7" r="2.52"/><circle class="cls-11" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-12" cx="398.78" cy="322.1" r="2.52" transform="translate(16.96 664.13) rotate(-80.78)"/><circle class="cls-11" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-12" cx="401.06" cy="321.26" r="2.52"/><circle class="cls-11" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-12" cx="403.33" cy="320.54" r="2.52" transform="translate(22.33 667.32) rotate(-80.78)"/><circle class="cls-11" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-12" cx="405.61" cy="319.7" r="2.52"/><circle class="cls-11" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-12" cx="407.9" cy="318.86" r="2.52"/><circle class="cls-11" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-12" cx="410.18" cy="318.02" r="2.52"/><circle class="cls-11" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-12" cx="412.46" cy="317.3" r="2.52"/><circle class="cls-11" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-12" cx="414.74" cy="316.46" r="2.52"/><circle class="cls-11" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-12" cx="417.01" cy="315.62" r="2.52"/><circle class="cls-11" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-12" cx="419.42" cy="314.78" r="2.52"/><circle class="cls-11" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-12" cx="421.7" cy="313.94" r="2.52" transform="translate(67.73 702.64) rotate(-84.34)"/><circle class="cls-11" cx="423.98" cy="312.98" r="2.52"/><circle 
class="cls-12" cx="423.98" cy="312.98" r="2.52"/><circle class="cls-11" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-12" cx="426.26" cy="312.02" r="2.52" transform="translate(73.76 705.45) rotate(-84.34)"/><circle class="cls-11" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-12" cx="428.54" cy="311.06" r="2.52"/><circle class="cls-11" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-12" cx="430.82" cy="310.1" r="2.52" transform="translate(-44.11 73.01) rotate(-9.22)"/><circle class="cls-11" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-12" cx="433.1" cy="309.14" r="2.52"/><circle class="cls-11" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-12" cx="435.38" cy="308.18" r="2.52" transform="translate(-43.74 73.72) rotate(-9.22)"/><circle class="cls-11" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-12" cx="437.65" cy="307.22" r="2.52" transform="translate(88.81 712.47) rotate(-84.34)"/><circle class="cls-11" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.14" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-12" cx="442.33" cy="305.18" r="2.52" transform="translate(70.25 692.91) rotate(-80.78)"/><circle class="cls-11" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-12" cx="444.61" cy="304.1" r="2.52" transform="translate(73.23 694.26) rotate(-80.78)"/><circle class="cls-11" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-12" cx="446.9" cy="302.9" r="2.52"/><circle class="cls-11" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-12" cx="449.18" cy="301.82" r="2.52"/><circle class="cls-11" cx="451.46" cy="300.74" r="2.52"/><circle class="cls-12" cx="451.46" 
cy="300.74" r="2.52"/><circle class="cls-11" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-12" cx="453.73" cy="299.54" r="2.52" transform="translate(85.39 699.43) rotate(-80.78)"/><circle class="cls-11" cx="456.01" cy="298.46" r="2.52"/><circle class="cls-12" cx="456.01" cy="298.46" r="2.52"/><path class="cls-11" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-12" d="M460.81,297.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,460.81,297.26Z"/><path class="cls-11" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-12" d="M463.1,295.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,463.1,295.94Z"/><path class="cls-11" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-12" d="M465.37,294.74a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,465.37,294.74Z"/><path class="cls-11" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><path class="cls-12" d="M467.78,293.42a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,467.78,293.42Z"/><circle class="cls-11" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><circle class="cls-12" cx="467.54" cy="292.1" r="2.52" transform="translate(-26.51 47.49) rotate(-5.65)"/><path class="cls-11" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><path class="cls-12" d="M472.33,290.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,472.33,290.78Z"/><circle class="cls-11" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><circle class="cls-12" cx="472.1" cy="289.46" r="2.52" transform="translate(-26.22 47.93) rotate(-5.65)"/><path class="cls-11" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><path class="cls-12" d="M476.89,288a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,476.89,288Z"/><circle class="cls-11" cx="476.66" cy="286.58" r="2.52" 
transform="translate(157.01 741.72) rotate(-85.93)"/><circle class="cls-12" cx="476.66" cy="286.58" r="2.52" transform="translate(157.01 741.72) rotate(-85.93)"/><path class="cls-11" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><path class="cls-12" d="M481.45,285.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,481.45,285.14Z"/><circle class="cls-11" cx="481.22" cy="283.7" r="2.52"/><circle class="cls-12" cx="481.22" cy="283.7" r="2.52"/><path class="cls-11" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><path class="cls-12" d="M486,282.26a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,486,282.26Z"/><circle class="cls-11" cx="485.78" cy="280.7" r="2.52"/><circle class="cls-12" cx="485.78" cy="280.7" r="2.52"/><path class="cls-11" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><path class="cls-12" d="M490.57,279.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,279.14Z"/><circle class="cls-11" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><circle class="cls-12" cx="490.45" cy="277.46" r="2.52" transform="translate(-24.95 49.68) rotate(-5.65)"/><path class="cls-11" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><path class="cls-12" d="M495.25,275.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,495.25,275.9Z"/><circle class="cls-11" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><circle class="cls-12" cx="495.01" cy="274.22" r="2.52" transform="translate(186.4 748.55) rotate(-85.93)"/><path class="cls-11" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-12" d="M499.81,272.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,499.81,272.66Z"/><path class="cls-11" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-12" d="M502.1,270.86a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,502.1,270.86Z"/><path class="cls-11" 
d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-12" d="M504.37,269.06a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,504.37,269.06Z"/><path class="cls-11" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-12" d="M506.66,267.26a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,506.66,267.26Z"/><path class="cls-11" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-12" d="M508.93,265.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,508.93,265.46Z"/><path class="cls-11" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-12" d="M511.22,263.78a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,511.22,263.78Z"/><path class="cls-11" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-12" d="M513.49,261.74a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,513.49,261.74Z"/><path class="cls-11" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><path class="cls-12" d="M515.89,259.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,515.89,259.82Z"/><circle class="cls-11" cx="515.66" cy="257.9" r="2.52"/><circle class="cls-12" cx="515.66" cy="257.9" r="2.52"/><path class="cls-11" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><path class="cls-12" d="M520.45,256a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,520.45,256Z"/><circle class="cls-11" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><circle class="cls-12" cx="520.22" cy="253.82" r="2.52" transform="translate(-22.48 52.5) rotate(-5.65)"/><path class="cls-11" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><path class="cls-12" d="M525,251.78a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,525,251.78Z"/><circle class="cls-11" cx="524.78" cy="249.62" r="2.52"/><circle class="cls-12" cx="524.78" cy="249.62" r="2.52"/><path class="cls-11" 
d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><path class="cls-12" d="M529.57,247.58a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,247.58Z"/><circle class="cls-11" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><circle class="cls-12" cx="529.33" cy="245.3" r="2.52" transform="translate(247.13 755.91) rotate(-85.93)"/><path class="cls-11" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><path class="cls-12" d="M534.13,243a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,243Z"/><circle class="cls-11" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><circle class="cls-12" cx="533.89" cy="240.74" r="2.52" transform="translate(-21.12 53.78) rotate(-5.65)"/><path class="cls-11" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-12" d="M538.81,238.46a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,538.81,238.46Z"/><path class="cls-11" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-12" d="M541.1,236.06a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,541.1,236.06Z"/><path class="cls-11" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-12" d="M543.37,233.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,543.37,233.54Z"/><path class="cls-11" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-12" d="M545.66,231.14a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,545.66,231.14Z"/><path class="cls-11" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-12" d="M547.93,228.62a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,547.93,228.62Z"/><path class="cls-11" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-12" d="M550.22,226a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,550.22,226Z"/><path class="cls-11" 
d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-12" d="M552.49,223.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,552.49,223.34Z"/><path class="cls-11" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><path class="cls-12" d="M554.78,220.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,554.78,220.7Z"/><circle class="cls-11" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><circle class="cls-12" cx="554.54" cy="217.94" r="2.52" transform="translate(297.84 755.63) rotate(-85.93)"/><path class="cls-11" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><path class="cls-12" d="M559.33,215.18a2.52,2.52,0,1,1-2.52-2.53A2.52,2.52,0,0,1,559.33,215.18Z"/><circle class="cls-11" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><circle class="cls-12" cx="559.22" cy="212.18" r="2.52" transform="translate(260.2 730.18) rotate(-80.78)"/><path class="cls-11" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><path class="cls-12" d="M564,209.3a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,564,209.3Z"/><circle class="cls-11" cx="563.78" cy="206.42" r="2.52"/><circle class="cls-12" cx="563.78" cy="206.42" r="2.52"/><path class="cls-11" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><path class="cls-12" d="M568.57,203.3a2.52,2.52,0,1,1-2.51-2.53A2.53,2.53,0,0,1,568.57,203.3Z"/><circle class="cls-11" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><circle class="cls-12" cx="568.33" cy="200.18" r="2.52" transform="translate(279.71 729.11) rotate(-80.78)"/><path class="cls-11" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><path class="cls-12" d="M573.13,197.18a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,573.13,197.18Z"/><circle class="cls-11" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) 
rotate(-22.5)"/><circle class="cls-12" cx="572.89" cy="193.94" r="2.52" transform="translate(-30.61 234) rotate(-22.5)"/><path class="cls-11" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><path class="cls-12" d="M577.69,190.7a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,190.7Z"/><circle class="cls-11" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><circle class="cls-12" cx="577.45" cy="187.22" r="2.52" transform="translate(-27.69 235.23) rotate(-22.5)"/><path class="cls-11" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-12" d="M582.25,183.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,582.25,183.86Z"/><path class="cls-11" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-12" d="M584.66,180.49a2.52,2.52,0,1,1-2.53-2.51A2.53,2.53,0,0,1,584.66,180.49Z"/><path class="cls-11" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-12" d="M586.93,176.9a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,586.93,176.9Z"/><path class="cls-11" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-12" d="M589.22,173.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,589.22,173.3Z"/><path class="cls-11" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-12" d="M591.49,169.58a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,591.49,169.58Z"/><path class="cls-11" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><path class="cls-12" d="M593.78,165.86a2.52,2.52,0,1,1-2.53-2.52A2.54,2.54,0,0,1,593.78,165.86Z"/><circle class="cls-11" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><circle class="cls-12" cx="593.54" cy="162.02" r="2.52" transform="translate(338.54 721.93) rotate(-80.78)"/><path class="cls-11" 
d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><path class="cls-12" d="M598.33,158.05a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,598.33,158.05Z"/><circle class="cls-11" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><circle class="cls-12" cx="598.1" cy="154.1" r="2.52" transform="translate(-13.44 240.61) rotate(-22.5)"/><path class="cls-11" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><path class="cls-12" d="M602.89,150a2.52,2.52,0,1,1-2.52-2.53A2.53,2.53,0,0,1,602.89,150Z"/><circle class="cls-11" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><circle class="cls-12" cx="602.66" cy="145.82" r="2.52" transform="translate(-9.93 241.73) rotate(-22.5)"/><path class="cls-11" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><path class="cls-12" d="M607.57,141.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,141.62Z"/><circle class="cls-11" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><circle class="cls-12" cx="607.33" cy="137.18" r="2.52" transform="translate(374.65 714.69) rotate(-80.78)"/><path class="cls-11" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><path class="cls-12" d="M612.13,132.86a2.52,2.52,0,1,1-2.51-2.52A2.53,2.53,0,0,1,612.13,132.86Z"/><circle class="cls-11" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><circle class="cls-12" cx="611.89" cy="128.3" r="2.52" transform="translate(387.24 711.74) rotate(-80.78)"/><path class="cls-11" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><path class="cls-12" d="M616.69,123.74a2.52,2.52,0,1,1-2.51-2.53A2.52,2.52,0,0,1,616.69,123.74Z"/><circle class="cls-11" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 244.97) rotate(-22.5)"/><circle class="cls-12" cx="616.45" cy="119.06" r="2.52" transform="translate(1.36 
244.97) rotate(-22.5)"/><path class="cls-11" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><path class="cls-12" d="M621.25,114.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,621.25,114.38Z"/><circle class="cls-11" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><circle class="cls-12" cx="621.01" cy="109.58" r="2.52" transform="translate(413.38 705.02) rotate(-80.78)"/><path class="cls-11" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-12" d="M625.81,104.65a2.52,2.52,0,1,1-2.51-2.51A2.52,2.52,0,0,1,625.81,104.65Z"/><path class="cls-11" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-12" d="M628.1,99.62a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,628.1,99.62Z"/><path class="cls-11" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-12" d="M630.49,94.46A2.52,2.52,0,1,1,628,91.93,2.51,2.51,0,0,1,630.49,94.46Z"/><path class="cls-11" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><path class="cls-12" d="M632.78,89.3a2.52,2.52,0,1,1-2.53-2.53A2.54,2.54,0,0,1,632.78,89.3Z"/><circle class="cls-11" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><circle class="cls-12" cx="632.54" cy="83.9" r="2.52" transform="translate(448.4 694.82) rotate(-80.78)"/><path class="cls-11" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><path class="cls-12" d="M637.33,78.38a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,637.33,78.38Z"/><circle class="cls-11" cx="637.1" cy="72.86" r="2.52"/><circle class="cls-12" cx="637.1" cy="72.86" r="2.52"/><path class="cls-11" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><path class="cls-12" d="M641.89,67.21a2.52,2.52,0,1,1-2.52-2.51A2.52,2.52,0,0,1,641.89,67.21Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.94 365.91)">0</text></g><g 
class="cls-13"><text class="cls-14" transform="translate(27.23 315.31)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 264.69)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 214.04)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 163.44)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 112.82)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 62.17)">30000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.55)">35000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.69 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.36 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.98 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.36) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.65 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan class="cls-17" x="19.54" y="0">i</tspan><tspan class="cls-18" x="23.97" y="0">n</tspan><tspan class="cls-19" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="481.68" y1="70.09" x2="500.88" y2="70.09"/><path class="cls-5" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><path class="cls-20" d="M493.1,69.92a2,2,0,1,1-2-2A2,2,0,0,1,493.1,69.92Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 74.49)"><tspan class="cls-22">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-23" x="11.65" y="0">b</tspan><tspan 
class="cls-22" x="18.73" y="0">it</tspan><tspan class="cls-24" x="26.45" y="0"> </tspan><tspan class="cls-25" x="30.03" y="0">A</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="481.68" y1="90.76" x2="500.88" y2="90.76"/><path class="cls-8" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><path class="cls-26" d="M493.1,90.68a2,2,0,1,1-2-2A2,2,0,0,1,493.1,90.68Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 95.15)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">0</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><line class="cls-10" x1="481.68" y1="111.43" x2="500.88" y2="111.43"/><path class="cls-11" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><path class="cls-33" d="M493.69,111.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,493.69,111.38Z"/><g class="cls-13"><text class="cls-21" transform="translate(503.01 115.82)"><tspan class="cls-22">1</tspan><tspan class="cls-27" x="6.98" y="0">2</tspan><tspan class="cls-28" x="14.03" y="0">-</tspan><tspan class="cls-29" x="18.7" y="0">bi</tspan><tspan class="cls-30" x="29.54" y="0">t</tspan><tspan class="cls-31" x="33.48" y="0"> </tspan><tspan class="cls-32" x="36.99" y="0">A</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/quant_dc.svg b/third_party/aom/doc/img/quant_dc.svg
new file mode 100644
index 0000000000..4fda1084e1
--- /dev/null
+++ b/third_party/aom/doc/img/quant_dc.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 653.55 392.07"><defs><style>.cls-1,.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{fill:none;}.cls-2{stroke:#d9d9d9;}.cls-10,.cls-12,.cls-18,.cls-2,.cls-24,.cls-33,.cls-4,.cls-6,.cls-7,.cls-9{stroke-linejoin:round;}.cls-18,.cls-2,.cls-24,.cls-33{stroke-width:0.75px;}.cls-3{clip-path:url(#clip-path);}.cls-18,.cls-4,.cls-6{stroke:#5b9bd5;}.cls-10,.cls-4,.cls-7{stroke-linecap:round;stroke-width:2.25px;}.cls-5{fill:#5b9bd5;}.cls-12,.cls-6,.cls-9{stroke-width:0.72px;}.cls-24,.cls-7,.cls-9{stroke:#ed7d31;}.cls-8{fill:#ed7d31;}.cls-10,.cls-12,.cls-33{stroke:#a5a5a5;}.cls-11{fill:#a5a5a5;}.cls-13{clip-path:url(#clip-path-4);}.cls-14{font-size:9px;font-family:Calibri, Calibri;}.cls-14,.cls-15,.cls-19,.cls-25{fill:#595959;}.cls-15{font-size:15.96px;}.cls-15,.cls-19,.cls-25{font-family:TimesNewRomanPSMT, Times New Roman;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-19{font-size:14.04px;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0.01em;}.cls-23{letter-spacing:0em;}.cls-25{font-size:14.06px;}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:-0.01em;}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:0em;}.cls-32{letter-spacing:-0.01em;}.cls-34{letter-spacing:0em;}.cls-35{letter-spacing:0em;}.cls-36{letter-spacing:0em;}.cls-37{letter-spacing:0em;}.cls-38{letter-spacing:0em;}.cls-39{letter-spacing:-0.01em;}</style><clipPath id="clip-path"><rect class="cls-1" x="53.77" y="8.9" width="587.4" height="355.08"/></clipPath><clipPath id="clip-path-4"><rect class="cls-1" x="0.38" y="0.38" width="652.8" height="391.32"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><path class="cls-2" d="M53.8,9H640.52M53.8,79.82H640.52M53.8,150.74H640.52M53.8,221.54H640.52M53.8,292.46H640.52"/><path 
class="cls-2" d="M626.77,9V363.3M512.18,9V363.3M397.58,9V363.3M283,9V363.3M168.37,9V363.3M53.8,9V363.3"/><line class="cls-2" x1="53.8" y1="363.3" x2="640.52" y2="363.3"/><g class="cls-3"><polyline class="cls-4" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.13 68.66 363.13 71.06 363.13 73.33 363.13 75.61 363.13 77.89 363.13 80.17 363.01 82.45 363.01 84.73 363.01 87.02 363.01 89.3 363.01 91.58 363.01 93.86 363.01 96.25 363.01 98.53 363.01 100.81 362.89 103.09 362.89 105.38 362.89 107.66 362.89 109.94 362.89 112.22 362.89 114.5 362.89 116.78 362.89 119.17 362.89 121.45 362.89 123.73 362.89 126.02 362.77 128.29 362.77 130.57 362.77 132.85 362.77 135.13 362.77 137.41 362.77 139.69 362.77 142.09 362.77 144.38 362.77 146.66 362.65 148.94 362.65 151.22 362.65 153.5 362.65 155.78 362.65 158.06 362.65 160.34 362.65 162.62 362.65 165.01 362.65 167.29 362.65 169.57 362.65 171.85 362.54 174.13 362.54 176.41 362.54 178.69 362.54 180.97 362.54 183.25 362.54 185.53 362.54 187.94 362.54 190.22 362.54 192.5 362.54 194.78 362.42 197.06 362.42 199.34 362.42 201.62 362.42 203.9 362.42 206.18 362.42 208.46 362.42 210.85 362.42 213.13 362.42 215.41 362.42 217.69 362.42 219.97 362.3 222.25 362.3 224.53 362.3 226.81 362.3 229.09 362.3 231.38 362.3 233.78 362.3 236.06 362.3 238.34 362.3 240.62 362.3 242.9 362.18 245.18 362.18 247.46 362.18 249.74 362.18 252.01 362.18 254.29 362.18 256.69 362.18 258.98 362.18 261.25 362.18 263.54 362.18 265.81 362.18 268.1 362.06 270.38 362.06 272.65 362.06 274.94 362.06 277.21 362.06 279.62 362.06 281.89 361.94 284.18 361.94 286.45 361.94 288.74 361.94 291.01 361.94 293.3 361.94 295.57 361.81 297.86 361.81 300.13 361.81 302.42 361.81 304.81 361.81 307.1 361.81 309.38 361.69 311.65 361.69 313.94 361.69 316.21 361.69 318.5 361.69 320.77 361.69 323.06 361.57 325.33 361.57 327.74 361.57 330.01 361.57 332.3 361.57 334.57 361.45 336.86 361.45 339.13 361.45 341.42 361.33 343.69 361.33 345.98 361.33 348.25 361.33 350.65 361.33 
352.94 361.21 355.21 361.21 357.5 361.21 359.77 361.21 362.06 361.1 364.33 361.1 366.62 361.1 368.89 361.1 371.18 360.98 373.57 360.98 375.86 360.98 378.13 360.86 380.42 360.86 382.69 360.86 384.98 360.74 387.25 360.74 389.54 360.74 391.81 360.62 394.1 360.62 396.5 360.62 398.77 360.62 401.06 360.5 403.33 360.5 405.62 360.38 407.89 360.38 410.18 360.38 412.45 360.25 414.74 360.25 417.01 360.25 419.42 360.13 421.69 360.13 423.98 360.13 426.25 360.01 428.54 360.01 430.81 359.89 433.1 359.89 435.38 359.89 437.65 359.77 439.94 359.77 442.33 359.65 444.62 359.65 446.89 359.65 449.18 359.54 451.45 359.54 453.74 359.42 456.01 359.42 458.3 359.3 460.57 359.3 462.86 359.18 465.25 359.18 467.54 359.06 469.81 359.06 472.1 358.94 474.38 358.94 476.65 358.81 478.94 358.81 481.21 358.69 483.5 358.69 485.77 358.57 488.06 358.57 490.45 358.45 492.74 358.45 495.01 358.33 497.3 358.33 499.57 358.21 501.86 358.1 504.13 358.1 506.42 357.98 508.69 357.98 510.98 357.86 513.38 357.74 515.65 357.74 517.93 357.62 520.22 357.5 522.5 357.5 524.77 357.38 527.05 357.25 529.34 357.25 531.62 357.13 533.89 357.01 536.29 357.01 538.58 356.89 540.86 356.77 543.13 356.65 545.41 356.54 547.7 356.42 549.98 356.42 552.25 356.3 554.53 356.18 556.82 356.06 559.22 355.94 561.5 355.81 563.77 355.69 566.05 355.45 568.34 355.33 570.62 355.21 572.89 355.1 575.17 354.98 577.46 354.74 579.74 354.62 582.13 354.38 584.41 354.25 586.7 354.01 588.98 353.77 591.25 353.65 593.53 353.42 595.82 353.18 598.1 352.81 600.38 352.57 602.65 352.33 605.05 351.98 607.34 351.74 609.62 351.38 611.89 351.01 614.17 350.65 616.46 350.18 618.74 349.81 621.01 349.33 623.29 348.86 625.58 348.25 627.98 347.77 630.25 347.18 632.53 346.45 634.82 345.86 637.1 345.13 639.38 344.37"/></g><circle class="cls-5" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-6" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-5" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-6" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-5" cx="59.48" 
cy="363.08" r="1.98"/><circle class="cls-6" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-5" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-6" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-5" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-6" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-5" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-6" cx="66.31" cy="363.08" r="1.98"/><circle class="cls-5" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-6" cx="68.59" cy="363.08" r="1.98"/><circle class="cls-5" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-6" cx="71" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -297.31, 397.95)"/><circle class="cls-5" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-6" cx="73.28" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -295.25, 400.22)"/><circle class="cls-5" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-6" cx="75.56" cy="363.08" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -293.2, 402.49)"/><circle class="cls-5" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-6" cx="77.84" cy="363.08" r="1.98"/><circle class="cls-5" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-6" cx="80.12" cy="362.96" r="1.98"/><circle class="cls-5" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-6" cx="82.4" cy="362.96" r="1.98"/><circle class="cls-5" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-6" cx="84.67" cy="362.96" r="1.98"/><circle class="cls-5" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-6" cx="86.95" cy="362.96" r="1.98"/><circle class="cls-5" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-6" cx="89.23" cy="362.96" r="1.98"/><circle class="cls-5" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-6" cx="91.51" cy="362.96" r="1.98"/><circle class="cls-5" cx="93.79" cy="362.96" r="1.98"/><circle 
class="cls-6" cx="93.79" cy="362.96" r="1.98"/><circle class="cls-5" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-6" cx="96.19" cy="362.96" r="1.98"/><circle class="cls-5" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-6" cx="98.48" cy="362.96" r="1.98"/><circle class="cls-5" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-6" cx="100.76" cy="362.84" r="1.98"/><circle class="cls-5" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-6" cx="103.03" cy="362.84" r="1.98"/><circle class="cls-5" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-6" cx="105.31" cy="362.84" r="1.98"/><circle class="cls-5" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-6" cx="107.59" cy="362.84" r="1.98"/><circle class="cls-5" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-6" cx="109.88" cy="362.84" r="1.98"/><circle class="cls-5" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-6" cx="112.15" cy="362.84" r="1.98"/><circle class="cls-5" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-6" cx="114.43" cy="362.84" r="1.98"/><circle class="cls-5" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-6" cx="116.71" cy="362.84" r="1.98"/><circle class="cls-5" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-6" cx="119.12" cy="362.84" r="1.98"/><circle class="cls-5" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-6" cx="121.4" cy="362.84" r="1.98"/><circle class="cls-5" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-6" cx="123.67" cy="362.84" r="1.98"/><circle class="cls-5" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-6" cx="125.95" cy="362.72" r="1.98"/><circle class="cls-5" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-6" cx="128.23" cy="362.72" r="1.98"/><circle class="cls-5" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-6" cx="130.51" cy="362.72" r="1.98"/><circle class="cls-5" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-6" cx="132.79" cy="362.72" r="1.98"/><circle class="cls-5" cx="135.07" cy="362.72" 
r="1.98"/><circle class="cls-6" cx="135.07" cy="362.72" r="1.98"/><circle class="cls-5" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-6" cx="137.36" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -237.13, 463.66)"/><circle class="cls-5" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-6" cx="139.64" cy="362.72" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -235.07, 465.93)"/><circle class="cls-5" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-6" cx="142.03" cy="362.72" r="1.98"/><circle class="cls-5" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-6" cx="144.31" cy="362.72" r="1.98"/><circle class="cls-5" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-6" cx="146.59" cy="362.6" r="1.98"/><circle class="cls-5" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-6" cx="148.88" cy="362.6" r="1.98"/><circle class="cls-5" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-6" cx="151.15" cy="362.6" r="1.98"/><circle class="cls-5" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-6" cx="153.43" cy="362.6" r="1.98"/><circle class="cls-5" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-6" cx="155.71" cy="362.6" r="1.98"/><circle class="cls-5" cx="158" cy="362.6" r="1.98"/><circle class="cls-6" cx="158" cy="362.6" r="1.98"/><circle class="cls-5" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-6" cx="160.28" cy="362.6" r="1.98"/><circle class="cls-5" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-6" cx="162.56" cy="362.6" r="1.98"/><circle class="cls-5" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-6" cx="164.95" cy="362.6" r="1.98"/><circle class="cls-5" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-6" cx="167.23" cy="362.6" r="1.98"/><circle class="cls-5" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-6" cx="169.51" cy="362.6" r="1.98"/><circle class="cls-5" cx="171.79" cy="362.48" r="1.98"/><circle 
class="cls-6" cx="171.79" cy="362.48" r="1.98"/><circle class="cls-5" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-6" cx="174.07" cy="362.48" r="1.98"/><circle class="cls-5" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-6" cx="176.36" cy="362.48" r="1.98"/><circle class="cls-5" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-6" cx="178.64" cy="362.48" r="1.98"/><circle class="cls-5" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-6" cx="180.92" cy="362.48" r="1.98"/><circle class="cls-5" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-6" cx="183.19" cy="362.48" r="1.98"/><circle class="cls-5" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-6" cx="185.48" cy="362.48" r="1.98"/><circle class="cls-5" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-6" cx="187.88" cy="362.48" r="1.98"/><circle class="cls-5" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-6" cx="190.15" cy="362.48" r="1.98"/><circle class="cls-5" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-6" cx="192.43" cy="362.48" r="1.98"/><circle class="cls-5" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-6" cx="194.71" cy="362.36" r="1.98"/><circle class="cls-5" cx="196.99" cy="362.36" r="1.98"/><circle class="cls-6" cx="196.99" cy="362.36" r="1.98"/><path class="cls-5" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-6" d="M201.26,362.36a2,2,0,1,1-2-2A2,2,0,0,1,201.26,362.36Z"/><path class="cls-5" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-6" d="M203.53,362.36a2,2,0,1,1-2-2A2,2,0,0,1,203.53,362.36Z"/><path class="cls-5" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-6" d="M205.81,362.36a2,2,0,1,1-2-2A2,2,0,0,1,205.81,362.36Z"/><path class="cls-5" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-6" d="M208.09,362.36a2,2,0,1,1-2-2A2,2,0,0,1,208.09,362.36Z"/><path class="cls-5" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path 
class="cls-6" d="M210.37,362.36a2,2,0,1,1-2-2A2,2,0,0,1,210.37,362.36Z"/><path class="cls-5" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-6" d="M212.77,362.36a2,2,0,1,1-2-2A2,2,0,0,1,212.77,362.36Z"/><path class="cls-5" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-6" d="M215.05,362.36a2,2,0,1,1-2-2A2,2,0,0,1,215.05,362.36Z"/><path class="cls-5" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-6" d="M217.33,362.36a2,2,0,1,1-2-2A2,2,0,0,1,217.33,362.36Z"/><path class="cls-5" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-6" d="M219.61,362.36a2,2,0,1,1-2-2A2,2,0,0,1,219.61,362.36Z"/><path class="cls-5" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><path class="cls-6" d="M221.89,362.24a2,2,0,1,1-2-2A2,2,0,0,1,221.89,362.24Z"/><circle class="cls-5" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-6" cx="222.2" cy="362.24" r="1.98" transform="translate(-34.61 23.66) rotate(-5.65)"/><circle class="cls-5" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-6" cx="224.48" cy="362.24" r="1.98" transform="translate(-34.6 23.88) rotate(-5.65)"/><circle class="cls-5" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-6" cx="226.76" cy="362.24" r="1.98" transform="translate(-34.59 24.11) rotate(-5.65)"/><circle class="cls-5" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-6" cx="229.03" cy="362.24" r="1.98" transform="translate(-34.58 24.33) rotate(-5.65)"/><circle class="cls-5" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><circle class="cls-6" cx="231.31" cy="362.24" r="1.98" transform="translate(-34.57 24.56) rotate(-5.65)"/><path class="cls-5" 
d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-6" d="M235.7,362.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,362.24Z"/><path class="cls-5" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-6" d="M238,362.24a2,2,0,1,1-2-2A2,2,0,0,1,238,362.24Z"/><path class="cls-5" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-6" d="M240.26,362.24a2,2,0,1,1-2-2A2,2,0,0,1,240.26,362.24Z"/><path class="cls-5" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-6" d="M242.53,362.24a2,2,0,1,1-2-2A2,2,0,0,1,242.53,362.24Z"/><path class="cls-5" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-6" d="M244.81,362.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,362.12Z"/><path class="cls-5" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-6" d="M247.09,362.12a2,2,0,1,1-2-2A2,2,0,0,1,247.09,362.12Z"/><path class="cls-5" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-6" d="M249.37,362.12a2,2,0,1,1-2-2A2,2,0,0,1,249.37,362.12Z"/><path class="cls-5" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-6" d="M251.65,362.12a2,2,0,1,1-2-2A2,2,0,0,1,251.65,362.12Z"/><path class="cls-5" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-6" d="M253.93,362.12a2,2,0,1,1-2-2A2,2,0,0,1,253.93,362.12Z"/><path class="cls-5" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-6" d="M256.21,362.12a2,2,0,1,1-2-2A2,2,0,0,1,256.21,362.12Z"/><path class="cls-5" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-6" d="M258.61,362.12a2,2,0,1,1-2-2A2,2,0,0,1,258.61,362.12Z"/><path class="cls-5" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><path class="cls-6" d="M260.89,362.12a2,2,0,1,1-2-2A2,2,0,0,1,260.89,362.12Z"/><circle class="cls-5" cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-6" 
cx="261.2" cy="362.12" r="1.98" transform="translate(-118.52 596.98) rotate(-85.93)"/><circle class="cls-5" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-6" cx="263.48" cy="362.12" r="1.98" transform="translate(-116.41 599.26) rotate(-85.93)"/><circle class="cls-5" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-6" cx="265.76" cy="362.12" r="1.98" transform="translate(-114.29 601.53) rotate(-85.93)"/><circle class="cls-5" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-6" cx="268.03" cy="362" r="1.98" transform="translate(-112.05 603.69) rotate(-85.93)"/><circle class="cls-5" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-6" cx="270.31" cy="362" r="1.98" transform="translate(-109.93 605.97) rotate(-85.93)"/><circle class="cls-5" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-6" cx="272.59" cy="362" r="1.98" transform="translate(-107.81 608.24) rotate(-85.93)"/><circle class="cls-5" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><circle class="cls-6" cx="274.87" cy="362" r="1.98" transform="translate(-105.69 610.52) rotate(-85.93)"/><path class="cls-5" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-6" d="M279.14,362a2,2,0,1,1-2-2A2,2,0,0,1,279.14,362Z"/><path class="cls-5" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-6" d="M281.53,362a2,2,0,1,1-2-2A2,2,0,0,1,281.53,362Z"/><path class="cls-5" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-6" d="M283.81,361.88a2,2,0,1,1-2-2A2,2,0,0,1,283.81,361.88Z"/><path class="cls-5" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-6" d="M286.09,361.88a2,2,0,1,1-2-2A2,2,0,0,1,286.09,361.88Z"/><path class="cls-5" 
d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-6" d="M288.37,361.88a2,2,0,1,1-2-2A2,2,0,0,1,288.37,361.88Z"/><path class="cls-5" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-6" d="M290.65,361.88a2,2,0,1,1-2-2A2,2,0,0,1,290.65,361.88Z"/><path class="cls-5" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-6" d="M292.93,361.88a2,2,0,1,1-2-2A2,2,0,0,1,292.93,361.88Z"/><path class="cls-5" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-6" d="M295.21,361.88a2,2,0,1,1-2-2A2,2,0,0,1,295.21,361.88Z"/><path class="cls-5" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-6" d="M297.49,361.76a2,2,0,1,1-2-2A2,2,0,0,1,297.49,361.76Z"/><path class="cls-5" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-6" d="M299.77,361.76a2,2,0,1,1-2-2A2,2,0,0,1,299.77,361.76Z"/><path class="cls-5" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-6" d="M302.05,361.76a2,2,0,1,1-2-2A2,2,0,0,1,302.05,361.76Z"/><path class="cls-5" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><path class="cls-6" d="M304.33,361.76a2,2,0,1,1-2-2A2,2,0,0,1,304.33,361.76Z"/><circle class="cls-5" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-6" cx="304.76" cy="361.76" r="1.98" transform="translate(-77.69 640.1) rotate(-85.93)"/><circle class="cls-5" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-6" cx="307.03" cy="361.76" r="1.98" transform="translate(-75.58 642.37) rotate(-85.93)"/><circle class="cls-5" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-6" cx="309.31" cy="361.64" r="1.98"/><circle class="cls-5" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-6" cx="311.59" cy="361.64" r="1.98"/><circle class="cls-5" cx="313.87" cy="361.64" r="1.98"/><circle class="cls-6" cx="313.87" cy="361.64" 
r="1.98"/><path class="cls-5" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-6" d="M318.14,361.64a2,2,0,1,1-2-2A2,2,0,0,1,318.14,361.64Z"/><path class="cls-5" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-6" d="M320.42,361.64a2,2,0,1,1-2-2A2,2,0,0,1,320.42,361.64Z"/><path class="cls-5" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-6" d="M322.7,361.64a2,2,0,1,1-2-2A2,2,0,0,1,322.7,361.64Z"/><path class="cls-5" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-6" d="M325,361.52a2,2,0,1,1-2-2A2,2,0,0,1,325,361.52Z"/><path class="cls-5" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-6" d="M327.26,361.52a2,2,0,1,1-2-2A2,2,0,0,1,327.26,361.52Z"/><path class="cls-5" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-6" d="M329.65,361.52a2,2,0,1,1-2-2A2,2,0,0,1,329.65,361.52Z"/><path class="cls-5" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-6" d="M331.93,361.52a2,2,0,1,1-2-2A2,2,0,0,1,331.93,361.52Z"/><path class="cls-5" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-6" d="M334.21,361.52a2,2,0,1,1-2-2A2,2,0,0,1,334.21,361.52Z"/><path class="cls-5" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-6" d="M336.49,361.4a2,2,0,1,1-2-2A2,2,0,0,1,336.49,361.4Z"/><path class="cls-5" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-6" d="M338.77,361.4a2,2,0,1,1-2-2A2,2,0,0,1,338.77,361.4Z"/><path class="cls-5" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-6" d="M341.05,361.4a2,2,0,1,1-2-2A2,2,0,0,1,341.05,361.4Z"/><path class="cls-5" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-6" d="M343.33,361.28a2,2,0,1,1-2-2A2,2,0,0,1,343.33,361.28Z"/><path class="cls-5" d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-6" 
d="M345.61,361.28a2,2,0,1,1-2-2A2,2,0,0,1,345.61,361.28Z"/><path class="cls-5" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><path class="cls-6" d="M347.89,361.28a2,2,0,1,1-2-2A2,2,0,0,1,347.89,361.28Z"/><circle class="cls-5" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-6" cx="348.2" cy="361.28" r="1.98"/><circle class="cls-5" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-6" cx="350.59" cy="361.28" r="1.98"/><circle class="cls-5" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><circle class="cls-6" cx="352.87" cy="361.16" r="1.98" transform="translate(-33.87 36.53) rotate(-5.65)"/><path class="cls-5" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-6" d="M357.14,361.16a2,2,0,1,1-2-2A2,2,0,0,1,357.14,361.16Z"/><path class="cls-5" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-6" d="M359.42,361.16a2,2,0,1,1-2-2A2,2,0,0,1,359.42,361.16Z"/><path class="cls-5" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-6" d="M361.7,361.16a2,2,0,1,1-2-2A2,2,0,0,1,361.7,361.16Z"/><path class="cls-5" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-6" d="M364,361a2,2,0,1,1-2-2A2,2,0,0,1,364,361Z"/><path class="cls-5" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-6" d="M366.26,361a2,2,0,1,1-2-2A2,2,0,0,1,366.26,361Z"/><path class="cls-5" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-6" d="M368.53,361a2,2,0,1,1-2-2A2,2,0,0,1,368.53,361Z"/><path class="cls-5" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-6" d="M370.81,361a2,2,0,1,1-2-2A2,2,0,0,1,370.81,361Z"/><path class="cls-5" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-6" d="M373.09,360.92a2,2,0,1,1-2-2A2,2,0,0,1,373.09,360.92Z"/><path class="cls-5" d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-6" 
d="M375.49,360.92a2,2,0,1,1-2-2A2,2,0,0,1,375.49,360.92Z"/><path class="cls-5" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-6" d="M377.77,360.92a2,2,0,1,1-2-2A2,2,0,0,1,377.77,360.92Z"/><path class="cls-5" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-6" d="M380.05,360.8a2,2,0,1,1-2-2A2,2,0,0,1,380.05,360.8Z"/><path class="cls-5" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-6" d="M382.33,360.8a2,2,0,1,1-2-2A2,2,0,0,1,382.33,360.8Z"/><path class="cls-5" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-6" d="M384.61,360.8a2,2,0,1,1-2-2A2,2,0,0,1,384.61,360.8Z"/><path class="cls-5" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><path class="cls-6" d="M386.89,360.68a2,2,0,1,1-2-2A2,2,0,0,1,386.89,360.68Z"/><circle class="cls-5" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-6" cx="387.2" cy="360.68" r="1.98" transform="translate(-0.02 721.33) rotate(-85.93)"/><circle class="cls-5" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-6" cx="389.48" cy="360.68" r="1.98" transform="translate(2.1 723.6) rotate(-85.93)"/><circle class="cls-5" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-6" cx="391.76" cy="360.56" r="1.98" transform="translate(-33.62 40.36) rotate(-5.65)"/><circle class="cls-5" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><circle class="cls-6" cx="394.03" cy="360.56" r="1.98" transform="translate(-33.61 40.58) rotate(-5.65)"/><path class="cls-5" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-6" d="M398.42,360.56a2,2,0,1,1-2-2A2,2,0,0,1,398.42,360.56Z"/><path class="cls-5" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path class="cls-6" d="M400.7,360.56a2,2,0,1,1-2-2A2,2,0,0,1,400.7,360.56Z"/><path 
class="cls-5" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-6" d="M403,360.44a2,2,0,1,1-2-2A2,2,0,0,1,403,360.44Z"/><path class="cls-5" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-6" d="M405.26,360.44a2,2,0,1,1-2-2A2,2,0,0,1,405.26,360.44Z"/><path class="cls-5" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-6" d="M407.53,360.32a2,2,0,1,1-2-2A2,2,0,0,1,407.53,360.32Z"/><path class="cls-5" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-6" d="M409.81,360.32a2,2,0,1,1-2-2A2,2,0,0,1,409.81,360.32Z"/><path class="cls-5" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-6" d="M412.09,360.32a2,2,0,1,1-2-2A2,2,0,0,1,412.09,360.32Z"/><path class="cls-5" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-6" d="M414.37,360.2a2,2,0,1,1-2-2A2,2,0,0,1,414.37,360.2Z"/><path class="cls-5" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-6" d="M416.65,360.2a2,2,0,1,1-2-2A2,2,0,0,1,416.65,360.2Z"/><path class="cls-5" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-6" d="M418.93,360.2a2,2,0,1,1-2-2A2,2,0,0,1,418.93,360.2Z"/><path class="cls-5" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-6" d="M421.33,360.08a2,2,0,1,1-2-2A2,2,0,0,1,421.33,360.08Z"/><path class="cls-5" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-6" d="M423.61,360.08a2,2,0,1,1-2-2A2,2,0,0,1,423.61,360.08Z"/><path class="cls-5" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><path class="cls-6" d="M425.89,360.08a2,2,0,1,1-2-2A2,2,0,0,1,425.89,360.08Z"/><circle class="cls-5" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-6" cx="426.2" cy="359.96" r="1.98"/><circle class="cls-5" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-6" cx="428.48" cy="359.96" r="1.98"/><circle class="cls-5" cx="430.76" cy="359.84" r="1.98" 
transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-6" cx="430.76" cy="359.84" r="1.98" transform="translate(-33.36 44.2) rotate(-5.65)"/><circle class="cls-5" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-6" cx="433.03" cy="359.84" r="1.98" transform="translate(-33.35 44.42) rotate(-5.65)"/><circle class="cls-5" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-6" cx="435.31" cy="359.84" r="1.98" transform="translate(-33.34 44.65) rotate(-5.65)"/><circle class="cls-5" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-6" cx="437.59" cy="359.72" r="1.98" transform="translate(47.76 770.71) rotate(-85.93)"/><circle class="cls-5" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><circle class="cls-6" cx="439.87" cy="359.72" r="1.98" transform="translate(49.88 772.98) rotate(-85.93)"/><path class="cls-5" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-6" d="M444.26,359.6a2,2,0,1,1-2-2A2,2,0,0,1,444.26,359.6Z"/><path class="cls-5" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-6" d="M446.53,359.6a2,2,0,1,1-2-2A2,2,0,0,1,446.53,359.6Z"/><path class="cls-5" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-6" d="M448.81,359.6a2,2,0,1,1-2-2A2,2,0,0,1,448.81,359.6Z"/><path class="cls-5" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-6" d="M451.09,359.48a2,2,0,1,1-2-2A2,2,0,0,1,451.09,359.48Z"/><path class="cls-5" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-6" d="M453.38,359.48a2,2,0,1,1-2-2A2,2,0,0,1,453.38,359.48Z"/><path class="cls-5" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-6" d="M455.65,359.36a2,2,0,1,1-2-2A2,2,0,0,1,455.65,359.36Z"/><path class="cls-5" 
d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-6" d="M457.94,359.36a2,2,0,1,1-2-2A2,2,0,0,1,457.94,359.36Z"/><path class="cls-5" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-6" d="M460.21,359.24a2,2,0,1,1-2-2A2,2,0,0,1,460.21,359.24Z"/><path class="cls-5" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><path class="cls-6" d="M462.5,359.24a2,2,0,1,1-2-2A2,2,0,0,1,462.5,359.24Z"/><circle class="cls-5" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><circle class="cls-6" cx="462.8" cy="359.12" r="1.98" transform="translate(-19.15 26.14) rotate(-3.17)"/><path class="cls-5" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><path class="cls-6" d="M467.18,359.12a2,2,0,1,1-2-2A2,2,0,0,1,467.18,359.12Z"/><circle class="cls-5" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><circle class="cls-6" cx="467.48" cy="359" r="1.98" transform="translate(-19.14 26.4) rotate(-3.17)"/><path class="cls-5" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><path class="cls-6" d="M471.74,359a2,2,0,1,1-2-2A2,2,0,0,1,471.74,359Z"/><circle class="cls-5" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><circle class="cls-6" cx="472.03" cy="358.88" r="1.98" transform="translate(92.14 814.16) rotate(-87.4)"/><path class="cls-5" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><path class="cls-6" d="M476.3,358.88a2,2,0,1,1-2-2A2,2,0,0,1,476.3,358.88Z"/><circle class="cls-5" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><circle class="cls-6" cx="476.59" cy="358.76" r="1.98" transform="translate(-19.11 26.9) rotate(-3.17)"/><path class="cls-5" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><path class="cls-6" d="M480.86,358.76a2,2,0,1,1-2-2A2,2,0,0,1,480.86,358.76Z"/><circle class="cls-5" cx="481.15" cy="358.64" r="1.98"/><circle class="cls-6" cx="481.15" 
cy="358.64" r="1.98"/><path class="cls-5" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><path class="cls-6" d="M485.42,358.64a2,2,0,1,1-2-2A2,2,0,0,1,485.42,358.64Z"/><circle class="cls-5" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><circle class="cls-6" cx="485.71" cy="358.52" r="1.98" transform="translate(105.56 827.49) rotate(-87.4)"/><path class="cls-5" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-6" d="M490,358.52a2,2,0,1,1-2-2A2,2,0,0,1,490,358.52Z"/><path class="cls-5" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-6" d="M492.38,358.4a2,2,0,1,1-2-2A2,2,0,0,1,492.38,358.4Z"/><path class="cls-5" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-6" d="M494.65,358.4a2,2,0,1,1-2-2A2,2,0,0,1,494.65,358.4Z"/><path class="cls-5" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-6" d="M496.94,358.28a2,2,0,1,1-2-2A2,2,0,0,1,496.94,358.28Z"/><path class="cls-5" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-6" d="M499.21,358.28a2,2,0,1,1-2-2A2,2,0,0,1,499.21,358.28Z"/><path class="cls-5" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><path class="cls-6" d="M501.5,358.16a2,2,0,1,1-2-2A2,2,0,0,1,501.5,358.16Z"/><circle class="cls-5" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><circle class="cls-6" cx="501.8" cy="358.04" r="1.98" transform="translate(-19.03 28.3) rotate(-3.17)"/><path class="cls-5" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><path class="cls-6" d="M506.06,358a2,2,0,1,1-2-2A2,2,0,0,1,506.06,358Z"/><circle class="cls-5" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><circle class="cls-6" cx="506.36" cy="357.92" r="1.98" transform="translate(125.86 847.53) rotate(-87.4)"/><path class="cls-5" d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><path class="cls-6" 
d="M510.62,357.92a2,2,0,1,1-2-2A2,2,0,0,1,510.62,357.92Z"/><circle class="cls-5" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><circle class="cls-6" cx="510.92" cy="357.8" r="1.98" transform="translate(130.34 851.97) rotate(-87.4)"/><path class="cls-5" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><path class="cls-6" d="M515.3,357.68a2,2,0,1,1-2-2A2,2,0,0,1,515.3,357.68Z"/><circle class="cls-5" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><circle class="cls-6" cx="515.59" cy="357.68" r="1.98" transform="translate(-18.99 29.06) rotate(-3.17)"/><path class="cls-5" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><path class="cls-6" d="M519.86,357.56a2,2,0,1,1-2-2A2,2,0,0,1,519.86,357.56Z"/><circle class="cls-5" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><circle class="cls-6" cx="520.15" cy="357.44" r="1.98" transform="translate(-18.97 29.31) rotate(-3.17)"/><path class="cls-5" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><path class="cls-6" d="M524.42,357.44a2,2,0,1,1-2-2A2,2,0,0,1,524.42,357.44Z"/><circle class="cls-5" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><circle class="cls-6" cx="524.71" cy="357.32" r="1.98" transform="translate(-18.96 29.56) rotate(-3.17)"/><path class="cls-5" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><path class="cls-6" d="M529,357.2a2,2,0,1,1-2-2A2,2,0,0,1,529,357.2Z"/><circle class="cls-5" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><circle class="cls-6" cx="529.27" cy="357.2" r="1.98" transform="translate(148.46 869.74) rotate(-87.4)"/><path class="cls-5" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-6" d="M533.53,357.08a2,2,0,1,1-2-2A2,2,0,0,1,533.53,357.08Z"/><path class="cls-5" d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-6" 
d="M535.82,357a2,2,0,1,1-2-2A2,2,0,0,1,535.82,357Z"/><path class="cls-5" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-6" d="M538.21,357a2,2,0,1,1-2-2A2,2,0,0,1,538.21,357Z"/><path class="cls-5" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><path class="cls-6" d="M540.5,356.84a2,2,0,1,1-2-2A2,2,0,0,1,540.5,356.84Z"/><circle class="cls-5" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><circle class="cls-6" cx="540.8" cy="356.72" r="1.98" transform="translate(-18.9 30.45) rotate(-3.17)"/><path class="cls-5" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><path class="cls-6" d="M545.06,356.6a2,2,0,1,1-2-2A2,2,0,0,1,545.06,356.6Z"/><circle class="cls-5" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><circle class="cls-6" cx="545.36" cy="356.48" r="1.98" transform="translate(164.53 885.12) rotate(-87.4)"/><path class="cls-5" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><path class="cls-6" d="M549.62,356.36a2,2,0,1,1-2-2A2,2,0,0,1,549.62,356.36Z"/><circle class="cls-5" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><circle class="cls-6" cx="549.92" cy="356.36" r="1.98" transform="translate(-18.86 30.96) rotate(-3.17)"/><path class="cls-5" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><path class="cls-6" d="M554.18,356.24a2,2,0,1,1-2-2A2,2,0,0,1,554.18,356.24Z"/><circle class="cls-5" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><circle class="cls-6" cx="554.48" cy="356.12" r="1.98" transform="translate(-18.84 31.21) rotate(-3.17)"/><path class="cls-5" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><path class="cls-6" d="M558.74,356a2,2,0,1,1-2-2A2,2,0,0,1,558.74,356Z"/><circle class="cls-5" cx="559.15" cy="355.88" r="1.98" transform="translate(178.31 898.33) rotate(-87.4)"/><circle class="cls-6" cx="559.15" cy="355.88" r="1.98" 
transform="translate(178.31 898.33) rotate(-87.4)"/><path class="cls-5" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><path class="cls-6" d="M563.42,355.76a2,2,0,1,1-2-2A2,2,0,0,1,563.42,355.76Z"/><circle class="cls-5" cx="563.71" cy="355.64" r="1.98"/><circle class="cls-6" cx="563.71" cy="355.64" r="1.98"/><path class="cls-5" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><path class="cls-6" d="M568,355.4a2,2,0,1,1-2-2A2,2,0,0,1,568,355.4Z"/><circle class="cls-5" cx="568.27" cy="355.28" r="1.98"/><circle class="cls-6" cx="568.27" cy="355.28" r="1.98"/><path class="cls-5" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-6" d="M572.53,355.16a2,2,0,1,1-2-2A2,2,0,0,1,572.53,355.16Z"/><path class="cls-5" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-6" d="M574.82,355a2,2,0,1,1-2-2A2,2,0,0,1,574.82,355Z"/><path class="cls-5" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-6" d="M577.09,354.92a2,2,0,1,1-2-2A2,2,0,0,1,577.09,354.92Z"/><path class="cls-5" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-6" d="M579.38,354.68a2,2,0,1,1-2-2A2,2,0,0,1,579.38,354.68Z"/><path class="cls-5" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-6" d="M581.65,354.56a2,2,0,1,1-2-2A2,2,0,0,1,581.65,354.56Z"/><path class="cls-5" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><path class="cls-6" d="M584.06,354.32a2,2,0,1,1-2-2A2,2,0,0,1,584.06,354.32Z"/><circle class="cls-5" cx="584.36" cy="354.2" r="1.98"/><circle class="cls-6" cx="584.36" cy="354.2" r="1.98"/><path class="cls-5" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><path class="cls-6" d="M588.62,354a2,2,0,1,1-2-2A2,2,0,0,1,588.62,354Z"/><circle class="cls-5" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><circle class="cls-6" cx="588.92" cy="353.72" r="1.98" transform="translate(-18.66 33.11) rotate(-3.17)"/><path class="cls-5" 
d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><path class="cls-6" d="M593.18,353.6a2,2,0,1,1-2-2A2,2,0,0,1,593.18,353.6Z"/><circle class="cls-5" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><circle class="cls-6" cx="593.48" cy="353.36" r="1.98" transform="translate(213.59 930.21) rotate(-87.4)"/><path class="cls-5" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><path class="cls-6" d="M597.74,353.12a2,2,0,1,1-2-2A2,2,0,0,1,597.74,353.12Z"/><circle class="cls-5" cx="598.03" cy="352.76" r="1.98"/><circle class="cls-6" cx="598.03" cy="352.76" r="1.98"/><path class="cls-5" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><path class="cls-6" d="M602.3,352.52a2,2,0,1,1-2-2A2,2,0,0,1,602.3,352.52Z"/><circle class="cls-5" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><circle class="cls-6" cx="602.59" cy="352.28" r="1.98" transform="translate(-31.78 61.09) rotate(-5.65)"/><path class="cls-5" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><path class="cls-6" d="M607,351.92a2,2,0,1,1-2-2A2,2,0,0,1,607,351.92Z"/><circle class="cls-5" cx="607.27" cy="351.68" r="1.98"/><circle class="cls-6" cx="607.27" cy="351.68" r="1.98"/><path class="cls-5" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-6" d="M611.53,351.32a2,2,0,1,1-2-2A2,2,0,0,1,611.53,351.32Z"/><path class="cls-5" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-6" d="M613.82,351a2,2,0,1,1-2-2A2,2,0,0,1,613.82,351Z"/><path class="cls-5" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-6" d="M616.09,350.6a2,2,0,1,1-2-2A2,2,0,0,1,616.09,350.6Z"/><path class="cls-5" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-6" d="M618.38,350.12a2,2,0,1,1-2-2A2,2,0,0,1,618.38,350.12Z"/><path class="cls-5" d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-6" 
d="M620.65,349.76a2,2,0,1,1-2-2A2,2,0,0,1,620.65,349.76Z"/><path class="cls-5" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-6" d="M622.94,349.28a2,2,0,1,1-2-2A2,2,0,0,1,622.94,349.28Z"/><path class="cls-5" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-6" d="M625.21,348.8a2,2,0,1,1-2-2A2,2,0,0,1,625.21,348.8Z"/><path class="cls-5" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><path class="cls-6" d="M627.5,348.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,348.2Z"/><circle class="cls-5" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><circle class="cls-6" cx="627.92" cy="347.72" r="1.98" transform="translate(236.56 949.4) rotate(-85.93)"/><path class="cls-5" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><path class="cls-6" d="M632.18,347.12a2,2,0,1,1-2-2A2,2,0,0,1,632.18,347.12Z"/><circle class="cls-5" cx="632.48" cy="346.4" r="1.98"/><circle class="cls-6" cx="632.48" cy="346.4" r="1.98"/><path class="cls-5" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><path class="cls-6" d="M636.74,345.8a2,2,0,1,1-2-2A2,2,0,0,1,636.74,345.8Z"/><circle class="cls-5" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><circle class="cls-6" cx="637.03" cy="345.08" r="1.98" transform="translate(247.67 956.05) rotate(-85.93)"/><path class="cls-5" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><path class="cls-6" d="M641.3,344.36a2,2,0,1,1-2-2A2,2,0,0,1,641.3,344.36Z"/><g class="cls-3"><polyline class="cls-7" points="54.95 363.24 57.26 363.13 59.53 363.13 61.81 363.13 64.09 363.13 66.38 363.01 68.66 363.01 71.06 363.01 73.33 362.89 75.61 362.89 77.89 362.89 80.17 362.77 82.45 362.77 84.73 362.77 87.02 362.65 89.3 362.65 91.58 362.54 93.86 362.54 96.25 362.54 98.53 362.42 100.81 362.42 103.09 362.3 105.38 362.3 107.66 362.18 109.94 362.18 112.22 362.18 114.5 362.06 116.78 362.06 119.17 361.94 121.45 361.94 123.73 361.81 126.02 361.81 
128.29 361.69 130.57 361.69 132.85 361.69 135.13 361.57 137.41 361.57 139.69 361.45 142.09 361.45 144.38 361.33 146.66 361.33 148.94 361.21 151.22 361.21 153.5 361.1 155.78 361.1 158.06 361.1 160.34 360.98 162.62 360.98 165.01 360.86 167.29 360.86 169.57 360.74 171.85 360.74 174.13 360.62 176.41 360.62 178.69 360.62 180.97 360.5 183.25 360.5 185.53 360.38 187.94 360.38 190.22 360.25 192.5 360.25 194.78 360.13 197.06 360.13 199.34 360.13 201.62 360.01 203.9 360.01 206.18 359.89 208.46 359.89 210.85 359.89 213.13 359.77 215.41 359.77 217.69 359.65 219.97 359.65 222.25 359.54 224.53 359.54 226.81 359.54 229.09 359.42 231.38 359.42 233.78 359.3 236.06 359.3 238.34 359.18 240.62 359.18 242.9 359.18 245.18 359.06 247.46 359.06 249.74 358.94 252.01 358.94 254.29 358.94 256.69 358.81 258.98 358.81 261.25 358.69 263.54 358.69 265.81 358.69 268.1 358.57 270.38 358.57 272.65 358.57 274.94 358.45 277.21 358.33 279.62 358.21 281.89 358.21 284.18 358.1 286.45 357.98 288.74 357.86 291.01 357.86 293.3 357.74 295.57 357.62 297.86 357.5 300.13 357.5 302.42 357.38 304.81 357.25 307.1 357.25 309.38 357.13 311.65 357.01 313.94 356.89 316.21 356.89 318.5 356.77 320.77 356.65 323.06 356.65 325.33 356.54 327.74 356.42 330.01 356.3 332.3 356.18 334.57 356.06 336.86 355.94 339.13 355.81 341.42 355.69 343.69 355.57 345.98 355.45 348.25 355.33 350.65 355.21 352.94 355.1 355.21 354.98 357.5 354.86 359.77 354.74 362.06 354.62 364.33 354.5 366.62 354.5 368.89 354.25 371.18 354.13 373.57 354.01 375.86 353.89 378.13 353.65 380.42 353.54 382.69 353.42 384.98 353.3 387.25 353.18 389.54 352.94 391.81 352.81 394.1 352.69 396.5 352.57 398.77 352.33 401.06 352.21 403.33 351.98 405.62 351.86 407.89 351.74 410.18 351.5 412.45 351.38 414.74 351.13 417.01 351.01 419.42 350.77 421.69 350.65 423.98 350.42 426.25 350.3 428.54 350.06 430.81 349.81 433.1 349.69 435.38 349.45 437.65 349.33 439.94 349.1 442.33 348.86 444.62 348.74 446.89 348.5 449.18 348.25 451.45 348.01 453.74 347.89 456.01 347.65 458.3 347.42 
460.57 347.18 462.86 346.94 465.25 346.69 467.54 346.45 469.81 346.21 472.1 345.98 474.38 345.74 476.65 345.5 478.94 345.25 481.21 345.01 483.5 344.77 485.77 344.54 488.06 344.3 490.45 344.06 492.74 343.69 495.01 343.45 497.3 343.21 499.57 342.98 501.86 342.62 504.13 342.38 506.42 342.13 508.69 341.77 510.98 341.54 513.38 341.18 515.65 340.94 517.93 340.57 520.22 340.33 522.5 339.98 524.77 339.62 527.05 339.38 529.34 339.01 531.62 338.65 533.89 338.3 536.29 337.94 538.58 337.57 540.86 337.21 543.13 336.74 545.41 336.38 547.7 336.01 549.98 335.54 552.25 335.06 554.53 334.69 556.82 334.21 559.22 333.74 561.5 333.25 563.77 332.65 566.05 332.18 568.34 331.57 570.62 331.1 572.89 330.38 575.17 329.77 577.46 329.18 579.74 328.45 582.13 327.74 584.41 327.01 586.7 326.18 588.98 325.33 591.25 324.5 593.53 323.54 595.82 322.57 598.1 321.62 600.38 320.54 602.65 319.33 605.05 318.13 607.34 316.81 609.62 315.5 611.89 314.06 614.17 312.5 616.46 310.81 618.74 309.13 621.01 307.21 623.29 305.3 625.58 303.25 627.98 300.98 630.25 298.69 632.53 296.18 634.82 293.42 637.1 290.54 639.38 287.51"/></g><circle class="cls-8" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-9" cx="54.91" cy="363.2" r="1.98"/><circle class="cls-8" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-9" cx="57.2" cy="363.08" r="1.98"/><circle class="cls-8" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-9" cx="59.48" cy="363.08" r="1.98"/><circle class="cls-8" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-9" cx="61.75" cy="363.08" r="1.98"/><circle class="cls-8" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-9" cx="64.03" cy="363.08" r="1.98"/><circle class="cls-8" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-9" cx="66.31" cy="362.96" r="1.98"/><circle class="cls-8" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-9" cx="68.59" cy="362.96" r="1.98"/><circle class="cls-8" cx="71" cy="362.96" r="1.98"/><circle class="cls-9" cx="71" cy="362.96" r="1.98"/><circle class="cls-8" cx="73.28" 
cy="362.84" r="1.98"/><circle class="cls-9" cx="73.28" cy="362.84" r="1.98"/><circle class="cls-8" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-9" cx="75.56" cy="362.84" r="1.98"/><circle class="cls-8" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-9" cx="77.84" cy="362.84" r="1.98"/><circle class="cls-8" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-9" cx="80.12" cy="362.72" r="1.98"/><circle class="cls-8" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-9" cx="82.4" cy="362.72" r="1.98"/><circle class="cls-8" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-9" cx="84.67" cy="362.72" r="1.98"/><circle class="cls-8" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-9" cx="86.95" cy="362.6" r="1.98"/><circle class="cls-8" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-9" cx="89.23" cy="362.6" r="1.98"/><circle class="cls-8" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-9" cx="91.51" cy="362.48" r="1.98"/><circle class="cls-8" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-9" cx="93.79" cy="362.48" r="1.98"/><circle class="cls-8" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-9" cx="96.19" cy="362.48" r="1.98"/><circle class="cls-8" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-9" cx="98.48" cy="362.36" r="1.98"/><circle class="cls-8" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-9" cx="100.76" cy="362.36" r="1.98"/><circle class="cls-8" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-9" cx="103.03" cy="362.24" r="1.98"/><circle class="cls-8" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-9" cx="105.31" cy="362.24" r="1.98"/><circle class="cls-8" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-9" cx="107.59" cy="362.12" r="1.98"/><circle class="cls-8" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-9" cx="109.88" cy="362.12" r="1.98"/><circle class="cls-8" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-9" cx="112.15" cy="362.12" r="1.98"/><circle class="cls-8" cx="114.43" cy="362" 
r="1.98"/><circle class="cls-9" cx="114.43" cy="362" r="1.98"/><circle class="cls-8" cx="116.71" cy="362" r="1.98"/><circle class="cls-9" cx="116.71" cy="362" r="1.98"/><circle class="cls-8" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-9" cx="119.12" cy="361.88" r="1.98"/><circle class="cls-8" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-9" cx="121.4" cy="361.88" r="1.98"/><circle class="cls-8" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-9" cx="123.67" cy="361.76" r="1.98"/><circle class="cls-8" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-9" cx="125.95" cy="361.76" r="1.98"/><circle class="cls-8" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-9" cx="128.23" cy="361.64" r="1.98"/><circle class="cls-8" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-9" cx="130.51" cy="361.64" r="1.98"/><circle class="cls-8" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-9" cx="132.79" cy="361.64" r="1.98"/><circle class="cls-8" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-9" cx="135.07" cy="361.52" r="1.98"/><circle class="cls-8" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-9" cx="137.36" cy="361.52" r="1.98"/><circle class="cls-8" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-9" cx="139.64" cy="361.4" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -233.76, 464.74)"/><circle class="cls-8" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-9" cx="142.03" cy="361.4" r="1.98"/><circle class="cls-8" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-9" cx="144.31" cy="361.28" r="1.98"/><circle class="cls-8" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-9" cx="146.59" cy="361.28" r="1.98"/><circle class="cls-8" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-9" cx="148.88" cy="361.16" r="1.98"/><circle class="cls-8" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-9" cx="151.15" cy="361.16" r="1.98"/><circle class="cls-8" cx="153.43" cy="361.04" 
r="1.98"/><circle class="cls-9" cx="153.43" cy="361.04" r="1.98"/><circle class="cls-8" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-9" cx="155.71" cy="361.04" r="1.98"/><circle class="cls-8" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-9" cx="158" cy="361.04" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -216.85, 482.69)"/><circle class="cls-8" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-9" cx="160.28" cy="360.92" r="1.98"/><circle class="cls-8" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-9" cx="162.56" cy="360.92" r="1.98"/><circle class="cls-8" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-9" cx="164.95" cy="360.8" r="1.98"/><circle class="cls-8" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-9" cx="167.23" cy="360.8" r="1.98"/><circle class="cls-8" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-9" cx="169.51" cy="360.68" r="1.98"/><circle class="cls-8" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-9" cx="171.79" cy="360.68" r="1.98"/><circle class="cls-8" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-9" cx="174.07" cy="360.56" r="1.98"/><circle class="cls-8" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-9" cx="176.36" cy="360.56" r="1.98"/><circle class="cls-8" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-9" cx="178.64" cy="360.56" r="1.98"/><circle class="cls-8" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-9" cx="180.92" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -195.59, 504.95)"/><circle class="cls-8" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-9" cx="183.19" cy="360.44" r="1.98" transform="matrix(0.1, -1, 1, 0.1, -193.54, 507.22)"/><circle class="cls-8" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-9" cx="185.48" cy="360.32" r="1.98"/><circle class="cls-8" cx="187.88" cy="360.32" 
r="1.98"/><circle class="cls-9" cx="187.88" cy="360.32" r="1.98"/><circle class="cls-8" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-9" cx="190.15" cy="360.2" r="1.98"/><circle class="cls-8" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-9" cx="192.43" cy="360.2" r="1.98"/><circle class="cls-8" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-9" cx="194.71" cy="360.08" r="1.98"/><circle class="cls-8" cx="196.99" cy="360.08" r="1.98"/><circle class="cls-9" cx="196.99" cy="360.08" r="1.98"/><path class="cls-8" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-9" d="M201.26,360.08a2,2,0,1,1-2-2A2,2,0,0,1,201.26,360.08Z"/><path class="cls-8" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-9" d="M203.53,360a2,2,0,1,1-2-2A2,2,0,0,1,203.53,360Z"/><path class="cls-8" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-9" d="M205.81,360a2,2,0,1,1-2-2A2,2,0,0,1,205.81,360Z"/><path class="cls-8" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-9" d="M208.09,359.84a2,2,0,1,1-2-2A2,2,0,0,1,208.09,359.84Z"/><path class="cls-8" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-9" d="M210.37,359.84a2,2,0,1,1-2-2A2,2,0,0,1,210.37,359.84Z"/><path class="cls-8" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-9" d="M212.77,359.84a2,2,0,1,1-2-2A2,2,0,0,1,212.77,359.84Z"/><path class="cls-8" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-9" d="M215.05,359.72a2,2,0,1,1-2-2A2,2,0,0,1,215.05,359.72Z"/><path class="cls-8" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-9" d="M217.33,359.72a2,2,0,1,1-2-2A2,2,0,0,1,217.33,359.72Z"/><path class="cls-8" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-9" d="M219.61,359.6a2,2,0,1,1-2-2A2,2,0,0,1,219.61,359.6Z"/><path class="cls-8" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><path 
class="cls-9" d="M221.89,359.6a2,2,0,1,1-2-2A2,2,0,0,1,221.89,359.6Z"/><circle class="cls-8" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-9" cx="222.2" cy="359.48" r="1.98" transform="translate(-34.34 23.64) rotate(-5.65)"/><circle class="cls-8" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-9" cx="224.48" cy="359.48" r="1.98" transform="translate(-34.33 23.87) rotate(-5.65)"/><circle class="cls-8" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-9" cx="226.76" cy="359.48" r="1.98" transform="translate(-34.32 24.09) rotate(-5.65)"/><circle class="cls-8" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-9" cx="229.03" cy="359.36" r="1.98" transform="translate(-145.65 562.34) rotate(-85.93)"/><circle class="cls-8" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><circle class="cls-9" cx="231.31" cy="359.36" r="1.98" transform="translate(-143.53 564.61) rotate(-85.93)"/><path class="cls-8" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-9" d="M235.7,359.24a2,2,0,1,1-2-2A2,2,0,0,1,235.7,359.24Z"/><path class="cls-8" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-9" d="M238,359.24a2,2,0,1,1-2-2A2,2,0,0,1,238,359.24Z"/><path class="cls-8" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-9" d="M240.26,359.12a2,2,0,1,1-2-2A2,2,0,0,1,240.26,359.12Z"/><path class="cls-8" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-9" d="M242.53,359.12a2,2,0,1,1-2-2A2,2,0,0,1,242.53,359.12Z"/><path class="cls-8" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-9" d="M244.81,359.12a2,2,0,1,1-2-2A2,2,0,0,1,244.81,359.12Z"/><path class="cls-8" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path 
class="cls-9" d="M247.09,359a2,2,0,1,1-2-2A2,2,0,0,1,247.09,359Z"/><path class="cls-8" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-9" d="M249.37,359a2,2,0,1,1-2-2A2,2,0,0,1,249.37,359Z"/><path class="cls-8" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-9" d="M251.65,358.88a2,2,0,1,1-2-2A2,2,0,0,1,251.65,358.88Z"/><path class="cls-8" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-9" d="M253.93,358.88a2,2,0,1,1-2-2A2,2,0,0,1,253.93,358.88Z"/><path class="cls-8" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-9" d="M256.21,358.88a2,2,0,1,1-2-2A2,2,0,0,1,256.21,358.88Z"/><path class="cls-8" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-9" d="M258.61,358.76a2,2,0,1,1-2-2A2,2,0,0,1,258.61,358.76Z"/><path class="cls-8" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><path class="cls-9" d="M260.89,358.76a2,2,0,1,1-2-2A2,2,0,0,1,260.89,358.76Z"/><circle class="cls-8" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-9" cx="261.2" cy="358.64" r="1.98"/><circle class="cls-8" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-9" cx="263.48" cy="358.64" r="1.98"/><circle class="cls-8" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-9" cx="265.76" cy="358.64" r="1.98"/><circle class="cls-8" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-9" cx="268.03" cy="358.52" r="1.98" transform="translate(-34.02 28.16) rotate(-5.65)"/><circle class="cls-8" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-9" cx="270.31" cy="358.52" r="1.98" transform="translate(-34.01 28.38) rotate(-5.65)"/><circle class="cls-8" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-9" cx="272.59" cy="358.52" r="1.98" transform="translate(-34 28.61) rotate(-5.65)"/><circle class="cls-8" 
cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><circle class="cls-9" cx="274.87" cy="358.4" r="1.98" transform="translate(-102.1 607.17) rotate(-85.93)"/><path class="cls-8" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-9" d="M279.14,358.28a2,2,0,1,1-2-2A2,2,0,0,1,279.14,358.28Z"/><path class="cls-8" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-9" d="M281.53,358.16a2,2,0,1,1-2-2A2,2,0,0,1,281.53,358.16Z"/><path class="cls-8" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-9" d="M283.81,358.16a2,2,0,1,1-2-2A2,2,0,0,1,283.81,358.16Z"/><path class="cls-8" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-9" d="M286.09,358a2,2,0,1,1-2-2A2,2,0,0,1,286.09,358Z"/><path class="cls-8" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-9" d="M288.37,357.92a2,2,0,1,1-2-2A2,2,0,0,1,288.37,357.92Z"/><path class="cls-8" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-9" d="M290.65,357.8a2,2,0,1,1-2-2A2,2,0,0,1,290.65,357.8Z"/><path class="cls-8" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-9" d="M292.93,357.8a2,2,0,1,1-2-2A2,2,0,0,1,292.93,357.8Z"/><path class="cls-8" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-9" d="M295.21,357.68a2,2,0,1,1-2-2A2,2,0,0,1,295.21,357.68Z"/><path class="cls-8" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-9" d="M297.49,357.56a2,2,0,1,1-2-2A2,2,0,0,1,297.49,357.56Z"/><path class="cls-8" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-9" d="M299.77,357.44a2,2,0,1,1-2-2A2,2,0,0,1,299.77,357.44Z"/><path class="cls-8" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-9" d="M302.05,357.44a2,2,0,1,1-2-2A2,2,0,0,1,302.05,357.44Z"/><path class="cls-8" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><path 
class="cls-9" d="M304.33,357.32a2,2,0,1,1-2-2A2,2,0,0,1,304.33,357.32Z"/><circle class="cls-8" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-9" cx="304.76" cy="357.2" r="1.98" transform="translate(-33.71 31.77) rotate(-5.65)"/><circle class="cls-8" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-9" cx="307.03" cy="357.2" r="1.98" transform="translate(-33.7 31.99) rotate(-5.65)"/><circle class="cls-8" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-9" cx="309.31" cy="357.08" r="1.98" transform="translate(-68.79 640.3) rotate(-85.93)"/><circle class="cls-8" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-9" cx="311.59" cy="356.96" r="1.98"/><circle class="cls-8" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><circle class="cls-9" cx="313.87" cy="356.84" r="1.98" transform="translate(-33.63 32.67) rotate(-5.65)"/><path class="cls-8" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-9" d="M318.14,356.84a2,2,0,1,1-2-2A2,2,0,0,1,318.14,356.84Z"/><path class="cls-8" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-9" d="M320.42,356.72a2,2,0,1,1-2-2A2,2,0,0,1,320.42,356.72Z"/><path class="cls-8" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-9" d="M322.7,356.6a2,2,0,1,1-2-2A2,2,0,0,1,322.7,356.6Z"/><path class="cls-8" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-9" d="M325,356.6a2,2,0,1,1-2-2A2,2,0,0,1,325,356.6Z"/><path class="cls-8" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-9" d="M327.26,356.48a2,2,0,1,1-2-2A2,2,0,0,1,327.26,356.48Z"/><path class="cls-8" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-9" d="M329.65,356.36a2,2,0,1,1-2-2A2,2,0,0,1,329.65,356.36Z"/><path class="cls-8" 
d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-9" d="M331.93,356.24a2,2,0,1,1-2-2A2,2,0,0,1,331.93,356.24Z"/><path class="cls-8" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-9" d="M334.21,356.12a2,2,0,1,1-2-2A2,2,0,0,1,334.21,356.12Z"/><path class="cls-8" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-9" d="M336.49,356a2,2,0,1,1-2-2A2,2,0,0,1,336.49,356Z"/><path class="cls-8" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-9" d="M338.77,355.88a2,2,0,1,1-2-2A2,2,0,0,1,338.77,355.88Z"/><path class="cls-8" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-9" d="M341.05,355.76a2,2,0,1,1-2-2A2,2,0,0,1,341.05,355.76Z"/><path class="cls-8" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-9" d="M343.33,355.64a2,2,0,1,1-2-2A2,2,0,0,1,343.33,355.64Z"/><path class="cls-8" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-9" d="M345.61,355.52a2,2,0,1,1-2-2A2,2,0,0,1,345.61,355.52Z"/><path class="cls-8" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><path class="cls-9" d="M347.89,355.4a2,2,0,1,1-2-2A2,2,0,0,1,347.89,355.4Z"/><circle class="cls-8" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-9" cx="348.2" cy="355.28" r="1.98"/><circle class="cls-8" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-9" cx="350.59" cy="355.16" r="1.98" transform="translate(-33.29 36.28) rotate(-5.65)"/><circle class="cls-8" cx="352.87" cy="355.04" r="1.98"/><circle class="cls-9" cx="352.87" cy="355.04" r="1.98"/><path class="cls-8" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-9" d="M357.14,354.92a2,2,0,1,1-2-2A2,2,0,0,1,357.14,354.92Z"/><path class="cls-8" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-9" d="M359.42,354.8a2,2,0,1,1-2-2A2,2,0,0,1,359.42,354.8Z"/><path class="cls-8" 
d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-9" d="M361.7,354.68a2,2,0,1,1-2-2A2,2,0,0,1,361.7,354.68Z"/><path class="cls-8" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-9" d="M364,354.56a2,2,0,1,1-2-2A2,2,0,0,1,364,354.56Z"/><path class="cls-8" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-9" d="M366.26,354.44a2,2,0,1,1-2-2A2,2,0,0,1,366.26,354.44Z"/><path class="cls-8" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-9" d="M368.53,354.44a2,2,0,1,1-2-2A2,2,0,0,1,368.53,354.44Z"/><path class="cls-8" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-9" d="M370.81,354.2a2,2,0,1,1-2-2A2,2,0,0,1,370.81,354.2Z"/><path class="cls-8" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-9" d="M373.09,354.08a2,2,0,1,1-2-2A2,2,0,0,1,373.09,354.08Z"/><path class="cls-8" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-9" d="M375.49,354a2,2,0,1,1-2-2A2,2,0,0,1,375.49,354Z"/><path class="cls-8" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-9" d="M377.77,353.84a2,2,0,1,1-2-2A2,2,0,0,1,377.77,353.84Z"/><path class="cls-8" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-9" d="M380.05,353.6a2,2,0,1,1-2-2A2,2,0,0,1,380.05,353.6Z"/><path class="cls-8" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-9" d="M382.33,353.48a2,2,0,1,1-2-2A2,2,0,0,1,382.33,353.48Z"/><path class="cls-8" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-9" d="M384.61,353.36a2,2,0,1,1-2-2A2,2,0,0,1,384.61,353.36Z"/><path class="cls-8" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><path class="cls-9" d="M386.89,353.24a2,2,0,1,1-2-2A2,2,0,0,1,386.89,353.24Z"/><circle class="cls-8" cx="387.2" cy="353.12" r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-9" cx="387.2" cy="353.12" 
r="1.98" transform="translate(-23.38 678.75) rotate(-80.78)"/><circle class="cls-8" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-9" cx="389.48" cy="352.88" r="1.98" transform="translate(-32.88 40.1) rotate(-5.65)"/><circle class="cls-8" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-9" cx="391.76" cy="352.76" r="1.98"/><circle class="cls-8" cx="394.03" cy="352.64" r="1.98"/><circle class="cls-9" cx="394.03" cy="352.64" r="1.98"/><path class="cls-8" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-9" d="M398.42,352.52a2,2,0,1,1-2-2A2,2,0,0,1,398.42,352.52Z"/><path class="cls-8" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-9" d="M400.7,352.28a2,2,0,1,1-2-2A2,2,0,0,1,400.7,352.28Z"/><path class="cls-8" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-9" d="M403,352.16a2,2,0,1,1-2-2A2,2,0,0,1,403,352.16Z"/><path class="cls-8" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-9" d="M405.26,351.92a2,2,0,1,1-2-2A2,2,0,0,1,405.26,351.92Z"/><path class="cls-8" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-9" d="M407.53,351.8a2,2,0,1,1-2-2A2,2,0,0,1,407.53,351.8Z"/><path class="cls-8" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-9" d="M409.81,351.68a2,2,0,1,1-2-2A2,2,0,0,1,409.81,351.68Z"/><path class="cls-8" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-9" d="M412.09,351.44a2,2,0,1,1-2-2A2,2,0,0,1,412.09,351.44Z"/><path class="cls-8" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-9" d="M414.37,351.32a2,2,0,1,1-2-2A2,2,0,0,1,414.37,351.32Z"/><path class="cls-8" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-9" d="M416.65,351.08a2,2,0,1,1-2-2A2,2,0,0,1,416.65,351.08Z"/><path class="cls-8" d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-9" 
d="M418.93,351a2,2,0,1,1-2-2A2,2,0,0,1,418.93,351Z"/><path class="cls-8" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-9" d="M421.33,350.72a2,2,0,1,1-2-2A2,2,0,0,1,421.33,350.72Z"/><path class="cls-8" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-9" d="M423.61,350.6a2,2,0,1,1-2-2A2,2,0,0,1,423.61,350.6Z"/><path class="cls-8" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><path class="cls-9" d="M425.89,350.36a2,2,0,1,1-2-2A2,2,0,0,1,425.89,350.36Z"/><circle class="cls-8" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-9" cx="426.2" cy="350.24" r="1.98"/><circle class="cls-8" cx="428.48" cy="350" r="1.98"/><circle class="cls-9" cx="428.48" cy="350" r="1.98"/><circle class="cls-8" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-9" cx="430.76" cy="349.76" r="1.98"/><circle class="cls-8" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-9" cx="433.03" cy="349.64" r="1.98"/><circle class="cls-8" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-9" cx="435.31" cy="349.4" r="1.98"/><circle class="cls-8" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-9" cx="437.59" cy="349.28" r="1.98" transform="translate(22.73 725.27) rotate(-80.78)"/><circle class="cls-8" cx="439.87" cy="349.04" r="1.98"/><circle class="cls-9" cx="439.87" cy="349.04" r="1.98"/><path class="cls-8" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-9" d="M444.26,348.8a2,2,0,1,1-2-2A2,2,0,0,1,444.26,348.8Z"/><path class="cls-8" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-9" d="M446.53,348.68a2,2,0,1,1-2-2A2,2,0,0,1,446.53,348.68Z"/><path class="cls-8" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-9" d="M448.81,348.44a2,2,0,1,1-2-2A2,2,0,0,1,448.81,348.44Z"/><path class="cls-8" d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-9" 
d="M451.09,348.2a2,2,0,1,1-2-2A2,2,0,0,1,451.09,348.2Z"/><path class="cls-8" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-9" d="M453.38,348a2,2,0,1,1-2-2A2,2,0,0,1,453.38,348Z"/><path class="cls-8" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-9" d="M455.65,347.84a2,2,0,1,1-2-2A2,2,0,0,1,455.65,347.84Z"/><path class="cls-8" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-9" d="M457.94,347.6a2,2,0,1,1-2-2A2,2,0,0,1,457.94,347.6Z"/><path class="cls-8" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-9" d="M460.21,347.36a2,2,0,1,1-2-2A2,2,0,0,1,460.21,347.36Z"/><path class="cls-8" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><path class="cls-9" d="M462.5,347.12a2,2,0,1,1-2-2A2,2,0,0,1,462.5,347.12Z"/><circle class="cls-8" cx="462.8" cy="346.88" r="1.98"/><circle class="cls-9" cx="462.8" cy="346.88" r="1.98"/><path class="cls-8" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><path class="cls-9" d="M467.18,346.64a2,2,0,1,1-2-2A2,2,0,0,1,467.18,346.64Z"/><circle class="cls-8" cx="467.48" cy="346.4" r="1.98"/><circle class="cls-9" cx="467.48" cy="346.4" r="1.98"/><path class="cls-8" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><path class="cls-9" d="M471.74,346.16a2,2,0,1,1-2-2A2,2,0,0,1,471.74,346.16Z"/><circle class="cls-8" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><circle class="cls-9" cx="472.03" cy="345.92" r="1.98" transform="translate(-31.79 48.2) rotate(-5.65)"/><path class="cls-8" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><path class="cls-9" d="M476.3,345.68a2,2,0,1,1-2-2A2,2,0,0,1,476.3,345.68Z"/><circle class="cls-8" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><circle class="cls-9" cx="476.59" cy="345.44" r="1.98" transform="translate(98.24 796.34) rotate(-85.93)"/><path class="cls-8" 
d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><path class="cls-9" d="M480.86,345.2a2,2,0,1,1-2-2A2,2,0,0,1,480.86,345.2Z"/><circle class="cls-8" cx="481.15" cy="344.96" r="1.98"/><circle class="cls-9" cx="481.15" cy="344.96" r="1.98"/><path class="cls-8" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><path class="cls-9" d="M485.42,344.72a2,2,0,1,1-2-2A2,2,0,0,1,485.42,344.72Z"/><circle class="cls-8" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><circle class="cls-9" cx="485.71" cy="344.48" r="1.98" transform="translate(-31.58 49.54) rotate(-5.65)"/><path class="cls-8" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-9" d="M490,344.24a2,2,0,1,1-2-2A2,2,0,0,1,490,344.24Z"/><path class="cls-8" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-9" d="M492.38,344a2,2,0,1,1-2-2A2,2,0,0,1,492.38,344Z"/><path class="cls-8" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-9" d="M494.65,343.64a2,2,0,1,1-2-2A2,2,0,0,1,494.65,343.64Z"/><path class="cls-8" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-9" d="M496.94,343.4a2,2,0,1,1-2-2A2,2,0,0,1,496.94,343.4Z"/><path class="cls-8" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-9" d="M499.21,343.16a2,2,0,1,1-2-2A2,2,0,0,1,499.21,343.16Z"/><path class="cls-8" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><path class="cls-9" d="M501.5,342.92a2,2,0,1,1-2-2A2,2,0,0,1,501.5,342.92Z"/><circle class="cls-8" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><circle class="cls-9" cx="501.8" cy="342.56" r="1.98" transform="translate(-31.31 51.11) rotate(-5.65)"/><path class="cls-8" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><path class="cls-9" d="M506.06,342.32a2,2,0,1,1-2-2A2,2,0,0,1,506.06,342.32Z"/><circle class="cls-8" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) 
rotate(-85.93)"/><circle class="cls-9" cx="506.36" cy="342.08" r="1.98" transform="translate(129.25 822.91) rotate(-85.93)"/><path class="cls-8" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><path class="cls-9" d="M510.62,341.72a2,2,0,1,1-2-2A2,2,0,0,1,510.62,341.72Z"/><circle class="cls-8" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><circle class="cls-9" cx="510.92" cy="341.48" r="1.98" transform="translate(-31.16 52.01) rotate(-5.65)"/><path class="cls-8" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><path class="cls-9" d="M515.3,341.12a2,2,0,1,1-2-2A2,2,0,0,1,515.3,341.12Z"/><circle class="cls-8" cx="515.59" cy="340.88" r="1.98"/><circle class="cls-9" cx="515.59" cy="340.88" r="1.98"/><path class="cls-8" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><path class="cls-9" d="M519.86,340.52a2,2,0,1,1-2-2A2,2,0,0,1,519.86,340.52Z"/><circle class="cls-8" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><circle class="cls-9" cx="520.15" cy="340.28" r="1.98" transform="translate(-31 52.91) rotate(-5.65)"/><path class="cls-8" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><path class="cls-9" d="M524.42,339.92a2,2,0,1,1-2-2A2,2,0,0,1,524.42,339.92Z"/><circle class="cls-8" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><circle class="cls-9" cx="524.71" cy="339.56" r="1.98" transform="translate(-30.91 53.36) rotate(-5.65)"/><path class="cls-8" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><path class="cls-9" d="M529,339.32a2,2,0,1,1-2-2A2,2,0,0,1,529,339.32Z"/><circle class="cls-8" cx="529.27" cy="338.96" r="1.98"/><circle class="cls-9" cx="529.27" cy="338.96" r="1.98"/><path class="cls-8" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-9" d="M533.53,338.6a2,2,0,1,1-2-2A2,2,0,0,1,533.53,338.6Z"/><path class="cls-8" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path 
class="cls-9" d="M535.82,338.24a2,2,0,1,1-2-2A2,2,0,0,1,535.82,338.24Z"/><path class="cls-8" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-9" d="M538.21,337.88a2,2,0,1,1-2-2A2,2,0,0,1,538.21,337.88Z"/><path class="cls-8" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><path class="cls-9" d="M540.5,337.52a2,2,0,1,1-2-2A2,2,0,0,1,540.5,337.52Z"/><circle class="cls-8" cx="540.8" cy="337.16" r="1.98"/><circle class="cls-9" cx="540.8" cy="337.16" r="1.98"/><path class="cls-8" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><path class="cls-9" d="M545.06,336.68a2,2,0,1,1-2-2A2,2,0,0,1,545.06,336.68Z"/><circle class="cls-8" cx="545.36" cy="336.32" r="1.98"/><circle class="cls-9" cx="545.36" cy="336.32" r="1.98"/><path class="cls-8" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><path class="cls-9" d="M549.62,336a2,2,0,1,1-2-2A2,2,0,0,1,549.62,336Z"/><circle class="cls-8" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><circle class="cls-9" cx="549.92" cy="335.48" r="1.98" transform="translate(-30.38 55.82) rotate(-5.65)"/><path class="cls-8" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><path class="cls-9" d="M554.18,335a2,2,0,1,1-2-2A2,2,0,0,1,554.18,335Z"/><circle class="cls-8" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><circle class="cls-9" cx="554.48" cy="334.64" r="1.98" transform="translate(181.38 863.99) rotate(-85.93)"/><path class="cls-8" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><path class="cls-9" d="M558.74,334.16a2,2,0,1,1-2-2A2,2,0,0,1,558.74,334.16Z"/><circle class="cls-8" cx="559.15" cy="333.68" r="1.98"/><circle class="cls-9" cx="559.15" cy="333.68" r="1.98"/><path class="cls-8" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><path class="cls-9" d="M563.42,333.2a2,2,0,1,1-2-2A2,2,0,0,1,563.42,333.2Z"/><circle class="cls-8" cx="563.71" cy="332.6" r="1.98"/><circle class="cls-9" cx="563.71" 
cy="332.6" r="1.98"/><path class="cls-8" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><path class="cls-9" d="M568,332.12a2,2,0,1,1-2-2A2,2,0,0,1,568,332.12Z"/><circle class="cls-8" cx="568.27" cy="331.52" r="1.98"/><circle class="cls-9" cx="568.27" cy="331.52" r="1.98"/><path class="cls-8" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-9" d="M572.53,331a2,2,0,1,1-2-2A2,2,0,0,1,572.53,331Z"/><path class="cls-8" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-9" d="M574.82,330.32a2,2,0,1,1-2-2A2,2,0,0,1,574.82,330.32Z"/><path class="cls-8" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-9" d="M577.09,329.72a2,2,0,1,1-2-2A2,2,0,0,1,577.09,329.72Z"/><path class="cls-8" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-9" d="M579.38,329.12a2,2,0,1,1-2-2A2,2,0,0,1,579.38,329.12Z"/><path class="cls-8" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-9" d="M581.65,328.4a2,2,0,1,1-2-2A2,2,0,0,1,581.65,328.4Z"/><path class="cls-8" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><path class="cls-9" d="M584.06,327.68a2,2,0,1,1-2-2A2,2,0,0,1,584.06,327.68Z"/><circle class="cls-8" cx="584.36" cy="326.96" r="1.98"/><circle class="cls-9" cx="584.36" cy="326.96" r="1.98"/><path class="cls-8" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><path class="cls-9" d="M588.62,326.12a2,2,0,1,1-2-2A2,2,0,0,1,588.62,326.12Z"/><circle class="cls-8" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><circle class="cls-9" cx="588.92" cy="325.28" r="1.98" transform="translate(-29.19 59.61) rotate(-5.65)"/><path class="cls-8" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><path class="cls-9" d="M593.18,324.44a2,2,0,1,1-2-2A2,2,0,0,1,593.18,324.44Z"/><circle class="cls-8" cx="593.48" cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><circle class="cls-9" cx="593.48" 
cy="323.48" r="1.98" transform="translate(-28.99 60.05) rotate(-5.65)"/><path class="cls-8" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><path class="cls-9" d="M597.74,322.52a2,2,0,1,1-2-2A2,2,0,0,1,597.74,322.52Z"/><circle class="cls-8" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><circle class="cls-9" cx="598.03" cy="321.56" r="1.98" transform="translate(-28.77 60.49) rotate(-5.65)"/><path class="cls-8" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><path class="cls-9" d="M602.3,320.48a2,2,0,1,1-2-2A2,2,0,0,1,602.3,320.48Z"/><circle class="cls-8" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><circle class="cls-9" cx="602.59" cy="319.28" r="1.98" transform="translate(-28.53 60.93) rotate(-5.65)"/><path class="cls-8" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><path class="cls-9" d="M607,318.08a2,2,0,1,1-2-2A2,2,0,0,1,607,318.08Z"/><circle class="cls-8" cx="607.27" cy="316.76" r="1.98"/><circle class="cls-9" cx="607.27" cy="316.76" r="1.98"/><path class="cls-8" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-9" d="M611.53,315.44a2,2,0,1,1-2-2A2,2,0,0,1,611.53,315.44Z"/><path class="cls-8" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-9" d="M613.82,314a2,2,0,1,1-2-2A2,2,0,0,1,613.82,314Z"/><path class="cls-8" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-9" d="M616.09,312.44a2,2,0,1,1-2-2A2,2,0,0,1,616.09,312.44Z"/><path class="cls-8" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-9" d="M618.38,310.76a2,2,0,1,1-2-2A2,2,0,0,1,618.38,310.76Z"/><path class="cls-8" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-9" d="M620.65,309.08a2,2,0,1,1-2-2A2,2,0,0,1,620.65,309.08Z"/><path class="cls-8" d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-9" 
d="M622.94,307.16a2,2,0,1,1-2-2A2,2,0,0,1,622.94,307.16Z"/><path class="cls-8" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-9" d="M625.21,305.24a2,2,0,1,1-2-2A2,2,0,0,1,625.21,305.24Z"/><path class="cls-8" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><path class="cls-9" d="M627.5,303.2a2,2,0,1,1-2-2A2,2,0,0,1,627.5,303.2Z"/><circle class="cls-8" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><circle class="cls-9" cx="627.92" cy="300.92" r="1.98" transform="translate(-26.6 63.34) rotate(-5.65)"/><path class="cls-8" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><path class="cls-9" d="M632.18,298.64a2,2,0,1,1-2-2A2,2,0,0,1,632.18,298.64Z"/><circle class="cls-8" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><circle class="cls-9" cx="632.48" cy="296.12" r="1.98" transform="translate(-26.1 63.76) rotate(-5.65)"/><path class="cls-8" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><path class="cls-9" d="M636.74,293.36a2,2,0,1,1-2-2A2,2,0,0,1,636.74,293.36Z"/><circle class="cls-8" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><circle class="cls-9" cx="637.03" cy="290.48" r="1.98" transform="translate(-25.52 64.19) rotate(-5.65)"/><path class="cls-8" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><path class="cls-9" d="M641.3,287.48a2,2,0,1,1-2-2A2,2,0,0,1,641.3,287.48Z"/><g class="cls-3"><polyline class="cls-10" points="54.95 363.24 57.26 363.13 59.53 363.01 61.81 362.89 64.09 362.77 66.38 362.65 68.66 362.54 71.06 362.42 73.33 362.3 75.61 362.18 77.89 362.06 80.17 361.81 82.45 361.69 84.73 361.45 87.02 361.33 89.3 361.1 91.58 360.98 93.86 360.74 96.25 360.5 98.53 360.38 100.81 360.13 103.09 359.89 105.38 359.77 107.66 359.54 109.94 359.3 112.22 359.06 114.5 358.81 116.78 358.69 119.17 358.45 121.45 358.21 123.73 357.98 126.02 357.74 128.29 357.5 130.57 357.38 132.85 357.13 135.13 356.89 
137.41 356.65 139.69 356.42 142.09 356.18 144.38 355.94 146.66 355.69 148.94 355.57 151.22 355.33 153.5 355.1 155.78 354.86 158.06 354.62 160.34 354.38 162.62 354.13 165.01 353.89 167.29 353.77 169.57 353.54 171.85 353.3 174.13 353.06 176.41 352.81 178.69 352.69 180.97 352.45 183.25 352.21 185.53 351.98 187.94 351.74 190.22 351.5 192.5 351.38 194.78 351.13 197.06 350.89 199.34 350.65 201.62 350.54 203.9 350.3 206.18 350.06 208.46 349.81 210.85 349.57 213.13 349.45 215.41 349.21 217.69 348.98 219.97 348.86 222.25 348.62 224.53 348.38 226.81 348.25 229.09 348.01 231.38 347.77 233.78 347.54 236.06 347.42 238.34 347.18 240.62 346.94 242.9 346.81 245.18 346.57 247.46 346.45 249.74 346.21 252.01 345.98 254.29 345.86 256.69 345.62 258.98 345.38 261.25 345.25 263.54 345.01 265.81 344.89 268.1 344.65 270.38 344.42 272.65 344.3 274.94 343.94 277.21 343.57 279.62 343.21 281.89 342.86 284.18 342.5 286.45 342.13 288.74 341.77 291.01 341.42 293.3 341.06 295.57 340.69 297.86 340.33 300.13 339.98 302.42 339.62 304.81 339.38 307.1 339.01 309.38 338.65 311.65 338.3 313.94 337.94 316.21 337.57 318.5 337.21 320.77 336.86 323.06 336.62 325.33 336.25 327.74 335.89 330.01 335.54 332.3 335.06 334.57 334.57 336.86 334.1 339.13 333.62 341.42 333.13 343.69 332.65 345.98 332.18 348.25 331.69 350.65 331.21 352.94 330.74 355.21 330.25 357.5 329.77 359.77 329.3 362.06 328.81 364.33 328.33 366.62 327.86 368.89 327.38 371.18 326.77 373.57 326.18 375.86 325.57 378.13 324.98 380.42 324.38 382.69 323.77 384.98 323.18 387.25 322.57 389.54 321.98 391.81 321.38 394.1 320.89 396.5 320.3 398.77 319.69 401.06 318.98 403.33 318.25 405.62 317.54 407.89 316.94 410.18 316.21 412.45 315.5 414.74 314.77 417.01 314.18 419.42 313.45 421.69 312.74 423.98 312.01 426.25 311.18 428.54 310.33 430.81 309.62 433.1 308.77 435.38 308.06 437.65 307.33 439.94 306.5 442.33 305.77 444.62 304.94 446.89 304.1 449.18 303.13 451.45 302.3 453.74 301.45 456.01 300.62 458.3 299.77 460.57 298.81 462.86 297.98 465.25 297.01 467.54 
296.06 469.81 295.21 472.1 294.25 474.38 293.3 476.65 292.21 478.94 291.25 481.21 290.3 483.5 289.33 485.77 288.25 488.06 287.3 490.45 286.21 492.74 285.13 495.01 284.06 497.3 282.98 499.57 281.89 501.86 280.69 504.13 279.62 506.42 278.54 508.69 277.33 510.98 276.13 513.38 274.94 515.65 273.74 517.93 272.54 520.22 271.21 522.5 269.89 524.77 268.69 527.05 267.38 529.34 265.94 531.62 264.62 533.89 263.18 536.29 261.74 538.58 260.3 540.86 258.74 543.13 257.3 545.41 255.62 547.7 253.94 549.98 252.25 552.25 250.57 554.53 248.78 556.82 246.85 559.22 244.94 561.5 242.9 563.77 240.97 566.05 238.69 568.34 236.53 570.62 234.25 572.89 231.85 575.17 229.34 577.46 226.69 579.74 223.94 582.13 221.18 584.41 218.06 586.7 214.94 588.98 211.57 591.25 208.22 593.53 204.38 595.82 200.53 598.1 196.46 600.38 192.13 602.65 187.46 605.05 182.66 607.34 177.38 609.62 171.97 611.89 166.09 614.17 159.97 616.46 153.5 618.74 146.66 621.01 139.22 623.29 131.53 625.58 123.14 627.98 114.25 630.25 104.89 632.53 94.81 634.82 83.89 637.1 72.5 639.38 60.2"/></g><circle class="cls-11" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-12" cx="54.98" cy="363.26" r="2.52"/><circle class="cls-11" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-12" cx="57.25" cy="363.14" r="2.52"/><circle class="cls-11" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-12" cx="59.53" cy="363.02" r="2.52"/><circle class="cls-11" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-12" cx="61.82" cy="362.9" r="2.52"/><circle class="cls-11" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-12" cx="64.09" cy="362.78" r="2.52"/><circle class="cls-11" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-12" cx="66.37" cy="362.66" r="2.52"/><circle class="cls-11" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-12" cx="68.65" cy="362.54" r="2.52"/><circle class="cls-11" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-12" cx="71.06" cy="362.42" r="2.52"/><circle class="cls-11" cx="73.34" cy="362.3" r="2.52"/><circle 
class="cls-12" cx="73.34" cy="362.3" r="2.52"/><circle class="cls-11" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-12" cx="75.62" cy="362.18" r="2.52"/><circle class="cls-11" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-12" cx="77.9" cy="362.06" r="2.52"/><circle class="cls-11" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-12" cx="80.17" cy="361.82" r="2.52"/><circle class="cls-11" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-12" cx="82.45" cy="361.7" r="2.52"/><circle class="cls-11" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-12" cx="84.73" cy="361.46" r="2.52"/><circle class="cls-11" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-12" cx="87.01" cy="361.34" r="2.52" transform="translate(-281.14 412.32) rotate(-84.35)"/><circle class="cls-11" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-12" cx="89.29" cy="361.1" r="2.52" transform="translate(-278.84 414.37) rotate(-84.35)"/><circle class="cls-11" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-12" cx="91.57" cy="360.98" r="2.52" transform="translate(-276.67 416.53) rotate(-84.35)"/><circle class="cls-11" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-12" cx="93.86" cy="360.74" r="2.52"/><circle class="cls-11" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-12" cx="96.26" cy="360.5" r="2.52"/><circle class="cls-11" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-12" cx="98.53" cy="360.38" r="2.52"/><circle class="cls-11" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-12" cx="100.81" cy="360.14" r="2.52"/><circle class="cls-11" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-12" cx="103.09" cy="359.9" r="2.52"/><circle class="cls-11" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-12" cx="105.38" cy="359.78" r="2.52"/><circle class="cls-11" cx="107.65" cy="359.54" r="2.52"/><circle 
class="cls-12" cx="107.65" cy="359.54" r="2.52"/><circle class="cls-11" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-12" cx="109.93" cy="359.3" r="2.52" transform="translate(-258.44 433.29) rotate(-84.35)"/><circle class="cls-11" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-12" cx="112.21" cy="359.06" r="2.52" transform="translate(-256.15 435.34) rotate(-84.35)"/><circle class="cls-11" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-12" cx="114.5" cy="358.82" r="2.52"/><circle class="cls-11" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-12" cx="116.78" cy="358.7" r="2.52"/><circle class="cls-11" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-12" cx="119.17" cy="358.46" r="2.52"/><circle class="cls-11" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-12" cx="121.45" cy="358.22" r="2.52"/><circle class="cls-11" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-12" cx="123.73" cy="357.98" r="2.52"/><circle class="cls-11" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-12" cx="126.01" cy="357.74" r="2.52" transform="translate(-242.4 447.89) rotate(-84.35)"/><circle class="cls-11" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-12" cx="128.29" cy="357.5" r="2.52"/><circle class="cls-11" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-12" cx="130.57" cy="357.38" r="2.52" transform="translate(-237.93 452.1) rotate(-84.35)"/><circle class="cls-11" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-12" cx="132.86" cy="357.14" r="2.52"/><circle class="cls-11" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-12" cx="135.14" cy="356.9" r="2.52"/><circle class="cls-11" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-12" cx="137.42" cy="356.66" r="2.52"/><circle class="cls-11" cx="139.69" cy="356.42" 
r="2.52"/><circle class="cls-12" cx="139.69" cy="356.42" r="2.52"/><circle class="cls-11" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-12" cx="142.09" cy="356.18" r="2.52"/><circle class="cls-11" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-12" cx="144.38" cy="355.94" r="2.52"/><circle class="cls-11" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-12" cx="146.65" cy="355.7" r="2.52" transform="translate(-221.76 466.59) rotate(-84.35)"/><circle class="cls-11" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-12" cx="148.93" cy="355.58" r="2.52"/><circle class="cls-11" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-12" cx="151.21" cy="355.34" r="2.52" transform="translate(-217.29 470.8) rotate(-84.35)"/><circle class="cls-11" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-12" cx="153.5" cy="355.1" r="2.52"/><circle class="cls-11" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-12" cx="155.78" cy="354.86" r="2.52"/><circle class="cls-11" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-12" cx="158.06" cy="354.62" r="2.52"/><circle class="cls-11" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-12" cx="160.34" cy="354.38" r="2.52"/><circle class="cls-11" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-12" cx="162.62" cy="354.14" r="2.52"/><circle class="cls-11" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-12" cx="165.01" cy="353.9" r="2.52"/><circle class="cls-11" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-12" cx="167.29" cy="353.78" r="2.52"/><circle class="cls-11" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-12" cx="169.57" cy="353.54" r="2.52"/><circle class="cls-11" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-12" cx="171.86" cy="353.3" r="2.52"/><circle class="cls-11" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-12" cx="174.14" cy="353.06" r="2.52"/><circle class="cls-11" 
cx="176.42" cy="352.82" r="2.52"/><circle class="cls-12" cx="176.42" cy="352.82" r="2.52"/><circle class="cls-11" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-12" cx="178.69" cy="352.7" r="2.52"/><circle class="cls-11" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-12" cx="180.98" cy="352.46" r="2.52"/><circle class="cls-11" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-12" cx="183.26" cy="352.22" r="2.52"/><circle class="cls-11" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-12" cx="185.53" cy="351.98" r="2.52"/><circle class="cls-11" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-12" cx="187.93" cy="351.74" r="2.52"/><circle class="cls-11" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-12" cx="190.21" cy="351.5" r="2.52" transform="translate(-195.58 455.86) rotate(-76.72)"/><circle class="cls-11" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-12" cx="192.5" cy="351.38" r="2.52"/><circle class="cls-11" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-12" cx="194.78" cy="351.14" r="2.52"/><circle class="cls-11" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-12" cx="197.05" cy="350.9" r="2.52"/><circle class="cls-11" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-12" cx="199.33" cy="350.66" r="2.52"/><circle class="cls-11" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-12" cx="201.61" cy="350.54" r="2.52"/><circle class="cls-11" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-12" cx="203.89" cy="350.3" r="2.52"/><circle class="cls-11" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-12" cx="206.18" cy="350.06" r="2.52" transform="translate(-162.49 520.73) rotate(-84.34)"/><circle class="cls-11" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-12" cx="208.46" cy="349.82" r="2.52"/><circle class="cls-11" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) 
rotate(-9.22)"/><circle class="cls-12" cx="210.85" cy="349.58" r="2.52" transform="translate(-53.27 38.29) rotate(-9.22)"/><circle class="cls-11" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-12" cx="213.13" cy="349.46" r="2.52"/><circle class="cls-11" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-12" cx="215.41" cy="349.22" r="2.52" transform="translate(-53.16 39.02) rotate(-9.22)"/><circle class="cls-11" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-12" cx="217.7" cy="348.98" r="2.52"/><circle class="cls-11" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-12" cx="219.98" cy="348.86" r="2.52" transform="translate(-159.61 510.11) rotate(-80.78)"/><circle class="cls-11" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-12" cx="222.26" cy="348.62" r="2.52"/><circle class="cls-11" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-12" cx="224.53" cy="348.38" r="2.52"/><circle class="cls-11" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-12" cx="226.81" cy="348.26" r="2.52"/><circle class="cls-11" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-12" cx="229.09" cy="348.02" r="2.52"/><circle class="cls-11" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-12" cx="231.37" cy="347.78" r="2.52"/><circle class="cls-11" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-12" cx="233.77" cy="347.54" r="2.52"/><circle class="cls-11" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-12" cx="236.05" cy="347.42" r="2.52"/><circle class="cls-11" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-12" cx="238.33" cy="347.18" r="2.52"/><circle class="cls-11" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-12" cx="240.61" cy="346.94" r="2.52"/><circle class="cls-11" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-12" cx="242.89" cy="346.82" r="2.52"/><circle class="cls-11" cx="245.18" cy="346.58" r="2.52" 
transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-12" cx="245.18" cy="346.58" r="2.52" transform="translate(-52.35 43.75) rotate(-9.22)"/><circle class="cls-11" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-12" cx="247.46" cy="346.46" r="2.52"/><circle class="cls-11" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-12" cx="249.74" cy="346.22" r="2.52" transform="translate(-52.23 44.47) rotate(-9.22)"/><circle class="cls-11" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-12" cx="252.02" cy="345.98" r="2.52" transform="translate(-117.11 562.67) rotate(-84.34)"/><circle class="cls-11" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-12" cx="254.3" cy="345.86" r="2.52" transform="translate(-52.12 45.2) rotate(-9.22)"/><circle class="cls-11" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-12" cx="256.7" cy="345.62" r="2.52"/><circle class="cls-11" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-12" cx="258.98" cy="345.38" r="2.52"/><circle class="cls-11" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-12" cx="261.26" cy="345.26" r="2.52"/><circle class="cls-11" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-12" cx="263.53" cy="345.02" r="2.52"/><circle class="cls-11" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-12" cx="265.81" cy="344.9" r="2.52"/><circle class="cls-11" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-12" cx="268.09" cy="344.66" r="2.52"/><circle class="cls-11" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-12" cx="270.37" cy="344.42" r="2.52"/><circle class="cls-11" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-12" cx="272.66" cy="344.3" r="2.52"/><circle class="cls-11" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-12" cx="274.94" cy="343.94" r="2.52"/><circle class="cls-11" cx="277.22" cy="343.58" 
r="2.52"/><circle class="cls-12" cx="277.22" cy="343.58" r="2.52"/><circle class="cls-11" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-12" cx="279.61" cy="343.22" r="2.52"/><circle class="cls-11" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-12" cx="281.89" cy="342.86" r="2.52"/><circle class="cls-11" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-12" cx="284.18" cy="342.5" r="2.52" transform="translate(-51.19 49.94) rotate(-9.22)"/><circle class="cls-11" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-12" cx="286.46" cy="342.14" r="2.52" transform="translate(-51.11 50.3) rotate(-9.22)"/><circle class="cls-11" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-12" cx="288.74" cy="341.78" r="2.52" transform="translate(-79.83 595.43) rotate(-84.34)"/><circle class="cls-11" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-12" cx="291.02" cy="341.42" r="2.52" transform="translate(-77.41 597.37) rotate(-84.34)"/><circle class="cls-11" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-12" cx="293.3" cy="341.06" r="2.52" transform="translate(-75 599.32) rotate(-84.34)"/><circle class="cls-11" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-12" cx="295.58" cy="340.7" r="2.52" transform="translate(-72.59 601.26) rotate(-84.34)"/><circle class="cls-11" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-12" cx="297.85" cy="340.34" r="2.52" transform="translate(-70.17 603.2) rotate(-84.34)"/><circle class="cls-11" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) rotate(-84.34)"/><circle class="cls-12" cx="300.13" cy="339.98" r="2.52" transform="translate(-67.76 605.15) 
rotate(-84.34)"/><circle class="cls-11" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-12" cx="302.41" cy="339.62" r="2.52" transform="translate(-65.35 607.09) rotate(-84.34)"/><circle class="cls-11" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-12" cx="304.81" cy="339.38" r="2.52"/><circle class="cls-11" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-12" cx="307.09" cy="339.02" r="2.52"/><circle class="cls-11" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-12" cx="309.37" cy="338.66" r="2.52"/><circle class="cls-11" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-12" cx="311.66" cy="338.3" r="2.52"/><circle class="cls-11" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-12" cx="313.94" cy="337.94" r="2.52"/><circle class="cls-11" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-12" cx="316.22" cy="337.58" r="2.52"/><circle class="cls-11" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-12" cx="318.49" cy="337.22" r="2.52"/><circle class="cls-11" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-12" cx="320.77" cy="336.86" r="2.52"/><circle class="cls-11" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-12" cx="323.05" cy="336.62" r="2.52"/><circle class="cls-11" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-12" cx="325.33" cy="336.26" r="2.52"/><circle class="cls-11" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-12" cx="327.74" cy="335.9" r="2.52"/><circle class="cls-11" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-12" cx="330.02" cy="335.54" r="2.52"/><circle class="cls-11" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-12" cx="332.3" cy="335.06" r="2.52" transform="translate(-33.87 632.72) rotate(-84.34)"/><circle class="cls-11" cx="334.58" cy="334.58" r="2.52" transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-12" cx="334.58" cy="334.58" r="2.52" 
transform="translate(-49.27 57.91) rotate(-9.22)"/><circle class="cls-11" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-12" cx="336.85" cy="334.1" r="2.52"/><circle class="cls-11" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-12" cx="339.13" cy="333.62" r="2.52" transform="translate(-26.27 638.23) rotate(-84.34)"/><circle class="cls-11" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-12" cx="341.41" cy="333.14" r="2.52" transform="translate(-48.95 58.99) rotate(-9.22)"/><circle class="cls-11" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-12" cx="343.7" cy="332.66" r="2.52"/><circle class="cls-11" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-12" cx="345.98" cy="332.18" r="2.52"/><circle class="cls-11" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-12" cx="348.26" cy="331.7" r="2.52"/><circle class="cls-11" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-12" cx="350.66" cy="331.22" r="2.52"/><circle class="cls-11" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-12" cx="352.94" cy="330.74" r="2.52"/><circle class="cls-11" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-12" cx="355.22" cy="330.26" r="2.52"/><circle class="cls-11" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-12" cx="357.49" cy="329.78" r="2.52"/><circle class="cls-11" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-12" cx="359.77" cy="329.3" r="2.52"/><circle class="cls-11" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-12" cx="362.05" cy="328.82" r="2.52"/><circle class="cls-11" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-12" cx="364.33" cy="328.34" r="2.52"/><circle class="cls-11" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-12" cx="366.61" cy="327.86" r="2.52"/><circle class="cls-11" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-12" cx="368.89" cy="327.38" r="2.52"/><circle class="cls-11" cx="371.18" 
cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-12" cx="371.18" cy="326.78" r="2.52" transform="translate(9.42 663.94) rotate(-84.34)"/><circle class="cls-11" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-12" cx="373.58" cy="326.18" r="2.52"/><circle class="cls-11" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-12" cx="375.85" cy="325.58" r="2.52" transform="translate(-47.3 64.41) rotate(-9.22)"/><circle class="cls-11" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-12" cx="378.13" cy="324.98" r="2.52" transform="translate(17.48 669.25) rotate(-84.34)"/><circle class="cls-11" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-12" cx="380.41" cy="324.38" r="2.52"/><circle class="cls-11" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-12" cx="382.7" cy="323.78" r="2.52"/><circle class="cls-11" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-12" cx="384.98" cy="323.18" r="2.52"/><circle class="cls-11" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-12" cx="387.26" cy="322.58" r="2.52" transform="translate(6.81 653.16) rotate(-80.78)"/><circle class="cls-11" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-12" cx="389.53" cy="321.98" r="2.52"/><circle class="cls-11" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-12" cx="391.81" cy="321.38" r="2.52"/><circle class="cls-11" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-12" cx="394.09" cy="320.9" r="2.52"/><circle class="cls-11" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-12" cx="396.49" cy="320.3" r="2.52"/><circle class="cls-11" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-12" cx="398.77" cy="319.7" r="2.52"/><circle class="cls-11" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-12" cx="401.05" cy="318.98" r="2.52"/><circle class="cls-11" cx="403.33" cy="318.26" 
r="2.52"/><circle class="cls-12" cx="403.33" cy="318.26" r="2.52"/><circle class="cls-11" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-12" cx="405.61" cy="317.54" r="2.52"/><circle class="cls-11" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-12" cx="407.89" cy="316.94" r="2.52"/><circle class="cls-11" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-12" cx="410.18" cy="316.22" r="2.52" transform="translate(-45.36 69.79) rotate(-9.22)"/><circle class="cls-11" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-12" cx="412.46" cy="315.5" r="2.52" transform="translate(-45.21 70.14) rotate(-9.22)"/><circle class="cls-11" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-12" cx="414.74" cy="314.78" r="2.52" transform="translate(60.62 696.47) rotate(-84.34)"/><circle class="cls-11" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-12" cx="417.02" cy="314.18" r="2.52"/><circle class="cls-11" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-12" cx="419.41" cy="313.46" r="2.52"/><circle class="cls-11" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-12" cx="421.7" cy="312.74" r="2.52"/><circle class="cls-11" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-12" cx="423.98" cy="312.02" r="2.52"/><circle class="cls-11" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-12" cx="426.26" cy="311.18" r="2.52"/><circle class="cls-11" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-12" cx="428.53" cy="310.34" r="2.52"/><circle class="cls-11" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-12" cx="430.81" cy="309.62" r="2.52"/><circle class="cls-11" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-12" cx="433.09" cy="308.78" r="2.52"/><circle class="cls-11" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-12" cx="435.37" cy="308.06" r="2.52"/><circle class="cls-11" cx="437.66" 
cy="307.34" r="2.52"/><circle class="cls-12" cx="437.66" cy="307.34" r="2.52"/><circle class="cls-11" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-12" cx="439.94" cy="306.5" r="2.52"/><circle class="cls-11" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-12" cx="442.33" cy="305.78" r="2.52"/><circle class="cls-11" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-12" cx="444.61" cy="304.94" r="2.52"/><circle class="cls-11" cx="446.89" cy="304.1" r="2.52"/><circle class="cls-12" cx="446.89" cy="304.1" r="2.52"/><path class="cls-11" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><path class="cls-12" d="M451.69,303.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,451.69,303.14Z"/><circle class="cls-11" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><circle class="cls-12" cx="451.46" cy="302.3" r="2.52" transform="translate(-42.59 76.22) rotate(-9.22)"/><path class="cls-11" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-12" d="M456.25,301.46a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,456.25,301.46Z"/><path class="cls-11" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><path class="cls-12" d="M458.53,300.62A2.52,2.52,0,1,1,456,298.1,2.52,2.52,0,0,1,458.53,300.62Z"/><circle class="cls-11" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><circle class="cls-12" cx="458.3" cy="299.78" r="2.52" transform="translate(126.79 735.67) rotate(-85.93)"/><path class="cls-11" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><path class="cls-12" d="M463.09,298.82a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,463.09,298.82Z"/><circle class="cls-11" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><circle class="cls-12" cx="462.86" cy="297.98" r="2.52" transform="translate(132.82 738.54) rotate(-85.93)"/><path class="cls-11" 
d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><path class="cls-12" d="M467.77,297a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,467.77,297Z"/><circle class="cls-11" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><circle class="cls-12" cx="467.53" cy="296.06" r="2.52" transform="translate(139.08 741.43) rotate(-85.93)"/><path class="cls-11" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><path class="cls-12" d="M472.33,295.22a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,472.33,295.22Z"/><circle class="cls-11" cx="472.09" cy="294.26" r="2.52"/><circle class="cls-12" cx="472.09" cy="294.26" r="2.52"/><path class="cls-11" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><path class="cls-12" d="M476.89,293.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,476.89,293.3Z"/><circle class="cls-11" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><circle class="cls-12" cx="476.65" cy="292.22" r="2.52" transform="translate(-26.47 48.39) rotate(-5.65)"/><path class="cls-11" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><path class="cls-12" d="M481.45,291.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,481.45,291.26Z"/><circle class="cls-11" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><circle class="cls-12" cx="481.21" cy="290.3" r="2.52" transform="translate(-26.26 48.83) rotate(-5.65)"/><path class="cls-11" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><path class="cls-12" d="M486,289.34a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,486,289.34Z"/><circle class="cls-11" cx="485.77" cy="288.26" r="2.52"/><circle class="cls-12" cx="485.77" cy="288.26" r="2.52"/><path class="cls-11" d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-12" 
d="M490.57,287.3a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,490.57,287.3Z"/><path class="cls-11" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-12" d="M493,286.22a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,493,286.22Z"/><path class="cls-11" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-12" d="M495.25,285.14a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,495.25,285.14Z"/><path class="cls-11" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><path class="cls-12" d="M497.53,284.06a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,497.53,284.06Z"/><circle class="cls-11" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><circle class="cls-12" cx="497.3" cy="282.98" r="2.52" transform="translate(179.78 758.96) rotate(-85.93)"/><path class="cls-11" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><path class="cls-12" d="M502.09,281.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,502.09,281.9Z"/><circle class="cls-11" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><circle class="cls-12" cx="501.86" cy="280.7" r="2.52" transform="translate(186.29 761.39) rotate(-85.93)"/><path class="cls-11" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><path class="cls-12" d="M506.65,279.62a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,506.65,279.62Z"/><circle class="cls-11" cx="506.42" cy="278.54" r="2.52"/><circle class="cls-12" cx="506.42" cy="278.54" r="2.52"/><path class="cls-11" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><path class="cls-12" d="M511.21,277.34a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,511.21,277.34Z"/><circle class="cls-11" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) rotate(-5.65)"/><circle class="cls-12" cx="510.98" cy="276.14" r="2.52" transform="translate(-24.72 51.69) 
rotate(-5.65)"/><path class="cls-11" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><path class="cls-12" d="M515.89,274.94a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,515.89,274.94Z"/><circle class="cls-11" cx="515.65" cy="273.74" r="2.52"/><circle class="cls-12" cx="515.65" cy="273.74" r="2.52"/><path class="cls-11" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><path class="cls-12" d="M520.45,272.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,520.45,272.54Z"/><circle class="cls-11" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><circle class="cls-12" cx="520.21" cy="271.22" r="2.52" transform="translate(-24.19 52.58) rotate(-5.65)"/><path class="cls-11" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><path class="cls-12" d="M525,269.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,525,269.9Z"/><circle class="cls-11" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><circle class="cls-12" cx="524.77" cy="268.7" r="2.52" transform="translate(219.55 773.1) rotate(-85.93)"/><path class="cls-11" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-12" d="M529.57,267.38a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,529.57,267.38Z"/><path class="cls-11" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-12" d="M531.86,265.94a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,531.86,265.94Z"/><path class="cls-11" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-12" d="M534.13,264.62a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,534.13,264.62Z"/><path class="cls-11" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><path class="cls-12" d="M536.42,263.18a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,536.42,263.18Z"/><circle class="cls-11" cx="536.3" cy="261.74" r="2.52"/><circle class="cls-12" 
cx="536.3" cy="261.74" r="2.52"/><path class="cls-11" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><path class="cls-12" d="M541.09,260.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,541.09,260.3Z"/><circle class="cls-11" cx="540.86" cy="258.74" r="2.52"/><circle class="cls-12" cx="540.86" cy="258.74" r="2.52"/><path class="cls-11" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><path class="cls-12" d="M545.65,257.3a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,545.65,257.3Z"/><circle class="cls-11" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><circle class="cls-12" cx="545.42" cy="255.62" r="2.52" transform="translate(251.78 781.54) rotate(-85.93)"/><path class="cls-11" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><path class="cls-12" d="M550.21,253.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,550.21,253.94Z"/><circle class="cls-11" cx="549.98" cy="252.26" r="2.52"/><circle class="cls-12" cx="549.98" cy="252.26" r="2.52"/><path class="cls-11" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><path class="cls-12" d="M554.77,250.58a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,554.77,250.58Z"/><circle class="cls-11" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><circle class="cls-12" cx="554.53" cy="248.78" r="2.52" transform="translate(267.08 784.28) rotate(-85.93)"/><path class="cls-11" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><path class="cls-12" d="M559.33,246.86a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,559.33,246.86Z"/><circle class="cls-11" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><circle class="cls-12" cx="559.21" cy="244.94" r="2.52" transform="translate(-21.41 56.3) rotate(-5.65)"/><path class="cls-11" d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><path class="cls-12" 
d="M564,242.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,564,242.9Z"/><circle class="cls-11" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><circle class="cls-12" cx="563.77" cy="240.98" r="2.52" transform="translate(283.44 786.25) rotate(-85.93)"/><path class="cls-11" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-12" d="M568.57,238.7a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,568.57,238.7Z"/><path class="cls-11" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-12" d="M570.86,236.54a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,570.86,236.54Z"/><path class="cls-11" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-12" d="M573.13,234.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,573.13,234.26Z"/><path class="cls-11" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-12" d="M575.42,231.86a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,575.42,231.86Z"/><path class="cls-11" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-12" d="M577.69,229.34a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,577.69,229.34Z"/><path class="cls-11" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-12" d="M580,226.7a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,580,226.7Z"/><path class="cls-11" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-12" d="M582.25,223.94a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,582.25,223.94Z"/><path class="cls-11" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><path class="cls-12" d="M584.65,221.18a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,584.65,221.18Z"/><circle class="cls-11" cx="584.42" cy="218.06" r="2.52" transform="translate(275.56 760) rotate(-80.78)"/><circle class="cls-12" cx="584.42" cy="218.06" r="2.52" 
transform="translate(275.56 760) rotate(-80.78)"/><path class="cls-11" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><path class="cls-12" d="M589.21,214.94a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,589.21,214.94Z"/><circle class="cls-11" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><circle class="cls-12" cx="588.98" cy="211.58" r="2.52" transform="translate(-36.13 241.5) rotate(-22.5)"/><path class="cls-11" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><path class="cls-12" d="M593.77,208.22a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,593.77,208.22Z"/><circle class="cls-11" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><circle class="cls-12" cx="593.53" cy="204.38" r="2.52" transform="translate(296.72 757.51) rotate(-80.78)"/><path class="cls-11" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><path class="cls-12" d="M598.33,200.54a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,598.33,200.54Z"/><circle class="cls-11" cx="598.09" cy="196.46" r="2.52"/><circle class="cls-12" cx="598.09" cy="196.46" r="2.52"/><path class="cls-11" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><path class="cls-12" d="M602.89,192.14a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,602.89,192.14Z"/><circle class="cls-11" cx="602.65" cy="187.46" r="2.52"/><circle class="cls-12" cx="602.65" cy="187.46" r="2.52"/><path class="cls-11" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-12" d="M607.57,182.66a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,607.57,182.66Z"/><path class="cls-11" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-12" d="M609.86,177.38a2.52,2.52,0,1,1-2.53-2.52A2.52,2.52,0,0,1,609.86,177.38Z"/><path class="cls-11" d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-12" 
d="M612.13,172a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,612.13,172Z"/><path class="cls-11" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-12" d="M614.42,166.1a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,614.42,166.1Z"/><path class="cls-11" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-12" d="M616.69,160a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,616.69,160Z"/><path class="cls-11" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-12" d="M619,153.5a2.52,2.52,0,1,1-2.53-2.52A2.53,2.53,0,0,1,619,153.5Z"/><path class="cls-11" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-12" d="M621.25,146.66a2.52,2.52,0,1,1-2.51-2.52A2.51,2.51,0,0,1,621.25,146.66Z"/><path class="cls-11" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><path class="cls-12" d="M623.53,139.22A2.52,2.52,0,1,1,621,136.7,2.52,2.52,0,0,1,623.53,139.22Z"/><circle class="cls-11" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><circle class="cls-12" cx="623.3" cy="131.54" r="2.52" transform="translate(-2.89 248.54) rotate(-22.5)"/><path class="cls-11" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><path class="cls-12" d="M628.09,123.14a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,628.09,123.14Z"/><circle class="cls-11" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><circle class="cls-12" cx="627.98" cy="114.26" r="2.52" transform="translate(4.08 249.01) rotate(-22.5)"/><path class="cls-11" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><path class="cls-12" d="M632.77,104.9a2.52,2.52,0,1,1-2.52-2.52A2.52,2.52,0,0,1,632.77,104.9Z"/><circle class="cls-11" cx="632.53" cy="94.82" r="2.52" transform="translate(11.86 249.28) rotate(-22.5)"/><circle class="cls-12" cx="632.53" cy="94.82" r="2.52" 
transform="translate(11.86 249.28) rotate(-22.5)"/><path class="cls-11" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><path class="cls-12" d="M637.33,83.9a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,637.33,83.9Z"/><circle class="cls-11" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><circle class="cls-12" cx="637.09" cy="72.5" r="2.52" transform="translate(463.48 689.75) rotate(-80.78)"/><path class="cls-11" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><path class="cls-12" d="M641.89,60.26a2.52,2.52,0,1,1-2.51-2.52A2.52,2.52,0,0,1,641.89,60.26Z"/><g class="cls-13"><text class="cls-14" transform="translate(40.93 365.91)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(27.23 295.04)">5000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 224.17)">10000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 153.31)">15000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 82.43)">20000</text></g><g class="cls-13"><text class="cls-14" transform="translate(22.67 11.56)">25000</text></g><g class="cls-13"><text class="cls-14" transform="translate(52.7 377.63)">0</text></g><g class="cls-13"><text class="cls-14" transform="translate(165.03 377.63)">50</text></g><g class="cls-13"><text class="cls-14" transform="translate(277.35 377.63)">100</text></g><g class="cls-13"><text class="cls-14" transform="translate(391.99 377.63)">150</text></g><g class="cls-13"><text class="cls-14" transform="translate(506.58 377.63)">200</text></g><g class="cls-13"><text class="cls-14" transform="translate(621.19 377.63)">250</text></g><g class="cls-13"><text class="cls-15" transform="translate(17.56 199.35) rotate(-90)">Qstep</text></g><g class="cls-13"><text class="cls-15" transform="translate(325.64 386.9)">Q<tspan class="cls-16" x="11.53" y="0">_</tspan><tspan x="19.54" y="0">i</tspan><tspan class="cls-16" 
x="23.97" y="0">n</tspan><tspan class="cls-17" x="31.98" y="0">d</tspan><tspan x="40.01" y="0">ex</tspan></text></g><line class="cls-4" x1="477.71" y1="70.43" x2="496.92" y2="70.44"/><path class="cls-5" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><path class="cls-18" d="M489.13,70.28a2,2,0,1,1-2-2A2,2,0,0,1,489.13,70.28Z"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 74.83)"><tspan class="cls-20">8</tspan><tspan x="6.98" y="0">-</tspan><tspan class="cls-21" x="11.65" y="0">b</tspan><tspan class="cls-20" x="18.73" y="0">it</tspan><tspan class="cls-22" x="26.45" y="0"> </tspan><tspan class="cls-23" x="30.03" y="0">D</tspan><tspan x="40.11" y="0">C</tspan></text></g><line class="cls-7" x1="477.71" y1="91.78" x2="496.92" y2="91.78"/><path class="cls-8" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><path class="cls-24" d="M489.13,91.64a2,2,0,1,1-2-2A2,2,0,0,1,489.13,91.64Z"/><g class="cls-13"><text class="cls-25" transform="translate(499.04 96.16)"><tspan class="cls-26">1</tspan><tspan class="cls-27" x="6.96" y="0">0</tspan><tspan x="14.01" y="0">-</tspan><tspan class="cls-28" x="18.69" y="0">b</tspan><tspan class="cls-29" x="25.65" y="0">i</tspan><tspan class="cls-30" x="29.5" y="0">t</tspan><tspan class="cls-31" x="33.45" y="0"> </tspan><tspan class="cls-32" x="36.97" y="0">D</tspan><tspan x="47.01" y="0">C</tspan></text></g><line class="cls-10" x1="477.71" y1="113.13" x2="496.92" y2="113.13"/><circle class="cls-11" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><circle class="cls-33" cx="487.21" cy="113.06" r="2.52" transform="translate(297.58 575.87) rotate(-80.78)"/><g class="cls-13"><text class="cls-19" transform="translate(499.04 117.52)"><tspan class="cls-20">1</tspan><tspan class="cls-34" x="6.98" y="0">2</tspan><tspan class="cls-35" x="14.03" y="0">-</tspan><tspan class="cls-36" x="18.7" y="0">bi</tspan><tspan class="cls-37" x="29.54" y="0">t</tspan><tspan class="cls-38" 
x="33.48" y="0"> </tspan><tspan class="cls-39" x="36.99" y="0">D</tspan><tspan x="47.05" y="0">C</tspan></text></g><rect class="cls-2" x="0.38" y="0.38" width="652.8" height="391.32"/></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/scc_intrabc.svg b/third_party/aom/doc/img/scc_intrabc.svg
new file mode 100644
index 0000000000..dfe4948861
--- /dev/null
+++ b/third_party/aom/doc/img/scc_intrabc.svg
@@ -0,0 +1,348 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export scc_intrabc.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="5.4258in" height="2.9597in"
+ viewBox="0 0 390.657 213.098" xml:space="preserve" color-interpolation-filters="sRGB" class="st8">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false"/>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#d8d8d8;stroke:#000000;stroke-width:0.25}
+ .st2 {fill:#fec000;stroke:#000000;stroke-width:0.25}
+ .st3 {fill:#00fefe;stroke:#000000;stroke-width:0.25}
+ .st4 {fill:#ffffff;stroke:#000000;stroke-width:0.25}
+ .st5 {fill:#ffc000;stroke:#000000;stroke-width:0.25}
+ .st6 {fill:none;stroke:none;stroke-width:0.25}
+ .st7 {fill:#4672c4;font-family:Calibri;font-size:0.666664em}
+ .st8 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="0.0393701" v:pageScale="0.0393701" v:drawingUnits="24" v:shadowOffsetX="4.25197"
+ v:shadowOffsetY="-4.25197"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(0.25,-141.982)">
+ <title>Sheet.1</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape3-3" v:mID="3" v:groupContext="shape" transform="translate(28.5965,-141.982)">
+ <title>Sheet.3</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape4-5" v:mID="4" v:groupContext="shape" transform="translate(56.9429,-141.982)">
+ <title>Sheet.4</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape5-7" v:mID="5" v:groupContext="shape" transform="translate(85.2894,-141.982)">
+ <title>Sheet.5</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape6-9" v:mID="6" v:groupContext="shape" transform="translate(113.636,-141.982)">
+ <title>Sheet.6</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape7-11" v:mID="7" v:groupContext="shape" transform="translate(141.982,-141.982)">
+ <title>Sheet.7</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape8-13" v:mID="8" v:groupContext="shape" transform="translate(170.329,-141.982)">
+ <title>Sheet.8</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape9-15" v:mID="9" v:groupContext="shape" transform="translate(198.675,-141.982)">
+ <title>Sheet.9</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape10-17" v:mID="10" v:groupContext="shape" transform="translate(0.25,-113.636)">
+ <title>Sheet.10</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape11-19" v:mID="11" v:groupContext="shape" transform="translate(28.5965,-113.636)">
+ <title>Sheet.11</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape12-21" v:mID="12" v:groupContext="shape" transform="translate(56.9429,-113.636)">
+ <title>Sheet.12</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape13-23" v:mID="13" v:groupContext="shape" transform="translate(85.2894,-113.636)">
+ <title>Sheet.13</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape14-25" v:mID="14" v:groupContext="shape" transform="translate(113.636,-113.636)">
+ <title>Sheet.14</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape15-27" v:mID="15" v:groupContext="shape" transform="translate(141.982,-113.636)">
+ <title>Sheet.15</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape16-29" v:mID="16" v:groupContext="shape" transform="translate(170.329,-113.636)">
+ <title>Sheet.16</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape17-31" v:mID="17" v:groupContext="shape" transform="translate(198.675,-113.636)">
+ <title>Sheet.17</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape18-33" v:mID="18" v:groupContext="shape" transform="translate(0.25,-85.2894)">
+ <title>Sheet.18</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape19-35" v:mID="19" v:groupContext="shape" transform="translate(28.5965,-85.2894)">
+ <title>Sheet.19</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape20-37" v:mID="20" v:groupContext="shape" transform="translate(56.9429,-85.2894)">
+ <title>Sheet.20</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape21-39" v:mID="21" v:groupContext="shape" transform="translate(85.2894,-85.2894)">
+ <title>Sheet.21</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape22-41" v:mID="22" v:groupContext="shape" transform="translate(113.636,-85.2894)">
+ <title>Sheet.22</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape23-43" v:mID="23" v:groupContext="shape" transform="translate(141.982,-85.2894)">
+ <title>Sheet.23</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape24-45" v:mID="24" v:groupContext="shape" transform="translate(170.329,-85.2894)">
+ <title>Sheet.24</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+ </g>
+ <g id="shape25-47" v:mID="25" v:groupContext="shape" transform="translate(198.675,-85.2894)">
+ <title>Sheet.25</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape26-49" v:mID="26" v:groupContext="shape" transform="translate(0.25,-56.9429)">
+ <title>Sheet.26</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape27-51" v:mID="27" v:groupContext="shape" transform="translate(28.5965,-56.9429)">
+ <title>Sheet.27</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape28-53" v:mID="28" v:groupContext="shape" transform="translate(56.9429,-56.9429)">
+ <title>Sheet.28</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape29-55" v:mID="29" v:groupContext="shape" transform="translate(85.2894,-56.9429)">
+ <title>Sheet.29</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape30-57" v:mID="30" v:groupContext="shape" transform="translate(113.636,-56.9429)">
+ <title>Sheet.30</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+ </g>
+ <g id="shape31-59" v:mID="31" v:groupContext="shape" transform="translate(141.982,-56.9429)">
+ <title>Sheet.31</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape32-61" v:mID="32" v:groupContext="shape" transform="translate(170.329,-56.9429)">
+ <title>Sheet.32</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape33-63" v:mID="33" v:groupContext="shape" transform="translate(198.675,-56.9429)">
+ <title>Sheet.33</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape34-65" v:mID="34" v:groupContext="shape" transform="translate(227.022,-141.982)">
+ <title>Sheet.34</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape35-67" v:mID="35" v:groupContext="shape" transform="translate(255.368,-141.982)">
+ <title>Sheet.35</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape36-69" v:mID="36" v:groupContext="shape" transform="translate(283.715,-141.982)">
+ <title>Sheet.36</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+ </g>
+ <g id="shape37-71" v:mID="37" v:groupContext="shape" transform="translate(312.061,-141.982)">
+ <title>Sheet.37</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape38-73" v:mID="38" v:groupContext="shape" transform="translate(227.022,-113.636)">
+ <title>Sheet.38</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+ </g>
+ <g id="shape39-75" v:mID="39" v:groupContext="shape" transform="translate(255.368,-113.636)">
+ <title>Sheet.39</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape40-77" v:mID="40" v:groupContext="shape" transform="translate(283.715,-113.636)">
+ <title>Sheet.40</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape41-79" v:mID="41" v:groupContext="shape" transform="translate(312.061,-113.636)">
+ <title>Sheet.41</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape42-81" v:mID="42" v:groupContext="shape" transform="translate(227.022,-85.2894)">
+ <title>Sheet.42</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape43-83" v:mID="43" v:groupContext="shape" transform="translate(255.368,-85.2894)">
+ <title>Sheet.43</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape44-85" v:mID="44" v:groupContext="shape" transform="translate(283.715,-85.2894)">
+ <title>Sheet.44</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape45-87" v:mID="45" v:groupContext="shape" transform="translate(312.061,-85.2894)">
+ <title>Sheet.45</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape46-89" v:mID="46" v:groupContext="shape" transform="translate(227.022,-56.9429)">
+ <title>Sheet.46</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape47-91" v:mID="47" v:groupContext="shape" transform="translate(255.368,-56.9429)">
+ <title>Sheet.47</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape48-93" v:mID="48" v:groupContext="shape" transform="translate(283.715,-56.9429)">
+ <title>Sheet.48</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape49-95" v:mID="49" v:groupContext="shape" transform="translate(312.061,-56.9429)">
+ <title>Sheet.49</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape50-97" v:mID="50" v:groupContext="shape" transform="translate(0.25,-28.5965)">
+ <title>Sheet.50</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape51-99" v:mID="51" v:groupContext="shape" transform="translate(28.5965,-28.5965)">
+ <title>Sheet.51</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape52-101" v:mID="52" v:groupContext="shape" transform="translate(56.9429,-28.5965)">
+ <title>Sheet.52</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st2"/>
+ </g>
+ <g id="shape53-103" v:mID="53" v:groupContext="shape" transform="translate(85.2894,-28.5965)">
+ <title>Sheet.53</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape54-105" v:mID="54" v:groupContext="shape" transform="translate(113.636,-28.5965)">
+ <title>Sheet.54</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape55-107" v:mID="55" v:groupContext="shape" transform="translate(141.982,-28.5965)">
+ <title>Sheet.55</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape56-109" v:mID="56" v:groupContext="shape" transform="translate(170.329,-28.5965)">
+ <title>Sheet.56</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape57-111" v:mID="57" v:groupContext="shape" transform="translate(198.675,-28.5965)">
+ <title>Sheet.57</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape58-113" v:mID="58" v:groupContext="shape" transform="translate(227.022,-28.5965)">
+ <title>Sheet.58</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape59-115" v:mID="59" v:groupContext="shape" transform="translate(255.368,-28.5965)">
+ <title>Sheet.59</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape60-117" v:mID="60" v:groupContext="shape" transform="translate(283.715,-28.5965)">
+ <title>Sheet.60</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape61-119" v:mID="61" v:groupContext="shape" transform="translate(312.061,-28.5965)">
+ <title>Sheet.61</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape62-121" v:mID="62" v:groupContext="shape" transform="translate(0.25,-0.25)">
+ <title>Sheet.62</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape63-123" v:mID="63" v:groupContext="shape" transform="translate(28.5965,-0.25)">
+ <title>Sheet.63</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape64-125" v:mID="64" v:groupContext="shape" transform="translate(56.9429,-0.25)">
+ <title>Sheet.64</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape65-127" v:mID="65" v:groupContext="shape" transform="translate(85.2894,-0.25)">
+ <title>Sheet.65</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape66-129" v:mID="66" v:groupContext="shape" transform="translate(113.636,-0.25)">
+ <title>Sheet.66</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape67-131" v:mID="67" v:groupContext="shape" transform="translate(141.982,-0.25)">
+ <title>Sheet.67</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape68-133" v:mID="68" v:groupContext="shape" transform="translate(170.329,-0.25)">
+ <title>Sheet.68</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape69-135" v:mID="69" v:groupContext="shape" transform="translate(198.675,-0.25)">
+ <title>Sheet.69</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape70-137" v:mID="70" v:groupContext="shape" transform="translate(227.022,-0.25)">
+ <title>Sheet.70</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape71-139" v:mID="71" v:groupContext="shape" transform="translate(255.368,-0.25)">
+ <title>Sheet.71</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape72-141" v:mID="72" v:groupContext="shape" transform="translate(283.715,-0.25)">
+ <title>Sheet.72</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape73-143" v:mID="73" v:groupContext="shape" transform="translate(312.061,-0.25)">
+ <title>Sheet.73</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st4"/>
+ </g>
+ <g id="shape74-145" v:mID="74" v:groupContext="shape" transform="translate(0.25,-184.502)">
+ <title>Sheet.74</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st3"/>
+ </g>
+ <g id="shape75-147" v:mID="75" v:groupContext="shape" transform="translate(255.368,-184.502)">
+ <title>Sheet.75</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st5"/>
+ </g>
+ <g id="shape76-149" v:mID="76" v:groupContext="shape" transform="translate(127.809,-184.502)">
+ <title>Sheet.76</title>
+ <rect x="0" y="184.752" width="28.3465" height="28.3465" class="st1"/>
+ </g>
+ <g id="shape79-151" v:mID="79" v:groupContext="shape" transform="translate(27.8091,-193.762)">
+ <title>Sheet.79</title>
+ <desc>Current processing block</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="50" cy="207.098" width="100" height="12"/>
+ <rect x="0" y="201.098" width="100" height="12" class="st6"/>
+ <text x="9.78" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Current processing block</text> </g>
+ <g id="shape80-154" v:mID="80" v:groupContext="shape" transform="translate(158.899,-192.675)">
+ <title>Sheet.80</title>
+ <desc>Allowed prediction block</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="50" cy="207.098" width="100" height="12"/>
+ <rect x="0" y="201.098" width="100" height="12" class="st6"/>
+ <text x="9.68" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Allowed prediction block</text> </g>
+ <g id="shape81-157" v:mID="81" v:groupContext="shape" transform="translate(290.407,-192.675)">
+ <title>Sheet.81</title>
+ <desc>Restricted immediate blocks</desc>
+ <v:textBlock v:margins="rect(2,2,2,2)" v:tabSpace="42.5197"/>
+ <v:textRect cx="50" cy="207.098" width="100" height="12"/>
+ <rect x="0" y="201.098" width="100" height="12" class="st6"/>
+ <text x="3.92" y="209.5" class="st7" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>Restricted immediate blocks</text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/secondary_tap.svg b/third_party/aom/doc/img/secondary_tap.svg
new file mode 100644
index 0000000000..4c6283de36
--- /dev/null
+++ b/third_party/aom/doc/img/secondary_tap.svg
@@ -0,0 +1,857 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by Microsoft Visio, SVG Export secondary_tap.svg Page-1 -->
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ev="http://www.w3.org/2001/xml-events"
+ xmlns:v="http://schemas.microsoft.com/visio/2003/SVGExtensions/" width="11.2533in" height="3.38188in"
+ viewBox="0 0 810.24 243.495" xml:space="preserve" color-interpolation-filters="sRGB" class="st7">
+ <v:documentProperties v:langID="1033" v:viewMarkup="false">
+ <v:userDefs>
+ <v:ud v:nameU="msvNoAutoConnect" v:val="VT0(1):26"/>
+ </v:userDefs>
+ </v:documentProperties>
+
+ <style type="text/css">
+ <![CDATA[
+ .st1 {fill:#ffffff;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st2 {fill:#000000;font-family:Calibri;font-size:1.00001em}
+ .st3 {fill:#00b0f0;fill-opacity:0.5;stroke:#000000;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.24}
+ .st4 {fill:none;stroke:none;stroke-linecap:round;stroke-linejoin:round;stroke-width:0.75}
+ .st5 {fill:#000000;font-family:Calibri;font-size:1.5em;font-style:italic}
+ .st6 {font-size:1em;font-style:normal}
+ .st7 {fill:none;fill-rule:evenodd;font-size:12px;overflow:visible;stroke-linecap:square;stroke-miterlimit:3}
+ ]]>
+ </style>
+
+ <g v:mID="0" v:index="1" v:groupContext="foregroundPage">
+ <title>Page-1</title>
+ <v:pageProperties v:drawingScale="1" v:pageScale="1" v:drawingUnits="19" v:shadowOffsetX="9" v:shadowOffsetY="-9"/>
+ <g id="shape1-1" v:mID="1" v:groupContext="shape" transform="translate(18.12,-189.375)">
+ <title>Square</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape2-3" v:mID="2" v:groupContext="shape" transform="translate(54.12,-189.375)">
+ <title>Square.2</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape3-5" v:mID="3" v:groupContext="shape" transform="translate(90.12,-189.375)">
+ <title>Square.3</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape4-8" v:mID="4" v:groupContext="shape" transform="translate(126.12,-189.375)">
+ <title>Square.4</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape5-10" v:mID="5" v:groupContext="shape" transform="translate(162.12,-189.375)">
+ <title>Square.5</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape6-12" v:mID="6" v:groupContext="shape" transform="translate(18.12,-153.375)">
+ <title>Square.6</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape7-14" v:mID="7" v:groupContext="shape" transform="translate(54.12,-153.375)">
+ <title>Square.7</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape8-16" v:mID="8" v:groupContext="shape" transform="translate(90.12,-153.375)">
+ <title>Square.8</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape9-19" v:mID="9" v:groupContext="shape" transform="translate(126.12,-153.375)">
+ <title>Square.9</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape10-21" v:mID="10" v:groupContext="shape" transform="translate(162.12,-153.375)">
+ <title>Square.10</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape11-23" v:mID="11" v:groupContext="shape" transform="translate(18.12,-117.375)">
+ <title>Square.11</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape12-26" v:mID="12" v:groupContext="shape" transform="translate(54.12,-117.375)">
+ <title>Square.12</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape13-29" v:mID="13" v:groupContext="shape" transform="translate(90.12,-117.375)">
+ <title>Square.13</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape14-31" v:mID="14" v:groupContext="shape" transform="translate(126.12,-117.375)">
+ <title>Square.14</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape15-34" v:mID="15" v:groupContext="shape" transform="translate(162.12,-117.375)">
+ <title>Square.15</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape16-37" v:mID="16" v:groupContext="shape" transform="translate(18.12,-81.375)">
+ <title>Square.16</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape17-39" v:mID="17" v:groupContext="shape" transform="translate(54.12,-81.375)">
+ <title>Square.17</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape18-41" v:mID="18" v:groupContext="shape" transform="translate(90.12,-81.375)">
+ <title>Square.18</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape19-44" v:mID="19" v:groupContext="shape" transform="translate(126.12,-81.375)">
+ <title>Square.19</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape20-46" v:mID="20" v:groupContext="shape" transform="translate(162.12,-81.375)">
+ <title>Square.20</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape21-48" v:mID="21" v:groupContext="shape" transform="translate(18.12,-45.375)">
+ <title>Square.21</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape22-50" v:mID="22" v:groupContext="shape" transform="translate(54.12,-45.375)">
+ <title>Square.22</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape23-52" v:mID="23" v:groupContext="shape" transform="translate(90.12,-45.375)">
+ <title>Square.23</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape24-55" v:mID="24" v:groupContext="shape" transform="translate(126.12,-45.375)">
+ <title>Square.24</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape25-57" v:mID="25" v:groupContext="shape" transform="translate(162.12,-45.375)">
+ <title>Square.25</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape30-59" v:mID="30" v:groupContext="shape" transform="translate(216.12,-189.375)">
+ <title>Square.30</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape31-61" v:mID="31" v:groupContext="shape" transform="translate(252.12,-189.375)">
+ <title>Square.31</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape32-63" v:mID="32" v:groupContext="shape" transform="translate(288.12,-189.375)">
+ <title>Square.32</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape33-65" v:mID="33" v:groupContext="shape" transform="translate(324.12,-189.375)">
+ <title>Square.33</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape34-68" v:mID="34" v:groupContext="shape" transform="translate(360.12,-189.375)">
+ <title>Square.34</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape35-70" v:mID="35" v:groupContext="shape" transform="translate(216.12,-153.375)">
+ <title>Square.35</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape36-73" v:mID="36" v:groupContext="shape" transform="translate(252.12,-153.375)">
+ <title>Square.36</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape37-75" v:mID="37" v:groupContext="shape" transform="translate(288.12,-153.375)">
+ <title>Square.37</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape38-78" v:mID="38" v:groupContext="shape" transform="translate(324.12,-153.375)">
+ <title>Square.38</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape39-80" v:mID="39" v:groupContext="shape" transform="translate(360.12,-153.375)">
+ <title>Square.39</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape40-82" v:mID="40" v:groupContext="shape" transform="translate(216.12,-117.375)">
+ <title>Square.40</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape41-84" v:mID="41" v:groupContext="shape" transform="translate(252.12,-117.375)">
+ <title>Square.41</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape42-87" v:mID="42" v:groupContext="shape" transform="translate(288.12,-117.375)">
+ <title>Square.42</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape43-89" v:mID="43" v:groupContext="shape" transform="translate(324.12,-117.375)">
+ <title>Square.43</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape44-92" v:mID="44" v:groupContext="shape" transform="translate(360.12,-117.375)">
+ <title>Square.44</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape45-94" v:mID="45" v:groupContext="shape" transform="translate(216.12,-81.375)">
+ <title>Square.45</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape46-96" v:mID="46" v:groupContext="shape" transform="translate(252.12,-81.375)">
+ <title>Square.46</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape47-98" v:mID="47" v:groupContext="shape" transform="translate(288.12,-81.3749)">
+ <title>Square.47</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape48-101" v:mID="48" v:groupContext="shape" transform="translate(324.12,-81.3749)">
+ <title>Square.48</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape49-103" v:mID="49" v:groupContext="shape" transform="translate(360.12,-81.3749)">
+ <title>Square.49</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape50-106" v:mID="50" v:groupContext="shape" transform="translate(216.12,-45.375)">
+ <title>Square.50</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape51-108" v:mID="51" v:groupContext="shape" transform="translate(252.12,-45.375)">
+ <title>Square.51</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape52-111" v:mID="52" v:groupContext="shape" transform="translate(288.12,-45.375)">
+ <title>Square.52</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape53-113" v:mID="53" v:groupContext="shape" transform="translate(324.12,-45.375)">
+ <title>Square.53</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape54-115" v:mID="54" v:groupContext="shape" transform="translate(360.12,-45.375)">
+ <title>Square.54</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape55-117" v:mID="55" v:groupContext="shape" transform="translate(414.12,-189.375)">
+ <title>Square.55</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape56-120" v:mID="56" v:groupContext="shape" transform="translate(450.12,-189.375)">
+ <title>Square.56</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape57-122" v:mID="57" v:groupContext="shape" transform="translate(486.12,-189.375)">
+ <title>Square.57</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape58-124" v:mID="58" v:groupContext="shape" transform="translate(522.12,-189.375)">
+ <title>Square.58</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape59-126" v:mID="59" v:groupContext="shape" transform="translate(558.12,-189.375)">
+ <title>Square.59</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape60-129" v:mID="60" v:groupContext="shape" transform="translate(414.12,-153.375)">
+ <title>Square.60</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape61-131" v:mID="61" v:groupContext="shape" transform="translate(450.12,-153.375)">
+ <title>Square.61</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape62-134" v:mID="62" v:groupContext="shape" transform="translate(486.12,-153.375)">
+ <title>Square.62</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape63-136" v:mID="63" v:groupContext="shape" transform="translate(522.12,-153.375)">
+ <title>Square.63</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape64-139" v:mID="64" v:groupContext="shape" transform="translate(558.12,-153.375)">
+ <title>Square.64</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape65-141" v:mID="65" v:groupContext="shape" transform="translate(414.12,-117.375)">
+ <title>Square.65</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape66-143" v:mID="66" v:groupContext="shape" transform="translate(450.12,-117.375)">
+ <title>Square.66</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape67-145" v:mID="67" v:groupContext="shape" transform="translate(486.12,-117.375)">
+ <title>Square.67</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape68-147" v:mID="68" v:groupContext="shape" transform="translate(522.12,-117.375)">
+ <title>Square.68</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape69-149" v:mID="69" v:groupContext="shape" transform="translate(558.12,-117.375)">
+ <title>Square.69</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape70-151" v:mID="70" v:groupContext="shape" transform="translate(414.12,-81.375)">
+ <title>Square.70</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape71-153" v:mID="71" v:groupContext="shape" transform="translate(450.12,-81.375)">
+ <title>Square.71</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape72-156" v:mID="72" v:groupContext="shape" transform="translate(486.12,-81.3749)">
+ <title>Square.72</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape73-158" v:mID="73" v:groupContext="shape" transform="translate(522.12,-81.3749)">
+ <title>Square.73</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape74-161" v:mID="74" v:groupContext="shape" transform="translate(558.12,-81.3749)">
+ <title>Square.74</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape75-163" v:mID="75" v:groupContext="shape" transform="translate(414.12,-45.375)">
+ <title>Square.75</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape76-166" v:mID="76" v:groupContext="shape" transform="translate(450.12,-45.375)">
+ <title>Square.76</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape77-168" v:mID="77" v:groupContext="shape" transform="translate(486.12,-45.375)">
+ <title>Square.77</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape78-170" v:mID="78" v:groupContext="shape" transform="translate(522.12,-45.375)">
+ <title>Square.78</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape79-172" v:mID="79" v:groupContext="shape" transform="translate(558.12,-45.375)">
+ <title>Square.79</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape80-175" v:mID="80" v:groupContext="shape" transform="translate(612.12,-189.375)">
+ <title>Square.80</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape81-177" v:mID="81" v:groupContext="shape" transform="translate(648.12,-189.375)">
+ <title>Square.81</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape82-180" v:mID="82" v:groupContext="shape" transform="translate(684.12,-189.375)">
+ <title>Square.82</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape83-182" v:mID="83" v:groupContext="shape" transform="translate(720.12,-189.375)">
+ <title>Square.83</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape84-184" v:mID="84" v:groupContext="shape" transform="translate(756.12,-189.375)">
+ <title>Square.84</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape85-186" v:mID="85" v:groupContext="shape" transform="translate(612.12,-153.375)">
+ <title>Square.85</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape86-188" v:mID="86" v:groupContext="shape" transform="translate(648.12,-153.375)">
+ <title>Square.86</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape87-190" v:mID="87" v:groupContext="shape" transform="translate(684.12,-153.375)">
+ <title>Square.87</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape88-193" v:mID="88" v:groupContext="shape" transform="translate(720.12,-153.375)">
+ <title>Square.88</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape89-195" v:mID="89" v:groupContext="shape" transform="translate(756.12,-153.375)">
+ <title>Square.89</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape90-198" v:mID="90" v:groupContext="shape" transform="translate(612.12,-117.375)">
+ <title>Square.90</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape91-200" v:mID="91" v:groupContext="shape" transform="translate(648.12,-117.375)">
+ <title>Square.91</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape92-203" v:mID="92" v:groupContext="shape" transform="translate(684.12,-117.375)">
+ <title>Square.92</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st3"/>
+ </g>
+ <g id="shape93-205" v:mID="93" v:groupContext="shape" transform="translate(720.12,-117.375)">
+ <title>Square.93</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape94-208" v:mID="94" v:groupContext="shape" transform="translate(756.12,-117.375)">
+ <title>Square.94</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape95-210" v:mID="95" v:groupContext="shape" transform="translate(612.12,-81.375)">
+ <title>Square.95</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape96-213" v:mID="96" v:groupContext="shape" transform="translate(648.12,-81.375)">
+ <title>Square.96</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape97-215" v:mID="97" v:groupContext="shape" transform="translate(684.12,-81.3749)">
+ <title>Square.97</title>
+ <desc>2/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>2/16</text> </g>
+ <g id="shape98-218" v:mID="98" v:groupContext="shape" transform="translate(720.12,-81.3749)">
+ <title>Square.98</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape99-220" v:mID="99" v:groupContext="shape" transform="translate(756.12,-81.3749)">
+ <title>Square.99</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape100-222" v:mID="100" v:groupContext="shape" transform="translate(612.12,-45.375)">
+ <title>Square.100</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape101-224" v:mID="101" v:groupContext="shape" transform="translate(648.12,-45.375)">
+ <title>Square.101</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape102-226" v:mID="102" v:groupContext="shape" transform="translate(684.12,-45.375)">
+ <title>Square.102</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape103-228" v:mID="103" v:groupContext="shape" transform="translate(720.12,-45.375)">
+ <title>Square.103</title>
+ <desc>1/16</desc>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="18" cy="225.495" width="36.01" height="36"/>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ <text x="6.56" y="229.1" class="st2" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>1/16</text> </g>
+ <g id="shape104-231" v:mID="104" v:groupContext="shape" transform="translate(756.12,-45.375)">
+ <title>Square.104</title>
+ <v:userDefs>
+ <v:ud v:nameU="visVersion" v:val="VT0(15):26"/>
+ </v:userDefs>
+ <rect x="0" y="207.495" width="36" height="36" class="st1"/>
+ </g>
+ <g id="shape236-233" v:mID="236" v:groupContext="shape" transform="translate(54.12,-18.375)">
+ <title>Sheet.236</title>
+ <desc>d = 0, 4</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="229.995" width="108" height="27"/>
+ <rect x="0" y="216.495" width="108" height="27" class="st4"/>
+ <text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+ class="st6">= 0, 4</tspan></text> </g>
+ <g id="shape237-238" v:mID="237" v:groupContext="shape" transform="translate(252.12,-18.375)">
+ <title>Sheet.237</title>
+ <desc>d = 1, 5</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="229.995" width="108" height="27"/>
+ <rect x="0" y="216.495" width="108" height="27" class="st4"/>
+ <text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+ class="st6">= 1, 5</tspan></text> </g>
+ <g id="shape238-243" v:mID="238" v:groupContext="shape" transform="translate(450.12,-18.375)">
+ <title>Sheet.238</title>
+ <desc>d = 2, 6</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="229.995" width="108" height="27"/>
+ <rect x="0" y="216.495" width="108" height="27" class="st4"/>
+ <text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+ class="st6">= 2, 6</tspan></text> </g>
+ <g id="shape239-248" v:mID="239" v:groupContext="shape" transform="translate(648.12,-18.375)">
+ <title>Sheet.239</title>
+ <desc>d = 3, 7</desc>
+ <v:textBlock v:margins="rect(4,4,4,4)"/>
+ <v:textRect cx="54" cy="229.995" width="108" height="27"/>
+ <rect x="0" y="216.495" width="108" height="27" class="st4"/>
+ <text x="27.42" y="235.4" class="st5" v:langID="1033"><v:paragraph v:horizAlign="1"/><v:tabList/>d<tspan class="st6"> </tspan><tspan
+ class="st6">= 3, 7</tspan></text> </g>
+ </g>
+</svg>
diff --git a/third_party/aom/doc/img/tx_basis.svg b/third_party/aom/doc/img/tx_basis.svg
new file mode 100644
index 0000000000..eb27b0314b
--- /dev/null
+++ b/third_party/aom/doc/img/tx_basis.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 381.36 266.69"><defs><style>.cls-1,.cls-21{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-15,.cls-24,.cls-5{font-size:11.04px;}.cls-5{font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{font-size:7.32px;}.cls-14,.cls-15{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{letter-spacing:0.01em;}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-13);}.cls-21{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}.cls-22{clip-path:url(#clip-path-14);}.cls-23{clip-path:url(#clip-path-15);}.cls-24,.cls-25,.cls-33{font-family:CambriaMath, Cambria Math;}.cls-25{font-size:8.04px;}.cls-26{fill-rule:evenodd;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-35);}.cls-29{clip-path:url(#clip-path-47);}.cls-30{letter-spacing:0em;}.cls-31{letter-spacing:-0.01em;}.cls-32{clip-path:url(#clip-path-98);}.cls-33{font-size:11.06px;}</style><clipPath id="clip-path" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="1.92" width="381.22" height="594.46"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="1.92" y="339.53" width="381.22" height="17.16"/></clipPath><clipPath id="clip-path-13" transform="translate(-1.43 -338.09)"><rect class="cls-1" width="385.18" height="598.42"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="132.72" y="370.06" width="181.68" height="53.04"/></clipPath><clipPath id="clip-path-15" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="0.96" y="0.94" width="382.08" 
height="595.32"/></clipPath><clipPath id="clip-path-35" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="137.4" y="411.46" width="131.16" height="27.24"/></clipPath><clipPath id="clip-path-47" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="131.52" y="451.78" width="194.28" height="41.52"/></clipPath><clipPath id="clip-path-98" transform="translate(-1.43 -338.09)"><rect class="cls-1" x="134.4" y="566.98" width="105.84" height="18.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" x="0.01" y="0.96" width="381.34" height="18.24"/></g><g class="cls-4"><text class="cls-5" transform="translate(21.49 13.8)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan x="46.53" y="0"> </tspan><tspan class="cls-10" x="49.02" y="0">T</tspan><tspan class="cls-7" x="54.42" y="0">y</tspan><tspan class="cls-6" x="59.46" y="0">p</tspan><tspan x="65.22" y="0">e</tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(164.19 13.2)">Basis f<tspan class="cls-11" x="28.33" y="0">u</tspan><tspan class="cls-6" x="34.08" y="0">n</tspan><tspan class="cls-8" x="39.85" y="0">cti</tspan><tspan class="cls-12" x="50.67" y="0">o</tspan><tspan class="cls-13" x="56.55" y="0">n</tspan><tspan class="cls-8" x="62.31" y="0" xml:space="preserve"> T</tspan></text></g><g class="cls-4"><text class="cls-14" transform="translate(234.54 15.36)">i</text></g><g class="cls-4"><text class="cls-5" transform="translate(238.26 13.2)">(</text></g><g class="cls-4"><text class="cls-15" transform="translate(241.62 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(246.3 13.2)">), </text></g><g class="cls-4"><text class="cls-15" transform="translate(254.94 13.2)">i</text></g><g 
class="cls-4"><text class="cls-5" transform="translate(259.5 13.2)">, </text></g><g class="cls-4"><text class="cls-15" transform="translate(264.78 13.2)">j</text></g><g class="cls-4"><text class="cls-5" transform="translate(269.46 13.2)"> <tspan class="cls-16" x="2.5" y="0">=</tspan><tspan class="cls-17" x="8.02" y="0"> </tspan><tspan class="cls-18" x="10.51" y="0">0</tspan><tspan class="cls-17" x="16.16" y="0">, </tspan><tspan class="cls-18" x="21.41" y="0">1</tspan><tspan class="cls-17" x="27.06" y="0">, </tspan><tspan class="cls-19" x="32.31" y="0">…</tspan><tspan x="40" y="0">, N</tspan><tspan class="cls-11" x="52.38" y="0">-</tspan><tspan class="cls-17" x="55.72" y="0">1</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(43.69 67.44)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">CT-2</tspan></text><text class="cls-5" transform="translate(44.17 137.55)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-4</tspan></text><text class="cls-5" transform="translate(44.17 188.67)"><tspan class="cls-7">D</tspan><tspan x="6.83" y="0">ST-7</tspan></text><text class="cls-5" transform="translate(49.81 237.99)">IDT</text></g><g class="cls-20"><line class="cls-21" x1="113.25" y1="1.98" x2="113.25" y2="17.58"/><rect x="113.19" y="1.92" width="0.96" height="15.72"/><line class="cls-21" x1="113.25" y1="20.58" x2="113.25" y2="256.79"/><rect x="113.19" y="20.52" width="0.96" height="236.33"/><rect x="0.01" width="381.34" height="1.92"/><rect x="0.01" y="17.64" width="381.34" height="0.96"/><rect x="0.01" y="19.56" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="107.97" x2="381.29" y2="107.97"/><rect x="0.01" y="107.91" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="158.73" x2="381.29" y2="158.73"/><rect x="0.01" y="158.67" width="381.34" height="0.96"/><line class="cls-21" x1="0.07" y1="210.21" x2="381.29" y2="210.21"/><rect x="0.01" y="210.15" width="381.34" height="0.96"/><rect x="0.01" y="256.85" 
width="381.34" height="1.92"/></g><g class="cls-22"><path d="M139.1,395.72h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c0-.17.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M142.24,395.35a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(143.1 57.63)">(</text></g><g class="cls-23"><path d="M151.06,395.81a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81A.39.39,0,0,0,151,391a.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.86 57.63)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(159.42 57.63)">=</text></g><g class="cls-23"><path 
d="M177.09,395.81a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(178.74 59.91)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(186.06 57.63)">∙</text></g><polygon class="cls-26" points="198.35 34.22 200.03 34.22 200.03 34.26 207.86 34.26 207.86 34.98 199.58 34.98 199.58 34.93 198.95 34.93 196.54 71.02 196.04 71.02 192.6 64.69 191.58 65.23 191.37 64.85 193.31 63.82 196.06 68.91 198.35 34.22"/><rect x="199.58" y="54.06" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(200.72 49.23)">2</text></g><g class="cls-23"><path d="M208.83,395.8l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75H206l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text 
class="cls-24" transform="translate(210.44 57.63)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(215.6 57.63)"><tspan class="cls-27">c</tspan><tspan x="4.91" y="0">os</tspan></text></g><path class="cls-26" d="M240,378.52l.25.41a12.83,12.83,0,0,0-2.94,5.56,32,32,0,0,0,0,16.15,12.9,12.9,0,0,0,2.95,5.63l-.25.4a12.67,12.67,0,0,1-3.52-5.71,27.72,27.72,0,0,1,0-16.78A12.76,12.76,0,0,1,240,378.52Zm65.56,0a12.76,12.76,0,0,1,3.52,5.66,27.72,27.72,0,0,1,0,16.78,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.79,12.79,0,0,0,2.95-5.63,29.71,29.71,0,0,0,1-8.05,29.13,29.13,0,0,0-1-8.1,12.83,12.83,0,0,0-2.94-5.56Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="239.18 54.06 271.34 54.06 303.5 54.06 303.5 54.78 271.34 54.78 239.18 54.78 239.18 54.06"/><g class="cls-23"><path d="M244.35,384.9c.06-.28.14-.63.26-1s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.83,1.83,0,0,0-.58.51l-.34-.28c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(248.36 49.23)">∙</text></g><g class="cls-23"><path d="M256.26,383.69a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(259.76 49.23)">∙</text></g><path class="cls-26" 
d="M270.53,379.09l.14.41a3.15,3.15,0,0,0-1.87,1.62,6.76,6.76,0,0,0-.6,3,7.06,7.06,0,0,0,.6,3.11,3.16,3.16,0,0,0,1.86,1.64l-.13.41a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.64A3.86,3.86,0,0,1,270.53,379.09Zm30.2,0a3.87,3.87,0,0,1,2.41,1.79,6.16,6.16,0,0,1,.85,3.32,6.23,6.23,0,0,1-.84,3.32,3.9,3.9,0,0,1-2.42,1.79l-.13-.41a3.14,3.14,0,0,0,1.85-1.64,7.06,7.06,0,0,0,.6-3.11,6.76,6.76,0,0,0-.6-3,3.12,3.12,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M274.69,386.46a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.61-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1A5.92,5.92,0,0,1,273,380a4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31,1a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(282.2 49.23)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(292.88 49.23)">1</text></g><g class="cls-23"><path 
d="M269.41,402.3a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57h-4.43v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1.05a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm10.12-6.5-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,2.63,2.63,0,0,0,.13-.32c0-.12.09-.36.17-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-28"><text class="cls-5" transform="translate(136.02 91.95)">where </text></g><g class="cls-23"><path 
d="M175.77,430.13a1.21,1.21,0,0,1-1.22-.89h0a1.94,1.94,0,0,1-.39.48,1.53,1.53,0,0,1-.52.29,2.11,2.11,0,0,1-.75.12,1.65,1.65,0,0,1-.76-.19,1.37,1.37,0,0,1-.56-.58,2,2,0,0,1-.22-1,3.43,3.43,0,0,1,.63-1.93,5.86,5.86,0,0,1,1.78-1.67l.27.37a5.18,5.18,0,0,0-1.28,1.51,3.62,3.62,0,0,0-.47,1.75,1.61,1.61,0,0,0,.24.95.77.77,0,0,0,.66.33.88.88,0,0,0,.68-.32,2.1,2.1,0,0,0,.41-.95l.45-2h.9l-.41,1.85a3.25,3.25,0,0,0-.07.59.94.94,0,0,0,.19.65.66.66,0,0,0,.52.21,1.13,1.13,0,0,0,.8-.33,2.1,2.1,0,0,0,.53-.95,4.86,4.86,0,0,0,.19-1.45,4,4,0,0,0-.13-1,2.57,2.57,0,0,0-.38-.85l.37-.32a3.59,3.59,0,0,1,.81,1.13,3.25,3.25,0,0,1,.27,1.32,3.35,3.35,0,0,1-.31,1.46,2.49,2.49,0,0,1-.89,1.05,2.34,2.34,0,0,1-1.31.38Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-25" transform="translate(177.42 94.23)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(185.34 91.95)">=</text></g><path class="cls-26" d="M202.24,421.8l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.1,3.1,0,0,0,1.86,1.64l-.13.42a3.87,3.87,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.93,3.93,0,0,1,202.24,421.8Zm24.92,0a3.91,3.91,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.08,3.08,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M203.92,426.41a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(208.02 91.95)">=</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.3 91.95)">0</text></g><g class="cls-23"><text class="cls-24" transform="translate(229.98 
91.95)">?</text></g><polygon class="cls-26" points="247.1 75.66 253.22 75.66 253.22 76.38 247.51 76.38 247.51 76.41 246.43 76.41 244.03 100.31 243.53 100.31 240.2 94.2 239.21 94.72 239 94.34 240.87 93.35 243.58 98.36 245.82 75.69 247.1 75.69 247.1 75.66"/><rect x="247.1" y="88.38" width="6.12" height="0.72"/><g class="cls-23"><text class="cls-25" transform="translate(248 85.47)">2</text></g><g class="cls-23"><path d="M254.25,430.2l0,.19a.83.83,0,0,0-.21.07.46.46,0,0,0-.12.12,1.06,1.06,0,0,0-.1.25,3.94,3.94,0,0,0-.14.54l-.92,4.19h-.54L251,432.19c-.14-.4-.26-.8-.37-1.2h0c0,.13-.06.38-.13.76s-.15.74-.22,1.1l-.35,1.57a3.29,3.29,0,0,0-.08.61.3.3,0,0,0,.09.25.5.5,0,0,0,.29.09l0,.19h-1.37l0-.19a.51.51,0,0,0,.22-.08.39.39,0,0,0,.13-.15,1.85,1.85,0,0,0,.09-.23c0-.09.07-.26.13-.52l.67-3a2.21,2.21,0,0,0,.06-.32,1.71,1.71,0,0,0,0-.31.29.29,0,0,0-.1-.26.52.52,0,0,0-.29-.09l0-.19h1.26l1,3.06c.15.44.27.81.35,1.11h0c0-.16.07-.42.15-.79s.14-.7.2-1l.28-1.23a3.58,3.58,0,0,0,.08-.63.31.31,0,0,0-.09-.26.52.52,0,0,0-.29-.09l0-.19Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(255.08 91.95)">:</text></g><g class="cls-28"><text class="cls-24" transform="translate(259.76 91.95)">1</text></g><g class="cls-29"><path d="M138,477.48h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39,0h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path 
d="M141.13,477.11a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0H142l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(141.99 139.39)">(</text></g><g class="cls-23"><path d="M150,477.57a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88A3.79,3.79,0,0,0,150,473a.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(150.75 139.39)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.31 139.39)">=</text></g><polygon class="cls-26" points="176.73 115.98 178.42 115.98 178.42 116.03 186.25 116.03 186.25 116.75 177.97 116.75 177.97 116.7 177.34 116.7 174.93 152.79 174.43 152.79 170.99 146.45 169.97 147 169.76 146.62 171.7 145.59 174.45 150.68 176.73 115.98"/><rect x="177.97" y="135.83" width="8.28" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(179.1 130.99)">2</text></g><g class="cls-23"><path 
d="M187.23,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c0,.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(188.82 139.39)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(193.98 139.39)"><tspan class="cls-12">s</tspan><tspan class="cls-30" x="4.8" y="0">i</tspan><tspan x="7.91" y="0">n</tspan></text></g><path class="cls-26" d="M216.8,460.29l.26.4a12.88,12.88,0,0,0-2.95,5.57,31.83,31.83,0,0,0,0,16.14,12.84,12.84,0,0,0,2.95,5.64l-.26.4a12.75,12.75,0,0,1-3.51-5.71,25.79,25.79,0,0,1-1.3-8.38,25.53,25.53,0,0,1,1.3-8.4A12.78,12.78,0,0,1,216.8,460.29Zm100,0a12.71,12.71,0,0,1,3.52,5.66,25.8,25.8,0,0,1,1.3,8.4,26.07,26.07,0,0,1-1.3,8.38,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.93,12.93,0,0,0,2.94-5.64,31.83,31.83,0,0,0,0-16.14,13,13,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="216.01 135.83 248.93 135.83 281.85 135.83 314.77 135.83 314.77 136.55 281.85 136.55 248.93 136.55 216.01 136.55 216.01 135.83"/><g class="cls-23"><path 
d="M221.17,466.67c.06-.29.14-.64.26-1.05s.21-.71.29-.93l0-.05h-1.58l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6H218l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3A1.58,1.58,0,0,1,219,464a2.52,2.52,0,0,1,.58-.06H224l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(225.3 130.99)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(230.46 130.99)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(235.02 130.99)">2</text></g><g class="cls-23"><path d="M243.88,465.45a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(247.38 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.06 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(264.18 130.99)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(271.16 130.99)">∙</text></g><path class="cls-26" d="M281.91,460.85l.15.42a3.06,3.06,0,0,0-1.87,1.62,6.73,6.73,0,0,0-.61,3,7,7,0,0,0,.61,3.11,3.05,3.05,0,0,0,1.85,1.64l-.13.42a3.9,3.9,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A4,4,0,0,1,281.91,460.85Zm30.08,0a3.93,3.93,0,0,1,2.42,1.79,7,7,0,0,1,0,6.65,3.9,3.9,0,0,1-2.42,1.79l-.13-.42a3.05,3.05,0,0,0,1.85-1.64,6.89,6.89,0,0,0,.61-3.11,6.74,6.74,0,0,0-.6-3,3.08,3.08,0,0,0-1.88-1.62Z" 
transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M286.09,468.22a2.46,2.46,0,0,0,.43,0,.78.78,0,0,0,.26-.11.62.62,0,0,0,.15-.2c0-.08.09-.21.15-.37h.45l-.09,1.57H283v-.26a6.59,6.59,0,0,1,.62-1.16,11.52,11.52,0,0,1,1.12-1.39q.62-.69.9-1a4.89,4.89,0,0,0,.45-.67,2.42,2.42,0,0,0,.21-.53,2.22,2.22,0,0,0,.06-.51,1.92,1.92,0,0,0-.12-.71,1.08,1.08,0,0,0-.4-.49,1.17,1.17,0,0,0-.67-.18,1.37,1.37,0,0,0-1.36,1.12h-.65v-1a5.92,5.92,0,0,1,1.22-.45,4.45,4.45,0,0,1,1-.13,2.27,2.27,0,0,1,1.51.43,1.58,1.58,0,0,1,.52,1.28,2.2,2.2,0,0,1-.06.53,2.27,2.27,0,0,1-.18.47,3.74,3.74,0,0,1-.32.5c-.14.18-.29.37-.46.55s-.45.49-.85.93a10.93,10.93,0,0,0-1.51,1.89Zm4.31.95a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31-.05h.31Zm1.62-7.36-.25,1.05h-.95l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(293.48 130.99)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(304.16 130.99)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(258.18 146.83)">4</text></g><g class="cls-23"><path d="M273.51,477.56l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1.05c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26H266l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g 
class="cls-23"><path d="M138.55,528.68h-2.27l.06-.26a1.15,1.15,0,0,0,.32-.09.48.48,0,0,0,.18-.17,1.29,1.29,0,0,0,.16-.35c.05-.16.11-.41.18-.74l1.16-5.27h-.69a1,1,0,0,0-.93.45,6.74,6.74,0,0,0-.49.89h-.51l.4-1.82h5.63l-.42,1.9h-.52a7.68,7.68,0,0,0,0-.79.92.92,0,0,0-.1-.39.37.37,0,0,0-.18-.19,1,1,0,0,0-.39-.05h-.8l-1.17,5.31c0,.14-.06.25-.07.34l0,.26a2.32,2.32,0,0,0,0,.24.48.48,0,0,0,.06.26.28.28,0,0,0,.16.14,1.61,1.61,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path d="M141.69,528.31a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,.05.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09,3,3,0,0,0,.38-.37l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(142.55 190.59)">(</text></g><g class="cls-23"><path d="M150.53,528.77a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1.05h-1l.25-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(151.33 190.59)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(158.89 190.59)">=</text></g><polygon class="cls-26" points="177.29 167.4 178.97 167.4 178.97 167.45 212.13 167.45 212.13 168.17 178.53 168.17 178.53 168.12 177.89 168.12 175.48 204.21 174.98 204.21 171.54 197.87 170.52 198.42 170.31 198.03 172.25 197.01 175 202.09 177.29 167.4"/><rect x="178.53" y="187.01" width="33.6" height="0.72"/><g class="cls-23"><text class="cls-24" transform="translate(192.25 182.19)">4</text></g><g class="cls-23"><text 
class="cls-24" transform="translate(178.57 198.03)">2</text></g><g class="cls-23"><path d="M193.9,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74l-1.57-4.63c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72L191,533c.21.6.36,1.12.48,1.53h.06c0-.22.09-.58.19-1.09s.2-1,.29-1.34l.37-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(195.49 198.03)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(206.05 198.03)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(214.69 190.59)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(219.85 190.59)"><tspan class="cls-27">s</tspan><tspan class="cls-31" x="4.79" y="0">i</tspan><tspan x="7.8" y="0">n</tspan></text></g><path class="cls-26" d="M242.68,511.47l.25.4a12.87,12.87,0,0,0-2.94,5.57,31.83,31.83,0,0,0,0,16.14,12.93,12.93,0,0,0,2.94,5.64l-.25.4a12.67,12.67,0,0,1-3.52-5.71,26.08,26.08,0,0,1-1.3-8.39,25.69,25.69,0,0,1,1.3-8.39A12.71,12.71,0,0,1,242.68,511.47Zm94,0a12.71,12.71,0,0,1,3.52,5.66,25.43,25.43,0,0,1,1.3,8.39,25.81,25.81,0,0,1-1.3,8.39,12.67,12.67,0,0,1-3.52,5.71l-.25-.4a12.84,12.84,0,0,0,2.95-5.64,29.66,29.66,0,0,0,1-8.05,29,29,0,0,0-1-8.09,12.87,12.87,0,0,0-2.94-5.57Z" transform="translate(-1.43 -338.09)"/><polygon class="cls-26" points="241.89 187.01 288.27 187.01 334.65 187.01 334.65 187.73 288.27 187.73 241.89 187.73 241.89 187.01"/><g class="cls-23"><path 
d="M247,517.87c.06-.29.14-.64.26-1s.21-.71.29-.93l0,0H246l-.66,3.07c-.06.24-.12.5-.19.77s-.13.48-.17.6h-1.14l.05-.26a6.09,6.09,0,0,0,1.06-2.66l.35-1.52h-.17a1.38,1.38,0,0,0-.64.13,1.76,1.76,0,0,0-.58.52l-.34-.29c.19-.23.34-.4.46-.52a2,2,0,0,1,.37-.3,1.58,1.58,0,0,1,.44-.19,2.52,2.52,0,0,1,.58-.06h4.42l-.15.71h-1.29l-.61,2.7a4.59,4.59,0,0,0-.11.8.53.53,0,0,0,.07.31.26.26,0,0,0,.22.1c.21,0,.47-.18.76-.52l.31.29a3.45,3.45,0,0,1-.75.65,1.39,1.39,0,0,1-.72.2.73.73,0,0,1-.57-.25,1,1,0,0,1-.21-.66,5.29,5.29,0,0,1,.15-1.05Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(251.05 182.19)">∙</text></g><g class="cls-23"><text class="cls-24" transform="translate(256.23 182.19)">(</text></g><g class="cls-23"><text class="cls-24" transform="translate(260.79 182.19)">2</text></g><g class="cls-23"><path d="M269.65,516.65a4.12,4.12,0,0,0,.12-.81.39.39,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32,0h.32l-.78,3.46a4,4,0,0,0-.12.8.53.53,0,0,0,.07.31.27.27,0,0,0,.22.1.63.63,0,0,0,.35-.14,3.19,3.19,0,0,0,.51-.5l.31.3a5,5,0,0,1-.86.76,1.4,1.4,0,0,1-.73.2.72.72,0,0,1-.57-.26,1,1,0,0,1-.21-.65,4.26,4.26,0,0,1,.16-1.05Zm1.74-3.64-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(273.15 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(283.83 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(289.95 182.19)">)</text></g><g class="cls-23"><text class="cls-24" transform="translate(296.91 182.19)">∙</text></g><path class="cls-26" d="M307.67,512l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,307.67,512Zm24.08,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" 
transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M310,520.37a4.38,4.38,0,0,1-.47,1.32,2,2,0,0,1-.77.75,2.38,2.38,0,0,1-1.13.25,2,2,0,0,1-.46-.05l.12-.48a1.31,1.31,0,0,0,.41,0,1.16,1.16,0,0,0,.44-.07,1.14,1.14,0,0,0,.34-.26,2,2,0,0,0,.29-.51,5.39,5.39,0,0,0,.24-.83l.87-3.88a3.79,3.79,0,0,0,.13-.81.39.39,0,0,0-.12-.33.85.85,0,0,0-.45-.1l.06-.28,1.31,0h.31Zm1.62-7.36-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-24" transform="translate(313.23 182.19)">+</text></g><g class="cls-23"><text class="cls-24" transform="translate(323.91 182.19)">1</text></g><g class="cls-23"><text class="cls-24" transform="translate(281.07 198.03)">2</text></g><g class="cls-23"><path d="M296.4,528.76l-.06.27a.82.82,0,0,0-.29.09.49.49,0,0,0-.16.16,1.61,1.61,0,0,0-.15.34c-.05.15-.11.4-.19.75l-1.26,5.75h-.74L292,531.49c-.19-.55-.36-1.1-.5-1.65h-.07q0,.27-.18,1c-.1.51-.2,1-.31,1.51l-.47,2.15a3.82,3.82,0,0,0-.12.84.39.39,0,0,0,.13.34.72.72,0,0,0,.4.13l-.06.26h-1.88l.06-.26a.86.86,0,0,0,.3-.11.7.7,0,0,0,.18-.2,1.66,1.66,0,0,0,.12-.32q.06-.18.18-.72l.92-4.15c0-.15.06-.3.09-.44a3,3,0,0,0,0-.43.42.42,0,0,0-.13-.35.76.76,0,0,0-.4-.11l.06-.27h1.72l1.42,4.2c.21.6.36,1.12.48,1.53H294c0-.22.09-.58.19-1.09s.2-1,.28-1.34l.38-1.7a4.05,4.05,0,0,0,.12-.87.42.42,0,0,0-.13-.35.74.74,0,0,0-.4-.11l.06-.27Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-32"><path d="M139.76,577.52h-2.27l0-.26a1.46,1.46,0,0,0,.32-.09A.48.48,0,0,0,138,577a1,1,0,0,0,.16-.36c.05-.16.12-.4.19-.73l1.15-5.29h-.68a1.18,1.18,0,0,0-.54.1,1.17,1.17,0,0,0-.4.36,5.41,5.41,0,0,0-.48.89h-.52l.4-1.82H143l-.42,1.9H142c0-.35,0-.61,0-.79a1.14,1.14,0,0,0-.11-.4.47.47,0,0,0-.18-.19,1.18,1.18,0,0,0-.39,0h-.8L139.34,576q0,.21-.06.33a2.28,2.28,0,0,0-.05.51.6.6,0,0,0,.05.26.3.3,0,0,0,.17.14,1.5,1.5,0,0,0,.37.07Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><path 
d="M142.89,577.17a2.84,2.84,0,0,0,.09-.58c0-.12,0-.2-.08-.24a.54.54,0,0,0-.33-.07l0-.21,1,0h.23l-.56,2.52a2.84,2.84,0,0,0-.09.58.44.44,0,0,0,0,.23.18.18,0,0,0,.16.07.45.45,0,0,0,.25-.09A3,3,0,0,0,144,579l.22.22a3.32,3.32,0,0,1-.62.55,1.09,1.09,0,0,1-.54.14.51.51,0,0,1-.41-.18.76.76,0,0,1-.15-.48,3.63,3.63,0,0,1,.11-.76Zm1.27-2.64-.18.76h-.7l.18-.76Z" transform="translate(-1.43 -338.09)"/></g><path class="cls-26" d="M149.39,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,149.39,569.27Zm4.88,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path d="M151.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(160.09 239.43)">=</text></g><path class="cls-26" d="M177,569.27l.14.42a3.09,3.09,0,0,0-1.87,1.62,6.74,6.74,0,0,0-.6,3,7,7,0,0,0,.6,3.11,3.13,3.13,0,0,0,1.86,1.64l-.13.42a3.91,3.91,0,0,1-2.42-1.79,7,7,0,0,1,0-6.65A3.9,3.9,0,0,1,177,569.27Zm31.28,0a3.88,3.88,0,0,1,2.41,1.79,7,7,0,0,1,0,6.65,3.93,3.93,0,0,1-2.42,1.79l-.13-.42a3.11,3.11,0,0,0,1.85-1.64,7,7,0,0,0,.6-3.11,6.74,6.74,0,0,0-.6-3,3.06,3.06,0,0,0-1.87-1.62Z" transform="translate(-1.43 -338.09)"/><g class="cls-23"><path 
d="M178.67,573.88a3.62,3.62,0,0,0,.13-.81.39.39,0,0,0-.12-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.32l-.78,3.47a3.87,3.87,0,0,0-.12.79.63.63,0,0,0,.07.32.28.28,0,0,0,.23.1.6.6,0,0,0,.34-.14,3.64,3.64,0,0,0,.52-.5l.31.3a5.37,5.37,0,0,1-.86.76,1.44,1.44,0,0,1-.74.2.68.68,0,0,1-.56-.26,1,1,0,0,1-.21-.66,4.7,4.7,0,0,1,.15-1Zm1.74-3.64-.25,1h-.95l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(182.77 239.43)">=</text></g><g class="cls-23"><text class="cls-33" transform="translate(191.05 239.43)">=</text></g><g class="cls-23"><path d="M205.73,577.61a4.12,4.12,0,0,1-.47,1.32,1.85,1.85,0,0,1-.77.75,2.4,2.4,0,0,1-1.14.25,1.47,1.47,0,0,1-.45,0l.12-.47a1.72,1.72,0,0,0,.41,0,1.15,1.15,0,0,0,.44-.08,1,1,0,0,0,.34-.26,2.06,2.06,0,0,0,.29-.5,4.84,4.84,0,0,0,.24-.84l.88-3.89a4.33,4.33,0,0,0,.12-.81.42.42,0,0,0-.11-.33.89.89,0,0,0-.46-.1l.06-.28,1.32-.05h.31Zm1.63-7.37-.25,1h-1l.25-1Z" transform="translate(-1.43 -338.09)"/></g><g class="cls-23"><text class="cls-33" transform="translate(213.49 239.43)">?</text></g><g class="cls-23"><text class="cls-33" transform="translate(219.85 239.43)">1</text></g><g class="cls-23"><text class="cls-33" transform="translate(225.97 239.43)">:</text></g><g class="cls-32"><text class="cls-33" transform="translate(230.77 239.43)">0</text></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/tx_cands_large.svg b/third_party/aom/doc/img/tx_cands_large.svg
new file mode 100644
index 0000000000..fb4f5f49bf
--- /dev/null
+++ b/third_party/aom/doc/img/tx_cands_large.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 324.79 73.56"><defs><style>.cls-1,.cls-22{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-16,.cls-17,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0.01em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0.01em;}.cls-14{letter-spacing:0em;}.cls-15{clip-path:url(#clip-path-4);}.cls-16{font-family:Calibri, Calibri;}.cls-17{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-18{letter-spacing:0em;}.cls-19{letter-spacing:0em;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{clip-path:url(#clip-path-10);}.cls-22{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="376.27" height="22.92"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="380.26" height="105.36"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="324.79" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(11.3 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">a</tspan><tspan x="16.33" y="0">x(</tspan><tspan class="cls-7" x="25.58" y="0">w</tspan><tspan class="cls-8" x="34.55" y="0">i</tspan><tspan class="cls-7" x="37.53" 
y="0">d</tspan><tspan x="43.99" y="0">t</tspan><tspan class="cls-9" x="48.15" y="0">h</tspan><tspan x="54.65" y="0">,</tspan><tspan class="cls-10" x="57.75" y="0"> </tspan><tspan class="cls-7" x="60.51" y="0">h</tspan><tspan class="cls-11" x="66.97" y="0">e</tspan><tspan class="cls-12" x="72.98" y="0">i</tspan><tspan class="cls-5" x="75.96" y="0">g</tspan><tspan class="cls-7" x="81.6" y="0">h</tspan><tspan x="88.06" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(158.09 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-13" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(261.07 16.92)"><tspan class="cls-14">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-15"><text class="cls-16" transform="translate(53.18 40.8)"><tspan class="cls-8">3</tspan><tspan x="6.12" y="0">2</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(148.13 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-15"><text class="cls-17" transform="translate(235.75 40.8)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">l</tspan><tspan class="cls-18" x="36.23" y="0">y</tspan><tspan x="41.61" y="0">,</tspan><tspan class="cls-14" x="44.6" y="0" xml:space="preserve"> </tspan><tspan x="50.07" y="0">I</tspan><tspan class="cls-19" x="53.1" y="0">D</tspan><tspan x="60.49" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-16" transform="translate(53.18 64.68)"><tspan class="cls-8">6</tspan><tspan x="6.12" y="0">4</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(148.13 64.68)"><tspan 
class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-20"><text class="cls-17" transform="translate(251.23 64.68)"><tspan class="cls-8">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-6" x="19.54" y="0">O</tspan><tspan class="cls-5" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-21"><line class="cls-22" x1="118.07" y1="1.98" x2="118.07" y2="23.82"/><rect x="118.01" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="221.17" y1="1.98" x2="221.17" y2="23.82"/><rect x="221.11" y="1.92" width="0.96" height="21.96"/><line class="cls-22" x1="118.07" y1="25.86" x2="118.07" y2="71.58"/><rect x="118.01" y="25.8" width="0.96" height="45.84"/><line class="cls-22" x1="221.17" y1="25.86" x2="221.17" y2="71.58"/><rect x="221.11" y="25.8" width="0.96" height="45.84"/><rect width="324.79" height="1.92"/><rect y="23.88" width="324.79" height="1.92"/><rect y="71.64" width="324.79" height="1.92"/></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/tx_cands_small.svg b/third_party/aom/doc/img/tx_cands_small.svg
new file mode 100644
index 0000000000..ddd9a87e53
--- /dev/null
+++ b/third_party/aom/doc/img/tx_cands_small.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 90.27"><defs><style>.cls-1,.cls-30{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-17,.cls-18,.cls-4{font-size:12px;}.cls-17,.cls-18,.cls-19,.cls-4{fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0.01em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0.01em;}.cls-11{letter-spacing:0em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{clip-path:url(#clip-path-4);}.cls-17{font-family:Calibri, Calibri;}.cls-18{font-family:Calibri-Italic, Calibri;}.cls-18,.cls-19{font-style:italic;}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:0em;}.cls-25{clip-path:url(#clip-path-8);}.cls-26{clip-path:url(#clip-path-12);}.cls-27{letter-spacing:0em;}.cls-28{letter-spacing:0em;}.cls-29{clip-path:url(#clip-path-17);}.cls-30{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="31.68" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="55.56" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="79.44" width="431.98" height="22.92"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.04 -30.24)"><rect class="cls-1" x="1.92" y="103.33" width="431.98" height="15.74"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -30.24)"><rect class="cls-1" width="435.94" 
height="673.9"/></clipPath></defs><title>tx_cands_smallAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="24"/><g class="cls-3"><text class="cls-4" transform="translate(8.66 16.92)"><tspan class="cls-5">M</tspan><tspan class="cls-6" x="10.44" y="0">i</tspan><tspan class="cls-7" x="13.42" y="0">n</tspan><tspan x="19.89" y="0">(w</tspan><tspan class="cls-8" x="32.57" y="0">i</tspan><tspan class="cls-9" x="35.59" y="0">d</tspan><tspan x="42.05" y="0">t</tspan><tspan class="cls-10" x="46.21" y="0">h</tspan><tspan x="52.71" y="0">,</tspan><tspan class="cls-11" x="55.8" y="0"> </tspan><tspan class="cls-7" x="58.56" y="0">h</tspan><tspan class="cls-12" x="65.03" y="0">e</tspan><tspan class="cls-13" x="71.03" y="0">i</tspan><tspan class="cls-14" x="74.02" y="0">g</tspan><tspan class="cls-9" x="79.66" y="0">h</tspan><tspan x="86.12" y="0">t)</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(159.77 16.92)"><tspan class="cls-9">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-15" x="13.85" y="0">r</tspan><tspan x="18.18" y="0">a</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(294.19 16.92)"><tspan class="cls-7">In</tspan><tspan x="9.69" y="0">ter</tspan></text></g><g class="cls-16"><text class="cls-17" transform="translate(52.7 40.8)">4</text></g><g class="cls-16"><text class="cls-18" transform="translate(122.57 40.8)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-16"><text class="cls-19" transform="translate(155.93 40.8)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" 
y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-16"><text class="cls-18" transform="translate(290.59 40.8)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-25"><text class="cls-17" transform="translate(52.7 64.68)">8</text></g><g class="cls-25"><text class="cls-18" transform="translate(122.57 64.68)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(155.93 64.68)"><tspan class="cls-20">I</tspan><tspan class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-23" x="25.36" y="0"> </tspan><tspan class="cls-24" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-18" transform="translate(290.59 64.68)">ALL<tspan class="cls-13" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-17" transform="translate(49.58 84.99)"><tspan class="cls-13">1</tspan><tspan x="6.12" y="0">6</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(142.49 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(175.85 84.99)"><tspan class="cls-20">IDT</tspan><tspan x="16.44" y="0">X</tspan></text></g><g class="cls-26"><text class="cls-18" transform="translate(257.11 84.99)"><tspan class="cls-13">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-11" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9, </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(290.47 84.99)"><tspan class="cls-27">I</tspan><tspan 
class="cls-21" x="2.88" y="0">D</tspan><tspan class="cls-22" x="10.7" y="0">TX</tspan><tspan x="22.96" y="0">,</tspan><tspan class="cls-22" x="25.36" y="0"> </tspan><tspan class="cls-28" x="28.37" y="0">1</tspan><tspan class="cls-21" x="34.34" y="0">DD</tspan><tspan x="49.97" y="0">CT</tspan></text></g><g class="cls-29"><line class="cls-30" x1="110.84" y1="1.98" x2="110.84" y2="23.82"/><rect x="110.78" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="231.73" y1="1.98" x2="231.73" y2="23.82"/><rect x="231.67" y="1.92" width="0.96" height="21.96"/><line class="cls-30" x1="110.84" y1="25.86" x2="110.84" y2="88.29"/><rect x="110.78" y="25.81" width="0.96" height="62.54"/><line class="cls-30" x1="231.73" y1="25.86" x2="231.73" y2="88.29"/><rect x="231.67" y="25.81" width="0.96" height="62.54"/><rect width="380.5" height="1.92"/><rect y="23.88" width="380.5" height="1.92"/><rect y="88.35" width="380.5" height="1.92"/></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/tx_chroma.svg b/third_party/aom/doc/img/tx_chroma.svg
new file mode 100644
index 0000000000..a0915e0031
--- /dev/null
+++ b/third_party/aom/doc/img/tx_chroma.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 380.5 244.23"><defs><style>.cls-1,.cls-41{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-4{font-size:12px;font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-19,.cls-4{fill:#333;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0.01em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-3);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0.01em;}.cls-18{clip-path:url(#clip-path-5);}.cls-19{font-size:11.04px;font-family:SegoeUI-Italic, Segoe UI;font-style:italic;}.cls-20{letter-spacing:0em;}.cls-21{letter-spacing:0em;}.cls-22{clip-path:url(#clip-path-8);}.cls-23{letter-spacing:0em;}.cls-24{letter-spacing:-0.01em;}.cls-25{clip-path:url(#clip-path-11);}.cls-26{letter-spacing:0em;}.cls-27{letter-spacing:0em;}.cls-28{clip-path:url(#clip-path-14);}.cls-29{letter-spacing:0em;}.cls-30{letter-spacing:0em;}.cls-31{clip-path:url(#clip-path-17);}.cls-32{clip-path:url(#clip-path-20);}.cls-33{clip-path:url(#clip-path-23);}.cls-34{clip-path:url(#clip-path-26);}.cls-35{clip-path:url(#clip-path-29);}.cls-36{clip-path:url(#clip-path-32);}.cls-37{clip-path:url(#clip-path-35);}.cls-38{clip-path:url(#clip-path-38);}.cls-39{clip-path:url(#clip-path-41);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="53.52" y="17.15" width="110.3" height="30.24"/></clipPath><clipPath id="clip-path-3" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="431.98" height="30.24"/></clipPath><clipPath id="clip-path-5" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" 
y="48.35" width="431.98" height="15"/></clipPath><clipPath id="clip-path-8" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="431.98" height="15"/></clipPath><clipPath id="clip-path-11" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="431.98" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="431.98" height="15"/></clipPath><clipPath id="clip-path-17" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="431.98" height="15.02"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="431.98" height="15"/></clipPath><clipPath id="clip-path-23" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="431.98" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="431.98" height="15"/></clipPath><clipPath id="clip-path-29" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="431.98" height="15"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="431.98" height="15"/></clipPath><clipPath id="clip-path-35" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="207.98" width="431.98" height="15"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="431.98" height="17.88"/></clipPath><clipPath id="clip-path-41" transform="translate(-53.04 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="431.98" height="15.72"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.04 -15.71)"><rect class="cls-1" width="435.94" height="567.07"/></clipPath></defs><title>tx_chromaAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" 
data-name="Layer 1"><rect class="cls-2" y="0.96" width="380.5" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(16.58 12.84)"><tspan class="cls-5">In</tspan><tspan x="9.69" y="0">t</tspan><tspan class="cls-6" x="13.85" y="0">r</tspan><tspan class="cls-7" x="18.18" y="0">a</tspan><tspan class="cls-5" x="24.07" y="0"> </tspan><tspan x="26.81" y="0">P</tspan><tspan class="cls-8" x="33.2" y="0">r</tspan><tspan class="cls-7" x="37.47" y="0">e</tspan><tspan class="cls-5" x="43.48" y="0">d</tspan><tspan class="cls-9" x="49.94" y="0">i</tspan><tspan x="52.93" y="0">c</tspan><tspan class="cls-9" x="57.95" y="0">ti</tspan><tspan x="65.01" y="0">o</tspan><tspan class="cls-10" x="71.46" y="0">n</tspan><tspan x="77.95" y="0"> </tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(41.06 28.44)"><tspan class="cls-11">M</tspan><tspan x="10.44" y="0">o</tspan><tspan class="cls-10" x="16.89" y="0">d</tspan><tspan x="23.38" y="0">e</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(126.17 20.64)">V<tspan class="cls-11" x="7.1" y="0">e</tspan><tspan class="cls-13" x="13.09" y="0">r</tspan><tspan class="cls-6" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-10" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-6" x="45.3" y="0">r</tspan><tspan class="cls-7" x="49.64" y="0">a</tspan><tspan class="cls-5" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-9" x="66.78" y="0">f</tspan><tspan x="70.61" y="0">o</tspan><tspan class="cls-14" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-12"><text class="cls-4" transform="translate(253.87 20.64)">Ho<tspan class="cls-10" x="14.02" y="0">r</tspan><tspan class="cls-9" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-10" x="26.09" y="0">o</tspan><tspan class="cls-15" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan 
class="cls-14" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-6" x="59.01" y="0">r</tspan><tspan class="cls-7" x="63.35" y="0">a</tspan><tspan class="cls-15" x="69.24" y="0">n</tspan><tspan x="75.7" y="0">s</tspan><tspan class="cls-16" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-17" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(9.62 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">C_P</tspan><tspan class="cls-21" x="25.8" y="0">R</tspan><tspan x="32.27" y="0">ED</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(160.37 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(294.91 44.28)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(9.62 60.24)">V_P<tspan class="cls-23" x="17.86" y="0">R</tspan><tspan x="24.35" y="0">ED</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(157.61 60.24)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-22"><text class="cls-19" transform="translate(294.91 60.24)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(9.62 76.2)">H<tspan class="cls-26" x="7.82" y="0">_</tspan><tspan x="12.37" y="0">P</tspan><tspan class="cls-21" x="18.72" y="0">R</tspan><tspan x="25.19" y="0">ED</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(160.37 76.2)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(292.15 76.2)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g 
class="cls-28"><text class="cls-19" transform="translate(9.62 92.16)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">45</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(160.37 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(294.91 92.16)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(9.62 108.15)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">135</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(157.61 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(292.15 108.15)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(9.62 124.11)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">113</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(157.61 124.11)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(294.91 124.11)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(9.62 140.07)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">157</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" 
x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(160.37 140.07)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(292.15 140.07)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(9.62 156.03)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">203</tspan><tspan x="25.74" y="0">_P</tspan><tspan class="cls-29" x="36.67" y="0">R</tspan><tspan x="43.12" y="0">ED</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(160.37 156.03)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(292.15 156.03)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(9.62 171.99)"><tspan class="cls-20">D</tspan><tspan class="cls-29" x="7.82" y="0">67</tspan><tspan x="19.76" y="0">_P</tspan><tspan class="cls-30" x="30.69" y="0">R</tspan><tspan x="37.14" y="0">ED</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(157.61 171.99)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(294.91 171.99)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(9.62 187.95)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">P</tspan><tspan class="cls-21" x="55.58" y="0">R</tspan><tspan x="62.05" y="0">ED</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(157.61 
187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(292.15 187.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(9.62 203.91)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">V_P</tspan><tspan class="cls-23" x="67.1" y="0">R</tspan><tspan x="73.58" y="0">ED</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(157.61 203.91)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(294.91 203.91)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(9.62 221.19)">SM<tspan class="cls-26" x="14.52" y="0">OOT</tspan><tspan x="36.87" y="0">H</tspan><tspan class="cls-26" x="44.69" y="0">_</tspan><tspan x="49.23" y="0">H</tspan><tspan class="cls-26" x="57.05" y="0">_</tspan><tspan x="61.6" y="0">P</tspan><tspan class="cls-21" x="67.95" y="0">R</tspan><tspan x="74.42" y="0">ED</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.37 221.19)"><tspan class="cls-20">D</tspan><tspan x="7.82" y="0">CT</tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(292.15 221.19)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(9.62 238.95)">PAE<tspan class="cls-26" x="18.83" y="0">T</tspan><tspan x="24.61" y="0">H</tspan><tspan class="cls-26" x="32.42" y="0">_</tspan><tspan x="36.97" y="0">P</tspan><tspan class="cls-21" x="43.32" y="0">R</tspan><tspan x="49.79" y="0">ED</tspan></text></g><g 
class="cls-39"><text class="cls-19" transform="translate(157.61 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan x="14.77" y="0">ST</tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(292.15 238.95)">A<tspan class="cls-24" x="6.98" y="0">D</tspan><tspan class="cls-27" x="14.77" y="0">ST</tspan></text></g><g class="cls-40"><line class="cls-41" x1="110.84" y1="1.98" x2="110.84" y2="31.14"/><rect x="110.78" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="231.73" y1="1.98" x2="231.73" y2="31.14"/><rect x="231.67" y="1.92" width="0.96" height="29.28"/><line class="cls-41" x1="110.84" y1="33.18" x2="110.84" y2="242.25"/><rect x="110.78" y="33.13" width="0.96" height="209.18"/><line class="cls-41" x1="231.73" y1="33.18" x2="231.73" y2="242.25"/><rect x="231.67" y="33.13" width="0.96" height="209.18"/><rect width="380.5" height="1.92"/><rect y="31.2" width="380.5" height="1.92"/><rect y="242.31" width="380.5" height="1.92"/></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/tx_partition.svg b/third_party/aom/doc/img/tx_partition.svg
new file mode 100644
index 0000000000..e0ce50c507
--- /dev/null
+++ b/third_party/aom/doc/img/tx_partition.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 172.61 310.73"><defs><style>.cls-1,.cls-38{fill:none;}.cls-2{clip-path:url(#clip-path);}.cls-3{fill:#ddebf7;}.cls-4{clip-path:url(#clip-path-2);}.cls-5{font-size:11.04px;font-family:Calibri, Calibri;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0em;}.cls-12{clip-path:url(#clip-path-4);}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{clip-path:url(#clip-path-8);}.cls-18{clip-path:url(#clip-path-10);}.cls-19{letter-spacing:0.01em;}.cls-20{clip-path:url(#clip-path-12);}.cls-21{clip-path:url(#clip-path-14);}.cls-22{clip-path:url(#clip-path-16);}.cls-23{clip-path:url(#clip-path-18);}.cls-24{clip-path:url(#clip-path-20);}.cls-25{letter-spacing:0.01em;}.cls-26{clip-path:url(#clip-path-22);}.cls-27{clip-path:url(#clip-path-24);}.cls-28{clip-path:url(#clip-path-26);}.cls-29{clip-path:url(#clip-path-28);}.cls-30{clip-path:url(#clip-path-30);}.cls-31{clip-path:url(#clip-path-32);}.cls-32{clip-path:url(#clip-path-34);}.cls-33{clip-path:url(#clip-path-36);}.cls-34{clip-path:url(#clip-path-38);}.cls-35{clip-path:url(#clip-path-40);}.cls-36{clip-path:url(#clip-path-42);}.cls-37{clip-path:url(#clip-path-44);}.cls-38{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="1.92" width="172.49" height="323.09"/></clipPath><clipPath id="clip-path-2" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="17.16" width="86.9" height="29.52"/></clipPath><clipPath id="clip-path-4" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="89.78" y="17.16" width="83.66" height="29.52"/></clipPath><clipPath id="clip-path-8" transform="translate(-1.44 -15.24)"><rect class="cls-1" 
x="1.92" y="62.88" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-10" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="77.4" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-12" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="91.92" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-14" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="106.45" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-16" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="120.99" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-18" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="135.51" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-20" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="150.03" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-22" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="164.55" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-24" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="179.07" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-26" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="193.59" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-28" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="208.11" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-30" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="222.63" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-32" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="237.15" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-34" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="251.67" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-36" 
transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="266.19" width="172.49" height="13.58"/></clipPath><clipPath id="clip-path-38" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="280.73" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-40" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="295.25" width="172.49" height="13.56"/></clipPath><clipPath id="clip-path-42" transform="translate(-1.44 -15.24)"><rect class="cls-1" x="1.92" y="309.77" width="172.49" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-1.44 -15.24)"><rect class="cls-1" width="176.45" height="327.05"/></clipPath></defs><title>tables2Asset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><g class="cls-2"><rect class="cls-3" y="1.44" width="172.61" height="30.6"/></g><g class="cls-4"><text class="cls-5" transform="translate(5.28 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-4"><text class="cls-5" transform="translate(12.96 27.72)">cu<tspan class="cls-11" x="10.47" y="0">r</tspan><tspan x="14.28" y="0">rent depth</tspan></text></g><g class="cls-12"><text class="cls-5" transform="translate(91.46 13.2)">Tra<tspan class="cls-6" x="14.52" y="0">n</tspan><tspan x="20.28" y="0">sf</tspan><tspan class="cls-7" x="27.97" y="0">o</tspan><tspan class="cls-8" x="33.83" y="0">r</tspan><tspan class="cls-9" x="37.67" y="0">m</tspan><tspan class="cls-8" x="46.53" y="0" xml:space="preserve"> size </tspan><tspan class="cls-10" x="68.22" y="0">o</tspan><tspan x="74.1" y="0">f </tspan></text></g><g class="cls-12"><text class="cls-5" 
transform="translate(105.86 27.72)"><tspan class="cls-6">n</tspan><tspan x="5.77" y="0">e</tspan><tspan class="cls-13" x="11.26" y="0">x</tspan><tspan x="16.06" y="0">t</tspan><tspan class="cls-14" x="19.76" y="0"> </tspan><tspan class="cls-6" x="22.28" y="0">d</tspan><tspan x="28.05" y="0">epth</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(27.48 43.32)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-2"><text class="cls-5" transform="translate(113.78 43.32)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(27.48 58.2)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-17"><text class="cls-5" transform="translate(113.78 58.2)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(21.84 72.72)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-18"><text class="cls-5" transform="translate(113.78 72.72)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g 
class="cls-20"><text class="cls-5" transform="translate(21.84 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-20"><text class="cls-5" transform="translate(108.14 87.24)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(21.84 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-21"><text class="cls-5" transform="translate(108.14 101.79)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(27.48 116.31)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-22"><text class="cls-5" transform="translate(113.78 116.31)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan 
class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(27.48 130.83)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-23"><text class="cls-5" transform="translate(113.78 130.83)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(24.72 145.35)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-24"><text class="cls-5" transform="translate(113.78 145.35)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(24.72 159.87)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-26"><text class="cls-5" transform="translate(113.78 159.87)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(21.84 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" 
x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-27"><text class="cls-5" transform="translate(108.14 174.39)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(21.84 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-28"><text class="cls-5" transform="translate(108.14 188.91)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(21.84 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-29"><text class="cls-5" transform="translate(108.14 203.43)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" 
y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(21.84 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-30"><text class="cls-5" transform="translate(108.14 217.95)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(24.72 232.47)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-31"><text class="cls-5" transform="translate(113.78 232.47)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">4</tspan><tspan class="cls-8" x="22.29" y="0">X8</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(24.72 246.99)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X4</tspan></text></g><g class="cls-32"><text class="cls-5" transform="translate(113.78 246.99)">T<tspan class="cls-16" x="5.38" y="0">X</tspan><tspan x="11.14" 
y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X4</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(24.72 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">3</tspan><tspan x="33.68" y="0">2</tspan></text></g><g class="cls-33"><text class="cls-5" transform="translate(110.9 261.53)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">8</tspan><tspan class="cls-8" x="22.29" y="0">X</tspan><tspan class="cls-25" x="28.02" y="0">1</tspan><tspan x="33.68" y="0">6</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(24.72 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-34"><text class="cls-5" transform="translate(110.9 276.05)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X8</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(21.84 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">6</tspan><tspan class="cls-8" x="39.31" y="0">4</tspan></text></g><g class="cls-35"><text class="cls-5" transform="translate(108.14 290.57)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" 
y="0">1</tspan><tspan class="cls-16" x="22.29" y="0">6</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">3</tspan><tspan class="cls-8" x="39.31" y="0">2</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(21.84 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">6</tspan><tspan class="cls-16" x="22.29" y="0">4</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-36"><text class="cls-5" transform="translate(108.14 305.45)">T<tspan class="cls-7" x="5.38" y="0">X</tspan><tspan x="11.14" y="0">_</tspan><tspan class="cls-15" x="16.64" y="0">3</tspan><tspan class="cls-16" x="22.29" y="0">2</tspan><tspan class="cls-8" x="27.92" y="0">X</tspan><tspan class="cls-19" x="33.65" y="0">1</tspan><tspan class="cls-8" x="39.31" y="0">6</tspan></text></g><g class="cls-37"><line class="cls-38" x1="87.44" y1="2.94" x2="87.44" y2="30.42"/><rect x="87.38" y="2.88" width="0.96" height="27.6"/><line class="cls-38" x1="87.44" y1="33.42" x2="87.44" y2="307.79"/><rect x="87.38" y="33.36" width="0.96" height="274.49"/><rect width="172.61" height="2.88"/><rect y="30.48" width="172.61" height="0.96"/><rect y="32.4" width="172.61" height="0.96"/><rect y="307.85" width="172.61" height="2.88"/></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/doc/img/tx_set.svg b/third_party/aom/doc/img/tx_set.svg
new file mode 100644
index 0000000000..dee10d4d93
--- /dev/null
+++ b/third_party/aom/doc/img/tx_set.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 347.4 549.8"><defs><style>.cls-1,.cls-60{fill:none;}.cls-2{fill:#ddebf7;}.cls-3{clip-path:url(#clip-path);}.cls-19,.cls-4{font-size:12px;fill:#333;}.cls-4{font-family:Calibri-Bold, Calibri;font-weight:700;}.cls-5{letter-spacing:0em;}.cls-6{letter-spacing:0em;}.cls-7{letter-spacing:0em;}.cls-8{letter-spacing:0em;}.cls-9{letter-spacing:0em;}.cls-10{letter-spacing:0em;}.cls-11{letter-spacing:0.01em;}.cls-12{letter-spacing:0em;}.cls-13{letter-spacing:0em;}.cls-14{letter-spacing:0em;}.cls-15{letter-spacing:0.01em;}.cls-16{letter-spacing:0em;}.cls-17{letter-spacing:0em;}.cls-18{clip-path:url(#clip-path-4);}.cls-19{font-family:Calibri-Italic, Calibri;font-style:italic;}.cls-20{clip-path:url(#clip-path-7);}.cls-21{letter-spacing:0em;}.cls-22{letter-spacing:0em;}.cls-23{clip-path:url(#clip-path-10);}.cls-24{clip-path:url(#clip-path-12);}.cls-25{clip-path:url(#clip-path-14);}.cls-26{clip-path:url(#clip-path-16);}.cls-27{clip-path:url(#clip-path-18);}.cls-28{clip-path:url(#clip-path-20);}.cls-29{clip-path:url(#clip-path-22);}.cls-30{clip-path:url(#clip-path-24);}.cls-31{clip-path:url(#clip-path-26);}.cls-32{clip-path:url(#clip-path-28);}.cls-33{clip-path:url(#clip-path-30);}.cls-34{clip-path:url(#clip-path-32);}.cls-35{clip-path:url(#clip-path-34);}.cls-36{clip-path:url(#clip-path-36);}.cls-37{clip-path:url(#clip-path-38);}.cls-38{clip-path:url(#clip-path-40);}.cls-39{clip-path:url(#clip-path-42);}.cls-40{clip-path:url(#clip-path-44);}.cls-41{clip-path:url(#clip-path-46);}.cls-42{clip-path:url(#clip-path-48);}.cls-43{clip-path:url(#clip-path-50);}.cls-44{clip-path:url(#clip-path-52);}.cls-45{clip-path:url(#clip-path-54);}.cls-46{clip-path:url(#clip-path-56);}.cls-47{clip-path:url(#clip-path-58);}.cls-48{clip-path:url(#clip-path-60);}.cls-49{clip-path:url(#clip-path-62);}.cls-50{clip-path:url(#clip-path-64);}.cls-51{clip-path:url(#clip-path-66);}.cls-52{clip-path:url(#clip-path-68);
}.cls-53{clip-path:url(#clip-path-70);}.cls-54{clip-path:url(#clip-path-72);}.cls-55{letter-spacing:0.01em;}.cls-56{clip-path:url(#clip-path-73);}.cls-57{clip-path:url(#clip-path-74);}.cls-58{clip-path:url(#clip-path-75);}.cls-59{clip-path:url(#clip-path-76);}.cls-60{stroke:#000;stroke-linecap:square;stroke-linejoin:round;stroke-width:0.14px;}</style><clipPath id="clip-path" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="17.15" width="502.08" height="30.24"/></clipPath><clipPath id="clip-path-4" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="48.35" width="502.08" height="15"/></clipPath><clipPath id="clip-path-7" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="64.31" width="502.08" height="15"/></clipPath><clipPath id="clip-path-10" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="80.27" width="502.08" height="15"/></clipPath><clipPath id="clip-path-12" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="96.23" width="502.08" height="15"/></clipPath><clipPath id="clip-path-14" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="112.2" width="502.08" height="15.02"/></clipPath><clipPath id="clip-path-16" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="128.18" width="502.08" height="15"/></clipPath><clipPath id="clip-path-18" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="144.14" width="502.08" height="15"/></clipPath><clipPath id="clip-path-20" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="160.1" width="502.08" height="15"/></clipPath><clipPath id="clip-path-22" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="176.06" width="502.08" height="15"/></clipPath><clipPath id="clip-path-24" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="192.02" width="502.08" height="15"/></clipPath><clipPath id="clip-path-26" transform="translate(-53.03 
-15.71)"><rect class="cls-1" x="1.92" y="207.98" width="502.08" height="15"/></clipPath><clipPath id="clip-path-28" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="223.94" width="502.08" height="17.88"/></clipPath><clipPath id="clip-path-30" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="242.78" width="502.08" height="15.72"/></clipPath><clipPath id="clip-path-32" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="259.46" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-34" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="274.72" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-36" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="289.96" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-38" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="305.2" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-40" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="320.44" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-42" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="335.68" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-44" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="350.92" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-46" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="366.16" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-48" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="381.4" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-50" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="396.64" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-52" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="411.88" width="502.08" 
height="14.28"/></clipPath><clipPath id="clip-path-54" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="427.13" width="502.08" height="14.3"/></clipPath><clipPath id="clip-path-56" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="442.39" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-58" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="457.63" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-60" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="472.87" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-62" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="488.11" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-64" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="503.35" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-66" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="518.59" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-68" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="533.83" width="502.08" height="14.28"/></clipPath><clipPath id="clip-path-70" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="1.92" y="549.07" width="502.08" height="15"/></clipPath><clipPath id="clip-path-72" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="79.79" width="118.49" height="31.92"/></clipPath><clipPath id="clip-path-73" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="111.72" width="118.49" height="63.86"/></clipPath><clipPath id="clip-path-74" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="175.58" width="118.49" height="144.38"/></clipPath><clipPath id="clip-path-75" transform="translate(-53.03 -15.71)"><rect class="cls-1" x="53.04" y="319.97" width="118.49" height="244.58"/></clipPath><clipPath id="clip-path-76" transform="translate(-53.03 
-15.71)"><rect class="cls-1" width="506.04" height="567.07"/></clipPath></defs><title>tx_setAsset 1</title><g id="Layer_2" data-name="Layer 2"><g id="Layer_1-2" data-name="Layer 1"><rect class="cls-2" x="0.01" y="0.96" width="347.38" height="31.32"/><g class="cls-3"><text class="cls-4" transform="translate(24.51 20.64)"><tspan class="cls-5">Tr</tspan><tspan class="cls-6" x="10.28" y="0">a</tspan><tspan class="cls-7" x="16.17" y="0">n</tspan><tspan x="22.63" y="0">s</tspan><tspan class="cls-5" x="27.42" y="0">f</tspan><tspan x="31.25" y="0">o</tspan><tspan class="cls-8" x="37.7" y="0">r</tspan><tspan class="cls-6" x="42.03" y="0">m</tspan><tspan class="cls-7" x="51.75" y="0"> </tspan><tspan x="54.49" y="0">set</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(127.26 20.64)">V<tspan class="cls-9" x="7.1" y="0">e</tspan><tspan class="cls-10" x="13.09" y="0">r</tspan><tspan class="cls-11" x="17.39" y="0">ti</tspan><tspan x="24.49" y="0">cal</tspan><tspan class="cls-12" x="38.38" y="0"> </tspan><tspan x="41.14" y="0">t</tspan><tspan class="cls-11" x="45.3" y="0">r</tspan><tspan class="cls-13" x="49.64" y="0">a</tspan><tspan class="cls-7" x="55.53" y="0">n</tspan><tspan x="61.99" y="0">s</tspan><tspan class="cls-5" x="66.78" y="0">f</tspan><tspan class="cls-14" x="70.61" y="0">o</tspan><tspan class="cls-15" x="77.06" y="0">r</tspan><tspan x="81.39" y="0">m</tspan></text></g><g class="cls-3"><text class="cls-4" transform="translate(234.8 20.64)">Ho<tspan class="cls-12" x="14.02" y="0">r</tspan><tspan class="cls-5" x="18.33" y="0">i</tspan><tspan x="21.32" y="0">z</tspan><tspan class="cls-12" x="26.09" y="0">o</tspan><tspan class="cls-16" x="32.59" y="0">n</tspan><tspan x="39.05" y="0">tal</tspan><tspan class="cls-8" x="52.08" y="0"> </tspan><tspan x="54.85" y="0">t</tspan><tspan class="cls-11" x="59.01" y="0">r</tspan><tspan class="cls-13" x="63.35" y="0">a</tspan><tspan class="cls-16" x="69.24" y="0">n</tspan><tspan x="75.7" 
y="0">s</tspan><tspan class="cls-17" x="80.49" y="0">f</tspan><tspan x="84.32" y="0">o</tspan><tspan class="cls-15" x="90.77" y="0">r</tspan><tspan x="95.1" y="0">m</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(37.35 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan><tspan class="cls-13" x="19.54" y="0">O</tspan><tspan class="cls-9" x="27.35" y="0">n</tspan><tspan x="33.47" y="0">ly</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(162.06 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-18"><text class="cls-19" transform="translate(276.44 44.16)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(46.95 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">TX</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(163.62 60.12)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-20"><text class="cls-19" transform="translate(278 60.12)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(160.62 76.08)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-23"><text class="cls-19" transform="translate(276.68 76.08)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(163.62 92.04)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T</tspan></text></g><g class="cls-24"><text class="cls-19" transform="translate(276.44 92.04)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(157.62 108.03)">A<tspan class="cls-12" x="6.94" 
y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-25"><text class="cls-19" transform="translate(272 108.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(157.62 123.99)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-26"><text class="cls-19" transform="translate(275 123.99)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(160.62 139.95)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-27"><text class="cls-19" transform="translate(272 139.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(162.06 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-28"><text class="cls-19" transform="translate(276.44 155.91)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(160.62 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-29"><text class="cls-19" transform="translate(275 171.87)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(160.62 187.83)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-30"><text class="cls-19" transform="translate(272 187.83)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(160.62 203.79)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-31"><text class="cls-19" transform="translate(253.04 
203.79)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(158.94 221.19)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST</tspan></text></g><g class="cls-32"><text class="cls-19" transform="translate(276.44 221.19)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT</tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(157.62 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-33"><text class="cls-19" transform="translate(272 238.95)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(157.62 255.03)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-34"><text class="cls-19" transform="translate(253.04 255.03)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(138.66 270.29)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-35"><text class="cls-19" transform="translate(275 270.29)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(138.66 
285.53)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-36"><text class="cls-19" transform="translate(272 285.53)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(139.98 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST</tspan></text></g><g class="cls-37"><text class="cls-19" transform="translate(253.04 300.77)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(160.62 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-38"><text class="cls-19" transform="translate(275 316.01)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(160.62 331.25)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-39"><text class="cls-19" transform="translate(272 331.25)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(160.62 346.49)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-40"><text class="cls-19" transform="translate(253.04 346.49)">Fli<tspan class="cls-13" 
x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(160.62 361.73)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-41"><text class="cls-19" transform="translate(276.68 361.73)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(157.62 376.97)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-42"><text class="cls-19" transform="translate(275 376.97)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(157.62 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-43"><text class="cls-19" transform="translate(272 392.21)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(157.62 407.45)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-44"><text class="cls-19" transform="translate(253.04 407.45)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(157.62 422.72)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-45"><text class="cls-19" transform="translate(276.68 422.72)">I<tspan class="cls-22" x="3.02" 
y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(138.66 437.96)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-46"><text class="cls-19" transform="translate(275 437.96)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(138.66 453.2)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-47"><text class="cls-19" transform="translate(272 453.2)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(138.66 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-48"><text class="cls-19" transform="translate(253.04 468.44)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(138.66 483.68)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-7" x="37.89" 
y="0">A</tspan><tspan class="cls-5" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-49"><text class="cls-19" transform="translate(276.68 483.68)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(162.3 498.92)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-50"><text class="cls-19" transform="translate(275 498.92)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">CT </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(162.3 514.16)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-51"><text class="cls-19" transform="translate(272 514.16)">A<tspan class="cls-12" x="6.94" y="0">D</tspan><tspan x="14.37" y="0">ST </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(162.3 529.4)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-52"><text class="cls-19" transform="translate(253.04 529.4)">Fli<tspan class="cls-13" x="11.02" y="0">p</tspan><tspan class="cls-9" x="17.16" y="0">p</tspan><tspan x="23.28" y="0">ed </tspan><tspan class="cls-16" x="37.89" y="0">A</tspan><tspan class="cls-17" x="44.86" y="0">D</tspan><tspan x="52.28" y="0">ST </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(162.3 544.88)">I<tspan class="cls-21" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-53"><text class="cls-19" transform="translate(276.68 544.88)">I<tspan class="cls-22" x="3.02" y="0">D</tspan><tspan x="10.42" y="0">T </tspan></text></g><g class="cls-54"><text class="cls-19" transform="translate(41.67 83.64)">1<tspan class="cls-55" x="6.08" y="0">D</tspan><tspan class="cls-5" x="13.54" y="0">D</tspan><tspan x="20.96" y="0">CT</tspan></text></g><g class="cls-56"><text 
class="cls-19" transform="translate(45.51 131.55)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">4</tspan></text></g><g class="cls-57"><text class="cls-19" transform="translate(45.51 235.59)"><tspan class="cls-5">D</tspan><tspan x="7.42" y="0">T</tspan><tspan class="cls-12" x="13.27" y="0">T</tspan><tspan x="19.16" y="0">9</tspan></text></g><g class="cls-58"><text class="cls-19" transform="translate(43.59 430.16)">ALL<tspan class="cls-5" x="17.03" y="0">1</tspan><tspan x="23.15" y="0">6</tspan></text></g><g class="cls-59"><line class="cls-60" x1="118.08" y1="1.98" x2="118.08" y2="31.14"/><rect x="118.02" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="226.82" y1="1.98" x2="226.82" y2="31.14"/><rect x="226.76" y="1.92" width="0.96" height="29.28"/><line class="cls-60" x1="118.08" y1="33.18" x2="118.08" y2="547.82"/><rect x="118.02" y="33.13" width="0.96" height="514.75"/><line class="cls-60" x1="226.82" y1="33.18" x2="226.82" y2="547.82"/><rect x="226.76" y="33.13" width="0.96" height="514.75"/><rect x="0.01" width="347.38" height="1.92"/><rect x="0.01" y="31.2" width="347.38" height="1.92"/><line class="cls-60" x1="0.07" y1="47.7" x2="347.33" y2="47.7"/><rect x="0.01" y="47.64" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="63.66" x2="347.33" y2="63.66"/><rect x="0.01" y="63.6" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="95.58" x2="347.33" y2="95.58"/><rect x="0.01" y="95.52" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="159.45" x2="347.33" y2="159.45"/><rect x="0.01" y="159.39" width="347.38" height="0.96"/><line class="cls-60" x1="0.07" y1="303.83" x2="347.33" y2="303.83"/><rect x="0.01" y="303.77" width="347.38" height="0.96"/><rect x="0.01" y="547.88" width="347.38" height="1.92"/></g></g></g></svg> \ No newline at end of file
diff --git a/third_party/aom/docs.cmake b/third_party/aom/docs.cmake
new file mode 100644
index 0000000000..0d7b4cfde3
--- /dev/null
+++ b/third_party/aom/docs.cmake
@@ -0,0 +1,345 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Include guard: skip the rest of this file when it has already been processed.
+if(AOM_DOCS_CMAKE_)
+ return()
+endif() # AOM_DOCS_CMAKE_
+set(AOM_DOCS_CMAKE_ 1)
+
+cmake_minimum_required(VERSION 3.5)
+
+# Path of the doxygen configuration file generated by
+# setup_documentation_targets().
+set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile")
+# Template (resolved relative to AOM_ROOT) copied into the generated doxyfile.
+set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
+# Directory receiving the generated per-example .dox pages.
+set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
+# Section names; written to the doxyfile as ENABLED_SECTIONS.
+set(AOM_DOXYGEN_SECTIONS "av1")
+
+# Inputs documented regardless of the encoder/decoder configuration; written to
+# the doxyfile as INPUT.
+set(AOM_DOXYGEN_SOURCES
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_external_partition.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/av1/common/av1_common_int.h"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/cdef.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/restoration.h"
+ "${AOM_ROOT}/keywords.dox"
+ "${AOM_ROOT}/mainpage.dox"
+ "${AOM_ROOT}/usage.dox")
+
+# Decoder builds: register the decoder examples (each source is paired, by
+# position, with one description), the decoder doxygen section and the decoder
+# API/guide inputs.
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/apps/aomdec.c"
+       "${AOM_ROOT}/examples/decode_to_md5.c"
+       "${AOM_ROOT}/examples/decode_with_drops.c"
+       "${AOM_ROOT}/examples/simple_decoder.c")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
+       "Full featured decoder."
+       "Frame by frame MD5 checksum."
+       "Drops frames while decoding."
+       "Simplified decoder loop.")
+
+  list(APPEND AOM_DOXYGEN_SECTIONS "av1_decoder decoder")
+
+  # Optional tools that only exist in decoder builds.
+  if(CONFIG_ANALYZER)
+    list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES "${AOM_ROOT}/examples/analyzer.cc")
+    list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS "Bitstream analyzer.")
+  endif()
+
+  if(CONFIG_INSPECTION)
+    list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES "${AOM_ROOT}/examples/inspect.c")
+    list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS "Bitstream inspector.")
+  endif()
+
+  list(APPEND AOM_DOXYGEN_SOURCES
+       "${AOM_ROOT}/aom/aomdx.h"
+       "${AOM_ROOT}/usage_dx.dox"
+       "${AOM_ROOT}/av1/decoder/decoder.h"
+       "${AOM_ROOT}/doc/dev_guide/av1_decoder.dox")
+endif()
+
+# Encoder builds: register the encoder examples (sources and descriptions are
+# paired by position), the encoder doxygen section and the encoder inputs.
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/apps/aomenc.c"
+       "${AOM_ROOT}/examples/lossless_encoder.c"
+       "${AOM_ROOT}/examples/set_maps.c"
+       "${AOM_ROOT}/examples/simple_encoder.c"
+       "${AOM_ROOT}/examples/twopass_encoder.c"
+       "${AOM_ROOT}/examples/scalable_encoder.c"
+       "${AOM_ROOT}/examples/svc_encoder_rtc.cc")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
+       "Full featured encoder."
+       "Simplified lossless encoder."
+       "Set active and ROI maps."
+       "Simplified encoder loop."
+       "Two-pass encoder loop."
+       "Scalable encoder loop."
+       "Layered encoder for RTC.")
+
+  list(APPEND AOM_DOXYGEN_SECTIONS "av1_encoder encoder")
+
+  # Public encoder API, usage page and developer guide.
+  list(APPEND AOM_DOXYGEN_SOURCES
+       "${AOM_ROOT}/aom/aomcx.h"
+       "${AOM_ROOT}/usage_cx.dox"
+       "${AOM_ROOT}/doc/dev_guide/av1_encoder.dox")
+
+  # Encoder internals that carry doxygen comments.
+  list(APPEND AOM_DOXYGEN_SOURCES
+       "${AOM_ROOT}/aom_scale/yv12config.h"
+       "${AOM_ROOT}/av1/encoder/bitstream.h"
+       "${AOM_ROOT}/av1/encoder/block.h"
+       "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+       "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+       "${AOM_ROOT}/av1/encoder/encode_strategy.h"
+       "${AOM_ROOT}/av1/encoder/encodeframe.c"
+       "${AOM_ROOT}/av1/encoder/encoder.c"
+       "${AOM_ROOT}/av1/encoder/encoder.h"
+       "${AOM_ROOT}/av1/encoder/encodetxb.h"
+       "${AOM_ROOT}/av1/encoder/firstpass.h"
+       "${AOM_ROOT}/av1/encoder/gop_structure.h"
+       "${AOM_ROOT}/av1/encoder/interp_search.c"
+       "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+       "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+       "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
+       "${AOM_ROOT}/av1/encoder/lookahead.h"
+       "${AOM_ROOT}/av1/encoder/palette.h"
+       "${AOM_ROOT}/av1/encoder/palette.c"
+       "${AOM_ROOT}/av1/encoder/partition_search.h"
+       "${AOM_ROOT}/av1/encoder/partition_search.c"
+       "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+       "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
+       "${AOM_ROOT}/av1/encoder/pickcdef.h"
+       "${AOM_ROOT}/av1/encoder/picklpf.h"
+       "${AOM_ROOT}/av1/encoder/pickrst.h"
+       "${AOM_ROOT}/av1/encoder/ratectrl.c"
+       "${AOM_ROOT}/av1/encoder/ratectrl.h"
+       "${AOM_ROOT}/av1/encoder/rc_utils.h"
+       "${AOM_ROOT}/av1/encoder/rdopt.h"
+       "${AOM_ROOT}/av1/encoder/rdopt.c"
+       "${AOM_ROOT}/av1/encoder/speed_features.h"
+       "${AOM_ROOT}/av1/encoder/svc_layercontext.c"
+       "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
+       "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+       "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+       "${AOM_ROOT}/av1/encoder/tpl_model.h"
+       "${AOM_ROOT}/av1/encoder/tx_search.h"
+       "${AOM_ROOT}/av1/encoder/txb_rdopt.h"
+       "${AOM_ROOT}/av1/encoder/var_based_part.h"
+       "${AOM_ROOT}/av1/encoder/nonrd_opt.h"
+       "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c")
+endif()
+
+# Remaining examples. The order of the appends is preserved because sources and
+# descriptions are matched by list position.
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/examples/aom_cx_set_ref.c")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS "Set encoder reference frame.")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/examples/lightfield_encoder.c")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS "Lightfield encoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c"
+       "${AOM_ROOT}/examples/lightfield_decoder.c")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
+       "Lightfield tile list decoder example."
+       "Lightfield decoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DOXYGEN_EXAMPLE_SOURCES
+       "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c")
+  list(APPEND AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS
+       "Lightfield bitstream parsing example.")
+endif()
+
+# Appends one "<var_name> += <items>" line to $AOM_DOXYFILE, where <items> are
+# the elements of the list named by $list_name joined on a single line. The
+# line is terminated with a newline.
+function(write_cmake_list_to_doxygen_config_var var_name list_name)
+  set(joined_items "")
+  foreach(entry ${${list_name}})
+    # Each entry is padded on both sides; the outer padding is stripped below.
+    string(APPEND joined_items " ${entry} ")
+  endforeach()
+  string(STRIP "${joined_items}" joined_items)
+  file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${joined_items}\n")
+endfunction()
+
+# Stores the file name of $file_path, without directory and without extension,
+# in the caller's variable named by $name_var.
+#
+# The path arguments are quoted so that a path containing spaces or semicolons
+# is passed to get_filename_component() as a single argument instead of being
+# split into several (which makes the command fail).
+function(get_name file_path name_var)
+  get_filename_component(file_basename "${file_path}" NAME)
+  get_filename_component(file_stem "${file_basename}" NAME_WE)
+  set(${name_var} "${file_stem}" PARENT_SCOPE)
+endfunction()
+
+# Generates the doxygen inputs and the `docs` build target:
+# - one <example>.dox page per entry of AOM_DOXYGEN_EXAMPLE_SOURCES,
+# - samples.dox, an index of those pages (aomdec/aomenc listed as utilities),
+# - the doxyfile at $AOM_DOXYFILE, built from the template plus appended
+#   settings,
+# - a custom target `docs` that runs doxygen on that doxyfile.
+# Reads DOXYGEN_VERSION_VALUE, DOXYGEN_DOT_FOUND and DOXYGEN_EXECUTABLE, which
+# are not set in this file.
+function(setup_documentation_targets)
+
+ # Sanity check: the lengths of these lists must match.
+ list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources)
+ list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs)
+ if(NOT ${num_sources} EQUAL ${num_descs})
+ message(FATAL_ERROR "Unequal example and description totals.")
+ endif()
+
+ # Take the list of examples and produce example_basename.dox for each file in
+ # the list.
+ file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}")
+ foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES})
+ unset(example_basename)
+ get_name("${example_file}" "example_name")
+ set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox")
+ set(dox_string "/*!\\page example_${example_name} ${example_name}\n")
+ set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n")
+ file(WRITE "${example_dox}" ${dox_string})
+ # The generated page becomes a doxygen input as well.
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}")
+ endforeach()
+
+ # Generate samples.dox, an index page that refers to the example_basename.dox
+ # files that were just created.
+ set(samples_header "
+/*!\\page samples Sample Code
+This SDK includes a number of sample applications. Each sample documents a
+feature of the SDK in both prose and the associated C code. The following
+samples are included:
+")
+
+ set(utils_desc "
+In addition, the SDK contains a number of utilities. Since these utilities are
+built upon the concepts described in the sample code listed above, they are not
+documented in pieces like the samples are. Their source is included here for
+reference. The following utilities are included:
+")
+
+ # Write the description for the samples section.
+ set(samples_dox "${AOM_CONFIG_DIR}/samples.dox")
+ file(WRITE "${samples_dox}" "${samples_header}\n")
+
+ # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and
+ # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by
+ # AV1's doxygen setup.
+ math(EXPR max_example_index "${num_sources} - 1")
+ foreach(NUM RANGE ${max_example_index})
+ list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name)
+ get_name("${ex_name}" "ex_name")
+
+ # AV1's doxygen lists aomdec and aomenc as utils apart from the examples.
+ # Save the indexes for another pass.
+ if("${ex_name}" MATCHES "aomdec\|aomenc")
+ set(util_indexes "${util_indexes}" "${NUM}")
+ continue()
+ endif()
+ list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc)
+ file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
+ endforeach()
+
+ # Write the description and index for the utils.
+ file(APPEND "${samples_dox}" "${utils_desc}\n")
+ foreach(util_index ${util_indexes})
+ list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name)
+ get_name("${ex_name}" "ex_name")
+ list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc)
+ file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
+ endforeach()
+ # Close the doxygen comment opened by $samples_header.
+ file(APPEND "${samples_dox}" "*/")
+
+ # Add $samples_dox to the doxygen inputs.
+ # Only the file name (no directory) is added to the input list.
+ get_filename_component(samples_dox ${samples_dox} NAME)
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox})
+
+ # There are issues to show Markdown file for old Doxygen version. Here, only
+ # enable Markdown support for 1.8.16 or newer.
+ if(${DOXYGEN_VERSION_VALUE} GREATER_EQUAL 1008016)
+ set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_md_support")
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/README.md")
+ # Uncomment and add AlgorithmDescription.md in result page when it is done.
+ # set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES}
+ # "${AOM_ROOT}/doc/AlgorithmDescription.md")
+ endif()
+
+ # Generate libaom's doxyfile.
+ # The file is rewritten from scratch: banner, template contents, then the
+ # settings appended below.
+ file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n")
+ file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
+ file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data})
+ file(APPEND "${AOM_DOXYFILE}"
+ "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
+ file(APPEND "${AOM_DOXYFILE}"
+ "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
+ file(APPEND "${AOM_DOXYFILE}"
+ "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
+ write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES")
+ write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
+ "AOM_DOXYGEN_SECTIONS")
+
+ # Add AOMedia logo.
+ set(aom_logo "aomedia_logo_200.png")
+ configure_file(${AOM_ROOT}/${aom_logo} ${AOM_CONFIG_DIR}/${aom_logo} COPYONLY)
+ file(APPEND "${AOM_DOXYFILE}"
+ "PROJECT_LOGO = ${AOM_CONFIG_DIR}/${aom_logo}\n")
+
+ # Only set HAVE_DOT to YES if dot tool is found.
+ if(DOXYGEN_DOT_FOUND)
+ file(APPEND "${AOM_DOXYFILE}" "HAVE_DOT = YES\n")
+ file(APPEND "${AOM_DOXYFILE}" "DOT_GRAPH_MAX_NODES = 10000\n")
+ endif()
+
+ # Add image path.
+ file(APPEND "${AOM_DOXYFILE}" "IMAGE_PATH += ${AOM_ROOT}/doc/dev_guide\n")
+
+ # Allow banner style comments
+ file(APPEND "${AOM_DOXYFILE}" "JAVADOC_BANNER = YES")
+
+ # Add the doxygen generation rule.
+ # `ALL` makes the docs target part of the default build.
+ add_custom_target(docs ALL
+ COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
+ DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+ ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
+ SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+ ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
+endfunction()
diff --git a/third_party/aom/examples/analyzer.cc b/third_party/aom/examples/analyzer.cc
new file mode 100644
index 0000000000..501f5024db
--- /dev/null
+++ b/third_party/aom/examples/analyzer.cc
@@ -0,0 +1,722 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <wx/wx.h>
+#include <wx/aboutdlg.h>
+#include <wx/cmdline.h>
+#include <wx/dcbuffer.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/decoder/accounting.h"
+#include "av1/decoder/inspection.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+// Integer helpers for rounded division. Arguments are evaluated more than
+// once, so they must be free of side effects.
+// OD_SIGNMASK(a): -1 when a is negative, 0 otherwise.
+#define OD_SIGNMASK(a) (-((a) < 0))
+// OD_FLIPSIGNI(a, b): -a when b is negative, a otherwise (add-then-xor with
+// the sign mask; assumes two's complement integers).
+#define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+// OD_DIV_ROUND(x, y): x / y rounded to nearest; half of y, carrying the sign
+// of x, is added before the truncating division.
+#define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y))
+
+// Bit flags selecting which image planes contribute to the rendered picture.
+enum {
+  OD_LUMA_MASK = 0x1,  // Y plane
+  OD_CB_MASK = 0x2,    // Cb plane
+  OD_CR_MASK = 0x4,    // Cr plane
+  OD_ALL_MASK = OD_CR_MASK | OD_CB_MASK | OD_LUMA_MASK
+};
+
+// Wraps an IVF reader and a libaom decoder context for the analyzer UI:
+// opens a stream, decodes it one frame at a time and exposes the decoded
+// image together with the display geometry (optionally including padding).
+class AV1Decoder {
+ private:
+  FILE *input;
+  wxString path;
+
+  AvxVideoReader *reader;
+  const AvxVideoInfo *info;
+  // Codec interface chosen from the stream's fourcc in open(). This member is
+  // initialized by the constructor and assigned in open(), so it has to be
+  // declared here; it sits between `info` and `show_padding` to match the
+  // constructor's initializer order.
+  aom_codec_iface_t *decoder;
+
+  insp_frame_data frame_data;
+
+  aom_codec_ctx_t codec;
+  bool show_padding;
+
+ public:
+  // Most recently decoded frame (NULL until step() succeeds) and the number
+  // of frames decoded so far.
+  aom_image_t *image;
+  int frame;
+
+  int plane_mask;
+
+  AV1Decoder();
+  ~AV1Decoder();
+
+  bool open(const wxString &path);
+  void close();
+  // Decodes the next frame; returns false at end of stream or on error.
+  bool step();
+
+  // Padding is reported per side; getWidth()/getHeight() include it twice.
+  int getWidthPadding() const;
+  int getHeightPadding() const;
+  void togglePadding();
+  int getWidth() const;
+  int getHeight() const;
+
+  bool getAccountingStruct(Accounting **acct);
+  bool setInspectionCallback();
+
+  // Inspection callback registered with the codec; `data` is the AV1Decoder.
+  static void inspect(void *decoder, void *data);
+};
+
+// Starts with no stream attached: null reader/info/decoder/image, padding
+// hidden and the frame counter at zero. `input` and `plane_mask` are not
+// initialized here.
+AV1Decoder::AV1Decoder()
+ : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL),
+ frame(0) {}
+
+// The destructor releases nothing itself.
+AV1Decoder::~AV1Decoder() {}
+
+// Flips whether the padding is included in the reported geometry.
+void AV1Decoder::togglePadding() { show_padding = !show_padding; }
+
+// Opens `path`, selects the decoder matching the stream's fourcc, initializes
+// the codec context and the inspection frame data, and registers the
+// inspection callback.
+//
+// Returns true on success. On failure an error is printed to stderr, the
+// reader opened so far is closed again and `reader`/`info` are reset, so a
+// failed open neither leaks the file nor leaves stale pointers behind.
+bool AV1Decoder::open(const wxString &path) {
+  reader = aom_video_reader_open(path.mb_str());
+  if (!reader) {
+    fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data());
+    return false;
+  }
+  this->path = path;
+  info = aom_video_reader_get_info(reader);
+  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) {
+    fprintf(stderr, "Unknown input codec.");
+    aom_video_reader_close(reader);
+    reader = NULL;
+    info = NULL;
+    return false;
+  }
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0)) {
+    fprintf(stderr, "Failed to initialize decoder.");
+    aom_video_reader_close(reader);
+    reader = NULL;
+    info = NULL;
+    return false;
+  }
+  ifd_init(&frame_data, info->frame_width, info->frame_height);
+  setInspectionCallback();
+  return true;
+}
+
+// Currently a no-op.
+// NOTE(review): the reader and codec context set up by open() are not
+// released here - confirm whether that is intentional.
+void AV1Decoder::close() {}
+
+// Reads the next frame from the container and decodes it. On success `image`
+// points at the decoded picture and `frame` is incremented. Returns false
+// when no frame could be read, decoding failed, or no picture was produced.
+bool AV1Decoder::step() {
+  if (!aom_video_reader_read_frame(reader)) return false;
+
+  size_t packet_size;
+  const unsigned char *packet =
+      aom_video_reader_get_frame(reader, &packet_size);
+  if (aom_codec_decode(&codec, packet, packet_size, NULL)) {
+    fprintf(stderr, "Failed to decode frame.");
+    return false;
+  }
+
+  aom_codec_iter_t iter = NULL;
+  image = aom_codec_get_frame(&codec, &iter);
+  if (image == NULL) return false;
+
+  frame++;
+  return true;
+}
+
+// Frame width plus the padding on both the left and the right side.
+int AV1Decoder::getWidth() const {
+  return 2 * getWidthPadding() + info->frame_width;
+}
+
+// Horizontal padding per side: zero while padding is hidden, otherwise the
+// distance from the frame width to the larger of (width + 16) and the width
+// rounded up to a multiple of 64.
+int AV1Decoder::getWidthPadding() const {
+  if (!show_padding) return 0;
+  return AOMMAX(info->frame_width + 16,
+                ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
+         info->frame_width;
+}
+
+// Frame height plus the padding above and below.
+int AV1Decoder::getHeight() const {
+  return 2 * getHeightPadding() + info->frame_height;
+}
+
+// Vertical padding per side: zero while padding is hidden, otherwise the
+// distance from the frame height to the larger of (height + 16) and the
+// height rounded up to a multiple of 64.
+int AV1Decoder::getHeightPadding() const {
+  if (!show_padding) return 0;
+  return AOMMAX(info->frame_height + 16,
+                ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
+         info->frame_height;
+}
+
+// Asks the codec for its accounting data via AV1_GET_ACCOUNTING and stores the
+// pointer in *accounting. Returns true when the control call succeeded.
+bool AV1Decoder::getAccountingStruct(Accounting **accounting) {
+  const bool ok = aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) ==
+                  AOM_CODEC_OK;
+  return ok;
+}
+
+// Registers AV1Decoder::inspect as the codec's inspection callback, with this
+// object as its context. Returns true when the control call succeeded.
+bool AV1Decoder::setInspectionCallback() {
+  aom_inspect_init init;
+  init.inspect_cb = AV1Decoder::inspect;
+  init.inspect_ctx = static_cast<void *>(this);
+  const bool ok =
+      aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &init) ==
+      AOM_CODEC_OK;
+  return ok;
+}
+
+// Inspection callback: `data` is the AV1Decoder registered in
+// setInspectionCallback(); the codec's `pbi` is forwarded to ifd_inspect() to
+// fill that decoder's frame_data.
+void AV1Decoder::inspect(void *pbi, void *data) {
+  AV1Decoder *self = static_cast<AV1Decoder *>(data);
+  ifd_inspect(&self->frame_data, pbi, 0);
+}
+
+// Inclusive bounds of the integer zoom factor accepted by setZoom().
+#define MIN_ZOOM (1)
+#define MAX_ZOOM (4)
+
+// Panel that owns the decoder and the RGB pixel buffer it is rendered into,
+// and paints that buffer on screen.
+class AnalyzerPanel : public wxPanel {
+ DECLARE_EVENT_TABLE()
+
+ private:
+ AV1Decoder decoder;
+ const wxString path;
+
+ // Integer magnification and the RGB buffer (3 bytes per displayed pixel)
+ // allocated by updateDisplaySize().
+ int zoom;
+ unsigned char *pixels;
+
+ // When set, per-frame symbol statistics are printed; bpp_q3 is the buffer
+ // allocated for it in open().
+ const bool bit_accounting;
+ double *bpp_q3;
+
+ // Combination of OD_*_MASK flags selecting the planes to show.
+ int plane_mask;
+
+ // The display size is the decode size, scaled by the zoom.
+ int getDisplayWidth() const;
+ int getDisplayHeight() const;
+
+ bool updateDisplaySize();
+
+ void computeBitsPerPixel();
+
+ public:
+ AnalyzerPanel(wxWindow *parent, const wxString &path,
+ const bool bit_accounting);
+ ~AnalyzerPanel();
+
+ bool open(const wxString &path);
+ void close();
+ void render();
+ void togglePadding();
+ bool nextFrame();
+ void refresh();
+
+ int getZoom() const;
+ bool setZoom(int zoom);
+
+ void setShowPlane(bool show_plane, int mask);
+
+ void onPaint(wxPaintEvent &event); // NOLINT
+};
+
+// Paint events of the panel are handled by AnalyzerPanel::onPaint.
+BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel)
+EVT_PAINT(AnalyzerPanel::onPaint)
+END_EVENT_TABLE()
+
+// Creates an empty panel: no buffers allocated, zoom 0 (so the first
+// setZoom(MIN_ZOOM) takes effect) and all planes visible.
+AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path,
+ const bool bit_accounting)
+ : wxPanel(parent), path(path), zoom(0), pixels(NULL),
+ bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {}
+
+// Frees the buffers through close().
+AnalyzerPanel::~AnalyzerPanel() { close(); }
+
+// Sets (show_plane == true) or clears the bits of `mask` in plane_mask.
+void AnalyzerPanel::setShowPlane(bool show_plane, int mask) {
+  plane_mask = show_plane ? (plane_mask | mask) : (plane_mask & ~mask);
+}
+
+// Converts the current decoded image (decoder.image) to 8-bit RGB and writes
+// it into `pixels`, replicating every decoded pixel into a zoom x zoom square.
+// Planes that are disabled in plane_mask are replaced by neutral values.
+// Both 8-bit and high-bitdepth (16-bit sample) images are handled.
+// NOTE(review): assumes decoder.image is non-NULL and `pixels` is sized for
+// the current display dimensions - confirm against the callers.
+void AnalyzerPanel::render() {
+ aom_image_t *img = decoder.image;
+ const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+ // For high-bitdepth images the strides are halved because the row pointers
+ // below are advanced in uint16_t units (presumably img->stride is in bytes).
+ int y_stride = img->stride[0] >> hbd;
+ int cb_stride = img->stride[1] >> hbd;
+ int cr_stride = img->stride[2] >> hbd;
+ int p_stride = 3 * getDisplayWidth();
+ unsigned char *y_row = img->planes[0];
+ unsigned char *cb_row = img->planes[1];
+ unsigned char *cr_row = img->planes[2];
+ uint16_t *y_row16 = reinterpret_cast<uint16_t *>(y_row);
+ uint16_t *cb_row16 = reinterpret_cast<uint16_t *>(cb_row);
+ uint16_t *cr_row16 = reinterpret_cast<uint16_t *>(cr_row);
+ unsigned char *p_row = pixels;
+ // Chroma padding is half of the luma padding.
+ int y_width_padding = decoder.getWidthPadding();
+ int cb_width_padding = y_width_padding >> 1;
+ int cr_width_padding = y_width_padding >> 1;
+ int y_height_padding = decoder.getHeightPadding();
+ int cb_height_padding = y_height_padding >> 1;
+ int cr_height_padding = y_height_padding >> 1;
+ for (int j = 0; j < decoder.getHeight(); j++) {
+ // Start each row `padding` rows above the plane origin; the column padding
+ // is subtracted at the point of the read below.
+ unsigned char *y = y_row - y_stride * y_height_padding;
+ unsigned char *cb = cb_row - cb_stride * cb_height_padding;
+ unsigned char *cr = cr_row - cr_stride * cr_height_padding;
+ uint16_t *y16 = y_row16 - y_stride * y_height_padding;
+ uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding;
+ uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding;
+ unsigned char *p = p_row;
+ for (int i = 0; i < decoder.getWidth(); i++) {
+ int64_t yval;
+ int64_t cbval;
+ int64_t crval;
+ int pmask;
+ unsigned rval;
+ unsigned gval;
+ unsigned bval;
+ if (hbd) {
+ yval = *(y16 - y_width_padding);
+ cbval = *(cb16 - cb_width_padding);
+ crval = *(cr16 - cr_width_padding);
+ } else {
+ yval = *(y - y_width_padding);
+ cbval = *(cb - cb_width_padding);
+ crval = *(cr - cr_width_padding);
+ }
+ pmask = plane_mask;
+ // Luma hidden: use a constant mid value instead of the sample.
+ if (pmask & OD_LUMA_MASK) {
+ yval -= 16;
+ } else {
+ yval = 128;
+ }
+ // The shifted mask bit is 0 or 1, so a hidden chroma plane contributes 0.
+ cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128);
+ crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128);
+ /*This is intentionally slow and very accurate.*/
+ // Fixed-point YCbCr -> RGB; results are clamped to [0, 65535] and the top
+ // 8 bits are stored below.
+ rval = OD_CLAMPI(
+ 0,
+ (int32_t)OD_DIV_ROUND(
+ 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL),
+ 65535);
+ gval = OD_CLAMPI(0,
+ (int32_t)OD_DIV_ROUND(2916394880000LL * yval -
+ 534117096223LL * cbval -
+ 1334761232047LL * crval,
+ 9745792000LL),
+ 65535);
+ bval = OD_CLAMPI(
+ 0,
+ (int32_t)OD_DIV_ROUND(
+ 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL),
+ 65535);
+ // Replicate the pixel into a zoom x zoom block of the output buffer.
+ unsigned char *px_row = p;
+ for (int v = 0; v < zoom; v++) {
+ unsigned char *px = px_row;
+ for (int u = 0; u < zoom; u++) {
+ *(px + 0) = (unsigned char)(rval >> 8);
+ *(px + 1) = (unsigned char)(gval >> 8);
+ *(px + 2) = (unsigned char)(bval >> 8);
+ px += 3;
+ }
+ px_row += p_stride;
+ }
+ // dc is the chroma column step: always 1 when x_chroma_shift is 0;
+ // otherwise 1 only when the current luma column offset is odd.
+ if (hbd) {
+ int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift);
+ y16++;
+ cb16 += dc;
+ cr16 += dc;
+ } else {
+ int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift);
+ y++;
+ cb += dc;
+ cr += dc;
+ }
+ p += zoom * 3;
+ }
+ // dc is an all-ones or all-zero mask: the chroma rows advance on every row
+ // when y_chroma_shift is 0, otherwise only after odd rows.
+ int dc = -((j & 1) | (1 - img->y_chroma_shift));
+ if (hbd) {
+ y_row16 += y_stride;
+ cb_row16 += dc & cb_stride;
+ cr_row16 += dc & cr_stride;
+ } else {
+ y_row += y_stride;
+ cb_row += dc & cb_stride;
+ cr_row += dc & cr_stride;
+ }
+ p_row += zoom * p_stride;
+ }
+}
+
+// Clears the bpp_q3 buffer and prints, for the frame just decoded, the total
+// bits and bits per symbol of every symbol type recorded by the decoder's
+// accounting data.
+//
+// Nothing is printed when the accounting data cannot be retrieved; the
+// pointer is checked before use instead of being dereferenced
+// unconditionally.
+void AnalyzerPanel::computeBitsPerPixel() {
+  Accounting *acct = NULL;
+  int totals_q3[MAX_SYMBOL_TYPES] = { 0 };
+  int sym_count[MAX_SYMBOL_TYPES] = { 0 };
+  if (!decoder.getAccountingStruct(&acct) || acct == NULL) {
+    fprintf(stderr, "Failed to retrieve bit accounting data.\n");
+    return;
+  }
+  for (int j = 0; j < decoder.getHeight(); j++) {
+    for (int i = 0; i < decoder.getWidth(); i++) {
+      bpp_q3[j * decoder.getWidth() + i] = 0.0;
+    }
+  }
+  // Accumulate bits (in 1/8 bit units, hence the division by 8 below) and
+  // sample counts per symbol id.
+  for (int i = 0; i < acct->syms.num_syms; i++) {
+    AccountingSymbol *s;
+    s = &acct->syms.syms[i];
+    totals_q3[s->id] += s->bits;
+    sym_count[s->id] += s->samples;
+  }
+  printf("=== Frame: %-3i ===\n", decoder.frame - 1);
+  for (int i = 0; i < acct->syms.dictionary.num_strs; i++) {
+    if (totals_q3[i]) {
+      printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i],
+             (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]);
+    }
+  }
+  printf("\n");
+}
+
+// Shows or hides the padding and resizes the buffers to the new geometry.
+//
+// Toggling the padding changes decoder.getWidth()/getHeight(). The bit
+// accounting buffer is indexed with those dimensions, so it is resized here
+// as well; otherwise the next frame would write past the allocation made in
+// open(). If a buffer cannot be resized, the toggle is undone so that the
+// geometry keeps matching the buffers that exist.
+void AnalyzerPanel::togglePadding() {
+  decoder.togglePadding();
+  if (bit_accounting) {
+    double *resized = (double *)realloc(
+        bpp_q3, sizeof(*bpp_q3) * decoder.getWidth() * decoder.getHeight());
+    if (resized == NULL) {
+      decoder.togglePadding();
+      return;
+    }
+    bpp_q3 = resized;
+  }
+  if (!updateDisplaySize()) {
+    // `pixels` still has the old size; a bpp_q3 buffer that was just grown is
+    // merely larger than needed.
+    decoder.togglePadding();
+  }
+}
+
+// Decodes the next frame and re-renders it. Returns false when no further
+// frame could be decoded.
+bool AnalyzerPanel::nextFrame() {
+  if (!decoder.step()) return false;
+  refresh();
+  return true;
+}
+
+// Prints the bit accounting statistics (when enabled) and renders the current
+// frame into the pixel buffer.
+void AnalyzerPanel::refresh() {
+  if (bit_accounting) computeBitsPerPixel();
+  render();
+}
+
+// Width of the decoded picture (including padding) scaled by the zoom.
+int AnalyzerPanel::getDisplayWidth() const {
+  return decoder.getWidth() * zoom;
+}
+
+// Height of the decoded picture (including padding) scaled by the zoom.
+int AnalyzerPanel::getDisplayHeight() const {
+  return decoder.getHeight() * zoom;
+}
+
+// Allocates a fresh RGB buffer for the current display size (3 bytes per
+// pixel), swaps it in and resizes the panel. The old buffer is kept and false
+// is returned when the allocation fails.
+bool AnalyzerPanel::updateDisplaySize() {
+  unsigned char *fresh = (unsigned char *)malloc(
+      sizeof(*fresh) * 3 * getDisplayWidth() * getDisplayHeight());
+  if (fresh == NULL) return false;
+
+  free(pixels);
+  pixels = fresh;
+  SetSize(getDisplayWidth(), getDisplayHeight());
+  return true;
+}
+
+// Opens the stream, allocates the display buffer (at minimum zoom) and, when
+// bit accounting is enabled, the accounting buffer, then decodes and renders
+// the first frame. Returns false on any failure.
+bool AnalyzerPanel::open(const wxString &path) {
+  if (!decoder.open(path)) return false;
+  if (!setZoom(MIN_ZOOM)) return false;
+
+  if (bit_accounting) {
+    bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() *
+                              decoder.getHeight());
+    if (!bpp_q3) {
+      fprintf(stderr, "Could not allocate memory for bit accounting\n");
+      close();
+      return false;
+    }
+  }
+
+  const bool have_frame = nextFrame();
+  if (!have_frame) {
+    close();
+    return false;
+  }
+  SetFocus();
+  return true;
+}
+
+// Closes the decoder and releases both buffers; the pointers are reset so the
+// call can be repeated safely.
+void AnalyzerPanel::close() {
+  decoder.close();
+
+  free(pixels);
+  free(bpp_q3);
+  pixels = NULL;
+  bpp_q3 = NULL;
+}
+
+// Current integer zoom factor.
+int AnalyzerPanel::getZoom() const { return zoom; }
+
+// Changes the zoom to `z` and reallocates the display buffer. Returns false,
+// leaving the zoom unchanged, when `z` is outside [MIN_ZOOM, MAX_ZOOM], equals
+// the current zoom, or the buffer cannot be allocated.
+bool AnalyzerPanel::setZoom(int z) {
+  if (z > MAX_ZOOM || z < MIN_ZOOM || z == zoom) return false;
+
+  const int previous_zoom = zoom;
+  zoom = z;
+  if (updateDisplaySize()) return true;
+
+  zoom = previous_zoom;
+  return false;
+}
+
+// Paint handler: wraps the pixel buffer in a bitmap and lets a buffered paint
+// DC draw it onto the panel.
+void AnalyzerPanel::onPaint(wxPaintEvent &) {
+  // `true` = static data: the image does not take ownership of `pixels`.
+  wxBitmap frame_bitmap(
+      wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true));
+  wxBufferedPaintDC paint_dc(this, frame_bitmap);
+}
+
+// Top-level window: owns the menus and forwards menu commands to the
+// AnalyzerPanel created by open().
+class AnalyzerFrame : public wxFrame {
+ DECLARE_EVENT_TABLE()
+
+ private:
+ // NULL until open() is called.
+ AnalyzerPanel *panel;
+ const bool bit_accounting;
+
+ wxMenu *fileMenu;
+ wxMenu *viewMenu;
+ wxMenu *playbackMenu;
+
+ public:
+ AnalyzerFrame(const bool bit_accounting); // NOLINT
+
+ // File menu handlers.
+ void onOpen(wxCommandEvent &event); // NOLINT
+ void onClose(wxCommandEvent &event); // NOLINT
+ void onQuit(wxCommandEvent &event); // NOLINT
+
+ // View menu handlers.
+ void onTogglePadding(wxCommandEvent &event); // NOLINT
+ void onZoomIn(wxCommandEvent &event); // NOLINT
+ void onZoomOut(wxCommandEvent &event); // NOLINT
+ void onActualSize(wxCommandEvent &event); // NOLINT
+
+ void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT
+ void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT
+
+ // Playback menu handlers.
+ void onNextFrame(wxCommandEvent &event); // NOLINT
+ void onGotoFrame(wxCommandEvent &event); // NOLINT
+ void onRestart(wxCommandEvent &event); // NOLINT
+
+ void onAbout(wxCommandEvent &event); // NOLINT
+
+ bool open(const wxString &path);
+ bool setZoom(int zoom);
+ void updateViewMenu();
+};
+
+// Application-defined menu command identifiers, numbered consecutively from
+// 6000.
+enum {
+ wxID_NEXT_FRAME = 6000,
+ wxID_SHOW_Y,
+ wxID_SHOW_U,
+ wxID_SHOW_V,
+ wxID_GOTO_FRAME,
+ wxID_RESTART,
+ wxID_ACTUAL_SIZE,
+ wxID_PADDING
+};
+
+// Maps each menu command identifier to its AnalyzerFrame handler. The three
+// plane check items share one handler.
+BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame)
+EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen)
+EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose)
+EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit)
+EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding)
+EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn)
+EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut)
+EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize)
+EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame)
+EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame)
+EVT_MENU(wxID_RESTART, AnalyzerFrame::onRestart)
+EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout)
+END_EVENT_TABLE()
+
+// Builds the main window: File, View, Playback and Help menus, the zoom
+// keyboard accelerators and a single-field status bar. No panel exists until
+// a file is opened.
+AnalyzerFrame::AnalyzerFrame(const bool bit_accounting)
+    : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition,
+              wxDefaultSize, wxDEFAULT_FRAME_STYLE),
+      panel(NULL), bit_accounting(bit_accounting) {
+  wxMenuBar *mb = new wxMenuBar();
+
+  fileMenu = new wxMenu();
+  fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file"));
+  fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file"));
+  fileMenu->Enable(wxID_CLOSE, false);
+  fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program"));
+  mb->Append(fileMenu, _("&File"));
+
+  // Extra keyboard shortcuts for zooming, in addition to the menu entries.
+  wxAcceleratorEntry entries[2];
+  entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN);
+  entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT);
+  wxAcceleratorTable accel(2, entries);
+  this->SetAcceleratorTable(accel);
+
+  viewMenu = new wxMenu();
+  // A stray unary '+' in front of this call (a no-op on the returned pointer)
+  // has been removed.
+  viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"),
+                   _("Show padding"));
+  viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image size"));
+  viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size"));
+  viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"),
+                   _("Actual size of the frame"));
+  viewMenu->AppendSeparator();
+  viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"),
+                            _("Show Y plane"));
+  viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"),
+                            _("Show U plane"));
+  viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"),
+                            _("Show V plane"));
+  mb->Append(viewMenu, _("&View"));
+
+  playbackMenu = new wxMenu();
+  playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."),
+                       _("Go to next frame"));
+  /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"),
+                         _("Set video to frame 0"));
+    playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"),
+                         _("Go to frame number"));*/
+  mb->Append(playbackMenu, _("&Playback"));
+
+  wxMenu *helpMenu = new wxMenu();
+  helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog"));
+  mb->Append(helpMenu, _("&Help"));
+
+  SetMenuBar(mb);
+
+  CreateStatusBar(1);
+}
+
+// File > Open: lets the user pick an existing .ivf file and opens it unless
+// the dialog is cancelled.
+void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) {
+  wxFileDialog file_dialog(this, _("Open file"), wxEmptyString, wxEmptyString,
+                           _("AV1 files (*.ivf)|*.ivf"),
+                           wxFD_OPEN | wxFD_FILE_MUST_EXIST);
+  if (file_dialog.ShowModal() == wxID_CANCEL) return;
+  open(file_dialog.GetPath());
+}
+
+// File > Close: not implemented (the menu item is disabled in the
+// constructor).
+void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {}
+
+// File > Exit: closes the frame, passing force = true.
+void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); }
+
+// View > Toggle padding: shows or hides the padding, then resizes and redraws
+// the window. Does nothing while no file is open (panel is NULL until open()
+// has been called), instead of dereferencing a null pointer.
+void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) {
+  if (panel == NULL) return;
+  panel->togglePadding();
+  SetClientSize(panel->GetSize());
+  panel->render();
+  panel->Refresh();
+}
+
+// View > Zoom-In: one zoom step up. Ignored while no file is open, because
+// the panel does not exist yet.
+void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) {
+  if (panel == NULL) return;
+  setZoom(panel->getZoom() + 1);
+}
+
+// View > Zoom-Out: one zoom step down. Ignored while no file is open.
+void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) {
+  if (panel == NULL) return;
+  setZoom(panel->getZoom() - 1);
+}
+
+// View > Actual size: back to the minimum zoom. Ignored while no file is
+// open.
+void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) {
+  if (panel == NULL) return;
+  setZoom(MIN_ZOOM);
+}
+
+// Stores the check state carried by the event in the menu item that sent it,
+// then applies the plane selection.
+void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) {  // NOLINT
+  wxMenuBar *menu_bar = GetMenuBar();
+  menu_bar->Check(event.GetId(), event.IsChecked());
+  updateViewMenu();
+}
+
+// Handler for the Y/U/V check items. For any other sender all three planes
+// are re-checked first; afterwards the event is processed like a normal
+// toggle.
+void AnalyzerFrame::onResetAndToggleViewMenuCheckBox(
+    wxCommandEvent &event) {  // NOLINT
+  const int sender = event.GetId();
+  const bool is_plane_item = sender == wxID_SHOW_Y || sender == wxID_SHOW_U ||
+                             sender == wxID_SHOW_V;
+  if (!is_plane_item) {
+    GetMenuBar()->Check(wxID_SHOW_Y, true);
+    GetMenuBar()->Check(wxID_SHOW_U, true);
+    GetMenuBar()->Check(wxID_SHOW_V, true);
+  }
+  onToggleViewMenuCheckBox(event);
+}
+
+// Playback > Next frame: decodes the next frame and repaints without erasing
+// the background. Ignored while no file is open, because the panel does not
+// exist yet.
+void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) {
+  if (panel == NULL) return;
+  panel->nextFrame();
+  panel->Refresh(false);
+}
+
+// Not implemented; the matching menu entry is commented out in the
+// constructor.
+void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {}
+
+// Not implemented; the matching menu entry is commented out in the
+// constructor.
+void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {}
+
+// Help > About: shows the standard about box with the program name, version,
+// description and copyright.
+void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) {
+  wxAboutDialogInfo about;
+  about.SetName(_("AV1 Bitstream Analyzer"));
+  about.SetVersion(_("0.1-beta"));
+  about.SetDescription(
+      _("This program implements a bitstream analyzer for AV1"));
+  about.SetCopyright(
+      wxT("(C) 2017 Alliance for Open Media <negge@mozilla.com>"));
+  wxAboutBox(about);
+}
+
+// Creates the analyzer panel for `path` and sizes the frame to fit it.
+// Returns false when the file cannot be opened; in that case the panel is
+// destroyed and `panel` is reset to NULL so that no dangling pointer is left
+// behind for later menu commands.
+bool AnalyzerFrame::open(const wxString &path) {
+  panel = new AnalyzerPanel(this, path, bit_accounting);
+  if (panel->open(path)) {
+    SetClientSize(panel->GetSize());
+    return true;
+  }
+  delete panel;
+  panel = NULL;
+  return false;
+}
+
+// Applies a new zoom factor: updates the panel, enables or disables the zoom
+// menu items according to the limits, then resizes and redraws the window.
+// Returns false when the zoom did not change, which includes the case where
+// no file is open (panel is NULL until open() has been called).
+bool AnalyzerFrame::setZoom(int zoom) {
+  if (panel == NULL) return false;
+  if (panel->setZoom(zoom)) {
+    GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM);
+    GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM);
+    GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM);
+    SetClientSize(panel->GetSize());
+    panel->render();
+    panel->Refresh();
+    return true;
+  }
+  return false;
+}
+
+// Copies the state of the Y/U/V check items into the panel's plane mask and
+// redraws. Does nothing while no file is open, because the panel does not
+// exist yet.
+void AnalyzerFrame::updateViewMenu() {
+  if (panel == NULL) return;
+  panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK);
+  panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK);
+  panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK);
+  SetClientSize(panel->GetSize());
+  panel->render();
+  panel->Refresh(false);
+}
+
+// Application object: parses the command line and creates the main frame.
+class Analyzer : public wxApp {
+ private:
+ AnalyzerFrame *frame;
+
+ public:
+ void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT
+ bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT
+};
+
+// Command line description: -h/--help, -a/--bit-accounting and one optional
+// positional parameter (the input .ivf file). The table ends with a
+// wxCMD_LINE_NONE entry.
+static const wxCmdLineEntryDesc CMD_LINE_DESC[] = {
+ { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."),
+ wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP },
+ { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"),
+ wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL },
+ { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING,
+ wxCMD_LINE_PARAM_OPTIONAL },
+ { wxCMD_LINE_NONE }
+};
+
+// Installs the option table above and restricts the switch prefix to '-'.
+void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT
+ parser.SetDesc(CMD_LINE_DESC);
+ parser.SetSwitchChars(_("-"));
+}
+
+// Creates and shows the main frame once the command line has been parsed, and
+// opens the input file when one was given. Returns false when bit accounting
+// is requested but not compiled in, or when the given file cannot be opened.
+bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) {  // NOLINT
+  const bool bit_accounting = parser.Found(_("a"));
+  if (bit_accounting && !CONFIG_ACCOUNTING) {
+    fprintf(stderr,
+            "Bit accounting support not found. "
+            "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n");
+    return false;
+  }
+
+  frame = new AnalyzerFrame(bit_accounting);
+  frame->Show();
+  if (parser.GetParamCount() == 0) return true;
+  return frame->open(parser.GetParam(0));
+}
+
+// Prints a placeholder message to stderr and terminates the process with a
+// failure status.
+void usage_exit(void) {
+  fputs("uhh\n", stderr);
+  exit(EXIT_FAILURE);
+}
+
+// Declares Analyzer as the wxWidgets application class for this program.
+IMPLEMENT_APP(Analyzer)
diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c
new file mode 100644
index 0000000000..b7fb7bce45
--- /dev/null
+++ b/third_party/aom/examples/aom_cx_set_ref.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// AV1 Set Reference Frame
+// ============================
+//
+// This is an example demonstrating how to overwrite the AV1 encoder's
+// internal reference frame. In the sample we set the last frame to the
+// current frame. This technique could be used to bounce between two cameras.
+//
+// The decoder would also have to set the reference frame to the same value
+// on the same frame, or the video will become corrupt. The 'test_decode'
+// variable is set to 1 in this example that tests if the encoder and decoder
+// results are matching.
+//
+// Usage
+// -----
+// This example encodes a raw video. And the last argument passed in specifies
+// the frame number to update the reference frame on. For example, run
+// examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30
+// Here 4 is the frame number on which the reference frame is updated and 30
+// is the optional limit on the number of frames to encode.
+//
+// Extra Variables
+// ---------------
+// This example maintains the frame number passed on the command line
+// in the `update_frame_num` variable.
+//
+//
+// Configuration
+// -------------
+//
+// The reference frame is updated on the frame specified on the command
+// line.
+//
+// Observing The Effects
+// ---------------------
+// The encoder and decoder results should be matching when the same reference
+// frame setting operation is done in both encoder and decoder. Otherwise,
+// the encoder/decoder mismatch would be seen.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+#include "examples/encoder_util.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {  // prints the expected command line to stderr and exits with failure
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile> <outfile> "
+ "<frame> <limit(optional)>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+// Replaces the high-bitdepth image |img| with a newly allocated 8-bit copy of
+// its contents. Dies if the 8-bit image cannot be allocated, instead of
+// truncating into an uninitialized image descriptor.
+static void replace_with_8bit_copy(aom_image_t *img) {
+  aom_image_t img_8bit;
+  if (!aom_img_alloc(&img_8bit, img->fmt - AOM_IMG_FMT_HIGHBITDEPTH, img->d_w,
+                     img->d_h, 16)) {
+    die("Failed to allocate image.");
+  }
+  aom_img_truncate_16_to_8(&img_8bit, img);
+  *img = img_8bit;
+}
+
+// Compares the newest frame held by the encoder with the newest frame held by
+// the decoder. On the first difference the location and values of the
+// mismatch in each plane are printed and |*mismatch_seen| is set to 1. Does
+// nothing once a mismatch has already been recorded.
+//
+//   encoder, decoder  codec contexts to query via AV1_GET_NEW_FRAME_IMAGE
+//   frame_out         frame number used in the mismatch message
+//   mismatch_seen     in/out flag, set to 1 when the images differ
+static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
+                           unsigned int frame_out, int *mismatch_seen) {
+  aom_image_t enc_img, dec_img;
+
+  if (*mismatch_seen) return;
+
+  /* Get the internal reference frame */
+  if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img))
+    die_codec(encoder, "Failed to get encoder reference frame");
+  if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img))
+    die_codec(decoder, "Failed to get decoder reference frame");
+
+  // When only one side is high bitdepth, bring that side down to 8 bits so
+  // both images can be compared sample by sample.
+  if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+      (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      replace_with_8bit_copy(&enc_img);
+    }
+    if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      replace_with_8bit_copy(&dec_img);
+    }
+  }
+
+  if (!aom_compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+    if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+      aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+
+    printf(
+        "Encode/decode mismatch on frame %u at"
+        " Y[%d, %d] {%d/%d},"
+        " U[%d, %d] {%d/%d},"
+        " V[%d, %d] {%d/%d}",
+        frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1],
+        v[2], v[3]);
+    *mismatch_seen = 1;
+  }
+
+  aom_img_free(&enc_img);
+  aom_img_free(&dec_img);
+}
+
+// Submits |img| (or NULL to flush) to the encoder and drains every packet it
+// produces. Each compressed frame is written to |writer|, counted in
+// |*frame_out| and, when |test_decode| is set, fed to the decoder; the first
+// decoded frame is copied into |ext_ref| when that pointer is non-NULL.
+// After draining, the encoder and decoder frames are compared if any frame
+// was produced. Returns 1 if the encoder emitted at least one packet.
+static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
+                        unsigned int frame_in, AvxVideoWriter *writer,
+                        int test_decode, aom_codec_ctx_t *dcodec,
+                        unsigned int *frame_out, int *mismatch_seen,
+                        aom_image_t *ext_ref) {
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  int saw_packet = 0;
+  int saw_frame = 0;
+
+  if (aom_codec_encode(ecodec, img, frame_in, 1, 0) != AOM_CODEC_OK)
+    die_codec(ecodec, "Failed to encode frame");
+
+  while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) {
+    saw_packet = 1;
+
+    // Only compressed-frame packets are written out and decoded.
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+    ++*frame_out;
+
+    if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                      pkt->data.frame.sz,
+                                      pkt->data.frame.pts)) {
+      die_codec(ecodec, "Failed to write compressed frame");
+    }
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+    saw_frame = 1;
+
+    if (!test_decode) continue;
+
+    // Decode 1 frame.
+    if (aom_codec_decode(dcodec, pkt->data.frame.buf,
+                         (unsigned int)pkt->data.frame.sz, NULL))
+      die_codec(dcodec, "Failed to decode frame.");
+
+    // Copy out first decoded frame, and use it as reference later.
+    if (*frame_out == 1 && ext_ref != NULL) {
+      if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref))
+        die_codec(dcodec, "Failed to get decoder new frame");
+    }
+  }
+
+  // Mismatch checking
+  if (saw_frame && test_decode) {
+    testing_decode(ecodec, dcodec, *frame_out, mismatch_seen);
+  }
+
+  return saw_packet;
+}
+
+// Encodes a raw 8-bit I420 file to IVF while decoding every produced frame.
+// Just before frame <frame> is encoded, reference index 0 is overwritten in
+// both encoder and decoder with the first decoded frame, and the encoder and
+// decoder outputs are compared after each frame.
+//
+// Arguments: <codec> <width> <height> <infile> <outfile> <frame> [limit]
+// Returns EXIT_SUCCESS; every error path terminates through die()/die_codec().
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  // Encoder
+  aom_codec_ctx_t ecodec;
+  aom_codec_enc_cfg_t cfg;
+  unsigned int frame_in = 0;
+  aom_image_t raw;
+  aom_image_t raw_shift;
+  aom_image_t ext_ref;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  AvxVideoWriter *writer = NULL;
+  int flags = 0;
+  int allocated_raw_shift = 0;
+  aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420;
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+
+  // Test encoder/decoder mismatch.
+  int test_decode = 1;
+  // Decoder
+  aom_codec_ctx_t dcodec;
+  unsigned int frame_out = 0;
+
+  // The frame number to set reference frame on
+  unsigned int update_frame_num = 0;
+  int mismatch_seen = 0;
+
+  const int fps = 30;
+  const int bitrate = 500;
+
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
+  const char *update_frame_num_arg = NULL;
+  unsigned int limit = 0;
+  exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&ecodec, 0, sizeof(ecodec));
+  memset(&cfg, 0, sizeof(cfg));
+  memset(&info, 0, sizeof(info));
+
+  if (argc < 7) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+  update_frame_num_arg = argv[6];
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
+  // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+  // allocated while calling aom_codec_encode(), thus, setting reference for
+  // 1st frame isn't supported.
+  if (update_frame_num <= 1) {
+    die("Couldn't parse frame number '%s'\n", update_frame_num_arg);
+  }
+
+  if (argc > 7) {
+    limit = (unsigned int)strtoul(argv[7], NULL, 0);
+    if (update_frame_num > limit)
+      die("Update frame number couldn't larger than limit\n");
+  }
+
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  // In this test, the bit depth of input video is 8-bit, and the input format
+  // is AOM_IMG_FMT_I420.
+  if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) {
+    die("Failed to allocate image.");
+  }
+
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
+                                 info.frame_height, 32, 8,
+                                 AOM_DEC_BORDER_IN_PIXELS)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+#if CONFIG_REALTIME_ONLY
+  res = aom_codec_enc_config_default(encoder, &cfg, 1);
+#else
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+#endif
+  if (res) die_codec(&ecodec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_lag_in_frames = 3;
+  cfg.g_bit_depth = AOM_BITS_8;
+
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (aom_codec_enc_init(&ecodec, encoder, &cfg, flags))
+    die("Failed to initialize encoder");
+
+  // Disable alt_ref.
+  if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+  if (test_decode) {
+    aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(codec_arg);
+    // Report a missing decoder explicitly rather than passing a NULL
+    // interface on to the initializer.
+    if (!decoder) die("Unsupported codec.");
+    if (aom_codec_dec_init(&dcodec, decoder, NULL, 0))
+      die("Failed to initialize decoder.");
+  }
+
+  // Encode frames.
+  while (aom_img_read(&raw, infile)) {
+    if (limit && frame_in >= limit) break;
+    aom_image_t *frame_to_encode;
+
+    if (FORCE_HIGHBITDEPTH_DECODING) {
+      // Need to allocate larger buffer to use hbd internal.
+      int input_shift = 0;
+      if (!allocated_raw_shift) {
+        // The upshift below writes into raw_shift, so a failed allocation
+        // must stop the program here.
+        if (!aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                           info.frame_width, info.frame_height, 32)) {
+          die("Failed to allocate image.");
+        }
+        allocated_raw_shift = 1;
+      }
+      aom_img_upshift(&raw_shift, &raw, input_shift);
+      frame_to_encode = &raw_shift;
+    } else {
+      frame_to_encode = &raw;
+    }
+
+    if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+      av1_ref_frame_t ref;
+      ref.idx = 0;
+      ref.use_external_ref = 0;
+      ref.img = ext_ref;
+      // Set reference frame in encoder.
+      if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
+        die_codec(&ecodec, "Failed to set encoder reference frame");
+      printf(" <SET_REF>");
+
+#if CONFIG_REALTIME_ONLY
+      // Set cpu speed in encoder.
+      if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7))
+        die_codec(&ecodec, "Failed to set cpu speed");
+#endif
+
+      // If set_reference in decoder is commented out, the enc/dec mismatch
+      // would be seen.
+      if (test_decode) {
+        ref.use_external_ref = 1;
+        if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref))
+          die_codec(&dcodec, "Failed to set decoder reference frame");
+      }
+    }
+
+    encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen, &ext_ref);
+    frame_in++;
+    if (mismatch_seen) break;
+  }
+
+  // Flush encoder.
+  if (!mismatch_seen)
+    while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+                        &frame_out, &mismatch_seen, NULL)) {
+    }
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %u frames.\n", frame_out);
+
+  if (test_decode) {
+    if (!mismatch_seen)
+      printf("Encoder/decoder results are matching.\n");
+    else
+      printf("Encoder/decoder results are NOT matching.\n");
+  }
+
+  if (test_decode)
+    if (aom_codec_destroy(&dcodec))
+      die_codec(&dcodec, "Failed to destroy decoder");
+
+  if (allocated_raw_shift) aom_img_free(&raw_shift);
+  aom_img_free(&ext_ref);
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&ecodec))
+    die_codec(&ecodec, "Failed to destroy encoder.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/av1_dec_fuzzer.cc b/third_party/aom/examples/av1_dec_fuzzer.cc
new file mode 100644
index 0000000000..9b9a0b9cb6
--- /dev/null
+++ b/third_party/aom/examples/av1_dec_fuzzer.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <memory>
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }  // exits with failure without printing anything
+
+// libFuzzer entry point. Treats |data| as an IVF stream: skips the file
+// header, then feeds each frame payload (clamped to the bytes that remain) to
+// an AV1 decoder and drains the decoded frames. Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  // At least one byte past the file header is needed to pick a thread count.
+  if (size <= IVF_FILE_HDR_SZ) return 0;
+
+  // Set thread count in the range [1, 64].
+  const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
+  aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+  aom_codec_iface_t *const iface = aom_codec_av1_dx();
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, iface, &cfg, 0)) return 0;
+
+  const uint8_t *cursor = data + IVF_FILE_HDR_SZ;
+  size_t remaining = size - IVF_FILE_HDR_SZ;
+
+  while (remaining > IVF_FRAME_HDR_SZ) {
+    const size_t declared_size = mem_get_le32(cursor);
+    cursor += IVF_FRAME_HDR_SZ;
+    remaining -= IVF_FRAME_HDR_SZ;
+    // Never hand the decoder more bytes than the input still holds.
+    const size_t frame_size = std::min(remaining, declared_size);
+
+    // Decode errors are expected on fuzzed input and deliberately ignored.
+    static_cast<void>(aom_codec_decode(&codec, cursor, frame_size, nullptr));
+
+    aom_codec_iter_t iter = nullptr;
+    while (aom_codec_get_frame(&codec, &iter) != nullptr) {
+    }
+
+    cursor += frame_size;
+    remaining -= frame_size;
+  }
+
+  aom_codec_destroy(&codec);
+  return 0;
+}
diff --git a/third_party/aom/examples/av1_dec_fuzzer.dict b/third_party/aom/examples/av1_dec_fuzzer.dict
new file mode 100644
index 0000000000..fb1638864c
--- /dev/null
+++ b/third_party/aom/examples/av1_dec_fuzzer.dict
@@ -0,0 +1,5 @@
+# IVF Signature + version (bytes 0-5)
+kw1="DKIF\x00\x00"
+
+# AV1 codec fourCC (bytes 8-11)
+kw2="AV01"
diff --git a/third_party/aom/examples/build_av1_dec_fuzzer.sh b/third_party/aom/examples/build_av1_dec_fuzzer.sh
new file mode 100755
index 0000000000..40355ea133
--- /dev/null
+++ b/third_party/aom/examples/build_av1_dec_fuzzer.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+###############################################################################
+# Fuzzer for libaom decoder.
+# ==========================
+# Requirements
+# ---------------------
+# Clang6.0 or above (must support -fsanitize=fuzzer -fsanitize=fuzzer-no-link)
+#
+# References:
+# ---------------------
+# http://llvm.org/docs/LibFuzzer.html
+# https://github.com/google/oss-fuzz
+#
+# Steps to build / run
+# ---------------------
+
+set -eu
+
+# Have a copy of AOM and a build directory ready.
+if [[ $# -ne 2 ]]; then
+  echo "Pass in the AOM source tree as first argument, and a build directory "
+  echo "as the second argument. The AOM source tree can be obtained via: "
+  echo " git clone https://aomedia.googlesource.com/aom"
+  exit 2
+fi
+if [[ -z "${CC:-}" ]]; then
+  echo "Set the CC environment variable to point to your C compiler."
+  exit 2
+fi
+if [[ -z "${CXX:-}" ]]; then
+  echo "Set the CXX environment variable to point to your C++ compiler."
+  exit 2
+fi
+
+# Resolve both directories to absolute paths. The script changes into the
+# build directory below, which would otherwise invalidate a relative source
+# path. With "set -e" a directory that does not exist aborts the script here.
+AOM_DIR=$(cd "$1" && pwd)
+BUILD_DIR=$(cd "$2" && pwd)
+# Run CMake with address sanitizer enabled and build the codec.
+# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
+# in the transform functions. Also set memory limits.
+EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+cd "${BUILD_DIR}"
+cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
+  -DFORCE_HIGHBITDEPTH_DECODING=0 \
+  -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \
+  -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
+  -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
+  -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address
+
+# Build the codec.
+make -j"$(nproc)"
+
+# Build the av1 fuzzer
+# $CXX is left unquoted on purpose so that it may carry extra arguments; the
+# paths are quoted so that directories containing spaces keep working.
+$CXX -std=c++11 -I"${AOM_DIR}" -I"${BUILD_DIR}" \
+  -g -fsanitize=fuzzer,address \
+  "${AOM_DIR}/examples/av1_dec_fuzzer.cc" -o "${BUILD_DIR}/av1_dec_fuzzer" \
+  "${BUILD_DIR}/libaom.a"
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo " av1_dec_fuzzer CORPUS_DIR"
diff --git a/third_party/aom/examples/decode_to_md5.c b/third_party/aom/examples/decode_to_md5.c
new file mode 100644
index 0000000000..07f788ff97
--- /dev/null
+++ b/third_party/aom/examples/decode_to_md5.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Frame-by-frame MD5 Checksum
+// ===========================
+//
+// This example builds upon the simple decoder loop to show how checksums
+// of the decoded output can be generated. These are used for validating
+// decoder implementations against the reference implementation, for example.
+//
+// MD5 algorithm
+// -------------
+// The Message-Digest 5 (MD5) is a well known hash function. We have provided
+// an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm for your use. Our implementation only changes the interface of this
+// reference code. You must include the `md5_utils.h` header for access to these
+// functions.
+//
+// Processing The Decoded Data
+// ---------------------------
+// Each row of the image is passed to the MD5 accumulator. First the Y plane
+// is processed, then U, then V. It is important to honor the image's `stride`
+// values.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+// Computes the MD5 digest of the first three planes of |img| and stores it in
+// |digest|. Plane 0 is hashed at the full display size; planes 1 and 2 are
+// hashed at half that size, rounded up. Rows are walked using each plane's
+// stride so that padding bytes are not hashed.
+static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) {
+  MD5Context md5;
+  int plane;
+
+  MD5Init(&md5);
+
+  for (plane = 0; plane < 3; ++plane) {
+    const int is_chroma = plane != 0;
+    const int w = is_chroma ? (img->d_w + 1) >> 1 : img->d_w;
+    const int h = is_chroma ? (img->d_h + 1) >> 1 : img->d_h;
+    const int stride = img->stride[plane];
+    const unsigned char *row = img->planes[plane];
+    int rows_left;
+
+    for (rows_left = h; rows_left > 0; --rows_left) {
+      MD5Update(&md5, row, w);
+      row += stride;
+    }
+  }
+
+  MD5Final(digest, &md5);
+}
+
+// Writes the 16 digest bytes to |stream| as 32 lowercase hexadecimal digits.
+static void print_md5(FILE *stream, unsigned char digest[16]) {
+  const unsigned char *const end = digest + 16;
+  const unsigned char *byte;
+
+  for (byte = digest; byte != end; ++byte) fprintf(stream, "%02x", *byte);
+}
+
+static const char *exec_name;
+
+void usage_exit(void) {  // prints the expected command line to stderr and exits with failure
+ fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+// Decodes every frame of <infile> and writes one line per decoded image to
+// <outfile>: the image's MD5 digest followed by a generated image name.
+int main(int argc, char **argv) {
+  AvxVideoReader *reader = NULL;
+  const AvxVideoInfo *info = NULL;
+  FILE *outfile = NULL;
+  int frame_cnt = 0;
+
+  exec_name = argv[0];
+
+  if (argc != 3) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (reader == NULL) die("Failed to open %s for reading.", argv[1]);
+
+  outfile = fopen(argv[2], "wb");
+  if (outfile == NULL) die("Failed to open %s for writing.", argv[2]);
+
+  info = aom_video_reader_get_info(reader);
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (decoder == NULL) die("Unknown input codec.");
+
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder");
+
+  while (aom_video_reader_read_frame(reader)) {
+    size_t frame_size = 0;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame");
+
+    // One compressed frame may yield several images; emit a line for each.
+    for (img = aom_codec_get_frame(&codec, &iter); img != NULL;
+         img = aom_codec_get_frame(&codec, &iter)) {
+      unsigned char digest[16];
+
+      get_image_md5(img, digest);
+      print_md5(outfile, digest);
+      ++frame_cnt;
+      fprintf(outfile, " img-%ux%u-%04d.i420\n", img->d_w, img->d_h,
+              frame_cnt);
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_reader_close(reader);
+
+  fclose(outfile);
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/decode_with_drops.c b/third_party/aom/examples/decode_with_drops.c
new file mode 100644
index 0000000000..9bec6ee2df
--- /dev/null
+++ b/third_party/aom/examples/decode_with_drops.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Decode With Drops Example
+// =========================
+//
+// This is an example utility which drops a series of frames, as specified
+// on the command line. This is useful for observing the error recovery
+// features of the codec.
+//
+// Usage
+// -----
+// This example adds a single argument to the `simple_decoder` example,
+// which specifies the range or pattern of frames to drop. The parameter is
+// parsed as follows:
+//
+// Dropping A Range Of Frames
+// --------------------------
+// To drop a range of frames, specify the starting frame and the ending
+// frame to drop, separated by a dash. The following command will drop
+// frames 5 through 10 (base 1).
+//
+// $ ./decode_with_drops in.ivf out.i420 5-10
+//
+//
+// Dropping A Pattern Of Frames
+// ----------------------------
+// To drop a pattern of frames, specify the number of frames to drop and
+// the number of frames after which to repeat the pattern, separated by
+// a forward-slash. The following command will drop 3 of 7 frames.
+// Specifically, it will decode 4 frames, then drop 3 frames, and then
+// repeat.
+//
+// $ ./decode_with_drops in.ivf out.i420 3/7
+//
+//
+// Extra Variables
+// ---------------
+// This example maintains the pattern passed on the command line in the
+// `n`, `m`, and `is_range` variables:
+//
+//
+// Making The Drop Decision
+// ------------------------
+// The example decides whether to drop the frame based on the current
+// frame number, immediately before decoding the frame.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {  // prints the expected command line to stderr and exits with failure
+ fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+// Decodes <infile> to raw frames in <outfile> while skipping the frames
+// selected by the third argument:
+//   N-M  drop frames N through M (frame numbers start at 1)
+//   N/M  drop the last N frames of every group of M frames
+// Prints '.' for each decoded frame and 'X' for each dropped one.
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile = NULL;
+  AvxVideoReader *reader = NULL;
+  const AvxVideoInfo *info = NULL;
+  int n = 0;
+  int m = 0;
+  int is_range = 0;
+  char *nptr = NULL;
+
+  exec_name = argv[0];
+
+  if (argc != 4) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+
+  n = (int)strtol(argv[3], &nptr, 0);
+  // Check the separator before parsing M. When the pattern contains no
+  // separator, nptr points at the terminating NUL and nptr + 1 would lie
+  // beyond the end of the argument string.
+  if (*nptr != '-' && *nptr != '/')
+    die("Couldn't parse pattern %s.\n", argv[3]);
+  is_range = (*nptr == '-');
+  m = (int)strtol(nptr + 1, NULL, 0);
+  if (!n || !m) die("Couldn't parse pattern %s.\n", argv[3]);
+
+  info = aom_video_reader_get_info(reader);
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) die("Unknown input codec.");
+
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
+
+  while (aom_video_reader_read_frame(reader)) {
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    size_t frame_size = 0;
+    int skip;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    ++frame_cnt;
+
+    // Range mode drops frames n..m; pattern mode drops the last n frames of
+    // each group of m.
+    skip = (is_range && frame_cnt >= n && frame_cnt <= m) ||
+           (!is_range && m - (frame_cnt - 1) % m <= n);
+
+    if (!skip) {
+      putc('.', stdout);
+      if (aom_codec_decode(&codec, frame, frame_size, NULL))
+        die_codec(&codec, "Failed to decode frame.");
+
+      while ((img = aom_codec_get_frame(&codec, &iter)) != NULL)
+        aom_img_write(img, outfile);
+    } else {
+      putc('X', stdout);
+    }
+
+    fflush(stdout);
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         info->frame_width, info->frame_height, argv[2]);
+
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/encoder_util.c b/third_party/aom/examples/encoder_util.c
new file mode 100644
index 0000000000..e43b372506
--- /dev/null
+++ b/third_party/aom/examples/encoder_util.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Utility functions used by encoder binaries.
+
+#include "examples/encoder_util.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+
+#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
+static void find_mismatch_plane(const aom_image_t *const img1,  // finds one differing sample of |plane|; result goes to loc[]
+ const aom_image_t *const img2, int plane,
+ int use_highbitdepth, int loc[4]) {
+ const unsigned char *const p1 = img1->planes[plane];
+ const int p1_stride = img1->stride[plane] >> use_highbitdepth;  // stride in samples rather than bytes
+ const unsigned char *const p2 = img2->planes[plane];
+ const int p2_stride = img2->stride[plane] >> use_highbitdepth;
+ const uint32_t bsize = 64;  // the plane is scanned in blocks of this size (scaled for chroma)
+ const int is_y_plane = (plane == AOM_PLANE_Y);
+ const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift;
+ const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift;
+ const uint32_t c_w =  // plane width in samples
+ is_y_plane ? img1->d_w
+ : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+ const uint32_t c_h =  // plane height in samples
+ is_y_plane ? img1->d_h
+ : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+ assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h);
+ assert(img1->x_chroma_shift == img2->x_chroma_shift &&
+ img1->y_chroma_shift == img2->y_chroma_shift);
+ loc[0] = loc[1] = loc[2] = loc[3] = -1;  // stays -1 when no mismatch is found
+ if (img1->monochrome && img2->monochrome && plane) return;  // chroma is not compared for monochrome images
+ int match = 1;
+ uint32_t i, j;
+ for (i = 0; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ const int si =  // NOTE(review): chroma uses min(i + bsizey, c_h - i), unlike luma -- confirm intended
+ is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i);
+ const int sj =
+ is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j);
+ int k, l;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ const int row = i + k;
+ const int col = j + l;
+ const int offset1 = row * p1_stride + col;
+ const int offset2 = row * p2_stride + col;
+ const int val1 = use_highbitdepth  // high bitdepth samples are assembled from two bytes, low byte first
+ ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8)
+ : p1[offset1];
+ const int val2 = use_highbitdepth
+ ? p2[2 * offset2] | (p2[2 * offset2 + 1] << 8)
+ : p2[offset2];
+ if (val1 != val2) {
+ loc[0] = row;
+ loc[1] = col;
+ loc[2] = val1;
+ loc[3] = val2;
+ match = 0;  // clears the condition of every enclosing loop
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void find_mismatch_helper(const aom_image_t *const img1,
+ const aom_image_t *const img2,
+ int use_highbitdepth, int yloc[4], int uloc[4],
+ int vloc[4]) {  // runs the per-plane search on Y, U and V in turn
+ find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc);
+ find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc);
+ find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc);
+}
+
+void aom_find_mismatch_high(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4],
+ int uloc[4], int vloc[4]) {
+ find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc);  // 1: samples are read as two bytes each
+}
+
+void aom_find_mismatch(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4], int uloc[4],
+ int vloc[4]) {
+ find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc);  // 0: samples are read as one byte each
+}
+
+// Returns 1 if the two images have the same format, the same display size
+// and identical sample data in every compared plane, and 0 otherwise. Only
+// plane 0 is compared for monochrome images.
+int aom_compare_img(const aom_image_t *const img1,
+                    const aom_image_t *const img2) {
+  assert(img1->cp == img2->cp);
+  assert(img1->tc == img2->tc);
+  assert(img1->mc == img2->mc);
+  assert(img1->monochrome == img2->monochrome);
+
+  // Images of different format or size cannot match. Return before touching
+  // the sample data: walking img2 with img1's geometry could read beyond the
+  // end of img2's planes.
+  if (img1->fmt != img2->fmt || img1->d_w != img2->d_w ||
+      img1->d_h != img2->d_h) {
+    return 0;
+  }
+
+  const int num_planes = img1->monochrome ? 1 : 3;
+  // High bitdepth samples occupy two bytes each.
+  const uint32_t bytes_per_sample =
+      (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  const uint32_t l_w = img1->d_w * bytes_per_sample;
+  const uint32_t c_w =
+      ((img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift) *
+      bytes_per_sample;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const uint32_t height = plane ? c_h : img1->d_h;
+    const uint32_t width = plane ? c_w : l_w;
+
+    // Compare row by row so that stride padding is ignored; stop at the
+    // first differing row.
+    for (uint32_t i = 0; i < height; ++i) {
+      if (memcmp(img1->planes[plane] + i * img1->stride[plane],
+                 img2->planes[plane] + i * img2->stride[plane], width) != 0) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
diff --git a/third_party/aom/examples/encoder_util.h b/third_party/aom/examples/encoder_util.h
new file mode 100644
index 0000000000..fa0e7d1880
--- /dev/null
+++ b/third_party/aom/examples/encoder_util.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Utility functions used by encoder binaries.
+
+#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_
+#define AOM_EXAMPLES_ENCODER_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_image.h"
+
+// Returns mismatch location (?loc[0],?loc[1]) and the values at that location
+// in img1 (?loc[2]) and img2 (?loc[3]).
+// This variant runs the per-plane search with use_highbitdepth set to 1.
+void aom_find_mismatch_high(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4],
+ int uloc[4], int vloc[4]);
+
+// Same as aom_find_mismatch_high() but with use_highbitdepth set to 0.
+void aom_find_mismatch(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4], int uloc[4],
+ int vloc[4]);
+
+// Returns 1 if the two images match.
+// A difference in fmt, d_w or d_h makes the images compare as not matching.
+int aom_compare_img(const aom_image_t *const img1,
+ const aom_image_t *const img2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_EXAMPLES_ENCODER_UTIL_H_
diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c
new file mode 100644
index 0000000000..e285be0209
--- /dev/null
+++ b/third_party/aom/examples/inspect.c
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Inspect Decoder
+// ================
+//
+// This is a simple decoder loop that writes JSON stats to stdout. This tool
+// can also be compiled with Emscripten and used as a library.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#else
+#define EMSCRIPTEN_KEEPALIVE
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/av1_common_int.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#include "av1/decoder/inspection.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_common.h"
+#include "common/video_reader.h"
+
+// Max JSON buffer size.
+const int MAX_BUFFER = 1024 * 1024 * 256;
+
+// Bit flags selecting which layers inspect() writes to the JSON output.
+// Each layer owns one bit so that several can be OR-ed together in `layers`;
+// ALL_LAYERS has the low 20 bits set, i.e. every flag listed here.
+typedef enum {
+ ACCOUNTING_LAYER = 1,
+ BLOCK_SIZE_LAYER = 1 << 1,
+ TRANSFORM_SIZE_LAYER = 1 << 2,
+ TRANSFORM_TYPE_LAYER = 1 << 3,
+ MODE_LAYER = 1 << 4,
+ SKIP_LAYER = 1 << 5,
+ FILTER_LAYER = 1 << 6,
+ CDEF_LAYER = 1 << 7,
+ REFERENCE_FRAME_LAYER = 1 << 8,
+ MOTION_VECTORS_LAYER = 1 << 9,
+ UV_MODE_LAYER = 1 << 10,
+ CFL_LAYER = 1 << 11,
+ DUAL_FILTER_LAYER = 1 << 12,
+ Q_INDEX_LAYER = 1 << 13,
+ SEGMENT_ID_LAYER = 1 << 14,
+ MOTION_MODE_LAYER = 1 << 15,
+ COMPOUND_TYPE_LAYER = 1 << 16,
+ INTRABC_LAYER = 1 << 17,
+ PALETTE_LAYER = 1 << 18,
+ UV_PALETTE_LAYER = 1 << 19,
+ ALL_LAYERS = (1 << 20) - 1
+} LayerType;
+
+static LayerType layers = 0;
+
+static int stop_after = 0;
+static int compress = 0;
+
+static const arg_def_t limit_arg =
+ ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All");
+static const arg_def_t compress_arg =
+ ARG_DEF("x", "compress", 0, "Compress JSON using RLE");
+static const arg_def_t dump_accounting_arg =
+ ARG_DEF("a", "accounting", 0, "Dump Accounting");
+static const arg_def_t dump_block_size_arg =
+ ARG_DEF("bs", "blockSize", 0, "Dump Block Size");
+static const arg_def_t dump_motion_vectors_arg =
+ ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors");
+static const arg_def_t dump_transform_size_arg =
+ ARG_DEF("ts", "transformSize", 0, "Dump Transform Size");
+static const arg_def_t dump_transform_type_arg =
+ ARG_DEF("tt", "transformType", 0, "Dump Transform Type");
+static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode");
+static const arg_def_t dump_motion_mode_arg =
+ ARG_DEF("mm", "motion_mode", 0, "Dump Motion Modes");
+static const arg_def_t dump_compound_type_arg =
+ ARG_DEF("ct", "compound_type", 0, "Dump Compound Types");
+static const arg_def_t dump_uv_mode_arg =
+ ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction Modes");
+static const arg_def_t dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip");
+static const arg_def_t dump_filter_arg =
+ ARG_DEF("f", "filter", 0, "Dump Filter");
+static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF");
+static const arg_def_t dump_cfl_arg =
+ ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas");
+static const arg_def_t dump_dual_filter_type_arg =
+ ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type");
+static const arg_def_t dump_reference_frame_arg =
+ ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame");
+static const arg_def_t dump_delta_q_arg =
+ ARG_DEF("dq", "delta_q", 0, "Dump QIndex");
+static const arg_def_t dump_seg_id_arg =
+ ARG_DEF("si", "seg_id", 0, "Dump Segment ID");
+static const arg_def_t dump_intrabc_arg =
+ ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used");
+static const arg_def_t dump_palette_arg =
+ ARG_DEF("plt", "palette", 0, "Dump Palette Size");
+static const arg_def_t dump_uv_palette_arg =
+ ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size");
+static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help");
+static const arg_def_t skip_non_transform_arg = ARG_DEF(
+ "snt", "skip_non_transform", 1, "Skip is counted as a non transform.");
+static const arg_def_t combined_arg =
+ ARG_DEF("comb", "combined", 1, "combinining parameters into one output.");
+
+int combined_parm_list[15];
+int combined_parm_count = 0;
+
+static const arg_def_t *main_args[] = { &limit_arg,
+ &dump_all_arg,
+ &compress_arg,
+#if CONFIG_ACCOUNTING
+ &dump_accounting_arg,
+#endif
+ &dump_block_size_arg,
+ &dump_transform_size_arg,
+ &dump_transform_type_arg,
+ &dump_mode_arg,
+ &dump_uv_mode_arg,
+ &dump_motion_mode_arg,
+ &dump_compound_type_arg,
+ &dump_skip_arg,
+ &dump_filter_arg,
+ &dump_cdef_arg,
+ &dump_dual_filter_type_arg,
+ &dump_cfl_arg,
+ &dump_reference_frame_arg,
+ &dump_motion_vectors_arg,
+ &dump_delta_q_arg,
+ &dump_seg_id_arg,
+ &dump_intrabc_arg,
+ &dump_palette_arg,
+ &dump_uv_palette_arg,
+ &usage_arg,
+ &skip_non_transform_arg,
+ &combined_arg,
+ NULL };
+// ENUM(X) expands to a map_entry initializer pairing the spelling of X (via
+// the # stringizing operator) with the value of X.
+#define ENUM(name) \
+ { #name, name }
+// Terminator entry: put_map() stops at the first entry whose name is NULL.
+#define LAST_ENUM \
+ { NULL, 0 }
+// One name/value pair of a JSON lookup table written by put_map().
+typedef struct map_entry {
+ const char *name;
+ int value;
+} map_entry;
+
+const map_entry refs_map[] = {
+ ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME),
+ ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME),
+ ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM
+};
+
+const map_entry block_size_map[] = {
+ ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4),
+ ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8),
+ ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16),
+ ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32),
+ ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64),
+ ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4),
+ ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64),
+ ENUM(BLOCK_64X16), LAST_ENUM
+};
+
+#define TX_SKIP -1
+
+const map_entry tx_size_map[] = {
+ ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32),
+ ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16),
+ ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64),
+ ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32),
+ ENUM(TX_32X8), ENUM(TX_16X64), ENUM(TX_64X16), LAST_ENUM
+};
+
+const map_entry tx_type_map[] = { ENUM(DCT_DCT),
+ ENUM(ADST_DCT),
+ ENUM(DCT_ADST),
+ ENUM(ADST_ADST),
+ ENUM(FLIPADST_DCT),
+ ENUM(DCT_FLIPADST),
+ ENUM(FLIPADST_FLIPADST),
+ ENUM(ADST_FLIPADST),
+ ENUM(FLIPADST_ADST),
+ ENUM(IDTX),
+ ENUM(V_DCT),
+ ENUM(H_DCT),
+ ENUM(V_ADST),
+ ENUM(H_ADST),
+ ENUM(V_FLIPADST),
+ ENUM(H_FLIPADST),
+ LAST_ENUM };
+const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH),
+ ENUM(REG_SHARP), ENUM(SMOOTH_REG),
+ ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP),
+ ENUM(SHARP_REG), ENUM(SHARP_SMOOTH),
+ ENUM(SHARP_SHARP), LAST_ENUM };
+
+const map_entry prediction_mode_map[] = {
+ ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED),
+ ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED),
+ ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED),
+ ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED),
+ ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV),
+ ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV),
+ ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV),
+ ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV),
+ ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM
+};
+
+const map_entry motion_mode_map[] = { ENUM(SIMPLE_TRANSLATION),
+ ENUM(OBMC_CAUSAL), // 2-sided OBMC
+ ENUM(WARPED_CAUSAL), // 2-sided WARPED
+ LAST_ENUM };
+
+const map_entry compound_type_map[] = { ENUM(COMPOUND_AVERAGE),
+ ENUM(COMPOUND_WEDGE),
+ ENUM(COMPOUND_DIFFWTD), LAST_ENUM };
+
+const map_entry uv_prediction_mode_map[] = {
+ ENUM(UV_DC_PRED), ENUM(UV_V_PRED),
+ ENUM(UV_H_PRED), ENUM(UV_D45_PRED),
+ ENUM(UV_D135_PRED), ENUM(UV_D113_PRED),
+ ENUM(UV_D157_PRED), ENUM(UV_D203_PRED),
+ ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED),
+ ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED),
+ ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED),
+ ENUM(UV_MODE_INVALID), LAST_ENUM
+};
+#define NO_SKIP 0
+#define SKIP 1
+
+const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM };
+
+const map_entry intrabc_map[] = { { "INTRABC", 1 },
+ { "NO_INTRABC", 0 },
+ LAST_ENUM };
+
+const map_entry palette_map[] = {
+ { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 },
+ { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 },
+ { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM
+};
+
+const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM };
+
+static const char *exec_name;
+
+// Associates a parameter name accepted by the "combined" option with the
+// byte offset of the matching field inside insp_mi_data.
+struct parm_offset {
+  char parm[60];
+  // Holds an offsetof() result. Declared as size_t (the type offsetof()
+  // yields) so that an offset above CHAR_MAX is neither truncated nor turned
+  // negative, as it would be in a plain char.
+  size_t offset;
+};
+struct parm_offset parm_offsets[] = {
+  { "blockSize", offsetof(insp_mi_data, bsize) },
+  { "transformSize", offsetof(insp_mi_data, tx_size) },
+  { "transformType", offsetof(insp_mi_data, tx_type) },
+  { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) },
+  { "mode", offsetof(insp_mi_data, mode) },
+  { "uv_mode", offsetof(insp_mi_data, uv_mode) },
+  { "motion_mode", offsetof(insp_mi_data, motion_mode) },
+  { "compound_type", offsetof(insp_mi_data, compound_type) },
+  { "referenceFrame", offsetof(insp_mi_data, ref_frame) },
+  { "skip", offsetof(insp_mi_data, skip) },
+};
+// Number of entries in parm_offsets.
+int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]);
+
+// Parses a comma separated list of parameter names into indices into
+// parm_offsets.
+//
+//   str      - list such as "mode,skip"; a single trailing comma is accepted.
+//   indices  - receives one parm_offsets index per name, in input order.
+//   maxCount - capacity of indices.
+//   count    - receives the number of indices stored; never above maxCount.
+//
+// Returns 1 on success. Returns 0 when a name is not the exact name of a
+// parm_offsets entry (an empty name is rejected too) or when the list holds
+// more than maxCount names; entries parsed before the failure stay in
+// indices and *count.
+int convert_to_indices(char *str, int *indices, int maxCount, int *count) {
+  *count = 0;
+  do {
+    char *comma = strchr(str, ',');
+    const size_t length = comma ? (size_t)(comma - str) : strlen(str);
+    int i;
+    for (i = 0; i < parm_count; ++i) {
+      const char *name = parm_offsets[i].parm;
+      // Require the whole name: a bare strncmp() would also accept prefixes,
+      // and an empty token would match the first table entry.
+      if (strlen(name) == length && !strncmp(str, name, length)) {
+        break;
+      }
+    }
+    if (i == parm_count) return 0;
+    // Check the capacity before storing so indices[] is never overrun and
+    // *count never exceeds maxCount.
+    if (*count >= maxCount) return 0;
+    indices[(*count)++] = i;
+    // Without a comma this was the last token; stepping past it would move
+    // str beyond the terminating NUL.
+    if (!comma) break;
+    str = comma + 1;
+  } while (*str != '\0');
+  return 1;
+}
+
+insp_frame_data frame_data;
+int frame_count = 0;
+int decoded_frame_count = 0;
+aom_codec_ctx_t codec;
+AvxVideoReader *reader = NULL;
+const AvxVideoInfo *info = NULL;
+aom_image_t *img = NULL;
+
+// Delivers one frame's JSON text. Under Emscripten it is handed to the
+// JavaScript function Module.on_frame_decoded_json; otherwise it is printed
+// to stdout.
+void on_frame_decoded_dump(char *json) {
+#ifdef __EMSCRIPTEN__
+ EM_ASM_({ Module.on_frame_decoded_json($0); }, json);
+#else
+ printf("%s", json);
+#endif
+}
+
+// Writing out the JSON buffer using snprintf is very slow, especially when
+// compiled with emscripten, these functions speed things up quite a bit.
+// Copies the characters of str into buffer, leaving out the terminating NUL,
+// and returns how many characters were copied. The caller must make sure
+// buffer is large enough.
+int put_str(char *buffer, const char *str) {
+  int written = 0;
+  while (str[written] != '\0') {
+    buffer[written] = str[written];
+    ++written;
+  }
+  return written;
+}
+
+// Copies str into buffer for use inside a JSON string literal: characters
+// that compare below ' ' are dropped, and '"' and '\\' are preceded by a
+// backslash. No NUL terminator is written. Returns the number of characters
+// stored in buffer.
+int put_str_with_escape(char *buffer, const char *str) {
+  int out = 0;
+  for (const char *p = str; *p != '\0'; ++p) {
+    const char ch = *p;
+    if (ch < ' ') continue;
+    if (ch == '"' || ch == '\\') buffer[out++] = '\\';
+    buffer[out++] = ch;
+  }
+  return out;
+}
+
+// Writes the decimal representation of num into buffer, preceded by prefix
+// and followed by suffix; a value of 0 for either means "omit it". No NUL
+// terminator is written. Returns the number of characters written.
+int put_num(char *buffer, char prefix, int num, char suffix) {
+  int pos = 0;
+  if (prefix) buffer[pos++] = prefix;
+
+  // Take the magnitude in unsigned arithmetic: negating INT_MIN as a signed
+  // int overflows, which is undefined behaviour.
+  unsigned int magnitude = (unsigned int)num;
+  if (num < 0) {
+    magnitude = 0u - magnitude;
+    buffer[pos++] = '-';
+  }
+
+  // Emit the digits least significant first, then reverse them in place.
+  int lo = pos;
+  do {
+    buffer[pos++] = (char)('0' + magnitude % 10u);
+    magnitude /= 10u;
+  } while (magnitude != 0u);
+  for (int hi = pos - 1; lo < hi; ++lo, --hi) {
+    const char tmp = buffer[lo];
+    buffer[lo] = buffer[hi];
+    buffer[hi] = tmp;
+  }
+
+  if (suffix) buffer[pos++] = suffix;
+  return pos;
+}
+
+// Writes the entries of map as comma separated "name":value pairs (the
+// members of a JSON object, without the surrounding braces). The table ends
+// at the first entry whose name is NULL. Returns the number of characters
+// written.
+int put_map(char *buffer, const map_entry *map) {
+  char *out = buffer;
+  for (const map_entry *e = map; e->name != NULL; ++e) {
+    // Separator goes before every entry except the first.
+    if (e != map) *(out++) = ',';
+    *(out++) = '"';
+    out += put_str(out, e->name);
+    *(out++) = '"';
+    out += put_num(out, ':', e->value, 0);
+  }
+  return (int)(out - buffer);
+}
+
+// Writes the "referenceFrameMap" object followed by the "referenceFrame"
+// array: one [ref_frame[0],ref_frame[1]] pair per mode-info cell, grouped
+// into one sub-array per row of frame_data.mi_grid.
+// Returns the number of characters written to buffer.
+int put_reference_frame(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t;
+ buf += put_str(buf, " \"referenceFrameMap\": {");
+ buf += put_map(buf, refs_map);
+ buf += put_str(buf, "},\n");
+ buf += put_str(buf, " \"referenceFrame\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ buf += put_num(buf, '[', mi->ref_frame[0], 0);
+ buf += put_num(buf, ',', mi->ref_frame[1], ']');
+ if (compress) { // RLE
+ // Find t, the first column after c whose pair differs from this one.
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ if (mi->ref_frame[0] != next_mi->ref_frame[0] ||
+ mi->ref_frame[1] != next_mi->ref_frame[1]) {
+ break;
+ }
+ }
+ // Emit the number of repeats as "[n]" and skip the repeated cells.
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+// Writes the "motionVectors" array: for every mode-info cell a 4-tuple
+// [mv[0].col, mv[0].row, mv[1].col, mv[1].row], grouped into one sub-array
+// per row of frame_data.mi_grid.
+// Returns the number of characters written to buffer.
+int put_motion_vectors(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t;
+ buf += put_str(buf, " \"motionVectors\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ buf += put_num(buf, '[', mi->mv[0].col, 0);
+ buf += put_num(buf, ',', mi->mv[0].row, 0);
+ buf += put_num(buf, ',', mi->mv[1].col, 0);
+ buf += put_num(buf, ',', mi->mv[1].row, ']');
+ if (compress) { // RLE
+ // Find t, the first column after c whose vectors differ from this cell.
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ if (mi->mv[0].col != next_mi->mv[0].col ||
+ mi->mv[0].row != next_mi->mv[0].row ||
+ mi->mv[1].col != next_mi->mv[1].col ||
+ mi->mv[1].row != next_mi->mv[1].row) {
+ break;
+ }
+ }
+ // Emit the number of repeats as "[n]" and skip the repeated cells.
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+// Writes one array whose key is the selected parameter names joined with
+// '&' (taken from combined_parm_list / parm_offsets). For every mode-info
+// cell it emits a tuple holding one value per selected parameter. No RLE
+// compression is applied here.
+// Returns the number of characters written to buffer.
+int put_combined(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, p;
+ buf += put_str(buf, " \"");
+ for (p = 0; p < combined_parm_count; ++p) {
+ if (p) buf += put_str(buf, "&");
+ buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm);
+ }
+ buf += put_str(buf, "\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ *(buf++) = '[';
+ for (p = 0; p < combined_parm_count; ++p) {
+ if (p) *(buf++) = ',';
+ // The field is located by its byte offset and read as an int16_t; only
+ // the first element is written even for array fields.
+ int16_t *v = (int16_t *)(((int8_t *)mi) +
+ parm_offsets[combined_parm_list[p]].offset);
+ buf += put_num(buf, 0, v[0], 0);
+ }
+ *(buf++) = ']';
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+// Generic layer writer for one insp_mi_data field.
+//
+// map    - optional name/value table written first as "<name>Map"; may be
+//          NULL.
+// name   - JSON key of the data array.
+// offset - byte offset of the field inside insp_mi_data; the field is read
+//          as int16_t.
+// len    - 0 writes the field as a scalar; otherwise the field is written as
+//          an array of len int16_t values.
+//
+// With compress set, a run of identical consecutive cells in a row is written
+// once followed by "[n]", n being the number of repeats. Calls die() when
+// compress is set and len is 1.
+// Returns the number of characters written to buffer.
+int put_block_info(char *buffer, const map_entry *map, const char *name,
+ size_t offset, int len) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t, i;
+ if (compress && len == 1) {
+ die("Can't encode scalars as arrays when RLE compression is enabled.");
+ }
+ if (map) {
+ // NOTE(review): MAX_BUFFER is passed as the size even though buf may
+ // already be advanced into the buffer, so it does not bound the write.
+ buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name);
+ buf += put_map(buf, map);
+ buf += put_str(buf, "},\n");
+ }
+ buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name);
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ int16_t *v = (int16_t *)(((int8_t *)mi) + offset);
+ if (len == 0) {
+ buf += put_num(buf, 0, v[0], 0);
+ } else {
+ buf += put_str(buf, "[");
+ for (i = 0; i < len; i++) {
+ buf += put_num(buf, 0, v[i], 0);
+ if (i < len - 1) {
+ buf += put_str(buf, ",");
+ }
+ }
+ buf += put_str(buf, "]");
+ }
+ if (compress) { // RLE
+ // Find t, the first column after c whose value differs from this cell.
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset);
+ int same = 0;
+ if (len == 0) {
+ same = v[0] == nv[0];
+ } else {
+ for (i = 0; i < len; i++) {
+ same = v[i] == nv[i];
+ if (!same) {
+ break;
+ }
+ }
+ }
+ if (!same) {
+ break;
+ }
+ }
+ // Emit the number of repeats as "[n]" and skip the repeated cells.
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+#if CONFIG_ACCOUNTING
+// Writes the accounting layer: "symbolsMap" (the dictionary strings) followed
+// by "symbols". Returns the number of characters written to buffer, or 0
+// (after printing "XXX" to stdout) when frame_data.accounting is NULL.
+int put_accounting(char *buffer) {
+ char *buf = buffer;
+ int i;
+ const Accounting *accounting = frame_data.accounting;
+ if (accounting == NULL) {
+ printf("XXX\n");
+ return 0;
+ }
+ const int num_syms = accounting->syms.num_syms;
+ const int num_strs = accounting->syms.dictionary.num_strs;
+ buf += put_str(buf, " \"symbolsMap\": [");
+ for (i = 0; i < num_strs; i++) {
+ buf += snprintf(buf, MAX_BUFFER, "\"%s\"",
+ accounting->syms.dictionary.strs[i]);
+ if (i < num_strs - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ buf += put_str(buf, " \"symbols\": [\n ");
+ // Context of the previously written symbol. Presumably -2 is chosen so the
+ // first real context never compares equal - TODO confirm.
+ AccountingSymbolContext context;
+ context.x = -2;
+ context.y = -2;
+ AccountingSymbol *sym;
+ for (i = 0; i < num_syms; i++) {
+ sym = &accounting->syms.syms[i];
+ // NOTE(review): when the context differs from the previous symbol's, only
+ // the [x,y] pair is written for this symbol and its id/bits/samples are
+ // not - confirm that the consumer expects this.
+ if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) {
+ buf += put_num(buf, '[', sym->context.x, 0);
+ buf += put_num(buf, ',', sym->context.y, ']');
+ } else {
+ buf += put_num(buf, '[', sym->id, 0);
+ buf += put_num(buf, ',', sym->bits, 0);
+ buf += put_num(buf, ',', sym->samples, ']');
+ }
+ context = sym->context;
+ if (i < num_syms - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+#endif
+
+int skip_non_transform = 0;
+
+// Inspection callback (installed by ifd_init_cb()). Fetches the frame data
+// for the decoder instance pbi, serializes every layer enabled in `layers`
+// plus the frame-level fields into one JSON object, and passes the text to
+// on_frame_decoded_dump(). `data` is unused.
+void inspect(void *pbi, void *data) {
+ /* Fetch frame data. */
+ ifd_inspect(&frame_data, pbi, skip_non_transform);
+
+ // Show existing frames just show a reference buffer we've already decoded.
+ // There's no information to show.
+ if (frame_data.show_existing_frame) return;
+
+ (void)data;
+ // We allocate enough space and hope we don't write out of bounds. Totally
+ // unsafe but this speeds things up, especially when compiled to Javascript.
+ char *buffer = aom_malloc(MAX_BUFFER);
+ if (!buffer) {
+ fprintf(stderr, "Error allocating inspect info buffer\n");
+ abort();
+ }
+ // buf is the write cursor; every put_* helper returns how far to advance.
+ char *buf = buffer;
+ buf += put_str(buf, "{\n");
+ if (layers & BLOCK_SIZE_LAYER) {
+ buf += put_block_info(buf, block_size_map, "blockSize",
+ offsetof(insp_mi_data, bsize), 0);
+ }
+ if (layers & TRANSFORM_SIZE_LAYER) {
+ buf += put_block_info(buf, tx_size_map, "transformSize",
+ offsetof(insp_mi_data, tx_size), 0);
+ }
+ if (layers & TRANSFORM_TYPE_LAYER) {
+ buf += put_block_info(buf, tx_type_map, "transformType",
+ offsetof(insp_mi_data, tx_type), 0);
+ }
+ if (layers & DUAL_FILTER_LAYER) {
+ buf += put_block_info(buf, dual_filter_map, "dualFilterType",
+ offsetof(insp_mi_data, dual_filter_type), 0);
+ }
+ if (layers & MODE_LAYER) {
+ buf += put_block_info(buf, prediction_mode_map, "mode",
+ offsetof(insp_mi_data, mode), 0);
+ }
+ if (layers & UV_MODE_LAYER) {
+ buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode",
+ offsetof(insp_mi_data, uv_mode), 0);
+ }
+ if (layers & MOTION_MODE_LAYER) {
+ buf += put_block_info(buf, motion_mode_map, "motion_mode",
+ offsetof(insp_mi_data, motion_mode), 0);
+ }
+ if (layers & COMPOUND_TYPE_LAYER) {
+ buf += put_block_info(buf, compound_type_map, "compound_type",
+ offsetof(insp_mi_data, compound_type), 0);
+ }
+ if (layers & SKIP_LAYER) {
+ buf +=
+ put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0);
+ }
+ if (layers & FILTER_LAYER) {
+ buf +=
+ put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2);
+ }
+ if (layers & CDEF_LAYER) {
+ buf += put_block_info(buf, NULL, "cdef_level",
+ offsetof(insp_mi_data, cdef_level), 0);
+ buf += put_block_info(buf, NULL, "cdef_strength",
+ offsetof(insp_mi_data, cdef_strength), 0);
+ }
+ if (layers & CFL_LAYER) {
+ buf += put_block_info(buf, NULL, "cfl_alpha_idx",
+ offsetof(insp_mi_data, cfl_alpha_idx), 0);
+ buf += put_block_info(buf, NULL, "cfl_alpha_sign",
+ offsetof(insp_mi_data, cfl_alpha_sign), 0);
+ }
+ if (layers & Q_INDEX_LAYER) {
+ buf += put_block_info(buf, NULL, "delta_q",
+ offsetof(insp_mi_data, current_qindex), 0);
+ }
+ if (layers & SEGMENT_ID_LAYER) {
+ buf += put_block_info(buf, NULL, "seg_id",
+ offsetof(insp_mi_data, segment_id), 0);
+ }
+ if (layers & MOTION_VECTORS_LAYER) {
+ buf += put_motion_vectors(buf);
+ }
+ if (layers & INTRABC_LAYER) {
+ buf += put_block_info(buf, intrabc_map, "intrabc",
+ offsetof(insp_mi_data, intrabc), 0);
+ }
+ if (layers & PALETTE_LAYER) {
+ buf += put_block_info(buf, palette_map, "palette",
+ offsetof(insp_mi_data, palette), 0);
+ }
+ if (layers & UV_PALETTE_LAYER) {
+ buf += put_block_info(buf, palette_map, "uv_palette",
+ offsetof(insp_mi_data, uv_palette), 0);
+ }
+ if (combined_parm_count > 0) buf += put_combined(buf);
+ // The reference frame layer goes through the generic writer with len 2
+ // (ref_frame holds two entries) rather than put_reference_frame().
+ if (layers & REFERENCE_FRAME_LAYER) {
+ buf += put_block_info(buf, refs_map, "referenceFrame",
+ offsetof(insp_mi_data, ref_frame), 2);
+ }
+#if CONFIG_ACCOUNTING
+ if (layers & ACCOUNTING_LAYER) {
+ buf += put_accounting(buf);
+ }
+#endif
+ // Frame-level fields.
+ buf +=
+ snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", frame_data.frame_number);
+ buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n",
+ frame_data.show_frame);
+ buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n",
+ frame_data.frame_type);
+ buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n",
+ frame_data.base_qindex);
+ buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n",
+ frame_data.tile_mi_cols);
+ buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n",
+ frame_data.tile_mi_rows);
+ buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n",
+ frame_data.delta_q_present_flag);
+ buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n",
+ frame_data.delta_q_res);
+ buf += put_str(buf, " \"config\": {");
+ buf += put_map(buf, config_map);
+ buf += put_str(buf, "},\n");
+ buf += put_str(buf, " \"configString\": \"");
+ buf += put_str_with_escape(buf, aom_codec_build_config());
+ buf += put_str(buf, "\"\n");
+ decoded_frame_count++;
+ buf += put_str(buf, "},\n");
+ // The put_* helpers do not NUL-terminate, so terminate before dumping.
+ *(buf++) = 0;
+ on_frame_decoded_dump(buffer);
+ aom_free(buffer);
+}
+
+// Registers inspect() as the decoder's inspection callback (with a NULL
+// context) through the AV1_SET_INSPECTION_CALLBACK control on the global
+// codec.
+void ifd_init_cb(void) {
+ aom_inspect_init ii;
+ ii.inspect_cb = inspect;
+ ii.inspect_ctx = NULL;
+ aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii);
+}
+
+// Opens `file` (or "/tmp/input.ivf" when file is NULL) with the video reader,
+// initializes the global decoder for the stream's fourcc, sets up frame_data
+// for the stream's frame size and installs the inspection callback.
+// Calls die() on any failure; otherwise returns EXIT_SUCCESS.
+EMSCRIPTEN_KEEPALIVE
+int open_file(char *file) {
+ if (file == NULL) {
+ // The JS analyzer puts the .ivf file at this location.
+ file = "/tmp/input.ivf";
+ }
+ reader = aom_video_reader_open(file);
+ if (!reader) die("Failed to open %s for reading.", file);
+ info = aom_video_reader_get_info(reader);
+ aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+ fprintf(stderr, "Using %s\n", aom_codec_iface_name(decoder));
+ if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+ die("Failed to initialize decoder.");
+ ifd_init(&frame_data, info->frame_width, info->frame_height);
+ ifd_init_cb();
+ return EXIT_SUCCESS;
+}
+
+Av1DecodeReturn adr;
+int have_frame = 0;
+const unsigned char *frame;
+const unsigned char *end_frame;
+size_t frame_size = 0;
+
+// Decodes the next frame of the open stream and publishes it through the
+// global `img`, which the get_* accessors read.
+//
+// Returns EXIT_SUCCESS when a frame was decoded, and EXIT_FAILURE when the
+// reader has no more data or the decoded reference buffer cannot be fetched.
+// Calls die_codec() when decoding fails.
+EMSCRIPTEN_KEEPALIVE
+int read_frame(void) {
+  // Static storage: `img` may be left pointing at ref_dec.img after this
+  // function returns, so the image descriptor must outlive the call.
+  static struct av1_ref_frame ref_dec;
+
+  img = NULL;
+
+  // This loop skips over any frames that are show_existing_frames, as
+  // there is nothing to analyze.
+  do {
+    if (!have_frame) {
+      if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE;
+      frame = aom_video_reader_get_frame(reader, &frame_size);
+
+      have_frame = 1;
+      end_frame = frame + frame_size;
+    }
+
+    if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, &adr) !=
+        AOM_CODEC_OK) {
+      die_codec(&codec, "Failed to decode frame.");
+    }
+
+    // adr.buf marks where decoding stopped; whatever lies between it and
+    // end_frame is fed to the decoder on the next iteration or call.
+    frame = adr.buf;
+    frame_size = end_frame - frame;
+    if (frame == end_frame) have_frame = 0;
+  } while (adr.show_existing);
+
+  ref_dec.idx = adr.idx;
+
+  // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE
+  // if its -1 the decoder didn't update any reference buffer and the only
+  // way to see the frame is aom_codec_get_frame.
+  if (ref_dec.idx == -1) {
+    aom_codec_iter_t iter = NULL;
+    img = aom_codec_get_frame(&codec, &iter);
+  } else if (!aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_dec)) {
+    img = &ref_dec.img;
+  } else {
+    return EXIT_FAILURE;
+  }
+  ++frame_count;
+  return EXIT_SUCCESS;
+}
+
+// Exported accessors. The ones that read the global `img` or `info`
+// dereference it without a NULL check, so they are only valid once
+// open_file() (for info) and a successful read_frame() (for img) have run.
+
+// Returns the codec build configuration string.
+EMSCRIPTEN_KEEPALIVE
+const char *get_aom_codec_build_config(void) {
+ return aom_codec_build_config();
+}
+
+// Returns img->bit_depth of the current frame.
+EMSCRIPTEN_KEEPALIVE
+int get_bit_depth(void) { return img->bit_depth; }
+
+// Returns img->bps of the current frame.
+EMSCRIPTEN_KEEPALIVE
+int get_bits_per_sample(void) { return img->bps; }
+
+// Returns img->fmt of the current frame.
+EMSCRIPTEN_KEEPALIVE
+int get_image_format(void) { return img->fmt; }
+
+// Returns the data pointer of the given plane; `plane` is not range checked.
+EMSCRIPTEN_KEEPALIVE
+unsigned char *get_plane(int plane) { return img->planes[plane]; }
+
+// Returns the stride of the given plane; `plane` is not range checked.
+EMSCRIPTEN_KEEPALIVE
+int get_plane_stride(int plane) { return img->stride[plane]; }
+
+// Returns aom_img_plane_width() for the given plane of the current frame.
+EMSCRIPTEN_KEEPALIVE
+int get_plane_width(int plane) { return aom_img_plane_width(img, plane); }
+
+// Returns aom_img_plane_height() for the given plane of the current frame.
+EMSCRIPTEN_KEEPALIVE
+int get_plane_height(int plane) { return aom_img_plane_height(img, plane); }
+
+// Returns the frame width recorded in the stream info.
+EMSCRIPTEN_KEEPALIVE
+int get_frame_width(void) { return info->frame_width; }
+
+// Returns the frame height recorded in the stream info.
+EMSCRIPTEN_KEEPALIVE
+int get_frame_height(void) { return info->frame_height; }
+
+// Parses the command line options and updates the global settings (layers,
+// compress, stop_after, skip_non_transform, combined_parm_list).
+//
+// argv is compacted in place: argj only advances for arguments no option
+// matched, so on return argv holds just those unrecognized arguments (for
+// example the input file name), still NULL terminated.
+static void parse_args(char **argv) {
+ char **argi, **argj;
+ struct arg arg;
+ // Referenced here so they count as used in builds where the corresponding
+ // option is compiled out.
+ (void)dump_accounting_arg;
+ (void)dump_cdef_arg;
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+ if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER;
+#if CONFIG_ACCOUNTING
+ else if (arg_match(&arg, &dump_accounting_arg, argi))
+ layers |= ACCOUNTING_LAYER;
+#endif
+ else if (arg_match(&arg, &dump_transform_size_arg, argi))
+ layers |= TRANSFORM_SIZE_LAYER;
+ else if (arg_match(&arg, &dump_transform_type_arg, argi))
+ layers |= TRANSFORM_TYPE_LAYER;
+ else if (arg_match(&arg, &dump_mode_arg, argi))
+ layers |= MODE_LAYER;
+ else if (arg_match(&arg, &dump_uv_mode_arg, argi))
+ layers |= UV_MODE_LAYER;
+ else if (arg_match(&arg, &dump_motion_mode_arg, argi))
+ layers |= MOTION_MODE_LAYER;
+ else if (arg_match(&arg, &dump_compound_type_arg, argi))
+ layers |= COMPOUND_TYPE_LAYER;
+ else if (arg_match(&arg, &dump_skip_arg, argi))
+ layers |= SKIP_LAYER;
+ else if (arg_match(&arg, &dump_filter_arg, argi))
+ layers |= FILTER_LAYER;
+ else if (arg_match(&arg, &dump_cdef_arg, argi))
+ layers |= CDEF_LAYER;
+ else if (arg_match(&arg, &dump_cfl_arg, argi))
+ layers |= CFL_LAYER;
+ else if (arg_match(&arg, &dump_reference_frame_arg, argi))
+ layers |= REFERENCE_FRAME_LAYER;
+ else if (arg_match(&arg, &dump_motion_vectors_arg, argi))
+ layers |= MOTION_VECTORS_LAYER;
+ else if (arg_match(&arg, &dump_dual_filter_type_arg, argi))
+ layers |= DUAL_FILTER_LAYER;
+ else if (arg_match(&arg, &dump_delta_q_arg, argi))
+ layers |= Q_INDEX_LAYER;
+ else if (arg_match(&arg, &dump_seg_id_arg, argi))
+ layers |= SEGMENT_ID_LAYER;
+ else if (arg_match(&arg, &dump_intrabc_arg, argi))
+ layers |= INTRABC_LAYER;
+ else if (arg_match(&arg, &dump_palette_arg, argi))
+ layers |= PALETTE_LAYER;
+ else if (arg_match(&arg, &dump_uv_palette_arg, argi))
+ layers |= UV_PALETTE_LAYER;
+ else if (arg_match(&arg, &dump_all_arg, argi))
+ layers |= ALL_LAYERS;
+ else if (arg_match(&arg, &compress_arg, argi))
+ compress = 1;
+ else if (arg_match(&arg, &usage_arg, argi))
+ usage_exit();
+ else if (arg_match(&arg, &limit_arg, argi))
+ stop_after = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &skip_non_transform_arg, argi))
+ skip_non_transform = arg_parse_uint(&arg);
+ // NOTE(review): the result of convert_to_indices() is ignored, so an
+ // invalid list is not reported to the user.
+ else if (arg_match(&arg, &combined_arg, argi))
+ convert_to_indices(
+ (char *)arg.val, combined_parm_list,
+ sizeof(combined_parm_list) / sizeof(combined_parm_list[0]),
+ &combined_parm_count);
+ else
+ argj++;
+ }
+}
+
+static const char *exec_name;
+
+// Prints the usage line and the option list from main_args to stderr, then
+// terminates the process with EXIT_FAILURE.
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s src_filename <options>\n", exec_name);
+ fprintf(stderr, "\nOptions:\n");
+ arg_show_usage(stderr, main_args);
+ exit(EXIT_FAILURE);
+}
+
+// Program entry point: parses the options, opens argv[1] and decodes frames
+// until read_frame() fails or the frame limit is reached. The per-frame JSON
+// objects are emitted by the inspect() callback; this function prints the
+// enclosing "[" ... "null ]" so that the output forms one JSON array.
+EMSCRIPTEN_KEEPALIVE
+int main(int argc, char **argv) {
+ exec_name = argv[0];
+ // parse_args() removes the recognized options from argv, so argv[1] is read
+ // after that compaction.
+ parse_args(argv);
+ if (argc >= 2) {
+ open_file(argv[1]);
+ printf("[\n");
+ while (1) {
+ if (stop_after && (decoded_frame_count >= stop_after)) break;
+ if (read_frame()) break;
+ }
+ // Every frame object ends with "},", so a final null element is printed to
+ // follow the last comma.
+ printf("null\n");
+ printf("]");
+ } else {
+ usage_exit();
+ }
+}
+
+// Destroys the global decoder (calling die_codec() on failure) and closes the
+// video reader.
+EMSCRIPTEN_KEEPALIVE
+void quit(void) {
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+ aom_video_reader_close(reader);
+}
+
+// Replaces the set of enabled layers (a bitwise OR of LayerType flags).
+EMSCRIPTEN_KEEPALIVE
+void set_layers(LayerType v) { layers = v; }
+
+// Enables (non-zero) or disables (0) RLE compression of the layer arrays.
+EMSCRIPTEN_KEEPALIVE
+void set_compress(int v) { compress = v; }
diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c
new file mode 100644
index 0000000000..05272bafa3
--- /dev/null
+++ b/third_party/aom/examples/lightfield_bitstream_parsing.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Bitstream Parsing
+// ============================
+//
+// This is a lightfield bitstream parsing example. It takes an input file
+// containing the whole compressed lightfield bitstream(ivf file) and a text
+// file containing a stream of tiles to decode and then constructs and outputs
+// a new bitstream that can be decoded by an AV1 decoder. The output bitstream
+// contains reference frames(i.e. anchor frames), camera frame header, and
+// tile list OBUs. num_references is the number of anchor frames coded at the
+// beginning of the light field file. After running the lightfield encoder,
+// run lightfield bitstream parsing:
+// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4
+// tile_list.txt
+//
+// The tile_list.txt is expected to be of the form:
+// Frame <frame_index0>
+// <image_index0> <anchor_index0> <tile_col0> <tile_row0>
+// <image_index1> <anchor_index1> <tile_col1> <tile_row1>
+// ...
+// Frame <frame_index1>
+// ...
+//
+// The "Frame" markers indicate a new render frame and thus a new tile list
+// will be started and the old one flushed. The image_indexN, anchor_indexN,
+// tile_colN, and tile_rowN identify an individual tile to be decoded and
+// to use anchor_indexN anchor image for MCP.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+#include "common/video_writer.h"
+
+#define MAX_TILES 512
+
+static const char *exec_name;
+
+// Prints the expected command line to stderr and terminates the process with
+// EXIT_FAILURE.
+void usage_exit(void) {
+  FILE *const err = stderr;
+
+  fprintf(err, "Usage: %s <infile> <outfile> <num_references> <tile_list>\n",
+          exec_name);
+
+  exit(EXIT_FAILURE);
+}
+
+// Rounds `value` up to the next multiple of 2^n by adding (2^n - 1) and then
+// clearing the low n bits.
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+
+// Fixed output frame dimensions; main() divides them by the tile dimensions
+// to obtain the "in tiles minus 1" values written into each tile list OBU.
+const int output_frame_width = 512;
+const int output_frame_height = 512;
+
+// Spec:
+// typedef struct {
+// uint8_t anchor_frame_idx;
+// uint8_t tile_row;
+// uint8_t tile_col;
+// uint16_t coded_tile_data_size_minus_1;
+// uint8_t *coded_tile_data;
+// } TILE_LIST_ENTRY;
+
+// Tile list entry provided by the application
+// (one parsed "<image> <anchor> <tile_col> <tile_row>" line of the tile list
+// file).
+typedef struct {
+ int image_idx; // Index into the in-memory camera frame arrays.
+ int reference_idx; // Written to the OBU as anchor_frame_idx.
+ int tile_col; // Tile column handed to AV1_SET_DECODE_TILE_COL.
+ int tile_row; // Tile row handed to AV1_SET_DECODE_TILE_ROW.
+} TILE_LIST_INFO;
+
+// Maps an image format to the bit count main() uses when sizing the tile list
+// buffer (width * height * bps / 8). Formats not listed here are reported
+// through die().
+static int get_image_bps(aom_img_fmt_t fmt) {
+  int bits;
+
+  switch (fmt) {
+    case AOM_IMG_FMT_I420: bits = 12; break;
+    case AOM_IMG_FMT_I422: bits = 16; break;
+    case AOM_IMG_FMT_I444: bits = 24; break;
+    case AOM_IMG_FMT_I42016: bits = 24; break;
+    case AOM_IMG_FMT_I42216: bits = 32; break;
+    case AOM_IMG_FMT_I44416: bits = 48; break;
+    default: die("Invalid image format");
+  }
+
+  return bits;
+}
+
+// Builds one tile list OBU in `tl_buf` from the `num_tiles` entries of
+// `tiles` and writes it to `writer` as a single frame with timestamp `tl_pts`.
+//
+// For every entry the corresponding camera frame (frames[image_idx], with
+// length frame_sizes[image_idx]) is decoded with the requested tile row and
+// column selected, and the coded tile bytes reported by AV1D_GET_TILE_DATA are
+// appended to the OBU behind a 5-byte tile entry header.
+//
+// The caller must provide a `tl_buf` large enough for the whole OBU; its size
+// is not known here and therefore cannot be checked. Any decoder or writer
+// failure terminates the program through die_codec()/die().
+static void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles,
+                              aom_codec_pts_t tl_pts, unsigned char **frames,
+                              const size_t *frame_sizes, aom_codec_ctx_t *codec,
+                              unsigned char *tl_buf, AvxVideoWriter *writer,
+                              uint8_t output_frame_width_in_tiles_minus_1,
+                              uint8_t output_frame_height_in_tiles_minus_1) {
+  unsigned char *tl = tl_buf;
+  struct aom_write_bit_buffer wb = { tl, 0 };
+  unsigned char *saved_obu_size_loc = NULL;
+  uint32_t tile_list_obu_header_size = 0;
+  uint32_t tile_list_obu_size = 0;
+  int num_tiles_minus_1 = num_tiles - 1;
+  int i;
+
+  // Write the tile list OBU header that is 1 byte long.
+  aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
+  aom_wb_write_literal(&wb, 8, 4);  // tile list OBU: "1000"
+  aom_wb_write_literal(&wb, 0, 1);  // obu_extension = 0
+  aom_wb_write_literal(&wb, 1, 1);  // obu_has_size_field
+  aom_wb_write_literal(&wb, 0, 1);  // reserved
+  tl++;
+  tile_list_obu_header_size++;
+
+  // Write the OBU size using a fixed length_field_size of 4 bytes. A
+  // placeholder is written now; the real size is patched in at the end.
+  saved_obu_size_loc = tl;
+  // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32.
+  aom_wb_write_unsigned_literal(&wb, 0, 32);
+  tl += 4;
+  tile_list_obu_header_size += 4;
+
+  // write_tile_list_obu()
+  aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8);
+  aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8);
+  aom_wb_write_literal(&wb, num_tiles_minus_1, 16);
+  tl += 4;
+  tile_list_obu_size += 4;
+
+  // Write each tile's data
+  for (i = 0; i <= num_tiles_minus_1; i++) {
+    aom_tile_data tile_data = { 0, NULL, 0 };
+
+    int image_idx = tiles[i].image_idx;
+    int ref_idx = tiles[i].reference_idx;
+    int tc = tiles[i].tile_col;
+    int tr = tiles[i].tile_row;
+
+    // Reset bit writer to the right location.
+    wb.bit_buffer = tl;
+    wb.bit_offset = 0;
+
+    size_t frame_size = frame_sizes[image_idx];
+    const unsigned char *frame = frames[image_idx];
+
+    AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr);
+    AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc);
+
+    aom_codec_err_t aom_status =
+        aom_codec_decode(codec, frame, frame_size, NULL);
+    if (aom_status) die_codec(codec, "Failed to decode tile.");
+
+    if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data))
+      die_codec(codec, "Failed to get the coded tile data.");
+
+    // The size is stored below as a 16-bit "minus 1" value, so only sizes in
+    // [1, 65536] can be represented. Anything else would be truncated by the
+    // bit writer and produce a corrupt tile list.
+    if (tile_data.coded_tile_data == NULL ||
+        tile_data.coded_tile_data_size < 1 ||
+        tile_data.coded_tile_data_size > 65536) {
+      die("Coded tile data size out of range [1, 65536]: %lu.",
+          (unsigned long)tile_data.coded_tile_data_size);
+    }
+
+    // Copy over tile info.
+    // uint8_t anchor_frame_idx;
+    // uint8_t tile_row;
+    // uint8_t tile_col;
+    // uint16_t coded_tile_data_size_minus_1;
+    // uint8_t *coded_tile_data;
+    uint32_t tile_info_bytes = 5;
+    aom_wb_write_literal(&wb, ref_idx, 8);
+    aom_wb_write_literal(&wb, tr, 8);
+    aom_wb_write_literal(&wb, tc, 8);
+    aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16);
+    tl += tile_info_bytes;
+
+    memcpy(tl, (uint8_t *)tile_data.coded_tile_data,
+           tile_data.coded_tile_data_size);
+    tl += tile_data.coded_tile_data_size;
+
+    tile_list_obu_size +=
+        tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size;
+  }
+
+  // Write tile list OBU size.
+  size_t bytes_written = 0;
+  if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc,
+                                 &bytes_written))
+    die_codec(codec, "Failed to encode the tile list obu size.");
+
+  // Copy the tile list.
+  if (!aom_video_writer_write_frame(
+          writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size,
+          tl_pts))
+    die_codec(codec, "Failed to copy compressed tile list.");
+}
+
+// Entry point: <infile> <outfile> <num_references> <tile_list>.
+//
+// 1. Copies the first num_references frames of <infile> to <outfile> and
+//    decodes them as anchor frames.
+// 2. Loads the remaining (camera) frames into memory.
+// 3. Writes a trimmed copy of the first camera frame's header to <outfile>.
+// 4. Reads <tile_list> and writes one tile list OBU per tile list.
+//
+// Every failure terminates the program through die()/die_codec().
+int main(int argc, char **argv) {
+  AvxVideoReader *reader = NULL;
+  AvxVideoWriter *writer = NULL;
+  const AvxVideoInfo *info = NULL;
+  int num_references;
+  int i;
+  aom_codec_pts_t pts;
+  const char *tile_list_file = NULL;
+
+  exec_name = argv[0];
+  if (argc != 5) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  num_references = (int)strtol(argv[3], NULL, 0);
+  info = aom_video_reader_get_info(reader);
+
+  aom_video_reader_set_fourcc(reader, AV1_FOURCC);
+
+  // The writer to write out ivf file in tile list OBU, which can be decoded by
+  // AV1 decoder.
+  writer = aom_video_writer_open(argv[2], kContainerIVF, info);
+  if (!writer) die("Failed to open %s for writing", argv[2]);
+
+  tile_list_file = argv[4];
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) die("Unknown input codec.");
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
+
+  // Decode anchor frames.
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
+
+  printf("Reading %d reference images.\n", num_references);
+  for (i = 0; i < num_references; ++i) {
+    // The input may hold fewer frames than num_references; fail explicitly
+    // instead of processing whatever the reader returns after a failed read.
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to read reference frame %d.", i);
+
+    size_t frame_size = 0;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader);
+
+    // Copy references bitstream directly.
+    if (!aom_video_writer_write_frame(writer, frame, frame_size, pts))
+      die_codec(&codec, "Failed to copy compressed anchor frame.");
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+  }
+
+  // Decode camera frames.
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1);
+
+  FILE *infile = aom_video_reader_get_file(reader);
+  // Record the offset of the first camera image.
+  const FileOffset camera_frame_pos = ftello(infile);
+
+  printf("Loading compressed frames into memory.\n");
+
+  // Count the frames in the lightfield.
+  int num_frames = 0;
+  while (aom_video_reader_read_frame(reader)) {
+    ++num_frames;
+  }
+  if (num_frames < 1) die("Input light field has no frames.");
+
+  // Read all of the lightfield frames into memory.
+  unsigned char **frames =
+      (unsigned char **)malloc(num_frames * sizeof(unsigned char *));
+  size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t));
+  if (!(frames && frame_sizes)) die("Failed to allocate frame data.");
+
+  // Seek to the first camera image.
+  fseeko(infile, camera_frame_pos, SEEK_SET);
+  for (int f = 0; f < num_frames; ++f) {
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to re-read camera frame %d.", f);
+    size_t frame_size = 0;
+    const unsigned char *frame =
+        aom_video_reader_get_frame(reader, &frame_size);
+    frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char));
+    if (!frames[f]) die("Failed to allocate frame data.");
+    memcpy(frames[f], frame, frame_size);
+    frame_sizes[f] = frame_size;
+  }
+  printf("Read %d frames.\n", num_frames);
+
+  // Copy first camera frame for getting camera frame header. This is done
+  // only once.
+  {
+    size_t frame_size = frame_sizes[0];
+    const unsigned char *frame = frames[0];
+    pts = num_references;
+    aom_tile_data frame_header_info = { 0, NULL, 0 };
+
+    // Need to decode frame header to get camera frame header info. So, here
+    // decoding 1 tile is enough.
+    AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0);
+    AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0);
+
+    aom_codec_err_t aom_status =
+        aom_codec_decode(&codec, frame, frame_size, NULL);
+    if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+    if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO,
+                                      &frame_header_info))
+      die_codec(&codec, "Failed to get the frame header info.");
+    // extra_size - 1 below would wrap around for an empty result and turn
+    // into an enormous copy length.
+    if (frame_header_info.coded_tile_data == NULL ||
+        frame_header_info.extra_size < 1)
+      die("Invalid camera frame header info.");
+
+    size_t obu_size_offset =
+        (uint8_t *)frame_header_info.coded_tile_data - frame;
+    size_t length_field_size = frame_header_info.coded_tile_data_size;
+    // Remove ext-tile tile info.
+    uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1;
+    size_t bytes_to_copy =
+        obu_size_offset + length_field_size + frame_header_size;
+
+    unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy);
+    if (frame_hdr_buf == NULL)
+      die_codec(&codec, "Failed to allocate frame header buffer.");
+
+    memcpy(frame_hdr_buf, frame, bytes_to_copy);
+
+    // Update frame header OBU size.
+    size_t bytes_written = 0;
+    if (aom_uleb_encode_fixed_size(
+            frame_header_size, length_field_size, length_field_size,
+            frame_hdr_buf + obu_size_offset, &bytes_written))
+      die_codec(&codec, "Failed to encode the tile list obu size.");
+
+    // Copy camera frame header bitstream.
+    if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy,
+                                      pts))
+      die_codec(&codec, "Failed to copy compressed camera frame header.");
+    free(frame_hdr_buf);
+  }
+
+  // Read out the image format.
+  aom_img_fmt_t ref_fmt = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+    die_codec(&codec, "Failed to get the image format");
+  const int bps = get_image_bps(ref_fmt);
+  if (!bps) die_codec(&codec, "Invalid image format.");
+  // read out the tile size.
+  unsigned int tile_size = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size))
+    die_codec(&codec, "Failed to get the tile size");
+  const unsigned int tile_width = tile_size >> 16;
+  const unsigned int tile_height = tile_size & 65535;
+  // A zero tile dimension would divide by zero below, and a tile larger than
+  // the output frame would wrap the uint8_t "in tiles minus 1" values.
+  if (tile_width == 0 || tile_height == 0 ||
+      tile_width > (unsigned int)output_frame_width ||
+      tile_height > (unsigned int)output_frame_height) {
+    die("Tile size %ux%u does not fit the %dx%d output frame.", tile_width,
+        tile_height, output_frame_width, output_frame_height);
+  }
+  // Allocate a buffer to store tile list bitstream. The product is computed
+  // in size_t so that large tiles do not overflow 32-bit arithmetic.
+  const size_t data_sz = (size_t)MAX_TILES *
+                         ALIGN_POWER_OF_TWO(tile_width, 5) *
+                         ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8;
+
+  unsigned char *tl_buf = (unsigned char *)malloc(data_sz);
+  if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer.");
+
+  aom_codec_pts_t tl_pts = num_references;
+  const uint8_t output_frame_width_in_tiles_minus_1 =
+      output_frame_width / tile_width - 1;
+  const uint8_t output_frame_height_in_tiles_minus_1 =
+      output_frame_height / tile_height - 1;
+
+  printf("Reading tile list from file.\n");
+  char line[1024];
+  FILE *tile_list_fptr = fopen(tile_list_file, "r");
+  if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file.");
+  int num_tiles = 0;
+  TILE_LIST_INFO tiles[MAX_TILES];
+  while ((fgets(line, 1024, tile_list_fptr)) != NULL) {
+    if (line[0] == 'F' || num_tiles >= MAX_TILES) {
+      // Flush existing tile list and start another, either because we hit a
+      // new render frame or because we've hit our max number of tiles per list.
+      if (num_tiles > 0) {
+        process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec,
+                          tl_buf, writer, output_frame_width_in_tiles_minus_1,
+                          output_frame_height_in_tiles_minus_1);
+        ++tl_pts;
+      }
+      num_tiles = 0;
+    }
+    if (line[0] == 'F') {
+      continue;
+    }
+    // Lines that do not hold four integers are ignored.
+    if (sscanf(line, "%d %d %d %d", &tiles[num_tiles].image_idx,
+               &tiles[num_tiles].reference_idx, &tiles[num_tiles].tile_col,
+               &tiles[num_tiles].tile_row) == 4) {
+      // The indices come straight from the text file and are later used as
+      // array subscripts, so both bounds must be checked.
+      if (tiles[num_tiles].image_idx < 0 ||
+          tiles[num_tiles].image_idx >= num_frames) {
+        die("Tile list image_idx out of bounds: %d not in [0, %d).",
+            tiles[num_tiles].image_idx, num_frames);
+      }
+      if (tiles[num_tiles].reference_idx < 0 ||
+          tiles[num_tiles].reference_idx >= num_references) {
+        die("Tile list reference_idx out of bounds: %d not in [0, %d).",
+            tiles[num_tiles].reference_idx, num_references);
+      }
+      ++num_tiles;
+    }
+  }
+  fclose(tile_list_fptr);
+  if (num_tiles > 0) {
+    // Flush out the last tile list.
+    process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec,
+                      tl_buf, writer, output_frame_width_in_tiles_minus_1,
+                      output_frame_height_in_tiles_minus_1);
+    ++tl_pts;
+  }
+
+  const int num_tile_lists = (int)(tl_pts - pts);
+  printf("Finished processing tile lists. Num tile lists: %d.\n",
+         num_tile_lists);
+  free(tl_buf);
+  for (int f = 0; f < num_frames; ++f) {
+    free(frames[f]);
+  }
+  free(frame_sizes);
+  free(frames);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_writer_close(writer);
+  aom_video_reader_close(reader);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c
new file mode 100644
index 0000000000..65b13efa1a
--- /dev/null
+++ b/third_party/aom/examples/lightfield_decoder.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Decoder
+// ==================
+//
+// This is an example of a simple lightfield decoder. It builds upon the
+// simple_decoder.c example. It takes an input file containing the compressed
+// data (in ivf format), treating it as a lightfield instead of a video; and a
+// text file with a list of tiles to decode. There is an optional parameter
+// allowing to choose the output format, and the supported formats are
+// YUV1D(default), YUV, and NV12.
+// After running the lightfield encoder, run lightfield decoder to decode a
+// batch of tiles:
+// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 tile_list.txt
+// 0(optional)
+// The tile_list.txt is expected to be of the form:
+// Frame <frame_index0>
+// <image_index0> <anchor_index0> <tile_col0> <tile_row0>
+// <image_index1> <anchor_index1> <tile_col1> <tile_row1>
+// ...
+// Frame <frame_index1>
+// ...
+//
+// The "Frame" markers indicate a new render frame and thus a new tile list
+// will be started and the old one flushed. The image_indexN, anchor_indexN,
+// tile_colN, and tile_rowN identify an individual tile to be decoded and
+// to use anchor_indexN anchor image for MCP.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+// Output layouts for decoded tiles. main() parses the optional fifth
+// command-line argument as an integer and range-checks it against
+// [YUV1D, NV12], so the enumerator order defines the accepted values 0..2.
+enum {
+ YUV1D, // 1D tile output for conformance test.
+ YUV, // Tile output in YUV format.
+ NV12, // Tile output in NV12 format.
+} UENUM1BYTE(OUTPUT_FORMAT);
+
+static const char *exec_name;
+
+// Prints the expected command line to stderr and terminates the process with
+// EXIT_FAILURE.
+void usage_exit(void) {
+  FILE *const err = stderr;
+
+  fprintf(err,
+          "Usage: %s <infile> <outfile> <num_references> <tile_list> <output "
+          "format(optional)>\n",
+          exec_name);
+
+  exit(EXIT_FAILURE);
+}
+
+// Output frame size
+// These dimensions are passed to aom_img_alloc() for the stitched output
+// image, and the width is divided by the tile width to get tiles per row.
+static const int output_frame_width = 512;
+static const int output_frame_height = 512;
+
+// Copies the three planes of `src` into `dst`, placing the top-left corner of
+// the luma plane at (dst_row_offset, dst_col_offset). For the two chroma
+// planes the offsets are scaled down by dst's chroma shifts.
+static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst,
+                              int dst_row_offset, int dst_col_offset) {
+  // 1 for high-bitdepth source formats, 0 otherwise. Widths and column
+  // offsets are shifted left by this amount to turn them into byte counts.
+  const int hbd_shift = (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0;
+
+  for (int p = 0; p < 3; ++p) {
+    const int is_chroma = p > 0;
+    const int row_off =
+        is_chroma ? dst_row_offset >> dst->y_chroma_shift : dst_row_offset;
+    const int col_off =
+        is_chroma ? dst_col_offset >> dst->x_chroma_shift : dst_col_offset;
+
+    const int from_stride = src->stride[p];
+    const int to_stride = dst->stride[p];
+    const unsigned char *from = src->planes[p];
+    // col offset needs to be adjusted for HBD.
+    unsigned char *to =
+        dst->planes[p] + (row_off * to_stride + (col_off << hbd_shift));
+
+    const int row_bytes = aom_img_plane_width(src, p) << hbd_shift;
+    const int rows = aom_img_plane_height(src, p);
+
+    for (int r = 0; r < rows; ++r) {
+      memcpy(to, from, row_bytes);
+      from += from_stride;
+      to += to_stride;
+    }
+  }
+}
+
+// Decodes tile (tr, tc) of the compressed camera frame `frame`, using
+// reference_images[ref_idx] as external reference 0.
+//
+// On return *img_ptr points at the decoded tile image, and *output_bit_depth
+// and output->bit_depth are set to the tile's bit depth. Unless
+// `output_format` is YUV1D, the tile is also copied into `output` at the grid
+// position selected by *tile_idx (row-major order), and *tile_idx is
+// incremented.
+//
+// Any failure terminates the program through die()/die_codec().
+static void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame,
+                        size_t frame_size, int tr, int tc, int ref_idx,
+                        aom_image_t *reference_images, aom_image_t *output,
+                        int *tile_idx, unsigned int *output_bit_depth,
+                        aom_image_t **img_ptr, int output_format) {
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr);
+  AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc);
+
+  av1_ref_frame_t ref;
+  ref.idx = 0;
+  ref.use_external_ref = 1;
+  ref.img = reference_images[ref_idx];
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) {
+    die_codec(codec, "Failed to set reference frame.");
+  }
+
+  aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL);
+  if (aom_status) die_codec(codec, "Failed to decode tile.");
+
+  aom_codec_iter_t iter = NULL;
+  aom_image_t *img = aom_codec_get_frame(codec, &iter);
+  if (!img) die_codec(codec, "Failed to get frame.");
+  *img_ptr = img;
+
+  // aom_img_alloc() sets bit_depth as follows:
+  // output->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  // Use img->bit_depth(read from bitstream), so that aom_shift_img()
+  // works as expected.
+  output->bit_depth = img->bit_depth;
+  *output_bit_depth = img->bit_depth;
+
+  if (output_format != YUV1D) {
+    // read out the tile size.
+    unsigned int tile_size = 0;
+    if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
+      die_codec(codec, "Failed to get the tile size");
+    const unsigned int tile_width = tile_size >> 16;
+    const unsigned int tile_height = tile_size & 65535;
+    // Reject tile sizes that would divide by zero below or that cannot fit
+    // into the output frame at all.
+    if (tile_width == 0 || tile_height == 0 ||
+        tile_width > (unsigned int)output_frame_width ||
+        tile_height > (unsigned int)output_frame_height) {
+      die("Tile size %ux%u does not fit the %dx%d output frame.", tile_width,
+          tile_height, output_frame_width, output_frame_height);
+    }
+    const uint32_t output_frame_width_in_tiles =
+        output_frame_width / tile_width;
+    const uint32_t output_frame_height_in_tiles =
+        output_frame_height / tile_height;
+
+    // A tile index past the last grid cell would make the copy below write
+    // outside of the output image.
+    if (*tile_idx < 0 ||
+        (uint32_t)*tile_idx >=
+            output_frame_width_in_tiles * output_frame_height_in_tiles) {
+      die("Too many tiles for one output frame: tile %d, capacity %u.",
+          *tile_idx,
+          (unsigned int)(output_frame_width_in_tiles *
+                         output_frame_height_in_tiles));
+    }
+
+    // Copy the tile to the output frame.
+    const int row_offset =
+        (*tile_idx / output_frame_width_in_tiles) * tile_height;
+    const int col_offset =
+        (*tile_idx % output_frame_width_in_tiles) * tile_width;
+
+    aom_img_copy_tile(img, output, row_offset, col_offset);
+    (*tile_idx)++;
+  }
+}
+
+// Writes `img` to `file` with the writer that matches `output_format`:
+// aom_img_write() for YUV, aom_img_write_nv12() for NV12. Any other value is
+// reported through die().
+static void img_write_to_file(const aom_image_t *img, FILE *file,
+                              int output_format) {
+  switch (output_format) {
+    case YUV: aom_img_write(img, file); break;
+    case NV12: aom_img_write_nv12(img, file); break;
+    default: die("Invalid output format");
+  }
+}
+
+// Entry point:
+//   <infile> <outfile> <num_references> <tile_list> [output format]
+//
+// 1. Decodes the first num_references frames of <infile> as anchor frames,
+//    keeps a copy of each in reference_images[] and dumps it to ref_<i>.yuv.
+// 2. Loads the remaining (camera) frames into memory.
+// 3. Reads <tile_list>, decodes every listed tile and writes the result to
+//    <outfile>: one image per tile for YUV1D, or one stitched output frame
+//    per "Frame" section for YUV/NV12.
+//
+// Every failure terminates the program through die()/die_codec().
+int main(int argc, char **argv) {
+  FILE *outfile = NULL;
+  AvxVideoReader *reader = NULL;
+  const AvxVideoInfo *info = NULL;
+  int num_references;
+  aom_img_fmt_t ref_fmt = 0;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  aom_image_t output;
+  aom_image_t *output_shifted = NULL;
+  size_t frame_size = 0;
+  const unsigned char *frame = NULL;
+  int i, j;
+  const char *tile_list_file = NULL;
+  int output_format = YUV1D;
+  exec_name = argv[0];
+
+  if (argc < 5) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+
+  num_references = (int)strtol(argv[3], NULL, 0);
+  // reference_images[] is a fixed-size stack array; a larger count would
+  // write past its end.
+  if (num_references < 0 || num_references > MAX_EXTERNAL_REFERENCES)
+    die("Number of references out of range [0, %d]: %d.",
+        MAX_EXTERNAL_REFERENCES, num_references);
+  tile_list_file = argv[4];
+
+  if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0);
+  if (output_format < YUV1D || output_format > NV12)
+    die("Output format out of range [0, 2]");
+
+  info = aom_video_reader_get_info(reader);
+
+  aom_codec_iface_t *decoder;
+  if (info->codec_fourcc == LST_FOURCC)
+    decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
+  else
+    die("Unknown input codec.");
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die_codec(&codec, "Failed to initialize decoder.");
+
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
+                                    info->is_annexb)) {
+    die("Failed to set annex b status");
+  }
+
+  // Decode anchor frames.
+  AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
+  for (i = 0; i < num_references; ++i) {
+    // The input may hold fewer frames than num_references; fail explicitly
+    // instead of decoding whatever the reader returns after a failed read.
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to read reference frame %d.", i);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    if (i == 0) {
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+        die_codec(&codec, "Failed to get the image format");
+
+      int frame_res[2];
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+        die_codec(&codec, "Failed to get the image frame size");
+
+      // Allocate memory to store decoded references. Allocate memory with the
+      // border so that it can be used as a reference.
+      for (j = 0; j < num_references; j++) {
+        unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
+        if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+                                       frame_res[0], frame_res[1], 32, 8,
+                                       border)) {
+          die("Failed to allocate references.");
+        }
+      }
+    }
+
+    if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                                      &reference_images[i]))
+      die_codec(&codec, "Failed to copy decoded reference frame");
+
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      char name[1024];
+      snprintf(name, sizeof(name), "ref_%d.yuv", i);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
+      FILE *ref_file = fopen(name, "wb");
+      if (!ref_file) die("Failed to open %s for writing.", name);
+      aom_img_write(img, ref_file);
+      fclose(ref_file);
+    }
+  }
+
+  FILE *infile = aom_video_reader_get_file(reader);
+  // Record the offset of the first camera image.
+  const FileOffset camera_frame_pos = ftello(infile);
+
+  printf("Loading compressed frames into memory.\n");
+
+  // Count the frames in the lightfield.
+  int num_frames = 0;
+  while (aom_video_reader_read_frame(reader)) {
+    ++num_frames;
+  }
+  if (num_frames < 1) die("Input light field has no frames.");
+
+  // Read all of the lightfield frames into memory.
+  unsigned char **frames =
+      (unsigned char **)malloc(num_frames * sizeof(unsigned char *));
+  size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t));
+  if (!(frames && frame_sizes)) die("Failed to allocate frame data.");
+  // Seek to the first camera image.
+  fseeko(infile, camera_frame_pos, SEEK_SET);
+  for (int f = 0; f < num_frames; ++f) {
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to re-read camera frame %d.", f);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+    frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char));
+    if (!frames[f]) die("Failed to allocate frame data.");
+    memcpy(frames[f], frame, frame_size);
+    frame_sizes[f] = frame_size;
+  }
+  printf("Read %d frames.\n", num_frames);
+
+  if (output_format != YUV1D) {
+    // Allocate the output frame.
+    aom_img_fmt_t out_fmt = ref_fmt;
+    if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+    if (!aom_img_alloc(&output, out_fmt, output_frame_width,
+                       output_frame_height, 32))
+      die("Failed to allocate output image.");
+  }
+
+  printf("Decoding tile list from file.\n");
+  char line[1024];
+  FILE *tile_list_fptr = fopen(tile_list_file, "r");
+  if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file.");
+  int tile_list_cnt = 0;
+  int tile_list_writes = 0;
+  int tile_idx = 0;
+  aom_image_t *out = NULL;
+  unsigned int output_bit_depth = 0;
+
+  while ((fgets(line, 1024, tile_list_fptr)) != NULL) {
+    if (line[0] == 'F') {
+      if (output_format != YUV1D) {
+        // Write out the tile list.
+        if (tile_list_cnt) {
+          out = &output;
+          if (output_bit_depth != 0) {
+            if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+              die("Error allocating image");
+            }
+          }
+          img_write_to_file(out, outfile, output_format);
+          tile_list_writes++;
+        }
+
+        tile_list_cnt++;
+        tile_idx = 0;
+        // Then memset the frame.
+        memset(output.img_data, 0, output.sz);
+      }
+      continue;
+    }
+
+    int image_idx, ref_idx, tc, tr;
+    // Ignore lines that do not hold four integers (for example blank lines);
+    // otherwise the indices below would be used uninitialized.
+    if (sscanf(line, "%d %d %d %d", &image_idx, &ref_idx, &tc, &tr) != 4)
+      continue;
+    // The indices come straight from the text file and are used as array
+    // subscripts, so both bounds must be checked.
+    if (image_idx < 0 || image_idx >= num_frames) {
+      die("Tile list image_idx out of bounds: %d not in [0, %d).", image_idx,
+          num_frames);
+    }
+    if (ref_idx < 0 || ref_idx >= num_references) {
+      die("Tile list ref_idx out of bounds: %d not in [0, %d).", ref_idx,
+          num_references);
+    }
+    frame = frames[image_idx];
+    frame_size = frame_sizes[image_idx];
+
+    aom_image_t *img = NULL;
+    decode_tile(&codec, frame, frame_size, tr, tc, ref_idx, reference_images,
+                &output, &tile_idx, &output_bit_depth, &img, output_format);
+    if (output_format == YUV1D) {
+      out = img;
+      if (output_bit_depth != 0) {
+        if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+          die("Error allocating image");
+        }
+      }
+      aom_img_write(out, outfile);
+    }
+  }
+  fclose(tile_list_fptr);
+
+  if (output_format != YUV1D) {
+    // Write out the last tile list.
+    if (tile_list_writes < tile_list_cnt) {
+      out = &output;
+      if (output_bit_depth != 0) {
+        if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) {
+          die("Error allocating image");
+        }
+      }
+      img_write_to_file(out, outfile, output_format);
+    }
+  }
+
+  if (output_shifted) aom_img_free(output_shifted);
+  if (output_format != YUV1D) aom_img_free(&output);
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+  for (int f = 0; f < num_frames; ++f) {
+    free(frames[f]);
+  }
+  free(frame_sizes);
+  free(frames);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c
new file mode 100644
index 0000000000..9aef836ac2
--- /dev/null
+++ b/third_party/aom/examples/lightfield_encoder.c
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Encoder
+// ==================
+//
+// This is an example of a simple lightfield encoder. It builds upon the
+// twopass_encoder.c example. It takes an input file in YV12 format,
+// treating it as a planar lightfield instead of a video. The img_width
+// and img_height arguments are the dimensions of the lightfield images,
+// while the lf_width and lf_height arguments are the number of
+// lightfield images in each dimension. The lf_blocksize determines the
+// number of reference images used for MCP. For example, 5 means that there
+// is a reference image for every 5x5 lightfield image block. All images
+// within a block will use the center image in that block as the reference
+// image for MCP.
+// Run "make test" to download lightfield test data: vase10x10.yuv.
+// Run lightfield encoder to encode whole lightfield:
+// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5
+
+// Note: In bitstream.c and encoder.c, defining EXT_TILE_DEBUG as 1 will print
+// out the uncompressed header and the frame contexts, which can be used to
+// test the bit exactness of the headers and the frame contexts for large scale
+// tile coded frames.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/encoder_utils.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Writes the command-line synopsis for this example to stderr and terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s "
+          "<img_width> <img_height> <infile> <outfile> "
+          "<lf_width> <lf_height> <lf_blocksize>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Sums, over planes 0..2 of |img|, the plane width (as reported by
+// aom_img_plane_width) times the plane height (aom_img_plane_height). The
+// width is doubled when the format has AOM_IMG_FMT_HIGHBITDEPTH set, so the
+// result is a byte count.
+static int img_size_bytes(aom_image_t *img) {
+  const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+  int total = 0;
+  for (int p = 0; p < 3; ++p) {
+    const int row_bytes = aom_img_plane_width(img, p) * bytes_per_sample;
+    const int rows = aom_img_plane_height(img, p);
+    total += row_bytes * rows;
+  }
+  return total;
+}
+
+// Hands |img| (NULL is used by the caller to flush) to the encoder and appends
+// the payload of every AOM_CODEC_STATS_PKT it emits to |stats|, growing
+// stats->buf with realloc(). Terminates via die_codec()/die() if encoding or
+// the allocation fails. Returns 1 when the encoder produced at least one
+// packet of any kind, 0 otherwise.
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  if (aom_codec_encode(ctx, img, pts, duration, flags) != AOM_CODEC_OK)
+    die_codec(ctx, "Failed to get frame stats.");
+
+  int produced_output = 0;
+  aom_codec_iter_t it = NULL;
+  const aom_codec_cx_pkt_t *p;
+  while ((p = aom_codec_get_cx_data(ctx, &it)) != NULL) {
+    produced_output = 1;
+    if (p->kind != AOM_CODEC_STATS_PKT) continue;
+
+    const size_t chunk = p->data.twopass_stats.sz;
+    const size_t old_size = stats->sz;
+    stats->buf = realloc(stats->buf, old_size + chunk);
+    if (!stats->buf) die("Failed to allocate frame stats buffer.");
+    // Append the new stats right after the bytes collected so far.
+    memcpy((uint8_t *)stats->buf + old_size, p->data.twopass_stats.buf, chunk);
+    stats->sz = old_size + chunk;
+  }
+
+  return produced_output;
+}
+
+// Hands |img| (NULL is used by the caller to flush) to the encoder and writes
+// every AOM_CODEC_CX_FRAME_PKT it emits to |writer|, printing "K" for key
+// frames and "." for all other frames as progress. Terminates via die_codec()
+// if encoding or writing fails. Returns 1 when the encoder produced at least
+// one packet of any kind, 0 otherwise.
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  if (aom_codec_encode(ctx, img, pts, duration, flags) != AOM_CODEC_OK)
+    die_codec(ctx, "Failed to encode frame.");
+
+  int produced_output = 0;
+  aom_codec_iter_t it = NULL;
+  const aom_codec_cx_pkt_t *p;
+  while ((p = aom_codec_get_cx_data(ctx, &it)) != NULL) {
+    produced_output = 1;
+    if (p->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (p->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    const int written = aom_video_writer_write_frame(
+        writer, p->data.frame.buf, p->data.frame.sz, p->data.frame.pts);
+    if (!written) die_codec(ctx, "Failed to write compressed frame.");
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced_output;
+}
+
+// Selects the image that should be passed to the encoder. Without
+// FORCE_HIGHBITDEPTH_DECODING this is |raw| itself; with it, |raw| is copied
+// into |raw_shift| through aom_img_upshift() with a shift of 0 and
+// |raw_shift| is selected instead.
+static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
+                          aom_image_t *raw_shift) {
+  if (!FORCE_HIGHBITDEPTH_DECODING) {
+    *frame_to_encode = raw;
+    return;
+  }
+
+  // Need the larger buffer to use hbd internal.
+  const int input_shift = 0;
+  aom_img_upshift(raw_shift, raw, input_shift);
+  *frame_to_encode = raw_shift;
+}
+
+// Runs the first pass. An encoder is initialized from |cfg| and |flags|; the
+// center image of every lf_blocksize x lf_blocksize block is submitted first,
+// then every image is submitted block by block. The stats packets emitted by
+// the encoder are accumulated by get_frame_stats() into a buffer that is
+// returned to the caller. Terminates via die()/die_codec() on codec errors.
+//
+// |raw| is the scratch image frames are read into from |infile|; |raw_shift|
+// is only used by get_raw_image().
+//
+// NOTE(review): the fseek() offsets are computed with int arithmetic and the
+// results of fseek() and aom_img_read() are not checked here -- confirm the
+// inputs are always small and complete enough for that to be safe.
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+ aom_codec_iface_t *encoder,
+ const aom_codec_enc_cfg_t *cfg, int lf_width,
+ int lf_height, int lf_blocksize, int flags,
+ aom_image_t *raw_shift) {
+ aom_codec_ctx_t codec;
+ int frame_count = 0;
+ int image_size_bytes = img_size_bytes(raw);
+ int u_blocks, v_blocks;
+ int bu, bv;
+ aom_fixed_buf_t stats = { NULL, 0 };
+ aom_image_t *frame_to_encode;
+
+ if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+ die("Failed to initialize encoder");
+ if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+ die_codec(&codec, "Failed to turn off auto altref");
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+
+ // How many reference images we need to encode.
+ // (Ceiling division: a partial block at the edge still gets a reference.)
+ u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+ v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+ printf("\n First pass: ");
+
+ // First loop: one reference (center) image per block.
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u_block_size, v_block_size;
+ int block_ref_u, block_ref_v;
+
+ // Clamp the block to the lightfield bounds, then pick its center image.
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ u_block_size = block_u_end - block_u_min;
+ v_block_size = block_v_end - block_v_min;
+ block_ref_u = block_u_min + u_block_size / 2;
+ block_ref_v = block_v_min + v_block_size / 2;
+
+ printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+ // Images are stored back to back in row-major (v, u) order.
+ fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+ SEEK_SET);
+ aom_img_read(raw, infile);
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+ // Reference frames can be encoded without tiles.
+ ++frame_count;
+ get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF,
+ &stats);
+ }
+ }
+
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+
+ // Second loop: every image of the lightfield, visited block by block.
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u, v;
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ for (v = block_v_min; v < block_v_end; ++v) {
+ for (u = block_u_min; u < block_u_end; ++u) {
+ printf("C%d, ", (u + v * lf_width));
+ fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
+ aom_img_read(raw, infile);
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+ ++frame_count;
+ get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+ &stats);
+ }
+ }
+ }
+ }
+ // Flush encoder.
+ // No ARF, this should not be needed.
+ while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+ }
+
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ printf("\nFirst pass complete. Processed %d frames.\n", frame_count);
+
+ return stats;
+}
+
+// Runs the second (final) pass and writes the bitstream to |outfile_name| as
+// an IVF file.
+//
+// The center image of every lf_blocksize x lf_blocksize block is encoded
+// first and its reconstruction is copied into reference_images[]. The
+// encoder is then reconfigured for large-scale-tile, fixed-q coding and every
+// image is encoded block by block, using its block's reconstruction as an
+// external reference. |cfg| is modified (large_scale_tile, rc_end_usage).
+// |raw| is the scratch image frames are read into from |infile|; |raw_shift|
+// is only used by get_raw_image(). Terminates via die()/die_codec() on any
+// failure.
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+                  aom_codec_iface_t *encoder, aom_codec_enc_cfg_t *cfg,
+                  int lf_width, int lf_height, int lf_blocksize, int flags,
+                  aom_image_t *raw_shift) {
+  AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder),
+                        cfg->g_w,
+                        cfg->g_h,
+                        { cfg->g_timebase.num, cfg->g_timebase.den },
+                        0 };
+  AvxVideoWriter *writer = NULL;
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  int image_size_bytes = img_size_bytes(raw);
+  int bu, bv;
+  int u_blocks, v_blocks;
+  aom_image_t *frame_to_encode;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  int reference_image_num = 0;
+  int i;
+
+  writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing", outfile_name);
+
+  if (aom_codec_enc_init(&codec, encoder, cfg, flags))
+    die("Failed to initialize encoder");
+  if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&codec, "Failed to turn off auto altref");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+  if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1))
+    die_codec(&codec, "Failed to enable encoder ext_tile debug");
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 3))
+    die_codec(&codec, "Failed to set cpu-used");
+
+  // Note: The superblock is a sequence parameter and has to be the same for 1
+  // sequence. In lightfield application, must choose the superblock size(either
+  // 64x64 or 128x128) before the encoding starts. Otherwise, the default is
+  // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64
+  // internally.
+  if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE,
+                        AOM_SUPERBLOCK_SIZE_64X64))
+    die_codec(&codec, "Failed to set SB size");
+
+  // Ceiling division: a partial block at the edge still gets a reference.
+  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+  reference_image_num = u_blocks * v_blocks;
+  // reference_images[] has a fixed capacity. Check it explicitly instead of
+  // relying on a later encoder control to reject an oversized count.
+  if (reference_image_num > MAX_EXTERNAL_REFERENCES) {
+    die("Too many reference images: %d > %d.", reference_image_num,
+        MAX_EXTERNAL_REFERENCES);
+  }
+  // Set the max gf group length so the references are guaranteed to be in
+  // a different gf group than any of the regular frames. This avoids using
+  // both vbr and constant quality mode in a single group. The number of
+  // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of
+  // 16. If it is necessary to exceed this reference frame limit, one will have
+  // to do some additional handling to ensure references are in separate gf
+  // groups from the regular frames.
+  if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL,
+                        reference_image_num - 1))
+    die_codec(&codec, "Failed to set max gf interval");
+  aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  const bool resize =
+      codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode;
+  const bool all_intra = reference_image_num - 1 == 0;
+  int border_in_pixels =
+      av1_get_enc_border_size(resize, all_intra, BLOCK_64X64);
+
+  for (i = 0; i < reference_image_num; i++) {
+    if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w,
+                                   cfg->g_h, 32, 8, border_in_pixels)) {
+      die("Failed to allocate image.");
+    }
+  }
+
+  printf("\n Second pass: ");
+
+  // Encode reference images first.
+  printf("Encoding Reference Images\n");
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u_block_size, v_block_size;
+      int block_ref_u, block_ref_v;
+
+      // Clamp the block to the lightfield bounds, then pick its center image.
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+      u_block_size = block_u_end - block_u_min;
+      v_block_size = block_v_end - block_v_min;
+      block_ref_u = block_u_min + u_block_size / 2;
+      block_ref_v = block_v_min + v_block_size / 2;
+
+      printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+      fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+            SEEK_SET);
+      aom_img_read(raw, infile);
+
+      get_raw_image(&frame_to_encode, raw, raw_shift);
+
+      // Reference frames may be encoded without tiles.
+      ++frame_count;
+      printf("Encoding reference image %d of %d\n", bv * u_blocks + bu,
+             u_blocks * v_blocks);
+      encode_frame(&codec, frame_to_encode, frame_count, 1,
+                   AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                       AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                       AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                       AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                       AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+                   writer);
+
+      // frame_count was just incremented, so slot frame_count - 1 equals
+      // bv * u_blocks + bu, the index used when the reference is looked up.
+      if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                            &reference_images[frame_count - 1]))
+        die_codec(&codec, "Failed to copy decoder reference frame");
+    }
+  }
+
+  cfg->large_scale_tile = 1;
+  // Fixed q encoding for camera frames.
+  cfg->rc_end_usage = AOM_Q;
+  if (aom_codec_enc_config_set(&codec, cfg))
+    die_codec(&codec, "Failed to configure encoder");
+
+  // The fixed q value used in encoding.
+  if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36))
+    die_codec(&codec, "Failed to set cq level");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+  if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1))
+    die_codec(&codec, "Failed to turn on single tile decoding");
+  // Set tile_columns and tile_rows to MAX values, which guarantees the tile
+  // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution.
+  if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6))
+    die_codec(&codec, "Failed to set tile width");
+  if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6))
+    die_codec(&codec, "Failed to set tile height");
+
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u, v;
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+      for (v = block_v_min; v < block_v_end; ++v) {
+        for (u = block_u_min; u < block_u_end; ++u) {
+          // Every image in the block predicts from the block's reference.
+          av1_ref_frame_t ref;
+          ref.idx = 0;
+          ref.use_external_ref = 1;
+          ref.img = reference_images[bv * u_blocks + bu];
+          if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref))
+            die_codec(&codec, "Failed to set reference frame");
+
+          printf("C%d, ", (u + v * lf_width));
+          fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
+          aom_img_read(raw, infile);
+          get_raw_image(&frame_to_encode, raw, raw_shift);
+
+          ++frame_count;
+          printf("Encoding image %d of %d\n",
+                 frame_count - (u_blocks * v_blocks), lf_width * lf_height);
+          encode_frame(&codec, frame_to_encode, frame_count, 1,
+                       AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                           AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                           AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                           AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                           AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+                       writer);
+        }
+      }
+    }
+  }
+
+  // Flush encoder.
+  // No ARF, this should not be needed.
+  while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
+  }
+
+  for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  // Modify large_scale_tile fourcc.
+  if (cfg->large_scale_tile == 1)
+    aom_video_writer_set_fourcc(writer, LST_FOURCC);
+  aom_video_writer_close(writer);
+
+  printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
+}
+
+// Entry point of the lightfield encoder example.
+//
+// Expects seven arguments: <img_width> <img_height> <infile> <outfile>
+// <lf_width> <lf_height> <lf_blocksize>. Runs pass0() to collect first-pass
+// stats, then pass1() to produce the output file. Returns EXIT_SUCCESS; all
+// failures terminate the process through die().
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  int w, h;
+  // The number of lightfield images in the u and v dimensions.
+  int lf_width, lf_height;
+  // Defines how many images refer to the same reference image for MCP.
+  // lf_blocksize X lf_blocksize images will all use the reference image
+  // in the middle of the block of images.
+  int lf_blocksize;
+  aom_codec_enc_cfg_t cfg;
+  aom_image_t raw;
+  aom_image_t raw_shift;
+  aom_codec_err_t res;
+  aom_fixed_buf_t stats;
+  int flags = 0;
+
+  const int fps = 30;
+  const int bitrate = 200;  // kbit/s
+
+  exec_name = argv[0];
+
+  // Validate the argument count before touching argv[1..7]; indexing past
+  // argv[argc] is undefined behavior.
+  if (argc < 8) die("Invalid number of arguments");
+
+  const char *const width_arg = argv[1];
+  const char *const height_arg = argv[2];
+  const char *const infile_arg = argv[3];
+  const char *const outfile_arg = argv[4];
+  const char *const lf_width_arg = argv[5];
+  const char *const lf_height_arg = argv[6];
+  const char *lf_blocksize_arg = argv[7];
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
+  if (!encoder) die("Unsupported codec.");
+
+  w = (int)strtol(width_arg, NULL, 0);
+  h = (int)strtol(height_arg, NULL, 0);
+  lf_width = (int)strtol(lf_width_arg, NULL, 0);
+  lf_height = (int)strtol(lf_height_arg, NULL, 0);
+  lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
+  // A block can never be larger than the lightfield itself.
+  lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width;
+  lf_blocksize = lf_blocksize < lf_height ? lf_blocksize : lf_height;
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+  if (lf_width <= 0 || lf_height <= 0)
+    die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height);
+  if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
+    die("Failed to allocate image.");
+  }
+  if (FORCE_HIGHBITDEPTH_DECODING) {
+    // Need to allocate larger buffer to use hbd internal.
+    if (!aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+                       w, h, 32)) {
+      die("Failed to allocate image.");
+    }
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  // No codec context exists yet, so report the error code directly instead of
+  // handing an uninitialized context to die_codec().
+  if (res) {
+    die("Failed to get default codec config: %s", aom_codec_err_to_string(res));
+  }
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = 0;  // This is required.
+  cfg.g_lag_in_frames = 0;    // need to set this since default is 19.
+  cfg.kf_mode = AOM_KF_DISABLED;
+  cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
+  cfg.g_bit_depth = AOM_BITS_8;
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize,
+                flags, &raw_shift);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height,
+        lf_blocksize, flags, &raw_shift);
+  free(stats.buf);
+
+  if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift);
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c
new file mode 100644
index 0000000000..d71ff5b387
--- /dev/null
+++ b/third_party/aom/examples/lightfield_tile_list_decoder.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Tile List Decoder
+// ============================
+//
+// This is a lightfield tile list decoder example. It takes an input file that
+// contains the anchor frames that are references of the coded tiles, the camera
+// frame header, and tile list OBUs that include the tile information and the
+// compressed tile data. This input file is reconstructed from the encoded
+// lightfield ivf file, and is decodable by AV1 decoder. num_references is
+// the number of anchor frames coded at the beginning of the light field file.
+// num_tile_lists is the number of tile lists that need to be decoded. There is
+// an optional parameter that selects the output format; the supported
+// formats are YUV1D(default), YUV, and NV12.
+// Run lightfield tile list decoder to decode an AV1 tile list file:
+// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
+// 4 2 0(optional)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+// Output layouts selectable through the optional <output format> command-line
+// argument; main() accepts values in the range [YUV1D, NV12].
+enum {
+ YUV1D, // 1D tile output for conformance test.
+ YUV, // Tile output in YUV format.
+ NV12, // Tile output in NV12 format.
+} UENUM1BYTE(OUTPUT_FORMAT);
+
+static const char *exec_name;
+
+// Writes the command-line synopsis for this example to stderr and terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s "
+          "<infile> <outfile> <num_references> <num_tile_lists> "
+          "<output format(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Writes the decoded tiles contained in |img| to |file| one tile after
+// another ("1D" layout): for each tile, the rows of plane 0, then plane 1,
+// then plane 2.
+//
+// The tile dimensions and the number of tiles are queried from |codec|
+// through AV1D_GET_TILE_SIZE (width in the upper 16 bits, height in the lower
+// 16 bits) and AV1D_GET_TILE_COUNT. Terminates the process if either query
+// fails or if the reported tile geometry cannot be used to index |img|.
+static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img,
+                             FILE *file) {
+  // read out the tile size.
+  unsigned int tile_size = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
+    die_codec(codec, "Failed to get the tile size");
+  const unsigned int tile_width = tile_size >> 16;
+  const unsigned int tile_height = tile_size & 65535;
+  // A zero dimension would divide by zero below.
+  if (tile_width == 0 || tile_height == 0)
+    fatal("Invalid tile size: %ux%u", tile_width, tile_height);
+  const uint32_t output_frame_width_in_tiles = img->d_w / tile_width;
+
+  unsigned int tile_count = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count))
+    die_codec(codec, "Failed to get the tile count");
+  // The per-tile offsets divide by the number of tiles per row.
+  if (tile_count > 0 && output_frame_width_in_tiles == 0)
+    fatal("Output frame width %u is smaller than the tile width %u", img->d_w,
+          tile_width);
+
+  // Write tile to file.
+  const int shift = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+  unsigned int tile_idx;
+
+  for (tile_idx = 0; tile_idx < tile_count; ++tile_idx) {
+    // Top-left corner of this tile in luma samples.
+    const int row_offset =
+        (tile_idx / output_frame_width_in_tiles) * tile_height;
+    const int col_offset =
+        (tile_idx % output_frame_width_in_tiles) * tile_width;
+    int plane;
+
+    for (plane = 0; plane < 3; ++plane) {
+      const unsigned char *buf = img->planes[plane];
+      const int stride = img->stride[plane];
+      // Planes 1 and 2 are scaled by the chroma subsampling shifts.
+      const int roffset =
+          (plane > 0) ? row_offset >> img->y_chroma_shift : row_offset;
+      const int coffset =
+          (plane > 0) ? col_offset >> img->x_chroma_shift : col_offset;
+      // Row length in bytes; doubled for high bitdepth.
+      const int w = (plane > 0) ? ((tile_width >> img->x_chroma_shift) << shift)
+                                : (tile_width << shift);
+      const int h =
+          (plane > 0) ? (tile_height >> img->y_chroma_shift) : tile_height;
+      int y;
+
+      // col offset needs to be adjusted for HBD.
+      buf += roffset * stride + (coffset << shift);
+
+      for (y = 0; y < h; ++y) {
+        fwrite(buf, 1, w, file);
+        buf += stride;
+      }
+    }
+  }
+}
+
+// Entry point of the lightfield tile list decoder example.
+//
+// Expects <infile> <outfile> <num_references> <num_tile_lists> and an
+// optional <output format> (0 = YUV1D, 1 = YUV, 2 = NV12). The
+// |num_references| anchor frames are decoded first and kept as external
+// references (each one is also dumped to ref_<i>.yuv). The camera frame
+// header is decoded next, followed by |num_tile_lists| tile lists whose output
+// is written to <outfile>. Returns EXIT_SUCCESS; all failures terminate the
+// process.
+int main(int argc, char **argv) {
+  FILE *outfile = NULL;
+  AvxVideoReader *reader = NULL;
+  const AvxVideoInfo *info = NULL;
+  int num_references;
+  int num_tile_lists;
+  aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+  size_t frame_size = 0;
+  const unsigned char *frame = NULL;
+  int output_format = YUV1D;
+  int i, j, n;
+
+  exec_name = argv[0];
+
+  if (argc < 5) die("Invalid number of arguments.");
+
+  reader = aom_video_reader_open(argv[1]);
+  if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+  if (!(outfile = fopen(argv[2], "wb")))
+    die("Failed to open %s for writing.", argv[2]);
+
+  num_references = (int)strtol(argv[3], NULL, 0);
+  num_tile_lists = (int)strtol(argv[4], NULL, 0);
+
+  // reference_images[] has a fixed capacity and is indexed by values below
+  // num_references, so reject counts that do not fit.
+  if (num_references < 0 || num_references > MAX_EXTERNAL_REFERENCES) {
+    die("Number of references out of range [0, %d]: %d.",
+        MAX_EXTERNAL_REFERENCES, num_references);
+  }
+
+  if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0);
+  if (output_format < YUV1D || output_format > NV12)
+    die("Output format out of range [0, 2]");
+
+  info = aom_video_reader_get_info(reader);
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) die("Unknown input codec.");
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
+
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB,
+                                    info->is_annexb)) {
+    die_codec(&codec, "Failed to set annex b status");
+  }
+
+  // Decode anchor frames.
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0))
+    die_codec(&codec, "Failed to set tile mode");
+  for (i = 0; i < num_references; ++i) {
+    // A truncated input must not be decoded from a stale or missing buffer.
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to read anchor frame %d.", i);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    if (i == 0) {
+      aom_img_fmt_t ref_fmt = 0;
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+        die_codec(&codec, "Failed to get the image format");
+
+      int frame_res[2];
+      if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+        die_codec(&codec, "Failed to get the image frame size");
+
+      // Allocate memory to store decoded references. Allocate memory with the
+      // border so that it can be used as a reference.
+      for (j = 0; j < num_references; j++) {
+        unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
+        if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+                                       frame_res[0], frame_res[1], 32, 8,
+                                       border)) {
+          fatal("Failed to allocate references.");
+        }
+      }
+    }
+
+    if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+                                      &reference_images[i]))
+      die_codec(&codec, "Failed to copy decoded reference frame");
+
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      char name[1024];
+      snprintf(name, sizeof(name), "ref_%d.yuv", i);
+      printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h);
+      FILE *ref_file = fopen(name, "wb");
+      if (!ref_file) die("Failed to open %s for writing.", name);
+      aom_img_write(img, ref_file);
+      fclose(ref_file);
+    }
+  }
+
+  // Decode the lightfield.
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1))
+    die_codec(&codec, "Failed to set tile mode");
+
+  // Set external references.
+  av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references };
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR,
+                                    &set_ext_ref))
+    die_codec(&codec, "Failed to set external references");
+  // Must decode the camera frame header first.
+  if (!aom_video_reader_read_frame(reader))
+    die("Failed to read the camera frame header.");
+  frame = aom_video_reader_get_frame(reader, &frame_size);
+  if (aom_codec_decode(&codec, frame, frame_size, NULL))
+    die_codec(&codec, "Failed to decode the frame.");
+  // Decode tile lists one by one.
+  for (n = 0; n < num_tile_lists; n++) {
+    if (!aom_video_reader_read_frame(reader))
+      die("Failed to read tile list %d.", n);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode the tile list.");
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = aom_codec_get_frame(&codec, &iter);
+    if (!img) die_codec(&codec, "Failed to get frame.");
+
+    if (output_format == YUV1D)
+      // write the tile to the output file in 1D format.
+      write_tile_yuv1d(&codec, img, outfile);
+    else if (output_format == YUV)
+      aom_img_write(img, outfile);
+    else
+      // NV12 output format
+      aom_img_write_nv12(img, outfile);
+  }
+
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lossless_encoder.c b/third_party/aom/examples/lossless_encoder.c
new file mode 100644
index 0000000000..1971b9c9df
--- /dev/null
+++ b/third_party/aom/examples/lossless_encoder.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Writes a one-line description of this example followed by the command-line
+// synopsis to stderr, then terminates the process with EXIT_FAILURE. Does not
+// return.
+void usage_exit(void) {
+  fputs(
+      "lossless_encoder: Example demonstrating lossless encoding feature. "
+      "Supports raw input only.\n",
+      stderr);
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Hands |img| (NULL is used by the caller to flush) to the encoder with
+// |frame_index| as the presentation timestamp and a duration of 1, then writes
+// every AOM_CODEC_CX_FRAME_PKT it emits to |writer|. Prints "K" for key
+// frames and "." for all other frames as progress. Terminates via
+// die_codec() if encoding or writing fails. Returns 1 when the encoder
+// produced at least one packet of any kind, 0 otherwise.
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, AvxVideoWriter *writer) {
+  if (aom_codec_encode(codec, img, frame_index, 1, flags) != AOM_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  int produced_output = 0;
+  aom_codec_iter_t it = NULL;
+  const aom_codec_cx_pkt_t *p;
+  while ((p = aom_codec_get_cx_data(codec, &it)) != NULL) {
+    produced_output = 1;
+    if (p->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (p->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    const int written = aom_video_writer_write_frame(
+        writer, p->data.frame.buf, p->data.frame.sz, p->data.frame.pts);
+    if (!written) die_codec(codec, "Failed to write compressed frame");
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced_output;
+}
+
+// Entry point of the lossless encoder example.
+//
+// Expects <width> <height> <infile> <outfile>. Reads raw I420 frames from
+// <infile>, encodes them with AV1E_SET_LOSSLESS enabled and writes the result
+// to <outfile> as IVF. Returns EXIT_SUCCESS; all failures terminate the
+// process.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  aom_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  AvxVideoWriter *writer = NULL;
+  const int fps = 30;
+
+  exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
+  if (argc < 5) die("Invalid number of arguments");
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1");
+  if (!encoder) die("Unsupported codec.");
+
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = (int)strtol(argv[1], NULL, 0);
+  info.frame_height = (int)strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  // |codec| has not been initialized yet, so report the error code directly
+  // instead of handing an uninitialized context to die_codec().
+  if (res) {
+    die("Failed to get default codec config: %s", aom_codec_err_to_string(res));
+  }
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+
+  writer = aom_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1))
+    die_codec(&codec, "Failed to use lossless mode");
+
+  // Encode frames.
+  while (aom_img_read(&raw, infile)) {
+    encode_frame(&codec, &raw, frame_count++, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {
+  }
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/noise_model.c b/third_party/aom/examples/noise_model.c
new file mode 100644
index 0000000000..1de13267fc
--- /dev/null
+++ b/third_party/aom/examples/noise_model.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This is a sample binary to create noise params from input video.
+ *
+ * To allow for external denoising applications, this sample binary illustrates
+ * how to create a film grain table (film grain params as a function of time)
+ * from an input video and its corresponding denoised source.
+ *
+ * The --output-grain-table file can be passed as input to the encoder (in
+ * aomenc this is done through the "--film-grain-table" parameter).
+ *
+ * As an example, where the input source is an 854x480 yuv420p 8-bit video
+ * named "input.854_480.yuv" you would use steps similar to the following:
+ *
+ * # Run your denoiser (e.g, using hqdn3d filter):
+ * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \
+ * -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \
+ * denoised.854_480.yuv
+ *
+ * # Model the noise between the denoised version and original source:
+ * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \
+ * --input-denoised=denoised.854_480.yuv --input=input.854_480.yuv \
+ * --output-grain-table=film_grain.tbl
+ *
+ * # Encode with your favorite settings (including the grain table):
+ * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \
+ * --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \
+ * --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \
+ * -o denoised_with_grain_params.ivf denoised.854_480.yuv
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#if CONFIG_AV1_DECODER
+#include "av1/decoder/grain_synthesis.h"
+#endif
+
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Writes the one-line usage summary for this tool to stderr, substituting the
+// program name stored in exec_name, and terminates with EXIT_FAILURE.
+void usage_exit(void) {
+  static const char kUsageFormat[] =
+      "Usage: %s --input=<input> --input-denoised=<denoised> "
+      "--output-grain-table=<outfile> "
+      "See comments in noise_model.c for more information.\n";
+  fprintf(stderr, kUsageFormat, exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Command-line option definitions. The ARG_DEF arguments are, in order: the
+// short name (written without a leading dash, or NULL when there is none),
+// the long name, 1 if the option takes a value or 0 for a bare flag, and the
+// help text.
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Input width (if rawvideo)");
+static const arg_def_t height_arg =
+    ARG_DEF("h", "height", 1, "Input height (if rawvideo)");
+static const arg_def_t skip_frames_arg =
+    ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)");
+static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate");
+// The short name used to be "-i", the only definition here that embedded the
+// dash. NOTE(review): the matcher in common/args.c appears to supply the dash
+// itself, so the old spelling could only match "--i" -- confirm.
+static const arg_def_t input_arg = ARG_DEF("i", "input", 1, "Input filename");
+static const arg_def_t output_grain_table_arg =
+    ARG_DEF("n", "output-grain-table", 1, "Output noise file");
+static const arg_def_t input_denoised_arg =
+    ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only");
+// No short name: "b" belongs to --block-size, which parse_args() tests before
+// this option, so "-b" never selected the flat block finder anyway.
+static const arg_def_t flat_block_finder_arg =
+    ARG_DEF(NULL, "flat-block-finder", 1, "Run the flat block finder");
+static const arg_def_t block_size_arg =
+    ARG_DEF("b", "block-size", 1, "Block size");
+static const arg_def_t bit_depth_arg =
+    ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)");
+static const arg_def_t use_i422 =
+    ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422");
+static const arg_def_t use_i444 =
+    ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444");
+static const arg_def_t debug_file_arg =
+    ARG_DEF(NULL, "debug-file", 1, "File to output debug info");
+
+// Command-line settings for this tool. main() fills the structure with a
+// positional initializer, so the order of the fields must not change.
+typedef struct {
+ // Frame dimensions in pixels (--width / --height).
+ int width;
+ int height;
+ // Frame rate (--fps); main() uses it to derive the timestamps stored in the
+ // grain table.
+ struct aom_rational fps;
+ // Path of the source video (--input).
+ const char *input;
+ // Path of the denoised video (--input-denoised); main() requires it.
+ const char *input_denoised;
+ // Path of the film grain table to write (--output-grain-table); main() writes
+ // nothing when it is NULL.
+ const char *output_grain_table;
+ // AOM_IMG_FMT_* value selected by --i420/--i422/--i444; parse_args() ORs in
+ // AOM_IMG_FMT_HIGHBITDEPTH when bit_depth is above 8.
+ int img_fmt;
+ // Block edge length (--block-size) handed to the flat block finder and to the
+ // noise model update.
+ int block_size;
+ // Bit depth of the input samples (--bit-depth).
+ int bit_depth;
+ // Non-zero to run the flat block finder (--flat-block-finder); otherwise
+ // main() marks every block as flat.
+ int run_flat_block_finder;
+ // NOTE(review): nothing in this file reads or sets this field after main()
+ // initializes it to 0 -- confirm whether it is still needed.
+ int force_flat_psd;
+ // The noise model is updated only on frames whose index is a multiple of
+ // this value (--skip-frames).
+ int skip_frames;
+ // Optional path for debug output (--debug-file); NULL disables it.
+ const char *debug_file;
+} noise_model_args_t;
+
+// Fills noise_args from the NULL-terminated option list argv (the program
+// name must already have been skipped). Fields whose option is absent keep
+// the value the caller stored beforehand.
+//
+// --help prints the option list to stdout and exits successfully. An
+// unrecognized argument prints the option list to stderr and exits with
+// EXIT_FAILURE. A bit depth above 8 adds AOM_IMG_FMT_HIGHBITDEPTH to the image
+// format.
+static void parse_args(noise_model_args_t *noise_args, char **argv) {
+  struct arg arg;
+  // Every option handled in the loop below is listed here so that the usage
+  // text documents all of them.
+  static const arg_def_t *main_args[] = { &help,
+                                          &input_arg,
+                                          &fps_arg,
+                                          &width_arg,
+                                          &height_arg,
+                                          &block_size_arg,
+                                          &bit_depth_arg,
+                                          &flat_block_finder_arg,
+                                          &skip_frames_arg,
+                                          &output_grain_table_arg,
+                                          &input_denoised_arg,
+                                          &use_i420,
+                                          &use_i422,
+                                          &use_i444,
+                                          &debug_file_arg,
+                                          NULL };
+  // Advance by arg.argv_step, as photon_noise_table.c does, so that an option
+  // whose value is a separate argv entry ("-w 854") skips both entries rather
+  // than treating the value as the next option.
+  for (; *argv; argv += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &help, argv)) {
+      fprintf(stdout, "\nOptions:\n");
+      arg_show_usage(stdout, main_args);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &width_arg, argv)) {
+      noise_args->width = atoi(arg.val);
+    } else if (arg_match(&arg, &height_arg, argv)) {
+      noise_args->height = atoi(arg.val);
+    } else if (arg_match(&arg, &input_arg, argv)) {
+      noise_args->input = arg.val;
+    } else if (arg_match(&arg, &input_denoised_arg, argv)) {
+      noise_args->input_denoised = arg.val;
+    } else if (arg_match(&arg, &output_grain_table_arg, argv)) {
+      noise_args->output_grain_table = arg.val;
+    } else if (arg_match(&arg, &block_size_arg, argv)) {
+      noise_args->block_size = atoi(arg.val);
+    } else if (arg_match(&arg, &bit_depth_arg, argv)) {
+      noise_args->bit_depth = atoi(arg.val);
+    } else if (arg_match(&arg, &flat_block_finder_arg, argv)) {
+      noise_args->run_flat_block_finder = atoi(arg.val);
+    } else if (arg_match(&arg, &fps_arg, argv)) {
+      noise_args->fps = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &use_i420, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I420;
+    } else if (arg_match(&arg, &use_i422, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I422;
+    } else if (arg_match(&arg, &use_i444, argv)) {
+      noise_args->img_fmt = AOM_IMG_FMT_I444;
+    } else if (arg_match(&arg, &skip_frames_arg, argv)) {
+      noise_args->skip_frames = atoi(arg.val);
+    } else if (arg_match(&arg, &debug_file_arg, argv)) {
+      noise_args->debug_file = arg.val;
+    } else {
+      // A bad command line is an error: report it on stderr and fail so that
+      // calling scripts notice.
+      fprintf(stderr, "Unknown arg: %s\n\nUsage:\n", *argv);
+      arg_show_usage(stderr, main_args);
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (noise_args->bit_depth > 8) {
+    noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+}
+
+#if CONFIG_AV1_DECODER
+// Appends per-block luma statistics to debug_file as a matrix named "x".
+//
+// The denoised image is re-grained with the given parameters through
+// av1_add_film_grain(). For every block_size x block_size block the function
+// then prints four values: the flat_blocks flag, the mean of the raw samples,
+// the standard deviation of (raw - denoised) and the standard deviation of
+// (renoised - denoised). Plotting commands follow unless the format is a high
+// bit depth one.
+//
+// grain is modified: apply_grain, random_seed and bit_depth are overwritten.
+// Blocks on the right and bottom edges are clipped to the image, so only
+// samples inside the frame are read and averaged.
+// NOTE(review): the samples are always read as bytes, so the statistics are
+// presumably meaningless for 16-bit sample storage -- confirm.
+static void print_variance_y(FILE *debug_file, aom_image_t *raw,
+                             aom_image_t *denoised, const uint8_t *flat_blocks,
+                             int block_size, aom_film_grain_t *grain) {
+  aom_image_t renoised;
+  grain->apply_grain = 1;
+  grain->random_seed = 7391;
+  grain->bit_depth = raw->bit_depth;
+  if (!aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1)) {
+    fprintf(stderr, "Failed to allocate image.\n");
+    return;
+  }
+
+  if (av1_add_film_grain(grain, denoised, &renoised)) {
+    fprintf(stderr, "Internal failure in av1_add_film_grain().\n");
+    aom_img_free(&renoised);
+    return;
+  }
+
+  const int width = (int)raw->w;
+  const int height = (int)raw->h;
+  const int num_blocks_w = (width + block_size - 1) / block_size;
+  const int num_blocks_h = (height + block_size - 1) / block_size;
+  fprintf(debug_file, "x = [");
+  for (int by = 0; by < num_blocks_h; by++) {
+    // Clip the block to the frame; the last row/column of blocks may be
+    // partial when the frame size is not a multiple of block_size.
+    const int y_begin = by * block_size;
+    const int y_end =
+        y_begin + block_size < height ? y_begin + block_size : height;
+    for (int bx = 0; bx < num_blocks_w; bx++) {
+      const int x_begin = bx * block_size;
+      const int x_end =
+          x_begin + block_size < width ? x_begin + block_size : width;
+      double block_mean = 0;
+      double noise_std = 0, noise_mean = 0;
+      double renoise_std = 0, renoise_mean = 0;
+      for (int y = y_begin; y < y_end; ++y) {
+        // Each image is addressed with its own stride.
+        const uint8_t *raw_row = raw->planes[0] + y * raw->stride[0];
+        const uint8_t *denoised_row =
+            denoised->planes[0] + y * denoised->stride[0];
+        const uint8_t *renoised_row =
+            renoised.planes[0] + y * renoised.stride[0];
+        for (int x = x_begin; x < x_end; ++x) {
+          const double noise_v = (raw_row[x] - denoised_row[x]);
+          noise_mean += noise_v;
+          noise_std += noise_v * noise_v;
+
+          block_mean += raw_row[x];
+
+          const double renoise_v = (renoised_row[x] - denoised_row[x]);
+          renoise_mean += renoise_v;
+          renoise_std += renoise_v * renoise_v;
+        }
+      }
+      // Number of samples actually accumulated; always positive because the
+      // block starts inside the frame.
+      const int n = (y_end - y_begin) * (x_end - x_begin);
+      block_mean /= n;
+      noise_mean /= n;
+      renoise_mean /= n;
+      // E[v^2] - E[v]^2 can come out marginally negative through rounding;
+      // clamp so that sqrt() never yields NaN.
+      noise_std = sqrt(fmax(0.0, noise_std / n - noise_mean * noise_mean));
+      renoise_std =
+          sqrt(fmax(0.0, renoise_std / n - renoise_mean * renoise_mean));
+      fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ",
+              flat_blocks[by * num_blocks_w + bx], block_mean, noise_std,
+              renoise_std);
+    }
+    fprintf(debug_file, "\n");
+  }
+  fprintf(debug_file, "];\n");
+
+  if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    fprintf(stderr,
+            "Detailed debug info not supported for high bit"
+            "depth formats\n");
+  } else {
+    fprintf(debug_file, "figure(2); clf;\n");
+    fprintf(debug_file,
+            "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n");
+    fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n");
+    fprintf(debug_file,
+            "plot(linspace(0, 255, length(noise_strength_0)), "
+            "noise_strength_0, 'b');\n");
+    fprintf(debug_file,
+            "title('Scatter plot of intensity vs noise strength');\n");
+    fprintf(debug_file,
+            "legend('Actual', 'Estimated', 'Estimated strength');\n");
+    fprintf(debug_file, "figure(3); clf;\n");
+    fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n");
+    fprintf(debug_file, "title('Actual vs Estimated');\n");
+    fprintf(debug_file, "pause(3);\n");
+  }
+  aom_img_free(&renoised);
+}
+#endif
+
+// Dumps the current state of noise_model to debug_file as plotting commands.
+//
+// For each of the three channels the solution vector of the combined-state
+// strength solver is written as noise_strength_<c> and plotted. When the
+// decoder is compiled in, the grain parameters derived from the model are
+// additionally handed to print_variance_y(). The stream is flushed before
+// returning.
+static void print_debug_info(FILE *debug_file, aom_image_t *raw,
+                             aom_image_t *denoised, uint8_t *flat_blocks,
+                             int block_size, aom_noise_model_t *noise_model) {
+  // These are only consumed in the CONFIG_AV1_DECODER section below.
+  (void)raw;
+  (void)denoised;
+  (void)flat_blocks;
+  (void)block_size;
+
+  fputs("figure(3); clf;\n", debug_file);
+  fputs("figure(2); clf;\n", debug_file);
+  fputs("figure(1); clf;\n", debug_file);
+
+  int channel = 0;
+  while (channel < 3) {
+    const aom_equation_system_t *const system =
+        &noise_model->combined_state[channel].strength_solver.eqns;
+    fprintf(debug_file, "noise_strength_%d = [\n", channel);
+    for (int idx = 0; idx < system->n; ++idx) {
+      fprintf(debug_file, "%lf ", system->x[idx]);
+    }
+    fputs("];\n", debug_file);
+    fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", channel);
+    ++channel;
+  }
+  fputs("legend('Y', 'cb', 'cr');\n", debug_file);
+  fputs("title('Noise strength function');\n", debug_file);
+
+#if CONFIG_AV1_DECODER
+  aom_film_grain_t grain;
+  aom_noise_model_get_grain_parameters(noise_model, &grain);
+  print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain);
+#endif
+  fflush(debug_file);
+}
+
+// Entry point: builds a film grain table from a raw video and its denoised
+// counterpart.
+//
+// Frames are read in lockstep from --input and --input-denoised. On every
+// skip_frames-th frame the noise model is updated. Whenever the update reports
+// AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, the parameters gathered so far are
+// appended to the grain table for the interval [prev_timestamp,
+// cur_timestamp). A final entry covers the remaining time up to INT64_MAX. The
+// table is written to --output-grain-table when that option is given.
+int main(int argc, char *argv[]) {
+  noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420,
+                              32, 8, 1, 0, 1, NULL };
+  aom_image_t raw, denoised;
+  FILE *infile = NULL;
+  AvxVideoInfo info;
+
+  memset(&info, 0, sizeof(info));
+
+  (void)argc;
+  exec_name = argv[0];
+  parse_args(&args, argv + 1);
+
+  info.frame_width = args.width;
+  info.frame_height = args.height;
+  info.time_base.numerator = args.fps.den;
+  info.time_base.denominator = args.fps.num;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+  // Reject settings that would otherwise reach fopen(NULL) or a division by
+  // zero further down.
+  if (!args.input) die("--input file must be specified");
+  if (!args.input_denoised) die("--input-denoised file must be specified");
+  if (args.block_size <= 0) die("Invalid block size: %d", args.block_size);
+  if (args.skip_frames <= 0) {
+    die("Invalid skip-frames value: %d", args.skip_frames);
+  }
+  if (args.fps.num <= 0 || args.fps.den <= 0) {
+    die("Invalid frame rate: %d/%d", args.fps.num, args.fps.den);
+  }
+
+  if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height,
+                     1)) {
+    die("Failed to allocate image.");
+  }
+  if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+  infile = fopen(args.input, "rb");
+  if (!infile) {
+    die("Failed to open input file: %s", args.input);
+  }
+  fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]);
+
+  const int high_bd = args.bit_depth > 8;
+  const int block_size = args.block_size;
+  aom_flat_block_finder_t block_finder;
+  if (!aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth,
+                                  high_bd)) {
+    die("Failed to initialize the flat block finder.");
+  }
+
+  const int num_blocks_w = (info.frame_width + block_size - 1) / block_size;
+  const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
+  uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
+  if (!flat_blocks) die("Failed to allocate block data.");
+  // Sets the random seed on the first entry in the output table
+  int16_t random_seed = 7391;
+  aom_noise_model_t noise_model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth,
+                                      high_bd };
+  if (!aom_noise_model_init(&noise_model, params)) {
+    die("Failed to initialize the noise model.");
+  }
+
+  FILE *denoised_file = fopen(args.input_denoised, "rb");
+  if (!denoised_file) {
+    die("Unable to open input_denoised: %s", args.input_denoised);
+  }
+  FILE *debug_file = 0;
+  if (args.debug_file) {
+    debug_file = fopen(args.debug_file, "w");
+    // Debug output is optional, so carry on without it, but say so.
+    if (!debug_file) {
+      fprintf(stderr, "Warning: unable to open debug file: %s\n",
+              args.debug_file);
+    }
+  }
+  aom_film_grain_table_t grain_table = { 0, 0 };
+
+  int64_t prev_timestamp = 0;
+  int frame_count = 0;
+  while (aom_img_read(&raw, infile)) {
+    // The denoised stream must supply a frame for every source frame.
+    if (!aom_img_read(&denoised, denoised_file)) {
+      die("Unable to read input denoised file");
+    }
+    if (frame_count % args.skip_frames == 0) {
+      int num_flat_blocks = num_blocks_w * num_blocks_h;
+      memset(flat_blocks, 1, num_flat_blocks);
+      if (args.run_flat_block_finder) {
+        memset(flat_blocks, 0, num_flat_blocks);
+        num_flat_blocks = aom_flat_block_finder_run(
+            &block_finder, raw.planes[0], info.frame_width, info.frame_height,
+            info.frame_width, flat_blocks);
+        fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks);
+      }
+
+      const uint8_t *planes[3] = { raw.planes[0], raw.planes[1],
+                                   raw.planes[2] };
+      uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1],
+                                      denoised.planes[2] };
+      // Strides are shifted right by one for high bit depth input.
+      int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd,
+                         raw.stride[2] >> high_bd };
+      int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 };
+
+      fprintf(stdout, "Updating noise model...\n");
+      aom_noise_status_t status = aom_noise_model_update(
+          &noise_model, (const uint8_t *const *)planes,
+          (const uint8_t *const *)denoised_planes, info.frame_width,
+          info.frame_height, strides, chroma_sub, flat_blocks, block_size);
+
+      // Frame index scaled by 10000000 * fps.den / fps.num.
+      int64_t cur_timestamp =
+          frame_count * 10000000ULL * args.fps.den / args.fps.num;
+      if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+        fprintf(stdout,
+                "Noise type is different, updating parameters for time "
+                "[ %" PRId64 ", %" PRId64 ")\n",
+                prev_timestamp, cur_timestamp);
+        aom_film_grain_t grain;
+        aom_noise_model_get_grain_parameters(&noise_model, &grain);
+        grain.random_seed = random_seed;
+        random_seed = 0;
+        aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp,
+                                    &grain);
+        aom_noise_model_save_latest(&noise_model);
+        prev_timestamp = cur_timestamp;
+      }
+      if (debug_file) {
+        print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size,
+                         &noise_model);
+      }
+      fprintf(stdout, "Done noise model update, status = %d\n", status);
+    }
+    frame_count++;
+  }
+
+  aom_film_grain_t grain;
+  aom_noise_model_get_grain_parameters(&noise_model, &grain);
+  grain.random_seed = random_seed;
+  aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain);
+  if (args.output_grain_table) {
+    struct aom_internal_error_info error_info;
+    memset(&error_info, 0, sizeof(error_info));
+    if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table,
+                                                   args.output_grain_table,
+                                                   &error_info)) {
+      die("Unable to write output film grain table");
+    }
+  }
+  aom_film_grain_table_free(&grain_table);
+
+  // Release everything acquired above.
+  aom_noise_model_free(&noise_model);
+  aom_flat_block_finder_free(&block_finder);
+  aom_free(flat_blocks);
+  if (infile) fclose(infile);
+  if (denoised_file) fclose(denoised_file);
+  if (debug_file) fclose(debug_file);
+  aom_img_free(&raw);
+  aom_img_free(&denoised);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/photon_noise_table.c b/third_party/aom/examples/photon_noise_table.c
new file mode 100644
index 0000000000..d3a21a48ee
--- /dev/null
+++ b/third_party/aom/examples/photon_noise_table.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// This tool creates a film grain table, for use in stills and videos,
+// representing the noise that one would get by shooting with a digital camera
+// at a given light level. Much of the noise in digital images is photon shot
+// noise, which is due to the characteristics of photon arrival and grows in
+// standard deviation as the square root of the expected number of photons
+// captured.
+// https://www.photonstophotos.net/Emil%20Martinec/noise.html#shotnoise
+//
+// The proxy used by this tool for the amount of light captured is the ISO value
+// such that the focal plane exposure at the time of capture would have been
+// mapped by a 35mm camera to the output lightness observed in the image. That
+// is, if one were to shoot on a 35mm camera (36×24mm sensor) at the nominal
+// exposure for that ISO setting, the resulting image should contain noise of
+// the same order of magnitude as generated by this tool.
+//
+// Example usage:
+//
+// ./photon_noise_table --width=3840 --height=2160 --iso=25600 -o noise.tbl
+// # Then, for example:
+// aomenc --film-grain-table=noise.tbl ...
+// # Or:
+// avifenc -c aom -a film-grain-table=noise.tbl ...
+//
+// The (mostly) square-root relationship between light intensity and noise
+// amplitude holds in linear light, but AV1 streams are most often encoded
+// non-linearly, and the film grain is applied to those non-linear values.
+// Therefore, this tool must account for the non-linearity, and this is
+// controlled by the optional `--transfer-function` (or `-t`) parameter, which
+// specifies the tone response curve that will be used when encoding the actual
+// image. The default for this tool is sRGB, which is approximately similar to
+// an encoding gamma of 1/2.2 (i.e. a decoding gamma of 2.2) though not quite
+// identical.
+//
+// As alluded to above, the tool assumes that the image is taken from the
+// entirety of a 36×24mm (“35mm format”) sensor. If that assumption does not
+// hold, then a “35mm-equivalent ISO value” that can be passed to the tool can
+// be obtained by multiplying the true ISO value by the ratio of 36×24mm to the
+// area that was actually used. For formats that approximately share the same
+// aspect ratio, this is often expressed as the square of the “equivalence
+// ratio” which is the ratio of their diagonals. For example, APS-C (often
+// ~24×16mm) is said to have an equivalence ratio of 1.5 relative to the 35mm
+// format, and therefore ISO 1000 on APS-C and ISO 1000×1.5² = 2250 on 35mm
+// produce an image of the same lightness from the same amount of light spread
+// onto their respective surface areas (resulting in different focal plane
+// exposures), and those images will thus have similar amounts of noise if the
+// cameras are of similar technology. https://doi.org/10.1117/1.OE.57.11.110801
+//
+// The tool needs to know the resolution of the images to which its grain tables
+// will be applied so that it can know how the light on the sensor was shared
+// between its pixels. As a general rule, while a higher pixel count will lead
+// to more noise per pixel, when the final image is viewed at the same physical
+// size, that noise will tend to “average out” to the same amount over a given
+// area, since there will be more pixels in it which, in aggregate, will have
+// received essentially as much light. Put differently, the amount of noise
+// depends on the scale at which it is measured, and the decision for this tool
+// was to make that scale relative to the image instead of its constituent
+// samples. For more on this, see:
+//
+// https://www.photonstophotos.net/Emil%20Martinec/noise-p3.html#pixelsize
+// https://www.dpreview.com/articles/5365920428/the-effect-of-pixel-and-sensor-sizes-on-noise/2
+// https://www.dpreview.com/videos/7940373140/dpreview-tv-why-lower-resolution-sensors-are-not-better-in-low-light
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/grain_table.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+
+static const char *exec_name;
+
+// Names accepted by --transfer-function and the AOM_CICP_TC_* value each one
+// selects. find_transfer_function() has a case for every value listed here and
+// reports any other value as unimplemented.
+static const struct arg_enum_list transfer_functions[] = {
+ { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+ { "srgb", AOM_CICP_TC_SRGB }, { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+ { "hlg", AOM_CICP_TC_HLG }, ARG_ENUM_LIST_END
+};
+
+// Command-line option definitions: short name, long name, whether the option
+// takes a value (1) or is a flag (0), and the help text. Note that "h" is the
+// short form of --help here, so --height uses "l".
+static arg_def_t help_arg =
+ ARG_DEF("h", "help", 0, "Show the available options");
+static arg_def_t width_arg =
+ ARG_DEF("w", "width", 1, "Width of the image in pixels (required)");
+static arg_def_t height_arg =
+ ARG_DEF("l", "height", 1, "Height of the image in pixels (required)");
+static arg_def_t iso_arg = ARG_DEF(
+ "i", "iso", 1, "ISO setting indicative of the light level (required)");
+static arg_def_t output_arg =
+ ARG_DEF("o", "output", 1,
+ "Output file to which to write the film grain table (required)");
+// Enumerated option: its value must be one of the names in
+// transfer_functions.
+static arg_def_t transfer_function_arg =
+ ARG_DEF_ENUM("t", "transfer-function", 1,
+ "Transfer function used by the encoded image (default = sRGB)",
+ transfer_functions);
+
+// Prints the expected command line, with exec_name as the program name, to
+// stderr and exits with EXIT_FAILURE.
+void usage_exit(void) {
+  static const char kUsageFormat[] =
+      "Usage: %s [--transfer-function=<tf>] --width=<width> "
+      "--height=<height> --iso=<iso> --output=<output.tbl>\n";
+  fprintf(stderr, kUsageFormat, exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Describes one tone response curve: the pair of conversions between encoded
+// and linear values, plus the linear level treated as a mid-tone.
+typedef struct {
+ // Converts an encoded value to linear light. generate_photon_noise() calls
+ // it with inputs spanning [0, 1].
+ float (*to_linear)(float);
+ // Converts a linear value back to its encoded form. generate_photon_noise()
+ // calls it with inputs clamped to [0, 1].
+ float (*from_linear)(float);
+ // In linear output light. This would typically be 0.18 for SDR (this matches
+ // the definition of Standard Output Sensitivity from ISO 12232:2019), but in
+ // HDR, we certainly do not want to consider 18% of the maximum output a
+ // “mid-tone”, as it would be e.g. 1800 cd/m² for SMPTE ST 2084 (PQ).
+ float mid_tone;
+} transfer_function_t;
+
+static const transfer_function_t *find_transfer_function(
+ aom_transfer_characteristics_t tc);
+
+// Settings collected from the command line by parse_args().
+typedef struct {
+ // Image dimensions in pixels (--width / --height).
+ int width;
+ int height;
+ // ISO value given with --iso; generate_photon_noise() derives the mid-tone
+ // exposure from it as 10 / iso_setting.
+ int iso_setting;
+
+ // Curve selected with --transfer-function; parse_args() defaults it to the
+ // sRGB entry.
+ const transfer_function_t *transfer_function;
+
+ // Path of the film grain table to write (--output).
+ const char *output_filename;
+} photon_noise_args_t;
+
+// Parses argv[1..argc-1] into photon_noise_args.
+//
+// The transfer function defaults to sRGB. --width, --height, --iso and
+// --output are mandatory; a missing one is reported on stderr and the process
+// exits with EXIT_FAILURE. --help prints the option list and exits
+// successfully. An unrecognized argument, or a width, height or ISO value that
+// is not strictly positive, is reported through fatal().
+static void parse_args(int argc, char **argv,
+                       photon_noise_args_t *photon_noise_args) {
+  static const arg_def_t *args[] = { &help_arg,   &width_arg,
+                                     &height_arg, &iso_arg,
+                                     &output_arg, &transfer_function_arg,
+                                     NULL };
+  struct arg arg;
+  int width_set = 0, height_set = 0, iso_set = 0, output_set = 0, i;
+
+  photon_noise_args->transfer_function =
+      find_transfer_function(AOM_CICP_TC_SRGB);
+
+  // arg.argv_step is reset every iteration and then updated by a successful
+  // arg_match(), so an option with a separate value entry advances past both.
+  for (i = 1; i < argc; i += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &help_arg, argv + i)) {
+      arg_show_usage(stdout, args);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &width_arg, argv + i)) {
+      photon_noise_args->width = arg_parse_int(&arg);
+      width_set = 1;
+    } else if (arg_match(&arg, &height_arg, argv + i)) {
+      photon_noise_args->height = arg_parse_int(&arg);
+      height_set = 1;
+    } else if (arg_match(&arg, &iso_arg, argv + i)) {
+      photon_noise_args->iso_setting = arg_parse_int(&arg);
+      iso_set = 1;
+    } else if (arg_match(&arg, &output_arg, argv + i)) {
+      photon_noise_args->output_filename = arg.val;
+      output_set = 1;
+    } else if (arg_match(&arg, &transfer_function_arg, argv + i)) {
+      const aom_transfer_characteristics_t tc = arg_parse_enum(&arg);
+      photon_noise_args->transfer_function = find_transfer_function(tc);
+    } else {
+      fatal("unrecognized argument \"%s\", see --help for available options",
+            argv[i]);
+    }
+  }
+
+  if (!width_set) {
+    fprintf(stderr, "Missing required parameter --width\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!height_set) {
+    fprintf(stderr, "Missing required parameter --height\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!iso_set) {
+    fprintf(stderr, "Missing required parameter --iso\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!output_set) {
+    fprintf(stderr, "Missing required parameter --output\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // generate_photon_noise() divides by the ISO value and by the pixel count,
+  // so zero or negative values would produce infinite or meaningless noise
+  // levels.
+  if (photon_noise_args->width <= 0) {
+    fatal("--width must be a positive integer, got %d",
+          photon_noise_args->width);
+  }
+  if (photon_noise_args->height <= 0) {
+    fatal("--height must be a positive integer, got %d",
+          photon_noise_args->height);
+  }
+  if (photon_noise_args->iso_setting <= 0) {
+    fatal("--iso must be a positive integer, got %d",
+          photon_noise_args->iso_setting);
+  }
+}
+
+// Returns a when a > b, otherwise b.
+static float maxf(float a, float b) {
+  if (a > b) return a;
+  return b;
+}
+
+// Returns a when a < b, otherwise b.
+static float minf(float a, float b) {
+  if (a < b) return a;
+  return b;
+}
+
+// Pure power-law curves with exponent 2.2, and their inverses.
+static float gamma22_to_linear(float g) {
+  const float exponent = 2.2f;
+  return powf(g, exponent);
+}
+static float gamma22_from_linear(float l) {
+  const float exponent = 1 / 2.2f;
+  return powf(l, exponent);
+}
+
+// Pure power-law curves with exponent 2.8, and their inverses.
+static float gamma28_to_linear(float g) {
+  const float exponent = 2.8f;
+  return powf(g, exponent);
+}
+static float gamma28_from_linear(float l) {
+  const float exponent = 1 / 2.8f;
+  return powf(l, exponent);
+}
+
+// Decodes an sRGB-encoded value: a linear segment (division by 12.92) up to
+// and including 0.04045, and a 2.4 power law above it.
+static float srgb_to_linear(float srgb) {
+  if (srgb <= 0.04045f) {
+    return srgb / 12.92f;
+  }
+  return powf((srgb + 0.055f) / 1.055f, 2.4f);
+}
+
+// Encodes a linear value as sRGB: a linear segment (multiplication by 12.92)
+// up to and including 0.0031308, and a 1/2.4 power law above it.
+static float srgb_from_linear(float linear) {
+  if (linear <= 0.0031308f) {
+    return 12.92f * linear;
+  }
+  return 1.055f * powf(linear, 1 / 2.4f) - 0.055f;
+}
+
+// Constants and conversions for the curve that find_transfer_function()
+// associates with AOM_CICP_TC_SMPTE_2084 (PQ).
+// NOTE(review): both the encoded and the linear values are presumably
+// normalized to [0, 1], with linear 1.0 standing for 10000 cd/m² (the PQ
+// mid-tone is given as 26.f / 10000) -- confirm.
+static const float kPqM1 = 2610.f / 16384;
+static const float kPqM2 = 128 * 2523.f / 4096;
+static const float kPqC1 = 3424.f / 4096;
+static const float kPqC2 = 32 * 2413.f / 4096;
+static const float kPqC3 = 32 * 2392.f / 4096;
+// Decodes a PQ-encoded value to linear light.
+static float pq_to_linear(float pq) {
+ const float pq_pow_inv_m2 = powf(pq, 1.f / kPqM2);
+ // maxf() keeps the numerator from going negative for small inputs.
+ return powf(maxf(0, pq_pow_inv_m2 - kPqC1) / (kPqC2 - kPqC3 * pq_pow_inv_m2),
+ 1.f / kPqM1);
+}
+// Encodes a linear value with the PQ curve.
+static float pq_from_linear(float linear) {
+ const float linear_pow_m1 = powf(linear, kPqM1);
+ return powf((kPqC1 + kPqC2 * linear_pow_m1) / (1 + kPqC3 * linear_pow_m1),
+ kPqM2);
+}
+
+// Note: it is perhaps debatable whether “linear” for HLG should be scene light
+// or display light. Here, it is implemented in terms of display light assuming
+// a nominal peak display luminance of 1000 cd/m², hence the system γ of 1.2. To
+// make it scene light instead, the OOTF (powf(x, 1.2f)) and its inverse should
+// be removed from the functions below, and the .mid_tone should be replaced
+// with powf(26.f / 1000, 1 / 1.2f).
+static const float kHlgA = 0.17883277f;
+static const float kHlgB = 0.28466892f;
+static const float kHlgC = 0.55991073f;
+// Decodes an HLG-encoded value: a square law up to and including 0.5, an
+// exponential segment above it, followed by the 1.2 power described above.
+static float hlg_to_linear(float hlg) {
+ // EOTF = OOTF ∘ OETF⁻¹
+ const float linear =
+ hlg <= 0.5f ? hlg * hlg / 3 : (expf((hlg - kHlgC) / kHlgA) + kHlgB) / 12;
+ return powf(linear, 1.2f);
+}
+// Encodes a linear value as HLG: the 1.2 power is undone first, then a square
+// root is used up to and including 1/12 and a logarithmic segment above it.
+static float hlg_from_linear(float linear) {
+ // EOTF⁻¹ = OETF ∘ OOTF⁻¹
+ linear = powf(linear, 1.f / 1.2f);
+ return linear <= 1.f / 12 ? sqrtf(3 * linear)
+ : kHlgA * logf(12 * linear - kHlgB) + kHlgC;
+}
+
+// Returns the statically allocated curve description for tc. Characteristics
+// without an implementation here are reported through fatal().
+static const transfer_function_t *find_transfer_function(
+    aom_transfer_characteristics_t tc) {
+  static const transfer_function_t kGamma22TransferFunction = {
+    .to_linear = &gamma22_to_linear,
+    .from_linear = &gamma22_from_linear,
+    .mid_tone = 0.18f
+  };
+  static const transfer_function_t kGamma28TransferFunction = {
+    .to_linear = &gamma28_to_linear,
+    .from_linear = &gamma28_from_linear,
+    .mid_tone = 0.18f
+  };
+  static const transfer_function_t kSRgbTransferFunction = {
+    .to_linear = &srgb_to_linear,
+    .from_linear = &srgb_from_linear,
+    .mid_tone = 0.18f
+  };
+  static const transfer_function_t kPqTransferFunction = {
+    .to_linear = &pq_to_linear,
+    .from_linear = &pq_from_linear,
+    // https://www.itu.int/pub/R-REP-BT.2408-4-2021
+    // page 6 (PDF page 8)
+    .mid_tone = 26.f / 10000
+  };
+  static const transfer_function_t kHlgTransferFunction = {
+    .to_linear = &hlg_to_linear,
+    .from_linear = &hlg_from_linear,
+    .mid_tone = 26.f / 1000
+  };
+
+  const transfer_function_t *selected = NULL;
+  switch (tc) {
+    case AOM_CICP_TC_BT_470_M: selected = &kGamma22TransferFunction; break;
+    case AOM_CICP_TC_BT_470_B_G: selected = &kGamma28TransferFunction; break;
+    case AOM_CICP_TC_SRGB: selected = &kSRgbTransferFunction; break;
+    case AOM_CICP_TC_SMPTE_2084: selected = &kPqTransferFunction; break;
+    case AOM_CICP_TC_HLG: selected = &kHlgTransferFunction; break;
+
+    default: fatal("unimplemented transfer function %d", tc);
+  }
+  return selected;
+}
+
+// Fills film_grain with luma-only grain parameters modelling sensor noise for
+// the image size, ISO setting and transfer function in photon_noise_args.
+//
+// Fourteen luma scaling points are generated at evenly spaced encoded levels.
+// For each level the noise (read noise, photon shot noise and photo-response
+// non-uniformity, summed in quadrature) is evaluated in linear light and then
+// mapped to the encoded domain through the local slope of the transfer
+// function. No chroma points are produced. width, height and iso_setting are
+// expected to be strictly positive.
+static void generate_photon_noise(const photon_noise_args_t *photon_noise_args,
+                                  aom_film_grain_t *film_grain) {
+  // Assumes a daylight-like spectrum.
+  // https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s
+  static const float kPhotonsPerLxSPerUm2 = 11260;
+
+  // Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into
+  // account.
+  static const float kEffectiveQuantumEfficiency = 0.20f;
+
+  // Also reasonable values for current cameras. The read noise is typically
+  // higher than this at low ISO settings but it matters less there.
+  static const float kPhotoResponseNonUniformity = 0.005f;
+  static const float kInputReferredReadNoise = 1.5f;
+
+  const transfer_function_t *const tf = photon_noise_args->transfer_function;
+
+  // Focal plane exposure for a mid-tone (typically a 18% reflectance card), in
+  // lx·s.
+  const float mid_tone_exposure = 10.f / photon_noise_args->iso_setting;
+
+  // In microns. Assumes a 35mm sensor (36mm × 24mm). The pixel count is formed
+  // in floating point: as an int product it overflows once width * height
+  // exceeds INT_MAX.
+  const float pixel_count =
+      (float)photon_noise_args->width * (float)photon_noise_args->height;
+  const float pixel_area_um2 = (36000 * 24000.f) / pixel_count;
+
+  const float mid_tone_electrons_per_pixel = kEffectiveQuantumEfficiency *
+                                             kPhotonsPerLxSPerUm2 *
+                                             mid_tone_exposure * pixel_area_um2;
+  const float max_electrons_per_pixel =
+      mid_tone_electrons_per_pixel / tf->mid_tone;
+
+  int i;
+
+  film_grain->num_y_points = 14;
+  for (i = 0; i < film_grain->num_y_points; ++i) {
+    // Encoded level of this scaling point, evenly spaced over [0, 1].
+    float x = i / (film_grain->num_y_points - 1.f);
+    const float linear = tf->to_linear(x);
+    const float electrons_per_pixel = max_electrons_per_pixel * linear;
+    // Quadrature sum of the relevant sources of noise, in electrons rms. Photon
+    // shot noise is sqrt(electrons) so we can skip the square root and the
+    // squaring.
+    // https://en.wikipedia.org/wiki/Addition_in_quadrature
+    // https://doi.org/10.1117/3.725073
+    const float noise_in_electrons =
+        sqrtf(kInputReferredReadNoise * kInputReferredReadNoise +
+              electrons_per_pixel +
+              (kPhotoResponseNonUniformity * kPhotoResponseNonUniformity *
+               electrons_per_pixel * electrons_per_pixel));
+    const float linear_noise = noise_in_electrons / max_electrons_per_pixel;
+    // Slope of the transfer function measured across +/- two noise standard
+    // deviations around the level, clamped to the valid [0, 1] range.
+    const float linear_range_start = maxf(0.f, linear - 2 * linear_noise);
+    const float linear_range_end = minf(1.f, linear + 2 * linear_noise);
+    const float tf_slope =
+        (tf->from_linear(linear_range_end) -
+         tf->from_linear(linear_range_start)) /
+        (linear_range_end - linear_range_start);
+    float encoded_noise = linear_noise * tf_slope;
+
+    // Scale both coordinates to integers, capping the noise at 255.
+    x = roundf(255 * x);
+    encoded_noise = minf(255.f, roundf(255 * 7.88f * encoded_noise));
+
+    film_grain->scaling_points_y[i][0] = (int)x;
+    film_grain->scaling_points_y[i][1] = (int)encoded_noise;
+  }
+
+  film_grain->apply_grain = 1;
+  film_grain->update_parameters = 1;
+  film_grain->num_cb_points = 0;
+  film_grain->num_cr_points = 0;
+  film_grain->scaling_shift = 8;
+  film_grain->ar_coeff_lag = 0;
+  film_grain->ar_coeffs_cb[0] = 0;
+  film_grain->ar_coeffs_cr[0] = 0;
+  film_grain->ar_coeff_shift = 6;
+  film_grain->cb_mult = 0;
+  film_grain->cb_luma_mult = 0;
+  film_grain->cb_offset = 0;
+  film_grain->cr_mult = 0;
+  film_grain->cr_luma_mult = 0;
+  film_grain->cr_offset = 0;
+  film_grain->overlap_flag = 1;
+  film_grain->random_seed = 7391;
+  film_grain->chroma_scaling_from_luma = 0;
+}
+
+// Entry point: parses the command line, generates a single set of film grain
+// parameters, stores it in a table entry spanning timestamps 0 to
+// 9223372036854775807 and writes the table to the requested output file.
+// Returns EXIT_FAILURE, after reporting the error on stderr, when the table
+// cannot be written.
+int main(int argc, char **argv) {
+  photon_noise_args_t photon_noise_args;
+  aom_film_grain_table_t film_grain_table;
+  aom_film_grain_t film_grain;
+  struct aom_internal_error_info error_info;
+  int write_succeeded;
+
+  memset(&photon_noise_args, 0, sizeof(photon_noise_args));
+  memset(&film_grain_table, 0, sizeof(film_grain_table));
+  memset(&film_grain, 0, sizeof(film_grain));
+  memset(&error_info, 0, sizeof(error_info));
+
+  exec_name = argv[0];
+  parse_args(argc, argv, &photon_noise_args);
+
+  generate_photon_noise(&photon_noise_args, &film_grain);
+  aom_film_grain_table_append(&film_grain_table, 0, 9223372036854775807ull,
+                              &film_grain);
+
+  write_succeeded =
+      aom_film_grain_table_write(&film_grain_table,
+                                 photon_noise_args.output_filename,
+                                 &error_info) == AOM_CODEC_OK;
+  // The table is released on both outcomes, before anything is reported.
+  aom_film_grain_table_free(&film_grain_table);
+
+  if (write_succeeded) {
+    return EXIT_SUCCESS;
+  }
+
+  fprintf(stderr, "Failed to write film grain table");
+  if (error_info.has_detail) {
+    fprintf(stderr, ": %s", error_info.detail);
+  }
+  fprintf(stderr, "\n");
+  return EXIT_FAILURE;
+}
diff --git a/third_party/aom/examples/scalable_decoder.c b/third_party/aom/examples/scalable_decoder.c
new file mode 100644
index 0000000000..00fe820fd5
--- /dev/null
+++ b/third_party/aom/examples/scalable_decoder.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Decoder
+// ==============
+//
+// This is an example of a scalable decoder loop. It takes a 2-spatial-layer
+// input file
+// containing the compressed data (in OBU format), passes it through the
+// decoder, and writes the decompressed frames to disk. The base layer and
+// enhancement layers are stored as separate files, out_lyr0.yuv and
+// out_lyr1.yuv, respectively.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `aom_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// av1.
+//
+// Initializing The Codec
+// ----------------------
+// The libaom decoder is initialized by the call to aom_codec_dec_init().
+// Determining the codec interface to use is handled by AvxVideoReader and the
+// functions prefixed with aom_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's an AVx file and which AVx
+// codec is contained within the file.
+// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `aom_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL. The `deadline` parameter is left at zero for this
+// example. This parameter is generally only used when doing adaptive post
+// processing.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `aom_codec_decode`. These frames are retrieved by the
+// `aom_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `aom_codec_decode` is called.
+// `aom_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the decoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+#define MAX_LAYERS 5
+
+// Prints the command-line synopsis for this tool to stderr, then terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  FILE *const out = stderr;
+  fprintf(out, "Usage: %s <infile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Decodes the OBU file named by argv[1] with all spatial layers enabled and
+// writes each layer's frames, downshifted to 8 bits, to out_lyr<N>.yuv in the
+// current directory. Any failure is reported through die()/die_codec().
+int main(int argc, char **argv) {
+  int frame_cnt = 0;
+  FILE *outfile[MAX_LAYERS];
+  char filename[80];
+  FILE *inputfile = NULL;
+  uint8_t *buf = NULL;
+  size_t bytes_in_buffer = 0;
+  size_t buffer_size = 0;
+  struct AvxInputContext aom_input_ctx;
+  struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 };
+  aom_codec_stream_info_t si;
+  uint8_t tmpbuf[32];
+  unsigned int i;
+
+  exec_name = argv[0];
+
+  if (argc != 2) die("Invalid number of arguments.");
+
+  if (!(inputfile = fopen(argv[1], "rb")))
+    die("Failed to open %s for read.", argv[1]);
+  obu_ctx.avx_ctx->file = inputfile;
+  obu_ctx.avx_ctx->filename = argv[1];
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_index(0);
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
+
+  if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) {
+    die_codec(&codec, "Failed to set output_all_layers control.");
+  }
+
+  // peek sequence header OBU to get number of spatial layers
+  const size_t ret = fread(tmpbuf, 1, 32, inputfile);
+  if (ret != 32) die_codec(&codec, "Input is not a valid obu file");
+  si.is_annexb = 0;
+  if (aom_codec_peek_stream_info(decoder, tmpbuf, 32, &si)) {
+    die_codec(&codec, "Input is not a valid obu file");
+  }
+  // Rewind past the peeked bytes so the OBU reader starts at the beginning.
+  fseek(inputfile, -32, SEEK_CUR);
+
+  // The layer count comes from the bitstream and indexes outfile[] below, so
+  // reject values that would run past the end of the array.
+  if (si.number_spatial_layers > MAX_LAYERS)
+    die_codec(&codec, "Invalid bitstream. Too many spatial layers");
+
+  if (!file_is_obu(&obu_ctx))
+    die_codec(&codec, "Input is not a valid obu file");
+
+  // open base layer output yuv file
+  snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0);
+  if (!(outfile[0] = fopen(filename, "wb")))
+    die("Failed to open output for writing.");
+
+  // open any enhancement layer output yuv files
+  for (i = 1; i < si.number_spatial_layers; i++) {
+    snprintf(filename, sizeof(filename), "out_lyr%u.yuv", i);
+    if (!(outfile[i] = fopen(filename, "wb")))
+      die("Failed to open output for writing.");
+  }
+
+  while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer,
+                                    &buffer_size)) {
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = NULL;
+    if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+      // Scratch image that receives the 8-bit copy of the decoded frame.
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16);
+      if (!img_shifted) die("Failed to allocate image.");
+      img_shifted->bit_depth = 8;
+      aom_img_downshift(img_shifted, img,
+                        img->bit_depth - img_shifted->bit_depth);
+      if (img->spatial_id == 0) {
+        printf("Writing base layer 0 %d\n", frame_cnt);
+        aom_img_write(img_shifted, outfile[0]);
+      } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) {
+        printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt);
+        aom_img_write(img_shifted, outfile[img->spatial_id]);
+      } else {
+        die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count");
+      }
+      // A frame is counted once its highest spatial layer has been output.
+      if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt;
+
+      // Release the per-frame scratch image; it was previously leaked on
+      // every output frame.
+      aom_img_free(img_shifted);
+    }
+  }
+
+  printf("Processed %d frames.\n", frame_cnt);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]);
+
+  fclose(inputfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/scalable_encoder.c b/third_party/aom/examples/scalable_encoder.c
new file mode 100644
index 0000000000..5bfd1840b2
--- /dev/null
+++ b/third_party/aom/examples/scalable_encoder.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Encoder
+// ==============
+//
+// This is an example of a scalable encoder loop. It takes two input files in
+// YV12 format, passes them through the encoder, and writes the compressed
+// frames to disk in OBU format.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vicinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size = width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_cnt` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// unused in this example.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the
+// flags passed to `aom_codec_encode()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write the raw data directly to the output file.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Prints the command-line synopsis for this tool to stderr, then terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  FILE *const out = stderr;
+  fprintf(out,
+          "Usage: %s <codec> <width> <height> <infile0> <infile1> "
+          "<outfile> <frames to encode>\n"
+          "See comments in scalable_encoder.c for more information.\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Hands |img| to the encoder (callers pass NULL when flushing) and appends
+// the payload of every compressed-frame packet it produces to |outfile|.
+// Prints 'K' for a keyframe or '.' otherwise, followed by the packet size.
+// Returns 1 if the encoder produced at least one packet of any kind, else 0.
+// Encode or write failures terminate the program through die_codec().
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, FILE *outfile) {
+  if (aom_codec_encode(codec, img, frame_index, 1, flags) != AOM_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  int produced_output = 0;
+  aom_codec_iter_t cursor = NULL;
+  const aom_codec_cx_pkt_t *packet = aom_codec_get_cx_data(codec, &cursor);
+
+  for (; packet != NULL; packet = aom_codec_get_cx_data(codec, &cursor)) {
+    produced_output = 1;
+
+    // Only compressed-frame packets carry data to be written.
+    if (packet->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (packet->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    const size_t written =
+        fwrite(packet->data.frame.buf, 1, packet->data.frame.sz, outfile);
+    if (written != packet->data.frame.sz) {
+      die_codec(codec, "Failed to write compressed frame");
+    }
+
+    printf(is_key ? "K" : ".");
+    printf(" %6d\n", (int)packet->data.frame.sz);
+    fflush(stdout);
+  }
+
+  return produced_output;
+}
+
+// Encodes two raw I420 inputs as spatial layer 0 (infile0) and spatial layer 1
+// (infile1) of one stream and writes the compressed frames to <outfile>.
+// Expects exactly seven arguments:
+//   <codec> <width> <height> <infile0> <infile1> <outfile> <frames to encode>
+// Any failure is reported through die()/die_codec().
+int main(int argc, char **argv) {
+  FILE *infile0 = NULL;
+  FILE *infile1 = NULL;
+  aom_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  aom_image_t raw0, raw1;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  const int fps = 30;
+  const int bitrate = 200;
+  int keyframe_interval = 0;
+  int max_frames = 0;
+  int frames_encoded = 0;
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile0_arg = NULL;
+  const char *infile1_arg = NULL;
+  const char *outfile_arg = NULL;
+  // const char *keyframe_interval_arg = NULL;
+  FILE *outfile = NULL;
+
+  exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
+  if (argc != 8) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile0_arg = argv[4];
+  infile1_arg = argv[5];
+  outfile_arg = argv[6];
+  max_frames = (int)strtol(argv[7], NULL, 0);
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  // Both dimensions must be positive and even.
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image for layer 0.");
+  }
+  if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image for layer 1.");
+  }
+
+  // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
+  keyframe_interval = 100;
+  if (keyframe_interval < 0) die("Invalid keyframe interval value.");
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  aom_codec_ctx_t codec;
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  // |codec| has not been initialized yet, so it must not be handed to
+  // die_codec() here; report the failure with die() instead.
+  if (res) die("Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = 0;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = AOM_Q;
+  cfg.save_as_annexb = 0;
+
+  outfile = fopen(outfile_arg, "wb");
+  if (!outfile) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile0 = fopen(infile0_arg, "rb")))
+    die("Failed to open %s for reading.", infile0_arg);
+  if (!(infile1 = fopen(infile1_arg, "rb")))
+    die("Failed to open %s for reading.", infile1_arg);
+
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8))
+    die_codec(&codec, "Failed to set cpu to 8");
+
+  if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2))
+    die_codec(&codec, "Failed to set tile columns to 2");
+  if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3))
+    die_codec(&codec, "Failed to set num of tile groups to 3");
+
+  if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2))
+    die_codec(&codec, "Failed to set number of spatial layers to 2");
+
+  // Encode frames. Both layer inputs are read up front so that encoding stops
+  // cleanly when either file runs out, instead of re-encoding a stale
+  // enhancement-layer image.
+  while (aom_img_read(&raw0, infile0) && aom_img_read(&raw1, infile1)) {
+    int flags = 0;
+
+    // configure and encode base layer
+
+    if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0)
+      flags |= AOM_EFLAG_FORCE_KF;
+    else
+      // use previous base layer (LAST) as sole reference
+      // save this frame as LAST to be used as reference by enhancement layer
+      // and next base layer
+      flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+               AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+               AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+               AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+               AOM_EFLAG_NO_UPD_ENTROPY;
+    cfg.g_w = info.frame_width;
+    cfg.g_h = info.frame_height;
+    if (aom_codec_enc_config_set(&codec, &cfg))
+      die_codec(&codec, "Failed to set enc cfg for layer 0");
+    if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0))
+      die_codec(&codec, "Failed to set layer id to 0");
+    if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62))
+      die_codec(&codec, "Failed to set cq level");
+    encode_frame(&codec, &raw0, frame_count++, flags, outfile);
+
+    // configure and encode enhancement layer
+
+    // use LAST (base layer) as sole reference
+    flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+            AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+            AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST |
+            AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+            AOM_EFLAG_NO_UPD_ENTROPY;
+    cfg.g_w = info.frame_width;
+    cfg.g_h = info.frame_height;
+    if (aom_codec_enc_config_set(&codec, &cfg))
+      die_codec(&codec, "Failed to set enc cfg for layer 1");
+    if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1))
+      die_codec(&codec, "Failed to set layer id to 1");
+    if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10))
+      die_codec(&codec, "Failed to set cq level");
+    encode_frame(&codec, &raw1, frame_count++, flags, outfile);
+
+    frames_encoded++;
+
+    // A non-positive <frames to encode> means "no limit".
+    if (max_frames > 0 && frames_encoded >= max_frames) break;
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, outfile)) continue;
+
+  printf("\n");
+  fclose(infile0);
+  fclose(infile1);
+  // frame_count advances once per layer, i.e. twice per input frame pair.
+  printf("Processed %d frames.\n", frame_count / 2);
+
+  aom_img_free(&raw0);
+  aom_img_free(&raw1);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/set_maps.c b/third_party/aom/examples/set_maps.c
new file mode 100644
index 0000000000..2593faba34
--- /dev/null
+++ b/third_party/aom/examples/set_maps.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// AOM Set Active and ROI Maps
+// ===========================
+//
+// This is an example demonstrating how to control the AOM encoder's
+// ROI and Active maps.
+//
+// ROI (Region of Interest) maps are a way for the application to assign
+// each macroblock in the image to a region, and then set quantizer and
+// filtering parameters on that image.
+//
+// Active maps are a way for the application to specify on a
+// macroblock-by-macroblock basis whether there is any activity in that
+// macroblock.
+//
+//
+// Configuration
+// -------------
+// This example does not currently set an ROI map; only the active map is
+// exercised. (An ROI map would make the output appear to have distinct
+// columns, where the quantizer, loopfilter, and static threshold differ
+// from column to column.)
+//
+// An active map is set on frame 5. If the width of the image in macroblocks
+// is evenly divisible by 4, then the output will appear to have distinct
+// columns, where one column will have motion and the next will not.
+//
+// The active map is cleared on frame 9. At most 10 frames are encoded.
+//
+// Observing The Effects
+// ---------------------
+// Use the `simple_decoder` example to decode this sample, and observe
+// the change in the image at frames 5 and 9.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Prints the command-line synopsis for this tool to stderr, then terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  FILE *const out = stderr;
+  fprintf(out, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Builds an active map with one byte per 16x16-pixel cell of the configured
+// frame (dimensions rounded up), fills it with the alternating pattern
+// 0, 1, 0, 1, ... and installs it on |codec| via AOME_SET_ACTIVEMAP.
+// The temporary buffer is freed before returning; allocation or control
+// failures terminate the program through die()/die_codec().
+static void set_active_map(const aom_codec_enc_cfg_t *cfg,
+                           aom_codec_ctx_t *codec) {
+  aom_active_map_t map = { 0, 0, 0 };
+
+  // Round each dimension up to a whole number of 16-pixel units.
+  map.cols = (cfg->g_w + 15) / 16;
+  map.rows = (cfg->g_h + 15) / 16;
+
+  const unsigned int num_cells = map.rows * map.cols;
+  map.active_map = (uint8_t *)malloc(num_cells);
+  if (map.active_map == NULL) die("Failed to allocate active map");
+
+  // Even-indexed cells get 0, odd-indexed cells get 1.
+  unsigned int idx = 0;
+  while (idx < num_cells) {
+    map.active_map[idx] = idx % 2;
+    ++idx;
+  }
+
+  if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
+    die_codec(codec, "Failed to set active map");
+
+  free(map.active_map);
+}
+
+// Issues AOME_SET_ACTIVEMAP on |codec| with a NULL map buffer, keeping the
+// same row/column counts that set_active_map() derives from |cfg|.
+// A control failure terminates the program through die_codec().
+static void unset_active_map(const aom_codec_enc_cfg_t *cfg,
+                             aom_codec_ctx_t *codec) {
+  aom_active_map_t map = { 0, 0, 0 };
+
+  // No buffer: only the dimensions are supplied.
+  map.active_map = NULL;
+  map.cols = (cfg->g_w + 15) / 16;
+  map.rows = (cfg->g_h + 15) / 16;
+
+  const int failed = aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map) != 0;
+  if (failed) die_codec(codec, "Failed to set active map");
+}
+
+// Hands |img| to the encoder (callers pass NULL when flushing) and forwards
+// every compressed-frame packet it produces to |writer|, printing 'K' for a
+// keyframe or '.' for any other frame.
+// Returns 1 if the encoder produced at least one packet of any kind, else 0.
+// Encode or write failures terminate the program through die_codec().
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, AvxVideoWriter *writer) {
+  if (aom_codec_encode(codec, img, frame_index, 1, 0) != AOM_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  int produced_output = 0;
+  aom_codec_iter_t cursor = NULL;
+  const aom_codec_cx_pkt_t *packet = aom_codec_get_cx_data(codec, &cursor);
+
+  for (; packet != NULL; packet = aom_codec_get_cx_data(codec, &cursor)) {
+    produced_output = 1;
+
+    // Only compressed-frame packets carry data to be written.
+    if (packet->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (packet->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    const int stored = aom_video_writer_write_frame(
+        writer, packet->data.frame.buf, packet->data.frame.sz,
+        packet->data.frame.pts);
+    if (!stored) {
+      die_codec(codec, "Failed to write compressed frame");
+    }
+
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced_output;
+}
+
+// Encodes up to 10 frames of a raw I420 input into an IVF file, installing an
+// active map before frame 5 and clearing it before frame 9.
+// Expects exactly five arguments:
+//   <codec> <width> <height> <infile> <outfile>
+// Any failure is reported through die()/die_codec().
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  const int limit = 10;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  AvxVideoWriter *writer = NULL;
+  const int fps = 2;  // TODO(dkovalev) add command line argument
+  const double bits_per_pixel_per_frame = 0.067;
+
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
+
+  exec_name = argv[0];
+  if (argc != 6) die("Invalid number of arguments");
+
+  memset(&info, 0, sizeof(info));
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(argv[1]);
+  if (encoder == NULL) {
+    die("Unsupported codec.");
+  }
+  assert(encoder != NULL);
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = (int)strtol(argv[2], NULL, 0);
+  info.frame_height = (int)strtol(argv[3], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  // Both dimensions must be positive and even.
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
+  // |codec| has not been initialized yet, so it must not be handed to
+  // die_codec() here; report the failure with die() instead.
+  if (res) die("Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  // Scale the target bitrate with the frame area and frame rate.
+  cfg.rc_target_bitrate =
+      (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000);
+  cfg.g_lag_in_frames = 0;
+
+  writer = aom_video_writer_open(argv[5], kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", argv[5]);
+
+  if (!(infile = fopen(argv[4], "rb")))
+    die("Failed to open %s for reading.", argv[4]);
+
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
+    die_codec(&codec, "Failed to set cpu-used");
+
+  // Encode frames.
+  while (aom_img_read(&raw, infile) && frame_count < limit) {
+    ++frame_count;
+
+    // frame_count is 1-based here: the map is installed just before the 5th
+    // frame is encoded and removed just before the 9th.
+    if (frame_count == 5) {
+      set_active_map(&cfg, &codec);
+    } else if (frame_count == 9) {
+      unset_active_map(&cfg, &codec);
+    }
+
+    encode_frame(&codec, &raw, frame_count, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, writer)) {
+  }
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/simple_decoder.c b/third_party/aom/examples/simple_decoder.c
new file mode 100644
index 0000000000..b6891dcbba
--- /dev/null
+++ b/third_party/aom/examples/simple_decoder.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Simple Decoder
+// ==============
+//
+// This is an example of a simple decoder loop. It takes an input file
+// containing the compressed data (in IVF format), passes it through the
+// decoder, and writes the decompressed frames to disk. Other decoder
+// examples build upon this one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `aom_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// aom.
+//
+// Initializing The Codec
+// ----------------------
+// The libaom decoder is initialized by the call to aom_codec_dec_init().
+// Determining the codec interface to use is handled by AvxVideoReader and the
+// functions prefixed with aom_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's an AVx file and which AVx
+// codec is contained within the file.
+// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `aom_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `aom_codec_decode`. These frames are retrieved by the
+// `aom_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `aom_codec_decode` is called.
+// `aom_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the decoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+// Prints the command-line synopsis for this tool to stderr, then terminates
+// the process with EXIT_FAILURE. Does not return.
+void usage_exit(void) {
+  FILE *const out = stderr;
+  fprintf(out, "Usage: %s <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Decodes the compressed video file named by argv[1] and writes every decoded
+// frame, in output order, to the file named by argv[2]. On success it also
+// prints an ffplay command line for viewing the result.
+// Any failure is reported through die()/die_codec().
+int main(int argc, char **argv) {
+  exec_name = argv[0];
+
+  if (argc != 3) die("Invalid number of arguments.");
+
+  const char *const input_path = argv[1];
+  const char *const output_path = argv[2];
+
+  AvxVideoReader *reader = aom_video_reader_open(input_path);
+  if (reader == NULL) die("Failed to open %s for reading.", input_path);
+
+  FILE *outfile = fopen(output_path, "wb");
+  if (outfile == NULL) die("Failed to open %s for writing.", output_path);
+
+  // The container info supplies the fourcc used to select the decoder.
+  const AvxVideoInfo *info = aom_video_reader_get_info(reader);
+
+  aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (decoder == NULL) die("Unknown input codec.");
+
+  printf("Using %s\n", aom_codec_iface_name(decoder));
+
+  aom_codec_ctx_t codec;
+  if (aom_codec_dec_init(&codec, decoder, NULL, 0))
+    die("Failed to initialize decoder.");
+
+  int frames_written = 0;
+  while (aom_video_reader_read_frame(reader)) {
+    size_t payload_size = 0;
+    const unsigned char *payload =
+        aom_video_reader_get_frame(reader, &payload_size);
+    if (aom_codec_decode(&codec, payload, payload_size, NULL))
+      die_codec(&codec, "Failed to decode frame.");
+
+    // One decode call may yield several images; drain them all. The iterator
+    // starts from NULL after each decode call.
+    aom_codec_iter_t cursor = NULL;
+    aom_image_t *picture = aom_codec_get_frame(&codec, &cursor);
+    for (; picture != NULL; picture = aom_codec_get_frame(&codec, &cursor)) {
+      aom_img_write(picture, outfile);
+      ++frames_written;
+    }
+  }
+
+  printf("Processed %d frames.\n", frames_written);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+         info->frame_width, info->frame_height, output_path);
+
+  aom_video_reader_close(reader);
+
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/simple_encoder.c b/third_party/aom/examples/simple_encoder.c
new file mode 100644
index 0000000000..c026706555
--- /dev/null
+++ b/third_party/aom/examples/simple_encoder.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Simple Encoder
+// ==============
+//
+// This is an example of a simple encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder, and writes the compressed
+// frames to disk in IVF format. Other encoder examples build upon this
+// one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For encoders, you only have to include `aom_encoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// aom.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vicinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Initializing The Codec
+// ----------------------
+// The encoder is initialized by the following code.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_cnt` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// unused in this example.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the
+// flags passed to `aom_codec_encode()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write an IVF frame header, followed by the raw data.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+//
+// Error Resiliency Features
+// -------------------------
+// Error resiliency is controlled by the g_error_resilient member of the
+// configuration structure. Use the `decode_with_drops` example to decode with
+// frames 5-10 dropped. Compare the output for a file encoded with this example
+// versus one encoded with the `simple_encoder` example.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Reports the expected argument list on stderr, then exits with
+// EXIT_FAILURE.
+void usage_exit(void) {
+  static const char kUsageFormat[] =
+      "Usage: %s <codec> <width> <height> <infile> <outfile> "
+      "<keyframe-interval> <error-resilient> <frames to encode>\n"
+      "See comments in simple_encoder.c for more information.\n";
+
+  fprintf(stderr, kUsageFormat, exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// Submits one image (or NULL to flush) to the encoder and hands every
+// compressed-frame packet it returns to `writer`, printing "K" for key
+// frames and "." for all other frames. Returns 1 when the encoder produced
+// at least one packet of any kind, 0 otherwise. Encode and write failures
+// are reported through die_codec().
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, AvxVideoWriter *writer) {
+  if (aom_codec_encode(codec, img, frame_index, 1, flags) != AOM_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  int produced_output = 0;
+  aom_codec_iter_t pkt_iter = NULL;
+  for (;;) {
+    const aom_codec_cx_pkt_t *const packet =
+        aom_codec_get_cx_data(codec, &pkt_iter);
+    if (packet == NULL) break;
+    produced_output = 1;
+
+    // Only compressed-frame packets are written out.
+    if (packet->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    const int is_key = (packet->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    const int wrote_ok = aom_video_writer_write_frame(
+        writer, packet->data.frame.buf, packet->data.frame.sz,
+        packet->data.frame.pts);
+    if (!wrote_ok) die_codec(codec, "Failed to write compressed frame");
+
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return produced_output;
+}
+
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
+// Encodes raw I420 frames from <infile> into an IVF file at <outfile>.
+//
+// Expects exactly eight arguments (see usage_exit()): codec short name, frame
+// width, frame height, input path, output path, keyframe interval,
+// error-resilient flags and the maximum number of frames to encode (a value
+// of 0 or less means no limit). Width and height must be positive and even;
+// the keyframe interval must not be negative.
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  AvxVideoInfo info;
+  AvxVideoWriter *writer = NULL;
+  const int fps = 30;
+  const int bitrate = 200;
+  int keyframe_interval = 0;
+  int max_frames = 0;
+  int frames_encoded = 0;
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
+  const char *keyframe_interval_arg = NULL;
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
+
+  exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
+  if (argc != 9) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+  keyframe_interval_arg = argv[6];
+  max_frames = (int)strtol(argv[8], NULL, 0);
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
+  if (keyframe_interval < 0) die("Invalid keyframe interval value.");
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
+  // `codec` has not been initialized at this point, so it must not be handed
+  // to die_codec(); report the returned status code directly instead.
+  if (res) {
+    die("Failed to get default codec config: %s",
+        aom_codec_err_to_string(res));
+  }
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0);
+
+  writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
+    die_codec(&codec, "Failed to set cpu-used");
+
+  // Encode frames. The running frame count doubles as the presentation
+  // timestamp passed to the encoder.
+  while (aom_img_read(&raw, infile)) {
+    int flags = 0;
+    if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
+      flags |= AOM_EFLAG_FORCE_KF;
+    encode_frame(&codec, &raw, frame_count++, flags, writer);
+    frames_encoded++;
+    if (max_frames > 0 && frames_encoded >= max_frames) break;
+  }
+
+  // Flush encoder: keep passing NULL until no more packets come back.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) continue;
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/svc_encoder_rtc.cc b/third_party/aom/examples/svc_encoder_rtc.cc
new file mode 100644
index 0000000000..2c041081e5
--- /dev/null
+++ b/third_party/aom/examples/svc_encoder_rtc.cc
@@ -0,0 +1,2062 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This is an example demonstrating how to implement a multi-layer AOM
+// encoding scheme for RTC video applications.
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+
+#include "config/aom_config.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#endif
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+#include "examples/encoder_util.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/ratectrl_rtc.h"
+
+#define OPTION_BUFFER_SIZE 1024
+
+// Application-level settings gathered by parse_command_line().
+typedef struct {
+ // Output path, taken from -o/--output.
+ const char *output_filename;
+ // Extra option string; copied from parse_command_line()'s local buffer when
+ // that buffer is non-empty.
+ char options[OPTION_BUFFER_SIZE];
+ // Input stream state; filled in by open_input_file().
+ struct AvxInputContext input_ctx;
+ // Value of --speed.
+ int speed;
+ // Value of --aqmode.
+ int aq_mode;
+ // Value of --layering-mode (defaults to 0).
+ int layering_mode;
+ // Value of --output-obu; only 0 and 1 are accepted (defaults to 0).
+ int output_obu;
+ // Value of --test-decode; only 0 and 1 are accepted (defaults to 1).
+ int decode;
+ // Value of --tune-content.
+ int tune_content;
+ // Set to 1 when --psnr is present.
+ int show_psnr;
+ // Set to true when --use-ext-rc is present.
+ bool use_external_rc;
+} AppInput;
+
+// Kinds of per-layer option strings understood by extract_option(). The
+// enumerator values index option_min_values[] / option_max_values[], and
+// ALL_OPTION_TYPES is the size of those tables.
+typedef enum {
+ QUANTIZER = 0,
+ BITRATE,
+ // Parsed as "<num>/<den>"; every other type is a single integer.
+ SCALE_FACTOR,
+ AUTO_ALT_REF,
+ ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+// Command-line option descriptors. Each one is matched with arg_match() in
+// parse_command_line(); the ARG_DEF arguments are the short name, the long
+// name, a value specifier and the help text.
+static const arg_def_t outputfile =
+ ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t frames_arg =
+ ARG_DEF("f", "frames", 1, "Number of frames to encode");
+static const arg_def_t threads_arg =
+ ARG_DEF("th", "threads", 1, "Number of threads to use");
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Source height");
+static const arg_def_t timebase_arg =
+ ARG_DEF("t", "timebase", 1, "Timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+ "b", "target-bitrate", 1, "Encoding bitrate, in kilobits per second");
+static const arg_def_t spatial_layers_arg =
+ ARG_DEF("sl", "spatial-layers", 1, "Number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+ ARG_DEF("tl", "temporal-layers", 1, "Number of temporal SVC layers");
+static const arg_def_t layering_mode_arg =
+ ARG_DEF("lm", "layering-mode", 1, "Temporal layering scheme.");
+static const arg_def_t kf_dist_arg =
+ ARG_DEF("k", "kf-dist", 1, "Number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+ ARG_DEF("r", "scale-factors", 1, "Scale factors (lowest to highest layer)");
+static const arg_def_t min_q_arg =
+ ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+ ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t speed_arg =
+ ARG_DEF("sp", "speed", 1, "Speed configuration");
+static const arg_def_t aqmode_arg =
+ ARG_DEF("aq", "aqmode", 1, "AQ mode off/on");
+static const arg_def_t bitrates_arg =
+ ARG_DEF("bl", "bitrates", 1,
+ "Bitrates[spatial_layer * num_temporal_layer + temporal_layer]");
+static const arg_def_t dropframe_thresh_arg =
+ ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t error_resilient_arg =
+ ARG_DEF(NULL, "error-resilient", 1, "Error resilient flag");
+static const arg_def_t output_obu_arg =
+ ARG_DEF(NULL, "output-obu", 1,
+ "Write OBUs when set to 1. Otherwise write IVF files.");
+static const arg_def_t test_decode_arg =
+ ARG_DEF(NULL, "test-decode", 1,
+ "Attempt to test decoding the output when set to 1. Default is 1.");
+static const arg_def_t psnr_arg =
+ ARG_DEF(NULL, "psnr", -1, "Show PSNR in status line.");
+static const arg_def_t ext_rc_arg =
+ ARG_DEF(NULL, "use-ext-rc", 0, "Use external rate control.");
+// Symbolic values accepted by --tune-content.
+static const struct arg_enum_list tune_content_enum[] = {
+ { "default", AOM_CONTENT_DEFAULT },
+ { "screen", AOM_CONTENT_SCREEN },
+ { "film", AOM_CONTENT_FILM },
+ { NULL, 0 }
+};
+static const arg_def_t tune_content_arg = ARG_DEF_ENUM(
+ NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Symbolic values accepted by --bit-depth.
+static const struct arg_enum_list bitdepth_enum[] = { { "8", AOM_BITS_8 },
+ { "10", AOM_BITS_10 },
+ { NULL, 0 } };
+
+static const arg_def_t bitdepth_arg = ARG_DEF_ENUM(
+ "d", "bit-depth", 1, "Bit depth for codec 8 or 10. ", bitdepth_enum);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// NULL-terminated table of option descriptors that usage_exit() passes to
+// arg_show_usage() to print the help text. ext_rc_arg is included so that
+// --use-ext-rc, which parse_command_line() accepts, also shows up in the
+// usage output.
+static const arg_def_t *svc_args[] = {
+  &frames_arg,          &outputfile,     &width_arg,
+  &height_arg,          &timebase_arg,   &bitrate_arg,
+  &spatial_layers_arg,  &kf_dist_arg,    &scale_factors_arg,
+  &min_q_arg,           &max_q_arg,      &temporal_layers_arg,
+  &layering_mode_arg,   &threads_arg,    &aqmode_arg,
+#if CONFIG_AV1_HIGHBITDEPTH
+  &bitdepth_arg,
+#endif
+  &speed_arg,           &bitrates_arg,   &dropframe_thresh_arg,
+  &error_resilient_arg, &output_obu_arg, &test_decode_arg,
+  &tune_content_arg,    &psnr_arg,       &ext_rc_arg,
+  NULL,
+};
+
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
+
+static const char *exec_name;
+
+// Writes the usage line and the option list to stderr, then exits with
+// EXIT_FAILURE.
+void usage_exit(void) {
+  FILE *const out = stderr;
+
+  fprintf(out,
+          "Usage: %s <options> input_filename -o output_filename\n"
+          "Options:\n",
+          exec_name);
+  arg_show_usage(out, svc_args);
+  exit(EXIT_FAILURE);
+}
+
+// Returns 1 when the four bytes at `detect` are the Y4M signature "YUV4",
+// and 0 otherwise.
+static int file_is_y4m(const char detect[4]) {
+  static const char kY4mMagic[4] = { 'Y', 'U', 'V', '4' };
+
+  if (memcmp(detect, kY4mMagic, sizeof(kY4mMagic)) != 0) return 0;
+  return 1;
+}
+
+// Returns 1 when the four bytes at `detect` are the IVF signature "DKIF",
+// and 0 otherwise.
+static int fourcc_is_ivf(const char detect[4]) {
+  return memcmp(detect, "DKIF", 4) == 0 ? 1 : 0;
+}
+
+// Inclusive upper bound for each LAYER_OPTION_TYPE, in enum order:
+// QUANTIZER, BITRATE, SCALE_FACTOR, AUTO_ALT_REF.
+static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX,
+ 1 };
+
+// Inclusive lower bound for each LAYER_OPTION_TYPE, in the same order.
+static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 };
+
+// Opens input->filename ("-" selects stdin in binary mode), records the
+// stream length when the stream is seekable, and classifies the stream from
+// its first four bytes: a Y4M stream fills in size, pixel aspect ratio,
+// frame rate, format and bit depth from the Y4M header; an IVF stream is
+// rejected; anything else is treated as raw input. Failures are reported
+// through fatal().
+static void open_input_file(struct AvxInputContext *input,
+                            aom_chroma_sample_position_t csp) {
+  if (strcmp(input->filename, "-") == 0) {
+    input->file = set_binary_mode(stdin);
+  } else {
+    input->file = fopen(input->filename, "rb");
+  }
+  if (input->file == NULL) fatal("Failed to open input file");
+
+  // A successful seek to the end means the input is seekable, so its length
+  // can be measured for progress info.
+  if (fseeko(input->file, 0, SEEK_END) == 0) {
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  // Default to 1:1 pixel aspect ratio.
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  // For RAW input sources, these bytes will be applied on the first frame
+  // in read_frame().
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+
+  const bool have_signature = input->detect.buf_read == 4;
+
+  if (have_signature && file_is_y4m(input->detect.buf)) {
+    const int y4m_status =
+        y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp,
+                       input->only_i420);
+    if (y4m_status < 0) {
+      fatal("Unsupported Y4M stream.");
+      return;
+    }
+    input->file_type = FILE_TYPE_Y4M;
+    input->width = input->y4m.pic_w;
+    input->height = input->y4m.pic_h;
+    input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+    input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+    input->framerate.numerator = input->y4m.fps_n;
+    input->framerate.denominator = input->y4m.fps_d;
+    input->fmt = input->y4m.aom_fmt;
+    input->bit_depth = static_cast<aom_bit_depth_t>(input->y4m.bit_depth);
+    return;
+  }
+
+  if (have_signature && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+    return;
+  }
+
+  input->file_type = FILE_TYPE_RAW;
+}
+
+// Parses one layer option token from `input`.
+//
+// SCALE_FACTOR tokens have the form "<num>/<den>": the numerator is stored
+// in *value0 and the denominator in *value1, both must lie within the
+// SCALE_FACTOR bounds, and the numerator may not exceed the denominator.
+// Every other type is a single integer stored in *value0 and checked
+// against option_min_values[type] / option_max_values[type]; *value1 is not
+// touched in that case. Returns AOM_CODEC_OK or AOM_CODEC_INVALID_PARAM.
+static aom_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
+                                      int *value0, int *value1) {
+  if (type != SCALE_FACTOR) {
+    *value0 = atoi(input);
+    const bool in_range = *value0 >= option_min_values[type] &&
+                          *value0 <= option_max_values[type];
+    return in_range ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
+  }
+
+  char *cursor = input;
+  *value0 = (int)strtol(cursor, &cursor, 10);
+  if (*cursor != '/') return AOM_CODEC_INVALID_PARAM;
+  ++cursor;  // Step over the '/' separator.
+  *value1 = (int)strtol(cursor, &cursor, 10);
+
+  const int lowest = option_min_values[SCALE_FACTOR];
+  const int highest = option_max_values[SCALE_FACTOR];
+  const int num = *value0;
+  const int den = *value1;
+  // The numerator shouldn't be greater than the denominator.
+  if (num < lowest || den < lowest || num > highest || den > highest ||
+      num > den) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Parses a comma-separated list of per-layer values from `input`.
+//
+// One token is consumed per layer: number_spatial_layers tokens, or
+// number_spatial_layers * number_temporal_layers tokens when `type` is
+// BITRATE. Token i is handed to extract_option(), which stores it in
+// option0[i] (and, for SCALE_FACTOR tokens of the form "num/den", also in
+// option1[i]). `option1` may be NULL for every type except SCALE_FACTOR.
+//
+// Returns AOM_CODEC_OK on success, AOM_CODEC_INVALID_PARAM for a missing
+// argument, a token rejected by extract_option() or too few tokens, and
+// AOM_CODEC_MEM_ERROR when the working copy of `input` cannot be allocated.
+static aom_codec_err_t parse_layer_options_from_string(
+    aom_svc_params_t *svc_params, LAYER_OPTION_TYPE type, const char *input,
+    int *option0, int *option1) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+  char *input_string;
+  char *token;
+  const char *delim = ",";
+  int num_layers = svc_params->number_spatial_layers;
+  int i = 0;
+
+  if (type == BITRATE)
+    num_layers =
+        svc_params->number_spatial_layers * svc_params->number_temporal_layers;
+
+  if (input == NULL || option0 == NULL ||
+      (option1 == NULL && type == SCALE_FACTOR))
+    return AOM_CODEC_INVALID_PARAM;
+
+  // strtok() modifies its argument, so tokenize a private copy.
+  const size_t input_length = strlen(input);
+  input_string = reinterpret_cast<char *>(malloc(input_length + 1));
+  if (input_string == NULL) return AOM_CODEC_MEM_ERROR;
+  memcpy(input_string, input, input_length + 1);
+  token = strtok(input_string, delim);  // NOLINT
+  for (i = 0; i < num_layers; ++i) {
+    if (token == NULL) {
+      res = AOM_CODEC_INVALID_PARAM;
+      break;
+    }
+    // Never offset a NULL `option1`: arithmetic on a null pointer is
+    // undefined behavior, and extract_option() only writes through its
+    // second output for SCALE_FACTOR, where `option1` is known to be valid.
+    int *const value1 = (option1 != NULL) ? option1 + i : NULL;
+    res = extract_option(type, token, option0 + i, value1);
+    if (res != AOM_CODEC_OK) break;
+    token = strtok(NULL, delim);  // NOLINT
+  }
+  free(input_string);
+  return res;
+}
+
+// Parses the command line into `app_input`, `svc_params` and `enc_cfg`, then
+// opens the input file.
+//
+// Recognized options are dropped from a working copy of the argument list,
+// so the first argument left over is taken as the input filename.
+// --bitrates is handled in a second pass because the number of values it
+// expects depends on the layer counts. Any leftover argument that starts
+// with '-' (other than a lone "-") is an error. For Y4M input the frame size
+// comes from the stream header. The final resolution must be at least 16x16
+// with even dimensions.
+static void parse_command_line(int argc, const char **argv_,
+                               AppInput *app_input,
+                               aom_svc_params_t *svc_params,
+                               aom_codec_enc_cfg_t *enc_cfg) {
+  struct arg arg;
+  char **argv = NULL;
+  char **argi = NULL;
+  char **argj = NULL;
+  char string_options[1024] = { 0 };
+
+  // Default settings
+  svc_params->number_spatial_layers = 1;
+  svc_params->number_temporal_layers = 1;
+  app_input->layering_mode = 0;
+  app_input->output_obu = 0;
+  app_input->decode = 1;
+  enc_cfg->g_threads = 1;
+  enc_cfg->rc_end_usage = AOM_CBR;
+
+  // process command line options
+  argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    exit(EXIT_FAILURE);
+  }
+  // argj trails argi: it only advances past arguments that matched no
+  // option, which compacts the unrecognized arguments to the front of argv.
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &outputfile, argi)) {
+      app_input->output_filename = arg.val;
+    } else if (arg_match(&arg, &width_arg, argi)) {
+      enc_cfg->g_w = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &height_arg, argi)) {
+      enc_cfg->g_h = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &timebase_arg, argi)) {
+      enc_cfg->g_timebase = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &bitrate_arg, argi)) {
+      enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
+      svc_params->number_spatial_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+      svc_params->number_temporal_layers = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &speed_arg, argi)) {
+      app_input->speed = arg_parse_uint(&arg);
+      if (app_input->speed > 11) {
+        aom_tools_warn("Mapping speed %d to speed 11.\n", app_input->speed);
+      }
+    } else if (arg_match(&arg, &aqmode_arg, argi)) {
+      app_input->aq_mode = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &threads_arg, argi)) {
+      enc_cfg->g_threads = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &layering_mode_arg, argi)) {
+      app_input->layering_mode = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &kf_dist_arg, argi)) {
+      enc_cfg->kf_min_dist = arg_parse_uint(&arg);
+      enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
+    } else if (arg_match(&arg, &scale_factors_arg, argi)) {
+      aom_codec_err_t res = parse_layer_options_from_string(
+          svc_params, SCALE_FACTOR, arg.val, svc_params->scaling_factor_num,
+          svc_params->scaling_factor_den);
+      if (res != AOM_CODEC_OK) {
+        die("Failed to parse scale factors: %s\n",
+            aom_codec_err_to_string(res));
+      }
+    } else if (arg_match(&arg, &min_q_arg, argi)) {
+      enc_cfg->rc_min_quantizer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &max_q_arg, argi)) {
+      enc_cfg->rc_max_quantizer = arg_parse_uint(&arg);
+#if CONFIG_AV1_HIGHBITDEPTH
+    } else if (arg_match(&arg, &bitdepth_arg, argi)) {
+      enc_cfg->g_bit_depth =
+          static_cast<aom_bit_depth_t>(arg_parse_enum_or_int(&arg));
+      switch (enc_cfg->g_bit_depth) {
+        case AOM_BITS_8:
+          enc_cfg->g_input_bit_depth = 8;
+          enc_cfg->g_profile = 0;
+          break;
+        case AOM_BITS_10:
+          enc_cfg->g_input_bit_depth = 10;
+          enc_cfg->g_profile = 0;
+          break;
+        default:
+          die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
+      }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
+      enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &error_resilient_arg, argi)) {
+      enc_cfg->g_error_resilient = arg_parse_uint(&arg);
+      if (enc_cfg->g_error_resilient != 0 && enc_cfg->g_error_resilient != 1)
+        die("Invalid value for error resilient (0, 1): %d.",
+            enc_cfg->g_error_resilient);
+    } else if (arg_match(&arg, &output_obu_arg, argi)) {
+      app_input->output_obu = arg_parse_uint(&arg);
+      if (app_input->output_obu != 0 && app_input->output_obu != 1)
+        die("Invalid value for obu output flag (0, 1): %d.",
+            app_input->output_obu);
+    } else if (arg_match(&arg, &test_decode_arg, argi)) {
+      app_input->decode = arg_parse_uint(&arg);
+      if (app_input->decode != 0 && app_input->decode != 1)
+        die("Invalid value for test decode flag (0, 1): %d.",
+            app_input->decode);
+    } else if (arg_match(&arg, &tune_content_arg, argi)) {
+      app_input->tune_content = arg_parse_enum_or_int(&arg);
+      printf("tune content %d\n", app_input->tune_content);
+    } else if (arg_match(&arg, &psnr_arg, argi)) {
+      app_input->show_psnr = 1;
+    } else if (arg_match(&arg, &ext_rc_arg, argi)) {
+      app_input->use_external_rc = true;
+    } else {
+      ++argj;
+    }
+  }
+
+  // Total bitrate needs to be parsed after the number of layers.
+  for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &bitrates_arg, argi)) {
+      aom_codec_err_t res = parse_layer_options_from_string(
+          svc_params, BITRATE, arg.val, svc_params->layer_target_bitrate, NULL);
+      if (res != AOM_CODEC_OK) {
+        die("Failed to parse bitrates: %s\n", aom_codec_err_to_string(res));
+      }
+    } else {
+      ++argj;
+    }
+  }
+
+  // There will be a space in front of the string options
+  if (strlen(string_options) > 0)
+    strncpy(app_input->options, string_options, OPTION_BUFFER_SIZE);
+
+  // Check for unrecognized options
+  for (argi = argv; *argi; ++argi)
+    if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+      die("Error: Unrecognized option %s\n", *argi);
+
+  if (argv[0] == NULL) {
+    usage_exit();
+  }
+
+  app_input->input_ctx.filename = argv[0];
+  free(argv);
+
+  open_input_file(&app_input->input_ctx, AOM_CSP_UNKNOWN);
+  if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
+    enc_cfg->g_w = app_input->input_ctx.width;
+    enc_cfg->g_h = app_input->input_ctx.height;
+  }
+
+  // g_w and g_h are printed with %u, matching the summary printf below.
+  if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
+      enc_cfg->g_h % 2)
+    die("Invalid resolution: %u x %u\n", enc_cfg->g_w, enc_cfg->g_h);
+
+  printf(
+      "Codec %s\n"
+      "layers: %d\n"
+      "width %u, height: %u\n"
+      "num: %d, den: %d, bitrate: %u\n"
+      "gop size: %u\n",
+      aom_codec_iface_name(aom_codec_av1_cx()),
+      svc_params->number_spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
+      enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
+      enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
+}
+
+// Number of temporal layers for each of the 12 table entries.
+// NOTE(review): presumably indexed by the --layering-mode value; confirm
+// against the caller.
+static int mode_to_num_temporal_layers[12] = {
+ 1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3,
+};
+// Number of spatial layers for each of the 12 table entries (same indexing
+// as the table above, presumably the layering mode).
+static int mode_to_num_spatial_layers[12] = {
+ 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 3,
+};
+
+// For rate control encoding stats.
+struct RateControlMetrics {
+ // Number of input frames per layer.
+ int layer_input_frames[AOM_MAX_TS_LAYERS];
+ // Number of encoded non-key frames per layer.
+ int layer_enc_frames[AOM_MAX_TS_LAYERS];
+ // Framerate per layer (cumulative).
+ double layer_framerate[AOM_MAX_TS_LAYERS];
+ // Target average frame size per layer (per-frame-bandwidth per layer).
+ double layer_pfb[AOM_MAX_LAYERS];
+ // Actual average frame size per layer.
+ double layer_avg_frame_size[AOM_MAX_LAYERS];
+ // Average rate mismatch per layer (|target - actual| / target).
+ double layer_avg_rate_mismatch[AOM_MAX_LAYERS];
+ // Actual encoding bitrate per layer (cumulative across temporal layers).
+ double layer_encoding_bitrate[AOM_MAX_LAYERS];
+ // Average of the short-time encoder actual bitrate.
+ // TODO(marpan): Should we add these short-time stats for each layer?
+ double avg_st_encoding_bitrate;
+ // Variance of the short-time encoder actual bitrate.
+ double variance_st_encoding_bitrate;
+ // Window (number of frames) for computing short-time encoding bitrate.
+ int window_size;
+ // Number of window measurements.
+ int window_count;
+ // Target bitrate per layer; read by set_rate_control_metrics() to derive
+ // layer_pfb.
+ int layer_target_bitrate[AOM_MAX_LAYERS];
+};
+
+// Number of entries of ref_frame_config->refresh[] that set_layer_pattern()
+// resets.
+static const int REF_FRAMES = 8;
+
+// Number of entries of ref_frame_config->ref_idx[] and ->reference[] that
+// set_layer_pattern() resets (one per SVC_*_FRAME reference).
+static const int INTER_REFS_PER_FRAME = 7;
+
+// Reference frames used in this example encoder. The values (0..6) are used
+// as indices into ref_frame_config->reference[].
+enum {
+ SVC_LAST_FRAME = 0,
+ SVC_LAST2_FRAME,
+ SVC_LAST3_FRAME,
+ SVC_GOLDEN_FRAME,
+ SVC_BWDREF_FRAME,
+ SVC_ALTREF2_FRAME,
+ SVC_ALTREF_FRAME
+};
+
+// Reads the next frame from the input into `img`. Returns 1 when a frame was
+// read and 0 otherwise (Y4M fetch returned less than 1, or the raw reader
+// reported a short read).
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    const int fetched =
+        y4m_input_fetch_frame(&input_ctx->y4m, input_ctx->file, img);
+    return fetched < 1 ? 0 : 1;
+  }
+
+  const int short_read = read_yuv_frame(input_ctx, img);
+  return short_read ? 0 : 1;
+}
+
+// Closes the input stream and, for Y4M input, releases the Y4M reader state.
+static void close_input_file(struct AvxInputContext *input) {
+  const bool is_y4m = input->file_type == FILE_TYPE_Y4M;
+
+  fclose(input->file);
+  if (is_y4m) {
+    y4m_input_close(&input->y4m);
+  }
+}
+
+// Note: these rate control metrics assume only 1 key frame in the
+// sequence (i.e., first frame only). So for temporal pattern# 7
+// (which has key frame for every frame on base layer), the metrics
+// computation will be off/wrong.
+// TODO(marpan): Update these metrics to account for multiple key frames
+// in the stream.
+// Initializes the rate control statistics in `rc` for `ss_number_layers`
+// spatial and `ts_number_layers` temporal layers at the given `framerate`.
+// Reads rc->layer_target_bitrate[]; fills layer_framerate[] and layer_pfb[]
+// and zeroes the per-layer and short-time accumulators.
+static void set_rate_control_metrics(struct RateControlMetrics *rc,
+ double framerate, int ss_number_layers,
+ int ts_number_layers) {
+ // Frame rate divisor of each temporal layer: {2, 1} for two layers and
+ // {4, 2, 1} for three layers. Entries that are not set explicitly stay 0.
+ int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 };
+ ts_rate_decimator[0] = 1;
+ if (ts_number_layers == 2) {
+ ts_rate_decimator[0] = 2;
+ ts_rate_decimator[1] = 1;
+ }
+ if (ts_number_layers == 3) {
+ ts_rate_decimator[0] = 4;
+ ts_rate_decimator[1] = 2;
+ ts_rate_decimator[2] = 1;
+ }
+ // Set the layer (cumulative) framerate and the target layer (non-cumulative)
+ // per-frame-bandwidth, for the rate control encoding stats below.
+ for (int sl = 0; sl < ss_number_layers; ++sl) {
+ // Flat layer index of temporal layer 0 within spatial layer sl.
+ int i = sl * ts_number_layers;
+ rc->layer_framerate[0] = framerate / ts_rate_decimator[0];
+ rc->layer_pfb[i] =
+ 1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0];
+ for (int tl = 0; tl < ts_number_layers; ++tl) {
+ i = sl * ts_number_layers + tl;
+ if (tl > 0) {
+ // For higher temporal layers, use the bitrate and framerate added on
+ // top of the layer below.
+ rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl];
+ rc->layer_pfb[i] =
+ 1000.0 *
+ (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+ (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]);
+ }
+ rc->layer_input_frames[tl] = 0;
+ rc->layer_enc_frames[tl] = 0;
+ rc->layer_encoding_bitrate[i] = 0.0;
+ rc->layer_avg_frame_size[i] = 0.0;
+ rc->layer_avg_rate_mismatch[i] = 0.0;
+ }
+ }
+ rc->window_count = 0;
+ rc->window_size = 15;
+ rc->avg_st_encoding_bitrate = 0.0;
+ rc->variance_st_encoding_bitrate = 0.0;
+}
+
+// Prints the per-layer and short-time rate control statistics held in `rc`.
+// The accumulated totals in `rc` are converted to averages in place, so the
+// values in `rc` are overwritten by this call. Calls die() when
+// frame_cnt - 1 differs from the number of input frames summed over the
+// temporal layers.
+static void printout_rate_control_summary(struct RateControlMetrics *rc,
+ int frame_cnt, int ss_number_layers,
+ int ts_number_layers) {
+ int tot_num_frames = 0;
+ double perc_fluctuation = 0.0;
+ printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
+ printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers);
+ for (int sl = 0; sl < ss_number_layers; ++sl) {
+ // Restarted for every spatial layer; accumulates over temporal layers.
+ tot_num_frames = 0;
+ for (int tl = 0; tl < ts_number_layers; ++tl) {
+ int i = sl * ts_number_layers + tl;
+ // NOTE(review): the extra -1 for tl == 0 presumably accounts for the key
+ // frame, which layer_enc_frames (non-key frames only) does not count.
+ const int num_dropped =
+ tl > 0 ? rc->layer_input_frames[tl] - rc->layer_enc_frames[tl]
+ : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1;
+ tot_num_frames += rc->layer_input_frames[tl];
+ rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] *
+ rc->layer_encoding_bitrate[i] /
+ tot_num_frames;
+ rc->layer_avg_frame_size[i] =
+ rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl];
+ rc->layer_avg_rate_mismatch[i] =
+ 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl];
+ printf("For layer#: %d %d \n", sl, tl);
+ printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i],
+ rc->layer_encoding_bitrate[i]);
+ printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i],
+ rc->layer_avg_frame_size[i]);
+ printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]);
+ printf(
+ "Number of input frames, encoded (non-key) frames, "
+ "and perc dropped frames: %d %d %f\n",
+ rc->layer_input_frames[tl], rc->layer_enc_frames[tl],
+ 100.0 * num_dropped / rc->layer_input_frames[tl]);
+ printf("\n");
+ }
+ }
+ // Turn the short-time sums into a mean and a variance
+ // (mean of squares minus square of the mean).
+ rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+ rc->variance_st_encoding_bitrate =
+ rc->variance_st_encoding_bitrate / rc->window_count -
+ (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+ perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+ rc->avg_st_encoding_bitrate;
+ printf("Short-time stats, for window of %d frames:\n", rc->window_size);
+ printf("Average, rms-variance, and percent-fluct: %f %f %f\n",
+ rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate),
+ perc_fluctuation);
+ if (frame_cnt - 1 != tot_num_frames)
+ die("Error: Number of input frames not equal to output!\n");
+}
+
+// Layer pattern configuration.
+//
+// Fills in, for the frame at position |superframe_cnt| of the pattern selected
+// by |layering_mode|: the layer ids, the map from each of the 7 references to
+// a buffer slot (ref_idx), which references may be used for prediction
+// (reference), which buffer slots the frame refreshes (refresh), and the
+// compound prediction flags.
+//
+// layering_mode: pattern selector. Values 0-9 and 11 are handled; any other
+// value is a fatal error. Forced to 9 when |ksvc_mode| is nonzero.
+// superframe_cnt: superframe counter; its value modulo 2 or 4 selects the
+// temporal layer in the multi-temporal-layer patterns.
+// layer_id: output. spatial_layer_id is always written; temporal_layer_id is
+// written by every branch except the key-frame branch of case 11.
+// ref_frame_config: output. ref_idx/reference/refresh are reset first (ref_idx
+// to the identity map, reference and refresh to 0) and then set per pattern.
+// ref_frame_comp_pred: output. All three flags are cleared; only case 3 with
+// speed >= 7 enables any of them.
+// use_svc_control: output, always set to 1.
+// spatial_layer_id: spatial layer of the frame being configured.
+// is_key_frame: nonzero when the superframe's base layer is a key frame.
+// ksvc_mode: nonzero selects KSVC, i.e. case 9 with inter-layer prediction
+// kept only on key superframes.
+// speed: encoder speed setting; only used to gate compound prediction.
+static void set_layer_pattern(
+ int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id,
+ aom_svc_ref_frame_config_t *ref_frame_config,
+ aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control,
+ int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) {
+ // Setting this flag to 1 enables a simple example of
+ // RPS (Reference Picture Selection) for 1 layer.
+ int use_rps_example = 0;
+ int i;
+ int enable_longterm_temporal_ref = 1;
+ // Both of these are derived from the caller's layering_mode, before the KSVC
+ // override below.
+ int shift = (layering_mode == 8) ? 2 : 0;
+ int simulcast_mode = (layering_mode == 11);
+ *use_svc_control = 1;
+ layer_id->spatial_layer_id = spatial_layer_id;
+ int lag_index = 0;
+ // Number of completed 4-frame periods; case 1 recomputes it for its 2-frame
+ // period.
+ int base_count = superframe_cnt >> 2;
+ ref_frame_comp_pred->use_comp_pred[0] = 0; // GOLDEN_LAST
+ ref_frame_comp_pred->use_comp_pred[1] = 0; // LAST2_LAST
+ ref_frame_comp_pred->use_comp_pred[2] = 0; // ALTREF_LAST
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+
+ if (ksvc_mode) {
+ // Same pattern as case 9, but the reference structure will be constrained
+ // below.
+ layering_mode = 9;
+ }
+ switch (layering_mode) {
+ case 0:
+ if (use_rps_example == 0) {
+ // 1-layer: update LAST on every frame, reference LAST.
+ layer_id->temporal_layer_id = 0;
+ layer_id->spatial_layer_id = 0;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else {
+ // Pattern of 2 references (ALTREF and GOLDEN) trailing
+ // LAST by 4 and 8 frames, with some switching logic to
+ // sometimes only predict from the longer-term reference
+ // (golden here). This is a simple example to test RPS
+ // (reference picture selection).
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ int lag_alt = 4;
+ int lag_gld = 8;
+ layer_id->temporal_layer_id = 0;
+ layer_id->spatial_layer_id = 0;
+ int sh = 8; // slots 0 - 7.
+ // Moving index slot for last: 0 - (sh - 1)
+ if (superframe_cnt > 1) last_idx = (superframe_cnt - 1) % sh;
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = superframe_cnt % sh;
+ // Moving index for gld_ref, lag behind current by lag_gld
+ if (superframe_cnt > lag_gld) gld_idx = (superframe_cnt - lag_gld) % sh;
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (superframe_cnt > lag_alt)
+ alt_ref_idx = (superframe_cnt - lag_alt) % sh;
+ // Set the ref_idx.
+ // Default all references to slot for last.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = last_idx;
+ // Set the ref_idx for the relevant references.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = last_idx;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = last_idx_refresh;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = gld_idx;
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = alt_ref_idx;
+ // Refresh this slot, which will become LAST on next frame.
+ ref_frame_config->refresh[last_idx_refresh] = 1;
+ // Reference LAST, ALTREF, and GOLDEN
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ // Switch to only GOLDEN every 200 frames.
+ if (superframe_cnt % 200 == 0 && superframe_cnt > 0) {
+ ref_frame_config->reference[SVC_LAST_FRAME] = 0;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 0;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ // Every 400 frames, test the long-term reference as LAST instead.
+ // This is just a renaming, but it tests whether the encoder behaves
+ // the same whether the reference is labelled LAST or GOLDEN.
+ if (superframe_cnt % 400 == 0 && superframe_cnt > 0) {
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = gld_idx;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 0;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0;
+ }
+ }
+ }
+ break;
+ case 1:
+ // 2-temporal layer.
+ // 1 3 5
+ // 0 2 4
+ // Keep golden fixed at slot 3.
+ base_count = superframe_cnt >> 1;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ // Cyclically refresh slots 5, 6, 7, for lag alt ref.
+ lag_index = 5;
+ if (base_count > 0) {
+ lag_index = 5 + (base_count % 3);
+ if (superframe_cnt % 2 != 0) lag_index = 5 + ((base_count + 1) % 3);
+ }
+ // Set the altref slot to lag_index.
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
+ if (superframe_cnt % 2 == 0) {
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, reference LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ // Refresh lag_index slot, needed for lagging altref.
+ ref_frame_config->refresh[lag_index] = 1;
+ // Refresh GOLDEN every 32 base layer frames.
+ if (base_count % 32 == 0) ref_frame_config->refresh[3] = 1;
+ } else {
+ layer_id->temporal_layer_id = 1;
+ // No updates on layer 1, reference LAST (TL0).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ }
+ // Always reference golden and altref on TL0.
+ if (layer_id->temporal_layer_id == 0) {
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ }
+ break;
+ case 2:
+ // 3-temporal layer:
+ // 1 3 5 7
+ // 2 6
+ // 0 4 8
+ if (superframe_cnt % 4 == 0) {
+ // Base layer.
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, reference LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // First top layer: no updates, only reference LAST (TL0).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ layer_id->temporal_layer_id = 1;
+ // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+ ref_frame_config->refresh[1] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // Second top layer: no updates, only reference LAST.
+ // Set buffer idx for LAST to slot 1, since that was the slot
+ // updated in previous frame. So LAST is TL1 frame.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ }
+ break;
+ case 3:
+ // 3 TL, same as above, except allow for predicting
+ // off 2 more references (GOLDEN and ALTREF), with
+ // GOLDEN updated periodically, and ALTREF lagging from
+ // LAST from ~4 frames. Both GOLDEN and ALTREF
+ // can only be updated on base temporal layer.
+
+ // Keep golden fixed at slot 3.
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ // Cyclically refresh slots 5, 6, 7, for lag altref.
+ lag_index = 5;
+ if (base_count > 0) {
+ lag_index = 5 + (base_count % 3);
+ if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3);
+ }
+ // Set the altref slot to lag_index.
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
+ if (superframe_cnt % 4 == 0) {
+ // Base layer.
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, reference LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ // Refresh GOLDEN every 10 base layer frames.
+ if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+ // Refresh lag_index slot, needed for lagging altref.
+ ref_frame_config->refresh[lag_index] = 1;
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // First top layer: no updates, only reference LAST (TL0).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ layer_id->temporal_layer_id = 1;
+ // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+ ref_frame_config->refresh[1] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // Second top layer: no updates, only reference LAST.
+ // Set buffer idx for LAST to slot 1, since that was the slot
+ // updated in previous frame. So LAST is TL1 frame.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ }
+ // Every frame can reference GOLDEN AND ALTREF.
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ // Allow for compound prediction for LAST-ALTREF and LAST-GOLDEN.
+ if (speed >= 7) {
+ ref_frame_comp_pred->use_comp_pred[2] = 1;
+ ref_frame_comp_pred->use_comp_pred[0] = 1;
+ }
+ break;
+ case 4:
+ // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
+ // only reference GF (not LAST). Other frames only reference LAST.
+ // 1 3 5 7
+ // 2 6
+ // 0 4 8
+ if (superframe_cnt % 4 == 0) {
+ // Base layer.
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, only reference LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // First top layer: no updates, only reference LAST (TL0).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ layer_id->temporal_layer_id = 1;
+ // Middle layer (TL1): update GF, only reference LAST (TL0).
+ ref_frame_config->refresh[3] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // Second top layer: no updates, only reference GF.
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ }
+ break;
+ case 5:
+ // 2 spatial layers, 1 temporal.
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+ // and GOLDEN to slot 0. Update slot 1 (LAST).
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 0;
+ ref_frame_config->refresh[1] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ }
+ break;
+ case 6:
+ // 3 spatial layers, 1 temporal.
+ // Note for this case, we set the buffer idx for all references to be
+ // either LAST or GOLDEN, which are always valid references, since decoder
+ // will check if any of the 7 references is valid scale in
+ // valid_ref_frame_size().
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST. Set all buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+ // and GOLDEN (and all other refs) to slot 0.
+ // Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->refresh[1] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
+ // and GOLDEN (and all other refs) to slot 1.
+ // Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[2] = 1;
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ // For 3 spatial layer case: allow for top spatial layer to use
+ // additional temporal reference. Update every 10 frames.
+ if (enable_longterm_temporal_ref) {
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+ ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ if (base_count % 10 == 0)
+ ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+ }
+ }
+ break;
+ case 7:
+ // 2 spatial and 3 temporal layer.
+ // NOTE(review): the SL1 comments below mention GOLDEN, but this case only
+ // ever enables the LAST reference; confirm whether that is intended.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ if (superframe_cnt % 4 == 0) {
+ // Base temporal layer
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST
+ // Set all buffer_idx to 0
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->refresh[1] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer.
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 5 and update slot 5.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift;
+ ref_frame_config->refresh[5 - shift] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 5.
+ // Set LAST3 to slot 6 and update slot 6.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5 - shift;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift;
+ ref_frame_config->refresh[6 - shift] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Set LAST to slot 5 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+ // GOLDEN to slot 3. No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ }
+ }
+ break;
+ case 8:
+ // 3 spatial and 3 temporal layer.
+ // Same as case 9 but overlap in the buffer slot updates.
+ // (shift = 2). The slots 3 and 4 updated by first TL2 are
+ // reused for update in TL1 superframe.
+ // Note for this case, frame order hint must be disabled for
+ // lower resolutions (operating points > 0) to be decodable.
+ case 9:
+ // 3 spatial and 3 temporal layer.
+ // No overlap in buffer updates between TL2 and TL1.
+ // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7.
+ // (The slot numbers in the comments below are for shift = 0, i.e. case 9.)
+ // Set the references via the svc_ref_frame_config control.
+ // Always reference LAST.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ if (superframe_cnt % 4 == 0) {
+ // Base temporal layer.
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST.
+ // Set all buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 0.
+ // Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 1.
+ // Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[2] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and Update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer.
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 5 and update slot 5.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift;
+ ref_frame_config->refresh[5 - shift] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 5.
+ // Set LAST3 to slot 6 and update slot 6.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5 - shift;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift;
+ ref_frame_config->refresh[6 - shift] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 6.
+ // Set LAST3 to slot 7 and update slot 7.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 6 - shift;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 7 - shift;
+ ref_frame_config->refresh[7 - shift] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Set LAST to slot 5 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+ // GOLDEN to slot 4. No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 7 - shift;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ }
+ }
+ // Inter-layer prediction, KSVC and long-term reference setup for cases 8
+ // and 9 (KSVC is mapped onto case 9 above). This has to run before the
+ // break below: no other case falls through to it, and inside case 11
+ // simulcast_mode is always 1, so it could never execute there.
+ if (!simulcast_mode && layer_id->spatial_layer_id > 0) {
+ // Always reference GOLDEN (inter-layer prediction).
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
+ if (ksvc_mode) {
+ // KSVC: only keep the inter-layer reference (GOLDEN) for
+ // superframes whose base is key.
+ if (!is_key_frame) ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0;
+ }
+ if (is_key_frame && layer_id->spatial_layer_id > 1) {
+ // On superframes whose base is key: remove LAST to avoid prediction
+ // off layer two levels below.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 0;
+ }
+ }
+ // For 3 spatial layer case 8 (where there is free buffer slot):
+ // allow for top spatial layer to use additional temporal reference.
+ // Additional reference is only updated on base temporal layer, every
+ // 10 TL0 frames here.
+ if (!simulcast_mode && enable_longterm_temporal_ref &&
+ layer_id->spatial_layer_id == 2 && layering_mode == 8) {
+ ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1;
+ if (!is_key_frame) ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+ if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+ ref_frame_config->refresh[REF_FRAMES - 1] = 1;
+ }
+ break;
+ case 11:
+ // Simulcast mode for 3 spatial and 3 temporal layers.
+ // No inter-layer prediction, only prediction is temporal and single
+ // reference (LAST).
+ // No overlap in buffer slots between spatial layers. So for example,
+ // SL0 only uses slots 0 and 1.
+ // SL1 only uses slots 2 and 3.
+ // SL2 only uses slots 4 and 5.
+ // All 7 references for each inter-frame must only access buffer slots
+ // for that spatial layer.
+ // On key (super)frames: SL1 and SL2 must have no references set
+ // and must refresh all the slots for that layer only (so 2 and 3
+ // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+ // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+ // internally as Intra-only frames that allow that stream to be decoded.
+ // These conditions will allow for each spatial stream to be
+ // independently decodable.
+
+ // Initialize all references to 0 (don't use reference).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->reference[i] = 0;
+ // Initialize as no refresh/update for all slots.
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+
+ if (is_key_frame) {
+ // Note: temporal_layer_id is left untouched on key superframes.
+ if (layer_id->spatial_layer_id == 0) {
+ // Assign LAST/GOLDEN to slot 0/1.
+ // Refresh slots 0 and 1 for SL0.
+ // SL0: this will get set to KEY frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 1;
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Assign LAST/GOLDEN to slot 2/3.
+ // Refresh slots 2 and 3 for SL1.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ ref_frame_config->refresh[2] = 1;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Assign LAST/GOLDEN to slot 4/5.
+ // Refresh slots 4 and 5 for SL2.
+ // This will get set to Intra-only frame internally.
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ ref_frame_config->refresh[4] = 1;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if (superframe_cnt % 4 == 0) {
+ // Base temporal layer: TL0
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST. Assign all references to either slot
+ // 0 or 1. Here we assign LAST to slot 0, all others to 1.
+ // Update slot 0 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST. Assign all references to either slot
+ // 2 or 3. Here we assign LAST to slot 2, all others to 3.
+ // Update slot 2 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[2] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST. Assign all references to either slot
+ // 4 or 5. Here we assign LAST to slot 4, all others to 5.
+ // Update slot 4 (LAST).
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->refresh[4] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0). Assign other references to slot 1.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2). Assign other references to slot 3.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 4). Assign other references to slot 5.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer: TL1
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 1 and update slot 1.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 2).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 4).
+ // Set GOLDEN to slot 5 and update slot 5.
+ // This will be used as reference for next TL2.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 5;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer: TL2
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) { // SL0
+ // Reference LAST (slot 1). Assign other references to slot 0.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1;
+ } else if (layer_id->spatial_layer_id == 1) { // SL1
+ // Reference LAST (slot 3). Assign other references to slot 2.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 2;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 3;
+ } else if (layer_id->spatial_layer_id == 2) { // SL2
+ // Reference LAST (slot 5). Assign other references to slot 4.
+ // No update/refresh on any slots.
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5;
+ }
+ }
+ break;
+ default: assert(0); die("Error: Unsupported temporal layering mode!\n");
+ }
+}
+
+#if CONFIG_AV1_DECODER
+// Compares the newest frame held by |encoder| with the newest frame held by
+// |decoder|. On a difference, reports the mismatch for frame |frames_out| on
+// stderr and returns 1; returns 0 when the two images compare equal.
+static int test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
+ const int frames_out) {
+ aom_image_t enc_img, dec_img;
+
+ // Fetch each codec's internal copy of the new frame.
+ AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
+ AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // When only one side carries the high-bitdepth flag, replace that image by
+ // a freshly allocated truncated copy so that both use the same format.
+ if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+ auto truncate_if_hbd = [](aom_image_t *img) {
+ if (!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)) return;
+ aom_image_t narrow_img;
+ aom_img_alloc(
+ &narrow_img,
+ static_cast<aom_img_fmt_t>(img->fmt - AOM_IMG_FMT_HIGHBITDEPTH),
+ img->d_w, img->d_h, 16);
+ aom_img_truncate_16_to_8(&narrow_img, img);
+ *img = narrow_img;
+ };
+ truncate_if_hbd(&enc_img);
+ truncate_if_hbd(&dec_img);
+ }
+#endif
+
+ const int mismatch = !aom_compare_img(&enc_img, &dec_img);
+ if (mismatch) {
+ // Each array is filled in by the mismatch finder and printed below.
+ int y_loc[4], u_loc[4], v_loc[4];
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_find_mismatch_high(&enc_img, &dec_img, y_loc, u_loc, v_loc);
+ } else {
+ aom_find_mismatch(&enc_img, &dec_img, y_loc, u_loc, v_loc);
+ }
+#else
+ aom_find_mismatch(&enc_img, &dec_img, y_loc, u_loc, v_loc);
+#endif
+ fprintf(stderr,
+ "Encode/decode mismatch on frame %d at"
+ " Y[%d, %d] {%d/%d},"
+ " U[%d, %d] {%d/%d},"
+ " V[%d, %d] {%d/%d}\n",
+ frames_out, y_loc[0], y_loc[1], y_loc[2], y_loc[3], u_loc[0], u_loc[1],
+ u_loc[2], u_loc[3], v_loc[0], v_loc[1], v_loc[2], v_loc[3]);
+ }
+
+ aom_img_free(&enc_img);
+ aom_img_free(&dec_img);
+ return mismatch;
+}
+#endif // CONFIG_AV1_DECODER
+
+// Running PSNR accumulators for one stream, consumed by show_psnr().
+struct psnr_stats {
+ // The second element of these arrays is reserved for high bitdepth.
+ // Accumulated squared error; passed to sse_to_psnr() for the overall PSNR.
+ uint64_t psnr_sse_total[2];
+ // Accumulated sample count; passed to sse_to_psnr() for the overall PSNR.
+ uint64_t psnr_samples_total[2];
+ // Four running sums per bitdepth; show_psnr() prints each one divided by
+ // psnr_count under the labels Avg, Y, U and V.
+ double psnr_totals[2][4];
+ // Divisor for psnr_totals (presumably the number of frames accumulated);
+ // show_psnr() prints nothing while psnr_count[0] is 0.
+ int psnr_count[2];
+};
+
+// Prints the PSNR summary stored at index 0 of |psnr_stream| to stderr: the
+// overall value computed by sse_to_psnr() with |peak|, followed by the four
+// per-count averages. Prints nothing when no entries have been counted.
+static void show_psnr(struct psnr_stats *psnr_stream, double peak) {
+ const int num_counted = psnr_stream->psnr_count[0];
+ if (num_counted == 0) return;
+
+ fprintf(stderr, "\nPSNR (Overall/Avg/Y/U/V)");
+ const double overall_psnr =
+ sse_to_psnr((double)psnr_stream->psnr_samples_total[0], peak,
+ (double)psnr_stream->psnr_sse_total[0]);
+ fprintf(stderr, " %.3f", overall_psnr);
+
+ const double *totals = psnr_stream->psnr_totals[0];
+ int k = 0;
+ while (k < 4) {
+ fprintf(stderr, " %.3f", totals[k] / num_counted);
+ ++k;
+ }
+ fprintf(stderr, "\n");
+}
+
+// Builds the rate control configuration from the encoder configuration |cfg|
+// and the application input |app_input|. Only a single spatial and a single
+// temporal layer are configured.
+static aom::AV1RateControlRtcConfig create_rtc_rc_config(
+ const aom_codec_enc_cfg_t &cfg, const AppInput &app_input) {
+ aom::AV1RateControlRtcConfig config;
+
+ // Frame geometry and frame rate.
+ config.width = cfg.g_w;
+ config.height = cfg.g_h;
+ // NOTE(review): the timebase denominator is used directly as the frame
+ // rate; presumably this assumes a timebase numerator of 1 - confirm.
+ config.framerate = cfg.g_timebase.den;
+
+ // Quantizer range and target bitrate.
+ config.max_quantizer = cfg.rc_max_quantizer;
+ config.min_quantizer = cfg.rc_min_quantizer;
+ config.target_bandwidth = cfg.rc_target_bitrate;
+
+ // Buffer model.
+ config.buf_initial_sz = cfg.rc_buf_initial_sz;
+ config.buf_optimal_sz = cfg.rc_buf_optimal_sz;
+ config.buf_sz = cfg.rc_buf_sz;
+
+ // Rate tolerances.
+ config.overshoot_pct = cfg.rc_overshoot_pct;
+ config.undershoot_pct = cfg.rc_undershoot_pct;
+ // This is hardcoded as AOME_SET_MAX_INTRA_BITRATE_PCT
+ config.max_intra_bitrate_pct = 300;
+
+ // TODO(jianj): Add support for SVC.
+ // Single layer: entry 0 mirrors the stream-level settings above.
+ config.ss_number_layers = 1;
+ config.ts_number_layers = 1;
+ config.scaling_factor_num[0] = 1;
+ config.scaling_factor_den[0] = 1;
+ config.layer_target_bitrate[0] = static_cast<int>(config.target_bandwidth);
+ config.max_quantizers[0] = config.max_quantizer;
+ config.min_quantizers[0] = config.min_quantizer;
+
+ config.aq_mode = app_input.aq_mode;
+
+ return config;
+}
+
+// Converts an internal qindex (0-255 range) into the external quantizer
+// (0-63 range): returns the smallest quantizer whose table qindex is greater
+// than or equal to |qindex|, or 63 when |qindex| exceeds every table entry.
+static int qindex_to_quantizer(int qindex) {
+  // Table that converts 0-63 range Q values passed in outside to the 0-255
+  // range Qindex used internally.
+  static const int quantizer_to_qindex[] = {
+    0,   4,   8,   12,  16,  20,  24,  28,   //
+    32,  36,  40,  44,  48,  52,  56,  60,   //
+    64,  68,  72,  76,  80,  84,  88,  92,   //
+    96,  100, 104, 108, 112, 116, 120, 124,  //
+    128, 132, 136, 140, 144, 148, 152, 156,  //
+    160, 164, 168, 172, 176, 180, 184, 188,  //
+    192, 196, 200, 204, 208, 212, 216, 220,  //
+    224, 228, 232, 236, 240, 244, 249, 255,
+  };
+
+  // Skip every entry that is still below the requested qindex.
+  int q = 0;
+  while (q < 64 && quantizer_to_qindex[q] < qindex) ++q;
+
+  return q < 64 ? q : 63;
+}
+
+// Entry point of the real-time SVC example encoder.
+//
+// Parses the command line, configures a CBR real-time encoder with the
+// requested number of spatial/temporal layers, then encodes every input frame
+// once per spatial layer. Each encoded packet is written both to the per-layer
+// output files and to the combined output file (IVF, or raw OBU when
+// output_obu is set). Optionally every packet is also decoded and the
+// reconstructions are compared, and an external rate controller can drive the
+// per-frame quantizer. Rate control, timing and (optionally) PSNR statistics
+// are printed at the end. Returns EXIT_SUCCESS; fatal errors exit via die().
+int main(int argc, const char **argv) {
+  AppInput app_input;
+  AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL };
+  FILE *obu_files[AOM_MAX_LAYERS] = { NULL };
+  AvxVideoWriter *total_layer_file = NULL;
+  FILE *total_layer_obu_file = NULL;
+  aom_codec_enc_cfg_t cfg;
+  int frame_cnt = 0;
+  aom_image_t raw;
+  int frame_avail;
+  int got_data = 0;
+  int flags = 0;
+  int i;
+  int pts = 0;             // PTS starts at 0.
+  int frame_duration = 1;  // 1 timebase tick per frame.
+  aom_svc_layer_id_t layer_id;
+  aom_svc_params_t svc_params;
+  aom_svc_ref_frame_config_t ref_frame_config;
+  aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred;
+
+#if CONFIG_INTERNAL_STATS
+  FILE *stats_file = fopen("opsnr.stt", "a");
+  if (stats_file == NULL) {
+    die("Cannot open opsnr.stt\n");
+  }
+#endif
+#if CONFIG_AV1_DECODER
+  aom_codec_ctx_t decoder;
+#endif
+
+  struct RateControlMetrics rc;
+  int64_t cx_time = 0;
+  int64_t cx_time_layer[AOM_MAX_LAYERS];  // max number of layers.
+  int frame_cnt_layer[AOM_MAX_LAYERS];
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate = 30.0;
+  int use_svc_control = 1;
+  int set_err_resil_frame = 0;
+  int test_changing_bitrate = 0;
+  zero(rc.layer_target_bitrate);
+  memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
+  memset(&app_input, 0, sizeof(AppInput));
+  memset(&svc_params, 0, sizeof(svc_params));
+
+  // Flag to test dynamic scaling of source frames for single
+  // spatial stream, using the scaling_mode control.
+  const int test_dynamic_scaling_single_layer = 0;
+
+  // Flag to test setting speed per layer.
+  const int test_speed_per_layer = 0;
+
+  /* Setup default input stream settings */
+  app_input.input_ctx.framerate.numerator = 30;
+  app_input.input_ctx.framerate.denominator = 1;
+  app_input.input_ctx.only_i420 = 0;
+  app_input.input_ctx.bit_depth = AOM_BITS_8;
+  app_input.speed = 7;
+  exec_name = argv[0];
+
+  // start with default encoder configuration
+  aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
+                                                     AOM_USAGE_REALTIME);
+  if (res != AOM_CODEC_OK) {
+    die("Failed to get config: %s\n", aom_codec_err_to_string(res));
+  }
+
+  // Real time parameters.
+  cfg.g_usage = AOM_USAGE_REALTIME;
+
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+  cfg.rc_resize_mode = 0;  // Set to RESIZE_DYNAMIC for dynamic resize.
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+
+  parse_command_line(argc, argv, &app_input, &svc_params, &cfg);
+
+  int ts_number_layers = svc_params.number_temporal_layers;
+  int ss_number_layers = svc_params.number_spatial_layers;
+
+  unsigned int width = cfg.g_w;
+  unsigned int height = cfg.g_h;
+
+  if (app_input.layering_mode >= 0) {
+    if (ts_number_layers !=
+            mode_to_num_temporal_layers[app_input.layering_mode] ||
+        ss_number_layers !=
+            mode_to_num_spatial_layers[app_input.layering_mode]) {
+      die("Number of layers doesn't match layering mode.");
+    }
+  }
+
+  // Y4M reader has its own allocation.
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
+      die("Failed to allocate image (%dx%d)", width, height);
+    }
+  }
+
+  aom_codec_iface_t *encoder = aom_codec_av1_cx();
+
+  memcpy(&rc.layer_target_bitrate[0], &svc_params.layer_target_bitrate[0],
+         sizeof(svc_params.layer_target_bitrate));
+
+  // The target bitrates of the top temporal layer of every spatial layer must
+  // add up to the overall target bitrate.
+  unsigned int total_rate = 0;
+  for (i = 0; i < ss_number_layers; i++) {
+    total_rate +=
+        svc_params
+            .layer_target_bitrate[i * ts_number_layers + ts_number_layers - 1];
+  }
+  if (total_rate != cfg.rc_target_bitrate) {
+    die("Incorrect total target bitrate");
+  }
+
+  svc_params.framerate_factor[0] = 1;
+  if (ts_number_layers == 2) {
+    svc_params.framerate_factor[0] = 2;
+    svc_params.framerate_factor[1] = 1;
+  } else if (ts_number_layers == 3) {
+    svc_params.framerate_factor[0] = 4;
+    svc_params.framerate_factor[1] = 2;
+    svc_params.framerate_factor[2] = 1;
+  }
+
+  if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) {
+    // Override these settings with the info from Y4M file.
+    cfg.g_w = app_input.input_ctx.width;
+    cfg.g_h = app_input.input_ctx.height;
+    // g_timebase is the reciprocal of frame rate.
+    cfg.g_timebase.num = app_input.input_ctx.framerate.denominator;
+    cfg.g_timebase.den = app_input.input_ctx.framerate.numerator;
+  }
+  // Divide in floating point: an integer division would truncate fractional
+  // frame rates (e.g. 30000/1001 would become 29).
+  framerate = (double)cfg.g_timebase.den / cfg.g_timebase.num;
+  set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
+
+  AvxVideoInfo info;
+  info.codec_fourcc = get_fourcc_by_aom_encoder(encoder);
+  info.frame_width = cfg.g_w;
+  info.frame_height = cfg.g_h;
+  info.time_base.numerator = cfg.g_timebase.num;
+  info.time_base.denominator = cfg.g_timebase.den;
+  // Open an output file for each stream.
+  for (int sl = 0; sl < ss_number_layers; ++sl) {
+    for (int tl = 0; tl < ts_number_layers; ++tl) {
+      i = sl * ts_number_layers + tl;
+      char file_name[PATH_MAX];
+      snprintf(file_name, sizeof(file_name), "%s_%d.av1",
+               app_input.output_filename, i);
+      if (app_input.output_obu) {
+        obu_files[i] = fopen(file_name, "wb");
+        if (!obu_files[i]) die("Failed to open %s for writing", file_name);
+      } else {
+        outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
+        if (!outfile[i]) die("Failed to open %s for writing", file_name);
+      }
+    }
+  }
+  if (app_input.output_obu) {
+    total_layer_obu_file = fopen(app_input.output_filename, "wb");
+    if (!total_layer_obu_file)
+      die("Failed to open %s for writing", app_input.output_filename);
+  } else {
+    total_layer_file =
+        aom_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+    if (!total_layer_file)
+      die("Failed to open %s for writing", app_input.output_filename);
+  }
+
+  // Initialize codec.
+  aom_codec_ctx_t codec;
+  aom_codec_flags_t flag = 0;
+  flag |= cfg.g_input_bit_depth == AOM_BITS_8 ? 0 : AOM_CODEC_USE_HIGHBITDEPTH;
+  flag |= app_input.show_psnr ? AOM_CODEC_USE_PSNR : 0;
+  if (aom_codec_enc_init(&codec, encoder, &cfg, flag))
+    die_codec(&codec, "Failed to initialize encoder");
+
+#if CONFIG_AV1_DECODER
+  if (app_input.decode) {
+    if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0))
+      die_codec(&decoder, "Failed to initialize decoder");
+  }
+#endif
+
+  aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0);
+  aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_LOOPFILTER_CONTROL, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_WARPED_MOTION, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
+  aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+  aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1);
+
+  // Settings to reduce key frame encoding time.
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_SMOOTH_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ANGLE_DELTA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_FILTER_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1);
+
+  if (cfg.g_threads > 1) {
+    aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS,
+                      (unsigned int)log2(cfg.g_threads));
+  }
+
+  aom_codec_control(&codec, AV1E_SET_TUNE_CONTENT, app_input.tune_content);
+  if (app_input.tune_content == AOM_CONTENT_SCREEN) {
+    aom_codec_control(&codec, AV1E_SET_ENABLE_PALETTE, 1);
+    aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 1);
+    // INTRABC is currently disabled for rt mode, as it's too slow.
+    aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0);
+  }
+
+  if (app_input.use_external_rc) {
+    aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1);
+  }
+
+  aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX);
+
+  aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE,
+                    AOM_FULL_SUPERFRAME_DROP);
+
+  svc_params.number_spatial_layers = ss_number_layers;
+  svc_params.number_temporal_layers = ts_number_layers;
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+    svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+  }
+  for (i = 0; i < ss_number_layers; ++i) {
+    svc_params.scaling_factor_num[i] = 1;
+    svc_params.scaling_factor_den[i] = 1;
+  }
+  if (ss_number_layers == 2) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 2;
+  } else if (ss_number_layers == 3) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 4;
+    svc_params.scaling_factor_num[1] = 1;
+    svc_params.scaling_factor_den[1] = 2;
+  }
+  aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+  // TODO(aomedia:3032): Configure KSVC in fixed mode.
+
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
+  {
+    const int max_intra_size_pct = 300;
+    aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
+
+  for (int lx = 0; lx < ts_number_layers * ss_number_layers; lx++) {
+    cx_time_layer[lx] = 0;
+    frame_cnt_layer[lx] = 0;
+  }
+
+  std::unique_ptr<aom::AV1RateControlRTC> rc_api;
+  if (app_input.use_external_rc) {
+    const aom::AV1RateControlRtcConfig rc_cfg =
+        create_rtc_rc_config(cfg, app_input);
+    rc_api = aom::AV1RateControlRTC::Create(rc_cfg);
+  }
+
+  frame_avail = 1;
+  struct psnr_stats psnr_stream;
+  memset(&psnr_stream, 0, sizeof(psnr_stream));
+  while (frame_avail || got_data) {
+    struct aom_usec_timer timer;
+    frame_avail = read_frame(&(app_input.input_ctx), &raw);
+    // Loop over spatial layers.
+    for (int slx = 0; slx < ss_number_layers; slx++) {
+      aom_codec_iter_t iter = NULL;
+      const aom_codec_cx_pkt_t *pkt;
+      int layer = 0;
+      // Flag for superframe whose base is key. A kf_max_dist of 0 means that
+      // every frame is a key frame; checking it first also avoids a modulo
+      // by zero.
+      int is_key_frame =
+          cfg.kf_max_dist == 0 || (frame_cnt % cfg.kf_max_dist) == 0;
+      // For flexible mode:
+      if (app_input.layering_mode >= 0) {
+        // Set the reference/update flags, layer_id, and reference_map
+        // buffer index.
+        set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
+                          &ref_frame_config, &ref_frame_comp_pred,
+                          &use_svc_control, slx, is_key_frame,
+                          (app_input.layering_mode == 10), app_input.speed);
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+        if (use_svc_control) {
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                            &ref_frame_config);
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                            &ref_frame_comp_pred);
+        }
+        // Set the speed per layer.
+        if (test_speed_per_layer) {
+          int speed_per_layer = 10;
+          if (layer_id.spatial_layer_id == 0) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 6;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 7;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 8;
+          } else if (layer_id.spatial_layer_id == 1) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 7;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 8;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 9;
+          } else if (layer_id.spatial_layer_id == 2) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 8;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 9;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 10;
+          }
+          aom_codec_control(&codec, AOME_SET_CPUUSED, speed_per_layer);
+        }
+      } else {
+        // Only up to 3 temporal layers supported in fixed mode.
+        // Only need to set spatial and temporal layer_id: reference
+        // prediction, refresh, and buffer_idx are set internally.
+        layer_id.spatial_layer_id = slx;
+        layer_id.temporal_layer_id = 0;
+        if (ts_number_layers == 2) {
+          layer_id.temporal_layer_id = (frame_cnt % 2) != 0;
+        } else if (ts_number_layers == 3) {
+          if (frame_cnt % 2 != 0)
+            layer_id.temporal_layer_id = 2;
+          else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0))
+            layer_id.temporal_layer_id = 1;
+        }
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+      }
+
+      if (set_err_resil_frame && cfg.g_error_resilient == 0) {
+        // Set error_resilient per frame: off/0 for base layer and
+        // on/1 for enhancement layer frames.
+        // Note that this can only be done on the fly/per-frame/layer
+        // if the config error_resilience is off/0. See the logic for updating
+        // in set_encoder_config():
+        // tool_cfg->error_resilient_mode =
+        //     cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+        const int err_resil_mode =
+            layer_id.spatial_layer_id > 0 || layer_id.temporal_layer_id > 0;
+        aom_codec_control(&codec, AV1E_SET_ERROR_RESILIENT_MODE,
+                          err_resil_mode);
+      }
+
+      layer = slx * ts_number_layers + layer_id.temporal_layer_id;
+      if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
+
+      if (test_dynamic_scaling_single_layer) {
+        // Example to scale source down by 2x2, then 4x4, and then back up to
+        // 2x2, and then back to original.
+        int frame_2x2 = 200;
+        int frame_4x4 = 400;
+        int frame_2x2up = 600;
+        int frame_orig = 800;
+        if (frame_cnt >= frame_2x2 && frame_cnt < frame_4x4) {
+          // Scale source down by 2x2.
+          struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_4x4 && frame_cnt < frame_2x2up) {
+          // Scale source down by 4x4.
+          struct aom_scaling_mode mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_2x2up && frame_cnt < frame_orig) {
+          // Source back up to 2x2.
+          struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_orig) {
+          // Source back up to original resolution (no scaling).
+          struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        }
+        if (frame_cnt == frame_2x2 || frame_cnt == frame_4x4 ||
+            frame_cnt == frame_2x2up || frame_cnt == frame_orig) {
+          // For dynamic resize testing on single layer: refresh all references
+          // on the resized frame: this is to avoid decode error:
+          // if resize goes down by >= 4x4 then libaom decoder will throw an
+          // error that some reference (even though not used) is beyond the
+          // limit size (must be smaller than 4x4).
+          for (i = 0; i < REF_FRAMES; i++) ref_frame_config.refresh[i] = 1;
+          if (use_svc_control) {
+            aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                              &ref_frame_config);
+            aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                              &ref_frame_comp_pred);
+          }
+        }
+      }
+
+      // Change target_bitrate every other frame.
+      if (test_changing_bitrate && frame_cnt % 2 == 0) {
+        if (frame_cnt < 500)
+          cfg.rc_target_bitrate += 10;
+        else
+          cfg.rc_target_bitrate -= 10;
+        // Do big increase and decrease.
+        if (frame_cnt == 100) cfg.rc_target_bitrate <<= 1;
+        if (frame_cnt == 600) cfg.rc_target_bitrate >>= 1;
+        if (cfg.rc_target_bitrate < 100) cfg.rc_target_bitrate = 100;
+        // Call change_config, or bypass with new control.
+        // res = aom_codec_enc_config_set(&codec, &cfg);
+        if (aom_codec_control(&codec, AV1E_SET_BITRATE_ONE_PASS_CBR,
+                              cfg.rc_target_bitrate))
+          die_codec(&codec, "Failed to SET_BITRATE_ONE_PASS_CBR");
+      }
+
+      if (rc_api) {
+        aom::AV1FrameParamsRTC frame_params;
+        // TODO(jianj): Add support for SVC.
+        frame_params.spatial_layer_id = 0;
+        frame_params.temporal_layer_id = 0;
+        frame_params.frame_type =
+            is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
+        rc_api->ComputeQP(frame_params);
+        const int current_qp = rc_api->GetQP();
+        if (aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS,
+                              qindex_to_quantizer(current_qp))) {
+          die_codec(&codec, "Failed to SET_QUANTIZER_ONE_PASS");
+        }
+      }
+
+      // Do the layer encode.
+      aom_usec_timer_start(&timer);
+      if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
+        die_codec(&codec, "Failed to encode frame");
+      aom_usec_timer_mark(&timer);
+      cx_time += aom_usec_timer_elapsed(&timer);
+      cx_time_layer[layer] += aom_usec_timer_elapsed(&timer);
+      frame_cnt_layer[layer] += 1;
+
+      got_data = 0;
+      // For simulcast (mode 11): write out each spatial layer to the file.
+      int ss_layers_write = (app_input.layering_mode == 11)
+                                ? layer_id.spatial_layer_id + 1
+                                : ss_number_layers;
+      while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
+        switch (pkt->kind) {
+          case AOM_CODEC_CX_FRAME_PKT:
+            for (int sl = layer_id.spatial_layer_id; sl < ss_layers_write;
+                 ++sl) {
+              for (int tl = layer_id.temporal_layer_id; tl < ts_number_layers;
+                   ++tl) {
+                int j = sl * ts_number_layers + tl;
+                if (app_input.output_obu) {
+                  fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                         obu_files[j]);
+                } else {
+                  aom_video_writer_write_frame(
+                      outfile[j],
+                      reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                      pkt->data.frame.sz, pts);
+                }
+                if (sl == layer_id.spatial_layer_id)
+                  rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz;
+              }
+            }
+            got_data = 1;
+            // Write everything into the top layer.
+            if (app_input.output_obu) {
+              fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                     total_layer_obu_file);
+            } else {
+              aom_video_writer_write_frame(
+                  total_layer_file,
+                  reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                  pkt->data.frame.sz, pts);
+            }
+            // Keep count of rate control stats per layer (for non-key).
+            if (!(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+              int j = layer_id.spatial_layer_id * ts_number_layers +
+                      layer_id.temporal_layer_id;
+              assert(j >= 0);
+              rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[j] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
+                  rc.layer_pfb[j];
+              if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
+            }
+
+            if (rc_api) {
+              rc_api->PostEncodeUpdate(pkt->data.frame.sz);
+            }
+            // Update for short-time encoding bitrate states, for moving window
+            // of size rc->window, shifted by rc->window / 2.
+            // Ignore first window segment, due to key frame.
+            // For spatial layers: only do this for top/highest SL.
+            if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) {
+              sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size;
+              if (frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate / rc.window_size) *
+                    (sum_bitrate / rc.window_size);
+                sum_bitrate = 0.0;
+              }
+            }
+            // Second shifted window.
+            if (frame_cnt > rc.window_size + rc.window_size / 2 &&
+                slx == ss_number_layers - 1) {
+              sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              if (frame_cnt > 2 * rc.window_size &&
+                  frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate2 / rc.window_size) *
+                    (sum_bitrate2 / rc.window_size);
+                sum_bitrate2 = 0.0;
+              }
+            }
+
+#if CONFIG_AV1_DECODER
+            if (app_input.decode) {
+              if (aom_codec_decode(
+                      &decoder,
+                      reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                      pkt->data.frame.sz, NULL))
+                die_codec(&decoder, "Failed to decode frame");
+            }
+#endif
+
+            break;
+          case AOM_CODEC_PSNR_PKT:
+            if (app_input.show_psnr) {
+              psnr_stream.psnr_sse_total[0] += pkt->data.psnr.sse[0];
+              psnr_stream.psnr_samples_total[0] += pkt->data.psnr.samples[0];
+              for (int plane = 0; plane < 4; plane++) {
+                psnr_stream.psnr_totals[0][plane] += pkt->data.psnr.psnr[plane];
+              }
+              psnr_stream.psnr_count[0]++;
+            }
+            break;
+          default: break;
+        }
+      }
+#if CONFIG_AV1_DECODER
+      if (got_data && app_input.decode) {
+        // Don't look for mismatch on top spatial and top temporal layers as
+        // they are non reference frames.
+        if ((ss_number_layers > 1 || ts_number_layers > 1) &&
+            !(layer_id.temporal_layer_id > 0 &&
+              layer_id.temporal_layer_id == ts_number_layers - 1)) {
+          if (test_decode(&codec, &decoder, frame_cnt)) {
+#if CONFIG_INTERNAL_STATS
+            fprintf(stats_file, "First mismatch occurred in frame %d\n",
+                    frame_cnt);
+            fclose(stats_file);
+#endif
+            fatal("Mismatch seen");
+          }
+        }
+      }
+#endif
+    }  // loop over spatial layers
+    ++frame_cnt;
+    pts += frame_duration;
+  }
+
+  close_input_file(&(app_input.input_ctx));
+  printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
+                                ts_number_layers);
+
+  printf("\n");
+  for (int slx = 0; slx < ss_number_layers; slx++)
+    for (int tlx = 0; tlx < ts_number_layers; tlx++) {
+      int lx = slx * ts_number_layers + tlx;
+      printf("Per layer encoding time/FPS stats for encoder: %d %d %d %f %f \n",
+             slx, tlx, frame_cnt_layer[lx],
+             (float)cx_time_layer[lx] / (double)(frame_cnt_layer[lx] * 1000),
+             1000000 * (double)frame_cnt_layer[lx] / (double)cx_time_layer[lx]);
+    }
+
+  printf("\n");
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n",
+         frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
+
+  if (app_input.show_psnr) {
+    show_psnr(&psnr_stream, 255.0);
+  }
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy encoder");
+
+#if CONFIG_AV1_DECODER
+  if (app_input.decode) {
+    if (aom_codec_destroy(&decoder))
+      die_codec(&decoder, "Failed to destroy decoder");
+  }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+  fprintf(stats_file, "No mismatch detected in recon buffers\n");
+  fclose(stats_file);
+#endif
+
+  // Try to rewrite the output file headers with the actual frame count (IVF
+  // output), and close the raw OBU output files so that they are flushed and
+  // their handles released (OBU output). Only one of the two kinds is open
+  // for any given layer; the other pointer is still NULL.
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    aom_video_writer_close(outfile[i]);
+    if (obu_files[i]) fclose(obu_files[i]);
+  }
+  aom_video_writer_close(total_layer_file);
+  if (total_layer_obu_file) fclose(total_layer_obu_file);
+
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
+    aom_img_free(&raw);
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c
new file mode 100644
index 0000000000..388f68bd4d
--- /dev/null
+++ b/third_party/aom/examples/twopass_encoder.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Two Pass Encoder
+// ================
+//
+// This is an example of a two pass encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder twice, and writes the compressed
+// frames to disk in IVF format. It builds upon the simple_encoder example.
+//
+// Twopass Variables
+// -----------------
+// Twopass mode needs to track the current pass number and the buffer of
+// statistics packets.
+//
+// Updating The Configuration
+// ---------------------------------
+// In two pass mode, the configuration has to be updated on each pass. The
+// statistics buffer is passed on the last pass.
+//
+// Encoding A Frame
+// ----------------
+// Encoding a frame in two pass mode is identical to the simple encoder
+// example.
+//
+// Processing Statistics Packets
+// -----------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write an IVF frame header, followed by the raw data.
+//
+//
+// Pass Progress Reporting
+// -----------------------------
+// It's sometimes helpful to see when each pass completes.
+//
+//
+// Clean-up
+// -----------------------------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+// Prints the command-line synopsis (prefixed with the executable name stored
+// in exec_name) to stderr and terminates the process with EXIT_FAILURE.
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> "
+          "<infile> <outfile> <limit(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+// First-pass helper: submits |img| (or NULL to flush) to the encoder |ctx| and
+// appends the payload of every AOM_CODEC_STATS_PKT it emits to the growing
+// buffer in |stats|. Exits via die_codec()/die() if encoding or the buffer
+// reallocation fails. Returns 1 if the encoder produced at least one packet of
+// any kind, 0 otherwise.
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  if (aom_codec_encode(ctx, img, pts, duration, flags) != AOM_CODEC_OK)
+    die_codec(ctx, "Failed to get frame stats.");
+
+  int emitted = 0;
+  aom_codec_iter_t it = NULL;
+  for (;;) {
+    const aom_codec_cx_pkt_t *const p = aom_codec_get_cx_data(ctx, &it);
+    if (p == NULL) break;
+    emitted = 1;
+
+    // Only statistics packets carry first-pass data.
+    if (p->kind != AOM_CODEC_STATS_PKT) continue;
+
+    const size_t chunk_size = p->data.twopass_stats.sz;
+    const size_t used = stats->sz;
+    stats->buf = realloc(stats->buf, used + chunk_size);
+    if (!stats->buf) die("Failed to allocate frame stats buffer.");
+    memcpy((uint8_t *)stats->buf + used, p->data.twopass_stats.buf,
+           chunk_size);
+    stats->sz = used + chunk_size;
+  }
+
+  return emitted;
+}
+
+// Final-pass helper: submits |img| (or NULL to flush) to the encoder |ctx| and
+// writes every compressed frame packet it emits to |writer|, printing "K" for
+// key frames and "." for all other frames as progress. Exits via die_codec()
+// if encoding or writing fails. Returns 1 if the encoder produced at least one
+// packet of any kind, 0 otherwise.
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  if (aom_codec_encode(ctx, img, pts, duration, flags) != AOM_CODEC_OK)
+    die_codec(ctx, "Failed to encode frame.");
+
+  int emitted = 0;
+  aom_codec_iter_t it = NULL;
+  for (;;) {
+    const aom_codec_cx_pkt_t *const p = aom_codec_get_cx_data(ctx, &it);
+    if (p == NULL) break;
+    emitted = 1;
+
+    // Anything other than a compressed frame is ignored here.
+    if (p->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+
+    if (!aom_video_writer_write_frame(writer, p->data.frame.buf,
+                                      p->data.frame.sz, p->data.frame.pts))
+      die_codec(ctx, "Failed to write compressed frame.");
+
+    const int is_key = (p->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+    printf(is_key ? "K" : ".");
+    fflush(stdout);
+  }
+
+  return emitted;
+}
+
+// Runs the first pass: creates an encoder from |encoder| and |cfg|, feeds it
+// up to |limit| frames read from |infile| into |raw|, flushes it, destroys it
+// and returns the accumulated first-pass statistics buffer (owned by the
+// caller, to be released with free()).
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             aom_codec_iface_t *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int limit) {
+  aom_fixed_buf_t first_pass_stats = { NULL, 0 };
+  aom_codec_ctx_t enc;
+  int frames_done = 0;
+
+  if (aom_codec_enc_init(&enc, encoder, cfg, 0))
+    die("Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  for (;;) {
+    if (!aom_img_read(raw, infile)) break;
+    if (frames_done >= limit) break;
+    frames_done += 1;
+    get_frame_stats(&enc, raw, frames_done, 1, 0, &first_pass_stats);
+  }
+
+  // Flush encoder: keep draining until it stops producing packets.
+  int more_packets;
+  do {
+    more_packets =
+        get_frame_stats(&enc, NULL, frames_done, 1, 0, &first_pass_stats);
+  } while (more_packets);
+
+  printf("Pass 0 complete. Processed %d frames.\n", frames_done);
+  if (aom_codec_destroy(&enc)) die_codec(&enc, "Failed to destroy codec.");
+
+  return first_pass_stats;
+}
+
+// Runs the final pass: opens |outfile_name| as an IVF file, creates an encoder
+// from |encoder| and |cfg| with cpu-used set to 2, encodes up to |limit|
+// frames read from |infile| into |raw|, flushes the encoder, then destroys it
+// and closes the output file.
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+                  aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg,
+                  int limit) {
+  AvxVideoInfo video_info = { get_fourcc_by_aom_encoder(encoder),
+                              cfg->g_w,
+                              cfg->g_h,
+                              { cfg->g_timebase.num, cfg->g_timebase.den },
+                              0 };
+  aom_codec_ctx_t enc;
+  int frames_done = 0;
+
+  AvxVideoWriter *const ivf =
+      aom_video_writer_open(outfile_name, kContainerIVF, &video_info);
+  if (!ivf) die("Failed to open %s for writing", outfile_name);
+
+  if (aom_codec_enc_init(&enc, encoder, cfg, 0))
+    die("Failed to initialize encoder");
+
+  if (aom_codec_control(&enc, AOME_SET_CPUUSED, 2))
+    die_codec(&enc, "Failed to set cpu-used");
+
+  // Encode frames.
+  for (;;) {
+    if (!aom_img_read(raw, infile)) break;
+    if (frames_done >= limit) break;
+    frames_done += 1;
+    encode_frame(&enc, raw, frames_done, 1, 0, ivf);
+  }
+
+  // Flush encoder: keep draining until it stops producing packets.
+  int more_packets;
+  do {
+    more_packets = encode_frame(&enc, NULL, -1, 1, 0, ivf);
+  } while (more_packets);
+
+  printf("\n");
+
+  if (aom_codec_destroy(&enc)) die_codec(&enc, "Failed to destroy codec.");
+
+  aom_video_writer_close(ivf);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frames_done);
+}
+
+// Two-pass example driver.
+//
+// Usage: <codec> <width> <height> <infile> <outfile> [limit]
+// Validates the arguments, allocates an I420 image of the requested size, runs
+// the first pass over the input to collect statistics, rewinds the input and
+// runs the final pass that writes the IVF output. |limit| is the maximum
+// number of frames to encode (100 when omitted or 0). Returns EXIT_SUCCESS;
+// every error exits through die().
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  int w, h;
+  aom_codec_enc_cfg_t cfg;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  aom_fixed_buf_t stats;
+
+  const int fps = 30;       // TODO(dkovalev) add command line argument
+  const int bitrate = 200;  // kbit/s TODO(dkovalev) add command line argument
+  int limit = 0;
+  exec_name = argv[0];
+
+  // Validate the argument count before reading argv[1..5]: indexing past
+  // argv[argc] is an out-of-bounds read.
+  if (argc < 6) die("Invalid number of arguments");
+
+  const char *const codec_arg = argv[1];
+  const char *const width_arg = argv[2];
+  const char *const height_arg = argv[3];
+  const char *const infile_arg = argv[4];
+  const char *const outfile_arg = argv[5];
+
+  if (argc > 6) limit = (int)strtol(argv[6], NULL, 0);
+
+  if (limit == 0) limit = 100;
+
+  aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  w = (int)strtol(width_arg, NULL, 0);
+  h = (int)strtol(height_arg, NULL, 0);
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image (%dx%d)", w, h);
+
+  printf("Using %s\n", aom_codec_iface_name(encoder));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  // No codec context exists yet at this point, so report the error code
+  // directly instead of querying an uninitialized context with die_codec().
+  if (res)
+    die("Failed to get default codec config: %s", aom_codec_err_to_string(res));
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, limit);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+  free(stats.buf);
+
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/keywords.dox b/third_party/aom/keywords.dox
new file mode 100644
index 0000000000..56f5368900
--- /dev/null
+++ b/third_party/aom/keywords.dox
@@ -0,0 +1,51 @@
+/*!\page rfc2119 RFC2119 Keywords
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
+ NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
+ "OPTIONAL" in this document are to be interpreted as described in
+ <a href="http://www.ietf.org/rfc/rfc2119.txt">RFC 2119.</a>
+
+Specifically, the following definitions are used:
+
+\section MUST
+\anchor REQUIRED
+\anchor SHALL
+ This word, or the terms "REQUIRED" or "SHALL", mean that the
+ definition is an absolute requirement of the specification.
+
+\section MUSTNOT MUST NOT
+\anchor SHALLNOT
+ This phrase, or the phrase "SHALL NOT", mean that the
+ definition is an absolute prohibition of the specification.
+
+\section SHOULD
+\anchor RECOMMENDED
+ This word, or the adjective "RECOMMENDED", mean that there
+ may exist valid reasons in particular circumstances to ignore a
+ particular item, but the full implications must be understood and
+ carefully weighed before choosing a different course.
+
+\section SHOULDNOT SHOULD NOT
+\anchor NOTRECOMMENDED
+ This phrase, or the phrase "NOT RECOMMENDED", mean that
+ there may exist valid reasons in particular circumstances when the
+ particular behavior is acceptable or even useful, but the full
+ implications should be understood and the case carefully weighed
+ before implementing any behavior described with this label.
+
+\section MAY
+\anchor OPTIONAL
+ This word, or the adjective "OPTIONAL", mean that an item is
+ truly optional. One vendor may choose to include the item because a
+ particular marketplace requires it or because the vendor feels that
+ it enhances the product while another vendor may omit the same item.
+ An implementation which does not include a particular option \ref MUST be
+ prepared to interoperate with another implementation which does
+ include the option, though perhaps with reduced functionality. In the
+ same vein an implementation which does include a particular option
+ \ref MUST be prepared to interoperate with another implementation which
+ does not include the option (except, of course, for the feature the
+ option provides.)
+
+
+*/
diff --git a/third_party/aom/libs.doxy_template b/third_party/aom/libs.doxy_template
new file mode 100644
index 0000000000..ba77751a50
--- /dev/null
+++ b/third_party/aom/libs.doxy_template
@@ -0,0 +1,2447 @@
+## Copyright (c) 2020, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+# Doxyfile 1.8.16
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "AOMedia AV1 Codec"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = docs
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description.
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity): The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before file names in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used.
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods with a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous
+# namespaces are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# (including Cygwin) and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER).
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT =
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = NO
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depends on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via Javascript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have Javascript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See
+# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated; together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
+# possible to search using the keyboard; to jump to the search box use
+# <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id to
+# a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when USE_PDFLATEX is not enabled the default is latex; when
+# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
+# the latter case it is overridden by pdflatex. For specific output languages
+# the default may have been set differently; this depends on the implementation
+# of the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = YES
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = letter
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS = *.h
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+#
+# In builds where CONFIG_REALTIME_ONLY is set some functions are #ifdefed out
+# which causes reference failures. Hence for doxygen we set it to 0 here.
+
+PREDEFINED = CONFIG_REALTIME_ONLY=0
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: YES.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/third_party/aom/mainpage.dox b/third_party/aom/mainpage.dox
new file mode 100644
index 0000000000..10924acbcf
--- /dev/null
+++ b/third_party/aom/mainpage.dox
@@ -0,0 +1,68 @@
+/*!\mainpage AOMedia AV1 Codec
+
+ \tableofcontents
+
+ \section aom_sdk AOMedia Codec SDK
+
+ \subsection main_intro Introduction
+ Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
+ applications with the AOM and AV1 video codecs.
+
+ This distribution of the AOMedia Codec SDK includes the following support:
+
+ \if av1_encoder
+ - \ref aom_encoder
+ \endif
+ \if av1_decoder
+ - \ref aom_decoder
+ \endif
+
+
+ \subsection main_startpoints Starting Points
+ - Consult the \ref changelog for a complete list of improvements in this
+ release.
+ \if av1_md_support
+ - [README](\ref LREADME) contains instructions on compiling the sample applications.
+ \else
+ - \ref readme contains instructions on compiling the sample applications.
+ \endif
+ - Read the \ref usage "usage" for a narrative on codec usage.
+ - Read the \ref samples "sample code" for examples of how to interact with the
+ codec.
+ - \ref codec reference
+ \if encoder
+ - \ref encoder reference
+ \endif
+ \if decoder
+ - \ref decoder reference
+ \endif
+ <br>
+
+ \section av1_guide AV1 Developer's Guide
+
+ \if av1_encoder
+ - \ref encoder_guide
+ \endif
+
+ \if av1_decoder
+ - \ref decoder_guide
+ \endif
+ <br>
+
+ \section main_support Support Options & FAQ
+ The AOMedia project is an open source project supported by its community.
+ For questions about this SDK or for help, please visit http://aomedia.org/
+ and email the aomediacodec@jointdevelopment.kavi.com list.
+*/
+
+/*!\page changelog CHANGELOG
+ \verbinclude CHANGELOG
+*/
+
+\ifnot av1_md_support
+/*!\page readme README.md
+ \include README.md
+*/
+\endif
+
+/*!\defgroup codecs Supported Codecs */
diff --git a/third_party/aom/stats/aomstats.c b/third_party/aom/stats/aomstats.c
new file mode 100644
index 0000000000..a006ec030f
--- /dev/null
+++ b/third_party/aom/stats/aomstats.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/aomstats.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "common/tools_common.h"
+
+/* Opens the first-pass stats file `fpf` for the given pass.
+ *
+ * Pass 0 opens the file for writing and leaves the in-memory buffer empty.
+ * Any other pass opens the file for reading and loads its whole contents
+ * into a newly malloc'ed buffer stored in stats->buf.
+ *
+ * Returns nonzero on success; 0 if the file could not be opened for writing
+ * (pass 0) or could not be read completely (later passes). A missing,
+ * unseekable or unallocatable input is reported through fatal(), which is
+ * presumably non-returning (the code after each call relies on that).
+ */
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (pass == 0) {
+    stats->file = fopen(fpf, "wb");
+    stats->buf.sz = 0;
+    stats->buf.buf = NULL;
+    res = (stats->file != NULL);
+  } else {
+    size_t nbytes;
+    long file_sz;
+
+    stats->file = fopen(fpf, "rb");
+
+    if (stats->file == NULL) fatal("First-pass stats file does not exist!");
+
+    if (fseek(stats->file, 0, SEEK_END))
+      fatal("First-pass stats file must be seekable!");
+
+    // ftell() signals failure with -1L. Converting that directly to size_t
+    // would request an enormous buffer, so reject it before using the value.
+    file_sz = ftell(stats->file);
+    if (file_sz < 0) fatal("First-pass stats file must be seekable!");
+
+    stats->buf.sz = stats->buf_alloc_sz = (size_t)file_sz;
+    rewind(stats->file);
+
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+    if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%u bytes)",
+            (unsigned int)stats->buf_alloc_sz);
+
+    // A short read leaves res == 0 so the caller can report the failure.
+    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+    res = (nbytes == stats->buf.sz);
+  }
+
+  return res;
+}
+
+/* Prepares `stats` for in-memory stats handling for the given pass.
+ *
+ * On pass 0 a new, empty 64 KiB buffer is allocated. On any other pass the
+ * buffer already stored in `stats` is reused untouched. In both cases the
+ * cursor (buf_ptr) is reset to the start of the buffer.
+ *
+ * Returns nonzero when a buffer is available, 0 otherwise.
+ */
+int stats_open_mem(stats_io_t *stats, int pass) {
+  stats->pass = pass;
+
+  if (pass == 0) {
+    stats->buf_alloc_sz = 64 * 1024;
+    stats->buf.sz = 0;
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+  }
+
+  stats->buf_ptr = stats->buf.buf;
+  return stats->buf.buf != NULL;
+}
+
+/* Releases the resources held by `stats`.
+ *
+ * The stats buffer is freed only when the pass recorded in `stats` equals
+ * `last_pass`. An open stats file, if any, is always closed and its handle
+ * cleared.
+ */
+void stats_close(stats_io_t *stats, int last_pass) {
+  if (stats->pass == last_pass) free(stats->buf.buf);
+
+  if (stats->file != NULL) {
+    fclose(stats->file);
+    stats->file = NULL;
+  }
+}
+
+/* Appends `len` bytes from `pkt` to the stats output.
+ *
+ * When a stats file is open the bytes are written straight to it (the
+ * fwrite() result is deliberately ignored). Otherwise they are appended to
+ * the in-memory buffer at the cursor, growing the buffer when needed;
+ * a failed growth is reported through fatal().
+ */
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+  if (stats->file) {
+    (void)fwrite(pkt, 1, len, stats->file);
+    return;
+  }
+  assert(stats->buf.sz <= stats->buf_alloc_sz);
+  assert(0 < stats->buf_alloc_sz);
+  if (stats->buf.sz + len > stats->buf_alloc_sz) {
+    // Grow by a factor of 1.5 each time, for amortized constant time.
+    // Also make sure there is enough room for the data.
+    const size_t new_sz =
+        AOMMAX((3 * stats->buf_alloc_sz) / 2, stats->buf.sz + len);
+    // Capture the cursor offset before realloc(): once realloc() succeeds
+    // the old buffer pointer must no longer be used, even for arithmetic.
+    const size_t cursor = (size_t)(stats->buf_ptr - (char *)stats->buf.buf);
+    char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+    if (new_ptr) {
+      stats->buf_ptr = new_ptr + cursor;
+      stats->buf.buf = new_ptr;
+      stats->buf_alloc_sz = new_sz;
+    } else {
+      fatal("Failed to realloc firstpass stats buffer.");
+    }
+  }
+
+  memcpy(stats->buf_ptr, pkt, len);
+  stats->buf.sz += len;
+  stats->buf_ptr += len;
+}
+
+/* Returns a by-value copy of the buffer descriptor held in `stats`. */
+aom_fixed_buf_t stats_get(stats_io_t *stats) {
+  const aom_fixed_buf_t snapshot = stats->buf;
+  return snapshot;
+}
diff --git a/third_party/aom/stats/aomstats.h b/third_party/aom/stats/aomstats.h
new file mode 100644
index 0000000000..b9c71871a0
--- /dev/null
+++ b/third_party/aom/stats/aomstats.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_AOMSTATS_H_
+#define AOM_STATS_AOMSTATS_H_
+
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+ /* Stats data and its size in bytes (buf.buf / buf.sz). */
+ aom_fixed_buf_t buf;
+ /* Pass number supplied when the stats were opened. */
+ int pass;
+ /* Stats file handle; set by stats_open_file(), cleared by stats_close(). */
+ FILE *file;
+ /* Cursor into buf.buf at which stats_write() appends in-memory data. */
+ char *buf_ptr;
+ /* Number of bytes currently allocated for buf.buf. */
+ size_t buf_alloc_sz;
+} stats_io_t;
+
+/* Opens the stats file `fpf`: for writing on pass 0, otherwise for reading
+ * into memory. Returns nonzero on success.
+ */
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+/* Sets up in-memory stats; allocates a new buffer on pass 0. Returns nonzero
+ * when a buffer is available.
+ */
+int stats_open_mem(stats_io_t *stats, int pass);
+/* Closes any open stats file and frees the buffer when the recorded pass
+ * equals `last_pass`.
+ */
+void stats_close(stats_io_t *stats, int last_pass);
+/* Appends `len` bytes from `pkt` to the stats file or in-memory buffer. */
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+/* Returns the buffer descriptor (pointer and size) held in `stats`. */
+aom_fixed_buf_t stats_get(stats_io_t *stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_STATS_AOMSTATS_H_
diff --git a/third_party/aom/stats/rate_hist.c b/third_party/aom/stats/rate_hist.c
new file mode 100644
index 0000000000..d79ebc5ad2
--- /dev/null
+++ b/third_party/aom/stats/rate_hist.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/rate_hist.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdio.h>
+#include <math.h>
+
+#define RATE_BINS 100
+#define HIST_BAR_MAX 40
+
+/* One histogram bin: the value range it covers and how many entries fell
+ * into it.
+ */
+struct hist_bucket {
+ /* Lowest value recorded in this bucket. */
+ int low;
+ /* Highest value recorded in this bucket. */
+ int high;
+ /* Number of entries recorded in this bucket. */
+ int count;
+};
+
+/* State of the bitrate histogram: a circular window of recent frames plus
+ * the RATE_BINS buckets the windowed bitrate is sorted into.
+ */
+struct rate_hist {
+ /* Per-slot frame timestamps; indexed by frame number modulo `samples`. */
+ int64_t *pts;
+ /* Per-slot frame sizes, parallel to `pts`. */
+ int *sz;
+ /* Number of slots in `pts` and `sz`. */
+ int samples;
+ /* Number of frames recorded so far. */
+ int frames;
+ /* Histogram bins; a bin whose `low` is INT_MAX has not been used yet. */
+ struct hist_bucket bucket[RATE_BINS];
+ /* Total number of entries added across all bins. */
+ int total;
+};
+
+/* Allocates and initialises a rate histogram.
+ *
+ * The sample window is sized from cfg->rc_buf_sz and the frame rate `fps`,
+ * and every bucket starts out empty (low == INT_MAX).
+ *
+ * Returns the new histogram, or NULL (after printing a warning to stderr)
+ * when an argument is NULL, `fps` has a zero numerator or denominator, or
+ * an allocation fails.
+ */
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+                                      const aom_rational_t *fps) {
+  struct rate_hist *hist = calloc(1, sizeof(*hist));
+  int bin;
+
+  if (hist != NULL && cfg != NULL && fps != NULL && fps->num != 0 &&
+      fps->den != 0) {
+    // Determine the number of samples in the buffer. Use the file's framerate
+    // to determine the number of frames in rc_buf_sz milliseconds, with an
+    // adjustment (5/4) to account for alt-refs
+    hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+
+    // A zero-sized window would later be used as a modulus; keep it >= 1.
+    if (hist->samples == 0) hist->samples = 1;
+
+    hist->frames = 0;
+    hist->total = 0;
+
+    hist->pts = calloc(hist->samples, sizeof(*hist->pts));
+    hist->sz = calloc(hist->samples, sizeof(*hist->sz));
+
+    if (hist->pts != NULL && hist->sz != NULL) {
+      for (bin = 0; bin < RATE_BINS; ++bin) {
+        hist->bucket[bin].low = INT_MAX;
+        hist->bucket[bin].high = 0;
+        hist->bucket[bin].count = 0;
+      }
+      return hist;
+    }
+  }
+
+  // Shared failure path: warn, release whatever was allocated, give up.
+  fprintf(stderr,
+          "Warning: Unable to allocate buffers required for "
+          "show_rate_histogram().\n"
+          "Continuing without rate histogram feature...\n");
+  destroy_rate_histogram(hist);
+  return NULL;
+}
+
+/* Frees a histogram and its sample arrays. A NULL `hist` is a no-op. */
+void destroy_rate_histogram(struct rate_hist *hist) {
+  if (hist == NULL) return;
+
+  free(hist->sz);
+  free(hist->pts);
+  free(hist);
+}
+
+/* Records one encoded frame packet in the rate histogram.
+ *
+ * The frame's timestamp and size are stored in the circular sample window.
+ * Once the timestamp has reached cfg->rc_buf_initial_sz and a target bitrate
+ * is configured, the average bitrate over the trailing window is computed
+ * and counted in the matching bucket.
+ *
+ * Does nothing when `hist`, `cfg` or `pkt` is NULL.
+ */
+void update_rate_histogram(struct rate_hist *hist,
+                           const aom_codec_enc_cfg_t *cfg,
+                           const aom_codec_cx_pkt_t *pkt) {
+  int i;
+  int idx;
+  int64_t now;
+  int64_t then = 0;
+  int64_t avg_bitrate = 0;
+  int64_t sum_sz = 0;
+
+  // Validate the arguments before anything dereferences cfg or pkt.
+  if (hist == NULL || cfg == NULL || pkt == NULL) return;
+
+  // Frame timestamp scaled by 1000 * timebase (compared against the
+  // millisecond rc_buf_* settings below).
+  now = pkt->data.frame.pts * 1000 * (uint64_t)cfg->g_timebase.num /
+        (uint64_t)cfg->g_timebase.den;
+
+  idx = hist->frames++ % hist->samples;
+  hist->pts[idx] = now;
+  hist->sz[idx] = (int)pkt->data.frame.sz;
+
+  if (now < cfg->rc_buf_initial_sz) return;
+
+  if (!cfg->rc_target_bitrate) return;
+
+  then = now;
+
+  /* Sum the size over the past rc_buf_sz ms */
+  for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) {
+    const int i_idx = (i - 1) % hist->samples;
+
+    then = hist->pts[i_idx];
+    if (now - then > cfg->rc_buf_sz) break;
+    sum_sz += hist->sz[i_idx];
+  }
+
+  // A zero-length window would divide by zero below.
+  if (now == then) return;
+
+  avg_bitrate = sum_sz * 8 * 1000 / (now - then);
+  // Map the rate onto the bins so that the target bitrate lands mid-range,
+  // then clamp to the valid bin indices.
+  idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000));
+  if (idx < 0) idx = 0;
+  if (idx > RATE_BINS - 1) idx = RATE_BINS - 1;
+  if (hist->bucket[idx].low > avg_bitrate)
+    hist->bucket[idx].low = (int)avg_bitrate;
+  if (hist->bucket[idx].high < avg_bitrate)
+    hist->bucket[idx].high = (int)avg_bitrate;
+  hist->bucket[idx].count++;
+  hist->total++;
+}
+
+/* Reduces a list of histogram buckets to at most max_buckets entries.
+ *
+ * While there are too many buckets, the bucket with the smallest count is
+ * merged into one of its neighbours (the one with the smaller count when it
+ * has two) and the list is compacted in place.
+ *
+ * bucket:      bucket array, modified in place.
+ * max_buckets: maximum number of buckets to keep.
+ * num_buckets: in: current bucket count; out: bucket count after merging.
+ *
+ * Returns the count of the fullest remaining bucket.
+ * NOTE(review): the merge keeps the `low` of the lower-indexed bucket and the
+ * `high` of the higher-indexed one, which presumably assumes the buckets are
+ * ordered by ascending value range - confirm against callers.
+ */
+static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
+ int *num_buckets) {
+ int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
+ int buckets;
+ int i;
+
+ assert(bucket != NULL);
+ assert(num_buckets != NULL);
+
+ buckets = *num_buckets;
+
+ /* Find the extrema for this list of buckets */
+ big_bucket = small_bucket = 0;
+ for (i = 0; i < buckets; i++) {
+ if (bucket[i].count < bucket[small_bucket].count) small_bucket = i;
+ if (bucket[i].count > bucket[big_bucket].count) big_bucket = i;
+ }
+
+ /* If we have too many buckets, merge the smallest with an adjacent
+ * bucket.
+ */
+ while (buckets > max_buckets) {
+ int last_bucket = buckets - 1;
+
+ /* merge the small bucket with an adjacent one. */
+ if (small_bucket == 0)
+ merge_bucket = 1;
+ else if (small_bucket == last_bucket)
+ merge_bucket = last_bucket - 1;
+ else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count)
+ merge_bucket = small_bucket - 1;
+ else
+ merge_bucket = small_bucket + 1;
+
+ assert(abs(merge_bucket - small_bucket) <= 1);
+ assert(small_bucket < buckets);
+ assert(big_bucket < buckets);
+ assert(merge_bucket < buckets);
+
+ /* Fold the two buckets into the lower-indexed one; afterwards merge_bucket
+ * names that surviving bucket, and the entry after it is the stale one.
+ */
+ if (merge_bucket < small_bucket) {
+ bucket[merge_bucket].high = bucket[small_bucket].high;
+ bucket[merge_bucket].count += bucket[small_bucket].count;
+ } else {
+ bucket[small_bucket].high = bucket[merge_bucket].high;
+ bucket[small_bucket].count += bucket[merge_bucket].count;
+ merge_bucket = small_bucket;
+ }
+
+ assert(bucket[merge_bucket].low != bucket[merge_bucket].high);
+
+ buckets--;
+
+ /* Remove the merge_bucket from the list, and find the new small
+ * and big buckets while we're at it
+ */
+ big_bucket = small_bucket = 0;
+ for (i = 0; i < buckets; i++) {
+ /* Shift every entry after the surviving bucket down by one slot. */
+ if (i > merge_bucket) bucket[i] = bucket[i + 1];
+
+ if (bucket[i].count < bucket[small_bucket].count) small_bucket = i;
+ if (bucket[i].count > bucket[big_bucket].count) big_bucket = i;
+ }
+ }
+
+ *num_buckets = buckets;
+ return bucket[big_bucket].count;
+}
+
+/* Prints a histogram to stderr as one text bar per bucket.
+ *
+ * bucket:  buckets to print.
+ * buckets: number of entries in `bucket`; nothing is printed when it is 0.
+ * total:   divisor for the percentage column.
+ * scale:   count that corresponds to a full-width bar (HIST_BAR_MAX).
+ */
+static void show_histogram(const struct hist_bucket *bucket, int buckets,
+                           int total, int scale) {
+  int width1, width2;
+  int digits = 1;
+  int remaining;
+  int i;
+
+  if (!buckets) return;
+  assert(bucket != NULL);
+  assert(buckets > 0);
+
+  // Size the label columns from the number of decimal digits in the last
+  // bucket's upper bound. Counting digits with integer division avoids
+  // log(0) (undefined when converted to int) and the rounding error of
+  // log(x) / log(10) on exact powers of ten.
+  for (remaining = bucket[buckets - 1].high; remaining >= 10; remaining /= 10)
+    digits++;
+
+  if (digits <= 2) {
+    width1 = 4;
+    width2 = 2;
+  } else if (digits <= 7) {
+    width1 = digits + 2;
+    width2 = digits;
+  } else {
+    width1 = 12;
+    width2 = 10;
+  }
+
+  for (i = 0; i < buckets; i++) {
+    int len;
+    int j;
+    float pct;
+
+    pct = (float)(100.0 * bucket[i].count / total);
+    len = HIST_BAR_MAX * bucket[i].count / scale;
+    // Every printed bucket gets at least one bar character.
+    if (len < 1) len = 1;
+    assert(len <= HIST_BAR_MAX);
+
+    if (bucket[i].low == bucket[i].high)
+      fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, "");
+    else
+      fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2,
+              bucket[i].high);
+
+    for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
+    fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
+  }
+}
+
+/* Prints a histogram of quantizer usage to stderr.
+ *
+ * counts:      number of uses of each of the 64 quantizer values.
+ * max_buckets: maximum number of histogram rows; neighbouring values are
+ *              merged until the histogram fits.
+ */
+void show_q_histogram(const int counts[64], int max_buckets) {
+  // Zero-initialised so that merge_hist_buckets() never reads an
+  // indeterminate bucket[0].count when every entry of `counts` is zero.
+  struct hist_bucket bucket[64] = { { 0, 0, 0 } };
+  int buckets = 0;
+  int total = 0;
+  int scale;
+  int i;
+
+  // Build one single-value bucket per quantizer that was actually used.
+  for (i = 0; i < 64; i++) {
+    if (counts[i]) {
+      bucket[buckets].low = bucket[buckets].high = i;
+      bucket[buckets].count = counts[i];
+      buckets++;
+      total += counts[i];
+    }
+  }
+
+  fprintf(stderr, "\nQuantizer Selection:\n");
+  scale = merge_hist_buckets(bucket, max_buckets, &buckets);
+  show_histogram(bucket, buckets, total, scale);
+}
+
+/* Prints the collected bitrate histogram to stderr.
+ *
+ * Unused bins are squeezed out of hist->bucket in place, the remainder is
+ * merged down to at most `max_buckets` rows, and the result is printed.
+ * Does nothing when `hist` or `cfg` is NULL.
+ */
+void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
+                         int max_buckets) {
+  int used = 0;
+  int src;
+  int peak;
+
+  if (!hist || !cfg) return;
+
+  // Compact the bins that received at least one entry to the array's front.
+  for (src = 0; src < RATE_BINS; ++src) {
+    if (hist->bucket[src].low != INT_MAX) {
+      hist->bucket[used] = hist->bucket[src];
+      ++used;
+    }
+  }
+
+  fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz);
+  peak = merge_hist_buckets(hist->bucket, max_buckets, &used);
+  show_histogram(hist->bucket, used, hist->total, peak);
+}
diff --git a/third_party/aom/stats/rate_hist.h b/third_party/aom/stats/rate_hist.h
new file mode 100644
index 0000000000..55b8c5d439
--- /dev/null
+++ b/third_party/aom/stats/rate_hist.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_RATE_HIST_H_
+#define AOM_STATS_RATE_HIST_H_
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque rate histogram state; defined in rate_hist.c. */
+struct rate_hist;
+
+/* Allocates a rate histogram sized from cfg->rc_buf_sz and `fps`. Returns
+ * NULL (after printing a warning) on invalid arguments or allocation failure.
+ */
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+ const aom_rational_t *fps);
+
+/* Frees a histogram returned by init_rate_histogram(); NULL is accepted. */
+void destroy_rate_histogram(struct rate_hist *hist);
+
+/* Records the timestamp and size of the frame packet `pkt` in `hist`. */
+void update_rate_histogram(struct rate_hist *hist,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt);
+
+/* Prints a histogram of the 64 quantizer usage counts to stderr, using at
+ * most max_buckets rows.
+ */
+void show_q_histogram(const int counts[64], int max_buckets);
+
+/* Prints the bitrate histogram collected in `hist` to stderr, using at most
+ * max_buckets rows.
+ */
+void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
+ int max_buckets);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_STATS_RATE_HIST_H_
diff --git a/third_party/aom/test/accounting_test.cc b/third_party/aom/test/accounting_test.cc
new file mode 100644
index 0000000000..033499d13b
--- /dev/null
+++ b/third_party/aom/test/accounting_test.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+TEST(AV1, TestAccounting) {
+ const int kBufferSize = 10000;
+ const int kSymbols = 1024;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 0, 32);
+ aom_write(&bw, 0, 32);
+ aom_write(&bw, 0, 32);
+ }
+ GTEST_ASSERT_GE(aom_stop_encode(&bw), 0);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+
+ Accounting accounting;
+ aom_accounting_init(&accounting);
+ br.accounting = &accounting;
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, 32, "A");
+ }
+ // Consecutive symbols that are the same are coalesced.
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, 1);
+ GTEST_ASSERT_EQ(accounting.syms.syms[0].samples, (unsigned int)kSymbols);
+
+ aom_accounting_reset(&accounting);
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
+
+ // Should record 2 * kSymbols accounting symbols.
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ br.accounting = &accounting;
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, 32, "A");
+ aom_read(&br, 32, "B");
+ aom_read(&br, 32, "B");
+ }
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, kSymbols * 2);
+ uint32_t tell_frac = aom_reader_tell_frac(&br);
+ for (int i = 0; i < accounting.syms.num_syms; i++) {
+ tell_frac -= accounting.syms.syms[i].bits;
+ }
+ GTEST_ASSERT_EQ(tell_frac, 0U);
+
+ GTEST_ASSERT_EQ(aom_accounting_dictionary_lookup(&accounting, "A"),
+ aom_accounting_dictionary_lookup(&accounting, "A"));
+
+ // Check for collisions. The current aom_accounting_hash function returns
+ // the same hash code for AB and BA.
+ GTEST_ASSERT_NE(aom_accounting_dictionary_lookup(&accounting, "AB"),
+ aom_accounting_dictionary_lookup(&accounting, "BA"));
+}
diff --git a/third_party/aom/test/acm_random.h b/third_party/aom/test/acm_random.h
new file mode 100644
index 0000000000..15e8c9cc2e
--- /dev/null
+++ b/third_party/aom/test/acm_random.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_ACM_RANDOM_H_
+#define AOM_TEST_ACM_RANDOM_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+
+namespace libaom_test {
+
+class ACMRandom {
+ public:
+ ACMRandom() : random_(DeterministicSeed()) {}
+
+ explicit ACMRandom(int seed) : random_(seed) {}
+
+ void Reset(int seed) { random_.Reseed(seed); }
+
+ // Generates a random 31-bit unsigned integer from [0, 2^31).
+ uint32_t Rand31() {
+ return random_.Generate(testing::internal::Random::kMaxRange);
+ }
+
+ uint16_t Rand16() {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 15) & 0xffff;
+ }
+
+ int16_t Rand16Signed() { return static_cast<int16_t>(Rand16()); }
+
+ int16_t Rand15() {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 16) & 0x7fff;
+ }
+
+ int16_t Rand15Signed() {
+ // Use 15 bits: values between 16383 (0x3FFF) and -16384 (0xC000).
+ return static_cast<int16_t>(Rand15()) - (1 << 14);
+ }
+
+ uint16_t Rand12() {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 19) & 0xfff;
+ }
+
+ uint8_t Rand8() {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 23) & 0xff;
+ }
+
+ uint8_t Rand8Extremes() {
+ // Returns a random value near 0 or near 255, to better exercise
+ // saturation behavior.
+ const uint8_t r = Rand8();
+ return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+ }
+
+ int PseudoUniform(int range) { return random_.Generate(range); }
+
+ int operator()(int n) { return PseudoUniform(n); }
+
+ static int DeterministicSeed() { return 0xbaba; }
+
+ private:
+ testing::internal::Random random_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_ACM_RANDOM_H_
diff --git a/third_party/aom/test/active_map_test.cc b/third_party/aom/test/active_map_test.cc
new file mode 100644
index 0000000000..979ee6b8b3
--- /dev/null
+++ b/third_party/aom/test/active_map_test.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class ActiveMapTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ static const int kWidth = 208;
+ static const int kHeight = 144;
+
+ ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
+ ~ActiveMapTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(GET_PARAM(1));
+ cpu_used_ = GET_PARAM(2);
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+ encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+ encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+ } else if (video->frame() == 3) {
+ aom_active_map_t map = aom_active_map_t();
+ /* clang-format off */
+ uint8_t active_map[9 * 13] = {
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+ 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+ };
+ /* clang-format on */
+ map.cols = (kWidth + 15) / 16;
+ map.rows = (kHeight + 15) / 16;
+ ASSERT_EQ(map.cols, 13u);
+ ASSERT_EQ(map.rows, 9u);
+ map.active_map = active_map;
+ encoder->Control(AOME_SET_ACTIVEMAP, &map);
+ } else if (video->frame() == 15) {
+ aom_active_map_t map = aom_active_map_t();
+ map.cols = (kWidth + 15) / 16;
+ map.rows = (kHeight + 15) / 16;
+ map.active_map = nullptr;
+ encoder->Control(AOME_SET_ACTIVEMAP, &map);
+ }
+ }
+
+ void DoTest() {
+ // Validate that this non multiple of 64 wide clip encodes
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_resize_mode = 0;
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.kf_max_dist = 90000;
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+ 1, 0, 20);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ int cpu_used_;
+};
+
+TEST_P(ActiveMapTest, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(5, 9));
+
+} // namespace
diff --git a/third_party/aom/test/allintra_end_to_end_test.cc b/third_party/aom/test/allintra_end_to_end_test.cc
new file mode 100644
index 0000000000..8ec24aa686
--- /dev/null
+++ b/third_party/aom/test/allintra_end_to_end_test.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 20;
+const int kBitrate = 500;
+typedef struct {
+ const char *filename;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+ return os << "TestVideoParam { filename:" << test_arg.filename
+ << " input_bit_depth:" << test_arg.input_bit_depth
+ << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+ << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+ { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Params: video, speed, deltaq mode, threads, tile columns, tx size search.
+class AllIntraEndToEndTest
+ : public ::libaom_test::CodecTestWith6Params<TestVideoParam, int, int, int,
+ int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AllIntraEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0),
+ deltaq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
+ tile_columns_(GET_PARAM(5)), enable_tx_size_search_(GET_PARAM(6)) {}
+
+ ~AllIntraEndToEndTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kAllIntra);
+ cfg_.g_threads = threads_;
+ }
+
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_ROW_MT, 1);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
+ encoder->Control(AV1E_SET_ENABLE_TX_SIZE_SEARCH, enable_tx_size_search_);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ void DoTest() {
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ std::unique_ptr<libaom_test::VideoSource> video;
+ if (is_extension_y4m(test_video_param_.filename))
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ else
+ video.reset(new libaom_test::YUVVideoSource(test_video_param_.filename,
+ test_video_param_.fmt, 352,
+ 288, 30, 1, 0, kFrames));
+ ASSERT_NE(video, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ }
+
+ TestVideoParam test_video_param_;
+ int cpu_used_;
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+ unsigned int deltaq_mode_;
+ int threads_;
+ int tile_columns_;
+ int enable_tx_size_search_;
+};
+
+TEST_P(AllIntraEndToEndTest, EndToEndNoFailure) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(AllIntraEndToEndTest,
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Range(5, 9), ::testing::Range(0, 4),
+ ::testing::Values(1), ::testing::Values(1),
+ ::testing::Values(0, 1));
+
+INSTANTIATE_TEST_SUITE_P(
+ AV1MultiThreaded, AllIntraEndToEndTest,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::ValuesIn(kTestVectors), ::testing::Range(5, 9),
+ ::testing::Range(0, 4), ::testing::Values(6), ::testing::Values(1),
+ ::testing::Values(0, 1)));
+
+} // namespace
diff --git a/third_party/aom/test/altref_test.cc b/third_party/aom/test/altref_test.cc
new file mode 100644
index 0000000000..081123cbe4
--- /dev/null
+++ b/third_party/aom/test/altref_test.cc
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+typedef struct {
+ const unsigned int min_kf_dist;
+ const unsigned int max_kf_dist;
+ const unsigned int min_gf_interval;
+ const unsigned int max_gf_interval;
+ const unsigned int lag_in_frames;
+ libaom_test::TestMode encoding_mode;
+} AltRefTestParams;
+
+static const AltRefTestParams TestParams[] = {
+ { 0, 10, 4, 8, 10, ::libaom_test::kOnePassGood },
+ { 0, 30, 8, 12, 16, ::libaom_test::kOnePassGood },
+ { 30, 30, 12, 16, 25, ::libaom_test::kOnePassGood },
+ { 0, 60, 12, 20, 25, ::libaom_test::kOnePassGood },
+ { 60, 60, 16, 28, 30, ::libaom_test::kOnePassGood },
+ { 0, 100, 16, 32, 35, ::libaom_test::kOnePassGood },
+ { 0, 10, 4, 8, 10, ::libaom_test::kTwoPassGood },
+ { 0, 30, 8, 12, 16, ::libaom_test::kTwoPassGood },
+ { 30, 30, 12, 16, 25, ::libaom_test::kTwoPassGood },
+ { 0, 60, 16, 24, 25, ::libaom_test::kTwoPassGood },
+ { 60, 60, 20, 28, 30, ::libaom_test::kTwoPassGood },
+ { 0, 100, 24, 32, 35, ::libaom_test::kTwoPassGood },
+};
+
+std::ostream &operator<<(std::ostream &os, const AltRefTestParams &test_arg) {
+ return os << "AltRefTestParams { min_kf_dist:" << test_arg.min_kf_dist
+ << " max_kf_dist:" << test_arg.max_kf_dist
+ << " min_gf_interval:" << test_arg.min_gf_interval
+ << " max_gf_interval:" << test_arg.max_gf_interval
+ << " lag_in_frames:" << test_arg.lag_in_frames
+ << " encoding_mode:" << test_arg.encoding_mode << " }";
+}
+
+// This class is used to check the presence of altref frame.
+class AltRefFramePresenceTestLarge
+ : public ::libaom_test::CodecTestWith2Params<AltRefTestParams, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AltRefFramePresenceTestLarge()
+ : EncoderTest(GET_PARAM(0)), altref_test_params_(GET_PARAM(1)),
+ rc_end_usage_(GET_PARAM(2)) {
+ is_arf_frame_present_ = 0;
+ }
+ ~AltRefFramePresenceTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(altref_test_params_.encoding_mode);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = rc_end_usage_;
+ cfg_.g_threads = 1;
+ cfg_.kf_min_dist = altref_test_params_.min_kf_dist;
+ cfg_.kf_max_dist = altref_test_params_.max_kf_dist;
+ cfg_.g_lag_in_frames = altref_test_params_.lag_in_frames;
+ }
+
+ bool DoDecode() const override { return true; }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+ altref_test_params_.min_gf_interval);
+ encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+ altref_test_params_.max_gf_interval);
+ }
+ }
+
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (is_arf_frame_present_ != 1 && AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_ALTREF_PRESENT,
+ &is_arf_frame_present_);
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ const AltRefTestParams altref_test_params_;
+ int is_arf_frame_present_;
+ aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(AltRefFramePresenceTestLarge, AltRefFrameEncodePresenceTest) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 100);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(is_arf_frame_present_, 1);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AltRefFramePresenceTestLarge,
+ ::testing::ValuesIn(TestParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+typedef struct {
+ const ::libaom_test::TestMode encoding_mode;
+ const unsigned int min_gf_interval;
+ const unsigned int max_gf_interval;
+} gfIntervalParam;
+
+const gfIntervalParam gfTestParams[] = {
+ // single pass
+ { ::libaom_test::kOnePassGood, 0, 6 },
+ { ::libaom_test::kOnePassGood, 0, 8 },
+ { ::libaom_test::kOnePassGood, 5, 10 },
+ { ::libaom_test::kOnePassGood, 8, 16 },
+ { ::libaom_test::kOnePassGood, 16, 16 },
+
+ // two pass
+ { ::libaom_test::kTwoPassGood, 0, 6 },
+ { ::libaom_test::kTwoPassGood, 0, 8 },
+ { ::libaom_test::kTwoPassGood, 5, 10 },
+ { ::libaom_test::kTwoPassGood, 8, 16 },
+ { ::libaom_test::kTwoPassGood, 16, 32 },
+ { ::libaom_test::kTwoPassGood, 20, 32 },
+};
+
+// This class is used to test if the gf interval bounds configured by the user
+// are respected by the encoder.
+class GoldenFrameIntervalTestLarge
+ : public ::libaom_test::CodecTestWith2Params<gfIntervalParam, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ GoldenFrameIntervalTestLarge()
+ : EncoderTest(GET_PARAM(0)), gf_interval_param_(GET_PARAM(1)),
+ rc_end_usage_(GET_PARAM(2)) {
+ baseline_gf_interval_ = -1;
+ limit_ = 60;
+ frame_num_ = 0;
+ }
+ ~GoldenFrameIntervalTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(gf_interval_param_.encoding_mode);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = rc_end_usage_;
+ cfg_.g_threads = 1;
+ // kf_min_dist is equal to kf_max_dist to make sure that there are no scene
+ // cuts due to which the min_gf_interval may not be respected.
+ cfg_.kf_min_dist = limit_;
+ cfg_.kf_max_dist = limit_;
+ cfg_.g_limit = limit_;
+ cfg_.g_lag_in_frames = 35;
+ cfg_.rc_target_bitrate = 1000;
+ }
+
+ bool DoDecode() const override { return true; }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AV1E_SET_MIN_GF_INTERVAL,
+ gf_interval_param_.min_gf_interval);
+ encoder->Control(AV1E_SET_MAX_GF_INTERVAL,
+ gf_interval_param_.max_gf_interval);
+ }
+ if (frame_num_ > 0) {
+ encoder->Control(AV1E_GET_BASELINE_GF_INTERVAL, &baseline_gf_interval_);
+ ASSERT_LE(baseline_gf_interval_,
+ (int)gf_interval_param_.max_gf_interval + 1);
+ if ((frame_num_ + (int)gf_interval_param_.min_gf_interval) <= limit_) {
+ ASSERT_GE(baseline_gf_interval_,
+ (int)gf_interval_param_.min_gf_interval);
+ }
+ }
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ (void)pkt;
+ ++frame_num_;
+ }
+
+ const gfIntervalParam gf_interval_param_;
+ int baseline_gf_interval_;
+ int limit_;
+ int frame_num_;
+ aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(GoldenFrameIntervalTestLarge, GoldenFrameIntervalTest) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, limit_);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(GoldenFrameIntervalTestLarge,
+ ::testing::ValuesIn(gfTestParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ, AOM_CBR));
+
+} // namespace
diff --git a/third_party/aom/test/aom_image_test.cc b/third_party/aom/test/aom_image_test.cc
new file mode 100644
index 0000000000..ad48e73e3d
--- /dev/null
+++ b/third_party/aom/test/aom_image_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(AomImageTest, AomImgWrapInvalidAlign) {
+ const int kWidth = 128;
+ const int kHeight = 128;
+ unsigned char buf[kWidth * kHeight * 3];
+
+ aom_image_t img;
+ // Set img_data and img_data_owner to junk values. aom_img_wrap() should
+ // not read these values on failure.
+ img.img_data = (unsigned char *)"";
+ img.img_data_owner = 1;
+
+ aom_img_fmt_t format = AOM_IMG_FMT_I444;
+ // 'align' must be a power of 2 but is not. This causes the aom_img_wrap()
+ // call to fail. The test verifies we do not read the junk values in 'img'.
+ unsigned int align = 31;
+ EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr);
+}
+
+// Checks that aom_img_set_rect() returns 0 for a rectangle covering the whole
+// wrapped image and a non-zero value when the x/y offsets are UINT_MAX.
+TEST(AomImageTest, AomImgSetRectOverflow) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  aom_image_t img;
+  aom_img_fmt_t format = AOM_IMG_FMT_I444;
+  unsigned int align = 32;
+  // Fatal on failure: 'img' is only initialized by a successful wrap, and
+  // every call below operates on it.
+  ASSERT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), &img);
+
+  EXPECT_EQ(aom_img_set_rect(&img, 0, 0, kWidth, kHeight, 0), 0);
+  // This would result in overflow because -1 is cast to UINT_MAX.
+  EXPECT_NE(aom_img_set_rect(&img, static_cast<unsigned int>(-1),
+                             static_cast<unsigned int>(-1), kWidth, kHeight, 0),
+            0);
+}
+
+// Checks the plane layout reported after aom_img_alloc() with the NV12
+// format: the U stride equals the Y stride, and the V plane has a zero stride
+// and a null pointer.
+TEST(AomImageTest, AomImgAllocNv12) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+
+  aom_image_t img;
+  aom_img_fmt_t format = AOM_IMG_FMT_NV12;
+  unsigned int align = 32;
+  // Fatal on failure: the field checks and aom_img_free() below must not run
+  // on an 'img' that was never filled in.
+  ASSERT_NE(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr);
+  EXPECT_EQ(img.stride[AOM_PLANE_U], img.stride[AOM_PLANE_Y]);
+  EXPECT_EQ(img.stride[AOM_PLANE_V], 0);
+  EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr);
+  aom_img_free(&img);
+}
diff --git a/third_party/aom/test/aom_integer_test.cc b/third_party/aom/test/aom_integer_test.cc
new file mode 100644
index 0000000000..fcbbfb4d48
--- /dev/null
+++ b/third_party/aom/test/aom_integer_test.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+const uint64_t kMaximumLeb128CodedSize = 8;
+const uint8_t kLeb128PadByte = 0x80; // Binary: 10000000
+const uint64_t kMaximumLeb128Value = UINT32_MAX;
+const uint32_t kSizeTestNumValues = 6;
+const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = {
+ 1, 1, 2, 3, 4, 5
+};
+const uint64_t kSizeTestInputs[kSizeTestNumValues] = { 0, 0x7f,
+ 0x3fff, 0x1fffff,
+ 0xffffff, 0x10000000 };
+
+const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80,
+ 0x10 }; // UINT32_MAX + 1
+} // namespace
+
+// Decodes a known 3-byte LEB128 sequence and checks the decoded value and the
+// number of bytes consumed, first with exactly the encoded bytes available and
+// then with one extra byte available past the terminator.
+TEST(AomLeb128, DecodeTest) {
+  const size_t num_leb128_bytes = 3;
+  // The trailing 0x00 is padding only: it keeps the buffer as large as the
+  // 'available' count passed to the second decode below.
+  const uint8_t leb128_bytes[num_leb128_bytes + 1] = { 0xE5, 0x8E, 0x26,
+                                                       0x00 };
+  const uint64_t expected_value = 0x98765;  // 624485
+  const size_t expected_length = 3;
+  uint64_t value = ~0ULL;  // make sure value is cleared by the function
+  size_t length = 0;
+  ASSERT_EQ(
+      aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes, &value, &length), 0);
+  ASSERT_EQ(expected_value, value);
+  ASSERT_EQ(expected_length, length);
+
+  // Make sure the decoder stops on the last marked LEB128 byte. Reset the
+  // outputs first so stale results from the decode above cannot satisfy the
+  // assertions that follow.
+  value = ~0ULL;
+  length = 0;
+  ASSERT_EQ(aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes + 1, &value,
+                            &length),
+            0);
+  ASSERT_EQ(expected_value, value);
+  ASSERT_EQ(expected_length, length);
+}
+
+TEST(AomLeb128, EncodeTest) {
+ const uint32_t test_value = 0x98765; // 624485
+ const uint8_t expected_bytes[3] = { 0xE5, 0x8E, 0x26 };
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(aom_uleb_encode(test_value, kWriteBufferSize, &write_buffer[0],
+ &bytes_written),
+ 0);
+ ASSERT_EQ(bytes_written, 3u);
+ for (size_t i = 0; i < bytes_written; ++i) {
+ ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+ }
+}
+
+// Round-trips a value through aom_uleb_encode() and aom_uleb_decode() and
+// checks that the value and the byte count come back unchanged.
+TEST(AomLeb128, EncodeDecodeTest) {
+  const uint32_t value = 0x98765;  // 624485
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(aom_uleb_encode(value, kWriteBufferSize, &write_buffer[0],
+                            &bytes_written),
+            0);
+  ASSERT_EQ(bytes_written, 3u);
+  uint64_t decoded_value = ~0ULL;
+  size_t decoded_length = 0;
+  // Check the decode status before inspecting its outputs; they are only
+  // meaningful when the call reports success (0).
+  ASSERT_EQ(aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+                            &decoded_length),
+            0);
+  ASSERT_EQ(value, decoded_value);
+  ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, FixedSizeEncodeTest) {
+ const uint32_t test_value = 0x123;
+ const uint8_t expected_bytes[4] = { 0xa3, 0x82, 0x80, 0x00 };
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(0, aom_uleb_encode_fixed_size(test_value, kWriteBufferSize,
+ kWriteBufferSize, &write_buffer[0],
+ &bytes_written));
+ ASSERT_EQ(kWriteBufferSize, bytes_written);
+ for (size_t i = 0; i < bytes_written; ++i) {
+ ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+ }
+}
+
+// Round-trips a small value through aom_uleb_encode_fixed_size(), padded to
+// the full 4-byte buffer, and aom_uleb_decode(); the decoder must report the
+// original value and consume all 4 bytes.
+TEST(AomLeb128, FixedSizeEncodeDecodeTest) {
+  const uint32_t value = 0x1;
+  const size_t kWriteBufferSize = 4;
+  uint8_t write_buffer[kWriteBufferSize] = { 0 };
+  size_t bytes_written = 0;
+  ASSERT_EQ(
+      aom_uleb_encode_fixed_size(value, kWriteBufferSize, kWriteBufferSize,
+                                 &write_buffer[0], &bytes_written),
+      0);
+  ASSERT_EQ(bytes_written, 4u);
+  uint64_t decoded_value = ~0ULL;
+  size_t decoded_length = 0;
+  // Check the decode status before inspecting its outputs; they are only
+  // meaningful when the call reports success (0).
+  ASSERT_EQ(aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+                            &decoded_length),
+            0);
+  ASSERT_EQ(value, decoded_value);
+  ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, SizeTest) {
+ for (size_t i = 0; i < kSizeTestNumValues; ++i) {
+ ASSERT_EQ(kSizeTestExpectedSizes[i],
+ aom_uleb_size_in_bytes(kSizeTestInputs[i]));
+ }
+}
+
+TEST(AomLeb128, DecodeFailTest) {
+ // Input buffer containing what would be a valid 9 byte LEB128 encoded
+ // unsigned integer.
+ const uint8_t kAllPadBytesBuffer[kMaximumLeb128CodedSize + 1] = {
+ kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+ kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+ kLeb128PadByte, kLeb128PadByte, 0
+ };
+ uint64_t decoded_value;
+
+ // Test that decode fails when result would be valid 9 byte integer.
+ ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize + 1,
+ &decoded_value, nullptr),
+ -1);
+
+ // Test that encoded value missing terminator byte within available buffer
+ // range causes decode error.
+ ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize,
+ &decoded_value, nullptr),
+ -1);
+
+ // Test that LEB128 input that decodes to a value larger than 32-bits fails.
+ size_t value_size = 0;
+ ASSERT_EQ(aom_uleb_decode(&kOutOfRangeLeb128Value[0],
+ sizeof(kOutOfRangeLeb128Value), &decoded_value,
+ &value_size),
+ -1);
+}
+
+TEST(AomLeb128, EncodeFailTest) {
+ const size_t kWriteBufferSize = 4;
+ const uint32_t kValidTestValue = 1;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t coded_size = 0;
+ ASSERT_EQ(
+ aom_uleb_encode(kValidTestValue, kWriteBufferSize, nullptr, &coded_size),
+ -1);
+ ASSERT_EQ(aom_uleb_encode(kValidTestValue, kWriteBufferSize, &write_buffer[0],
+ nullptr),
+ -1);
+
+ const uint32_t kValueOutOfRangeForBuffer = 0xFFFFFFFF;
+ ASSERT_EQ(aom_uleb_encode(kValueOutOfRangeForBuffer, kWriteBufferSize,
+ &write_buffer[0], &coded_size),
+ -1);
+
+ const uint64_t kValueOutOfRange = kMaximumLeb128Value + 1;
+ ASSERT_EQ(aom_uleb_encode(kValueOutOfRange, kWriteBufferSize,
+ &write_buffer[0], &coded_size),
+ -1);
+
+ const size_t kPadSizeOutOfRange = 5;
+ ASSERT_EQ(aom_uleb_encode_fixed_size(kValidTestValue, kWriteBufferSize,
+ kPadSizeOutOfRange, &write_buffer[0],
+ &coded_size),
+ -1);
+}
diff --git a/third_party/aom/test/aom_mem_test.cc b/third_party/aom/test/aom_mem_test.cc
new file mode 100644
index 0000000000..849ba64435
--- /dev/null
+++ b/third_party/aom/test/aom_mem_test.cc
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include <cstdio>
+#include <cstddef>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(AomMemTest, Overflow) {
+ // Allocations are aligned > 1 so SIZE_MAX should always fail.
+ ASSERT_EQ(aom_malloc(SIZE_MAX), nullptr);
+ ASSERT_EQ(aom_calloc(1, SIZE_MAX), nullptr);
+ ASSERT_EQ(aom_calloc(32, SIZE_MAX / 32), nullptr);
+ ASSERT_EQ(aom_calloc(SIZE_MAX, SIZE_MAX), nullptr);
+ ASSERT_EQ(aom_memalign(1, SIZE_MAX), nullptr);
+ ASSERT_EQ(aom_memalign(64, SIZE_MAX), nullptr);
+ ASSERT_EQ(aom_memalign(64, SIZE_MAX - 64), nullptr);
+ ASSERT_EQ(aom_memalign(64, SIZE_MAX - 64 - sizeof(size_t) + 2), nullptr);
+}
+
+TEST(AomMemTest, NullParams) {
+ ASSERT_EQ(aom_memset16(nullptr, 0, 0), nullptr);
+ aom_free(nullptr);
+}
diff --git a/third_party/aom/test/aomcx_set_ref.sh b/third_party/aom/test/aomcx_set_ref.sh
new file mode 100755
index 0000000000..237e2f319c
--- /dev/null
+++ b/third_party/aom/test/aomcx_set_ref.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom aom_cx_set_ref example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aom_cx_set_ref_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+# Returns 1 when the file is missing. The failure is reported through elog,
+# the same logging helper aom_set_ref uses for its own errors.
+aom_cx_set_ref_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+}
+
+# Runs the aom_cx_set_ref example on $YUV_RAW_INPUT and verifies that it
+# produced an output file.
+# $1 is the codec name. It is forwarded to the example as its first argument
+# and is also used to name the output file. After the output path the example
+# receives ref_frame_num (4) and limit (10).
+# Returns 1 when the example is missing, fails, or writes no output file.
+aom_set_ref() {
+  local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_${codec}.ivf"
+  local ref_frame_num=4
+  local limit=10
+
+  # Bail out early when the example binary was not built.
+  [ -x "${encoder}" ] || {
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  }
+
+  if ! eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" \
+      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" \
+      "${output_file}" "${ref_frame_num}" "${limit}" ${devnull}; then
+    return 1
+  fi
+
+  [ -e "${output_file}" ] || return 1
+}
+
+# Test entry point: runs aom_set_ref with codec "av1" when av1_encode_available
+# reports "yes"; otherwise succeeds without doing anything.
+aom_cx_set_ref_av1() {
+  [ "$(av1_encode_available)" = "yes" ] || return 0
+  aom_set_ref av1 || return 1
+}
+
+# Names of the test functions handed to run_tests.
+aom_cx_set_ref_tests="aom_cx_set_ref_av1"
+
+# run_tests is given the environment check function and the test list.
+run_tests aom_cx_set_ref_verify_environment "${aom_cx_set_ref_tests}"
+
diff --git a/third_party/aom/test/aomdec.sh b/third_party/aom/test/aomdec.sh
new file mode 100755
index 0000000000..e9738a8e89
--- /dev/null
+++ b/third_party/aom/test/aomdec.sh
@@ -0,0 +1,219 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomdec. To add new tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aomdec_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Monochrome input streams under LIBAOM_TEST_DATA_PATH, used by the
+# aomdec_av1_monochrome_yuv_* tests below.
+AV1_MONOCHROME_B10="${LIBAOM_TEST_DATA_PATH}/av1-1-b10-24-monochrome.ivf"
+AV1_MONOCHROME_B8="${LIBAOM_TEST_DATA_PATH}/av1-1-b8-24-monochrome.ivf"
+
+# Environment check: Make sure input is available.
+# Returns 1 when:
+#   - encoding is unavailable and any pre-encoded AV1 input file is missing,
+#   - either monochrome test stream is missing, or
+#   - aom_tool_path cannot locate aomdec.
+aomdec_verify_environment() {
+  if [ "$(av1_encode_available)" != "yes" ] ; then
+    if [ ! -e "${AV1_IVF_FILE}" ] || \
+       [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+       [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+       [ ! -e "${AV1_WEBM_FILE}" ]; then
+      elog "Libaom test data must exist before running this test script when " \
+           " encoding is disabled. "
+      return 1
+    fi
+  fi
+  if [ ! -e "${AV1_MONOCHROME_B10}" ] || [ ! -e "${AV1_MONOCHROME_B8}" ]; then
+    elog "Libaom test data must exist before running this test script."
+    # Fail the environment check like every other missing-input branch instead
+    # of logging and letting the monochrome tests fail later.
+    return 1
+  fi
+  if [ -z "$(aom_tool_path aomdec)" ]; then
+    elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Wrapper function for running aomdec with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomdec.
+# Returns 1 when the input file does not exist; otherwise returns the status
+# of the cat | aomdec pipeline.
+aomdec_pipe() {
+  local input="$1"
+  shift
+  if [ ! -e "${input}" ]; then
+    elog "Input file ($input) missing in aomdec_pipe()"
+    return 1
+  fi
+  # Stream the path that was just validated. Reading "${file}" here only
+  # worked when a caller happened to have a variable of that name holding the
+  # same path.
+  cat "${input}" | aomdec - "$@" ${devnull}
+}
+
+
+# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomdec.
+# The command line is assembled with eval, prefixed by ${AOM_TEST_PREFIX} and
+# followed by ${devnull}; the function's status is that of the eval.
+aomdec() {
+ local decoder="$(aom_tool_path aomdec)"
+ local input="$1"
+ shift
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
+# Echoes "yes" when av1_decode_available reports "yes"; echoes nothing
+# otherwise.
+aomdec_can_decode_av1() {
+  [ "$(av1_decode_available)" != "yes" ] || echo yes
+}
+
+# Decodes ${AV1_IVF_FILE}; when the file does not exist yet it is first
+# produced with encode_yuv_raw_input_av1 --ivf. Does nothing when AV1 decoding
+# is unavailable.
+aomdec_av1_ivf() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_IVF_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" --ivf || return 1
+  aomdec "${AV1_IVF_FILE}" --summary --noblit
+}
+
+# Decodes av1.error-resilient.ivf (a path relative to the current directory);
+# when missing it is first produced with encode_yuv_raw_input_av1 using
+# --ivf --error-resilient=1. Does nothing when AV1 decoding is unavailable.
+aomdec_av1_ivf_error_resilient() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="av1.error-resilient.ivf"
+  [ -e "${file}" ] || \
+    encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 || return 1
+  aomdec "${file}" --summary --noblit
+}
+
+# Decodes ${AV1_IVF_FILE} once for every thread count from 2 through 8.
+# $1 is forwarded as the value of --row-mt. Returns 1 on the first failing
+# decode (or when the input cannot be produced); does nothing when AV1
+# decoding is unavailable.
+ivf_multithread() {
+  local row_mt="$1"
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_IVF_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" --ivf || return 1
+
+  for threads in 2 3 4 5 6 7 8; do
+    if ! aomdec "${file}" --summary --noblit \
+        --threads=$threads --row-mt=$row_mt; then
+      return 1
+    fi
+  done
+}
+
+# Test: runs the ivf_multithread decode loop with --row-mt=0.
+aomdec_av1_ivf_multithread() {
+ ivf_multithread 0 # --row-mt=0
+}
+
+# Test: runs the ivf_multithread decode loop with --row-mt=1.
+aomdec_av1_ivf_multithread_row_mt() {
+ ivf_multithread 1 # --row-mt=1
+}
+
+# Decodes ${AV1_IVF_FILE} through aomdec_pipe (input supplied on stdin); when
+# missing, the file is first produced with encode_yuv_raw_input_av1 --ivf.
+# Does nothing when AV1 decoding is unavailable.
+aomdec_aom_ivf_pipe_input() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_IVF_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" --ivf || return 1
+  aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit
+}
+
+# Decodes ${AV1_OBU_ANNEXB_FILE} with --annexb; when missing, the file is first
+# produced with encode_yuv_raw_input_av1 --obu --annexb=1. Does nothing when
+# AV1 decoding is unavailable.
+aomdec_av1_obu_annexb() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_OBU_ANNEXB_FILE}"
+  [ -e "${file}" ] || \
+    encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 || return 1
+  aomdec "${file}" --summary --noblit --annexb
+}
+
+# Same as aomdec_av1_obu_annexb, but the stream is fed to the decoder through
+# aomdec_pipe instead of being opened by path.
+aomdec_av1_obu_annexb_pipe_input() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_OBU_ANNEXB_FILE}"
+  [ -e "${file}" ] || \
+    encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 || return 1
+  aomdec_pipe "${file}" --summary --noblit --annexb
+}
+
+# Decodes ${AV1_OBU_SEC5_FILE}; when missing, the file is first produced with
+# encode_yuv_raw_input_av1 --obu. Does nothing when AV1 decoding is
+# unavailable.
+aomdec_av1_obu_section5() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_OBU_SEC5_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" --obu || return 1
+  aomdec "${file}" --summary --noblit
+}
+
+# Same as aomdec_av1_obu_section5, but the stream is fed to the decoder through
+# aomdec_pipe instead of being opened by path.
+aomdec_av1_obu_section5_pipe_input() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+
+  local file="${AV1_OBU_SEC5_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" --obu || return 1
+  aomdec_pipe "${file}" --summary --noblit
+}
+
+# Decodes ${AV1_WEBM_FILE}; when missing, the file is first produced with
+# encode_yuv_raw_input_av1 (no container flag). Runs only when both AV1
+# decoding and WebM I/O are available.
+aomdec_av1_webm() {
+  [ "$(aomdec_can_decode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  local file="${AV1_WEBM_FILE}"
+  [ -e "${file}" ] || encode_yuv_raw_input_av1 "${file}" || return 1
+  aomdec "${AV1_WEBM_FILE}" --summary --noblit
+}
+
+# Decodes the stream given as $1 with --md5 --i420, captures the decoder
+# output in ${AOM_TEST_OUTPUT_DIR}/<basename of $1>.md5 and compares it with
+# the reference file "$1.md5". Returns 1 when the decode command fails;
+# otherwise the status is that of diff. Does nothing when AV1 decoding is
+# unavailable.
+aomdec_av1_monochrome_yuv() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local input="$1"
+ local basename="$(basename "${input}")"
+ # Output name pattern handed to aomdec through -o.
+ local output="${basename}-%wx%h-%4.i420"
+ local md5file="${AOM_TEST_OUTPUT_DIR}/${basename}.md5"
+ local decoder="$(aom_tool_path aomdec)"
+ # Note aomdec() is not used to avoid ${devnull} which may also redirect
+ # stdout.
+ # The ">" is passed as a quoted word so that it only becomes a redirection
+ # into ${md5file} when eval re-parses the command line.
+ eval "${AOM_TEST_PREFIX}" "${decoder}" --md5 --i420 \
+ -o "${output}" "${input}" ">" "${md5file}" 2>&1 || return 1
+ diff "${1}.md5" "${md5file}"
+ fi
+}
+
+# Test: monochrome decode/md5 comparison using ${AV1_MONOCHROME_B8}.
+aomdec_av1_monochrome_yuv_8bit() {
+ aomdec_av1_monochrome_yuv "${AV1_MONOCHROME_B8}"
+}
+
+# Test: monochrome decode/md5 comparison using ${AV1_MONOCHROME_B10}.
+aomdec_av1_monochrome_yuv_10bit() {
+ aomdec_av1_monochrome_yuv "${AV1_MONOCHROME_B10}"
+}
+
+# Tests that are always part of the run.
+aomdec_tests="aomdec_av1_ivf
+ aomdec_av1_ivf_multithread
+ aomdec_av1_ivf_multithread_row_mt
+ aomdec_aom_ivf_pipe_input
+ aomdec_av1_monochrome_yuv_8bit"
+
+# Tests added only when realtime_only_build does not report "yes".
+if [ ! "$(realtime_only_build)" = "yes" ]; then
+ aomdec_tests="${aomdec_tests}
+ aomdec_av1_ivf_error_resilient
+ aomdec_av1_obu_annexb
+ aomdec_av1_obu_section5
+ aomdec_av1_obu_annexb_pipe_input
+ aomdec_av1_obu_section5_pipe_input
+ aomdec_av1_webm"
+fi
+
+# The 10-bit monochrome test is added only when highbitdepth_available reports
+# "yes".
+if [ "$(highbitdepth_available)" = "yes" ]; then
+ aomdec_tests="${aomdec_tests}
+ aomdec_av1_monochrome_yuv_10bit"
+fi
+
+run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/third_party/aom/test/aomenc.sh b/third_party/aom/test/aomenc.sh
new file mode 100755
index 0000000000..0bb9fba3b8
--- /dev/null
+++ b/third_party/aom/test/aomenc.sh
@@ -0,0 +1,306 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomenc using hantro_collage_w352h288.yuv as input. To add
+## new tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aomenc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: the raw YUV clip must exist, the non-square-PAR Y4M clip
+# must exist when AV1 encoding is available, and aom_tool_path must be able to
+# locate aomenc. Returns 1 (after logging through elog) on the first failure.
+aomenc_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+
+  if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+     [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then
+    elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in"
+    elog "LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+
+  [ -n "$(aom_tool_path aomenc)" ] && return 0
+  elog "aomenc not found. It must exist in LIBAOM_BIN_PATH or its parent."
+  return 1
+}
+
+# Echoes "yes" when av1_encode_available reports "yes"; echoes nothing
+# otherwise.
+aomenc_can_encode_av1() {
+  [ "$(av1_encode_available)" != "yes" ] || echo yes
+}
+
+# Utilities that echo aomenc input file parameters.
+# Echoes the path stored in Y4M_NOSQ_PAR_INPUT. The expansion is properly
+# quoted so the path is printed verbatim, with no word splitting or globbing
+# inside this function.
+y4m_input_non_square_par() {
+  echo "${Y4M_NOSQ_PAR_INPUT}"
+}
+
+# Echoes the path stored in Y4M_720P_INPUT. The expansion is properly quoted
+# so the path is printed verbatim, with no word splitting or globbing inside
+# this function.
+y4m_input_720p() {
+  echo "${Y4M_720P_INPUT}"
+}
+
+# Wrapper function for running aomenc with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomenc. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomenc.
+# The input file is written to the encoder's stdin with cat, the encoder is
+# given "-" as its input argument, and --test-decode=fatal is always added.
+aomenc_pipe() {
+ local encoder="$(aom_tool_path aomenc)"
+ local input="$1"
+ shift
+ cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \
+ --test-decode=fatal \
+ "$@" ${devnull}
+}
+
+# Wrapper function for running aomenc. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomenc. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomenc.
+# --test-decode=fatal is always added ahead of the caller's parameters.
+aomenc() {
+ local encoder="$(aom_tool_path aomenc)"
+ local input="$1"
+ shift
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \
+ --test-decode=fatal \
+ "$@" ${devnull}
+}
+
+# Encodes the raw YUV input to IVF with the fast test parameters and checks
+# that an output file was written. Does nothing when AV1 encoding is
+# unavailable.
+aomenc_av1_ivf() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_IVF_FILE} already exists.
+  local output="${AV1_IVF_FILE}"
+  [ ! -e "${AV1_IVF_FILE}" ] || output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --ivf --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input to IVF with the rt test parameters
+# (aomenc_encode_test_rt_params) and checks that an output file was written.
+# Does nothing when AV1 encoding is unavailable.
+aomenc_av1_ivf_rt() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_IVF_FILE} already exists.
+  local output="${AV1_IVF_FILE}"
+  [ ! -e "${AV1_IVF_FILE}" ] || output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_rt_params) \
+    --ivf --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input to IVF with --use-16bit-internal and checks that
+# an output file was written. Does nothing when AV1 encoding is unavailable.
+aomenc_av1_ivf_use_16bit_internal() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_IVF_FILE} already exists.
+  local output="${AV1_IVF_FILE}"
+  [ ! -e "${AV1_IVF_FILE}" ] || \
+    output="${AOM_TEST_OUTPUT_DIR}/av1_test_16bit.ivf"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --ivf --use-16bit-internal --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input with --obu --annexb=1 and checks that an output
+# file was written. Does nothing when AV1 encoding is unavailable.
+aomenc_av1_obu_annexb() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_OBU_ANNEXB_FILE} already exists.
+  local output="${AV1_OBU_ANNEXB_FILE}"
+  [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+    output="${AOM_TEST_OUTPUT_DIR}/av1_test.annexb.obu"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --obu --annexb=1 --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input with --obu and checks that an output file was
+# written. Does nothing when AV1 encoding is unavailable.
+aomenc_av1_obu_section5() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_OBU_SEC5_FILE} already exists.
+  local output="${AV1_OBU_SEC5_FILE}"
+  [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+    output="${AOM_TEST_OUTPUT_DIR}/av1_test.section5.obu"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --obu --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input without a container flag (output named *.webm)
+# and checks that an output file was written. Runs only when both AV1
+# encoding and WebM I/O are available.
+aomenc_av1_webm() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  # Use a scratch file when ${AV1_WEBM_FILE} already exists.
+  local output="${AV1_WEBM_FILE}"
+  [ ! -e "${AV1_WEBM_FILE}" ] || output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input with --passes=1 into
+# ${AOM_TEST_OUTPUT_DIR}/av1_test.webm and checks that the file was written.
+# Runs only when both AV1 encoding and WebM I/O are available.
+aomenc_av1_webm_1pass() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --passes=1 --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input to IVF with --lossless=1 and checks that an output
+# file was written. Does nothing when AV1 encoding is unavailable.
+aomenc_av1_ivf_lossless() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf"
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --ivf --output="${output}" --lossless=1 || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input to IVF with --min-q=0 --max-q=0 and checks that an
+# output file was written. Does nothing when AV1 encoding is unavailable.
+aomenc_av1_ivf_minq0_maxq0() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+
+  local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf"
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --ivf --output="${output}" --min-q=0 --max-q=0 || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input with --limit=10 and --lag-in-frames=5 and checks
+# that an output file was written. Runs only when both AV1 encoding and WebM
+# I/O are available.
+aomenc_av1_webm_lag5_frames10() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  local lag_total_frames=10
+  local lag_frames=5
+  local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm"
+  aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+    --limit=${lag_total_frames} --lag-in-frames=${lag_frames} \
+    --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the non-square-PAR Y4M input and checks that an output file was
+# written. Runs only when both AV1 encoding and WebM I/O are available.
+# TODO(fgalligan): Test that DisplayWidth is different than video width.
+aomenc_av1_webm_non_square_par() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
+  aomenc $(y4m_input_non_square_par) $(aomenc_encode_test_fast_params) \
+    --output="${output}" || return 1
+
+  [ -e "${output}" ] && return 0
+  elog "Output file does not exist."
+  return 1
+}
+
+# Encodes the raw YUV input once for each --cdf-update-mode value 0, 1 and 2,
+# checking after every encode that its output file was written. Runs only
+# when both AV1 encoding and WebM I/O are available.
+aomenc_av1_webm_cdf_update_mode() {
+  [ "$(aomenc_can_encode_av1)" = "yes" ] || return 0
+  [ "$(webm_io_available)" = "yes" ] || return 0
+
+  for mode in 0 1 2; do
+    local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm"
+    if ! aomenc $(yuv_raw_input) $(aomenc_encode_test_fast_params) \
+        --cdf-update-mode=${mode} --output="${output}"; then
+      return 1
+    fi
+
+    [ -e "${output}" ] || {
+      elog "Output file does not exist."
+      return 1
+    }
+  done
+}
+
+# A realtime-only build runs just the rt IVF test; any other build runs the
+# full list.
+if [ "$(realtime_only_build)" = "yes" ]; then
+ aomenc_tests="aomenc_av1_ivf_rt"
+else
+ aomenc_tests="aomenc_av1_ivf
+ aomenc_av1_ivf_rt
+ aomenc_av1_obu_annexb
+ aomenc_av1_obu_section5
+ aomenc_av1_webm
+ aomenc_av1_webm_1pass
+ aomenc_av1_ivf_lossless
+ aomenc_av1_ivf_minq0_maxq0
+ aomenc_av1_ivf_use_16bit_internal
+ aomenc_av1_webm_lag5_frames10
+ aomenc_av1_webm_non_square_par
+ aomenc_av1_webm_cdf_update_mode"
+fi
+
+run_tests aomenc_verify_environment "${aomenc_tests}"
diff --git a/third_party/aom/test/aq_segment_test.cc b/third_party/aom/test/aq_segment_test.cc
new file mode 100644
index 0000000000..674a883ea2
--- /dev/null
+++ b/third_party/aom/test/aq_segment_test.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// Test modes fed to the AqSegmentTest instantiation: only kRealTime when
+// CONFIG_REALTIME_ONLY is set, otherwise kRealTime and kOnePassGood.
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+ { ::libaom_test::kRealTime };
+#else
+ { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
+// Encoder test fixture with three value parameters besides GET_PARAM(0):
+// GET_PARAM(1) is the test mode, GET_PARAM(2) the cpu-used setting, and
+// GET_PARAM(3) is consumed by the TEST_P bodies (passed to DoTest()).
+class AqSegmentTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
+  ~AqSegmentTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = 0;
+    // PreEncodeFrameHook() reads deltaq_mode_, so give it a defined value here
+    // too instead of relying on every test body to assign it first.
+    deltaq_mode_ = 0;
+  }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
+      encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+      if (mode_ == ::libaom_test::kRealTime) {
+        // In real-time mode these three tools are explicitly switched off.
+        encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      }
+    }
+  }
+
+  // Runs a CBR encode of hantro_collage_w352h288.yuv with the given aq_mode
+  // and delta-q disabled; any fatal failure inside RunLoop() fails the test.
+  void DoTest(int aq_mode) {
+    aq_mode_ = aq_mode;
+    deltaq_mode_ = 0;
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int set_cpu_used_;  // Value sent with AOME_SET_CPUUSED.
+  int aq_mode_;       // Value sent with AV1E_SET_AQ_MODE.
+  int deltaq_mode_;   // Value sent with AV1E_SET_DELTAQ_MODE.
+};
+
+// Validate that this AQ segmentation mode (1-variance_aq, 2-complexity_aq,
+// 3-cyclic_refresh_aq) encodes and decodes without a mismatch.
+// GET_PARAM(3) supplies the aq_mode handed to DoTest().
+TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
+
+#if !CONFIG_REALTIME_ONLY
+// Validate that this delta q mode
+// encodes and decodes without a mismatch.
+// Uses AOM_CQ rate control with aq_mode_ 0 and deltaq_mode_ 2; GET_PARAM(3)
+// is not read by this test.
+TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
+ cfg_.rc_end_usage = AOM_CQ;
+ aq_mode_ = 0;
+ deltaq_mode_ = 2;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 15);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif
+
+// Parameters: every mode in kTestModeParams x cpu-used 5..8 x values 0..3 for
+// GET_PARAM(3) (::testing::Range excludes its end value).
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, ::testing::ValuesIn(kTestModeParams),
+ ::testing::Range(5, 9), ::testing::Range(0, 4));
+
+#if !CONFIG_REALTIME_ONLY
+// Same fixture and test body as AqSegmentTest, instantiated only for
+// kOnePassGood with cpu-used 3..4 and values 0..3 for GET_PARAM(3).
+class AqSegmentTestLarge : public AqSegmentTest {};
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
+
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood),
+ ::testing::Range(3, 5), ::testing::Range(0, 4));
+#endif
+} // namespace
diff --git a/third_party/aom/test/arf_freq_test.cc b/third_party/aom/test/arf_freq_test.cc
new file mode 100644
index 0000000000..f51444da4d
--- /dev/null
+++ b/third_party/aom/test/arf_freq_test.cc
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "av1/encoder/ratectrl.h"
+
+namespace {
+
+// Number of frames passed to the video source, and the target bitrate
+// assigned to cfg_.rc_target_bitrate, in MinArfFreqTest.
+const unsigned int kFrames = 100;
+const int kBitrate = 500;
+
+// Sentinel values stored in ArfFreqTestLarge::min_run_: ARF_NOT_SEEN is the
+// initial state of a pass, ARF_SEEN_ONCE is set after the first two-frame
+// packet is observed.
+#define ARF_NOT_SEEN 1000001
+#define ARF_SEEN_ONCE 1000000
+
+// Describes one input clip. MinArfFreqTest copies profile, input_bit_depth
+// and bit_depth into cfg_, and uses the remaining fields to construct the
+// video source.
+typedef struct {
+ const char *filename;
+ unsigned int width;
+ unsigned int height;
+ unsigned int framerate_num;
+ unsigned int framerate_den;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+// One encoder configuration: the test mode given to InitializeConfig() and
+// the value sent with AOME_SET_CPUUSED.
+typedef struct {
+ libaom_test::TestMode mode;
+ int cpu_used;
+} TestEncodeParam;
+
+// Input clips, in TestVideoParam field order: filename, width, height,
+// framerate_num, framerate_den, input_bit_depth, fmt, bit_depth, profile.
+const TestVideoParam kTestVectors[] = {
+ // artificially increase framerate to trigger default check
+ { "hantro_collage_w352h288.yuv", 352, 288, 5000, 1, 8, AOM_IMG_FMT_I420,
+ AOM_BITS_8, 0 },
+ { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, AOM_IMG_FMT_I420,
+ AOM_BITS_8, 0 },
+ { "rush_hour_444.y4m", 352, 288, 30, 1, 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+ // Add list of profile 2/3 test videos here ...
+};
+
+// { mode, cpu_used } pairs; a CONFIG_REALTIME_ONLY build keeps only the
+// kRealTime entry.
+const TestEncodeParam kEncodeVectors[] = {
+#if CONFIG_REALTIME_ONLY
+ { ::libaom_test::kRealTime, 5 },
+#else
+ { ::libaom_test::kRealTime, 5 }, { ::libaom_test::kOnePassGood, 2 },
+ { ::libaom_test::kOnePassGood, 5 }, { ::libaom_test::kTwoPassGood, 1 },
+ { ::libaom_test::kTwoPassGood, 2 }, { ::libaom_test::kTwoPassGood, 5 },
+#endif
+};
+
+// Values sent with AV1E_SET_MIN_GF_INTERVAL by ArfFreqTestLarge.
+const int kMinArfVectors[] = {
+ // NOTE: 0 refers to the default built-in logic in:
+ // av1_rc_get_default_min_gf_interval(...)
+ 0, 4, 8, 12, 15
+};
+
+// Encoder test fixture that tracks the shortest run of single-frame packets
+// seen between two-frame packets, so the test body can compare it with the
+// requested minimum golden-frame interval.
+// GET_PARAM(1) is the clip description, GET_PARAM(2) the encoder
+// configuration and GET_PARAM(3) the requested minimum interval.
+class ArfFreqTestLarge
+ : public ::libaom_test::CodecTestWith3Params<TestVideoParam,
+ TestEncodeParam, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ArfFreqTestLarge()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
+
+ ~ArfFreqTestLarge() override = default;
+
+ // Non-real-time modes get a 25-frame lag; real-time mode gets explicit rate
+ // control buffer sizes instead.
+ void SetUp() override {
+ InitializeConfig(test_encode_param_.mode);
+ if (test_encode_param_.mode != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 25;
+ } else {
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ // Resets the run tracking at the start of every pass.
+ void BeginPassHook(unsigned int) override {
+ min_run_ = ARF_NOT_SEEN;
+ run_of_visible_frames_ = 0;
+ }
+
+ // Returns the frame count encoded in the packet's trailing marker byte, or 1
+ // when the packet does not end in a matching index.
+ int GetNumFramesInPkt(const aom_codec_cx_pkt_t *pkt) {
+ const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
+ // The marker is the last byte of the packet: bits 3-4 plus one give mag,
+ // bits 0-2 plus one give the frame count.
+ const uint8_t marker = buffer[pkt->data.frame.sz - 1];
+ const int mag = ((marker >> 3) & 3) + 1;
+ int frames = (marker & 0x7) + 1;
+ const unsigned int index_sz = 2 + mag * frames;
+ // Check for superframe or not.
+ // Assume superframe has only one visible frame, the rest being
+ // invisible. If superframe index is not found, then there is only
+ // one frame.
+ // An index is accepted only when the marker's top three bits are 110, the
+ // packet holds at least index_sz bytes, and the byte index_sz from the end
+ // repeats the marker.
+ if (!((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
+ buffer[pkt->data.frame.sz - index_sz] == marker)) {
+ frames = 1;
+ }
+ return frames;
+ }
+
+ // Updates the run statistics for every frame packet:
+ //   1 frame   -> extend the current run;
+ //   2 frames  -> first occurrence moves min_run_ to ARF_SEEN_ONCE, later
+ //                ones record the run length if it is the first measured run
+ //                or a new minimum; the run restarts at 1;
+ //   otherwise -> min_run_ is forced to 0 and the run restarts at 1.
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
+ const int frames = GetNumFramesInPkt(pkt);
+ if (frames == 1) {
+ run_of_visible_frames_++;
+ } else if (frames == 2) {
+ if (min_run_ == ARF_NOT_SEEN) {
+ min_run_ = ARF_SEEN_ONCE;
+ } else if (min_run_ == ARF_SEEN_ONCE ||
+ run_of_visible_frames_ < min_run_) {
+ min_run_ = run_of_visible_frames_;
+ }
+ run_of_visible_frames_ = 1;
+ } else {
+ min_run_ = 0;
+ run_of_visible_frames_ = 1;
+ }
+ }
+
+ // Applies the encoder controls once, before the first frame is encoded.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+ encoder->Control(AOME_SET_CPUUSED, test_encode_param_.cpu_used);
+ encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_arf_requested_);
+ if (test_encode_param_.mode != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ // Shortest run recorded so far, or one of the ARF_* sentinels.
+ int GetMinVisibleRun() const { return min_run_; }
+
+ // Returns the requested minimum interval; a request of 0 is replaced by the
+ // value av1_rc_get_default_min_gf_interval() yields for this clip's
+ // dimensions and frame rate.
+ int GetMinArfDistanceRequested() const {
+ if (min_arf_requested_)
+ return min_arf_requested_;
+ else
+ return av1_rc_get_default_min_gf_interval(
+ test_video_param_.width, test_video_param_.height,
+ (double)test_video_param_.framerate_num /
+ test_video_param_.framerate_den);
+ }
+
+ TestVideoParam test_video_param_;
+ TestEncodeParam test_encode_param_;
+
+ private:
+ int min_arf_requested_;
+ int min_run_;
+ int run_of_visible_frames_;
+};
+
+// Encodes kFrames frames of the parameterized clip and, when at least one run
+// between two-frame packets was measured, expects that run plus one to be no
+// smaller than the requested minimum interval.
+TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Pick the video source type from the file extension.
+ std::unique_ptr<libaom_test::VideoSource> video;
+ if (is_extension_y4m(test_video_param_.filename)) {
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ } else {
+ video.reset(new libaom_test::YUVVideoSource(
+ test_video_param_.filename, test_video_param_.fmt,
+ test_video_param_.width, test_video_param_.height,
+ test_video_param_.framerate_num, test_video_param_.framerate_den, 0,
+ kFrames));
+ }
+ ASSERT_NE(video, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const int min_run = GetMinVisibleRun();
+ const int min_arf_dist_requested = GetMinArfDistanceRequested();
+ // The sentinels mean no run was measured, so there is nothing to check.
+ if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+ const int min_arf_dist = min_run + 1;
+ EXPECT_GE(min_arf_dist, min_arf_dist_requested);
+ }
+}
+
+#if CONFIG_AV1_ENCODER
+// TODO(angiebird): 25-29 fail in high bitdepth mode.
+// TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as
+// BWDREF_FRAME is also a non-show frame, and the minimum run between two
+// consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive
+// number as long as it does not exceed the gf_group interval.
+// The suite is instantiated under the DISABLED_AV1 prefix, combining the AV1
+// codec factory with every clip, encoder configuration and minimum interval.
+INSTANTIATE_TEST_SUITE_P(
+ DISABLED_AV1, ArfFreqTestLarge,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::ValuesIn(kTestVectors), ::testing::ValuesIn(kEncodeVectors),
+ ::testing::ValuesIn(kMinArfVectors)));
+#endif // CONFIG_AV1_ENCODER
+} // namespace
diff --git a/third_party/aom/test/av1_c_vs_simd_encode.sh b/third_party/aom/test/av1_c_vs_simd_encode.sh
new file mode 100755
index 0000000000..296204d118
--- /dev/null
+++ b/third_party/aom/test/av1_c_vs_simd_encode.sh
@@ -0,0 +1,566 @@
+#!/bin/sh
+## Copyright (c) 2023, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This script checks the bit exactness between C and SIMD
+## implementations of AV1 encoder.
+##
+. $(dirname $0)/tools_common.sh
+
+PRESETS="good rt"
+LOWBD_CIF_CLIP="yuv_raw_input"
+LOWBD_480p_CLIP="yuv_480p_raw_input"
+LOWBD_720p_CLIP="y4m_720p_input"
+HIGHBD_CLIP="y4m_360p_10bit_input"
+SC_CLIP="y4m_screen_input"
+OUT_FILE_SUFFIX=".ivf"
+SCRIPT_DIR=$(dirname "$0")
+LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd)
+
+# Clips used in test.
+YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_480P_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_640_480_30.yuv"
+Y4M_360P_10BIT_INPUT="${LIBAOM_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m"
+Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+Y4M_SCREEN_INPUT="${LIBAOM_TEST_DATA_PATH}/wikipedia_420_360p_60f.y4m"
+
+# Number of frames to test.
+AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=35
+
+# Create a temporary directory for output files.
+if [ -n "${TMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+ AOM_TEST_TEMP_ROOT=/tmp
+fi
+
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_TEMP_ROOT}/av1_test_$$"
+
+if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
+ [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ echo "${0##*/}: Cannot create output directory, giving up."
+ echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}"
+ exit 1
+fi
+
+# Prints all of its arguments on stderr, leaving stdout free for the values
+# that the functions below return through echo.
+elog() {
+ echo "$@" 1>&2
+}
+
+# Echoes path to $1 when it's executable and exists in ${AOM_TEST_OUTPUT_DIR},
+# or an empty string. Caller is responsible for testing the string once the
+# function returns.
+av1_enc_tool_path() {
+ local target="$1"
+ local preset="$2"
+ local tool_path="${AOM_TEST_OUTPUT_DIR}/build_target_${target}/aomenc_${preset}"
+
+ if [ ! -x "${tool_path}" ]; then
+ tool_path=""
+ fi
+ echo "${tool_path}"
+}
+
+# Environment check: Make sure input and source directories are available.
+av1_c_vs_simd_enc_verify_environment () {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_720P_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -e "${Y4M_SCREEN_INPUT}" ]; then
+ elog "libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ ! -d "$LIBAOM_SOURCE_DIR" ]; then
+ elog "LIBAOM_SOURCE_DIR does not exist."
+ return 1
+ fi
+}
+
+# This is not needed since tools_common.sh does the same cleanup.
+# Keep the code here for our reference.
+# cleanup() {
+# rm -rf ${AOM_TEST_OUTPUT_DIR}
+# }
+
+# Echo AOM_SIMD_CAPS_MASK for different instruction set architecture.
+# Each function is named after an instruction set and prints the mask that
+# enables that instruction set together with all lower ones. av1_test_x86
+# calls them indirectly by name through $($isa), so the function names must
+# stay in sync with the entries of its x86_isa_variants list.
+avx512f() {
+ echo "0x1FF"
+}
+
+avx2() {
+ echo "0x0FF"
+}
+
+avx() {
+ echo "0x07F"
+}
+
+sse4_1() {
+ echo "0x03F"
+}
+
+ssse3() {
+ echo "0x01F"
+}
+
+sse3() {
+ echo "0x00F"
+}
+
+sse2() {
+ echo "0x007"
+}
+
+get_bitrates() {
+ local content=$1
+ local preset=$2
+
+ # Bit-rates:
+ local bitrate_lowres_good="300"
+ local bitrate_480p_good="500"
+ local bitrate_720p_good="1000"
+ local bitrate_scc_360p_good="500"
+ local bitrate_lowres_rt="200"
+ local bitrate_480p_rt="300"
+ local bitrate_720p_rt="600"
+ local bitrate_scc_360p_rt="300"
+ local bitrate_hbd_360p="500"
+
+ if [ "${preset}" = "good" ]; then
+ if [ "${content}" = "yuv_raw_input" ]; then
+ echo "${bitrate_lowres_good}"
+ elif [ "${content}" = "yuv_480p_raw_input" ]; then
+ echo "${bitrate_480p_good}"
+ elif [ "${content}" = "y4m_720p_input" ]; then
+ echo "${bitrate_720p_good}"
+ elif [ "${content}" = "y4m_screen_input" ]; then
+ echo "${bitrate_scc_360p_good}"
+ elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+ echo "${bitrate_hbd_360p}"
+ else
+ elog "Invalid content"
+ fi
+ elif [ "${preset}" = "rt" ]; then
+ if [ "${content}" = "yuv_raw_input" ]; then
+ echo "${bitrate_lowres_rt}"
+ elif [ "${content}" = "yuv_480p_raw_input" ]; then
+ echo "${bitrate_480p_rt}"
+ elif [ "${content}" = "y4m_720p_input" ]; then
+ echo "${bitrate_720p_rt}"
+ elif [ "${content}" = "y4m_screen_input" ]; then
+ echo "${bitrate_scc_360p_rt}"
+ elif [ "${content}" = "y4m_360p_10bit_input" ]; then
+ echo "${bitrate_hbd_360p}"
+ else
+ elog "Invalid content"
+ fi
+ else
+ elog "invalid preset"
+ fi
+}
+
+# Echo clip details to be used as input to aomenc.
+# Each function is named exactly like one of the *_CLIP values defined at the
+# top of this script; av1_enc_test calls it by name through $($clip) and lets
+# the shell split the output into the input path followed by its options.
+# Raw .yuv inputs additionally carry --width/--height.
+yuv_raw_input() {
+ echo ""${YUV_RAW_INPUT}"
+ --width=352
+ --height=288
+ --bit-depth=8"
+}
+
+# 10-bit clip, used as the high bit-depth input.
+y4m_360p_10bit_input() {
+ echo ""${Y4M_360P_10BIT_INPUT}"
+ --bit-depth=10"
+}
+
+yuv_480p_raw_input() {
+ echo ""${YUV_480P_RAW_INPUT}"
+ --width=640
+ --height=480
+ --bit-depth=8"
+}
+
+y4m_720p_input() {
+ echo ""${Y4M_720P_INPUT}"
+ --bit-depth=8"
+}
+
+# Screen-content clip: also turns on screen tuning and the palette tool.
+y4m_screen_input() {
+ echo ""${Y4M_SCREEN_INPUT}"
+ --tune-content=screen
+ --enable-palette=1
+ --bit-depth=8"
+}
+
+# Returns 0 when the string $1 occurs anywhere in /proc/cpuinfo and 1
+# otherwise. The match is a plain grep pattern match, not a whole-word match.
+# Note that instruction_set is not declared local, so it is visible to the
+# caller after the function returns.
+has_x86_isa_extn() {
+ instruction_set=$1
+ if ! grep -q "$instruction_set" /proc/cpuinfo; then
+ # This instruction set is not supported.
+ return 1
+ fi
+}
+
+# Echo good encode params for use with AV1 encoder.
+# The options are printed as one whitespace-separated string; av1_enc_test
+# expands it unquoted through $($test_params), so every option becomes its own
+# aomenc argument. This preset runs two passes with 35 frames of lag.
+av1_encode_good_params() {
+ echo "--good \
+ --ivf \
+ --profile=0 \
+ --static-thresh=0 \
+ --threads=1 \
+ --tile-columns=0 \
+ --tile-rows=0 \
+ --verbose \
+ --end-usage=vbr \
+ --kf-max-dist=160 \
+ --kf-min-dist=0 \
+ --max-q=63 \
+ --min-q=0 \
+ --overshoot-pct=100 \
+ --undershoot-pct=100 \
+ --passes=2 \
+ --arnr-maxframes=7 \
+ --arnr-strength=5 \
+ --auto-alt-ref=1 \
+ --drop-frame=0 \
+ --frame-parallel=0 \
+ --lag-in-frames=35 \
+ --maxsection-pct=2000 \
+ --minsection-pct=0 \
+ --sharpness=0"
+}
+
+# Echo realtime encode params for use with AV1 encoder.
+# The options are printed as one whitespace-separated string; av1_enc_test
+# expands it unquoted through $($test_params), so every option becomes its own
+# aomenc argument. This preset is single pass, CBR, with no frame lag.
+av1_encode_rt_params() {
+ echo "--rt \
+ --ivf \
+ --profile=0 \
+ --static-thresh=0 \
+ --threads=1 \
+ --tile-columns=0 \
+ --tile-rows=0 \
+ --verbose \
+ --end-usage=cbr \
+ --kf-max-dist=90000 \
+ --max-q=58 \
+ --min-q=2 \
+ --overshoot-pct=50 \
+ --undershoot-pct=50 \
+ --passes=1 \
+ --aq-mode=3 \
+ --buf-initial-sz=500 \
+ --buf-optimal-sz=600 \
+ --buf-sz=1000 \
+ --coeff-cost-upd-freq=3 \
+ --dv-cost-upd-freq=3 \
+ --mode-cost-upd-freq=3 \
+ --mv-cost-upd-freq=3 \
+ --deltaq-mode=0 \
+ --enable-global-motion=0 \
+ --enable-obmc=0 \
+ --enable-order-hint=0 \
+ --enable-ref-frame-mvs=0 \
+ --enable-tpl-model=0 \
+ --enable-warped-motion=0 \
+ --lag-in-frames=0 \
+ --max-intra-rate=300 \
+ --noise-sensitivity=0"
+}
+
+# Configures for the given target in AOM_TEST_OUTPUT_DIR/build_target_${target}
+# directory.
+av1_enc_build() {
+ local target="$1"
+ local cmake_command="$2"
+ local tmp_build_dir=${AOM_TEST_OUTPUT_DIR}/build_target_${target}
+ if [ -d "$tmp_build_dir" ]; then
+ rm -rf $tmp_build_dir
+ fi
+
+ mkdir -p $tmp_build_dir
+ cd $tmp_build_dir
+
+ local cmake_common_args="-DCONFIG_EXCLUDE_SIMD_MISMATCH=1 \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DENABLE_CCACHE=1 \
+ '-DCMAKE_C_FLAGS_RELEASE=-O3 -g' \
+ '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g' \
+ -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DENABLE_TOOLS=0"
+
+ for preset in $PRESETS; do
+ echo "Building target[${preset} encoding]: ${target}"
+ if [ "${preset}" = "good" ]; then
+ local cmake_extra_args="-DCONFIG_AV1_HIGHBITDEPTH=1"
+ elif [ "${preset}" = "rt" ]; then
+ local cmake_extra_args="-DCONFIG_REALTIME_ONLY=1 -DCONFIG_AV1_HIGHBITDEPTH=0"
+ else
+ elog "Invalid preset"
+ return 1
+ fi
+ if ! eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" \
+ ${devnull}; then
+ elog "cmake failure"
+ return 1
+ fi
+ if ! eval make -j$(nproc) aomenc ${devnull}; then
+ elog "build failure"
+ return 1
+ fi
+
+ mv aomenc aomenc_${preset}
+ done
+ echo "Done building target: ${target}"
+}
+
+# Compares the encode produced by the generic (C-only) build with the one
+# produced by the given target for the same clip, preset, bitrate and speed.
+#   $1: target  $2: cpu-used value  $3: clip name  $4: bitrate  $5: preset
+# Logs to stderr and returns 1 when diff exits non-zero, i.e. the files differ
+# or one of them cannot be read.
+compare_enc_output() {
+ local target=$1
+ local cpu=$2
+ local clip=$3
+ local bitrate=$4
+ local preset=$5
+ if ! diff -q ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+ ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then
+ elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset"
+ return 1
+ fi
+}
+
+av1_enc_test() {
+ local encoder="$1"
+ local arch="$2"
+ local target="$3"
+ local preset="$4"
+ if [ -z "$(av1_enc_tool_path "${target}" "${preset}")" ]; then
+ elog "aomenc_{preset} not found. It must exist in ${AOM_TEST_OUTPUT_DIR}/build_target_${target} path"
+ return 1
+ fi
+
+ if [ "${preset}" = "good" ]; then
+ if [ "${arch}" = "x86_64" ]; then
+ local min_cpu_used=0
+ local max_cpu_used=6
+ elif [ "${arch}" = "x86" ]; then
+ local min_cpu_used=2
+ local max_cpu_used=3
+ fi
+ local test_params=av1_encode_good_params
+ elif [ "${preset}" = "rt" ]; then
+ local min_cpu_used=5
+ local max_cpu_used=11
+ local test_params=av1_encode_rt_params
+ else
+ elog "Invalid preset"
+ return 1
+ fi
+
+ for cpu in $(seq $min_cpu_used $max_cpu_used); do
+ if [ "${preset}" = "good" ]; then
+ if [ "${arch}" = "x86_64" ]; then
+ if [ "${cpu}" -lt 2 ]; then
+ local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}"
+ elif [ "${cpu}" -lt 5 ]; then
+ local test_clips="${LOWBD_480p_CLIP} ${HIGHBD_CLIP}"
+ else
+ local test_clips="${LOWBD_720p_CLIP} ${HIGHBD_CLIP}"
+ fi
+ elif [ "${arch}" = "x86" ]; then
+ local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}"
+ elif [ "${arch}" = "arm64" ]; then
+ local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}"
+ fi
+ elif [ "${preset}" = "rt" ]; then
+ if [ "${cpu}" -lt 8 ]; then
+ local test_clips="${LOWBD_CIF_CLIP} ${SC_CLIP}"
+ else
+ local test_clips="${LOWBD_480p_CLIP} ${SC_CLIP}"
+ fi
+ else
+ elog "Invalid preset"
+ return 1
+ fi
+
+ for clip in ${test_clips}; do
+ local test_bitrates=$(get_bitrates ${clip} ${preset})
+ for bitrate in ${test_bitrates}; do
+ eval "${encoder}" $($clip) $($test_params) \
+ "--limit=${AV1_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \
+ "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \
+ ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+ ${devnull}
+
+ if [ "${target}" != "generic" ]; then
+ if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then
+ # Found a mismatch
+ return 1
+ fi
+ fi
+ done
+ done
+ done
+}
+
+# Builds the C-only ("generic") encoders for the given arch and runs the
+# encodes whose outputs compare_enc_output later uses as the reference.
+#   $1: arch, "x86_64" or "x86"
+# Returns 1 when the build fails.
+# NOTE(review): the status of the av1_enc_test calls is not checked inside the
+# loop, and cmake_command stays unset for any other arch value.
+av1_test_generic() {
+ local arch=$1
+ local target="generic"
+ if [ $arch = "x86_64" ]; then
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR -DAOM_TARGET_CPU=${target}"
+ elif [ $arch = "x86" ]; then
+ # As AV1 encode output differs for x86 32-bit and 64-bit platforms
+ # (BUG=aomedia:3479), the x86 32-bit C-only build is generated separately.
+ # The cmake command line option -DENABLE_MMX=0 flag disables all SIMD
+ # optimizations, and generates a C-only binary.
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR -DENABLE_MMX=0 \
+ -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake"
+ fi
+
+ echo "Build for: Generic ${arch}"
+ if ! av1_enc_build "${target}" "${cmake_command}"; then
+ return 1
+ fi
+
+ for preset in $PRESETS; do
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ av1_enc_test $encoder "${arch}" "${target}" "${preset}"
+ done
+}
+
+# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, AVX, AVX2 as there are
+# no functions with MMX, SSE and AVX512 specialization.
+# The value of environment variable 'AOM_SIMD_CAPS_MASK' controls enabling of different instruction
+# set extension optimizations. The value of the flag 'AOM_SIMD_CAPS_MASK' and the corresponding
+# instruction set extension optimization enabled are as follows:
+# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants
+# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants
+# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants
+# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants
+# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants
+# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants
+# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants
+# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants
+# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX
+## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "AOM_SIMD_CAPS_MASK" as
+# all x86_64 platforms implement sse2.
+#
+# Builds the SIMD-enabled encoder for the given x86 arch and, for every preset
+# and every listed instruction set that the host reports, runs the encodes with
+# AOM_SIMD_CAPS_MASK exported to the matching mask.
+#   $1: arch, "x86" or "x86_64"
+# Returns 0 without testing on a non-x86 host, 1 on build failure or mismatch.
+av1_test_x86() {
+ local arch=$1
+
+ if ! uname -m | grep -q "x86"; then
+ elog "Machine architecture is not x86 or x86_64"
+ return 0
+ fi
+
+ if [ $arch = "x86" ]; then
+ local target="x86-linux"
+ local cmake_command="cmake \
+ $LIBAOM_SOURCE_DIR \
+ -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake"
+ elif [ $arch = "x86_64" ]; then
+ local target="x86_64-linux"
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR"
+ fi
+
+ # Available x86 isa variants: "avx2 avx sse4_1 ssse3 sse3 sse2"
+ local x86_isa_variants="avx2 sse4_1 sse2"
+
+ echo "Build for x86: ${target}"
+ if ! av1_enc_build "${target}" "${cmake_command}"; then
+ return 1
+ fi
+
+ for preset in $PRESETS; do
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ for isa in $x86_isa_variants; do
+ # Note that if has_x86_isa_extn returns 1, it is false, and vice versa.
+ if ! has_x86_isa_extn $isa; then
+ echo "${isa} is not supported in this machine"
+ continue
+ fi
+ # $($isa) calls the function named after the instruction set, which prints
+ # the mask value.
+ export AOM_SIMD_CAPS_MASK=$($isa)
+ if ! av1_enc_test $encoder "${arch}" "${target}" "${preset}"; then
+ # Found a mismatch
+ return 1
+ fi
+ unset AOM_SIMD_CAPS_MASK
+ done
+ done
+}
+
+# Cross-builds the arm64 encoder with the arm64-linux-gcc toolchain file and
+# runs the encodes for every preset through qemu-aarch64 user-mode emulation.
+# Returns 1 on build failure or when av1_enc_test reports a failure.
+av1_test_arm() {
+ local arch="arm64"
+ local target="arm64-linux-gcc"
+ local cmake_command="cmake $LIBAOM_SOURCE_DIR \
+ -DCMAKE_TOOLCHAIN_FILE=$LIBAOM_SOURCE_DIR/build/cmake/toolchains/${target}.cmake \
+ -DCMAKE_C_FLAGS=-Wno-maybe-uninitialized"
+ echo "Build for arm64: ${target}"
+ if ! av1_enc_build "${target}" "${cmake_command}"; then
+ return 1
+ fi
+
+ for preset in $PRESETS; do
+ local encoder="$(av1_enc_tool_path "${target}" "${preset}")"
+ # The whole emulator command line is passed as the encoder; av1_enc_test
+ # runs it through eval.
+ if ! av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${arch}" "${target}" "${preset}"; then
+ # Found a mismatch
+ return 1
+ fi
+ done
+}
+
+# Top-level test: runs the C vs SIMD comparison for 32-bit x86, then for
+# x86_64 when the host is x86_64, then for arm64. Stops with status 1 at the
+# first stage that reports a failure.
+av1_c_vs_simd_enc_test () {
+  # 32-bit x86 stage. Building it needs the i686-linux-gnu toolchain:
+  #   $ sudo apt-get install g++-i686-linux-gnu
+  echo "av1 test for x86 (32 bit): Started."
+  # Reference encodes from the C-only build.
+  av1_test_generic "x86"
+  # Encodes with SIMD optimizations enabled, compared with the reference.
+  if ! av1_test_x86 "x86"; then
+    echo "av1 test for x86 (32 bit): Done, test failed."
+    return 1
+  fi
+  echo "av1 test for x86 (32 bit): Done, all tests passed."
+
+  # 64-bit x86 stage, only on an x86_64 host.
+  local host_machine="$(uname -m)"
+  if [ "${host_machine}" = "x86_64" ]; then
+    echo "av1 test for x86_64 (64 bit): Started."
+    # Reference encodes from the C-only build.
+    av1_test_generic "x86_64"
+    # Encodes with SIMD optimizations enabled, compared with the reference.
+    if ! av1_test_x86 "x86_64"; then
+      echo "av1 test for x86_64 (64 bit): Done, test failed."
+      return 1
+    fi
+    echo "av1 test for x86_64 (64 bit): Done, all tests passed."
+  fi
+
+  # arm64 stage.
+  echo "av1_test_arm: Started."
+  if ! av1_test_arm; then
+    echo "av1 test for arm: Done, test failed."
+    return 1
+  fi
+  echo "av1 test for arm: Done, all tests passed."
+}
+
+run_tests av1_c_vs_simd_enc_verify_environment av1_c_vs_simd_enc_test
diff --git a/third_party/aom/test/av1_common_int_test.cc b/third_party/aom/test/av1_common_int_test.cc
new file mode 100644
index 0000000000..dde2542e3d
--- /dev/null
+++ b/third_party/aom/test/av1_common_int_test.cc
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/common/av1_common_int.h"
+
+// Round-trips every transform size through get_tx_size(): looking a size up by
+// its width and height must yield a size with the same width and height.
+TEST(AV1CommonInt, TestGetTxSize) {
+ for (int t = TX_4X4; t < TX_SIZES_ALL; t++) {
+ TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]);
+ GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]);
+ GTEST_ASSERT_EQ(tx_size_high[t], tx_size_high[t2]);
+ }
+}
diff --git a/third_party/aom/test/av1_config_test.cc b/third_party/aom/test/av1_config_test.cc
new file mode 100644
index 0000000000..3ff816c163
--- /dev/null
+++ b/third_party/aom/test/av1_config_test.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <string.h>
+
+#include "common/av1_config.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+//
+// Input buffers containing exactly one Sequence Header OBU.
+//
+// Each buffer is named according to the OBU storage format (Annex-B vs Low
+// Overhead Bitstream Format) and the type of Sequence Header OBU ("Full"
+// Sequence Header OBUs vs Sequence Header OBUs with the
+// reduced_still_image_flag set).
+//
+const uint8_t kAnnexBFullSequenceHeaderObu[] = { 0x0c, 0x08, 0x00, 0x00, 0x00,
+ 0x04, 0x45, 0x7e, 0x3e, 0xff,
+ 0xfc, 0xc0, 0x20 };
+const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = {
+ 0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
+};
+
+const uint8_t kLobfFullSequenceHeaderObu[] = { 0x0a, 0x0b, 0x00, 0x00, 0x00,
+ 0x04, 0x45, 0x7e, 0x3e, 0xff,
+ 0xfc, 0xc0, 0x20 };
+
+const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = { 0x0a, 0x07, 0x18,
+ 0x22, 0x2b, 0xf1,
+ 0xfe, 0xc0, 0x20 };
+
+const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 };
+
+// The size of AV1 config when no configOBUs are present at the end of the
+// configuration structure.
+const size_t kAv1cNoConfigObusSize = 4;
+
+// Parses |obu_buffer| as a Sequence Header OBU (Annex-B framing when
+// |is_annexb| is true) and checks every field of the resulting Av1Config
+// against the values expected for the test buffers in this file.
+// Returns true only when parsing succeeded and the running test has recorded
+// no failure.
+bool VerifyAv1c(const uint8_t *const obu_buffer, size_t obu_buffer_length,
+                bool is_annexb) {
+  Av1Config av1_config;
+  memset(&av1_config, 0, sizeof(av1_config));
+
+  const bool parse_ok = get_av1config_from_obu(obu_buffer, obu_buffer_length,
+                                               is_annexb, &av1_config) == 0;
+  // Nothing to verify when the parser rejected the input.
+  if (!parse_ok) return false;
+
+  EXPECT_EQ(1, av1_config.marker);
+  EXPECT_EQ(1, av1_config.version);
+  EXPECT_EQ(0, av1_config.seq_profile);
+  EXPECT_EQ(0, av1_config.seq_level_idx_0);
+  EXPECT_EQ(0, av1_config.seq_tier_0);
+  EXPECT_EQ(0, av1_config.high_bitdepth);
+  EXPECT_EQ(0, av1_config.twelve_bit);
+  EXPECT_EQ(0, av1_config.monochrome);
+  EXPECT_EQ(1, av1_config.chroma_subsampling_x);
+  EXPECT_EQ(1, av1_config.chroma_subsampling_y);
+  EXPECT_EQ(0, av1_config.chroma_sample_position);
+  EXPECT_EQ(0, av1_config.initial_presentation_delay_present);
+  EXPECT_EQ(0, av1_config.initial_presentation_delay_minus_one);
+
+  return !::testing::Test::HasFailure();
+}
+
+// get_av1config_from_obu() must return -1 whenever the input buffer is null,
+// the buffer length is zero, or the output config pointer is null. Each call
+// below violates at least one of those conditions.
+TEST(Av1Config, ObuInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ ASSERT_EQ(-1, get_av1config_from_obu(nullptr, 0, 0, nullptr));
+ ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0,
+ nullptr));
+ ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0],
+ sizeof(kLobfFullSequenceHeaderObu), 0,
+ nullptr));
+ ASSERT_EQ(-1, get_av1config_from_obu(
+ nullptr, sizeof(kLobfFullSequenceHeaderObu), 0, nullptr));
+ ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0,
+ &av1_config));
+}
+
+// read_av1config() must return -1 whenever the input buffer is null, the
+// buffer size is zero, or an output pointer is null. Each call below violates
+// at least one of those conditions.
+TEST(Av1Config, ReadInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_read = 0;
+ ASSERT_EQ(-1, read_av1config(nullptr, 0, nullptr, nullptr));
+ ASSERT_EQ(-1, read_av1config(nullptr, 4, nullptr, nullptr));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 0, nullptr, nullptr));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 4, &bytes_read, nullptr));
+ ASSERT_EQ(-1, read_av1config(nullptr, 4, &bytes_read, &av1_config));
+}
+
+// write_av1config() must return -1 whenever the config is null, the output
+// capacity is zero, or an output pointer is null. Each call below violates at
+// least one of those conditions.
+TEST(Av1Config, WriteInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(-1, write_av1config(nullptr, 0, nullptr, nullptr));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, nullptr, nullptr));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, &bytes_written, nullptr));
+
+ // A valid destination buffer is still rejected when the capacity is zero.
+ ASSERT_EQ(-1,
+ write_av1config(&av1_config, 0, &bytes_written, &av1c_buffer[0]));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 4, &bytes_written, nullptr));
+}
+
+// Parses both Low Overhead Bitstream Format test buffers (is_annexb = false)
+// and checks the resulting config through VerifyAv1c().
+TEST(Av1Config, GetAv1ConfigFromLobfObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+ // unset-- aka a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfFullSequenceHeaderObu,
+ sizeof(kLobfFullSequenceHeaderObu), false));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfReducedStillImageSequenceHeaderObu,
+ sizeof(kLobfReducedStillImageSequenceHeaderObu),
+ false));
+}
+
+// Parses both Annex-B test buffers (is_annexb = true) and checks the resulting
+// config through VerifyAv1c().
+TEST(Av1Config, GetAv1ConfigFromAnnexBObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+ // unset-- aka a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBFullSequenceHeaderObu,
+ sizeof(kAnnexBFullSequenceHeaderObu), true));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBReducedStillImageSequenceHeaderObu,
+ sizeof(kAnnexBReducedStillImageSequenceHeaderObu),
+ true));
+}
+
+// Round trip: a zero-initialized Av1Config must serialize to the four zero
+// bytes of kAv1cAllZero, and a config read back from kAv1cAllZero must
+// serialize to the same bytes again.
+TEST(Av1Config, ReadWriteConfig) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+
+ // Test writing out the AV1 config.
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_written);
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+
+ // Test reading the AV1 config.
+ size_t bytes_read = 0;
+ ASSERT_EQ(0, read_av1config(&kAv1cAllZero[0], sizeof(kAv1cAllZero),
+ &bytes_read, &av1_config));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_read);
+ // Write the freshly read config back out and compare it with the input.
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
new file mode 100644
index 0000000000..76cf77ab07
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "av1/common/common_data.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+const int kXStepQn = 16;
+const int kYStepQn = 20;
+
+using libaom_test::ACMRandom;
+using std::make_tuple;
+using std::tuple;
+
+// Supported filter lengths. The enumerators take the values 0, 1 and 2, which
+// NTapsToInt() maps to 8, 10 and 12 taps respectively.
+enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
+int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; }
+
+// A 16-bit filter with a configurable number of taps.
+// set() fills coeffs_ and points params_.filter_ptr at that storage, so
+// params_ is only meaningful after set() has been called.
+class TestFilter {
+ public:
+ // Builds the coefficient table for |ntaps| taps; |backwards| selects the
+ // decreasing ramp instead of the increasing one.
+ void set(NTaps ntaps, bool backwards);
+
+ InterpFilterParams params_;
+
+ private:
+ // Backing storage referenced by params_.filter_ptr.
+ std::vector<int16_t> coeffs_;
+};
+
+// Builds SUBPEL_SHIFTS filters of NTapsToInt(ntaps) taps each in coeffs_ and
+// fills params_ so that it refers to them.
+//   ntaps:     number of taps per filter (8, 10 or 12).
+//   backwards: false for an increasing ramp, true for a decreasing one.
+void TestFilter::set(NTaps ntaps, bool backwards) {
+  const int n = NTapsToInt(ntaps);
+  assert(n >= 8 && n <= 12);
+
+  // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus
+  // elements at the end so that convolutions can read off the end safely.
+  coeffs_.resize(n * SUBPEL_SHIFTS + 8);
+
+  // The coefficients are pretty much arbitrary, but convolutions shouldn't
+  // over or underflow. For the first filter (subpels = 0), we use an
+  // increasing or decreasing ramp (depending on the backwards parameter). We
+  // don't want any zero coefficients, so we make it have an x-intercept at -1
+  // or n. To ensure absence of under/overflow, we normalise the area under the
+  // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function
+  // gives the identity).
+  //
+  // When increasing, the function has the form:
+  //
+  //   f(x) = A * (x + 1)
+  //
+  // Summing and rearranging for A gives A = 2 * I / (n * (n + 1)). If the
+  // filter is reversed, we have the same A but with formula
+  //
+  //   g(x) = A * (n - x)
+  const int I = 1 << FILTER_BITS;
+  const float A = 2.f * I / (n * (n + 1.f));
+  for (int i = 0; i < n; ++i) {
+    coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1)));
+  }
+
+  // For the other filters, make them slightly different by swapping two
+  // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped.
+  //
+  // The byte count must cover all n coefficients. Writing
+  // sizeof(coeffs_[0] * n) instead would measure the promoted int result of
+  // the multiplication, so only the first few bytes of filter 0 would be
+  // copied into each of the remaining filters.
+  const size_t filter_size = sizeof(coeffs_[0]) * n;
+  int16_t *const filter0 = &coeffs_[0];
+  for (int k = 1; k < SUBPEL_SHIFTS; ++k) {
+    int16_t *filterk = &coeffs_[k * n];
+    memcpy(filterk, filter0, filter_size);
+
+    const int idx0 = k % n;
+    const int idx1 = (7 * k) % n;
+
+    const int16_t tmp = filterk[idx0];
+    filterk[idx0] = filterk[idx1];
+    filterk[idx1] = tmp;
+  }
+
+  // Finally, write some rubbish at the end to make sure we don't use it.
+  for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i;
+
+  // Fill in params
+  params_.filter_ptr = &coeffs_[0];
+  params_.taps = n;
+  // These are ignored by the functions being tested. Set them to whatever.
+  params_.interp_filter = EIGHTTAP_REGULAR;
+}
+
+// Source and destination buffers for one convolve test. Every vector holds two
+// consecutive blocks of equal size: the block at offset 0 is handed out when
+// |ref| is true and the block that follows it when |ref| is false. Each block
+// is (h + 2 * kVPad) rows of one stride, i.e. the image plus its borders.
+template <typename SrcPixel>
+class TestImage {
+ public:
+ TestImage(int w, int h, int bd) : w_(w), h_(h), bd_(bd) {
+ assert(bd < 16);
+ assert(bd <= 8 * static_cast<int>(sizeof(SrcPixel)));
+
+ // Pad width by 2*kHPad and then round up to the next multiple of 16
+ // to get src_stride_. Add another 16 for dst_stride_ (to make sure
+ // something goes wrong if we use the wrong one)
+ src_stride_ = (w_ + 2 * kHPad + 15) & ~15;
+ dst_stride_ = src_stride_ + 16;
+
+ // Allocate image data
+ src_data_.resize(2 * src_block_size());
+ dst_data_.resize(2 * dst_block_size());
+ dst_16_data_.resize(2 * dst_block_size());
+ }
+
+ // Fills all three buffers; see the out-of-class definition.
+ void Initialize(ACMRandom *rnd);
+ // Compares the two blocks of each destination buffer.
+ void Check() const;
+
+ int src_stride() const { return src_stride_; }
+ int dst_stride() const { return dst_stride_; }
+
+ // Number of elements (not bytes) in one block, borders included.
+ int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+ int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+ // The accessors below return the start of the selected block when |borders|
+ // is true, and the first pixel inside the borders otherwise.
+ const SrcPixel *GetSrcData(bool ref, bool borders) const {
+ const SrcPixel *block = &src_data_[ref ? 0 : src_block_size()];
+ return borders ? block : block + kHPad + src_stride_ * kVPad;
+ }
+
+ SrcPixel *GetDstData(bool ref, bool borders) {
+ SrcPixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ CONV_BUF_TYPE *GetDst16Data(bool ref, bool borders) {
+ CONV_BUF_TYPE *block = &dst_16_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ private:
+ int w_, h_, bd_;
+ int src_stride_, dst_stride_;
+
+ std::vector<SrcPixel> src_data_;
+ std::vector<SrcPixel> dst_data_;
+ std::vector<CONV_BUF_TYPE> dst_16_data_;
+};
+
+// Fills |num_pixels| elements starting at |data|: with zeros when |trash| is
+// false, otherwise with random values masked to the low |bd| bits.
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+ if (!trash) {
+ memset(data, 0, sizeof(*data) * num_pixels);
+ return;
+ }
+ const Pixel mask = (1 << bd) - 1;
+ for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+// Initializes the two consecutive blocks that start at |data|. The first block
+// gets random |bd|-bit pixels in its w x h interior and borders filled by
+// FillEdge() (random when |trash_edges| is true, zero otherwise); the second
+// block is then made an exact copy of the first.
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+ bool trash_edges, Pixel *data) {
+ assert(rnd);
+ const Pixel mask = (1 << bd) - 1;
+
+ // Fill in the first buffer with random data
+ // Top border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+ for (int r = 0; r < h; ++r) {
+ Pixel *row_data = data + (kVPad + r) * stride;
+ // Left border, contents, right border
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+ for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+ }
+ // Bottom border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+ // block_elts counts elements, block_size counts bytes.
+ const int bpp = sizeof(*data);
+ const int block_elts = stride * (h + 2 * kVPad);
+ const int block_size = bpp * block_elts;
+
+ // Now copy that to the second buffer
+ memcpy(data + block_elts, data, block_size);
+}
+
+// Prepares all three buffers. The source gets zeroed borders, while both
+// destination buffers get random borders (trash_edges = true).
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) {
+ PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]);
+ PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+ PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_16_data_[0]);
+}
+
+// Compares the first and second block of both destination buffers. When a
+// byte-wise comparison finds any difference, every position (borders
+// included) is checked individually so that the failure output names the row
+// and column, given relative to the image interior.
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Check() const {
+  const int block_elems = dst_block_size();
+  const SrcPixel *const ref_dst = &dst_data_[0];
+  const SrcPixel *const tst_dst = &dst_data_[block_elems];
+  const CONV_BUF_TYPE *const ref_16_dst = &dst_16_data_[0];
+  const CONV_BUF_TYPE *const tst_16_dst = &dst_16_data_[block_elems];
+
+  // Fast path: both pairs of blocks are byte-for-byte equal.
+  if (memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * block_elems) == 0 &&
+      memcmp(ref_16_dst, tst_16_dst, sizeof(*ref_16_dst) * block_elems) == 0) {
+    return;
+  }
+
+  // Slow path: walk the buffers (including the edges) to report positions.
+  const int stride = dst_stride_;
+  const int rows = h_ + 2 * kVPad;
+  const int cols = w_ + 2 * kHPad;
+
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
+    }
+  }
+
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      const int32_t ref_value = ref_16_dst[r * stride + c];
+      const int32_t tst_value = tst_16_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error in 16 bit buffer "
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
+    }
+  }
+}
+
+typedef tuple<int, int> BlockDimension;
+
+// Parameters shared by the convolve-scale test variants: the block dimensions
+// (element 0 is read as the width and element 1 as the height), the tap
+// counts of the horizontal and vertical filters, and the averaging flag.
+struct BaseParams {
+ BaseParams(BlockDimension dimensions, NTaps num_taps_x, NTaps num_taps_y,
+ bool average)
+ : dims(dimensions), ntaps_x(num_taps_x), ntaps_y(num_taps_y),
+ avg(average) {}
+
+ BlockDimension dims;
+ NTaps ntaps_x, ntaps_y;
+ bool avg;
+};
+
+template <typename SrcPixel>
+class ConvolveScaleTestBase : public ::testing::Test {
+ public:
+ ConvolveScaleTestBase() : image_(nullptr) {}
+ ~ConvolveScaleTestBase() override { delete image_; }
+
+ // Implemented by subclasses (SetUp depends on the parameters passed
+ // in and RunOne depends on the function to be tested. These can't
+ // be templated for low/high bit depths because they have different
+ // numbers of parameters)
+ void SetUp() override = 0;
+ virtual void RunOne(bool ref) = 0;
+
+ protected:
+ // Caches the test parameters, configures both filters and the convolve
+ // parameters, and replaces image_ with a fresh TestImage of the requested
+ // size and bit depth.
+ void SetParams(const BaseParams &params, int bd) {
+   const BlockDimension &dims = params.dims;
+   width_ = std::get<0>(dims);
+   height_ = std::get<1>(dims);
+   bd_ = bd;
+   avg_ = params.avg;
+
+   ntaps_x_ = params.ntaps_x;
+   filter_x_.set(ntaps_x_, false);
+   ntaps_y_ = params.ntaps_y;
+   filter_y_.set(ntaps_y_, true);
+
+   convolve_params_ =
+       get_conv_params_no_round(avg_ ? 1 : 0, 0, nullptr, 0, 1, bd);
+
+   // Drop any image left over from a previous call before allocating anew.
+   delete image_;
+   image_ = new TestImage<SrcPixel>(width_, height_, bd_);
+   ASSERT_NE(image_, nullptr);
+ }
+
+ // Updates the compound-related fields of convolve_params_. When both |i|
+ // and |j| are -1 the forward/backward offsets are left untouched; otherwise
+ // they are read from quant_dist_lookup_table[j][i] and
+ // quant_dist_lookup_table[j][1 - i].
+ void SetConvParamOffset(int i, int j, int is_compound, int do_average,
+                         int use_dist_wtd_comp_avg) {
+   convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
+   convolve_params_.is_compound = is_compound;
+   convolve_params_.do_average = do_average;
+
+   const bool keep_offsets = (i == -1 && j == -1);
+   if (keep_offsets) return;
+
+   convolve_params_.fwd_offset = quant_dist_lookup_table[j][i];
+   convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i];
+ }
+
+ // Runs kTestIters rounds of reference-vs-test comparisons. Each round
+ // checks the non-compound configuration once and then every compound
+ // combination of do_average, use_dist_wtd_comp_avg and offset-table index.
+ void Run() {
+   ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+   // New random inputs, one reference run, one test run, then compare.
+   const auto prep_run_and_check = [this, &rnd]() {
+     Prep(&rnd);
+     RunOne(true);
+     RunOne(false);
+     image_->Check();
+   };
+
+   for (int iter = 0; iter < kTestIters; ++iter) {
+     // Non-compound case: the offsets are left as they are.
+     SetConvParamOffset(-1, -1, /*is_compound=*/0, /*do_average=*/0,
+                        /*use_dist_wtd_comp_avg=*/0);
+     prep_run_and_check();
+
+     // Compound cases.
+     for (int do_average = 0; do_average < 2; ++do_average) {
+       for (int use_dist_wtd = 0; use_dist_wtd < 2; ++use_dist_wtd) {
+         for (int col = 0; col < 2; ++col) {
+           for (int row = 0; row < 4; ++row) {
+             SetConvParamOffset(col, row, /*is_compound=*/1, do_average,
+                                use_dist_wtd);
+             prep_run_and_check();
+           }
+         }
+       }
+     }
+   }
+ }
+
+ // Times kPerfIters runs of the reference path (RunOne(true)) against the
+ // same number of runs of the function under test (RunOne(false)) on one
+ // randomly prepared image, prints both timings in milliseconds and expects
+ // the function under test to be the faster of the two.
+ void SpeedTest() {
+   ACMRandom rnd(ACMRandom::DeterministicSeed());
+   Prep(&rnd);
+
+   aom_usec_timer ref_timer;
+   aom_usec_timer_start(&ref_timer);
+   for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+   aom_usec_timer_mark(&ref_timer);
+   const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+   aom_usec_timer tst_timer;
+   aom_usec_timer_start(&tst_timer);
+   for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+   aom_usec_timer_mark(&tst_timer);
+   const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+   std::cout << "[ ] C time = " << ref_time / 1000
+             << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+   // The failure text names this test; it previously said "CDEFSpeedTest",
+   // which pointed readers at an unrelated test.
+   EXPECT_GT(ref_time, tst_time)
+       << "Error: ConvolveScale SpeedTest, SIMD slower than C.\n"
+       << "C time: " << ref_time << " us\n"
+       << "SIMD time: " << tst_time << " us\n";
+ }
+
+ // Picks a sub-pixel offset from the low three bits of a random byte:
+ // 0 yields 0, 1 yields SCALE_SUBPEL_SHIFTS - 1, and anything else yields
+ // 1 + PseudoUniform(SCALE_SUBPEL_SHIFTS - 2).
+ static int RandomSubpel(ACMRandom *rnd) {
+   switch (rnd->Rand8() & 7) {
+     case 0: return 0;
+     case 1: return SCALE_SUBPEL_SHIFTS - 1;
+     default: return 1 + rnd->PseudoUniform(SCALE_SUBPEL_SHIFTS - 2);
+   }
+ }
+
+ // Prepares one test iteration: draws new sub-pixel offsets (x first, then
+ // y) and re-initializes the image buffers from |rnd|.
+ void Prep(ACMRandom *rnd) {
+   assert(rnd);
+
+   // Both offsets stay below SCALE_SUBPEL_SHIFTS; RandomSubpel gives extra
+   // weight to the "interesting" values 0 and SCALE_SUBPEL_SHIFTS - 1.
+   subpel_x_ = RandomSubpel(rnd);
+   subpel_y_ = RandomSubpel(rnd);
+
+   image_->Initialize(rnd);
+ }
+
+ int width_, height_, bd_;
+ NTaps ntaps_x_, ntaps_y_;
+ bool avg_;
+ int subpel_x_, subpel_y_;
+ TestFilter filter_x_, filter_y_;
+ TestImage<SrcPixel> *image_;
+ ConvolveParams convolve_params_;
+};
+
+typedef tuple<int, int> BlockDimension;
+
+typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg>
+typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool>
+ LowBDParams;
+
+ // Low bit-depth scaled-convolve test. The parameterized function is compared
+ // against av1_convolve_2d_scale_c with the bit depth fixed at 8.
+ class LowBDConvolveScaleTest
+     : public ConvolveScaleTestBase<uint8_t>,
+       public ::testing::WithParamInterface<LowBDParams> {
+  public:
+   ~LowBDConvolveScaleTest() override = default;
+
+   // Unpacks <tst_fun, dims, ntaps_x, ntaps_y, avg> from the test parameter.
+   void SetUp() override {
+     constexpr int kBitDepth = 8;
+     tst_fun_ = GET_PARAM(0);
+
+     const BaseParams base(GET_PARAM(1), GET_PARAM(2), GET_PARAM(3),
+                           GET_PARAM(4));
+     SetParams(base, kBitDepth);
+   }
+
+   // Runs the C reference when |ref| is true, otherwise the function under
+   // test, on the buffers image_ hands out for that choice.
+   void RunOne(bool ref) override {
+     const uint8_t *src = image_->GetSrcData(ref, false);
+     uint8_t *dst = image_->GetDstData(ref, false);
+     convolve_params_.dst = image_->GetDst16Data(ref, false);
+     const int src_stride = image_->src_stride();
+     const int dst_stride = image_->dst_stride();
+
+     const LowbdConvolveFunc convolve =
+         ref ? av1_convolve_2d_scale_c : tst_fun_;
+     convolve(src, src_stride, dst, dst_stride, width_, height_,
+              &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+              subpel_y_, kYStepQn, &convolve_params_);
+   }
+
+  private:
+   LowbdConvolveFunc tst_fun_;
+ };
+
+const BlockDimension kBlockDim[] = {
+ make_tuple(2, 2), make_tuple(2, 4), make_tuple(4, 4),
+ make_tuple(4, 8), make_tuple(8, 4), make_tuple(8, 8),
+ make_tuple(8, 16), make_tuple(16, 8), make_tuple(16, 16),
+ make_tuple(16, 32), make_tuple(32, 16), make_tuple(32, 32),
+ make_tuple(32, 64), make_tuple(64, 32), make_tuple(64, 64),
+ make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128),
+};
+
+const NTaps kNTaps[] = { EIGHT_TAP };
+
+TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
+TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, LowBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_convolve_2d_scale_c),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool()));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, LowBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool()));
+#endif // HAVE_SSE4_1
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+ HighBDParams;
+
+ // High bit-depth scaled-convolve test. The parameterized function is
+ // compared against av1_highbd_convolve_2d_scale_c at the parameterized bit
+ // depth.
+ class HighBDConvolveScaleTest
+     : public ConvolveScaleTestBase<uint16_t>,
+       public ::testing::WithParamInterface<HighBDParams> {
+  public:
+   ~HighBDConvolveScaleTest() override = default;
+
+   // Unpacks <tst_fun, dims, ntaps_x, ntaps_y, avg, bd> from the test
+   // parameter.
+   void SetUp() override {
+     tst_fun_ = GET_PARAM(0);
+
+     const BaseParams base(GET_PARAM(1), GET_PARAM(2), GET_PARAM(3),
+                           GET_PARAM(4));
+     const int bit_depth = GET_PARAM(5);
+     SetParams(base, bit_depth);
+   }
+
+   // Runs the C reference when |ref| is true, otherwise the function under
+   // test, on the buffers image_ hands out for that choice.
+   void RunOne(bool ref) override {
+     const uint16_t *src = image_->GetSrcData(ref, false);
+     uint16_t *dst = image_->GetDstData(ref, false);
+     convolve_params_.dst = image_->GetDst16Data(ref, false);
+     const int src_stride = image_->src_stride();
+     const int dst_stride = image_->dst_stride();
+
+     const HighbdConvolveFunc convolve =
+         ref ? av1_highbd_convolve_2d_scale_c : tst_fun_;
+     convolve(src, src_stride, dst, dst_stride, width_, height_,
+              &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+              subpel_y_, kYStepQn, &convolve_params_, bd_);
+   }
+
+  private:
+   HighbdConvolveFunc tst_fun_;
+ };
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_c),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_neon),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+
+#endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/av1_convolve_test.cc b/third_party/aom/test/av1_convolve_test.cc
new file mode 100644
index 0000000000..5bbac21803
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_test.cc
@@ -0,0 +1,2447 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+#include <set>
+#include <vector>
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "aom_ports/aom_timer.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// TODO(any): Remove following INTERP_FILTERS_ALL define, so that 12-tap filter
+// is tested once 12-tap filter SIMD is done.
+#undef INTERP_FILTERS_ALL
+#define INTERP_FILTERS_ALL 4
+
+// All single reference convolve tests are parameterized on block size,
+// bit-depth, and function to test.
+//
+// Note that parameterizing on these variables (and not other parameters) is
+// a conscious decision - Jenkins needs some degree of parallelization to run
+// the tests within the time limit, but if the number of parameters increases
+// too much, the gtest framework does not handle it well (increased overhead per
+// test, huge amount of output to stdout, etc.).
+//
+// Also note that the test suites must be named with the architecture, e.g.,
+// C, C_X, AVX2_X, ... The test suite that runs on Jenkins sometimes runs tests
+// that cannot deal with intrinsics (e.g., the Valgrind tests on 32-bit x86
+// binaries) and will disable tests using a filter like
+// --gtest_filter=-:SSE4_1.*. If the test suites are not named this way, the
+// testing infrastructure will not selectively filter them properly.
+ // Width/height pair describing a block. Ordered by width first and height
+ // second so that it can be stored in ordered containers.
+ class BlockSize {
+  public:
+   BlockSize(int w, int h) : width_(w), height_(h) {}
+
+   int Width() const { return width_; }
+   int Height() const { return height_; }
+
+   // Lexicographic comparison on (width, height).
+   bool operator<(const BlockSize &other) const {
+     if (width_ != other.width_) return width_ < other.width_;
+     return height_ < other.height_;
+   }
+
+   bool operator==(const BlockSize &other) const {
+     const bool same_width = width_ == other.width_;
+     const bool same_height = height_ == other.height_;
+     return same_width && same_height;
+   }
+
+  private:
+   int width_;
+   int height_;
+ };
+
+ // Value type used to parameterize the tests: a block size, a bit depth and
+ // the function under test (of type T).
+ template <typename T>
+ class TestParam {
+  public:
+   TestParam(const BlockSize &block, int bd, T test_func)
+       : block_(block), bd_(bd), test_func_(test_func) {}
+
+   const BlockSize &Block() const { return block_; }
+   int BitDepth() const { return bd_; }
+   T TestFunction() const { return test_func_; }
+
+   // Two parameters are equal when block, bit depth and function all match.
+   bool operator==(const TestParam &other) const {
+     if (!(Block() == other.Block())) return false;
+     if (BitDepth() != other.BitDepth()) return false;
+     return TestFunction() == other.TestFunction();
+   }
+
+  private:
+   BlockSize block_;
+   int bd_;
+   T test_func_;
+ };
+
+ // Streams the block width, block height and bit depth of |test_arg|; the
+ // test function itself is not printed.
+ template <typename T>
+ std::ostream &operator<<(std::ostream &os, const TestParam<T> &test_arg) {
+   const BlockSize &block = test_arg.Block();
+   os << "TestParam { width:" << block.Width();
+   os << " height:" << block.Height();
+   os << " bd:" << test_arg.BitDepth() << " }";
+   return os;
+ }
+
+ // Builds the parameter list for every block width / height that needs to be
+ // tested (luma sizes plus the smaller chroma sizes) crossed with each of the
+ // given bit depths. All generated parameters share the same test function.
+ template <typename T>
+ std::vector<TestParam<T>> GetTestParams(std::initializer_list<int> bit_depths,
+                                         T test_func) {
+   // A set removes duplicate sizes and fixes the iteration order.
+   std::set<BlockSize> unique_sizes;
+   for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+     const BlockSize luma(block_size_wide[bsize], block_size_high[bsize]);
+     unique_sizes.insert(luma);
+     // Blocks with a side of 4 also contribute the half-size chroma block.
+     if (luma.Width() == 4 || luma.Height() == 4) {
+       unique_sizes.insert(BlockSize(luma.Width() / 2, luma.Height() / 2));
+     }
+   }
+
+   std::vector<TestParam<T>> params;
+   params.reserve(unique_sizes.size() * bit_depths.size());
+   for (const BlockSize &block : unique_sizes) {
+     for (const int bd : bit_depths) {
+       params.emplace_back(block, bd, test_func);
+     }
+   }
+   return params;
+ }
+
+ // Parameter list for low bit-depth tests: every block size at 8 bits.
+ template <typename T>
+ std::vector<TestParam<T>> GetLowbdTestParams(T test_func) {
+   std::vector<TestParam<T>> params = GetTestParams({ 8 }, test_func);
+   return params;
+ }
+
+ // Wraps the low bit-depth parameter list in a gtest parameter generator.
+ template <typename T>
+ ::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdParams(
+     T test_func) {
+   const std::vector<TestParam<T>> params = GetLowbdTestParams(test_func);
+   return ::testing::ValuesIn(params);
+ }
+
+// Test the test-parameters generators work as expected.
+class AV1ConvolveParametersTest : public ::testing::Test {};
+
+ // The low bit-depth generator must yield 27 parameters, all 8-bit and all
+ // carrying the function that was passed in.
+ TEST_F(AV1ConvolveParametersTest, GetLowbdTestParams) {
+   const auto v = GetLowbdTestParams(av1_convolve_x_sr_c);
+   ASSERT_EQ(27U, v.size());
+   for (const auto &p : v) {
+     ASSERT_EQ(8, p.BitDepth());
+     // Compare into a bool first (instead of using ASSERT_EQ directly):
+     // gtest has no built-in printing for arbitrary functions, which would
+     // cause a compilation error.
+     const bool same_fn = av1_convolve_x_sr_c == p.TestFunction();
+     ASSERT_TRUE(same_fn);
+   }
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Parameter list for high bit-depth tests: every block size at 10 and 12
+ // bits.
+ template <typename T>
+ std::vector<TestParam<T>> GetHighbdTestParams(T test_func) {
+   std::vector<TestParam<T>> params = GetTestParams({ 10, 12 }, test_func);
+   return params;
+ }
+
+ // Wraps the high bit-depth parameter list in a gtest parameter generator.
+ template <typename T>
+ ::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdParams(
+     T test_func) {
+   const std::vector<TestParam<T>> params = GetHighbdTestParams(test_func);
+   return ::testing::ValuesIn(params);
+ }
+
+ // The high bit-depth generator must yield 54 parameters, split evenly
+ // between 10-bit and 12-bit, all carrying the function that was passed in.
+ TEST_F(AV1ConvolveParametersTest, GetHighbdTestParams) {
+   const auto v = GetHighbdTestParams(av1_highbd_convolve_x_sr_c);
+   ASSERT_EQ(54U, v.size());
+
+   int num_10 = 0;
+   int num_12 = 0;
+   for (const auto &p : v) {
+     ASSERT_TRUE(p.BitDepth() == 10 || p.BitDepth() == 12);
+     const bool same_fn = av1_highbd_convolve_x_sr_c == p.TestFunction();
+     ASSERT_TRUE(same_fn);
+
+     int &counter = (p.BitDepth() == 10) ? num_10 : num_12;
+     ++counter;
+   }
+   ASSERT_EQ(num_10, num_12);
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// AV1ConvolveTest is the base class that all convolve tests should derive from.
+// It provides storage/methods for generating randomized buffers for both
+// low bit-depth and high bit-depth, and setup/teardown methods for clearing
+// system state. Implementors can get the bit-depth / block-size /
+// test function by calling GetParam().
+template <typename T>
+class AV1ConvolveTest : public ::testing::TestWithParam<TestParam<T>> {
+ public:
+ ~AV1ConvolveTest() override = default;
+
+ void SetUp() override {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ }
+
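+ // Total padding, in pixels, added to both the width and the height of the
+ // randomized input buffers. The *RandomInput* helpers below randomize a
+ // padded buffer and return a pointer into it; that pointer is safe to use
+ // with an 8-tap filter. The stride can range from width to
+ // (width + kInputPadding). Also note that each helper always returns a
+ // pointer to the same memory location.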
+ static constexpr int kInputPadding = 12;
+
+ // Get a pointer to a buffer with stride == width. Note that we must have
+ // the test param passed in explicitly -- the gtest framework does not
+ // support calling GetParam() within a templatized class.
+ // Note that FirstRandomInput8 always returns the same pointer -- if two
+ // inputs are needed, also use SecondRandomInput8.
+ const uint8_t *FirstRandomInput8(const TestParam<T> &param) {
+ // Note we can't call GetParam() directly -- gtest does not support
+ // this for parameterized types.
+ return RandomInput8(input8_1_, param);
+ }
+
+ const uint8_t *SecondRandomInput8(const TestParam<T> &param) {
+ return RandomInput8(input8_2_, param);
+ }
+
+ // Some of the intrinsics perform writes in 32 byte chunks. Moreover, some
+ // of the intrinsics assume that the stride is also a multiple of 32.
+ // To satisfy these constraints and also remain simple, output buffer strides
+ // are assumed to be MAX_SB_SIZE.
+ static constexpr int kOutputStride = MAX_SB_SIZE;
+
+ // Asserts that two 8-bit output buffers, both laid out with kOutputStride,
+ // hold the same |width| x |height| block. The first differing pixel is
+ // reported together with its coordinates.
+ void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width,
+                           int height) {
+   ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations";
+   for (int j = 0; j < height; ++j, p1 += kOutputStride, p2 += kOutputStride) {
+     // Rows that compare equal need no per-pixel inspection.
+     if (memcmp(p1, p2, sizeof(*p1) * width) == 0) continue;
+
+     // The row differs, so one of these assertions fails and returns.
+     for (int i = 0; i < width; ++i) {
+       ASSERT_EQ(p1[i], p2[i])
+           << width << "x" << height << " Pixel mismatch at (" << i << ", "
+           << j << ")";
+     }
+   }
+ }
+
+ // Asserts that two 16-bit output buffers, both laid out with kOutputStride,
+ // hold the same |width| x |height| block. The first differing pixel is
+ // reported together with its coordinates.
+ void AssertOutputBufferEq(const uint16_t *p1, const uint16_t *p2, int width,
+                           int height) {
+   ASSERT_TRUE(p1 != p2) << "Buffers must be in different memory locations";
+   for (int j = 0; j < height; ++j, p1 += kOutputStride, p2 += kOutputStride) {
+     // Rows that compare equal need no per-pixel inspection.
+     if (memcmp(p1, p2, sizeof(*p1) * width) == 0) continue;
+
+     // The row differs, so one of these assertions fails and returns.
+     for (int i = 0; i < width; ++i) {
+       ASSERT_EQ(p1[i], p2[i])
+           << width << "x" << height << " Pixel mismatch at (" << i << ", "
+           << j << ")";
+     }
+   }
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Note that the randomized values are capped by bit-depth.
+ const uint16_t *FirstRandomInput16(const TestParam<T> &param) {
+ return RandomInput16(input16_1_, param);
+ }
+
+ const uint16_t *SecondRandomInput16(const TestParam<T> &param) {
+ return RandomInput16(input16_2_, param);
+ }
+#endif
+
+ private:
+ // Randomizes the first (width + kInputPadding) * (height + kInputPadding)
+ // bytes of |p| and returns a pointer kInputPadding / 2 rows and columns
+ // into that padded block. Only valid for 8-bit parameters.
+ const uint8_t *RandomInput8(uint8_t *p, const TestParam<T> &param) {
+   EXPECT_EQ(8, param.BitDepth());
+   EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+   EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+
+   const int padded_width = param.Block().Width() + kInputPadding;
+   const int padded_height = param.Block().Height() + kInputPadding;
+   Randomize(p, padded_width * padded_height);
+
+   const int border = kInputPadding / 2;
+   return p + border * padded_width + border;
+ }
+
+ // Fills |size| bytes starting at |p| with random 8-bit values from rnd_.
+ void Randomize(uint8_t *p, int size) {
+   uint8_t *const end = p + size;
+   for (uint8_t *cur = p; cur < end; ++cur) {
+     *cur = rnd_.Rand8();
+   }
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Randomizes the first (width + kInputPadding) * (height + kInputPadding)
+ // elements of |p|, capped to the parameter's bit depth, and returns a
+ // pointer kInputPadding / 2 rows and columns into that padded block.
+ const uint16_t *RandomInput16(uint16_t *p, const TestParam<T> &param) {
+   // Check that this is only called with high bit-depths.
+   EXPECT_TRUE(param.BitDepth() == 10 || param.BitDepth() == 12);
+   EXPECT_GE(MAX_SB_SIZE, param.Block().Width());
+   EXPECT_GE(MAX_SB_SIZE, param.Block().Height());
+
+   const int padded_width = param.Block().Width() + kInputPadding;
+   const int padded_height = param.Block().Height() + kInputPadding;
+   Randomize(p, padded_width * padded_height, param.BitDepth());
+
+   const int border = kInputPadding / 2;
+   return p + border * padded_width + border;
+ }
+
+ // Fills |size| elements starting at |p| with random values from rnd_ that
+ // keep only the low |bit_depth| bits.
+ void Randomize(uint16_t *p, int size, int bit_depth) {
+   const int mask = (1 << bit_depth) - 1;
+   uint16_t *const end = p + size;
+   for (uint16_t *cur = p; cur < end; ++cur) {
+     *cur = rnd_.Rand16() & mask;
+   }
+ }
+#endif
+
+ static constexpr int kInputStride = MAX_SB_SIZE + kInputPadding;
+
+ libaom_test::ACMRandom rnd_;
+ // Statically allocate all the memory that is needed for the tests. Note
+ // that we cannot allocate output memory here. It must use DECLARE_ALIGNED,
+ // which is a C99 feature and interacts badly with C++ member variables.
+ uint8_t input8_1_[kInputStride * kInputStride];
+ uint8_t input8_2_[kInputStride * kInputStride];
+#if CONFIG_AV1_HIGHBITDEPTH
+ uint16_t input16_1_[kInputStride * kInputStride];
+ uint16_t input16_2_[kInputStride * kInputStride];
+#endif
+};
+
+////////////////////////////////////////////////////////
+// Single reference convolve-x functions (low bit-depth)
+////////////////////////////////////////////////////////
+typedef void (*convolve_x_func)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params);
+
+class AV1ConvolveXTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+ // Checks every sub-pixel position in [0, 16) against every filter from
+ // EIGHTTAP_REGULAR through INTERP_FILTERS_ALL (inclusive).
+ void RunTest() {
+   constexpr int kNumSubpelPositions = 16;
+   for (int sub_x = 0; sub_x < kNumSubpelPositions; ++sub_x) {
+     int filter = EIGHTTAP_REGULAR;
+     while (filter <= INTERP_FILTERS_ALL) {
+       TestConvolve(sub_x, static_cast<InterpFilter>(filter));
+       ++filter;
+     }
+   }
+ }
+
+ public:
+ // Times every filter from EIGHTTAP_REGULAR through INTERP_FILTERS_ALL
+ // (inclusive) using 10000 iterations each.
+ void SpeedTest() {
+   constexpr int kNumIters = 10000;
+   int filter = EIGHTTAP_REGULAR;
+   while (filter <= INTERP_FILTERS_ALL) {
+     TestConvolveSpeed(static_cast<InterpFilter>(filter), kNumIters);
+     ++filter;
+   }
+ }
+
+ private:
+ // Runs av1_convolve_x_sr_c and the function under test on the same random
+ // input (input stride == block width) for the given sub-pixel position and
+ // filter, then asserts that both outputs are identical.
+ void TestConvolve(const int sub_x, const InterpFilter filter) {
+   const BlockSize &block = GetParam().Block();
+   const int width = block.Width();
+   const int height = block.Height();
+   const InterpFilterParams *filter_params_x =
+       av1_get_interp_filter_params_with_block_size(filter, width);
+   const uint8_t *input = FirstRandomInput8(GetParam());
+
+   // Reference output.
+   DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+   ConvolveParams ref_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   av1_convolve_x_sr_c(input, width, reference, kOutputStride, width, height,
+                       filter_params_x, sub_x, &ref_params);
+
+   // Output of the function under test.
+   DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+   ConvolveParams tst_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   const convolve_x_func test_func = GetParam().TestFunction();
+   test_func(input, width, test, kOutputStride, width, height, filter_params_x,
+             sub_x, &tst_params);
+
+   AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ private:
+ // Times |num_iters| calls of av1_convolve_x_sr_c and of the function under
+ // test (sub-pixel position 0) and prints both timings and their ratio.
+ void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+   const BlockSize &block = GetParam().Block();
+   const int width = block.Width();
+   const int height = block.Height();
+   const InterpFilterParams *filter_params_x =
+       av1_get_interp_filter_params_with_block_size(filter, width);
+   ConvolveParams ref_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   const uint8_t *input = FirstRandomInput8(GetParam());
+   DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+   // Reference timing.
+   aom_usec_timer timer;
+   aom_usec_timer_start(&timer);
+   for (int iter = 0; iter < num_iters; ++iter) {
+     av1_convolve_x_sr_c(input, width, reference, kOutputStride, width, height,
+                         filter_params_x, 0, &ref_params);
+   }
+   aom_usec_timer_mark(&timer);
+   const double ref_time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   ConvolveParams tst_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   const convolve_x_func test_func = GetParam().TestFunction();
+   DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+
+   // Timing of the function under test; the same timer object is reused.
+   aom_usec_timer_start(&timer);
+   for (int iter = 0; iter < num_iters; ++iter) {
+     test_func(input, width, test, kOutputStride, width, height,
+               filter_params_x, 0, &tst_params);
+   }
+   aom_usec_timer_mark(&timer);
+   const double tst_time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height,
+          ref_time, tst_time, ref_time / tst_time);
+ }
+};
+
+TEST_P(AV1ConvolveXTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_neon));
+#endif
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1ConvolveXTest,
+ BuildLowbdParams(av1_convolve_x_sr_neon_i8mm));
+#endif
+
+////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+class AV1ConvolveXIntraBCTest : public AV1ConvolveTest<convolve_x_func> {
+ public:
+ // Runs av1_convolve_x_sr_intrabc_c and the function under test on the same
+ // random input with the IntraBC filter and asserts that the outputs match.
+ void RunTest() {
+   // IntraBC functions only operate for subpel_x_qn = 8.
+   constexpr int kSubX = 8;
+   const BlockSize &block = GetParam().Block();
+   const int width = block.Width();
+   const int height = block.Height();
+   // Use a stride different from width to avoid potential storing errors that
+   // would go undetected. The input buffer is filled using a padding of 12, so
+   // the stride can be anywhere between width and width + 12.
+   const int input_stride = width + 2;
+   const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+   const uint8_t *input = FirstRandomInput8(GetParam());
+
+   // Reference output.
+   ConvolveParams ref_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+   av1_convolve_x_sr_intrabc_c(input, input_stride, reference, kOutputStride,
+                               width, height, filter_params_x, kSubX,
+                               &ref_params);
+
+   // Output of the function under test.
+   ConvolveParams tst_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   const convolve_x_func test_func = GetParam().TestFunction();
+   DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+   test_func(input, input_stride, test, kOutputStride, width, height,
+             filter_params_x, kSubX, &tst_params);
+
+   AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ // Times kNumIters calls of av1_convolve_x_sr_intrabc_c and of the function
+ // under test for the current block size and prints both timings and their
+ // ratio.
+ void SpeedTest() {
+   constexpr int kNumIters = 10000;
+   // IntraBC functions only operate for subpel_x_qn = 8 (see RunTest), so
+   // time them with that value; this test previously passed 0.
+   constexpr int kSubX = 8;
+   // Only used to label the printed line.
+   const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+   const int width = GetParam().Block().Width();
+   const int height = GetParam().Block().Height();
+   const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+   const uint8_t *input = FirstRandomInput8(GetParam());
+
+   // Reference timing.
+   ConvolveParams conv_params1 =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+   aom_usec_timer timer;
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < kNumIters; ++i) {
+     av1_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride, width,
+                                 height, filter_params_x, kSubX,
+                                 &conv_params1);
+   }
+   aom_usec_timer_mark(&timer);
+   const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   // Timing of the function under test; the same timer object is reused.
+   ConvolveParams conv_params2 =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+   convolve_x_func test_func = GetParam().TestFunction();
+   DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < kNumIters; ++i) {
+     test_func(input, width, test, kOutputStride, width, height,
+               filter_params_x, kSubX, &conv_params2);
+   }
+   aom_usec_timer_mark(&timer);
+   const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+          time2, time1 / time2);
+ }
+};
+
+TEST_P(AV1ConvolveXIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXIntraBCTest,
+ BuildLowbdParams(av1_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXIntraBCTest,
+ BuildLowbdParams(av1_convolve_x_sr_intrabc_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////////////
+typedef void (*highbd_convolve_x_func)(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd);
+
+class AV1ConvolveXHighbdTest : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+ // Checks every sub-pixel position in [0, 16) against every filter from
+ // EIGHTTAP_REGULAR through INTERP_FILTERS_ALL (inclusive).
+ void RunTest() {
+   constexpr int kNumSubpelPositions = 16;
+   for (int sub_x = 0; sub_x < kNumSubpelPositions; ++sub_x) {
+     int filter = EIGHTTAP_REGULAR;
+     while (filter <= INTERP_FILTERS_ALL) {
+       TestConvolve(sub_x, static_cast<InterpFilter>(filter));
+       ++filter;
+     }
+   }
+ }
+
+ public:
+ // Times every filter from EIGHTTAP_REGULAR through INTERP_FILTERS_ALL
+ // (inclusive) using 10000 iterations each.
+ void SpeedTest() {
+   constexpr int kNumIters = 10000;
+   int filter = EIGHTTAP_REGULAR;
+   while (filter <= INTERP_FILTERS_ALL) {
+     TestConvolveSpeed(static_cast<InterpFilter>(filter), kNumIters);
+     ++filter;
+   }
+ }
+
+ private:
+ // Runs av1_highbd_convolve_x_sr_c and the function under test on the same
+ // random input (input stride == block width) for the given sub-pixel
+ // position and filter, then asserts that both outputs are identical.
+ void TestConvolve(const int sub_x, const InterpFilter filter) {
+   const BlockSize &block = GetParam().Block();
+   const int width = block.Width();
+   const int height = block.Height();
+   const int bit_depth = GetParam().BitDepth();
+   const InterpFilterParams *filter_params_x =
+       av1_get_interp_filter_params_with_block_size(filter, width);
+   const uint16_t *input = FirstRandomInput16(GetParam());
+
+   // Reference output.
+   DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+   ConvolveParams ref_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+   av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width,
+                              height, filter_params_x, sub_x, &ref_params,
+                              bit_depth);
+
+   // Output of the function under test.
+   DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+   ConvolveParams tst_params =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+   const highbd_convolve_x_func test_func = GetParam().TestFunction();
+   test_func(input, width, test, kOutputStride, width, height, filter_params_x,
+             sub_x, &tst_params, bit_depth);
+
+   AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ private:
+ // Times |num_iters| calls of av1_highbd_convolve_x_sr_c and of the function
+ // under test (sub-pixel position 0) at the parameterized bit depth and
+ // prints both timings and their ratio.
+ void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+   const int width = GetParam().Block().Width();
+   const int height = GetParam().Block().Height();
+   const int bit_depth = GetParam().BitDepth();
+   const InterpFilterParams *filter_params_x =
+       av1_get_interp_filter_params_with_block_size(filter, width);
+   // Build the convolve parameters for the bit depth actually passed to the
+   // kernels, as TestConvolve does; a hard-coded 8 was used here before.
+   ConvolveParams conv_params1 =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+   const uint16_t *input = FirstRandomInput16(GetParam());
+   DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+
+   // Reference timing.
+   aom_usec_timer timer;
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < num_iters; ++i) {
+     av1_highbd_convolve_x_sr_c(input, width, reference, kOutputStride, width,
+                                height, filter_params_x, 0, &conv_params1,
+                                bit_depth);
+   }
+   aom_usec_timer_mark(&timer);
+   const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   ConvolveParams conv_params2 =
+       get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+   highbd_convolve_x_func test_func = GetParam().TestFunction();
+   DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+
+   // Timing of the function under test; the same timer object is reused.
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < num_iters; ++i) {
+     test_func(input, width, test, kOutputStride, width, height,
+               filter_params_x, 0, &conv_params2, bit_depth);
+   }
+   aom_usec_timer_mark(&timer);
+   const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+   printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+          time2, time1 / time2);
+ }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveXHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveXHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_neon));
+#endif
+
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-x IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+// Fixture for the high bit-depth horizontal IntraBC convolve functions. The
+// function under test is compared with, and timed against,
+// av1_highbd_convolve_x_sr_intrabc_c.
+class AV1ConvolveXHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_x_func> {
+ public:
+  // Runs the reference and the function under test on the same random input
+  // and asserts that both output buffers match.
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8.
+    constexpr int kSubX = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_x_sr_intrabc_c(
+        input, width + 2, reference, kOutputStride, width, height,
+        filter_params_x, kSubX, &conv_params1, bit_depth);
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_x, kSubX, &conv_params2,
+                              bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  // Times kNumIters calls of the reference and of the function under test and
+  // prints both elapsed times plus their ratio (reference / test).
+  void SpeedTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8, so benchmark the
+    // same offset that RunTest() verifies.
+    constexpr int kSubX = 8;
+    constexpr int kNumIters = 10000;
+    // Only used to label the printed result line.
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    // Build the convolve parameters for the bit depth being timed, matching
+    // RunTest(), instead of always assuming 8-bit.
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_x_sr_intrabc_c(input, width, reference, kOutputStride,
+                                         width, height, filter_params_x, kSubX,
+                                         &conv_params1, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    highbd_convolve_x_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, kSubX, &conv_params2, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveXHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveXHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, when HAVE_NEON is non-zero,
+// for the NEON specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveXHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_x_sr_intrabc_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////////////
+// Single reference convolve-y functions (low bit-depth)
+////////////////////////////////////////////////////////
+// Pointer type for the low bit-depth single-reference convolve-y functions
+// exercised by the fixtures that follow.
+using convolve_y_func = void (*)(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_y,
+                                 const int subpel_y_qn);
+
+// Fixture for the low bit-depth vertical convolve functions. The function
+// under test is compared with, and timed against, av1_convolve_y_sr_c.
+class AV1ConvolveYTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+  // Checks sub-pixel offsets 0..15 against each filter value from
+  // EIGHTTAP_REGULAR through INTERP_FILTERS_ALL.
+  void RunTest() {
+    for (int offset = 0; offset < 16; ++offset) {
+      for (int f = EIGHTTAP_REGULAR; f <= INTERP_FILTERS_ALL; ++f) {
+        TestConvolve(offset, static_cast<InterpFilter>(f));
+      }
+    }
+  }
+
+  // Benchmarks each filter value from EIGHTTAP_REGULAR through
+  // INTERP_FILTERS_ALL with 10000 iterations.
+  void SpeedTest() {
+    for (int f = EIGHTTAP_REGULAR; f <= INTERP_FILTERS_ALL; ++f) {
+      TestConvolveSpeed(static_cast<InterpFilter>(f), 10000);
+    }
+  }
+
+ private:
+  // Runs the reference and the function under test once and asserts that the
+  // two output buffers match.
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const InterpFilterParams *vert_filter =
+        av1_get_interp_filter_params_with_block_size(filter, h);
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    av1_convolve_y_sr_c(src, w, expected, kOutputStride, w, h, vert_filter,
+                        sub_y);
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(src, w, actual, kOutputStride, w, h, vert_filter,
+                              sub_y);
+
+    AssertOutputBufferEq(expected, actual, w, h);
+  }
+
+  // Times num_iters calls of the reference and of the function under test
+  // (sub-pixel offset 0) and prints both elapsed times plus their ratio.
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const InterpFilterParams *vert_filter =
+        av1_get_interp_filter_params_with_block_size(filter, h);
+    const uint8_t *src = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+
+    aom_usec_timer stopwatch;
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < num_iters; ++iter) {
+      av1_convolve_y_sr_c(src, w, expected, kOutputStride, w, h, vert_filter,
+                          0);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < num_iters; ++iter) {
+      GetParam().TestFunction()(src, w, actual, kOutputStride, w, h,
+                                vert_filter, 0);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double test_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, w, h, ref_time,
+           test_time, ref_time / test_time);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
+ BuildLowbdParams(av1_convolve_y_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYTest,
+ BuildLowbdParams(av1_convolve_y_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYTest,
+ BuildLowbdParams(av1_convolve_y_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYTest,
+ BuildLowbdParams(av1_convolve_y_sr_neon));
+#endif
+
+////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (low bit-depth)
+////////////////////////////////////////////////////////////////
+
+// Fixture for the low bit-depth vertical IntraBC convolve functions. The
+// function under test is compared with, and timed against,
+// av1_convolve_y_sr_intrabc_c.
+class AV1ConvolveYIntraBCTest : public AV1ConvolveTest<convolve_y_func> {
+ public:
+  // Runs the reference and the function under test on the same random input
+  // and asserts that both output buffers match.
+  void RunTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8.
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_convolve_y_sr_intrabc_c(input, width + 2, reference, kOutputStride,
+                                width, height, filter_params_y, kSubY);
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_y, kSubY);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  // Times kNumIters calls of the reference and of the function under test and
+  // prints both elapsed times plus their ratio (reference / test).
+  void SpeedTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8, so benchmark the
+    // same offset that RunTest() verifies.
+    constexpr int kSubY = 8;
+    constexpr int kNumIters = 10000;
+    // Only used to label the printed result line.
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride, width,
+                                  height, filter_params_y, kSubY);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    convolve_y_func test_func = GetParam().TestFunction();
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, kSubY);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveYIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, when HAVE_NEON is non-zero,
+// for the NEON specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYIntraBCTest,
+ BuildLowbdParams(av1_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYIntraBCTest,
+ BuildLowbdParams(av1_convolve_y_sr_intrabc_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////////////
+// Single reference convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////////////
+// Pointer type for the high bit-depth single-reference convolve-y functions
+// exercised by the fixtures that follow.
+using highbd_convolve_y_func = void (*)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd);
+
+// Fixture for the high bit-depth vertical convolve functions. The function
+// under test is compared with, and timed against, av1_highbd_convolve_y_sr_c.
+class AV1ConvolveYHighbdTest : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+  // Checks sub-pixel offsets 0..15 against each filter value from
+  // EIGHTTAP_REGULAR through INTERP_FILTERS_ALL.
+  void RunTest() {
+    for (int sub_y = 0; sub_y < 16; ++sub_y) {
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+           ++filter) {
+        InterpFilter f = static_cast<InterpFilter>(filter);
+        TestConvolve(sub_y, f);
+      }
+    }
+  }
+
+  // Benchmarks each filter value from EIGHTTAP_REGULAR through
+  // INTERP_FILTERS_ALL with 10000 iterations.
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
+ private:
+  // Runs the reference and the function under test once and asserts that the
+  // two output buffers match.
+  void TestConvolve(const int sub_y, const InterpFilter filter) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width,
+                               height, filter_params_y, sub_y, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_y, sub_y, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  // Times num_iters calls of the reference and of the function under test
+  // (sub-pixel offset 0) and prints both elapsed times plus their ratio.
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    // The vertical filter is selected by block height, exactly as in
+    // TestConvolve(); selecting it by width would time a different filter
+    // than the one verified for non-square blocks.
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_highbd_convolve_y_sr_c(input, width, reference, kOutputStride, width,
+                                 height, filter_params_y, 0, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    highbd_convolve_y_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, 0, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveYHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1ConvolveYHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_neon));
+#endif
+
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-y IntraBC functions (high bit-depth)
+/////////////////////////////////////////////////////////////////
+
+// Fixture for the high bit-depth vertical IntraBC convolve functions. The
+// function under test is compared with, and timed against,
+// av1_highbd_convolve_y_sr_intrabc_c.
+class AV1ConvolveYHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_y_func> {
+ public:
+  // Runs the reference and the function under test on the same random input
+  // and asserts that both output buffers match.
+  void RunTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8.
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_y_sr_intrabc_c(input, width + 2, reference,
+                                       kOutputStride, width, height,
+                                       filter_params_y, kSubY, bit_depth);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_y, kSubY, bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  // Times kNumIters calls of the reference and of the function under test and
+  // prints both elapsed times plus their ratio (reference / test).
+  void SpeedTest() {
+    // IntraBC functions only operate for subpel_y_qn = 8, so benchmark the
+    // same offset that RunTest() verifies.
+    constexpr int kSubY = 8;
+    constexpr int kNumIters = 10000;
+    // Only used to label the printed result line.
+    const InterpFilter filter = static_cast<InterpFilter>(BILINEAR);
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    // Use the IntraBC filter, as RunTest() and the other IntraBC speed tests
+    // do, rather than a regular interpolation filter chosen by block size.
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_y_sr_intrabc_c(input, width, reference, kOutputStride,
+                                         width, height, filter_params_y, kSubY,
+                                         bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    highbd_convolve_y_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_y, kSubY, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1ConvolveYHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1ConvolveYHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, when HAVE_NEON is non-zero,
+// for the NEON specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveYHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_y_sr_intrabc_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (low bit-depth)
+//////////////////////////////////////////////////////////////
+// Pointer type for the low bit-depth convolve-copy functions exercised by the
+// fixture that follows.
+using convolve_copy_func = void (*)(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride, int w,
+                                    int h);
+
+// Fixture for the low bit-depth convolve-copy functions; the function under
+// test is compared with aom_convolve_copy_c.
+class AV1ConvolveCopyTest : public AV1ConvolveTest<convolve_copy_func> {
+ public:
+  // Copies the same random block with the reference and with the function
+  // under test, then asserts that both output buffers match.
+  void RunTest() {
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    aom_convolve_copy_c(src, w, expected, kOutputStride, w, h);
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    const convolve_copy_func copy_fn = GetParam().TestFunction();
+    copy_fn(src, w, actual, kOutputStride, w, h);
+
+    AssertOutputBufferEq(expected, actual, w, h);
+  }
+};
+
+// Note that even though these are AOM convolve functions, we are using the
+// newer AV1 test framework.
+// Only a correctness test is registered for this fixture.
+TEST_P(AV1ConvolveCopyTest, RunTest) { RunTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyTest,
+ BuildLowbdParams(aom_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyTest,
+ BuildLowbdParams(aom_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyTest,
+ BuildLowbdParams(aom_convolve_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyTest,
+ BuildLowbdParams(aom_convolve_copy_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////////////
+// Single reference convolve-copy functions (high bit-depth)
+///////////////////////////////////////////////////////////////
+// Pointer type for the high bit-depth convolve-copy functions exercised by the
+// fixture that follows.
+using highbd_convolve_copy_func = void (*)(const uint16_t *src,
+                                           ptrdiff_t src_stride, uint16_t *dst,
+                                           ptrdiff_t dst_stride, int w, int h);
+
+// Fixture for the high bit-depth convolve-copy functions; the function under
+// test is compared with aom_highbd_convolve_copy_c.
+class AV1ConvolveCopyHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_copy_func> {
+ public:
+  // Copies the same random block with the reference and with the function
+  // under test, then asserts that both output buffers match.
+  void RunTest() {
+    const int w = GetParam().Block().Width();
+    const int h = GetParam().Block().Height();
+    const uint16_t *src = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, expected[MAX_SB_SQUARE]);
+    aom_highbd_convolve_copy_c(src, w, expected, kOutputStride, w, h);
+
+    DECLARE_ALIGNED(32, uint16_t, actual[MAX_SB_SQUARE]);
+    const highbd_convolve_copy_func copy_fn = GetParam().TestFunction();
+    copy_fn(src, w, actual, kOutputStride, w, h);
+
+    AssertOutputBufferEq(expected, actual, w, h);
+  }
+};
+
+// Only a correctness test is registered for this fixture.
+TEST_P(AV1ConvolveCopyHighbdTest, RunTest) { RunTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveCopyHighbdTest,
+ BuildHighbdParams(aom_highbd_convolve_copy_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveCopyHighbdTest,
+ BuildHighbdParams(aom_highbd_convolve_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveCopyHighbdTest,
+ BuildHighbdParams(aom_highbd_convolve_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveCopyHighbdTest,
+ BuildHighbdParams(aom_highbd_convolve_copy_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////////////
+// Single reference convolve-2D functions (low bit-depth)
+/////////////////////////////////////////////////////////
+// Pointer type for the low bit-depth single-reference 2D convolve functions
+// exercised by the fixtures that follow.
+using convolve_2d_func = void (*)(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int subpel_y_qn,
+                                  ConvolveParams *conv_params);
+
+// Fixture for the low bit-depth 2D convolve functions. The function under
+// test is compared with, and timed against, av1_convolve_2d_sr_c.
+class AV1Convolve2DTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  // Checks all 16x16 sub-pixel offsets against every horizontal/vertical
+  // filter pair that is not excluded by SkipFilterPair().
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int horiz = EIGHTTAP_REGULAR; horiz <= INTERP_FILTERS_ALL;
+             ++horiz) {
+          for (int vert = EIGHTTAP_REGULAR; vert <= INTERP_FILTERS_ALL;
+               ++vert) {
+            if (!SkipFilterPair(horiz, vert)) {
+              TestConvolve(static_cast<InterpFilter>(horiz),
+                           static_cast<InterpFilter>(vert), sub_x, sub_y);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Benchmarks every filter pair that is not excluded by SkipFilterPair(),
+  // using 10000 iterations each.
+  void SpeedTest() {
+    for (int horiz = EIGHTTAP_REGULAR; horiz <= INTERP_FILTERS_ALL; ++horiz) {
+      for (int vert = EIGHTTAP_REGULAR; vert <= INTERP_FILTERS_ALL; ++vert) {
+        if (!SkipFilterPair(horiz, vert)) {
+          TestConvolveSpeed(static_cast<InterpFilter>(horiz),
+                            static_cast<InterpFilter>(vert), 10000);
+        }
+      }
+    }
+  }
+
+ private:
+  // Returns true when exactly one of the two filters is MULTITAP_SHARP2 and
+  // the other one has a smaller enum value; such pairs are not tested.
+  static bool SkipFilterPair(const int horiz, const int vert) {
+    const bool horiz_only_sharp2 =
+        (horiz == MULTITAP_SHARP2) && (vert < MULTITAP_SHARP2);
+    const bool vert_only_sharp2 =
+        (horiz < MULTITAP_SHARP2) && (vert == MULTITAP_SHARP2);
+    return horiz_only_sharp2 || vert_only_sharp2;
+  }
+
+  // Runs the reference and the function under test once and asserts that the
+  // two output buffers match.
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const InterpFilterParams *horiz_filter =
+        av1_get_interp_filter_params_with_block_size(h_f, w);
+    const InterpFilterParams *vert_filter =
+        av1_get_interp_filter_params_with_block_size(v_f, h);
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    ConvolveParams ref_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    av1_convolve_2d_sr_c(src, w, expected, kOutputStride, w, h, horiz_filter,
+                         vert_filter, sub_x, sub_y, &ref_params);
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    ConvolveParams test_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    GetParam().TestFunction()(src, w, actual, kOutputStride, w, h,
+                              horiz_filter, vert_filter, sub_x, sub_y,
+                              &test_params);
+
+    AssertOutputBufferEq(expected, actual, w, h);
+  }
+
+  // Times num_iters calls of the reference and of the function under test
+  // (sub-pixel offsets 0, 0) and prints both elapsed times plus their ratio.
+  void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+                         int num_iters) {
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const InterpFilterParams *horiz_filter =
+        av1_get_interp_filter_params_with_block_size(h_f, w);
+    const InterpFilterParams *vert_filter =
+        av1_get_interp_filter_params_with_block_size(v_f, h);
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    ConvolveParams ref_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer stopwatch;
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < num_iters; ++iter) {
+      av1_convolve_2d_sr_c(src, w, expected, kOutputStride, w, h, horiz_filter,
+                           vert_filter, 0, 0, &ref_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    ConvolveParams test_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < num_iters; ++iter) {
+      GetParam().TestFunction()(src, w, actual, kOutputStride, w, h,
+                                horiz_filter, vert_filter, 0, 0, &test_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double test_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, w, h,
+           ref_time, test_time, ref_time / test_time);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, where the matching HAVE_*
+// macro is non-zero, for each SIMD specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_neon));
+#endif
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, AV1Convolve2DTest,
+ BuildLowbdParams(av1_convolve_2d_sr_neon_i8mm));
+#endif
+
+/////////////////////////////////////////////////////////////////
+// Single reference convolve-2D IntraBC functions (low bit-depth)
+/////////////////////////////////////////////////////////////////
+
+// Fixture for the low bit-depth 2D IntraBC convolve functions. The function
+// under test is compared with, and timed against,
+// av1_convolve_2d_sr_intrabc_c.
+class AV1Convolve2DIntraBCTest : public AV1ConvolveTest<convolve_2d_func> {
+ public:
+  // Runs the reference and the function under test on the same random input
+  // and asserts that both output buffers match.
+  void RunTest() {
+    // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+    constexpr int kSubX = 8;
+    constexpr int kSubY = 8;
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    const int src_stride = w + 2;
+    const InterpFilterParams *horiz_filter = &av1_intrabc_filter_params;
+    const InterpFilterParams *vert_filter = &av1_intrabc_filter_params;
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    ConvolveParams ref_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    av1_convolve_2d_sr_intrabc_c(src, src_stride, expected, kOutputStride, w,
+                                 h, horiz_filter, vert_filter, kSubX, kSubY,
+                                 &ref_params);
+
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    ConvolveParams test_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    GetParam().TestFunction()(src, src_stride, actual, kOutputStride, w, h,
+                              horiz_filter, vert_filter, kSubX, kSubY,
+                              &test_params);
+
+    AssertOutputBufferEq(expected, actual, w, h);
+  }
+
+  // Times kNumIters calls of the reference and of the function under test and
+  // prints both elapsed times plus their ratio (reference / test).
+  void SpeedTest() {
+    constexpr int kNumIters = 10000;
+    // Sub-pixel offsets passed to both functions.
+    constexpr int kSubX = 8;
+    constexpr int kSubY = 8;
+    // Only used to label the printed result line.
+    const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);
+    const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);
+    const BlockSize &block = GetParam().Block();
+    const int w = block.Width();
+    const int h = block.Height();
+    const InterpFilterParams *horiz_filter = &av1_intrabc_filter_params;
+    const InterpFilterParams *vert_filter = &av1_intrabc_filter_params;
+    const uint8_t *src = FirstRandomInput8(GetParam());
+
+    DECLARE_ALIGNED(32, uint8_t, expected[MAX_SB_SQUARE]);
+    ConvolveParams ref_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer stopwatch;
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < kNumIters; ++iter) {
+      av1_convolve_2d_sr_intrabc_c(src, w, expected, kOutputStride, w, h,
+                                   horiz_filter, vert_filter, kSubX, kSubY,
+                                   &ref_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    convolve_2d_func test_func = GetParam().TestFunction();
+    DECLARE_ALIGNED(32, uint8_t, actual[MAX_SB_SQUARE]);
+    ConvolveParams test_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, 8);
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < kNumIters; ++iter) {
+      test_func(src, w, actual, kOutputStride, w, h, horiz_filter, vert_filter,
+                kSubX, kSubY, &test_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double test_time =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, w, h,
+           ref_time, test_time, ref_time / test_time);
+  }
+};
+
+// Register the parameterized correctness test and the speed test. The speed
+// test carries GoogleTest's DISABLED_ prefix, so it is skipped unless disabled
+// tests are explicitly requested.
+TEST_P(AV1Convolve2DIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+// Instantiate the suite for the C function and, when HAVE_NEON is non-zero,
+// for the NEON specialization.
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DIntraBCTest,
+ BuildLowbdParams(av1_convolve_2d_sr_intrabc_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DIntraBCTest,
+ BuildLowbdParams(av1_convolve_2d_sr_intrabc_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////
+// Single reference convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////////////
+
+// Pointer type for the high bit-depth single-reference 2D convolve functions
+// exercised by the fixtures that follow.
+using highbd_convolve_2d_func = void (*)(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+// Fixture for the high bit-depth 2D convolve functions. The function under
+// test is compared with, and timed against, av1_highbd_convolve_2d_sr_c.
+class AV1Convolve2DHighbdTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  // Checks all 16x16 sub-pixel offsets against every horizontal/vertical
+  // filter pair, skipping pairs in which exactly one filter is
+  // MULTITAP_SHARP2 and the other has a smaller enum value.
+  void RunTest() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+          for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+            if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+                ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+              continue;
+            TestConvolve(static_cast<InterpFilter>(h_f),
+                         static_cast<InterpFilter>(v_f), sub_x, sub_y);
+          }
+        }
+      }
+    }
+  }
+
+  // Benchmarks the same set of filter pairs as RunTest(), 10000 iterations
+  // each.
+  void SpeedTest() {
+    for (int h_f = EIGHTTAP_REGULAR; h_f <= INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f <= INTERP_FILTERS_ALL; ++v_f) {
+        if (((h_f == MULTITAP_SHARP2) && (v_f < MULTITAP_SHARP2)) ||
+            ((h_f < MULTITAP_SHARP2) && (v_f == MULTITAP_SHARP2)))
+          continue;
+        TestConvolveSpeed(static_cast<InterpFilter>(h_f),
+                          static_cast<InterpFilter>(v_f), 10000);
+      }
+    }
+  }
+
+ private:
+  // Runs the reference and the function under test once and asserts that the
+  // two output buffers match.
+  void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+                    const int sub_x, const int sub_y) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+                                height, filter_params_x, filter_params_y, sub_x,
+                                sub_y, &conv_params1, bit_depth);
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
+                              filter_params_x, filter_params_y, sub_x, sub_y,
+                              &conv_params2, bit_depth);
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  // Times num_iters calls of the reference and of the function under test
+  // (sub-pixel offsets 0, 0) and prints both elapsed times plus their ratio.
+  void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+                         int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint16_t *input = FirstRandomInput16(GetParam());
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    // Build the convolve parameters for the bit depth being timed, matching
+    // TestConvolve(), instead of always assuming 8-bit.
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_highbd_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+                                  height, filter_params_x, filter_params_y, 0,
+                                  0, &conv_params1, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_x, filter_params_y, 0, 0,
+                                &conv_params2, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DHighbdTest, DISABLED_SpeedTest) { SpeedTest(); }  // DISABLED_ prefix: GoogleTest skips the benchmark unless disabled tests are requested
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_c));  // the C reference is itself run as a test subject
+
+#if HAVE_SSSE3  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DHighbdTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_neon));
+#endif
+
+//////////////////////////////////////////////////////////////////
+// Single reference convolve-2d IntraBC functions (high bit-depth)
+//////////////////////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdIntraBCTest
+    : public AV1ConvolveTest<highbd_convolve_2d_func> {
+ public:
+  void RunTest() {  // Compares the function under test with the C reference.
+    // IntraBC functions only operate for subpel_x_qn = 8 and subpel_y_qn = 8.
+    constexpr int kSubX = 8;
+    constexpr int kSubY = 8;
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    // Use a stride different from width to avoid potential storing errors that
+    // would go undetected. The input buffer is filled using a padding of 12, so
+    // the stride can be anywhere between width and width + 12.
+    av1_highbd_convolve_2d_sr_intrabc_c(input, width + 2, reference,
+                                        kOutputStride, width, height,
+                                        filter_params_x, filter_params_y, kSubX,
+                                        kSubY, &conv_params1, bit_depth);
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 =
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    GetParam().TestFunction()(input, width + 2, test, kOutputStride, width,
+                              height, filter_params_x, filter_params_y, kSubX,
+                              kSubY, &conv_params2, bit_depth);
+
+    AssertOutputBufferEq(reference, test, width, height);
+  }
+
+  void SpeedTest() {  // Times kNumIters calls of the C reference, then of the function under test.
+    constexpr int kNumIters = 10000;
+    const InterpFilter h_f = static_cast<InterpFilter>(BILINEAR);  // only labels the printed result
+    const InterpFilter v_f = static_cast<InterpFilter>(BILINEAR);  // only labels the printed result
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const int bit_depth = GetParam().BitDepth();
+    constexpr int kSubX = 8, kSubY = 8;  // time the configuration RunTest() verifies: IntraBC filter at subpel 8
+    const InterpFilterParams *filter_params_x = &av1_intrabc_filter_params;
+    const InterpFilterParams *filter_params_y = &av1_intrabc_filter_params;
+    const uint16_t *input = FirstRandomInput16(GetParam());
+
+    DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 =  // built for the bit depth being timed, as in RunTest()
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      av1_highbd_convolve_2d_sr_intrabc_c(
+          input, width, reference, kOutputStride, width, height,
+          filter_params_x, filter_params_y, kSubX, kSubY, &conv_params1,
+          bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+    highbd_convolve_2d_func test_func = GetParam().TestFunction();
+    ConvolveParams conv_params2 =  // same set-up as the reference run
+        get_conv_params_no_round(0, 0, nullptr, 0, 0, bit_depth);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < kNumIters; ++i) {
+      test_func(input, width, test, kOutputStride, width, height,
+                filter_params_x, filter_params_y, kSubX, kSubY, &conv_params2,
+                bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);  // NOTE(review): aom_usec_timer suggests microseconds although the label says "ns" - confirm
+  }
+};
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, RunTest) { RunTest(); }
+
+TEST_P(AV1Convolve2DHighbdIntraBCTest, DISABLED_SpeedTest) { SpeedTest(); }  // benchmark; GoogleTest skips DISABLED_ tests by default
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1Convolve2DHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_c));  // reference checked against itself
+
+#if HAVE_NEON  // NEON is the only SIMD variant registered for this fixture
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1Convolve2DHighbdIntraBCTest,
+ BuildHighbdParams(av1_highbd_convolve_2d_sr_intrabc_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////
+// Compound Convolve Tests
+//////////////////////////
+
+// The compound functions do not work for chroma block sizes. Provide
+// a function to generate test parameters for just luma block sizes.
+template <typename T>
+std::vector<TestParam<T>> GetLumaTestParams(
+    std::initializer_list<int> bit_depths, T test_func) {
+  // Collect the dimensions of every block type from BLOCK_4X4 upwards.
+  std::set<BlockSize> blocks;
+  for (int bs = BLOCK_4X4; bs < BLOCK_SIZES_ALL; ++bs) {
+    blocks.insert(BlockSize(block_size_wide[bs], block_size_high[bs]));
+  }
+  // Emit one parameter per (bit depth, block size) pair, bit depth outermost.
+  std::vector<TestParam<T>> params;
+  for (const int depth : bit_depths) {
+    for (const BlockSize &blk : blocks) {
+      params.emplace_back(blk, depth, test_func);
+    }
+  }
+  return params;
+}
+
+template <typename T>
+std::vector<TestParam<T>> GetLowbdLumaTestParams(T test_func) {
+ return GetLumaTestParams({ 8 }, test_func);  // low bit-depth: the only bit depth generated is 8
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildLowbdLumaParams(
+ T test_func) {
+ return ::testing::ValuesIn(GetLowbdLumaTestParams(test_func));  // wraps the vector as a generator for INSTANTIATE_TEST_SUITE_P
+}
+
+TEST_F(AV1ConvolveParametersTest, GetLowbdLumaTestParams) {
+ auto v = GetLowbdLumaTestParams(av1_dist_wtd_convolve_x_c);
+ ASSERT_EQ(22U, v.size());  // 22 distinct block sizes at the single 8-bit depth
+ for (const auto &e : v) {
+ ASSERT_EQ(8, e.BitDepth());
+ bool same_fn = av1_dist_wtd_convolve_x_c == e.TestFunction();  // every parameter must carry the function that was passed in
+ ASSERT_TRUE(same_fn);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+template <typename T>
+std::vector<TestParam<T>> GetHighbdLumaTestParams(T test_func) {
+ return GetLumaTestParams({ 10, 12 }, test_func);  // high bit-depth: every block size at 10 and at 12 bits
+}
+
+TEST_F(AV1ConvolveParametersTest, GetHighbdLumaTestParams) {
+ auto v = GetHighbdLumaTestParams(av1_highbd_dist_wtd_convolve_x_c);
+ ASSERT_EQ(44U, v.size());  // twice the 22 entries of the low bit-depth list: one set per bit depth
+ int num_10 = 0;
+ int num_12 = 0;
+ for (const auto &e : v) {
+ ASSERT_TRUE(10 == e.BitDepth() || 12 == e.BitDepth());
+ bool same_fn = av1_highbd_dist_wtd_convolve_x_c == e.TestFunction();
+ ASSERT_TRUE(same_fn);
+ if (e.BitDepth() == 10) {
+ ++num_10;
+ } else {
+ ++num_12;  // the assertion above leaves 12 as the only other possibility
+ }
+ }
+ ASSERT_EQ(num_10, num_12);  // both bit depths must be represented equally often
+}
+
+template <typename T>
+::testing::internal::ParamGenerator<TestParam<T>> BuildHighbdLumaParams(
+ T test_func) {
+ return ::testing::ValuesIn(GetHighbdLumaTestParams(test_func));  // wraps the vector as a generator for INSTANTIATE_TEST_SUITE_P
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Compound cases also need to test different frame offsets and weightings.
+class CompoundParam {
+ public:
+  // Immutable bundle of one averaging mode and its forward/backward offsets.
+  CompoundParam(bool use_dist_wtd_comp_avg, int fwd_offset, int bck_offset)
+      : dist_wtd_avg_{ use_dist_wtd_comp_avg }, fwd_{ fwd_offset },
+        bck_{ bck_offset } {}
+  bool UseDistWtdCompAvg() const { return dist_wtd_avg_; }
+  int FwdOffset() const { return fwd_; }
+  int BckOffset() const { return bck_; }
+
+ private:
+  bool dist_wtd_avg_;  // value handed back by UseDistWtdCompAvg()
+  int fwd_;            // value handed back by FwdOffset()
+  int bck_;            // value handed back by BckOffset()
+};
+
+std::vector<CompoundParam> GetCompoundParams() {
+  // Non-weighted entry first, then each of the 4 table rows in both column orders.
+  std::vector<CompoundParam> params{ CompoundParam(false, 0, 0) };
+  for (int fwd_col = 0; fwd_col < 2; ++fwd_col) {
+    for (int row = 0; row < 4; ++row) {
+      const auto &weights = quant_dist_lookup_table[row];
+      params.emplace_back(true, weights[fwd_col], weights[1 - fwd_col]);
+    }
+  }
+  return params;
+}
+
+TEST_F(AV1ConvolveParametersTest, GetCompoundParams) {
+ auto v = GetCompoundParams();
+ ASSERT_EQ(9U, v.size());  // 1 non-weighted entry + 2 column orders x 4 table rows
+ ASSERT_FALSE(v[0].UseDistWtdCompAvg());  // the first entry is the non-weighted one
+ for (size_t i = 1; i < v.size(); ++i) {
+ ASSERT_TRUE(v[i].UseDistWtdCompAvg());
+ }
+}
+
+////////////////////////////////////////////////
+// Compound convolve-x functions (low bit-depth)
+////////////////////////////////////////////////
+
+ConvolveParams GetConvolveParams(int do_average, CONV_BUF_TYPE *conv_buf,
+                                 int width, int bit_depth,
+                                 const CompoundParam &compound) {
+  ConvolveParams params =  // library defaults first, compound settings layered on top
+      get_conv_params_no_round(do_average, 0, conv_buf, width, 1, bit_depth);
+  params.bck_offset = compound.BckOffset();
+  params.fwd_offset = compound.FwdOffset();
+  params.use_dist_wtd_comp_avg = compound.UseDistWtdCompAvg();
+  return params;
+}
+
+class AV1ConvolveXCompoundTest : public AV1ConvolveTest<convolve_x_func> {  // Checks a compound convolve function against the C reference returned by ReferenceFunc().
+ public:
+ void RunTest() {  // Sweeps 16 subpel positions x every interpolation filter x every compound setting.
+ auto compound_params = GetCompoundParams();
+ for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+ for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+ for (const auto &c : compound_params) {
+ TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+ }
+ }
+ }
+ }
+
+ protected:  // virtual hooks so a derived fixture can swap the filter lookup and the reference function
+ virtual const InterpFilterParams *FilterParams(InterpFilter f,
+ const BlockSize &block) const {
+ return av1_get_interp_filter_params_with_block_size(f, block.Width());  // horizontal pass: keyed on the block width
+ }
+
+ virtual convolve_x_func ReferenceFunc() const {
+ return av1_dist_wtd_convolve_x_c;
+ }
+
+ private:
+ void TestConvolve(const int sub_pix, const InterpFilter filter,
+ const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const uint8_t *input1 = FirstRandomInput8(GetParam());
+ const uint8_t *input2 = SecondRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+ compound, sub_pix, filter);
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound, sub_pix, filter);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ private:
+ void Convolve(convolve_x_func test_func, const uint8_t *src1,
+ const uint8_t *src2, uint8_t *dst, CONV_BUF_TYPE *conv_buf,
+ const CompoundParam &compound, const int sub_pix,
+ const InterpFilter filter) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params =
+ FilterParams(filter, GetParam().Block());
+
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);  // first pass, do_average = 0; bit depth fixed at 8
+ test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+ sub_pix, &conv_params);
+
+ conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);  // second pass, do_average = 1, reusing conv_buf
+ test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+ sub_pix, &conv_params);
+ }
+};
+
+TEST_P(AV1ConvolveXCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_c));  // luma block sizes only, 8-bit; the reference is checked against itself
+
+#if HAVE_SSE2  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon));
+#endif
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1ConvolveXCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_x_neon_i8mm));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-x functions (high bit-depth)
+/////////////////////////////////////////////////
+class AV1ConvolveXHighbdCompoundTest
+ : public AV1ConvolveTest<highbd_convolve_x_func> {  // High bit-depth compound fixture: 16-bit pixels, bit depth taken from the test parameter.
+ public:
+ void RunTest() {  // Sweeps 16 subpel positions x every interpolation filter x every compound setting.
+ auto compound_params = GetCompoundParams();
+ for (int sub_pix = 0; sub_pix < 16; ++sub_pix) {
+ for (int f = EIGHTTAP_REGULAR; f < INTERP_FILTERS_ALL; ++f) {
+ for (const auto &c : compound_params) {
+ TestConvolve(sub_pix, static_cast<InterpFilter>(f), c);
+ }
+ }
+ }
+ }
+
+ protected:  // virtual hooks so a derived fixture can swap the filter lookup and the reference function
+ virtual const InterpFilterParams *FilterParams(InterpFilter f,
+ const BlockSize &block) const {
+ return av1_get_interp_filter_params_with_block_size(f, block.Width());  // horizontal pass: keyed on the block width
+ }
+
+ virtual highbd_convolve_x_func ReferenceFunc() const {
+ return av1_highbd_dist_wtd_convolve_x_c;
+ }
+
+ private:
+ void TestConvolve(const int sub_pix, const InterpFilter filter,
+ const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+
+ const uint16_t *input1 = FirstRandomInput16(GetParam());
+ const uint16_t *input2 = SecondRandomInput16(GetParam());
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(ReferenceFunc(), input1, input2, reference, reference_conv_buf,
+ compound, sub_pix, filter);
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound, sub_pix, filter);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void Convolve(highbd_convolve_x_func test_func, const uint16_t *src1,
+ const uint16_t *src2, uint16_t *dst, CONV_BUF_TYPE *conv_buf,
+ const CompoundParam &compound, const int sub_pix,
+ const InterpFilter filter) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const int bit_depth = GetParam().BitDepth();
+ const InterpFilterParams *filter_params =
+ FilterParams(filter, GetParam().Block());
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);  // first pass, do_average = 0
+ test_func(src1, width, dst, kOutputStride, width, height, filter_params,
+ sub_pix, &conv_params, bit_depth);
+ conv_params =
+ GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);  // second pass, do_average = 1, reusing conv_buf
+ test_func(src2, width, dst, kOutputStride, width, height, filter_params,
+ sub_pix, &conv_params, bit_depth);
+ }
+};
+
+TEST_P(AV1ConvolveXHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1ConvolveXHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_c));  // luma block sizes at 10 and 12 bits; the reference is checked against itself
+
+#if HAVE_SSE4_1  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1ConvolveXHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1ConvolveXHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveXHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////
+// Compound convolve-y functions (low bit-depth)
+////////////////////////////////////////////////
+
+// Note that the X and Y convolve functions have the same type signature and
+// logic; they only differentiate the filter parameters and reference function.
+class AV1ConvolveYCompoundTest : public AV1ConvolveXCompoundTest {  // Reuses the X fixture's RunTest(); only the two hooks below differ.
+ protected:
+ const InterpFilterParams *FilterParams(
+ InterpFilter f, const BlockSize &block) const override {
+ return av1_get_interp_filter_params_with_block_size(f, block.Height());  // vertical pass: keyed on the block height instead of the width
+ }
+
+ convolve_x_func ReferenceFunc() const override {
+ return av1_dist_wtd_convolve_y_c;  // the Y reference shares the convolve_x_func signature
+ }
+};
+
+TEST_P(AV1ConvolveYCompoundTest, RunTest) { RunTest(); }  // RunTest() is inherited from AV1ConvolveXCompoundTest
+
+INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_y_c));  // luma block sizes only, 8-bit
+
+#if HAVE_SSE2  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1ConvolveYCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_y_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1ConvolveYCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_y_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1ConvolveYCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_y_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+/////////////////////////////////////////////////
+// Compound convolve-y functions (high bit-depth)
+/////////////////////////////////////////////////
+
+// Again, the X and Y convolve functions have the same type signature and logic.
+class AV1ConvolveYHighbdCompoundTest : public AV1ConvolveXHighbdCompoundTest {  // no access specifier: both overrides are private here, yet the base class still reaches them through virtual dispatch
+ highbd_convolve_x_func ReferenceFunc() const override {
+ return av1_highbd_dist_wtd_convolve_y_c;  // the Y reference shares the highbd_convolve_x_func signature
+ }
+ const InterpFilterParams *FilterParams(
+ InterpFilter f, const BlockSize &block) const override {
+ return av1_get_interp_filter_params_with_block_size(f, block.Height());  // vertical pass: keyed on the block height instead of the width
+ }
+};
+
+TEST_P(AV1ConvolveYHighbdCompoundTest, RunTest) { RunTest(); }  // RunTest() is inherited from AV1ConvolveXHighbdCompoundTest
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1ConvolveYHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_c));  // luma block sizes at 10 and 12 bits
+
+#if HAVE_SSE4_1  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1ConvolveYHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1ConvolveYHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1ConvolveYHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_y_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (low bit-depth)
+//////////////////////////////////////////////////////
+typedef void (*compound_conv_2d_copy_func)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w,
+ int h, ConvolveParams *conv_params);  // copy variant: no filter or subpel arguments
+
+class AV1Convolve2DCopyCompoundTest
+ : public AV1ConvolveTest<compound_conv_2d_copy_func> {  // Checks a compound 2D-copy function against av1_dist_wtd_convolve_2d_copy_c.
+ public:
+ void RunTest() {  // One comparison per compound setting.
+ auto compound_params = GetCompoundParams();
+ for (const auto &compound : compound_params) {
+ TestConvolve(compound);
+ }
+ }
+ void SpeedTest() {  // One timing run of 100000 iterations per compound setting.
+ for (const auto &compound : GetCompoundParams()) {
+ TestConvolveSpeed(compound, 100000);
+ }
+ }
+
+ private:
+ void TestConvolve(const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+
+ const uint8_t *input1 = FirstRandomInput8(GetParam());
+ const uint8_t *input2 = SecondRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(av1_dist_wtd_convolve_2d_copy_c, input1, input2, reference,
+ reference_conv_buf, compound);
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void TestConvolveSpeed(const CompoundParam &compound, const int num_iters) {  // Times num_iters two-pass calls of the C reference, then of the function under test.
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+
+ const uint8_t *src0 = FirstRandomInput8(GetParam());
+ const uint8_t *src1 = SecondRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, dst[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, conv_buf[MAX_SB_SQUARE]);  // dst and conv_buf are shared by the reference and the test timing loops
+
+ const auto test_func = GetParam().TestFunction();
+
+ ConvolveParams conv_params_0 =
+ GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);  // first pass, do_average = 0
+ ConvolveParams conv_params_1 =
+ GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);  // second pass, do_average = 1
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ av1_dist_wtd_convolve_2d_copy_c(src0, width, dst, kOutputStride, width,
+ height, &conv_params_0);
+ av1_dist_wtd_convolve_2d_copy_c(src1, width, dst, kOutputStride, width,
+ height, &conv_params_1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ test_func(src0, width, dst, kOutputStride, width, height, &conv_params_0);
+ test_func(src1, width, dst, kOutputStride, width, height, &conv_params_1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ printf("Dist Weighted: %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n",
+ compound.UseDistWtdCompAvg(), width, height, time1, time2,
+ time1 / time2);  // NOTE(review): aom_usec_timer suggests microseconds although the label says "ns" - confirm
+ }
+
+ void Convolve(compound_conv_2d_copy_func test_func, const uint8_t *src1,
+ const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+ const CompoundParam &compound) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+ test_func(src1, width, dst, kOutputStride, width, height, &conv_params);
+
+ conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+ test_func(src2, width, dst, kOutputStride, width, height, &conv_params);
+ }
+};
+
+TEST_P(AV1Convolve2DCopyCompoundTest, RunTest) { RunTest(); }
+TEST_P(AV1Convolve2DCopyCompoundTest, DISABLED_SpeedTest) { SpeedTest(); }  // benchmark; GoogleTest skips DISABLED_ tests by default
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCopyCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_c));  // luma block sizes only, 8-bit
+
+#if HAVE_SSE2  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1Convolve2DCopyCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_sse2));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1Convolve2DCopyCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1Convolve2DCopyCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_copy_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+///////////////////////////////////////////////////////
+// Compound convolve-2d-copy functions (high bit-depth)
+///////////////////////////////////////////////////////
+typedef void (*highbd_compound_conv_2d_copy_func)(const uint16_t *src,
+ int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h,
+ ConvolveParams *conv_params,
+ int bd);  // same shape as the low bit-depth copy typedef, with 16-bit pixels and a trailing bit depth
+
+class AV1Convolve2DCopyHighbdCompoundTest
+ : public AV1ConvolveTest<highbd_compound_conv_2d_copy_func> {  // Checks a high bit-depth compound 2D-copy function against av1_highbd_dist_wtd_convolve_2d_copy_c.
+ public:
+ void RunTest() {  // One comparison per compound setting.
+ auto compound_params = GetCompoundParams();
+ for (const auto &compound : compound_params) {
+ TestConvolve(compound);
+ }
+ }
+
+ private:
+ void TestConvolve(const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+
+ const uint16_t *input1 = FirstRandomInput16(GetParam());
+ const uint16_t *input2 = SecondRandomInput16(GetParam());
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(av1_highbd_dist_wtd_convolve_2d_copy_c, input1, input2, reference,
+ reference_conv_buf, compound);
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ void Convolve(highbd_compound_conv_2d_copy_func test_func,
+ const uint16_t *src1, const uint16_t *src2, uint16_t *dst,
+ uint16_t *conv_buf, const CompoundParam &compound) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+ const int bit_depth = GetParam().BitDepth();
+
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+ test_func(src1, width, dst, kOutputStride, width, height, &conv_params,
+ bit_depth);
+
+ conv_params =
+ GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+ test_func(src2, width, dst, kOutputStride, width, height, &conv_params,
+ bit_depth);
+ }
+};
+
+TEST_P(AV1Convolve2DCopyHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1Convolve2DCopyHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_c));  // luma block sizes at 10 and 12 bits
+
+#if HAVE_SSE4_1  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1Convolve2DCopyHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1Convolve2DCopyHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1Convolve2DCopyHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_copy_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/////////////////////////////////////////////////
+// Compound convolve-2d functions (low bit-depth)
+/////////////////////////////////////////////////
+
+class AV1Convolve2DCompoundTest : public AV1ConvolveTest<convolve_2d_func> {  // Checks a compound 2D convolve function against av1_dist_wtd_convolve_2d_c.
+ public:
+ void RunTest() {  // Sweeps every horizontal/vertical filter pair x 16x16 subpel positions x every compound setting.
+ auto compound_params = GetCompoundParams();
+ for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+ for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+ for (int sub_x = 0; sub_x < 16; ++sub_x) {
+ for (int sub_y = 0; sub_y < 16; ++sub_y) {
+ for (const auto &compound : compound_params) {
+ TestConvolve(static_cast<InterpFilter>(h_f),
+ static_cast<InterpFilter>(v_f), sub_x, sub_y,
+ compound);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+ const int sub_x, const int sub_y,
+ const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+
+ const uint8_t *input1 = FirstRandomInput8(GetParam());
+ const uint8_t *input2 = SecondRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(av1_dist_wtd_convolve_2d_c, input1, input2, reference,
+ reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound, h_f, v_f, sub_x, sub_y);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ private:
+ void Convolve(convolve_2d_func test_func, const uint8_t *src1,
+ const uint8_t *src2, uint8_t *dst, uint16_t *conv_buf,
+ const CompoundParam &compound, const InterpFilter h_f,
+ const InterpFilter v_f, const int sub_x, const int sub_y) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(h_f, width);  // horizontal filter is looked up with the block width
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(v_f, height);  // vertical filter is looked up with the block height
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, 8, compound);
+
+ test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+ filter_params_y, sub_x, sub_y, &conv_params);
+
+ conv_params = GetConvolveParams(1, conv_buf, kOutputStride, 8, compound);
+ test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+ filter_params_y, sub_x, sub_y, &conv_params);
+ }
+};
+
+TEST_P(AV1Convolve2DCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c));  // luma block sizes only, 8-bit
+
+#if HAVE_SSE2  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon));
+#endif
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_dotprod));
+#endif
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1Convolve2DCompoundTest,
+ BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_neon_i8mm));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////
+// Compound convolve-2d functions (high bit-depth)
+//////////////////////////////////////////////////
+
+class AV1Convolve2DHighbdCompoundTest
+ : public AV1ConvolveTest<highbd_convolve_2d_func> {  // Checks a high bit-depth compound 2D convolve function against av1_highbd_dist_wtd_convolve_2d_c.
+ public:
+ void RunTest() {  // Sweeps every horizontal/vertical filter pair x 16x16 subpel positions x every compound setting.
+ auto compound_params = GetCompoundParams();
+ for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+ for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+ for (int sub_x = 0; sub_x < 16; ++sub_x) {
+ for (int sub_y = 0; sub_y < 16; ++sub_y) {
+ for (const auto &compound : compound_params) {
+ TestConvolve(static_cast<InterpFilter>(h_f),
+ static_cast<InterpFilter>(v_f), sub_x, sub_y,
+ compound);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private:
+ void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
+ const int sub_x, const int sub_y,
+ const CompoundParam &compound) {  // Runs reference and test function on the same two inputs and compares both output buffers.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+ const uint16_t *input1 = FirstRandomInput16(GetParam());
+ const uint16_t *input2 = SecondRandomInput16(GetParam());
+ DECLARE_ALIGNED(32, uint16_t, reference[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, reference_conv_buf[MAX_SB_SQUARE]);
+ Convolve(av1_highbd_dist_wtd_convolve_2d_c, input1, input2, reference,
+ reference_conv_buf, compound, h_f, v_f, sub_x, sub_y);
+
+ DECLARE_ALIGNED(32, uint16_t, test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, test_conv_buf[MAX_SB_SQUARE]);
+ Convolve(GetParam().TestFunction(), input1, input2, test, test_conv_buf,
+ compound, h_f, v_f, sub_x, sub_y);
+
+ AssertOutputBufferEq(reference_conv_buf, test_conv_buf, width, height);  // the intermediate compound buffer is compared as well as the pixels
+ AssertOutputBufferEq(reference, test, width, height);
+ }
+
+ private:
+ void Convolve(highbd_convolve_2d_func test_func, const uint16_t *src1,
+ const uint16_t *src2, uint16_t *dst, uint16_t *conv_buf,
+ const CompoundParam &compound, const InterpFilter h_f,
+ const InterpFilter v_f, const int sub_x, const int sub_y) {  // Two passes over the same dst/conv_buf: do_average = 0 on src1, then do_average = 1 on src2.
+ const BlockSize &block = GetParam().Block();
+ const int width = block.Width();
+ const int height = block.Height();
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(h_f, width);  // horizontal filter is looked up with the block width
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(v_f, height);  // vertical filter is looked up with the block height
+ const int bit_depth = GetParam().BitDepth();
+ ConvolveParams conv_params =
+ GetConvolveParams(0, conv_buf, kOutputStride, bit_depth, compound);
+ test_func(src1, width, dst, kOutputStride, width, height, filter_params_x,
+ filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+
+ conv_params =
+ GetConvolveParams(1, conv_buf, kOutputStride, bit_depth, compound);
+ test_func(src2, width, dst, kOutputStride, width, height, filter_params_x,
+ filter_params_y, sub_x, sub_y, &conv_params, bit_depth);
+ }
+};
+
+TEST_P(AV1Convolve2DHighbdCompoundTest, RunTest) { RunTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1Convolve2DHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_c));  // luma block sizes at 10 and 12 bits
+
+#if HAVE_SSE4_1  // each SIMD variant below is registered only when the build enables that extension
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1Convolve2DHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1Convolve2DHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1Convolve2DHighbdCompoundTest,
+ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_2d_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
new file mode 100644
index 0000000000..402e70c34a
--- /dev/null
+++ b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+#include "aom/aom_decoder.h"
+#include "av1/decoder/decoder.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+// Describes the input clip used by the parameter-passing test.
+struct ParamPassingTestVideo {
+ // File name handed to the Y4M video source.
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ // Copied into cfg_.rc_target_bitrate by the fixture's SetUp().
+ uint32_t bitrate;
+ // Passed as the last argument of Y4mVideoSource (presumably the frame
+ // limit -- confirm against the video source class).
+ int frames;
+};
+
+// The single clip the test runs on.
+const ParamPassingTestVideo kAV1ParamPassingTestVector = {
+ "niklas_1280_720_30.y4m", 1280, 720, 600, 3
+};
+
+// One combination of encoder settings that the test applies through encoder
+// controls and then expects to observe again on the decoded image.
+struct EncodeParameters {
+ // Non-zero enables the PSNR == kMaxPsnr check in PSNRPktHook().
+ int32_t lossless;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_color_range_t color_range;
+ aom_chroma_sample_position_t chroma_sample_position;
+ // Only sent to the encoder, and only checked on the decoder side, when both
+ // entries are greater than zero.
+ int32_t render_size[2];
+};
+
+// Parameter sets the test suite is instantiated with. Field order follows
+// EncodeParameters: lossless, primaries, transfer, matrix, range, chroma
+// sample position, render size.
+const EncodeParameters kAV1EncodeParameterSet[] = {
+ { 1,
+ AOM_CICP_CP_BT_709,
+ AOM_CICP_TC_BT_709,
+ AOM_CICP_MC_BT_709,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_UNKNOWN,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_470_M,
+ AOM_CICP_TC_BT_470_M,
+ AOM_CICP_MC_BT_470_B_G,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_VERTICAL,
+ { 0, 0 } },
+ { 1,
+ AOM_CICP_CP_BT_601,
+ AOM_CICP_TC_BT_601,
+ AOM_CICP_MC_BT_601,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_COLOCATED,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_2020,
+ AOM_CICP_TC_BT_2020_10_BIT,
+ AOM_CICP_MC_BT_2020_NCL,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_RESERVED,
+ { 640, 480 } },
+};
+
+// Parameterised encoder test. On the first frame it pushes the colour
+// description, lossless and render-size settings from the test parameter into
+// the encoder; the decoder-side hooks then check that each decoded image
+// reports the same values.
+class AVxEncoderParmsGetToDecoder
+    : public ::libaom_test::EncoderTest,
+      public ::libaom_test::CodecTestWithParam<EncodeParameters> {
+ protected:
+  AVxEncoderParmsGetToDecoder()
+      : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
+
+  ~AVxEncoderParmsGetToDecoder() override = default;
+
+  // Two-pass good-quality configuration at the bitrate of the test clip.
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+    test_video_ = kAV1ParamPassingTestVector;
+    cfg_.rc_target_bitrate = test_video_.bitrate;
+    cfg_.g_lag_in_frames = 25;
+  }
+
+  // Applies every control exactly once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+
+    encoder->Control(AOME_SET_CPUUSED, 3);
+    encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
+    encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS,
+                     encode_parms.transfer_characteristics);
+    encoder->Control(AV1E_SET_MATRIX_COEFFICIENTS,
+                     encode_parms.matrix_coefficients);
+    encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range);
+    encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION,
+                     encode_parms.chroma_sample_position);
+    encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless);
+    if (HasRenderSize()) {
+      encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size);
+    }
+  }
+
+  // Compares the metadata of each decoded image with the requested values.
+  // The render size is only compared when one was requested.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    (void)pts;
+    if (HasRenderSize()) {
+      EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w);
+      EXPECT_EQ(encode_parms.render_size[1], (int)img.r_h);
+    }
+    EXPECT_EQ(encode_parms.color_primaries, img.cp);
+    EXPECT_EQ(encode_parms.transfer_characteristics, img.tc);
+    EXPECT_EQ(encode_parms.matrix_coefficients, img.mc);
+    EXPECT_EQ(encode_parms.color_range, img.range);
+    EXPECT_EQ(encode_parms.chroma_sample_position, img.csp);
+  }
+
+  // In lossless mode the first PSNR entry must equal kMaxPsnr.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    if (!encode_parms.lossless) return;
+    EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]);
+  }
+
+  // Records a failure (with the decoder's error text) on any decode error and
+  // returns whether decoding succeeded.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
+    const bool decode_ok = (AOM_CODEC_OK == res_dec);
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    return decode_ok;
+  }
+
+  ParamPassingTestVideo test_video_;
+
+ private:
+  // True when the test parameter requests an explicit render size.
+  bool HasRenderSize() const {
+    return encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0;
+  }
+
+  EncodeParameters encode_parms;
+};
+
+// Encodes the test clip with PSNR packets enabled. The actual checks live in
+// the fixture hooks that run for every packet and decoded frame.
+TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) {
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  const ParamPassingTestVideo &clip = test_video_;
+  std::unique_ptr<libaom_test::VideoSource> video(
+      new libaom_test::Y4mVideoSource(clip.name, 0, clip.frames));
+  ASSERT_NE(video, nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+// Run once for every entry of kAV1EncodeParameterSet.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderParmsGetToDecoder,
+                           ::testing::ValuesIn(kAV1EncodeParameterSet));
+} // namespace
diff --git a/third_party/aom/test/av1_ext_tile_test.cc b/third_party/aom/test/av1_ext_tile_test.cc
new file mode 100644
index 0000000000..59c44cad12
--- /dev/null
+++ b/third_party/aom/test/av1_ext_tile_test.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+
+namespace {
+// The number of frames to be encoded/decoded
+const int kLimit = 8;
+// Presentation timestamp of the one frame that is skipped (neither hashed nor
+// fed to the tile decoder) to check the frame decoding independency.
+const int kSkip = 5;
+// Tile size; shifted left by 6 (i.e. multiplied by 64) to get pixels.
+const int kTileSize = 1;
+const int kTIleSizeInPixels = (kTileSize << 6);
+// Fake width and height so that they can be multiples of the tile size.
+const int kImgWidth = 704;
+const int kImgHeight = 576;
+
+// This test tests large scale tile coding case. Non-large-scale tile coding
+// is tested by the tile_independence test.
+//
+// Two MD5 lists are built and compared at the end of TestRoundTrip():
+//   md5_      - one hash per frame delivered to DecompressedFrameHook().
+//   tile_md5_ - one hash per packet decoded by the extra decoder_ owned by
+//               this fixture. All frames but the last are decoded whole; the
+//               last frame is decoded one tile at a time and reassembled in
+//               tile_img_ before hashing.
+// The frame whose pts equals kSkip is left out of both lists.
+class AV1ExtTileTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  // Creates the extra decoder in tile mode with "decode all tiles" (-1) as
+  // the initial row/column selection and allocates the reassembly image.
+  AV1ExtTileTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = kImgWidth;
+    cfg.h = kImgHeight;
+    cfg.allow_lowbitdepth = 1;
+
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    decoder_->Control(AV1_SET_TILE_MODE, 1);
+    decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+    decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+    decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+
+    // Allocate buffer to store tile image.
+    aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+
+    md5_.clear();
+    tile_md5_.clear();
+  }
+
+  ~AV1ExtTileTest() override {
+    aom_img_free(&tile_img_);
+    delete decoder_;
+  }
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_error_resilient = 1;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  // Frame 0: configure the encoder. Frame 1: stop updating the LAST/GF/ARF
+  // reference buffers.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      // Encode setting
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+
+      // TODO(yunqingwang): test single_tile_decoding = 0.
+      encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1);
+      // Always use 64x64 max partition.
+      encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64);
+      // Set tile_columns and tile_rows to MAX values, which guarantees the tile
+      // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution.
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      encoder->Control(AV1E_SET_TILE_ROWS, 6);
+    } else if (video->frame() == 1) {
+      frame_flags_ =
+          AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+    }
+  }
+
+  // Hashes every reference-decoded frame except the one at pts == kSkip.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    // Skip 1 already decoded frame to be consistent with the decoder in this
+    // test.
+    if (pts == (aom_codec_pts_t)kSkip) return;
+
+    // Calculate MD5 as the reference.
+    ::libaom_test::MD5 md5_res;
+    md5_res.Add(&img);
+    md5_.push_back(md5_res.Get());
+  }
+
+  // Decodes each compressed packet with decoder_ and appends one hash to
+  // tile_md5_. On a decode error, or when a single-tile decode produces no
+  // image, abort_ is set and a fatal failure is raised.
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    // Skip decoding 1 frame.
+    if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return;
+
+    bool IsLastFrame = (pkt->data.frame.pts == (aom_codec_pts_t)(kLimit - 1));
+
+    // Decode the first (kLimit - 1) frames as whole frame, and decode the last
+    // frame in single tiles.
+    for (int r = 0; r < kImgHeight / kTIleSizeInPixels; ++r) {
+      for (int c = 0; c < kImgWidth / kTIleSizeInPixels; ++c) {
+        if (!IsLastFrame) {
+          decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+          decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+        } else {
+          decoder_->Control(AV1_SET_DECODE_TILE_ROW, r);
+          decoder_->Control(AV1_SET_DECODE_TILE_COL, c);
+        }
+
+        const aom_codec_err_t res = decoder_->DecodeFrame(
+            reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+            pkt->data.frame.sz);
+        if (res != AOM_CODEC_OK) {
+          abort_ = true;
+          ASSERT_EQ(AOM_CODEC_OK, res);
+        }
+        const aom_image_t *img = decoder_->GetDxData().Next();
+
+        if (!IsLastFrame) {
+          if (img) {
+            ::libaom_test::MD5 md5_res;
+            md5_res.Add(img);
+            tile_md5_.push_back(md5_res.Get());
+          }
+          // A whole frame needs a single decode call; leave the tile loops.
+          break;
+        }
+
+        // The copy below dereferences img, so fail the test cleanly instead
+        // of crashing when the decoder returned no image for this tile.
+        if (img == nullptr) {
+          abort_ = true;
+          ASSERT_NE(img, nullptr);
+        }
+
+        // Copy the decoded tile into its (r, c) position in tile_img_. The
+        // two chroma planes use half the luma tile dimensions.
+        const int kMaxMBPlane = 3;
+        for (int plane = 0; plane < kMaxMBPlane; ++plane) {
+          const int shift = (plane == 0) ? 0 : 1;
+          int tile_height = kTIleSizeInPixels >> shift;
+          int tile_width = kTIleSizeInPixels >> shift;
+
+          for (int tr = 0; tr < tile_height; ++tr) {
+            memcpy(tile_img_.planes[plane] +
+                       tile_img_.stride[plane] * (r * tile_height + tr) +
+                       c * tile_width,
+                   img->planes[plane] + img->stride[plane] * tr, tile_width);
+          }
+        }
+      }
+
+      if (!IsLastFrame) break;
+    }
+
+    if (IsLastFrame) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(&tile_img_);
+      tile_md5_.push_back(md5_res.Get());
+    }
+  }
+
+  // Encodes kLimit frames with large_scale_tile enabled and requires the two
+  // MD5 lists to match.
+  void TestRoundTrip() {
+    ::libaom_test::I420VideoSource video(
+        "hantro_collage_w352h288.yuv", kImgWidth, kImgHeight, 30, 1, 0, kLimit);
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
+    cfg_.large_scale_tile = 1;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_threads = 1;
+
+    // Tile encoding
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    // Compare to check if two vectors are equal.
+    ASSERT_EQ(md5_, tile_md5_);
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  // Extra decoder used for whole-frame and per-tile decoding; owned here.
+  ::libaom_test::Decoder *decoder_;
+  // Reassembly buffer for the per-tile decode of the last frame.
+  aom_image_t tile_img_;
+  std::vector<std::string> md5_;
+  std::vector<std::string> tile_md5_;
+};
+
+TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); }
+
+// The second parameter is stored in set_cpu_used_; Range(1, 4) is half-open
+// and yields 1, 2 and 3.
+AV1_INSTANTIATE_TEST_SUITE(
+ // Now only test 2-pass mode.
+ AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(1, 4));
+
+// Same test body under a separate suite name, instantiated with cpu_used 0
+// only (Range(0, 1) yields just 0).
+class AV1ExtTileTestLarge : public AV1ExtTileTest {};
+
+TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); }
+
+AV1_INSTANTIATE_TEST_SUITE(
+ // Now only test 2-pass mode.
+ AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(0, 1));
+} // namespace
diff --git a/third_party/aom/test/av1_external_partition_test.cc b/third_party/aom/test/av1_external_partition_test.cc
new file mode 100644
index 0000000000..88f6216fa5
--- /dev/null
+++ b/third_party/aom/test/av1_external_partition_test.cc
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fstream>
+#include <new>
+#include <sstream>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+#if CONFIG_AV1_ENCODER
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+constexpr int kFrameNum = 8;
+constexpr int kVersion = 1;
+
+// User data handed to the encoder through aom_ext_part_funcs_t::priv. The
+// callbacks check that it comes back with version == kVersion.
+typedef struct TestData {
+ int version = kVersion;
+} TestData;
+
+// State of the toy partition model allocated by ext_part_create_model() and
+// released by ext_part_delete_model().
+typedef struct ToyModel {
+ // The TestData received as `priv` when the model was created (not owned).
+ TestData *data;
+ aom_ext_part_config_t config;
+ aom_ext_part_funcs_t funcs;
+ // The fields below are filled from the latest aom_partition_features_t in
+ // the CONFIG_PARTITION_SEARCH_ORDER build of ext_part_send_features().
+ int mi_row;
+ int mi_col;
+ int frame_width;
+ int frame_height;
+ BLOCK_SIZE block_size;
+} ToyModel;
+
+// Note:
+// if CONFIG_PARTITION_SEARCH_ORDER = 0, we test APIs designed for the baseline
+// encoder's DFS partition search workflow.
+// if CONFIG_PARTITION_SEARCH_ORDER = 1, we test APIs designed for the new
+// ML model's partition search workflow.
+#if CONFIG_PARTITION_SEARCH_ORDER
+// create_model callback: validates the user data, allocates a ToyModel that
+// points at it and returns the model through `ext_part_model`.
+// Returns AOM_EXT_PART_ERROR (after recording a test failure) if the
+// allocation fails, AOM_EXT_PART_OK otherwise.
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *const received_data = static_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+
+  // nothrow new: a failed allocation is reported as a status, not thrown.
+  ToyModel *const toy_model = new (std::nothrow) ToyModel;
+  if (!toy_model) {
+    EXPECT_NE(toy_model, nullptr);
+    return AOM_EXT_PART_ERROR;
+  }
+
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  // The test clip is expected to be coded with 64x64 superblocks.
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+// send_features callback: remembers the position, frame size and block size
+// of the block being searched so the decision callbacks can use them.
+aom_ext_part_status_t ext_part_send_features(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  ToyModel &model = *static_cast<ToyModel *>(ext_part_model);
+  const aom_partition_features_t &features = *part_features;
+
+  model.mi_row = features.mi_row;
+  model.mi_col = features.mi_col;
+  model.frame_width = features.frame_width;
+  model.frame_height = features.frame_height;
+  model.block_size = static_cast<BLOCK_SIZE>(features.block_size);
+  return AOM_EXT_PART_OK;
+}
+
+// The model provides the whole decision tree to the encoder.
+//
+// A toy model that always asks the encoder to encode with 4x4 blocks (the
+// smallest). The decisions are written to partition_decision[] level by
+// level: the 64x64 node, the four 32x32 nodes, then the 16x16, 8x8 and 4x4
+// nodes. Every node above 4x4 that lies inside the frame is PARTITION_SPLIT,
+// and the trailing 4x4 nodes are PARTITION_NONE. For superblocks on the
+// right/bottom frame boundary, the 32x32 quadrants outside the frame are
+// PARTITION_NONE and have no children in the list.
+//
+// Uses mi_row/mi_col/frame_width/frame_height stored in the ToyModel by
+// ext_part_send_features(). Always returns AOM_EXT_PART_OK.
+aom_ext_part_status_t ext_part_get_partition_decision_whole_tree(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  ext_part_decision->is_final_decision = 1;
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  // mi_row/mi_col are multiplied by 4 to compare against the frame size, so
+  // these flags are set when the 64x64 superblock extends past the frame.
+  const int is_last_sb_col =
+      toy_model->mi_col * 4 + 64 > toy_model->frame_width;
+  const int is_last_sb_row =
+      toy_model->mi_row * 4 + 64 > toy_model->frame_height;
+  if (is_last_sb_row && is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first one will further split)
+    // 16x16: 4 nodes
+    // 8x8:   4 * 4 nodes
+    // 4x4:   4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 + 4 * 4 + 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first one will split, the other three are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks inside the top-left 32x32 block. Indexed by i: the
+    // 16x16 and 8x8 nodes each need their own SPLIT entry.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_row) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first two will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first two will split, the other two are out
+    // of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the top-left and bottom-left will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the top-left and bottom-left will split, the other two are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else {
+    // 64x64: 1 node
+    // 32x32: 4 nodes
+    // 16x16: 4 * 4 nodes
+    // 8x8:   4 * 4 * 4 nodes
+    // 4x4:   4 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 * 4 + 4 * 4 * 4 + 4 * 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    for (int i = 0; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  }
+
+  return AOM_EXT_PART_OK;
+}
+
+// Recursive-mode decision callback: answers for the single block most
+// recently described through ext_part_send_features().
+//
+// The block is split only when it is 64 pixels wide and its superblock
+// extends past the right or bottom edge of the frame. Every other block is
+// coded unsplit. The decision is always marked final.
+aom_ext_part_status_t ext_part_get_partition_decision_recursive(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  const ToyModel *const model = static_cast<ToyModel *>(ext_part_model);
+
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  const bool sb_past_right_edge =
+      model->mi_col * 4 + 64 > model->frame_width;
+  const bool sb_past_bottom_edge =
+      model->mi_row * 4 + 64 > model->frame_height;
+
+  // The block width is only looked up for boundary superblocks.
+  const bool split_block = (sb_past_bottom_edge || sb_past_right_edge) &&
+                           block_size_wide[model->block_size] == 64;
+
+  ext_part_decision->is_final_decision = 1;
+  if (split_block) {
+    ext_part_decision->current_decision = PARTITION_SPLIT;
+  } else {
+    ext_part_decision->current_decision = PARTITION_NONE;
+  }
+  return AOM_EXT_PART_OK;
+}
+
+// send_partition_stats callback: the toy model ignores the statistics and
+// always reports success.
+aom_ext_part_status_t ext_part_send_partition_stats(
+ aom_ext_part_model_t ext_part_model,
+ const aom_partition_stats_t *ext_part_stats) {
+ (void)ext_part_model;
+ (void)ext_part_stats;
+ return AOM_EXT_PART_OK;
+}
+
+// delete_model callback: checks that the user data attached at creation is
+// still intact, then frees the ToyModel allocated by ext_part_create_model().
+aom_ext_part_status_t ext_part_delete_model(
+ aom_ext_part_model_t ext_part_model) {
+ ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+ EXPECT_EQ(toy_model->data->version, kVersion);
+ delete toy_model;
+ return AOM_EXT_PART_OK;
+}
+
+// Encoder-only fixture (DoDecode() returns false) for the external partition
+// API. A test configures it with SetExternalPartition(),
+// SetPartitionControlMode() and SetDecisionMode(), runs the encode loop and
+// reads the mean of the first PSNR entry with GetAveragePsnr().
+class ExternalPartitionTestAPI
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ExternalPartitionTestAPI()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+  ~ExternalPartitionTestAPI() override = default;
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 4;
+    cfg_.rc_target_bitrate = 400;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  bool DoDecode() const override { return false; }
+
+  // The PSNR statistics are reset at the start of every pass.
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  // Mean of the accumulated PSNR values, or 0.0 if no packet was seen.
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  // Whether the toy model is registered with the encoder.
+  void SetExternalPartition(bool use_external_partition) {
+    use_external_partition_ = use_external_partition;
+  }
+
+  // -1: max 128 / min 4 partition size; 1: force 64x64; 2: force 4x4.
+  void SetPartitionControlMode(int mode) { partition_control_mode_ = mode; }
+
+  void SetDecisionMode(aom_ext_part_decision_mode_t mode) {
+    decision_mode_ = mode;
+  }
+
+  // Before the first frame: builds the callback table for the selected
+  // decision mode, applies the speed/alt-ref controls, optionally registers
+  // the external partition model and sets the partition size limits.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+
+    // The two decision modes differ only in the decision callback.
+    aom_ext_part_funcs_t ext_part_funcs;
+    switch (decision_mode_) {
+      case AOM_EXT_PART_WHOLE_TREE:
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_whole_tree;
+        break;
+      case AOM_EXT_PART_RECURSIVE:
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_recursive;
+        break;
+      default:
+        // As before, an invalid mode configures nothing.
+        assert(0 && "Invalid decision mode.");
+        return;
+    }
+    ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+    ext_part_funcs.decision_mode = decision_mode_;
+    ext_part_funcs.create_model = ext_part_create_model;
+    ext_part_funcs.send_features = ext_part_send_features;
+    ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+    ext_part_funcs.delete_model = ext_part_delete_model;
+
+    encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+    encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    if (use_external_partition_) {
+      encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+    }
+
+    switch (partition_control_mode_) {
+      case -1:
+        encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 128);
+        encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        break;
+      case 1:
+        encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 64);
+        encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 64);
+        break;
+      case 2:
+        encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 4);
+        encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        break;
+      default: assert(0 && "Invalid partition control mode."); break;
+    }
+  }
+
+ private:
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  double psnr_;
+  unsigned int nframes_;
+  bool use_external_partition_ = false;
+  TestData test_data_;
+  int partition_control_mode_ = -1;
+  // Given a default so PreEncodeFrameHook never reads an indeterminate value
+  // when a test forgets to call SetDecisionMode().
+  aom_ext_part_decision_mode_t decision_mode_ = AOM_EXT_PART_WHOLE_TREE;
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is a normal encoding run with restricted partition types,
+// i.e., we use control flags to force the encoder to encode with the
+// 4x4 block size.
+// The second run gets its partition decisions from a toy model that we
+// built, which asks the encoder to encode with 4x4 blocks.
+// We expect the encoding results to be the same.
+TEST_P(ExternalPartitionTestAPI, WholePartitionTree4x4Block) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+
+  // Run 1: partition size forced to 4x4 by encoder controls only.
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  SetPartitionControlMode(2);
+  SetExternalPartition(false);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  // Run 2: same controls, plus the toy model registered with the encoder.
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  SetPartitionControlMode(2);
+  SetExternalPartition(true);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+// Encodes twice with the partition size forced to 64x64: once without and
+// once with the toy model in recursive decision mode. The two average PSNR
+// values must agree within a small tolerance.
+TEST_P(ExternalPartitionTestAPI, RecursivePartition) {
+  const double psnr_thresh = 0.02;
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+
+  // Run 1: encoder controls only.
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  SetPartitionControlMode(1);
+  SetExternalPartition(false);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  // Run 2: external model enabled.
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  SetPartitionControlMode(1);
+  SetExternalPartition(true);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_NEAR(psnr, psnr2, psnr_thresh);
+}
+
+// Only two-pass good-quality mode with a single cpu_used value is exercised.
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestAPI,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(4)); // cpu_used
+
+#else // !CONFIG_PARTITION_SEARCH_ORDER
+// Feature files written during encoding, as defined in partition_strategy.c.
+// Entries are listed in the same order as test_feature_file_names below.
+std::string feature_file_names[] = {
+ "feature_before_partition_none",
+ "feature_before_partition_none_prune_rect",
+ "feature_after_partition_none_prune",
+ "feature_after_partition_none_terminate",
+ "feature_after_partition_split_terminate",
+ "feature_after_partition_split_prune_rect",
+ "feature_after_partition_rect",
+ "feature_after_partition_ab",
+};
+
+// Files written here in the test, where the feature data is received
+// from the API. Indexed by the `id` argument of write_features_to_file().
+std::string test_feature_file_names[] = {
+ "test_feature_before_partition_none",
+ "test_feature_before_partition_none_prune_rect",
+ "test_feature_after_partition_none_prune",
+ "test_feature_after_partition_none_terminate",
+ "test_feature_after_partition_split_terminate",
+ "test_feature_after_partition_split_prune_rect",
+ "test_feature_after_partition_rect",
+ "test_feature_after_partition_ab",
+};
+
+// Appends one comma-separated line holding `feature_size` values from
+// `features` (six decimals each) to the file test_feature_file_names[id].
+// Does nothing when WRITE_FEATURE_TO_FILE is disabled. If the file cannot be
+// opened, a fatal test failure is recorded and the function returns.
+static void write_features_to_file(const float *features,
+                                   const int feature_size, const int id) {
+  if (!WRITE_FEATURE_TO_FILE) return;
+
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s",
+           test_feature_file_names[id].c_str());
+
+  FILE *pfile = fopen(filename, "a");
+  ASSERT_NE(pfile, nullptr);
+
+  // Emit the separator before every value except the first, which gives the
+  // same "a,b,c" layout as writing it after every value except the last.
+  for (int i = 0; i < feature_size; ++i) {
+    if (i > 0) fprintf(pfile, ",");
+    fprintf(pfile, "%.6f", features[i]);
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+// create_model callback: validates the user data, allocates a ToyModel that
+// points at it and returns the model through `ext_part_model`.
+// Returns AOM_EXT_PART_ERROR (after recording a test failure) if the
+// allocation fails, AOM_EXT_PART_OK otherwise.
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *const received_data = static_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+
+  // nothrow new: a failed allocation is reported as a status, not thrown.
+  ToyModel *const toy_model = new (std::nothrow) ToyModel;
+  if (!toy_model) {
+    EXPECT_NE(toy_model, nullptr);
+    return AOM_EXT_PART_ERROR;
+  }
+
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  // The test clip is expected to be coded with 64x64 superblocks.
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+// create_model variant that allocates nothing: it ignores `priv`, leaves
+// `ext_part_model` untouched and only checks the superblock size.
+aom_ext_part_status_t ext_part_create_model_test(
+ void *priv, const aom_ext_part_config_t *part_config,
+ aom_ext_part_model_t *ext_part_model) {
+ (void)priv;
+ (void)ext_part_model;
+ EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+ // Return status indicates it's a encoder test. It lets the encoder
+ // set a flag and write partition features to text files.
+ return AOM_EXT_PART_TEST;
+}
+
+// send_features callback that discards the features and reports success.
+aom_ext_part_status_t ext_part_send_features(
+ aom_ext_part_model_t ext_part_model,
+ const aom_partition_features_t *part_features) {
+ (void)ext_part_model;
+ (void)part_features;
+ return AOM_EXT_PART_OK;
+}
+
+// send_features callback used by the feature-dump test: picks the feature
+// array that matches `part_features->id` and appends it to the corresponding
+// test_feature_* file (file index 0..7). Unknown ids write nothing.
+// Always returns AOM_EXT_PART_TEST.
+aom_ext_part_status_t ext_part_send_features_test(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  (void)ext_part_model;
+  switch (part_features->id) {
+    case AOM_EXT_PART_FEATURE_BEFORE_NONE:
+      write_features_to_file(part_features->before_part_none.f,
+                             AOM_EXT_PART_SIZE_DIRECT_SPLIT, 0);
+      break;
+    case AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2:
+      write_features_to_file(part_features->before_part_none.f_part2,
+                             AOM_EXT_PART_SIZE_PRUNE_PART, 1);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_NONE:
+      write_features_to_file(part_features->after_part_none.f,
+                             AOM_EXT_PART_SIZE_PRUNE_NONE, 2);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_NONE_PART2:
+      write_features_to_file(part_features->after_part_none.f_terminate,
+                             AOM_EXT_PART_SIZE_TERM_NONE, 3);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_SPLIT:
+      write_features_to_file(part_features->after_part_split.f_terminate,
+                             AOM_EXT_PART_SIZE_TERM_SPLIT, 4);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2:
+      write_features_to_file(part_features->after_part_split.f_prune_rect,
+                             AOM_EXT_PART_SIZE_PRUNE_RECT, 5);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_RECT:
+      write_features_to_file(part_features->after_part_rect.f,
+                             AOM_EXT_PART_SIZE_PRUNE_AB, 6);
+      break;
+    case AOM_EXT_PART_FEATURE_AFTER_AB:
+      write_features_to_file(part_features->after_part_ab.f,
+                             AOM_EXT_PART_SIZE_PRUNE_4_WAY, 7);
+      break;
+    default: break;
+  }
+  return AOM_EXT_PART_TEST;
+}
+
+// Decision callback that never supplies a decision: both arguments are
+// ignored and an error status is returned.
+aom_ext_part_status_t ext_part_get_partition_decision(
+ aom_ext_part_model_t ext_part_model,
+ aom_partition_decision_t *ext_part_decision) {
+ (void)ext_part_model;
+ (void)ext_part_decision;
+ // Return an invalid decision such that the encoder doesn't take any
+ // partition decision from the ml model.
+ return AOM_EXT_PART_ERROR;
+}
+
+// send_partition_stats callback: the statistics are ignored and success is
+// always reported.
+aom_ext_part_status_t ext_part_send_partition_stats(
+ aom_ext_part_model_t ext_part_model,
+ const aom_partition_stats_t *ext_part_stats) {
+ (void)ext_part_model;
+ (void)ext_part_stats;
+ return AOM_EXT_PART_OK;
+}
+
+// delete_model callback: checks that the user data attached at creation is
+// still intact, then frees the ToyModel.
+// NOTE(review): dereferences the model unconditionally, so it must only be
+// called with a model produced by ext_part_create_model() -- confirm the
+// encoder does not call it after ext_part_create_model_test().
+aom_ext_part_status_t ext_part_delete_model(
+ aom_ext_part_model_t ext_part_model) {
+ ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+ EXPECT_EQ(toy_model->data->version, kVersion);
+ delete toy_model;
+ return AOM_EXT_PART_OK;
+}
+
+// Encoder test fixture parameterized on (test mode, cpu_used). Before the
+// first frame it can register the toy external-partition callbacks with the
+// encoder, and it accumulates the first PSNR value of every PSNR packet so
+// that runs can be compared through GetAveragePsnr().
+class ExternalPartitionTestDfsAPI
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ExternalPartitionTestDfsAPI()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+ ~ExternalPartitionTestDfsAPI() override = default;
+
+ // Configures a single-threaded VBR encode at 400 (rc_target_bitrate) with a
+ // 1/30 timebase and 4 frames of lag, and asks the codec for PSNR packets.
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_threads = 1;
+ cfg_.g_lag_in_frames = 4;
+ cfg_.rc_target_bitrate = 400;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ // Decoding is disabled for this fixture.
+ bool DoDecode() const override { return false; }
+
+ // Resets the PSNR accumulator at the start of each pass.
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ // Adds psnr[0] of the packet to the running sum and counts the packet.
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ // Mean of the accumulated PSNR values, or 0.0 when no packet was seen.
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ // Selects whether the callbacks are handed to the encoder at all.
+ void SetExternalPartition(bool use_external_partition) {
+ use_external_partition_ = use_external_partition;
+ }
+
+ // Selects the create_model/send_features pair used by PreEncodeFrameHook():
+ // 0 -> ext_part_create_model_test + ext_part_send_features,
+ // 1 -> ext_part_create_model + ext_part_send_features_test,
+ // any other value leaves the pair chosen by use_external_partition_.
+ void SetTestSendFeatures(int test_send_features) {
+ test_send_features_ = test_send_features;
+ }
+
+ // On frame 0 only: fills in the callback table, sets cpu-used and enables
+ // auto alt-ref, and registers the table when external partition is on.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ // NOTE(review): ext_part_funcs is not value-initialized, so any member
+ // that is not assigned below is indeterminate when the struct is passed
+ // to the encoder - confirm the struct has no other field the encoder
+ // reads.
+ aom_ext_part_funcs_t ext_part_funcs;
+ ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+ if (use_external_partition_) {
+ ext_part_funcs.create_model = ext_part_create_model;
+ ext_part_funcs.send_features = ext_part_send_features;
+ }
+ // A test_send_features_ of 0 or 1 overrides the pair selected above.
+ if (test_send_features_ == 1) {
+ ext_part_funcs.create_model = ext_part_create_model;
+ ext_part_funcs.send_features = ext_part_send_features_test;
+ } else if (test_send_features_ == 0) {
+ ext_part_funcs.create_model = ext_part_create_model_test;
+ ext_part_funcs.send_features = ext_part_send_features;
+ }
+ ext_part_funcs.get_partition_decision = ext_part_get_partition_decision;
+ ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+ ext_part_funcs.delete_model = ext_part_delete_model;
+
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ if (use_external_partition_) {
+ encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+ }
+ }
+ }
+
+ private:
+ libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ double psnr_; // sum of psnr[0] over the PSNR packets of the current pass
+ unsigned int nframes_; // number of PSNR packets in the current pass
+ bool use_external_partition_ = false;
+ int test_send_features_ = -1; // see SetTestSendFeatures()
+ TestData test_data_; // passed to the callbacks through ext_part_funcs.priv
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is the baseline without external partition.
+// The second run is to get partition decisions from the toy model we defined.
+// Here, we let the partition decision return invalid for all stages.
+// In this case, the external partition doesn't alter the original encoder
+// behavior. So we expect the same encoding results.
+TEST_P(ExternalPartitionTestDfsAPI, EncodeMatch) {
+ ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+ // Baseline run: the callbacks are not registered with the encoder.
+ SetExternalPartition(false);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr = GetAveragePsnr();
+
+ // Second run on the same clip with the toy model registered.
+ SetExternalPartition(true);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr2 = GetAveragePsnr();
+
+ EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+// Encode twice to compare generated feature files.
+// The first run let the encoder write partition features to file.
+// The second run calls send partition features function to send features to
+// the external model, and we write them to file.
+// The generated files should match each other.
+TEST_P(ExternalPartitionTestDfsAPI, SendFeatures) {
+ ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+ // First run: ext_part_create_model_test + ext_part_send_features.
+ SetExternalPartition(true);
+ SetTestSendFeatures(0);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Second run: ext_part_create_model + ext_part_send_features_test.
+ SetExternalPartition(true);
+ SetTestSendFeatures(1);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Without WRITE_FEATURE_TO_FILE there is nothing to compare or clean up.
+ if (!WRITE_FEATURE_TO_FILE) return;
+
+ // Compare feature files by reading them into strings.
+ for (int i = 0; i < 8; ++i) {
+ std::ifstream base_file(feature_file_names[i]);
+ ASSERT_TRUE(base_file.good());
+ std::stringstream base_stream;
+ base_stream << base_file.rdbuf();
+ std::string base_string = base_stream.str();
+
+ std::ifstream test_file(test_feature_file_names[i]);
+ ASSERT_TRUE(test_file.good());
+ std::stringstream test_stream;
+ test_stream << test_file.rdbuf();
+ std::string test_string = test_stream.str();
+
+ EXPECT_STREQ(base_string.c_str(), test_string.c_str());
+ }
+
+ // Remove files.
+ // NOTE(review): the cleanup shells out to `rm`, so it presumably only works
+ // where such a command is available; it is also not reached when one of the
+ // ASSERT_TRUE checks above aborts the test.
+ std::string command("rm -f feature_* test_feature_*");
+ system(command.c_str());
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestDfsAPI,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(4)); // cpu_used
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+
+} // namespace
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_ENCODER
diff --git a/third_party/aom/test/av1_fwd_txfm1d_test.cc b/third_party/aom/test/av1_fwd_txfm1d_test.cc
new file mode 100644
index 0000000000..6bae9f8364
--- /dev/null
+++ b/third_party/aom/test/av1_fwd_txfm1d_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <new>
+
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "test/av1_txfm_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+using libaom_test::reference_hybrid_1d;
+using libaom_test::TYPE_ADST;
+using libaom_test::TYPE_DCT;
+using libaom_test::TYPE_IDTX;
+using libaom_test::TYPE_TXFM;
+
+namespace {
+const int txfm_type_num = 3;
+const TYPE_TXFM txfm_type_ls[txfm_type_num] = { TYPE_DCT, TYPE_ADST,
+ TYPE_IDTX };
+
+const int txfm_size_num = 5;
+
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+ { av1_fdct4, av1_fadst4, av1_fidentity4_c },
+ { av1_fdct8, av1_fadst8, av1_fidentity8_c },
+ { av1_fdct16, av1_fadst16, av1_fidentity16_c },
+ { av1_fdct32, nullptr, av1_fidentity32_c },
+ { av1_fdct64, nullptr, nullptr },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+// Pins the rounding behavior of round_shift(value, bit). The expected values
+// correspond to dividing by 2^bit and rounding to nearest, with exact halves
+// going toward +infinity (3.5 -> 4, -3.5 -> -3).
+TEST(av1_fwd_txfm1d, round_shift) {
+ EXPECT_EQ(round_shift(7, 1), 4);
+ EXPECT_EQ(round_shift(-7, 1), -3);
+
+ EXPECT_EQ(round_shift(7, 2), 2);
+ EXPECT_EQ(round_shift(-7, 2), -2);
+
+ EXPECT_EQ(round_shift(8, 2), 2);
+ EXPECT_EQ(round_shift(-8, 2), -2);
+}
+
+// Verifies the cosine lookup table: for the 4 rows and 64 columns checked,
+// entry [i][j] must equal cos(PI * j / 128) scaled by 2^(cos_bit_min + i) and
+// rounded to the nearest integer.
+TEST(av1_fwd_txfm1d, av1_cospi_arr_data) {
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 64; j++) {
+ EXPECT_EQ(av1_cospi_arr_data[i][j],
+ (int32_t)round(cos(PI * j / 128) * (1 << (cos_bit_min + i))));
+ }
+ }
+}
+
+// Checks every available 1-D forward transform against the floating-point
+// reference. For each transform size and type, count_test_block random input
+// vectors are transformed, and every output coefficient must lie within
+// max_error of the rounded reference coefficient.
+TEST(av1_fwd_txfm1d, accuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int max_error = 7;
+  const int count_test_block = 5000;
+
+  for (int si = 0; si < txfm_size_num; ++si) {
+    const int txfm_size = txfm_size_ls[si];
+    std::unique_ptr<int32_t[]> input(new (std::nothrow) int32_t[txfm_size]);
+    std::unique_ptr<int32_t[]> output(new (std::nothrow) int32_t[txfm_size]);
+    std::unique_ptr<double[]> ref_input(new (std::nothrow) double[txfm_size]);
+    std::unique_ptr<double[]> ref_output(new (std::nothrow) double[txfm_size]);
+    ASSERT_NE(input, nullptr);
+    ASSERT_NE(output, nullptr);
+    ASSERT_NE(ref_input, nullptr);
+    ASSERT_NE(ref_output, nullptr);
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      const TYPE_TXFM txfm_type = txfm_type_ls[ti];
+      const TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+      // The function table holds nullptr where a size/type pair has no
+      // transform; those combinations are skipped.
+      if (fwd_txfm_func == nullptr) continue;
+
+      for (int i = 0; i < count_test_block; ++i) {
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          // Draw the two random terms in separate statements. As operands of
+          // a single subtraction their evaluation order is unspecified, which
+          // would make the seeded input sequence compiler-dependent.
+          const auto first = rnd.Rand16() % input_base;
+          const auto second = rnd.Rand16() % input_base;
+          input[ni] = first - second;
+          ref_input[ni] = static_cast<double>(input[ni]);
+        }
+
+        fwd_txfm_func(input.get(), output.get(), cos_bit, range_bit);
+        reference_hybrid_1d(ref_input.get(), ref_output.get(), txfm_size,
+                            txfm_type);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          ASSERT_LE(
+              abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+              max_error)
+              << "tx size = " << txfm_size << ", tx type = " << txfm_type;
+        }
+      }
+    }
+  }
+}
+} // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
new file mode 100644
index 0000000000..2ed5d94db3
--- /dev/null
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/av1_txfm_test.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+using libaom_test::tx_type_name;
+using libaom_test::TYPE_TXFM;
+
+using std::vector;
+
+namespace {
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
+
+// Accuracy fixture for the 2-D forward transform. The parameter tuple is
+// <tx_type, tx_size, max_error, max_avg_error>. Each test transforms count_
+// random blocks with the function from libaom_test::fwd_txfm_func_ls and
+// compares the result with libaom_test::reference_hybrid_2d().
+class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
+ public:
+  // Reads the parameters, looks up the transform configuration (dimensions
+  // and flip flags) and allocates the four 16-byte-aligned work buffers.
+  void SetUp() override {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+    count_ = 500;
+    TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg;
+    av1_get_fwd_txfm_cfg(tx_type_, tx_size_, &fwd_txfm_flip_cfg);
+    amplify_factor_ = libaom_test::get_amplification_factor(tx_type_, tx_size_);
+    tx_width_ = tx_size_wide[fwd_txfm_flip_cfg.tx_size];
+    tx_height_ = tx_size_high[fwd_txfm_flip_cfg.tx_size];
+    ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
+    lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
+
+    fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+    txfm2d_size_ = tx_width_ * tx_height_;
+    input_ = reinterpret_cast<int16_t *>(
+        aom_memalign(16, sizeof(input_[0]) * txfm2d_size_));
+    ASSERT_NE(input_, nullptr);
+    output_ = reinterpret_cast<int32_t *>(
+        aom_memalign(16, sizeof(output_[0]) * txfm2d_size_));
+    ASSERT_NE(output_, nullptr);
+    ref_input_ = reinterpret_cast<double *>(
+        aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_));
+    ASSERT_NE(ref_input_, nullptr);
+    ref_output_ = reinterpret_cast<double *>(
+        aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_));
+    ASSERT_NE(ref_output_, nullptr);
+  }
+
+  // Runs count_ random blocks through the transform under test and the
+  // reference. The largest per-coefficient error (scaled by the
+  // amplification factor) must not exceed max_error_, and the mean absolute
+  // error over all blocks must not exceed max_avg_error_.
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double avg_abs_error = 0;
+    for (int ci = 0; ci < count_; ci++) {
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        input_[ni] = rnd.Rand16() % input_base;
+        ref_input_[ni] = static_cast<double>(input_[ni]);
+        output_[ni] = 0;
+        ref_output_[ni] = 0;
+      }
+
+      fwd_txfm_(input_, output_, tx_width_, tx_type_, bd);
+
+      // The reference is fed a pre-flipped input when the configuration
+      // requests a flip.
+      if (lr_flip_ && ud_flip_) {
+        libaom_test::fliplrud(ref_input_, tx_width_, tx_height_, tx_width_);
+      } else if (lr_flip_) {
+        libaom_test::fliplr(ref_input_, tx_width_, tx_height_, tx_width_);
+      } else if (ud_flip_) {
+        libaom_test::flipud(ref_input_, tx_width_, tx_height_, tx_width_);
+      }
+
+      libaom_test::reference_hybrid_2d(ref_input_, ref_output_, tx_type_,
+                                       tx_size_);
+
+      double actual_max_error = 0;
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        ref_output_[ni] = round(ref_output_[ni]);
+        const double this_error =
+            fabs(output_[ni] - ref_output_[ni]) / amplify_factor_;
+        actual_max_error = AOMMAX(actual_max_error, this_error);
+      }
+      EXPECT_GE(max_error_, actual_max_error)
+          << "tx_w: " << tx_width_ << " tx_h: " << tx_height_
+          << ", tx_type = " << (int)tx_type_;
+      if (actual_max_error > max_error_) {  // exit early.
+        break;
+      }
+
+      avg_abs_error += compute_avg_abs_error<int32_t, double>(
+          output_, ref_output_, txfm2d_size_);
+    }
+
+    avg_abs_error /= amplify_factor_;
+    avg_abs_error /= count_;
+    // Cast to int as in the per-block message above so that the identifiers
+    // are printed as numbers.
+    EXPECT_GE(max_avg_error_, avg_abs_error)
+        << "tx_size = " << (int)tx_size_ << ", tx_type = " << (int)tx_type_;
+  }
+
+  // Releases the work buffers. The pointers start out as nullptr, so this is
+  // well-defined even when SetUp() stopped at a failed ASSERT before every
+  // buffer was allocated.
+  void TearDown() override {
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(ref_input_);
+    aom_free(ref_output_);
+  }
+
+ private:
+  double max_error_;
+  double max_avg_error_;
+  int count_;
+  double amplify_factor_;
+  TX_TYPE tx_type_;
+  TX_SIZE tx_size_;
+  int tx_width_;
+  int tx_height_;
+  int txfm2d_size_;
+  FwdTxfm2dFunc fwd_txfm_;
+  int16_t *input_ = nullptr;
+  int32_t *output_ = nullptr;
+  double *ref_input_ = nullptr;
+  double *ref_output_ = nullptr;
+  int ud_flip_;  // flip upside down
+  int lr_flip_;  // flip left to right
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+ 0.5, // 4x4 transform
+ 0.5, // 8x8 transform
+ 1.2, // 16x16 transform
+ 6.1, // 32x32 transform
+ 3.4, // 64x64 transform
+ 0.57, // 4x8 transform
+ 0.68, // 8x4 transform
+ 0.92, // 8x16 transform
+ 1.1, // 16x8 transform
+ 4.1, // 16x32 transform
+ 6, // 32x16 transform
+ 3.5, // 32x64 transform
+ 5.7, // 64x32 transform
+ 0.6, // 4x16 transform
+ 0.9, // 16x4 transform
+ 1.2, // 8x32 transform
+ 1.7, // 32x8 transform
+ 2.0, // 16x64 transform
+ 4.7, // 64x16 transform
+};
+
+static double max_error_ls[TX_SIZES_ALL] = {
+ 3, // 4x4 transform
+ 5, // 8x8 transform
+ 11, // 16x16 transform
+ 70, // 32x32 transform
+ 64, // 64x64 transform
+ 3.9, // 4x8 transform
+ 4.3, // 8x4 transform
+ 12, // 8x16 transform
+ 12, // 16x8 transform
+ 32, // 16x32 transform
+ 46, // 32x16 transform
+ 136, // 32x64 transform
+ 136, // 64x32 transform
+ 5, // 4x16 transform
+ 6, // 16x4 transform
+ 21, // 8x32 transform
+ 13, // 32x8 transform
+ 30, // 16x64 transform
+ 36, // 64x16 transform
+};
+
+// Builds the parameter list for the AV1FwdTxfm2d suite: one entry per valid
+// (tx_size, tx_type) pair for sizes 0 .. TX_SIZES - 1, each carrying the
+// max/avg error thresholds stored for that size.
+vector<AV1FwdTxfm2dParam> GetTxfm2dParamList() {
+  vector<AV1FwdTxfm2dParam> params;
+  for (int size_idx = 0; size_idx < TX_SIZES; ++size_idx) {
+    const TX_SIZE tx_size = static_cast<TX_SIZE>(size_idx);
+    const double max_error = max_error_ls[size_idx];
+    const double avg_error = avg_error_ls[size_idx];
+    for (int type_idx = 0; type_idx < TX_TYPES; ++type_idx) {
+      const TX_TYPE tx_type = static_cast<TX_TYPE>(type_idx);
+      if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+      params.emplace_back(tx_type, tx_size, max_error, avg_error);
+    }
+  }
+  return params;
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AV1FwdTxfm2d,
+ ::testing::ValuesIn(GetTxfm2dParamList()));
+
+TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
+
+// For every bit depth in libaom_test::bd_arr and every valid
+// (tx_size, tx_type) pair, generates the column and row stage ranges of the
+// forward transform and range-checks them against the low/high limits
+// stored for that bit depth.
+TEST(AV1FwdTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    int bd = libaom_test::bd_arr[bd_idx];
+    int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    int8_t high_range = libaom_test::high_range_arr[bd_idx];
+    for (int size_idx = 0; size_idx < TX_SIZES_ALL; ++size_idx) {
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(size_idx);
+      for (int type_idx = 0; type_idx < TX_TYPES; ++type_idx) {
+        const TX_TYPE tx_type = static_cast<TX_TYPE>(type_idx);
+        if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_fwd_txfm_cfg(tx_type, tx_size, &cfg);
+
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd);
+
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
+
+typedef void (*lowbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+// Compares a low-bitdepth (bd = 8) forward transform against the reference
+// from libaom_test::fwd_txfm_func_ls for every tx_type valid for tx_size.
+// Each type is run 500 times: first on a block filled with (1 << bd) - 1,
+// then on random bd-bit samples. The checked coefficients must be identical.
+void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+  const int bd = 8;
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+
+  for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+    if (!libaom_test::IsTxSizeTypeValid(tx_size,
+                                        static_cast<TX_TYPE>(tx_type))) {
+      continue;
+    }
+    FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+    if (ref_func == nullptr) continue;
+
+    DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+    DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+    DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+    const int input_stride = 64;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    for (int cnt = 0; cnt < 500; ++cnt) {
+      // The first iteration uses the largest sample value everywhere; all
+      // later iterations draw random samples in row-major order.
+      const bool use_max_input = (cnt == 0);
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          if (use_max_input) {
+            input[r * input_stride + c] = (1 << bd) - 1;
+          } else {
+            input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+          }
+        }
+      }
+
+      param.tx_type = (TX_TYPE)tx_type;
+      param.tx_size = (TX_SIZE)tx_size;
+      param.tx_set_type = EXT_TX_SET_ALL16;
+      param.bd = bd;
+      ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+      target_func(input, output, input_stride, &param);
+
+      // At most a 32x32 region of coefficients is compared.
+      const int check_cols = AOMMIN(32, cols);
+      const int check_rows = AOMMIN(32, rows * cols / check_cols);
+      for (int r = 0; r < check_rows; ++r) {
+        for (int c = 0; c < check_cols; ++c) {
+          ASSERT_EQ(ref_output[r * check_cols + c],
+                    output[r * check_cols + c])
+              << "[" << r << "," << c << "] cnt:" << cnt
+              << " tx_size: " << cols << "x" << rows
+              << " tx_type: " << tx_type_name[tx_type];
+        }
+      }
+    }
+  }
+}
+
+// Times a low-bitdepth (bd = 8) forward transform against the reference from
+// libaom_test::fwd_txfm_func_ls. For every tx_type valid for tx_size, both
+// functions are called 1000000 / (rows * cols) times on the same random
+// block, and one line with the two elapsed times (usec) and their integer
+// ratio is printed.
+void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  const int num_loops = 1000000 / (rows * cols);
+
+  const int bd = 8;
+  for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+    if (!libaom_test::IsTxSizeTypeValid(tx_size,
+                                        static_cast<TX_TYPE>(tx_type))) {
+      continue;
+    }
+    FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+    if (ref_func == nullptr) continue;
+
+    DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+    DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+    DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+    int input_stride = 64;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+      }
+    }
+
+    param.tx_type = (TX_TYPE)tx_type;
+    param.tx_size = (TX_SIZE)tx_size;
+    param.tx_set_type = EXT_TX_SET_ALL16;
+    param.bd = bd;
+
+    aom_usec_timer ref_timer, test_timer;
+
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < num_loops; ++i) {
+      ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int elapsed_time_c =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < num_loops; ++i) {
+      target_func(input, output, input_stride, &param);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int elapsed_time_simd =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+    // If the timed loop registers 0 usec, report a gain of 0 instead of
+    // performing an integer division by zero.
+    const int gain =
+        elapsed_time_simd > 0 ? elapsed_time_c / elapsed_time_simd : 0;
+
+    // The size is printed as width x height (cols x rows), matching the
+    // tx_size message of AV1FwdTxfm2dMatchTest.
+    printf(
+        "txfm_size[%2dx%-2d] \t txfm_type[%d] \t c_time=%d \t"
+        "simd_time=%d \t gain=%d \n",
+        cols, rows, tx_type, elapsed_time_c, elapsed_time_simd, gain);
+  }
+}
+
+// Test parameters: <transform size, low-bitdepth forward transform to test>.
+typedef std::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
+
+// Empty fixture; it only carries the parameter tuple to the tests below.
+class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1FwdTxfm2dTest);
+
+// Output of the function under test must match the reference transform.
+TEST_P(AV1FwdTxfm2dTest, match) {
+ AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+// Timing comparison; the DISABLED_ prefix keeps it out of default runs.
+TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
+ AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+// Shared body of the DCT and Hadamard scale tests. For each of the square
+// sizes 4x4, 8x8, 16x16 and 32x32, a block in which every residual equals 8
+// is passed through av1_quick_txfm(), and the ratio of output energy to
+// input energy (sum of squares) must be within 5 of ref_scale_list[i].
+static void QuickTxfmScaleCheck(int use_hadamard,
+                                const int (&ref_scale_list)[4]) {
+  BitDepthInfo bd_info;
+  bd_info.bit_depth = 8;
+  bd_info.use_highbitdepth_buf = 0;
+  DECLARE_ALIGNED(32, int16_t, src_diff[1024]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[1024]);
+
+  const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
+  const int stride_list[4] = { 4, 8, 16, 32 };
+
+  for (int i = 0; i < 4; i++) {
+    TX_SIZE tx_size = tx_size_list[i];
+    int stride = stride_list[i];
+    int array_size = stride * stride;
+
+    for (int j = 0; j < array_size; j++) {
+      src_diff[j] = 8;
+      coeff[j] = 0;
+    }
+
+    av1_quick_txfm(use_hadamard, tx_size, bd_info, src_diff, stride, coeff);
+
+    double input_sse = 0;
+    double output_sse = 0;
+    for (int j = 0; j < array_size; j++) {
+      input_sse += pow(src_diff[j], 2);
+      output_sse += pow(coeff[j], 2);
+    }
+
+    double scale = output_sse / input_sse;
+
+    EXPECT_NEAR(scale, ref_scale_list[i], 5);
+  }
+}
+
+// Energy scale of av1_quick_txfm() with use_hadamard = 0.
+TEST(AV1FwdTxfm2dTest, DCTScaleTest) {
+  const int ref_scale_list[4] = { 64, 64, 64, 16 };
+  QuickTxfmScaleCheck(/*use_hadamard=*/0, ref_scale_list);
+}
+
+// Energy scale of av1_quick_txfm() with use_hadamard = 1.
+TEST(AV1FwdTxfm2dTest, HadamardScaleTest) {
+  const int ref_scale_list[4] = { 1, 64, 64, 16 };
+  QuickTxfmScaleCheck(/*use_hadamard=*/1, ref_scale_list);
+}
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE2
+static TX_SIZE fwd_txfm_for_sse2[] = {
+ TX_4X4,
+ TX_8X8,
+ TX_16X16,
+ TX_32X32,
+ // TX_64X64,
+ TX_4X8,
+ TX_8X4,
+ TX_8X16,
+ TX_16X8,
+ TX_16X32,
+ TX_32X16,
+ // TX_32X64,
+ // TX_64X32,
+ TX_4X16,
+ TX_16X4,
+ TX_8X32,
+ TX_32X8,
+ TX_16X64,
+ TX_64X16,
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_sse2),
+ Values(av1_lowbd_fwd_txfm_sse2)));
+#endif // HAVE_SSE2
+
+#if HAVE_SSE4_1
+static TX_SIZE fwd_txfm_for_sse41[] = {
+ TX_4X4,
+ TX_64X64,
+ TX_32X64,
+ TX_64X32,
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_sse41),
+ Values(av1_lowbd_fwd_txfm_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+static TX_SIZE fwd_txfm_for_avx2[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+ TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16,
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_avx2),
+ Values(av1_lowbd_fwd_txfm_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+static TX_SIZE fwd_txfm_for_neon[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32,
+ TX_64X64, TX_4X8, TX_8X4, TX_8X16,
+ TX_16X8, TX_16X32, TX_32X16, TX_32X64,
+ TX_64X32, TX_4X16, TX_16X4, TX_8X32,
+ TX_32X8, TX_16X64, TX_64X16 };
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_neon),
+ Values(av1_lowbd_fwd_txfm_neon)));
+
+#endif // HAVE_NEON
+
+typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+// Compares a high-bitdepth forward transform against the reference from
+// libaom_test::fwd_txfm_func_ls at bit depths 10 and 12, for every tx_type
+// valid for tx_size. Each type is run 500 times: first on a block filled
+// with (1 << bd) - 1, then on random bd-bit samples. The checked
+// coefficients must be identical.
+void AV1HighbdFwdTxfm2dMatchTest(TX_SIZE tx_size,
+                                 Highbd_fwd_txfm_func target_func) {
+  const int bd_ar[2] = { 10, 12 };
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  for (int i = 0; i < 2; ++i) {
+    const int bd = bd_ar[i];
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (!libaom_test::IsTxSizeTypeValid(tx_size,
+                                          static_cast<TX_TYPE>(tx_type))) {
+        continue;
+      }
+      FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+      if (ref_func == nullptr) continue;
+
+      DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+      DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+      DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+      int input_stride = 64;
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      for (int cnt = 0; cnt < 500; ++cnt) {
+        if (cnt == 0) {
+          // First iteration: every sample is the largest bd-bit value.
+          for (int r = 0; r < rows; ++r) {
+            for (int c = 0; c < cols; ++c) {
+              input[r * input_stride + c] = (1 << bd) - 1;
+            }
+          }
+        } else {
+          for (int r = 0; r < rows; ++r) {
+            for (int c = 0; c < cols; ++c) {
+              input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+            }
+          }
+        }
+        param.tx_type = (TX_TYPE)tx_type;
+        param.tx_size = (TX_SIZE)tx_size;
+        param.tx_set_type = EXT_TX_SET_ALL16;
+        param.bd = bd;
+
+        ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+        target_func(input, output, input_stride, &param);
+
+        // At most a 32x32 region of coefficients is compared.
+        const int check_cols = AOMMIN(32, cols);
+        const int check_rows = AOMMIN(32, rows * cols / check_cols);
+        for (int r = 0; r < check_rows; ++r) {
+          for (int c = 0; c < check_cols; ++c) {
+            // Report the transform type by name, as the low-bitdepth match
+            // test does, rather than as a bare number.
+            ASSERT_EQ(ref_output[c * check_rows + r],
+                      output[c * check_rows + r])
+                << "[" << r << "," << c << "] cnt:" << cnt
+                << " tx_size: " << cols << "x" << rows
+                << " tx_type: " << tx_type_name[tx_type];
+          }
+        }
+      }
+    }
+  }
+}
+
+// Times a high-bitdepth forward transform against the reference from
+// libaom_test::fwd_txfm_func_ls at bit depths 10 and 12. For every tx_type
+// valid for tx_size, both functions are called 1000000 / (rows * cols) times
+// on the same random block, and one line with the two elapsed times (usec)
+// and their integer ratio is printed.
+void AV1HighbdFwdTxfm2dSpeedTest(TX_SIZE tx_size,
+                                 Highbd_fwd_txfm_func target_func) {
+  const int bd_ar[2] = { 10, 12 };
+  TxfmParam param;
+  memset(&param, 0, sizeof(param));
+  const int rows = tx_size_high[tx_size];
+  const int cols = tx_size_wide[tx_size];
+  const int num_loops = 1000000 / (rows * cols);
+
+  for (int i = 0; i < 2; ++i) {
+    const int bd = bd_ar[i];
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (!libaom_test::IsTxSizeTypeValid(tx_size,
+                                          static_cast<TX_TYPE>(tx_type))) {
+        continue;
+      }
+      FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+      if (ref_func == nullptr) continue;
+
+      DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+      DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+      DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+      int input_stride = 64;
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+        }
+      }
+
+      param.tx_type = (TX_TYPE)tx_type;
+      param.tx_size = (TX_SIZE)tx_size;
+      param.tx_set_type = EXT_TX_SET_ALL16;
+      param.bd = bd;
+
+      aom_usec_timer ref_timer, test_timer;
+
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < num_loops; ++j) {
+        ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < num_loops; ++j) {
+        target_func(input, output, input_stride, &param);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      // If the timed loop registers 0 usec, report a gain of 0 instead of
+      // performing an integer division by zero.
+      const int gain =
+          elapsed_time_simd > 0 ? elapsed_time_c / elapsed_time_simd : 0;
+
+      printf(
+          "txfm_size[%2dx%-2d] \t txfm_type[%d] \t c_time=%d \t"
+          "simd_time=%d \t gain=%d \n",
+          cols, rows, tx_type, elapsed_time_c, elapsed_time_simd, gain);
+    }
+  }
+}
+
+// Test parameters: <transform size, high-bitdepth forward transform to test>.
+typedef std::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
+
+// Empty fixture; it only carries the parameter tuple to the tests below.
+class AV1HighbdFwdTxfm2dTest
+ : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdFwdTxfm2dTest);
+
+// Output of the function under test must match the reference transform.
+TEST_P(AV1HighbdFwdTxfm2dTest, match) {
+ AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+// Timing comparison; the DISABLED_ prefix keeps it out of default runs.
+TEST_P(AV1HighbdFwdTxfm2dTest, DISABLED_Speed) {
+ AV1HighbdFwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE4_1
+static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32,
+#if !CONFIG_REALTIME_ONLY
+ TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16,
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_SSE4_1
+#if HAVE_AVX2
+static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8, TX_16X16, TX_32X32,
+ TX_64X64, TX_8X16, TX_16X8 };
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+static TX_SIZE Highbd_fwd_txfm_for_neon[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+ TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_neon),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/av1_highbd_iht_test.cc b/third_party/aom/test/av1_highbd_iht_test.cc
new file mode 100644
index 0000000000..2c57362a82
--- /dev/null
+++ b/third_party/aom/test/av1_highbd_iht_test.cc
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+#include "av1/common/scan.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+using std::tuple;
+
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+static const char *tx_type_name[] = {
+ "DCT_DCT",
+ "ADST_DCT",
+ "DCT_ADST",
+ "ADST_ADST",
+ "FLIPADST_DCT",
+ "DCT_FLIPADST",
+ "FLIPADST_FLIPADST",
+ "ADST_FLIPADST",
+ "FLIPADST_ADST",
+ "IDTX",
+ "V_DCT",
+ "H_DCT",
+ "V_ADST",
+ "H_ADST",
+ "V_FLIPADST",
+ "H_FLIPADST",
+};
+// Test parameter argument list:
+// <transform reference function,
+// optimized inverse transform function,
+// inverse transform reference function,
+// num_coeffs,
+// tx_type,
+// bit_depth>
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam;
+
+// Fixture for bit-exactness checks of an optimized high bit-depth inverse
+// transform. Parameters (see IHbdHtParam): forward reference transform,
+// inverse transform under test, inverse reference transform, number of
+// coefficients in the block, transform type and bit depth.
+class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
+ public:
+  ~AV1HighbdInvHTNxN() override = default;
+
+  // Unpacks the test parameters and allocates the four working buffers.
+  // Each allocation is asserted, so SetUp() may return early with only some
+  // of the buffers allocated.
+  void SetUp() override {
+    txfm_ref_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    inv_txfm_ref_ = GET_PARAM(2);
+    num_coeffs_ = GET_PARAM(3);
+    tx_type_ = GET_PARAM(4);
+    bit_depth_ = GET_PARAM(5);
+
+    input_ = reinterpret_cast<int16_t *>(
+        aom_memalign(16, sizeof(input_[0]) * num_coeffs_));
+    ASSERT_NE(input_, nullptr);
+
+    // Note:
+    // Inverse transform input buffer is 32-byte aligned
+    // Refer to <root>/av1/encoder/context_tree.c, function,
+    // void alloc_mode_context().
+    coeffs_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+    ASSERT_NE(coeffs_, nullptr);
+    output_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(32, sizeof(output_[0]) * num_coeffs_));
+    ASSERT_NE(output_, nullptr);
+    output_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+    ASSERT_NE(output_ref_, nullptr);
+  }
+
+  // Releases the working buffers. googletest runs TearDown() even when
+  // SetUp() hit a fatal assertion; buffers that were never allocated are
+  // still nullptr thanks to the member initializers below (aom_free() is
+  // expected to accept nullptr, like free()).
+  void TearDown() override {
+    aom_free(input_);
+    aom_free(coeffs_);
+    aom_free(output_);
+    aom_free(output_ref_);
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  // Maps the coefficient count of a square block to its side length, which
+  // is also the row stride used by the test. Returns 0 for any other count.
+  int GetStride() const {
+    if (16 == num_coeffs_) {
+      return 4;
+    } else if (64 == num_coeffs_) {
+      return 8;
+    } else if (256 == num_coeffs_) {
+      return 16;
+    } else if (1024 == num_coeffs_) {
+      return 32;
+    } else if (4096 == num_coeffs_) {
+      return 64;
+    } else {
+      return 0;
+    }
+  }
+
+  // Test parameters; given defined values so the fixture is never observed
+  // in an indeterminate state.
+  HbdHtFunc txfm_ref_ = nullptr;
+  IHbdHtFunc inv_txfm_ = nullptr;
+  IHbdHtFunc inv_txfm_ref_ = nullptr;
+  int num_coeffs_ = 0;
+  TX_TYPE tx_type_ = DCT_DCT;
+  int bit_depth_ = 0;
+
+  // Working buffers, num_coeffs_ elements each. They must start out null so
+  // that TearDown() is safe after a partially completed SetUp().
+  int16_t *input_ = nullptr;
+  int32_t *coeffs_ = nullptr;
+  uint16_t *output_ = nullptr;
+  uint16_t *output_ref_ = nullptr;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvHTNxN);
+
+// Repeatedly builds a random residual block and a random prediction, runs the
+// forward reference transform on the residual, then applies both inverse
+// transforms to the resulting coefficients and requires identical output.
+void AV1HighbdInvHTNxN::RunBitexactCheck() {
+  const int kNumTests = 20000;
+  const int stride = GetStride();
+  const uint16_t mask = (1 << bit_depth_) - 1;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  for (int i = 0; i != kNumTests; ++i) {
+    // Residual is the difference of two bit_depth_-bit values; both output
+    // buffers start from the same random bit_depth_-bit prediction.
+    for (int k = 0; k < num_coeffs_; ++k) {
+      input_[k] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+      const uint16_t pred = rnd.Rand16() & mask;
+      output_ref_[k] = pred;
+      output_[k] = pred;
+    }
+
+    txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
+    inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+    API_REGISTER_STATE_CHECK(
+        inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_));
+
+    for (int j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j << " At test block: " << i;
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
+
+using std::make_tuple;
+
+#if HAVE_SSE4_1
+// First four IHbdHtParam fields for the 4x4 case: forward reference, SSE4.1
+// inverse under test, C inverse reference, and the coefficient count (16).
+#define PARAM_LIST_4X4 \
+ &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
+ &av1_inv_txfm2d_add_4x4_c, 16
+
+// Each listed transform type is run at bit depths 10 and 12.
+const IHbdHtParam kArrayIhtParam[] = {
+ // 4x4
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvHTNxN,
+ ::testing::ValuesIn(kArrayIhtParam));
+#endif // HAVE_SSE4_1
+
+typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output,
+ int stride, const TxfmParam *txfm_param);
+
+typedef std::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
+// Fixture whose single parameter is the inverse 2D transform function under
+// test; RunAV1InvTxfm2dTest compares it with av1_highbd_inv_txfm_add_c.
+class AV1HighbdInvTxfm2d
+ : public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
+ public:
+ // Latches the function under test from the test parameter.
+ void SetUp() override { target_func_ = GET_PARAM(0); }
+ // Runs one <tx_type, tx_size, bit_depth> configuration. run_times is a work
+ // budget that the implementation divides by the block area; a non-zero
+ // gt_int16 replaces the leading coefficients with large random values.
+ void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
+ int bit_depth, int gt_int16 = 0);
+
+ private:
+ HighbdInvTxfm2dFunc target_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvTxfm2d);
+
+// Runs one <tx_type_, tx_size_, bit_depth_> configuration of target_func_
+// against av1_highbd_inv_txfm_add_c.
+//
+// run_times is a work budget: it is divided by the block area and clamped to
+// at least 1 to get the per-call repeat count. With a repeat count of 1 the
+// routine sweeps eob from 1 to eobmax; otherwise it runs a single pass with
+// eob == 1. A repeat count above 10 only prints timings; anything else
+// asserts bit-exact output over the rows x cols block.
+//
+// When gt_int16 is non-zero the first eob coefficients (in scan order) are
+// overwritten with random values masked by inv_input_mask.
+void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
+                                             int run_times, int bit_depth_,
+                                             int gt_int16) {
+#if CONFIG_REALTIME_ONLY
+  if (tx_size_ >= TX_4X16) {
+    return;
+  }
+#endif
+  FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+  TxfmParam txfm_param;
+  const int BLK_WIDTH = 64;
+  const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+  DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, uint16_t, output[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+  int stride = BLK_WIDTH;
+  int rows = tx_size_high[tx_size_];
+  int cols = tx_size_wide[tx_size_];
+  // At most 32 rows/columns are swept by the eob loop below.
+  const int rows_nonezero = AOMMIN(32, rows);
+  const int cols_nonezero = AOMMIN(32, cols);
+  const uint16_t mask = (1 << bit_depth_) - 1;
+  run_times /= (rows * cols);
+  run_times = AOMMAX(1, run_times);
+  const SCAN_ORDER *scan_order = get_default_scan(tx_size_, tx_type_);
+  const int16_t *scan = scan_order->scan;
+  const int16_t eobmax = rows_nonezero * cols_nonezero;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int randTimes = run_times == 1 ? (eobmax) : 1;
+
+  txfm_param.tx_type = tx_type_;
+  txfm_param.tx_size = tx_size_;
+  txfm_param.lossless = 0;
+  txfm_param.bd = bit_depth_;
+  txfm_param.is_hbd = 1;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        // The forward transform below is handed `stride` as the row pitch of
+        // `input`, so the residual has to be laid out with that pitch too.
+        // Indexing with `cols` left most rows of tall, narrow blocks at zero.
+        input[r * stride + c] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+        output[r * stride + c] = rnd.Rand16() & mask;
+
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type_, bit_depth_);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+    txfm_param.eob = eob;
+    if (gt_int16) {
+      // NOTE(review): the cast to uint16_t keeps only the low 16 bits of the
+      // (bit_depth_ + 7)-bit mask, so for 10- and 12-bit runs the mask is
+      // 0xFFFF. Left unchanged here; confirm whether a wider mask is wanted.
+      const uint16_t inv_input_mask =
+          static_cast<uint16_t>((1 << (bit_depth_ + 7)) - 1);
+      for (int i = 0; i < eob; i++) {
+        inv_input[scan[i]] = (rnd.Rand31() & inv_input_mask);
+      }
+    }
+
+    aom_usec_timer ref_timer, test_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
+                                stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int elapsed_time_c =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, CONVERT_TO_BYTEPTR(output), stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int elapsed_time_simd =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    if (run_times > 10) {
+      // A zero reading for the function under test would make the integer
+      // division below trap; report a gain of 0 in that case.
+      const int gain =
+          elapsed_time_simd > 0 ? (elapsed_time_c / elapsed_time_simd) : 0;
+      printf(
+          "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+          "gain=%d \n",
+          tx_size_, tx_type_, elapsed_time_c, elapsed_time_simd, gain);
+    } else {
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
+              << "[" << r << "," << c << "] " << cnt << " tx_size: " << cols
+              << "x" << rows << " bit_depth_: " << bit_depth_
+              << " tx_type: " << tx_type_name[tx_type_] << " eob " << eob;
+        }
+      }
+    }
+  }
+}
+
+// Output comparison over every valid <tx size, tx type> pair at 8, 10 and 12
+// bits, using a work budget of 1.
+TEST_P(AV1HighbdInvTxfm2d, match) {
+  static const int kBitDepths[] = { 8, 10, 12 };
+  for (const int bit_depth : kBitDepths) {
+    for (int s = 0; s < static_cast<int>(TX_SIZES_ALL); ++s) {
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      for (int t = 0; t < static_cast<int>(TX_TYPES); ++t) {
+        const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+        if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+        RunAV1InvTxfm2dTest(tx_type, tx_size, 1, bit_depth);
+      }
+    }
+  }
+}
+
+// Same comparison as `match`, restricted to a fixed list of transform types
+// and with the gt_int16 flag set so the coefficients are replaced by large
+// random values.
+TEST_P(AV1HighbdInvTxfm2d, gt_int16) {
+  static const int kBitDepths[] = { 8, 10, 12 };
+  static const TX_TYPE types[] = {
+    DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX, V_DCT, H_DCT, H_ADST, H_FLIPADST
+  };
+  for (const int bit_depth : kBitDepths) {
+    for (int s = 0; s < static_cast<int>(TX_SIZES_ALL); ++s) {
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      for (const TX_TYPE tx_type : types) {
+        if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+        RunAV1InvTxfm2dTest(tx_type, tx_size, 1, bit_depth, 1);
+      }
+    }
+  }
+}
+
+// Timing sweep at 10 and 12 bits over every valid <tx size, tx type> pair,
+// with a work budget of 1000000. Disabled by default via the DISABLED_ prefix.
+TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
+  static const int kBitDepths[] = { 10, 12 };
+  const int kWorkBudget = 1000000;
+  for (const int bit_depth : kBitDepths) {
+    for (int s = 0; s < static_cast<int>(TX_SIZES_ALL); ++s) {
+      const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+      for (int t = 0; t < static_cast<int>(TX_TYPES); ++t) {
+        const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+        if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+        RunAV1InvTxfm2dTest(tx_type, tx_size, kWorkBudget, bit_depth);
+      }
+    }
+  }
+}
+
+// One instantiation per compiled-in ISA; each passes that ISA's
+// av1_highbd_inv_txfm_add_* function as the function under test.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_neon));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/av1_horz_only_frame_superres_test.cc b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 0000000000..e9cf02e202
--- /dev/null
+++ b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using libaom_test::ACMRandom;
+using std::make_tuple;
+using std::tuple;
+
+// Pixel storage for one horizontal-superres test case. The source and the
+// destination plane are each stored twice, back to back: the first copy is
+// used when `ref` is true, the second when it is false. Every copy carries
+// kVPad rows of padding above and below and kHPad columns on each side.
+template <typename Pixel>
+class TestImage {
+ public:
+  // w_src: source width, h: height, superres_denom: scaling denominator
+  // (9..16), x0: initial sub-pixel offset, bd: bit depth.
+  TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+      : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+        bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+    assert(9 <= superres_denom && superres_denom <= 16);
+    assert(SCALE_NUMERATOR == 8);
+    assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+    // The destination width is derived from the source width and denominator.
+    w_dst_ = w_src_;
+    av1_calculate_unscaled_superres_size(&w_dst_, nullptr, superres_denom);
+
+    src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+    dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+    // Room for two padded copies of each plane.
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  // Number of pixels in one padded copy of the plane.
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  int src_width() const { return w_src_; }
+  int dst_width() const { return w_dst_; }
+  int height() const { return h_; }
+  int x0() const { return x0_; }
+
+  // Returns the selected source copy; with borders == false the pointer is
+  // advanced past the top and left padding to the first image pixel.
+  const Pixel *GetSrcData(bool ref, bool borders) const {
+    const int copy_offset = ref ? 0 : src_block_size();
+    const Pixel *block = &src_data_[copy_offset];
+    if (borders) return block;
+    return block + kHPad + src_stride_ * kVPad;
+  }
+
+  // Destination counterpart of GetSrcData().
+  Pixel *GetDstData(bool ref, bool borders) {
+    const int copy_offset = ref ? 0 : dst_block_size();
+    Pixel *block = &dst_data_[copy_offset];
+    if (borders) return block;
+    return block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<Pixel> src_data_;
+  std::vector<Pixel> dst_data_;
+};
+
+// Fills num_pixels entries of data: with random bd-bit values when trash is
+// set, with zeros otherwise (no random numbers are drawn in that case).
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (trash) {
+    const Pixel mask = (1 << bd) - 1;
+    for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+  } else {
+    memset(data, 0, sizeof(*data) * num_pixels);
+  }
+}
+
+// Initializes the first padded copy at `data` (random w x h contents, borders
+// random or zero depending on trash_edges) and then duplicates it into the
+// second copy that follows it in memory.
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+  const int border_elts = stride * kVPad;
+  const int block_elts = stride * (h + 2 * kVPad);
+
+  // Top border
+  Pixel *row = data;
+  FillEdge(rnd, border_elts, bd, trash_edges, row);
+  row += border_elts;
+
+  // Image rows: left border, contents, right border
+  for (int r = 0; r < h; ++r, row += stride) {
+    Pixel *const contents = row + kHPad;
+    FillEdge(rnd, kHPad, bd, trash_edges, row);
+    for (int c = 0; c < w; ++c) contents[c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, contents + w);
+  }
+
+  // Bottom border; `row` now points at row kVPad + h
+  FillEdge(rnd, border_elts, bd, trash_edges, row);
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, sizeof(*data) * block_elts);
+}
+
+// Randomizes both planes: the source gets zeroed borders, the destination
+// gets random borders.
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+  const bool kZeroEdges = false;
+  const bool kTrashEdges = true;
+  PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, kZeroEdges, &src_data_[0]);
+  PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, kTrashEdges, &dst_data_[0]);
+}
+
+// Compares the two destination copies (reference output first, tested output
+// second). A whole-buffer memcmp is tried first; if it reports a difference,
+// the image area is walked pixel by pixel and each mismatch is reported.
+// Differences that lie only in the padding produce no failure.
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+  const int num_pixels = dst_block_size();
+  const Pixel *ref_dst = &dst_data_[0];
+  const Pixel *tst_dst = &dst_data_[num_pixels];
+
+  // If memcmp returns 0, there's nothing to do.
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise, iterate through the buffer looking for differences, *ignoring
+  // the edges*
+  const int stride = dst_stride_;
+  for (int r = kVPad; r < h_ + kVPad; ++r) {
+    // Columns are offset by the horizontal padding (kHPad), matching both the
+    // loop bound and the column printed in the message below.
+    for (int c = kHPad; c < w_dst_ + kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+          << ", superres_denom: " << superres_denom_ << ", height: " << h_
+          << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+          << ", x0: " << x0_;
+    }
+  }
+}
+
+// Shared driver for the low and high bit-depth horizontal-superres convolve
+// tests. Subclasses provide SetUp() (which must call SetBitDepth()) and
+// RunOne(), which runs either the reference or the tested function on the
+// current image_.
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+  ConvolveHorizRSTestBase() : bd_(0), image_(nullptr) {}
+  // Frees any image still owned, e.g. after an early return from a test.
+  ~ConvolveHorizRSTestBase() override { ReleaseImage(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  void SetUp() override = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetBitDepth(int bd) { bd_ = bd; }
+
+  // Runs kTestIters rounds; each round tries every superres denominator from
+  // 9 to 16 on an image with random dimensions and a random x0, and compares
+  // the reference and tested outputs.
+  void CorrectnessTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+        // Get a random height between 512 and 767
+        int height = rnd.Rand8() + 512;
+
+        // Get a random src width between 128 and 383
+        int width_src = rnd.Rand8() + 128;
+
+        // x0 is normally calculated by get_upscale_convolve_x0 in
+        // av1/common/resize.c. However, this test should work for
+        // any value of x0 between 0 and RS_SCALE_SUBPEL_MASK
+        // (inclusive), so we choose one at random.
+        int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+        image_ =
+            new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+        ASSERT_NE(image_, nullptr);
+
+        Prep(&rnd);
+        RunOne(true);
+        RunOne(false);
+        image_->Check();
+
+        // Also resets image_ so no dangling pointer survives the iteration.
+        ReleaseImage();
+      }
+    }
+  }
+
+  // Times kPerfIters runs of the reference and of the tested function on one
+  // fixed image and expects the tested function to be faster.
+  void SpeedTest() {
+    // Pick some specific parameters to test
+    int height = 767;
+    int width_src = 129;
+    int superres_denom = 13;
+    int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+    image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+    ASSERT_NE(image_, nullptr);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    // The image is no longer needed once both timings are taken.
+    ReleaseImage();
+
+    std::cout << "[ ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveHorizRSTest (Speed Test), SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+    image_->Initialize(rnd);
+  }
+
+  // Deletes the current image, if any, and clears the pointer.
+  void ReleaseImage() {
+    delete image_;
+    image_ = nullptr;
+  }
+
+  int bd_;
+  TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+// <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+// 8-bit variant: the parameter is the function under test and the reference
+// is av1_convolve_horiz_rs_c.
+class LowBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  ~LowBDConvolveHorizRSTest() override = default;
+
+  void SetUp() override {
+    tst_fun_ = GET_PARAM(0);
+    SetBitDepth(8);
+  }
+
+  // Runs the reference (ref == true) or the tested function on the matching
+  // copy of the current image, skipping the padding.
+  void RunOne(bool ref) override {
+    const int w_src = image_->src_width();
+    const int w_dst = image_->dst_width();
+    const int32_t step_qn = av1_get_upscale_convolve_step(w_src, w_dst);
+
+    const LowBDConvolveHorizRsFunc convolve =
+        ref ? &av1_convolve_horiz_rs_c : tst_fun_;
+    convolve(image_->GetSrcData(ref, false), image_->src_stride(),
+             image_->GetDstData(ref, false), image_->dst_stride(), w_dst,
+             image_->height(), &av1_resize_filter_normative[0][0],
+             image_->x0(), step_qn);
+  }
+
+ private:
+  LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+// Both tests delegate to the shared driver in ConvolveHorizRSTestBase; the
+// speed test is disabled by default through its DISABLED_ prefix.
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+// The C instantiation passes the reference function as the function under
+// test, i.e. it compares av1_convolve_horiz_rs_c with itself.
+INSTANTIATE_TEST_SUITE_P(C, LowBDConvolveHorizRSTest,
+ ::testing::Values(av1_convolve_horiz_rs_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest,
+ ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn,
+ int bd);
+
+// Test parameter list:
+// <tst_fun_, bd_>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+// High bit-depth variant: the parameters are the function under test and the
+// bit depth; the reference is av1_highbd_convolve_horiz_rs_c.
+class HighBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  ~HighBDConvolveHorizRSTest() override = default;
+
+  void SetUp() override {
+    tst_fun_ = GET_PARAM(0);
+    SetBitDepth(GET_PARAM(1));
+  }
+
+  // Runs the reference (ref == true) or the tested function on the matching
+  // copy of the current image, skipping the padding.
+  void RunOne(bool ref) override {
+    const int w_src = image_->src_width();
+    const int w_dst = image_->dst_width();
+    const int32_t step_qn = av1_get_upscale_convolve_step(w_src, w_dst);
+
+    const HighBDConvolveHorizRsFunc convolve =
+        ref ? &av1_highbd_convolve_horiz_rs_c : tst_fun_;
+    convolve(image_->GetSrcData(ref, false), image_->src_stride(),
+             image_->GetDstData(ref, false), image_->dst_stride(), w_dst,
+             image_->height(), &av1_resize_filter_normative[0][0],
+             image_->x0(), step_qn, bd_);
+  }
+
+ private:
+  HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+// Bit depths combined with every high bit-depth function under test.
+const int kBDs[] = { 8, 10, 12 };
+
+// Both tests delegate to the shared driver in ConvolveHorizRSTestBase; the
+// speed test is disabled by default through its DISABLED_ prefix.
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+// The C instantiation compares the reference function with itself.
+INSTANTIATE_TEST_SUITE_P(
+ C, HighBDConvolveHorizRSTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_c),
+ ::testing::ValuesIn(kBDs)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, HighBDConvolveHorizRSTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+ ::testing::ValuesIn(kBDs)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HighBDConvolveHorizRSTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_neon),
+ ::testing::ValuesIn(kBDs)));
+#endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/av1_inv_txfm1d_test.cc b/third_party/aom/test/av1_inv_txfm1d_test.cc
new file mode 100644
index 0000000000..e70b22a35a
--- /dev/null
+++ b/third_party/aom/test/av1_inv_txfm1d_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+
+typedef TX_SIZE TxSize;
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+
+namespace {
+// Number of 1D transform kinds per size in the tables below (column 0 holds
+// the DCT functions, column 1 the ADST functions).
+const int txfm_type_num = 2;
+// 1D transform lengths; row i of the function tables has length
+// txfm_size_ls[i].
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+// Forward 1D transforms. A nullptr entry means no function is listed for
+// that <size, kind> pair.
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+ { av1_fdct4, av1_fadst4 }, { av1_fdct8, av1_fadst8 },
+ { av1_fdct16, av1_fadst16 }, { av1_fdct32, nullptr },
+ { av1_fdct64, nullptr },
+};
+
+// Inverse 1D transforms, laid out like fwd_txfm_func_ls.
+const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
+ { av1_idct4, av1_iadst4 }, { av1_idct8, av1_iadst8 },
+ { av1_idct16, av1_iadst16 }, { av1_idct32, nullptr },
+ { av1_idct64, nullptr },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+// cos_bit and range_bit are passed unchanged to every transform call in this
+// file; range_bit has one entry per stage.
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+// Integer wrapper around libaom_test::reference_idct_1d: converts `in` to
+// double, runs the reference inverse DCT and rounds each result into `out`.
+// `size` must not exceed 64, the capacity of the local buffers.
+void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
+ double input[64];
+ for (int i = 0; i < size; ++i) input[i] = in[i];
+
+ double output[64];
+ libaom_test::reference_idct_1d(input, output, size);
+
+ for (int i = 0; i < size; ++i) {
+ // A value outside the int32_t range records a fatal failure and returns
+ // from this helper, leaving the remaining entries of `out` unwritten.
+ ASSERT_GE(output[i], INT32_MIN);
+ ASSERT_LE(output[i], INT32_MAX);
+ out[i] = static_cast<int32_t>(round(output[i]));
+ }
+}
+
+// Fills dst[0..len) with signed 16-bit test values. Each entry first draws
+// Rand8() % 10: a non-zero draw yields a value spread over the whole 16-bit
+// range, a zero draw yields one of the two extremes (minVal or maxVal).
+void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
+  const int bits = 16;
+  const int range = 1 << bits;
+  const int minVal = -(1 << (bits - 1));
+  const int maxVal = (1 << (bits - 1)) - 1;
+  for (int i = 0; i < len; ++i) {
+    const bool pick_extreme = (rnd->Rand8() % 10) == 0;
+    if (pick_extreme) {
+      dst[i] = (rnd->Rand8() % 2) ? minVal : maxVal;
+    } else {
+      dst[i] = minVal + rnd->Rand16() % range;
+    }
+  }
+}
+
+// Compares the fixed-point inverse DCT of a randomly chosen size with the
+// floating-point reference on random input, allowing a per-size absolute
+// error of max_error[tx_size].
+TEST(av1_inv_txfm1d, InvAccuracyCheck) {
+  const int max_error[] = { 6, 10, 19, 31, 40 };
+  ASSERT_EQ(NELEMENTS(max_error), TX_SIZES);
+  ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
+
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int kNumBlocks = 20000;
+  for (int blk = 0; blk < kNumBlocks; ++blk) {
+    // choose a random transform to test
+    const TxSize tx_size = static_cast<TxSize>(rnd.Rand8() % TX_SIZES);
+    const int len = txfm_size_ls[tx_size];
+    const TxfmFunc idct = inv_txfm_func_ls[tx_size][0];
+
+    int32_t input[64];
+    random_matrix(input, len, &rnd);
+
+    // 64x64 transform assumes last 32 values are zero.
+    // (Applied for every size; shorter transforms never read these entries.)
+    memset(input + 32, 0, 32 * sizeof(input[0]));
+
+    int32_t ref_output[64] = { 0 };
+    reference_idct_1d_int(input, ref_output, len);
+
+    int32_t output[64] = { 0 };
+    idct(input, output, cos_bit, range_bit);
+
+    for (int ni = 0; ni < len; ++ni) {
+      EXPECT_LE(abs(output[ni] - ref_output[ni]), max_error[tx_size])
+          << "tx_size = " << tx_size << ", ni = " << ni
+          << ", output[ni] = " << output[ni]
+          << ", ref_output[ni] = " << ref_output[ni];
+    }
+  }
+}
+
+// Returns the index of the highest set bit of x (get_max_bit(8) == 3), or -1
+// when x has no such bit, i.e. for x <= 0.
+static INLINE int get_max_bit(int x) {
+  // Right-shifting a negative int typically keeps the sign bit, so the loop
+  // below would never reach zero; reject non-positive input up front. The
+  // result for 0 is unchanged (-1).
+  if (x <= 0) return -1;
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+// Sanity check of the helper: 8 == 1 << 3, so the highest set bit is bit 3.
+TEST(av1_inv_txfm1d, get_max_bit) {
+ int max_bit = get_max_bit(8);
+ EXPECT_EQ(max_bit, 3);
+}
+
+// For every <size, kind> pair that has a forward function, runs random input
+// through the forward and then the inverse 1D transform and checks that the
+// result, scaled down by get_max_bit(txfm_size) - 1 bits, is within
+// max_error of the input.
+TEST(av1_inv_txfm1d, round_trip) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 5000;
+
+  for (int si = 0; si < NELEMENTS(fwd_txfm_func_ls); ++si) {
+    const int txfm_size = txfm_size_ls[si];
+    // Number of bits by which the round trip scales the signal up.
+    const int shift = get_max_bit(txfm_size) - 1;
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      const TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+      const TxfmFunc inv_txfm_func = inv_txfm_func_ls[si][ti];
+      const int max_error = 2;
+
+      // Skip the table slots that have no transform.
+      if (!fwd_txfm_func) continue;
+
+      for (int blk = 0; blk < count_test_block; ++blk) {
+        int32_t input[64];
+        int32_t output[64];
+        int32_t round_trip_output[64];
+
+        ASSERT_LE(txfm_size, NELEMENTS(input));
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+        }
+
+        fwd_txfm_func(input, output, cos_bit, range_bit);
+        inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          const int node_err =
+              abs(input[ni] - round_shift(round_trip_output[ni], shift));
+          EXPECT_LE(node_err, max_error);
+        }
+      }
+    }
+  }
+}
+
+} // namespace
diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc
new file mode 100644
index 0000000000..35a87a43b8
--- /dev/null
+++ b/third_party/aom/test/av1_inv_txfm2d_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+using libaom_test::InvTxfm2dFunc;
+using libaom_test::LbdInvTxfm2dFunc;
+using libaom_test::tx_type_name;
+
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+
+using std::vector;
+
+typedef TX_TYPE TxType;
+typedef TX_SIZE TxSize;
+
+namespace {
+
+// AV1InvTxfm2dParam argument list:
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
+
+// Parameterized fixture for the 2-D inverse transform round-trip test.
+// Parameters (see AV1InvTxfm2dParam): transform type, transform size, maximum
+// allowed per-sample error, and maximum allowed average absolute error.
+class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
+ public:
+ // Caches the four test parameters in member fields.
+ void SetUp() override {
+ tx_type_ = GET_PARAM(0);
+ tx_size_ = GET_PARAM(1);
+ max_error_ = GET_PARAM(2);
+ max_avg_error_ = GET_PARAM(3);
+ }
+
+ // Runs `count` blocks through forward + inverse transform and checks both
+ // the worst per-sample error (max_error_) and the mean of the per-block
+ // average absolute errors (max_avg_error_).
+ void RunRoundtripCheck() {
+ int tx_w = tx_size_wide[tx_size_];
+ int tx_h = tx_size_high[tx_size_];
+ int txfm2d_size = tx_w * tx_h;
+ const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
+ const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
+ double avg_abs_error = 0;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ const int count = 500;
+
+ for (int ci = 0; ci < count; ci++) {
+ DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(input));
+
+ // The first block uses the largest input value everywhere; all later
+ // blocks are random.
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ if (ci == 0) {
+ int extreme_input = input_base - 1;
+ input[ni] = extreme_input; // extreme case
+ } else {
+ input[ni] = rnd.Rand16() % input_base;
+ }
+ }
+
+ DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(expected));
+ if (TxfmUsesApproximation()) {
+ // Compare reference forward HT + inverse HT vs forward HT + inverse HT.
+ double ref_input[64 * 64];
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ ref_input[ni] = input[ni];
+ }
+ double ref_coeffs[64 * 64] = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
+ ASSERT_EQ(tx_type_, static_cast<TxType>(DCT_DCT));
+ libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
+ tx_size_);
+ DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
+ }
+ inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
+ } else {
+ // Compare original input vs forward HT + inverse HT.
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ expected[ni] = input[ni];
+ }
+ }
+
+ DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
+ fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
+
+ DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(actual));
+ inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
+
+ double actual_max_error = 0;
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ const double this_error = abs(expected[ni] - actual[ni]);
+ actual_max_error = AOMMAX(actual_max_error, this_error);
+ }
+ EXPECT_GE(max_error_, actual_max_error)
+ << " tx_w: " << tx_w << " tx_h " << tx_h
+ << " tx_type: " << tx_type_name[tx_type_];
+ if (actual_max_error > max_error_) { // exit early.
+ break;
+ }
+ avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
+ expected, actual, txfm2d_size);
+ }
+
+ // Note: the sum is always divided by `count`, even if the loop above
+ // exited early after fewer blocks.
+ avg_abs_error /= count;
+ EXPECT_GE(max_avg_error_, avg_abs_error)
+ << " tx_w: " << tx_w << " tx_h " << tx_h
+ << " tx_type: " << tx_type_name[tx_type_];
+ }
+
+ private:
+ // True when either dimension of the transform is 64; in that case the
+ // expected output is built from the reference forward transform instead of
+ // the raw input.
+ bool TxfmUsesApproximation() {
+ if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
+ return true;
+ }
+ return false;
+ }
+
+ int max_error_;
+ double max_avg_error_;
+ TxType tx_type_;
+ TxSize tx_size_;
+};
+
+static int max_error_ls[TX_SIZES_ALL] = {
+ 2, // 4x4 transform
+ 2, // 8x8 transform
+ 2, // 16x16 transform
+ 4, // 32x32 transform
+ 3, // 64x64 transform
+ 2, // 4x8 transform
+ 2, // 8x4 transform
+ 2, // 8x16 transform
+ 2, // 16x8 transform
+ 3, // 16x32 transform
+ 3, // 32x16 transform
+ 5, // 32x64 transform
+ 5, // 64x32 transform
+ 2, // 4x16 transform
+ 2, // 16x4 transform
+ 2, // 8x32 transform
+ 2, // 32x8 transform
+ 3, // 16x64 transform
+ 3, // 64x16 transform
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+ 0.002, // 4x4 transform
+ 0.05, // 8x8 transform
+ 0.07, // 16x16 transform
+ 0.4, // 32x32 transform
+ 0.3, // 64x64 transform
+ 0.02, // 4x8 transform
+ 0.02, // 8x4 transform
+ 0.04, // 8x16 transform
+ 0.07, // 16x8 transform
+ 0.4, // 16x32 transform
+ 0.5, // 32x16 transform
+ 0.38, // 32x64 transform
+ 0.39, // 64x32 transform
+ 0.2, // 4x16 transform
+ 0.2, // 16x4 transform
+ 0.2, // 8x32 transform
+ 0.2, // 32x8 transform
+ 0.38, // 16x64 transform
+ 0.38, // 64x16 transform
+};
+
+// Builds the parameter list for AV1InvTxfm2d: one entry per valid
+// (tx_size, tx_type) pair, carrying the error bounds from max_error_ls and
+// avg_error_ls for that size.
+// NOTE(review): the outer loop stops at TX_SIZES although both error tables
+// hold TX_SIZES_ALL entries, so the table entries beyond TX_SIZES are never
+// used here. Presumably intentional -- confirm.
+vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
+ vector<AV1InvTxfm2dParam> param_list;
+ for (int s = 0; s < TX_SIZES; ++s) {
+ const int max_error = max_error_ls[s];
+ const double avg_error = avg_error_ls[s];
+ for (int t = 0; t < TX_TYPES; ++t) {
+ const TxType tx_type = static_cast<TxType>(t);
+ const TxSize tx_size = static_cast<TxSize>(s);
+ if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+ param_list.push_back(
+ AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+ }
+ }
+ }
+ return param_list;
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AV1InvTxfm2d,
+ ::testing::ValuesIn(GetInvTxfm2dParamList()));
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+// For every bit depth in bd_arr and every valid (tx_size, tx_type) pair:
+// fetch the inverse transform configuration, generate the per-stage ranges
+// for the column and row passes, and run txfm_stage_range_check on both.
+TEST(AV1InvTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    const int bd = libaom_test::bd_arr[bd_idx];
+    const int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    const int8_t high_range = libaom_test::high_range_arr[bd_idx];
+
+    for (int size_idx = 0; size_idx < TX_SIZES_ALL; ++size_idx) {
+      const TxSize tx_size = static_cast<TxSize>(size_idx);
+
+      for (int type_idx = 0; type_idx < TX_TYPES; ++type_idx) {
+        const TxType tx_type = static_cast<TxType>(type_idx);
+        if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
+
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
+                                tx_size, bd);
+
+        // Column pass first, then row pass, each against its own cos_bit.
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
+
+typedef std::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+// Parameterized fixture whose single parameter is the low-bitdepth inverse
+// transform function under test (stored in target_func_).
+class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
+ public:
+ void SetUp() override { target_func_ = GET_PARAM(0); }
+ // Compares target_func_ with the reference inverse transform for one
+ // (tx_type, tx_size) pair; see the definition for the meaning of run_times
+ // and gt_int16.
+ void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
+ int gt_int16 = 0);
+
+ private:
+ LbdInvTxfm2dFunc target_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1LbdInvTxfm2d);
+
+// Runs the reference inverse transform (libaom_test::inv_txfm_func_ls) and
+// target_func_ on the same coefficients and asserts that the outputs match
+// sample-for-sample after the reference output is truncated to uint8_t.
+//
+// tx_type, tx_size: transform under test.
+// run_times: iteration budget; it is divided by the block area and clamped to
+// at least 1. When the result is 1, many eob values are exercised;
+// when it exceeds 10, timing for both functions is printed.
+// gt_int16: when non-zero, the last retained coefficient is overwritten with
+// INT16_MAX * 100 / 141 before the inverse transforms run.
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
+ int run_times, int gt_int16) {
+ FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
+ InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
+ // Nothing to compare if any of the three functions is missing.
+ if (fwd_func_ == nullptr || ref_func_ == nullptr || target_func_ == nullptr) {
+ return;
+ }
+ const int bd = 8;
+ const int BLK_WIDTH = 64;
+ const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+ DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+ int stride = BLK_WIDTH;
+ int rows = tx_size_high[tx_size];
+ int cols = tx_size_wide[tx_size];
+ // The number of coefficients considered is capped at 32 per dimension.
+ const int rows_nonezero = AOMMIN(32, rows);
+ const int cols_nonezero = AOMMIN(32, cols);
+ run_times /= (rows * cols);
+ run_times = AOMMAX(1, run_times);
+ const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int16_t eobmax = rows_nonezero * cols_nonezero;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ // Correctness mode (run_times == 1) sweeps eob from 1 up to eobmax and then
+ // runs 500 further iterations at eobmax; otherwise a single pass is made.
+ int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+
+ for (int cnt = 0; cnt < randTimes; ++cnt) {
+ const int16_t max_in = (1 << (bd)) - 1;
+ // First iteration uses fixed extreme values, later ones random data.
+ // NOTE(review): input is indexed with `cols` as the row pitch here but is
+ // handed to fwd_func_ with `stride` (64) below -- confirm this is intended.
+ for (int r = 0; r < BLK_WIDTH; ++r) {
+ for (int c = 0; c < BLK_WIDTH; ++c) {
+ input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
+ output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
+ ref_output[r * stride + c] = output[r * stride + c];
+ }
+ }
+ fwd_func_(input, inv_input, stride, tx_type, bd);
+
+ // produce eob input by setting high freq coeffs to zero
+ const int eob = AOMMIN(cnt + 1, eobmax);
+ for (int i = eob; i < eobmax; i++) {
+ inv_input[scan[i]] = 0;
+ }
+ if (gt_int16) {
+ inv_input[scan[eob - 1]] = ((int32_t)INT16_MAX * 100 / 141);
+ }
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ ref_func_(inv_input, ref_output, stride, tx_type, bd);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(inv_input, output, stride, tx_type, tx_size, eob);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ // NOTE(review): time1/time2 are the raw totals returned by
+ // aom_usec_timer_elapsed, yet the label below says "ns" -- confirm units.
+ if (run_times > 10) {
+ printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
+ time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+ if (ref_value != output[r * stride + c]) {
+ printf(" ");
+ }
+ ASSERT_EQ(ref_value, output[r * stride + c])
+ << "[" << r << "," << c << "] " << cnt << " tx_size: " << cols
+ << "x" << rows << " tx_type: " << tx_type_name[tx_type] << " eob "
+ << eob;
+ }
+ }
+ }
+}
+
+// Calls RunAV1InvTxfm2dTest with run_times = 1 for every valid
+// (tx_size, tx_type) combination.
+TEST_P(AV1LbdInvTxfm2d, match) {
+  for (int size_idx = 0; size_idx < (int)(TX_SIZES_ALL); ++size_idx) {
+    const TxSize tx_size = static_cast<TxSize>(size_idx);
+    for (int type_idx = 0; type_idx < (int)TX_TYPES; ++type_idx) {
+      const TxType tx_type = static_cast<TxType>(type_idx);
+      if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+      RunAV1InvTxfm2dTest(tx_type, tx_size, 1);
+    }
+  }
+}
+
+// Same comparison as `match`, but restricted to the transform types listed
+// below and with gt_int16 = 1, which makes RunAV1InvTxfm2dTest overwrite one
+// coefficient with a large value.
+TEST_P(AV1LbdInvTxfm2d, gt_int16) {
+  static const TxType kTypes[] = { DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX,
+                                   V_DCT,   H_DCT,    H_ADST,       H_FLIPADST };
+  for (int size_idx = 0; size_idx < (int)(TX_SIZES_ALL); ++size_idx) {
+    const TxSize tx_size = static_cast<TxSize>(size_idx);
+    for (const TxType tx_type : kTypes) {
+      if (!libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) continue;
+      RunAV1InvTxfm2dTest(tx_type, tx_size, 1, 1);
+    }
+  }
+}
+
+// Timing run (disabled by default): calls RunAV1InvTxfm2dTest with a large
+// run_times budget so that it prints reference vs. target timings.
+// Note the size loop starts at index 1, so the first transform size is not
+// timed.
+TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
+ for (int j = 1; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+ static_cast<TxType>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j),
+ 10000000);
+ }
+ }
+ }
+}
+
+#if HAVE_SSSE3
+extern "C" void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TxType tx_type, TxSize tx_size,
+ int eob);
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TxType tx_type, TxSize tx_size,
+ int eob);
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/av1_k_means_test.cc b/third_party/aom/test/av1_k_means_test.cc
new file mode 100644
index 0000000000..7e66a8e01d
--- /dev/null
+++ b/third_party/aom/test/av1_k_means_test.cc
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "av1/encoder/palette.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1Kmeans {
+typedef void (*av1_calc_indices_dim1_func)(const int16_t *data,
+ const int16_t *centroids,
+ uint8_t *indices,
+ int64_t *total_dist, int n, int k);
+typedef void (*av1_calc_indices_dim2_func)(const int16_t *data,
+ const int16_t *centroids,
+ uint8_t *indices,
+ int64_t *total_dist, int n, int k);
+
+typedef std::tuple<av1_calc_indices_dim1_func, BLOCK_SIZE>
+ av1_calc_indices_dim1Param;
+
+typedef std::tuple<av1_calc_indices_dim2_func, BLOCK_SIZE>
+ av1_calc_indices_dim2Param;
+
+// Fixture for the 1-dimensional av1_calc_indices implementations.
+// Parameters: the function under test and the block size that determines how
+// many samples are processed.
+class AV1KmeansTest1
+ : public ::testing::TestWithParam<av1_calc_indices_dim1Param> {
+ public:
+ ~AV1KmeansTest1() override;
+ void SetUp() override;
+
+ protected:
+ // Runs the C version and test_impl and asserts that both the total distance
+ // and the per-sample indices agree.
+ void RunCheckOutput(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+ int centroids);
+ // Times the C version against test_impl and prints the result.
+ void RunSpeedTest(av1_calc_indices_dim1_func test_impl, BLOCK_SIZE bsize,
+ int centroids);
+ // Returns true when the first n entries of indices1_ and indices2_ are
+ // equal; prints the first mismatching position and values otherwise.
+ bool CheckResult(int n) {
+ for (int idx = 0; idx < n; ++idx) {
+ if (indices1_[idx] != indices2_[idx]) {
+ printf("%d ", idx);
+ printf("%d != %d ", indices1_[idx], indices2_[idx]);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ libaom_test::ACMRandom rnd_;
+ int16_t data_[4096];
+ int16_t centroids_[8];
+ // indices1_ receives the C output, indices2_ the output of the tested
+ // implementation (see RunCheckOutput).
+ uint8_t indices1_[4096];
+ uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest1);
+
+AV1KmeansTest1::~AV1KmeansTest1() = default;
+
+// Re-seeds the RNG deterministically, then fills the sample buffer followed
+// by the centroid buffer with random 8-bit values shifted left by 4.
+void AV1KmeansTest1::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  for (int16_t &sample : data_) {
+    sample = (int)rnd_.Rand8() << 4;
+  }
+  for (int16_t &centroid : centroids_) {
+    centroid = (int)rnd_.Rand8() << 4;
+  }
+}
+
+// Runs av1_calc_indices_dim1_c and test_impl on the first n samples of the
+// fixture data with k centroids (n is the pixel count of bsize) and asserts
+// that the total distances and the index arrays are identical.
+void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
+  const int n = block_size_wide[bsize] * block_size_high[bsize];
+  int64_t total_dist_dim1;
+  int64_t total_dist_impl;
+
+  // Reference output goes to indices1_, tested output to indices2_.
+  av1_calc_indices_dim1_c(data_, centroids_, indices1_, &total_dist_dim1, n, k);
+  test_impl(data_, centroids_, indices2_, &total_dist_impl, n, k);
+
+  ASSERT_EQ(total_dist_dim1, total_dist_impl);
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+// Times av1_calc_indices_dim1_c and test_impl over num_loops calls each and
+// prints the per-call time of both plus their ratio (C time / test time).
+void AV1KmeansTest1::RunSpeedTest(av1_calc_indices_dim1_func test_impl,
+ BLOCK_SIZE bsize, int k) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int n = w * h;
+ // Loop count is scaled inversely with the number of samples per call.
+ const int num_loops = 1000000000 / n;
+
+ // Index 0 is the C reference, index 1 the implementation under test.
+ av1_calc_indices_dim1_func funcs[2] = { av1_calc_indices_dim1_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ av1_calc_indices_dim1_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(data_, centroids_, indices1_, /*total_dist=*/nullptr, n, k);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ // Scale the elapsed total by 1000 and divide by the loop count
+ // (presumably microseconds -> nanoseconds per call; confirm timer units).
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("av1_calc_indices_dim1 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+ elapsed_time[0], elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// Checks the tested implementation against the C version for every centroid
+// count from 2 to 8 inclusive.
+TEST_P(AV1KmeansTest1, CheckOutput) {
+  // centroids = 2..8
+  for (int k = 2; k <= 8; ++k) {
+    RunCheckOutput(GET_PARAM(0), GET_PARAM(1), k);
+  }
+}
+
+// Timing run (disabled by default) for every centroid count from 2 to 8.
+TEST_P(AV1KmeansTest1, DISABLED_Speed) {
+  for (int k = 2; k <= 8; ++k) {
+    RunSpeedTest(GET_PARAM(0), GET_PARAM(1), k);
+  }
+}
+
+// Fixture for the 2-dimensional av1_calc_indices implementations.
+// Parameters: the function under test and the block size that determines how
+// many samples are processed. Data and centroid buffers hold two values per
+// entry.
+class AV1KmeansTest2
+    : public ::testing::TestWithParam<av1_calc_indices_dim2Param> {
+ public:
+  ~AV1KmeansTest2() override;
+  void SetUp() override;
+
+ protected:
+  void RunCheckOutput(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                      int centroids);
+  void RunSpeedTest(av1_calc_indices_dim2_func test_impl, BLOCK_SIZE bsize,
+                    int centroids);
+
+  // Compares the first n entries of indices1_ and indices2_. Every mismatch
+  // is printed (position, then both values); returns true only when there
+  // were none.
+  bool CheckResult(int n) {
+    bool all_match = true;
+    for (int idx = 0; idx < n; ++idx) {
+      if (indices1_[idx] == indices2_[idx]) continue;
+      printf("%d ", idx);
+      printf("%d != %d ", indices1_[idx], indices2_[idx]);
+      all_match = false;
+    }
+    return all_match;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  int16_t data_[4096 * 2];
+  int16_t centroids_[8 * 2];
+  uint8_t indices1_[4096];
+  uint8_t indices2_[4096];
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1KmeansTest2);
+
+AV1KmeansTest2::~AV1KmeansTest2() = default;
+
+// Re-seeds the RNG deterministically, then fills the sample buffer followed
+// by the centroid buffer with random 8-bit values.
+void AV1KmeansTest2::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  for (int16_t &sample : data_) {
+    sample = (int)rnd_.Rand8();
+  }
+  for (int16_t &centroid : centroids_) {
+    centroid = (int)rnd_.Rand8();
+  }
+}
+
+// Runs av1_calc_indices_dim2_c and test_impl on n samples of the fixture data
+// with k centroids (n is the pixel count of bsize) and asserts that the total
+// distances and the index arrays are identical.
+void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
+                                    BLOCK_SIZE bsize, int k) {
+  const int n = block_size_wide[bsize] * block_size_high[bsize];
+  int64_t total_dist_dim2;
+  int64_t total_dist_impl;
+
+  // Reference output goes to indices1_, tested output to indices2_.
+  av1_calc_indices_dim2_c(data_, centroids_, indices1_, &total_dist_dim2, n, k);
+  test_impl(data_, centroids_, indices2_, &total_dist_impl, n, k);
+
+  ASSERT_EQ(total_dist_dim2, total_dist_impl);
+  ASSERT_EQ(CheckResult(n), true)
+      << " block " << bsize << " index " << n << " Centroids " << k;
+}
+
+// Times av1_calc_indices_dim2_c and test_impl over num_loops calls each and
+// prints the per-call time of both plus their ratio (C time / test time).
+void AV1KmeansTest2::RunSpeedTest(av1_calc_indices_dim2_func test_impl,
+ BLOCK_SIZE bsize, int k) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int n = w * h;
+ // Loop count is scaled inversely with the number of samples per call.
+ const int num_loops = 1000000000 / n;
+
+ // Index 0 is the C reference, index 1 the implementation under test.
+ av1_calc_indices_dim2_func funcs[2] = { av1_calc_indices_dim2_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ av1_calc_indices_dim2_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(data_, centroids_, indices1_, /*total_dist=*/nullptr, n, k);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ // Scale the elapsed total by 1000 and divide by the loop count
+ // (presumably microseconds -> nanoseconds per call; confirm timer units).
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("av1_calc_indices_dim2 indices= %d centroids=%d: %7.2f/%7.2fns", n, k,
+ elapsed_time[0], elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// Checks the tested implementation against the C version for every centroid
+// count from 2 to 8 inclusive.
+TEST_P(AV1KmeansTest2, CheckOutput) {
+  // centroids = 2..8
+  for (int k = 2; k <= 8; ++k) {
+    RunCheckOutput(GET_PARAM(0), GET_PARAM(1), k);
+  }
+}
+
+// Timing run (disabled by default) for every centroid count from 2 to 8.
+TEST_P(AV1KmeansTest2, DISABLED_Speed) {
+  for (int k = 2; k <= 8; ++k) {
+    RunSpeedTest(GET_PARAM(0), GET_PARAM(1), k);
+  }
+}
+
+#if HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_8X8, BLOCK_8X16, BLOCK_8X32,
+ BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
+ BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+ BLOCK_16X64, BLOCK_64X16 };
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1KmeansTest1,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_sse2),
+ ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1KmeansTest2,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_sse2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1KmeansTest1,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1KmeansTest2,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1KmeansTest1,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim1_neon),
+ ::testing::ValuesIn(kValidBlockSize)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1KmeansTest2,
+ ::testing::Combine(::testing::Values(&av1_calc_indices_dim2_neon),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+} // namespace AV1Kmeans
diff --git a/third_party/aom/test/av1_key_value_api_test.cc b/third_party/aom/test/av1_key_value_api_test.cc
new file mode 100644
index 0000000000..a5734f6beb
--- /dev/null
+++ b/third_party/aom/test/av1_key_value_api_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstring>
+#include <tuple>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+typedef std::tuple<const char *, const char *> KeyValParam;
+
+// Base fixture for the key/value option API tests. Depending on the build
+// configuration it owns an initialized AV1 encoder context, an initialized
+// AV1 decoder context, or both, and destroys them again in TearDown().
+class BaseKeyValAPI : public testing::Test {
+ public:
+ void SetUp() override {
+#if CONFIG_AV1_ENCODER
+ aom_codec_iface_t *iface_cx = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t enc_cfg;
+ // Usage value 1 is selected for realtime-only builds, 0 otherwise.
+#if CONFIG_REALTIME_ONLY
+ const int usage = 1;
+#else
+ const int usage = 0;
+#endif
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface_cx, &enc_cfg, usage));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, usage));
+#endif
+#if CONFIG_AV1_DECODER
+ aom_codec_iface_t *iface_dx = aom_codec_av1_dx();
+ aom_codec_dec_cfg_t dec_cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec_, iface_dx, &dec_cfg, 0));
+#endif
+ }
+
+ void TearDown() override {
+#if CONFIG_AV1_ENCODER
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc_));
+#endif
+#if CONFIG_AV1_DECODER
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec_));
+#endif
+ }
+
+ protected:
+#if CONFIG_AV1_ENCODER
+ aom_codec_ctx_t enc_; // encoder context, valid between SetUp and TearDown
+#endif
+#if CONFIG_AV1_DECODER
+ aom_codec_ctx_t dec_; // decoder context, valid between SetUp and TearDown
+#endif
+};
+
+// Tests on encoder options.
+// Need to add ones for the decoder in the future if it is also supported in the
+// key & value API.
+#if CONFIG_AV1_ENCODER
+class EncValidTest : public BaseKeyValAPI,
+ public testing::WithParamInterface<KeyValParam> {};
+class EncInvalidTest : public BaseKeyValAPI,
+ public testing::WithParamInterface<KeyValParam> {};
+
+TEST_P(EncValidTest, Valid) {
+ const char *key = std::get<0>(GetParam());
+ const char *val = std::get<1>(GetParam());
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_set_option(&enc_, key, val));
+}
+
+// aom_codec_set_option must report AOM_CODEC_INVALID_PARAM when any one of
+// its three arguments (context, key, value) is null.
+TEST_P(EncInvalidTest, NullArg) {
+ const char *key = std::get<0>(GetParam());
+ const char *val = std::get<1>(GetParam());
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(nullptr, key, val));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, nullptr, val));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, key, nullptr));
+}
+
+// An invalid key/value pair must be rejected with AOM_CODEC_INVALID_PARAM and
+// leave a non-empty error detail string on the encoder context.
+TEST_P(EncInvalidTest, InvalidParam) {
+ const char *key = std::get<0>(GetParam());
+ const char *val = std::get<1>(GetParam());
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_set_option(&enc_, key, val));
+ // ASSERT (not EXPECT) so that strlen below is never called on a null pointer.
+ ASSERT_NE(aom_codec_error_detail(&enc_), nullptr);
+ EXPECT_GT(strlen(aom_codec_error_detail(&enc_)), 0u);
+}
+
+// There are no tests for the ratio / list types for now, since the API does
+// not support any parameters of these types.
+// The string type typically involves reading a path/file, which introduces
+// potential failures.
+const KeyValParam enc_valid_params[] = {
+ std::make_tuple("auto-intra-tools-off", "1"), // uint
+ std::make_tuple("min-gf-interval", "10"), // uint
+ std::make_tuple("min-partition-size", "4"), // int
+ std::make_tuple("tune", "psnr"), // enum
+};
+
+const KeyValParam enc_invalid_params[] = {
+ // no match
+ std::make_tuple("a-b-c", "10"),
+ // uint
+ std::make_tuple("min-gf-interval", "-1"),
+ std::make_tuple("min-gf-interval", "1.1"),
+ std::make_tuple("min-gf-interval", "abc"),
+ // int
+ std::make_tuple("min-partition-size", "1.1"),
+ std::make_tuple("min-partition-size", "abc"),
+ // enum
+ std::make_tuple("tune", "PsnR1"),
+ // out of range
+ std::make_tuple("cq-level", "1000"),
+};
+
+INSTANTIATE_TEST_SUITE_P(KeyValAPI, EncValidTest,
+ testing::ValuesIn(enc_valid_params));
+
+INSTANTIATE_TEST_SUITE_P(KeyValAPI, EncInvalidTest,
+ testing::ValuesIn(enc_invalid_params));
+#endif // CONFIG_AV1_ENCODER
+
+} // namespace
diff --git a/third_party/aom/test/av1_nn_predict_test.cc b/third_party/aom/test/av1_nn_predict_test.cc
new file mode 100644
index 0000000000..4201ea6ce6
--- /dev/null
+++ b/third_party/aom/test/av1_nn_predict_test.cc
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+
+namespace {
+typedef void (*NnPredict_Func)(const float *const input_nodes,
+ const NN_CONFIG *const nn_config,
+ int reduce_prec, float *const output);
+
+typedef std::tuple<const NnPredict_Func> NnPredictTestParam;
+
+const float epsilon = 1e-3f; // Error threshold for functional equivalence
+
+// Fixture comparing an av1_nn_predict implementation (the test parameter)
+// against av1_nn_predict_c. It owns heap buffers large enough for the weights
+// and biases of NN_MAX_HIDDEN_LAYERS + 1 layers of maximum size.
+class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
+ public:
+ void SetUp() override {
+ const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+ // Allocate two massive buffers on the heap for edge weights and node bias
+ // Then set-up the double-dimension arrays pointing into the big buffers
+ weights_buf = (float *)aom_malloc(MAX_NODES2 * (NN_MAX_HIDDEN_LAYERS + 1) *
+ sizeof(*weights_buf));
+ bias_buf =
+ (float *)aom_malloc(NN_MAX_NODES_PER_LAYER *
+ (NN_MAX_HIDDEN_LAYERS + 1) * sizeof(*bias_buf));
+ ASSERT_NE(weights_buf, nullptr);
+ ASSERT_NE(bias_buf, nullptr);
+ // Layer i gets its own fixed-size slice of each buffer.
+ for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+ weights[i] = &weights_buf[i * MAX_NODES2];
+ bias[i] = &bias_buf[i * NN_MAX_NODES_PER_LAYER];
+ }
+ target_func_ = GET_PARAM(0);
+ }
+ void TearDown() override {
+ aom_free(weights_buf);
+ aom_free(bias_buf);
+ }
+ // Correctness check for a single network shape.
+ void RunNnPredictTest(const NN_CONFIG *const shape);
+ // Timing comparison for a single network shape.
+ void RunNnPredictSpeedTest(const NN_CONFIG *const shape, const int run_times);
+ // Apply the per-shape routines above to each of num_shapes shapes.
+ void RunNnPredictTest_all(const NN_CONFIG *const shapes,
+ const int num_shapes);
+ void RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+ const int num_shapes, const int run_times);
+
+ private:
+ NnPredict_Func target_func_;
+ libaom_test::ACMRandom rng_;
+ // Per-layer pointers into weights_buf / bias_buf (set up in SetUp).
+ float *weights[NN_MAX_HIDDEN_LAYERS + 1] = {};
+ float *bias[NN_MAX_HIDDEN_LAYERS + 1] = {};
+ float *weights_buf = nullptr, *bias_buf = nullptr;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
+
+// Correctness check for one network shape: for 10000 iterations (or until a
+// fatal failure) random inputs, biases and weights are generated, both
+// av1_nn_predict_c and target_func_ are run, and their outputs are compared
+// with tolerance `epsilon`.
+void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
+ float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+ float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+ float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+ NN_CONFIG nn_config;
+ memcpy(&nn_config, shape, sizeof(nn_config));
+
+ // Build a human-readable "<inputs>x<hidden>...x<outputs>" string that is
+ // attached to failure messages.
+ char shape_str[32] = { 0 };
+ snprintf(shape_str, sizeof(shape_str), "%d", shape->num_inputs);
+ for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+ snprintf(&shape_str[strlen(shape_str)],
+ sizeof(shape_str) - strlen(shape_str), "x%d",
+ shape->num_hidden_nodes[layer]);
+ snprintf(&shape_str[strlen(shape_str)], sizeof(shape_str) - strlen(shape_str),
+ "x%d", shape->num_outputs);
+
+ // Point the copied config at the fixture-owned weight and bias storage.
+ for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+ nn_config.weights[i] = weights[i];
+ nn_config.bias[i] = bias[i];
+ }
+
+ for (int iter = 0; iter < 10000 && !HasFatalFailure(); ++iter) {
+ for (int node = 0; node < shape->num_inputs; node++) {
+ inputs[node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+ }
+ for (int layer = 0; layer < shape->num_hidden_layers; layer++) {
+ for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+ bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+ }
+ for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+ node++) {
+ weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+ }
+ }
+ // Now the outputs:
+ int layer = shape->num_hidden_layers;
+ for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+ bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+ }
+ for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+ node++) {
+ weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+ }
+
+ av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+ target_func_(inputs, &nn_config, 0, outputs_test);
+
+ // Reference values below epsilon only bound the test value from above;
+ // all other values are compared by relative error.
+ // NOTE(review): a negative reference value also takes the first branch, so
+ // a strongly negative test output would not be flagged -- confirm intended.
+ for (int node = 0; node < shape->num_outputs; node++) {
+ if (outputs_ref[node] < epsilon) {
+ ASSERT_LE(outputs_test[node], epsilon)
+ << "Reference output was near-zero, test output was not ("
+ << shape_str << ")";
+ } else {
+ const float error = outputs_ref[node] - outputs_test[node];
+ const float relative_error = fabsf(error / outputs_ref[node]);
+ ASSERT_LE(relative_error, epsilon)
+ << "Excessive relative error between reference and test ("
+ << shape_str << ")";
+ }
+ }
+ }
+}
+
+// Times av1_nn_predict_c against target_func_ for one network shape.
+//
+// shape:     network layout to time; its weight/bias pointers are replaced by
+//            the fixture-owned buffers.
+// run_times: number of calls made to each implementation.
+//
+// Prints the shape followed by both elapsed times and their ratio
+// (reference / target).
+void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
+                                          const int run_times) {
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(nn_config));
+
+  // Wire up all NN_MAX_HIDDEN_LAYERS + 1 layers (hidden layers plus the
+  // output layer), matching SetUp() and RunNnPredictTest(). Stopping one
+  // short would leave the last slot holding whatever pointer the shape
+  // carried instead of the fixture's buffer.
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+  // Don't bother actually changing the values for inputs/weights/bias: it
+  // shouldn't make any difference for a speed test.
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+  }
+  aom_usec_timer_mark(&timer);
+  const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    target_func_(inputs, &nn_config, 0, outputs_test);
+  }
+  aom_usec_timer_mark(&timer);
+  const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf("%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    printf("x%d", shape->num_hidden_nodes[layer]);
+  printf("x%d: ", shape->num_outputs);
+  printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+}
+
+// This is all the neural network shapes observed executed in a few different
+// runs of the encoder. It also conveniently covers all the kernels
+// implemented.
+static const NN_CONFIG kShapes[] = {
+ { 37, 1, 2, { 16, 24 }, {}, {} }, { 24, 24, 1, { 12 }, {}, {} },
+ { 10, 16, 1, { 64 }, {}, {} }, { 12, 1, 1, { 12 }, {}, {} },
+ { 12, 1, 1, { 24 }, {}, {} }, { 12, 1, 1, { 32 }, {}, {} },
+ { 18, 4, 1, { 24 }, {}, {} }, { 18, 4, 1, { 32 }, {}, {} },
+ { 4, 1, 1, { 16 }, {}, {} }, { 8, 1, 0, { 0 }, {}, {} },
+ { 8, 4, 1, { 16 }, {}, {} }, { 8, 1, 1, { 32 }, {}, {} },
+ { 9, 3, 1, { 32 }, {}, {} }, { 8, 4, 0, { 0 }, {}, {} },
+ { 8, 8, 0, { 0 }, {}, {} }, { 4, 4, 1, { 8 }, {}, {} },
+ { 4, 3, 0, { 64 }, {}, {} },
+};
+
+// Runs the correctness check on each of the num_shapes entries of `shapes`,
+// in order.
+void NnPredictTest::RunNnPredictTest_all(const NN_CONFIG *const shapes,
+                                         const int num_shapes) {
+  const NN_CONFIG *const shapes_end = shapes + num_shapes;
+  for (const NN_CONFIG *shape = shapes; shape < shapes_end; ++shape) {
+    RunNnPredictTest(shape);
+  }
+}
+
+// Runs the timing comparison on each of the num_shapes entries of `shapes`,
+// in order, with run_times calls per implementation.
+void NnPredictTest::RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                              const int num_shapes,
+                                              const int run_times) {
+  const NN_CONFIG *const shapes_end = shapes + num_shapes;
+  for (const NN_CONFIG *shape = shapes; shape < shapes_end; ++shape) {
+    NnPredictTest::RunNnPredictSpeedTest(shape, run_times);
+  }
+}
+
+TEST_P(NnPredictTest, RandomValues) {
+ RunNnPredictTest_all(kShapes, sizeof(kShapes) / sizeof(kShapes[0]));
+}
+
+TEST_P(NnPredictTest, DISABLED_Speed) {
+ RunNnPredictSpeedTest_all(kShapes, sizeof(kShapes) / sizeof(kShapes[0]),
+ 10000000);
+}
+
+#if !CONFIG_EXCLUDE_SIMD_MISMATCH
+#if HAVE_SSE3
+INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
+ ::testing::Values(av1_nn_predict_sse3));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, NnPredictTest,
+ ::testing::Values(av1_nn_predict_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, NnPredictTest,
+ ::testing::Values(av1_nn_predict_neon));
+#endif
+#endif // !CONFIG_EXCLUDE_SIMD_MISMATCH
+
+} // namespace
diff --git a/third_party/aom/test/av1_quantize_test.cc b/third_party/aom/test/av1_quantize_test.cc
new file mode 100644
index 0000000000..c8af14a356
--- /dev/null
+++ b/third_party/aom/test/av1_quantize_test.cc
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/av1_quantize.h"
+
+namespace {
+
+// Signature shared by the quantizer under test and the reference quantizer it
+// is compared against (both are stored in QuantizeFuncParams).
+typedef void (*QuantizeFpFunc)(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale);
+
+// Test parameter bundle: the quantizer under test, the reference quantizer,
+// and the number of coefficients to quantize per call.
+struct QuantizeFuncParams {
+  QuantizeFuncParams(QuantizeFpFunc test_fn = nullptr,
+                     QuantizeFpFunc ref_fn = nullptr, int coeff_count = 16)
+      : qFunc(test_fn), qFuncRef(ref_fn), coeffCount(coeff_count) {}
+
+  QuantizeFpFunc qFunc;     // implementation under test
+  QuantizeFpFunc qFuncRef;  // implementation used as the expected result
+  int coeffCount;           // coefficients per call (defaults to 16)
+};
+
+using libaom_test::ACMRandom;
+
+// Number of random iterations run by each test method.
+const int numTests = 1000;
+// Capacity of the coefficient buffers declared in the test methods.
+const int maxSize = 1024;
+// Range arguments passed to rnd() when drawing the rounding factor, the
+// dequantizer step and the input coefficients, respectively.
+const int roundFactorRange = 127;
+const int dequantRange = 32768;
+const int coeffRange = (1 << 20) - 1;
+
+// Parameterized fixture that compares a quantizer implementation (qFunc)
+// against a reference implementation (qFuncRef) for one coefficient count.
+class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+  // Runs numTests iterations with fully random coefficients and requires the
+  // qcoeff, dqcoeff and eob outputs of both implementations to be identical.
+  void RunQuantizeTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int err_count_total = 0;
+    int first_failure = -1;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+    const SCAN_ORDER scanOrder = av1_scan_orders[txSize][DCT_DCT];
+    for (int i = 0; i < numTests; i++) {
+      int err_count = 0;
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = rnd(coeffRange);
+      }
+
+      FillQuantizerTables(&rnd, zbin_ptr, round_ptr, quant_ptr,
+                          quant_shift_ptr, dequant_ptr);
+
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      API_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+
+      for (int j = 0; j < count; ++j) {
+        err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+                     (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+            << "qcoeff error: i = " << i << " j = " << j << "\n";
+        EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+            << "dqcoeff error: i = " << i << " j = " << j << "\n";
+      }
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+      err_count += (ref_eob != eob);
+      if (err_count && !err_count_total) {
+        first_failure = i;
+      }
+      err_count_total += err_count;
+    }
+    // The suite is instantiated for several SIMD flavours, so the message
+    // does not name a specific instruction set.
+    EXPECT_EQ(0, err_count_total)
+        << "Error: Quantization Test, C output doesn't match SIMD output. "
+        << "First failed at test case " << first_failure;
+  }
+
+  // Runs numTests iterations on an all-zero block with up to three random
+  // non-zero coefficients and compares only the reported end-of-block value.
+  void RunEobTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+    const SCAN_ORDER scanOrder = av1_scan_orders[txSize][DCT_DCT];
+
+    for (int i = 0; i < numTests; i++) {
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = 0;
+      }
+
+      // The three positions are drawn independently and may coincide.
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+      FillQuantizerTables(&rnd, zbin_ptr, round_ptr, quant_ptr,
+                          quant_shift_ptr, dequant_ptr);
+
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      API_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+    }
+  }
+
+  void SetUp() override { params_ = GetParam(); }
+
+  ~AV1QuantizeTest() override = default;
+
+ private:
+  // Fills the five 8-entry quantizer tables. Entries 0 and 1 are drawn from
+  // rnd (same draw order as before: zbin, quant_shift, dequant, round) and
+  // entries 2..7 replicate entry 1.
+  static void FillQuantizerTables(ACMRandom *rnd, int16_t *zbin_ptr,
+                                  int16_t *round_ptr, int16_t *quant_ptr,
+                                  int16_t *quant_shift_ptr,
+                                  int16_t *dequant_ptr) {
+    for (int j = 0; j < 2; j++) {
+      zbin_ptr[j] = rnd->Rand16Signed();
+      quant_shift_ptr[j] = rnd->Rand16Signed();
+      // int16_t positive. The draw is used as a divisor just below, so a
+      // drawn 0 is clamped to 1 instead of dividing by zero; every other
+      // draw is used unchanged.
+      const int dequant = abs((*rnd)(dequantRange));
+      dequant_ptr[j] = static_cast<int16_t>(dequant > 0 ? dequant : 1);
+      quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
+      round_ptr[j] = (abs((*rnd)(roundFactorRange)) * dequant_ptr[j]) >> 7;
+    }
+    for (int j = 2; j < 8; ++j) {
+      zbin_ptr[j] = zbin_ptr[1];
+      quant_shift_ptr[j] = quant_shift_ptr[1];
+      dequant_ptr[j] = dequant_ptr[1];
+      quant_ptr[j] = quant_ptr[1];
+      round_ptr[j] = round_ptr[1];
+    }
+  }
+
+  // Maps a coefficient count to the square transform size with that many
+  // coefficients; unknown counts fall back to TX_4X4.
+  TX_SIZE getTxSize(int count) {
+    switch (count) {
+      case 16: return TX_4X4;
+      case 64: return TX_8X8;
+      case 256: return TX_16X16;
+      case 1024: return TX_32X32;
+      default: return TX_4X4;
+    }
+  }
+
+  QuantizeFuncParams params_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1QuantizeTest);
+
+// Both parameterized tests simply forward to the fixture methods.
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
+
+// Fixed-vector check of av1_quantize_fp_no_qmatrix() on four coefficients.
+TEST(AV1QuantizeTest, QuantizeFpNoQmatrix) {
+  const int coeff_count = 4;
+  const int log_scale = 0;
+
+  // A uniform quantizer serves as the example.
+  const int16_t dequant_ptr[2] = { 78, 93 };  // quantize step
+  const int16_t round_ptr[2] = { 39, 46 };    // round ~= dequant / 2
+  // quant ~= 2^16 / dequant, i.e. the inverse of the quantize step in 16-bit
+  // fixed point.
+  const int16_t quant_ptr[2] = { 840, 704 };
+
+  const int16_t scan[4] = { 0, 1, 2, 3 };
+  const tran_low_t coeff_ptr[4] = { -449, 624, -14, 24 };
+
+  // Expected outputs.
+  const tran_low_t ref_qcoeff_ptr[4] = { -6, 7, 0, 0 };
+  const tran_low_t ref_dqcoeff_ptr[4] = { -468, 651, 0, 0 };
+
+  tran_low_t qcoeff_ptr[4];
+  tran_low_t dqcoeff_ptr[4];
+  const int eob = av1_quantize_fp_no_qmatrix(
+      quant_ptr, dequant_ptr, round_ptr, log_scale, scan, coeff_count,
+      coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
+
+  EXPECT_EQ(eob, 2);
+  for (int idx = 0; idx < coeff_count; ++idx) {
+    EXPECT_EQ(qcoeff_ptr[idx], ref_qcoeff_ptr[idx]);
+    EXPECT_EQ(dqcoeff_ptr[idx], ref_dqcoeff_ptr[idx]);
+  }
+}
+
+#if HAVE_SSE4_1
+// One entry per coefficient count (16, 64, 256, 1024), each pairing the
+// SSE4.1 high-bitdepth quantizer with the C version as reference.
+const QuantizeFuncParams qfps[4] = {
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 1024),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+// One entry per coefficient count (16, 64, 256, 1024), each pairing the AVX2
+// high-bitdepth quantizer with the C version as reference.
+const QuantizeFuncParams qfps_avx2[4] = {
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 1024),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/av1_round_shift_array_test.cc b/third_party/aom/test/av1_round_shift_array_test.cc
new file mode 100644
index 0000000000..937e8645a5
--- /dev/null
+++ b/third_party/aom/test/av1_round_shift_array_test.cc
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompRoundShift {
+
+// Signature of the in-place round-shift kernels exercised by this file.
+typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+
+#if HAVE_SSE4_1 || HAVE_NEON
+// Values of the `bit` argument fed to the kernels; only referenced by the
+// SIMD instantiations, hence the guard.
+const int kValidBitCheck[] = {
+ -4, -3, -2, -1, 0, 1, 2, 3, 4,
+};
+#endif // HAVE_SSE4_1 || HAVE_NEON
+
+// Test parameter: (kernel under test, block size, bit).
+typedef std::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+ CompRoundShiftParam;
+
+// Fixture comparing a round-shift kernel against av1_round_shift_array_c.
+class AV1CompRoundShiftTest
+ : public ::testing::TestWithParam<CompRoundShiftParam> {
+ public:
+ ~AV1CompRoundShiftTest() override;
+
+ // Re-seeds the generator so every test starts from the same sequence.
+ void SetUp() override {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ // Checks that test_impl and the C kernel produce identical output.
+ void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+ int bit);
+ // Times the C kernel and test_impl and prints both figures.
+ void RunSpeedTest(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+ int bit);
+
+ libaom_test::ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompRoundShiftTest);
+
+// Defined out of line; the class body only declares it.
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() = default;
+
+// Applies av1_round_shift_array_c and test_impl to identical random data and
+// requires the first w entries (w = width of bsize) to match exactly.
+void AV1CompRoundShiftTest::RunCheckOutput(
+    comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  // Both kernels are run over w entries of blk_wd-sized buffers, so a wider
+  // block would read and write past the end of the arrays below.
+  ASSERT_LE(w, blk_wd) << "block width exceeds the test buffer size";
+  DECLARE_ALIGNED(32, int32_t, pred_[blk_wd]);
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = pred_[i] = rnd_.Rand31() / 16;
+  }
+  av1_round_shift_array_c(ref_buffer_, w, bit);
+  test_impl(pred_, w, bit);
+  for (int x = 0; x < w; ++x) {
+    ASSERT_EQ(ref_buffer_[x], pred_[x]) << w << "x" << h << "mismatch @"
+                                        << "(" << x << ")";
+  }
+}
+
+// Times av1_round_shift_array_c and test_impl over the same buffer and prints
+// the per-call time of each in nanoseconds, followed by their ratio.
+void AV1CompRoundShiftTest::RunSpeedTest(comp_round_shift_array_func test_impl,
+                                         BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int idx = 0; idx != blk_wd; ++idx) ref_buffer_[idx] = rnd_.Rand31();
+
+  // Index 0 is the C reference, index 1 the implementation under test.
+  comp_round_shift_array_func funcs[2] = { av1_round_shift_array_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  const int num_loops = 1000000000 / (w + h);
+
+  for (int which = 0; which < 2; ++which) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_round_shift_array_func func = funcs[which];
+    int remaining = num_loops;
+    while (remaining-- > 0) {
+      func(ref_buffer_, w, bit);
+    }
+    aom_usec_timer_mark(&timer);
+    const double usec = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    // Microseconds for all loops -> nanoseconds per call.
+    elapsed_time[which] = 1000.0 * usec / num_loops;
+  }
+
+  printf("av1_round_shift_array %3dx%-3d: bit : %d %7.2f/%7.2fns", w, h, bit,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// Parameter order: (kernel under test, block size, bit).
+TEST_P(AV1CompRoundShiftTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+// Disabled by default via the DISABLED_ name prefix.
+TEST_P(AV1CompRoundShiftTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+// Each instantiation crosses one SIMD kernel with every entry of
+// txsize_to_bsize and every value in kValidBitCheck.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1CompRoundShiftTest,
+ ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
+ ::testing::ValuesIn(txsize_to_bsize),
+ ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1CompRoundShiftTest,
+ ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
+ ::testing::ValuesIn(txsize_to_bsize),
+ ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+} // namespace AV1CompRoundShift
diff --git a/third_party/aom/test/av1_softmax_test.cc b/third_party/aom/test/av1_softmax_test.cc
new file mode 100644
index 0000000000..2b04af1342
--- /dev/null
+++ b/third_party/aom/test/av1_softmax_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <new>
+#include <tuple>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+// Softmax kernel with a fixed class count: reads `input`, writes `output`.
+using FastSoftmaxFn = void (*)(const float *const input, float *output);
+// Test parameter: (kernel under test, number of classes it handles).
+using FastSoftmaxTestParams = std::tuple<const FastSoftmaxFn, int>;
+
+// Error thresholds for functional equivalence
+constexpr float kRelEpsilon = 5e-2f;
+constexpr float kAbsEpsilon = 5e-3f;
+
+// Fixture comparing a fixed-size softmax kernel against av1_nn_softmax.
+class FastSoftmaxTest : public ::testing::TestWithParam<FastSoftmaxTestParams> {
+ public:
+ FastSoftmaxTest() : target_fn_(GET_PARAM(0)), num_classes_(GET_PARAM(1)) {}
+ // Allocates three value-initialized (all-zero) buffers of num_classes_
+ // floats; nothrow new is used so that a failed allocation is reported as a
+ // test failure.
+ void SetUp() override {
+ ref_buf_.reset(new (std::nothrow) float[num_classes_]());
+ ASSERT_NE(ref_buf_, nullptr);
+ dst_buf_.reset(new (std::nothrow) float[num_classes_]());
+ ASSERT_NE(dst_buf_, nullptr);
+ input_.reset(new (std::nothrow) float[num_classes_]());
+ ASSERT_NE(input_, nullptr);
+ }
+ void RunSoftmaxTest();
+ void RunSoftmaxSpeedTest(const int run_times);
+ void FillInputBuf();
+
+ private:
+ const FastSoftmaxFn target_fn_; // kernel under test
+ const int num_classes_; // element count of every buffer below
+ std::unique_ptr<float[]> ref_buf_, dst_buf_, input_;
+ libaom_test::ACMRandom rng_;
+};
+
+// Fills input_ with random values: each Rand31() draw is shifted down by 2^30
+// and then divided by 2^30.
+// NOTE(review): this presumably yields values in roughly [-1, 1); confirm the
+// range of ACMRandom::Rand31().
+void FastSoftmaxTest::FillInputBuf() {
+  int idx = 0;
+  while (idx < num_classes_) {
+    const float centered = (float)rng_.Rand31() - (1 << 30);
+    input_[idx] = centered / (1u << 30);
+    ++idx;
+  }
+}
+
+// Runs av1_nn_softmax and the kernel under test on input_ and compares the
+// outputs element by element:
+//  - where the reference is below kAbsEpsilon, the test output must not
+//    exceed kAbsEpsilon;
+//  - elsewhere both the relative and the absolute error must stay within
+//    their thresholds.
+void FastSoftmaxTest::RunSoftmaxTest() {
+  av1_nn_softmax(input_.get(), ref_buf_.get(), num_classes_);
+  target_fn_(input_.get(), dst_buf_.get());
+
+  for (int idx = 0; idx < num_classes_; idx++) {
+    if (ref_buf_[idx] < kAbsEpsilon) {
+      ASSERT_LE(dst_buf_[idx], kAbsEpsilon)
+          << "Reference output was near-zero, test output was not" << std::endl;
+    } else {
+      const float error = dst_buf_[idx] - ref_buf_[idx];
+      const float relative_error = fabsf(error / ref_buf_[idx]);
+      ASSERT_LE(relative_error, kRelEpsilon)
+          << "Excessive relative error between reference and test output"
+          << std::endl;
+      // Compare the magnitude: a signed comparison would let an arbitrarily
+      // large negative deviation through.
+      ASSERT_LE(fabsf(error), kAbsEpsilon)
+          << "Excessive absolute error between reference and test output"
+          << std::endl;
+    }
+  }
+}
+
+// Calls the kernel under test run_times times on input_ and prints the total
+// elapsed time in microseconds.
+void FastSoftmaxTest::RunSoftmaxSpeedTest(const int run_times) {
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  int remaining = run_times;
+  while (remaining-- > 0) {
+    target_fn_(input_.get(), dst_buf_.get());
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int64_t time = aom_usec_timer_elapsed(&timer);
+  std::cout << "Test with " << num_classes_ << " classes took " << time
+            << " us." << std::endl;
+}
+
+// Correctness check on one randomly filled input buffer.
+TEST_P(FastSoftmaxTest, RandomValues) {
+ FillInputBuf();
+ RunSoftmaxTest();
+}
+
+// Disabled by default via the DISABLED_ name prefix. The input buffer is not
+// refilled here, so it keeps the zeros written by SetUp().
+TEST_P(FastSoftmaxTest, DISABLED_Speed) {
+ constexpr int kNumTimes = 1000000;
+ RunSoftmaxSpeedTest(kNumTimes);
+}
+
+// Adapts av1_nn_softmax to the FastSoftmaxFn signature for 16 classes, so the
+// reference implementation itself can be run through the fixture.
+void AnchorSoftmax16Fn(const float *input, float *output) {
+ av1_nn_softmax(input, output, 16);
+}
+
+// C instantiation: the reference wrapped as a 16-class kernel, plus the C
+// fast-softmax kernel.
+const FastSoftmaxTestParams kArrayParams_c[] = {
+ FastSoftmaxTestParams(AnchorSoftmax16Fn, 16),
+ FastSoftmaxTestParams(av1_nn_fast_softmax_16_c, 16)
+};
+INSTANTIATE_TEST_SUITE_P(C, FastSoftmaxTest,
+ ::testing::ValuesIn(kArrayParams_c));
+
+// The SSE3 kernel is skipped when CONFIG_EXCLUDE_SIMD_MISMATCH is set.
+#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+INSTANTIATE_TEST_SUITE_P(
+ SSE3, FastSoftmaxTest,
+ ::testing::Values(FastSoftmaxTestParams(av1_nn_fast_softmax_16_sse3, 16)));
+#endif
+} // namespace
diff --git a/third_party/aom/test/av1_temporal_denoiser_test.cc b/third_party/aom/test/av1_temporal_denoiser_test.cc
new file mode 100644
index 0000000000..7aa8fb6a66
--- /dev/null
+++ b/third_party/aom/test/av1_temporal_denoiser_test.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+// Size of every pixel buffer used by the test: one 128x128 block.
+const int kNumPixels = 128 * 128;
+
+// Signature shared by av1_denoiser_filter_c and the SIMD variants under test.
+typedef int (*Av1DenoiserFilterFunc)(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude);
+// Test parameter: (filter under test, block size).
+typedef std::tuple<Av1DenoiserFilterFunc, BLOCK_SIZE> AV1DenoiserTestParam;
+
+// Fixture for the denoiser filter tests; it only caches the block size from
+// the test parameter.
+class AV1DenoiserTest
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<AV1DenoiserTestParam> {
+ public:
+ ~AV1DenoiserTest() override = default;
+
+ void SetUp() override { bs_ = GET_PARAM(1); }
+
+ protected:
+ BLOCK_SIZE bs_; // block size under test (second tuple element)
+};
+
+// Runs the C filter and the filter under test on identical random blocks and
+// requires bit-identical output inside the bs_ block area.
+TEST_P(AV1DenoiserTest, BitexactCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 4000;
+
+  // Allocate the space for input and output,
+  // where sig_block is the block to be denoised,
+  // mc_avg_block is the denoised reference block,
+  // avg_block_c is the denoised result from C code,
+  // avg_block_simd is the denoised result from the SIMD code under test.
+  DECLARE_ALIGNED(16, uint8_t, sig_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, mc_avg_block[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_c[kNumPixels]);
+  DECLARE_ALIGNED(16, uint8_t, avg_block_simd[kNumPixels]);
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Generate random motion magnitude, 20% of which exceed the threshold.
+    const int motion_magnitude_random =
+        rnd.Rand8() % static_cast<int>(MOTION_MAGNITUDE_THRESHOLD * 1.2);
+
+    // Initialize a test block with random number in range [0, 255].
+    for (int j = 0; j < kNumPixels; ++j) {
+      sig_block[j] = rnd.Rand8();
+      // The pixels in mc_avg_block are generated by adding a random
+      // number in range [-19, 19] to corresponding pixels in sig_block.
+      // The two draws are separate statements so their order is fixed; as
+      // operands of one multiplication their evaluation order would be
+      // unspecified and the test data could differ between compilers.
+      const int sign = (rnd.Rand8() % 2 == 0) ? -1 : 1;
+      const int magnitude = rnd.Rand8() % 20;
+      const int temp = sig_block[j] + sign * magnitude;
+      // Clip.
+      mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
+    }
+
+    API_REGISTER_STATE_CHECK(
+        av1_denoiser_filter_c(sig_block, 128, mc_avg_block, 128, avg_block_c,
+                              128, 0, bs_, motion_magnitude_random));
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 128, mc_avg_block, 128,
+                                          avg_block_simd, 128, 0, bs_,
+                                          motion_magnitude_random));
+
+    // Test bitexactness.
+    for (int h = 0; h < block_size_high[bs_]; ++h) {
+      for (int w = 0; w < block_size_wide[bs_]; ++w) {
+        EXPECT_EQ(avg_block_c[h * 128 + w], avg_block_simd[h * 128 + w]);
+      }
+    }
+  }
+}
+
+using std::make_tuple;
+
+// Test for all block size.
+#if HAVE_SSE2
+// SSE2 filter paired with each block size from 8x8 up to 128x128.
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1DenoiserTest,
+ ::testing::Values(make_tuple(&av1_denoiser_filter_sse2, BLOCK_8X8),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_8X16),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X8),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X16),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_16X32),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X16),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X32),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_32X64),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X32),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X64),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_128X64),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_64X128),
+ make_tuple(&av1_denoiser_filter_sse2, BLOCK_128X128)));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+// NEON filter paired with the same block sizes as the SSE2 instantiation.
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1DenoiserTest,
+ ::testing::Values(make_tuple(&av1_denoiser_filter_neon, BLOCK_8X8),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_8X16),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_16X8),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_16X16),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_16X32),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_32X16),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_32X32),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_32X64),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_64X32),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_64X64),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_128X64),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_64X128),
+ make_tuple(&av1_denoiser_filter_neon, BLOCK_128X128)));
+#endif
+} // namespace
diff --git a/third_party/aom/test/av1_txfm_test.cc b/third_party/aom/test/av1_txfm_test.cc
new file mode 100644
index 0000000000..77c0ec1071
--- /dev/null
+++ b/third_party/aom/test/av1_txfm_test.cc
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/av1_txfm_test.h"
+
+#include <stdio.h>
+
+#include <memory>
+#include <new>
+
+namespace libaom_test {
+
+// Printable names for the 2D transform types.
+// NOTE(review): presumably indexed by TX_TYPE, so the order must match that
+// enum's declaration order -- confirm against av1/common/enums.h.
+const char *tx_type_name[] = {
+ "DCT_DCT",
+ "ADST_DCT",
+ "DCT_ADST",
+ "ADST_ADST",
+ "FLIPADST_DCT",
+ "DCT_FLIPADST",
+ "FLIPADST_FLIPADST",
+ "ADST_FLIPADST",
+ "FLIPADST_ADST",
+ "IDTX",
+ "V_DCT",
+ "H_DCT",
+ "V_ADST",
+ "H_ADST",
+ "V_FLIPADST",
+ "H_FLIPADST",
+};
+
+// Returns the width of tx_size as the 1D transform length.
+int get_txfm1d_size(TX_SIZE tx_size) {
+  const int width = tx_size_wide[tx_size];
+  return width;
+}
+
+// Splits a 2D transform type into its two 1D kernel types, written to *type0
+// and *type1. Flipped ADST variants map to plain TYPE_ADST. An unknown type
+// yields DCT/DCT and then trips assert(0).
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
+  const auto set_types = [type0, type1](TYPE_TXFM first, TYPE_TXFM second) {
+    *type0 = first;
+    *type1 = second;
+  };
+
+  switch (txfm2d_type) {
+    case DCT_DCT: set_types(TYPE_DCT, TYPE_DCT); break;
+
+    case ADST_DCT:
+    case FLIPADST_DCT: set_types(TYPE_ADST, TYPE_DCT); break;
+
+    case DCT_ADST:
+    case DCT_FLIPADST: set_types(TYPE_DCT, TYPE_ADST); break;
+
+    case ADST_ADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST: set_types(TYPE_ADST, TYPE_ADST); break;
+
+    case IDTX: set_types(TYPE_IDTX, TYPE_IDTX); break;
+
+    case H_DCT: set_types(TYPE_IDTX, TYPE_DCT); break;
+    case V_DCT: set_types(TYPE_DCT, TYPE_IDTX); break;
+
+    case H_ADST:
+    case H_FLIPADST: set_types(TYPE_IDTX, TYPE_ADST); break;
+
+    case V_ADST:
+    case V_FLIPADST: set_types(TYPE_ADST, TYPE_IDTX); break;
+
+    default:
+      set_types(TYPE_DCT, TYPE_DCT);
+      assert(0);
+      break;
+  }
+}
+
+// sqrt(2) and 1/sqrt(2), used by the reference transforms below.
+double Sqrt2 = pow(2, 0.5);
+double invSqrt2 = 1 / pow(2, 0.5);
+
+// Returns cos(PI * (2n + 1) * k / (2 * size)), the DCT basis value for sample
+// index n and frequency index k.
+double dct_matrix(double n, double k, int size) {
+  const double angle = PI * (2 * n + 1) * k / (2 * size);
+  return cos(angle);
+}
+
+// Reference forward DCT of length `size`, computed directly from the basis
+// matrix (O(size^2)). The k == 0 output is additionally scaled by 1/sqrt(2).
+// No other normalization is applied here.
+void reference_dct_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * dct_matrix(n, k, size);
+ }
+ if (k == 0) out[k] = out[k] * invSqrt2;
+ }
+}
+
+// Reference inverse DCT of length `size`, using the transposed basis of
+// reference_dct_1d: the n == 0 input term is scaled by 1/sqrt(2), all other
+// terms are used as is. No other normalization is applied here.
+void reference_idct_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ if (n == 0)
+ out[k] += invSqrt2 * in[n] * dct_matrix(k, n, size);
+ else
+ out[k] += in[n] * dct_matrix(k, n, size);
+ }
+ }
+}
+
+// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4'
+// function). Should be replaced by a proper reference function that takes
+// 'double' input & output.
+// Integer 4-point forward ADST: reads input[0..3], writes output[0..3].
+static void fadst4_new(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ // All-zero input: the output is all zero, skip the arithmetic.
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ // Products of the inputs with the sinpi_*_9 constants.
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_4_9 * x0;
+ s2 = sinpi_2_9 * x1;
+ s3 = sinpi_1_9 * x1;
+ s4 = sinpi_3_9 * x2;
+ s5 = sinpi_4_9 * x3;
+ s6 = sinpi_2_9 * x3;
+ s7 = x0 + x1 - x3;
+
+ x0 = s0 + s2 + s5;
+ x1 = sinpi_3_9 * s7;
+ x2 = s1 - s3 + s6;
+ x3 = s4;
+
+ s0 = x0 + x3;
+ s1 = x1;
+ s2 = x2 - x3;
+ s3 = x2 - x0 + x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+// Reference forward ADST of length `size`.
+// size == 4 is delegated to the integer fadst4_new(): the inputs are rounded
+// to integers first and the outputs are integer-valued. All other sizes use
+// the floating-point sine basis directly.
+void reference_adst_1d(const double *in, double *out, int size) {
+ if (size == 4) { // Special case.
+ tran_low_t int_input[4];
+ for (int i = 0; i < 4; ++i) {
+ int_input[i] = static_cast<tran_low_t>(round(in[i]));
+ }
+ tran_low_t int_output[4];
+ fadst4_new(int_input, int_output);
+ for (int i = 0; i < 4; ++i) {
+ out[i] = int_output[i];
+ }
+ return;
+ }
+
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+ }
+ }
+}
+
+// Reference identity transform: copies `in` to `out` scaled by a
+// size-dependent factor (sqrt(2), 2, 2*sqrt(2), 4, 4*sqrt(2) for sizes 4, 8,
+// 16, 32, 64). Any other size uses a factor of 0.
+void reference_idtx_1d(const double *in, double *out, int size) {
+  double scale;
+  switch (size) {
+    case 4: scale = Sqrt2; break;
+    case 8: scale = 2; break;
+    case 16: scale = 2 * Sqrt2; break;
+    case 32: scale = 4; break;
+    case 64: scale = 4 * Sqrt2; break;
+    default: scale = 0; break;
+  }
+
+  for (int idx = 0; idx < size; ++idx) out[idx] = in[idx] * scale;
+}
+
+// Dispatches to the reference 1D transform selected by `type`: TYPE_DCT and
+// TYPE_ADST pick the matching transform, every other value falls back to the
+// identity transform.
+void reference_hybrid_1d(double *in, double *out, int size, int type) {
+  switch (type) {
+    case TYPE_DCT: reference_dct_1d(in, out, size); break;
+    case TYPE_ADST: reference_adst_1d(in, out, size); break;
+    default: reference_idtx_1d(in, out, size); break;
+  }
+}
+
+// Returns the overall scale of the forward transform for (tx_type, tx_size):
+// 2 raised to the sum of the three configured shift stages, times sqrt(2)
+// when the rectangular log ratio of the transform is +1 or -1.
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size) {
+  TXFM_2D_FLIP_CFG cfg;
+  av1_get_fwd_txfm_cfg(tx_type, tx_size, &cfg);
+
+  const int8_t *shift = cfg.shift;
+  const int amplify_bit = shift[0] + shift[1] + shift[2];
+
+  double amplify_factor;
+  if (amplify_bit >= 0) {
+    amplify_factor = (1 << amplify_bit);
+  } else {
+    amplify_factor = (1.0 / (1 << -amplify_bit));
+  }
+
+  // For rectangular transforms, we need to multiply by an extra factor.
+  const int tx_width = tx_size_wide[cfg.tx_size];
+  const int tx_height = tx_size_high[cfg.tx_size];
+  const int rect_type = get_rect_tx_log_ratio(tx_width, tx_height);
+  if (rect_type == 1 || rect_type == -1) {
+    amplify_factor *= pow(2, 0.5);
+  }
+  return amplify_factor;
+}
+
+// Reference 2D forward transform.
+// Applies the type0 1D transform down every column of `in` (read row-major
+// with stride tx_width), then the type1 1D transform along every row of the
+// intermediate result. The result is written transposed: coefficient (r, c)
+// lands at out[c * tx_height + r]. Sizes with a 64-point dimension are then
+// truncated and repacked as described below, and finally every entry is
+// scaled by get_amplification_factor().
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+ TX_SIZE tx_size) {
+ // Get transform type and size of each dimension.
+ TYPE_TXFM type0;
+ TYPE_TXFM type1;
+ get_txfm1d_type(tx_type, &type0, &type1);
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+
+ // Scratch buffers; nothrow new so an allocation failure is reported through
+ // the assertions below.
+ std::unique_ptr<double[]> temp_in(
+ new (std::nothrow) double[AOMMAX(tx_width, tx_height)]);
+ std::unique_ptr<double[]> temp_out(
+ new (std::nothrow) double[AOMMAX(tx_width, tx_height)]);
+ std::unique_ptr<double[]> out_interm(
+ new (std::nothrow) double[tx_width * tx_height]);
+ ASSERT_NE(temp_in, nullptr);
+ ASSERT_NE(temp_out, nullptr);
+ ASSERT_NE(out_interm, nullptr);
+
+ // Transform columns.
+ for (int c = 0; c < tx_width; ++c) {
+ for (int r = 0; r < tx_height; ++r) {
+ temp_in[r] = in[r * tx_width + c];
+ }
+ reference_hybrid_1d(temp_in.get(), temp_out.get(), tx_height, type0);
+ for (int r = 0; r < tx_height; ++r) {
+ out_interm[r * tx_width + c] = temp_out[r];
+ }
+ }
+
+ // Transform rows.
+ for (int r = 0; r < tx_height; ++r) {
+ reference_hybrid_1d(out_interm.get() + r * tx_width, temp_out.get(),
+ tx_width, type1);
+ for (int c = 0; c < tx_width; ++c) {
+ out[c * tx_height + r] = temp_out[c];
+ }
+ }
+
+ // These transforms use an approximate 2D DCT transform, by only keeping the
+ // top-left quarter of the coefficients, and repacking them in the first
+ // quarter indices.
+ // TODO(urvang): Refactor this code.
+ if (tx_width == 64 && tx_height == 64) { // tx_size == TX_64X64
+ // Zero out top-right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(out + col * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(out + 32 * 64, 0, 32 * 64 * sizeof(*out));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(out + col * 32, out + col * 64, 32 * sizeof(*out));
+ }
+ } else if (tx_width == 32 && tx_height == 64) { // tx_size == TX_32X64
+ // Zero out right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(out + col * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(out + col * 32, out + col * 64, 32 * sizeof(*out));
+ }
+ } else if (tx_width == 64 && tx_height == 32) { // tx_size == TX_64X32
+ // Zero out the bottom 32x32 area.
+ memset(out + 32 * 32, 0, 32 * 32 * sizeof(*out));
+ // Note: no repacking needed here.
+ } else if (tx_width == 16 && tx_height == 64) { // tx_size == TX_16X64
+ // Zero out right 32x16 area.
+ for (int col = 0; col < 16; ++col) {
+ memset(out + col * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ for (int col = 1; col < 16; ++col) {
+ memcpy(out + col * 32, out + col * 64, 32 * sizeof(*out));
+ }
+ } else if (tx_width == 64 && tx_height == 16) { // tx_size == TX_64X16
+ // Zero out the bottom 16x32 area.
+ memset(out + 16 * 32, 0, 16 * 32 * sizeof(*out));
+ // Note: no repacking needed here.
+ }
+
+ // Apply appropriate scale.
+ const double amplify_factor = get_amplification_factor(tx_type, tx_size);
+ for (int c = 0; c < tx_width; ++c) {
+ for (int r = 0; r < tx_height; ++r) {
+ out[c * tx_height + r] *= amplify_factor;
+ }
+ }
+}
+
+// Mirrors a width x height block left-to-right in place; consecutive rows
+// are `stride` elements apart.
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height; ++r) {
+    Type *const row = dest + r * stride;
+    // Walk inwards from both ends of the row; an odd middle element stays.
+    for (int left = 0, right = width - 1; left < right; ++left, --right) {
+      const Type held = row[left];
+      row[left] = row[right];
+      row[right] = held;
+    }
+  }
+}
+
+// Mirrors a width x height block top-to-bottom in place; consecutive rows
+// are `stride` elements apart.
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride) {
+  // Swap row `top` with row `bottom`, walking inwards; an odd middle row
+  // stays where it is.
+  for (int top = 0, bottom = height - 1; top < bottom; ++top, --bottom) {
+    Type *const upper = dest + top * stride;
+    Type *const lower = dest + bottom * stride;
+    for (int c = 0; c < width; ++c) {
+      const Type held = upper[c];
+      upper[c] = lower[c];
+      lower[c] = held;
+    }
+  }
+}
+
+// Rotates a width x height block by 180 degrees in place (flip left-right
+// and up-down); consecutive rows are `stride` elements apart.
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride) {
+  // Swap every element of the upper half with its point-mirrored partner in
+  // the lower half.
+  for (int r = 0; r < height / 2; ++r) {
+    for (int c = 0; c < width; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + width - 1 - c];
+      dest[(height - 1 - r) * stride + width - 1 - c] = tmp;
+    }
+  }
+  // With an odd height the middle row is its own partner row, so the loop
+  // above never touches it; it still has to be mirrored left-to-right.
+  if (height > 0 && (height % 2) == 1) {
+    Type *const mid = dest + (height / 2) * stride;
+    for (int c = 0; c < width / 2; ++c) {
+      const Type tmp = mid[c];
+      mid[c] = mid[width - 1 - c];
+      mid[width - 1 - c] = tmp;
+    }
+  }
+}
+
+// Explicit instantiations for double, since the templates are defined in this
+// source file.
+template void fliplr<double>(double *dest, int width, int height, int stride);
+template void flipud<double>(double *dest, int width, int height, int stride);
+template void fliplrud<double>(double *dest, int width, int height, int stride);
+
+// Bit depths, one per BD_NUM slot.
+int bd_arr[BD_NUM] = { 8, 10, 12 };
+
+// NOTE(review): presumably the per-bit-depth low_range / high_range limits fed
+// to txfm_stage_range_check() -- the callers are outside this file, confirm.
+int8_t low_range_arr[BD_NUM] = { 18, 32, 32 };
+int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
+
+// Checks the per-stage bit ranges of a transform configuration:
+// every stage_range[i] must not exceed low_range (non-fatal), and
+// stage_range[i] + cos_bit must not exceed high_range (fatal).
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+ int8_t cos_bit, int low_range, int high_range) {
+ for (int i = 0; i < stage_num; ++i) {
+ EXPECT_LE(stage_range[i], low_range);
+ ASSERT_LE(stage_range[i] + cos_bit, high_range) << "stage = " << i;
+ }
+ // This loop checks stage i + 1 (stages 1..stage_num-1) against the same
+ // high_range bound as above; the reported "stage" is i, not i + 1.
+ for (int i = 0; i < stage_num - 1; ++i) {
+ // make sure there is no overflow while doing half_btf()
+ ASSERT_LE(stage_range[i + 1] + cos_bit, high_range) << "stage = " << i;
+ }
+}
+} // namespace libaom_test
diff --git a/third_party/aom/test/av1_txfm_test.h b/third_party/aom/test/av1_txfm_test.h
new file mode 100644
index 0000000000..d285e3d637
--- /dev/null
+++ b/third_party/aom/test/av1_txfm_test.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_AV1_TXFM_TEST_H_
+#define AOM_TEST_AV1_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+namespace libaom_test {
+
+extern const char *tx_type_name[];
+
+// 1-D transform kind identifiers (values 0..4) with TYPE_LAST (5) as the
+// final enumerator. get_txfm1d_type() below reports values of this type.
+// NOTE(review): by their names these are forward DCT / ADST, identity, and
+// inverse DCT / ADST -- confirm against the definitions that consume them.
+enum {
+ TYPE_DCT = 0,
+ TYPE_ADST,
+ TYPE_IDTX,
+ TYPE_IDCT,
+ TYPE_IADST,
+ TYPE_LAST
+} UENUM1BYTE(TYPE_TXFM);
+
+// Declarations only; the definitions are not part of this header.
+// NOTE(review): presumably returns the 1-D length that belongs to |tx_size|
+// -- confirm against the definition.
+int get_txfm1d_size(TX_SIZE tx_size);
+
+// NOTE(review): presumably splits |txfm2d_type| into two 1-D kinds written
+// through |type0| and |type1|; which one is the row pass and which the column
+// pass cannot be told from here -- confirm.
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
+
+// NOTE(review): the reference_* functions look like double-precision
+// reference transforms reading |in| and writing |out| -- confirm.
+void reference_dct_1d(const double *in, double *out, int size);
+void reference_idct_1d(const double *in, double *out, int size);
+
+void reference_adst_1d(const double *in, double *out, int size);
+
+// NOTE(review): |type| is presumably a TYPE_TXFM value -- confirm.
+void reference_hybrid_1d(double *in, double *out, int size, int type);
+
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size);
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+ TX_SIZE tx_size);
+// Returns the mean of |a[i] - b[i]| over the first |size| elements. Both
+// operands are converted to double before the subtraction, and the sum is
+// divided by |size| without any special handling of size == 0.
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
+                                    const int size) {
+  double abs_diff_sum = 0;
+  const Type1 *lhs = a;
+  const Type2 *rhs = b;
+  for (int remaining = size; remaining > 0; --remaining, ++lhs, ++rhs) {
+    const double diff = static_cast<double>(*lhs) - static_cast<double>(*rhs);
+    abs_diff_sum += fabs(diff);
+  }
+  return abs_diff_sum / size;
+}
+
+// In-place flip helpers operating on a width x height block whose rows are
+// |stride| elements apart. Only declared here; users need an instantiation
+// for their Type to be provided by a source file.
+// NOTE(review): fliplr = mirror columns, flipud = mirror rows, fliplrud = both,
+// judging by the names -- confirm against the definitions.
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride);
+
+// Function-pointer type taking an int32_t input buffer, an int32_t output
+// buffer, a cos_bit value and a range_bit array.
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t cos_bit,
+ const int8_t *range_bit);
+
+// InvTxfm2dFunc is the element type of inv_txfm_func_ls (uint16_t
+// destination); LbdInvTxfm2dFunc is a uint8_t-destination variant that also
+// takes a TX_SIZE.
+// NOTE(review): the unnamed int parameters are presumably stride and bit
+// depth (or eob) -- confirm against the functions assigned to these types.
+typedef void (*InvTxfm2dFunc)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE,
+ TX_SIZE, int);
+
+// Fixed bit depth constant and the derived value 1 << bd (= 1024).
+static const int bd = 10;
+static const int input_base = (1 << bd);
+
+// Returns true when |tx_type| is marked as used in av1_ext_tx_used for the
+// transform set selected from |tx_size|: the DCT-only set when the
+// squared-up size is above TX_32X32, the DCT+IDTX set when it equals
+// TX_32X32, and the 16-type set otherwise.
+static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
+  const TX_SIZE sqr_up = txsize_sqr_up_map[tx_size];
+  const TxSetType set_type = (sqr_up > TX_32X32)    ? EXT_TX_SET_DCTONLY
+                             : (sqr_up == TX_32X32) ? EXT_TX_SET_DCT_IDTX
+                                                    : EXT_TX_SET_ALL16;
+  return av1_ext_tx_used[set_type][tx_type] != 0;
+}
+
+// Table of C forward 2-D transform functions with TX_SIZES_ALL slots, only
+// available when CONFIG_AV1_ENCODER is set.
+// NOTE(review): the table is presumably indexed by TX_SIZE -- confirm against
+// the enum order.
+#if CONFIG_AV1_ENCODER
+#if !CONFIG_REALTIME_ONLY
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+ av1_fwd_txfm2d_4x4_c, av1_fwd_txfm2d_8x8_c, av1_fwd_txfm2d_16x16_c,
+ av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
+ av1_fwd_txfm2d_8x4_c, av1_fwd_txfm2d_8x16_c, av1_fwd_txfm2d_16x8_c,
+ av1_fwd_txfm2d_16x32_c, av1_fwd_txfm2d_32x16_c, av1_fwd_txfm2d_32x64_c,
+ av1_fwd_txfm2d_64x32_c, av1_fwd_txfm2d_4x16_c, av1_fwd_txfm2d_16x4_c,
+ av1_fwd_txfm2d_8x32_c, av1_fwd_txfm2d_32x8_c, av1_fwd_txfm2d_16x64_c,
+ av1_fwd_txfm2d_64x16_c,
+};
+#else
+// Realtime-only variant: the slots that hold the 4x16, 8x32, 32x8, 16x64 and
+// 64x16 functions in the table above are nullptr here, so users of this table
+// must be prepared for null entries.
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+ av1_fwd_txfm2d_4x4_c,
+ av1_fwd_txfm2d_8x8_c,
+ av1_fwd_txfm2d_16x16_c,
+ av1_fwd_txfm2d_32x32_c,
+ av1_fwd_txfm2d_64x64_c,
+ av1_fwd_txfm2d_4x8_c,
+ av1_fwd_txfm2d_8x4_c,
+ av1_fwd_txfm2d_8x16_c,
+ av1_fwd_txfm2d_16x8_c,
+ av1_fwd_txfm2d_16x32_c,
+ av1_fwd_txfm2d_32x16_c,
+ av1_fwd_txfm2d_32x64_c,
+ av1_fwd_txfm2d_64x32_c,
+ nullptr,
+ av1_fwd_txfm2d_16x4_c,
+ nullptr,
+ nullptr,
+ nullptr,
+ nullptr,
+};
+#endif
+#endif
+
+// Table of C inverse 2-D transform-and-add functions with TX_SIZES_ALL slots;
+// unlike the forward table it has no build-dependent null entries.
+// NOTE(review): presumably indexed by TX_SIZE -- confirm against the enum.
+static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
+ av1_inv_txfm2d_add_4x4_c, av1_inv_txfm2d_add_8x8_c,
+ av1_inv_txfm2d_add_16x16_c, av1_inv_txfm2d_add_32x32_c,
+ av1_inv_txfm2d_add_64x64_c, av1_inv_txfm2d_add_4x8_c,
+ av1_inv_txfm2d_add_8x4_c, av1_inv_txfm2d_add_8x16_c,
+ av1_inv_txfm2d_add_16x8_c, av1_inv_txfm2d_add_16x32_c,
+ av1_inv_txfm2d_add_32x16_c, av1_inv_txfm2d_add_32x64_c,
+ av1_inv_txfm2d_add_64x32_c, av1_inv_txfm2d_add_4x16_c,
+ av1_inv_txfm2d_add_16x4_c, av1_inv_txfm2d_add_8x32_c,
+ av1_inv_txfm2d_add_32x8_c, av1_inv_txfm2d_add_16x64_c,
+ av1_inv_txfm2d_add_64x16_c,
+};
+
+// Number of entries in bd_arr, low_range_arr and high_range_arr.
+#define BD_NUM 3
+
+// Defined in the accompanying source file.
+extern int bd_arr[];
+extern int8_t low_range_arr[];
+extern int8_t high_range_arr[];
+
+// Verifies |stage_num| entries of |stage_range| against |low_range| and, after
+// adding |cos_bit|, against |high_range|, using GoogleTest assertions.
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+ const int8_t cos_bit, int low_range,
+ int high_range);
+} // namespace libaom_test
+#endif // AOM_TEST_AV1_TXFM_TEST_H_
diff --git a/third_party/aom/test/av1_wedge_utils_test.cc b/third_party/aom/test/av1_wedge_utils_test.cc
new file mode 100644
index 0000000000..1055ff35b2
--- /dev/null
+++ b/third_party/aom/test/av1_wedge_utils_test.cc
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/enums.h"
+
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+// Local copies of the blend-weight precision used by the reference code in
+// this file: MAX_MASK_VALUE is 1 << 6 = 64.
+// NOTE(review): presumably these mirror the library's wedge weight constants
+// -- confirm they stay in sync.
+#define WEDGE_WEIGHT_BITS 6
+#define MAX_MASK_VALUE (1 << (WEDGE_WEIGHT_BITS))
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+// 4095, the largest value representable in a signed 13-bit integer.
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - functionality
+//////////////////////////////////////////////////////////////////////////////
+
+// Fixture for the functional (non-SIMD) checks: provides a random generator
+// constructed from ACMRandom::DeterministicSeed() and an iteration count.
+class WedgeUtilsSSEFuncTest : public testing::Test {
+ protected:
+ WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ // Number of random trials each test in this fixture runs.
+ static const int kIterations = 1000;
+
+ ACMRandom rng_;
+};
+
+// Blends two residual arrays with per-sample weights: for each i in [0, N),
+// r[i] = ROUND_POWER_OF_TWO(m[i] * r0[i] + (MAX_MASK_VALUE - m[i]) * r1[i] - 1,
+//                           WEDGE_WEIGHT_BITS).
+static void equiv_blend_residuals(int16_t *r, const int16_t *r0,
+                                  const int16_t *r1, const uint8_t *m, int N) {
+  for (int i = 0; i < N; i++) {
+    const int32_t m0 = m[i];
+    const int32_t m1 = MAX_MASK_VALUE - m0;
+    // Keep the weighted sum in 32 bits. The weights add up to MAX_MASK_VALUE
+    // (64), so the sum leaves the int16_t range as soon as a residual
+    // magnitude exceeds 511; narrowing it before the rounding shift would
+    // silently wrap.
+    const int32_t R = m0 * r0[i] + m1 * r1[i];
+    // Note that this rounding is designed to match the result
+    // you would get when actually blending the 2 predictors and computing
+    // the residuals.
+    r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+  }
+}
+
+// Returns the sum over i in [0, N) of the squared blended residual, where the
+// blended residual is computed exactly as in equiv_blend_residuals():
+// ROUND_POWER_OF_TWO(m[i] * r0[i] + (MAX_MASK_VALUE - m[i]) * r1[i] - 1,
+//                    WEDGE_WEIGHT_BITS).
+static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1,
+                                         const uint8_t *m, int N) {
+  uint64_t acc = 0;
+  for (int i = 0; i < N; i++) {
+    const int32_t m0 = m[i];
+    const int32_t m1 = MAX_MASK_VALUE - m0;
+    // Keep the weighted sum in 32 bits; it only fits int16_t while residual
+    // magnitudes stay at or below 511 (weights add up to 64).
+    const int32_t R = m0 * r0[i] + m1 * r1[i];
+    const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+    acc += r * r;
+  }
+  return acc;
+}
+
+// Checks that blending two residuals with equiv_blend_residuals() gives the
+// same values as subtracting the blended predictor from the source, and that
+// equiv_sse_from_residuals() matches aom_sum_squares_i16() of that residual.
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
+ DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]);
+
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ // Random source samples and mask weights.
+ // NOTE(review): assumes rng_(k) yields a value in [0, k) -- confirm.
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ s[i] = rng_.Rand8();
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ // Block dimensions are powers of two (1 << (random + 3)).
+ const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int N = w * h;
+
+ // Two predictors: the source plus a small random offset, clamped to the
+ // uint8_t range.
+ for (int j = 0; j < N; j++) {
+ p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ }
+
+ // Blended predictor p from p0 / p1 and mask m, all with stride w.
+ aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0);
+
+ // Residuals of the source against each individual predictor.
+ aom_subtract_block(h, w, r0, w, s, w, p0, w);
+ aom_subtract_block(h, w, r1, w, s, w, p1, w);
+
+ // Reference: residual against the blended predictor. Test: blend of the
+ // two individual residuals.
+ aom_subtract_block(h, w, r_ref, w, s, w, p, w);
+ equiv_blend_residuals(r_tst, r0, r1, m, N);
+
+ for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]);
+
+ uint64_t ref_sse = aom_sum_squares_i16(r_ref, N);
+ uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N);
+
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+// Accumulates the squares of the unrounded weighted sums
+// m[i] * r0[i] + (MAX_MASK_VALUE - m[i]) * r1[i] for i in [0, N) and returns
+// the total rounded down by 2 * WEDGE_WEIGHT_BITS bits.
+static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1,
+                                   const uint8_t *m, int N) {
+  uint64_t sum_sq = 0;
+  int idx = 0;
+  while (idx < N) {
+    const int32_t w0 = m[idx];
+    const int32_t w1 = MAX_MASK_VALUE - w0;
+    const int32_t blended = w0 * r0[idx] + w1 * r1[idx];
+    sum_sq += blended * blended;
+    ++idx;
+  }
+  return ROUND_POWER_OF_TWO(sum_sq, 2 * WEDGE_WEIGHT_BITS);
+}
+
+// Checks av1_wedge_sse_from_residuals(r1, d, m, N) against the local
+// sse_from_residuals(r0, r1, m, N) reference, where r0 = r1 + d.
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ // r1 and d are drawn starting at 2 * INT8_MIN.
+ // NOTE(review): assumes rng_(k) yields a value in [0, k), which would make
+ // the upper end 2 * INT8_MAX -- confirm.
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ // Random element count, always a multiple of 64.
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ for (int i = 0; i < N; i++) r0[i] = r1[i] + d[i];
+
+ const uint64_t ref_res = sse_from_residuals(r0, r1, m, N);
+ const uint64_t tst_res = av1_wedge_sse_from_residuals(r1, d, m, N);
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - optimizations
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the av1_wedge_sse_from_residuals implementations, and
+// the (reference, tested) function pair used as the test parameter.
+typedef uint64_t (*FSSE)(const int16_t *r1, const int16_t *d, const uint8_t *m,
+ int N);
+typedef libaom_test::FuncParam<FSSE> TestFuncsFSSE;
+
+// Parameterized fixture comparing params_.tst_func with params_.ref_func.
+class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest<FSSE> {
+ protected:
+ // Number of random trials per test.
+ static const int kIterations = 10000;
+};
+// The suite may legitimately have no instantiation when none of the guarded
+// INSTANTIATE_TEST_SUITE_P blocks in this file is compiled in.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSSEOptTest);
+
+// Feeds the reference and the tested function the same random inputs and
+// requires identical results.
+TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ // NOTE(review): assumes rng_(k) yields a value in [0, k), which would keep
+ // r1 and d within [-kInt13Max, kInt13Max] -- confirm.
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ d[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ // Random element count, always a multiple of 64.
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+// Same comparison as RandomValues, but with r1 and d filled uniformly with
+// +kInt13Max or -kInt13Max (sign chosen at random per buffer) and every mask
+// entry set to MAX_MASK_VALUE.
+TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = -kInt13Max;
+ }
+
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = -kInt13Max;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ // Random element count, always a multiple of 64.
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sign_from_residuals
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the av1_wedge_sign_from_residuals implementations, and
+// the (reference, tested) function pair used as the test parameter.
+typedef int8_t (*FSign)(const int16_t *ds, const uint8_t *m, int N,
+ int64_t limit);
+typedef libaom_test::FuncParam<FSign> TestFuncsFSign;
+
+// Parameterized fixture comparing params_.tst_func with params_.ref_func.
+class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
+ protected:
+ // Number of random trials per test.
+ static const int kIterations = 10000;
+ // NOTE(review): 8196 is not a power of two or a multiple of 64; the tests
+ // only use it through integer division by 64 -- confirm whether 8192 was
+ // intended.
+ static const int kMaxSize = 8196; // Size limited by SIMD implementation.
+};
+// The suite may legitimately have no instantiation when none of the guarded
+// INSTANTIATE_TEST_SUITE_P blocks in this file is compiled in.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsSignOptTest);
+
+// Feeds the reference and the tested sign function the same random inputs
+// and requires identical results.
+TEST_P(WedgeUtilsSignOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ // NOTE(review): assumes rng_(k) yields a value in [0, k) -- confirm.
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ // Random element count, a multiple of 64 bounded through kMaxSize and
+ // MAX_SB_SQUARE.
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
+ // limit = (sum(r0^2) - sum(r1^2)) * (1 << WEDGE_WEIGHT_BITS) / 2.
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ // Per-sample difference of squares, saturated to the int16_t range.
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+// Same comparison as RandomValues, but with one of r0 / r1 set uniformly to
+// +/-kInt13Max and the other to 0 (one of four combinations picked at random),
+// and every mask entry set to MAX_MASK_VALUE.
+TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(4)) {
+ case 0:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = kInt13Max;
+ }
+ break;
+ case 1:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ case 2:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = -kInt13Max;
+ }
+ break;
+ default:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = -kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ // Random element count, a multiple of 64 bounded through kMaxSize and
+ // MAX_SB_SQUARE.
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
+ // limit = (sum(r0^2) - sum(r1^2)) * (1 << WEDGE_WEIGHT_BITS) / 2.
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ // Per-sample difference of squares, saturated to the int16_t range.
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_compute_delta_squares
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the av1_wedge_compute_delta_squares implementations
+// (output |d|, inputs |a| and |b|, element count |N|), and the
+// (reference, tested) function pair used as the test parameter.
+typedef void (*FDS)(int16_t *d, const int16_t *a, const int16_t *b, int N);
+typedef libaom_test::FuncParam<FDS> TestFuncsFDS;
+
+// Parameterized fixture comparing params_.tst_func with params_.ref_func.
+class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest<FDS> {
+ protected:
+ // Number of random trials per test.
+ static const int kIterations = 10000;
+};
+// The suite may legitimately have no instantiation when none of the guarded
+// INSTANTIATE_TEST_SUITE_P blocks in this file is compiled in.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(WedgeUtilsDeltaSquaresOptTest);
+
+// Runs the reference and the tested delta-squares function on the same random
+// inputs and requires the complete output buffers to match, including the
+// sentinel-filled tail beyond the N elements the functions were asked to
+// produce.
+TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
+  DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      a[i] = rng_.Rand16Signed();
+      b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX;
+    }
+
+    // Random element count, always a multiple of 64.
+    const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+    // Pre-fill both outputs with the same sentinel so that a write past N by
+    // either implementation shows up in the comparison below. An element-wise
+    // fill is used because memset() converts its value argument to a single
+    // byte, so memset(..., INT16_MAX, ...) would store 0xFFFF (-1) rather
+    // than INT16_MAX.
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      d_ref[i] = INT16_MAX;
+      d_tst[i] = INT16_MAX;
+    }
+
+    params_.ref_func(d_ref, a, b, N);
+    API_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]);
+  }
+}
+
+// SSE2 builds: each suite compares the *_sse2 function against the *_c one.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+ av1_wedge_sse_from_residuals_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+ av1_wedge_sign_from_residuals_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+ av1_wedge_compute_delta_squares_sse2)));
+#endif // HAVE_SSE2
+
+// NEON builds: each suite compares the *_neon function against the *_c one.
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+ av1_wedge_sse_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+ av1_wedge_sign_from_residuals_neon)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+ av1_wedge_compute_delta_squares_neon)));
+#endif // HAVE_NEON
+
+// AVX2 builds: note that the reference here is the *_sse2 function, not the
+// *_c one, so these suites check AVX2 against SSE2.
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2,
+ av1_wedge_sse_from_residuals_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2,
+ av1_wedge_sign_from_residuals_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2,
+ av1_wedge_compute_delta_squares_avx2)));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/avg_test.cc b/third_party/aom/test/avg_test.cc
new file mode 100644
index 0000000000..6f4c2ff332
--- /dev/null
+++ b/third_party/aom/test/avg_test.cc
@@ -0,0 +1,1150 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+// Common fixture base for the tests in this file. SetUp() allocates a zeroed
+// kDataBlockWidth x kDataBlockHeight buffer of Pixel; the class also offers
+// reference averages and constant / random fill helpers.
+template <typename Pixel>
+class AverageTestBase : public ::testing::Test {
+ public:
+  // |bit_depth| limits the values produced by FillRandom().
+  AverageTestBase(int width, int height, int bit_depth = 8)
+      : width_(width), height_(height), source_data_(nullptr),
+        source_stride_(0), bit_depth_(bit_depth) {}
+
+  // Frees the buffer allocated by SetUp().
+  void TearDown() override {
+    aom_free(source_data_);
+    source_data_ = nullptr;
+  }
+
+ protected:
+  // The buffer holds up to four 64x64 blocks, with a stride of up to 128.
+  static const int kDataAlignment = 16;
+  static const int kDataBlockWidth = 128;
+  static const int kDataBlockHeight = 128;
+  static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
+
+  void SetUp() override {
+    const testing::TestInfo *const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    // Skip the speed test for C code as the baseline uses the same function.
+    if (std::string(test_info->test_suite_name()).find("C/") == 0 &&
+        std::string(test_info->name()).find("DISABLED_Speed") !=
+            std::string::npos) {
+      GTEST_SKIP();
+    }
+
+    source_data_ = static_cast<Pixel *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+    memset(source_data_, 0, kDataBlockSize * sizeof(source_data_[0]));
+    // Round the width up to a multiple of 32 to get the row stride.
+    source_stride_ = (width_ + 31) & ~31;
+    // bit_depth_ deliberately keeps the value passed to the constructor.
+    // Resetting it to 8 here would make FillRandom() produce 8-bit values
+    // even for fixtures constructed with a higher bit depth.
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Rounded mean of the 8x8 block at |source| with row pitch |pitch|.
+  static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h) {
+      for (int w = 0; w < 8; ++w) average += source[h * pitch + w];
+    }
+    return (average + 32) >> 6;
+  }
+
+  // Rounded means of the four 8x8 blocks that tile the 16x16 area whose
+  // top-left corner is (x16_idx, y16_idx); avg[k] covers the block at column
+  // offset (k & 1) * 8 and row offset (k >> 1) * 8.
+  static void ReferenceAverage8x8_quad(const uint8_t *source, int pitch,
+                                       int x16_idx, int y16_idx, int *avg) {
+    for (int k = 0; k < 4; k++) {
+      int average = 0;
+      int x8_idx = x16_idx + ((k & 1) << 3);
+      int y8_idx = y16_idx + ((k >> 1) << 3);
+      for (int h = 0; h < 8; ++h) {
+        for (int w = 0; w < 8; ++w)
+          average += source[(h + y8_idx) * pitch + w + x8_idx];
+      }
+      avg[k] = (average + 32) >> 6;
+    }
+  }
+
+  // Rounded mean of the 4x4 block at |source| with row pitch |pitch|.
+  static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h) {
+      for (int w = 0; w < 4; ++w) average += source[h * pitch + w];
+    }
+    return (average + 8) >> 4;
+  }
+
+  // Sets the first width_ * height_ buffer elements to |fill_constant|.
+  void FillConstant(Pixel fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+      source_data_[i] = fill_constant;
+    }
+  }
+
+  // Sets the first width_ * height_ buffer elements to random values masked
+  // to bit_depth_ bits.
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+      source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1);
+    }
+  }
+
+  int width_, height_;
+  Pixel *source_data_;
+  int source_stride_;
+  int bit_depth_;
+
+  ACMRandom rnd_;
+};
+// Signature of the block-average functions under test: source pointer and
+// row pitch in, average out.
+typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);
+
+// Arguments: width, height, bit_depth, buffer start offset, block size, avg
+// function.
+typedef std::tuple<int, int, int, int, int, AverageFunction> AvgFunc;
+
+// Parameterized fixture that compares an averaging function (tuple element 5)
+// against the local reference averages, and accumulates the time spent in
+// the C baseline and in the function under test.
+template <typename Pixel>
+class AverageTest : public AverageTestBase<Pixel>,
+                    public ::testing::WithParamInterface<AvgFunc> {
+ public:
+  AverageTest()
+      : AverageTestBase<Pixel>(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2)) {}
+
+ protected:
+  using AverageTestBase<Pixel>::source_data_;
+  using AverageTestBase<Pixel>::source_stride_;
+  using AverageTestBase<Pixel>::ReferenceAverage8x8;
+  using AverageTestBase<Pixel>::ReferenceAverage4x4;
+  using AverageTestBase<Pixel>::FillConstant;
+  using AverageTestBase<Pixel>::FillRandom;
+
+  // Computes the expected average for the configured block size (8 or 4; any
+  // other size leaves it at 0), times the C baseline and the function under
+  // test, and checks the latter's result.
+  void CheckAverages() {
+    const int block_size = GET_PARAM(4);
+    unsigned int expected = 0;
+
+    // The reference frame, but not the source frame, may be unaligned for
+    // certain types of searches.
+    const Pixel *const src = source_data_ + GET_PARAM(3);
+    if (block_size == 8) {
+      expected = ReferenceAverage8x8(src, source_stride_);
+    } else if (block_size == 4) {
+      expected = ReferenceAverage4x4(src, source_stride_);
+    }
+
+    aom_usec_timer timer;
+    // Initialised so that the comparison below never reads an indeterminate
+    // value: the 16-bit branch assigns nothing when CONFIG_AV1_HIGHBITDEPTH
+    // is disabled.
+    unsigned int actual = 0;
+    if (sizeof(Pixel) == 2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      AverageFunction avg_c =
+          (block_size == 8) ? aom_highbd_avg_8x8_c : aom_highbd_avg_4x4_c;
+      // To avoid differences in optimization with the local Reference*()
+      // functions the C implementation is used as a baseline.
+      aom_usec_timer_start(&timer);
+      avg_c(CONVERT_TO_BYTEPTR(src), source_stride_);
+      aom_usec_timer_mark(&timer);
+      ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+      AverageFunction avg_opt = GET_PARAM(5);
+      API_REGISTER_STATE_CHECK(
+          aom_usec_timer_start(&timer);
+          actual = avg_opt(CONVERT_TO_BYTEPTR(src), source_stride_);
+          aom_usec_timer_mark(&timer));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    } else {
+      ASSERT_EQ(sizeof(Pixel), 1u);
+
+      AverageFunction avg_c = (block_size == 8) ? aom_avg_8x8_c : aom_avg_4x4_c;
+      aom_usec_timer_start(&timer);
+      avg_c(reinterpret_cast<const uint8_t *>(src), source_stride_);
+      aom_usec_timer_mark(&timer);
+      ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+      AverageFunction avg_opt = GET_PARAM(5);
+      API_REGISTER_STATE_CHECK(
+          aom_usec_timer_start(&timer);
+          actual =
+              avg_opt(reinterpret_cast<const uint8_t *>(src), source_stride_);
+          aom_usec_timer_mark(&timer));
+    }
+    opt_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+    EXPECT_EQ(expected, actual);
+  }
+
+  // Fills the block with |value| and checks the average once.
+  void TestConstantValue(Pixel value) {
+    FillConstant(value);
+    CheckAverages();
+  }
+
+  // Refills the block with random data and checks the average, |iterations|
+  // times.
+  void TestRandom(int iterations = 1000) {
+    for (int i = 0; i < iterations; i++) {
+      FillRandom();
+      CheckAverages();
+    }
+  }
+
+  // Prints the accumulated baseline / tested timings and their ratio.
+  void PrintTimingStats() const {
+    printf(
+        "block_size = %d \t ref_time = %d \t simd_time = %d \t Gain = %4.2f\n",
+        GET_PARAM(4), static_cast<int>(ref_elapsed_time_),
+        static_cast<int>(opt_elapsed_time_),
+        (static_cast<float>(ref_elapsed_time_) /
+         static_cast<float>(opt_elapsed_time_)));
+  }
+
+  int64_t ref_elapsed_time_ = 0;
+  int64_t opt_elapsed_time_ = 0;
+};
+
+// Signature of the quad-average functions under test: source pointer, row
+// pitch and an (x_idx, y_idx) position in, four averages out through |avg|.
+typedef void (*AverageFunction_8x8_quad)(const uint8_t *s, int pitch, int x_idx,
+ int y_idx, int *avg);
+
+// Arguments: width, height, bit_depth, buffer start offset, block size, avg
+// function.
+typedef std::tuple<int, int, int, int, int, AverageFunction_8x8_quad>
+ AvgFunc_8x8_quad;
+
+// Parameterized fixture for the 8x8-quad average: compares the function under
+// test (tuple element 5) against both the local reference and
+// aom_avg_8x8_quad_c, and accumulates timings for the latter two calls.
+template <typename Pixel>
+class AverageTest_8x8_quad
+ : public AverageTestBase<Pixel>,
+ public ::testing::WithParamInterface<AvgFunc_8x8_quad> {
+ public:
+ AverageTest_8x8_quad()
+ : AverageTestBase<Pixel>(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2)) {}
+
+ protected:
+ using AverageTestBase<Pixel>::source_data_;
+ using AverageTestBase<Pixel>::source_stride_;
+ using AverageTestBase<Pixel>::ReferenceAverage8x8_quad;
+ using AverageTestBase<Pixel>::FillConstant;
+ using AverageTestBase<Pixel>::FillRandom;
+
+ // Runs the C function and the function under test |iterations| times each at
+ // position (x16_idx, y16_idx) and checks all four averages. Only valid for
+ // one-byte pixels.
+ void CheckAveragesAt(int iterations, int x16_idx, int y16_idx) {
+ ASSERT_EQ(sizeof(Pixel), 1u);
+ // The block-size parameter is read but not used by this fixture.
+ const int block_size = GET_PARAM(4);
+ (void)block_size;
+ int expected[4] = { 0 };
+
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const Pixel *const src = source_data_ + GET_PARAM(3);
+ ReferenceAverage8x8_quad(src, source_stride_, x16_idx, y16_idx, expected);
+
+ aom_usec_timer timer;
+ int expected_c[4] = { 0 };
+ int actual[4] = { 0 };
+ AverageFunction_8x8_quad avg_c = aom_avg_8x8_quad_c;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < iterations; i++) {
+ avg_c(reinterpret_cast<const uint8_t *>(src), source_stride_, x16_idx,
+ y16_idx, expected_c);
+ }
+ aom_usec_timer_mark(&timer);
+ ref_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+ AverageFunction_8x8_quad avg_opt = GET_PARAM(5);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < iterations; i++) {
+ avg_opt(reinterpret_cast<const uint8_t *>(src), source_stride_, x16_idx,
+ y16_idx, actual);
+ }
+ aom_usec_timer_mark(&timer);
+ opt_elapsed_time_ += aom_usec_timer_elapsed(&timer);
+
+ for (int k = 0; k < 4; k++) {
+ EXPECT_EQ(expected[k], actual[k]);
+ EXPECT_EQ(expected_c[k], actual[k]);
+ }
+
+ // Print scaling information only when Speed test is called.
+ if (iterations > 1) {
+ printf("ref_time = %d \t simd_time = %d \t Gain = %4.2f\n",
+ static_cast<int>(ref_elapsed_time_),
+ static_cast<int>(opt_elapsed_time_),
+ (static_cast<float>(ref_elapsed_time_) /
+ static_cast<float>(opt_elapsed_time_)));
+ }
+ }
+
+ // Checks a single iteration at every even (x16_idx, y16_idx) pair below
+ // kDataBlockWidth / 8 and kDataBlockHeight / 8.
+ void CheckAverages() {
+ for (int x16_idx = 0; x16_idx < this->kDataBlockWidth / 8; x16_idx += 2)
+ for (int y16_idx = 0; y16_idx < this->kDataBlockHeight / 8; y16_idx += 2)
+ CheckAveragesAt(1, x16_idx, y16_idx);
+ }
+
+ // Fills the block with |value| and checks all positions.
+ void TestConstantValue(Pixel value) {
+ FillConstant(value);
+ CheckAverages();
+ }
+
+ // Fills the block with random data once and checks all positions.
+ void TestRandom() {
+ FillRandom();
+ CheckAverages();
+ }
+
+ // Times one million iterations at position (0, 0) on random data.
+ void TestSpeed() {
+ FillRandom();
+ CheckAveragesAt(1000000, 0, 0);
+ }
+
+ int64_t ref_elapsed_time_ = 0;
+ int64_t opt_elapsed_time_ = 0;
+};
+
+// 8-bit instantiation of AverageTest and its test cases: all-zero input,
+// all-255 input, random input, and a disabled-by-default timing run.
+using AverageTest8bpp = AverageTest<uint8_t>;
+
+TEST_P(AverageTest8bpp, MinValue) { TestConstantValue(0); }
+
+TEST_P(AverageTest8bpp, MaxValue) { TestConstantValue(255); }
+
+TEST_P(AverageTest8bpp, Random) { TestRandom(); }
+
+TEST_P(AverageTest8bpp, DISABLED_Speed) {
+ TestRandom(1000000);
+ PrintTimingStats();
+}
+
+// 8-bit instantiation of AverageTest_8x8_quad and its test cases: all-zero
+// input, all-255 input, random input, and a disabled-by-default timing run.
+using AvgTest8bpp_avg_8x8_quad = AverageTest_8x8_quad<uint8_t>;
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, MinValue) { TestConstantValue(0); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, MaxValue) { TestConstantValue(255); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, Random) { TestRandom(); }
+
+TEST_P(AvgTest8bpp_avg_8x8_quad, DISABLED_Speed) { TestSpeed(); }
+
+// 16-bit-pixel instantiation of AverageTest, compiled only for high-bit-depth
+// builds. The constant tests use 0, 1023 (10-bit maximum) and 4095 (12-bit
+// maximum).
+#if CONFIG_AV1_HIGHBITDEPTH
+using AverageTestHbd = AverageTest<uint16_t>;
+
+TEST_P(AverageTestHbd, MinValue) { TestConstantValue(0); }
+
+TEST_P(AverageTestHbd, MaxValue10bit) { TestConstantValue(1023); }
+TEST_P(AverageTestHbd, MaxValue12bit) { TestConstantValue(4095); }
+
+TEST_P(AverageTestHbd, Random) { TestRandom(); }
+
+TEST_P(AverageTestHbd, DISABLED_Speed) {
+ TestRandom(1000000);
+ PrintTimingStats();
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Signature of the row-projection functions under test: output buffer |hbuf|,
+// source |ref| with |ref_stride|, block dimensions and a |norm_factor|.
+// NOTE(review): |norm_factor| is presumably a right-shift amount applied to
+// the sums -- confirm against the C implementation.
+typedef void (*IntProRowFunc)(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
+
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+// Fixture comparing an optimized row-projection function (tuple element 2)
+// with the C version (tuple element 3) on a width_ x height_ block whose
+// stride equals its width.
+class IntProRowTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  // norm_factor_ starts at 0 so that it holds a defined value even when
+  // height_ is not one of the sizes handled by set_norm_factor().
+  IntProRowTest()
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), hbuf_asm_(nullptr),
+        hbuf_c_(nullptr), norm_factor_(0) {
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
+  }
+
+  // Picks the normalization factor from the block height: 128 -> 6, 64 -> 5,
+  // 32 -> 4, 16 -> 3. Any other height leaves norm_factor_ unchanged.
+  void set_norm_factor() {
+    if (height_ == 128)
+      norm_factor_ = 6;
+    else if (height_ == 64)
+      norm_factor_ = 5;
+    else if (height_ == 32)
+      norm_factor_ = 4;
+    else if (height_ == 16)
+      norm_factor_ = 3;
+  }
+
+ protected:
+  // Allocates the source buffer and the two width_-element output buffers.
+  // Unlike the base class SetUp(), the source buffer is not zeroed here, so
+  // tests are expected to fill it before running a comparison.
+  void SetUp() override {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+
+    hbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * width_));
+    ASSERT_NE(hbuf_asm_, nullptr);
+    hbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * width_));
+    ASSERT_NE(hbuf_c_, nullptr);
+  }
+
+  // Frees all three buffers.
+  void TearDown() override {
+    aom_free(source_data_);
+    source_data_ = nullptr;
+    aom_free(hbuf_c_);
+    hbuf_c_ = nullptr;
+    aom_free(hbuf_asm_);
+    hbuf_asm_ = nullptr;
+  }
+
+  // Runs both functions once and requires byte-identical output buffers.
+  void RunComparison() {
+    set_norm_factor();
+    API_REGISTER_STATE_CHECK(
+        c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_));
+    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
+        << "Output mismatch\n";
+  }
+
+  // Times numIter calls of each function, prints the timings and their
+  // ratio, and finally checks that the outputs still match.
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    set_norm_factor();
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      c_func_(hbuf_c_, source_data_, width_, width_, height_, norm_factor_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      asm_func_(hbuf_asm_, source_data_, width_, width_, height_, norm_factor_);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * width_))
+        << "Output mismatch\n";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+  int norm_factor_;
+};
+// The suite may legitimately have no instantiation in some build
+// configurations.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
+
+typedef void (*IntProColFunc)(int16_t *vbuf, uint8_t const *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor);
+
+// Params: width, height, asm function, c function.
+typedef std::tuple<int, int, IntProColFunc, IntProColFunc> IntProColParam;
+
+// Fixture comparing an optimized column-projection function against the C
+// reference.
+// Params: width, height, asm function, c function.
+// width_, height_, source_data_, kDataAlignment and kDataBlockSize are
+// provided by AverageTestBase (declared outside this excerpt).
+class IntProColTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest()
+      : AverageTestBase(GET_PARAM(0), GET_PARAM(1)), vbuf_asm_(nullptr),
+        vbuf_c_(nullptr) {
+    asm_func_ = GET_PARAM(2);
+    c_func_ = GET_PARAM(3);
+  }
+
+ protected:
+  void SetUp() override {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+
+    // The comparisons below read height_ entries from each output buffer, so
+    // the buffers are sized by height_. Sizing them by width_ only worked
+    // because every instantiated block is square; a taller-than-wide block
+    // would be read (and presumably written) out of bounds.
+    vbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_asm_) * height_));
+    ASSERT_NE(vbuf_asm_, nullptr);
+    vbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*vbuf_c_) * height_));
+    ASSERT_NE(vbuf_c_, nullptr);
+  }
+
+  void TearDown() override {
+    aom_free(source_data_);
+    source_data_ = nullptr;
+    aom_free(vbuf_c_);
+    vbuf_c_ = nullptr;
+    aom_free(vbuf_asm_);
+    vbuf_asm_ = nullptr;
+  }
+
+  // Runs both implementations once on the current source block and expects
+  // identical output.
+  void RunComparison() {
+    // Normalization shift passed to both functions; derived from the width.
+    const int norm_factor = 3 + (width_ >> 5);
+    API_REGISTER_STATE_CHECK(
+        c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor));
+    API_REGISTER_STATE_CHECK(asm_func_(vbuf_asm_, source_data_, width_, width_,
+                                       height_, norm_factor));
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
+  }
+
+  // Times numIter calls of each implementation, prints the timings and their
+  // ratio, then checks that the outputs still match.
+  void RunSpeedTest() {
+    const int numIter = 5000000;
+    printf("Blk_Size=%dx%d: number of iteration is %d \n", width_, height_,
+           numIter);
+    const int norm_factor = 3 + (width_ >> 5);
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      c_func_(vbuf_c_, source_data_, width_, width_, height_, norm_factor);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    aom_usec_timer asm_timer_;
+    aom_usec_timer_start(&asm_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      asm_func_(vbuf_asm_, source_data_, width_, width_, height_, norm_factor);
+    }
+    aom_usec_timer_mark(&asm_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int asm_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&asm_timer_));
+
+    printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+           asm_sum_time,
+           (static_cast<float>(c_sum_time) / static_cast<float>(asm_sum_time)));
+
+    EXPECT_EQ(0, memcmp(vbuf_c_, vbuf_asm_, sizeof(*vbuf_c_) * height_))
+        << "Output mismatch\n";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t *vbuf_asm_;
+  int16_t *vbuf_c_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
+
+// Row projection, constant fill of 0: C and optimized outputs must match.
+// FillConstant/FillRandom are not defined in this part of the file; presumably
+// they populate the fixture's source block.
+TEST_P(IntProRowTest, MinValue) {
+ FillConstant(0);
+ RunComparison();
+}
+
+// Row projection, constant fill of 255.
+TEST_P(IntProRowTest, MaxValue) {
+ FillConstant(255);
+ RunComparison();
+}
+
+// Row projection on a randomly filled source block.
+TEST_P(IntProRowTest, Random) {
+ FillRandom();
+ RunComparison();
+}
+
+// Benchmark; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(IntProRowTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+
+// Column projection, constant fill of 0: C and optimized outputs must match.
+// FillConstant/FillRandom are not defined in this part of the file; presumably
+// they populate the fixture's source block.
+TEST_P(IntProColTest, MinValue) {
+ FillConstant(0);
+ RunComparison();
+}
+
+// Column projection, constant fill of 255.
+TEST_P(IntProColTest, MaxValue) {
+ FillConstant(255);
+ RunComparison();
+}
+
+// Column projection on a randomly filled source block.
+TEST_P(IntProColTest, Random) {
+ FillRandom();
+ RunComparison();
+}
+
+// Benchmark; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(IntProColTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+// Base fixture for the vector-variance tests. Owns two int16_t vectors of
+// `4 << bwl` entries (reference and source) plus a deterministically seeded
+// random generator used to fill them.
+class VectorVarTestBase : public ::testing::Test {
+ public:
+  explicit VectorVarTestBase(int bwl) : m_bwl(bwl) {}
+  VectorVarTestBase() = default;
+  ~VectorVarTestBase() override = default;
+
+ protected:
+  static const int kDataAlignment = 16;
+
+  void SetUp() override {
+    width = 4 << m_bwl;
+
+    ref_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(ref_vector[0])));
+    ASSERT_NE(ref_vector, nullptr);
+    src_vector = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, width * sizeof(src_vector[0])));
+    ASSERT_NE(src_vector, nullptr);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+  void TearDown() override {
+    aom_free(ref_vector);
+    ref_vector = nullptr;
+    aom_free(src_vector);
+    src_vector = nullptr;
+  }
+
+  // Sets every entry of the reference / source vector to the given constants.
+  void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] = fill_constant_ref;
+      src_vector[i] = fill_constant_src;
+    }
+  }
+
+  // Fills both vectors with random values in [0, max_range).
+  void FillRandom() {
+    for (int i = 0; i < width; ++i) {
+      ref_vector[i] =
+          rnd_.Rand16() % max_range;  // acc. aom_vector_var_c brief.
+      src_vector[i] = rnd_.Rand16() % max_range;
+    }
+  }
+
+  // Default member initializers keep the fixture well defined when it is
+  // default-constructed, and keep TearDown() safe if SetUp() aborts on its
+  // first allocation check before src_vector has been assigned.
+  int width = 0;
+  int m_bwl = 0;
+  int16_t *ref_vector = nullptr;
+  int16_t *src_vector = nullptr;
+  ACMRandom rnd_;
+
+  static const int max_range = 510;
+  static const int num_random_cmp = 50;
+};
+
+typedef int (*VectorVarFunc)(const int16_t *ref, const int16_t *src,
+ const int bwl);
+
+typedef std::tuple<int, VectorVarFunc, VectorVarFunc> VecVarFunc;
+
+// Parameterized fixture. Params: bwl, C reference function, optimized
+// function.
+class VectorVarTest : public VectorVarTestBase,
+                      public ::testing::WithParamInterface<VecVarFunc> {
+ public:
+  VectorVarTest() : VectorVarTestBase(GET_PARAM(0)) {
+    c_func = GET_PARAM(1);
+    simd_func = GET_PARAM(2);
+  }
+
+ protected:
+  // Each helper evaluates one implementation on the fixture's current vectors.
+  int calcVarC() {
+    const int var = c_func(ref_vector, src_vector, m_bwl);
+    return var;
+  }
+  int calcVarSIMD() {
+    const int var = simd_func(ref_vector, src_vector, m_bwl);
+    return var;
+  }
+
+  VectorVarFunc c_func;
+  VectorVarFunc simd_func;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VectorVarTest);
+
+// Largest difference: reference all zero, source all max_range.
+TEST_P(VectorVarTest, MaxVar) {
+  FillConstant(0, max_range);
+  const int expected = calcVarC();
+  const int actual = calcVarSIMD();
+  ASSERT_EQ(expected, actual);
+}
+// Same magnitude with the roles of reference and source swapped.
+TEST_P(VectorVarTest, MaxVarRev) {
+  FillConstant(max_range, 0);
+  const int expected = calcVarC();
+  const int actual = calcVarSIMD();
+  ASSERT_EQ(expected, actual);
+}
+// Identical all-zero vectors.
+TEST_P(VectorVarTest, ZeroDiff) {
+  FillConstant(0, 0);
+  const int expected = calcVarC();
+  const int actual = calcVarSIMD();
+  ASSERT_EQ(expected, actual);
+}
+// Identical vectors at the top of the value range.
+TEST_P(VectorVarTest, ZeroDiff2) {
+  FillConstant(max_range, max_range);
+  const int expected = calcVarC();
+  const int actual = calcVarSIMD();
+  ASSERT_EQ(expected, actual);
+}
+// Constant but different values in the two vectors.
+TEST_P(VectorVarTest, Constant) {
+  FillConstant(30, 90);
+  const int expected = calcVarC();
+  const int actual = calcVarSIMD();
+  ASSERT_EQ(expected, actual);
+}
+// Compares both implementations on num_random_cmp independent random fills.
+TEST_P(VectorVarTest, Random) {
+  int remaining = num_random_cmp;
+  while (remaining-- > 0) {
+    FillRandom();
+    const int expected = calcVarC();
+    const int actual = calcVarSIMD();
+    ASSERT_EQ(expected, actual);
+  }
+}
+// Benchmark: times numIter calls of the C and the optimized implementation on
+// one random input, prints the timings and their ratio, and checks that both
+// the last result and the running sums agree.
+TEST_P(VectorVarTest, DISABLED_Speed) {
+  FillRandom();
+  const int numIter = 5000000;
+  printf("Width = %d number of iteration is %d \n", width, numIter);
+
+  // The running sums are 64-bit: adding an int result five million times can
+  // exceed INT_MAX, and signed overflow of an int accumulator is undefined.
+  int64_t sum_c_var = 0;
+  int c_var = 0;
+
+  aom_usec_timer c_timer_;
+  aom_usec_timer_start(&c_timer_);
+  for (int i = 0; i < numIter; i++) {
+    c_var = calcVarC();
+    sum_c_var += c_var;
+  }
+  aom_usec_timer_mark(&c_timer_);
+
+  int simd_var = 0;
+  int64_t sum_simd_var = 0;
+  aom_usec_timer simd_timer_;
+  aom_usec_timer_start(&simd_timer_);
+  for (int i = 0; i < numIter; i++) {
+    simd_var = calcVarSIMD();
+    sum_simd_var += simd_var;
+  }
+  aom_usec_timer_mark(&simd_timer_);
+
+  const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+  const int simd_sum_time =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+  printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+         simd_sum_time,
+         (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+  EXPECT_EQ(c_var, simd_var) << "Output mismatch \n";
+  EXPECT_EQ(sum_c_var, sum_simd_var) << "Output mismatch \n";
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AverageTest8bpp,
+ ::testing::Values(make_tuple(16, 16, 8, 1, 8, &aom_avg_8x8_c),
+ make_tuple(16, 16, 8, 1, 4, &aom_avg_4x4_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AvgTest8bpp_avg_8x8_quad,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_c),
+ make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_c),
+ make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AverageTest8bpp,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 8, &aom_avg_8x8_sse2),
+ make_tuple(16, 16, 8, 5, 8, &aom_avg_8x8_sse2),
+ make_tuple(32, 32, 8, 15, 8, &aom_avg_8x8_sse2),
+ make_tuple(16, 16, 8, 0, 4, &aom_avg_4x4_sse2),
+ make_tuple(16, 16, 8, 5, 4, &aom_avg_4x4_sse2),
+ make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AvgTest8bpp_avg_8x8_quad,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_sse2),
+ make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_sse2),
+ make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_sse2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_sse2, &aom_int_pro_col_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AvgTest8bpp_avg_8x8_quad,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_avx2),
+ make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_avx2),
+ make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_avx2, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_avx2, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_avx2, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_avx2, &aom_int_pro_col_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AverageTest8bpp,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 8, &aom_avg_8x8_neon),
+ make_tuple(16, 16, 8, 5, 8, &aom_avg_8x8_neon),
+ make_tuple(32, 32, 8, 15, 8, &aom_avg_8x8_neon),
+ make_tuple(16, 16, 8, 0, 4, &aom_avg_4x4_neon),
+ make_tuple(16, 16, 8, 5, 4, &aom_avg_4x4_neon),
+ make_tuple(32, 32, 8, 15, 4, &aom_avg_4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProRowTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(32, 32, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(64, 64, &aom_int_pro_row_neon, &aom_int_pro_row_c),
+ make_tuple(128, 128, &aom_int_pro_row_neon, &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProColTest,
+ ::testing::Values(
+ make_tuple(16, 16, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(32, 32, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(64, 64, &aom_int_pro_col_neon, &aom_int_pro_col_c),
+ make_tuple(128, 128, &aom_int_pro_col_neon, &aom_int_pro_col_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AvgTest8bpp_avg_8x8_quad,
+ ::testing::Values(make_tuple(16, 16, 8, 0, 16, &aom_avg_8x8_quad_neon),
+ make_tuple(32, 32, 8, 16, 16, &aom_avg_8x8_quad_neon),
+ make_tuple(32, 32, 8, 8, 16, &aom_avg_8x8_quad_neon)));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ C, AverageTestHbd,
+ ::testing::Values(make_tuple(16, 16, 10, 1, 8, &aom_highbd_avg_8x8_c),
+ make_tuple(16, 16, 10, 1, 4, &aom_highbd_avg_4x4_c),
+ make_tuple(16, 16, 12, 1, 8, &aom_highbd_avg_8x8_c),
+ make_tuple(16, 16, 12, 1, 4, &aom_highbd_avg_4x4_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AverageTestHbd,
+ ::testing::Values(make_tuple(16, 16, 10, 0, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(16, 16, 10, 5, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(32, 32, 10, 15, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(16, 16, 12, 0, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(16, 16, 12, 5, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(32, 32, 12, 15, 4, &aom_highbd_avg_4x4_neon),
+ make_tuple(16, 16, 10, 0, 8, &aom_highbd_avg_8x8_neon),
+ make_tuple(16, 16, 10, 5, 8, &aom_highbd_avg_8x8_neon),
+ make_tuple(32, 32, 10, 15, 8, &aom_highbd_avg_8x8_neon),
+ make_tuple(16, 16, 12, 0, 8, &aom_highbd_avg_8x8_neon),
+ make_tuple(16, 16, 12, 5, 8, &aom_highbd_avg_8x8_neon),
+ make_tuple(32, 32, 12, 15, 8, &aom_highbd_avg_8x8_neon)));
+#endif // HAVE_NEON
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
+
+// One SATD test parameter: the coefficient count plus the reference and the
+// optimized function to compare.
+template <typename SatdFuncType>
+struct SatdTestParam {
+  SatdTestParam(int size, SatdFuncType ref_fn, SatdFuncType simd_fn)
+      : satd_size(size), func_ref(ref_fn), func_simd(simd_fn) {}
+
+  // Stream output so a parameter is printed by its size only.
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const SatdTestParam<SatdFuncType> &param) {
+    os << "satd_size: " << param.satd_size;
+    return os;
+  }
+
+  int satd_size;
+  SatdFuncType func_ref;
+  SatdFuncType func_simd;
+};
+
+// Shared fixture for the SATD tests. CoeffType is the coefficient element
+// type and SatdFuncType the matching function-pointer type. It owns one
+// 32-byte-aligned buffer of satd_size_ coefficients and compares a reference
+// function against an optimized one on it.
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+  explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+    satd_size_ = func_param.satd_size;
+    satd_func_ref_ = func_param.func_ref;
+    satd_func_simd_ = func_param.func_simd;
+  }
+  void SetUp() override {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, sizeof(*src_) * satd_size_));
+    ASSERT_NE(src_, nullptr);
+  }
+  void TearDown() override {
+    aom_free(src_);
+    // Clear the pointer after freeing, as the other fixtures in this file do.
+    src_ = nullptr;
+  }
+  // Sets every coefficient to `val`.
+  void FillConstant(const CoeffType val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+  // Fills the buffer with random values narrowed to the int16_t range.
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) {
+      src_[i] = static_cast<int16_t>(rnd_.Rand16());
+    }
+  }
+  // Expects both implementations to return exactly `expected`.
+  void Check(int expected) {
+    int total_ref;
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    EXPECT_EQ(expected, total_ref);
+
+    int total_simd;
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    EXPECT_EQ(expected, total_simd);
+  }
+  // Expects both implementations to agree with each other.
+  void RunComparison() {
+    int total_ref;
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+
+    int total_simd;
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+
+    EXPECT_EQ(total_ref, total_simd);
+  }
+  // Times numIter calls of each implementation, prints the timings and their
+  // ratio, then checks that the last results match.
+  void RunSpeedTest() {
+    const int numIter = 500000;
+    printf("size = %d number of iteration is %d \n", satd_size_, numIter);
+
+    // Initialised so the final comparison never reads an indeterminate value.
+    int total_ref = 0;
+    aom_usec_timer c_timer_;
+    aom_usec_timer_start(&c_timer_);
+    for (int i = 0; i < numIter; i++) {
+      total_ref = satd_func_ref_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&c_timer_);
+
+    int total_simd = 0;
+    aom_usec_timer simd_timer_;
+    aom_usec_timer_start(&simd_timer_);
+
+    for (int i = 0; i < numIter; i++) {
+      total_simd = satd_func_simd_(src_, satd_size_);
+    }
+    aom_usec_timer_mark(&simd_timer_);
+
+    const int c_sum_time = static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+    const int simd_sum_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+    EXPECT_EQ(total_ref, total_simd) << "Output mismatch \n";
+  }
+  int satd_size_;
+
+ private:
+  // Null until SetUp() allocates it, so the pointer is never indeterminate.
+  CoeffType *src_ = nullptr;
+  SatdFuncType satd_func_ref_;
+  SatdFuncType satd_func_simd_;
+  ACMRandom rnd_;
+};
+
+// SATD fixture for tran_low_t coefficients (SatdFunc signature); the test
+// parameter is forwarded unchanged to the shared base fixture.
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
+ public:
+ SatdTest() : SatdTestBase(GetParam()) {}
+};
+
+// Every coefficient at the most negative test value: the expected result is
+// that magnitude times the number of coefficients.
+TEST_P(SatdTest, MinValue) {
+  const int kMagnitude = 524287;
+  FillConstant(-kMagnitude);
+  Check(kMagnitude * satd_size_);
+}
+// Every coefficient at the most positive test value.
+TEST_P(SatdTest, MaxValue) {
+  const int kMagnitude = 524287;
+  FillConstant(kMagnitude);
+  Check(kMagnitude * satd_size_);
+}
+// Golden values for the deterministically seeded random fill, one per
+// supported coefficient count.
+TEST_P(SatdTest, Random) {
+  int expected = 0;
+  if (satd_size_ == 16) {
+    expected = 205298;
+  } else if (satd_size_ == 64) {
+    expected = 1113950;
+  } else if (satd_size_ == 256) {
+    expected = 4268415;
+  } else if (satd_size_ == 1024) {
+    expected = 16954082;
+  } else {
+    FAIL() << "Invalid satd size (" << satd_size_
+           << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+// Reference and optimized functions must agree on a random fill.
+TEST_P(SatdTest, Match) {
+ FillRandom();
+ RunComparison();
+}
+// Benchmark; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(SatdTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
+
+INSTANTIATE_TEST_SUITE_P(
+ C, SatdTest,
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, SatdTest,
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_neon),
+ make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
+#endif
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sve),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_sve),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_sve),
+ make_tuple(5, &aom_vector_var_c, &aom_vector_var_sve)));
+#endif // HAVE_SVE
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_sse4_1),
+ make_tuple(5, &aom_vector_var_c,
+ &aom_vector_var_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, SatdTest,
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_avx2)));
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VectorVarTest,
+ ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(3, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(4, &aom_vector_var_c, &aom_vector_var_avx2),
+ make_tuple(5, &aom_vector_var_c, &aom_vector_var_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, SatdTest,
+ ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+ SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+ &aom_satd_sse2)));
+#endif
+
+// SATD fixture for int16_t coefficients (SatdLpFunc signature); the test
+// parameter is forwarded unchanged to the shared base fixture.
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {
+ public:
+ SatdLpTest() : SatdTestBase(GetParam()) {}
+};
+
+// Every coefficient at the most negative test value: the expected result is
+// that magnitude times the number of coefficients.
+TEST_P(SatdLpTest, MinValue) {
+  const int kMagnitude = 32640;
+  FillConstant(-kMagnitude);
+  Check(kMagnitude * satd_size_);
+}
+// Every coefficient at the most positive test value.
+TEST_P(SatdLpTest, MaxValue) {
+  const int kMagnitude = 32640;
+  FillConstant(kMagnitude);
+  Check(kMagnitude * satd_size_);
+}
+// Golden values for the deterministically seeded random fill, one per
+// supported coefficient count.
+TEST_P(SatdLpTest, Random) {
+  int expected = 0;
+  if (satd_size_ == 16) {
+    expected = 205298;
+  } else if (satd_size_ == 64) {
+    expected = 1113950;
+  } else if (satd_size_ == 256) {
+    expected = 4268415;
+  } else if (satd_size_ == 1024) {
+    expected = 16954082;
+  } else {
+    FAIL() << "Invalid satd size (" << satd_size_
+           << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+// Reference and optimized functions must agree on a random fill.
+TEST_P(SatdLpTest, Match) {
+ FillRandom();
+ RunComparison();
+}
+// Benchmark; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(SatdLpTest, DISABLED_Speed) {
+ FillRandom();
+ RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following C instantiation to avoid the gtest uninstantiated-test warning.
+INSTANTIATE_TEST_SUITE_P(
+ C, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, SatdLpTest,
+ ::testing::Values(
+ SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+ SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/avif_progressive_test.cc b/third_party/aom/test/avif_progressive_test.cc
new file mode 100644
index 0000000000..2a28ca368b
--- /dev/null
+++ b/third_party/aom/test/avif_progressive_test.cc
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstddef>
+#include <vector>
+
+#include "aom/aomcx.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// This test emulates how libavif calls libaom functions to encode a
+// progressive AVIF image in libavif's ProgressiveTest.QualityChange test.
+TEST(AVIFProgressiveTest, QualityChange) {
+  constexpr int kWidth = 256;
+  constexpr int kHeight = 256;
+  // Dummy buffer of neutral gray samples.
+  constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+  std::vector<unsigned char> buffer(kBufferSize,
+                                    static_cast<unsigned char>(128));
+
+  // Image wrap, default config and encoder init use ASSERT_EQ: if any of them
+  // fails, `img`, `cfg` or `enc` is not in a usable state and none of the
+  // encoder calls below may run on it.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I444, kWidth, kHeight, 1,
+                               buffer.data()));
+  img.cp = AOM_CICP_CP_UNSPECIFIED;
+  img.tc = AOM_CICP_TC_UNSPECIFIED;
+  img.mc = AOM_CICP_MC_UNSPECIFIED;
+  img.range = AOM_CR_FULL_RANGE;
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+  cfg.g_profile = 1;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  cfg.g_bit_depth = AOM_BITS_8;
+  cfg.g_input_bit_depth = 8;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = AOM_Q;
+  cfg.rc_min_quantizer = 50;
+  cfg.rc_max_quantizer = 50;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 50));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+  // First frame (layer 0)
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  aom_codec_iter_t iter = nullptr;
+  const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x1f0011.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Second frame (layer 1): switch the encoder to lossless before encoding
+  // the same image again.
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 0;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_LOSSLESS, 1));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 1));
+  aom_enc_frame_flags_t encode_flags =
+      AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+      AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Flush encoder
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// This test emulates how libavif calls libaom functions to encode a
+// progressive AVIF image in libavif's ProgressiveTest.DimensionChange test.
+TEST(AVIFProgressiveTest, DimensionChange) {
+  constexpr int kWidth = 256;
+  constexpr int kHeight = 256;
+  // Dummy buffer of neutral gray samples.
+  constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+  std::vector<unsigned char> buffer(kBufferSize,
+                                    static_cast<unsigned char>(128));
+
+  // Image wrap, default config and encoder init use ASSERT_EQ: if any of them
+  // fails, `img`, `cfg` or `enc` is not in a usable state and none of the
+  // encoder calls below may run on it.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I444, kWidth, kHeight, 1,
+                               buffer.data()));
+  img.cp = AOM_CICP_CP_UNSPECIFIED;
+  img.tc = AOM_CICP_TC_UNSPECIFIED;
+  img.mc = AOM_CICP_MC_UNSPECIFIED;
+  img.range = AOM_CR_FULL_RANGE;
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+  cfg.g_profile = 1;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  cfg.g_bit_depth = AOM_BITS_8;
+  cfg.g_input_bit_depth = 8;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = AOM_Q;
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 0;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AV1E_SET_LOSSLESS, 1));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+  // First frame (layer 0): encoded with the AOME_ONETWO scaling mode set in
+  // both directions.
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0));
+  aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO };
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  aom_codec_iter_t iter = nullptr;
+  const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x1f0011.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Second frame (layer 1)
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 1));
+  aom_enc_frame_flags_t encode_flags =
+      AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+      AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Flush encoder
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// This test reproduces bug aomedia:3382. Certain parameters such as width,
+// height, g_threads, usage, etc. were carefully chosen based on the
+// complicated logic of av1_select_sb_size() to cause an inconsistent sb_size.
+TEST(AVIFProgressiveTest, DimensionChangeLargeImageMultiThread) {
+  constexpr int kWidth = 1920;
+  constexpr int kHeight = 1080;
+  // Dummy buffer of neutral gray samples.
+  constexpr size_t kBufferSize = 2 * kWidth * kHeight;
+  std::vector<unsigned char> buffer(kBufferSize,
+                                    static_cast<unsigned char>(128));
+
+  // Image wrap, default config and encoder init use ASSERT_EQ: if any of them
+  // fails, `img`, `cfg` or `enc` is not in a usable state. In particular
+  // cfg.g_w / cfg.g_h below are read back from `img`.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               buffer.data()));
+  img.cp = AOM_CICP_CP_UNSPECIFIED;
+  img.tc = AOM_CICP_TC_UNSPECIFIED;
+  img.mc = AOM_CICP_MC_UNSPECIFIED;
+  img.range = AOM_CR_FULL_RANGE;
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+  cfg.g_profile = 0;
+  cfg.g_w = img.w;
+  cfg.g_h = img.h;
+  cfg.g_bit_depth = AOM_BITS_8;
+  cfg.g_input_bit_depth = 8;
+  cfg.g_lag_in_frames = 0;
+  cfg.g_threads = 2;  // MultiThread
+  cfg.rc_end_usage = AOM_Q;
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 63;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 31));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_NUMBER_SPATIAL_LAYERS, 2));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_ROW_MT, 1));  // MultiThread
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+  // First frame (layer 0): encoded with the AOME_ONETWO scaling mode set in
+  // both directions.
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 0));
+  aom_scaling_mode_t scaling_mode = { AOME_ONETWO, AOME_ONETWO };
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SCALEMODE, &scaling_mode));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  aom_codec_iter_t iter = nullptr;
+  const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x1f0011.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Second frame (layer 1)
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AOME_SET_SPATIAL_LAYER_ID, 1));
+  aom_enc_frame_flags_t encode_flags =
+      AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+      AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, encode_flags));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Flush encoder
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+} // namespace
diff --git a/third_party/aom/test/best_encode.sh b/third_party/aom/test/best_encode.sh
new file mode 100755
index 0000000000..d29fdaed52
--- /dev/null
+++ b/third_party/aom/test/best_encode.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 2 ]]; then
+ echo "Encodes a file using best known settings (slow!)"
+ echo " Usage: be [FILE] [BITRATE]"
+ echo " Example: be akiyo_cif.y4m 200"
+ exit
+fi
+
+f=$1 # file is first parameter
+b=$2 # bitrate is second parameter
+
+if [[ -e $f.fpf ]]; then
+ # First-pass file found, do second pass only
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=2 \
+ --fpf=$f.fpf \
+ --good \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0 \
+ --bias-pct=50 \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --psnr \
+ --arnr-maxframes=7 \
+ --arnr-strength=3
+else
+ # No first-pass file found, do 2-pass encode
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=1 \
+ --fpf=$f.fpf \
+ --good \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0
+
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=2 \
+ --fpf=$f.fpf \
+ --good \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0 \
+ --bias-pct=50 \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --psnr \
+ --arnr-maxframes=7 \
+ --arnr-strength=3
+fi
diff --git a/third_party/aom/test/binary_codes_test.cc b/third_party/aom/test/binary_codes_test.cc
new file mode 100644
index 0000000000..2c2dfb45a8
--- /dev/null
+++ b/third_party/aom/test/binary_codes_test.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/binary_codes_writer.h"
+
+#define ACCT_STR __func__
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+// Round-trip test for the finite subexponential code with reference.
+// Writes a grid of (range, k, ref, value) tuples with
+// aom_write_primitive_refsubexpfin() and checks that
+// aom_read_primitive_refsubexpfin() returns the same values in the same order.
+TEST(AV1, TestPrimitiveRefsubexpfin) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int kBufferSize = 65536;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const uint16_t kRanges = 8;
+  const uint16_t kSubexpParams = 6;
+  const uint16_t kReferences = 8;
+  const uint16_t kValues = 16;
+  // Record of every written symbol, laid out as {range, k, ref, value}.
+  uint16_t enc_values[kRanges][kSubexpParams][kReferences][kValues][4];
+  const uint16_t range_vals[kRanges] = { 1, 13, 64, 120, 230, 420, 1100, 8000 };
+  aom_start_encode(&bw, bw_buffer);
+  for (int n = 0; n < kRanges; ++n) {
+    const uint16_t range = range_vals[n];
+    for (int k = 0; k < kSubexpParams; ++k) {
+      for (int r = 0; r < kReferences; ++r) {
+        const uint16_t ref = rnd(range);
+        for (int v = 0; v < kValues; ++v) {
+          const uint16_t value = rnd(range);
+          enc_values[n][k][r][v][0] = range;
+          enc_values[n][k][r][v][1] = k;
+          enc_values[n][k][r][v][2] = ref;
+          enc_values[n][k][r][v][3] = value;
+          aom_write_primitive_refsubexpfin(&bw, range, k, ref, value);
+        }
+      }
+    }
+  }
+  GTEST_ASSERT_GE(aom_stop_encode(&bw), 0);
+  aom_reader br;
+  aom_reader_init(&br, bw_buffer, bw.pos);
+  GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+  GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+  for (int n = 0; n < kRanges; ++n) {
+    for (int k = 0; k < kSubexpParams; ++k) {
+      for (int r = 0; r < kReferences; ++r) {
+        for (int v = 0; v < kValues; ++v) {
+          const uint16_t range = enc_values[n][k][r][v][0];
+          // Checked with a gtest assertion rather than assert(): assert() is
+          // compiled out under NDEBUG and needs <assert.h>, which this file
+          // does not include.
+          GTEST_ASSERT_EQ(k, enc_values[n][k][r][v][1]);
+          const uint16_t ref = enc_values[n][k][r][v][2];
+          const uint16_t value =
+              aom_read_primitive_refsubexpfin(&br, range, k, ref, ACCT_STR);
+          GTEST_ASSERT_EQ(value, enc_values[n][k][r][v][3]);
+        }
+      }
+    }
+  }
+}
+// TODO(debargha): Add tests for the other primitives.
+} // namespace
diff --git a/third_party/aom/test/blend_a64_mask_1d_test.cc b/third_party/aom/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 0000000000..f9549bccb2
--- /dev/null
+++ b/third_party/aom/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#include "aom_dsp/blend.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+// Shared fixture for the 1-D mask blend tests. F is the function-pointer type
+// being compared and T is the pixel type. Subclasses implement Execute();
+// Common() sets up one randomized case and compares dst_ref_ with dst_tst_.
+template <typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth;
+
+  ~BlendA64Mask1DTest() override = default;
+
+  // Invokes the functions under comparison. The pointers passed in are buffer
+  // bases; implementations apply src0_offset_ / src1_offset_ themselves.
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
+  // Runs one comparison for the given block size: draws random offsets and
+  // strides, optionally aliases one source with the tested destination, calls
+  // Execute() and asserts that both destinations match over the w_ x h_ block.
+  void Common(int block_size) {
+    w_ = block_size_wide[block_size];
+    h_ = block_size_high[block_size];
+
+    // Keep the order of these draws unchanged so the generated cases do not
+    // shift.
+    dst_offset_ = this->rng_(33);
+    dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = this->rng_(33);
+    src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = this->rng_(33);
+    src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    T *first_src;
+    T *second_src;
+
+    const auto aliasing = this->rng_(3);
+    if (aliasing == 0) {
+      // Separate sources
+      first_src = src0_;
+      second_src = src1_;
+    } else if (aliasing == 1) {
+      // src0 == dst
+      first_src = dst_tst_;
+      src0_stride_ = dst_stride_;
+      src0_offset_ = dst_offset_;
+      second_src = src1_;
+    } else if (aliasing == 2) {
+      // src1 == dst
+      first_src = src0_;
+      second_src = dst_tst_;
+      src1_stride_ = dst_stride_;
+      src1_offset_ = dst_offset_;
+    } else {
+      FAIL();
+    }
+
+    Execute(first_src, second_src);
+
+    for (int r = 0; r < h_; ++r) {
+      for (int c = 0; c < w_; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  // Destination written by the reference / tested function respectively.
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  uint32_t dst_stride_;
+  uint32_t dst_offset_;
+
+  T src0_[kBufSize];
+  uint32_t src0_stride_;
+  uint32_t src0_offset_;
+
+  T src1_[kBufSize];
+  uint32_t src1_stride_;
+  uint32_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+
+  // Dimensions of the block currently under test.
+  int w_;
+  int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the 8-bit 1-D blend functions that are compared.
+using F8B = void (*)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+                     uint32_t src0_stride, const uint8_t *src1,
+                     uint32_t src1_stride, const uint8_t *mask, int w, int h);
+using TestFuncs = libaom_test::FuncParam<F8B>;
+
+// 8-bit fixture: feeds identical inputs to the reference function (writing
+// dst_ref_) and to the tested function (writing dst_tst_).
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) override {
+    const uint8_t *const in0 = p_src0 + src0_offset_;
+    const uint8_t *const in1 = p_src1 + src1_offset_;
+
+    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, in0, src0_stride_,
+                     in1, src1_stride_, mask_, w_, h_);
+    API_REGISTER_STATE_CHECK(
+        params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, in0,
+                         src0_stride_, in1, src1_stride_, mask_, w_, h_));
+  }
+};
+
+// Compares the reference and tested functions on random pixels and random
+// mask values in [0, AOM_BLEND_A64_MAX_ALPHA] for every block size.
+// ASSERT_* inside Common() only returns from Common(), so the loop checks
+// HasFatalFailure() to stop at the first mismatch instead of continuing
+// through the remaining block sizes.
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+    Common(bsize);
+  }
+}
+
+// Compares the reference and tested functions with pixels drawn from
+// {254, 255} and mask values drawn from the top two alpha levels.
+// The block-size loop stops once a fatal failure has been recorded, because
+// ASSERT_* inside Common() does not abort the calling test body.
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+  for (int i = 0; i < kBufSize; ++i) {
+    dst_ref_[i] = rng_(2) + 254;
+    dst_tst_[i] = rng_(2) + 254;
+    src0_[i] = rng_(2) + 254;
+    src1_[i] = rng_(2) + 254;
+  }
+
+  for (int i = 0; i < kMaxMaskSize; ++i)
+    mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure(); ++bsize) {
+    Common(bsize);
+  }
+}
+
+// Reference for the horizontal 1-D blend: replicates the mask into every row
+// of a 2-D mask (entry [row][col] = mask[col]) and calls the generic C 2-D
+// blend with no subsampling.
+static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
+                                const uint8_t *src0, uint32_t src0_stride,
+                                const uint8_t *src1, uint32_t src1_stride,
+                                const uint8_t *mask, int w, int h) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const mask_row = mask2d[row];
+    for (int col = 0; col < w; ++col) {
+      mask_row[col] = mask[col];
+    }
+  }
+
+  aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
+                       0, 0);
+}
+
+// Reference for the vertical 1-D blend: fills each row of a 2-D mask with that
+// row's mask value (entry [row][col] = mask[row]) and calls the generic C 2-D
+// blend with no subsampling.
+static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
+                                const uint8_t *src0, uint32_t src0_stride,
+                                const uint8_t *src1, uint32_t src1_stride,
+                                const uint8_t *mask, int w, int h) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const mask_row = mask2d[row];
+    for (int col = 0; col < w; ++col) {
+      mask_row[col] = mask[row];
+    }
+  }
+
+  aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
+                       0, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ C, BlendA64Mask1DTest8B,  // C functions paired with the 2-D-mask *_ref wrappers
+ ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c),
+ TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1  // compiled only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, BlendA64Mask1DTest8B,  // same *_ref wrappers, *_sse4_1 functions
+ ::testing::Values(
+ TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1),
+ TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON  // compiled only when HAVE_NEON is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64Mask1DTest8B,  // same *_ref wrappers, *_neon functions
+ ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_neon),
+ TestFuncs(blend_a64_vmask_ref,
+ aom_blend_a64_vmask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+// Signature shared by the high bit-depth 1-D blend functions. Buffers travel
+// as uint8_t* (wrapped with CONVERT_TO_BYTEPTR by the caller); bd is the bit
+// depth.
+using FHBD = void (*)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+                      uint32_t src0_stride, const uint8_t *src1,
+                      uint32_t src1_stride, const uint8_t *mask, int w, int h,
+                      int bd);
+using TestFuncsHBD = libaom_test::FuncParam<FHBD>;
+
+// High bit-depth fixture: uint16_t pixel buffers, with the bit depth for the
+// current run held in bit_depth_ (set by the test bodies).
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) override {
+    const uint16_t *const in0 = p_src0 + src0_offset_;
+    const uint16_t *const in1 = p_src1 + src1_offset_;
+
+    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                     CONVERT_TO_BYTEPTR(in0), src0_stride_,
+                     CONVERT_TO_BYTEPTR(in1), src1_stride_, mask_, w_, h_,
+                     bit_depth_);
+    API_REGISTER_STATE_CHECK(params_.tst_func(
+        CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+        CONVERT_TO_BYTEPTR(in0), src0_stride_, CONVERT_TO_BYTEPTR(in1),
+        src1_stride_, mask_, w_, h_, bit_depth_));
+  }
+
+  int bit_depth_;
+};
+
+// For bit depths 8, 10 and 12, compares the reference and tested functions on
+// random pixels in [0, 2^bit_depth) and random mask values for every block
+// size. Both loops stop once a fatal failure has been recorded, since ASSERT_*
+// inside Common() only returns from Common().
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+         ++bsize) {
+      const int hi = 1 << bit_depth_;
+
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_(hi);
+        dst_tst_[i] = rng_(hi);
+        src0_[i] = rng_(hi);
+        src1_[i] = rng_(hi);
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      Common(bsize);
+    }
+  }
+}
+
+// For bit depths 8, 10 and 12, compares the reference and tested functions
+// with pixels drawn from the two largest values of the bit depth and mask
+// values drawn from the top two alpha levels. Both loops stop once a fatal
+// failure has been recorded, since ASSERT_* inside Common() only returns from
+// Common().
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
+
+    for (int i = 0; i < kBufSize; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+
+    for (int i = 0; i < kMaxMaskSize; ++i)
+      mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+         ++bsize) {
+      Common(bsize);
+    }
+  }
+}
+
+// High bit-depth reference for the horizontal 1-D blend: expands the mask to
+// 2-D (entry [row][col] = mask[col]) and calls the generic C 2-D blend with no
+// subsampling.
+static void highbd_blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int w, int h, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const mask_row = mask2d[row];
+    for (int col = 0; col < w; ++col) {
+      mask_row[col] = mask[col];
+    }
+  }
+
+  aom_highbd_blend_a64_mask_c(
+      dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
+      BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
+}
+
+// High bit-depth reference for the vertical 1-D blend: expands the mask to
+// 2-D (entry [row][col] = mask[row]) and calls the generic C 2-D blend with no
+// subsampling.
+static void highbd_blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int w, int h, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0; row < h; ++row) {
+    uint8_t *const mask_row = mask2d[row];
+    for (int col = 0; col < w; ++col) {
+      mask_row[col] = mask[row];
+    }
+  }
+
+  aom_highbd_blend_a64_mask_c(
+      dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
+      BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ C, BlendA64Mask1DTestHBD,  // C functions paired with the highbd *_ref wrappers
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_c),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1  // compiled only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, BlendA64Mask1DTestHBD,  // same *_ref wrappers, *_sse4_1 functions
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_sse4_1),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON  // compiled only when HAVE_NEON is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64Mask1DTestHBD,  // same *_ref wrappers, *_neon functions
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_neon),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_neon)));
+#endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/blend_a64_mask_test.cc b/third_party/aom/test/blend_a64_mask_test.cc
new file mode 100644
index 0000000000..fafc7f0329
--- /dev/null
+++ b/third_party/aom/test/blend_a64_mask_test.cc
@@ -0,0 +1,649 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#include "aom_dsp/blend.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+// Shared fixture for the 2-D mask blend tests. BlendA64Func is the
+// function-pointer type being compared, SrcPixel the element type of the two
+// sources and DstPixel the element type of the destinations. Subclasses
+// implement Execute(); RunOneTest() sets up one randomized case and compares
+// dst_ref_ with dst_tst_.
+template <typename BlendA64Func, typename SrcPixel, typename DstPixel>
+class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
+ protected:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
+
+  ~BlendA64MaskTest() override = default;
+
+  // Invokes the functions under comparison run_times times each. The pointers
+  // passed in are buffer bases; implementations apply the offsets themselves.
+  virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
+                       int run_times) = 0;
+
+  // Source selection used when all three pointers share one pixel type.
+  // Timing runs (run_times > 1) always use separate sources; otherwise one of
+  // three layouts is drawn at random, two of which alias a source with the
+  // tested destination (reusing its stride and offset).
+  template <typename Pixel>
+  void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) {
+    if (run_times > 1) {
+      *src0 = src0_;
+      *src1 = src1_;
+      return;
+    }
+
+    const auto aliasing = this->rng_(3);
+    if (aliasing == 0) {
+      // Separate sources
+      *src0 = src0_;
+      *src1 = src1_;
+    } else if (aliasing == 1) {
+      // src0 == dst
+      *src0 = dst_tst_;
+      src0_stride_ = dst_stride_;
+      src0_offset_ = dst_offset_;
+      *src1 = src1_;
+    } else if (aliasing == 2) {
+      // src1 == dst
+      *src0 = src0_;
+      *src1 = dst_tst_;
+      src1_stride_ = dst_stride_;
+      src1_offset_ = dst_offset_;
+    } else {
+      FAIL();
+    }
+  }
+
+  // Overload for 16-bit sources with an 8-bit destination: the buffers cannot
+  // alias, so the separate sources are always used.
+  void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/,
+                  int /*run_times*/) {
+    *src0 = src0_;
+    *src1 = src1_;
+  }
+
+  uint8_t Rand1() { return this->rng_.Rand8() & 1; }
+
+  // Runs one comparison for a block size and subsampling pair. For timing
+  // runs the requested iteration count is divided by the block width.
+  void RunOneTest(int block_size, int subx, int suby, int run_times) {
+    w_ = block_size_wide[block_size];
+    h_ = block_size_high[block_size];
+    run_times = run_times > 1 ? run_times / w_ : 1;
+    ASSERT_GT(run_times, 0);
+    subx_ = subx;
+    suby_ = suby;
+
+    // Keep the order of these draws unchanged so the generated cases do not
+    // shift.
+    dst_offset_ = this->rng_(33);
+    dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = this->rng_(33);
+    src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = this->rng_(33);
+    src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    // NOTE(review): mask_stride_ is drawn here, but the Execute()
+    // implementations visible in this file pass kMaxMaskWidth as the mask
+    // stride instead - confirm whether that is intended.
+    const int mask_span = w_ * (subx_ ? 2 : 1);
+    mask_stride_ = this->rng_(kMaxWidth + 1 - mask_span) + mask_span;
+
+    SrcPixel *p_src0 = src0_;
+    SrcPixel *p_src1 = src1_;
+
+    GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times);
+
+    Execute(p_src0, p_src1, run_times);
+
+    for (int r = 0; r < h_; ++r) {
+      for (int c = 0; c < w_; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c])
+            << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_
+            << " r: " << r << " c: " << c;
+      }
+    }
+  }
+
+  // Runs RunOneTest() for all four (subx, suby) combinations.
+  void RunTest(int block_size, int run_times) {
+    for (subx_ = 0; subx_ <= 1; subx_++) {
+      for (suby_ = 0; suby_ <= 1; suby_++) {
+        RunOneTest(block_size, subx_, suby_, run_times);
+      }
+    }
+  }
+
+  // Destination written by the reference / tested function respectively.
+  DstPixel dst_ref_[kBufSize];
+  DstPixel dst_tst_[kBufSize];
+  uint32_t dst_stride_;
+  uint32_t dst_offset_;
+
+  SrcPixel src0_[kBufSize];
+  uint32_t src0_stride_;
+  uint32_t src0_offset_;
+
+  SrcPixel src1_[kBufSize];
+  uint32_t src1_stride_;
+  uint32_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+  size_t mask_stride_;
+
+  // Dimensions of the block currently under test.
+  int w_;
+  int h_;
+
+  // Subsampling flags forwarded to the blend functions.
+  int suby_;
+  int subx_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the 8-bit 2-D mask blend functions that are compared.
+using F8B = void (*)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+                     uint32_t src0_stride, const uint8_t *src1,
+                     uint32_t src1_stride, const uint8_t *mask,
+                     uint32_t mask_stride, int w, int h, int subx, int suby);
+using TestFuncs = libaom_test::FuncParam<F8B>;
+
+// 8-bit fixture. Execute() runs the reference and then the tested function
+// run_times times each and, for timing runs, prints both elapsed times and
+// their ratio.
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1,
+               int run_times) override {
+    aom_usec_timer stopwatch;
+
+    // Reference function, writing dst_ref_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Tested function, writing dst_tst_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double tst_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Only timing runs report anything.
+    if (run_times <= 1) return;
+    // NOTE(review): the label says "ns" but the values come from
+    // aom_usec_timer_elapsed() - confirm the unit.
+    printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+           ref_elapsed, tst_elapsed);
+    printf("(%3.2f)\n", ref_elapsed / tst_elapsed);
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B);
+
+// Compares the reference and tested functions on fresh random pixels and
+// random mask values for every block size, stopping at the first fatal
+// failure.
+TEST_P(BlendA64MaskTest8B, RandomValues) {
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL && !HasFatalFailure()) {
+    // Per-pixel draw order: dst_ref_, dst_tst_, src0_, src1_.
+    for (int px = 0; px < kBufSize; ++px) {
+      dst_ref_[px] = rng_.Rand8();
+      dst_tst_[px] = rng_.Rand8();
+
+      src0_[px] = rng_.Rand8();
+      src1_[px] = rng_.Rand8();
+    }
+
+    for (int m = 0; m < kMaxMaskSize; ++m) {
+      mask_[m] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+    }
+
+    RunTest(block, 1);
+    ++block;
+  }
+}
+
+// Compares the reference and tested functions with pixels drawn from
+// {254, 255} and mask values drawn from the top two alpha levels, for every
+// block size, stopping at the first fatal failure.
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
+  for (int px = 0; px < kBufSize; ++px) {
+    dst_ref_[px] = rng_(2) + 254;
+    dst_tst_[px] = rng_(2) + 254;
+    src0_[px] = rng_(2) + 254;
+    src1_[px] = rng_(2) + 254;
+  }
+
+  for (int m = 0; m < kMaxMaskSize; ++m) {
+    mask_[m] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+  }
+
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL && !HasFatalFailure()) {
+    RunTest(block, 1);
+    ++block;
+  }
+}
+
+// Timing run (disabled by default via the DISABLED_ prefix): for every block
+// size, fills the buffers with random data and runs RunTest() with a large
+// iteration budget so that Execute() prints the elapsed times.
+TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL) {
+    for (int px = 0; px < kBufSize; ++px) {
+      dst_ref_[px] = rng_.Rand8();
+      dst_tst_[px] = rng_.Rand8();
+
+      src0_[px] = rng_.Rand8();
+      src1_[px] = rng_.Rand8();
+    }
+
+    for (int m = 0; m < kMaxMaskSize; ++m) {
+      mask_[m] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+    }
+
+    RunTest(block, kRunTimes);
+    ++block;
+  }
+}
+#if HAVE_SSE4_1  // compiled only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BlendA64MaskTest8B,  // C function vs. SSE4.1 function
+ ::testing::Values(TestFuncs(
+ aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2  // compiled only when HAVE_AVX2 is non-zero
+INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B,  // note: first function here is the SSE4.1 one, not the C one
+ ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
+ aom_blend_a64_mask_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON  // compiled only when HAVE_NEON is non-zero
+INSTANTIATE_TEST_SUITE_P(NEON, BlendA64MaskTest8B,  // C function vs. NEON function
+ ::testing::Values(TestFuncs(aom_blend_a64_mask_c,
+ aom_blend_a64_mask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the 8-bit-destination / 16-bit-source ("d16") blend
+// functions that are compared.
+using F8B_D16 = void (*)(uint8_t *dst, uint32_t dst_stride,
+                         const uint16_t *src0, uint32_t src0_stride,
+                         const uint16_t *src1, uint32_t src1_stride,
+                         const uint8_t *mask, uint32_t mask_stride, int w,
+                         int h, int subx, int suby,
+                         ConvolveParams *conv_params);
+using TestFuncs_d16 = libaom_test::FuncParam<F8B_D16>;
+
+// d16 fixture with uint16_t sources and uint8_t destinations. Execute() runs
+// the reference and then the tested function run_times times each and, for
+// timing runs, prints both elapsed times and their ratio.
+class BlendA64MaskTest8B_d16
+    : public BlendA64MaskTest<F8B_D16, uint16_t, uint8_t> {
+ protected:
+  // max number of bits used by the source
+  static const int kSrcMaxBitsMask = 0x3fff;
+
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
+    // Only the two rounding fields are set; the rest stays uninitialized.
+    ConvolveParams conv_params;
+    conv_params.round_0 = ROUND0_BITS;
+    conv_params.round_1 = COMPOUND_ROUND1_BITS;
+
+    aom_usec_timer stopwatch;
+
+    // Reference function, writing dst_ref_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Tested function, writing dst_tst_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double tst_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Only timing runs report anything.
+    if (run_times <= 1) return;
+    // NOTE(review): the label says "ns" but the values come from
+    // aom_usec_timer_elapsed() - confirm the unit.
+    printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+           ref_elapsed, tst_elapsed);
+    printf("(%3.2f)\n", ref_elapsed / tst_elapsed);
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTest8B_d16);
+
+// Compares the reference and tested d16 functions on random 8-bit
+// destinations, random sources masked with kSrcMaxBitsMask and random mask
+// values, for every block size, stopping at the first fatal failure.
+TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL && !HasFatalFailure()) {
+    // Per-pixel draw order: dst_ref_, dst_tst_, src0_, src1_.
+    for (int px = 0; px < kBufSize; ++px) {
+      dst_ref_[px] = rng_.Rand8();
+      dst_tst_[px] = rng_.Rand8();
+
+      src0_[px] = rng_.Rand16() & kSrcMaxBitsMask;
+      src1_[px] = rng_.Rand16() & kSrcMaxBitsMask;
+    }
+
+    for (int m = 0; m < kMaxMaskSize; ++m) {
+      mask_[m] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+    }
+
+    RunTest(block, 1);
+    ++block;
+  }
+}
+
+// Compares the reference and tested d16 functions with constant inputs:
+// destinations at 255, sources at kSrcMaxBitsMask and every mask entry at
+// AOM_BLEND_A64_MAX_ALPHA - 1. Stops at the first fatal failure.
+TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
+  for (int px = 0; px < kBufSize; ++px) {
+    dst_ref_[px] = 255;
+    dst_tst_[px] = 255;
+
+    src0_[px] = kSrcMaxBitsMask;
+    src1_[px] = kSrcMaxBitsMask;
+  }
+
+  for (int m = 0; m < kMaxMaskSize; ++m) {
+    mask_[m] = AOM_BLEND_A64_MAX_ALPHA - 1;
+  }
+
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL && !HasFatalFailure()) {
+    RunTest(block, 1);
+    ++block;
+  }
+}
+
+// Timing run (disabled by default via the DISABLED_ prefix): for every block
+// size, fills the buffers with random data and runs RunTest() with a large
+// iteration budget so that Execute() prints the elapsed times.
+TEST_P(BlendA64MaskTest8B_d16, DISABLED_Speed) {
+  const int kRunTimes = 10000000;
+  int block = 0;
+  while (block < BLOCK_SIZES_ALL) {
+    for (int px = 0; px < kBufSize; ++px) {
+      dst_ref_[px] = rng_.Rand8();
+      dst_tst_[px] = rng_.Rand8();
+
+      src0_[px] = rng_.Rand16() & kSrcMaxBitsMask;
+      src1_[px] = rng_.Rand16() & kSrcMaxBitsMask;
+    }
+
+    for (int m = 0; m < kMaxMaskSize; ++m) {
+      mask_[m] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+    }
+
+    RunTest(block, kRunTimes);
+    ++block;
+  }
+}
+
+#if HAVE_SSE4_1  // compiled only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, BlendA64MaskTest8B_d16,  // C function vs. SSE4.1 function
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2  // compiled only when HAVE_AVX2 is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, BlendA64MaskTest8B_d16,  // C function vs. AVX2 function
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON  // compiled only when HAVE_NEON is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64MaskTest8B_d16,  // C function vs. NEON function
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+// Signature shared by the high bit-depth 2-D mask blend functions. Buffers
+// travel as uint8_t* (wrapped with CONVERT_TO_BYTEPTR by the caller); bd is
+// the bit depth.
+using FHBD = void (*)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+                      uint32_t src0_stride, const uint8_t *src1,
+                      uint32_t src1_stride, const uint8_t *mask,
+                      uint32_t mask_stride, int w, int h, int subx, int suby,
+                      int bd);
+using TestFuncsHBD = libaom_test::FuncParam<FHBD>;
+
+// High bit-depth fixture with uint16_t sources and destinations. Execute()
+// runs the reference and then the tested function run_times times each and,
+// for timing runs, prints both elapsed times and their ratio.
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
+    aom_usec_timer stopwatch;
+
+    // Reference function, writing dst_ref_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                       CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                       CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                       mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Tested function, writing dst_tst_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                       CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                       CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                       mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double tst_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Only timing runs report anything.
+    if (run_times <= 1) return;
+    // NOTE(review): the label says "ns" but the values come from
+    // aom_usec_timer_elapsed() - confirm the unit.
+    printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+           ref_elapsed, tst_elapsed);
+    printf("(%3.2f)\n", ref_elapsed / tst_elapsed);
+  }
+
+  // Bit depth for the current run; set by the test bodies.
+  int bit_depth_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlendA64MaskTestHBD);
+
+// For bit depths 8, 10 and 12, compares the reference and tested functions on
+// random pixels in [0, 2^bit_depth) and random mask values for every block
+// size. Both loops check HasFatalFailure(), matching the ExtremeValues test
+// for this fixture: ASSERT_* inside RunOneTest() only returns from that
+// helper, so without the inner check the remaining block sizes would still
+// run after a mismatch.
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
+  for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+       bit_depth_ += 2) {
+    const int hi = 1 << bit_depth_;
+
+    for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+         ++bsize) {
+      for (int i = 0; i < kBufSize; ++i) {
+        dst_ref_[i] = rng_(hi);
+        dst_tst_[i] = rng_(hi);
+        src0_[i] = rng_(hi);
+        src1_[i] = rng_(hi);
+      }
+
+      for (int i = 0; i < kMaxMaskSize; ++i)
+        mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+      RunTest(bsize, 1);
+    }
+  }
+}
+
+// For bit depths 8, 10 and 12, compares the reference and tested functions
+// with pixels drawn from the two largest values of the bit depth and mask
+// values drawn from the top two alpha levels. Buffers are refilled for every
+// block size; both loops stop at the first fatal failure.
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
+  bit_depth_ = 8;
+  while (bit_depth_ <= 12 && !HasFatalFailure()) {
+    const int ceiling = 1 << bit_depth_;
+    const int floor_value = ceiling - 2;
+    const int span = ceiling - floor_value;
+
+    int block = 0;
+    while (block < BLOCK_SIZES_ALL && !HasFatalFailure()) {
+      // Per-pixel draw order: dst_ref_, dst_tst_, src0_, src1_.
+      for (int px = 0; px < kBufSize; ++px) {
+        dst_ref_[px] = rng_(span) + floor_value;
+        dst_tst_[px] = rng_(span) + floor_value;
+        src0_[px] = rng_(span) + floor_value;
+        src1_[px] = rng_(span) + floor_value;
+      }
+
+      for (int m = 0; m < kMaxMaskSize; ++m) {
+        mask_[m] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+      }
+
+      RunTest(block, 1);
+      ++block;
+    }
+
+    bit_depth_ += 2;
+  }
+}
+
+#if HAVE_SSE4_1  // compiled only when HAVE_SSE4_1 is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, BlendA64MaskTestHBD,  // C function vs. SSE4.1 function
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON  // compiled only when HAVE_NEON is non-zero
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64MaskTestHBD,  // C function vs. NEON function
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// HBD _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+// Signature shared by the high bit-depth "d16" blend functions: CONV_BUF_TYPE
+// sources, a destination passed as uint8_t* (wrapped with CONVERT_TO_BYTEPTR
+// by the caller), convolve rounding parameters and the bit depth.
+using FHBD_D16 = void (*)(uint8_t *dst, uint32_t dst_stride,
+                          const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                          const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                          const uint8_t *mask, uint32_t mask_stride, int w,
+                          int h, int subx, int suby,
+                          ConvolveParams *conv_params, const int bd);
+using TestFuncsHBD_d16 = libaom_test::FuncParam<FHBD_D16>;
+
+// High bit-depth d16 fixture. Execute() always runs the reference function;
+// the tested function and the timing report are skipped when tst_func is
+// null.
+class BlendA64MaskTestHBD_d16
+    : public BlendA64MaskTest<FHBD_D16, uint16_t, uint16_t> {
+ protected:
+  // max number of bits used by the source
+  static const int kSrcMaxBitsMask = (1 << 14) - 1;
+  static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
+
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1,
+               int run_times) override {
+    ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test.";
+
+    // Only the two rounding fields are set; round_0 gets two extra bits at
+    // bit depth 12.
+    ConvolveParams conv_params;
+    conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+    conv_params.round_1 = COMPOUND_ROUND1_BITS;
+
+    aom_usec_timer stopwatch;
+
+    // Reference function, writing dst_ref_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+                       bit_depth_);
+    }
+
+    // Nothing to compare or time against without a tested function.
+    if (!params_.tst_func) return;
+
+    aom_usec_timer_mark(&stopwatch);
+    const double ref_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Tested function, writing dst_tst_.
+    aom_usec_timer_start(&stopwatch);
+    for (int iter = 0; iter < run_times; ++iter) {
+      params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                       p_src0 + src0_offset_, src0_stride_,
+                       p_src1 + src1_offset_, src1_stride_, mask_,
+                       kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+                       bit_depth_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    const double tst_elapsed =
+        static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+
+    // Only timing runs report anything.
+    if (run_times <= 1) return;
+    // NOTE(review): the label says "ns" but the values come from
+    // aom_usec_timer_elapsed() - confirm the unit.
+    printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_,
+           ref_elapsed, tst_elapsed);
+    printf("(%3.2f)\n", ref_elapsed / tst_elapsed);
+  }
+
+  // Bit depth and source value mask for the current run; set by the test
+  // bodies.
+  int bit_depth_;
+  int src_max_bits_mask_;
+};
+
+TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
+ if (params_.tst_func == nullptr) return;
+ for (bit_depth_ = 8; bit_depth_ <= 12 && !HasFatalFailure();
+ bit_depth_ += 2) {
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL && !HasFatalFailure();
+ ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand16() & src_max_bits_mask_;
+ src1_[i] = rng_.Rand16() & src_max_bits_mask_;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, 1);
+ }
+ }
+}
+
+TEST_P(BlendA64MaskTestHBD_d16, ExtremeValues) {
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = (1 << bit_depth_) - 1;
+
+ src0_[i] = src_max_bits_mask_;
+ src1_[i] = src_max_bits_mask_;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ RunTest(bsize, 1);
+ }
+ }
+}
+
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) {
+ const int kRunTimes = 10000000;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_);
+ dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_);
+
+ src0_[i] = rng_.Rand16();
+ src1_[i] = rng_.Rand16();
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, kRunTimes);
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ C, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c,
+ aom_highbd_blend_a64_d16_mask_neon)));
+#endif // HAVE_NEON
+
+// TODO(slavarnway): Enable the following in the avx2 commit. (56501)
+#if 0
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, BlendA64MaskTestHBD,
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_avx2)));
+#endif // HAVE_AVX2
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/block_test.cc b/third_party/aom/test/block_test.cc
new file mode 100644
index 0000000000..686180cf87
--- /dev/null
+++ b/third_party/aom/test/block_test.cc
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "av1/common/blockd.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+// Verify the optimized implementation of get_partition_subsize() produces the
+// same results as the Partition_Subsize lookup table in the spec.
+TEST(BlockdTest, GetPartitionSubsize) {
+ // The Partition_Subsize table in the spec (Section 9.3. Conversion tables).
+ /* clang-format off */
+ static const BLOCK_SIZE kPartitionSubsize[10][BLOCK_SIZES_ALL] = {
+ {
+ BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }
+ };
+ /* clang-format on */
+
+ for (int partition = 0; partition < 10; partition++) {
+ for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ EXPECT_EQ(kPartitionSubsize[partition][bsize],
+ get_partition_subsize(static_cast<BLOCK_SIZE>(bsize),
+ static_cast<PARTITION_TYPE>(partition)));
+ }
+ }
+}
+
+#if CONFIG_AV1_DECODER && CONFIG_AV1_ENCODER
+namespace {
+// This class is used to validate if sb_size configured is respected
+// in the bitstream
+class SuperBlockSizeTestLarge
+ : public ::libaom_test::CodecTestWith3Params<
+ libaom_test::TestMode, aom_superblock_size_t, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ SuperBlockSizeTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ superblock_size_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+ sb_size_violated_ = false;
+ }
+ ~SuperBlockSizeTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = rc_end_usage_;
+ cfg_.g_threads = 1;
+ cfg_.g_lag_in_frames = 35;
+ cfg_.rc_target_bitrate = 1000;
+ }
+
+ bool DoDecode() const override { return true; }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, superblock_size_);
+ }
+ }
+
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec &&
+ superblock_size_ != AOM_SUPERBLOCK_SIZE_DYNAMIC) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ aom_superblock_size_t sb_size;
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SB_SIZE, &sb_size);
+ if (superblock_size_ != sb_size) {
+ sb_size_violated_ = true;
+ }
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ aom_superblock_size_t superblock_size_;
+ bool sb_size_violated_;
+ aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(SuperBlockSizeTestLarge, SuperBlockSizeTest) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(sb_size_violated_, false)
+ << "Failed for SB size " << superblock_size_;
+}
+
+const ::libaom_test::TestMode kTestModes[] = {
+#if CONFIG_REALTIME_ONLY
+ ::libaom_test::kRealTime
+#else
+ ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood
+#endif
+};
+
+AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge,
+ ::testing::ValuesIn(kTestModes),
+ ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_128X128),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+} // namespace
+#endif
diff --git a/third_party/aom/test/boolcoder_test.cc b/third_party/aom/test/boolcoder_test.cc
new file mode 100644
index 0000000000..52c58e0b2e
--- /dev/null
+++ b/third_party/aom/test/boolcoder_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int num_tests = 10;
+} // namespace
+
+TEST(AV1, TestBitIO) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int n = 0; n < num_tests; ++n) {
+ for (int method = 0; method <= 7; ++method) { // we generate various proba
+ const int kBitsToTest = 1000;
+ uint8_t probas[kBitsToTest];
+
+ for (int i = 0; i < kBitsToTest; ++i) {
+ const int parity = i & 1;
+ /* clang-format off */
+ probas[i] =
+ (method == 0) ? 0 : (method == 1) ? 255 :
+ (method == 2) ? 128 :
+ (method == 3) ? rnd.Rand8() :
+ (method == 4) ? (parity ? 0 : 255) :
+ // alternate between low and high proba:
+ (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+ (method == 6) ?
+ (parity ? rnd(64) : 255 - rnd(64)) :
+ (parity ? rnd(32) : 255 - rnd(32));
+ /* clang-format on */
+ }
+ for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+ const int random_seed = 6432;
+ const int kBufferSize = 10000;
+ ACMRandom bit_rnd(random_seed);
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ aom_start_encode(&bw, bw_buffer);
+
+ int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+ for (int i = 0; i < kBitsToTest; ++i) {
+ if (bit_method == 2) {
+ bit = (i & 1);
+ } else if (bit_method == 3) {
+ bit = bit_rnd(2);
+ }
+ aom_write(&bw, bit, static_cast<int>(probas[i]));
+ }
+
+ GTEST_ASSERT_GE(aom_stop_encode(&bw), 0);
+
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ bit_rnd.Reset(random_seed);
+ for (int i = 0; i < kBitsToTest; ++i) {
+ if (bit_method == 2) {
+ bit = (i & 1);
+ } else if (bit_method == 3) {
+ bit = bit_rnd(2);
+ }
+ GTEST_ASSERT_EQ(aom_read(&br, probas[i], nullptr), bit)
+ << "pos: " << i << " / " << kBitsToTest
+ << " bit_method: " << bit_method << " method: " << method;
+ }
+ }
+ }
+ }
+}
+
+#define FRAC_DIFF_TOTAL_ERROR 0.18
+
+TEST(AV1, TestTell) {
+ const int kBufferSize = 10000;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const int kSymbols = 1024;
+ // Coders are noisier at low probabilities, so we start at p = 4.
+ for (int p = 4; p < 256; p++) {
+ double probability = p / 256.;
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 0, p);
+ }
+ GTEST_ASSERT_GE(aom_stop_encode(&bw), 0);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ uint32_t last_tell = aom_reader_tell(&br);
+ uint32_t last_tell_frac = aom_reader_tell_frac(&br);
+ double frac_diff_total = 0;
+ GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+ GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, p, nullptr);
+ uint32_t tell = aom_reader_tell(&br);
+ uint32_t tell_frac = aom_reader_tell_frac(&br);
+ GTEST_ASSERT_GE(tell, last_tell)
+ << "tell: " << tell << ", last_tell: " << last_tell;
+ GTEST_ASSERT_GE(tell_frac, last_tell_frac)
+ << "tell_frac: " << tell_frac
+ << ", last_tell_frac: " << last_tell_frac;
+ // Frac tell should round up to tell.
+ GTEST_ASSERT_EQ(tell, (tell_frac + 7) >> 3);
+ last_tell = tell;
+ frac_diff_total +=
+ fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
+ last_tell_frac = tell_frac;
+ }
+ const uint32_t expected = (uint32_t)(-kSymbols * log2(probability));
+ // Last tell should be close to the expected value.
+ GTEST_ASSERT_LE(last_tell, expected + 20) << " last_tell: " << last_tell;
+ // The average frac_diff error should be pretty small.
+ GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
+ << " frac_diff_total: " << frac_diff_total;
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+}
+
+TEST(AV1, TestHasOverflowed) {
+ const int kBufferSize = 10000;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const int kSymbols = 1024;
+ // Coders are noisier at low probabilities, so we start at p = 4.
+ for (int p = 4; p < 256; p++) {
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 1, p);
+ }
+ GTEST_ASSERT_GE(aom_stop_encode(&bw), 0);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ for (int i = 0; i < kSymbols; i++) {
+ GTEST_ASSERT_EQ(aom_read(&br, p, nullptr), 1);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+ // In the worst case, the encoder uses just a tiny fraction of the last
+ // byte in the buffer. So to guarantee that aom_reader_has_overflowed()
+ // returns true, we have to consume very nearly 8 additional bits of data.
+ // In the worst case, one of the bits in that byte will be 1, and the rest
+ // will be zero. Once we are past that 1 bit, when the probability of
+ // reading zero symbol from aom_read() is high, each additional symbol read
+ // will consume very little additional data (in the case that p == 255,
+ // approximately -log_2(255/256) ~= 0.0056 bits). In that case it would
+ // take around 178 calls to consume more than 8 bits. That is only an upper
+ // bound. In practice we are not guaranteed to hit the worst case and can
+ // get away with 174 calls.
+ for (int i = 0; i < 174; i++) {
+ aom_read(&br, p, nullptr);
+ }
+ ASSERT_TRUE(aom_reader_has_overflowed(&br));
+ }
+}
diff --git a/third_party/aom/test/borders_test.cc b/third_party/aom/test/borders_test.cc
new file mode 100644
index 0000000000..594c3e8429
--- /dev/null
+++ b/third_party/aom/test/borders_test.cc
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTestLarge
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
+ ~BordersTestLarge() override = default;
+
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 1);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
+ }
+ }
+};
+
+TEST_P(BordersTestLarge, TestEncodeHighBitrate) {
+ // Validate that this clip, whose width is not a multiple of 64, encodes and
+ // decodes without a mismatch when passing in a very low max q. This pushes
+ // the encoder to produce lots of big partitions which will likely
+ // extend into the border and test the border condition.
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 10;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTestLarge, TestLowBitrate) {
+ // Validate that this clip encodes and decodes without a mismatch
+ // when passing in a very high min q. This pushes the encoder to produce
+ // lots of small partitions, which might test the other condition.
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.rc_min_quantizer = 40;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(BordersTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood));
+} // namespace
diff --git a/third_party/aom/test/cdef_test.cc b/third_party/aom/test/cdef_test.cc
new file mode 100644
index 0000000000..ad54407ca7
--- /dev/null
+++ b/third_party/aom/test/cdef_test.cc
@@ -0,0 +1,962 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <array>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/cdef_block.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+using CdefFilterBlockFunctions = std::array<cdef_filter_block_func, 4>;
+
+typedef std::tuple<CdefFilterBlockFunctions, CdefFilterBlockFunctions,
+ BLOCK_SIZE, int, int>
+ cdef_dir_param_t;
+
+class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
+ public:
+ ~CDEFBlockTest() override = default;
+ void SetUp() override {
+ cdef = GET_PARAM(0);
+ ref_cdef = GET_PARAM(1);
+ bsize = GET_PARAM(2);
+ boundary = GET_PARAM(3);
+ depth = GET_PARAM(4);
+ }
+
+ protected:
+ BLOCK_SIZE bsize;
+ int boundary;
+ int depth;
+ CdefFilterBlockFunctions cdef;
+ CdefFilterBlockFunctions ref_cdef;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFBlockTest);
+
+typedef CDEFBlockTest CDEFBlockHighbdTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFBlockHighbdTest);
+
+typedef CDEFBlockTest CDEFSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedTest);
+
+typedef CDEFBlockTest CDEFSpeedHighbdTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFSpeedHighbdTest);
+
+int64_t test_cdef(BLOCK_SIZE bsize, int iterations,
+ CdefFilterBlockFunctions cdef,
+ CdefFilterBlockFunctions ref_cdef, int boundary, int depth) {
+ aom_usec_timer ref_timer;
+ int64_t ref_elapsed_time = 0;
+ const int size = 8;
+ const int ysize = size + 2 * CDEF_VBORDER;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
+ DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
+ DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ int error = 0, pristrength = 0, secstrength, dir;
+ int pridamping, secdamping, bits, level, count,
+ errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
+ errpridamping = 0, errsecdamping = 0;
+ unsigned int pos = 0;
+
+ const int block_width =
+ ((bsize == BLOCK_8X8) || (bsize == BLOCK_8X4)) ? 8 : 4;
+ const int block_height =
+ ((bsize == BLOCK_8X8) || (bsize == BLOCK_4X8)) ? 8 : 4;
+ const unsigned int max_pos = size * size >> static_cast<int>(depth == 8);
+ for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8;
+ pridamping++) {
+ for (secdamping = 3 + depth - 8;
+ secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+ for (count = 0; count < iterations; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += (2 + 6 * !!boundary) << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error; bits += 1 + 3 * !!boundary) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ if (boundary) {
+ if (boundary & 1) { // Left
+ for (int i = 0; i < ysize; i++)
+ for (int j = 0; j < CDEF_HBORDER; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 2) { // Right
+ for (int i = 0; i < ysize; i++)
+ for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 4) { // Above
+ for (int i = 0; i < CDEF_VBORDER; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 8) { // Below
+ for (int i = CDEF_VBORDER + size; i < ysize; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ }
+ for (dir = 0; dir < 8; dir++) {
+ for (pristrength = 0; pristrength <= 19 << (depth - 8) && !error;
+ pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+ if (pristrength == 16) pristrength = 19;
+ for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error;
+ secstrength += 1 << (depth - 8)) {
+ if (secstrength == 3 << (depth - 8)) continue;
+
+ const int strength_index =
+ (secstrength == 0) | ((pristrength == 0) << 1);
+
+ aom_usec_timer_start(&ref_timer);
+ ref_cdef[strength_index](
+ ref_d, size,
+ s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping, secdamping,
+ depth - 8, block_width, block_height);
+ aom_usec_timer_mark(&ref_timer);
+ ref_elapsed_time += aom_usec_timer_elapsed(&ref_timer);
+ // If cdef and ref_cdef are the same, we're just testing
+ // speed
+ if (cdef[0] != ref_cdef[0])
+ API_REGISTER_STATE_CHECK(cdef[strength_index](
+ d, size, s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping, secdamping,
+ depth - 8, block_width, block_height));
+ if (ref_cdef[0] != cdef[0]) {
+ for (pos = 0; pos < max_pos && !error; pos++) {
+ error = ref_d[pos] != d[pos];
+ errdepth = depth;
+ errpristrength = pristrength;
+ errsecstrength = secstrength;
+ errboundary = boundary;
+ errpridamping = pridamping;
+ errsecdamping = secdamping;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ pos--;
+ EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
+ << std::endl
+ << "First error at " << pos % size << "," << pos / size
+ << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
+ << ") " << std::endl
+ << "pristrength: " << errpristrength << std::endl
+ << "pridamping: " << errpridamping << std::endl
+ << "secstrength: " << errsecstrength << std::endl
+ << "secdamping: " << errsecdamping << std::endl
+ << "depth: " << errdepth << std::endl
+ << "size: " << bsize << std::endl
+ << "boundary: " << errboundary << std::endl
+ << std::endl;
+
+ return ref_elapsed_time;
+}
+
+void test_cdef_speed(BLOCK_SIZE bsize, int iterations,
+ CdefFilterBlockFunctions cdef,
+ CdefFilterBlockFunctions ref_cdef, int boundary,
+ int depth) {
+ int64_t ref_elapsed_time =
+ test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth);
+
+ int64_t elapsed_time =
+ test_cdef(bsize, iterations, cdef, cdef, boundary, depth);
+
+ std::cout << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift);
+
+typedef std::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+
+class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
+ public:
+ ~CDEFFindDirTest() override = default;
+ void SetUp() override {
+ finddir = GET_PARAM(0);
+ ref_finddir = GET_PARAM(1);
+ }
+
+ protected:
+ find_dir_t finddir;
+ find_dir_t ref_finddir;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirTest);
+
+typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirSpeedTest);
+
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ const int size = 8;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[size * size]);
+
+ int error = 0;
+ int depth, bits, level, count, errdepth = 0;
+ int ref_res = 0, res = 0;
+ int32_t ref_var = 0, var = 0;
+
+ for (depth = 8; depth <= 12 && !error; depth += 2) {
+ for (count = 0; count < 512 && !error; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += 1 << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error; bits++) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+ ref_res = ref_finddir(s, size, &ref_var, depth - 8);
+ if (finddir != ref_finddir)
+ API_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+ if (ref_finddir != finddir) {
+ if (res != ref_res || var != ref_var) error = 1;
+ errdepth = depth;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+ << std::endl
+ << "return: " << res << " : " << ref_res << std::endl
+ << "var: " << var << " : " << ref_var << std::endl
+ << "depth: " << errdepth << std::endl
+ << std::endl;
+}
+
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&ref_timer);
+ test_finddir(ref_finddir, ref_finddir);
+ aom_usec_timer_mark(&ref_timer);
+ int64_t ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ test_finddir(finddir, finddir);
+ aom_usec_timer_mark(&timer);
+ int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef void (*find_dir_dual_t)(const uint16_t *img1, const uint16_t *img2,
+ int stride, int32_t *var1, int32_t *var2,
+ int coeff_shift, int *out1, int *out2);
+
+typedef std::tuple<find_dir_dual_t, find_dir_dual_t> find_dir_dual_param_t;
+
+class CDEFFindDirDualTest
+ : public ::testing::TestWithParam<find_dir_dual_param_t> {
+ public:
+ ~CDEFFindDirDualTest() override = default;
+ void SetUp() override {
+ finddir = GET_PARAM(0);
+ ref_finddir = GET_PARAM(1);
+ }
+
+ protected:
+ find_dir_dual_t finddir;
+ find_dir_dual_t ref_finddir;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirDualTest);
+
+typedef CDEFFindDirDualTest CDEFFindDirDualSpeedTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFFindDirDualSpeedTest);
+
+void test_finddir_dual(
+ void (*finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+ int32_t *var1, int32_t *var2, int coeff_shift, int *out1,
+ int *out2),
+ void (*ref_finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+ int32_t *var1, int32_t *var2, int coeff_shift,
+ int *out1, int *out2)) {
+ const int size_wd = 16;
+ const int size_ht = 8;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[size_ht * size_wd]);
+
+ int error = 0, errdepth = 0;
+ int32_t ref_var[2] = { 0 };
+ int ref_dir[2] = { 0 };
+ int32_t var[2] = { 0 };
+ int dir[2] = { 0 };
+
+ for (int depth = 8; depth <= 12 && !error; depth += 2) {
+ for (int count = 0; count < 512 && !error; count++) {
+ for (int level = 0; level < (1 << depth) && !error;
+ level += 1 << (depth - 8)) {
+ for (int bits = 1; bits <= depth && !error; bits++) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+ ref_finddir(s, s + 8, size_wd, &ref_var[0], &ref_var[1], depth - 8,
+ &ref_dir[0], &ref_dir[1]);
+ if (finddir != ref_finddir)
+ API_REGISTER_STATE_CHECK(finddir(s, s + 8, size_wd, &var[0],
+ &var[1], depth - 8, &dir[0],
+ &dir[1]));
+ if (ref_finddir != finddir) {
+ for (int j = 0; j < 2; j++) {
+ if (ref_dir[j] != dir[j] || ref_var[j] != var[j]) error = 1;
+ }
+ errdepth = depth;
+ }
+ }
+ }
+ }
+ }
+
+ for (int j = 0; j < 2; j++) {
+ EXPECT_EQ(0, error) << "Error: CDEFFindDirDualTest, SIMD and C mismatch."
+ << std::endl
+ << "direction: " << dir[j] << " : " << ref_dir[j]
+ << std::endl
+ << "variance: " << var[j] << " : " << ref_var[j]
+ << std::endl
+ << "depth: " << errdepth << std::endl
+ << std::endl;
+ }
+}
+
+// Times the dual direction search. test_finddir_dual() is run once with the
+// reference in both roles and once with the candidate in both roles, and the
+// two elapsed times plus their ratio are printed. Nothing is asserted.
+void test_finddir_dual_speed(
+    void (*finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                    int32_t *var1, int32_t *var2, int coeff_shift, int *out1,
+                    int *out2),
+    void (*ref_finddir)(const uint16_t *img1, const uint16_t *img2, int stride,
+                        int32_t *var1, int32_t *var2, int coeff_shift,
+                        int *out1, int *out2)) {
+  // Reference measurement.
+  aom_usec_timer c_timer;
+  aom_usec_timer_start(&c_timer);
+  test_finddir_dual(ref_finddir, ref_finddir);
+  aom_usec_timer_mark(&c_timer);
+  const double ref_elapsed_time =
+      static_cast<double>(aom_usec_timer_elapsed(&c_timer));
+
+  // Candidate measurement.
+  aom_usec_timer simd_timer;
+  aom_usec_timer_start(&simd_timer);
+  test_finddir_dual(finddir, finddir);
+  aom_usec_timer_mark(&simd_timer);
+  const double elapsed_time =
+      static_cast<double>(aom_usec_timer_elapsed(&simd_timer));
+
+  const double gain = ref_elapsed_time / elapsed_time;
+  printf(
+      "ref_time=%lf \t simd_time=%lf \t "
+      "gain=%lf \n",
+      ref_elapsed_time, elapsed_time, gain);
+}
+
+// Side length, in samples, of the square buffers allocated by the copy-rect
+// fixtures below; also used there as the row stride.
+#define MAX_CDEF_BLOCK 256
+
+// Number of randomized rectangles each copy-rect test tries.
+constexpr int kIterations = 100;
+
+// Signature of a function copying a width x height rectangle of 8-bit samples
+// into a 16-bit destination, each side with its own stride.
+using CDEFCopyRect8To16 = void (*)(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride, int width,
+ int height);
+
+// Test parameter for CDEFCopyRect8to16Test: the fixture reads element 0 as the
+// function under test and element 1 as the reference.
+using CDEFCopyRect8To16Param = std::tuple<CDEFCopyRect8To16, CDEFCopyRect8To16>;
+
+// Fixture comparing two 8-bit to 16-bit rectangle copy implementations. The
+// parameter tuple is (function under test, reference function).
+class CDEFCopyRect8to16Test
+    : public ::testing::TestWithParam<CDEFCopyRect8To16Param> {
+ public:
+  CDEFCopyRect8to16Test()
+      : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+        test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+  ~CDEFCopyRect8to16Test() override = default;
+
+  // Allocates the source buffer and both destination buffers, each holding
+  // MAX_CDEF_BLOCK * MAX_CDEF_BLOCK elements.
+  void SetUp() override {
+    src_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(8, sizeof(uint8_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(src_, nullptr);
+    ref_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(ref_dst_, nullptr);
+    test_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(test_dst_, nullptr);
+  }
+
+  // Releases the buffers. GoogleTest still calls TearDown() when SetUp()
+  // returned early from a failed ASSERT, so the pointers start out null and
+  // are never handed to aom_free() in an indeterminate state.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(ref_dst_);
+    aom_free(test_dst_);
+  }
+
+  // Runs kIterations randomized rectangles through both functions and stops
+  // at the first sample where the two destinations differ.
+  void test_copy_rect_8_to_16(CDEFCopyRect8To16 test_func,
+                              CDEFCopyRect8To16 ref_func) {
+    constexpr int stride = MAX_CDEF_BLOCK;
+    int error = 0;
+    for (int k = 0; k < kIterations && !error; k++) {
+      // This function operates on values of width that are either 4 or a
+      // multiple of 8. The first iteration uses a 4x4 rectangle; afterwards
+      // width is a random multiple of 8 in [8, 256] and height a random even
+      // value in [2, 256].
+      const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8;
+      const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2;
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j++) {
+          src_[i * stride + j] = rnd_.Rand8();
+        }
+      }
+
+      ref_func(ref_dst_, stride, src_, stride, width, height);
+      test_func(test_dst_, stride, src_, stride, width, height);
+
+      // i and j are declared outside the loops so that, after an early exit,
+      // they identify the first mismatching sample for the message below.
+      int i, j;
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+          if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+            error = 1;
+            break;
+          }
+        }
+        if (error) {
+          break;
+        }
+      }
+      EXPECT_EQ(0, error)
+          << "Error: CDEFCopyRect8to16Test, SIMD and C mismatch." << std::endl
+          << "First error at " << i << "," << j << " ("
+          << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+          << ") " << std::endl
+          << "width: " << width << std::endl
+          << "height: " << height << std::endl
+          << std::endl;
+    }
+  }
+
+ protected:
+  libaom_test::ACMRandom rnd_;
+  uint8_t *src_ = nullptr;        // random 8-bit input
+  uint16_t *ref_dst_ = nullptr;   // output of ref_func
+  uint16_t *test_dst_ = nullptr;  // output of test_func
+  CDEFCopyRect8To16 test_func_;
+  CDEFCopyRect8To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect8to16Test);
+
+// Signature of a function copying a width x height rectangle of 16-bit samples
+// into a 16-bit destination, each side with its own stride.
+using CDEFCopyRect16To16 = void (*)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride, int width,
+ int height);
+
+// Test parameter for CDEFCopyRect16to16Test: the fixture reads element 0 as
+// the function under test and element 1 as the reference.
+using CDEFCopyRect16To16Param =
+ std::tuple<CDEFCopyRect16To16, CDEFCopyRect16To16>;
+
+// Fixture comparing two 16-bit to 16-bit rectangle copy implementations. The
+// parameter tuple is (function under test, reference function).
+class CDEFCopyRect16to16Test
+    : public ::testing::TestWithParam<CDEFCopyRect16To16Param> {
+ public:
+  CDEFCopyRect16to16Test()
+      : rnd_(libaom_test::ACMRandom::DeterministicSeed()),
+        test_func_(GET_PARAM(0)), ref_func_(GET_PARAM(1)) {}
+  ~CDEFCopyRect16to16Test() override = default;
+
+  // Allocates the source buffer and both destination buffers, each holding
+  // MAX_CDEF_BLOCK * MAX_CDEF_BLOCK 16-bit elements.
+  void SetUp() override {
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(src_, nullptr);
+    ref_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(ref_dst_, nullptr);
+    test_dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, sizeof(uint16_t) * MAX_CDEF_BLOCK * MAX_CDEF_BLOCK));
+    ASSERT_NE(test_dst_, nullptr);
+  }
+
+  // Releases the buffers. GoogleTest still calls TearDown() when SetUp()
+  // returned early from a failed ASSERT, so the pointers start out null and
+  // are never handed to aom_free() in an indeterminate state.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(ref_dst_);
+    aom_free(test_dst_);
+  }
+
+  // Runs kIterations randomized rectangles through both functions and stops
+  // at the first sample where the two destinations differ.
+  void test_copy_rect_16_to_16(CDEFCopyRect16To16 test_func,
+                               CDEFCopyRect16To16 ref_func) {
+    constexpr int stride = MAX_CDEF_BLOCK;
+    int error = 0;
+    for (int k = 0; k < kIterations && !error; k++) {
+      // This function operates on values of width that are either 4 or a
+      // multiple of 8. The first iteration uses a 4x4 rectangle; afterwards
+      // width is a random multiple of 8 in [8, 256] and height a random even
+      // value in [2, 256].
+      const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8;
+      const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2;
+      for (int i = 0; i < height; i++) {
+        for (int j = 0; j < width; j++) {
+          src_[i * stride + j] = rnd_.Rand16();
+        }
+      }
+
+      ref_func(ref_dst_, stride, src_, stride, width, height);
+      test_func(test_dst_, stride, src_, stride, width, height);
+
+      // i and j are declared outside the loops so that, after an early exit,
+      // they identify the first mismatching sample for the message below.
+      int i, j;
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+          if (test_dst_[i * stride + j] != ref_dst_[i * stride + j]) {
+            error = 1;
+            break;
+          }
+        }
+        if (error) {
+          break;
+        }
+      }
+      EXPECT_EQ(0, error)
+          << "Error: CDEFCopyRect16to16Test, SIMD and C mismatch." << std::endl
+          << "First error at " << i << "," << j << " ("
+          << ref_dst_[i * stride + j] << " : " << test_dst_[i * stride + j]
+          << ") " << std::endl
+          << "width: " << width << std::endl
+          << "height: " << height << std::endl
+          << std::endl;
+    }
+  }
+
+ protected:
+  libaom_test::ACMRandom rnd_;
+  uint16_t *src_ = nullptr;       // random 16-bit input
+  uint16_t *ref_dst_ = nullptr;   // output of ref_func
+  uint16_t *test_dst_ = nullptr;  // output of test_func
+  CDEFCopyRect16To16 test_func_;
+  CDEFCopyRect16To16 ref_func_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CDEFCopyRect16to16Test);
+
+// Test bodies. Each one forwards the fixture's parameters to the matching
+// helper. Tests whose name starts with DISABLED_ are the speed tests;
+// GoogleTest skips them unless --gtest_also_run_disabled_tests is given.
+
+// 8-bit CDEF filter: candidate vs. reference.
+TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
+ test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
+}
+
+// High-bitdepth CDEF filter: candidate vs. reference.
+TEST_P(CDEFBlockHighbdTest, TestSIMDHighbdNoMismatch) {
+ test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
+}
+
+// 8-bit CDEF filter timing.
+TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
+ test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
+}
+
+// High-bitdepth CDEF filter timing.
+TEST_P(CDEFSpeedHighbdTest, DISABLED_TestSpeed) {
+ test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
+}
+
+// Single-block direction search: candidate vs. reference.
+TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
+ test_finddir(finddir, ref_finddir);
+}
+
+// Single-block direction search timing.
+TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
+ test_finddir_speed(finddir, ref_finddir);
+}
+
+// Dual-block direction search: candidate vs. reference.
+TEST_P(CDEFFindDirDualTest, TestSIMDNoMismatch) {
+ test_finddir_dual(finddir, ref_finddir);
+}
+
+// Dual-block direction search timing.
+TEST_P(CDEFFindDirDualSpeedTest, DISABLED_TestSpeed) {
+ test_finddir_dual_speed(finddir, ref_finddir);
+}
+
+// 8-bit to 16-bit rectangle copy: candidate vs. reference.
+TEST_P(CDEFCopyRect8to16Test, TestSIMDNoMismatch) {
+ test_copy_rect_8_to_16(test_func_, ref_func_);
+}
+
+// 16-bit to 16-bit rectangle copy: candidate vs. reference.
+TEST_P(CDEFCopyRect16to16Test, TestSIMDNoMismatch) {
+ test_copy_rect_16_to_16(test_func_, ref_func_);
+}
+
+using std::make_tuple;
+
+#if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON)
+// C reference tables. Each entry lists the four filter variants (suffix 0..3)
+// in the same order as the per-architecture tables they are compared against.
+static const CdefFilterBlockFunctions kCdefFilterFuncC[] = {
+  { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c,
+    &cdef_filter_8_3_c }
+};
+
+// The high-bitdepth table previously repeated cdef_filter_16_0_c in all four
+// slots, so the 1/2/3 SIMD variants were never checked against their own C
+// counterparts; it now mirrors the 8-bit table.
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncC[] = {
+  { &cdef_filter_16_0_c, &cdef_filter_16_1_c, &cdef_filter_16_2_c,
+    &cdef_filter_16_3_c }
+};
+#endif
+
+#if HAVE_SSE2
+// SSE2 filter variants, indexed like kCdefFilterFuncC.
+static const CdefFilterBlockFunctions kCdefFilterFuncSse2[] = {
+  { &cdef_filter_8_0_sse2, &cdef_filter_8_1_sse2, &cdef_filter_8_2_sse2,
+    &cdef_filter_8_3_sse2 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse2[] = {
+  { &cdef_filter_16_0_sse2, &cdef_filter_16_1_sse2, &cdef_filter_16_2_sse2,
+    &cdef_filter_16_3_sse2 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
+                                                      &cdef_find_dir_dual_c)));
+
+// The copy-rect fixtures read element 0 as the function under test and
+// element 1 as the reference, so the SIMD function goes first (the two were
+// previously swapped, which mislabelled the values in failure messages).
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_sse2,
+                                 &cdef_copy_rect8_8bit_to_16bit_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_sse2,
+                                 &cdef_copy_rect8_16bit_to_16bit_c)));
+#endif
+
+#if HAVE_SSSE3
+// SSSE3 filter variants, indexed like kCdefFilterFuncC.
+static const CdefFilterBlockFunctions kCdefFilterFuncSsse3[] = {
+  { &cdef_filter_8_0_ssse3, &cdef_filter_8_1_ssse3, &cdef_filter_8_2_ssse3,
+    &cdef_filter_8_3_ssse3 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSsse3[] = {
+  { &cdef_filter_16_0_ssse3, &cdef_filter_16_1_ssse3, &cdef_filter_16_2_ssse3,
+    &cdef_filter_16_3_ssse3 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+                                                      &cdef_find_dir_dual_c)));
+
+// The copy-rect fixtures read element 0 as the function under test and
+// element 1 as the reference, so the SIMD function goes first (the two were
+// previously swapped, which mislabelled the values in failure messages).
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_ssse3,
+                                 &cdef_copy_rect8_8bit_to_16bit_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_ssse3,
+                                 &cdef_copy_rect8_16bit_to_16bit_c)));
+#endif
+
+#if HAVE_SSE4_1
+// SSE4.1 filter variants, indexed like kCdefFilterFuncC.
+static const CdefFilterBlockFunctions kCdefFilterFuncSse4_1[] = {
+  { &cdef_filter_8_0_sse4_1, &cdef_filter_8_1_sse4_1, &cdef_filter_8_2_sse4_1,
+    &cdef_filter_8_3_sse4_1 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse4_1[] = {
+  { &cdef_filter_16_0_sse4_1, &cdef_filter_16_1_sse4_1,
+    &cdef_filter_16_2_sse4_1, &cdef_filter_16_3_sse4_1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse4_1),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFFindDirDualTest,
+    ::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
+                                 &cdef_find_dir_dual_c)));
+
+// The copy-rect fixtures read element 0 as the function under test and
+// element 1 as the reference, so the SIMD function goes first (the two were
+// previously swapped, which mislabelled the values in failure messages).
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_sse4_1,
+                                 &cdef_copy_rect8_8bit_to_16bit_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_sse4_1,
+                                 &cdef_copy_rect8_16bit_to_16bit_c)));
+#endif
+
+#if HAVE_AVX2
+// AVX2 filter variants, indexed like kCdefFilterFuncC.
+static const CdefFilterBlockFunctions kCdefFilterFuncAvx2[] = {
+  { &cdef_filter_8_0_avx2, &cdef_filter_8_1_avx2, &cdef_filter_8_2_avx2,
+    &cdef_filter_8_3_avx2 }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncAvx2[] = {
+  { &cdef_filter_16_0_avx2, &cdef_filter_16_1_avx2, &cdef_filter_16_2_avx2,
+    &cdef_filter_16_3_avx2 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncAvx2),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
+                                                      &cdef_find_dir_dual_c)));
+
+// The copy-rect fixtures read element 0 as the function under test and
+// element 1 as the reference, so the SIMD function goes first (the two were
+// previously swapped, which mislabelled the values in failure messages).
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_avx2,
+                                 &cdef_copy_rect8_8bit_to_16bit_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_avx2,
+                                 &cdef_copy_rect8_16bit_to_16bit_c)));
+#endif
+
+#if HAVE_NEON
+// Neon filter variants, indexed like kCdefFilterFuncC.
+static const CdefFilterBlockFunctions kCdefFilterFuncNeon[] = {
+  { &cdef_filter_8_0_neon, &cdef_filter_8_1_neon, &cdef_filter_8_2_neon,
+    &cdef_filter_8_3_neon }
+};
+
+static const CdefFilterBlockFunctions kCdefFilterHighbdFuncNeon[] = {
+  { &cdef_filter_16_0_neon, &cdef_filter_16_1_neon, &cdef_filter_16_2_neon,
+    &cdef_filter_16_3_neon }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFBlockHighbdTest,
+    ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncNeon),
+                       ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+                       ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+                                         BLOCK_8X8),
+                       ::testing::Range(0, 16), ::testing::Range(10, 13, 2)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                      &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualTest,
+                         ::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
+                                                      &cdef_find_dir_dual_c)));
+
+// The copy-rect fixtures read element 0 as the function under test and
+// element 1 as the reference, so the SIMD function goes first (the two were
+// previously swapped, which mislabelled the values in failure messages).
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFCopyRect8to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_neon,
+                                 &cdef_copy_rect8_8bit_to_16bit_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, CDEFCopyRect16to16Test,
+    ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_neon,
+                                 &cdef_copy_rect8_16bit_to_16bit_c)));
+#endif
+
+// Test speed for all supported architectures
+//
+// SSE2 speed-test instantiations. Each suite pairs the SSE2 function(s) with
+// the C reference. The Combine() arguments after the two function tables are
+// presumably block size, boundary and bit depth - TODO confirm against the
+// fixture's SetUp(), which is defined earlier in this file.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, CDEFSpeedTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2),
+ ::testing::ValuesIn(kCdefFilterFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, CDEFSpeedHighbdTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2),
+ ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+ &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2,
+ &cdef_find_dir_dual_c)));
+#endif
+
+// SSSE3 speed-test instantiations. Each suite pairs the SSSE3 function(s) with
+// the C reference. The Combine() arguments after the two function tables are
+// presumably block size, boundary and bit depth - TODO confirm against the
+// fixture's SetUp(), which is defined earlier in this file.
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, CDEFSpeedTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3),
+ ::testing::ValuesIn(kCdefFilterFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, CDEFSpeedHighbdTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSsse3),
+ ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+ &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirDualSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_dual_ssse3,
+ &cdef_find_dir_dual_c)));
+#endif
+
+// SSE4.1 speed-test instantiations. Each suite pairs the SSE4.1 function(s)
+// with the C reference. The Combine() arguments after the two function tables
+// are presumably block size, boundary and bit depth - TODO confirm against the
+// fixture's SetUp(), which is defined earlier in this file.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, CDEFSpeedTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse4_1),
+ ::testing::ValuesIn(kCdefFilterFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, CDEFSpeedHighbdTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse4_1),
+ ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+ &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, CDEFFindDirDualSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_dual_sse4_1,
+ &cdef_find_dir_dual_c)));
+#endif
+
+// AVX2 speed-test instantiations. Each suite pairs the AVX2 function(s) with
+// the C reference. The Combine() arguments after the two function tables are
+// presumably block size, boundary and bit depth - TODO confirm against the
+// fixture's SetUp(), which is defined earlier in this file.
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, CDEFSpeedTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncAvx2),
+ ::testing::ValuesIn(kCdefFilterFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, CDEFSpeedHighbdTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncAvx2),
+ ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+ &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirDualSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_dual_avx2,
+ &cdef_find_dir_dual_c)));
+#endif
+
+// Neon speed-test instantiations. Each suite pairs the Neon function(s) with
+// the C reference. The Combine() arguments after the two function tables are
+// presumably block size, boundary and bit depth - TODO confirm against the
+// fixture's SetUp(), which is defined earlier in this file.
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, CDEFSpeedTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncNeon),
+ ::testing::ValuesIn(kCdefFilterFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(8)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, CDEFSpeedHighbdTest,
+ ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncNeon),
+ ::testing::ValuesIn(kCdefFilterHighbdFuncC),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Values(10)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_neon,
+ &cdef_find_dir_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirDualSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_dual_neon,
+ &cdef_find_dir_dual_c)));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/cfl_test.cc b/third_party/aom/test/cfl_test.cc
new file mode 100644
index 0000000000..7fdea04c36
--- /dev/null
+++ b/third_party/aom/test/cfl_test.cc
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "test/util.h"
+#include "test/acm_random.h"
+
+using std::make_tuple;
+
+using libaom_test::ACMRandom;
+
+// Number of randomized rounds run by each correctness test.
+#define NUM_ITERATIONS (100)
+// Number of back-to-back calls timed by each speed test.
+#define NUM_ITERATIONS_SPEED (INT16_MAX)
+
+// Expands to a comma-separated list of (TX_SIZE, &function) tuples, one for
+// each of the 14 transform sizes listed, for use as test parameter values.
+#define ALL_CFL_TX_SIZES(function) \
+ make_tuple(static_cast<TX_SIZE>(TX_4X4), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_4X8), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_4X16), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X4), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X8), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X16), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X32), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X4), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X8), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X16), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X32), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X8), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X16), &function), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X32), &function)
+
+// Same list of 14 transform sizes as ALL_CFL_TX_SIZES, but each tuple carries
+// three function getters (4:2:0, 4:2:2 and 4:4:4) after the TX_SIZE.
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \
+ make_tuple(static_cast<TX_SIZE>(TX_4X4), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_4X8), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_4X16), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X4), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X8), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X16), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_8X32), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X4), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X8), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X16), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_16X32), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X8), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X16), &fun420, &fun422, &fun444), \
+ make_tuple(static_cast<TX_SIZE>(TX_32X32), &fun420, &fun422, &fun444)
+
+namespace {
+
+// Asserts that the top-left width x height region of two buffers laid out
+// with a row pitch of CFL_BUF_LINE is element-wise equal.
+// Note: ASSERT_EQ only returns from this helper on the first mismatch; the
+// calling test body keeps running.
+template <typename A>
+static void assert_eq(const A *a, const A *b, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ ASSERT_EQ(a[j * CFL_BUF_LINE + i], b[j * CFL_BUF_LINE + i]);
+ }
+ }
+}
+
+// Records a non-fatal failure unless the reference time is strictly greater
+// than the candidate time. Both times are printed in the failure message.
+// NOTE(review): the message always names CFLSubtractSpeedTest even though the
+// subsample speed tests in this file call this helper too.
+static void assertFaster(int ref_elapsed_time, int elapsed_time) {
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+// Prints one timing line for a width x height block: the reference time, the
+// candidate time and their ratio, using two significant digits for the ratio.
+static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
+                       int height) {
+  // precision() returns the previous setting; it is restored at the end so
+  // the two-digit format does not leak into later std::cout output.
+  const std::streamsize saved_precision = std::cout.precision(2);
+  std::cout << "[ ] " << width << "x" << height
+            << ": C time = " << ref_elapsed_time
+            << " us, SIMD time = " << elapsed_time << " us"
+            << " (~" << ref_elapsed_time / (double)elapsed_time << "x) "
+            << std::endl;
+  std::cout.precision(saved_precision);
+}
+
+// State shared by every CfL fixture: the transform size under test, its
+// dimensions, and a deterministically seeded random source.
+class CFLTest {
+ public:
+  virtual ~CFLTest() = default;
+
+  // Records `tx`, looks up its width and height, and reseeds the generator.
+  void init(TX_SIZE tx) {
+    rnd.Reset(ACMRandom::DeterministicSeed());
+    tx_size = tx;
+    height = tx_size_high[tx];
+    width = tx_size_wide[tx];
+  }
+
+ protected:
+  TX_SIZE tx_size;
+  int width;
+  int height;
+  ACMRandom rnd;
+};
+
+// CfL fixture base holding two identically filled CFL_BUF_SQUARE buffers, one
+// for the function under test and one for the reference.
+template <typename I>
+class CFLTestWithData : public CFLTest {
+ public:
+  ~CFLTestWithData() override = default;
+
+ protected:
+  I data[CFL_BUF_SQUARE];
+  I data_ref[CFL_BUF_SQUARE];
+
+  // Fills the width x height region of both buffers (row pitch CFL_BUF_LINE)
+  // with the same values, drawn by calling the given ACMRandom member on rnd.
+  void randData(I (ACMRandom::*random)()) {
+    for (int row = 0; row < this->height; ++row) {
+      const int row_start = row * CFL_BUF_LINE;
+      for (int col = 0; col < this->width; ++col) {
+        const I sample = (this->rnd.*random)();
+        data[row_start + col] = sample;
+        data_ref[row_start + col] = sample;
+      }
+    }
+  }
+};
+
+// CfL fixture base owning four 32-byte-aligned CFL_BUF_SQUARE buffers: chroma
+// and subsampled-luma samples, each in a test copy and a reference copy.
+template <typename I>
+class CFLTestWithAlignedData : public CFLTest {
+ public:
+  // The pointers are null-initialized below, so this is well defined even if
+  // init() was never called or stopped at a failed ASSERT.
+  ~CFLTestWithAlignedData() override {
+    aom_free(chroma_pels_ref);
+    aom_free(sub_luma_pels_ref);
+    aom_free(chroma_pels);
+    aom_free(sub_luma_pels);
+  }
+
+ protected:
+  // Allocates and zeroes the four buffers. Returns early, with a fatal test
+  // failure recorded, as soon as one allocation fails.
+  void init() {
+    chroma_pels_ref =
+        reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    ASSERT_NE(chroma_pels_ref, nullptr);
+    chroma_pels =
+        reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+    ASSERT_NE(chroma_pels, nullptr);
+    sub_luma_pels_ref = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    ASSERT_NE(sub_luma_pels_ref, nullptr);
+    sub_luma_pels = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+    ASSERT_NE(sub_luma_pels, nullptr);
+    memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE);
+    memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE);
+    memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+    memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+  }
+
+  I *chroma_pels_ref = nullptr;
+  I *chroma_pels = nullptr;
+  int16_t *sub_luma_pels_ref = nullptr;
+  int16_t *sub_luma_pels = nullptr;
+  int alpha_q3;
+  I dc;
+
+  // Draws alpha_q3 as rnd(33) - 16 and dc as rnd(1 << bd), then fills the
+  // width x height region: both chroma buffers get dc, and both luma buffers
+  // get the same rnd(1 << (bd + 3)) value per position.
+  void randData(int bd) {
+    alpha_q3 = this->rnd(33) - 16;
+    dc = this->rnd(1 << bd);
+    for (int j = 0; j < this->height; j++) {
+      for (int i = 0; i < this->width; i++) {
+        chroma_pels[j * CFL_BUF_LINE + i] = dc;
+        chroma_pels_ref[j * CFL_BUF_LINE + i] = dc;
+        sub_luma_pels_ref[j * CFL_BUF_LINE + i] =
+            sub_luma_pels[j * CFL_BUF_LINE + i] = this->rnd(1 << (bd + 3));
+      }
+    }
+  }
+};
+
+// Getter returning the subtract-average function for a transform size.
+using sub_avg_fn = cfl_subtract_average_fn (*)(TX_SIZE tx_size);
+// Test parameter: (transform size, getter for the implementation under test).
+using sub_avg_param = std::tuple<TX_SIZE, sub_avg_fn>;
+
+// Fixture for the subtract-average tests. The function under test comes from
+// the parameter's getter; the reference always comes from the C getter.
+class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
+                      public CFLTestWithData<int16_t> {
+ public:
+  void SetUp() override {
+    const sub_avg_param &param = this->GetParam();
+    CFLTest::init(std::get<0>(param));
+    const sub_avg_fn get_sub_avg = std::get<1>(param);
+    sub_avg = get_sub_avg(tx_size);
+    sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size);
+  }
+  ~CFLSubAvgTest() override = default;
+
+ protected:
+  cfl_subtract_average_fn sub_avg;
+  cfl_subtract_average_fn sub_avg_ref;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubAvgTest);
+
+// Runs both implementations in place on identical random data and compares
+// the results, NUM_ITERATIONS times.
+TEST_P(CFLSubAvgTest, SubAvgTest) {
+  int remaining = NUM_ITERATIONS;
+  while (remaining-- > 0) {
+    randData(&ACMRandom::Rand15);
+    // Source and destination are the same buffer, viewed as unsigned input.
+    sub_avg((uint16_t *)data, data);
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
+    assert_eq<int16_t>(data, data_ref, width, height);
+  }
+}
+
+// Times NUM_ITERATIONS_SPEED in-place calls of the reference and then of the
+// candidate, prints both times, and expects the candidate to be faster.
+TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) {
+  randData(&ACMRandom::Rand15);
+
+  // Reference timing.
+  aom_usec_timer c_timer;
+  aom_usec_timer_start(&c_timer);
+  for (int call = 0; call < NUM_ITERATIONS_SPEED; ++call) {
+    sub_avg_ref((uint16_t *)data_ref, data_ref);
+  }
+  aom_usec_timer_mark(&c_timer);
+  const int ref_elapsed_time = (int)aom_usec_timer_elapsed(&c_timer);
+
+  // Candidate timing.
+  aom_usec_timer simd_timer;
+  aom_usec_timer_start(&simd_timer);
+  for (int call = 0; call < NUM_ITERATIONS_SPEED; ++call) {
+    sub_avg((uint16_t *)data, data);
+  }
+  aom_usec_timer_mark(&simd_timer);
+  const int elapsed_time = (int)aom_usec_timer_elapsed(&simd_timer);
+
+  printSpeed(ref_elapsed_time, elapsed_time, width, height);
+  assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+// Common fixture for the luma subsampling tests.
+//   S: parameter tuple type (TX_SIZE followed by three function getters)
+//   T: subsampling function pointer type
+//   I: input sample type
+// SetUp() resolves the three functions under test; derived fixtures fill in
+// the matching *_ref members.
+template <typename S, typename T, typename I>
+class CFLSubsampleTest : public ::testing::TestWithParam<S>,
+                         public CFLTestWithData<I> {
+ public:
+  void SetUp() override {
+    CFLTest::init(std::get<0>(this->GetParam()));
+    fun_420 = std::get<1>(this->GetParam())(this->tx_size);
+    fun_422 = std::get<2>(this->GetParam())(this->tx_size);
+    fun_444 = std::get<3>(this->GetParam())(this->tx_size);
+  }
+
+ protected:
+  T fun_420;
+  T fun_422;
+  T fun_444;
+  T fun_420_ref;
+  T fun_422_ref;
+  T fun_444_ref;
+
+  // Runs `fun` and `fun_ref` on identical random input NUM_ITERATIONS times
+  // and compares the sub_width x sub_height region of their outputs.
+  void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height,
+                     I (ACMRandom::*random)()) {
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+
+    for (int it = 0; it < NUM_ITERATIONS; it++) {
+      CFLTestWithData<I>::randData(random);
+      fun(this->data, CFL_BUF_LINE, sub_luma_pels);
+      fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref);
+      assert_eq<uint16_t>(sub_luma_pels, sub_luma_pels_ref, sub_width,
+                          sub_height);
+    }
+  }
+
+  // Times NUM_ITERATIONS_SPEED calls of `fun_ref` and then of `fun`, prints
+  // both times, and expects `fun` to be faster. Outputs are not compared.
+  void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) {
+    uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+    uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+    aom_usec_timer ref_timer;
+    aom_usec_timer timer;
+
+    CFLTestWithData<I>::randData(random);
+    aom_usec_timer_start(&ref_timer);
+    for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+      // The reference writes to the *_ref buffer, as in subsampleTest() (the
+      // two output buffers used to be swapped here).
+      fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+    aom_usec_timer_start(&timer);
+    for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+      fun(this->data, CFL_BUF_LINE, sub_luma_pels);
+    }
+    aom_usec_timer_mark(&timer);
+    int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+    printSpeed(ref_elapsed_time, elapsed_time, this->width, this->height);
+    assertFaster(ref_elapsed_time, elapsed_time);
+  }
+};
+
+// Getter returning the 8-bit subsampling function for a transform size.
+using get_subsample_lbd_fn = cfl_subsample_lbd_fn (*)(TX_SIZE tx_size);
+// Test parameter: (transform size, 4:2:0 getter, 4:2:2 getter, 4:4:4 getter).
+using subsample_lbd_param =
+    std::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
+               get_subsample_lbd_fn>;
+
+// 8-bit subsampling fixture: the base SetUp() resolves the functions under
+// test, then the C getters supply the three reference functions.
+class CFLSubsampleLBDTest
+    : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
+                              uint8_t> {
+ public:
+  ~CFLSubsampleLBDTest() override = default;
+  void SetUp() override {
+    CFLSubsampleTest::SetUp();
+    fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
+    fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
+    fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleLBDTest);
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
+ subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
+ &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) {
+ subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) {
+ subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) {  // DISABLED_ prefix: skipped by default; times fun_422 vs. fun_422_ref.
+ subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) {  // 4:4:4 LBD: fun_444 against its C reference fun_444_ref.
+ subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8);  // width and height passed unchanged
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) {  // DISABLED_ prefix: skipped by default; times fun_444 vs. fun_444_ref.
+ subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Getter signature: maps a transform size to a high-bitdepth subsampling
+// kernel.
+using get_subsample_hbd_fn = cfl_subsample_hbd_fn (*)(TX_SIZE tx_size);
+// Test parameter: a transform size plus three kernel getters.
+using subsample_hbd_param =
+ std::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
+ get_subsample_hbd_fn>;
+
+// High-bitdepth subsampling fixture: CFLSubsampleTest specialised for
+// cfl_subsample_hbd_fn kernels and uint16_t data, with the plain C kernels as
+// the 4:2:0 / 4:2:2 / 4:4:4 references.
+class CFLSubsampleHBDTest
+ : public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
+ uint16_t> {
+ public:
+ void SetUp() override {
+ // Run the base set-up first; the reference getters below read tx_size.
+ CFLSubsampleTest::SetUp();
+ this->fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(this->tx_size);
+ this->fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(this->tx_size);
+ this->fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(this->tx_size);
+ }
+ ~CFLSubsampleHBDTest() override = default;
+};
+// The suite may legitimately have no instantiation in a given build.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLSubsampleHBDTest);
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) {  // 4:2:0 HBD: fun_420 against its C reference fun_420_ref.
+ subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,  // width and height are both halved
+ &ACMRandom::Rand12);  // input generator: ACMRandom::Rand12
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD420SpeedTest) {  // DISABLED_ prefix: skipped by default; times fun_420 vs. fun_420_ref.
+ subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD422Test) {  // 4:2:2 HBD: fun_422 against its C reference fun_422_ref.
+ subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand12);  // only the width is halved
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD422SpeedTest) {  // DISABLED_ prefix: skipped by default; times fun_422 vs. fun_422_ref.
+ subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) {  // 4:4:4 HBD: fun_444 against its C reference fun_444_ref.
+ subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12);  // width and height passed unchanged
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) {  // DISABLED_ prefix: skipped by default; times fun_444 vs. fun_444_ref.
+ subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Getter signature: maps a transform size to a low-bitdepth CfL predictor.
+using get_predict_fn = cfl_predict_lbd_fn (*)(TX_SIZE tx_size);
+// Test parameter: a transform size and the getter of the predictor under test.
+using predict_param = std::tuple<TX_SIZE, get_predict_fn>;
+
+// Low-bitdepth prediction fixture. SetUp() resolves the predictor under test
+// from the test parameter and the C predictor as the reference.
+class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
+ public CFLTestWithAlignedData<uint8_t> {
+ public:
+ ~CFLPredictTest() override = default;
+
+ void SetUp() override {
+ const predict_param &param = this->GetParam();
+ CFLTest::init(std::get<0>(param));
+ CFLTestWithAlignedData::init();
+ // Both getters are queried with the fixture's tx_size member.
+ predict = std::get<1>(param)(tx_size);
+ predict_ref = cfl_get_predict_lbd_fn_c(tx_size);
+ }
+
+ protected:
+ cfl_predict_lbd_fn predict;  // predictor under test
+ cfl_predict_lbd_fn predict_ref;  // C reference predictor
+};
+// The suite may legitimately have no instantiation in a given build.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictTest);
+
+// Runs the predictor under test and the C reference NUM_ITERATIONS times on
+// freshly generated data and requires identical chroma output every time.
+TEST_P(CFLPredictTest, PredictTest) {
+ const int bit_depth = 8;
+ for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
+ randData(bit_depth);
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+ assert_eq<uint8_t>(chroma_pels, chroma_pels_ref, width, height);
+ }
+}
+// Disabled-by-default benchmark: times NUM_ITERATIONS_SPEED calls of the C
+// reference predictor, then of the predictor under test, and passes both
+// timings to printSpeed() and assertFaster().
+TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+ randData(8);
+
+ // Reference pass.
+ aom_usec_timer_start(&ref_timer);
+ for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int ref_elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ // Pass under test.
+ aom_usec_timer_start(&timer);
+ for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+ }
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ printSpeed(ref_elapsed_time, elapsed_time, width, height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Getter signature: maps a transform size to a high-bitdepth CfL predictor.
+using get_predict_fn_hbd = cfl_predict_hbd_fn (*)(TX_SIZE tx_size);
+// Test parameter: a transform size and the getter of the predictor under test.
+using predict_param_hbd = std::tuple<TX_SIZE, get_predict_fn_hbd>;
+
+// High-bitdepth prediction fixture. SetUp() resolves the predictor under test
+// from the test parameter and the C predictor as the reference.
+class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
+ public CFLTestWithAlignedData<uint16_t> {
+ public:
+ ~CFLPredictHBDTest() override = default;
+
+ void SetUp() override {
+ const predict_param_hbd &param = this->GetParam();
+ CFLTest::init(std::get<0>(param));
+ CFLTestWithAlignedData::init();
+ // Both getters are queried with the fixture's tx_size member.
+ predict = std::get<1>(param)(tx_size);
+ predict_ref = cfl_get_predict_hbd_fn_c(tx_size);
+ }
+
+ protected:
+ cfl_predict_hbd_fn predict;  // predictor under test
+ cfl_predict_hbd_fn predict_ref;  // C reference predictor
+};
+// The suite may legitimately have no instantiation in a given build.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CFLPredictHBDTest);
+
+// Runs the high-bitdepth predictor under test and the C reference
+// NUM_ITERATIONS times at bit depth 12 on freshly generated data and requires
+// identical chroma output every time.
+TEST_P(CFLPredictHBDTest, PredictHBDTest) {
+ const int bd = 12;
+ for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
+ randData(bd);
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+ assert_eq<uint16_t>(chroma_pels, chroma_pels_ref, width, height);
+ }
+}
+// Disabled-by-default benchmark at bit depth 12: times NUM_ITERATIONS_SPEED
+// calls of the C reference predictor, then of the predictor under test, and
+// passes both timings to printSpeed() and assertFaster().
+TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) {
+ const int bd = 12;
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+ randData(bd);
+
+ // Reference pass.
+ aom_usec_timer_start(&ref_timer);
+ for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int ref_elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ // Pass under test.
+ aom_usec_timer_start(&timer);
+ for (int run = 0; run < NUM_ITERATIONS_SPEED; ++run) {
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+ }
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ printSpeed(ref_elapsed_time, elapsed_time, width, height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if HAVE_SSE2  // SSE2 builds: only the subtract-average suite gets an SSE2 instantiation here.
+const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES(  // parameter table built by ALL_CFL_TX_SIZES from the SSE2 getter
+ cfl_get_subtract_average_fn_sse2) };
+
+INSTANTIATE_TEST_SUITE_P(SSE2, CFLSubAvgTest,  // registers the table under the "SSE2" prefix
+ ::testing::ValuesIn(sub_avg_sizes_sse2));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3  // SSSE3 builds: subsample and predict suites (LBD always, HBD when enabled).
+const subsample_lbd_param subsample_lbd_sizes_ssse3[] = {  // getters passed in 420, 422, 444 order
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3,
+ cfl_get_luma_subsampling_422_lbd_ssse3,
+ cfl_get_luma_subsampling_444_lbd_ssse3)
+};
+
+const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_lbd_fn_ssse3) };
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_ssse3));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {  // high-bitdepth counterpart of the table above
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
+ cfl_get_luma_subsampling_422_hbd_ssse3,
+ cfl_get_luma_subsampling_444_hbd_ssse3)
+};
+
+const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_hbd_fn_ssse3) };
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_ssse3));
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_ssse3));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2  // AVX2 builds: subtract-average, subsample and predict suites (HBD when enabled).
+const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+ cfl_get_subtract_average_fn_avx2) };
+
+const subsample_lbd_param subsample_lbd_sizes_avx2[] = {  // getters passed in 420, 422, 444 order
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2,
+ cfl_get_luma_subsampling_422_lbd_avx2,
+ cfl_get_luma_subsampling_444_lbd_avx2)
+};
+
+const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_lbd_fn_avx2) };
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_avx2));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_avx2));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_avx2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const subsample_hbd_param subsample_hbd_sizes_avx2[] = {  // high-bitdepth counterpart of the LBD subsample table
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2,
+ cfl_get_luma_subsampling_422_hbd_avx2,
+ cfl_get_luma_subsampling_444_hbd_avx2)
+};
+
+const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_hbd_fn_avx2) };
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_avx2));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_avx2));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_AVX2
+
+#if HAVE_NEON  // NEON builds: subtract-average, subsample and predict suites (HBD when enabled).
+const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
+ cfl_get_subtract_average_fn_neon) };
+
+const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_lbd_fn_neon) };
+
+const subsample_lbd_param subsample_lbd_sizes_neon[] = {  // getters passed in 420, 422, 444 order
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon,
+ cfl_get_luma_subsampling_422_lbd_neon,
+ cfl_get_luma_subsampling_444_lbd_neon)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const subsample_hbd_param subsample_hbd_sizes_neon[] = {  // high-bitdepth counterpart of the LBD subsample table
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
+ cfl_get_luma_subsampling_422_hbd_neon,
+ cfl_get_luma_subsampling_444_hbd_neon)
+};
+
+const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES(
+ cfl_get_predict_hbd_fn_neon) };
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_neon));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_NEON
+
+#if HAVE_VSX  // VSX builds: only the subtract-average suite gets a VSX instantiation here.
+const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES(
+ cfl_get_subtract_average_fn_vsx) };
+
+INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest,  // registers the table under the "VSX" prefix
+ ::testing::ValuesIn(sub_avg_sizes_vsx));
+#endif  // HAVE_VSX
+} // namespace
diff --git a/third_party/aom/test/cnn_test.cc b/third_party/aom/test/cnn_test.cc
new file mode 100644
index 0000000000..e5114b56ce
--- /dev/null
+++ b/third_party/aom/test/cnn_test.cc
@@ -0,0 +1,2661 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/util.h"
+
+#define SQR(x) ((x) * (x))  // x is expanded twice: pass only side-effect-free expressions.
+
+// Best possible pixelwise guaranteed precision given each float has at most
+// 3 specified decimals.
+#define PIXELWISE_FLOAT_TOL 1E-2  // per-value bound used by EXPECT_NEAR in RunMultiOutCNNTest
+
+#define MSE_FLOAT_TOL 1E-6
+#define MSE_INT_TOL 0  // passed as the mean-squared-error bound (checked with EXPECT_LE), i.e. an exact match
+
+// CNN convolve pixelwise error threshold for functional equivalence.
+#define CNN_CONVOLVE_PIXELWISE_FLOAT_TOL 1E-3f
+
+namespace {
+
+// Test fixture with static helpers that run av1_cnn_predict() on a CNN
+// configuration and compare every output value with an expected array.
+class CNNTest : public ::testing::Test {
+ protected:
+ // Runs a network with a single output and checks it against |expected|.
+ // Allocates one contiguous output buffer sized from
+ // av1_find_cnn_output_size(), slices it into per-channel pointers, and
+ // delegates prediction and comparison to RunMultiOutCNNTest().
+ // |tolerance| is the maximum allowed mean squared error.
+ static void RunCNNTest(int image_width, int image_height, const float *input,
+ const float *expected, const CNN_CONFIG *cnn_config,
+ int in_stride, CNN_THREAD_DATA *thread_data,
+ double tolerance) {
+ int out_width, out_height, out_channels;
+ av1_find_cnn_output_size(image_width, image_height, cnn_config, &out_width,
+ &out_height, &out_channels);
+
+ const int out_size = out_width * out_height;
+ const int out_stride = out_width;
+
+ float *output_ =
+ (float *)aom_malloc(sizeof(*output_) * out_size * out_channels);
+ ASSERT_NE(output_, nullptr);
+ float *output[CNN_MAX_CHANNELS] = { nullptr };
+ for (int channel = 0; channel < out_channels; ++channel) {
+ output[channel] = output_ + (channel * out_size);
+ }
+ const int num_outputs = 1;
+ const int output_chs[1] = { out_channels };
+ const int output_strides[1] = { out_stride };
+ CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_strides,
+ output };
+
+ // A fatal failure inside the helper only returns from the helper, so the
+ // buffer is still released below.
+ RunMultiOutCNNTest(&input, image_width, image_height, in_stride, cnn_config,
+ thread_data, &output_struct, &expected, tolerance);
+
+ aom_free(output_);
+ }
+
+ // Runs av1_cnn_predict() into the caller-provided |output| and compares each
+ // of its outputs with the matching array in |expected|. Every value must be
+ // within PIXELWISE_FLOAT_TOL of its expectation, and the mean squared error
+ // of each output must not exceed |tolerance|. |expected[i]| holds the
+ // channels of output i back to back.
+ static void RunMultiOutCNNTest(const float **input, int image_width,
+ int image_height, int in_stride,
+ const CNN_CONFIG *cnn_config,
+ CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output, const float **expected,
+ double tolerance) {
+ const int num_outputs = output->num_outputs;
+ const int *output_chs = output->output_channels;
+
+ // Zero-initialised scratch arrays. std::vector releases them on every exit
+ // path, including the early return taken when the fatal ASSERT_TRUE below
+ // fails.
+ std::vector<int> out_widths(num_outputs);
+ std::vector<int> out_heights(num_outputs);
+ std::vector<int> not_used(num_outputs);
+
+ av1_find_cnn_output_size(image_width, image_height, cnn_config,
+ out_widths.data(), out_heights.data(),
+ not_used.data());
+ ASSERT_TRUE(av1_cnn_predict(input, image_width, image_height, in_stride,
+ cnn_config, thread_data, output));
+
+ // Index into output->output_buffer; it keeps advancing across outputs.
+ int channel_offset = 0;
+ for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+ const float *expected_out = expected[output_idx];
+ const int curr_output_chs = output_chs[output_idx];
+ const int out_size = out_widths[output_idx] * out_heights[output_idx];
+
+ double mse = 0;
+ int expected_ite = 0;
+ for (int channel = 0; channel < curr_output_chs; ++channel) {
+ const float *buf_out = output->output_buffer[channel_offset];
+
+ for (int i = 0; i < out_size; ++i) {
+ EXPECT_NEAR(expected_out[expected_ite], buf_out[i],
+ PIXELWISE_FLOAT_TOL)
+ << " output " << output_idx << " channel " << channel << " pixel "
+ << expected_ite % out_size << ": " << expected_out[expected_ite]
+ << "/" << buf_out[i] << std::endl;
+ mse += SQR(expected_out[expected_ite] - buf_out[i]);
+ expected_ite++;
+ }
+
+ channel_offset++;
+ }
+ mse /= (out_size * curr_output_chs);
+ EXPECT_LE(mse, tolerance) << " output " << output_idx << std::endl;
+ }
+ }
+
+ // Points every layer of |cnn_config| at its slice of the flat |weights| and
+ // |bias| arrays. Layers are packed in order: each consumes
+ // filter_width * filter_height * in_channels * out_channels weights and
+ // out_channels biases.
+ static void AssignLayerWeightsBiases(CNN_CONFIG *cnn_config, float *weights,
+ float *bias) {
+ size_t weight_offset = 0;
+ size_t bias_offset = 0;
+ for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+ CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+ layer_config->weights = weights + weight_offset;
+ layer_config->bias = bias + bias_offset;
+ weight_offset += layer_config->filter_width *
+ layer_config->filter_height * layer_config->in_channels *
+ layer_config->out_channels;
+ bias_offset += layer_config->out_channels;
+
+ ASSERT_NE(layer_config->weights, nullptr);
+ ASSERT_NE(layer_config->bias, nullptr);
+ }
+ }
+};
+
+} // namespace
+
+TEST_F(CNNTest, TestMultilayerConvolution) {  // Three stacked layers on a 16x16 input, checked under three padding modes.
+ int image_height = 16;
+ int image_width = 16;
+ int filter_height = 5;
+ int filter_width = 4;
+
+ float input[] = {
+ -3, 1, -3, 2, -2, -2, 2, -2, 1, -2, -3, 1, 2, 2, 2, -2, 0, 1, -1,
+ -3, -1, -1, 1, 0, -3, 1, 0, -1, 1, 0, 0, -3, -3, -3, 0, 2, 1, -1,
+ 2, 0, 1, -3, -1, 2, 2, 1, -2, 0, -1, 0, -2, -2, -1, 1, 0, 0, 0,
+ -2, -2, -2, 1, 1, -2, 1, 1, -2, -2, 1, -2, -1, -2, -3, 2, -3, -1, 1,
+ 0, -2, -2, -2, 1, -2, -2, -1, -1, 2, 2, 2, -1, 1, -3, -3, 0, 2, 0,
+ 2, 1, -3, -3, 1, 2, 2, 1, -2, -3, 0, -3, 0, -3, -2, 0, 1, 1, 0,
+ -3, 2, -1, 2, 1, 0, 1, -2, 1, -1, -1, 2, 0, -2, -3, 1, 1, -2, -1,
+ -3, -3, -1, 0, -3, -2, 0, 0, 1, 0, -3, -2, -1, 1, 0, 2, 1, 0, -3,
+ -2, -3, -3, -1, 0, -2, 2, -1, -3, 0, -1, -1, 2, 0, -3, -2, -1, 0, 0,
+ 1, -2, 1, 2, 1, 2, 2, -3, 2, -1, 0, 0, -1, 0, 2, 2, -1, 2, -2,
+ 1, 1, -3, -3, 1, -1, -1, -2, 2, -2, -2, 2, -1, -3, 2, -3, 1, -1, -1,
+ -3, 1, -1, 1, 0, -3, -3, 1, -3, -3, 0, 2, 2, -2, -1, 2, 0, 2, 1,
+ -1, -3, 0, 0, -1, -1, 1, 0, 2, 0, -3, 2, 1, 0, 1, -3, 2, -3, -3,
+ -1, -3, -3, 2, 0, 2, -2, 1, -1,
+ };
+
+ float weights[] = {  // flat weights for all layers; split per layer by AssignLayerWeightsBiases()
+ -2, 2, -2, 2, -1, -3, 2, 2, 0, 0, -3, -1, -2, -3, 1, -1, 0, 0, 0,
+ 2, -2, 2, -2, -3, 1, 1, 1, -3, -1, 0, 1, 2, -2, 0, -1, -3, -1, -2,
+ 2, -3, -3, 1, -2, -3, 0, 2, 1, -3, -3, -1, -3, -2, -1, -3, -1, -3, -2,
+ -1, -3, -1, -2, -2, -3, 2, 0, -3, 0, -3, -3, 1, -3, -1, 0, -1, 1, 1,
+ -1, 1, -2, 0, 2, 0, -3, 1, -1, -1, 2, 0, 1, -3, -3, 1, 2, -3, -3,
+ 1, -3, 2, 0, -3, 1, 2, 2, -2, -1, -2, 1, 1, 0, -2, -2, 1, 2, -1,
+ -3, 1, -2, 2, -3, -2, -3, 2, 1, 0, -2, 0, 1, -3, 2, -2, -2, 0, 2,
+ -3, 2, 0, 0, 1, -2, 1, 1, -2, -1, -2, 1, -2, 0, -2, -2, 0, -1, -1,
+ -3, -3, -3, 1, -3, -2, 2, -1, 2, 0, 2, -2, 2, -2, 1, -3, -3, -1, 0,
+ 2, 2, 1, -1, -3, -1, -3, 2, 1, -2, 0, -3, -1, -3, -1, 2, 1, 0, 2,
+ -1, 1, 0, 1, 2, -1, -2, 2, 1, -3, -1, -3, 0, 1, -2, 0, -2, -3, 0,
+ -2, 2, 2, 0, 0, 2, -3, 2, -3, -2, 1, 2, -3, -3, -1, -3, 0, -3, -3,
+ -2, -2, -2, 0, 0, 1, 0, 0, -1, 0, 0, -3, 0, -3, -1, -2, 1, -2, -1,
+ 2, -2, 0, 0, 1, 0, -2, -1, 0, -3, 1, 0, -1, -3, 1, -1, 1, -1, -3,
+ 1, 0, 1, 1, -1, 2, 2, 0, 0, 1, -3, 2, -2, -2, -3, -2, -1, -2, 2,
+ 0, 2, -2, -3, -1, -3, 2, 2, -1, 2, 2, -1, 0, -3, 1,
+ };
+
+ float bias[] = {  // flat biases for all layers; split per layer by AssignLayerWeightsBiases()
+ 1, -1, 0, 1, 1, 1, -2,
+ };
+
+ float expected_same[] = {  // expected output with PADDING_SAME_ZERO
+ -1125, 2926, 6406, 631, -1244, 97, -1454, 2526, 1065, 3292, 3464,
+ 2553, -330, 532, 1038, 1182, -402, 3758, 3392, 9854, 4365, 1408,
+ 4736, 3134, 3838, 2409, 3221, 4350, 6750, 4045, 815, 1188, 2959,
+ 9802, 9590, 4572, 5740, 4253, 1701, 7974, 7012, 6854, 7093, 3907,
+ 4539, 3886, 4267, 3505, 465, 7824, 9219, 10026, 7968, 957, 2295,
+ 5594, 10811, 9641, 5950, 10043, 8783, 3132, 1421, 1110, 4108, 13929,
+ 10660, -84, -61, 3932, -180, 6811, 13393, 15147, 15640, 9337, 6961,
+ 3808, 1604, 1398, 1047, 6739, 10144, 6517, 4698, 2678, 7389, 2595,
+ 5248, 12075, 11272, 13951, 8820, 1090, 2199, 2206, 2788, 12116, 6683,
+ 2612, -291, 3183, 9414, 12316, 14524, 12333, 13208, 7832, 4664, 4657,
+ 3534, 1298, -666, 4250, 7707, 9103, 5760, 688, 9571, 15782, 14203,
+ 14878, 17339, 14684, 8690, 5671, 875, 1429, 1531, 6173, 2984, 5558,
+ 2996, 7928, 6733, 16117, 15262, 12757, 7980, 3923, 4795, 5973, 2051,
+ 455, -1922, 1816, 5906, 3321, 10908, 10910, 7377, 12204, 12809, 11195,
+ 7451, 6666, 74, -1645, -35, -391, 3813, 7324, 892, 1656, 6095,
+ 12193, 14648, 12156, 14663, 10251, 10325, 7821, 3925, 323, 697, 442,
+ 1324, 4669, 7002, 5485, 5171, 5086, 10582, 11053, 9709, 11353, 8543,
+ 5256, 2873, 235, -628, 1496, 1878, -867, 3420, 6865, 5937, 10182,
+ 13277, 10069, 10789, 5998, 624, -2082, 4417, 1258, -1080, -819, -1430,
+ 1033, 5220, 6335, 8471, 8980, 11908, 14430, 12584, 8404, 1576, -803,
+ 985, 1481, 1367, -193, 873, 3684, 2288, 6676, 9477, 11155, 9602,
+ 9707, 10507, 4739, 3174, -575, -178, 3002, 1710, 423, -477, 554,
+ 3088, 2029, 5113, 5000, 3771, 6090, 5365, 1185, 2855, 399, -312,
+ -1577, 176, 955,
+ };
+
+ float expected_replicate[] = {  // expected output with PADDING_SAME_REPLICATE
+ 13768, 13528, 12999, 6906, 4618, 4043, 2611, 9955, 6685, 4776, 2753,
+ 1036, 3063, 4544, 5183, 7349, 12451, 12501, 9131, 12753, 8908, 4058,
+ 6299, 7542, 7115, 3307, 3360, 3543, 9754, 7808, 5991, 9019, 14320,
+ 14919, 12492, 6871, 7373, 3336, 2085, 10604, 9377, 6882, 5009, 3103,
+ 6220, 6278, 7588, 10196, 11045, 11563, 11842, 11911, 8279, 2030, 1858,
+ 6368, 12123, 9909, 6347, 10345, 9365, 4038, 1673, 3051, 16492, 16649,
+ 12276, 408, -301, 4122, -654, 7864, 14038, 15279, 15315, 9744, 8243,
+ 5298, 746, 380, 9824, 9124, 10895, 6640, 4712, 2669, 6980, 2759,
+ 5385, 12345, 11336, 13129, 8600, 2370, 3682, 5219, 12407, 13123, 6784,
+ 2612, -291, 3183, 9414, 12316, 14524, 12333, 13397, 7543, 3916, 4153,
+ 4477, 4314, 7983, 8418, 9163, 9103, 5760, 688, 9571, 15782, 14203,
+ 14878, 17718, 14570, 7940, 6642, 5094, 7133, 9964, 10219, 3224, 5558,
+ 2996, 7928, 6733, 16117, 15262, 12757, 7958, 4401, 5187, 5476, 5529,
+ 6055, 2206, 3909, 6015, 3321, 10908, 10910, 7377, 12204, 12809, 11195,
+ 6967, 6840, 481, -1600, 274, 1, 10373, 8514, 1123, 2117, 6758,
+ 12736, 16223, 13585, 15988, 11771, 10600, 7918, 4156, 2840, 3111, 3287,
+ 6359, 7652, 8813, 6530, 6967, 7789, 13671, 13990, 13247, 13241, 9836,
+ 5251, 3024, 2313, 1834, 4187, 2637, -1312, 2139, 7378, 7665, 11933,
+ 15591, 15314, 15678, 9531, 2820, -1516, 3400, 1314, 22, 363, -2896,
+ -898, 5906, 7308, 10650, 12975, 16978, 20370, 18817, 12381, 4118, -861,
+ -137, 236, 1802, 1632, -350, 2334, 3400, 8680, 14064, 18216, 18675,
+ 21765, 22871, 11491, 4937, -1555, -11, 1669, 2392, 3265, -5254, -217,
+ 5001, 8063, 13444, 18884, 19706, 22794, 21064, 9545, 6689, -7, 289,
+ -2021, 504, 2347,
+ };
+
+ float expected_valid[] = {  // expected output with PADDING_VALID
+ 2612, -291, 3183, 9414, 12316, 14524, 12333, 9103, 5760, 688,
+ 9571, 15782, 14203, 14878, 5558, 2996, 7928, 6733, 16117, 15262,
+ 12757, 3321, 10908, 10910, 7377, 12204, 12809, 11195,
+ };
+
+ CNN_CONFIG cnn_config = { 3,  // three layer initialisers follow; field meanings are declared in av1/encoder/cnn.h
+ 0,
+ 0,
+ 0,
+ 0,
+ {
+ {
+ 1,
+ filter_width,
+ filter_height,
+ 3,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ 3,
+ filter_width,
+ filter_height,
+ 3,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ 3,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ },
+ } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,  // pass 1: padding as initialised (PADDING_SAME_ZERO)
+ image_width, &thread_data, MSE_INT_TOL);
+
+ for (int i = 0; i < cnn_config.num_layers; ++i) {  // pass 2: every layer switched to replicate padding
+ cnn_config.layer_config[i].pad = PADDING_SAME_REPLICATE;
+ }
+
+ RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ for (int i = 0; i < cnn_config.num_layers; ++i) {  // pass 3: every layer switched to valid padding
+ cnn_config.layer_config[i].pad = PADDING_VALID;
+ }
+
+ RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestRELUSingleLayer) {  // One layer with RELU activation on an 8x8 input, checked under three padding modes.
+ int image_width = 8;
+ int image_height = 8;
+ int filter_height = 5;
+ int filter_width = 4;
+ float input[] = {
+ 0, -2, -3, 1, -1, 2, -2, 1, -3, -1, 0, 1, -2, -3, -2, -2,
+ 1, -3, 2, -3, -1, -1, 2, 0, -2, -3, 0, -2, -3, 1, -1, -1,
+ 2, -2, 0, -2, -3, -3, 1, 1, -1, 1, 0, 1, -3, 0, 2, 2,
+ 0, -3, 1, -3, 2, -2, 1, -1, -1, -2, -3, -2, -1, -3, -2, -1,
+ };
+ float expected_same[] = {  // expected output with PADDING_SAME_ZERO
+ 9, 0, 1, 1, 0, 3, 0, 19, 0, 12, 10, 0, 0, 0, 5, 0,
+ 0, 18, 21, 7, 19, 4, 3, 0, 0, 9, 16, 0, 11, 16, 0, 11,
+ 12, 2, 0, 11, 0, 16, 6, 0, 8, 22, 13, 10, 12, 0, 0, 0,
+ 0, 1, 2, 12, 29, 6, 10, 0, 13, 0, 0, 5, 8, 10, 0, 0,
+ };
+ float expected_replicate[] = {  // expected output with PADDING_SAME_REPLICATE
+ 18, 17, 12, 2, 0, 0, 5, 11, 0, 17, 22, 6, 0, 0, 17, 0,
+ 0, 18, 21, 7, 19, 4, 3, 5, 3, 9, 16, 0, 11, 16, 0, 3,
+ 3, 2, 0, 11, 0, 16, 6, 0, 17, 22, 13, 10, 12, 0, 0, 0,
+ 0, 4, 1, 10, 30, 7, 10, 0, 23, 8, 0, 13, 15, 19, 8, 10,
+ };
+ float expected_valid[] = {  // expected output with PADDING_VALID
+ 18, 21, 7, 19, 4, 9, 16, 0, 11, 16, 2, 0, 11, 0, 16, 22, 13, 10, 12, 0,
+ };
+ float weights[] = {  // 20 values = filter_width (4) * filter_height (5)
+ -2, -3, 1, 2, 2, -2, -3, 0, -3, 2, 2, -3, -3, -2, 0, 1, 2, 0, -1, -1,
+ };
+ float bias[] = { -3 };
+
+ CNN_CONFIG cnn_config = { 1,  // a single layer initialiser follows; field meanings are declared in av1/encoder/cnn.h
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ RELU,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ } } };
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,  // pass 1: padding as initialised (PADDING_SAME_ZERO)
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE;  // pass 2: replicate padding
+
+ RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].pad = PADDING_VALID;  // pass 3: valid padding
+
+ RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestVaryingStridesVaryingDimImages) {  // One layer on a 17x24 input, re-run with four skip_width/skip_height settings.
+ float weights[] = {  // 44 values, matching the 4 and 11 filter dimensions in the config below
+ 1, -5, -3, -4, -1, 1, 2, -3, 2, 2, -1, 1, -5, 1, 1,
+ -3, -5, 3, 1, 4, -2, -5, -2, -3, -5, 0, -1, -5, 2, -2,
+ -2, 1, -2, -4, 1, 3, -2, 2, 0, -3, 2, -3, -2, -3,
+ };
+ float bias[] = { 2 };
+
+ CNN_CONFIG cnn_config = { 1,  // a single layer initialiser follows; field meanings are declared in av1/encoder/cnn.h
+ 0,
+ 0,
+ 0,
+ 0,
+ {
+ {
+ 1,
+ 4,
+ 11,
+ 1,
+ 7,
+ 6,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ },
+ } };
+
+ int image_height = 24;
+ int image_width = 17;
+ float input[] = {
+ -1, -3, 4, 4, -5, 4, 3, -5, -1, -3, 4, -4, 2, -3, 3, -5, 2, -1, -5,
+ 1, -1, 3, 1, -3, -3, 4, 0, 2, -3, -5, -5, -4, 0, -5, -2, -3, -1, -2,
+ 2, -5, 4, 4, 0, -4, -3, 1, -3, -5, -4, -4, 1, -2, -3, 3, -3, -3, -1,
+ -5, -5, -2, 3, 1, -1, -5, -5, 1, -4, -2, -1, -2, -4, -4, 2, -2, 2, 1,
+ -2, -4, -1, 1, -2, -5, 3, -2, -1, -1, -5, -3, 1, -2, -2, -3, -1, -2, -4,
+ -2, 1, -4, -1, 4, 3, -4, 0, 4, 2, 2, 4, -3, -5, 2, 2, 1, -1, -4,
+ -2, 1, 3, 2, 0, 4, -1, -3, 2, 1, -4, 2, 2, -4, -2, 0, -2, -1, 4,
+ 4, 2, 3, -4, 2, -4, -5, 4, -1, -3, -1, 0, -4, 1, 3, -1, -3, -5, 3,
+ -2, -4, 1, 2, -2, -3, -3, -5, 1, -3, -1, 0, -1, 3, -4, -1, -5, -5, 1,
+ 0, 0, -2, -2, 2, -2, 0, 0, 2, 0, -3, 0, -1, -4, -4, -1, 3, -4, -4,
+ -1, 0, -5, -3, -2, 4, -3, -4, -4, 0, -5, 1, -2, -3, -3, -4, 4, 3, 4,
+ 3, 3, -1, 3, 1, -3, -2, 3, 3, 0, 2, -4, -3, 2, 2, 0, -2, 4, -2,
+ 2, -2, -1, -4, -2, 2, -4, 3, -1, 4, 1, 1, 4, -1, -4, -4, 1, 1, -2,
+ 4, -1, 3, 2, -3, 4, 3, 1, 4, 0, -4, 2, 0, 2, 4, -2, -2, 4, 2,
+ -1, -2, 1, -3, 2, 3, -5, -3, 4, 4, 2, -5, -4, -5, -2, -4, 2, 0, 2,
+ -5, 4, -4, -2, -5, 2, 1, 0, 4, 1, -2, -3, -4, -3, -4, 3, 3, 2, 0,
+ -3, 1, -5, 4, 0, 4, -1, 3, -5, -5, -2, -1, -1, 4, 3, 3, 4, 3, -4,
+ 4, -3, -3, -1, -4, -1, -4, -1, -2, 4, -2, -4, 4, 4, -3, -4, -1, 1, 2,
+ -1, -2, -2, 3, 2, 2, -3, 0, -1, 0, 3, 2, -5, 0, -4, 0, 0, 2, -4,
+ -1, -1, 0, -2, 0, 1, 0, 0, 4, -5, -1, -5, 2, -1, 0, 2, -1, 1, 3,
+ -3, -5, -2, -3, 4, -2, -2, -1, -3, -4, -1, -2, -4, 1, 4, -3, -2, -1, 3,
+ -3, -2, 3, 2, 1, -4, -3, -5, 1,
+ };
+ float expected_1[] = {  // expected output for the skip values given in the initialiser above
+ 41, -26, 5, 76, 13, 83, -21, 53, -54, -14, 21, 121,
+ };
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected_1, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].skip_width = 6;  // second setting
+ cnn_config.layer_config[0].skip_height = 7;
+
+ float expected_2[] = {
+ 21, -50, 41, 20, 72, 127, -21, 103, 62, -37, 83, -3,
+ };
+ RunCNNTest(image_width, image_height, input, expected_2, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].skip_width = 3;  // third setting
+ cnn_config.layer_config[0].skip_height = 10;
+
+ float expected_3[] = {
+ -26, -21, -35, 69, 49, 4, -51, -43, -56,
+ -41, 15, -44, 40, -62, 63, 38, 27, 47,
+ };
+ RunCNNTest(image_width, image_height, input, expected_3, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].skip_width = 10;  // fourth setting
+ cnn_config.layer_config[0].skip_height = 3;
+
+ float expected_4[] = {
+ 21, 49, 28, 87, 50, 40, 102, 81, 58, 85, 51, 66, 36, 19, -37, -45,
+ };
+
+ RunCNNTest(image_width, image_height, input, expected_4, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestMaxPool) {  // One layer on an 8x8 input with `stride` (3) given twice in the config; 9 expected values.
+ int image_width = 8;
+ int image_height = 8;
+ int stride = 3;
+ float input[] = {
+ 1, -4, -4, 8, 0, 7, -5, -2, 8, 2, 2, 8, 5, -1, -1, 9,
+ -3, 0, -2, 0, 6, 3, -4, 8, 7, 8, 7, -1, 4, -1, 0, 2,
+ -5, -2, 8, 5, 5, 4, 2, 7, 4, 6, 2, 8, 8, -4, -3, -4,
+ -3, -1, 2, 3, 3, 6, -5, 8, 9, 5, 0, -2, -1, 6, 5, 7,
+ };
+
+ float expected[] = {
+ 49, 58, 70, 68, 68, 70, 48, 57, 88,
+ };
+
+ float weights[] = {  // 9 values, matching the two 3s given as filter dimensions below
+ 3, 1, 3, 4, -1, 5, -2, 1, -4,
+ };
+
+ float bias[] = {
+ -3,
+ };
+
+ CNN_CONFIG cnn_config = { 1,  // a single layer initialiser follows; field meanings are declared in av1/encoder/cnn.h
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ 3,
+ 3,
+ 1,
+ stride,
+ stride,
+ 1,  // NOTE(review): presumably the maxpool flag, going by the test name; confirm against CNN_LAYER_CONFIG
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ } } };
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestDeconvolveNonActivationSingleLayerSingleKernel) {
+ int image_width = 4;
+ int image_height = 7;
+ float input[] = {
+ 9, 6, 181, 9, 218, 30, 80, 108, 68, 216, 70, 128, 179, 228,
+ 33, 212, 34, 14, 48, 27, 230, 23, 202, 113, 80, 56, 122, 112,
+ };
+
+ float expected_1_same[] = {
+ 15, -30, 36, -525, 377, -193, 558, 531, 6, -24, -15, 124,
+ 166, -561, -356, -754, -3, -3, -3, -3, -3, -3, -3, -3,
+ 433, -311, 711, 381, 247, -317, 453, 129, 215, -627, -409, -885,
+ 17, -255, -55, -647, -3, -3, -3, -3, -3, -3, -3, -3,
+ 133, -719, 633, -225, 785, 191, 463, 79, 65, 9, 77, -853,
+ -365, -949, -15, -667, -3, -3, -3, -3, -3, -3, -3, -3,
+ 355, -866, 990, 207, 747, 12, 520, -116, 176, -312, -133, -1370,
+ -426, -802, 143, -771, -3, -3, -3, -3, -3, -3, -3, -3,
+ 65, -79, 127, -59, 135, -90, 195, 114, 31, -91, -57, -133,
+ 17, -176, -72, -276, -3, -3, -3, -3, -3, -3, -3, -3,
+ 457, -302, 733, 58, 470, -475, 829, 490, 227, -670, -440, -790,
+ 153, -588, -294, -1150, -3, -3, -3, -3, -3, -3, -3, -3,
+ 157, -251, 349, -185, 409, -293, 587, 251, 77, -187, -107, -369,
+ 7, -481, -135, -827, -3, -3, -3, -3, -3, -3, -3, -3,
+ };
+ float expected_1_valid[] = {
+ -30, 15, -30, 36, -525, 377, -193, 558, 531, 24, 24, 6,
+ 6, -24, -15, 124, 166, -561, -356, -754, -21, -39, -3, -3,
+ -3, -3, -3, -3, -3, -3, -3, -3, -3, -657, 433, -311,
+ 711, 381, 247, -317, 453, 129, 321, 321, 215, 215, -627, -409,
+ -885, 17, -255, -55, -647, -219, -435, -3, -3, -3, -3, -3,
+ -3, -3, -3, -3, -3, -3, -207, 133, -719, 633, -225, 785,
+ 191, 463, 79, 381, 381, 65, 65, 9, 77, -853, -365, -949,
+ -15, -667, -259, -515, -3, -3, -3, -3, -3, -3, -3, -3,
+ -3, -3, -3, -540, 355, -866, 990, 207, 747, 12, 520, -116,
+ 633, 633, 176, 176, -312, -133, -1370, -426, -802, 143, -771, -427,
+ -851, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
+ -105, 65, -79, 127, -59, 135, -90, 195, 114, 78, 78, 31,
+ 31, -91, -57, -133, 17, -176, -72, -276, -57, -111, -3, -3,
+ -3, -3, -3, -3, -3, -3, -3, -3, -3, -693, 457, -302,
+ 733, 58, 470, -475, 829, 490, 336, 336, 227, 227, -670, -440,
+ -790, 153, -588, -294, -1150, -229, -455, -3, -3, -3, -3, -3,
+ -3, -3, -3, -3, -3, -3, -243, 157, -251, 349, -185, 409,
+ -293, 587, 251, 333, 333, 77, 77, -187, -107, -369, 7, -481,
+ -135, -827, -227, -451,
+ };
+ float weights_1[] = { -3, 2, -1, 3, 3, 1, 1, -3, -2, -4 };
+ float bias_1[] = { -3 };
+
+ CNN_CONFIG cnn_config = { 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ 5,
+ 2,
+ 1,
+ 2,
+ 3,
+ 0,
+ weights_1,
+ bias_1,
+ PADDING_SAME_ZERO,
+ NONE,
+ 1,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ } } };
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected_1_same, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ // Change padding to valid
+ cnn_config.layer_config[0].pad = PADDING_VALID;
+
+ RunCNNTest(image_width, image_height, input, expected_1_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ float expected_12_same[] = {
+ 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12, 24,
+ 6, -30, -15, -33, -21, 166, 154, -546, -356, -718, -30, -21,
+ 433, -221, 561, 711, -33, -153, 247, -83, -87, 453, -111, 321,
+ 215, -657, -409, -845, -93, 17, -43, -243, -55, -215, -327, -219,
+ 133, -71, -447, 633, -219, 435, 785, -73, -177, 463, -131, 381,
+ 65, -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259,
+ 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215, 633,
+ 176, -540, -133, -491, -687, -426, -882, -102, 143, 77, -639, -427,
+ 65, -37, 57, 127, -17, -105, 135, -51, 60, 195, -30, 78,
+ 31, -105, -57, -125, -45, 17, -11, -147, -72, -168, -84, -57,
+ 457, -233, 618, 733, -26, -540, 470, -205, 264, 829, -116, 336,
+ 227, -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229,
+ 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115, 333,
+ 77, -243, -107, -267, -171, 7, -105, -369, -135, -379, -339, -227,
+ };
+ float expected_12_valid[] = {
+ -30, 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12,
+ 24, 24, 6, 6, -30, -15, -33, -21, 166, 154, -546, -356,
+ -718, -30, -21, -39, -657, 433, -221, 561, 711, -33, -153, 247,
+ -83, -87, 453, -111, 321, 321, 215, 215, -657, -409, -845, -93,
+ 17, -43, -243, -55, -215, -327, -219, -435, -207, 133, -71, -447,
+ 633, -219, 435, 785, -73, -177, 463, -131, 381, 381, 65, 65,
+ -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259, -515,
+ -540, 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215,
+ 633, 633, 176, 176, -540, -133, -491, -687, -426, -882, -102, 143,
+ 77, -639, -427, -851, -105, 65, -37, 57, 127, -17, -105, 135,
+ -51, 60, 195, -30, 78, 78, 31, 31, -105, -57, -125, -45,
+ 17, -11, -147, -72, -168, -84, -57, -111, -693, 457, -233, 618,
+ 733, -26, -540, 470, -205, 264, 829, -116, 336, 336, 227, 227,
+ -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229, -455,
+ -243, 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115,
+ 333, 333, 77, 77, -243, -107, -267, -171, 7, -105, -369, -135,
+ -379, -339, -227, -451,
+ };
+
+ // Change skip_width, skip_height to {3, 2}
+ cnn_config.layer_config[0].skip_width = 3;
+ cnn_config.layer_config[0].skip_height = 2;
+ // Set padding to same
+ cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+ RunCNNTest(image_width, image_height, input, expected_12_same, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ // Change padding to valid
+ cnn_config.layer_config[0].pad = PADDING_VALID;
+ RunCNNTest(image_width, image_height, input, expected_12_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].filter_width = 4;
+ cnn_config.layer_config[0].filter_height = 3;
+ float weights_2[] = { -1, -3, -1, -3, 0, 2, -2, 4, 3, 0, 1, 4 };
+ float bias_2[] = { -4 };
+ cnn_config.layer_config[0].weights = weights_2;
+ cnn_config.layer_config[0].bias = bias_2;
+
+ cnn_config.layer_config[0].skip_width = 5;
+ cnn_config.layer_config[0].skip_height = 2;
+ float expected_2_same[] = {
+ -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547,
+ -185, -547, -4, -13, -31, -13, -31, -4, -4, 14, -22, 32,
+ -4, -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4,
+ 14, -22, 32, -4, -195, -658, -213, -622, -4, -16, -94, -28,
+ -70, -4, 459, -244, 97, 480, -4, -85, -328, -103, -292, -4,
+ -4, 432, -440, 868, -4, -4, 56, -64, 116, -4, -4, 156,
+ -164, 316, -4, -4, 212, -220, 428, -4, 582, -208, 146, 664,
+ -4, -130, -652, -190, -532, -4, 166, -214, 6, 106, -4, 192,
+ -388, -24, 44, -4, -4, 132, -140, 268, -4, -4, 428, -436,
+ 860, -4, -4, 136, -144, 276, -4, -4, 252, -260, 508, -4,
+ 21, -541, -115, -269, -4, 416, -688, -16, 176, -4, 173, -103,
+ 33, 177, -4, 168, -640, -88, -128, -4, -4, 354, -362, 712,
+ -4, -4, 452, -460, 908, -4, -4, 62, -70, 128, -4, -4,
+ 420, -428, 844, -4, 499, -106, 141, 610, -4, 666, -46, 210,
+ 866, -4, 47, -148, -19, -16, -4, 605, -85, 181, 763, -4,
+ -4, 64, -72, 132, -4, -4, 24, -32, 52, -4, -4, 92,
+ -100, 188, -4, -4, 50, -58, 104, -4, -132, -694, -200, -558,
+ -4, 15, -73, -13, -17, -4, -62, -610, -158, -418, -4, -36,
+ -343, -90, -235, -4, -4, 456, -464, 916, -4, -4, 42, -50,
+ 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448, -4,
+ 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370,
+ 76, 438, -4, 223, -340, -3, 112, -4, -4, 156, -164, 316,
+ -4, -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4,
+ 220, -228, 444, -4,
+ };
+ float expected_2_valid[] = {
+ -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547,
+ -185, -547, -4, -13, -31, -13, -31, -4, 14, -22, 32, -4,
+ -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4, 14,
+ -22, 32, -195, -658, -213, -622, -4, -16, -94, -28, -70, -4,
+ 459, -244, 97, 480, -4, -85, -328, -103, -292, -4, 432, -440,
+ 868, -4, -4, 56, -64, 116, -4, -4, 156, -164, 316, -4,
+ -4, 212, -220, 428, 582, -208, 146, 664, -4, -130, -652, -190,
+ -532, -4, 166, -214, 6, 106, -4, 192, -388, -24, 44, -4,
+ 132, -140, 268, -4, -4, 428, -436, 860, -4, -4, 136, -144,
+ 276, -4, -4, 252, -260, 508, 21, -541, -115, -269, -4, 416,
+ -688, -16, 176, -4, 173, -103, 33, 177, -4, 168, -640, -88,
+ -128, -4, 354, -362, 712, -4, -4, 452, -460, 908, -4, -4,
+ 62, -70, 128, -4, -4, 420, -428, 844, 499, -106, 141, 610,
+ -4, 666, -46, 210, 866, -4, 47, -148, -19, -16, -4, 605,
+ -85, 181, 763, -4, 64, -72, 132, -4, -4, 24, -32, 52,
+ -4, -4, 92, -100, 188, -4, -4, 50, -58, 104, -132, -694,
+ -200, -558, -4, 15, -73, -13, -17, -4, -62, -610, -158, -418,
+ -4, -36, -343, -90, -235, -4, 456, -464, 916, -4, -4, 42,
+ -50, 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448,
+ 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370,
+ 76, 438, -4, 223, -340, -3, 112, -4, 156, -164, 316, -4,
+ -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4, 220,
+ -228, 444, 236, -4, 76, 316, -4, 164, -4, 52, 220, -4,
+ 362, -4, 118, 484, -4, 332, -4, 108, 444,
+ };
+ // Set padding to same
+ cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+ RunCNNTest(image_width, image_height, input, expected_2_same, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].pad = PADDING_VALID;
+
+ RunCNNTest(image_width, image_height, input, expected_2_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].skip_width = 2;
+ cnn_config.layer_config[0].skip_height = 5;
+ float expected_21_same[] = {
+ -31, -19, -49, -191, -565, -194, -574, -13, 14, -22, 44, -16,
+ 382, -366, 738, -22, -4, 23, 32, 545, 20, 204, 720, 5,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -658, -252, -748, -114, -334, -192, -568, -112,
+ 432, -440, 928, -64, 276, -164, 532, -220, -4, 304, 868, 266,
+ 116, 400, 316, 104, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -208, -288, -856, -290,
+ -862, -202, -598, -132, 132, -140, 700, -436, 1000, -144, 532, -260,
+ -4, 712, 268, 422, 860, 450, 276, 124, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -541, -411, -1225, -265, -787, -249, -739, -216, 354, -362, 1168, -460,
+ 974, -70, 552, -428, -4, 859, 712, 323, 908, 665, 128, 208,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -106, -52, -148, -66, -190, -79, -229, -31,
+ 64, -72, 160, -32, 148, -100, 242, -58, -4, 72, 132, 154,
+ 52, 125, 188, 23, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -694, -257, -763, -229,
+ -679, -319, -949, -117, 456, -464, 962, -50, 492, -408, 1030, -230,
+ -4, 295, 916, 625, 88, 537, 804, 109, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -244, -140, -412, -182, -538, -238, -706, -116, 156, -164, 428, -116,
+ 464, -248, 708, -228, -4, 244, 316, 418, 220, 454, 484, 108,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4,
+ };
+ float expected_21_valid[] = {
+ -13, -31, -19, -49, -191, -565, -194, -574, -13, -31, -4, 14,
+ -22, 44, -16, 382, -366, 738, -22, 32, 23, -4, 23, 32,
+ 545, 20, 204, 720, 5, 32, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -222, -658, -252, -748, -114, -334, -192, -568, -112, -328,
+ -4, 432, -440, 928, -64, 276, -164, 532, -220, 428, 650, -4,
+ 304, 868, 266, 116, 400, 316, 104, 428, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -72, -208, -288, -856, -290, -862, -202, -598,
+ -132, -388, -4, 132, -140, 700, -436, 1000, -144, 532, -260, 508,
+ 200, -4, 712, 268, 422, 860, 450, 276, 124, 508, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -183, -541, -411, -1225, -265, -787,
+ -249, -739, -216, -640, -4, 354, -362, 1168, -460, 974, -70, 552,
+ -428, 844, 533, -4, 859, 712, 323, 908, 665, 128, 208, 844,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -38, -106, -52, -148,
+ -66, -190, -79, -229, -31, -85, -4, 64, -72, 160, -32, 148,
+ -100, 242, -58, 104, 98, -4, 72, 132, 154, 52, 125, 188,
+ 23, 104, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -234, -694,
+ -257, -763, -229, -679, -319, -949, -117, -343, -4, 456, -464, 962,
+ -50, 492, -408, 1030, -230, 448, 686, -4, 295, 916, 625, 88,
+ 537, 804, 109, 448, -4, -4, -4, -4, -4, -4, -4, -4,
+ -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+ -84, -244, -140, -412, -182, -538, -238, -706, -116, -340, -4, 156,
+ -164, 428, -116, 464, -248, 708, -228, 444, 236, -4, 244, 316,
+ 418, 220, 454, 484, 108, 444,
+ };
+
+ cnn_config.layer_config[0].pad = PADDING_SAME_ZERO;
+
+ RunCNNTest(image_width, image_height, input, expected_21_same, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+
+ cnn_config.layer_config[0].pad = PADDING_VALID;
+
+ RunCNNTest(image_width, image_height, input, expected_21_valid, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestLargeKernelsAndStrides) {  // Two single-layer cases whose filter dimensions (23/20) exceed the image dimensions (10/11).
+ float input_10x11[] = {  // 110 values, used with image_height = 10 and image_width = 11 below
+ 4, 4, 2, 4, 2, -5, -2, 3, -1, 0, 0, 1, 2, 0, -5, -2, -5, 1, -3,
+ -1, 4, -3, 2, -2, 1, 0, 1, -3, -3, -4, -2, -2, 1, -4, -1, 4, 1, -4,
+ -4, -4, 3, 2, -5, 3, -5, 1, 2, -4, 1, -1, 3, 4, -2, 3, -3, 3, 0,
+ 2, -4, -5, -5, -2, -1, -2, 1, 1, 1, -2, 4, -5, 4, -1, -1, 2, 3, -4,
+ 2, 2, 3, 0, 0, 1, 0, 3, 2, 3, 1, -2, 3, -4, 3, 2, 4, -2, 0,
+ 4, -4, 1, -3, -3, -3, -5, 1, -3, -5, 0, 4, -1, -3, 2,
+ };
+
+ float weights_10x11[] = {  // 460 values, i.e. 23 * 20
+ -3, 4, -4, -3, -5, 1, -2, 3, 1, -4, -4, 0, -1, 0, 3, 1, -3, -2, 0,
+ -1, 1, 3, -4, -4, -3, -3, -2, 4, 3, -5, 4, 2, -3, 4, -2, -1, 2, -1,
+ -5, 0, -3, 0, 3, -5, -5, 3, -4, -1, -5, 3, 4, 0, 4, -5, 2, -1, 2,
+ -1, -1, -1, -5, 0, -4, 3, -1, 1, 1, -1, 3, 2, -5, -4, 0, -4, 4, -5,
+ -3, 4, -5, 2, -5, -4, -4, -1, 3, 3, 0, 2, -4, 1, -2, 1, 1, 0, 3,
+ -2, 0, 1, 2, 4, -3, -1, -5, -5, 2, -4, 1, 1, 2, -4, -2, -2, 2, 1,
+ 3, 4, -5, 1, -1, -3, -3, -1, -2, -5, 1, -1, 0, 1, 4, 4, 0, 0, 4,
+ -3, -1, -5, -3, 0, 1, 1, 1, -5, 3, 4, 3, -5, 3, -2, -2, 0, -4, 0,
+ 0, -2, 1, -4, -1, 0, -5, -2, -2, -5, -3, -3, 1, 1, -3, 2, 4, 2, 4,
+ -4, -3, 3, 1, 1, 3, -4, 4, -2, -3, -3, -3, -3, -4, -2, 3, -5, 2, 4,
+ -1, -4, -4, 4, -2, -1, 3, -3, -4, -4, -2, 4, 1, 0, 2, -1, 4, -3, 1,
+ 4, -3, 4, 4, 0, -4, 3, -2, -3, 2, 3, -1, -3, 2, 1, 4, -2, -3, 1,
+ 4, -2, 2, -2, -5, -2, 1, 4, -1, -4, 4, -5, 2, -5, -4, -1, -2, 3, 1,
+ 2, 1, -5, 1, -5, -4, -1, -2, 2, -2, -4, -3, -2, -2, 4, -1, 2, 2, -4,
+ 2, -2, 4, -4, -2, -2, 1, -1, 1, 1, 1, -4, -5, -2, 3, -4, -1, 3, -2,
+ 3, 2, -5, -4, 0, 3, -2, -4, -5, 3, -2, -4, 2, -2, 1, -4, 0, 2, -5,
+ 1, -4, -1, -1, 4, -5, -4, 0, -5, -4, -3, -5, -4, 0, 2, 0, -4, 2, -2,
+ 1, 1, -3, 2, 0, -4, 0, -4, 1, 0, -5, -1, -1, -1, -5, 4, 2, 2, -4,
+ 3, -2, -2, 2, -3, -2, -1, 2, -4, -5, 2, -2, -4, -5, -5, -1, 2, -1, 0,
+ -5, -2, -2, -5, 0, 1, -1, -5, 0, 3, 2, 3, 0, -3, -2, 0, -5, -1, -2,
+ 2, -4, -1, 2, 2, -5, 2, -4, 0, 3, -3, 1, 0, 0, 1, -5, -3, 1, -1,
+ 0, -4, -3, 2, -4, -4, 4, -1, 0, 1, 2, -4, -5, 4, -2, 1, -4, -4, -3,
+ -1, -1, 1, -1, -4, -1, -4, -3, 2, -1, -2, -4, 1, 1, 0, -2, 0, -4, 3,
+ -3, 0, -4, -1, -4, 2, -1, -2, -5, -1, -2, -3, 3, -1, 0, -3, 0, 1, -5,
+ 1, -5, 0, 1,
+ };
+
+ float bias_10x11[] = { 3 };
+
+ float expected_10x11[] = {  // the first case expects exactly one output value
+ 118,
+ };
+
+ CNN_CONFIG cnn_config = { 1,  // leading 1 matches the single layer entry below
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ 23,  // presumably filter_width (reassigned to 20 for the second case) - confirm field order
+ 20,  // presumably filter_height (reassigned to 23 for the second case) - confirm field order
+ 1,
+ 15,  // presumably skip_width (reassigned to 1 for the second case) - confirm field order
+ 20,  // presumably skip_height (reassigned to 1 for the second case) - confirm field order
+ 0,
+ weights_10x11,
+ bias_10x11,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ } } };
+
+ int image_height = 10;
+ int image_width = 11;
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input_10x11, expected_10x11,
+ &cnn_config, image_width, &thread_data, MSE_INT_TOL);  // RunCNNTest is defined outside this chunk; presumably checks CNN output against the expected array - confirm
+
+ float input_11x10[] = {  // 110 values, used with image_height = 11 and image_width = 10 below
+ -2, -2, 3, -5, -1, -3, 1, 3, 2, 1, 1, -5, 4, 1, 3, -5, 3, -3, -5,
+ 0, -1, -3, -3, 1, 1, -5, -1, -5, -5, -3, 0, 1, -3, -1, -3, -3, 0, 3,
+ 4, -4, -1, 3, -3, -1, -3, 1, -3, -2, -1, -4, -3, 2, -4, 1, -4, -1, -3,
+ -5, -1, 2, 3, 0, 2, 2, -5, 4, 1, 2, -1, -4, 4, -4, -4, 0, -1, 1,
+ -1, 1, -3, -3, -2, 1, 2, 4, 4, 4, -3, -3, 0, 1, 0, 1, 4, 1, 3,
+ 4, -3, -2, -4, 4, 2, 0, 3, 4, -1, 2, -2, 1, -3, -2,
+ };
+
+ float weights_11x10[] = {  // 460 values, i.e. 20 * 23
+ 4, -1, 1, -1, 2, 4, 3, 3, -4, 3, -5, 1, -1, -1, -2, -2, 0, 2, -3,
+ -2, 3, -5, -1, 0, -1, -2, -2, -1, 2, 4, 3, 1, 0, 0, -3, 3, -4, -1,
+ -5, 4, -2, -2, 1, 2, -1, -3, 1, 2, -5, 1, -3, 3, 3, 0, -4, -4, -5,
+ -3, -4, -4, 4, -2, 4, 4, -2, 2, -5, -1, -2, -5, -1, 4, -3, 3, -2, 0,
+ -4, -3, 0, -1, -2, 4, 2, 0, -2, -5, -4, 1, 4, -4, -2, 2, -2, 1, 1,
+ -4, 1, -4, -4, -2, 4, 2, -1, -5, -5, 1, -3, -3, 3, -3, -5, -3, 4, -1,
+ -1, -3, 0, -4, 3, -1, 0, -2, 0, -5, -2, -5, 2, 0, -5, 2, 3, -2, 2,
+ 4, -1, 1, -3, 2, 3, 2, 0, -5, -4, -5, 2, 1, 1, -1, -2, 3, 4, 2,
+ -2, 4, -2, 3, 1, -4, -3, -1, 4, 4, -3, -5, -2, 2, 0, 3, -2, 3, -1,
+ -4, 0, -2, 0, 3, 4, -2, -3, -2, 0, 3, 4, 2, -4, 0, 1, 2, 2, -1,
+ -1, 4, 1, 4, -2, -1, -1, -5, 1, -3, 3, 3, -1, -4, 3, -5, 0, 0, -1,
+ -4, -1, -2, 4, -2, 3, 3, -3, 1, -1, 2, -1, 4, 4, -2, -2, 4, -2, 0,
+ 3, -3, -5, -1, -2, 4, -4, 2, -4, 0, -2, 3, -3, 2, 2, -2, -5, -1, 4,
+ 3, -2, -1, 3, 3, -1, 3, 0, -3, 0, 4, 2, 0, -1, 4, 1, 1, 2, 1,
+ 3, 1, 1, 1, -3, -5, -4, 4, -4, 2, 0, 0, -4, 1, 4, -5, 4, 4, 0,
+ 1, 0, -2, -4, -4, -3, 0, 1, -5, 4, 0, -3, -2, -4, 2, 4, 1, -5, 1,
+ -4, 1, 0, -3, -3, 0, 2, -5, 4, 3, -2, -5, 3, 1, -1, 0, 3, -2, -2,
+ 3, -2, -5, 4, 1, -2, 2, -1, 0, 4, 0, -5, 3, -2, 1, 2, 1, -5, -3,
+ -2, -5, 4, -4, 0, 3, 2, -1, -4, -1, 2, 1, -2, 3, -1, -4, 2, 0, -3,
+ 1, -1, 2, -5, -4, -1, -5, 1, 4, 3, 4, 2, -3, 1, -5, -1, 3, 0, -1,
+ -4, 3, 4, -5, 4, 4, -3, 2, -3, -1, -3, -5, -3, 2, -3, -2, 1, 1, 0,
+ -5, 3, 2, 1, -5, 1, 1, 1, 3, 4, -4, -1, -2, 0, -5, -3, -5, -2, -4,
+ 3, 3, 3, 4, 0, -4, -1, -5, 0, -3, 1, 4, 4, -4, 4, -5, -5, -1, -2,
+ -5, 3, -4, 4, 3, 0, -3, 2, -2, 0, 0, 4, 4, 0, -2, 1, -1, -3, 2,
+ -1, 1, -3, -5,
+ };
+
+ float bias_11x10[] = {
+ -5,
+ };
+
+ float expected_11x10[] = {  // 110 values, the same count as input_11x10
+ 36, -84, 95, 45, 18, 46, 77, -54, -99, -149, 66, 49, 161, 11,
+ 39, 61, -66, 61, 4, -3, 34, -44, -23, 31, 64, 29, 47, 72,
+ -27, -27, 121, -3, 100, 1, 30, -78, -12, -89, -59, 8, -16, 112,
+ 91, -102, -26, -4, 30, 54, 4, -84, -24, -58, 27, -53, -33, 5,
+ 53, -26, 63, 50, -103, -130, -23, 6, -104, -207, 73, 23, 77, 132,
+ 38, 32, -130, -44, -60, 7, 27, 176, 45, -32, -2, 99, -97, 63,
+ 69, 126, 47, 63, 136, -57, 5, 16, -40, -157, 8, 38, -44, -10,
+ 91, 7, 122, 140, 30, -105, 4, -1, 113, 64, 180, 141,
+ };
+
+ cnn_config.layer_config[0].weights = weights_11x10;  // second case: reuse the config with new weights, bias, filter size and skips
+ cnn_config.layer_config[0].bias = bias_11x10;
+ cnn_config.layer_config[0].filter_width = 20;
+ cnn_config.layer_config[0].filter_height = 23;
+ cnn_config.layer_config[0].skip_width = 1;
+ cnn_config.layer_config[0].skip_height = 1;
+ image_height = 11;
+ image_width = 10;
+
+ RunCNNTest(image_width, image_height, input_11x10, expected_11x10,
+ &cnn_config, image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestSoftsignSingleLayer) {  // One SOFTSIGN layer on an 8x8 input, run under three padding settings.
+ int image_width = 8;
+ int image_height = 8;
+ int filter_height = 5;
+ int filter_width = 4;
+ float input[] = {  // 64 values (image_width * image_height)
+ -0.5220f, 0.8410f, -0.8990f, -0.0090f, 0.6710f, -0.9470f, -0.8240f,
+ -0.0870f, 0.5380f, 0.4750f, 0.570f, -0.3760f, -0.6960f, -0.5940f,
+ -0.3830f, 0.080f, -0.0980f, -0.4940f, -0.4030f, 0.9460f, -0.6020f,
+ 0.4220f, 0.6190f, 0.6640f, -0.9210f, -0.1470f, -0.2480f, -0.1120f,
+ -0.580f, -0.0650f, 0.3330f, 0.9860f, -0.7430f, 0.7610f, 0.4840f,
+ 0.1030f, 0.9570f, 0.6120f, -0.5240f, -0.1220f, -0.5850f, -0.270f,
+ 0.7840f, -0.9790f, 0.7290f, -0.30f, -0.6460f, 0.0780f, 0.4750f,
+ -0.0510f, 0.4550f, 0.3850f, -0.7230f, 0.4460f, -0.6260f, -0.810f,
+ 0.8720f, -0.2120f, -0.580f, -0.9510f, -0.8430f, -0.1340f, -0.0850f,
+ 0.9190f,
+ };
+ float expected_same[] = {  // expected output for PADDING_SAME_ZERO (64 values)
+ 0.430f, 0.660f, 0.5510f, -0.610f, 0.450f, -0.1610f, 0.0520f, 0.3240f,
+ 0.6820f, 0.3820f, 0.6360f, 0.7480f, 0.3080f, 0.090f, 0.3910f, 0.1730f,
+ 0.340f, 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.4670f, 0.6150f,
+ -0.3880f, 0.7590f, 0.4190f, 0.7350f, 0.5310f, -0.5160f, -0.1760f, 0.6790f,
+ -0.6780f, 0.5470f, 0.5750f, -0.6420f, 0.7210f, -0.4620f, 0.5430f, 0.770f,
+ -0.1990f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.6430f, 0.4510f,
+ -0.1260f, 0.1590f, -0.2110f, -0.0560f, 0.6570f, 0.680f, 0.5870f, 0.4720f,
+ 0.4040f, 0.3630f, 0.670f, 0.2360f, 0.410f, 0.6980f, -0.5350f, 0.3940f,
+ };
+ float expected_replicate[] = {  // expected output for PADDING_SAME_REPLICATE (64 values)
+ 0.540f, 0.7230f, -0.3530f, -0.2130f, 0.7440f, -0.4470f, -0.6260f,
+ -0.2050f, 0.7230f, 0.4630f, 0.5920f, 0.7440f, 0.6080f, 0.3130f,
+ -0.5670f, -0.4720f, 0.5480f, 0.6660f, -0.4990f, 0.4280f, 0.1540f,
+ 0.120f, 0.3390f, 0.6090f, 0.4160f, 0.7590f, 0.4190f, 0.7350f,
+ 0.5310f, -0.5160f, -0.490f, 0.4450f, -0.610f, 0.5470f, 0.5750f,
+ -0.6420f, 0.7210f, -0.4620f, 0.3150f, 0.7370f, -0.5820f, 0.3950f,
+ 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.7430f, -0.5340f, -0.6270f,
+ 0.4430f, 0.4730f, 0.4570f, 0.7450f, 0.630f, 0.2620f, 0.3140f,
+ -0.1840f, 0.1810f, 0.7210f, 0.2760f, 0.6430f, 0.6720f, -0.4390f,
+ 0.2040f,
+ };
+ float expected_valid[] = {  // expected output for PADDING_VALID: 20 values, consistent with (8-4+1) * (8-5+1) - TODO confirm
+ 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.7590f, 0.4190f,
+ 0.7350f, 0.5310f, -0.5160f, 0.5470f, 0.5750f, -0.6420f, 0.7210f,
+ -0.4620f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 0.2640f,
+ };
+ float weights[] = {  // 20 values (filter_width * filter_height)
+ 0.6210f, 0.3710f, -0.2770f, -0.7230f, -0.2450f, 0.6770f, 0.3080f,
+ -0.9880f, -0.080f, 0.7190f, -0.6760f, -0.0170f, -0.8970f, 0.8260f,
+ 0.7390f, -0.4550f, -0.4260f, -0.6330f, 0.0880f, -0.9390f,
+ };
+ float bias[] = {
+ 0.750f,
+ };
+
+ CNN_CONFIG cnn_config = { 1,  // leading 1 matches the single layer entry below
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,  // initial padding; changed via layer_config[0].pad before the later runs
+ SOFTSIGN,  // the activation this test exercises
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,
+ } } };
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected_same, &cnn_config,
+ image_width, &thread_data, MSE_FLOAT_TOL);  // RunCNNTest is defined outside this chunk; presumably checks CNN output against the expected array - confirm
+
+ cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE;
+
+ RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config,
+ image_width, &thread_data, MSE_FLOAT_TOL);
+
+ cnn_config.layer_config[0].pad = PADDING_VALID;
+
+ RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config,
+ image_width, &thread_data, MSE_FLOAT_TOL);
+}
+
+TEST_F(CNNTest, TestBranchTensorAdd) {  // 6-layer config on a 4x4 input in which one layer uses BRANCH_INPUT and a later layer uses BRANCH_ADD.
+ int filter_width = 2;
+ int filter_height = 3;
+
+ int image_width = 4;
+ int image_height = 4;
+
+ float input[] = {
+ -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0,
+ };
+
+ float weights[] = {  // 120 values in one flat array, handed to AssignLayerWeightsBiases below
+ -3, -1, 4, -1, -3, 3, 3, 0, 2, 0, 3, 2, 4, 4, 4, -5, 1, -4,
+ 2, -4, 1, -3, 0, 4, -5, 4, 0, -4, -3, -1, 0, 0, -2, 0, 0, 2,
+ -5, -1, 1, -3, 3, 4, 3, 0, 1, -1, 1, 1, 2, 4, -2, -5, 2, -2,
+ 3, -2, 4, -1, 0, 2, 3, 2, -2, -1, -3, 1, 3, 4, -1, -3, 0, -4,
+ 4, 2, -3, -3, -1, 0, 1, 0, 3, 3, -3, 0, 3, 2, -5, -3, 4, -5,
+ 3, -1, -1, -3, 0, 1, -1, -4, 2, 4, -1, 4, -1, 1, 3, 4, 4, 4,
+ 0, -1, -3, -3, -3, -3, 2, -3, -2, 2, 3, -3,
+ };
+
+ float bias[] = {  // 11 values in one flat array, handed to AssignLayerWeightsBiases below
+ 3, 4, -1, -1, 2, 1, -2, 1, 4, 1, 3,
+ };
+
+ float expected[] = {  // 16 values (image_width * image_height)
+ -11502, -4101, -3424, 668, -17950, -5470, -5504, 626,
+ 4835, 446, 1779, -3483, 3679, -4214, 4578, -105,
+ };
+
+ int channels = 2;
+
+ CNN_CONFIG cnn_config = { 6,  // leading 6 matches the six layer entries below
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,  // weights/bias left null here; see the AssignLayerWeightsBiases call below
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_INPUT,  // the only layer in this config whose copy type is not BRANCH_NO_COPY
+ BRANCH_NOC,
+ {
+ 0x02,  // branch config values; field meanings are not visible in this chunk
+ 0,
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,  // 1 here where the surrounding layers have 0; presumably the branch this layer runs on - confirm
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,  // 1 here as in the previous layer; presumably the same branch - confirm
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_ADD,  // the only layer in this config whose combine type is not BRANCH_NOC
+ {
+ 0x00,
+ 0,
+ 0x02,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,  // final layer ends with 0 where the others end with -1; presumably an output index - confirm
+ } } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);  // helper defined outside this chunk
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestBranchTensorConcatenation) {  // 6-layer config on a 4x4 input in which one layer uses BRANCH_INPUT and a later layer uses BRANCH_CAT.
+ int filter_width = 2;
+ int filter_height = 3;
+
+ int image_width = 4;
+ int image_height = 4;
+
+ float input[] = {
+ -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0,
+ };
+
+ float weights[] = {  // 132 values in one flat array, handed to AssignLayerWeightsBiases below
+ 3, 0, 2, 0, 2, 3, 1, -3, 1, -5, -3, 0, -4, 4, 0, -5, 0, -5, -1,
+ -2, -5, 0, -3, 2, -4, 2, 0, 2, -1, 0, -4, 3, 0, 0, -1, -5, 2, -1,
+ 4, -4, -2, -3, -3, 3, 4, -2, -1, -4, -1, 4, 4, -1, 4, 3, -4, 2, -2,
+ -4, -3, -2, 3, -3, -5, -1, 3, -2, 4, 1, -4, -3, -5, -5, -3, 4, -2, -2,
+ -1, -5, -5, 0, -1, -2, -3, 3, -4, -5, 2, -3, 1, 0, -5, 2, 2, -2, 0,
+ 2, 2, -2, 4, 2, 2, 0, 1, -5, -3, 0, 2, -2, 1, 2, -5, 2, 3, 3,
+ -1, 3, 0, -3, 3, -4, -4, 3, 3, -4, -2, 2, -2, 2, -2, -1, 3, 0,
+ };
+
+ float bias[] = {  // 11 values in one flat array, handed to AssignLayerWeightsBiases below
+ -3, -5, 4, -4, -3, -2, 0, 3, -4, 4, -3,
+ };
+
+ float expected[] = {  // 16 values (image_width * image_height)
+ -33533, -32087, -6741, -2124, 39979, 41453, 14034, 689,
+ -22611, -42203, -14882, -239, 15781, 15963, 9524, 837,
+ };
+
+ int channels = 2;
+
+ CNN_CONFIG cnn_config = { 6,  // leading 6 matches the six layer entries below
+ 0,
+ 0,
+ 0,
+ 0,
+ { {
+ 1,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,  // weights/bias left null here; see the AssignLayerWeightsBiases call below
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_INPUT,  // the only layer in this config whose copy type is not BRANCH_NO_COPY
+ BRANCH_NOC,
+ {
+ 0x02,  // branch config values; field meanings are not visible in this chunk
+ 0,
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,  // 1 here where the surrounding layers have 0; presumably the branch this layer runs on - confirm
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,  // 1 here as in the previous layer; presumably the same branch - confirm
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_CAT,  // the only layer in this config whose combine type is not BRANCH_NOC
+ {
+ 0x00,
+ 0,
+ 0x02,
+ },
+ {},
+ -1,
+ },
+ {
+ channels + channels,  // twice `channels` in the first position, following the BRANCH_CAT layer; presumably the input channel count - confirm
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,  // final layer ends with 0 where the others end with -1; presumably an output index - confirm
+ } } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);  // helper defined outside this chunk
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+// TODO(logangw): Add test to test all combinations of branch_copy_type.
+
+TEST_F(CNNTest, TestBranchCombinations) {  // 10-layer config on a 4x4 input mixing BRANCH_INPUT, BRANCH_OUTPUT and three BRANCH_ADD layers.
+ int filter_width = 2;
+ int filter_height = 3;
+
+ int image_width = 4;
+ int image_height = 4;
+
+ float input[] = {
+ 3, 2, -5, -4, 4, -2, -4, -3, 4, 2, -3, 2, -3, 1, -5, -1,
+ };
+
+ float weights[] = {  // 216 values in one flat array, handed to AssignLayerWeightsBiases below
+ 2, 3, 0, 4, 4, 3, 1, 0, 1, -5, 4, -3, 3, 0, 4, -1, -1, -5,
+ 2, 1, -3, -5, 3, -1, -3, -2, 0, -2, 3, 0, -2, -4, -2, -2, 2, -5,
+ 4, -5, 0, 1, -5, -4, -3, -4, 2, -2, 1, 0, 3, -2, -4, 3, 4, -4,
+ -1, -1, -3, -2, -2, -1, 2, 0, 2, -1, 2, -4, -4, -1, 2, 0, 3, -2,
+ -2, 3, -3, 4, -2, 4, 3, 4, 1, 0, -2, -3, -5, 1, -3, 2, 0, -2,
+ -2, -1, -1, -5, -2, -3, -1, 3, 3, 4, 4, 0, 2, 1, 3, -3, 2, -5,
+ -5, 1, -5, -1, 3, 3, 2, -4, -1, 3, -4, -2, -5, -2, 1, 3, 2, 2,
+ -5, -2, -3, -1, -2, -4, -1, -2, 2, 1, -4, -4, 2, 0, 2, 0, 2, -3,
+ -2, -4, 4, 0, 1, -3, -5, 4, -1, 2, 3, -5, -1, 0, 4, -1, -1, 3,
+ -1, -3, 3, 1, 4, 3, 4, 3, -4, -5, -1, 3, 3, -4, 3, 1, 3, -5,
+ 3, 4, -5, 4, 2, -1, -5, 2, 1, 0, 4, 0, -3, 2, 0, 2, -2, 1,
+ -1, -2, -1, -5, 4, 3, 3, -2, 2, 4, -5, -5, -3, -2, 4, 0, -4, 1,
+ };
+
+ float bias[] = {  // 19 values in one flat array, handed to AssignLayerWeightsBiases below
+ -1, 4, 0, 2, 2, -2, 0, -4, -5, -1, 1, -2, 3, 0, 4, -2, 1, 0, 0,
+ };
+
+ float expected[] = {  // 16 values (image_width * image_height)
+ 149496, 15553, -24193, -20956, 134094, 86432, -68283, -6366,
+ -53031, 133739, 67407, -13539, -53205, -58635, -20033, 1979,
+ };
+
+ int channels = 2;
+
+ CNN_CONFIG cnn_config = { 10,  // leading 10 matches the ten layer entries below
+ 0,
+ 0,
+ 0,
+ 0,
+ {
+ {
+ 1,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,  // weights/bias left null here; see the AssignLayerWeightsBiases call below
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_INPUT,
+ BRANCH_NOC,
+ {
+ 0x06,  // branch config values; field meanings are not visible in this chunk
+ 0,
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 2,  // nonzero value in this position; presumably the branch this layer runs on - confirm
+ BRANCH_OUTPUT,
+ BRANCH_NOC,
+ {
+ 0x08,
+ 0,
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 3,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 2,
+ BRANCH_NO_COPY,
+ BRANCH_ADD,  // first of three BRANCH_ADD layers (third branch-config value 0x08)
+ {
+ 0x00,
+ 0,
+ 0x08,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 2,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 1,
+ BRANCH_NO_COPY,
+ BRANCH_ADD,  // second BRANCH_ADD layer (third branch-config value 0x0C)
+ {
+ 0x00,
+ 0,
+ 0x0C,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ channels,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_ADD,  // third BRANCH_ADD layer (third branch-config value 0x02)
+ {
+ 0x00,
+ 0,
+ 0x02,
+ },
+ {},
+ -1,
+ },
+ {
+ channels,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,  // final layer ends with 0 where the others end with -1; presumably an output index - confirm
+ },
+ } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);  // helper defined outside this chunk
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestSplittingTensors) {  // 3-layer config on a 4x4 input: a BRANCH_OUTPUT layer, a BRANCH_CAT layer, then a plain final layer.
+ int filter_width = 2;
+ int filter_height = 3;
+
+ int image_width = 4;
+ int image_height = 4;
+
+ float input[] = {
+ -1, -1, 2, 1, 3, 2, 4, -3, -4, -2, 2, -3, 1, -3, 4, -2,
+ };
+
+ float weights[] = {  // 96 values in one flat array, handed to AssignLayerWeightsBiases below
+ -4, 1, 0, 2, 3, 4, 4, -4, -5, -3, 2, 2, -4, -3, 3, 2,
+ 4, -4, -3, -4, -4, 1, -3, -5, -3, 4, 2, -2, 2, -1, -4, -1,
+ -2, -3, 1, 1, 0, -5, -1, 3, 3, -5, -3, 0, -3, 1, -3, -1,
+ 1, -3, -2, -2, 4, -2, 0, 1, 2, 2, -4, 2, 4, 0, -5, -2,
+ 4, 4, -5, 1, 0, 2, -2, -5, -5, -3, -5, -5, 4, -3, 0, 0,
+ -4, -4, 0, -5, -4, 0, 0, -3, -5, -3, -1, 2, -1, 4, -1, 2,
+ };
+
+ float bias[] = {  // 7 values in one flat array, handed to AssignLayerWeightsBiases below
+ -4, -2, -3, -3, 3, 1, -2,
+ };
+
+ float expected[] = {  // 16 values (image_width * image_height)
+ 530, -762, 1469, 777, 849, -771, -1698, 600,
+ -658, -1821, 98, -668, -1798, 30, 887, -971,
+ };
+
+ CNN_CONFIG cnn_config = { 3,  // leading 3 matches the three layer entries below
+ 0,
+ 0,
+ 0,
+ 0,
+ {
+ {
+ 1,
+ filter_width,
+ filter_height,
+ 4,
+ 1,
+ 1,
+ 0,
+ nullptr,  // weights/bias left null here; see the AssignLayerWeightsBiases call below
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_OUTPUT,
+ BRANCH_NOC,
+ {
+ 0x02,
+ 2,  // nonzero middle branch-config value; presumably a number of channels to copy - confirm
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ 4,
+ filter_width,
+ filter_height,
+ 2,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_CAT,
+ {
+ 0x00,
+ 0,
+ 0x02,
+ },
+ {},
+ -1,
+ },
+ {
+ 4,
+ filter_width,
+ filter_height,
+ 1,
+ 1,
+ 1,
+ 0,
+ nullptr,
+ nullptr,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_NOC,
+ {},
+ {},
+ 0,  // final layer ends with 0 where the others end with -1; presumably an output index - confirm
+ },
+ } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);  // helper defined outside this chunk
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_INT_TOL);
+}
+
+TEST_F(CNNTest, TestOutputChannelsCount) {  // 3-layer branching config with all-zero input, weights and bias; the expected output is all zeros.
+ int filter_width = 1;
+ int filter_height = 1;
+
+ int image_width = 2;
+ int image_height = 2;
+
+ float input[] = { 0, 0, 0, 0 };
+
+ float weights[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ float bias[] = { 0, 0, 0, 0, 0, 0 };
+
+ float expected[] = {  // 28 zeros; presumably 7 output channels of 2x2 each - confirm
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+
+ CNN_CONFIG cnn_config = { 3,  // leading 3 matches the three layer entries below
+ 0,
+ 0,
+ 0,
+ 0,
+ {
+ {
+ 1,
+ filter_width,
+ filter_height,
+ 2,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_INPUT,
+ BRANCH_NOC,
+ {
+ 0x06,  // branch config values; field meanings are not visible in this chunk
+ 0,
+ 0x00,
+ },
+ {},
+ -1,
+ },
+ {
+ 1,
+ filter_width,
+ filter_height,
+ 2,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 2,  // nonzero value in this position; presumably the branch this layer runs on - confirm
+ BRANCH_NO_COPY,
+ BRANCH_CAT,
+ {
+ 0x00,
+ 0,
+ 0x03,
+ },
+ {},
+ -1,
+ },
+ {
+ 2,
+ filter_width,
+ filter_height,
+ 2,
+ 1,
+ 1,
+ 0,
+ weights,
+ bias,
+ PADDING_SAME_ZERO,
+ NONE,
+ 0,
+ 0,
+ BRANCH_NO_COPY,
+ BRANCH_CAT,
+ {
+ 0x00,
+ 0,
+ 0x04,
+ },
+ {},
+ 0,  // final layer ends with 0 where the others end with -1; presumably an output index - confirm
+ },
+ } };
+
+ // Weights and biases need to be specified separately because
+ // of the offset.
+ AssignLayerWeightsBiases(&cnn_config, weights, bias);  // helper defined outside this chunk
+
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_FLOAT_TOL);  // float tolerance is used here
+}
+
+// Runs a single-layer network with batch-norm parameters attached and checks
+// the result against precomputed values via RunCNNTest (defined elsewhere in
+// this file). The layer uses 7x7 filters with a step of 7 in both directions
+// and PADDING_VALID on a 28x28 single-channel image, producing 3 output
+// channels. The field-name comments on the CNN_CONFIG initializer below
+// follow the labelled initializer in TestMultiOutput.
+TEST_F(CNNTest, TestBatchNorm) {
+ int image_width = 28;
+ int image_height = 28;
+ int filter_height = 7;
+ int filter_width = 7;
+ // 28 * 28 = 784 input samples.
+ float input[] = {
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0117647f, 0.0705882f, 0.0705882f, 0.0705882f,
+ 0.494118f, 0.533333f, 0.686275f, 0.101961f, 0.65098f, 1.0f,
+ 0.968627f, 0.498039f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.117647f, 0.141176f, 0.368627f, 0.603922f,
+ 0.666667f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f,
+ 0.882353f, 0.67451f, 0.992157f, 0.94902f, 0.764706f, 0.25098f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.192157f,
+ 0.933333f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f,
+ 0.992157f, 0.992157f, 0.992157f, 0.984314f, 0.364706f, 0.321569f,
+ 0.321569f, 0.219608f, 0.152941f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0705882f, 0.858824f, 0.992157f,
+ 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.713725f,
+ 0.968627f, 0.945098f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.313725f, 0.611765f, 0.419608f, 0.992157f,
+ 0.992157f, 0.803922f, 0.0431373f, 0.0f, 0.168627f, 0.603922f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.054902f, 0.00392157f, 0.603922f, 0.992157f, 0.352941f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.545098f, 0.992157f, 0.745098f, 0.00784314f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0431373f,
+ 0.745098f, 0.992157f, 0.27451f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.137255f, 0.945098f,
+ 0.882353f, 0.627451f, 0.423529f, 0.00392157f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.317647f, 0.941176f, 0.992157f,
+ 0.992157f, 0.466667f, 0.0980392f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.176471f, 0.729412f, 0.992157f, 0.992157f,
+ 0.588235f, 0.105882f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0627451f, 0.364706f, 0.988235f, 0.992157f, 0.733333f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.976471f, 0.992157f, 0.976471f, 0.25098f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.180392f, 0.509804f, 0.717647f, 0.992157f,
+ 0.992157f, 0.811765f, 0.00784314f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.152941f, 0.580392f,
+ 0.898039f, 0.992157f, 0.992157f, 0.992157f, 0.980392f, 0.713725f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0941176f, 0.447059f, 0.866667f, 0.992157f, 0.992157f, 0.992157f,
+ 0.992157f, 0.788235f, 0.305882f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0901961f, 0.258824f, 0.835294f, 0.992157f,
+ 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.317647f, 0.00784314f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0705882f, 0.670588f,
+ 0.858824f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.764706f,
+ 0.313725f, 0.0352941f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.215686f, 0.67451f, 0.886275f, 0.992157f, 0.992157f, 0.992157f,
+ 0.992157f, 0.956863f, 0.521569f, 0.0431373f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.533333f, 0.992157f,
+ 0.992157f, 0.992157f, 0.831373f, 0.529412f, 0.517647f, 0.0627451f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f
+ };
+ // 48 reference values: 3 output channels of 4x4 samples each
+ // (28 / 7 = 4 filter positions per dimension).
+ float expected[] = {
+ -0.836424f, -0.857365f, -1.62739f, -1.62739f, -0.836424f, 5.40742f,
+ 0.920853f, -0.692567f, -0.836424f, -0.534405f, -1.62739f, -0.836424f,
+ 1.32602f, 1.36312f, 0.112766f, -0.836424f, -0.192962f, 1.56975f,
+ 2.45777f, 0.944414f, -0.192962f, -1.5519f, -1.5519f, -0.554006f,
+ -0.192962f, 1.4231f, -1.5519f, -0.192962f, 1.3661f, -1.5519f,
+ -1.5519f, -0.192962f, -0.843708f, -0.359025f, -0.843708f, -0.843708f,
+ -0.843708f, 4.53065f, 0.0429584f, -0.796804f, -0.843708f, 0.3473f,
+ -0.843708f, -0.843708f, -0.114439f, 3.14817f, 0.0811934f, -0.843708f
+ };
+ // 7 * 7 * 1 * 3 = 147 filter weights.
+ float kernel[] = {
+ 0.119643f, -0.237864f, 0.0462892f, 0.0502297f, -0.0134528f,
+ 0.146347f, 0.153133f, 0.0513307f, 0.0752369f, 0.0135557f,
+ -0.111434f, 0.0941854f, 0.0788362f, 0.0299412f, 0.111762f,
+ 0.144066f, 0.00431504f, -0.0177954f, 0.0738092f, -0.0344215f,
+ 0.0832582f, 0.053989f, -0.112691f, 0.0962145f, 0.0186525f,
+ -0.00660205f, -0.111962f, -0.126801f, -0.231625f, 0.17309f,
+ 0.0748875f, -0.179569f, -0.00513812f, -0.156579f, -0.147322f,
+ 0.184168f, 0.189308f, -0.200359f, -0.0156733f, 0.140649f,
+ 0.0858496f, -0.0263217f, -0.0740749f, -0.112563f, 0.107528f,
+ 0.0609729f, -0.221625f, 0.0769944f, -0.00900815f, -0.00136441f,
+ -0.0236521f, -0.0418025f, -0.00286299f, 0.12241f, 0.0964093f,
+ -0.0150897f, 0.0532171f, 0.0625916f, 0.116939f, 0.118024f,
+ 0.161918f, -0.00909767f, 0.100897f, -0.054563f, -0.175179f,
+ -0.0687892f, 0.00734235f, 0.109833f, -0.113776f, 0.0595405f,
+ -0.170255f, 0.0124815f, -0.0363301f, -0.0127038f, 0.0445554f,
+ -0.0729894f, 0.107428f, -0.0341417f, 0.132619f, 0.00984557f,
+ -0.00443654f, 0.202929f, 0.0945134f, 0.0148725f, 0.00998574f,
+ -0.0226449f, 0.0478197f, -0.0793442f, 0.0707599f, -0.084225f,
+ 0.0865795f, 0.071104f, -0.047894f, 0.0838322f, 0.0635493f,
+ -0.00370265f, -0.157247f, -0.0289622f, -0.0590963f, 0.13207f,
+ 0.00468011f, -0.0345372f, 0.217939f, 0.18861f, -0.0290393f,
+ -0.0440664f, 0.0126197f, -0.129132f, -0.124943f, 0.0968156f,
+ -0.0853643f, -0.182305f, 0.00461618f, -0.147095f, -0.230282f,
+ 0.00856019f, 0.0278893f, -0.0300229f, 0.0417871f, 0.0804717f,
+ -0.0768571f, -0.0397085f, -0.0601096f, 0.100901f, -0.0184926f,
+ 0.0350673f, 0.0971094f, -0.0171837f, -0.289644f, -0.0899041f,
+ 0.08998f, -0.160319f, -0.0195103f, 0.0392167f, -0.137864f,
+ -0.0136294f, 0.0330886f, -0.0409244f, -0.092533f, -0.0427934f,
+ -0.191144f, -0.0969461f, 0.112035f, 0.138611f, 0.128717f,
+ 0.191184f, 0.197462f
+ };
+ // One bias value per output channel.
+ float bias[] = { 0.186703f, 0.204358f, -0.0230452f };
+
+ // Batch-norm parameters, one entry per output channel.
+ float bn_gamma[] = { 1.32173f, 1.26171f, 1.21966f };
+ float bn_beta[] = { -0.232595f, -0.222652f, -0.232209f };
+ float bn_mean[] = { 0.329233f, 0.199894f, 0.12389f };
+ float bn_std[] = { 0.311986f, 0.189737f, 0.247104f };
+
+ // NOTE(review): positional initializer; presumably the struct fields are
+ // declared in the order gamma, beta, mean, std -- confirm against the
+ // CNN_BATCHNORM_PARAMS declaration.
+ CNN_BATCHNORM_PARAMS bn_params = {
+ bn_gamma,
+ bn_beta,
+ bn_mean,
+ bn_std,
+ };
+
+ CNN_CONFIG cnn_config = {
+ 1, // num_layers
+ 0, // is_residue
+ 0, // ext_width
+ 0, // ext_height
+ 0, // strict_bounds
+ {
+ // layer_config
+ {
+ 1, // in_channels
+ filter_width, // filter_width
+ filter_height, // filter_height
+ 3, // out_channels
+ 7, // skip_width
+ 7, // skip_height
+ 0, // max_pool
+ kernel, // weights
+ bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ {}, // branch_config
+ bn_params, // bn_params
+ 0, // output_num
+ },
+ },
+ };
+
+ // A worker count of 1 with no worker array.
+ CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+ RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+ image_width, &thread_data, MSE_FLOAT_TOL);
+}
+
+// Runs the same single-layer network twice -- first with a worker count of 1
+// and no worker array, then with a pool of initialized workers -- and checks
+// both runs against the same expected output through RunCNNTest.
+TEST_F(CNNTest, TestMultithreading) {
+  // Size of the worker pool used for the second run.
+  const int kNumWorkers = 4;
+
+  int image_height = 2;
+  int image_width = 2;
+  int filter_height = 3;
+  int filter_width = 3;
+
+  // 2x2 single-channel image.
+  float input[] = { -2, 4, 1, 0 };
+
+  // 3 * 3 * 1 * 4 = 36 filter weights.
+  float weights[] = {
+    -4, 2,  -2, 0,  -4, 4, -3, -3, -3, -1, 1,  0,  -5, -3, 0,  -5, 0, 0,
+    -1, 0,  2,  -5, 0,  1, 4,  2,  1,  0,  -2, -1, -5, -3, 2,  -2, 1, -5,
+  };
+
+  // One bias value per output channel.
+  float bias[] = { -4, -3, -2, 3 };
+
+  // 4 output channels of 2x2 samples each.
+  float expected[] = {
+    2, 10, -8, -17, -24, 5, -15, 6, -5, -5, 7, -10, 4, 13, 9, -14,
+  };
+
+  CNN_CONFIG cnn_config = {
+    1,  // num_layers
+    0,  // is_residue
+    0,  // ext_width
+    0,  // ext_height
+    0,  // strict_bounds
+    {
+        // layer_config
+        {
+            1,                  // in_channels
+            filter_width,       // filter_width
+            filter_height,      // filter_height
+            4,                  // out_channels
+            1,                  // skip_width
+            1,                  // skip_height
+            0,                  // max_pool
+            weights,            // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            NONE,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_NO_COPY,     // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            {},                 // branch_config
+            {},                 // bn_params
+            0,                  // output_num
+        },
+    },
+  };
+
+  // First pass: worker count of 1, no worker array.
+  CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  // Second pass: identical network, now handed a pool of workers.
+  const AVxWorkerInterface *const worker_iface = aom_get_worker_interface();
+  AVxWorker worker_pool[kNumWorkers];
+
+  for (AVxWorker &worker : worker_pool) {
+    worker_iface->init(&worker);
+  }
+
+  thread_data = { kNumWorkers, worker_pool };
+
+  RunCNNTest(image_width, image_height, input, expected, &cnn_config,
+             image_width, &thread_data, MSE_FLOAT_TOL);
+
+  for (AVxWorker &worker : worker_pool) {
+    worker_iface->end(&worker);
+  }
+}
+
+// Runs a four-layer network in which every layer writes to its own output
+// slot (output_num 0..3) and checks each output against precomputed values
+// through RunMultiOutCNNTest (defined elsewhere in this file).
+//
+// Layers 0-2 are chained on branch 0. Layer 0 is marked BRANCH_OUTPUT with
+// branch_config { 2, 0, 0 }, and layer 3 runs on branch 1 and is combined
+// using BRANCH_CAT, so the last output carries twice as many channels as the
+// others.
+TEST_F(CNNTest, TestMultiOutput) {
+  const int image_dim = 8;
+  const int image_ch = 3;
+  const int filter_dim = 2;
+  const int stride = 2;
+  const int num_filters = 2;
+
+  // image_ch planes of image_dim x image_dim samples, stored back to back.
+  const float input_[] = {
+    1.7537929121f,     0.134331551012f,   0.123580039877f,   0.957731845246f,
+    0.391006834217f,   1.00699352042f,    -0.778177955829f,  -0.814166433059f,
+    -0.656374394915f,  0.321967305228f,   -2.19455719176f,   0.708035038966f,
+    0.409148822266f,   -0.318254408902f,  0.152450211189f,   -0.250210793369f,
+    0.826811563186f,   1.6804156584f,     0.273626975978f,   0.437936241887f,
+    -0.329935520167f,  -0.288761611645f,  0.156937008304f,   0.271054157295f,
+    -0.0224828854332f, 1.70110336895f,    -0.989066699309f,  1.30863131729f,
+    -0.165813705702f,  0.00380178619265f, -0.0837342367587f, 0.760954783156f,
+    -0.413610373524f,  1.17968204175f,    0.720295719536f,   0.308718974472f,
+    -1.10091337671f,   0.693160033687f,   -0.0202862320697f, 1.0221927503f,
+    -1.24521801881f,   -0.478501952308f,  -1.71648619442f,   -0.182571723636f,
+    0.339292649504f,   2.0806519131f,     0.967974033444f,   0.175248672328f,
+    0.0658124561472f,  0.795504169496f,   0.750592557361f,   -1.46631013249f,
+    -1.79052846838f,   -1.03672179515f,   -0.841985521653f,  1.20995011489f,
+    0.140859718215f,   -0.651552622661f,  0.451065110806f,   1.1189443693f,
+    0.100213260593f,   -0.834076868118f,  -1.28734321611f,   1.22064420095f,
+    -0.364143084361f,  0.750961509335f,   -0.888689074553f,  -0.8253547106f,
+    -1.21800999027f,   -0.966670603566f,  1.37384014741f,    0.47281264834f,
+    -0.420416235531f,  0.520163906493f,   0.501296589423f,   1.53418976951f,
+    0.715234751485f,   0.644551588907f,   0.0763504863375f,  -0.0018541943723f,
+    0.322853189656f,   -0.795099723224f,  -0.125177096675f,  1.4476577471f,
+    -0.585888410088f,  -1.44391754955f,   -0.610543221933f,  -0.221859179799f,
+    0.252060200774f,   -0.86287169623f,   -0.0350246229157f, 1.0932311997f,
+    0.899464648842f,   -0.468806951704f,  -0.300861137168f,  1.15776414206f,
+    1.03268544738f,    -0.171579585622f,  -0.179136557119f,  -0.354091003368f,
+    -0.612298249394f,  -1.20237379258f,   1.54604109659f,    0.130664370287f,
+    0.885225111868f,   1.0362799581f,     0.980561720868f,   -0.619379186999f,
+    -1.33818929924f,   -0.237233737961f,  -1.89335425073f,   0.567821011321f,
+    0.862420368465f,   -1.37380916821f,   0.352190056666f,   0.611261516274f,
+    0.393237747152f,   0.894686247967f,   0.190405182149f,   0.264872662911f,
+    -0.0657009133797f, 0.0580512653493f,  -0.401825294366f,  0.4106081318f,
+    0.49484512188f,    -0.0751103149442f, -1.43243736382f,   1.79855656009f,
+    -1.1075351975f,    0.000354882733011f, -0.950716438608f, 1.27129831688f,
+    1.00495189838f,    0.110358656713f,   1.08315032822f,    -0.972676676218f,
+    -0.0757668962831f, 1.88932045165f,    -0.0672638136275f, 0.425913010161f,
+    -0.781540372017f,  0.976000248609f,   0.687218504122f,   1.31374513445f,
+    -0.932658930672f,  -1.25339468479f,   0.422071294078f,   -0.24189927912f,
+    0.216906604642f,   -1.88720997548f,   1.99252872889f,    0.353943735777f,
+    0.737434784132f,   -1.17848645017f,   1.70424254896f,    0.775297112968f,
+    -0.516392797501f,  0.398130609129f,   0.737248101457f,   0.166282500886f,
+    1.24699015468f,    0.47116183125f,    1.19091180182f,    -0.372695424578f,
+    0.219773209389f,   -0.829467838962f,  -0.52533122724f,   1.98707754595f,
+    0.553692606972f,   -0.933228902369f,  1.55427751643f,    -1.08813399144f,
+    -0.325686682094f,  0.205091443796f,   -1.70381666435f,   0.466465327942f,
+    1.73126863447f,    -0.939133672634f,  1.48318077459f,    -0.599414038168f,
+    -1.1583078687f,    0.518116190201f,   0.133571482458f,   0.84958342672f,
+    1.02205000597f,    -0.0772082009087f, -1.69567503859f,   1.4697939436f,
+    1.67813743122f,    -0.627911582938f,  0.131380509137f,   -1.35717850726f,
+  };
+  // One pointer per input channel into the packed buffer above.
+  const float *input[3] = { input_, &input_[image_dim * image_dim],
+                            &input_[2 * image_dim * image_dim] };
+
+  // Shared by all four layers.
+  const float bias[] = { 0.0f, 0.0f };
+
+  const float weights_1[] = {
+    -0.489547413618f, 0.141916424749f,  -0.279286485585f,  -0.115322211094f,
+    0.299572786936f,  0.205289980785f,  -0.536254480088f,  -0.253626313744f,
+    -0.422883815849f, -0.169702966298f, -0.540104704793f,  0.495319646763f,
+    0.298799079422f,  -0.10054550901f,  -0.306085047056f,  0.171061886165f,
+    -0.108058703878f, -0.410734629888f, -0.0640674673049f, -0.386524840979f,
+    -0.157203423678f, -0.362138920529f, -0.216206085209f,  0.147502517971f,
+  };
+
+  const float weights_2[] = {
+    0.207580604357f,  0.480821146263f,  -0.29111909562f,   0.47422567493f,
+    0.206892553253f,  -0.235067084092f, 0.354516800602f,   -0.212399370252f,
+    -0.419071343731f, -0.050350731631f, -0.0516457320279f, -0.0359310500731f,
+    0.567044864811f,  -0.060341127522f, 0.0501464839637f,  -0.437785677916f,
+  };
+
+  const float weights_3[] = {
+    -0.0690452401448f, -0.356657338763f,   -0.219464031809f, 0.551288365843f,
+    0.181372090853f,   -0.00245268542109f, 0.409000696276f,  -0.593209108763f,
+    0.587352566749f,   -0.243720660227f,   0.266232713887f,  -0.00439285245097f,
+    0.252883228305f,   0.152646192631f,    0.0918944932026f, 0.398853715057f,
+  };
+
+  const float weights_4[] = {
+    0.207560791573f,   0.194201350401f,   0.227802322443f,  0.206533663345f,
+    0.0557331066805f,  0.0224159800424f,  -0.143939197467f, -0.27703361602f,
+    0.130643888389f,   -0.269456557461f,  0.186242862864f,  -0.162879944774f,
+    -0.145503996718f,  -0.0768822987581f, -0.203127976359f, -0.238119922873f,
+    -0.258806479994f,  0.0357957680385f,  -0.1027606976f,   -0.287920082345f,
+    0.189047820993f,   0.250711538481f,   -0.272815714175f, -0.0431449742024f,
+    0.207261230996f,   -0.0396472677451f, 0.131236557412f,  0.174291832499f,
+    -0.251515885765f,  -0.107164007499f,  0.185824534748f,  -0.00561585838161f,
+    0.273393799578f,   -0.139563699075f,  -0.263922456031f, -0.118859844081f,
+    0.109230982597f,   -0.170170294794f,  0.0123025648515f, -0.0839368964355f,
+    -0.0774058234297f, 0.255847138286f,   -0.208430879637f, 0.279170114319f,
+    -0.272890330712f,  -0.217725903006f,  -0.295923275459f, -0.17008723953f,
+    -0.284281803405f,  0.281406323629f,   0.266910044663f,  -0.209963914338f,
+    0.271980962964f,   0.142013581699f,   -0.143896509026f, -0.290509242975f,
+    -0.305768180935f,  0.196902832117f,   -0.090424189662f, -0.147460802346f,
+    0.217722016651f,   0.12353848977f,    -0.169177363577f, -0.0454230918512f,
+  };
+
+  // Reference values for output 0: 2 channels of 4x4 samples.
+  const float expected_0[] = {
+    -2.04858441055f,  -2.12883075791f,    -0.045177363807f, 0.763949675768f,
+    -0.544361512821f, -1.58123168032f,    1.89319847039f,   0.16859080901f,
+    -1.16023321135f,  -0.396988107751f,   1.76637090744f,   -1.40434786514f,
+    0.908227575669f,  0.817064817605f,    0.215631134908f,  -0.848605613428f,
+    -0.106756747018f, 0.0193027166685f,   0.801345615113f,  -0.395407237598f,
+    -1.79983795658f,  -1.73054496242f,    0.0584392594454f, -0.388786095569f,
+    -0.237269619354f, 0.000843578271263f, -1.24043512104f,  0.487839445893f,
+    -0.394259726605f, 0.559632843424f,    -0.527224052291f, -1.53792340282f,
+  };
+
+  // Reference values for output 1: 2 channels of 2x2 samples.
+  const float expected_1[] = {
+    0.0f, 0.0f, 0.0f, 0.0f, 0.4057888292f, 0.325309571755f,
+    0.0f, 1.22013465602f,
+  };
+
+  // Reference values for output 2: 2 channels of 1x1 samples.
+  const float expected_2[] = {
+    0.156119444687f,
+    0.517385299817f,
+  };
+
+  // Reference values for output 3: 4 channels of 1x1 samples.
+  const float expected_3[] = {
+    0.224177852984f,
+    0.503384419034f,
+    0.156119444687f,
+    0.517385299817f,
+  };
+
+  const float *expected[] = { expected_0, expected_1, expected_2, expected_3 };
+
+  CNN_CONFIG cnn_config = {
+    4,  // num_layers
+    0,  // is_residue
+    0,  // ext_width
+    0,  // ext_height
+    0,  // strict_bounds
+    {
+        // layer_config
+        {
+            image_ch,           // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_1,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            NONE,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_OUTPUT,      // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            { 2, 0, 0 },        // branch_config
+            {},                 // bn_params
+            0,                  // output_num
+        },
+        {
+            num_filters,        // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_2,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            RELU,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_NO_COPY,     // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            {},                 // branch_config
+            {},                 // bn_params
+            1,                  // output_num
+        },
+        {
+            num_filters,        // in_channels
+            filter_dim,         // filter_width
+            filter_dim,         // filter_height
+            num_filters,        // out_channels
+            stride,             // skip_width
+            stride,             // skip_height
+            0,                  // max_pool
+            weights_3,          // weights
+            bias,               // bias
+            PADDING_SAME_ZERO,  // pad
+            RELU,               // activation
+            0,                  // deconvolve
+            0,                  // branch
+            BRANCH_NO_COPY,     // branch_copy_type
+            BRANCH_NOC,         // branch_combine_type
+            {},                 // branch_config
+            {},                 // bn_params
+            2,                  // output_num
+        },
+        {
+            num_filters,     // in_channels
+            2 * filter_dim,  // filter_width
+            2 * filter_dim,  // filter_height
+            num_filters,     // out_channels
+            2 * stride,      // skip_width
+            2 * stride,      // skip_height
+            0,               // max_pool
+            weights_4,       // weights
+            bias,            // bias
+            PADDING_VALID,   // pad
+            RELU,            // activation
+            0,               // deconvolve
+            1,               // branch
+            BRANCH_NO_COPY,  // branch_copy_type
+            BRANCH_CAT,      // branch_combine_type
+            { 0, 0, 1 },     // branch_config
+            {},              // bn_params
+            3,               // output_num
+        },
+    },
+  };
+
+  CNN_THREAD_DATA thread_data = { 1, nullptr };
+
+  const int num_outputs = 4;
+  // Channel counts follow each layer's out_channels (num_filters); the last
+  // output is a BRANCH_CAT result and therefore carries twice as many. These
+  // were previously spelled with filter_dim, which only worked because
+  // filter_dim and num_filters happen to share the value 2.
+  const int output_chs[4] = { num_filters, num_filters, num_filters,
+                              2 * num_filters };
+  // Side length of each (square) output plane.
+  const int output_dims[4] = { 4, 2, 1, 1 };
+  const int output_sizes[4] = {
+    output_chs[0] * output_dims[0] * output_dims[0],
+    output_chs[1] * output_dims[1] * output_dims[1],
+    output_chs[2] * output_dims[2] * output_dims[2],
+    output_chs[3] * output_dims[3] * output_dims[3],
+  };
+  // One contiguous allocation holding every channel of every output.
+  float *const output_ = (float *)aom_malloc(
+      sizeof(*output_) *
+      (output_sizes[0] + output_sizes[1] + output_sizes[2] + output_sizes[3]));
+  ASSERT_NE(output_, nullptr);
+  // Per-channel pointers into output_, ordered by output index then channel.
+  float *output[CNN_MAX_CHANNELS] = { nullptr };
+  int ch_ite = 0;
+  float *output_ite = output_;
+  for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+    for (int channel = 0; channel < output_chs[output_idx]; ++channel) {
+      output[ch_ite++] = output_ite;
+      output_ite += output_dims[output_idx] * output_dims[output_idx];
+    }
+  }
+  CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_dims,
+                                  output };
+
+  RunMultiOutCNNTest(input, image_dim, image_dim, image_dim, &cnn_config,
+                     &thread_data, &output_struct, expected, MSE_FLOAT_TOL);
+
+  aom_free(output_);
+}
+
+namespace {
+
+// Signature shared by the reference and optimized convolve implementations
+// that CNNConvolveTest compares against each other.
+using CNNConvolveNoMaxpoolPaddingValidFunc = void (*)(
+    const float **input, int in_width, int in_height, int in_stride,
+    const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+    int start_idx, int cstep, int channel_step);
+
+// Test parameter: a (reference, tested) pair of convolve functions.
+using CNNConvolveTestFuncs =
+    libaom_test::FuncParam<CNNConvolveNoMaxpoolPaddingValidFunc>;
+
+// Compares a reference convolve implementation (params_.ref_func) against an
+// optimized one (params_.tst_func) on every layer of
+// av1_intra_mode_cnn_partition_cnn_config, using random input.
+class CNNConvolveTest : public ::testing::TestWithParam<CNNConvolveTestFuncs> {
+ protected:
+  void SetUp() override { params_ = GetParam(); }
+
+  // Walks the layers of the network, feeding each layer's output dimensions
+  // to the next one as input dimensions, and runs RunCNNConvolveTest on every
+  // layer. run_times == 1 checks correctness; larger values time both
+  // implementations instead.
+  //
+  // NOTE: a failing ASSERT_* returns from this function immediately, so
+  // buffers allocated earlier in the same iteration are not released on that
+  // path.
+  void RunCNNConvolveSetup(int run_times) {
+    // Capacity of the per-channel pointer tables below.
+    const int kMaxChannels = 20;
+    int in_width = 65;
+    int in_height = 65;
+
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+      const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+      // Guard the fixed-size pointer tables against a layer with more
+      // channels than they can hold.
+      ASSERT_LE(layer_config->in_channels, kMaxChannels);
+      ASSERT_LE(layer_config->out_channels, kMaxChannels);
+
+      int out_width = 0, out_height = 0;
+      int in_size = in_width * in_height;
+      // Get current layer output width and height.
+      av1_find_cnn_layer_output_size(in_height, in_width, layer_config,
+                                     &out_width, &out_height);
+
+      int out_size = out_width * out_height;
+      float *input[kMaxChannels], *output_ref[kMaxChannels],
+          *output_mod[kMaxChannels];
+
+      float *input_data = (float *)aom_malloc(
+          sizeof(*input_data) * in_size * layer_config->in_channels);
+      float *temp_ptr = input_data;
+      ASSERT_NE(temp_ptr, nullptr);
+      // Fill every input channel with pseudo-random values.
+      for (int i = 0; i < layer_config->in_channels; ++i) {
+        input[i] = temp_ptr;
+        for (int j = 0; j < in_size; j++) {
+          *(temp_ptr++) = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+        }
+      }
+
+      float *out_data_ref = (float *)aom_calloc(
+          sizeof(*out_data_ref), out_size * layer_config->out_channels);
+      ASSERT_NE(out_data_ref, nullptr);
+      float *out_data_mod = (float *)aom_calloc(
+          sizeof(*out_data_mod), out_size * layer_config->out_channels);
+      ASSERT_NE(out_data_mod, nullptr);
+      // Slice both output buffers into one plane per output channel.
+      float *temp_ptr1 = out_data_ref;
+      float *temp_ptr2 = out_data_mod;
+      for (int i = 0; i < layer_config->out_channels; ++i) {
+        output_ref[i] = temp_ptr1;
+        output_mod[i] = temp_ptr2;
+        temp_ptr1 += out_size;
+        temp_ptr2 += out_size;
+      }
+
+      RunCNNConvolveTest(input, in_width, in_height, out_size, layer_config, 0,
+                         1, run_times, layer, output_ref, output_mod,
+                         out_width);
+
+      // Set current layer output width and height as next layer input width
+      // and height.
+      in_width = out_width;
+      in_height = out_height;
+
+      aom_free(input_data);
+      aom_free(out_data_ref);
+      aom_free(out_data_mod);
+    }
+  }
+
+  // Runs both implementations run_times times on one layer. With
+  // run_times > 1 it prints the two elapsed times and their ratio; otherwise
+  // it compares the two outputs sample by sample.
+  void RunCNNConvolveTest(float **input, int in_width, int in_height,
+                          int out_size, const CNN_LAYER_CONFIG *layer_config,
+                          int start_idx, int step, int run_times, int layer,
+                          float **output_ref, float **output_mod,
+                          int out_stride) {
+    const int cstep = layer_config->in_channels * layer_config->out_channels;
+    const int channel_step = AOMMAX(step, 1);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.ref_func((const float **)input, in_width, in_height, in_width,
+                       layer_config, output_ref, out_stride, start_idx, cstep,
+                       channel_step);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      params_.tst_func((const float **)input, in_width, in_height, in_width,
+                       layer_config, output_mod, out_stride, start_idx, cstep,
+                       channel_step);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    if (run_times > 1) {
+      printf("layer : %d \n", layer);
+      // NOTE(review): the label says "ns" but the timer API is named
+      // aom_usec_timer -- confirm the unit before relying on these numbers.
+      printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+    } else {
+      for (int channel = 0; channel < layer_config->out_channels; ++channel) {
+        const float *buf_ref = output_ref[channel];
+        const float *buf_mod = output_mod[channel];
+
+        for (int i = 0; i < out_size; ++i) {
+          const float ref = buf_ref[i];
+          const float mod = buf_mod[i];
+          if (fabsf(ref) < CNN_CONVOLVE_PIXELWISE_FLOAT_TOL) {
+            // A relative error is meaningless this close to zero, so fall
+            // back to an absolute comparison. The earlier form compared the
+            // reference value with the tolerance a second time, which could
+            // never fail, and its signed test also routed every negative
+            // reference value here unchecked.
+            ASSERT_LE(fabsf(ref - mod), CNN_CONVOLVE_PIXELWISE_FLOAT_TOL)
+                << "Reference output was near-zero, test output was not ("
+                << mod << ")"
+                << " channel " << channel << " pixel " << i;
+          } else {
+            const float error = ref - mod;
+            const float relative_error = fabsf(error / ref);
+            ASSERT_LE(relative_error, CNN_CONVOLVE_PIXELWISE_FLOAT_TOL)
+                << " channel " << channel << " pixel " << i << ": " << ref
+                << "/" << mod << std::endl;
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  CNNConvolveTestFuncs params_;
+  libaom_test::ACMRandom rng_;
+};
+// The suite is only instantiated under the #if blocks below, so tell
+// GoogleTest that having no instantiation at all is acceptable.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(CNNConvolveTest);
+
+// Single run per layer: compares reference and tested outputs.
+TEST_P(CNNConvolveTest, CheckOutput) { RunCNNConvolveSetup(1); }
+
+// Timing run (disabled by default): 100000 iterations per layer.
+TEST_P(CNNConvolveTest, DISABLED_Speed) { RunCNNConvolveSetup(100000); }
+
+// C reference vs. AVX2 implementation.
+#if HAVE_AVX2 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+INSTANTIATE_TEST_SUITE_P(AVX2, CNNConvolveTest,
+ ::testing::Values(CNNConvolveTestFuncs(
+ &av1_cnn_convolve_no_maxpool_padding_valid_c,
+ &av1_cnn_convolve_no_maxpool_padding_valid_avx2)));
+#endif
+
+// C reference vs. NEON implementation.
+// NOTE(review): unlike the AVX2 block, this one is not gated on
+// CONFIG_EXCLUDE_SIMD_MISMATCH -- confirm that is intentional.
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CNNConvolveTest,
+ ::testing::Values(CNNConvolveTestFuncs(
+ &av1_cnn_convolve_no_maxpool_padding_valid_c,
+ &av1_cnn_convolve_no_maxpool_padding_valid_neon)));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h
new file mode 100644
index 0000000000..7ffc465a7b
--- /dev/null
+++ b/third_party/aom/test/codec_factory.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_CODEC_FACTORY_H_
+#define AOM_TEST_CODEC_FACTORY_H_
+
+#include <tuple>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libaom_test {
+
+// NOTE(review): presumably the tuple index of the CodecFactory pointer in the
+// CodecTestWith*Params parameter tuples below (it is their first element) --
+// confirm at the use sites.
+const int kCodecFactoryParam = 0;
+
+// Abstract factory through which tests obtain decoder and encoder test
+// drivers and a default encoder configuration. Every operation is pure
+// virtual; AV1CodecFactory in this header is the concrete implementation.
+class CodecFactory {
+ public:
+ CodecFactory() = default;
+
+ virtual ~CodecFactory() = default;
+
+ // Creates a decoder driver for the given decoder configuration.
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
+
+ // Same as above, additionally passing codec flags.
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+ const aom_codec_flags_t flags) const = 0;
+
+ // Creates an encoder driver from an encoder configuration, init flags and a
+ // two-pass statistics store.
+ virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+ const aom_codec_flags_t init_flags,
+ TwopassStatsStore *stats) const = 0;
+
+ // Fills *cfg with the default encoder configuration for the given usage and
+ // returns the resulting codec status.
+ virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+ unsigned int usage) const = 0;
+};
+
+/* Provide CodecTestWith<n>Params classes for a variable number of parameters
+ * to avoid having to include a pointer to the CodecFactory in every test
+ * definition.
+ */
+// Fixture taking a CodecFactory pointer plus one user parameter.
+template <typename T1>
+class CodecTestWithParam
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1>> {};
+
+// Fixture taking a CodecFactory pointer plus two user parameters.
+template <typename T1, typename T2>
+class CodecTestWith2Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2>> {};
+
+// Fixture taking a CodecFactory pointer plus three user parameters.
+template <typename T1, typename T2, typename T3>
+class CodecTestWith3Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3>> {};
+
+// Fixture taking a CodecFactory pointer plus four user parameters.
+template <typename T1, typename T2, typename T3, typename T4>
+class CodecTestWith4Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4>> {};
+
+// Fixture taking a CodecFactory pointer plus five user parameters.
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CodecTestWith5Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4, T5>> {};
+
+// Fixture taking a CodecFactory pointer plus six user parameters.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+class CodecTestWith6Params
+    : public ::testing::TestWithParam<std::tuple<
+          const libaom_test::CodecFactory *, T1, T2, T3, T4, T5, T6>> {};
+
+/*
+ * AV1 Codec Definitions
+ */
+// Decoder test driver bound to the AV1 decoder interface.
+class AV1Decoder : public Decoder {
+ public:
+  explicit AV1Decoder(aom_codec_dec_cfg_t dec_cfg) : Decoder(dec_cfg) {}
+
+  AV1Decoder(aom_codec_dec_cfg_t dec_cfg, const aom_codec_flags_t dec_flags)
+      : Decoder(dec_cfg, dec_flags) {}
+
+ protected:
+  // Returns the AV1 decoder interface, or nullptr when the decoder is not
+  // compiled in.
+  aom_codec_iface_t *CodecInterface() const override {
+#if !CONFIG_AV1_DECODER
+    return nullptr;
+#else
+    return aom_codec_av1_dx();
+#endif
+  }
+};
+
+// Encoder test driver bound to the AV1 encoder interface.
+class AV1Encoder : public Encoder {
+ public:
+  AV1Encoder(aom_codec_enc_cfg_t enc_cfg, const aom_codec_flags_t enc_flags,
+             TwopassStatsStore *twopass_stats)
+      : Encoder(enc_cfg, enc_flags, twopass_stats) {}
+
+ protected:
+  // Returns the AV1 encoder interface, or nullptr when the encoder is not
+  // compiled in.
+  aom_codec_iface_t *CodecInterface() const override {
+#if !CONFIG_AV1_ENCODER
+    return nullptr;
+#else
+    return aom_codec_av1_cx();
+#endif
+  }
+};
+
+// CodecFactory implementation for AV1. Each operation degrades to a null or
+// "incapable" result when the corresponding codec half is not compiled in.
+class AV1CodecFactory : public CodecFactory {
+ public:
+  // Deliberately user-provided rather than defaulted: this header declares a
+  // default-initialized const instance of the class (kAV1).
+  AV1CodecFactory() : CodecFactory() {}
+
+  // Creates a decoder with no flags set.
+  Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const override {
+    const aom_codec_flags_t no_flags = 0;
+    return CreateDecoder(cfg, no_flags);
+  }
+
+  // Creates an AV1 decoder driver, or returns nullptr without a decoder.
+  Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                         const aom_codec_flags_t flags) const override {
+#if !CONFIG_AV1_DECODER
+    (void)cfg;
+    (void)flags;
+    return nullptr;
+#else
+    return new AV1Decoder(cfg, flags);
+#endif
+  }
+
+  // Creates an AV1 encoder driver, or returns nullptr without an encoder.
+  Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                         const aom_codec_flags_t init_flags,
+                         TwopassStatsStore *stats) const override {
+#if !CONFIG_AV1_ENCODER
+    (void)cfg;
+    (void)init_flags;
+    (void)stats;
+    return nullptr;
+#else
+    return new AV1Encoder(cfg, init_flags, stats);
+#endif
+  }
+
+  // Fills *cfg with the AV1 encoder defaults for the given usage, or reports
+  // AOM_CODEC_INCAPABLE without an encoder.
+  aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                       unsigned int usage) const override {
+#if !CONFIG_AV1_ENCODER
+    (void)cfg;
+    (void)usage;
+    return AOM_CODEC_INCAPABLE;
+#else
+    return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
+#endif
+  }
+};
+
+// The AV1 factory instance handed to parameterized tests by the macro below.
+const libaom_test::AV1CodecFactory kAV1;
+
+// Instantiates `test` under the "AV1" prefix. The parameter space is the
+// combination of a pointer to kAV1 (as the first tuple element) with the
+// generators passed in the variadic arguments.
+#define AV1_INSTANTIATE_TEST_SUITE(test, ...) \
+ INSTANTIATE_TEST_SUITE_P( \
+ AV1, test, \
+ ::testing::Combine( \
+ ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
+ &libaom_test::kAV1)), \
+ __VA_ARGS__))
+
+} // namespace libaom_test
+#endif // AOM_TEST_CODEC_FACTORY_H_
diff --git a/third_party/aom/test/coding_path_sync.cc b/third_party/aom/test/coding_path_sync.cc
new file mode 100644
index 0000000000..f7b7eace90
--- /dev/null
+++ b/third_party/aom/test/coding_path_sync.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_decoder.h"
+
+// Element count of a statically sized array, converted to int.
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+using libaom_test::ACMRandom;
+namespace {
+
+// Produces AV1-compressed frames of synthetic content for the tests below.
+// The quantizer, picture size and chroma subsampling are all drawn from a
+// PRNG seeded with the constructor argument. Failures reported by the
+// encoder API are surfaced as non-fatal test failures instead of being
+// silently dropped.
+class CompressedSource {
+ public:
+  explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
+    aom_codec_iface_t *algo = aom_codec_av1_cx();
+
+    // Usage 1 is requested in realtime-only builds, usage 0 otherwise.
+#if CONFIG_REALTIME_ONLY
+    const unsigned int usage = 1;
+#else
+    const unsigned int usage = 0;
+#endif
+    aom_codec_enc_cfg_t cfg;
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(algo, &cfg, usage))
+        << "aom_codec_enc_config_default() failed";
+
+    // force the quantizer, to reduce the sensitivity on encoding choices.
+    // e.g, we don't want this test to break when the rate control is modified.
+    {
+      const int max_q = cfg.rc_max_quantizer;
+      const int min_q = cfg.rc_min_quantizer;
+      const int q = rnd_.PseudoUniform(max_q - min_q + 1) + min_q;
+
+      cfg.rc_end_usage = AOM_Q;
+      cfg.rc_max_quantizer = q;
+      cfg.rc_min_quantizer = q;
+    }
+
+    // choose the picture size: each dimension lies in [8, kWidth/kHeight).
+    {
+      width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
+      height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
+    }
+
+    // choose the chroma subsampling
+    {
+      const aom_img_fmt_t fmts[] = {
+        AOM_IMG_FMT_I420,
+        AOM_IMG_FMT_I422,
+        AOM_IMG_FMT_I444,
+      };
+
+      format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
+    }
+
+    cfg.g_w = width_;
+    cfg.g_h = height_;
+    cfg.g_lag_in_frames = 0;
+    // The profile is chosen to match the chroma format picked above.
+    if (format_ == AOM_IMG_FMT_I420)
+      cfg.g_profile = 0;
+    else if (format_ == AOM_IMG_FMT_I444)
+      cfg.g_profile = 1;
+    else if (format_ == AOM_IMG_FMT_I422)
+      cfg.g_profile = 2;
+
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc_, algo, &cfg, 0))
+        << "aom_codec_enc_init() failed";
+  }
+
+  ~CompressedSource() { aom_codec_destroy(&enc_); }
+
+  // Encodes one synthetic frame and returns the first
+  // AOM_CODEC_CX_FRAME_PKT packet the encoder hands back, or nullptr when
+  // there is none. Callers must check for nullptr.
+  const aom_codec_cx_pkt_t *ReadFrame() {
+    uint8_t buf[kWidth * kHeight * 3] = { 0 };
+
+    // render regular pattern
+    const int period = rnd_.Rand8() % 32 + 1;
+    const int phase = rnd_.Rand8() % period;
+
+    const int val_a = rnd_.Rand8();
+    const int val_b = rnd_.Rand8();
+
+    const int buf_size = static_cast<int>(sizeof(buf));
+    for (int i = 0; i < buf_size; ++i) {
+      const bool first_half = (i + phase) % period < period / 2;
+      buf[i] = static_cast<uint8_t>(first_half ? val_a : val_b);
+    }
+
+    aom_image_t img;
+    aom_img_wrap(&img, format_, width_, height_, 0, buf);
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_encode(&enc_, &img, frame_count_++, 1, 0))
+        << "aom_codec_encode() failed";
+
+    aom_codec_iter_t iter = nullptr;
+
+    const aom_codec_cx_pkt_t *pkt = nullptr;
+
+    // Skip over any packets that are not compressed frames.
+    do {
+      pkt = aom_codec_get_cx_data(&enc_, &iter);
+    } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT);
+
+    return pkt;
+  }
+
+ private:
+  // Upper bounds (exclusive) of the randomly chosen picture dimensions; also
+  // used to size the raw frame buffer in ReadFrame().
+  static const int kWidth = 128;
+  static const int kHeight = 128;
+
+  ACMRandom rnd_;
+  aom_img_fmt_t format_;
+  aom_codec_ctx_t enc_;
+  int frame_count_;
+  int width_, height_;
+};
+
+// Lowers an aom_image_t to an easily comparable/printable form: the samples
+// of planes 0..2, row by row, each widened to 16 bits. A null image is
+// reported as a non-fatal test failure and yields an empty vector, rather
+// than being dereferenced.
+std::vector<uint16_t> Serialize(const aom_image_t *img) {
+  std::vector<uint16_t> samples;
+  if (img == nullptr) {
+    ADD_FAILURE() << "Serialize() called with a null image";
+    return samples;
+  }
+
+  // Upper bound on the sample count: three full-resolution planes.
+  samples.reserve(static_cast<size_t>(img->d_w) * img->d_h * 3);
+  const bool is_hbd = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0;
+
+  for (int plane = 0; plane < 3; ++plane) {
+    const int w = aom_img_plane_width(img, plane);
+    const int h = aom_img_plane_height(img, plane);
+
+    for (int r = 0; r < h; ++r) {
+      // The row start only depends on r, so compute it once per row.
+      const unsigned char *row = img->planes[plane] + r * img->stride[plane];
+      if (is_hbd) {
+        const uint16_t *row16 = reinterpret_cast<const uint16_t *>(row);
+        for (int c = 0; c < w; ++c) samples.push_back(row16[c]);
+      } else {
+        for (int c = 0; c < w; ++c) samples.push_back(row[c]);
+      }
+    }
+  }
+
+  return samples;
+}
+
+// Thin wrapper around an AV1 decoder context. `allowLowbitdepth` is copied
+// into aom_codec_dec_cfg_t::allow_lowbitdepth. Errors reported by the decoder
+// API are surfaced as non-fatal test failures.
+class Decoder {
+ public:
+  explicit Decoder(int allowLowbitdepth) {
+    aom_codec_iface_t *algo = aom_codec_av1_dx();
+
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = allowLowbitdepth;
+
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec_, algo, &cfg, 0))
+        << "aom_codec_dec_init() failed";
+  }
+
+  ~Decoder() { aom_codec_destroy(&dec_); }
+
+  // Decodes the compressed frame carried by `pkt` and returns the first
+  // output frame in serialized form. A null packet is reported as a test
+  // failure and produces an empty vector.
+  std::vector<uint16_t> decode(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt == nullptr) {
+      ADD_FAILURE() << "decode() called with a null packet";
+      return std::vector<uint16_t>();
+    }
+
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_decode(&dec_,
+                               static_cast<uint8_t *>(pkt->data.frame.buf),
+                               pkt->data.frame.sz, nullptr))
+        << "aom_codec_decode() failed";
+
+    aom_codec_iter_t iter = nullptr;
+    return Serialize(aom_codec_get_frame(&dec_, &iter));
+  }
+
+ private:
+  aom_codec_ctx_t dec_;
+};
+
+// Try to reveal a mismatch between LBD and HBD coding paths: every encoded
+// frame is decoded once with allow_lowbitdepth = 0 and once with
+// allow_lowbitdepth = 1, and the two outputs must be identical.
+TEST(CodingPathSync, SearchForHbdLbdMismatch) {
+  const int count_tests = 10;
+  for (int i = 0; i < count_tests; ++i) {
+    Decoder dec_hbd(0);
+    Decoder dec_lbd(1);
+
+    CompressedSource enc(i);
+
+    for (int k = 0; k < 3; ++k) {
+      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+      // ReadFrame() returns nullptr when no frame packet was produced; fail
+      // here instead of handing a null packet to the decoders.
+      ASSERT_NE(frame, nullptr)
+          << "no compressed frame for seed " << i << ", frame " << k;
+
+      std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
+
+      ASSERT_EQ(lbd_yuv, hbd_yuv);
+    }
+  }
+}
+
+// Larger variant of the LBD/HBD mismatch search: 100 seeds starting at 1234,
+// five frames per seed.
+TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
+  const int count_tests = 100;
+  const int seed = 1234;
+  for (int i = 0; i < count_tests; ++i) {
+    Decoder dec_hbd(0);
+    Decoder dec_lbd(1);
+
+    CompressedSource enc(seed + i);
+
+    for (int k = 0; k < 5; ++k) {
+      const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+      // ReadFrame() returns nullptr when no frame packet was produced; fail
+      // here instead of handing a null packet to the decoders.
+      ASSERT_NE(frame, nullptr)
+          << "no compressed frame for seed " << seed + i << ", frame " << k;
+
+      std::vector<uint16_t> lbd_yuv = dec_lbd.decode(frame);
+      std::vector<uint16_t> hbd_yuv = dec_hbd.decode(frame);
+
+      ASSERT_EQ(lbd_yuv, hbd_yuv);
+    }
+  }
+}
+
+} // namespace
diff --git a/third_party/aom/test/comp_avg_pred_test.cc b/third_party/aom/test/comp_avg_pred_test.cc
new file mode 100644
index 0000000000..2f81d7e9b7
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/comp_avg_pred_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgParam;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGTest);
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1DISTWTDCOMPAVGUPSAMPLEDTest);
+using libaom_test::AV1DISTWTDCOMPAVG::DistWtdCompAvgTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdCompAvgTest);
+#if CONFIG_AV1_HIGHBITDEPTH
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighBDDISTWTDCOMPAVGTest);
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(
+ AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest);
+#endif
+using std::make_tuple;
+using std::tuple;
+
+// Out-of-class definitions of DistWtdCompAvgTest's static buffer pointers.
+// All of them start out null.
+uint8_t *DistWtdCompAvgTest::reference_data_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred_test_ = nullptr;
+uint8_t *DistWtdCompAvgTest::reference_data8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::second_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_ = nullptr;
+uint8_t *DistWtdCompAvgTest::comp_pred8_test_ = nullptr;
+uint16_t *DistWtdCompAvgTest::reference_data16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::second_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_ = nullptr;
+uint16_t *DistWtdCompAvgTest::comp_pred16_test_ = nullptr;
+
+namespace {
+
+// Timing comparison against the C implementation; disabled by default via
+// the DISABLED_ prefix. Parameter 0 is the function under test.
+TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+// Output comparison of the function under test against the C implementation.
+TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_pred_ssse3));
+#endif
+
+// Timing comparison against the C implementation; disabled by default via
+// the DISABLED_ prefix. Parameter 0 is the function under test.
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0));
+}
+
+// Output comparison of the function under test against the C implementation.
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0));
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif // HAVE_NEON
+
+// Reference block filled with mask_, second prediction filled with zero.
+TEST_P(DistWtdCompAvgTest, MaxRef) {
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckCompAvg();
+}
+
+// Reference block filled with zero, second prediction filled with mask_.
+TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, mask_);
+ CheckCompAvg();
+}
+
+// Runs the comparison on random data with the reference stride halved; the
+// original stride is put back afterwards.
+TEST_P(DistWtdCompAvgTest, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride >> 1;
+
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+
+  reference_stride_ = saved_stride;
+}
+
+// Runs the comparison on random data with the reference stride reduced by
+// one; the original stride is put back afterwards.
+TEST_P(DistWtdCompAvgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride - 1;
+
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckCompAvg();
+
+  reference_stride_ = saved_stride;
+}
+
+// TODO(chengchen): add highbd tests
+// Parameters for the C implementation. Each tuple is
+// (width, height, function under test, bit depth); a bit depth of -1 makes
+// DistWtdCompAvgTest::SetUp() select the 8-bit buffers.
+const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+
+#if HAVE_SSSE3
+// Parameters for the SSSE3 implementation. Each tuple is
+// (width, height, function under test, bit depth); a bit depth of -1 makes
+// DistWtdCompAvgTest::SetUp() select the 8-bit buffers. The list mirrors the
+// C and NEON tables; a repeated 16x16 entry was dropped so that no block size
+// is instantiated twice.
+const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
+  make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+  make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest,
+                         ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+// Parameters for the NEON implementation. Each tuple is
+// (width, height, function under test, bit depth); a bit depth of -1 makes
+// DistWtdCompAvgTest::SetUp() select the 8-bit buffers.
+const DistWtdCompAvgParam dist_wtd_comp_avg_neon_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_neon, -1),
+ make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_neon, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_neon_tests));
+#endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth variants. Their parameter tuples start with an int, so the
+// function under test is element 1 rather than element 0.
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
+
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
+
+// The trailing `1` selects the BuildParams overload that builds the high
+// bit-depth parameter type.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_pred_neon, 1));
+#endif
+
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
+
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/comp_avg_pred_test.h b/third_party/aom/test/comp_avg_pred_test.h
new file mode 100644
index 0000000000..396df2e2dd
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.h
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_
+#define AOM_TEST_COMP_AVG_PRED_TEST_H_
+
+#include <tuple>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+#include "av1/common/common_data.h"
+#include "aom_ports/aom_timer.h"
+
+namespace libaom_test {
+const int kMaxSize = 128 + 32; // padding
+
+namespace AV1DISTWTDCOMPAVG {
+
+// Signature of a distance-weighted compound-average predictor.
+typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
+// Signature of the upsampled variant, which additionally takes the subpel
+// offsets and the subpel_search setting.
+typedef void (*distwtdcompavgupsampled_func)(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
+
+// Same signature as distwtdcompavg_func; used by DistWtdCompAvgParam.
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
+// (function under test, block size)
+typedef std::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
+
+// (function under test, block size)
+typedef std::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
+ DISTWTDCOMPAVGUPSAMPLEDParam;
+
+// (width, height, function under test, bit depth)
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Signature of the high bit-depth upsampled predictor; compared with the
+// 8-bit signature it takes an extra `bd` argument.
+typedef void (*highbddistwtdcompavgupsampled_func)(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search);
+
+// (bit depth, function under test, block size)
+typedef std::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE>
+    HighbdDISTWTDCOMPAVGUPSAMPLEDParam;
+
+// (bit depth, function under test, block size)
+typedef std::tuple<int, distwtdcompavg_func, BLOCK_SIZE>
+    HighbdDISTWTDCOMPAVGParam;
+
+// Builds the parameter set Range(8, 13, 2) x {filter} x
+// [BLOCK_4X4, BLOCK_SIZES_ALL). `is_hbd` is unused; it only distinguishes
+// this overload from the 8-bit BuildParams(distwtdcompavg_func).
+// Declared inline because the definition lives in a header: without it,
+// including this header from more than one translation unit would produce
+// multiple definitions.
+inline ::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam>
+BuildParams(distwtdcompavg_func filter, int is_hbd) {
+  (void)is_hbd;
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+// Same parameter set as above for the high bit-depth upsampled predictor.
+// Inline for the same header-definition reason.
+inline ::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam>
+BuildParams(highbddistwtdcompavgupsampled_func filter) {
+  return ::testing::Combine(::testing::Range(8, 13, 2),
+                            ::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Builds the parameter set {filter} x [BLOCK_4X4, BLOCK_SIZES_ALL).
+// Declared inline because the definition lives in a header: without it,
+// including this header from more than one translation unit would produce
+// multiple definitions.
+inline ::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams(
+    distwtdcompavg_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+// Same parameter set for the upsampled predictor. Inline for the same
+// header-definition reason.
+inline ::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam>
+BuildParams(distwtdcompavgupsampled_func filter) {
+  return ::testing::Combine(::testing::Values(filter),
+                            ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+// Checks (and optionally times) a distance-weighted compound-average
+// predictor against aom_dist_wtd_comp_avg_pred_c. The test parameter is
+// (function under test, block size).
+class AV1DISTWTDCOMPAVGTest
+    : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
+ public:
+  ~AV1DISTWTDCOMPAVGTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+  // Runs both implementations on the same randomly positioned window of the
+  // random input planes, for every (fwd, bck) offset pair taken from
+  // quant_dist_lookup_table, and requires identical output.
+  void RunCheckOutput(distwtdcompavg_func test_impl) {
+    const int stride = kMaxSize;
+    const int rows = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    uint8_t output[kMaxSize * kMaxSize];
+    uint8_t output2[kMaxSize * kMaxSize];
+
+    // One draw for pred, then one for ref, per sample in raster order.
+    for (int k = 0; k < rows * stride; ++k) {
+      pred8[k] = rnd_.Rand8();
+      ref8[k] = rnd_.Rand8();
+    }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+
+    for (int order = 0; order < 2; ++order) {
+      for (int entry = 0; entry < 4; ++entry) {
+        dist_wtd_comp_params.fwd_offset =
+            quant_dist_lookup_table[entry][order];
+        dist_wtd_comp_params.bck_offset =
+            quant_dist_lookup_table[entry][1 - order];
+
+        const int offset_r = 3 + rnd_.PseudoUniform(rows - in_h - 7);
+        const int offset_c = 3 + rnd_.PseudoUniform(stride - in_w - 7);
+        const int start = offset_r * stride + offset_c;
+        const uint8_t *const pred = pred8 + start;
+        const uint8_t *const ref = ref8 + start;
+
+        aom_dist_wtd_comp_avg_pred_c(output, pred, in_w, in_h, ref, in_w,
+                                     &dist_wtd_comp_params);
+        test_impl(output2, pred, in_w, in_h, ref, in_w,
+                  &dist_wtd_comp_params);
+
+        const int num_pixels = in_w * in_h;
+        for (int idx = 0; idx < num_pixels; ++idx) {
+          ASSERT_EQ(output[idx], output2[idx])
+              << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n"
+              << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+              << " = (" << idx / in_w << ", " << idx % in_w << ")";
+        }
+      }
+    }
+  }
+
+  // Times the C implementation and then the implementation under test over
+  // the same number of calls, printing one line for each.
+  void RunSpeedTest(distwtdcompavg_func test_impl) {
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    uint8_t output[kMaxSize * kMaxSize];
+    uint8_t output2[kMaxSize * kMaxSize];
+
+    // One draw for pred, then one for ref, per sample in raster order.
+    for (int k = 0; k < kMaxSize * kMaxSize; ++k) {
+      pred8[k] = rnd_.Rand8();
+      ref8[k] = rnd_.Rand8();
+    }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
+
+    const int num_loops = 1000000000 / (in_w + in_h);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int iter = 0; iter < num_loops; ++iter) {
+      aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
+                                   &dist_wtd_comp_params);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int ref_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * ref_elapsed / num_loops);
+
+    aom_usec_timer test_timer;
+    aom_usec_timer_start(&test_timer);
+    for (int iter = 0; iter < num_loops; ++iter) {
+      test_impl(output2, pred8, in_w, in_h, ref8, in_w,
+                &dist_wtd_comp_params);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int test_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * test_elapsed / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1DISTWTDCOMPAVGTest
+
+// Checks (and optionally times) an upsampled distance-weighted
+// compound-average predictor against aom_dist_wtd_comp_avg_upsampled_pred_c.
+// The test parameter is (function under test, block size).
+class AV1DISTWTDCOMPAVGUPSAMPLEDTest
+    : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
+ public:
+  ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+  void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+  // Sweeps subpel_search from USE_4_TAPS to USE_8_TAPS, all 8x8 subpel
+  // offsets and every (fwd, bck) offset pair taken from
+  // quant_dist_lookup_table, requiring identical output from both
+  // implementations each time.
+  void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
+    const int stride = kMaxSize;
+    const int rows = kMaxSize;
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+    // One draw for pred, then one for ref, per sample in raster order.
+    for (int k = 0; k < rows * stride; ++k) {
+      pred8[k] = rnd_.Rand8();
+      ref8[k] = rnd_.Rand8();
+    }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+    const int num_pixels = in_w * in_h;
+
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+
+    for (int subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+         ++subpel_search) {
+      for (int sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+        for (int sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+          for (int order = 0; order < 2; ++order) {
+            for (int entry = 0; entry < 4; ++entry) {
+              dist_wtd_comp_params.fwd_offset =
+                  quant_dist_lookup_table[entry][order];
+              dist_wtd_comp_params.bck_offset =
+                  quant_dist_lookup_table[entry][1 - order];
+
+              const int offset_r = 3 + rnd_.PseudoUniform(rows - in_h - 7);
+              const int offset_c = 3 + rnd_.PseudoUniform(stride - in_w - 7);
+              const int start = offset_r * stride + offset_c;
+              const uint8_t *const pred = pred8 + start;
+              const uint8_t *const ref = ref8 + start;
+
+              aom_dist_wtd_comp_avg_upsampled_pred_c(
+                  nullptr, nullptr, 0, 0, nullptr, output, pred, in_w, in_h,
+                  sub_x_q3, sub_y_q3, ref, in_w, &dist_wtd_comp_params,
+                  subpel_search);
+              test_impl(nullptr, nullptr, 0, 0, nullptr, output2, pred, in_w,
+                        in_h, sub_x_q3, sub_y_q3, ref, in_w,
+                        &dist_wtd_comp_params, subpel_search);
+
+              for (int idx = 0; idx < num_pixels; ++idx) {
+                ASSERT_EQ(output[idx], output2[idx])
+                    << "Mismatch at unit tests for "
+                       "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n"
+                    << in_w << "x" << in_h << " Pixel mismatch at index "
+                    << idx << " = (" << idx / in_w << ", " << idx % in_w
+                    << "), sub pixel offset = (" << sub_y_q3 << ", "
+                    << sub_x_q3 << ")";
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Times the C implementation and then the implementation under test over
+  // the same number of calls, printing one line for each.
+  void RunSpeedTest(distwtdcompavgupsampled_func test_impl) {
+    const int block_idx = GET_PARAM(1);
+
+    uint8_t pred8[kMaxSize * kMaxSize];
+    uint8_t ref8[kMaxSize * kMaxSize];
+    DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+    // One draw for pred, then one for ref, per sample in raster order.
+    for (int k = 0; k < kMaxSize * kMaxSize; ++k) {
+      pred8[k] = rnd_.Rand8();
+      ref8[k] = rnd_.Rand8();
+    }
+    const int in_w = block_size_wide[block_idx];
+    const int in_h = block_size_high[block_idx];
+
+    DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+    dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
+
+    const int sub_x_q3 = 0;
+    const int sub_y_q3 = 0;
+    const int subpel_search = USE_8_TAPS;  // set to USE_4_TAPS to test 4-tap filter.
+
+    const int num_loops = 1000000000 / (in_w + in_h);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int iter = 0; iter < num_loops; ++iter) {
+      aom_dist_wtd_comp_avg_upsampled_pred_c(
+          nullptr, nullptr, 0, 0, nullptr, output, pred8, in_w, in_h,
+          sub_x_q3, sub_y_q3, ref8, in_w, &dist_wtd_comp_params,
+          subpel_search);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int ref_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * ref_elapsed / num_loops);
+
+    aom_usec_timer test_timer;
+    aom_usec_timer_start(&test_timer);
+    for (int iter = 0; iter < num_loops; ++iter) {
+      test_impl(nullptr, nullptr, 0, 0, nullptr, output2, pred8, in_w, in_h,
+                sub_x_q3, sub_y_q3, ref8, in_w, &dist_wtd_comp_params,
+                subpel_search);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int test_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+           1000.0 * test_elapsed / num_loops);
+  }
+
+  libaom_test::ACMRandom rnd_;
+};  // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
+
+class DistWtdCompAvgTest
+ : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
+ public ::testing::Test {
+ public:
+ DistWtdCompAvgTest()
+ : width_(GET_PARAM(0)), height_(GET_PARAM(1)), bd_(GET_PARAM(3)) {}
+
+ static void SetUpTestSuite() {
+ reference_data8_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize));
+ ASSERT_NE(reference_data8_, nullptr);
+ second_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(second_pred8_, nullptr);
+ comp_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(comp_pred8_, nullptr);
+ comp_pred8_test_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ ASSERT_NE(comp_pred8_test_, nullptr);
+ reference_data16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+ ASSERT_NE(reference_data16_, nullptr);
+ second_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(second_pred16_, nullptr);
+ comp_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(comp_pred16_, nullptr);
+ comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ ASSERT_NE(comp_pred16_test_, nullptr);
+ }
+
+ static void TearDownTestSuite() {
+ aom_free(reference_data8_);
+ reference_data8_ = nullptr;
+ aom_free(second_pred8_);
+ second_pred8_ = nullptr;
+ aom_free(comp_pred8_);
+ comp_pred8_ = nullptr;
+ aom_free(comp_pred8_test_);
+ comp_pred8_test_ = nullptr;
+ aom_free(reference_data16_);
+ reference_data16_ = nullptr;
+ aom_free(second_pred16_);
+ second_pred16_ = nullptr;
+ aom_free(comp_pred16_);
+ comp_pred16_ = nullptr;
+ aom_free(comp_pred16_test_);
+ comp_pred16_test_ = nullptr;
+ }
+
+ protected:
+ // Handle up to 4 128x128 blocks, with stride up to 256
+ static const int kDataAlignment = 16;
+ static const int kDataBlockSize = 128 * 256;
+ static const int kDataBufferSize = 4 * kDataBlockSize;
+
+ void SetUp() override {
+ if (bd_ == -1) {
+ use_high_bit_depth_ = false;
+ bit_depth_ = AOM_BITS_8;
+ reference_data_ = reference_data8_;
+ second_pred_ = second_pred8_;
+ comp_pred_ = comp_pred8_;
+ comp_pred_test_ = comp_pred8_test_;
+ } else {
+ use_high_bit_depth_ = true;
+ bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+ reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+ second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+ comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+ comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+ }
+ mask_ = (1 << bit_depth_) - 1;
+ reference_stride_ = width_ * 2;
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ virtual uint8_t *GetReference(int block_idx) {
+ if (use_high_bit_depth_)
+ return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+ block_idx * kDataBlockSize);
+ return reference_data_ + block_idx * kDataBlockSize;
+ }
+
+ void ReferenceDistWtdCompAvg(int block_idx) {
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const second_pred8 = second_pred_;
+ uint8_t *const comp_pred8 = comp_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+ uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ const int tmp =
+ second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+ reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ } else {
+ const int tmp =
+ second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+ reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ }
+ }
+ }
+ }
+
+ void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+ } else {
+ data16[h * stride + w] = fill_constant;
+ }
+ }
+ }
+ }
+
+  // Fills a width_ x height_ block at `data` (row pitch `stride` elements)
+  // with values drawn from rnd_ in raster order: Rand8() per sample in 8-bit
+  // mode, Rand16() masked with mask_ per sample in high bit depth mode.
+  void FillRandom(uint8_t *data, int stride) {
+    if (use_high_bit_depth_) {
+      uint16_t *const dst16 = CONVERT_TO_SHORTPTR(data);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          dst16[row * stride + col] = rnd_.Rand16() & mask_;
+        }
+      }
+      return;
+    }
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        data[row * stride + col] = rnd_.Rand8();
+      }
+    }
+  }
+
+ // Runs the implementation under test (test parameter 2) on reference block
+ // `block_idx` with the current jcp_param_ weights, writing its output to
+ // comp_pred_test_. The call is wrapped in API_REGISTER_STATE_CHECK.
+ void dist_wtd_comp_avg(int block_idx) {
+ const uint8_t *const reference = GetReference(block_idx);
+
+ API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+ height_, reference, reference_stride_,
+ &jcp_param_));
+ }
+
+  // For each of the 4 x 2 (fwd_offset, bck_offset) pairs taken from
+  // quant_dist_lookup_table, runs the reference computation and the
+  // implementation under test on block 0 and asserts that comp_pred_ and
+  // comp_pred_test_ hold identical width_ x height_ results.
+  //
+  // In high bit depth mode both buffers are byte-pointer aliases of 16-bit
+  // storage, so they are viewed through CONVERT_TO_SHORTPTR before being
+  // indexed, exactly as ReferenceDistWtdCompAvg(), FillConstant() and
+  // FillRandom() do. Indexing the aliases directly as uint8_t would not
+  // compare the samples that were actually written.
+  void CheckCompAvg() {
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < 4; ++i) {
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
+
+        ReferenceDistWtdCompAvg(0);
+        dist_wtd_comp_avg(0);
+
+        for (int y = 0; y < height_; ++y) {
+          for (int x = 0; x < width_; ++x) {
+            const int idx = y * width_ + x;
+            if (!use_high_bit_depth_) {
+              ASSERT_EQ(comp_pred_[idx], comp_pred_test_[idx]);
+            } else {
+              ASSERT_EQ(CONVERT_TO_SHORTPTR(comp_pred_)[idx],
+                        CONVERT_TO_SHORTPTR(comp_pred_test_)[idx]);
+            }
+          }
+        }
+      }
+    }
+  }
+
+ // Block dimensions and sample mask; mask_ is (1 << bit_depth_) - 1.
+ // NOTE(review): bd_ is not referenced in the visible part of this class.
+ int width_, height_, mask_, bd_;
+ aom_bit_depth_t bit_depth_;
+ // Active input buffers. In high bit depth mode these are indexed through
+ // CONVERT_TO_SHORTPTR by the helpers above.
+ static uint8_t *reference_data_;
+ static uint8_t *second_pred_;
+ // Selects between the 8-bit and 16-bit code paths in the helpers above.
+ bool use_high_bit_depth_;
+ // 8-bit and 16-bit backing buffers (presumably allocated by the suite-level
+ // setup outside this view -- TODO confirm).
+ static uint8_t *reference_data8_;
+ static uint8_t *second_pred8_;
+ static uint16_t *reference_data16_;
+ static uint16_t *second_pred16_;
+ // Row pitch, in elements, of the reference block.
+ int reference_stride_;
+ // Output of the reference computation (ReferenceDistWtdCompAvg).
+ static uint8_t *comp_pred_;
+ static uint8_t *comp_pred8_;
+ static uint16_t *comp_pred16_;
+ // Output of the implementation under test (dist_wtd_comp_avg).
+ static uint8_t *comp_pred_test_;
+ static uint8_t *comp_pred8_test_;
+ static uint16_t *comp_pred16_test_;
+ // Forward/backward weights passed to both implementations.
+ DIST_WTD_COMP_PARAMS jcp_param_;
+
+ // Source of the random sample data written by FillRandom().
+ ACMRandom rnd_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Parameterized fixture that checks a high bit depth distance-weighted
+// compound average implementation against
+// aom_highbd_dist_wtd_comp_avg_pred_c. Test parameter 0 is the bit depth and
+// test parameter 2 is the block size index.
+class AV1HighBDDISTWTDCOMPAVGTest
+ : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
+ public:
+ ~AV1HighBDDISTWTDCOMPAVGTest() override = default;
+ // Re-seeds the generator so every test sees the same random data.
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+ // Fills kMaxSize x kMaxSize prediction/reference buffers with random
+ // samples masked to `bd` bits, then, for every fwd/bck offset pair in
+ // quant_dist_lookup_table, runs the C reference and `test_impl` at a random
+ // offset inside those buffers and asserts the in_w x in_h outputs match.
+ void RunCheckOutput(distwtdcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
+
+ // Random starting row/column; both implementations get the same one.
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+ aom_highbd_dist_wtd_comp_avg_pred_c(
+ CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+ &dist_wtd_comp_params);
+ test_impl(CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, &dist_wtd_comp_params);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+ << " = (" << i << ", " << j << ")";
+ }
+ }
+ }
+ }
+ }
+ // Times num_loops calls of the C reference and of `test_impl` on the same
+ // random input, using the first lookup-table weight pair, and prints the
+ // scaled per-call time of each.
+ // NOTE(review): the printed value is 1000 * elapsed / num_loops; assuming
+ // aom_usec_timer_elapsed() returns microseconds this is nanoseconds per
+ // call, although the label says "us" -- confirm before relying on the unit.
+ void RunSpeedTest(distwtdcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
+
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ aom_highbd_dist_wtd_comp_avg_pred_c(
+ CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ // Deterministically seeded in SetUp().
+ libaom_test::ACMRandom rnd_;
+}; // class AV1HighBDDISTWTDCOMPAVGTest
+
+// Parameterized fixture that checks a high bit depth distance-weighted
+// compound average *upsampled* prediction implementation against
+// aom_highbd_dist_wtd_comp_avg_upsampled_pred_c. Test parameter 0 is the bit
+// depth and test parameter 2 is the block size index.
+class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+ : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> {
+ public:
+ ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() override = default;
+ // Re-seeds the generator so every test sees the same random data.
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+ // For every subpel_search value in [USE_4_TAPS, USE_8_TAPS], every
+ // (sub_x_q3, sub_y_q3) in [0, 8) x [0, 8) and every fwd/bck offset pair in
+ // quant_dist_lookup_table, runs the C reference and `test_impl` at a random
+ // offset inside random `bd`-bit buffers and asserts the in_w x in_h outputs
+ // match.
+ void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]);
+ DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+ int sub_x_q3, sub_y_q3;
+ int subpel_search;
+ for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+ ++subpel_search) {
+ for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+ for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ dist_wtd_comp_params.bck_offset =
+ quant_dist_lookup_table[jj][1 - ii];
+
+ // Random starting row/column; both implementations get the same
+ // one.
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+ nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
+ &dist_wtd_comp_params, subpel_search);
+ test_impl(nullptr, nullptr, 0, 0, nullptr,
+ CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
+ in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, bd, &dist_wtd_comp_params, subpel_search);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for "
+ "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << sub_y_q3 << ", "
+ << sub_x_q3 << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // Times num_loops calls of the C reference and of `test_impl` with zero
+ // sub-pixel offsets and the first lookup-table weight pair, and prints the
+ // scaled per-call time of each.
+ // NOTE(review): the printed value is 1000 * elapsed / num_loops; assuming
+ // aom_usec_timer_elapsed() returns microseconds this is nanoseconds per
+ // call, although the label says "us" -- confirm before relying on the unit.
+ void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]);
+ DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
+
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
+ int sub_x_q3 = 0;
+ int sub_y_q3 = 0;
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter.
+ for (int i = 0; i < num_loops; ++i)
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+ nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
+ subpel_search);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w,
+ in_h, 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
+ subpel_search);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
+ in_h, 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ // Deterministically seeded in SetUp().
+ libaom_test::ACMRandom rnd_;
+}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace AV1DISTWTDCOMPAVG
+} // namespace libaom_test
+
+#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_
diff --git a/third_party/aom/test/comp_mask_pred_test.cc b/third_party/aom/test/comp_mask_pred_test.cc
new file mode 100644
index 0000000000..b65730aa57
--- /dev/null
+++ b/third_party/aom/test/comp_mask_pred_test.cc
@@ -0,0 +1,856 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+// Signature of the 8-bit masked compound prediction functions under test
+// (aom_comp_mask_pred_c and its optimized variants).
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask);
+
+// Signature of the 8-bit compound average prediction functions under test
+// (aom_comp_avg_pred_c and its optimized variants).
+typedef void (*comp_avg_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride);
+
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
+// Block sizes used to instantiate the masked compound prediction tests.
+const BLOCK_SIZE kCompMaskPredParams[] = {
+ BLOCK_8X8, BLOCK_8X16, BLOCK_8X32, BLOCK_16X8, BLOCK_16X16,
+ BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32
+};
+#endif
+
+// Shared fixture for the 8-bit masked-compound and upsampled prediction
+// tests. SetUp() allocates and randomizes the buffers; TearDown() frees them.
+class AV1CompMaskPredBase : public ::testing::Test {
+ public:
+  ~AV1CompMaskPredBase() override;
+  void SetUp() override;
+
+  void TearDown() override;
+
+ protected:
+  // Returns true when the first width x height samples of comp_pred1_ and
+  // comp_pred2_ (both packed with a stride of `width`) are identical. On the
+  // first difference, prints its position and both values and returns false.
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  // All buffers start out null. GoogleTest still runs TearDown() when an
+  // ASSERT_* in SetUp() fails, so any pointer SetUp() did not reach must be
+  // null rather than indeterminate when it is handed to aom_free().
+  uint8_t *comp_pred1_ = nullptr;  // output of the C reference
+  uint8_t *comp_pred2_ = nullptr;  // output of the implementation under test
+  uint8_t *pred_ = nullptr;        // second predictor input
+  uint8_t *ref_buffer_ = nullptr;  // reference allocation, including border
+  uint8_t *ref_ = nullptr;         // start of the block inside ref_buffer_
+};
+
+AV1CompMaskPredBase::~AV1CompMaskPredBase() = default;
+
+// Seeds the generator, initializes the wedge masks, allocates the four
+// 16-byte aligned buffers and fills the predictor and reference with random
+// bytes (predictor first, then reference).
+void AV1CompMaskPredBase::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+
+  // The biggest block size is MAX_SB_SQUARE(128*128), however for the
+  // convolution we need to access 3 bytes before and 4 bytes after (for an
+  // 8-tap filter), in both directions, so we need to allocate
+  // (128 + 7) * (128 + 7) = MAX_SB_SQUARE + (14 * MAX_SB_SIZE) + 49
+  const int ref_buffer_size = MAX_SB_SQUARE + (14 * MAX_SB_SIZE) + 49;
+
+  comp_pred1_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(comp_pred1_, nullptr);
+  comp_pred2_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(comp_pred2_, nullptr);
+  pred_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(pred_, nullptr);
+  ref_buffer_ = static_cast<uint8_t *>(aom_memalign(16, ref_buffer_size));
+  ASSERT_NE(ref_buffer_, nullptr);
+
+  // Start of the actual block where the convolution will be computed
+  ref_ = ref_buffer_ + (3 * MAX_SB_SIZE + 3);
+
+  for (int n = 0; n < MAX_SB_SQUARE; ++n) pred_[n] = rnd_.Rand8();
+  for (int n = 0; n < ref_buffer_size; ++n) ref_buffer_[n] = rnd_.Rand8();
+}
+
+// Releases every buffer allocated by SetUp(). ref_ points into ref_buffer_
+// and is therefore not freed separately.
+void AV1CompMaskPredBase::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+}
+
+// Test parameter: (function under test, block size).
+typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+
+// Checks 8-bit masked compound prediction implementations against
+// aom_comp_mask_pred_c.
+class AV1CompMaskPredTest
+ : public AV1CompMaskPredBase,
+ public ::testing::WithParamInterface<CompMaskPredParam> {
+ protected:
+ // Compares test_impl with the C reference for every wedge mask of bsize.
+ void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+ // Times test_impl against the C reference for a single wedge mask of bsize.
+ void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+};
+
+// For every wedge type available for `bsize`, runs aom_comp_mask_pred_c into
+// comp_pred1_ and `test_impl` into comp_pred2_ with identical arguments
+// (mask stride equal to the block width, `inv` forwarded as invert_mask) and
+// asserts that the two outputs match.
+void AV1CompMaskPredTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                         BLOCK_SIZE bsize, int inv) {
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  const int num_wedges = get_wedge_types_lookup(bsize);
+  for (int wedge = 0; wedge < num_wedges; ++wedge) {
+    const uint8_t *const wedge_mask =
+        av1_get_contiguous_soft_mask(wedge, 1, bsize);
+
+    aom_comp_mask_pred_c(comp_pred1_, pred_, width, height, ref_, MAX_SB_SIZE,
+                         wedge_mask, width, inv);
+    test_impl(comp_pred2_, pred_, width, height, ref_, MAX_SB_SIZE, wedge_mask,
+              width, inv);
+
+    const bool outputs_match = CheckResult(width, height);
+    ASSERT_EQ(outputs_match, true) << " wedge " << wedge << " inv " << inv;
+  }
+}
+
+// Times num_loops calls of aom_comp_mask_pred_c and then of test_impl, using
+// the middle wedge mask for bsize and invert_mask = 0. Prints both per-call
+// times (1000 * elapsed / num_loops, labelled ns) and the C/test ratio.
+// Both runs write to comp_pred1_; the output is not checked here.
+void AV1CompMaskPredTest::RunSpeedTest(comp_mask_pred_func test_impl,
+ BLOCK_SIZE bsize) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int wedge_types = get_wedge_types_lookup(bsize);
+ int wedge_index = wedge_types / 2;
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (w + h);
+
+ // Index 0 is the C reference, index 1 the implementation under test.
+ comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ comp_mask_pred_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// The suite is only instantiated when a SIMD variant is available.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompMaskPredTest);
+
+// Correctness check with the mask both non-inverted and inverted.
+TEST_P(AV1CompMaskPredTest, CheckOutput) {
+ // inv = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+// Benchmark; disabled by default.
+TEST_P(AV1CompMaskPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+// One instantiation per available SIMD variant, each run over every block
+// size in kCompMaskPredParams.
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+ SSSE3, AV1CompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
+ ::testing::ValuesIn(kCompMaskPredParams)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1CompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kCompMaskPredParams)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1CompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_neon),
+ ::testing::ValuesIn(kCompMaskPredParams)));
+#endif
+
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON
+// Block sizes used to instantiate the upsampled and compound average
+// prediction tests below.
+const BLOCK_SIZE kValidBlockSize[] = {
+ BLOCK_4X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X32, BLOCK_16X8,
+ BLOCK_16X16, BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64,
+ BLOCK_128X128, BLOCK_16X64, BLOCK_64X16
+};
+#endif
+
+// Signature of the 8-bit upsampled prediction functions under test
+// (aom_upsampled_pred_c and its optimized variants).
+typedef void (*upsampled_pred_func)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search);
+
+// Test parameter: (function under test, block size).
+typedef std::tuple<upsampled_pred_func, BLOCK_SIZE> UpsampledPredParam;
+
+// Checks 8-bit upsampled prediction implementations against
+// aom_upsampled_pred_c.
+class AV1UpsampledPredTest
+ : public AV1CompMaskPredBase,
+ public ::testing::WithParamInterface<UpsampledPredParam> {
+ protected:
+ // Compares test_impl with the C reference over all sub-pixel offsets.
+ void RunCheckOutput(upsampled_pred_func test_impl, BLOCK_SIZE bsize);
+ // Times test_impl against the C reference; havSub selects a non-zero
+ // sub-pixel offset.
+ void RunSpeedTest(upsampled_pred_func test_impl, BLOCK_SIZE bsize,
+ int havSub);
+};
+
+// For every subpel_search value in [USE_4_TAPS, USE_8_TAPS] and every
+// sub-pixel offset pair (subx, suby) in [0, 8) x [0, 8), runs
+// aom_upsampled_pred_c into comp_pred1_ and `test_impl` into comp_pred2_ and
+// asserts that the outputs match. Offsets are visited with suby as the slow
+// index and subx as the fast one.
+void AV1UpsampledPredTest::RunCheckOutput(upsampled_pred_func test_impl,
+                                          BLOCK_SIZE bsize) {
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  for (int taps = USE_4_TAPS; taps <= USE_8_TAPS; ++taps) {
+    for (int suby = 0; suby < 8; ++suby) {
+      for (int subx = 0; subx < 8; ++subx) {
+        aom_upsampled_pred_c(nullptr, nullptr, 0, 0, nullptr, comp_pred1_,
+                             width, height, subx, suby, ref_, MAX_SB_SIZE,
+                             taps);
+
+        test_impl(nullptr, nullptr, 0, 0, nullptr, comp_pred2_, width, height,
+                  subx, suby, ref_, MAX_SB_SIZE, taps);
+
+        const bool outputs_match = CheckResult(width, height);
+        ASSERT_EQ(outputs_match, true)
+            << "sub (" << subx << "," << suby << ")";
+      }
+    }
+  }
+}
+
+// Times num_loops calls of aom_upsampled_pred_c and then of test_impl with
+// subpel_search = USE_8_TAPS. When havSub is non-zero the sub-pixel offset is
+// (3, 4), otherwise (0, 0). Prints both per-call times
+// (1000 * elapsed / num_loops, labelled ns) and the C/test ratio.
+void AV1UpsampledPredTest::RunSpeedTest(upsampled_pred_func test_impl,
+ BLOCK_SIZE bsize, int havSub) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int subx = havSub ? 3 : 0;
+ const int suby = havSub ? 4 : 0;
+
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (w + h);
+ // Index 0 is the C reference, index 1 the implementation under test.
+ upsampled_pred_func funcs[2] = { aom_upsampled_pred_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter.
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ upsampled_pred_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(nullptr, nullptr, 0, 0, nullptr, comp_pred1_, w, h, subx, suby, ref_,
+ MAX_SB_SIZE, subpel_search);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("UpsampledPred[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h,
+ elapsed_time[0], elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// The suite is only instantiated when a SIMD variant is available.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1UpsampledPredTest);
+
+// Correctness check over all filters and sub-pixel offsets.
+TEST_P(AV1UpsampledPredTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+// Benchmark with a non-zero sub-pixel offset; disabled by default.
+TEST_P(AV1UpsampledPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+// One instantiation per available SIMD variant, each run over every block
+// size in kValidBlockSize.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1UpsampledPredTest,
+ ::testing::Combine(::testing::Values(&aom_upsampled_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1UpsampledPredTest,
+ ::testing::Combine(::testing::Values(&aom_upsampled_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+// Test parameter: (function under test, block size).
+typedef std::tuple<comp_avg_pred_func, BLOCK_SIZE> CompAvgPredParam;
+
+// Checks 8-bit compound average prediction implementations against
+// aom_comp_avg_pred_c. SetUp() allocates and randomizes the buffers;
+// TearDown() frees them.
+class AV1CompAvgPredTest : public ::testing::TestWithParam<CompAvgPredParam> {
+ public:
+  ~AV1CompAvgPredTest() override;
+  void SetUp() override;
+
+  void TearDown() override;
+
+ protected:
+  // Compares test_impl with the C reference for one block size.
+  void RunCheckOutput(comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+  // Times test_impl against the C reference for one block size.
+  void RunSpeedTest(comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+  // Returns true when the first width x height samples of comp_pred1_ and
+  // comp_pred2_ (both packed with a stride of `width`) are identical. On the
+  // first difference, prints its position and both values and returns false.
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  // All buffers start out null. GoogleTest still runs TearDown() when an
+  // ASSERT_* in SetUp() fails, so any pointer SetUp() did not reach must be
+  // null rather than indeterminate when it is handed to aom_free().
+  uint8_t *comp_pred1_ = nullptr;  // output of the C reference
+  uint8_t *comp_pred2_ = nullptr;  // output of the implementation under test
+  uint8_t *pred_ = nullptr;        // second predictor input
+  uint8_t *ref_ = nullptr;         // reference input
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CompAvgPredTest);
+
+AV1CompAvgPredTest::~AV1CompAvgPredTest() = default;
+
+// Seeds the generator, allocates four 16-byte aligned MAX_SB_SQUARE-byte
+// buffers and fills the predictor and reference with random bytes (predictor
+// first, then reference).
+void AV1CompAvgPredTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+
+  comp_pred1_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(comp_pred1_, nullptr);
+  comp_pred2_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(comp_pred2_, nullptr);
+  pred_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(pred_, nullptr);
+  ref_ = static_cast<uint8_t *>(aom_memalign(16, MAX_SB_SQUARE));
+  ASSERT_NE(ref_, nullptr);
+
+  for (int n = 0; n < MAX_SB_SQUARE; ++n) pred_[n] = rnd_.Rand8();
+  for (int n = 0; n < MAX_SB_SQUARE; ++n) ref_[n] = rnd_.Rand8();
+}
+
+// Releases every buffer allocated by SetUp().
+void AV1CompAvgPredTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_);
+}
+
+// Runs aom_comp_avg_pred_c into comp_pred1_ and `test_impl` into comp_pred2_
+// for a block of size `bsize` (reference stride MAX_SB_SIZE) and asserts that
+// the two outputs match.
+void AV1CompAvgPredTest::RunCheckOutput(comp_avg_pred_func test_impl,
+                                        BLOCK_SIZE bsize) {
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+
+  aom_comp_avg_pred_c(comp_pred1_, pred_, width, height, ref_, MAX_SB_SIZE);
+  test_impl(comp_pred2_, pred_, width, height, ref_, MAX_SB_SIZE);
+
+  const bool outputs_match = CheckResult(width, height);
+  ASSERT_EQ(outputs_match, true);
+}
+
+// Times num_loops calls of aom_comp_avg_pred_c and then of test_impl for a
+// block of size `bsize`, and prints both per-call times (labelled ns) and the
+// C/test ratio. Both runs write to comp_pred1_; the output is not checked
+// here.
+void AV1CompAvgPredTest::RunSpeedTest(comp_avg_pred_func test_impl,
+                                      BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  // Fewer iterations for larger blocks.
+  const int num_loops = 1000000000 / (w + h);
+
+  // Index 0 is the C reference, index 1 the implementation under test.
+  comp_avg_pred_func functions[2] = { aom_comp_avg_pred_c, test_impl };
+  double elapsed_time[2] = { 0.0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_avg_pred_func func = functions[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    // Normalize by the iteration count so the figure is per call, matching
+    // the other speed tests in this file. Without the division the printed
+    // "ns" value is the total for all num_loops calls.
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("CompAvgPred %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// Correctness check against the C reference.
+TEST_P(AV1CompAvgPredTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+// Benchmark; disabled by default.
+TEST_P(AV1CompAvgPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+// One instantiation per available SIMD variant, each run over every block
+// size in kValidBlockSize.
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1CompAvgPredTest,
+ ::testing::Combine(::testing::Values(&aom_comp_avg_pred_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1CompAvgPredTest,
+ ::testing::Combine(::testing::Values(&aom_comp_avg_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Shared fixture for the high bit depth masked-compound and upsampled
+// prediction tests. SetUp() allocates the 16-bit buffers and TearDown() frees
+// them; the tests themselves fill pred_ and ref_buffer_ with random data.
+class AV1HighbdCompMaskPredTestBase : public ::testing::Test {
+ public:
+  ~AV1HighbdCompMaskPredTestBase() override;
+  void SetUp() override;
+
+  void TearDown() override;
+
+ protected:
+  // Returns true when the first width x height samples of comp_pred1_ and
+  // comp_pred2_ (both packed with a stride of `width`) are identical. On the
+  // first difference, prints its position and both values and returns false.
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  // All buffers start out null. GoogleTest still runs TearDown() when an
+  // ASSERT_* in SetUp() fails, so any pointer SetUp() did not reach must be
+  // null rather than indeterminate when it is handed to aom_free().
+  uint16_t *comp_pred1_ = nullptr;  // output of the C reference
+  uint16_t *comp_pred2_ = nullptr;  // output of the implementation under test
+  uint16_t *pred_ = nullptr;        // second predictor input
+  uint16_t *ref_buffer_ = nullptr;  // reference allocation, including border
+  uint16_t *ref_ = nullptr;         // start of the block inside ref_buffer_
+};
+
+AV1HighbdCompMaskPredTestBase::~AV1HighbdCompMaskPredTestBase() = default;
+
+// Seeds the generator, initializes the wedge masks and allocates the four
+// 16-byte aligned 16-bit buffers. The buffers are not filled here.
+void AV1HighbdCompMaskPredTestBase::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+
+  // The biggest block size is MAX_SB_SQUARE(128*128), however for the
+  // convolution we need to access 3 elements before and 4 elements after (for
+  // an 8-tap filter), in both directions, so we need to allocate (128 + 7) *
+  // (128 + 7) = (MAX_SB_SQUARE + (14 * MAX_SB_SIZE) + 49) *
+  // sizeof(*ref_buffer_)
+  const int ref_buffer_elems = MAX_SB_SQUARE + (14 * MAX_SB_SIZE) + 49;
+
+  comp_pred1_ = static_cast<uint16_t *>(
+      aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_)));
+  ASSERT_NE(comp_pred1_, nullptr);
+  comp_pred2_ = static_cast<uint16_t *>(
+      aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_)));
+  ASSERT_NE(comp_pred2_, nullptr);
+  pred_ = static_cast<uint16_t *>(
+      aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_)));
+  ASSERT_NE(pred_, nullptr);
+  ref_buffer_ = static_cast<uint16_t *>(
+      aom_memalign(16, ref_buffer_elems * sizeof(*ref_buffer_)));
+  ASSERT_NE(ref_buffer_, nullptr);
+
+  // Start of the actual block where the convolution will be computed
+  ref_ = ref_buffer_ + (3 * MAX_SB_SIZE + 3);
+}
+
+// Releases every buffer allocated by SetUp(). ref_ points into ref_buffer_
+// and is therefore not freed separately.
+void AV1HighbdCompMaskPredTestBase::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+}
+
+// Signature of the high bit depth masked compound prediction functions under
+// test (aom_highbd_comp_mask_pred_c and its optimized variants).
+typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8,
+                                           const uint8_t *pred8, int width,
+                                           int height, const uint8_t *ref8,
+                                           int ref_stride, const uint8_t *mask,
+                                           int mask_stride, int invert_mask);
+
+// Test parameter: (function under test, block size, bit depth).
+typedef std::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int>
+    HighbdCompMaskPredParam;
+
+// Checks high bit depth masked compound prediction implementations against
+// aom_highbd_comp_mask_pred_c.
+class AV1HighbdCompMaskPredTest
+    : public AV1HighbdCompMaskPredTestBase,
+      public ::testing::WithParamInterface<HighbdCompMaskPredParam> {
+ public:
+  ~AV1HighbdCompMaskPredTest() override;
+
+ protected:
+  // Declared with highbd_comp_mask_pred_func so the declarations agree with
+  // the out-of-line definitions and with the parameter tuple. The 8-bit
+  // comp_mask_pred_func has the same underlying type, so this does not change
+  // the method signatures.
+  void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+                      int inv);
+  void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+};
+
+AV1HighbdCompMaskPredTest::~AV1HighbdCompMaskPredTest() = default;
+
+// Fills pred_ and the start of ref_buffer_ with random samples masked to the
+// bit depth from test parameter 2, then, for every wedge type available for
+// bsize, runs aom_highbd_comp_mask_pred_c into comp_pred1_ and test_impl into
+// comp_pred2_ and asserts the outputs match.
+void AV1HighbdCompMaskPredTest::RunCheckOutput(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+ int bd_ = GET_PARAM(2);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int wedge_types = get_wedge_types_lookup(bsize);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ // NOTE(review): only MAX_SB_SQUARE + 8 * MAX_SB_SIZE elements are filled,
+ // fewer than the MAX_SB_SQUARE + 14 * MAX_SB_SIZE + 49 allocated in SetUp().
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ aom_highbd_comp_mask_pred_c(
+ CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+ test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+ ASSERT_EQ(CheckResult(w, h), true)
+ << " wedge " << wedge_index << " inv " << inv;
+ }
+}
+
+// Fills the inputs with random samples masked to the bit depth from test
+// parameter 2, then times num_loops calls of aom_highbd_comp_mask_pred_c and
+// of test_impl using the middle wedge mask for bsize and invert_mask = 0.
+// Prints both per-call times (1000 * elapsed / num_loops, labelled ns) and
+// the C/test ratio. Both runs write to comp_pred1_.
+void AV1HighbdCompMaskPredTest::RunSpeedTest(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) {
+ int bd_ = GET_PARAM(2);
+
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int wedge_types = get_wedge_types_lookup(bsize);
+ int wedge_index = wedge_types / 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (w + h);
+
+ // Index 0 is the C reference, index 1 the implementation under test.
+ highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c,
+ test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ highbd_comp_mask_pred_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// The suite is only instantiated when a SIMD variant is available.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompMaskPredTest);
+
+// Correctness check with the mask both non-inverted and inverted.
+TEST_P(AV1HighbdCompMaskPredTest, CheckOutput) {
+ // inv = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+// Benchmark; disabled by default.
+TEST_P(AV1HighbdCompMaskPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+// One instantiation per available SIMD variant, each run over every block
+// size in kCompMaskPredParams and over bit depths 8, 10 and 12
+// (Range(8, 13, 2)).
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdCompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_neon),
+ ::testing::ValuesIn(kCompMaskPredParams),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1HighbdCompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kCompMaskPredParams),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1HighbdCompMaskPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
+ ::testing::ValuesIn(kCompMaskPredParams),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+// Signature of the high bit depth upsampled prediction functions under test
+// (aom_highbd_upsampled_pred_c and its optimized variants).
+typedef void (*highbd_upsampled_pred_func)(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
+ int bd, int subpel_search);
+
+// Test parameter: (function under test, block size, bit depth).
+typedef std::tuple<highbd_upsampled_pred_func, BLOCK_SIZE, int>
+ HighbdUpsampledPredParam;
+
+// Checks high bit depth upsampled prediction implementations against
+// aom_highbd_upsampled_pred_c.
+class AV1HighbdUpsampledPredTest
+ : public AV1HighbdCompMaskPredTestBase,
+ public ::testing::WithParamInterface<HighbdUpsampledPredParam> {
+ public:
+ ~AV1HighbdUpsampledPredTest() override;
+
+ protected:
+ // Compares test_impl with the C reference over all sub-pixel offsets.
+ void RunCheckOutput(highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize);
+ // Times test_impl against the C reference; havSub selects a non-zero
+ // sub-pixel offset.
+ void RunSpeedTest(highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize,
+ int havSub);
+};
+
+AV1HighbdUpsampledPredTest::~AV1HighbdUpsampledPredTest() = default;
+
+// Fills pred_ and the start of ref_buffer_ with random samples masked to the
+// bit depth from test parameter 2, then, for subpel_search values 1 and 2 and
+// every sub-pixel offset pair in [0, 8) x [0, 8), runs
+// aom_highbd_upsampled_pred_c into comp_pred1_ and test_impl into comp_pred2_
+// and asserts the outputs match.
+void AV1HighbdUpsampledPredTest::RunCheckOutput(
+ highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize) {
+ int bd_ = GET_PARAM(2);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ // NOTE(review): the 8-bit AV1UpsampledPredTest iterates over
+ // [USE_4_TAPS, USE_8_TAPS] by name, while this loop uses the literals 1..2.
+ // Confirm that these literals select the intended filters.
+ for (int subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ // loop through subx and suby
+ for (int sub = 0; sub < 8 * 8; ++sub) {
+ int subx = sub & 0x7;
+ int suby = (sub >> 3);
+
+ aom_highbd_upsampled_pred_c(nullptr, nullptr, 0, 0, nullptr,
+ CONVERT_TO_BYTEPTR(comp_pred1_), w, h, subx,
+ suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE,
+ bd_, subpel_search);
+
+ test_impl(nullptr, nullptr, 0, 0, nullptr,
+ CONVERT_TO_BYTEPTR(comp_pred2_), w, h, subx, suby,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search);
+
+ ASSERT_EQ(CheckResult(w, h), true)
+ << "sub (" << subx << "," << suby << ")";
+ }
+ }
+}
+
+// Times aom_highbd_upsampled_pred_c and test_impl on block size bsize and
+// prints the average time per call for each, followed by their ratio
+// (reference / test). havSub != 0 uses the subpel offset (3, 4), otherwise
+// (0, 0).
+void AV1HighbdUpsampledPredTest::RunSpeedTest(
+ highbd_upsampled_pred_func test_impl, BLOCK_SIZE bsize, int havSub) {
+ int bd_ = GET_PARAM(2);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int subx = havSub ? 3 : 0;
+ const int suby = havSub ? 4 : 0;
+
+ // Random inputs masked to the tested bit depth.
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ // Fewer iterations for larger blocks.
+ const int num_loops = 1000000000 / (w + h);
+ // Index 0 is the C reference, index 1 the implementation under test.
+ highbd_upsampled_pred_func funcs[2] = { &aom_highbd_upsampled_pred_c,
+ test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ highbd_upsampled_pred_func func = funcs[i];
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
+ for (int j = 0; j < num_loops; ++j) {
+ func(nullptr, nullptr, 0, 0, nullptr, CONVERT_TO_BYTEPTR(comp_pred1_), w,
+ h, subx, suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_,
+ subpel_search);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ // Microseconds for the whole loop -> nanoseconds per call.
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// Every instantiation of this suite below is guarded by a HAVE_* macro, so a
+// build may end up with none; tell GoogleTest that this is acceptable.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdUpsampledPredTest);
+
+// Parameter 0 is the function under test, parameter 1 the block size.
+TEST_P(AV1HighbdUpsampledPredTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+// Benchmark only; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(AV1HighbdUpsampledPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+// Each instantiation pairs one SIMD implementation with every block size in
+// kValidBlockSize and with the bit depths 8, 10 and 12 (Range(8, 13, 2)).
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1HighbdUpsampledPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_upsampled_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdUpsampledPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_upsampled_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+// Function-pointer type used for both aom_highbd_comp_avg_pred_c and the
+// implementation under test.
+typedef void (*highbd_comp_avg_pred_func)(uint8_t *comp_pred,
+ const uint8_t *pred, int width,
+ int height, const uint8_t *ref,
+ int ref_stride);
+
+// Test parameter: (function under test, block size, bit depth).
+typedef std::tuple<highbd_comp_avg_pred_func, BLOCK_SIZE, int>
+ HighbdCompAvgPredParam;
+
+// Parameterized fixture that compares a highbd_comp_avg_pred_func
+// implementation with aom_highbd_comp_avg_pred_c.
+//
+// The parameter tuple is (function under test, block size, bit depth).
+// SetUp() allocates the four sample buffers and fills pred_ and ref_ with
+// random values masked to the bit depth; the destructor frees them.
+class AV1HighbdCompAvgPredTest
+    : public ::testing::TestWithParam<HighbdCompAvgPredParam> {
+ public:
+  ~AV1HighbdCompAvgPredTest() override;
+  void SetUp() override;
+
+ protected:
+  // Runs the reference and test_impl once and asserts identical output.
+  void RunCheckOutput(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+  // Times the reference and test_impl and prints the result.
+  void RunSpeedTest(highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize);
+
+  // Returns true when the first width x height samples of comp_pred1_ and
+  // comp_pred2_ (both stored with a row stride of |width|) are identical.
+  // On the first difference it prints the position and both values and
+  // returns false.
+  bool CheckResult(int width, int height) const {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, x, y);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  // The buffers start out null. SetUp() returns early when one of its
+  // ASSERT_NE checks fails, and the destructor then still calls aom_free() on
+  // all four pointers; without these initializers it would free
+  // indeterminate values.
+  uint16_t *comp_pred1_ = nullptr;  // output of the C reference
+  uint16_t *comp_pred2_ = nullptr;  // output of the implementation under test
+  uint16_t *pred_ = nullptr;        // first input (prediction)
+  uint16_t *ref_ = nullptr;         // second input (reference)
+};
+
+// Releases the four sample buffers allocated by SetUp().
+AV1HighbdCompAvgPredTest::~AV1HighbdCompAvgPredTest() {
+ aom_free(comp_pred1_);
+ aom_free(comp_pred2_);
+ aom_free(pred_);
+ aom_free(ref_);
+}
+
+// Seeds the random generator deterministically, allocates the four
+// 16-byte-aligned MAX_SB_SQUARE-sample buffers and fills pred_ and then ref_
+// with random samples masked to the bit depth given by test parameter 2.
+void AV1HighbdCompAvgPredTest::SetUp() {
+  const int bit_depth = GET_PARAM(2);
+  const int pixel_mask = (1 << bit_depth) - 1;
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+
+  // Allocate one buffer at a time and stop at the first failure.
+  comp_pred1_ = static_cast<uint16_t *>(
+      aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_)));
+  ASSERT_NE(comp_pred1_, nullptr);
+  comp_pred2_ = static_cast<uint16_t *>(
+      aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_)));
+  ASSERT_NE(comp_pred2_, nullptr);
+  pred_ =
+      static_cast<uint16_t *>(aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_)));
+  ASSERT_NE(pred_, nullptr);
+  ref_ =
+      static_cast<uint16_t *>(aom_memalign(16, MAX_SB_SQUARE * sizeof(*ref_)));
+  ASSERT_NE(ref_, nullptr);
+
+  // pred_ is filled completely before ref_ so the random sequence is consumed
+  // in a fixed order.
+  auto fill_random = [this, pixel_mask](uint16_t *samples) {
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      samples[i] = rnd_.Rand16() & pixel_mask;
+    }
+  };
+  fill_random(pred_);
+  fill_random(ref_);
+}
+
+// Runs aom_highbd_comp_avg_pred_c and test_impl on the same inputs for block
+// size |bsize| and asserts that both produce identical output.
+void AV1HighbdCompAvgPredTest::RunCheckOutput(
+    highbd_comp_avg_pred_func test_impl, BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  // Both calls read the same inputs; ref_ is read with stride MAX_SB_SIZE.
+  uint8_t *const pred8 = CONVERT_TO_BYTEPTR(pred_);
+  uint8_t *const ref8 = CONVERT_TO_BYTEPTR(ref_);
+
+  aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(comp_pred1_), pred8, w, h, ref8,
+                             MAX_SB_SIZE);
+  test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), pred8, w, h, ref8, MAX_SB_SIZE);
+
+  ASSERT_EQ(CheckResult(w, h), true);
+}
+
+// Times aom_highbd_comp_avg_pred_c and test_impl on block size |bsize| and
+// prints the average time per call in nanoseconds for each, followed by the
+// ratio reference / test.
+void AV1HighbdCompAvgPredTest::RunSpeedTest(highbd_comp_avg_pred_func test_impl,
+                                            BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  // Fewer iterations for larger blocks.
+  const int num_loops = 1000000000 / (w + h);
+
+  // Index 0 is the C reference, index 1 the implementation under test.
+  highbd_comp_avg_pred_func functions[2] = { aom_highbd_comp_avg_pred_c,
+                                             test_impl };
+  double elapsed_time[2] = { 0.0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    highbd_comp_avg_pred_func func = functions[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+           CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    // The timer reports microseconds for the whole loop. Convert to
+    // nanoseconds and divide by the iteration count so the printed "ns"
+    // figure is per call; num_loops differs between block sizes, so an
+    // unnormalized total would not be comparable across them.
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("HighbdCompAvg %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+// The only instantiation visible here is guarded by HAVE_NEON, so the suite
+// may be left uninstantiated; tell GoogleTest that this is acceptable.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdCompAvgPredTest);
+
+// Parameter 0 is the function under test, parameter 1 the block size.
+TEST_P(AV1HighbdCompAvgPredTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+// Benchmark only; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(AV1HighbdCompAvgPredTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_NEON
+// NEON implementation x every block size in kValidBlockSize x bit depths
+// 8, 10 and 12 (Range(8, 13, 2)).
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdCompAvgPredTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_avg_pred_neon),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc
new file mode 100644
index 0000000000..c97f814057
--- /dev/null
+++ b/third_party/aom/test/convolve_test.cc
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/filter.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+// Largest block width/height handled by the tests; also sizes the reference
+// filters' scratch buffers.
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
+
+// All-zero 8-tap kernel, passed for the direction a 1-D convolve function is
+// not filtering in.
+static const int16_t kInvalidFilter[8] = {};
+// Number of filter banks the tests iterate over.
+static const int kNumFilterBanks = SWITCHABLE_FILTERS;
+// Number of kernels the tests index within each bank.
+static const int kNumFilters = 16;
+
+// Signature of a 1-D convolve function under test (horizontal or vertical).
+typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h);
+
+// Bundles the horizontal (h8_) and vertical (v8_) functions of one
+// implementation together with the bit depth they are tested at.
+struct ConvolveFunctions {
+ ConvolveFunctions(ConvolveFunc h8, ConvolveFunc v8, int bd)
+ : h8_(h8), v8_(v8), use_highbd_(bd) {}
+
+ ConvolveFunc h8_;
+ ConvolveFunc v8_;
+ int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+// Test parameter: (block width, block height, functions under test).
+typedef std::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+
+// Expands to a comma-separated list of (width, height, &convolve_fn) tuples
+// for the block sizes from 4x4 up to 64x64 listed below. Relies on an
+// unqualified make_tuple being visible at the point of use.
+#define ALL_SIZES_64(convolve_fn) \
+ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \
+ make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn), \
+ make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn), \
+ make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \
+ make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
+ make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
+ make_tuple(64, 64, &convolve_fn)
+
+// ALL_SIZES_64 plus the three sizes that have a 128-sample dimension.
+#define ALL_SIZES(convolve_fn) \
+ make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+ make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
+// The reference filters add AV1_FILTER_WEIGHT >> 1 for rounding and then
+// shift right by AV1_FILTER_SHIFT; 128 == 1 << 7, so the two must stay in
+// sync.
+#define AV1_FILTER_WEIGHT 128
+#define AV1_FILTER_SHIFT 7
+// Clamps x to the 8-bit sample range [0, 255].
+uint8_t clip_pixel(int x) {
+  if (x < 0) return 0;
+  if (x > 255) return 255;
+  return x;
+}
+
+// Reference separable 8-tap filter for 8-bit samples.
+//
+// Filters horizontally with HFilter into a transposed scratch buffer, then
+// vertically with VFilter into dst_ptr. Each pass rounds, shifts by
+// AV1_FILTER_SHIFT and clips to [0, 255]. src_ptr points at the top-left
+// sample of the block; samples 3 rows/columns before it and 4 after it are
+// read as filter support.
+void filter_block2d_8_c(const uint8_t *src_ptr, unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height) {
+ // Between passes, we use an intermediate buffer whose height is extended to
+ // have enough horizontally filtered values as input for the vertical pass.
+ // This buffer is allocated to be big enough for the largest block type we
+ // support.
+ const int kInterp_Extend = 4;
+ const unsigned int intermediate_height =
+ (kInterp_Extend - 1) + output_height + kInterp_Extend;
+ unsigned int i, j;
+
+ assert(intermediate_height > 7);
+
+ // The intermediate buffer stores intermediate_height * output_width samples,
+ // where intermediate_height = (kInterp_Extend - 1) + output_height
+ // + kInterp_Extend
+ // = output_height + 7.
+ // It is sized for (kMaxDimension + 8) rows of kMaxDimension samples, which
+ // covers every block up to kMaxDimension x kMaxDimension.
+ //
+ uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
+ // Added after each row: undoes the output_width column steps taken during
+ // the row and advances by one element.
+ const int intermediate_next_stride =
+ 1 - static_cast<int>(intermediate_height * output_width);
+
+ // Horizontal pass (src -> transposed intermediate).
+ uint8_t *output_ptr = intermediate_buffer;
+ const int src_next_row_stride = src_stride - output_width;
+ // Start 3 rows above and 3 columns left of the block so the 8-tap window
+ // src_ptr[0..7] covers the support of the first output sample.
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+ (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+ (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+ (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ *output_ptr = clip_pixel(temp >> AV1_FILTER_SHIFT);
+ ++src_ptr;
+ // Transposed store: consecutive source columns are intermediate_height
+ // elements apart.
+ output_ptr += intermediate_height;
+ }
+ src_ptr += src_next_row_stride;
+ output_ptr += intermediate_next_stride;
+ }
+
+ // Vertical pass (transposed intermediate -> dst).
+ src_ptr = intermediate_buffer;
+ const int dst_next_row_stride = dst_stride - output_width;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ // Because of the transposed layout, the 8 vertically adjacent samples
+ // are contiguous in memory here.
+ const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) +
+ (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) +
+ (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) +
+ (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ *dst_ptr++ = clip_pixel(temp >> AV1_FILTER_SHIFT);
+ src_ptr += intermediate_height;
+ }
+ src_ptr += intermediate_next_stride;
+ dst_ptr += dst_next_row_stride;
+ }
+}
+
+// Replaces each sample of the output_width x output_height block at
+// output_ptr with the rounded average of itself and the co-located sample of
+// src: (out + src + 1) >> 1.
+void block2d_average_c(uint8_t *src, unsigned int src_stride,
+                       uint8_t *output_ptr, unsigned int output_stride,
+                       unsigned int output_width, unsigned int output_height) {
+  for (unsigned int row = 0; row < output_height; ++row) {
+    const uint8_t *const src_row = src + row * src_stride;
+    for (unsigned int col = 0; col < output_width; ++col) {
+      const int sum = output_ptr[col] + src_row[col] + 1;
+      output_ptr[col] = sum >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+// Reference "filter then average": runs filter_block2d_8_c into a temporary
+// kMaxDimension-stride buffer and averages that result into dst_ptr with
+// block2d_average_c.
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+ const unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width,
+ unsigned int output_height) {
+ uint8_t tmp[kMaxDimension * kMaxDimension];
+
+ // tmp only has room for a kMaxDimension x kMaxDimension block.
+ assert(output_width <= kMaxDimension);
+ assert(output_height <= kMaxDimension);
+ filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
+ output_width, output_height);
+ block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride, output_width,
+ output_height);
+}
+
+// Reference separable 8-tap filter for 16-bit (high bitdepth) samples.
+//
+// Same two-pass structure as the 8-bit reference: a horizontal pass with
+// HFilter into a transposed scratch buffer, then a vertical pass with VFilter
+// into dst_ptr. Each pass rounds, shifts by AV1_FILTER_SHIFT and clips with
+// clip_pixel_highbd() for bit depth bd.
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+ const unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint16_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width,
+ unsigned int output_height, int bd) {
+ // Between passes, we use an intermediate buffer whose height is extended to
+ // have enough horizontally filtered values as input for the vertical pass.
+ // This buffer is allocated to be big enough for the largest block type we
+ // support.
+ const int kInterp_Extend = 4;
+ const unsigned int intermediate_height =
+ (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+ /* The intermediate buffer stores intermediate_height * output_width samples,
+ * where intermediate_height = (kInterp_Extend - 1) + output_height
+ * + kInterp_Extend
+ * = output_height + 7.
+ * It is sized for (kMaxDimension + 8) rows of kMaxDimension samples, which
+ * covers every block up to kMaxDimension x kMaxDimension.
+ */
+ uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension] = { 0 };
+ // Added after each row: undoes the output_width column steps taken during
+ // the row and advances by one element.
+ const int intermediate_next_stride =
+ 1 - static_cast<int>(intermediate_height * output_width);
+
+ // Horizontal pass (src -> transposed intermediate).
+ {
+ uint16_t *output_ptr = intermediate_buffer;
+ const int src_next_row_stride = src_stride - output_width;
+ unsigned int i, j;
+ // Start 3 rows above and 3 columns left of the block so the 8-tap window
+ // src_ptr[0..7] covers the support of the first output sample.
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+ (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+ (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+ (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize and clip to the range allowed by bd...
+ *output_ptr = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+ ++src_ptr;
+ // Transposed store: consecutive source columns are intermediate_height
+ // elements apart.
+ output_ptr += intermediate_height;
+ }
+ src_ptr += src_next_row_stride;
+ output_ptr += intermediate_next_stride;
+ }
+ }
+
+ // Vertical pass (transposed intermediate -> dst).
+ {
+ const uint16_t *interm_ptr = intermediate_buffer;
+ const int dst_next_row_stride = dst_stride - output_width;
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ // Because of the transposed layout, the 8 vertically adjacent samples
+ // are contiguous in memory here.
+ const int temp =
+ (interm_ptr[0] * VFilter[0]) + (interm_ptr[1] * VFilter[1]) +
+ (interm_ptr[2] * VFilter[2]) + (interm_ptr[3] * VFilter[3]) +
+ (interm_ptr[4] * VFilter[4]) + (interm_ptr[5] * VFilter[5]) +
+ (interm_ptr[6] * VFilter[6]) + (interm_ptr[7] * VFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize and clip to the range allowed by bd...
+ *dst_ptr++ = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+ interm_ptr += intermediate_height;
+ }
+ interm_ptr += intermediate_next_stride;
+ dst_ptr += dst_next_row_stride;
+ }
+ }
+}
+
+// 16-bit counterpart of the 8-bit block average: replaces each sample of the
+// output_width x output_height block at output_ptr with
+// (out + src + 1) >> 1, using the co-located sample of src.
+void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride,
+                              uint16_t *output_ptr, unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height) {
+  for (unsigned int row = 0; row < output_height; ++row) {
+    const uint16_t *const src_row = src + row * src_stride;
+    for (unsigned int col = 0; col < output_width; ++col) {
+      const int sum = output_ptr[col] + src_row[col] + 1;
+      output_ptr[col] = sum >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+// High-bitdepth reference "filter then average": runs
+// highbd_filter_block2d_8_c into a temporary kMaxDimension-stride buffer and
+// averages that result into dst_ptr with highbd_block2d_average_c.
+void highbd_filter_average_block2d_8_c(
+ const uint16_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+ const int16_t *VFilter, uint16_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height, int bd) {
+ uint16_t tmp[kMaxDimension * kMaxDimension];
+
+ // tmp only has room for a kMaxDimension x kMaxDimension block.
+ assert(output_width <= kMaxDimension);
+ assert(output_height <= kMaxDimension);
+ highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+ kMaxDimension, output_width, output_height, bd);
+ highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+ output_width, output_height);
+}
+
+// Shared fixture for the low- and high-bitdepth 1-D convolve tests.
+//
+// The parameter is (block width, block height, functions under test). The
+// block under test sits inside a larger kOuterBlockSize x kOuterBlockSize
+// buffer whose remaining samples act as guard values. 8-bit and 16-bit
+// versions of every buffer are allocated once per suite; UUT_->use_highbd_
+// selects which set a test actually uses.
+class ConvolveTestBase : public ::testing::TestWithParam<ConvolveParam> {
+ public:
+  // Allocates the buffers shared by all tests of the suite.
+  static void SetUpTestSuite() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    // The allocation is checked before the +1 offset is applied: a failed
+    // allocation plus one would compare unequal to nullptr and slip through.
+    uint8_t *const input_base = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kInputBufferSize + 1));
+    ASSERT_NE(input_base, nullptr);
+    input_ = input_base + 1;
+    ref8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputStride * kMaxDimension));
+    ASSERT_NE(ref8_, nullptr);
+    output_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    ASSERT_NE(output_, nullptr);
+    output_ref_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    ASSERT_NE(output_ref_, nullptr);
+    uint16_t *const input16_base = reinterpret_cast<uint16_t *>(aom_memalign(
+        kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t)));
+    ASSERT_NE(input16_base, nullptr);
+    input16_ = input16_base + 1;
+    ref16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+        kDataAlignment, kOutputStride * kMaxDimension * sizeof(uint16_t)));
+    ASSERT_NE(ref16_, nullptr);
+    output16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    ASSERT_NE(output16_, nullptr);
+    output16_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    ASSERT_NE(output16_ref_, nullptr);
+  }
+
+  // Frees the suite-wide buffers and resets the pointers.
+  static void TearDownTestSuite() {
+    // input_/input16_ point one element past their allocation; only undo the
+    // offset when the allocation actually happened.
+    if (input_ != nullptr) aom_free(input_ - 1);
+    input_ = nullptr;
+    aom_free(ref8_);
+    ref8_ = nullptr;
+    aom_free(output_);
+    output_ = nullptr;
+    aom_free(output_ref_);
+    output_ref_ = nullptr;
+    if (input16_ != nullptr) aom_free(input16_ - 1);
+    input16_ = nullptr;
+    aom_free(ref16_);
+    ref16_ = nullptr;
+    aom_free(output16_);
+    output16_ = nullptr;
+    aom_free(output16_ref_);
+    output16_ref_ = nullptr;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 4 * kMaxDimension;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  // Column of the block's left edge: the centered position rounded up to a
+  // multiple of kDataAlignment.
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  // Row of the block's top edge (vertically centered).
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  // True when linear index i of the outer buffer lies outside the block
+  // under test.
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  // Per-test initialisation: picks up the functions under test, derives the
+  // sample mask, writes the guard pattern into both output buffers and fills
+  // both input buffers.
+  void SetUp() override {
+    UUT_ = GET_PARAM(2);
+    if (UUT_->use_highbd_ != 0)
+      mask_ = (1 << UUT_->use_highbd_) - 1;
+    else
+      mask_ = 255;
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        output_[i] = 255;
+        output16_[i] = mask_;
+      } else {
+        output_[i] = 0;
+        output16_[i] = 0;
+      }
+    }
+
+    // Odd indices get the maximum sample value, even indices random values.
+    ::libaom_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i) {
+      if (i & 1) {
+        input_[i] = 255;
+        input16_[i] = mask_;
+      } else {
+        input_[i] = prng.Rand8Extremes();
+        input16_[i] = prng.Rand16() & mask_;
+      }
+    }
+  }
+
+  // Fills both input buffers with |value|.
+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+    aom_memset16(input16_, value, kInputBufferSize);
+  }
+
+  // Snapshots both output buffers into their *_ref_ counterparts.
+  void CopyOutputToRef() {
+    memcpy(output_ref_, output_, kOutputBufferSize);
+    // Copy 16-bit pixels values. The effective number of bytes is double.
+    memcpy(output16_ref_, output16_, sizeof(output16_[0]) * kOutputBufferSize);
+  }
+
+  // Expects every guard sample to still hold the value written by SetUp().
+  // Both buffers are checked: high-bitdepth runs write to output16_, so
+  // inspecting only the 8-bit buffer would never catch an out-of-block write
+  // from a high-bitdepth implementation.
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+        EXPECT_EQ(mask_, output16_[i]);
+      }
+    }
+  }
+
+  // Pointer to the top-left sample of the block in the active input buffer.
+  // For high bitdepth the 16-bit buffer is returned in CONVERT_TO_BYTEPTR
+  // form.
+  uint8_t *input() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(input16_) + offset;
+    }
+  }
+
+  // Same as input(), for the output buffer.
+  uint8_t *output() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_) + offset;
+    }
+  }
+
+  // Same as input(), for the output snapshot buffer.
+  uint8_t *output_ref() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ref_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+    }
+  }
+
+  // Reads sample |index| from |list|, which is either a plain 8-bit pointer
+  // or a CONVERT_TO_BYTEPTR-style 16-bit pointer depending on the bit depth.
+  uint16_t lookup(uint8_t *list, int index) const {
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CONVERT_TO_SHORTPTR(list)[index];
+    }
+  }
+
+  // Writes |val| to sample |index| of |list| (see lookup()).
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t)val;
+    } else {
+      CONVERT_TO_SHORTPTR(list)[index] = val;
+    }
+  }
+
+  // Dispatches to the 8-bit or high-bitdepth reference filter-and-average.
+  void wrapper_filter_average_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                                 dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_average_block2d_8_c(
+          CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter,
+          CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height,
+          UUT_->use_highbd_);
+    }
+  }
+
+  // Dispatches to the 8-bit or high-bitdepth reference filter.
+  void wrapper_filter_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                         dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr),
+                                dst_stride, output_width, output_height,
+                                UUT_->use_highbd_);
+    }
+  }
+
+  // For every tap setting, filter bank and kernel, checks that the
+  // horizontal-only and vertical-only functions under test match the
+  // reference filter and leave the guard samples untouched.
+  void MatchesReferenceSubpixelFilter() {
+    uint8_t *const in = input();
+    uint8_t *const out = output();
+    uint8_t *ref;
+    if (UUT_->use_highbd_ == 0) {
+      ref = ref8_;
+    } else {
+      ref = CONVERT_TO_BYTEPTR(ref16_);
+    }
+    int subpel_search;
+    for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+         ++subpel_search) {
+      for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+        const InterpFilter filter = (InterpFilter)filter_bank;
+        const InterpKernel *filters =
+            (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                               subpel_search);
+        for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+          for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+            // Only 1-D filtering is exercised: exactly one of filter_x and
+            // filter_y must be non-zero. Skip everything else before paying
+            // for the reference computation.
+            if ((filter_x != 0) == (filter_y != 0)) continue;
+
+            wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                       filters[filter_y], ref, kOutputStride,
+                                       Width(), Height());
+
+            if (filter_y) {
+              API_REGISTER_STATE_CHECK(UUT_->v8_(
+                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+                  filters[filter_y], 16, Width(), Height()));
+            } else {
+              API_REGISTER_STATE_CHECK(UUT_->h8_(
+                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                  kInvalidFilter, 16, Width(), Height()));
+            }
+
+            CheckGuardBlocks();
+
+            for (int y = 0; y < Height(); ++y)
+              for (int x = 0; x < Width(); ++x)
+                ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                          lookup(out, y * kOutputStride + x))
+                    << "mismatch at (" << x << "," << y << "), "
+                    << "filters (" << filter_bank << "," << filter_x << ","
+                    << filter_y << ")";
+          }
+        }
+      }
+    }
+  }
+
+  // Repeats the reference comparison with inputs made of extreme samples
+  // (0 or mask_) arranged in bit patterns derived from a running seed, once
+  // per axis.
+  void FilterExtremes() {
+    uint8_t *const in = input();
+    uint8_t *const out = output();
+    uint8_t *ref;
+    if (UUT_->use_highbd_ == 0) {
+      ref = ref8_;
+    } else {
+      ref = CONVERT_TO_BYTEPTR(ref16_);
+    }
+
+    // Populate ref and out with some random data
+    ::libaom_test::ACMRandom prng;
+    for (int y = 0; y < Height(); ++y) {
+      for (int x = 0; x < Width(); ++x) {
+        uint16_t r;
+        if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+          r = prng.Rand8Extremes();
+        } else {
+          r = prng.Rand16() & mask_;
+        }
+        assign_val(out, y * kOutputStride + x, r);
+        assign_val(ref, y * kOutputStride + x, r);
+      }
+    }
+
+    for (int axis = 0; axis < 2; axis++) {
+      int seed_val = 0;
+      while (seed_val < 256) {
+        // Write an 8x8 pattern of 0 / mask_ samples into the input. Bit x (or
+        // bit y when axis != 0) of seed_val selects the value; seed_val is
+        // advanced inside the loops. kOutputStride equals kInputStride, so
+        // the index addresses the input buffer correctly.
+        for (int y = 0; y < 8; ++y) {
+          for (int x = 0; x < 8; ++x) {
+            assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                       ((seed_val >> (axis ? y : x)) & 1) * mask_);
+            if (axis) seed_val++;
+          }
+          if (axis)
+            seed_val -= 8;
+          else
+            seed_val++;
+        }
+        if (axis) seed_val += 8;
+        int subpel_search;
+        for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+             ++subpel_search) {
+          for (int filter_bank = 0; filter_bank < kNumFilterBanks;
+               ++filter_bank) {
+            const InterpFilter filter = (InterpFilter)filter_bank;
+            const InterpKernel *filters =
+                (const InterpKernel *)av1_get_interp_filter_kernel(
+                    filter, subpel_search);
+            for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+              for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+                // Only 1-D filtering is exercised; skip the rest before
+                // computing the reference.
+                if ((filter_x != 0) == (filter_y != 0)) continue;
+
+                wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                           filters[filter_y], ref,
+                                           kOutputStride, Width(), Height());
+                if (filter_y) {
+                  API_REGISTER_STATE_CHECK(UUT_->v8_(
+                      in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+                      filters[filter_y], 16, Width(), Height()));
+                } else {
+                  API_REGISTER_STATE_CHECK(UUT_->h8_(
+                      in, kInputStride, out, kOutputStride, filters[filter_x],
+                      16, kInvalidFilter, 16, Width(), Height()));
+                }
+
+                for (int y = 0; y < Height(); ++y)
+                  for (int x = 0; x < Width(); ++x)
+                    ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                              lookup(out, y * kOutputStride + x))
+                        << "mismatch at (" << x << "," << y << "), "
+                        << "filters (" << filter_bank << "," << filter_x << ","
+                        << filter_y << ")";
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Benchmark: runs every 1-D kernel of every filter bank (8-tap setting)
+  // 1000 times through the functions under test and prints the elapsed
+  // milliseconds.
+  void SpeedTest() {
+    uint8_t *const in = input();
+    uint8_t *const out = output();
+    uint8_t *ref;
+    if (UUT_->use_highbd_ == 0) {
+      ref = ref8_;
+    } else {
+      ref = CONVERT_TO_BYTEPTR(ref16_);
+    }
+
+    // Populate ref and out with some random data
+    ::libaom_test::ACMRandom prng;
+    for (int y = 0; y < Height(); ++y) {
+      for (int x = 0; x < Width(); ++x) {
+        uint16_t r;
+        if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+          r = prng.Rand8Extremes();
+        } else {
+          r = prng.Rand16() & mask_;
+        }
+        assign_val(out, y * kOutputStride + x, r);
+        assign_val(ref, y * kOutputStride + x, r);
+      }
+    }
+
+    InterpFilter filter = (InterpFilter)1;
+    const InterpKernel *filters =
+        (const InterpKernel *)av1_get_interp_filter_kernel(filter, USE_8_TAPS);
+    wrapper_filter_average_block2d_8_c(in, kInputStride, filters[1], filters[1],
+                                       out, kOutputStride, Width(), Height());
+
+    aom_usec_timer timer;
+    int tests_num = 1000;
+
+    aom_usec_timer_start(&timer);
+    while (tests_num > 0) {
+      for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+        filter = (InterpFilter)filter_bank;
+        filters = (const InterpKernel *)av1_get_interp_filter_kernel(
+            filter, USE_8_TAPS);
+        for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+          for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+            if (filter_x && filter_y) continue;
+            if (filter_y)
+              API_REGISTER_STATE_CHECK(UUT_->v8_(
+                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+                  filters[filter_y], 16, Width(), Height()));
+            else if (filter_x)
+              API_REGISTER_STATE_CHECK(UUT_->h8_(
+                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                  kInvalidFilter, 16, Width(), Height()));
+          }
+        }
+      }
+      tests_num--;
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    printf("%dx%d (bitdepth %d) time: %5d ms\n", Width(), Height(),
+           UUT_->use_highbd_, elapsed_time);
+  }
+
+  const ConvolveFunctions *UUT_;  // functions under test (test parameter 2)
+  // Suite-wide buffers; input_ and input16_ are offset by one element from
+  // their allocation to make them unaligned.
+  static uint8_t *input_;
+  static uint8_t *ref8_;
+  static uint8_t *output_;
+  static uint8_t *output_ref_;
+  static uint16_t *input16_;
+  static uint16_t *ref16_;
+  static uint16_t *output16_;
+  static uint16_t *output16_ref_;
+  int mask_;  // largest sample value: 255, or (1 << bit depth) - 1
+};
+
+// Out-of-class definitions of the suite-wide buffer pointers. They are
+// assigned in SetUpTestSuite() and reset to nullptr in TearDownTestSuite().
+uint8_t *ConvolveTestBase::input_ = nullptr;
+uint8_t *ConvolveTestBase::ref8_ = nullptr;
+uint8_t *ConvolveTestBase::output_ = nullptr;
+uint8_t *ConvolveTestBase::output_ref_ = nullptr;
+uint16_t *ConvolveTestBase::input16_ = nullptr;
+uint16_t *ConvolveTestBase::ref16_ = nullptr;
+uint16_t *ConvolveTestBase::output16_ = nullptr;
+uint16_t *ConvolveTestBase::output16_ref_ = nullptr;
+
+// Suite name used for the 8-bit instantiations of the shared fixture.
+using LowbdConvolveTest = ConvolveTestBase;
+
+// Checks the guard pattern directly after SetUp(), before any convolve call.
+TEST_P(LowbdConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
+
+// For every tap setting, filter bank and kernel, checks that the sums of
+// adjacent tap pairs, and the running sums of those pairs in the order
+// p0, p3, p1, p2, never exceed 128 and that all eight taps add up to exactly
+// 128.
+void FiltersWontSaturateWhenAddedPairwise() {
+  for (int subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             subpel_search);
+      for (int i = 0; i < kNumFilters; i++) {
+        const int16_t *const taps = filters[i];
+        // Sums of adjacent tap pairs.
+        const int p0 = taps[0] + taps[1];
+        const int p1 = taps[2] + taps[3];
+        const int p2 = taps[4] + taps[5];
+        const int p3 = taps[6] + taps[7];
+
+        EXPECT_LE(p0, 128);
+        EXPECT_LE(p1, 128);
+        EXPECT_LE(p2, 128);
+        EXPECT_LE(p3, 128);
+        EXPECT_LE(p0 + p3, 128);
+        EXPECT_LE(p0 + p3 + p1, 128);
+        EXPECT_LE(p0 + p3 + p1 + p2, 128);
+        EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+      }
+    }
+  }
+}
+
+// Plain TEST (not TEST_P): the check only inspects the filter tables and
+// needs neither the fixture nor a parameter.
+TEST(LowbdConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+ FiltersWontSaturateWhenAddedPairwise();
+}
+
+TEST_P(LowbdConvolveTest, MatchesReferenceSubpixelFilter) {
+ MatchesReferenceSubpixelFilter();
+}
+
+TEST_P(LowbdConvolveTest, FilterExtremes) { FilterExtremes(); }
+
+// Benchmark only; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(LowbdConvolveTest, DISABLED_Speed) { SpeedTest(); }
+
+// The ALL_SIZES macros call make_tuple unqualified.
+using std::make_tuple;
+
+// WRAP macro is only used for high bitdepth build.
+// WRAP(func, bd) defines wrap_<func>_<bd>, a function with the ConvolveFunc
+// signature that forwards to aom_highbd_<func> and appends the fixed bit
+// depth bd as the last argument.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define WRAP(func, bd) \
+ static void wrap_##func##_##bd( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \
+ const int16_t *filter_y, int filter_y_stride, int w, int h) { \
+ aom_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \
+ filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+ }
+// One horizontal and one vertical wrapper per implementation and per bit
+// depth (8, 10, 12).
+#if HAVE_SSE2 && AOM_ARCH_X86_64
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+#endif // HAVE_SSE2 && AOM_ARCH_X86_64
+
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+
+#if HAVE_AVX2
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+WRAP(convolve8_horiz_neon, 8)
+WRAP(convolve8_vert_neon, 8)
+
+WRAP(convolve8_horiz_neon, 10)
+WRAP(convolve8_vert_neon, 10)
+
+WRAP(convolve8_horiz_neon, 12)
+WRAP(convolve8_vert_neon, 12)
+#endif // HAVE_NEON
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#undef WRAP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// Suite name used for the high-bitdepth instantiations of the shared fixture.
+using HighbdConvolveTest = ConvolveTestBase;
+
+// Checks the guard pattern directly after SetUp(), before any convolve call.
+TEST_P(HighbdConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
+
+// Plain TEST (not TEST_P): the check only inspects the filter tables.
+TEST(HighbdConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+ FiltersWontSaturateWhenAddedPairwise();
+}
+
+TEST_P(HighbdConvolveTest, MatchesReferenceSubpixelFilter) {
+ MatchesReferenceSubpixelFilter();
+}
+
+TEST_P(HighbdConvolveTest, FilterExtremes) { FilterExtremes(); }
+
+// Benchmark only; the DISABLED_ prefix keeps it out of normal test runs.
+TEST_P(HighbdConvolveTest, DISABLED_Speed) { SpeedTest(); }
+
+// C implementations wrapped for bit depths 8, 10 and 12; the last constructor
+// argument is the bit depth stored in use_highbd_.
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve8_horiz_c_8,
+ wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve8_horiz_c_10,
+ wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve8_horiz_c_12,
+ wrap_convolve8_vert_c_12, 12);
+// Every block size from ALL_SIZES at each of the three bit depths.
+const ConvolveParam kArrayHighbdConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
+ ALL_SIZES(wrap_convolve10_c),
+ ALL_SIZES(wrap_convolve12_c) };
+
+INSTANTIATE_TEST_SUITE_P(C, HighbdConvolveTest,
+ ::testing::ValuesIn(kArrayHighbdConvolve_c));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+const ConvolveFunctions convolve8_c(aom_convolve8_horiz_c, aom_convolve8_vert_c,
+ 0);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
+
+INSTANTIATE_TEST_SUITE_P(C, LowbdConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_c));
+
+#if HAVE_SSE2 && AOM_ARCH_X86_64
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve8_horiz_sse2_8,
+ wrap_convolve8_vert_sse2_8, 8);
+const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve8_horiz_sse2_10,
+ wrap_convolve8_vert_sse2_10, 10);
+const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve8_horiz_sse2_12,
+ wrap_convolve8_vert_sse2_12, 12);
+const ConvolveParam kArrayHighbdConvolve_sse2[] = {
+ ALL_SIZES(wrap_convolve8_sse2), ALL_SIZES(wrap_convolve10_sse2),
+ ALL_SIZES(wrap_convolve12_sse2)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, HighbdConvolveTest,
+ ::testing::ValuesIn(kArrayHighbdConvolve_sse2));
+#endif
+const ConvolveFunctions convolve8_sse2(aom_convolve8_horiz_sse2,
+ aom_convolve8_vert_sse2, 0);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowbdConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sse2));
+#endif
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(aom_convolve8_horiz_ssse3,
+ aom_convolve8_vert_ssse3, 0);
+
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve8_ssse3));
+#endif
+
+#if HAVE_AVX2
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve8_horiz_avx2_8,
+ wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve8_horiz_avx2_10,
+ wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve8_horiz_avx2_12,
+ wrap_convolve8_vert_avx2_12, 12);
+const ConvolveParam kArray_HighbdConvolve8_avx2[] = {
+ ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
+ ALL_SIZES_64(wrap_convolve12_avx2)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, HighbdConvolveTest,
+ ::testing::ValuesIn(kArray_HighbdConvolve8_avx2));
+#endif
+const ConvolveFunctions convolve8_avx2(aom_convolve8_horiz_avx2,
+ aom_convolve8_vert_avx2, 0);
+const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_neon(wrap_convolve8_horiz_neon_8,
+ wrap_convolve8_vert_neon_8, 8);
+const ConvolveFunctions wrap_convolve10_neon(wrap_convolve8_horiz_neon_10,
+ wrap_convolve8_vert_neon_10, 10);
+const ConvolveFunctions wrap_convolve12_neon(wrap_convolve8_horiz_neon_12,
+ wrap_convolve8_vert_neon_12, 12);
+const ConvolveParam kArray_HighbdConvolve8_neon[] = {
+ ALL_SIZES_64(wrap_convolve8_neon), ALL_SIZES_64(wrap_convolve10_neon),
+ ALL_SIZES_64(wrap_convolve12_neon)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdConvolveTest,
+ ::testing::ValuesIn(kArray_HighbdConvolve8_neon));
+#endif
+const ConvolveFunctions convolve8_neon(aom_convolve8_horiz_neon,
+ aom_convolve8_vert_neon, 0);
+const ConvolveParam kArray_Convolve8_neon[] = { ALL_SIZES(convolve8_neon) };
+
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_neon));
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(aom_convolve8_horiz_neon_dotprod,
+ aom_convolve8_vert_neon_dotprod,
+ 0);
+const ConvolveParam kArray_Convolve8_neon_dotprod[] = { ALL_SIZES(
+ convolve8_neon_dotprod) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_neon_dotprod));
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(aom_convolve8_horiz_neon_i8mm,
+ aom_convolve8_vert_neon_i8mm, 0);
+const ConvolveParam kArray_Convolve8_neon_i8mm[] = { ALL_SIZES(
+ convolve8_neon_i8mm) };
+
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, LowbdConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_neon_i8mm));
+#endif // HAVE_NEON_I8MM
+
+} // namespace
diff --git a/third_party/aom/test/corner_match_test.cc b/third_party/aom/test/corner_match_test.cc
new file mode 100644
index 0000000000..9733732180
--- /dev/null
+++ b/third_party/aom/test/corner_match_test.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+#include <new>
+#include <tuple>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+
+#include "aom_dsp/flow_estimation/corner_match.h"
+
+namespace test_libaom {
+
+namespace AV1CornerMatch {
+
+using libaom_test::ACMRandom;
+
+typedef double (*ComputeCrossCorrFunc)(const unsigned char *im1, int stride1,
+ int x1, int y1, const unsigned char *im2,
+ int stride2, int x2, int y2);
+
+using std::make_tuple;
+using std::tuple;
+typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
+
+class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
+ public:
+ ~AV1CornerMatchTest() override;
+ void SetUp() override;
+
+ protected:
+ void RunCheckOutput(int run_times);
+ ComputeCrossCorrFunc target_func;
+
+ libaom_test::ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1CornerMatchTest);
+
+AV1CornerMatchTest::~AV1CornerMatchTest() = default;
+void AV1CornerMatchTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ target_func = GET_PARAM(1);
+}
+
+void AV1CornerMatchTest::RunCheckOutput(int run_times) {
+ const int w = 128, h = 128;
+ const int num_iters = 10000;
+ int i, j;
+ aom_usec_timer ref_timer, test_timer;
+
+ std::unique_ptr<uint8_t[]> input1(new (std::nothrow) uint8_t[w * h]);
+ std::unique_ptr<uint8_t[]> input2(new (std::nothrow) uint8_t[w * h]);
+ ASSERT_NE(input1, nullptr);
+ ASSERT_NE(input2, nullptr);
+
+ // Test the two extreme cases:
+ // i) Random data, should have correlation close to 0
+ // ii) Linearly related data + noise, should have correlation close to 1
+ int mode = GET_PARAM(0);
+ if (mode == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ input1[i * w + j] = rnd_.Rand8();
+ input2[i * w + j] = rnd_.Rand8();
+ }
+ } else if (mode == 1) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ int v = rnd_.Rand8();
+ input1[i * w + j] = v;
+ input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15);
+ }
+ }
+
+ for (i = 0; i < num_iters; ++i) {
+ int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+ int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+ int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+ int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+
+ double res_c = av1_compute_cross_correlation_c(input1.get(), w, x1, y1,
+ input2.get(), w, x2, y2);
+ double res_simd =
+ target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2);
+
+ if (run_times > 1) {  // speed mode: time both versions, skip the compare
+ aom_usec_timer_start(&ref_timer);
+ for (j = 0; j < run_times; j++) {
+ av1_compute_cross_correlation_c(input1.get(), w, x1, y1, input2.get(),
+ w, x2, y2);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (j = 0; j < run_times; j++) {
+ target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%.2f\n",  // real-valued ratio; int division truncated it
+ elapsed_time_c, elapsed_time_simd,
+ (static_cast<double>(elapsed_time_c) / elapsed_time_simd));
+ } else {  // correctness mode: target must match the C reference exactly
+ ASSERT_EQ(res_simd, res_c);
+ }
+ }
+}
+
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
+TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1CornerMatchTest,
+ ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1),
+ make_tuple(1, &av1_compute_cross_correlation_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1CornerMatchTest,
+ ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2),
+ make_tuple(1, &av1_compute_cross_correlation_avx2)));
+#endif
+} // namespace AV1CornerMatch
+
+} // namespace test_libaom
diff --git a/third_party/aom/test/cpu_speed_test.cc b/third_party/aom/test/cpu_speed_test.cc
new file mode 100644
index 0000000000..b5f5d2974d
--- /dev/null
+++ b/third_party/aom/test/cpu_speed_test.cc
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPSNR = 100;
+
+class CpuSpeedTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ CpuSpeedTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
+ tune_content_(AOM_CONTENT_DEFAULT) {}
+ ~CpuSpeedTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 25;
+ }
+ }
+
+ void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
+ }
+
+ void TestQ0();
+ void TestScreencastQ0();
+ void TestTuneScreen();
+ void TestEncodeHighBitrate();
+ void TestLowBitrate();
+
+ ::libaom_test::TestMode encoding_mode_;
+ int set_cpu_used_;
+ double min_psnr_;
+ int tune_content_;
+};
+
+void CpuSpeedTest::TestQ0() {
+ // Validate that this non multiple of 64 wide clip encodes and decodes
+ // without a mismatch when passing in a very low max q. This pushes
+ // the encoder to producing lots of big partitions which will likely
+ // extend into the border and test the border condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 0;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestScreencastQ0() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 0;
+ cfg_.rc_min_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestTuneScreen() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;  // max bound, as in sibling tests
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ tune_content_ = AOM_CONTENT_SCREEN;  // applied in PreEncodeFrameHook
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestEncodeHighBitrate() {
+ // Validate that this non multiple of 64 wide clip encodes and decodes
+ // without a mismatch when passing in a very low max q. This pushes
+ // the encoder to producing lots of big partitions which will likely
+ // extend into the border and test the border condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 12000;
+ cfg_.rc_max_quantizer = 10;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestLowBitrate() {
+ // Validate that this clip encodes and decodes without a mismatch
+ // when passing in a very high min q. This pushes the encoder to producing
+ // lots of small partitions which will likely test the other condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.rc_min_quantizer = 40;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(CpuSpeedTest, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTest, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTest, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTest, TestLowBitrate) { TestLowBitrate(); }
+
+class CpuSpeedTestLarge : public CpuSpeedTest {};
+
+TEST_P(CpuSpeedTestLarge, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTestLarge, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
+
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTest,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_SUITE(CpuSpeedTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 1));
+} // namespace
diff --git a/third_party/aom/test/cpu_used_firstpass_test.cc b/third_party/aom/test/cpu_used_firstpass_test.cc
new file mode 100644
index 0000000000..53db8b0d13
--- /dev/null
+++ b/third_party/aom/test/cpu_used_firstpass_test.cc
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const double kPsnrDiffThreshold = 0.1;
+
+// Params: first pass cpu used, second pass cpu used
+class CpuUsedFirstpassTest
+ : public ::libaom_test::CodecTestWith2Params<int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ CpuUsedFirstpassTest()
+ : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(2)) {}
+ ~CpuUsedFirstpassTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = 1000;
+ cfg_.g_lag_in_frames = 19;
+ cfg_.g_threads = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ void BeginPassHook(unsigned int pass) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+
+ if (pass == 0)
+ cpu_used_ = first_pass_cpu_used_;
+ else
+ cpu_used_ = second_pass_cpu_used_;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; }
+
+ void DoTest() {
+ libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 30);
+ double ref_psnr;
+ double psnr_diff;
+
+ first_pass_cpu_used_ = second_pass_cpu_used_;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // same preset case ref_psnr
+ ref_psnr = GetAveragePsnr();
+
+ first_pass_cpu_used_ = GET_PARAM(1);
+ if (first_pass_cpu_used_ == second_pass_cpu_used_) return;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ psnr_diff = std::abs(ref_psnr - GetAveragePsnr());
+ EXPECT_LT(psnr_diff, GetPsnrDiffThreshold())
+ << "first pass cpu used = " << first_pass_cpu_used_
+ << ", second pass cpu used = " << second_pass_cpu_used_;
+ }
+
+ int cpu_used_;
+ int first_pass_cpu_used_;
+ int second_pass_cpu_used_;
+ unsigned int nframes_;
+ double psnr_;
+};
+
+TEST_P(CpuUsedFirstpassTest, FirstPassTest) { DoTest(); }
+
+class CpuUsedFirstpassTestLarge : public CpuUsedFirstpassTest {};
+
+TEST_P(CpuUsedFirstpassTestLarge, FirstPassTest) { DoTest(); }
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+static const int kSecondPassCpuUsedLarge[] = { 2, 4 };
+static const int kSecondPassCpuUsed[] = { 6 };
+#else
+static const int kSecondPassCpuUsedLarge[] = { 2 };
+static const int kSecondPassCpuUsed[] = { 4, 6 };
+#endif
+#else
+static const int kSecondPassCpuUsedLarge[] = { 2 };
+static const int kSecondPassCpuUsed[] = { 4, 6 };
+#endif
+
+AV1_INSTANTIATE_TEST_SUITE(
+ CpuUsedFirstpassTestLarge, ::testing::Values(2, 4, 6),
+ ::testing::ValuesIn(kSecondPassCpuUsedLarge)); // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(
+ CpuUsedFirstpassTest, ::testing::Values(2, 4, 6),
+ ::testing::ValuesIn(kSecondPassCpuUsed)); // cpu_used
+
+} // namespace
diff --git a/third_party/aom/test/datarate_test.cc b/third_party/aom/test/datarate_test.cc
new file mode 100644
index 0000000000..a75a72fab6
--- /dev/null
+++ b/third_party/aom/test/datarate_test.cc
@@ -0,0 +1,712 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+
+// Params: test mode, speed, aq mode and index for bitrate array.
+class DatarateTestLarge
+ : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+ unsigned int, int>,
+ public DatarateTest {
+ public:
+ DatarateTestLarge() : DatarateTest(GET_PARAM(0)) {
+ set_cpu_used_ = GET_PARAM(2);
+ aq_mode_ = GET_PARAM(3);
+ }
+
+ protected:
+ ~DatarateTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(GET_PARAM(1));
+ ResetModel();
+ }
+
+ virtual void BasicRateTargetingVBRTest() {
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 140);
+ const int bitrate_array[2] = { 400, 800 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
+ << " The datarate for the file is lower than target by too much!";
+ // FIXME(jingning): Lower this test threshold after vbr mode can render
+ // sufficiently accurate bit rate.
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void BasicRateTargetingCBRTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 140);
+ const int bitrate_array[2] = { 150, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.19)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void BasicRateTargetingCBRSpikeTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.kf_max_dist = 3000;
+ cfg_.kf_min_dist = 3000;
+
+ ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240,
+ 30, 1, 0, 800);
+ const int bitrate_array[2] = { 100, 200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ max_perc_spike_ = 3.0;
+ max_perc_spike_high_ = 8.0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.19)
+ << " The datarate for the file is greater than target by too much!";
+ ASSERT_LE(num_spikes_, 8);
+ ASSERT_LT(num_spikes_high_, 1);
+ }
+
+ virtual void BasicRateTargetingCBRDynamicBitrateTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.kf_max_dist = 3000;
+ cfg_.kf_min_dist = 3000;
+
+ ::libaom_test::I420VideoSource video("desktop1.320_180.yuv", 320, 180, 30,
+ 1, 0, 800);
+ const int bitrate_array[2] = { 100, 200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ target_bitrate_update_[0] = cfg_.rc_target_bitrate;
+ target_bitrate_update_[1] = static_cast<int>(1.3 * cfg_.rc_target_bitrate);
+ target_bitrate_update_[2] = static_cast<int>(0.7 * cfg_.rc_target_bitrate);
+ frame_update_bitrate_ = 250;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < 3; i++) {
+ ASSERT_GE(effective_datarate_dynamic_[i],
+ target_bitrate_update_[i] * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_dynamic_[i],
+ target_bitrate_update_[i] * 1.20)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingMultiThreadCBRTest() {
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, 400);
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_threads = 4;
+
+ const int bitrate_array[2] = { 250, 650 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ tile_column_ = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
+ << " The datarate for the file missed the target!"
+ << cfg_.rc_target_bitrate << " " << effective_datarate_;
+ }
+
+ virtual void ErrorResilienceOnSceneCuts() {
+ if (GET_PARAM(4) > 0) return;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.g_error_resilient = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void BasicRateTargetingCBRPeriodicKeyFrameTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ // Periodic keyframe
+ cfg_.kf_max_dist = 50;
+
+ ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
+ 30, 1, 0, 310);
+ const int bitrate_array[2] = { 150, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void CBRPeriodicKeyFrameOnSceneCuts() {
+ if (GET_PARAM(4) > 0) return;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ // Periodic keyframe
+ cfg_.kf_max_dist = 30;
+ cfg_.kf_min_dist = 30;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void BasicRateTargetingAQModeOnOffCBRTest() {
+ if (GET_PARAM(4) > 0) return;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.g_usage = AOM_USAGE_REALTIME;
+ cfg_.kf_mode = AOM_KF_DISABLED;
+
+ ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320, 240,
+ 30, 1, 0, 310);
+ cfg_.rc_target_bitrate = 60;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ }
+
+ virtual void BasicRateTargeting444CBRScreenTest() {
+ ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+ cfg_.g_profile = 1;
+ cfg_.g_timebase = video.timebase();
+
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+
+ const int bitrate_array[2] = { 250, 650 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ screen_mode_ = true;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
+ << " The datarate for the file missed the target!"
+ << cfg_.rc_target_bitrate << " " << effective_datarate_;
+ }
+
+  // Checks CBR rate targeting with fixed super-resolution (denominator 16
+  // for both key and non-key frames). The target bitrate is selected by test
+  // parameter 4.
+  virtual void BasicRateTargetingSuperresCBR() {
+    ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240,
+                                         30, 1, 0, 800);
+
+    cfg_.g_profile = 0;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+
+    cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+    cfg_.rc_superres_denominator = 16;
+    cfg_.rc_superres_kf_denominator = 16;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Fails when the measured rate is too far above the target.
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    // Fails when the measured rate is too far below the target. The values
+    // are labelled so they do not run into the message text.
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target! Target: "
+        << cfg_.rc_target_bitrate << ", effective: " << effective_datarate_;
+  }
+
+  // Same check as the single-threaded super-resolution CBR test, but run
+  // with two encoder threads (cfg_.g_threads = 2) and tile_column_ set to 1.
+  // The target bitrate is selected by test parameter 4.
+  virtual void BasicRateTargetingSuperresCBRMultiThreads() {
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+
+    cfg_.g_profile = 0;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_threads = 2;
+
+    cfg_.rc_superres_mode = AOM_SUPERRES_FIXED;
+    cfg_.rc_superres_denominator = 16;
+    cfg_.rc_superres_kf_denominator = 16;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    // DatarateTest::ResetModel() sets tile_column_ back to 0, so it has to be
+    // assigned after the reset.
+    tile_column_ = 1;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Fails when the measured rate is too far above the target.
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    // Fails when the measured rate is too far below the target. The values
+    // are labelled so they do not run into the message text.
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target! Target: "
+        << cfg_.rc_target_bitrate << ", effective: " << effective_datarate_;
+  }
+};
+
+// Fixture for frame-drop datarate tests.
+// Params: test mode, speed, aq mode (GET_PARAM(0) is the codec factory).
+class DatarateTestFrameDropLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestFrameDropLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  ~DatarateTestFrameDropLarge() override = default;
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    ResetModel();
+  }
+
+  // Encodes the same clip with drop-frame thresholds 40 and 70 and checks,
+  // for each run, that the datarate is within bounds, that the first dropped
+  // frame is not later than in the previous run, and that the number of
+  // drops is at least 70% of the previous run's count.
+  virtual void ChangingDropFrameThreshTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    // NOTE(review): the original assigned rc_undershoot_pct twice in a row;
+    // rc_overshoot_pct may have been intended. It is left unset here so the
+    // rate-control behaviour of the test is unchanged.
+    cfg_.rc_undershoot_pct = 20;
+    // rc_dropframe_thresh is assigned inside the loop below before every
+    // encode, so no initial value is set here.
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    const int kDropFrameThreshTestStep = 30;
+    // Starts above the clip's 100-frame limit so the first iteration's
+    // "first drop" comparison cannot fail.
+    aom_codec_pts_t last_drop = 140;
+    int last_num_drops = 0;
+    for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.40)
+          << " The datarate for the file is greater than target by too much!";
+      // A last_drop of 0 means the previous run dropped nothing, so there is
+      // no earlier drop position to compare against.
+      if (last_drop > 0) {
+        ASSERT_LE(first_drop_, last_drop)
+            << " The first dropped frame for drop_thresh " << i
+            << " > first dropped frame for drop_thresh "
+            << i - kDropFrameThreshTestStep;
+      }
+      ASSERT_GE(num_drops_, last_num_drops * 0.7)
+          << " The number of dropped frames for drop_thresh " << i
+          << " < number of dropped frames for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+      last_num_drops = num_drops_;
+    }
+  }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+ BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingCBR) {
+ BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for CBR, with 4 threads
+TEST_P(DatarateTestLarge, BasicRateTargetingMultiThreadCBR) {
+ BasicRateTargetingMultiThreadCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBR) {
+ BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBROnSceneCuts) {
+ CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestLarge, ErrorResilienceOnSceneCuts) {
+ ErrorResilienceOnSceneCuts();
+}
+
+// Check basic rate targeting for CBR, for 444 input screen mode.
+#if defined(CONFIG_MAX_DECODE_PROFILE) && CONFIG_MAX_DECODE_PROFILE < 1
+TEST_P(DatarateTestLarge, DISABLED_BasicRateTargeting444CBRScreen) {
+#else
+TEST_P(DatarateTestLarge, BasicRateTargeting444CBRScreen) {
+#endif
+ BasicRateTargeting444CBRScreenTest();
+}
+
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBR) {
+ BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multi-threads.
+TEST_P(DatarateTestLarge, BasicRateTargetingSuperresCBRMultiThreads) {
+ BasicRateTargetingSuperresCBRMultiThreads();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropLarge, ChangingDropFrameThresh) {
+ ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestLarge, BasicRateTargetingAQModeOnOffCBR) {
+ BasicRateTargetingAQModeOnOffCBRTest();
+}
+
+class DatarateTestRealtime : public DatarateTestLarge {};
+
+class DatarateTestFrameDropRealtime : public DatarateTestFrameDropLarge {};
+
+// Fixture that changes the encoder speed while encoding.
+// Params: test mode, aq mode (GET_PARAM(0) is the codec factory).
+class DatarateTestSpeedChangeRealtime
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestSpeedChangeRealtime() : DatarateTest(GET_PARAM(0)) {
+    // Index 1 is the test mode (it is what SetUp() hands to
+    // InitializeConfig()); the aq mode is the following tuple element.
+    aq_mode_ = GET_PARAM(2);
+    // Makes DatarateTest::PreEncodeFrameHook() switch AOME_SET_CPUUSED at
+    // frames 0, 30, 60 and 90.
+    speed_change_test_ = true;
+  }
+
+ protected:
+  ~DatarateTestSpeedChangeRealtime() override = default;
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    ResetModel();
+  }
+
+  // Encodes a 100-frame clip in CBR mode with frame dropping disabled and
+  // checks that the effective datarate stays within the bounds below.
+  virtual void ChangingSpeedTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    // NOTE(review): the original assigned rc_undershoot_pct twice in a row;
+    // rc_overshoot_pct may have been intended. It is left unset here so the
+    // rate-control behaviour of the test is unchanged.
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+    // Frame dropping is disabled. The original first stored 10 here and then
+    // overwrote it with 0 before any use.
+    cfg_.rc_dropframe_thresh = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.35)
+        << " The datarate for the file is greater than target by too much!";
+  }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestRealtime, BasicRateTargetingVBR) {
+ BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
+ BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for CBR. Use a longer clip,
+// and verify #encode size spikes above threshold.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBRSpike) {
+ BasicRateTargetingCBRSpikeTest();
+}
+
+// Check basic rate targeting for CBR. Use a longer clip,
+// and verify encoder can respond and hit new bitrates updated
+// within the stream.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBRDynamicBitrate) {
+ BasicRateTargetingCBRDynamicBitrateTest();
+}
+
+// Check basic rate targeting for CBR, with 4 threads
+TEST_P(DatarateTestRealtime, BasicRateTargetingMultiThreadCBR) {
+ BasicRateTargetingMultiThreadCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBR) {
+ BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for periodic key frame, aligned with scene change.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBROnSceneCuts) {
+ CBRPeriodicKeyFrameOnSceneCuts();
+}
+
+// Check basic rate targeting with error resilience on for scene cuts.
+TEST_P(DatarateTestRealtime, ErrorResilienceOnSceneCuts) {
+ ErrorResilienceOnSceneCuts();
+}
+
+// Check basic rate targeting for CBR for 444 screen mode.
+#if defined(CONFIG_MAX_DECODE_PROFILE) && CONFIG_MAX_DECODE_PROFILE < 1
+TEST_P(DatarateTestRealtime, DISABLED_BasicRateTargeting444CBRScreen) {
+#else
+TEST_P(DatarateTestRealtime, BasicRateTargeting444CBRScreen) {
+#endif
+ BasicRateTargeting444CBRScreenTest();
+}
+
+// Check basic rate targeting for Superres mode with CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBR) {
+ BasicRateTargetingSuperresCBR();
+}
+
+// Check basic rate targeting for Superres mode with CBR and multi-threads.
+TEST_P(DatarateTestRealtime, BasicRateTargetingSuperresCBRMultiThreads) {
+ BasicRateTargetingSuperresCBRMultiThreads();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropRealtime, ChangingDropFrameThresh) {
+ ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestSpeedChangeRealtime, ChangingSpeedTest) {
+ ChangingSpeedTest();
+}
+
+// Fixture that forces a pseudo-random quantizer on every frame through
+// AV1E_SET_QUANTIZER_ONE_PASS and verifies that the encoder reports the same
+// value back through AOME_GET_LAST_QUANTIZER_64.
+class DatarateTestSetFrameQpRealtime
+    : public DatarateTest,
+      public ::testing::TestWithParam<const libaom_test::AV1CodecFactory *> {
+ public:
+  // All counters start at zero so that no member is read uninitialised if a
+  // hook runs before the test body assigns total_frames_.
+  DatarateTestSetFrameQpRealtime()
+      : DatarateTest(GetParam()), total_frames_(0), frame_qp_(0), frame_(0) {}
+
+ protected:
+  ~DatarateTestSetFrameQpRealtime() override = default;
+
+  void SetUp() override {
+    InitializeConfig(libaom_test::kRealTime);
+    ResetModel();
+  }
+
+  // Picks the quantizer for the upcoming frame and hands it to the encoder.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    // Assigned before delegating because the base hook applies
+    // set_cpu_used_ on the first frame.
+    set_cpu_used_ = 7;
+    DatarateTest::PreEncodeFrameHook(video, encoder);
+    frame_qp_ = rnd_.PseudoUniform(63);
+    encoder->Control(AV1E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+    frame_++;
+  }
+
+  // Checks that the quantizer used for the last frame matches the request.
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
+    // The check is skipped once frame_ reaches total_frames_.
+    if (frame_ >= total_frames_) return;
+    int qp = 0;
+    encoder->Control(AOME_GET_LAST_QUANTIZER_64, &qp);
+    ASSERT_EQ(qp, frame_qp_);
+  }
+
+ protected:
+  int total_frames_;  // Frame count for the run; set by the test body.
+
+ private:
+  int frame_qp_;  // Quantizer requested for the current frame.
+  int frame_;     // Number of frames passed to PreEncodeFrameHook() so far.
+  libaom_test::ACMRandom rnd_;
+};
+
+// Encodes a short CBR clip while the fixture sets a per-frame quantizer; the
+// fixture's post-encode hook asserts that each requested value is honoured.
+TEST_P(DatarateTestSetFrameQpRealtime, SetFrameQpOnePass) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // NOTE(review): the original assigned rc_undershoot_pct twice in a row;
+  // rc_overshoot_pct may have been intended. It is left unset here so the
+  // rate-control behaviour of the test is unchanged.
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_dropframe_thresh = 0;
+
+  // The same count limits the video source and the fixture's post-encode
+  // check, so the two cannot drift apart.
+  total_frames_ = 100;
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, total_frames_);
+
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(5, 7), ::testing::Values(0, 3),
+ ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(5, 7), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestRealtime,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(7, 12), ::testing::Values(0, 3),
+ ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestFrameDropRealtime,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(7, 12), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSpeedChangeRealtime,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(0, 3));
+
+INSTANTIATE_TEST_SUITE_P(
+ AV1, DatarateTestSetFrameQpRealtime,
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)));
+
+} // namespace
+} // namespace datarate_test
diff --git a/third_party/aom/test/datarate_test.h b/third_party/aom/test/datarate_test.h
new file mode 100644
index 0000000000..accc1ad86b
--- /dev/null
+++ b/third_party/aom/test/datarate_test.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+class DatarateTest : public ::libaom_test::EncoderTest {
+ public:
+ explicit DatarateTest(const ::libaom_test::CodecFactory *codec)
+ : EncoderTest(codec), set_cpu_used_(0), aq_mode_(0),
+ speed_change_test_(false) {}
+
+ protected:
+ ~DatarateTest() override = default;
+
+ virtual void ResetModel() {
+ last_pts_ = 0;
+ bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+ frame_number_ = 0;
+ tot_frame_number_ = 0;
+ first_drop_ = 0;
+ num_drops_ = 0;
+ // Denoiser is off by default.
+ denoiser_on_ = 0;
+ bits_total_ = 0;
+ denoiser_offon_test_ = 0;
+ denoiser_offon_period_ = -1;
+ tile_column_ = 0;
+ screen_mode_ = false;
+ max_perc_spike_ = 1.0;
+ max_perc_spike_high_ = 1.0;
+ num_spikes_ = 0;
+ num_spikes_high_ = 0;
+ frame_update_bitrate_ = 0;
+ for (int i = 0; i < 3; i++) {
+ target_bitrate_update_[i] = 0;
+ frame_number_dynamic_[i] = 0;
+ bits_total_dynamic_[i] = 0;
+ effective_datarate_dynamic_[i] = 0.0;
+ }
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_column_);
+ encoder->Control(AV1E_SET_ROW_MT, 1);
+ if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+ encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+ encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+ encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+ encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+ encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+ encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+ encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+ encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+ encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+ encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+ encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
+ }
+ if (screen_mode_) {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ encoder->Control(AV1E_SET_ENABLE_PALETTE, 1);
+ encoder->Control(AV1E_SET_ENABLE_INTRABC, 0);
+ }
+ }
+
+ if (speed_change_test_) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 8);
+ } else if (video->frame() == 30) {
+ encoder->Control(AOME_SET_CPUUSED, 7);
+ } else if (video->frame() == 60) {
+ encoder->Control(AOME_SET_CPUUSED, 6);
+ } else if (video->frame() == 90) {
+ encoder->Control(AOME_SET_CPUUSED, 7);
+ }
+ }
+
+ if (frame_update_bitrate_ > 0) {
+ if (frame_number_ == frame_update_bitrate_) {
+ cfg_.rc_target_bitrate = target_bitrate_update_[1];
+ encoder->Config(&cfg_);
+ } else if (frame_number_ == 2 * frame_update_bitrate_) {
+ cfg_.rc_target_bitrate = target_bitrate_update_[2];
+ encoder->Config(&cfg_);
+ }
+ }
+
+ if (denoiser_offon_test_) {
+ ASSERT_GT(denoiser_offon_period_, 0)
+ << "denoiser_offon_period_ is not positive.";
+ if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+ // Flip denoiser_on_ periodically
+ denoiser_on_ ^= 1;
+ }
+ }
+
+ encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+ const aom_rational_t tb = video->timebase();
+ timebase_ = static_cast<double>(tb.num) / tb.den;
+ duration_ = 0;
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ // Time since last timestamp = duration.
+ aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+ if (duration > 1) {
+ // If first drop not set and we have a drop set it to this time.
+ if (!first_drop_) first_drop_ = last_pts_ + 1;
+ // Update the number of frame drops.
+ num_drops_ += static_cast<int>(duration - 1);
+ // Update counter for total number of frames (#frames input to encoder).
+ // Needed for setting the proper layer_id below.
+ tot_frame_number_ += static_cast<int>(duration - 1);
+ }
+
+ // Add to the buffer the bits we'd expect from a constant bitrate server.
+ bits_in_buffer_model_ += static_cast<int64_t>(
+ duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+ // Buffer should not go negative.
+ ASSERT_GE(bits_in_buffer_model_, 0)
+ << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+ const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+ // Update the total encoded bits.
+ bits_total_ += frame_size_in_bits;
+
+ // Update the most recent pts.
+ last_pts_ = pkt->data.frame.pts;
+ ++frame_number_;
+ ++tot_frame_number_;
+ const int per_frame_bandwidth = (cfg_.rc_target_bitrate * 1000) / 30;
+ if (frame_size_in_bits > max_perc_spike_ * per_frame_bandwidth &&
+ frame_number_ > 1)
+ num_spikes_++;
+ if (frame_size_in_bits > max_perc_spike_high_ * per_frame_bandwidth &&
+ frame_number_ > 1)
+ num_spikes_high_++;
+
+ if (frame_update_bitrate_ > 0) {
+ if (frame_number_ < frame_update_bitrate_) {
+ bits_total_dynamic_[0] += frame_size_in_bits;
+ frame_number_dynamic_[0]++;
+ } else if (frame_number_ >= frame_update_bitrate_ &&
+ frame_number_ < 2 * frame_update_bitrate_) {
+ bits_total_dynamic_[1] += frame_size_in_bits;
+ frame_number_dynamic_[1]++;
+ } else {
+ bits_total_dynamic_[2] += frame_size_in_bits;
+ frame_number_dynamic_[2]++;
+ }
+ }
+ }
+
+  // Computes the overall effective datarate at the end of a pass and, when a
+  // dynamic bitrate update is configured (frame_update_bitrate_ > 0), the
+  // datarate of each of the three segments.
+  void EndPassHook() override {
+    duration_ = (last_pts_ + 1) * timebase_;
+    // Effective file datarate:
+    effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+    if (frame_update_bitrate_ > 0) {
+      for (int i = 0; i < 3; i++) {
+        // A segment that received no frames would divide by zero and yield
+        // NaN; report a rate of 0 for it instead.
+        // NOTE(review): the factor 30 is presumably the frame rate, matching
+        // the hard-coded 30 in FramePktHook() - confirm.
+        effective_datarate_dynamic_[i] =
+            frame_number_dynamic_[i] > 0
+                ? 30 * (bits_total_dynamic_[i] / 1000.0) /
+                      frame_number_dynamic_[i]
+                : 0.0;
+      }
+    }
+  }
+
+ aom_codec_pts_t last_pts_;
+ double timebase_;
+ int frame_number_; // Counter for number of non-dropped/encoded frames.
+ int tot_frame_number_; // Counter for total number of input frames.
+ int64_t bits_total_;
+ double duration_;
+ double effective_datarate_;
+ int set_cpu_used_;
+ int64_t bits_in_buffer_model_;
+ aom_codec_pts_t first_drop_;
+ int num_drops_;
+ int denoiser_on_;
+ int denoiser_offon_test_;
+ int denoiser_offon_period_;
+ unsigned int aq_mode_;
+ bool speed_change_test_;
+ int tile_column_;
+ bool screen_mode_;
+ double max_perc_spike_;
+ double max_perc_spike_high_;
+ int num_spikes_;
+ int num_spikes_high_;
+ // These are used for tests with dynamic bitrate changes.
+ // Used to verify that the encoder can respond and hit bitrate that is updated
+ // during the sequence.
+ int frame_update_bitrate_;
+ int target_bitrate_update_[3];
+ double effective_datarate_dynamic_[3];
+ int64_t bits_total_dynamic_[3];
+ int frame_number_dynamic_[3];
+};
+
+} // namespace
+} // namespace datarate_test
diff --git a/third_party/aom/test/decode_api_test.cc b/third_party/aom/test/decode_api_test.cc
new file mode 100644
index 0000000000..591a167e94
--- /dev/null
+++ b/third_party/aom/test/decode_api_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+
+namespace {
+
+TEST(DecodeAPI, InvalidParams) {
+ uint8_t buf[1] = { 0 };
+ aom_codec_ctx_t dec;
+
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_dec_init(nullptr, nullptr, nullptr, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_dec_init(&dec, nullptr, nullptr, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(nullptr, nullptr, 0, nullptr));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(nullptr, buf, 0, nullptr));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(nullptr, buf, sizeof(buf), nullptr));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(nullptr, nullptr, sizeof(buf), nullptr));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(nullptr));
+ EXPECT_NE(aom_codec_error(nullptr), nullptr);
+ EXPECT_EQ(aom_codec_error_detail(nullptr), nullptr);
+
+ aom_codec_iface_t *iface = aom_codec_av1_dx();
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_dec_init(nullptr, iface, nullptr, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, nullptr, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(&dec, nullptr, sizeof(buf), nullptr));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, nullptr));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
+}
+
+TEST(DecodeAPI, InvalidControlId) {
+ aom_codec_iface_t *iface = aom_codec_av1_dx();
+ aom_codec_ctx_t dec;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, nullptr, 0));
+ EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&dec, -1, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&dec, 0, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
+}
+
+} // namespace
diff --git a/third_party/aom/test/decode_multithreaded_test.cc b/third_party/aom/test/decode_multithreaded_test.cc
new file mode 100644
index 0000000000..4e06f1afac
--- /dev/null
+++ b/third_party/aom/test/decode_multithreaded_test.cc
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "aom_mem/aom_mem.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+static const int kNumMultiThreadDecoders = 3;
+
+class AV1DecodeMultiThreadedTest
+ : public ::libaom_test::CodecTestWith5Params<int, int, int, int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1DecodeMultiThreadedTest()
+ : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(),
+ n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+ n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)),
+ row_mt_(GET_PARAM(5)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 704;
+ cfg.h = 576;
+ cfg.threads = 1;
+ cfg.allow_lowbitdepth = 1;
+ single_thread_dec_ = codec_->CreateDecoder(cfg, 0);
+
+ // Test cfg.threads == powers of 2.
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ cfg.threads <<= 1;
+ multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0);
+ multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_);
+ }
+
+ if (single_thread_dec_->IsAV1()) {
+ single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ if (multi_thread_dec_[i]->IsAV1()) {
+ multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1);
+ multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+ }
+ }
+
+ ~AV1DecodeMultiThreadedTest() override {
+ delete single_thread_dec_;
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ delete multi_thread_dec_[i];
+ }
+
+ void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
+
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+ encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ }
+ }
+
+  // Decodes one compressed packet with |dec| and folds the resulting image
+  // into |md5|. A decode error or a missing output image sets abort_ and
+  // raises a fatal test failure.
+  void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+                 ::libaom_test::MD5 *md5) {
+    const aom_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = dec->GetDxData().Next();
+    // Guard against a null image before handing it to MD5::Add(), so a
+    // missing frame becomes a test failure.
+    if (img == nullptr) {
+      abort_ = true;
+      ASSERT_NE(img, nullptr) << "Decoder returned no image for the packet.";
+    }
+    md5->Add(img);
+  }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_);
+
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ UpdateMD5(multi_thread_dec_[i], pkt, &md5_multi_thread_[i]);
+ }
+
+ void DoTest() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = 12;
+ cfg_.rc_end_usage = AOM_VBR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+ timebase.den, timebase.num, 0, 2);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const char *md5_single_thread_str = md5_single_thread_.Get();
+
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ const char *md5_multi_thread_str = md5_multi_thread_[i].Get();
+ ASSERT_STREQ(md5_single_thread_str, md5_multi_thread_str);
+ }
+ }
+
+ ::libaom_test::MD5 md5_single_thread_;
+ ::libaom_test::MD5 md5_multi_thread_[kNumMultiThreadDecoders];
+ ::libaom_test::Decoder *single_thread_dec_;
+ ::libaom_test::Decoder *multi_thread_dec_[kNumMultiThreadDecoders];
+
+ private:
+ int n_tile_cols_;
+ int n_tile_rows_;
+ int n_tile_groups_;
+ int set_cpu_used_;
+ int row_mt_;
+};
+
+// run an encode and do the decode both in single thread
+// and multi thread. Ensure that the MD5 of the output in both cases
+// is identical. If so, the test passes.
+TEST_P(AV1DecodeMultiThreadedTest, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+class AV1DecodeMultiThreadedTestLarge : public AV1DecodeMultiThreadedTest {};
+
+TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+// TODO(ranjit): More tests have to be added using pre-generated MD5.
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
+ ::testing::Values(1, 2), ::testing::Values(1),
+ ::testing::Values(3), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedTestLarge,
+ ::testing::Values(0, 1, 2, 6),
+ ::testing::Values(0, 1, 2, 6),
+ ::testing::Values(1, 4), ::testing::Values(0),
+ ::testing::Values(0, 1));
+
+class AV1DecodeMultiThreadedLSTestLarge
+ : public AV1DecodeMultiThreadedTestLarge {};
+
+TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 1);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 1);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AV1DecodeMultiThreadedLSTestLarge,
+ ::testing::Values(6), ::testing::Values(6),
+ ::testing::Values(1), ::testing::Values(0, 3),
+ ::testing::Values(0, 1));
+
+} // namespace
diff --git a/third_party/aom/test/decode_perf_test.cc b/third_party/aom/test/decode_perf_test.cc
new file mode 100644
index 0000000000..030035466c
--- /dev/null
+++ b/third_party/aom/test/decode_perf_test.cc
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <tuple>
+
+#include "config/aom_version.h"
+
+#include "aom_ports/aom_timer.h"
+#include "common/ivfenc.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+using std::make_tuple;
+
+namespace {
+
+#define VIDEO_NAME 0
+#define THREADS 1
+
+const double kUsecsInSec = 1000000.0;
+const char kNewEncodeOutputFile[] = "new_encode.ivf";
+
+/*
+ DecodePerfTest takes a tuple of filename + number of threads to decode with
+ */
+typedef std::tuple<const char *, unsigned> DecodePerfParam;
+
+// TODO(jimbankoski): Add actual test vectors here when available.
+// const DecodePerfParam kAV1DecodePerfVectors[] = {};
+
+/*
+ In order to reflect real world performance as much as possible, Perf tests
+ *DO NOT* do any correctness checks. Please run them alongside correctness
+ tests to ensure proper codec integrity. Furthermore, in this test we
+ deliberately limit the amount of system calls we make to avoid OS
+ preemption.
+
+ TODO(joshualitt) create a more detailed perf measurement test to collect
+ power/temp/min max frame decode times/etc
+ */
+
+// Fixture for decode-throughput measurements; each parameter is a
+// DecodePerfParam (file name, thread count).
+class DecodePerfTest : public ::testing::TestWithParam<DecodePerfParam> {};
+// The INSTANTIATE_TEST_SUITE_P for this suite is still commented out (no test
+// vectors yet), so tell googletest that leaving it uninstantiated is
+// intentional instead of having it report a failure.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DecodePerfTest);
+
+// Times a full decode of the parameterized WebM clip with the requested
+// thread count and prints the measurements as a JSON-style record.
+TEST_P(DecodePerfTest, PerfTest) {
+ const char *const clip_name = GET_PARAM(VIDEO_NAME);
+ const unsigned num_threads = GET_PARAM(THREADS);
+
+ libaom_test::WebMVideoSource source(clip_name);
+ source.Init();
+
+ aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+ dec_cfg.threads = num_threads;
+ dec_cfg.allow_lowbitdepth = 1;
+ libaom_test::AV1Decoder decoder(dec_cfg, 0);
+
+ // Only the decode loop sits between the timer start and mark; the decode
+ // return values are intentionally not checked.
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ source.Begin();
+ while (source.cxdata() != nullptr) {
+ decoder.DecodeFrame(source.cxdata(), source.frame_size());
+ source.Next();
+ }
+
+ aom_usec_timer_mark(&timer);
+ const double elapsed_secs =
+ static_cast<double>(aom_usec_timer_elapsed(&timer)) / kUsecsInSec;
+ const unsigned frames = source.frame_number();
+ const double fps = static_cast<double>(frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"type\" : \"decode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", clip_name);
+ printf("\t\"threadCount\" : %u,\n", num_threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+// TODO(jimbankoski): Enabled when we have actual AV1 Decode vectors.
+// INSTANTIATE_TEST_SUITE_P(AV1, DecodePerfTest,
+// ::testing::ValuesIn(kAV1DecodePerfVectors));
+
+// Encoder fixture that writes every compressed frame packet to an IVF file
+// (kNewEncodeOutputFile under the test data path), so that the PerfTest body
+// can afterwards time decoding of freshly encoded content.
+class AV1NewEncodeDecodePerfTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1NewEncodeDecodePerfTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
+ outfile_(nullptr), out_frames_(0) {}
+
+ ~AV1NewEncodeDecodePerfTest() override = default;
+
+ // Applies the parameterized encoding mode and a fixed set of VBR
+ // rate-control settings.
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_end_usage = AOM_VBR;
+ }
+
+ // Sends the encoder controls once, when the source is at frame 0.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, speed_);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 2);
+ }
+ }
+
+ // Opens (truncating) the IVF output file under LIBAOM_TEST_DATA_PATH, or
+ // under "." when that variable is unset.
+ void BeginPassHook(unsigned int /*pass*/) override {
+ const char *const env = getenv("LIBAOM_TEST_DATA_PATH");
+ const std::string data_path(env ? env : ".");
+ const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
+ outfile_ = fopen(path_to_source.c_str(), "wb");
+ ASSERT_NE(outfile_, nullptr);
+ }
+
+ // Rewrites the file header now that the final frame count is known, then
+ // closes the file.
+ void EndPassHook() override {
+ if (outfile_ != nullptr) {
+ if (!fseek(outfile_, 0, SEEK_SET))
+ ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
+ fclose(outfile_);
+ outfile_ = nullptr;
+ }
+ }
+
+ // Appends one compressed frame (IVF frame header plus payload) to the file.
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ // The ASSERT in BeginPassHook() only leaves that hook; if fopen() failed
+ // there, stop here rather than hand a null FILE* to the IVF writer and
+ // fwrite().
+ ASSERT_NE(outfile_, nullptr);
+
+ ++out_frames_;
+
+ // Write initial file header if first frame.
+ if (pkt->data.frame.pts == 0)
+ ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
+
+ // Write frame header and data.
+ ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+ ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+ pkt->data.frame.sz);
+ }
+
+ // Always false for this fixture; the test body runs its own timed decode.
+ bool DoDecode() const override { return false; }
+
+ // Value later passed to AOME_SET_CPUUSED by PreEncodeFrameHook().
+ void set_speed(unsigned int speed) { speed_ = speed; }
+
+ private:
+ libaom_test::TestMode encoding_mode_;
+ uint32_t speed_;
+ FILE *outfile_; // Open only between BeginPassHook() and EndPassHook().
+ uint32_t out_frames_; // Number of frame packets written so far.
+};
+
+// Describes one raw input clip for the encode/decode perf test: file name,
+// frame dimensions, the value assigned to cfg_.rc_target_bitrate, and the
+// frame count handed to the video source.
+struct EncodePerfTestVideo {
+ EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+ uint32_t bitrate_, int frames_)
+ : name(name_), width(width_), height(height_), bitrate(bitrate_),
+ frames(frames_) {}
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+// Clips available to the test; PerfTest currently uses only entry 0.
+const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = {
+ EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+// Encodes the first clip in kAV1EncodePerfTestVectors to an IVF file through
+// the fixture hooks, then times a 4-thread decode of that file and prints the
+// measurements as a JSON-style record.
+TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) {
+ SetUp();
+
+ // TODO(JBB): Make this work by going through the set of given files.
+ const int i = 0;
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ const char *video_name = kAV1EncodePerfTestVectors[i].name;
+ // NOTE(review): timebase.den / timebase.num are passed in that order,
+ // presumably as the source's frame-rate numerator / denominator - confirm
+ // against I420VideoSource.
+ libaom_test::I420VideoSource video(
+ video_name, kAV1EncodePerfTestVectors[i].width,
+ kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0,
+ kAV1EncodePerfTestVectors[i].frames);
+ set_speed(2);
+
+ // Encode phase: the fixture hooks write the output to kNewEncodeOutputFile.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const uint32_t threads = 4;
+
+ // Decode phase: read back the file that was just written.
+ libaom_test::IVFVideoSource decode_video(kNewEncodeOutputFile);
+ decode_video.Init();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.threads = threads;
+ cfg.allow_lowbitdepth = 1;
+ libaom_test::AV1Decoder decoder(cfg, 0);
+
+ // Only the decode loop sits between the timer start and mark.
+ aom_usec_timer t;
+ aom_usec_timer_start(&t);
+
+ for (decode_video.Begin(); decode_video.cxdata() != nullptr;
+ decode_video.Next()) {
+ decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
+ }
+
+ aom_usec_timer_mark(&t);
+ const double elapsed_secs =
+ static_cast<double>(aom_usec_timer_elapsed(&t)) / kUsecsInSec;
+ const unsigned decode_frames = decode_video.frame_number();
+ const double fps = static_cast<double>(decode_frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"type\" : \"decode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
+ printf("\t\"threadCount\" : %u,\n", threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", decode_frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AV1NewEncodeDecodePerfTest,
+ ::testing::Values(::libaom_test::kTwoPassGood));
+} // namespace
diff --git a/third_party/aom/test/decode_scalability_test.cc b/third_party/aom/test/decode_scalability_test.cc
new file mode 100644
index 0000000000..d66c8ec719
--- /dev/null
+++ b/third_party/aom/test/decode_scalability_test.cc
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Expected (temporal_id, spatial_id) pair of one decoded image; compared
+// against aom_image_t::temporal_id / spatial_id in DecompressedFrameHook().
+struct ObuExtensionHeader {
+ int temporal_id;
+ int spatial_id;
+};
+
+// One test case: the IVF file to decode and the repeating sequence of ids its
+// decoded images are expected to carry.
+struct DecodeParam {
+ const char *filename;
+ const ObuExtensionHeader *headers; // Array of num_headers entries.
+ size_t num_headers;
+};
+
+// Streams a short description of |dp| (its file name) into |os|.
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+ os << "file: ";
+ os << dp.filename;
+ return os;
+}
+
+// Decodes a stream with AV1D_SET_OUTPUT_ALL_LAYERS enabled and checks that the
+// temporal/spatial ids of the decoded images follow the expected sequence in
+// |headers_|, wrapping back to the start after |num_headers_| images.
+class DecodeScalabilityTest
+ : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+ DecodeScalabilityTest()
+ : DecoderTest(GET_PARAM(0)), headers_(GET_PARAM(1).headers),
+ num_headers_(GET_PARAM(1).num_headers) {}
+
+ ~DecodeScalabilityTest() override = default;
+
+ // Sends the output-all-layers control once, while the source is still at
+ // frame 0.
+ void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) override {
+ if (video.frame_number() == 0)
+ decoder->Control(AV1D_SET_OUTPUT_ALL_LAYERS, 1);
+ }
+
+ // Compares the image's ids with the current expected entry, then advances
+ // the index modulo num_headers_ (which must therefore be non-zero).
+ void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int /*frame_number*/) override {
+ const ObuExtensionHeader &header = headers_[header_index_];
+ EXPECT_EQ(img.temporal_id, header.temporal_id);
+ EXPECT_EQ(img.spatial_id, header.spatial_id);
+ header_index_ = (header_index_ + 1) % num_headers_;
+ }
+
+ // Opens the parameterized IVF file and runs the DecoderTest decode loop.
+ void RunTest() {
+ const DecodeParam input = GET_PARAM(1);
+ // NOTE(review): positional initializer; presumably threads, w, h,
+ // allow_lowbitdepth - confirm against aom_codec_dec_cfg_t.
+ aom_codec_dec_cfg_t cfg = { 1, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+ libaom_test::IVFVideoSource decode_video(input.filename);
+ decode_video.Init();
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+ }
+
+ private:
+ const ObuExtensionHeader *const headers_;
+ const size_t num_headers_;
+ size_t header_index_ = 0; // Next expected entry in headers_.
+};
+
+TEST_P(DecodeScalabilityTest, ObuExtensionHeader) { RunTest(); }
+
+// For all test files, we have:
+// operatingPoint = 0
+// OperatingPointIdc = operating_point_idc[ 0 ]
+
+// av1-1-b8-01-size-16x16.ivf:
+// operating_points_cnt_minus_1 = 0
+// operating_point_idc[ 0 ] = 0x0
+const ObuExtensionHeader kSize16x16Headers[1] = { { 0, 0 } };
+
+// av1-1-b8-22-svc-L1T2.ivf:
+// operating_points_cnt_minus_1 = 1
+// operating_point_idc[ 0 ] = 0x103
+// operating_point_idc[ 1 ] = 0x101
+const ObuExtensionHeader kL1T2Headers[2] = { { 0, 0 }, { 1, 0 } };
+
+// av1-1-b8-22-svc-L2T1.ivf:
+// operating_points_cnt_minus_1 = 1
+// operating_point_idc[ 0 ] = 0x301
+// operating_point_idc[ 1 ] = 0x101
+const ObuExtensionHeader kL2T1Headers[2] = { { 0, 0 }, { 0, 1 } };
+
+// av1-1-b8-22-svc-L2T2.ivf:
+// operating_points_cnt_minus_1 = 3
+// operating_point_idc[ 0 ] = 0x303
+// operating_point_idc[ 1 ] = 0x301
+// operating_point_idc[ 2 ] = 0x103
+// operating_point_idc[ 3 ] = 0x101
+const ObuExtensionHeader kL2T2Headers[4] = {
+ { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 1 }
+};
+
+const DecodeParam kAV1DecodeScalabilityTests[] = {
+ // { filename, headers, num_headers }
+ { "av1-1-b8-01-size-16x16.ivf", kSize16x16Headers, 1 },
+ { "av1-1-b8-22-svc-L1T2.ivf", kL1T2Headers, 2 },
+ { "av1-1-b8-22-svc-L2T1.ivf", kL2T1Headers, 2 },
+ { "av1-1-b8-22-svc-L2T2.ivf", kL2T2Headers, 4 },
+};
+
+AV1_INSTANTIATE_TEST_SUITE(DecodeScalabilityTest,
+ ::testing::ValuesIn(kAV1DecodeScalabilityTests));
+
+} // namespace
diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc
new file mode 100644
index 0000000000..f44d670556
--- /dev/null
+++ b/third_party/aom/test/decode_test_driver.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+const char kAV1Name[] = "AOMedia Project AV1 Decoder";
+
+// Forwards to aom_codec_peek_stream_info() using this decoder's codec
+// interface and returns its result; |stream_info| is filled in by that call.
+aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+ aom_codec_stream_info_t *stream_info) {
+ return aom_codec_peek_stream_info(CodecInterface(), cxdata, size,
+ stream_info);
+}
+
+// Convenience overload: decodes |size| bytes at |cxdata| with a null
+// user_priv pointer.
+aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+ return DecodeFrame(cxdata, size, nullptr);
+}
+
+// Initializes the decoder on first use, then passes the data and |user_priv|
+// to aom_codec_decode() and returns its status. The call is wrapped in
+// API_REGISTER_STATE_CHECK.
+aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+ void *user_priv) {
+ aom_codec_err_t res_dec;
+ InitOnce();
+ API_REGISTER_STATE_CHECK(
+ res_dec = aom_codec_decode(&decoder_, cxdata, size, user_priv));
+ return res_dec;
+}
+
+// Returns true when the codec interface's name starts with kAV1Name.
+bool Decoder::IsAV1() const {
+ const size_t prefix_len = sizeof(kAV1Name) - 1; // Exclude the trailing NUL.
+ return strncmp(kAV1Name, GetDecoderName(), prefix_len) == 0;
+}
+
+// Default peek handler: raises a fatal test failure unless |res_peek| is
+// AOM_CODEC_OK. The decoder and video arguments are unused here.
+void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/,
+ CompressedVideoSource * /*video*/,
+ const aom_codec_err_t res_peek) {
+ /* The Av1 implementation of PeekStream returns an error only if the
+ * data passed to it isn't a valid Av1 chunk. */
+ ASSERT_EQ(AOM_CODEC_OK, res_peek)
+ << "Peek return failed: " << aom_codec_err_to_string(res_peek);
+}
+
+// Main decode loop. Creates a decoder from |dec_cfg| and the stored flags,
+// feeds it every chunk of |video| and finally a null chunk to signal end of
+// file, calling the virtual hooks around each step. The loop stops early on
+// any recorded test failure or when HandleDecodeResult() returns false.
+void DecoderTest::RunLoop(CompressedVideoSource *video,
+ const aom_codec_dec_cfg_t &dec_cfg) {
+ // Owned by a unique_ptr so that the decoder is destroyed on every exit path,
+ // including the early returns taken by the ASSERT_* macros below.
+ const std::unique_ptr<Decoder> decoder(
+ codec_->CreateDecoder(dec_cfg, flags_));
+ ASSERT_NE(decoder, nullptr);
+ bool end_of_file = false;
+ bool peeked_stream = false;
+
+ // Decode frames.
+ for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
+ video->Next()) {
+ PreDecodeFrameHook(*video, decoder.get());
+
+ aom_codec_stream_info_t stream_info;
+ stream_info.is_annexb = 0;
+
+ if (video->cxdata() != nullptr) {
+ if (!peeked_stream) {
+ // TODO(yaowu): PeekStream returns error for non-sequence_header_obu,
+ // therefore should only be tried once per sequence, this shall be fixed
+ // once PeekStream is updated to properly operate on other obus.
+ const aom_codec_err_t res_peek = decoder->PeekStream(
+ video->cxdata(), video->frame_size(), &stream_info);
+ HandlePeekResult(decoder.get(), video, res_peek);
+ ASSERT_FALSE(::testing::Test::HasFailure());
+ peeked_stream = true;
+ }
+
+ aom_codec_err_t res_dec =
+ decoder->DecodeFrame(video->cxdata(), video->frame_size());
+ if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+ } else {
+ // Signal end of the file to the decoder.
+ const aom_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0);
+ ASSERT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ end_of_file = true;
+ }
+
+ DxDataIterator dec_iter = decoder->GetDxData();
+ const aom_image_t *img = nullptr;
+
+ // Get decompressed data
+ while (!::testing::Test::HasFailure() && (img = dec_iter.Next()))
+ DecompressedFrameHook(*img, video->frame_number());
+ }
+}
+
+// Runs the decode loop with a value-initialized decoder configuration. Note
+// that the cfg_ member stored by set_cfg() is not consulted here.
+void DecoderTest::RunLoop(CompressedVideoSource *video) {
+ aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+ RunLoop(video, dec_cfg);
+}
+
+// Stores a copy of |dec_cfg| in cfg_.
+void DecoderTest::set_cfg(const aom_codec_dec_cfg_t &dec_cfg) {
+ cfg_ = dec_cfg;
+}
+
+// Stores the flags that RunLoop() passes to CodecFactory::CreateDecoder().
+void DecoderTest::set_flags(const aom_codec_flags_t flags) { flags_ = flags; }
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/decode_test_driver.h b/third_party/aom/test/decode_test_driver.h
new file mode 100644
index 0000000000..311898ecf0
--- /dev/null
+++ b/third_party/aom/test/decode_test_driver.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_DECODE_TEST_DRIVER_H_
+#define AOM_TEST_DECODE_TEST_DRIVER_H_
+#include <cstring>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+
+namespace libaom_test {
+
+class CodecFactory;
+class CompressedVideoSource;
+
+// Provides an object to handle decoding output: a thin wrapper around
+// aom_codec_get_frame() that keeps the iteration state for one decoder
+// context. The context is referenced, not owned.
+class DxDataIterator {
+ public:
+ explicit DxDataIterator(aom_codec_ctx_t *decoder)
+ : decoder_(decoder), iter_(nullptr) {}
+
+ // Returns whatever aom_codec_get_frame() returns for the current iterator
+ // state; callers in this file loop until the result is null.
+ const aom_image_t *Next() { return aom_codec_get_frame(decoder_, &iter_); }
+
+ private:
+ aom_codec_ctx_t *decoder_;
+ aom_codec_iter_t iter_;
+};
+
+// Provides a simplified interface to manage one video decoding.
+// Similar to Encoder class, the exact services should be added
+// as more tests are added.
+//
+// The underlying aom_codec_ctx_t is zero-filled at construction and is
+// initialized lazily: every member that needs a live context calls InitOnce()
+// first. The destructor always calls aom_codec_destroy() on the context.
+class Decoder {
+ public:
+ explicit Decoder(aom_codec_dec_cfg_t cfg)
+ : cfg_(cfg), flags_(0), init_done_(false) {
+ memset(&decoder_, 0, sizeof(decoder_));
+ }
+
+ Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag)
+ : cfg_(cfg), flags_(flag), init_done_(false) {
+ memset(&decoder_, 0, sizeof(decoder_));
+ }
+
+ virtual ~Decoder() { aom_codec_destroy(&decoder_); }
+
+ // Wraps aom_codec_peek_stream_info() for this decoder's codec interface.
+ aom_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+ aom_codec_stream_info_t *stream_info);
+
+ // Decodes one chunk with a null user_priv pointer.
+ aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);
+
+ // Decodes one chunk, passing |user_priv| through to aom_codec_decode().
+ aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+ void *user_priv);
+
+ // Returns a fresh iterator over this decoder's output frames.
+ DxDataIterator GetDxData() { return DxDataIterator(&decoder_); }
+
+ // Issues an integer-valued control and expects AOM_CODEC_OK.
+ void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, AOM_CODEC_OK); }
+
+ // Issues a pointer-valued control and expects AOM_CODEC_OK.
+ void Control(int ctrl_id, const void *arg) {
+ InitOnce();
+ const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError();
+ }
+
+ // Issues an integer-valued control and expects |expected_value| as the
+ // result, which lets tests check controls that are meant to fail.
+ void Control(int ctrl_id, int arg, aom_codec_err_t expected_value) {
+ InitOnce();
+ const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg);
+ ASSERT_EQ(expected_value, res) << DecodeError();
+ }
+
+ // Returns the detailed error string when one is available, otherwise the
+ // generic error string for the context.
+ const char *DecodeError() {
+ const char *detail = aom_codec_error_detail(&decoder_);
+ return detail ? detail : aom_codec_error(&decoder_);
+ }
+
+ // Passes the external frame buffer information to libaom.
+ aom_codec_err_t SetFrameBufferFunctions(
+ aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *user_priv) {
+ InitOnce();
+ return aom_codec_set_frame_buffer_functions(&decoder_, cb_get, cb_release,
+ user_priv);
+ }
+
+ // Name reported by aom_codec_iface_name() for the codec interface.
+ const char *GetDecoderName() const {
+ return aom_codec_iface_name(CodecInterface());
+ }
+
+ // True when the decoder name starts with the AV1 decoder name.
+ bool IsAV1() const;
+
+ // Direct access to the context; does not trigger initialization.
+ aom_codec_ctx_t *GetDecoder() { return &decoder_; }
+
+ protected:
+ // Supplied by the codec-specific subclass.
+ virtual aom_codec_iface_t *CodecInterface() const = 0;
+
+ // Initializes the context on first call. init_done_ is set only after a
+ // successful init, because a failed ASSERT_EQ returns from this function.
+ void InitOnce() {
+ if (!init_done_) {
+ const aom_codec_err_t res =
+ aom_codec_dec_init(&decoder_, CodecInterface(), &cfg_, flags_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError();
+ init_done_ = true;
+ }
+ }
+
+ aom_codec_ctx_t decoder_;
+ aom_codec_dec_cfg_t cfg_;
+ aom_codec_flags_t flags_;
+ bool init_done_;
+};
+
+// Common test functionality for all Decoder tests. Subclasses override the
+// hooks below to customize what happens around each decoded chunk.
+class DecoderTest {
+ public:
+ // Main decoding loop
+ virtual void RunLoop(CompressedVideoSource *video);
+ virtual void RunLoop(CompressedVideoSource *video,
+ const aom_codec_dec_cfg_t &dec_cfg);
+
+ // Setters for the stored decoder configuration and init flags.
+ virtual void set_cfg(const aom_codec_dec_cfg_t &dec_cfg);
+ virtual void set_flags(const aom_codec_flags_t flags);
+
+ // Hook to be called before decompressing every frame.
+ virtual void PreDecodeFrameHook(const CompressedVideoSource & /*video*/,
+ Decoder * /*decoder*/) {}
+
+ // Hook to be called to handle decode result. Return true to continue.
+ // The default records a non-fatal failure on any result other than
+ // AOM_CODEC_OK and returns false in that case.
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ const CompressedVideoSource & /*video*/,
+ Decoder *decoder) {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ // Hook to be called on every decompressed frame.
+ virtual void DecompressedFrameHook(const aom_image_t & /*img*/,
+ const unsigned int /*frame_number*/) {}
+
+ // Hook to be called on peek result
+ virtual void HandlePeekResult(Decoder *const decoder,
+ CompressedVideoSource *video,
+ const aom_codec_err_t res_peek);
+
+ protected:
+ // |codec| is stored as a raw pointer and is not owned by this object.
+ explicit DecoderTest(const CodecFactory *codec)
+ : codec_(codec), cfg_(), flags_(0) {}
+
+ virtual ~DecoderTest() = default;
+
+ const CodecFactory *codec_;
+ aom_codec_dec_cfg_t cfg_;
+ aom_codec_flags_t flags_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_DECODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/decode_to_md5.sh b/third_party/aom/test/decode_to_md5.sh
new file mode 100755
index 0000000000..214755f216
--- /dev/null
+++ b/third_party/aom/test/decode_to_md5.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom decode_to_md5 example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to decode_to_md5_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: make sure an input can be obtained. Fails (returns 1)
+# only when av1_encode_available does not report "yes" AND $AV1_IVF_FILE does
+# not exist; either one alone is enough to proceed.
+decode_to_md5_verify_environment() {
+ if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is
+# interpreted as codec name and used solely to name the output file. $3 is the
+# expected md5 sum: It must match that of the final frame.
+# Returns 0 on a match; returns 1 when the tool is missing, the run fails, no
+# output file is produced, or the sums differ.
+decode_to_md5() {
+ local decoder="$(aom_tool_path decode_to_md5)"
+ local input_file="$1"
+ local codec="$2"
+ local expected_md5="$3"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ # NOTE(review): ${devnull} is not set in this file; presumably an output
+ # redirection provided by tools_common.sh - confirm.
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1
+
+ # The md5 is the first whitespace-separated field of the last output line.
+ # The second awk pass re-extracts the first field of that same value.
+ local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
+ local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
+ if [ "${actual_md5}" = "${expected_md5}" ]; then
+ return 0
+ else
+ elog "MD5 mismatch:"
+ elog "Expected: ${expected_md5}"
+ elog "Actual: ${actual_md5}"
+ return 1
+ fi
+}
+
+# Decodes ${AV1_IVF_FILE}, or a freshly encoded stand-in when that file does
+# not exist, and compares the last frame's MD5 against a hard-coded value.
+# Does nothing when av1_decode_available does not report "yes".
+DISABLED_decode_to_md5_av1() {
+ # expected MD5 sum for the last frame.
+ local expected_md5="567dd6d4b7a7170edddbf58bbcc3aff1"
+ local file="${AV1_IVF_FILE}"
+
+ # TODO(urvang): Check in the encoded file (like libvpx does) to avoid
+ # encoding every time.
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ if [ ! -e "${AV1_IVF_FILE}" ]; then
+ file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf || return 1
+ fi
+ decode_to_md5 "${file}" "av1" "${expected_md5}"
+ fi
+}
+
+# TODO(tomfinegan): Enable when the bitstream stabilizes.
+decode_to_md5_tests="DISABLED_decode_to_md5_av1"
+
+run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}"
diff --git a/third_party/aom/test/decode_with_drops.sh b/third_party/aom/test/decode_with_drops.sh
new file mode 100755
index 0000000000..1fc13ced35
--- /dev/null
+++ b/third_party/aom/test/decode_with_drops.sh
@@ -0,0 +1,68 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom decode_with_drops example. To add new tests to
+## this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to decode_with_drops_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: make sure an input can be obtained. Fails (returns 1)
+# only when av1_encode_available does not report "yes" AND $AV1_IVF_FILE does
+# not exist; either one alone is enough to proceed.
+decode_with_drops_verify_environment() {
+ if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely
+# to name the output file. $3 is the drop mode, and is passed directly to
+# decode_with_drops.
+# Returns 1 when the tool is missing, the run fails, or no output file exists
+# afterwards.
+decode_with_drops() {
+ local decoder="$(aom_tool_path decode_with_drops)"
+ local input_file="$1"
+ local codec="$2"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
+ local drop_mode="$3"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ # NOTE(review): ${devnull} is not set in this file; presumably an output
+ # redirection provided by tools_common.sh - confirm.
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ "${drop_mode}" ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1
+}
+
+
+# Decodes $AV1_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode. When $AV1_IVF_FILE does not exist, a file is
+# encoded into ${AOM_TEST_OUTPUT_DIR} first and used instead. Does nothing when
+# av1_decode_available does not report "yes".
+DISABLED_decode_with_drops_av1() {
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ local file="${AV1_IVF_FILE}"
+ if [ ! -e "${AV1_IVF_FILE}" ]; then
+ file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf || return 1
+ fi
+ # Drop frames 3 and 4.
+ decode_with_drops "${file}" "av1" "3-4" || return 1
+
+ # Test pattern mode: Drop 3 of every 4 frames.
+ decode_with_drops "${file}" "av1" "3/4" || return 1
+ fi
+}
+
+# TODO(yaowu): Disable this test as trailing_bit check is expected to fail
+decode_with_drops_tests="DISABLED_decode_with_drops_av1"
+
+run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}"
diff --git a/third_party/aom/test/deltaq_mode_test.cc b/third_party/aom/test/deltaq_mode_test.cc
new file mode 100644
index 0000000000..5960d276d1
--- /dev/null
+++ b/third_party/aom/test/deltaq_mode_test.cc
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "aom/aomcx.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+/*
+ Reproduces https://crbug.com/aomedia/3376. Emulates the command line:
+
+ ./aomenc --cpu-used=6 --threads=10 --cq-level=14 --passes=1 --limit=1 \
+ --lag-in-frames=0 --end-usage=q --deltaq-mode=3 --min-q=0 --max-q=63 \
+ -o output.av1 niklas_1280_720_30.y4m
+*/
+TEST(DeltaqModeTest, DeltaqMode3MultiThread) {
+ constexpr int kWidth = 1280;
+ constexpr int kHeight = 720;
+ // Dummy buffer of neutral gray samples.
+ constexpr size_t kBufferSize = kWidth * kHeight + kWidth * kHeight / 2;
+ std::vector<unsigned char> buffer(kBufferSize,
+ static_cast<unsigned char>(128));
+
+ // The setup steps below use ASSERT_*: |img|, |cfg| and |enc| start out
+ // uninitialized, so the test must not continue if filling them in fails.
+ aom_image_t img;
+ ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY),
+ AOM_CODEC_OK);
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_threads = 10;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 0;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 63;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_limit = 1;
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 14), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 3), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_set_option(&enc, "passes", "1"), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_STUDIO_RANGE),
+ AOM_CODEC_OK);
+
+ // Encode the single frame; exactly one key-frame packet is expected.
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// The implementation of multi-threading for deltaq-mode=3 in allintra
+// mode is based on row multi-threading.
+// This test ensures that when row MT is turned off, deltaq-mode=3 can still
+// encode a frame properly.
+TEST(DeltaqModeTest, DeltaqMode3MultiThreadNoRowMT) {
+ constexpr int kWidth = 1280;
+ constexpr int kHeight = 720;
+ // Dummy buffer of neutral gray samples.
+ constexpr size_t kBufferSize = kWidth * kHeight + kWidth * kHeight / 2;
+ std::vector<unsigned char> buffer(kBufferSize,
+ static_cast<unsigned char>(128));
+
+ // The setup steps below use ASSERT_*: |img|, |cfg| and |enc| start out
+ // uninitialized, so the test must not continue if filling them in fails.
+ aom_image_t img;
+ ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY),
+ AOM_CODEC_OK);
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_threads = 10;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 0;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 63;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_limit = 1;
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+ // Row multi-threading is disabled before the remaining controls are set.
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_ROW_MT, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 14), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 3), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_set_option(&enc, "passes", "1"), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_STUDIO_RANGE),
+ AOM_CODEC_OK);
+
+ // Encode the single frame; exactly one key-frame packet is expected.
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// 10-bit version of the DeltaqMode3MultiThread test.
+TEST(DeltaqModeTest, DeltaqMode3MultiThreadHighbd) {
+ constexpr int kWidth = 1280;
+ constexpr int kHeight = 720;
+ // Dummy buffer of 10-bit neutral gray samples.
+ constexpr size_t kBufferSize = kWidth * kHeight + kWidth * kHeight / 2;
+ std::vector<uint16_t> buffer(kBufferSize, 512);
+
+ // The setup steps below use ASSERT_*: |img|, |cfg| and |enc| start out
+ // uninitialized, so the test must not continue if filling them in fails.
+ aom_image_t img;
+ ASSERT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I42016, kWidth, kHeight, 1,
+ reinterpret_cast<unsigned char *>(buffer.data())));
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY),
+ AOM_CODEC_OK);
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_threads = 10;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 0;
+ cfg.g_bit_depth = AOM_BITS_10;
+ cfg.g_input_bit_depth = 10;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_min_quantizer = 0;
+ cfg.rc_max_quantizer = 63;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_limit = 1;
+ aom_codec_ctx_t enc;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 14), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_DELTAQ_MODE, 3), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_set_option(&enc, "passes", "1"), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_STUDIO_RANGE),
+ AOM_CODEC_OK);
+
+ // Encode the single frame; exactly one key-frame packet is expected.
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/disflow_test.cc b/third_party/aom/test/disflow_test.cc
new file mode 100644
index 0000000000..124c9a96c7
--- /dev/null
+++ b/third_party/aom/test/disflow_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+// Signature shared by aom_compute_flow_at_point_c and the SIMD variants under
+// test. The test seeds *u / *v before the call and compares them afterwards.
+using ComputeFlowAtPointFunc = void (*)(const uint8_t *src, const uint8_t *ref,
+ int x, int y, int width, int height,
+ int stride, double *u, double *v);
+
+// Fixture parameterized on the flow-at-point implementation under test.
+class ComputeFlowTest
+ : public ::testing::TestWithParam<ComputeFlowAtPointFunc> {
+ public:
+ ComputeFlowTest()
+ : target_func_(GetParam()),
+ rnd_(libaom_test::ACMRandom::DeterministicSeed()) {}
+
+ protected:
+ // Compares target_func_ against aom_compute_flow_at_point_c when
+ // run_times == 1, or times both implementations when run_times > 1.
+ void RunCheckOutput(int run_times);
+ // Implementation under test, taken from the test parameter.
+ ComputeFlowAtPointFunc target_func_;
+
+ // Random source constructed from ACMRandom::DeterministicSeed().
+ libaom_test::ACMRandom rnd_;
+};
+// The SIMD instantiations are compiled conditionally, so the suite is allowed
+// to have no instantiation at all.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ComputeFlowTest);
+
+// Runs the C reference and the function under test on the same point of the
+// same pair of luma planes, starting from identical (u, v) values.
+//  - run_times == 1: the resulting (u, v) pairs must match exactly.
+//  - run_times  > 1: both implementations are timed over run_times calls and
+//    the elapsed times and speedup are printed; no comparison is made.
+void ComputeFlowTest::RunCheckOutput(int run_times) {
+  constexpr int kWidth = 352;
+  constexpr int kHeight = 288;
+
+  ::libaom_test::YUVVideoSource video("bus_352x288_420_f20_b8.yuv",
+                                      AOM_IMG_FMT_I420, kWidth, kHeight, 30, 1,
+                                      0, 2);
+  // Use Y (Luminance) plane.
+  // NOTE(review): src and ref are both taken from video.img(); confirm that
+  // Next() does not refill the same buffer, otherwise src aliases ref.
+  video.Begin();
+  uint8_t *src = video.img()->planes[0];
+  ASSERT_NE(src, nullptr);
+  video.Next();
+  uint8_t *ref = video.img()->planes[0];
+  ASSERT_NE(ref, nullptr);
+
+  // Pick a random value between -5 and 5. The range was chosen arbitrarily as
+  // u and v can take any kind of value in practice, but it shouldn't change the
+  // outcome of the tests.
+  const double u_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+  double u_ref = u_rand;
+  double u_test = u_rand;
+
+  const double v_rand = (static_cast<double>(rnd_.Rand8()) / 255) * 10 - 5;
+  double v_ref = v_rand;
+  double v_test = v_rand;
+
+  // Pick a random point in the frame. If the frame is 352x288, that means we
+  // can call the function on all values of x comprised between 8 and 344, and
+  // all values of y comprised between 8 and 280.
+  const int x = rnd_((kWidth - 8) - 8 + 1) + 8;
+  const int y = rnd_((kHeight - 8) - 8 + 1) + 8;
+
+  aom_usec_timer ref_timer, test_timer;
+
+  // The plane stride is passed as kWidth for both implementations.
+  aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth, &u_ref,
+                              &v_ref);
+
+  target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+
+  if (run_times > 1) {
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < run_times; ++i) {
+      aom_compute_flow_at_point_c(src, ref, x, y, kWidth, kHeight, kWidth,
+                                  &u_ref, &v_ref);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const double elapsed_time_c =
+        static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, ref, x, y, kWidth, kHeight, kWidth, &u_test, &v_test);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const double elapsed_time_simd =
+        static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+    // aom_usec_timer reports microseconds, so label the values as "us".
+    printf("c_time=%fus \t simd_time=%fus \t speedup=%.2f\n", elapsed_time_c,
+           elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
+  } else {
+    ASSERT_EQ(u_ref, u_test);
+    ASSERT_EQ(v_ref, v_test);
+  }
+}
+
+// Single call per implementation; results must be bit-identical.
+TEST_P(ComputeFlowTest, CheckOutput) { RunCheckOutput(1); }
+
+// Timing run (disabled by default): 10,000,000 calls per implementation.
+TEST_P(ComputeFlowTest, DISABLED_Speed) { RunCheckOutput(10000000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ComputeFlowTest,
+ ::testing::Values(aom_compute_flow_at_point_sse4_1));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest,
+ ::testing::Values(aom_compute_flow_at_point_neon));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/divu_small_test.cc b/third_party/aom/test/divu_small_test.cc
new file mode 100644
index 0000000000..496fbc1f8e
--- /dev/null
+++ b/third_party/aom/test/divu_small_test.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom_dsp/odintrin.h"
+
+using libaom_test::ACMRandom;
+
+// Exhaustive check: for every divisor in [1, OD_DIVU_DMAX] and every dividend
+// in [1, 1000000], OD_DIVU_SMALL must agree with the native integer division.
+TEST(DivuSmallTest, TestDIVUuptoMAX) {
+  const uint32_t kLastDividend = 1000000;
+  for (int d = 1; d <= OD_DIVU_DMAX; ++d) {
+    for (uint32_t x = 1; x <= kLastDividend; ++x) {
+      GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d))
+          << "x=" << x << " d=" << d << " x/d=" << (x / d)
+          << " != " << OD_DIVU_SMALL(x, d);
+    }
+  }
+}
+
+// Randomized check: for every divisor in [1, OD_DIVU_DMAX], one million
+// random 31-bit dividends must divide identically through OD_DIVU_SMALL and
+// through the native integer division.
+TEST(DivuSmallTest, TestDIVUrandI31) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Inclusive upper bound, matching TestDIVUuptoMAX, so the largest supported
+  // divisor is exercised with large dividends as well.
+  for (int d = 1; d <= OD_DIVU_DMAX; d++) {
+    for (int i = 0; i < 1000000; i++) {
+      uint32_t x = rnd.Rand31();
+      GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d))
+          << "x=" << x << " d=" << d << " x/d=" << (x / d)
+          << " != " << OD_DIVU_SMALL(x, d);
+    }
+  }
+}
diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc
new file mode 100644
index 0000000000..3865810e9b
--- /dev/null
+++ b/third_party/aom/test/dr_prediction_test.cc
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+const int kZ1Start = 0;
+const int kZ2Start = 90;
+const int kZ3Start = 180;
+
+const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64,
+ TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32,
+ TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4,
+ TX_8X32, TX_32X8, TX_16X64, TX_64X16 };
+
+const char *const kTxSizeStrings[] = {
+ "TX_4X4", "TX_8X8", "TX_16X16", "TX_32X32", "TX_64X64",
+ "TX_4X8", "TX_8X4", "TX_8X16", "TX_16X8", "TX_16X32",
+ "TX_32X16", "TX_32X64", "TX_64X32", "TX_4X16", "TX_16X4",
+ "TX_8X32", "TX_32X8", "TX_16X64", "TX_64X16"
+};
+
+using libaom_test::ACMRandom;
+
+// Common signature every high-bitdepth (uint16_t) predictor is adapted to by
+// the z*_wrapper_hbd templates below.
+typedef void (*DrPred_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy, int bd);
+
+// Common signature every low-bitdepth (uint8_t) predictor is adapted to by
+// the z*_wrapper templates below.
+typedef void (*DrPred)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx, int dy,
+ int bd);
+
+// Native signature of the low-bitdepth zone-1 predictors: no upsample_left
+// and no bit-depth argument.
+typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy);
+// Adapts a Z1_Lbd function to the common DrPred signature by dropping the
+// arguments the native function does not take.
+template <Z1_Lbd fn>
+void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy, int bd) {
+ (void)bd;
+ (void)upsample_left;
+ fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy);
+}
+
+// Native signature of the low-bitdepth zone-2 predictors: both upsample flags
+// are taken, but there is no bit-depth argument.
+typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint8_t *above, const uint8_t *left,
+                       int upsample_above, int upsample_left, int dx, int dy);
+// Adapts a Z2_Lbd function to the common DrPred signature. Only bd is
+// dropped; upsample_left is forwarded, so it is not marked as unused.
+template <Z2_Lbd fn>
+void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                const uint8_t *above, const uint8_t *left, int upsample_above,
+                int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy);
+}
+
+// Native signature of the low-bitdepth zone-3 predictors: no upsample_above
+// and no bit-depth argument.
+typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy);
+// Adapts a Z3_Lbd function to the common DrPred signature by dropping the
+// arguments the native function does not take.
+template <Z3_Lbd fn>
+void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy, int bd) {
+ (void)bd;
+ (void)upsample_above;
+ fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy);
+}
+
+// Native signature of the high-bitdepth zone-1 predictors: no upsample_left.
+typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_above, int dx, int dy, int bd);
+// Adapts a Z1_Hbd function to the common DrPred_Hbd signature. Only
+// upsample_left is dropped; bd is forwarded, so it is not marked as unused.
+template <Z1_Hbd fn>
+void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int upsample_above, int upsample_left, int dx, int dy,
+                    int bd) {
+  (void)upsample_left;
+  fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd);
+}
+
+// Native signature of the high-bitdepth zone-2 predictors; it already matches
+// the common DrPred_Hbd signature.
+typedef void (*Z2_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_above, int upsample_left, int dx, int dy,
+                       int bd);
+// Pass-through adapter: every argument is forwarded unchanged, so nothing is
+// marked as unused.
+template <Z2_Hbd fn>
+void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int upsample_above, int upsample_left, int dx, int dy,
+                    int bd) {
+  fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy,
+     bd);
+}
+
+// Native signature of the high-bitdepth zone-3 predictors: no upsample_above.
+typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                       const uint16_t *above, const uint16_t *left,
+                       int upsample_left, int dx, int dy, int bd);
+// Adapts a Z3_Hbd function to the common DrPred_Hbd signature. Only
+// upsample_above is dropped; bd is forwarded, so it is not marked as unused.
+template <Z3_Hbd fn>
+void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+                    const uint16_t *above, const uint16_t *left,
+                    int upsample_above, int upsample_left, int dx, int dy,
+                    int bd) {
+  (void)upsample_above;
+  fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd);
+}
+
+// Test parameter: a reference predictor, an optional predictor under test
+// (nullptr when only the reference exists), the bit depth to run at, and the
+// first prediction angle of the zone being tested.
+template <typename FuncType>
+struct DrPredFunc {
+ DrPredFunc(FuncType pred = nullptr, FuncType tst = nullptr,
+ int bit_depth_value = 0, int start_angle_value = 0)
+ : ref_fn(pred), tst_fn(tst), bit_depth(bit_depth_value),
+ start_angle(start_angle_value) {}
+
+ FuncType ref_fn;
+ FuncType tst_fn;
+ int bit_depth;
+ int start_angle;
+};
+
+// Fixture that runs a reference directional predictor and an optional
+// predictor under test over every transform size in kTxSize and compares the
+// predicted blocks pixel by pixel. Pixel is the sample type, FuncType the
+// common predictor signature.
+template <typename Pixel, typename FuncType>
+class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
+ protected:
+ static const int kMaxNumTests = 10000;
+ static const int kIterations = 10;
+ static const int kDstStride = 64;
+ static const int kDstSize = kDstStride * kDstStride;
+ static const int kOffset = 16;
+ static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16;
+
+ // Reads the test parameter, derives the 90-degree angle range of the zone
+ // and fills the edge buffers with Rand8() values (for every Pixel type).
+ DrPredTest()
+ : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0),
+ bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) {
+ params_ = this->GetParam();
+ start_angle_ = params_.start_angle;
+ stop_angle_ = start_angle_ + 90;
+
+ dst_ref_ = &dst_ref_data_[0];
+ dst_tst_ = &dst_tst_data_[0];
+ dst_stride_ = kDstStride;
+ // The edge pointers start kOffset entries into their backing arrays.
+ above_ = &above_data_[kOffset];
+ left_ = &left_data_[kOffset];
+
+ for (int i = 0; i < kBufSize; ++i) {
+ above_data_[i] = rng_.Rand8();
+ left_data_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = 0;
+ }
+ }
+
+ ~DrPredTest() override = default;
+
+ // Runs ref_fn (and tst_fn when present) for the current block size, angle
+ // and upsample flags; kMaxNumTests times each for a speed test, once
+ // otherwise. tx indexes kTxSizeStrings for the timing printout.
+ void Predict(bool speedtest, int tx) {
+ const int kNumTests = speedtest ? kMaxNumTests : 1;
+ aom_usec_timer timer;
+ int tst_time = 0;
+
+ bd_ = params_.bit_depth;
+
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < kNumTests; ++k) {
+ params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_,
+ upsample_above_, upsample_left_, dx_, dy_, bd_);
+ }
+ aom_usec_timer_mark(&timer);
+ const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ if (params_.tst_fn) {
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < kNumTests; ++k) {
+ API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+ above_, left_, upsample_above_,
+ upsample_left_, dx_, dy_, bd_));
+ }
+ aom_usec_timer_mark(&timer);
+ tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ } else {
+ // No function under test: overwrite the reference output with the
+ // untouched test buffer, so the comparison in RunTest() sees equal data.
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_ref_[i] = dst_tst_[i];
+ }
+ }
+
+ OutputTimes(kNumTests, ref_time, tst_time, tx);
+ }
+
+ // For every transform size: resets the destination buffers, sets the block
+ // dimensions and upsample flags for p_angle, runs Predict() and asserts that
+ // the bw_ x bh_ outputs match. With needsaturation the edge buffers are
+ // first filled with the maximum value for the bit depth.
+ void RunTest(bool speedtest, bool needsaturation, int p_angle) {
+ bd_ = params_.bit_depth;
+
+ if (needsaturation) {
+ for (int i = 0; i < kBufSize; ++i) {
+ above_data_[i] = left_data_[i] = (1 << bd_) - 1;
+ }
+ }
+ for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+ if (params_.tst_fn == nullptr) {
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_tst_[i] = (1 << bd_) - 1;
+ dst_ref_[i] = (1 << bd_) - 1;
+ }
+ } else {
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = 0;
+ }
+ }
+
+ bw_ = tx_size_wide[kTxSize[tx]];
+ bh_ = tx_size_high[kTxSize[tx]];
+
+ if (enable_upsample_) {
+ upsample_above_ =
+ av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0);
+ upsample_left_ =
+ av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0);
+ } else {
+ upsample_above_ = upsample_left_ = 0;
+ }
+
+ Predict(speedtest, tx);
+
+ for (int r = 0; r < bh_; ++r) {
+ for (int c = 0; c < bw_; ++c) {
+ ASSERT_EQ(dst_ref_[r * dst_stride_ + c],
+ dst_tst_[r * dst_stride_ + c])
+ << bw_ << "x" << bh_ << " r: " << r << " c: " << c
+ << " dx: " << dx_ << " dy: " << dy_
+ << " upsample_above: " << upsample_above_
+ << " upsample_left: " << upsample_left_;
+ }
+ }
+ }
+ }
+
+ // Prints the timings for one transform size; silent unless num_tests > 1.
+ void OutputTimes(int num_tests, int ref_time, int tst_time, int tx) {
+ if (num_tests > 1) {
+ if (params_.tst_fn) {
+ const float x = static_cast<float>(ref_time) / tst_time;
+ printf("\t[%8s] :: ref time %6d, tst time %6d %3.2f\n",
+ kTxSizeStrings[tx], ref_time, tst_time, x);
+ } else {
+ printf("\t[%8s] :: ref time %6d\n", kTxSizeStrings[tx], ref_time);
+ }
+ }
+ }
+
+ // Shared driver. speed == 0: every angle in [start_angle_, stop_angle_) is
+ // checked. speed != 0: only start_angle_ + {3, 45, 87} are run, as speed
+ // tests. Does nothing when there is no function under test; angles whose dx
+ // or dy is zero are skipped.
+ void RundrPredTest(const int speed) {
+ if (params_.tst_fn == nullptr) return;
+ const int angles[] = { 3, 45, 87 };
+ const int start_angle = speed ? 0 : start_angle_;
+ const int stop_angle = speed ? 3 : stop_angle_;
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int i = start_angle; i < stop_angle; ++i) {
+ const int angle = speed ? angles[i] + start_angle_ : i;
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (speed) {
+ printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+ enable_upsample_, angle);
+ }
+ if (dx_ && dy_) RunTest(speed, false, angle);
+ }
+ }
+ }
+
+ // Destination blocks written by ref_fn and tst_fn respectively.
+ Pixel dst_ref_data_[kDstSize];
+ Pixel dst_tst_data_[kDstSize];
+
+ // Backing storage for the edge pixels. dummy_data_ is never referenced in
+ // this class. NOTE(review): possibly padding between the two edges; confirm
+ // before removing.
+ Pixel left_data_[kBufSize];
+ Pixel dummy_data_[kBufSize];
+ Pixel above_data_[kBufSize];
+
+ Pixel *dst_ref_;
+ Pixel *dst_tst_;
+ Pixel *above_;
+ Pixel *left_;
+ int dst_stride_;
+
+ // State consumed by Predict(); set by RunTest() and the test bodies.
+ int enable_upsample_;
+ int upsample_above_;
+ int upsample_left_;
+ int bw_;
+ int bh_;
+ int dx_;
+ int dy_;
+ int bd_;
+ TX_SIZE txsize_;
+
+ // Angle range [start_angle_, stop_angle_) covered by the zone under test.
+ int start_angle_;
+ int stop_angle_;
+
+ ACMRandom rng_;
+
+ DrPredFunc<FuncType> params_;
+};
+
+// 8-bit pixel instantiation of the fixture.
+class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
+
+// Runs every angle of the zone, with and without edge upsampling, on edge
+// buffers filled with the maximum pixel value. Angles whose dx or dy is zero
+// are skipped.
+TEST_P(LowbdDrPredTest, SaturatedValues) {
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (dx_ && dy_) RunTest(false, true, angle);
+ }
+ }
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+ C, LowbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ nullptr, AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ nullptr, AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ nullptr, AOM_BITS_8, kZ3Start)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// 16-bit pixel instantiation of the fixture.
+class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
+
+// Runs every angle of the zone, with and without edge upsampling, on edge
+// buffers filled with the maximum value for the bit depth. Angles whose dx or
+// dy is zero are skipped.
+TEST_P(HighbdDrPredTest, SaturatedValues) {
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (dx_ && dy_) RunTest(false, true, angle);
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ C, HighbdDrPredTest,
+ ::testing::Values(
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ nullptr, AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ nullptr, AOM_BITS_10, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ nullptr, AOM_BITS_12, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ nullptr, AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ nullptr, AOM_BITS_10, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ nullptr, AOM_BITS_12, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ nullptr, AOM_BITS_8, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ nullptr, AOM_BITS_10, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ nullptr, AOM_BITS_12, kZ3Start)));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Correctness run over every angle of the zone (speed == 0).
+TEST_P(LowbdDrPredTest, OperationCheck) { RundrPredTest(0); }
+
+// Timing run over three representative angles (disabled by default).
+TEST_P(LowbdDrPredTest, DISABLED_Speed) { RundrPredTest(1); }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Correctness run over every angle of the zone. The shared driver returns
+// early when there is no function under test, sweeps both upsample settings
+// and skips angles whose dx or dy is zero.
+TEST_P(HighbdDrPredTest, OperationCheck) { RundrPredTest(0); }
+
+// Timing run (disabled by default): three representative angles of the zone,
+// each with and without edge upsampling.
+TEST_P(HighbdDrPredTest, DISABLED_Speed) {
+  const int kAngleOffsets[] = { 3, 45, 87 };
+  for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+    for (const int offset : kAngleOffsets) {
+      const int angle = start_angle_ + offset;
+      dx_ = av1_get_dx(angle);
+      dy_ = av1_get_dy(angle);
+      printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+             enable_upsample_, angle);
+      // Angles with a zero step in either direction are not run.
+      if (dx_ != 0 && dy_ != 0) RunTest(true, false, angle);
+    }
+  }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, LowbdDrPredTest,
+ ::testing::Values(
+ DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ &z1_wrapper<av1_dr_prediction_z1_sse4_1>, AOM_BITS_8,
+ kZ1Start),
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ &z2_wrapper<av1_dr_prediction_z2_sse4_1>, AOM_BITS_8,
+ kZ2Start),
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ &z3_wrapper<av1_dr_prediction_z3_sse4_1>, AOM_BITS_8,
+ kZ3Start)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, LowbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ &z1_wrapper<av1_dr_prediction_z1_avx2>,
+ AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ &z2_wrapper<av1_dr_prediction_z2_avx2>,
+ AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ &z3_wrapper<av1_dr_prediction_z3_avx2>,
+ AOM_BITS_8, kZ3Start)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, HighbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
+ AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
+ AOM_BITS_10, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
+ AOM_BITS_12, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
+ AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
+ AOM_BITS_10, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
+ AOM_BITS_12, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>,
+ AOM_BITS_8, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>,
+ AOM_BITS_10, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>,
+ AOM_BITS_12, kZ3Start)));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, LowbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ &z1_wrapper<av1_dr_prediction_z1_neon>,
+ AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ &z2_wrapper<av1_dr_prediction_z2_neon>,
+ AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ &z3_wrapper<av1_dr_prediction_z3_neon>,
+ AOM_BITS_8, kZ3Start)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HighbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
+ AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
+ AOM_BITS_10, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_neon>,
+ AOM_BITS_12, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>,
+ AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>,
+ AOM_BITS_10, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_neon>,
+ AOM_BITS_12, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
+ AOM_BITS_8, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
+ AOM_BITS_10, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_neon>,
+ AOM_BITS_12, kZ3Start)));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/dropframe_encode_test.cc b/third_party/aom/test/dropframe_encode_test.cc
new file mode 100644
index 0000000000..4a54c0b95c
--- /dev/null
+++ b/third_party/aom/test/dropframe_encode_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// Params: test mode, threads.
+// Encoder fixture parameterized on the test mode and the thread count.
+class DropFrameEncodeTestLarge
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+ unsigned int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ // GET_PARAM(0) is passed to EncoderTest, GET_PARAM(1) is the test mode used
+ // in SetUp(), GET_PARAM(2) is the thread count.
+ DropFrameEncodeTestLarge()
+ : EncoderTest(GET_PARAM(0)), frame_number_(0), threads_(GET_PARAM(2)) {}
+
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+
+ // Records the current frame index and sets AOME_SET_CPUUSED to 1 once, on
+ // the first frame.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ frame_number_ = video->frame();
+ if (frame_number_ == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 1);
+ }
+ }
+
+ unsigned int frame_number_;
+ unsigned int threads_;
+};
+
+// Test to reproduce the assertion failure related to buf->display_idx in
+// init_gop_frames_for_tpl() and segmentation fault reported in aomedia:3372
+// while encoding with --drop-frame=1.
+TEST_P(DropFrameEncodeTestLarge, TestNoMisMatch) {
+ // One-pass CBR with rc_buf_sz and rc_dropframe_thresh both set to 1.
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1;
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.g_threads = threads_;
+
+ ::libaom_test::I420VideoSource video("desktopqvga2.320_240.yuv", 320, 240, 30,
+ 1, 0, 100);
+
+ // The test passes if the encode loop finishes without a fatal failure.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(DropFrameEncodeTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood),
+ ::testing::Values(1, 4));
+
+} // namespace
diff --git a/third_party/aom/test/dump_obu.sh b/third_party/aom/test/dump_obu.sh
new file mode 100755
index 0000000000..933db64a6a
--- /dev/null
+++ b/third_party/aom/test/dump_obu.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom dump_obu tool. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to dump_obu_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+readonly dump_obu_test_file="${AOM_TEST_OUTPUT_DIR}/av1_obu_test.ivf"
+
+# Verifies the environment for the dump_obu tests: the raw YUV input must
+# exist and, when dump_obu_available reports yes, the dump_obu tool must be
+# locatable. Logs the problem and returns 1 if either check fails.
+dump_obu_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ "$(dump_obu_available)" = "yes" ]; then
+    if [ -z "$(aom_tool_path dump_obu)" ]; then
+      elog "dump_obu not found in LIBAOM_BIN_PATH, its parent, or child tools/."
+      # A missing tool is a failed environment check, not just a log line.
+      return 1
+    fi
+  fi
+}
+
+dump_obu_available() {
+ if [ "$(av1_decode_available)" = "yes" ] && \
+ [ "$(av1_encode_available)" = "yes" ]; then
+ echo yes
+ fi
+}
+
+aomenc_available() {
+ if [ -x "$(aom_tool_path aomenc)" ]; then
+ echo yes
+ fi
+}
+
+encode_test_file() {
+ if [ "$(aomenc_available)" = "yes" ]; then
+ local encoder="$(aom_tool_path aomenc)"
+ if [ "$(realtime_only_build)" = "yes" ]; then
+ eval "${encoder}" \
+ $(aomenc_encode_test_rt_params) \
+ $(yuv_raw_input) \
+ --ivf \
+ --output=${dump_obu_test_file} \
+ ${devnull} || return 1
+ else
+ eval "${encoder}" \
+ $(aomenc_encode_test_fast_params) \
+ $(yuv_raw_input) \
+ --ivf \
+ --output=${dump_obu_test_file} \
+ ${devnull} || return 1
+ fi
+ if [ ! -e "${dump_obu_test_file}" ]; then
+ elog "dump_obu test input encode failed."
+ return 1
+ fi
+ fi
+}
+
+# Test body: encodes the test file, then runs the dump_obu tool on it. Returns
+# 1 if the encode step fails, otherwise the status of the dump_obu run.
+dump_obu() {
+ encode_test_file || return 1
+ eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull}
+}
+
+dump_obu_tests="dump_obu"
+
+run_tests dump_obu_verify_environment "${dump_obu_tests}"
diff --git a/third_party/aom/test/ec_test.cc b/third_party/aom/test/ec_test.cc
new file mode 100644
index 0000000000..a5284deac0
--- /dev/null
+++ b/third_party/aom/test/ec_test.cc
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include <cstdlib>
+#include <memory>
+#include <new>
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/entdec.h"
+
+// Round-trips random binary symbols through the entropy encoder and decoder.
+// Each symbol is encoded with either the bool or the 2-symbol CDF routine and
+// decoded with an independently chosen routine; decoded symbols and the
+// fractional bit positions must match. The seed comes from EC_TEST_SEED when
+// that environment variable is set. Finishes with two fixed sequences that
+// exercise od_ec_enc_patch_initial_bits().
+TEST(EC_TEST, random_ec_test) {
+  od_ec_enc enc;
+  od_ec_dec dec;
+  int sz;
+  int i;
+  int ret;
+  unsigned int seed;
+  unsigned char *ptr;
+  uint32_t ptr_sz;
+  char *seed_str;
+  ret = 0;
+  seed_str = getenv("EC_TEST_SEED");
+  if (seed_str) {
+    seed = atoi(seed_str);
+  } else {
+    seed = 0xdaa1a;
+  }
+  srand(seed);
+  od_ec_enc_init(&enc, 1);
+  /*Test compatibility between multiple different encode/decode routines.*/
+  for (i = 0; i < 409600; i++) {
+    int j;
+    sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U);
+    std::unique_ptr<unsigned[]> fz(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(fz, nullptr);
+    std::unique_ptr<unsigned[]> fts(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(fts, nullptr);
+    std::unique_ptr<unsigned[]> data(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(data, nullptr);
+    std::unique_ptr<unsigned[]> tell(new (std::nothrow) unsigned[sz + 1]);
+    ASSERT_NE(tell, nullptr);
+    std::unique_ptr<unsigned[]> enc_method(new (std::nothrow) unsigned[sz]);
+    ASSERT_NE(enc_method, nullptr);
+    od_ec_enc_reset(&enc);
+    // tell[k] records the encoder position after k symbols.
+    tell[0] = od_ec_enc_tell_frac(&enc);
+    for (j = 0; j < sz; j++) {
+      data[j] = rand() / ((RAND_MAX >> 1) + 1);
+
+      fts[j] = CDF_PROB_BITS;
+      fz[j] = (rand() % (CDF_PROB_TOP - 2)) >> (CDF_PROB_BITS - fts[j]);
+      fz[j] = OD_MAXI(fz[j], 1);
+      enc_method[j] = 3 + (rand() & 1);
+      switch (enc_method[j]) {
+        case 3: {
+          od_ec_encode_bool_q15(&enc, data[j],
+                                OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+          break;
+        }
+        case 4: {
+          uint16_t cdf[2];
+          cdf[0] = OD_ICDF(fz[j]);
+          cdf[1] = OD_ICDF(1U << fts[j]);
+          od_ec_encode_cdf_q15(&enc, data[j], cdf, 2);
+          break;
+        }
+      }
+
+      tell[j + 1] = od_ec_enc_tell_frac(&enc);
+    }
+    ptr = od_ec_enc_done(&enc, &ptr_sz);
+    ASSERT_NE(ptr, nullptr);
+    EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz)
+        << "od_ec_enc_tell() lied: "
+           "there's "
+        << ptr_sz << " bytes instead of " << ((od_ec_enc_tell(&enc) + 7) >> 3)
+        << " (Random seed: " << seed << ")\n";
+    od_ec_dec_init(&dec, ptr, ptr_sz);
+    EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[0])
+        << "od_ec_dec_tell() mismatch between encoder and decoder "
+           "at symbol 0: "
+        << (unsigned)od_ec_dec_tell_frac(&dec) << " instead of " << tell[0]
+        << " (Random seed: " << seed << ").\n";
+    for (j = 0; j < sz; j++) {
+      int dec_method;
+      unsigned int sym = data[j] + 1;  // Initialize sym to an invalid value.
+
+      // The decode routine is chosen independently of the encode routine.
+      dec_method = 3 + (rand() & 1);
+
+      switch (dec_method) {
+        case 3: {
+          sym = od_ec_decode_bool_q15(
+              &dec, OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+          break;
+        }
+        case 4: {
+          uint16_t cdf[2];
+          cdf[0] = OD_ICDF(fz[j]);
+          cdf[1] = OD_ICDF(1U << fts[j]);
+          sym = od_ec_decode_cdf_q15(&dec, cdf, 2);
+          break;
+        }
+      }
+
+      EXPECT_EQ(sym, data[j])
+          << "Decoded " << sym << " instead of " << data[j]
+          << " with fz=" << fz[j] << " and ftb=" << fts[j] << " at position "
+          << j << " of " << sz << " (Random seed: " << seed << ").\n"
+          << "Encoding method: " << enc_method[j]
+          << " decoding method: " << dec_method << "\n";
+      EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[j + 1])
+          << "od_ec_dec_tell() mismatch between encoder and "
+             "decoder at symbol "
+          << j + 1 << ": " << (unsigned)od_ec_dec_tell_frac(&dec)
+          << " instead of " << tell[j + 1] << " (Random seed: " << seed
+          << ").\n";
+    }
+  }
+  // Fixed sequence 1: the first patch must succeed, the second must set the
+  // encoder's error flag.
+  od_ec_enc_reset(&enc);
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+  od_ec_enc_patch_initial_bits(&enc, 3, 2);
+  EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+  od_ec_enc_patch_initial_bits(&enc, 0, 5);
+  EXPECT_TRUE(enc.error)
+      << "od_ec_enc_patch_initial_bits() didn't fail when it should have.\n";
+  // Fixed sequence 2: patched output must be two bytes, the first being 63.
+  od_ec_enc_reset(&enc);
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+  od_ec_encode_bool_q15(&enc, 1, OD_ICDF(32256));
+  od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+  od_ec_enc_patch_initial_bits(&enc, 0, 2);
+  EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+  ptr = od_ec_enc_done(&enc, &ptr_sz);
+  ASSERT_NE(ptr, nullptr);
+  EXPECT_EQ(ptr_sz, 2u);
+  // ptr[0] is an unsigned char; cast it so the message shows the numeric
+  // value instead of a raw character.
+  EXPECT_EQ(ptr[0], 63)
+      << "Got " << static_cast<unsigned>(ptr[0])
+      << " when expecting 63 for od_ec_enc_patch_initial_bits().\n";
+  od_ec_enc_clear(&enc);
+  EXPECT_EQ(ret, 0);
+}
diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc
new file mode 100644
index 0000000000..aa4084f9e4
--- /dev/null
+++ b/third_party/aom/test/encode_api_test.cc
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+
+namespace {
+
+// Encoder usage passed to aom_codec_enc_config_default() by most tests in this
+// file: good quality by default, real-time when CONFIG_REALTIME_ONLY is set.
+#if !CONFIG_REALTIME_ONLY
+const unsigned int kUsage = AOM_USAGE_GOOD_QUALITY;
+#else
+const unsigned int kUsage = AOM_USAGE_REALTIME;
+#endif
+
+// Stores `val` (converted to uint16_t) into the first `length` 16-bit elements
+// of `dest`. Note that `length` counts elements, not bytes. Returns `dest`.
+static void *Memset16(void *dest, int val, size_t length) {
+  uint16_t *const out = static_cast<uint16_t *>(dest);
+  for (size_t i = 0; i != length; ++i) {
+    out[i] = val;
+  }
+  return dest;
+}
+
+// Checks that the encoder API entry points report AOM_CODEC_INVALID_PARAM for
+// null or out-of-range arguments, and that a default configuration can still
+// be initialized, queried for global headers, flushed, and destroyed.
+TEST(EncodeAPI, InvalidParams) {
+  uint8_t buf[1] = { 0 };
+  aom_image_t img;
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+
+  EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf));
+
+  // Calls made with a null context and/or a null codec interface.
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_init(nullptr, nullptr, nullptr, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_init(&enc, nullptr, nullptr, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_encode(nullptr, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(nullptr, &img, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(nullptr));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_config_default(nullptr, nullptr, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_config_default(nullptr, &cfg, 0));
+  EXPECT_NE(aom_codec_error(nullptr), nullptr);
+
+  // Calls made with the AV1 encoder interface but a missing context or
+  // configuration, or with a usage value (3) that is expected to be rejected.
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  SCOPED_TRACE(aom_codec_iface_name(iface));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_init(nullptr, iface, nullptr, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_init(&enc, iface, nullptr, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+            aom_codec_enc_config_default(iface, &cfg, 3));
+
+  // Oversized frame dimensions must be rejected by aom_codec_enc_init(). The
+  // configuration is reset to the defaults before each case.
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = 1 << 16;
+  cfg.g_h = (1 << 14) + 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = (1 << 14) + 1;
+  cfg.g_h = 1 << 16;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_forced_max_frame_width = 1 << 16;
+  cfg.g_forced_max_frame_height = (1 << 14) + 1;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_forced_max_frame_width = (1 << 14) + 1;
+  cfg.g_forced_max_frame_height = 1 << 16;
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+  // A default configuration must initialize successfully.
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(nullptr, aom_codec_get_global_headers(nullptr));
+
+  // Check the returned pointer itself before looking at its buffer, so that a
+  // null return is reported as a test failure rather than dereferenced.
+  aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
+  EXPECT_NE(glob_headers, nullptr);
+  if (glob_headers) {
+    EXPECT_NE(glob_headers->buf, nullptr);
+    free(glob_headers->buf);
+    free(glob_headers);
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// An initialized encoder must reject invalid control IDs: -1 is expected to
+// yield AOM_CODEC_ERROR and 0 is expected to yield AOM_CODEC_INVALID_PARAM.
+TEST(EncodeAPI, InvalidControlId) {
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(av1_iface, &cfg, kUsage));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, av1_iface, &cfg, 0));
+
+  EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0));
+  EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0));
+
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// Requests an S-frame on the very first frame of a 2x128 stream. The request
+// is expected to be rejected, either by the encode call itself or by the
+// flush call that follows it. `fmt` is the image format handed to
+// aom_img_wrap() and `flag` is the flags argument of aom_codec_enc_init().
+void EncodeSetSFrameOnFirstFrame(aom_img_fmt fmt, aom_codec_flags_t flag) {
+  constexpr int kWidth = 2;
+  constexpr int kHeight = 128;
+  // Zero-filled sample storage handed to aom_img_wrap().
+  unsigned char samples[kWidth * kHeight * 3] = { 0 };
+  aom_image_t img;
+  ASSERT_EQ(aom_img_wrap(&img, fmt, kWidth, kHeight, 1, samples), &img);
+
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, kUsage),
+            AOM_CODEC_OK);
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, av1_iface, &cfg, flag), AOM_CODEC_OK);
+
+  // One of these aom_codec_encode() calls should fail: if the encode call
+  // with AOM_EFLAG_SET_S_FRAME is accepted, the flush call must not be.
+  const aom_codec_err_t encode_status =
+      aom_codec_encode(&enc, &img, 0, 1, AOM_EFLAG_SET_S_FRAME);
+  if (encode_status == AOM_CODEC_OK) {
+    EXPECT_NE(aom_codec_encode(&enc, nullptr, 0, 0, 0), AOM_CODEC_OK);
+  }
+  EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// I420 image with an encoder initialized without any flags.
+TEST(EncodeAPI, SetSFrameOnFirstFrame) {
+  EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I420, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// I42016 image with an encoder initialized with AOM_CODEC_USE_HIGHBITDEPTH.
+TEST(EncodeAPI, SetSFrameOnFirstFrameHighbd) {
+  EncodeSetSFrameOnFirstFrame(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+TEST(EncodeAPI, MonochromeInProfiles) {  // cfg.monochrome = 1 with g_profile 0, 1 and 2.
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.monochrome = 1;
+ aom_codec_ctx_t enc;
+
+ // Test Profile 0: initialization with monochrome set is expected to succeed.
+ cfg.g_profile = 0;
+ ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+
+ // Test Profile 1: initialization is expected to fail with INVALID_PARAM.
+ cfg.g_profile = 1;
+ ASSERT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ // Test Profile 2: initialization with monochrome set is expected to succeed.
+ cfg.g_profile = 2;
+ ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// Encodes one I420 frame with an encoder initialized without
+// AOM_CODEC_USE_HIGHBITDEPTH; the encode call is expected to succeed.
+TEST(EncodeAPI, LowBDEncoderLowBDImage) {
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, kUsage),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, av1_iface, &cfg, 0), AOM_CODEC_OK);
+
+  aom_image_t *const frame =
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+  ASSERT_NE(frame, nullptr);
+
+  // Set the image to two colors so that av1_set_screen_content_options() will
+  // call av1_get_perpixel_variance(): luma rows alternate between 0 and 255.
+  for (unsigned int row = 0; row < frame->d_h; ++row) {
+    const int luma_value = (row % 2 == 0) ? 0 : 255;
+    memset(frame->planes[0] + row * frame->stride[0], luma_value, frame->d_w);
+  }
+  // Both chroma planes are filled with the constant value 128.
+  const unsigned int chroma_rows = (frame->d_h + 1) / 2;
+  const unsigned int chroma_cols = (frame->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    memset(frame->planes[1] + row * frame->stride[1], 128, chroma_cols);
+    memset(frame->planes[2] + row * frame->stride[2], 128, chroma_cols);
+  }
+
+  ASSERT_EQ(aom_codec_encode(&enc, frame, 0, 1, 0), AOM_CODEC_OK);
+
+  aom_img_free(frame);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Encodes one I42016 frame with an encoder initialized with
+// AOM_CODEC_USE_HIGHBITDEPTH; the encode call is expected to succeed. Without
+// CONFIG_AV1_HIGHBITDEPTH the initialization itself must report
+// AOM_CODEC_INCAPABLE.
+TEST(EncodeAPI, HighBDEncoderHighBDImage) {
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, kUsage),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  const aom_codec_err_t init_status =
+      aom_codec_enc_init(&enc, av1_iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH);
+#if !CONFIG_AV1_HIGHBITDEPTH
+  ASSERT_EQ(init_status, AOM_CODEC_INCAPABLE);
+#else
+  ASSERT_EQ(init_status, AOM_CODEC_OK);
+
+  aom_image_t *const frame =
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+  ASSERT_NE(frame, nullptr);
+
+  // Set the image to two colors so that av1_set_screen_content_options() will
+  // call av1_get_perpixel_variance(): luma rows alternate between 0 and 255.
+  for (unsigned int row = 0; row < frame->d_h; ++row) {
+    const int luma_value = (row % 2 == 0) ? 0 : 255;
+    Memset16(frame->planes[0] + row * frame->stride[0], luma_value,
+             frame->d_w);
+  }
+  // Both chroma planes are filled with the constant value 128.
+  const unsigned int chroma_rows = (frame->d_h + 1) / 2;
+  const unsigned int chroma_cols = (frame->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    Memset16(frame->planes[1] + row * frame->stride[1], 128, chroma_cols);
+    Memset16(frame->planes[2] + row * frame->stride[2], 128, chroma_cols);
+  }
+
+  ASSERT_EQ(aom_codec_encode(&enc, frame, 0, 1, 0), AOM_CODEC_OK);
+
+  aom_img_free(frame);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Feeds an I420 frame to an encoder initialized with
+// AOM_CODEC_USE_HIGHBITDEPTH; the encode call is expected to be rejected with
+// AOM_CODEC_INVALID_PARAM. Without CONFIG_AV1_HIGHBITDEPTH the initialization
+// itself must report AOM_CODEC_INCAPABLE.
+TEST(EncodeAPI, HighBDEncoderLowBDImage) {
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, kUsage),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  const aom_codec_err_t init_status =
+      aom_codec_enc_init(&enc, av1_iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH);
+#if !CONFIG_AV1_HIGHBITDEPTH
+  ASSERT_EQ(init_status, AOM_CODEC_INCAPABLE);
+#else
+  ASSERT_EQ(init_status, AOM_CODEC_OK);
+
+  aom_image_t *const frame =
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h, 0);
+  ASSERT_NE(frame, nullptr);
+
+  // Set the image to two colors so that av1_set_screen_content_options() will
+  // call av1_get_perpixel_variance(): luma rows alternate between 0 and 255.
+  for (unsigned int row = 0; row < frame->d_h; ++row) {
+    const int luma_value = (row % 2 == 0) ? 0 : 255;
+    memset(frame->planes[0] + row * frame->stride[0], luma_value, frame->d_w);
+  }
+  // Both chroma planes are filled with the constant value 128.
+  const unsigned int chroma_rows = (frame->d_h + 1) / 2;
+  const unsigned int chroma_cols = (frame->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    memset(frame->planes[1] + row * frame->stride[1], 128, chroma_cols);
+    memset(frame->planes[2] + row * frame->stride[2], 128, chroma_cols);
+  }
+
+  ASSERT_EQ(aom_codec_encode(&enc, frame, 0, 1, 0), AOM_CODEC_INVALID_PARAM);
+
+  aom_img_free(frame);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+// Feeds an I42016 frame to an encoder initialized without
+// AOM_CODEC_USE_HIGHBITDEPTH; the encode call is expected to be rejected with
+// AOM_CODEC_INVALID_PARAM.
+TEST(EncodeAPI, LowBDEncoderHighBDImage) {
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, kUsage),
+            AOM_CODEC_OK);
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, av1_iface, &cfg, 0), AOM_CODEC_OK);
+
+  aom_image_t *const frame =
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+  ASSERT_NE(frame, nullptr);
+
+  // Set the image to two colors so that av1_set_screen_content_options() will
+  // call av1_get_perpixel_variance(): luma rows alternate between 0 and 255.
+  for (unsigned int row = 0; row < frame->d_h; ++row) {
+    const int luma_value = (row % 2 == 0) ? 0 : 255;
+    Memset16(frame->planes[0] + row * frame->stride[0], luma_value,
+             frame->d_w);
+  }
+  // Both chroma planes are filled with the constant value 128.
+  const unsigned int chroma_rows = (frame->d_h + 1) / 2;
+  const unsigned int chroma_cols = (frame->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    Memset16(frame->planes[1] + row * frame->stride[1], 128, chroma_cols);
+    Memset16(frame->planes[2] + row * frame->stride[2], 128, chroma_cols);
+  }
+
+  ASSERT_EQ(aom_codec_encode(&enc, frame, 0, 1, 0), AOM_CODEC_INVALID_PARAM);
+
+  aom_img_free(frame);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Allocates a w x h image of format `fmt` with aom_img_alloc() and fills the
+// three planes with the byte value 128. Returns nullptr if the allocation
+// fails; otherwise the caller releases the image with aom_img_free().
+// NOTE(review): the chroma planes are filled as (d_w + 1) / 2 by (d_h + 1) / 2
+// bytes, which presumably assumes an 8-bit 4:2:0 format -- confirm before
+// using this helper with other formats.
+aom_image_t *CreateGrayImage(aom_img_fmt_t fmt, unsigned int w,
+                             unsigned int h) {
+  aom_image_t *const gray = aom_img_alloc(nullptr, fmt, w, h, 1);
+  if (gray == nullptr) return nullptr;
+
+  for (unsigned int row = 0; row < gray->d_h; ++row) {
+    memset(gray->planes[0] + row * gray->stride[0], 128, gray->d_w);
+  }
+
+  const unsigned int chroma_rows = (gray->d_h + 1) / 2;
+  const unsigned int chroma_cols = (gray->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    memset(gray->planes[1] + row * gray->stride[1], 128, chroma_cols);
+    memset(gray->planes[2] + row * gray->stride[2], 128, chroma_cols);
+  }
+  return gray;
+}
+
+// Real-time, one-pass encode of a 1-pixel-wide stream whose height is changed
+// twice (444 -> 254 -> 154) through aom_codec_enc_config_set() between the
+// first and the second frame. Every encode, reconfiguration, and flush call
+// is expected to succeed.
+TEST(EncodeAPI, Buganizer310548198) {
+  aom_codec_iface_t *const iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  const unsigned int usage = AOM_USAGE_REALTIME;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+  cfg.g_w = 1;
+  cfg.g_h = 444;
+  cfg.g_pass = AOM_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+  const int speed = 6;
+  ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, speed), AOM_CODEC_OK);
+
+  const aom_enc_frame_flags_t flags = 0;
+  int frame_index = 0;
+
+  // Encode a frame.
+  aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+  ASSERT_EQ(aom_codec_encode(&enc, image, frame_index, 1, flags), AOM_CODEC_OK);
+  frame_index++;
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = nullptr;
+  while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  }
+  aom_img_free(image);
+
+  cfg.g_w = 1;
+  cfg.g_h = 254;
+  ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK)
+      << aom_codec_error_detail(&enc);
+
+  cfg.g_w = 1;
+  cfg.g_h = 154;
+  ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_OK)
+      << aom_codec_error_detail(&enc);
+
+  // Encode a frame. The image must be checked here as well: passing a null
+  // image to aom_codec_encode() is how this file flushes the encoder, so an
+  // allocation failure would otherwise go unnoticed.
+  image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+  ASSERT_EQ(aom_codec_encode(&enc, image, frame_index, 1, flags), AOM_CODEC_OK);
+  frame_index++;
+  iter = nullptr;
+  while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  }
+  aom_img_free(image);
+
+  // Flush the encoder: keep calling aom_codec_encode() with a null image
+  // until a call produces no more packets.
+  bool got_data;
+  do {
+    ASSERT_EQ(aom_codec_encode(&enc, nullptr, 0, 0, 0), AOM_CODEC_OK);
+    got_data = false;
+    iter = nullptr;
+    while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+      got_data = true;
+    }
+  } while (got_data);
+
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Emulates the WebCodecs VideoEncoder interface.
+class AV1Encoder {
+ public:
+ explicit AV1Encoder(int speed) : speed_(speed) {}  // `speed` is applied via AOME_SET_CPUUSED.
+ ~AV1Encoder();  // Flushes and destroys the encoder if it was initialized.
+
+ void Configure(unsigned int threads, unsigned int width, unsigned int height,
+ aom_rc_mode end_usage, unsigned int usage);  // First call initializes; later calls reconfigure.
+ void Encode(bool key_frame);  // Encodes one gray I420 frame; requires a prior Configure().
+
+ private:
+ // Flushes the encoder. Should be called after all the Encode() calls.
+ void Flush();
+
+ const int speed_;  // Value passed to AOME_SET_CPUUSED when the encoder is initialized.
+ bool initialized_ = false;  // Set by Configure() once enc_ has been initialized.
+ aom_codec_enc_cfg_t cfg_;  // Current configuration; updated by every Configure() call.
+ aom_codec_ctx_t enc_;  // Encoder context; only valid while initialized_ is true.
+ int frame_index_ = 0;  // pts of the next frame; incremented by Encode().
+};
+
+// Flushes and destroys the encoder, but only if Configure() initialized it.
+AV1Encoder::~AV1Encoder() {
+  if (!initialized_) return;
+
+  Flush();
+  EXPECT_EQ(aom_codec_destroy(&enc_), AOM_CODEC_OK);
+}
+
+// Applies a thread count, frame size, and rate control mode to the encoder.
+// The first call builds the configuration from the defaults for `usage` and
+// initializes the encoder. Every later call must pass the same `usage` and
+// only updates threads, size, and rate control mode through
+// aom_codec_enc_config_set().
+void AV1Encoder::Configure(unsigned int threads, unsigned int width,
+                           unsigned int height, aom_rc_mode end_usage,
+                           unsigned int usage) {
+  if (initialized_) {
+    // Reconfiguration of an already running encoder.
+    ASSERT_EQ(usage, cfg_.g_usage);
+    cfg_.g_threads = threads;
+    cfg_.g_w = width;
+    cfg_.g_h = height;
+    cfg_.rc_end_usage = end_usage;
+    ASSERT_EQ(aom_codec_enc_config_set(&enc_, &cfg_), AOM_CODEC_OK)
+        << aom_codec_error_detail(&enc_);
+    return;
+  }
+
+  // First-time initialization.
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg_, usage),
+            AOM_CODEC_OK);
+  cfg_.g_threads = threads;
+  cfg_.g_w = width;
+  cfg_.g_h = height;
+  // The initial size is also used as the forced maximum frame size.
+  cfg_.g_forced_max_frame_width = cfg_.g_w;
+  cfg_.g_forced_max_frame_height = cfg_.g_h;
+  cfg_.g_timebase.num = 1;
+  cfg_.g_timebase.den = 1000 * 1000;  // microseconds
+  cfg_.g_pass = AOM_RC_ONE_PASS;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_end_usage = end_usage;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 58;
+  ASSERT_EQ(aom_codec_enc_init(&enc_, av1_iface, &cfg_, 0), AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_control(&enc_, AOME_SET_CPUUSED, speed_), AOM_CODEC_OK);
+  initialized_ = true;
+}
+
+// Encodes one gray I420 frame at the currently configured size and drains the
+// packets it produces. When `key_frame` is true the frame is submitted with
+// AOM_EFLAG_FORCE_KF and every returned packet must carry AOM_FRAME_IS_KEY.
+void AV1Encoder::Encode(bool key_frame) {
+  assert(initialized_);
+  // TODO(wtc): Support high bit depths and other YUV formats.
+  aom_image_t *const frame =
+      CreateGrayImage(AOM_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+  ASSERT_NE(frame, nullptr);
+
+  aom_enc_frame_flags_t flags = 0;
+  if (key_frame) flags = AOM_EFLAG_FORCE_KF;
+  ASSERT_EQ(aom_codec_encode(&enc_, frame, frame_index_, 1, flags),
+            AOM_CODEC_OK);
+  ++frame_index_;
+
+  aom_codec_iter_t iter = nullptr;
+  for (const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc_, &iter);
+       pkt != nullptr; pkt = aom_codec_get_cx_data(&enc_, &iter)) {
+    ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+    if (!key_frame) continue;
+    ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  }
+  aom_img_free(frame);
+}
+
+// Flushes the encoder: calls aom_codec_encode() with a null image and drains
+// the packets, repeating until a call produces no packet at all.
+void AV1Encoder::Flush() {
+  for (;;) {
+    ASSERT_EQ(aom_codec_encode(&enc_, nullptr, 0, 0, 0), AOM_CODEC_OK);
+
+    bool got_data = false;
+    aom_codec_iter_t iter = nullptr;
+    while (const aom_codec_cx_pkt_t *pkt =
+               aom_codec_get_cx_data(&enc_, &iter)) {
+      ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+      got_data = true;
+    }
+    if (!got_data) break;
+  }
+}
+
+TEST(EncodeAPI, Buganizer314858909) {  // Thread count and frame size change between frames.
+ AV1Encoder encoder(7);  // speed 7
+
+ encoder.Configure(6, 1582, 750, AOM_CBR, AOM_USAGE_REALTIME);  // 6 threads, 1582x750
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(0, 1582, 23, AOM_CBR, AOM_USAGE_REALTIME);  // 0 threads, height down to 23
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(16, 1542, 363, AOM_CBR, AOM_USAGE_REALTIME);  // 16 threads, 1542x363
+
+ // Encode a frame.
+ encoder.Encode(false);
+}
+
+// Run this test to reproduce the bug found by a fuzz test: the assertion
+// cpi->rec_sse != UINT64_MAX failing in av1_rc_bits_per_mb.
+TEST(EncodeAPI, Buganizer310766628) {
+ AV1Encoder encoder(7);  // speed 7
+
+ encoder.Configure(16, 759, 383, AOM_CBR, AOM_USAGE_REALTIME);  // 16 threads, CBR
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ encoder.Configure(2, 759, 383, AOM_VBR, AOM_USAGE_REALTIME);  // Same size, 2 threads, VBR
+
+ // Encode a frame. This is the call that triggers the assertion failure.
+ encoder.Encode(false);
+}
+
+// This test covers a possible use case where the frame size and the number of
+// threads are changed both before and after the first frame is encoded.
+TEST(EncodeAPI, Buganizer310455204) {
+ AV1Encoder encoder(7);  // speed 7
+
+ encoder.Configure(0, 1915, 503, AOM_VBR, AOM_USAGE_REALTIME);  // Initializes the encoder.
+
+ encoder.Configure(4, 1, 1, AOM_VBR, AOM_USAGE_REALTIME);  // Reconfigure to 1x1 before any frame.
+
+ encoder.Configure(6, 559, 503, AOM_CBR, AOM_USAGE_REALTIME);  // Reconfigure again, now CBR.
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ // Increase the number of threads (and the width) after the first frame.
+ encoder.Configure(16, 1915, 503, AOM_CBR, AOM_USAGE_REALTIME);
+
+ // Encode a frame.
+ encoder.Encode(false);
+}
+
+// Run this test to reproduce the bug found by a fuzz test: a
+// float-cast-overflow in av1_rc_bits_per_mb.
+TEST(EncodeAPI, Buganizer310457427) {
+ AV1Encoder encoder(7);  // speed 7
+
+ encoder.Configure(12, 896, 1076, AOM_CBR, AOM_USAGE_REALTIME);  // Initializes the encoder.
+
+ encoder.Configure(6, 609, 1076, AOM_VBR, AOM_USAGE_REALTIME);  // Narrower frame, VBR, fewer threads.
+
+ // Encode a frame.
+ encoder.Encode(false);
+
+ // Encode a frame. This will trigger the float-cast-overflow bug which was
+ // caused by division by zero.
+ encoder.Encode(false);
+}
+
+// Test fixture parameterized on a (usage, speed, aq_mode) tuple.
+class EncodeAPIParameterized
+    : public testing::TestWithParam<std::tuple<
+          /*usage=*/unsigned int, /*speed=*/int, /*aq_mode=*/unsigned int>> {};
+
+// Encodes two frames at a given usage, speed, and aq_mode setting.
+// Reproduces b/303023614
+TEST_P(EncodeAPIParameterized, HighBDEncoderHighBDFrames) {
+  const unsigned int usage = std::get<0>(GetParam());
+  const int requested_speed = std::get<1>(GetParam());
+  // 10 is only allowed in AOM_USAGE_REALTIME; other usages fall back to 9.
+  const int speed =
+      (requested_speed == 10 && usage != AOM_USAGE_REALTIME) ? 9
+                                                             : requested_speed;
+
+  aom_codec_iface_t *const av1_iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(av1_iface, &cfg, usage),
+            AOM_CODEC_OK);
+  cfg.g_w = 500;
+  cfg.g_h = 400;
+
+  aom_codec_ctx_t enc;
+  const aom_codec_err_t init_status =
+      aom_codec_enc_init(&enc, av1_iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH);
+#if !CONFIG_AV1_HIGHBITDEPTH
+  ASSERT_EQ(init_status, AOM_CODEC_INCAPABLE);
+#else
+  ASSERT_EQ(init_status, AOM_CODEC_OK);
+
+  const unsigned int aq_mode = std::get<2>(GetParam());
+
+  ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, speed), AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_AQ_MODE, aq_mode), AOM_CODEC_OK);
+
+  aom_image_t *const frame =
+      aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, cfg.g_w, cfg.g_h, 0);
+  ASSERT_NE(frame, nullptr);
+
+  // Fill all three planes with the constant 16-bit value 128.
+  for (unsigned int row = 0; row < frame->d_h; ++row) {
+    Memset16(frame->planes[0] + row * frame->stride[0], 128, frame->d_w);
+  }
+  const unsigned int chroma_rows = (frame->d_h + 1) / 2;
+  const unsigned int chroma_cols = (frame->d_w + 1) / 2;
+  for (unsigned int row = 0; row < chroma_rows; ++row) {
+    Memset16(frame->planes[1] + row * frame->stride[1], 128, chroma_cols);
+    Memset16(frame->planes[2] + row * frame->stride[2], 128, chroma_cols);
+  }
+
+  // Encode two frames; the same image is submitted with pts 0 and pts 1.
+  ASSERT_EQ(
+      aom_codec_encode(&enc, frame, /*pts=*/0, /*duration=*/1, /*flags=*/0),
+      AOM_CODEC_OK);
+  ASSERT_EQ(
+      aom_codec_encode(&enc, frame, /*pts=*/1, /*duration=*/1, /*flags=*/0),
+      AOM_CODEC_OK);
+
+  aom_img_free(frame);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+#endif  // !CONFIG_AV1_HIGHBITDEPTH
+}
+
+const unsigned int kUsages[] = {  // Usage values fed to EncodeAPIParameterized.
+ AOM_USAGE_REALTIME,
+#if !CONFIG_REALTIME_ONLY
+ AOM_USAGE_GOOD_QUALITY,  // Only listed when CONFIG_REALTIME_ONLY is off.
+ AOM_USAGE_ALL_INTRA,
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(All, EncodeAPIParameterized,
+ testing::Combine(
+ /*usage=*/testing::ValuesIn(kUsages),
+ /*speed=*/testing::Values(6, 7, 10),  // The test lowers 10 to 9 for non-realtime usages.
+ /*aq_mode=*/testing::Values(0, 1, 2, 3)));
+
+#if !CONFIG_REALTIME_ONLY
+TEST(EncodeAPI, AllIntraMode) {  // Config validation for AOM_USAGE_ALL_INTRA.
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));  // Defaults must initialize.
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+
+ // Reset to the defaults, then set g_lag_in_frames to a nonzero value. This
+ // should cause aom_codec_enc_init() to fail.
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+ cfg.g_lag_in_frames = 1;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ // Reset to the defaults, then set kf_max_dist to a nonzero value. This should
+ // cause aom_codec_enc_init() to fail.
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA));
+ cfg.kf_max_dist = 1;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+} // namespace
diff --git a/third_party/aom/test/encode_perf_test.cc b/third_party/aom/test/encode_perf_test.cc
new file mode 100644
index 0000000000..b52cf3392c
--- /dev/null
+++ b/third_party/aom/test/encode_perf_test.cc
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom_ports/aom_timer.h"
+
+namespace {
+
+const int kMaxPsnr = 100;  // Starting value of the running minimum PSNR.
+const double kUsecsInSec = 1000000.0;  // Converts the elapsed microseconds to seconds.
+
+struct EncodePerfTestVideo {  // Describes one input clip of the encode performance test.
+ EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+ uint32_t bitrate_, int frames_)
+ : name(name_), width(width_), height(height_), bitrate(bitrate_),
+ frames(frames_) {}
+ const char *name;  // File name handed to I420VideoSource.
+ uint32_t width;  // Frame width; also used to limit the thread counts tested.
+ uint32_t height;  // Frame height.
+ uint32_t bitrate;  // Assigned to cfg_.rc_target_bitrate.
+ int frames;  // Frame count given to I420VideoSource and used for the fps figure.
+};
+
+const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = {  // name, width, height, bitrate, frames
+ EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484),
+ EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987),
+ EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718),
+ EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471),
+ EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", 640, 480, 200,
+ 300),
+ EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };  // Values applied through AOME_SET_CPUUSED.
+const int kEncodePerfTestThreads[] = { 1, 2, 4 };  // Values assigned to cfg_.g_threads.
+
+class AV1EncodePerfTest  // Encoder fixture that tracks the minimum PSNR of a run.
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1EncodePerfTest()
+ : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
+ encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
+
+ ~AV1EncodePerfTest() override = default;
+
+ void SetUp() override {  // Also called by PerfTest before every run, after set_threads().
+ InitializeConfig(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = threads_;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // Encoder controls are applied once, on the first frame.
+ const int log2_tile_columns = 3;
+ encoder->Control(AOME_SET_CPUUSED, speed_);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, log2_tile_columns);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ }
+ }
+
+ void BeginPassHook(unsigned int /*pass*/) override {  // Resets the per-pass statistics.
+ min_psnr_ = kMaxPsnr;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {  // Keeps the lowest psnr[0] seen.
+ if (pkt->data.psnr.psnr[0] < min_psnr_) {
+ min_psnr_ = pkt->data.psnr.psnr[0];
+ }
+ }
+
+ // Decoding is skipped for performance reasons.
+ bool DoDecode() const override { return false; }
+
+ double min_psnr() const { return min_psnr_; }
+
+ void set_speed(unsigned int speed) { speed_ = speed; }
+
+ void set_threads(unsigned int threads) { threads_ = threads; }
+
+ private:
+ double min_psnr_;  // Lowest psnr[0] reported since the last BeginPassHook().
+ unsigned int nframes_;  // Only reset to 0 in this class; never incremented here.
+ libaom_test::TestMode encoding_mode_;  // Test mode passed to InitializeConfig().
+ unsigned speed_;  // Applied through AOME_SET_CPUUSED on the first frame.
+ unsigned int threads_;  // Copied into cfg_.g_threads by SetUp().
+};
+
+// Encodes every test vector at each speed/thread combination and prints one
+// JSON-like record per run with the elapsed time, frame rate, and minimum
+// PSNR.
+TEST_P(AV1EncodePerfTest, PerfTest) {
+  for (const EncodePerfTestVideo &clip : kAV1EncodePerfTestVectors) {
+    for (int speed : kEncodePerfTestSpeeds) {
+      for (int threads : kEncodePerfTestThreads) {
+        // Narrow clips only run with a limited number of threads: one thread
+        // below a width of 512, at most two below a width of 1024.
+        const bool too_many_threads = (clip.width < 512 && threads > 1) ||
+                                      (clip.width < 1024 && threads > 2);
+        if (too_many_threads) continue;
+
+        // SetUp() copies the thread count into cfg_, so it is set first.
+        set_threads(threads);
+        SetUp();
+
+        const aom_rational timebase = { 33333333, 1000000000 };
+        cfg_.g_timebase = timebase;
+        cfg_.rc_target_bitrate = clip.bitrate;
+
+        init_flags_ = AOM_CODEC_USE_PSNR;
+
+        const unsigned frames = clip.frames;
+        const char *video_name = clip.name;
+        libaom_test::I420VideoSource video(video_name, clip.width, clip.height,
+                                           timebase.den, timebase.num, 0,
+                                           clip.frames);
+        set_speed(speed);
+
+        // Time the whole encode loop.
+        aom_usec_timer timer;
+        aom_usec_timer_start(&timer);
+
+        ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+        aom_usec_timer_mark(&timer);
+        const double elapsed_secs =
+            aom_usec_timer_elapsed(&timer) / kUsecsInSec;
+        const double fps = frames / elapsed_secs;
+        const double minimum_psnr = min_psnr();
+
+        // Multi-threaded runs get a "_t-<threads>" suffix in the report.
+        std::string display_name(video_name);
+        if (threads > 1) {
+          display_name += "_t-" + std::to_string(threads);
+        }
+
+        printf("{\n");
+        printf("\t\"type\" : \"encode_perf_test\",\n");
+        printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+        printf("\t\"videoName\" : \"%s\",\n", display_name.c_str());
+        printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
+        printf("\t\"totalFrames\" : %u,\n", frames);
+        printf("\t\"framesPerSecond\" : %f,\n", fps);
+        printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
+        printf("\t\"speed\" : %d,\n", speed);
+        printf("\t\"threads\" : %d\n", threads);
+        printf("}\n");
+      }
+    }
+  }
+}
+
+// The performance test is only instantiated for the real-time test mode.
+AV1_INSTANTIATE_TEST_SUITE(AV1EncodePerfTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+} // namespace
diff --git a/third_party/aom/test/encode_small_width_height_test.cc b/third_party/aom/test/encode_small_width_height_test.cc
new file mode 100644
index 0000000000..22f69396d9
--- /dev/null
+++ b/third_party/aom/test/encode_small_width_height_test.cc
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/2777.
+//
+// Encode images with a small width (<= two AV1 superblocks) or a small height
+// (<= one AV1 superblock) with multiple threads. aom_codec_encode() should
+// not crash.
+
+#include <memory>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Dummy buffer of zero samples, wrapped as input by the multi-threaded tests.
+constexpr unsigned char kBuffer[2 * (256 * 512 + 2 * 128 * 256)] = { 0 };
+#if CONFIG_REALTIME_ONLY
+const int kUsage = AOM_USAGE_REALTIME;
+#else
+const int kUsage = AOM_USAGE_GOOD_QUALITY;
+#endif
+
+void EncodeSmallWidthMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
+  // One tile, two AV1 superblocks wide: for speed >= 1 the superblock size is
+  // 64x64 (see av1_select_sb_size()). Encodes one zero frame with 2 threads.
+  constexpr int kWidth = 128;
+  constexpr int kHeight = 512;
+  // Setup steps use ASSERT_*: later calls must not see an unset img/cfg/enc.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
+ EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I420, 0);  // no codec init flags
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreaded) {  // highbd builds only
+ EncodeSmallWidthMultiThreaded(AOM_IMG_FMT_I42016, AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+void EncodeSmallWidthMultiThreadedSpeed0(aom_img_fmt fmt,
+                                         aom_codec_flags_t flag) {
+  // One tile, two AV1 superblocks wide: for speed 0 the superblock size is
+  // 128x128 (see av1_select_sb_size()). Encodes one zero frame with 2 threads.
+  constexpr int kWidth = 256;
+  constexpr int kHeight = 512;
+  // Setup steps use ASSERT_*: later calls must not see an unset img/cfg/enc.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
+ EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);  // no codec init flags
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {  // highbd builds only
+ EncodeSmallWidthMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif
+
+void EncodeSmallHeightMultiThreaded(aom_img_fmt fmt, aom_codec_flags_t flag) {
+  // One tile, one AV1 superblock tall: for speed >= 1 the superblock size is
+  // 64x64 (see av1_select_sb_size()). Encodes one zero frame with 2 threads.
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 64;
+  // Setup steps use ASSERT_*: later calls must not see an unset img/cfg/enc.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
+ EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I420, 0);  // no codec init flags
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreaded) {  // highbd builds only
+ EncodeSmallHeightMultiThreaded(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+void EncodeSmallHeightMultiThreadedSpeed0(aom_img_fmt fmt,
+                                          aom_codec_flags_t flag) {
+  // One tile, one AV1 superblock tall: for speed 0 the superblock size is
+  // 128x128 (see av1_select_sb_size()). Encodes one zero frame with 2 threads.
+  constexpr int kWidth = 512;
+  constexpr int kHeight = 128;
+  // Setup steps use ASSERT_*: later calls must not see an unset img/cfg/enc.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, fmt, kWidth, kHeight, 1,
+                               const_cast<unsigned char *>(kBuffer)));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  cfg.g_threads = 2;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flag));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
+ EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I420, 0);  // no codec init flags
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {  // highbd builds only
+ EncodeSmallHeightMultiThreadedSpeed0(AOM_IMG_FMT_I42016,
+ AOM_CODEC_USE_HIGHBITDEPTH);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif
+
+// A reproducer test for aomedia:3113: encodes and flushes one 1x1 frame. The
+// test should complete without any memory errors.
+void Encode1x1(aom_img_fmt fmt, int bitdepth, aom_codec_flags_t flags) {
+  constexpr int kWidth = 1;
+  constexpr int kHeight = 1;
+
+  // This test cannot use aom_img_alloc() or aom_img_wrap() because they call
+  // align_image_dimension() to align img.w and img.h to the next even number
+  // (2). In this test it is important to set img.w and img.h to 1. Therefore we
+  // set up img manually.
+  aom_image_t img;
+  memset(&img, 0, sizeof(img));
+  img.fmt = fmt;
+  img.bit_depth = bitdepth;
+  img.w = kWidth;
+  img.h = kHeight;
+  img.d_w = kWidth;
+  img.d_h = kHeight;
+  img.x_chroma_shift = 1;
+  img.y_chroma_shift = 1;
+  img.bps = 12;
+  const int y_stride = kWidth;
+  const int uv_stride = (kWidth + 1) >> 1;  // halved, rounded up
+  int y_height = kHeight;
+  int uv_height = (kHeight + 1) >> 1;  // halved, rounded up
+  if (bitdepth > 8) {  // double the plane allocations; strides are unchanged
+    y_height <<= 1;
+    uv_height <<= 1;
+  }
+  img.stride[AOM_PLANE_Y] = y_stride;
+  img.stride[AOM_PLANE_U] = img.stride[AOM_PLANE_V] = uv_stride;
+  std::unique_ptr<unsigned char[]> y_plane(
+      new unsigned char[y_height * y_stride]());
+  ASSERT_NE(y_plane, nullptr);
+  std::unique_ptr<unsigned char[]> u_plane(
+      new unsigned char[uv_height * uv_stride]());
+  ASSERT_NE(u_plane, nullptr);
+  std::unique_ptr<unsigned char[]> v_plane(
+      new unsigned char[uv_height * uv_stride]());
+  ASSERT_NE(v_plane, nullptr);
+  img.planes[AOM_PLANE_Y] = y_plane.get();
+  img.planes[AOM_PLANE_U] = u_plane.get();
+  img.planes[AOM_PLANE_V] = v_plane.get();
+  // Setup steps use ASSERT_*: later calls must not see an unset cfg or enc.
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, flags));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeSmallWidthHeight, 1x1) { Encode1x1(AOM_IMG_FMT_I420, 8, 0); }  // bitdepth 8
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST(HighbdEncodeSmallWidthHeight, 1x1) {  // highbd builds only
+ Encode1x1(AOM_IMG_FMT_I42016, 12, AOM_CODEC_USE_HIGHBITDEPTH);  // bitdepth 12
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
new file mode 100644
index 0000000000..b5c506c6d3
--- /dev/null
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+void Encoder::InitEncoder(VideoSource *video) {  // codec setup from video's frame
+ aom_codec_err_t res;
+ const aom_image_t *img = video->img();
+
+ if (video->img() && !encoder_.priv) {  // needs a frame; no-op once priv is set
+ cfg_.g_w = img->d_w;  // configure with the frame's d_w x d_h
+ cfg_.g_h = img->d_h;
+ cfg_.g_timebase = video->timebase();
+ cfg_.rc_twopass_stats_in = stats_->buf();  // whatever *stats_ holds so far
+
+ res = aom_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+}
+
+void Encoder::EncodeFrame(VideoSource *video,
+ const aom_enc_frame_flags_t frame_flags) {
+ if (video->img())
+ EncodeFrameInternal(*video, frame_flags);
+ else
+ Flush();  // no image available: flush instead of encoding
+
+ // Append every stats packet produced by this call to *stats_.
+ CxDataIterator iter = GetCxData();
+
+ while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
+ if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+
+ stats_->Append(*pkt);
+ }
+}
+
+void Encoder::EncodeFrameInternal(const VideoSource &video,
+ const aom_enc_frame_flags_t frame_flags) {
+ aom_codec_err_t res;
+ const aom_image_t *img = video.img();
+
+ // Handle frame resizing: push the new d_w x d_h to the encoder before encoding
+ if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
+ cfg_.g_w = img->d_w;
+ cfg_.g_h = img->d_h;
+ res = aom_codec_enc_config_set(&encoder_, &cfg_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ // Encode the frame with the source's pts and duration
+ API_REGISTER_STATE_CHECK(res =
+ aom_codec_encode(&encoder_, img, video.pts(),
+ video.duration(), frame_flags));
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+}
+
+void Encoder::Flush() {  // flushes by encoding a null image
+ const aom_codec_err_t res = aom_codec_encode(&encoder_, nullptr, 0, 0, 0);
+ if (!encoder_.priv)  // with no priv set, the flush is expected to fail
+ ASSERT_EQ(AOM_CODEC_ERROR, res) << EncoderError();
+ else
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+}
+
+void EncoderTest::InitializeConfig(TestMode mode) {
+ int usage = AOM_USAGE_GOOD_QUALITY;  // kept for both "good" modes
+ switch (mode) {
+ case kOnePassGood:
+ case kTwoPassGood: break;
+ case kRealTime: usage = AOM_USAGE_REALTIME; break;
+ case kAllIntra: usage = AOM_USAGE_ALL_INTRA; break;
+ default: ASSERT_TRUE(false) << "Unexpected mode " << mode;
+ }
+ mode_ = mode;
+ passes_ = (mode == kTwoPassGood) ? 2 : 1;  // two passes only for kTwoPassGood
+
+ const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, usage);
+ ASSERT_EQ(AOM_CODEC_OK, res);
+}
+
+static bool compare_plane(const uint8_t *const buf1, int stride1,
+                          const uint8_t *const buf2, int stride2, int w, int h,
+                          int *const mismatch_row, int *const mismatch_col,
+                          int *const mismatch_pix1, int *const mismatch_pix2) {
+  // Scan both planes in raster order and stop at the first differing sample.
+  for (int row = 0; row < h; ++row) {
+    const int row_start1 = row * stride1;
+    const int row_start2 = row * stride2;
+    for (int col = 0; col < w; ++col) {
+      const int sample1 = buf1[row_start1 + col];
+      const int sample2 = buf2[row_start2 + col];
+      if (sample1 == sample2) continue;
+
+      // Each output pointer is optional; fill in only those supplied.
+      if (mismatch_row != nullptr) *mismatch_row = row;
+      if (mismatch_col != nullptr) *mismatch_col = col;
+      if (mismatch_pix1 != nullptr) *mismatch_pix1 = sample1;
+      if (mismatch_pix2 != nullptr) *mismatch_pix2 = sample2;
+      return false;
+    }
+  }
+  return true;  // no difference found in the w x h region
+}
+
+// Returns true when both images have the same metadata and plane contents.
+// Stops at the first mismatch; a metadata mismatch reports row/col as -1.
+static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
+ int *const mismatch_row, int *const mismatch_col,
+ int *const mismatch_plane, int *const mismatch_pix1,
+ int *const mismatch_pix2) {
+ if (img1->fmt != img2->fmt || img1->cp != img2->cp || img1->tc != img2->tc ||
+ img1->mc != img2->mc || img1->d_w != img2->d_w ||
+ img1->d_h != img2->d_h || img1->monochrome != img2->monochrome) {
+ if (mismatch_row != nullptr) *mismatch_row = -1;
+ if (mismatch_col != nullptr) *mismatch_col = -1;
+ return false;
+ }
+
+ const int num_planes = img1->monochrome ? 1 : 3;  // luma only when monochrome
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (!compare_plane(img1->planes[plane], img1->stride[plane],
+ img2->planes[plane], img2->stride[plane],
+ aom_img_plane_width(img1, plane),
+ aom_img_plane_height(img1, plane), mismatch_row,
+ mismatch_col, mismatch_pix1, mismatch_pix2)) {
+ if (mismatch_plane != nullptr) *mismatch_plane = plane;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void EncoderTest::MismatchHook(const aom_image_t *img_enc,
+ const aom_image_t *img_dec) {  // locates the mismatch, then fails the test
+ int mismatch_row = 0;
+ int mismatch_col = 0;
+ int mismatch_plane = 0;
+ int mismatch_pix_enc = 0;
+ int mismatch_pix_dec = 0;
+
+ ASSERT_FALSE(compare_img(img_enc, img_dec, &mismatch_row, &mismatch_col,
+ &mismatch_plane, &mismatch_pix_enc,
+ &mismatch_pix_dec));
+
+ GTEST_FAIL() << "Encode/Decode mismatch found:" << std::endl
+ << " pixel value enc/dec: " << mismatch_pix_enc << "/"
+ << mismatch_pix_dec << std::endl
+ << " plane: " << mismatch_plane << std::endl
+ << " row/col: " << mismatch_row << "/"
+ << mismatch_col << std::endl;
+}
+
+void EncoderTest::RunLoop(VideoSource *video) {  // runs every pass over 'video'
+ stats_.Reset();
+
+ ASSERT_TRUE(passes_ == 1 || passes_ == 2);
+ for (unsigned int pass = 0; pass < passes_; pass++) {
+ aom_codec_pts_t last_pts = 0;
+
+ if (passes_ == 1)
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ else if (pass == 0)
+ cfg_.g_pass = AOM_RC_FIRST_PASS;
+ else
+ cfg_.g_pass = AOM_RC_LAST_PASS;
+
+ BeginPassHook(pass);
+ std::unique_ptr<Encoder> encoder(
+ codec_->CreateEncoder(cfg_, init_flags_, &stats_));  // fresh encoder per pass
+ ASSERT_NE(encoder, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(video->Begin());
+ encoder->InitEncoder(video);
+
+ if (mode_ == kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ }
+
+ ASSERT_FALSE(::testing::Test::HasFatalFailure());
+#if CONFIG_AV1_DECODER
+ aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+ dec_cfg.allow_lowbitdepth = 1;
+ std::unique_ptr<Decoder> decoder(
+ codec_->CreateDecoder(dec_cfg, 0 /* flags */));
+ if (decoder->IsAV1()) {
+ // Mirror the encoder's large_scale_tile setting and select tile row/col -1
+ // so that the whole frame is decoded.
+ decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile);
+ decoder->Control(AV1D_EXT_TILE_DEBUG, 1);
+ decoder->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+#endif
+
+ int number_spatial_layers = GetNumSpatialLayers();
+
+ bool again;
+ for (again = true; again; video->Next()) {
+ again = (video->img() != nullptr);  // set back to true if packets still arrive
+
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ PreEncodeFrameHook(video, encoder.get());
+ encoder->EncodeFrame(video, frame_flags_);
+ PostEncodeFrameHook(encoder.get());
+ CxDataIterator iter = encoder->GetCxData();
+ bool has_cxdata = false;
+
+#if CONFIG_AV1_DECODER
+ bool has_dxdata = false;
+#endif
+ while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
+ pkt = MutateEncoderOutputHook(pkt);
+ again = true;
+ switch (pkt->kind) {
+ case AOM_CODEC_CX_FRAME_PKT: //
+ has_cxdata = true;
+#if CONFIG_AV1_DECODER
+ if (decoder.get() != nullptr && DoDecode()) {
+ aom_codec_err_t res_dec;
+ if (DoDecodeInvisible()) {
+ res_dec = decoder->DecodeFrame(
+ (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
+ } else {
+ res_dec = decoder->DecodeFrame(
+ (const uint8_t *)pkt->data.frame.buf +
+ (pkt->data.frame.sz - pkt->data.frame.vis_frame_size),
+ pkt->data.frame.vis_frame_size);
+ }
+
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;  // leaves the switch only
+
+ has_dxdata = true;
+ }
+#endif
+ ASSERT_GE(pkt->data.frame.pts, last_pts);
+ if (sl == number_spatial_layers - 1)  // update only on the last spatial layer
+ last_pts = pkt->data.frame.pts;
+ FramePktHook(pkt);
+ break;
+
+ case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
+
+ case AOM_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+
+ default: break;
+ }
+ }
+ if (has_cxdata) {
+ const aom_image_t *img_enc = encoder->GetPreviewFrame();
+ if (img_enc) {
+ CalculateFrameLevelSSIM(video->img(), img_enc, cfg_.g_bit_depth,
+ cfg_.g_input_bit_depth);
+ }
+#if CONFIG_AV1_DECODER
+ if (has_dxdata) {
+ DxDataIterator dec_iter = decoder->GetDxData();
+ const aom_image_t *img_dec = dec_iter.Next();
+ if (img_enc && img_dec) {
+ const bool res = compare_img(img_enc, img_dec, nullptr, nullptr,
+ nullptr, nullptr, nullptr);
+ if (!res) { // Mismatch
+ MismatchHook(img_enc, img_dec);
+ }
+ }
+ if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
+ }
+#endif
+ }
+ if (!Continue()) break;
+ } // Loop over spatial layers
+ }
+
+ EndPassHook();
+
+ if (!Continue()) break;
+ }
+}
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/encode_test_driver.h b/third_party/aom/test/encode_test_driver.h
new file mode 100644
index 0000000000..d1e6615cd7
--- /dev/null
+++ b/third_party/aom/test/encode_test_driver.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_ENCODE_TEST_DRIVER_H_
+#define AOM_TEST_ENCODE_TEST_DRIVER_H_
+
+#include <string>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#include "aom/aom_encoder.h"
+
+namespace libaom_test {
+
+class CodecFactory;
+class VideoSource;
+
+enum TestMode { kRealTime, kOnePassGood, kTwoPassGood, kAllIntra };  // kAllIntra is in none of the mode lists below
+#define ALL_TEST_MODES \
+ ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood, \
+ ::libaom_test::kTwoPassGood)
+
+#define ONE_PASS_TEST_MODES \
+ ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood)
+
+#define TWO_PASS_TEST_MODES ::testing::Values(::libaom_test::kTwoPassGood)
+
+#define NONREALTIME_TEST_MODES \
+ ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood)
+
+// Wraps the libaom get_cx_data() iteration pattern for one encoder context.
+class CxDataIterator {
+ public:
+ explicit CxDataIterator(aom_codec_ctx_t *encoder)
+ : encoder_(encoder), iter_(nullptr) {}
+
+ const aom_codec_cx_pkt_t *Next() {  // result of aom_codec_get_cx_data()
+ return aom_codec_get_cx_data(encoder_, &iter_);
+ }
+
+ private:
+ aom_codec_ctx_t *encoder_;  // stored as passed in; never freed here
+ aom_codec_iter_t iter_;  // starts as nullptr; passed by address to each call
+};
+
+// In-memory store for libaom twopass statistics, accumulated in a std::string.
+class TwopassStatsStore {
+ public:
+ void Append(const aom_codec_cx_pkt_t &pkt) {  // appends the packet's stats bytes
+ buffer_.append(reinterpret_cast<char *>(pkt.data.twopass_stats.buf),
+ pkt.data.twopass_stats.sz);
+ }
+
+ aom_fixed_buf_t buf() {  // points into buffer_; no copy is made
+ const aom_fixed_buf_t buf = { &buffer_[0], buffer_.size() };
+ return buf;
+ }
+
+ void Reset() { buffer_.clear(); }
+
+ protected:
+ std::string buffer_;
+};
+
+// Provides a simplified interface to manage one video encoding pass, given
+// a configuration and video source.
+//
+// TODO(jkoleszar): The exact services it provides and the appropriate
+// level of abstraction will be fleshed out as more tests are written.
+class Encoder {
+ public:
+ Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags,
+ TwopassStatsStore *stats)
+ : cfg_(cfg), init_flags_(init_flags), stats_(stats) {
+ memset(&encoder_, 0, sizeof(encoder_));  // zeroed, so encoder_.priv starts null
+ }
+
+ virtual ~Encoder() { aom_codec_destroy(&encoder_); }
+
+ CxDataIterator GetCxData() { return CxDataIterator(&encoder_); }
+
+ void InitEncoder(VideoSource *video);
+
+ const aom_image_t *GetPreviewFrame() {
+ return aom_codec_get_preview_frame(&encoder_);
+ }
+ // This is a thin wrapper around aom_codec_encode(), so refer to
+ // aom_encoder.h for its semantics.
+ void EncodeFrame(VideoSource *video, aom_enc_frame_flags_t frame_flags);
+
+ // Convenience wrapper for EncodeFrame()
+ void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
+
+ void Control(int ctrl_id, int arg) {  // the Control() overloads differ only in arg type
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, int *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_scaling_mode *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_svc_layer_id *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_svc_ref_frame_config *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_svc_ref_frame_comp_pred *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_svc_params *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_ext_part_funcs *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+#if CONFIG_AV1_ENCODER
+ void Control(int ctrl_id, aom_active_map_t *arg) {
+ const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+#endif
+
+ void SetOption(const char *name, const char *value) {
+ const aom_codec_err_t res = aom_codec_set_option(&encoder_, name, value);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Config(const aom_codec_enc_cfg_t *cfg) {
+ const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ cfg_ = *cfg;  // cached only after the codec accepted the new config
+ }
+
+ protected:
+ virtual aom_codec_iface_t *CodecInterface() const = 0;
+
+ const char *EncoderError() {
+ const char *detail = aom_codec_error_detail(&encoder_);
+ return detail ? detail : aom_codec_error(&encoder_);  // used when detail is null
+ }
+
+ // Encode an image
+ void EncodeFrameInternal(const VideoSource &video,
+ aom_enc_frame_flags_t frame_flags);
+
+ // Flush the encoder on EOS
+ void Flush();
+
+ aom_codec_ctx_t encoder_;
+ aom_codec_enc_cfg_t cfg_;
+ aom_codec_flags_t init_flags_;
+ TwopassStatsStore *stats_;  // supplied by the creator; not owned by this class
+};
+
+// Common test functionality for all Encoder tests.
+//
+// This class is a mixin which provides the main loop common to all
+// encoder tests. It provides hooks which can be overridden by subclasses
+// to implement each test's specific behavior, while centralizing the bulk
+// of the boilerplate. Note that it doesn't inherit the gtest testing
+// classes directly, so that tests can be parameterized differently.
+class EncoderTest {
+ protected:
+  explicit EncoderTest(const CodecFactory *codec)
+      : codec_(codec), abort_(false), cfg_(), passes_(0), init_flags_(0),
+        frame_flags_(0), mode_(kRealTime) {
+    // Default to 1 thread; cfg_ is otherwise zero until InitializeConfig().
+    cfg_.g_threads = 1;
+  }
+
+  virtual ~EncoderTest() = default;
+
+  // Initialize the cfg_ member with the default configuration for the
+  // TestMode enum and maps the TestMode enum to the passes_ variable.
+  void InitializeConfig(TestMode mode);
+
+  // Set encoder flag.
+  void set_init_flags(aom_codec_flags_t flag) { init_flags_ = flag; }
+
+  // Main loop
+  virtual void RunLoop(VideoSource *video);
+
+  // Hook to be called at the beginning of a pass.
+  virtual void BeginPassHook(unsigned int /*pass*/) {}
+
+  // Hook to be called at the end of a pass.
+  virtual void EndPassHook() {}
+
+  // Hook to be called before encoding a frame.
+  virtual void PreEncodeFrameHook(VideoSource * /*video*/,
+                                  Encoder * /*encoder*/) {}
+  // Hook to be called after encoding a frame.
+  virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
+
+  // Hook to be called on every compressed data packet.
+  virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+  // Hook to be called on every PSNR packet.
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+  // Hook to be called on every first pass stats packet.
+  virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+  // Calculates SSIM at frame level.
+  virtual void CalculateFrameLevelSSIM(const aom_image_t * /*img_src*/,
+                                       const aom_image_t * /*img_enc*/,
+                                       aom_bit_depth_t /*bit_depth*/,
+                                       unsigned int /*input_bit_depth*/) {}
+
+  // Hook to determine whether the encode loop should continue.
+  virtual bool Continue() const {
+    return !(::testing::Test::HasFatalFailure() || abort_);
+  }
+
+  // Hook to determine whether to decode frame after encoding
+  virtual bool DoDecode() const { return true; }
+
+  // Hook to determine whether to decode invisible frames after encoding
+  virtual bool DoDecodeInvisible() const { return true; }
+
+  // Hook to handle encode/decode mismatch
+  virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2);
+
+  // Hook to be called on every decompressed frame.
+  virtual void DecompressedFrameHook(const aom_image_t & /*img*/,
+                                     aom_codec_pts_t /*pts*/) {}
+
+  // Hook to be called to handle decode result. Return true to continue.
+  virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                                  Decoder *decoder) {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    return AOM_CODEC_OK == res_dec;
+  }
+  // Number of spatial layers encoded per input frame (see RunLoop()).
+  virtual int GetNumSpatialLayers() { return 1; }
+
+  // Hook that can modify the encoder's output data
+  virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const aom_codec_cx_pkt_t *pkt) {
+    return pkt;
+  }
+
+  const CodecFactory *codec_;
+  bool abort_;
+  aom_codec_enc_cfg_t cfg_;
+  unsigned int passes_;  // 0 until InitializeConfig() selects 1 or 2
+  TwopassStatsStore stats_;
+  aom_codec_flags_t init_flags_;
+  aom_enc_frame_flags_t frame_flags_;
+  TestMode mode_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_ENCODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/encodemb_test.cc b/third_party/aom/test/encodemb_test.cc
new file mode 100644
index 0000000000..6165fc33f5
--- /dev/null
+++ b/third_party/aom/test/encodemb_test.cc
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+ // Converts a coefficient block from lexicographic (row by row) layout in
+ // 'qcoeff_lexico' into scan (zigzag) order, writing the result to
+ // 'qcoeff_scan'. Both buffers must hold at least av1_get_max_eob(tx_size)
+ // entries.
+ void ToScanOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_lexico,
+                  tran_low_t *qcoeff_scan) {
+   const int num_coeffs = av1_get_max_eob(tx_size);
+   const SCAN_ORDER *const order = get_scan(tx_size, tx_type);
+   int scan_pos = 0;
+   while (scan_pos < num_coeffs) {
+     const int lexico_pos = order->scan[scan_pos];
+     qcoeff_scan[scan_pos] = qcoeff_lexico[lexico_pos];
+     ++scan_pos;
+   }
+ }
+
+ // Converts a coefficient block from scan (zigzag) order in 'qcoeff_scan' into
+ // lexicographic (row by row) layout, writing the result to 'qcoeff_lexico'.
+ // Both buffers must hold at least av1_get_max_eob(tx_size) entries.
+ void ToLexicoOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_scan,
+                    tran_low_t *qcoeff_lexico) {
+   const int num_coeffs = av1_get_max_eob(tx_size);
+   const SCAN_ORDER *const order = get_scan(tx_size, tx_type);
+   int scan_pos = 0;
+   while (scan_pos < num_coeffs) {
+     const int lexico_pos = order->scan[scan_pos];
+     qcoeff_lexico[lexico_pos] = qcoeff_scan[scan_pos];
+     ++scan_pos;
+   }
+ }
+
+ // Runs coefficient dropout on 'qcoeff_scan' (in place) and verifies that the
+ // end-of-block value and the dequantized coefficients left behind by
+ // av1_dropout_qcoeff_num are consistent with the surviving coefficients.
+ void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before,
+              int dropout_num_after, tran_low_t *qcoeff_scan) {
+   const int kPlane = 0;
+   const int kBlock = 0;
+   const int kDequantFactor = 10;
+   const int max_eob = av1_get_max_eob(tx_size);
+
+   // Tests spell their coefficients in scan order because that is easier to
+   // read, whereas av1_dropout_qcoeff_num works on lexicographic order. Convert
+   // on the way in and convert back once dropout has run.
+   tran_low_t qcoeff[MAX_TX_SQUARE];
+   ToLexicoOrder(tx_size, tx_type, qcoeff_scan, qcoeff);
+
+   // Fake dequantized coefficients: a fixed multiple of the quantized ones.
+   tran_low_t dqcoeff[MAX_TX_SQUARE];
+   for (int idx = 0; idx < max_eob; ++idx) {
+     dqcoeff[idx] = qcoeff[idx] * kDequantFactor;
+   }
+
+   // Initial end of block: one past the last nonzero coeff in scan order.
+   uint16_t eob = max_eob;
+   for (; eob > 0; --eob) {
+     if (qcoeff_scan[eob - 1] != 0) break;
+   }
+
+   // Zeroed macroblock with only the fields wired up below populated.
+   uint16_t eobs[] = { eob };
+   uint8_t txb_entropy_ctx[1];
+   MACROBLOCK mb;
+   memset(&mb, 0, sizeof(mb));
+   mb.plane[kPlane].qcoeff = qcoeff;
+   mb.plane[kPlane].dqcoeff = dqcoeff;
+   mb.plane[kPlane].eobs = eobs;
+   mb.plane[kPlane].txb_entropy_ctx = txb_entropy_ctx;
+
+   av1_dropout_qcoeff_num(&mb, kPlane, kBlock, tx_size, tx_type,
+                          dropout_num_before, dropout_num_after);
+
+   ToScanOrder(tx_size, tx_type, qcoeff, qcoeff_scan);
+
+   // The eob stored in the macroblock must match the coefficients that remain.
+   uint16_t new_eob = max_eob;
+   for (; new_eob > 0; --new_eob) {
+     if (qcoeff_scan[new_eob - 1] != 0) break;
+   }
+   EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]);
+
+   // dqcoeff must still be qcoeff scaled by the dequant factor.
+   for (int i = 0; i < max_eob; ++i) {
+     EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]);
+   }
+ }
+
+ // Expects the first expected.size() entries of 'actual' to equal 'expected',
+ // reporting the index of every element that differs. 'actual' must point to
+ // at least expected.size() readable elements. 'expected' is taken by const
+ // reference so the vector is not copied on each call, and 'actual' is only
+ // read, so it is a pointer to const.
+ void ExpectArrayEq(const tran_low_t *actual,
+                    const std::vector<tran_low_t> &expected) {
+   for (size_t i = 0; i < expected.size(); ++i) {
+     EXPECT_EQ(expected[i], actual[i]) << "Arrays differ at index " << i;
+   }
+ }
+
+ // Transform type passed to Dropout() by every DropoutTest case in this file.
+ static constexpr TX_TYPE kTxType = DCT_DCT;
+
+ TEST(DropoutTest, KeepsLargeCoeffs) {
+   // Large isolated coeffs should be preserved, so the block is expected to
+   // come back unchanged.
+   tran_low_t coeffs[] = { 0, 0, 0, 0, 0, 0, 42, 0,   // should be kept
+                           0, 0, 0, 0, 0, 0, 0,  0,   //
+                           0, 0, 0, 0, 0, 0, 0,  -30, // should be kept
+                           0, 0, 0, 0, 0, 0, 0,  0 };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, 0, 0, 0, 0, 42, 0,   //
+                                              0, 0, 0, 0, 0, 0, 0,  0,   //
+                                              0, 0, 0, 0, 0, 0, 0,  -30, //
+                                              0, 0, 0, 0, 0, 0, 0,  0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, RemovesSmallIsolatedCoeffs) {
+   // Small isolated coeffs should be removed, leaving an all-zero block.
+   tran_low_t coeffs[] = { 0, 0, 0, 0, 1,  0, 0, 0, // should be removed
+                           0, 0, 0, 0, 0,  0, 0, 0, //
+                           0, 0, 0, 0, -2, 0, 0, 0, // should be removed
+                           0, 0, 0, 0, 0,  0, 0, 0 };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, KeepsSmallCoeffsAmongLargeOnes) {
+   // Small coeffs that are not isolated (not enough zeros before/after) should
+   // be kept. Only the isolated -2 in the last row is expected to be dropped.
+   tran_low_t coeffs[] = {
+     1, 0,  0, 0,  -5, 0, 0, -1,  // should be kept
+     0, 0,  0, 10, 0,  0, 2, 0,   // should be kept
+     0, 0,  0, 0,  0,  0, 0, 0,   //
+     0, -2, 0, 0,  0,  0, 0, 0    // should be removed
+   };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 1, 0, 0, 0,  -5, 0, 0, -1, //
+                                              0, 0, 0, 10, 0,  0, 2, 0,  //
+                                              0, 0, 0, 0,  0,  0, 0, 0,  //
+                                              0, 0, 0, 0,  0,  0, 0, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, KeepsSmallCoeffsCloseToStartOrEnd) {
+   // Small coeffs that are too close to the beginning or end of the block
+   // should also be kept (not enough zeroes before/after).
+   tran_low_t coeffs[] = { 0, 0, -1, 0,  0, 0, 0,  0,   // should be kept
+                           0, 0, 0,  10, 0, 0, 0,  0,   // should be kept
+                           0, 0, 0,  2,  0, 0, 0,  0,   // should be removed
+                           0, 0, 0,  0,  0, 0, -1, 0 }; // should be kept
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, -1, 0,  0, 0, 0,  0, //
+                                              0, 0, 0,  10, 0, 0, 0,  0, //
+                                              0, 0, 0,  0,  0, 0, 0,  0, //
+                                              0, 0, 0,  0,  0, 0, -1, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, RemovesSmallClusterOfCoeffs) {
+   // Small clusters (<= kDropoutContinuityMax) of small coeffs should be
+   // removed, leaving an all-zero block.
+   tran_low_t coeffs[] = { 0, 0, 0, 0, 1, 0, 0, -1, // should be removed
+                           0, 0, 0, 0, 0, 0, 0, 0,  //
+                           0, 0, 0, 0, 0, 0, 1, 0,  // should be removed
+                           0, 0, 0, 0, 0, 0, 0, 0 };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, KeepsLargeClusterOfCoeffs) {
+   // Large clusters (> kDropoutContinuityMax) of small coeffs should be kept;
+   // the lone -2 further along is still expected to be dropped.
+   tran_low_t coeffs[] = { 0, 0, 0, 0, 1, 0,  1, -1, // should be kept
+                           0, 0, 0, 0, 0, 0,  0, 0,  //
+                           0, 0, 0, 0, 0, -2, 0, 0,  // should be removed
+                           0, 0, 0, 0, 0, 0,  0, 0 };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/6,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, 0, 0, 1, 0, 1, -1, //
+                                              0, 0, 0, 0, 0, 0, 0, 0,  //
+                                              0, 0, 0, 0, 0, 0, 0, 0,  //
+                                              0, 0, 0, 0, 0, 0, 0, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ TEST(DropoutTest, NumBeforeLargerThanNumAfter) {
+   // The second coeff (-2) doesn't seem to meet the dropout_num_before
+   // criteria. But since the first coeff (1) will be dropped, it will meet
+   // the criteria and should be dropped too.
+   tran_low_t coeffs[] = { 0,  0, 0, 0, 1, 0, 0, 0, // should be removed
+                           -2, 0, 0, 0, 0, 0, 0, 0, // should be removed
+                           0,  0, 0, 0, 0, 0, 0, 0, //
+                           0,  0, 0, 0, 0, 0, 0, 0 };
+   Dropout(TX_8X4, kTxType, /*dropout_num_before=*/4, /*dropout_num_after=*/2,
+           coeffs);
+   const std::vector<tran_low_t> expected = { 0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0, //
+                                              0, 0, 0, 0, 0, 0, 0, 0 };
+   ExpectArrayEq(coeffs, expected);
+ }
+
+ // More complex test combining other test cases, run on an 8x8 block
+ // (64 coefficients in scan order) with dropout_num_before = 4 and
+ // dropout_num_after = 2.
+ TEST(DropoutTest, ComplexTest) {
+ const TX_SIZE tx_size = TX_8X8;
+ const uint32_t dropout_num_before = 4;
+ const uint32_t dropout_num_after = 2;
+ tran_low_t qcoeff_scan[] = { 1, 12, 0, 0, 0, 0, 1, 0, //
+ 0, 0, 0, -12, 0, 0, 0, 1, //
+ 0, 0, -2, 0, 1, 0, 0, 1, //
+ 0, 0, 0, 0, 5, 0, -1, 0, //
+ 0, 0, 0, 1, 0, 0, 0, -1, //
+ 0, 0, 0, 0, 2, 0, 0, 0, //
+ 0, 1, 0, 0, 0, 5, 0, 0, //
+ 0, 0, 1, 1, 0, 0, 0, -2 };
+ Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+ // Expected result: the values at scan positions 6, 35, 39, 44, 49, 58 and 59
+ // are zeroed; every other coefficient is unchanged.
+ ExpectArrayEq(qcoeff_scan, { 1, 12, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, -12, 0, 0, 0, 1, //
+ 0, 0, -2, 0, 1, 0, 0, 1, //
+ 0, 0, 0, 0, 5, 0, -1, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, 0, //
+ 0, 0, 0, 0, 0, 5, 0, 0, //
+ 0, 0, 0, 0, 0, 0, 0, -2 });
+ }
+
+} // namespace
diff --git a/third_party/aom/test/encodetxb_test.cc b/third_party/aom/test/encodetxb_test.cc
new file mode 100644
index 0000000000..49b0fba94a
--- /dev/null
+++ b/third_party/aom/test/encodetxb_test.cc
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/idct.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+ // Signature of av1_get_nz_map_contexts_c and of the optimized variants this
+ // file tests against it: reads 'levels', 'scan', 'eob', 'tx_size' and
+ // 'tx_class' and writes its result into 'coeff_contexts'.
+ typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels,
+ const int16_t *const scan,
+ const uint16_t eob, const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts);
+
+ // Parameterized fixture that checks an optimized av1_get_nz_map_contexts
+ // implementation (the test parameter) against av1_get_nz_map_contexts_c, and
+ // optionally times both.
+ class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
+  public:
+   // All raw pointers start out null. GoogleTest still calls TearDown() when
+   // SetUp() stops at a failed ASSERT, so the pointers handed to aom_free()
+   // there must never be indeterminate.
+   EncodeTxbTest()
+       : get_nz_map_contexts_func_(GetParam()), levels_(nullptr),
+         coeff_contexts_ref_(nullptr), coeff_contexts_(nullptr) {}
+
+   ~EncodeTxbTest() override = default;
+
+   // Allocates the 16-byte aligned reference and test output buffers, each
+   // holding MAX_TX_SQUARE contexts.
+   void SetUp() override {
+     coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
+         aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
+     ASSERT_NE(coeff_contexts_ref_, nullptr);
+     coeff_contexts_ = reinterpret_cast<int8_t *>(
+         aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE));
+     ASSERT_NE(coeff_contexts_, nullptr);
+   }
+
+   // Releases the buffers allocated in SetUp(); a pointer that was never
+   // allocated is still null at this point.
+   void TearDown() override {
+     aom_free(coeff_contexts_ref_);
+     aom_free(coeff_contexts_);
+   }
+
+   // Correctness test: for every transform type and size, and every eob from 1
+   // to width * height, feeds identical random input to the C reference and to
+   // the function under test and expects identical contexts. The eob loops
+   // stop at the first mismatch.
+   void GetNzMapContextsRun() {
+     const int kNumTests = 10;
+     int result = 0;
+
+     for (int is_inter = 0; is_inter < 2; ++is_inter) {
+       for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+         const TX_CLASS tx_class = tx_type_to_class[tx_type];
+         for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+           const int bhl = get_txb_bhl((TX_SIZE)tx_size);
+           const int width = get_txb_wide((TX_SIZE)tx_size);
+           const int height = get_txb_high((TX_SIZE)tx_size);
+           const int real_width = tx_size_wide[tx_size];
+           const int real_height = tx_size_high[tx_size];
+           const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+
+           levels_ = set_levels(levels_buf_, height);
+           for (int i = 0; i < kNumTests && !result; ++i) {
+             for (int eob = 1; eob <= width * height && !result; ++eob) {
+               InitDataWithEob(scan, bhl, eob);
+
+               av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                         tx_class, coeff_contexts_ref_);
+               get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+                                         tx_class, coeff_contexts_);
+
+               result = Compare(scan, eob);
+
+               EXPECT_EQ(result, 0)
+                   << " tx_class " << (int)tx_class << " width " << real_width
+                   << " height " << real_height << " eob " << eob;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   // Speed test: for each transform size, times the C reference and the
+   // function under test on the largest eob (width * height) with DCT_DCT, and
+   // prints both timings and their ratio. The iteration count is scaled down
+   // by the block area.
+   void SpeedTestGetNzMapContextsRun() {
+     const int kNumTests = 2000000000;
+     aom_usec_timer timer;
+     aom_usec_timer timer_ref;
+
+     printf("Note: Only test the largest possible eob case!\n");
+     for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+       const int bhl = get_txb_bhl((TX_SIZE)tx_size);
+       const int width = get_txb_wide((TX_SIZE)tx_size);
+       const int height = get_txb_high((TX_SIZE)tx_size);
+       const int real_width = tx_size_wide[tx_size];
+       const int real_height = tx_size_high[tx_size];
+       const TX_TYPE tx_type = DCT_DCT;
+       const TX_CLASS tx_class = tx_type_to_class[tx_type];
+       const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+       const int eob = width * height;
+       const int numTests = kNumTests / (width * height);
+
+       levels_ = set_levels(levels_buf_, height);
+       InitDataWithEob(scan, bhl, eob);
+
+       aom_usec_timer_start(&timer_ref);
+       for (int i = 0; i < numTests; ++i) {
+         av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                   tx_class, coeff_contexts_ref_);
+       }
+       aom_usec_timer_mark(&timer_ref);
+
+       levels_ = set_levels(levels_buf_, height);
+       InitDataWithEob(scan, bhl, eob);
+
+       aom_usec_timer_start(&timer);
+       for (int i = 0; i < numTests; ++i) {
+         get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+                                   tx_class, coeff_contexts_);
+       }
+       aom_usec_timer_mark(&timer);
+
+       const int elapsed_time_ref =
+           static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
+       const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+       printf("get_nz_map_contexts_%2dx%2d: %7.1f ms ref %7.1f ms gain %4.2f\n",
+              real_width, real_height, elapsed_time / 1000.0,
+              elapsed_time_ref / 1000.0,
+              (elapsed_time_ref * 1.0) / (elapsed_time * 1.0));
+     }
+   }
+
+  private:
+   // Clears the levels buffer and the test output, then fills the first 'eob'
+   // scan positions with random levels (clamped to [0, INT8_MAX]) and random
+   // initial contexts. The reference output is set to a copy of the test
+   // output so both start from identical contents.
+   void InitDataWithEob(const int16_t *const scan, const int bhl,
+                        const int eob) {
+     memset(levels_buf_, 0, sizeof(levels_buf_));
+     memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+
+     for (int c = 0; c < eob; ++c) {
+       levels_[get_padded_idx(scan[c], bhl)] =
+           static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX));
+       coeff_contexts_[scan[c]] = static_cast<int8_t>(rnd_.Rand16() >> 1);
+     }
+
+     memcpy(coeff_contexts_ref_, coeff_contexts_,
+            sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+   }
+
+   // Returns true if the two outputs differ at any of the first 'eob' scan
+   // positions, printing the first such difference. Differences outside those
+   // positions are not reported.
+   bool Compare(const int16_t *const scan, const int eob) const {
+     bool result = false;
+     if (memcmp(coeff_contexts_, coeff_contexts_ref_,
+                sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) {
+       for (int i = 0; i < eob; i++) {
+         const int pos = scan[i];
+         if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) {
+           printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos,
+                  coeff_contexts_ref_[pos], coeff_contexts_[pos]);
+           result = true;
+           break;
+         }
+       }
+     }
+     return result;
+   }
+
+   GetNzMapContextsFunc get_nz_map_contexts_func_;  // function under test
+   ACMRandom rnd_;
+   uint8_t levels_buf_[TX_PAD_2D];  // backing storage for levels_
+   uint8_t *levels_;                // set_levels() view into levels_buf_
+   int8_t *coeff_contexts_ref_;     // output of the C reference
+   int8_t *coeff_contexts_;         // output of the function under test
+ };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbTest);
+
+ // Correctness comparison against the C reference.
+ TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
+
+ // Timing comparison; the DISABLED_ prefix keeps it out of default runs.
+ TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) {
+ SpeedTestGetNzMapContextsRun();
+ }
+
+ // Each optimized implementation is instantiated only when the build enables
+ // the matching instruction set.
+ #if HAVE_SSE2
+ INSTANTIATE_TEST_SUITE_P(SSE2, EncodeTxbTest,
+ ::testing::Values(av1_get_nz_map_contexts_sse2));
+ #endif
+
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(NEON, EncodeTxbTest,
+ ::testing::Values(av1_get_nz_map_contexts_neon));
+ #endif
+
+ // Signature of av1_txb_init_levels_c and of the optimized variants compared
+ // against it in RunTest().
+ typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
+ const int width, const int height,
+ uint8_t *const levels);
+
+ // Test parameter: (function under test, transform size as an int).
+ typedef std::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
+
+ // Fixture comparing an optimized av1_txb_init_levels implementation with the
+ // C reference for one transform size.
+ class EncodeTxbInitLevelTest
+ : public ::testing::TestWithParam<TxbInitLevelParam> {
+ public:
+ ~EncodeTxbInitLevelTest() override = default;
+ // Runs the comparison for 'tx_size'; a nonzero 'is_speed' additionally
+ // repeats both functions and prints their timings.
+ void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
+ };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
+
+ // Runs av1_txb_init_levels_c and 'test_func' on the same random coefficients
+ // for transform size 'tx_size' and asserts that both leave identical contents
+ // in the compared region of their levels buffers. With a nonzero 'is_speed'
+ // each function is repeated (width * height) * 10000 times and the timings
+ // are printed.
+ void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
+                                      int tx_size, int is_speed) {
+   const int width = get_txb_wide((TX_SIZE)tx_size);
+   const int height = get_txb_high((TX_SIZE)tx_size);
+   tran_low_t coeff[MAX_TX_SQUARE];
+
+   uint8_t levels_buf[2][TX_PAD_2D];
+   uint8_t *const levels0 = set_levels(levels_buf[0], height);
+   uint8_t *const levels1 = set_levels(levels_buf[1], height);
+
+   ACMRandom rnd(ACMRandom::DeterministicSeed());
+   for (int i = 0; i < width * height; i++) {
+     coeff[i] = rnd.Rand16Signed();
+   }
+   // Both level buffers are pre-filled with independent random bytes, so any
+   // compared position that a function fails to write shows up as a mismatch.
+   for (int i = 0; i < TX_PAD_2D; i++) {
+     levels_buf[0][i] = rnd.Rand8();
+     levels_buf[1][i] = rnd.Rand8();
+   }
+   const int run_times = is_speed ? (width * height) * 10000 : 1;
+   aom_usec_timer timer;
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < run_times; ++i) {
+     av1_txb_init_levels_c(coeff, width, height, levels0);
+   }
+   const double t1 = get_time_mark(&timer);
+   aom_usec_timer_start(&timer);
+   for (int i = 0; i < run_times; ++i) {
+     test_func(coeff, width, height, levels1);
+   }
+   const double t2 = get_time_mark(&timer);
+   if (is_speed) {
+     printf("init %3dx%-3d:%7.2f/%7.2fns", width, height, t1, t2);
+     printf("(%3.2f)\n", t1 / t2);
+   }
+   // Compare (height + TX_PAD_VER) rows of (width + TX_PAD_HOR) bytes, starting
+   // at the beginning of each backing buffer.
+   const int stride = width + TX_PAD_HOR;
+   for (int r = 0; r < height + TX_PAD_VER; ++r) {
+     for (int c = 0; c < stride; ++c) {
+       // A space separates run_times from the block size; without it the two
+       // numbers ran together in the failure message (e.g. "14x4").
+       ASSERT_EQ(levels_buf[0][c + r * stride], levels_buf[1][c + r * stride])
+           << "[" << r << "," << c << "] " << run_times << " " << width << "x"
+           << height;
+     }
+   }
+ }
+
+ // Single-run correctness comparison (is_speed = 0).
+ TEST_P(EncodeTxbInitLevelTest, match) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+ }
+
+ // Timing run (is_speed = 1); the DISABLED_ prefix keeps it out of default
+ // runs.
+ TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+ }
+
+ // Each available SIMD variant is combined with every transform size in
+ // [0, TX_SIZES_ALL).
+ #if HAVE_SSE4_1
+ INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+ #endif
+ #if HAVE_AVX2
+ INSTANTIATE_TEST_SUITE_P(
+ AVX2, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+ #endif
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(
+ NEON, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_neon),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+ #endif
+} // namespace
diff --git a/third_party/aom/test/end_to_end_psnr_test.cc b/third_party/aom/test/end_to_end_psnr_test.cc
new file mode 100644
index 0000000000..687308da8c
--- /dev/null
+++ b/third_party/aom/test/end_to_end_psnr_test.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+ // Dimensions and frame rate passed to YUVVideoSource for inputs that are not
+ // .y4m files.
+ const unsigned int kWidth = 160;
+ const unsigned int kHeight = 90;
+ const unsigned int kFramerate = 50;
+ // Frame limit passed to the video source.
+ const unsigned int kFrames = 10;
+ // Assigned to cfg_.rc_target_bitrate.
+ const int kBitrate = 500;
+ // CQ level applied in all-intra mode.
+ const unsigned int kCqLevel = 18;
+ // List of psnr thresholds for speed settings 0-8 and 4 encoding modes.
+ // Indexed as kPsnrThreshold[cpu_used][encoding mode], where the second index
+ // is the numeric value of libaom_test::TestMode.
+ const double kPsnrThreshold[][4] = {
+ { 34.9, 44.4, 39.5, 41.9 }, { 34.9, 44.4, 39.5, 41.9 },
+ { 34.9, 44.4, 39.4, 41.9 }, { 34.9, 44.4, 39.1, 41.8 },
+ { 34.9, 44.4, 39.1, 41.8 }, { 34.9, 44.29, 38.5, 41.8 },
+ { 34.9, 44.3, 38.5, 41.3 }, { 34.9, 44.3, 38.5, 40.8 },
+ { 34.9, 44.3, 38.5, 40.8 }
+ };
+
+ // Describes one input clip and the encoder settings derived from it.
+ typedef struct {
+ const char *filename; // clip passed to the video source
+ unsigned int input_bit_depth; // copied to cfg_.g_input_bit_depth
+ aom_img_fmt fmt; // image format, used for raw YUV sources
+ aom_bit_depth_t bit_depth; // copied to cfg_.g_bit_depth
+ unsigned int profile; // copied to cfg_.g_profile
+ } TestVideoParam;
+
+ // Streams a readable one-line description of 'test_arg'.
+ std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+   os << "TestVideoParam { filename:" << test_arg.filename;
+   os << " input_bit_depth:" << test_arg.input_bit_depth;
+   os << " fmt:" << test_arg.fmt;
+   os << " bit_depth:" << test_arg.bit_depth;
+   os << " profile:" << test_arg.profile << " }";
+   return os;
+ }
+
+ // Clips under test: { filename, input bit depth, image format, codec bit
+ // depth, profile }. The 10- and 12-bit entries exist only when
+ // CONFIG_AV1_HIGHBITDEPTH is enabled. Instantiations below refer to entries
+ // by index (0 = 8-bit 420, 2 = 8-bit 444), so keep the order stable.
+ const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+ #if CONFIG_AV1_HIGHBITDEPTH
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+ #endif
+ };
+
+ // Encoding modes tested by EndToEndTestLarge.
+ const libaom_test::TestMode kEncodingModeVectors[] = {
+ ::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood,
+ ::libaom_test::kRealTime,
+ };
+
+ // Speed settings (cpu_used values) tested by EndToEndTestLarge.
+ const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+
+ // End-to-end fixture: encodes a clip using the parameterized encoding mode,
+ // clip description and cpu_used value, averages the PSNR reported in the
+ // encoder's PSNR packets and expects it to exceed 98% of the matching
+ // kPsnrThreshold entry.
+ class EndToEndTest
+     : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                  TestVideoParam, int>,
+       public ::libaom_test::EncoderTest {
+  protected:
+   EndToEndTest()
+       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)),
+         cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
+         encoding_mode_(GET_PARAM(1)) {}
+
+   ~EndToEndTest() override = default;
+
+   // Applies the mode's base configuration, then the mode-specific tweaks:
+   // a lag of 5 frames for the good-quality modes, buffer sizes for real-time.
+   void SetUp() override {
+     InitializeConfig(encoding_mode_);
+     switch (encoding_mode_) {
+       case ::libaom_test::kOnePassGood:
+       case ::libaom_test::kTwoPassGood: cfg_.g_lag_in_frames = 5; break;
+       case ::libaom_test::kRealTime:
+         cfg_.rc_buf_sz = 1000;
+         cfg_.rc_buf_initial_sz = 500;
+         cfg_.rc_buf_optimal_sz = 600;
+         break;
+       default: break;
+     }
+   }
+
+   // Restarts the PSNR accumulation at the beginning of every pass.
+   void BeginPassHook(unsigned int) override {
+     nframes_ = 0;
+     psnr_ = 0.0;
+   }
+
+   // Accumulates the first PSNR value of each PSNR packet.
+   void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+     ++nframes_;
+     psnr_ += pkt->data.psnr.psnr[0];
+   }
+
+   // Sends the encoder controls once, just before the first frame.
+   void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                           ::libaom_test::Encoder *encoder) override {
+     if (video->frame() != 0) return;
+
+     const bool is_two_pass = encoding_mode_ == ::libaom_test::kTwoPassGood;
+     const bool is_good_quality =
+         encoding_mode_ == ::libaom_test::kOnePassGood || is_two_pass;
+
+     encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+     encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+     encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+     // Test screen coding tools at cpu_used = 1 && encoding mode is two-pass.
+     encoder->Control(AV1E_SET_TUNE_CONTENT, (cpu_used_ == 1 && is_two_pass)
+                                                 ? AOM_CONTENT_SCREEN
+                                                 : AOM_CONTENT_DEFAULT);
+     if (is_good_quality) {
+       encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+       encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+       encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+     } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+       encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+     }
+   }
+
+   // Mean of the accumulated PSNR values, or 0.0 when no packet was seen.
+   double GetAveragePsnr() const {
+     return nframes_ ? psnr_ / nframes_ : 0.0;
+   }
+
+   // Threshold for the current speed setting and encoding mode.
+   double GetPsnrThreshold() {
+     return kPsnrThreshold[cpu_used_][encoding_mode_];
+   }
+
+   // Configures the encoder from the clip description, runs the encode loop
+   // and checks the average PSNR against the threshold.
+   void DoTest() {
+     cfg_.rc_target_bitrate = kBitrate;
+     cfg_.g_error_resilient = 0;
+     cfg_.g_profile = test_video_param_.profile;
+     cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+     cfg_.g_bit_depth = test_video_param_.bit_depth;
+     init_flags_ = AOM_CODEC_USE_PSNR;
+     if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+     // .y4m inputs go through the Y4M reader; anything else is read as raw
+     // YUV with the dimensions and frame rate fixed in this file.
+     std::unique_ptr<libaom_test::VideoSource> video;
+     if (!is_extension_y4m(test_video_param_.filename)) {
+       video.reset(new libaom_test::YUVVideoSource(
+           test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+           kFramerate, 1, 0, kFrames));
+     } else {
+       video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                   kFrames));
+     }
+     ASSERT_NE(video, nullptr);
+
+     ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+     const double psnr = GetAveragePsnr();
+     EXPECT_GT(psnr, GetPsnrThreshold() * 0.98)
+         << "cpu used = " << cpu_used_ << ", encoding mode = " << encoding_mode_;
+   }
+
+   TestVideoParam test_video_param_;
+   int cpu_used_;
+
+  private:
+   double psnr_;
+   unsigned int nframes_;
+   libaom_test::TestMode encoding_mode_;
+ };
+
+ // The derived fixtures add no behaviour; they exist so each parameter set
+ // below is instantiated under its own suite name.
+ class EndToEndTestLarge : public EndToEndTest {};
+
+ class EndToEndAllIntraTestLarge : public EndToEndTest {};
+
+ class EndToEndAllIntraTest : public EndToEndTest {};
+
+ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+ TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+ TEST_P(EndToEndAllIntraTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+ TEST_P(EndToEndAllIntraTest, EndtoEndPSNRTest) { DoTest(); }
+
+ // Every listed mode x every clip x every listed speed.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
+ ::testing::ValuesIn(kEncodingModeVectors),
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::ValuesIn(kCpuUsedVectors));
+
+ // Single two-pass configuration.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(kTestVectors[2]), // 444
+ ::testing::Values(3)); // cpu_used
+
+ // All-intra mode on every clip at four speeds.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Values(2, 4, 6, 8)); // cpu_used
+
+ // Single all-intra configuration.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(kTestVectors[0]), // 420
+ ::testing::Values(6)); // cpu_used
+} // namespace
diff --git a/third_party/aom/test/end_to_end_qmpsnr_test.cc b/third_party/aom/test/end_to_end_qmpsnr_test.cc
new file mode 100644
index 0000000000..7a755a7a51
--- /dev/null
+++ b/third_party/aom/test/end_to_end_qmpsnr_test.cc
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+ // Frame limit passed to the video source.
+ const unsigned int kFrames = 10;
+ // CQ level sent to the encoder before the first frame.
+ const unsigned int kCqLevel = 18;
+ // List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+ // Indexed by cpu_used.
+ const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+ 83.0, 82.3, 81.1, 81.1 };
+
+ // Describes one input clip and the encoder settings derived from it.
+ typedef struct {
+ const char *filename; // clip passed to Y4mVideoSource
+ unsigned int input_bit_depth; // copied to cfg_.g_input_bit_depth
+ aom_img_fmt fmt; // image format; only printed by operator<< in this file
+ aom_bit_depth_t bit_depth; // copied to cfg_.g_bit_depth
+ unsigned int profile; // copied to cfg_.g_profile
+ } TestVideoParam;
+
+ // Streams a readable one-line description of 'test_arg'.
+ std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+   os << "TestVideoParam { filename:" << test_arg.filename;
+   os << " input_bit_depth:" << test_arg.input_bit_depth;
+   os << " fmt:" << test_arg.fmt;
+   os << " bit_depth:" << test_arg.bit_depth;
+   os << " profile:" << test_arg.profile << " }";
+   return os;
+ }
+
+ // Clips under test: { filename, input bit depth, image format, codec bit
+ // depth, profile }. The 10- and 12-bit entries exist only when
+ // CONFIG_AV1_HIGHBITDEPTH is enabled. An instantiation below refers to entry
+ // 0 (8-bit 420) by index, so keep the order stable.
+ const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+ #if CONFIG_AV1_HIGHBITDEPTH
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+ #endif
+ };
+
+ // This class is used to check adherence to given ssim value, while using the
+ // "dist-metric=qm-psnr" option.
+ // Per-frame SSIM is accumulated in CalculateFrameLevelSSIM() and the sequence
+ // score from GetAverageSsim() is compared against kSsimThreshold[cpu_used].
+ class EndToEndQMPSNRTest
+ : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+ TestVideoParam, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ EndToEndQMPSNRTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+ ssim_(0.0) {}
+
+ ~EndToEndQMPSNRTest() override = default;
+
+ void SetUp() override { InitializeConfig(encoding_mode_); }
+
+ // Restarts the SSIM accumulation at the beginning of every pass.
+ void BeginPassHook(unsigned int) override {
+ nframes_ = 0;
+ ssim_ = 0.0;
+ }
+
+ // Computes a weighted SSIM (0.8 * Y + 0.1 * U + 0.1 * V) between the source
+ // and encoded images and adds it to ssim_. Uses the high bit depth routine
+ // when that support is compiled in and bit_depth exceeds AOM_BITS_8, and the
+ // low bit depth routine otherwise.
+ void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+ const aom_image_t *img_enc,
+ aom_bit_depth_t bit_depth,
+ unsigned int input_bit_depth) override {
+ double frame_ssim;
+ double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+ int crop_widths[PLANE_TYPES];
+ int crop_heights[PLANE_TYPES];
+ crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+ crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+ // Width of UV planes calculated based on chroma_shift values.
+ // NOTE(review): the luma size comes from d_w/d_h while the chroma size is
+ // derived from w/h; confirm the two are meant to differ.
+ crop_widths[PLANE_TYPE_UV] =
+ img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w;
+ crop_heights[PLANE_TYPE_UV] =
+ img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h;
+ nframes_++;
+
+ #if CONFIG_AV1_HIGHBITDEPTH
+ uint8_t is_hbd = bit_depth > AOM_BITS_8;
+ if (is_hbd) {
+ // HBD ssim calculation.
+ uint8_t shift = bit_depth - input_bit_depth;
+ for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+ const int is_uv = i > AOM_PLANE_Y;
+ // Strides are shifted right by is_hbd (1 here), i.e. halved.
+ plane_ssim[i] = aom_highbd_ssim2(
+ CONVERT_TO_BYTEPTR(img_src->planes[i]),
+ CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+ img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+ crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+ }
+ frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+ .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+ // Accumulate to find sequence level ssim value.
+ ssim_ += frame_ssim;
+ return;
+ }
+ #else
+ // Without high bit depth support these parameters are unused.
+ (void)bit_depth;
+ (void)input_bit_depth;
+ #endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // LBD ssim calculation.
+ for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+ const int is_uv = i > AOM_PLANE_Y;
+ plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+ img_src->stride[is_uv], img_enc->stride[is_uv],
+ crop_widths[is_uv], crop_heights[is_uv]);
+ }
+ frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+ .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+ // Accumulate to find sequence level ssim value.
+ ssim_ += frame_ssim;
+ }
+
+ // Sends the encoder controls and the "dist-metric" option once, just before
+ // the first frame.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+ encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+ encoder->SetOption("dist-metric", "qm-psnr");
+ }
+ }
+
+ // Sequence score: 100 * (mean frame SSIM) ^ 8, or 0.0 when no frame was
+ // measured.
+ // NOTE(review): pow() is used here but this file includes no math header
+ // directly; it relies on a transitive include.
+ double GetAverageSsim() const {
+ if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+ return 0.0;
+ }
+
+ // Threshold for the current speed setting.
+ double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; }
+
+ // Configures the encoder from the clip description, runs the encode loop and
+ // checks the sequence score against the threshold.
+ void DoTest() {
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ std::unique_ptr<libaom_test::VideoSource> video(
+ new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double ssim = GetAverageSsim();
+ EXPECT_GT(ssim, GetSsimThreshold())
+ << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+ }
+
+ private:
+ const libaom_test::TestMode encoding_mode_;
+ const TestVideoParam test_video_param_;
+ const int cpu_used_;
+ unsigned int nframes_; // frames measured in the current pass
+ double ssim_; // sum of per-frame weighted SSIM in the current pass
+ };
+
+ // Adds no behaviour; exists so the larger parameter set is instantiated
+ // under its own suite name.
+ class EndToEndQMPSNRTestLarge : public EndToEndQMPSNRTest {};
+
+ TEST_P(EndToEndQMPSNRTestLarge, EndtoEndQMPSNRTest) { DoTest(); }
+
+ TEST_P(EndToEndQMPSNRTest, EndtoEndQMPSNRTest) { DoTest(); }
+
+ // All-intra mode on every clip at four speeds.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Values(2, 4, 6, 8)); // cpu_used
+
+ // Single all-intra configuration.
+ AV1_INSTANTIATE_TEST_SUITE(EndToEndQMPSNRTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(kTestVectors[0]), // 420
+ ::testing::Values(6)); // cpu_used
+} // namespace
diff --git a/third_party/aom/test/end_to_end_ssim_test.cc b/third_party/aom/test/end_to_end_ssim_test.cc
new file mode 100644
index 0000000000..f1b0cae75f
--- /dev/null
+++ b/third_party/aom/test/end_to_end_ssim_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const unsigned int kCqLevel = 18;
+// List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+ 83.0, 82.3, 81.1, 81.1 };
+
+// Describes one input clip: where it lives and how it must be encoded.
+struct TestVideoParam {
+  const char *filename;          // Y4M clip name handed to the video source.
+  unsigned int input_bit_depth;  // Copied into cfg_.g_input_bit_depth.
+  aom_img_fmt fmt;               // Image format of the clip.
+  aom_bit_depth_t bit_depth;     // Copied into cfg_.g_bit_depth.
+  unsigned int profile;          // Copied into cfg_.g_profile.
+};
+
+// Streams a human-readable description of a TestVideoParam so that GoogleTest
+// can print the parameter when reporting a test.
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  os << "TestVideoParam { filename:" << test_arg.filename;
+  os << " input_bit_depth:" << test_arg.input_bit_depth;
+  os << " fmt:" << test_arg.fmt;
+  os << " bit_depth:" << test_arg.bit_depth;
+  os << " profile:" << test_arg.profile << " }";
+  return os;
+}
+
+// Clips exercised by the tests. Each entry is
+// { filename, input_bit_depth, fmt, bit_depth, profile }.
+// The 10- and 12-bit clips are only listed when CONFIG_AV1_HIGHBITDEPTH is set.
+const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// This class is used to check adherence to given ssim value.
+//
+// Each instance is parameterized on (encoding mode, test clip, cpu_used). The
+// encoder loop calls CalculateFrameLevelSSIM() per frame; DoTest() then
+// compares the aggregated score with kSsimThreshold[cpu_used].
+class EndToEndSSIMTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndSSIMTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+        ssim_(0.0) {}
+
+  ~EndToEndSSIMTest() override = default;
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  // Restarts the accumulators at the beginning of every encoder pass.
+  void BeginPassHook(unsigned int) override {
+    nframes_ = 0;
+    ssim_ = 0.0;
+  }
+
+  // Computes the SSIM of one frame (source vs. reconstruction) for all three
+  // planes and adds the weighted result to the running sum.
+  void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+                               const aom_image_t *img_enc,
+                               aom_bit_depth_t bit_depth,
+                               unsigned int input_bit_depth) override {
+    double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+    int crop_widths[PLANE_TYPES];
+    int crop_heights[PLANE_TYPES];
+    crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+    crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+    // Chroma dimensions follow from the chroma_shift values. They are derived
+    // from the displayed size (d_w/d_h), like the luma plane, so that any
+    // alignment padding present in the stored size (w/h) is never measured.
+    crop_widths[PLANE_TYPE_UV] = img_src->x_chroma_shift == 1
+                                     ? (img_src->d_w + 1) >> 1
+                                     : img_src->d_w;
+    crop_heights[PLANE_TYPE_UV] = img_src->y_chroma_shift == 1
+                                      ? (img_src->d_h + 1) >> 1
+                                      : img_src->d_h;
+    nframes_++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    const uint8_t is_hbd = bit_depth > AOM_BITS_8;
+    if (is_hbd) {
+      // HBD ssim calculation. Strides are halved (>> is_hbd) before being
+      // passed to aom_highbd_ssim2, as in the original test.
+      const uint8_t shift = bit_depth - input_bit_depth;
+      for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+        const int is_uv = i > AOM_PLANE_Y;
+        plane_ssim[i] = aom_highbd_ssim2(
+            CONVERT_TO_BYTEPTR(img_src->planes[i]),
+            CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+            img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+            crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+      }
+      AccumulateFrameSsim(plane_ssim);
+      return;
+    }
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // LBD ssim calculation.
+    for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+      const int is_uv = i > AOM_PLANE_Y;
+      plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+                                img_src->stride[is_uv], img_enc->stride[is_uv],
+                                crop_widths[is_uv], crop_heights[is_uv]);
+    }
+    AccumulateFrameSsim(plane_ssim);
+  }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+    }
+  }
+
+  // Returns 100 * (mean frame SSIM)^8, or 0.0 when no frame was accumulated.
+  double GetAverageSsim() const {
+    if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+    return 0.0;
+  }
+
+  // Returns the pass threshold for the configured speed. The caller must make
+  // sure cpu_used_ is a valid index into kSsimThreshold (DoTest() asserts it).
+  double GetSsimThreshold() const { return kSsimThreshold[cpu_used_]; }
+
+  // Encodes kFrames frames of the parameterized clip and expects the averaged
+  // SSIM score to exceed the threshold for the configured speed.
+  void DoTest() {
+    // Guard the kSsimThreshold lookup: a speed outside the table would read
+    // past the end of the array instead of failing the test.
+    ASSERT_GE(cpu_used_, 0);
+    ASSERT_LT(cpu_used_, static_cast<int>(sizeof(kSsimThreshold) /
+                                          sizeof(kSsimThreshold[0])));
+
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                        kFrames));
+    ASSERT_NE(video, nullptr);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double ssim = GetAverageSsim();
+    EXPECT_GT(ssim, GetSsimThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+  }
+
+ private:
+  // Combines the per-plane values into one frame value (Y weighted 0.8, U and
+  // V weighted 0.1 each) and adds it to the sequence-level sum.
+  void AccumulateFrameSsim(const double *plane_ssim) {
+    const double frame_ssim =
+        plane_ssim[AOM_PLANE_Y] * .8 +
+        .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+    ssim_ += frame_ssim;
+  }
+
+  const libaom_test::TestMode encoding_mode_;
+  const TestVideoParam test_video_param_;
+  const int cpu_used_;
+  unsigned int nframes_;
+  double ssim_;
+};
+
+// "Large" variant: same fixture, instantiated below over every test vector and
+// several speeds.
+class EndToEndSSIMTestLarge : public EndToEndSSIMTest {};
+
+TEST_P(EndToEndSSIMTestLarge, EndtoEndSSIMTest) { DoTest(); }
+
+TEST_P(EndToEndSSIMTest, EndtoEndSSIMTest) { DoTest(); }
+
+// All-intra mode, all entries of kTestVectors, cpu_used in {2, 4, 6, 8}.
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Values(2, 4, 6, 8)); // cpu_used
+
+// Reduced set: all-intra mode, first test vector only, cpu_used 6.
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(kTestVectors[0]), // 420
+ ::testing::Values(6)); // cpu_used
+} // namespace
diff --git a/third_party/aom/test/error_block_test.cc b/third_party/aom/test/error_block_test.cc
new file mode 100644
index 0000000000..e7cd870a98
--- /dev/null
+++ b/third_party/aom/test/error_block_test.cc
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+using ErrorBlockFunc = int64_t (*)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bps);
+
+using ErrorBlockFunc8Bits = int64_t (*)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz);
+
+using ErrorBlockLpFunc = int64_t (*)(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ intptr_t block_size);
+
+using ErrorBlockParam =
+ std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>;
+
+// Adapts a four-argument 8-bit block-error function to the five-argument
+// ErrorBlockFunc signature. The extra bps argument is only checked to be 8 and
+// is not forwarded.
+template <ErrorBlockFunc8Bits fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bps) {
+ EXPECT_EQ(bps, 8);
+ return fn(coeff, dqcoeff, block_size, ssz);
+}
+
+// Adapts a low-precision (int16_t) block-error function to the ErrorBlockFunc
+// signature. The wrapped function has no ssz output, so *ssz is set to the
+// fixed value -1; the test and reference wrappers therefore always agree on it.
+// The coefficient buffers are reinterpreted as int16_t arrays and block_size is
+// passed through unchanged.
+// NOTE(review): presumably only the first block_size int16_t values of each
+// buffer are read by fn - confirm against the av1_block_error_lp_* kernels.
+template <ErrorBlockLpFunc fn>
+int64_t BlockErrorLpWrapper(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bps) {
+ EXPECT_EQ(bps, 8);
+ *ssz = -1;
+ return fn(reinterpret_cast<const int16_t *>(coeff),
+ reinterpret_cast<const int16_t *>(dqcoeff), block_size);
+}
+
+// Fixture parameterized on (function under test, reference function, bit
+// depth). The members are filled in by SetUp() from the test parameter.
+class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+ ~ErrorBlockTest() override = default;
+ void SetUp() override {
+ error_block_op_ = GET_PARAM(0);
+ ref_error_block_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ protected:
+ aom_bit_depth_t bit_depth_;
+ // Implementation being verified.
+ ErrorBlockFunc error_block_op_;
+ // Implementation whose results are treated as the expected values.
+ ErrorBlockFunc ref_error_block_op_;
+};
+// Lets the suite compile and link even when no INSTANTIATE_TEST_SUITE_P below
+// is enabled for the build.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ErrorBlockTest);
+
+// Feeds same-signed random coefficients of every supported block size to both
+// implementations and requires identical return values and ssz outputs.
+TEST_P(ErrorBlockTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  const int msb = bit_depth_ + 8 - 1;
+  int mismatches = 0;
+  int first_failure = -1;
+  for (int iter = 0; iter < kNumIterations; ++iter) {
+    // All block sizes from 4x4, 8x4 ..64x64
+    const intptr_t block_size = 16 << (iter % 9);
+    for (int idx = 0; idx < block_size; idx++) {
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (!rnd(2)) {
+        // Negative number
+        coeff[idx] = -rnd(1 << msb);
+        dqcoeff[idx] = -rnd(1 << msb);
+      } else {
+        // Positive number
+        coeff[idx] = rnd(1 << msb);
+        dqcoeff[idx] = rnd(1 << msb);
+      }
+    }
+
+    int64_t ref_ssz;
+    const int64_t ref_ret =
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+
+    int64_t ssz;
+    int64_t ret;
+    API_REGISTER_STATE_CHECK(
+        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+
+    const bool mismatch = (ref_ret != ret) || (ref_ssz != ssz);
+    // Remember only the first iteration that disagreed.
+    if (mismatch && mismatches == 0) {
+      first_failure = iter;
+    }
+    if (mismatch) ++mismatches;
+  }
+  EXPECT_EQ(0, mismatches)
+      << "Error: Error Block Test, C output doesn't match optimized output. "
+      << "First failed at test case " << first_failure;
+}
+
+// Compares the two implementations on boundary inputs: every combination of
+// zero / +max_val / -max_val for coeff and dqcoeff, plus a random pattern, at
+// every block size, with max_val shrinking over time.
+TEST_P(ErrorBlockTest, ExtremeValues) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ int max_val = ((1 << msb) - 1);
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ // k selects the fill pattern and advances once every 9 iterations, i.e.
+ // after all 9 block sizes have been visited with the current pattern.
+ int k = (i / 9) % 9;
+
+ // Change the maximum coeff value, to test different bit boundaries
+ // (this condition holds once per 81 iterations).
+ if (k == 8 && (i % 9) == 0) {
+ max_val >>= 1;
+ }
+ block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
+ for (int j = 0; j < block_size; j++) {
+ if (k < 4) {
+ // Test at positive maximum values
+ // (bit 0 of k drives coeff, bit 1 of k drives dqcoeff)
+ coeff[j] = k % 2 ? max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+ } else if (k < 8) {
+ // Test at negative maximum values
+ coeff[j] = k % 2 ? -max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
+ } else {
+ // k == 8: same-signed random values drawn with rnd(1 << 14).
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ API_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);
+ // Record only the first failing iteration.
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+// Benchmark (disabled by default): times the reference and the optimized
+// implementation on random data for every block size and prints both timings
+// together with the speed-up factor.
+TEST_P(ErrorBlockTest, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int64_t ssz;
+  int64_t ref_ssz;
+  const int num_iters = 100000;
+  const int msb = bit_depth_ + 8 - 1;
+  for (int i = 0; i < 9; ++i) {
+    // All block sizes from 4x4, 8x4 ..64x64
+    const intptr_t block_size = 16 << i;
+    for (int k = 0; k < 9; k++) {
+      // The first five rounds use the full coefficient range for the bit
+      // depth, the remaining rounds use 14-bit magnitudes.
+      const int range = (k < 5) ? (1 << msb) : (1 << 14);
+      for (int j = 0; j < block_size; j++) {
+        // coeff and dqcoeff always share the same sign.
+        if (rnd(2)) {
+          // Positive number
+          coeff[j] = rnd(range);
+          dqcoeff[j] = rnd(range);
+        } else {
+          // Negative number
+          coeff[j] = -rnd(range);
+          dqcoeff[j] = -rnd(range);
+        }
+      }
+      aom_usec_timer ref_timer, test_timer;
+
+      aom_usec_timer_start(&ref_timer);
+      for (int iter = 0; iter < num_iters; ++iter) {
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int iter = 0; iter < num_iters; ++iter) {
+        error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_);
+      }
+      aom_usec_timer_mark(&test_timer);
+
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      // Report the speed-up as a real number: an integer quotient would round
+      // e.g. 1.9x down to 1 and would divide by zero if the optimized loop
+      // finished in under one microsecond.
+      const double gain =
+          elapsed_time_simd > 0
+              ? static_cast<double>(elapsed_time_c) / elapsed_time_simd
+              : 0.0;
+
+      printf(
+          " c_time=%d \t simd_time=%d \t "
+          "gain=%.2f \n",
+          elapsed_time_c, elapsed_time_simd, gain);
+    }
+  }
+}
+
+using std::make_tuple;
+
+// Parameter tables, one per SIMD flavour enabled for the build. Every tuple is
+// (function under test, C reference, bit depth). The 8-bit and low-precision
+// kernels are adapted to the common signature through the wrapper templates.
+#if HAVE_SSE2
+const ErrorBlockParam kErrorBlockTestParamsSse2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+ AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+ AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+ AOM_BITS_8),
+#endif
+ make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
+ &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+ make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sse2>,
+ &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
+ ::testing::ValuesIn(kErrorBlockTestParamsSse2));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+const ErrorBlockParam kErrorBlockTestParamsAvx2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+ AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+ AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+ AOM_BITS_8),
+#endif
+ make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
+ &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+ make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_avx2>,
+ &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
+ ::testing::ValuesIn(kErrorBlockTestParamsAvx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c,
+ AOM_BITS_8),
+#endif
+ make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+ &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+ make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
+ &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest,
+ ::testing::ValuesIn(kErrorBlockTestParamsNeon));
+#endif // HAVE_NEON
+
+// The SVE table lists only the 8-bit and low-precision kernels; no
+// high-bitdepth entry is registered here.
+#if HAVE_SVE
+const ErrorBlockParam kErrorBlockTestParamsSVE[] = {
+ make_tuple(&BlockError8BitWrapper<av1_block_error_sve>,
+ &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+ make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sve>,
+ &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SVE, ErrorBlockTest,
+ ::testing::ValuesIn(kErrorBlockTestParamsSVE));
+#endif // HAVE_SVE
+} // namespace
diff --git a/third_party/aom/test/error_resilience_test.cc b/third_party/aom/test/error_resilience_test.cc
new file mode 100644
index 0000000000..d41884df2b
--- /dev/null
+++ b/third_party/aom/test/error_resilience_test.cc
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxErrorFrames = 12;
+const int kMaxInvisibleErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
+const int kMaxErrorResilientFrames = 12;
+const int kMaxNoMFMVFrames = 12;
+const int kMaxPrimRefNoneFrames = 12;
+const int kMaxSFrames = 12;
+const int kCpuUsed = 1;
+
+// Encoder test fixture that can mark selected frames as droppable,
+// error-resilient, no-MFMV, PRIMARY_REF_NONE or S-frames, and can withhold
+// selected frames from the decoder to simulate loss. Frame numbers are
+// registered through the Set*Frames() methods; each list holds at most the
+// matching kMax* number of entries.
+//
+// Parameterized on (encoding mode, enable auto alt-ref).
+class ErrorResilienceTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ErrorResilienceTestLarge()
+      : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0),
+        decoded_nframes_(0), mismatch_psnr_(0.0), mismatch_nframes_(0),
+        encoding_mode_(GET_PARAM(1)), allow_mismatch_(0),
+        enable_altref_(GET_PARAM(2)) {
+    Reset();
+  }
+
+  ~ErrorResilienceTestLarge() override = default;
+
+  // Empties every registered frame list.
+  void Reset() {
+    error_nframes_ = 0;
+    invisible_error_nframes_ = 0;
+    droppable_nframes_ = 0;
+    error_resilient_nframes_ = 0;
+    nomfmv_nframes_ = 0;
+    prim_ref_none_nframes_ = 0;
+    s_nframes_ = 0;
+  }
+
+  // Applies the encoder configuration shared by all tests: the given target
+  // bitrate and lag, key frame placement disabled, PSNR packets enabled.
+  void SetupEncoder(int bitrate, int lag) {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = bitrate;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+    cfg_.g_lag_in_frames = lag;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  // Restarts the per-pass statistics.
+  void BeginPassHook(unsigned int /*pass*/) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+    decoded_nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  // Clears the per-frame encode flags and then sets the ones requested for
+  // the current frame by the registered frame lists. The lists are only
+  // honoured in the last (or only) encoder pass.
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, enable_altref_);
+    }
+    frame_flags_ &=
+        ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+          AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
+          AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_SET_PRIMARY_REF_NONE);
+
+    const unsigned int frame = video->frame();
+    const bool final_pass = IsFinalPass();
+
+    if (final_pass &&
+        IsListedFrame(droppable_frames_, droppable_nframes_, frame)) {
+      std::cout << " Encoding droppable frame: " << frame << "\n";
+      frame_flags_ |= (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                       AOM_EFLAG_NO_UPD_ARF);
+    }
+
+    if (final_pass && IsListedFrame(error_resilient_frames_,
+                                    error_resilient_nframes_, frame)) {
+      std::cout << " Encoding error_resilient frame: " << frame << "\n";
+      frame_flags_ |= AOM_EFLAG_ERROR_RESILIENT;
+    }
+
+    if (final_pass && IsListedFrame(nomfmv_frames_, nomfmv_nframes_, frame)) {
+      std::cout << " Encoding no mfmv frame: " << frame << "\n";
+      frame_flags_ |= AOM_EFLAG_NO_REF_FRAME_MVS;
+    }
+
+    if (final_pass &&
+        IsListedFrame(prim_ref_none_frames_, prim_ref_none_nframes_, frame)) {
+      std::cout << " Encoding no PRIMARY_REF_NONE frame: " << frame << "\n";
+      frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE;
+    }
+
+    // The S-frame mode control is issued for every frame, before the
+    // per-frame S-frame flag is considered.
+    encoder->Control(AV1E_SET_S_FRAME_MODE, 0);
+    if (final_pass && IsListedFrame(s_frames_, s_nframes_, frame)) {
+      std::cout << " Encoding S frame: " << frame << "\n";
+      frame_flags_ |= AOM_EFLAG_SET_S_FRAME;
+    }
+  }
+
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    // Check that the encode frame flags are correctly reflected
+    // in the output frame flags.
+    const int encode_flags = pkt->data.frame.flags >> 16;
+    const int no_update_flags =
+        AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+    if ((encode_flags & no_update_flags) == no_update_flags) {
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE,
+                AOM_FRAME_IS_DROPPABLE);
+    }
+    if (encode_flags & AOM_EFLAG_SET_S_FRAME) {
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH,
+                AOM_FRAME_IS_SWITCH);
+    }
+    if (encode_flags & AOM_EFLAG_ERROR_RESILIENT) {
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT,
+                AOM_FRAME_IS_ERROR_RESILIENT);
+    }
+  }
+
+  // Mean PSNR over the PSNR packets seen in the current pass (0.0 if none).
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  // Mean PSNR over the mismatching frames seen so far (0.0 if none).
+  double GetAverageMismatchPsnr() const {
+    if (mismatch_nframes_) return mismatch_psnr_ / mismatch_nframes_;
+    return 0.0;
+  }
+
+  // Returns false when the most recently counted frame (nframes_ - 1) is
+  // registered as an error frame, so that it is not passed to the decoder.
+  bool DoDecode() const override {
+    const unsigned int frame = nframes_ - 1;
+    if (IsFinalPass() && IsListedFrame(error_frames_, error_nframes_, frame)) {
+      std::cout << " Skipping decoding frame: " << frame << "\n";
+      return false;
+    }
+    return true;
+  }
+
+  // Same as DoDecode(), but consults the invisible-error frame list.
+  bool DoDecodeInvisible() const override {
+    const unsigned int frame = nframes_ - 1;
+    if (IsFinalPass() && IsListedFrame(invisible_error_frames_,
+                                       invisible_error_nframes_, frame)) {
+      std::cout << " Skipping decoding all invisible frames in "
+                   "frame pkt: "
+                << frame << "\n";
+      return false;
+    }
+    return true;
+  }
+
+  // When mismatches are allowed they are only counted (together with their
+  // PSNR); otherwise the base class handler is invoked.
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
+    if (allow_mismatch_) {
+      double mismatch_psnr = compute_psnr(img1, img2);
+      mismatch_psnr_ += mismatch_psnr;
+      ++mismatch_nframes_;
+      // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+    } else {
+      ::libaom_test::EncoderTest::MismatchHook(img1, img2);
+    }
+  }
+
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    (void)img;
+    (void)pts;
+    ++decoded_nframes_;
+  }
+
+  // Each setter below copies the first `num` entries of `list` into the
+  // matching internal array; `num` is clamped to [0, kMax*].
+
+  void SetErrorFrames(int num, unsigned int *list) {
+    error_nframes_ = StoreFrameList(num, kMaxErrorFrames, list, error_frames_);
+  }
+
+  void SetInvisibleErrorFrames(int num, unsigned int *list) {
+    invisible_error_nframes_ = StoreFrameList(
+        num, kMaxInvisibleErrorFrames, list, invisible_error_frames_);
+  }
+
+  void SetDroppableFrames(int num, unsigned int *list) {
+    droppable_nframes_ =
+        StoreFrameList(num, kMaxDroppableFrames, list, droppable_frames_);
+  }
+
+  void SetErrorResilientFrames(int num, unsigned int *list) {
+    error_resilient_nframes_ = StoreFrameList(
+        num, kMaxErrorResilientFrames, list, error_resilient_frames_);
+  }
+
+  void SetNoMFMVFrames(int num, unsigned int *list) {
+    nomfmv_nframes_ =
+        StoreFrameList(num, kMaxNoMFMVFrames, list, nomfmv_frames_);
+  }
+
+  void SetPrimaryRefNoneFrames(int num, unsigned int *list) {
+    prim_ref_none_nframes_ = StoreFrameList(num, kMaxPrimRefNoneFrames, list,
+                                            prim_ref_none_frames_);
+  }
+
+  void SetSFrames(int num, unsigned int *list) {
+    s_nframes_ = StoreFrameList(num, kMaxSFrames, list, s_frames_);
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetEncodedFrames() { return nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
+  void SetAllowMismatch(int allow) { allow_mismatch_ = allow; }
+
+ private:
+  // True in the pass whose output is actually emitted (last or single pass).
+  bool IsFinalPass() const {
+    return cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS;
+  }
+
+  // Returns true when `frame` is among the first `count` entries of `list`.
+  static bool IsListedFrame(const unsigned int *list, unsigned int count,
+                            unsigned int frame) {
+    for (unsigned int i = 0; i < count; ++i) {
+      if (list[i] == frame) return true;
+    }
+    return false;
+  }
+
+  // Clamps `num` to [0, max_frames], copies that many entries from `list` to
+  // `dest` and returns the number of entries stored.
+  static unsigned int StoreFrameList(int num, int max_frames,
+                                     const unsigned int *list,
+                                     unsigned int *dest) {
+    if (num > max_frames)
+      num = max_frames;
+    else if (num < 0)
+      num = 0;
+    for (int i = 0; i < num; ++i) dest[i] = list[i];
+    return static_cast<unsigned int>(num);
+  }
+
+  double psnr_;
+  unsigned int nframes_;
+  unsigned int decoded_nframes_;
+  unsigned int error_nframes_;
+  unsigned int invisible_error_nframes_;
+  unsigned int droppable_nframes_;
+  unsigned int error_resilient_nframes_;
+  unsigned int nomfmv_nframes_;
+  unsigned int prim_ref_none_nframes_;
+  unsigned int s_nframes_;
+  double mismatch_psnr_;
+  unsigned int mismatch_nframes_;
+  unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int invisible_error_frames_[kMaxInvisibleErrorFrames];
+  unsigned int droppable_frames_[kMaxDroppableFrames];
+  unsigned int error_resilient_frames_[kMaxErrorResilientFrames];
+  unsigned int nomfmv_frames_[kMaxNoMFMVFrames];
+  unsigned int prim_ref_none_frames_[kMaxPrimRefNoneFrames];
+  unsigned int s_frames_[kMaxSFrames];
+  libaom_test::TestMode encoding_mode_;
+  int allow_mismatch_;
+  int enable_altref_;
+};
+
+// Encodes the same 12-frame clip twice - once with no error-resilient frames
+// and once with five frames flagged error-resilient - and checks that both
+// runs exceed 25 dB average PSNR and stay within 10% of each other.
+TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+ SetupEncoder(2000, 10);
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 12);
+
+ // Global error resilient mode OFF.
+ cfg_.g_error_resilient = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_resilience_off = GetAveragePsnr();
+ EXPECT_GT(psnr_resilience_off, 25.0);
+
+ Reset();
+ // Error resilient mode ON for certain frames
+ unsigned int num_error_resilient_frames = 5;
+ unsigned int error_resilient_frame_list[] = { 3, 5, 6, 9, 11 };
+ SetErrorResilientFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_resilience_on = GetAveragePsnr();
+ EXPECT_GT(psnr_resilience_on, 25.0);
+
+ // Test that turning on error resilient mode hurts by 10% at most.
+ // (The ratio is also required not to exceed 1.1.)
+ if (psnr_resilience_off > 0.0) {
+ const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+ EXPECT_GE(psnr_ratio, 0.9);
+ EXPECT_LE(psnr_ratio, 1.1);
+ }
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
+TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+ // One parameter combination (one-pass good quality with auto alt-ref on) is
+ // skipped because of the bug referenced in the message.
+ if (GET_PARAM(1) == ::libaom_test::kOnePassGood && GET_PARAM(2) == 1) {
+ fprintf(stderr, "Skipping test case #1 because of bug aomedia:3002\n");
+ return;
+ }
+ SetupEncoder(500, 10);
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 20);
+
+ // Set an arbitrary set of error frames same as droppable frames.
+ unsigned int num_droppable_frames = 3;
+ unsigned int droppable_frame_list[] = { 5, 11, 13 };
+ SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+ SetErrorFrames(num_droppable_frames, droppable_frame_list);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Test that no mismatches have been found
+ // NOTE(review): mismatches are not allowed in this test (SetAllowMismatch is
+ // not called), so a mismatch is presumably reported by the base class
+ // MismatchHook rather than by an explicit check here - confirm.
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ // Exactly the dropped frames must be missing on the decoder side.
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_droppable_frames);
+}
+
+// Check for ParseAbility property of an error-resilient frame.
+// Encode a frame in error-resilient mode (E-frame), and disallow all
+// subsequent frames from using MFMV. If frames are dropped before the
+// E frame, all frames starting from the E frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, ParseAbilityTest) {
+ SetupEncoder(500, 10);
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 15);
+
+ // Mismatches are expected after the simulated loss, so count them instead of
+ // failing on the first one.
+ SetAllowMismatch(1);
+
+ // Note that an E-frame cannot be forced on a frame that is a
+ // show_existing_frame, or a frame that comes directly after an invisible
+ // frame. Currently, this will cause an assertion failure.
+ // Set an arbitrary error resilient (E) frame
+ unsigned int num_error_resilient_frames = 1;
+ unsigned int error_resilient_frame_list[] = { 8 };
+ SetErrorResilientFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ // Ensure that any invisible frames before the E frame are dropped
+ SetInvisibleErrorFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ // Set all frames after the error resilient frame to not allow MFMV
+ unsigned int num_post_error_resilient_frames = 6;
+ unsigned int post_error_resilient_frame_list[] = { 9, 10, 11, 12, 13, 14 };
+ SetNoMFMVFrames(num_post_error_resilient_frames,
+ post_error_resilient_frame_list);
+
+ // Set a few frames before the E frame that are lost (not decoded)
+ unsigned int num_error_frames = 5;
+ unsigned int error_frame_list[] = { 3, 4, 5, 6, 7 };
+ SetErrorFrames(num_error_frames, error_frame_list);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ // Exactly the lost frames must be missing on the decoder side.
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+ // All frames following the E-frame and the E-frame are expected to have
+ // mismatches, but still be parse-able.
+ EXPECT_LE(GetMismatchFrames(), num_post_error_resilient_frames + 1);
+}
+
+// Check for ParseAbility property of an S frame.
+// Encode an S-frame. If frames are dropped before the S-frame, all frames
+// starting from the S frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, SFrameTest) {
+ SetupEncoder(500, 10);
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 15);
+
+ // Mismatches are expected after the simulated loss, so count them instead of
+ // failing on the first one.
+ SetAllowMismatch(1);
+
+ // Note that an S-frame cannot be forced on a frame that is a
+ // show_existing_frame. This issue still needs to be addressed.
+ // Set an arbitrary S-frame
+ unsigned int num_s_frames = 1;
+ unsigned int s_frame_list[] = { 6 };
+ SetSFrames(num_s_frames, s_frame_list);
+ // Ensure that any invisible frames before the S frame are dropped
+ SetInvisibleErrorFrames(num_s_frames, s_frame_list);
+
+ // Set a few frames before the S frame that are lost (not decoded)
+ unsigned int num_error_frames = 4;
+ unsigned int error_frame_list[] = { 2, 3, 4, 5 };
+ SetErrorFrames(num_error_frames, error_frame_list);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ // Exactly the lost frames must be missing on the decoder side.
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+ // All frames following the S-frame and the S-frame are expected to have
+ // mismatches, but still be parse-able.
+ EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
+}
+
+// Second parameter (0, 1) is passed to AOME_SET_ENABLEAUTOALTREF by the
+// fixture.
+AV1_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES,
+ ::testing::Values(0, 1));
+} // namespace
diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc
new file mode 100644
index 0000000000..ce45394eb8
--- /dev/null
+++ b/third_party/aom/test/ethread_test.cc
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "av1/encoder/firstpass.h"
+
+namespace {
+const unsigned int kCqLevel = 18;
+
+#if !CONFIG_REALTIME_ONLY
+const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS);
+class AVxFirstPassEncoderThreadTest  // Fixture that accumulates first pass stats packets for later comparison.
+ : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+ int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AVxFirstPassEncoderThreadTest()
+ : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+ encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+ tile_rows_(GET_PARAM(3)), tile_cols_(GET_PARAM(4)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ row_mt_ = 1;  // Default; FirstPassStatsTest overrides it per run.
+ firstpass_stats_.buf = nullptr;
+ firstpass_stats_.sz = 0;
+ }
+ ~AVxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); }
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 35;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ void BeginPassHook(unsigned int /*pass*/) override {
+ encoder_initialized_ = false;  // Forces the controls to be re-applied in PreEncodeFrameHook().
+ abort_ = false;
+ }
+
+ void EndPassHook() override {
+ // For first pass stats test, only run first pass encoder.
+ if (cfg_.g_pass == AOM_RC_FIRST_PASS) abort_ = true;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
+ if (!encoder_initialized_) {
+ // Encode in 2-pass mode. The controls below are applied once per pass.
+ SetTileSize(encoder);
+ encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+
+ encoder_initialized_ = true;
+ }
+ }
+
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+
+ void StatsPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ const uint8_t *const pkt_buf =
+ reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
+ const size_t pkt_size = pkt->data.twopass_stats.sz;
+
+ // First pass stats size equals sizeof(FIRSTPASS_STATS)
+ EXPECT_EQ(pkt_size, kFirstPassStatsSz)
+ << "Error: First pass stats size doesn't equal kFirstPassStatsSz";
+
+ firstpass_stats_.buf =
+ realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+ ASSERT_NE(firstpass_stats_.buf, nullptr);
+ memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
+ pkt_size);
+ firstpass_stats_.sz += pkt_size;  // Packets from successive frames and runs are stored back to back.
+ }
+
+ bool encoder_initialized_;  // Cleared in BeginPassHook(); set once the controls are applied.
+ ::libaom_test::TestMode encoding_mode_;  // GET_PARAM(1).
+ int set_cpu_used_;  // GET_PARAM(2); passed to AOME_SET_CPUUSED.
+ int tile_rows_;  // GET_PARAM(3); passed to AV1E_SET_TILE_ROWS.
+ int tile_cols_;  // GET_PARAM(4); passed to AV1E_SET_TILE_COLUMNS.
+ int row_mt_;  // Passed to AV1E_SET_ROW_MT.
+ aom_fixed_buf_t firstpass_stats_;  // Growing buffer of all stats packets; freed in the destructor.
+};
+
+static void compare_fp_stats_md5(aom_fixed_buf_t *fp_stats) {
+ // |fp_stats| holds two equally sized sets of first pass stats stored back to
+ // back. Hash each half on its own and require the two digests to agree.
+ const size_t half_sz = fp_stats->sz / 2;
+ uint8_t *const first_half = reinterpret_cast<uint8_t *>(fp_stats->buf);
+ uint8_t *const second_half = first_half + half_sz;
+
+ ::libaom_test::MD5 first_md5;
+ first_md5.Add(first_half, half_sz);
+ const char *const first_digest = first_md5.Get();
+ ::libaom_test::MD5 second_md5;
+ second_md5.Add(second_half, half_sz);
+ const char *const second_digest = second_md5.Get();
+
+ // Check md5 match.
+ ASSERT_STREQ(first_digest, second_digest) << "MD5 checksums don't match";
+}
+
+TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ aom_fixed_buf_t firstpass_stats;  // Two-run window into firstpass_stats_ given to compare_fp_stats_md5().
+ size_t single_run_sz;
+
+ cfg_.rc_target_bitrate = 1000;
+
+ // 5 encodes will be run:
+ // 1. row_mt_=0 and threads=1
+ // 2. row_mt_=1 and threads=1
+ // 3. row_mt_=1 and threads=2
+ // 4. row_mt_=1 and threads=4
+ // 5. row_mt_=1 and threads=8
+
+ // 4 comparisons will be made:
+ // 1. Between run 1 and run 2.
+ // 2. Between run 2 and run 3.
+ // 3. Between run 3 and run 4.
+ // 4. Between run 4 and run 5.
+
+ // Test row_mt_: 0 vs 1 at single thread case(threads = 1)
+ cfg_.g_threads = 1;
+
+ row_mt_ = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ row_mt_ = 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ firstpass_stats.buf = firstpass_stats_.buf;
+ firstpass_stats.sz = firstpass_stats_.sz;  // Stays at two runs' worth; only .buf is advanced below.
+ single_run_sz = firstpass_stats_.sz / 2;
+
+ // Compare to check if using or not using row-mt are bit exact.
+ // Comparison 1 (between row_mt_=0 and row_mt_=1).
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+ // Test single thread vs multiple threads
+ row_mt_ = 1;
+
+ cfg_.g_threads = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Re-read firstpass_stats_.buf and skip run 1: window = 2nd and 3rd runs.
+ firstpass_stats.buf = reinterpret_cast<void *>(
+ reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz);
+
+ // Compare to check if single-thread and multi-thread stats are bit exact.
+ // Comparison 2 (between threads=1 and threads=2).
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Skip runs 1-2: window = 3rd and 4th runs.
+ firstpass_stats.buf = reinterpret_cast<void *>(
+ reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 2);
+
+ // Comparison 3 (between threads=2 and threads=4).
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+
+ cfg_.g_threads = 8;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Skip runs 1-3: window = 4th and 5th runs.
+ firstpass_stats.buf = reinterpret_cast<void *>(
+ reinterpret_cast<uint8_t *>(firstpass_stats_.buf) + single_run_sz * 3);
+
+ // Comparison 4 (between threads=4 and threads=8).
+ ASSERT_NO_FATAL_FAILURE(compare_fp_stats_md5(&firstpass_stats));
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+class AVxEncoderThreadTest  // Fixture comparing per-frame encode/decode results across thread counts.
+ : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+ int, int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AVxEncoderThreadTest()
+ : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+ encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+ tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)),
+ row_mt_(GET_PARAM(5)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 1280;
+ cfg.h = 720;
+ cfg.allow_lowbitdepth = 1;
+ decoder_ = codec_->CreateDecoder(cfg, 0);  // Owned by the fixture; deleted in the destructor.
+ if (decoder_->IsAV1()) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+
+ size_enc_.clear();
+ md5_dec_.clear();
+ md5_enc_.clear();
+ }
+ ~AVxEncoderThreadTest() override { delete decoder_; }
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+
+ if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+ encoding_mode_ == ::libaom_test::kTwoPassGood) {
+ cfg_.g_lag_in_frames = 6;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ } else if (encoding_mode_ == ::libaom_test::kRealTime) {
+ cfg_.g_error_resilient = 1;
+ }
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ void BeginPassHook(unsigned int /*pass*/) override {
+ encoder_initialized_ = false;  // Forces the controls to be re-applied in PreEncodeFrameHook().
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
+ if (!encoder_initialized_) {
+ SetTileSize(encoder);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+ if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+ encoding_mode_ == ::libaom_test::kTwoPassGood) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+ encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4);
+ // In row_mt_=0 case, the output of single thread (1 thread) will be
+ // compared with multi thread (4 thread) output (see DoTest() below).
+ // Currently, Loop restoration stage is conditionally disabled for speed
+ // 5, 6 when num_workers > 1. Due to this, the match between single
+ // thread and multi thread output can not be achieved. Hence, testing
+ // this case alone with LR disabled.
+ // TODO(aomedia:3446): Remove the constraint on this test case once Loop
+ // restoration state is same in both single and multi thread path.
+ if (set_cpu_used_ >= 5 && row_mt_ == 0)
+ encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+ } else if (encoding_mode_ == ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ encoder->Control(AV1E_SET_AQ_MODE, 3);
+ encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+ encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+ encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 3);
+ encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 3);
+ } else {
+ encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+ }
+ encoder_initialized_ = true;
+ }
+ }
+
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ size_enc_.push_back(pkt->data.frame.sz);
+
+ ::libaom_test::MD5 md5_enc;
+ md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ md5_enc_.push_back(md5_enc.Get());
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ md5_dec_.push_back(md5_res.Get());
+ }
+ }
+
+ void DoTest() {
+ ::libaom_test::YUVVideoSource video(
+ "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 26);
+ cfg_.rc_target_bitrate = 1000;
+
+ if (row_mt_ == 0) {
+ // Encode using single thread.
+ cfg_.g_threads = 1;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> single_thr_size_enc;
+ std::vector<std::string> single_thr_md5_enc;
+ std::vector<std::string> single_thr_md5_dec;
+ single_thr_size_enc = size_enc_;
+ single_thr_md5_enc = md5_enc_;
+ single_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Encode using multiple threads.
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr_size_enc;
+ std::vector<std::string> multi_thr_md5_enc;
+ std::vector<std::string> multi_thr_md5_dec;
+ multi_thr_size_enc = size_enc_;
+ multi_thr_md5_enc = md5_enc_;
+ multi_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+ ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+ ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+
+ DoTestMaxThreads(&video, single_thr_size_enc, single_thr_md5_enc,
+ single_thr_md5_dec);
+ } else if (row_mt_ == 1) {
+ // Encode using multiple threads row-mt enabled.
+ cfg_.g_threads = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr2_row_mt_size_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_dec;
+ multi_thr2_row_mt_size_enc = size_enc_;
+ multi_thr2_row_mt_md5_enc = md5_enc_;
+ multi_thr2_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Disable threads=3 test for now to reduce the time so that the nightly
+ // test would not time out.
+ // cfg_.g_threads = 3;
+ // ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // std::vector<size_t> multi_thr3_row_mt_size_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_dec;
+ // multi_thr3_row_mt_size_enc = size_enc_;
+ // multi_thr3_row_mt_md5_enc = md5_enc_;
+ // multi_thr3_row_mt_md5_dec = md5_dec_;
+ // size_enc_.clear();
+ // md5_enc_.clear();
+ // md5_dec_.clear();
+ // Check that the vectors are equal.
+ // ASSERT_EQ(multi_thr3_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr4_row_mt_size_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_dec;
+ multi_thr4_row_mt_size_enc = size_enc_;
+ multi_thr4_row_mt_md5_enc = md5_enc_;
+ multi_thr4_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+ DoTestMaxThreads(&video, multi_thr2_row_mt_size_enc,
+ multi_thr2_row_mt_md5_enc, multi_thr2_row_mt_md5_dec);
+ }
+ }
+
+ virtual void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+ const std::vector<size_t> ref_size_enc,
+ const std::vector<std::string> ref_md5_enc,
+ const std::vector<std::string> ref_md5_dec) {
+ // This value should be kept the same as MAX_NUM_THREADS
+ // in aom_thread.h
+ cfg_.g_threads = 64;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ std::vector<size_t> multi_thr_max_row_mt_size_enc;
+ std::vector<std::string> multi_thr_max_row_mt_md5_enc;
+ std::vector<std::string> multi_thr_max_row_mt_md5_dec;
+ multi_thr_max_row_mt_size_enc = size_enc_;
+ multi_thr_max_row_mt_md5_enc = md5_enc_;
+ multi_thr_max_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(ref_size_enc, multi_thr_max_row_mt_size_enc);
+ ASSERT_EQ(ref_md5_enc, multi_thr_max_row_mt_md5_enc);
+ ASSERT_EQ(ref_md5_dec, multi_thr_max_row_mt_md5_dec);
+ }
+
+ bool encoder_initialized_;  // Cleared in BeginPassHook(); set once the controls are applied.
+ ::libaom_test::TestMode encoding_mode_;  // GET_PARAM(1).
+ int set_cpu_used_;  // GET_PARAM(2); passed to AOME_SET_CPUUSED.
+ int tile_cols_;  // GET_PARAM(3); passed to AV1E_SET_TILE_COLUMNS.
+ int tile_rows_;  // GET_PARAM(4); passed to AV1E_SET_TILE_ROWS.
+ int row_mt_;  // GET_PARAM(5); passed to AV1E_SET_ROW_MT and selects the DoTest() branch.
+ ::libaom_test::Decoder *decoder_;
+ std::vector<size_t> size_enc_;  // Size of each compressed frame packet of the current run.
+ std::vector<std::string> md5_enc_;  // MD5 of each compressed frame packet of the current run.
+ std::vector<std::string> md5_dec_;  // MD5 of each decoded image of the current run.
+};
+
+class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {};  // Same fixture, instantiated for real-time mode below.
+
+TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);  // Decoder tile mode set to the same value as the encoder config.
+ DoTest();
+}
+
+// For real time mode, test speed 5, 6, 7, 8, 9, 10.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(5, 6, 7, 8, 9, 10),  // cpu_used
+ ::testing::Values(0, 2), ::testing::Values(0, 2),  // tile_cols, tile_rows
+ ::testing::Values(0, 1));  // row_mt
+
+#if !CONFIG_REALTIME_ONLY
+
+// The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the
+// Valgrind long tests. Exclude it; the smaller tests are still run.
+#if !AOM_VALGRIND_BUILD
+class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};  // Same fixture with a larger parameter grid.
+
+TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);  // Decoder tile mode set to the same value as the encoder config.
+ DoTest();
+}
+
+// Test cpu_used 0, 1, 3 and 5.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Values(0, 1, 3, 5),  // cpu_used
+ ::testing::Values(1, 6), ::testing::Values(1, 6),  // tile_cols, tile_rows
+ ::testing::Values(0, 1));  // row_mt
+#endif // !AOM_VALGRIND_BUILD
+
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);  // Decoder tile mode set to the same value as the encoder config.
+ DoTest();  // Results must be identical across the thread counts exercised by DoTest().
+}
+
+class AVxEncoderThreadAllIntraTest : public AVxEncoderThreadTest {};  // Same fixture, instantiated with kAllIntra below.
+
+TEST_P(AVxEncoderThreadAllIntraTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);  // Decoder tile mode set to the same value as the encoder config.
+ DoTest();
+}
+
+class AVxEncoderThreadAllIntraTestLarge : public AVxEncoderThreadTest {};  // kAllIntra with a larger parameter grid.
+
+TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);  // Decoder tile mode set to the same value as the encoder config.
+ DoTest();
+}
+
+// First pass stats test: cpu_used {0, 2, 4}, tile_rows {0, 1}, tile_cols {1, 2}.
+AV1_INSTANTIATE_TEST_SUITE(AVxFirstPassEncoderThreadTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(0, 6, 2), ::testing::Range(0, 2),  // cpu_used, tile_rows
+ ::testing::Range(1, 3));  // tile_cols
+
+// For AV1 two-pass good mode. Speeds 0, 1, 3 and 5 are covered by
+// AVxEncoderThreadTestLarge; only cpu_used 2 is tested here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(2), ::testing::Values(0, 2),  // cpu_used, tile_cols
+ ::testing::Values(0, 2), ::testing::Values(0, 1));  // tile_rows, row_mt
+
+// For all intra mode. Speeds 0, 2, 4 and 8 are covered by
+// AVxEncoderThreadAllIntraTestLarge; only cpu_used 6 is tested here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(6), ::testing::Values(0, 2),  // cpu_used, tile_cols
+ ::testing::Values(0, 2), ::testing::Values(0, 1));  // tile_rows, row_mt
+
+// Test cpu_used 0, 2, 4 and 8.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(0, 2, 4, 8),  // cpu_used
+ ::testing::Values(1, 6), ::testing::Values(1, 6),  // tile_cols, tile_rows
+ ::testing::Values(0, 1));  // row_mt
+#endif // !CONFIG_REALTIME_ONLY
+
+class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {  // Variant used by the large-scale-tile tests.
+ void SetTileSize(libaom_test::Encoder *encoder) override {  // Issues the same two controls as the base class.
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+
+ void DoTestMaxThreads(::libaom_test::YUVVideoSource *video,
+ const std::vector<size_t> ref_size_enc,
+ const std::vector<std::string> ref_md5_enc,
+ const std::vector<std::string> ref_md5_dec) override {
+ (void)video;  // Deliberately a no-op: the max-thread comparison is skipped for this fixture.
+ (void)ref_size_enc;
+ (void)ref_md5_enc;
+ (void)ref_md5_dec;
+ }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AVxEncoderThreadLSTest);
+
+TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 1;  // Encode with large-scale tile enabled.
+ decoder_->Control(AV1_SET_TILE_MODE, 1);  // Decoder tile mode set to the same value as the encoder config.
+ decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+// AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of
+// the Valgrind long tests. Since we already run AVxEncoderThreadLSTest,
+// skip this one for Valgrind.
+#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
+class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};  // Instantiated variant of the LS fixture.
+
+TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
+ cfg_.large_scale_tile = 1;  // Encode with large-scale tile enabled.
+ decoder_->Control(AV1_SET_TILE_MODE, 1);  // Decoder tile mode set to the same value as the encoder config.
+ decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Values(1, 3), ::testing::Values(0, 6),  // cpu_used, tile_cols
+ ::testing::Values(0, 6), ::testing::Values(1));  // tile_rows, row_mt (row-mt on only)
+#endif // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
+} // namespace
diff --git a/third_party/aom/test/examples.sh b/third_party/aom/test/examples.sh
new file mode 100755
index 0000000000..3e1612303c
--- /dev/null
+++ b/third_party/aom/test/examples.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file runs all of the tests for the libaom examples.
+##
+readonly EXEC_DIR="$(pwd)"  # Working directory at start; restored after every sourced test.
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls -r $(dirname $0)/*.sh)  # Every .sh file next to this script, in reverse name order.
+
+# List of script names to exclude.
+exclude_list="best_encode examples run_encodes tools_common"
+
+if [ "$(realtime_only_build)" = "yes" ]; then
+ exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test"
+fi
+
+# Filter out the scripts in $exclude_list (filter_strings is presumably provided by tools_common.sh).
+for word in ${exclude_list}; do
+ example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do  # Relies on word splitting: script paths must not contain whitespace.
+ # Source each test script so that exporting variables can be avoided.
+ AOM_TEST_NAME="$(basename ${test%.*})"
+ . "${test}"
+ # Restore the working directory to the one at the beginning of execution.
+ # This avoids side-effects from tests that change the directory.
+ cd "${EXEC_DIR}"
+done
diff --git a/third_party/aom/test/external_frame_buffer_test.cc b/third_party/aom/test/external_frame_buffer_test.cc
new file mode 100644
index 0000000000..8f16c4e2d5
--- /dev/null
+++ b/third_party/aom/test/external_frame_buffer_test.cc
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <memory>
+#include <string>
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {  // One application-owned frame buffer handed to libaom.
+ uint8_t *data;  // Backing storage allocated with new[]; may be nullptr.
+ size_t size;  // Size in bytes recorded for |data|.
+ int in_use;  // 1 while the buffer is handed out to the decoder, 0 when free.
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+ ExternalFrameBufferList()
+ : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {}
+
+ virtual ~ExternalFrameBufferList() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete[] ext_fb_list_[i].data;
+ }
+ delete[] ext_fb_list_;
+ }
+
+ // Creates the list to hold the external buffers. Returns true on success.
+ bool CreateBufferList(int num_buffers) {
+ if (num_buffers < 0) return false;
+
+ num_buffers_ = num_buffers;
+ ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+ if (ext_fb_list_ == nullptr) {  // NOTE: plain new[] throws on failure, so this branch is not expected to run.
+ EXPECT_NE(ext_fb_list_, nullptr);
+ return false;
+ }
+ memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+ return true;
+ }
+
+ // Searches the frame buffer list for a free frame buffer. Makes sure
+ // that the frame buffer is at least |min_size| in bytes. Marks that the
+ // frame buffer is in use by libaom. Finally sets |fb| to point to the
+ // external frame buffer. Returns < 0 on an error.
+ int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_NE(fb, nullptr);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = new uint8_t[min_size];
+ if (ext_fb_list_[idx].data == nullptr) {
+ EXPECT_NE(ext_fb_list_[idx].data, nullptr);
+ }
+ memset(ext_fb_list_[idx].data, 0, min_size);
+ ext_fb_list_[idx].size = min_size;
+ }
+
+ SetFrameBuffer(idx, fb);
+
+ num_used_buffers_++;
+ return 0;
+ }
+
+ // Test function that will not allocate any data for the frame buffer.
+ // Returns < 0 on an error.
+ int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_NE(fb, nullptr);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = nullptr;
+ ext_fb_list_[idx].size = min_size;  // Size is recorded even though no data is allocated.
+ }
+
+ SetFrameBuffer(idx, fb);  // Unlike GetFreeFrameBuffer(), num_used_buffers_ is not incremented here.
+ return 0;
+ }
+
+ // Marks the external frame buffer that |fb| is pointing to as free.
+ // Returns < 0 on an error.
+ int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
+ if (fb == nullptr) {
+ EXPECT_NE(fb, nullptr);
+ return -1;
+ }
+ ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
+ if (ext_fb == nullptr) {
+ EXPECT_NE(ext_fb, nullptr);
+ return -1;
+ }
+ EXPECT_EQ(1, ext_fb->in_use);
+ ext_fb->in_use = 0;
+ num_used_buffers_--;
+ return 0;
+ }
+
+ // Checks that the aom_image_t data is contained within the external frame
+ // buffer private data passed back in the aom_image_t.
+ void CheckImageFrameBuffer(const aom_image_t *img) {
+ const struct ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+ ASSERT_NE(ext_fb, nullptr);  // Fail cleanly instead of dereferencing a null fb_priv.
+ ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+ img->planes[0] < (ext_fb->data + ext_fb->size));
+ }
+
+ int num_used_buffers() const { return num_used_buffers_; }
+
+ private:
+ // Returns the index of the first free frame buffer. Returns |num_buffers_|
+ // if there are no free frame buffers.
+ int FindFreeBufferIndex() {
+ int i;
+ // Find a free frame buffer.
+ for (i = 0; i < num_buffers_; ++i) {
+ if (!ext_fb_list_[i].in_use) break;
+ }
+ return i;
+ }
+
+ // Sets |fb| to an external frame buffer. idx is the index into the frame
+ // buffer list.
+ void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
+ ASSERT_NE(fb, nullptr);
+ fb->data = ext_fb_list_[idx].data;
+ fb->size = ext_fb_list_[idx].size;
+ ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+ ext_fb_list_[idx].in_use = 1;
+ fb->priv = &ext_fb_list_[idx];  // Lets ReturnFrameBuffer()/CheckImageFrameBuffer() find this entry again.
+ }
+
+ int num_buffers_;  // Number of entries in |ext_fb_list_|.
+ int num_used_buffers_;  // Incremented by GetFreeFrameBuffer(), decremented by ReturnFrameBuffer().
+ ExternalFrameBuffer *ext_fb_list_;  // Array owned by this object; freed in the destructor.
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libaom to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_aom_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ // |user_priv| carries the ExternalFrameBufferList that serves the request.
+ auto *const buffers = static_cast<ExternalFrameBufferList *>(user_priv);
+ return buffers->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libaom to tell the application that |fb| is not needed
+// anymore.
+int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
+ // Hand |fb| back to the ExternalFrameBufferList carried in |user_priv|.
+ auto *const buffers = static_cast<ExternalFrameBufferList *>(user_priv);
+ return buffers->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_aom_zero_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ // Delegates to the list's variant that never allocates buffer data.
+ auto *const buffers = static_cast<ExternalFrameBufferList *>(user_priv);
+ return buffers->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_aom_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ // Deliberately under-allocate: ask the list for |min_size| - 1 bytes.
+ auto *const buffers = static_cast<ExternalFrameBufferList *>(user_priv);
+ return buffers->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_aom_frame_buffer(void * /*user_priv*/,
+ aom_codec_frame_buffer_t * /*fb*/) {
+ // Intentionally a no-op: report success without handing the buffer back
+ // to any list, so both arguments are ignored.
+ return 0;
+}
+
+#endif // CONFIG_WEBM_IO
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferMD5Test
+ : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<const char *> {
+ protected:
+ ExternalFrameBufferMD5Test()
+ : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
+ md5_file_(nullptr), num_buffers_(0) {}
+
+ ~ExternalFrameBufferMD5Test() override {
+ if (md5_file_ != nullptr) fclose(md5_file_);
+ }
+
+ void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) override {
+ if (num_buffers_ > 0 && video.frame_number() == 0) {
+ // Have libaom use frame buffers we create.
+ ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ ASSERT_EQ(AOM_CODEC_OK,
+ decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
+ ReleaseAV1FrameBuffer, this));
+ }
+ }
+
+ void OpenMD5File(const std::string &md5_file_name_) {
+ md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+ ASSERT_NE(md5_file_, nullptr)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
+ }
+
+ void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) override {
+ ASSERT_NE(md5_file_, nullptr);
+ char expected_md5[33];
+ char junk[128];
+
+ // Read correct md5 checksums. Field widths keep fscanf inside both buffers.
+ const int res = fscanf(md5_file_, "%32s %127s", expected_md5, junk);
+ ASSERT_NE(EOF, res) << "Read md5 data failed";
+ expected_md5[32] = '\0';
+
+ ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING
+ const aom_img_fmt_t shifted_fmt =
+ (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+ if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+ aom_image_t *img_shifted =
+ aom_img_alloc(nullptr, shifted_fmt, img.d_w, img.d_h, 16);
+ img_shifted->bit_depth = img.bit_depth;
+ img_shifted->monochrome = img.monochrome;
+ aom_img_downshift(img_shifted, &img, 0);
+ md5_res.Add(img_shifted);
+ aom_img_free(img_shifted);
+ } else {
+#endif
+ md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+ }
+#endif
+ const char *const actual_md5 = md5_res.Get();
+
+ // Check md5 match.
+ ASSERT_STREQ(expected_md5, actual_md5)
+ << "Md5 checksums don't match: frame number = " << frame_number;
+
+ const struct ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(img.fb_priv);
+ ASSERT_NE(ext_fb, nullptr);  // Fail cleanly instead of dereferencing a null fb_priv.
+ ASSERT_TRUE(img.planes[0] >= ext_fb->data &&
+ img.planes[0] < (ext_fb->data + ext_fb->size));
+ }
+
+ // Callback to get a free external frame buffer. Return value < 0 is an
+ // error.
+ static int GetAV1FrameBuffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+ }
+
+ // Callback to release an external frame buffer. Return value < 0 is an
+ // error.
+ static int ReleaseAV1FrameBuffer(void *user_priv,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.ReturnFrameBuffer(fb);
+ }
+
+ void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+ int num_buffers() const { return num_buffers_; }
+
+ private:
+ FILE *md5_file_;  // Reference checksums, one entry consumed per decoded frame; closed in the destructor.
+ int num_buffers_;  // External buffers are installed only when this is > 0.
+ ExternalFrameBufferList fb_list_;  // Buffers served through the two static callbacks above.
+};
+
+#if CONFIG_WEBM_IO
+const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
+const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+ ExternalFrameBufferTest()
+ : video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
+
+ void SetUp() override {
+ video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
+ ASSERT_NE(video_, nullptr);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_NE(decoder_, nullptr);
+ }
+
+ void TearDown() override {  // Safe to call more than once: both pointers are reset to nullptr.
+ delete decoder_;
+ decoder_ = nullptr;
+ delete video_;
+ video_ = nullptr;
+ }
+
+ // Passes the external frame buffer information to libaom.
+ aom_codec_err_t SetFrameBufferFunctions(
+ int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release) {
+ if (num_buffers > 0) {
+ num_buffers_ = num_buffers;
+ EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ }
+
+ return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);  // &fb_list_ becomes the callbacks' user_priv.
+ }
+
+ aom_codec_err_t DecodeOneFrame() {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ CheckDecodedFrames();
+ if (res == AOM_CODEC_OK) video_->Next();  // Advance only after a successful decode.
+ return res;
+ }
+
+ aom_codec_err_t DecodeRemainingFrames() {
+ for (; video_->cxdata() != nullptr; video_->Next()) {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ if (res != AOM_CODEC_OK) return res;  // Stop at the first decode error and report it.
+ CheckDecodedFrames();
+ }
+ return AOM_CODEC_OK;
+ }
+
+ protected:
+ void CheckDecodedFrames() {
+ libaom_test::DxDataIterator dec_iter = decoder_->GetDxData();
+ const aom_image_t *img = nullptr;
+
+ // Get decompressed data and check each image against its external buffer.
+ while ((img = dec_iter.Next()) != nullptr) {
+ fb_list_.CheckImageFrameBuffer(img);
+ }
+ }
+
+ libaom_test::CompressedVideoSource *video_;  // Created in SetUp(), deleted in TearDown().
+ libaom_test::AV1Decoder *decoder_;  // Created in SetUp(), deleted in TearDown().
+ int num_buffers_;
+ ExternalFrameBufferList fb_list_;
+};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+ void SetUp() override {  // same steps as the base SetUp(), but reads kAV1NonRefTestFile via IVFVideoSource
+ video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
+ ASSERT_NE(video_, nullptr);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_NE(decoder_, nullptr);
+ }
+
+ virtual void CheckFrameBufferRelease() {
+ TearDown();  // deletes decoder_ and video_ before the count is checked
+ ASSERT_EQ(0, fb_list_.num_used_buffers());  // no buffer may still be marked as in use
+ }
+};
+#endif // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors, and decodes them.
+// Libaom will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If the md5 checksums match the reference md5 data, the test passes;
+// otherwise the test fails.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+ const int jitter_buffers = 4;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ set_num_buffers(num_buffers);
+
+ // Open compressed video file.
+ std::unique_ptr<libaom_test::CompressedVideoSource> video;
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {  // NOTE(review): assumes filename has at least 3 characters
+ video.reset(new libaom_test::IVFVideoSource(filename));
+ } else {
+#if CONFIG_WEBM_IO
+ video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+ fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+ filename.c_str());
+ return;  // non-IVF vectors cannot be read without WebM support
+#endif
+ }
+ ASSERT_NE(video, nullptr);
+ video->Init();
+
+ // Construct md5 file name.
+ const std::string md5_filename = filename + ".md5";
+ OpenMD5File(md5_filename);
+
+ // Set decode config.
+ cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+ set_cfg(cfg);
+
+ // Decode frame, and check the md5 matching.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS.
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+ const int jitter_buffers = 8;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS. Most files will
+ // only use 5 frame buffers at one time.
+ const int num_buffers = 2;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ // Only run this on long clips. Decoding a very short clip will return
+ // AOM_CODEC_OK even with only 2 buffers.
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NoRelease) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ do_not_release_aom_frame_buffer));  // callback defined outside this chunk; presumably never frees a buffer — confirm
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());  // the first frame is still expected to decode
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());  // a later frame is expected to fail with a memory error
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_zero_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(
+ num_buffers, get_aom_one_less_byte_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(
+ AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, nullptr, release_aom_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(
+ AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, nullptr));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());  // decode once before any callbacks are registered
+ ASSERT_EQ(AOM_CODEC_ERROR,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));  // registering callbacks after a decode is expected to be refused
+}
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());  // decode the whole clip first
+ CheckFrameBufferRelease();  // then tear down and require zero buffers still in use
+}
+#endif // CONFIG_WEBM_IO
+
+AV1_INSTANTIATE_TEST_SUITE(
+ ExternalFrameBufferMD5Test,
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors));
+} // namespace
diff --git a/third_party/aom/test/fdct4x4_test.cc b/third_party/aom/test/fdct4x4_test.cc
new file mode 100644
index 0000000000..9cbf208adb
--- /dev/null
+++ b/third_party/aom/test/fdct4x4_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+template <typename OutputType>
+using FdctFunc = void (*)(const int16_t *in, OutputType *out, int stride);
+
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+ TxfmParam *txfm_param);
+
+template <typename OutputType>
+using Fdct4x4Param =
+ std::tuple<FdctFunc<OutputType>, FhtFunc<OutputType>, aom_bit_depth_t, int>;
+
+#if HAVE_NEON || HAVE_SSE2
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+ TxfmParam * /*txfm_param*/) {  // FhtFunc-shaped wrapper; the TxfmParam argument is ignored
+ aom_fdct4x4_c(in, out, stride);  // C implementation used as the reference
+}
+
+void fdct4x4_lp_ref(const int16_t *in, int16_t *out, int stride,
+ TxfmParam * /*txfm_param*/) {  // FhtFunc-shaped wrapper; the TxfmParam argument is ignored
+ aom_fdct4x4_lp_c(in, out, stride);  // C implementation with int16_t output, used as the reference
+}
+#endif
+
+template <typename OutputType>
+class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
+ public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
+ public:
+ ~Trans4x4FDCT() override = default;
+
+ using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
+ void SetUp() override {  // unpacks the Fdct4x4Param tuple into the base-class fields
+ fwd_txfm_ = std::get<0>(this->GetParam());  // function under test
+ TxfmBaseOutType::pitch_ = 4;
+ TxfmBaseOutType::height_ = 4;
+ TxfmBaseOutType::fwd_txfm_ref = std::get<1>(this->GetParam());  // reference function
+ TxfmBaseOutType::bit_depth_ = std::get<2>(this->GetParam());
+ TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;  // low bit_depth_ bits set
+ TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
+ }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) override {
+ fwd_txfm_(in, out, stride);
+ }
+
+ void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) override {  // intentionally a no-op
+ (void)out;
+ (void)dst;
+ (void)stride;
+ }
+
+ FdctFunc<OutputType> fwd_txfm_;
+};
+
+using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTTranLow);
+TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
+
+using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Trans4x4FDCTInt16);
+TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
+
+using std::make_tuple;
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTTranLow,
+ ::testing::Values(make_tuple(&aom_fdct4x4_neon,
+ &fdct4x4_ref, AOM_BITS_8,
+ 16)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTInt16,
+ ::testing::Values(make_tuple(&aom_fdct4x4_lp_neon,
+ &fdct4x4_lp_ref,
+ AOM_BITS_8, 16)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTTranLow,
+ ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
+ &fdct4x4_ref, AOM_BITS_8,
+ 16)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTInt16,
+ ::testing::Values(make_tuple(&aom_fdct4x4_lp_sse2,
+ &fdct4x4_lp_ref,
+ AOM_BITS_8, 16)));
+#endif
+} // namespace
diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc
new file mode 100644
index 0000000000..06a17a3f8f
--- /dev/null
+++ b/third_party/aom/test/fft_test.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <ostream>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple recursive radix-2 1D FFT; n is halved until it reaches 1.
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+ if (n == 1) {  // base case: a single sample is its own transform
+ result[0] = data[0];
+ return;
+ }
+ std::vector<InputType> temp(n);
+ for (int k = 0; k < n / 2; ++k) {  // even-indexed samples to the front half, odd-indexed to the back
+ temp[k] = data[2 * k];
+ temp[n / 2 + k] = data[2 * k + 1];
+ }
+ fft(&temp[0], result, n / 2);
+ fft(&temp[n / 2], result + n / 2, n / 2);
+ for (int k = 0; k < n / 2; ++k) {  // combine the two half-size transforms
+ std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+ (float)-sin(2. * PI * k / n));  // w = exp(-2*pi*i*k/n)
+ std::complex<float> a = result[k];
+ std::complex<float> b = result[n / 2 + k];
+ result[k] = a + w * b;
+ result[n / 2 + k] = a - w * b;
+ }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {  // in-place transpose of an n x n row-major matrix
+ for (int y = 0; y < n; ++y) {
+ for (int x = y + 1; x < n; ++x) {  // upper triangle only, so each pair is swapped once
+ std::swap((*data)[y * n + x], (*data)[x * n + y]);
+ }
+ }
+}
+
+// Simple 2D FFT implementation: 1D FFT of each row, transpose, repeat, transpose back.
+template <class InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+ std::vector<std::complex<float> > rowfft(n * n);
+ std::vector<std::complex<float> > result(n * n);
+ for (int y = 0; y < n; ++y) {  // pass 1: transform every input row
+ fft(input + y * n, &rowfft[y * n], n);
+ }
+ transpose(&rowfft, n);  // columns become rows for the second pass
+ for (int y = 0; y < n; ++y) {  // pass 2: transform what were the columns
+ fft(&rowfft[y * n], &result[y * n], n);
+ }
+ transpose(&result, n);  // restore the original orientation
+ return result;
+}
+
+struct FFTTestArg {
+ int n;
+ void (*fft)(const float *input, float *temp, float *output);
+ FFTTestArg(int n_in, tform_fun_t fft_in) : n(n_in), fft(fft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
+ return os << "fft_arg { n:" << test_arg.n
+ << " fft:" << reinterpret_cast<const void *>(test_arg.fft) << " }";
+}
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+ void SetUp() override {  // allocates 32-byte-aligned, zero-filled buffers for an n x n transform
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);  // two floats per input sample
+ ASSERT_NE(input_, nullptr);
+ ASSERT_NE(temp_, nullptr);
+ ASSERT_NE(output_, nullptr);
+ memset(input_, 0, sizeof(*input_) * n * n);
+ memset(temp_, 0, sizeof(*temp_) * n * n);
+ memset(output_, 0, sizeof(*output_) * n * n * 2);
+ }
+ void TearDown() override {  // releases the buffers allocated in SetUp()
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
+TEST_P(FFT2DTest, Correct) {
+ int n = GetParam().n;
+ for (int i = 0; i < n * n; ++i) {  // one unit impulse at a time; the rest of input_ stays zero
+ input_[i] = 1;
+ std::vector<std::complex<float> > expected = fft2d<float>(&input_[0], n);
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < (n / 2) + 1; ++x) {  // only columns 0..n/2 are compared
+ EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);  // output_ is read as interleaved (re, im)
+ EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+ 1e-5);
+ }
+ }
+ input_[i] = 0;  // clear the impulse before the next position
+ }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+ int n = GetParam().n;
+ float sum = 0;
+ const int num_trials = 1000 * (64 - n);  // fewer iterations for larger n
+ for (int i = 0; i < num_trials; ++i) {
+ input_[i % (n * n)] = 1;  // unit impulse at a rotating position
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];  // accumulating a result keeps the call observable
+ input_[i % (n * n)] = 0;
+ }
+ EXPECT_NEAR(sum, num_trials, 1e-3);  // each trial is expected to contribute output_[0] == 1
+}
+
+INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
+ ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+ FFTTestArg(4, aom_fft4x4_float_c),
+ FFTTestArg(8, aom_fft8x8_float_c),
+ FFTTestArg(16, aom_fft16x16_float_c),
+ FFTTestArg(32,
+ aom_fft32x32_float_c)));
+#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, FFT2DTest,
+ ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
+ FFTTestArg(8, aom_fft8x8_float_sse2),
+ FFTTestArg(16, aom_fft16x16_float_sse2),
+ FFTTestArg(32, aom_fft32x32_float_sse2)));
+#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, FFT2DTest,
+ ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
+ FFTTestArg(16, aom_fft16x16_float_avx2),
+ FFTTestArg(32, aom_fft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
+
+struct IFFTTestArg {
+ int n;
+ tform_fun_t ifft;
+ IFFTTestArg(int n_in, tform_fun_t ifft_in) : n(n_in), ifft(ifft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
+ return os << "ifft_arg { n:" << test_arg.n
+ << " fft:" << reinterpret_cast<const void *>(test_arg.ifft) << " }";
+}
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+ void SetUp() override {
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+ ASSERT_NE(input_, nullptr);
+ ASSERT_NE(temp_, nullptr);
+ ASSERT_NE(output_, nullptr);
+ memset(input_, 0, sizeof(*input_) * n * n * 2);
+ memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+ memset(output_, 0, sizeof(*output_) * n * n);
+ }
+ void TearDown() override {
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+ int n = GetParam().n;
+ ASSERT_GE(n, 2);
+ std::vector<float> expected(n * n);
+ std::vector<float> actual(n * n);  // NOTE(review): not referenced again in this test
+ // Do forward transform then invert to make sure we get back expected
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ expected[y * n + x] = 1;  // unit impulse at (x, y)
+ std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+ for (int i = 0; i < n * n; ++i) {  // pack the spectrum as interleaved (re, im) floats
+ input_[2 * i + 0] = input_c[i].real();
+ input_[2 * i + 1] = input_c[i].imag();
+ }
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+ for (int yy = 0; yy < n; ++yy) {
+ for (int xx = 0; xx < n; ++xx) {
+ EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),  // the test divides the ifft output by n*n
+ 1e-5);
+ }
+ }
+ expected[y * n + x] = 0;  // clear the impulse before the next position
+ }
+ }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+ int n = GetParam().n;
+ float sum = 0;
+ const int num_trials = 1000 * (64 - n);  // fewer iterations for larger n
+ for (int i = 0; i < num_trials; ++i) {
+ input_[i % (n * n)] = 1;  // sets one float among the first n*n of the 2*n*n-float input_
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];  // accumulating a result keeps the call observable
+ input_[i % (n * n)] = 0;
+ }
+ EXPECT_GE(sum, num_trials / 2);  // loose lower bound only; no exact value is asserted
+}
+INSTANTIATE_TEST_SUITE_P(
+ C, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
+ IFFTTestArg(4, aom_ifft4x4_float_c),
+ IFFTTestArg(8, aom_ifft8x8_float_c),
+ IFFTTestArg(16, aom_ifft16x16_float_c),
+ IFFTTestArg(32, aom_ifft32x32_float_c)));
+#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
+ IFFTTestArg(8, aom_ifft8x8_float_sse2),
+ IFFTTestArg(16, aom_ifft16x16_float_sse2),
+ IFFTTestArg(32, aom_ifft32x32_float_sse2)));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
+ IFFTTestArg(16, aom_ifft16x16_float_avx2),
+ IFFTTestArg(32, aom_ifft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
+
+} // namespace
diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc
new file mode 100644
index 0000000000..808d966feb
--- /dev/null
+++ b/third_party/aom/test/film_grain_table_test.cc
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom_dsp/grain_table.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+void grain_equal(const aom_film_grain_t *expected,  // field-by-field comparison using non-fatal EXPECT_* checks
+ const aom_film_grain_t *actual) {
+ EXPECT_EQ(expected->apply_grain, actual->apply_grain);
+ EXPECT_EQ(expected->update_parameters, actual->update_parameters);
+ if (!expected->update_parameters) return;  // nothing below is compared when update_parameters is 0
+ EXPECT_EQ(expected->num_y_points, actual->num_y_points);
+ EXPECT_EQ(expected->num_cb_points, actual->num_cb_points);
+ EXPECT_EQ(expected->num_cr_points, actual->num_cr_points);
+ EXPECT_EQ(0, memcmp(expected->scaling_points_y, actual->scaling_points_y,  // only the first num_*_points entries are compared
+ expected->num_y_points *
+ sizeof(expected->scaling_points_y[0])));
+ EXPECT_EQ(0, memcmp(expected->scaling_points_cb, actual->scaling_points_cb,
+ expected->num_cb_points *
+ sizeof(expected->scaling_points_cb[0])));
+ EXPECT_EQ(0, memcmp(expected->scaling_points_cr, actual->scaling_points_cr,
+ expected->num_cr_points *
+ sizeof(expected->scaling_points_cr[0])));
+ EXPECT_EQ(expected->scaling_shift, actual->scaling_shift);
+ EXPECT_EQ(expected->ar_coeff_lag, actual->ar_coeff_lag);
+ EXPECT_EQ(expected->ar_coeff_shift, actual->ar_coeff_shift);
+
+ const int num_pos_luma =
+ 2 * expected->ar_coeff_lag * (expected->ar_coeff_lag + 1);  // number of AR coefficients compared for a given lag
+ const int num_pos_chroma = num_pos_luma;  // the same count is used for both chroma planes
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_y, actual->ar_coeffs_y,
+ sizeof(expected->ar_coeffs_y[0]) * num_pos_luma));
+ if (actual->num_cb_points || actual->chroma_scaling_from_luma) {  // cb coefficients are compared only when one of these is set
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_cb, actual->ar_coeffs_cb,
+ sizeof(expected->ar_coeffs_cb[0]) * num_pos_chroma));
+ }
+ if (actual->num_cr_points || actual->chroma_scaling_from_luma) {  // likewise for cr
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_cr, actual->ar_coeffs_cr,
+ sizeof(expected->ar_coeffs_cr[0]) * num_pos_chroma));
+ }
+ EXPECT_EQ(expected->overlap_flag, actual->overlap_flag);
+ EXPECT_EQ(expected->chroma_scaling_from_luma,
+ actual->chroma_scaling_from_luma);
+ EXPECT_EQ(expected->grain_scale_shift, actual->grain_scale_shift);
+ // random_seed is deliberately not compared: EXPECT_EQ(expected->random_seed, actual->random_seed);
+
+ // clip_to_restricted and bit_depth aren't written
+ if (expected->num_cb_points) {  // cb multipliers/offset are compared only when cb points exist
+ EXPECT_EQ(expected->cb_mult, actual->cb_mult);
+ EXPECT_EQ(expected->cb_luma_mult, actual->cb_luma_mult);
+ EXPECT_EQ(expected->cb_offset, actual->cb_offset);
+ }
+ if (expected->num_cr_points) {  // likewise for cr
+ EXPECT_EQ(expected->cr_mult, actual->cr_mult);
+ EXPECT_EQ(expected->cr_luma_mult, actual->cr_luma_mult);
+ EXPECT_EQ(expected->cr_offset, actual->cr_offset);
+ }
+}
+
+TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {  // append one segment, look it up, extend it, then erase it
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain;
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));  // empty table: nothing to find
+
+ aom_film_grain_table_append(&table, 1000, 2000, film_grain_test_vectors + 0);
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));  // range before the segment
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));  // range after the segment
+
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+ grain.bit_depth = film_grain_test_vectors[0].bit_depth;
+ grain_equal(film_grain_test_vectors + 0, &grain);  // field-wise check; the previous memcmp took its length from sizeof(table), not from grain
+
+ // Extend the existing segment
+ aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0);
+ EXPECT_EQ(nullptr, table.head->next);  // still a single entry
+
+ // Lookup and remove and check that the entry is no longer there
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+
+ EXPECT_EQ(nullptr, table.head);  // both ends are cleared once everything is erased
+ EXPECT_EQ(nullptr, table.tail);
+ aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, AddSingleSegmentRemoveBiggerSegment) {
+ aom_film_grain_table_t table;
+ aom_film_grain_t grain;
+
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain));
+
+ EXPECT_EQ(nullptr, table.head);
+ EXPECT_EQ(nullptr, table.tail);
+ aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, SplitSingleSegment) {
+ aom_film_grain_table_t table;
+ aom_film_grain_t grain;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
+
+ // Test lookup and remove that adjusts start time
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 100, true, &grain));
+ EXPECT_EQ(nullptr, table.head->next);
+ EXPECT_EQ(100, table.head->start_time);
+
+ // Test lookup and remove that adjusts end time
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 900, 1000, true, &grain));
+ EXPECT_EQ(nullptr, table.head->next);
+ EXPECT_EQ(100, table.head->start_time);
+ EXPECT_EQ(900, table.head->end_time);
+
+ // Test lookup and remove that splits the first entry
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 400, 600, true, &grain));
+ EXPECT_EQ(100, table.head->start_time);
+ EXPECT_EQ(400, table.head->end_time);
+
+ ASSERT_NE(nullptr, table.head->next);
+ EXPECT_EQ(table.tail, table.head->next);
+ EXPECT_EQ(600, table.head->next->start_time);
+ EXPECT_EQ(900, table.head->next->end_time);
+
+ aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain;
+ const int kNumTestVectors =
+ sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+ for (int i = 0; i < kNumTestVectors; ++i) {  // vector i covers timestamps i*1000 .. (i+1)*1000
+ aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+ film_grain_test_vectors + i);
+ }
+
+ for (int i = kNumTestVectors - 1; i >= 0; --i) {  // look up and erase in reverse order
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ grain_equal(film_grain_test_vectors + i, &grain);
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,  // a second lookup of the erased range must miss
+ true, &grain));
+ }
+
+ // Verify that all the data has been removed
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ }
+ aom_film_grain_table_free(&table);
+}
+
+class FilmGrainTableIOTest : public ::testing::Test {
+ protected:
+ void SetUp() override { memset(&error_, 0, sizeof(error_)); }
+ struct aom_internal_error_info error_;
+};
+
+TEST_F(FilmGrainTableIOTest, ReadMissingFile) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+ ASSERT_EQ(AOM_CODEC_ERROR, aom_film_grain_table_read(
+ &table, "/path/to/missing/file", &error_));
+}
+
+TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ std::string grain_file;
+ FILE *file = libaom_test::GetTempOutFile(&grain_file);
+ ASSERT_NE(file, nullptr);
+ fwrite("deadbeef", 8, 1, file);  // the file holds only these 8 bytes
+ fclose(file);
+ ASSERT_EQ(AOM_CODEC_ERROR,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));  // the reader is expected to reject this content
+ EXPECT_EQ(0, remove(grain_file.c_str()));  // clean up the temporary file
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripReadWrite) {  // write a table to disk, read it back, compare every entry
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ const int kNumTestVectors =
+ sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+ aom_film_grain_t expected_grain[kNumTestVectors];  // sized from the vector table instead of a fixed 16
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ expected_grain[i] = film_grain_test_vectors[i];
+ expected_grain[i].random_seed = i;
+ expected_grain[i].update_parameters = i % 2;  // alternate so both flag values are exercised
+ expected_grain[i].apply_grain = (i + 1) % 2;
+ expected_grain[i].bit_depth = 0;
+ aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+ expected_grain + i);
+ }
+ std::string grain_file;
+ FILE *tmpfile = libaom_test::GetTempOutFile(&grain_file);  // only used to obtain a temp file name
+ ASSERT_NE(tmpfile, nullptr);
+ fclose(tmpfile);
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+ aom_film_grain_table_free(&table);
+
+ memset(&table, 0, sizeof(table));  // start from an empty table before reading back
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ aom_film_grain_t grain;
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ grain_equal(expected_grain + i, &grain);
+ }
+ aom_film_grain_table_free(&table);
+ EXPECT_EQ(0, remove(grain_file.c_str()));  // clean up the temporary file
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripSplit) {
+ std::string grain_file;
+ FILE *tmpfile = libaom_test::GetTempOutFile(&grain_file);  // only used to obtain a temp file name
+ ASSERT_NE(tmpfile, nullptr);
+ fclose(tmpfile);
+
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain = film_grain_test_vectors[0];
+ aom_film_grain_table_append(&table, 0, 3000, &grain);  // one segment from 0 to 3000
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));  // erase the middle part
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+ aom_film_grain_table_free(&table);
+
+ memset(&table, 0, sizeof(table));  // start from an empty table before reading back
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));  // the same three lookups must give the same answers after the round trip
+ ASSERT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+ aom_film_grain_table_free(&table);
+
+ EXPECT_EQ(0, remove(grain_file.c_str()));
+}
+
+const ::libaom_test::TestMode kFilmGrainEncodeTestModes[] = {
+ ::libaom_test::kRealTime,
+#if !CONFIG_REALTIME_ONLY
+ ::libaom_test::kOnePassGood
+#endif
+};
+
+class FilmGrainEncodeTest
+ : public ::libaom_test::CodecTestWith3Params<int, int,
+ ::libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ FilmGrainEncodeTest()
+ : EncoderTest(GET_PARAM(0)), test_monochrome_(GET_PARAM(1)),
+ key_frame_dist_(GET_PARAM(2)), test_mode_(GET_PARAM(3)) {}
+ ~FilmGrainEncodeTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(test_mode_);
+ cfg_.monochrome = test_monochrome_ == 1;  // modes 0 and 2 both start with monochrome off
+ cfg_.rc_target_bitrate = 300;
+ cfg_.kf_max_dist = key_frame_dist_;
+ cfg_.g_lag_in_frames = 0;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // one-time encoder controls before the first frame
+ encoder->Control(AOME_SET_CPUUSED,
+ test_mode_ == ::libaom_test::kRealTime ? 7 : 5);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM);
+ encoder->Control(AV1E_SET_DENOISE_NOISE_LEVEL, 1);
+ } else if (video->frame() == 1) {  // frame 1: monochrome is on for modes 1 and 2
+ cfg_.monochrome = (test_monochrome_ == 1 || test_monochrome_ == 2);
+ encoder->Config(&cfg_);
+ } else {  // later frames: monochrome is on only for mode 1
+ cfg_.monochrome = test_monochrome_ == 1;
+ encoder->Config(&cfg_);
+ }
+ }
+
+ bool DoDecode() const override { return false; }
+
+ void DoTest() {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 3);
+ cfg_.g_w = video.img()->d_w;
+ cfg_.g_h = video.img()->d_h;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ private:
+ // 0: monochrome always off.
+ // 1: monochrome always on.
+ // 2: monochrome changes from 0, 1, 0, for encoded frames 0, 1, 2.
+ // The case where monochrome changes from 1 to 0 (i.e., encoder initialized
+ // with monochrome = 1 and then subsequently encoded with monochrome = 0)
+ // will fail. The test InitMonochrome1EncodeMonochrome0 below verifies this.
+ int test_monochrome_;
+ int key_frame_dist_;
+ ::libaom_test::TestMode test_mode_;
+};
+
+TEST_P(FilmGrainEncodeTest, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(FilmGrainEncodeTest, ::testing::Range(0, 3),
+ ::testing::Values(0, 10),
+ ::testing::ValuesIn(kFilmGrainEncodeTestModes));
+
+// Initialize encoder with monochrome = 1, and then encode frame with
+// monochrome = 0. This will result in an error: see the corresponding check
+// in encoder_set_config() in av1/av1_cx_iface.c.
+// TODO(marpan): Consider moving this test to another file, as the failure
+// has nothing to do with film grain mode.
+TEST(FilmGrainEncodeTest, InitMonochrome1EncodeMonochrome0) {
+ const int kWidth = 352;
+ const int kHeight = 288;
+ const int usage = AOM_USAGE_REALTIME;
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+ aom_codec_ctx_t enc;
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ // Initialize encoder, with monochrome = 1.
+ cfg.monochrome = 1;
+ aom_codec_err_t init_status = aom_codec_enc_init(&enc, iface, &cfg, 0);
+ ASSERT_EQ(init_status, AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 7), AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_FILM),
+ AOM_CODEC_OK);
+ ASSERT_EQ(aom_codec_control(&enc, AV1E_SET_DENOISE_NOISE_LEVEL, 1),
+ AOM_CODEC_OK);
+ // Set image with zero values.
+ constexpr size_t kBufferSize =
+ kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;  // evaluated left to right: 51008 chroma bytes vs 50688 for two 176x144 planes
+ std::vector<unsigned char> buffer(kBufferSize);
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+ // Encode first frame.
+ ASSERT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ // Second frame: update config with monochrome = 0, which must be rejected.
+ cfg.monochrome = 0;
+ ASSERT_EQ(aom_codec_enc_config_set(&enc, &cfg), AOM_CODEC_INVALID_PARAM);
+ ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
diff --git a/third_party/aom/test/filterintra_test.cc b/third_party/aom/test/filterintra_test.cc
new file mode 100644
index 0000000000..0a0ab11dc3
--- /dev/null
+++ b/third_party/aom/test/filterintra_test.cc
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+using std::tuple;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+ const uint8_t *above, const uint8_t *left, int mode);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, tx size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, TX_SIZE> PredParams;
+
+const int MaxTxSize = 32;
+
+const int MaxTestNum = 100;
+
+class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {  // compares predFunc_ against predFuncRef_
+ public:
+ ~AV1FilterIntraPredTest() override = default;
+ void SetUp() override {
+ PredFuncMode funcMode = GET_PARAM(0);
+ predFuncRef_ = std::get<0>(funcMode);
+ predFunc_ = std::get<1>(funcMode);
+ mode_ = std::get<2>(funcMode);
+ txSize_ = GET_PARAM(1);
+
+ alloc_ = new uint8_t[2 * MaxTxSize + 1];  // backs both `left` and `above` (see RunTest)
+ predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
+ pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+ ASSERT_NE(alloc_, nullptr);
+ ASSERT_NE(predRef_, nullptr);
+ ASSERT_NE(pred_, nullptr);
+ }
+
+ void TearDown() override {
+ delete[] alloc_;
+ delete[] predRef_;
+ delete[] pred_;
+ }
+
+ protected:
+ void RunTest() const {  // runs both predictors MaxTestNum times and compares outputs
+ int tstIndex = 0;
+ int stride = tx_size_wide[txSize_];
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxTxSize;  // predictors are handed &above[1]
+ while (tstIndex < MaxTestNum) {
+ PrepareBuffer();
+ predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+ API_REGISTER_STATE_CHECK(
+ predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
+ DiffPred(tstIndex);
+ tstIndex += 1;
+ }
+ }
+ void RunSpeedTest() const {  // times numIter calls of each predictor and prints the ratio
+ int stride = tx_size_wide[txSize_];
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxTxSize;
+ const int numIter = 5000;
+
+ PrepareBuffer();
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < numIter; i++) {
+ predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+ }
+ aom_usec_timer_mark(&ref_timer);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < numIter; i++) {
+ predFunc_(pred_, stride, txSize_, &above[1], left, mode_);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int ref_sum_time =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+ const int sum_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ printf("c_time = %d \t simd_time = %d \t Gain = %4.2f \t mode = %d \n",
+ ref_sum_time, sum_time,
+ (static_cast<float>(ref_sum_time) / static_cast<float>(sum_time)),
+ static_cast<int>(mode_));
+
+ DiffPred(0);  // the timed outputs are also compared
+ }
+
+ private:
+ void PrepareBuffer() const {  // fills all 2 * MaxTxSize + 1 bytes of alloc_ with rnd.Rand8()
+ ACMRandom rnd(ACMRandom::DeterministicSeed());  // NOTE(review): reseeded on every call, so each RunTest iteration presumably sees identical data - confirm
+ int i = 0;
+ while (i < (2 * MaxTxSize + 1)) {
+ alloc_[i] = rnd.Rand8();
+ i++;
+ }
+ }
+
+ void DiffPred(int testNum) const {  // element-wise EXPECT_EQ over the txSize_ block
+ int i = 0;
+ while (i < tx_size_wide[txSize_] * tx_size_high[txSize_]) {
+ EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+ << "Tx size: " << tx_size_wide[txSize_]
+ << "x" << tx_size_high[txSize_] << " "
+ << "Test number: " << testNum;
+ i++;
+ }
+ }
+
+ Predictor predFunc_;  // predictor under test
+ Predictor predFuncRef_;  // reference predictor
+ int mode_;
+ TX_SIZE txSize_;
+ uint8_t *alloc_;
+ uint8_t *pred_;
+ uint8_t *predRef_;
+};
+
+TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
+
+TEST_P(AV1FilterIntraPredTest, DISABLED_Speed) { RunSpeedTest(); }
+
+using ::testing::make_tuple;
+#if HAVE_SSE4_1
+const PredFuncMode kPredFuncMdArray[] = {
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_DC_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_V_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_H_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_D157_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8,
+ TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16,
+ TX_4X16, TX_16X4, TX_8X32, TX_32X8 };
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1FilterIntraPredTest,
+ ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+ ::testing::ValuesIn(kTxSize)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+const PredFuncMode kPredFuncMdArrayNEON[] = {
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+ FILTER_DC_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+ FILTER_V_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+ FILTER_H_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+ FILTER_D157_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_neon,
+ FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSizeNEON[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8,
+ TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16,
+ TX_4X16, TX_16X4, TX_8X32, TX_32X8 };
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1FilterIntraPredTest,
+ ::testing::Combine(::testing::ValuesIn(kPredFuncMdArrayNEON),
+ ::testing::ValuesIn(kTxSizeNEON)));
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/firstpass_test.cc b/third_party/aom/test/firstpass_test.cc
new file mode 100644
index 0000000000..1f4f3b7853
--- /dev/null
+++ b/third_party/aom/test/firstpass_test.cc
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stddef.h>
+
+#include "av1/common/common.h"
+#include "av1/encoder/firstpass.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(FirstpassTest, FirstpassInfoInitWithExtBuf) {
+ FIRSTPASS_INFO firstpass_info;
+ FIRSTPASS_STATS ext_stats_buf[10];
+ const int ref_stats_size = 10;
+ for (int i = 0; i < ref_stats_size; ++i) {
+ av1_zero(ext_stats_buf[i]);
+ ext_stats_buf[i].frame = i;
+ }
+ aom_codec_err_t ret =
+ av1_firstpass_info_init(&firstpass_info, ext_stats_buf, 10);
+ EXPECT_EQ(firstpass_info.stats_count, ref_stats_size);
+ EXPECT_EQ(firstpass_info.future_stats_count + firstpass_info.past_stats_count,
+ firstpass_info.stats_count);
+ EXPECT_EQ(firstpass_info.cur_index, 0);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoInitWithStaticBuf) {
+ FIRSTPASS_INFO firstpass_info;
+ aom_codec_err_t ret = av1_firstpass_info_init(&firstpass_info, nullptr, 0);
+ EXPECT_EQ(firstpass_info.stats_count, 0);
+ EXPECT_EQ(firstpass_info.cur_index, 0);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoPushPop) {  // fill, pop half, refill, then overflow
+ FIRSTPASS_INFO firstpass_info;
+ av1_firstpass_info_init(&firstpass_info, nullptr, 0);  // no external buffer
+ EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+ for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {  // fill the queue
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ stats.frame = i;
+ aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+ EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+ const int pop_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+ for (int i = 0; i < pop_count; ++i) {  // entries must come back in push order
+ const FIRSTPASS_STATS *stats = av1_firstpass_info_peek(&firstpass_info, 0);
+ aom_codec_err_t ret =
+ av1_firstpass_info_move_cur_index_and_pop(&firstpass_info);
+ ASSERT_NE(stats, nullptr);  // fatal: stats is dereferenced on the next line
+ EXPECT_EQ(stats->frame, i);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+ EXPECT_EQ(firstpass_info.stats_count,
+ FIRSTPASS_INFO_STATIC_BUF_SIZE - pop_count);
+
+ const int push_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+ for (int i = 0; i < push_count; ++i) {  // refill the freed slots
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+ EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+ EXPECT_EQ(firstpass_info.stats_count, firstpass_info.stats_buf_size);
+ {
+ // Push the stats when the queue is full.
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+ EXPECT_EQ(ret, AOM_CODEC_ERROR);
+ }
+}
+
+TEST(FirstpassTest, FirstpassInfoTotalStats) {
+ FIRSTPASS_INFO firstpass_info;
+ av1_firstpass_info_init(&firstpass_info, nullptr, 0);
+ EXPECT_EQ(firstpass_info.total_stats.frame, 0);
+ for (int i = 0; i < 10; ++i) {
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ stats.count = 1;
+ av1_firstpass_info_push(&firstpass_info, &stats);
+ }
+ EXPECT_EQ(firstpass_info.total_stats.count, 10);
+}
+
+TEST(FirstpassTest, FirstpassInfoMoveCurr) {
+ FIRSTPASS_INFO firstpass_info;
+ av1_firstpass_info_init(&firstpass_info, nullptr, 0);
+ int frame_cnt = 0;
+ EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+ for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ stats.frame = frame_cnt;
+ ++frame_cnt;
+ aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+ EXPECT_EQ(firstpass_info.cur_index, firstpass_info.start_index);
+ {
+ aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+ // We cannot pop when cur_index == start_index
+ EXPECT_EQ(ret, AOM_CODEC_ERROR);
+ }
+ int ref_frame_cnt = 0;
+ const int move_count = FIRSTPASS_INFO_STATIC_BUF_SIZE * 2 / 3;
+ for (int i = 0; i < move_count; ++i) {
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(&firstpass_info, 0);
+ EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+ ++ref_frame_cnt;
+ av1_firstpass_info_move_cur_index(&firstpass_info);
+ }
+ EXPECT_EQ(firstpass_info.future_stats_count,
+ FIRSTPASS_INFO_STATIC_BUF_SIZE - move_count);
+ EXPECT_EQ(firstpass_info.past_stats_count, move_count);
+ EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+ const int test_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+ for (int i = 0; i < test_count; ++i) {
+ aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+
+ // push #test_count stats
+ for (int i = 0; i < test_count; ++i) {
+ FIRSTPASS_STATS stats;
+ av1_zero(stats);
+ stats.frame = frame_cnt;
+ ++frame_cnt;
+ aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+
+ // peek and move #test_count stats
+ for (int i = 0; i < test_count; ++i) {
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(&firstpass_info, 0);
+ EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+ ++ref_frame_cnt;
+ av1_firstpass_info_move_cur_index(&firstpass_info);
+ }
+
+ // pop #test_count stats
+ for (int i = 0; i < test_count; ++i) {
+ aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+ EXPECT_EQ(ret, AOM_CODEC_OK);
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/force_key_frame_test.cc b/third_party/aom/test/force_key_frame_test.cc
new file mode 100644
index 0000000000..2b85d26530
--- /dev/null
+++ b/third_party/aom/test/force_key_frame_test.cc
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/3327.
+//
+// In good-quality mode, set cfg.g_lag_in_frames to 1 or 0 and encode two
+// frames in one-pass mode. Pass AOM_EFLAG_FORCE_KF to the second
+// aom_codec_encode() call. Both frames should be encoded as key frames.
+
+#include <memory>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+void TestOnePassMode(unsigned int lag_in_frames) {
+ // A buffer of gray samples of size 128x128, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 128 * 128 + 2 * 64 * 64;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = lag_in_frames;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+
+ aom_codec_iter_t iter;
+ const aom_codec_cx_pkt_t *pkt;
+ int frame_count = 0;
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_encode(&enc, &img, 1, 1, AOM_EFLAG_FORCE_KF));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ frame_count++;
+ }
+
+ EXPECT_EQ(frame_count, 2);
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(ForceKeyFrameTest, OnePassModeLag0) { TestOnePassMode(0); }
+
+TEST(ForceKeyFrameTest, OnePassModeLag1) { TestOnePassMode(1); }
+
+TEST(ForceKeyFrameTest, OnePassModeLag2) { TestOnePassMode(2); }
+
+} // namespace
diff --git a/third_party/aom/test/forced_max_frame_width_height_test.cc b/third_party/aom/test/forced_max_frame_width_height_test.cc
new file mode 100644
index 0000000000..3347713c5b
--- /dev/null
+++ b/third_party/aom/test/forced_max_frame_width_height_test.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Tests for https://crbug.com/aomedia/3326.
+//
+// Set cfg.g_forced_max_frame_width and cfg.g_forced_max_frame_height and
+// encode two frames of increasing sizes. The second aom_codec_encode() should
+// not crash or have memory errors.
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// cfg.g_lag_in_frames must be set to 0 or 1 to allow the frame size to change,
+// as required by the following check in encoder_set_config() in
+// av1/av1_cx_iface.c:
+//
+// if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+// if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
+// ERROR("Cannot change width or height after initialization");
+// ...
+// }
+
+void RunTest(unsigned int usage, unsigned int lag_in_frames,
+ const char *tune_metric) {
+ // A buffer of gray samples. Large enough for 128x128 and 256x256, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 256 * 256 + 2 * 128 * 128;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, usage));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_forced_max_frame_width = 256;
+ cfg.g_forced_max_frame_height = 256;
+ cfg.g_lag_in_frames = lag_in_frames;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_set_option(&enc, "tune", tune_metric));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 256, 256, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag0TunePSNR) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/0, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag0TuneSSIM) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/0, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag1TunePSNR) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/1, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, GoodQualityLag1TuneSSIM) {
+ RunTest(AOM_USAGE_GOOD_QUALITY, /*lag_in_frames=*/1, "ssim");
+}
+
+void FillImageGradient(aom_image_t *image, int bit_depth) {
+ assert(image->range == AOM_CR_FULL_RANGE);
+ for (int plane = 0; plane < 3; plane++) {
+ const int plane_width = aom_img_plane_width(image, plane);
+ const int plane_height = aom_img_plane_height(image, plane);
+ unsigned char *row = image->planes[plane];
+ const int stride = image->stride[plane];
+ for (int y = 0; y < plane_height; ++y) {
+ for (int x = 0; x < plane_width; ++x) {
+ const int value = (x + y) * ((1 << bit_depth) - 1) /
+ std::max(1, plane_width + plane_height - 2);
+ assert(value >= 0 && value <= (1 << bit_depth) - 1);
+ if (bit_depth > 8) {
+ reinterpret_cast<uint16_t *>(row)[x] = static_cast<uint16_t>(value);
+ } else {
+ row[x] = static_cast<unsigned char>(value);
+ }
+ }
+ row += stride;
+ }
+ }
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, DimensionDecreasing) {
+ constexpr int kWidth = 128;
+ constexpr int kHeight = 128;
+ constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+ std::vector<unsigned char> buffer(kBufferSize);
+
+ aom_image_t img;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+ buffer.data()));
+ img.cp = AOM_CICP_CP_UNSPECIFIED;
+ img.tc = AOM_CICP_TC_UNSPECIFIED;
+ img.mc = AOM_CICP_MC_UNSPECIFIED;
+ img.range = AOM_CR_FULL_RANGE;
+ FillImageGradient(&img, 8);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY));
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 0;
+ cfg.g_bit_depth = AOM_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_forced_max_frame_width = kWidth;
+ cfg.g_forced_max_frame_height = kHeight;
+ cfg.g_lag_in_frames = 1;
+ cfg.rc_min_quantizer = 20;
+ cfg.rc_max_quantizer = 40;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 30));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 6));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE));
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM));
+
+ // First frame
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Second frame
+ constexpr int kWidthSmall = 64;
+ constexpr int kHeightSmall = 64;
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidthSmall,
+ kHeightSmall, 1, buffer.data()));
+ img.cp = AOM_CICP_CP_UNSPECIFIED;
+ img.tc = AOM_CICP_TC_UNSPECIFIED;
+ img.mc = AOM_CICP_MC_UNSPECIFIED;
+ img.range = AOM_CR_FULL_RANGE;
+ FillImageGradient(&img, 8);
+ cfg.g_w = kWidthSmall;
+ cfg.g_h = kHeightSmall;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 1, 0));
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag0TunePSNR) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/0, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag0TuneSSIM) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/0, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag1TunePSNR) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/1, "psnr");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, RealtimeLag1TuneSSIM) {
+ RunTest(AOM_USAGE_REALTIME, /*lag_in_frames=*/1, "ssim");
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, MaxFrameSizeTooBig) {
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ cfg.g_forced_max_frame_width = 131072;
+ cfg.g_forced_max_frame_height = 131072;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, FirstFrameTooBig) {
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 258;
+ cfg.g_h = 256;
+ cfg.g_forced_max_frame_width = 256;
+ cfg.g_forced_max_frame_height = 256;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ cfg.g_w = 256;
+ cfg.g_h = 258;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(EncodeForcedMaxFrameWidthHeight, SecondFrameTooBig) {
+ // A buffer of gray samples. Large enough for 128x128 and 256x256, YUV 4:2:0.
+ constexpr size_t kImageDataSize = 256 * 256 + 2 * 128 * 128;
+ std::unique_ptr<unsigned char[]> img_data(new unsigned char[kImageDataSize]);
+ ASSERT_NE(img_data, nullptr);
+ memset(img_data.get(), 128, kImageDataSize);
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+ cfg.g_w = 128;
+ cfg.g_h = 128;
+ cfg.g_forced_max_frame_width = 255;
+ cfg.g_forced_max_frame_height = 256;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+
+ aom_image_t img;
+ EXPECT_EQ(&img,
+ aom_img_wrap(&img, AOM_IMG_FMT_I420, 128, 128, 1, img_data.get()));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+ cfg.g_w = 256;
+ cfg.g_h = 256;
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_config_set(&enc, &cfg));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+} // namespace
diff --git a/third_party/aom/test/frame_parallel_enc_test.cc b/third_party/aom/test/frame_parallel_enc_test.cc
new file mode 100644
index 0000000000..86d5ddb7d4
--- /dev/null
+++ b/third_party/aom/test/frame_parallel_enc_test.cc
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+#if CONFIG_FPMT_TEST && !CONFIG_REALTIME_ONLY
+class AVxFrameParallelThreadEncodeTest
+ : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AVxFrameParallelThreadEncodeTest()
+ : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+ set_cpu_used_(GET_PARAM(1)), tile_cols_(GET_PARAM(2)),
+ tile_rows_(GET_PARAM(3)) {
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 1280;
+ cfg.h = 720;
+ cfg.allow_lowbitdepth = 1;
+ decoder_ = codec_->CreateDecoder(cfg, 0);
+ }
+ ~AVxFrameParallelThreadEncodeTest() override { delete decoder_; }
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_lag_in_frames = 35;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.g_threads = 16;
+ }
+
+ void BeginPassHook(unsigned int /*pass*/) override {
+ encoder_initialized_ = false;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) override {
+ if (encoder_initialized_) return;
+ SetTileSize(encoder);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_FP_MT, 1);
+ encoder->Control(AV1E_SET_FP_MT_UNIT_TEST, enable_actual_parallel_encode_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+
+ encoder_initialized_ = true;
+ }
+
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ size_enc_.push_back(pkt->data.frame.sz);
+
+ ::libaom_test::MD5 md5_enc;
+ md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ md5_enc_.push_back(md5_enc.Get());
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ md5_dec_.push_back(md5_res.Get());
+ }
+ }
+
+ void DoTest(::libaom_test::VideoSource *input_video) {
+ /* This is the actual parallel encode of frames using multiple cpis.
+ * The parallel frames are independently encoded.
+ * Threads are distributed among the parallel frames whereas non-parallel
+ * frames use all the threads. Example: for 8 threads, in case of 4 frames
+ * in a parallel encode set, each frame gets 2 threads. In case of 3 frames
+ * in a parallel encode set, threads are distributed as 2, 3, 3.
+ */
+ enable_actual_parallel_encode_ = 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(input_video));
+ std::vector<size_t> enc_stream_fpmt_size;
+ std::vector<std::string> enc_stream_fpmt;
+ std::vector<std::string> dec_stream_fpmt;
+ enc_stream_fpmt_size = size_enc_;
+ enc_stream_fpmt = md5_enc_;
+ dec_stream_fpmt = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ /* This is the simulation of parallel encode of frames using single cpi.
+ * In simulation, it should be ensured to have no dependency across frames
+ * (similar to parallel encode).
+ * Each frame uses all the threads configured.
+ */
+ enable_actual_parallel_encode_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(input_video));
+ std::vector<size_t> enc_stream_sim_size;
+ std::vector<std::string> enc_stream_sim;
+ std::vector<std::string> dec_stream_sim;
+ enc_stream_sim_size = size_enc_;
+ enc_stream_sim = md5_enc_;
+ dec_stream_sim = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(enc_stream_sim_size, enc_stream_fpmt_size);
+ ASSERT_EQ(enc_stream_sim, enc_stream_fpmt);
+ ASSERT_EQ(dec_stream_sim, dec_stream_fpmt);
+ }
+
+ bool encoder_initialized_;
+ int set_cpu_used_;
+ int tile_cols_;
+ int tile_rows_;
+ int enable_actual_parallel_encode_;
+ ::libaom_test::Decoder *decoder_;
+ std::vector<size_t> size_enc_;
+ std::vector<std::string> md5_enc_;
+ std::vector<std::string> md5_dec_;
+};
+
+class AVxFrameParallelThreadEncodeHDResTestLarge
+ : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeHDResTestLarge,
+ FrameParallelThreadEncodeTest) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ cfg_.rc_target_bitrate = 500;
+ DoTest(&video);
+}
+
+class AVxFrameParallelThreadEncodeLowResTestLarge
+ : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeLowResTestLarge,
+ FrameParallelThreadEncodeTest) {
+ ::libaom_test::YUVVideoSource video("hantro_collage_w352h288.yuv",
+ AOM_IMG_FMT_I420, 352, 288, 30, 1, 0, 60);
+ cfg_.rc_target_bitrate = 200;
+ DoTest(&video);
+}
+
+class AVxFrameParallelThreadEncodeLowResTest
+ : public AVxFrameParallelThreadEncodeTest {};
+
+TEST_P(AVxFrameParallelThreadEncodeLowResTest, FrameParallelThreadEncodeTest) {
+ ::libaom_test::YUVVideoSource video("hantro_collage_w352h288.yuv",
+ AOM_IMG_FMT_I420, 352, 288, 30, 1, 0, 60);
+ cfg_.rc_target_bitrate = 200;
+ DoTest(&video);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeHDResTestLarge,
+ ::testing::Values(2, 3, 4, 5, 6),
+ ::testing::Values(0, 1, 2), ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeLowResTestLarge,
+ ::testing::Values(2, 3), ::testing::Values(0, 1, 2),
+ ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_SUITE(AVxFrameParallelThreadEncodeLowResTest,
+ ::testing::Values(4, 5, 6), ::testing::Values(1),
+ ::testing::Values(0));
+#endif // CONFIG_FPMT_TEST && !CONFIG_REALTIME_ONLY
+
+} // namespace
diff --git a/third_party/aom/test/frame_size_tests.cc b/third_party/aom/test/frame_size_tests.cc
new file mode 100644
index 0000000000..ea8cf47ab8
--- /dev/null
+++ b/third_party/aom/test/frame_size_tests.cc
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <array>
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class AV1FrameSizeTests : public ::testing::Test,  // AV1 encode/decode fixture; each decode result is checked against expected_res_.
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1FrameSizeTests()
+ : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {}  // Decoding is expected to succeed unless a test overrides expected_res_.
+ ~AV1FrameSizeTests() override = default;
+
+ void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }  // All tests in this fixture use the real-time configuration.
+
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,  // Compares the decoder status with expected_res_, attaching the decoder's error text on mismatch.
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
+ return !::testing::Test::HasFailure();  // False once any failure has been recorded (presumably stops the run loop - confirm in EncoderTest).
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,  // Applies the encoder controls once, before the first frame only.
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 7);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ int expected_res_;  // Decoder status each test expects; holds aom_codec_err_t values such as AOM_CODEC_OK.
+};
+
+#if CONFIG_SIZE_LIMIT
+// TODO(Casey.Smalley@arm.com) fails due to newer bounds checks that get caught
+// before the assert below added in ebc2714d71a834fc32a19eef0a81f51fbc47db01
+TEST_F(AV1FrameSizeTests, DISABLED_TestInvalidSizes) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_CORRUPT_FRAME;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// TODO(Casey.Smalley@arm.com) similar to the above test, needs to be
+// updated for the new rejection case
+TEST_F(AV1FrameSizeTests, DISABLED_LargeValidSizes) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_OK;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif
+
+TEST_F(AV1FrameSizeTests, OneByOneVideo) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(1, 1);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_OK;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Parameters: AOM_USAGE_*, aom_rc_mode, cpu-used.
+class AV1ResolutionChange
+ : public testing::TestWithParam<std::tuple<int, aom_rc_mode, int>> {
+ public:
+ AV1ResolutionChange()
+ : usage_(std::get<0>(GetParam())), rc_mode_(std::get<1>(GetParam())),
+ cpu_used_(std::get<2>(GetParam())) {}
+ AV1ResolutionChange(const AV1ResolutionChange &) = delete;
+ AV1ResolutionChange &operator=(const AV1ResolutionChange &) = delete;
+ ~AV1ResolutionChange() override = default;
+
+ protected:
+ int usage_;
+ aom_rc_mode rc_mode_;
+ int cpu_used_;
+};
+
+TEST_P(AV1ResolutionChange, InvalidRefSize) {
+ struct FrameSize {
+ unsigned int width;
+ unsigned int height;
+ };
+ static constexpr std::array<FrameSize, 3> kFrameSizes = { {
+ { 1768, 200 },
+ { 50, 200 },
+ { 850, 200 },
+ } };
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage_), AOM_CODEC_OK);
+
+ // Resolution changes are only permitted with one pass encoding with no lag.
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_end_usage = rc_mode_;
+
+ aom_codec_ctx_t ctx;
+ EXPECT_EQ(aom_codec_enc_init(&ctx, iface, &cfg, 0), AOM_CODEC_OK);
+ std::unique_ptr<aom_codec_ctx_t, decltype(&aom_codec_destroy)> enc(
+ &ctx, &aom_codec_destroy);
+ EXPECT_EQ(aom_codec_control(enc.get(), AOME_SET_CPUUSED, cpu_used_),
+ AOM_CODEC_OK);
+
+ size_t frame_count = 0;
+ ::libaom_test::RandomVideoSource video;
+ video.Begin();
+ constexpr int kNumFramesPerResolution = 2;
+ for (const auto &frame_size : kFrameSizes) {
+ cfg.g_w = frame_size.width;
+ cfg.g_h = frame_size.height;
+ EXPECT_EQ(aom_codec_enc_config_set(enc.get(), &cfg), AOM_CODEC_OK);
+ video.SetSize(cfg.g_w, cfg.g_h);
+
+ aom_codec_iter_t iter;
+ const aom_codec_cx_pkt_t *pkt;
+
+ for (int i = 0; i < kNumFramesPerResolution; ++i) {
+ video.Next(); // SetSize() does not call FillFrame().
+ EXPECT_EQ(aom_codec_encode(enc.get(), video.img(), video.pts(),
+ video.duration(), /*flags=*/0),
+ AOM_CODEC_OK);
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(enc.get(), &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // The frame following a resolution change should be a keyframe as the
+ // change is too extreme to allow previous references to be used.
+ if (i == 0 || usage_ == AOM_USAGE_ALL_INTRA) {
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ }
+ frame_count++;
+ }
+ }
+ }
+
+ EXPECT_EQ(frame_count, kNumFramesPerResolution * kFrameSizes.size());
+}
+
+TEST_P(AV1ResolutionChange, RandomInput) {
+ struct FrameSize {
+ unsigned int width;
+ unsigned int height;
+ };
+ static constexpr std::array<FrameSize, 4> kFrameSizes = { {
+ { 50, 200 },
+ { 100, 200 },
+ { 100, 300 },
+ { 200, 400 },
+ } };
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage_), AOM_CODEC_OK);
+
+ // Resolution changes are only permitted with one pass encoding with no lag.
+ cfg.g_pass = AOM_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_end_usage = rc_mode_;
+ // For random input source, if max frame sizes are not set, the first encoded
+ // frame size will be locked as the max frame size, and the encoder will
+ // identify it as unsupported bitstream.
+ unsigned int max_width = cfg.g_w; // default frame width
+ unsigned int max_height = cfg.g_h; // default frame height
+ for (const auto &frame_size : kFrameSizes) {
+ max_width = frame_size.width > max_width ? frame_size.width : max_width;
+ max_height =
+ frame_size.height > max_height ? frame_size.height : max_height;
+ }
+ cfg.g_forced_max_frame_width = max_width;
+ cfg.g_forced_max_frame_height = max_height;
+
+ aom_codec_ctx_t ctx;
+ EXPECT_EQ(aom_codec_enc_init(&ctx, iface, &cfg, 0), AOM_CODEC_OK);
+ std::unique_ptr<aom_codec_ctx_t, decltype(&aom_codec_destroy)> enc(
+ &ctx, &aom_codec_destroy);
+ EXPECT_EQ(aom_codec_control(enc.get(), AOME_SET_CPUUSED, cpu_used_),
+ AOM_CODEC_OK);
+
+ size_t frame_count = 0;
+ ::libaom_test::RandomVideoSource video;
+ video.Begin();
+ constexpr int kNumFramesPerResolution = 2;
+ for (const auto &frame_size : kFrameSizes) {
+ cfg.g_w = frame_size.width;
+ cfg.g_h = frame_size.height;
+ EXPECT_EQ(aom_codec_enc_config_set(enc.get(), &cfg), AOM_CODEC_OK);
+ video.SetSize(cfg.g_w, cfg.g_h);
+
+ aom_codec_iter_t iter;
+ const aom_codec_cx_pkt_t *pkt;
+
+ for (int i = 0; i < kNumFramesPerResolution; ++i) {
+ video.Next(); // SetSize() does not call FillFrame().
+ EXPECT_EQ(aom_codec_encode(enc.get(), video.img(), video.pts(),
+ video.duration(), /*flags=*/0),
+ AOM_CODEC_OK);
+
+ iter = nullptr;
+ while ((pkt = aom_codec_get_cx_data(enc.get(), &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // The frame following a resolution change should be a keyframe as the
+ // change is too extreme to allow previous references to be used.
+ if (i == 0 || usage_ == AOM_USAGE_ALL_INTRA) {
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+ << "frame " << frame_count;
+ }
+ frame_count++;
+ }
+ }
+ }
+
+ EXPECT_EQ(frame_count, kNumFramesPerResolution * kFrameSizes.size());
+}
+
+// Walks a list of frame sizes through a single encoder instance. Sizes with a
+// dimension outside [1, 65536] must be rejected by aom_codec_enc_config_set()
+// with AOM_CODEC_INVALID_PARAM; every remaining size must configure and encode
+// successfully, producing kNumFramesPerResolution packets each.
+TEST_P(AV1ResolutionChange, InvalidInputSize) {
+  struct FrameSize {
+    unsigned int width;
+    unsigned int height;
+  };
+  static constexpr std::array<FrameSize, 3> kFrameSizes = { {
+      { 1768, 0 },
+      { 0, 200 },
+      { 850, 200 },
+  } };
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage_), AOM_CODEC_OK);
+
+  // Resolution changes are only permitted with one pass encoding with no lag.
+  cfg.g_pass = AOM_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = rc_mode_;
+
+  aom_codec_ctx_t ctx;
+  EXPECT_EQ(aom_codec_enc_init(&ctx, iface, &cfg, 0), AOM_CODEC_OK);
+  // Ensures aom_codec_destroy() runs on every exit path, including ASSERT_*.
+  std::unique_ptr<aom_codec_ctx_t, decltype(&aom_codec_destroy)> enc(
+      &ctx, &aom_codec_destroy);
+  EXPECT_EQ(aom_codec_control(enc.get(), AOME_SET_CPUUSED, cpu_used_),
+            AOM_CODEC_OK);
+
+  size_t frame_count = 0;
+  // Number of table entries that passed validation and were actually encoded.
+  size_t valid_size_count = 0;
+  ::libaom_test::RandomVideoSource video;
+  video.Begin();
+  constexpr int kNumFramesPerResolution = 2;
+  for (const auto &frame_size : kFrameSizes) {
+    cfg.g_w = frame_size.width;
+    cfg.g_h = frame_size.height;
+    if (cfg.g_w < 1 || cfg.g_w > 65536 || cfg.g_h < 1 || cfg.g_h > 65536) {
+      // Out-of-range dimensions must be refused; nothing is encoded for them.
+      EXPECT_EQ(aom_codec_enc_config_set(enc.get(), &cfg),
+                AOM_CODEC_INVALID_PARAM);
+      continue;
+    }
+    ++valid_size_count;
+
+    EXPECT_EQ(aom_codec_enc_config_set(enc.get(), &cfg), AOM_CODEC_OK);
+    video.SetSize(cfg.g_w, cfg.g_h);
+
+    aom_codec_iter_t iter;
+    const aom_codec_cx_pkt_t *pkt;
+
+    for (int i = 0; i < kNumFramesPerResolution; ++i) {
+      video.Next();  // SetSize() does not call FillFrame().
+      EXPECT_EQ(aom_codec_encode(enc.get(), video.img(), video.pts(),
+                                 video.duration(), /*flags=*/0),
+                AOM_CODEC_OK);
+
+      iter = nullptr;
+      while ((pkt = aom_codec_get_cx_data(enc.get(), &iter)) != nullptr) {
+        ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+        // The frame following a resolution change should be a keyframe as the
+        // change is too extreme to allow previous references to be used.
+        if (i == 0 || usage_ == AOM_USAGE_ALL_INTRA) {
+          EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u)
+              << "frame " << frame_count;
+        }
+        frame_count++;
+      }
+    }
+  }
+
+  // Derived from the table rather than hard-coded, so the expectation stays
+  // correct if kFrameSizes or kNumFramesPerResolution is edited.
+  EXPECT_EQ(frame_count, kNumFramesPerResolution * valid_size_count);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Realtime, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_REALTIME),
+ ::testing::Values(AOM_VBR, AOM_CBR),
+ ::testing::Range(6, 11)));
+
+#if !CONFIG_REALTIME_ONLY
+INSTANTIATE_TEST_SUITE_P(
+ GoodQuality, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
+ ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
+ ::testing::Range(2, 6)));
+INSTANTIATE_TEST_SUITE_P(
+ GoodQualityLarge, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_GOOD_QUALITY),
+ ::testing::Values(AOM_VBR, AOM_CBR, AOM_CQ, AOM_Q),
+ ::testing::Range(0, 2)));
+INSTANTIATE_TEST_SUITE_P(
+ AllIntra, AV1ResolutionChange,
+ ::testing::Combine(::testing::Values(AOM_USAGE_ALL_INTRA),
+ ::testing::Values(AOM_Q), ::testing::Range(6, 10)));
+
+typedef struct {
+ unsigned int width;
+ unsigned int height;
+} FrameSizeParam;
+
+const FrameSizeParam FrameSizeTestParams[] = { { 96, 96 }, { 176, 144 } };
+
+// This unit test is used to validate the allocated size of compressed data
+// (ctx->cx_data) buffer, by feeding pseudo random input to the encoder in
+// lossless encoding mode.
+//
+// If compressed data buffer is not large enough, the av1_get_compressed_data()
+// call in av1/av1_cx_iface.c will overflow the buffer.
+class AV1LosslessFrameSizeTests
+    : public ::libaom_test::CodecTestWith2Params<FrameSizeParam,
+                                                 ::libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  // Test parameters: 0 = codec factory, 1 = frame size, 2 = encoding mode.
+  // expected_res_ starts as AOM_CODEC_OK so HandleDecodeResult() never reads
+  // an indeterminate value if a test does not assign it first.
+  AV1LosslessFrameSizeTests()
+      : EncoderTest(GET_PARAM(0)), frame_size_param_(GET_PARAM(1)),
+        encoding_mode_(GET_PARAM(2)), expected_res_(AOM_CODEC_OK) {}
+  ~AV1LosslessFrameSizeTests() override = default;
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  // Checks the decoder status against expected_res_ and reports the decoder's
+  // error text on mismatch. Returns false once any failure has been recorded.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
+    EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
+    return !::testing::Test::HasFailure();
+  }
+
+  // Enables lossless coding at cpu-used 6; applied once, before frame 0.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 6);
+      encoder->Control(AV1E_SET_LOSSLESS, 1);
+    }
+  }
+
+  const FrameSizeParam frame_size_param_;
+  const ::libaom_test::TestMode encoding_mode_;
+  int expected_res_;  // Decoder status the running test expects.
+};
+
+TEST_P(AV1LosslessFrameSizeTests, LosslessEncode) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(frame_size_param_.width, frame_size_param_.height);
+ video.set_limit(10);
+ expected_res_ = AOM_CODEC_OK;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(AV1LosslessFrameSizeTests,
+ ::testing::ValuesIn(FrameSizeTestParams),
+ testing::Values(::libaom_test::kAllIntra));
+#endif // !CONFIG_REALTIME_ONLY
+
+} // namespace
diff --git a/third_party/aom/test/function_equivalence_test.h b/third_party/aom/test/function_equivalence_test.h
new file mode 100644
index 0000000000..2268b9f2ad
--- /dev/null
+++ b/third_party/aom/test/function_equivalence_test.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace libaom_test {
+// Base class for tests that compare 2 implementations of the same function
+// for equivalence. The template parameter should be pointer to a function
+// that is being tested.
+//
+// The test takes a 3-parameters encapsulating struct 'FuncParam', containing:
+// - Pointer to reference function
+// - Pointer to tested function
+// - Integer bit depth (default to 0).
+//
+// These values are then accessible in the tests as member of params_:
+// params_.ref_func, params_.tst_func, and params_.bit_depth.
+//
+
+template <typename T>
+struct FuncParam {
+ FuncParam(T ref = nullptr, T tst = nullptr, int depth = 0)
+ : ref_func(ref), tst_func(tst), bit_depth(depth) {}
+ T ref_func;
+ T tst_func;
+ int bit_depth;
+};
+
+// Streams a FuncParam as its bit depth followed by the addresses of the
+// reference and tested functions. The two pointers carry distinct labels so
+// they can be told apart in the output (presumably gtest's parameter dump).
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const FuncParam<T> &p) {
+  return os << "bit_depth:" << p.bit_depth
+            << " ref_func:" << reinterpret_cast<const void *>(p.ref_func)
+            << " tst_func:" << reinterpret_cast<const void *>(p.tst_func);
+}
+
+template <typename T>
+class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
+ public:
+ FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ ~FunctionEquivalenceTest() override = default;
+
+ void SetUp() override { params_ = this->GetParam(); }
+
+ protected:
+ ACMRandom rng_;
+ FuncParam<T> params_;
+};
+
+} // namespace libaom_test
+#endif // AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/third_party/aom/test/fwht4x4_test.cc b/third_party/aom/test/fwht4x4_test.cc
new file mode 100644
index 0000000000..bb9e218f6f
--- /dev/null
+++ b/third_party/aom/test/fwht4x4_test.cc
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <memory>
+#include <tuple>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+
+using libaom_test::FhtFunc;
+
+typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int, FdctFunc>
+ Dct4x4Param;
+
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+ TxfmParam * /*txfm_param*/) {
+ av1_fwht4x4_c(in, out, stride);
+}
+
+void iwht4x4_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE4_1
+
+void iwht4x4_10_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 10);
+}
+
+void iwht4x4_12_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 12);
+}
+
+#endif
+
+// Fixture for the 4x4 WHT tests. The Dct4x4Param tuple holds, in order: the
+// forward transform under test, the inverse transform, a TX_TYPE, the bit
+// depth, the coefficient count, and a C forward transform used as the
+// speed-test reference (nullptr disables the speed test).
+class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
+                    public ::testing::TestWithParam<Dct4x4Param> {
+ public:
+  ~Trans4x4WHT() override = default;
+
+  // Copies the test parameters into the fixture. Tuple element 2 (the
+  // TX_TYPE) is not read here.
+  void SetUp() override {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    pitch_ = 4;
+    height_ = 4;
+    fwd_txfm_ref = fwht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+    fwd_txfm_c_ = GET_PARAM(5);
+  }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) override {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) override {
+    inv_txfm_(out, dst, stride);
+  }
+
+  // Times fwd_txfm_c_ against fwd_txfm_ on random input blocks, requires the
+  // two outputs to be bit-exact, and prints the accumulated timings. Skipped
+  // when no C reference was supplied.
+  void RunSpeedTest() {
+    if (!fwd_txfm_c_) {
+      GTEST_SKIP();
+    }
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10;
+    const int numIter = 5000;
+
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    int stride = 96;
+
+    // The buffers are owned by unique_ptr so that a fatal assertion, which
+    // returns from this function, cannot leak them.
+    using InputBuffer = std::unique_ptr<int16_t[], decltype(&aom_free)>;
+    using CoeffBuffer = std::unique_ptr<tran_low_t[], decltype(&aom_free)>;
+
+    InputBuffer input_block(
+        reinterpret_cast<int16_t *>(
+            aom_memalign(16, sizeof(int16_t) * stride * height_)),
+        &aom_free);
+    ASSERT_NE(input_block.get(), nullptr);
+    CoeffBuffer output_ref_block(
+        reinterpret_cast<tran_low_t *>(
+            aom_memalign(16, sizeof(tran_low_t) * num_coeffs_)),
+        &aom_free);
+    ASSERT_NE(output_ref_block.get(), nullptr);
+    CoeffBuffer output_block(
+        reinterpret_cast<tran_low_t *>(
+            aom_memalign(16, sizeof(tran_low_t) * num_coeffs_)),
+        &aom_free);
+    ASSERT_NE(output_block.get(), nullptr);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill the input with random residuals and pre-load both output buffers
+      // with the same random values.
+      for (int j = 0; j < height_; ++j) {
+        for (int k = 0; k < pitch_; ++k) {
+          int in_idx = j * stride + k;
+          int out_idx = j * pitch_ + k;
+          input_block[in_idx] =
+              (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+          if (bit_depth_ == AOM_BITS_8) {
+            output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+          } else {
+            output_block[out_idx] = output_ref_block[out_idx] =
+                rnd.Rand16() & mask_;
+          }
+        }
+      }
+
+      aom_usec_timer c_timer_;
+      aom_usec_timer_start(&c_timer_);
+      for (int iter = 0; iter < numIter; iter++) {
+        API_REGISTER_STATE_CHECK(
+            fwd_txfm_c_(input_block.get(), output_ref_block.get(), stride));
+      }
+      aom_usec_timer_mark(&c_timer_);
+
+      aom_usec_timer simd_timer_;
+      aom_usec_timer_start(&simd_timer_);
+
+      for (int iter = 0; iter < numIter; iter++) {
+        API_REGISTER_STATE_CHECK(
+            fwd_txfm_(input_block.get(), output_block.get(), stride));
+      }
+      aom_usec_timer_mark(&simd_timer_);
+
+      c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+      simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+      // The optimized output must match the C reference coefficient for
+      // coefficient.
+      for (int j = 0; j < height_; ++j) {
+        for (int k = 0; k < pitch_; ++k) {
+          int out_idx = j * pitch_ + k;
+          ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+              << "Error: not bit-exact result at index: " << out_idx
+              << " at test block: " << i;
+        }
+      }
+    }
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
+  FdctFunc fwd_txfm_;
+  IdctFunc inv_txfm_;
+
+  FdctFunc fwd_txfm_c_;  // C version of forward transform for speed test.
+};
+
+TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
+
+TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+TEST_P(Trans4x4WHT, DISABLED_Speed) { RunSpeedTest(); }
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+ C, Trans4x4WHT,
+ ::testing::Values(make_tuple(&av1_fwht4x4_c, &iwht4x4_10_c, DCT_DCT,
+ AOM_BITS_10, 16,
+ static_cast<FdctFunc>(nullptr)),
+ make_tuple(&av1_fwht4x4_c, &iwht4x4_12_c, DCT_DCT,
+ AOM_BITS_12, 16,
+ static_cast<FdctFunc>(nullptr))));
+
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, Trans4x4WHT,
+ ::testing::Values(make_tuple(&av1_fwht4x4_sse4_1, &iwht4x4_10_sse4_1,
+ DCT_DCT, AOM_BITS_10, 16,
+ static_cast<FdctFunc>(nullptr)),
+ make_tuple(&av1_fwht4x4_sse4_1, &iwht4x4_12_sse4_1,
+ DCT_DCT, AOM_BITS_12, 16,
+ static_cast<FdctFunc>(nullptr))));
+
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, Trans4x4WHT,
+ ::testing::Values(make_tuple(&av1_fwht4x4_neon, &iwht4x4_10_c, DCT_DCT,
+ AOM_BITS_10, 16, &av1_fwht4x4_c),
+ make_tuple(&av1_fwht4x4_neon, &iwht4x4_12_c, DCT_DCT,
+ AOM_BITS_12, 16, &av1_fwht4x4_c)));
+
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/gf_pyr_height_test.cc b/third_party/aom/test/gf_pyr_height_test.cc
new file mode 100644
index 0000000000..0996d80c25
--- /dev/null
+++ b/third_party/aom/test/gf_pyr_height_test.cc
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+static const struct GFPyrHeightTestParam {
+ int gf_min_pyr_height;
+ int gf_max_pyr_height;
+ double psnr_thresh;
+} kTestParams[] = {
+ // gf_min_pyr_height = 0
+ { 0, 0, 32.30 },
+ { 0, 1, 33.90 },
+ { 0, 2, 34.00 },
+ { 0, 3, 34.20 },
+ { 0, 4, 34.30 },
+ { 0, 5, 34.35 },
+ // gf_min_pyr_height = 1
+ { 1, 1, 33.90 },
+ { 1, 2, 34.00 },
+ { 1, 3, 34.20 },
+ { 1, 4, 34.30 },
+ { 1, 5, 34.35 },
+ // gf_min_pyr_height = 2
+ { 2, 2, 34.00 },
+ { 2, 3, 34.20 },
+ { 2, 4, 34.30 },
+ { 2, 5, 34.35 },
+ // gf_min_pyr_height = 3
+ { 3, 3, 34.20 },
+ { 3, 4, 34.30 },
+ { 3, 5, 34.35 },
+ // gf_min_pyr_height = 4
+ { 4, 4, 34.30 },
+ { 4, 5, 34.35 },
+ // gf_min_pyr_height = 5
+ { 5, 5, 34.35 },
+};
+
+// Compiler may decide to add some padding to the struct above for alignment,
+// which the gtest may try to print (on error for example). This would cause
+// valgrind to complain that the padding is uninitialized. To avoid that, we
+// provide our own function to print the struct.
+// This also makes '--gtest_list_tests' output more understandable.
+std::ostream &operator<<(std::ostream &os, const GFPyrHeightTestParam &p) {
+  // Write each field by name in one chained expression; the emitted text is
+  // "GFPyrHeightTestParam { gf_min_pyr_height = A, gf_max_pyr_height = B,
+  // psnr_thresh = C }".
+  return os << "GFPyrHeightTestParam { gf_min_pyr_height = "
+            << p.gf_min_pyr_height
+            << ", gf_max_pyr_height = " << p.gf_max_pyr_height
+            << ", psnr_thresh = " << p.psnr_thresh << " }";
+}
+
+// Params: encoding mode, rate control mode and GFPyrHeightTestParam object.
+class GFPyrHeightTest
+ : public ::libaom_test::CodecTestWith3Params<
+ libaom_test::TestMode, aom_rc_mode, GFPyrHeightTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ GFPyrHeightTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ rc_mode_(GET_PARAM(2)) {
+ gf_min_pyr_height_ = GET_PARAM(3).gf_min_pyr_height;
+ gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height;
+ psnr_threshold_ = GET_PARAM(3).psnr_thresh;
+ }
+ ~GFPyrHeightTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cpu_used_ = 4;
+ cfg_.rc_end_usage = rc_mode_;
+ if (rc_mode_ == AOM_VBR) {
+ cfg_.rc_target_bitrate = 200;
+ }
+ cfg_.g_lag_in_frames = 19;
+ cfg_.g_threads = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ if (rc_mode_ == AOM_Q) {
+ encoder->Control(AOME_SET_CQ_LEVEL, 32);
+ }
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ encoder->Control(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, gf_min_pyr_height_);
+ encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return psnr_threshold_; }
+
+ ::libaom_test::TestMode encoding_mode_;
+ aom_rc_mode rc_mode_;
+ double psnr_threshold_;
+ int gf_min_pyr_height_;
+ int gf_max_pyr_height_;
+ int cpu_used_;
+ int nframes_;
+ double psnr_;
+};
+
+TEST_P(GFPyrHeightTest, EncodeAndVerifyPSNR) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 32);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+ << "GF Min Pyramid Height = " << gf_min_pyr_height_ << ", "
+ << "GF Max Pyramid Height = " << gf_max_pyr_height_;
+}
+
+AV1_INSTANTIATE_TEST_SUITE(GFPyrHeightTest, NONREALTIME_TEST_MODES,
+ ::testing::Values(AOM_Q, AOM_VBR),
+ ::testing::ValuesIn(kTestParams));
+} // namespace
diff --git a/third_party/aom/test/gviz_api.py b/third_party/aom/test/gviz_api.py
new file mode 100755
index 0000000000..d3a443dabf
--- /dev/null
+++ b/third_party/aom/test/gviz_api.py
@@ -0,0 +1,1087 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts Python data into data for Google Visualization API clients.
+
+This library can be used to create a google.visualization.DataTable usable by
+visualizations built on the Google Visualization API. Output formats are raw
+JSON, JSON response, JavaScript, CSV, and HTML table.
+
+See http://code.google.com/apis/visualization/ for documentation on the
+Google Visualization API.
+"""
+
+__author__ = "Amit Weinstein, Misha Seltzer, Jacob Baskin"
+
+import cgi
+import cStringIO
+import csv
+import datetime
+try:
+ import json
+except ImportError:
+ import simplejson as json
+import types
+
+
+class DataTableException(Exception):
+ """The general exception object thrown by DataTable."""
+ pass
+
+
+class DataTableJSONEncoder(json.JSONEncoder):
+ """JSON encoder that handles date/time/datetime objects correctly."""
+
+ def __init__(self):
+ json.JSONEncoder.__init__(self,
+ separators=(",", ":"),
+ ensure_ascii=False)
+
+ def default(self, o):
+ if isinstance(o, datetime.datetime):
+ if o.microsecond == 0:
+ # If the time doesn't have ms-resolution, leave it out to keep
+ # things smaller.
+ return "Date(%d,%d,%d,%d,%d,%d)" % (
+ o.year, o.month - 1, o.day, o.hour, o.minute, o.second)
+ else:
+ return "Date(%d,%d,%d,%d,%d,%d,%d)" % (
+ o.year, o.month - 1, o.day, o.hour, o.minute, o.second,
+ o.microsecond / 1000)
+ elif isinstance(o, datetime.date):
+ return "Date(%d,%d,%d)" % (o.year, o.month - 1, o.day)
+ elif isinstance(o, datetime.time):
+ return [o.hour, o.minute, o.second]
+ else:
+ return super(DataTableJSONEncoder, self).default(o)
+
+
+class DataTable(object):
+ """Wraps the data to convert to a Google Visualization API DataTable.
+
+ Create this object, populate it with data, then call one of the ToJS...
+ methods to return a string representation of the data in the format described.
+
+ You can clear all data from the object to reuse it, but you cannot clear
+ individual cells, rows, or columns. You also cannot modify the table schema
+ specified in the class constructor.
+
+ You can add new data one or more rows at a time. All data added to an
+ instantiated DataTable must conform to the schema passed in to __init__().
+
+ You can reorder the columns in the output table, and also specify row sorting
+ order by column. The default column order is according to the original
+ table_description parameter. Default row sort order is ascending, by column
+ 1 values. For a dictionary, we sort the keys for order.
+
+ The data and the table_description are closely tied, as described here:
+
+ The table schema is defined in the class constructor's table_description
+ parameter. The user defines each column using a tuple of
+ (id[, type[, label[, custom_properties]]]). The default value for type is
+ string, label is the same as ID if not specified, and custom properties is
+ an empty dictionary if not specified.
+
+ table_description is a dictionary or list, containing one or more column
+ descriptor tuples, nested dictionaries, and lists. Each dictionary key, list
+ element, or dictionary element must eventually be defined as
+ a column description tuple. Here's an example of a dictionary where the key
+ is a tuple, and the value is a list of two tuples:
+ {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+
+ This flexibility in data entry enables you to build and manipulate your data
+ in a Python structure that makes sense for your program.
+
+ Add data to the table using the same nested design as the table's
+ table_description, replacing column descriptor tuples with cell data, and
+ each row is an element in the top level collection. This will be a bit
+ clearer after you look at the following examples showing the
+ table_description, matching data, and the resulting table:
+
+ Columns as list of tuples [col1, col2, col3]
+ table_description: [('a', 'number'), ('b', 'string')]
+ AppendData( [[1, 'z'], [2, 'w'], [4, 'o'], [5, 'k']] )
+ Table:
+ a b <--- these are column ids/labels
+ 1 z
+ 2 w
+ 4 o
+ 5 k
+
+ Dictionary of columns, where key is a column, and value is a list of
+ columns {col1: [col2, col3]}
+ table_description: {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+ AppendData( data: {1: [2, 'z'], 3: [4, 'w']} )
+ Table:
+ a b c
+ 1 2 z
+ 3 4 w
+
+ Dictionary where key is a column, and the value is itself a dictionary of
+ columns {col1: {col2, col3}}
+ table_description: {('a', 'number'): {'b': 'number', 'c': 'string'}}
+ AppendData( data: {1: {'b': 2, 'c': 'z'}, 3: {'b': 4, 'c': 'w'}} )
+ Table:
+ a b c
+ 1 2 z
+ 3 4 w
+ """
+
+ def __init__(self, table_description, data=None, custom_properties=None):
+ """Initialize the data table from a table schema and (optionally) data.
+
+ See the class documentation for more information on table schema and data
+ values.
+
+ Args:
+ table_description: A table schema, following one of the formats described
+ in TableDescriptionParser(). Schemas describe the
+ column names, data types, and labels. See
+ TableDescriptionParser() for acceptable formats.
+ data: Optional. If given, fills the table with the given data. The data
+ structure must be consistent with schema in table_description. See
+ the class documentation for more information on acceptable data. You
+ can add data later by calling AppendData().
+ custom_properties: Optional. A dictionary from string to string that
+ goes into the table's custom properties. This can be
+ later changed by changing self.custom_properties.
+
+ Raises:
+ DataTableException: Raised if the data and the description did not match,
+ or did not use the supported formats.
+ """
+ self.__columns = self.TableDescriptionParser(table_description)
+ self.__data = []
+ self.custom_properties = {}
+ if custom_properties is not None:
+ self.custom_properties = custom_properties
+ if data:
+ self.LoadData(data)
+
+ @staticmethod
+ def CoerceValue(value, value_type):
+ """Coerces a single value into the type expected for its column.
+
+ Internal helper method.
+
+ Args:
+ value: The value which should be converted
+ value_type: One of "string", "number", "boolean", "date", "datetime" or
+ "timeofday".
+
+ Returns:
+ An item of the Python type appropriate to the given value_type. Strings
+ are also converted to Unicode using UTF-8 encoding if necessary.
+ If a tuple is given, it should be in one of the following forms:
+ - (value, formatted value)
+ - (value, formatted value, custom properties)
+ where the formatted value is a string, and custom properties is a
+ dictionary of the custom properties for this cell.
+ To specify custom properties without specifying formatted value, one can
+ pass None as the formatted value.
+ One can also have a null-valued cell with formatted value and/or custom
+ properties by specifying None for the value.
+ This method ignores the custom properties except for checking that it is a
+ dictionary. The custom properties are handled in the ToJSon and ToJSCode
+ methods.
+ The real type of the given value is not strictly checked. For example,
+ any type can be used for string - as we simply take its str( ) and for
+ boolean value we just check "if value".
+ Examples:
+ CoerceValue(None, "string") returns None
+ CoerceValue((5, "5$"), "number") returns (5, "5$")
+ CoerceValue(100, "string") returns "100"
+ CoerceValue(0, "boolean") returns False
+
+ Raises:
+ DataTableException: The value and type did not match in a not-recoverable
+ way, for example given value 'abc' for type 'number'.
+ """
+ if isinstance(value, tuple):
+ # In case of a tuple, we run the same function on the value itself and
+ # add the formatted value.
+ if (len(value) not in [2, 3] or
+ (len(value) == 3 and not isinstance(value[2], dict))):
+ raise DataTableException("Wrong format for value and formatting - %s." %
+ str(value))
+ if not isinstance(value[1], types.StringTypes + (types.NoneType,)):
+ raise DataTableException("Formatted value is not string, given %s." %
+ type(value[1]))
+ js_value = DataTable.CoerceValue(value[0], value_type)
+ return (js_value,) + value[1:]
+
+ t_value = type(value)
+ if value is None:
+ return value
+ if value_type == "boolean":
+ return bool(value)
+
+ elif value_type == "number":
+ if isinstance(value, (int, long, float)):
+ return value
+ raise DataTableException("Wrong type %s when expected number" % t_value)
+
+ elif value_type == "string":
+ if isinstance(value, unicode):
+ return value
+ else:
+ return str(value).decode("utf-8")
+
+ elif value_type == "date":
+ if isinstance(value, datetime.datetime):
+ return datetime.date(value.year, value.month, value.day)
+ elif isinstance(value, datetime.date):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected date" % t_value)
+
+ elif value_type == "timeofday":
+ if isinstance(value, datetime.datetime):
+ return datetime.time(value.hour, value.minute, value.second)
+ elif isinstance(value, datetime.time):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected time" % t_value)
+
+ elif value_type == "datetime":
+ if isinstance(value, datetime.datetime):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected datetime" %
+ t_value)
+ # If we got here, it means the given value_type was not one of the
+ # supported types.
+ raise DataTableException("Unsupported type %s" % value_type)
+
+ @staticmethod
+ def EscapeForJSCode(encoder, value):
+ if value is None:
+ return "null"
+ elif isinstance(value, datetime.datetime):
+ if value.microsecond == 0:
+ # If it's not ms-resolution, leave that out to save space.
+ return "new Date(%d,%d,%d,%d,%d,%d)" % (value.year,
+ value.month - 1, # To match JS
+ value.day,
+ value.hour,
+ value.minute,
+ value.second)
+ else:
+ return "new Date(%d,%d,%d,%d,%d,%d,%d)" % (value.year,
+ value.month - 1, # match JS
+ value.day,
+ value.hour,
+ value.minute,
+ value.second,
+ value.microsecond / 1000)
+ elif isinstance(value, datetime.date):
+ return "new Date(%d,%d,%d)" % (value.year, value.month - 1, value.day)
+ else:
+ return encoder.encode(value)
+
+ @staticmethod
+ def ToString(value):
+ if value is None:
+ return "(empty)"
+ elif isinstance(value, (datetime.datetime,
+ datetime.date,
+ datetime.time)):
+ return str(value)
+ elif isinstance(value, unicode):
+ return value
+ elif isinstance(value, bool):
+ return str(value).lower()
+ else:
+ return str(value).decode("utf-8")
+
+ @staticmethod
+ def ColumnTypeParser(description):
+ """Parses a single column description. Internal helper method.
+
+ Args:
+ description: a column description in the possible formats:
+ 'id'
+ ('id',)
+ ('id', 'type')
+ ('id', 'type', 'label')
+ ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+ Returns:
+ Dictionary with the following keys: id, label, type, and
+ custom_properties where:
+ - If label not given, it equals the id.
+ - If type not given, string is used by default.
+ - If custom properties are not given, an empty dictionary is used by
+ default.
+
+ Raises:
+ DataTableException: The column description did not match the RE, or
+ unsupported type was passed.
+ """
+ if not description:
+ raise DataTableException("Description error: empty description given")
+
+ if not isinstance(description, (types.StringTypes, tuple)):
+ raise DataTableException("Description error: expected either string or "
+ "tuple, got %s." % type(description))
+
+ if isinstance(description, types.StringTypes):
+ description = (description,)
+
+ # According to the tuple's length, we fill the keys
+ # We verify everything is of type string
+ for elem in description[:3]:
+ if not isinstance(elem, types.StringTypes):
+ raise DataTableException("Description error: expected tuple of "
+ "strings, current element of type %s." %
+ type(elem))
+ desc_dict = {"id": description[0],
+ "label": description[0],
+ "type": "string",
+ "custom_properties": {}}
+ if len(description) > 1:
+ desc_dict["type"] = description[1].lower()
+ if len(description) > 2:
+ desc_dict["label"] = description[2]
+ if len(description) > 3:
+ if not isinstance(description[3], dict):
+ raise DataTableException("Description error: expected custom "
+ "properties of type dict, current element "
+ "of type %s." % type(description[3]))
+ desc_dict["custom_properties"] = description[3]
+ if len(description) > 4:
+ raise DataTableException("Description error: tuple of length > 4")
+ if desc_dict["type"] not in ["string", "number", "boolean",
+ "date", "datetime", "timeofday"]:
+ raise DataTableException(
+ "Description error: unsupported type '%s'" % desc_dict["type"])
+ return desc_dict
+
+ @staticmethod
+ def TableDescriptionParser(table_description, depth=0):
+ """Parses the table_description object for internal use.
+
+ Parses the user-submitted table description into an internal format used
+ by the Python DataTable class. Returns the flat list of parsed columns.
+
+ Args:
+ table_description: A description of the table which should comply
+ with one of the formats described below.
+ depth: Optional. The depth of the first level in the current description.
+ Used by recursive calls to this function.
+
+ Returns:
+ List of columns, where each column is represented by a dictionary with the
+ keys: id, label, type, depth, container, custom_properties, which mean
+ the following:
+ - id: the id of the column
+ - label: The label of the column
+ - type: The datatype of the elements in this column. Allowed types are
+ described in ColumnTypeParser().
+ - depth: The depth of this column in the table description
+ - container: 'dict', 'iter' or 'scalar' for parsing the format easily.
+ - custom_properties: The custom properties for this column.
+ The returned description is flattened regardless of how it was given.
+
+ Raises:
+ DataTableException: Error in a column description or in the description
+ structure.
+
+ Examples:
+ A column description can be of the following forms:
+ 'id'
+ ('id',)
+ ('id', 'type')
+ ('id', 'type', 'label')
+ ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+ or as a dictionary:
+ 'id': 'type'
+ 'id': ('type',)
+ 'id': ('type', 'label')
+ 'id': ('type', 'label', {'custom_prop1': 'custom_val1'})
+ If the type is not specified, we treat it as string.
+ If no specific label is given, the label is simply the id.
+ If no custom properties are given, we use an empty dictionary.
+
+ input: [('a', 'date'), ('b', 'timeofday', 'b', {'foo': 'bar'})]
+ output: [{'id': 'a', 'label': 'a', 'type': 'date',
+ 'depth': 0, 'container': 'iter', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'timeofday',
+ 'depth': 0, 'container': 'iter',
+ 'custom_properties': {'foo': 'bar'}}]
+
+ input: {'a': [('b', 'number'), ('c', 'string', 'column c')]}
+ output: [{'id': 'a', 'label': 'a', 'type': 'string',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'number',
+ 'depth': 1, 'container': 'iter', 'custom_properties': {}},
+ {'id': 'c', 'label': 'column c', 'type': 'string',
+ 'depth': 1, 'container': 'iter', 'custom_properties': {}}]
+
+ input: {('a', 'number', 'column a'): { 'b': 'number', 'c': 'string'}}
+ output: [{'id': 'a', 'label': 'column a', 'type': 'number',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'number',
+ 'depth': 1, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'c', 'label': 'c', 'type': 'string',
+ 'depth': 1, 'container': 'dict', 'custom_properties': {}}]
+
+ input: { ('w', 'string', 'word'): ('c', 'number', 'count') }
+ output: [{'id': 'w', 'label': 'word', 'type': 'string',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'c', 'label': 'count', 'type': 'number',
+ 'depth': 1, 'container': 'scalar', 'custom_properties': {}}]
+
+ input: {'a': ('number', 'column a'), 'b': ('string', 'column b')}
+ output: [{'id': 'a', 'label': 'column a', 'type': 'number', 'depth': 0,
+ 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'column b', 'type': 'string', 'depth': 0,
+ 'container': 'dict', 'custom_properties': {}}]
+
+ NOTE: there might be ambiguity in the case of a dictionary representation
+ of a single column. For example, the following description can be parsed
+ in 2 different ways: {'a': ('b', 'c')} can be thought of as a single column
+ with the id 'a', of type 'b' and the label 'c', or as 2 columns: one named
+ 'a', and the other named 'b' of type 'c'. We choose the first option by
+ default, and in case the second option is the right one, it is possible to
+ make the key into a tuple (i.e. {('a',): ('b', 'c')}) or add more info
+ into the tuple, thus making it look like this: {'a': ('b', 'c', 'b', {})}
+ -- second 'b' is the label, and {} is the custom properties field.
+ """
+ # For the recursion step, we check for a scalar object (string or tuple)
+ if isinstance(table_description, (types.StringTypes, tuple)):
+ parsed_col = DataTable.ColumnTypeParser(table_description)
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "scalar"
+ return [parsed_col]
+
+ # Since it is not scalar, table_description must be iterable.
+ if not hasattr(table_description, "__iter__"):
+ raise DataTableException("Expected an iterable object, got %s" %
+ type(table_description))
+ if not isinstance(table_description, dict):
+ # We expect a non-dictionary iterable item.
+ columns = []
+ for desc in table_description:
+ parsed_col = DataTable.ColumnTypeParser(desc)
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "iter"
+ columns.append(parsed_col)
+ if not columns:
+ raise DataTableException("Description iterable objects should not"
+ " be empty.")
+ return columns
+ # The other case is a dictionary
+ if not table_description:
+ raise DataTableException("Empty dictionaries are not allowed inside"
+ " description")
+
+ # To differentiate between the two cases of more levels below or this is
+ # the most inner dictionary, we consider the number of keys (more than one
+ # key is indication for most inner dictionary) and the type of the key and
+ # value in case of only 1 key (if the type of key is string and the type of
+ # the value is a tuple of 0-3 items, we assume this is the most inner
+ # dictionary).
+ # NOTE: this way of differentiating might create ambiguity. See docs.
+ if (len(table_description) != 1 or
+ (isinstance(table_description.keys()[0], types.StringTypes) and
+ isinstance(table_description.values()[0], tuple) and
+ len(table_description.values()[0]) < 4)):
+ # This is the most inner dictionary. Parsing types.
+ columns = []
+ # We sort the items, equivalent to sort the keys since they are unique
+ for key, value in sorted(table_description.items()):
+ # We parse the column type as (key, type) or (key, type, label) using
+ # ColumnTypeParser.
+ if isinstance(value, tuple):
+ parsed_col = DataTable.ColumnTypeParser((key,) + value)
+ else:
+ parsed_col = DataTable.ColumnTypeParser((key, value))
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "dict"
+ columns.append(parsed_col)
+ return columns
+ # This is an outer dictionary. It has exactly one key here: empty
+ # dictionaries raised above and any other length was handled as an inner
+ # dictionary. The key is this level's column; the value describes the
+ # levels below and is parsed recursively at depth + 1.
+ parsed_col = DataTable.ColumnTypeParser(table_description.keys()[0])
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "dict"
+ return ([parsed_col] +
+ DataTable.TableDescriptionParser(table_description.values()[0],
+ depth=depth + 1))
+
+ @property
+ def columns(self):
+ """Returns the parsed table description."""
+ return self.__columns
+
+ def NumberOfRows(self):
+ """Returns the number of rows in the current data stored in the table."""
+ return len(self.__data)
+
+ def SetRowsCustomProperties(self, rows, custom_properties):
+ """Sets the custom properties for given row(s).
+
+ Can accept a single row or an iterable of rows.
+ Sets the given custom properties for all specified rows.
+
+ Args:
+ rows: The row, or rows, to set the custom properties for.
+ custom_properties: A string to string dictionary of custom properties to
+ set for all rows.
+ """
+ if not hasattr(rows, "__iter__"):
+ rows = [rows]
+ for row in rows:
+ self.__data[row] = (self.__data[row][0], custom_properties)
+
+ def LoadData(self, data, custom_properties=None):
+ """Loads new rows to the data table, clearing existing rows.
+
+ May also set the custom_properties for the added rows. The given custom
+ properties dictionary specifies the dictionary that will be used for *all*
+ given rows.
+
+ Args:
+ data: The rows that the table will contain.
+ custom_properties: A dictionary of string to string to set as the custom
+ properties for all rows.
+ """
+ self.__data = []
+ self.AppendData(data, custom_properties)
+
+ def AppendData(self, data, custom_properties=None):
+ """Appends new data to the table.
+
+ Data is appended in rows. Data must comply with
+ the table schema passed in to __init__(). See CoerceValue() for a list
+ of acceptable data types. See the class documentation for more information
+ and examples of schema and data values.
+
+ Args:
+ data: The row to add to the table. The data must conform to the table
+ description format.
+ custom_properties: A dictionary of string to string, representing the
+ custom properties to add to all the rows.
+
+ Raises:
+ DataTableException: The data structure does not match the description.
+ """
+ # If the maximal depth is 0, we simply iterate over the data table
+ # lines and insert them using _InnerAppendData. Otherwise, we simply
+ # let the _InnerAppendData handle all the levels.
+ if not self.__columns[-1]["depth"]:
+ for row in data:
+ self._InnerAppendData(({}, custom_properties), row, 0)
+ else:
+ self._InnerAppendData(({}, custom_properties), data, 0)
+
+ def _InnerAppendData(self, prev_col_values, data, col_index):
+ """Inner function to assist LoadData.
+
+ Walks data according to the parsed columns starting at col_index and
+ appends every completed row to the table, recursing for nested
+ dictionaries.
+
+ Args:
+ prev_col_values: A (values dictionary, custom properties) tuple holding
+ the column values gathered so far for the current row.
+ data: The part of the data that belongs to the columns from col_index on.
+ col_index: Index of the column description that data starts at.
+
+ Raises:
+ DataTableException: The data structure does not match the description.
+ """
+ # We first check that col_index has not exceeded the columns size
+ if col_index >= len(self.__columns):
+ raise DataTableException("The data does not match description, too deep")
+
+ # Dealing with the scalar case, the data is the last value.
+ if self.__columns[col_index]["container"] == "scalar":
+ prev_col_values[0][self.__columns[col_index]["id"]] = data
+ self.__data.append(prev_col_values)
+ return
+
+ if self.__columns[col_index]["container"] == "iter":
+ if not hasattr(data, "__iter__") or isinstance(data, dict):
+ raise DataTableException("Expected iterable object, got %s" %
+ type(data))
+ # We only need to insert the rest of the columns
+ # If there are less items than expected, we only add what there is.
+ for value in data:
+ if col_index >= len(self.__columns):
+ raise DataTableException("Too many elements given in data")
+ prev_col_values[0][self.__columns[col_index]["id"]] = value
+ # Each value fills the next column in order.
+ col_index += 1
+ self.__data.append(prev_col_values)
+ return
+
+ # We know the current level is a dictionary, we verify the type.
+ if not isinstance(data, dict):
+ raise DataTableException("Expected dictionary at current level, got %s" %
+ type(data))
+ # We check if this is the last level
+ if self.__columns[col_index]["depth"] == self.__columns[-1]["depth"]:
+ # We need to add the keys in the dictionary as they are
+ # Columns that are missing from data are simply left unset in the row.
+ for col in self.__columns[col_index:]:
+ if col["id"] in data:
+ prev_col_values[0][col["id"]] = data[col["id"]]
+ self.__data.append(prev_col_values)
+ return
+
+ # We have a dictionary in an inner depth level.
+ if not data.keys():
+ # In case this is an empty dictionary, we add a record with the columns
+ # filled only until this point.
+ self.__data.append(prev_col_values)
+ else:
+ for key in sorted(data):
+ # Copy the values gathered so far so that the rows produced for
+ # different keys do not share one dictionary.
+ col_values = dict(prev_col_values[0])
+ col_values[self.__columns[col_index]["id"]] = key
+ self._InnerAppendData((col_values, prev_col_values[1]),
+ data[key], col_index + 1)
+
+ def _PreparedData(self, order_by=()):
+ """Prepares the data for enumeration - sorting it by order_by.
+
+ Args:
+ order_by: Optional. Specifies the name of the column(s) to sort by, and
+ (optionally) which direction to sort in. Default sort direction
+ is asc. Following formats are accepted:
+ "string_col_name" -- For a single key in default (asc) order.
+ ("string_col_name", "asc|desc") -- For a single key.
+ [("col_1","asc|desc"), ("col_2","asc|desc")] -- For more than
+ one column, an array of tuples of (col_name, "asc|desc").
+
+ Returns:
+ The data sorted by the keys given.
+
+ Raises:
+ DataTableException: Sort direction not in 'asc' or 'desc'
+ """
+ if not order_by:
+ return self.__data
+
+ proper_sort_keys = []
+ if isinstance(order_by, types.StringTypes) or (
+ isinstance(order_by, tuple) and len(order_by) == 2 and
+ order_by[1].lower() in ["asc", "desc"]):
+ order_by = (order_by,)
+ for key in order_by:
+ if isinstance(key, types.StringTypes):
+ proper_sort_keys.append((key, 1))
+ elif (isinstance(key, (list, tuple)) and len(key) == 2 and
+ key[1].lower() in ("asc", "desc")):
+ proper_sort_keys.append((key[0], key[1].lower() == "asc" and 1 or -1))
+ else:
+ raise DataTableException("Expected tuple with second value: "
+ "'asc' or 'desc'")
+
+ def SortCmpFunc(row1, row2):
+ """cmp function for sorted. Compares by keys and 'asc'/'desc' keywords."""
+ for key, asc_mult in proper_sort_keys:
+ cmp_result = asc_mult * cmp(row1[0].get(key), row2[0].get(key))
+ if cmp_result:
+ return cmp_result
+ return 0
+
+ return sorted(self.__data, cmp=SortCmpFunc)
+
+ def ToJSCode(self, name, columns_order=None, order_by=()):
+ """Writes the data table as a JS code string.
+
+ This method writes a string of JS code that can be run to
+ generate a DataTable with the specified data. Typically used for debugging
+ only.
+
+ Args:
+ name: The name of the table. The name would be used as the DataTable's
+ variable name in the created JS code.
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData.
+
+ Returns:
+ A string of JS code that, when run, generates a DataTable with the given
+ name and the data stored in the DataTable object.
+ Example result:
+ "var tab1 = new google.visualization.DataTable();
+ tab1.addColumn("string", "a", "a");
+ tab1.addColumn("number", "b", "b");
+ tab1.addColumn("boolean", "c", "c");
+ tab1.addRows(10);
+ tab1.setCell(0, 0, "a");
+ tab1.setCell(0, 1, 1, null, {"foo": "bar"});
+ tab1.setCell(0, 2, true);
+ ...
+ tab1.setCell(9, 0, "c");
+ tab1.setCell(9, 1, 3, "3$");
+ tab1.setCell(9, 2, false);"
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+
+ encoder = DataTableJSONEncoder()
+
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ # We first create the table with the given name
+ jscode = "var %s = new google.visualization.DataTable();\n" % name
+ if self.custom_properties:
+ jscode += "%s.setTableProperties(%s);\n" % (
+ name, encoder.encode(self.custom_properties))
+
+ # We add the columns to the table
+ for i, col in enumerate(columns_order):
+ jscode += "%s.addColumn(%s, %s, %s);\n" % (
+ name,
+ encoder.encode(col_dict[col]["type"]),
+ encoder.encode(col_dict[col]["label"]),
+ encoder.encode(col_dict[col]["id"]))
+ if col_dict[col]["custom_properties"]:
+ jscode += "%s.setColumnProperties(%d, %s);\n" % (
+ name, i, encoder.encode(col_dict[col]["custom_properties"]))
+ jscode += "%s.addRows(%d);\n" % (name, len(self.__data))
+
+ # We now go over the data and add each row
+ for (i, (row, cp)) in enumerate(self._PreparedData(order_by)):
+ # We add all the elements of this row by their order
+ for (j, col) in enumerate(columns_order):
+ if col not in row or row[col] is None:
+ continue
+ value = self.CoerceValue(row[col], col_dict[col]["type"])
+ if isinstance(value, tuple):
+ cell_cp = ""
+ if len(value) == 3:
+ cell_cp = ", %s" % encoder.encode(row[col][2])
+ # We have a formatted value or custom property as well
+ jscode += ("%s.setCell(%d, %d, %s, %s%s);\n" %
+ (name, i, j,
+ self.EscapeForJSCode(encoder, value[0]),
+ self.EscapeForJSCode(encoder, value[1]), cell_cp))
+ else:
+ jscode += "%s.setCell(%d, %d, %s);\n" % (
+ name, i, j, self.EscapeForJSCode(encoder, value))
+ if cp:
+ jscode += "%s.setRowProperties(%d, %s);\n" % (
+ name, i, encoder.encode(cp))
+ return jscode
+
+ def ToHtml(self, columns_order=None, order_by=()):
+ """Writes the data table as an HTML table code string.
+
+ Args:
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData.
+
+ Returns:
+ An HTML table code string.
+ Example result (the result is without the newlines):
+ <html><body><table border="1">
+ <thead><tr><th>a</th><th>b</th><th>c</th></tr></thead>
+ <tbody>
+ <tr><td>1</td><td>"z"</td><td>2</td></tr>
+ <tr><td>"3$"</td><td>"w"</td><td></td></tr>
+ </tbody>
+ </table></body></html>
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+ table_template = "<html><body><table border=\"1\">%s</table></body></html>"
+ columns_template = "<thead><tr>%s</tr></thead>"
+ rows_template = "<tbody>%s</tbody>"
+ row_template = "<tr>%s</tr>"
+ header_cell_template = "<th>%s</th>"
+ cell_template = "<td>%s</td>"
+
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ columns_list = []
+ for col in columns_order:
+ columns_list.append(header_cell_template %
+ cgi.escape(col_dict[col]["label"]))
+ columns_html = columns_template % "".join(columns_list)
+
+ rows_list = []
+ # We now go over the data and add each row
+ for row, unused_cp in self._PreparedData(order_by):
+ cells_list = []
+ # We add all the elements of this row by their order
+ for col in columns_order:
+ # For empty string we want empty quotes ("").
+ value = ""
+ if col in row and row[col] is not None:
+ value = self.CoerceValue(row[col], col_dict[col]["type"])
+ if isinstance(value, tuple):
+ # We have a formatted value and we're going to use it
+ cells_list.append(cell_template % cgi.escape(self.ToString(value[1])))
+ else:
+ cells_list.append(cell_template % cgi.escape(self.ToString(value)))
+ rows_list.append(row_template % "".join(cells_list))
+ rows_html = rows_template % "".join(rows_list)
+
+ return table_template % (columns_html + rows_html)
+
+ def ToCsv(self, columns_order=None, order_by=(), separator=","):
+   """Serializes the data table to a UTF-8 encoded CSV string.
+
+   Every cell is encoded to UTF-8 before it reaches the writer because the
+   Python "csv" module can't handle Unicode properly according to its
+   documentation.
+
+   Args:
+     columns_order: Optional. List of all column IDs in the order in which
+                    they should appear in the output. When omitted, the
+                    columns keep the order in which they were defined.
+     order_by: Optional. Name of the column(s) to sort by. Passed as is to
+               _PreparedData.
+     separator: Optional. The delimiter written between values.
+
+   Returns:
+     A CSV string: one header line holding the column labels, followed by
+     one line per data row.
+
+   Raises:
+     DataTableException: The data does not match the type.
+   """
+
+   csv_buffer = cStringIO.StringIO()
+   writer = csv.writer(csv_buffer, delimiter=separator)
+
+   if columns_order is None:
+     columns_order = [column["id"] for column in self.__columns]
+   columns_by_id = dict((column["id"], column) for column in self.__columns)
+
+   # Header line: the label of every requested column.
+   header = []
+   for col_id in columns_order:
+     header.append(columns_by_id[col_id]["label"].encode("utf-8"))
+   writer.writerow(header)
+
+   # One output line per prepared row, cells in the requested column order.
+   for row, unused_cp in self._PreparedData(order_by):
+     encoded_cells = []
+     for col_id in columns_order:
+       # A missing or None cell is written as an empty string.
+       cell = ""
+       if col_id in row and row[col_id] is not None:
+         cell = self.CoerceValue(row[col_id], columns_by_id[col_id]["type"])
+       if isinstance(cell, tuple):
+         # Tuples carry (value, formatted value, ...). The formatted value is
+         # used only for date/time types; otherwise the raw value is kept.
+         is_temporal = (columns_by_id[col_id]["type"] in
+                        ["date", "datetime", "timeofday"])
+         cell = cell[1] if is_temporal else cell[0]
+       encoded_cells.append(self.ToString(cell).encode("utf-8"))
+     writer.writerow(encoded_cells)
+   return csv_buffer.getvalue()
+
+ def ToTsvExcel(self, columns_order=None, order_by=()):
+   """Builds a tab-separated export that MS Excel can read.
+
+   The table is rendered by ToCsv with a tab as separator, and the UTF-8
+   result is then re-encoded as UTF-16 little endian.
+
+   Args:
+     columns_order: Delegated to ToCsv.
+     order_by: Delegated to ToCsv.
+
+   Returns:
+     A tab-separated little endian UTF16 file representing the table.
+   """
+   utf8_tsv = self.ToCsv(columns_order, order_by, separator="\t")
+   return utf8_tsv.decode("utf-8").encode("UTF-16LE")
+
+ def _ToJSonObj(self, columns_order=None, order_by=()):
+   """Builds the plain dict/list structure that is later encoded as JSON.
+
+   Args:
+     columns_order: Optional. A list of all column IDs in the order in which
+                    you want them created in the output table. If specified,
+                    all column IDs must be present.
+     order_by: Optional. Specifies the name of the column(s) to sort by.
+               Passed as is to _PreparedData().
+
+   Returns:
+     A dictionary with "cols" and "rows" entries (plus "p" when the table
+     has custom properties), for use by ToJSon or ToJSonResponse.
+   """
+   if columns_order is None:
+     columns_order = [column["id"] for column in self.__columns]
+   columns_by_id = dict((column["id"], column) for column in self.__columns)
+
+   # Column descriptors, in the requested order.
+   col_objs = []
+   for col_id in columns_order:
+     column = columns_by_id[col_id]
+     col_obj = {"id": column["id"],
+                "label": column["label"],
+                "type": column["type"]}
+     if column["custom_properties"]:
+       col_obj["p"] = column["custom_properties"]
+     col_objs.append(col_obj)
+
+   # Row descriptors: each row is {"c": [cells]} plus optional properties.
+   row_objs = []
+   for row, row_properties in self._PreparedData(order_by):
+     cell_objs = []
+     for col_id in columns_order:
+       value = self.CoerceValue(row.get(col_id, None),
+                                columns_by_id[col_id]["type"])
+       if value is None:
+         cell_objs.append(None)
+         continue
+       if not isinstance(value, tuple):
+         cell_objs.append({"v": value})
+         continue
+       # Tuple layout: (value[, formatted value[, custom properties]]).
+       cell_obj = {"v": value[0]}
+       if len(value) > 1 and value[1] is not None:
+         cell_obj["f"] = value[1]
+       if len(value) == 3:
+         cell_obj["p"] = value[2]
+       cell_objs.append(cell_obj)
+     row_obj = {"c": cell_objs}
+     if row_properties:
+       row_obj["p"] = row_properties
+     row_objs.append(row_obj)
+
+   json_obj = {"cols": col_objs, "rows": row_objs}
+   if self.custom_properties:
+     json_obj["p"] = self.custom_properties
+
+   return json_obj
+
+ def ToJSon(self, columns_order=None, order_by=()):
+   """Returns a string that can be used in a JS DataTable constructor.
+
+   The result is a JSON string that can be passed directly into a Google
+   Visualization API DataTable constructor. Use it when the visualization
+   HTML is hosted on your own site and the data table is coded in Python,
+   e.g.:
+     ... on my page that hosts my visualization ...
+     google.setOnLoadCallback(drawTable);
+     function drawTable() {
+       var data = new google.visualization.DataTable(_my_JSon_string, 0.6);
+       myTable.draw(data);
+     }
+
+   Args:
+     columns_order: Optional. Specifies the order of columns in the
+                    output table. Specify a list of all column IDs in the
+                    order in which you want the table created.
+                    Note that you must list all column IDs in this
+                    parameter, if you use it.
+     order_by: Optional. Specifies the name of the column(s) to sort by.
+               Passed as is to _PreparedData().
+
+   Returns:
+     A UTF-8 encoded JSon constructor string describing the data stored in
+     the DataTable object.
+     Example result (the result is without the newlines):
+      {cols: [{id:"a",label:"a",type:"number"},
+              {id:"b",label:"b",type:"string"},
+              {id:"c",label:"c",type:"number"}],
+       rows: [{c:[{v:1},{v:"z"},{v:2}]},
+              {c:[{v:3,f:"3$"},{v:"w"},{v:null}]}],
+       p:    {'foo': 'bar'}}
+
+   Raises:
+     DataTableException: The data does not match the type.
+   """
+
+   encoder = DataTableJSONEncoder()
+   table_obj = self._ToJSonObj(columns_order, order_by)
+   json_text = encoder.encode(table_obj)
+   return json_text.encode("utf-8")
+
+ def ToJSonResponse(self, columns_order=None, order_by=(), req_id=0,
+                    response_handler="google.visualization.Query.setResponse"):
+   """Writes a table as a JSON response that can be returned as-is to a client.
+
+   The response answers a Google Visualization API query: the table is
+   wrapped in a response object and that object is passed as the single
+   argument of a call to the response handler.
+
+   Args:
+     columns_order: Optional. Passed straight to self._ToJSonObj().
+     order_by: Optional. Passed straight to self._ToJSonObj().
+     req_id: Optional. The response id, as retrieved by the request.
+     response_handler: Optional. The response handler, as retrieved by the
+                       request.
+
+   Returns:
+     A JSON response string of the form "<response_handler>(<json>);".
+     Example result (newlines added for readability):
+      google.visualization.Query.setResponse({
+         'version':'0.6', 'reqId':'0', 'status':'ok',
+         'table': {cols: [...], rows: [...]}});
+
+   Note: The URL returning this string can be used as a data source by Google
+         Visualization Gadgets or from JS code.
+   """
+
+   response_obj = {
+       "version": "0.6",
+       "reqId": str(req_id),
+       "table": self._ToJSonObj(columns_order, order_by),
+       "status": "ok"
+   }
+   encoder = DataTableJSONEncoder()
+   payload = encoder.encode(response_obj).encode("utf-8")
+   return "%s(%s);" % (response_handler, payload)
+
+ def ToResponse(self, columns_order=None, order_by=(), tqx=""):
+   """Writes the right response according to the request string passed in tqx.
+
+   This method parses the tqx request string (format of which is defined in
+   the documentation for implementing a data source of Google Visualization),
+   and returns the right response according to the request.
+   It parses out the "out" parameter of tqx, calls the relevant response
+   (ToJSonResponse() for "json", ToCsv() for "csv", ToHtml() for "html",
+   ToTsvExcel() for "tsv-excel") and passes the response function the rest of
+   the relevant request keys.
+
+   Args:
+     columns_order: Optional. Passed as is to the relevant response function.
+     order_by: Optional. Passed as is to the relevant response function.
+     tqx: Optional. The request string as received by HTTP GET. Should be in
+          the format "key1:value1;key2:value2...". All keys have a default
+          value, so an empty string will just do the default (which is calling
+          ToJSonResponse() with no extra parameters). Empty segments (for
+          example a trailing ";") are ignored, and only the first ":" of a
+          segment separates the key from its value.
+
+   Returns:
+     A response string, as returned by the relevant response function.
+
+   Raises:
+     DataTableException: One of the parameters passed in tqx is not supported.
+     ValueError: A non-empty tqx segment contains no ":" at all.
+   """
+   tqx_dict = {}
+   if tqx:
+     # maxsplit=1 keeps any further ":" inside the value, and the filter drops
+     # the empty segments produced by stray ";" characters. Both used to fail
+     # with an unrelated ValueError from dict().
+     tqx_dict = dict(opt.split(":", 1) for opt in tqx.split(";") if opt)
+
+   version = tqx_dict.get("version", "0.6")
+   if version != "0.6":
+     raise DataTableException(
+         "Version (%s) passed by request is not supported."
+         % version)
+
+   out = tqx_dict.get("out", "json")
+   if out == "json":
+     response_handler = tqx_dict.get("responseHandler",
+                                     "google.visualization.Query.setResponse")
+     return self.ToJSonResponse(columns_order, order_by,
+                                req_id=tqx_dict.get("reqId", 0),
+                                response_handler=response_handler)
+   elif out == "html":
+     return self.ToHtml(columns_order, order_by)
+   elif out == "csv":
+     return self.ToCsv(columns_order, order_by)
+   elif out == "tsv-excel":
+     return self.ToTsvExcel(columns_order, order_by)
+   else:
+     raise DataTableException(
+         "'out' parameter: '%s' is not supported" % out)
diff --git a/third_party/aom/test/hadamard_test.cc b/third_party/aom/test/hadamard_test.cc
new file mode 100644
index 0000000000..b01e78faaa
--- /dev/null
+++ b/third_party/aom/test/hadamard_test.cc
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+ tran_low_t *b);
+// Low precision version of Hadamard Transform
+using HadamardLPFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+ int16_t *b);
+// Low precision version of Hadamard Transform 8x8 - Dual
+using HadamardLP8x8DualFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+ int16_t *b);
+
+ template <typename OutputType>
+ void Hadamard4x4(const OutputType *a, OutputType *out) {
+   // 1-D 4-point Hadamard over the four values a[0], a[4], a[8] and a[12]
+   // (stride of 4 elements). The first butterfly stage halves its results.
+   const OutputType sum01 = (a[0 * 4] + a[1 * 4]) >> 1;
+   const OutputType diff01 = (a[0 * 4] - a[1 * 4]) >> 1;
+   const OutputType sum23 = (a[2 * 4] + a[3 * 4]) >> 1;
+   const OutputType diff23 = (a[2 * 4] - a[3 * 4]) >> 1;
+
+   // Second butterfly stage writes the four outputs contiguously.
+   out[0] = sum01 + sum23;
+   out[1] = diff01 + diff23;
+   out[2] = sum01 - sum23;
+   out[3] = diff01 - diff23;
+ }
+
+ template <typename OutputType>
+ void ReferenceHadamard4x4(const int16_t *a, int a_stride, OutputType *b) {
+   // Reference 2-D 4x4 Hadamard: gather the strided input into a dense 4x4
+   // block, run the 1-D transform over it twice, then transpose the result.
+   OutputType src[16];
+   OutputType pass1[16];
+   for (int row = 0; row < 4; ++row) {
+     const int16_t *src_row = a + row * a_stride;
+     for (int col = 0; col < 4; ++col) {
+       src[row * 4 + col] = static_cast<OutputType>(src_row[col]);
+     }
+   }
+
+   for (int col = 0; col < 4; ++col) {
+     Hadamard4x4(src + col, pass1 + col * 4);
+   }
+   for (int col = 0; col < 4; ++col) {
+     Hadamard4x4(pass1 + col, b + col * 4);
+   }
+
+   // Extra transpose to match C and SSE2 behavior(i.e., aom_hadamard_4x4).
+   for (int row = 1; row < 4; ++row) {
+     for (int col = 0; col < row; ++col) {
+       const OutputType lower = b[row * 4 + col];
+       b[row * 4 + col] = b[col * 4 + row];
+       b[col * 4 + row] = lower;
+     }
+   }
+ }
+
+ template <typename OutputType>
+ void HadamardLoop(const OutputType *a, OutputType *out) {
+   // 1-D 8-point Hadamard over a[0], a[8], ..., a[56] (stride of 8 elements).
+   // Stage 1: butterflies on adjacent input pairs.
+   OutputType stage1[8];
+   for (int pair = 0; pair < 4; ++pair) {
+     const OutputType even = a[(2 * pair) * 8];
+     const OutputType odd = a[(2 * pair + 1) * 8];
+     stage1[2 * pair] = even + odd;
+     stage1[2 * pair + 1] = even - odd;
+   }
+
+   // Stage 2: butterflies inside each group of four.
+   OutputType stage2[8];
+   for (int half = 0; half < 2; ++half) {
+     const OutputType *in = stage1 + half * 4;
+     OutputType *res = stage2 + half * 4;
+     res[0] = in[0] + in[2];
+     res[1] = in[1] + in[3];
+     res[2] = in[0] - in[2];
+     res[3] = in[1] - in[3];
+   }
+
+   // Stage 3: combine the two halves. The tables give the output slot of
+   // each sum and each difference.
+   static const int kSumSlot[4] = { 0, 7, 3, 4 };
+   static const int kDiffSlot[4] = { 2, 6, 1, 5 };
+   for (int k = 0; k < 4; ++k) {
+     out[kSumSlot[k]] = stage2[k] + stage2[k + 4];
+   }
+   for (int k = 0; k < 4; ++k) {
+     out[kDiffSlot[k]] = stage2[k] - stage2[k + 4];
+   }
+ }
+
+ template <typename OutputType>
+ void ReferenceHadamard8x8(const int16_t *a, int a_stride, OutputType *b) {
+   // Reference 2-D 8x8 Hadamard: gather the strided input into a dense 8x8
+   // block, run the 1-D transform over it twice, then transpose the result.
+   OutputType src[64];
+   OutputType pass1[64];
+   for (int row = 0; row < 8; ++row) {
+     const int16_t *src_row = a + row * a_stride;
+     for (int col = 0; col < 8; ++col) {
+       src[row * 8 + col] = static_cast<OutputType>(src_row[col]);
+     }
+   }
+
+   for (int col = 0; col < 8; ++col) {
+     HadamardLoop(src + col, pass1 + col * 8);
+   }
+   for (int col = 0; col < 8; ++col) {
+     HadamardLoop(pass1 + col, b + col * 8);
+   }
+
+   // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_8x8 and
+   // aom_hadamard_lp_8x8).
+   for (int row = 1; row < 8; ++row) {
+     for (int col = 0; col < row; ++col) {
+       const OutputType lower = b[row * 8 + col];
+       b[row * 8 + col] = b[col * 8 + row];
+       b[col * 8 + row] = lower;
+     }
+   }
+ }
+
+ template <typename OutputType>
+ void ReferenceHadamard8x8Dual(const int16_t *a, int a_stride, OutputType *b) {
+   /* The source is a 8x16 block. The destination is rearranged to 8x16.
+    * Input is 9 bit. Each half is an independent 8x8 transform: the second
+    * one reads 8 columns further right and writes 64 coefficients later. */
+   for (int half = 0; half < 2; ++half) {
+     ReferenceHadamard8x8(a + 8 * half, a_stride, b + 64 * half);
+   }
+ }
+
+ template <typename OutputType>
+ void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b,
+                             bool shift) {
+   /* The source is a 16x16 block. The destination is rearranged to 8x32.
+    * Input is 9 bit. Quadrants are transformed in raster order (top-left,
+    * top-right, bottom-left, bottom-right), 64 coefficients apiece. */
+   for (int quad = 0; quad < 4; ++quad) {
+     const int col_offset = (quad & 1) * 8;
+     const int row_offset = (quad >> 1) * 8;
+     ReferenceHadamard8x8(a + col_offset + row_offset * a_stride, a_stride,
+                          b + quad * 64);
+   }
+
+   /* Overlay the 8x8 blocks and combine. */
+   for (int i = 0; i < 64; ++i) {
+     /* 8x8 steps the range up to 15 bits. */
+     const OutputType q0 = b[i];
+     const OutputType q1 = b[i + 64];
+     const OutputType q2 = b[i + 128];
+     const OutputType q3 = b[i + 192];
+
+     /* Prevent the result from escaping int16_t. */
+     const OutputType sum01 = (q0 + q1) >> 1;
+     const OutputType diff01 = (q0 - q1) >> 1;
+     const OutputType sum23 = (q2 + q3) >> 1;
+     const OutputType diff23 = (q2 - q3) >> 1;
+
+     /* Store a 16 bit value. */
+     b[i] = sum01 + sum23;
+     b[i + 64] = diff01 + diff23;
+     b[i + 128] = sum01 - sum23;
+     b[i + 192] = diff01 - diff23;
+   }
+
+   if (!shift) return;
+
+   // Extra shift to match aom_hadamard_16x16_c and aom_hadamard_16x16_avx2:
+   // in every 16-coefficient line, swap entries 4..7 with entries 8..11.
+   for (int line = 0; line < 16; ++line) {
+     OutputType *coeffs = b + line * 16;
+     for (int k = 4; k < 8; ++k) {
+       const OutputType saved = coeffs[k];
+       coeffs[k] = coeffs[k + 4];
+       coeffs[k + 4] = saved;
+     }
+   }
+ }
+
+ template <typename OutputType>
+ void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b,
+                             bool shift) {
+   // Transform the four 16x16 quadrants in raster order, 256 coefficients
+   // apiece, forwarding the caller's shift flag.
+   for (int quad = 0; quad < 4; ++quad) {
+     const int col_offset = (quad & 1) * 16;
+     const int row_offset = (quad >> 1) * 16;
+     ReferenceHadamard16x16(a + col_offset + row_offset * a_stride, a_stride,
+                            b + quad * 256, shift);
+   }
+
+   // Overlay the 16x16 results and combine them with one more butterfly
+   // stage. The intermediate sums and differences are shifted right by 2.
+   for (int i = 0; i < 256; ++i) {
+     const OutputType q0 = b[i];
+     const OutputType q1 = b[i + 256];
+     const OutputType q2 = b[i + 512];
+     const OutputType q3 = b[i + 768];
+
+     const OutputType sum01 = (q0 + q1) >> 2;
+     const OutputType diff01 = (q0 - q1) >> 2;
+     const OutputType sum23 = (q2 + q3) >> 2;
+     const OutputType diff23 = (q2 - q3) >> 2;
+
+     b[i] = sum01 + sum23;
+     b[i + 256] = diff01 + diff23;
+     b[i + 512] = sum01 - sum23;
+     b[i + 768] = diff01 - diff23;
+   }
+ }
+
+ // Dispatches to the reference transform matching the bw x bh block size.
+ // Supported sizes are 32x32, 16x16, 8x8, 4x4 and 8x16 (two side-by-side 8x8
+ // transforms). `shift` is only forwarded to the 16x16 and 32x32 references.
+ // Any other size records a fatal GoogleTest failure.
+ template <typename OutputType>
+ void ReferenceHadamard(const int16_t *a, int a_stride, OutputType *b, int bw,
+                        int bh, bool shift) {
+   if (bw == 32 && bh == 32) {
+     ReferenceHadamard32x32(a, a_stride, b, shift);
+   } else if (bw == 16 && bh == 16) {
+     ReferenceHadamard16x16(a, a_stride, b, shift);
+   } else if (bw == 8 && bh == 8) {
+     ReferenceHadamard8x8(a, a_stride, b);
+   } else if (bw == 4 && bh == 4) {
+     ReferenceHadamard4x4(a, a_stride, b);
+   } else if (bw == 8 && bh == 16) {
+     ReferenceHadamard8x8Dual(a, a_stride, b);
+   } else {
+     // Separate the two dimensions; without the "x", 8 and 16 would print as
+     // the ambiguous "816".
+     GTEST_FAIL() << "Invalid Hadamard transform size " << bw << "x" << bh
+                  << std::endl;
+   }
+ }
+
+ // Test parameter bundling a Hadamard implementation with the dimensions of
+ // the block it transforms.
+ template <typename HadamardFuncType>
+ struct FuncWithSize {
+   FuncWithSize(HadamardFuncType fn, int width, int height)
+       : func(fn), block_width(width), block_height(height) {}
+
+   HadamardFuncType func;  // Implementation under test.
+   int block_width;        // Block width.
+   int block_height;       // Block height.
+ };
+
+using HadamardFuncWithSize = FuncWithSize<HadamardFunc>;
+using HadamardLPFuncWithSize = FuncWithSize<HadamardLPFunc>;
+using HadamardLP8x8DualFuncWithSize = FuncWithSize<HadamardLP8x8DualFunc>;
+
+ // Shared fixture for the Hadamard tests. It is parameterized on the
+ // coefficient type written by the transform (OutputType) and on the function
+ // pointer type under test (HadamardFuncType). Subclasses supply Rand() to
+ // define the input range.
+ template <typename OutputType, typename HadamardFuncType>
+ class HadamardTestBase
+ : public ::testing::TestWithParam<FuncWithSize<HadamardFuncType>> {
+ public:
+ // Stores the function under test, its block dimensions, and whether the
+ // reference should apply the extra 16x16 coefficient shift.
+ HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param,
+ bool do_shift) {
+ h_func_ = func_param.func;
+ bw_ = func_param.block_width;
+ bh_ = func_param.block_height;
+ shift_ = do_shift;
+ }
+
+ // Re-seeds the random generator with the deterministic seed before each
+ // test.
+ void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ // The Rand() function generates values in the range [-((1 << BitDepth) - 1),
+ // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform
+ // is the residual pixel, which is defined as 'source pixel - predicted
+ // pixel'. Source pixel and predicted pixel take values in the range
+ // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from
+ // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1).
+ virtual int16_t Rand() = 0;
+
+ // Fills one block with Rand() values (input stride equal to the block
+ // width) and checks that the function under test produces the same set of
+ // coefficients as the reference.
+ void CompareReferenceRandom() {
+ const int kMaxBlockSize = 32 * 32;
+ const int block_size = bw_ * bh_;
+
+ DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+ DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
+ memset(a, 0, sizeof(a));
+ memset(b, 0, sizeof(b));
+
+ OutputType b_ref[kMaxBlockSize];
+ memset(b_ref, 0, sizeof(b_ref));
+
+ for (int i = 0; i < block_size; ++i) a[i] = Rand();
+ ReferenceHadamard(a, bw_, b_ref, bw_, bh_, shift_);
+ API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + block_size);
+ std::sort(b_ref, b_ref + block_size);
+ // Entries past block_size are still zero in both buffers, so comparing
+ // the whole array is equivalent to comparing the sorted prefix.
+ EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
+ }
+
+ // Same comparison with every input sample set to the largest positive,
+ // then the largest negative, 8-bit residual (+255 and -255).
+ void CompareReferenceExtreme() {
+ const int kMaxBlockSize = 32 * 32;
+ const int block_size = bw_ * bh_;
+ const int kBitDepth = 8;
+ DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+ DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
+ memset(b, 0, sizeof(b));
+
+ OutputType b_ref[kMaxBlockSize];
+ memset(b_ref, 0, sizeof(b_ref));
+ for (int i = 0; i < 2; ++i) {
+ const int sign = (i == 0) ? 1 : -1;
+ for (int j = 0; j < block_size; ++j) a[j] = sign * ((1 << kBitDepth) - 1);
+
+ ReferenceHadamard(a, bw_, b_ref, bw_, bh_, shift_);
+ API_REGISTER_STATE_CHECK(h_func_(a, bw_, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + block_size);
+ std::sort(b_ref, b_ref + block_size);
+ EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
+ }
+ }
+
+ // Runs the comparison with input strides 8, 16, ..., 56. The input buffer
+ // is eight times the maximum block size.
+ void VaryStride() {
+ const int kMaxBlockSize = 32 * 32;
+ const int block_size = bw_ * bh_;
+
+ DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+ DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
+ memset(a, 0, sizeof(a));
+ for (int i = 0; i < block_size * 8; ++i) a[i] = Rand();
+
+ OutputType b_ref[kMaxBlockSize];
+ for (int i = 8; i < 64; i += 8) {
+ memset(b, 0, sizeof(b));
+ memset(b_ref, 0, sizeof(b_ref));
+
+ ReferenceHadamard(a, i, b_ref, bw_, bh_, shift_);
+ API_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + block_size);
+ std::sort(b_ref, b_ref + block_size);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+ }
+ }
+
+ // Calls the function under test `times` times on a constant input and
+ // prints the elapsed time in microseconds. Nothing is verified.
+ void SpeedTest(int times) {
+ const int kMaxBlockSize = 32 * 32;
+ DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+ DECLARE_ALIGNED(16, OutputType, output[kMaxBlockSize]);
+ // memset works per byte, so every int16_t sample becomes 0x0101.
+ memset(input, 1, sizeof(input));
+ memset(output, 0, sizeof(output));
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < times; ++i) {
+ h_func_(input, bw_, output);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("Hadamard%dx%d[%12d runs]: %d us\n", bw_, bh_, times, elapsed_time);
+ }
+
+ protected:
+ // Random source used by the Rand() overrides of the subclasses.
+ ACMRandom rnd_;
+
+ private:
+ HadamardFuncType h_func_;  // Function under test.
+ int bw_;                   // Block width.
+ int bh_;                   // Block height.
+ bool shift_;               // Forwarded to ReferenceHadamard().
+ };
+
+ // Low bit-depth fixture: tran_low_t output, reference shift enabled.
+ class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
+  public:
+   HadamardLowbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
+
+   // Residual of two 8-bit samples: values between -255 (0xFF01) and
+   // 255 (0x00FF).
+   int16_t Rand() override {
+     const int16_t source_pixel = rnd_.Rand8();
+     const int16_t predicted_pixel = rnd_.Rand8();
+     return static_cast<int16_t>(source_pixel - predicted_pixel);
+   }
+ };
+
+ // Test cases for the low bit-depth fixture. Each one forwards to the
+ // corresponding HadamardTestBase helper.
+ TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+ TEST_P(HadamardLowbdTest, CompareReferenceExtreme) {
+ CompareReferenceExtreme();
+ }
+
+ TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+ // Disabled by default (DISABLED_ prefix); it only prints a timing.
+ TEST_P(HadamardLowbdTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+ // Each instantiation pairs an implementation with its block width/height.
+ INSTANTIATE_TEST_SUITE_P(
+ C, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_c, 4, 4),
+ HadamardFuncWithSize(&aom_hadamard_8x8_c, 8, 8),
+ HadamardFuncWithSize(&aom_hadamard_16x16_c, 16, 16),
+ HadamardFuncWithSize(&aom_hadamard_32x32_c, 32, 32)));
+
+ #if HAVE_SSE2
+ INSTANTIATE_TEST_SUITE_P(
+ SSE2, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_sse2, 4, 4),
+ HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8, 8),
+ HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16, 16),
+ HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32, 32)));
+ #endif // HAVE_SSE2
+
+ #if HAVE_AVX2
+ INSTANTIATE_TEST_SUITE_P(
+ AVX2, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16, 16),
+ HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32, 32)));
+ #endif // HAVE_AVX2
+
+ // TODO(aomedia:3314): Disable NEON unit test for now, since hadamard 16x16 NEON
+ // need modifications to match C/AVX2 behavior.
+ // NOTE(review): the NEON suite below is instantiated whenever HAVE_NEON is
+ // set, so this TODO looks stale - confirm against aomedia:3314.
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(
+ NEON, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_neon, 4, 4),
+ HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
+ HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16, 16),
+ HadamardFuncWithSize(&aom_hadamard_32x32_neon, 32, 32)));
+ #endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // High bit-depth fixture: tran_low_t output, reference shift enabled.
+ class HadamardHighbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
+  protected:
+   HadamardHighbdTest() : HadamardTestBase(GetParam(), /*do_shift=*/true) {}
+
+   // Residual of two 12-bit samples: values between -4095 (0xF001) and
+   // 4095 (0x0FFF).
+   int16_t Rand() override {
+     const int16_t source_pixel = rnd_.Rand12();
+     const int16_t predicted_pixel = rnd_.Rand12();
+     return static_cast<int16_t>(source_pixel - predicted_pixel);
+   }
+ };
+
+ // Test cases for the high bit-depth fixture. There is no
+ // CompareReferenceExtreme case here.
+ TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+ TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); }
+
+ // Disabled by default (DISABLED_ prefix); prints timings for three
+ // iteration counts.
+ TEST_P(HadamardHighbdTest, DISABLED_Speed) {
+ SpeedTest(10);
+ SpeedTest(10000);
+ SpeedTest(10000000);
+ }
+
+ // Each instantiation pairs an implementation with its block width/height.
+ INSTANTIATE_TEST_SUITE_P(
+ C, HadamardHighbdTest,
+ ::testing::Values(
+ HadamardFuncWithSize(&aom_highbd_hadamard_8x8_c, 8, 8),
+ HadamardFuncWithSize(&aom_highbd_hadamard_16x16_c, 16, 16),
+ HadamardFuncWithSize(&aom_highbd_hadamard_32x32_c, 32, 32)));
+
+ #if HAVE_AVX2
+ INSTANTIATE_TEST_SUITE_P(
+ AVX2, HadamardHighbdTest,
+ ::testing::Values(
+ HadamardFuncWithSize(&aom_highbd_hadamard_8x8_avx2, 8, 8),
+ HadamardFuncWithSize(&aom_highbd_hadamard_16x16_avx2, 16, 16),
+ HadamardFuncWithSize(&aom_highbd_hadamard_32x32_avx2, 32, 32)));
+ #endif // HAVE_AVX2
+
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(
+ NEON, HadamardHighbdTest,
+ ::testing::Values(
+ HadamardFuncWithSize(&aom_highbd_hadamard_8x8_neon, 8, 8),
+ HadamardFuncWithSize(&aom_highbd_hadamard_16x16_neon, 16, 16),
+ HadamardFuncWithSize(&aom_highbd_hadamard_32x32_neon, 32, 32)));
+ #endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Tests for low precision
+ // Low-precision fixture: int16_t output, reference shift disabled.
+ class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
+  public:
+   HadamardLowbdLPTest() : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
+
+   // Residual of two 8-bit samples: values between -255 (0xFF01) and
+   // 255 (0x00FF).
+   int16_t Rand() override {
+     const int16_t source_pixel = rnd_.Rand8();
+     const int16_t predicted_pixel = rnd_.Rand8();
+     return static_cast<int16_t>(source_pixel - predicted_pixel);
+   }
+ };
+
+ // Test cases for the low-precision fixture.
+ TEST_P(HadamardLowbdLPTest, CompareReferenceRandom) {
+ CompareReferenceRandom();
+ }
+
+ TEST_P(HadamardLowbdLPTest, VaryStride) { VaryStride(); }
+
+ // Disabled by default (DISABLED_ prefix); it only prints a timing.
+ TEST_P(HadamardLowbdLPTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+ // Each instantiation pairs an implementation with its block width/height.
+ INSTANTIATE_TEST_SUITE_P(
+ C, HadamardLowbdLPTest,
+ ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_c, 8, 8),
+ HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_c, 16,
+ 16)));
+
+ #if HAVE_SSE2
+ INSTANTIATE_TEST_SUITE_P(
+ SSE2, HadamardLowbdLPTest,
+ ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8, 8),
+ HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_sse2, 16,
+ 16)));
+ #endif // HAVE_SSE2
+
+ #if HAVE_AVX2
+ INSTANTIATE_TEST_SUITE_P(AVX2, HadamardLowbdLPTest,
+ ::testing::Values(HadamardLPFuncWithSize(
+ &aom_hadamard_lp_16x16_avx2, 16, 16)));
+ #endif // HAVE_AVX2
+
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(
+ NEON, HadamardLowbdLPTest,
+ ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_neon, 8, 8),
+ HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_neon, 16,
+ 16)));
+ #endif // HAVE_NEON
+
+// Tests for 8x8 dual low precision
+ // Fixture for the dual 8x8 low-precision transform: int16_t output,
+ // reference shift disabled.
+ class HadamardLowbdLP8x8DualTest
+     : public HadamardTestBase<int16_t, HadamardLP8x8DualFunc> {
+  public:
+   HadamardLowbdLP8x8DualTest()
+       : HadamardTestBase(GetParam(), /*do_shift=*/false) {}
+
+   // Residual of two 8-bit samples: values between -255 (0xFF01) and
+   // 255 (0x00FF).
+   int16_t Rand() override {
+     const int16_t source_pixel = rnd_.Rand8();
+     const int16_t predicted_pixel = rnd_.Rand8();
+     return static_cast<int16_t>(source_pixel - predicted_pixel);
+   }
+ };
+
+ // Test cases for the dual 8x8 low-precision fixture.
+ TEST_P(HadamardLowbdLP8x8DualTest, CompareReferenceRandom) {
+ CompareReferenceRandom();
+ }
+
+ TEST_P(HadamardLowbdLP8x8DualTest, VaryStride) { VaryStride(); }
+
+ // Disabled by default (DISABLED_ prefix); it only prints a timing.
+ TEST_P(HadamardLowbdLP8x8DualTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+ // The (8, 16) size selects ReferenceHadamard8x8Dual in ReferenceHadamard().
+ INSTANTIATE_TEST_SUITE_P(C, HadamardLowbdLP8x8DualTest,
+ ::testing::Values(HadamardLP8x8DualFuncWithSize(
+ &aom_hadamard_lp_8x8_dual_c, 8, 16)));
+
+ #if HAVE_SSE2
+ INSTANTIATE_TEST_SUITE_P(SSE2, HadamardLowbdLP8x8DualTest,
+ ::testing::Values(HadamardLP8x8DualFuncWithSize(
+ &aom_hadamard_lp_8x8_dual_sse2, 8, 16)));
+ #endif // HAVE_SSE2
+
+ #if HAVE_AVX2
+ INSTANTIATE_TEST_SUITE_P(AVX2, HadamardLowbdLP8x8DualTest,
+ ::testing::Values(HadamardLP8x8DualFuncWithSize(
+ &aom_hadamard_lp_8x8_dual_avx2, 8, 16)));
+ #endif // HAVE_AVX2
+
+ #if HAVE_NEON
+ INSTANTIATE_TEST_SUITE_P(NEON, HadamardLowbdLP8x8DualTest,
+ ::testing::Values(HadamardLP8x8DualFuncWithSize(
+ &aom_hadamard_lp_8x8_dual_neon, 8, 16)));
+ #endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/hash_test.cc b/third_party/aom/test/hash_test.cc
new file mode 100644
index 0000000000..a1de9323db
--- /dev/null
+++ b/third_party/aom/test/hash_test.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/hash.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p,
+ size_t length);
+
+typedef std::tuple<get_crc32c_value_func, int> HashParam;
+
+ // Fixture for the CRC32C hash tests. The parameter is a tuple of
+ // (implementation under test, block size).
+ class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
+ public:
+ ~AV1Crc32cHashTest() override;
+ // Allocates buffer_ and fills it with random bytes.
+ void SetUp() override;
+
+ // Releases buffer_.
+ void TearDown() override;
+
+ protected:
+ // Checks test_impl for repeatability and against av1_get_crc32c_value_c.
+ void RunCheckOutput(get_crc32c_value_func test_impl);
+ // Times test_impl against av1_get_crc32c_value_c and prints the result.
+ void RunSpeedTest(get_crc32c_value_func test_impl);
+
+ // Checks that all-zero inputs of different lengths hash differently.
+ void RunZeroTest(get_crc32c_value_func test_impl);
+
+ libaom_test::ACMRandom rnd_;  // Source of the buffer contents.
+ CRC32C calc_;                 // Calculator state passed to every call.
+ uint8_t *buffer_;             // Input bytes; owned, allocated in SetUp().
+ int bsize_;                   // Block size taken from the test parameter.
+ size_t length_;               // Size of buffer_ in bytes.
+ };
+
+ AV1Crc32cHashTest::~AV1Crc32cHashTest() = default;
+
+ // Seeds the random generator, initializes the CRC calculator and fills a
+ // freshly allocated buffer of bsize_ * bsize_ 16-bit samples with random
+ // bytes.
+ void AV1Crc32cHashTest::SetUp() {
+   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+   av1_crc32c_calculator_init(&calc_);
+
+   bsize_ = GET_PARAM(1);
+   length_ = bsize_ * bsize_ * sizeof(uint16_t);
+   buffer_ = new uint8_t[length_];
+   ASSERT_NE(buffer_, nullptr);
+   uint8_t *const buffer_end = buffer_ + length_;
+   for (uint8_t *byte = buffer_; byte != buffer_end; ++byte) {
+     *byte = rnd_.Rand8();
+   }
+ }
+
+ // Frees the buffer allocated by SetUp().
+ void AV1Crc32cHashTest::TearDown() {
+   delete[] buffer_;
+ }
+
+ // Verifies that test_impl is repeatable, that it agrees with the C
+ // reference, and that changing one input byte changes the CRC.
+ void AV1Crc32cHashTest::RunCheckOutput(get_crc32c_value_func test_impl) {
+   const get_crc32c_value_func ref_impl = av1_get_crc32c_value_c;
+
+   // Hashing the same buffer twice must give the same CRC.
+   const uint32_t first_crc = test_impl(&calc_, buffer_, length_);
+   const uint32_t second_crc = test_impl(&calc_, buffer_, length_);
+   const uint32_t reference_crc = ref_impl(&calc_, buffer_, length_);
+   ASSERT_EQ(first_crc, second_crc);
+   ASSERT_EQ(first_crc, reference_crc);  // should equal the software version
+
+   // Modify the buffer and hash again.
+   buffer_[0] += 1;
+   const uint32_t modified_crc = test_impl(&calc_, buffer_, length_);
+   const uint32_t modified_reference_crc = ref_impl(&calc_, buffer_, length_);
+   ASSERT_NE(first_crc, modified_crc);  // crc should differ from the first one
+   ASSERT_EQ(modified_crc, modified_reference_crc);
+ }
+
+ // Times the C reference and test_impl over the same buffer and prints both
+ // elapsed times (in microseconds) together with their ratio.
+ void AV1Crc32cHashTest::RunSpeedTest(get_crc32c_value_func test_impl) {
+   // Index 0 is the reference, index 1 the implementation under test.
+   get_crc32c_value_func candidates[] = { av1_get_crc32c_value_c, test_impl };
+   const int iterations = 10000000 / (bsize_ + bsize_);
+
+   aom_usec_timer timer;
+   double elapsed_us[2];
+   for (int which = 0; which < 2; ++which) {
+     const get_crc32c_value_func hash_fn = candidates[which];
+     aom_usec_timer_start(&timer);
+     for (int run = 0; run < iterations; ++run) {
+       hash_fn(&calc_, buffer_, length_);
+     }
+     aom_usec_timer_mark(&timer);
+     elapsed_us[which] = static_cast<double>(aom_usec_timer_elapsed(&timer));
+   }
+   printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, elapsed_us[0],
+          elapsed_us[1]);
+   printf("(%3.2f)\n", elapsed_us[0] / elapsed_us[1]);
+ }
+
+ // Hashes 32, 128 and 1024 bytes of an all-zero buffer and requires the
+ // three CRCs to be pairwise different.
+ void AV1Crc32cHashTest::RunZeroTest(get_crc32c_value_func test_impl) {
+   uint8_t zeros[1024] = { 0 };
+   // For buffers of different sizes the crc should not be the same.
+   const uint32_t crc_32_bytes = test_impl(&calc_, zeros, 32);
+   const uint32_t crc_128_bytes = test_impl(&calc_, zeros, 128);
+   const uint32_t crc_1024_bytes = test_impl(&calc_, zeros, 1024);
+   ASSERT_NE(crc_32_bytes, crc_128_bytes);
+   ASSERT_NE(crc_32_bytes, crc_1024_bytes);
+   ASSERT_NE(crc_128_bytes, crc_1024_bytes);
+ }
+
+ // Test cases. GET_PARAM(0) is the implementation under test.
+ TEST_P(AV1Crc32cHashTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+ TEST_P(AV1Crc32cHashTest, CheckZero) { RunZeroTest(GET_PARAM(0)); }
+
+ // Disabled by default (DISABLED_ prefix); it only prints timings.
+ TEST_P(AV1Crc32cHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+ // Block sizes combined with every implementation below. SetUp() sizes the
+ // buffer as bsize * bsize 16-bit samples.
+ const int kValidBlockSize[] = { 64, 32, 8, 4 };
+
+ INSTANTIATE_TEST_SUITE_P(
+ C, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c),
+ ::testing::ValuesIn(kValidBlockSize)));
+
+ #if HAVE_SSE4_2
+ INSTANTIATE_TEST_SUITE_P(
+ SSE4_2, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
+ ::testing::ValuesIn(kValidBlockSize)));
+ #endif
+
+ #if HAVE_ARM_CRC32
+ INSTANTIATE_TEST_SUITE_P(
+ ARM_CRC32, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_arm_crc32),
+ ::testing::ValuesIn(kValidBlockSize)));
+ #endif
+
+} // namespace
diff --git a/third_party/aom/test/hbd_metrics_test.cc b/third_party/aom/test/hbd_metrics_test.cc
new file mode 100644
index 0000000000..303d580c4a
--- /dev/null
+++ b/third_party/aom/test/hbd_metrics_test.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <new>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/msvc.h"
+#include "aom_scale/yv12config.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+// Pointer to a metric that compares two frames and returns a single score.
+using LBDMetricFunc = double (*)(const YV12_BUFFER_CONFIG *source,
+                                 const YV12_BUFFER_CONFIG *dest);
+// Same as LBDMetricFunc, with two additional bit-depth arguments.
+using HBDMetricFunc = double (*)(const YV12_BUFFER_CONFIG *source,
+                                 const YV12_BUFFER_CONFIG *dest,
+                                 uint32_t in_bd, uint32_t bd);
+
+// HBDMetricFunc adapter around aom_calc_highbd_psnr(); returns element 0 of
+// the psnr array in the resulting PSNR_STATS.
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                        uint32_t bd) {
+  PSNR_STATS stats;
+  // The callee receives bd before in_bd, the reverse of this wrapper's order.
+  aom_calc_highbd_psnr(source, dest, &stats, bd, in_bd);
+  const double result = stats.psnr[0];
+  return result;
+}
+
+// LBDMetricFunc adapter around aom_calc_psnr(); returns element 0 of the psnr
+// array in the resulting PSNR_STATS.
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+                    const YV12_BUFFER_CONFIG *dest) {
+  PSNR_STATS stats;
+  aom_calc_psnr(source, dest, &stats);
+  const double result = stats.psnr[0];
+  return result;
+}
+
+// HBDMetricFunc adapter around aom_psnrhvs(). The three per-plane outputs are
+// required by the callee but discarded here; only its return value is used.
+double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double unused_y, unused_u, unused_v;
+  const double score =
+      aom_psnrhvs(source, dest, &unused_y, &unused_u, &unused_v, bd, in_bd);
+  return score;
+}
+
+// LBDMetricFunc adapter around aom_psnrhvs() with both bit-depth arguments
+// fixed at 8. The per-plane outputs are discarded.
+double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double unused_y, unused_u, unused_v;
+  const double score =
+      aom_psnrhvs(source, dest, &unused_y, &unused_u, &unused_v, 8, 8);
+  return score;
+}
+
+// HBDMetricFunc adapter around aom_calc_fastssim(). The three per-plane
+// outputs are discarded; only the callee's return value is used.
+double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                            uint32_t bd) {
+  double unused_y, unused_u, unused_v;
+  const double score = aom_calc_fastssim(source, dest, &unused_y, &unused_u,
+                                         &unused_v, bd, in_bd);
+  return score;
+}
+
+// LBDMetricFunc adapter around aom_calc_fastssim() with both bit-depth
+// arguments fixed at 8. The per-plane outputs are discarded.
+double compute_fastssim(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest) {
+  double unused_y, unused_u, unused_v;
+  const double score =
+      aom_calc_fastssim(source, dest, &unused_y, &unused_u, &unused_v, 8, 8);
+  return score;
+}
+
+// HBDMetricFunc adapter around aom_highbd_calc_ssim(). Takes element 0 of the
+// two-element ssim and weight outputs and returns 100 * (ssim / weight)^8.
+double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double ssim[2], weight[2];
+  aom_highbd_calc_ssim(source, dest, weight, bd, in_bd, ssim);
+  const double ratio = ssim[0] / weight[0];
+  return 100 * pow(ratio, 8.0);
+}
+
+// LBDMetricFunc adapter around aom_lowbd_calc_ssim(); returns
+// 100 * (ssim / weight)^8 from the two values the callee writes.
+double compute_aomssim(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double ssim, weight;
+  aom_lowbd_calc_ssim(source, dest, &weight, &ssim);
+  const double ratio = ssim / weight;
+  return 100 * pow(ratio, 8.0);
+}
+
+// Shared test logic that checks a low-bit-depth quality metric against its
+// high-bit-depth counterpart on equivalent frame data. The members at the
+// bottom must be filled in by the derived fixture before RunAccuracyCheck()
+// is called.
+class HBDMetricsTestBase {
+ public:
+  virtual ~HBDMetricsTestBase() = default;
+
+ protected:
+  // Builds 8-bit source/destination frames plus 16-bit frames holding the
+  // same samples shifted left by (bit_depth_ - 8), then expects lbd_metric_
+  // and hbd_metric_ to differ by at most threshold_. The comparison is
+  // repeated for three destination fills with progressively smaller
+  // deviation from the constant source value.
+  void RunAccuracyCheck() {
+    const int width = 1920;
+    const int height = 1080;
+    const uint8_t kPixFiller = 128;
+    YV12_BUFFER_CONFIG lbd_src, lbd_dst;
+    YV12_BUFFER_CONFIG hbd_src, hbd_dst;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double lbd_db, hbd_db;
+
+    memset(&lbd_src, 0, sizeof(lbd_src));
+    memset(&lbd_dst, 0, sizeof(lbd_dst));
+    memset(&hbd_src, 0, sizeof(hbd_src));
+    memset(&hbd_dst, 0, sizeof(hbd_dst));
+
+    int alloc_err =
+        aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, 0, 0);
+    alloc_err |=
+        aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, 0, 0);
+    alloc_err |=
+        aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, 0, 0);
+    alloc_err |=
+        aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, 0, 0);
+    if (alloc_err != 0) {
+      // The fill loops below would dereference a missing buffer, so stop
+      // with a fatal failure. The configs were zeroed above, which is
+      // expected to make freeing a never-allocated one harmless.
+      aom_free_frame_buffer(&lbd_src);
+      aom_free_frame_buffer(&lbd_dst);
+      aom_free_frame_buffer(&hbd_src);
+      aom_free_frame_buffer(&hbd_dst);
+      FAIL() << "Failed to allocate frame buffers";
+    }
+
+    // The high-bit-depth allocations are addressed as 16-bit samples.
+    uint16_t *const hbd_src16 = (uint16_t *)(hbd_src.buffer_alloc);
+    uint16_t *const hbd_dst16 = (uint16_t *)(hbd_dst.buffer_alloc);
+    const size_t num_pixels = lbd_src.buffer_alloc_sz;
+
+    memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
+    for (size_t i = 0; i < num_pixels; ++i) {
+      const uint16_t spel = lbd_src.buffer_alloc[i];
+      // Create some distortion for dst buffer.
+      const uint16_t dpel = rnd.Rand8();
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      hbd_src16[i] = spel << (bit_depth_ - 8);
+      hbd_dst16[i] = dpel << (bit_depth_ - 8);
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    for (size_t i = 0; i < num_pixels; ++i) {
+      // Create some small distortion for dst buffer: 120 plus the top four
+      // bits of a random byte.
+      const uint16_t dpel = 120 + (rnd.Rand8() >> 4);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      hbd_dst16[i] = dpel << (bit_depth_ - 8);
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    for (size_t i = 0; i < num_pixels; ++i) {
+      // Create even smaller distortion for dst buffer: 126 plus the top two
+      // bits of a random byte.
+      const uint16_t dpel = 126 + (rnd.Rand8() >> 6);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      hbd_dst16[i] = dpel << (bit_depth_ - 8);
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    aom_free_frame_buffer(&lbd_src);
+    aom_free_frame_buffer(&lbd_dst);
+    aom_free_frame_buffer(&hbd_src);
+    aom_free_frame_buffer(&hbd_dst);
+  }
+
+  int input_bit_depth_;       // Third argument passed to hbd_metric_.
+  int bit_depth_;             // Sample depth of the 16-bit test frames.
+  double threshold_;          // Largest accepted |lbd - hbd| difference.
+  LBDMetricFunc lbd_metric_;  // Metric evaluated on the 8-bit frames.
+  HBDMetricFunc hbd_metric_;  // Metric evaluated on the 16-bit frames.
+};
+
+// Test parameter: (8-bit metric, high-bit-depth metric, input bit depth,
+// bit depth, allowed difference between the two metric values).
+using MetricTestTParam =
+    std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>;
+
+// Parameterized fixture that copies the tuple fields into the state used by
+// HBDMetricsTestBase::RunAccuracyCheck().
+class HBDMetricsTest : public HBDMetricsTestBase,
+                       public ::testing::TestWithParam<MetricTestTParam> {
+ public:
+  void SetUp() override {
+    // Unpack all five fields in tuple order.
+    std::tie(lbd_metric_, hbd_metric_, input_bit_depth_, bit_depth_,
+             threshold_) = GetParam();
+  }
+};
+
+// Single test body; all variation comes from the instantiations below.
+TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
+
+// Allow small variation due to floating point operations.
+static const double kSsim_thresh = 0.001;
+// Allow some additional errors accumulated in floating point operations.
+static const double kFSsim_thresh = 0.03;
+// Allow some extra variation due to rounding error accumulated in dct.
+static const double kPhvs_thresh = 0.3;
+
+// Each MetricTestTParam below is (8-bit metric, high-bit-depth metric,
+// input bit depth, bit depth, threshold).
+
+// Note: the 10/10 and 12/12 cases use kPhvs_thresh rather than kSsim_thresh.
+INSTANTIATE_TEST_SUITE_P(
+ AOMSSIM, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 8, 10, kSsim_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 10, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 8, 12, kSsim_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 12, 12, kPhvs_thresh)));
+INSTANTIATE_TEST_SUITE_P(
+ FASTSSIM, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 8, 10, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 10, 10, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 8, 12, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 12, 12, kFSsim_thresh)));
+INSTANTIATE_TEST_SUITE_P(
+ PSNRHVS, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 8, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 10, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 8, 12, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 12, 12, kPhvs_thresh)));
+// PSNR reuses kPhvs_thresh for all four bit-depth combinations.
+INSTANTIATE_TEST_SUITE_P(
+ PSNR, HBDMetricsTest,
+ ::testing::Values(
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10,
+ kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12, kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12,
+ kPhvs_thresh)));
+} // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test.cc b/third_party/aom/test/hiprec_convolve_test.cc
new file mode 100644
index 0000000000..78883ccddf
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/hiprec_convolve_test_util.h"
+
+using libaom_test::ACMRandom;
+#if CONFIG_AV1_HIGHBITDEPTH
+using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdHiprecConvolveTest);
+#endif
+using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HiprecConvolveTest);
+using std::make_tuple;
+using std::tuple;
+
+namespace {
+
+// Low-bit-depth tests: parameter 3 of the tuple is the function under test.
+TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
+TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
+ RunSpeedTest(GET_PARAM(3));
+}
+// One instantiation per SIMD flavour enabled in the build.
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_sse2));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_avx2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_neon));
+#endif
+
+// High-bit-depth tests: the tuple carries an extra bit-depth field, so the
+// function under test is parameter 4. The test bodies are only defined when
+// at least one of the implementations instantiated below is available.
+#if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
+TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(4));
+}
+TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) {
+ RunSpeedTest(GET_PARAM(4));
+}
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_ssse3));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_neon));
+#endif
+#endif // HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test_util.cc b/third_party/aom/test/hiprec_convolve_test_util.cc
new file mode 100644
index 0000000000..6d7902fd04
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test_util.cc
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/hiprec_convolve_test_util.h"
+
+#include <memory>
+#include <new>
+
+#include "av1/common/restoration.h"
+
+using std::make_tuple;
+using std::tuple;
+
+namespace libaom_test {
+
+// Generate a pair of filter kernels, using the ranges of possible values
+// from the loop-restoration experiment (WIENER_FILT_TAP{0,1,2}_{MINV,MAXV}).
+//
+// kernel_type selects the pattern:
+//   0: 7-tap kernel, minimum coefficient values
+//   1: 7-tap kernel, maximum coefficient values
+//   2: 7-tap kernel, random coefficient values (the default)
+//   3: 5-tap kernel (outer taps zero), minimum coefficient values
+//   4: 5-tap kernel (outer taps zero), maximum coefficient values
+//   any other value: 5-tap kernel, random coefficient values
+//
+// Every kernel is symmetric (tap[k] == tap[6 - k]), its centre tap is
+// -2 * (tap0 + tap1 + tap2) and its eighth entry is 0. For the fixed patterns
+// hkernel and vkernel are identical; for the random patterns the horizontal
+// taps are drawn first, then the vertical ones.
+static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
+                             InterpKernel vkernel, int kernel_type = 2) {
+  // Draws a tap from the inclusive range [minv, maxv]. PseudoUniform(n) is
+  // used here as yielding a value below n, so the span is maxv + 1 - minv.
+  // The vertical taps previously used a span of maxv + 2 - minv, which let
+  // them reach maxv + 1, one past the range the horizontal taps respect.
+  const auto rand_tap = [rnd](int minv, int maxv) {
+    return minv + rnd->PseudoUniform(maxv + 1 - minv);
+  };
+  // Writes one symmetric kernel from its three outer taps.
+  const auto set_kernel = [](InterpKernel kernel, int tap0, int tap1,
+                             int tap2) {
+    kernel[0] = kernel[6] = tap0;
+    kernel[1] = kernel[5] = tap1;
+    kernel[2] = kernel[4] = tap2;
+    kernel[3] = -2 * (tap0 + tap1 + tap2);
+    kernel[7] = 0;
+  };
+
+  if (kernel_type == 0) {
+    // Low possible values for filter coefficients, 7-tap kernel
+    set_kernel(hkernel, WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+               WIENER_FILT_TAP2_MINV);
+    set_kernel(vkernel, WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+               WIENER_FILT_TAP2_MINV);
+  } else if (kernel_type == 1) {
+    // Max possible values for filter coefficients, 7-tap kernel
+    set_kernel(hkernel, WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+               WIENER_FILT_TAP2_MAXV);
+    set_kernel(vkernel, WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+               WIENER_FILT_TAP2_MAXV);
+  } else if (kernel_type == 2) {
+    // Randomly generated values for filter coefficients, 7-tap kernel.
+    // Each draw is a separate statement so the draw order stays fixed.
+    const int h0 = rand_tap(WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+    const int h1 = rand_tap(WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    const int h2 = rand_tap(WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    set_kernel(hkernel, h0, h1, h2);
+
+    const int v0 = rand_tap(WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+    const int v1 = rand_tap(WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    const int v2 = rand_tap(WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    set_kernel(vkernel, v0, v1, v2);
+  } else if (kernel_type == 3) {
+    // Low possible values for filter coefficients, 5-tap kernel
+    set_kernel(hkernel, 0, WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP2_MINV);
+    set_kernel(vkernel, 0, WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP2_MINV);
+  } else if (kernel_type == 4) {
+    // Max possible values for filter coefficients, 5-tap kernel
+    set_kernel(hkernel, 0, WIENER_FILT_TAP1_MAXV, WIENER_FILT_TAP2_MAXV);
+    set_kernel(vkernel, 0, WIENER_FILT_TAP1_MAXV, WIENER_FILT_TAP2_MAXV);
+  } else {
+    // Randomly generated values for filter coefficients, 5-tap kernel
+    const int h1 = rand_tap(WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    const int h2 = rand_tap(WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    set_kernel(hkernel, 0, h1, h2);
+
+    const int v1 = rand_tap(WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    const int v2 = rand_tap(WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    set_kernel(vkernel, 0, v1, v2);
+  }
+}
+
+namespace AV1HiprecConvolve {
+
+// Returns the parameter set for the low-bit-depth tests: a fixed list of
+// (out_w, out_h, num_iters) triples, each paired with the given filter.
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter) {
+  // { output width, output height, iteration count }
+  static const int kCases[][3] = {
+    { 8, 8, 50000 },   { 8, 4, 50000 },   { 64, 24, 1000 },  { 64, 64, 1000 },
+    { 64, 56, 1000 },  { 32, 8, 10000 },  { 32, 28, 10000 }, { 32, 32, 10000 },
+    { 16, 34, 10000 }, { 32, 34, 10000 }, { 64, 34, 1000 },  { 8, 17, 10000 },
+    { 16, 17, 10000 }, { 32, 17, 10000 }
+  };
+  const int kNumCases = sizeof(kCases) / sizeof(kCases[0]);
+
+  HiprecConvolveParam params[kNumCases];
+  for (int n = 0; n < kNumCases; ++n) {
+    params[n] = make_tuple(kCases[n][0], kCases[n][1], kCases[n][2], filter);
+  }
+  return ::testing::ValuesIn(params);
+}
+
+// Out-of-line definition of the destructor declared in the header.
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() = default;
+
+// Resets the fixture's random generator to ACMRandom's deterministic seed.
+void AV1HiprecConvolveTest::SetUp() {
+  const auto seed = ACMRandom::DeterministicSeed();
+  rnd_.Reset(seed);
+}
+
+// Compares test_impl with av1_wiener_convolve_add_src_c for kernel types 0-5.
+// Each iteration refills a 128x128 source image with random bytes, picks a
+// random block position and requires both outputs to match pixel for pixel.
+void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
+
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *const input = input_.get();
+
+  // The AVX2 convolve functions always write whole groups of 16 pixels per
+  // row, so the output buffers are sized with the width rounded up to a
+  // multiple of 16 to avoid a buffer overflow.
+  const int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output2, nullptr);
+
+  // Filter kernels, regenerated for every kernel type below.
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (int i = 0; i < num_iters; ++i) {
+      // Fresh random source image, filled in raster order.
+      for (int n = 0; n < h * w; ++n) input[n] = rnd_.Rand8();
+
+      // Random block position: the row offset is drawn before the column.
+      const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      const uint8_t *const src = input + offset_r * w + offset_c;
+
+      av1_wiener_convolve_add_src_c(src, w, output.get(), out_w, hkernel, 16,
+                                    vkernel, 16, out_w, out_h, &conv_params);
+      test_impl(src, w, output2.get(), out_w, hkernel, 16, vkernel, 16, out_w,
+                out_h, &conv_params);
+
+      for (int j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
+  }
+}
+
+// Times av1_wiener_convolve_add_src_c and test_impl over the same range of
+// block positions in one random 128x128 image, prints both durations and
+// expects test_impl to take less time than the C version.
+void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  const WienerConvolveParams conv_params = get_conv_params_wiener(8);
+
+  std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * w]);
+  ASSERT_NE(input_, nullptr);
+  uint8_t *const input = input_.get();
+
+  // The AVX2 convolve functions always write whole groups of 16 pixels per
+  // row, so the output buffers are sized with the width rounded up to a
+  // multiple of 16 to avoid a buffer overflow.
+  const int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+  ASSERT_NE(output2, nullptr);
+
+  // One pair of kernels of the default type is used for the whole benchmark.
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  // Random source image, filled in raster order.
+  for (int n = 0; n < h * w; ++n) input[n] = rnd_.Rand8();
+
+  // Exclusive upper bounds of the block origins visited by both timed loops.
+  const int row_end = h - out_h - 4;
+  const int col_end = w - out_w - 4;
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < num_iters; ++i) {
+    for (int r = 3; r < row_end; r++) {
+      for (int c = 3; c < col_end; c++) {
+        av1_wiener_convolve_add_src_c(input + r * w + c, w, output.get(), out_w,
+                                      hkernel, 16, vkernel, 16, out_w, out_h,
+                                      &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (int i = 0; i < num_iters; ++i) {
+    for (int r = 3; r < row_end; r++) {
+      for (int c = 3; c < col_end; c++) {
+        test_impl(input + r * w + c, w, output2.get(), out_w, hkernel, 16,
+                  vkernel, 16, out_w, out_h, &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  // Timer values are reported in ms here and in us in the failure message.
+  std::cout << "[ ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+}
+} // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+
+// Returns the parameter set for the high-bit-depth tests: three
+// (out_w, out_h, num_iters) cases at each of the bit depths 8, 10 and 12,
+// each paired with the given filter. Bit depth is the outer ordering.
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter) {
+  // { output width, output height, iteration count }
+  static const int kCases[3][3] = { { 8, 8, 50000 },
+                                    { 64, 64, 1000 },
+                                    { 32, 8, 10000 } };
+  static const int kBitDepths[3] = { 8, 10, 12 };
+
+  HighbdHiprecConvolveParam params[9];
+  int count = 0;
+  for (int d = 0; d < 3; ++d) {
+    for (int c = 0; c < 3; ++c) {
+      params[count++] = make_tuple(kCases[c][0], kCases[c][1], kCases[c][2],
+                                   kBitDepths[d], filter);
+    }
+  }
+  return ::testing::ValuesIn(params);
+}
+
+// Out-of-line definition of the destructor declared in the header.
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() = default;
+
+// Resets the fixture's random generator to ACMRandom's deterministic seed.
+void AV1HighbdHiprecConvolveTest::SetUp() {
+  const auto seed = ACMRandom::DeterministicSeed();
+  rnd_.Reset(seed);
+}
+
+// Compares test_impl with av1_highbd_wiener_convolve_add_src_c for kernel
+// types 0-5 at the bit depth given by test parameter 3. The 128x128 source
+// image is filled once with random samples masked to that depth; every
+// iteration picks a random block position and requires identical outputs.
+void AV1HighbdHiprecConvolveTest::RunCheckOutput(
+    highbd_hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  const int bd = GET_PARAM(3);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
+
+  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
+  ASSERT_NE(input, nullptr);
+
+  // The AVX2 convolve functions always write whole groups of 16 pixels per
+  // row, so the output buffers are sized with the width rounded up to a
+  // multiple of 16 to avoid a buffer overflow.
+  const int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output2, nullptr);
+
+  // Filter kernels, regenerated for every kernel type below.
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  // Keep only the low bd bits of each random 16-bit value.
+  const int sample_mask = (1 << bd) - 1;
+  for (int n = 0; n < h * w; ++n) input[n] = rnd_.Rand16() & sample_mask;
+
+  uint8_t *const input_ptr = CONVERT_TO_BYTEPTR(input.get());
+  uint8_t *const output_ptr = CONVERT_TO_BYTEPTR(output.get());
+  uint8_t *const output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
+
+  for (int kernel_type = 0; kernel_type < 6; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (int i = 0; i < num_iters; ++i) {
+      // Random block position: the row offset is drawn before the column.
+      const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      const uint8_t *const src = input_ptr + offset_r * w + offset_c;
+
+      av1_highbd_wiener_convolve_add_src_c(src, w, output_ptr, out_w, hkernel,
+                                           16, vkernel, 16, out_w, out_h,
+                                           &conv_params, bd);
+      test_impl(src, w, output2_ptr, out_w, hkernel, 16, vkernel, 16, out_w,
+                out_h, &conv_params, bd);
+
+      for (int j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
+  }
+}
+
+// Times av1_highbd_wiener_convolve_add_src_c and test_impl over the same
+// range of block positions in one random 128x128 image, prints both
+// durations and expects test_impl to take less time than the C version.
+void AV1HighbdHiprecConvolveTest::RunSpeedTest(
+    highbd_hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  const int bd = GET_PARAM(3);
+  const WienerConvolveParams conv_params = get_conv_params_wiener(bd);
+
+  std::unique_ptr<uint16_t[]> input(new (std::nothrow) uint16_t[h * w]);
+  ASSERT_NE(input, nullptr);
+
+  // The AVX2 convolve functions always write whole groups of 16 pixels per
+  // row, so the output buffers are sized with the width rounded up to a
+  // multiple of 16 to avoid a buffer overflow.
+  const int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output, nullptr);
+  std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+  ASSERT_NE(output2, nullptr);
+
+  // One pair of kernels of the default type is used for the whole benchmark.
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  // Keep only the low bd bits of each random 16-bit value.
+  const int sample_mask = (1 << bd) - 1;
+  for (int n = 0; n < h * w; ++n) input[n] = rnd_.Rand16() & sample_mask;
+
+  uint8_t *const input_ptr = CONVERT_TO_BYTEPTR(input.get());
+  uint8_t *const output_ptr = CONVERT_TO_BYTEPTR(output.get());
+  uint8_t *const output2_ptr = CONVERT_TO_BYTEPTR(output2.get());
+
+  // Exclusive upper bounds of the block origins visited by both timed loops.
+  const int row_end = h - out_h - 4;
+  const int col_end = w - out_w - 4;
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < num_iters; ++i) {
+    for (int r = 3; r < row_end; r++) {
+      for (int c = 3; c < col_end; c++) {
+        av1_highbd_wiener_convolve_add_src_c(
+            input_ptr + r * w + c, w, output_ptr, out_w, hkernel, 16, vkernel,
+            16, out_w, out_h, &conv_params, bd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (int i = 0; i < num_iters; ++i) {
+    for (int r = 3; r < row_end; r++) {
+      for (int c = 3; c < col_end; c++) {
+        test_impl(input_ptr + r * w + c, w, output2_ptr, out_w, hkernel, 16,
+                  vkernel, 16, out_w, out_h, &conv_params, bd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  // Timer values are reported in ms here and in us in the failure message.
+  std::cout << "[ ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+}
+} // namespace AV1HighbdHiprecConvolve
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace libaom_test
diff --git a/third_party/aom/test/hiprec_convolve_test_util.h b/third_party/aom/test/hiprec_convolve_test_util.h
new file mode 100644
index 0000000000..beae5c729b
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test_util.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+namespace libaom_test {
+
+namespace AV1HiprecConvolve {
+
+// Signature shared by the low-bit-depth Wiener convolve functions under test.
+using hiprec_convolve_func = void (*)(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const WienerConvolveParams *conv_params);
+
+// Test parameter: (output width, output height, iteration count, function
+// under test).
+using HiprecConvolveParam = std::tuple<int, int, int, hiprec_convolve_func>;
+
+// Builds the list of test parameters for the given function.
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter);
+
+// Fixture for the low-bit-depth Wiener convolve tests. The member functions
+// are defined in hiprec_convolve_test_util.cc.
+class AV1HiprecConvolveTest
+    : public ::testing::TestWithParam<HiprecConvolveParam> {
+ public:
+  ~AV1HiprecConvolveTest() override;
+  void SetUp() override;
+
+ protected:
+  // Checks test_impl's output against the C reference.
+  void RunCheckOutput(hiprec_convolve_func test_impl);
+  // Times test_impl against the C reference.
+  void RunSpeedTest(hiprec_convolve_func test_impl);
+
+  // Random source for kernels, image data and block positions.
+  libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+// Signature shared by the high-bit-depth Wiener convolve functions under
+// test; identical to the low-bit-depth one plus a trailing bit-depth argument.
+using highbd_hiprec_convolve_func = void (*)(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const WienerConvolveParams *conv_params, int bps);
+
+// Test parameter: (output width, output height, iteration count, bit depth,
+// function under test).
+using HighbdHiprecConvolveParam =
+    std::tuple<int, int, int, int, highbd_hiprec_convolve_func>;
+
+// Builds the list of test parameters for the given function.
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter);
+
+// Fixture for the high-bit-depth Wiener convolve tests. The member functions
+// are defined in hiprec_convolve_test_util.cc.
+class AV1HighbdHiprecConvolveTest
+    : public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
+ public:
+  ~AV1HighbdHiprecConvolveTest() override;
+  void SetUp() override;
+
+ protected:
+  // Checks test_impl's output against the C reference.
+  void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
+  // Times test_impl against the C reference.
+  void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
+
+  // Random source for kernels, image data and block positions.
+  libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HighbdHiprecConvolve
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace libaom_test
+
+#endif // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/third_party/aom/test/horver_correlation_test.cc b/third_party/aom/test/horver_correlation_test.cc
new file mode 100644
index 0000000000..5e397ffdf7
--- /dev/null
+++ b/third_party/aom/test/horver_correlation_test.cc
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*HorverFunc)(const int16_t *diff, int stride, int w, int h,
+ float *hcorr, float *vcorr);
+
+typedef std::tuple<const HorverFunc> HorverTestParam;
+
+class HorverTest : public ::testing::TestWithParam<HorverTestParam> {  // compares a HorverFunc against av1_get_horver_correlation_full_c
+ public:
+ void SetUp() override {
+ data_buf_ = (int16_t *)aom_malloc(MAX_SB_SQUARE * sizeof(int16_t));  // room for MAX_SB_SQUARE int16_t samples
+ ASSERT_NE(data_buf_, nullptr);
+ target_func_ = GET_PARAM(0);  // implementation under test, taken from the test parameter
+ }
+ void TearDown() override { aom_free(data_buf_); }  // releases the buffer allocated in SetUp
+ void RunHorverTest();  // random inputs
+ void RunHorverTest_ExtremeValues();  // constant (1 << 12) - 1 input
+ void RunHorverSpeedTest(int run_times);  // timing printout, no assertions
+
+ private:
+ HorverFunc target_func_;  // function being tested
+ ACMRandom rng_;  // source of random input samples
+ int16_t *data_buf_;  // input buffer shared by all Run* helpers
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HorverTest);
+
+void HorverTest::RunHorverTest() {
+ // 1000 random trials per block size; samples are (Rand16 % 4096) - 2048.
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int w = block_size_wide[bsize], h = block_size_high[bsize];
+ for (int trial = 0; trial < 1000 && !HasFatalFailure(); ++trial) {
+ int16_t *const buf_end = data_buf_ + MAX_SB_SQUARE;
+ for (int16_t *p = data_buf_; p != buf_end; ++p) {
+ *p = (rng_.Rand16() % (1 << 12)) - (1 << 11);
+ }
+
+ float hcorr_ref = 0.0, vcorr_ref = 0.0;
+ av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+ &hcorr_ref, &vcorr_ref);
+
+ float hcorr_test = 0.0, vcorr_test = 0.0;
+ target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+
+ ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6)
+ << "hcorr incorrect (" << w << "x" << h << ")";
+ ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6)
+ << "vcorr incorrect (" << w << "x" << h << ")";
+ }
+ // printf("(%3dx%-3d) passed\n", w, h);
+ }
+}
+
+void HorverTest::RunHorverSpeedTest(int run_times) {  // run_times: calls timed per implementation
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ data_buf_[i] = rng_.Rand16() % (1 << 12);
+ }
+
+ for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+ const int w = block_size_wide[block_size];
+ const int h = block_size_high[block_size];
+ float hcorr_ref = 0.0, vcorr_ref = 0.0, hcorr_test = 0.0, vcorr_test = 0.0;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+ &hcorr_ref, &vcorr_ref);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ // Totals come from aom_usec_timer, i.e. microseconds, so label them "us".
+ printf("%3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", w, h, time1, time2,
+ time1 / time2);
+ }
+}
+
+void HorverTest::RunHorverTest_ExtremeValues() {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ // Most of the horver correlation computation is squaring and summing, so
+ // filling the whole buffer with one large value is most likely to overflow.
+ data_buf_[i] = (1 << 12) - 1;
+ }
+
+ for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {  // every block size, one run each
+ const int w = block_size_wide[block_size];
+ const int h = block_size_high[block_size];
+ float hcorr_ref = 0.0, vcorr_ref = 0.0;
+ float hcorr_test = 0.0, vcorr_test = 0.0;
+
+ av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h, &hcorr_ref,  // C reference
+ &vcorr_ref);
+ target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);  // implementation under test
+
+ ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6) << "hcorr incorrect";
+ ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6) << "vcorr incorrect";
+ }
+}
+
+TEST_P(HorverTest, RandomValues) { RunHorverTest(); }  // random inputs, all block sizes
+
+TEST_P(HorverTest, ExtremeValues) { RunHorverTest_ExtremeValues(); }  // constant large input, all block sizes
+
+TEST_P(HorverTest, DISABLED_Speed) { RunHorverSpeedTest(100000); }  // DISABLED_ prefix: gtest skips it unless --gtest_also_run_disabled_tests
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, HorverTest,
+ ::testing::Values(av1_get_horver_correlation_full_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HorverTest, ::testing::Values(av1_get_horver_correlation_full_neon));
+#endif // HAVE_NEON
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/horz_superres_test.cc b/third_party/aom/test/horz_superres_test.cc
new file mode 100644
index 0000000000..595ed548c7
--- /dev/null
+++ b/third_party/aom/test/horz_superres_test.cc
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/encoder.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using std::make_tuple;
+using std::tuple;
+
+/* TESTING PARAMETERS */
+
+const int kBitrate = 40;
+
+typedef struct {  // one test clip plus its encoder settings and PSNR floors
+ const char *filename;  // clip name handed to Y4mVideoSource
+ aom_img_fmt fmt;  // only printed by operator<<; the fixtures in this file do not read it
+ aom_bit_depth_t bit_depth;  // copied to cfg_.g_bit_depth and cfg_.g_input_bit_depth
+ unsigned int profile;  // copied to cfg_.g_profile
+ unsigned int limit;  // frame limit given to the source; also the expected PSNR packet count
+ unsigned int screen_content;  // nonzero selects AOM_CONTENT_SCREEN, zero AOM_CONTENT_DEFAULT
+ double psnr_threshold; // used by modes other than AOM_SUPERRES_AUTO
+ double psnr_threshold2; // used by AOM_SUPERRES_AUTO
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {  // lets gtest print every field of the parameter
+ return os << "TestVideoParam { filename:" << test_arg.filename
+ << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+ << " profile:" << test_arg.profile << " limit:" << test_arg.limit
+ << " screen_content:" << test_arg.screen_content
+ << " psnr_threshold:" << test_arg.psnr_threshold
+ << " psnr_threshold2:" << test_arg.psnr_threshold2 << " }";
+}
+
+const TestVideoParam kTestVideoVectors[] = {
+ { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.3,
+ 44.7 },
+#if CONFIG_AV1_HIGHBITDEPTH
+ { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0,
+ 46.8 },
+#endif
+ { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 23.0, 52.5 },
+ // Image coding (single frame).
+ { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0,
+ 49.0 },
+};
+
+// Modes with extra params have their own tests.
+const aom_superres_mode kSuperresModesWithoutParams[] = { AOM_SUPERRES_RANDOM,
+ AOM_SUPERRES_AUTO };
+
+// Superres denominators and superres kf denominators to be tested
+typedef tuple<int, int> SuperresDenominatorPair;
+const SuperresDenominatorPair kSuperresDenominators[] = {
+ make_tuple(16, 9), make_tuple(13, 11), make_tuple(9, 9),
+ make_tuple(13, 13), make_tuple(11, 16), make_tuple(8, 16),
+ make_tuple(16, 8), make_tuple(8, 8), make_tuple(9, 14),
+};
+
+// Superres q thresholds and superres kf q thresholds to be tested
+typedef tuple<int, int> SuperresQThresholdPair;
+const SuperresQThresholdPair kSuperresQThresholds[] = {
+ make_tuple(63, 63), make_tuple(63, 41), make_tuple(17, 63),
+ make_tuple(41, 11), make_tuple(1, 37), make_tuple(11, 11),
+ make_tuple(1, 1), make_tuple(17, 29), make_tuple(29, 11),
+};
+
+/* END (TESTING PARAMETERS) */
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, superres_mode_>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+ aom_superres_mode>
+ HorzSuperresTestParam;
+
+class HorzSuperresEndToEndTest  // end-to-end encode for superres modes that take no extra parameters
+ : public ::testing::TestWithParam<HorzSuperresTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
+
+ ~HorzSuperresEndToEndTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_Q;  // NOTE(review): the Fixed and QThresh fixtures in this file use AOM_VBR
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ }
+
+ void BeginPassHook(unsigned int) override {  // resets the PSNR accumulators whenever it is called
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];  // entry 0 of the PSNR array; presumably the combined value -- confirm
+ frame_count_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // controls are applied once, on frame 0
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {  // mean of the accumulated PSNR, or 0.0 when no packet was counted
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ void DoTest() {
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_NE(video, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr_thresh = (superres_mode_ == AOM_SUPERRES_AUTO)  // AUTO has its own PSNR floor
+ ? test_video_param_.psnr_threshold2
+ : test_video_param_.psnr_threshold;
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, psnr_thresh);
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_);  // one PSNR packet expected per source frame
+ }
+
+ TestVideoParam test_video_param_;
+ aom_superres_mode superres_mode_;
+
+ private:
+ double psnr_;  // running sum of per-packet PSNR
+ unsigned int frame_count_;  // number of PSNR packets seen
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresEndToEndTest,
+ ::testing::ValuesIn(kTestVideoVectors),
+ ::testing::ValuesIn(kSuperresModesWithoutParams));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
+// superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+ SuperresDenominatorPair>
+ HorzSuperresFixedTestParam;
+
+class HorzSuperresFixedEndToEndTest  // end-to-end encode with AOM_SUPERRES_FIXED and explicit denominators
+ : public ::testing::TestWithParam<HorzSuperresFixedTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresFixedEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ superres_mode_(AOM_SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+ SuperresDenominatorPair denoms = GET_PARAM(2);  // tuple order: (denominator, keyframe denominator)
+ superres_denom_ = std::get<0>(denoms);
+ superres_kf_denom_ = std::get<1>(denoms);
+ }
+
+ ~HorzSuperresFixedEndToEndTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ cfg_.rc_superres_denominator = superres_denom_;
+ cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+ }
+
+ void BeginPassHook(unsigned int) override {  // resets the PSNR accumulators whenever it is called
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];  // entry 0 of the PSNR array; presumably the combined value -- confirm
+ frame_count_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // controls are applied once, on frame 0
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {  // mean of the accumulated PSNR, or 0.0 when no packet was counted
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ void DoTest() {
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_NE(video, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, test_video_param_.psnr_threshold)  // psnr_threshold2 is not consulted by this fixture
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_;
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_)  // one PSNR packet expected per source frame
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_;
+ }
+
+ TestVideoParam test_video_param_;
+ aom_superres_mode superres_mode_;
+ int superres_denom_;  // written to cfg_.rc_superres_denominator
+ int superres_kf_denom_;  // written to cfg_.rc_superres_kf_denominator
+
+ private:
+ double psnr_;  // running sum of per-packet PSNR
+ unsigned int frame_count_;  // number of PSNR packets seen
+};
+
+TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresFixedEndToEndTest,
+ ::testing::ValuesIn(kTestVideoVectors),
+ ::testing::ValuesIn(kSuperresDenominators));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_,
+// tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+ SuperresQThresholdPair>
+ HorzSuperresQThreshTestParam;
+
+class HorzSuperresQThreshEndToEndTest  // end-to-end encode with AOM_SUPERRES_QTHRESH and explicit q thresholds
+ : public ::testing::TestWithParam<HorzSuperresQThreshTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresQThreshEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ superres_mode_(AOM_SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+ SuperresQThresholdPair qthresholds = GET_PARAM(2);  // tuple order: (q threshold, keyframe q threshold)
+ superres_qthresh_ = std::get<0>(qthresholds);
+ superres_kf_qthresh_ = std::get<1>(qthresholds);
+ }
+
+ ~HorzSuperresQThreshEndToEndTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ cfg_.rc_superres_qthresh = superres_qthresh_;
+ cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
+ }
+
+ void BeginPassHook(unsigned int) override {  // resets the PSNR accumulators whenever it is called
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];  // entry 0 of the PSNR array; presumably the combined value -- confirm
+ frame_count_++;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // controls are applied once, on frame 0
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 0);  // NOTE(review): the other two superres fixtures pass 4 -- confirm intentional
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {  // mean of the accumulated PSNR, or 0.0 when no packet was counted
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ void DoTest() {
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_NE(video, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, test_video_param_.psnr_threshold)  // psnr_threshold2 is not consulted by this fixture
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_qthresh_ = " << superres_qthresh_
+ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_)  // one PSNR packet expected per source frame
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_qthresh_ = " << superres_qthresh_
+ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+ }
+
+ TestVideoParam test_video_param_;
+ aom_superres_mode superres_mode_;
+ int superres_qthresh_;  // written to cfg_.rc_superres_qthresh
+ int superres_kf_qthresh_;  // written to cfg_.rc_superres_kf_qthresh
+
+ private:
+ double psnr_;  // running sum of per-packet PSNR
+ unsigned int frame_count_;  // number of PSNR packets seen
+};
+
+TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_SUITE(HorzSuperresQThreshEndToEndTest,
+ ::testing::ValuesIn(kTestVideoVectors),
+ ::testing::ValuesIn(kSuperresQThresholds));
+
+} // namespace
diff --git a/third_party/aom/test/i420_video_source.h b/third_party/aom/test/i420_video_source.h
new file mode 100644
index 0000000000..233e7152b9
--- /dev/null
+++ b/third_party/aom/test/i420_video_source.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_I420_VIDEO_SOURCE_H_
+#define AOM_TEST_I420_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/yuv_video_source.h"
+
+namespace libaom_test {
+
+// This class extends YUVVideoSource, fixing the format to AOM_IMG_FMT_I420,
+// so that we can do actual file encodes from raw I420 input.
+class I420VideoSource : public YUVVideoSource {
+ public:
+ I420VideoSource(const std::string &file_name, unsigned int width,
+ unsigned int height, int rate_numerator, int rate_denominator,
+ unsigned int start, int limit)
+ : YUVVideoSource(file_name, AOM_IMG_FMT_I420, width, height,  // all other arguments are forwarded unchanged
+ rate_numerator, rate_denominator, start, limit) {}
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_I420_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/intra_edge_test.cc b/third_party/aom/test/intra_edge_test.cc
new file mode 100644
index 0000000000..96ee65466b
--- /dev/null
+++ b/third_party/aom/test/intra_edge_test.cc
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>  // F: function-pointer type under test, T: pixel type
+class UpsampleTest : public FunctionEquivalenceTest<F> {
+ protected:
+ static const int kIterations = 1000000;  // random trials per RandomValues test
+ static const int kMinEdge = 4;  // not referenced elsewhere in this file
+ static const int kMaxEdge = 24;  // edge size used by the speed tests
+ static const int kBufSize = 2 * 64 + 32;  // length of each backing array
+ static const int kOffset = 16;  // index of edge element 0 inside the backing arrays
+
+ ~UpsampleTest() override = default;
+
+ virtual void Execute(T *edge_tst) = 0;  // subclasses run ref_func on edge_ref_ and tst_func on edge_tst
+
+ void Common() {
+ edge_ref_ = &edge_ref_data_[kOffset];
+ edge_tst_ = &edge_tst_data_[kOffset];
+
+ Execute(edge_tst_);
+
+ const int max_idx = (size_ - 1) * 2;
+ for (int r = -2; r <= max_idx; ++r) {  // compared range: indices -2 .. 2 * (size_ - 1)
+ ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+ }
+ }
+
+ T edge_ref_data_[kBufSize];  // buffer processed by the reference function
+ T edge_tst_data_[kBufSize];  // buffer processed by the function under test
+
+ T *edge_ref_;  // &edge_ref_data_[kOffset], set in Common()
+ T *edge_tst_;  // &edge_tst_data_[kOffset], set in Common()
+
+ int size_;  // edge length passed to both functions
+};
+
+typedef void (*UP8B)(uint8_t *p, int size);
+typedef libaom_test::FuncParam<UP8B> TestFuncs;
+
+class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {  // 8-bit instantiation of UpsampleTest
+ protected:
+ void Execute(uint8_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_);  // reference runs in place on edge_ref_
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));  // function under test runs in place on edge_tst
+ }
+};
+
+TEST_P(UpsampleTest8B, RandomValues) {
+ for (int trial = 0; trial < kIterations && !HasFatalFailure(); ++trial) {
+ size_ = 4 * (this->rng_(4) + 1);
+
+ // Fill both buffers identically: fresh random pixels for the first
+ // kOffset + size_ entries, then the last pixel drawn is repeated to the
+ // end of the buffers ("extend final sample").
+ const int filled = kOffset + size_;
+ int last = 0;
+ for (int idx = 0; idx < kBufSize; ++idx) {
+ if (idx < filled) {
+ last = rng_.Rand8();
+ }
+ edge_ref_data_[idx] = last;
+ edge_tst_data_[idx] = edge_ref_data_[idx];
+ }
+
+ // Run both implementations and compare their outputs.
+ Common();
+ }
+}
+
+TEST_P(UpsampleTest8B, DISABLED_Speed) {  // DISABLED_ prefix: gtest skips it unless --gtest_also_run_disabled_tests
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {  // calls only the function under test; no timing or assertions here
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, UpsampleTest8B,
+ ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+ av1_upsample_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, UpsampleTest8B,
+ ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+ av1_upsample_intra_edge_neon)));
+#endif // HAVE_NEON
+
+template <typename F, typename T>  // F: function-pointer type under test, T: pixel type
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+ static const int kIterations = 1000000;  // random trials per RandomValues test
+ static const int kMaxEdge = 2 * 64;  // edge size used by the speed tests
+ static const int kBufSize = kMaxEdge + 32;  // length of each backing array
+ static const int kOffset = 15;  // index of edge element 0 inside the backing arrays
+
+ ~FilterEdgeTest() override = default;
+
+ virtual void Execute(T *edge_tst) = 0;  // subclasses run ref_func on edge_ref_ and tst_func on edge_tst
+
+ void Common() {
+ edge_ref_ = &edge_ref_data_[kOffset];
+ edge_tst_ = &edge_tst_data_[kOffset];
+
+ Execute(edge_tst_);
+
+ for (int r = 0; r < size_; ++r) {  // compared range: indices 0 .. size_ - 1
+ ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+ }
+ }
+
+ T edge_ref_data_[kBufSize];  // buffer processed by the reference function
+ T edge_tst_data_[kBufSize];  // buffer processed by the function under test
+
+ T *edge_ref_;  // &edge_ref_data_[kOffset], set in Common()
+ T *edge_tst_;  // &edge_tst_data_[kOffset], set in Common()
+
+ int size_;  // edge length passed to both functions
+ int strength_;  // filter strength passed to both functions
+};
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {  // 8-bit instantiation of FilterEdgeTest
+ protected:
+ void Execute(uint8_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_, strength_);  // reference runs in place on edge_ref_
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));  // function under test runs in place on edge_tst
+ }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ strength_ = this->rng_(4);
+ size_ = 4 * (this->rng_(128 / 4) + 1) + 1;  // a multiple of 4, plus 1
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {  // only the first kOffset + size_ entries are (re)written each trial
+ pix = rng_.Rand8();
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ Common();  // runs both functions and compares indices 0 .. size_ - 1
+ }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {  // DISABLED_ prefix: gtest skips it unless --gtest_also_run_disabled_tests
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ strength_ = 1;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {  // calls only the function under test; no timing or assertions here
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+ // iterate over filter strengths (1,2,3)
+ strength_ = strength_ == 3 ? 1 : strength_ + 1;
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, FilterEdgeTest8B,
+ ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+ av1_filter_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, FilterEdgeTest8B,
+ ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+ av1_filter_intra_edge_neon)));
+#endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {  // 16-bit-sample instantiation of UpsampleTest
+ protected:
+ void Execute(uint16_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_, bit_depth_);  // reference runs in place on edge_ref_
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));  // function under test runs in place on edge_tst
+ }
+ int bit_depth_;  // passed to both functions as their `bd` argument
+};
+
+TEST_P(UpsampleTestHB, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {  // pick one of the three bit depths at random
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ const int hi = 1 << bit_depth_;  // presumably an exclusive upper bound for rng_(hi) -- confirm in ACMRandom
+
+ size_ = 4 * (this->rng_(4) + 1);
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_(hi);
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ // Extend final sample
+ while (i < kBufSize) {
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ i++;
+ }
+
+ Common();  // runs both functions and compares indices -2 .. 2 * (size_ - 1)
+ }
+}
+
+TEST_P(UpsampleTestHB, DISABLED_Speed) {  // DISABLED_ prefix: gtest skips it unless --gtest_also_run_disabled_tests
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ bit_depth_ = 12;
+ const int hi = 1 << bit_depth_;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_(hi);
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {  // calls only the function under test; no timing or assertions here
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, UpsampleTestHB,
+ ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+ av1_highbd_upsample_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, UpsampleTestHB,
+ ::testing::Values(TestFuncsHBD(av1_highbd_upsample_intra_edge_c,
+ av1_highbd_upsample_intra_edge_neon)));
+#endif // HAVE_NEON
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {  // 16-bit-sample instantiation of FilterEdgeTest
+ protected:
+ void Execute(uint16_t *edge_tst) override {
+ params_.ref_func(edge_ref_, size_, strength_);  // reference runs in place on edge_ref_
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));  // function under test runs in place on edge_tst
+ }
+ int bit_depth_;  // only bounds the random pixel values in the tests; not passed to ref_func/tst_func
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {  // pick one of the three bit depths at random
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ const int hi = 1 << bit_depth_;  // presumably an exclusive upper bound for rng_(hi) -- confirm in ACMRandom
+ strength_ = this->rng_(4);
+ size_ = 4 * (this->rng_(128 / 4) + 1) + 1;  // a multiple of 4, plus 1
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {  // only the first kOffset + size_ entries are (re)written each trial
+ pix = rng_(hi);
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ Common();  // runs both functions and compares indices 0 .. size_ - 1
+ }
+}
+
+TEST_P(FilterEdgeTestHB, DISABLED_Speed) {  // DISABLED_ prefix: gtest skips it unless --gtest_also_run_disabled_tests
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ strength_ = 1;
+ bit_depth_ = 12;
+ const int hi = 1 << bit_depth_;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_(hi);
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {  // calls only the function under test; no timing or assertions here
+ API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+ // iterate over filter strengths (1,2,3)
+ strength_ = strength_ == 3 ? 1 : strength_ + 1;
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+ ::testing::Values(FilterEdgeTestFuncsHBD(
+ av1_highbd_filter_intra_edge_c,
+ av1_highbd_filter_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterEdgeTestHB,
+ ::testing::Values(FilterEdgeTestFuncsHBD(
+ av1_highbd_filter_intra_edge_c,
+ av1_highbd_filter_intra_edge_neon)));
+#endif // HAVE_NEON
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace
diff --git a/third_party/aom/test/intrabc_test.cc b/third_party/aom/test/intrabc_test.cc
new file mode 100644
index 0000000000..2c60596ab8
--- /dev/null
+++ b/third_party/aom/test/intrabc_test.cc
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/tile_common.h"
+
+namespace {
+// Table-driven check of av1_is_dv_valid(): each case supplies a displacement
+// vector, a block position relative to the tile's top-left corner, a block
+// size and the expected verdict.
+TEST(IntrabcTest, DvValidation) {
+ struct DvTestCase {
+ MV dv; // Displacement vector passed to av1_is_dv_valid().
+ int mi_row_offset; // Added to the tile's mi_row_start to get mi_row.
+ int mi_col_offset; // Added to the tile's mi_col_start to get mi_col.
+ BLOCK_SIZE bsize; // Block size under test.
+ bool valid; // Expected result of av1_is_dv_valid().
+ };
+ // NOTE(review): presumably the number of MV units per full pixel (1/8-pel
+ // precision) -- confirm against the MV definition.
+ const int kSubPelScale = 8;
+ // Tile width used below, expressed as a multiple of MAX_MIB_SIZE.
+ const int kTileMaxMibWidth = 8;
+ const DvTestCase kDvCases[] = {
+ // A zero displacement at the tile origin is expected to be rejected for
+ // every block size.
+ { { 0, 0 }, 0, 0, BLOCK_128X128, false },
+ { { 0, 0 }, 0, 0, BLOCK_64X64, false },
+ { { 0, 0 }, 0, 0, BLOCK_32X32, false },
+ { { 0, 0 }, 0, 0, BLOCK_16X16, false },
+ { { 0, 0 }, 0, 0, BLOCK_8X8, false },
+ { { 0, 0 }, 0, 0, BLOCK_4X4, false },
+ // 16x16 block offset by MAX_SB_SIZE pixels in both directions from the
+ // tile origin, displaced by +/- MAX_SB_SIZE pixels.
+ { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ true },
+ { { 0, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, 0 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ true },
+ { { MAX_SB_SIZE * kSubPelScale, 0 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ { { 0, MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ // 32x32 block: whole-pixel displacements, a different block position, and
+ // displacements with a fractional (kSubPelScale / 2) component.
+ { { -32 * kSubPelScale, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ { { -32 * kSubPelScale, -32 * kSubPelScale },
+ 32 / MI_SIZE,
+ 32 / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -32 * kSubPelScale - kSubPelScale / 2, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -33 * kSubPelScale, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ { { -32 * kSubPelScale, -32 * kSubPelScale - kSubPelScale / 2 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -32 * kSubPelScale, -33 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ // BLOCK_LARGEST: displacements of exactly MAX_SB_SIZE pixels and one pixel
+ // either side of it, per component.
+ { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ true },
+ { { -(MAX_SB_SIZE + 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE + 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -(MAX_SB_SIZE - 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ true },
+ { { -(MAX_SB_SIZE - 1) * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ // BLOCK_LARGEST: second component positive, up to a distance derived from
+ // the tile width.
+ { { -MAX_SB_SIZE * kSubPelScale, MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale,
+ (kTileMaxMibWidth - 2) * MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale,
+ ((kTileMaxMibWidth - 2) * MAX_SB_SIZE + 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ };
+
+ // Zero-initialized block descriptor; only the tile bounds and the chroma
+ // subsampling of planes 1 and 2 are filled in.
+ MACROBLOCKD xd;
+ memset(&xd, 0, sizeof(xd));
+ xd.tile.mi_row_start = 8 * MAX_MIB_SIZE;
+ xd.tile.mi_row_end = 16 * MAX_MIB_SIZE;
+ xd.tile.mi_col_start = 24 * MAX_MIB_SIZE;
+ xd.tile.mi_col_end = xd.tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+ // NOTE(review): subsampling of 1 in both directions presumably models 4:2:0
+ // chroma -- confirm.
+ xd.plane[1].subsampling_x = 1;
+ xd.plane[1].subsampling_y = 1;
+ xd.plane[2].subsampling_x = 1;
+ xd.plane[2].subsampling_y = 1;
+
+ // Zero-initialized common state that only points at an empty sequence
+ // header.
+ SequenceHeader seq_params = {};
+ AV1_COMMON cm;
+ memset(&cm, 0, sizeof(cm));
+ cm.seq_params = &seq_params;
+
+ for (const DvTestCase &dv_case : kDvCases) {
+ // Convert the tile-relative offsets into the positions handed to the
+ // validator.
+ const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
+ const int mi_col = xd.tile.mi_col_start + dv_case.mi_col_offset;
+ // is_chroma_ref is recomputed per case because it depends on the position
+ // and block size.
+ xd.is_chroma_ref = is_chroma_reference(mi_row, mi_col, dv_case.bsize,
+ xd.plane[1].subsampling_x,
+ xd.plane[1].subsampling_y);
+ EXPECT_EQ(static_cast<int>(dv_case.valid),
+ av1_is_dv_valid(dv_case.dv, &cm, &xd, mi_row, mi_col,
+ dv_case.bsize, MAX_MIB_SIZE_LOG2));
+ }
+}
+} // namespace
diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc
new file mode 100644
index 0000000000..8796e8ba69
--- /dev/null
+++ b/third_party/aom/test/intrapred_test.cc
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left);
+
+} // namespace
+
+// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
+// anonymous namespace, then we get a strange compiler warning in
+// the begin() and end() methods of the ParamGenerator template class in
+// gtest/internal/gtest-param-util.h:
+// warning: ‘<anonymous>’ is used uninitialized in this function
+// As a workaround, put this template outside the anonymous namespace.
+// See bug aomedia:2003.
+// Test parameter bundling a predictor, the function it is compared against,
+// and the block geometry / bit depth the pair is exercised with.
+template <typename FuncType>
+struct IntraPredFunc {
+  // All arguments are optional; a default-constructed object holds null
+  // function pointers and zero dimensions.
+  IntraPredFunc(FuncType pred = nullptr, FuncType ref = nullptr,
+                int block_width_value = 0, int block_height_value = 0,
+                int bit_depth_value = 0)
+      : pred_fn(pred),
+        ref_fn(ref),
+        block_width(block_width_value),
+        block_height(block_height_value),
+        bit_depth(bit_depth_value) {}
+
+  FuncType pred_fn;  // Function under test.
+  FuncType ref_fn;   // Function providing the expected output.
+  int block_width;   // Block width in pixels.
+  int block_height;  // Block height in pixels.
+  int bit_depth;     // Sample bit depth.
+};
+
+namespace {
+
+// Shared fixture for the intra predictor tests. Derived classes supply the
+// calls into the predictor pair; this class owns edge generation, output
+// comparison and timing.
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+    : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
+ public:
+  // Runs count_test_block rounds of Predict() on freshly filled edges and
+  // asserts that no output pixel differed from the reference.
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    AttachBuffers(left_col, above_data, dst, ref_dst);
+    int error_count = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      FillEdges(&rnd, /*saturate=*/i == 0);
+      Predict();
+      CheckPrediction(i, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+  // Same input generation as RunTest(), but each round times numIter calls of
+  // the reference function and numIter calls of the function under test, then
+  // prints the accumulated microsecond totals and their ratio.
+  void RunSpeedTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+                    Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    AttachBuffers(left_col, above_data, dst, ref_dst);
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    const int numIter = 100;
+    int error_count = 0;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    for (int i = 0; i < count_test_block; ++i) {
+      FillEdges(&rnd, /*saturate=*/i == 0);
+
+      aom_usec_timer c_timer;
+      aom_usec_timer_start(&c_timer);
+      PredictRefSpeedTest(numIter);
+      aom_usec_timer_mark(&c_timer);
+
+      aom_usec_timer simd_timer;
+      aom_usec_timer_start(&simd_timer);
+      PredictFncSpeedTest(numIter);
+      aom_usec_timer_mark(&simd_timer);
+
+      c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer));
+      simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+
+      CheckPrediction(i, &error_count);
+    }
+
+    const float gain =
+        static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time);
+    printf(
+        "blockWxH = %d x %d c_time = %d \t simd_time = %d \t Gain = %4.2f \n",
+        block_width, block_height, c_sum_time, simd_sum_time, gain);
+    ASSERT_EQ(0, error_count);
+  }
+
+ protected:
+  void SetUp() override {
+    params_ = this->GetParam();
+    stride_ = params_.block_width * 3;
+    mask_ = (1 << params_.bit_depth) - 1;
+  }
+
+  // Runs both functions once, writing into ref_dst_ and dst_.
+  virtual void Predict() = 0;
+
+  // Run only the reference / only the tested function |num| times.
+  virtual void PredictRefSpeedTest(int num) = 0;
+  virtual void PredictFncSpeedTest(int num) = 0;
+
+  // Points the fixture at the caller's buffers. The above row starts 16
+  // elements into |above_data| so that negative indices stay in bounds.
+  void AttachBuffers(Pixel *left_col, Pixel *above_data, Pixel *dst,
+                     Pixel *ref_dst) {
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+  }
+
+  // Fills above_row_[-1 .. 2 * block_width] and left_col_[0 .. block_height)
+  // either with the saturated value mask_ or with masked random values. The
+  // above row is filled before the left column.
+  void FillEdges(ACMRandom *rnd, bool saturate) {
+    const int above_last = params_.block_width * 2;
+    for (int x = -1; x <= above_last; x++) {
+      above_row_[x] = saturate ? mask_ : (rnd->Rand16() & mask_);
+    }
+    const int left_count = params_.block_height;
+    for (int y = 0; y < left_count; y++) {
+      left_col_[y] = saturate ? mask_ : (rnd->Rand16() & mask_);
+    }
+  }
+
+  // Adds the number of mismatching pixels to |*error_count|. While the total
+  // is exactly 1 the current pixel pair is asserted, so the first mismatch is
+  // reported with its test case number and coordinates.
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    for (int y = 0; y < block_height; y++) {
+      for (int x = 0; x < block_width; x++) {
+        const ptrdiff_t pos = x + y * stride_;
+        *error_count += ref_dst_[pos] != dst_[pos];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[pos], dst_[pos])
+              << " Failed on Test Case Number " << test_case_number
+              << " location: x = " << x << " y = " << y;
+        }
+      }
+    }
+  }
+
+  Pixel *above_row_;
+  Pixel *left_col_;
+  Pixel *dst_;
+  Pixel *ref_dst_;
+  ptrdiff_t stride_;  // Output row stride: three times the block width.
+  int mask_;          // (1 << bit_depth) - 1, the largest sample value.
+
+  IntraPredFunc<FuncType> params_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Fixture for predictors that take uint16_t samples and a bit depth argument.
+class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+  // One call of the reference function followed by one call of the tested
+  // function under the register state check.
+  void Predict() override {
+    const int bd = params_.bit_depth;
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bd);
+    API_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_, bd));
+  }
+
+  // |num| back-to-back calls of the reference function.
+  void PredictRefSpeedTest(int num) override {
+    const int bd = params_.bit_depth;
+    for (int left = num; left > 0; --left) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bd);
+    }
+  }
+
+  // |num| back-to-back calls of the tested function.
+  void PredictFncSpeedTest(int num) override {
+    const int bd = params_.bit_depth;
+    for (int left = num; left > 0; --left) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_, bd);
+    }
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdIntraPredTest);
+
+#endif
+
+// Fixture for predictors that take uint8_t samples.
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+  // One call of the reference function followed by one call of the tested
+  // function under the register state check.
+  void Predict() override {
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    API_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_));
+  }
+
+  // |num| back-to-back calls of the reference function.
+  void PredictRefSpeedTest(int num) override {
+    for (int left = num; left > 0; --left) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    }
+  }
+
+  // |num| back-to-back calls of the tested function.
+  void PredictFncSpeedTest(int num) override {
+    for (int left = num; left > 0; --left) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_);
+    }
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(LowbdIntraPredTest);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(HighbdIntraPredTest, Bitexact) {
+  // All buffers are sized for the largest block dimension, 64.
+  const int kMaxDim = 64;
+  DECLARE_ALIGNED(16, uint16_t, left[2 * kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, above[2 * kMaxDim + kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, pred[3 * kMaxDim * kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, ref_pred[3 * kMaxDim * kMaxDim]);
+  // Only the edge buffers are cleared; the outputs are written by the
+  // predictors.
+  av1_zero(left);
+  av1_zero(above);
+  RunTest(left, above, pred, ref_pred);
+}
+
+TEST_P(HighbdIntraPredTest, DISABLED_Speed) {
+  // All buffers are sized for the largest block dimension, 64.
+  const int kMaxDim = 64;
+  DECLARE_ALIGNED(16, uint16_t, left[2 * kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, above[2 * kMaxDim + kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, pred[3 * kMaxDim * kMaxDim]);
+  DECLARE_ALIGNED(16, uint16_t, ref_pred[3 * kMaxDim * kMaxDim]);
+  // Only the edge buffers are cleared; the outputs are written by the
+  // predictors.
+  av1_zero(left);
+  av1_zero(above);
+  RunSpeedTest(left, above, pred, ref_pred);
+}
+#endif
+
+TEST_P(LowbdIntraPredTest, Bitexact) {
+  // All buffers are sized for the largest block dimension, 64.
+  const int kMaxDim = 64;
+  DECLARE_ALIGNED(16, uint8_t, left[2 * kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, above[2 * kMaxDim + kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, pred[3 * kMaxDim * kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, ref_pred[3 * kMaxDim * kMaxDim]);
+  // Only the edge buffers are cleared; the outputs are written by the
+  // predictors.
+  av1_zero(left);
+  av1_zero(above);
+  RunTest(left, above, pred, ref_pred);
+}
+TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
+  // All buffers are sized for the largest block dimension, 64.
+  const int kMaxDim = 64;
+  DECLARE_ALIGNED(16, uint8_t, left[2 * kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, above[2 * kMaxDim + kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, pred[3 * kMaxDim * kMaxDim]);
+  DECLARE_ALIGNED(16, uint8_t, ref_pred[3 * kMaxDim * kMaxDim]);
+  // Only the edge buffers are cleared; the outputs are written by the
+  // predictors.
+  av1_zero(left);
+  av1_zero(above);
+  RunSpeedTest(left, above, pred, ref_pred);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bit Depth Tests
+// highbd_entry(type, width, height, opt, bd) builds one test parameter: the
+// function aom_highbd_<type>_predictor_<width>x<height>_<opt> paired with the
+// matching _c function, plus the block dimensions and bit depth.
+#define highbd_entry(type, width, height, opt, bd) \
+ IntraPredFunc<HighbdIntraPred>( \
+ &aom_highbd_##type##_predictor_##width##x##height##_##opt, \
+ &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+ bd)
+
+// highbd_intrapred(type, opt, bd) expands to a comma-separated list of
+// highbd_entry() parameters covering the 19 block sizes listed below, from
+// 4x4 up to 64x64.
+#define highbd_intrapred(type, opt, bd) \
+ highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd), \
+ highbd_entry(type, 4, 16, opt, bd), highbd_entry(type, 8, 4, opt, bd), \
+ highbd_entry(type, 8, 8, opt, bd), highbd_entry(type, 8, 16, opt, bd), \
+ highbd_entry(type, 8, 32, opt, bd), highbd_entry(type, 16, 4, opt, bd), \
+ highbd_entry(type, 16, 8, opt, bd), highbd_entry(type, 16, 16, opt, bd), \
+ highbd_entry(type, 16, 32, opt, bd), \
+ highbd_entry(type, 16, 64, opt, bd), highbd_entry(type, 32, 8, opt, bd), \
+ highbd_entry(type, 32, 16, opt, bd), \
+ highbd_entry(type, 32, 32, opt, bd), \
+ highbd_entry(type, 32, 64, opt, bd), \
+ highbd_entry(type, 64, 16, opt, bd), \
+ highbd_entry(type, 64, 32, opt, bd), highbd_entry(type, 64, 64, opt, bd)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// ---------------------------------------------------------------------------
+// Low Bit Depth Tests
+
+// lowbd_entry(type, width, height, opt) builds one test parameter: the
+// function aom_<type>_predictor_<width>x<height>_<opt> paired with the
+// matching _c function, plus the block dimensions and a fixed bit depth of 8.
+#define lowbd_entry(type, width, height, opt) \
+ IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
+ &aom_##type##_predictor_##width##x##height##_c, \
+ width, height, 8)
+
+// lowbd_intrapred(type, opt) expands to a comma-separated list of
+// lowbd_entry() parameters covering the 19 block sizes listed below, from 4x4
+// up to 64x64.
+#define lowbd_intrapred(type, opt) \
+ lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt), \
+ lowbd_entry(type, 4, 16, opt), lowbd_entry(type, 8, 4, opt), \
+ lowbd_entry(type, 8, 8, opt), lowbd_entry(type, 8, 16, opt), \
+ lowbd_entry(type, 8, 32, opt), lowbd_entry(type, 16, 4, opt), \
+ lowbd_entry(type, 16, 8, opt), lowbd_entry(type, 16, 16, opt), \
+ lowbd_entry(type, 16, 32, opt), lowbd_entry(type, 16, 64, opt), \
+ lowbd_entry(type, 32, 8, opt), lowbd_entry(type, 32, 16, opt), \
+ lowbd_entry(type, 32, 32, opt), lowbd_entry(type, 32, 64, opt), \
+ lowbd_entry(type, 64, 16, opt), lowbd_entry(type, 64, 32, opt), \
+ lowbd_entry(type, 64, 64, opt)
+
+#if HAVE_SSE2
+// SSE2 parameters: every lowbd_intrapred() block size for the dc, dc_top,
+// dc_left, dc_128, v and h predictor types.
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+ lowbd_intrapred(dc, sse2), lowbd_intrapred(dc_top, sse2),
+ lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+ lowbd_intrapred(v, sse2), lowbd_intrapred(h, sse2),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVector));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+// NEON parameters: every lowbd_intrapred() block size for the dc, dc_top,
+// dc_left, dc_128, v, h, smooth, smooth_v, smooth_h and paeth predictor types.
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
+ lowbd_intrapred(dc, neon), lowbd_intrapred(dc_top, neon),
+ lowbd_intrapred(dc_left, neon), lowbd_intrapred(dc_128, neon),
+ lowbd_intrapred(v, neon), lowbd_intrapred(h, neon),
+ lowbd_intrapred(smooth, neon), lowbd_intrapred(smooth_v, neon),
+ lowbd_intrapred(smooth_h, neon), lowbd_intrapred(paeth, neon),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
+#endif // HAVE_NEON
+
+#if HAVE_SSSE3
+// SSSE3 parameters: every lowbd_intrapred() block size for the paeth, smooth,
+// smooth_v and smooth_h predictor types.
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+ lowbd_intrapred(paeth, ssse3),
+ lowbd_intrapred(smooth, ssse3),
+ lowbd_intrapred(smooth_v, ssse3),
+ lowbd_intrapred(smooth_h, ssse3),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+// Expands to the AVX2 entries for one predictor type at the six block sizes
+// that are 32 or 64 pixels wide (32x8 is not listed).
+#define lowbd_avx2_wide(type)                                     \
+  lowbd_entry(type, 32, 16, avx2), lowbd_entry(type, 32, 32, avx2), \
+      lowbd_entry(type, 32, 64, avx2), lowbd_entry(type, 64, 16, avx2), \
+      lowbd_entry(type, 64, 32, avx2), lowbd_entry(type, 64, 64, avx2)
+
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
+  // dc family and vertical predictors: the six wide block sizes each.
+  lowbd_avx2_wide(dc),
+  lowbd_avx2_wide(dc_top),
+  lowbd_avx2_wide(dc_left),
+  lowbd_avx2_wide(dc_128),
+  lowbd_avx2_wide(v),
+
+  // Horizontal predictor: only 32x32 is listed.
+  lowbd_entry(h, 32, 32, avx2),
+
+  // Paeth predictor: the block sizes listed here are 16 to 64 pixels wide.
+  lowbd_entry(paeth, 16, 8, avx2),
+  lowbd_entry(paeth, 16, 16, avx2),
+  lowbd_entry(paeth, 16, 32, avx2),
+  lowbd_entry(paeth, 16, 64, avx2),
+  lowbd_entry(paeth, 32, 16, avx2),
+  lowbd_entry(paeth, 32, 32, avx2),
+  lowbd_entry(paeth, 32, 64, avx2),
+  lowbd_entry(paeth, 64, 16, avx2),
+  lowbd_entry(paeth, 64, 32, avx2),
+  lowbd_entry(paeth, 64, 64, avx2),
+};
+
+#undef lowbd_avx2_wide
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+#endif  // HAVE_AVX2
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_NEON
+// NEON parameters: every highbd_intrapred() block size at a bit depth of 12
+// for the dc, dc_top, dc_left, dc_128, v, h, paeth, smooth, smooth_v and
+// smooth_h predictor types.
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
+ highbd_intrapred(dc, neon, 12), highbd_intrapred(dc_top, neon, 12),
+ highbd_intrapred(dc_left, neon, 12), highbd_intrapred(dc_128, neon, 12),
+ highbd_intrapred(v, neon, 12), highbd_intrapred(h, neon, 12),
+ highbd_intrapred(paeth, neon, 12), highbd_intrapred(smooth, neon, 12),
+ highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
+ ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
+#endif // HAVE_NEON
+
+#if HAVE_SSE2
+// Expands to the SSE2 entries for one predictor type. The same ten block
+// sizes (4x4 up to 32x32) are used for every type, all at a bit depth of 12.
+#define highbd_sse2_entries(type)                                           \
+  highbd_entry(type, 4, 4, sse2, 12), highbd_entry(type, 4, 8, sse2, 12),   \
+      highbd_entry(type, 8, 4, sse2, 12), highbd_entry(type, 8, 8, sse2, 12), \
+      highbd_entry(type, 8, 16, sse2, 12),                                  \
+      highbd_entry(type, 16, 8, sse2, 12),                                  \
+      highbd_entry(type, 16, 16, sse2, 12),                                 \
+      highbd_entry(type, 16, 32, sse2, 12),                                 \
+      highbd_entry(type, 32, 16, sse2, 12),                                 \
+      highbd_entry(type, 32, 32, sse2, 12)
+
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorSse2[] = {
+  highbd_sse2_entries(dc),
+
+  highbd_sse2_entries(dc_top),
+
+  highbd_sse2_entries(dc_left),
+
+  highbd_sse2_entries(dc_128),
+
+  highbd_sse2_entries(v),
+
+  highbd_sse2_entries(h),
+};
+
+#undef highbd_sse2_entries
+
+INSTANTIATE_TEST_SUITE_P(SSE2, HighbdIntraPredTest,
+                         ::testing::ValuesIn(HighbdIntraPredTestVectorSse2));
+#endif  // HAVE_SSE2
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/invalid_file_test.cc b/third_party/aom/test/invalid_file_test.cc
new file mode 100644
index 0000000000..791cdb8928
--- /dev/null
+++ b/third_party/aom/test/invalid_file_test.cc
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <ostream>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+// One invalid-file test case: which stream to decode, how many decoder
+// threads to configure, and where the expected per-frame results live.
+struct DecodeParam {
+ int threads; // Copied into aom_codec_dec_cfg_t::threads by the test.
+ const char *filename; // Name of the IVF input file.
+ const char *res_filename; // If nullptr, the result filename is
+ // filename + ".res".
+};
+
+// Returns the name of the file holding the expected decode results: the
+// explicit res_filename when one is given, otherwise filename + ".res".
+std::string GetResFilename(const DecodeParam &param) {
+  if (param.res_filename == nullptr) {
+    return std::string(param.filename) + ".res";
+  }
+  return param.res_filename;
+}
+
+// Streams a human-readable description of a test case: thread count, input
+// file and resolved result file.
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  os << "threads: " << dp.threads;
+  os << " file: " << dp.filename;
+  os << " result file: " << GetResFilename(dp);
+  return os;
+}
+
+// Decodes a (usually corrupt) IVF stream and compares the decoder's return
+// code for every frame against the integers listed in a result file.
+class InvalidFileTest : public ::libaom_test::DecoderTest,
+                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
+
+  ~InvalidFileTest() override {
+    if (res_file_ != nullptr) fclose(res_file_);
+  }
+
+  // Opens the result file. On failure a fatal test failure is recorded and
+  // res_file_ stays null; callers must check for the fatal failure because
+  // ASSERT_* only returns from this function.
+  void OpenResFile(const std::string &res_file_name) {
+    res_file_ = libaom_test::OpenTestDataFile(res_file_name);
+    ASSERT_NE(res_file_, nullptr)
+        << "Result file open failed. Filename: " << res_file_name;
+  }
+
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int /*frame_number*/) override {
+    EXPECT_NE(img.fb_priv, nullptr);
+  }
+
+  // Reads the next expected result from the result file and compares it with
+  // |res_dec|. An expected value of -1 disables the comparison for that
+  // frame. Returns false once any failure has been recorded.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
+    EXPECT_NE(res_file_, nullptr);
+    // Never hand a null FILE* to fscanf(); the failure was recorded above.
+    if (res_file_ == nullptr) return false;
+    int expected_res_dec = -1;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    if (expected_res_dec != -1) {
+      // Check results match.
+      const DecodeParam input = GET_PARAM(1);
+      if (input.threads > 1) {
+        // The serial decode check is too strict for tile-threaded decoding as
+        // there is no guarantee on the decode order nor which specific error
+        // will take precedence. Currently a tile-level error is not forwarded
+        // so the frame will simply be marked corrupt.
+        EXPECT_TRUE(res_dec == expected_res_dec ||
+                    res_dec == AOM_CODEC_CORRUPT_FRAME)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError()
+            << "). Expected: " << expected_res_dec << " or "
+            << AOM_CODEC_CORRUPT_FRAME;
+      } else {
+        EXPECT_EQ(expected_res_dec, res_dec)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError() << ")";
+      }
+    }
+
+    return !HasFailure();
+  }
+
+  // Peek results are not checked by this test.
+  void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+                        libaom_test::CompressedVideoSource * /*video*/,
+                        const aom_codec_err_t /*res_peek*/) override {}
+
+  // Decodes the parameterized input file with the configured thread count.
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+    cfg.threads = input.threads;
+    libaom_test::IVFVideoSource decode_video(input.filename);
+    // Init() and OpenResFile() report problems through ASSERT_*, which only
+    // leaves the callee. Stop here on a fatal failure instead of decoding
+    // without a frame buffer or a result file.
+    ASSERT_NO_FATAL_FAILURE(decode_video.Init());
+
+    // The result file holds a list of expected integer results, one for each
+    // decoded frame. Any result that doesn't match the file's list will
+    // cause a test failure.
+    const std::string res_filename = GetResFilename(input);
+    ASSERT_NO_FATAL_FAILURE(OpenResFile(res_filename));
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+  }
+
+ private:
+  FILE *res_file_;  // Open result file, or nullptr if it could not be opened.
+};
+
+// Runs the decode-and-compare loop once for each parameterized test case.
+TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+
+// If res_filename (the third field) is nullptr, then the result filename is
+// filename + ".res" by default. Set res_filename to a string if the result
+// filename differs from the default.
+// Entries with threads > 1 are checked more leniently by the test: either the
+// listed result or AOM_CODEC_CORRUPT_FRAME is accepted.
+const DecodeParam kAV1InvalidFileTests[] = {
+ // { threads, filename, res_filename }
+ { 1, "invalid-bug-1814.ivf", nullptr },
+ { 1, "invalid-chromium-906381.ivf", nullptr },
+ { 1, "invalid-google-142530197.ivf", nullptr },
+ { 1, "invalid-google-142530197-1.ivf", nullptr },
+ { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
+ { 1, "invalid-oss-fuzz-9720.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.4" },
+ // Included only when no size limit is configured (and CHROMIUM is unset), or
+ // when the configured decode limits are at least 5120x180.
+#if !CHROMIUM && !CONFIG_SIZE_LIMIT || \
+ (CONFIG_SIZE_LIMIT && DECODE_WIDTH_LIMIT >= 5120 && \
+ DECODE_HEIGHT_LIMIT >= 180)
+ { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
+#endif
+ { 4, "invalid-oss-fuzz-15363.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-16437.ivf", "invalid-oss-fuzz-16437.ivf.res.2" },
+#if CONFIG_MAX_DECODE_PROFILE >= 1
+ { 1, "invalid-oss-fuzz-24706.ivf", nullptr },
+#endif
+#if CONFIG_AV1_HIGHBITDEPTH
+ // These test vectors contain 10-bit or 12-bit video.
+ { 1, "invalid-oss-fuzz-9288.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-9482.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-10061.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-10227.ivf", nullptr },
+ { 4, "invalid-oss-fuzz-10555.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-10705.ivf", nullptr },
+ // The expected results for this stream depend on CONFIG_CWG_C013.
+#if CONFIG_CWG_C013
+ { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.3" },
+#else
+ { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.2" },
+#endif
+ { 1, "invalid-oss-fuzz-10779.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-11477.ivf", nullptr },
+ { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+ { 1, "invalid-oss-fuzz-33030.ivf", nullptr },
+#endif
+};
+
+AV1_INSTANTIATE_TEST_SUITE(InvalidFileTest,
+ ::testing::ValuesIn(kAV1InvalidFileTests));
+
+} // namespace
diff --git a/third_party/aom/test/ivf_video_source.h b/third_party/aom/test/ivf_video_source.h
new file mode 100644
index 0000000000..85731f5566
--- /dev/null
+++ b/third_party/aom/test/ivf_video_source.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_IVF_VIDEO_SOURCE_H_
+#define AOM_TEST_IVF_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+
+#include "aom_ports/sanitizer.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
+// Reads a 32-bit little-endian value from the four bytes starting at |mem|.
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  // Widen each byte to unsigned before shifting. A uint8_t operand is
+  // promoted to (signed) int, so shifting a byte >= 0x80 left by 24 would
+  // shift into the sign bit.
+  return (static_cast<unsigned int>(mem[3]) << 24) |
+         (static_cast<unsigned int>(mem[2]) << 16) |
+         (static_cast<unsigned int>(mem[1]) << 8) |
+         static_cast<unsigned int>(mem[0]);
+}
+
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+//
+// Usage: call Init() once to allocate the frame buffer, Begin() to open the
+// file and load the first frame, then Next() for each following frame.
+// cxdata() returns nullptr once the end of the file has been reached.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  explicit IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name), input_file_(nullptr),
+        compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
+        end_of_file_(false) {}
+
+  ~IVFVideoSource() override {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_) fclose(input_file_);
+  }
+
+  // Allocates the buffer that holds one compressed frame. Records a fatal
+  // test failure if the allocation fails.
+  void Init() override {
+    // Use the non-throwing form of new so that an allocation failure reaches
+    // the assertion below; plain new[] never returns nullptr.
+    compressed_frame_buf_ = new (std::nothrow) uint8_t[kCodeBufferSize];
+    ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
+    ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+  }
+
+  // Opens the input file, validates the IVF file header and loads the first
+  // frame.
+  void Begin() override {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
+
+    // Read file header
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check file header
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' &&
+                file_hdr[2] == 'I' && file_hdr[3] == 'F')
+        << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  // Advances to the next frame in the file.
+  void Next() override {
+    ++frame_;
+    FillFrame();
+  }
+
+  // Reads one frame header and the frame payload it announces. A short read
+  // of the frame header is treated as the end of the file.
+  void FillFrame() {
+    ASSERT_NE(input_file_, nullptr);
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
+        kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      // The first four header bytes hold the payload size in little endian.
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      // Do not read into a buffer that was never (successfully) allocated.
+      ASSERT_NE(compressed_frame_buf_, nullptr)
+          << "Frame buffer is not allocated";
+      ASAN_UNPOISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+      // Re-poison the unused tail so that reads past the frame are caught
+      // when running under AddressSanitizer.
+      ASAN_POISON_MEMORY_REGION(compressed_frame_buf_ + frame_sz_,
+                                kCodeBufferSize - frame_sz_);
+    }
+  }
+
+  const uint8_t *cxdata() const override {
+    return end_of_file_ ? nullptr : compressed_frame_buf_;
+  }
+  size_t frame_size() const override { return frame_sz_; }
+  unsigned int frame_number() const override { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;  // kCodeBufferSize bytes once Init() ran.
+  size_t frame_sz_;                // Payload size of the current frame.
+  unsigned int frame_;             // Zero-based index of the current frame.
+  bool end_of_file_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/kf_test.cc b/third_party/aom/test/kf_test.cc
new file mode 100644
index 0000000000..7d8cbfe8c6
--- /dev/null
+++ b/third_party/aom/test/kf_test.cc
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include <ostream>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_image.h"
+#include "aom/aomcx.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+#define NUM_LAG_VALUES 3
+
+namespace {
+// Allocates a w x h image of format |fmt| and fills plane 0 with the value 128
+// over the display area and planes 1 and 2 with 128 over half that area
+// (rounded up) in each dimension. Returns nullptr if allocation fails. The
+// caller releases the image with aom_img_free().
+aom_image_t *CreateGrayImage(aom_img_fmt_t fmt, unsigned int w,
+                             unsigned int h) {
+  const int kGray = 128;
+  aom_image_t *const img = aom_img_alloc(nullptr, fmt, w, h, 1);
+  if (img == nullptr) return nullptr;
+
+  // Plane 0: every displayed row.
+  for (unsigned int row = 0; row < img->d_h; ++row) {
+    unsigned char *const dst = img->planes[0] + row * img->stride[0];
+    memset(dst, kGray, img->d_w);
+  }
+
+  // Planes 1 and 2: half width and half height, rounded up.
+  const unsigned int half_w = (img->d_w + 1) / 2;
+  const unsigned int half_h = (img->d_h + 1) / 2;
+  for (int plane = 1; plane <= 2; ++plane) {
+    for (unsigned int row = 0; row < half_h; ++row) {
+      memset(img->planes[plane] + row * img->stride[plane], kGray, half_w);
+    }
+  }
+  return img;
+}
+
+// Tests kf_max_dist in one-pass encoding with zero lag: encodes a gray
+// 320x240 clip and checks that a frame is a key frame exactly when its index
+// is a multiple of kf_max_dist (every frame when kf_max_dist is 0).
+void TestKeyFrameMaximumInterval(unsigned int usage, unsigned int kf_max_dist) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK);
+  cfg.g_w = 320;
+  cfg.g_h = 240;
+  cfg.g_pass = AOM_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = kf_max_dist;
+
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+  ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK);
+
+  aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Three full key-frame intervals plus one frame, or four frames when every
+  // frame is expected to be a key frame.
+  unsigned int num_frames = 4;
+  if (kf_max_dist != 0) num_frames = 3 * kf_max_dist + 1;
+
+  // Encode frames.
+  const aom_codec_cx_pkt_t *pkt;
+  for (unsigned int i = 0; i < num_frames; ++i) {
+    ASSERT_EQ(aom_codec_encode(&enc, image, i, 1, 0), AOM_CODEC_OK);
+    const bool expect_key_frame = (kf_max_dist == 0) || (i % kf_max_dist == 0);
+    aom_codec_iter_t iter = nullptr;
+    for (pkt = aom_codec_get_cx_data(&enc, &iter); pkt != nullptr;
+         pkt = aom_codec_get_cx_data(&enc, &iter)) {
+      ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+      if (expect_key_frame) {
+        ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+      } else {
+        ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u);
+      }
+    }
+  }
+
+  // Flush the encoder: keep sending a null frame until no packet comes back.
+  for (bool got_data = true; got_data;) {
+    ASSERT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK);
+    got_data = false;
+    aom_codec_iter_t iter = nullptr;
+    while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+      got_data = true;
+    }
+  }
+
+  aom_img_free(image);
+  ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Runs TestKeyFrameMaximumInterval() for both usages and a spread of
+// kf_max_dist values.
+TEST(KeyFrameIntervalTest, KeyFrameMaximumInterval) {
+  const unsigned int kUsages[] = { AOM_USAGE_GOOD_QUALITY, AOM_USAGE_REALTIME };
+  // 0 and 1 (both mean all intra), some powers of 2, some multiples of 10,
+  // and some prime numbers.
+  const unsigned int kMaxDistances[] = { 0,  1,  2,  3,  4,  7, 10,
+                                         13, 16, 20, 23, 29, 32 };
+  for (const unsigned int usage : kUsages) {
+    for (const unsigned int kf_max_dist : kMaxDistances) {
+      TestKeyFrameMaximumInterval(usage, kf_max_dist);
+    }
+  }
+}
+
+// Pair of key frame distance limits; KeyFrameIntervalTestLarge copies them
+// into cfg_.kf_min_dist and cfg_.kf_max_dist.
+typedef struct {
+ const unsigned int min_kf_dist;
+ const unsigned int max_kf_dist;
+} kfIntervalParam;
+
+// { min_kf_dist, max_kf_dist } combinations exercised by the test.
+const kfIntervalParam kfTestParams[] = {
+ { 1, 1 }, { 0, 10 }, { 10, 10 }, { 0, 30 }, { 30, 30 }
+};
+
+// Streams a readable form of |test_arg| (used when printing test parameters).
+std::ostream &operator<<(std::ostream &os, const kfIntervalParam &test_arg) {
+  os << "kfIntervalParam { min_kf_dist:" << test_arg.min_kf_dist;
+  os << " max_kf_dist:" << test_arg.max_kf_dist << " }";
+  return os;
+}
+
+// This class is used to test that the distance between decoded key frames
+// stays within the configured [kf_min_dist, kf_max_dist] range.
+// Params: encoding mode, key frame distance limits, rate control mode.
+class KeyFrameIntervalTestLarge
+ : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+ kfIntervalParam, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ KeyFrameIntervalTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ kf_dist_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+ // -1 means no key frame has been decoded yet.
+ kf_dist_ = -1;
+ is_kf_interval_violated_ = false;
+ }
+ ~KeyFrameIntervalTestLarge() override = default;
+
+ // Applies the parameterized rate control mode and key frame distances.
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = end_usage_check_;
+ cfg_.g_threads = 1;
+ cfg_.kf_min_dist = kf_dist_param_.min_kf_dist;
+ cfg_.kf_max_dist = kf_dist_param_.max_kf_dist;
+ cfg_.g_lag_in_frames = 19;
+ }
+
+ bool DoDecode() const override { return true; }
+
+ // Sets the encoder controls once, before the first frame is encoded.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ }
+ }
+
+ // Tracks the number of decoded frames since the last key frame and records
+ // a violation when that count exceeds max_kf_dist, or when a key frame
+ // arrives before min_kf_dist frames have passed.
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ int frame_flags = 0;
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+ &frame_flags);
+ if (kf_dist_ != -1) {
+ kf_dist_++;
+ if (kf_dist_ > (int)kf_dist_param_.max_kf_dist) {
+ is_kf_interval_violated_ = true;
+ }
+ }
+ if ((frame_flags & AOM_FRAME_IS_KEY) == AOM_FRAME_IS_KEY) {
+ if (kf_dist_ != -1 && kf_dist_ < (int)kf_dist_param_.min_kf_dist) {
+ is_kf_interval_violated_ = true;
+ }
+ // Restart the count at every key frame.
+ kf_dist_ = 0;
+ }
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ const kfIntervalParam kf_dist_param_;
+ // Frames decoded since the last key frame; -1 before the first key frame.
+ int kf_dist_;
+ bool is_kf_interval_violated_;
+ aom_rc_mode end_usage_check_;
+};
+
+// Because valgrind builds take a very long time to run, use a lower
+// resolution video for valgrind runs.
+const char *TestFileName() {
+#if AOM_VALGRIND_BUILD
+ return "hantro_collage_w176h144.yuv";
+#else
+ return "hantro_collage_w352h288.yuv";
+#endif // AOM_VALGRIND_BUILD
+}
+
+// Width in pixels of the clip returned by TestFileName().
+int TestFileWidth() {
+#if AOM_VALGRIND_BUILD
+ return 176;
+#else
+ return 352;
+#endif // AOM_VALGRIND_BUILD
+}
+
+// Height in pixels of the clip returned by TestFileName().
+int TestFileHeight() {
+#if AOM_VALGRIND_BUILD
+ return 144;
+#else
+ return 288;
+#endif // AOM_VALGRIND_BUILD
+}
+
+// Encodes and decodes the test clip (frame limit 75) and fails if
+// HandleDecodeResult() flagged a key frame interval violation.
+TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) {
+ libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+ TestFileHeight(), cfg_.g_timebase.den,
+ cfg_.g_timebase.num, 0, 75);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_;
+}
+
+// This class tests for presence and placement of application forced key frames.
+// Params: encoding mode, auto alt ref, fwd_kf_enabled, cpu_used, rate control
+// mode.
+class ForcedKeyTestLarge
+ : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+ int, int, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ForcedKeyTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ auto_alt_ref_(GET_PARAM(2)), fwd_kf_enabled_(GET_PARAM(3)),
+ cpu_used_(GET_PARAM(4)), rc_end_usage_(GET_PARAM(5)) {
+ forced_kf_frame_num_ = 1;
+ frame_num_ = 0;
+ is_kf_placement_violated_ = false;
+ }
+ ~ForcedKeyTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ cfg_.rc_end_usage = rc_end_usage_;
+ cfg_.g_threads = 0;
+ cfg_.kf_max_dist = 30;
+ cfg_.kf_min_dist = 0;
+ cfg_.fwd_kf_enabled = fwd_kf_enabled_;
+ }
+
+ // Sets the encoder controls before the first frame, and requests a forced
+ // key frame when the source reaches frame forced_kf_frame_num_.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, auto_alt_ref_);
+#if CONFIG_AV1_ENCODER
+ // override test default for tile columns if necessary.
+ if (GET_PARAM(0) == &libaom_test::kAV1) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+ }
+#endif
+ }
+ frame_flags_ =
+ ((int)video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+ }
+
+ // Counts successfully decoded frames and records a violation if the decoded
+ // frame at index forced_kf_frame_num_ is not flagged as a key frame.
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ if ((int)frame_num_ == forced_kf_frame_num_) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ int frame_flags = 0;
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_FRAME_FLAGS,
+ &frame_flags);
+ if ((frame_flags & AOM_FRAME_IS_KEY) != AOM_FRAME_IS_KEY) {
+ is_kf_placement_violated_ = true;
+ }
+ }
+ ++frame_num_;
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ // Test bodies shared by the TEST_P cases of this fixture and its subclass.
+ void Frame1IsKey();
+ void ForcedFrameIsKey();
+ void ForcedFrameIsKeyCornerCases();
+
+ ::libaom_test::TestMode encoding_mode_;
+ int auto_alt_ref_;
+ int fwd_kf_enabled_;
+ int cpu_used_;
+ aom_rc_mode rc_end_usage_;
+ // Index of the frame on which a key frame is forced.
+ int forced_kf_frame_num_;
+ // Number of frames decoded successfully so far in the current run.
+ unsigned int frame_num_;
+ bool is_kf_placement_violated_;
+};
+
+// Forces a key frame on frame 1 and verifies its placement for every lag value
+// that applies to the current encoding mode.
+void ForcedKeyTestLarge::Frame1IsKey() {
+  const aom_rational timebase = { 1, 30 };
+  // Row 0 holds the lags for good encoding mode, row 1 those for RT encoding
+  // mode; -1 marks an unused slot.
+  const int lag_values[2][NUM_LAG_VALUES] = { { 3, 15, 25 }, { 0, -1, -1 } };
+  const int *const mode_lags =
+      lag_values[(encoding_mode_ == ::libaom_test::kRealTime) ? 1 : 0];
+  const int frame_limit = fwd_kf_enabled_ ? 60 : 30;
+
+  forced_kf_frame_num_ = 1;
+  for (int idx = 0; idx < NUM_LAG_VALUES; ++idx) {
+    const int lag = mode_lags[idx];
+    if (lag != -1) {
+      frame_num_ = 0;
+      cfg_.g_lag_in_frames = lag;
+      is_kf_placement_violated_ = false;
+      libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+                                         TestFileHeight(), timebase.den,
+                                         timebase.num, 0, frame_limit);
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_EQ(is_kf_placement_violated_, false)
+          << "Frame #" << frame_num_ << " isn't a keyframe!";
+    }
+  }
+}
+
+// Checks the presence and placement of application forced key frames: for
+// each lag value, a key frame is forced on frame (lag - 1).
+void ForcedKeyTestLarge::ForcedFrameIsKey() {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25 };
+  const int frame_limit = fwd_kf_enabled_ ? 60 : 30;
+
+  for (const int lag : lag_values) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag - 1;
+    cfg_.g_lag_in_frames = lag;
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+                                       TestFileHeight(), timebase.den,
+                                       timebase.num, 0, frame_limit);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+
+    // Two pass and single pass CBR are currently segfaulting for the case when
+    // forced kf is placed after lag in frames.
+    // TODO(anyone): Enable(uncomment) below test once above bug is fixed.
+    //    frame_num_ = 0;
+    //    forced_kf_frame_num_ = lag + 1;
+    //    cfg_.g_lag_in_frames = lag;
+    //    is_kf_placement_violated_ = false;
+    //    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    //    ASSERT_EQ(is_kf_placement_violated_, false)
+    //    << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+// Forces a key frame at small offsets around cfg_.kf_max_dist (clamped to
+// frame 1 at the lowest) and verifies its placement.
+void ForcedKeyTestLarge::ForcedFrameIsKeyCornerCases() {
+  const aom_rational timebase = { 1, 30 };
+  const int kf_offsets[] = { -2, -1, 1, 2 };
+  const int frame_limit = fwd_kf_enabled_ ? 60 : 30;
+  cfg_.g_lag_in_frames = (encoding_mode_ == ::libaom_test::kRealTime) ? 0 : 35;
+
+  for (const int offset : kf_offsets) {
+    frame_num_ = 0;
+    int forced_frame = (int)cfg_.kf_max_dist + offset;
+    if (forced_frame <= 0) forced_frame = 1;
+    forced_kf_frame_num_ = forced_frame;
+    is_kf_placement_violated_ = false;
+    libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+                                       TestFileHeight(), timebase.den,
+                                       timebase.num, 0, frame_limit);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_EQ(is_kf_placement_violated_, false)
+        << "Frame #" << frame_num_ << " isn't a keyframe!";
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(KeyFrameIntervalTestLarge,
+ testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::ValuesIn(kfTestParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+TEST_P(ForcedKeyTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) { ForcedFrameIsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) {
+ ForcedFrameIsKeyCornerCases();
+}
+
+class ForcedKeyRTTestLarge : public ForcedKeyTestLarge {};
+
+TEST_P(ForcedKeyRTTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyRTTestLarge, ForcedFrameIsKeyCornerCases) {
+ ForcedFrameIsKeyCornerCases();
+}
+// TODO(anyone): Add CBR to list of rc_modes once forced kf placement after
+// lag in frames bug is fixed.
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(2, 5),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ));
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyRTTestLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(0), ::testing::Values(0),
+ ::testing::Values(7, 9),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR));
+} // namespace
diff --git a/third_party/aom/test/level_test.cc b/third_party/aom/test/level_test.cc
new file mode 100644
index 0000000000..a7c26d2305
--- /dev/null
+++ b/third_party/aom/test/level_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+const int kLevelMin = 0;
+const int kLevelMax = 31;
+const int kLevelKeepStats = 32;
+// Speed settings tested
+static const int kCpuUsedVectors[] = {
+ 1,
+ 2,
+ 3,
+ 4,
+};
+
+// Encoder fixture that sets a target sequence level index and reads back the
+// level index reported by the encoder before every frame.
+// Params: encoding mode, cpu_used.
+class LevelTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ LevelTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), target_level_(31) {}
+
+ ~LevelTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 5;
+ } else {
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ // Applies the speed and target level controls before the first frame, then
+ // on every frame queries the sequence level index into level_ and checks
+ // that level_[0] lies within [kLevelMin, kLevelMax].
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ int num_operating_points;
+ encoder->Control(AV1E_GET_NUM_OPERATING_POINTS, &num_operating_points);
+ ASSERT_EQ(num_operating_points, 1);
+ encoder->Control(AV1E_GET_SEQ_LEVEL_IDX, level_);
+ ASSERT_LE(level_[0], kLevelMax);
+ ASSERT_GE(level_[0], kLevelMin);
+ }
+
+ libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ // Value passed to AV1E_SET_TARGET_SEQ_LEVEL_IDX; tests overwrite it before
+ // running the encode loop.
+ int target_level_;
+ // Output buffer for AV1E_GET_SEQ_LEVEL_IDX; only entry 0 is inspected here.
+ int level_[32];
+};
+
+// Sweeps AV1E_SET_TARGET_SEQ_LEVEL_IDX over operating points 0..32 and levels
+// 0..32 (encoded as operating_point * 100 + level) and checks which
+// combinations the encoder accepts.
+TEST(LevelTest, TestTargetLevelApi) {
+  aom_codec_iface_t *codec = aom_codec_av1_cx();
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  // Setup failures are fatal: continuing would use an uninitialized cfg/enc.
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int operating_point = 0; operating_point <= 32; ++operating_point) {
+    for (int level = 0; level <= 32; ++level) {
+      const int target_level = operating_point * 100 + level;
+      // Accepted: operating point at most 31 and a level that is either below
+      // the build-dependent upper bound (excluding 2, 3, 6, 7, 10 and 11),
+      // or equal to kLevelMax or kLevelKeepStats.
+      if (operating_point <= 31 &&
+          ((level < (CONFIG_CWG_C013 ? 28 : 20) && level != 2 && level != 3 &&
+            level != 6 && level != 7 && level != 10 && level != 11) ||
+           level == kLevelMax || level == kLevelKeepStats)) {
+        EXPECT_EQ(AOM_CODEC_OK,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      } else {
+        EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      }
+    }
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST(LevelTest, InvalidOperatingPointIndexErrorDetail) {
+ aom_codec_iface_t *codec = aom_codec_av1_cx();
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(codec, &cfg, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_enc_init(&enc, codec, &cfg, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 3219),
+ AOM_CODEC_INVALID_PARAM);
+ EXPECT_EQ(aom_codec_error_detail(&enc),
+ std::string("Invalid operating point index: 32"));
+ EXPECT_EQ(aom_codec_set_option(&enc, "target-seq-level-idx", "3319"),
+ AOM_CODEC_INVALID_PARAM);
+ EXPECT_EQ(aom_codec_error_detail(&enc),
+ std::string("Invalid operating point index: 33"));
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// Runs the encode loop with target level index 19 on a 10-frame-limit clip.
+TEST_P(LevelTest, TestTargetLevel19) {
+  std::unique_ptr<libaom_test::VideoSource> video(
+      new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
+  ASSERT_NE(video, nullptr);
+  // Level index 19 corresponding to level 6.3.
+  target_level_ = 19;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+// With level monitoring only (kLevelKeepStats) at a target bitrate of 1000,
+// the reported level index is expected to be 0.
+TEST_P(LevelTest, TestLevelMonitoringLowBitrate) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ != 4) return;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     30, 1, 0, 40);
+  target_level_ = kLevelKeepStats;
+  cfg_.rc_target_bitrate = 1000;
+  cfg_.g_limit = 40;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(level_[0], 0);
+}
+
+// With level monitoring only (kLevelKeepStats) at a target bitrate of 4000,
+// the reported level index is expected to be 4.
+TEST_P(LevelTest, TestLevelMonitoringHighBitrate) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ != 4) return;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     30, 1, 0, 40);
+  target_level_ = kLevelKeepStats;
+  cfg_.rc_target_bitrate = 4000;
+  cfg_.g_limit = 40;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(level_[0], 4);
+}
+
+// Targets level index 0 at a bitrate of 4000 and expects the encoder to
+// report that same level index.
+TEST_P(LevelTest, TestTargetLevel0) {
+  // To save run time, we only test speed 4.
+  if (cpu_used_ != 4) return;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     30, 1, 0, 50);
+  const int target_level = 0;
+  target_level_ = target_level;
+  cfg_.rc_target_bitrate = 4000;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(level_[0], target_level);
+}
+
+// Runs a two-pass speed-4 encode of a 1280x720 noise clip with target level
+// index 5; the per-frame checks in PreEncodeFrameHook() must hold throughout.
+TEST_P(LevelTest, TestTargetLevelRecode) {
+  if (cpu_used_ == 4 && encoding_mode_ == ::libaom_test::kTwoPassGood) {
+    libaom_test::I420VideoSource video("rand_noise_w1280h720.yuv", 1280, 720,
+                                       25, 1, 0, 10);
+    // Written in decimal: a leading zero would make this an octal literal,
+    // which only coincides with the intended value for digits below 8.
+    const int target_level = 5;
+    target_level_ = target_level;
+    cfg_.rc_target_bitrate = 5000;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+AV1_INSTANTIATE_TEST_SUITE(LevelTest,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::ValuesIn(kCpuUsedVectors));
+} // namespace
diff --git a/third_party/aom/test/lightfield_test.sh b/third_party/aom/test/lightfield_test.sh
new file mode 100755
index 0000000000..cf1ea73a84
--- /dev/null
+++ b/third_party/aom/test/lightfield_test.sh
@@ -0,0 +1,115 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the lightfield example.
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $infile is required.
+lightfield_test_verify_environment() {
+ local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+ if [ ! -e "${infile}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Run the lightfield example: encode the lightfield, check the dumped frame
+# headers and frame contexts, build a tile-list bitstream, decode it with both
+# the tile list decoder and the reference decoder, and compare the outputs.
+# Returns 1 on the first failing step.
+lightfield_test() {
+  local img_width=1024
+  local img_height=1024
+  local lf_width=10
+  local lf_height=10
+  local lf_blocksize=5
+  local num_references=4
+  local num_tile_lists=2
+
+  # Encode the lightfield.
+  local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}"
+  local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+  local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf"
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \
+      "${yuv_file}" "${lf_file}" "${lf_width}" \
+      "${lf_height}" "${lf_blocksize}" ${devnull} || return 1
+
+  [ -e "${lf_file}" ] || return 1
+
+  # Check to ensure all camera frames have the identical frame header. If not
+  # identical, this test fails.
+  # NOTE(review): only a reported difference (diff status 1) fails here; a
+  # diff error such as missing ./fh* files is tolerated, as before - confirm
+  # whether these dumps are produced in every build configuration.
+  for i in ./fh*; do
+    diff ./fh004 "$i" > /dev/null
+    if [ $? -eq 1 ]; then
+      return 1
+    fi
+  done
+
+  # Check to ensure all camera frames use the identical frame context. If not
+  # identical, this test fails. The same tolerance for diff errors applies.
+  for i in ./fc*; do
+    diff ./fc004 "$i" > /dev/null
+    if [ $? -eq 1 ]; then
+      return 1
+    fi
+  done
+
+  # Parse lightfield bitstream to construct and output a new bitstream that can
+  # be decoded by an AV1 decoder.
+  local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
+  local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf"
+  local tl_text_file="${LIBAOM_TEST_DATA_PATH}/vase10x10_tiles.txt"
+  if [ ! -x "${bs_decoder}" ]; then
+    elog "${bs_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
+
+  [ -e "${tl_file}" ] || return 1
+
+  # Run lightfield tile list decoder
+  local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv"
+  if [ ! -x "${tl_decoder}" ]; then
+    elog "${tl_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
+      "${num_references}" "${num_tile_lists}" ${devnull} || return 1
+
+  [ -e "${tl_outfile}" ] || return 1
+
+  # Run reference lightfield decoder
+  local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv"
+  if [ ! -x "${ref_decoder}" ]; then
+    elog "${ref_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
+      "${num_references}" "${tl_text_file}" ${devnull} || return 1
+
+  [ -e "${tl_reffile}" ] || return 1
+
+  # Check if tl_outfile and tl_reffile are identical. Both files were verified
+  # to exist above, so any non-zero diff status (a difference or a diff error)
+  # fails the test. The paths are quoted so that an output directory
+  # containing whitespace is handled.
+  diff "${tl_outfile}" "${tl_reffile}" > /dev/null || return 1
+}
+
+lightfield_test_tests="lightfield_test"
+
+run_tests lightfield_test_verify_environment "${lightfield_test_tests}"
diff --git a/third_party/aom/test/log2_test.cc b/third_party/aom/test/log2_test.cc
new file mode 100644
index 0000000000..71cf8b25d9
--- /dev/null
+++ b/third_party/aom/test/log2_test.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "aom_ports/bitops.h"
+#include "av1/common/entropymode.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+// Compares get_msb() with floor(log2(n)) for small n and around powers of 2.
+TEST(Log2Test, GetMsb) {
+  // Test small numbers exhaustively against the floating-point reference.
+  for (unsigned int n = 1; n < 10000; n++) {
+    const int reference = static_cast<int>(floor(log2(n)));
+    EXPECT_EQ(get_msb(n), reference);
+  }
+
+  // Test every power of 2 (exponents 2..31) and the two adjacent numbers.
+  int exponent = 2;
+  while (exponent < 32) {
+    const unsigned int power_of_2 = 1U << exponent;
+    EXPECT_EQ(get_msb(power_of_2 - 1), exponent - 1);
+    EXPECT_EQ(get_msb(power_of_2), exponent);
+    EXPECT_EQ(get_msb(power_of_2 + 1), exponent);
+    ++exponent;
+  }
+}
+
+// Compares av1_ceil_log2() with ceil(log2(n)) for small n, around powers of 2,
+// and at INT_MAX; an input of 0 is expected to yield 0.
+TEST(Log2Test, Av1CeilLog2) {
+  EXPECT_EQ(av1_ceil_log2(0), 0);
+
+  // Test small numbers exhaustively against the floating-point reference.
+  for (int n = 1; n < 10000; n++) {
+    const int reference = static_cast<int>(ceil(log2(n)));
+    EXPECT_EQ(av1_ceil_log2(n), reference);
+  }
+
+  // Test every power of 2 (exponents 2..30) and the two adjacent numbers.
+  int exponent = 2;
+  while (exponent < 31) {
+    const int power_of_2 = 1 << exponent;
+    EXPECT_EQ(av1_ceil_log2(power_of_2 - 1), exponent);
+    EXPECT_EQ(av1_ceil_log2(power_of_2), exponent);
+    EXPECT_EQ(av1_ceil_log2(power_of_2 + 1), exponent + 1);
+    ++exponent;
+  }
+
+  // INT_MAX = 2^31 - 1
+  EXPECT_EQ(av1_ceil_log2(INT_MAX), 31);
+}
diff --git a/third_party/aom/test/loopfilter_control_test.cc b/third_party/aom/test/loopfilter_control_test.cc
new file mode 100644
index 0000000000..9c00235e19
--- /dev/null
+++ b/third_party/aom/test/loopfilter_control_test.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+
+// List of psnr thresholds for LF settings 0-3
+// keys: video, LF control, aq mode.
+std::unordered_map<std::string,
+ std::unordered_map<int, std::unordered_map<int, double>>>
+ kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
+ { { 0, { { 0, 35.0 }, { 3, 35.8 } } },
+ { 1, { { 0, 35.1 }, { 3, 35.9 } } },
+ { 2, { { 0, 35.1 }, { 3, 36.1 } } },
+ { 3, { { 0, 35.1 }, { 3, 36.1 } } } } },
+ { "paris_352_288_30.y4m",
+ { { 0, { { 0, 35.40 }, { 3, 36.0 } } },
+ { 1, { { 0, 35.50 }, { 3, 36.0 } } },
+ { 2, { { 0, 35.50 }, { 3, 36.0 } } },
+ { 3, { { 0, 35.50 }, { 3, 36.0 } } } } },
+ { "niklas_1280_720_30.y4m",
+ { { 0, { { 0, 33.20 }, { 3, 32.90 } } },
+ { 1, { { 0, 33.57 }, { 3, 33.22 } } },
+ { 2, { { 0, 33.57 }, { 3, 33.22 } } },
+ { 3, { { 0, 33.45 }, { 3, 33.10 } } } } } };
+
+typedef struct {
+ const char *filename;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+// Streams a readable form of |test_arg| (used when printing test parameters).
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  os << "TestVideoParam { filename:" << test_arg.filename;
+  os << " input_bit_depth:" << test_arg.input_bit_depth;
+  os << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth;
+  os << " profile:" << test_arg.profile << " }";
+  return os;
+}
+
+const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Params: test video, lf_control, aq mode, threads, tile columns.
+//
+// Real-time encode of a short clip with a given AV1E_SET_LOOPFILTER_CONTROL
+// setting; the average of psnr[0] over all PSNR packets must exceed the
+// threshold stored in kPsnrThreshold for (video, lf_control, aq mode).
+class LFControlEndToEndTest
+    : public ::libaom_test::CodecTestWith5Params<TestVideoParam, int,
+                                                 unsigned int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LFControlEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        lf_control_(GET_PARAM(2)), psnr_(0.0), nframes_(0),
+        aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
+        tile_columns_(GET_PARAM(5)) {}
+
+  ~LFControlEndToEndTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kRealTime);
+
+    cfg_.g_threads = threads_;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.kf_max_dist = 9999;
+    cfg_.kf_min_dist = 9999;
+  }
+
+  // Resets the PSNR accumulator at the start of every pass.
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  // Accumulates psnr[0] of each PSNR packet for the running average.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  // Applies all encoder controls once, before the first frame.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
+      encoder->Control(AOME_SET_CPUUSED, 10);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_ROW_MT, 1);
+      encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+      encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_LOOPFILTER_CONTROL, lf_control_);
+    }
+  }
+
+  // Mean of the accumulated PSNR values, or 0.0 if no packet was seen.
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  // Looks up the threshold for the current (video, lf_control, aq mode).
+  // at() is used instead of operator[] so that a combination missing from
+  // kPsnrThreshold fails loudly (std::out_of_range) instead of inserting a
+  // 0.0 entry that would make the PSNR check pass vacuously.
+  double GetPsnrThreshold() const {
+    return kPsnrThreshold.at(test_video_param_.filename)
+        .at(lf_control_)
+        .at(static_cast<int>(aq_mode_));
+  }
+
+  // Configures the encoder for the parameterized clip, runs the encode loop
+  // and compares the average PSNR against the threshold.
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+    ASSERT_NE(video, nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "loopfilter control = " << lf_control_ << " aq mode = " << aq_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  int lf_control_;
+
+ private:
+  double psnr_;           // Sum of psnr[0] over the PSNR packets of this pass.
+  unsigned int nframes_;  // Number of PSNR packets seen in this pass.
+  unsigned int aq_mode_;
+  int threads_;
+  int tile_columns_;
+};
+
+class LFControlEndToEndTestThreaded : public LFControlEndToEndTest {};
+
+TEST_P(LFControlEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(LFControlEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
+
+// AOME_GET_LOOPFILTER_LEVEL must reject a null output pointer with
+// AOM_CODEC_INVALID_PARAM.
+TEST(LFControlGetterTest, NullptrInput) {
+  int *lf_level = nullptr;
+  aom_codec_ctx_t encoder;
+  aom_codec_enc_cfg_t cfg;
+  // Setup failures are fatal: continuing would use an uninitialized
+  // cfg/encoder.
+  ASSERT_EQ(aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 1),
+            AOM_CODEC_OK);
+  ASSERT_EQ(aom_codec_enc_init(&encoder, aom_codec_av1_cx(), &cfg, 0),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&encoder, AOME_GET_LOOPFILTER_LEVEL, lf_level),
+            AOM_CODEC_INVALID_PARAM);
+  EXPECT_EQ(aom_codec_destroy(&encoder), AOM_CODEC_OK);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(LFControlEndToEndTest,
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Range(0, 4),
+ ::testing::Values<unsigned int>(0, 3),
+ ::testing::Values(1), ::testing::Values(1));
+
+AV1_INSTANTIATE_TEST_SUITE(LFControlEndToEndTestThreaded,
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Range(0, 4),
+ ::testing::Values<unsigned int>(0, 3),
+ ::testing::Range(2, 5), ::testing::Range(2, 5));
+} // namespace
diff --git a/third_party/aom/test/lossless_test.cc b/third_party/aom/test/lossless_test.cc
new file mode 100644
index 0000000000..756ad05019
--- /dev/null
+++ b/third_party/aom/test/lossless_test.cc
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPsnr = 100; // start value of the running minimum PSNR; the tests expect it never to drop below this
+
+// Encoder test fixture for lossless coding. It records the lowest psnr[0]
+// value delivered in PSNR packets and checks that every successfully decoded
+// frame reports a last-quantizer index of 0.
+// Test parameters: GET_PARAM(0) is forwarded to EncoderTest, GET_PARAM(1) is
+// the encoding mode, GET_PARAM(2) the rate-control mode and GET_PARAM(3) the
+// cpu_used setting.
+class LosslessTestLarge
+ : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+ aom_rc_mode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ LosslessTestLarge()
+ : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
+ encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)),
+ cpu_used_(GET_PARAM(3)), base_qindex_(-1) {}
+
+ ~LosslessTestLarge() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ cfg_.rc_end_usage = rc_end_usage_;
+ }
+
+ // Applies the per-stream encoder controls before the first frame only.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ // Only call Control if quantizer > 0 to verify that using quantizer
+ // alone will activate lossless
+ if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+ encoder->Control(AV1E_SET_LOSSLESS, 1);
+ }
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ }
+ }
+
+ // Restarts the running minimum at the beginning of every pass.
+ void BeginPassHook(unsigned int /*pass*/) override {
+ psnr_ = kMaxPsnr;
+ nframes_ = 0;
+ }
+
+ // Keeps the smallest psnr[0] value seen so far.
+ void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+ if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
+ }
+
+ double GetMinPsnr() const { return psnr_; }
+
+ // Returns true when decoding succeeded; on success also expects the
+ // decoder's last quantizer index to be 0.
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ // Start from a non-zero sentinel so that a control call which does not
+ // write the value cannot pass the check below on a stale or
+ // uninitialized 0.
+ base_qindex_ = -1;
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+ &base_qindex_);
+ EXPECT_EQ(base_qindex_, 0)
+ << "Error: Base_qindex is non zero for lossless coding";
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ void TestLosslessEncoding();
+ void TestLosslessEncodingVGALag0();
+ void TestLosslessEncoding444();
+ void TestLosslessEncodingCtrl();
+
+ private:
+ double psnr_; // lowest psnr[0] seen in the current pass
+ unsigned int nframes_; // reset per pass; not otherwise read in this class
+ libaom_test::TestMode encoding_mode_;
+ aom_rc_mode rc_end_usage_;
+ int cpu_used_;
+ int base_qindex_; // last value fetched via AOMD_GET_LAST_QUANTIZER, -1 if none
+};
+// Quantizer-only lossless: min and max quantizer are both 0, so PreEncodeFrameHook does not issue AV1E_SET_LOSSLESS.
+void LosslessTestLarge::TestLosslessEncoding() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ // intentionally changed the dimension for better testing coverage
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr(); // lowest psnr[0] recorded by PSNRPktHook
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+// Same quantizer-only setup as TestLosslessEncoding, but with g_lag_in_frames = 0 on a 640x480 source.
+void LosslessTestLarge::TestLosslessEncodingVGALag0() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr(); // lowest psnr[0] recorded by PSNRPktHook
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+// Quantizer-only lossless on the rush_hour_444.y4m source with g_profile = 1; the timebase is taken from the video source.
+void LosslessTestLarge::TestLosslessEncoding444() {
+ libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 5);
+
+ cfg_.g_profile = 1;
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr(); // lowest psnr[0] recorded by PSNRPktHook
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+// Control-driven lossless: the quantizer range is 10..20, so PreEncodeFrameHook issues AV1E_SET_LOSSLESS.
+void LosslessTestLarge::TestLosslessEncodingCtrl() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ // Intentionally set Q > 0, to make sure control can be used to activate
+ // lossless
+ cfg_.rc_min_quantizer = 10;
+ cfg_.rc_max_quantizer = 20;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr(); // lowest psnr[0] recorded by PSNRPktHook
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+// Parameterized entry points; each one forwards to the fixture member of the same name.
+TEST_P(LosslessTestLarge, TestLosslessEncoding) { TestLosslessEncoding(); }
+
+TEST_P(LosslessTestLarge, TestLosslessEncodingVGALag0) {
+ TestLosslessEncodingVGALag0();
+}
+
+TEST_P(LosslessTestLarge, TestLosslessEncoding444) {
+ TestLosslessEncoding444();
+}
+
+TEST_P(LosslessTestLarge, TestLosslessEncodingCtrl) {
+ TestLosslessEncodingCtrl();
+}
+// Empty subclass used as a separate suite; its test mirrors TestLosslessEncodingCtrl but does not set rc_target_bitrate or g_lag_in_frames.
+class LosslessAllIntraTestLarge : public LosslessTestLarge {};
+
+TEST_P(LosslessAllIntraTestLarge, TestLosslessEncodingCtrl) {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ // Intentionally set Q > 0, to make sure control can be used to activate
+ // lossless
+ cfg_.rc_min_quantizer = 10;
+ cfg_.rc_max_quantizer = 20;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr(); // lowest psnr[0] recorded by PSNRPktHook
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+// Type alias (not a subclass) of LosslessTestLarge, used as a distinct suite name for the kRealTime instantiation below.
+using LosslessRealtimeTestLarge = LosslessTestLarge;
+
+TEST_P(LosslessRealtimeTestLarge, TestLosslessEncoding) {
+ TestLosslessEncoding();
+}
+
+TEST_P(LosslessRealtimeTestLarge, TestLosslessEncodingVGALag0) {
+ TestLosslessEncodingVGALag0();
+}
+
+TEST_P(LosslessRealtimeTestLarge, TestLosslessEncoding444) {
+ TestLosslessEncoding444();
+}
+
+TEST_P(LosslessRealtimeTestLarge, TestLosslessEncodingCtrl) {
+ TestLosslessEncodingCtrl();
+}
+// Parameter order for all three suites: test mode, rate-control mode, cpu_used.
+AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ),
+ ::testing::Values(0)); // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(LosslessAllIntraTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(AOM_Q),
+ ::testing::Values(6, 9)); // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(LosslessRealtimeTestLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ),
+ ::testing::Range(6, 11)); // cpu_used 6..10 (end is exclusive)
+} // namespace
diff --git a/third_party/aom/test/lpf_test.cc b/third_party/aom/test/lpf_test.cc
new file mode 100644
index 0000000000..04b1c86d4d
--- /dev/null
+++ b/third_party/aom/test/lpf_test.cc
@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Horizontally and Vertically need 32x32: 8 Coeffs preceding filtered section
+// 16 Coeffs within filtered section
+// 8 Coeffs following filtered section
+const int kNumCoeffs = 1024; // 32 * 32 samples per test buffer
+
+const int number_of_iterations = 10000; // random cases per OperationCheck / ValueCheck
+
+const int kSpeedTestNum = 500000; // filter invocations per DISABLED_Speed test
+// Parameter-list fragments shared by the filter typedefs and call wrappers: p (used as the row stride by the callers below) plus one (LOOP_PARAM) or two (DUAL_LOOP_PARAM) sets of blimit/limit/thresh pointers.
+#define LOOP_PARAM \
+ int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
+#define DUAL_LOOP_PARAM \
+ int p, const uint8_t *blimit0, const uint8_t *limit0, \
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
+ const uint8_t *thresh1
+
+// Signatures of the filters under test: one or two threshold sets, for
+// uint8_t samples or for uint16_t samples with an explicit bit depth.
+using loop_op_t = void (*)(uint8_t *s, LOOP_PARAM);
+using dual_loop_op_t = void (*)(uint8_t *s, DUAL_LOOP_PARAM);
+using hbdloop_op_t = void (*)(uint16_t *s, LOOP_PARAM, int bd);
+using hbddual_loop_op_t = void (*)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
+
+// Test parameter tuples: (function under test, reference function, bit depth).
+using hbdloop_param_t = std::tuple<hbdloop_op_t, hbdloop_op_t, int>;
+using hbddual_loop_param_t =
+ std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>;
+using loop_param_t = std::tuple<loop_op_t, loop_op_t, int>;
+using dual_loop_param_t = std::tuple<dual_loop_op_t, dual_loop_op_t, int>;
+// Fills s and ref_s with identical kNumCoeffs samples (masked with `mask`): random values interleaved with runs that step by limit - 1, first along rows and then along columns.
+template <typename Pixel_t, int PIXEL_WIDTH_t> // PIXEL_WIDTH_t is not referenced in the body
+void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
+ const int mask, const int32_t p, const int i) {
+ uint16_t tmp_s[kNumCoeffs]; // scratch buffer, always built at 16 bits and masked on copy-out
+
+ for (int j = 0; j < kNumCoeffs;) {
+ const uint8_t val = rnd->Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp_s[j] = rnd->Rand16();
+ j++;
+ } else { // 50% chance to repeat previous value in row X times.
+ int k = 0;
+ while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { // run length 1..32, clipped at the buffer end
+ if (j < 1) {
+ tmp_s[j] = rnd->Rand16();
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
+ } else { // Decrement by a value within the limit.
+ tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
+ }
+ j++;
+ }
+ }
+ }
+
+ for (int j = 0; j < kNumCoeffs;) { // second pass: j is mapped to a transposed 32x32 index
+ const uint8_t val = rnd->Rand8();
+ if (val & 0x80) { // keep the value written by the first pass
+ j++;
+ } else { // 50% chance to repeat previous value in column X times.
+ int k = 0;
+ while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+ if (j < 1) {
+ tmp_s[j] = rnd->Rand16();
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+ tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
+ } else { // Decrement by a value within the limit.
+ tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+ tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
+ }
+ j++;
+ }
+ }
+ }
+
+ for (int j = 0; j < kNumCoeffs; j++) {
+ if (i % 2) { // odd i: copy in order
+ s[j] = tmp_s[j] & mask;
+ } else { // even i: copy with row and column swapped, using p as the row length
+ s[j] = tmp_s[p * (j % p) + j / p] & mask;
+ }
+ ref_s[j] = s[j];
+ }
+}
+
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+ // Random value for the blimit arrays, drawn with a range argument of
+ // 3 * MAX_LOOP_FILTER + 5 and narrowed to a byte.
+ const auto outer = rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5);
+ return static_cast<uint8_t>(outer);
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+ // Random value for the limit arrays, drawn with a range argument of
+ // MAX_LOOP_FILTER + 1 and narrowed to a byte.
+ const auto inner = rnd->PseudoUniform(MAX_LOOP_FILTER + 1);
+ return static_cast<uint8_t>(inner);
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+ // Random value for the thresh arrays: the same draw as GetInnerThresh,
+ // shifted right by 4 before narrowing to a byte.
+ const auto level = rnd->PseudoUniform(MAX_LOOP_FILTER + 1);
+ return static_cast<uint8_t>(level >> 4);
+}
+
+// Fixture template shared by all loop-filter tests. func_type_t is the filter
+// function-pointer type and params_t the matching parameter tuple.
+template <typename func_type_t, typename params_t>
+class LoopTestParam : public ::testing::TestWithParam<params_t> {
+ public:
+ ~LoopTestParam() override = default;
+
+ // Unpacks the parameter tuple: <0> function under test, <1> reference
+ // function, <2> bit depth; then derives the sample mask from the bit depth.
+ void SetUp() override {
+ const params_t &params = this->GetParam();
+ loopfilter_op_ = std::get<0>(params);
+ ref_loopfilter_op_ = std::get<1>(params);
+ bit_depth_ = std::get<2>(params);
+ const int one_past_max = 1 << bit_depth_;
+ mask_ = one_past_max - 1;
+ }
+
+ protected:
+ int bit_depth_;
+ int mask_; // low bit_depth_ bits set
+ func_type_t loopfilter_op_;
+ func_type_t ref_loopfilter_op_;
+};
+
+// 8-bit overloads: the bit-depth argument is accepted so callers can use one
+// call shape for both sample widths, and is ignored here.
+void call_filter(uint8_t *s, LOOP_PARAM, int /*bd*/, loop_op_t op) {
+ op(s, p, blimit, limit, thresh);
+}
+void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int /*bd*/,
+ dual_loop_op_t op) {
+ op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+// 16-bit overloads: bd is forwarded to the filter as its last argument.
+void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
+ op(s, p, blimit, limit, thresh, bd);
+}
+void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
+ hbddual_loop_op_t op) {
+ op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
+}
+#endif
+
+// Concrete fixtures. The Test6 names take a single threshold set, the Test9
+// names take two; each is allowed to have no instantiation on a given build.
+#if CONFIG_AV1_HIGHBITDEPTH
+using Loop8Test6Param_hbd = LoopTestParam<hbdloop_op_t, hbdloop_param_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
+using Loop8Test9Param_hbd =
+ LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
+#endif
+using Loop8Test6Param_lbd = LoopTestParam<loop_op_t, loop_param_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_lbd);
+using Loop8Test9Param_lbd = LoopTestParam<dual_loop_op_t, dual_loop_param_t>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_lbd);
+// OPCHECK(a, b): for number_of_iterations cases, runs the reference and tested filters on identical InitInput-generated buffers of pixel type a (alignment b) and expects zero differing samples.
+#define OPCHECK(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ const int32_t p = kNumCoeffs / 32; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i); \
+ call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
+ ref_loopfilter_op_); \
+ API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
+ thresh, bit_depth_, \
+ loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test6Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure; \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
+// VALCHECK(a, b): same reference-vs-tested comparison as OPCHECK, but each buffer sample is an independent rnd.Rand16() & mask_ value instead of InitInput output.
+#define VALCHECK(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ ref_s[j] = s[j]; \
+ } \
+ call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
+ ref_loopfilter_op_); \
+ API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
+ thresh, bit_depth_, \
+ loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test6Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure; \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
+// SPEEDCHECK(a, b): calls the tested filter kSpeedTestNum times on one random buffer; no output is compared and no timing is taken inside the macro.
+#define SPEEDCHECK(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = kSpeedTestNum; \
+ const int32_t bd = bit_depth_; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ } \
+ for (int i = 0; i < count_test_block; ++i) { \
+ call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, \
+ loopfilter_op_); \
+ } \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
+// OPCHECKd(a, b): dual-filter form of OPCHECK with two independent threshold sets; InitInput is driven by the smaller of the two limit values.
+#define OPCHECKd(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1; \
+ InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i); \
+ call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
+ API_REGISTER_STATE_CHECK( \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test9Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure; \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
+// VALCHECKd(a, b): dual-filter form of VALCHECK; buffers hold independent rnd.Rand16() & mask_ values and two threshold sets are passed to each filter.
+#define VALCHECKd(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED( \
+ 16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ ref_s[j] = s[j]; \
+ } \
+ call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
+ API_REGISTER_STATE_CHECK( \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test9Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure; \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
+// SPEEDCHECKd(a, b): dual-filter form of SPEEDCHECK; calls the tested filter kSpeedTestNum times with two threshold sets and verifies nothing.
+#define SPEEDCHECKd(a, b) \
+ do { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = kSpeedTestNum; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ } \
+ for (int i = 0; i < count_test_block; ++i) { \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_); \
+ } \
+ } while (false)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
+#endif
+TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); }
+
+using std::make_tuple;
+
+#if HAVE_SSE2
+#if CONFIG_AV1_HIGHBITDEPTH
+const hbdloop_param_t kHbdLoop8Test6[] = {
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
+
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd,
+ ::testing::ValuesIn(kHbdLoop8Test6));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+const loop_param_t kLoop8Test6[] = {
+ make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
+ make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
+ make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
+ make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
+ make_tuple(&aom_lpf_horizontal_4_quad_sse2, &aom_lpf_horizontal_4_quad_c, 8),
+ make_tuple(&aom_lpf_vertical_4_quad_sse2, &aom_lpf_vertical_4_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_quad_sse2, &aom_lpf_horizontal_6_quad_c, 8),
+ make_tuple(&aom_lpf_vertical_6_quad_sse2, &aom_lpf_vertical_6_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_quad_sse2, &aom_lpf_horizontal_8_quad_c, 8),
+ make_tuple(&aom_lpf_vertical_8_quad_sse2, &aom_lpf_vertical_8_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_quad_sse2, &aom_lpf_horizontal_14_quad_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_quad_sse2, &aom_lpf_vertical_14_quad_c, 8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd,
+ ::testing::ValuesIn(kLoop8Test6));
+
+const dual_loop_param_t kLoop8Test9[] = {
+ make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd,
+ ::testing::ValuesIn(kLoop8Test9));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+const loop_param_t kLoop8Test6Avx2[] = {
+ make_tuple(&aom_lpf_horizontal_6_quad_avx2, &aom_lpf_horizontal_6_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_quad_avx2, &aom_lpf_horizontal_8_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_quad_avx2, &aom_lpf_horizontal_14_quad_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_quad_avx2, &aom_lpf_vertical_14_quad_c, 8),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test6Param_lbd,
+ ::testing::ValuesIn(kLoop8Test6Avx2));
+#endif
+
+#if HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH
+const hbddual_loop_param_t kHbdLoop8Test9[] = {
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd,
+ ::testing::ValuesIn(kHbdLoop8Test9));
+
+#endif // HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH
+
+#if HAVE_NEON
+const loop_param_t kLoop8Test6[] = {  // NEON low-bitdepth loop filters, each paired with its C counterpart.
+ make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),  // third field is 8 in every row; presumably the bit depth - TODO confirm against loop_param_t
+ make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
+ make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8),
+ make_tuple(&aom_lpf_horizontal_4_quad_neon, &aom_lpf_horizontal_4_quad_c, 8),  // the _quad variants share this table with the plain ones
+ make_tuple(&aom_lpf_vertical_4_quad_neon, &aom_lpf_vertical_4_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_quad_neon, &aom_lpf_horizontal_6_quad_c, 8),
+ make_tuple(&aom_lpf_vertical_6_quad_neon, &aom_lpf_vertical_6_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_quad_neon, &aom_lpf_horizontal_8_quad_c, 8),
+ make_tuple(&aom_lpf_vertical_8_quad_neon, &aom_lpf_vertical_8_quad_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_quad_neon, &aom_lpf_horizontal_14_quad_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_quad_neon, &aom_lpf_vertical_14_quad_c, 8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd,  // one test instance per row of kLoop8Test6
+ ::testing::ValuesIn(kLoop8Test6));
+
+const dual_loop_param_t kLoop8Test9[] = {  // NEON low-bitdepth _dual loop filters, each paired with its C counterpart.
+ make_tuple(&aom_lpf_horizontal_4_dual_neon, &aom_lpf_horizontal_4_dual_c, 8),  // third field is 8 in every row; presumably the bit depth - TODO confirm
+ make_tuple(&aom_lpf_horizontal_6_dual_neon, &aom_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_dual_neon, &aom_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_dual_neon, &aom_lpf_horizontal_14_dual_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_4_dual_neon, &aom_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_6_dual_neon, &aom_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_8_dual_neon, &aom_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_14_dual_neon, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test9Param_lbd,  // one test instance per row of kLoop8Test9
+ ::testing::ValuesIn(kLoop8Test9));
+#if CONFIG_AV1_HIGHBITDEPTH
+const hbdloop_param_t kHbdLoop8Test6[] = {  // NEON high-bitdepth loop filters vs. C; every filter appears with 8, 10 and 12.
+ make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+ 8),  // third field presumably the bit depth - TODO confirm against hbdloop_param_t
+ make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_neon, &aom_highbd_lpf_horizontal_4_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_neon, &aom_highbd_lpf_horizontal_6_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_neon, &aom_highbd_lpf_horizontal_8_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+ &aom_highbd_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+ &aom_highbd_lpf_horizontal_14_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_neon,
+ &aom_highbd_lpf_horizontal_14_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_neon, &aom_highbd_lpf_vertical_4_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_neon, &aom_highbd_lpf_vertical_6_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_neon, &aom_highbd_lpf_vertical_8_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_neon, &aom_highbd_lpf_vertical_14_c,
+ 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_hbd,  // one test instance per row of kHbdLoop8Test6
+ ::testing::ValuesIn(kHbdLoop8Test6));
+
+const hbddual_loop_param_t kHbdLoop8Test9[] = {  // NEON high-bitdepth _dual loop filters vs. C, grouped by third field: 8, then 10, then 12.
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+ &aom_highbd_lpf_horizontal_4_dual_c, 8),  // third field presumably the bit depth - TODO confirm
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+ &aom_highbd_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+ &aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+ &aom_highbd_lpf_horizontal_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+ &aom_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+ &aom_highbd_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+ &aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+ &aom_highbd_lpf_vertical_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+ &aom_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+ &aom_highbd_lpf_horizontal_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+ &aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+ &aom_highbd_lpf_horizontal_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+ &aom_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+ &aom_highbd_lpf_vertical_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+ &aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+ &aom_highbd_lpf_vertical_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_neon,
+ &aom_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_neon,
+ &aom_highbd_lpf_horizontal_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_neon,
+ &aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_neon,
+ &aom_highbd_lpf_horizontal_14_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_neon,
+ &aom_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_neon,
+ &aom_highbd_lpf_vertical_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_neon,
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_neon,
+ &aom_highbd_lpf_vertical_14_dual_c, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test9Param_hbd,  // one test instance per row of kHbdLoop8Test9
+ ::testing::ValuesIn(kHbdLoop8Test9));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_NEON
+
+#if HAVE_AVX2 && CONFIG_AV1_HIGHBITDEPTH
+const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {  // AVX2 high-bitdepth _dual filters vs. C; only the _4_ and _8_ variants are listed here.
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 8),  // each filter appears with 8, 10 and 12 (presumably bit depths - TODO confirm)
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd,  // one test instance per row of kHbdLoop8Test9Avx2
+ ::testing::ValuesIn(kHbdLoop8Test9Avx2));
+#endif
+} // namespace
diff --git a/third_party/aom/test/masked_sad_test.cc b/third_party/aom/test/masked_sad_test.cc
new file mode 100644
index 0000000000..bb037460d1
--- /dev/null
+++ b/third_party/aom/test/masked_sad_test.cc
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,  // single-reference masked SAD kernel; the SAD is the return value
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef std::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;  // (function under test, reference function)
+
+typedef void (*MaskedSADx4Func)(const uint8_t *src, int src_stride,  // multi-reference kernel: takes an array of refs, writes results into sads[]
+ const uint8_t *ref[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int invert_mask,
+ unsigned sads[]);
+
+typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;  // (function under test, reference function)
+
+class MaskedSADTestBase : public ::testing::Test {  // Shared driver for the single- and four-reference masked SAD suites.
+ public:
+ ~MaskedSADTestBase() override = default;
+ void SetUp() override = 0;  // pure virtual: each subclass loads its function pair from the test parameter
+ virtual void runRef(const uint8_t *src_ptr, int src_stride,  // calls the reference implementation `times` times, results in sads[]
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) = 0;
+ virtual void runTest(const uint8_t *src_ptr, int src_stride,  // calls the implementation under test `times` times, results in sads[]
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) = 0;
+
+ void runMaskedSADTest(int run_times);  // builds the inputs, invokes runRef/runTest and compares their sads[]
+};
+
+class MaskedSADTest : public MaskedSADTestBase,  // Parameterized suite for the single-reference masked SAD kernels.
+ public ::testing::WithParamInterface<MaskedSADParam> {
+ public:
+ ~MaskedSADTest() override = default;
+ void SetUp() override {
+ maskedSAD_op_ = GET_PARAM(0);  // tuple element 0: function under test
+ ref_maskedSAD_op_ = GET_PARAM(1);  // tuple element 1: function it is compared against
+ }
+
+ void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+ void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+
+ protected:
+ MaskedSADFunc maskedSAD_op_;
+ MaskedSADFunc ref_maskedSAD_op_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADTest);  // suite may end up with no instantiation, e.g. if none of the HAVE_* blocks below is enabled
+
+class MaskedSADx4Test : public MaskedSADTestBase,  // Parameterized suite for the multi-reference (x4d) masked SAD kernels.
+ public ::testing::WithParamInterface<MaskedSADx4Param> {
+ public:
+ ~MaskedSADx4Test() override = default;
+ void SetUp() override {
+ maskedSAD_op_ = GET_PARAM(0);  // tuple element 0: function under test
+ ref_maskedSAD_op_ = GET_PARAM(1);  // tuple element 1: function it is compared against
+ }
+ void runRef(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+ void runTest(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr[],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int inv_mask, unsigned sads[],
+ int times) override;
+
+ protected:
+ MaskedSADx4Func maskedSAD_op_;
+ MaskedSADx4Func ref_maskedSAD_op_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSADx4Test);  // suite may end up with no instantiation, e.g. if none of the HAVE_* blocks below is enabled
+
+void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int invert_mask, unsigned sads[],
+ int times) {  // Reference side: score ref_ptr[0] only and leave the result in sads[0].
+ for (int remaining = times; remaining > 0; --remaining) {  // same call count as 0..times-1
+ sads[0] = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+ second_pred, msk, msk_stride, invert_mask);
+ }
+}
+
+void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int invert_mask, unsigned sads[],
+ int times) {  // Runs the kernel under test on ref_ptr[0]; the result is left in sads[0].
+ if (times == 1) {  // correctness run: wrap the single call in the register-state check, as MaskedSADx4Test::runTest does
+ API_REGISTER_STATE_CHECK(
+ sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+ second_pred, msk, msk_stride, invert_mask));
+ } else {  // timing run: bare calls, so the check macro does not add to the measured time
+ for (int repeat = 0; repeat < times; ++repeat) {
+ sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
+ second_pred, msk, msk_stride, invert_mask);
+ }
+ }
+}
+
+void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int invert_mask, unsigned sads[],
+ int times) {  // Reference side: the kernel receives the whole ref_ptr array and fills sads[] itself.
+ for (int remaining = times; remaining > 0; --remaining) {  // same call count as 0..times-1
+ ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred,
+ msk, msk_stride, invert_mask, sads);
+ }
+}
+
+void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr[], int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk,
+ int msk_stride, int invert_mask, unsigned sads[],
+ int times) {  // Runs the kernel under test; it fills sads[] from the ref_ptr array.
+ if (times == 1) {  // single correctness call, wrapped in the register-state check
+ API_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred, msk,
+ msk_stride, invert_mask, sads));
+ return;
+ }
+ for (int remaining = times; remaining > 0; --remaining) {  // timing run: bare calls only
+ maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk,
+ msk_stride, invert_mask, sads);
+ }
+}
+
+void MaskedSADTestBase::runMaskedSADTest(int run_times) {  // run_times == 1: correctness sweep over many data sets; larger values: timing run on one data set
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE;
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE * 4]);  // four reference blocks stored back to back
+ DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+
+ const uint8_t *refs[] = { ref_ptr, ref_ptr + kBlockSize,
+ ref_ptr + 2 * kBlockSize,
+ ref_ptr + 3 * kBlockSize };
+ unsigned sads[] = { 0, 0, 0, 0 };  // a subclass that only writes sads[0] leaves entries 1..3 at 0 in both arrays
+ unsigned sads_ref[] = { 0, 0, 0, 0 };
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ const int iters = run_times == 1 ? number_of_iterations : 1;
+ for (int i = 0; i < iters; ++i) {
+ if (run_times == 1 && i == 0) {
+ // The maximum accumulator value occurs when src=0 and
+ // ref/second_pred=255 (or vice-versa, since we take the absolute
+ // difference). Check this case explicitly to ensure we do not overflow
+ // during accumulation.
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = 0;
+ ref_ptr[j] = 255;
+ (ref_ptr + kBlockSize)[j] = 255;
+ (ref_ptr + 2 * kBlockSize)[j] = 255;
+ (ref_ptr + 3 * kBlockSize)[j] = 255;
+ second_pred_ptr[j] = 255;
+ }
+ } else {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
+ (ref_ptr + kBlockSize)[j] = rnd.Rand8();
+ (ref_ptr + 2 * kBlockSize)[j] = rnd.Rand8();
+ (ref_ptr + 3 * kBlockSize)[j] = rnd.Rand8();
+ second_pred_ptr[j] = rnd.Rand8();
+ }
+ }
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;  // either 64 or a random value in [0, 63]
+ assert(msk_ptr[j] <= 64);
+ }
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {  // exercise both mask polarities on the same data
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ runRef(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, sads_ref, run_times);
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer_start(&timer);
+ runTest(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, sads, run_times);
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ if (run_times > 10) {
+ printf("%7.2f/%7.2fus", time1, time2);  // totals come from aom_usec_timer, so label them as microseconds
+ printf("(%3.2f)\n", time1 / time2);  // reference time / tested time
+ }
+ if (sads_ref[0] != sads[0] || sads_ref[1] != sads[1] ||
+ sads_ref[2] != sads[2] || sads_ref[3] != sads[3]) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ EXPECT_EQ(0, err_count) << "Error: Masked SAD Test, output doesn't match. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); }  // run_times == 1 selects the correctness sweep
+
+TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); }  // timing run; DISABLED_ prefix keeps it out of default runs
+
+TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); }  // run_times == 1 selects the correctness sweep
+
+TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); }  // timing run; DISABLED_ prefix keeps it out of default runs
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,  // same parameter list as MaskedSADFunc; the tests pass CONVERT_TO_BYTEPTR()'d uint16_t buffers
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef std::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>  // (function under test, reference function)
+ HighbdMaskedSADParam;
+
+class HighbdMaskedSADTest  // Parameterized suite for the high-bitdepth masked SAD kernels.
+ : public ::testing::TestWithParam<HighbdMaskedSADParam> {
+ public:
+ ~HighbdMaskedSADTest() override = default;
+ void SetUp() override {
+ maskedSAD_op_ = GET_PARAM(0);  // tuple element 0: function under test
+ ref_maskedSAD_op_ = GET_PARAM(1);  // tuple element 1: function it is compared against
+ }
+
+ void runHighbdMaskedSADTest(int run_times);  // run_times == 1: correctness sweep; larger: timing run
+
+ protected:
+ HighbdMaskedSADFunc maskedSAD_op_;
+ HighbdMaskedSADFunc ref_maskedSAD_op_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSADTest);  // suite may end up with no instantiation, e.g. if none of the HAVE_* blocks below is enabled
+
+void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {  // run_times == 1: correctness sweep over many data sets; larger values: timing run on one data set
+ unsigned int ref_ret = 0, ret = 1;  // start out different so a path that never assigns them is counted as a mismatch
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);  // the kernels take uint8_t * handles onto the 16-bit buffers
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ const int iters = run_times == 1 ? number_of_iterations : 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand16() & 0xfff;  // samples limited to 12 bits
+ ref_ptr[j] = rnd.Rand16() & 0xfff;
+ second_pred_ptr[j] = rnd.Rand16() & 0xfff;
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;  // either 64 or a random value in [0, 63]
+ }
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {  // exercise both mask polarities on the same data
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ if (run_times == 1) {  // correctness run: single call under the register-state check
+ API_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+ ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr,
+ msk_stride, invert_mask));
+ } else {  // timing run: bare calls only
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ret =
+ maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+ }
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("%7.2f/%7.2fus", time1, time2);  // totals come from aom_usec_timer, so label them as microseconds
+ printf("(%3.2f)\n", time1 / time2);  // reference time / tested time
+ }
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ EXPECT_EQ(0, err_count)
+ << "Error: High BD Masked SAD Test, output doesn't match. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }  // run_times == 1 selects the correctness sweep
+
+TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }  // timing run; DISABLED_ prefix keeps it out of default runs
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+using std::make_tuple;
+
+#if HAVE_SSSE3
+const MaskedSADParam msad_test[] = {  // SSSE3 masked SAD kernels, each checked against the C version of the same block size.
+ make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
+ make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
+ make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+ make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+ make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+ make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+ make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+ make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+ make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+ make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+ make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+ make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+ make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+ make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+ make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+ make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
+ make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
+ make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
+ make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
+ make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
+ make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));  // one test instance per row
+
+const MaskedSADx4Param msadx4_test[] = {  // SSSE3 x4d masked SAD kernels, each checked against the C version of the same block size.
+ make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c),
+ make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c),
+ make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c),
+ make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c),
+ make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c),
+ make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c),
+ make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c),
+ make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c),
+ make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c),
+ make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c),
+ make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c),
+ make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c),
+ make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c),
+ make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c),
+ make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c),
+ make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c),
+ make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c),
+ make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c),
+ make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c),
+ make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c),
+ make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test,  // one test instance per row of msadx4_test
+ ::testing::ValuesIn(msadx4_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSADParam hbd_msad_test[] = {  // SSSE3 high-bitdepth masked SAD kernels, each checked against the C version.
+ make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c),
+ make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
+ make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c),
+ make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c),
+ make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c),
+ make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c),
+ make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c),
+ make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c),
+ make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c),
+ make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c),
+ make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c),
+ make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c),
+ make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c),
+ make_tuple(&aom_highbd_masked_sad64x128_ssse3,
+ &aom_highbd_masked_sad64x128_c),
+ make_tuple(&aom_highbd_masked_sad128x64_ssse3,
+ &aom_highbd_masked_sad128x64_c),
+ make_tuple(&aom_highbd_masked_sad128x128_ssse3,
+ &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c),
+ make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c),
+ make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c),
+ make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c),
+ make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c),
+ make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest,  // one test instance per row of hbd_msad_test
+ ::testing::ValuesIn(hbd_msad_test));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+const MaskedSADParam msad_avx2_test[] = {  // AVX2 masked SAD kernels; note the reference column is the SSSE3 version, not the C one.
+ make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3),
+ make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3),
+ make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3),
+ make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3),
+ make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3),
+ make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3),
+ make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3),
+ make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3),
+ make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3),
+ make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3),
+ make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3),
+ make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3),
+ make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3),
+ make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
+ make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
+ make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
+ make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
+ make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
+ make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
+ make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
+ make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest,  // one test instance per row of msad_avx2_test
+ ::testing::ValuesIn(msad_avx2_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSADParam hbd_msad_avx2_test[] = {  // AVX2 high-bitdepth masked SAD kernels; the reference column is the SSSE3 version, not the C one.
+ make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x16_avx2,
+ &aom_highbd_masked_sad16x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x32_avx2,
+ &aom_highbd_masked_sad16x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x16_avx2,
+ &aom_highbd_masked_sad32x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x32_avx2,
+ &aom_highbd_masked_sad32x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x64_avx2,
+ &aom_highbd_masked_sad32x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x32_avx2,
+ &aom_highbd_masked_sad64x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x64_avx2,
+ &aom_highbd_masked_sad64x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x128_avx2,
+ &aom_highbd_masked_sad64x128_ssse3),
+ make_tuple(&aom_highbd_masked_sad128x64_avx2,
+ &aom_highbd_masked_sad128x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad128x128_avx2,
+ &aom_highbd_masked_sad128x128_ssse3),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x64_avx2,
+ &aom_highbd_masked_sad16x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x16_avx2,
+ &aom_highbd_masked_sad64x16_ssse3)
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest,  // one test instance per row of hbd_msad_avx2_test
+ ::testing::ValuesIn(hbd_msad_avx2_test));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+const MaskedSADParam msad_test[] = {  // NEON masked SAD kernels vs. C; reuses the name of the table in the HAVE_SSSE3 block.
+ make_tuple(&aom_masked_sad4x4_neon, &aom_masked_sad4x4_c),
+ make_tuple(&aom_masked_sad4x8_neon, &aom_masked_sad4x8_c),
+ make_tuple(&aom_masked_sad8x4_neon, &aom_masked_sad8x4_c),
+ make_tuple(&aom_masked_sad8x8_neon, &aom_masked_sad8x8_c),
+ make_tuple(&aom_masked_sad8x16_neon, &aom_masked_sad8x16_c),
+ make_tuple(&aom_masked_sad16x8_neon, &aom_masked_sad16x8_c),
+ make_tuple(&aom_masked_sad16x16_neon, &aom_masked_sad16x16_c),
+ make_tuple(&aom_masked_sad16x32_neon, &aom_masked_sad16x32_c),
+ make_tuple(&aom_masked_sad32x16_neon, &aom_masked_sad32x16_c),
+ make_tuple(&aom_masked_sad32x32_neon, &aom_masked_sad32x32_c),
+ make_tuple(&aom_masked_sad32x64_neon, &aom_masked_sad32x64_c),
+ make_tuple(&aom_masked_sad64x32_neon, &aom_masked_sad64x32_c),
+ make_tuple(&aom_masked_sad64x64_neon, &aom_masked_sad64x64_c),
+ make_tuple(&aom_masked_sad64x128_neon, &aom_masked_sad64x128_c),
+ make_tuple(&aom_masked_sad128x64_neon, &aom_masked_sad128x64_c),
+ make_tuple(&aom_masked_sad128x128_neon, &aom_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_masked_sad4x16_neon, &aom_masked_sad4x16_c),
+ make_tuple(&aom_masked_sad16x4_neon, &aom_masked_sad16x4_c),
+ make_tuple(&aom_masked_sad8x32_neon, &aom_masked_sad8x32_c),
+ make_tuple(&aom_masked_sad32x8_neon, &aom_masked_sad32x8_c),
+ make_tuple(&aom_masked_sad16x64_neon, &aom_masked_sad16x64_c),
+ make_tuple(&aom_masked_sad64x16_neon, &aom_masked_sad64x16_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADTest, ::testing::ValuesIn(msad_test));  // one test instance per row
+
+const MaskedSADx4Param msadx4_test[] = {  // NEON x4d masked SAD kernels vs. C; reuses the name of the table in the HAVE_SSSE3 block.
+ make_tuple(&aom_masked_sad4x4x4d_neon, &aom_masked_sad4x4x4d_c),
+ make_tuple(&aom_masked_sad4x8x4d_neon, &aom_masked_sad4x8x4d_c),
+ make_tuple(&aom_masked_sad8x4x4d_neon, &aom_masked_sad8x4x4d_c),
+ make_tuple(&aom_masked_sad8x8x4d_neon, &aom_masked_sad8x8x4d_c),
+ make_tuple(&aom_masked_sad8x16x4d_neon, &aom_masked_sad8x16x4d_c),
+ make_tuple(&aom_masked_sad16x8x4d_neon, &aom_masked_sad16x8x4d_c),
+ make_tuple(&aom_masked_sad16x16x4d_neon, &aom_masked_sad16x16x4d_c),
+ make_tuple(&aom_masked_sad16x32x4d_neon, &aom_masked_sad16x32x4d_c),
+ make_tuple(&aom_masked_sad32x16x4d_neon, &aom_masked_sad32x16x4d_c),
+ make_tuple(&aom_masked_sad32x32x4d_neon, &aom_masked_sad32x32x4d_c),
+ make_tuple(&aom_masked_sad32x64x4d_neon, &aom_masked_sad32x64x4d_c),
+ make_tuple(&aom_masked_sad64x32x4d_neon, &aom_masked_sad64x32x4d_c),
+ make_tuple(&aom_masked_sad64x64x4d_neon, &aom_masked_sad64x64x4d_c),
+ make_tuple(&aom_masked_sad64x128x4d_neon, &aom_masked_sad64x128x4d_c),
+ make_tuple(&aom_masked_sad128x64x4d_neon, &aom_masked_sad128x64x4d_c),
+ make_tuple(&aom_masked_sad128x128x4d_neon, &aom_masked_sad128x128x4d_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_masked_sad4x16x4d_neon, &aom_masked_sad4x16x4d_c),
+ make_tuple(&aom_masked_sad16x4x4d_neon, &aom_masked_sad16x4x4d_c),
+ make_tuple(&aom_masked_sad8x32x4d_neon, &aom_masked_sad8x32x4d_c),
+ make_tuple(&aom_masked_sad32x8x4d_neon, &aom_masked_sad32x8x4d_c),
+ make_tuple(&aom_masked_sad16x64x4d_neon, &aom_masked_sad16x64x4d_c),
+ make_tuple(&aom_masked_sad64x16x4d_neon, &aom_masked_sad64x16x4d_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, MaskedSADx4Test,  // one test instance per row of msadx4_test
+ ::testing::ValuesIn(msadx4_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSADParam hbd_msad_neon_test[] = {  // NEON high-bitdepth masked SAD kernels vs. C; typed with the parameter type HighbdMaskedSADTest declares
+ make_tuple(&aom_highbd_masked_sad4x4_neon, &aom_highbd_masked_sad4x4_c),
+ make_tuple(&aom_highbd_masked_sad4x8_neon, &aom_highbd_masked_sad4x8_c),
+ make_tuple(&aom_highbd_masked_sad8x4_neon, &aom_highbd_masked_sad8x4_c),
+ make_tuple(&aom_highbd_masked_sad8x8_neon, &aom_highbd_masked_sad8x8_c),
+ make_tuple(&aom_highbd_masked_sad8x16_neon, &aom_highbd_masked_sad8x16_c),
+ make_tuple(&aom_highbd_masked_sad16x8_neon, &aom_highbd_masked_sad16x8_c),
+ make_tuple(&aom_highbd_masked_sad16x16_neon, &aom_highbd_masked_sad16x16_c),
+ make_tuple(&aom_highbd_masked_sad16x32_neon, &aom_highbd_masked_sad16x32_c),
+ make_tuple(&aom_highbd_masked_sad32x16_neon, &aom_highbd_masked_sad32x16_c),
+ make_tuple(&aom_highbd_masked_sad32x32_neon, &aom_highbd_masked_sad32x32_c),
+ make_tuple(&aom_highbd_masked_sad32x64_neon, &aom_highbd_masked_sad32x64_c),
+ make_tuple(&aom_highbd_masked_sad64x32_neon, &aom_highbd_masked_sad64x32_c),
+ make_tuple(&aom_highbd_masked_sad64x64_neon, &aom_highbd_masked_sad64x64_c),
+ make_tuple(&aom_highbd_masked_sad64x128_neon, &aom_highbd_masked_sad64x128_c),
+ make_tuple(&aom_highbd_masked_sad128x64_neon, &aom_highbd_masked_sad128x64_c),
+ make_tuple(&aom_highbd_masked_sad128x128_neon,
+ &aom_highbd_masked_sad128x128_c),
+#if !CONFIG_REALTIME_ONLY  // the 1:4 and 4:1 block sizes are only listed outside realtime-only builds
+ make_tuple(&aom_highbd_masked_sad4x16_neon, &aom_highbd_masked_sad4x16_c),
+ make_tuple(&aom_highbd_masked_sad16x4_neon, &aom_highbd_masked_sad16x4_c),
+ make_tuple(&aom_highbd_masked_sad8x32_neon, &aom_highbd_masked_sad8x32_c),
+ make_tuple(&aom_highbd_masked_sad32x8_neon, &aom_highbd_masked_sad32x8_c),
+ make_tuple(&aom_highbd_masked_sad16x64_neon, &aom_highbd_masked_sad16x64_c),
+ make_tuple(&aom_highbd_masked_sad64x16_neon, &aom_highbd_masked_sad64x16_c),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdMaskedSADTest,  // one test instance per row of hbd_msad_neon_test
+ ::testing::ValuesIn(hbd_msad_neon_test));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/masked_variance_test.cc b/third_party/aom/test/masked_variance_test.cc
new file mode 100644
index 0000000000..8482a12f53
--- /dev/null
+++ b/third_party/aom/test/masked_variance_test.cc
@@ -0,0 +1,712 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200; // random-data rounds per OperationCheck
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+ MaskedSubPixelVarianceParam; // (optimized function, C reference function)
+
+class MaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+ ~MaskedSubPixelVarianceTest() override = default;
+ void SetUp() override { // unpack the parameter tuple into the two members
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_; // implementation under test
+ MaskedSubPixelVarianceFunc ref_func_; // reference its results must match
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MaskedSubPixelVarianceTest);
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+  // Feeds identical random buffers to ref_func_ and opt_func_ and counts every
+  // mismatch in the returned value or in *sse, for both invert_mask settings.
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Note: We pad out the input array to a multiple of 16 bytes wide, so that
+  // consecutive rows keep the 16-byte alignment.
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE + 16);
+  int ref_stride = (MAX_SB_SIZE + 16);
+  int msk_stride = (MAX_SB_SIZE + 16);
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+    int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) {
+      src_ptr[j] = rnd.Rand8();
+      ref_ptr[j] = rnd.Rand8();
+      second_pred_ptr[j] = rnd.Rand8();
+      msk_ptr[j] = rnd(65);
+    }
+    for (int k = 0; k < 3; k++) {
+      for (int l = 0; l < 3; l++) {
+        const int xoffset = xoffsets[k];
+        const int yoffset = yoffsets[l];
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          API_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+                                  msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) first_failure = i;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+      << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+      << "C output doesn't match optimized output. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE + 16);
+  int ref_stride = (MAX_SB_SIZE + 16);
+  int msk_stride = (MAX_SB_SIZE + 16);
+
+  for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+    for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+      // Bits 0..3 of i select the low or high fill of src, ref, pred and mask.
+      for (int i = 0; i < 16; ++i) {
+        memset(src_ptr, (i & 0x1) ? 255 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+        memset(ref_ptr, (i & 0x2) ? 255 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+        memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+        memset(msk_ptr, (i & 0x8) ? 64 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          API_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+                                  msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+                          << "C output doesn't match optimized output. "
+                          << "First failed at test case " << first_failure
+                          << " x_offset = " << first_failure_x
+                          << " y_offset = " << first_failure_y;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+ aom_bit_depth_t>
+ HighbdMaskedSubPixelVarianceParam; // (optimized, C reference, bit depth)
+
+class HighbdMaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+ ~HighbdMaskedSubPixelVarianceTest() override = default;
+ void SetUp() override { // unpack the parameter tuple into the three members
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_; // implementation under test
+ MaskedSubPixelVarianceFunc ref_func_; // reference its results must match
+ aom_bit_depth_t bit_depth_; // used by the tests to bound generated samples
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HighbdMaskedSubPixelVarianceTest);
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    // Mask each random sample down to its low bit_depth_ bits.
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8); j++) {
+      src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      msk_ptr[j] = rnd(65);
+    }
+    for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+      for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          API_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+      << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+      << "C output doesn't match optimized output. "
+      << "First failed at test case " << first_failure
+      << " x_offset = " << first_failure_x << " y_offset = " << first_failure_y;
+}
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
+
+  for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+    for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+      // Bits 0..3 of i select the low or high fill of src, ref, pred and mask.
+      for (int i = 0; i < 16; ++i) {
+        aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+        aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+        aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+        memset(msk_ptr, (i & 0x8) ? 64 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          API_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+                          << "C output doesn't match optimized output. "
+                          << "First failed at test case " << first_failure
+                          << " x_offset = " << first_failure_x
+                          << " y_offset = " << first_failure_y;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+using std::make_tuple;
+
+#if HAVE_SSSE3
+
+const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
+ make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
+ &aom_masked_sub_pixel_variance128x128_c),
+ make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
+ &aom_masked_sub_pixel_variance128x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
+ &aom_masked_sub_pixel_variance64x128_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
+ &aom_masked_sub_pixel_variance64x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
+ &aom_masked_sub_pixel_variance64x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
+ &aom_masked_sub_pixel_variance32x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
+ &aom_masked_sub_pixel_variance32x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
+ &aom_masked_sub_pixel_variance32x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
+ &aom_masked_sub_pixel_variance16x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
+ &aom_masked_sub_pixel_variance16x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
+ &aom_masked_sub_pixel_variance16x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
+ &aom_masked_sub_pixel_variance8x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
+ &aom_masked_sub_pixel_variance8x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
+ &aom_masked_sub_pixel_variance8x4_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
+ &aom_masked_sub_pixel_variance4x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
+ &aom_masked_sub_pixel_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3,
+ &aom_masked_sub_pixel_variance64x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3,
+ &aom_masked_sub_pixel_variance16x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x8_ssse3,
+ &aom_masked_sub_pixel_variance32x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x32_ssse3,
+ &aom_masked_sub_pixel_variance8x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x4_ssse3,
+ &aom_masked_sub_pixel_variance16x4_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3,
+ &aom_masked_sub_pixel_variance4x16_c),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(sub_pel_var_test));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(hbd_sub_pel_var_test));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+
+const MaskedSubPixelVarianceParam sub_pel_var_test_neon[] = {
+  make_tuple(&aom_masked_sub_pixel_variance128x128_neon,
+             &aom_masked_sub_pixel_variance128x128_c),
+  make_tuple(&aom_masked_sub_pixel_variance128x64_neon,
+             &aom_masked_sub_pixel_variance128x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x128_neon,
+             &aom_masked_sub_pixel_variance64x128_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x64_neon,
+             &aom_masked_sub_pixel_variance64x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x32_neon,
+             &aom_masked_sub_pixel_variance64x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x64_neon,
+             &aom_masked_sub_pixel_variance32x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x32_neon,
+             &aom_masked_sub_pixel_variance32x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x16_neon,
+             &aom_masked_sub_pixel_variance32x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x32_neon,
+             &aom_masked_sub_pixel_variance16x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x16_neon,
+             &aom_masked_sub_pixel_variance16x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x8_neon,
+             &aom_masked_sub_pixel_variance16x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x16_neon,
+             &aom_masked_sub_pixel_variance8x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x8_neon,
+             &aom_masked_sub_pixel_variance8x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x4_neon,
+             &aom_masked_sub_pixel_variance8x4_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x8_neon,
+             &aom_masked_sub_pixel_variance4x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x4_neon,
+             &aom_masked_sub_pixel_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(&aom_masked_sub_pixel_variance64x16_neon,
+             &aom_masked_sub_pixel_variance64x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x64_neon,
+             &aom_masked_sub_pixel_variance16x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x8_neon,
+             &aom_masked_sub_pixel_variance32x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x32_neon,
+             &aom_masked_sub_pixel_variance8x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x4_neon,
+             &aom_masked_sub_pixel_variance16x4_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x16_neon,
+             &aom_masked_sub_pixel_variance4x16_c),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, MaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(sub_pel_var_test_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test_neon[] = {
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_neon,
+ &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_neon,
+ &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_neon,
+ &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_neon,
+ &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_neon,
+ &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(hbd_sub_pel_var_test_neon));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // HAVE_NEON
+} // namespace
diff --git a/third_party/aom/test/md5_helper.h b/third_party/aom/test/md5_helper.h
new file mode 100644
index 0000000000..69f1ae76b0
--- /dev/null
+++ b/third_party/aom/test/md5_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_MD5_HELPER_H_
+#define AOM_TEST_MD5_HELPER_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+
+namespace libaom_test {
+// Accumulates an MD5 digest over image planes and/or raw byte buffers and
+// exposes the result as a lowercase hexadecimal string.
+class MD5 {
+ public:
+  MD5() { MD5Init(&md5_); }
+
+  // Feeds the displayed (d_w x d_h) area of the first three planes of |img|
+  // into the digest, one row at a time.
+  void Add(const aom_image_t *img) {
+    // Two bytes per sample when the high-bitdepth format flag is set.
+    const int bytes_per_sample =
+        (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+    for (int plane = 0; plane < 3; ++plane) {
+      // For the chroma planes the dimensions are rounded up instead of down
+      // so that no pixel is skipped: the shift amount is added before
+      // shifting. This works only for chroma shifts of 0 and 1.
+      int rows;
+      int row_bytes;
+      if (plane != 0) {
+        rows = (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift;
+        row_bytes = ((img->d_w + img->x_chroma_shift) >> img->x_chroma_shift) *
+                    bytes_per_sample;
+      } else {
+        rows = img->d_h;
+        row_bytes = img->d_w * bytes_per_sample;
+      }
+
+      const uint8_t *row = img->planes[plane];
+      for (int y = 0; y < rows; ++y, row += img->stride[plane]) {
+        MD5Update(&md5_, row, row_bytes);
+      }
+    }
+  }
+
+  // Feeds |size| raw bytes starting at |data| into the digest.
+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
+  // Returns the digest of everything added so far as a NUL-terminated string
+  // of 32 hex characters. The string lives in this object and is overwritten
+  // by the next call.
+  const char *Get() {
+    static const char kHexDigits[] = "0123456789abcdef";
+    uint8_t digest[16];
+    // Finalize a copy of the context; md5_ itself is left untouched.
+    MD5Context scratch = md5_;
+    MD5Final(digest, &scratch);
+
+    char *out = res_;
+    for (int i = 0; i < 16; ++i) {
+      *out++ = kHexDigits[digest[i] >> 4];
+      *out++ = kHexDigits[digest[i] & 0xf];
+    }
+    *out = 0;
+
+    return res_;
+  }
+
+ protected:
+  char res_[33];    // 32 hex characters plus the terminating NUL.
+  MD5Context md5_;  // Running digest state.
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_MD5_HELPER_H_
diff --git a/third_party/aom/test/metadata_test.cc b/third_party/aom/test/metadata_test.cc
new file mode 100644
index 0000000000..9467c29e86
--- /dev/null
+++ b/third_party/aom/test/metadata_test.cc
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/bitstream.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+const size_t kMetadataPayloadSizeT35 = 24;
+// 0xB5 stands for the itut t35 metadata country code for the United States
+const uint8_t kMetadataPayloadT35[kMetadataPayloadSizeT35] = {
+ 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+const size_t kMetadataPayloadSizeCll = 4;
+const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
+ 0x03 };
+
+const size_t kMetadataObuSizeT35 = 28;
+const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
+ 0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeMdcv = 28;
+const uint8_t kMetadataObuMdcv[kMetadataObuSizeMdcv] = {
+ 0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeCll = 8;
+const uint8_t kMetadataObuCll[kMetadataObuSizeCll] = { 0x2A, 0x06, 0x01, 0xB5,
+ 0x01, 0x02, 0x03, 0x80 };
+
+// Encoder test fixture that attaches ITU-T T.35, HDR MDCV and HDR CLL metadata
+// to every input frame and checks that it shows up both in the compressed
+// packets and on the decoded images.
+class MetadataEncodeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  ~MetadataEncodeTest() override = default;
+
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+
+  // Returns true if the |obu_size| bytes at |obu| occur contiguously anywhere
+  // within the |bitstream_size| bytes at |bitstream|.
+  static bool BitstreamContains(const uint8_t *bitstream,
+                                size_t bitstream_size, const uint8_t *obu,
+                                size_t obu_size) {
+    // Checked first so the unsigned subtraction below cannot wrap around.
+    if (bitstream_size < obu_size) return false;
+    for (size_t i = 0; i <= bitstream_size - obu_size; ++i) {
+      if (memcmp(bitstream + i, obu, obu_size) == 0) return true;
+    }
+    return false;
+  }
+
+  // Drops any metadata already attached to the next input frame, verifies
+  // that invalid payload/size combinations are rejected with -1, then
+  // attaches one T.35, one MDCV and one CLL metadata entry.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder * /*encoder*/) override {
+    aom_image_t *current_frame = video->img();
+    if (current_frame) {
+      if (current_frame->metadata) aom_img_remove_metadata(current_frame);
+      // Valid payload with zero size.
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     kMetadataPayloadT35, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      // Null payload with non-zero size.
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     nullptr, kMetadataPayloadSizeT35,
+                                     AOM_MIF_ANY_FRAME),
+                -1);
+      // Null payload with zero size.
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     nullptr, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_ANY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_MDCV,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_KEY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_CLL,
+                               kMetadataPayloadCll, kMetadataPayloadSizeCll,
+                               AOM_MIF_KEY_FRAME),
+          0);
+    }
+  }
+
+  // For every compressed frame packet, asserts that each of the three expected
+  // metadata OBU byte sequences is present in the packet data.
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
+
+    const size_t bitstream_size = pkt->data.frame.sz;
+    const uint8_t *bitstream =
+        static_cast<const uint8_t *>(pkt->data.frame.buf);
+
+    // Look for valid ITU-T T.35 metadata in the bitstream.
+    ASSERT_TRUE(BitstreamContains(bitstream, bitstream_size, kMetadataObuT35,
+                                  kMetadataObuSizeT35));
+
+    // Testing for HDR MDCV metadata
+    ASSERT_TRUE(BitstreamContains(bitstream, bitstream_size, kMetadataObuMdcv,
+                                  kMetadataObuSizeMdcv));
+
+    // Testing for HDR CLL metadata
+    ASSERT_TRUE(BitstreamContains(bitstream, bitstream_size, kMetadataObuCll,
+                                  kMetadataObuSizeCll));
+  }
+
+  // Checks that a decoded image carries exactly three metadata entries: the
+  // first two with the T.35 test payload and the third with the CLL payload.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t /*pts*/) override {
+    ASSERT_NE(img.metadata, nullptr);
+
+    ASSERT_EQ(img.metadata->sz, 3u);
+
+    for (size_t i = 0; i < img.metadata->sz - 1; ++i) {
+      ASSERT_EQ(kMetadataPayloadSizeT35, img.metadata->metadata_array[i]->sz);
+      EXPECT_EQ(
+          memcmp(kMetadataPayloadT35, img.metadata->metadata_array[i]->payload,
+                 kMetadataPayloadSizeT35),
+          0);
+    }
+
+    ASSERT_EQ(kMetadataPayloadSizeCll, img.metadata->metadata_array[2]->sz);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadCll, img.metadata->metadata_array[2]->payload,
+               kMetadataPayloadSizeCll),
+        0);
+  }
+};
+
+// Runs the encode/decode loop on a short clip with a low-bitrate CBR
+// configuration; the fixture hooks perform the metadata checks.
+TEST_P(MetadataEncodeTest, TestMetadataEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  // Frame geometry.
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  // Rate control: CBR, run at low bitrate.
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.rc_target_bitrate = 40;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+
+  // Key frame placement and lookahead.
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.kf_max_dist = 3000;
+  cfg_.kf_min_dist = 3000;
+  cfg_.g_lag_in_frames = 1;
+
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest,
+ ::testing::Values(::libaom_test::kOnePassGood));
+
+} // namespace
+
+// A single metadata entry can be allocated from the T.35 test payload and
+// released again.
+TEST(MetadataTest, MetadataAllocation) {
+  aom_metadata_t *const entry =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  ASSERT_NE(entry, nullptr);
+  aom_img_metadata_free(entry);
+}
+
+// A two-slot metadata array can be allocated, filled with two allocated
+// entries and freed as a whole.
+TEST(MetadataTest, MetadataArrayAllocation) {
+  aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(2);
+  ASSERT_NE(metadata_array, nullptr);
+
+  // EXPECT (not ASSERT) so that the array is still released below even when
+  // one of the element allocations fails.
+  metadata_array->metadata_array[0] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  EXPECT_NE(metadata_array->metadata_array[0], nullptr);
+  metadata_array->metadata_array[1] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  EXPECT_NE(metadata_array->metadata_array[1], nullptr);
+
+  aom_img_metadata_array_free(metadata_array);
+}
+
+// Adding metadata succeeds (0) on an image and fails (-1) on a null image.
+TEST(MetadataTest, AddMetadataToImage) {
+  aom_image_t img;
+  img.metadata = nullptr;
+
+  const int add_status = aom_img_add_metadata(
+      &img, OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+      kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  ASSERT_EQ(add_status, 0);
+  aom_img_metadata_array_free(img.metadata);
+
+  const int null_status = aom_img_add_metadata(
+      nullptr, OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+      kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  EXPECT_EQ(null_status, -1);
+}
+
+// Removing metadata must be callable both on an image that has metadata and
+// on a null image pointer.
+TEST(MetadataTest, RemoveMetadataFromImage) {
+  aom_image_t img;
+  img.metadata = nullptr;
+
+  const int add_status = aom_img_add_metadata(
+      &img, OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+      kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  ASSERT_EQ(add_status, 0);
+
+  aom_img_remove_metadata(&img);
+  aom_img_remove_metadata(nullptr);
+}
+
+// Exercises aom_copy_metadata_to_frame_buffer() with a populated array, a
+// null destination, an empty array and a null array, then removes the
+// metadata from the frame buffer.
+TEST(MetadataTest, CopyMetadataToFrameBuffer) {
+  YV12_BUFFER_CONFIG yvBuf;
+  yvBuf.metadata = nullptr;
+
+  // Array holding one entry: copying succeeds unless the buffer is null.
+  aom_metadata_array_t *filled = aom_img_metadata_array_alloc(1);
+  ASSERT_NE(filled, nullptr);
+  filled->metadata_array[0] =
+      aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+                             kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+
+  EXPECT_EQ(aom_copy_metadata_to_frame_buffer(&yvBuf, filled), 0);
+  EXPECT_EQ(aom_copy_metadata_to_frame_buffer(nullptr, filled), -1);
+  aom_img_metadata_array_free(filled);
+
+  // Array with zero entries: copying is rejected.
+  aom_metadata_array_t *empty = aom_img_metadata_array_alloc(0);
+  ASSERT_NE(empty, nullptr);
+  EXPECT_EQ(aom_copy_metadata_to_frame_buffer(&yvBuf, empty), -1);
+  aom_img_metadata_array_free(empty);
+
+  // Null array: copying is rejected.
+  EXPECT_EQ(aom_copy_metadata_to_frame_buffer(&yvBuf, nullptr), -1);
+
+  aom_remove_metadata_from_frame_buffer(&yvBuf);
+  aom_remove_metadata_from_frame_buffer(nullptr);
+}
+
+// aom_img_get_metadata() returns nullptr for a null image or an out-of-range
+// index, and the stored entry for a valid index.
+TEST(MetadataTest, GetMetadataFromImage) {
+  aom_image_t img;
+  img.metadata = nullptr;
+
+  const int add_status = aom_img_add_metadata(
+      &img, OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35,
+      kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME);
+  ASSERT_EQ(add_status, 0);
+
+  // Invalid lookups.
+  EXPECT_EQ(aom_img_get_metadata(nullptr, 0), nullptr);
+  EXPECT_EQ(aom_img_get_metadata(&img, 1u), nullptr);
+  EXPECT_EQ(aom_img_get_metadata(&img, 10u), nullptr);
+
+  // The only entry must match what was added.
+  const aom_metadata_t *entry = aom_img_get_metadata(&img, 0);
+  ASSERT_NE(entry, nullptr);
+  ASSERT_EQ(entry->sz, kMetadataPayloadSizeT35);
+  const int payload_cmp =
+      memcmp(kMetadataPayloadT35, entry->payload, kMetadataPayloadSizeT35);
+  EXPECT_EQ(payload_cmp, 0);
+
+  aom_img_metadata_array_free(img.metadata);
+}
+
+// Adds three metadata entries of different types and checks that they are
+// read back in insertion order with the expected type, size and payload.
+TEST(MetadataTest, ReadMetadatasFromImage) {
+  aom_image_t img;
+  img.metadata = nullptr;
+
+  const uint32_t types[3] = { OBU_METADATA_TYPE_ITUT_T35,
+                              OBU_METADATA_TYPE_HDR_CLL,
+                              OBU_METADATA_TYPE_HDR_MDCV };
+
+  ASSERT_EQ(aom_img_add_metadata(&img, types[0], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&img, types[1], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&img, types[2], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+
+  const size_t count = aom_img_num_metadata(&img);
+  ASSERT_EQ(count, 3u);
+  for (size_t idx = 0; idx < count; ++idx) {
+    const aom_metadata_t *entry = aom_img_get_metadata(&img, idx);
+    ASSERT_NE(entry, nullptr);
+    ASSERT_EQ(entry->type, types[idx]);
+    ASSERT_EQ(entry->sz, kMetadataPayloadSizeT35);
+    const int payload_cmp =
+        memcmp(kMetadataPayloadT35, entry->payload, kMetadataPayloadSizeT35);
+    EXPECT_EQ(payload_cmp, 0);
+  }
+  aom_img_metadata_array_free(img.metadata);
+}
diff --git a/third_party/aom/test/metrics_template.html b/third_party/aom/test/metrics_template.html
new file mode 100644
index 0000000000..b57c62314a
--- /dev/null
+++ b/third_party/aom/test/metrics_template.html
@@ -0,0 +1,422 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Video Codec Test Results</title>
+<style type="text/css">
+<!-- Begin 960 reset -->
+a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,c
+ode,dd,del,details,dfn,dialog,div,dl,dt,em,embed,fieldset,figcaption,figure,font,footer,form,h1,h2,h
+3,h4,h5,h6,header,hgroup,hr,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,meter,nav,object,ol,
+output,p,pre,progress,q,rp,rt,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbo
+dy,td,tfoot,th,thead,time,tr,tt,u,ul,var,video,xmp{border:0;margin:0;padding:0;font-size:100%}html,b
+ody{height:100%}article,aside,details,figcaption,figure,footer,header,hgroup,menu,nav,section{displa
+y:block}b,strong{font-weight:bold}img{color:transparent;font-size:0;vertical-align:middle;-ms-interp
+olation-mode:bicubic}ol,ul{list-style:none}li{display:list-item}table{border-collapse:collapse;borde
+r-spacing:0}th,td,caption{font-weight:normal;vertical-align:top;text-align:left}q{quotes:none}q:befo
+re,q:after{content:'';content:none}sub,sup,small{font-size:75%}sub,sup{line-height:0;position:relati
+ve;vertical-align:baseline}sub{bottom:-0.25em}sup{top:-0.5em}svg{overflow:hidden}
+<!-- End 960 reset -->
+<!-- Begin 960 text -->
+body{font:13px/1.5 'Helvetica Neue',Arial,'Liberation Sans',FreeSans,sans-serif}pre,code{font-family
+:'DejaVu Sans Mono',Menlo,Consolas,monospace}hr{border:0 #ccc solid;border-top-width:1px;clear:both;
+height:0}h1{font-size:25px}h2{font-size:23px}h3{font-size:21px}h4{font-size:19px}h5{font-size:17px}h
+6{font-size:15px}ol{list-style:decimal}ul{list-style:disc}li{margin-left:30px}p,dl,hr,h1,h2,h3,h4,h5
+,h6,ol,ul,pre,table,address,fieldset,figure{margin-bottom:20px}
+<!-- End 960 text -->
+<!-- Begin 960 grid (fluid variant)
+ 12 columns, 1152px total width
+ http://960.gs/ | http://grids.heroku.com/ -->
+.container_12{width:92%;margin-left:4%;margin-right:4%}.grid_1,.grid_2,.grid_3,.grid_4,.grid_5,.grid
+_6,.grid_7,.grid_8,.grid_9,.grid_10,.grid_11,.grid_12{display:inline;float:left;position:relative;ma
+rgin-left:1%;margin-right:1%}.alpha{margin-left:0}.omega{margin-right:0}.container_12 .grid_1{width:
+6.333%}.container_12 .grid_2{width:14.667%}.container_12 .grid_3{width:23.0%}.container_12 .grid_4{w
+idth:31.333%}.container_12 .grid_5{width:39.667%}.container_12 .grid_6{width:48.0%}.container_12 .gr
+id_7{width:56.333%}.container_12 .grid_8{width:64.667%}.container_12 .grid_9{width:73.0%}.container_
+12 .grid_10{width:81.333%}.container_12 .grid_11{width:89.667%}.container_12 .grid_12{width:98.0%}.c
+ontainer_12 .prefix_1{padding-left:8.333%}.container_12 .prefix_2{padding-left:16.667%}.container_12
+ .prefix_3{padding-left:25.0%}.container_12 .prefix_4{padding-left:33.333%}.container_12 .prefix_5{p
+adding-left:41.667%}.container_12 .prefix_6{padding-left:50.0%}.container_12 .prefix_7{padding-left:
+58.333%}.container_12 .prefix_8{padding-left:66.667%}.container_12 .prefix_9{padding-left:75.0%}.con
+tainer_12 .prefix_10{padding-left:83.333%}.container_12 .prefix_11{padding-left:91.667%}.container_1
+2 .suffix_1{padding-right:8.333%}.container_12 .suffix_2{padding-right:16.667%}.container_12 .suffix
+_3{padding-right:25.0%}.container_12 .suffix_4{padding-right:33.333%}.container_12 .suffix_5{padding
+-right:41.667%}.container_12 .suffix_6{padding-right:50.0%}.container_12 .suffix_7{padding-right:58.
+333%}.container_12 .suffix_8{padding-right:66.667%}.container_12 .suffix_9{padding-right:75.0%}.cont
+ainer_12 .suffix_10{padding-right:83.333%}.container_12 .suffix_11{padding-right:91.667%}.container_
+12 .push_1{left:8.333%}.container_12 .push_2{left:16.667%}.container_12 .push_3{left:25.0%}.containe
+r_12 .push_4{left:33.333%}.container_12 .push_5{left:41.667%}.container_12 .push_6{left:50.0%}.conta
+iner_12 .push_7{left:58.333%}.container_12 .push_8{left:66.667%}.container_12 .push_9{left:75.0%}.co
+ntainer_12 .push_10{left:83.333%}.container_12 .push_11{left:91.667%}.container_12 .pull_1{left:-8.3
+33%}.container_12 .pull_2{left:-16.667%}.container_12 .pull_3{left:-25.0%}.container_12 .pull_4{left
+:-33.333%}.container_12 .pull_5{left:-41.667%}.container_12 .pull_6{left:-50.0%}.container_12 .pull_
+7{left:-58.333%}.container_12 .pull_8{left:-66.667%}.container_12 .pull_9{left:-75.0%}.container_12
+.pull_10{left:-83.333%}.container_12 .pull_11{left:-91.667%}.clear{clear:both;display:block;overflow
+:hidden;visibility:hidden;width:0;height:0}.clearfix:after{clear:both;content:' ';display:block;font
+-size:0;line-height:0;visibility:hidden;width:0;height:0}.clearfix{display:inline-block}* html .clea
+rfix{height:1%}.clearfix{display:block}
+<!-- End 960 grid -->
+
+div.metricgraph {
+
+}
+
+body {
+
+}
+
+div.header {
+ font-family: Arial, sans-serif;
+}
+
+div.header h2 {
+ margin: .5em auto;
+}
+
+div.radio {
+ font-family: Arial, sans-serif;
+ margin-bottom: 1em;
+}
+
+div.main {
+
+}
+
+div.cliplist {
+ font-family: Arial, sans-serif;
+ margin-top: 6px;
+}
+
+div.chartarea {
+ font-family: Arial, sans-serif;
+}
+
+div.indicators {
+ font-family: Arial, sans-serif;
+ font-size: 13px;
+ margin-top: 6px;
+ min-height: 600px;
+ background-color: #f7f7f7;
+}
+
+div.indicators div.content {
+ margin: 1em;
+}
+
+div.indicators div.content h5 {
+ font-size: 13px;
+ text-align: center;
+ margin: 0;
+}
+
+div.indicators div.content ul {
+ margin-left: 0;
+ padding-left: 0;
+ margin-top: 0;
+}
+
+div.indicators div.content ul li {
+ margin-left: 1.5em;
+}
+
+div.indicators div.content p:first-child {
+ margin-bottom: .5em;
+}
+
+span.google-visualization-table-sortind {
+ color: #000;
+}
+.header-style {
+ font-weight: bold;
+ border: 1px solid #fff;
+ background-color: #ccc;
+}
+
+td.header-style+td {
+
+}
+
+.orange-background {
+ background-color: orange;
+}
+
+.light-gray-background {
+ background-color: #f0f0f0;
+}
+</style>
+<script type="text/javascript" src="https://www.google.com/jsapi"></script>
+<script type="text/javascript">
+var chart_left = 40;
+var chart_top = 6;
+var chart_height = document.documentElement.clientHeight-100;
+var chart_width = "100%";
+ftable='filestable_avg'
+var snrs = [];
+var filestable_dsnr = [];
+var filestable_drate = [];
+var filestable_avg = [];
+
+// Python template code replaces the following placeholder lines.
+//%%metrics_js%%//
+//%%filestable_dpsnr%%//
+//%%filestable_avg%%//
+//%%filestable_drate%%//
+//%%snrs%%//
+
+var selected = 0
+var imagestr = '';
+var bettertable=0;
+var chart=0;
+var better=0;
+var metricdata=0;
+var metricView=0;
+var column=1;
+var formatter=0;
+
+function changeColumn(col) {
+ column = col;
+ console.log(col)
+ draw_files();
+}
+
+function changeMetric(m) {
+ ftable=m
+ draw_files()
+}
+
+function setup_vis() {
+ chart = new google.visualization.ScatterChart(
+ document.getElementById("metricgraph"));
+
+ bettertable = new google.visualization.Table(
+ document.getElementById("bettertable"));
+
+ draw_files();
+ build_metrics_radio();
+}
+
+function build_metrics_radio() {
+ for (metric=1; metric < metrics.length; metric++) {
+ var rb = document.createElement('input');
+ var l = document.createElement('label');
+ rb.setAttribute('type','radio');
+ rb.setAttribute('name','metric');
+ rb.setAttribute('onClick', "changeColumn('"+metric.toString()+"')");
+ l.innerHTML = metrics[metric];
+ document.getElementById('metrics').appendChild(rb);
+ document.getElementById('metrics').appendChild(l);
+ }
+}
+
+function draw_files() {
+ var options = {'allowHtml': true, 'width': "100%", 'height': "50%"};
+ if (better != 0) delete better;
+
+ col=eval(ftable+'[column]')
+ better = new google.visualization.DataTable(col)
+
+ // Python Template code replaces the following line with a list of
+ // formatters.
+ if (ftable == 'filestable_dsnr')
+ formatter = new google.visualization.NumberFormat(
+ {fractionDigits: 4, suffix:" db"});
+ else
+ formatter = new google.visualization.NumberFormat(
+ {fractionDigits: 4, suffix:"%"});
+
+ //%%formatters%%//
+
+ bettertable.draw(better,options);
+ google.visualization.events.addListener(bettertable, 'select',
+ selectBetterHandler);
+ query_file()
+}
+
+function query_file() {
+ imagestr = better.getFormattedValue(selected, 0)
+ var metricjson = eval('(' + snrs[column][selected] + ')');
+ metricdata = new google.visualization.DataTable(metricjson, 0.6);
+ if( metricView != 0 ) delete metricView;
+ metricView = new google.visualization.DataView(metricdata);
+
+ chart.draw(metricView, {curveType:'function',
+ explorer: {},
+ chartArea:{left:chart_left, top:chart_top, width:chart_width,
+ height:chart_height-90},
+ hAxis:{title:"Datarate in kbps"},
+ vAxis:{title:"Quality in decibels", format: '##.0', textPosition: 'in'},
+ legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+ width:chart_width, height:chart_height-50 });
+
+ google.visualization.events.addListener(chart, 'select', chartSelect);
+ google.visualization.events.addListener(chart, 'onmouseover', chartMouseOver);
+ google.visualization.events.addListener(chart, 'onmouseout', chartMouseOut);
+}
+
+function chartMouseOut(e) {
+ statusbar = document.getElementById('status');
+ statusbar.style.display = 'none';
+}
+
+function chartMouseOver(e) {
+ pointDifference(e.row, e.column)
+}
+
+function pointDifference(row, col) {
+ if(!row || !col)
+ return;
+
+ var cols = metricdata.getNumberOfColumns();
+ var rows = metricdata.getNumberOfRows();
+
+ var sel_bitrate = metricView.getValue(row, 0 );
+ var sel_metric = metricView.getValue(row, col);
+
+ var message = '<ul>' + metricView.getColumnLabel(col) +
+ ' (' + sel_bitrate.toFixed(0) + ' kbps, ' + sel_metric.toFixed(2) + ')' + ' is ';
+
+
+ // col 0 is datarate
+ for( var i=1;i<cols;++i) {
+
+ var metric_greatest_thats_less = 0;
+ var rate_greatest_thats_less = 0;
+ var metric_smallest_thats_greater = 999;
+ var rate_smallest_thats_greater = 0;
+
+ if(i==col)
+ continue;
+
+ // Find the lowest metric for the column that's greater than sel_metric and
+ // the highest metric for this column that's less than the metric.
+ for(var line_count = 0; line_count < rows; ++line_count) {
+ this_metric = metricdata.getValue(line_count, i)
+ this_rate = metricdata.getValue(line_count, 0)
+ if(!this_metric)
+ continue;
+
+ if(this_metric > metric_greatest_thats_less &&
+ this_metric <= sel_metric) {
+ metric_greatest_thats_less = this_metric;
+ rate_greatest_thats_less = this_rate;
+ }
+ if(this_metric < metric_smallest_thats_greater &&
+ this_metric > sel_metric) {
+ metric_smallest_thats_greater = this_metric;
+ rate_smallest_thats_greater = this_rate;
+ }
+ }
+
+ if(rate_smallest_thats_greater == 0 || rate_greatest_thats_less == 0) {
+ message = message + " <li> Couldn't find a point on both sides.</li>"
+ } else {
+ metric_slope = ( rate_smallest_thats_greater - rate_greatest_thats_less) /
+ ( metric_smallest_thats_greater - metric_greatest_thats_less);
+
+ projected_rate = ( sel_metric - metric_greatest_thats_less) *
+ metric_slope + rate_greatest_thats_less;
+
+ difference = 100 * (projected_rate / sel_bitrate - 1);
+
+
+ if (difference > 0)
+ message = message + "<li> " + difference.toFixed(2) +
+ "% smaller than <em>" +
+ metricdata.getColumnLabel(i) + "</em></li> "
+ else
+ message = message + "<li> " + -difference.toFixed(2) +
+ "% bigger than <em>" +
+ metricdata.getColumnLabel(i) + "</em></li> "
+ }
+
+ }
+ message = message + "</ul>"
+ statusbar = document.getElementById('status');
+ statusbar.innerHTML = "<p>" + message + "</p>";
+ statusbar.style.display = 'block';
+}
+
+function chartSelect() {
+ var selection = chart.getSelection();
+ var message = '';
+ var min = metricView.getFormattedValue(selection[0].row, 0);
+ var max = metricView.getFormattedValue(selection[selection.length-1].row, 0);
+ var val = metricView.getFormattedValue(selection[0].row,selection[0].column);
+
+ pointDifference(selection[0].row, selection[0].column)
+ min = min / 3
+ max = max * 3
+ metricView.setRows(metricdata.getFilteredRows(
+ [{column: 0,minValue: min, maxValue:max}]));
+
+ chart.draw(metricView, {curveType:'function',
+ chartArea:{left:40, top:10, width:chart_width, height:chart_height - 110},
+ hAxis:{title:"datarate in kbps"}, vAxis:{title:"quality in decibels"},
+ legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+ width:chart_width, height:chart_height - 50});
+}
+
+function selectBetterHandler() {
+ var selection = bettertable.getSelection();
+ for (var i = 0; i < selection.length; i++) {
+ item = selection[i];
+ }
+ selected = item.row
+ query_file()
+}
+
+
+google.load('visualization', '1', {'packages' : ['corechart','table']});
+google.setOnLoadCallback(setup_vis);
+</script>
+</head>
+
+<body>
+
+ <div class="container_12">
+
+ <div class="grid_12 header">
+ <h2>Codec Comparison Results</h2>
+ </div>
+
+ <div class="grid_12 radio">
+
+ <form name="myform">
+ Method For Combining Points
+ <input type="radio" checked name="column" value="1"
+ onClick="changeMetric('filestable_avg')" />Average of bitrates difference
+ <input type="radio" name="column" value="2"
+ onClick="changeMetric('filestable_dsnr')" />BDSNR
+ <input type="radio" name="column" value="3"
+ onClick="changeMetric('filestable_drate')" />BDRATE
+ </form>
+
+ <form id="metrics" name="myform">
+ </form>
+
+ </div>
+
+ <div class="grid_12 main">
+
+ <div class="grid_5 alpha cliplist">
+ <div id="bettertable"></div>
+ </div>
+
+ <div class="grid_5 chartarea">
+ <div id="metricgraph"></div>
+ </div>
+
+ <div class="grid_2 omega indicators">
+ <div class="content">
+ <h5>Indicators</h5>
+ <hr>
+ <div id="status"></div>
+ </div>
+ </div>
+
+ </div>
+
+ </div>
+
+</body>
+</html>
diff --git a/third_party/aom/test/minmax_test.cc b/third_party/aom/test/minmax_test.cc
new file mode 100644
index 0000000000..33be4ff6dc
--- /dev/null
+++ b/third_party/aom/test/minmax_test.cc
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max);
+
+// Fixture parameterized on the min/max implementation under test.
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+  // Seeds the random generator deterministically and picks up the function
+  // under test from the test parameter.
+  void SetUp() override {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    mm_func_ = GetParam();
+  }
+
+ protected:
+  MinMaxFunc mm_func_;  // Implementation under test.
+  ACMRandom rnd_;       // Source of random pixel values.
+};
+
+// Reference implementation: computes the smallest and largest absolute
+// difference between corresponding pixels of two 8x8 blocks of 8-bit samples
+// and stores them in |*min_ret| and |*max_ret|.
+void reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+                      int b_stride, int *min_ret, int *max_ret) {
+  int lowest = 255;
+  int highest = 0;
+  for (int row = 0; row < 8; ++row) {
+    const uint8_t *a_row = a + row * a_stride;
+    const uint8_t *b_row = b + row * b_stride;
+    for (int col = 0; col < 8; ++col) {
+      const int diff = abs(a_row[col] - b_row[col]);
+      lowest = (diff < lowest) ? diff : lowest;
+      highest = (diff > highest) ? diff : highest;
+    }
+  }
+
+  *min_ret = lowest;
+  *max_ret = highest;
+}
+
+// With every difference at 255 except one position set to i, the reported
+// minimum must be i and the maximum 255, for every position i.
+TEST_P(MinMaxTest, MinValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 255, sizeof(b));
+    b[i] = i;  // Set a minimum difference of i.
+
+    int min_diff, max_diff;
+    API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min_diff, &max_diff));
+    EXPECT_EQ(255, max_diff);
+    EXPECT_EQ(i, min_diff);
+  }
+}
+
+// With every difference at 0 except one position set to i, the reported
+// maximum must be i and the minimum 0, for every position i.
+TEST_P(MinMaxTest, MaxValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+    b[i] = i;  // Set a maximum difference of i.
+
+    int min_diff, max_diff;
+    API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min_diff, &max_diff));
+    EXPECT_EQ(i, max_diff);
+    EXPECT_EQ(0, min_diff);
+  }
+}
+
+// The function under test must agree with reference_minmax() on one random
+// pair of 8x8 blocks.
+TEST_P(MinMaxTest, CompareReference) {
+  uint8_t a[64], b[64];
+  for (int idx = 0; idx < 64; idx++) {
+    a[idx] = rnd_.Rand8();
+    b[idx] = rnd_.Rand8();
+  }
+
+  int expected_min, expected_max;
+  reference_minmax(a, 8, b, 8, &expected_min, &expected_max);
+
+  int actual_min, actual_max;
+  API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &actual_min, &actual_max));
+  EXPECT_EQ(expected_max, actual_max);
+  EXPECT_EQ(expected_min, actual_min);
+}
+
+// The function under test must agree with reference_minmax() for every
+// combination of a/b strides from 8 to 64 in steps of 8.
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+  // 8 rows at the largest stride (64) fit in each buffer.
+  uint8_t a[8 * 64], b[8 * 64];
+  for (int idx = 0; idx < 8 * 64; idx++) {
+    a[idx] = rnd_.Rand8();
+    b[idx] = rnd_.Rand8();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int expected_min, expected_max;
+      reference_minmax(a, a_stride, b, b_stride, &expected_min, &expected_max);
+
+      int actual_min, actual_max;
+      API_REGISTER_STATE_CHECK(
+          mm_func_(a, a_stride, b, b_stride, &actual_min, &actual_max));
+      EXPECT_EQ(expected_max, actual_max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(expected_min, actual_min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+using HBDMinMaxTest = MinMaxTest;
+
+// High-bitdepth reference implementation: |a| and |b| are converted with
+// CONVERT_TO_SHORTPTR and read as 16-bit samples. Stores the smallest and
+// largest absolute difference over an 8x8 block in |*min_ret| / |*max_ret|.
+void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int *min_ret, int *max_ret) {
+  const uint16_t *a16 = CONVERT_TO_SHORTPTR(a);
+  const uint16_t *b16 = CONVERT_TO_SHORTPTR(b);
+  int lowest = 65535;
+  int highest = 0;
+  for (int row = 0; row < 8; ++row) {
+    const uint16_t *a_row = a16 + row * a_stride;
+    const uint16_t *b_row = b16 + row * b_stride;
+    for (int col = 0; col < 8; ++col) {
+      const int diff = abs(a_row[col] - b_row[col]);
+      lowest = (diff < lowest) ? diff : lowest;
+      highest = (diff > highest) ? diff : highest;
+    }
+  }
+
+  *min_ret = lowest;
+  *max_ret = highest;
+}
+
+// With every difference at 65535 except one position set to i, the reported
+// minimum must be i and the maximum 65535, for every position i.
+TEST_P(HBDMinMaxTest, MinValue) {
+  // Check the allocations before use so that an allocation failure is
+  // reported as a test failure instead of a null dereference.
+  uint16_t *a16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(a16, nullptr);
+  uint16_t *b16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(b16, nullptr);
+  uint8_t *a = CONVERT_TO_BYTEPTR(a16);
+  uint8_t *b = CONVERT_TO_BYTEPTR(b16);
+  for (int i = 0; i < 64; i++) {
+    aom_memset16(a16, 0, 64);
+    aom_memset16(b16, 65535, 64);
+    b16[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(65535, max);
+    EXPECT_EQ(i, min);
+  }
+  aom_free(a16);
+  aom_free(b16);
+}
+
+// With every difference at 0 except one position set to i, the reported
+// maximum must be i and the minimum 0, for every position i.
+TEST_P(HBDMinMaxTest, MaxValue) {
+  // Check the allocations before use so that an allocation failure is
+  // reported as a test failure instead of a null dereference.
+  uint16_t *a16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(a16, nullptr);
+  uint16_t *b16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(b16, nullptr);
+  uint8_t *a = CONVERT_TO_BYTEPTR(a16);
+  uint8_t *b = CONVERT_TO_BYTEPTR(b16);
+  for (int i = 0; i < 64; i++) {
+    aom_memset16(a16, 0, 64);
+    aom_memset16(b16, 0, 64);
+    b16[i] = i;  // Set a maximum difference of i.
+
+    int min, max;
+    API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+  aom_free(a16);
+  aom_free(b16);
+}
+
+// The function under test must agree with highbd_reference_minmax() on one
+// random pair of 8x8 blocks of 16-bit samples.
+TEST_P(HBDMinMaxTest, CompareReference) {
+  // Check the allocations before use so that an allocation failure is
+  // reported as a test failure instead of a null dereference.
+  uint16_t *a16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(a16, nullptr);
+  uint16_t *b16 =
+      reinterpret_cast<uint16_t *>(aom_malloc(64 * sizeof(uint16_t)));
+  ASSERT_NE(b16, nullptr);
+  uint8_t *a = CONVERT_TO_BYTEPTR(a16);
+  uint8_t *b = CONVERT_TO_BYTEPTR(b16);
+  for (int j = 0; j < 64; j++) {
+    a16[j] = rnd_.Rand16();
+    b16[j] = rnd_.Rand16();
+  }
+
+  int min_ref, max_ref, min, max;
+  highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  API_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  aom_free(a16);
+  aom_free(b16);
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+// The function under test must agree with highbd_reference_minmax() for
+// every combination of a/b strides from 8 to 64 in steps of 8.
+TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) {
+  // 8 rows at the largest stride (64) fit in each buffer. Check the
+  // allocations before use so that an allocation failure is reported as a
+  // test failure instead of a null dereference.
+  uint16_t *a16 =
+      reinterpret_cast<uint16_t *>(aom_malloc((8 * 64) * sizeof(uint16_t)));
+  ASSERT_NE(a16, nullptr);
+  uint16_t *b16 =
+      reinterpret_cast<uint16_t *>(aom_malloc((8 * 64) * sizeof(uint16_t)));
+  ASSERT_NE(b16, nullptr);
+  uint8_t *a = CONVERT_TO_BYTEPTR(a16);
+  uint8_t *b = CONVERT_TO_BYTEPTR(b16);
+  for (int i = 0; i < 8 * 64; i++) {
+    a16[i] = rnd_.Rand16();
+    b16[i] = rnd_.Rand16();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      API_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+    }
+  }
+  aom_free(a16);
+  aom_free(b16);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&aom_minmax_8x8_c));
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest,
+ ::testing::Values(&aom_highbd_minmax_8x8_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest,
+ ::testing::Values(&aom_highbd_minmax_8x8_neon));
+#endif
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest,
+ ::testing::Values(&aom_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest,
+ ::testing::Values(&aom_minmax_8x8_neon));
+#endif
+} // namespace
diff --git a/third_party/aom/test/monochrome_test.cc b/third_party/aom/test/monochrome_test.cc
new file mode 100644
index 0000000000..f22b5fe0f2
--- /dev/null
+++ b/third_party/aom/test/monochrome_test.cc
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// CQ level applied through AOME_SET_CQ_LEVEL when the test runs in all-intra
+// mode.
+const unsigned int kCqLevel = 18;
+// Lower bound checked against the reported psnr[0] value when lossless
+// encoding is enabled.
+const double kMaxPsnr = 100.0;
+
+// kPsnrThreshold represents the psnr threshold used to validate the quality of
+// the first frame. The indices correspond to one/two-pass, allintra and
+// realtime encoding modes.
+const double kPsnrThreshold[3] = { 29.0, 41.5, 41.5 };
+
+// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first
+// frame. The indices correspond to one/two-pass, allintra and realtime
+// encoding modes.
+const double kPsnrFluctuation[3] = { 2.5, 0.3, 16.0 };
+
+// Encoder test fixture parameterized on (test mode, lossless flag, cpu_used).
+// It records the chroma value of every decoded frame and validates the PSNR
+// packets emitted by the encoder.
+class MonochromeTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MonochromeTest()
+      : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)),
+        frame0_psnr_y_(0.0) {}
+
+  ~MonochromeTest() override = default;
+
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+
+    encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3));
+    if (mode_ == ::libaom_test::kAllIntra) {
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+    }
+    if (lossless_) {
+      encoder->Control(AV1E_SET_LOSSLESS, 1);
+    }
+  }
+
+  // Verifies that both chroma planes of the decoded frame are constant and
+  // that the monochrome flag is set, then remembers the chroma value.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    (void)pts;
+
+    // The top-left pixel of the U plane is the value both planes must hold.
+    const int chroma_value = img.planes[AOM_PLANE_U][0];
+
+    const bool u_is_constant =
+        ComparePlaneToValue(img, AOM_PLANE_U, chroma_value);
+    const bool is_chroma_constant =
+        u_is_constant && ComparePlaneToValue(img, AOM_PLANE_V, chroma_value);
+
+    // Chroma planes should be constant
+    EXPECT_TRUE(is_chroma_constant);
+
+    // Monochrome flag on image should be set
+    EXPECT_EQ(img.monochrome, 1);
+
+    chroma_value_list_.push_back(chroma_value);
+  }
+
+  // Returns true when every pixel of the given plane equals value, false as
+  // soon as a differing pixel is found.
+  bool ComparePlaneToValue(const aom_image_t &img, const int plane,
+                           const int value) {
+    const int width = aom_img_plane_width(&img, plane);
+    const int height = aom_img_plane_height(&img, plane);
+    const uint8_t *const buf = img.planes[plane];
+    const int stride = img.stride[plane];
+
+    for (int row = 0; row < height; ++row) {
+      const uint8_t *const line = buf + row * stride;
+      for (int col = 0; col < width; ++col) {
+        if (line[col] != value) return false;
+      }
+    }
+    return true;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    // Check average PSNR value is >= 100 db in case of lossless encoding.
+    if (lossless_) {
+      EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
+      return;
+    }
+
+    // Select the threshold entry: realtime -> 2, allintra -> 1, else 0.
+    int psnr_index = 0;
+    if (mode_ == ::libaom_test::kRealTime) {
+      psnr_index = 2;
+    } else if (mode_ == ::libaom_test::kAllIntra) {
+      psnr_index = 1;
+    }
+
+    // Check that the initial Y PSNR value is 'high enough', and check that
+    // subsequent Y PSNR values are 'close' to this initial value.
+    const double psnr_y = pkt->data.psnr.psnr[1];
+    if (frame0_psnr_y_ == 0.0) {
+      frame0_psnr_y_ = psnr_y;
+      EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[psnr_index]);
+    }
+    EXPECT_NEAR(psnr_y, frame0_psnr_y_, kPsnrFluctuation[psnr_index]);
+  }
+
+  int lossless_;
+  std::vector<int> chroma_value_list_;
+  double frame0_psnr_y_;
+};
+
+// Encodes a short monochrome clip in CBR mode at low bitrate and checks that
+// every decoded frame reports the same constant chroma value.
+TEST_P(MonochromeTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames. The list is
+  // only filled by DecompressedFrameHook, so it may be empty when no frame
+  // was decoded; dereferencing begin() in that case would be undefined.
+  if (!chroma_value_list_.empty()) {
+    const int initial_chroma_value = chroma_value_list_.front();
+    for (const int chroma_value : chroma_value_list_) {
+      // Check that all decoded frames have the same constant chroma planes.
+      EXPECT_EQ(chroma_value, initial_chroma_value);
+    }
+  }
+}
+
+// Same fixture as MonochromeTest; a distinct type so it can be instantiated
+// with its own parameter set.
+class MonochromeAllIntraTest : public MonochromeTest {};
+
+// Encodes a short monochrome clip and checks that every decoded frame reports
+// the same constant chroma value.
+TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames. The list is
+  // only filled by DecompressedFrameHook, so it may be empty when no frame
+  // was decoded; dereferencing begin() in that case would be undefined.
+  if (!chroma_value_list_.empty()) {
+    const int initial_chroma_value = chroma_value_list_.front();
+    for (const int chroma_value : chroma_value_list_) {
+      // Check that all decoded frames have the same constant chroma planes.
+      EXPECT_EQ(chroma_value, initial_chroma_value);
+    }
+  }
+}
+
+// Same fixture as MonochromeTest; a distinct type so it can be instantiated
+// with its own parameter set.
+class MonochromeRealtimeTest : public MonochromeTest {};
+
+// Encodes a 30-frame monochrome clip at low bitrate and checks that every
+// decoded frame reports the same constant chroma value.
+TEST_P(MonochromeRealtimeTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 30);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames. The list is
+  // only filled by DecompressedFrameHook, so it may be empty when no frame
+  // was decoded; dereferencing begin() in that case would be undefined.
+  if (!chroma_value_list_.empty()) {
+    const int initial_chroma_value = chroma_value_list_.front();
+    for (const int chroma_value : chroma_value_list_) {
+      // Check that all decoded frames have the same constant chroma planes.
+      EXPECT_EQ(chroma_value, initial_chroma_value);
+    }
+  }
+}
+
+// The value lists below are, in order: test mode (passed to
+// InitializeConfig), lossless flag, and cpu_used (passed to
+// AOME_SET_CPUUSED).
+AV1_INSTANTIATE_TEST_SUITE(MonochromeTest,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(0), // lossless
+ ::testing::Values(0)); // cpu_used
+
+// All-intra mode is exercised both with and without lossless coding.
+AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(0, 1), // lossless
+ ::testing::Values(6, 9)); // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(MonochromeRealtimeTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Values(0), // lossless
+ ::testing::Values(6, 8, 10)); // cpu_used
+
+} // namespace
diff --git a/third_party/aom/test/motion_vector_test.cc b/third_party/aom/test/motion_vector_test.cc
new file mode 100644
index 0000000000..4fc8d53d95
--- /dev/null
+++ b/third_party/aom/test/motion_vector_test.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+// Values passed to AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST; see kMVTestModes
+// below for their meaning.
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libaom_test::TestMode kEncodingModeVectors[] = {
+ ::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood,
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 1, 5 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+// Encoder test fixture parameterized on (encoding mode, cpu_used, MV test
+// mode). The MV test mode is forwarded to the encoder through
+// AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST.
+class MotionVectorTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  ~MotionVectorTestLarge() override = default;
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    if (encoding_mode_ == ::libaom_test::kRealTime) {
+      // Realtime: configure the rate-control buffer sizes.
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    } else {
+      // Non-realtime: use a 3-frame lag.
+      cfg_.g_lag_in_frames = 3;
+    }
+  }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+
+    encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+    encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+
+    const bool is_realtime = encoding_mode_ == ::libaom_test::kRealTime;
+    if (!is_realtime) {
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+// Runs the encoder loop over three frames of the source clip with the
+// configured extreme-MV mode.
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  // Reduce the test clip's resolution while testing on 32-bit system.
+  const bool is_32bit = sizeof(void *) == 4;
+  const int width = is_32bit ? 2048 : 3840;
+  const int height = is_32bit ? 360 : 2160;
+
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  std::unique_ptr<libaom_test::VideoSource> video(
+      new libaom_test::YUVVideoSource("niklas_640_480_30.yuv",
+                                      AOM_IMG_FMT_I420, width, height, 30, 1,
+                                      0, 3));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+// Parameter lists, in order: encoding mode, cpu_used, MV test mode (matching
+// GET_PARAM(1..3) in the fixture constructor).
+AV1_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge,
+ ::testing::ValuesIn(kEncodingModeVectors),
+ ::testing::ValuesIn(kCpuUsedVectors),
+ ::testing::ValuesIn(kMVTestModes));
+} // namespace
diff --git a/third_party/aom/test/mv_cost_test.cc b/third_party/aom/test/mv_cost_test.cc
new file mode 100644
index 0000000000..73d56665bf
--- /dev/null
+++ b/third_party/aom/test/mv_cost_test.cc
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Reference implementation used to cross-check
+// av1_build_nmv_component_cost_table().
+//
+// mvcost:    output table; entries mvcost[-MV_MAX] .. mvcost[MV_MAX] are
+//            written, so the pointer must refer to the middle of a buffer
+//            with at least MV_MAX elements on either side.
+// mvcomp:    CDFs from which the per-symbol costs are derived.
+// precision: selects whether fractional and high-precision bit costs are
+//            included in each entry.
+void ReferenceBuildNmvComponentCostTable(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision) {
+ int i, v;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+ int class0_hp_cost[2], hp_cost[2];
+ // Convert each CDF of the component into a table of symbol costs.
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, nullptr);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, nullptr);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, nullptr);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], nullptr);
+ }
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+ nullptr);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, nullptr);
+ // The high-precision costs are only filled in (and only read below) when
+ // precision exceeds MV_SUBPEL_LOW_PRECISION.
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, nullptr);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, nullptr);
+ }
+ mvcost[0] = 0;
+ for (v = 1; v <= MV_MAX; ++v) {
+ int z, c, o, d, e, f, cost = 0;
+ // z is the magnitude minus one; c is its class and o the offset within it.
+ z = v - 1;
+ c = av1_get_mv_class(z, &o);
+ cost += class_cost[c];
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ cost += class0_cost[d];
+ } else {
+ const int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+ }
+ if (precision > MV_SUBPEL_NONE) {
+ if (c == MV_CLASS_0) {
+ cost += class0_fp_cost[d][f];
+ } else {
+ cost += fp_cost[f];
+ }
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ if (c == MV_CLASS_0) {
+ cost += class0_hp_cost[e];
+ } else {
+ cost += hp_cost[e];
+ }
+ }
+ }
+ // The magnitude cost is shared; only the sign cost differs between +v and
+ // -v.
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ }
+}
+
+// Test using the default context, except for sign
+// NOTE(review): the trailing comments name the nmv_component field each
+// initializer is meant for; confirm the order against the struct definition.
+static const nmv_component kTestComponentContext = {
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762,
+ 32767) }, // classes_cdf
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(70 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+};
+
+// Builds the MV component cost table with both the reference code and
+// av1_build_nmv_component_cost_table() for the given precision, and asserts
+// that every entry in [-MV_MAX, MV_MAX] matches.
+void TestMvComponentCostTable(MvSubpelPrecision precision) {
+  std::unique_ptr<int[]> expected_storage(new int[MV_VALS]);
+  std::unique_ptr<int[]> actual_storage(new int[MV_VALS]);
+
+  // Both tables are indexed with signed values, so point at the middle of
+  // each buffer.
+  int *const mvcost_ref = &expected_storage[MV_MAX];
+  int *const mvcost = &actual_storage[MV_MAX];
+
+  ReferenceBuildNmvComponentCostTable(mvcost_ref, &kTestComponentContext,
+                                      precision);
+  av1_build_nmv_component_cost_table(mvcost, &kTestComponentContext, precision);
+
+  int v = 0;
+  while (v <= MV_MAX) {
+    ASSERT_EQ(mvcost_ref[v], mvcost[v]) << "v = " << v;
+    ASSERT_EQ(mvcost_ref[-v], mvcost[-v]) << "v = " << v;
+    ++v;
+  }
+}
+
+// Integer-pel only: no fractional or high-precision costs are added.
+TEST(MvCostTest, BuildMvComponentCostTableTest1) {
+ TestMvComponentCostTable(MV_SUBPEL_NONE);
+}
+
+// Fractional-pel costs are added, high-precision costs are not.
+TEST(MvCostTest, BuildMvComponentCostTableTest2) {
+ TestMvComponentCostTable(MV_SUBPEL_LOW_PRECISION);
+}
+
+// Both fractional-pel and high-precision costs are added.
+TEST(MvCostTest, BuildMvComponentCostTableTest3) {
+ TestMvComponentCostTable(MV_SUBPEL_HIGH_PRECISION);
+}
+
+} // namespace
\ No newline at end of file
diff --git a/third_party/aom/test/noise_model_test.cc b/third_party/aom/test/noise_model_test.cc
new file mode 100644
index 0000000000..b3edcc218e
--- /dev/null
+++ b/third_party/aom/test/noise_model_test.cc
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Returns a normally distributed value with standard deviation sigma. Pairs
+// of uniform samples in [-1, 1) are drawn until their squared radius lies
+// strictly between 0 and 1.
+double randn(libaom_test::ACMRandom *random, double sigma) {
+  for (;;) {
+    const double u = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double v = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double s = u * u + v * v;
+    const bool accepted = s > 0 && s < 1;
+    if (!accepted) continue;
+    return sigma * (u * sqrt(-2.0 * log(s) / s));
+  }
+}
+
+// Synthesizes noise using the auto-regressive filter of the given lag,
+// with the provided n coefficients sampled at the given coords. The result
+// (w x h, row-major) is written to data.
+void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
+                 const int (*coords)[2], const double *coeffs, double *data,
+                 int w, int h) {
+  const int pad_size = 3 * lag;
+  const int padded_w = w + pad_size;
+  const int padded_h = h + pad_size;
+  std::vector<double> padded(padded_w * padded_h);
+
+  // Start from unit-variance white noise over the whole padded area.
+  for (int row = 0; row < padded_h; ++row) {
+    for (int col = 0; col < padded_w; ++col) {
+      padded[row * padded_w + col] = randn(random, 1.0);
+    }
+  }
+
+  // Apply the filter in place, in raster order, so previously filtered
+  // samples feed into later ones.
+  for (int row = lag; row < padded_h; ++row) {
+    for (int col = lag; col < padded_w; ++col) {
+      double sum = 0;
+      for (int k = 0; k < n; ++k) {
+        const int dx = coords[k][0];
+        const int dy = coords[k][1];
+        sum += padded[(row + dy) * padded_w + (col + dx)] * coeffs[k];
+      }
+      padded[row * padded_w + col] += sum;
+    }
+  }
+
+  // Copy over the padded rows to the output
+  for (int row = 0; row < h; ++row) {
+    const double *const src = &padded[0] + row * padded_w;
+    double *const dst = data + row * w;
+    memcpy(dst, src, sizeof(*data) * w);
+  }
+}
+
+// Estimates the power spectral density of noise (a width x height image in
+// row-major order). Blocks of block_size x block_size samples are taken at
+// half-block steps, passed through the noise transform, and their energy is
+// averaged over the number of blocks processed.
+//
+// Returns a vector of block_size * block_size values. If either allocation
+// fails, a test failure is recorded and the zero-initialized vector is
+// returned.
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+                                 int block_size) {
+  // Size the buffer by the element type (float), not by the pointer type.
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(*block));
+  std::vector<float> psd(block_size * block_size);
+  if (block == nullptr) {
+    EXPECT_NE(block, nullptr);
+    return psd;
+  }
+  int num_blocks = 0;
+  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  if (tx == nullptr) {
+    EXPECT_NE(tx, nullptr);
+    // Release the block buffer so this early exit does not leak it.
+    aom_free(block);
+    return psd;
+  }
+  for (int y = 0; y <= height - block_size; y += block_size / 2) {
+    for (int x = 0; x <= width - block_size; x += block_size / 2) {
+      for (int yy = 0; yy < block_size; ++yy) {
+        for (int xx = 0; xx < block_size; ++xx) {
+          block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+        }
+      }
+      aom_noise_tx_forward(tx, &block[0]);
+      aom_noise_tx_add_energy(tx, &psd[0]);
+      num_blocks++;
+    }
+  }
+  // Only columns 0 .. block_size / 2 are normalized here; the remaining
+  // columns are filled in from them below.
+  for (int yy = 0; yy < block_size; ++yy) {
+    for (int xx = 0; xx <= block_size / 2; ++xx) {
+      psd[yy * block_size + xx] /= num_blocks;
+    }
+  }
+  // Fill in the data that is missing due to symmetries
+  for (int xx = 1; xx < block_size / 2; ++xx) {
+    psd[(block_size - xx)] = psd[xx];
+  }
+  for (int yy = 1; yy < block_size; ++yy) {
+    for (int xx = 1; xx < block_size / 2; ++xx) {
+      psd[(block_size - yy) * block_size + (block_size - xx)] =
+          psd[yy * block_size + xx];
+    }
+  }
+  aom_noise_tx_free(tx);
+  aom_free(block);
+  return psd;
+}
+
+} // namespace
+
+// With two bins at 8 bits, the bin centers are expected at 0 and 255.
+TEST(NoiseStrengthSolver, GetCentersTwoBins) {
+  aom_noise_strength_solver_t solver;
+  // Stop if initialization fails instead of querying a solver that was never
+  // set up (the solver tests that check this expect a return value of 1).
+  ASSERT_EQ(1, aom_noise_strength_solver_init(&solver, 2, 8));
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(255, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+// With two bins at 10 bits, the bin centers are expected at 0 and 1023.
+TEST(NoiseStrengthSolver, GetCentersTwoBins10bit) {
+  aom_noise_strength_solver_t solver;
+  // Stop if initialization fails instead of querying a solver that was never
+  // set up (the solver tests that check this expect a return value of 1).
+  ASSERT_EQ(1, aom_noise_strength_solver_init(&solver, 2, 10));
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(1023, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+// With 256 bins at 8 bits, the center of bin i is expected to be i.
+TEST(NoiseStrengthSolver, GetCenters256Bins) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  // Stop if initialization fails instead of querying a solver that was never
+  // set up (the solver tests that check this expect a return value of 1).
+  ASSERT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  for (int i = 0; i < num_bins; ++i) {
+    EXPECT_NEAR(i, aom_noise_strength_solver_get_center(&solver, i), 1e-5);
+  }
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Tests that the noise strength solver returns the identity transform when
+// given identity-like constraints.
+TEST(NoiseStrengthSolver, ObserveIdentity) {
+  const int num_bins = 256;
+  const int kLast = num_bins - 1;
+  aom_noise_strength_solver_t solver;
+  ASSERT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // We have to add a bit more strength to constraints at the boundary to
+  // overcome any regularization.
+  for (int rep = 0; rep < 5; ++rep) {
+    aom_noise_strength_solver_add_measurement(&solver, 0, 0);
+    aom_noise_strength_solver_add_measurement(&solver, kLast, kLast);
+  }
+  for (int bin = 0; bin < num_bins; ++bin) {
+    aom_noise_strength_solver_add_measurement(&solver, bin, bin);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+  // The two bins nearest each boundary are not checked.
+  for (int bin = 2; bin < num_bins - 2; ++bin) {
+    EXPECT_NEAR(bin, solver.eqns.x[bin], 0.1);
+  }
+
+  // A two-point piecewise fit should reduce to the segment (0,0)-(255,255).
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, 2, &lut));
+
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(0.0, lut.points[0][1], 0.5);
+  EXPECT_NEAR(255.0, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(255.0, lut.points[1][1], 0.5);
+
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Feeds a parabola to the solver and checks that a piecewise fit limited to
+// kMaxPoints points still follows it.
+TEST(NoiseStrengthSolver, SimplifiesCurve) {
+  const int num_bins = 256;
+  const int kMaxPoints = 9;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // Create a parabolic input
+  for (int bin = 0; bin < num_bins; ++bin) {
+    const double x = (bin - 127.5) / 63.5;
+    const double y = x * x;
+    aom_noise_strength_solver_add_measurement(&solver, bin, y);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+  // First try to fit an unconstrained lut
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, -1, &lut));
+  ASSERT_LE(20, lut.num_points);
+  aom_noise_strength_lut_free(&lut);
+
+  // Now constrain the maximum number of points
+  EXPECT_EQ(1,
+            aom_noise_strength_solver_fit_piecewise(&solver, kMaxPoints, &lut));
+  ASSERT_EQ(kMaxPoints, lut.num_points);
+
+  // Check that the input parabola is still well represented: first the left
+  // end point, then the interior points, then the right end point.
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(4.0, lut.points[0][1], 0.1);
+  for (int pt = 1; pt < lut.num_points - 1; ++pt) {
+    const double x = (lut.points[pt][0] - 128.) / 64.;
+    EXPECT_NEAR(x * x, lut.points[pt][1], 0.1);
+  }
+  EXPECT_NEAR(255.0, lut.points[kMaxPoints - 1][0], 1e-5);
+  EXPECT_NEAR(4.0, lut.points[kMaxPoints - 1][1], 0.1);
+
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Initializing a lut with a non-positive number of points must fail.
+TEST(NoiseStrengthLut, LutInitNegativeOrZeroSize) {
+ aom_noise_strength_lut_t lut;
+ ASSERT_FALSE(aom_noise_strength_lut_init(&lut, -1));
+ ASSERT_FALSE(aom_noise_strength_lut_init(&lut, 0));
+}
+
+// A lut with the single point (0, 1) must evaluate to 1 below, at and above
+// that point.
+TEST(NoiseStrengthLut, LutEvalSinglePoint) {
+ aom_noise_strength_lut_t lut;
+ ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
+ ASSERT_EQ(1, lut.num_points);
+ lut.points[0][0] = 0;
+ lut.points[0][1] = 1;
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, -1));
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 0));
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 1));
+ aom_noise_strength_lut_free(&lut);
+}
+
+// Evaluates a four-point lut at its boundaries and inside each segment,
+// expecting clamping outside the x range and linear interpolation inside it.
+TEST(NoiseStrengthLut, LutEvalMultiPointInterp) {
+ const double kEps = 1e-5;
+ aom_noise_strength_lut_t lut;
+ ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 4));
+ ASSERT_EQ(4, lut.num_points);
+
+ // Points are (x, y) pairs: (0, 0), (1, 1), (2, 1) and (100, 1001).
+ lut.points[0][0] = 0;
+ lut.points[0][1] = 0;
+
+ lut.points[1][0] = 1;
+ lut.points[1][1] = 1;
+
+ lut.points[2][0] = 2;
+ lut.points[2][1] = 1;
+
+ lut.points[3][0] = 100;
+ lut.points[3][1] = 1001;
+
+ // Test lower boundary
+ EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, -1));
+ EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, 0));
+
+ // Test first part that should be identity
+ EXPECT_NEAR(0.25, aom_noise_strength_lut_eval(&lut, 0.25), kEps);
+ EXPECT_NEAR(0.75, aom_noise_strength_lut_eval(&lut, 0.75), kEps);
+
+ // This is a constant section (should evaluate to 1)
+ EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.25), kEps);
+ EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.75), kEps);
+
+ // Test interpolation between two non-zero y coords.
+ EXPECT_NEAR(1, aom_noise_strength_lut_eval(&lut, 2), kEps);
+ EXPECT_NEAR(251, aom_noise_strength_lut_eval(&lut, 26.5), kEps);
+ EXPECT_NEAR(751, aom_noise_strength_lut_eval(&lut, 75.5), kEps);
+
+ // Test upper boundary
+ EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 100));
+ EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 101));
+
+ aom_noise_strength_lut_free(&lut);
+}
+
+// A square shape with lag 2 must initialize and expose the 12 coordinates
+// listed below, in this order.
+TEST(NoiseModel, InitSuccessWithValidSquareShape) {
+  const int kNumCoords = 12;
+  const int kCoords[][2] = { { -2, -2 }, { -1, -2 }, { 0, -2 },  { 1, -2 },
+                             { 2, -2 },  { -2, -1 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 },  { 2, -1 },  { -2, 0 },  { -1, 0 } };
+
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int idx = 0; idx < kNumCoords; ++idx) {
+    const int *expected = kCoords[idx];
+    EXPECT_EQ(expected[0], model.coords[idx][0]);
+    EXPECT_EQ(expected[1], model.coords[idx][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+// A diamond shape with lag 2 must initialize and expose the 6 coordinates
+// listed below, in this order.
+TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
+  const int kNumCoords = 6;
+  const int kCoords[][2] = { { 0, -2 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 }, { -2, 0 },  { -1, 0 } };
+
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  // The coefficient count is checked against both the literal and the
+  // constant.
+  EXPECT_EQ(6, model.n);
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int idx = 0; idx < kNumCoords; ++idx) {
+    const int *expected = kCoords[idx];
+    EXPECT_EQ(expected[0], model.coords[idx][0]);
+    EXPECT_EQ(expected[1], model.coords[idx][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+// Initialization must be rejected for a lag of 10.
+TEST(NoiseModel, InitFailsWithTooLargeLag) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ // NOTE(review): free is called after a failed init; presumably that is safe
+ // - confirm against aom_noise_model_init().
+ aom_noise_model_free(&model);
+}
+
+// Initialization must be rejected for a lag of 0.
+TEST(NoiseModel, InitFailsWithTooSmallLag) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ // NOTE(review): free is called after a failed init; presumably that is safe
+ // - confirm against aom_noise_model_init().
+ aom_noise_model_free(&model);
+}
+
+// Initialization must be rejected for a shape value of 100.
+TEST(NoiseModel, InitFailsWithInvalidShape) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ // NOTE(review): free is called after a failed init; presumably that is safe
+ // - confirm against aom_noise_model_init().
+ aom_noise_model_free(&model);
+}
+
+// Only bit depths 8, 10 and 12 may initialize; every other value from 0 to
+// 32, and INT_MAX, must be rejected.
+TEST(NoiseModel, InitFailsWithInvalidBitdepth) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+
+  for (int depth = 0; depth <= 32; ++depth) {
+    params.bit_depth = depth;
+    const bool is_supported = depth == 8 || depth == 10 || depth == 12;
+    if (!is_supported) {
+      EXPECT_FALSE(aom_noise_model_init(&model, params))
+          << "bit_depth: " << depth;
+      continue;
+    }
+    EXPECT_TRUE(aom_noise_model_init(&model, params))
+        << "bit_depth: " << depth;
+    aom_noise_model_free(&model);
+  }
+
+  params.bit_depth = INT_MAX;
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+}
+
+// Bundles a sample data type with its bit depth and a high bit-depth flag so
+// that typed tests can be instantiated over combinations of supported data
+// types (uint8_t and uint16_t) and bit depths (8, 10, 12).
+template <typename T, int bit_depth, bool use_highbd>
+struct BitDepthParams {
+  using data_type_t = T;
+  static const int kBitDepth = bit_depth;
+  static const bool kUseHighBD = use_highbd;
+};
+
+// Typed fixture for the flat block finder tests. T supplies data_type_t,
+// kBitDepth and kUseHighBD, which the tests read through `this`.
+template <typename T>
+class FlatBlockEstimatorTest : public ::testing::Test, public T {
+ public:
+  using VecType = std::vector<typename T::data_type_t>;
+
+  // Reset the generator to a fixed seed before every test.
+  void SetUp() override { random_.Reset(171); }
+
+  VecType data_;
+  libaom_test::ACMRandom random_;
+};
+
+TYPED_TEST_SUITE_P(FlatBlockEstimatorTest);
+
+// Checks aom_flat_block_finder_extract_block() on two synthetic blocks: a
+// planar ramp, whose residual block must be zero, and a checkerboard, whose
+// plane component must be constant.
+TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
+ const int kBlockSize = 16;
+ aom_flat_block_finder_t flat_block_finder;
+ ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+ this->kBitDepth, this->kUseHighBD));
+ const double normalization = flat_block_finder.normalization;
+
+ // Test with an image of more than one block.
+ const int h = 2 * kBlockSize;
+ const int w = 2 * kBlockSize;
+ const int stride = 2 * kBlockSize;
+ this->data_.resize(h * stride, 128);
+
+ // Set up the (0,0) block to be a plane and the (0,1) block to be a
+ // checkerboard
+ // Sample values are written in 8-bit terms and scaled up to the test's bit
+ // depth by this shift.
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ this->data_[y * stride + x] = (-y + x + 128) << shift;
+ this->data_[y * stride + x + kBlockSize] =
+ ((x % 2 + y % 2) % 2 ? 128 - 20 : 128 + 20) << shift;
+ }
+ }
+ std::vector<double> block(kBlockSize * kBlockSize, 1);
+ std::vector<double> plane(kBlockSize * kBlockSize, 1);
+
+ // The block data should be a constant (zero) and the rest of the plane
+ // trend is covered in the plane data.
+ aom_flat_block_finder_extract_block(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h, stride,
+ 0, 0, &plane[0], &block[0]);
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ EXPECT_NEAR(0, block[y * kBlockSize + x], 1e-5);
+ EXPECT_NEAR((double)(this->data_[y * stride + x]) / normalization,
+ plane[y * kBlockSize + x], 1e-5);
+ }
+ }
+
+ // The plane trend is a constant, and the block is a zero mean checkerboard.
+ aom_flat_block_finder_extract_block(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h, stride,
+ kBlockSize, 0, &plane[0], &block[0]);
+ const int mid = 128 << shift;
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ EXPECT_NEAR(((double)this->data_[y * stride + x + kBlockSize] - mid) /
+ normalization,
+ block[y * kBlockSize + x], 1e-5);
+ EXPECT_NEAR(mid / normalization, plane[y * kBlockSize + x], 1e-5);
+ }
+ }
+ aom_flat_block_finder_free(&flat_block_finder);
+}
+
+// Builds a row of eight synthetic blocks with different textures and checks
+// which of them aom_flat_block_finder_run() marks as flat, first on the clean
+// data and then after noise has been added to every block.
+TYPED_TEST_P(FlatBlockEstimatorTest, FindFlatBlocks) {
+ const int kBlockSize = 32;
+ aom_flat_block_finder_t flat_block_finder;
+ ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+ this->kBitDepth, this->kUseHighBD));
+
+ const int num_blocks_w = 8;
+ const int h = kBlockSize;
+ const int w = kBlockSize * num_blocks_w;
+ const int stride = w;
+ this->data_.resize(h * stride, 128);
+ std::vector<uint8_t> flat_blocks(num_blocks_w, 0);
+
+ // Sample values are written in 8-bit terms and scaled up to the test's bit
+ // depth by this shift.
+ const int shift = this->kBitDepth - 8;
+ // The random draws below happen in a fixed order for every pixel; the
+ // expected results depend on that order together with the fixture's seed.
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ // Block 0 (not flat): constant doesn't have enough variance to qualify
+ this->data_[y * stride + x + 0 * kBlockSize] = 128 << shift;
+
+ // Block 1 (not flat): too high of variance is hard to validate as flat
+ this->data_[y * stride + x + 1 * kBlockSize] =
+ ((uint8_t)(128 + randn(&this->random_, 5))) << shift;
+
+ // Block 2 (flat): slight checkerboard added to constant
+ const int check = (x % 2 + y % 2) % 2 ? -2 : 2;
+ this->data_[y * stride + x + 2 * kBlockSize] = (128 + check) << shift;
+
+ // Block 3 (flat): planar block with checkerboard pattern is also flat
+ this->data_[y * stride + x + 3 * kBlockSize] =
+ (y * 2 - x / 2 + 128 + check) << shift;
+
+ // Block 4 (flat): gaussian random with standard deviation 1.
+ this->data_[y * stride + x + 4 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + x + 128.0)) << shift;
+
+ // Block 5 (flat): gaussian random with standard deviation 2.
+ this->data_[y * stride + x + 5 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 2) + y + 128.0)) << shift;
+
+ // Block 6 (not flat): too high of directional gradient.
+ const int strong_edge = x > kBlockSize / 2 ? 64 : 0;
+ this->data_[y * stride + x + 6 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + strong_edge + 128.0)) << shift;
+
+ // Block 7 (not flat): too high gradient.
+ const int big_check = ((x >> 2) % 2 + (y >> 2) % 2) % 2 ? -16 : 16;
+ this->data_[y * stride + x + 7 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + big_check + 128.0)) << shift;
+ }
+ }
+
+ // Four of the eight blocks are expected to be reported as flat.
+ EXPECT_EQ(4, aom_flat_block_finder_run(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h,
+ stride, &flat_blocks[0]));
+
+ // First two blocks are not flat
+ EXPECT_EQ(0, flat_blocks[0]);
+ EXPECT_EQ(0, flat_blocks[1]);
+
+ // Next 4 blocks are flat.
+ EXPECT_EQ(255, flat_blocks[2]);
+ EXPECT_EQ(255, flat_blocks[3]);
+ EXPECT_EQ(255, flat_blocks[4]);
+ EXPECT_EQ(255, flat_blocks[5]);
+
+ // Last 2 are not flat by threshold
+ EXPECT_EQ(0, flat_blocks[6]);
+ EXPECT_EQ(0, flat_blocks[7]);
+
+ // Add the noise from non-flat block 1 to every block.
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize * num_blocks_w; ++x) {
+ this->data_[y * stride + x] +=
+ (this->data_[y * stride + x % kBlockSize + kBlockSize] -
+ (128 << shift));
+ }
+ }
+ // Now the scored selection will pick the one that is most likely flat (block
+ // 0)
+ EXPECT_EQ(1, aom_flat_block_finder_run(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h,
+ stride, &flat_blocks[0]));
+ EXPECT_EQ(1, flat_blocks[0]);
+ EXPECT_EQ(0, flat_blocks[1]);
+ EXPECT_EQ(0, flat_blocks[2]);
+ EXPECT_EQ(0, flat_blocks[3]);
+ EXPECT_EQ(0, flat_blocks[4]);
+ EXPECT_EQ(0, flat_blocks[5]);
+ EXPECT_EQ(0, flat_blocks[6]);
+ EXPECT_EQ(0, flat_blocks[7]);
+
+ aom_flat_block_finder_free(&flat_block_finder);
+}
+
+REGISTER_TYPED_TEST_SUITE_P(FlatBlockEstimatorTest, ExtractBlock,
+ FindFlatBlocks);
+
+typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>, // lowbd
+ BitDepthParams<uint16_t, 8, true>, // lowbd in 16-bit
+ BitDepthParams<uint16_t, 10, true>, // highbd data
+ BitDepthParams<uint16_t, 12, true> >
+ AllBitDepthParams;
+INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+ AllBitDepthParams);
+
+template <typename T>
+class NoiseModelUpdateTest : public ::testing::Test, public T {
+ public:
+ static const int kWidth = 128;
+ static const int kHeight = 128;
+ static const int kBlockSize = 16;
+ static const int kNumBlocksX = kWidth / kBlockSize;
+ static const int kNumBlocksY = kHeight / kBlockSize;
+
+ void SetUp() override {
+ const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+ T::kBitDepth, T::kUseHighBD };
+ ASSERT_TRUE(aom_noise_model_init(&model_, params));
+
+ random_.Reset(100171);
+
+ data_.resize(kWidth * kHeight * 3);
+ denoised_.resize(kWidth * kHeight * 3);
+ noise_.resize(kWidth * kHeight * 3);
+ renoise_.resize(kWidth * kHeight);
+ flat_blocks_.resize(kNumBlocksX * kNumBlocksY);
+
+ for (int c = 0, offset = 0; c < 3; ++c, offset += kWidth * kHeight) {
+ data_ptr_[c] = &data_[offset];
+ noise_ptr_[c] = &noise_[offset];
+ denoised_ptr_[c] = &denoised_[offset];
+ strides_[c] = kWidth;
+
+ data_ptr_raw_[c] = (uint8_t *)&data_[offset];
+ denoised_ptr_raw_[c] = (uint8_t *)&denoised_[offset];
+ }
+ chroma_sub_[0] = 0;
+ chroma_sub_[1] = 0;
+ }
+
+ int NoiseModelUpdate(int block_size = kBlockSize) {
+ return aom_noise_model_update(&model_, data_ptr_raw_, denoised_ptr_raw_,
+ kWidth, kHeight, strides_, chroma_sub_,
+ &flat_blocks_[0], block_size);
+ }
+
+ void TearDown() override { aom_noise_model_free(&model_); }
+
+ protected:
+ aom_noise_model_t model_;
+ std::vector<typename T::data_type_t> data_;
+ std::vector<typename T::data_type_t> denoised_;
+
+ std::vector<double> noise_;
+ std::vector<double> renoise_;
+ std::vector<uint8_t> flat_blocks_;
+
+ typename T::data_type_t *data_ptr_[3];
+ typename T::data_type_t *denoised_ptr_[3];
+
+ double *noise_ptr_[3];
+ int strides_[3];
+ int chroma_sub_[2];
+ libaom_test::ACMRandom random_;
+
+ private:
+ uint8_t *data_ptr_raw_[3];
+ uint8_t *denoised_ptr_raw_[3];
+};
+
+TYPED_TEST_SUITE_P(NoiseModelUpdateTest);
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
+ EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+ this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForZeroNoiseAllFlat) {
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);  // every block flat
+ this->denoised_.assign(this->denoised_.size(), 128);
+ this->data_.assign(this->denoised_.size(), 128);  // same as denoised_: no noise
+ EXPECT_EQ(AOM_NOISE_STATUS_INTERNAL_ERROR, this->NoiseModelUpdate());  // NOTE(review): an error status is expected despite the "UpdateSuccess" name
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsBlockSizeTooSmall) {
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ this->denoised_.assign(this->denoised_.size(), 128);
+ this->data_.assign(this->denoised_.size(), 128);
+ EXPECT_EQ(AOM_NOISE_STATUS_INVALID_ARGUMENT,
+ this->NoiseModelUpdate(6 /* block_size=6 is too small*/));
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForWhiteRandomNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ this->data_ptr_[0][y * width + x] = int(64 + y + randn(&this->random_, 1))
+ << shift;
+ this->denoised_ptr_[0][y * width + x] = (64 + y) << shift;
+ // Make the chroma planes completely correlated with the Y plane
+ for (int c = 1; c < 3; ++c) {
+ this->data_ptr_[c][y * width + x] = this->data_ptr_[0][y * width + x];
+ this->denoised_ptr_[c][y * width + x] =
+ this->denoised_ptr_[0][y * width + x];
+ }
+ }
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const double kCoeffEps = 0.075;
+ const int n = model.n;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ // The second and third channels are highly correlated with the first.
+ if (c > 0) {
+ ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+ ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+ EXPECT_NEAR(1, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(1, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ }
+
+ // The fitted noise strength should be close to the standard deviation
+ // for all intensity bins.
+ const double kStdEps = 0.1;
+ const double normalize = 1 << shift;
+
+ for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+ EXPECT_NEAR(1.0,
+ model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ EXPECT_NEAR(1.0,
+ model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+
+ aom_noise_strength_lut_t lut;
+ aom_noise_strength_solver_fit_piecewise(
+ &model.latest_state[0].strength_solver, -1, &lut);
+ ASSERT_EQ(2, lut.num_points);
+ EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+ EXPECT_NEAR(1.0, lut.points[0][1] / normalize, kStdEps);
+ EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+ EXPECT_NEAR(1.0, lut.points[1][1] / normalize, kStdEps);
+ aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForScaledWhiteNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+
+ const double kCoeffEps = 0.055;
+ const double kLowStd = 1;
+ const double kHighStd = 4;
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ for (int c = 0; c < 3; ++c) {
+ // The image data is bimodal:
+ // Rows y < height / 2 have low intensity and low noise strength
+ // Rows y >= height / 2 have high intensity and high noise strength
+ const int avg = (y < height / 2) ? 4 : 245;
+ const double std = (y < height / 2) ? kLowStd : kHighStd;
+ this->data_ptr_[c][y * width + x] =
+ ((uint8_t)std::min((int)255,
+ (int)(2 + avg + randn(&this->random_, std))))
+ << shift;
+ this->denoised_ptr_[c][y * width + x] = (2 + avg) << shift;
+ }
+ }
+ }
+ // Label all blocks as flat for the update
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const int n = model.n;
+ // The noise is uncorrelated spatially and with the y channel.
+ // All coefficients should be reasonably close to zero.
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ if (c > 0) {
+ ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+ ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+ // The correlation to the y channel should be low (near zero)
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ }
+
+ // Noise strength should ramp linearly from kLowStd to kHighStd over the bins.
+ const double kStdEps = 0.15;
+ // We have to normalize fitted standard deviation based on bit depth.
+ const double normalize = (1 << shift);
+
+ ASSERT_EQ(20, model.latest_state[0].strength_solver.eqns.n);
+ for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+ const double a = i / 19.0;
+ const double expected = (kLowStd * (1.0 - a) + kHighStd * a);
+ EXPECT_NEAR(expected,
+ model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ EXPECT_NEAR(expected,
+ model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+
+ // If we fit a piecewise linear model, there should be two points: one near
+ // kLowStd at 0, and the other near kHighStd at (1 << kBitDepth) - 1.
+ aom_noise_strength_lut_t lut;
+ aom_noise_strength_solver_fit_piecewise(
+ &model.latest_state[0].strength_solver, 2, &lut);
+ ASSERT_EQ(2, lut.num_points);
+ EXPECT_NEAR(0, lut.points[0][0], 1e-4);
+ EXPECT_NEAR(kLowStd, lut.points[0][1] / normalize, kStdEps);
+ EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+ EXPECT_NEAR(kHighStd, lut.points[1][1] / normalize, kStdEps);
+ aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForCorrelatedNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+ const int kNumCoeffs = 24;
+ const double kStd = 4;
+ const double kStdEps = 0.3;
+ const double kCoeffEps = 0.065;
+ // Use different coefficients for each channel
+ const double kCoeffs[3][24] = {
+ { 0.02884, -0.03356, 0.00633, 0.01757, 0.02849, -0.04620,
+ 0.02833, -0.07178, 0.07076, -0.11603, -0.10413, -0.16571,
+ 0.05158, -0.07969, 0.02640, -0.07191, 0.02530, 0.41968,
+ 0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+ { 0.00269, -0.01291, -0.01513, 0.07234, 0.03208, 0.00477,
+ 0.00226, -0.00254, 0.03533, 0.12841, -0.25970, -0.06336,
+ 0.05238, -0.00845, -0.03118, 0.09043, -0.36558, 0.48903,
+ 0.00595, -0.11938, 0.02106, 0.095956, -0.350139, 0.59305 },
+ { -0.00643, -0.01080, -0.01466, 0.06951, 0.03707, -0.00482,
+ 0.00817, -0.00909, 0.02949, 0.12181, -0.25210, -0.07886,
+ 0.06083, -0.01210, -0.03108, 0.08944, -0.35875, 0.49150,
+ 0.00415, -0.12905, 0.02870, 0.09740, -0.34610, 0.58824 },
+ };
+
+ ASSERT_EQ(model.n, kNumCoeffs);
+ this->chroma_sub_[0] = this->chroma_sub_[1] = 1;
+
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+
+ // Add different noise onto each plane
+ const int shift = this->kBitDepth - 8;
+ for (int c = 0; c < 3; ++c) {
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[c], this->noise_ptr_[c], width, height);
+ const int x_shift = c > 0 ? this->chroma_sub_[0] : 0;
+ const int y_shift = c > 0 ? this->chroma_sub_[1] : 0;
+ for (int y = 0; y < (height >> y_shift); ++y) {
+ for (int x = 0; x < (width >> x_shift); ++x) {
+ const uint8_t value = 64 + x / 2 + y / 4;
+ this->data_ptr_[c][y * width + x] =
+ (uint8_t(value + this->noise_ptr_[c][y * width + x] * kStd))
+ << shift;
+ this->denoised_ptr_[c][y * width + x] = value << shift;
+ }
+ }
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ // For every plane, the solved coefficients should be close to the original
+ const int n = model.n;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(kCoeffs[c][i], model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(kCoeffs[c][i], model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ // The chroma planes should be uncorrelated with the luma plane
+ if (c > 0) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ // Correlation between the coefficient vector and the fitted coefficients
+ // should be close to 1.
+ EXPECT_LT(0.98, aom_normalized_cross_correlation(
+ model.latest_state[c].eqns.x, kCoeffs[c], kNumCoeffs));
+
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ model.latest_state[c].eqns.x, &this->renoise_[0], width,
+ height);
+
+ EXPECT_TRUE(aom_noise_data_validate(&this->renoise_[0], width, height));
+ }
+
+ // Check fitted noise strength: every bin of every plane should be near kStd
+ const double normalize = 1 << shift;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < model.latest_state[c].strength_solver.eqns.n; ++i) {
+ EXPECT_NEAR(kStd,
+ model.latest_state[c].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+ }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest,
+ NoiseStrengthChangeSignalsDifferentNoiseType) {
+ aom_noise_model_t &model = this->model_;
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+ const int block_size = this->kBlockSize;
+ // Create a two-level step image (64 | 192) with std = 2 uncorrelated noise
+ const double kStd = 2;
+ const int shift = this->kBitDepth - 8;
+
+ for (int i = 0; i < width * height; ++i) {
+ const uint8_t val = (i % width) < width / 2 ? 64 : 192;
+ for (int c = 0; c < 3; ++c) {
+ this->noise_ptr_[c][i] = randn(&this->random_, 1);
+ this->data_ptr_[c][i] = ((uint8_t)(this->noise_ptr_[c][i] * kStd + val))
+ << shift;
+ this->denoised_ptr_[c][i] = val << shift;
+ }
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const int kNumBlocks = width * height / block_size / block_size;
+ EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.latest_state[1].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.latest_state[2].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[0].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[1].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[2].strength_solver.num_equations);
+
+ // Bump up the noise on plane 0 only, by an insignificant amount
+ for (int i = 0; i < width * height; ++i) {
+ const uint8_t val = (i % width) < width / 2 ? 64 : 192;
+ this->data_ptr_[0][i] =
+ ((uint8_t)(this->noise_ptr_[0][i] * (kStd + 0.085) + val)) << shift;
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const double kARGainTolerance = 0.02;
+ for (int c = 0; c < 3; ++c) {
+ EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.latest_state[c].num_observations);
+ EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+ EXPECT_EQ(2 * kNumBlocks,
+ model.combined_state[c].strength_solver.num_equations);
+ EXPECT_EQ(2 * 15250, model.combined_state[c].num_observations);
+ EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+ }
+
+ // Bump up the noise strength on half the image for one channel by a
+ // significant amount.
+ for (int i = 0; i < width * height; ++i) {
+ const uint8_t val = (i % width) < width / 2 ? 64 : 128;
+ if (i % width < width / 2) {
+ this->data_ptr_[0][i] =
+ ((uint8_t)(randn(&this->random_, kStd + 0.5) + val)) << shift;
+ }
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+
+ // Since we didn't update the combined state, it should still be at 2 *
+ // num_blocks
+ EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+ EXPECT_EQ(2 * kNumBlocks,
+ model.combined_state[0].strength_solver.num_equations);
+
+ // In normal operation, the "latest" estimate can be saved to the "combined"
+ // state for continued updates.
+ aom_noise_model_save_latest(&model);
+ for (int c = 0; c < 3; ++c) {
+ EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.latest_state[c].num_observations);
+ EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+ EXPECT_EQ(kNumBlocks,
+ model.combined_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.combined_state[c].num_observations);
+ EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+ }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, NoiseCoeffsSignalsDifferentNoiseType) {
+ aom_noise_model_t &model = this->model_;
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+ const double kCoeffs[2][24] = {
+ { 0.02884, -0.03356, 0.00633, 0.01757, 0.02849, -0.04620,
+ 0.02833, -0.07178, 0.07076, -0.11603, -0.10413, -0.16571,
+ 0.05158, -0.07969, 0.02640, -0.07191, 0.02530, 0.41968,
+ 0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+ { 0.00269, -0.01291, -0.01513, 0.07234, 0.03208, 0.00477,
+ 0.00226, -0.00254, 0.03533, 0.12841, -0.25970, -0.06336,
+ 0.05238, -0.00845, -0.03118, 0.09043, -0.36558, 0.48903,
+ 0.00595, -0.11938, 0.02106, 0.095956, -0.350139, 0.59305 }
+ };
+
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[0], this->noise_ptr_[0], width, height);
+ for (int i = 0; i < width * height; ++i) {
+ this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ // Now try with the second set of AR coefficients
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[1], this->noise_ptr_[0], width, height);
+ for (int i = 0; i < width * height; ++i) {
+ this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+}
+REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+ UpdateSuccessForZeroNoiseAllFlat,
+ UpdateFailsBlockSizeTooSmall,
+ UpdateSuccessForWhiteRandomNoise,
+ UpdateSuccessForScaledWhiteNoise,
+ UpdateSuccessForCorrelatedNoise,
+ NoiseStrengthChangeSignalsDifferentNoiseType,
+ NoiseCoeffsSignalsDifferentNoiseType);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation,
+ NoiseModelUpdateTest, AllBitDepthParams);
+
+TEST(NoiseModelGetGrainParameters, TestLagSize) {
+ aom_film_grain_t film_grain;
+ for (int lag = 1; lag <= 3; ++lag) {
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+ aom_noise_model_free(&model);
+ }
+
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+ EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestARCoeffShiftBounds) {
+ struct TestCase {
+ double max_input_value;
+ int expected_ar_coeff_shift;
+ int expected_value;
+ };
+ const int lag = 1;
+ const int kNumTestCases = 19;
+ const TestCase test_cases[] = {
+ // Test cases for ar_coeff_shift = 9
+ { 0, 9, 0 },
+ { 0.125, 9, 64 },
+ { -0.125, 9, -64 },
+ { 0.2499, 9, 127 },
+ { -0.25, 9, -128 },
+ // Test cases for ar_coeff_shift = 8
+ { 0.25, 8, 64 },
+ { -0.2501, 8, -64 },
+ { 0.499, 8, 127 },
+ { -0.5, 8, -128 },
+ // Test cases for ar_coeff_shift = 7
+ { 0.5, 7, 64 },
+ { -0.5001, 7, -64 },
+ { 0.999, 7, 127 },
+ { -1, 7, -128 },
+ // Test cases for ar_coeff_shift = 6
+ { 1.0, 6, 64 },
+ { -1.0001, 6, -64 },
+ { 2.0, 6, 127 },
+ { -2.0, 6, -128 },
+ { 4, 6, 127 },
+ { -4, 6, -128 },
+ };
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ for (int i = 0; i < kNumTestCases; ++i) {
+ const TestCase &test_case = test_cases[i];
+ model.combined_state[0].eqns.x[0] = test_case.max_input_value;
+
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(1, film_grain.ar_coeff_lag);
+ EXPECT_EQ(test_case.expected_ar_coeff_shift, film_grain.ar_coeff_shift);
+ EXPECT_EQ(test_case.expected_value, film_grain.ar_coeffs_y[0]);
+ }
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestNoiseStrengthShiftBounds) {
+ struct TestCase {
+ double max_input_value;
+ int expected_scaling_shift;
+ int expected_value;
+ };
+ const int kNumTestCases = 10;  // must equal the number of entries in test_cases[]
+ const TestCase test_cases[] = {
+ { 0, 11, 0 }, { 1, 11, 64 }, { 2, 11, 128 }, { 3.99, 11, 255 },
+ { 4, 10, 128 }, { 7.99, 10, 255 }, { 8, 9, 128 }, { 16, 8, 128 },
+ { 31.99, 8, 255 }, { 64, 8, 255 }, // clipped
+ };
+ const int lag = 1;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ for (int i = 0; i < kNumTestCases; ++i) {
+ const TestCase &test_case = test_cases[i];
+ aom_equation_system_t &eqns = model.combined_state[0].strength_solver.eqns;
+ // Set the fitted scale parameters to be a constant value.
+ for (int j = 0; j < eqns.n; ++j) {
+ eqns.x[j] = test_case.max_input_value;
+ }
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ // We expect a single constant segment
+ EXPECT_EQ(test_case.expected_scaling_shift, film_grain.scaling_shift);
+ EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[0][1]);
+ EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[1][1]);
+ }
+ aom_noise_model_free(&model);
+}
+
+// The AR coefficients are the same inputs used to generate "Test 2" in the test
+// vectors
+TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
+ const double kInputCoeffsY[] = { 0.0315, 0.0073, 0.0218, 0.00235, 0.00511,
+ -0.0222, 0.0627, -0.022, 0.05575, -0.1816,
+ 0.0107, -0.1966, 0.00065, -0.0809, 0.04934,
+ -0.1349, -0.0352, 0.41772, 0.27973, 0.04207,
+ -0.0429, -0.1372, 0.06193, 0.52032 };
+ const double kInputCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5 };
+ const double kInputCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5 };
+ const int kExpectedARCoeffsY[] = { 4, 1, 3, 0, 1, -3, 8, -3,
+ 7, -23, 1, -25, 0, -10, 6, -17,
+ -5, 53, 36, 5, -5, -18, 8, 67 };
+ const int kExpectedARCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 };
+ const int kExpectedARCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -126 };
+ // Scaling function is initialized analytically with a sqrt function.
+ const int kNumScalingPointsY = 12;
+ const int kExpectedScalingPointsY[][2] = {
+ { 0, 0 }, { 13, 44 }, { 27, 62 }, { 40, 76 },
+ { 54, 88 }, { 67, 98 }, { 94, 117 }, { 121, 132 },
+ { 148, 146 }, { 174, 159 }, { 201, 171 }, { 255, 192 },
+ };
+
+ const int lag = 3;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ // Setup the AR coeffs
+ memcpy(model.combined_state[0].eqns.x, kInputCoeffsY, sizeof(kInputCoeffsY));
+ memcpy(model.combined_state[1].eqns.x, kInputCoeffsCB,
+ sizeof(kInputCoeffsCB));
+ memcpy(model.combined_state[2].eqns.x, kInputCoeffsCR,
+ sizeof(kInputCoeffsCR));
+ for (int i = 0; i < model.combined_state[0].strength_solver.num_bins; ++i) {
+ const double x =
+ ((double)i) / (model.combined_state[0].strength_solver.num_bins - 1.0);
+ model.combined_state[0].strength_solver.eqns.x[i] = 6 * sqrt(x);
+ model.combined_state[1].strength_solver.eqns.x[i] = 3;
+ model.combined_state[2].strength_solver.eqns.x[i] = 2;
+
+ // Inject some observations into the strength solver, as during film grain
+ // parameter extraction an estimate of the average strength will be used to
+ // adjust correlation.
+ const int n = model.combined_state[0].strength_solver.num_bins;
+ for (int j = 0; j < model.combined_state[0].strength_solver.num_bins; ++j) {
+ model.combined_state[0].strength_solver.eqns.A[i * n + j] = 1;
+ model.combined_state[1].strength_solver.eqns.A[i * n + j] = 1;
+ model.combined_state[2].strength_solver.eqns.A[i * n + j] = 1;
+ }
+ }
+
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+ EXPECT_EQ(3, film_grain.ar_coeff_lag);
+ EXPECT_EQ(7, film_grain.ar_coeff_shift);
+ EXPECT_EQ(10, film_grain.scaling_shift);
+ EXPECT_EQ(kNumScalingPointsY, film_grain.num_y_points);
+ EXPECT_EQ(1, film_grain.update_parameters);
+ EXPECT_EQ(1, film_grain.apply_grain);
+
+ const int kNumARCoeffs = 24;
+ for (int i = 0; i < kNumARCoeffs; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsY[i], film_grain.ar_coeffs_y[i]);
+ }
+ for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsCB[i], film_grain.ar_coeffs_cb[i]);
+ }
+ for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsCR[i], film_grain.ar_coeffs_cr[i]);
+ }
+ for (int i = 0; i < kNumScalingPointsY; ++i) {
+ EXPECT_EQ(kExpectedScalingPointsY[i][0], film_grain.scaling_points_y[i][0]);
+ EXPECT_EQ(kExpectedScalingPointsY[i][1], film_grain.scaling_points_y[i][1]);
+ }
+
+ // CB strength should just be a piecewise segment
+ EXPECT_EQ(2, film_grain.num_cb_points);
+ EXPECT_EQ(0, film_grain.scaling_points_cb[0][0]);
+ EXPECT_EQ(255, film_grain.scaling_points_cb[1][0]);
+ EXPECT_EQ(96, film_grain.scaling_points_cb[0][1]);
+ EXPECT_EQ(96, film_grain.scaling_points_cb[1][1]);
+
+ // CR strength should just be a piecewise segment
+ EXPECT_EQ(2, film_grain.num_cr_points);
+ EXPECT_EQ(0, film_grain.scaling_points_cr[0][0]);
+ EXPECT_EQ(255, film_grain.scaling_points_cr[1][0]);
+ EXPECT_EQ(64, film_grain.scaling_points_cr[0][1]);
+ EXPECT_EQ(64, film_grain.scaling_points_cr[1][1]);
+
+ EXPECT_EQ(128, film_grain.cb_mult);
+ EXPECT_EQ(192, film_grain.cb_luma_mult);
+ EXPECT_EQ(256, film_grain.cb_offset);
+ EXPECT_EQ(128, film_grain.cr_mult);
+ EXPECT_EQ(192, film_grain.cr_luma_mult);
+ EXPECT_EQ(256, film_grain.cr_offset);
+ EXPECT_EQ(0, film_grain.chroma_scaling_from_luma);
+ EXPECT_EQ(0, film_grain.grain_scale_shift);
+
+ aom_noise_model_free(&model);
+}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+ static void SetUpTestSuite() { aom_dsp_rtcd(); }
+
+ protected:
+ void SetUp() override {
+ static const float kNoiseLevel = 5.f;
+ static const float kStd = 4.0;
+ static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+ chroma_sub_[0] = 1;
+ chroma_sub_[1] = 1;
+ stride_[0] = kWidth;
+ stride_[1] = kWidth / 2;
+ stride_[2] = kWidth / 2;
+ for (int k = 0; k < 3; ++k) {
+ data_[k].resize(kWidth * kHeight);
+ denoised_[k].resize(kWidth * kHeight);
+ noise_psd_[k].resize(kBlockSize * kBlockSize);
+ }
+
+ const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+ 0.048, 0.404, 0.2353, -0.035, -0.093, 0.441 };
+ const int kCoords[12][2] = {
+ { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+ { -1, -1 }, { 0, -1 }, { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+ };
+ const int kLag = 2;
+ const int kLength = 12;
+ libaom_test::ACMRandom random;
+ std::vector<double> noise(kWidth * kHeight);
+ noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+ kHeight);
+ noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+ for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+ noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+ kScaleNoise / (kMaxValue * kMaxValue));
+ }
+
+ float psd_value =
+ aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+ for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+ noise_psd_[1][i] = psd_value;
+ noise_psd_[2][i] = psd_value;
+ }
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kWidth; ++x) {
+ data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+ (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+
+ for (int c = 1; c < 3; ++c) {
+ for (int y = 0; y < (kHeight >> 1); ++y) {
+ for (int x = 0; x < (kWidth >> 1); ++x) {
+ data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+ (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+ }
+ for (int k = 0; k < 3; ++k) {
+ noise_psd_ptrs_[k] = &noise_psd_[k][0];
+ }
+ }
+ static const int kBlockSize = 32;
+ static const int kBlockSizeChroma = 16;
+ static const int kWidth = 256;
+ static const int kHeight = 256;
+ static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+ std::vector<typename T::data_type_t> data_[3];
+ std::vector<typename T::data_type_t> denoised_[3];
+ std::vector<float> noise_psd_[3];
+ int chroma_sub_[2];
+ float *noise_psd_ptrs_[3];
+ int stride_[3];
+};
+
+TYPED_TEST_SUITE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 18, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 48, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ int chroma_sub[2] = { 1, 0 };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+
+ chroma_sub[0] = 0;
+ chroma_sub[1] = 1;
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+ const int width = this->kWidth;
+ const int height = this->kHeight;
+ const int block_size = this->kBlockSize;
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ const int ret = aom_wiener_denoise_2d(
+ data_ptrs, denoised_ptrs, width, height, this->stride_, this->chroma_sub_,
+ this->noise_psd_ptrs_, block_size, this->kBitDepth, this->kUseHighBD);
+ EXPECT_EQ(1, ret);
+
+ // Check the noise on the denoised image (from the analytical gradient)
+ // and make sure that it is less than what we added.
+ for (int c = 0; c < 3; ++c) {
+ std::vector<double> measured_noise(width * height);
+
+ double var = 0;
+ const int shift = (c > 0);
+ for (int x = 0; x < (width >> shift); ++x) {
+ for (int y = 0; y < (height >> shift); ++y) {
+ const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+ x * this->kScaleNoise;
+ var += diff * diff;
+ measured_noise[y * width + x] = diff;
+ }
+ }
+ var /= (width * height);
+ const double std = sqrt(std::max(0.0, var));
+ EXPECT_LE(std, 1.25f * this->kScaleNoise);
+ if (c == 0) {
+ std::vector<float> measured_psd =
+ get_noise_psd(&measured_noise[0], width, height, block_size);
+ std::vector<double> measured_psd_d(block_size * block_size);
+ std::vector<double> noise_psd_d(block_size * block_size);
+ std::copy(measured_psd.begin(), measured_psd.end(),
+ measured_psd_d.begin());
+ std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+ noise_psd_d.begin());
+ EXPECT_LT(
+ aom_normalized_cross_correlation(&measured_psd_d[0], &noise_psd_d[0],
+ (int)(noise_psd_d.size())),
+ 0.35);
+ }
+ }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize,
+ InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+ AllBitDepthParams);
diff --git a/third_party/aom/test/obmc_sad_test.cc b/third_party/aom/test/obmc_sad_test.cc
new file mode 100644
index 0000000000..967b677666
--- /dev/null
+++ b/third_party/aom/test/obmc_sad_test.cc
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Element count of a MAX_SB_SIZE x MAX_SB_SIZE block; used to size the
+// pre/wsrc/mask buffers declared by the tests in this file.
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+// Number of randomized trials each RandomValues test performs.
+constexpr int kIterations = 1000;
+// Weight bound: wsrc factors and mask entries are drawn up to kMaskMax^2.
+constexpr int kMaskMax = 64;
+
+// Common signature of the OBMC SAD kernels compared by these tests.
+using ObmcSadF = unsigned int (*)(const uint8_t *pre, int pre_stride,
+                                  const int32_t *wsrc, const int32_t *mask);
+// One test parameter: a reference kernel paired with the kernel under test.
+using TestFuncs = libaom_test::FuncParam<ObmcSadF>;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// Fixture for the 8-bit OBMC SAD kernels. It may legitimately end up with no
+// instantiation when none of the HAVE_* blocks below is enabled.
+class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadTest);
+
+// Compares the kernel under test against the reference kernel on kIterations
+// sets of pseudo-random 8-bit inputs, each with a freshly drawn pre_stride.
+TEST_P(ObmcSadTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_.Rand8();
+      // Draw the two factors in separate statements. The operands of `*` are
+      // evaluated in an unspecified order, so drawing both inside a single
+      // expression would make the generated data depend on the compiler.
+      const int sample = rng_.Rand8();
+      const int weight = rng_(kMaskMax * kMaskMax + 1);
+      wsrc[i] = sample * weight;
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+// Compares the kernel under test against the reference kernel with every
+// input at its largest value, for each pre_stride in [0, MAX_SB_SIZE).
+TEST_P(ObmcSadTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  // The inputs do not depend on the stride and the kernels only receive
+  // pointers to const, so the buffers are filled once rather than per stride.
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pre[i] = UINT8_MAX;
+    wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+    mask[i] = kMaskMax * kMaskMax;
+  }
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+// (C reference, SSE4.1) kernel pairs, one per OBMC SAD block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcSadTest::ParamType sse4_functions[] = {
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
+ TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
+ TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
+ TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
+ TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
+ TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
+ TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
+ TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
+ TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
+ TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
+ TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
+ TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
+ TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
+ TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_sse4_1),
+ TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_sse4_1),
+ TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_sse4_1),
+ TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_sse4_1),
+ TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_sse4_1),
+ TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_sse4_1),
+};
+
+// Runs every ObmcSadTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadTest,
+ ::testing::ValuesIn(sse4_functions));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+// (C reference, AVX2) kernel pairs, one per OBMC SAD block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcSadTest::ParamType avx2_functions[] = {
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
+ TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
+ TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
+ TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
+ TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
+ TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
+ TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
+ TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
+ TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
+ TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
+ TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
+ TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
+ TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
+ TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_avx2),
+ TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_avx2),
+ TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_avx2),
+ TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_avx2),
+ TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_avx2),
+ TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_avx2),
+};
+
+// Runs every ObmcSadTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
+ ::testing::ValuesIn(avx2_functions));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+// (C reference, NEON) kernel pairs, one per OBMC SAD block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcSadTest::ParamType neon_functions[] = {
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_neon),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_neon),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_neon),
+ TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_neon),
+ TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_neon),
+ TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_neon),
+ TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_neon),
+ TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_neon),
+ TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_neon),
+ TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_neon),
+ TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_neon),
+ TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_neon),
+ TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_neon),
+ TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_neon),
+ TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_neon),
+ TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_neon),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_neon),
+ TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_neon),
+ TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_neon),
+ TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_neon),
+ TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_neon),
+ TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_neon),
+};
+
+// Runs every ObmcSadTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcSadTest,
+ ::testing::ValuesIn(neon_functions));
+#endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+// Fixture for the high bit-depth OBMC SAD kernels. They share the ObmcSadF
+// signature; the tests pass uint16_t sample buffers through
+// CONVERT_TO_BYTEPTR. The suite is allowed to have no instantiation.
+class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcSadHBDTest);
+
+// Compares the kernel under test against the reference kernel on kIterations
+// sets of pseudo-random inputs whose samples are drawn with rng_(1 << 12),
+// each with a freshly drawn pre_stride.
+TEST_P(ObmcSadHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_(1 << 12);
+      // Draw the two factors in separate statements. The operands of `*` are
+      // evaluated in an unspecified order, so drawing both inside a single
+      // expression would make the generated data depend on the compiler.
+      const int sample = rng_(1 << 12);
+      const int weight = rng_(kMaskMax * kMaskMax + 1);
+      wsrc[i] = sample * weight;
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+// Compares the kernel under test against the reference kernel with every
+// input at its largest 12-bit value, for each pre_stride in [0, MAX_SB_SIZE).
+TEST_P(ObmcSadHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  // The inputs do not depend on the stride and the kernels only receive
+  // pointers to const, so the buffers are filled once rather than per stride.
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pre[i] = (1 << 12) - 1;
+    wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
+    mask[i] = kMaskMax * kMaskMax;
+  }
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_NEON
+// (C reference, NEON) kernel pairs for the high bit-depth OBMC SAD kernels.
+// The table is const, like the 8-bit tables in this file: it is only read
+// when the suite is instantiated. The position of an entry determines its
+// gtest parameter index.
+const ObmcSadHBDTest::ParamType neon_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_neon),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_neon),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_neon),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_neon),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_neon),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_neon),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_neon),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_neon),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_neon),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_neon),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_neon),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_neon),
+#if !CONFIG_REALTIME_ONLY
+  // The 4:1 and 1:4 block sizes are only listed for non-realtime builds.
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_neon),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_neon),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_neon),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_neon),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_neon),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_neon),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+// Runs every ObmcSadHBDTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcSadHBDTest,
+                         ::testing::ValuesIn(neon_functions_hbd));
+#endif  // HAVE_NEON
+
+#if HAVE_SSE4_1
+// (C reference, SSE4.1) kernel pairs for the high bit-depth OBMC SAD kernels.
+// The table is const, like the 8-bit tables in this file: it is only read
+// when the suite is instantiated. The position of an entry determines its
+// gtest parameter index.
+const ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1),
+
+  // Block sizes with a 4:1 or 1:4 aspect ratio.
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_sse4_1),
+};
+
+// Runs every ObmcSadHBDTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadHBDTest,
+                         ::testing::ValuesIn(sse4_functions_hbd));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+// (C reference, AVX2) kernel pairs for the high bit-depth OBMC SAD kernels.
+// The table is const, like the 8-bit tables in this file: it is only read
+// when the suite is instantiated. The position of an entry determines its
+// gtest parameter index.
+const ObmcSadHBDTest::ParamType avx2_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2),
+
+  // Block sizes with a 4:1 or 1:4 aspect ratio.
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_avx2),
+};
+
+// Runs every ObmcSadHBDTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadHBDTest,
+                         ::testing::ValuesIn(avx2_functions_hbd));
+#endif  // HAVE_AVX2
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/obmc_variance_test.cc b/third_party/aom/test/obmc_variance_test.cc
new file mode 100644
index 0000000000..5f21a8a6c1
--- /dev/null
+++ b/third_party/aom/test/obmc_variance_test.cc
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Element count of a MAX_SB_SIZE x MAX_SB_SIZE block; used to size the
+// pre/wsrc/mask buffers declared by the tests in this file.
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+// Number of randomized trials each RandomValues test performs.
+constexpr int kIterations = 1000;
+// Weight bound: wsrc factors and mask entries are drawn up to kMaskMax^2.
+constexpr int kMaskMax = 64;
+
+// Common signature of the OBMC variance kernels compared by these tests; the
+// kernel returns one value and writes a second one through `sse`.
+using ObmcVarF = unsigned int (*)(const uint8_t *pre, int pre_stride,
+                                  const int32_t *wsrc, const int32_t *mask,
+                                  unsigned int *sse);
+// One test parameter: a reference kernel paired with the kernel under test.
+using TestFuncs = libaom_test::FuncParam<ObmcVarF>;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// Fixture for the 8-bit OBMC variance kernels. It may legitimately end up
+// with no instantiation when none of the HAVE_* blocks below is enabled.
+class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceTest);
+
+// Compares both outputs (return value and *sse) of the kernel under test
+// against the reference kernel on kIterations sets of pseudo-random 8-bit
+// inputs, each with a freshly drawn pre_stride.
+TEST_P(ObmcVarianceTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_.Rand8();
+      // Draw the two factors in separate statements. The operands of `*` are
+      // evaluated in an unspecified order, so drawing both inside a single
+      // expression would make the generated data depend on the compiler.
+      const int sample = this->rng_.Rand8();
+      const int weight = this->rng_(kMaskMax * kMaskMax + 1);
+      wsrc[i] = sample * weight;
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+// Compares both outputs (return value and *sse) of the kernel under test
+// against the reference kernel with every input at its largest value, for
+// each pre_stride in [0, MAX_SB_SIZE).
+TEST_P(ObmcVarianceTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  // The inputs do not depend on the stride and the kernels only receive
+  // pointers to const, so the buffers are filled once rather than per stride.
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pre[i] = UINT8_MAX;
+    wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+    mask[i] = kMaskMax * kMaskMax;
+  }
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+// Benchmark, disabled by default: times one million calls of the reference
+// kernel and of the kernel under test on the same pseudo-random input, then
+// prints both durations and their ratio. Nothing is asserted.
+TEST_P(ObmcVarianceTest, DISABLED_Speed) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+  for (int idx = 0; idx < MAX_SB_SQUARE; ++idx) {
+    pre[idx] = rng_.Rand8();
+    wsrc[idx] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
+    mask[idx] = rng_(kMaskMax * kMaskMax + 1);
+  }
+
+  const int num_loops = 1000000;
+  unsigned int ref_sse, tst_sse;
+
+  // Time the reference kernel.
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (int run = 0; run < num_loops; ++run) {
+    params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+  // Time the kernel under test.
+  aom_usec_timer test_timer;
+  aom_usec_timer_start(&test_timer);
+  for (int run = 0; run < num_loops; ++run) {
+    params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse);
+  }
+  aom_usec_timer_mark(&test_timer);
+  const int simd_usec = static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+  const double gain = static_cast<double>(ref_usec) / simd_usec;
+  printf("c_time=%d \t simd_time=%d \t gain=%f \n", ref_usec, simd_usec, gain);
+}
+
+#if HAVE_SSE4_1
+// (C reference, SSE4.1) kernel pairs, one per OBMC variance block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcVarianceTest::ParamType sse4_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_sse4_1),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_sse4_1),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_sse4_1),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_sse4_1),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_sse4_1),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_sse4_1),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1),
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_sse4_1),
+ TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_sse4_1),
+ TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_sse4_1),
+ TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_sse4_1),
+ TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_sse4_1),
+ TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_sse4_1),
+};
+
+// Runs every ObmcVarianceTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceTest,
+ ::testing::ValuesIn(sse4_functions));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+// (C reference, AVX2) kernel pairs, one per OBMC variance block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcVarianceTest::ParamType avx2_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2),
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_avx2),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_avx2),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_avx2),
+ TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_avx2),
+ TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_avx2),
+ TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_avx2),
+ TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_avx2),
+ TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_avx2),
+};
+
+// Runs every ObmcVarianceTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest,
+ ::testing::ValuesIn(avx2_functions));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+// (C reference, NEON) kernel pairs, one per OBMC variance block size listed.
+// The position of an entry determines its gtest parameter index.
+const ObmcVarianceTest::ParamType neon_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_neon),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_neon),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_neon),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_neon),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_neon),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_neon),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_neon),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_neon),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_neon),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_neon),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_neon),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_neon),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_neon),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_neon),
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_neon),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_neon),
+
+ // Block sizes with a 4:1 or 1:4 aspect ratio.
+ TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_neon),
+ TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_neon),
+ TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_neon),
+ TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_neon),
+ TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_neon),
+ TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_neon),
+};
+
+// Runs every ObmcVarianceTest case once per entry of the table above.
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcVarianceTest,
+ ::testing::ValuesIn(neon_functions));
+#endif // HAVE_NEON
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
+// Fixture for the high bit-depth OBMC variance kernels. They share the
+// ObmcVarF signature; the tests pass uint16_t sample buffers through
+// CONVERT_TO_BYTEPTR and read the sample bit depth from params_.bit_depth.
+// The suite is allowed to have no instantiation.
+class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ObmcVarianceHBDTest);
+
+// Compares both outputs (return value and *sse) of the kernel under test
+// against the reference kernel on kIterations sets of pseudo-random inputs
+// whose samples are drawn with rng_(1 << params_.bit_depth), each with a
+// freshly drawn pre_stride.
+TEST_P(ObmcVarianceHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_(1 << params_.bit_depth);
+      // Draw the two factors in separate statements. The operands of `*` are
+      // evaluated in an unspecified order, so drawing both inside a single
+      // expression would make the generated data depend on the compiler.
+      const int sample = this->rng_(1 << params_.bit_depth);
+      const int weight = this->rng_(kMaskMax * kMaskMax + 1);
+      wsrc[i] = sample * weight;
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(
+        CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+                                                        pre_stride, wsrc, mask,
+                                                        &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+// Compares both outputs (return value and *sse) of the kernel under test
+// against the reference kernel with every input at its largest value for
+// params_.bit_depth, for each pre_stride in [0, MAX_SB_SIZE).
+TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  // The inputs do not depend on the stride and the kernels only receive
+  // pointers to const, so the buffers are filled once rather than per stride.
+  const int max_sample = (1 << params_.bit_depth) - 1;
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pre[i] = max_sample;
+    wsrc[i] = max_sample * kMaskMax * kMaskMax;
+    mask[i] = kMaskMax * kMaskMax;
+  }
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(
+        CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+                                                        pre_stride, wsrc, mask,
+                                                        &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+#if HAVE_NEON
+ObmcVarianceHBDTest::ParamType neon_functions_hbd[] = {
+ TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+ aom_highbd_8_obmc_variance128x128_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+ aom_highbd_8_obmc_variance128x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+ aom_highbd_8_obmc_variance64x128_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+ aom_highbd_8_obmc_variance64x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+ aom_highbd_8_obmc_variance64x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+ aom_highbd_8_obmc_variance32x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+ aom_highbd_8_obmc_variance32x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+ aom_highbd_8_obmc_variance32x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+ aom_highbd_8_obmc_variance16x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+ aom_highbd_8_obmc_variance16x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+ aom_highbd_8_obmc_variance16x8_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+ aom_highbd_8_obmc_variance8x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x8_c, aom_highbd_8_obmc_variance8x8_neon,
+ 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x4_c, aom_highbd_8_obmc_variance8x4_neon,
+ 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x8_c, aom_highbd_8_obmc_variance4x8_neon,
+ 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x4_c, aom_highbd_8_obmc_variance4x4_neon,
+ 8),
+ TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+ aom_highbd_10_obmc_variance128x128_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+ aom_highbd_10_obmc_variance128x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+ aom_highbd_10_obmc_variance64x128_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x64_c,
+ aom_highbd_10_obmc_variance64x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x32_c,
+ aom_highbd_10_obmc_variance64x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x64_c,
+ aom_highbd_10_obmc_variance32x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x32_c,
+ aom_highbd_10_obmc_variance32x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x16_c,
+ aom_highbd_10_obmc_variance32x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x32_c,
+ aom_highbd_10_obmc_variance16x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x16_c,
+ aom_highbd_10_obmc_variance16x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x8_c,
+ aom_highbd_10_obmc_variance16x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x16_c,
+ aom_highbd_10_obmc_variance8x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x8_c,
+ aom_highbd_10_obmc_variance8x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x4_c,
+ aom_highbd_10_obmc_variance8x4_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x8_c,
+ aom_highbd_10_obmc_variance4x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x4_c,
+ aom_highbd_10_obmc_variance4x4_neon, 10),
+ TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+ aom_highbd_12_obmc_variance128x128_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+ aom_highbd_12_obmc_variance128x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+ aom_highbd_12_obmc_variance64x128_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x64_c,
+ aom_highbd_12_obmc_variance64x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x32_c,
+ aom_highbd_12_obmc_variance64x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x64_c,
+ aom_highbd_12_obmc_variance32x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x32_c,
+ aom_highbd_12_obmc_variance32x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x16_c,
+ aom_highbd_12_obmc_variance32x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x32_c,
+ aom_highbd_12_obmc_variance16x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x16_c,
+ aom_highbd_12_obmc_variance16x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x8_c,
+ aom_highbd_12_obmc_variance16x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x16_c,
+ aom_highbd_12_obmc_variance8x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x8_c,
+ aom_highbd_12_obmc_variance8x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x4_c,
+ aom_highbd_12_obmc_variance8x4_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x8_c,
+ aom_highbd_12_obmc_variance4x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x4_c,
+ aom_highbd_12_obmc_variance4x4_neon, 12),
+ TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+ aom_highbd_8_obmc_variance64x16_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+ aom_highbd_8_obmc_variance16x64_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+ aom_highbd_8_obmc_variance32x8_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+ aom_highbd_8_obmc_variance8x32_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+ aom_highbd_8_obmc_variance16x4_neon, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+ aom_highbd_8_obmc_variance4x16_neon, 8),
+ TestFuncs(aom_highbd_10_obmc_variance64x16_c,
+ aom_highbd_10_obmc_variance64x16_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x64_c,
+ aom_highbd_10_obmc_variance16x64_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x8_c,
+ aom_highbd_10_obmc_variance32x8_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x32_c,
+ aom_highbd_10_obmc_variance8x32_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x4_c,
+ aom_highbd_10_obmc_variance16x4_neon, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x16_c,
+ aom_highbd_10_obmc_variance4x16_neon, 10),
+ TestFuncs(aom_highbd_12_obmc_variance64x16_c,
+ aom_highbd_12_obmc_variance64x16_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x64_c,
+ aom_highbd_12_obmc_variance16x64_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x8_c,
+ aom_highbd_12_obmc_variance32x8_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x32_c,
+ aom_highbd_12_obmc_variance8x32_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x4_c,
+ aom_highbd_12_obmc_variance16x4_neon, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x16_c,
+ aom_highbd_12_obmc_variance4x16_neon, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcVarianceHBDTest,  // one parameter set per neon_functions_hbd entry
+ ::testing::ValuesIn(neon_functions_hbd));
+#endif // HAVE_NEON
+
+#if HAVE_SSE4_1
+ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {  // entries: _c function, _sse4_1 function, bit depth as in their names
+ TestFuncs(aom_highbd_8_obmc_variance128x128_c,
+ aom_highbd_8_obmc_variance128x128_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance128x64_c,
+ aom_highbd_8_obmc_variance128x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x128_c,
+ aom_highbd_8_obmc_variance64x128_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x64_c,
+ aom_highbd_8_obmc_variance64x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance64x32_c,
+ aom_highbd_8_obmc_variance64x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x64_c,
+ aom_highbd_8_obmc_variance32x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x32_c,
+ aom_highbd_8_obmc_variance32x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x16_c,
+ aom_highbd_8_obmc_variance32x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x32_c,
+ aom_highbd_8_obmc_variance16x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x16_c,
+ aom_highbd_8_obmc_variance16x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x8_c,
+ aom_highbd_8_obmc_variance16x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x16_c,
+ aom_highbd_8_obmc_variance8x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x8_c,
+ aom_highbd_8_obmc_variance8x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x4_c,
+ aom_highbd_8_obmc_variance8x4_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x8_c,
+ aom_highbd_8_obmc_variance4x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x4_c,
+ aom_highbd_8_obmc_variance4x4_sse4_1, 8),
+ TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+ aom_highbd_10_obmc_variance128x128_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+ aom_highbd_10_obmc_variance128x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+ aom_highbd_10_obmc_variance64x128_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x64_c,
+ aom_highbd_10_obmc_variance64x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x32_c,
+ aom_highbd_10_obmc_variance64x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x64_c,
+ aom_highbd_10_obmc_variance32x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x32_c,
+ aom_highbd_10_obmc_variance32x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x16_c,
+ aom_highbd_10_obmc_variance32x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x32_c,
+ aom_highbd_10_obmc_variance16x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x16_c,
+ aom_highbd_10_obmc_variance16x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x8_c,
+ aom_highbd_10_obmc_variance16x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x16_c,
+ aom_highbd_10_obmc_variance8x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x8_c,
+ aom_highbd_10_obmc_variance8x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x4_c,
+ aom_highbd_10_obmc_variance8x4_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x8_c,
+ aom_highbd_10_obmc_variance4x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x4_c,
+ aom_highbd_10_obmc_variance4x4_sse4_1, 10),
+ TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+ aom_highbd_12_obmc_variance128x128_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+ aom_highbd_12_obmc_variance128x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+ aom_highbd_12_obmc_variance64x128_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x64_c,
+ aom_highbd_12_obmc_variance64x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x32_c,
+ aom_highbd_12_obmc_variance64x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x64_c,
+ aom_highbd_12_obmc_variance32x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x32_c,
+ aom_highbd_12_obmc_variance32x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x16_c,
+ aom_highbd_12_obmc_variance32x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x32_c,
+ aom_highbd_12_obmc_variance16x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x16_c,
+ aom_highbd_12_obmc_variance16x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x8_c,
+ aom_highbd_12_obmc_variance16x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x16_c,
+ aom_highbd_12_obmc_variance8x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x8_c,
+ aom_highbd_12_obmc_variance8x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x4_c,
+ aom_highbd_12_obmc_variance8x4_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x8_c,
+ aom_highbd_12_obmc_variance4x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x4_c,
+ aom_highbd_12_obmc_variance4x4_sse4_1, 12),
+ // 4:1 and 1:4 aspect-ratio block sizes, again for 8, 10 and 12 bits.
+ TestFuncs(aom_highbd_8_obmc_variance64x16_c,
+ aom_highbd_8_obmc_variance64x16_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x64_c,
+ aom_highbd_8_obmc_variance16x64_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance32x8_c,
+ aom_highbd_8_obmc_variance32x8_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance8x32_c,
+ aom_highbd_8_obmc_variance8x32_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance16x4_c,
+ aom_highbd_8_obmc_variance16x4_sse4_1, 8),
+ TestFuncs(aom_highbd_8_obmc_variance4x16_c,
+ aom_highbd_8_obmc_variance4x16_sse4_1, 8),
+ TestFuncs(aom_highbd_10_obmc_variance64x16_c,
+ aom_highbd_10_obmc_variance64x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x64_c,
+ aom_highbd_10_obmc_variance16x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x8_c,
+ aom_highbd_10_obmc_variance32x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x32_c,
+ aom_highbd_10_obmc_variance8x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x4_c,
+ aom_highbd_10_obmc_variance16x4_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x16_c,
+ aom_highbd_10_obmc_variance4x16_sse4_1, 10),
+ TestFuncs(aom_highbd_12_obmc_variance64x16_c,
+ aom_highbd_12_obmc_variance64x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x64_c,
+ aom_highbd_12_obmc_variance16x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x8_c,
+ aom_highbd_12_obmc_variance32x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x32_c,
+ aom_highbd_12_obmc_variance8x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x4_c,
+ aom_highbd_12_obmc_variance16x4_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x16_c,
+ aom_highbd_12_obmc_variance4x16_sse4_1, 12),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest,  // one parameter set per table entry above
+ ::testing::ValuesIn(sse4_functions_hbd));
+#endif // HAVE_SSE4_1
+#endif // CONFIG_AV1_HIGHBITDEPTH && !CONFIG_REALTIME_ONLY
+} // namespace
diff --git a/third_party/aom/test/pickrst_test.cc b/third_party/aom/test/pickrst_test.cc
new file mode 100644
index 0000000000..04b6f45652
--- /dev/null
+++ b/third_party/aom/test/pickrst_test.cc
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_DATA_BLOCK 384  // edge length and row stride of every square test plane allocated below
+
+namespace pickrst_test_lowbd {
+static constexpr int kIterations = 100;  // rounds per correctness test
+// Common signature of av1_lowbd_pixel_proj_error_c() and its SIMD variants.
+using lowbd_pixel_proj_error_func = int64_t (*)(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+// The only test parameter is the function under test.
+using PixelProjErrorTestParam = std::tuple<const lowbd_pixel_proj_error_func>;
+
+// Fixture comparing one av1_lowbd_pixel_proj_error_*() variant with the C code.
+class PixelProjErrorTest
+    : public ::testing::TestWithParam<PixelProjErrorTestParam> {
+ public:
+  // Allocates the four MAX_DATA_BLOCK x MAX_DATA_BLOCK planes the tests fill.
+  void SetUp() override {
+    target_func_ = GET_PARAM(0);
+    const size_t kNumSamples = MAX_DATA_BLOCK * MAX_DATA_BLOCK;
+    src_ = static_cast<uint8_t *>(aom_malloc(kNumSamples * sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = static_cast<uint8_t *>(aom_malloc(kNumSamples * sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  // Runs even if SetUp() failed; relies on aom_free(nullptr) being a no-op.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunPixelProjErrorTest(int32_t run_times);
+  void RunPixelProjErrorTest_ExtremeValues();
+
+ private:
+  lowbd_pixel_proj_error_func target_func_ = nullptr;  // function under test
+  libaom_test::ACMRandom rng_;
+  uint8_t *src_ = nullptr;   // all four planes start as nullptr so TearDown()
+  uint8_t *dgd_ = nullptr;   // never frees an indeterminate pointer
+  int32_t *flt0_ = nullptr;  // passed as flt0
+  int32_t *flt1_ = nullptr;  // passed as flt1
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjErrorTest);
+
+void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) {
+  // run_times == 1 is the correctness pass (random block size, kIterations
+  // rounds); any other value runs 4 rounds on a fixed 128x128 block.
+  const bool randomize = run_times == 1;
+  const int h_end = randomize ? (rng_.Rand16() % MAX_DATA_BLOCK) + 1 : 128;
+  const int v_end = randomize ? (rng_.Rand16() % MAX_DATA_BLOCK) + 1 : 128;
+  const int kStride = MAX_DATA_BLOCK;  // row pitch shared by all four planes
+  sgr_params_type params;
+  int xq[2];
+  const int iters = randomize ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t err_ref = 0, err_test = 1;
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand8();
+      src_[i] = rng_.Rand8();
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    // The four fixed-size rounds use (r0, r1) = (0,0), (1,0), (0,1), (1,1).
+    params.r[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.r[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    params.s[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      err_ref = av1_lowbd_pixel_proj_error_c(src_, h_end, v_end, kStride, dgd_,
+                                             kStride, flt0_, kStride, flt1_,
+                                             kStride, xq, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      err_test =
+          target_func_(src_, h_end, v_end, kStride, dgd_, kStride, flt0_,
+                       kStride, flt1_, kStride, xq, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      // aom_usec_timer measures microseconds, so the totals are labelled "us".
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    }
+    ASSERT_EQ(err_ref, err_test);
+  }
+}
+
+void PixelProjErrorTest::RunPixelProjErrorTest_ExtremeValues() {
+  // Maximal pixel difference on a 192x192 block: the dat8 plane is all 0 and
+  // the src8 plane all 255, while flt0/flt1, xq and params stay randomized.
+  // Runs kIterations rounds and stops early on the first fatal failure.
+  const int kWidth = 192;
+  const int kHeight = 192;
+  const int kStride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  int xq[2];
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int64_t err_ref = 0, err_test = 1;
+    // Constant pixel planes, fresh random filter outputs every round.
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 255;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    params.r[0] = rng_.Rand8() % MAX_RADIUS;
+    params.r[1] = rng_.Rand8() % MAX_RADIUS;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+
+    // C reference.
+    err_ref = av1_lowbd_pixel_proj_error_c(src_, kWidth, kHeight, kStride, dgd_,
+                                           kStride, flt0_, kStride, flt1_,
+                                           kStride, xq, &params);
+
+    // Implementation under test.
+    err_test =
+        target_func_(src_, kWidth, kHeight, kStride, dgd_, kStride, flt0_,
+                     kStride, flt1_, kStride, xq, &params);
+
+    // Both implementations must agree exactly.
+    ASSERT_EQ(err_ref, err_test);
+  }
+}
+
+TEST_P(PixelProjErrorTest, RandomValues) { RunPixelProjErrorTest(1); }  // 1 => randomized correctness pass
+// dat8 plane all 0 against src8 plane all 255.
+TEST_P(PixelProjErrorTest, ExtremeValues) {
+ RunPixelProjErrorTest_ExtremeValues();
+}
+// Benchmark; the DISABLED_ prefix keeps GoogleTest from running it by default.
+TEST_P(PixelProjErrorTest, DISABLED_Speed) { RunPixelProjErrorTest(200000); }
+// One instantiation per SIMD variant whose HAVE_* macro is non-zero.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_neon));
+#endif // HAVE_NEON
+
+} // namespace pickrst_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace pickrst_test_highbd {
+static constexpr int kIterations = 100;  // rounds per correctness test
+// Common signature of av1_highbd_pixel_proj_error_c() and its SIMD variants.
+using highbd_pixel_proj_error_func = int64_t (*)(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+// The only test parameter is the function under test.
+using PixelProjErrorTestParam = std::tuple<const highbd_pixel_proj_error_func>;
+
+// Fixture comparing one av1_highbd_pixel_proj_error_*() variant with the C code.
+class PixelProjHighbdErrorTest
+    : public ::testing::TestWithParam<PixelProjErrorTestParam> {
+ public:
+  // Allocates the four MAX_DATA_BLOCK x MAX_DATA_BLOCK planes the tests fill.
+  void SetUp() override {
+    target_func_ = GET_PARAM(0);
+    const size_t kNumSamples = MAX_DATA_BLOCK * MAX_DATA_BLOCK;
+    src_ = static_cast<uint16_t *>(aom_malloc(kNumSamples * sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = static_cast<uint16_t *>(aom_malloc(kNumSamples * sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  // Runs even if SetUp() failed; relies on aom_free(nullptr) being a no-op.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunPixelProjErrorTest(int32_t run_times);
+  void RunPixelProjErrorTest_ExtremeValues();
+
+ private:
+  highbd_pixel_proj_error_func target_func_ = nullptr;  // function under test
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_ = nullptr;  // all four planes start as nullptr so TearDown()
+  uint16_t *dgd_ = nullptr;  // never frees an indeterminate pointer
+  int32_t *flt0_ = nullptr;  // passed as flt0
+  int32_t *flt1_ = nullptr;  // passed as flt1
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(PixelProjHighbdErrorTest);
+
+void PixelProjHighbdErrorTest::RunPixelProjErrorTest(int32_t run_times) {
+  // run_times == 1 is the correctness pass (random block size, kIterations
+  // rounds); any other value runs 4 rounds on a fixed 128x128 block.
+  const bool randomize = run_times == 1;
+  const int h_end = randomize ? (rng_.Rand16() % MAX_DATA_BLOCK) + 1 : 128;
+  const int v_end = randomize ? (rng_.Rand16() % MAX_DATA_BLOCK) + 1 : 128;
+  const int kStride = MAX_DATA_BLOCK;  // row pitch shared by all four planes
+  sgr_params_type params;
+  int xq[2];
+  const int iters = randomize ? kIterations : 4;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t err_ref = 0, err_test = 1;
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand16() % (1 << 12);
+      src_[i] = rng_.Rand16() % (1 << 12);
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    params.r[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.r[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    params.s[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+    params.s[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+    uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src8 = CONVERT_TO_BYTEPTR(src_);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      err_ref = av1_highbd_pixel_proj_error_c(
+          src8, h_end, v_end, kStride, dgd8, kStride, flt0_, kStride, flt1_,
+          kStride, xq, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      err_test =
+          target_func_(src8, h_end, v_end, kStride, dgd8, kStride, flt0_,
+                       kStride, flt1_, kStride, xq, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      // aom_usec_timer measures microseconds, so the totals are labelled "us".
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    }
+    ASSERT_EQ(err_ref, err_test);
+  }
+}
+
+void PixelProjHighbdErrorTest::RunPixelProjErrorTest_ExtremeValues() {
+  // Maximal 12-bit pixel difference on a 192x192 block: the dat8 plane is all
+  // 0 and the src8 plane all 4095, while flt0/flt1, xq and params stay
+  // randomized. Runs kIterations rounds, stopping on the first fatal failure.
+  const int kWidth = 192;
+  const int kHeight = 192;
+  const int kStride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  int xq[2];
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int64_t err_ref = 0, err_test = 1;
+    // Constant pixel planes, fresh random filter outputs every round.
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = (1 << 12) - 1;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+    params.r[0] = rng_.Rand8() % MAX_RADIUS;
+    params.r[1] = rng_.Rand8() % MAX_RADIUS;
+    params.s[0] = rng_.Rand8() % MAX_RADIUS;
+    params.s[1] = rng_.Rand8() % MAX_RADIUS;
+    uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src8 = CONVERT_TO_BYTEPTR(src_);
+
+    // C reference.
+    err_ref = av1_highbd_pixel_proj_error_c(
+        src8, kWidth, kHeight, kStride, dgd8, kStride, flt0_, kStride, flt1_,
+        kStride, xq, &params);
+
+    // Implementation under test.
+    err_test =
+        target_func_(src8, kWidth, kHeight, kStride, dgd8, kStride, flt0_,
+                     kStride, flt1_, kStride, xq, &params);
+
+    ASSERT_EQ(err_ref, err_test);
+  }
+}
+
+TEST_P(PixelProjHighbdErrorTest, RandomValues) { RunPixelProjErrorTest(1); }  // 1 => randomized correctness pass
+// dat8 plane all 0 against src8 plane all 4095.
+TEST_P(PixelProjHighbdErrorTest, ExtremeValues) {
+ RunPixelProjErrorTest_ExtremeValues();
+}
+// Benchmark; the DISABLED_ prefix keeps GoogleTest from running it by default.
+TEST_P(PixelProjHighbdErrorTest, DISABLED_Speed) {
+ RunPixelProjErrorTest(200000);
+}
+// One instantiation per SIMD variant whose HAVE_* macro is non-zero.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjHighbdErrorTest,
+ ::testing::Values(av1_highbd_pixel_proj_error_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjHighbdErrorTest,
+ ::testing::Values(av1_highbd_pixel_proj_error_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, PixelProjHighbdErrorTest,
+ ::testing::Values(av1_highbd_pixel_proj_error_neon));
+#endif // HAVE_NEON
+
+} // namespace pickrst_test_highbd
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+////////////////////////////////////////////////////////////////////////////////
+// Get_proj_subspace_Test
+////////////////////////////////////////////////////////////////////////////////
+
+namespace get_proj_subspace_test_lowbd {
+static constexpr int kIterations = 100;  // rounds per correctness test
+// Common signature of av1_calc_proj_params_c() and its SIMD variants.
+using set_get_proj_subspace = void (*)(const uint8_t *src8, int width,
+                                       int height, int src_stride,
+                                       const uint8_t *dat8, int dat_stride,
+                                       int32_t *flt0, int flt0_stride,
+                                       int32_t *flt1, int flt1_stride,
+                                       int64_t H[2][2], int64_t C[2],
+                                       const sgr_params_type *params);
+// The only test parameter is the function under test.
+using GetProjSubspaceTestParam = std::tuple<const set_get_proj_subspace>;
+
+// Fixture comparing one av1_calc_proj_params_*() variant with the C code.
+class GetProjSubspaceTest
+    : public ::testing::TestWithParam<GetProjSubspaceTestParam> {
+ public:
+  // Allocates the four MAX_DATA_BLOCK x MAX_DATA_BLOCK planes the tests fill.
+  void SetUp() override {
+    target_func_ = GET_PARAM(0);
+    const size_t kNumSamples = MAX_DATA_BLOCK * MAX_DATA_BLOCK;
+    src_ = static_cast<uint8_t *>(aom_malloc(kNumSamples * sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = static_cast<uint8_t *>(aom_malloc(kNumSamples * sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  // Runs even if SetUp() failed; relies on aom_free(nullptr) being a no-op.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTest(int32_t run_times);
+  void RunGetProjSubspaceTest_ExtremeValues();
+
+ private:
+  set_get_proj_subspace target_func_ = nullptr;  // function under test
+  libaom_test::ACMRandom rng_;
+  uint8_t *src_ = nullptr;   // all four planes start as nullptr so TearDown()
+  uint8_t *dgd_ = nullptr;   // never frees an indeterminate pointer
+  int32_t *flt0_ = nullptr;  // passed as flt0
+  int32_t *flt1_ = nullptr;  // passed as flt1
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTest);
+
+void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) {
+  // run_times == 1 is the correctness pass (random block size, kIterations
+  // rounds); any other value runs 3 rounds on a fixed 128x128 block.
+  const bool randomize = run_times == 1;
+  // Random dimensions are rounded down to a multiple of 8 and so may be 0.
+  const int h_end = randomize ? ((rng_.Rand16() % MAX_DATA_BLOCK) & ~7) : 128;
+  const int v_end = randomize ? ((rng_.Rand16() % MAX_DATA_BLOCK) & ~7) : 128;
+  const int kStride = MAX_DATA_BLOCK;  // row pitch shared by all four planes
+  sgr_params_type params;
+  const int iters = randomize ? kIterations : 3;
+  // (r0, r1) for the three fixed-size rounds: (1,1), (1,0), (0,1).
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
+
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand8();
+      src_[i] = rng_.Rand8();
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+
+    params.r[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+    params.r[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
+
+    // Note: v_end is passed in the `width` position and h_end in `height`;
+    // both implementations receive the same values.
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_c(src_, v_end, h_end, kStride, dgd_, kStride, flt0_,
+                             kStride, flt1_, kStride, H_ref, C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src_, v_end, h_end, kStride, dgd_, kStride, flt0_, kStride,
+                   flt1_, kStride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      // aom_usec_timer measures microseconds, so the totals are labelled "us".
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      // Outputs are only compared when run_times <= 10.
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() {
+  // Maximal pixel difference over the whole MAX_DATA_BLOCK-square buffer: the
+  // dat8 plane is all 0 and the src8 plane all 255; flt0/flt1 stay random.
+  // Runs kIterations rounds and stops early on the first fatal failure.
+  const int kWidth = MAX_DATA_BLOCK;
+  const int kHeight = MAX_DATA_BLOCK;
+  const int kStride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  // (r0, r1) pairs used round-robin: (1,1), (1,0), (0,1).
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    // Constant pixel planes, fresh random filter outputs every round.
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 255;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = kR0[iter % 3];
+    params.r[1] = kR1[iter % 3];
+
+    // C reference.
+    av1_calc_proj_params_c(src_, kWidth, kHeight, kStride, dgd_, kStride,
+                           flt0_, kStride, flt1_, kStride, H_ref, C_ref,
+                           &params);
+
+    // Implementation under test.
+    target_func_(src_, kWidth, kHeight, kStride, dgd_, kStride, flt0_, kStride,
+                 flt1_, kStride, H_test, C_test, &params);
+
+    // Every H and C entry must match exactly.
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTest, RandomValues) { RunGetProjSubspaceTest(1); }  // 1 => randomized correctness pass
+// dat8 plane all 0 against src8 plane all 255, full-size block.
+TEST_P(GetProjSubspaceTest, ExtremeValues) {
+ RunGetProjSubspaceTest_ExtremeValues();
+}
+// Benchmark; the DISABLED_ prefix keeps GoogleTest from running it by default.
+TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); }
+// One instantiation per SIMD variant whose HAVE_* macro is non-zero.
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, GetProjSubspaceTest,
+ ::testing::Values(av1_calc_proj_params_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest,
+ ::testing::Values(av1_calc_proj_params_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTest,
+ ::testing::Values(av1_calc_proj_params_neon));
+#endif // HAVE_NEON
+
+} // namespace get_proj_subspace_test_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace get_proj_subspace_test_hbd {
+static constexpr int kIterations = 100;  // rounds per correctness test
+// Common signature of av1_calc_proj_params_high_bd_c() and its SIMD variants.
+using set_get_proj_subspace_hbd = void (*)(const uint8_t *src8, int width,
+                                           int height, int src_stride,
+                                           const uint8_t *dat8, int dat_stride,
+                                           int32_t *flt0, int flt0_stride,
+                                           int32_t *flt1, int flt1_stride,
+                                           int64_t H[2][2], int64_t C[2],
+                                           const sgr_params_type *params);
+// The only test parameter is the function under test.
+using GetProjSubspaceHBDTestParam = std::tuple<const set_get_proj_subspace_hbd>;
+
+// Fixture comparing one av1_calc_proj_params_high_bd_*() variant with the C code.
+class GetProjSubspaceTestHBD
+    : public ::testing::TestWithParam<GetProjSubspaceHBDTestParam> {
+ public:
+  // Allocates the four MAX_DATA_BLOCK x MAX_DATA_BLOCK planes the tests fill.
+  void SetUp() override {
+    target_func_ = GET_PARAM(0);
+    const size_t kNumSamples = MAX_DATA_BLOCK * MAX_DATA_BLOCK;
+    src_ = static_cast<uint16_t *>(aom_malloc(kNumSamples * sizeof(*src_)));
+    ASSERT_NE(src_, nullptr);
+    dgd_ = static_cast<uint16_t *>(aom_malloc(kNumSamples * sizeof(*dgd_)));
+    ASSERT_NE(dgd_, nullptr);
+    flt0_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt0_)));
+    ASSERT_NE(flt0_, nullptr);
+    flt1_ = static_cast<int32_t *>(aom_malloc(kNumSamples * sizeof(*flt1_)));
+    ASSERT_NE(flt1_, nullptr);
+  }
+  // Runs even if SetUp() failed; relies on aom_free(nullptr) being a no-op.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dgd_);
+    aom_free(flt0_);
+    aom_free(flt1_);
+  }
+  void RunGetProjSubspaceTestHBD(int32_t run_times);
+  void RunGetProjSubspaceTestHBD_ExtremeValues();
+
+ private:
+  set_get_proj_subspace_hbd target_func_ = nullptr;  // function under test
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_ = nullptr;  // all four planes start as nullptr so TearDown()
+  uint16_t *dgd_ = nullptr;  // never frees an indeterminate pointer
+  int32_t *flt0_ = nullptr;  // passed as flt0
+  int32_t *flt1_ = nullptr;  // passed as flt1
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GetProjSubspaceTestHBD);
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD(int32_t run_times) {
+  // run_times == 1 is the correctness pass (random block size, kIterations
+  // rounds); any other value runs 3 rounds on a fixed 128x128 block.
+  const bool randomize = run_times == 1;
+  // Random dimensions are rounded down to a multiple of 8 and so may be 0.
+  const int h_end = randomize ? ((rng_.Rand16() % MAX_DATA_BLOCK) & ~7) : 128;
+  const int v_end = randomize ? ((rng_.Rand16() % MAX_DATA_BLOCK) & ~7) : 128;
+  const int kStride = MAX_DATA_BLOCK;  // row pitch shared by all four planes
+  sgr_params_type params;
+  const int iters = randomize ? kIterations : 3;
+  // (r0, r1) for the three fixed-size rounds: (1,1), (1,0), (0,1).
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = rng_.Rand16() % (1 << 12);  // full 12-bit range, 0..4095
+      src_[i] = rng_.Rand16() % (1 << 12);
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = randomize ? (rng_.Rand8() % MAX_RADIUS) : kR0[iter];
+    params.r[1] = randomize ? (rng_.Rand8() % MAX_RADIUS) : kR1[iter];
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    // Note: v_end is passed in the `width` position and h_end in `height`.
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_calc_proj_params_high_bd_c(src, v_end, h_end, kStride, dgd, kStride,
+                                     flt0_, kStride, flt1_, kStride, H_ref,
+                                     C_ref, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(src, v_end, h_end, kStride, dgd, kStride, flt0_, kStride,
+                   flt1_, kStride, H_test, C_test, &params);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      // aom_usec_timer measures microseconds, so the totals are labelled "us".
+      printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", params.r[0],
+             params.r[1], h_end, v_end, time1, time2, time1 / time2);
+    } else {
+      // Outputs are only compared when run_times <= 10.
+      ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+      ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+      ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+      ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+      ASSERT_EQ(C_ref[0], C_test[0]);
+      ASSERT_EQ(C_ref[1], C_test[1]);
+    }
+  }
+}
+
+void GetProjSubspaceTestHBD::RunGetProjSubspaceTestHBD_ExtremeValues() {
+  // Maximal 12-bit pixel difference over the whole MAX_DATA_BLOCK-square
+  // buffer: dat8 plane all 0, src8 plane all 4095; flt0/flt1 stay random.
+  const int kWidth = MAX_DATA_BLOCK;
+  const int kHeight = MAX_DATA_BLOCK;
+  const int kStride = MAX_DATA_BLOCK;
+  sgr_params_type params;
+  // (r0, r1) pairs used round-robin: (1,1), (1,0), (0,1).
+  static constexpr int kR0[3] = { 1, 1, 0 };
+  static constexpr int kR1[3] = { 1, 0, 1 };
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    int64_t C_ref[2] = { 0 }, C_test[2] = { 0 };
+    int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } };
+    int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } };
+    // Constant pixel planes, fresh random filter outputs every round.
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_[i] = 0;
+      src_[i] = 4095;
+      flt0_[i] = rng_.Rand15Signed();
+      flt1_[i] = rng_.Rand15Signed();
+    }
+    params.r[0] = kR0[iter % 3];
+    params.r[1] = kR1[iter % 3];
+    uint8_t *dgd = CONVERT_TO_BYTEPTR(dgd_);
+    uint8_t *src = CONVERT_TO_BYTEPTR(src_);
+
+    // C reference.
+    av1_calc_proj_params_high_bd_c(src, kWidth, kHeight, kStride, dgd, kStride,
+                                   flt0_, kStride, flt1_, kStride, H_ref,
+                                   C_ref, &params);
+
+    // Implementation under test.
+    target_func_(src, kWidth, kHeight, kStride, dgd, kStride, flt0_, kStride,
+                 flt1_, kStride, H_test, C_test, &params);
+
+    // Every H and C entry must match exactly.
+    ASSERT_EQ(H_ref[0][0], H_test[0][0]);
+    ASSERT_EQ(H_ref[0][1], H_test[0][1]);
+    ASSERT_EQ(H_ref[1][0], H_test[1][0]);
+    ASSERT_EQ(H_ref[1][1], H_test[1][1]);
+    ASSERT_EQ(C_ref[0], C_test[0]);
+    ASSERT_EQ(C_ref[1], C_test[1]);
+  }
+}
+
+TEST_P(GetProjSubspaceTestHBD, RandomValues) { RunGetProjSubspaceTestHBD(1); }  // 1 => randomized correctness pass
+// dat8 plane all 0 against src8 plane all 4095, full-size block.
+TEST_P(GetProjSubspaceTestHBD, ExtremeValues) {
+ RunGetProjSubspaceTestHBD_ExtremeValues();
+}
+// Benchmark; the DISABLED_ prefix keeps GoogleTest from running it by default.
+TEST_P(GetProjSubspaceTestHBD, DISABLED_Speed) {
+ RunGetProjSubspaceTestHBD(200000);
+}
+// One instantiation per SIMD variant whose HAVE_* macro is non-zero.
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, GetProjSubspaceTestHBD,
+ ::testing::Values(av1_calc_proj_params_high_bd_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTestHBD,
+ ::testing::Values(av1_calc_proj_params_high_bd_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, GetProjSubspaceTestHBD,
+ ::testing::Values(av1_calc_proj_params_high_bd_neon));
+#endif // HAVE_NEON
+} // namespace get_proj_subspace_test_hbd
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/test/postproc_filters_test.cc b/third_party/aom/test/postproc_filters_test.cc
new file mode 100644
index 0000000000..9584dd8c35
--- /dev/null
+++ b/third_party/aom/test/postproc_filters_test.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+// Encoder fixture exercising the AV1E_SET_SKIP_POSTPROC_FILTERING control in
+// all-intra mode. It is parameterized by (cpu_used, input bit depth), stores
+// one MD5 per compressed frame packet and checks, after every frame, whether
+// the encoder hands back a preview (reconstructed) frame.
+class PostprocFiltersTest
+    : public ::libaom_test::CodecTestWith2Params<int, unsigned int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  PostprocFiltersTest()
+      : EncoderTest(GET_PARAM(0)), set_skip_postproc_filtering_(false),
+        frame_number_(0), cpu_used_(GET_PARAM(1)), bd_(GET_PARAM(2)) {}
+
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kAllIntra);
+    cfg_.g_input_bit_depth = bd_;
+  }
+
+  // Applies the speed / CQ level on the first frame and, when the toggle run
+  // is active, flips the skip control at frames 0, 10 and 20.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    frame_number_ = video->frame();
+    const bool is_first_frame = (frame_number_ == 0);
+    if (is_first_frame) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+    }
+    if (!set_skip_postproc_filtering_) return;
+
+    // Schedule: skip on frames [0, 10), filter on [10, 20), skip from 20 on.
+    switch (frame_number_) {
+      case 0:
+      case 20: encoder->Control(AV1E_SET_SKIP_POSTPROC_FILTERING, 1); break;
+      case 10: encoder->Control(AV1E_SET_SKIP_POSTPROC_FILTERING, 0); break;
+      default: break;
+    }
+  }
+
+  // Hashes the compressed payload of each frame packet and records the digest.
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+  }
+
+  // The preview frame must be null exactly while the skip control is set to 1
+  // (toggle run, frames [0, 10) and [20, ...)); otherwise it must be valid.
+  void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
+    const aom_image_t *img_enc = encoder->GetPreviewFrame();
+    const bool filtering_skipped =
+        set_skip_postproc_filtering_ &&
+        (frame_number_ < 10 || frame_number_ >= 20);
+    if (filtering_skipped) {
+      ASSERT_EQ(img_enc, nullptr);
+    } else {
+      ASSERT_NE(img_enc, nullptr);
+    }
+  }
+
+  // The encoder config flag 'AV1E_SET_SKIP_POSTPROC_FILTERING' can be used to
+  // skip the application of post-processing filters on the reconstructed frame
+  // for ALLINTRA encode. This routine encodes the same clip twice and requires
+  // the per-packet MD5 lists of both encodes to be identical:
+  //   1. control left at its default (filters applied) for all frames;
+  //   2. control enabled and disabled at different frame indices.
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::YUVVideoSource("niklas_640_480_30.yuv",
+                                        AOM_IMG_FMT_I420, 640, 480, 30, 1, 0,
+                                        kFrames));
+    ASSERT_NE(video, nullptr);
+
+    // Pass 1: control never touched.
+    set_skip_postproc_filtering_ = false;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    std::vector<std::string> apply_postproc_filters_md5_enc =
+        std::move(md5_enc_);
+    md5_enc_.clear();
+
+    // Pass 2: control toggled by PreEncodeFrameHook().
+    set_skip_postproc_filtering_ = true;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    std::vector<std::string> toggle_apply_postproc_filters_md5_enc =
+        std::move(md5_enc_);
+    md5_enc_.clear();
+
+    // Both bitstreams must match packet for packet.
+    ASSERT_EQ(apply_postproc_filters_md5_enc,
+              toggle_apply_postproc_filters_md5_enc);
+  }
+
+  bool set_skip_postproc_filtering_;  // true during the toggle encode
+  unsigned int frame_number_;         // index of the frame being encoded
+  std::vector<std::string> md5_enc_;  // MD5 of each emitted frame packet
+
+ private:
+  static constexpr int kFrames = 30;
+  static constexpr unsigned int kCqLevel = 18;
+  int cpu_used_;
+  unsigned int bd_;
+};
+
+class PostprocFiltersTestLarge : public PostprocFiltersTest {};
+
+TEST_P(PostprocFiltersTest, MD5Match) { DoTest(); }
+
+TEST_P(PostprocFiltersTestLarge, MD5Match) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(PostprocFiltersTest, ::testing::Values(9),
+ ::testing::Values(8, 10));
+
+// Test cpu_used 3 and 6.
+AV1_INSTANTIATE_TEST_SUITE(PostprocFiltersTestLarge, ::testing::Values(3, 6),
+ ::testing::Values(8, 10));
+
+} // namespace
diff --git a/third_party/aom/test/quant_test.cc b/third_party/aom/test/quant_test.cc
new file mode 100644
index 0000000000..afbabb3147
--- /dev/null
+++ b/third_party/aom/test/quant_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "av1/encoder/av1_quantize.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const ::libaom_test::TestMode kTestMode[] =
+#if CONFIG_REALTIME_ONLY
+ { ::libaom_test::kRealTime };
+#else
+ { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
+// Encoder fixture that turns on quantization matrices (AV1E_SET_ENABLE_QM)
+// and applies the QM level range selected by each test through DoTest().
+// Parameterized by (test mode, cpu_used).
+class QMTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QMTest() : EncoderTest(GET_PARAM(0)) {}
+  ~QMTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  // All controls are applied once, right before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+
+    encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+    encoder->Control(AV1E_SET_ENABLE_QM, 1);
+    encoder->Control(AV1E_SET_QM_MIN, qm_min_);
+    encoder->Control(AV1E_SET_QM_MAX, qm_max_);
+
+    encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+
+    // In real-time mode these three tools are switched off explicitly.
+    if (mode_ == ::libaom_test::kRealTime) {
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+    }
+  }
+
+  // Stores the requested QM range, sets up a CBR configuration and encodes
+  // 15 frames of the 352x288 test clip.
+  void DoTest(int qm_min, int qm_max) {
+    qm_min_ = qm_min;
+    qm_max_ = qm_max;
+
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int set_cpu_used_;  // value passed to AOME_SET_CPUUSED
+  int qm_min_;        // value passed to AV1E_SET_QM_MIN
+  int qm_max_;        // value passed to AV1E_SET_QM_MAX
+};
+
+// Encodes and decodes with qm_min = 5 and qm_max = 9 without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
+
+// Encodes and decodes with qm_min = 0 and qm_max = 8 without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
+
+// Encodes and decodes with qm_min = 9 and qm_max = 15 without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
+
+AV1_INSTANTIATE_TEST_SUITE(QMTest, ::testing::ValuesIn(kTestMode),
+ ::testing::Range(5, 9));
+
+#if !CONFIG_REALTIME_ONLY
+typedef struct {
+ const unsigned int min_q;
+ const unsigned int max_q;
+} QuantParam;
+
+const QuantParam QuantTestParams[] = {
+ { 0, 10 }, { 0, 60 }, { 20, 35 }, { 35, 50 }, { 50, 63 }
+};
+
+// Writes the min_q / max_q pair of a QuantParam to `os` and returns `os`.
+std::ostream &operator<<(std::ostream &os, const QuantParam &test_arg) {
+  os << "QuantParam { min_q:" << test_arg.min_q;
+  os << " max_q:" << test_arg.max_q << " }";
+  return os;
+}
+
+/*
+ * Fixture that checks whether the base_qindex reported by the decoder for
+ * every frame stays inside the min / max quantizer range configured by the
+ * user. Parameterized by (test mode, quantizer range, rate control mode).
+ */
+class QuantizerBoundsCheckTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 QuantParam, aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  QuantizerBoundsCheckTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        quant_param_(GET_PARAM(2)), quant_bound_violated_(false),
+        rc_end_usage_(GET_PARAM(3)) {}
+  ~QuantizerBoundsCheckTestLarge() override = default;
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.g_threads = 1;
+    cfg_.rc_min_quantizer = quant_param_.min_q;
+    cfg_.rc_max_quantizer = quant_param_.max_q;
+    cfg_.g_lag_in_frames = 35;
+    // A target bitrate is only configured for the non constant-Q modes.
+    if (rc_end_usage_ != AOM_Q) {
+      cfg_.rc_target_bitrate = 400;
+    }
+  }
+
+  bool DoDecode() const override { return true; }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+    encoder->Control(AOME_SET_CPUUSED, 5);
+  }
+
+  // After each successfully decoded frame, reads the last quantizer index
+  // from the decoder and latches quant_bound_violated_ if it falls outside
+  // the qindex range derived from cfg_.rc_min_quantizer / rc_max_quantizer.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK != res_dec) return false;
+
+    aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+    AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_LAST_QUANTIZER,
+                                  &base_qindex_);
+    min_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_min_quantizer);
+    max_bound_qindex_ = av1_quantizer_to_qindex(cfg_.rc_max_quantizer);
+
+    const bool out_of_range = base_qindex_ < min_bound_qindex_ ||
+                              base_qindex_ > max_bound_qindex_;
+    if (out_of_range) quant_bound_violated_ = true;
+    return true;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  const QuantParam quant_param_;  // user-configured min_q / max_q
+  int base_qindex_;               // last quantizer index read from the decoder
+  int min_bound_qindex_;          // qindex corresponding to rc_min_quantizer
+  int max_bound_qindex_;          // qindex corresponding to rc_max_quantizer
+  bool quant_bound_violated_;     // set once any frame leaves the range
+  aom_rc_mode rc_end_usage_;
+};
+
+// Encodes 50 frames of the 352x288 clip at the configured timebase and
+// requires that no decoded frame tripped the quantizer bound check.
+TEST_P(QuantizerBoundsCheckTestLarge, QuantizerBoundsCheckEncodeTest) {
+  const int rate_numerator = cfg_.g_timebase.den;
+  const int rate_denominator = cfg_.g_timebase.num;
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     rate_numerator, rate_denominator, 0, 50);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(quant_bound_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(QuantizerBoundsCheckTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::ValuesIn(QuantTestParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+#endif // !CONFIG_REALTIME_ONLY
+} // namespace
diff --git a/third_party/aom/test/quantize_func_test.cc b/third_party/aom/test/quantize_func_test.cc
new file mode 100644
index 0000000000..328d5b10df
--- /dev/null
+++ b/third_party/aom/test/quantize_func_test.cc
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <algorithm>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+#define QUAN_PARAM_LIST \
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \
+ const int16_t *round_ptr, const int16_t *quant_ptr, \
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \
+ const int16_t *scan, const int16_t *iscan
+
+#define LP_QUANTIZE_PARAM_LIST \
+ const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, \
+ const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, \
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, \
+ const int16_t *iscan
+
+typedef void (*LPQuantizeFunc)(LP_QUANTIZE_PARAM_LIST);
+typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
+typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
+
+#undef LP_QUANTIZE_PARAM_LIST
+
+#define HBD_QUAN_FUNC \
+ fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale)
+
+#define LBD_QUAN_FUNC \
+ fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan)
+
+// Adapts a QuantizeFuncHbd to the QuantizeFunc signature by forwarding every
+// argument and fixing log_scale to 0.
+template <QuantizeFuncHbd fn>
+void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) {
+  fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+     /*log_scale=*/0);
+}
+
+// Adapts a QuantizeFuncHbd to the QuantizeFunc signature by forwarding every
+// argument and fixing log_scale to 1.
+template <QuantizeFuncHbd fn>
+void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) {
+  fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+     /*log_scale=*/1);
+}
+
+// Adapts a QuantizeFuncHbd to the QuantizeFunc signature by forwarding every
+// argument and fixing log_scale to 2.
+template <QuantizeFuncHbd fn>
+void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
+  fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+     /*log_scale=*/2);
+}
+
+enum QuantType { TYPE_B, TYPE_DC, TYPE_FP };
+
+using std::tuple;
+
+template <typename FuncType>
+using QuantizeParam =
+ tuple<FuncType, FuncType, TX_SIZE, QuantType, aom_bit_depth_t>;
+
+typedef struct {
+ QUANTS quant;
+ Dequants dequant;
+} QuanTable;
+
+const int kTestNum = 1000;
+
+#define GET_TEMPLATE_PARAM(k) std::get<k>(this->GetParam())
+
+// Shared fixture for tests that compare a reference quantizer (quant_ref_)
+// with an implementation under test (quant_). The test parameter tuple is
+// (reference function, tested function, transform size, quantizer type,
+// bit depth).
+//
+// coeff_ is one allocation of 6 * coeff_num() elements, used as consecutive
+// regions of coeff_num() elements each:
+//   [input coeffs | qcoeff_ref | dqcoeff_ref | qcoeff | dqcoeff | eob scratch]
+template <typename CoeffType, typename FuncType>
+class QuantizeTestBase
+    : public ::testing::TestWithParam<QuantizeParam<FuncType>> {
+ protected:
+  // qtab_ and coeff_ start out null: if SetUp() aborts on a failed allocation
+  // before both are assigned, TearDown() must not free an indeterminate
+  // pointer (aom_free() is expected to ignore null - confirm).
+  QuantizeTestBase()
+      : qtab_(nullptr), coeff_(nullptr), quant_ref_(GET_TEMPLATE_PARAM(0)),
+        quant_(GET_TEMPLATE_PARAM(1)), tx_size_(GET_TEMPLATE_PARAM(2)),
+        type_(GET_TEMPLATE_PARAM(3)), bd_(GET_TEMPLATE_PARAM(4)) {}
+
+  ~QuantizeTestBase() override = default;
+
+  // Allocates the 32-byte aligned quantizer table and coefficient buffer, then
+  // builds the quantizer tables for the configured bit depth.
+  void SetUp() override {
+    qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
+    ASSERT_NE(qtab_, nullptr);
+    const int n_coeffs = coeff_num();
+    coeff_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, 6 * n_coeffs * sizeof(CoeffType)));
+    ASSERT_NE(coeff_, nullptr);
+    InitQuantizer();
+  }
+
+  void TearDown() override {
+    aom_free(qtab_);
+    qtab_ = nullptr;
+    aom_free(coeff_);
+    coeff_ = nullptr;
+  }
+
+  void InitQuantizer() {
+    av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
+  }
+
+  // Implemented by the concrete fixtures: calls quant_ref_ with the *_ref_
+  // outputs and quant_ with the other outputs, using the same inputs.
+  virtual void RunQuantizeFunc(
+      const CoeffType *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+      const int16_t *round_ptr, const int16_t *quant_ptr,
+      const int16_t *quant_shift_ptr, CoeffType *qcoeff_ptr,
+      CoeffType *qcoeff_ref_ptr, CoeffType *dqcoeff_ptr,
+      CoeffType *dqcoeff_ref_ptr, const int16_t *dequant_ptr,
+      uint16_t *eob_ref_ptr, uint16_t *eob_ptr, const int16_t *scan,
+      const int16_t *iscan) = 0;
+
+  // Runs both quantizers `test_num` times at quantizer index `q` and asserts
+  // that qcoeff, dqcoeff and eob agree. When `is_loop` is true the input is
+  // re-randomized before every iteration; otherwise the coefficients already
+  // stored in coeff_ are used.
+  void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
+    CoeffType *coeff_ptr = coeff_;
+    const intptr_t n_coeffs = coeff_num();
+
+    CoeffType *qcoeff_ref = coeff_ptr + n_coeffs;
+    CoeffType *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+    CoeffType *qcoeff = dqcoeff_ref + n_coeffs;
+    CoeffType *dqcoeff = qcoeff + n_coeffs;
+    // eob[0] receives the reference result, eob[1] the tested result.
+    uint16_t *eob = reinterpret_cast<uint16_t *>(dqcoeff + n_coeffs);
+
+    // Testing uses 2-D DCT scan order table
+    const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+    // Testing uses luminance quantization table
+    const int16_t *zbin = qtab_->quant.y_zbin[q];
+
+    // round / quant stay null for any type other than TYPE_B and TYPE_FP.
+    const int16_t *round = nullptr;
+    const int16_t *quant = nullptr;
+    if (type_ == TYPE_B) {
+      round = qtab_->quant.y_round[q];
+      quant = qtab_->quant.y_quant[q];
+    } else if (type_ == TYPE_FP) {
+      round = qtab_->quant.y_round_fp[q];
+      quant = qtab_->quant.y_quant_fp[q];
+    }
+
+    const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
+    const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+
+    for (int i = 0; i < test_num; ++i) {
+      if (is_loop) FillCoeffRandom();
+
+      // Clears the five regions that follow the input coefficients.
+      memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
+
+      RunQuantizeFunc(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
+                      qcoeff, qcoeff_ref, dqcoeff, dqcoeff_ref, dequant,
+                      &eob[0], &eob[1], sc->scan, sc->iscan);
+
+      for (int j = 0; j < n_coeffs; ++j) {
+        ASSERT_EQ(qcoeff_ref[j], qcoeff[j])
+            << "Q mismatch on test: " << i << " at position: " << j
+            << " Q: " << q << " coeff: " << coeff_ptr[j];
+      }
+
+      for (int j = 0; j < n_coeffs; ++j) {
+        ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j])
+            << "Dq mismatch on test: " << i << " at position: " << j
+            << " Q: " << q << " coeff: " << coeff_ptr[j];
+      }
+
+      ASSERT_EQ(eob[0], eob[1])
+          << "eobs mismatch on test: " << i << " Q: " << q;
+    }
+  }
+
+  // Asserts element-wise equality of two buffers of `size` elements; `text`,
+  // `q` and `number` only appear in the failure message.
+  void CompareResults(const CoeffType *buf_ref, const CoeffType *buf, int size,
+                      const char *text, int q, int number) {
+    for (int i = 0; i < size; ++i) {
+      ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number
+                                    << " at position: " << i << " Q: " << q;
+    }
+  }
+
+  // Number of coefficients processed for the configured transform size.
+  int coeff_num() const { return av1_get_max_eob(tx_size_); }
+
+  // Sets every input coefficient to `c`.
+  void FillCoeff(CoeffType c) {
+    const int n_coeffs = coeff_num();
+    for (int i = 0; i < n_coeffs; ++i) {
+      coeff_[i] = c;
+    }
+  }
+
+  // Zeroes the input, then writes random coefficients into a run of random
+  // length that starts at a random position and is clipped to the buffer end.
+  void FillCoeffRandom() {
+    const int n_coeffs = coeff_num();
+    FillCoeffZero();
+    const int num = rnd_.Rand16() % n_coeffs;
+    // Randomize the first non zero coeff position.
+    const int start = rnd_.Rand16() % n_coeffs;
+    const int end = std::min(start + num, n_coeffs);
+    for (int i = start; i < end; ++i) {
+      coeff_[i] = GetRandomCoeff();
+    }
+  }
+
+  // Zeroes the input, then randomizes the first `num` coefficients. The
+  // caller must keep `num` within coeff_num().
+  void FillCoeffRandomRows(int num) {
+    FillCoeffZero();
+    for (int i = 0; i < num; ++i) {
+      coeff_[i] = GetRandomCoeff();
+    }
+  }
+
+  void FillCoeffZero() { FillCoeff(0); }
+
+  // Sets every input coefficient to the same random value.
+  void FillCoeffConstant() {
+    CoeffType c = GetRandomCoeff();
+    FillCoeff(c);
+  }
+
+  // Zeroes the input except for a random DC (index 0) coefficient.
+  void FillDcOnly() {
+    FillCoeffZero();
+    coeff_[0] = GetRandomCoeff();
+  }
+
+  void FillDcLargeNegative() {
+    FillCoeffZero();
+    // Generate a qcoeff which contains 512/-512 (0x0200/0xFE00) to catch issues
+    // like BUG=883 where the constant being compared was incorrectly
+    // initialized.
+    coeff_[0] = -8191;
+  }
+
+  // Returns a random coefficient: for 8-bit depth a 16-bit value clamped to
+  // [INT16_MIN + 1, INT16_MAX]; otherwise a value clamped to
+  // [-(1 << (7 + bd_)), (1 << (7 + bd_)) - 1].
+  CoeffType GetRandomCoeff() {
+    CoeffType coeff;
+    if (bd_ == AOM_BITS_8) {
+      coeff =
+          clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX);
+    } else {
+      CoeffType min = -(1 << (7 + bd_));
+      CoeffType max = -min - 1;
+      coeff = clamp(static_cast<CoeffType>(rnd_.Rand31()), min, max);
+    }
+    return coeff;
+  }
+
+  ACMRandom rnd_;
+  QuanTable *qtab_;     // quantizer / dequantizer tables, owned
+  CoeffType *coeff_;    // 6 * coeff_num() element work buffer, owned
+  FuncType quant_ref_;  // reference implementation
+  FuncType quant_;      // implementation under test
+  TX_SIZE tx_size_;
+  QuantType type_;
+  aom_bit_depth_t bd_;
+};
+
+// Fixture for quantizers that operate on tran_low_t coefficients and take the
+// full QuantizeFunc argument list (including zbin and quant_shift).
+class FullPrecisionQuantizeTest
+    : public QuantizeTestBase<tran_low_t, QuantizeFunc> {
+ private:
+  // Runs the reference function into the *_ref_ outputs, then the tested
+  // function (under the register state check) into the remaining outputs.
+  void RunQuantizeFunc(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *qcoeff_ref_ptr,
+                       tran_low_t *dqcoeff_ptr, tran_low_t *dqcoeff_ref_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ref_ptr,
+                       uint16_t *eob_ptr, const int16_t *scan,
+                       const int16_t *iscan) override {
+    // Reference implementation.
+    quant_ref_(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+               quant_shift_ptr, qcoeff_ref_ptr, dqcoeff_ref_ptr, dequant_ptr,
+               eob_ref_ptr, scan, iscan);
+
+    // Implementation under test.
+    API_REGISTER_STATE_CHECK(
+        quant_(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+               quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+               scan, iscan));
+  }
+};
+
+// Fixture for the low-precision quantizers, which operate on int16_t
+// coefficients and do not take zbin or quant_shift arguments.
+class LowPrecisionQuantizeTest
+    : public QuantizeTestBase<int16_t, LPQuantizeFunc> {
+ private:
+  // Runs the reference function into the *_ref_ outputs, then the tested
+  // function (under the register state check) into the remaining outputs.
+  // The zbin and quant_shift tables supplied by the base class are ignored.
+  void RunQuantizeFunc(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t * /*zbin_ptr*/, const int16_t *round_ptr,
+                       const int16_t *quant_ptr,
+                       const int16_t * /*quant_shift_ptr*/, int16_t *qcoeff_ptr,
+                       int16_t *qcoeff_ref_ptr, int16_t *dqcoeff_ptr,
+                       int16_t *dqcoeff_ref_ptr, const int16_t *dequant_ptr,
+                       uint16_t *eob_ref_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) override {
+    // Reference implementation.
+    quant_ref_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ref_ptr,
+               dqcoeff_ref_ptr, dequant_ptr, eob_ref_ptr, scan, iscan);
+
+    // Implementation under test.
+    API_REGISTER_STATE_CHECK(
+        quant_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr,
+               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan));
+  }
+};
+
+TEST_P(FullPrecisionQuantizeTest, ZeroInput) {
+ FillCoeffZero();
+ QuantizeRun(false);
+}
+
+TEST_P(FullPrecisionQuantizeTest, LargeNegativeInput) {
+ FillDcLargeNegative();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(FullPrecisionQuantizeTest, DcOnlyInput) {
+ FillDcOnly();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(FullPrecisionQuantizeTest, RandomInput) {
+ QuantizeRun(true, 0, kTestNum);
+}
+
+TEST_P(FullPrecisionQuantizeTest, MultipleQ) {
+ for (int q = 0; q < QINDEX_RANGE; ++q) {
+ QuantizeRun(true, q, kTestNum);
+ }
+}
+
+// Force the coeff to be half the value of the dequant. This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(FullPrecisionQuantizeTest, CoeffHalfDequant) {
+ FillCoeff(16);
+ QuantizeRun(false, 25, 1);
+}
+
+// Benchmark, disabled by default. For an increasing number of populated
+// coefficient rows it times quant_ref_ and quant_ over kNumTests calls each
+// and prints both times with their ratio. Nothing is asserted.
+TEST_P(FullPrecisionQuantizeTest, DISABLED_Speed) {
+  tran_low_t *const coeff_ptr = coeff_;
+  const intptr_t n_coeffs = coeff_num();
+
+  // Outputs use the 4th, 5th and 6th n_coeffs-sized regions of coeff_; both
+  // functions write to the same output regions here.
+  tran_low_t *const qcoeff = coeff_ptr + 3 * n_coeffs;
+  tran_low_t *const dqcoeff = qcoeff + n_coeffs;
+  uint16_t *const eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+  // Testing uses 2-D DCT scan order table
+  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+  // Testing uses luminance quantization table
+  const int q = 22;
+  const int16_t *zbin = qtab_->quant.y_zbin[q];
+  const int16_t *round_fp = qtab_->quant.y_round_fp[q];
+  const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
+  const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
+  const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+
+  const int kNumTests = 5000000;
+  // Both dimensions are capped at 32 before sizing the populated area.
+  const int rows = AOMMIN(32, tx_size_high[tx_size_]);
+  const int cols = AOMMIN(32, tx_size_wide[tx_size_]);
+
+  for (int cnt = 0; cnt <= rows; cnt++) {
+    FillCoeffRandomRows(cnt * cols);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+                 qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&ref_timer);
+
+    aom_usec_timer test_timer;
+    aom_usec_timer_start(&test_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+             dqcoeff, dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&test_timer);
+
+    const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    const int test_usec =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    printf("c_time = %d \t simd_time = %d \t Gain = %f \n", ref_usec,
+           test_usec, ((float)ref_usec / test_usec));
+  }
+}
+
+// TODO(crbug.com/aomedia/2796)
+TEST_P(LowPrecisionQuantizeTest, ZeroInput) {
+ FillCoeffZero();
+ QuantizeRun(false);
+}
+
+TEST_P(LowPrecisionQuantizeTest, LargeNegativeInput) {
+ FillDcLargeNegative();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(LowPrecisionQuantizeTest, DcOnlyInput) {
+ FillDcOnly();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(LowPrecisionQuantizeTest, RandomInput) {
+ QuantizeRun(true, 0, kTestNum);
+}
+
+TEST_P(LowPrecisionQuantizeTest, MultipleQ) {
+ for (int q = 0; q < QINDEX_RANGE; ++q) {
+ QuantizeRun(true, q, kTestNum);
+ }
+}
+
+// Force the coeff to be half the value of the dequant. This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(LowPrecisionQuantizeTest, CoeffHalfDequant) {
+ FillCoeff(16);
+ QuantizeRun(false, 25, 1);
+}
+
+// Benchmark, disabled by default. For an increasing number of populated
+// coefficient rows it times quant_ref_ and quant_ over kNumTests calls each
+// and prints both times with their ratio. Nothing is asserted.
+TEST_P(LowPrecisionQuantizeTest, DISABLED_Speed) {
+  int16_t *const coeff_ptr = coeff_;
+  const intptr_t n_coeffs = coeff_num();
+
+  // Outputs use the 4th, 5th and 6th n_coeffs-sized regions of coeff_; both
+  // functions write to the same output regions here.
+  int16_t *const qcoeff = coeff_ptr + 3 * n_coeffs;
+  int16_t *const dqcoeff = qcoeff + n_coeffs;
+  uint16_t *const eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+  // Testing uses 2-D DCT scan order table
+  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+  // Testing uses luminance quantization table
+  const int q = 22;
+  const int16_t *round_fp = qtab_->quant.y_round_fp[q];
+  const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
+  const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+
+  const int kNumTests = 5000000;
+  // Both dimensions are capped at 32 before sizing the populated area.
+  const int rows = AOMMIN(32, tx_size_high[tx_size_]);
+  const int cols = AOMMIN(32, tx_size_wide[tx_size_]);
+
+  for (int cnt = 0; cnt <= rows; cnt++) {
+    FillCoeffRandomRows(cnt * cols);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_ref_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff,
+                 dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&ref_timer);
+
+    aom_usec_timer test_timer;
+    aom_usec_timer_start(&test_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff, dequant,
+             eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&test_timer);
+
+    const int ref_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+    const int test_usec =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    printf("c_time = %d \t simd_time = %d \t Gain = %f \n", ref_usec,
+           test_usec, ((float)ref_usec / test_usec));
+  }
+}
+
+using std::make_tuple;
+
+#if HAVE_AVX2
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArrayAvx2[] = {
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowPrecisionQuantizeTest,
+ ::testing::ValuesIn(kLPQParamArrayAvx2));
+
+const QuantizeParam<QuantizeFunc> kQParamArrayAvx2[] = {
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+ static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+ static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+ static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2,
+ static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+ static_cast<TX_SIZE>(TX_16X64), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2,
+ static_cast<TX_SIZE>(TX_64X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_avx2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2,
+ static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_avx2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeTest,
+ ::testing::ValuesIn(kQParamArrayAvx2));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE2
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArraySSE2[] = { // av1_quantize_lp_c paired with av1_quantize_lp_sse2 at 16x16, 8x8 and 4x4 (TYPE_FP, AOM_BITS_8)
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+ static_cast<TX_SIZE>(TX_8X8), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+ static_cast<TX_SIZE>(TX_4X4), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowPrecisionQuantizeTest,
+ ::testing::ValuesIn(kLPQParamArraySSE2)); // one parameterized instance per array entry
+
+const QuantizeParam<QuantizeFunc> kQParamArraySSE2[] = { // SSE2 function pairs; consumed by the SSE2 FullPrecisionQuantizeTest instantiation after this array
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+ static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+ static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+ static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2,
+ static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, // NOTE(review): these non-adaptive 32x32 entries sit inside the !CONFIG_REALTIME_ONLY guard, unlike the non-adaptive 16x16 and 64x64 ones - confirm this is intended
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+ &aom_highbd_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+ &aom_highbd_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+ &aom_highbd_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_16X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_adaptive_c,
+ &aom_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_32X64), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_adaptive_c,
+ &aom_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_64X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_adaptive_c,
+ &aom_quantize_b_64x64_adaptive_sse2,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeTest,
+ ::testing::ValuesIn(kQParamArraySSE2));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+
+const QuantizeParam<LPQuantizeFunc> kLPQParamArrayNEON[] = { // av1_quantize_lp_c paired with av1_quantize_lp_neon at 16x16, 32x32 and 64x64 (TYPE_FP, AOM_BITS_8)
+ make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(av1_quantize_lp_c, av1_quantize_lp_neon,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, LowPrecisionQuantizeTest,
+ ::testing::ValuesIn(kLPQParamArrayNEON)); // one parameterized instance per array entry
+
+const QuantizeParam<QuantizeFunc> kQParamArrayNEON[] = { // NEON function pairs; consumed by the NEON FullPrecisionQuantizeTest instantiation after this array
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+ static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+ static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+ static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon,
+ static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_neon,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_neon,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_neon,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_neon,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_neon,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8),
+
+#if CONFIG_AV1_HIGHBITDEPTH // every high-bitdepth NEON entry below is registered with AOM_BITS_12 only
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_neon>,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_neon>,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_neon>,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_neon,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_neon,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_neon,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(&aom_highbd_quantize_b_adaptive_c,
+ &aom_highbd_quantize_b_adaptive_neon,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c,
+ &aom_highbd_quantize_b_32x32_adaptive_neon,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c,
+ &aom_highbd_quantize_b_64x64_adaptive_neon,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, FullPrecisionQuantizeTest,
+ ::testing::ValuesIn(kQParamArrayNEON));
+#endif // HAVE_NEON
+
+#if HAVE_SSSE3 && AOM_ARCH_X86_64
+INSTANTIATE_TEST_SUITE_P( // SSSE3: aom_quantize_b at 16x16, 32x32 and 64x64, TYPE_B, AOM_BITS_8 only
+ SSSE3, FullPrecisionQuantizeTest,
+ ::testing::Values(
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_ssse3,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_ssse3,
+ static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8)));
+
+#endif // HAVE_SSSE3 && AOM_ARCH_X86_64
+
+#if HAVE_AVX
+INSTANTIATE_TEST_SUITE_P( // AVX: aom_quantize_b at 16x16 and 32x32 only (no 64x64 entry), TYPE_B, AOM_BITS_8
+ AVX, FullPrecisionQuantizeTest,
+ ::testing::Values(
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx,
+ static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx,
+ static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8)));
+
+#endif // HAVE_AVX
+
+} // namespace
diff --git a/third_party/aom/test/ratectrl_rtc_test.cc b/third_party/aom/test/ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..cc054b6926
--- /dev/null
+++ b/third_party/aom/test/ratectrl_rtc_test.cc
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/ratectrl_rtc.h"
+
+#include <memory>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/i420_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+constexpr size_t kNumFrames = 450; // compared with video->frame() to set encoder_exit_; also passed to the 640x480 I420VideoSource constructors
+
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 }; // temporal_layer_id indexed by superframe_cnt_ % 4 when ts_number_layers == 3
+const int kTemporalId2Layer[2] = { 0, 1 }; // temporal_layer_id indexed by superframe_cnt_ % 2 when ts_number_layers == 2
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; // percent of the spatial-layer bitrate used as each temporal layer's target (3 temporal layers)
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; // same, for 2 temporal layers
+const int kSpatialLayerBitrate[3] = { 200, 500, 900 }; // per-spatial-layer bitrate, indexed by spatial layer; summed into the total target bitrate
+
+// Parameter: aq mode: 0 and 3
+class RcInterfaceTest : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWithParam<int> {
+ public:
+ RcInterfaceTest()
+ : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+ encoder_exit_(false), layer_frame_cnt_(0), superframe_cnt_(0),
+ frame_cnt_(0), dynamic_temporal_layers_(false),
+ dynamic_spatial_layers_(false), num_drops_(0), max_consec_drop_(0),
+ frame_drop_thresh_(0) {
+ memset(&svc_params_, 0, sizeof(svc_params_));
+ memset(&layer_id_, 0, sizeof(layer_id_));
+ }
+
+ ~RcInterfaceTest() override = default;
+
+ protected:
+ void SetUp() override { InitializeConfig(::libaom_test::kRealTime); }
+
+ int GetNumSpatialLayers() override { return rc_cfg_.ss_number_layers; }
+
+ void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) override {
+ int key_int = key_interval_;
+ const int use_svc =
+ rc_cfg_.ss_number_layers > 1 || rc_cfg_.ts_number_layers > 1;
+ encoder->Control(AV1E_SET_RTC_EXTERNAL_RC, 1); // issued on every call, not only before the first frame
+ if (video->frame() == 0 && layer_frame_cnt_ == 0) { // one-time encoder setup before the very first layer frame
+ encoder->Control(AOME_SET_CPUUSED, 7);
+ encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+ if (rc_cfg_.is_screen) {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ } else {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ }
+ encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT,
+ rc_cfg_.max_intra_bitrate_pct);
+ if (use_svc) encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ encoder->Control(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, max_consec_drop_);
+ }
+ // SVC specific settings
+ if (use_svc) {
+ frame_params_.spatial_layer_id =
+ layer_frame_cnt_ % rc_cfg_.ss_number_layers;
+ if (rc_cfg_.ts_number_layers == 3)
+ frame_params_.temporal_layer_id =
+ kTemporalId3Layer[superframe_cnt_ % 4];
+ else if (rc_cfg_.ts_number_layers == 2)
+ frame_params_.temporal_layer_id =
+ kTemporalId2Layer[superframe_cnt_ % 2];
+ else
+ frame_params_.temporal_layer_id = 0;
+ layer_id_.spatial_layer_id = frame_params_.spatial_layer_id;
+ layer_id_.temporal_layer_id = frame_params_.temporal_layer_id;
+ encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+ key_int = key_interval_ * rc_cfg_.ss_number_layers; // layer_frame_cnt_ advances once per spatial layer frame, so scale the interval accordingly
+ }
+ frame_params_.frame_type =
+ layer_frame_cnt_ % key_int == 0 ? aom::kKeyFrame : aom::kInterFrame;
+ encoder_exit_ = video->frame() == kNumFrames; // makes PostEncodeFrameHook return early
+ frame_flags_ = 0; // reset on every call; only the 3-spatial-layer restore below sets AOM_EFLAG_FORCE_KF
+
+ if (dynamic_temporal_layers_) {
+ if (superframe_cnt_ == 100 && layer_id_.spatial_layer_id == 0) {
+ // Go down to 2 temporal layers.
+ SetConfigSvc(3, 2);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ } else if (superframe_cnt_ == 200 && layer_id_.spatial_layer_id == 0) {
+ // Go down to 1 temporal layer.
+ SetConfigSvc(3, 1);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ } else if (superframe_cnt_ == 300 && layer_id_.spatial_layer_id == 0) {
+ // Go back up to 3 temporal layers.
+ SetConfigSvc(3, 3);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ }
+ } else if (dynamic_spatial_layers_) {
+ // In this example the #spatial layers is modified on the fly,
+ // so we go from (120p,240p,480p) to (240p,480p), etc.
+ if (superframe_cnt_ == 100 && layer_id_.spatial_layer_id == 0) {
+ // Change to 2 spatial layers (240p, 480p).
+ SetConfigSvc(2, 3);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ } else if (superframe_cnt_ == 200 && layer_id_.spatial_layer_id == 0) {
+ // Change to 1 spatial layer (480p).
+ SetConfigSvc(1, 3);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ } else if (superframe_cnt_ == 300 && layer_id_.spatial_layer_id == 0) {
+ // Go back to 3 spatial layers (120p, 240p, 480p).
+ SetConfigSvc(3, 3);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ // In the fixed SVC mode (which is what is used in this test):
+ // Key frame is required here on SL0 since 120p will try to predict
+ // from LAST which was the 480p, so decoder will throw an error
+ // (reference must be smaller than 4x4). In the flexible mode
+ // (not used here) we can set the frame flags to predict off the 2x2
+ // reference instead.
+ frame_flags_ = AOM_EFLAG_FORCE_KF;
+ frame_params_.frame_type = aom::kKeyFrame;
+ ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+ }
+ }
+ // TODO(marpan): Add dynamic spatial layers based on 0 layer bitrate.
+ // That is actual usage in SW where configuration (#spatial, #temporal)
+ // layers is fixed, but top layer is dropped or re-enabled based on
+ // bitrate. This requires external RC to handle dropped (zero-size) frames.
+ }
+
+ void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
+ if (encoder_exit_) { // set in PreEncodeFrameHook once video->frame() == kNumFrames
+ return;
+ }
+ layer_frame_cnt_++;
+ frame_cnt_++;
+ if (layer_id_.spatial_layer_id == rc_cfg_.ss_number_layers - 1)
+ superframe_cnt_++; // advances once per top-spatial-layer frame
+ int qp;
+ encoder->Control(AOME_GET_LAST_QUANTIZER, &qp);
+ if (rc_api_->ComputeQP(frame_params_) == aom::FrameDropDecision::kOk) { // compare external RC against the encoder only when the RC does not drop the frame
+ ASSERT_EQ(rc_api_->GetQP(), qp) << "at frame " << frame_cnt_ - 1; // frame_cnt_ was already incremented above
+ int encoder_lpf_level;
+ encoder->Control(AOME_GET_LOOPFILTER_LEVEL, &encoder_lpf_level);
+ aom::AV1LoopfilterLevel loopfilter_level = rc_api_->GetLoopfilterLevel();
+ ASSERT_EQ(loopfilter_level.filter_level[0], encoder_lpf_level); // only filter_level[0] is checked
+ aom::AV1CdefInfo cdef_level = rc_api_->GetCdefInfo();
+ int cdef_y_strengths[16];
+ encoder->Control(AV1E_GET_LUMA_CDEF_STRENGTH, cdef_y_strengths);
+ ASSERT_EQ(cdef_level.cdef_strength_y, cdef_y_strengths[0]); // only the first luma CDEF strength is checked
+ } else {
+ num_drops_++; // RunOneLayerDropFramesCBR asserts this ends up >= 1
+ }
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override { // feeds the encoded frame size back into the external rate controller
+ if (layer_id_.spatial_layer_id == 0)
+ rc_api_->PostEncodeUpdate(pkt->data.frame.sz - 2); // NOTE(review): the 2 bytes removed on spatial layer 0 are not explained here - presumably a temporal delimiter OBU; confirm
+ else
+ rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+ }
+
+ void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
+ (void)img1;
+ (void)img2;
+ }
+
+ void RunOneLayer() {
+ key_interval_ = 10000; // exceeds kNumFrames (450); PreEncodeFrameHook marks a key frame only when layer_frame_cnt_ % key_int == 0
+ SetConfig(); // must follow the key_interval_ assignment: SetConfig() copies it into cfg_.kf_min_dist / kf_max_dist
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunOneLayerScreen() {
+ key_interval_ = 10000;
+ SetConfig();
+ rc_cfg_.is_screen = true; // PreEncodeFrameHook then selects AOM_CONTENT_SCREEN
+ rc_cfg_.width = 352; // overrides SetConfig()'s 640x480 on the external RC side to match the 352x288 clip
+ rc_cfg_.height = 288; // NOTE(review): cfg_.g_w / cfg_.g_h keep SetConfig()'s 640x480 - presumably the test driver adapts to the source size; confirm
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 140);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunOneLayerDropFramesCBR() {
+ key_interval_ = 10000;
+ max_consec_drop_ = 8; // set before SetConfig(), which copies it into rc_cfg_.max_consec_drop
+ frame_drop_thresh_ = 30; // set before SetConfig(), which copies it into rc_cfg_ and cfg_.rc_dropframe_thresh
+ SetConfig();
+ rc_cfg_.target_bandwidth = 100; // overrides the 1000 from SetConfig(); rc_cfg_.layer_target_bitrate[0] keeps 1000
+ cfg_.rc_target_bitrate = 100;
+ rc_cfg_.max_quantizer = 50; // overrides the 52 from SetConfig(); rc_cfg_.max_quantizers[0] keeps 52
+ cfg_.rc_max_quantizer = 50;
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Check that some frames were dropped, otherwise test has no value.
+ ASSERT_GE(num_drops_, 1);
+ }
+
+ void RunOneLayerPeriodicKey() {
+ key_interval_ = 100; // with one layer, PreEncodeFrameHook marks every 100th frame as a key frame
+ SetConfig(); // also copies key_interval_ into cfg_.kf_min_dist / kf_max_dist
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvc() {
+ key_interval_ = 10000;
+ SetConfigSvc(3, 3); // 3 spatial layers x 3 temporal layers
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0; // initial values; PreEncodeFrameHook recomputes both ids for every layer frame
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcPeriodicKey() {
+ key_interval_ = 100; // PreEncodeFrameHook multiplies this by ss_number_layers when deciding key frames
+ SetConfigSvc(3, 3); // 3 spatial layers x 3 temporal layers
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcDynamicTemporal() {
+ dynamic_temporal_layers_ = true; // PreEncodeFrameHook then changes the temporal layer count at superframes 100, 200 and 300
+ key_interval_ = 10000;
+ SetConfigSvc(3, 3); // start with 3 spatial x 3 temporal layers
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcDynamicSpatial() {
+ dynamic_spatial_layers_ = true; // PreEncodeFrameHook then changes the spatial layer count at superframes 100, 200 and 300
+ key_interval_ = 10000;
+ SetConfigSvc(3, 3); // start with 3 spatial x 3 temporal layers
+ rc_api_ = aom::AV1RateControlRTC::Create(rc_cfg_);
+ frame_params_.spatial_layer_id = 0;
+ frame_params_.temporal_layer_id = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ private:
+ void SetConfig() { // single-layer setup: fills rc_cfg_ (external RC) and cfg_ (encoder) with matching values
+ rc_cfg_.width = 640;
+ rc_cfg_.height = 480;
+ rc_cfg_.max_quantizer = 52;
+ rc_cfg_.min_quantizer = 2;
+ rc_cfg_.target_bandwidth = 1000;
+ rc_cfg_.buf_initial_sz = 600;
+ rc_cfg_.buf_optimal_sz = 600;
+ rc_cfg_.buf_sz = 1000;
+ rc_cfg_.undershoot_pct = 50;
+ rc_cfg_.overshoot_pct = 50;
+ rc_cfg_.max_intra_bitrate_pct = 1000;
+ rc_cfg_.framerate = 30.0;
+ rc_cfg_.ss_number_layers = 1;
+ rc_cfg_.ts_number_layers = 1;
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 1;
+ rc_cfg_.layer_target_bitrate[0] = 1000;
+ rc_cfg_.max_quantizers[0] = 52;
+ rc_cfg_.min_quantizers[0] = 2;
+ rc_cfg_.aq_mode = aq_mode_;
+ rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+ rc_cfg_.max_consec_drop = max_consec_drop_;
+
+ // Encoder settings for ground truth.
+ cfg_.g_w = 640;
+ cfg_.g_h = 480;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_initial_sz = 600;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0; // redundant: overwritten with frame_drop_thresh_ by the last statement of this function
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 52;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_target_bitrate = 1000;
+ cfg_.kf_min_dist = key_interval_; // callers must set key_interval_ before calling SetConfig()
+ cfg_.kf_max_dist = key_interval_;
+ cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+ }
+
+ void SetConfigSvc(int number_spatial_layers, int number_temporal_layers) { // fills rc_cfg_, cfg_ and svc_params_ for the requested layer counts (1..3 each are handled below)
+ rc_cfg_.width = 640;
+ rc_cfg_.height = 480;
+ rc_cfg_.max_quantizer = 56;
+ rc_cfg_.min_quantizer = 2;
+ rc_cfg_.buf_initial_sz = 600;
+ rc_cfg_.buf_optimal_sz = 600;
+ rc_cfg_.buf_sz = 1000;
+ rc_cfg_.undershoot_pct = 50;
+ rc_cfg_.overshoot_pct = 50;
+ rc_cfg_.max_intra_bitrate_pct = 1000;
+ rc_cfg_.framerate = 30.0;
+ rc_cfg_.aq_mode = aq_mode_;
+ rc_cfg_.ss_number_layers = number_spatial_layers;
+ rc_cfg_.ts_number_layers = number_temporal_layers;
+
+ // Encoder settings for ground truth.
+ cfg_.g_w = 640;
+ cfg_.g_h = 480;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_buf_initial_sz = 600;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.g_threads = 1;
+ cfg_.kf_min_dist = key_interval_;
+ cfg_.kf_max_dist = key_interval_;
+ cfg_.g_timebase.num = 1;
+ cfg_.g_timebase.den = 30;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ svc_params_.number_spatial_layers = number_spatial_layers;
+ svc_params_.number_temporal_layers = number_temporal_layers;
+
+ // Scale factors.
+ if (number_spatial_layers == 3) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 4;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 4;
+ rc_cfg_.scaling_factor_num[2] = 4;
+ rc_cfg_.scaling_factor_den[2] = 4;
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 4;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 4;
+ svc_params_.scaling_factor_num[2] = 4;
+ svc_params_.scaling_factor_den[2] = 4;
+ } else if (number_spatial_layers == 2) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 2;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 2;
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 2;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 2;
+ } else if (number_spatial_layers == 1) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 1;
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 1;
+ }
+
+ // TS rate decimator.
+ if (number_temporal_layers == 3) {
+ rc_cfg_.ts_rate_decimator[0] = 4;
+ rc_cfg_.ts_rate_decimator[1] = 2;
+ rc_cfg_.ts_rate_decimator[2] = 1;
+ svc_params_.framerate_factor[0] = 4;
+ svc_params_.framerate_factor[1] = 2;
+ svc_params_.framerate_factor[2] = 1;
+ } else if (number_temporal_layers == 2) {
+ rc_cfg_.ts_rate_decimator[0] = 2;
+ rc_cfg_.ts_rate_decimator[1] = 1;
+ svc_params_.framerate_factor[0] = 2;
+ svc_params_.framerate_factor[1] = 1;
+ } else if (number_temporal_layers == 1) {
+ rc_cfg_.ts_rate_decimator[0] = 1;
+ svc_params_.framerate_factor[0] = 1;
+ }
+
+ // Bitrate.
+ rc_cfg_.target_bandwidth = 0;
+ cfg_.rc_target_bitrate = 0;
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ int spatial_bitrate = 0;
+ if (number_spatial_layers <= 3) // kSpatialLayerBitrate has 3 entries; with more layers spatial_bitrate stays 0
+ spatial_bitrate = kSpatialLayerBitrate[sl];
+ for (int tl = 0; tl < number_temporal_layers; tl++) {
+ int layer = sl * number_temporal_layers + tl; // flattened layer index: spatial-major, temporal-minor
+ if (number_temporal_layers == 3) {
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ svc_params_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ } else if (number_temporal_layers == 2) {
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ svc_params_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ } else if (number_temporal_layers == 1) {
+ rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+ svc_params_.layer_target_bitrate[layer] = spatial_bitrate;
+ }
+ }
+ rc_cfg_.target_bandwidth += spatial_bitrate; // total target = sum of the per-spatial-layer bitrates
+ cfg_.rc_target_bitrate += spatial_bitrate;
+ }
+
+ // Layer min/max quantizer.
+ for (int sl = 0; sl < number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < number_temporal_layers; ++tl) {
+ const int i = sl * number_temporal_layers + tl;
+ rc_cfg_.max_quantizers[i] = rc_cfg_.max_quantizer;
+ rc_cfg_.min_quantizers[i] = rc_cfg_.min_quantizer;
+ svc_params_.max_quantizers[i] = cfg_.rc_max_quantizer;
+ svc_params_.min_quantizers[i] = cfg_.rc_min_quantizer;
+ }
+ }
+ }
+
+ std::unique_ptr<aom::AV1RateControlRTC> rc_api_;
+ aom::AV1RateControlRtcConfig rc_cfg_;
+ int aq_mode_;
+ int key_interval_;
+ aom::AV1FrameParamsRTC frame_params_;
+ bool encoder_exit_;
+ aom_svc_params_t svc_params_;
+ aom_svc_layer_id_t layer_id_;
+ int layer_frame_cnt_;
+ int superframe_cnt_;
+ int frame_cnt_;
+ bool dynamic_temporal_layers_;
+ bool dynamic_spatial_layers_;
+ int num_drops_;
+ int max_consec_drop_;
+ int frame_drop_thresh_;
+};
+
+TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); }
+
+TEST_P(RcInterfaceTest, OneLayerPeriodicKey) { RunOneLayerPeriodicKey(); }
+
+TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
+TEST_P(RcInterfaceTest, Svc) { RunSvc(); }
+
+TEST_P(RcInterfaceTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
+TEST_P(RcInterfaceTest, SvcDynamicTemporal) { RunSvcDynamicTemporal(); }
+
+TEST_P(RcInterfaceTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
+AV1_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3));
+
+} // namespace
diff --git a/third_party/aom/test/ratectrl_test.cc b/third_party/aom/test/ratectrl_test.cc
new file mode 100644
index 0000000000..d951b1197f
--- /dev/null
+++ b/third_party/aom/test/ratectrl_test.cc
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RatectrlTest, QModeGetQIndexTest) { // checks av1_q_mode_get_q_index() for three (update type, pyramid level) combinations
+ int base_q_index = 36;
+ int gf_update_type = INTNL_ARF_UPDATE;
+ int gf_pyramid_level = 1;
+ int arf_q = 100;
+ int q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+ gf_pyramid_level, arf_q);
+ EXPECT_EQ(q_index, arf_q); // INTNL_ARF_UPDATE at pyramid level 1: expected to return arf_q unchanged
+
+ gf_update_type = INTNL_ARF_UPDATE; // same update type as above; only the pyramid level changes for this case
+ gf_pyramid_level = 3;
+ q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+ gf_pyramid_level, arf_q);
+ EXPECT_LT(q_index, arf_q); // at pyramid level 3 the result must be strictly below arf_q
+
+ gf_update_type = LF_UPDATE; // gf_pyramid_level is still 3 here
+ q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+ gf_pyramid_level, arf_q);
+ EXPECT_EQ(q_index, base_q_index); // LF_UPDATE: expected to return base_q_index
+}
+} // namespace
diff --git a/third_party/aom/test/rd_test.cc b/third_party/aom/test/rd_test.cc
new file mode 100644
index 0000000000..0c481fcbb6
--- /dev/null
+++ b/third_party/aom/test/rd_test.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <vector>
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/rd.h"
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RdTest, GetDeltaqOffsetValueTest1) {
+ aom_bit_depth_t bit_depth = AOM_BITS_8;
+ double beta = 4; // sqrt(beta) == 2, so the reference DC step below halves from 32 to 16
+ int q_index = 29;
+ int dc_q_step =
+ av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+ EXPECT_EQ(dc_q_step, 32); // pins the starting DC step for q_index 29 at 8-bit
+
+ int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+ EXPECT_EQ(ref_new_dc_q_step, 16);
+
+ int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+ int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+ static_cast<aom_bit_depth_t>(bit_depth));
+
+ EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); // applying the returned offset must yield exactly the reference step
+}
+
+TEST(RdTest, GetDeltaqOffsetValueTest2) {
+ aom_bit_depth_t bit_depth = AOM_BITS_8;
+ double beta = 1.0 / 4.0; // sqrt(beta) == 0.5, so the reference DC step below doubles from 32 to 64
+ int q_index = 29;
+ int dc_q_step =
+ av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+ EXPECT_EQ(dc_q_step, 32); // pins the starting DC step for q_index 29 at 8-bit
+
+ int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+ EXPECT_EQ(ref_new_dc_q_step, 64);
+
+ int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+ int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+ static_cast<aom_bit_depth_t>(bit_depth));
+
+ EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); // applying the returned offset must yield exactly the reference step
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest1) {
+ aom_bit_depth_t bit_depth = AOM_BITS_8;
+ double beta = 0.000000001; // very small beta: the offset is expected to land exactly on q index 255
+ std::vector<int> q_index_ls = { 254, 255 };
+ for (auto q_index : q_index_ls) {
+ int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+ EXPECT_EQ(q_index + delta_q, 255); // for q_index 255 this requires delta_q == 0
+ }
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest2) {
+ aom_bit_depth_t bit_depth = AOM_BITS_8;
+ double beta = 100; // large beta: the offset is expected to land exactly on q index 0
+ std::vector<int> q_index_ls = { 1, 0 };
+ for (auto q_index : q_index_ls) {
+ int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+ EXPECT_EQ(q_index + delta_q, 0); // for q_index 0 this requires delta_q == 0
+ }
+}
+
+TEST(RdTest, GetDeltaqOffsetUnitaryTest1) {
+ aom_bit_depth_t bit_depth = AOM_BITS_8;
+ double beta = 1; // beta == 1: the offset is expected to be 0 for every tested q index
+ for (int q_index = 0; q_index < 255; ++q_index) { // NOTE(review): the loop stops at 254, so q_index 255 is not covered - confirm this is intentional
+ int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+ EXPECT_EQ(delta_q, 0);
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc
new file mode 100644
index 0000000000..ee1a9893db
--- /dev/null
+++ b/third_party/aom/test/reconinter_test.cc
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+// Signature shared by the low-bitdepth diff-weighted mask builders.
+using BuildCompDiffWtdMaskFunc = void (*)(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
+                                          int h, int w);
+
+// Test parameter: (block size, implementation under test).
+using BuildCompDiffwtdMaskDParam =
+    std::tuple<BLOCK_SIZE, BuildCompDiffWtdMaskFunc>;
+
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
+// Pairs |filter| with every block size in [BLOCK_4X4, BLOCK_SIZES_ALL).
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
+    BuildCompDiffWtdMaskFunc filter) {
+  const auto block_sizes = ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL);
+  const auto impls = ::testing::Values(filter);
+  return ::testing::Combine(block_sizes, impls);
+}
+#endif  // HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
+
+// Compares an implementation of the low-bitdepth diff-weighted mask builder
+// against av1_build_compound_diffwtd_mask_c for the parameterized block size.
+class BuildCompDiffwtdMaskTest
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
+ public:
+  BuildCompDiffwtdMaskTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskTest() override = default;
+
+ protected:
+  // Runs the C reference and |test_impl| on the same random sources and
+  // asserts that both masks are identical. When |is_speed| is true, each
+  // implementation is repeated and the two elapsed times plus their ratio
+  // are printed.
+  void RunTest(BuildCompDiffWtdMaskFunc test_impl, bool is_speed,
+               const DIFFWTD_MASK_TYPE type) {
+    const int bsize = GET_PARAM(0);
+    const int w = block_size_wide[bsize];
+    const int h = block_size_high[bsize];
+    DECLARE_ALIGNED(16, uint8_t, expected[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, actual[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
+    const int num_pixels = w * h;
+    for (int idx = 0; idx < num_pixels; idx++) {
+      src0[idx] = rnd_.Rand8();
+      src1[idx] = rnd_.Rand8();
+    }
+
+    int iterations = 1;
+    if (is_speed) iterations = 10000000 / (w + h);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < iterations; ++n) {
+      av1_build_compound_diffwtd_mask_c(expected, type, src0, w, src1, w, h,
+                                        w);
+    }
+    const double ref_time = get_time_mark(&timer);
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < iterations; ++n) {
+      test_impl(actual, type, src0, w, src1, w, h, w);
+    }
+    const double test_time = get_time_mark(&timer);
+
+    if (is_speed) {
+      printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, w, h, ref_time,
+             test_time);
+      printf("(%3.2f)\n", ref_time / test_time);
+    }
+
+    for (int row = 0; row < h; ++row) {
+      for (int col = 0; col < w; ++col) {
+        const int pos = col + row * w;
+        ASSERT_EQ(expected[pos], actual[pos])
+            << "[" << row << "," << col << "] " << iterations << " @ " << w
+            << "x" << h << " inv " << type;
+      }
+    }
+  }
+
+ private:
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskTest);
+
+// Correctness: both mask types must match the C reference.
+TEST_P(BuildCompDiffwtdMaskTest, match) {
+  const BuildCompDiffWtdMaskFunc impl = GET_PARAM(1);
+  RunTest(impl, false, DIFFWTD_38);
+  RunTest(impl, false, DIFFWTD_38_INV);
+}
+
+// Timing run for both mask types; disabled by default.
+TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
+  const BuildCompDiffWtdMaskFunc impl = GET_PARAM(1);
+  RunTest(impl, true, DIFFWTD_38);
+  RunTest(impl, true, DIFFWTD_38_INV);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_avx2));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskTest,
+                         BuildParams(av1_build_compound_diffwtd_mask_neon));
+#endif  // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// Signature shared by the high-bitdepth diff-weighted mask builders.
+using BuildCompDiffWtdMaskHighbdFunc =
+    void (*)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+             int src0_stride, const uint8_t *src1, int src1_stride, int h,
+             int w, int bd);
+
+// Test parameter: (block size, bit depth, implementation under test).
+using BuildCompDiffwtdMaskHighbdParam =
+    std::tuple<BLOCK_SIZE, int, BuildCompDiffWtdMaskHighbdFunc>;
+
+#if HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
+// Pairs |filter| with every block size in [BLOCK_4X4, BLOCK_SIZES_ALL) and
+// with bit depths 8, 10 and 12.
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskHighbdParam>
+BuildParamsHighbd(BuildCompDiffWtdMaskHighbdFunc filter) {
+  const auto block_sizes = ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL);
+  const auto bit_depths = ::testing::Values(8, 10, 12);
+  const auto impls = ::testing::Values(filter);
+  return ::testing::Combine(block_sizes, bit_depths, impls);
+}
+#endif  // HAVE_SSSE3 || HAVE_AVX2 || HAVE_NEON
+
+// Compares an implementation of the high-bitdepth diff-weighted mask builder
+// against av1_build_compound_diffwtd_mask_highbd_c for the parameterized
+// block size and bit depth.
+class BuildCompDiffwtdMaskHighbdTest
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskHighbdParam> {
+ public:
+  BuildCompDiffwtdMaskHighbdTest() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskHighbdTest() override = default;
+
+ protected:
+  // Runs the C reference and |test_impl| on the same random sources (each
+  // sample masked to |bd| bits) and asserts that both masks are identical.
+  // When |is_speed| is true, each implementation is repeated and the elapsed
+  // times plus their ratio are printed.
+  void RunTest(BuildCompDiffWtdMaskHighbdFunc test_impl, bool is_speed,
+               const DIFFWTD_MASK_TYPE type) {
+    const int bsize = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    const int w = block_size_wide[bsize];
+    const int h = block_size_high[bsize];
+    const int sample_mask = (1 << bd) - 1;
+    DECLARE_ALIGNED(16, uint8_t, expected[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, actual[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint16_t, src1[MAX_SB_SQUARE]);
+    const int num_pixels = w * h;
+    for (int idx = 0; idx < num_pixels; idx++) {
+      src0[idx] = rnd_.Rand16() & sample_mask;
+      src1[idx] = rnd_.Rand16() & sample_mask;
+    }
+
+    int iterations = 1;
+    if (is_speed) iterations = 10000000 / (w + h);
+
+    // The functions under test take the 16-bit buffers through the
+    // CONVERT_TO_BYTEPTR representation.
+    uint8_t *const src0_8 = CONVERT_TO_BYTEPTR(src0);
+    uint8_t *const src1_8 = CONVERT_TO_BYTEPTR(src1);
+
+    aom_usec_timer timer;
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < iterations; ++n) {
+      av1_build_compound_diffwtd_mask_highbd_c(expected, type, src0_8, w,
+                                               src1_8, w, h, w, bd);
+    }
+    const double ref_time = get_time_mark(&timer);
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < iterations; ++n) {
+      test_impl(actual, type, src0_8, w, src1_8, w, h, w, bd);
+    }
+    const double test_time = get_time_mark(&timer);
+
+    if (is_speed) {
+      printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, w, h, ref_time,
+             test_time);
+      printf("(%3.2f)\n", ref_time / test_time);
+    }
+
+    for (int row = 0; row < h; ++row) {
+      for (int col = 0; col < w; ++col) {
+        const int pos = col + row * w;
+        ASSERT_EQ(expected[pos], actual[pos])
+            << "[" << row << "," << col << "] " << iterations << " @ " << w
+            << "x" << h << " inv " << type;
+      }
+    }
+  }
+
+ private:
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskHighbdTest);
+
+// Correctness: both mask types must match the C reference.
+TEST_P(BuildCompDiffwtdMaskHighbdTest, match) {
+  const BuildCompDiffWtdMaskHighbdFunc impl = GET_PARAM(2);
+  RunTest(impl, false, DIFFWTD_38);
+  RunTest(impl, false, DIFFWTD_38_INV);
+}
+
+// Timing run for both mask types; disabled by default.
+TEST_P(BuildCompDiffwtdMaskHighbdTest, DISABLED_Speed) {
+  const BuildCompDiffWtdMaskHighbdFunc impl = GET_PARAM(2);
+  RunTest(impl, true, DIFFWTD_38);
+  RunTest(impl, true, DIFFWTD_38_INV);
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_ssse3));
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_avx2));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BuildCompDiffwtdMaskHighbdTest,
+    BuildParamsHighbd(av1_build_compound_diffwtd_mask_highbd_neon));
+#endif  // HAVE_NEON
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Signature shared by the d16 (CONV_BUF_TYPE input) diff-weighted mask
+// builders.
+using BuildCompDiffWtdMaskD16Func = void (*)(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd);
+
+// Test parameter: (bit depth, implementation under test, block size).
+using BuildCompDiffwtdMaskD16Param =
+    std::tuple<int, BuildCompDiffWtdMaskD16Func, BLOCK_SIZE>;
+
+#if HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
+// Pairs |filter| with bit depths 8, 10, 12 and with every block size in
+// [BLOCK_4X4, BLOCK_SIZES_ALL).
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
+    BuildCompDiffWtdMaskD16Func filter) {
+  const auto bit_depths = ::testing::Range(8, 13, 2);
+  const auto impls = ::testing::Values(filter);
+  const auto block_sizes = ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL);
+  return ::testing::Combine(bit_depths, impls, block_sizes);
+}
+#endif  // HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON
+
+// Compares an implementation of the d16 diff-weighted mask builder against
+// av1_build_compound_diffwtd_mask_d16_c for the parameterized bit depth and
+// block size.
+class BuildCompDiffwtdMaskD16Test
+    : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
+ public:
+  BuildCompDiffwtdMaskD16Test() : rnd_(ACMRandom::DeterministicSeed()) {}
+  ~BuildCompDiffwtdMaskD16Test() override = default;
+
+ protected:
+  // For every mask type, runs the C reference and |test_impl| on the same
+  // random sources and asserts that the resulting masks are identical.
+  void RunCheckOutput(BuildCompDiffWtdMaskD16Func test_impl) {
+    const int bsize = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    const int w = block_size_wide[bsize];
+    const int h = block_size_high[bsize];
+    DECLARE_ALIGNED(16, uint8_t, expected[2 * MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(16, uint8_t, actual[2 * MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+    ConvolveParams conv_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+    // Number of significant bits kept in each random input sample.
+    const int in_precision =
+        bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+    const int sample_mask = (1 << in_precision) - 1;
+
+    for (int idx = 0; idx < MAX_SB_SQUARE; idx++) {
+      src0[idx] = rnd_.Rand16() & sample_mask;
+      src1[idx] = rnd_.Rand16() & sample_mask;
+    }
+
+    for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
+      const DIFFWTD_MASK_TYPE type = (DIFFWTD_MASK_TYPE)mask_type;
+      av1_build_compound_diffwtd_mask_d16_c(expected, type, src0, w, src1, w,
+                                            h, w, &conv_params, bd);
+
+      test_impl(actual, type, src0, w, src1, w, h, w, &conv_params, bd);
+
+      for (int row = 0; row < h; ++row) {
+        for (int col = 0; col < w; ++col) {
+          const int pos = col + row * w;
+          ASSERT_EQ(expected[pos], actual[pos])
+              << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
+              << " Pixel mismatch at index "
+              << "[" << row << "," << col << "] "
+              << " @ " << w << "x" << h << " inv " << mask_type;
+        }
+      }
+    }
+  }
+
+  // Times the C reference and |test_impl| for |mask_type| over the same
+  // number of iterations and prints the ratio of the two elapsed times.
+  void RunSpeedTest(BuildCompDiffWtdMaskD16Func test_impl,
+                    DIFFWTD_MASK_TYPE mask_type) {
+    const int bsize = GET_PARAM(2);
+    const int bd = GET_PARAM(0);
+    const int w = block_size_wide[bsize];
+    const int h = block_size_high[bsize];
+    DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+    DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+    ConvolveParams conv_params =
+        get_conv_params_no_round(0, 0, nullptr, 0, 1, bd);
+
+    const int in_precision =
+        bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+    const int sample_mask = (1 << in_precision) - 1;
+
+    for (int idx = 0; idx < MAX_SB_SQUARE; idx++) {
+      src0[idx] = rnd_.Rand16() & sample_mask;
+      src1[idx] = rnd_.Rand16() & sample_mask;
+    }
+
+    const int iterations = 10000000 / (w + h);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int n = 0; n < iterations; ++n) {
+      av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, w, src1, w,
+                                            h, w, &conv_params, bd);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int ref_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer test_timer;
+    aom_usec_timer_start(&test_timer);
+    for (int n = 0; n < iterations; ++n) {
+      test_impl(mask, mask_type, src0, w, src1, w, h, w, &conv_params, bd);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int test_elapsed =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+    printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", w, h,
+           ref_elapsed / double(test_elapsed));
+  }
+
+ private:
+  ACMRandom rnd_;
+};  // class BuildCompDiffwtdMaskD16Test
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BuildCompDiffwtdMaskD16Test);
+
+// Correctness: every mask type must match the C reference.
+TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
+  const BuildCompDiffWtdMaskD16Func impl = GET_PARAM(1);
+  RunCheckOutput(impl);
+}
+
+// Timing run for both mask types; disabled by default.
+TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
+  const BuildCompDiffWtdMaskD16Func impl = GET_PARAM(1);
+  RunSpeedTest(impl, DIFFWTD_38);
+  RunSpeedTest(impl, DIFFWTD_38_INV);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, BuildCompDiffwtdMaskD16Test,
+    BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test,
+                         BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test,
+                         BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+#endif  // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/register_state_check.h b/third_party/aom/test/register_state_check.h
new file mode 100644
index 0000000000..4aad81469e
--- /dev/null
+++ b/third_party/aom/test/register_state_check.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_REGISTER_STATE_CHECK_H_
+#define AOM_TEST_REGISTER_STATE_CHECK_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// API_REGISTER_STATE_CHECK(function)
+// Validates the environment pre & post function execution to ensure the
+// environment is in a consistent state. This should be used with API
+// functions and assembly functions which are not expected to fully restore
+// the system state.
+// See platform implementations of RegisterStateCheck and
+// RegisterStateCheckMMX for details.
+
+#if defined(_WIN64) && AOM_ARCH_X86_64
+
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winnt.h>
+
+// Equality for M128A so that two saved register values can be compared with
+// EXPECT_EQ: both halves must be equal.
+inline bool operator==(const M128A &lhs, const M128A &rhs) {
+  if (lhs.Low != rhs.Low) return false;
+  return lhs.High == rhs.High;
+}
+
+namespace libaom_test {
+
+// Compares the state of xmm[6-15] at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// Windows x64.
+class RegisterStateCheck {
+ public:
+  // Snapshots the current thread context; |initialized_| records whether the
+  // snapshot succeeded.
+  RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+  // Takes a second snapshot and reports any difference via gtest.
+  ~RegisterStateCheck() { Check(); }
+
+ private:
+  // Fills |context| with the calling thread's CONTEXT_FLOATING_POINT state.
+  // Returns true when GetThreadContext() succeeded; a failure is also
+  // reported as a non-fatal gtest failure together with GetLastError().
+  static bool StoreRegisters(CONTEXT *const context) {
+    const HANDLE this_thread = GetCurrentThread();
+    EXPECT_NE(this_thread, nullptr);
+    context->ContextFlags = CONTEXT_FLOATING_POINT;
+    const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+    EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+    return context_saved;
+  }
+
+  // Compares the register state captured at construction with the current
+  // state. Returns nothing: each xmm register that differs is reported as a
+  // non-fatal gtest failure.
+  void Check() const {
+    ASSERT_TRUE(initialized_);
+    CONTEXT post_context;
+    ASSERT_TRUE(StoreRegisters(&post_context));
+
+    // Walks ten consecutive M128A entries starting at Xmm6 (xmm6..xmm15) in
+    // both snapshots.
+    const M128A *xmm_pre = &pre_context_.Xmm6;
+    const M128A *xmm_post = &post_context.Xmm6;
+    for (int i = 6; i <= 15; ++i) {
+      EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+      ++xmm_pre;
+      ++xmm_post;
+    }
+  }
+
+  bool initialized_;      // True when the construction-time snapshot succeeded.
+  CONTEXT pre_context_;   // Thread context captured at construction.
+};
+} // namespace libaom_test
+
+#else
+
+namespace libaom_test {
+
+// No-op stand-in used when the Win64 x86-64 implementation above is not
+// compiled, so API_REGISTER_STATE_CHECK can always declare one.
+class RegisterStateCheck {};
+} // namespace libaom_test
+
+#endif // _WIN64 && AOM_ARCH_X86_64
+
+#if (AOM_ARCH_X86 || AOM_ARCH_X86_64) && defined(__GNUC__)
+namespace libaom_test {
+
+// Checks the FPU tag word pre/post execution to ensure emms has been called.
+class RegisterStateCheckMMX {
+ public:
+  // Stores the FPU environment at construction using the fstenv instruction.
+  RegisterStateCheckMMX() {
+    __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_));
+  }
+  // Stores the environment again and verifies both snapshots.
+  ~RegisterStateCheckMMX() { Check(); }
+
+ private:
+  // Checks the FPU tag word pre/post execution. Returns nothing: a tag word
+  // that is not 0xffff in either snapshot is reported as a non-fatal gtest
+  // failure.
+  void Check() const {
+    // Element 4 of the stored environment is the value compared against
+    // 0xffff (the tag word, per the comments in this class).
+    EXPECT_EQ(0xffff, pre_fpu_env_[4])
+        << "FPU was in an inconsistent state prior to call";
+
+    uint16_t post_fpu_env[14];
+    __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env));
+    EXPECT_EQ(0xffff, post_fpu_env[4])
+        << "FPU was left in an inconsistent state after call";
+  }
+
+  // FPU environment captured at construction (14 16-bit words).
+  uint16_t pre_fpu_env_[14];
+};
+} // namespace libaom_test
+
+#else
+namespace libaom_test {
+
+// No-op stand-in used when the x86 GNU-compiler implementation above is not
+// compiled, so API_REGISTER_STATE_CHECK can always declare one.
+class RegisterStateCheckMMX {};
+} // namespace libaom_test
+
+#endif // (AOM_ARCH_X86 || AOM_ARCH_X86_64) && defined(__GNUC__)
+
+// Runs |statement| inside a scope that holds one RegisterStateCheck and one
+// RegisterStateCheckMMX object; their destructors perform the post-call
+// checks when the scope ends. The (void) casts only silence unused-variable
+// warnings for the platforms where the checker classes are empty.
+#define API_REGISTER_STATE_CHECK(statement) \
+ do { \
+ libaom_test::RegisterStateCheck reg_check; \
+ libaom_test::RegisterStateCheckMMX reg_check_mmx; \
+ statement; \
+ (void)reg_check_mmx; \
+ (void)reg_check; \
+ } while (false)
+
+#endif // AOM_TEST_REGISTER_STATE_CHECK_H_
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
new file mode 100644
index 0000000000..7bad45300a
--- /dev/null
+++ b/third_party/aom/test/resize_test.cc
@@ -0,0 +1,1136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "aom_dsp/aom_dsp_common.h"
+#include "common/tools_common.h"
+#include "av1/encoder/encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+// Enable(1) or Disable(0) writing of the compressed bitstream.
+#define WRITE_COMPRESSED_STREAM 0
+
+namespace {
+
+#if WRITE_COMPRESSED_STREAM
+// Stores the low 16 bits of |val| at |mem| in little-endian byte order.
+static void mem_put_le16(char *const mem, unsigned int val) {
+  const unsigned int hi = val >> 8;
+  mem[0] = static_cast<char>(val);
+  mem[1] = static_cast<char>(hi);
+}
+
+// Stores the low 32 bits of |val| at |mem| in little-endian byte order.
+static void mem_put_le32(char *const mem, unsigned int val) {
+  for (int byte = 0; byte < 4; ++byte) {
+    mem[byte] = static_cast<char>(val >> (8 * byte));
+  }
+}
+
+// Writes the 32-byte IVF file header for |cfg| and |frame_cnt| to |outfile|.
+// The result of fwrite() is deliberately ignored.
+static void write_ivf_file_header(const aom_codec_enc_cfg_t *const cfg,
+                                  int frame_cnt, FILE *const outfile) {
+  char hdr[32];
+
+  // Signature bytes.
+  hdr[0] = 'D';
+  hdr[1] = 'K';
+  hdr[2] = 'I';
+  hdr[3] = 'F';
+
+  mem_put_le16(&hdr[4], 0);                     /* version */
+  mem_put_le16(&hdr[6], 32);                    /* headersize */
+  mem_put_le32(&hdr[8], AV1_FOURCC);            /* fourcc (av1) */
+  mem_put_le16(&hdr[12], cfg->g_w);             /* width */
+  mem_put_le16(&hdr[14], cfg->g_h);             /* height */
+  mem_put_le32(&hdr[16], cfg->g_timebase.den);  /* rate */
+  mem_put_le32(&hdr[20], cfg->g_timebase.num);  /* scale */
+  mem_put_le32(&hdr[24], frame_cnt);            /* length */
+  mem_put_le32(&hdr[28], 0);                    /* unused */
+
+  (void)fwrite(hdr, 1, sizeof(hdr), outfile);
+}
+
+// Writes |size| to |outfile| as a 4-byte little-endian value. The result of
+// fwrite() is deliberately ignored.
+static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+  char size_bytes[4];
+  const unsigned int size32 = static_cast<unsigned int>(size);
+  mem_put_le32(size_bytes, size32);
+  (void)fwrite(size_bytes, 1, sizeof(size_bytes), outfile);
+}
+
+// Writes the 12-byte IVF frame header (frame size, then the low and high
+// 32 bits of the pts) for |pkt| to |outfile|. Packets that are not
+// AOM_CODEC_CX_FRAME_PKT are ignored.
+static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt,
+                                   FILE *const outfile) {
+  if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
+
+  char hdr[12];
+  const aom_codec_pts_t pts = pkt->data.frame.pts;
+  const unsigned int frame_bytes =
+      static_cast<unsigned int>(pkt->data.frame.sz);
+
+  mem_put_le32(&hdr[0], frame_bytes);
+  mem_put_le32(&hdr[4], pts & 0xFFFFFFFF);
+  mem_put_le32(&hdr[8], pts >> 32);
+
+  (void)fwrite(hdr, 1, sizeof(hdr), outfile);
+}
+#endif // WRITE_COMPRESSED_STREAM
+
+const unsigned int kInitialWidth = 320;
+const unsigned int kInitialHeight = 240;
+
+// Presentation timestamp and displayed dimensions of one decoded frame.
+struct FrameInfo {
+  FrameInfo(aom_codec_pts_t frame_pts, unsigned int width, unsigned int height)
+      : pts(frame_pts), w(width), h(height) {}
+
+  aom_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+};
+
+// Computes the frame size (*w, *h) that the resize schedule assigns to frame
+// number |frame|, relative to |initial_w| x |initial_h|.
+//  - Frames 0..9 use the initial size, or a quarter of it when
+//    |change_start_resln| is set.
+//  - Frames 10..119 follow the table of scale factors below.
+//  - Frames 120..139 swap width and height when |flag_codec| == 1.
+//  - Every other frame uses the initial size.
+void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
+                         unsigned int initial_h, int flag_codec,
+                         bool change_start_resln, unsigned int *w,
+                         unsigned int *h) {
+  if (frame < 10) {
+    const unsigned int divisor = change_start_resln ? 4 : 1;
+    *w = initial_w / divisor;
+    *h = initial_h / divisor;
+    return;
+  }
+
+  // Each entry applies to frames below |end_frame| that no earlier entry
+  // matched; the size is initial * num / den.
+  struct ScaleStep {
+    unsigned int end_frame;
+    unsigned int num;
+    unsigned int den;
+  };
+  static const ScaleStep kSchedule[] = {
+    { 20, 3, 4 },  { 30, 1, 2 }, { 40, 1, 1 },  { 50, 3, 4 },
+    { 60, 1, 2 },  { 70, 1, 1 }, { 80, 3, 4 },  { 90, 1, 2 },
+    { 100, 3, 4 }, { 110, 1, 1 },
+    { 120, 1, 4 },  // Go down very low
+  };
+  for (const ScaleStep &step : kSchedule) {
+    if (frame < step.end_frame) {
+      *w = initial_w * step.num / step.den;
+      *h = initial_h * step.num / step.den;
+      return;
+    }
+  }
+
+  // Cases that only works for AV1.
+  // For AV1: Swap width and height of original.
+  if (flag_codec == 1 && frame < 140) {
+    *w = initial_h;
+    *h = initial_w;
+    return;
+  }
+
+  *w = initial_w;
+  *h = initial_h;
+}
+
+// Dummy video source whose frame size follows ScaleForFrameNumber().
+// |flag_codec_| and |change_start_resln_| are public knobs forwarded to
+// ScaleForFrameNumber(); they now default to 0 / false instead of being left
+// uninitialized until a test assigns them.
+class ResizingVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+  ResizingVideoSource() : flag_codec_(0), change_start_resln_(false) {
+    SetSize(kInitialWidth, kInitialHeight);
+    limit_ = 150;
+  }
+  int flag_codec_;
+  bool change_start_resln_;
+  ~ResizingVideoSource() override = default;
+
+ protected:
+  // Restarts at frame 0 and produces the first frame.
+  void Begin() override {
+    frame_ = 0;
+    ResizeAndFill();
+  }
+  // Advances to the next frame and produces it.
+  void Next() override {
+    ++frame_;
+    ResizeAndFill();
+  }
+
+ private:
+  // Applies the size scheduled for |frame_| and fills the frame buffer.
+  void ResizeAndFill() {
+    unsigned int width;
+    unsigned int height;
+    ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, flag_codec_,
+                        change_start_resln_, &width, &height);
+    SetSize(width, height);
+    FillFrame();
+  }
+};
+
+// Encoder test fixture that records the pts and displayed size of every
+// decoded frame in |frame_info_list_|.
+class ResizeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ResizeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  ~ResizeTest() override = default;
+
+  void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+
+  // Applies the real-time encoder controls once, before frame 0, and only
+  // when the test mode is kRealTime.
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+    if (GET_PARAM(1) != ::libaom_test::kRealTime) return;
+
+    encoder->Control(AV1E_SET_AQ_MODE, 3);
+    encoder->Control(AOME_SET_CPUUSED, 5);
+    encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+  }
+
+  // Records the displayed dimensions of each decoded frame.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    frame_info_list_.emplace_back(pts, img.d_w, img.d_h);
+  }
+
+  std::vector<FrameInfo> frame_info_list_;
+};
+
+// Feeds frames whose size follows ScaleForFrameNumber() and checks that each
+// decoded frame has the size scheduled for its pts.
+TEST_P(ResizeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  video.flag_codec_ = 0;
+  video.change_start_resln_ = false;
+  cfg_.g_lag_in_frames = 0;
+  // We use max(kInitialWidth, kInitialHeight) because during the test
+  // the width and height of the frame are swapped
+  cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+      AOMMAX(kInitialWidth, kInitialHeight);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+
+  for (const FrameInfo &info : frame_info_list_) {
+    const unsigned int frame = static_cast<unsigned>(info.pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, video.flag_codec_,
+                        video.change_start_resln_, &expected_w, &expected_h);
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+const unsigned int kStepDownFrame = 3;
+const unsigned int kStepUpFrame = 6;
+
+// Fixture for the AOME_SET_SCALEMODE (internal resize) tests. When
+// |change_config_| is set, the scaling mode and quantizer are changed on the
+// first two frames; otherwise frames in [kStepDownFrame, kStepUpFrame) are
+// scaled down and later frames are restored to normal size.
+class ResizeInternalTestLarge : public ResizeTest {
+ protected:
+  // |change_config_| is now initialized here rather than relying on every
+  // test body to assign it before RunLoop().
+#if WRITE_COMPRESSED_STREAM
+  ResizeInternalTestLarge()
+      : ResizeTest(), frame0_psnr_(0.0), change_config_(false),
+        outfile_(nullptr), out_frames_(0) {}
+#else
+  ResizeInternalTestLarge()
+      : ResizeTest(), frame0_psnr_(0.0), change_config_(false) {}
+#endif
+
+  ~ResizeInternalTestLarge() override = default;
+
+  void BeginPassHook(unsigned int /*pass*/) override {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("av10-2-05-resize.ivf", "wb");
+#endif
+  }
+
+  // Rewrites the IVF file header with the final frame count and closes the
+  // output file (only when stream writing is enabled).
+  void EndPassHook() override {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = nullptr;
+    }
+#endif
+  }
+
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    if (change_config_) {
+      if (video->frame() == 0) {
+        struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+        encoder->Control(AOME_SET_SCALEMODE, &mode);
+      } else if (video->frame() == 1) {
+        const int new_q = 60;
+        struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+        encoder->Control(AOME_SET_SCALEMODE, &mode);
+        cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q;
+        encoder->Config(&cfg_);
+      }
+    } else {
+      if (video->frame() >= kStepDownFrame && video->frame() < kStepUpFrame) {
+        struct aom_scaling_mode mode = { AOME_FOURFIVE, AOME_THREEFIVE };
+        encoder->Control(AOME_SET_SCALEMODE, &mode);
+      }
+      if (video->frame() >= kStepUpFrame) {
+        struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+        encoder->Control(AOME_SET_SCALEMODE, &mode);
+      }
+    }
+  }
+
+  // Remembers the PSNR of the first packet and requires every packet to stay
+  // within 4.1 of it.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 4.1);
+  }
+
+#if WRITE_COMPRESSED_STREAM
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    ++out_frames_;
+
+    // fopen() in BeginPassHook() may have failed; do not write through a
+    // null FILE pointer.
+    if (outfile_ == nullptr) return;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+  }
+#endif
+
+  double frame0_psnr_;
+  bool change_config_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;
+  unsigned int out_frames_;
+#endif
+};
+
+// Encodes 10 frames with the scaling mode stepped down for frames in
+// [kStepDownFrame, kStepUpFrame) and checks the decoded size of every frame.
+TEST_P(ResizeInternalTestLarge, TestInternalResizeWorks) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 10);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  change_config_ = false;
+
+  // q picked such that initial keyframe on this clip is ~30dB PSNR
+  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+
+  // If the number of frames being encoded is smaller than g_lag_in_frames
+  // the encoded frame is unavailable using the current API. Comparing
+  // frames to detect mismatch would then not be possible. Set
+  // g_lag_in_frames = 0 to get around this.
+  cfg_.g_lag_in_frames = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // (An empty loop over frame_info_list_ that did nothing was removed here.)
+  for (const FrameInfo &info : frame_info_list_) {
+    const aom_codec_pts_t pts = info.pts;
+    if (pts >= kStepDownFrame && pts < kStepUpFrame) {
+      ASSERT_EQ(282U, info.w) << "Frame " << pts << " had unexpected width";
+      ASSERT_EQ(173U, info.h) << "Frame " << pts << " had unexpected height";
+    } else {
+      EXPECT_EQ(352U, info.w) << "Frame " << pts << " had unexpected width";
+      EXPECT_EQ(288U, info.h) << "Frame " << pts << " had unexpected height";
+    }
+  }
+}
+
+// Runs the encoder with |change_config_| set, so PreEncodeFrameHook() changes
+// the scaling mode on frame 0 and the scaling mode plus quantizer on frame 1.
+// The test only requires the run to finish without a fatal failure.
+TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) {
+  const unsigned int kClipWidth = 352;
+  const unsigned int kClipHeight = 288;
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv",
+                                       kClipWidth, kClipHeight, 30, 1, 0, 10);
+  change_config_ = true;
+  cfg_.g_w = kClipWidth;
+  cfg_.g_h = kClipHeight;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood));
+#endif
+
+// Parameters: test mode, speed, threads
+//
+// Fixture for real-time resize tests. Every member is now given a defined
+// value in the constructor; previously |mismatch_psnr_| was accumulated with
+// += without ever being initialized, and |change_bitrate_|,
+// |frame_change_bitrate_|, |mismatch_nframes_| and |set_cpu_used_| were left
+// indeterminate until a test or SetUp() assigned them.
+class ResizeRealtimeTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ResizeRealtimeTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(0), num_threads_(GET_PARAM(3)),
+        change_bitrate_(false), frame_change_bitrate_(0), mismatch_psnr_(0.0),
+        mismatch_nframes_(0), set_scale_mode_(false), set_scale_mode2_(false),
+        set_scale_mode3_(false) {}
+  ~ResizeRealtimeTest() override = default;
+
+  // Applies one-time encoder controls before frame 0, then the per-frame
+  // scaling mode selected by the set_scale_mode*_ flags, and finally the
+  // optional one-shot bitrate change.
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_AQ_MODE, 3);
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+    }
+    // Each chain below ends in a plain `else` so |mode| is assigned on every
+    // path (the former trailing `else if (frame > 40)` was always true when
+    // reached).
+    if (set_scale_mode_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else
+        mode = { AOME_NORMAL, AOME_NORMAL };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    } else if (set_scale_mode2_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 20)
+        mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+      else if (video->frame() <= 40)
+        mode = { AOME_ONETWO, AOME_ONETWO };
+      else
+        mode = { AOME_THREEFOUR, AOME_THREEFOUR };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    } else if (set_scale_mode3_) {
+      struct aom_scaling_mode mode;
+      if (video->frame() <= 30)
+        mode = { AOME_ONETWO, AOME_NORMAL };
+      else
+        mode = { AOME_NORMAL, AOME_NORMAL };
+      encoder->Control(AOME_SET_SCALEMODE, &mode);
+    }
+
+    if (change_bitrate_ && video->frame() == frame_change_bitrate_) {
+      change_bitrate_ = false;
+      cfg_.rc_target_bitrate = 500;
+      encoder->Config(&cfg_);
+    }
+  }
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+  }
+
+  // Records the displayed dimensions of each decoded frame.
+  void DecompressedFrameHook(const aom_image_t &img,
+                             aom_codec_pts_t pts) override {
+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+  }
+
+  // Accumulates the PSNR of mismatching frames and counts them.
+  void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+
+  // Fills |cfg_| with the low-bitrate CBR settings used by these tests.
+  // Reads the set_scale_mode*_ flags, so tests must set them before calling.
+  void DefaultConfig() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.kf_mode = AOM_KF_AUTO;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+    // Enable dropped frames.
+    cfg_.rc_dropframe_thresh = 1;
+    // Disable error_resilience mode.
+    cfg_.g_error_resilient = 0;
+    cfg_.g_threads = num_threads_;
+    // Run at low bitrate.
+    cfg_.rc_target_bitrate = 200;
+    // We use max(kInitialWidth, kInitialHeight) because during the test
+    // the width and height of the frame are swapped
+    cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+        AOMMAX(kInitialWidth, kInitialHeight);
+    if (set_scale_mode_ || set_scale_mode2_ || set_scale_mode3_) {
+      cfg_.rc_dropframe_thresh = 0;
+      cfg_.g_forced_max_frame_width = 1280;
+      cfg_.g_forced_max_frame_height = 1280;
+    }
+  }
+
+  std::vector<FrameInfo> frame_info_list_;
+  int set_cpu_used_;
+  int num_threads_;
+  bool change_bitrate_;
+  unsigned int frame_change_bitrate_;
+  double mismatch_psnr_;
+  int mismatch_nframes_;
+  bool set_scale_mode_;
+  bool set_scale_mode2_;
+  bool set_scale_mode3_;
+};
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to original.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (const FrameInfo &info : frame_info_list_) {
+    const auto frame = static_cast<unsigned>(info.pts);
+    // Frames 0..20 are encoded at 1/2 size, 21..40 at 1/4, later frames at
+    // full size (matches the schedule in PreEncodeFrameHook).
+    unsigned int expected_w = 1280 >> 1;
+    unsigned int expected_h = 720 >> 1;
+    if (frame > 40) {
+      expected_w = 1280;
+      expected_h = 720;
+    } else if (frame > 20) {
+      expected_w = 1280 >> 2;
+      expected_h = 720 >> 2;
+    }
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+  }
+  // The mismatch count does not depend on the frame being inspected, so it
+  // is checked once after the loop instead of once per frame.
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2, then 1/4, and then back up to original.
+//
+// Same schedule as TestInternalResizeSetScaleMode1 but on a 320x180 source:
+// frames 0-20 are 1/2 size, frames 21-40 are 1/4 size and frames after 40
+// are full size.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode1QVGA) {
+  ::libaom_test::I420VideoSource video("desktop1.320_180.yuv", 320, 180, 30, 1,
+                                       0, 80);
+  cfg_.g_w = 320;
+  cfg_.g_h = 180;
+  set_scale_mode_ = true;
+  set_scale_mode2_ = false;
+  set_scale_mode3_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  // Reset both mismatch accumulators, as the other tests of this fixture do.
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (const auto &info : frame_info_list_) {
+    const auto frame = static_cast<unsigned>(info.pts);
+    unsigned int expected_w = 320 >> 1;
+    unsigned int expected_h = 180 >> 1;
+    if (frame > 40) {
+      expected_w = 320;
+      expected_h = 180;
+    } else if (frame > 20) {
+      expected_w = 320 >> 2;
+      expected_h = 180 >> 2;
+    }
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+  }
+  // The mismatch count does not depend on the frame being inspected, so it is
+  // checked once instead of on every loop iteration.
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/4, then 1/2, and then up to 3/4.
+//
+// Expected decoded size by frame number (taken from the frame pts):
+// frames 0-20 are 1/4 size, frames 21-40 are 1/2 size and frames after 40
+// are 3/4 size.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode2) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = true;
+  set_scale_mode3_ = false;
+  DefaultConfig();
+  change_bitrate_ = false;
+  // Reset both mismatch accumulators, as the other tests of this fixture do.
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (const auto &info : frame_info_list_) {
+    const auto frame = static_cast<unsigned>(info.pts);
+    unsigned int expected_w = 1280 >> 2;
+    unsigned int expected_h = 720 >> 2;
+    if (frame > 40) {
+      expected_w = (3 * 1280) >> 2;
+      expected_h = (3 * 720) >> 2;
+    } else if (frame > 20) {
+      expected_w = 1280 >> 1;
+      expected_h = 720 >> 1;
+    }
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+  }
+  // The mismatch count does not depend on the frame being inspected, so it is
+  // checked once instead of on every loop iteration.
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check the AOME_SET_SCALEMODE control by downsizing to
+// 1/2 horizontally only and then back up to original.
+//
+// Expected decoded size by frame number (taken from the frame pts):
+// frames 0-30 are 640x720 and frames after 30 are 1280x720.
+TEST_P(ResizeRealtimeTest, TestInternalResizeSetScaleMode3) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  set_scale_mode_ = false;
+  set_scale_mode2_ = false;
+  set_scale_mode3_ = true;
+  DefaultConfig();
+  change_bitrate_ = false;
+  // Reset both mismatch accumulators, as the other tests of this fixture do.
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Check we decoded the same number of frames as we attempted to encode
+  ASSERT_EQ(frame_info_list_.size(), video.limit());
+  for (const auto &info : frame_info_list_) {
+    const auto frame = static_cast<unsigned>(info.pts);
+    // Only the width changes in this mode; the height stays at 720.
+    const unsigned int expected_w = (frame > 30) ? 1280 : 640;
+    const unsigned int expected_h = 720;
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+  }
+  // The mismatch count does not depend on the frame being inspected, so it is
+  // checked once instead of on every loop iteration.
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Feeds the encoder a source whose frame size changes over time and checks
+// that every decoded frame has the size ScaleForFrameNumber() predicts.
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  video.flag_codec_ = 1;
+  change_bitrate_ = false;
+  set_scale_mode_ = set_scale_mode2_ = set_scale_mode3_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Two passes over the source, with the start resolution equal to
+  //   pass 0: kInitialWidth and kInitialHeight
+  //   pass 1: down-scaled kInitialWidth and kInitialHeight
+  for (int pass = 0; pass < 2; ++pass) {
+    video.change_start_resln_ = (pass != 0);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    // Every frame handed to the encoder must have come back from the decoder.
+    ASSERT_EQ(frame_info_list_.size(), video.limit());
+    for (const auto &info : frame_info_list_) {
+      const unsigned int frame = static_cast<unsigned>(info.pts);
+      unsigned int expected_w;
+      unsigned int expected_h;
+      ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                          video.flag_codec_, video.change_start_resln_,
+                          &expected_w, &expected_h);
+      EXPECT_EQ(expected_w, info.w)
+          << "Frame " << frame << " had unexpected width";
+      EXPECT_EQ(expected_h, info.h)
+          << "Frame " << frame << " had unexpected height";
+      EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+    }
+    // Start the next pass with an empty record of decoded frames.
+    frame_info_list_.clear();
+  }
+}
+
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
+// Run at low bitrate, with resize_allowed = 1, and verify that we get
+// one resize down event.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+  change_bitrate_ = false;
+  set_scale_mode_ = set_scale_mode2_ = set_scale_mode3_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Overrides on top of the defaults: no frame dropping, a low starting
+  // bitrate, dynamic resizing and a 1280x1280 maximum frame size.
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Walk the decoded frames and count every size change that shrinks both
+  // dimensions relative to the previously seen size.
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  int resize_down_count = 0;
+  for (const auto &info : frame_info_list_) {
+    const bool size_changed = info.w != last_w || info.h != last_h;
+    if (!size_changed) continue;
+    if (info.w < last_w && info.h < last_h) {
+      ++resize_down_count;
+    }
+    last_w = info.w;
+    last_h = info.h;
+  }
+
+#if CONFIG_AV1_DECODER
+  // Verify that we get at least 1 resize down event in this test.
+  ASSERT_GE(resize_down_count, 1) << "Resizing should occur.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
+// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
+// Start at low target bitrate, raise the bitrate in the middle of the clip
+// (at frame# = frame_change_bitrate_), scaling-up should occur after bitrate
+// is increased.
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+  ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+  change_bitrate_ = true;
+  frame_change_bitrate_ = 120;
+  set_scale_mode_ = set_scale_mode2_ = set_scale_mode3_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  DefaultConfig();
+  // Overrides on top of the defaults: no frame dropping, a low starting
+  // bitrate, dynamic resizing and a 1280x1280 maximum frame size.
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 150;
+  cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+  cfg_.g_forced_max_frame_width = 1280;
+  cfg_.g_forced_max_frame_height = 1280;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  unsigned int last_w = cfg_.g_w;
+  unsigned int last_h = cfg_.g_h;
+  int resize_down_count = 0;
+  int resize_up_count = 0;
+  // frame_number is the position in the list of decoded frames.
+  for (unsigned int frame_number = 0; frame_number < frame_info_list_.size();
+       ++frame_number) {
+    const FrameInfo &info = frame_info_list_[frame_number];
+    if (info.w == last_w && info.h == last_h) continue;
+
+    if (frame_number < frame_change_bitrate_) {
+      // Verify that resize down occurs, before bitrate is increased.
+      ASSERT_LT(info.w, last_w);
+      ASSERT_LT(info.h, last_h);
+      ++resize_down_count;
+    } else {
+      // Verify that resize up occurs, after bitrate is increased.
+      ASSERT_GT(info.w, last_w);
+      ASSERT_GT(info.h, last_h);
+      ++resize_up_count;
+    }
+    last_w = info.w;
+    last_h = info.h;
+  }
+
+#if CONFIG_AV1_DECODER
+  // Verify that we get at least 2 resize events in this test.
+  ASSERT_GE(resize_up_count, 1) << "Resizing up should occur at lease once.";
+  ASSERT_GE(resize_down_count, 1)
+      << "Resizing down should occur at lease once.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
+// Fixture for the colour-format resize test. It checks that the PSNR of every
+// frame stays within 2.0 of the first non-zero PSNR seen. When
+// WRITE_COMPRESSED_STREAM is set it also dumps the encoded stream to an IVF
+// file.
+class ResizeCspTest : public ResizeTest {
+ protected:
+#if WRITE_COMPRESSED_STREAM
+  ResizeCspTest()
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {}
+#else
+  ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+  ~ResizeCspTest() override = default;
+
+  // Opens the output file at the start of a pass (stream dumping only).
+  // outfile_ stays null if the file cannot be opened.
+  void BeginPassHook(unsigned int /*pass*/) override {
+#if WRITE_COMPRESSED_STREAM
+    outfile_ = fopen("av11-2-05-cspchape.ivf", "wb");
+#endif
+  }
+
+  // Rewrites the IVF file header with the final frame count and closes the
+  // file (stream dumping only).
+  void EndPassHook() override {
+#if WRITE_COMPRESSED_STREAM
+    if (outfile_) {
+      if (!fseek(outfile_, 0, SEEK_SET))
+        write_ivf_file_header(&cfg_, out_frames_, outfile_);
+      fclose(outfile_);
+      outfile_ = nullptr;
+    }
+#endif
+  }
+
+  // Latches the first non-zero PSNR as the reference and expects every
+  // packet's PSNR to be within 2.0 of it.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+  }
+
+#if WRITE_COMPRESSED_STREAM
+  // Appends one compressed frame (header + payload) to the IVF file.
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    ++out_frames_;
+
+    // fopen() in BeginPassHook() may have failed; never write through a null
+    // stream. EndPassHook() already guards against this case.
+    if (outfile_ == nullptr) return;
+
+    // Write initial file header if first frame.
+    if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_);
+
+    // Write frame header and data.
+    write_ivf_frame_header(pkt, outfile_);
+    (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+  }
+#endif
+
+  // Reference PSNR; 0.0 until the first non-zero PSNR packet arrives.
+  double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+  FILE *outfile_;            // IVF output stream, null when not open.
+  unsigned int out_frames_;  // Frame packets seen so far.
+#endif
+};
+
+// Dummy video source of 30 frames at kInitialWidth x kInitialHeight in the
+// image format passed to the constructor.
+class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+ explicit ResizingCspVideoSource(aom_img_fmt_t image_format) {
+ SetSize(kInitialWidth, kInitialHeight);
+ SetImageFormat(image_format);
+ // Number of frames this source produces.
+ limit_ = 30;
+ }
+
+ ~ResizingCspVideoSource() override = default;
+};
+
+// Encodes the dummy source once as I420 (profile 0) and once as I444
+// (profile 1) at a fixed quantizer and checks that every frame is decoded.
+// The test is disabled when trellis quantization search is disabled or when
+// the maximum decode profile is below 1.
+#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) || \
+    (defined(CONFIG_MAX_DECODE_PROFILE) && CONFIG_MAX_DECODE_PROFILE < 1)
+TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) {
+#else
+TEST_P(ResizeCspTest, TestResizeCspWorks) {
+#endif
+  const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 };
+  for (const aom_img_fmt_t &img_format : image_formats) {
+    ResizingCspVideoSource video(img_format);
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    // Pin the quantizer so only the colour format varies between runs.
+    cfg_.rc_max_quantizer = 48;
+    cfg_.rc_min_quantizer = 48;
+    cfg_.g_lag_in_frames = 0;
+    if (img_format == AOM_IMG_FMT_I420) {
+      cfg_.g_profile = 0;
+    } else {
+      cfg_.g_profile = 1;
+    }
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    // Every frame handed to the encoder must have come back from the decoder.
+    ASSERT_EQ(frame_info_list_.size(), video.limit());
+    frame_info_list_.clear();
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Fixture that only checks for fatal failures while encoding with
+// resize-mode > 0.
+// Params: encoding mode, resize mode, resize denominator, key-frame resize
+// denominator, cpu-used.
+class ResizeModeTestLarge
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ResizeModeTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        resize_mode_(GET_PARAM(2)), resize_denominator_(GET_PARAM(3)),
+        resize_kf_denominator_(GET_PARAM(4)), cpu_used_(GET_PARAM(5)) {}
+  ~ResizeModeTestLarge() override = default;
+
+  // Single-threaded VBR at 1000 kbps with 35 frames of lag, a 1/30 timebase
+  // and the resize settings under test.
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = 1000;
+
+    cfg_.rc_resize_mode = resize_mode_;
+    cfg_.rc_resize_denominator = resize_denominator_;
+    cfg_.rc_resize_kf_denominator = resize_kf_denominator_;
+
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+    encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+    encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int resize_mode_;
+  int resize_denominator_;
+  int resize_kf_denominator_;
+  int cpu_used_;
+};
+
+// Encodes 30 frames of the 1280x720 clip and only requires that the run
+// completes without a fatal failure.
+TEST_P(ResizeModeTestLarge, ResizeModeTest) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Parameter sets, in the order read by the ResizeModeTestLarge constructor:
+// encoding mode (one-pass / two-pass good), resize mode (1, 2), resize
+// denominator (8, 12), key-frame resize denominator (10, 14), cpu-used (3, 6).
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge);
+AV1_INSTANTIATE_TEST_SUITE(ResizeModeTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(1, 2), ::testing::Values(8, 12),
+ ::testing::Values(10, 14), ::testing::Values(3, 6));
+#endif // !CONFIG_REALTIME_ONLY
+
+// ResizeTest and ResizeCspTest are instantiated for real-time mode only.
+AV1_INSTANTIATE_TEST_SUITE(ResizeTest,
+ ::testing::Values(::libaom_test::kRealTime));
+// ResizeRealtimeTest: real-time mode, values 6..9 for parameter 2 (stored in
+// set_cpu_used_ by SetUp()), and 1, 2 or 4 for parameter 3.
+// NOTE(review): parameter 3 is presumably the thread count (num_threads_);
+// the code that reads it is not visible here -- confirm.
+AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(6, 10), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
+ ::testing::Values(::libaom_test::kRealTime));
+
+// A test that reproduces crbug.com/1393384. In realtime usage mode, encode
+// frames of sizes 202x202, 1x202, and 202x202. ASan should report no memory
+// errors.
+//
+// Setup steps use ASSERT_* so that the test stops before touching an encoder
+// context that was never initialized. Everything after a successful
+// aom_codec_enc_init() uses EXPECT_* so aom_codec_destroy() is always reached.
+TEST(ResizeSimpleTest, TemporarySmallerFrameSize) {
+  constexpr int kWidth = 202;
+  constexpr int kHeight = 202;
+  // Dummy buffer of zero samples.
+  constexpr size_t kBufferSize =
+      kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+  std::vector<unsigned char> buffer(kBufferSize);
+
+  // Both images wrap the same zero-filled buffer.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               buffer.data()));
+  aom_image_t img2;
+  ASSERT_EQ(&img2, aom_img_wrap(&img2, AOM_IMG_FMT_I420, 1, kHeight, 1,
+                                buffer.data()));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  // `enc` has no initializer, so it must not be used if initialization fails.
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+
+  // Frame 0: 202x202.
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+  // Frame 1: 1x202.
+  cfg.g_w = 1;
+  cfg.g_h = kHeight;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img2, 1, 1, 0));
+
+  // Frame 2: back to 202x202.
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 2, 1, 0));
+
+  // Flush (null image) and tear down.
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// A test that reproduces crbug.com/1410766. In realtime usage mode
+// for SVC with temporal layers, encode frames of sizes 600x600,
+// 600x600, and 100x480. ASan should report no memory errors.
+//
+// Setup steps use ASSERT_* so that the test stops before touching an encoder
+// context that was never initialized. Everything after a successful
+// aom_codec_enc_init() uses EXPECT_* so aom_codec_destroy() is always reached.
+TEST(ResizeSimpleTest, SmallerFrameSizeSVC) {
+  constexpr int kWidth = 600;
+  constexpr int kHeight = 600;
+  // Dummy buffer of zero samples.
+  constexpr size_t kBufferSize =
+      kWidth * kHeight + 2 * (kWidth + 1) / 2 * (kHeight + 1) / 2;
+  std::vector<unsigned char> buffer(kBufferSize);
+
+  // Both images wrap the same zero-filled buffer.
+  aom_image_t img;
+  ASSERT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                               buffer.data()));
+  aom_image_t img2;
+  ASSERT_EQ(&img2,
+            aom_img_wrap(&img2, AOM_IMG_FMT_I420, 100, 480, 1, buffer.data()));
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(AOM_CODEC_OK,
+            aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME));
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  aom_codec_ctx_t enc;
+  // `enc` has no initializer, so it must not be used if initialization fails.
+  ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_control(&enc, AOME_SET_CPUUSED, 5));
+
+  // One spatial layer, two temporal layers.
+  aom_svc_params_t svc_params = {};
+  aom_svc_layer_id_t layer_id;
+  svc_params.number_spatial_layers = 1;
+  svc_params.framerate_factor[0] = 2;
+  svc_params.framerate_factor[1] = 1;
+  svc_params.number_temporal_layers = 2;
+  // Bitrate allocation L0: 60% L1: 40%
+  svc_params.layer_target_bitrate[0] = 60 * cfg.rc_target_bitrate / 100;
+  svc_params.layer_target_bitrate[1] = cfg.rc_target_bitrate;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params));
+
+  // Frame 0: 600x600, temporal layer 0.
+  layer_id.spatial_layer_id = 0;
+  layer_id.temporal_layer_id = 0;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 0, 1, 0));
+
+  // Frame 1: 600x600, temporal layer 1.
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  layer_id.temporal_layer_id = 1;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img, 1, 1, 0));
+
+  // Frame 2: 100x480, temporal layer 0.
+  cfg.g_w = 100;
+  cfg.g_h = 480;
+  layer_id.temporal_layer_id = 0;
+  EXPECT_EQ(AOM_CODEC_OK,
+            aom_codec_control(&enc, AV1E_SET_SVC_LAYER_ID, &layer_id));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img2, 2, 1, 0));
+
+  // Flush (null image) and tear down.
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+// Encoder usage values FrameSizeChangeTest is instantiated with; only the
+// real-time usage is listed when CONFIG_REALTIME_ONLY is set.
+const int kUsages[] =
+#if CONFIG_REALTIME_ONLY
+ { AOM_USAGE_REALTIME };
+#else
+ { AOM_USAGE_GOOD_QUALITY, AOM_USAGE_REALTIME, AOM_USAGE_ALL_INTRA };
+#endif
+
+// Initial thread counts FrameSizeChangeTest is instantiated with.
+const int kNumThreads[] = { 2, 4, 8 };
+
+// Encodes a 256x256 frame followed by a 512x512 frame, optionally changing
+// the thread count together with the frame size.
+// Params: usage, cpu-used, initial thread count.
+class FrameSizeChangeTest
+    : public ::libaom_test::CodecTestWith3Params<int, int, int> {
+ protected:
+  FrameSizeChangeTest() {}
+  ~FrameSizeChangeTest() override = default;
+
+  // Runs the two-frame encode.
+  //   change_thread == 0: keep the thread count for the second frame.
+  //   change_thread == 1: halve it (never below 1).
+  //   change_thread == 2: double it.
+  // Setup steps use ASSERT_* so that the test stops before touching an encoder
+  // context that was never initialized. Everything after a successful
+  // aom_codec_enc_init() uses EXPECT_* so aom_codec_destroy() is always
+  // reached.
+  void DoTest(int change_thread) {
+    usage_ = GET_PARAM(1);
+    cpu_used_ = GET_PARAM(2);
+    threads_ = GET_PARAM(3);
+    constexpr int kWidth = 512;
+    constexpr int kHeight = 512;
+    constexpr int kFirstWidth = 256;
+    constexpr int kFirstHeight = 256;
+    // Buffer of zero samples.
+    constexpr size_t kBufferSize = 3 * kWidth * kHeight;
+    std::vector<unsigned char> buffer(kBufferSize,
+                                      static_cast<unsigned char>(0));
+
+    // Both images wrap the same zero-filled buffer.
+    aom_image_t img1;
+    ASSERT_EQ(&img1, aom_img_wrap(&img1, AOM_IMG_FMT_I420, kFirstWidth,
+                                  kFirstHeight, 1, buffer.data()));
+
+    aom_image_t img2;
+    ASSERT_EQ(&img2, aom_img_wrap(&img2, AOM_IMG_FMT_I420, kWidth, kHeight, 1,
+                                  buffer.data()));
+
+    aom_codec_iface_t *iface = aom_codec_av1_cx();
+    aom_codec_enc_cfg_t cfg;
+    ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, usage_));
+    cfg.g_threads = threads_;
+    cfg.g_lag_in_frames = usage_ == AOM_USAGE_ALL_INTRA ? 0 : 1;
+    cfg.g_w = kFirstWidth;
+    cfg.g_h = kFirstHeight;
+    // Declare the larger size up front so the later size increase is allowed.
+    cfg.g_forced_max_frame_width = kWidth;
+    cfg.g_forced_max_frame_height = kHeight;
+    aom_codec_ctx_t enc;
+    // `enc` has no initializer, so it must not be used if initialization fails.
+    ASSERT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_control(&enc, AOME_SET_CPUUSED, cpu_used_));
+
+    // Frame 0: 256x256.
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img1, 0, 1, 0));
+
+    if (change_thread == 1) {
+      cfg.g_threads = AOMMAX(1, threads_ / 2);
+    } else if (change_thread == 2) {
+      cfg.g_threads = threads_ * 2;
+    }
+    // Frame 1: 512x512.
+    cfg.g_w = kWidth;
+    cfg.g_h = kHeight;
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_set(&enc, &cfg));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, &img2, 1, 1, 0));
+
+    // Flush (null image) and tear down.
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, nullptr, 0, 0, 0));
+    EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+  }
+
+  // All three are assigned from the test parameters at the start of DoTest().
+  int cpu_used_;
+  int threads_;
+  int usage_;
+};
+
+// The DoTest() argument selects how the thread count changes along with the
+// frame size: 0 keeps it, 1 halves it, 2 doubles it.
+TEST_P(FrameSizeChangeTest, FixedThreads) { DoTest(0); }
+TEST_P(FrameSizeChangeTest, DecreasingThreads) { DoTest(1); }
+TEST_P(FrameSizeChangeTest, IncreasingThreads) { DoTest(2); }
+
+// Range(6, 7) yields the single cpu-used value 6.
+AV1_INSTANTIATE_TEST_SUITE(FrameSizeChangeTest, ::testing::ValuesIn(kUsages),
+ ::testing::Range(6, 7),
+ ::testing::ValuesIn(kNumThreads));
+
+} // namespace
diff --git a/third_party/aom/test/rt_end_to_end_test.cc b/third_party/aom/test/rt_end_to_end_test.cc
new file mode 100644
index 0000000000..f1f9e019c2
--- /dev/null
+++ b/third_party/aom/test/rt_end_to_end_test.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+// Frame limit passed to the video sources created in DoTest().
+const unsigned int kFrames = 10;
+// Value assigned to cfg_.rc_target_bitrate in DoTest().
+const int kBitrate = 500;
+
+// List of PSNR thresholds for speed settings 5-10.
+// keys: video, speed, aq mode.
+// There are no entries for speeds above 10.
+std::unordered_map<std::string,
+ std::unordered_map<int, std::unordered_map<int, double>>>
+ kPsnrThreshold = { { "park_joy_90p_8_420.y4m",
+ { { 5, { { 0, 35.4 }, { 3, 36.3 } } },
+ { 6, { { 0, 35.3 }, { 3, 36.2 } } },
+ { 7, { { 0, 34.9 }, { 3, 35.8 } } },
+ { 8, { { 0, 35.0 }, { 3, 35.8 } } },
+ { 9, { { 0, 34.9 }, { 3, 35.5 } } },
+ { 10, { { 0, 34.7 }, { 3, 35.3 } } } } },
+ { "paris_352_288_30.y4m",
+ { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
+ { 6, { { 0, 36.1 }, { 3, 36.48 } } },
+ { 7, { { 0, 35.5 }, { 3, 36.0 } } },
+ { 8, { { 0, 35.8 }, { 3, 36.4 } } },
+ { 9, { { 0, 35.5 }, { 3, 36.0 } } },
+ { 10, { { 0, 35.3 }, { 3, 35.9 } } } } },
+ { "niklas_1280_720_30.y4m",
+ { { 5, { { 0, 34.4 }, { 3, 34.2 } } },
+ { 6, { { 0, 34.1 }, { 3, 34.0 } } },
+ { 7, { { 0, 33.5 }, { 3, 33.1 } } },
+ { 8, { { 0, 33.3 }, { 3, 33.3 } } },
+ { 9, { { 0, 33.3 }, { 3, 33.3 } } },
+ { 10, { { 0, 33.1 }, { 3, 33.1 } } } } },
+ { "hantro_collage_w352h288_nv12.yuv",
+ { { 5, { { 0, 34.4 }, { 3, 34.2 } } },
+ { 6, { { 0, 34.1 }, { 3, 34.1 } } },
+ { 7, { { 0, 33.6 }, { 3, 33.6 } } },
+ { 8, { { 0, 33.3 }, { 3, 33.3 } } },
+ { 9, { { 0, 33.3 }, { 3, 33.3 } } },
+ { 10, { { 0, 33.1 }, { 3, 33.1 } } } } } };
+
+// Describes one input clip and the encoder settings to use with it.
+typedef struct {
+ // Input file name; also the first key into kPsnrThreshold.
+ const char *filename;
+ // Assigned to cfg_.g_input_bit_depth.
+ unsigned int input_bit_depth;
+ // Image format; passed to YUVVideoSource for inputs that are not .y4m.
+ aom_img_fmt fmt;
+ // Assigned to cfg_.g_bit_depth.
+ aom_bit_depth_t bit_depth;
+ // Assigned to cfg_.g_profile.
+ unsigned int profile;
+} TestVideoParam;
+
+// Streams a one-line description of `test_arg`, listing every field by name.
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  os << "TestVideoParam { filename:" << test_arg.filename;
+  os << " input_bit_depth:" << test_arg.input_bit_depth;
+  os << " fmt:" << test_arg.fmt;
+  os << " bit_depth:" << test_arg.bit_depth;
+  os << " profile:" << test_arg.profile;
+  os << " }";
+  return os;
+}
+
+// Input clips for the end-to-end tests: three 8-bit I420 .y4m clips and one
+// 8-bit NV12 .yuv clip, all using profile 0.
+const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "hantro_collage_w352h288_nv12.yuv", 8, AOM_IMG_FMT_NV12, AOM_BITS_8, 0 },
+};
+
+// Encodes a short clip in real-time mode and checks that the average PSNR is
+// above the threshold registered for the clip, speed and aq mode.
+// Params: test video, speed, aq mode, threads, tile columns, tile rows.
+class RTEndToEndTest
+    : public ::libaom_test::CodecTestWith6Params<TestVideoParam, int,
+                                                 unsigned int, int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  RTEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0),
+        aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)),
+        tile_columns_(GET_PARAM(5)), tile_rows_(GET_PARAM(6)) {}
+
+  ~RTEndToEndTest() override = default;
+
+  // Real-time configuration with the requested thread count and key-frame
+  // distances of 9999.
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kRealTime);
+
+    cfg_.g_threads = threads_;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.kf_max_dist = 9999;
+    cfg_.kf_min_dist = 9999;
+  }
+
+  // Restarts PSNR accumulation at the beginning of every pass.
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  // Adds the packet's first PSNR value (psnr[0]) to the running sum.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  // Applies the encoder controls once, before the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_ENABLE_RESTORATION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_);
+      encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_ROW_MT, 1);
+      encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+      encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
+    }
+  }
+
+  // Mean of the accumulated PSNR values, or 0.0 if no packet was seen.
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  // Returns the threshold registered in kPsnrThreshold for this clip, speed
+  // and aq mode, or 0.0 when no entry exists. The lookup uses find() rather
+  // than operator[] so that a missing key does not insert new entries into
+  // the shared table; the value returned for a missing key is the same 0.0
+  // that operator[] would have produced.
+  // NOTE(review): with a 0.0 threshold the PSNR check in DoTest() passes for
+  // any positive PSNR, so combinations without an entry are effectively
+  // unchecked.
+  double GetPsnrThreshold() const {
+    const auto video_it = kPsnrThreshold.find(test_video_param_.filename);
+    if (video_it == kPsnrThreshold.end()) return 0.0;
+    const auto speed_it = video_it->second.find(cpu_used_);
+    if (speed_it == video_it->second.end()) return 0.0;
+    const auto aq_it = speed_it->second.find(static_cast<int>(aq_mode_));
+    if (aq_it == speed_it->second.end()) return 0.0;
+    return aq_it->second;
+  }
+
+  // Encodes kFrames frames of the clip at kBitrate and compares the average
+  // PSNR against the threshold.
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // .y4m inputs carry their own header; any other input is read as raw
+    // 352x288 video in the format given by the test parameter.
+    std::unique_ptr<libaom_test::VideoSource> video;
+    if (is_extension_y4m(test_video_param_.filename))
+      video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                  kFrames));
+    else
+      video.reset(new libaom_test::YUVVideoSource(test_video_param_.filename,
+                                                  test_video_param_.fmt, 352,
+                                                  288, 30, 1, 0, kFrames));
+    ASSERT_NE(video, nullptr);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "cpu used = " << cpu_used_ << " aq mode = " << aq_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;            // Sum of per-packet PSNR values in this pass.
+  unsigned int nframes_;   // Number of PSNR packets in this pass.
+  unsigned int aq_mode_;
+  int threads_;
+  int tile_columns_;
+  int tile_rows_;
+};
+
+// Same fixture under a second name so it can be instantiated separately with
+// multi-threaded, multi-tile parameters.
+class RTEndToEndTestThreaded : public RTEndToEndTest {};
+
+TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
+
+// Single-threaded, single-tile runs: speeds 5..11, aq modes 0 and 3.
+// NOTE(review): Range(5, 12) includes speed 11, for which kPsnrThreshold has
+// no entry -- confirm whether that speed is meant to be checked.
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
+ ::testing::Range(5, 12),
+ ::testing::Values<unsigned int>(0, 3),
+ ::testing::Values(1), ::testing::Values(1),
+ ::testing::Values(1));
+
+// Threaded runs: speeds 5..11, aq modes 0 and 3, 2..5 threads, and values
+// 1..4 for both the tile-columns and the tile-rows parameters.
+AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTestThreaded,
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::Range(5, 12),
+ ::testing::Values<unsigned int>(0, 3),
+ ::testing::Range(2, 6), ::testing::Range(1, 5),
+ ::testing::Range(1, 5));
+} // namespace
diff --git a/third_party/aom/test/run_encodes.sh b/third_party/aom/test/run_encodes.sh
new file mode 100755
index 0000000000..2096d8b158
--- /dev/null
+++ b/third_party/aom/test/run_encodes.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved.
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 4 ]]; then
+  echo Encodes all the y4m files in the directory at the bitrates specified by
+  echo the first 3 parameters and stores the results in a subdirectory named by
+  echo the 4th parameter:
+  echo
+  echo Usage: run_encodes.sh start-kbps end-kbps step-kbps output-directory
+  echo Example: run_encodes.sh 200 500 50 baseline
+  # A wrong argument count is an error, so report it through the exit status.
+  exit 1
+fi
+
+# Bitrate sweep bounds (kbps) and the directory that receives the results.
+s=$1
+e=$2
+step=$3
+newdir=$4
+
+for i in ./*y4m; do
+  # With no matching files the glob stays literal; skip that non-existent name.
+  [[ -e "$i" ]] || continue
+  for (( b=$s; b<= $e; b+= $step ))
+  do
+    # Quote the file name so names containing spaces reach the encoder intact.
+    best_encode.sh "$i" "$b"
+  done
+  # Keep the stats written during this sweep under a per-clip name.
+  mv opsnr.stt "$i.stt"
+done
+
+# -p: do not fail when the output directory already exists.
+mkdir -p "$newdir"
+# The ./ prefix keeps file names that start with '-' from being read as
+# options.
+mv ./*.stt "$newdir"
+mv ./*.webm "$newdir"
diff --git a/third_party/aom/test/sad_test.cc b/third_party/aom/test/sad_test.cc
new file mode 100644
index 0000000000..521274863c
--- /dev/null
+++ b/third_party/aom/test/sad_test.cc
@@ -0,0 +1,3353 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "aom/aom_codec.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+// Every *Param tuple below has the same layout, as consumed by the fixture
+// constructors: <block width, block height, function under test, bit depth>.
+// A bit depth of -1 selects the 8-bit (non high-bit-depth) buffers.
+
+// SAD of one source block against one reference block.
+using SadMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride);
+using SadMxNParam = std::tuple<int, int, SadMxNFunc, int>;
+
+// Same signature as SadMxNFunc; used for the row-skipping SAD variants.
+using SadSkipMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int ref_stride);
+using SadSkipMxNParam = std::tuple<int, int, SadSkipMxNFunc, int>;
+
+// SAD of the source against the average of the reference and second_pred.
+using SadMxNAvgFunc = uint32_t (*)(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   const uint8_t *second_pred);
+using SadMxNAvgParam = std::tuple<int, int, SadMxNAvgFunc, int>;
+
+// SAD variant that receives the block dimensions at run time.
+using DistWtdSadMxhFunc = unsigned int (*)(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int width,
+                                           int height);
+using DistWtdSadMxhParam = std::tuple<int, int, DistWtdSadMxhFunc, int>;
+
+// SAD of the source against a weighted blend of the reference and second_pred;
+// the weights are supplied through jcp_param.
+using DistWtdSadMxNAvgFunc = uint32_t (*)(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, const uint8_t *second_pred,
+    const DIST_WTD_COMP_PARAMS *jcp_param);
+using DistWtdSadMxNAvgParam = std::tuple<int, int, DistWtdSadMxNAvgFunc, int>;
+
+// SAD of one source block against an array of reference blocks in a single
+// call; results are written to sad_array.
+using SadMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *const ref_ptr[], int ref_stride,
+                              uint32_t *sad_array);
+using SadMxNx4Param = std::tuple<int, int, SadMxNx4Func, int>;
+
+// Same signature as SadMxNx4Func; used for the row-skipping variants.
+using SadSkipMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *const ref_ptr[],
+                                  int ref_stride, uint32_t *sad_array);
+using SadSkipMxNx4Param = std::tuple<int, int, SadSkipMxNx4Func, int>;
+
+// Multi-reference SAD that additionally takes a second predictor.
+using SadMxNx4AvgFunc = void (*)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_ptr[],
+                                 int ref_stride, const uint8_t *second_pred,
+                                 uint32_t *sad_array);
+using SadMxNx4AvgParam = std::tuple<int, int, SadMxNx4AvgFunc, int>;
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Common fixture for the SAD tests. It owns the source / reference /
+// second-predictor buffers (one 8-bit and one 16-bit set, shared by the whole
+// suite) and provides straightforward C reference implementations that the
+// functions under test are compared against.
+class SADTestBase : public ::testing::Test {
+ public:
+  // bit_depth == -1 selects the 8-bit buffers; any other value is used as the
+  // aom_bit_depth_t of the high-bit-depth path (see SetUp()).
+  SADTestBase(int width, int height, int bit_depth)
+      : width_(width), height_(height), bd_(bit_depth) {}
+
+  // Allocates the suite-wide buffers. Each allocation is verified before the
+  // next one is attempted.
+  static void SetUpTestSuite() {
+    const size_t pred_samples = 128 * 128;
+
+    // 8-bit buffers.
+    source_data8_ =
+        static_cast<uint8_t *>(aom_memalign(kDataAlignment, kDataBlockSize));
+    ASSERT_NE(source_data8_, nullptr);
+    reference_data8_ =
+        static_cast<uint8_t *>(aom_memalign(kDataAlignment, kDataBufferSize));
+    ASSERT_NE(reference_data8_, nullptr);
+    second_pred8_ =
+        static_cast<uint8_t *>(aom_memalign(kDataAlignment, pred_samples));
+    ASSERT_NE(second_pred8_, nullptr);
+    comp_pred8_ =
+        static_cast<uint8_t *>(aom_memalign(kDataAlignment, pred_samples));
+    ASSERT_NE(comp_pred8_, nullptr);
+    comp_pred8_test_ =
+        static_cast<uint8_t *>(aom_memalign(kDataAlignment, pred_samples));
+    ASSERT_NE(comp_pred8_test_, nullptr);
+
+    // 16-bit buffers: same sample counts, two bytes per sample.
+    source_data16_ = static_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
+    ASSERT_NE(source_data16_, nullptr);
+    reference_data16_ = static_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+    ASSERT_NE(reference_data16_, nullptr);
+    second_pred16_ = static_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, pred_samples * sizeof(uint16_t)));
+    ASSERT_NE(second_pred16_, nullptr);
+    comp_pred16_ = static_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, pred_samples * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_, nullptr);
+    comp_pred16_test_ = static_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, pred_samples * sizeof(uint16_t)));
+    ASSERT_NE(comp_pred16_test_, nullptr);
+  }
+
+  // Releases every buffer from SetUpTestSuite() and clears the pointers.
+  static void TearDownTestSuite() {
+    aom_free(source_data8_);
+    source_data8_ = nullptr;
+    aom_free(reference_data8_);
+    reference_data8_ = nullptr;
+    aom_free(second_pred8_);
+    second_pred8_ = nullptr;
+    aom_free(comp_pred8_);
+    comp_pred8_ = nullptr;
+    aom_free(comp_pred8_test_);
+    comp_pred8_test_ = nullptr;
+
+    aom_free(source_data16_);
+    source_data16_ = nullptr;
+    aom_free(reference_data16_);
+    reference_data16_ = nullptr;
+    aom_free(second_pred16_);
+    second_pred16_ = nullptr;
+    aom_free(comp_pred16_);
+    comp_pred16_ = nullptr;
+    aom_free(comp_pred16_test_);
+    comp_pred16_test_ = nullptr;
+  }
+
+ protected:
+  // Handle up to 4 128x128 blocks, with stride up to 256
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 128 * 256;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
+
+  // Points the working pointers at the 8-bit or 16-bit buffer set, derives the
+  // sample mask and default strides, and reseeds the random generator.
+  void SetUp() override {
+    use_high_bit_depth_ = (bd_ != -1);
+    if (use_high_bit_depth_) {
+      bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+      comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+      comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+    } else {
+      bit_depth_ = AOM_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+      comp_pred_ = comp_pred8_;
+      comp_pred_test_ = comp_pred8_test_;
+    }
+    mask_ = (1 << bit_depth_) - 1;
+    // Source stride is the width rounded up to a multiple of 32.
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Returns the start of reference block block_idx; consecutive blocks are
+  // kDataBlockSize samples apart.
+  virtual uint8_t *GetReference(int block_idx) {
+    const int sample_offset = block_idx * kDataBlockSize;
+    if (!use_high_bit_depth_) return reference_data_ + sample_offset;
+    return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                              sample_offset);
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
+  unsigned int ReferenceSAD(int block_idx) {
+    unsigned int total = 0;
+    if (use_high_bit_depth_) {
+      const uint16_t *const ref = CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const src = CONVERT_TO_SHORTPTR(source_data_);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          total += abs(src[row * source_stride_ + col] -
+                       ref[row * reference_stride_ + col]);
+        }
+      }
+      return total;
+    }
+    const uint8_t *const ref = GetReference(block_idx);
+    const uint8_t *const src = source_data_;
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        total += abs(src[row * source_stride_ + col] -
+                     ref[row * reference_stride_ + col]);
+      }
+    }
+    return total;
+  }
+
+  // Sum of Absolute Differences Skip rows. Only every other row (0, 2, 4, ...)
+  // contributes, and the accumulated sum is doubled before it is returned.
+  unsigned int ReferenceSADSkip(int block_idx) {
+    unsigned int total = 0;
+    if (use_high_bit_depth_) {
+      const uint16_t *const ref = CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const src = CONVERT_TO_SHORTPTR(source_data_);
+      for (int row = 0; row < height_; row += 2) {
+        for (int col = 0; col < width_; ++col) {
+          total += abs(src[row * source_stride_ + col] -
+                       ref[row * reference_stride_ + col]);
+        }
+      }
+      return total * 2;
+    }
+    const uint8_t *const ref = GetReference(block_idx);
+    const uint8_t *const src = source_data_;
+    for (int row = 0; row < height_; row += 2) {
+      for (int col = 0; col < width_; ++col) {
+        total += abs(src[row * source_stride_ + col] -
+                     ref[row * reference_stride_ + col]);
+      }
+    }
+    return total * 2;
+  }
+
+  // Sum of Absolute Differences Average. Each reference pixel is first
+  // averaged (with rounding) with the matching second-predictor pixel; the
+  // SAD is taken between the source and that average. The second predictor is
+  // stored densely, i.e. with a stride of width_.
+  unsigned int ReferenceSADavg(int block_idx) {
+    unsigned int total = 0;
+    if (use_high_bit_depth_) {
+      const uint16_t *const ref = CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const src = CONVERT_TO_SHORTPTR(source_data_);
+      const uint16_t *const pred = CONVERT_TO_SHORTPTR(second_pred_);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          const int sum =
+              pred[row * width_ + col] + ref[row * reference_stride_ + col];
+          const uint16_t average = ROUND_POWER_OF_TWO(sum, 1);
+          total += abs(src[row * source_stride_ + col] - average);
+        }
+      }
+      return total;
+    }
+    const uint8_t *const ref = GetReference(block_idx);
+    const uint8_t *const src = source_data_;
+    const uint8_t *const pred = second_pred_;
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        const int sum =
+            pred[row * width_ + col] + ref[row * reference_stride_ + col];
+        const uint8_t average = ROUND_POWER_OF_TWO(sum, 1);
+        total += abs(src[row * source_stride_ + col] - average);
+      }
+    }
+    return total;
+  }
+
+  // Like ReferenceSADavg(), but the two predictors are blended with the
+  // weights in jcp_param_ (second predictor * bck_offset, reference *
+  // fwd_offset) and the weighted sum is rounded down by 4 bits.
+  unsigned int ReferenceDistWtdSADavg(int block_idx) {
+    unsigned int total = 0;
+    if (use_high_bit_depth_) {
+      const uint16_t *const ref = CONVERT_TO_SHORTPTR(GetReference(block_idx));
+      const uint16_t *const src = CONVERT_TO_SHORTPTR(source_data_);
+      const uint16_t *const pred = CONVERT_TO_SHORTPTR(second_pred_);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          const int weighted =
+              pred[row * width_ + col] * jcp_param_.bck_offset +
+              ref[row * reference_stride_ + col] * jcp_param_.fwd_offset;
+          const uint16_t blended = ROUND_POWER_OF_TWO(weighted, 4);
+          total += abs(src[row * source_stride_ + col] - blended);
+        }
+      }
+      return total;
+    }
+    const uint8_t *const ref = GetReference(block_idx);
+    const uint8_t *const src = source_data_;
+    const uint8_t *const pred = second_pred_;
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        const int weighted =
+            pred[row * width_ + col] * jcp_param_.bck_offset +
+            ref[row * reference_stride_ + col] * jcp_param_.fwd_offset;
+        const uint8_t blended = ROUND_POWER_OF_TWO(weighted, 4);
+        total += abs(src[row * source_stride_ + col] - blended);
+      }
+    }
+    return total;
+  }
+
+  // Writes fill_constant into the width_ x height_ block at data. On the
+  // 8-bit path the value is truncated to uint8_t.
+  void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+    if (use_high_bit_depth_) {
+      uint16_t *const pixels = CONVERT_TO_SHORTPTR(data);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          pixels[row * stride + col] = fill_constant;
+        }
+      }
+      return;
+    }
+    const uint8_t value8 = static_cast<uint8_t>(fill_constant);
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        data[row * stride + col] = value8;
+      }
+    }
+  }
+
+  // Fills the width_ x height_ block at data with random samples, drawn in
+  // row-major order. High-bit-depth samples are masked to the bit depth.
+  void FillRandom(uint8_t *data, int stride) {
+    if (use_high_bit_depth_) {
+      uint16_t *const pixels = CONVERT_TO_SHORTPTR(data);
+      for (int row = 0; row < height_; ++row) {
+        for (int col = 0; col < width_; ++col) {
+          pixels[row * stride + col] = rnd_.Rand16() & mask_;
+        }
+      }
+      return;
+    }
+    for (int row = 0; row < height_; ++row) {
+      for (int col = 0; col < width_; ++col) {
+        data[row * stride + col] = rnd_.Rand8();
+      }
+    }
+  }
+
+  // Hook invoked once per iteration by SpeedSAD(). The base version does
+  // nothing; fixtures override it to call the function under test.
+  virtual void SADForSpeedTest(unsigned int *results,
+                               const uint8_t *const *references) {
+    (void)references;
+    (void)results;
+  }
+
+  // Times 20,000,000 calls of SADForSpeedTest() and prints the elapsed time in
+  // milliseconds together with the block size and bit depth.
+  void SpeedSAD() {
+    const int kIterations = 20000000;
+    unsigned int sads[4];
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int iter = 0; iter < kIterations; ++iter) {
+      SADForSpeedTest(sads, references);
+    }
+    aom_usec_timer_mark(&timer);
+    const int64_t elapsed_ms = aom_usec_timer_elapsed(&timer) / 1000;
+    std::cout << "BLOCK_" << width_ << "X" << height_
+              << ", bit_depth:" << bit_depth_ << ",Time: " << elapsed_ms
+              << "ms" << std::endl;
+  }
+
+  // Block dimensions, sample mask ((1 << bit_depth_) - 1) and the raw
+  // bit-depth parameter (-1 means 8-bit).
+  int width_, height_, mask_, bd_;
+  aom_bit_depth_t bit_depth_;
+  // Working pointers; SetUp() aims them at the 8-bit or 16-bit buffers.
+  static uint8_t *source_data_;
+  static uint8_t *reference_data_;
+  static uint8_t *second_pred_;
+  int source_stride_;
+  bool use_high_bit_depth_;
+  // Backing storage allocated in SetUpTestSuite().
+  static uint8_t *source_data8_;
+  static uint8_t *reference_data8_;
+  static uint8_t *second_pred8_;
+  static uint16_t *source_data16_;
+  static uint16_t *reference_data16_;
+  static uint16_t *second_pred16_;
+  int reference_stride_;
+  static uint8_t *comp_pred_;
+  static uint8_t *comp_pred8_;
+  static uint16_t *comp_pred16_;
+  static uint8_t *comp_pred_test_;
+  static uint8_t *comp_pred8_test_;
+  static uint16_t *comp_pred16_test_;
+  // Blend weights read by ReferenceDistWtdSADavg().
+  DIST_WTD_COMP_PARAMS jcp_param_;
+
+  ACMRandom rnd_;
+};
+
+// Fixture for functions that compute the SAD of one source block against four
+// reference blocks in a single call.
+class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                  public SADTestBase {
+ public:
+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Runs the function under test on reference blocks 0..3, with the
+  // register-state check active around the call.
+  void SADs(unsigned int *results) {
+    const uint8_t *references[4];
+    for (int i = 0; i < 4; ++i) references[i] = GetReference(i);
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  // Speed-test hook: same call as SADs() but without the register check.
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) override {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
+  }
+
+  // Compares all four results with the C reference implementation.
+  void CheckSADs() {
+    unsigned int exp_sad[4];
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      const unsigned int reference_sad = ReferenceSAD(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+};
+
+// Fixture for functions that are handed four reference pointers but whose
+// first three results are the ones verified.
+class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                  public SADTestBase {
+ public:
+  SADx3Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Runs the function under test on reference blocks 0..3. The call is wrapped
+  // in API_REGISTER_STATE_CHECK, as in SADx4Test and SADSkipx4Test, so that
+  // these kernels get the same register-state verification.
+  void SADs(unsigned int *results) {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  // Speed-test hook: the bare call, without the register check.
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) override {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
+  }
+
+  // Compares the first three results with the C reference implementation.
+  // The result array still has four slots because the function under test
+  // shares the x4 signature.
+  void CheckSADs() {
+    unsigned int reference_sad, exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 3; ++block) {
+      reference_sad = ReferenceSAD(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+};
+
+// Fixture for the row-skipping four-reference SAD functions; results are
+// checked against ReferenceSADSkip().
+class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+                      public SADTestBase {
+ public:
+  SADSkipx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Runs the function under test on reference blocks 0..3, with the
+  // register-state check active around the call.
+  void SADs(unsigned int *results) {
+    const uint8_t *references[4];
+    for (int i = 0; i < 4; ++i) references[i] = GetReference(i);
+
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  // Compares all four results with the row-skipping C reference.
+  void CheckSADs() {
+    unsigned int exp_sad[4];
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      const unsigned int reference_sad = ReferenceSADSkip(block);
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+
+  // Speed-test hook: the bare call, without the register check.
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) override {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
+  }
+};
+
+// Fixture for the single-reference SAD functions.
+class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
+                public SADTestBase {
+ public:
+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Returns the SAD computed by the function under test against reference
+  // block block_idx, with the register-state check active around the call.
+  unsigned int SAD(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+    unsigned int sad;
+
+    API_REGISTER_STATE_CHECK(sad = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return sad;
+  }
+
+  // Asserts that block 0 matches the C reference implementation.
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSAD(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  // Speed-test hook: only the first reference is used and the return value is
+  // discarded.
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) override {
+    (void)results;
+    GET_PARAM(2)
+    (source_data_, source_stride_, references[0], reference_stride_);
+  }
+};
+
+// Fixture for the single-reference, row-skipping SAD functions; results are
+// checked against ReferenceSADSkip().
+class SADSkipTest : public ::testing::WithParamInterface<SadMxNParam>,
+                    public SADTestBase {
+ public:
+  SADSkipTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Returns the SAD computed by the function under test against reference
+  // block block_idx, with the register-state check active around the call.
+  unsigned int SAD(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+    unsigned int sad;
+
+    API_REGISTER_STATE_CHECK(sad = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return sad;
+  }
+
+  // Asserts that block 0 matches the row-skipping C reference.
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADSkip(0);
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  // Speed-test hook: only the first reference is used and the return value is
+  // discarded.
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) override {
+    (void)results;
+    GET_PARAM(2)
+    (source_data_, source_stride_, references[0], reference_stride_);
+  }
+};
+
+// Fixture for the SAD functions that average the reference with a second
+// predictor before taking the difference.
+class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
+                   public SADTestBase {
+ public:
+  SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Returns the averaged SAD computed by the function under test against
+  // reference block block_idx and second_pred_.
+  unsigned int SAD_avg(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+    unsigned int sad;
+
+    API_REGISTER_STATE_CHECK(sad = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_));
+    return sad;
+  }
+
+  // Asserts that block 0 matches ReferenceSADavg().
+  void CheckSAD() {
+    const unsigned int reference_sad = ReferenceSADavg(0);
+    const unsigned int exp_sad = SAD_avg(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+};
+
+// Fixture for the SAD functions that blend the reference and the second
+// predictor with the weights in jcp_param_ before taking the difference.
+class DistWtdSADavgTest
+    : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
+      public SADTestBase {
+ public:
+  DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+  // Returns the weighted SAD computed by the function under test against
+  // reference block block_idx, using the current jcp_param_.
+  unsigned int dist_wtd_SAD_avg(int block_idx) {
+    const uint8_t *const reference = GetReference(block_idx);
+    unsigned int sad;
+
+    API_REGISTER_STATE_CHECK(sad = GET_PARAM(2)(source_data_, source_stride_,
+                                                reference, reference_stride_,
+                                                second_pred_, &jcp_param_));
+    return sad;
+  }
+
+  // Checks block 0 for each of the four rows of quant_dist_lookup_table, once
+  // with the pair in table order and once with forward/backward swapped.
+  void CheckSAD() {
+    for (int fwd_col = 0; fwd_col < 2; ++fwd_col) {
+      const int bck_col = 1 - fwd_col;
+      for (int row = 0; row < 4; ++row) {
+        jcp_param_.fwd_offset = quant_dist_lookup_table[row][fwd_col];
+        jcp_param_.bck_offset = quant_dist_lookup_table[row][bck_col];
+
+        const unsigned int reference_sad = ReferenceDistWtdSADavg(0);
+        const unsigned int exp_sad = dist_wtd_SAD_avg(0);
+
+        ASSERT_EQ(reference_sad, exp_sad);
+      }
+    }
+  }
+};
+
+// Out-of-class definitions of SADTestBase's static buffer pointers. All start
+// as nullptr: the *8_ / *16_ pointers are set in SetUpTestSuite(), and the
+// unsuffixed working pointers are set in SetUp().
+uint8_t *SADTestBase::source_data_ = nullptr;
+uint8_t *SADTestBase::reference_data_ = nullptr;
+uint8_t *SADTestBase::second_pred_ = nullptr;
+uint8_t *SADTestBase::comp_pred_ = nullptr;
+uint8_t *SADTestBase::comp_pred_test_ = nullptr;
+uint8_t *SADTestBase::source_data8_ = nullptr;
+uint8_t *SADTestBase::reference_data8_ = nullptr;
+uint8_t *SADTestBase::second_pred8_ = nullptr;
+uint8_t *SADTestBase::comp_pred8_ = nullptr;
+uint8_t *SADTestBase::comp_pred8_test_ = nullptr;
+uint16_t *SADTestBase::source_data16_ = nullptr;
+uint16_t *SADTestBase::reference_data16_ = nullptr;
+uint16_t *SADTestBase::second_pred16_ = nullptr;
+uint16_t *SADTestBase::comp_pred16_ = nullptr;
+uint16_t *SADTestBase::comp_pred16_test_ = nullptr;
+
+// All-zero source against an all-maximum reference.
+TEST_P(SADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+// All-maximum source against an all-zero reference.
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+// Random data with the reference stride halved.
+TEST_P(SADTest, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride - 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+// 2000 rounds of random data with the source stride halved; stops at the
+// first round in which CheckSAD() records a fatal failure.
+TEST_P(SADTest, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  for (int round = 0; round < 2000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    if (testing::Test::HasFatalFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// Timing run (disabled by default), using the halved source stride.
+TEST_P(SADTest, DISABLED_Speed) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  SpeedSAD();
+  source_stride_ = saved_stride;
+}
+
+// All-zero source against an all-maximum reference.
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+// All-maximum source against an all-zero reference.
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+// Random data with the reference stride halved.
+TEST_P(SADSkipTest, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride - 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+// 2000 rounds of random data with the source stride halved; stops at the
+// first round in which CheckSAD() records a fatal failure.
+TEST_P(SADSkipTest, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  for (int round = 0; round < 2000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    CheckSAD();
+    if (testing::Test::HasFatalFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// Timing run (disabled by default), using the halved source stride.
+TEST_P(SADSkipTest, DISABLED_Speed) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  SpeedSAD();
+  source_stride_ = saved_stride;
+}
+
+// All-zero source and second predictor against an all-maximum reference.
+// The second predictor is always filled with a stride of width_.
+TEST_P(SADavgTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+// All-maximum source against an all-zero reference and second predictor.
+TEST_P(SADavgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+// Random data with the reference stride halved.
+TEST_P(SADavgTest, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(SADavgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride - 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+// 2000 rounds of random data with the source stride halved; stops at the
+// first round in which CheckSAD() records a fatal failure.
+TEST_P(SADavgTest, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  for (int round = 0; round < 2000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    FillRandom(second_pred_, width_);
+    CheckSAD();
+    if (testing::Test::HasFatalFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// All-zero source and second predictor against an all-maximum reference.
+// The second predictor is always filled with a stride of width_.
+TEST_P(DistWtdSADavgTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+// All-maximum source against an all-zero reference and second predictor.
+TEST_P(DistWtdSADavgTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  FillConstant(second_pred_, width_, 0);
+  CheckSAD();
+}
+
+// Random data with the reference stride halved.
+TEST_P(DistWtdSADavgTest, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride >> 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(DistWtdSADavgTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ = saved_stride - 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, width_);
+  CheckSAD();
+  reference_stride_ = saved_stride;
+}
+
+// 2000 rounds of random data with the source stride halved; stops at the
+// first round in which CheckSAD() records a fatal failure.
+TEST_P(DistWtdSADavgTest, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ = saved_stride >> 1;
+  for (int round = 0; round < 2000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    FillRandom(reference_data_, reference_stride_);
+    FillRandom(second_pred_, width_);
+    CheckSAD();
+    if (testing::Test::HasFatalFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// SADx4
+// All-zero source against four all-maximum references.
+TEST_P(SADx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  for (int i = 0; i < 4; ++i) {
+    FillConstant(GetReference(i), reference_stride_, mask_);
+  }
+  CheckSADs();
+}
+
+// All-maximum source against four all-zero references.
+TEST_P(SADx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  for (int i = 0; i < 4; ++i) {
+    FillConstant(GetReference(i), reference_stride_, 0);
+  }
+  CheckSADs();
+}
+
+// Random data with the reference stride halved.
+TEST_P(SADx4Test, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(SADx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  reference_stride_ = saved_stride;
+}
+
+// Up to 1000 rounds of random data with the source stride halved.
+TEST_P(SADx4Test, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ >>= 1;
+  for (int round = 0; round < 1000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+    CheckSADs();
+    // CheckSADs() reports mismatches with non-fatal EXPECT_EQ, so stop at the
+    // first failing round instead of repeating the failure for every
+    // remaining round.
+    if (testing::Test::HasFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// Random data with the source pointer advanced by width_ samples.
+TEST_P(SADx4Test, SrcAlignedByWidth) {
+  uint8_t *const saved_source = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  source_data_ = saved_source;
+}
+
+// Timing run (disabled by default).
+TEST_P(SADx4Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  SpeedSAD();
+}
+
+// SADx3
+// All-zero source against all-maximum references. All four reference blocks
+// are filled even though CheckSADs() compares three results.
+TEST_P(SADx3Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  for (int i = 0; i < 4; ++i) {
+    FillConstant(GetReference(i), reference_stride_, mask_);
+  }
+  CheckSADs();
+}
+
+// All-maximum source against all-zero references.
+TEST_P(SADx3Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  for (int i = 0; i < 4; ++i) {
+    FillConstant(GetReference(i), reference_stride_, 0);
+  }
+  CheckSADs();
+}
+
+// Random data with the reference stride halved.
+TEST_P(SADx3Test, ShortRef) {
+  const int saved_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  reference_stride_ = saved_stride;
+}
+
+TEST_P(SADx3Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int saved_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  reference_stride_ = saved_stride;
+}
+
+// Up to 1000 rounds of random data with the source stride halved.
+TEST_P(SADx3Test, ShortSrc) {
+  const int saved_stride = source_stride_;
+  source_stride_ >>= 1;
+  for (int round = 0; round < 1000; ++round) {
+    FillRandom(source_data_, source_stride_);
+    for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+    CheckSADs();
+    // CheckSADs() reports mismatches with non-fatal EXPECT_EQ, so stop at the
+    // first failing round instead of repeating the failure for every
+    // remaining round.
+    if (testing::Test::HasFailure()) break;
+  }
+  source_stride_ = saved_stride;
+}
+
+// Random data with the source pointer advanced by width_ samples.
+TEST_P(SADx3Test, SrcAlignedByWidth) {
+  uint8_t *const saved_source = source_data_;
+  source_data_ += width_;
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  CheckSADs();
+  source_data_ = saved_source;
+}
+
+// Timing run (disabled by default).
+TEST_P(SADx3Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  for (int i = 0; i < 4; ++i) FillRandom(GetReference(i), reference_stride_);
+  SpeedSAD();
+}
+
+// SADSkipx4
+TEST_P(SADSkipx4Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
+ CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(GetReference(0), reference_stride_, 0);
+ FillConstant(GetReference(1), reference_stride_, 0);
+ FillConstant(GetReference(2), reference_stride_, 0);
+ FillConstant(GetReference(3), reference_stride_, 0);
+ CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+// Some search types present an unaligned reference frame (the source frame
+// stays aligned), so run the check with the reference stride reduced by one.
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  const int saved_reference_stride = reference_stride_;
+  reference_stride_ = reference_stride_ - 1;
+
+  FillRandom(source_data_, source_stride_);
+  for (int ref_idx = 0; ref_idx < 4; ++ref_idx) {
+    FillRandom(GetReference(ref_idx), reference_stride_);
+  }
+  CheckSADs();
+
+  reference_stride_ = saved_reference_stride;
+}
+
+// Repeats the random SAD check 1000 times with the source stride shifted
+// right by one bit, then puts the fixture's stride back.
+TEST_P(SADSkipx4Test, ShortSrc) {
+  const int saved_source_stride = source_stride_;
+  source_stride_ = source_stride_ >> 1;
+
+  for (int iteration = 0; iteration < 1000; ++iteration) {
+    FillRandom(source_data_, source_stride_);
+    for (int ref_idx = 0; ref_idx < 4; ++ref_idx) {
+      FillRandom(GetReference(ref_idx), reference_stride_);
+    }
+    CheckSADs();
+  }
+
+  source_stride_ = saved_source_stride;
+}
+
+// Runs the SAD check with the source pointer advanced by width_ elements,
+// then puts the fixture's pointer back.
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *const saved_source_data = source_data_;
+  source_data_ = saved_source_data + width_;
+
+  FillRandom(source_data_, source_stride_);
+  for (int ref_idx = 0; ref_idx < 4; ++ref_idx) {
+    FillRandom(GetReference(ref_idx), reference_stride_);
+  }
+  CheckSADs();
+
+  source_data_ = saved_source_data;
+}
+
+// Timing run for the SAD-skip x4 kernels. The DISABLED_ prefix keeps it out
+// of normal test runs; it only executes when disabled tests are requested.
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
+  FillRandom(source_data_, source_stride_);
+  // Randomize reference blocks 0..3, in index order.
+  for (int ref_idx = 0; ref_idx < 4; ++ref_idx) {
+    FillRandom(GetReference(ref_idx), reference_stride_);
+  }
+  SpeedSAD();
+}
+
+using std::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+const SadMxNParam c_tests[] = { // Parameters for the plain-C SAD kernels; each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm against SadMxNParam.
+ make_tuple(128, 128, &aom_sad128x128_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_c, -1),
+ make_tuple(64, 64, &aom_sad64x64_c, -1),
+ make_tuple(64, 32, &aom_sad64x32_c, -1),
+ make_tuple(32, 64, &aom_sad32x64_c, -1),
+ make_tuple(32, 32, &aom_sad32x32_c, -1),
+ make_tuple(32, 16, &aom_sad32x16_c, -1),
+ make_tuple(16, 32, &aom_sad16x32_c, -1),
+ make_tuple(16, 16, &aom_sad16x16_c, -1),
+ make_tuple(16, 8, &aom_sad16x8_c, -1),
+ make_tuple(8, 16, &aom_sad8x16_c, -1),
+ make_tuple(8, 8, &aom_sad8x8_c, -1),
+ make_tuple(8, 4, &aom_sad8x4_c, -1),
+ make_tuple(4, 8, &aom_sad4x8_c, -1),
+ make_tuple(4, 4, &aom_sad4x4_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the same block sizes repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad64x16_c, -1),
+ make_tuple(16, 64, &aom_sad16x64_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8_c, -1),
+ make_tuple(8, 32, &aom_sad8x32_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4_c, -1),
+ make_tuple(4, 16, &aom_sad4x16_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); // one SADTest instance per c_tests entry, under the "C" prefix
+
+const SadSkipMxNParam skip_c_tests[] = { // Parameters for the plain-C aom_sad_skip_* kernels; each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm against SadSkipMxNParam.
+ make_tuple(128, 128, &aom_sad_skip_128x128_c, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_c, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_c, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_c, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_c, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_c, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_c, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_c, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32_c, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16_c, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8_c, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16_c, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8_c, -1),
+ make_tuple(8, 4, &aom_sad_skip_8x4_c, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8_c, -1),
+ make_tuple(4, 4, &aom_sad_skip_4x4_c, -1),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad_skip_64x16_c, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64_c, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8_c, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32_c, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4_c, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16_c, -1),
+#endif // !CONFIG_REALTIME_ONLY
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the full size list repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 8),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 8),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 10),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 10),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_c, 12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_c, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_c, 12),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_c, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); // one SADSkipTest instance per skip_c_tests entry, under the "C" prefix
+
+const SadMxNAvgParam avg_c_tests[] = { // Parameters for the plain-C *_avg SAD kernels; each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm against SadMxNAvgParam.
+ make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_c, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_c, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_c, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_c, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_c, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_c, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_c, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_c, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_c, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_c, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_c, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_c, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the same block sizes repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad64x16_avg_c, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8_avg_c, -1),
+ make_tuple(8, 32, &aom_sad8x32_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4_avg_c, -1),
+ make_tuple(4, 16, &aom_sad4x16_avg_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); // one SADavgTest instance per avg_c_tests entry, under the "C" prefix
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = { // Parameters for the plain-C aom_dist_wtd_sad*_avg kernels; every entry here uses -1 as the last tuple field and the table has no highbd section.
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
+
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1),
+ make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1),
+ make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest, // one DistWtdSADavgTest instance per table entry, under the "C" prefix
+ ::testing::ValuesIn(dist_wtd_avg_c_tests));
+
+const SadMxNx4Param x4d_c_tests[] = { // Parameters for the plain-C *x4d SAD kernels; each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm against SadMxNx4Param.
+ make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_c, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_c, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_c, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_c, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_c, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_c, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_c, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_c, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_c, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_c, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_c, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_c, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the same block sizes repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad64x16x4d_c, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8x4d_c, -1),
+ make_tuple(8, 32, &aom_sad8x32x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4x4d_c, -1),
+ make_tuple(4, 16, &aom_sad4x16x4d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); // one SADx4Test instance per x4d_c_tests entry, under the "C" prefix
+
+const SadMxNx4Param x3d_c_tests[] = { // Parameters for the plain-C *x3d SAD kernels; the table is declared with the same SadMxNx4Param type as the x4d table. Each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm.
+ make_tuple(128, 128, &aom_sad128x128x3d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_c, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_c, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_c, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_c, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_c, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_c, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_c, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_c, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_c, -1),
+ make_tuple(8, 16, &aom_sad8x16x3d_c, -1),
+ make_tuple(8, 8, &aom_sad8x8x3d_c, -1),
+ make_tuple(8, 4, &aom_sad8x4x3d_c, -1),
+ make_tuple(4, 8, &aom_sad4x8x3d_c, -1),
+ make_tuple(4, 4, &aom_sad4x4x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the same block sizes repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad64x16x3d_c, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8x3d_c, -1),
+ make_tuple(8, 32, &aom_sad8x32x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4x3d_c, -1),
+ make_tuple(4, 16, &aom_sad4x16x3d_c, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_c, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(C, SADx3Test, ::testing::ValuesIn(x3d_c_tests)); // one SADx3Test instance per x3d_c_tests entry, under the "C" prefix
+
+const SadMxNx4Param skip_x4d_c_tests[] = { // Parameters for the plain-C aom_sad_skip_*x4d kernels; each tuple appears to be (width, height, function, bit depth), with -1 on the non-highbd functions - TODO confirm against SadMxNx4Param.
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_c, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_c, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_c, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_c, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_c, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_c, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_c, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_c, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_c, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_c, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_c, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16x4d_c, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8x4d_c, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8x4d_c, -1), // NOTE(review): this -1 list has no 8x4 or 4x4 entry, although the highbd entries below include both - confirm the omission is intentional
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block shapes, compiled out of realtime-only builds
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_c, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_c, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_c, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32x4d_c, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16x4d_c, -1), // NOTE(review): no matching 16x4 entry in this -1 list, although the highbd entries below include one - confirm
+#endif // !CONFIG_REALTIME_ONLY
+#if CONFIG_AV1_HIGHBITDEPTH // highbd kernels: the full size list repeated with 8, 10 and 12 as the last tuple field
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 8),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 8),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 8),
+#endif // !CONFIG_REALTIME_ONLY
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 10),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 10),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 10),
+#endif // !CONFIG_REALTIME_ONLY
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_c, 12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_c, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_c, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_c, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_c, 12),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_c, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_c, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, // one SADSkipx4Test instance per skip_x4d_c_tests entry, under the "C" prefix
+ ::testing::ValuesIn(skip_x4d_c_tests));
+
+//------------------------------------------------------------------------------
+// ARM functions
+#if HAVE_NEON
+const SadMxNParam neon_tests[] = { // Parameter table for the NEON SAD kernels. In each tuple the first two values match the WxH in the kernel name; the last is -1 for aom_sad* and 8/10/12 for aom_highbd_sad* (presumably the bit depth; confirm against the SadMxNParam typedef).
+ make_tuple(128, 128, &aom_sad128x128_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128_neon, -1),
+ make_tuple(64, 64, &aom_sad64x64_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad64x16_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad64x16_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); // Runs SADTest once per table entry, under the "NEON" prefix.
+
+const SadMxNx4Param x4d_neon_tests[] = { // Parameter table for the NEON "x4d" SAD kernels. In each tuple the first two values match the WxH in the kernel name; the last is -1 for aom_sad* and 8/10/12 for aom_highbd_sad* (presumably the bit depth; confirm against the SadMxNx4Param typedef).
+ make_tuple(128, 128, &aom_sad128x128x4d_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_neon, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad64x16x4d_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32x4d_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16x4d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); // Runs SADx4Test once per table entry, under the "NEON" prefix.
+const SadSkipMxNParam skip_neon_tests[] = { // Parameter table for the NEON "sad_skip" kernels. In each tuple the first two values match the WxH in the kernel name; the last is -1 for aom_sad_skip_* and 8/10/12 for aom_highbd_sad_skip_* (presumably the bit depth; confirm against the SadSkipMxNParam typedef).
+ make_tuple(128, 128, &aom_sad_skip_128x128_neon, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_neon, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_neon, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_neon, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_neon, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_neon, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_neon, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_neon, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32_neon, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16_neon, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
+ make_tuple(8, 4, &aom_sad_skip_8x4_neon, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
+ make_tuple(4, 4, &aom_sad_skip_4x4_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4_neon, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
+ ::testing::ValuesIn(skip_neon_tests)); // Runs SADSkipTest once per table entry, under the "NEON" prefix.
+
+const SadSkipMxNx4Param skip_x4d_neon_tests[] = { // Parameter table for the NEON "sad_skip ... x4d" kernels. In each tuple the first two values match the WxH in the kernel name; the last is -1 for aom_sad_skip_* and 8/10/12 for aom_highbd_sad_skip_* (presumably the bit depth; confirm against the SadSkipMxNx4Param typedef).
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
+ make_tuple(8, 4, &aom_sad_skip_8x4x4d_neon, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
+ make_tuple(4, 4, &aom_sad_skip_4x4x4d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad_skip_8x4x4d_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad_skip_4x4x4d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad_skip_16x4x4d_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_neon_tests)); // Runs SADSkipx4Test once per table entry, under the "NEON" prefix.
+
+const SadMxNAvgParam avg_neon_tests[] = { // Parameter table for the NEON "_avg" SAD kernels. In each tuple the first two values match the WxH in the kernel name; the last is -1 for aom_sad*_avg and 8/10/12 for aom_highbd_sad*_avg (presumably the bit depth; confirm against the SadMxNAvgParam typedef).
+ make_tuple(128, 128, &aom_sad128x128_avg_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_neon, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad64x16_avg_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8_avg_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4_avg_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32_avg_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16_avg_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); // Runs SADavgTest once per table entry, under the "NEON" prefix.
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_tests[] = { // Parameter table for the NEON aom_dist_wtd_sad*_avg kernels. The first two tuple values match the WxH in the kernel name; every entry passes -1 as the last value and no aom_highbd_* kernels are listed.
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_neon, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_neon, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_neon, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_neon, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_neon, -1),
+
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon, -1),
+ make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_neon, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon, -1),
+ make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_neon, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, DistWtdSADavgTest,
+ ::testing::ValuesIn(dist_wtd_avg_neon_tests)); // Runs DistWtdSADavgTest once per table entry, under the "NEON" prefix.
+
+const SadMxNx4Param x3d_neon_tests[] = { // Parameter table for the NEON "x3d" SAD kernels; same tuple layout as the other tables (W, H, kernel, then -1 or 8/10/12). NOTE(review): this reuses the SadMxNx4Param type, so the x3d kernels presumably share the x4d function signature; confirm against the typedef.
+ make_tuple(128, 128, &aom_sad128x128x3d_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_neon, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16x3d_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8x3d_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4x3d_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8x3d_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth kernels: every size is listed three times, with 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_neon, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_neon, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_neon, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_neon, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_neon, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_neon, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_neon, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_neon, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_neon, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_neon, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_neon, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x3d_neon, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x3d_neon, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x3d_neon, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x3d_neon, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x3d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad64x16x3d_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8x3d_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4x3d_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32x3d_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16x3d_neon, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // High-bitdepth variants of the same 4:1 / 1:4 sizes.
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_neon, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_neon, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_neon, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x3d_neon, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_neon, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x3d_neon, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADx3Test, ::testing::ValuesIn(x3d_neon_tests)); // Runs SADx3Test once per table entry, under the "NEON" prefix.
+
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+const SadMxNParam neon_dotprod_tests[] = { // Parameter table for the aom_sad*_neon_dotprod kernels. Only block widths of 16 and above are listed, every entry passes -1 as the last value, and no aom_highbd_* kernels appear.
+ make_tuple(128, 128, &aom_sad128x128_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; these entries are compiled out when CONFIG_REALTIME_ONLY is set.
+ make_tuple(64, 16, &aom_sad64x16_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest,
+ ::testing::ValuesIn(neon_dotprod_tests)); // Runs SADTest once per table entry, under the "NEON_DOTPROD" prefix.
+
+const SadSkipMxNParam skip_neon_dotprod_tests[] = { // SADSkipTest parameters: (width, height, aom_sad_skip_*_neon_dotprod function, -1); typed like skip_sse2_tests.
+ make_tuple(128, 128, &aom_sad_skip_128x128_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad_skip_64x16_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, // Runs SADSkipTest over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(skip_neon_dotprod_tests));
+
+const SadMxNAvgParam avg_neon_dotprod_tests[] = { // SADavgTest parameters: (width, height, aom_sad*_avg_neon_dotprod function, -1).
+ make_tuple(128, 128, &aom_sad128x128_avg_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16_avg_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8_avg_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4_avg_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, // Runs SADavgTest over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(avg_neon_dotprod_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_neon_dotprod_tests[] = { // DistWtdSADavgTest parameters: (width, height, aom_dist_wtd_sad*_avg_neon_dotprod function, -1).
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, DistWtdSADavgTest, // Runs DistWtdSADavgTest over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(dist_wtd_avg_neon_dotprod_tests));
+
+const SadMxNx4Param x3d_neon_dotprod_tests[] = { // SADx3Test parameters: (width, height, aom_sad*x3d_neon_dotprod function, -1); declared with the x4 parameter type.
+ make_tuple(128, 128, &aom_sad128x128x3d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64x3d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32x3d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16x3d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8x3d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16x3d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8x3d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64x3d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4x3d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx3Test, // Runs SADx3Test over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(x3d_neon_dotprod_tests));
+
+const SadMxNx4Param x4d_neon_dotprod_tests[] = { // SADx4Test parameters: (width, height, aom_sad*x4d_neon_dotprod function, -1).
+ make_tuple(128, 128, &aom_sad128x128x4d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16x4d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, // Runs SADx4Test over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(x4d_neon_dotprod_tests));
+
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { // SADSkipx4Test parameters: (width, height, aom_sad_skip_*x4d_neon_dotprod function, -1).
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY // The four sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, // Runs SADSkipx4Test over the table above under the NEON_DOTPROD prefix.
+ ::testing::ValuesIn(skip_x4d_neon_dotprod_tests));
+#endif // HAVE_NEON_DOTPROD
+
+//------------------------------------------------------------------------------
+// x86 functions
+#if HAVE_SSE2
+const SadMxNParam sse2_tests[] = { // SADTest parameters: (width, height, function, last field); last field is -1 for aom_sad* and 8/10/12 for aom_highbd_sad* (presumably bit depth - TODO confirm).
+ make_tuple(128, 128, &aom_sad128x128_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // aom_highbd_sad* entries, each size repeated with 8, 10 and 12 in the last field.
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // Sizes 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16_sse2, -1),
+ make_tuple(16, 64, &aom_sad16x64_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8_sse2, -1),
+ make_tuple(8, 32, &aom_sad8x32_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4_sse2, -1),
+ make_tuple(4, 16, &aom_sad4x16_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); // Runs SADTest over the table above under the SSE2 prefix.
+
+const SadSkipMxNParam skip_sse2_tests[] = { // SADSkipTest parameters: (width, height, function, last field); last field is -1 for aom_sad_skip_* and 8/10/12 for aom_highbd_sad_skip_* (presumably bit depth - TODO confirm).
+ make_tuple(128, 128, &aom_sad_skip_128x128_sse2, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_sse2, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_sse2, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_sse2, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_sse2, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_sse2, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_sse2, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_sse2, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32_sse2, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16_sse2, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8_sse2, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16_sse2, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8_sse2, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8_sse2, -1),
+#if !CONFIG_REALTIME_ONLY // Sizes 64x16, 16x64, 32x8, 8x32 and 4x16 are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad_skip_64x16_sse2, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64_sse2, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8_sse2, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32_sse2, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16_sse2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+#if CONFIG_AV1_HIGHBITDEPTH // aom_highbd_sad_skip_* entries, grouped by last field: 8, then 10, then 12.
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 8),
+#endif // !CONFIG_REALTIME_ONLY
+
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 10),
+#endif // !CONFIG_REALTIME_ONLY
+
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16_sse2, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_sse2, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8_sse2, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16_sse2, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, // Runs SADSkipTest over the table above under the SSE2 prefix.
+ ::testing::ValuesIn(skip_sse2_tests));
+
+const SadMxNAvgParam avg_sse2_tests[] = { // SADavgTest parameters: (width, height, function, last field); last field is -1 for aom_sad*_avg and 8/10/12 for aom_highbd_sad*_avg (presumably bit depth - TODO confirm).
+ make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // aom_highbd_sad*_avg entries, each size repeated with 8, 10 and 12 in the last field.
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // Sizes 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8_avg_sse2, -1),
+ make_tuple(8, 32, &aom_sad8x32_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1),
+ make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); // Runs SADavgTest over the table above under the SSE2 prefix.
+
+const SadMxNx4Param x4d_sse2_tests[] = { // SADx4Test parameters: (width, height, function, last field); last field is -1 for aom_sad*x4d and 8/10/12 for aom_highbd_sad*x4d (presumably bit depth - TODO confirm).
+ make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH // aom_highbd_sad*x4d entries, each size repeated with 8, 10 and 12 in the last field.
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY // Sizes 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_sad32x8x4d_sse2, -1),
+ make_tuple(8, 32, &aom_sad8x32x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 8),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 10),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 12),
+ make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_sad16x4x4d_sse2, -1),
+ make_tuple(4, 16, &aom_sad4x16x4d_sse2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12),
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); // Runs SADx4Test over the table above under the SSE2 prefix.
+
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { // SADSkipx4Test parameters: (width, height, function, last field); last field is -1 for aom_sad_skip_*x4d and 8/10/12 for aom_highbd_sad_skip_*x4d (presumably bit depth - TODO confirm).
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_sse2, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_sse2, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_sse2, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_sse2, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_sse2, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_sse2, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_sse2, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_sse2, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_sse2, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_sse2, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_sse2, -1),
+ make_tuple(8, 16, &aom_sad_skip_8x16x4d_sse2, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8x4d_sse2, -1),
+ make_tuple(4, 8, &aom_sad_skip_4x8x4d_sse2, -1),
+#if !CONFIG_REALTIME_ONLY // Sizes 64x16, 16x64, 32x8, 8x32 and 4x16 are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_sse2, -1),
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_sse2, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_sse2, -1),
+ make_tuple(8, 32, &aom_sad_skip_8x32x4d_sse2, -1),
+ make_tuple(4, 16, &aom_sad_skip_4x16x4d_sse2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+#if CONFIG_AV1_HIGHBITDEPTH // aom_highbd_sad_skip_*x4d entries, grouped by last field: 8, then 10, then 12.
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 8),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 8),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 8),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 8),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 10),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 10),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 10),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 10),
+#endif // !CONFIG_REALTIME_ONLY
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad_skip_8x16x4d_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad_skip_8x8x4d_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad_skip_4x8x4d_sse2, 12),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_sse2, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_sse2, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_sse2, 12),
+ make_tuple(8, 32, &aom_highbd_sad_skip_8x32x4d_sse2, 12),
+ make_tuple(4, 16, &aom_highbd_sad_skip_4x16x4d_sse2, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, // Runs SADSkipx4Test over the table above under the SSE2 prefix.
+ ::testing::ValuesIn(skip_x4d_sse2_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_sse2_tests[] = { // DistWtdSADavgTest parameters: (width, height, aom_dist_wtd_sad*_avg_sse2 function, -1).
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_sse2, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_sse2, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_sse2, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_sse2, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_sse2, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_sse2, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_sse2, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_sse2, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_sse2, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_sse2, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_sse2, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_sse2, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_sse2, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_sse2, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_sse2, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_sse2, -1),
+#if !CONFIG_REALTIME_ONLY // The six sizes below are listed only when the build is not realtime-only.
+ make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_sse2, -1),
+ make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_sse2, -1),
+ make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_sse2, -1),
+ make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_sse2, -1),
+ make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_sse2, -1),
+ make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_sse2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADavgTest, // Upper-case prefix, matching the other SSE2 instantiations so SSE2/* gtest filters select these tests too.
+ ::testing::ValuesIn(dist_wtd_avg_sse2_tests));
+#endif // HAVE_SSE2
+
+#if HAVE_SSE3
+// Only functions are x3, which do not have tests.
+#endif // HAVE_SSE3
+
+#if HAVE_SSE4_1
+// Only functions are x8, which do not have tests.
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const SadMxNParam avx2_tests[] = {  // Tuple: M, N (as in the kernel name), kernel, then -1 or 8/10/12 (presumably bit depth).
+ make_tuple(64, 128, &aom_sad64x128_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH  // Each highbd kernel is listed at 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Extra block sizes listed only for non-realtime builds.
+ make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadSkipMxNParam skip_avx2_tests[] = {  // Tuple: M, N (as in the kernel name), skip-SAD kernel, then -1 or 8/10/12 (presumably bit depth).
+ make_tuple(128, 128, &aom_sad_skip_128x128_avx2, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64_avx2, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128_avx2, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64_avx2, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32_avx2, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64_avx2, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32_avx2, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH  // Highbd kernels, grouped by the last tuple value: 8, then 10, then 12.
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 8),
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 10),
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Only 16x64 is listed here for non-realtime builds.
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+ ::testing::ValuesIn(skip_avx2_tests));
+
+const SadMxNAvgParam avg_avx2_tests[] = {  // Tuple: M, N (as in the kernel name), *_avg kernel, then -1 or 8/10/12 (presumably bit depth).
+ make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
+#if CONFIG_AV1_HIGHBITDEPTH  // Each highbd kernel is listed at 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Extra block sizes listed only for non-realtime builds.
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {  // Tuple: M, N (as in the kernel name), skip x4d kernel, then -1 or 8/10/12 (presumably bit depth).
+ make_tuple(128, 128, &aom_sad_skip_128x128x4d_avx2, -1),
+ make_tuple(128, 64, &aom_sad_skip_128x64x4d_avx2, -1),
+ make_tuple(64, 128, &aom_sad_skip_64x128x4d_avx2, -1),
+ make_tuple(64, 64, &aom_sad_skip_64x64x4d_avx2, -1),
+ make_tuple(64, 32, &aom_sad_skip_64x32x4d_avx2, -1),
+ make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
+ make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
+ make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+ make_tuple(16, 32, &aom_sad_skip_16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad_skip_16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad_skip_16x8x4d_avx2, -1),
+
+#if CONFIG_AV1_HIGHBITDEPTH  // Highbd kernels, grouped by the last tuple value: 8, then 10, then 12.
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 8),
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 10),
+
+ make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad_skip_128x64x4d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad_skip_64x128x4d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad_skip_64x64x4d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad_skip_64x32x4d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad_skip_32x64x4d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad_skip_32x32x4d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad_skip_32x16x4d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad_skip_16x32x4d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad_skip_16x16x4d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad_skip_16x8x4d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Highbd block sizes listed only for non-realtime builds.
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 8),
+
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 10),
+
+ make_tuple(64, 16, &aom_highbd_sad_skip_64x16x4d_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad_skip_32x8x4d_avx2, 12),
+ make_tuple(16, 64, &aom_highbd_sad_skip_16x64x4d_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY  // Non-highbd counterparts of the block sizes above.
+ make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
+ make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+
+ make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+ ::testing::ValuesIn(skip_x4d_avx2_tests));
+
+const SadMxNx4Param x4d_avx2_tests[] = {  // Tuple: M, N (as in the kernel name), x4d kernel, then -1 or 8/10/12 (presumably bit depth).
+ make_tuple(16, 32, &aom_sad16x32x4d_avx2, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_avx2, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY  // Extra non-highbd block sizes for non-realtime builds.
+ make_tuple(16, 64, &aom_sad16x64x4d_avx2, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_avx2, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
+ make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH  // Each highbd kernel is listed at 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Extra highbd block sizes for non-realtime builds.
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12),
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+
+const SadMxNx4Param x3d_avx2_tests[] = {  // NOTE(review): x3d kernels stored in the x4 param type; presumably the signatures match -- confirm.
+ make_tuple(32, 64, &aom_sad32x64x3d_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32x3d_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16x3d_avx2, -1),
+ make_tuple(64, 128, &aom_sad64x128x3d_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64x3d_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32x3d_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128x3d_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64x3d_avx2, -1),
+
+#if !CONFIG_REALTIME_ONLY  // Extra non-highbd block sizes for non-realtime builds.
+ make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1),
+ make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1),
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH  // Each highbd kernel is listed at 8, 10 and 12.
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x3d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x3d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x3d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x3d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x3d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x3d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x3d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x3d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x3d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x3d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x3d_avx2, 12),
+
+#if !CONFIG_REALTIME_ONLY  // Extra highbd block sizes for non-realtime builds.
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 8),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 10),
+ make_tuple(16, 64, &aom_highbd_sad16x64x3d_avx2, 12),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 8),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 10),
+ make_tuple(64, 16, &aom_highbd_sad64x16x3d_avx2, 12),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 8),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 10),
+ make_tuple(32, 8, &aom_highbd_sad32x8x3d_avx2, 12),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 8),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 10),
+ make_tuple(16, 4, &aom_highbd_sad16x4x3d_avx2, 12),
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx3Test, ::testing::ValuesIn(x3d_avx2_tests));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/sb_multipass_test.cc b/third_party/aom/test/sb_multipass_test.cc
new file mode 100644
index 0000000000..e27a2c60ee
--- /dev/null
+++ b/third_party/aom/test/sb_multipass_test.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+// Checks that coding each superblock twice (SB multipass unit-test mode)
+// yields the same frame sizes and encoder/decoder MD5s as coding it once.
+class AV1SBMultipassTest
+    : public ::libaom_test::CodecTestWith2Params<int, bool>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1SBMultipassTest()
+      : EncoderTest(GET_PARAM(0)), use_multipass_(false),
+        set_cpu_used_(GET_PARAM(1)), row_mt_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);  // Null-checked in SetUp().
+  }
+  ~AV1SBMultipassTest() override { delete decoder_; }
+
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+
+    // Fail the test cleanly rather than dereference a null decoder.
+    ASSERT_NE(decoder_, nullptr);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  // Applies the encoder controls once, when the source is at frame 0.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, use_multipass_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  // Tile configuration hook; virtual so a derived test could change it.
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+    encoder->Control(AV1E_SET_TILE_ROWS, 1);
+  }
+
+  // Records the packet size and MD5, then decodes the packet and records the
+  // MD5 of the decoded image (if the decoder returns one).
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  // Encodes the clip once per mode and requires identical recorded results.
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+    cfg_.rc_target_bitrate = 1000;
+
+    // Encode while coding each sb once
+    use_multipass_ = false;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    const std::vector<size_t> single_pass_size_enc = size_enc_;
+    const std::vector<std::string> single_pass_md5_enc = md5_enc_;
+    const std::vector<std::string> single_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Encode while coding each sb twice
+    use_multipass_ = true;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    const std::vector<size_t> multi_pass_size_enc = size_enc_;
+    const std::vector<std::string> multi_pass_md5_enc = md5_enc_;
+    const std::vector<std::string> multi_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(single_pass_size_enc, multi_pass_size_enc);
+    ASSERT_EQ(single_pass_md5_enc, multi_pass_md5_enc);
+    ASSERT_EQ(single_pass_md5_dec, multi_pass_md5_dec);
+  }
+
+  bool use_multipass_;
+  int set_cpu_used_;
+  bool row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(AV1SBMultipassTest, ::testing::Range(4, 6),
+ ::testing::Bool());  // cpu-used 4 and 5 (Range end is exclusive), row-mt off and on.
+
+} // namespace
diff --git a/third_party/aom/test/sb_qp_sweep_test.cc b/third_party/aom/test/sb_qp_sweep_test.cc
new file mode 100644
index 0000000000..6c76a40b2a
--- /dev/null
+++ b/third_party/aom/test/sb_qp_sweep_test.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Compares an encode with the superblock QP sweep enabled against one without.
+// Parameters: cpu-used, row-mt.
+class AV1SBQPSweepTest : public ::libaom_test::CodecTestWith2Params<int, bool>,
+                         public ::libaom_test::EncoderTest {
+ protected:
+  AV1SBQPSweepTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(GET_PARAM(1)),
+        row_mt_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_.reset(codec_->CreateDecoder(cfg, 0));  // Null-checked in SetUp().
+  }
+  ~AV1SBQPSweepTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+
+    ASSERT_NE(decoder_, nullptr);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_Q;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {  // Controls are applied once, at frame 0.
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_ENABLE_SB_QP_SWEEP, use_sb_sweep_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+    encoder->Control(AV1E_SET_TILE_ROWS, 1);
+  }
+
+  void BeginPassHook(unsigned int) override {  // Resets the PSNR statistics.
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {  // 0.0 when no PSNR packet was counted.
+    return nframes_ ? psnr_ / nframes_ : 0.0;
+  }
+
+  // Total frame-packet size over the PSNR-packet count; 0.0 if none counted.
+  double GetAverageFrameSize() const {
+    return nframes_ ? static_cast<double>(sum_frame_size_) / nframes_ : 0.0;
+  }
+
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    sum_frame_size_ += pkt->data.frame.sz;
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+    cfg_.rc_target_bitrate = 1000;
+
+    // Encode without sb_qp_sweep
+    use_sb_sweep_ = false;
+    sum_frame_size_ = 0;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GT(nframes_, 0u);  // Averages are meaningless without any frame.
+    const double psnr_1 = GetAveragePsnr();
+    const size_t avg_frame_size_1 = static_cast<size_t>(GetAverageFrameSize());
+
+    // Encode with sb_qp_sweep
+    use_sb_sweep_ = true;
+    sum_frame_size_ = 0;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GT(nframes_, 0u);  // Averages are meaningless without any frame.
+    const double psnr_2 = GetAveragePsnr();
+    const size_t avg_frame_size_2 = static_cast<size_t>(GetAverageFrameSize());
+
+    if (psnr_1 >= psnr_2) {  // No PSNR gain: sweep must not be larger.
+      ASSERT_GE(avg_frame_size_1, avg_frame_size_2);
+    }
+    if (avg_frame_size_1 <= avg_frame_size_2) {  // No size gain: PSNR must hold.
+      ASSERT_LE(psnr_1, psnr_2);
+    }
+  }
+
+  bool use_sb_sweep_;
+  int set_cpu_used_;
+  bool row_mt_;
+  double psnr_;            // Sum of psnr[0] over the current pass.
+  unsigned int nframes_;   // PSNR packets seen in the current pass.
+  size_t sum_frame_size_;  // Sum of frame packet sizes; reset by DoTest().
+  std::unique_ptr<::libaom_test::Decoder> decoder_;
+};
+
+TEST_P(AV1SBQPSweepTest, SweepMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(AV1SBQPSweepTest, ::testing::Range(4, 6),
+ ::testing::Bool());  // cpu-used 4 and 5 (Range end is exclusive), row-mt off and on.
+
+} // namespace
diff --git a/third_party/aom/test/scalability_test.cc b/third_party/aom/test/scalability_test.cc
new file mode 100644
index 0000000000..12cb03cac4
--- /dev/null
+++ b/third_party/aom/test/scalability_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kCpuUsed = 8;  // Passed to AOME_SET_CPUUSED.
+const int kBaseLayerQp = 55;  // AOME_SET_CQ_LEVEL for spatial layer 0.
+const int kEnhancementLayerQp = 20;  // AOME_SET_CQ_LEVEL for spatial layer 1.
+
+// Encodes a clip whose frames are split between a base (id 0) and an
+// enhancement (id 1) spatial layer, each layer using its own CQ level.
+class ScalabilityTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ScalabilityTest() : EncoderTest(GET_PARAM(0)) {}
+  ~ScalabilityTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig(GET_PARAM(1));
+    num_spatial_layers_ = 2;
+  }
+
+  // Picks the layer id, frame flags and CQ level for the upcoming frame.
+  // NOTE(review): every frame whose index is not a multiple of
+  // num_spatial_layers_ is tagged as layer 1, even with 3 layers -- confirm.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+      encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_);
+    }
+    const bool enhancement = (video->frame() % num_spatial_layers_) != 0;
+    // Flags applied to both layers.
+    frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                   AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                   AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                   AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+                   AOM_EFLAG_NO_UPD_ENTROPY;
+    // The enhancement layer additionally sets AOM_EFLAG_NO_UPD_LAST.
+    if (enhancement) frame_flags_ |= AOM_EFLAG_NO_UPD_LAST;
+    encoder->Control(AOME_SET_SPATIAL_LAYER_ID, enhancement ? 1 : 0);
+    encoder->Control(AOME_SET_CQ_LEVEL,
+                     enhancement ? kEnhancementLayerQp : kBaseLayerQp);
+  }
+
+  // Runs the encode loop in AOM_Q mode, zero lag, with the given layer count.
+  void DoTest(int num_spatial_layers) {
+    num_spatial_layers_ = num_spatial_layers;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = AOM_Q;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 18);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int num_spatial_layers_;
+};
+
+TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); }  // Two spatial layers.
+
+TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); }  // Three spatial layers.
+
+AV1_INSTANTIATE_TEST_SUITE(ScalabilityTest,
+ ::testing::Values(::libaom_test::kRealTime));  // Only the realtime test mode is instantiated.
+
+} // namespace
diff --git a/third_party/aom/test/scan_test.cc b/third_party/aom/test/scan_test.cc
new file mode 100644
index 0000000000..571658ee0a
--- /dev/null
+++ b/third_party/aom/test/scan_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/av1_txfm_test.h"
+
+// Checks that scan index |si| and the coefficient at row |r|, column |c| map
+// to each other through |scan| and |iscan|. The coefficient position is
+// computed as c * h + r. Prints the mismatch and returns 1 on failure;
+// returns 0 when both tables agree.
+static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r,
+                     int c, int h) {
+  const int pos = c * h + r;
+  const bool tables_agree = iscan[pos] == si && scan[si] == pos;
+  if (tables_agree) return 0;
+
+  printf("r %d c %d ref_iscan %d iscan %d ref_scan %d scan %d\n", r, c, si,
+         iscan[pos], pos, scan[si]);
+  return 1;
+}
+
+// Walks a w x h block in the order implied by |mode| and checks, position by
+// position, that |scan_order| agrees with that walk. Returns 1 at the first
+// disagreement reported by scan_test() and 0 when the whole order matches.
+int scan_order_test(const SCAN_ORDER *scan_order, int w, int h,
+                    SCAN_MODE mode) {
+  const int16_t *scan = scan_order->scan;
+  const int16_t *iscan = scan_order->iscan;
+  int si = 0;  // Scan index expected for the next visited position.
+
+  const bool is_diagonal = mode == SCAN_MODE_ZIG_ZAG ||
+                           mode == SCAN_MODE_COL_DIAG ||
+                           mode == SCAN_MODE_ROW_DIAG;
+  if (is_diagonal) {
+    const int num_diagonals = w + h - 1;
+    for (int i = 0; i < num_diagonals; ++i) {
+      // Zig-zag steps through columns on even anti-diagonals and through rows
+      // on odd ones; the other two modes keep one direction throughout.
+      const bool step_columns = mode == SCAN_MODE_COL_DIAG ||
+                                (mode == SCAN_MODE_ZIG_ZAG && i % 2 == 0);
+      if (step_columns) {
+        for (int c = 0; c < w; ++c) {
+          const int r = i - c;
+          if (r < 0 || r >= h) continue;
+          if (scan_test(scan, iscan, si, r, c, h)) return 1;
+          ++si;
+        }
+      } else {
+        for (int r = 0; r < h; ++r) {
+          const int c = i - r;
+          if (c < 0 || c >= w) continue;
+          if (scan_test(scan, iscan, si, r, c, h)) return 1;
+          ++si;
+        }
+      }
+    }
+    return 0;
+  }
+
+  if (mode == SCAN_MODE_ROW_1D) {
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        if (scan_test(scan, iscan, si, r, c, h)) return 1;
+        ++si;
+      }
+    }
+    return 0;
+  }
+
+  // Only the column-by-column order remains.
+  assert(mode == SCAN_MODE_COL_1D);
+  for (int c = 0; c < w; ++c) {
+    for (int r = 0; r < h; ++r) {
+      if (scan_test(scan, iscan, si, r, c, h)) return 1;
+      ++si;
+    }
+  }
+  return 0;
+}
+
+// For every valid (tx_size, tx_type) pair, derives the scan mode from the
+// transform class and block shape, then checks the default scan order
+// against the reference walk in scan_order_test().
+TEST(Av1ScanTest, Dependency) {
+  for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+    const TX_SIZE size = static_cast<TX_SIZE>(tx_size);
+    const int org_rows = tx_size_high[size];
+    const int org_cols = tx_size_wide[size];
+    const int rows = get_txb_high(size);
+    const int cols = get_txb_wide(size);
+    for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      const TX_TYPE type = static_cast<TX_TYPE>(tx_type);
+      if (!libaom_test::IsTxSizeTypeValid(size, type)) continue;
+
+      SCAN_MODE scan_mode;
+      TX_CLASS tx_class = tx_type_to_class[type];
+      if (tx_class == TX_CLASS_VERT) {
+        scan_mode = SCAN_MODE_ROW_1D;
+      } else if (tx_class == TX_CLASS_2D) {
+        // Square blocks zig-zag; otherwise the diagonal direction depends on
+        // which dimension is larger.
+        if (rows == cols) {
+          scan_mode = SCAN_MODE_ZIG_ZAG;
+        } else {
+          scan_mode = rows > cols ? SCAN_MODE_ROW_DIAG : SCAN_MODE_COL_DIAG;
+        }
+      } else {
+        assert(tx_class == TX_CLASS_HORIZ);
+        scan_mode = SCAN_MODE_COL_1D;
+      }
+
+      const SCAN_ORDER *scan_order = get_default_scan(size, type);
+      ASSERT_EQ(scan_order_test(scan_order, cols, rows, scan_mode), 0)
+          << "scan mismatch tx_class " << tx_class << " tx_type " << tx_type
+          << " tx_w " << org_cols << " tx_h " << org_rows << " scan_mode "
+          << scan_mode << "\n";
+    }
+  }
+}
diff --git a/third_party/aom/test/screen_content_test.cc b/third_party/aom/test/screen_content_test.cc
new file mode 100644
index 0000000000..974c50b3c6
--- /dev/null
+++ b/third_party/aom/test/screen_content_test.cc
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+namespace {
+// Encoder test fixture that decodes every frame and records whether the
+// decoder ever reports allow_screen_content_tools == 1.
+// Parameters: <codec factory, encoding mode, rate control mode>.
+class ScreenContentToolsTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 aom_rc_mode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ScreenContentToolsTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        is_screen_content_violated_(true), tune_content_(AOM_CONTENT_DEFAULT),
+        rc_end_usage_(GET_PARAM(2)) {}
+  ~ScreenContentToolsTestLarge() override = default;
+
+  // Applies the fixed encoder configuration shared by all tests; individual
+  // tests override g_profile and g_threads afterwards.
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.g_profile = 0;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 35;
+    cfg_.rc_end_usage = rc_end_usage_;
+    cfg_.rc_target_bitrate = 1000;
+  }
+
+  // Every encoded frame is decoded so HandleDecodeResult() gets called.
+  bool DoDecode() const override { return true; }
+
+  // Sends the one-time encoder controls before the first frame.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() != 0) return;
+    encoder->Control(AOME_SET_CPUUSED, 5);
+    encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+    encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+  }
+
+  // Queries the decoder's screen content tools info after a successful
+  // decode and clears the violation flag once the tools are seen enabled.
+  // Returns true when decoding succeeded.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK != res_dec) return false;
+
+    aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+    aom_screen_content_tools_info sc_info;
+    AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SCREEN_CONTENT_TOOLS_INFO,
+                                  &sc_info);
+    if (sc_info.allow_screen_content_tools == 1) {
+      is_screen_content_violated_ = false;
+    }
+    return true;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  // Stays true until a decoded frame reports screen content tools enabled.
+  bool is_screen_content_violated_;
+  // Value passed to AV1E_SET_TUNE_CONTENT on the first frame.
+  int tune_content_;
+  aom_rc_mode rc_end_usage_;
+};
+
+TEST_P(ScreenContentToolsTestLarge, ScreenContentToolsTest) {
+  // Case 1: screen content tools are forced on through the tune-content
+  // control, so they must be reported even though the clip is not screen
+  // content.
+  cfg_.g_profile = 1;
+  tune_content_ = AOM_CONTENT_SCREEN;
+  ::libaom_test::Y4mVideoSource video_nonsc("park_joy_90p_8_444.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_nonsc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed for tune_content_ = AOM_CONTENT_SCREEN";
+
+  // Case 2: nothing is forced, but the input is screen content, so
+  // allow_screen_content_tools should still end up turned on. The flag is
+  // re-armed first so that case 1 cannot satisfy this check.
+  is_screen_content_violated_ = true;
+  cfg_.g_profile = 1;
+  tune_content_ = AOM_CONTENT_DEFAULT;
+  ::libaom_test::Y4mVideoSource video_sc("desktop_credits.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed detection of screen content";
+
+  // TODO(anyone): Enable below test once low resolution screen content
+  // detection issues are fixed.
+  // low resolution test
+  //  ::libaom_test::Y4mVideoSource video_sc("screendata.y4m", 0, 1);
+  //  cfg_.g_profile = 0;
+  //  is_screen_content_violated_ = true;
+  //  tune_content_ = AOM_CONTENT_DEFAULT;
+  //  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  //  ASSERT_EQ(is_screen_content_violated_, false)
+  //      << "Failed detection of screen content(lowres)";
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ScreenContentToolsTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(AOM_Q));
+
+class ScreenContentToolsMultiThreadTestLarge
+ : public ScreenContentToolsTestLarge {};
+
+TEST_P(ScreenContentToolsMultiThreadTestLarge, ScreenContentToolsTest) {
+  // Nothing is forced here: with screen content as input,
+  // allow_screen_content_tools should be turned on by detection alone, and
+  // that must keep working when the encoder runs with four threads.
+  is_screen_content_violated_ = true;
+  tune_content_ = AOM_CONTENT_DEFAULT;
+  cfg_.g_threads = 4;
+  cfg_.g_profile = 1;
+  ::libaom_test::Y4mVideoSource video_sc("desktop_credits.y4m", 0, 10);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video_sc));
+  ASSERT_EQ(is_screen_content_violated_, false)
+      << "Failed detection of screen content";
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ScreenContentToolsMultiThreadTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(AOM_Q));
+} // namespace
diff --git a/third_party/aom/test/segment_binarization_sync.cc b/third_party/aom/test/segment_binarization_sync.cc
new file mode 100644
index 0000000000..bd8cf11410
--- /dev/null
+++ b/third_party/aom/test/segment_binarization_sync.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+using libaom_test::ACMRandom;
+
+extern "C" {
+int av1_neg_interleave(int x, int ref, int max);
+int av1_neg_deinterleave(int diff, int ref, int max);
+}
+
+namespace {
+
+struct Segment {
+ int id;
+ int pred;
+ int last_id;
+};
+
+// Builds a pseudo-random Segment from |seed|. The three draws happen in a
+// fixed order (last_id, pred, id) and id is drawn from [0, last_id].
+Segment GenerateSegment(int seed) {
+  constexpr int kMaxSegments = 8;
+
+  ACMRandom rng(seed);
+
+  Segment result;
+  result.last_id = rng.PseudoUniform(kMaxSegments);
+  result.pred = rng.PseudoUniform(kMaxSegments);
+  result.id = rng.PseudoUniform(result.last_id + 1);
+  return result;
+}
+
+// Round-trips pseudo-random segment ids through av1_neg_interleave() and
+// av1_neg_deinterleave() and requires the original id to come back.
+TEST(SegmentBinarizationSync, SearchForBinarizationMismatch) {
+  const int kNumTrials = 1000;
+  const int kFirstSeed = 4321;
+
+  for (int seed = kFirstSeed; seed < kFirstSeed + kNumTrials; ++seed) {
+    const Segment seg = GenerateSegment(seed);
+    const int max_segid = seg.last_id + 1;
+
+    const int interleaved = av1_neg_interleave(seg.id, seg.pred, max_segid);
+    const int decoded_segid =
+        av1_neg_deinterleave(interleaved, seg.pred, max_segid);
+
+    ASSERT_EQ(decoded_segid, seg.id);
+  }
+}
+
+} // namespace
diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc
new file mode 100644
index 0000000000..3dd513b6e0
--- /dev/null
+++ b/third_party/aom/test/selfguided_filter_test.cc
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ctime>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/mv.h"
+#include "av1/common/restoration.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+using std::make_tuple;
+using std::tuple;
+
+typedef int (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
+ int eps, const int *xqd, uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth, int highbd);
+
+// Test parameter list:
+// <tst_fun_>
+typedef tuple<SgrFunc> FilterTestParam;
+
+// Compares the self-guided restoration function supplied as the test
+// parameter against av1_apply_selfguided_restoration_c() on 8-bit samples.
+class AV1SelfguidedFilterTest
+    : public ::testing::TestWithParam<FilterTestParam> {
+ public:
+  ~AV1SelfguidedFilterTest() override = default;
+  void SetUp() override {}
+
+ protected:
+  // Times NUM_ITERS passes of the C reference and of the function under test
+  // over a 256x256 image, prints both times and expects the function under
+  // test to be the faster one.
+  void RunSpeedTest() {
+    tst_fun_ = GET_PARAM(0);
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
+    const int NUM_ITERS = 2000;
+
+    // Each plane has 16 extra rows above and below; |input| and |output|
+    // point 16 rows and 16 columns into their allocations.
+    uint8_t *input_ = static_cast<uint8_t *>(
+        aom_memalign(32, stride * (height + 32) * sizeof(uint8_t)));
+    ASSERT_NE(input_, nullptr);
+    uint8_t *output_ = static_cast<uint8_t *>(
+        aom_memalign(32, out_stride * (height + 32) * sizeof(uint8_t)));
+    ASSERT_NE(output_, nullptr);
+    int32_t *tmpbuf =
+        static_cast<int32_t *>(aom_memalign(32, RESTORATION_TMPBUF_SIZE));
+    ASSERT_NE(tmpbuf, nullptr);
+    uint8_t *const input = input_ + stride * 16 + 16;
+    uint8_t *const output = output_ + out_stride * 16 + 16;
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    FillRandomWithBorder(&rnd, input, height, width, stride);
+
+    int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                        SGRPROJ_PRJ_MIN0),
+                   SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                        SGRPROJ_PRJ_MIN1) };
+    // Fix a parameter set, since the speed depends slightly on r.
+    // Change this to test different combinations of values of r.
+    int eps = 15;
+
+    av1_loop_restoration_precal();
+
+    // Reference timing: C implementation, one processing unit at a time.
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < NUM_ITERS; ++i) {
+      for (int k = 0; k < height; k += pu_height) {
+        for (int j = 0; j < width; j += pu_width) {
+          const int w = AOMMIN(pu_width, width - j);
+          const int h = AOMMIN(pu_height, height - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              input_p, w, h, stride, eps, xqd, output_p, out_stride, tmpbuf, 8,
+              0);
+          ASSERT_EQ(ret_c, 0);
+        }
+      }
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    // Same traversal with the function under test.
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < NUM_ITERS; ++i) {
+      for (int k = 0; k < height; k += pu_height) {
+        for (int j = 0; j < width; j += pu_width) {
+          const int w = AOMMIN(pu_width, width - j);
+          const int h = AOMMIN(pu_height, height - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf, 8, 0);
+          ASSERT_EQ(ret_tst, 0);
+        }
+      }
+    }
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(tmpbuf);
+  }
+
+  // Runs the function under test and the C reference on NUM_ITERS random
+  // images of slightly varying size and requires identical output samples.
+  void RunCorrectnessTest() {
+    tst_fun_ = GET_PARAM(0);
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    // Set the maximum width/height to test here. We actually test a small
+    // range of sizes *up to* this size, so that we can check, eg.,
+    // the behaviour on tiles which are not a multiple of 4 wide.
+    const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
+    const int NUM_ITERS = 81;
+
+    uint8_t *input_ = static_cast<uint8_t *>(
+        aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t)));
+    ASSERT_NE(input_, nullptr);
+    uint8_t *output_ = static_cast<uint8_t *>(
+        aom_memalign(32, out_stride * (max_h + 32) * sizeof(uint8_t)));
+    ASSERT_NE(output_, nullptr);
+    uint8_t *output2_ = static_cast<uint8_t *>(
+        aom_memalign(32, out_stride * (max_h + 32) * sizeof(uint8_t)));
+    ASSERT_NE(output2_, nullptr);
+    int32_t *tmpbuf =
+        static_cast<int32_t *>(aom_memalign(32, RESTORATION_TMPBUF_SIZE));
+    ASSERT_NE(tmpbuf, nullptr);
+
+    uint8_t *const input = input_ + stride * 16 + 16;
+    uint8_t *const output = output_ + out_stride * 16 + 16;
+    uint8_t *const output2 = output2_ + out_stride * 16 + 16;
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    av1_loop_restoration_precal();
+
+    for (int i = 0; i < NUM_ITERS; ++i) {
+      // Fresh input, projection coefficients and parameter set per iteration.
+      FillRandomWithBorder(&rnd, input, max_h, max_w, stride);
+
+      int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                          SGRPROJ_PRJ_MIN0),
+                     SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                          SGRPROJ_PRJ_MIN1) };
+      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
+
+      // Test various tile sizes around 256x256
+      const int test_w = max_w - (i / 9);
+      const int test_h = max_h - (i % 9);
+
+      for (int k = 0; k < test_h; k += pu_height) {
+        for (int j = 0; j < test_w; j += pu_width) {
+          const int w = AOMMIN(pu_width, test_w - j);
+          const int h = AOMMIN(pu_height, test_h - k);
+          uint8_t *input_p = input + k * stride + j;
+          uint8_t *output_p = output + k * out_stride + j;
+          uint8_t *output2_p = output2 + k * out_stride + j;
+          const int ret_tst = tst_fun_(input_p, w, h, stride, eps, xqd,
+                                       output_p, out_stride, tmpbuf, 8, 0);
+          ASSERT_EQ(ret_tst, 0);
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              input_p, w, h, stride, eps, xqd, output2_p, out_stride, tmpbuf, 8,
+              0);
+          ASSERT_EQ(ret_c, 0);
+        }
+      }
+
+      for (int j = 0; j < test_h; ++j) {
+        for (int k = 0; k < test_w; ++k) {
+          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+        }
+      }
+    }
+
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
+    aom_free(tmpbuf);
+  }
+
+ private:
+  // Writes one random byte per sample, row by row, over rows
+  // [-16, rows + 16) and columns [-16, cols + 16) relative to |origin|.
+  static void FillRandomWithBorder(ACMRandom *rnd, uint8_t *origin, int rows,
+                                   int cols, int stride) {
+    for (int y = -16; y < rows + 16; ++y) {
+      for (int x = -16; x < cols + 16; ++x) {
+        origin[y * stride + x] = rnd->Rand16() & 0xFF;
+      }
+    }
+  }
+
+  SgrFunc tst_fun_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SelfguidedFilterTest);
+
+TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1SelfguidedFilterTest,
+ ::testing::Values(av1_apply_selfguided_restoration_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1SelfguidedFilterTest,
+ ::testing::Values(av1_apply_selfguided_restoration_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1SelfguidedFilterTest,
+ ::testing::Values(av1_apply_selfguided_restoration_neon));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Test parameter list:
+// <tst_fun_, bit_depth>
+typedef tuple<SgrFunc, int> HighbdFilterTestParam;
+
+// Compares the self-guided restoration function supplied as the first test
+// parameter against av1_apply_selfguided_restoration_c() on 16-bit samples
+// whose bit depth is the second test parameter.
+class AV1HighbdSelfguidedFilterTest
+    : public ::testing::TestWithParam<HighbdFilterTestParam> {
+ public:
+  ~AV1HighbdSelfguidedFilterTest() override = default;
+  void SetUp() override {}
+
+ protected:
+  // Times NUM_ITERS passes of the C reference and of the function under test
+  // over a 256x256 image, prints both times and expects the function under
+  // test to be the faster one.
+  void RunSpeedTest() {
+    tst_fun_ = GET_PARAM(0);
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
+    const int NUM_ITERS = 2000;
+    int i, j, k;
+    int bit_depth = GET_PARAM(1);
+    // Keeps only the low |bit_depth| bits of each random sample.
+    int mask = (1 << bit_depth) - 1;
+
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t));
+    ASSERT_NE(input_, nullptr);
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        32, out_stride * (height + 32) * sizeof(uint16_t));
+    ASSERT_NE(output_, nullptr);
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
+    // Working pointers sit 16 rows and 16 columns inside the allocations.
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    for (i = -16; i < height + 16; ++i)
+      for (j = -16; j < width + 16; ++j)
+        input[i * stride + j] = rnd.Rand16() & mask;
+
+    int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                        SGRPROJ_PRJ_MIN0),
+                   SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                        SGRPROJ_PRJ_MIN1) };
+    // Fix a parameter set, since the speed depends slightly on r.
+    // Change this to test different combinations of values of r.
+    int eps = 15;
+
+    av1_loop_restoration_precal();
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          // Check the status, as the 8-bit test does; a failed call would
+          // otherwise make the timing meaningless.
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+          ASSERT_EQ(ret_c, 0);
+        }
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (k = 0; k < height; k += pu_height)
+        for (j = 0; j < width; j += pu_width) {
+          int w = AOMMIN(pu_width, width - j);
+          int h = AOMMIN(pu_height, height - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          const int ret_tst = tst_fun_(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+          ASSERT_EQ(ret_tst, 0);
+        }
+    }
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
+           "C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(tmpbuf);
+  }
+
+  // Runs the function under test and the C reference on NUM_ITERS random
+  // images of slightly varying size and requires identical output samples.
+  void RunCorrectnessTest() {
+    tst_fun_ = GET_PARAM(0);
+    const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+    const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+    // Set the maximum width/height to test here. We actually test a small
+    // range of sizes *up to* this size, so that we can check, eg.,
+    // the behaviour on tiles which are not a multiple of 4 wide.
+    const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
+    const int NUM_ITERS = 81;
+    int i, j, k;
+    int bit_depth = GET_PARAM(1);
+    // Keeps only the low |bit_depth| bits of each random sample.
+    int mask = (1 << bit_depth) - 1;
+
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(input_, nullptr);
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        32, out_stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(output_, nullptr);
+    uint16_t *output2_ = (uint16_t *)aom_memalign(
+        32, out_stride * (max_h + 32) * sizeof(uint16_t));
+    ASSERT_NE(output2_, nullptr);
+    int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+    ASSERT_NE(tmpbuf, nullptr);
+
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
+    uint16_t *output2 = output2_ + out_stride * 16 + 16;
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    av1_loop_restoration_precal();
+
+    for (i = 0; i < NUM_ITERS; ++i) {
+      for (j = -16; j < max_h + 16; ++j)
+        for (k = -16; k < max_w + 16; ++k)
+          input[j * stride + k] = rnd.Rand16() & mask;
+
+      int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+                                                          SGRPROJ_PRJ_MIN0),
+                     SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+                                                          SGRPROJ_PRJ_MIN1) };
+      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
+
+      // Test various tile sizes around 256x256
+      int test_w = max_w - (i / 9);
+      int test_h = max_h - (i % 9);
+
+      for (k = 0; k < test_h; k += pu_height)
+        for (j = 0; j < test_w; j += pu_width) {
+          int w = AOMMIN(pu_width, test_w - j);
+          int h = AOMMIN(pu_height, test_h - k);
+          uint16_t *input_p = input + k * stride + j;
+          uint16_t *output_p = output + k * out_stride + j;
+          uint16_t *output2_p = output2 + k * out_stride + j;
+          // Both calls must report success before their outputs are
+          // compared, matching the checks made by the 8-bit test.
+          const int ret_tst = tst_fun_(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+          ASSERT_EQ(ret_tst, 0);
+          const int ret_c = av1_apply_selfguided_restoration_c(
+              CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+              CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
+          ASSERT_EQ(ret_c, 0);
+        }
+
+      for (j = 0; j < test_h; ++j)
+        for (k = 0; k < test_w; ++k)
+          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+    }
+
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
+    aom_free(tmpbuf);
+  }
+
+ private:
+  // Function under test, read from the test parameter at the start of each
+  // test body.
+  SgrFunc tst_fun_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdSelfguidedFilterTest);
+
+TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
+
+#if HAVE_SSE4_1
+const int highbd_params_sse4_1[] = { 8, 10, 12 };
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(
+ ::testing::Values(av1_apply_selfguided_restoration_sse4_1),
+ ::testing::ValuesIn(highbd_params_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+const int highbd_params_avx2[] = { 8, 10, 12 };
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_avx2),
+ ::testing::ValuesIn(highbd_params_avx2)));
+#endif
+
+#if HAVE_NEON
+const int highbd_params_neon[] = { 8, 10, 12 };
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_neon),
+ ::testing::ValuesIn(highbd_params_neon)));
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/set_maps.sh b/third_party/aom/test/set_maps.sh
new file mode 100755
index 0000000000..b79357a2b8
--- /dev/null
+++ b/third_party/aom/test/set_maps.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom set_maps example. To add new tests to this file,
+## do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to set_maps_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBAOM_BIN_PATH. Returns 1 after logging the reason via elog when either
+# requirement is not met.
+set_maps_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(aom_tool_path set_maps)" ]; then
+    elog "set_maps not found. It must exist in LIBAOM_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+ local encoder="$(aom_tool_path set_maps)"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+ ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1
+}
+
+set_maps_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ set_maps av1 || return 1
+ fi
+}
+
+set_maps_tests="set_maps_av1"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"
diff --git a/third_party/aom/test/sharpness_test.cc b/third_party/aom/test/sharpness_test.cc
new file mode 100644
index 0000000000..64465c88eb
--- /dev/null
+++ b/third_party/aom/test/sharpness_test.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+const unsigned int kCqLevel = 18;
+
+// List of psnr thresholds for different test combinations
+// keys: test-mode, cpu-used, sharpness.
+const std::unordered_map<
+ int, std::unordered_map<int, std::unordered_map<int, double>>>
+ kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood),
+ { { 2, { { 2, 37.6 }, { 5, 37.6 } } },
+ { 4, { { 2, 37.5 }, { 5, 37.5 } } },
+ { 6, { { 2, 37.5 }, { 5, 37.5 } } } } },
+ { static_cast<int>(::libaom_test::kAllIntra),
+ { { 3, { { 2, 42.2 }, { 5, 42.2 } } },
+ { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } },
+ { 9, { { 2, 41.0 }, { 5, 41.0 } } } } } };
+
+// This class is used to test sharpness parameter configured through control
+// call using AOME_SET_SHARPNESS for different encoder configurations.
+// Parameters: <codec factory, encoding mode, cpu-used, sharpness level>.
+class SharpnessTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SharpnessTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0),
+        nframes_(0) {}
+
+  ~SharpnessTest() override = default;
+
+  // Two-pass good-quality runs additionally get a target bitrate and a
+  // five-frame lag; other modes keep the defaults from InitializeConfig().
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+      cfg_.rc_target_bitrate = kBitrate;
+      cfg_.g_lag_in_frames = 5;
+    }
+  }
+
+  // Restarts PSNR accumulation at the beginning of every pass.
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  // Accumulates the first PSNR entry of each PSNR packet.
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  // Sends the encoder controls once, before the first frame.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_SHARPNESS, sharpness_level_);
+      if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      }
+    }
+  }
+
+  // Mean of the accumulated PSNR values, or 0.0 when no packet was seen.
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  // Looks up the threshold for this fixture's (mode, cpu-used, sharpness)
+  // combination. at() throws std::out_of_range for a combination that has
+  // no entry in kPsnrThreshold.
+  double GetPsnrThreshold() const {
+    return kPsnrThreshold.at(encoding_mode_).at(cpu_used_).at(sharpness_level_);
+  }
+
+  // Encodes kFrames frames with PSNR reporting enabled and expects the
+  // average PSNR to exceed the configured threshold.
+  void DoTest() {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+
+    // The source lives on the stack: no <memory> dependency and no null
+    // check that could never fail after a throwing new.
+    libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_
+        << ", sharpness level = " << sharpness_level_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const int cpu_used_;
+  const int sharpness_level_;
+  double psnr_;            // Sum of per-packet PSNR in the current pass.
+  unsigned int nframes_;   // Number of PSNR packets in the current pass.
+};
+
+class SharpnessTestLarge : public SharpnessTest {};
+
+class SharpnessAllIntraTest : public SharpnessTest {};
+
+class SharpnessAllIntraTestLarge : public SharpnessTest {};
+
+TEST_P(SharpnessTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTest, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(2, 4, 6), // cpu_used
+ ::testing::Values(2, 5)); // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTest,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(6), // cpu_used
+ ::testing::Values(4)); // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTestLarge,
+ ::testing::Values(::libaom_test::kAllIntra),
+ ::testing::Values(3, 6, 9), // cpu_used
+ ::testing::Values(2, 5)); // sharpness level
+} // namespace
diff --git a/third_party/aom/test/simd_avx2_test.cc b/third_party/aom/test/simd_avx2_test.cc
new file mode 100644
index 0000000000..8a012bff88
--- /dev/null
+++ b/third_party/aom/test/simd_avx2_test.cc
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define ARCH AVX2
+#define ARCH_POSTFIX(name) name##_avx2
+#define SIMD_NAMESPACE simd_test_avx2
+#include "test/simd_impl.h"
diff --git a/third_party/aom/test/simd_cmp_avx2.cc b/third_party/aom/test/simd_cmp_avx2.cc
new file mode 100644
index 0000000000..cda632bcdf
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_avx2.cc
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define ARCH AVX2
+#define ARCH_POSTFIX(name) name##_avx2
+#define SIMD_NAMESPACE simd_test_avx2
+#include "test/simd_cmp_impl.h"
diff --git a/third_party/aom/test/simd_cmp_impl.h b/third_party/aom/test/simd_cmp_impl.h
new file mode 100644
index 0000000000..cf85a471cd
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_impl.h
@@ -0,0 +1,2175 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+// Inlining not forced for the compiler due to some tests calling
+// SIMD_INLINE functions via function pointers
+#undef SIMD_INLINE
+#define SIMD_INLINE static inline
+#include "aom_dsp/aom_simd.h"
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+// Machine tuned code goes into this file. This file is included from
+// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
+// ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().
+
+#ifdef _MSC_VER
+// Disable "value of intrinsic immediate argument 'value' is out of range
+// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
+// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
+// mask doesn't always appear to be sufficient.
+#pragma warning(disable : 4556)
+#endif
+
+using libaom_test::ACMRandom;
+
+namespace SIMD_NAMESPACE {
+
+// Wrap the immediate-operand v64 intrinsics in templates ("shift" = immediate)
+template <int shift>  // each instantiation is a plain function taking only vectors
+v64 imm_v64_shl_n_byte(v64 a) {
+ return v64_shl_n_byte(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_byte(v64 a) {
+ return v64_shr_n_byte(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_8(v64 a) {
+ return v64_shl_n_8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u8(v64 a) {
+ return v64_shr_n_u8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s8(v64 a) {
+ return v64_shr_n_s8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_16(v64 a) {
+ return v64_shl_n_16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u16(v64 a) {
+ return v64_shr_n_u16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s16(v64 a) {
+ return v64_shr_n_s16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_32(v64 a) {
+ return v64_shl_n_32(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u32(v64 a) {
+ return v64_shr_n_u32(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s32(v64 a) {
+ return v64_shr_n_s32(a, shift);
+}
+template <int shift>
+v64 imm_v64_align(v64 a, v64 b) {  // two-vector case: shift is the third argument
+ return v64_align(a, b, shift);
+}
+
+// Same wrappers for the C implementations (c_ prefix) of the v64 intrinsics above
+template <int shift>  // MAP() pairs each c_imm_* with the imm_* of the same name
+c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
+ return c_v64_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
+ return c_v64_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_8(c_v64 a) {
+ return c_v64_shl_n_8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
+ return c_v64_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
+ return c_v64_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_16(c_v64 a) {
+ return c_v64_shl_n_16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
+ return c_v64_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
+ return c_v64_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_32(c_v64 a) {
+ return c_v64_shl_n_32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
+ return c_v64_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
+ return c_v64_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
+ return c_v64_align(a, b, shift);
+}
+
+template <int shift>  // v128 versions of the immediate-operand wrappers
+v128 imm_v128_shl_n_byte(v128 a) {
+ return v128_shl_n_byte(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_byte(v128 a) {
+ return v128_shr_n_byte(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_8(v128 a) {
+ return v128_shl_n_8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u8(v128 a) {
+ return v128_shr_n_u8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s8(v128 a) {
+ return v128_shr_n_s8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_16(v128 a) {
+ return v128_shl_n_16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u16(v128 a) {
+ return v128_shr_n_u16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s16(v128 a) {
+ return v128_shr_n_s16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_32(v128 a) {
+ return v128_shl_n_32(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u32(v128 a) {
+ return v128_shr_n_u32(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s32(v128 a) {
+ return v128_shr_n_s32(a, shift);
+}
+template <int shift>  // the *_n_64 wrappers have no v64 counterpart in this file
+v128 imm_v128_shl_n_64(v128 a) {
+ return v128_shl_n_64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u64(v128 a) {
+ return v128_shr_n_u64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s64(v128 a) {
+ return v128_shr_n_s64(a, shift);
+}
+template <int shift>
+v128 imm_v128_align(v128 a, v128 b) {
+ return v128_align(a, b, shift);
+}
+
+template <int shift>  // c_v128 counterparts of the imm_v128_* wrappers
+c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
+ return c_v128_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
+ return c_v128_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_8(c_v128 a) {
+ return c_v128_shl_n_8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
+ return c_v128_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
+ return c_v128_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_16(c_v128 a) {
+ return c_v128_shl_n_16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
+ return c_v128_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
+ return c_v128_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_32(c_v128 a) {
+ return c_v128_shl_n_32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
+ return c_v128_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
+ return c_v128_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_64(c_v128 a) {
+ return c_v128_shl_n_64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
+ return c_v128_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
+ return c_v128_shr_n_s64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
+ return c_v128_align(a, b, shift);
+}
+
+template <int shift>  // v256 versions; this family also wraps the *_n_word shifts
+v256 imm_v256_shl_n_word(v256 a) {
+ return v256_shl_n_word(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_word(v256 a) {
+ return v256_shr_n_word(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_byte(v256 a) {
+ return v256_shl_n_byte(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_byte(v256 a) {
+ return v256_shr_n_byte(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_8(v256 a) {
+ return v256_shl_n_8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u8(v256 a) {
+ return v256_shr_n_u8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s8(v256 a) {
+ return v256_shr_n_s8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_16(v256 a) {
+ return v256_shl_n_16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u16(v256 a) {
+ return v256_shr_n_u16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s16(v256 a) {
+ return v256_shr_n_s16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_32(v256 a) {
+ return v256_shl_n_32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u32(v256 a) {
+ return v256_shr_n_u32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s32(v256 a) {
+ return v256_shr_n_s32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_64(v256 a) {
+ return v256_shl_n_64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u64(v256 a) {
+ return v256_shr_n_u64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s64(v256 a) {
+ return v256_shr_n_s64(a, shift);
+}
+template <int shift>
+v256 imm_v256_align(v256 a, v256 b) {
+ return v256_align(a, b, shift);
+}
+
+template <int shift>  // c_v256 counterparts of the imm_v256_* wrappers
+c_v256 c_imm_v256_shl_n_word(c_v256 a) {
+ return c_v256_shl_n_word(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_word(c_v256 a) {
+ return c_v256_shr_n_word(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
+ return c_v256_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
+ return c_v256_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_8(c_v256 a) {
+ return c_v256_shl_n_8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
+ return c_v256_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
+ return c_v256_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_16(c_v256 a) {
+ return c_v256_shl_n_16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
+ return c_v256_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
+ return c_v256_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_32(c_v256 a) {
+ return c_v256_shl_n_32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
+ return c_v256_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
+ return c_v256_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_64(c_v256 a) {
+ return c_v256_shl_n_64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
+ return c_v256_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
+ return c_v256_shr_n_s64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
+ return c_v256_align(a, b, shift);
+}
+
+// Wrappers around the SAD and SSD functions: init, accumulate one pair, sum
+uint32_t v64_sad_u8(v64 a, v64 b) {  // "::" reaches the same-named global function
+ return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
+}
+uint32_t v64_ssd_u8(v64 a, v64 b) {
+ return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
+}
+
+uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
+ return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
+}
+uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
+ return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
+}
+uint32_t v128_sad_u8(v128 a, v128 b) {
+ return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
+}
+uint32_t v128_ssd_u8(v128 a, v128 b) {
+ return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
+}
+uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
+ return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
+}
+uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
+ return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
+}
+uint32_t v128_sad_u16(v128 a, v128 b) {
+ return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
+}
+uint64_t v128_ssd_s16(v128 a, v128 b) {  // the ssd_s16 wrappers return 64 bits
+ return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
+}
+uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
+ return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
+}
+uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
+ return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
+}
+uint32_t v256_sad_u8(v256 a, v256 b) {
+ return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
+}
+uint32_t v256_ssd_u8(v256 a, v256 b) {
+ return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
+}
+uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
+ return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
+}
+uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
+ return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
+}
+uint32_t v256_sad_u16(v256 a, v256 b) {
+ return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
+}
+uint64_t v256_ssd_s16(v256 a, v256 b) {
+ return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
+}
+uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
+ return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
+}
+uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
+ return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
+}
+
+namespace {
+
+typedef void (*fptr)();  // common pointer type that MAP() casts every entry to
+
+typedef struct {
+ const char *name;  // stringified entry name (#name in MAP)
+ fptr ref;  // the c_<name> function
+ fptr simd;  // the <name> function
+} mapping;
+
+#define MAP(name) \
+ { #name, reinterpret_cast < fptr>(c_##name), reinterpret_cast < fptr>(name) }
+
+const mapping m[] = { MAP(v64_sad_u8),
+ MAP(v64_ssd_u8),
+ MAP(v64_add_8),
+ MAP(v64_add_16),
+ MAP(v64_sadd_s8),
+ MAP(v64_sadd_u8),
+ MAP(v64_sadd_s16),
+ MAP(v64_add_32),
+ MAP(v64_sub_8),
+ MAP(v64_ssub_u8),
+ MAP(v64_ssub_s8),
+ MAP(v64_sub_16),
+ MAP(v64_ssub_s16),
+ MAP(v64_ssub_u16),
+ MAP(v64_sub_32),
+ MAP(v64_ziplo_8),
+ MAP(v64_ziphi_8),
+ MAP(v64_ziplo_16),
+ MAP(v64_ziphi_16),
+ MAP(v64_ziplo_32),
+ MAP(v64_ziphi_32),
+ MAP(v64_pack_s32_u16),
+ MAP(v64_pack_s32_s16),
+ MAP(v64_pack_s16_u8),
+ MAP(v64_pack_s16_s8),
+ MAP(v64_unziphi_8),
+ MAP(v64_unziplo_8),
+ MAP(v64_unziphi_16),
+ MAP(v64_unziplo_16),
+ MAP(v64_or),
+ MAP(v64_xor),
+ MAP(v64_and),
+ MAP(v64_andn),
+ MAP(v64_mullo_s16),
+ MAP(v64_mulhi_s16),
+ MAP(v64_mullo_s32),
+ MAP(v64_madd_s16),
+ MAP(v64_madd_us8),
+ MAP(v64_avg_u8),
+ MAP(v64_rdavg_u8),
+ MAP(v64_rdavg_u16),
+ MAP(v64_avg_u16),
+ MAP(v64_min_u8),
+ MAP(v64_max_u8),
+ MAP(v64_min_s8),
+ MAP(v64_max_s8),
+ MAP(v64_min_s16),
+ MAP(v64_max_s16),
+ MAP(v64_cmpgt_s8),
+ MAP(v64_cmplt_s8),
+ MAP(v64_cmpeq_8),
+ MAP(v64_cmpgt_s16),
+ MAP(v64_cmplt_s16),
+ MAP(v64_cmpeq_16),
+ MAP(v64_shuffle_8),
+ MAP(imm_v64_align<1>),
+ MAP(imm_v64_align<2>),
+ MAP(imm_v64_align<3>),
+ MAP(imm_v64_align<4>),
+ MAP(imm_v64_align<5>),
+ MAP(imm_v64_align<6>),
+ MAP(imm_v64_align<7>),
+ MAP(v64_abs_s8),
+ MAP(v64_abs_s16),
+ MAP(v64_unpacklo_u8_s16),
+ MAP(v64_unpackhi_u8_s16),
+ MAP(v64_unpacklo_s8_s16),
+ MAP(v64_unpackhi_s8_s16),
+ MAP(v64_unpacklo_u16_s32),
+ MAP(v64_unpacklo_s16_s32),
+ MAP(v64_unpackhi_u16_s32),
+ MAP(v64_unpackhi_s16_s32),
+ MAP(imm_v64_shr_n_byte<1>),
+ MAP(imm_v64_shr_n_byte<2>),
+ MAP(imm_v64_shr_n_byte<3>),
+ MAP(imm_v64_shr_n_byte<4>),
+ MAP(imm_v64_shr_n_byte<5>),
+ MAP(imm_v64_shr_n_byte<6>),
+ MAP(imm_v64_shr_n_byte<7>),
+ MAP(imm_v64_shl_n_byte<1>),
+ MAP(imm_v64_shl_n_byte<2>),
+ MAP(imm_v64_shl_n_byte<3>),
+ MAP(imm_v64_shl_n_byte<4>),
+ MAP(imm_v64_shl_n_byte<5>),
+ MAP(imm_v64_shl_n_byte<6>),
+ MAP(imm_v64_shl_n_byte<7>),
+ MAP(imm_v64_shl_n_8<1>),
+ MAP(imm_v64_shl_n_8<2>),
+ MAP(imm_v64_shl_n_8<3>),
+ MAP(imm_v64_shl_n_8<4>),
+ MAP(imm_v64_shl_n_8<5>),
+ MAP(imm_v64_shl_n_8<6>),
+ MAP(imm_v64_shl_n_8<7>),
+ MAP(imm_v64_shr_n_u8<1>),
+ MAP(imm_v64_shr_n_u8<2>),
+ MAP(imm_v64_shr_n_u8<3>),
+ MAP(imm_v64_shr_n_u8<4>),
+ MAP(imm_v64_shr_n_u8<5>),
+ MAP(imm_v64_shr_n_u8<6>),
+ MAP(imm_v64_shr_n_u8<7>),
+ MAP(imm_v64_shr_n_s8<1>),
+ MAP(imm_v64_shr_n_s8<2>),
+ MAP(imm_v64_shr_n_s8<3>),
+ MAP(imm_v64_shr_n_s8<4>),
+ MAP(imm_v64_shr_n_s8<5>),
+ MAP(imm_v64_shr_n_s8<6>),
+ MAP(imm_v64_shr_n_s8<7>),
+ MAP(imm_v64_shl_n_16<1>),
+ MAP(imm_v64_shl_n_16<2>),
+ MAP(imm_v64_shl_n_16<4>),
+ MAP(imm_v64_shl_n_16<6>),
+ MAP(imm_v64_shl_n_16<8>),
+ MAP(imm_v64_shl_n_16<10>),
+ MAP(imm_v64_shl_n_16<12>),
+ MAP(imm_v64_shl_n_16<14>),
+ MAP(imm_v64_shr_n_u16<1>),
+ MAP(imm_v64_shr_n_u16<2>),
+ MAP(imm_v64_shr_n_u16<4>),
+ MAP(imm_v64_shr_n_u16<6>),
+ MAP(imm_v64_shr_n_u16<8>),
+ MAP(imm_v64_shr_n_u16<10>),
+ MAP(imm_v64_shr_n_u16<12>),
+ MAP(imm_v64_shr_n_u16<14>),
+ MAP(imm_v64_shr_n_s16<1>),
+ MAP(imm_v64_shr_n_s16<2>),
+ MAP(imm_v64_shr_n_s16<4>),
+ MAP(imm_v64_shr_n_s16<6>),
+ MAP(imm_v64_shr_n_s16<8>),
+ MAP(imm_v64_shr_n_s16<10>),
+ MAP(imm_v64_shr_n_s16<12>),
+ MAP(imm_v64_shr_n_s16<14>),
+ MAP(imm_v64_shl_n_32<1>),
+ MAP(imm_v64_shl_n_32<4>),
+ MAP(imm_v64_shl_n_32<8>),
+ MAP(imm_v64_shl_n_32<12>),
+ MAP(imm_v64_shl_n_32<16>),
+ MAP(imm_v64_shl_n_32<20>),
+ MAP(imm_v64_shl_n_32<24>),
+ MAP(imm_v64_shl_n_32<28>),
+ MAP(imm_v64_shr_n_u32<1>),
+ MAP(imm_v64_shr_n_u32<4>),
+ MAP(imm_v64_shr_n_u32<8>),
+ MAP(imm_v64_shr_n_u32<12>),
+ MAP(imm_v64_shr_n_u32<16>),
+ MAP(imm_v64_shr_n_u32<20>),
+ MAP(imm_v64_shr_n_u32<24>),
+ MAP(imm_v64_shr_n_u32<28>),
+ MAP(imm_v64_shr_n_s32<1>),
+ MAP(imm_v64_shr_n_s32<4>),
+ MAP(imm_v64_shr_n_s32<8>),
+ MAP(imm_v64_shr_n_s32<12>),
+ MAP(imm_v64_shr_n_s32<16>),
+ MAP(imm_v64_shr_n_s32<20>),
+ MAP(imm_v64_shr_n_s32<24>),
+ MAP(imm_v64_shr_n_s32<28>),
+ MAP(v64_shl_8),
+ MAP(v64_shr_u8),
+ MAP(v64_shr_s8),
+ MAP(v64_shl_16),
+ MAP(v64_shr_u16),
+ MAP(v64_shr_s16),
+ MAP(v64_shl_32),
+ MAP(v64_shr_u32),
+ MAP(v64_shr_s32),
+ MAP(v64_hadd_u8),
+ MAP(v64_hadd_s16),
+ MAP(v64_dotp_s16),
+ MAP(v64_dotp_su8),
+ MAP(v64_u64),
+ MAP(v64_low_u32),
+ MAP(v64_high_u32),
+ MAP(v64_low_s32),
+ MAP(v64_high_s32),
+ MAP(v64_dup_8),
+ MAP(v64_dup_16),
+ MAP(v64_dup_32),
+ MAP(v64_from_32),
+ MAP(v64_zero),
+ MAP(v64_from_16),
+ MAP(v128_sad_u8),
+ MAP(v128_ssd_u8),
+ MAP(v128_sad_u16),
+ MAP(v128_ssd_s16),
+ MAP(v128_add_8),
+ MAP(v128_add_16),
+ MAP(v128_sadd_s8),
+ MAP(v128_sadd_u8),
+ MAP(v128_sadd_s16),
+ MAP(v128_add_32),
+ MAP(v128_add_64),
+ MAP(v128_sub_8),
+ MAP(v128_ssub_u8),
+ MAP(v128_ssub_s8),
+ MAP(v128_sub_16),
+ MAP(v128_ssub_s16),
+ MAP(v128_ssub_u16),
+ MAP(v128_sub_32),
+ MAP(v128_sub_64),
+ MAP(v128_ziplo_8),
+ MAP(v128_ziphi_8),
+ MAP(v128_ziplo_16),
+ MAP(v128_ziphi_16),
+ MAP(v128_ziplo_32),
+ MAP(v128_ziphi_32),
+ MAP(v128_ziplo_64),
+ MAP(v128_ziphi_64),
+ MAP(v128_unziphi_8),
+ MAP(v128_unziplo_8),
+ MAP(v128_unziphi_16),
+ MAP(v128_unziplo_16),
+ MAP(v128_unziphi_32),
+ MAP(v128_unziplo_32),
+ MAP(v128_pack_s32_u16),
+ MAP(v128_pack_s32_s16),
+ MAP(v128_pack_s16_u8),
+ MAP(v128_pack_s16_s8),
+ MAP(v128_or),
+ MAP(v128_xor),
+ MAP(v128_and),
+ MAP(v128_andn),
+ MAP(v128_mullo_s16),
+ MAP(v128_mulhi_s16),
+ MAP(v128_mullo_s32),
+ MAP(v128_madd_s16),
+ MAP(v128_madd_us8),
+ MAP(v128_avg_u8),
+ MAP(v128_rdavg_u8),
+ MAP(v128_rdavg_u16),
+ MAP(v128_avg_u16),
+ MAP(v128_min_u8),
+ MAP(v128_max_u8),
+ MAP(v128_min_s8),
+ MAP(v128_max_s8),
+ MAP(v128_min_s16),
+ MAP(v128_max_s16),
+ MAP(v128_min_s32),
+ MAP(v128_max_s32),
+ MAP(v128_cmpgt_s8),
+ MAP(v128_cmplt_s8),
+ MAP(v128_cmpeq_8),
+ MAP(v128_cmpgt_s16),
+ MAP(v128_cmpeq_16),
+ MAP(v128_cmplt_s16),
+ MAP(v128_cmpgt_s32),
+ MAP(v128_cmpeq_32),
+ MAP(v128_cmplt_s32),
+ MAP(v128_shuffle_8),
+ MAP(imm_v128_align<1>),
+ MAP(imm_v128_align<2>),
+ MAP(imm_v128_align<3>),
+ MAP(imm_v128_align<4>),
+ MAP(imm_v128_align<5>),
+ MAP(imm_v128_align<6>),
+ MAP(imm_v128_align<7>),
+ MAP(imm_v128_align<8>),
+ MAP(imm_v128_align<9>),
+ MAP(imm_v128_align<10>),
+ MAP(imm_v128_align<11>),
+ MAP(imm_v128_align<12>),
+ MAP(imm_v128_align<13>),
+ MAP(imm_v128_align<14>),
+ MAP(imm_v128_align<15>),
+ MAP(v128_abs_s8),
+ MAP(v128_abs_s16),
+ MAP(v128_padd_u8),
+ MAP(v128_padd_s16),
+ MAP(v128_unpacklo_u16_s32),
+ MAP(v128_unpacklo_s16_s32),
+ MAP(v128_unpackhi_u16_s32),
+ MAP(v128_unpackhi_s16_s32),
+ MAP(imm_v128_shr_n_byte<1>),
+ MAP(imm_v128_shr_n_byte<2>),
+ MAP(imm_v128_shr_n_byte<3>),
+ MAP(imm_v128_shr_n_byte<4>),
+ MAP(imm_v128_shr_n_byte<5>),
+ MAP(imm_v128_shr_n_byte<6>),
+ MAP(imm_v128_shr_n_byte<7>),
+ MAP(imm_v128_shr_n_byte<8>),
+ MAP(imm_v128_shr_n_byte<9>),
+ MAP(imm_v128_shr_n_byte<10>),
+ MAP(imm_v128_shr_n_byte<11>),
+ MAP(imm_v128_shr_n_byte<12>),
+ MAP(imm_v128_shr_n_byte<13>),
+ MAP(imm_v128_shr_n_byte<14>),
+ MAP(imm_v128_shr_n_byte<15>),
+ MAP(imm_v128_shl_n_byte<1>),
+ MAP(imm_v128_shl_n_byte<2>),
+ MAP(imm_v128_shl_n_byte<3>),
+ MAP(imm_v128_shl_n_byte<4>),
+ MAP(imm_v128_shl_n_byte<5>),
+ MAP(imm_v128_shl_n_byte<6>),
+ MAP(imm_v128_shl_n_byte<7>),
+ MAP(imm_v128_shl_n_byte<8>),
+ MAP(imm_v128_shl_n_byte<9>),
+ MAP(imm_v128_shl_n_byte<10>),
+ MAP(imm_v128_shl_n_byte<11>),
+ MAP(imm_v128_shl_n_byte<12>),
+ MAP(imm_v128_shl_n_byte<13>),
+ MAP(imm_v128_shl_n_byte<14>),
+ MAP(imm_v128_shl_n_byte<15>),
+ MAP(imm_v128_shl_n_8<1>),
+ MAP(imm_v128_shl_n_8<2>),
+ MAP(imm_v128_shl_n_8<3>),
+ MAP(imm_v128_shl_n_8<4>),
+ MAP(imm_v128_shl_n_8<5>),
+ MAP(imm_v128_shl_n_8<6>),
+ MAP(imm_v128_shl_n_8<7>),
+ MAP(imm_v128_shr_n_u8<1>),
+ MAP(imm_v128_shr_n_u8<2>),
+ MAP(imm_v128_shr_n_u8<3>),
+ MAP(imm_v128_shr_n_u8<4>),
+ MAP(imm_v128_shr_n_u8<5>),
+ MAP(imm_v128_shr_n_u8<6>),
+ MAP(imm_v128_shr_n_u8<7>),
+ MAP(imm_v128_shr_n_s8<1>),
+ MAP(imm_v128_shr_n_s8<2>),
+ MAP(imm_v128_shr_n_s8<3>),
+ MAP(imm_v128_shr_n_s8<4>),
+ MAP(imm_v128_shr_n_s8<5>),
+ MAP(imm_v128_shr_n_s8<6>),
+ MAP(imm_v128_shr_n_s8<7>),
+ MAP(imm_v128_shl_n_16<1>),
+ MAP(imm_v128_shl_n_16<2>),
+ MAP(imm_v128_shl_n_16<4>),
+ MAP(imm_v128_shl_n_16<6>),
+ MAP(imm_v128_shl_n_16<8>),
+ MAP(imm_v128_shl_n_16<10>),
+ MAP(imm_v128_shl_n_16<12>),
+ MAP(imm_v128_shl_n_16<14>),
+ MAP(imm_v128_shr_n_u16<1>),
+ MAP(imm_v128_shr_n_u16<2>),
+ MAP(imm_v128_shr_n_u16<4>),
+ MAP(imm_v128_shr_n_u16<6>),
+ MAP(imm_v128_shr_n_u16<8>),
+ MAP(imm_v128_shr_n_u16<10>),
+ MAP(imm_v128_shr_n_u16<12>),
+ MAP(imm_v128_shr_n_u16<14>),
+ MAP(imm_v128_shr_n_s16<1>),
+ MAP(imm_v128_shr_n_s16<2>),
+ MAP(imm_v128_shr_n_s16<4>),
+ MAP(imm_v128_shr_n_s16<6>),
+ MAP(imm_v128_shr_n_s16<8>),
+ MAP(imm_v128_shr_n_s16<10>),
+ MAP(imm_v128_shr_n_s16<12>),
+ MAP(imm_v128_shr_n_s16<14>),
+ MAP(imm_v128_shl_n_32<1>),
+ MAP(imm_v128_shl_n_32<4>),
+ MAP(imm_v128_shl_n_32<8>),
+ MAP(imm_v128_shl_n_32<12>),
+ MAP(imm_v128_shl_n_32<16>),
+ MAP(imm_v128_shl_n_32<20>),
+ MAP(imm_v128_shl_n_32<24>),
+ MAP(imm_v128_shl_n_32<28>),
+ MAP(imm_v128_shr_n_u32<1>),
+ MAP(imm_v128_shr_n_u32<4>),
+ MAP(imm_v128_shr_n_u32<8>),
+ MAP(imm_v128_shr_n_u32<12>),
+ MAP(imm_v128_shr_n_u32<16>),
+ MAP(imm_v128_shr_n_u32<20>),
+ MAP(imm_v128_shr_n_u32<24>),
+ MAP(imm_v128_shr_n_u32<28>),
+ MAP(imm_v128_shr_n_s32<1>),
+ MAP(imm_v128_shr_n_s32<4>),
+ MAP(imm_v128_shr_n_s32<8>),
+ MAP(imm_v128_shr_n_s32<12>),
+ MAP(imm_v128_shr_n_s32<16>),
+ MAP(imm_v128_shr_n_s32<20>),
+ MAP(imm_v128_shr_n_s32<24>),
+ MAP(imm_v128_shr_n_s32<28>),
+ MAP(imm_v128_shl_n_64<1>),
+ MAP(imm_v128_shl_n_64<4>),
+ MAP(imm_v128_shl_n_64<8>),
+ MAP(imm_v128_shl_n_64<12>),
+ MAP(imm_v128_shl_n_64<16>),
+ MAP(imm_v128_shl_n_64<20>),
+ MAP(imm_v128_shl_n_64<24>),
+ MAP(imm_v128_shl_n_64<28>),
+ MAP(imm_v128_shl_n_64<32>),
+ MAP(imm_v128_shl_n_64<36>),
+ MAP(imm_v128_shl_n_64<40>),
+ MAP(imm_v128_shl_n_64<44>),
+ MAP(imm_v128_shl_n_64<48>),
+ MAP(imm_v128_shl_n_64<52>),
+ MAP(imm_v128_shl_n_64<56>),
+ MAP(imm_v128_shl_n_64<60>),
+ MAP(imm_v128_shr_n_u64<1>),
+ MAP(imm_v128_shr_n_u64<4>),
+ MAP(imm_v128_shr_n_u64<8>),
+ MAP(imm_v128_shr_n_u64<12>),
+ MAP(imm_v128_shr_n_u64<16>),
+ MAP(imm_v128_shr_n_u64<20>),
+ MAP(imm_v128_shr_n_u64<24>),
+ MAP(imm_v128_shr_n_u64<28>),
+ MAP(imm_v128_shr_n_u64<32>),
+ MAP(imm_v128_shr_n_u64<36>),
+ MAP(imm_v128_shr_n_u64<40>),
+ MAP(imm_v128_shr_n_u64<44>),
+ MAP(imm_v128_shr_n_u64<48>),
+ MAP(imm_v128_shr_n_u64<52>),
+ MAP(imm_v128_shr_n_u64<56>),
+ MAP(imm_v128_shr_n_u64<60>),
+ MAP(imm_v128_shr_n_s64<1>),
+ MAP(imm_v128_shr_n_s64<4>),
+ MAP(imm_v128_shr_n_s64<8>),
+ MAP(imm_v128_shr_n_s64<12>),
+ MAP(imm_v128_shr_n_s64<16>),
+ MAP(imm_v128_shr_n_s64<20>),
+ MAP(imm_v128_shr_n_s64<24>),
+ MAP(imm_v128_shr_n_s64<28>),
+ MAP(imm_v128_shr_n_s64<32>),
+ MAP(imm_v128_shr_n_s64<36>),
+ MAP(imm_v128_shr_n_s64<40>),
+ MAP(imm_v128_shr_n_s64<44>),
+ MAP(imm_v128_shr_n_s64<48>),
+ MAP(imm_v128_shr_n_s64<52>),
+ MAP(imm_v128_shr_n_s64<56>),
+ MAP(imm_v128_shr_n_s64<60>),
+ MAP(v128_from_v64),
+ MAP(v128_zip_8),
+ MAP(v128_zip_16),
+ MAP(v128_zip_32),
+ MAP(v128_mul_s16),
+ MAP(v128_unpack_u8_s16),
+ MAP(v128_unpack_s8_s16),
+ MAP(v128_unpack_u16_s32),
+ MAP(v128_unpack_s16_s32),
+ MAP(v128_shl_8),
+ MAP(v128_shr_u8),
+ MAP(v128_shr_s8),
+ MAP(v128_shl_16),
+ MAP(v128_shr_u16),
+ MAP(v128_shr_s16),
+ MAP(v128_shl_32),
+ MAP(v128_shr_u32),
+ MAP(v128_shr_s32),
+ MAP(v128_shl_64),
+ MAP(v128_shr_u64),
+ MAP(v128_shr_s64),
+ MAP(v128_hadd_u8),
+ MAP(v128_dotp_su8),
+ MAP(v128_dotp_s16),
+ MAP(v128_dotp_s32),
+ MAP(v128_low_u32),
+ MAP(v128_low_v64),
+ MAP(v128_high_v64),
+ MAP(v128_from_64),
+ MAP(v128_from_32),
+ MAP(v128_movemask_8),
+ MAP(v128_zero),
+ MAP(v128_dup_8),
+ MAP(v128_dup_16),
+ MAP(v128_dup_32),
+ MAP(v128_dup_64),
+ MAP(v128_unpacklo_u8_s16),
+ MAP(v128_unpackhi_u8_s16),
+ MAP(v128_unpacklo_s8_s16),
+ MAP(v128_unpackhi_s8_s16),
+ MAP(v128_blend_8),
+ MAP(u32_load_unaligned),
+ MAP(u32_store_unaligned),
+ MAP(v64_load_unaligned),
+ MAP(v64_store_unaligned),
+ MAP(v128_load_unaligned),
+ MAP(v128_store_unaligned),
+ MAP(v256_sad_u8),
+ MAP(v256_ssd_u8),
+ MAP(v256_sad_u16),
+ MAP(v256_ssd_s16),
+ MAP(v256_hadd_u8),
+ MAP(v256_low_u64),
+ MAP(v256_dotp_su8),
+ MAP(v256_dotp_s16),
+ MAP(v256_dotp_s32),
+ MAP(v256_add_8),
+ MAP(v256_add_16),
+ MAP(v256_sadd_s8),
+ MAP(v256_sadd_u8),
+ MAP(v256_sadd_s16),
+ MAP(v256_add_32),
+ MAP(v256_add_64),
+ MAP(v256_sub_8),
+ MAP(v256_ssub_u8),
+ MAP(v256_ssub_s8),
+ MAP(v256_sub_16),
+ MAP(v256_ssub_u16),
+ MAP(v256_ssub_s16),
+ MAP(v256_sub_32),
+ MAP(v256_sub_64),
+ MAP(v256_ziplo_8),
+ MAP(v256_ziphi_8),
+ MAP(v256_ziplo_16),
+ MAP(v256_ziphi_16),
+ MAP(v256_ziplo_32),
+ MAP(v256_ziphi_32),
+ MAP(v256_ziplo_64),
+ MAP(v256_ziphi_64),
+ MAP(v256_unziphi_8),
+ MAP(v256_unziplo_8),
+ MAP(v256_unziphi_16),
+ MAP(v256_unziplo_16),
+ MAP(v256_unziphi_32),
+ MAP(v256_unziplo_32),
+ MAP(v256_unziphi_64),
+ MAP(v256_unziplo_64),
+ MAP(v256_pack_s32_u16),
+ MAP(v256_pack_s32_s16),
+ MAP(v256_pack_s16_u8),
+ MAP(v256_pack_s16_s8),
+ MAP(v256_or),
+ MAP(v256_xor),
+ MAP(v256_and),
+ MAP(v256_andn),
+ MAP(v256_mullo_s16),
+ MAP(v256_mulhi_s16),
+ MAP(v256_mullo_s32),
+ MAP(v256_madd_s16),
+ MAP(v256_madd_us8),
+ MAP(v256_avg_u8),
+ MAP(v256_rdavg_u8),
+ MAP(v256_rdavg_u16),
+ MAP(v256_avg_u16),
+ MAP(v256_min_u8),
+ MAP(v256_max_u8),
+ MAP(v256_min_s8),
+ MAP(v256_max_s8),
+ MAP(v256_min_s16),
+ MAP(v256_max_s16),
+ MAP(v256_min_s32),
+ MAP(v256_max_s32),
+ MAP(v256_cmpgt_s8),
+ MAP(v256_cmplt_s8),
+ MAP(v256_cmpeq_8),
+ MAP(v256_cmpgt_s16),
+ MAP(v256_cmplt_s16),
+ MAP(v256_cmpeq_16),
+ MAP(v256_cmpgt_s32),
+ MAP(v256_cmplt_s32),
+ MAP(v256_cmpeq_32),
+ MAP(v256_shuffle_8),
+ MAP(v256_pshuffle_8),
+ MAP(v256_wideshuffle_8),
+ MAP(imm_v256_align<1>),
+ MAP(imm_v256_align<2>),
+ MAP(imm_v256_align<3>),
+ MAP(imm_v256_align<4>),
+ MAP(imm_v256_align<5>),
+ MAP(imm_v256_align<6>),
+ MAP(imm_v256_align<7>),
+ MAP(imm_v256_align<8>),
+ MAP(imm_v256_align<9>),
+ MAP(imm_v256_align<10>),
+ MAP(imm_v256_align<11>),
+ MAP(imm_v256_align<12>),
+ MAP(imm_v256_align<13>),
+ MAP(imm_v256_align<14>),
+ MAP(imm_v256_align<15>),
+ MAP(imm_v256_align<16>),
+ MAP(imm_v256_align<17>),
+ MAP(imm_v256_align<18>),
+ MAP(imm_v256_align<19>),
+ MAP(imm_v256_align<20>),
+ MAP(imm_v256_align<21>),
+ MAP(imm_v256_align<22>),
+ MAP(imm_v256_align<23>),
+ MAP(imm_v256_align<24>),
+ MAP(imm_v256_align<25>),
+ MAP(imm_v256_align<26>),
+ MAP(imm_v256_align<27>),
+ MAP(imm_v256_align<28>),
+ MAP(imm_v256_align<29>),
+ MAP(imm_v256_align<30>),
+ MAP(imm_v256_align<31>),
+ MAP(v256_from_v128),
+ MAP(v256_zip_8),
+ MAP(v256_zip_16),
+ MAP(v256_zip_32),
+ MAP(v256_mul_s16),
+ MAP(v256_unpack_u8_s16),
+ MAP(v256_unpack_s8_s16),
+ MAP(v256_unpack_u16_s32),
+ MAP(v256_unpack_s16_s32),
+ MAP(v256_shl_8),
+ MAP(v256_shr_u8),
+ MAP(v256_shr_s8),
+ MAP(v256_shl_16),
+ MAP(v256_shr_u16),
+ MAP(v256_shr_s16),
+ MAP(v256_shl_32),
+ MAP(v256_shr_u32),
+ MAP(v256_shr_s32),
+ MAP(v256_shl_64),
+ MAP(v256_shr_u64),
+ MAP(v256_shr_s64),
+ MAP(v256_abs_s8),
+ MAP(v256_abs_s16),
+ MAP(v256_padd_u8),
+ MAP(v256_padd_s16),
+ MAP(v256_unpacklo_u16_s32),
+ MAP(v256_unpacklo_s16_s32),
+ MAP(v256_unpackhi_u16_s32),
+ MAP(v256_unpackhi_s16_s32),
+ MAP(imm_v256_shr_n_word<1>),
+ MAP(imm_v256_shr_n_word<2>),
+ MAP(imm_v256_shr_n_word<3>),
+ MAP(imm_v256_shr_n_word<4>),
+ MAP(imm_v256_shr_n_word<5>),
+ MAP(imm_v256_shr_n_word<6>),
+ MAP(imm_v256_shr_n_word<7>),
+ MAP(imm_v256_shr_n_word<8>),
+ MAP(imm_v256_shr_n_word<9>),
+ MAP(imm_v256_shr_n_word<10>),
+ MAP(imm_v256_shr_n_word<11>),
+ MAP(imm_v256_shr_n_word<12>),
+ MAP(imm_v256_shr_n_word<13>),
+ MAP(imm_v256_shr_n_word<14>),
+ MAP(imm_v256_shr_n_word<15>),
+ MAP(imm_v256_shl_n_word<1>),
+ MAP(imm_v256_shl_n_word<2>),
+ MAP(imm_v256_shl_n_word<3>),
+ MAP(imm_v256_shl_n_word<4>),
+ MAP(imm_v256_shl_n_word<5>),
+ MAP(imm_v256_shl_n_word<6>),
+ MAP(imm_v256_shl_n_word<7>),
+ MAP(imm_v256_shl_n_word<8>),
+ MAP(imm_v256_shl_n_word<9>),
+ MAP(imm_v256_shl_n_word<10>),
+ MAP(imm_v256_shl_n_word<11>),
+ MAP(imm_v256_shl_n_word<12>),
+ MAP(imm_v256_shl_n_word<13>),
+ MAP(imm_v256_shl_n_word<14>),
+ MAP(imm_v256_shl_n_word<15>),
+ MAP(imm_v256_shr_n_byte<1>),
+ MAP(imm_v256_shr_n_byte<2>),
+ MAP(imm_v256_shr_n_byte<3>),
+ MAP(imm_v256_shr_n_byte<4>),
+ MAP(imm_v256_shr_n_byte<5>),
+ MAP(imm_v256_shr_n_byte<6>),
+ MAP(imm_v256_shr_n_byte<7>),
+ MAP(imm_v256_shr_n_byte<8>),
+ MAP(imm_v256_shr_n_byte<9>),
+ MAP(imm_v256_shr_n_byte<10>),
+ MAP(imm_v256_shr_n_byte<11>),
+ MAP(imm_v256_shr_n_byte<12>),
+ MAP(imm_v256_shr_n_byte<13>),
+ MAP(imm_v256_shr_n_byte<14>),
+ MAP(imm_v256_shr_n_byte<15>),
+ MAP(imm_v256_shr_n_byte<16>),
+ MAP(imm_v256_shr_n_byte<17>),
+ MAP(imm_v256_shr_n_byte<18>),
+ MAP(imm_v256_shr_n_byte<19>),
+ MAP(imm_v256_shr_n_byte<20>),
+ MAP(imm_v256_shr_n_byte<21>),
+ MAP(imm_v256_shr_n_byte<22>),
+ MAP(imm_v256_shr_n_byte<23>),
+ MAP(imm_v256_shr_n_byte<24>),
+ MAP(imm_v256_shr_n_byte<25>),
+ MAP(imm_v256_shr_n_byte<26>),
+ MAP(imm_v256_shr_n_byte<27>),
+ MAP(imm_v256_shr_n_byte<28>),
+ MAP(imm_v256_shr_n_byte<29>),
+ MAP(imm_v256_shr_n_byte<30>),
+ MAP(imm_v256_shr_n_byte<31>),
+ MAP(imm_v256_shl_n_byte<1>),
+ MAP(imm_v256_shl_n_byte<2>),
+ MAP(imm_v256_shl_n_byte<3>),
+ MAP(imm_v256_shl_n_byte<4>),
+ MAP(imm_v256_shl_n_byte<5>),
+ MAP(imm_v256_shl_n_byte<6>),
+ MAP(imm_v256_shl_n_byte<7>),
+ MAP(imm_v256_shl_n_byte<8>),
+ MAP(imm_v256_shl_n_byte<9>),
+ MAP(imm_v256_shl_n_byte<10>),
+ MAP(imm_v256_shl_n_byte<11>),
+ MAP(imm_v256_shl_n_byte<12>),
+ MAP(imm_v256_shl_n_byte<13>),
+ MAP(imm_v256_shl_n_byte<14>),
+ MAP(imm_v256_shl_n_byte<15>),
+ MAP(imm_v256_shl_n_byte<16>),
+ MAP(imm_v256_shl_n_byte<17>),
+ MAP(imm_v256_shl_n_byte<18>),
+ MAP(imm_v256_shl_n_byte<19>),
+ MAP(imm_v256_shl_n_byte<20>),
+ MAP(imm_v256_shl_n_byte<21>),
+ MAP(imm_v256_shl_n_byte<22>),
+ MAP(imm_v256_shl_n_byte<23>),
+ MAP(imm_v256_shl_n_byte<24>),
+ MAP(imm_v256_shl_n_byte<25>),
+ MAP(imm_v256_shl_n_byte<26>),
+ MAP(imm_v256_shl_n_byte<27>),
+ MAP(imm_v256_shl_n_byte<28>),
+ MAP(imm_v256_shl_n_byte<29>),
+ MAP(imm_v256_shl_n_byte<30>),
+ MAP(imm_v256_shl_n_byte<31>),
+ MAP(imm_v256_shl_n_8<1>),
+ MAP(imm_v256_shl_n_8<2>),
+ MAP(imm_v256_shl_n_8<3>),
+ MAP(imm_v256_shl_n_8<4>),
+ MAP(imm_v256_shl_n_8<5>),
+ MAP(imm_v256_shl_n_8<6>),
+ MAP(imm_v256_shl_n_8<7>),
+ MAP(imm_v256_shr_n_u8<1>),
+ MAP(imm_v256_shr_n_u8<2>),
+ MAP(imm_v256_shr_n_u8<3>),
+ MAP(imm_v256_shr_n_u8<4>),
+ MAP(imm_v256_shr_n_u8<5>),
+ MAP(imm_v256_shr_n_u8<6>),
+ MAP(imm_v256_shr_n_u8<7>),
+ MAP(imm_v256_shr_n_s8<1>),
+ MAP(imm_v256_shr_n_s8<2>),
+ MAP(imm_v256_shr_n_s8<3>),
+ MAP(imm_v256_shr_n_s8<4>),
+ MAP(imm_v256_shr_n_s8<5>),
+ MAP(imm_v256_shr_n_s8<6>),
+ MAP(imm_v256_shr_n_s8<7>),
+ MAP(imm_v256_shl_n_16<1>),
+ MAP(imm_v256_shl_n_16<2>),
+ MAP(imm_v256_shl_n_16<4>),
+ MAP(imm_v256_shl_n_16<6>),
+ MAP(imm_v256_shl_n_16<8>),
+ MAP(imm_v256_shl_n_16<10>),
+ MAP(imm_v256_shl_n_16<12>),
+ MAP(imm_v256_shl_n_16<14>),
+ MAP(imm_v256_shr_n_u16<1>),
+ MAP(imm_v256_shr_n_u16<2>),
+ MAP(imm_v256_shr_n_u16<4>),
+ MAP(imm_v256_shr_n_u16<6>),
+ MAP(imm_v256_shr_n_u16<8>),
+ MAP(imm_v256_shr_n_u16<10>),
+ MAP(imm_v256_shr_n_u16<12>),
+ MAP(imm_v256_shr_n_u16<14>),
+ MAP(imm_v256_shr_n_s16<1>),
+ MAP(imm_v256_shr_n_s16<2>),
+ MAP(imm_v256_shr_n_s16<4>),
+ MAP(imm_v256_shr_n_s16<6>),
+ MAP(imm_v256_shr_n_s16<8>),
+ MAP(imm_v256_shr_n_s16<10>),
+ MAP(imm_v256_shr_n_s16<12>),
+ MAP(imm_v256_shr_n_s16<14>),
+ MAP(imm_v256_shl_n_32<1>),
+ MAP(imm_v256_shl_n_32<4>),
+ MAP(imm_v256_shl_n_32<8>),
+ MAP(imm_v256_shl_n_32<12>),
+ MAP(imm_v256_shl_n_32<16>),
+ MAP(imm_v256_shl_n_32<20>),
+ MAP(imm_v256_shl_n_32<24>),
+ MAP(imm_v256_shl_n_32<28>),
+ MAP(imm_v256_shr_n_u32<1>),
+ MAP(imm_v256_shr_n_u32<4>),
+ MAP(imm_v256_shr_n_u32<8>),
+ MAP(imm_v256_shr_n_u32<12>),
+ MAP(imm_v256_shr_n_u32<16>),
+ MAP(imm_v256_shr_n_u32<20>),
+ MAP(imm_v256_shr_n_u32<24>),
+ MAP(imm_v256_shr_n_u32<28>),
+ MAP(imm_v256_shr_n_s32<1>),
+ MAP(imm_v256_shr_n_s32<4>),
+ MAP(imm_v256_shr_n_s32<8>),
+ MAP(imm_v256_shr_n_s32<12>),
+ MAP(imm_v256_shr_n_s32<16>),
+ MAP(imm_v256_shr_n_s32<20>),
+ MAP(imm_v256_shr_n_s32<24>),
+ MAP(imm_v256_shr_n_s32<28>),
+ MAP(imm_v256_shl_n_64<1>),
+ MAP(imm_v256_shl_n_64<4>),
+ MAP(imm_v256_shl_n_64<8>),
+ MAP(imm_v256_shl_n_64<12>),
+ MAP(imm_v256_shl_n_64<16>),
+ MAP(imm_v256_shl_n_64<20>),
+ MAP(imm_v256_shl_n_64<24>),
+ MAP(imm_v256_shl_n_64<28>),
+ MAP(imm_v256_shl_n_64<32>),
+ MAP(imm_v256_shl_n_64<36>),
+ MAP(imm_v256_shl_n_64<40>),
+ MAP(imm_v256_shl_n_64<44>),
+ MAP(imm_v256_shl_n_64<48>),
+ MAP(imm_v256_shl_n_64<52>),
+ MAP(imm_v256_shl_n_64<56>),
+ MAP(imm_v256_shl_n_64<60>),
+ MAP(imm_v256_shr_n_u64<1>),
+ MAP(imm_v256_shr_n_u64<4>),
+ MAP(imm_v256_shr_n_u64<8>),
+ MAP(imm_v256_shr_n_u64<12>),
+ MAP(imm_v256_shr_n_u64<16>),
+ MAP(imm_v256_shr_n_u64<20>),
+ MAP(imm_v256_shr_n_u64<24>),
+ MAP(imm_v256_shr_n_u64<28>),
+ MAP(imm_v256_shr_n_u64<32>),
+ MAP(imm_v256_shr_n_u64<36>),
+ MAP(imm_v256_shr_n_u64<40>),
+ MAP(imm_v256_shr_n_u64<44>),
+ MAP(imm_v256_shr_n_u64<48>),
+ MAP(imm_v256_shr_n_u64<52>),
+ MAP(imm_v256_shr_n_u64<56>),
+ MAP(imm_v256_shr_n_u64<60>),
+ MAP(imm_v256_shr_n_s64<1>),
+ MAP(imm_v256_shr_n_s64<4>),
+ MAP(imm_v256_shr_n_s64<8>),
+ MAP(imm_v256_shr_n_s64<12>),
+ MAP(imm_v256_shr_n_s64<16>),
+ MAP(imm_v256_shr_n_s64<20>),
+ MAP(imm_v256_shr_n_s64<24>),
+ MAP(imm_v256_shr_n_s64<28>),
+ MAP(imm_v256_shr_n_s64<32>),
+ MAP(imm_v256_shr_n_s64<36>),
+ MAP(imm_v256_shr_n_s64<40>),
+ MAP(imm_v256_shr_n_s64<44>),
+ MAP(imm_v256_shr_n_s64<48>),
+ MAP(imm_v256_shr_n_s64<52>),
+ MAP(imm_v256_shr_n_s64<56>),
+ MAP(imm_v256_shr_n_s64<60>),
+ MAP(v256_movemask_8),
+ MAP(v256_zero),
+ MAP(v256_dup_8),
+ MAP(v256_dup_16),
+ MAP(v256_dup_32),
+ MAP(v256_dup_64),
+ MAP(v256_low_u32),
+ MAP(v256_low_v64),
+ MAP(v256_from_64),
+ MAP(v256_from_v64),
+ MAP(v256_ziplo_128),
+ MAP(v256_ziphi_128),
+ MAP(v256_unpacklo_u8_s16),
+ MAP(v256_unpackhi_u8_s16),
+ MAP(v256_unpacklo_s8_s16),
+ MAP(v256_unpackhi_s8_s16),
+ MAP(v256_blend_8),
+ { nullptr, nullptr, nullptr } };
+#undef MAP
+
+// Looks up the intrinsic called |name| in the mapping table and hands back
+// both implementations: the C reference through |ref| and the machine
+// tuned version through |simd|.  The functions depend on machine tuned
+// types, so test instantiations that are not machine tuned cannot refer
+// to them directly; they refer to them by name and the mapping is done
+// here.  If |name| is not in the table the scan stops on the terminating
+// all-null entry, so both outputs are set to nullptr.
+void Map(const char *name, fptr *ref, fptr *simd) {
+  unsigned int idx = 0;
+  while (m[idx].name != nullptr && strcmp(name, m[idx].name) != 0) ++idx;
+
+  *ref = m[idx].ref;
+  *simd = m[idx].simd;
+}
+
+// Formats the |size| bytes at |a| as a single hexadecimal string ("0x"
+// followed by two lower-case digits per byte) for the mismatch messages
+// produced by TestSimd1Arg, TestSimd2Args and TestSimd3Args.  Unless
+// CONFIG_BIG_ENDIAN is set the bytes are emitted from the highest address
+// down to the lowest; otherwise they are emitted in memory order.
+std::string Print(const uint8_t *a, int size) {
+  static const char kHexDigits[] = "0123456789abcdef";
+  std::string text("0x");
+  for (int pos = 0; pos < size; ++pos) {
+    const int index = CONFIG_BIG_ENDIAN ? pos : size - 1 - pos;
+    const uint8_t value = a[index];
+    // Same as snprintf(..., ..., "%02x", value)
+    text.push_back(kHexDigits[value >> 4]);
+    text.push_back(kHexDigits[value & 15]);
+  }
+
+  return text;
+}
+
+// Restricts the argument bytes in |s| for TestSimd1Arg, TestSimd2Args and
+// TestSimd3Args.  The |size|-byte buffer is treated as consecutive
+// |maskwidth|-bit elements and each element is ANDed with |mask|.  A width
+// of 0 leaves the buffer untouched; any width other than 0, 8, 16, 32 or 64
+// raises a test failure.  For the 16, 32 and 64 bit widths |s| is expected
+// to be aligned to the element size (checked with assert).
+void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
+  if (maskwidth == 0) {
+    // Nothing to restrict.
+    return;
+  }
+
+  if (maskwidth == 8) {
+    for (int n = 0; n < size; ++n) s[n] &= mask;
+  } else if (maskwidth == 16) {
+    assert((reinterpret_cast<uintptr_t>(s) & 1) == 0);
+    uint16_t *const elems = reinterpret_cast<uint16_t *>(s);
+    for (int n = 0; n < size / 2; ++n) elems[n] &= mask;
+  } else if (maskwidth == 32) {
+    assert((reinterpret_cast<uintptr_t>(s) & 3) == 0);
+    uint32_t *const elems = reinterpret_cast<uint32_t *>(s);
+    for (int n = 0; n < size / 4; ++n) elems[n] &= mask;
+  } else if (maskwidth == 64) {
+    assert((reinterpret_cast<uintptr_t>(s) & 7) == 0);
+    uint64_t *const elems = reinterpret_cast<uint64_t *>(s);
+    for (int n = 0; n < size / 8; ++n) elems[n] &= mask;
+  } else {
+    FAIL() << "Unsupported mask width";
+  }
+}
+
+// We need some extra load/store functions
+void u64_store_aligned(void *p, uint64_t a) {
+  // Convert the scalar with v64_from_64() so the v64 store can be reused.
+  const v64 packed = v64_from_64(a);
+  v64_store_aligned(p, packed);
+}
+void s32_store_aligned(void *p, int32_t a) {
+  // Signed variant: forward to the unsigned 32-bit store.
+  const uint32_t bits = static_cast<uint32_t>(a);
+  u32_store_aligned(p, bits);
+}
+void s64_store_aligned(void *p, int64_t a) {
+  // Signed variant: cast to unsigned, then reuse the v64 store.
+  const uint64_t bits = static_cast<uint64_t>(a);
+  const v64 packed = v64_from_64(bits);
+  v64_store_aligned(p, packed);
+}
+
+void c_u64_store_aligned(void *p, uint64_t a) {
+  // C reference counterpart of u64_store_aligned().
+  const c_v64 packed = c_v64_from_64(a);
+  c_v64_store_aligned(p, packed);
+}
+
+void c_s32_store_aligned(void *p, int32_t a) {
+  // C reference counterpart of s32_store_aligned().
+  const uint32_t bits = static_cast<uint32_t>(a);
+  c_u32_store_aligned(p, bits);
+}
+
+void c_s64_store_aligned(void *p, int64_t a) {
+  // C reference counterpart of s64_store_aligned().
+  const uint64_t bits = static_cast<uint64_t>(a);
+  const c_v64 packed = c_v64_from_64(bits);
+  c_v64_store_aligned(p, packed);
+}
+
+uint64_t u64_load_aligned(const void *p) {
+  // Load through the v64 loader and convert the result with v64_u64().
+  const v64 loaded = v64_load_aligned(p);
+  return v64_u64(loaded);
+}
+uint16_t u16_load_aligned(const void *p) {
+  // Plain scalar read of the 16-bit value at |p|.
+  const uint16_t *const src = static_cast<const uint16_t *>(p);
+  return *src;
+}
+uint8_t u8_load_aligned(const void *p) {
+  // Plain scalar read of the byte at |p|.
+  const uint8_t *const src = static_cast<const uint8_t *>(p);
+  return *src;
+}
+
+uint64_t c_u64_load_aligned(const void *p) {
+  // C reference counterpart of u64_load_aligned().
+  const c_v64 loaded = c_v64_load_aligned(p);
+  return c_v64_u64(loaded);
+}
+uint16_t c_u16_load_aligned(const void *p) {
+  // Plain scalar read of the 16-bit value at |p|.
+  const uint16_t *const src = static_cast<const uint16_t *>(p);
+  return *src;
+}
+uint8_t c_u8_load_aligned(const void *p) {
+  // Plain scalar read of the byte at |p|.
+  const uint8_t *const src = static_cast<const uint8_t *>(p);
+  return *src;
+}
+
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args run an intrinsic
+// taking 1, 2 or 3 arguments and its C reference on the same input and
+// compare the bytes each of them stores.  Ideally the loads and stores
+// would have been template parameters, but v64 and v128 can be typedef'ed
+// to the same type (which is the case on x86), and then the v64 and v128
+// versions could not both be instantiated.  The return and argument types,
+// including those of the C equivalent, which always differ, are used as
+// template parameters instead.  The functions are passed in untyped and
+// go through a cast here, which avoids matching errors in the branches
+// eliminated by the typeid tests in the calling function.
+template <typename Ret, typename Arg, typename CRet, typename CArg>
+int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
+                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
+  // Typed signatures that the untyped function pointers are restored to.
+  typedef void (*StoreFn)(void *, Ret);
+  typedef Arg (*LoadFn)(const void *);
+  typedef Ret (*SimdFn)(Arg);
+  typedef void (*CStoreFn)(void *, CRet);
+  typedef CArg (*CLoadFn)(const void *);
+  typedef CRet (*CSimdFn)(CArg);
+
+  const StoreFn typed_store = reinterpret_cast<StoreFn>(store);
+  const LoadFn typed_load = reinterpret_cast<LoadFn>(load);
+  const SimdFn typed_simd = reinterpret_cast<SimdFn>(simd);
+  const CStoreFn typed_c_store = reinterpret_cast<CStoreFn>(c_store);
+  const CLoadFn typed_c_load = reinterpret_cast<CLoadFn>(c_load);
+  const CSimdFn typed_c_simd = reinterpret_cast<CSimdFn>(c_simd);
+
+  // Run the reference first, then the intrinsic, on the same input.
+  typed_c_store(ref_d, typed_c_simd(typed_c_load(a)));
+  typed_store(d, typed_simd(typed_load(a)));
+
+  // 0 when the sizeof(CRet) result bytes are identical, non-zero otherwise.
+  const int difference = memcmp(ref_d, d, sizeof(CRet));
+  return difference;
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename CRet,
+          typename CArg1, typename CArg2>
+int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
+                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
+                     void *ref_d, const void *a, const void *b) {
+  // Two-argument comparison: |a| and |b| are loaded with load1/load2 (and
+  // their C counterparts), fed to the intrinsic and to the reference, and
+  // the stored results at |d| and |ref_d| are compared.
+
+  // Typed signatures that the untyped function pointers are restored to.
+  typedef void (*StoreFn)(void *, Ret);
+  typedef Arg1 (*Load1Fn)(const void *);
+  typedef Arg2 (*Load2Fn)(const void *);
+  typedef Ret (*SimdFn)(Arg1, Arg2);
+  typedef void (*CStoreFn)(void *, CRet);
+  typedef CArg1 (*CLoad1Fn)(const void *);
+  typedef CArg2 (*CLoad2Fn)(const void *);
+  typedef CRet (*CSimdFn)(CArg1, CArg2);
+
+  const StoreFn typed_store = reinterpret_cast<StoreFn>(store);
+  const Load1Fn typed_load1 = reinterpret_cast<Load1Fn>(load1);
+  const Load2Fn typed_load2 = reinterpret_cast<Load2Fn>(load2);
+  const SimdFn typed_simd = reinterpret_cast<SimdFn>(simd);
+  const CStoreFn typed_c_store = reinterpret_cast<CStoreFn>(c_store);
+  const CLoad1Fn typed_c_load1 = reinterpret_cast<CLoad1Fn>(c_load1);
+  const CLoad2Fn typed_c_load2 = reinterpret_cast<CLoad2Fn>(c_load2);
+  const CSimdFn typed_c_simd = reinterpret_cast<CSimdFn>(c_simd);
+
+  // Run the reference first, then the intrinsic, on the same inputs.
+  typed_c_store(ref_d, typed_c_simd(typed_c_load1(a), typed_c_load2(b)));
+  typed_store(d, typed_simd(typed_load1(a), typed_load2(b)));
+
+  // 0 when the sizeof(CRet) result bytes are identical, non-zero otherwise.
+  const int difference = memcmp(ref_d, d, sizeof(CRet));
+  return difference;
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  // Three-argument comparison: |a|, |b| and |c| are loaded with
+  // load1/load2/load3 (and their C counterparts), fed to the intrinsic and
+  // to the reference, and the stored results at |d| and |ref_d| are
+  // compared.
+
+  // Typed signatures that the untyped function pointers are restored to.
+  typedef void (*StoreFn)(void *, Ret);
+  typedef Arg1 (*Load1Fn)(const void *);
+  typedef Arg2 (*Load2Fn)(const void *);
+  typedef Arg3 (*Load3Fn)(const void *);
+  typedef Ret (*SimdFn)(Arg1, Arg2, Arg3);
+  typedef void (*CStoreFn)(void *, CRet);
+  typedef CArg1 (*CLoad1Fn)(const void *);
+  typedef CArg2 (*CLoad2Fn)(const void *);
+  typedef CArg3 (*CLoad3Fn)(const void *);
+  typedef CRet (*CSimdFn)(CArg1, CArg2, CArg3);
+
+  const StoreFn typed_store = reinterpret_cast<StoreFn>(store);
+  const Load1Fn typed_load1 = reinterpret_cast<Load1Fn>(load1);
+  const Load2Fn typed_load2 = reinterpret_cast<Load2Fn>(load2);
+  const Load3Fn typed_load3 = reinterpret_cast<Load3Fn>(load3);
+  const SimdFn typed_simd = reinterpret_cast<SimdFn>(simd);
+  const CStoreFn typed_c_store = reinterpret_cast<CStoreFn>(c_store);
+  const CLoad1Fn typed_c_load1 = reinterpret_cast<CLoad1Fn>(c_load1);
+  const CLoad2Fn typed_c_load2 = reinterpret_cast<CLoad2Fn>(c_load2);
+  const CLoad3Fn typed_c_load3 = reinterpret_cast<CLoad3Fn>(c_load3);
+  const CSimdFn typed_c_simd = reinterpret_cast<CSimdFn>(c_simd);
+
+  // Run the reference first, then the intrinsic, on the same inputs.
+  typed_c_store(ref_d, typed_c_simd(typed_c_load1(a), typed_c_load2(b),
+                                    typed_c_load3(c)));
+  typed_store(d, typed_simd(typed_load1(a), typed_load2(b), typed_load3(c)));
+
+  // 0 when the sizeof(CRet) result bytes are identical, non-zero otherwise.
+  const int difference = memcmp(ref_d, d, sizeof(CRet));
+  return difference;
+}
+
+} // namespace
+
+// Compares the machine tuned one-argument intrinsic registered under |name|
+// with its C reference for up to |iterations| random inputs. CRet and CArg
+// are the return and argument types of the C reference; they select which
+// typed comparison in the dispatch chain below is run. When |maskwidth| is
+// non-zero the random argument is first restricted with SetMask() using
+// |mask|. The loop ends early on the first mismatch or test failure, and a
+// mismatch is reported together with the argument and both results.
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;
+ fptr simd;
+ int error = 0;
+ // Argument and result buffers; the assert below checks that the types fit.
+ DECLARE_ALIGNED(32, uint8_t, s[32]);
+ DECLARE_ALIGNED(32, uint8_t, d[32]);
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+ assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ // Resolve the reference and the machine tuned implementation by name.
+ Map(name, &ref_simd, &simd);
+ if (simd == nullptr || ref_simd == nullptr) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ // Fresh random argument bytes for this iteration.
+ for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();
+
+ if (maskwidth) {
+ SetMask(s, sizeof(CArg), mask, maskwidth);
+ }
+
+ // Dispatch on the C types to the matching load/store helpers.
+ if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
+ // V64_V64
+ error = CompareSimd1Arg<v64, v64, c_v64, c_v64>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V64_U8
+ error = CompareSimd1Arg<v64, uint8_t, c_v64, uint8_t>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V64_U16
+ error = CompareSimd1Arg<v64, uint16_t, c_v64, uint16_t>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V64_U32
+ error = CompareSimd1Arg<v64, uint32_t, c_v64, uint32_t>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // U64_V64
+ error = CompareSimd1Arg<uint64_t, v64, uint64_t, c_v64>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // S64_V64
+ error = CompareSimd1Arg<int64_t, v64, int64_t, c_v64>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // U32_V64
+ error = CompareSimd1Arg<uint32_t, v64, uint32_t, c_v64>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(int32_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // S32_V64
+ error = CompareSimd1Arg<int32_t, v64, int32_t, c_v64>(
+ reinterpret_cast<fptr>(s32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // U32_V128
+ error = CompareSimd1Arg<uint32_t, v128, uint32_t, c_v128>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // U64_V128
+ error = CompareSimd1Arg<uint64_t, v128, uint64_t, c_v128>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // U64_V256
+ error = CompareSimd1Arg<uint64_t, v256, uint64_t, c_v256>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V64_V128
+ error = CompareSimd1Arg<v64, v128, c_v64, c_v128>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V128_V128
+ error = CompareSimd1Arg<v128, v128, c_v128, c_v128>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // V128_V64
+ error = CompareSimd1Arg<v128, v64, c_v128, c_v64>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V128_U8
+ error = CompareSimd1Arg<v128, uint8_t, c_v128, uint8_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V128_U16
+ error = CompareSimd1Arg<v128, uint16_t, c_v128, uint16_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V128_U32
+ error = CompareSimd1Arg<v128, uint32_t, c_v128, uint32_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint64_t)) {
+ // V128_U64
+ error = CompareSimd1Arg<v128, uint64_t, c_v128, uint64_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // V256_V256
+ error = CompareSimd1Arg<v256, v256, c_v256, c_v256>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V256_V128
+ error = CompareSimd1Arg<v256, v128, c_v256, c_v128>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V256_U8
+ error = CompareSimd1Arg<v256, uint8_t, c_v256, uint8_t>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V256_U16
+ error = CompareSimd1Arg<v256, uint16_t, c_v256, uint16_t>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V256_U32
+ error = CompareSimd1Arg<v256, uint32_t, c_v256, uint32_t>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint64_t)) {
+ // V256_U64
+ error = CompareSimd1Arg<v256, uint64_t, c_v256, uint64_t>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // U32_V256
+ error = CompareSimd1Arg<uint32_t, v256, uint32_t, c_v256>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // V64_V256
+ error = CompareSimd1Arg<v64, v256, c_v64, c_v256>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else {
+ // No comparison exists for this combination of return/argument types.
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
+ << ")";
+ }
+ }
+
+ // Report the argument and both results if the loop ended on a mismatch.
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s, sizeof(CArg)) << ") -> "
+ << Print(d, sizeof(CRet)) << " (simd), "
+ << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+// Compares the machine tuned two-argument intrinsic registered under |name|
+// with its C reference for up to |iterations| random input pairs. CRet,
+// CArg1 and CArg2 are the return and argument types of the C reference;
+// they select which typed comparison in the dispatch chain below is run.
+// When |maskwidth| is non-zero only the second argument is restricted with
+// SetMask() using |mask|. The loop ends early on the first mismatch or test
+// failure, and a mismatch is reported together with both arguments and both
+// results.
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;
+ fptr simd;
+ int error = 0;
+ // Argument and result buffers; the assert below checks that the types fit.
+ DECLARE_ALIGNED(32, uint8_t, s1[32]);
+ DECLARE_ALIGNED(32, uint8_t, s2[32]);
+ DECLARE_ALIGNED(32, uint8_t, d[32]);
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+ assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ // Resolve the reference and the machine tuned implementation by name.
+ Map(name, &ref_simd, &simd);
+ if (simd == nullptr || ref_simd == nullptr) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ // Fresh random bytes for both arguments in this iteration.
+ for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+ // The mask is applied to the second argument only.
+ if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
+
+ // Dispatch on the C types to the matching load/store helpers.
+ if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // V64_V64V64
+ error = CompareSimd2Args<v64, v64, v64, c_v64, c_v64, c_v64>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg1) == typeid(uint32_t) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V64_U32U32
+ error =
+ CompareSimd2Args<v64, uint32_t, uint32_t, c_v64, uint32_t, uint32_t>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // U32_V64V64
+ error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // S64_V64V64
+ error = CompareSimd2Args<int64_t, v64, v64, int64_t, c_v64, c_v64>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V64_V64U32
+ error = CompareSimd2Args<v64, v64, uint32_t, c_v64, c_v64, uint32_t>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // V128_V128V128
+ error = CompareSimd2Args<v128, v128, v128, c_v128, c_v128, c_v128>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // U32_V128V128
+ error = CompareSimd2Args<uint32_t, v128, v128, uint32_t, c_v128, c_v128>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // U64_V128V128
+ error = CompareSimd2Args<uint64_t, v128, v128, uint64_t, c_v128, c_v128>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // S64_V128V128
+ error = CompareSimd2Args<int64_t, v128, v128, int64_t, c_v128, c_v128>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(uint64_t) &&
+ typeid(CArg2) == typeid(uint64_t)) {
+ // V128_U64U64
+ error = CompareSimd2Args<v128, uint64_t, uint64_t, c_v128, uint64_t,
+ uint64_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // V128_V64V64
+ error = CompareSimd2Args<v128, v64, v64, c_v128, c_v64, c_v64>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V128_V128U32
+ error = CompareSimd2Args<v128, v128, uint32_t, c_v128, c_v128, uint32_t>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // V256_V256V256
+ error = CompareSimd2Args<v256, v256, v256, c_v256, c_v256, c_v256>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // U64_V256V256
+ error = CompareSimd2Args<uint64_t, v256, v256, uint64_t, c_v256, c_v256>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // S64_V256V256
+ error = CompareSimd2Args<int64_t, v256, v256, int64_t, c_v256, c_v256>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // U32_V256V256
+ error = CompareSimd2Args<uint32_t, v256, v256, uint32_t, c_v256, c_v256>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // V256_V128V128
+ error = CompareSimd2Args<v256, v128, v128, c_v256, c_v128, c_v128>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V256_V256U32
+ error = CompareSimd2Args<v256, v256, uint32_t, c_v256, c_v256, uint32_t>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+
+ } else {
+ // No comparison exists for this combination of return/argument types.
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "("
+ << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
+ }
+ }
+
+ // Report both arguments and both results if the loop ended on a mismatch.
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s1, sizeof(CArg1)) << ", "
+ << Print(s2, sizeof(CArg2)) << ") -> "
+ << Print(d, sizeof(CRet)) << " (simd), "
+ << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {  // Checks intrinsic `name` against its reference on random 3-argument inputs.
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;  // reference function, filled in by Map() below
+ fptr simd;  // function under test, filled in by Map() below
+ int error = 0;
+ DECLARE_ALIGNED(32, uint8_t, s1[32]);  // raw bytes of argument 1
+ DECLARE_ALIGNED(32, uint8_t, s2[32]);  // raw bytes of argument 2
+ DECLARE_ALIGNED(32, uint8_t, s3[32]);  // raw bytes of argument 3
+ DECLARE_ALIGNED(32, uint8_t, d[32]);  // result buffer paired with `simd`
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);  // result buffer paired with `ref_simd`
+ assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
+ sizeof(CRet) <= 32);  // every type must fit in the 32-byte buffers above
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+ // Resolve `name` to the two function pointers; a null pointer is treated as an unknown intrinsic.
+ Map(name, &ref_simd, &simd);
+ if (simd == nullptr || ref_simd == nullptr) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+ // Stop early on the first mismatch or once any gtest failure has been recorded.
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+ // A nonzero maskwidth passes the third argument through SetMask() before the call.
+ if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+ // Dispatch on the template types; only the all-v128 and all-v256 signatures are supported.
+ if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+ // V128_V128V128V128
+ error = CompareSimd3Args<v128, v128, v128, v128, c_v128, c_v128, c_v128,
+ c_v128>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256) &&
+ typeid(CArg3) == typeid(c_v256)) {
+ // V256_V256V256V256
+ error = CompareSimd3Args<v256, v256, v256, v256, c_v256, c_v256, c_v256,
+ c_v256>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+ } else {
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "("
+ << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+ << typeid(CArg3).name() << ")";
+ }
+ }
+ // On mismatch, print the inputs and both results left in the buffers by the last iteration.
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s1, sizeof(CArg1)) << ", "
+ << Print(s2, sizeof(CArg2)) << ", "
+ << Print(s3, sizeof(CArg3)) << ") -> "
+ << Print(d, sizeof(CRet)) << " (simd), "
+ << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+// Instantiations to make the functions callable from other files
+template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,  // signatures built from c_v64 and scalars
+ const char *);
+template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,  // signatures involving c_v128
+ const char *);
+template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,  // signatures involving c_v256
+ const char *);
+template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t,
+ const char *);
+
+} // namespace SIMD_NAMESPACE
diff --git a/third_party/aom/test/simd_cmp_sse2.cc b/third_party/aom/test/simd_cmp_sse2.cc
new file mode 100644
index 0000000000..f7827a7fa1
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_sse2.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))  // built only if __OPTIMIZE__ is nonzero, or on non-GNU compilers without _DEBUG
+#define ARCH SSE2
+#define ARCH_POSTFIX(name) name##_sse2  // appends the arch suffix to an identifier
+#define SIMD_NAMESPACE simd_test_sse2  // used as a namespace name by the SIMD test headers
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_cmp_sse4.cc b/third_party/aom/test/simd_cmp_sse4.cc
new file mode 100644
index 0000000000..3566764b64
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_sse4.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))  // built only if __OPTIMIZE__ is nonzero, or on non-GNU compilers without _DEBUG
+#define ARCH SSE4_1
+#define ARCH_POSTFIX(name) name##_sse4_1  // suffix is _sse4_1 although this file is named simd_cmp_sse4.cc
+#define SIMD_NAMESPACE simd_test_sse4_1  // used as a namespace name by the SIMD test headers
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_cmp_ssse3.cc b/third_party/aom/test/simd_cmp_ssse3.cc
new file mode 100644
index 0000000000..57bf135ddb
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_ssse3.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))  // built only if __OPTIMIZE__ is nonzero, or on non-GNU compilers without _DEBUG
+#define ARCH SSSE3
+#define ARCH_POSTFIX(name) name##_ssse3  // appends the arch suffix to an identifier
+#define SIMD_NAMESPACE simd_test_ssse3  // used as a namespace name by the SIMD test headers
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_impl.h b/third_party/aom/test/simd_impl.h
new file mode 100644
index 0000000000..b564a7f4b3
--- /dev/null
+++ b/third_party/aom/test/simd_impl.h
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#define SIMD_CHECK 1
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "aom_dsp/aom_simd_inline.h"
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+namespace SIMD_NAMESPACE {
+
+template <typename param_signature>  // param_signature: tuple read as (mask, maskwidth, name) by SetUp()
+class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
+ public:
+ ~TestIntrinsic() override = default;
+ void SetUp() override {  // unpack the test parameter tuple into the members below
+ mask = std::get<0>(this->GetParam());
+ maskwidth = std::get<1>(this->GetParam());
+ name = std::get<2>(this->GetParam());
+ }
+ // The members are protected so that the TEST_P bodies can read them directly.
+ protected:
+ uint32_t mask, maskwidth;  // passed through unchanged to the TestSimd* helpers
+ const char *name;  // name of the intrinsic to test, as a C string
+};
+
+// Create one typedef per function signature; names read RET_ARGS, e.g. V64_U8 returns a v64 and takes a uint8_t
+#define TYPEDEF_SIMD(name) \
+ typedef TestIntrinsic<std::tuple<uint32_t, uint32_t, const char *> > \
+ ARCH_POSTFIX(name)  // every alias names the same fixture type; only the identifier differs
+
+TYPEDEF_SIMD(V64_U8);
+TYPEDEF_SIMD(V64_U16);
+TYPEDEF_SIMD(V64_U32);
+TYPEDEF_SIMD(V64_V64);
+TYPEDEF_SIMD(U32_V64);
+TYPEDEF_SIMD(S32_V64);
+TYPEDEF_SIMD(U64_V64);
+TYPEDEF_SIMD(S64_V64);
+TYPEDEF_SIMD(V64_U32U32);
+TYPEDEF_SIMD(V64_V64V64);
+TYPEDEF_SIMD(S64_V64V64);
+TYPEDEF_SIMD(V64_V64U32);
+TYPEDEF_SIMD(U32_V64V64);
+TYPEDEF_SIMD(V128_V64);
+TYPEDEF_SIMD(V128_V128);
+TYPEDEF_SIMD(U32_V128);
+TYPEDEF_SIMD(U64_V128);
+TYPEDEF_SIMD(V64_V128);
+TYPEDEF_SIMD(V128_U8);
+TYPEDEF_SIMD(V128_U16);
+TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
+TYPEDEF_SIMD(V128_U64U64);
+TYPEDEF_SIMD(V128_V64V64);
+TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
+TYPEDEF_SIMD(S64_V128V128);
+TYPEDEF_SIMD(V128_V128U32);
+TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
+TYPEDEF_SIMD(V256_V128);
+TYPEDEF_SIMD(V256_V256);
+TYPEDEF_SIMD(U64_V256);
+TYPEDEF_SIMD(V256_V128V128);
+TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
+TYPEDEF_SIMD(S64_V256V256);
+TYPEDEF_SIMD(V256_V256U32);
+TYPEDEF_SIMD(U32_V256V256);
+TYPEDEF_SIMD(V256_U8);
+TYPEDEF_SIMD(V256_U16);
+TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
+TYPEDEF_SIMD(U32_V256);
+TYPEDEF_SIMD(V64_V256);
+
+// Google Test allows up to 50 tests per case, so the largest suites get _PartN aliases of the same fixture
+typedef ARCH_POSTFIX(V64_V64) ARCH_POSTFIX(V64_V64_Part2);
+typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
+typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part5);
+typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
+
+// These functions are machine tuned and located elsewhere
+template <typename c_ret, typename c_arg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+template <typename c_ret, typename c_arg1, typename c_arg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+template <typename c_ret, typename c_arg1, typename c_arg2, typename c_arg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+const int kIterations = 65536;  // passed as `iterations` to every TestSimd* call in this file
+
+// Add a macro layer since TEST_P will quote the name so we need to
+// expand it first with the postfix (see ARCH_POSTFIX).
+#define MY_TEST_P(name, test) TEST_P(name, test)
+
+MY_TEST_P(ARCH_POSTFIX(V64_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint8_t>(kIterations, mask, maskwidth, name);
+}
+// Each body forwards the fixture's mask/maskwidth/name; the template arguments mirror the suite name, return type first.
+MY_TEST_P(ARCH_POSTFIX(V64_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V64), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64), TestIntrinsics) {
+ TestSimd1Arg<int64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S32_V64), TestIntrinsics) {
+ TestSimd1Arg<int32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32U32), TestIntrinsics) {
+ TestSimd2Args<c_v64, uint32_t, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64V64), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64V64), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64U32), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+// Google Test allows up to 50 tests per case, so split the largest
+MY_TEST_P(ARCH_POSTFIX(V64_V64_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+// 128-bit suites: the template arguments mirror the suite name, return type first.
+MY_TEST_P(ARCH_POSTFIX(U64_V128), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+ TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+ name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+ TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64U64), TestIntrinsics) {
+ TestSimd2Args<c_v128, uint64_t, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64V64), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128U32), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+// The _PartN suites below run the same body as their base signature.
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part3), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+// 256-bit suites: the template arguments mirror the suite name, return type first.
+MY_TEST_P(ARCH_POSTFIX(V256_V256), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+ TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+ name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256V256), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+ TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+// Each _PartN suite below runs the same body as its base signature.
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256U32), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part3), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part5), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V256), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+// Add a macro layer since INSTANTIATE_TEST_SUITE_P will quote the name
+// so we need to expand it first with the prefix
+#define INSTANTIATE(name, type, ...) \
+ INSTANTIATE_TEST_SUITE_P(name, type, ::testing::Values(__VA_ARGS__))
+// SIMD_TUPLE packs (mask, maskwidth, "name") in the order TestIntrinsic::SetUp() unpacks them.
+#define SIMD_TUPLE(name, mask, maskwidth) \
+ std::make_tuple(mask, maskwidth, static_cast<const char *>(#name))  // `name` is stringified, not evaluated
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64), SIMD_TUPLE(v64_sad_u8, 0U, 0U),
+ SIMD_TUPLE(v64_ssd_u8, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V64_V64V64), SIMD_TUPLE(v64_add_8, 0U, 0U),
+ SIMD_TUPLE(v64_add_16, 0U, 0U), SIMD_TUPLE(v64_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v64_add_32, 0U, 0U), SIMD_TUPLE(v64_sub_8, 0U, 0U),
+ SIMD_TUPLE(v64_ssub_u8, 0U, 0U), SIMD_TUPLE(v64_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v64_sub_16, 0U, 0U), SIMD_TUPLE(v64_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v64_ssub_u16, 0U, 0U), SIMD_TUPLE(v64_sub_32, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_8, 0U, 0U), SIMD_TUPLE(v64_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_16, 0U, 0U), SIMD_TUPLE(v64_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_32, 0U, 0U), SIMD_TUPLE(v64_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v64_unziphi_8, 0U, 0U),
+ SIMD_TUPLE(v64_unziplo_8, 0U, 0U), SIMD_TUPLE(v64_unziphi_16, 0U, 0U),
+ SIMD_TUPLE(v64_unziplo_16, 0U, 0U), SIMD_TUPLE(v64_or, 0U, 0U),
+ SIMD_TUPLE(v64_xor, 0U, 0U), SIMD_TUPLE(v64_and, 0U, 0U),
+ SIMD_TUPLE(v64_andn, 0U, 0U), SIMD_TUPLE(v64_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v64_mulhi_s16, 0U, 0U), SIMD_TUPLE(v64_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v64_madd_s16, 0U, 0U), SIMD_TUPLE(v64_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v64_avg_u8, 0U, 0U), SIMD_TUPLE(v64_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v64_avg_u16, 0U, 0U), SIMD_TUPLE(v64_min_u8, 0U, 0U),
+ SIMD_TUPLE(v64_max_u8, 0U, 0U), SIMD_TUPLE(v64_min_s8, 0U, 0U),
+ SIMD_TUPLE(v64_max_s8, 0U, 0U), SIMD_TUPLE(v64_min_s16, 0U, 0U),
+ SIMD_TUPLE(v64_max_s16, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v64_cmplt_s8, 0U, 0U), SIMD_TUPLE(v64_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v64_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v64_cmpeq_16, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+ SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+ SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
+ SIMD_TUPLE(v64_abs_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2),
+ SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64U32), SIMD_TUPLE(v64_shl_8, 7U, 32U),
+ SIMD_TUPLE(v64_shr_u8, 7U, 32U), SIMD_TUPLE(v64_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v64_shl_16, 15U, 32U), SIMD_TUPLE(v64_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v64_shr_s16, 15U, 32U), SIMD_TUPLE(v64_shl_32, 31U, 32U),
+ SIMD_TUPLE(v64_shr_u32, 31U, 32U),
+ SIMD_TUPLE(v64_shr_s32, 31U, 32U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V64), SIMD_TUPLE(v64_hadd_u8, 0U, 0U),  // every tuple in the next five suites passes 0U, 0U
+ SIMD_TUPLE(v64_u64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64), SIMD_TUPLE(v64_hadd_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64), SIMD_TUPLE(v64_low_u32, 0U, 0U),  // low_/high_ pair, u32 variants
+ SIMD_TUPLE(v64_high_u32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S32_V64), SIMD_TUPLE(v64_low_s32, 0U, 0U),  // low_/high_ pair, s32 variants
+ SIMD_TUPLE(v64_high_s32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64V64), SIMD_TUPLE(v64_dotp_s16, 0U, 0U),  // NOTE(review): suite names look like <result>_<operands> - confirm against the fixtures
+ SIMD_TUPLE(v64_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U8), SIMD_TUPLE(v64_dup_8, 0U, 0U));  // one single-entry suite per v64_dup_N width (8/16/32)
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U16), SIMD_TUPLE(v64_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32), SIMD_TUPLE(v64_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U));  // v64_from_32 is the only entry of the U32U32 suite
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U),  // sad_u8, ssd_u8 and sad_u16 share the U32_ suite
+ SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U));  // ssd_s16 is registered separately under the U64_ suite
+
+INSTANTIATE(  // first batch of v128 intrinsics under the V128_V128V128 suite; more follow in V128_V128V128_Part2
+ ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U),  // all tuples in this batch pass 0U, 0U
+ SIMD_TUPLE(v128_add_16, 0U, 0U), SIMD_TUPLE(v128_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v128_add_32, 0U, 0U), SIMD_TUPLE(v128_sub_8, 0U, 0U),
+ SIMD_TUPLE(v128_ssub_u8, 0U, 0U), SIMD_TUPLE(v128_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v128_sub_16, 0U, 0U), SIMD_TUPLE(v128_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v128_ssub_u16, 0U, 0U), SIMD_TUPLE(v128_sub_32, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_8, 0U, 0U), SIMD_TUPLE(v128_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_16, 0U, 0U), SIMD_TUPLE(v128_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_32, 0U, 0U), SIMD_TUPLE(v128_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_64, 0U, 0U), SIMD_TUPLE(v128_ziphi_64, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_8, 0U, 0U), SIMD_TUPLE(v128_unziplo_8, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_16, 0U, 0U), SIMD_TUPLE(v128_unziplo_16, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_32, 0U, 0U), SIMD_TUPLE(v128_unziplo_32, 0U, 0U),
+ SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v128_or, 0U, 0U),
+ SIMD_TUPLE(v128_xor, 0U, 0U), SIMD_TUPLE(v128_and, 0U, 0U),
+ SIMD_TUPLE(v128_andn, 0U, 0U), SIMD_TUPLE(v128_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v128_mulhi_s16, 0U, 0U), SIMD_TUPLE(v128_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v128_madd_s16, 0U, 0U), SIMD_TUPLE(v128_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v128_avg_u8, 0U, 0U), SIMD_TUPLE(v128_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v128_avg_u16, 0U, 0U), SIMD_TUPLE(v128_min_u8, 0U, 0U),
+ SIMD_TUPLE(v128_max_u8, 0U, 0U), SIMD_TUPLE(v128_min_s8, 0U, 0U),
+ SIMD_TUPLE(v128_max_s8, 0U, 0U), SIMD_TUPLE(v128_min_s16, 0U, 0U),
+ SIMD_TUPLE(v128_max_s16, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),  // second batch for the V128_V128V128 family, registered under its own suite name
+ SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U),
+ SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U),
+ SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U),
+ SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s32, 0U, 0U),
+ SIMD_TUPLE(v128_cmpeq_32, 0U, 0U),
+ SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U),
+ SIMD_TUPLE(v128_shuffle_8, 15U, 8U),  // the only tuple in this batch with non-zero fields
+ SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<1>, 0U, 0U),  // imm_v128_align is listed for every immediate 1..15
+ SIMD_TUPLE(imm_v128_align<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<15>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128),  // single-entry suite
+ SIMD_TUPLE(v128_blend_8, 0U, 0U));  // v128_blend_8 is the only v128 intrinsic listed under this suite name
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),  // first of four V128_V128 batches; every tuple passes 0U, 0U
+ SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<1>, 0U, 0U),  // shr_n_byte: every immediate 1..15
+ SIMD_TUPLE(imm_v128_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<1>, 0U, 0U),  // shl_n_byte: every immediate 1..15
+ SIMD_TUPLE(imm_v128_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<1>, 0U, 0U),  // shl_n_8: every immediate 1..7
+ SIMD_TUPLE(imm_v128_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U));  // shr_n_u8<2..7> are listed in V128_V128_Part2
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),  // second V128_V128 batch: immediate shifts on 8/16/32-bit lanes
+ SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U),  // shr_n_u8 resumes at 2; <1> is in the V128_V128 batch
+ SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<1>, 0U, 0U),  // shr_n_s8: every immediate 1..7
+ SIMD_TUPLE(imm_v128_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<1>, 0U, 0U),  // 16-bit lanes: immediates 1, 2, 4, 6, ..., 14 only
+ SIMD_TUPLE(imm_v128_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<1>, 0U, 0U),  // 32-bit lanes: immediates 1, 4, 8, ..., 28 only
+ SIMD_TUPLE(imm_v128_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U));  // shr_n_u32<8..28> are listed in V128_V128_Part3
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3),  // third V128_V128 batch: remaining 32-bit-lane right shifts
+ SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U),  // shr_n_u32 resumes at 8; <1> and <4> are in V128_V128_Part2
+ SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<1>, 0U, 0U),  // shr_n_s32: immediates 1, 4, 8, ..., 28
+ SIMD_TUPLE(imm_v128_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4),  // fourth V128_V128 batch: 64-bit-lane immediate shifts plus v128_padd_u8
+ SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U),  // each 64-bit shift is listed for immediates 1, 4, 8, ..., 60
+ SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U),
+ SIMD_TUPLE(v128_padd_u8, 0U, 0U));  // the one non-shift entry in this batch
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U),  // v128 intrinsics grouped under suite names that mention V64 / U64 operands
+ SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U),
+ SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64U64), SIMD_TUPLE(v128_from_64, 0U, 0U));  // single-entry suite
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64),  // the four v128_unpack_* variants (u8/s8 -> s16, u16/s16 -> s32 by name)
+ SIMD_TUPLE(v128_unpack_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U));
+
+INSTANTIATE(  // v128 shifts without an immediate template argument
+ ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
+ SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U),
+ SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U),
+ SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U),  // 2nd field follows the lane suffix minus one (7/15/31/63); 3rd is 32 throughout
+ SIMD_TUPLE(v128_shr_s64, 63U, 32U));  // NOTE(review): presumably a (mask, mask width) pair for the count operand - confirm in the fixture
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U),  // all tuples in these three suites pass 0U, 0U
+ SIMD_TUPLE(v128_movemask_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U));  // single-entry suite
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V128), SIMD_TUPLE(v128_low_v64, 0U, 0U),  // low_/high_ v64 pair
+ SIMD_TUPLE(v128_high_v64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U8), SIMD_TUPLE(v128_dup_8, 0U, 0U));  // one single-entry suite per v128_dup_N width (8/16/32/64)
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U16), SIMD_TUPLE(v128_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U));  // 64-bit dup exists for v128 here; the v64 dup list above stops at 32
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U),  // the three v128_dotp_* variants
+ SIMD_TUPLE(v128_dotp_s32, 0U, 0U),  // dotp_s32 has no v64 counterpart in the S64_V64V64 list above
+ SIMD_TUPLE(v128_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U),  // v256 counterparts of the U32_V128V128 entries
+ SIMD_TUPLE(v256_ssd_u8, 0U, 0U), SIMD_TUPLE(v256_sad_u16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U),  // all tuples in these four suites pass 0U, 0U
+ SIMD_TUPLE(v256_low_u64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U),  // the three v256_dotp_* variants
+ SIMD_TUPLE(v256_dotp_s32, 0U, 0U),
+ SIMD_TUPLE(v256_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U));  // ssd_s16 is registered separately under the U64_ suite
+
+INSTANTIATE(  // first batch of v256 intrinsics under the V256_V256V256 suite; more follow in V256_V256V256_Part2
+ ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U),  // all tuples in this batch pass 0U, 0U
+ SIMD_TUPLE(v256_add_16, 0U, 0U), SIMD_TUPLE(v256_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v256_add_32, 0U, 0U), SIMD_TUPLE(v256_sub_8, 0U, 0U),
+ SIMD_TUPLE(v256_ssub_u8, 0U, 0U), SIMD_TUPLE(v256_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v256_sub_16, 0U, 0U), SIMD_TUPLE(v256_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v256_ssub_u16, 0U, 0U), SIMD_TUPLE(v256_sub_32, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_8, 0U, 0U), SIMD_TUPLE(v256_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_16, 0U, 0U), SIMD_TUPLE(v256_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_32, 0U, 0U), SIMD_TUPLE(v256_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_64, 0U, 0U), SIMD_TUPLE(v256_ziphi_64, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_128, 0U, 0U), SIMD_TUPLE(v256_ziphi_128, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_8, 0U, 0U), SIMD_TUPLE(v256_unziplo_8, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_16, 0U, 0U), SIMD_TUPLE(v256_unziplo_16, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_32, 0U, 0U), SIMD_TUPLE(v256_unziplo_32, 0U, 0U),
+ SIMD_TUPLE(v256_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v256_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v256_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v256_or, 0U, 0U),
+ SIMD_TUPLE(v256_xor, 0U, 0U), SIMD_TUPLE(v256_and, 0U, 0U),
+ SIMD_TUPLE(v256_andn, 0U, 0U), SIMD_TUPLE(v256_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v256_mulhi_s16, 0U, 0U), SIMD_TUPLE(v256_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v256_madd_s16, 0U, 0U), SIMD_TUPLE(v256_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v256_avg_u8, 0U, 0U), SIMD_TUPLE(v256_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v256_avg_u16, 0U, 0U), SIMD_TUPLE(v256_min_u8, 0U, 0U),
+ SIMD_TUPLE(v256_max_u8, 0U, 0U), SIMD_TUPLE(v256_min_s8, 0U, 0U),
+ SIMD_TUPLE(v256_max_s8, 0U, 0U), SIMD_TUPLE(v256_min_s16, 0U, 0U),
+ SIMD_TUPLE(v256_max_s16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v256_cmplt_s8, 0U, 0U));
+
+INSTANTIATE(  // second batch for the V256_V256V256 family, registered under its own suite name
+ ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U),
+ SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U),
+ SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U),
+ SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U),
+ SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U),  // the only two tuples in this batch with non-zero fields
+ SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U),  // imm_v256_align<1> and <2> are interleaved with other entries; <3..31> follow contiguously
+ SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U),
+ SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<31>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128V128),  // v256 counterparts of the V128_V64V64 entries (from_v128, zip_8/16/32, mul_s16)
+ SIMD_TUPLE(v256_from_v128, 0U, 0U), SIMD_TUPLE(v256_zip_8, 0U, 0U),
+ SIMD_TUPLE(v256_zip_16, 0U, 0U), SIMD_TUPLE(v256_zip_32, 0U, 0U),
+ SIMD_TUPLE(v256_mul_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128),  // the four v256_unpack_* variants
+ SIMD_TUPLE(v256_unpack_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U));
+
+INSTANTIATE(  // v256 shifts without an immediate template argument
+ ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
+ SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U),
+ SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U),
+ SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U),  // 2nd field follows the lane suffix minus one (7/15/31/63); 3rd is 32 throughout
+ SIMD_TUPLE(v256_shr_s64, 63U, 32U));  // NOTE(review): presumably a (mask, mask width) pair for the count operand - confirm in the fixture
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U),  // first of five V256_V256 batches; every tuple passes 0U, 0U
+ SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<1>, 0U, 0U),  // shr_n_byte: every immediate 1..31
+ SIMD_TUPLE(imm_v256_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<31>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<1>, 0U, 0U),  // shl_n_byte<1..8> here; <9..31> are listed in V256_V256_Part2
+ SIMD_TUPLE(imm_v256_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<8>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part2),  // second V256_V256 batch: rest of shl_n_byte, 8-bit-lane shifts, start of 16-bit-lane shifts
+ SIMD_TUPLE(imm_v256_shl_n_byte<9>, 0U, 0U),  // shl_n_byte resumes at 9; <1..8> are in the V256_V256 batch
+ SIMD_TUPLE(imm_v256_shl_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<31>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<1>, 0U, 0U),  // 8-bit lanes: every immediate 1..7 for shl, shr_u and shr_s
+ SIMD_TUPLE(imm_v256_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<1>, 0U, 0U),  // shl_n_16<1..10> here; <12> and <14> are listed in V256_V256_Part3
+ SIMD_TUPLE(imm_v256_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<10>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part3),  // third V256_V256 batch: remaining 16-bit-lane shifts and all 32-bit-lane shifts
+ SIMD_TUPLE(imm_v256_shl_n_16<12>, 0U, 0U),  // shl_n_16 resumes at 12; <1..10> are in V256_V256_Part2
+ SIMD_TUPLE(imm_v256_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<1>, 0U, 0U),  // 16-bit lanes: immediates 1, 2, 4, 6, ..., 14 only
+ SIMD_TUPLE(imm_v256_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<1>, 0U, 0U),  // 32-bit lanes: immediates 1, 4, 8, ..., 28 only
+ SIMD_TUPLE(imm_v256_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4),  // fourth V256_V256 batch: 64-bit-lane immediate shifts plus v256_padd_u8
+ SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U),  // each 64-bit shift is listed for immediates 1, 4, 8, ..., 60
+ SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U),
+ SIMD_TUPLE(v256_padd_u8, 0U, 0U));  // the one non-shift entry in this batch
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part5),  // fifth V256_V256 batch: the *_n_word immediate shifts (no v128 counterpart appears in the lists above)
+ SIMD_TUPLE(imm_v256_shr_n_word<1>, 0U, 0U),  // shr_n_word: every immediate 1..15
+ SIMD_TUPLE(imm_v256_shr_n_word<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<1>, 0U, 0U),  // shl_n_word: every immediate 1..15
+ SIMD_TUPLE(imm_v256_shl_n_word<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<15>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256),  // two entries; the v128 counterpart suite lists blend_8 only
+ SIMD_TUPLE(v256_blend_8, 0U, 0U),
+ SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U));  // non-zero fields, like the other shuffle entries (15U/31U with 8U)
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U));  // one single-entry suite per v256_dup_N width (8/16/32/64)
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U));  // all four pass 0U, 0U
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U),  // same pair of entries as the U32_V128 suite, v256 variants
+ SIMD_TUPLE(v256_movemask_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U));  // only low_v64 is listed; there is no v256_high_v64 entry here
+
+} // namespace SIMD_NAMESPACE
diff --git a/third_party/aom/test/simd_sse2_test.cc b/third_party/aom/test/simd_sse2_test.cc
new file mode 100644
index 0000000000..b37a931b38
--- /dev/null
+++ b/third_party/aom/test/simd_sse2_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !(defined(__GNUC__) || defined(_DEBUG)) || \
+    (defined(__OPTIMIZE__) && __OPTIMIZE__)  // skip the whole file otherwise
+#define SIMD_NAMESPACE simd_test_sse2
+#define ARCH_POSTFIX(name) name##_sse2  // pastes the _sse2 suffix onto a name
+#define ARCH SSE2
+#include "test/simd_impl.h"  // pulled in only after the three macros are set
+#endif  // __OPTIMIZE__ set, or neither __GNUC__ nor _DEBUG defined
diff --git a/third_party/aom/test/simd_sse4_test.cc b/third_party/aom/test/simd_sse4_test.cc
new file mode 100644
index 0000000000..b1c9d5cd88
--- /dev/null
+++ b/third_party/aom/test/simd_sse4_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !(defined(__GNUC__) || defined(_DEBUG)) || \
+    (defined(__OPTIMIZE__) && __OPTIMIZE__)  // skip the whole file otherwise
+#define SIMD_NAMESPACE simd_test_sse4_1
+#define ARCH_POSTFIX(name) name##_sse4_1  // pastes the _sse4_1 suffix onto a name
+#define ARCH SSE4_1
+#include "test/simd_impl.h"  // pulled in only after the three macros are set
+#endif  // __OPTIMIZE__ set, or neither __GNUC__ nor _DEBUG defined
diff --git a/third_party/aom/test/simd_ssse3_test.cc b/third_party/aom/test/simd_ssse3_test.cc
new file mode 100644
index 0000000000..d95c26fb5e
--- /dev/null
+++ b/third_party/aom/test/simd_ssse3_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if !(defined(__GNUC__) || defined(_DEBUG)) || \
+    (defined(__OPTIMIZE__) && __OPTIMIZE__)  // skip the whole file otherwise
+#define SIMD_NAMESPACE simd_test_ssse3
+#define ARCH_POSTFIX(name) name##_ssse3  // pastes the _ssse3 suffix onto a name
+#define ARCH SSSE3
+#include "test/simd_impl.h"  // pulled in only after the three macros are set
+#endif  // __OPTIMIZE__ set, or neither __GNUC__ nor _DEBUG defined
diff --git a/third_party/aom/test/simple_decoder.sh b/third_party/aom/test/simple_decoder.sh
new file mode 100755
index 0000000000..9b1aea1ed5
--- /dev/null
+++ b/third_party/aom/test/simple_decoder.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom simple_decoder example code. To add new tests to
+## this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to simple_decoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: fail only when no encoder is available AND no IVF exists.
+simple_decoder_verify_environment() {
+ if [ ! "$(av1_encode_available)" = "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs simple_decoder on input file $1 and requires that the output file exists.
+# $2 is the codec name; it is used solely to name the output file.
+simple_decoder() {
+ local decoder="$(aom_tool_path simple_decoder)"
+ local input_file="$1"
+ local codec="$2"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1 # A clean exit without output still fails.
+}
+
+simple_decoder_av1() { # AV1 decode test; does nothing if decoding is unavailable.
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ if [ ! -e "${AV1_IVF_FILE}" ]; then # No existing IVF input: encode one first.
+ local file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf || return 1 # Stop on encode failure.
+ simple_decoder "${file}" av1 || return 1
+ else
+ simple_decoder "${AV1_IVF_FILE}" av1 || return 1
+ fi
+ fi
+}
+
+simple_decoder_tests="simple_decoder_av1"
+
+run_tests simple_decoder_verify_environment "${simple_decoder_tests}"
diff --git a/third_party/aom/test/simple_encoder.sh b/third_party/aom/test/simple_encoder.sh
new file mode 100755
index 0000000000..dfb1a1b546
--- /dev/null
+++ b/third_party/aom/test/simple_encoder.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom simple_encoder example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to simple_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: fail with a message unless the $YUV_RAW_INPUT file exists.
+simple_encoder_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Runs simple_encoder using the codec specified by $1 (args end in "9999 0 5").
+simple_encoder() {
+ local encoder="${LIBAOM_BIN_PATH}/simple_encoder${AOM_TEST_EXE_SUFFIX}"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf"
+
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \
+ ${devnull} || return 1 # NOTE(review): final 5 is presumably the frame limit -- confirm.
+
+ [ -e "${output_file}" ] || return 1 # A clean exit without output still fails.
+}
+
+
+simple_encoder_av1() { # AV1 encode test; does nothing if encoding is unavailable.
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ simple_encoder av1 || return 1
+ fi
+}
+
+simple_encoder_tests="simple_encoder_av1"
+
+run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/third_party/aom/test/sse_sum_test.cc b/third_party/aom/test/sse_sum_test.cc
new file mode 100644
index 0000000000..fd6fb886d3
--- /dev/null
+++ b/third_party/aom/test/sse_sum_test.cc
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+namespace {
+const int kNumIterations = 10000;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int src_stride, int width,
+ int height, int *sum);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSSETest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+ ~SumSSETest() override = default;
+ void SetUp() override {  // Seeds rnd_ deterministically and allocates src_.
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+ ASSERT_NE(src_, nullptr);
+ }
+
+ void TearDown() override { aom_free(src_); }
+ void RunTest(int isRandom);  // Nonzero: random data; zero: extreme data.
+ void RunSpeedTest();  // Times ref_func and tst_func for each block size.
+
+ void GenRandomData(int width, int height, int stride) {  // Random fill.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {  // Constant fill.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : -(limit - 1);  // One sign per call.
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncs params_;  // Holds the ref_func / tst_func pair under test.
+ int16_t *src_;  // Input buffer; allocated in SetUp(), freed in TearDown().
+ ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSSETest);
+
+void SumSSETest::RunTest(int isRandom) {  // Compares tst_func with ref_func.
+ for (int k = 0; k < kNumIterations; k++) {
+ const int width = 4 * (rnd_(31) + 1); // Up to 128x128
+ const int height = 4 * (rnd_(31) + 1); // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ if (isRandom) {
+ GenRandomData(width, height, stride);
+ } else {
+ GenExtremeData(width, height, stride);
+ }
+ int sum_ref = 0, sum_tst = 0;  // Each function writes its sum via the pointer.
+ const uint64_t sse_ref =
+ params_.ref_func(src_, stride, width, height, &sum_ref);
+ const uint64_t sse_tst =
+ params_.tst_func(src_, stride, width, height, &sum_tst);
+
+ EXPECT_EQ(sse_ref, sse_tst)  // Return values must match.
+ << "Error: SumSSETest [" << width << "x" << height
+ << "] C SSE does not match optimized output.";
+ EXPECT_EQ(sum_ref, sum_tst)  // Out-parameter sums must match as well.
+ << "Error: SumSSETest [" << width << "x" << height
+ << "] C Sum does not match optimized output.";
+ }
+}
+
+void SumSSETest::RunSpeedTest() {  // Prints per-call time of ref and tst funcs.
+ for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+ const int width = block_size_wide[block]; // Up to 128x128
+ const int height = block_size_high[block]; // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);  // Fewer for big blocks.
+ int sum_ref = 0, sum_tst = 0;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(src_, stride, width, height, &sum_ref);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("SumSSETest C %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time / num_loops);  // Label names this suite.
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(src_, stride, width, height, &sum_tst);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("SumSSETest Test %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+}
+
+TEST_P(SumSSETest, OperationCheck) {
+ RunTest(1); // GenRandomData
+}
+
+TEST_P(SumSSETest, ExtremeValues) {
+ RunTest(0); // GenExtremeData
+}
+
+TEST_P(SumSSETest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, SumSSETest,
+ ::testing::Values(TestFuncs(
+ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SumSSETest,
+ ::testing::Values(TestFuncs(
+ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_neon)));
+#endif // HAVE_NEON
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SumSSETest,
+ ::testing::Values(TestFuncs(
+ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(SVE, SumSSETest,
+ ::testing::Values(TestFuncs(&aom_sum_sse_2d_i16_c,
+ &aom_sum_sse_2d_i16_sve)));
+#endif // HAVE_SVE
+
+} // namespace
diff --git a/third_party/aom/test/still_picture_test.cc b/third_party/aom/test/still_picture_test.cc
new file mode 100644
index 0000000000..3dfb1c8693
--- /dev/null
+++ b/third_party/aom/test/still_picture_test.cc
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+// Encodes one frame and checks the still-picture flags reported by the decoder.
+class StillPicturePresenceTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ StillPicturePresenceTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ enable_full_header_(GET_PARAM(2)) {
+ still_picture_coding_violated_ = false;
+ }
+ ~StillPicturePresenceTest() override = default;
+
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = AOM_Q;
+ cfg_.g_threads = 1;
+ cfg_.full_still_picture_hdr = enable_full_header_;
+ cfg_.g_limit = 1;
+ }
+
+ bool DoDecode() const override { return true; }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {  // Apply the controls once, on the first frame.
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AV1E_SET_FORCE_VIDEO_MODE, 0);
+ }
+ }
+
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_STILL_PICTURE,
+ &still_pic_info_);
+ if (still_pic_info_.is_still_picture != 1) {
+ still_picture_coding_violated_ = true;
+ }
+ if (still_pic_info_.is_reduced_still_picture_hdr == enable_full_header_) {
+ /* If full_still_picture_header is enabled in encoder config but
+ * bitstream contains reduced_still_picture_header set, then set
+ * still_picture_coding_violated_ to true.
+ * Similarly, if full_still_picture_header is disabled in encoder config
+ * but bitstream contains reduced_still_picture_header not set, then set
+ * still_picture_coding_violated_ to true.
+ */
+ still_picture_coding_violated_ = true;
+ }
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ bool still_picture_coding_violated_;  // Set on any flag mismatch; never reset.
+ int enable_full_header_;  // Copied into cfg_.full_still_picture_hdr in SetUp().
+ aom_still_picture_info still_pic_info_;
+ aom_rc_mode end_usage_check_;  // NOTE(review): unused in this class -- confirm.
+};
+
+TEST_P(StillPicturePresenceTest, StillPictureEncodePresenceTest) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 1);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(still_picture_coding_violated_, false);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(StillPicturePresenceTest,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood),
+ ::testing::Values(1, 0));
+} // namespace
diff --git a/third_party/aom/test/subtract_test.cc b/third_party/aom/test/subtract_test.cc
new file mode 100644
index 0000000000..e591e6543d
--- /dev/null
+++ b/third_party/aom/test/subtract_test.cc
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdint>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride);
+
+namespace {
+
+using std::get;
+using std::make_tuple;
+using std::tuple;
+
+using libaom_test::ACMRandom;
+
+// <BLOCK_SIZE, optimized subtract func, reference subtract func>
+using Params = tuple<BLOCK_SIZE, SubtractFunc, SubtractFunc>;
+
+class AV1SubtractBlockTestBase : public ::testing::Test {
+ public:
+ AV1SubtractBlockTestBase(BLOCK_SIZE bs, int bit_depth, SubtractFunc func,
+ SubtractFunc ref_func) {
+ block_width_ = block_size_wide[bs];
+ block_height_ = block_size_high[bs];
+ func_ = func;
+ ref_func_ = ref_func;
+ if (bit_depth == -1) {  // -1 selects the 8-bit (non-hbd) path.
+ hbd_ = false;
+ bit_depth_ = AOM_BITS_8;
+ } else {
+ hbd_ = true;
+ bit_depth_ = static_cast<aom_bit_depth_t>(bit_depth);
+ }
+ }
+
+ void SetUp() override {  // Allocates src_, pred_ and diff_ for 128x128 samples.
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ const size_t max_width = 128;
+ const size_t max_block_size = max_width * max_width;
+ if (hbd_) {
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ ASSERT_NE(src_, nullptr);
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ ASSERT_NE(pred_, nullptr);
+ } else {
+ src_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint8_t)));
+ ASSERT_NE(src_, nullptr);
+ pred_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint8_t)));
+ ASSERT_NE(pred_, nullptr);
+ }
+ diff_ = reinterpret_cast<int16_t *>(
+ aom_memalign(32, max_block_size * sizeof(int16_t)));
+ ASSERT_NE(diff_, nullptr);
+ }
+
+ void TearDown() override {
+ if (hbd_) {  // hbd pointers were wrapped in SetUp(); unwrap before freeing.
+ aom_free(CONVERT_TO_SHORTPTR(src_));
+ aom_free(CONVERT_TO_SHORTPTR(pred_));
+ } else {
+ aom_free(src_);
+ aom_free(pred_);
+ }
+ aom_free(diff_);
+ }
+
+ protected:
+ void CheckResult();  // Checks func_ output against src - pred per sample.
+ void RunForSpeed();  // Times ref_func_ and func_ and prints the ratio.
+
+ private:
+ void FillInputs();  // Randomizes src_ and pred_.
+
+ ACMRandom rnd_;
+ int block_height_;
+ int block_width_;
+ bool hbd_;  // True when the constructor's bit_depth argument was not -1.
+ aom_bit_depth_t bit_depth_;
+ SubtractFunc func_;
+ SubtractFunc ref_func_;
+ uint8_t *src_;
+ uint8_t *pred_;
+ int16_t *diff_;
+};
+
+void AV1SubtractBlockTestBase::FillInputs() {  // Randomizes src_ and pred_.
+ const int max_width = 128;
+ const int max_block_size = max_width * max_width;
+ if (hbd_) {
+ const int mask = (1 << bit_depth_) - 1;  // Keep samples within bit_depth_.
+ for (int i = 0; i < max_block_size; ++i) {
+ CONVERT_TO_SHORTPTR(src_)[i] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[i] = rnd_.Rand16() & mask;
+ }
+ } else {
+ // No null check is needed here: SetUp() ASSERT_NE()s both buffers, and
+ // googletest does not run the test body after a fatal failure in SetUp().
+ // (The previous check only printed a stray debug string and continued.)
+ for (int i = 0; i < max_block_size; ++i) {
+ src_[i] = rnd_.Rand8();
+ pred_[i] = rnd_.Rand8();
+ }
+ }
+}
+
+void AV1SubtractBlockTestBase::CheckResult() {  // Verifies func_ output.
+ const int test_num = 100;
+ // Each round re-randomizes the inputs, then checks every diff sample.
+
+ for (int i = 0; i < test_num; ++i) {
+ FillInputs();
+
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_);
+
+ if (hbd_) {
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ } else {
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ src_[r * block_width_ + c] - pred_[r * block_width_ + c])
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+ }
+}
+
+void AV1SubtractBlockTestBase::RunForSpeed() {  // Times ref_func_ vs. func_.
+ const int test_num = 200000;
+ int i;
+
+ if (ref_func_ == func_) GTEST_SKIP();  // Nothing to compare against itself.
+
+ FillInputs();
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < test_num; ++i) {
+ ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+ block_width_, pred_, block_width_);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
+
+ FillInputs();  // The second timing run uses freshly randomized inputs.
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (i = 0; i < test_num; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_);
+ }
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+
+ printf(
+ "[%dx%d]: "
+ "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+ " \t "
+ "gain=%f \n",
+ block_width_, block_height_, ref_elapsed_time, elapsed_time,
+ static_cast<double>(ref_elapsed_time) /
+ static_cast<double>(elapsed_time));
+}
+
+class AV1SubtractBlockTest : public ::testing::WithParamInterface<Params>,
+ public AV1SubtractBlockTestBase {
+ public:
+ AV1SubtractBlockTest()
+ : AV1SubtractBlockTestBase(GET_PARAM(0), -1, GET_PARAM(1), GET_PARAM(2)) {
+ }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1SubtractBlockTest);
+
+TEST_P(AV1SubtractBlockTest, CheckResult) { CheckResult(); }
+TEST_P(AV1SubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+ BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+ BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+ BLOCK_64X64, BLOCK_64X128, BLOCK_128X64,
+ BLOCK_128X128 };
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_c),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_sse2),
+ ::testing::Values(&aom_subtract_block_c)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_avx2),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1SubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(&aom_subtract_block_neon),
+ ::testing::Values(&aom_subtract_block_c)));
+
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using ParamsHBD = tuple<BLOCK_SIZE, int, SubtractFunc, SubtractFunc>;
+
+class AV1HBDSubtractBlockTest : public ::testing::WithParamInterface<ParamsHBD>,
+ public AV1SubtractBlockTestBase {
+ public:
+ AV1HBDSubtractBlockTest()
+ : AV1SubtractBlockTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2),
+ GET_PARAM(3)) {}
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HBDSubtractBlockTest);
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1HBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&aom_highbd_subtract_block_c),
+ ::testing::Values(&aom_highbd_subtract_block_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AV1HBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&aom_highbd_subtract_block_sse2),
+ ::testing::Values(&aom_highbd_subtract_block_c)));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&aom_highbd_subtract_block_neon),
+ ::testing::Values(&aom_highbd_subtract_block_c)));
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
diff --git a/third_party/aom/test/sum_squares_test.cc b/third_party/aom/test/sum_squares_test.cc
new file mode 100644
index 0000000000..7b98ced523
--- /dev/null
+++ b/third_party/aom/test/sum_squares_test.cc
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/common_data.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+namespace {
+const int kNumIterations = 10000;
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int width,
+ int height);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+ ~SumSquaresTest() override = default;
+ void SetUp() override {  // Seeds rnd_ deterministically and allocates src_.
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+ ASSERT_NE(src_, nullptr);
+ }
+
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);  // true: random data; false: extreme data.
+ void RunSpeedTest();  // Times ref_func and tst_func for each block size.
+
+ void GenRandomData(int width, int height, int stride) {  // Random fill.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {  // Constant fill.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : -(limit - 1);  // One sign per call.
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncs params_;  // Holds the ref_func / tst_func pair under test.
+ int16_t *src_;  // Input buffer; allocated in SetUp(), freed in TearDown().
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
+
+void SumSquaresTest::RunTest(bool is_random) {  // Compares tst_func to ref_func.
+ int failed = 0;  // Latches after the first mismatch so it is reported once.
+ for (int k = 0; k < kNumIterations; k++) {
+ const int width = 4 * (rnd_(31) + 1); // Up to 128x128
+ const int height = 4 * (rnd_(31) + 1); // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ if (is_random) {
+ GenRandomData(width, height, stride);
+ } else {
+ GenExtremeData(width, height, stride);
+ }
+ const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
+ uint64_t res_tst;
+ API_REGISTER_STATE_CHECK(res_tst =
+ params_.tst_func(src_, stride, width, height));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
+ }
+}
+
+void SumSquaresTest::RunSpeedTest() {  // Prints per-call time of both funcs.
+ for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+ const int width = block_size_wide[block]; // Up to 128x128
+ const int height = block_size_high[block]; // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);  // Fewer for big blocks.
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(src_, stride, width, height);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(src_, stride, width, height);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+}
+
+TEST_P(SumSquaresTest, OperationCheck) {
+ RunTest(true); // GenRandomData
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+ RunTest(false); // GenExtremeData
+}
+
+TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_neon)));
+
+#endif // HAVE_NEON
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_sve)));
+
+#endif // HAVE_SVE
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_avx2)));
+#endif // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t n);
+typedef libaom_test::FuncParam<F1D> TestFuncs1D;
+
+class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
+ protected:
+ static const int kIterations = 1000;
+ static const int kMaxSize = 256;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquares1DTest);
+
+TEST_P(SumSquares1DTest, RandomValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+ src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
+
+ // Block size is between 64 and 128 * 128 and is always a multiple of 64.
+ const int n = (rng_(255) + 1) * 64;
+
+ const uint64_t ref_res = params_.ref_func(src, n);
+ uint64_t tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(SumSquares1DTest, ExtremeValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
+ }
+
+ // Block size is between 64 and 128 * 128 and is always a multiple of 64.
+ const int n = (rng_(255) + 1) * 64;
+
+ const uint64_t ref_res = params_.ref_func(src, n);
+ uint64_t tst_res;
+ API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, n));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, SumSquares1DTest,
+ ::testing::Values(TestFuncs1D(
+ aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SumSquares1DTest,
+ ::testing::Values(TestFuncs1D(
+ aom_sum_squares_i16_c, aom_sum_squares_i16_neon)));
+
+#endif // HAVE_NEON
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(SVE, SumSquares1DTest,
+ ::testing::Values(TestFuncs1D(
+ aom_sum_squares_i16_c, aom_sum_squares_i16_sve)));
+
+#endif // HAVE_SVE
+
+typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height);
+typedef libaom_test::FuncParam<SSEFunc> TestSSEFuncs;
+
+typedef std::tuple<TestSSEFuncs, int> SSETestParam;
+
+class SSETest : public ::testing::TestWithParam<SSETestParam> {
+ public:
+ ~SSETest() override = default;
+ void SetUp() override {
+ params_ = GET_PARAM(0);
+ width_ = GET_PARAM(1);
+ is_hbd_ =  // High bitdepth is inferred from the reference function pointer.
+#if CONFIG_AV1_HIGHBITDEPTH
+ params_.ref_func == aom_highbd_sse_c;
+#else
+ false;
+#endif
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
+ ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2));
+ ASSERT_NE(src_, nullptr);
+ ASSERT_NE(ref_, nullptr);
+ }
+
+ void TearDown() override {
+ aom_free(src_);
+ aom_free(ref_);
+ }
+ void RunTest(bool is_random, int width, int height, int run_times);
+
+ void GenRandomData(int width, int height, int stride) {  // Fills src_ and ref_.
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
+ uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ if (!is_hbd_) {  // 8-bit path writes bytes; hbd path writes 16-bit samples.
+ src_[ii * stride + jj] = rnd_.Rand8();
+ ref_[ii * stride + jj] = rnd_.Rand8();
+ } else {
+ src16[ii * stride + jj] = rnd_(limit);
+ ref16[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride, uint8_t *data,
+ int16_t val) {  // Fills `data` with the single value `val`.
+ uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ if (!is_hbd_) {
+ data[ii * stride + jj] = static_cast<uint8_t>(val);
+ } else {
+ data16[ii * stride + jj] = val;
+ }
+ }
+ }
+ }
+
+ protected:
+ bool is_hbd_;  // Set in SetUp(); selects 8-bit or 16-bit sample writes.
+ int width_;
+ TestSSEFuncs params_;
+ uint8_t *src_;  // Allocated as bytes; viewed as uint16_t when is_hbd_ is true.
+ uint8_t *ref_;  // Allocated as bytes; viewed as uint16_t when is_hbd_ is true.
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
+// Runs 3 passes of ref_func vs. tst_func; run_times > 1 times them instead.
+void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
+  int failed = 0;  // Latched on the first mismatch; later ones are not reported.
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    int stride = 4 << rnd_(7);  // Up to 256 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (is_random) {
+      GenRandomData(width, height, stride);
+    } else {
+      const int msb = is_hbd_ ? 12 : 8;  // Up to 12 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {  // First pass: src all zero, ref all `limit`.
+        GenExtremeData(width, height, stride, src_, 0);
+        GenExtremeData(width, height, stride, ref_, limit);
+      } else {  // Remaining passes: the two buffers swap roles.
+        GenExtremeData(width, height, stride, src_, limit);
+        GenExtremeData(width, height, stride, ref_, 0);
+      }
+    }
+    int64_t res_ref, res_tst;
+    uint8_t *src = src_;
+    uint8_t *ref = ref_;
+    if (is_hbd_) {  // High-bitdepth kernels are handed CONVERT_TO_BYTEPTR pointers.
+      src = CONVERT_TO_BYTEPTR(src_);
+      ref = CONVERT_TO_BYTEPTR(ref_);
+    }
+    res_ref = params_.ref_func(src, stride, ref, stride, width, height);
+    res_tst = params_.tst_func(src, stride, ref, stride, width, height);
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(src, stride, ref, stride, width, height);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(src, stride, ref, stride, width, height);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+      // Floating-point gain: integer division truncated and could divide by 0.
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%f\n",
+          elapsed_time_c, elapsed_time_simd,
+          static_cast<double>(elapsed_time_c) / elapsed_time_simd);
+    } else {
+      if (!failed) {
+        failed = res_ref != res_tst;
+        EXPECT_EQ(res_ref, res_tst)
+            << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
+            << width << "x" << height
+            << "] C output does not match optimized output.";
+      }
+    }
+  }
+}
+
+TEST_P(SSETest, OperationCheck) {  // Random data; heights 4..128 in steps of 4.
+ for (int height = 4; height <= 128; height += 4) {
+ RunTest(true, width_, height, 1); // GenRandomData
+ }
+}
+
+TEST_P(SSETest, ExtremeValues) {  // Constant min/max data over the same height sweep.
+ for (int height = 4; height <= 128; height += 4) {
+ RunTest(false, width_, height, 1);  // GenExtremeData
+ }
+}
+
+TEST_P(SSETest, DISABLED_Speed) {  // Timing run: run_times = 100 selects the printf path.
+ for (int height = 4; height <= 128; height += 4) {
+ RunTest(true, width_, height, 100);
+ }
+}
+
+#if HAVE_NEON
+TestSSEFuncs sse_neon[] = {  // C vs. NEON; the hbd pair exists only in high-bitdepth builds.
+ TestSSEFuncs(&aom_sse_c, &aom_sse_neon),
+#if CONFIG_AV1_HIGHBITDEPTH
+ TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_neon)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
+ Combine(ValuesIn(sse_neon), Range(4, 129, 4)));  // Widths 4, 8, ..., 128.
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+TestSSEFuncs sse_neon_dotprod[] = {  // Only a low-bitdepth pair is registered here.
+ TestSSEFuncs(&aom_sse_c, &aom_sse_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
+ Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_SSE4_1
+TestSSEFuncs sse_sse4[] = {  // C vs. SSE4.1; the hbd pair exists only in high-bitdepth builds.
+ TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
+#if CONFIG_AV1_HIGHBITDEPTH
+ TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_sse4_1)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
+ Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+TestSSEFuncs sse_avx2[] = {  // C vs. AVX2; the hbd pair exists only in high-bitdepth builds.
+ TestSSEFuncs(&aom_sse_c, &aom_sse_avx2),
+#if CONFIG_AV1_HIGHBITDEPTH
+ TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_avx2)
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
+ Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
+#endif // HAVE_AVX2
+
+#if HAVE_SVE
+#if CONFIG_AV1_HIGHBITDEPTH  // The only SVE pair registered is the high-bitdepth one.
+TestSSEFuncs sse_sve[] = { TestSSEFuncs(&aom_highbd_sse_c,
+ &aom_highbd_sse_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, SSETest,
+ Combine(ValuesIn(sse_sve), Range(4, 129, 4)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SVE
+
+//////////////////////////////////////////////////////////////////////////////
+// get_blk sum squares test functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*sse_sum_func)(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum);  // Results come back through x_sum / x2_sum.
+typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs;  // Pair accessed as ref_func / tst_func.
+
+typedef std::tuple<TestSSE_SumFuncs, TX_SIZE> SSE_SumTestParam;  // (function pair, transform size).
+
+class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> {  // Fixture: block sum / SSE kernel vs. C reference.
+ public:
+ ~SSE_Sum_Test() override = default;
+ void SetUp() override {
+ params_ = GET_PARAM(0);  // Reference / optimized function pair.
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2));  // Room for 256 * 256 int16_t entries.
+ ASSERT_NE(src_, nullptr);
+ }
+
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random, int tx_size, int run_times);
+
+ void GenRandomData(int width, int height, int stride) {  // Random fill of src_.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);  // Bound handed to rnd_().
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride, int16_t *data,
+ int16_t val) {  // Constant fill: every entry of data becomes val.
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ data[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestSSE_SumFuncs params_;
+ int16_t *src_;
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSE_Sum_Test);
+
+void SSE_Sum_Test::RunTest(bool is_random, int tx_size, int run_times) {  // 3 passes; run_times > 1 times instead of checks.
+ aom_usec_timer ref_timer, test_timer;
+ int width = tx_size_wide[tx_size];  // Block dimensions come from the transform-size tables.
+ int height = tx_size_high[tx_size];
+ for (int k = 0; k < 3; k++) {
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ if (is_random) {
+ GenRandomData(width, height, stride);
+ } else {
+ const int msb = 12; // Up to 12 bit input
+ const int limit = (1 << msb) - 1;
+ if (k == 0) {  // First pass: all +limit; later passes: all -limit.
+ GenExtremeData(width, height, stride, src_, limit);
+ } else {
+ GenExtremeData(width, height, stride, src_, -limit);
+ }
+ }
+ int sum_c = 0;  // *_c hold the reference results, *_intr the optimized ones.
+ int64_t sse_intr = 0;
+ int sum_intr = 0;
+ int64_t sse_c = 0;
+
+ params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
+ params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
+
+ if (run_times > 1) {  // Speed mode: time both functions and print the ratio.
+ aom_usec_timer_start(&ref_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.ref_func(src_, stride, width, height, &sum_c, &sse_c);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%f\t width=%d\t height=%d \n",
+ elapsed_time_c, elapsed_time_simd,
+ (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+ height);
+
+ } else {  // Check mode: both outputs must match the reference.
+ EXPECT_EQ(sum_c, sum_intr)
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ EXPECT_EQ(sse_c, sse_intr)
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
+ }
+}
+
+TEST_P(SSE_Sum_Test, OperationCheck) {  // GET_PARAM(1) is the transform size of this instantiation.
+ RunTest(true, GET_PARAM(1), 1); // GenRandomData
+}
+
+TEST_P(SSE_Sum_Test, ExtremeValues) { RunTest(false, GET_PARAM(1), 1); }  // GenExtremeData
+
+TEST_P(SSE_Sum_Test, DISABLED_Speed) { RunTest(true, GET_PARAM(1), 10000); }  // Timing path (run_times > 1).
+
+#if HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON || HAVE_SVE  // Covers every SSE_Sum_Test user below.
+const TX_SIZE kValidBlockSize[] = { TX_4X4,   TX_8X8,   TX_16X16, TX_32X32,
+                                    TX_64X64, TX_4X8,   TX_8X4,   TX_8X16,
+                                    TX_16X8,  TX_16X32, TX_32X16, TX_64X32,
+                                    TX_32X64, TX_4X16,  TX_16X4,  TX_8X32,
+                                    TX_32X8,  TX_16X64, TX_64X16 };
+#endif  // HAVE_SSE2 || HAVE_AVX2 || HAVE_NEON || HAVE_SVE
+
+#if HAVE_SSE2  // Each section below pairs the C reference with one SIMD flavour.
+TestSSE_SumFuncs sse_sum_sse2[] = { TestSSE_SumFuncs(
+ &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, SSE_Sum_Test,
+ Combine(ValuesIn(sse_sum_sse2),
+ ValuesIn(kValidBlockSize)));  // Every size listed in kValidBlockSize.
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+TestSSE_SumFuncs sse_sum_avx2[] = { TestSSE_SumFuncs(
+ &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, SSE_Sum_Test,
+ Combine(ValuesIn(sse_sum_avx2),
+ ValuesIn(kValidBlockSize)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+TestSSE_SumFuncs sse_sum_neon[] = { TestSSE_SumFuncs(
+ &aom_get_blk_sse_sum_c, &aom_get_blk_sse_sum_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, SSE_Sum_Test,
+ Combine(ValuesIn(sse_sum_neon),
+ ValuesIn(kValidBlockSize)));
+#endif // HAVE_NEON
+
+#if HAVE_SVE  // NOTE(review): kValidBlockSize is only declared under SSE2/AVX2/NEON; confirm SVE builds always set one.
+TestSSE_SumFuncs sse_sum_sve[] = { TestSSE_SumFuncs(&aom_get_blk_sse_sum_c,
+ &aom_get_blk_sse_sum_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, SSE_Sum_Test,
+ Combine(ValuesIn(sse_sum_sve),
+ ValuesIn(kValidBlockSize)));
+#endif // HAVE_SVE
+
+//////////////////////////////////////////////////////////////////////////////
+// 2D Variance test functions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*Var2DFunc)(uint8_t *src, int stride, int width, int height);  // Signature of the 2D variance kernels under test.
+typedef libaom_test::FuncParam<Var2DFunc> TestFuncVar2D;  // Pair accessed as ref_func / tst_func.
+
+const uint16_t test_block_size[2] = { 128, 256 };  // Square block sides used by the RunSpeedTest() methods.
+
+class Lowbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {  // Fixture: 8-bit 2D variance kernel vs. C reference.
+ public:
+ ~Lowbd2dVarTest() override = default;
+ void SetUp() override {
+ params_ = this->GetParam();  // Reference / optimized function pair.
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, 512 * 512 * sizeof(uint8_t)));  // Room for 512 * 512 samples.
+ ASSERT_NE(src_, nullptr);
+ }
+
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);
+ void RunSpeedTest();
+
+ void GenRandomData(int width, int height, int stride) {  // Random fill of src_.
+ const int msb = 7; // Up to 8 bit input
+ const int limit = 1 << (msb + 1);  // Bound handed to rnd_().
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {  // Constant fill of src_.
+ const int msb = 7; // Up to 8 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : 0;  // One value for the whole block: limit - 1 or 0.
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncVar2D params_;
+ uint8_t *src_;
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Lowbd2dVarTest);
+// Compares tst_func with ref_func over kNumIterations randomly sized blocks.
+void Lowbd2dVarTest::RunTest(bool is_random) {
+  int failed = 0;  // Latched on the first mismatch; later ones are not reported.
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
+    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
+    int stride = 4 << rnd_(8);              // Up to 512 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    if (is_random) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+
+    const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
+    uint64_t res_tst;
+    API_REGISTER_STATE_CHECK(res_tst =
+                                 params_.tst_func(src_, stride, width, height));
+    // The failure text names this suite; it used to say "Sum Squares Test".
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: 2D Variance Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+    }
+  }
+}
+
+void Lowbd2dVarTest::RunSpeedTest() {
+  for (int idx = 0; idx < 2; ++idx) {  // One timing run per test_block_size entry.
+    const int side = test_block_size[idx];  // Blocks are square: width == height.
+    int pitch;
+    do {  // Redraw until the stride can hold a full row.
+      pitch = 4 << rnd_(8);  // Up to 512 stride
+    } while (pitch < side);
+    GenExtremeData(side, side, pitch);
+    const int iterations = 1000000000 / (side + side);
+
+    aom_usec_timer ref_clock;
+    aom_usec_timer_start(&ref_clock);
+    for (int n = 0; n < iterations; ++n) {
+      params_.ref_func(src_, pitch, side, side);
+    }
+    aom_usec_timer_mark(&ref_clock);
+    const int t_ref = static_cast<int>(aom_usec_timer_elapsed(&ref_clock));
+
+    aom_usec_timer tst_clock;
+    aom_usec_timer_start(&tst_clock);
+    for (int n = 0; n < iterations; ++n) {
+      params_.tst_func(src_, pitch, side, side);
+    }
+    aom_usec_timer_mark(&tst_clock);
+    const int t_tst = static_cast<int>(aom_usec_timer_elapsed(&tst_clock));
+    printf("%3dx%-3d: Scaling = %.2f\n", side, side,
+           static_cast<double>(t_ref) / t_tst);
+  }
+}
+
+TEST_P(Lowbd2dVarTest, OperationCheck) {  // Reference vs. optimized on random samples.
+ RunTest(true); // GenRandomData
+}
+
+TEST_P(Lowbd2dVarTest, ExtremeValues) {  // Reference vs. optimized on constant blocks.
+ RunTest(false); // GenExtremeData
+}
+
+TEST_P(Lowbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }  // Prints timing ratios only; nothing is asserted.
+
+#if HAVE_SSE2  // Each section below pairs aom_var_2d_u8_c with one SIMD flavour.
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+ &aom_var_2d_u8_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+ &aom_var_2d_u8_avx2)));
+
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c,
+ &aom_var_2d_u8_neon)));
+
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, Lowbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(
+ &aom_var_2d_u8_c, &aom_var_2d_u8_neon_dotprod)));
+
+#endif // HAVE_NEON_DOTPROD
+
+class Highbd2dVarTest : public ::testing::TestWithParam<TestFuncVar2D> {  // Fixture: 16-bit 2D variance kernel vs. C reference.
+ public:
+ ~Highbd2dVarTest() override = default;
+ void SetUp() override {
+ params_ = this->GetParam();  // Reference / optimized function pair.
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, 512 * 512 * sizeof(uint16_t)));  // Room for 512 * 512 samples.
+ ASSERT_NE(src_, nullptr);
+ }
+
+ void TearDown() override { aom_free(src_); }
+ void RunTest(bool is_random);
+ void RunSpeedTest();
+
+ void GenRandomData(int width, int height, int stride) {  // Random fill of src_.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);  // Bound handed to rnd_().
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {  // Constant fill of src_.
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : 0;  // One value for the whole block: limit - 1 or 0.
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncVar2D params_;
+ uint16_t *src_;  // Handed to the kernels through CONVERT_TO_BYTEPTR.
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Highbd2dVarTest);
+// Compares tst_func with ref_func over kNumIterations randomly sized blocks.
+void Highbd2dVarTest::RunTest(bool is_random) {
+  int failed = 0;  // Latched on the first mismatch; later ones are not reported.
+  for (int k = 0; k < kNumIterations; k++) {
+    const int width = 4 * (rnd_(63) + 1);   // Up to 256x256
+    const int height = 4 * (rnd_(63) + 1);  // Up to 256x256
+    int stride = 4 << rnd_(8);              // Up to 512 stride
+    while (stride < width) {                // Make sure it's valid
+      stride = 4 << rnd_(8);
+    }
+    if (is_random) {
+      GenRandomData(width, height, stride);
+    } else {
+      GenExtremeData(width, height, stride);
+    }
+
+    const uint64_t res_ref =
+        params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
+    uint64_t res_tst;
+    API_REGISTER_STATE_CHECK(
+        res_tst =
+            params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
+    // The failure text names this suite; it used to say "Sum Squares Test".
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: 2D Variance Test [" << width << "x" << height
+          << "] C output does not match optimized output.";
+    }
+  }
+}
+
+void Highbd2dVarTest::RunSpeedTest() {
+  for (int idx = 0; idx < 2; ++idx) {  // One timing run per test_block_size entry.
+    const int side = test_block_size[idx];  // Blocks are square: width == height.
+    int pitch;
+    do {  // Redraw until the stride can hold a full row.
+      pitch = 4 << rnd_(8);  // Up to 512 stride
+    } while (pitch < side);
+    GenExtremeData(side, side, pitch);
+    const int iterations = 1000000000 / (side + side);
+
+    aom_usec_timer ref_clock;
+    aom_usec_timer_start(&ref_clock);
+    for (int n = 0; n < iterations; ++n) {
+      params_.ref_func(CONVERT_TO_BYTEPTR(src_), pitch, side, side);
+    }
+    aom_usec_timer_mark(&ref_clock);
+    const int t_ref = static_cast<int>(aom_usec_timer_elapsed(&ref_clock));
+
+    aom_usec_timer tst_clock;
+    aom_usec_timer_start(&tst_clock);
+    for (int n = 0; n < iterations; ++n) {
+      params_.tst_func(CONVERT_TO_BYTEPTR(src_), pitch, side, side);
+    }
+    aom_usec_timer_mark(&tst_clock);
+    const int t_tst = static_cast<int>(aom_usec_timer_elapsed(&tst_clock));
+    printf("%3dx%-3d: Scaling = %.2f\n", side, side,
+           static_cast<double>(t_ref) / t_tst);
+  }
+}
+
+TEST_P(Highbd2dVarTest, OperationCheck) {  // Reference vs. optimized on random samples.
+ RunTest(true); // GenRandomData
+}
+
+TEST_P(Highbd2dVarTest, ExtremeValues) {  // Reference vs. optimized on constant blocks.
+ RunTest(false); // GenExtremeData
+}
+
+TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }  // Prints timing ratios only; nothing is asserted.
+
+#if HAVE_SSE2  // Each section below pairs aom_var_2d_u16_c with one SIMD flavour.
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_avx2)));
+
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_neon)));
+
+#endif // HAVE_NEON
+
+#if HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(SVE, Highbd2dVarTest,
+ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c,
+ &aom_var_2d_u16_sve)));
+
+#endif // HAVE_SVE
+} // namespace
diff --git a/third_party/aom/test/svc_datarate_test.cc b/third_party/aom/test/svc_datarate_test.cc
new file mode 100644
index 0000000000..cc3fb674b3
--- /dev/null
+++ b/third_party/aom/test/svc_datarate_test.cc
@@ -0,0 +1,2675 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "config/aom_config.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/encoder.h"
+
+namespace datarate_test {
+namespace {
+
+struct FrameInfo {  // Timestamp, width and height recorded for one frame.
+  aom_codec_pts_t pts;
+  unsigned int w;
+  unsigned int h;
+
+  FrameInfo(aom_codec_pts_t pts_in, unsigned int w_in, unsigned int h_in)
+      : pts(pts_in), w(w_in), h(h_in) {}
+};
+
+class DatarateTestSVC
+ : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+ unsigned int, int>,
+ public DatarateTest {
+ public:
+ DatarateTestSVC() : DatarateTest(GET_PARAM(0)) {
+ set_cpu_used_ = GET_PARAM(2);
+ aq_mode_ = GET_PARAM(3);
+ }
+
+ protected:
+ void SetUp() override {
+ InitializeConfig(GET_PARAM(1));
+ ResetModel();
+ }
+
+ void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) override {
+ frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+ ++decoded_nframes_;
+ }
+
+ std::vector<FrameInfo> frame_info_list_;
+
+ int GetNumSpatialLayers() override { return number_spatial_layers_; }
+
+ void ResetModel() override {
+ DatarateTest::ResetModel();
+ layer_frame_cnt_ = 0;
+ superframe_cnt_ = 0;
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 1;
+ for (int i = 0; i < AOM_MAX_LAYERS; i++) {
+ target_layer_bitrate_[i] = 0;
+ effective_datarate_tl[i] = 0.0;
+ }
+ memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
+ memset(&svc_params_, 0, sizeof(aom_svc_params_t));
+ memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+ memset(&ref_frame_comp_pred_, 0, sizeof(aom_svc_ref_frame_comp_pred_t));
+ drop_frames_ = 0;
+ for (int i = 0; i < 1000; i++) drop_frames_list_[i] = 1000;
+ decoded_nframes_ = 0;
+ mismatch_nframes_ = 0;
+ mismatch_psnr_ = 0.0;
+ set_frame_level_er_ = 0;
+ multi_ref_ = 0;
+ use_fixed_mode_svc_ = 0;
+ comp_pred_ = 0;
+ dynamic_enable_disable_mode_ = 0;
+ intra_only_ = 0;
+ frame_to_start_decoding_ = 0;
+ layer_to_decode_ = 0;
+ frame_sync_ = 0;
+ current_video_frame_ = 0;
+ screen_mode_ = 0;
+ rps_mode_ = 0;
+ rps_recovery_frame_ = 0;
+ user_define_frame_qp_ = 0;
+ set_speed_per_layer_ = false;
+ simulcast_mode_ = false;
+ }
+
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ int spatial_layer_id = 0;
+ current_video_frame_ = video->frame();
+ // video->frame() is called every superframe, so we should condition
+ // this on layer_frame_cnt_ = 0, so we only do this once on the very
+ // first frame.
+ if (video->frame() == 0 && layer_frame_cnt_ == 0) {
+ initialize_svc(number_temporal_layers_, number_spatial_layers_,
+ &svc_params_);
+ if (dynamic_enable_disable_mode_ == 1) {
+ svc_params_.layer_target_bitrate[2] = 0;
+ cfg_.rc_target_bitrate -= target_layer_bitrate_[2];
+ }
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ // TODO(aomedia:3032): Configure KSVC in fixed mode.
+ encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0);
+ encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+ encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+ if (cfg_.g_threads > 1) {
+ if (cfg_.g_threads == 4) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 2);
+ encoder->Control(AV1E_SET_TILE_ROWS, 2);
+ } else if (cfg_.g_threads == 8) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+ encoder->Control(AV1E_SET_TILE_ROWS, 2);
+ } else {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, cfg_.g_threads >> 1);
+ }
+ encoder->Control(AV1E_SET_ROW_MT, 1);
+ }
+ if (screen_mode_) {
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ }
+ }
+ if (number_spatial_layers_ == 2) {
+ spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
+ } else if (number_spatial_layers_ == 3) {
+ spatial_layer_id = (layer_frame_cnt_ % 3 == 0) ? 0
+ : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1
+ : 2;
+ }
+ // Set the reference/update flags, layer_id, and reference_map
+ // buffer index.
+ frame_flags_ = set_layer_pattern(
+ video->frame(), &layer_id_, &ref_frame_config_, &ref_frame_comp_pred_,
+ spatial_layer_id, multi_ref_, comp_pred_,
+ (video->frame() % cfg_.kf_max_dist) == 0, dynamic_enable_disable_mode_,
+ rps_mode_, rps_recovery_frame_, simulcast_mode_);
+ if (intra_only_ == 1 && frame_sync_ > 0) {
+ // Set an Intra-only frame on SL0 at frame_sync_.
+ // In order to allow decoding to start on SL0 in mid-sequence we need to
+ // set and refresh all the slots used on SL0 stream, which is 0 and 3
+ // for this test pattern. The other slots (1, 2, 4, 5) are used for the
+ // SL > 0 layers and these slots are not refreshed on frame_sync_, so
+ // temporal prediction for the top layers can continue.
+ if (spatial_layer_id == 0 && video->frame() == frame_sync_) {
+ ref_frame_config_.ref_idx[0] = 0;
+ ref_frame_config_.ref_idx[3] = 3;
+ ref_frame_config_.refresh[0] = 1;
+ ref_frame_config_.refresh[3] = 1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config_.reference[i] = 0;
+ }
+ }
+ if (intra_only_ && video->frame() == 50 && spatial_layer_id == 1) {
+ // Force an intra_only frame here, for SL1.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config_.reference[i] = 0;
+ }
+ encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+ // The SET_SVC_REF_FRAME_CONFIG and AV1E_SET_SVC_REF_FRAME_COMP_PRED api is
+ // for the flexible SVC mode (i.e., use_fixed_mode_svc == 0).
+ if (!use_fixed_mode_svc_) {
+ encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+ encoder->Control(AV1E_SET_SVC_REF_FRAME_COMP_PRED, &ref_frame_comp_pred_);
+ }
+ if (set_speed_per_layer_) {
+ int speed_per_layer = 10;
+ if (layer_id_.spatial_layer_id == 0) {
+ // For base SL0,TL0: use the speed the test loops over.
+ if (layer_id_.temporal_layer_id == 1) speed_per_layer = 7;
+ if (layer_id_.temporal_layer_id == 2) speed_per_layer = 8;
+ } else if (layer_id_.spatial_layer_id == 1) {
+ if (layer_id_.temporal_layer_id == 0) speed_per_layer = 7;
+ if (layer_id_.temporal_layer_id == 1) speed_per_layer = 8;
+ if (layer_id_.temporal_layer_id == 2) speed_per_layer = 9;
+ } else if (layer_id_.spatial_layer_id == 2) {
+ if (layer_id_.temporal_layer_id == 0) speed_per_layer = 8;
+ if (layer_id_.temporal_layer_id == 1) speed_per_layer = 9;
+ if (layer_id_.temporal_layer_id == 2) speed_per_layer = 10;
+ }
+ encoder->Control(AOME_SET_CPUUSED, speed_per_layer);
+ }
+ if (set_frame_level_er_) {
+ int mode =
+ (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0);
+ encoder->Control(AV1E_SET_ERROR_RESILIENT_MODE, mode);
+ }
+ if (dynamic_enable_disable_mode_ == 1) {
+ if (layer_frame_cnt_ == 300 && spatial_layer_id == 0) {
+ // Enable: set top spatial layer bitrate back to non-zero.
+ svc_params_.layer_target_bitrate[2] = target_layer_bitrate_[2];
+ cfg_.rc_target_bitrate += target_layer_bitrate_[2];
+ encoder->Config(&cfg_);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ }
+ } else if (dynamic_enable_disable_mode_ == 2) {
+ if (layer_frame_cnt_ == 300 && spatial_layer_id == 0) {
+ // Disable top spatial layer mid-stream.
+ svc_params_.layer_target_bitrate[2] = 0;
+ cfg_.rc_target_bitrate -= target_layer_bitrate_[2];
+ encoder->Config(&cfg_);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ } else if (layer_frame_cnt_ == 600 && spatial_layer_id == 0) {
+ // Enable top spatial layer mid-stream.
+ svc_params_.layer_target_bitrate[2] = target_layer_bitrate_[2];
+ cfg_.rc_target_bitrate += target_layer_bitrate_[2];
+ encoder->Config(&cfg_);
+ encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+ }
+ }
+ layer_frame_cnt_++;
+ DatarateTest::PreEncodeFrameHook(video, encoder);
+
+ if (user_define_frame_qp_) {
+ frame_qp_ = rnd_.PseudoUniform(63);
+ encoder->Control(AV1E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+ }
+ }
+
+ void PostEncodeFrameHook(::libaom_test::Encoder *encoder) override {
+ int num_operating_points;
+ encoder->Control(AV1E_GET_NUM_OPERATING_POINTS, &num_operating_points);
+ ASSERT_EQ(num_operating_points,
+ number_temporal_layers_ * number_spatial_layers_);
+
+ if (user_define_frame_qp_) {
+ if (current_video_frame_ >= static_cast<unsigned int>(total_frame_))
+ return;
+ int qp;
+ encoder->Control(AOME_GET_LAST_QUANTIZER_64, &qp);
+ ASSERT_EQ(qp, frame_qp_);
+ }
+ }
+
+ void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+ const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+ // Update the layer cumulative bitrate.
+ for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
+ i++) {
+ int layer = layer_id_.spatial_layer_id * number_temporal_layers_ + i;
+ effective_datarate_tl[layer] += 1.0 * frame_size_in_bits;
+ }
+ if (layer_id_.spatial_layer_id == number_spatial_layers_ - 1) {
+ last_pts_ = pkt->data.frame.pts;
+ superframe_cnt_++;
+ }
+ // For simulcast mode: verify that for first frame to start decoding,
+ // for SL > 0, are Intra-only frames (not Key), whereas SL0 is Key.
+ if (simulcast_mode_ && superframe_cnt_ == (int)frame_to_start_decoding_) {
+ if (layer_id_.spatial_layer_id > 0) {
+ EXPECT_NE(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ } else if (layer_id_.spatial_layer_id == 0) {
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ }
+ }
+ }
+
+ void EndPassHook() override {
+ duration_ = ((last_pts_ + 1) * timebase_);
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
+ }
+ }
+
+ bool DoDecode() const override {
+ if (drop_frames_ > 0) {
+ for (unsigned int i = 0; i < drop_frames_; ++i) {
+ if (drop_frames_list_[i] == (unsigned int)superframe_cnt_) {
+ std::cout << " Skipping decoding frame: "
+ << drop_frames_list_[i] << "\n";
+ return false;
+ }
+ }
+ } else if (intra_only_ == 1) {
+ // Only start decoding at frame_to_start_decoding_.
+ if (current_video_frame_ < frame_to_start_decoding_) return false;
+ // Only decode base layer for 3SL, for layer_to_decode_ = 0.
+ if (layer_to_decode_ == 0 && frame_sync_ > 0 &&
+ (layer_frame_cnt_ - 1) % 3 != 0)
+ return false;
+ } else if (simulcast_mode_) {
+ // Only start decoding at frame_to_start_decoding_ and only
+ // for top spatial layer SL2 (layer_to_decode_).
+ if (current_video_frame_ < frame_to_start_decoding_) return false;
+ if (layer_id_.spatial_layer_id < (int)layer_to_decode_) return false;
+ }
+ return true;
+ }
+
+ void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) override {
+ double mismatch_psnr = compute_psnr(img1, img2);
+ mismatch_psnr_ += mismatch_psnr;
+ ++mismatch_nframes_;
+ }
+
+ unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+ unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
+ static void ref_config_rps(aom_svc_ref_frame_config_t *ref_frame_config,
+ int frame_cnt, int rps_recovery_frame) {
+ // Pattern of 3 references with (ALTREF and GOLDEN) trailing
+ // LAST by 4 and 8 frame, with some switching logic to
+ // only predict from longer-term reference.
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ const int lag_alt = 4;
+ const int lag_gld = 8;
+ const int sh = 8; // slots 0 - 7.
+ // Moving index slot for last: 0 - (sh - 1)
+ if (frame_cnt > 1) last_idx = (frame_cnt - 1) % sh;
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = frame_cnt % sh;
+ // Moving index for gld_ref, lag behind current by lag_gld
+ if (frame_cnt > lag_gld) gld_idx = (frame_cnt - lag_gld) % sh;
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (frame_cnt > lag_alt) alt_ref_idx = (frame_cnt - lag_alt) % sh;
+ // Set the ref_idx.
+ // Default all references (7) to slot for last.
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ ref_frame_config->ref_idx[i] = last_idx;
+ // Set the ref_idx for the relevant references.
+ ref_frame_config->ref_idx[0] = last_idx;
+ ref_frame_config->ref_idx[1] = last_idx_refresh;
+ ref_frame_config->ref_idx[3] = gld_idx;
+ ref_frame_config->ref_idx[6] = alt_ref_idx;
+ // Refresh this slot, which will become LAST on next frame.
+ ref_frame_config->refresh[last_idx_refresh] = 1;
+ // Reference LAST, ALTREF, and GOLDEN
+ ref_frame_config->reference[0] = 1;
+ ref_frame_config->reference[6] = 1;
+ ref_frame_config->reference[3] = 1;
+ if (frame_cnt == rps_recovery_frame) {
+ // Switch to only reference GOLDEN at recovery_frame.
+ ref_frame_config->reference[0] = 0;
+ ref_frame_config->reference[6] = 0;
+ ref_frame_config->reference[3] = 1;
+ } else if (frame_cnt > rps_recovery_frame &&
+ frame_cnt < rps_recovery_frame + 8) {
+ // Go back to predicting from LAST, and after
+ // 8 frames (GOLDEN is 8 frames away) go back
+ // to predicting off GOLDEN and ALTREF.
+ ref_frame_config->reference[0] = 1;
+ ref_frame_config->reference[6] = 0;
+ ref_frame_config->reference[3] = 0;
+ }
+ }
+
+ // Simulcast mode for 3 spatial and 3 temporal layers.
+ // No inter-layer prediction, only prediction is temporal and single
+ // reference (LAST).
+ // No overlap in buffer slots between spatial layers. So for example,
+ // SL0 only uses slots 0 and 1.
+ // SL1 only uses slots 2 and 3.
+ // SL2 only uses slots 4 and 5.
+ // All 7 references for each inter-frame must only access buffer slots
+ // for that spatial layer.
+ // On key (super)frames: SL1 and SL2 must have no references set
+ // and must refresh all the slots for that layer only (so 2 and 3
+ // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally
+ // as a Key frame (refresh all slots). SL1/SL2 will be labelled
+ // internally as Intra-only frames that allow that stream to be decoded.
+ // These conditions will allow for each spatial stream to be
+ // independently decodable.
+ static void ref_config_simulcast3SL3TL(
+     aom_svc_ref_frame_config_t *ref_frame_config,
+     aom_svc_layer_id_t *layer_id, int is_key_frame, int superframe_cnt) {
+   // Start from a clean configuration: no reference is used, no slot is
+   // refreshed, and every reference points at slot 0.
+   for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+     ref_frame_config->reference[ref] = 0;
+     ref_frame_config->ref_idx[ref] = 0;
+   }
+   for (int slot = 0; slot < REF_FRAMES; ++slot) {
+     ref_frame_config->refresh[slot] = 0;
+   }
+
+   // Each handled spatial layer owns a private pair of buffer slots:
+   // SL0 -> {0, 1}, SL1 -> {2, 3}, SL2 -> {4, 5}. Any other spatial layer id
+   // gets no per-layer reference setup at all.
+   const int sl = layer_id->spatial_layer_id;
+   const bool known_layer = (sl >= 0 && sl <= 2);
+   const int first_slot = known_layer ? 2 * sl : 0;
+   const int second_slot = first_slot + 1;
+
+   if (is_key_frame) {
+     // Key superframe: no references are enabled. LAST/GOLDEN are mapped onto
+     // the layer's two slots and both slots are refreshed. The temporal layer
+     // id is left untouched.
+     if (known_layer) {
+       ref_frame_config->ref_idx[0] = first_slot;
+       ref_frame_config->ref_idx[3] = second_slot;
+       ref_frame_config->refresh[first_slot] = 1;
+       ref_frame_config->refresh[second_slot] = 1;
+     }
+     return;
+   }
+
+   // Position of this superframe inside the 4-superframe temporal pattern.
+   int phase;
+   if (superframe_cnt % 4 == 0) {
+     phase = 0;
+   } else if ((superframe_cnt - 1) % 4 == 0) {
+     phase = 1;
+   } else if ((superframe_cnt - 2) % 4 == 0) {
+     phase = 2;
+   } else if ((superframe_cnt - 3) % 4 == 0) {
+     phase = 3;
+   } else {
+     return;
+   }
+
+   // Temporal layer per phase: TL0, TL2, TL1, TL2.
+   static const int kTemporalIdForPhase[4] = { 0, 2, 1, 2 };
+   layer_id->temporal_layer_id = kTemporalIdForPhase[phase];
+   if (!known_layer) return;
+
+   // Single-reference prediction from LAST only.
+   ref_frame_config->reference[0] = 1;
+
+   // In phases 0-2 LAST reads the layer's first slot and all other references
+   // are parked on the second slot. In phase 3 (second TL2) the roles swap, so
+   // LAST reads the slot that the TL1 frame refreshed.
+   const bool last_on_second = (phase == 3);
+   const int last_slot = last_on_second ? second_slot : first_slot;
+   const int other_slot = last_on_second ? first_slot : second_slot;
+   for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+     ref_frame_config->ref_idx[ref] = other_slot;
+   }
+   ref_frame_config->ref_idx[0] = last_slot;
+
+   // Only TL0 (updates the first slot) and TL1 (updates the second slot)
+   // refresh a buffer; both TL2 phases leave every slot untouched.
+   if (phase == 0) {
+     ref_frame_config->refresh[first_slot] = 1;
+   } else if (phase == 2) {
+     ref_frame_config->refresh[second_slot] = 1;
+   }
+ }
+
+ // 3 spatial and 3 temporal layer.
+ // Overlap in the buffer slot updates: the slots 3 and 4 updated by
+ // first TL2 are reused for update in TL1 superframe.
+ static void ref_config_3SL3TL(aom_svc_ref_frame_config_t *ref_frame_config,  // out: ref_idx[], refresh[], reference[3] (and reference[0] cleared on key frames for SL > 0)
+ aom_svc_layer_id_t *layer_id, int is_key_frame,  // layer_id: spatial_layer_id is read, temporal_layer_id is written
+ int superframe_cnt) {  // superframe_cnt: selects one of the four phases of the temporal pattern
+ // Note: refresh[] and reference[] are not cleared here, and reference[0] is
+ // never set to 1 here; the caller (set_layer_pattern) initializes them.
+ if (superframe_cnt % 4 == 0) {
+ // Base temporal layer.
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST.
+ // Set all buffer_idx to 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 0.
+ // Update slot 1 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 1.
+ // Update slot 2 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->refresh[2] = 1;
+ }
+ } else if ((superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and Update slot 4.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->ref_idx[1] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[0] = 2;
+ }
+ } else if ((superframe_cnt - 2) % 4 == 0) {
+ // Middle temporal enhancement layer.
+ layer_id->temporal_layer_id = 1;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 3 and update slot 3.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST3 (ref_idx[2]) to slot 4 and update slot 4.
+ // NOTE(review): the first TL2 phase uses LAST2 (ref_idx[1]) for the same purpose - confirm index 2 is intended here.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->ref_idx[2] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // Set LAST3 (ref_idx[2]) to slot 5 and update slot 5.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->ref_idx[2] = 5;
+ ref_frame_config->refresh[5] = 1;
+ }
+ } else if ((superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ layer_id->temporal_layer_id = 2;
+ if (layer_id->spatial_layer_id == 0) {
+ // Set LAST to slot 3 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 3;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[3] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 4,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 4;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->ref_idx[1] = 4;
+ ref_frame_config->refresh[4] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 5,
+ // GOLDEN to slot 4. No update.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 5;
+ ref_frame_config->ref_idx[3] = 4;
+ }
+ }
+ if (layer_id->spatial_layer_id > 0) {
+ // Always reference GOLDEN (inter-layer prediction).
+ ref_frame_config->reference[3] = 1;
+ if (is_key_frame && layer_id->spatial_layer_id > 0) {  // the spatial_layer_id test is redundant: the enclosing if already guarantees it
+ // On superframes whose base is key: remove LAST since GOLDEN
+ // is used as reference.
+ ref_frame_config->reference[0] = 0;
+ }
+ }
+ }
+
+ // Layer pattern configuration.
+ virtual int set_layer_pattern(  // Fills layer_id and ref_frame_config for one layer frame; returns layer_flags (always 0 here).
+ int frame_cnt, aom_svc_layer_id_t *layer_id,  // frame_cnt drives the temporal pattern; layer_id is written (spatial id always, temporal id per mode)
+ aom_svc_ref_frame_config_t *ref_frame_config,  // out: ref_idx[], reference[] and refresh[] are reset and then filled
+ aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,  // use_comp_pred[0..2] is written only when comp_pred is non-zero
+ int multi_ref, int comp_pred, int is_key_frame,  // multi_ref enables the extra GOLDEN/ALTREF handling in the modes that check it
+ int dynamic_enable_disable_mode, int rps_mode, int rps_recovery_frame,  // rps_* are only used in the 1TL/1SL case
+ int simulcast_mode) {  // selects ref_config_simulcast3SL3TL instead of ref_config_3SL3TL for 3SL/3TL
+ int lag_index = 0;
+ int base_count = frame_cnt >> 2;  // recomputed as frame_cnt >> 1 in the 2-temporal-layer case
+ layer_id->spatial_layer_id = spatial_layer;
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ ref_frame_config->ref_idx[i] = i;
+ ref_frame_config->reference[i] = 0;
+ }
+ for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+ if (comp_pred) {
+ ref_frame_comp_pred->use_comp_pred[0] = 1; // GOLDEN_LAST
+ ref_frame_comp_pred->use_comp_pred[1] = 1; // LAST2_LAST
+ ref_frame_comp_pred->use_comp_pred[2] = 1; // ALTREF_LAST
+ }
+ // Set layer_flags to 0 when using ref_frame_config->reference.
+ int layer_flags = 0;
+ // Always reference LAST.
+ ref_frame_config->reference[0] = 1;
+ if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) {  // single layer: refresh slot 0 every frame
+ ref_frame_config->refresh[0] = 1;
+ if (rps_mode)
+ ref_config_rps(ref_frame_config, frame_cnt, rps_recovery_frame);
+ }
+ if (number_temporal_layers_ == 2 && number_spatial_layers_ == 1) {  // plain if, not else-if; it cannot be true together with the 1TL/1SL case above
+ // 2-temporal layer.
+ // 1 3 5
+ // 0 2 4
+ // Keep golden fixed at slot 3.
+ base_count = frame_cnt >> 1;
+ ref_frame_config->ref_idx[3] = 3;
+ // Cyclically refresh slots 5, 6, 7, for lag alt ref.
+ lag_index = 5;
+ if (base_count > 0) {
+ lag_index = 5 + (base_count % 3);
+ if (frame_cnt % 2 != 0) lag_index = 5 + ((base_count + 1) % 3);
+ }
+ // Set the altref slot to lag_index.
+ ref_frame_config->ref_idx[6] = lag_index;
+ if (frame_cnt % 2 == 0) {
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, reference LAST.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[0] = 1;
+ // Refresh lag_index slot, needed for lagging golden.
+ ref_frame_config->refresh[lag_index] = 1;
+ // Refresh GOLDEN every 32 base layer frames.
+ if (base_count % 32 == 0) ref_frame_config->refresh[3] = 1;
+ } else {
+ layer_id->temporal_layer_id = 1;
+ // No updates on layer 1, reference LAST (TL0).
+ ref_frame_config->reference[0] = 1;
+ }
+ // Always reference golden and altref on TL0.
+ if (layer_id->temporal_layer_id == 0) {
+ ref_frame_config->reference[3] = 1;
+ ref_frame_config->reference[6] = 1;
+ }
+ } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 1) {
+ // 3-layer:
+ // 1 3 5 7
+ // 2 6
+ // 0 4 8
+ if (multi_ref) {
+ // Keep golden fixed at slot 3.
+ ref_frame_config->ref_idx[3] = 3;
+ // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
+ lag_index = 4 + (base_count % 4);
+ // Set the altref slot to lag_index.
+ ref_frame_config->ref_idx[6] = lag_index;
+ }
+ if (frame_cnt % 4 == 0) {
+ // Base layer.
+ layer_id->temporal_layer_id = 0;
+ // Update LAST on layer 0, reference LAST and GF.
+ ref_frame_config->refresh[0] = 1;
+ ref_frame_config->reference[3] = 1;
+ if (multi_ref) {
+ // Refresh GOLDEN every 10 base layer frames.
+ if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1;
+ // Refresh lag_index slot, needed for lagging altref.
+ ref_frame_config->refresh[lag_index] = 1;
+ }
+ } else if ((frame_cnt - 1) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // First top layer: no updates, only reference LAST (TL0).
+ } else if ((frame_cnt - 2) % 4 == 0) {
+ layer_id->temporal_layer_id = 1;
+ // Middle layer (TL1): update LAST2, only reference LAST (TL0).
+ ref_frame_config->refresh[1] = 1;
+ } else if ((frame_cnt - 3) % 4 == 0) {
+ layer_id->temporal_layer_id = 2;
+ // Second top layer: no updates, only reference LAST.
+ // Set buffer idx for LAST to slot 1, since that was the slot
+ // updated in previous frame. So LAST is TL1 frame.
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->ref_idx[1] = 0;
+ }
+ if (multi_ref) {
+ // Every frame can reference GOLDEN AND ALTREF.
+ ref_frame_config->reference[3] = 1;
+ ref_frame_config->reference[6] = 1;
+ }
+ } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) {
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST. Keep LAST and GOLDEN in slots 0 and 3.
+ ref_frame_config->ref_idx[0] = 0;
+ ref_frame_config->ref_idx[3] = 3;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 3
+ // and GOLDEN to slot 0. Update slot 3 (LAST).
+ ref_frame_config->ref_idx[0] = 3;
+ ref_frame_config->ref_idx[3] = 0;
+ ref_frame_config->refresh[3] = 1;
+ }
+ // Reference GOLDEN.
+ if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+ } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 3) {
+ // 3 spatial layers, 1 temporal.
+ // Note for this case, we set the buffer idx for all references to be
+ // either LAST or GOLDEN, which are always valid references, since decoder
+ // will check if any of the 7 references is valid scale in
+ // valid_ref_frame_size().
+ layer_id->temporal_layer_id = 0;
+ if (layer_id->spatial_layer_id == 0) {
+ // Reference LAST, update LAST. Set all other buffer_idx to 0.
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->refresh[0] = 1;
+ } else if (layer_id->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1
+ // and GOLDEN (and all other refs) to slot 0.
+ // Update slot 1 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0;
+ ref_frame_config->ref_idx[0] = 1;
+ ref_frame_config->refresh[1] = 1;
+ } else if (layer_id->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2
+ // and GOLDEN (and all other refs) to slot 1.
+ // Update slot 2 (LAST).
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1;
+ ref_frame_config->ref_idx[0] = 2;
+ ref_frame_config->refresh[2] = 1;
+ if (multi_ref) {  // extra ALTREF reference kept in slot 7, refreshed when base_count is a multiple of 10
+ ref_frame_config->ref_idx[6] = 7;
+ ref_frame_config->reference[6] = 1;
+ if (base_count % 10 == 0) ref_frame_config->refresh[7] = 1;
+ }
+ }
+ // Reference GOLDEN.
+ if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1;
+ } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) {
+ if (simulcast_mode) {
+ ref_config_simulcast3SL3TL(ref_frame_config, layer_id, is_key_frame,
+ superframe_cnt_);
+ } else {
+ ref_config_3SL3TL(ref_frame_config, layer_id, is_key_frame,
+ superframe_cnt_);
+ // Allow for top spatial layer to use additional temporal reference.
+ // Additional reference is only updated on base temporal layer, every
+ // 10 TL0 frames here.
+ if (multi_ref && layer_id->spatial_layer_id == 2) {
+ ref_frame_config->ref_idx[6] = 7;
+ if (!is_key_frame) ref_frame_config->reference[6] = 1;
+ if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0)
+ ref_frame_config->refresh[7] = 1;
+ }
+ }
+ }
+ // If the top spatial layer is first-time encoded in mid-sequence
+ // (i.e., dynamic_enable_disable_mode = 1), then don't predict from LAST,
+ // since it will have been last updated on first key frame (SL0) and so
+ // be different resolution from SL2.
+ if (dynamic_enable_disable_mode == 1 &&
+ layer_id->spatial_layer_id == number_spatial_layers_ - 1)
+ ref_frame_config->reference[0] = 0;
+ return layer_flags;  // never modified after its initialization, so this is always 0
+ }
+
+ virtual void initialize_svc(int number_temporal_layers,
+                             int number_spatial_layers,
+                             aom_svc_params *svc_params) {
+   // Populates svc_params for the requested layer counts: per-layer quantizer
+   // range and target bitrate, temporal framerate factors, and spatial
+   // scaling factors. Only 2 or 3 layers get specific values in either
+   // dimension; any other count gets the single-layer defaults.
+   svc_params->number_spatial_layers = number_spatial_layers;
+   svc_params->number_temporal_layers = number_temporal_layers;
+
+   const int total_layers = number_temporal_layers * number_spatial_layers;
+   for (int layer = 0; layer < total_layers; ++layer) {
+     svc_params->max_quantizers[layer] = 60;
+     svc_params->min_quantizers[layer] = 2;
+     svc_params->layer_target_bitrate[layer] = target_layer_bitrate_[layer];
+   }
+
+   // Framerate factor per temporal layer: the base layer gets the largest
+   // factor and the top layer gets 1.
+   switch (number_temporal_layers) {
+     case 2:
+       svc_params->framerate_factor[0] = 2;
+       svc_params->framerate_factor[1] = 1;
+       break;
+     case 3:
+       svc_params->framerate_factor[0] = 4;
+       svc_params->framerate_factor[1] = 2;
+       svc_params->framerate_factor[2] = 1;
+       break;
+     default: svc_params->framerate_factor[0] = 1; break;
+   }
+
+   // Spatial scaling: the top layer is 1/1 and each lower layer halves the
+   // scale (1/2, then 1/4).
+   if (number_spatial_layers == 2 || number_spatial_layers == 3) {
+     for (int sl = 0; sl < number_spatial_layers; ++sl) {
+       svc_params->scaling_factor_num[sl] = 1;
+       svc_params->scaling_factor_den[sl] = 1 << (number_spatial_layers - 1 - sl);
+     }
+   } else {
+     svc_params->scaling_factor_num[0] = 1;
+     svc_params->scaling_factor_den[0] = 1;
+   }
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLTest() {
+   // 3 temporal layers: checks per-layer datarate against the targets and
+   // the number of encoder/decoder mismatch frames.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 1;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+   const int kTargetBitrates[2] = { 200, 550 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   number_temporal_layers_ = 3;
+   // Layer targets are 50%, 70% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.60)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.60)
+         << " The datarate for the file is greater than target by too much!";
+   }
+   // Top temporal layers are non_reference, so exclude them from the
+   // mismatch count: loopfilter/cdef is not applied for these on the
+   // encoder side, but is always applied on the decoder.
+   // Hence 150 = #frames(300) - #TL2_frames(150).
+   EXPECT_EQ(static_cast<int>(GetMismatchFrames()), 150);
+ }
+
+ virtual void SetFrameQpSVC3TL1SLTest() {
+   // 3 temporal layers with user_define_frame_qp_ enabled over 300 frames.
+   // This method only requires the encode loop to finish without a fatal
+   // failure.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 1;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   total_frame_ = 300;
+   user_define_frame_qp_ = 1;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+   const int kTargetBitrates[2] = { 200, 550 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   number_temporal_layers_ = 3;
+   // Layer targets are 50%, 70% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ virtual void SetFrameQpSVC3TL3SLTest() {
+   // 3 spatial x 3 temporal layers with user_define_frame_qp_ enabled over
+   // 300 frames. This method only requires the encode loop to finish without
+   // a fatal failure.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   total_frame_ = 300;
+   user_define_frame_qp_ = 1;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+   const int kTargetBitrates[2] = { 600, 1200 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   number_temporal_layers_ = 3;
+   number_spatial_layers_ = 3;
+
+   // SL0/SL1/SL2 receive 1/8, 3/8 and 4/8 of the total target bitrate.
+   // Within each spatial layer the three temporal layers get 50%, 70% and
+   // 100% of that spatial layer's bitrate.
+   const int kSpatialShareEighths[3] = { 1, 3, 4 };
+   for (int sl = 0; sl < 3; ++sl) {
+     const int sl_bitrate =
+         kSpatialShareEighths[sl] * cfg_.rc_target_bitrate / 8;
+     const int first_layer = 3 * sl;
+     target_layer_bitrate_[first_layer] = 50 * sl_bitrate / 100;
+     target_layer_bitrate_[first_layer + 1] = 70 * sl_bitrate / 100;
+     target_layer_bitrate_[first_layer + 2] = sl_bitrate;
+   }
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLScreenTest() {
+   // 3 temporal layers, 1 spatial layer, screen mode, on screendata.y4m.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 60);
+
+   const int kTargetBitrates[2] = { 800, 1200 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   screen_mode_ = 1;
+   number_temporal_layers_ = 3;
+   number_spatial_layers_ = 1;
+   // Layer targets are 50%, 70% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.50)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.7)
+         << " The datarate for the file is greater than target by too much!";
+   }
+   // Top temporal layers are non_reference, so exclude them from the
+   // mismatch count: loopfilter/cdef is not applied for these on the
+   // encoder side, but is always applied on the decoder.
+   // Hence 30 = #frames(60) - #TL2_frames(30).
+   // LE is used for screen content since the loopfilter level can become
+   // very small or zero, and then the frame is not a mismatch.
+   EXPECT_LE(static_cast<int>(GetMismatchFrames()), 30);
+ }
+
+ virtual void BasicRateTargetingSVC2TL1SLScreenDropFrameTest() {
+   // 2 temporal layers, 1 spatial layer, screen mode, with a drop-frame
+   // threshold of 30 and a lowered max quantizer (52).
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 30;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 52;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+
+   const int kTargetBitrates[2] = { 60, 100 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   screen_mode_ = 1;
+   number_temporal_layers_ = 2;
+   number_spatial_layers_ = 1;
+   // Layer targets are 60% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 60 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.75)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.5)
+         << " The datarate for the file is greater than target by too much!";
+   }
+   // Top temporal layers are non_reference, so exclude them from the
+   // mismatch count: loopfilter/cdef is not applied for these on the
+   // encoder side, but is always applied on the decoder.
+   // With two temporal layers the top layer is TL1, so the bound is
+   // 150 = #frames(300) - #TL1_frames(150).
+   // LE is used for screen content since the loopfilter level can become
+   // very small or zero, and then the frame is not a mismatch.
+   EXPECT_LE(static_cast<int>(GetMismatchFrames()), 150);
+ }
+
+ virtual void BasicRateTargetingSVC1TL3SLScreenTest() {
+   // 1 temporal layer, 3 spatial layers, screen mode; expects no
+   // encoder/decoder mismatch frames.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+   const int kTargetBitrates[2] = { 800, 1200 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   screen_mode_ = 1;
+   number_temporal_layers_ = 1;
+   number_spatial_layers_ = 3;
+   // Layer targets are 30%, 60% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 30 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = 60 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.50)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.5)
+         << " The datarate for the file is greater than target by too much!";
+   }
+   EXPECT_EQ(static_cast<int>(GetMismatchFrames()), 0);
+ }
+
+ virtual void BasicRateTargetingSVC1TL1SLScreenScCutsMotionTest() {
+   // Single layer (1 temporal, 1 spatial) in screen mode; expects no
+   // encoder/decoder mismatch frames.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+
+   const int kTargetBitrates[2] = { 200, 500 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   screen_mode_ = 1;
+   number_temporal_layers_ = 1;
+   number_spatial_layers_ = 1;
+   target_layer_bitrate_[0] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.40)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.7)
+         << " The datarate for the file is greater than target by too much!";
+   }
+   EXPECT_EQ(static_cast<int>(GetMismatchFrames()), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLResizeTest() {
+   // 3 temporal layers with dynamic resize at a low target bitrate: checks
+   // the per-layer datarate and that the coded size only ever shrinks, at
+   // least once.
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.rc_resize_mode = RESIZE_DYNAMIC;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                        1, 0, 400);
+   cfg_.g_w = 640;
+   cfg_.g_h = 480;
+   const int kTargetBitrates[2] = { 80, 90 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   number_temporal_layers_ = 3;
+   // Layer targets are 50%, 70% and 100% of the total target bitrate.
+   target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+   target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.80)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.60)
+         << " The datarate for the file is greater than target by too much!";
+   }
+
+   // Walk the recorded frame sizes, starting from the configured size.
+   // Every size change must reduce both width and height.
+   unsigned int prev_w = cfg_.g_w;
+   unsigned int prev_h = cfg_.g_h;
+   int num_downsizes = 0;
+   for (const FrameInfo &frame : frame_info_list_) {
+     const bool same_size = (frame.w == prev_w) && (frame.h == prev_h);
+     if (same_size) continue;
+     // Verify that resize down occurs.
+     ASSERT_LT(frame.w, prev_w);
+     ASSERT_LT(frame.h, prev_h);
+     prev_w = frame.w;
+     prev_h = frame.h;
+     ++num_downsizes;
+   }
+   // Must be at least one resize down.
+   ASSERT_GE(num_downsizes, 1);
+ }
+
+ virtual void BasicRateTargetingSVC1TL2SLTest() {
+   // 1 temporal layer, 2 spatial layers: checks the per-layer datarate only
+   // (no mismatch-frame check in this method).
+   cfg_.rc_end_usage = AOM_CBR;
+   cfg_.g_lag_in_frames = 0;
+   cfg_.g_error_resilient = 0;
+   cfg_.rc_dropframe_thresh = 0;
+   cfg_.rc_min_quantizer = 0;
+   cfg_.rc_max_quantizer = 63;
+   cfg_.rc_buf_initial_sz = 500;
+   cfg_.rc_buf_optimal_sz = 500;
+   cfg_.rc_buf_sz = 1000;
+
+   ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                        288, 30, 1, 0, 300);
+   const int kTargetBitrates[2] = { 300, 600 };
+   cfg_.rc_target_bitrate = kTargetBitrates[GET_PARAM(4)];
+   ResetModel();
+   number_temporal_layers_ = 1;
+   number_spatial_layers_ = 2;
+   // Both spatial layers are given the same target: 2/4 of the total.
+   for (int sl = 0; sl < 2; ++sl) {
+     target_layer_bitrate_[sl] = 2 * cfg_.rc_target_bitrate / 4;
+   }
+   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+   const int num_layers = number_temporal_layers_ * number_spatial_layers_;
+   for (int layer = 0; layer < num_layers; ++layer) {
+     ASSERT_GE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 0.80)
+         << " The datarate for the file is lower than target by too much!";
+     ASSERT_LE(effective_datarate_tl[layer],
+               target_layer_bitrate_[layer] * 1.60)
+         << " The datarate for the file is greater than target by too much!";
+   }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq() {  // 3TL x 3SL run with intra_only_ set; only SL0 is decoded, starting mid-sequence.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ intra_only_ = 1;
+ frame_sync_ = 20;
+ frame_to_start_decoding_ = frame_sync_;  // Decoding begins at the same frame index as frame_sync_ (20).
+ layer_to_decode_ = 0;
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Only check the datarate on SL0 (indices 0..2): this is the layer that is
+ // decoded starting at frame_to_start_decoding_. Allowed range is [0.50, 1.60].
+ for (int i = 0; i < number_temporal_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Only the base spatial layer is decoded and there are no non-reference
+ // frames on S0, so #mismatch must be 0.
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll() {  // 3TL x 3SL run with intra_only_ set; all layers are decoded from frame 0.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ intra_only_ = 1;
+ frame_sync_ = 20;
+ frame_to_start_decoding_ = 0;
+ layer_to_decode_ = 3;  // NOTE(review): 3 is past the SL indices 0..2; presumably means "decode all" - confirm against the decode-selection logic.
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.585, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.585)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // All 3 spatial layers are decoded, starting at frame 0, and there are
+ // 300/2 = 150 non-reference frames, so the expected mismatch count is 150.
+ EXPECT_EQ((int)GetMismatchFrames(), 150);
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLSimulcast() {  // 3TL x 3SL run with simulcast_mode_ set; only SL2 is decoded, from frame 150 on.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.kf_max_dist = 150;  // Both keyframe distance limits are set to 150.
+ cfg_.kf_min_dist = 150;
+ int num_frames = 300;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, num_frames);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ simulcast_mode_ = 1;
+ frame_to_start_decoding_ = cfg_.kf_max_dist;  // Start decoding at frame index 150 (the kf_max_dist value).
+ layer_to_decode_ = 2; // SL2
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Only the SL2 layer is decoded, so only its entries are checked ([0.6, 1.7]).
+ for (int tl = 0; tl < number_temporal_layers_; tl++) {
+ int i = layer_to_decode_ * number_temporal_layers_ + tl;  // Flat index of (SL = layer_to_decode_, TL = tl).
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.6)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.7)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Only the top spatial layer (SL2) is decoded, starting at frame 150
+ // (frame_to_start_decoding_), so there are (300 - 150) / 2 = 75
+ // non-reference frames, and the expected mismatch count is 75.
+ int num_mismatch = (num_frames - frame_to_start_decoding_) / 2;
+ EXPECT_EQ((int)GetMismatchFrames(), num_mismatch);
+ }
+
+ virtual void BasicRateTargetingSVC1TL2SLIntraOnlyTest() {  // 1 temporal layer, 2 spatial layers, with intra_only_ set.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 300, 600 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ intra_only_ = 1;
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 2;
+ target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4;  // Both layer entries get half of the total target.
+ target_layer_bitrate_[1] = 2 * cfg_.rc_target_bitrate / 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.80, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC1TL3SLTest() {  // Rate-targeting check with 1 temporal layer and 3 spatial layers.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 3;
+ target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;  // The three layer entries get 1/8, 3/8 and 4/8 of the total.
+ target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.80, 1.38] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC1TL3SLMultiRefTest() {  // 1 temporal layer, 3 spatial layers, with multi_ref_ set.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ multi_ref_ = 1;
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 3;
+ target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;  // The three layer entries get 1/8, 3/8 and 4/8 of the total.
+ target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.80, 1.38] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLTest() {  // Rate-targeting check with 3 temporal layers and 3 spatial layers.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.50, 1.38] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.50)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLHDTest() {  // 3TL x 3SL rate-targeting check on the niklas_1280_720_30.y4m clip.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingFixedModeSVC3TL3SLHDTest() {  // 3TL x 3SL check on the 1280x720 clip with use_fixed_mode_svc_ set.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ use_fixed_mode_svc_ = 1;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLMultiThreadSpeedPerLayerTest() {  // 3TL x 3SL check with g_threads = 2 and set_speed_per_layer_ enabled.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_threads = 2;
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, 400);
+ cfg_.g_w = 640;  // Same 640 and 480 values as passed to the video source above.
+ cfg_.g_h = 480;
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ set_speed_per_layer_ = true;
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLHDMultiThread2Test() {  // 3TL x 3SL check on the 1280x720 clip with g_threads = 2.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_threads = 2;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLHDMultiThread4Test() {  // 3TL x 3SL check on the 1280x720 clip with g_threads = 4.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_threads = 4;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLHDMultiRefTest() {  // 3TL x 3SL check on the 1280x720 clip with multi_ref_ set.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ multi_ref_ = 1;
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.45] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLKfTest() {  // 3TL x 3SL rate-targeting check with the keyframe distance limits set to 100.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.kf_mode = AOM_KF_AUTO;
+ cfg_.kf_min_dist = cfg_.kf_max_dist = 100;  // Both keyframe distance limits are set to 100.
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.55, 1.4] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.55)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargeting444SVC3TL3SLTest() {  // 3TL x 3SL rate-targeting check on the rush_hour_444.y4m clip.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = 1;  // NOTE(review): presumably the profile required by the 4:4:4 source below - confirm.
+
+ ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+ const int bitrate_array[2] = { 600, 1200 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ // SL0: 1/8 of the total bitrate; TL targets are 50%, 70% and 100% of it.
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1: 3/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2: 4/8 of the total bitrate, split across TLs the same way.
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.70, 1.38] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest() {  // 3 temporal layers with multi_ref_ set; all TL1/TL2 frames are dropped before decoding.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ // error_resilient can be set to off/0, since for SVC the context update
+ // is done per-layer.
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ multi_ref_ = 1;
+ // Drop TL1 and TL2: #frames(300) - #TL0 = 300 - 75 = 225 dropped frames.
+ drop_frames_ = 300 - 300 / 4;
+ int n = 0;
+ for (int i = 0; i < 300; i++) {  // Record every frame index that is not a multiple of 4.
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ }
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;  // TL targets are 50%, 70% and 100% of the total.
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.60, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);  // The decoded count must fall short of 300 by exactly drop_frames_.
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLDropAllEnhTest() {  // 3 temporal layers; all TL1/TL2 frames are dropped before decoding.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ // error_resilient can be set to off/0, since for SVC the context update
+ // is done per-layer.
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ // Drop TL1 and TL2: #frames(300) - #TL0 = 300 - 75 = 225 dropped frames.
+ drop_frames_ = 300 - 300 / 4;
+ int n = 0;
+ for (int i = 0; i < 300; i++) {  // Record every frame index that is not a multiple of 4.
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ }
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;  // TL targets are 50%, 70% and 100% of the total.
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.60, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);  // The decoded count must fall short of 300 by exactly drop_frames_.
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLDropTL2EnhTest() {  // 3 temporal layers; only the TL2 frames are dropped before decoding.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ // error_resilient for the sequence can be off/0, since the dropped frames
+ // (TL2) are non-reference frames.
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ // Drop TL2: #frames(300) - (#TL0 + #TL1) = 300 - 150 = 150 dropped frames.
+ drop_frames_ = 300 - 300 / 2;
+ int n = 0;
+ for (int i = 0; i < 300; i++) {  // Record every odd frame index.
+ if (i % 2 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ }
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;  // TL targets are 50%, 70% and 100% of the total.
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.60, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);  // The decoded count must fall short of 300 by exactly drop_frames_.
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest() {  // 3 temporal layers, set_frame_level_er_ enabled; all TL1/TL2 frames are dropped.
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];  // Test parameter 4 picks one of the two total targets.
+ ResetModel();
+ // Set error_resilience at the frame level, with a codec control:
+ // on/1 for enhancement-layer frames and off/0 for base-layer frames.
+ set_frame_level_er_ = 1;
+
+ // Drop TL1 and TL2: #frames(300) - #TL0 = 300 - 75 = 225 dropped frames.
+ drop_frames_ = 300 - 300 / 4;
+ int n = 0;
+ for (int i = 0; i < 300; i++) {  // Record every frame index that is not a multiple of 4.
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ }
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;  // TL targets are 50%, 70% and 100% of the total.
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {  // Each layer must land within [0.60, 1.60] x its target.
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);  // The decoded count must fall short of 300 by exactly drop_frames_.
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLDropSetEnhFrameERTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ // Set error_resilience at frame level, with codec control,
+ // on/1 for enhancement layers and off/0 for base layer frames.
+ set_frame_level_er_ = 1;
+
+ // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+ // frame 101, and end at second TL2 at frame 199. Frame 200 is TL0,
+ // so we can continue decoding without mismatch (since LAST is the
+ // only reference and error_resilient = 1 on TL1/TL2 frames).
+ int n = 0;
+ int num_nonref = 300 / 2;
+ for (int i = 101; i < 200; i++) {
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ if (i % 2 != 0) num_nonref -= 1;
+ }
+ }
+ drop_frames_ = n;
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Check the decoded frame count and that mismatch frames == num_nonref.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+ }
+
+ virtual void BasicRateTargetingSVC2TL1SLDropSetEnhER0Test() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+
+ // Set error_resilience off.
+ cfg_.g_error_resilient = 0;
+
+ // Drop TL1: for part of sequence. Start at first TL1 at
+ // frame 101, and end at frame 199. Frame 200 is TL0,
+ // so we can continue decoding without mismatch (since LAST is the
+ // only reference).
+ int n = 0;
+ int num_nonref = 300 / 2;
+ for (int i = 101; i < 200; i++) {
+ if (i % 2 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ if (i % 2 != 0) num_nonref -= 1;  // Always true inside this branch.
+ }
+ }
+ drop_frames_ = n;
+ number_temporal_layers_ = 2;
+ target_layer_bitrate_[0] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[1] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Check the decoded frame count and that mismatch frames == num_nonref.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLDropSetEnhER0Test() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+
+ // Set error_resilience off.
+ cfg_.g_error_resilient = 0;
+
+ // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+ // frame 101, and end at second TL2 at frame 199. Frame 200 is TL0,
+ // so we can continue decoding without mismatch (since LAST is the
+ // only reference).
+ int n = 0;
+ int num_nonref = 300 / 2;
+ for (int i = 101; i < 200; i++) {
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ if (i % 2 != 0) num_nonref -= 1;
+ }
+ }
+ drop_frames_ = n;
+ number_temporal_layers_ = 3;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Check the decoded frame count and that mismatch frames == num_nonref.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+ }
+
+ virtual void BasicRateTargetingSVC3TL3SLDropSetEnhER0Test() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 200, 550 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ // Set error_resilience off.
+ cfg_.g_error_resilient = 0;
+ // Drop TL1 and TL2: for part of sequence. Start at first TL2 at
+ // frame 101, and end at second TL2 at frame 199. Frame 200 is TL0,
+ // so we can continue decoding without mismatch (since LAST is the
+ // only reference).
+ // Drop here means drop whole superframe.
+ int n = 0;
+ int num_nonref = 300 / 2;
+ for (int i = 101; i < 200; i++) {
+ if (i % 4 != 0) {
+ drop_frames_list_[n] = i;
+ n++;
+ if (i % 2 != 0) num_nonref -= 1;
+ }
+ }
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 3;
+ multi_ref_ = 1;
+ drop_frames_ = n * number_spatial_layers_;
+ // SL0
+ const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+ target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+ target_layer_bitrate_[2] = bitrate_sl0;
+ // SL1
+ const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+ target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+ target_layer_bitrate_[5] = bitrate_sl1;
+ // SL2
+ const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+ target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+ target_layer_bitrate_[8] = bitrate_sl2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Check the decoded frame count and that mismatch frames == num_nonref.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 * number_spatial_layers_ - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), num_nonref);
+ }
+
+ virtual void BasicRateTargetingSVC3TL1SLMultiRefCompoundTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, 400);
+ cfg_.g_w = 640;
+ cfg_.g_h = 480;
+ const int bitrate_array[2] = { 400, 800 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ multi_ref_ = 1;
+ comp_pred_ = 1;
+ number_temporal_layers_ = 3;
+ number_spatial_layers_ = 1;
+ target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+ target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC1TL3SLDynEnablTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+ 1, 0, 400);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 3;
+ target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+ dynamic_enable_disable_mode_ = 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // No need to check RC on top layer which is disabled part of the time.
+ for (int i = 0; i < number_spatial_layers_ - 1; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingSVC1TL3SLDynDisEnablTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 500, 1000 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ number_temporal_layers_ = 1;
+ number_spatial_layers_ = 3;
+ target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8;
+ target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8;
+ dynamic_enable_disable_mode_ = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // No need to check RC on top layer which is disabled part of the time.
+ for (int i = 0; i < number_spatial_layers_ - 1; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ }
+
+ virtual void BasicRateTargetingRPS1TL1SLDropFramesTest() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 300);
+ const int bitrate_array[2] = { 100, 300 };
+ cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+ ResetModel();
+ rps_mode_ = 1;
+ rps_recovery_frame_ = 100;
+ cfg_.g_error_resilient = 0;
+ // Drop x frames before the recovery frame (where the reference
+ // is switched to an older reference, golden or altref).
+ // GOLDEN is 8 frames behind (for the rps pattern example) so we can't
+ // drop more than 8 frames before the recovery frame, so choose x = 7.
+ int n = 0;
+ for (int i = rps_recovery_frame_ - 7; i < rps_recovery_frame_; i++) {
+ drop_frames_list_[n] = i;
+ n++;
+ }
+ drop_frames_ = n;
+ number_spatial_layers_ = 1;
+ number_temporal_layers_ = 1;
+ target_layer_bitrate_[0] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+ ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.60)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+ << " The datarate for the file is greater than target by too much!";
+ }
+ // Test that no mismatches have been found.
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(300 - GetDecodedFrames(), drop_frames_);
+ EXPECT_EQ((int)GetMismatchFrames(), 0);
+ }
+
+ int layer_frame_cnt_;
+ int superframe_cnt_;
+ int number_temporal_layers_;
+ int number_spatial_layers_;
+ // Target bitrate per layer, indexed as sl * number_temporal_layers_ + tl.
+ int target_layer_bitrate_[AOM_MAX_LAYERS];
+ aom_svc_params_t svc_params_;
+ aom_svc_ref_frame_config_t ref_frame_config_;
+ aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred_;
+ aom_svc_layer_id_t layer_id_;
+ double effective_datarate_tl[AOM_MAX_LAYERS];
+ unsigned int drop_frames_;
+ unsigned int drop_frames_list_[1000];
+ unsigned int mismatch_nframes_;
+ unsigned int decoded_nframes_;
+ double mismatch_psnr_;
+ int set_frame_level_er_;
+ int multi_ref_;
+ int use_fixed_mode_svc_;
+ int comp_pred_;
+ int dynamic_enable_disable_mode_;
+ int intra_only_;
+ unsigned int frame_to_start_decoding_;
+ unsigned int layer_to_decode_;
+ unsigned int frame_sync_;
+ unsigned int current_video_frame_;
+ int screen_mode_;
+ int rps_mode_;
+ int rps_recovery_frame_;
+ int simulcast_mode_;
+
+ int user_define_frame_qp_;
+ int frame_qp_;
+ int total_frame_;
+ bool set_speed_per_layer_;
+ libaom_test::ACMRandom rnd_;
+};
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SL) {
+ BasicRateTargetingSVC3TL1SLTest();
+}
+
+TEST_P(DatarateTestSVC, SetFrameQpSVC3TL1SL) { SetFrameQpSVC3TL1SLTest(); }
+
+TEST_P(DatarateTestSVC, SetFrameQpSVC3TL3SL) { SetFrameQpSVC3TL3SLTest(); }
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial
+// for screen mode.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLScreen) {
+ BasicRateTargetingSVC3TL1SLScreenTest();
+}
+
+// Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial
+// for screen mode, with frame dropper on at low bitrates
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLScreenDropFrame) {
+ BasicRateTargetingSVC2TL1SLScreenDropFrameTest();
+}
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal
+// for screen mode.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLScreen) {
+ BasicRateTargetingSVC1TL3SLScreenTest();
+}
+
+// Check basic rate targeting for CBR, for 1 temporal layer, 1 spatial
+// for screen mode, with source with many scene cuts and motion.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL1SLScreenScCutsMotion) {
+ BasicRateTargetingSVC1TL1SLScreenScCutsMotionTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial,
+// with dynamic resize on. Encode at very low bitrate and check that
+// there is at least one resize (down) event.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLResize) {
+ BasicRateTargetingSVC3TL1SLResizeTest();
+}
+
+// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) {
+ BasicRateTargetingSVC1TL2SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 3 temporal,
+// with Intra-only frame inserted in the stream. Verify that we can start
+// decoding the SL0 stream at the intra_only frame in mid-sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq) {
+ BasicRateTargetingSVC3TL3SLIntraStartDecodeBaseMidSeq();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 3 temporal,
+// with Intra-only frame inserted in the stream. Verify that we can
+// decode all frames and layers with no mismatch.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll) {
+ BasicRateTargetingSVC3TL3SLIntraMidSeqDecodeAll();
+}
+
+// Check simulcast mode for 3 spatial layers, 3 temporal.
+// A key frame is inserted on base SL0 in mid-stream; verify that the
+// top spatial layer (SL2) can be decoded, starting with an Intra-only frame.
+// Verify that we can decode all frames for SL2 with no mismatch.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLSimulcast) {
+ BasicRateTargetingSVC3TL3SLSimulcast();
+}
+
+// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal,
+// with Intra-only frame inserted in the stream.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SLIntraOnly) {
+ BasicRateTargetingSVC1TL2SLIntraOnlyTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SL) {
+ BasicRateTargetingSVC1TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLMultiRef) {
+ BasicRateTargetingSVC1TL3SLMultiRefTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) {
+ BasicRateTargetingSVC3TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) {
+ BasicRateTargetingSVC3TL3SLHDTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for fixed mode SVC.
+TEST_P(DatarateTestSVC, BasicRateTargetingFixedModeSVC3TL3SLHD) {
+ BasicRateTargetingFixedModeSVC3TL3SLHDTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 2 threads, 2 tile_columns, row-mt enabled, and different speed
+// per layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLMultiThreadSpeedPerLayer) {
+ BasicRateTargetingSVC3TL3SLMultiThreadSpeedPerLayerTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 2 threads, 2 tile_columns, row-mt enabled.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread2) {
+ BasicRateTargetingSVC3TL3SLHDMultiThread2Test();
+}
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 4 threads, 4 tile_columns, row-mt enabled.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiThread4) {
+ BasicRateTargetingSVC3TL3SLHDMultiThread4Test();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// with additional temporal reference for top spatial layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMultiRef) {
+ BasicRateTargetingSVC3TL3SLHDMultiRefTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for auto key frame mode with short key frame period.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) {
+ BasicRateTargetingSVC3TL3SLKfTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for 4:4:4 input.
+#if defined(CONFIG_MAX_DECODE_PROFILE) && CONFIG_MAX_DECODE_PROFILE < 1
+TEST_P(DatarateTestSVC, DISABLED_BasicRateTargeting444SVC3TL3SL) {
+#else
+TEST_P(DatarateTestSVC, BasicRateTargeting444SVC3TL3SL) {
+#endif
+ BasicRateTargeting444SVC3TL3SLTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0. This test used the pattern with multiple
+// references (last, golden, and altref), updated on base layer.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefDropAllEnh) {
+ BasicRateTargetingSVC3TL1SLMultiRefDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Check that the base
+// layer (TL0) can still be decodeable (with no mismatch) with the
+// error_resilient flag set to 0.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnh) {
+ BasicRateTargetingSVC3TL1SLDropAllEnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of the TL2 enhancement layer, which are non-reference
+// (droppable) frames. For the base layer (TL0) and TL1 to still be decodable
+// (with no mismatch), the error_resilient_flag may be off (set to 0),
+// since TL2 are non-reference frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropTL2Enh) {
+ BasicRateTargetingSVC3TL1SLDropTL2EnhTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping of all enhancement layers (TL 1 and TL2). Test that the
+// error_resilient flag can be set at frame level, with on/1 on
+// enhancement layers and off/0 on base layer.
+// This allows for successful decoding after dropping enhancement layer frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropAllEnhFrameER) {
+ BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping a set of enhancement layers (TL1 and TL2) in middle of sequence.
+// Test that the error_resilient flag can be set at frame level, with on/1 on
+// enhancement layers and off/0 on base layer.
+// This allows for successful decoding after dropping a set of enhancement
+// layer frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropSetEnhFrameER) {
+ BasicRateTargetingSVC3TL1SLDropSetEnhFrameERTest();
+}
+
+// Check basic rate targeting for CBR, for 2 temporal layers, 1 spatial layer,
+// with dropping a set of enhancement layers (TL1) in middle of sequence.
+// Test that the error_resilient flag can be 0/off for all frames.
+// This allows for successful decoding after dropping a set of enhancement
+// layer frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC2TL1SLDropSetEnhER0) {
+ BasicRateTargetingSVC2TL1SLDropSetEnhER0Test();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with dropping a set of enhancement layers (TL1 and TL2) in middle of sequence.
+// Test that the error_resilient flag can be 0/off for all frames.
+// This allows for successful decoding after dropping a set of enhancement
+// layer frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLDropSetEnhER0) {
+ BasicRateTargetingSVC3TL1SLDropSetEnhER0Test();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 3 spatial layers,
+// with dropping a set of enhancement layers (superframe TL1 and TL2) in middle
+// of sequence. Test that the error_resilient flag can be 0/off for all frames.
+// This allows for successful decoding after dropping a set of enhancement
+// layer frames in the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLDropSetEnhER0) {
+ BasicRateTargetingSVC3TL3SLDropSetEnhER0Test();
+}
+
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with compound prediction on, for pattern with two additional references
+// (golden and altref), both updated on base TL0 frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefCompound) {
+ BasicRateTargetingSVC3TL1SLMultiRefCompoundTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with the top spatial layer starting disabled (0 bitrate) and then
+// dynamically enabled after x frames with nonzero bitrate.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLDynEnabl) {
+ BasicRateTargetingSVC1TL3SLDynEnablTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal,
+// with the top spatial layer dynamically disabled and enabled during the
+// middle of the sequence.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SLDynDisEnabl) {
+ BasicRateTargetingSVC1TL3SLDynDisEnablTest();
+}
+
+// Check basic rate targeting and encoder/decoder mismatch, for RPS
+// with 1 layer. A number of consecutive frames are lost midway in
+// sequence, and encoder resorts to a longer term reference to recover
+// and continue decoding successfully.
+TEST_P(DatarateTestSVC, BasicRateTargetingRPS1TL1SLDropFrames) {
+ BasicRateTargetingRPS1TL1SLDropFramesTest();
+}
+
+TEST(SvcParams, BitrateOverflow) {
+ uint8_t buf[6] = { 0 };
+ aom_image_t img;
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf));
+
+ aom_codec_iface_t *const iface = aom_codec_av1_cx();
+ EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME),
+ AOM_CODEC_OK);
+ cfg.g_w = 1;
+ cfg.g_h = 1;
+ ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK);
+
+ aom_svc_params_t svc_params = {};
+ svc_params.framerate_factor[0] = 1;
+ svc_params.framerate_factor[1] = 2;
+ svc_params.number_spatial_layers = 1;
+ svc_params.number_temporal_layers = 2;
+ svc_params.layer_target_bitrate[0] = INT_MAX;
+ svc_params.layer_target_bitrate[1] = INT_MAX;
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SVC_PARAMS, &svc_params),
+ AOM_CODEC_OK);
+ EXPECT_EQ(
+ aom_codec_encode(&enc, &img, /*pts=*/0, /*duration=*/1, /*flags=*/0),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_encode(&enc, /*img=*/nullptr, /*pts=*/0, /*duration=*/0,
+ /*flags=*/0),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(7, 12), ::testing::Values(0, 3),
+ ::testing::Values(0, 1));
+
+} // namespace
+} // namespace datarate_test
diff --git a/third_party/aom/test/svc_encoder_rtc.sh b/third_party/aom/test/svc_encoder_rtc.sh
new file mode 100644
index 0000000000..735166d6f6
--- /dev/null
+++ b/third_party/aom/test/svc_encoder_rtc.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+## Copyright (c) 2023, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+svc_encoder_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+common_flags="-k 10000"
+common_flags="${common_flags} --max-q=63"
+common_flags="${common_flags} --error-resilient=0"
+
+# Runs svc_encoder_rtc with 1 spatial layer and 3 temporal layers.
+svc_encoder_s1_t3() {
+ local encoder="${LIBAOM_BIN_PATH}/svc_encoder_rtc${AOM_TEST_EXE_SUFFIX}"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/svc_encoder_rtc"
+
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${common_flags}" \
+ "--width=${YUV_RAW_INPUT_WIDTH}" \
+ "--height=${YUV_RAW_INPUT_HEIGHT}" \
+ "-lm 2" \
+ "--speed=8" \
+ "--target-bitrate=400" \
+ "--bitrates=220,300,400" \
+ "--spatial-layers=1" \
+ "--temporal-layers=3" \
+ "--timebase=1/30" \
+ "${YUV_RAW_INPUT}" \
+ "-o ${output_file}" \
+ ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1
+}
+
+# Runs svc_encoder_rtc with 1 spatial layer and 2 temporal layers at
+# speed 10.
+svc_encoder_s1_t2() {
+ local encoder="${LIBAOM_BIN_PATH}/svc_encoder_rtc${AOM_TEST_EXE_SUFFIX}"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/svc_encoder_rtc"
+
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${common_flags}" \
+ "--width=${YUV_RAW_INPUT_WIDTH}" \
+ "--height=${YUV_RAW_INPUT_HEIGHT}" \
+ "-lm 1" \
+ "--speed=10" \
+ "--target-bitrate=400" \
+ "--bitrates=220,400" \
+ "--spatial-layers=1" \
+ "--temporal-layers=2" \
+ "--timebase=1/30" \
+ "${YUV_RAW_INPUT}" \
+ "-o ${output_file}" \
+ ${devnull} || return 1
+
+ [ -e "${output_file}" ] || return 1
+}
+
+if [ "$(av1_encode_available)" = "yes" ]; then
+ svc_encoder_rtc_tests="svc_encoder_s1_t3
+ svc_encoder_s1_t2"
+ run_tests svc_encoder_verify_environment "${svc_encoder_rtc_tests}"
+fi
diff --git a/third_party/aom/test/temporal_filter_test.cc b/third_party/aom/test/temporal_filter_test.cc
new file mode 100644
index 0000000000..85f68b817e
--- /dev/null
+++ b/third_party/aom/test/temporal_filter_test.cc
@@ -0,0 +1,788 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+// Chroma layouts exercised by the tests. The enumerator value is also used as
+// the index into color_fmt_str, so the two lists must stay in the same order.
+typedef enum {
+ I400, // Monochrome
+ I420, // 4:2:0
+ I422, // 4:2:2
+ I444, // 4:4:4
+} ColorFormat;
+// Printable names for ColorFormat values, used in test output.
+static const char *color_fmt_str[] = { "I400", "I420", "I422", "I444" };
+// Signature shared by the reference and the optimized temporal filter
+// functions compared by TemporalFilterTest.
+typedef void (*TemporalFilterFunc)(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_level, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+// Pair of (reference function, function under test).
+typedef libaom_test::FuncParam<TemporalFilterFunc> TemporalFilterFuncParam;
+
+// Test parameter: the function pair plus the tf_wgt_calc_lvl value to pass.
+typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
+
+// Fixture that runs a reference and an optimized temporal filter function on
+// the same 8-bit input and compares their accumulator/count outputs.
+// Parameterized on (function pair, tf_wgt_calc_lvl).
+class TemporalFilterTest
+ : public ::testing::TestWithParam<TemporalFilterWithParam> {
+ public:
+ ~TemporalFilterTest() override = default;
+ // Reads the test parameters, reseeds the RNG deterministically and allocates
+ // the two input buffers (MAX_MB_PLANE * BH * BW bytes each).
+ void SetUp() override {
+ params_ = GET_PARAM(0);
+ tf_wgt_calc_lvl_ = GET_PARAM(1);
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src1_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+ src2_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(8, sizeof(uint8_t) * MAX_MB_PLANE * BH * BW));
+
+ ASSERT_NE(src1_, nullptr);
+ ASSERT_NE(src2_, nullptr);
+ }
+
+ // Releases the buffers allocated in SetUp().
+ void TearDown() override {
+ aom_free(src1_);
+ aom_free(src2_);
+ }
+ // Runs both functions for one color format. With run_times > 1 it only
+ // times them; otherwise it compares their outputs.
+ void RunTest(int isRandom, int run_times, ColorFormat color_fmt);
+
+ // Fills src1_ and src2_ with random bytes, plane by plane. Planes other than
+ // plane 0 use the subsampled width/height/stride. The write pointers are not
+ // reset between planes, so each plane starts where the previous one ended.
+ void GenRandomData(int width, int height, int stride, int stride2,
+ int num_planes, int subsampling_x, int subsampling_y) {
+ uint8_t *src1p = src1_;
+ uint8_t *src2p = src2_;
+ for (int plane = 0; plane < num_planes; plane++) {
+ int plane_w = plane ? width >> subsampling_x : width;
+ int plane_h = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+ for (int ii = 0; ii < plane_h; ii++) {
+ for (int jj = 0; jj < plane_w; jj++) {
+ src1p[jj] = rnd_.Rand8();
+ src2p[jj] = rnd_.Rand8();
+ }
+ src1p += plane_stride;
+ src2p += plane_stride2;
+ }
+ }
+ }
+
+ // Same traversal as GenRandomData(), but writes the constant val into src1_
+ // and its 8-bit complement (255 - val) into src2_.
+ void GenExtremeData(int width, int height, int stride, int stride2,
+ int num_planes, int subsampling_x, int subsampling_y,
+ uint8_t val) {
+ uint8_t *src1p = src1_;
+ uint8_t *src2p = src2_;
+ for (int plane = 0; plane < num_planes; plane++) {
+ int plane_w = plane ? width >> subsampling_x : width;
+ int plane_h = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+ for (int ii = 0; ii < plane_h; ii++) {
+ for (int jj = 0; jj < plane_w; jj++) {
+ src1p[jj] = val;
+ src2p[jj] = (255 - val);
+ }
+ src1p += plane_stride;
+ src2p += plane_stride2;
+ }
+ }
+ }
+
+ protected:
+ TemporalFilterFuncParam params_; // Reference/test function pair.
+ int32_t tf_wgt_calc_lvl_; // Forwarded unchanged to both functions.
+ uint8_t *src1_; // Copied into the frame buffer in RunTest().
+ uint8_t *src2_; // Passed as the pred argument in RunTest().
+ ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(TemporalFilterTest);
+
+// Runs the reference and the optimized filter on a 32x32 block for the given
+// color format, three times (k = 0..2) with freshly generated input.
+// isRandom   non-zero: random input; zero: constant extreme-value input.
+// run_times  > 1: time both functions and print the ratio, no comparison;
+//            otherwise compare the outputs of the single call to each.
+void TemporalFilterTest::RunTest(int isRandom, int run_times,
+ ColorFormat color_fmt) {
+ aom_usec_timer ref_timer, test_timer;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ static_assert(block_size == BLOCK_32X32, "");
+ const int width = 32;
+ const int height = 32;
+ // Defaults describe I444: three planes, no chroma subsampling.
+ int num_planes = MAX_MB_PLANE;
+ int subsampling_x = 0;
+ int subsampling_y = 0;
+ if (color_fmt == I420) {
+ subsampling_x = 1;
+ subsampling_y = 1;
+ } else if (color_fmt == I422) {
+ subsampling_x = 1;
+ subsampling_y = 0;
+ } else if (color_fmt == I400) {
+ num_planes = 1;
+ }
+ for (int k = 0; k < 3; k++) {
+ const int stride = width;
+ const int stride2 = width;
+ if (isRandom) {
+ GenRandomData(width, height, stride, stride2, num_planes, subsampling_x,
+ subsampling_y);
+ } else {
+ // k == 0 uses the maximum 8-bit value for src1_; k == 1 and k == 2 both
+ // use 0.
+ const int msb = 8; // Up to 8 bit input
+ const int limit = (1 << msb) - 1;
+ if (k == 0) {
+ GenExtremeData(width, height, stride, stride2, num_planes,
+ subsampling_x, subsampling_y, limit);
+ } else {
+ GenExtremeData(width, height, stride, stride2, num_planes,
+ subsampling_x, subsampling_y, 0);
+ }
+ }
+ double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+ 2.1002103677063437 };
+ // Separate, zero-initialized output buffers for the reference (_ref) and
+ // the optimized (_mod) function.
+ DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+ memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+ memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+ DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+ memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+ memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+ static_assert(width == 32 && height == 32, "");
+ const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+ const int subblock_mses[4] = { 15, 16, 17, 18 };
+ const int q_factor = 12;
+ const int filter_strength = 5;
+ const int mb_row = 0;
+ const int mb_col = 0;
+ // Only the fields assigned below are initialized on the frame buffer.
+ std::unique_ptr<YV12_BUFFER_CONFIG> frame_to_filter(new (std::nothrow)
+ YV12_BUFFER_CONFIG);
+ ASSERT_NE(frame_to_filter, nullptr);
+ frame_to_filter->y_crop_height = 360;
+ frame_to_filter->y_crop_width = 540;
+ frame_to_filter->heights[PLANE_TYPE_Y] = height;
+ frame_to_filter->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+ frame_to_filter->strides[PLANE_TYPE_Y] = stride;
+ frame_to_filter->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+ // The frame to filter is a stack copy of src1_.
+ DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+ frame_to_filter->buffer_alloc = src;
+ frame_to_filter->flags = 0; // Only support low bit-depth test.
+ memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+
+ std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
+ ASSERT_NE(mbd, nullptr);
+ mbd->bd = 8;
+ // Point each plane into the frame buffer and record its subsampling.
+ for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+ int plane_height = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ frame_to_filter->buffers[plane] =
+ frame_to_filter->buffer_alloc + plane * plane_stride * plane_height;
+ mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+ mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+ }
+
+ // One call to each function with identical inputs; src2_ is the pred
+ // argument.
+ params_.ref_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_, src2_,
+ accumulator_ref, count_ref);
+ params_.tst_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_, src2_,
+ accumulator_mod, count_mod);
+
+ if (run_times > 1) {
+ // Speed mode: time run_times calls of each function. The output buffers
+ // are not reset between calls and no comparison is made.
+ aom_usec_timer_start(&ref_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.ref_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_, src2_,
+ accumulator_ref, count_ref);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.tst_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_, src2_,
+ accumulator_mod, count_mod);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+ elapsed_time_c, elapsed_time_simd,
+ (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+ height, color_fmt_str[color_fmt]);
+
+ } else {
+ // Correctness mode: compare the first width * height entries of the
+ // accumulator and count arrays.
+ // NOTE(review): entries past width * height are never compared;
+ // presumably that is where chroma results land - confirm whether
+ // checking only this range is intended.
+ for (int i = 0, l = 0; i < height; i++) {
+ for (int j = 0; j < width; j++, l++) {
+ EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] " << color_fmt_str[color_fmt]
+ << " C accumulator does not match optimized accumulator.";
+ EXPECT_EQ(count_ref[l], count_mod[l])
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] " << color_fmt_str[color_fmt]
+ << " count does not match optimized count.";
+ }
+ }
+ }
+ }
+}
+
+// Random input, single run: outputs are compared for every color format.
+TEST_P(TemporalFilterTest, OperationCheck) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/1, /*run_times=*/1, fmt);
+ }
+}
+
+// Constant extreme-value input, single run, for every color format.
+TEST_P(TemporalFilterTest, ExtremeValues) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/0, /*run_times=*/1, fmt);
+ }
+}
+
+// Timing run on random input (disabled by default), for every color format.
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/1, /*run_times=*/100000, fmt);
+ }
+}
+
+// Each enabled instruction-set build instantiates TemporalFilterTest with the
+// C function as reference and the optimized function as the one under test,
+// for tf_wgt_calc_lvl values 0 and 1.
+#if HAVE_AVX2
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_avx2),
+ Values(0, 1)));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_sse2),
+ Values(0, 1)));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon),
+ Values(0, 1)));
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+ TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+ &av1_apply_temporal_filter_neon_dotprod)
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+ Values(0, 1)));
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_AVX2 || HAVE_NEON
+// Widths and heights for which av1_estimate_noise_from_single_plane() will be
+// tested. The instantiations combine every width with every height.
+const int kWidths[] = { 3840, 1920, 1280, 800, 640, 360, 357 };
+const int kHeights[] = { 2160, 1080, 720, 600, 480, 240, 237 };
+#endif // HAVE_AVX2 || HAVE_NEON
+
+// Signature shared by the reference and optimized noise estimators.
+typedef double (*EstimateNoiseFunc)(const uint8_t *src, int height, int width,
+ int stride, int edge_thresh);
+
+// Test parameter: (reference function, function under test, width, height).
+typedef std::tuple<EstimateNoiseFunc, EstimateNoiseFunc, int, int>
+ EstimateNoiseWithParam;
+
+// Fixture that compares a reference and an optimized single-plane noise
+// estimator on the same random 8-bit plane. Parameterized on
+// (reference function, function under test, width, height).
+class EstimateNoiseTest
+ : public ::testing::TestWithParam<EstimateNoiseWithParam> {
+ public:
+ ~EstimateNoiseTest() override = default;
+ // Reads the test parameters, reseeds the RNG deterministically, allocates a
+ // width_ * height_ byte plane and fills it with random data.
+ void SetUp() override {
+ ref_func = GET_PARAM(0);
+ tst_func = GET_PARAM(1);
+ width_ = GET_PARAM(2);
+ height_ = GET_PARAM(3);
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src1_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(8, sizeof(uint8_t) * width_ * height_));
+ // Check the allocation before GenRandomData() writes through src1_, so a
+ // failed allocation aborts the test instead of dereferencing a null pointer.
+ ASSERT_NE(src1_, nullptr);
+ GenRandomData(width_ * height_);
+ }
+
+ // Releases the plane allocated in SetUp().
+ void TearDown() override { aom_free(src1_); }
+
+ // Calls both functions run_times times on the same plane (stride equals
+ // width) and expects identical return values each time.
+ void RunTest(int run_times) {
+ stride_ = width_;
+
+ for (int i = 0; i < run_times; i++) {
+ double ref_out = ref_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ double tst_out = tst_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ EXPECT_EQ(ref_out, tst_out);
+ }
+ }
+
+ // Times run_times calls of each function and prints the ratio of the
+ // reference time to the test time. Results are not compared.
+ void SpeedTest(int run_times) {
+ stride_ = width_;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ ref_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ tst_func(src1_, height_, width_, stride_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("(%3.2f)\n", time1 / time2);
+ }
+
+ // Fills the first size bytes of src1_ with random values. src1_ must be a
+ // valid allocation of at least size bytes.
+ void GenRandomData(int size) {
+ for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand8();
+ }
+
+ protected:
+ EstimateNoiseFunc ref_func; // Reference implementation.
+ EstimateNoiseFunc tst_func; // Implementation under test.
+ ACMRandom rnd_;
+ uint8_t *src1_; // Input plane, width_ * height_ bytes.
+ int width_;
+ int height_;
+ int stride_; // Set to width_ by RunTest()/SpeedTest().
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EstimateNoiseTest);
+
+// Single comparison of reference and optimized output on random data.
+TEST_P(EstimateNoiseTest, RandomValues) { RunTest(1); }
+
+// Timing run (disabled by default): 2000 calls of each function.
+TEST_P(EstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+// Each enabled build compares the C function against the optimized one for
+// every combination of kWidths and kHeights.
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, EstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_estimate_noise_from_single_plane_avx2),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, EstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_estimate_noise_from_single_plane_neon),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights)));
+#endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// Signature shared by the reference and optimized high bit-depth temporal
+// filter functions. pred is declared as uint8_t *; the tests pass it a
+// CONVERT_TO_BYTEPTR()-wrapped uint16_t buffer.
+typedef void (*HBDTemporalFilterFunc)(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_level, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+// Pair of (reference function, function under test).
+typedef libaom_test::FuncParam<HBDTemporalFilterFunc>
+ HBDTemporalFilterFuncParam;
+
+// Test parameter: the function pair plus the tf_wgt_calc_lvl value to pass.
+typedef std::tuple<HBDTemporalFilterFuncParam, int> HBDTemporalFilterWithParam;
+
+// High bit-depth counterpart of TemporalFilterTest: runs a reference and an
+// optimized filter on the same 16-bit-sample input and compares the outputs.
+// Parameterized on (function pair, tf_wgt_calc_lvl).
+class HBDTemporalFilterTest
+ : public ::testing::TestWithParam<HBDTemporalFilterWithParam> {
+ public:
+ ~HBDTemporalFilterTest() override = default;
+ // Reads the test parameters, reseeds the RNG deterministically and allocates
+ // the two input buffers (MAX_MB_PLANE * BH * BW uint16_t samples each).
+ void SetUp() override {
+ params_ = GET_PARAM(0);
+ tf_wgt_calc_lvl_ = GET_PARAM(1);
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src1_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+ src2_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * MAX_MB_PLANE * BH * BW));
+
+ ASSERT_NE(src1_, nullptr);
+ ASSERT_NE(src2_, nullptr);
+ }
+
+ // Releases the buffers allocated in SetUp().
+ void TearDown() override {
+ aom_free(src1_);
+ aom_free(src2_);
+ }
+ // Runs both functions for one color format at bit depth bd. With
+ // run_times > 1 it only times them; otherwise it compares their outputs.
+ void RunTest(int isRandom, int run_times, int bd, ColorFormat color_fmt);
+
+ // Fills src1_ and src2_ with random samples masked to bd bits, plane by
+ // plane. Planes other than plane 0 use the subsampled width/height/stride.
+ // The write pointers are not reset between planes.
+ void GenRandomData(int width, int height, int stride, int stride2, int bd,
+ int subsampling_x, int subsampling_y, int num_planes) {
+ uint16_t *src1p = src1_;
+ uint16_t *src2p = src2_;
+ for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+ int plane_w = plane ? width >> subsampling_x : width;
+ int plane_h = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+ const uint16_t max_val = (1 << bd) - 1;
+ for (int ii = 0; ii < plane_h; ii++) {
+ for (int jj = 0; jj < plane_w; jj++) {
+ src1p[jj] = rnd_.Rand16() & max_val;
+ src2p[jj] = rnd_.Rand16() & max_val;
+ }
+ src1p += plane_stride;
+ src2p += plane_stride2;
+ }
+ }
+ }
+
+ // Same traversal as GenRandomData(), but writes the constant val into src1_
+ // and its bd-bit complement (max_val - val) into src2_.
+ void GenExtremeData(int width, int height, int stride, int stride2, int bd,
+ int subsampling_x, int subsampling_y, int num_planes,
+ uint16_t val) {
+ uint16_t *src1p = src1_;
+ uint16_t *src2p = src2_;
+ for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+ int plane_w = plane ? width >> subsampling_x : width;
+ int plane_h = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ int plane_stride2 = plane ? stride2 >> subsampling_x : stride2;
+ uint16_t max_val = (1 << bd) - 1;
+ for (int ii = 0; ii < plane_h; ii++) {
+ for (int jj = 0; jj < plane_w; jj++) {
+ src1p[jj] = val;
+ src2p[jj] = (max_val - val);
+ }
+ src1p += plane_stride;
+ src2p += plane_stride2;
+ }
+ }
+ }
+
+ protected:
+ HBDTemporalFilterFuncParam params_; // Reference/test function pair.
+ int tf_wgt_calc_lvl_; // Forwarded unchanged to both functions.
+ uint16_t *src1_; // Copied into the frame buffer in RunTest().
+ uint16_t *src2_; // Passed as the pred argument in RunTest().
+ ACMRandom rnd_;
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDTemporalFilterTest);
+
+// Runs the reference and the optimized high bit-depth filter on a 32x32 block
+// for the given color format, three times (k = 0..2) with fresh input.
+// isRandom   non-zero: random input; zero: constant extreme-value input.
+// run_times  > 1: time both functions and print the ratio, no comparison;
+//            otherwise compare the outputs of the single call to each.
+// BD         bit depth of the generated samples, also stored in mbd->bd.
+void HBDTemporalFilterTest::RunTest(int isRandom, int run_times, int BD,
+ ColorFormat color_fmt) {
+ aom_usec_timer ref_timer, test_timer;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ static_assert(block_size == BLOCK_32X32, "");
+ const int width = 32;
+ const int height = 32;
+ // Defaults describe I444: three planes, no chroma subsampling.
+ int num_planes = MAX_MB_PLANE;
+ int subsampling_x = 0;
+ int subsampling_y = 0;
+ if (color_fmt == I420) {
+ subsampling_x = 1;
+ subsampling_y = 1;
+ } else if (color_fmt == I422) {
+ subsampling_x = 1;
+ subsampling_y = 0;
+ } else if (color_fmt == I400) {
+ num_planes = 1;
+ }
+ for (int k = 0; k < 3; k++) {
+ const int stride = width;
+ const int stride2 = width;
+ if (isRandom) {
+ GenRandomData(width, height, stride, stride2, BD, subsampling_x,
+ subsampling_y, num_planes);
+ } else {
+ // k == 0 uses the maximum BD-bit value for src1_; k == 1 and k == 2 both
+ // use 0.
+ const int msb = BD;
+ const uint16_t limit = (1 << msb) - 1;
+ if (k == 0) {
+ GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+ subsampling_y, num_planes, limit);
+ } else {
+ GenExtremeData(width, height, stride, stride2, BD, subsampling_x,
+ subsampling_y, num_planes, 0);
+ }
+ }
+ double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
+ 2.1002103677063437 };
+ // Separate, zero-initialized output buffers for the reference (_ref) and
+ // the optimized (_mod) function.
+ DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+ memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+ memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+ DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+ memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+ memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+ static_assert(width == 32 && height == 32, "");
+ const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
+ const int subblock_mses[4] = { 15, 16, 17, 18 };
+ const int q_factor = 12;
+ const int filter_strength = 5;
+ const int mb_row = 0;
+ const int mb_col = 0;
+ // Only the fields assigned below are initialized on the frame buffer.
+ std::unique_ptr<YV12_BUFFER_CONFIG> frame_to_filter(new (std::nothrow)
+ YV12_BUFFER_CONFIG);
+ ASSERT_NE(frame_to_filter, nullptr);
+ frame_to_filter->y_crop_height = 360;
+ frame_to_filter->y_crop_width = 540;
+ frame_to_filter->heights[PLANE_TYPE_Y] = height;
+ frame_to_filter->heights[PLANE_TYPE_UV] = height >> subsampling_y;
+ frame_to_filter->strides[PLANE_TYPE_Y] = stride;
+ frame_to_filter->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
+ // The frame to filter is a stack copy of src1_, exposed through
+ // CONVERT_TO_BYTEPTR().
+ DECLARE_ALIGNED(16, uint16_t, src[1024 * 3]);
+ frame_to_filter->buffer_alloc = CONVERT_TO_BYTEPTR(src);
+ frame_to_filter->flags =
+ YV12_FLAG_HIGHBITDEPTH; // High bit-depth test only.
+ memcpy(src, src1_, 1024 * 3 * sizeof(uint16_t));
+
+ std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
+ ASSERT_NE(mbd, nullptr);
+ mbd->bd = BD;
+ // Point each plane into the frame buffer and record its subsampling.
+ for (int plane = AOM_PLANE_Y; plane < num_planes; plane++) {
+ int plane_height = plane ? height >> subsampling_y : height;
+ int plane_stride = plane ? stride >> subsampling_x : stride;
+ frame_to_filter->buffers[plane] =
+ frame_to_filter->buffer_alloc + plane * plane_stride * plane_height;
+ mbd->plane[plane].subsampling_x = plane ? subsampling_x : 0;
+ mbd->plane[plane].subsampling_y = plane ? subsampling_y : 0;
+ }
+
+ // One call to each function with identical inputs; src2_ is the pred
+ // argument.
+ params_.ref_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
+ params_.tst_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
+
+ if (run_times > 1) {
+ // Speed mode: time run_times calls of each function. The output buffers
+ // are not reset between calls and no comparison is made.
+ aom_usec_timer_start(&ref_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.ref_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_ref, count_ref);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.tst_func(frame_to_filter.get(), mbd.get(), block_size, mb_row,
+ mb_col, num_planes, sigma, subblock_mvs, subblock_mses,
+ q_factor, filter_strength, tf_wgt_calc_lvl_,
+ CONVERT_TO_BYTEPTR(src2_), accumulator_mod, count_mod);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%f\t width=%d\t height=%d\t color_format=%s\n",
+ elapsed_time_c, elapsed_time_simd,
+ (float)((float)elapsed_time_c / (float)elapsed_time_simd), width,
+ height, color_fmt_str[color_fmt]);
+
+ } else {
+ // Correctness mode: compare the first width * height entries of the
+ // accumulator and count arrays.
+ // NOTE(review): entries past width * height are never compared;
+ // presumably that is where chroma results land - confirm whether
+ // checking only this range is intended.
+ for (int i = 0, l = 0; i < height; i++) {
+ for (int j = 0; j < width; j++, l++) {
+ EXPECT_EQ(accumulator_ref[l], accumulator_mod[l])
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] " << color_fmt_str[color_fmt]
+ << " C accumulator does not match optimized accumulator.";
+ EXPECT_EQ(count_ref[l], count_mod[l])
+ << "Error:" << k << " SSE Sum Test [" << width << "x" << height
+ << "] " << color_fmt_str[color_fmt]
+ << " C count does not match optimized count.";
+ }
+ }
+ }
+ }
+}
+
+// Random 10-bit input, single run: outputs are compared for every color
+// format.
+TEST_P(HBDTemporalFilterTest, OperationCheck) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/1, /*run_times=*/1, /*bd=*/10, fmt);
+ }
+}
+
+// Constant extreme-value 10-bit input, single run, for every color format.
+TEST_P(HBDTemporalFilterTest, ExtremeValues) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/0, /*run_times=*/1, /*bd=*/10, fmt);
+ }
+}
+
+// Timing run on random 10-bit input (disabled by default), for every color
+// format.
+TEST_P(HBDTemporalFilterTest, DISABLED_Speed) {
+ static const ColorFormat kFormats[] = { I400, I420, I422, I444 };
+ for (const ColorFormat fmt : kFormats) {
+ RunTest(/*isRandom=*/1, /*run_times=*/100000, /*bd=*/10, fmt);
+ }
+}
+// Each enabled instruction-set build instantiates HBDTemporalFilterTest with
+// the C function as reference and the optimized function as the one under
+// test, for tf_wgt_calc_lvl values 0 and 1.
+#if HAVE_SSE2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
+ HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+ &av1_highbd_apply_temporal_filter_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
+ Combine(ValuesIn(HBDtemporal_filter_test_sse2),
+ Values(0, 1)));
+#endif // HAVE_SSE2
+#if HAVE_AVX2
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
+ HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+ &av1_highbd_apply_temporal_filter_avx2)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
+ Combine(ValuesIn(HBDtemporal_filter_test_avx2),
+ Values(0, 1)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+ HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+ &av1_highbd_apply_temporal_filter_neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+ Combine(ValuesIn(HBDtemporal_filter_test_neon),
+ Values(0, 1)));
+#endif // HAVE_NEON
+
+// Signature shared by the reference and optimized high bit-depth noise
+// estimators.
+using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
+ int width, int stride, int bit_depth,
+ int edge_thresh);
+
+// Test parameter: (reference function, function under test, width, height,
+// bit depth).
+using HBDEstimateNoiseWithParam =
+ std::tuple<HBDEstimateNoiseFunc, HBDEstimateNoiseFunc, int, int, int>;
+
+// Fixture that compares a reference and an optimized high bit-depth
+// single-plane noise estimator on the same random plane. Parameterized on
+// (reference function, function under test, width, height, bit depth).
+class HBDEstimateNoiseTest
+ : public ::testing::TestWithParam<HBDEstimateNoiseWithParam> {
+ public:
+ // Reads all test parameters and seeds the RNG deterministically.
+ HBDEstimateNoiseTest()
+ : ref_func_(GET_PARAM(0)), tst_func_(GET_PARAM(1)),
+ rnd_(libaom_test::ACMRandom::DeterministicSeed()), width_(GET_PARAM(2)),
+ height_(GET_PARAM(3)), bitdepth_(GET_PARAM(4)) {}
+ ~HBDEstimateNoiseTest() override = default;
+ // Allocates a width_ * height_ plane of uint16_t samples and, once the
+ // allocation is verified, fills it with random data.
+ void SetUp() override {
+ src1_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * width_ * height_));
+ ASSERT_NE(src1_, nullptr);
+ GenRandomData(width_ * height_);
+ }
+
+ // Releases the plane allocated in SetUp().
+ void TearDown() override { aom_free(src1_); }
+
+ // Calls both functions once on the same plane (stride equals width) and
+ // expects identical return values.
+ void RunTest() {
+ stride_ = width_;
+
+ double ref_out = ref_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ double tst_out = tst_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+
+ EXPECT_EQ(ref_out, tst_out);
+ }
+
+ // Times run_times calls of each function and prints both times and the
+ // ratio of the reference time to the test time. Results are not compared.
+ void SpeedTest(int run_times) {
+ stride_ = width_;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ ref_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; i++) {
+ tst_func_(src1_, height_, width_, stride_, bitdepth_,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+ printf("%d %dx%d :%7.2f/%7.2f (%3.2f)\n", bitdepth_, width_, height_, time1,
+ time2, time1 / time2);
+ }
+
+ // Fills the first size samples of src1_ from rnd_.Rand12().
+ // NOTE(review): Rand12() is used regardless of bitdepth_; presumably this
+ // yields values wider than 8 or 10 bits for those instantiations - confirm
+ // that out-of-range samples are intended.
+ void GenRandomData(int size) {
+ for (int ii = 0; ii < size; ii++) src1_[ii] = rnd_.Rand12();
+ }
+
+ private:
+ HBDEstimateNoiseFunc ref_func_; // Reference implementation.
+ HBDEstimateNoiseFunc tst_func_; // Implementation under test.
+ ACMRandom rnd_;
+ uint16_t *src1_; // Input plane, width_ * height_ samples.
+ int width_;
+ int height_;
+ int stride_; // Set to width_ by RunTest()/SpeedTest().
+ int bitdepth_; // Passed to both functions as bit_depth.
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(HBDEstimateNoiseTest);
+
+// Single comparison of reference and optimized output on random data.
+TEST_P(HBDEstimateNoiseTest, RandomValues) { RunTest(); }
+
+// Timing run (disabled by default): 2000 calls of each function.
+TEST_P(HBDEstimateNoiseTest, DISABLED_Speed) { SpeedTest(2000); }
+
+// Compares the C function against the NEON one for every combination of
+// kWidths, kHeights and bit depths 8, 10 and 12.
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, HBDEstimateNoiseTest,
+ ::testing::Combine(
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_c),
+ ::testing::Values(av1_highbd_estimate_noise_from_single_plane_neon),
+ ::testing::ValuesIn(kWidths), ::testing::ValuesIn(kHeights),
+ ::testing::ValuesIn({ 8, 10, 12 })));
+#endif // HAVE_NEON
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace
+#endif
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
new file mode 100644
index 0000000000..4b4a96d444
--- /dev/null
+++ b/third_party/aom/test/test-data.sha1
@@ -0,0 +1,575 @@
+a0edab4ab4054127474074d967a33616ccdccc76 *hantro_collage_w176h144.yuv
+d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
+26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res
+09aa07e5325b3bb5462182eb30b8ecc914630740 *invalid-chromium-906381.ivf
+09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-chromium-906381.ivf.res
+f7c83c14aa35b928ba8b70f3eaa3b92070be4519 *invalid-google-142530197-1.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197-1.ivf.res
+703c05720d5d67053bcee44987635cd78af2f971 *invalid-google-142530197.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197.ivf.res
+fa06784f23751d8c37be94160fb821e855199af4 *invalid-oss-fuzz-10061.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10061.ivf.res
+c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf
+88e18e61bd2b7457b4c71ebefbdff0029c41cc04 *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res
+91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
+b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf
+f4ce175af1d871ed1603c8936f6b78e968f93c85 *invalid-oss-fuzz-10389.ivf.res.4
+11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res
+cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
+758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res
+88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf
+64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2
+1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res.3
+0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res
+7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf
+15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res
+1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf
+64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2
+b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf
+3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2
+cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res
+5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-16437.ivf.res.2
+e821070cea8eb687be102a1a118e0341c2e9df69 *invalid-oss-fuzz-24706.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-24706.ivf.res
+c0c32af28c5c6672d14e76d197894723e8a07b07 *invalid-oss-fuzz-33030.ivf
+fb38337e7d6203618fcfce4bc2dc17d5a4f00638 *invalid-oss-fuzz-33030.ivf.res
+ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf
+67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res
+c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2
+f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
+a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9720.ivf.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
+c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv
+614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m
+82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv
+b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m
+81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv
+b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m
+eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv
+89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv
+33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv
+8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv
+9ec21aa2c4a8a9d46d5403ea20c93b0ff5ad74a1 *rand_noise_w1280h720.yuv
+70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv
+8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv
+edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv
+9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv
+e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m
+9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
+5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
+36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv
+c542890ac929749000f7b3883174f2202070d834 *pixel_capture_w320h240.yuv
+c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf
+26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5
+a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf
+b3d24124d81f1fbb26f5eb0036accb54f3ec69b2 *av1-1-b8-00-quantizer-01.ivf.md5
+3466327cb842a91d69839b11ef930a74f086f4c6 *av1-1-b8-00-quantizer-02.ivf
+c111dce946100efeaad34203080eee1d55464df6 *av1-1-b8-00-quantizer-02.ivf.md5
+d3f1f32de5e2c0c19a58bb8ef096108388c6a820 *av1-1-b8-00-quantizer-03.ivf
+6265321b31130545b4454982ca93e412a56845b8 *av1-1-b8-00-quantizer-03.ivf.md5
+f37c393ebe73266a5ec8508a2ca33c586ff28e64 *av1-1-b8-00-quantizer-04.ivf
+c6e979da71aecc593c0abb40135dd304152b00dd *av1-1-b8-00-quantizer-04.ivf.md5
+ac9c5e93cb19942a9be259d0567ec96c54dcdc7c *av1-1-b8-00-quantizer-05.ivf
+49e35a7399568a0e4f015ce323d5a45ea780ca87 *av1-1-b8-00-quantizer-05.ivf.md5
+461142b1b50ae74c6b698d23f5ed3b764eadfb89 *av1-1-b8-00-quantizer-06.ivf
+6477ff260624e0f76c94ac872d1e7d5576af4177 *av1-1-b8-00-quantizer-06.ivf.md5
+7f8113cd13d8faaa06fdbaaa50dc328daf037e6d *av1-1-b8-00-quantizer-07.ivf
+b26795c6cb408487c20737977cd6b77311772bf7 *av1-1-b8-00-quantizer-07.ivf.md5
+4218f7945a172e1fe4f9e77ec35085a394eda9f4 *av1-1-b8-00-quantizer-08.ivf
+ea5d7d501e9a69d805251e4871515d28468d8676 *av1-1-b8-00-quantizer-08.ivf.md5
+837f3bcadfe56cf302db2ebaf9a990446fb35801 *av1-1-b8-00-quantizer-09.ivf
+eede995cdac5fd01a411da2e74e86e8394138be1 *av1-1-b8-00-quantizer-09.ivf.md5
+adc229b3780a4968c18ded1bcbe72e3f04643833 *av1-1-b8-00-quantizer-10.ivf
+0799b7e54e54ee97bf0e8aad2b75509ce59c7097 *av1-1-b8-00-quantizer-10.ivf.md5
+44bac8247160a8d9a0ab19f890fc89cc9298de1d *av1-1-b8-00-quantizer-11.ivf
+cc6b2bf167e114599b242aba574e8c6f1fa2f047 *av1-1-b8-00-quantizer-11.ivf.md5
+ebb3af7dfc15567188bcb617021cdc95ebc560e3 *av1-1-b8-00-quantizer-12.ivf
+b716ae29d56cd0c052dbfa1b5dcf850cd0fa8ca7 *av1-1-b8-00-quantizer-12.ivf.md5
+46159641f981a26fb9c374a5ca41e44f0ce0a9f0 *av1-1-b8-00-quantizer-13.ivf
+c6db1b8b4a74f83e4a0647e053cea0fc00f6abab *av1-1-b8-00-quantizer-13.ivf.md5
+fadc909d18eb640760fbb075f922fb050e715470 *av1-1-b8-00-quantizer-14.ivf
+e36bb6b23273633ba3ef7d28160a7258840a1476 *av1-1-b8-00-quantizer-14.ivf.md5
+8befbd9cc1601dcd36ec6911613855f68e6fd40e *av1-1-b8-00-quantizer-15.ivf
+cfc2334b76fb5e7aa9d8607e89d37cbc7716d62e *av1-1-b8-00-quantizer-15.ivf.md5
+ca42e00ae27c6b7f684fe3d2a787d50d2827cb3f *av1-1-b8-00-quantizer-16.ivf
+f11278218a7c3c73cfaab2332bab55f06cedcc81 *av1-1-b8-00-quantizer-16.ivf.md5
+05270d365bdc067f9446eda3029a6f41571a5229 *av1-1-b8-00-quantizer-17.ivf
+fb6482f35e7ad04bf231ea1806226760abcb3c26 *av1-1-b8-00-quantizer-17.ivf.md5
+617bc72037165efbff478d5a0d342b3c20ffcafd *av1-1-b8-00-quantizer-18.ivf
+1ff68d5424f91322123fe0d58f436b8e49cfa99d *av1-1-b8-00-quantizer-18.ivf.md5
+821c3b1ae6054c7a91b2f64428806e57f1157ca6 *av1-1-b8-00-quantizer-19.ivf
+f2fd118e786697553d6987f786660a2bb9f00680 *av1-1-b8-00-quantizer-19.ivf.md5
+48bcf17c27d9a4eb73632a68c09f42eff9f9af99 *av1-1-b8-00-quantizer-20.ivf
+64d55e4c858414bc2837c9c3e2d5fb6d2208c4b8 *av1-1-b8-00-quantizer-20.ivf.md5
+d61ecdd4f0950bc5c8bae1270b22e711bdd22763 *av1-1-b8-00-quantizer-21.ivf
+9d447938596096704fd5f4d41bcdf6fabf9cdfb9 *av1-1-b8-00-quantizer-21.ivf.md5
+59b4b65d8e56ccdd1bddff26a03e991a63409334 *av1-1-b8-00-quantizer-22.ivf
+aa1be0c7c7622d612af85f9bf96a212f6fe5ab56 *av1-1-b8-00-quantizer-22.ivf.md5
+95ed96988eb9916cad956db9b929718769de49f1 *av1-1-b8-00-quantizer-23.ivf
+596b8a3aea468996d609624367465c412751f52b *av1-1-b8-00-quantizer-23.ivf.md5
+e6c2dc4ce725003152797b3d7b34d7eb34da50c8 *av1-1-b8-00-quantizer-24.ivf
+1cd3d7e8b3813a9e5591b94eaeb72d471780e64a *av1-1-b8-00-quantizer-24.ivf.md5
+6734e353008824e523939d1a18daa3f2ab2d8ec6 *av1-1-b8-00-quantizer-25.ivf
+c45cf440a05802c1f9e29472175ed397d130d988 *av1-1-b8-00-quantizer-25.ivf.md5
+3372b1c69fb39811156adcea4f6dba802c0918c2 *av1-1-b8-00-quantizer-26.ivf
+b1751d55bb3fb788751fe28fb7434bee153bda68 *av1-1-b8-00-quantizer-26.ivf.md5
+e7ddb19a6e2a798d6a4e7dfdfc10b4df777b60e3 *av1-1-b8-00-quantizer-27.ivf
+0e19d6b79cd71de69d03e0455349568af979b170 *av1-1-b8-00-quantizer-27.ivf.md5
+7f1c90a35543d6b673e353b3702baf3aa1caeaa7 *av1-1-b8-00-quantizer-28.ivf
+d9a4f9cb88103249a05a7e6aa616bf0c16bf9c95 *av1-1-b8-00-quantizer-28.ivf.md5
+28d741b923011c7fcc50a7318256a638d3110a07 *av1-1-b8-00-quantizer-29.ivf
+c68cacf2b2ff2694945a99ad836dcf1ee3961c09 *av1-1-b8-00-quantizer-29.ivf.md5
+9a5d9ea4bc76dd40d04e92f33f45e9c2e120e85d *av1-1-b8-00-quantizer-30.ivf
+eb02bb8c16c4c0368ddff83e05e516e84ec9eaf3 *av1-1-b8-00-quantizer-30.ivf.md5
+20193c372f44f522e094c2c05fc7e4aaa0717fa8 *av1-1-b8-00-quantizer-31.ivf
+a4c1a4ac332f4911f0d5abbd826ebecfb8432d6c *av1-1-b8-00-quantizer-31.ivf.md5
+9617bbd691f093d259dbc8a642a57a153c1fc00c *av1-1-b8-00-quantizer-32.ivf
+73d60a348454b126ea6368ea604954bc23f210ae *av1-1-b8-00-quantizer-32.ivf.md5
+d9aea9d72a686c59b60584d827f60ca1ee8eee26 *av1-1-b8-00-quantizer-33.ivf
+fbf64de376a63d2d3051da83b0e4e56579b55c0a *av1-1-b8-00-quantizer-33.ivf.md5
+791aaf067f125e5cf4a247cf06a2e29ab071ec90 *av1-1-b8-00-quantizer-34.ivf
+8e2e6efe4c069e54844da19125c4280b95990c69 *av1-1-b8-00-quantizer-34.ivf.md5
+01ba67bba5cbf7c94c65da8f4c9bd6e7db24cf3a *av1-1-b8-00-quantizer-35.ivf
+0c5e60704a4a6bd27e67b6fd72ca7d2cf7fff50f *av1-1-b8-00-quantizer-35.ivf.md5
+3e255b4a320c9522dcec539fef770b6920b9a102 *av1-1-b8-00-quantizer-36.ivf
+1241aab865fd7b4bae73736cbeec1866ea9c90ec *av1-1-b8-00-quantizer-36.ivf.md5
+44fa6fca109747d8f43f6c6aa46d782e5d476d54 *av1-1-b8-00-quantizer-37.ivf
+947f0f887c5ac9149cf85e8114a709d6f410fc32 *av1-1-b8-00-quantizer-37.ivf.md5
+8319ac1ddd6ce3279da5780175dff7a3a5fa1054 *av1-1-b8-00-quantizer-38.ivf
+5f571b7f88678eab9e54f162cc9898f14e437770 *av1-1-b8-00-quantizer-38.ivf.md5
+5975e7056e17608593a8c40619b68e6576d373d9 *av1-1-b8-00-quantizer-39.ivf
+7c870192d6eb70ce5367147a3d2c6a52e11f7bec *av1-1-b8-00-quantizer-39.ivf.md5
+47da942f1e455f1422fc65f06dd57304541d16ac *av1-1-b8-00-quantizer-40.ivf
+6ea7116c9ce3a1641c7060bab2f5e06fd0910d61 *av1-1-b8-00-quantizer-40.ivf.md5
+ab35c15dfde21c2572b14e04dbfd5fac1adae449 *av1-1-b8-00-quantizer-41.ivf
+19596f9849653b913186b9d6b7072984ede96177 *av1-1-b8-00-quantizer-41.ivf.md5
+23a5fa6c3d0eaffaf13f6402465f5dd33d8ea7f1 *av1-1-b8-00-quantizer-42.ivf
+5a2726f0d1b1799d4f70883f1bfe5c9d976c6cf5 *av1-1-b8-00-quantizer-42.ivf.md5
+86cddfc463d2b186ec5a1aa25c4562c05201e3c3 *av1-1-b8-00-quantizer-43.ivf
+674c64ec8487ee774ad09350380fa6ac43815807 *av1-1-b8-00-quantizer-43.ivf.md5
+6894c154eb56c4f3fe44d54fc4f9af468b03d175 *av1-1-b8-00-quantizer-44.ivf
+eca679a2781eb894d18b3d578e3aaf4f48019a15 *av1-1-b8-00-quantizer-44.ivf.md5
+0960bf018ada4224b8344519cf091850d50a57bd *av1-1-b8-00-quantizer-45.ivf
+291bb43b9e1ab167040b51019daf1ccf94fd1e50 *av1-1-b8-00-quantizer-45.ivf.md5
+ea644a4732f1a2534332802c2fa5073344f3c356 *av1-1-b8-00-quantizer-46.ivf
+4c7915382b1d6d08709c95525b04ab8830f20ca1 *av1-1-b8-00-quantizer-46.ivf.md5
+d1f8832d33234e2c74a2280090850153ea24ea82 *av1-1-b8-00-quantizer-47.ivf
+90eb9959e612602934dcc512fe6f54abf0c88d9c *av1-1-b8-00-quantizer-47.ivf.md5
+69c93f760e8b666eb5b98f510e09d90f9230ac9b *av1-1-b8-00-quantizer-48.ivf
+931f869e14bd455de9dac2101b383c29e7d6f04c *av1-1-b8-00-quantizer-48.ivf.md5
+8b660c577d95c031d6711c1134b8d115097f8d7e *av1-1-b8-00-quantizer-49.ivf
+0e3fe8b49d497050dc1a0eac5f3ad60f5fe068fe *av1-1-b8-00-quantizer-49.ivf.md5
+d40bb21448a6da0fc9b88cbcf76d2f4226573acb *av1-1-b8-00-quantizer-50.ivf
+bcd2a9c9a021ba44fc5dc74ae02194fe49ca76a4 *av1-1-b8-00-quantizer-50.ivf.md5
+3b5a1d464aa89b0f1a6ad4f5a03602292b826172 *av1-1-b8-00-quantizer-51.ivf
+49bcde0c56cf8b7fbe429336981be22d39025b74 *av1-1-b8-00-quantizer-51.ivf.md5
+38970a02fb38ddb4954fe4240164cb75de5fc744 *av1-1-b8-00-quantizer-52.ivf
+fd02b034d79d4be150efb02bd4349edfd0e41311 *av1-1-b8-00-quantizer-52.ivf.md5
+2fde7a7cf3014d5196d011c47de4a144227ed122 *av1-1-b8-00-quantizer-53.ivf
+0cb66e6d8fbb29962a69ae1703e22da50db2c92b *av1-1-b8-00-quantizer-53.ivf.md5
+89a69e9b9a601e40cb491ac3a1d32491f2468ac8 *av1-1-b8-00-quantizer-54.ivf
+2f8af51acc73c99b5af81db2bdd1883b611ad311 *av1-1-b8-00-quantizer-54.ivf.md5
+31ee4f56fcb0043e95fff7af49e4ef82aafa5543 *av1-1-b8-00-quantizer-55.ivf
+04a7104e02bdd0fa38c118202dbbecdbd11ace02 *av1-1-b8-00-quantizer-55.ivf.md5
+f262f0b234006a2652fceb77b1a8711aa53abb54 *av1-1-b8-00-quantizer-56.ivf
+bdd54dc25bc5a147c76163af0bced45c56435d79 *av1-1-b8-00-quantizer-56.ivf.md5
+1ef00617091db4b2b839de623bd6b4fb0b2f5f83 *av1-1-b8-00-quantizer-57.ivf
+714c65363a87ed5e6e4ad75c79ddb6af57d41fd9 *av1-1-b8-00-quantizer-57.ivf.md5
+43c9b02feccbb3c709d96015f126b7e3d4c24c64 *av1-1-b8-00-quantizer-58.ivf
+bae22b8d6377862bff8219470c0d87205d186a68 *av1-1-b8-00-quantizer-58.ivf.md5
+ca5f780abe4c02e48cceb9c804f3625723c359bf *av1-1-b8-00-quantizer-59.ivf
+c60a20bbf60b0b0a442ef3f7b682979053909d6e *av1-1-b8-00-quantizer-59.ivf.md5
+1f6f047e9f0e1da22fb514370d92c3c7c66dcf89 *av1-1-b8-00-quantizer-60.ivf
+86dc7fa59d363cf1ae4b027a57b119bda893c1c1 *av1-1-b8-00-quantizer-60.ivf.md5
+bcf0c3353568c47a043f2dc34c9abd3fc04eebd4 *av1-1-b8-00-quantizer-61.ivf
+66fc4f729c5915aa19939d1b6e28e5b398e747bb *av1-1-b8-00-quantizer-61.ivf.md5
+ac8d3c54451b52cf557ef435d33e7638088d66df *av1-1-b8-00-quantizer-62.ivf
+b57f4e1276ead626a3662339a86111ae6fda49d2 *av1-1-b8-00-quantizer-62.ivf.md5
+2a8aa33513d8e01ae9410c4bf5fe1e471b775482 *av1-1-b8-00-quantizer-63.ivf
+9f646ec35a168f495e144c64ba7ce9aeb41cd0a2 *av1-1-b8-00-quantizer-63.ivf.md5
+838388fbda4a1d91be81ff62694c3bf13c460d38 *av1-1-b8-01-size-16x16.ivf
+4229c1caf8e25eb3073456fb90ceed206753901e *av1-1-b8-01-size-16x16.ivf.md5
+23f4253bf71e02b2e8ead66da4b3de875e879ef2 *av1-1-b8-01-size-18x16.ivf
+af125644436d4b6897dade68336cedad663b6610 *av1-1-b8-01-size-18x16.ivf.md5
+94e4a75bd93052f79998e9e08e6b5dd73dc27e50 *av1-1-b8-01-size-32x16.ivf
+e7b3fbc5e4b2469838e7ae36512bd3ce0a81040c *av1-1-b8-01-size-32x16.ivf.md5
+f297bde01c05ec5c07ff8118a0280bd36c52b246 *av1-1-b8-01-size-34x16.ivf
+f6bbd94d6063c689de3c7cf94afa2c68b969d12c *av1-1-b8-01-size-34x16.ivf.md5
+1e18bdf68bab7e7282aacc77e423bc7d93d04a8e *av1-1-b8-01-size-64x16.ivf
+de75732fccfb385294b23c17f0f1a57b455edcf7 *av1-1-b8-01-size-64x16.ivf.md5
+26b1f6ae80b161e971468085778cc1ece502b330 *av1-1-b8-01-size-66x16.ivf
+48bd99813557c314d398e6952da78da07c79d416 *av1-1-b8-01-size-66x16.ivf.md5
+ff213ecf31b982a3a7f009c9739f64e066e1ffe9 *av1-1-b8-01-size-16x18.ivf
+86b20a13b1939dc5f678e80491f190d376233d58 *av1-1-b8-01-size-16x18.ivf.md5
+c90bd878c59263a15c6a6f515d1c7e071f141559 *av1-1-b8-01-size-18x18.ivf
+6f659036ffcd3dd380cf970cf1a06f7755e0b2de *av1-1-b8-01-size-18x18.ivf.md5
+e16a1411381b34817a4c0d8e5eeaeb8cddcc9c46 *av1-1-b8-01-size-32x18.ivf
+fdb1c4ec56f5aa690eadbe897340fee86a06ae2f *av1-1-b8-01-size-32x18.ivf.md5
+fac7052b39bd2d0ae107e0e94050226712c770c2 *av1-1-b8-01-size-34x18.ivf
+adb0d5a99228027eaa3b016963df447c9818c447 *av1-1-b8-01-size-34x18.ivf.md5
+b8be5e55d9be42746c2b547d0e26e80b21c9802a *av1-1-b8-01-size-64x18.ivf
+8f8f6da34cdf78c5a6551c637e1afe279cc3884e *av1-1-b8-01-size-64x18.ivf.md5
+9e066bdcc2cd789cdf551bd4c9c85c178887b880 *av1-1-b8-01-size-66x18.ivf
+e8ec6effa936423ae2eec2b60a3160720d2de912 *av1-1-b8-01-size-66x18.ivf.md5
+6ebe45085cdeebc2acd6da5abd542a59312c0ff4 *av1-1-b8-01-size-16x32.ivf
+044695669103dbf158591dce9c649317a177d5f6 *av1-1-b8-01-size-16x32.ivf.md5
+9fabb4f60641b8c7995d1dc451419165d41258ff *av1-1-b8-01-size-18x32.ivf
+7263764680dfec864c3fad5df824ab1973489a14 *av1-1-b8-01-size-18x32.ivf.md5
+3f72841a24a13e601d79cf029aa1fdb02970ce0b *av1-1-b8-01-size-32x32.ivf
+bbe1ae2888d291ec6bc98cd0784937580c554103 *av1-1-b8-01-size-32x32.ivf.md5
+392131a7c7609acd0dba88fee14f1ed042d23ab1 *av1-1-b8-01-size-34x32.ivf
+eea68165ebe9acd28693374bf2266374b9c77786 *av1-1-b8-01-size-34x32.ivf.md5
+78afdd96265811ab9466e906347b57161e5c010d *av1-1-b8-01-size-64x32.ivf
+47b317af582700b67f6e77659db1dfaa26c8cde6 *av1-1-b8-01-size-64x32.ivf.md5
+2b4d01f2c9f23044c0d886482c7073bd4d5d37d1 *av1-1-b8-01-size-66x32.ivf
+3ad5a58a0ee5086af370b22ab2b5b7592a4f33e7 *av1-1-b8-01-size-66x32.ivf.md5
+78ddae04eb8277ae605bd7017ad7ad27bfc82d39 *av1-1-b8-01-size-16x34.ivf
+d0c18e679f1fc51e4f7409831321eed9c4858f6f *av1-1-b8-01-size-16x34.ivf.md5
+38d8ed885f46aead6ec1271d8a5d4aee79b8eb68 *av1-1-b8-01-size-18x34.ivf
+097ddbd69b8f54826a35efeb0b8b07ec198bba6b *av1-1-b8-01-size-18x34.ivf.md5
+91a42720bc2e7ba701f4d97b463a098b6707cdbd *av1-1-b8-01-size-32x34.ivf
+c590d43d37095bd2e8f8d12c9278477419b72d1a *av1-1-b8-01-size-32x34.ivf.md5
+4cc2a437dba56e8878113d9b390b980522542028 *av1-1-b8-01-size-34x34.ivf
+57eeb971f00e64abde25be69dbcb4e3ce5065a57 *av1-1-b8-01-size-34x34.ivf.md5
+b36fee1b6ad69d1206466615d69c05e0a4407939 *av1-1-b8-01-size-64x34.ivf
+a78aea0250d0b32657dc0eaf2d8394bc766c0e35 *av1-1-b8-01-size-64x34.ivf.md5
+10e441209262e082e31fef8c15b51579c9e81509 *av1-1-b8-01-size-66x34.ivf
+558b46f6ef1662c208012d0b66d1857eeff3244e *av1-1-b8-01-size-66x34.ivf.md5
+dd44aad500c7ca0fc97e3d8f0abed3c83b24c79c *av1-1-b8-01-size-16x64.ivf
+a5b64e8063abcf3e4872dc4baf1c32384dc5cf83 *av1-1-b8-01-size-16x64.ivf.md5
+aa849f0d09bcb2ead44719d63043536932d5c9f2 *av1-1-b8-01-size-18x64.ivf
+bcdf2dea3590c7031158ffe7b907d9ee35e2fe57 *av1-1-b8-01-size-18x64.ivf.md5
+36e856d30e160ba2fbb00510296202f61afaae49 *av1-1-b8-01-size-32x64.ivf
+99299f75b82c40c13f168adf2d124f57044a39a2 *av1-1-b8-01-size-32x64.ivf.md5
+e3e03ec5d38eb25e97e4ec3adc6ed40ecdebd278 *av1-1-b8-01-size-34x64.ivf
+84625abf8a200a7d20dd3dd3b277b50b3d62ce32 *av1-1-b8-01-size-34x64.ivf.md5
+7d017daebef2d39ed42a505a8e6103ab0c0988c1 *av1-1-b8-01-size-64x64.ivf
+1ff38d5ecba82fb2e6ac3b09c29c9fe74885ac29 *av1-1-b8-01-size-64x64.ivf.md5
+e1b58ba0b462508593399a2ed84db5f1c59ffcd2 *av1-1-b8-01-size-66x64.ivf
+a6b2c84c94fe79ab0373d157d1203f8d66de0706 *av1-1-b8-01-size-66x64.ivf.md5
+7b4faa7eb7b73392b62de6613282a98dddc13bb6 *av1-1-b8-01-size-16x66.ivf
+a2dacf2bae3c4ab352af66a9600946d29ab9a6ee *av1-1-b8-01-size-16x66.ivf.md5
+0f97805fa30497d4cf39665150f00dfdea52d862 *av1-1-b8-01-size-18x66.ivf
+33d8ea0765953250f998da3fe161f2a8cfca2353 *av1-1-b8-01-size-18x66.ivf.md5
+c8bb00256de973e3b3ee31b924f554336d310cdb *av1-1-b8-01-size-32x66.ivf
+6a6588e6edc68ff7739968a9e7cc6d9eaaeed356 *av1-1-b8-01-size-32x66.ivf.md5
+75ec54fec5c36eecde6d0a16e0389a5f7ad8ec22 *av1-1-b8-01-size-34x66.ivf
+36101dfa9495c18696c0d7d61f25e748f4de7425 *av1-1-b8-01-size-34x66.ivf.md5
+7e5491716e70f8199156b8843513c935667b281e *av1-1-b8-01-size-64x66.ivf
+da38755bb0c9ef56b81617835ddf1340242c6dce *av1-1-b8-01-size-64x66.ivf.md5
+68b47b386f61d67cb5b824a7e6bf87c8b9c2bf7b *av1-1-b8-01-size-66x66.ivf
+25974893956ebd92df474325946130c34f880ea7 *av1-1-b8-01-size-66x66.ivf.md5
+9f386d19c87dbfd6ac84a06d2393dd88863ac003 *av1-1-b8-01-size-196x196.ivf
+788f77f655f55de3db94dd69870316134c149116 *av1-1-b8-01-size-196x196.ivf.md5
+ed3bb2bb52a9d1786e233ef38142b15b85097875 *av1-1-b8-01-size-198x196.ivf
+3bb6b6721ad9b2838b2d07e47b29d6c0117526b1 *av1-1-b8-01-size-198x196.ivf.md5
+49461772caaaa7b824d48f4e9c77a906b0dc02d5 *av1-1-b8-01-size-200x196.ivf
+f1cba00c36909c56097c8785df476d42bc91f259 *av1-1-b8-01-size-200x196.ivf.md5
+44a656a22958e26ed169a69deb8f373117224f06 *av1-1-b8-01-size-202x196.ivf
+69be876b52fe42811bba52d36d0bcc88d6c25b3f *av1-1-b8-01-size-202x196.ivf.md5
+0a6fe9b478363faedbfd465a75790b4c2661b9ba *av1-1-b8-01-size-208x196.ivf
+fc8e95a6860a8a37ccdf1dfe49828502fcf96a08 *av1-1-b8-01-size-208x196.ivf.md5
+8e05b5a20ec95afd92bb615a7daa2e17a7ef55a8 *av1-1-b8-01-size-210x196.ivf
+0add512bffbda3300d8f684a53b13b996fe2e46d *av1-1-b8-01-size-210x196.ivf.md5
+a15f12652c6b4d0c30f13a439c941bfc4a431d1a *av1-1-b8-01-size-224x196.ivf
+b904b93252175f79e0e2b28896131ce93d5fc925 *av1-1-b8-01-size-224x196.ivf.md5
+1a57b913443b267f4a31a6925c39f5b58022f550 *av1-1-b8-01-size-226x196.ivf
+7cf3087de5804763a82d2a798243a66459664772 *av1-1-b8-01-size-226x196.ivf.md5
+2cc28541a2a72e8b45a368f71e70fc294e2de3ab *av1-1-b8-01-size-196x198.ivf
+bb736eedb4bd1e39bf9d60435b4b27a12842e112 *av1-1-b8-01-size-196x198.ivf.md5
+c4ebf93fbf3ae52108fd7b39ddef3afae48188ea *av1-1-b8-01-size-198x198.ivf
+fa4de6881511728bafa15b5f441a0cfdf683cc75 *av1-1-b8-01-size-198x198.ivf.md5
+55fce983186d454b0eb15527393bb2465ba41c6b *av1-1-b8-01-size-200x198.ivf
+1ac8fb1ee622cbc4aa1b83cb46b4731c85efae62 *av1-1-b8-01-size-200x198.ivf.md5
+67d276c67886f0a91a7ee06751a64f95eeb7bc1f *av1-1-b8-01-size-202x198.ivf
+1633b62d9e4ea41737c42f70cbde9a5671da0cef *av1-1-b8-01-size-202x198.ivf.md5
+081cb3f29d3956d4d858d9661fd3d62c94b68867 *av1-1-b8-01-size-208x198.ivf
+871d1c99167408dd32fa7603a7296c9b99ccda15 *av1-1-b8-01-size-208x198.ivf.md5
+b2d80b42468d5f296ae240cfb1fc0b3dd3d96bbc *av1-1-b8-01-size-210x198.ivf
+6a3382656cb17b532a97b1061697f9a878fc58d1 *av1-1-b8-01-size-210x198.ivf.md5
+84d7994fa20fcf6c1d8dbd4c2060c988a6fce831 *av1-1-b8-01-size-224x198.ivf
+42ea12e15de81f2e8617b6de7bae76de2da4d648 *av1-1-b8-01-size-224x198.ivf.md5
+c74a9281cf98c597121df6bff0ac5312b887f969 *av1-1-b8-01-size-226x198.ivf
+4133aae0001804e2bbc7928fc065517a6dd8b288 *av1-1-b8-01-size-226x198.ivf.md5
+27adbf148c63f807bd617cfd78aeaedb8b0f2304 *av1-1-b8-01-size-196x200.ivf
+9253e525e6207ef1ce0839b8f88ea781e9abe41e *av1-1-b8-01-size-196x200.ivf.md5
+21c9ea4d882e48353d3df66fcde0e4746168163f *av1-1-b8-01-size-198x200.ivf
+3d5ee59fde9194f0eaff736051cfd1d7b7daeff1 *av1-1-b8-01-size-198x200.ivf.md5
+c27b0b57667910847122a0309c703315e444110f *av1-1-b8-01-size-200x200.ivf
+7b2a15a17b421ef07e285ca4e8a224f0512c434d *av1-1-b8-01-size-200x200.ivf.md5
+780de549e4163a52590f7c0f488e027a8a4aa053 *av1-1-b8-01-size-202x200.ivf
+cb0ec0969522ca60d79a639e9b9509363468ffd0 *av1-1-b8-01-size-202x200.ivf.md5
+2c59821904863e264ae61401cbd494a79bc04f13 *av1-1-b8-01-size-208x200.ivf
+9963955966a52b65cdd13465c9fb2ba3b5356755 *av1-1-b8-01-size-208x200.ivf.md5
+ff63121611ea9c0628c7e5af13de5e7786611ca6 *av1-1-b8-01-size-210x200.ivf
+2a5993be234e3af2af6d185b2a6f3aaf1979b83a *av1-1-b8-01-size-210x200.ivf.md5
+b8485ada95440d78b51153227231b1aced1a8273 *av1-1-b8-01-size-224x200.ivf
+9c3cd32ea6c006a91eb37d69dbeccf878de5d214 *av1-1-b8-01-size-224x200.ivf.md5
+1aa0ce3e3a74f9b600a146e98b05547a0b454c48 *av1-1-b8-01-size-226x200.ivf
+e045be96c3af16a9ddc10a9933e8ddfb3319d716 *av1-1-b8-01-size-226x200.ivf.md5
+e92b76480f4339855d998b97182f36b28deadcfa *av1-1-b8-01-size-196x202.ivf
+480c707abcd2a650e2160ec397f8348cecb45770 *av1-1-b8-01-size-196x202.ivf.md5
+137b9c0d10a3bdbdf6f97b3e6331f3e8acaf8f91 *av1-1-b8-01-size-198x202.ivf
+7429642146d0da55161ab13024a261094ee2ce87 *av1-1-b8-01-size-198x202.ivf.md5
+9cea71c44ad015ac702d675bacca17876e65cb1a *av1-1-b8-01-size-200x202.ivf
+76b1ec6c42da55f47e389a561590d1a7c713e495 *av1-1-b8-01-size-200x202.ivf.md5
+26dffdcd0dac9becf68d12e31fcd91eddf1f7154 *av1-1-b8-01-size-202x202.ivf
+ddb75e99123fed4ef05d9b85200cefd8985bc84c *av1-1-b8-01-size-202x202.ivf.md5
+04007e83bb66ba547d09f8926ea5bfc7fd9e4b2a *av1-1-b8-01-size-208x202.ivf
+5b72eb58db22087ad416c499119f41e718395b52 *av1-1-b8-01-size-208x202.ivf.md5
+721ff7c0ae0e2ed896b5acac230113f1404e769c *av1-1-b8-01-size-210x202.ivf
+187d2ef939fc26e1a1c7de65abe8e058d8aae17a *av1-1-b8-01-size-210x202.ivf.md5
+dba41421cc938bcf0234254f96be0325ab66186e *av1-1-b8-01-size-224x202.ivf
+58856038c1eb13a7bf0353a30b1affe844cd31b1 *av1-1-b8-01-size-224x202.ivf.md5
+55eba14878d25dcc351ee5e92fa06e559035b409 *av1-1-b8-01-size-226x202.ivf
+e295b3d791d40d7c1fff2c40a260078dccaef24a *av1-1-b8-01-size-226x202.ivf.md5
+6c777223990ddfd92040a8526646ed0f39299b0d *av1-1-b8-01-size-196x208.ivf
+5210daff766cddaf3945610ee05ff242aef8175a *av1-1-b8-01-size-196x208.ivf.md5
+252831abfb9f4a9a8556c21cc3bf60adfe88210f *av1-1-b8-01-size-198x208.ivf
+35ed9601e608a829980cec81e41b7bd3e5f4c2ce *av1-1-b8-01-size-198x208.ivf.md5
+e800ed893a88704a4576d4984957f3664560daa9 *av1-1-b8-01-size-200x208.ivf
+82c038f9072a2fcf8d55fb4a474fdd791ba9a290 *av1-1-b8-01-size-200x208.ivf.md5
+9ce7bb932dd99f86da8ff2ab89fa4d3089a78da8 *av1-1-b8-01-size-202x208.ivf
+0611bf0179abe3c820a447a2bd3a04c3790f3a87 *av1-1-b8-01-size-202x208.ivf.md5
+e5900d9150c8bebc49776227afd3b0a21f5a6ac6 *av1-1-b8-01-size-208x208.ivf
+86d6b9a3840aa0a77938547c905bd6f45d069681 *av1-1-b8-01-size-208x208.ivf.md5
+2758ba5dad16f4a91334f2ed07a4a037201bb873 *av1-1-b8-01-size-210x208.ivf
+78453b1fda2ccc6f35e0d762567807757bcddb16 *av1-1-b8-01-size-210x208.ivf.md5
+fff88fb8e833f6b4ad64cb591b219c7cceb7f2d2 *av1-1-b8-01-size-224x208.ivf
+87266fc34aaed82cdb98cbc309b221ad52eccd81 *av1-1-b8-01-size-224x208.ivf.md5
+dec839fe64046461015b56cda191835284f42a52 *av1-1-b8-01-size-226x208.ivf
+d7a15264fc3fd55d3aec0ccfaa7c434c6d90969f *av1-1-b8-01-size-226x208.ivf.md5
+584782e93ed1cb7797a90fece44becdd1e23bf0d *av1-1-b8-01-size-196x210.ivf
+ed76ec841b18a457853e368576967c4768fc2730 *av1-1-b8-01-size-196x210.ivf.md5
+dab625599b9f01398b593e865d9a4a95a029d60f *av1-1-b8-01-size-198x210.ivf
+b90e8d96a1f5b329b088b467a11fed2d055d74ca *av1-1-b8-01-size-198x210.ivf.md5
+6774bee17b9e50d2d8630e2e1afc30ded67e662d *av1-1-b8-01-size-200x210.ivf
+343a86bd54eb3dd5e9902eb62a3d776dcff2f4f3 *av1-1-b8-01-size-200x210.ivf.md5
+0456c3b8e242eeee019ca97d155f81124de62c90 *av1-1-b8-01-size-202x210.ivf
+5a6a6428c9858a0d3561db42ceaf981c143fe479 *av1-1-b8-01-size-202x210.ivf.md5
+6a3a8f65bf806b1be7726b983427880f772c9986 *av1-1-b8-01-size-208x210.ivf
+5563ea6d8c65887553ff3000addc6418913f1650 *av1-1-b8-01-size-208x210.ivf.md5
+5a8b69489f8e9b917ea7718ad2645101cdbe5644 *av1-1-b8-01-size-210x210.ivf
+f4b01604036fa23000d44fbf42097ae1181bcd62 *av1-1-b8-01-size-210x210.ivf.md5
+fb6f5b08a048698cfe324557ee8cd840c4a3f6ce *av1-1-b8-01-size-224x210.ivf
+3ce5c404e3ca09c8e994b3043bad42cd555b00c0 *av1-1-b8-01-size-224x210.ivf.md5
+2e9fc8510d2131b2f3c9a93bececac985e4426d2 *av1-1-b8-01-size-226x210.ivf
+897c537e259331ca86cdd6e4d2bd343f8538402e *av1-1-b8-01-size-226x210.ivf.md5
+8300512106fce3424eb74b5d4bc0f4f19f7c9af8 *av1-1-b8-01-size-196x224.ivf
+43662ea025ea79afe4964fd4d12a77f4aa4e565e *av1-1-b8-01-size-196x224.ivf.md5
+640f8fda7ade8f2850e2275a9f5e233e33a0ba8d *av1-1-b8-01-size-198x224.ivf
+9ac690bdbbce47d7b169128b568f955e70076f8c *av1-1-b8-01-size-198x224.ivf.md5
+ce2e9379c72fc924e364d5727605394a1438a211 *av1-1-b8-01-size-200x224.ivf
+1ec35a53d88072b96b255202f678178bc7e5bb20 *av1-1-b8-01-size-200x224.ivf.md5
+5d3af7921623deccb578115c8ce207c019f97f50 *av1-1-b8-01-size-202x224.ivf
+14eafd55b0cda3a3476cae7ad500dbd5ee899dd5 *av1-1-b8-01-size-202x224.ivf.md5
+6b6d78e466cf94a5ef8dfe252caa0948dd2ec175 *av1-1-b8-01-size-208x224.ivf
+e178b0c272dfcfe614c6b49cb28dad11781af0b6 *av1-1-b8-01-size-208x224.ivf.md5
+dd2232b9e18971d7e19650a1e3218aef1010247f *av1-1-b8-01-size-210x224.ivf
+40a66198c47820f5fa2d2e389ec0c1191ea4ffcc *av1-1-b8-01-size-210x224.ivf.md5
+9ec028b81a5ea311683328d856f436e6d0b0e6a0 *av1-1-b8-01-size-224x224.ivf
+143b9530ce722385db2c2d883daa649ed42b8d40 *av1-1-b8-01-size-224x224.ivf.md5
+bf833947e62935c54e1e727ccb36157f7c1e9e5d *av1-1-b8-01-size-226x224.ivf
+ca4f3b44463106e4f0bb54e490c3bd457d7d780b *av1-1-b8-01-size-226x224.ivf.md5
+5525f7e312ec073f480ed5a2be5bdc4f0ce51a09 *av1-1-b8-01-size-196x226.ivf
+062d4b240741184458d2d2abd243ed7877631de8 *av1-1-b8-01-size-196x226.ivf.md5
+e6b911142394b94c23191eaa63c9eb41a00f80b0 *av1-1-b8-01-size-198x226.ivf
+3b580d903dddf47082f5e055bfb01a4f05c09b7d *av1-1-b8-01-size-198x226.ivf.md5
+70feb5efeb28df25f7d1a661c73bf013c5ada9b4 *av1-1-b8-01-size-200x226.ivf
+f0b894e7f787e62f1492be62f3dedeb065062160 *av1-1-b8-01-size-200x226.ivf.md5
+7f9a10831e2389b31497fad50080b4d5452d6e91 *av1-1-b8-01-size-202x226.ivf
+45b7194eba9367c8059403c23ca4ae49e988dfaf *av1-1-b8-01-size-202x226.ivf.md5
+967837a2cfbf9aa3131f73aec6a52dcdd82926c7 *av1-1-b8-01-size-208x226.ivf
+c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5
+9c926226b9f6b015501d8ac1e3f95e8570283a05 *av1-1-b8-01-size-210x226.ivf
+57d4837667fd4c5a7aeb908626d701b632852c60 *av1-1-b8-01-size-210x226.ivf.md5
+25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf
+87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5
+40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf
+34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
+9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf
+36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5
+1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf
+16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5
+39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf
+81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5
+92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf
+5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5
+1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf
+560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5
+7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf
+1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5
+b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf
+d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5
+e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf
+3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5
+07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf
+4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5
+2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf
+029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5
+3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf
+017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5
+006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf
+c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5
+840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf
+49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5
+04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf
+a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5
+5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf
+2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5
+f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf
+03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5
+548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf
+8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5
+0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf
+5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5
+1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf
+06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5
+f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf
+bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5
+c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf
+999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5
+2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf
+d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5
+382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf
+cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5
+b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf
+c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5
+e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf
+e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5
+fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf
+434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5
+d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf
+183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5
+6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf
+f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5
+d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf
+14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5
+e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf
+413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5
+b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf
+d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5
+80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf
+904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5
+96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf
+a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5
+0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf
+72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5
+24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf
+cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5
+083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf
+de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5
+f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf
+246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5
+f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf
+a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5
+a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf
+09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5
+d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf
+8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5
+86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf
+99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5
+9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf
+5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5
+54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf
+bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5
+a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf
+ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5
+2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf
+d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5
+73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf
+be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5
+841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf
+4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5
+8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf
+3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5
+3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf
+b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5
+f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf
+f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5
+a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf
+06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5
+165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf
+b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5
+5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf
+dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5
+74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf
+a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5
+d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf
+432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5
+4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf
+710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5
+fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf
+2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5
+ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf
+43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5
+a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf
+b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5
+c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf
+4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5
+301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf
+4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5
+c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf
+3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5
+bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf
+75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5
+8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf
+63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5
+a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf
+8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
+e69e41fee40b408b6eebcc79f266a95f2ee24f9e *av1-1-b8-03-sizedown.mkv
+8c528fb3ccda959a29721566e132f730935ca32b *av1-1-b8-03-sizedown.mkv.md5
+1889da5ee1708007e47bb887470ac477e1d7ba01 *av1-1-b8-03-sizeup.mkv
+8de81b170635d456602dc8923a8b39c534d01fa8 *av1-1-b8-03-sizeup.mkv.md5
+d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m
+5b3f0907407b809aa66b62cb080feda8c92454ca *park_joy_90p_8_420_vertical_csp.y4m
+caf8b6a5f1a5bcb38afae8a54a08c4f4459aafa3 *vase10x10_tiles.txt
+e14825f50ff845b8a6932c64cb254007a0b5e3a1 *av1-1-b8-22-svc-L2T1.ivf
+0f75f2ac44e61fc83be70c955410fa378e433237 *av1-1-b8-22-svc-L2T1.ivf.md5
+e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf
+2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5
+32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf
+f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5
+afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf
+13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5
+f064290d7fcd3b3de19020e8aec6c43c88d3a505 *av1-1-b8-05-mv.ivf
+bff316e63ded5559116bdc2fa4aa97ad7b1a1761 *av1-1-b8-05-mv.ivf.md5
+b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf
+1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5
+f8724ed96272ddbc35776908f2df7cb9955766a9 *paris_352_288_30.y4m
+11bb40026103182c23a88133edafca369e5575e2 *av1-1-b8-23-film_grain-50.ivf
+c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5
+2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf
+83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5
+644e05c6bc0418a72b86427aa01e8b4ecea85e03 *desktop1.320_180.yuv
+ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
+a17584012187cd886b64f8cb0f35bfd8d762f9dc *av1-1-b8-24-monochrome.ivf
+e71cd9a07f928c527c900daddd071ae60337426d *av1-1-b8-24-monochrome.ivf.md5
+03a8d002594ccc51932332002bb6f9837ef46d0f *av1-1-b10-24-monochrome.ivf
+e24aa6951afd7b2bb53eb1a73e25a19e7b189f82 *av1-1-b10-24-monochrome.ivf.md5
+df0c9481104aa8c81f9e3b61b6d147a331ad3e35 *firstpass_stats
+3eaf216d9fc8b4b9bb8c3956311f49a85974806c *bus_352x288_420_f20_b8.yuv
+c7f336958e7af6162c20ddc84d67c7dfa9826910 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf
+36a4fcf07e645ed522cde5845dd9c6ab2b2d1502 *av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5
+9f935d391fdf4a6f7c320355d45770d2e7d6095c *desktopqvga2.320_240.yuv
+4d1ad6d3070268ccb000d7fc3ae0f5a9447bfe82 *test_input_w1h1.yuv
+ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
+9c2aa2d0f63f706f775bf661dfa81e8bb3089d8b *wikipedia_420_360p_60f.y4m
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
new file mode 100644
index 0000000000..ce94a5a657
--- /dev/null
+++ b/third_party/aom/test/test.cmake
@@ -0,0 +1,647 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Include guard: stop processing if this file has already been included.
+if(AOM_TEST_TEST_CMAKE_)
+ return()
+endif() # AOM_TEST_TEST_CMAKE_
+set(AOM_TEST_TEST_CMAKE_ 1)
+
+# Provides processorcount(), which setup_aom_test_targets() uses to choose the
+# number of test shard targets.
+include(ProcessorCount)
+
+# NOTE(review): presumably defines make_test_data_lists(), which
+# setup_aom_test_targets() calls but this file does not define -- confirm.
+include("${AOM_ROOT}/test/test_data_util.cmake")
+
+# Manifest listing each test data file together with its checksum.
+set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
+# Values of the FOLDER target property applied to test and test data targets.
+set(AOM_IDE_TEST_FOLDER "test")
+set(AOM_IDE_TESTDATA_FOLDER "testdata")
+
+# Registers |src_list_name| -- the NAME of a list variable holding libaom test
+# sources, not its contents -- in |AOM_TEST_SOURCE_VARS| and publishes the
+# extended list to the caller's scope.
+function(add_to_libaom_test_srcs src_list_name)
+  set(updated_source_vars ${AOM_TEST_SOURCE_VARS} ${src_list_name})
+  set(AOM_TEST_SOURCE_VARS "${updated_source_vars}" PARENT_SCOPE)
+endfunction()
+
+# Sources handed directly to add_executable(test_libaom ...) in
+# setup_aom_test_targets().
+list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/test/test_libaom.cc")
+add_to_libaom_test_srcs(AOM_UNIT_TEST_WRAPPER_SOURCES)
+
+# Sources of the test_aom_common object library. Further entries are appended
+# below depending on the build configuration.
+list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/acm_random.h"
+ "${AOM_ROOT}/test/aom_image_test.cc"
+ "${AOM_ROOT}/test/aom_integer_test.cc"
+ "${AOM_ROOT}/test/av1_config_test.cc"
+ "${AOM_ROOT}/test/av1_key_value_api_test.cc"
+ "${AOM_ROOT}/test/block_test.cc"
+ "${AOM_ROOT}/test/codec_factory.h"
+ "${AOM_ROOT}/test/function_equivalence_test.h"
+ "${AOM_ROOT}/test/log2_test.cc"
+ "${AOM_ROOT}/test/md5_helper.h"
+ "${AOM_ROOT}/test/register_state_check.h"
+ "${AOM_ROOT}/test/test_vectors.cc"
+ "${AOM_ROOT}/test/test_vectors.h"
+ "${AOM_ROOT}/test/transform_test_base.h"
+ "${AOM_ROOT}/test/util.h"
+ "${AOM_ROOT}/test/video_source.h")
+add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_SOURCES)
+
+# Sources of the test_aom_decoder object library, which is only created when
+# CONFIG_AV1_DECODER is set.
+list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
+ "${AOM_ROOT}/test/decode_scalability_test.cc"
+ "${AOM_ROOT}/test/external_frame_buffer_test.cc"
+ "${AOM_ROOT}/test/invalid_file_test.cc"
+ "${AOM_ROOT}/test/test_vector_test.cc"
+ "${AOM_ROOT}/test/ivf_video_source.h")
+add_to_libaom_test_srcs(AOM_UNIT_TEST_DECODER_SOURCES)
+
+# Sources of the test_aom_encoder object library, which is only created when
+# CONFIG_AV1_ENCODER is set. The list is extended and pruned further below.
+list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/active_map_test.cc"
+ "${AOM_ROOT}/test/aq_segment_test.cc"
+ "${AOM_ROOT}/test/av1_external_partition_test.cc"
+ "${AOM_ROOT}/test/avif_progressive_test.cc"
+ "${AOM_ROOT}/test/borders_test.cc"
+ "${AOM_ROOT}/test/cpu_speed_test.cc"
+ "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
+ "${AOM_ROOT}/test/datarate_test.cc"
+ "${AOM_ROOT}/test/datarate_test.h"
+ "${AOM_ROOT}/test/deltaq_mode_test.cc"
+ "${AOM_ROOT}/test/dropframe_encode_test.cc"
+ "${AOM_ROOT}/test/svc_datarate_test.cc"
+ "${AOM_ROOT}/test/encode_api_test.cc"
+ "${AOM_ROOT}/test/encode_small_width_height_test.cc"
+ "${AOM_ROOT}/test/encode_test_driver.cc"
+ "${AOM_ROOT}/test/encode_test_driver.h"
+ "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+ "${AOM_ROOT}/test/forced_max_frame_width_height_test.cc"
+ "${AOM_ROOT}/test/force_key_frame_test.cc"
+ "${AOM_ROOT}/test/gf_pyr_height_test.cc"
+ "${AOM_ROOT}/test/rt_end_to_end_test.cc"
+ "${AOM_ROOT}/test/allintra_end_to_end_test.cc"
+ "${AOM_ROOT}/test/loopfilter_control_test.cc"
+ "${AOM_ROOT}/test/frame_size_tests.cc"
+ "${AOM_ROOT}/test/horz_superres_test.cc"
+ "${AOM_ROOT}/test/i420_video_source.h"
+ "${AOM_ROOT}/test/level_test.cc"
+ "${AOM_ROOT}/test/metadata_test.cc"
+ "${AOM_ROOT}/test/monochrome_test.cc"
+ "${AOM_ROOT}/test/postproc_filters_test.cc"
+ "${AOM_ROOT}/test/resize_test.cc"
+ "${AOM_ROOT}/test/scalability_test.cc"
+ "${AOM_ROOT}/test/sharpness_test.cc"
+ "${AOM_ROOT}/test/y4m_test.cc"
+ "${AOM_ROOT}/test/y4m_video_source.h"
+ "${AOM_ROOT}/test/yuv_video_source.h"
+ "${AOM_ROOT}/test/time_stamp_test.cc")
+add_to_libaom_test_srcs(AOM_UNIT_TEST_ENCODER_SOURCES)
+
+# Added to test_libaom only when ENABLE_ENCODE_PERF_TESTS is set. Unlike the
+# lists above, this one is not registered via add_to_libaom_test_srcs().
+list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
+list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
+add_to_libaom_test_srcs(AOM_UNIT_TEST_WEBM_SOURCES)
+# Sources of the separate test_intra_pred_speed executable.
+list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/test/test_intra_pred_speed.cc")
+
+# The decode test driver joins the common sources only when the decoder is
+# part of the build.
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/decode_test_driver.cc"
+ "${AOM_ROOT}/test/decode_test_driver.h")
+endif()
+
+# hbd_metrics_test requires both internal stats and high bit depth support.
+if(CONFIG_INTERNAL_STATS AND CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/hbd_metrics_test.cc")
+endif()
+
+# Added to test_libaom only when ENABLE_DECODE_PERF_TESTS and CONFIG_WEBM_IO
+# are both set (see setup_aom_test_targets()).
+list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
+
+# Encoder tests dropped from realtime-only builds.
+# NOTE(review): presumably these exercise encoder features that are compiled
+# out when CONFIG_REALTIME_ONLY is set -- confirm.
+if(CONFIG_REALTIME_ONLY)
+ list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/allintra_end_to_end_test.cc"
+ "${AOM_ROOT}/test/av1_external_partition_test.cc"
+ "${AOM_ROOT}/test/avif_progressive_test.cc"
+ "${AOM_ROOT}/test/borders_test.cc"
+ "${AOM_ROOT}/test/cpu_speed_test.cc"
+ "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
+ "${AOM_ROOT}/test/deltaq_mode_test.cc"
+ "${AOM_ROOT}/test/dropframe_encode_test.cc"
+ "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+ "${AOM_ROOT}/test/force_key_frame_test.cc"
+ "${AOM_ROOT}/test/gf_pyr_height_test.cc"
+ "${AOM_ROOT}/test/horz_superres_test.cc"
+ "${AOM_ROOT}/test/level_test.cc"
+ "${AOM_ROOT}/test/metadata_test.cc"
+ "${AOM_ROOT}/test/monochrome_test.cc"
+ "${AOM_ROOT}/test/postproc_filters_test.cc"
+ "${AOM_ROOT}/test/sharpness_test.cc")
+endif()
+
+if(NOT BUILD_SHARED_LIBS)
+ # Common tests that are only built when libaom is not a shared library.
+ # NOTE(review): presumably because they reference symbols a shared libaom
+ # does not export -- confirm.
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/aom_mem_test.cc"
+ "${AOM_ROOT}/test/av1_common_int_test.cc"
+ "${AOM_ROOT}/test/cdef_test.cc"
+ "${AOM_ROOT}/test/cfl_test.cc"
+ "${AOM_ROOT}/test/convolve_test.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test_util.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test_util.h"
+ "${AOM_ROOT}/test/intrabc_test.cc"
+ "${AOM_ROOT}/test/intrapred_test.cc"
+ "${AOM_ROOT}/test/lpf_test.cc"
+ "${AOM_ROOT}/test/scan_test.cc"
+ "${AOM_ROOT}/test/selfguided_filter_test.cc"
+ "${AOM_ROOT}/test/simd_cmp_impl.h"
+ "${AOM_ROOT}/test/simd_impl.h")
+
+ # One simd_cmp source per available x86 instruction set. Each list is kept
+ # separate so setup_aom_test_targets() can attach the matching -m<isa>
+ # compiler flag to it via add_intrinsics_source_to_target().
+ if(HAVE_SSE2)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/test/simd_cmp_sse2.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE2)
+ endif()
+
+ if(HAVE_SSSE3)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSSE3)
+ endif()
+
+ if(HAVE_SSE4_1)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/simd_cmp_sse4.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1)
+ endif()
+
+ if(HAVE_AVX2)
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/test/simd_cmp_avx2.cc")
+ add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_AVX2)
+ endif()
+
+ # Encoder tests that are only built when libaom is not a shared library.
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/arf_freq_test.cc"
+ "${AOM_ROOT}/test/av1_convolve_test.cc"
+ "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
+ "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
+ "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
+ "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+ "${AOM_ROOT}/test/av1_k_means_test.cc"
+ "${AOM_ROOT}/test/av1_nn_predict_test.cc"
+ "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+ "${AOM_ROOT}/test/av1_softmax_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.h"
+ "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
+ "${AOM_ROOT}/test/avg_test.cc"
+ "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
+ "${AOM_ROOT}/test/blend_a64_mask_test.cc"
+ "${AOM_ROOT}/test/comp_avg_pred_test.cc"
+ "${AOM_ROOT}/test/comp_avg_pred_test.h"
+ "${AOM_ROOT}/test/comp_mask_pred_test.cc"
+ "${AOM_ROOT}/test/disflow_test.cc"
+ "${AOM_ROOT}/test/encodemb_test.cc"
+ "${AOM_ROOT}/test/encodetxb_test.cc"
+ "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
+ "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
+ "${AOM_ROOT}/test/error_block_test.cc"
+ "${AOM_ROOT}/test/fdct4x4_test.cc"
+ "${AOM_ROOT}/test/fft_test.cc"
+ "${AOM_ROOT}/test/firstpass_test.cc"
+ "${AOM_ROOT}/test/fwht4x4_test.cc"
+ "${AOM_ROOT}/test/hadamard_test.cc"
+ "${AOM_ROOT}/test/horver_correlation_test.cc"
+ "${AOM_ROOT}/test/masked_sad_test.cc"
+ "${AOM_ROOT}/test/masked_variance_test.cc"
+ "${AOM_ROOT}/test/minmax_test.cc"
+ "${AOM_ROOT}/test/motion_vector_test.cc"
+ "${AOM_ROOT}/test/mv_cost_test.cc"
+ "${AOM_ROOT}/test/noise_model_test.cc"
+ "${AOM_ROOT}/test/obmc_sad_test.cc"
+ "${AOM_ROOT}/test/obmc_variance_test.cc"
+ "${AOM_ROOT}/test/pickrst_test.cc"
+ "${AOM_ROOT}/test/reconinter_test.cc"
+ "${AOM_ROOT}/test/sad_test.cc"
+ "${AOM_ROOT}/test/subtract_test.cc"
+ "${AOM_ROOT}/test/sum_squares_test.cc"
+ "${AOM_ROOT}/test/sse_sum_test.cc"
+ "${AOM_ROOT}/test/variance_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.h"
+ "${AOM_ROOT}/test/webmenc_test.cc"
+ "${AOM_ROOT}/test/wiener_test.cc")
+
+ # This list is populated regardless of HAVE_SSE4_1; it only reaches
+ # test_libaom when HAVE_SSE4_1 and CONFIG_AV1_ENCODER are both set (see
+ # setup_aom_test_targets()).
+ if(NOT CONFIG_REALTIME_ONLY)
+ list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/corner_match_test.cc")
+ endif()
+
+ if(CONFIG_ACCOUNTING)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/accounting_test.cc")
+ endif()
+
+ # Tests that need both the decoder and the encoder in the build.
+ if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/altref_test.cc"
+ "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+ "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+ "${AOM_ROOT}/test/binary_codes_test.cc"
+ "${AOM_ROOT}/test/boolcoder_test.cc"
+ "${AOM_ROOT}/test/cnn_test.cc"
+ "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+ "${AOM_ROOT}/test/divu_small_test.cc"
+ "${AOM_ROOT}/test/dr_prediction_test.cc"
+ "${AOM_ROOT}/test/ec_test.cc"
+ "${AOM_ROOT}/test/error_resilience_test.cc"
+ "${AOM_ROOT}/test/ethread_test.cc"
+ "${AOM_ROOT}/test/film_grain_table_test.cc"
+ "${AOM_ROOT}/test/kf_test.cc"
+ "${AOM_ROOT}/test/lossless_test.cc"
+ "${AOM_ROOT}/test/quant_test.cc"
+ "${AOM_ROOT}/test/ratectrl_test.cc"
+ "${AOM_ROOT}/test/rd_test.cc"
+ "${AOM_ROOT}/test/sb_multipass_test.cc"
+ "${AOM_ROOT}/test/sb_qp_sweep_test.cc"
+ "${AOM_ROOT}/test/screen_content_test.cc"
+ "${AOM_ROOT}/test/segment_binarization_sync.cc"
+ "${AOM_ROOT}/test/still_picture_test.cc"
+ "${AOM_ROOT}/test/temporal_filter_test.cc"
+ "${AOM_ROOT}/test/tile_config_test.cc"
+ "${AOM_ROOT}/test/tile_independence_test.cc"
+ "${AOM_ROOT}/test/tpl_model_test.cc")
+ if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/coding_path_sync.cc")
+ endif()
+ # Realtime-only builds drop a subset of the tests added just above, plus
+ # selfguided_filter_test.cc, which was added to the common sources earlier in
+ # the enclosing if(NOT BUILD_SHARED_LIBS) block.
+ if(CONFIG_REALTIME_ONLY)
+ list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/altref_test.cc"
+ "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+ "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+ "${AOM_ROOT}/test/cnn_test.cc"
+ "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+ "${AOM_ROOT}/test/error_resilience_test.cc"
+ "${AOM_ROOT}/test/kf_test.cc"
+ "${AOM_ROOT}/test/lossless_test.cc"
+ "${AOM_ROOT}/test/sb_multipass_test.cc"
+ "${AOM_ROOT}/test/sb_qp_sweep_test.cc"
+ "${AOM_ROOT}/test/selfguided_filter_test.cc"
+ "${AOM_ROOT}/test/screen_content_test.cc"
+ "${AOM_ROOT}/test/still_picture_test.cc"
+ "${AOM_ROOT}/test/tile_independence_test.cc"
+ "${AOM_ROOT}/test/tpl_model_test.cc")
+ endif()
+ endif()
+
+ # The frame parallel encode test is not built for realtime-only
+ # configurations.
+ if(CONFIG_FPMT_TEST AND (NOT CONFIG_REALTIME_ONLY))
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/frame_parallel_enc_test.cc")
+ endif()
+
+ # Per instruction set SIMD tests. Unlike the simd_cmp_* sources, these are
+ # appended to the regular common source list.
+ if(HAVE_SSE2)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_sse2_test.cc")
+ endif()
+
+ if(HAVE_SSSE3)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_ssse3_test.cc")
+ endif()
+
+ if(HAVE_SSE4_1)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_sse4_test.cc")
+ endif()
+
+ # Tests built when either SSE4.1 or Neon is available.
+ if(HAVE_SSE4_1 OR HAVE_NEON)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/filterintra_test.cc")
+
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/av1_highbd_iht_test.cc")
+ endif()
+
+ if(HAVE_AVX2)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_avx2_test.cc")
+ endif()
+
+ if(CONFIG_AV1_TEMPORAL_DENOISING AND (HAVE_SSE2 OR HAVE_NEON))
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/av1_temporal_denoiser_test.cc")
+ endif()
+
+ # Like corner_match_test.cc, this source goes into the SSE4.1 intrinsics
+ # list, which setup_aom_test_targets() only uses when HAVE_SSE4_1 is set.
+ if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/av1_quantize_test.cc")
+ endif()
+
+ if(HAVE_SSE2 OR HAVE_NEON)
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/quantize_func_test.cc")
+ endif()
+
+ # These encoder tests are built when either SSE4.1 or Neon is available. A
+ # single condition keeps one copy of the list (previously duplicated verbatim
+ # in two separate if() blocks) and guarantees each source is appended at most
+ # once, in line with the other "HAVE_SSE4_1 OR HAVE_NEON" conditions in this
+ # file.
+ if(HAVE_SSE4_1 OR HAVE_NEON)
+   list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+        "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
+        "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
+        "${AOM_ROOT}/test/intra_edge_test.cc")
+ endif()
+
+ # hash_test.cc is built only when SSE4.2 or Arm CRC32 is available.
+ if(HAVE_SSE4_2 OR HAVE_ARM_CRC32)
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
+ endif()
+
+ # Realtime-only builds drop these entries from the static-build encoder test
+ # list assembled above.
+ if(CONFIG_REALTIME_ONLY)
+ list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/disflow_test.cc"
+ "${AOM_ROOT}/test/end_to_end_qmpsnr_test.cc"
+ "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
+ "${AOM_ROOT}/test/firstpass_test.cc"
+ "${AOM_ROOT}/test/motion_vector_test.cc"
+ "${AOM_ROOT}/test/obmc_sad_test.cc"
+ "${AOM_ROOT}/test/obmc_variance_test.cc"
+ "${AOM_ROOT}/test/pickrst_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.h"
+ "${AOM_ROOT}/test/wiener_test.cc")
+ endif()
+endif()
+
+# Sources of the test_aom_rc executable, which setup_aom_test_targets() creates
+# for the rate control interface test.
+if(CONFIG_AV1_ENCODER AND ENABLE_TESTS)
+ list(APPEND AOM_RC_TEST_SOURCES "${AOM_ROOT}/test/codec_factory.h"
+ "${AOM_ROOT}/test/decode_test_driver.cc"
+ "${AOM_ROOT}/test/decode_test_driver.h"
+ "${AOM_ROOT}/test/encode_test_driver.cc"
+ "${AOM_ROOT}/test/encode_test_driver.h"
+ "${AOM_ROOT}/test/i420_video_source.h"
+ "${AOM_ROOT}/test/ratectrl_rtc_test.cc"
+ "${AOM_ROOT}/test/test_aom_rc.cc" "${AOM_ROOT}/test/util.h")
+ if(CONFIG_THREE_PASS)
+ # Add the dependencies of "${AOM_ROOT}/common/ivfdec.c".
+ # NOTE(review): ivfdec.c itself is not added to this list here; presumably
+ # it reaches test_aom_rc through one of the linked libraries -- confirm.
+ list(APPEND AOM_RC_TEST_SOURCES "${AOM_ROOT}/common/tools_common.c"
+ "${AOM_ROOT}/common/tools_common.h"
+ "${AOM_GEN_SRC_DIR}/usage_exit.c")
+ endif()
+endif()
+
+# Static GoogleTest (aom_gtest) and GoogleMock (aom_gmock) libraries built from
+# the bundled googletest sources. Only defined when tests are enabled.
+if(ENABLE_TESTS)
+  # Silence an RPATH warning for shared library builds on Apple platforms.
+  if(APPLE AND BUILD_SHARED_LIBS)
+    set(CMAKE_MACOSX_RPATH 1)
+  endif()
+
+  add_library(
+    aom_gtest STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  target_include_directories(
+    aom_gtest
+    PUBLIC "${AOM_ROOT}/third_party/googletest/src/googletest/include"
+    PRIVATE "${AOM_ROOT}/third_party/googletest/src/googletest")
+  set_target_properties(aom_gtest PROPERTIES FOLDER "${AOM_IDE_TEST_FOLDER}")
+
+  # GTEST_HAS_PTHREAD is a PUBLIC definition because gtest's interface headers
+  # check it, not just the implementation. It is left undefined for MSVC and
+  # WIN32 builds.
+  if(NOT MSVC AND NOT WIN32)
+    if(NOT CONFIG_MULTITHREAD OR NOT CMAKE_USE_PTHREADS_INIT)
+      target_compile_definitions(aom_gtest PUBLIC GTEST_HAS_PTHREAD=0)
+    else()
+      target_compile_definitions(aom_gtest PUBLIC GTEST_HAS_PTHREAD=1)
+    endif()
+  endif()
+
+  add_library(
+    aom_gmock STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googlemock/src/gmock-all.cc")
+  target_include_directories(
+    aom_gmock
+    PUBLIC "${AOM_ROOT}/third_party/googletest/src/googlemock/include"
+    PRIVATE "${AOM_ROOT}/third_party/googletest/src/googlemock")
+  set_target_properties(aom_gmock PROPERTIES FOLDER "${AOM_IDE_TEST_FOLDER}")
+  target_link_libraries(aom_gmock ${AOM_LIB_LINK_TYPE} aom_gtest)
+endif()
+
+# Setup testdata download targets, test build targets, and test run targets. The
+# libaom and app util targets must exist before this function is called.
+#
+# Takes no arguments; it reads the AOM_UNIT_TEST_* / AOM_*_TEST_SOURCES lists
+# and the CONFIG_*/HAVE_*/ENABLE_* switches from the calling scope.
+#
+# Effects:
+#
+# * Creates the object libraries test_aom_common, test_aom_decoder and
+#   test_aom_encoder and the executables test_libaom, test_intra_pred_speed and
+#   test_aom_rc, each subject to the configuration checks below.
+# * With ENABLE_TESTDATA: creates one testdata_<index> download target per
+#   manifest entry, the aggregate target testdata and, unless generating for an
+#   IDE without ENABLE_IDE_TEST_HOSTING, the sharded test_<index> run targets
+#   plus the aggregate target runtests.
+# * Writes libaom_test_srcs.txt and libaom_test_srcs.gni into AOM_CONFIG_DIR.
+# * Appends the created executables to AOM_APP_TARGETS in the caller's scope.
+function(setup_aom_test_targets)
+
+  # TODO(tomfinegan): Build speed optimization. $AOM_UNIT_TEST_COMMON_SOURCES
+  # and $AOM_UNIT_TEST_ENCODER_SOURCES are very large. The build of test targets
+  # could be sped up (on multicore build machines) by compiling sources in each
+  # list into separate object library targets, and then linking them into
+  # test_libaom.
+  add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+  set_property(TARGET test_aom_common PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+  add_dependencies(test_aom_common aom)
+  target_link_libraries(test_aom_common ${AOM_LIB_LINK_TYPE} aom_gtest)
+
+  if(CONFIG_AV1_DECODER)
+    add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+    set_property(TARGET test_aom_decoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+    add_dependencies(test_aom_decoder aom)
+    target_link_libraries(test_aom_decoder ${AOM_LIB_LINK_TYPE} aom_gtest)
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+    set_property(TARGET test_aom_encoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+    add_dependencies(test_aom_encoder aom)
+    target_link_libraries(test_aom_encoder ${AOM_LIB_LINK_TYPE} aom_gtest)
+  endif()
+
+  # The main unit test binary: wrapper sources plus the object libraries.
+  add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
+                             $<TARGET_OBJECTS:aom_common_app_util>
+                             $<TARGET_OBJECTS:test_aom_common>)
+  set_property(TARGET test_libaom PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+  list(APPEND AOM_APP_TARGETS test_libaom)
+
+  if(CONFIG_AV1_DECODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_decoder_app_util>
+                                       $<TARGET_OBJECTS:test_aom_decoder>)
+
+    if(ENABLE_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
+      target_sources(test_libaom PRIVATE ${AOM_DECODE_PERF_TEST_SOURCES})
+    endif()
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:test_aom_encoder>
+                                       $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+    if(ENABLE_ENCODE_PERF_TESTS)
+      target_sources(test_libaom PRIVATE ${AOM_ENCODE_PERF_TEST_SOURCES})
+    endif()
+
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(test_intra_pred_speed
+                     ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+                     $<TARGET_OBJECTS:aom_common_app_util>)
+      set_property(TARGET test_intra_pred_speed
+                   PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+      target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
+                            aom_gtest)
+      list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
+    endif()
+  endif()
+
+  target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
+
+  if(CONFIG_WEBM_IO)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
+  endif()
+
+  # Attach the per instruction set source lists together with the compiler flag
+  # that enables the corresponding intrinsics.
+  if(HAVE_SSE2)
+    add_intrinsics_source_to_target("-msse2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE2")
+  endif()
+  if(HAVE_SSSE3)
+    add_intrinsics_source_to_target("-mssse3" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSSE3")
+  endif()
+  if(HAVE_SSE4_1)
+    add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1")
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+        add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                        "AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1")
+      endif()
+    endif()
+  endif()
+  if(HAVE_AVX2)
+    add_intrinsics_source_to_target("-mavx2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_AVX2")
+  endif()
+  if(HAVE_NEON)
+    add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
+  endif()
+
+  if(ENABLE_TESTDATA)
+    make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
+                         test_file_checksums)
+    list(LENGTH test_files num_test_files)
+    list(LENGTH test_file_checksums num_test_file_checksums)
+
+    # The two lists are indexed in lock step below, so a malformed manifest
+    # must be rejected here rather than pairing a file with the wrong checksum.
+    if(NOT num_test_files EQUAL num_test_file_checksums)
+      message(
+        FATAL_ERROR
+          "Malformed test data list ${AOM_UNIT_TEST_DATA_LIST_FILE}: "
+          "${num_test_files} file names but ${num_test_file_checksums} "
+          "checksums.")
+    endif()
+
+    # One download target per test data file.
+    math(EXPR max_file_index "${num_test_files} - 1")
+    foreach(test_index RANGE ${max_file_index})
+      list(GET test_files ${test_index} test_file)
+      list(GET test_file_checksums ${test_index} test_file_checksum)
+      add_custom_target(
+        testdata_${test_index}
+        COMMAND ${CMAKE_COMMAND}
+                -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" -DAOM_ROOT="${AOM_ROOT}"
+                -DAOM_TEST_FILE="${test_file}"
+                -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+                "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      set_property(TARGET testdata_${test_index}
+                   PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
+      list(APPEND testdata_targets testdata_${test_index})
+    endforeach()
+
+    # Create a custom build target for running each test data download target.
+    add_custom_target(testdata)
+    add_dependencies(testdata ${testdata_targets})
+    set_property(TARGET testdata PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
+
+    # Skip creation of test run targets when generating for Visual Studio and
+    # Xcode unless the user explicitly requests IDE test hosting. This is done
+    # to make build cycles in the IDE tolerable when the IDE command for build
+    # project is used to build AOM. Default behavior in IDEs is to build all
+    # targets, and the test run takes hours.
+    if(((NOT MSVC) AND (NOT XCODE)) OR ENABLE_IDE_TEST_HOSTING)
+
+      # Pick a reasonable number of targets (this controls parallelization).
+      processorcount(num_test_targets)
+      if(num_test_targets EQUAL 0) # Just default to 10 targets when there's no
+                                   # processor count available.
+        set(num_test_targets 10)
+      endif()
+
+      # Each test_<N> target runs one gtest shard of test_libaom.
+      math(EXPR max_shard_index "${num_test_targets} - 1")
+      foreach(shard_index RANGE ${max_shard_index})
+        set(test_name "test_${shard_index}")
+        add_custom_target(${test_name}
+                          COMMAND ${CMAKE_COMMAND}
+                                  -DGTEST_SHARD_INDEX=${shard_index}
+                                  -DGTEST_TOTAL_SHARDS=${num_test_targets}
+                                  -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
+                                  "${AOM_ROOT}/test/test_runner.cmake"
+                          DEPENDS testdata test_libaom)
+        set_property(TARGET ${test_name} PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+        list(APPEND test_targets ${test_name})
+      endforeach()
+      add_custom_target(runtests)
+      set_property(TARGET runtests PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+      add_dependencies(runtests ${test_targets})
+    endif()
+  endif()
+
+  # Libaom_test_srcs.txt generation.
+  set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
+  file(WRITE "${libaom_test_srcs_txt_file}"
+       "# This file is generated. DO NOT EDIT.\n")
+
+  # Static source file list first.
+  list(SORT AOM_TEST_SOURCE_VARS)
+  foreach(aom_test_source_var ${AOM_TEST_SOURCE_VARS})
+    # The list is sorted, so duplicate entries are adjacent.
+    if("${aom_test_source_var}" STREQUAL "${last_aom_test_source_var}")
+      message(
+        FATAL_ERROR
+          "Duplicate AOM_TEST_SOURCE_VARS entry: ${aom_test_source_var}")
+    endif()
+    foreach(file ${${aom_test_source_var}})
+      # Skip files located under the build configuration directory. The path is
+      # compared as a literal substring: used as a regular expression, a
+      # directory name containing characters such as "+" or "(" would not match
+      # itself.
+      string(FIND "${file}" "${AOM_CONFIG_DIR}" config_dir_pos)
+      if(config_dir_pos EQUAL -1)
+        string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+        file(APPEND "${libaom_test_srcs_txt_file}" "${file}\n")
+      endif()
+    endforeach()
+    set(last_aom_test_source_var ${aom_test_source_var})
+  endforeach()
+
+  # libaom_test_srcs.gni generation
+  set(libaom_test_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_test_srcs.gni")
+  file(WRITE "${libaom_test_srcs_gni_file}"
+       "# This file is generated. DO NOT EDIT.\n")
+
+  # One GN list per source variable, named after the lower-cased variable.
+  foreach(aom_test_source_var ${AOM_TEST_SOURCE_VARS})
+    string(TOLOWER "${aom_test_source_var}" aom_test_source_var_lowercase)
+    file(APPEND "${libaom_test_srcs_gni_file}"
+         "\n${aom_test_source_var_lowercase} = [\n")
+
+    foreach(file ${${aom_test_source_var}})
+      # Literal substring comparison, for the same reason as in the .txt
+      # generation above.
+      string(FIND "${file}" "${AOM_CONFIG_DIR}" config_dir_pos)
+      if(config_dir_pos EQUAL -1)
+        string(REPLACE "${AOM_ROOT}/" "//third_party/libaom/source/libaom/" file
+                       "${file}")
+        file(APPEND "${libaom_test_srcs_gni_file}" "  \"${file}\",\n")
+      endif()
+    endforeach()
+
+    file(APPEND "${libaom_test_srcs_gni_file}" "]\n")
+  endforeach()
+
+  # Set up test for rc interface
+  if(CONFIG_AV1_ENCODER
+     AND ENABLE_TESTS
+     AND CONFIG_WEBM_IO
+     AND NOT BUILD_SHARED_LIBS
+     AND NOT CONFIG_REALTIME_ONLY)
+    add_executable(test_aom_rc ${AOM_RC_TEST_SOURCES})
+    target_link_libraries(test_aom_rc ${AOM_LIB_LINK_TYPE} aom aom_av1_rc
+                          aom_gtest aom_gmock webm)
+    set_property(TARGET test_aom_rc PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
+    list(APPEND AOM_APP_TARGETS test_aom_rc)
+  endif()
+
+  # Publish the executables created here to the caller.
+  set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/test/test_aom_rc.cc b/third_party/aom/test/test_aom_rc.cc
new file mode 100644
index 0000000000..0182b62ec8
--- /dev/null
+++ b/third_party/aom/test/test_aom_rc.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+// Entry point of the test_aom_rc binary: initializes GoogleTest with the
+// command line and returns the result of running every registered test.
+int main(int argc, char **argv) {
+  // argc is passed by address so GoogleTest may adjust the argument list.
+  ::testing::InitGoogleTest(&argc, argv);
+  const int test_status = RUN_ALL_TESTS();
+  return test_status;
+}
diff --git a/third_party/aom/test/test_data_download_worker.cmake b/third_party/aom/test/test_data_download_worker.cmake
new file mode 100644
index 0000000000..a49038888d
--- /dev/null
+++ b/third_party/aom/test/test_data_download_worker.cmake
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Makes sure one test data file is available locally: the file named by
+# AOM_TEST_FILE is checked against AOM_TEST_CHECKSUM and downloaded from
+# AOM_TEST_DATA_URL when it is missing or its SHA1 does not match.
+#
+# Required variables: AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE,
+# AOM_TEST_CHECKSUM. Optional: AOM_TEST_DATA_PATH (falls back to the
+# LIBAOM_TEST_DATA_PATH environment variable, then to AOM_CONFIG_DIR).
+
+# Validate the inputs before anything dereferences them. In particular the
+# include() below builds its path from AOM_ROOT, so it must come after this
+# check for the error message to be reachable.
+# https://github.com/cheshirekow/cmake_format/issues/34
+# cmake-format: off
+if (NOT AOM_ROOT OR NOT AOM_CONFIG_DIR OR NOT AOM_TEST_FILE
+    OR NOT AOM_TEST_CHECKSUM)
+  message(FATAL_ERROR
+          "AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM must "
+          "be defined.")
+endif ()
+# cmake-format: on
+
+include("${AOM_ROOT}/test/test_data_util.cmake")
+
+set(AOM_TEST_DATA_URL "https://storage.googleapis.com/aom-test-data")
+
+# An explicitly supplied AOM_TEST_DATA_PATH wins over the environment.
+if(NOT AOM_TEST_DATA_PATH)
+  set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}")
+endif()
+
+# Last resort: keep the data next to the build configuration and say so.
+if("${AOM_TEST_DATA_PATH}" STREQUAL "")
+  message(
+    WARNING "Writing test data to ${AOM_CONFIG_DIR}, set "
+            "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.")
+  set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}")
+endif()
+
+if(NOT EXISTS "${AOM_TEST_DATA_PATH}")
+  file(MAKE_DIRECTORY "${AOM_TEST_DATA_PATH}")
+endif()
+
+# The helper takes the *name* of the variable holding the file name(s).
+expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_PATH}" "filepath")
+expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_URL}" "url")
+
+# needs_download is set when the local copy is absent or has the wrong SHA1.
+check_file("${filepath}" "${AOM_TEST_CHECKSUM}" "needs_download")
+if(needs_download)
+  download_test_file("${url}" "${AOM_TEST_CHECKSUM}" "${filepath}")
+endif()
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
new file mode 100644
index 0000000000..069e1ad526
--- /dev/null
+++ b/third_party/aom/test/test_data_util.cmake
@@ -0,0 +1,665 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+# Names of the test data files needed by the current configuration.
+# make_test_data_lists() keeps only those entries of its input file whose
+# names appear in AOM_TEST_DATA_FILE_NAMES.
+list(APPEND AOM_TEST_DATA_FILE_NAMES
+ "desktopqvga2.320_240.yuv"
+ "desktop1.320_180.yuv"
+ "hantro_collage_w176h144.yuv"
+ "hantro_collage_w352h288.yuv"
+ "hantro_collage_w352h288_nv12.yuv"
+ "hantro_odd.yuv"
+ "paris_352_288_30.y4m"
+ "park_joy_90p_10_420.y4m"
+ "park_joy_90p_10_422.y4m"
+ "park_joy_90p_10_444.y4m"
+ "park_joy_90p_12_420.y4m"
+ "park_joy_90p_12_422.y4m"
+ "park_joy_90p_12_444.y4m"
+ "park_joy_90p_8_420_a10-1.y4m"
+ "park_joy_90p_8_420.y4m"
+ "park_joy_90p_8_420_monochrome.y4m"
+ "park_joy_90p_8_420_vertical_csp.y4m"
+ "park_joy_90p_8_422.y4m"
+ "park_joy_90p_8_444.y4m"
+ "pixel_capture_w320h240.yuv"
+ "desktop_credits.y4m"
+ "rand_noise_w1280h720.yuv"
+ "niklas_1280_720_30.y4m"
+ "rush_hour_444.y4m"
+ "screendata.y4m"
+ "niklas_640_480_30.yuv"
+ "vase10x10.yuv"
+ "vase10x10_tiles.txt"
+ "bus_352x288_420_f20_b8.yuv"
+ "test_input_w1h1.yuv"
+ "crowd_run_360p_10_150f.y4m"
+ "wikipedia_420_360p_60f.y4m")
+
+# Extra input appended only when both ENABLE_DECODE_PERF_TESTS and
+# CONFIG_AV1_ENCODER are set.
+if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+ list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
+endif()
+
+# Files appended only when CONFIG_AV1_DECODER is set: av1-1-* streams, each
+# listed with a .md5 companion, followed by invalid-* streams, each listed with
+# one or more .res* companions.
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_TEST_DATA_FILE_NAMES
+ "av1-1-b8-00-quantizer-00.ivf"
+ "av1-1-b8-00-quantizer-00.ivf.md5"
+ "av1-1-b8-00-quantizer-01.ivf"
+ "av1-1-b8-00-quantizer-01.ivf.md5"
+ "av1-1-b8-00-quantizer-02.ivf"
+ "av1-1-b8-00-quantizer-02.ivf.md5"
+ "av1-1-b8-00-quantizer-03.ivf"
+ "av1-1-b8-00-quantizer-03.ivf.md5"
+ "av1-1-b8-00-quantizer-04.ivf"
+ "av1-1-b8-00-quantizer-04.ivf.md5"
+ "av1-1-b8-00-quantizer-05.ivf"
+ "av1-1-b8-00-quantizer-05.ivf.md5"
+ "av1-1-b8-00-quantizer-06.ivf"
+ "av1-1-b8-00-quantizer-06.ivf.md5"
+ "av1-1-b8-00-quantizer-07.ivf"
+ "av1-1-b8-00-quantizer-07.ivf.md5"
+ "av1-1-b8-00-quantizer-08.ivf"
+ "av1-1-b8-00-quantizer-08.ivf.md5"
+ "av1-1-b8-00-quantizer-09.ivf"
+ "av1-1-b8-00-quantizer-09.ivf.md5"
+ "av1-1-b8-00-quantizer-10.ivf"
+ "av1-1-b8-00-quantizer-10.ivf.md5"
+ "av1-1-b8-00-quantizer-11.ivf"
+ "av1-1-b8-00-quantizer-11.ivf.md5"
+ "av1-1-b8-00-quantizer-12.ivf"
+ "av1-1-b8-00-quantizer-12.ivf.md5"
+ "av1-1-b8-00-quantizer-13.ivf"
+ "av1-1-b8-00-quantizer-13.ivf.md5"
+ "av1-1-b8-00-quantizer-14.ivf"
+ "av1-1-b8-00-quantizer-14.ivf.md5"
+ "av1-1-b8-00-quantizer-15.ivf"
+ "av1-1-b8-00-quantizer-15.ivf.md5"
+ "av1-1-b8-00-quantizer-16.ivf"
+ "av1-1-b8-00-quantizer-16.ivf.md5"
+ "av1-1-b8-00-quantizer-17.ivf"
+ "av1-1-b8-00-quantizer-17.ivf.md5"
+ "av1-1-b8-00-quantizer-18.ivf"
+ "av1-1-b8-00-quantizer-18.ivf.md5"
+ "av1-1-b8-00-quantizer-19.ivf"
+ "av1-1-b8-00-quantizer-19.ivf.md5"
+ "av1-1-b8-00-quantizer-20.ivf"
+ "av1-1-b8-00-quantizer-20.ivf.md5"
+ "av1-1-b8-00-quantizer-21.ivf"
+ "av1-1-b8-00-quantizer-21.ivf.md5"
+ "av1-1-b8-00-quantizer-22.ivf"
+ "av1-1-b8-00-quantizer-22.ivf.md5"
+ "av1-1-b8-00-quantizer-23.ivf"
+ "av1-1-b8-00-quantizer-23.ivf.md5"
+ "av1-1-b8-00-quantizer-24.ivf"
+ "av1-1-b8-00-quantizer-24.ivf.md5"
+ "av1-1-b8-00-quantizer-25.ivf"
+ "av1-1-b8-00-quantizer-25.ivf.md5"
+ "av1-1-b8-00-quantizer-26.ivf"
+ "av1-1-b8-00-quantizer-26.ivf.md5"
+ "av1-1-b8-00-quantizer-27.ivf"
+ "av1-1-b8-00-quantizer-27.ivf.md5"
+ "av1-1-b8-00-quantizer-28.ivf"
+ "av1-1-b8-00-quantizer-28.ivf.md5"
+ "av1-1-b8-00-quantizer-29.ivf"
+ "av1-1-b8-00-quantizer-29.ivf.md5"
+ "av1-1-b8-00-quantizer-30.ivf"
+ "av1-1-b8-00-quantizer-30.ivf.md5"
+ "av1-1-b8-00-quantizer-31.ivf"
+ "av1-1-b8-00-quantizer-31.ivf.md5"
+ "av1-1-b8-00-quantizer-32.ivf"
+ "av1-1-b8-00-quantizer-32.ivf.md5"
+ "av1-1-b8-00-quantizer-33.ivf"
+ "av1-1-b8-00-quantizer-33.ivf.md5"
+ "av1-1-b8-00-quantizer-34.ivf"
+ "av1-1-b8-00-quantizer-34.ivf.md5"
+ "av1-1-b8-00-quantizer-35.ivf"
+ "av1-1-b8-00-quantizer-35.ivf.md5"
+ "av1-1-b8-00-quantizer-36.ivf"
+ "av1-1-b8-00-quantizer-36.ivf.md5"
+ "av1-1-b8-00-quantizer-37.ivf"
+ "av1-1-b8-00-quantizer-37.ivf.md5"
+ "av1-1-b8-00-quantizer-38.ivf"
+ "av1-1-b8-00-quantizer-38.ivf.md5"
+ "av1-1-b8-00-quantizer-39.ivf"
+ "av1-1-b8-00-quantizer-39.ivf.md5"
+ "av1-1-b8-00-quantizer-40.ivf"
+ "av1-1-b8-00-quantizer-40.ivf.md5"
+ "av1-1-b8-00-quantizer-41.ivf"
+ "av1-1-b8-00-quantizer-41.ivf.md5"
+ "av1-1-b8-00-quantizer-42.ivf"
+ "av1-1-b8-00-quantizer-42.ivf.md5"
+ "av1-1-b8-00-quantizer-43.ivf"
+ "av1-1-b8-00-quantizer-43.ivf.md5"
+ "av1-1-b8-00-quantizer-44.ivf"
+ "av1-1-b8-00-quantizer-44.ivf.md5"
+ "av1-1-b8-00-quantizer-45.ivf"
+ "av1-1-b8-00-quantizer-45.ivf.md5"
+ "av1-1-b8-00-quantizer-46.ivf"
+ "av1-1-b8-00-quantizer-46.ivf.md5"
+ "av1-1-b8-00-quantizer-47.ivf"
+ "av1-1-b8-00-quantizer-47.ivf.md5"
+ "av1-1-b8-00-quantizer-48.ivf"
+ "av1-1-b8-00-quantizer-48.ivf.md5"
+ "av1-1-b8-00-quantizer-49.ivf"
+ "av1-1-b8-00-quantizer-49.ivf.md5"
+ "av1-1-b8-00-quantizer-50.ivf"
+ "av1-1-b8-00-quantizer-50.ivf.md5"
+ "av1-1-b8-00-quantizer-51.ivf"
+ "av1-1-b8-00-quantizer-51.ivf.md5"
+ "av1-1-b8-00-quantizer-52.ivf"
+ "av1-1-b8-00-quantizer-52.ivf.md5"
+ "av1-1-b8-00-quantizer-53.ivf"
+ "av1-1-b8-00-quantizer-53.ivf.md5"
+ "av1-1-b8-00-quantizer-54.ivf"
+ "av1-1-b8-00-quantizer-54.ivf.md5"
+ "av1-1-b8-00-quantizer-55.ivf"
+ "av1-1-b8-00-quantizer-55.ivf.md5"
+ "av1-1-b8-00-quantizer-56.ivf"
+ "av1-1-b8-00-quantizer-56.ivf.md5"
+ "av1-1-b8-00-quantizer-57.ivf"
+ "av1-1-b8-00-quantizer-57.ivf.md5"
+ "av1-1-b8-00-quantizer-58.ivf"
+ "av1-1-b8-00-quantizer-58.ivf.md5"
+ "av1-1-b8-00-quantizer-59.ivf"
+ "av1-1-b8-00-quantizer-59.ivf.md5"
+ "av1-1-b8-00-quantizer-60.ivf"
+ "av1-1-b8-00-quantizer-60.ivf.md5"
+ "av1-1-b8-00-quantizer-61.ivf"
+ "av1-1-b8-00-quantizer-61.ivf.md5"
+ "av1-1-b8-00-quantizer-62.ivf"
+ "av1-1-b8-00-quantizer-62.ivf.md5"
+ "av1-1-b8-00-quantizer-63.ivf"
+ "av1-1-b8-00-quantizer-63.ivf.md5"
+ "av1-1-b10-00-quantizer-00.ivf"
+ "av1-1-b10-00-quantizer-00.ivf.md5"
+ "av1-1-b10-00-quantizer-01.ivf"
+ "av1-1-b10-00-quantizer-01.ivf.md5"
+ "av1-1-b10-00-quantizer-02.ivf"
+ "av1-1-b10-00-quantizer-02.ivf.md5"
+ "av1-1-b10-00-quantizer-03.ivf"
+ "av1-1-b10-00-quantizer-03.ivf.md5"
+ "av1-1-b10-00-quantizer-04.ivf"
+ "av1-1-b10-00-quantizer-04.ivf.md5"
+ "av1-1-b10-00-quantizer-05.ivf"
+ "av1-1-b10-00-quantizer-05.ivf.md5"
+ "av1-1-b10-00-quantizer-06.ivf"
+ "av1-1-b10-00-quantizer-06.ivf.md5"
+ "av1-1-b10-00-quantizer-07.ivf"
+ "av1-1-b10-00-quantizer-07.ivf.md5"
+ "av1-1-b10-00-quantizer-08.ivf"
+ "av1-1-b10-00-quantizer-08.ivf.md5"
+ "av1-1-b10-00-quantizer-09.ivf"
+ "av1-1-b10-00-quantizer-09.ivf.md5"
+ "av1-1-b10-00-quantizer-10.ivf"
+ "av1-1-b10-00-quantizer-10.ivf.md5"
+ "av1-1-b10-00-quantizer-11.ivf"
+ "av1-1-b10-00-quantizer-11.ivf.md5"
+ "av1-1-b10-00-quantizer-12.ivf"
+ "av1-1-b10-00-quantizer-12.ivf.md5"
+ "av1-1-b10-00-quantizer-13.ivf"
+ "av1-1-b10-00-quantizer-13.ivf.md5"
+ "av1-1-b10-00-quantizer-14.ivf"
+ "av1-1-b10-00-quantizer-14.ivf.md5"
+ "av1-1-b10-00-quantizer-15.ivf"
+ "av1-1-b10-00-quantizer-15.ivf.md5"
+ "av1-1-b10-00-quantizer-16.ivf"
+ "av1-1-b10-00-quantizer-16.ivf.md5"
+ "av1-1-b10-00-quantizer-17.ivf"
+ "av1-1-b10-00-quantizer-17.ivf.md5"
+ "av1-1-b10-00-quantizer-18.ivf"
+ "av1-1-b10-00-quantizer-18.ivf.md5"
+ "av1-1-b10-00-quantizer-19.ivf"
+ "av1-1-b10-00-quantizer-19.ivf.md5"
+ "av1-1-b10-00-quantizer-20.ivf"
+ "av1-1-b10-00-quantizer-20.ivf.md5"
+ "av1-1-b10-00-quantizer-21.ivf"
+ "av1-1-b10-00-quantizer-21.ivf.md5"
+ "av1-1-b10-00-quantizer-22.ivf"
+ "av1-1-b10-00-quantizer-22.ivf.md5"
+ "av1-1-b10-00-quantizer-23.ivf"
+ "av1-1-b10-00-quantizer-23.ivf.md5"
+ "av1-1-b10-00-quantizer-24.ivf"
+ "av1-1-b10-00-quantizer-24.ivf.md5"
+ "av1-1-b10-00-quantizer-25.ivf"
+ "av1-1-b10-00-quantizer-25.ivf.md5"
+ "av1-1-b10-00-quantizer-26.ivf"
+ "av1-1-b10-00-quantizer-26.ivf.md5"
+ "av1-1-b10-00-quantizer-27.ivf"
+ "av1-1-b10-00-quantizer-27.ivf.md5"
+ "av1-1-b10-00-quantizer-28.ivf"
+ "av1-1-b10-00-quantizer-28.ivf.md5"
+ "av1-1-b10-00-quantizer-29.ivf"
+ "av1-1-b10-00-quantizer-29.ivf.md5"
+ "av1-1-b10-00-quantizer-30.ivf"
+ "av1-1-b10-00-quantizer-30.ivf.md5"
+ "av1-1-b10-00-quantizer-31.ivf"
+ "av1-1-b10-00-quantizer-31.ivf.md5"
+ "av1-1-b10-00-quantizer-32.ivf"
+ "av1-1-b10-00-quantizer-32.ivf.md5"
+ "av1-1-b10-00-quantizer-33.ivf"
+ "av1-1-b10-00-quantizer-33.ivf.md5"
+ "av1-1-b10-00-quantizer-34.ivf"
+ "av1-1-b10-00-quantizer-34.ivf.md5"
+ "av1-1-b10-00-quantizer-35.ivf"
+ "av1-1-b10-00-quantizer-35.ivf.md5"
+ "av1-1-b10-00-quantizer-36.ivf"
+ "av1-1-b10-00-quantizer-36.ivf.md5"
+ "av1-1-b10-00-quantizer-37.ivf"
+ "av1-1-b10-00-quantizer-37.ivf.md5"
+ "av1-1-b10-00-quantizer-38.ivf"
+ "av1-1-b10-00-quantizer-38.ivf.md5"
+ "av1-1-b10-00-quantizer-39.ivf"
+ "av1-1-b10-00-quantizer-39.ivf.md5"
+ "av1-1-b10-00-quantizer-40.ivf"
+ "av1-1-b10-00-quantizer-40.ivf.md5"
+ "av1-1-b10-00-quantizer-41.ivf"
+ "av1-1-b10-00-quantizer-41.ivf.md5"
+ "av1-1-b10-00-quantizer-42.ivf"
+ "av1-1-b10-00-quantizer-42.ivf.md5"
+ "av1-1-b10-00-quantizer-43.ivf"
+ "av1-1-b10-00-quantizer-43.ivf.md5"
+ "av1-1-b10-00-quantizer-44.ivf"
+ "av1-1-b10-00-quantizer-44.ivf.md5"
+ "av1-1-b10-00-quantizer-45.ivf"
+ "av1-1-b10-00-quantizer-45.ivf.md5"
+ "av1-1-b10-00-quantizer-46.ivf"
+ "av1-1-b10-00-quantizer-46.ivf.md5"
+ "av1-1-b10-00-quantizer-47.ivf"
+ "av1-1-b10-00-quantizer-47.ivf.md5"
+ "av1-1-b10-00-quantizer-48.ivf"
+ "av1-1-b10-00-quantizer-48.ivf.md5"
+ "av1-1-b10-00-quantizer-49.ivf"
+ "av1-1-b10-00-quantizer-49.ivf.md5"
+ "av1-1-b10-00-quantizer-50.ivf"
+ "av1-1-b10-00-quantizer-50.ivf.md5"
+ "av1-1-b10-00-quantizer-51.ivf"
+ "av1-1-b10-00-quantizer-51.ivf.md5"
+"av1-1-b10-00-quantizer-52.ivf"
+ "av1-1-b10-00-quantizer-52.ivf.md5"
+ "av1-1-b10-00-quantizer-53.ivf"
+ "av1-1-b10-00-quantizer-53.ivf.md5"
+ "av1-1-b10-00-quantizer-54.ivf"
+ "av1-1-b10-00-quantizer-54.ivf.md5"
+ "av1-1-b10-00-quantizer-55.ivf"
+ "av1-1-b10-00-quantizer-55.ivf.md5"
+ "av1-1-b10-00-quantizer-56.ivf"
+ "av1-1-b10-00-quantizer-56.ivf.md5"
+ "av1-1-b10-00-quantizer-57.ivf"
+ "av1-1-b10-00-quantizer-57.ivf.md5"
+ "av1-1-b10-00-quantizer-58.ivf"
+ "av1-1-b10-00-quantizer-58.ivf.md5"
+ "av1-1-b10-00-quantizer-59.ivf"
+ "av1-1-b10-00-quantizer-59.ivf.md5"
+ "av1-1-b10-00-quantizer-60.ivf"
+ "av1-1-b10-00-quantizer-60.ivf.md5"
+ "av1-1-b10-00-quantizer-61.ivf"
+ "av1-1-b10-00-quantizer-61.ivf.md5"
+ "av1-1-b10-00-quantizer-62.ivf"
+ "av1-1-b10-00-quantizer-62.ivf.md5"
+ "av1-1-b10-00-quantizer-63.ivf"
+ "av1-1-b10-00-quantizer-63.ivf.md5"
+ "av1-1-b10-23-film_grain-50.ivf"
+ "av1-1-b10-23-film_grain-50.ivf.md5"
+ "av1-1-b10-24-monochrome.ivf"
+ "av1-1-b10-24-monochrome.ivf.md5"
+ "av1-1-b8-01-size-16x16.ivf"
+ "av1-1-b8-01-size-16x16.ivf.md5"
+ "av1-1-b8-01-size-16x18.ivf"
+ "av1-1-b8-01-size-16x18.ivf.md5"
+ "av1-1-b8-01-size-16x32.ivf"
+ "av1-1-b8-01-size-16x32.ivf.md5"
+ "av1-1-b8-01-size-16x34.ivf"
+ "av1-1-b8-01-size-16x34.ivf.md5"
+ "av1-1-b8-01-size-16x64.ivf"
+ "av1-1-b8-01-size-16x64.ivf.md5"
+ "av1-1-b8-01-size-16x66.ivf"
+ "av1-1-b8-01-size-16x66.ivf.md5"
+ "av1-1-b8-01-size-18x16.ivf"
+ "av1-1-b8-01-size-18x16.ivf.md5"
+ "av1-1-b8-01-size-18x18.ivf"
+ "av1-1-b8-01-size-18x18.ivf.md5"
+ "av1-1-b8-01-size-18x32.ivf"
+ "av1-1-b8-01-size-18x32.ivf.md5"
+ "av1-1-b8-01-size-18x34.ivf"
+ "av1-1-b8-01-size-18x34.ivf.md5"
+ "av1-1-b8-01-size-18x64.ivf"
+ "av1-1-b8-01-size-18x64.ivf.md5"
+ "av1-1-b8-01-size-18x66.ivf"
+ "av1-1-b8-01-size-18x66.ivf.md5"
+ "av1-1-b8-01-size-196x196.ivf"
+ "av1-1-b8-01-size-196x196.ivf.md5"
+ "av1-1-b8-01-size-196x198.ivf"
+ "av1-1-b8-01-size-196x198.ivf.md5"
+ "av1-1-b8-01-size-196x200.ivf"
+ "av1-1-b8-01-size-196x200.ivf.md5"
+ "av1-1-b8-01-size-196x202.ivf"
+ "av1-1-b8-01-size-196x202.ivf.md5"
+ "av1-1-b8-01-size-196x208.ivf"
+ "av1-1-b8-01-size-196x208.ivf.md5"
+ "av1-1-b8-01-size-196x210.ivf"
+ "av1-1-b8-01-size-196x210.ivf.md5"
+ "av1-1-b8-01-size-196x224.ivf"
+ "av1-1-b8-01-size-196x224.ivf.md5"
+ "av1-1-b8-01-size-196x226.ivf"
+ "av1-1-b8-01-size-196x226.ivf.md5"
+ "av1-1-b8-01-size-198x196.ivf"
+ "av1-1-b8-01-size-198x196.ivf.md5"
+ "av1-1-b8-01-size-198x198.ivf"
+ "av1-1-b8-01-size-198x198.ivf.md5"
+ "av1-1-b8-01-size-198x200.ivf"
+ "av1-1-b8-01-size-198x200.ivf.md5"
+ "av1-1-b8-01-size-198x202.ivf"
+ "av1-1-b8-01-size-198x202.ivf.md5"
+ "av1-1-b8-01-size-198x208.ivf"
+ "av1-1-b8-01-size-198x208.ivf.md5"
+ "av1-1-b8-01-size-198x210.ivf"
+ "av1-1-b8-01-size-198x210.ivf.md5"
+ "av1-1-b8-01-size-198x224.ivf"
+ "av1-1-b8-01-size-198x224.ivf.md5"
+ "av1-1-b8-01-size-198x226.ivf"
+ "av1-1-b8-01-size-198x226.ivf.md5"
+ "av1-1-b8-01-size-200x196.ivf"
+ "av1-1-b8-01-size-200x196.ivf.md5"
+ "av1-1-b8-01-size-200x198.ivf"
+ "av1-1-b8-01-size-200x198.ivf.md5"
+ "av1-1-b8-01-size-200x200.ivf"
+ "av1-1-b8-01-size-200x200.ivf.md5"
+ "av1-1-b8-01-size-200x202.ivf"
+ "av1-1-b8-01-size-200x202.ivf.md5"
+ "av1-1-b8-01-size-200x208.ivf"
+ "av1-1-b8-01-size-200x208.ivf.md5"
+ "av1-1-b8-01-size-200x210.ivf"
+ "av1-1-b8-01-size-200x210.ivf.md5"
+ "av1-1-b8-01-size-200x224.ivf"
+ "av1-1-b8-01-size-200x224.ivf.md5"
+ "av1-1-b8-01-size-200x226.ivf"
+ "av1-1-b8-01-size-200x226.ivf.md5"
+ "av1-1-b8-01-size-202x196.ivf"
+ "av1-1-b8-01-size-202x196.ivf.md5"
+ "av1-1-b8-01-size-202x198.ivf"
+ "av1-1-b8-01-size-202x198.ivf.md5"
+ "av1-1-b8-01-size-202x200.ivf"
+ "av1-1-b8-01-size-202x200.ivf.md5"
+ "av1-1-b8-01-size-202x202.ivf"
+ "av1-1-b8-01-size-202x202.ivf.md5"
+ "av1-1-b8-01-size-202x208.ivf"
+ "av1-1-b8-01-size-202x208.ivf.md5"
+ "av1-1-b8-01-size-202x210.ivf"
+ "av1-1-b8-01-size-202x210.ivf.md5"
+ "av1-1-b8-01-size-202x224.ivf"
+ "av1-1-b8-01-size-202x224.ivf.md5"
+ "av1-1-b8-01-size-202x226.ivf"
+ "av1-1-b8-01-size-202x226.ivf.md5"
+ "av1-1-b8-01-size-208x196.ivf"
+ "av1-1-b8-01-size-208x196.ivf.md5"
+ "av1-1-b8-01-size-208x198.ivf"
+ "av1-1-b8-01-size-208x198.ivf.md5"
+ "av1-1-b8-01-size-208x200.ivf"
+ "av1-1-b8-01-size-208x200.ivf.md5"
+ "av1-1-b8-01-size-208x202.ivf"
+ "av1-1-b8-01-size-208x202.ivf.md5"
+ "av1-1-b8-01-size-208x208.ivf"
+ "av1-1-b8-01-size-208x208.ivf.md5"
+ "av1-1-b8-01-size-208x210.ivf"
+ "av1-1-b8-01-size-208x210.ivf.md5"
+ "av1-1-b8-01-size-208x224.ivf"
+ "av1-1-b8-01-size-208x224.ivf.md5"
+ "av1-1-b8-01-size-208x226.ivf"
+ "av1-1-b8-01-size-208x226.ivf.md5"
+ "av1-1-b8-01-size-210x196.ivf"
+ "av1-1-b8-01-size-210x196.ivf.md5"
+ "av1-1-b8-01-size-210x198.ivf"
+ "av1-1-b8-01-size-210x198.ivf.md5"
+ "av1-1-b8-01-size-210x200.ivf"
+ "av1-1-b8-01-size-210x200.ivf.md5"
+ "av1-1-b8-01-size-210x202.ivf"
+ "av1-1-b8-01-size-210x202.ivf.md5"
+ "av1-1-b8-01-size-210x208.ivf"
+ "av1-1-b8-01-size-210x208.ivf.md5"
+ "av1-1-b8-01-size-210x210.ivf"
+ "av1-1-b8-01-size-210x210.ivf.md5"
+ "av1-1-b8-01-size-210x224.ivf"
+ "av1-1-b8-01-size-210x224.ivf.md5"
+ "av1-1-b8-01-size-210x226.ivf"
+ "av1-1-b8-01-size-210x226.ivf.md5"
+ "av1-1-b8-01-size-224x196.ivf"
+ "av1-1-b8-01-size-224x196.ivf.md5"
+ "av1-1-b8-01-size-224x198.ivf"
+ "av1-1-b8-01-size-224x198.ivf.md5"
+ "av1-1-b8-01-size-224x200.ivf"
+ "av1-1-b8-01-size-224x200.ivf.md5"
+ "av1-1-b8-01-size-224x202.ivf"
+ "av1-1-b8-01-size-224x202.ivf.md5"
+ "av1-1-b8-01-size-224x208.ivf"
+ "av1-1-b8-01-size-224x208.ivf.md5"
+ "av1-1-b8-01-size-224x210.ivf"
+ "av1-1-b8-01-size-224x210.ivf.md5"
+ "av1-1-b8-01-size-224x224.ivf"
+ "av1-1-b8-01-size-224x224.ivf.md5"
+ "av1-1-b8-01-size-224x226.ivf"
+ "av1-1-b8-01-size-224x226.ivf.md5"
+ "av1-1-b8-01-size-226x196.ivf"
+ "av1-1-b8-01-size-226x196.ivf.md5"
+ "av1-1-b8-01-size-226x198.ivf"
+ "av1-1-b8-01-size-226x198.ivf.md5"
+ "av1-1-b8-01-size-226x200.ivf"
+ "av1-1-b8-01-size-226x200.ivf.md5"
+ "av1-1-b8-01-size-226x202.ivf"
+ "av1-1-b8-01-size-226x202.ivf.md5"
+ "av1-1-b8-01-size-226x208.ivf"
+ "av1-1-b8-01-size-226x208.ivf.md5"
+ "av1-1-b8-01-size-226x210.ivf"
+ "av1-1-b8-01-size-226x210.ivf.md5"
+ "av1-1-b8-01-size-226x224.ivf"
+ "av1-1-b8-01-size-226x224.ivf.md5"
+ "av1-1-b8-01-size-226x226.ivf"
+ "av1-1-b8-01-size-226x226.ivf.md5"
+ "av1-1-b8-01-size-32x16.ivf"
+ "av1-1-b8-01-size-32x16.ivf.md5"
+ "av1-1-b8-01-size-32x18.ivf"
+ "av1-1-b8-01-size-32x18.ivf.md5"
+ "av1-1-b8-01-size-32x32.ivf"
+ "av1-1-b8-01-size-32x32.ivf.md5"
+ "av1-1-b8-01-size-32x34.ivf"
+ "av1-1-b8-01-size-32x34.ivf.md5"
+ "av1-1-b8-01-size-32x64.ivf"
+ "av1-1-b8-01-size-32x64.ivf.md5"
+ "av1-1-b8-01-size-32x66.ivf"
+ "av1-1-b8-01-size-32x66.ivf.md5"
+ "av1-1-b8-01-size-34x16.ivf"
+ "av1-1-b8-01-size-34x16.ivf.md5"
+ "av1-1-b8-01-size-34x18.ivf"
+ "av1-1-b8-01-size-34x18.ivf.md5"
+ "av1-1-b8-01-size-34x32.ivf"
+ "av1-1-b8-01-size-34x32.ivf.md5"
+ "av1-1-b8-01-size-34x34.ivf"
+ "av1-1-b8-01-size-34x34.ivf.md5"
+ "av1-1-b8-01-size-34x64.ivf"
+ "av1-1-b8-01-size-34x64.ivf.md5"
+ "av1-1-b8-01-size-34x66.ivf"
+ "av1-1-b8-01-size-34x66.ivf.md5"
+ "av1-1-b8-01-size-64x16.ivf"
+ "av1-1-b8-01-size-64x16.ivf.md5"
+ "av1-1-b8-01-size-64x18.ivf"
+ "av1-1-b8-01-size-64x18.ivf.md5"
+ "av1-1-b8-01-size-64x32.ivf"
+ "av1-1-b8-01-size-64x32.ivf.md5"
+ "av1-1-b8-01-size-64x34.ivf"
+ "av1-1-b8-01-size-64x34.ivf.md5"
+ "av1-1-b8-01-size-64x64.ivf"
+ "av1-1-b8-01-size-64x64.ivf.md5"
+ "av1-1-b8-01-size-64x66.ivf"
+ "av1-1-b8-01-size-64x66.ivf.md5"
+ "av1-1-b8-01-size-66x16.ivf"
+ "av1-1-b8-01-size-66x16.ivf.md5"
+ "av1-1-b8-01-size-66x18.ivf"
+ "av1-1-b8-01-size-66x18.ivf.md5"
+ "av1-1-b8-01-size-66x32.ivf"
+ "av1-1-b8-01-size-66x32.ivf.md5"
+ "av1-1-b8-01-size-66x34.ivf"
+ "av1-1-b8-01-size-66x34.ivf.md5"
+ "av1-1-b8-01-size-66x64.ivf"
+ "av1-1-b8-01-size-66x64.ivf.md5"
+ "av1-1-b8-01-size-66x66.ivf"
+ "av1-1-b8-01-size-66x66.ivf.md5"
+ "av1-1-b8-02-allintra.ivf"
+ "av1-1-b8-02-allintra.ivf.md5"
+ "av1-1-b8-03-sizeup.mkv"
+ "av1-1-b8-03-sizeup.mkv.md5"
+ "av1-1-b8-03-sizedown.mkv"
+ "av1-1-b8-03-sizedown.mkv.md5"
+ "av1-1-b8-04-cdfupdate.ivf"
+ "av1-1-b8-04-cdfupdate.ivf.md5"
+ "av1-1-b8-05-mv.ivf"
+ "av1-1-b8-05-mv.ivf.md5"
+ "av1-1-b8-06-mfmv.ivf"
+ "av1-1-b8-06-mfmv.ivf.md5"
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf"
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf.md5"
+ "av1-1-b8-22-svc-L2T1.ivf"
+ "av1-1-b8-22-svc-L2T1.ivf.md5"
+ "av1-1-b8-22-svc-L1T2.ivf"
+ "av1-1-b8-22-svc-L1T2.ivf.md5"
+ "av1-1-b8-22-svc-L2T2.ivf"
+ "av1-1-b8-22-svc-L2T2.ivf.md5"
+ "av1-1-b8-23-film_grain-50.ivf"
+ "av1-1-b8-23-film_grain-50.ivf.md5"
+ "av1-1-b8-24-monochrome.ivf"
+ "av1-1-b8-24-monochrome.ivf.md5"
+ "invalid-bug-1814.ivf"
+ "invalid-bug-1814.ivf.res"
+ "invalid-chromium-906381.ivf"
+ "invalid-chromium-906381.ivf.res"
+ "invalid-google-142530197-1.ivf"
+ "invalid-google-142530197-1.ivf.res"
+ "invalid-google-142530197.ivf"
+ "invalid-google-142530197.ivf.res"
+ "invalid-oss-fuzz-10061.ivf"
+ "invalid-oss-fuzz-10061.ivf.res"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res"
+ "invalid-oss-fuzz-10227.ivf"
+ "invalid-oss-fuzz-10227.ivf.res"
+ "invalid-oss-fuzz-10389.ivf"
+ "invalid-oss-fuzz-10389.ivf.res.4"
+ "invalid-oss-fuzz-10555.ivf"
+ "invalid-oss-fuzz-10555.ivf.res"
+ "invalid-oss-fuzz-10705.ivf"
+ "invalid-oss-fuzz-10705.ivf.res"
+ "invalid-oss-fuzz-10723.ivf"
+ "invalid-oss-fuzz-10723.ivf.res.2"
+ "invalid-oss-fuzz-10723.ivf.res.3"
+ "invalid-oss-fuzz-10779.ivf"
+ "invalid-oss-fuzz-10779.ivf.res"
+ "invalid-oss-fuzz-11477.ivf"
+ "invalid-oss-fuzz-11477.ivf.res"
+ "invalid-oss-fuzz-11479.ivf"
+ "invalid-oss-fuzz-11479.ivf.res.2"
+ "invalid-oss-fuzz-11523.ivf"
+ "invalid-oss-fuzz-11523.ivf.res.2"
+ "invalid-oss-fuzz-15363.ivf"
+ "invalid-oss-fuzz-15363.ivf.res"
+ "invalid-oss-fuzz-16437.ivf"
+ "invalid-oss-fuzz-16437.ivf.res.2"
+ "invalid-oss-fuzz-24706.ivf"
+ "invalid-oss-fuzz-24706.ivf.res"
+ "invalid-oss-fuzz-33030.ivf"
+ "invalid-oss-fuzz-33030.ivf.res"
+ "invalid-oss-fuzz-9288.ivf"
+ "invalid-oss-fuzz-9288.ivf.res"
+ "invalid-oss-fuzz-9463.ivf"
+ "invalid-oss-fuzz-9463.ivf.res.2"
+ "invalid-oss-fuzz-9482.ivf"
+ "invalid-oss-fuzz-9482.ivf.res"
+ "invalid-oss-fuzz-9720.ivf"
+ "invalid-oss-fuzz-9720.ivf.res")
+endif()
+
+# Additional .yuv inputs, appended only when both ENABLE_ENCODE_PERF_TESTS and
+# CONFIG_AV1_ENCODER are set.
+if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+  list(APPEND AOM_TEST_DATA_FILE_NAMES
+       "desktop_640_360_30.yuv"
+       "kirland_640_480_30.yuv"
+       "macmarcomoving_640_480_30.yuv"
+       "macmarcostationary_640_480_30.yuv"
+       "niklas_1280_720_30.yuv"
+       "tacomanarrows_640_480_30.yuv"
+       "tacomasmallcameramovement_640_480_30.yuv"
+       "thaloundeskmtg_640_480_30.yuv")
+endif()
+
+# Parses a checksum listing such as test/test-data.sha1, where each line is
+# "<checksum> *<file name>", and returns two parallel lists restricted to the
+# files named in AOM_TEST_DATA_FILE_NAMES.
+#
+# test_data_file: path of the listing; an empty or nonexistent path is fatal.
+# out_files:      name of the caller variable that receives the file names.
+# out_checksums:  name of the caller variable that receives the checksums.
+#
+# Stops with FATAL_ERROR when no listed file is needed or when the two lists
+# end up with different lengths.
+function(make_test_data_lists test_data_file out_files out_checksums)
+  if(NOT test_data_file OR NOT EXISTS "${test_data_file}")
+    message(FATAL_ERROR "Test info file missing or empty (${test_data_file})")
+  endif()
+
+  # Variables of the calling scope are visible inside a function, so start from
+  # known-empty accumulators instead of whatever the caller may have defined.
+  set(checksums "")
+  set(filenames "")
+
+  # Read $test_data_file into $files_and_checksums, one list entry per line.
+  file(STRINGS "${test_data_file}" files_and_checksums)
+
+  foreach(line ${files_and_checksums})
+    # " *" separates the checksum from the file name.
+    string(FIND "${line}" " *" delim_pos)
+
+    # A line without the separator has no usable fields; slicing it with a
+    # position of -1 would only produce bogus values, so it is ignored.
+    if(NOT delim_pos EQUAL -1)
+      math(EXPR filename_pos "${delim_pos} + 2")
+      string(SUBSTRING "${line}" 0 ${delim_pos} checksum)
+      string(SUBSTRING "${line}" ${filename_pos} -1 filename)
+
+      # Quoted so that an empty name still forms a valid list(FIND) call.
+      list(FIND AOM_TEST_DATA_FILE_NAMES "${filename}" list_index)
+      if(NOT list_index EQUAL -1)
+        # Include the name and checksum in output only when the file is needed.
+        list(APPEND checksums ${checksum})
+        list(APPEND filenames ${filename})
+      endif()
+    endif()
+  endforeach()
+
+  # Compare lengths rather than testing the list values for truthiness.
+  list(LENGTH filenames num_files)
+  list(LENGTH checksums num_checksums)
+  if(num_files EQUAL 0 OR NOT num_files EQUAL num_checksums)
+    message(FATAL_ERROR "Parsing of ${test_data_file} failed.")
+  endif()
+
+  set(${out_checksums} ${checksums} PARENT_SCOPE)
+  set(${out_files} ${filenames} PARENT_SCOPE)
+endfunction()
+
+# Prefixes every file name in the list variable named by $test_files with
+# "$test_dir/" and stores the resulting list in the caller variable named by
+# $out_path_list.
+function(expand_test_file_paths test_files test_dir out_path_list)
+  # Start empty: a path_list variable visible from the calling scope would
+  # otherwise be carried into the result.
+  set(path_list "")
+  foreach(filename ${${test_files}})
+    list(APPEND path_list "${test_dir}/${filename}")
+  endforeach()
+  set(${out_path_list} ${path_list} PARENT_SCOPE)
+endfunction()
+
+# Reports whether $local_path has to be (re)fetched. The caller variable named
+# by $out_needs_update is set to 1 when the file does not exist or its SHA1
+# differs from $expected_checksum; otherwise it is unset and an "up to date"
+# message is printed.
+function(check_file local_path expected_checksum out_needs_update)
+  # Nothing on disk: there is no hash to compare.
+  if(NOT EXISTS "${local_path}")
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+
+  file(SHA1 "${local_path}" actual_checksum)
+  if(NOT "${actual_checksum}" STREQUAL "${expected_checksum}")
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+
+  unset(${out_needs_update} PARENT_SCOPE)
+  message("${local_path} up to date.")
+endfunction()
+
+# Fetches $file_url into $local_path. The SHA1 $file_checksum is handed to
+# file(DOWNLOAD) as EXPECTED_HASH so the transfer is checked against it.
+# Progress messages are printed before and after the transfer.
+function(download_test_file file_url file_checksum local_path)
+  set(expected_hash "SHA1=${file_checksum}")
+
+  message("Downloading ${file_url} ...")
+  file(DOWNLOAD "${file_url}" "${local_path}"
+       SHOW_PROGRESS
+       EXPECTED_HASH "${expected_hash}")
+  message("Download of ${file_url} complete.")
+endfunction()
diff --git a/third_party/aom/test/test_intra_pred_speed.cc b/third_party/aom/test/test_intra_pred_speed.cc
new file mode 100644
index 0000000000..d5c94be092
--- /dev/null
+++ b/third_party/aom/test/test_intra_pred_speed.cc
@@ -0,0 +1,1742 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Test and time AOM intra-predictor functions
+
+#include <stdio.h>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/md5_helper.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests (print the timing and compare each MD5 digest against the
+//    expected signature).
+// 0: Generate MD5 array as required (only print the computed digests, as
+//    quoted strings).
+#define APPLY_UNIT_TESTS 1
+
+// Signature shared by the 8-bit intra predictors exercised in this file.
+typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left);
+
+// Largest block dimension accepted by IntraPredTestMem::Init; also the buffer
+// stride used for blocks with a dimension above 32.
+const int kBPS = 64;
+// Capacity, in pixels, of the src/ref_src test buffers.
+const int kTotalPixels = kBPS * kBPS;
+// 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
+const int kNumAv1IntraFuncs = 10;
+
+#if APPLY_UNIT_TESTS
+// Printable predictor names. CheckMd5Signature indexes this array with the
+// same index that selects the predictor function and its expected signature.
+const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
+ "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
+ "H_PRED", "PAETH_PRED", "SMOOTH_PRED", "SMOOTH_V_PRED", "SMOOTH_H_PRED",
+};
+#endif // APPLY_UNIT_TESTS
+
+// Scratch buffers for one intra-prediction test: a destination block (src), a
+// reference copy used to reset it (ref_src), and the left/above input pixels.
+template <typename Pixel>
+struct IntraPredTestMem {
+  // Picks the working stride for the given block size and fills ref_src, left
+  // and above with deterministic pseudo-random values masked to |bd| bits.
+  // The order of the random draws must not change: the buffers feed the
+  // predictors whose output is hashed and compared with recorded signatures.
+  void Init(int block_width, int block_height, int bd) {
+    ASSERT_LE(block_width, kBPS);
+    ASSERT_LE(block_height, kBPS);
+    // Note: blocks that fit in 32x32 keep a 32-pixel stride (32x32 random
+    // pixels, as before) so that the existing hashes need no recalculation.
+    const bool fits_in_32x32 = block_width <= 32 && block_height <= 32;
+    stride = fits_in_32x32 ? 32 : kBPS;
+    num_pixels = stride * stride;
+    // Offset into the backing array so that negative indices (above[-1]) stay
+    // inside above_mem.
+    above = above_mem + 16;
+
+    libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
+    const int mask = (1 << bd) - 1;
+
+    Pixel *const ref_end = ref_src + num_pixels;
+    for (Pixel *p = ref_src; p != ref_end; ++p) *p = rnd.Rand16() & mask;
+    for (int row = 0; row < stride; ++row) left[row] = rnd.Rand16() & mask;
+    // above[-1] is filled as well.
+    for (int col = -1; col < stride; ++col) above[col] = rnd.Rand16() & mask;
+
+    // Second half of both edges, drawn alternately (left first, then above).
+    for (int pos = stride; pos < 2 * stride; ++pos) {
+      left[pos] = rnd.Rand16() & mask;
+      above[pos] = rnd.Rand16() & mask;
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+  Pixel *above;    // Points 16 entries into above_mem (set by Init).
+  int stride;      // 32 or kBPS, chosen by Init.
+  int num_pixels;  // stride * stride.
+
+ private:
+  // Backing storage for |above|, with 16 leading entries of headroom.
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+// -----------------------------------------------------------------------------
+// Low Bitdepth
+
+// Test buffers instantiated for uint8_t pixels.
+typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
+
+// Block-size labels indexed by TX_SIZE; used to build the name printed by
+// CheckMd5Signature.
+// NOTE(review): the order is assumed to mirror the TX_SIZE enum, which is
+// declared outside this file -- confirm if that enum changes.
+static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
+ "4X4", "8X8", "16X16", "32X32", "64X64", "4X8", "8X4",
+ "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
+ "16X4", "8X32", "32X8", "16X64", "64X16",
+};
+
+// Hashes |data_size| bytes at |data| with MD5. With APPLY_UNIT_TESTS enabled,
+// the digest and |elapsed_time| are printed and the digest is compared with
+// signatures[idx]; otherwise only the digest is printed, as a quoted string
+// followed by a comma.
+void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
+                       const char *const signatures[], const void *data,
+                       size_t data_size, int elapsed_time, int idx) {
+  // Label of the form "[Hbd ]Intra<size>".
+  std::string name_str(is_hbd ? "Hbd " : "");
+  name_str += "Intra";
+  name_str += kTxSizeStrings[tx_size];
+
+  libaom_test::MD5 md5;
+  md5.Add(static_cast<const uint8_t *>(data), data_size);
+#if APPLY_UNIT_TESTS
+  printf("Mode %s[%13s]: %5d ms MD5: %s\n", name_str.c_str(),
+         kAv1IntraPredNames[idx], elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+#else
+  // Generation mode: these inputs are intentionally unused.
+  (void)idx;
+  (void)elapsed_time;
+  (void)signatures;
+  printf("\"%s\",\n", md5.Get());
+#endif
+}
+
+// Runs and times every non-null predictor in |pred_funcs| on a |tx_size| block
+// of 8-bit pixels, then passes the output buffer, the elapsed time (in ms) and
+// |signatures| to CheckMd5Signature.
+void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
+                   const char *const signatures[]) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  // The iteration count is inversely proportional to the block area, so the
+  // total number of predicted pixels is about the same for every block size.
+  const int pixels_per_pass = width * height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / pixels_per_pass);
+  Av1IntraPredTestMem mem;
+  mem.Init(width, height, 8);
+
+  for (int func_idx = 0; func_idx < kNumAv1IntraFuncs; ++func_idx) {
+    const AvxPredFunc predict = pred_funcs[func_idx];
+    if (predict == nullptr) continue;  // Nothing to test in this slot.
+
+    // Every predictor starts from the same reference contents.
+    memcpy(mem.src, mem.ref_src, sizeof(mem.src));
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int iter = 0; iter < kNumTests; ++iter) {
+      predict(mem.src, mem.stride, mem.above, mem.left);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int elapsed_ms =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    // Hash the stride x stride region selected by Init.
+    const size_t hashed_bytes = mem.num_pixels * sizeof(*mem.src);
+    CheckMd5Signature(tx_size, false, signatures, mem.src, hashed_bytes,
+                      elapsed_ms, func_idx);
+  }
+}
+
+// Expected MD5 digests, indexed as kSignatures[tx_size][predictor]. Each row is
+// passed to TestIntraPred via INTRA_PRED_TEST, and its entries are compared in
+// the same order as kAv1IntraPredNames. When a predictor's output changes,
+// build with APPLY_UNIT_TESTS set to 0 to print replacement digests.
+static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+ {
+ // 4X4
+ "e7ed7353c3383fff942e500e9bfe82fe",
+ "2a4a26fcc6ce005eadc08354d196c8a9",
+ "269d92eff86f315d9c38fe7640d85b15",
+ "ae2960eea9f71ee3dabe08b282ec1773",
+ "6c1abcc44e90148998b51acd11144e9c",
+ "f7bb3186e1ef8a2b326037ff898cad8e",
+ "59fc0e923a08cfac0a493fb38988e2bb",
+ "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+ "de6937fca02354f2874dbc5dbec5d5b3",
+ "723cf948137f7d8c7860d814e55ae67d",
+ },
+ {
+ // 8X8
+ "d8bbae5d6547cfc17e4f5f44c8730e88",
+ "373bab6d931868d41a601d9d88ce9ac3",
+ "6fdd5ff4ff79656c14747598ca9e3706",
+ "d9661c2811d6a73674f40ffb2b841847",
+ "7c722d10b19ccff0b8c171868e747385",
+ "f81dd986eb2b50f750d3a7da716b7e27",
+ "064404361748dd111a890a1470d7f0ea",
+ "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
+ "97111eb1bc26bade6272015df829f1ae",
+ "d19a8a73cc46b807f2c5e817576cc1e1",
+ },
+ {
+ // 16X16
+ "50971c07ce26977d30298538fffec619",
+ "527a6b9e0dc5b21b98cf276305432bef",
+ "7eff2868f80ebc2c43a4f367281d80f7",
+ "67cd60512b54964ef6aff1bd4816d922",
+ "48371c87dc95c08a33b2048f89cf6468",
+ "b0acf2872ee411d7530af6d2625a7084",
+ "93d6b5352b571805ab16a55e1bbed86a",
+ "03764e4c0aebbc180e4e2c68fb06df2b",
+ "bb6c74c9076c9f266ab11fb57060d8e6",
+ "0c5162bc28489756ddb847b5678e6f07",
+ },
+ {
+ // 32X32
+ "a0a618c900e65ae521ccc8af789729f2",
+ "985aaa7c72b4a6c2fb431d32100cf13a",
+ "10662d09febc3ca13ee4e700120daeb5",
+ "b3b01379ba08916ef6b1b35f7d9ad51c",
+ "9f4261755795af97e34679c333ec7004",
+ "bc2c9da91ad97ef0d1610fb0a9041657",
+ "ef1653982b69e1f64bee3759f3e1ec45",
+ "1a51a675deba2c83282142eb48d3dc3d",
+ "866c224746dc260cda861a7b1b383fb3",
+ "cea23799fc3526e1b6a6ff02b42b82af",
+ },
+ {
+ // 64X64
+ "6e1094fa7b50bc813aa2ba29f5df8755",
+ "afe020786b83b793c2bbd9468097ff6e",
+ "be91585259bc37bf4dc1651936e90b3e",
+ "a1650dbcd56e10288c3e269eca37967d",
+ "9e5c34f3797e0cdd3cd9d4c05b0d8950",
+ "bc87be7ac899cc6a28f399d7516c49fe",
+ "9811fd0d2dd515f06122f5d1bd18b784",
+ "3c140e466f2c2c0d9cb7d2157ab8dc27",
+ "9543de76c925a8f6adc884cc7f98dc91",
+ "df1df0376cc944afe7e74e94f53e575a",
+ },
+ {
+ // 4X8
+ "d9fbebdc85f71ab1e18461b2db4a2adc",
+ "5ccb2a68284bc9714d94b8a06ccadbb2",
+ "735d059abc2744f3ff3f9590f7191b37",
+ "d9fbebdc85f71ab1e18461b2db4a2adc",
+ "6819497c44cd0ace120add83672996ee",
+ "7e3244f5a2d3edf81c7e962a842b97f9",
+ "809350f164cd4d1650850bb0f59c3260",
+ "1b60a394331eeab6927a6f8aaff57040",
+ "5307de1bd7329ba6b281d2c1b0b457f9",
+ "24c58a8138339846d95568efb91751db",
+ },
+ {
+ // 8X4
+ "23f9fc11344426c9bee2e06d57dfd628",
+ "2d71a26d1bae1fb34734de7b42fc5eb7",
+ "5af9c1b2fd9d5721fad67b67b3f7c816",
+ "00d71b17be662753813d515f197d145e",
+ "bef10ec984427e28f4390f43809d10af",
+ "77773cdfb7ed6bc882ab202a64b0a470",
+ "2cc48bd66d6b0121b5221d52ccd732af",
+ "b302155e1c9eeeafe2ba2bf68e807a46",
+ "561bc8d0e76d5041ebd5168fc6a115e1",
+ "81d0113fb1d0a9a24ffd6f1987b77948",
+ },
+ {
+ // 8X16
+ "c849de88b24f773dfcdd1d48d1209796",
+ "6cb807c1897b94866a0f3d3c56ed8695",
+ "d56db05a8ac7981762f5b877f486c4ef",
+ "b4bc01eb6e59a40922ad17715cafb04b",
+ "09d178439534f4062ae687c351f66d64",
+ "644501399cf73080ac606e5cef7ca09b",
+ "278076495180e17c065a95ab7278539a",
+ "9dd7f324816f242be408ffeb0c673732",
+ "f520c4a20acfa0bea1d253c6f0f040fd",
+ "85f38df809df2c2d7c8b4a157a65cd44",
+ },
+ {
+ // 16X8
+ "b4cbdbdf10ce13300b4063a3daf99e04",
+ "3731e1e6202064a9d0604d7c293ecee4",
+ "6c856188c4256a06452f0d5d70cac436",
+ "1f2192b4c8c497589484ea7bf9c944e8",
+ "84011bd4b7f565119d06787840e333a0",
+ "0e48949f7a6aa36f0d76b5d01f91124a",
+ "60eff8064634b6c73b10681356baeee9",
+ "1559aeb081a9c0c71111d6093c2ff9fd",
+ "c15479b739713773e5cabb748451987b",
+ "72e33ec12c9b67aea26d8d005fb82de2",
+ },
+ {
+ // 16X32
+ "abe5233d189cdbf79424721571bbaa7b",
+ "282759f81e3cfb2e2d396fe406b72a8b",
+ "e2224926c264f6f174cbc3167a233168",
+ "6814e85c2b33f8c9415d62e80394b47b",
+ "99cbbb60459c08a3061d72c4e4f6276a",
+ "1d1567d40b8e816f8c1f71e576fe0f87",
+ "36fdd371b624a075814d497c4832ec85",
+ "8ab8da61b727442b6ff692b40d0df018",
+ "e35a10ad7fdf2327e821504a90f6a6eb",
+ "1f7211e727dc1de7d6a55d082fbdd821",
+ },
+ {
+ // 32X16
+ "d1aeb8d5fdcfd3307922af01a798a4dc",
+ "b0bcb514ebfbee065faea9d34c12ae75",
+ "d6a18c63b4e909871c0137ca652fad23",
+ "fd047f2fc1b8ffb95d0eeef3e8796a45",
+ "645ab60779ea348fd93c81561c31bab9",
+ "4409633c9db8dff41ade4292a3a56e7f",
+ "5e36a11e069b31c2a739f3a9c7b37c24",
+ "e83b9483d702cfae496991c3c7fa92c0",
+ "12f6ddf98c7f30a277307f1ea935b030",
+ "354321d6c32bbdb0739e4fa2acbf41e1",
+ },
+ {
+ // 32X64
+ "0ce332b343934b34cd4417725faa85cb",
+ "4e2a2cfd8f56f15939bdfc753145b303",
+ "0f46d124ba9f48cdd5d5290acf786d6d",
+ "e1e8ed803236367821981500a3d9eebe",
+ "1d2f8e48e3adb7c448be05d9f66f4954",
+ "9fb2e176636a5689b26f73ca73fcc512",
+ "e720ebccae7e25e36f23da53ae5b5d6a",
+ "86fe4364734169aaa4520d799890d530",
+ "b1870290764bb1b100d1974e2bd70f1d",
+ "ce5b238e19d85ef69d85badfab4e63ae",
+ },
+ {
+ // 64X32
+ "a6c5aeb722615089efbca80b02951ceb",
+ "538424b24bd0830f21788e7238ca762f",
+ "80c15b303235f9bc2259027bb92dfdc4",
+ "e48e1ac15e97191a8fda08d62fff343e",
+ "12604b37875533665078405ef4582e35",
+ "0048afa17bd3e1632d68b96048836530",
+ "07a0cfcb56a5eed50c4bd6c26814336b",
+ "529d8a070de5bc6531fa3ee8f450c233",
+ "33c50a11c7d78f72434064f634305e95",
+ "e0ef7f0559c1a50ec5a8c12011b962f7",
+ },
+ {
+ // 4X16
+ "750491056568eb8fe15387b86bdf06b8",
+ "3a52dae9f599f08cfb3bd1b910dc0e11",
+ "af79f71e3e03dbeca44e2e13561f70c7",
+ "ca7dfd7624afc0c06fb5552f44398535",
+ "b591af115444bf43140c29c269f68fb2",
+ "483d942ae36e69e62f31eb215331416f",
+ "f14b58525e81870bc5d95c7ac71a347f",
+ "371208bb4027d9badb04095d1590bbc4",
+ "c7049c21b2924d70c7c12784d6b6b796",
+ "7d87233f4b5b0f12086045e5d7b2d4c2",
+ },
+ {
+ // 16X4
+ "7c6e325a65e77e732b3adbe237e045e4",
+ "24478f93ffcec47852e004d0fe948464",
+ "258d042c67d4ba3ecfa667f0adc9aebf",
+ "b2cd21d06959f159a1f3c4d9768ee7fb",
+ "b4e1f38157bf8410e7c3da02f687a343",
+ "869e703729eb0fc0711c254944ff5d5a",
+ "9638dd77105a640b146a8201ea7a0801",
+ "919d932c6af8a1cc7486e8ce996dd487",
+ "e1c9be493b6714c7ae48f30044c43140",
+ "bf0fe3889d654b2f6eb98c8fc751f9e4",
+ },
+ {
+ // 8X32
+ "8dfac4319fe0bd40013ffb3102da8c72",
+ "feb46b6dc4e2ca0a09533bfc51d4dcb0",
+ "850837ec714c37262216527aaf4cbbe9",
+ "4603c7800fb08361f163daca876e8bda",
+ "1ff95e7d2debc27b05806fb25abfd624",
+ "d81b9a51a062b23ca7823804cb7bec22",
+ "f1d8978158766f46335203608cb807e7",
+ "f3527096256258c0878d644a9d7d53ca",
+ "cbde98ac8b009953eb112807ad2ea29e",
+ "654fb1153415747feae599f538122af5",
+ },
+ {
+ // 32X8
+ "3d4ee16fab374357474f60b845327bc7",
+ "bc17c5059473a476df4e85f56395ad55",
+ "3d4ee16fab374357474f60b845327bc7",
+ "c14b8db34dc2355b84e3735c9ba16c7f",
+ "a71d25b5d47a92a8b9223c98f18458ee",
+ "6c1cfe2b1893f4576a80675687cb6426",
+ "92d11bbef8b85bb48d799bb055de3514",
+ "bcf81d1db8ae5cc03360467f44f498ec",
+ "79f8c564163555592e808e145eaf5c60",
+ "46fff139cef2ef773938bcc8b0e5abb8",
+ },
+ {
+ // 16X64
+ "3b2a053ee8b05a8ac35ad23b0422a151",
+ "12b0c69595328c465e0b25e0c9e3e9fc",
+ "f77c544ac8035e01920deae40cee7b07",
+ "727797ef15ccd8d325476fe8f12006a3",
+ "f3be77c0fe67eb5d9d515e92bec21eb7",
+ "f1ece6409e01e9dd98b800d49628247d",
+ "efd2ec9bfbbd4fd1f6604ea369df1894",
+ "ec703de918422b9e03197ba0ed60a199",
+ "739418efb89c07f700895deaa5d0b3e3",
+ "9943ae1bbeeebfe1d3a92dc39e049d63",
+ },
+ {
+ // 64X16
+ "821b76b1494d4f84d20817840f719a1a",
+ "69e462c3338a9aaf993c3f7cfbc15649",
+ "516d8f6eb054d74d150e7b444185b6b9",
+ "de1b736e9d99129609d6ef3a491507a0",
+ "fd9b4276e7affe1e0e4ce4f428058994",
+ "cd82fd361a4767ac29a9f406b480b8f3",
+ "2792c2f810157a4a6cb13c28529ff779",
+ "1220442d90c4255ba0969d28b91e93a6",
+ "c7253e10b45f7f67dfee3256c9b94825",
+ "879792198071c7e0b50b9b5010d8c18f",
+ },
+};
+
+} // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to TestIntraPred. The test name is 'arch.DISABLED_TestIntraPred_tx_size',
+// e.g., C.DISABLED_TestIntraPred_TX_4X4. Because of the DISABLED_ prefix,
+// googletest only runs these tests when --gtest_also_run_disabled_tests is
+// given. The ten predictor arguments are stored in the order listed in
+// kAv1IntraPredNames; pass nullptr for a predictor that should be skipped.
+#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h, \
+ paeth, smooth, smooth_v, smooth_h) \
+ TEST(arch, DISABLED_##TestIntraPred_##tx_size) { \
+ static const AvxPredFunc aom_intra_pred[] = { \
+ dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \
+ }; \
+ TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]); \
+ }
+
+// -----------------------------------------------------------------------------
+// 4x4, 4x8, 4x16
+
+// C predictors for 4x4, 4x8 and 4x16: all ten predictor slots are populated.
+INTRA_PRED_TEST(C, TX_4X4, aom_dc_predictor_4x4_c, aom_dc_left_predictor_4x4_c,
+ aom_dc_top_predictor_4x4_c, aom_dc_128_predictor_4x4_c,
+ aom_v_predictor_4x4_c, aom_h_predictor_4x4_c,
+ aom_paeth_predictor_4x4_c, aom_smooth_predictor_4x4_c,
+ aom_smooth_v_predictor_4x4_c, aom_smooth_h_predictor_4x4_c)
+INTRA_PRED_TEST(C, TX_4X8, aom_dc_predictor_4x8_c, aom_dc_left_predictor_4x8_c,
+ aom_dc_top_predictor_4x8_c, aom_dc_128_predictor_4x8_c,
+ aom_v_predictor_4x8_c, aom_h_predictor_4x8_c,
+ aom_paeth_predictor_4x8_c, aom_smooth_predictor_4x8_c,
+ aom_smooth_v_predictor_4x8_c, aom_smooth_h_predictor_4x8_c)
+INTRA_PRED_TEST(C, TX_4X16, aom_dc_predictor_4x16_c,
+ aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
+ aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
+ aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
+ aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c,
+ aom_smooth_h_predictor_4x16_c)
+
+#if HAVE_SSE2
+// SSE2 entries fill only the DC variants, V and H; the PAETH/SMOOTH slots are
+// nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSE2, TX_4X4, aom_dc_predictor_4x4_sse2,
+ aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
+ aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
+ aom_h_predictor_4x4_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_4X8, aom_dc_predictor_4x8_sse2,
+ aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
+ aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
+ aom_h_predictor_4x8_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_4X16, aom_dc_predictor_4x16_sse2,
+ aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
+ aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
+ aom_h_predictor_4x16_sse2, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+// SSSE3 entries fill only PAETH and the three SMOOTH predictors; the DC/V/H
+// slots are nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSSE3, TX_4X4, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_4x4_ssse3,
+ aom_smooth_predictor_4x4_ssse3,
+ aom_smooth_v_predictor_4x4_ssse3,
+ aom_smooth_h_predictor_4x4_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_4X8, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_4x8_ssse3,
+ aom_smooth_predictor_4x8_ssse3,
+ aom_smooth_v_predictor_4x8_ssse3,
+ aom_smooth_h_predictor_4x8_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_4X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_4x16_ssse3,
+ aom_smooth_predictor_4x16_ssse3,
+ aom_smooth_v_predictor_4x16_ssse3,
+ aom_smooth_h_predictor_4x16_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+// NEON entries fill all ten predictor slots.
+INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
+ aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
+ aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
+ aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
+ aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
+ aom_smooth_h_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
+ aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
+ aom_dc_128_predictor_4x8_neon, aom_v_predictor_4x8_neon,
+ aom_h_predictor_4x8_neon, aom_paeth_predictor_4x8_neon,
+ aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
+ aom_smooth_h_predictor_4x8_neon)
+INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
+ aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
+ aom_dc_128_predictor_4x16_neon, aom_v_predictor_4x16_neon,
+ aom_h_predictor_4x16_neon, aom_paeth_predictor_4x16_neon,
+ aom_smooth_predictor_4x16_neon,
+ aom_smooth_v_predictor_4x16_neon,
+ aom_smooth_h_predictor_4x16_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 8x8, 8x4, 8x16, 8x32
+
+// C predictors for 8x8, 8x4, 8x16 and 8x32: all ten predictor slots are
+// populated.
+INTRA_PRED_TEST(C, TX_8X8, aom_dc_predictor_8x8_c, aom_dc_left_predictor_8x8_c,
+ aom_dc_top_predictor_8x8_c, aom_dc_128_predictor_8x8_c,
+ aom_v_predictor_8x8_c, aom_h_predictor_8x8_c,
+ aom_paeth_predictor_8x8_c, aom_smooth_predictor_8x8_c,
+ aom_smooth_v_predictor_8x8_c, aom_smooth_h_predictor_8x8_c)
+
+INTRA_PRED_TEST(C, TX_8X4, aom_dc_predictor_8x4_c, aom_dc_left_predictor_8x4_c,
+ aom_dc_top_predictor_8x4_c, aom_dc_128_predictor_8x4_c,
+ aom_v_predictor_8x4_c, aom_h_predictor_8x4_c,
+ aom_paeth_predictor_8x4_c, aom_smooth_predictor_8x4_c,
+ aom_smooth_v_predictor_8x4_c, aom_smooth_h_predictor_8x4_c)
+INTRA_PRED_TEST(C, TX_8X16, aom_dc_predictor_8x16_c,
+ aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
+ aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
+ aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c,
+ aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
+ aom_smooth_h_predictor_8x16_c)
+INTRA_PRED_TEST(C, TX_8X32, aom_dc_predictor_8x32_c,
+ aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
+ aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
+ aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
+ aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c,
+ aom_smooth_h_predictor_8x32_c)
+
+#if HAVE_SSE2
+// SSE2 entries fill only the DC variants, V and H; the PAETH/SMOOTH slots are
+// nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSE2, TX_8X8, aom_dc_predictor_8x8_sse2,
+ aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
+ aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
+ aom_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_8X4, aom_dc_predictor_8x4_sse2,
+ aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
+ aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
+ aom_h_predictor_8x4_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_8X16, aom_dc_predictor_8x16_sse2,
+ aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
+ aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
+ aom_h_predictor_8x16_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_8X32, aom_dc_predictor_8x32_sse2,
+ aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
+ aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
+ aom_h_predictor_8x32_sse2, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+// SSSE3 entries fill only PAETH and the three SMOOTH predictors; the DC/V/H
+// slots are nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSSE3, TX_8X8, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_8x8_ssse3,
+ aom_smooth_predictor_8x8_ssse3,
+ aom_smooth_v_predictor_8x8_ssse3,
+ aom_smooth_h_predictor_8x8_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_8X4, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_8x4_ssse3,
+ aom_smooth_predictor_8x4_ssse3,
+ aom_smooth_v_predictor_8x4_ssse3,
+ aom_smooth_h_predictor_8x4_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_8X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_8x16_ssse3,
+ aom_smooth_predictor_8x16_ssse3,
+ aom_smooth_v_predictor_8x16_ssse3,
+ aom_smooth_h_predictor_8x16_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_8X32, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_8x32_ssse3,
+ aom_smooth_predictor_8x32_ssse3,
+ aom_smooth_v_predictor_8x32_ssse3,
+ aom_smooth_h_predictor_8x32_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+// NEON entries fill all ten predictor slots.
+INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
+ aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
+ aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
+ aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
+ aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
+ aom_smooth_h_predictor_8x8_neon)
+INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
+ aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
+ aom_dc_128_predictor_8x4_neon, aom_v_predictor_8x4_neon,
+ aom_h_predictor_8x4_neon, aom_paeth_predictor_8x4_neon,
+ aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
+ aom_smooth_h_predictor_8x4_neon)
+INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
+ aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
+ aom_dc_128_predictor_8x16_neon, aom_v_predictor_8x16_neon,
+ aom_h_predictor_8x16_neon, aom_paeth_predictor_8x16_neon,
+ aom_smooth_predictor_8x16_neon,
+ aom_smooth_v_predictor_8x16_neon,
+ aom_smooth_h_predictor_8x16_neon)
+INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
+ aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
+ aom_dc_128_predictor_8x32_neon, aom_v_predictor_8x32_neon,
+ aom_h_predictor_8x32_neon, aom_paeth_predictor_8x32_neon,
+ aom_smooth_predictor_8x32_neon,
+ aom_smooth_v_predictor_8x32_neon,
+ aom_smooth_h_predictor_8x32_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 16x16, 16x8, 16x32, 16x4, 16x64
+
+// C predictors for 16x16, 16x8, 16x32, 16x4 and 16x64: all ten predictor slots
+// are populated.
+INTRA_PRED_TEST(C, TX_16X16, aom_dc_predictor_16x16_c,
+ aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
+ aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
+ aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c,
+ aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c,
+ aom_smooth_h_predictor_16x16_c)
+INTRA_PRED_TEST(C, TX_16X8, aom_dc_predictor_16x8_c,
+ aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
+ aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
+ aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c,
+ aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c,
+ aom_smooth_h_predictor_16x8_c)
+INTRA_PRED_TEST(C, TX_16X32, aom_dc_predictor_16x32_c,
+ aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
+ aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
+ aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c,
+ aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
+ aom_smooth_h_predictor_16x32_c)
+INTRA_PRED_TEST(C, TX_16X4, aom_dc_predictor_16x4_c,
+ aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
+ aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
+ aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c,
+ aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c,
+ aom_smooth_h_predictor_16x4_c)
+INTRA_PRED_TEST(C, TX_16X64, aom_dc_predictor_16x64_c,
+ aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c,
+ aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c,
+ aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
+ aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c,
+ aom_smooth_h_predictor_16x64_c)
+
+#if HAVE_SSE2
+// SSE2 entries fill only the DC variants, V and H; the PAETH/SMOOTH slots are
+// nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSE2, TX_16X16, aom_dc_predictor_16x16_sse2,
+ aom_dc_left_predictor_16x16_sse2,
+ aom_dc_top_predictor_16x16_sse2,
+ aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
+ aom_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_16X8, aom_dc_predictor_16x8_sse2,
+ aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2,
+ aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2,
+ aom_h_predictor_16x8_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_16X32, aom_dc_predictor_16x32_sse2,
+ aom_dc_left_predictor_16x32_sse2,
+ aom_dc_top_predictor_16x32_sse2,
+ aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
+ aom_h_predictor_16x32_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_16X64, aom_dc_predictor_16x64_sse2,
+ aom_dc_left_predictor_16x64_sse2,
+ aom_dc_top_predictor_16x64_sse2,
+ aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
+ aom_h_predictor_16x64_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_16X4, aom_dc_predictor_16x4_sse2,
+ aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
+ aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
+ aom_h_predictor_16x4_sse2, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+// SSSE3 entries fill only PAETH and the three SMOOTH predictors; the DC/V/H
+// slots are nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSSE3, TX_16X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x16_ssse3,
+ aom_smooth_predictor_16x16_ssse3,
+ aom_smooth_v_predictor_16x16_ssse3,
+ aom_smooth_h_predictor_16x16_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_16X8, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x8_ssse3,
+ aom_smooth_predictor_16x8_ssse3,
+ aom_smooth_v_predictor_16x8_ssse3,
+ aom_smooth_h_predictor_16x8_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_16X32, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x32_ssse3,
+ aom_smooth_predictor_16x32_ssse3,
+ aom_smooth_v_predictor_16x32_ssse3,
+ aom_smooth_h_predictor_16x32_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x64_ssse3,
+ aom_smooth_predictor_16x64_ssse3,
+ aom_smooth_v_predictor_16x64_ssse3,
+ aom_smooth_h_predictor_16x64_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_16X4, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x4_ssse3,
+ aom_smooth_predictor_16x4_ssse3,
+ aom_smooth_v_predictor_16x4_ssse3,
+ aom_smooth_h_predictor_16x4_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+// AVX2 entries for 16xN fill only the PAETH slot; every other slot is nullptr.
+// No AVX2 test is defined for 16X4 here.
+INTRA_PRED_TEST(AVX2, TX_16X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x16_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_16X8, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x8_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_16X32, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x32_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_16x64_avx2, nullptr, nullptr,
+ nullptr)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+// NEON entries fill all ten predictor slots.
+INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
+ aom_dc_left_predictor_16x16_neon,
+ aom_dc_top_predictor_16x16_neon,
+ aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
+ aom_h_predictor_16x16_neon, aom_paeth_predictor_16x16_neon,
+ aom_smooth_predictor_16x16_neon,
+ aom_smooth_v_predictor_16x16_neon,
+ aom_smooth_h_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
+ aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
+ aom_dc_128_predictor_16x8_neon, aom_v_predictor_16x8_neon,
+ aom_h_predictor_16x8_neon, aom_paeth_predictor_16x8_neon,
+ aom_smooth_predictor_16x8_neon,
+ aom_smooth_v_predictor_16x8_neon,
+ aom_smooth_h_predictor_16x8_neon)
+INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
+ aom_dc_left_predictor_16x32_neon,
+ aom_dc_top_predictor_16x32_neon,
+ aom_dc_128_predictor_16x32_neon, aom_v_predictor_16x32_neon,
+ aom_h_predictor_16x32_neon, aom_paeth_predictor_16x32_neon,
+ aom_smooth_predictor_16x32_neon,
+ aom_smooth_v_predictor_16x32_neon,
+ aom_smooth_h_predictor_16x32_neon)
+INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
+ aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
+ aom_dc_128_predictor_16x4_neon, aom_v_predictor_16x4_neon,
+ aom_h_predictor_16x4_neon, aom_paeth_predictor_16x4_neon,
+ aom_smooth_predictor_16x4_neon,
+ aom_smooth_v_predictor_16x4_neon,
+ aom_smooth_h_predictor_16x4_neon)
+INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
+ aom_dc_left_predictor_16x64_neon,
+ aom_dc_top_predictor_16x64_neon,
+ aom_dc_128_predictor_16x64_neon, aom_v_predictor_16x64_neon,
+ aom_h_predictor_16x64_neon, aom_paeth_predictor_16x64_neon,
+ aom_smooth_predictor_16x64_neon,
+ aom_smooth_v_predictor_16x64_neon,
+ aom_smooth_h_predictor_16x64_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 32x32, 32x16, 32x64, 32x8
+
+// C predictors for 32x32, 32x16, 32x64 and 32x8: all ten predictor slots are
+// populated.
+INTRA_PRED_TEST(C, TX_32X32, aom_dc_predictor_32x32_c,
+ aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
+ aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
+ aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c,
+ aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c,
+ aom_smooth_h_predictor_32x32_c)
+INTRA_PRED_TEST(C, TX_32X16, aom_dc_predictor_32x16_c,
+ aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
+ aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
+ aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c,
+ aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c,
+ aom_smooth_h_predictor_32x16_c)
+INTRA_PRED_TEST(C, TX_32X64, aom_dc_predictor_32x64_c,
+ aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c,
+ aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c,
+ aom_h_predictor_32x64_c, aom_paeth_predictor_32x64_c,
+ aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
+ aom_smooth_h_predictor_32x64_c)
+INTRA_PRED_TEST(C, TX_32X8, aom_dc_predictor_32x8_c,
+ aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
+ aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
+ aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
+ aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c,
+ aom_smooth_h_predictor_32x8_c)
+
+#if HAVE_SSE2
+// SSE2 entries fill only the DC variants, V and H; the PAETH/SMOOTH slots are
+// nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSE2, TX_32X32, aom_dc_predictor_32x32_sse2,
+ aom_dc_left_predictor_32x32_sse2,
+ aom_dc_top_predictor_32x32_sse2,
+ aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
+ aom_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_32X16, aom_dc_predictor_32x16_sse2,
+ aom_dc_left_predictor_32x16_sse2,
+ aom_dc_top_predictor_32x16_sse2,
+ aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
+ aom_h_predictor_32x16_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_32X64, aom_dc_predictor_32x64_sse2,
+ aom_dc_left_predictor_32x64_sse2,
+ aom_dc_top_predictor_32x64_sse2,
+ aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
+ aom_h_predictor_32x64_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_32X8, aom_dc_predictor_32x8_sse2,
+ aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
+ aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
+ aom_h_predictor_32x8_sse2, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+// SSSE3 entries fill only PAETH and the three SMOOTH predictors; the DC/V/H
+// slots are nullptr, which TestIntraPred skips.
+INTRA_PRED_TEST(SSSE3, TX_32X32, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_32x32_ssse3,
+ aom_smooth_predictor_32x32_ssse3,
+ aom_smooth_v_predictor_32x32_ssse3,
+ aom_smooth_h_predictor_32x32_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_32X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_32x16_ssse3,
+ aom_smooth_predictor_32x16_ssse3,
+ aom_smooth_v_predictor_32x16_ssse3,
+ aom_smooth_h_predictor_32x16_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_32X64, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_32x64_ssse3,
+ aom_smooth_predictor_32x64_ssse3,
+ aom_smooth_v_predictor_32x64_ssse3,
+ aom_smooth_h_predictor_32x64_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_32X8, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_32x8_ssse3,
+ aom_smooth_predictor_32x8_ssse3,
+ aom_smooth_v_predictor_32x8_ssse3,
+ aom_smooth_h_predictor_32x8_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2, TX_32X32, aom_dc_predictor_32x32_avx2,
+ aom_dc_left_predictor_32x32_avx2,
+ aom_dc_top_predictor_32x32_avx2,
+ aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
+ aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2,
+ nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(AVX2, TX_32X16, aom_dc_predictor_32x16_avx2,
+ aom_dc_left_predictor_32x16_avx2,
+ aom_dc_top_predictor_32x16_avx2,
+ aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
+ nullptr, aom_paeth_predictor_32x16_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_32X64, aom_dc_predictor_32x64_avx2,
+ aom_dc_left_predictor_32x64_avx2,
+ aom_dc_top_predictor_32x64_avx2,
+ aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
+ nullptr, aom_paeth_predictor_32x64_avx2, nullptr, nullptr,
+ nullptr)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon,
+ aom_dc_left_predictor_32x32_neon,
+ aom_dc_top_predictor_32x32_neon,
+ aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
+ aom_h_predictor_32x32_neon, aom_paeth_predictor_32x32_neon,
+ aom_smooth_predictor_32x32_neon,
+ aom_smooth_v_predictor_32x32_neon,
+ aom_smooth_h_predictor_32x32_neon)
+INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
+ aom_dc_left_predictor_32x16_neon,
+ aom_dc_top_predictor_32x16_neon,
+ aom_dc_128_predictor_32x16_neon, aom_v_predictor_32x16_neon,
+ aom_h_predictor_32x16_neon, aom_paeth_predictor_32x16_neon,
+ aom_smooth_predictor_32x16_neon,
+ aom_smooth_v_predictor_32x16_neon,
+ aom_smooth_h_predictor_32x16_neon)
+INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
+ aom_dc_left_predictor_32x64_neon,
+ aom_dc_top_predictor_32x64_neon,
+ aom_dc_128_predictor_32x64_neon, aom_v_predictor_32x64_neon,
+ aom_h_predictor_32x64_neon, aom_paeth_predictor_32x64_neon,
+ aom_smooth_predictor_32x64_neon,
+ aom_smooth_v_predictor_32x64_neon,
+ aom_smooth_h_predictor_32x64_neon)
+INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
+ aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
+ aom_dc_128_predictor_32x8_neon, aom_v_predictor_32x8_neon,
+ aom_h_predictor_32x8_neon, aom_paeth_predictor_32x8_neon,
+ aom_smooth_predictor_32x8_neon,
+ aom_smooth_v_predictor_32x8_neon,
+ aom_smooth_h_predictor_32x8_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+INTRA_PRED_TEST(C, TX_64X64, aom_dc_predictor_64x64_c,
+ aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c,
+ aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c,
+ aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c,
+ aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c,
+ aom_smooth_h_predictor_64x64_c)
+INTRA_PRED_TEST(C, TX_64X32, aom_dc_predictor_64x32_c,
+ aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c,
+ aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c,
+ aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c,
+ aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
+ aom_smooth_h_predictor_64x32_c)
+INTRA_PRED_TEST(C, TX_64X16, aom_dc_predictor_64x16_c,
+ aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
+ aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
+ aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
+ aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
+ aom_smooth_h_predictor_64x16_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TX_64X64, aom_dc_predictor_64x64_sse2,
+ aom_dc_left_predictor_64x64_sse2,
+ aom_dc_top_predictor_64x64_sse2,
+ aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
+ aom_h_predictor_64x64_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_64X32, aom_dc_predictor_64x32_sse2,
+ aom_dc_left_predictor_64x32_sse2,
+ aom_dc_top_predictor_64x32_sse2,
+ aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
+ aom_h_predictor_64x32_sse2, nullptr, nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(SSE2, TX_64X16, aom_dc_predictor_64x16_sse2,
+ aom_dc_left_predictor_64x16_sse2,
+ aom_dc_top_predictor_64x16_sse2,
+ aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
+ aom_h_predictor_64x16_sse2, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TX_64X64, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_64x64_ssse3,
+ aom_smooth_predictor_64x64_ssse3,
+ aom_smooth_v_predictor_64x64_ssse3,
+ aom_smooth_h_predictor_64x64_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_64X32, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_64x32_ssse3,
+ aom_smooth_predictor_64x32_ssse3,
+ aom_smooth_v_predictor_64x32_ssse3,
+ aom_smooth_h_predictor_64x32_ssse3)
+INTRA_PRED_TEST(SSSE3, TX_64X16, nullptr, nullptr, nullptr, nullptr, nullptr,
+ nullptr, aom_paeth_predictor_64x16_ssse3,
+ aom_smooth_predictor_64x16_ssse3,
+ aom_smooth_v_predictor_64x16_ssse3,
+ aom_smooth_h_predictor_64x16_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2, TX_64X64, aom_dc_predictor_64x64_avx2,
+ aom_dc_left_predictor_64x64_avx2,
+ aom_dc_top_predictor_64x64_avx2,
+ aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
+ nullptr, aom_paeth_predictor_64x64_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_64X32, aom_dc_predictor_64x32_avx2,
+ aom_dc_left_predictor_64x32_avx2,
+ aom_dc_top_predictor_64x32_avx2,
+ aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
+ nullptr, aom_paeth_predictor_64x32_avx2, nullptr, nullptr,
+ nullptr)
+INTRA_PRED_TEST(AVX2, TX_64X16, aom_dc_predictor_64x16_avx2,
+ aom_dc_left_predictor_64x16_avx2,
+ aom_dc_top_predictor_64x16_avx2,
+ aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
+ nullptr, aom_paeth_predictor_64x16_avx2, nullptr, nullptr,
+ nullptr)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_64X64, aom_dc_predictor_64x64_neon,
+ aom_dc_left_predictor_64x64_neon,
+ aom_dc_top_predictor_64x64_neon,
+ aom_dc_128_predictor_64x64_neon, aom_v_predictor_64x64_neon,
+ aom_h_predictor_64x64_neon, aom_paeth_predictor_64x64_neon,
+ aom_smooth_predictor_64x64_neon,
+ aom_smooth_v_predictor_64x64_neon,
+ aom_smooth_h_predictor_64x64_neon)
+INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
+ aom_dc_left_predictor_64x32_neon,
+ aom_dc_top_predictor_64x32_neon,
+ aom_dc_128_predictor_64x32_neon, aom_v_predictor_64x32_neon,
+ aom_h_predictor_64x32_neon, aom_paeth_predictor_64x32_neon,
+ aom_smooth_predictor_64x32_neon,
+ aom_smooth_v_predictor_64x32_neon,
+ aom_smooth_h_predictor_64x32_neon)
+INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
+ aom_dc_left_predictor_64x16_neon,
+ aom_dc_top_predictor_64x16_neon,
+ aom_dc_128_predictor_64x16_neon, aom_v_predictor_64x16_neon,
+ aom_h_predictor_64x16_neon, aom_paeth_predictor_64x16_neon,
+ aom_smooth_predictor_64x16_neon,
+ aom_smooth_v_predictor_64x16_neon,
+ aom_smooth_h_predictor_64x16_neon)
+#endif // HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bitdepth
+namespace {
+
+// Pointer to a high-bitdepth intra predictor: writes into |dst| (row pitch
+// |y_stride|) using the |above| and |left| neighbour arrays at bit depth |bd|.
+using AvxHighbdPredFunc = void (*)(uint16_t *dst, ptrdiff_t y_stride,
+                                   const uint16_t *above,
+                                   const uint16_t *left, int bd);
+
+// IntraPredTestMem instantiated with uint16_t samples.
+using Av1HighbdIntraPredTestMem = IntraPredTestMem<uint16_t>;
+
+// Runs each non-null predictor in |pred_funcs| repeatedly on a |tx_size|
+// block at bit depth 12, then passes the resulting buffer, the timer reading
+// divided by 1000, and the predictor index to CheckMd5Signature() together
+// with |signatures|.
+void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[]) {
+  const int bd = 12;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  // The repeat count shrinks as the block grows, so every block size pushes
+  // about the same number of pixels (2e10 across all predictor slots).
+  const int pixels_per_pass = width * height * kNumAv1IntraFuncs;
+  const int num_iterations = static_cast<int>(2.e10 / pixels_per_pass);
+
+  Av1HighbdIntraPredTestMem mem;
+  mem.Init(width, height, bd);
+
+  for (int idx = 0; idx < kNumAv1IntraFuncs; ++idx) {
+    const AvxHighbdPredFunc predict = pred_funcs[idx];
+    if (predict != nullptr) {
+      // Start every predictor from the same reference contents.
+      memcpy(mem.src, mem.ref_src, sizeof(mem.src));
+
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int remaining = num_iterations; remaining > 0; --remaining) {
+        predict(mem.src, mem.stride, mem.above, mem.left, bd);
+      }
+      aom_usec_timer_mark(&timer);
+
+      const int elapsed_time =
+          static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+      const size_t num_bytes = mem.num_pixels * sizeof(*mem.src);
+      CheckMd5Signature(tx_size, true, signatures, mem.src, num_bytes,
+                        elapsed_time, idx);
+    }
+  }
+}
+
+static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+ {
+ // 4X4 -- rows are selected with kHighbdSignatures[tx_size]; slot labels below assume entry k pairs with predictor slot k of HIGHBD_INTRA_PRED_TEST (TODO confirm in CheckMd5Signature)
+ "11f74af6c5737df472f3275cbde062fa", // dc
+ "51bea056b6447c93f6eb8f6b7e8f6f71", // dc_left
+ "27e97f946766331795886f4de04c5594", // dc_top
+ "53ab15974b049111fb596c5168ec7e3f", // dc_128
+ "f0b640bb176fbe4584cf3d32a9b0320a", // v
+ "729783ca909e03afd4b47111c80d967b", // h
+ "6e30009c45474a22032678b1bd579c8f", // paeth
+ "e57cba016d808aa8a35619df2a65f049", // smooth
+ "55a6c37f39afcbbf5abca4a985b96459", // smooth_v
+ "a623d45b37dafec1f8a75c4c5218913d", // smooth_h
+ },
+ {
+ // 8X8 (this and every later row use the same slot order as the 4X4 row)
+ "03da8829fe94663047fd108c5fcaa71d",
+ "ecdb37b8120a2d3a4c706b016bd1bfd7",
+ "1d4543ed8d2b9368cb96898095fe8a75",
+ "f791c9a67b913cbd82d9da8ecede30e2",
+ "065c70646f4dbaff913282f55a45a441",
+ "51f87123616662ef7c35691497dfd0ba",
+ "85c01ba03df68f9ece7bd3fa0f8980e6",
+ "ad19b7dac092f56df6d054e1f67f21e7",
+ "0edc415b5dd7299f7a34fb9f71d31d78",
+ "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+ },
+ {
+ // 16X16
+ "e33cb3f56a878e2fddb1b2fc51cdd275",
+ "c7bff6f04b6052c8ab335d726dbbd52d",
+ "d0b0b47b654a9bcc5c6008110a44589b",
+ "78f5da7b10b2b9ab39f114a33b6254e9",
+ "c78e31d23831abb40d6271a318fdd6f3",
+ "90d1347f4ec9198a0320daecb6ff90b8",
+ "e63ded54ab3d0e8728b6f24d4f01e53f",
+ "35ce21fbe0ea114c089fc3489a78155d",
+ "f277f6ef8e4d717f1f0dfe2706ac197d",
+ "e8014d3f41256976c02e0f1e622ba2b9",
+ },
+ {
+ // 32X32
+ "a3e8056ba7e36628cce4917cd956fedd",
+ "cc7d3024fe8748b512407edee045377e",
+ "2aab0a0f330a1d3e19b8ecb8f06387a3",
+ "a547bc3fb7b06910bf3973122a426661",
+ "26f712514da95042f93d6e8dc8e431dc",
+ "bb08c6e16177081daa3d936538dbc2e3",
+ "84bf83f94a51b33654ca940c6f8bc057",
+ "7168b03fc31bf29596a344d6a35d007c",
+ "b073a70d3672f1282236994f5d12e94b",
+ "c51607aebad5dcb3c1e3b58ef9e5b84e",
+ },
+ {
+ // 64X64
+ "a6baa0d4bfb2269a94c7a38f86a4bccf",
+ "3f1ef5f473a49eba743f17a3324adf9d",
+ "12ac11889ae5f55b7781454efd706a6a",
+ "d9a906c0e692b22e1b4414e71a704b7e",
+ "47d4cadd56f70c11ff8f3e5d8df81161",
+ "de997744cf24c16c5ac2a36b02b351cc",
+ "23781211ae178ddeb6c4bb97a6bd7d83",
+ "a79d2e28340ca34b9e37daabbf030f63",
+ "0372bd3ddfc258750a6ac106b70587f4",
+ "228ef625d9460cbf6fa253a16a730976",
+ },
+ {
+ // 4X8
+ "22d519b796d59644043466320e4ccd14",
+ "09513a738c49b3f9542d27f34abbe1d5",
+ "807ae5e8813443ff01e71be6efacfb69",
+ "cbfa18d0293430b6e9708b0be1fd2394",
+ "346c354c34ec7fa780b576db355dab88",
+ "f97dae85c35359632380b09ca98d611e",
+ "698ae351d8896d89ed9e4e67b6e53eda",
+ "dcc197034a9c45a3d8238bf085835f4e",
+ "7a35e2c42ffdc2efc2d6d1d75a100fc7",
+ "41ab6cebd4516c87a91b2a593e2c2506",
+ },
+ {
+ // 8X4
+ "d58cd4c4bf3b7bbaa5db5e1a5622ec78",
+ "6e572c35aa782d00cafcb99e9ea047ea",
+ "e8c22a3702b416dc9ab974505afbed09",
+ "aaa4e4762a795aad7ad74de0c662c4e4",
+ "a19f9101967383c3dcbd516dc317a291",
+ "9ab8cb91f1a595b9ebe3fe8de58031aa",
+ "2cf9021d5f1169268699807ee118b65f",
+ "ee9605fcbd6fb871f1c5cd81a6989327",
+ "b4871af8316089e3e23522175df7e93f",
+ "d33301e1c2cb173be46792a22d19881a",
+ },
+ {
+ // 8X16
+ "4562de1d0336610880fdd5685498a9ec",
+ "16310fa7076394f16fc85c4b149d89c9",
+ "0e94af88e1dc573b6f0f499cddd1f530",
+ "dfd245ee20d091c67809160340365aa9",
+ "d3562504327f70c096c5be23fd8a3747",
+ "601b853558502acbb5135eadd2da117a",
+ "3c624345a723a1b2b1bea05a6a08bc99",
+ "2a9c781de609e0184cc7ab442050f4e5",
+ "0ddc5035c22252747126b61fc238c74d",
+ "e43f5d83bab759af69c7b6773fc8f9b2",
+ },
+ {
+ // 16X8
+ "a57d6b5a9bfd30c29591d8717ace9c51",
+ "f5907ba97ee6c53e339e953fc8d845ee",
+ "ea3aa727913ce45af06f89dd1808db5f",
+ "408af4f23e48d14b48ee35ae094fcd18",
+ "85c41cbcb5d744f7961e8950026fbffe",
+ "8a4e588a837638887ba671f8d4910485",
+ "b792d8826b67a21757ea7097cff9e05b",
+ "f94ce7101bb87fd3bb9312112527dbf4",
+ "688c6660a6dc6fa61fa1aa38e708c209",
+ "0cdf641b4f81d69509c92ae0b93ef5ff",
+ },
+ {
+ // 16X32
+ "aee4b3b0e3cc02d48e2c40d77f807927",
+ "8baef2b2e789f79c8df9d90ad10f34a4",
+ "038c38ee3c4f090bb8d736eab136aafc",
+ "1a3de2aaeaffd68a9fd6c7f6557b83f3",
+ "385c6e0ea29421dd81011a2934641e26",
+ "6cf96c285d1a2d4787f955dad715b08c",
+ "2d7f75dcd73b9528c8396279ff09ff3a",
+ "5a63cd1841e4ed470e4ca5ef845f2281",
+ "610d899ca945fbead33287d4335a8b32",
+ "6bafaad81fce37be46730187e78d8b11",
+ },
+ {
+ // 32X16
+ "290b23c9f5a1de7905bfa71a942da29b",
+ "701e7b82593c66da5052fc4b6afd79ce",
+ "4da828c5455cd246735a663fbb204989",
+ "e3fbeaf234efece8dbd752b77226200c",
+ "4d1d8c969f05155a7e7e84cf7aad021b",
+ "c22e4877c2c946d5bdc0d542e29e70cf",
+ "8ac1ce815e7780500f842b0beb0bb980",
+ "9fee2e2502b507f25bfad30a55b0b610",
+ "4ced9c212ec6f9956e27f68a91b59fef",
+ "4a7a0b93f138bb0863e4e465b01ec0b1",
+ },
+ {
+ // 32X64
+ "ad9cfc395a5c5644a21d958c7274ac14",
+ "f29d6d03c143ddf96fef04c19f2c8333",
+ "a8bdc852ef704dd4975c61893e8fbc3f",
+ "7d0bd7dea26226741dbca9a97f27fa74",
+ "45c27c5cca9a91b6ae8379feb0881c9f",
+ "8a0b78df1e001b85c874d686eac4aa1b",
+ "ce9fa75fac54a3f6c0cc3f2083b938f1",
+ "c0dca10d88762c954af18dc9e3791a39",
+ "61df229eddfccab913b8fda4bb02f9ac",
+ "4f4df6bc8d50a5600b573f0e44d70e66",
+ },
+ {
+ // 64X32
+ "db9d82921fd88b24fdff6f849f2f9c87",
+ "5ecc7fdc52d2f575ad4f2d0e9e6b1e11",
+ "b4581311a0a73d95dfac7f8f44591032",
+ "68bd283cfd1a125f6b2ee47cee874d36",
+ "804179f05c032908a5e36077bb87c994",
+ "fc5fd041a8ee779015394d0c066ee43c",
+ "68f5579ccadfe9a1baafb158334a3db2",
+ "fe237e45e215ab06d79046da9ad71e84",
+ "9a8a938a6824551bf7d21b8fd1d70ea1",
+ "eb7332f2017cd96882c76e7136aeaf53",
+ },
+ {
+ // 4X16
+ "7bafa307d507747b8132e7735b7f1c73",
+ "e58bc2d8213a97d1fea9cfb73d7a9633",
+ "435f8a8e8bbf14dbf2fe16b2be9e97aa",
+ "1d0e767b68d84acbfb50b7a04e633836",
+ "5f713bd7b324fe73bb7063e35ee14e5e",
+ "0dac4e1fa3d59814202715468c01ed56",
+ "47709d1db4a330c7a8900f450e6fddd1",
+ "258e0b930bb27db28f05da9cf7d1ee7c",
+ "36cf030fbae767912593efea045bfff5",
+ "248d7aceabb7499febae663fae41a920",
+ },
+ {
+ // 16X4
+ "04dde98e632670e393704742c89f9067",
+ "8c72543f1664651ae1fa08e2ac0adb9b",
+ "2354a2cdc2773aa2df8ab4010db1be39",
+ "6300ad3221c26da39b10e0e6d87ee3be",
+ "8ea30b661c6ba60b28d3167f19e449b8",
+ "fb6c1e4ff101a371cede63c2955cdb7e",
+ "a517c06433d6d7927b16a72184a23e92",
+ "393828be5d62ab6c48668bea5e2f801a",
+ "b1e510c542013eb9d6fb188dea2ce90a",
+ "569a8f2fe01679ca216535ecbcdccb62",
+ },
+ {
+ // 8X32
+ "9d541865c185ca7607852852613ac1fc",
+ "b96be67f08c6b5fa5ebd3411299c2f7c",
+ "75a2dcf50004b9d188849b048239767e",
+ "429492ff415c9fd9b050d73b2ad500f8",
+ "64b3606c1ccd036bd766bd5711392cf4",
+ "cb59844a0f01660ac955bae3511f1100",
+ "3e076155b7a70e8828618e3f33b51e3d",
+ "ed2d1f597ab7c50beff690f737cf9726",
+ "7909c6a26aaf20c59d996d3e5b5f9c29",
+ "965798807240c98c6f7cc9b457ed0773",
+ },
+ {
+ // 32X8
+ "36f391aa31619eec1f4d9ee95ea454cc",
+ "b82648f14eeba2527357cb50bc3223cb",
+ "7a7b2adf429125e8bee9d1d00a66e13f",
+ "4198e4d6ba503b7cc2d7e96bb845f661",
+ "96c160d2ec1be9fe0cdea9682f14d257",
+ "19a450bcebaa75afb4fc6bd1fd6434af",
+ "2bd2e35967d43d0ec1c6587a36f204d5",
+ "49799a99aa4ccfbd989bee92a99422f1",
+ "955530e99813812a74659edeac3f5475",
+ "f0316b84e378a19cd11b19a6e40b2914",
+ },
+ {
+ // 16X64
+ "8cba1b70a0bde29e8ef235cedc5faa7d",
+ "96d00ddc7537bf7f196006591b733b4e",
+ "cbf69d5d157c9f3355a4757b1d6e3414",
+ "3ac1f642019493dec1b737d7a3a1b4e5",
+ "35f9ee300d7fa3c97338e81a6f21dcd4",
+ "aae335442e77c8ebc280f16ea50ba9c7",
+ "a6140fdac2278644328be094d88731db",
+ "2df93621b6ff100f7008432d509f4161",
+ "c77bf5aee39e7ed4a3dd715f816f452a",
+ "02109bd63557d90225c32a8f1338258e",
+ },
+ {
+ // 64X16
+ "a5e2f9fb685d5f4a048e9a96affd25a4",
+ "1348f249690d9eefe09d9ad7ead2c801",
+ "525da4b187acd81b1ff1116b60461141",
+ "e99d072de858094c98b01bd4a6772634",
+ "873bfa9dc24693f19721f7c8d527f7d3",
+ "0acfc6507bd3468e9679efc127d6e4b9",
+ "57d03f8d079c7264854e22ac1157cfae",
+ "6c2c4036f70c7d957a9399b5436c0774",
+ "42b8e4a97b7f8416c72a5148c031c0b1",
+ "a38a2c5f79993dfae8530e9e25800893",
+ },
+};
+
+} // namespace
+
+// Registers the gtest case DISABLED_TestHighbdIntraPred_<tx_size> in suite
+// |arch|. The ten predictor arguments are collected, in argument order, into
+// a table that is passed to TestHighbdIntraPred() together with the
+// kHighbdSignatures row for |tx_size|. Slots given as nullptr are skipped by
+// TestHighbdIntraPred().
+#define HIGHBD_INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, \
+                               h, paeth, smooth, smooth_v, smooth_h)         \
+  TEST(arch, DISABLED_##TestHighbdIntraPred_##tx_size) {                     \
+    static const AvxHighbdPredFunc kPredFuncTable[] = {                      \
+      dc, dc_left, dc_top,   dc_128,  v,                                     \
+      h,  paeth,   smooth,   smooth_v, smooth_h                              \
+    };                                                                       \
+    TestHighbdIntraPred(tx_size, kPredFuncTable,                             \
+                        kHighbdSignatures[tx_size]);                         \
+  }
+
+// -----------------------------------------------------------------------------
+// 4x4, 4x8, 4x16
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_4X4, aom_highbd_dc_predictor_4x4_c,
+ aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c,
+ aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c,
+ aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
+ aom_highbd_smooth_predictor_4x4_c, aom_highbd_smooth_v_predictor_4x4_c,
+ aom_highbd_smooth_h_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_4X8, aom_highbd_dc_predictor_4x8_c,
+ aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
+ aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
+ aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
+ aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c,
+ aom_highbd_smooth_h_predictor_4x8_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_4X16, aom_highbd_dc_predictor_4x16_c,
+ aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c,
+ aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c,
+ aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c,
+ aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c,
+ aom_highbd_smooth_h_predictor_4x16_c)
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
+ aom_highbd_dc_left_predictor_4x4_sse2,
+ aom_highbd_dc_top_predictor_4x4_sse2,
+ aom_highbd_dc_128_predictor_4x4_sse2,
+ aom_highbd_v_predictor_4x4_sse2,
+ aom_highbd_h_predictor_4x4_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2,
+ aom_highbd_dc_left_predictor_4x8_sse2,
+ aom_highbd_dc_top_predictor_4x8_sse2,
+ aom_highbd_dc_128_predictor_4x8_sse2,
+ aom_highbd_v_predictor_4x8_sse2,
+ aom_highbd_h_predictor_4x8_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+#endif // HAVE_SSE2
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon,
+ aom_highbd_dc_left_predictor_4x4_neon,
+ aom_highbd_dc_top_predictor_4x4_neon,
+ aom_highbd_dc_128_predictor_4x4_neon,
+ aom_highbd_v_predictor_4x4_neon,
+ aom_highbd_h_predictor_4x4_neon,
+ aom_highbd_paeth_predictor_4x4_neon,
+ aom_highbd_smooth_predictor_4x4_neon,
+ aom_highbd_smooth_v_predictor_4x4_neon,
+ aom_highbd_smooth_h_predictor_4x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, aom_highbd_dc_predictor_4x8_neon,
+ aom_highbd_dc_left_predictor_4x8_neon,
+ aom_highbd_dc_top_predictor_4x8_neon,
+ aom_highbd_dc_128_predictor_4x8_neon,
+ aom_highbd_v_predictor_4x8_neon,
+ aom_highbd_h_predictor_4x8_neon,
+ aom_highbd_paeth_predictor_4x8_neon,
+ aom_highbd_smooth_predictor_4x8_neon,
+ aom_highbd_smooth_v_predictor_4x8_neon,
+ aom_highbd_smooth_h_predictor_4x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, aom_highbd_dc_predictor_4x16_neon,
+ aom_highbd_dc_left_predictor_4x16_neon,
+ aom_highbd_dc_top_predictor_4x16_neon,
+ aom_highbd_dc_128_predictor_4x16_neon,
+ aom_highbd_v_predictor_4x16_neon,
+ aom_highbd_h_predictor_4x16_neon,
+ aom_highbd_paeth_predictor_4x16_neon,
+ aom_highbd_smooth_predictor_4x16_neon,
+ aom_highbd_smooth_v_predictor_4x16_neon,
+ aom_highbd_smooth_h_predictor_4x16_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 8x8, 8x4, 8x16, 8x32
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_8X8, aom_highbd_dc_predictor_8x8_c,
+ aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
+ aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
+ aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
+ aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c,
+ aom_highbd_smooth_h_predictor_8x8_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_8X4, aom_highbd_dc_predictor_8x4_c,
+ aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
+ aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
+ aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
+ aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c,
+ aom_highbd_smooth_h_predictor_8x4_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_8X16, aom_highbd_dc_predictor_8x16_c,
+ aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
+ aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
+ aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
+ aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c,
+ aom_highbd_smooth_h_predictor_8x16_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_8X32, aom_highbd_dc_predictor_8x32_c,
+ aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c,
+ aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c,
+ aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c,
+ aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c,
+ aom_highbd_smooth_h_predictor_8x32_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
+ aom_highbd_dc_left_predictor_8x8_sse2,
+ aom_highbd_dc_top_predictor_8x8_sse2,
+ aom_highbd_dc_128_predictor_8x8_sse2,
+ aom_highbd_v_predictor_8x8_sse2,
+ aom_highbd_h_predictor_8x8_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2,
+ aom_highbd_dc_left_predictor_8x4_sse2,
+ aom_highbd_dc_top_predictor_8x4_sse2,
+ aom_highbd_dc_128_predictor_8x4_sse2,
+ aom_highbd_v_predictor_8x4_sse2,
+ aom_highbd_h_predictor_8x4_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_8X16, aom_highbd_dc_predictor_8x16_sse2,
+ aom_highbd_dc_left_predictor_8x16_sse2,
+ aom_highbd_dc_top_predictor_8x16_sse2,
+ aom_highbd_dc_128_predictor_8x16_sse2,
+ aom_highbd_v_predictor_8x16_sse2,
+ aom_highbd_h_predictor_8x16_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3 // every slot below is nullptr, so TestHighbdIntraPred skips them all
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSSE3
+
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon,
+ aom_highbd_dc_left_predictor_8x8_neon,
+ aom_highbd_dc_top_predictor_8x8_neon,
+ aom_highbd_dc_128_predictor_8x8_neon,
+ aom_highbd_v_predictor_8x8_neon,
+ aom_highbd_h_predictor_8x8_neon,
+ aom_highbd_paeth_predictor_8x8_neon,
+ aom_highbd_smooth_predictor_8x8_neon,
+ aom_highbd_smooth_v_predictor_8x8_neon,
+ aom_highbd_smooth_h_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, aom_highbd_dc_predictor_8x4_neon,
+ aom_highbd_dc_left_predictor_8x4_neon,
+ aom_highbd_dc_top_predictor_8x4_neon,
+ aom_highbd_dc_128_predictor_8x4_neon,
+ aom_highbd_v_predictor_8x4_neon,
+ aom_highbd_h_predictor_8x4_neon,
+ aom_highbd_paeth_predictor_8x4_neon,
+ aom_highbd_smooth_predictor_8x4_neon,
+ aom_highbd_smooth_v_predictor_8x4_neon,
+ aom_highbd_smooth_h_predictor_8x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, aom_highbd_dc_predictor_8x16_neon,
+ aom_highbd_dc_left_predictor_8x16_neon,
+ aom_highbd_dc_top_predictor_8x16_neon,
+ aom_highbd_dc_128_predictor_8x16_neon,
+ aom_highbd_v_predictor_8x16_neon,
+ aom_highbd_h_predictor_8x16_neon,
+ aom_highbd_paeth_predictor_8x16_neon,
+ aom_highbd_smooth_predictor_8x16_neon,
+ aom_highbd_smooth_v_predictor_8x16_neon,
+ aom_highbd_smooth_h_predictor_8x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, aom_highbd_dc_predictor_8x32_neon,
+ aom_highbd_dc_left_predictor_8x32_neon,
+ aom_highbd_dc_top_predictor_8x32_neon,
+ aom_highbd_dc_128_predictor_8x32_neon,
+ aom_highbd_v_predictor_8x32_neon,
+ aom_highbd_h_predictor_8x32_neon,
+ aom_highbd_paeth_predictor_8x32_neon,
+ aom_highbd_smooth_predictor_8x32_neon,
+ aom_highbd_smooth_v_predictor_8x32_neon,
+ aom_highbd_smooth_h_predictor_8x32_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 16x16, 16x8, 16x32, 16x4, 16x64
+
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_16X16, aom_highbd_dc_predictor_16x16_c,
+ aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c,
+ aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c,
+ aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c,
+ aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c,
+ aom_highbd_smooth_h_predictor_16x16_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_16X8, aom_highbd_dc_predictor_16x8_c,
+ aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
+ aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
+ aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
+ aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c,
+ aom_highbd_smooth_h_predictor_16x8_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_16X32, aom_highbd_dc_predictor_16x32_c,
+ aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c,
+ aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c,
+ aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c,
+ aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c,
+ aom_highbd_smooth_h_predictor_16x32_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_16X4, aom_highbd_dc_predictor_16x4_c,
+ aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c,
+ aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c,
+ aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c,
+ aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c,
+ aom_highbd_smooth_h_predictor_16x4_c)
+HIGHBD_INTRA_PRED_TEST(
+ C, TX_16X64, aom_highbd_dc_predictor_16x64_c,
+ aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c,
+ aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c,
+ aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c,
+ aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c,
+ aom_highbd_smooth_h_predictor_16x64_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
+ aom_highbd_dc_left_predictor_16x16_sse2,
+ aom_highbd_dc_top_predictor_16x16_sse2,
+ aom_highbd_dc_128_predictor_16x16_sse2,
+ aom_highbd_v_predictor_16x16_sse2,
+ aom_highbd_h_predictor_16x16_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2,
+ aom_highbd_dc_left_predictor_16x8_sse2,
+ aom_highbd_dc_top_predictor_16x8_sse2,
+ aom_highbd_dc_128_predictor_16x8_sse2,
+ aom_highbd_v_predictor_16x8_sse2,
+ aom_highbd_h_predictor_16x8_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
+ aom_highbd_dc_left_predictor_16x32_sse2,
+ aom_highbd_dc_top_predictor_16x32_sse2,
+ aom_highbd_dc_128_predictor_16x32_sse2,
+ aom_highbd_v_predictor_16x32_sse2,
+ aom_highbd_h_predictor_16x32_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3 // every slot below is nullptr, so TestHighbdIntraPred skips them all
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_16X16, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2 // every slot below is nullptr, so TestHighbdIntraPred skips them all
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X16, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X8, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_16X32, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
+ aom_highbd_dc_left_predictor_16x16_neon,
+ aom_highbd_dc_top_predictor_16x16_neon,
+ aom_highbd_dc_128_predictor_16x16_neon,
+ aom_highbd_v_predictor_16x16_neon,
+ aom_highbd_h_predictor_16x16_neon,
+ aom_highbd_paeth_predictor_16x16_neon,
+ aom_highbd_smooth_predictor_16x16_neon,
+ aom_highbd_smooth_v_predictor_16x16_neon,
+ aom_highbd_smooth_h_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, aom_highbd_dc_predictor_16x8_neon,
+ aom_highbd_dc_left_predictor_16x8_neon,
+ aom_highbd_dc_top_predictor_16x8_neon,
+ aom_highbd_dc_128_predictor_16x8_neon,
+ aom_highbd_v_predictor_16x8_neon,
+ aom_highbd_h_predictor_16x8_neon,
+ aom_highbd_paeth_predictor_16x8_neon,
+ aom_highbd_smooth_predictor_16x8_neon,
+ aom_highbd_smooth_v_predictor_16x8_neon,
+ aom_highbd_smooth_h_predictor_16x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, aom_highbd_dc_predictor_16x32_neon,
+ aom_highbd_dc_left_predictor_16x32_neon,
+ aom_highbd_dc_top_predictor_16x32_neon,
+ aom_highbd_dc_128_predictor_16x32_neon,
+ aom_highbd_v_predictor_16x32_neon,
+ aom_highbd_h_predictor_16x32_neon,
+ aom_highbd_paeth_predictor_16x32_neon,
+ aom_highbd_smooth_predictor_16x32_neon,
+ aom_highbd_smooth_v_predictor_16x32_neon,
+ aom_highbd_smooth_h_predictor_16x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, aom_highbd_dc_predictor_16x4_neon,
+ aom_highbd_dc_left_predictor_16x4_neon,
+ aom_highbd_dc_top_predictor_16x4_neon,
+ aom_highbd_dc_128_predictor_16x4_neon,
+ aom_highbd_v_predictor_16x4_neon,
+ aom_highbd_h_predictor_16x4_neon,
+ aom_highbd_paeth_predictor_16x4_neon,
+ aom_highbd_smooth_predictor_16x4_neon,
+ aom_highbd_smooth_v_predictor_16x4_neon,
+ aom_highbd_smooth_h_predictor_16x4_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, aom_highbd_dc_predictor_16x64_neon,
+ aom_highbd_dc_left_predictor_16x64_neon,
+ aom_highbd_dc_top_predictor_16x64_neon,
+ aom_highbd_dc_128_predictor_16x64_neon,
+ aom_highbd_v_predictor_16x64_neon,
+ aom_highbd_h_predictor_16x64_neon,
+ aom_highbd_paeth_predictor_16x64_neon,
+ aom_highbd_smooth_predictor_16x64_neon,
+ aom_highbd_smooth_v_predictor_16x64_neon,
+ aom_highbd_smooth_h_predictor_16x64_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 32x32, 32x16, 32x64, 32x8
+
+HIGHBD_INTRA_PRED_TEST(  // 32x32, C (_c) predictors.
+ C, TX_32X32, aom_highbd_dc_predictor_32x32_c,
+ aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c,
+ aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c,
+ aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c,
+ aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c,
+ aom_highbd_smooth_h_predictor_32x32_c)
+HIGHBD_INTRA_PRED_TEST(  // 32x16, C (_c) predictors.
+ C, TX_32X16, aom_highbd_dc_predictor_32x16_c,
+ aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c,
+ aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c,
+ aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c,
+ aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c,
+ aom_highbd_smooth_h_predictor_32x16_c)
+HIGHBD_INTRA_PRED_TEST(  // 32x64, C (_c) predictors.
+ C, TX_32X64, aom_highbd_dc_predictor_32x64_c,
+ aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c,
+ aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c,
+ aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c,
+ aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c,
+ aom_highbd_smooth_h_predictor_32x64_c)
+HIGHBD_INTRA_PRED_TEST(  // 32x8, C (_c) predictors.
+ C, TX_32X8, aom_highbd_dc_predictor_32x8_c,
+ aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c,
+ aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c,
+ aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c,
+ aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c,
+ aom_highbd_smooth_h_predictor_32x8_c)
+
+#if HAVE_SSE2  // Only the dc*, v and h slots are filled; the rest are nullptr.
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
+ aom_highbd_dc_left_predictor_32x32_sse2,
+ aom_highbd_dc_top_predictor_32x32_sse2,
+ aom_highbd_dc_128_predictor_32x32_sse2,
+ aom_highbd_v_predictor_32x32_sse2,
+ aom_highbd_h_predictor_32x32_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSE2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
+ aom_highbd_dc_left_predictor_32x16_sse2,
+ aom_highbd_dc_top_predictor_32x16_sse2,
+ aom_highbd_dc_128_predictor_32x16_sse2,
+ aom_highbd_v_predictor_32x16_sse2,
+ aom_highbd_h_predictor_32x16_sse2, nullptr, nullptr,
+ nullptr, nullptr)
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3  // Every predictor slot is nullptr for SSSE3.
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_32X32, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2  // Every predictor slot is nullptr for AVX2.
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_32X32, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+
+HIGHBD_INTRA_PRED_TEST(AVX2, TX_32X16, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON  // All ten predictor slots are filled for each 32xN size.
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
+ aom_highbd_dc_left_predictor_32x32_neon,
+ aom_highbd_dc_top_predictor_32x32_neon,
+ aom_highbd_dc_128_predictor_32x32_neon,
+ aom_highbd_v_predictor_32x32_neon,
+ aom_highbd_h_predictor_32x32_neon,
+ aom_highbd_paeth_predictor_32x32_neon,
+ aom_highbd_smooth_predictor_32x32_neon,
+ aom_highbd_smooth_v_predictor_32x32_neon,
+ aom_highbd_smooth_h_predictor_32x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, aom_highbd_dc_predictor_32x16_neon,
+ aom_highbd_dc_left_predictor_32x16_neon,
+ aom_highbd_dc_top_predictor_32x16_neon,
+ aom_highbd_dc_128_predictor_32x16_neon,
+ aom_highbd_v_predictor_32x16_neon,
+ aom_highbd_h_predictor_32x16_neon,
+ aom_highbd_paeth_predictor_32x16_neon,
+ aom_highbd_smooth_predictor_32x16_neon,
+ aom_highbd_smooth_v_predictor_32x16_neon,
+ aom_highbd_smooth_h_predictor_32x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, aom_highbd_dc_predictor_32x64_neon,
+ aom_highbd_dc_left_predictor_32x64_neon,
+ aom_highbd_dc_top_predictor_32x64_neon,
+ aom_highbd_dc_128_predictor_32x64_neon,
+ aom_highbd_v_predictor_32x64_neon,
+ aom_highbd_h_predictor_32x64_neon,
+ aom_highbd_paeth_predictor_32x64_neon,
+ aom_highbd_smooth_predictor_32x64_neon,
+ aom_highbd_smooth_v_predictor_32x64_neon,
+ aom_highbd_smooth_h_predictor_32x64_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, aom_highbd_dc_predictor_32x8_neon,
+ aom_highbd_dc_left_predictor_32x8_neon,
+ aom_highbd_dc_top_predictor_32x8_neon,
+ aom_highbd_dc_128_predictor_32x8_neon,
+ aom_highbd_v_predictor_32x8_neon,
+ aom_highbd_h_predictor_32x8_neon,
+ aom_highbd_paeth_predictor_32x8_neon,
+ aom_highbd_smooth_predictor_32x8_neon,
+ aom_highbd_smooth_v_predictor_32x8_neon,
+ aom_highbd_smooth_h_predictor_32x8_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+HIGHBD_INTRA_PRED_TEST(  // 64x64, C (_c) predictors.
+ C, TX_64X64, aom_highbd_dc_predictor_64x64_c,
+ aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c,
+ aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c,
+ aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c,
+ aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c,
+ aom_highbd_smooth_h_predictor_64x64_c)
+HIGHBD_INTRA_PRED_TEST(  // 64x32, C (_c) predictors.
+ C, TX_64X32, aom_highbd_dc_predictor_64x32_c,
+ aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
+ aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
+ aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
+ aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
+ aom_highbd_smooth_h_predictor_64x32_c)
+HIGHBD_INTRA_PRED_TEST(  // 64x16, C (_c) predictors.
+ C, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+ aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
+ aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
+ aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
+ aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
+ aom_highbd_smooth_h_predictor_64x16_c)
+
+#if HAVE_NEON  // All ten predictor slots are filled for each 64xN size.
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
+ aom_highbd_dc_left_predictor_64x64_neon,
+ aom_highbd_dc_top_predictor_64x64_neon,
+ aom_highbd_dc_128_predictor_64x64_neon,
+ aom_highbd_v_predictor_64x64_neon,
+ aom_highbd_h_predictor_64x64_neon,
+ aom_highbd_paeth_predictor_64x64_neon,
+ aom_highbd_smooth_predictor_64x64_neon,
+ aom_highbd_smooth_v_predictor_64x64_neon,
+ aom_highbd_smooth_h_predictor_64x64_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, aom_highbd_dc_predictor_64x32_neon,
+ aom_highbd_dc_left_predictor_64x32_neon,
+ aom_highbd_dc_top_predictor_64x32_neon,
+ aom_highbd_dc_128_predictor_64x32_neon,
+ aom_highbd_v_predictor_64x32_neon,
+ aom_highbd_h_predictor_64x32_neon,
+ aom_highbd_paeth_predictor_64x32_neon,
+ aom_highbd_smooth_predictor_64x32_neon,
+ aom_highbd_smooth_v_predictor_64x32_neon,
+ aom_highbd_smooth_h_predictor_64x32_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, aom_highbd_dc_predictor_64x16_neon,
+ aom_highbd_dc_left_predictor_64x16_neon,
+ aom_highbd_dc_top_predictor_64x16_neon,
+ aom_highbd_dc_128_predictor_64x16_neon,
+ aom_highbd_v_predictor_64x16_neon,
+ aom_highbd_h_predictor_64x16_neon,
+ aom_highbd_paeth_predictor_64x16_neon,
+ aom_highbd_smooth_predictor_64x16_neon,
+ aom_highbd_smooth_v_predictor_64x16_neon,
+ aom_highbd_smooth_h_predictor_64x16_neon)
+#endif // HAVE_NEON
+
+// -----------------------------------------------------------------------------
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#include "test/test_libaom.cc"
diff --git a/third_party/aom/test/test_libaom.cc b/third_party/aom/test/test_libaom.cc
new file mode 100644
index 0000000000..fbd7f2e380
--- /dev/null
+++ b/third_party/aom/test/test_libaom.cc
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if !CONFIG_SHARED
+#include <string.h>
+
+#include <string>
+
+#if AOM_ARCH_ARM
+#include "aom_ports/arm.h"
+#endif
+#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+
+extern "C" {
+extern void av1_rtcd();
+extern void aom_dsp_rtcd();
+extern void aom_scale_rtcd();
+}
+
+#if AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+  // One exclusion pattern is added per terminator:
+  //   OPT.* matches TEST() functions
+  //   OPT/* matches TEST_P() functions
+  //   OPT_* matches tests which have been manually sharded.
+  // A bare OPT* is avoided because SSE would also match SSE2.
+  static const char kTerminators[] = { '.', '/', '_' };
+  std::string patterns = GTEST_FLAG_GET(filter);
+  // The negative section starts at the first '-'; open it if there is none.
+  if (patterns.find('-') == std::string::npos) patterns.push_back('-');
+  for (const char terminator : kTerminators) {
+    patterns.append(":").append(str);
+    patterns.push_back(terminator);
+    patterns.push_back('*');
+  }
+  GTEST_FLAG_SET(filter, patterns);
+}
+#endif // AOM_ARCH_ARM || AOM_ARCH_X86 || AOM_ARCH_X86_64
+#endif // !CONFIG_SHARED
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+
+#if !CONFIG_SHARED
+#if AOM_ARCH_AARCH64  // Filter out tests for flags missing from the CPU caps.
+ const int caps = aom_arm_cpu_caps();
+ if (!(caps & HAS_ARM_CRC32)) append_negative_gtest_filter("ARM_CRC32");
+ if (!(caps & HAS_NEON_DOTPROD)) append_negative_gtest_filter("NEON_DOTPROD");
+ if (!(caps & HAS_NEON_I8MM)) append_negative_gtest_filter("NEON_I8MM");
+ if (!(caps & HAS_SVE)) append_negative_gtest_filter("SVE");
+#elif AOM_ARCH_ARM  // Other Arm builds: only the NEON flag is checked.
+ const int caps = aom_arm_cpu_caps();
+ if (!(caps & HAS_NEON)) append_negative_gtest_filter("NEON");
+#endif // AOM_ARCH_ARM
+
+#if AOM_ARCH_X86 || AOM_ARCH_X86_64
+ const int simd_caps = x86_simd_caps();  // One filter per missing flag below.
+ if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
+ if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter("SSE");
+ if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter("SSE2");
+ if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter("SSE3");
+ if (!(simd_caps & HAS_SSSE3)) append_negative_gtest_filter("SSSE3");
+ if (!(simd_caps & HAS_SSE4_1)) append_negative_gtest_filter("SSE4_1");
+ if (!(simd_caps & HAS_SSE4_2)) append_negative_gtest_filter("SSE4_2");
+ if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter("AVX");
+ if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
+#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
+
+ // Shared library builds don't support whitebox tests that exercise internal
+ // symbols, so these rtcd calls are compiled only when !CONFIG_SHARED.
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+#endif // !CONFIG_SHARED
+
+ return RUN_ALL_TESTS();
+}
diff --git a/third_party/aom/test/test_runner.cmake b/third_party/aom/test/test_runner.cmake
new file mode 100644
index 0000000000..f0648d16be
--- /dev/null
+++ b/third_party/aom/test/test_runner.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+# Runs TEST_LIBAOM as one gtest shard; the caller must define all three inputs.
+if(NOT GTEST_TOTAL_SHARDS OR "${GTEST_SHARD_INDEX}" STREQUAL ""
+   OR NOT TEST_LIBAOM)
+  message(FATAL_ERROR
+ "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
+ must be defined.")
+endif()
+
+# set(ENV{x} ...) exports x to the child below; $ENV{x} would only expand it.
+set(ENV{GTEST_SHARD_INDEX} "${GTEST_SHARD_INDEX}")
+set(ENV{GTEST_TOTAL_SHARDS} "${GTEST_TOTAL_SHARDS}")
+execute_process(COMMAND ${TEST_LIBAOM} RESULT_VARIABLE test_result)
+set(test_message "Test shard ${GTEST_SHARD_INDEX}/${GTEST_TOTAL_SHARDS} result")
+message("${test_message}: ${test_result}")
+
+if(NOT "${test_result}" STREQUAL "0")
+  message(FATAL_ERROR "${test_message}: FAILED, non-zero exit code.")
+endif()
diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc
new file mode 100644
index 0000000000..39414e32e4
--- /dev/null
+++ b/third_party/aom/test/test_vector_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+constexpr int kThreads = 0;   // DecodeParam field: decoder thread count.
+constexpr int kFileName = 1;  // DecodeParam field: test vector file name.
+constexpr int kRowMT = 2;     // DecodeParam field: row-MT setting.
+
+using DecodeParam = std::tuple<int, const char *, int>;
+
+// Decodes one test vector and checks each output frame against an MD5 list.
+class TestVectorTest : public ::libaom_test::DecoderTest,
+                       public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) {}
+
+  ~TestVectorTest() override {
+    if (md5_file_) fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_NE(md5_file_, nullptr)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  void PreDecodeFrameHook(const libaom_test::CompressedVideoSource &video,
+                          libaom_test::Decoder *decoder) override {
+    if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
+  }
+
+  void DecompressedFrameHook(const aom_image_t &img,
+                             const unsigned int frame_number) override {
+    ASSERT_NE(md5_file_, nullptr);
+    char expected_md5[33];  // 32 checksum characters plus the NUL.
+    char junk[128];         // Second token of each record; never inspected.
+    // Read the expected checksum; the widths keep fscanf inside both buffers.
+    const int res = fscanf(md5_file_, "%32s %127s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING  // Hash 8-bit frames via a non-highbd copy.
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(nullptr, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+    }
+#endif
+
+    const char *actual_md5 = md5_res.Get();
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  unsigned int row_mt_;  // Passed to AV1D_SET_ROW_MT before the first frame.
+
+ private:
+  FILE *md5_file_;  // Expected checksums; closed by the destructor.
+};
+
+// This test runs through the whole set of test vectors and decodes them.
+// An md5 checksum is computed for each decoded frame of the video file. If
+// every checksum matches the expected md5 data, the test passes; otherwise,
+// the test fails.
+TEST_P(TestVectorTest, MD5Match) {
+  const DecodeParam param = GET_PARAM(1);
+  const std::string filename = std::get<kFileName>(param);
+  const auto has_ext = [&filename](size_t len, const char *ext) {
+    return filename.substr(filename.length() - len, len) == ext;
+  };  // Suffix test on the file name.
+
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+  cfg.threads = std::get<kThreads>(param);
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+  row_mt_ = std::get<kRowMT>(param);
+
+  char msg[256];
+  snprintf(msg, sizeof(msg) / sizeof(msg[0]) - 1, "file: %s threads: %d",
+           filename.c_str(), cfg.threads);
+  SCOPED_TRACE(msg);
+
+  // Choose the container reader from the file extension.
+  std::unique_ptr<libaom_test::CompressedVideoSource> video;
+  if (has_ext(3, "ivf")) {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else if (has_ext(4, "webm") || has_ext(3, "mkv")) {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_NE(video, nullptr);
+  video->Init();
+
+  // The expected checksums are read from "<test vector name>.md5".
+  OpenMD5File(filename + ".md5");
+
+  aom_codec_flags_t flags = 0;
+  set_cfg(cfg);
+  set_flags(flags);
+
+  // Decode the stream; DecompressedFrameHook() checks each frame's md5.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_AV1_DECODER
+AV1_INSTANTIATE_TEST_SUITE(
+ TestVectorTest,
+ ::testing::Combine(::testing::Values(1), // Single thread.
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors),
+ ::testing::Values(0)));  // Row-MT off.
+
+// Test AV1 decode with different numbers of threads, with row-MT off and on.
+INSTANTIATE_TEST_SUITE_P(
+ AV1MultiThreaded, TestVectorTest,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::Combine(
+ ::testing::Range(2, 9), // With 2 ~ 8 threads.
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors),
+ ::testing::Range(0, 2))));  // Row-MT off and on.
+
+#endif // CONFIG_AV1_DECODER
+
+} // namespace
diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc
new file mode 100644
index 0000000000..09736d1ed8
--- /dev/null
+++ b/third_party/aom/test/test_vectors.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libaom_test {
+
+// Number of elements in the array |x|, as an int.
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof((x)[0]))
+#if CONFIG_AV1_DECODER
+const char *const kAV1TestVectors[] = {  // Test vector file names.
+ "av1-1-b8-00-quantizer-00.ivf",
+ "av1-1-b8-00-quantizer-01.ivf",
+ "av1-1-b8-00-quantizer-02.ivf",
+ "av1-1-b8-00-quantizer-03.ivf",
+ "av1-1-b8-00-quantizer-04.ivf",
+ "av1-1-b8-00-quantizer-05.ivf",
+ "av1-1-b8-00-quantizer-06.ivf",
+ "av1-1-b8-00-quantizer-07.ivf",
+ "av1-1-b8-00-quantizer-08.ivf",
+ "av1-1-b8-00-quantizer-09.ivf",
+ "av1-1-b8-00-quantizer-10.ivf",
+ "av1-1-b8-00-quantizer-11.ivf",
+ "av1-1-b8-00-quantizer-12.ivf",
+ "av1-1-b8-00-quantizer-13.ivf",
+ "av1-1-b8-00-quantizer-14.ivf",
+ "av1-1-b8-00-quantizer-15.ivf",
+ "av1-1-b8-00-quantizer-16.ivf",
+ "av1-1-b8-00-quantizer-17.ivf",
+ "av1-1-b8-00-quantizer-18.ivf",
+ "av1-1-b8-00-quantizer-19.ivf",
+ "av1-1-b8-00-quantizer-20.ivf",
+ "av1-1-b8-00-quantizer-21.ivf",
+ "av1-1-b8-00-quantizer-22.ivf",
+ "av1-1-b8-00-quantizer-23.ivf",
+ "av1-1-b8-00-quantizer-24.ivf",
+ "av1-1-b8-00-quantizer-25.ivf",
+ "av1-1-b8-00-quantizer-26.ivf",
+ "av1-1-b8-00-quantizer-27.ivf",
+ "av1-1-b8-00-quantizer-28.ivf",
+ "av1-1-b8-00-quantizer-29.ivf",
+ "av1-1-b8-00-quantizer-30.ivf",
+ "av1-1-b8-00-quantizer-31.ivf",
+ "av1-1-b8-00-quantizer-32.ivf",
+ "av1-1-b8-00-quantizer-33.ivf",
+ "av1-1-b8-00-quantizer-34.ivf",
+ "av1-1-b8-00-quantizer-35.ivf",
+ "av1-1-b8-00-quantizer-36.ivf",
+ "av1-1-b8-00-quantizer-37.ivf",
+ "av1-1-b8-00-quantizer-38.ivf",
+ "av1-1-b8-00-quantizer-39.ivf",
+ "av1-1-b8-00-quantizer-40.ivf",
+ "av1-1-b8-00-quantizer-41.ivf",
+ "av1-1-b8-00-quantizer-42.ivf",
+ "av1-1-b8-00-quantizer-43.ivf",
+ "av1-1-b8-00-quantizer-44.ivf",
+ "av1-1-b8-00-quantizer-45.ivf",
+ "av1-1-b8-00-quantizer-46.ivf",
+ "av1-1-b8-00-quantizer-47.ivf",
+ "av1-1-b8-00-quantizer-48.ivf",
+ "av1-1-b8-00-quantizer-49.ivf",
+ "av1-1-b8-00-quantizer-50.ivf",
+ "av1-1-b8-00-quantizer-51.ivf",
+ "av1-1-b8-00-quantizer-52.ivf",
+ "av1-1-b8-00-quantizer-53.ivf",
+ "av1-1-b8-00-quantizer-54.ivf",
+ "av1-1-b8-00-quantizer-55.ivf",
+ "av1-1-b8-00-quantizer-56.ivf",
+ "av1-1-b8-00-quantizer-57.ivf",
+ "av1-1-b8-00-quantizer-58.ivf",
+ "av1-1-b8-00-quantizer-59.ivf",
+ "av1-1-b8-00-quantizer-60.ivf",
+ "av1-1-b8-00-quantizer-61.ivf",
+ "av1-1-b8-00-quantizer-62.ivf",
+ "av1-1-b8-00-quantizer-63.ivf",
+#if CONFIG_AV1_HIGHBITDEPTH  // The b10 vectors are listed only in this case.
+ "av1-1-b10-00-quantizer-00.ivf",
+ "av1-1-b10-00-quantizer-01.ivf",
+ "av1-1-b10-00-quantizer-02.ivf",
+ "av1-1-b10-00-quantizer-03.ivf",
+ "av1-1-b10-00-quantizer-04.ivf",
+ "av1-1-b10-00-quantizer-05.ivf",
+ "av1-1-b10-00-quantizer-06.ivf",
+ "av1-1-b10-00-quantizer-07.ivf",
+ "av1-1-b10-00-quantizer-08.ivf",
+ "av1-1-b10-00-quantizer-09.ivf",
+ "av1-1-b10-00-quantizer-10.ivf",
+ "av1-1-b10-00-quantizer-11.ivf",
+ "av1-1-b10-00-quantizer-12.ivf",
+ "av1-1-b10-00-quantizer-13.ivf",
+ "av1-1-b10-00-quantizer-14.ivf",
+ "av1-1-b10-00-quantizer-15.ivf",
+ "av1-1-b10-00-quantizer-16.ivf",
+ "av1-1-b10-00-quantizer-17.ivf",
+ "av1-1-b10-00-quantizer-18.ivf",
+ "av1-1-b10-00-quantizer-19.ivf",
+ "av1-1-b10-00-quantizer-20.ivf",
+ "av1-1-b10-00-quantizer-21.ivf",
+ "av1-1-b10-00-quantizer-22.ivf",
+ "av1-1-b10-00-quantizer-23.ivf",
+ "av1-1-b10-00-quantizer-24.ivf",
+ "av1-1-b10-00-quantizer-25.ivf",
+ "av1-1-b10-00-quantizer-26.ivf",
+ "av1-1-b10-00-quantizer-27.ivf",
+ "av1-1-b10-00-quantizer-28.ivf",
+ "av1-1-b10-00-quantizer-29.ivf",
+ "av1-1-b10-00-quantizer-30.ivf",
+ "av1-1-b10-00-quantizer-31.ivf",
+ "av1-1-b10-00-quantizer-32.ivf",
+ "av1-1-b10-00-quantizer-33.ivf",
+ "av1-1-b10-00-quantizer-34.ivf",
+ "av1-1-b10-00-quantizer-35.ivf",
+ "av1-1-b10-00-quantizer-36.ivf",
+ "av1-1-b10-00-quantizer-37.ivf",
+ "av1-1-b10-00-quantizer-38.ivf",
+ "av1-1-b10-00-quantizer-39.ivf",
+ "av1-1-b10-00-quantizer-40.ivf",
+ "av1-1-b10-00-quantizer-41.ivf",
+ "av1-1-b10-00-quantizer-42.ivf",
+ "av1-1-b10-00-quantizer-43.ivf",
+ "av1-1-b10-00-quantizer-44.ivf",
+ "av1-1-b10-00-quantizer-45.ivf",
+ "av1-1-b10-00-quantizer-46.ivf",
+ "av1-1-b10-00-quantizer-47.ivf",
+ "av1-1-b10-00-quantizer-48.ivf",
+ "av1-1-b10-00-quantizer-49.ivf",
+ "av1-1-b10-00-quantizer-50.ivf",
+ "av1-1-b10-00-quantizer-51.ivf",
+ "av1-1-b10-00-quantizer-52.ivf",
+ "av1-1-b10-00-quantizer-53.ivf",
+ "av1-1-b10-00-quantizer-54.ivf",
+ "av1-1-b10-00-quantizer-55.ivf",
+ "av1-1-b10-00-quantizer-56.ivf",
+ "av1-1-b10-00-quantizer-57.ivf",
+ "av1-1-b10-00-quantizer-58.ivf",
+ "av1-1-b10-00-quantizer-59.ivf",
+ "av1-1-b10-00-quantizer-60.ivf",
+ "av1-1-b10-00-quantizer-61.ivf",
+ "av1-1-b10-00-quantizer-62.ivf",
+ "av1-1-b10-00-quantizer-63.ivf",
+ "av1-1-b10-23-film_grain-50.ivf",
+ "av1-1-b10-24-monochrome.ivf",
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ "av1-1-b8-01-size-16x16.ivf",
+ "av1-1-b8-01-size-16x18.ivf",
+ "av1-1-b8-01-size-16x32.ivf",
+ "av1-1-b8-01-size-16x34.ivf",
+ "av1-1-b8-01-size-16x64.ivf",
+ "av1-1-b8-01-size-16x66.ivf",
+ "av1-1-b8-01-size-18x16.ivf",
+ "av1-1-b8-01-size-18x18.ivf",
+ "av1-1-b8-01-size-18x32.ivf",
+ "av1-1-b8-01-size-18x34.ivf",
+ "av1-1-b8-01-size-18x64.ivf",
+ "av1-1-b8-01-size-18x66.ivf",
+ "av1-1-b8-01-size-196x196.ivf",
+ "av1-1-b8-01-size-196x198.ivf",
+ "av1-1-b8-01-size-196x200.ivf",
+ "av1-1-b8-01-size-196x202.ivf",
+ "av1-1-b8-01-size-196x208.ivf",
+ "av1-1-b8-01-size-196x210.ivf",
+ "av1-1-b8-01-size-196x224.ivf",
+ "av1-1-b8-01-size-196x226.ivf",
+ "av1-1-b8-01-size-198x196.ivf",
+ "av1-1-b8-01-size-198x198.ivf",
+ "av1-1-b8-01-size-198x200.ivf",
+ "av1-1-b8-01-size-198x202.ivf",
+ "av1-1-b8-01-size-198x208.ivf",
+ "av1-1-b8-01-size-198x210.ivf",
+ "av1-1-b8-01-size-198x224.ivf",
+ "av1-1-b8-01-size-198x226.ivf",
+ "av1-1-b8-01-size-200x196.ivf",
+ "av1-1-b8-01-size-200x198.ivf",
+ "av1-1-b8-01-size-200x200.ivf",
+ "av1-1-b8-01-size-200x202.ivf",
+ "av1-1-b8-01-size-200x208.ivf",
+ "av1-1-b8-01-size-200x210.ivf",
+ "av1-1-b8-01-size-200x224.ivf",
+ "av1-1-b8-01-size-200x226.ivf",
+ "av1-1-b8-01-size-202x196.ivf",
+ "av1-1-b8-01-size-202x198.ivf",
+ "av1-1-b8-01-size-202x200.ivf",
+ "av1-1-b8-01-size-202x202.ivf",
+ "av1-1-b8-01-size-202x208.ivf",
+ "av1-1-b8-01-size-202x210.ivf",
+ "av1-1-b8-01-size-202x224.ivf",
+ "av1-1-b8-01-size-202x226.ivf",
+ "av1-1-b8-01-size-208x196.ivf",
+ "av1-1-b8-01-size-208x198.ivf",
+ "av1-1-b8-01-size-208x200.ivf",
+ "av1-1-b8-01-size-208x202.ivf",
+ "av1-1-b8-01-size-208x208.ivf",
+ "av1-1-b8-01-size-208x210.ivf",
+ "av1-1-b8-01-size-208x224.ivf",
+ "av1-1-b8-01-size-208x226.ivf",
+ "av1-1-b8-01-size-210x196.ivf",
+ "av1-1-b8-01-size-210x198.ivf",
+ "av1-1-b8-01-size-210x200.ivf",
+ "av1-1-b8-01-size-210x202.ivf",
+ "av1-1-b8-01-size-210x208.ivf",
+ "av1-1-b8-01-size-210x210.ivf",
+ "av1-1-b8-01-size-210x224.ivf",
+ "av1-1-b8-01-size-210x226.ivf",
+ "av1-1-b8-01-size-224x196.ivf",
+ "av1-1-b8-01-size-224x198.ivf",
+ "av1-1-b8-01-size-224x200.ivf",
+ "av1-1-b8-01-size-224x202.ivf",
+ "av1-1-b8-01-size-224x208.ivf",
+ "av1-1-b8-01-size-224x210.ivf",
+ "av1-1-b8-01-size-224x224.ivf",
+ "av1-1-b8-01-size-224x226.ivf",
+ "av1-1-b8-01-size-226x196.ivf",
+ "av1-1-b8-01-size-226x198.ivf",
+ "av1-1-b8-01-size-226x200.ivf",
+ "av1-1-b8-01-size-226x202.ivf",
+ "av1-1-b8-01-size-226x208.ivf",
+ "av1-1-b8-01-size-226x210.ivf",
+ "av1-1-b8-01-size-226x224.ivf",
+ "av1-1-b8-01-size-226x226.ivf",
+ "av1-1-b8-01-size-32x16.ivf",
+ "av1-1-b8-01-size-32x18.ivf",
+ "av1-1-b8-01-size-32x32.ivf",
+ "av1-1-b8-01-size-32x34.ivf",
+ "av1-1-b8-01-size-32x64.ivf",
+ "av1-1-b8-01-size-32x66.ivf",
+ "av1-1-b8-01-size-34x16.ivf",
+ "av1-1-b8-01-size-34x18.ivf",
+ "av1-1-b8-01-size-34x32.ivf",
+ "av1-1-b8-01-size-34x34.ivf",
+ "av1-1-b8-01-size-34x64.ivf",
+ "av1-1-b8-01-size-34x66.ivf",
+ "av1-1-b8-01-size-64x16.ivf",
+ "av1-1-b8-01-size-64x18.ivf",
+ "av1-1-b8-01-size-64x32.ivf",
+ "av1-1-b8-01-size-64x34.ivf",
+ "av1-1-b8-01-size-64x64.ivf",
+ "av1-1-b8-01-size-64x66.ivf",
+ "av1-1-b8-01-size-66x16.ivf",
+ "av1-1-b8-01-size-66x18.ivf",
+ "av1-1-b8-01-size-66x32.ivf",
+ "av1-1-b8-01-size-66x34.ivf",
+ "av1-1-b8-01-size-66x64.ivf",
+ "av1-1-b8-01-size-66x66.ivf",
+ "av1-1-b8-02-allintra.ivf",
+ "av1-1-b8-03-sizedown.mkv",
+ "av1-1-b8-03-sizeup.mkv",
+ "av1-1-b8-04-cdfupdate.ivf",
+ "av1-1-b8-05-mv.ivf",
+ "av1-1-b8-06-mfmv.ivf",
+ "av1-1-b8-16-intra_only-intrabc-extreme-dv.ivf",
+ "av1-1-b8-22-svc-L1T2.ivf",
+ "av1-1-b8-22-svc-L2T1.ivf",
+ "av1-1-b8-22-svc-L2T2.ivf",
+ "av1-1-b8-23-film_grain-50.ivf",
+ "av1-1-b8-24-monochrome.ivf"
+};
+const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);  // Entry count.
+#endif // CONFIG_AV1_DECODER
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/test_vectors.h b/third_party/aom/test/test_vectors.h
new file mode 100644
index 0000000000..be37f6e377
--- /dev/null
+++ b/third_party/aom/test/test_vectors.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_TEST_VECTORS_H_
+#define AOM_TEST_TEST_VECTORS_H_
+
+#include "config/aom_config.h"
+
+namespace libaom_test {
+
+#if CONFIG_AV1_DECODER
+extern const int kNumAV1TestVectors;  // Number of entries in kAV1TestVectors.
+extern const char *const kAV1TestVectors[];  // Test vector file names.
+#endif  // CONFIG_AV1_DECODER
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_TEST_VECTORS_H_
diff --git a/third_party/aom/test/tile_config_test.cc b/third_party/aom/test/tile_config_test.cc
new file mode 100644
index 0000000000..e2ac59284b
--- /dev/null
+++ b/third_party/aom/test/tile_config_test.cc
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+// One uniform-spacing tile configuration exercised by
+// UniformTileConfigTestLarge.
+typedef struct {
+ // Superblock size (64 selects 64x64 superblocks, any other value selects
+ // 128x128 in the test fixture)
+ const unsigned int sb_size;
+ // log2(number of tile rows)
+ const unsigned int tile_rows;
+ // log2(number of tile columns)
+ const unsigned int tile_cols;
+} uniformTileConfigParam;
+
+// Encoding modes the tile tests in this file are instantiated with. Builds
+// with CONFIG_REALTIME_ONLY set only get the real-time mode.
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+ { ::libaom_test::kRealTime };
+#else
+ { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood };
+#endif
+
+// Uniform tile configurations under test. Each entry is
+// { sb_size, log2(tile rows), log2(tile columns) }.
+static const uniformTileConfigParam uniformTileConfigParams[] = {
+ { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 },
+ { 128, 3, 2 }, { 64, 0, 0 }, { 64, 0, 2 }, { 64, 2, 0 }, { 64, 1, 2 },
+ { 64, 2, 2 }, { 64, 3, 3 }, { 64, 4, 4 }
+};
+
+// One non-uniform tile configuration exercised by
+// NonUniformTileConfigTestLarge. The width/height lists are copied into the
+// encoder configuration (cfg_.tile_widths / cfg_.tile_heights).
+// NOTE(review): the units of the list entries are presumably superblocks --
+// confirm against the aom_codec_enc_cfg_t documentation.
+typedef struct {
+ // Superblock size
+ const unsigned int sb_size;
+ // number of valid entries in tile_widths
+ const unsigned int tile_width_count;
+ // list of tile widths
+ int tile_widths[AOM_MAX_TILE_COLS];
+ // number of valid entries in tile_heights
+ const unsigned int tile_height_count;
+ // list of tile heights
+ int tile_heights[AOM_MAX_TILE_ROWS];
+} nonUniformTileConfigParam;
+
+// Non-uniform tile configurations under test. Each entry is
+// { sb_size, tile_width_count, { widths... }, tile_height_count,
+// { heights... } }; list entries that are not spelled out are
+// zero-initialized.
+const nonUniformTileConfigParam nonUniformTileConfigParams[] = {
+ { 64, 1, { 3 }, 1, { 3 } }, { 64, 2, { 1, 2 }, 2, { 1, 2 } },
+ { 64, 3, { 2, 3, 4 }, 2, { 2, 3 } }, { 128, 1, { 3 }, 1, { 3 } },
+ { 128, 2, { 1, 2 }, 2, { 1, 2 } }, { 128, 3, { 2, 3, 4 }, 2, { 2, 3 } },
+};
+
+// Returns the smallest shift k >= 0 for which (blk_size << k) >= target.
+static INLINE int tile_log2(int blk_size, int target) {
+  int shift = 0;
+  while ((blk_size << shift) < target) {
+    ++shift;
+  }
+  return shift;
+}
+
+// This class is used to validate tile configuration for uniform spacing.
+// It is parameterized on (encoding mode, uniformTileConfigParam, rate control
+// mode). The tile counts reported by the decoder for each frame are compared
+// against the configured counts; any mismatch sets tile_config_violated_.
+class UniformTileConfigTestLarge
+ : public ::libaom_test::CodecTestWith3Params<
+ libaom_test::TestMode, uniformTileConfigParam, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ UniformTileConfigTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ tile_config_param_(GET_PARAM(2)), end_usage_check_(GET_PARAM(3)) {
+ tile_config_violated_ = false;
+ // Default caps derived from the AOM_MAX_TILE_* limits. The test bodies
+ // replace them with caps that also account for the frame dimensions.
+ max_tile_cols_log2_ = tile_log2(1, AOM_MAX_TILE_COLS);
+ max_tile_rows_log2_ = tile_log2(1, AOM_MAX_TILE_ROWS);
+ }
+ ~UniformTileConfigTestLarge() override = default;
+
+ // Configures a single-threaded encode with a 1/30 timebase, the
+ // parameterized rate control mode and a 19 frame lag.
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = end_usage_check_;
+ cfg_.g_threads = 1;
+ cfg_.g_lag_in_frames = 19;
+ }
+
+ // Every encoded frame is decoded so HandleDecodeResult() can inspect it.
+ bool DoDecode() const override { return true; }
+
+ // Applies the tile layout, speed, alt-ref and superblock size controls once,
+ // before the first frame is encoded.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_config_param_.tile_cols);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_config_param_.tile_rows);
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+ tile_config_param_.sb_size == 64
+ ? AOM_SUPERBLOCK_SIZE_64X64
+ : AOM_SUPERBLOCK_SIZE_128X128);
+ }
+ }
+
+ // Compares the decoder-reported tile counts with the expected ones and
+ // records a violation on mismatch. Returns true when decoding succeeded.
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ aom_tile_info tile_info;
+ // Expected count per dimension: 2^(configured log2), clamped to
+ // 2^(maximum log2) for this frame.
+ int config_tile_columns = AOMMIN(1 << (int)tile_config_param_.tile_cols,
+ 1 << max_tile_cols_log2_);
+ int config_tile_rows = AOMMIN(1 << (int)tile_config_param_.tile_rows,
+ 1 << max_tile_rows_log2_);
+
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+ if (tile_info.tile_columns != config_tile_columns ||
+ tile_info.tile_rows != config_tile_rows) {
+ tile_config_violated_ = true;
+ }
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ const uniformTileConfigParam tile_config_param_;
+ // log2 of the largest tile column/row count expected for the current clip.
+ int max_tile_cols_log2_;
+ int max_tile_rows_log2_;
+ // Set to true once any decoded frame reports an unexpected tile layout.
+ bool tile_config_violated_;
+ aom_rc_mode end_usage_check_;
+};
+
+// This class is used to validate tile configuration for non uniform spacing.
+// It is parameterized on (encoding mode, nonUniformTileConfigParam, rate
+// control mode). The tile widths/heights reported by the decoder are compared
+// against the configured lists; any mismatch sets tile_config_violated_.
+class NonUniformTileConfigTestLarge
+ : public ::libaom_test::CodecTestWith3Params<
+ libaom_test::TestMode, nonUniformTileConfigParam, aom_rc_mode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ NonUniformTileConfigTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ tile_config_param_(GET_PARAM(2)), rc_end_usage_(GET_PARAM(3)) {
+ tile_config_violated_ = false;
+ }
+ ~NonUniformTileConfigTestLarge() override = default;
+
+ // Configures a single-threaded encode and copies the parameterized tile
+ // width/height lists into the encoder configuration.
+ void SetUp() override {
+ InitializeConfig(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_end_usage = rc_end_usage_;
+ cfg_.g_threads = 1;
+ cfg_.g_lag_in_frames = 35;
+ cfg_.rc_target_bitrate = 1000;
+ cfg_.tile_width_count = tile_config_param_.tile_width_count;
+ // Only the first tile_width_count entries are copied.
+ memcpy(cfg_.tile_widths, tile_config_param_.tile_widths,
+ sizeof(tile_config_param_.tile_widths[0]) *
+ tile_config_param_.tile_width_count);
+ cfg_.tile_height_count = tile_config_param_.tile_height_count;
+ // Only the first tile_height_count entries are copied.
+ memcpy(cfg_.tile_heights, tile_config_param_.tile_heights,
+ sizeof(tile_config_param_.tile_heights[0]) *
+ tile_config_param_.tile_height_count);
+ }
+
+ // Every encoded frame is decoded so HandleDecodeResult() can inspect it.
+ bool DoDecode() const override { return true; }
+
+ // Applies the speed, alt-ref and superblock size controls once, before the
+ // first frame is encoded.
+ void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, 5);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AV1E_SET_SUPERBLOCK_SIZE,
+ tile_config_param_.sb_size == 64
+ ? AOM_SUPERBLOCK_SIZE_64X64
+ : AOM_SUPERBLOCK_SIZE_128X128);
+ }
+ }
+
+ // Checks the decoder-reported tile sizes against the configured lists and
+ // records a violation on mismatch. Returns true when decoding succeeded.
+ bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) override {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ if (AOM_CODEC_OK == res_dec) {
+ aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+ aom_tile_info tile_info;
+ AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+
+ // check validity of tile cols
+ // All columns except the last must match the configured width exactly.
+ // The configured list is walked cyclically: tile_col wraps around modulo
+ // tile_width_count.
+ int tile_col_idx, tile_col = 0;
+ for (tile_col_idx = 0; tile_col_idx < tile_info.tile_columns - 1;
+ tile_col_idx++) {
+ if (tile_config_param_.tile_widths[tile_col] !=
+ tile_info.tile_widths[tile_col_idx])
+ tile_config_violated_ = true;
+ tile_col = (tile_col + 1) % (int)tile_config_param_.tile_width_count;
+ }
+ // last column may not be able to accommodate config, but if it is
+ // greater than what is configured, there is a violation.
+ if (tile_config_param_.tile_widths[tile_col] <
+ tile_info.tile_widths[tile_col_idx])
+ tile_config_violated_ = true;
+
+ // check validity of tile rows
+ // Same scheme as for the columns, using the configured heights.
+ int tile_row_idx, tile_row = 0;
+ for (tile_row_idx = 0; tile_row_idx < tile_info.tile_rows - 1;
+ tile_row_idx++) {
+ if (tile_config_param_.tile_heights[tile_row] !=
+ tile_info.tile_heights[tile_row_idx])
+ tile_config_violated_ = true;
+ tile_row = (tile_row + 1) % (int)tile_config_param_.tile_height_count;
+ }
+ // last row may not be able to accommodate config, but if it is
+ // greater than what is configured, there is a violation.
+ if (tile_config_param_.tile_heights[tile_row] <
+ tile_info.tile_heights[tile_row_idx])
+ tile_config_violated_ = true;
+ }
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ const nonUniformTileConfigParam tile_config_param_;
+ // Set to true once any decoded frame reports an unexpected tile size.
+ bool tile_config_violated_;
+ aom_rc_mode rc_end_usage_;
+};
+
+// Encodes niklas_1280_720_30.y4m with the parameterized uniform tile layout
+// and requires that no decoded frame violates it.
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTest) {
+  ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  // The tile count per dimension is capped both by the number of whole
+  // superblocks that fit into the frame and by the AOM_MAX_TILE_* limits.
+  const int sb_size = static_cast<int>(tile_config_param_.sb_size);
+  const int sb_cols = video.img()->w / sb_size;
+  const int sb_rows = video.img()->h / sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(sb_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(sb_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+// Same check as UniformTileConfigTest, run on screendata.y4m.
+TEST_P(UniformTileConfigTestLarge, UniformTileConfigTestLowRes) {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 1);
+  ASSERT_NO_FATAL_FAILURE(video.Begin());
+
+  // The tile count per dimension is capped both by the number of whole
+  // superblocks that fit into the frame and by the AOM_MAX_TILE_* limits.
+  const int sb_size = static_cast<int>(tile_config_param_.sb_size);
+  const int sb_cols = video.img()->w / sb_size;
+  const int sb_rows = video.img()->h / sb_size;
+  max_tile_cols_log2_ = tile_log2(1, AOMMIN(sb_cols, AOM_MAX_TILE_COLS));
+  max_tile_rows_log2_ = tile_log2(1, AOMMIN(sb_rows, AOM_MAX_TILE_ROWS));
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(tile_config_violated_, false);
+}
+
+// Encodes niklas_1280_720_30.y4m with the parameterized non-uniform tile
+// layout and requires that no decoded frame violates it.
+TEST_P(NonUniformTileConfigTestLarge, NonUniformTileConfigTest) {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 1);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(tile_config_violated_, false);
+}
+
+// Both tile configuration suites are run for every combination of encoding
+// mode, tile configuration and rate control mode (Q, VBR, CBR, CQ).
+AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge,
+ ::testing::ValuesIn(kTestModeParams),
+ ::testing::ValuesIn(uniformTileConfigParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge,
+ ::testing::ValuesIn(kTestModeParams),
+ ::testing::ValuesIn(nonUniformTileConfigParams),
+ ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+
+// One tile group configuration exercised by TileGroupTestLarge.
+// NOTE(review): num_tile_rows / num_tile_cols are passed unchanged to
+// AV1E_SET_TILE_ROWS / AV1E_SET_TILE_COLUMNS, which the uniform tile test in
+// this file feeds with log2 values -- confirm whether these are counts or
+// log2 counts.
+typedef struct {
+ // Number of tile groups to set.
+ const int num_tg;
+ // Number of tile rows to set
+ const int num_tile_rows;
+ // Number of tile columns to set
+ const int num_tile_cols;
+} TileGroupConfigParams;
+
+// Tile group configurations under test. Each entry is
+// { num_tg, num_tile_rows, num_tile_cols }.
+static const TileGroupConfigParams tileGroupTestParams[] = {
+ { 5, 4, 4 }, { 3, 3, 3 }, { 5, 3, 3 }, { 7, 5, 5 }, { 7, 3, 3 }, { 7, 4, 4 }
+};
+
+// Streams a human-readable description of a TileGroupConfigParams value
+// (presumably used when the test framework prints test parameters).
+std::ostream &operator<<(std::ostream &os,
+                         const TileGroupConfigParams &test_arg) {
+  os << "TileGroupConfigParams { num_tg:" << test_arg.num_tg;
+  os << " num_tile_rows:" << test_arg.num_tile_rows;
+  os << " num_tile_cols:" << test_arg.num_tile_cols << " }";
+  return os;
+}
+
+// This class is used to test number of tile groups present in header.
+// It is parameterized on (encoding mode, TileGroupConfigParams). Every decoded
+// frame that is not a show-existing frame must report exactly the requested
+// number of tile groups.
+class TileGroupTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 TileGroupConfigParams>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  // All members are initialized here so that nothing is read before it has a
+  // defined value, even if a decoder control call fails to fill it in.
+  TileGroupTestLarge()
+      : EncoderTest(GET_PARAM(0)), show_existing_frame_(0),
+        tile_group_config_violated_(false), encoding_mode_(GET_PARAM(1)),
+        tile_group_config_params_(GET_PARAM(2)) {}
+  ~TileGroupTestLarge() override = default;
+
+  // Configures a single-threaded constant-quality (AOM_Q) encode with a 1/30
+  // timebase.
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.g_threads = 1;
+  }
+
+  // Every encoded frame is decoded so HandleDecodeResult() can inspect it.
+  bool DoDecode() const override { return true; }
+
+  // Applies the speed, tile group count and tile layout controls once, before
+  // the first frame is encoded.
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 5);
+      encoder->Control(AV1E_SET_NUM_TG, tile_group_config_params_.num_tg);
+      encoder->Control(AV1E_SET_TILE_COLUMNS,
+                       tile_group_config_params_.num_tile_cols);
+      encoder->Control(AV1E_SET_TILE_ROWS,
+                       tile_group_config_params_.num_tile_rows);
+    }
+  }
+
+  // Queries the tile info and show-existing-frame flag of the frame that was
+  // just decoded and flags a violation when the tile group count differs from
+  // the requested one. Returns true when decoding succeeded.
+  bool HandleDecodeResult(const aom_codec_err_t res_dec,
+                          libaom_test::Decoder *decoder) override {
+    EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (AOM_CODEC_OK == res_dec) {
+      aom_tile_info tile_info;
+      aom_codec_ctx_t *ctx_dec = decoder->GetDecoder();
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_TILE_INFO, &tile_info);
+      AOM_CODEC_CONTROL_TYPECHECKED(ctx_dec, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                                    &show_existing_frame_);
+      // Show-existing frames are exempt from the tile group count check.
+      if (tile_info.num_tile_groups != tile_group_config_params_.num_tg &&
+          !show_existing_frame_)
+        tile_group_config_violated_ = true;
+      EXPECT_EQ(tile_group_config_violated_, false);
+    }
+    return AOM_CODEC_OK == res_dec;
+  }
+
+  // Written by AOMD_GET_SHOW_EXISTING_FRAME_FLAG for the most recent frame.
+  int show_existing_frame_;
+  // Latched to true once any frame reports an unexpected tile group count.
+  bool tile_group_config_violated_;
+  ::libaom_test::TestMode encoding_mode_;
+  const TileGroupConfigParams tile_group_config_params_;
+};
+
+// Encodes niklas_640_480_30.yuv (640x480) with the parameterized tile group
+// settings; the per-frame check lives in HandleDecodeResult().
+TEST_P(TileGroupTestLarge, TileGroupCountTest) {
+ libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Run the tile group test for every encoding mode / configuration pair.
+AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge,
+ ::testing::ValuesIn(kTestModeParams),
+ ::testing::ValuesIn(tileGroupTestParams));
+} // namespace
diff --git a/third_party/aom/test/tile_independence_test.cc b/third_party/aom/test/tile_independence_test.cc
new file mode 100644
index 0000000000..84406dd3fb
--- /dev/null
+++ b/third_party/aom/test/tile_independence_test.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/md5_helper.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+// Encodes a clip with a parameterized tile layout and decodes every packet
+// twice: once with a decoder using the normal tile order and once with a
+// decoder using the inverted tile order. The MD5 of the two outputs must
+// match. Parameters are (tile columns, tile rows, number of tile groups).
+class TileIndependenceTest
+    : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  // Creates the forward and inverted-order decoders up front.
+  TileIndependenceTest()
+      : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
+        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+        n_tile_groups_(GET_PARAM(3)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 704;
+    cfg.h = 576;
+    cfg.threads = 1;
+    cfg.allow_lowbitdepth = 1;
+    fw_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
+
+    if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
+      fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+  }
+
+  ~TileIndependenceTest() override {
+    delete fw_dec_;
+    delete inv_dec_;
+  }
+
+  void SetUp() override { InitializeConfig(libaom_test::kTwoPassGood); }
+
+  // Sets the tile layout and speed before the first frame, and the tile group
+  // count before frame 3.
+  void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                          libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+      SetCpuUsed(encoder);
+    } else if (video->frame() == 3) {
+      encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+    }
+  }
+
+  // Selects the encoder speed; derived fixtures override this.
+  virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+    static const int kCpuUsed = 3;
+    encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+  }
+
+  // Decodes one packet with |dec| and folds the first output image into
+  // |md5|. A decode error or a missing output image aborts the encode loop
+  // and fails the test.
+  void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+                 ::libaom_test::MD5 *md5) {
+    const aom_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = dec->GetDxData().Next();
+    // Guard against a decoder that produced no frame: hashing a null image
+    // would dereference it instead of reporting a test failure.
+    if (img == nullptr) {
+      abort_ = true;
+      ASSERT_TRUE(img != nullptr) << "decoder returned no output image";
+    }
+    md5->Add(img);
+  }
+
+  // Feeds every encoded packet to both decoders.
+  void FramePktHook(const aom_codec_cx_pkt_t *pkt) override {
+    UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
+    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
+  }
+
+  // Runs the encode/decode loop and compares the two MD5 digests.
+  void DoTest() {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_lag_in_frames = 12;
+    cfg_.rc_end_usage = AOM_VBR;
+
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+                                       timebase.den, timebase.num, 0, 5);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    const char *md5_fw_str = md5_fw_order_.Get();
+    const char *md5_inv_str = md5_inv_order_.Get();
+    ASSERT_STREQ(md5_fw_str, md5_inv_str);
+  }
+
+  ::libaom_test::MD5 md5_fw_order_, md5_inv_order_;
+  ::libaom_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+  int n_tile_groups_;
+};
+
+// Run an encode with 2 or 4 tiles, and do the decode both in normal and
+// inverted tile ordering. Ensure that the MD5 of the output in both cases
+// is identical. If so, tiles are considered independent and the test passes.
+// Large-scale tile mode is switched off in the encoder config and in both
+// decoders for this variant.
+TEST_P(TileIndependenceTest, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+// Variant of TileIndependenceTest that encodes with cpu-used 0 instead of 3.
+class TileIndependenceTestLarge : public TileIndependenceTest {
+ void SetCpuUsed(libaom_test::Encoder *encoder) override {
+ static const int kCpuUsed = 0;
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ }
+};
+
+// Same MD5 comparison as TileIndependenceTest.MD5Match, with large-scale tile
+// mode switched off, using the TileIndependenceTestLarge fixture.
+TEST_P(TileIndependenceTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+// Parameter order: tile columns {0, 1}, tile rows {0, 1}, tile groups
+// {1, 2, 4}.
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+
+// Large-scale tile variant: reuses the TileIndependenceTest fixture unchanged.
+class TileIndependenceLSTest : public TileIndependenceTest {};
+
+// Enables large-scale tile mode in the encoder config and in both decoders
+// (together with AV1D_EXT_TILE_DEBUG) before running the MD5 comparison.
+TEST_P(TileIndependenceLSTest, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+ fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+ inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+// Large-scale tile variant of TileIndependenceTestLarge (cpu-used 0).
+class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {};
+
+// Enables large-scale tile mode in the encoder config and in both decoders
+// (together with AV1D_EXT_TILE_DEBUG) before running the MD5 comparison.
+TEST_P(TileIndependenceLSTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+ fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+ inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+// The large-scale tile suites use a single configuration: 6 for both tile
+// parameters and one tile group.
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTest, ::testing::Values(6),
+ ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_SUITE(TileIndependenceLSTestLarge, ::testing::Values(6),
+ ::testing::Values(6), ::testing::Values(1));
+} // namespace
diff --git a/third_party/aom/test/time_stamp_test.cc b/third_party/aom/test/time_stamp_test.cc
new file mode 100644
index 0000000000..5de98b719e
--- /dev/null
+++ b/third_party/aom/test/time_stamp_test.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Test AOM timestamp handling
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Dimensions and frame limit applied to every DummyTimebaseVideoSource.
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 3;
+
+// Dummy video source whose timebase, frame rate and first presentation
+// timestamp can be chosen by the test.
+class DummyTimebaseVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+  // |num| / |den| is the timebase reported by this source. The frame rate
+  // defaults to 30/1 and the starting pts to 0.
+  DummyTimebaseVideoSource(int num, int den)
+      : framerate_numerator_(30), framerate_denominator_(1), starting_pts_(0) {
+    SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    set_limit(kFramesToEncode);
+    timebase_.num = num;
+    timebase_.den = den;
+  }
+
+  // Replaces the frame rate (numerator / denominator frames per second).
+  void SetFramerate(int numerator, int denominator) {
+    framerate_numerator_ = numerator;
+    framerate_denominator_ = denominator;
+  }
+
+  // Duration of a single frame, expressed in timebase units.
+  double FrameDuration() const {
+    const double ticks_per_second =
+        static_cast<double>(timebase_.den) / timebase_.num;
+    const double frames_per_second =
+        static_cast<double>(framerate_numerator_) / framerate_denominator_;
+    return ticks_per_second / frames_per_second;
+  }
+
+  // Timestamp of the current frame: frame index times the frame duration,
+  // offset by the starting pts and rounded to the nearest integer.
+  aom_codec_pts_t pts() const override {
+    return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() +
+                                        starting_pts_ + 0.5);
+  }
+
+  // Frame duration rounded to the nearest integer number of timebase units.
+  unsigned long duration() const override {
+    return static_cast<unsigned long>(FrameDuration() + 0.5);
+  }
+
+  aom_rational_t timebase() const override { return timebase_; }
+
+  // Sets the pts reported for frame 0.
+  void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
+
+ private:
+  aom_rational_t timebase_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  int64_t starting_pts_;
+};
+
+// Encoder test fixture parameterized on the encoding mode; it only forwards
+// the mode to InitializeConfig() and otherwise uses the EncoderTest defaults.
+class TimestampTest
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
+ protected:
+ TimestampTest() : EncoderTest(GET_PARAM(0)) {}
+ ~TimestampTest() override = default;
+
+ void SetUp() override { InitializeConfig(GET_PARAM(1)); }
+};
+
+// Tests encoding in millisecond timebase (1/1000) with the default frame
+// limit of the dummy source.
+TEST_P(TimestampTest, EncodeFrames) {
+ DummyTimebaseVideoSource video(1, 1000);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Tests encoding a single frame in microsecond timebase (1/1000000).
+TEST_P(TimestampTest, TestMicrosecondTimebase) {
+ // Set the timebase to microseconds.
+ DummyTimebaseVideoSource video(1, 1000000);
+ video.set_limit(1);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Tests encoding in millisecond timebase with a very large starting pts.
+// NOTE(review): the starting value is presumably chosen so that an internal
+// timestamp conversion would overflow -- confirm against the encoder.
+TEST_P(TimestampTest, TestAv1Rollover) {
+ DummyTimebaseVideoSource video(1, 1000);
+ video.set_starting_pts(922337170351ll);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+// Real-time-only builds run the timestamp tests in real-time mode only; other
+// builds additionally run them in two-pass good-quality mode.
+#if CONFIG_REALTIME_ONLY
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+ ::testing::Values(::libaom_test::kRealTime));
+#else
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+ ::testing::Values(::libaom_test::kRealTime,
+ ::libaom_test::kTwoPassGood));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/tools_common.sh b/third_party/aom/test/tools_common.sh
new file mode 100755
index 0000000000..cb9eba1727
--- /dev/null
+++ b/third_party/aom/test/tools_common.sh
@@ -0,0 +1,520 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file contains shell code shared by test scripts for libaom tools.
+
+# Use $AOM_TEST_TOOLS_COMMON_SH as a pseudo include guard.
+if [ -z "${AOM_TEST_TOOLS_COMMON_SH}" ]; then
+AOM_TEST_TOOLS_COMMON_SH=included
+
+set -e
+devnull='> /dev/null 2>&1'
+AOM_TEST_PREFIX=""
+
+# Echoes all arguments to stderr.
+elog() {
+  echo "$@" >&2
+}
+
+vlog() {
+ if [ "${AOM_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
+ echo "$@"
+ fi
+}
+
+# Sets $AOM_TOOL_TEST to the name specified by positional parameter one. The
+# variable marks the test as active until test_end() resets it.
+test_begin() {
+ AOM_TOOL_TEST="${1}"
+}
+
+# Marks the active test as finished by resetting AOM_TOOL_TEST to '<unset>'.
+# Positional parameter one must name the active test; on a mismatch a FAIL
+# report is echoed, AOM_TOOL_TEST is left untouched and 1 is returned.
+test_end() {
+  if [ "$1" = "${AOM_TOOL_TEST}" ]; then
+    AOM_TOOL_TEST='<unset>'
+    return 0
+  fi
+
+  echo "FAIL completed test mismatch!."
+  echo " completed test: ${1}"
+  echo " active test: ${AOM_TOOL_TEST}."
+  return 1
+}
+
+# Echoes the target configuration being tested, i.e. the cmake arguments
+# recorded in $LIBAOM_CONFIG_PATH/config/aom_config.c.
+test_configuration_target() {
+ aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c"
+ # Clean up the cfg pointer line from aom_config.c for easier re-use by
+ # someone examining a failure in the example tests.
+ # 1. Use awk to find the first line of aom_config.c that mentions cfg, split
+ # it using ' = ' as separator and print the last field.
+ # 2. Abuse sed to consume the leading " and trailing "; from the assignment
+ # to the cfg pointer.
+ cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \
+ | sed -e s/\"// -e s/\"\;//)
+ echo cmake generated via command: cmake path/to/aom ${cmake_config}
+}
+
+# Trap function used for failure reports and tool output directory removal.
+# When $AOM_TOOL_TEST is non-empty and does not match the string '<unset>',
+# reports failure of the test stored in $AOM_TOOL_TEST.
+# The output directory $AOM_TEST_OUTPUT_DIR is removed unless
+# $AOM_TEST_PRESERVE_OUTPUT is set to yes.
+cleanup() {
+ if [ -n "${AOM_TOOL_TEST}" ] && [ "${AOM_TOOL_TEST}" != '<unset>' ]; then
+ echo "FAIL: $AOM_TOOL_TEST"
+ fi
+ if [ "${AOM_TEST_PRESERVE_OUTPUT}" = "yes" ]; then
+ return
+ fi
+ if [ -n "${AOM_TEST_OUTPUT_DIR}" ] && [ -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ rm -rf "${AOM_TEST_OUTPUT_DIR}"
+ fi
+}
+
+# Echoes the version string assigned to the VERSION_STRING_NOSP variable defined
+# in $LIBAOM_CONFIG_PATH/config/aom_version.h to stdout, prefixed with "v".
+# Note: aom_version_h and aom_version are not declared local.
+cmake_version() {
+ aom_version_h="${LIBAOM_CONFIG_PATH}/config/aom_version.h"
+
+ # Find VERSION_STRING_NOSP line, split it with '"' and print the next to last
+ # field to output the version string to stdout.
+ aom_version=$(awk -F \" '/VERSION_STRING_NOSP/ {print $(NF-1)}' \
+ "${aom_version_h}")
+ echo "v${aom_version}"
+}
+
+# Echoes current git version as reported by running 'git describe', or the
+# version used by the cmake build when git is unavailable.
+# 'git describe' is run from the directory containing the invoked script, in a
+# subshell so the caller's working directory is not changed.
+source_version() {
+ if git --version > /dev/null 2>&1; then
+ (cd "$(dirname "${0}")"
+ git describe)
+ else
+ cmake_version
+ fi
+}
+
+# Echoes warnings to stdout when source version and CMake build generated
+# version are out of sync. The two version strings themselves are only shown
+# in verbose mode (via vlog).
+# Note: the results are stored in the non-local variables cmake_version and
+# source_version, which share their names with the functions being called.
+check_version_strings() {
+ cmake_version=$(cmake_version)
+ source_version=$(source_version)
+
+ if [ "${cmake_version}" != "${source_version}" ]; then
+ echo "Warning: version has changed since last cmake run."
+ vlog " cmake version: ${cmake_version} version now: ${source_version}"
+ fi
+}
+
+# $1 is the name of an environment variable containing a directory name to
+# test. Logs an error to stderr and returns 1 when the variable's value is not
+# an existing directory.
+test_env_var_dir() {
+ # Indirect expansion: read the value of the variable whose name is in $1.
+ local dir=$(eval echo "\${$1}")
+ if [ ! -d "${dir}" ]; then
+ elog "'${dir}': No such directory"
+ elog "The $1 environment variable must be set to a valid directory."
+ return 1
+ fi
+}
+
+# This script requires that the LIBAOM_BIN_PATH, LIBAOM_CONFIG_PATH, and
+# LIBAOM_TEST_DATA_PATH variables are in the environment: Confirm that
+# the variables are set and that they all evaluate to directory paths.
+# Checking stops at the first variable that fails, and the function then
+# returns a non-zero status.
+verify_aom_test_environment() {
+ test_env_var_dir "LIBAOM_BIN_PATH" \
+ && test_env_var_dir "LIBAOM_CONFIG_PATH" \
+ && test_env_var_dir "LIBAOM_TEST_DATA_PATH"
+}
+
+# Greps aom_config.h in LIBAOM_CONFIG_PATH for positional parameter one, which
+# should be a LIBAOM preprocessor flag. Echoes yes to stdout when the feature
+# is available, i.e. when a matching line ends in 1. Echoes nothing otherwise.
+# Note: aom_config_option, aom_config_file and config_line are not declared
+# local.
+aom_config_option_enabled() {
+  aom_config_option="${1}"
+  aom_config_file="${LIBAOM_CONFIG_PATH}/config/aom_config.h"
+  config_line=$(grep "${aom_config_option}" "${aom_config_file}")
+  # grep -E replaces the obsolescent egrep, which newer GNU grep releases
+  # flag with a warning on stderr.
+  if echo "${config_line}" | grep -E -q '1$'; then
+    echo yes
+  fi
+}
+
+# Echoes yes when output of test_configuration_target() contains win32 or win64.
+# Echoes nothing otherwise.
+is_windows_target() {
+ if test_configuration_target \
+ | grep -q -e win32 -e win64 > /dev/null 2>&1; then
+ echo yes
+ fi
+}
+
+# Echoes path to $1 when it's executable and exists in one of the directories
+# included in $tool_paths, and returns 0. When no candidate matches, nothing is
+# echoed and 1 is returned. Caller is responsible for testing the string once
+# the function returns.
+# Candidates are searched in this order, relative to $LIBAOM_BIN_PATH:
+# ., .., tools, ../tools. $AOM_TEST_EXE_SUFFIX is appended to the tool name.
+aom_tool_path() {
+  local tool_name="$1"
+  local root_path="${LIBAOM_BIN_PATH}"
+  local suffix="${AOM_TEST_EXE_SUFFIX}"
+  local tool_paths="\
+    ${root_path}/${tool_name}${suffix} \
+    ${root_path}/../${tool_name}${suffix} \
+    ${root_path}/tools/${tool_name}${suffix} \
+    ${root_path}/../tools/${tool_name}${suffix}"
+
+  # The list is intentionally expanded unquoted so that it is split into the
+  # individual candidate paths.
+  for tool_path in ${tool_paths}; do
+    if [ -x "${tool_path}" ] && [ -f "${tool_path}" ]; then
+      echo "${tool_path}"
+      return 0
+    fi
+  done
+
+  return 1
+}
+
+# Echoes yes to stdout when the file named by positional parameter one exists
+# in LIBAOM_BIN_PATH, and is executable. $AOM_TEST_EXE_SUFFIX is appended to
+# the name. Otherwise echoes nothing and returns a non-zero status.
+aom_tool_available() {
+ local tool_name="$1"
+ local tool="${LIBAOM_BIN_PATH}/${tool_name}${AOM_TEST_EXE_SUFFIX}"
+ [ -x "${tool}" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_DECODER. Otherwise echoes nothing and returns a non-zero status.
+av1_decode_available() {
+ [ "$(aom_config_option_enabled CONFIG_AV1_DECODER)" = "yes" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_ENCODER. Otherwise echoes nothing and returns a non-zero status.
+av1_encode_available() {
+ [ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes
+}
+
+# Echoes "fast" encode params for use with aomenc, one option per line. The
+# frame limit is taken from $AV1_ENCODE_TEST_FRAME_LIMIT.
+aomenc_encode_test_fast_params() {
+ echo "--cpu-used=2
+ --limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+ --lag-in-frames=0
+ --test-decode=fatal"
+}
+
+# Echoes realtime encode params for use with aomenc, one option per line. The
+# frame limit is taken from $AV1_ENCODE_TEST_FRAME_LIMIT. Each option is
+# listed exactly once.
+aomenc_encode_test_rt_params() {
+  echo "--limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+ --test-decode=fatal
+ --enable-tpl-model=0
+ --deltaq-mode=0
+ --enable-order-hint=0
+ --profile=0
+ --static-thresh=0
+ --end-usage=cbr
+ --cpu-used=7
+ --passes=1
+ --usage=1
+ --lag-in-frames=0
+ --aq-mode=3
+ --enable-obmc=0
+ --enable-warped-motion=0
+ --enable-ref-frame-mvs=0
+ --enable-cdef=1
+ --coeff-cost-upd-freq=3
+ --mode-cost-upd-freq=3
+ --mv-cost-upd-freq=3"
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_HIGHBITDEPTH. Otherwise echoes nothing and returns a non-zero
+# status.
+highbitdepth_available() {
+ [ "$(aom_config_option_enabled CONFIG_AV1_HIGHBITDEPTH)" = "yes" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_WEBM_IO. Otherwise echoes nothing and returns a non-zero status.
+webm_io_available() {
+ [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_REALTIME_ONLY. Otherwise echoes nothing and returns a non-zero status.
+realtime_only_build() {
+ [ "$(aom_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ] && echo yes
+}
+
+# Filters strings from $1 using the filter specified by $2. Filter behavior
+# depends on the presence of $3. When $3 is present, strings that match the
+# filter are excluded. When $3 is omitted, strings matching the filter are
+# included.
+# The filtered result is echoed to stdout.
+filter_strings() {
+ strings=${1}
+ filter=${2}
+ exclude=${3}
+
+ if [ -n "${exclude}" ]; then
+ # When positional parameter three exists the caller wants to remove strings.
+ # Tell grep to invert matches using the -v argument.
+ exclude='-v'
+ else
+ unset exclude
+ fi
+
+ if [ -n "${filter}" ]; then
+ for s in ${strings}; do
+ if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+ filtered_strings="${filtered_strings} ${s}"
+ fi
+ done
+ else
+ filtered_strings="${strings}"
+ fi
+ echo "${filtered_strings}"
+}
+
+# Runs user test functions passed via positional parameters one and two.
+# Functions in positional parameter one are treated as environment verification
+# functions and are run unconditionally. Functions in positional parameter two
+# are run according to the rules specified in aom_test_usage().
+# The test name reported at the end is $AOM_TEST_NAME, or the script's base
+# name without its extension when that variable is empty.
+run_tests() {
+  local env_tests="verify_aom_test_environment $1"
+  local tests_to_filter="$2"
+  local test_name="${AOM_TEST_NAME}"
+
+  if [ -z "${test_name}" ]; then
+    test_name="$(basename "${0%.*}")"
+  fi
+
+  if [ "${AOM_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then
+    # Filter out DISABLED tests.
+    tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude)
+  fi
+
+  if [ -n "${AOM_TEST_FILTER}" ]; then
+    # Remove tests not matching the user's filter. The filter is quoted so the
+    # shell neither glob-expands it nor splits it into several arguments (a
+    # stray second word would be taken as the exclude flag).
+    tests_to_filter=$(filter_strings "${tests_to_filter}" "${AOM_TEST_FILTER}")
+  fi
+
+  # User requested test listing: Dump test names and return.
+  if [ "${AOM_TEST_LIST_TESTS}" = "yes" ]; then
+    for test_name in $tests_to_filter; do
+      echo ${test_name}
+    done
+    return
+  fi
+
+  # Don't bother with the environment tests if everything else was disabled.
+  [ -z "${tests_to_filter}" ] && return
+
+  # Combine environment and actual tests.
+  local tests_to_run="${env_tests} ${tests_to_filter}"
+
+  # av1_c_vs_simd_encode is a standalone test, and it doesn't need to check the
+  # version string.
+  if [ "${test_name}" != "av1_c_vs_simd_encode" ]; then
+    check_version_strings
+  fi
+
+  # Run tests. Each test is bracketed by test_begin/test_end so that the
+  # currently active test is recorded in AOM_TOOL_TEST while it runs.
+  for test in ${tests_to_run}; do
+    test_begin "${test}"
+    vlog " RUN ${test}"
+    "${test}"
+    vlog " PASS ${test}"
+    test_end "${test}"
+  done
+
+  local tested_config="$(test_configuration_target) @ $(source_version)"
+  echo "${test_name}: Done, all tests pass for ${tested_config}."
+}
+
+# Prints the command line help for scripts that source this file.
+aom_test_usage() {
+cat << EOF
+ Usage: ${0##*/} [arguments]
+ --bin-path <path to libaom binaries directory>
+ --config-path <path to libaom config directory>
+ --filter <filter>: User test filter. Only tests matching filter are run.
+ --run-disabled-tests: Run disabled tests.
+ --help: Display this message and exit.
+ --test-data-path <path to libaom test data directory>
+ --show-program-output: Shows output from all programs being tested.
+ --prefix: Allows for a user specified prefix to be inserted before all test
+ programs. Grants the ability, for example, to run test programs
+ within valgrind.
+ --list-tests: List all test names and exit without actually running tests.
+ --verbose: Verbose output.
+
+ When the --bin-path option is not specified the script attempts to use
+ \$LIBAOM_BIN_PATH and then the current directory.
+
+ When the --config-path option is not specified the script attempts to use
+ \$LIBAOM_CONFIG_PATH and then the current directory.
+
+ When the --test-data-path option is not specified the script attempts to use
+ \$LIBAOM_TEST_DATA_PATH and then the current directory.
+EOF
+}
+
+# Returns non-zero (failure) when required environment variables are empty
+# strings.
+aom_test_check_environment() {
+ if [ -z "${LIBAOM_BIN_PATH}" ] || \
+ [ -z "${LIBAOM_CONFIG_PATH}" ] || \
+ [ -z "${LIBAOM_TEST_DATA_PATH}" ]; then
+ return 1
+ fi
+}
+
+# Echo aomenc command line parameters allowing use of a raw yuv file as
+# input to aomenc: the input path followed by --width and --height, each on
+# its own line. The variable expansions sit outside the double-quoted
+# segments, so they are subject to word splitting. encode_yuv_raw_input_av1()
+# expands the output unquoted so that it becomes three separate arguments.
+yuv_raw_input() {
+ echo ""${YUV_RAW_INPUT}"
+ --width="${YUV_RAW_INPUT_WIDTH}"
+ --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Do a small encode for testing decoders.
+# $1 is the output file; any remaining parameters are appended to the aomenc
+# command line. Returns 1 when the output file does not exist afterwards.
+# When av1_encode_available() does not echo yes, nothing is run and no failure
+# is reported.
+encode_yuv_raw_input_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ local output="$1"
+ local encoder="$(aom_tool_path aomenc)"
+ shift
+ # eval is needed so that ${devnull}, when it holds a redirection string, is
+ # interpreted as a redirection rather than passed to aomenc as arguments.
+ # NOTE(review): ${devnull} is assigned outside this section -- confirm.
+ eval "${encoder}" $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --output="${output}" \
+ $@ \
+ ${devnull}
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+# Parse the command line.
+while [ -n "$1" ]; do
+ case "$1" in
+ --bin-path)
+ LIBAOM_BIN_PATH="$2"
+ shift
+ ;;
+ --config-path)
+ LIBAOM_CONFIG_PATH="$2"
+ shift
+ ;;
+ --filter)
+ AOM_TEST_FILTER="$2"
+ shift
+ ;;
+ --run-disabled-tests)
+ AOM_TEST_RUN_DISABLED_TESTS=yes
+ ;;
+ --help)
+ aom_test_usage
+ exit
+ ;;
+ --test-data-path)
+ LIBAOM_TEST_DATA_PATH="$2"
+ shift
+ ;;
+ --prefix)
+ AOM_TEST_PREFIX="$2"
+ shift
+ ;;
+ --verbose)
+ AOM_TEST_VERBOSE_OUTPUT=yes
+ ;;
+ --show-program-output)
+ devnull=
+ ;;
+ --list-tests)
+ AOM_TEST_LIST_TESTS=yes
+ ;;
+ *)
+ aom_test_usage
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+# Handle running the tests from a build directory without arguments when running
+# the tests on *nix/macosx.
+LIBAOM_BIN_PATH="${LIBAOM_BIN_PATH:-.}"
+LIBAOM_CONFIG_PATH="${LIBAOM_CONFIG_PATH:-.}"
+LIBAOM_TEST_DATA_PATH="${LIBAOM_TEST_DATA_PATH:-.}"
+
+# Create a temporary directory for output files, and a trap to clean it up.
+if [ -n "${TMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+ AOM_TEST_TEMP_ROOT=/tmp
+fi
+
+# Output directory for files produced by the tests. A caller supplied
+# AOM_TEST_OUTPUT_DIR wins; otherwise a per-process directory (suffix $$) is
+# used under the temporary root.
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_OUTPUT_DIR:-${AOM_TEST_TEMP_ROOT}/aom_test_$$}"
+
+if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
+ [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ echo "${0##*/}: Cannot create output directory, giving up."
+ echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}"
+ exit 1
+fi
+
+# Defaults to no unless the caller already set it.
+AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no}
+
+# This checking requires config/aom_config.c that is available in Jenkins
+# testing.
+# NOTE(review): is_windows_target() is defined outside this section; the
+# dependency on aom_config.c presumably lives there -- confirm.
+if [ "$(is_windows_target)" = "yes" ]; then
+ AOM_TEST_EXE_SUFFIX=".exe"
+fi
+
+# Variables shared by tests. Each one keeps a value supplied by the caller's
+# environment and otherwise falls back to the default shown.
+AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED:-1}
+AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT:-5}
+AV1_IVF_FILE="${AV1_IVF_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.ivf}"
+AV1_OBU_ANNEXB_FILE="${AV1_OBU_ANNEXB_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.annexb.obu}"
+AV1_OBU_SEC5_FILE="${AV1_OBU_SEC5_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.section5.obu}"
+AV1_WEBM_FILE="${AV1_WEBM_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.webm}"
+
+# Raw YUV input and its dimensions; these are always overwritten and cannot be
+# overridden from the environment.
+YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_RAW_INPUT_WIDTH=352
+YUV_RAW_INPUT_HEIGHT=288
+
+Y4M_NOSQ_PAR_INPUT="${LIBAOM_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
+# Setup a trap function to clean up after tests complete.
+trap cleanup EXIT
+
+# Dump the effective configuration through vlog. Y4M_720P_INPUT is not part of
+# the dump, and nothing in this section assigns AOM_TEST_SHOW_PROGRAM_OUTPUT.
+vlog "$(basename "${0%.*}") test configuration:
+ LIBAOM_BIN_PATH=${LIBAOM_BIN_PATH}
+ LIBAOM_CONFIG_PATH=${LIBAOM_CONFIG_PATH}
+ LIBAOM_TEST_DATA_PATH=${LIBAOM_TEST_DATA_PATH}
+ AOM_TEST_EXE_SUFFIX=${AOM_TEST_EXE_SUFFIX}
+ AOM_TEST_FILTER=${AOM_TEST_FILTER}
+ AOM_TEST_LIST_TESTS=${AOM_TEST_LIST_TESTS}
+ AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}
+ AOM_TEST_PREFIX=${AOM_TEST_PREFIX}
+ AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT}
+ AOM_TEST_RUN_DISABLED_TESTS=${AOM_TEST_RUN_DISABLED_TESTS}
+ AOM_TEST_SHOW_PROGRAM_OUTPUT=${AOM_TEST_SHOW_PROGRAM_OUTPUT}
+ AOM_TEST_TEMP_ROOT=${AOM_TEST_TEMP_ROOT}
+ AOM_TEST_VERBOSE_OUTPUT=${AOM_TEST_VERBOSE_OUTPUT}
+ AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED}
+ AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT}
+ AV1_IVF_FILE=${AV1_IVF_FILE}
+ AV1_OBU_ANNEXB_FILE=${AV1_OBU_ANNEXB_FILE}
+ AV1_OBU_SEC5_FILE=${AV1_OBU_SEC5_FILE}
+ AV1_WEBM_FILE=${AV1_WEBM_FILE}
+ YUV_RAW_INPUT=${YUV_RAW_INPUT}
+ YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
+ YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
+ Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}"
+
+fi # End $AOM_TEST_TOOLS_COMMON_SH pseudo include guard.
diff --git a/third_party/aom/test/tpl_model_test.cc b/third_party/aom/test/tpl_model_test.cc
new file mode 100644
index 0000000000..91eb5e94d3
--- /dev/null
+++ b/third_party/aom/test/tpl_model_test.cc
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+#if CONFIG_BITRATE_ACCURACY
+// Absolute tolerance passed to EXPECT_NEAR by the bitrate accuracy tests.
+constexpr double epsilon = 0.0000001;
+#endif
+
+// Reference probability of the quantized coefficient value |qcoeff| that the
+// entropy tests compare against av1_estimate_coeff_entropy().
+double laplace_prob(double q_step, double b, double zero_bin_ratio,
+                    int qcoeff) {
+  const int magnitude = abs(qcoeff);
+  // Total probability of all non-zero values, floored at TPL_EPSILON.
+  const double nonzero_mass =
+      fmax(exp(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  if (magnitude == 0) return 1 - nonzero_mass;
+
+  assert(magnitude > 0);
+  // Ratio between the probabilities of two consecutive magnitudes.
+  const double decay = fmax(exp(-q_step / b), TPL_EPSILON);
+  return nonzero_mass / 2 * (1 - decay) * pow(decay, magnitude - 1);
+}
+// Boundary case: with b == 0 the exponential entropy is expected to be
+// (nearly) zero.
+TEST(TplModelTest, ExponentialEntropyBoundaryTest1) {
+  const double kQStep = 1;
+  const double kB = 0;
+  EXPECT_NEAR(av1_exponential_entropy(kQStep, kB), 0, 0.00001);
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest1) {
+  // For every quantized value in [-256, 255] the rate reported by
+  // av1_estimate_coeff_entropy() must equal -log2(laplace_prob()).
+  const double kB = 1;
+  const double kQStep = 1;
+  const double kZeroBinRatio = 2;
+  for (int level = -256; level < 256; ++level) {
+    const double rate =
+        av1_estimate_coeff_entropy(kQStep, kB, kZeroBinRatio, level);
+    const double ref_rate =
+        -log2(laplace_prob(kQStep, kB, kZeroBinRatio, level));
+    EXPECT_DOUBLE_EQ(rate, ref_rate);
+  }
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest2) {
+  // The probability-weighted sum of per-coefficient rates over [-20, 19]
+  // should approximate the closed-form av1_laplace_entropy().
+  const double kB = 1;
+  const double kQStep = 1;
+  const double kZeroBinRatio = 2;
+  double weighted_rate_sum = 0;
+  for (int level = -20; level < 20; ++level) {
+    const double rate =
+        av1_estimate_coeff_entropy(kQStep, kB, kZeroBinRatio, level);
+    const double prob = laplace_prob(kQStep, kB, kZeroBinRatio, level);
+    weighted_rate_sum += prob * rate;
+  }
+  const double closed_form = av1_laplace_entropy(kQStep, kB, kZeroBinRatio);
+  EXPECT_NEAR(closed_form, weighted_rate_sum, 0.001);
+}
+
+// Checks that av1_init_tpl_stats() clears the ready flag and the is_valid
+// flag of every entry in tpl_stats_buffer.
+TEST(TplModelTest, InitTplStats1) {
+ // We use heap allocation instead of stack allocation here to avoid
+ // -Wstack-usage warning.
+ std::unique_ptr<TplParams> tpl_data(new (std::nothrow) TplParams);
+ ASSERT_NE(tpl_data, nullptr);
+ av1_zero(*tpl_data);
+ tpl_data->ready = 1;
+ // Guards the loops below: the buffer must hold exactly
+ // MAX_LENGTH_TPL_FRAME_STATS entries.
+ EXPECT_EQ(sizeof(tpl_data->tpl_stats_buffer),
+ MAX_LENGTH_TPL_FRAME_STATS * sizeof(tpl_data->tpl_stats_buffer[0]));
+ for (int i = 0; i < MAX_LENGTH_TPL_FRAME_STATS; ++i) {
+ // Set it to a non-zero number (i + 1) so that a reset is observable.
+ tpl_data->tpl_stats_buffer[i].is_valid = i + 1;
+ }
+ av1_init_tpl_stats(tpl_data.get());
+ EXPECT_EQ(tpl_data->ready, 0);
+ for (int i = 0; i < MAX_LENGTH_TPL_FRAME_STATS; ++i) {
+ EXPECT_EQ(tpl_data->tpl_stats_buffer[i].is_valid, 0);
+ }
+}
+
+TEST(TplModelTest, DeltaRateCostZeroFlow) {
+  // Equal source and reconstruction distortion must give a zero cost from
+  // av1_delta_rate_cost().
+  const int64_t kDist = 256;  // used for both srcrf_dist and recrf_dist
+  const int64_t kDeltaRate = 512;
+  const int kPixelNum = 256;
+  const int64_t rate_cost = av1_delta_rate_cost(kDeltaRate, /*recrf_dist=*/kDist,
+                                                /*srcrf_dist=*/kDist, kPixelNum);
+  EXPECT_EQ(rate_cost, 0);
+}
+
+// Floating-point reference for av1_delta_rate_cost(), with |delta_rate|
+// expressed in bits. |src_rec_ratio| must lie in [0, 1].
+double ref_delta_rate_cost(int64_t delta_rate, double src_rec_ratio,
+                           int pixel_count) {
+  assert(src_rec_ratio <= 1 && src_rec_ratio >= 0);
+  const double bits_per_pixel = (double)delta_rate / pixel_count;
+  const double p = pow(2, bits_per_pixel);
+  // Denominator kept in the original evaluation order so rounding matches.
+  const double denominator = src_rec_ratio * p * p + (1 - src_rec_ratio);
+  const double flow_rate_per_pixel = sqrt(p * p / denominator);
+  return pixel_count * log2(flow_rate_per_pixel);
+}
+
+// Compares av1_delta_rate_cost() with ref_delta_rate_cost() on a few
+// distortion/rate triples, allowing a difference of one bit.
+TEST(TplModelTest, DeltaRateCostReference) {
+  struct DistRateCase {
+    int64_t srcrf_dist;
+    int64_t recrf_dist;
+    int64_t delta_rate;
+  };
+  const DistRateCase kCases[] = { { 256, 512, 10 },
+                                  { 257, 288, 278 },
+                                  { 312, 620, 100 } };
+  const int64_t scale = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT;
+  const int kPixelCount = 256;
+  for (const DistRateCase &c : kCases) {
+    // The function under test works on scaled rates; scale the input up and
+    // the result back down before comparing.
+    const int64_t scaled_delta_rate = c.delta_rate << scale;
+    const int64_t rate_cost =
+        av1_delta_rate_cost(scaled_delta_rate, c.recrf_dist, c.srcrf_dist,
+                            kPixelCount) >>
+        scale;
+    const double src_rec_ratio = (double)c.srcrf_dist / c.recrf_dist;
+    const double ref_rate_cost =
+        ref_delta_rate_cost(c.delta_rate, src_rec_ratio, kPixelCount);
+    EXPECT_NEAR((double)rate_cost, ref_rate_cost, 1);
+  }
+}
+
+TEST(TplModelTest, GetOverlapAreaHasOverlap) {
+  // Block a covers rows [10, 17) and columns [18, 24).
+  // Block b covers rows [8, 15) and columns [17, 23).
+  // They share rows [10, 15) and columns [18, 23), i.e.
+  // (15 - 10) * (23 - 18) = 25 samples.
+  const int kRowA = 10, kColA = 18;
+  const int kRowB = 8, kColB = 17;
+  const int kWidth = 6, kHeight = 7;
+  EXPECT_EQ(
+      av1_get_overlap_area(kRowA, kColA, kRowB, kColB, kWidth, kHeight), 25);
+}
+
+TEST(TplModelTest, GetOverlapAreaNoOverlap) {
+ // The block a's area is [10, 14) x [18, 22).
+ // The block b's area is [5, 9) x [5, 9).
+ // There is no overlapping area between block a and block b.
+ // Therefore, the return value should be zero.
+ int row_a = 10;
+ int col_a = 18;
+ int row_b = 5;
+ int col_b = 5;
+ int height = 4;
+ int width = 4;
+ int overlap_area =
+ av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height);
+ EXPECT_EQ(overlap_area, 0);
+}
+
+TEST(TplModelTest, GetQIndexFromQstepRatio) {
+ const aom_bit_depth_t bit_depth = AOM_BITS_8;
+ // When qstep_ratio is 1, the output q_index should be equal to leaf_qindex.
+ double qstep_ratio = 1.0;
+ for (int leaf_qindex = 1; leaf_qindex <= 255; ++leaf_qindex) {
+ const int q_index =
+ av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+ EXPECT_EQ(q_index, leaf_qindex);
+ }
+
+ // When qstep_ratio is very low, the output q_index should be 0.
+ qstep_ratio = 0.0001;
+ for (int leaf_qindex = 1; leaf_qindex <= 255; ++leaf_qindex) {
+ const int q_index =
+ av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+ EXPECT_EQ(q_index, 0);
+ }
+}
+
+// av1_init_tpl_txfm_stats() must report 256 coefficients, no recorded blocks
+// and all-zero coefficient sums.
+TEST(TplModelTest, TxfmStatsInitTest) {
+  TplTxfmStats stats;
+  av1_init_tpl_txfm_stats(&stats);
+  EXPECT_EQ(stats.coeff_num, 256);
+  EXPECT_EQ(stats.txfm_block_count, 0);
+  int coeff_idx = 0;
+  while (coeff_idx < stats.coeff_num) {
+    EXPECT_DOUBLE_EQ(stats.abs_coeff_sum[coeff_idx], 0);
+    ++coeff_idx;
+  }
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// av1_accumulate_tpl_txfm_stats() must add the block count and every
+// per-coefficient sum of |sub_stats| into |accumulated_stats|.
+TEST(TplModelTest, TxfmStatsAccumulateTest) {
+  TplTxfmStats sub_stats;
+  av1_init_tpl_txfm_stats(&sub_stats);
+  sub_stats.txfm_block_count = 17;
+  for (int i = 0; i < sub_stats.coeff_num; ++i) {
+    sub_stats.abs_coeff_sum[i] = i;
+  }
+
+  TplTxfmStats accumulated_stats;
+  av1_init_tpl_txfm_stats(&accumulated_stats);
+  accumulated_stats.txfm_block_count = 13;
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    accumulated_stats.abs_coeff_sum[i] = 5 * i;
+  }
+
+  av1_accumulate_tpl_txfm_stats(&sub_stats, &accumulated_stats);
+  // The block count is a counter (13 + 17); compare it exactly, as the other
+  // tests in this file do, rather than with a floating-point matcher.
+  EXPECT_EQ(accumulated_stats.txfm_block_count, 30);
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    // i + 5 * i
+    EXPECT_DOUBLE_EQ(accumulated_stats.abs_coeff_sum[i], 6 * i);
+  }
+}
+
+// Recording the same transform block twice must double every accumulated
+// coefficient sum compared with recording it once.
+TEST(TplModelTest, TxfmStatsRecordTest) {
+  TplTxfmStats recorded_once;
+  TplTxfmStats recorded_twice;
+  av1_init_tpl_txfm_stats(&recorded_once);
+  av1_init_tpl_txfm_stats(&recorded_twice);
+
+  // coeff[k] == k for all 256 entries.
+  tran_low_t coeff[256];
+  for (int k = 0; k < 256; ++k) coeff[k] = k;
+
+  av1_record_tpl_txfm_block(&recorded_once, coeff);
+  EXPECT_EQ(recorded_once.txfm_block_count, 1);
+
+  for (int pass = 0; pass < 2; ++pass) {
+    av1_record_tpl_txfm_block(&recorded_twice, coeff);
+  }
+  EXPECT_EQ(recorded_twice.txfm_block_count, 2);
+
+  EXPECT_EQ(recorded_once.coeff_num, 256);
+  EXPECT_EQ(recorded_twice.coeff_num, 256);
+  for (int k = 0; k < 256; ++k) {
+    EXPECT_DOUBLE_EQ(recorded_twice.abs_coeff_sum[k],
+                     2 * recorded_once.abs_coeff_sum[k]);
+  }
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Fills a small TPL frame with known motion vectors and checks the vector
+// returned by av1_compute_mv_difference() for position (1, 1).
+TEST(TplModelTest, ComputeMVDifferenceTest) {
+ // NOTE(review): tpl_frame_small is not value-initialized; only the fields
+ // assigned below hold defined values.
+ TplDepFrame tpl_frame_small;
+ tpl_frame_small.is_valid = true;
+ tpl_frame_small.mi_rows = 4;
+ tpl_frame_small.mi_cols = 4;
+ tpl_frame_small.stride = 1;
+ uint8_t right_shift_small = 1;
+ int step_small = 1 << right_shift_small;
+
+ // Test values for motion vectors.
+ int mv_vals_small[4] = { 1, 2, 3, 4 };
+ int index = 0;
+
+ // 4x4 blocks means we need to allocate a 4 size array.
+ // According to av1_tpl_ptr_pos:
+ // (row >> right_shift) * stride + (col >> right_shift)
+ // (4 >> 1) * 1 + (4 >> 1) = 4
+ // NOTE(review): if av1_tpl_ptr_pos follows the formula above, a stride of 1
+ // maps both (0, 2) and (2, 0) to index 1, so the later write replaces the
+ // earlier one and index 3 is never written -- confirm this is intended.
+ TplDepStats stats_buf_small[4];
+ tpl_frame_small.tpl_stats_ptr = stats_buf_small;
+
+ for (int row = 0; row < tpl_frame_small.mi_rows; row += step_small) {
+ for (int col = 0; col < tpl_frame_small.mi_cols; col += step_small) {
+ // Only ref_frame_index[0] and mv[0] are set before the struct is copied
+ // into the buffer; the remaining members are left uninitialized.
+ TplDepStats tpl_stats;
+ tpl_stats.ref_frame_index[0] = 0;
+ int_mv mv;
+ mv.as_mv.row = mv_vals_small[index];
+ mv.as_mv.col = mv_vals_small[index];
+ index++;
+ tpl_stats.mv[0] = mv;
+ tpl_frame_small.tpl_stats_ptr[av1_tpl_ptr_pos(
+ row, col, tpl_frame_small.stride, right_shift_small)] = tpl_stats;
+ }
+ }
+
+ int_mv result_mv =
+ av1_compute_mv_difference(&tpl_frame_small, 1, 1, step_small,
+ tpl_frame_small.stride, right_shift_small);
+
+ // Expect the result to be exactly equal to 1 because this is the difference
+ // between neighboring motion vectors in this instance.
+ EXPECT_EQ(result_mv.as_mv.row, 1);
+ EXPECT_EQ(result_mv.as_mv.col, 1);
+}
+
+// Checks that av1_tpl_compute_frame_mv_entropy() reports a lower value for
+// smoothly increasing motion vectors than for an interleaved ordering of the
+// same values.
+TEST(TplModelTest, ComputeMVBitsTest) {
+ // NOTE(review): tpl_frame is not value-initialized; only the fields assigned
+ // below hold defined values.
+ TplDepFrame tpl_frame;
+ tpl_frame.is_valid = true;
+ tpl_frame.mi_rows = 16;
+ tpl_frame.mi_cols = 16;
+ tpl_frame.stride = 24;
+ uint8_t right_shift = 2;
+ int step = 1 << right_shift;
+ // Test values for motion vectors.
+ int mv_vals_ordered[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16 };
+ int mv_vals[16] = { 1, 16, 2, 15, 3, 14, 4, 13, 5, 12, 6, 11, 7, 10, 8, 9 };
+ int index = 0;
+
+ // 16x16 blocks means we need to allocate a 100 size array.
+ // According to av1_tpl_ptr_pos:
+ // (row >> right_shift) * stride + (col >> right_shift)
+ // (16 >> 2) * 24 + (16 >> 2) = 100
+ // The bound above uses the exclusive limits row = col = 16; the loops below
+ // stop at 12, so under that formula the largest index written is
+ // (12 >> 2) * 24 + (12 >> 2) = 75.
+ TplDepStats stats_buf[100];
+ tpl_frame.tpl_stats_ptr = stats_buf;
+
+ // First pass: ordered motion vectors.
+ for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+ TplDepStats tpl_stats;
+ tpl_stats.ref_frame_index[0] = 0;
+ int_mv mv;
+ mv.as_mv.row = mv_vals_ordered[index];
+ mv.as_mv.col = mv_vals_ordered[index];
+ index++;
+ tpl_stats.mv[0] = mv;
+ tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+ right_shift)] = tpl_stats;
+ }
+ }
+
+ double result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+ // Expect the result to be low because the motion vectors are ordered.
+ // The estimation algorithm takes this into account and reduces the cost.
+ EXPECT_NEAR(result, 20, 5);
+
+ // Second pass: overwrite the same positions with the interleaved values.
+ index = 0;
+ for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+ TplDepStats tpl_stats;
+ tpl_stats.ref_frame_index[0] = 0;
+ int_mv mv;
+ mv.as_mv.row = mv_vals[index];
+ mv.as_mv.col = mv_vals[index];
+ index++;
+ tpl_stats.mv[0] = mv;
+ tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+ right_shift)] = tpl_stats;
+ }
+ }
+
+ result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+ // Expect the result to be higher because the vectors are not ordered.
+ // Neighboring vectors will have different values, increasing the cost.
+ EXPECT_NEAR(result, 70, 5);
+}
+#if CONFIG_BITRATE_ACCURACY
+
+// A GOP that shows 4 of the 8 frames is expected to receive half of the
+// 2000-bit total budget.
+TEST(TplModelTest, VbrRcInfoSetGopBitBudget) {
+  const double kTotalBitBudget = 2000;
+  const int kShowFrameCount = 8;
+  const int kGopShowFrameCount = 4;
+  VBR_RATECTRL_INFO rc_info;
+  av1_vbr_rc_init(&rc_info, kTotalBitBudget, kShowFrameCount);
+  av1_vbr_rc_set_gop_bit_budget(&rc_info, kGopShowFrameCount);
+  EXPECT_NEAR(rc_info.gop_bit_budget, 1000, epsilon);
+}
+
+// Zeroes |gf_group| and fills it with a four-frame toy GOP:
+// KF, ARF, internal ARF, LF.
+void init_toy_gf_group(GF_GROUP *gf_group) {
+  static const FRAME_UPDATE_TYPE kToyUpdateTypes[4] = {
+    KF_UPDATE, ARF_UPDATE, INTNL_ARF_UPDATE, LF_UPDATE
+  };
+  av1_zero(*gf_group);
+  gf_group->size = 4;
+  int frame_idx = 0;
+  while (frame_idx < gf_group->size) {
+    gf_group->update_type[frame_idx] = kToyUpdateTypes[frame_idx];
+    ++frame_idx;
+  }
+}
+
+// Initializes |vbr_rc_info| with a 2000-bit budget over 8 shown frames and
+// sets the qstep ratio of the first |gop_size| frames to 1.
+void init_toy_vbr_rc_info(VBR_RATECTRL_INFO *vbr_rc_info, int gop_size) {
+  const int kTotalBitBudget = 2000;
+  const int kShowFrameCount = 8;
+  av1_vbr_rc_init(vbr_rc_info, kTotalBitBudget, kShowFrameCount);
+
+  int frame_idx = 0;
+  while (frame_idx < gop_size) {
+    vbr_rc_info->qstep_ratio_list[frame_idx] = 1;
+    ++frame_idx;
+  }
+}
+
+// Gives every entry of |stats_list| the same toy statistics: 8 transform
+// blocks and abs_coeff_sum[j] == 1000 + j, then refreshes the coefficient
+// means.
+void init_toy_tpl_txfm_stats(std::vector<TplTxfmStats> *stats_list) {
+  for (TplTxfmStats &entry : *stats_list) {
+    av1_init_tpl_txfm_stats(&entry);
+    entry.txfm_block_count = 8;
+    for (int coeff_idx = 0; coeff_idx < entry.coeff_num; ++coeff_idx) {
+      entry.abs_coeff_sum[coeff_idx] = 1000 + coeff_idx;
+    }
+    av1_tpl_txfm_stats_update_abs_coeff_mean(&entry);
+  }
+}
+
+/*
+ * Helper method to brute-force search for the closest q_index
+ * that achieves the specified bit budget.
+ *
+ * Every q from 255 down to 0 is evaluated with
+ * av1_vbr_rc_info_estimate_gop_bitrate() and the q whose estimate is closest
+ * to bit_budget is returned. Ties are resolved in favor of the lower q
+ * because the comparison is "<=" and q is visited in decreasing order.
+ * q_index_list and estimated_bitrate_byframe are forwarded to every call, so
+ * whatever the estimator writes to them reflects the last call (q = 0), not
+ * the returned q.
+ */
+int find_gop_q_iterative(double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors,
+ int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list,
+ const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe) {
+ int best_q = 255;
+ double curr_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ best_q, bit_depth, update_type_scale_factors, frame_count,
+ update_type_list, qstep_ratio_list, stats_list, q_index_list,
+ estimated_bitrate_byframe);
+ double min_bits_diff = fabs(curr_estimate - bit_budget);
+ // Start at q = 254 because we already have an estimate for q = 255.
+ for (int q = 254; q >= 0; q--) {
+ curr_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+ qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+ double bits_diff = fabs(curr_estimate - bit_budget);
+ if (bits_diff <= min_bits_diff) {
+ min_bits_diff = bits_diff;
+ best_q = q;
+ }
+ }
+ return best_q;
+}
+
+// Checks how av1_vbr_rc_info_estimate_gop_bitrate() combines the per-frame
+// estimates it writes to |est_bitrate_list| under three scale-factor
+// settings.
+TEST(TplModelTest, EstimateFrameRateTest) {
+  GF_GROUP gf_group;
+  init_toy_gf_group(&gf_group);
+
+  VBR_RATECTRL_INFO vbr_rc_info;
+  init_toy_vbr_rc_info(&vbr_rc_info, gf_group.size);
+
+  // One initialization of the toy statistics is enough; the helper rewrites
+  // every entry from scratch.
+  std::vector<TplTxfmStats> stats_list(gf_group.size);
+  init_toy_tpl_txfm_stats(&stats_list);
+
+  std::vector<double> est_bitrate_list(gf_group.size);
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+
+  const int q = 125;
+
+  // Case1: all scale factors are 0
+  double scale_factors[FRAME_UPDATE_TYPES] = { 0 };
+  double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  EXPECT_NEAR(estimate, 0, epsilon);
+
+  // Case2: all scale factors are 1
+  for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+    scale_factors[i] = 1;
+  }
+  estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  // The GOP estimate must equal the sum of the per-frame estimates.
+  double ref_estimate = 0;
+  for (int i = 0; i < gf_group.size; i++) {
+    ref_estimate += est_bitrate_list[i];
+  }
+  EXPECT_NEAR(estimate, ref_estimate, epsilon);
+
+  // Case3: Key frame scale factor is 0 and others are 1
+  for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+    if (i == KF_UPDATE) {
+      scale_factors[i] = 0;
+    } else {
+      scale_factors[i] = 1;
+    }
+  }
+  estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+      q, bit_depth, scale_factors, gf_group.size, gf_group.update_type,
+      vbr_rc_info.qstep_ratio_list, stats_list.data(), vbr_rc_info.q_index_list,
+      est_bitrate_list.data());
+  // Key frames are excluded from the reference sum in this case.
+  ref_estimate = 0;
+  for (int i = 0; i < gf_group.size; i++) {
+    if (gf_group.update_type[i] != KF_UPDATE) {
+      ref_estimate += est_bitrate_list[i];
+    }
+  }
+  EXPECT_NEAR(estimate, ref_estimate, epsilon);
+}
+
+// Compares av1_vbr_rc_info_estimate_base_q() with the brute-force
+// find_gop_q_iterative() over several bit budgets, including 0 and DBL_MAX.
+TEST(TplModelTest, VbrRcInfoEstimateBaseQTest) {
+ GF_GROUP gf_group;
+ init_toy_gf_group(&gf_group);
+
+ VBR_RATECTRL_INFO vbr_rc_info;
+ init_toy_vbr_rc_info(&vbr_rc_info, gf_group.size);
+
+ std::vector<TplTxfmStats> stats_list(gf_group.size);
+ init_toy_tpl_txfm_stats(&stats_list);
+ const aom_bit_depth_t bit_depth = AOM_BITS_8;
+
+ // Test multiple bit budgets.
+ const std::vector<double> bit_budgets = { 0, 2470, 19200, 30750,
+ 41315, 65017, DBL_MAX };
+
+ for (double bit_budget : bit_budgets) {
+ // Binary search method to find the optimal q.
+ // Both calls receive the same vbr_rc_info.q_index_list and pass nullptr as
+ // the per-frame bitrate output.
+ const int base_q = av1_vbr_rc_info_estimate_base_q(
+ bit_budget, bit_depth, vbr_rc_info.scale_factors, gf_group.size,
+ gf_group.update_type, vbr_rc_info.qstep_ratio_list, stats_list.data(),
+ vbr_rc_info.q_index_list, nullptr);
+ const int ref_base_q = find_gop_q_iterative(
+ bit_budget, bit_depth, vbr_rc_info.scale_factors, gf_group.size,
+ gf_group.update_type, vbr_rc_info.qstep_ratio_list, stats_list.data(),
+ vbr_rc_info.q_index_list, nullptr);
+ // A zero budget must select the largest q and an unbounded budget the
+ // smallest one.
+ if (bit_budget == 0) {
+ EXPECT_EQ(base_q, 255);
+ } else if (bit_budget == DBL_MAX) {
+ EXPECT_EQ(base_q, 0);
+ }
+ EXPECT_EQ(base_q, ref_base_q);
+ }
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+} // namespace
diff --git a/third_party/aom/test/transform_test_base.h b/third_party/aom/test/transform_test_base.h
new file mode 100644
index 0000000000..55e78fef48
--- /dev/null
+++ b/third_party/aom/test/transform_test_base.h
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_
+#define AOM_TEST_TRANSFORM_TEST_BASE_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "test/acm_random.h"
+
+namespace libaom_test {
+
+// Note:
+// The same constant is defined in av1/common/av1_entropy.h and
+// av1/common/entropy.h. The goal is for this base class to be usable
+// for future codec transform testing, but including
+// either of those headers would lead to a compile error when unit
+// testing another codec. Suggest moving the definition
+// to an aom header file.
+const int kDctMaxValue = 16384;
+
+// Signature of a forward transform: int16_t input with the given stride,
+// output of type OutputType, configured through |txfm_param|.
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+ TxfmParam *txfm_param);
+
+// Signature of an inverse transform: tran_low_t coefficients in, uint8_t
+// destination with the given stride. The OutputType template parameter is not
+// used in the signature.
+template <typename OutputType>
+using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+ const TxfmParam *txfm_param);
+
+template <typename OutType>
+class TransformTestBase {
+ public:
+ virtual ~TransformTestBase() = default;
+
+ protected:
+ // Forward transform under test, supplied by the derived class. The checks in
+ // this class pass an int16_t input block, an OutType output block and the
+ // input stride.
+ virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0;
+
+ // Inverse transform under test, supplied by the derived class. The checks in
+ // this class pass an OutType coefficient block, a destination buffer and the
+ // destination stride.
+ virtual void RunInvTxfm(const OutType *out, uint8_t *dst, int stride) = 0;
+
+ // Round-trips 10000 random residual blocks through RunFwdTxfm() and
+ // RunInvTxfm() and checks the squared reconstruction error.
+ // ref_max_error: upper bound for the squared error of any single sample.
+ // ref_avg_error: upper bound for the mean squared error, computed over all
+ // samples of all blocks.
+ // NOTE(review): each ASSERT_NE below returns from this function on failure
+ // without freeing the buffers allocated before it.
+ void RunAccuracyCheck(uint32_t ref_max_error, double ref_avg_error) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+
+ int16_t *test_input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ ASSERT_NE(test_input_block, nullptr);
+ OutType *test_temp_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(test_temp_block[0]) * num_coeffs_));
+ ASSERT_NE(test_temp_block, nullptr);
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ ASSERT_NE(dst, nullptr);
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ ASSERT_NE(src, nullptr);
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ ASSERT_NE(dst16, nullptr);
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ ASSERT_NE(src16, nullptr);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block as the difference of two random sample blocks:
+ // the input range is [-255, 255] for 8-bit and [-mask_, mask_] otherwise.
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+ }
+ }
+
+ API_REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+ // The inverse transform receives the dst/dst16 samples generated above as
+ // its destination buffer; the result is then compared with src/src16.
+ if (bit_depth_ == AOM_BITS_8) {
+ API_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+ } else {
+ API_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+ }
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+ const uint32_t error = diff * diff;
+ if (max_error < error) max_error = error;
+ total_error += error;
+ }
+ }
+
+ // Mean squared error per sample (divided by both the block count and the
+ // number of coefficients per block).
+ double avg_error = total_error * 1. / count_test_block / num_coeffs_;
+
+ EXPECT_GE(ref_max_error, max_error)
+ << "Error: FHT/IHT has an individual round trip error > "
+ << ref_max_error;
+
+ EXPECT_GE(ref_avg_error, avg_error)
+ << "Error: FHT/IHT has average round trip error > " << ref_avg_error
+ << " per block";
+
+ aom_free(test_input_block);
+ aom_free(test_temp_block);
+ aom_free(dst);
+ aom_free(src);
+ aom_free(dst16);
+ aom_free(src16);
+ }
+
+ // Runs 5000 random blocks through both fwd_txfm_ref and RunFwdTxfm() and
+ // requires the two coefficient blocks to match exactly. The input uses a
+ // stride of 96 while the output is indexed with pitch_.
+ // NOTE(review): each ASSERT_* below returns from this function on failure
+ // without freeing the allocated buffers.
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * stride * height_));
+ ASSERT_NE(input_block, nullptr);
+ OutType *output_ref_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+ ASSERT_NE(output_ref_block, nullptr);
+ OutType *output_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+ ASSERT_NE(output_block, nullptr);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ int j, k;
+ // Fill the input with differences of two masked random values and give
+ // both output blocks the same random initial contents.
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * stride + k;
+ int out_idx = j * pitch_ + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ if (bit_depth_ == AOM_BITS_8) {
+ output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+ } else {
+ output_block[out_idx] = output_ref_block[out_idx] =
+ rnd.Rand16() & mask_;
+ }
+ }
+ }
+
+ fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_);
+ API_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
+
+ // The minimum quant value is 4.
+ // NOTE(review): the comparison below is exact; no quantizer value is
+ // applied here -- confirm the comment above is still relevant.
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * pitch_ + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ // Forward-transforms 5000 random blocks with fwd_txfm_ref, then feeds the
+ // coefficients to both inv_txfm_ref and RunInvTxfm() and requires the two
+ // destination blocks to match exactly. The destinations use a stride of 96.
+ // NOTE(review): each ASSERT_* below returns from this function on failure
+ // without freeing the allocated buffers.
+ void RunInvCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ ASSERT_NE(input_block, nullptr);
+ OutType *trans_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(trans_block[0]) * num_coeffs_));
+ ASSERT_NE(trans_block, nullptr);
+ uint8_t *output_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+ ASSERT_NE(output_block, nullptr);
+ uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+ ASSERT_NE(output_ref_block, nullptr);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ int j, k;
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * pitch_ + k;
+ int out_idx = j * stride + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ // The destination buffers are uint8_t, so the masked random value is
+ // narrowed to 8 bits when stored.
+ output_ref_block[out_idx] = rnd.Rand16() & mask_;
+ output_block[out_idx] = output_ref_block[out_idx];
+ }
+ }
+
+ fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_);
+
+ inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_);
+ API_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
+
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * stride + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " j = " << j << " k = " << k << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(trans_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ // Feeds the forward transform blocks made only of the extreme values
+ // +mask_ and -mask_ and checks that RunFwdTxfm() matches fwd_txfm_ref
+ // bit-exactly and that no output coefficient exceeds
+ // row_length * kDctMaxValue << (bit_depth_ - 8) in magnitude.
+ void RunMemCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ int16_t *input_extreme_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ ASSERT_NE(input_extreme_block, nullptr);
+ OutType *output_ref_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+ ASSERT_NE(output_ref_block, nullptr);
+ OutType *output_block = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+ ASSERT_NE(output_block, nullptr);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+ }
+ // The first two iterations override the random pattern with the
+ // all-maximum and all-minimum blocks. The random draws above still happen
+ // so the RNG sequence is the same for later iterations.
+ if (i == 0) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = mask_;
+ } else if (i == 1) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = -mask_;
+ }
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
+ API_REGISTER_STATE_CHECK(
+ RunFwdTxfm(input_extreme_block, output_block, pitch_));
+
+ int row_length = FindRowLength();
+ // Every coefficient must match the reference exactly and stay within the
+ // magnitude bound derived from the row length and bit depth.
+ for (int j = 0; j < num_coeffs_; ++j) {
+ ASSERT_EQ(output_block[j], output_ref_block[j])
+ << "Not bit-exact at test index: " << i << ", "
+ << "j = " << j << std::endl;
+ EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
+ abs(output_block[j]))
+ << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";
+ }
+ }
+ aom_free(input_extreme_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ // Round-trip accuracy check for the inverse transform.
+ // A residual in = src - dst is forward transformed with fwd_txfm_ref, then
+ // RunInvTxfm() is applied with dst as its destination. Afterwards the
+ // squared difference between each dst and src sample must not exceed
+ // |limit|.
+ // 8-bit runs use the src/dst buffers; other bit depths use src16/dst16.
+ void RunInvAccuracyCheck(int limit) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+
+ int16_t *in = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ ASSERT_NE(in, nullptr);
+ OutType *coeff = reinterpret_cast<OutType *>(
+ aom_memalign(16, sizeof(coeff[0]) * num_coeffs_));
+ ASSERT_NE(coeff, nullptr);
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ ASSERT_NE(dst, nullptr);
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ ASSERT_NE(src, nullptr);
+
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ ASSERT_NE(dst16, nullptr);
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ ASSERT_NE(src16, nullptr);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+ }
+ }
+
+ fwd_txfm_ref(in, coeff, pitch_, &txfm_param_);
+
+ // The 16-bit destination is passed through CONVERT_TO_BYTEPTR because
+ // RunInvTxfm() takes a byte pointer.
+ if (bit_depth_ == AOM_BITS_8) {
+ API_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+ } else {
+ API_REGISTER_STATE_CHECK(
+ RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+ }
+
+ // NOTE(review): the failure message below hard-codes "4x4" although this
+ // helper is not restricted to that size -- confirm before relying on it.
+ for (int j = 0; j < num_coeffs_; ++j) {
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+ const uint32_t error = diff * diff;
+ ASSERT_GE(static_cast<uint32_t>(limit), error)
+ << "Error: 4x4 IDCT has error " << error << " at index " << j;
+ }
+ }
+ aom_free(in);
+ aom_free(coeff);
+ aom_free(dst);
+ aom_free(src);
+ aom_free(src16);
+ aom_free(dst16);
+ }
+
+ int pitch_;
+ int height_;
+ FhtFunc<OutType> fwd_txfm_ref;
+ IhtFunc<OutType> inv_txfm_ref;
+ aom_bit_depth_t bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ TxfmParam txfm_param_;
+
+ private:
+  // Maps the coefficient count of a square transform (4x4, 8x8, 16x16, 32x32)
+  // to the length of one row. Any other count falls back to 4.
+  int FindRowLength() const {
+    switch (num_coeffs_) {
+      case 64: return 8;
+      case 256: return 16;
+      case 1024: return 32;
+      default: return 4;
+    }
+  }
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_TRANSFORM_TEST_BASE_H_
diff --git a/third_party/aom/test/twopass_encoder.sh b/third_party/aom/test/twopass_encoder.sh
new file mode 100755
index 0000000000..44e7327b8f
--- /dev/null
+++ b/third_party/aom/test/twopass_encoder.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom twopass_encoder example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to twopass_encoder_tests (on a new line).
+##
+# Source the helper script that sits next to this one. Both expansions are
+# quoted so a checkout path containing whitespace is not word-split.
+. "$(dirname "$0")/tools_common.sh"
+
+# Environment check: $YUV_RAW_INPUT is required.
+twopass_encoder_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 7.
+twopass_encoder() {
+  local codec="$1"
+  local limit=7
+  local encoder="$(aom_tool_path twopass_encoder)"
+  local output_file="${AOM_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"
+
+  # Nothing to run when the example binary is missing or not executable.
+  [ -x "${encoder}" ] || {
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  }
+
+  # Arguments: codec, width, height, input file, output file, limit.
+  if ! eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" \
+      "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" \
+      "${output_file}" "${limit}" ${devnull}; then
+    return 1
+  fi
+
+  # A run that leaves no output file behind counts as a failure.
+  if [ ! -e "${output_file}" ]; then
+    return 1
+  fi
+}
+
+twopass_encoder_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ twopass_encoder av1 || return 1
+ fi
+}
+
+twopass_encoder_tests="twopass_encoder_av1"
+
+run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/third_party/aom/test/util.h b/third_party/aom/test/util.h
new file mode 100644
index 0000000000..29df709c4f
--- /dev/null
+++ b/third_party/aom/test/util.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_UTIL_H_
+#define AOM_TEST_UTIL_H_
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom/aom_image.h"
+#include "aom_ports/aom_timer.h"
+
+// Macros
+#define GET_PARAM(k) std::get<k>(GetParam())
+
+// Returns 1 when the text starting at the last '.' in |filename| is exactly
+// ".y4m", and 0 otherwise. Names without a '.', or whose last '.' is the very
+// first character, yield 0.
+inline int is_extension_y4m(const char *filename) {
+  const char *const ext = strrchr(filename, '.');
+  const bool has_ext = (ext != nullptr) && (ext != filename);
+  return has_ext ? (strcmp(ext, ".y4m") == 0) : 0;
+}
+
+// Computes the PSNR of the AOM_PLANE_Y plane of |img1| against |img2| over
+// the d_w x d_h area, using 255 as the peak sample value. Asserts that both
+// images share the same format and display size. Returns 100.0 when the mean
+// squared error is not positive.
+inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
+  assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
+         (img1->d_h == img2->d_h));
+
+  const unsigned int width_y = img1->d_w;
+  const unsigned int height_y = img1->d_h;
+
+  int64_t sum_sq_diff = 0;
+  for (unsigned int row = 0; row < height_y; ++row) {
+    for (unsigned int col = 0; col < width_y; ++col) {
+      const int64_t delta =
+          img1->planes[AOM_PLANE_Y][row * img1->stride[AOM_PLANE_Y] + col] -
+          img2->planes[AOM_PLANE_Y][row * img2->stride[AOM_PLANE_Y] + col];
+      sum_sq_diff += delta * delta;
+    }
+  }
+
+  const double mse = static_cast<double>(sum_sq_diff) / (width_y * height_y);
+  if (mse > 0.0) return 10 * log10(255.0 * 255.0 / mse);
+  return 100.0;
+}
+
+// Marks |t| and returns the value of aom_usec_timer_elapsed(t) as a double.
+static INLINE double get_time_mark(aom_usec_timer *t) {
+  aom_usec_timer_mark(t);
+  const double elapsed = static_cast<double>(aom_usec_timer_elapsed(t));
+  return elapsed;
+}
+
+#endif // AOM_TEST_UTIL_H_
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
new file mode 100644
index 0000000000..a493a1f4cb
--- /dev/null
+++ b/third_party/aom/test/variance_test.cc
@@ -0,0 +1,4370 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <ostream>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/cdef_block.h"
+
+namespace {
+
+// Function-pointer signatures of the kernels exercised by the tests in this
+// file. Each test fixture is parameterized on one of these types.
+
+// MSE between an 8-bit |dst| block and a 16-bit |src| block, each with its own
+// stride.
+typedef uint64_t (*MseWxH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h);
+// Same operand types as above, but no source stride is passed.
+typedef uint64_t (*Mse16xH16bitFunc)(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h);
+// Variance of |a| against |b|; the sum of squared errors is returned through
+// |sse|.
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+// Per-8x8 SSE/sum/variance outputs plus totals (presumably for four adjacent
+// 8x8 blocks, going by the name -- confirm against the implementation).
+typedef void (*GetSseSum8x8QuadFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8);
+// Per-16x16 SSE/variance outputs plus totals (presumably for two adjacent
+// 16x16 blocks, going by the name -- confirm against the implementation).
+typedef void (*GetSseSum16x16DualFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16);
+// Variance with a sub-pixel (xoffset, yoffset) applied to |a|.
+typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+// Sub-pixel variance that additionally takes a second predictor.
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred);
+// Sum of squares over a block of int16_t samples.
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+// Sub-pixel variance with a second predictor combined using the weights in
+// |jcp_param|.
+typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, uint32_t *sse, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
+#if !CONFIG_REALTIME_ONLY
+// OBMC sub-pixel variance: takes a weighted source |wsrc| and a |mask| in
+// place of a plain reference block. Only built when CONFIG_REALTIME_ONLY is 0.
+typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
+ int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *sse);
+#endif
+
+using libaom_test::ACMRandom;
+
+// Truncate high bit depth results by downshifting (with rounding) by:
+//   2 * (bit_depth - 8) for *sse
+//   (bit_depth - 8) for *se
+// Only AOM_BITS_10 and AOM_BITS_12 are adjusted; any other depth leaves both
+// values untouched.
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+  if (bit_depth == AOM_BITS_12) {
+    *sse = (*sse + 128) >> 8;
+    *se = (*se + 8) >> 4;
+  } else if (bit_depth == AOM_BITS_10) {
+    *sse = (*sse + 8) >> 4;
+    *se = (*se + 2) >> 2;
+  }
+}
+
+// Reference sum of squares over the first 256 samples of |src|.
+static unsigned int mb_ss_ref(const int16_t *src) {
+  unsigned int total = 0;
+  for (const int16_t *p = src; p != src + 256; ++p) {
+    const int sample = *p;
+    total += sample * sample;
+  }
+  return total;
+}
+
+/* Reference variance of a (1 << l2w) x (1 << l2h) block.
+ * Note: the per-pixel "diff" in the variance algorithm is computed as
+ * (src - ref), matching the rest of the codebase.
+ * The sum of squared differences (after RoundHighBitDepth) is stored in
+ * *sse_ptr; the return value is sse - se^2 / (w * h). Both are truncated to
+ * 32 bits. When use_high_bit_depth_ is set, |src| and |ref| are accessed
+ * through CONVERT_TO_SHORTPTR.
+ */
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+                             int l2h, int src_stride, int ref_stride,
+                             uint32_t *sse_ptr, bool use_high_bit_depth_,
+                             aom_bit_depth_t bit_depth) {
+  const int width = 1 << l2w;
+  const int height = 1 << l2h;
+  int64_t sum = 0;
+  uint64_t sum_sq = 0;
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int src_pos = row * src_stride + col;
+      const int ref_pos = row * ref_stride + col;
+      const int diff = use_high_bit_depth_
+                           ? CONVERT_TO_SHORTPTR(src)[src_pos] -
+                                 CONVERT_TO_SHORTPTR(ref)[ref_pos]
+                           : src[src_pos] - ref[ref_pos];
+      sum += diff;
+      sum_sq += diff * diff;
+    }
+  }
+  RoundHighBitDepth(bit_depth, &sum, &sum_sq);
+  *sse_ptr = static_cast<uint32_t>(sum_sq);
+  return static_cast<uint32_t>(sum_sq - ((sum * sum) >> (l2w + l2h)));
+}
+
+/* The subpel reference functions differ from the codec version in one aspect:
+ * they calculate the bilinear factors directly instead of using a lookup table
+ * and therefore upshift xoff and yoff by 1. Only every other calculated value
+ * is used so the codec version shrinks the table to save space.
+ *
+ * subpel_variance_ref() reads |ref| with a row stride of (w + 1) samples and
+ * |src| with a row stride of w samples, where w = 1 << l2w and h = 1 << l2h.
+ * The sum of squared differences (after RoundHighBitDepth) is stored in
+ * *sse_ptr and sse - se^2 / (w * h) is returned, both truncated to 32 bits.
+ */
+static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
+ int l2w, int l2h, int xoff, int yoff,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ // Offsets become weights out of 16 (see the ">> 4" below).
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // Bilinear interpolation at a 16th pel step.
+ if (!use_high_bit_depth_) {
+ // a1/a2: current row, b1/b2: next row; each pair is blended
+ // horizontally, then the two results are blended vertically.
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ // Same computation on the 16-bit views of the buffers.
+ uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+// Reference sub-pixel variance with a second predictor.
+// The bilinear prediction from |ref| (row stride w + 1) is averaged with
+// |second_pred| (row stride w) using (r + p + 1) >> 1 before being compared
+// against |src| (row stride w). The sum of squared differences (after
+// RoundHighBitDepth) is stored in *sse_ptr; sse - se^2 / (w * h) is returned.
+static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
+ const uint8_t *second_pred, int l2w,
+ int l2h, int xoff, int yoff,
+ uint32_t *sse_ptr,
+ bool use_high_bit_depth,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ // Offsets become weights out of 16 (see the ">> 4" below).
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ // Rounded average of the interpolated sample and the second predictor.
+ const int diff =
+ ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ // Same computation on the 16-bit views of the buffers.
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+// Reference sub-pixel variance with a distance-weighted second predictor.
+// The bilinear prediction r is combined with |second_pred| as
+// ROUND_POWER_OF_TWO(r * fwd_offset + p * bck_offset, DIST_PRECISION_BITS)
+// before being compared against |src|.
+// NOTE(review): unlike the other subpel references in this file, |ref| is
+// indexed here with a row stride of (w + 0) rather than (w + 1); callers are
+// presumably expected to pass a buffer laid out that way -- confirm.
+static uint32_t dist_wtd_subpel_avg_variance_ref(
+ const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
+ int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
+ aom_bit_depth_t bit_depth, DIST_WTD_COMP_PARAMS *jcp_param) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ // Offsets become weights out of 16 (see the ">> 4" below).
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 0) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 0) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 0) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 0) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ // Weighted blend of the interpolated sample and the second predictor.
+ const int avg = ROUND_POWER_OF_TWO(
+ r * jcp_param->fwd_offset +
+ second_pred[w * y + x] * jcp_param->bck_offset,
+ DIST_PRECISION_BITS);
+ const int diff = avg - src[w * y + x];
+
+ se += diff;
+ sse += diff * diff;
+ } else {
+ // Same computation on the 16-bit views of the buffers.
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 0) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 0) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 0) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 0) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int avg =
+ ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset +
+ sec16[w * y + x] * jcp_param->bck_offset,
+ DIST_PRECISION_BITS);
+ const int diff = avg - src16[w * y + x];
+
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Reference OBMC sub-pixel variance.
+// The bilinear prediction r from |pre| (row stride w + 1) is multiplied by
+// |mask| and subtracted from |wsrc| (both row stride w); the result is
+// rounded with ROUND_POWER_OF_TWO_SIGNED(..., 12) to form the per-sample
+// difference. The sum of squared differences (after RoundHighBitDepth) is
+// stored in *sse_ptr; sse - se^2 / (w * h) is returned.
+static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
+ int xoff, int yoff,
+ const int32_t *wsrc,
+ const int32_t *mask, uint32_t *sse_ptr,
+ bool use_high_bit_depth_,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ // Offsets become weights out of 16 (see the ">> 4" below).
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // Bilinear interpolation at a 16th pel step.
+ if (!use_high_bit_depth_) {
+ const int a1 = pre[(w + 1) * (y + 0) + x + 0];
+ const int a2 = pre[(w + 1) * (y + 0) + x + 1];
+ const int b1 = pre[(w + 1) * (y + 1) + x + 0];
+ const int b2 = pre[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ROUND_POWER_OF_TWO_SIGNED(
+ wsrc[w * y + x] - r * mask[w * y + x], 12);
+ se += diff;
+ sse += diff * diff;
+ } else {
+ // Same computation on the 16-bit view of |pre|.
+ uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre);
+ const int a1 = pre16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = pre16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = pre16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = pre16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ROUND_POWER_OF_TWO_SIGNED(
+ wsrc[w * y + x] - r * mask[w * y + x], 12);
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fixture for sum-of-squares kernels. The test parameter is the function
+// under test; it is captured once at construction time.
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+ SumOfSquaresTest() : func_(GetParam()) {}
+
+ ~SumOfSquaresTest() override = default;
+
+ protected:
+ // Checks the result for blocks filled with a single constant value.
+ void ConstTest();
+ // Checks the result against mb_ss_ref() on random blocks.
+ void RefTest();
+
+ // Function under test.
+ SumOfSquaresFunction func_;
+ // Random source used by RefTest().
+ ACMRandom rnd_;
+};
+
+// For every constant v in [0, 255], fills a 256-entry block with v and expects
+// the function under test to return 256 * v * v.
+void SumOfSquaresTest::ConstTest() {
+  int16_t block[256];
+  for (int v = 0; v < 256; ++v) {
+    for (int16_t &sample : block) sample = v;
+    unsigned int res;
+    API_REGISTER_STATE_CHECK(res = func_(block));
+    EXPECT_EQ(256u * (v * v), res);
+  }
+}
+
+// Compares the function under test with mb_ss_ref() on 100 random blocks whose
+// samples are differences of two random bytes.
+void SumOfSquaresTest::RefTest() {
+  int16_t block[256];
+  for (int iter = 0; iter < 100; ++iter) {
+    for (int16_t &sample : block) sample = rnd_.Rand8() - rnd_.Rand8();
+
+    const unsigned int expected = mb_ss_ref(block);
+    unsigned int actual;
+    API_REGISTER_STATE_CHECK(actual = func_(block));
+    EXPECT_EQ(expected, actual);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Encapsulating struct to store the function to test along with
+// some testing context.
+// Can be used for MSE, SSE, Variance, etc.
+
+// Bundles one function under test with the block geometry and bit depth it is
+// exercised at. A bit_depth_value of 0 or less selects the low bit depth path
+// with AOM_BITS_8; a positive value selects the high bit depth path at that
+// depth.
+template <typename Func>
+struct TestParams {
+  TestParams(int log2w = 0, int log2h = 0, Func function = nullptr,
+             int bit_depth_value = 0)
+      : log2width(log2w),
+        log2height(log2h),
+        width(1 << log2w),
+        height(1 << log2h),
+        block_size(width * height),
+        func(function),
+        bit_depth(bit_depth_value > 0
+                      ? static_cast<aom_bit_depth_t>(bit_depth_value)
+                      : AOM_BITS_8),
+        use_high_bit_depth(bit_depth_value > 0),
+        mask((1u << bit_depth) - 1) {}
+
+  int log2width, log2height;  // block dimensions as powers of two
+  int width, height;          // 1 << log2width, 1 << log2height
+  int block_size;             // width * height
+  Func func;                  // function under test
+  aom_bit_depth_t bit_depth;
+  bool use_high_bit_depth;
+  uint32_t mask;  // (1 << bit_depth) - 1
+};
+
+// Streams a human-readable description of |p|: dimensions, the address of the
+// function under test, and the bit depth.
+template <typename Func>
+std::ostream &operator<<(std::ostream &os, const TestParams<Func> &p) {
+  os << "width/height:" << p.width << "/" << p.height;
+  os << " function:" << reinterpret_cast<const void *>(p.func);
+  os << " bit-depth:" << p.bit_depth;
+  return os;
+}
+
+// Fixture for MSE kernels that take an 8-bit destination block, a 16-bit
+// source block and explicit width/height arguments. Both buffers hold
+// block_size() samples and are addressed with a stride equal to the width.
+template <typename FunctionType>
+class MseWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  void SetUp() override {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    // Size each buffer by its element type. sizeof(src_) / sizeof(dst_) would
+    // be the size of the pointer rather than of one sample.
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(*src_)));
+    dst_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(16, block_size() * sizeof(*dst_)));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(dst_, nullptr);
+  }
+
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = nullptr;
+    dst_ = nullptr;
+  }
+
+ protected:
+  // Compares the function under test with aom_mse_wxh_16bit_c.
+  void RefMatchTestMse();
+  // Times the function under test against aom_mse_wxh_16bit_c.
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  uint8_t *dst_;   // block_size() 8-bit samples
+  uint16_t *src_;  // block_size() 16-bit samples
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+};
+
+// Times the same number of calls to aom_mse_wxh_16bit_c and to the function
+// under test on one random block, then prints both timings and their ratio.
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::SpeedTest() {
+  const int num_runs = 10000000;
+  const int w = width();
+  const int h = height();
+  const int dstride = d_stride();
+  const int sstride = s_stride();
+
+  for (int k = 0; k < block_size(); ++k) {
+    dst_[k] = rnd_.Rand8();
+    src_[k] = rnd_.Rand8();
+  }
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (int run = 0; run < num_runs; ++run) {
+    aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const double elapsed_time_c =
+      static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer test_timer;
+  aom_usec_timer_start(&test_timer);
+  for (int run = 0; run < num_runs; ++run) {
+    params_.func(dst_, dstride, src_, sstride, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  const double elapsed_time_simd =
+      static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", width(), height(),
+         elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+// Checks on 10 random blocks that the function under test returns the same
+// value as aom_mse_wxh_16bit_c.
+template <typename MseWxHFunctionType>
+void MseWxHTestClass<MseWxHFunctionType>::RefMatchTestMse() {
+  const int w = width();
+  const int h = height();
+  const int dstride = d_stride();
+  const int sstride = s_stride();
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int k = 0; k < block_size(); ++k) {
+      dst_[k] = rnd_.Rand8();
+      src_[k] = rnd_.Rand8();
+    }
+    API_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h));
+    API_REGISTER_STATE_CHECK(
+        mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// Fixture for MSE kernels that take an 8-bit destination block and a 16-bit
+// source block with no separate source stride. Both buffers hold mem_size
+// samples regardless of the parameterized block dimensions.
+template <typename FunctionType>
+class Mse16xHTestClass
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+ // Memory required to compute mse of two 8x8 and four 4x4 blocks assigned for
+ // maximum width 16 and maximum height 8.
+ int mem_size = 16 * 8;
+ void SetUp() override {
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, mem_size * sizeof(*src_)));
+ dst_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(16, mem_size * sizeof(*dst_)));
+ ASSERT_NE(src_, nullptr);
+ ASSERT_NE(dst_, nullptr);
+ }
+
+ void TearDown() override {
+ aom_free(src_);
+ aom_free(dst_);
+ src_ = nullptr;
+ dst_ = nullptr;
+ }
+
+ // Returns the lowest bit of one random byte (0 or 1).
+ uint8_t RandBool() {
+ const uint32_t value = rnd_.Rand8();
+ return (value & 0x1);
+ }
+
+ protected:
+ // Compares against aom_mse_16xh_16bit_c on buffers of extreme values.
+ void RefMatchExtremeTestMse();
+ // Compares against aom_mse_16xh_16bit_c on random buffers.
+ void RefMatchTestMse();
+ // Times the function under test against aom_mse_16xh_16bit_c.
+ void SpeedTest();
+
+ protected:
+ ACMRandom rnd_;
+ uint8_t *dst_;
+ uint16_t *src_;
+ TestParams<FunctionType> params_;
+
+ // some relay helpers
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ // The destination stride is the block width.
+ int d_stride() const { return params_.width; }
+};
+
+// Times loop_count calls of aom_mse_16xh_16bit_c against the same number of
+// calls of the function under test on one random buffer pair, then prints
+// both timings and their ratio.
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::SpeedTest() {
+  aom_usec_timer ref_timer, test_timer;
+  double elapsed_time_c = 0.0;
+  double elapsed_time_simd = 0.0;
+  const int loop_count = 10000000;
+  const int w = width();
+  const int h = height();
+  const int dstride = d_stride();
+
+  for (int k = 0; k < mem_size; ++k) {
+    dst_[k] = rnd_.Rand8();
+    // Right shift by 6 is done to generate more input in range of [0,255] than
+    // CDEF_VERY_LARGE
+    int rnd_i10 = rnd_.Rand16() >> 6;
+    src_[k] = (rnd_i10 < 256) ? rnd_i10 : CDEF_VERY_LARGE;
+  }
+
+  aom_usec_timer_start(&ref_timer);
+  for (int i = 0; i < loop_count; i++) {
+    aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  elapsed_time_c = static_cast<double>(aom_usec_timer_elapsed(&ref_timer));
+
+  aom_usec_timer_start(&test_timer);
+  for (int i = 0; i < loop_count; i++) {
+    params_.func(dst_, dstride, src_, w, h);
+  }
+  aom_usec_timer_mark(&test_timer);
+  elapsed_time_simd = static_cast<double>(aom_usec_timer_elapsed(&test_timer));
+
+  // The gain is printed with three fractional digits; the previous "%.31f"
+  // asked for thirty-one.
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%.3lf\n", width(),
+         height(), elapsed_time_c, elapsed_time_simd,
+         (elapsed_time_c / elapsed_time_simd));
+}
+
+// Checks on 10 random buffer pairs that the function under test returns the
+// same value as aom_mse_16xh_16bit_c.
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::RefMatchTestMse() {
+  const int w = width();
+  const int h = height();
+  const int dstride = d_stride();
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int k = 0; k < mem_size; ++k) {
+      dst_[k] = rnd_.Rand8();
+      // Right shift by 6 is done to generate more input in range of [0,255]
+      // than CDEF_VERY_LARGE
+      const int rnd_i10 = rnd_.Rand16() >> 6;
+      if (rnd_i10 < 256) {
+        src_[k] = rnd_i10;
+      } else {
+        src_[k] = CDEF_VERY_LARGE;
+      }
+    }
+
+    API_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h));
+    API_REGISTER_STATE_CHECK(mse_mod = params_.func(dst_, dstride, src_, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// Checks the function under test against aom_mse_16xh_16bit_c on buffers
+// holding only extreme values: 0 or 255 in dst_, 0 or CDEF_VERY_LARGE in src_.
+template <typename Mse16xHFunctionType>
+void Mse16xHTestClass<Mse16xHFunctionType>::RefMatchExtremeTestMse() {
+  const int kIterations = 10;
+  const int w = width();
+  const int h = height();
+  const int dstride = d_stride();
+  uint64_t mse_ref = 0;
+  uint64_t mse_mod = 0;
+
+  for (int iter = 0; iter < kIterations; ++iter) {
+    // Fill the buffers with extreme values
+    for (int k = 0; k < mem_size; ++k) {
+      dst_[k] = static_cast<uint8_t>(RandBool() ? 0 : 255);
+      src_[k] = static_cast<uint16_t>(RandBool() ? 0 : CDEF_VERY_LARGE);
+    }
+
+    API_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_16xh_16bit_c(dst_, dstride, src_, w, h));
+    API_REGISTER_STATE_CHECK(mse_mod = params_.func(dst_, dstride, src_, w, h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// Main class for testing a function type.
+// src_ and ref_ each hold block_size() samples of one or two bytes depending
+// on use_high_bit_depth(). In high bit depth mode the stored pointers are the
+// CONVERT_TO_BYTEPTR form of the real allocations and must be converted back
+// before being freed.
+template <typename FunctionType>
+class MainTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  void SetUp() override {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    const size_t unit =
+        use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
+    src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size() * unit));
+    ref_ = new uint8_t[block_size() * unit];
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
+    // Clear the whole allocation. The buffers are block_size() * unit bytes,
+    // so sizing the memset by sizeof(src_[0]) (one byte) would leave half of
+    // each high bit depth buffer uninitialized.
+    memset(src_, 0, block_size() * unit);
+    memset(ref_, 0, block_size() * unit);
+    if (use_high_bit_depth()) {
+      // TODO(skal): remove!
+      src_ = CONVERT_TO_BYTEPTR(src_);
+      ref_ = CONVERT_TO_BYTEPTR(ref_);
+    }
+  }
+
+  void TearDown() override {
+    if (use_high_bit_depth()) {
+      // TODO(skal): remove!
+      // Undo the pointer conversion done in SetUp() so the original
+      // allocations are released.
+      src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
+      ref_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(ref_));
+    }
+
+    // src_ came from aom_memalign, ref_ from new[].
+    aom_free(src_);
+    delete[] ref_;
+    src_ = nullptr;
+    ref_ = nullptr;
+  }
+
+ protected:
+  // We could sub-class MainTestClass into dedicated class for Variance
+  // and MSE/SSE, but it involves a lot of 'this->xxx' dereferencing
+  // to access top class fields xxx. That's cumbersome, so for now we'll just
+  // implement the testing methods here:
+
+  // Variance tests
+  void ZeroTest();
+  void RefTest();
+  void RefStrideTest();
+  void OneQuarterTest();
+  void SpeedTest();
+
+  // SSE&SUM tests
+  void RefTestSseSum();
+  void MinTestSseSum();
+  void MaxTestSseSum();
+  void SseSum_SpeedTest();
+
+  // SSE&SUM dual tests
+  void RefTestSseSumDual();
+  void MinTestSseSumDual();
+  void MaxTestSseSumDual();
+  void SseSum_SpeedTestDual();
+
+  // MSE/SSE tests
+  void RefTestMse();
+  void RefTestSse();
+  void MaxTestMse();
+  void MaxTestSse();
+
+ protected:
+  ACMRandom rnd_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t mask() const { return params_.mask; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to variance.
+
+// Fills src and ref with every pair of constant levels in [0, 255] (shifted
+// left by byte_shift() for high bit depth) and expects a variance of zero.
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::ZeroTest() {
+  for (int src_level = 0; src_level <= 255; ++src_level) {
+    if (use_high_bit_depth()) {
+      uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_);
+      for (int k = 0; k < block_size(); ++k) {
+        src16[k] = src_level << byte_shift();
+      }
+    } else {
+      memset(src_, src_level, block_size());
+    }
+    for (int ref_level = 0; ref_level <= 255; ++ref_level) {
+      if (use_high_bit_depth()) {
+        uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_);
+        for (int k = 0; k < block_size(); ++k) {
+          ref16[k] = ref_level << byte_shift();
+        }
+      } else {
+        memset(ref_, ref_level, block_size());
+      }
+      unsigned int sse, var;
+      API_REGISTER_STATE_CHECK(
+          var = params_.func(src_, width(), ref_, width(), &sse));
+      EXPECT_EQ(0u, var) << "src values: " << src_level
+                         << " ref values: " << ref_level;
+    }
+  }
+}
+
+// Runs ten rounds of random src/ref data and checks that both the returned
+// variance and the written SSE agree with variance_ref().
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::RefTest() {
+  const int stride = width();
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      if (use_high_bit_depth()) {
+        CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+      } else {
+        src_[idx] = rnd_.Rand8();
+        ref_[idx] = rnd_.Rand8();
+      }
+    }
+    unsigned int sse_test, sse_expected, var_test, var_expected;
+    API_REGISTER_STATE_CHECK(
+        var_test = params_.func(src_, stride, ref_, stride, &sse_test));
+    var_expected = variance_ref(src_, ref_, params_.log2width,
+                                params_.log2height, stride, stride,
+                                &sse_expected, use_high_bit_depth(),
+                                params_.bit_depth);
+    EXPECT_EQ(sse_test, sse_expected) << "Error at test index: " << iter;
+    EXPECT_EQ(var_test, var_expected) << "Error at test index: " << iter;
+  }
+}
+
+// Like RefTest(), but cycles through the four combinations of src/ref stride
+// being either 0 or width(), and compares against variance_ref() called with
+// the same strides.
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::RefStrideTest() {
+  for (int iter = 0; iter < 10; ++iter) {
+    // Bit 0 of the iteration index enables the ref stride, bit 1 the src one.
+    const int ref_stride = (iter & 1) ? width() : 0;
+    const int src_stride = ((iter >> 1) & 1) ? width() : 0;
+    for (int idx = 0; idx < block_size(); ++idx) {
+      const int row = idx / width();
+      const int col = idx % width();
+      const int ref_ind = row * ref_stride + col;
+      const int src_ind = row * src_stride + col;
+      if (use_high_bit_depth()) {
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask();
+      } else {
+        src_[src_ind] = rnd_.Rand8();
+        ref_[ref_ind] = rnd_.Rand8();
+      }
+    }
+    unsigned int sse_test, sse_expected;
+    unsigned int var_test, var_expected;
+
+    API_REGISTER_STATE_CHECK(var_test = params_.func(src_, src_stride, ref_,
+                                                     ref_stride, &sse_test));
+    var_expected = variance_ref(src_, ref_, params_.log2width,
+                                params_.log2height, src_stride, ref_stride,
+                                &sse_expected, use_high_bit_depth(),
+                                params_.bit_depth);
+    EXPECT_EQ(sse_test, sse_expected) << "Error at test index: " << iter;
+    EXPECT_EQ(var_test, var_expected) << "Error at test index: " << iter;
+  }
+}
+
+// src is constant at level 255; ref is 255 in its first half and 0 in its
+// second half (levels shifted by byte_shift() for high bit depth). The
+// returned variance must equal block_size() * 255 * 255 / 4.
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
+  const int half = block_size() / 2;
+  if (use_high_bit_depth()) {
+    uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_);
+    uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_);
+    aom_memset16(src16, 255 << byte_shift(), block_size());
+    aom_memset16(ref16, 255 << byte_shift(), half);
+    aom_memset16(ref16 + half, 0, half);
+  } else {
+    memset(src_, 255, block_size());
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+  }
+  unsigned int sse, var;
+  API_REGISTER_STATE_CHECK(
+      var = params_.func(src_, width(), ref_, width(), &sse));
+  const unsigned int expected = block_size() * 255 * 255 / 4;
+  EXPECT_EQ(expected, var);
+}
+
+// Times 1000000000 / block_size() calls of the function under test on one
+// random block and prints the elapsed microseconds. Nothing is asserted.
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  for (int idx = 0; idx < block_size(); ++idx) {
+    if (!use_high_bit_depth()) {
+      src_[idx] = rnd_.Rand8();
+      ref_[idx] = rnd_.Rand8();
+#if CONFIG_AV1_HIGHBITDEPTH
+    } else {
+      CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+      CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+    }
+  }
+  const int stride = width();
+  const int num_calls = 1000000000 / block_size();
+  unsigned int sse;
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int call = 0; call < num_calls; ++call) {
+    params_.func(src_, stride, ref_, stride, &sse);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("Variance %dx%d : %d us\n", width(), height(), elapsed_time);
+}
+
+// Ten rounds of random 8-bit data: walks the block in steps of 8 rows by 32
+// columns and checks every output of the function under test against
+// aom_get_var_sse_sum_8x8_quad_c().
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::RefTestSseSum() {
+  const int stride = width();
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      src_[idx] = rnd_.Rand8();
+      ref_[idx] = rnd_.Rand8();
+    }
+    // Outputs of the function under test.
+    unsigned int sse_simd[256] = { 0 };
+    unsigned int var_simd[256] = { 0 };
+    int sum_simd[256] = { 0 };
+    unsigned int sse_tot_simd = 0;
+    int sum_tot_simd = 0;
+    // Outputs of the C reference.
+    unsigned int sse_c[256] = { 0 };
+    unsigned int var_c[256] = { 0 };
+    int sum_c[256] = { 0 };
+    unsigned int sse_tot_c = 0;
+    int sum_tot_c = 0;
+
+    int out = 0;  // first output slot handed to the current call
+    for (int row = 0; row < height(); row += 8) {
+      for (int col = 0; col < width(); col += 32) {
+        uint8_t *const src_blk = src_ + stride * row + col;
+        uint8_t *const ref_blk = ref_ + stride * row + col;
+        API_REGISTER_STATE_CHECK(params_.func(
+            src_blk, stride, ref_blk, stride, &sse_simd[out], &sum_simd[out],
+            &sse_tot_simd, &sum_tot_simd, &var_simd[out]));
+        aom_get_var_sse_sum_8x8_quad_c(src_blk, stride, ref_blk, stride,
+                                       &sse_c[out], &sum_c[out], &sse_tot_c,
+                                       &sum_tot_c, &var_c[out]);
+        out += 4;
+      }
+    }
+    EXPECT_EQ(sse_tot_c, sse_tot_simd);
+    EXPECT_EQ(sum_tot_c, sum_tot_simd);
+    for (int p = 0; p < 256; ++p) {
+      EXPECT_EQ(sse_simd[p], sse_c[p]);
+      EXPECT_EQ(sum_simd[p], sum_c[p]);
+      EXPECT_EQ(var_simd[p], var_c[p]);
+    }
+  }
+}
+
+// src all 0, ref all 255: checks every output of the function under test
+// against aom_get_var_sse_sum_8x8_quad_c().
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::MinTestSseSum() {
+  memset(src_, 0, block_size());
+  memset(ref_, 255, block_size());
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[256] = { 0 };
+  unsigned int var_simd[256] = { 0 };
+  int sum_simd[256] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[256] = { 0 };
+  unsigned int var_c[256] = { 0 };
+  int sum_c[256] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  int out = 0;  // first output slot handed to the current call
+  for (int row = 0; row < height(); row += 8) {
+    for (int col = 0; col < width(); col += 32) {
+      uint8_t *const src_blk = src_ + stride * row + col;
+      uint8_t *const ref_blk = ref_ + stride * row + col;
+      API_REGISTER_STATE_CHECK(params_.func(
+          src_blk, stride, ref_blk, stride, &sse_simd[out], &sum_simd[out],
+          &sse_tot_simd, &sum_tot_simd, &var_simd[out]));
+      aom_get_var_sse_sum_8x8_quad_c(src_blk, stride, ref_blk, stride,
+                                     &sse_c[out], &sum_c[out], &sse_tot_c,
+                                     &sum_tot_c, &var_c[out]);
+      out += 4;
+    }
+  }
+  EXPECT_EQ(sse_tot_simd, sse_tot_c);
+  EXPECT_EQ(sum_tot_simd, sum_tot_c);
+  for (int p = 0; p < 256; ++p) {
+    EXPECT_EQ(sse_simd[p], sse_c[p]);
+    EXPECT_EQ(sum_simd[p], sum_c[p]);
+    EXPECT_EQ(var_simd[p], var_c[p]);
+  }
+}
+
+// src all 255, ref all 0: checks every output of the function under test
+// against aom_get_var_sse_sum_8x8_quad_c().
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::MaxTestSseSum() {
+  memset(src_, 255, block_size());
+  memset(ref_, 0, block_size());
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[256] = { 0 };
+  unsigned int var_simd[256] = { 0 };
+  int sum_simd[256] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[256] = { 0 };
+  unsigned int var_c[256] = { 0 };
+  int sum_c[256] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  int out = 0;  // first output slot handed to the current call
+  for (int row = 0; row < height(); row += 8) {
+    for (int col = 0; col < width(); col += 32) {
+      uint8_t *const src_blk = src_ + stride * row + col;
+      uint8_t *const ref_blk = ref_ + stride * row + col;
+      API_REGISTER_STATE_CHECK(params_.func(
+          src_blk, stride, ref_blk, stride, &sse_simd[out], &sum_simd[out],
+          &sse_tot_simd, &sum_tot_simd, &var_simd[out]));
+      aom_get_var_sse_sum_8x8_quad_c(src_blk, stride, ref_blk, stride,
+                                     &sse_c[out], &sum_c[out], &sse_tot_c,
+                                     &sum_tot_c, &var_c[out]);
+      out += 4;
+    }
+  }
+  EXPECT_EQ(sse_tot_c, sse_tot_simd);
+  EXPECT_EQ(sum_tot_c, sum_tot_simd);
+
+  for (int p = 0; p < 256; ++p) {
+    EXPECT_EQ(sse_simd[p], sse_c[p]);
+    EXPECT_EQ(sum_simd[p], sum_c[p]);
+    EXPECT_EQ(var_simd[p], var_c[p]);
+  }
+}
+
+// Times aom_get_var_sse_sum_8x8_quad_c() and then the function under test
+// over the same random block (1000000000 / block_size() sweeps each) and
+// prints both timings and their ratio. Nothing is asserted.
+template <typename GetSseSum8x8QuadFuncType>
+void MainTestClass<GetSseSum8x8QuadFuncType>::SseSum_SpeedTest() {
+  const int loop_count = 1000000000 / block_size();
+  for (int idx = 0; idx < block_size(); ++idx) {
+    src_[idx] = rnd_.Rand8();
+    ref_[idx] = rnd_.Rand8();
+  }
+
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[4] = { 0 };
+  unsigned int var_simd[4] = { 0 };
+  int sum_simd[4] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[4] = { 0 };
+  unsigned int var_c[4] = { 0 };
+  int sum_c[4] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  aom_usec_timer timer;
+
+  // Pass 1: C reference.
+  aom_usec_timer_start(&timer);
+  for (int rep = 0; rep < loop_count; ++rep) {
+    for (int row = 0; row < height(); row += 8) {
+      for (int col = 0; col < width(); col += 32) {
+        aom_get_var_sse_sum_8x8_quad_c(
+            src_ + stride * row + col, stride, ref_ + stride * row + col,
+            stride, sse_c, sum_c, &sse_tot_c, &sum_tot_c, var_c);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_ref =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  // Pass 2: function under test, reusing the same timer object.
+  aom_usec_timer_start(&timer);
+  for (int rep = 0; rep < loop_count; ++rep) {
+    for (int row = 0; row < height(); row += 8) {
+      for (int col = 0; col < width(); col += 32) {
+        params_.func(src_ + stride * row + col, stride,
+                     ref_ + stride * row + col, stride, sse_simd, sum_simd,
+                     &sse_tot_simd, &sum_tot_simd, var_simd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_simd =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf(
+      "aom_getvar_8x8_quad for block=%dx%d : ref_time=%lf \t simd_time=%lf \t "
+      "gain=%lf \n",
+      width(), height(), elapsed_time_ref, elapsed_time_simd,
+      elapsed_time_ref / elapsed_time_simd);
+}
+
+// Ten rounds of random 8-bit data: walks the block in steps of 16 rows by 32
+// columns and checks every output of the function under test against
+// aom_get_var_sse_sum_16x16_dual_c().
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::RefTestSseSumDual() {
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      src_[idx] = rnd_.Rand8();
+      ref_[idx] = rnd_.Rand8();
+    }
+    // Suffix 1 / _simd: function under test. Suffix 2 / _c: C reference.
+    unsigned int sse1[64] = { 0 };
+    unsigned int sse2[64] = { 0 };
+    unsigned int var1[64] = { 0 };
+    unsigned int var2[64] = { 0 };
+    unsigned int sse_tot_c = 0;
+    unsigned int sse_tot_simd = 0;
+    int sum_tot_c = 0;
+    int sum_tot_simd = 0;
+    const int stride = width();
+    int k = 0;  // first output slot handed to the current call
+
+    for (int row = 0; row < height(); row += 16) {
+      for (int col = 0; col < width(); col += 32) {
+        API_REGISTER_STATE_CHECK(params_.func(
+            src_ + stride * row + col, stride, ref_ + stride * row + col,
+            stride, &sse1[k], &sse_tot_simd, &sum_tot_simd, &var1[k]));
+        aom_get_var_sse_sum_16x16_dual_c(
+            src_ + stride * row + col, stride, ref_ + stride * row + col,
+            stride, &sse2[k], &sse_tot_c, &sum_tot_c, &var2[k]);
+        k += 2;
+      }
+    }
+    // The running totals do not depend on the output index, so they are
+    // compared once here rather than again for each of the 64 entries below.
+    EXPECT_EQ(sse_tot_c, sse_tot_simd);
+    EXPECT_EQ(sum_tot_c, sum_tot_simd);
+    for (int p = 0; p < 64; p++) {
+      EXPECT_EQ(sse1[p], sse2[p]);
+      EXPECT_EQ(var1[p], var2[p]);
+    }
+  }
+}
+
+// src all 0, ref all 255: checks every output of the function under test
+// against aom_get_var_sse_sum_16x16_dual_c().
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::MinTestSseSumDual() {
+  memset(src_, 0, block_size());
+  memset(ref_, 255, block_size());
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[64] = { 0 };
+  unsigned int var_simd[64] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[64] = { 0 };
+  unsigned int var_c[64] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  int out = 0;  // first output slot handed to the current call
+  for (int row = 0; row < height(); row += 16) {
+    for (int col = 0; col < width(); col += 32) {
+      uint8_t *const src_blk = src_ + stride * row + col;
+      uint8_t *const ref_blk = ref_ + stride * row + col;
+      API_REGISTER_STATE_CHECK(
+          params_.func(src_blk, stride, ref_blk, stride, &sse_simd[out],
+                       &sse_tot_simd, &sum_tot_simd, &var_simd[out]));
+      aom_get_var_sse_sum_16x16_dual_c(src_blk, stride, ref_blk, stride,
+                                       &sse_c[out], &sse_tot_c, &sum_tot_c,
+                                       &var_c[out]);
+      out += 2;
+    }
+  }
+  EXPECT_EQ(sse_tot_simd, sse_tot_c);
+  EXPECT_EQ(sum_tot_simd, sum_tot_c);
+  for (int p = 0; p < 64; ++p) {
+    EXPECT_EQ(sse_simd[p], sse_c[p]);
+    EXPECT_EQ(var_simd[p], var_c[p]);
+  }
+}
+
+// src all 255, ref all 0: checks every output of the function under test
+// against aom_get_var_sse_sum_16x16_dual_c().
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::MaxTestSseSumDual() {
+  memset(src_, 255, block_size());
+  memset(ref_, 0, block_size());
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[64] = { 0 };
+  unsigned int var_simd[64] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[64] = { 0 };
+  unsigned int var_c[64] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  int out = 0;  // first output slot handed to the current call
+  for (int row = 0; row < height(); row += 16) {
+    for (int col = 0; col < width(); col += 32) {
+      uint8_t *const src_blk = src_ + stride * row + col;
+      uint8_t *const ref_blk = ref_ + stride * row + col;
+      API_REGISTER_STATE_CHECK(
+          params_.func(src_blk, stride, ref_blk, stride, &sse_simd[out],
+                       &sse_tot_simd, &sum_tot_simd, &var_simd[out]));
+      aom_get_var_sse_sum_16x16_dual_c(src_blk, stride, ref_blk, stride,
+                                       &sse_c[out], &sse_tot_c, &sum_tot_c,
+                                       &var_c[out]);
+      out += 2;
+    }
+  }
+  EXPECT_EQ(sse_tot_c, sse_tot_simd);
+  EXPECT_EQ(sum_tot_c, sum_tot_simd);
+
+  for (int p = 0; p < 64; ++p) {
+    EXPECT_EQ(sse_simd[p], sse_c[p]);
+    EXPECT_EQ(var_simd[p], var_c[p]);
+  }
+}
+
+// Times aom_get_var_sse_sum_16x16_dual_c() and then the function under test
+// over the same random block (1000000000 / block_size() sweeps each) and
+// prints both timings and their ratio. Nothing is asserted.
+template <typename GetSseSum16x16DualFuncType>
+void MainTestClass<GetSseSum16x16DualFuncType>::SseSum_SpeedTestDual() {
+  const int loop_count = 1000000000 / block_size();
+  for (int i = 0; i < block_size(); ++i) {
+    src_[i] = rnd_.Rand8();
+    ref_[i] = rnd_.Rand8();
+  }
+
+  const int stride = width();
+  // Outputs of the function under test.
+  unsigned int sse_simd[2] = { 0 };
+  unsigned int var_simd[2] = { 0 };
+  unsigned int sse_tot_simd = 0;
+  int sum_tot_simd = 0;
+  // Outputs of the C reference.
+  unsigned int sse_c[2] = { 0 };
+  unsigned int var_c[2] = { 0 };
+  unsigned int sse_tot_c = 0;
+  int sum_tot_c = 0;
+
+  aom_usec_timer timer;
+
+  // Pass 1: C reference.
+  aom_usec_timer_start(&timer);
+  for (int rep = 0; rep < loop_count; ++rep) {
+    for (int y = 0; y < height(); y += 16) {
+      for (int x = 0; x < width(); x += 32) {
+        aom_get_var_sse_sum_16x16_dual_c(
+            src_ + stride * y + x, stride, ref_ + stride * y + x, stride,
+            sse_c, &sse_tot_c, &sum_tot_c, var_c);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_ref =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  // Pass 2: function under test, reusing the same timer object.
+  aom_usec_timer_start(&timer);
+  for (int rep = 0; rep < loop_count; ++rep) {
+    for (int y = 0; y < height(); y += 16) {
+      for (int x = 0; x < width(); x += 32) {
+        params_.func(src_ + stride * y + x, stride, ref_ + stride * y + x,
+                     stride, sse_simd, &sse_tot_simd, &sum_tot_simd,
+                     var_simd);
+      }
+    }
+  }
+  aom_usec_timer_mark(&timer);
+  const double elapsed_time_simd =
+      static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf(
+      "aom_getvar_16x16_dual for block=%dx%d : ref_time=%lf \t simd_time=%lf "
+      "\t "
+      "gain=%lf \n",
+      width(), height(), elapsed_time_ref, elapsed_time_simd,
+      elapsed_time_ref / elapsed_time_simd);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to MSE / SSE.
+
+// Ten rounds of random data: the SSE written by the function under test must
+// match the SSE written by variance_ref(). Return values are not compared.
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestMse() {
+  const int stride = width();
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      if (!use_high_bit_depth()) {
+        src_[idx] = rnd_.Rand8();
+        ref_[idx] = rnd_.Rand8();
+#if CONFIG_AV1_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse_test, sse_expected;
+    API_REGISTER_STATE_CHECK(
+        params_.func(src_, stride, ref_, stride, &sse_test));
+    variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+                 stride, &sse_expected, use_high_bit_depth(),
+                 params_.bit_depth);
+    EXPECT_EQ(sse_test, sse_expected);
+  }
+}
+
+// Ten rounds of random 8-bit data: the value returned by the function under
+// test must match the SSE written by variance_ref() in 8-bit mode.
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestSse() {
+  const int stride = width();
+  for (int iter = 0; iter < 10; ++iter) {
+    for (int idx = 0; idx < block_size(); ++idx) {
+      src_[idx] = rnd_.Rand8();
+      ref_[idx] = rnd_.Rand8();
+    }
+    unsigned int returned;
+    unsigned int sse_expected;
+    API_REGISTER_STATE_CHECK(returned =
+                                 params_.func(src_, stride, ref_, stride));
+    variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+                 stride, &sse_expected, false, AOM_BITS_8);
+    EXPECT_EQ(returned, sse_expected);
+  }
+}
+
+// src at the maximum value for the bit depth, ref all zero. The SSE written
+// by the function under test must equal block_size() * max * max, rounded
+// down by 8 bits for 12-bit input and by 4 bits for 10-bit input.
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestMse() {
+  int max_value = (1 << params_.bit_depth) - 1;
+  if (!use_high_bit_depth()) {
+    memset(src_, max_value, block_size());
+    memset(ref_, 0, block_size());
+#if CONFIG_AV1_HIGHBITDEPTH
+  } else {
+    aom_memset16(CONVERT_TO_SHORTPTR(src_), max_value, block_size());
+    aom_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size());
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  }
+  const int stride = width();
+  unsigned int sse;
+  API_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse));
+
+  unsigned int expected = (unsigned int)block_size() * max_value * max_value;
+  if (params_.bit_depth == AOM_BITS_12) {
+    expected = ROUND_POWER_OF_TWO(expected, 8);
+  } else if (params_.bit_depth == AOM_BITS_10) {
+    expected = ROUND_POWER_OF_TWO(expected, 4);
+  }
+  // AOM_BITS_8 and any other depth: no scaling.
+  EXPECT_EQ(expected, sse);
+}
+
+// src all 255, ref all 0 (8-bit): the value returned by the function under
+// test must equal block_size() * 255 * 255.
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestSse() {
+  const int num_bytes = block_size();
+  memset(src_, 255, num_bytes);
+  memset(ref_, 0, num_bytes);
+  const int stride = width();
+  unsigned int returned;
+  API_REGISTER_STATE_CHECK(returned =
+                               params_.func(src_, stride, ref_, stride));
+  const unsigned int expected = block_size() * 255 * 255;
+  EXPECT_EQ(expected, returned);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+using std::get;
+using std::make_tuple;
+using std::tuple;
+
+// Fixture for sub-pixel variance functions, parameterised on
+// TestParams<FunctionType>.
+//
+// Owns three buffers allocated with 32-byte alignment: src_ and sec_ hold
+// block_size() samples, ref_ holds block_size() + width() + height() + 1
+// samples. In high-bit-depth mode samples are uint16_t and the stored
+// pointers are wrapped with CONVERT_TO_BYTEPTR.
+template <typename FunctionType>
+class SubpelVarianceTest
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  void SetUp() override {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    if (use_high_bit_depth()) {
+      src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
+      sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+          aom_memalign(32, block_size() * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
+    } else {
+      src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+      sec_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+      ref_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
+    }
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(sec_, nullptr);
+    ASSERT_NE(ref_, nullptr);
+  }
+
+  void TearDown() override {
+    if (use_high_bit_depth()) {
+      // Unwrap the pointers before handing them back to the allocator.
+      aom_free(CONVERT_TO_SHORTPTR(src_));
+      aom_free(CONVERT_TO_SHORTPTR(ref_));
+      aom_free(CONVERT_TO_SHORTPTR(sec_));
+    } else {
+      aom_free(src_);
+      aom_free(ref_);
+      aom_free(sec_);
+    }
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+  void SpeedTest();
+
+  ACMRandom rnd_;
+  uint8_t *src_;  // block_size() samples
+  uint8_t *ref_;  // block_size() + width() + height() + 1 samples
+  uint8_t *sec_;  // block_size() samples; used by the averaging variants
+  TestParams<FunctionType> params_;
+  DIST_WTD_COMP_PARAMS jcp_param_;
+
+  // Accessors forwarding to the current test parameters.
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t mask() const { return params_.mask; }
+};
+
+// For each of the 8x8 sub-pixel offsets, fills src and ref with fresh random
+// data and checks the variance and SSE against subpel_variance_ref().
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
+  const int ref_samples = block_size() + width() + height() + 1;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+        }
+      } else {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          src_[idx] = rnd_.Rand8();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          ref_[idx] = rnd_.Rand8();
+        }
+      }
+      unsigned int sse_test, sse_expected;
+      unsigned int var_test;
+      API_REGISTER_STATE_CHECK(
+          var_test = params_.func(ref_, width() + 1, xoff, yoff, src_,
+                                  width(), &sse_test));
+      const unsigned int var_expected = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, xoff, yoff,
+          &sse_expected, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse_test, sse_expected)
+          << "at position " << xoff << ", " << yoff;
+      EXPECT_EQ(var_test, var_expected)
+          << "at position " << xoff << ", " << yoff;
+    }
+  }
+}
+
+// Compares against subpel_variance_ref() on two-level extreme inputs, for
+// each of the 8x8 sub-pixel offsets.
+//   8-bit:          src = [0 ... | 255 ...],    ref = [255 ... | 0 ...]
+//   high bit depth: src = [mask() ... | 0 ...], ref = [0 ... | mask() ...]
+// The second ref region also covers the width() + height() + 1 extra samples
+// at the end of the buffer.
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  const int half = block_size() / 2;
+  const int ref_tail = half + width() + height() + 1;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_);
+        uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_);
+        aom_memset16(src16, mask(), half);
+        aom_memset16(src16 + half, 0, half);
+        aom_memset16(ref16, 0, half);
+        aom_memset16(ref16 + half, mask(), ref_tail);
+      } else {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, ref_tail);
+      }
+      unsigned int sse_test, sse_expected;
+      unsigned int var_test;
+      API_REGISTER_STATE_CHECK(
+          var_test = params_.func(ref_, width() + 1, xoff, yoff, src_,
+                                  width(), &sse_test));
+      const unsigned int var_expected = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, xoff, yoff,
+          &sse_expected, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse_test, sse_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+      EXPECT_EQ(var_test, var_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+    }
+  }
+}
+
+// Times the function under test and then subpel_variance_ref() on the same
+// random data, using 1000000000 / block_size() calls each with random
+// sub-pixel offsets, and prints both timings plus the speed-up ratio.
+// Nothing is asserted.
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  if (!use_high_bit_depth()) {
+    for (int j = 0; j < block_size(); j++) {
+      src_[j] = rnd_.Rand8();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      ref_[j] = rnd_.Rand8();
+    }
+  } else {
+    for (int j = 0; j < block_size(); j++) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+
+  unsigned int sse1, sse2;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  // Optimised function under test.
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+  aom_usec_timer timer_c;
+
+  // Reference implementation.
+  aom_usec_timer_start(&timer_c);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x, y,
+                        &sse2, use_high_bit_depth(), params_.bit_depth);
+  }
+  aom_usec_timer_mark(&timer_c);
+
+  const int elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+  // Report the ratio as a double: an integer quotient would truncate (1.9x
+  // would print as 1) and would divide by zero if the optimised time rounded
+  // down to 0 us.
+  const double gain =
+      elapsed_time > 0 ? static_cast<double>(elapsed_time_c) / elapsed_time
+                       : 0.0;
+  printf(
+      "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%lf \n",
+      width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+      gain);
+}
+
+// Specialisation for the averaging variant: for each of the 8x8 sub-pixel
+// offsets, fills src, sec and ref with random data and checks the result
+// against subpel_avg_variance_ref().
+template <>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+  const int ref_samples = block_size() + width() + height() + 1;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[idx] = rnd_.Rand16() & mask();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+        }
+      } else {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          src_[idx] = rnd_.Rand8();
+          sec_[idx] = rnd_.Rand8();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          ref_[idx] = rnd_.Rand8();
+        }
+      }
+      uint32_t sse_test, sse_expected;
+      uint32_t var_test, var_expected;
+      API_REGISTER_STATE_CHECK(
+          var_test = params_.func(ref_, width() + 1, xoff, yoff, src_,
+                                  width(), &sse_test, sec_));
+      var_expected = subpel_avg_variance_ref(
+          ref_, src_, sec_, params_.log2width, params_.log2height, xoff, yoff,
+          &sse_expected, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse_test, sse_expected)
+          << "at position " << xoff << ", " << yoff;
+      EXPECT_EQ(var_test, var_expected)
+          << "at position " << xoff << ", " << yoff;
+    }
+  }
+}
+
+// Specialisation for the distance-weighted averaging variant. For each of
+// the 8x8 sub-pixel offsets it draws random src/sec/ref data, then sweeps
+// the fwd/bck offset pairs taken from quant_dist_lookup_table and checks the
+// result against dist_wtd_subpel_avg_variance_ref().
+template <>
+void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
+  const int ref_samples = block_size() + width() + height() + 1;
+  // Unlike the other sub-pixel tests in this file, the reference buffer is
+  // passed with a stride of width() rather than width() + 1.
+  const int ref_stride = width() + 0;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          CONVERT_TO_SHORTPTR(src_)[idx] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[idx] = rnd_.Rand16() & mask();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          CONVERT_TO_SHORTPTR(ref_)[idx] = rnd_.Rand16() & mask();
+        }
+      } else {
+        for (int idx = 0; idx < block_size(); ++idx) {
+          src_[idx] = rnd_.Rand8();
+          sec_[idx] = rnd_.Rand8();
+        }
+        for (int idx = 0; idx < ref_samples; ++idx) {
+          ref_[idx] = rnd_.Rand8();
+        }
+      }
+      for (int order = 0; order < 2; ++order) {
+        for (int dist = 0; dist < 4; ++dist) {
+          // order == 0 takes (column 0, column 1) of the table row as
+          // (fwd, bck); order == 1 swaps the two columns.
+          jcp_param_.fwd_offset = quant_dist_lookup_table[dist][order];
+          jcp_param_.bck_offset = quant_dist_lookup_table[dist][1 - order];
+          uint32_t sse_test, sse_expected;
+          uint32_t var_test, var_expected;
+          API_REGISTER_STATE_CHECK(
+              var_test = params_.func(ref_, ref_stride, xoff, yoff, src_,
+                                      width(), &sse_test, sec_, &jcp_param_));
+          var_expected = dist_wtd_subpel_avg_variance_ref(
+              ref_, src_, sec_, params_.log2width, params_.log2height, xoff,
+              yoff, &sse_expected, use_high_bit_depth(), params_.bit_depth,
+              &jcp_param_);
+          EXPECT_EQ(sse_test, sse_expected)
+              << "at position " << xoff << ", " << yoff;
+          EXPECT_EQ(var_test, var_expected)
+              << "at position " << xoff << ", " << yoff;
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if !CONFIG_REALTIME_ONLY
+
+static const int kMaskMax = 64;  // OBMC tests use kMaskMax * kMaskMax as the largest mask_ value
+
+typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;  // parameter type for the OBMC sub-pixel variance fixture
+
+// Fixture for OBMC sub-pixel variance functions, parameterised on
+// TestParams<FunctionType>.
+//
+// Owns three buffers allocated with 32-byte alignment: pre_ holds
+// block_size() + width() + height() + 1 samples (uint16_t and wrapped with
+// CONVERT_TO_BYTEPTR in high-bit-depth mode); wsrc_ and mask_ each hold
+// block_size() 32-bit entries.
+template <typename FunctionType>
+class ObmcVarianceTest
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  void SetUp() override {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    if (use_high_bit_depth()) {
+      pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
+    } else {
+      pre_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
+    }
+    wsrc_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    mask_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    ASSERT_NE(pre_, nullptr);
+    ASSERT_NE(wsrc_, nullptr);
+    ASSERT_NE(mask_, nullptr);
+  }
+
+  void TearDown() override {
+    if (use_high_bit_depth()) {
+      // Unwrap the pointer before handing it back to the allocator.
+      aom_free(CONVERT_TO_SHORTPTR(pre_));
+    } else {
+      aom_free(pre_);
+    }
+    aom_free(wsrc_);
+    aom_free(mask_);
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+  void SpeedTest();
+
+  ACMRandom rnd_;
+  uint8_t *pre_;   // block_size() + width() + height() + 1 samples
+  int32_t *wsrc_;  // block_size() entries
+  int32_t *mask_;  // block_size() entries
+  TestParams<FunctionType> params_;
+
+  // Accessors forwarding to the current test parameters.
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t bd_mask() const { return params_.mask; }
+};
+
+// For each of the 8x8 sub-pixel offsets, draws random pre/wsrc/mask data and
+// checks the variance and SSE against obmc_subpel_variance_ref().
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
+  const int pre_samples = block_size() + width() + height() + 1;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        for (int idx = 0; idx < pre_samples; ++idx) {
+          CONVERT_TO_SHORTPTR(pre_)[idx] = rnd_.Rand16() & bd_mask();
+        }
+      } else {
+        for (int idx = 0; idx < pre_samples; ++idx) {
+          pre_[idx] = rnd_.Rand8();
+        }
+      }
+      for (int idx = 0; idx < block_size(); ++idx) {
+        wsrc_[idx] =
+            (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+        mask_[idx] = rnd_(kMaskMax * kMaskMax + 1);
+      }
+
+      uint32_t sse_test, sse_expected;
+      uint32_t var_test, var_expected;
+      API_REGISTER_STATE_CHECK(
+          var_test = params_.func(pre_, width() + 1, xoff, yoff, wsrc_, mask_,
+                                  &sse_test));
+      var_expected = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, xoff, yoff, wsrc_,
+          mask_, &sse_expected, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse_test, sse_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+      EXPECT_EQ(var_test, var_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+    }
+  }
+}
+
+// Compares against obmc_subpel_variance_ref() on two-level extreme inputs,
+// for each of the 8x8 sub-pixel offsets.
+//   Pre:  first half at the maximum (255 or bd_mask()), remainder 0.
+//   WSrc: first half bd_mask() * kMaskMax * kMaskMax, second half 0.
+//   Mask: first half 0, second half kMaskMax * kMaskMax.
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
+  const int half = block_size() / 2;
+  const int pre_tail = half + width() + height() + 1;
+  for (int xoff = 0; xoff < 8; ++xoff) {
+    for (int yoff = 0; yoff < 8; ++yoff) {
+      if (use_high_bit_depth()) {
+        uint16_t *const pre16 = CONVERT_TO_SHORTPTR(pre_);
+        aom_memset16(pre16, bd_mask(), half);
+        aom_memset16(pre16 + half, 0, pre_tail);
+      } else {
+        memset(pre_, 255, half);
+        memset(pre_ + half, 0, pre_tail);
+      }
+      for (int idx = 0; idx < block_size(); ++idx) {
+        if (idx < half) {
+          wsrc_[idx] = bd_mask() * kMaskMax * kMaskMax;
+          mask_[idx] = 0;
+        } else {
+          wsrc_[idx] = 0;
+          mask_[idx] = kMaskMax * kMaskMax;
+        }
+      }
+
+      uint32_t sse_test, sse_expected;
+      uint32_t var_test, var_expected;
+      API_REGISTER_STATE_CHECK(
+          var_test = params_.func(pre_, width() + 1, xoff, yoff, wsrc_, mask_,
+                                  &sse_test));
+      var_expected = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, xoff, yoff, wsrc_,
+          mask_, &sse_expected, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse_test, sse_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+      EXPECT_EQ(var_test, var_expected)
+          << "for xoffset " << xoff << " and yoffset " << yoff;
+    }
+  }
+}
+
+// Times 1000000000 / block_size() calls of the function under test with
+// random sub-pixel offsets on one set of random data and prints the elapsed
+// microseconds. Nothing is asserted.
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
+  const int pre_samples = block_size() + width() + height() + 1;
+  if (use_high_bit_depth()) {
+    for (int idx = 0; idx < pre_samples; ++idx) {
+      CONVERT_TO_SHORTPTR(pre_)[idx] = rnd_.Rand16() & bd_mask();
+    }
+  } else {
+    for (int idx = 0; idx < pre_samples; ++idx) {
+      pre_[idx] = rnd_.Rand8();
+    }
+  }
+  for (int idx = 0; idx < block_size(); ++idx) {
+    wsrc_[idx] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+    mask_[idx] = rnd_(kMaskMax * kMaskMax + 1);
+  }
+
+  const int stride = width() + 1;
+  const int num_calls = 1000000000 / block_size();
+  unsigned int sse;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int call = 0; call < num_calls; ++call) {
+    const int xoff = rnd_(8);
+    const int yoff = rnd_(8);
+    API_REGISTER_STATE_CHECK(
+        params_.func(pre_, stride, xoff, yoff, wsrc_, mask_, &sse));
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+         params_.bit_depth, elapsed_time);
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+
+// Concrete fixture types: each binds one of the fixture templates to a
+// function-pointer type.
+using MseWxHTest = MseWxHTestClass<MseWxH16bitFunc>;
+using Mse16xHTest = Mse16xHTestClass<Mse16xH16bitFunc>;
+using AvxMseTest = MainTestClass<VarianceMxNFunc>;
+using AvxVarianceTest = MainTestClass<VarianceMxNFunc>;
+using GetSseSum8x8QuadTest = MainTestClass<GetSseSum8x8QuadFunc>;
+using GetSseSum16x16DualTest = MainTestClass<GetSseSum16x16DualFunc>;
+using AvxSubpelVarianceTest = SubpelVarianceTest<SubpixVarMxNFunc>;
+using AvxSubpelAvgVarianceTest = SubpelVarianceTest<SubpixAvgVarMxNFunc>;
+using AvxDistWtdSubpelAvgVarianceTest =
+    SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>;
+#if !CONFIG_REALTIME_ONLY
+using AvxObmcSubpelVarianceTest = ObmcVarianceTest<ObmcSubpelVarFunc>;
+#endif
+// Parameter types used by the INSTANTIATE_TEST_SUITE_P lists.
+using MseWxHParams = TestParams<MseWxH16bitFunc>;
+using Mse16xHParams = TestParams<Mse16xH16bitFunc>;
+
+// Test cases. Each one forwards to a fixture method; the DISABLED_ prefix
+// keeps the timing loops out of default runs.
+
+// 16-bit MSE helpers.
+TEST_P(MseWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
+TEST_P(Mse16xHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(Mse16xHTest, RefMseExtreme) { RefMatchExtremeTestMse(); }
+TEST_P(Mse16xHTest, DISABLED_SpeedMse) { SpeedTest(); }
+
+// MSE and variance.
+TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+
+// SSE/sum kernels.
+TEST_P(GetSseSum8x8QuadTest, RefMseSum) { RefTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, MinSseSum) { MinTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, MaxMseSum) { MaxTestSseSum(); }
+TEST_P(GetSseSum8x8QuadTest, DISABLED_Speed) { SseSum_SpeedTest(); }
+TEST_P(GetSseSum16x16DualTest, RefMseSum) { RefTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, MinSseSum) { MinTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, MaxMseSum) { MaxTestSseSumDual(); }
+TEST_P(GetSseSum16x16DualTest, DISABLED_Speed) { SseSum_SpeedTestDual(); }
+
+// Sum of squares.
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+
+// Sub-pixel variance and its averaging variants.
+TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+
+// OBMC sub-pixel variance.
+#if !CONFIG_REALTIME_ONLY
+TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
+
+// Instantiations of the MSE / sum-of-squares suites for the `_c` functions,
+// registered under the "C" prefix.
+// NOTE(review): the first two integer arguments look like log2 of the block
+// width and height (e.g. 4, 3 is paired with aom_mse16x8_c) - confirm against
+// the TestParams constructor, which is outside this section.
+INSTANTIATE_TEST_SUITE_P(
+ C, MseWxHTest,
+ ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_c, 8),
+ MseWxHParams(3, 2, &aom_mse_wxh_16bit_c, 8),
+ MseWxHParams(2, 3, &aom_mse_wxh_16bit_c, 8),
+ MseWxHParams(2, 2, &aom_mse_wxh_16bit_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ C, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_c, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_c, 8)));
+
+// SumOfSquaresTest is parameterized directly on the function pointer.
+INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
+ ::testing::Values(aom_get_mb_ss_c));
+
+// MseParams omits the fourth constructor argument that the suites above pass.
+typedef TestParams<VarianceMxNFunc> MseParams;
+INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+ MseParams(4, 3, &aom_mse16x8_c),
+ MseParams(3, 4, &aom_mse8x16_c),
+ MseParams(3, 3, &aom_mse8x8_c)));
+
+// Parameter table for the `_c` variance functions, fed to AvxVarianceTest.
+// Judging by the entries, the first two arguments are log2 of the block width
+// and height (7, 7 is paired with the 128x128 function, 2, 2 with 4x4).
+// The 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 entries are compiled only when
+// CONFIG_REALTIME_ONLY is 0.
+typedef TestParams<VarianceMxNFunc> VarianceParams;
+const VarianceParams kArrayVariance_c[] = {
+ VarianceParams(7, 7, &aom_variance128x128_c),
+ VarianceParams(7, 6, &aom_variance128x64_c),
+ VarianceParams(6, 7, &aom_variance64x128_c),
+ VarianceParams(6, 6, &aom_variance64x64_c),
+ VarianceParams(6, 5, &aom_variance64x32_c),
+ VarianceParams(5, 6, &aom_variance32x64_c),
+ VarianceParams(5, 5, &aom_variance32x32_c),
+ VarianceParams(5, 4, &aom_variance32x16_c),
+ VarianceParams(4, 5, &aom_variance16x32_c),
+ VarianceParams(4, 4, &aom_variance16x16_c),
+ VarianceParams(4, 3, &aom_variance16x8_c),
+ VarianceParams(3, 4, &aom_variance8x16_c),
+ VarianceParams(3, 3, &aom_variance8x8_c),
+ VarianceParams(3, 2, &aom_variance8x4_c),
+ VarianceParams(2, 3, &aom_variance4x8_c),
+ VarianceParams(2, 2, &aom_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(6, 4, &aom_variance64x16_c),
+ VarianceParams(4, 6, &aom_variance16x64_c),
+ VarianceParams(5, 3, &aom_variance32x8_c),
+ VarianceParams(3, 5, &aom_variance8x32_c),
+ VarianceParams(4, 2, &aom_variance16x4_c),
+ VarianceParams(2, 4, &aom_variance4x16_c),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxVarianceTest,
+ ::testing::ValuesIn(kArrayVariance_c));
+
+// Parameter tables for the quad-8x8 and dual-16x16 SSE/sum functions. Every
+// entry in a table uses the same `_c` function; only the two leading size
+// arguments differ (7,7 / 6,6 / 5,5 / 5,4). The last argument is 0 throughout.
+typedef TestParams<GetSseSum8x8QuadFunc> GetSseSumParams;
+const GetSseSumParams kArrayGetSseSum8x8Quad_c[] = {
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_c, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_c, 0)
+};
+INSTANTIATE_TEST_SUITE_P(C, GetSseSum8x8QuadTest,
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_c));
+
+typedef TestParams<GetSseSum16x16DualFunc> GetSseSumParamsDual;
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_c[] = {
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_c, 0),
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_c, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_c, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_c, 0)
+};
+
+INSTANTIATE_TEST_SUITE_P(C, GetSseSum16x16DualTest,
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_c));
+
+// Parameter table for the `_c` sub-pixel variance functions, fed to
+// AvxSubpelVarianceTest. The leading two arguments track the block size in
+// the function name as powers of two (7, 7 with 128x128 ... 2, 2 with 4x4);
+// the trailing argument is 0 for every entry.
+// The 4:1 / 1:4 block sizes are compiled only when CONFIG_REALTIME_ONLY is 0.
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+const SubpelVarianceParams kArraySubpelVariance_c[] = {
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+ SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+ SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+ SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+ SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelVarianceTest,
+ ::testing::ValuesIn(kArraySubpelVariance_c));
+
+// Parameter table for the `_c` sub-pixel average variance functions, fed to
+// AvxSubpelAvgVarianceTest. Same layout as the other tables in this file:
+// two size arguments, the function pointer, and a trailing 0.
+// The 4:1 / 1:4 block sizes are compiled only when CONFIG_REALTIME_ONLY is 0.
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_c[] = {
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+ SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+ SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+ SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+ SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+ SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+ SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArraySubpelAvgVariance_c));
+
+// Parameter table for the `_c` distance-weighted sub-pixel average variance
+// functions, fed to AvxDistWtdSubpelAvgVarianceTest. Unlike the neighbouring
+// tables this one starts at 64x64; it has no 128-wide or 128-tall entries.
+// The 4:1 / 1:4 block sizes are compiled only when CONFIG_REALTIME_ONLY is 0.
+typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_c[] = {
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(4, 3,
+ &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(3, 4,
+ &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(3, 3,
+ &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(3, 2,
+ &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+ DistWtdSubpelAvgVarianceParams(2, 3,
+ &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(2, 2,
+ &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+#if !CONFIG_REALTIME_ONLY
+
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+ DistWtdSubpelAvgVarianceParams(5, 3,
+ &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(3, 5,
+ &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(4, 2,
+ &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+ DistWtdSubpelAvgVarianceParams(2, 4,
+ &aom_dist_wtd_sub_pixel_avg_variance4x16_c, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_c));
+
+// "C" instantiation of the OBMC sub-pixel variance suite. The whole list,
+// including the 4:1 / 1:4 block sizes after the blank line, is compiled only
+// when CONFIG_REALTIME_ONLY is 0. ObmcSubpelVarianceParams is declared
+// earlier in the file.
+#if !CONFIG_REALTIME_ONLY
+INSTANTIATE_TEST_SUITE_P(
+ C, AvxObmcSubpelVarianceTest,
+ ::testing::Values(
+ ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
+ 0),
+ ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
+ ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
+ ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
+ ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
+ ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
+ ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
+ ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
+ ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
+ ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
+ ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
+ ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
+ ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
+ ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
+ ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
+ ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0),
+
+ ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_c, 0),
+ ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_c, 0),
+ ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_c, 0),
+ ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
+ ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
+ ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Signature shared by the high-bitdepth WxH MSE functions under test: two
+// uint16_t buffers (dst, src), each with its own stride, plus the block width
+// and height; the result is returned as a uint64_t.
+typedef uint64_t (*MseHBDWxH16bitFunc)(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h);
+
+// Fixture for the high-bitdepth WxH MSE functions. For each parameter set it
+// owns two 16-byte-aligned uint16_t buffers, src_ and dst_, which the test
+// bodies fill with random samples before calling the function under test.
+template <typename FunctionType>
+class MseHBDWxHTestClass
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  // Captures the test parameters, reseeds the random generator and allocates
+  // both sample buffers. Fails the test if either allocation returns null.
+  void SetUp() override {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    // Size the buffers by the element type (sizeof(*ptr)), not by the pointer
+    // itself: the fill loops in the test bodies write exactly block_size()
+    // uint16_t samples, and a tight allocation lets memory checkers catch
+    // out-of-bounds accesses by the functions under test.
+    src_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(*src_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size() * sizeof(*dst_)));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(dst_, nullptr);
+  }
+
+  // Releases both buffers and clears the pointers.
+  void TearDown() override {
+    aom_free(src_);
+    aom_free(dst_);
+    src_ = nullptr;
+    dst_ = nullptr;
+  }
+
+ protected:
+  // Compares params_.func against aom_mse_wxh_16bit_highbd_c.
+  void RefMatchTestMse();
+  // Times params_.func against aom_mse_wxh_16bit_highbd_c.
+  void SpeedTest();
+
+ protected:
+  ACMRandom rnd_;
+  // Start out null so the pointers never hold an indeterminate value.
+  uint16_t *dst_ = nullptr;
+  uint16_t *src_ = nullptr;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int d_stride() const { return params_.width; }  // stride is same as width
+  int s_stride() const { return params_.width; }  // stride is same as width
+  int height() const { return params_.height; }
+  int mask() const { return params_.mask; }
+};
+
+// Speed comparison between aom_mse_wxh_16bit_highbd_c and params_.func.
+// Both are called the same number of times on one randomly filled block; the
+// two elapsed times reported by aom_usec_timer_elapsed and their ratio are
+// printed.
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::SpeedTest() {
+  const int kNumCalls = 10000000;
+  const int blk_w = width();
+  const int blk_h = height();
+  const int dst_pitch = d_stride();
+  const int src_pitch = s_stride();
+
+  // Fill both buffers, dst first then src for each sample, so the sequence of
+  // values drawn from rnd_ stays the same.
+  const int sample_count = block_size();
+  for (int n = 0; n < sample_count; ++n) {
+    dst_[n] = rnd_.Rand16() & mask();
+    src_[n] = rnd_.Rand16() & mask();
+  }
+
+  // Reference implementation.
+  aom_usec_timer c_timer;
+  aom_usec_timer_start(&c_timer);
+  for (int call = 0; call < kNumCalls; ++call) {
+    aom_mse_wxh_16bit_highbd_c(dst_, dst_pitch, src_, src_pitch, blk_w,
+                               blk_h);
+  }
+  aom_usec_timer_mark(&c_timer);
+  const double c_elapsed =
+      static_cast<double>(aom_usec_timer_elapsed(&c_timer));
+
+  // Function under test.
+  aom_usec_timer simd_timer;
+  aom_usec_timer_start(&simd_timer);
+  for (int call = 0; call < kNumCalls; ++call) {
+    params_.func(dst_, dst_pitch, src_, src_pitch, blk_w, blk_h);
+  }
+  aom_usec_timer_mark(&simd_timer);
+  const double simd_elapsed =
+      static_cast<double>(aom_usec_timer_elapsed(&simd_timer));
+
+  const double gain = c_elapsed / simd_elapsed;
+  printf("%dx%d\tc_time=%lf \t simd_time=%lf \t gain=%lf\n", blk_w, blk_h,
+         c_elapsed, simd_elapsed, gain);
+}
+
+// Checks params_.func against aom_mse_wxh_16bit_highbd_c: for each of 10
+// freshly randomized blocks, both functions must return the same value.
+template <typename MseHBDWxHFunctionType>
+void MseHBDWxHTestClass<MseHBDWxHFunctionType>::RefMatchTestMse() {
+  const int kNumTrials = 10;
+  const int blk_w = width();
+  const int blk_h = height();
+  const int dst_pitch = d_stride();
+  const int src_pitch = s_stride();
+  const int sample_count = block_size();
+
+  for (int trial = 0; trial < kNumTrials; ++trial) {
+    // dst is drawn before src for every sample, keeping the random sequence
+    // consumed from rnd_ in the same order.
+    for (int n = 0; n < sample_count; ++n) {
+      dst_[n] = rnd_.Rand16() & mask();
+      src_[n] = rnd_.Rand16() & mask();
+    }
+
+    uint64_t mse_ref = 0;
+    uint64_t mse_mod = 0;
+    API_REGISTER_STATE_CHECK(
+        mse_ref = aom_mse_wxh_16bit_highbd_c(dst_, dst_pitch, src_, src_pitch,
+                                             blk_w, blk_h));
+    API_REGISTER_STATE_CHECK(
+        mse_mod =
+            params_.func(dst_, dst_pitch, src_, src_pitch, blk_w, blk_h));
+    EXPECT_EQ(mse_ref, mse_mod)
+        << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
+  }
+}
+
+// High-bitdepth counterparts of the fixture aliases. They reuse the same
+// fixture templates; only the suite names (and the functions passed in the
+// instantiations) differ.
+// GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST tells GoogleTest not to report
+// a failure when a suite has TEST_P bodies but no instantiation.
+typedef TestParams<MseHBDWxH16bitFunc> MseHBDWxHParams;
+typedef MseHBDWxHTestClass<MseHBDWxH16bitFunc> MseHBDWxHTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDMseTest);
+typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+ AvxHBDDistWtdSubpelAvgVarianceTest;
+#if !CONFIG_REALTIME_ONLY
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+#endif
+// NOTE(review): this marker sits outside the #if that defines the typedef
+// above - confirm that is intentional.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AvxHBDObmcSubpelVarianceTest);
+
+// Test bodies for the high-bitdepth suites. Each TEST_P forwards to a member
+// function of its fixture; the DISABLED_ speed tests are skipped by
+// GoogleTest unless disabled tests are explicitly requested.
+TEST_P(MseHBDWxHTest, RefMse) { RefMatchTestMse(); }
+TEST_P(MseHBDWxHTest, DISABLED_SpeedMse) { SpeedTest(); }
+TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxHBDMseTest, DISABLED_SpeedMse) { SpeedTest(); }
+TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+// OBMC tests are compiled only when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+TEST_P(AvxHBDObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+#endif
+
+// "C" instantiations of the high-bitdepth MSE suites. MseHBDWxHTest passes 10
+// as the last argument for every size; AvxHBDMseTest covers the 12-, 10- and
+// 8-bit function families, with the last argument matching the number in the
+// function name.
+INSTANTIATE_TEST_SUITE_P(
+ C, MseHBDWxHTest,
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_c, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_c, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_c, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_c, 10)));
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_c, 12),
+ MseParams(4, 3, &aom_highbd_12_mse16x8_c, 12),
+ MseParams(3, 4, &aom_highbd_12_mse8x16_c, 12),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_c, 12),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_c, 10),
+ MseParams(4, 3, &aom_highbd_10_mse16x8_c, 10),
+ MseParams(3, 4, &aom_highbd_10_mse8x16_c, 10),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_c, 10),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_c, 8),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_c, 8),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_c, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_c, 8)));
+
+// NEON instantiations of the high-bitdepth MSE suites, compiled only when
+// HAVE_NEON is set. The parameter lists mirror the "C" instantiations with
+// the `_neon` functions substituted.
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, MseHBDWxHTest,
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_neon, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_neon,
+ 10)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_neon, 12),
+ MseParams(4, 3, &aom_highbd_12_mse16x8_neon, 12),
+ MseParams(3, 4, &aom_highbd_12_mse8x16_neon, 12),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_neon, 12),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_neon, 10),
+ MseParams(4, 3, &aom_highbd_10_mse16x8_neon, 10),
+ MseParams(3, 4, &aom_highbd_10_mse8x16_neon, 10),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_neon, 10),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_neon, 8),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_neon, 8),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_neon, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_neon, 8)));
+#endif // HAVE_NEON
+
+// NEON_DOTPROD instantiation of AvxHBDMseTest, compiled only when
+// HAVE_NEON_DOTPROD is set. Only the 8-bit function family is listed here.
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_8_mse16x16_neon_dotprod, 8),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_neon_dotprod, 8),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_neon_dotprod, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_neon_dotprod, 8)));
+#endif // HAVE_NEON_DOTPROD
+
+// SVE instantiations of the high-bitdepth MSE suites, compiled only when
+// HAVE_SVE is set. The parameter lists mirror the "C" instantiations with the
+// `_sve` functions substituted.
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, MseHBDWxHTest,
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sve, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sve, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sve, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sve,
+ 10)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SVE, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sve, 12),
+ MseParams(4, 3, &aom_highbd_12_mse16x8_sve, 12),
+ MseParams(3, 4, &aom_highbd_12_mse8x16_sve, 12),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_sve, 12),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_sve, 10),
+ MseParams(4, 3, &aom_highbd_10_mse16x8_sve, 10),
+ MseParams(3, 4, &aom_highbd_10_mse8x16_sve, 10),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_sve, 8),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_sve, 8),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_sve, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_sve, 8)));
+#endif // HAVE_SVE
+
+// Parameter table for the `_c` high-bitdepth variance functions, fed to
+// AvxHBDVarianceTest. Entries are grouped by function family (12-, 10-, then
+// 8-bit), each covering 128x128 down to 4x4; the last argument matches the
+// number in the function name. The 4:1 / 1:4 block sizes for all three
+// families are compiled only when CONFIG_REALTIME_ONLY is 0.
+const VarianceParams kArrayHBDVariance_c[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_c, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_c, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_c, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_c, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_c, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_c, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_c, 12),
+ VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12),
+ VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_c, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_c, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_c, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_c, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_c, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_c, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_c, 10),
+ VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10),
+ VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_c, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_c, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_c, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_c, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_c, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_c, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_c, 8),
+ VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
+ VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12),
+ VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12),
+ VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12),
+ VarianceParams(3, 5, &aom_highbd_12_variance8x32_c, 12),
+ VarianceParams(4, 2, &aom_highbd_12_variance16x4_c, 12),
+ VarianceParams(2, 4, &aom_highbd_12_variance4x16_c, 12),
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_c, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_c, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_c, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_c, 10),
+ VarianceParams(4, 2, &aom_highbd_10_variance16x4_c, 10),
+ VarianceParams(2, 4, &aom_highbd_10_variance4x16_c, 10),
+ VarianceParams(6, 4, &aom_highbd_8_variance64x16_c, 8),
+ VarianceParams(4, 6, &aom_highbd_8_variance16x64_c, 8),
+ VarianceParams(5, 3, &aom_highbd_8_variance32x8_c, 8),
+ VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8),
+ VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8),
+ VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_c));
+
+// SSE4_1 instantiation of AvxHBDVarianceTest, compiled only when HAVE_SSE4_1
+// is set. Only the 4x4 functions (8-, 10- and 12-bit) are listed.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AvxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12)));
+#endif // HAVE_SSE4_1
+
+// Parameter table for the `_c` high-bitdepth sub-pixel variance functions,
+// fed to AvxHBDSubpelVarianceTest. Entries are grouped by function family
+// (8-, 10-, then 12-bit), each covering 128x128 down to 4x4; the last
+// argument matches the number in the function name. The 4:1 / 1:4 block
+// sizes for all three families are compiled only when CONFIG_REALTIME_ONLY
+// is 0.
+const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8),
+ SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8),
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_c, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10),
+ SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10),
+ SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12),
+ SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12),
+ SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
+ SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
+#if !CONFIG_REALTIME_ONLY
+ SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8),
+ SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8),
+ SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8),
+ SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_c, 8),
+ SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_c, 8),
+ SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_c, 8),
+ SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_c, 10),
+ SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_c, 10),
+ SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_c, 10),
+ SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_c, 10),
+ SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_c, 10),
+ SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_c, 10),
+ SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_c, 12),
+ SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_c, 12),
+ SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_c, 12),
+ SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12),
+ SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12),
+ SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c,
+ 8),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c,
+ 8),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c,
+ 8),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c,
+ 10),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c,
+ 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c,
+ 12),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c,
+ 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_c, 8),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_c, 8),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_c, 8),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_c,
+ 10),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_c,
+ 10),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_c,
+ 10),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_c,
+ 10),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_c,
+ 10),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_c,
+ 12),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_c,
+ 12),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_c,
+ 12),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_c,
+ 12),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c,
+ 12),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest, // instantiation prefix "C"; parameters come from the *_c table named on the next line
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
+
+const DistWtdSubpelAvgVarianceParams kArrayHBDDistWtdSubpelAvgVariance_c[] = { // rows: (a, b, kernel, bits); the kernel name's block is 2^a x 2^b, e.g. (7, 6) <-> 128x64, and bits matches the highbd_N in the name
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_c, 12),
+
+#if !CONFIG_REALTIME_ONLY // the 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 rows are compiled only in non-realtime-only builds
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_c, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_c, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_c, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_c, 12),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P( // registers every row of the table above under the "C" prefix
+ C, AvxHBDDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_c));
+
+#if !CONFIG_REALTIME_ONLY // the whole OBMC table and its instantiation are left out of realtime-only builds
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = { // rows: (a, b, kernel, bits); the kernel name's block is 2^a x 2^b and bits matches the highbd_N in the name
+ ObmcSubpelVarianceParams(7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_c,
+ 8),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_8_obmc_sub_pixel_variance128x64_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_8_obmc_sub_pixel_variance64x128_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_8_obmc_sub_pixel_variance64x64_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_8_obmc_sub_pixel_variance64x32_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_8_obmc_sub_pixel_variance32x64_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_8_obmc_sub_pixel_variance32x32_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_8_obmc_sub_pixel_variance32x16_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_8_obmc_sub_pixel_variance16x32_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_8_obmc_sub_pixel_variance16x16_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_c,
+ 8),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_c,
+ 8),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_c, 8),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_c, 8),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_c, 8),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_c, 8),
+ ObmcSubpelVarianceParams(7, 7,
+ &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c,
+ 10),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c,
+ 10),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c,
+ 10),
+ ObmcSubpelVarianceParams(7, 7,
+ &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c,
+ 12),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c,
+ 12),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
+ 12),
+
+ ObmcSubpelVarianceParams(6, 4, &aom_highbd_8_obmc_sub_pixel_variance64x16_c, // from here on: the 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 rows
+ 8),
+ ObmcSubpelVarianceParams(4, 6, &aom_highbd_8_obmc_sub_pixel_variance16x64_c,
+ 8),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_c,
+ 8),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_c,
+ 8),
+ ObmcSubpelVarianceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_c,
+ 8),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_10_obmc_sub_pixel_variance32x8_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_10_obmc_sub_pixel_variance8x32_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 2, &aom_highbd_10_obmc_sub_pixel_variance16x4_c,
+ 10),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_10_obmc_sub_pixel_variance4x16_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 6, &aom_highbd_12_obmc_sub_pixel_variance16x64_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_12_obmc_sub_pixel_variance32x8_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_12_obmc_sub_pixel_variance8x32_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 2, &aom_highbd_12_obmc_sub_pixel_variance16x4_c,
+ 12),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_12_obmc_sub_pixel_variance4x16_c,
+ 12),
+};
+INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest, // registers every row of the table above under the "C" prefix
+ ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, MseWxHTest, // all four rows pass the same kernel; only the two leading size arguments differ
+ ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_sse2, 8), // NOTE(review): leading pair presumably log2 width/height (8x8 here) - confirm against the params type
+ MseWxHParams(3, 2, &aom_mse_wxh_16bit_sse2, 8),
+ MseWxHParams(2, 3, &aom_mse_wxh_16bit_sse2, 8),
+ MseWxHParams(2, 2, &aom_mse_wxh_16bit_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, Mse16xHTest, // all four rows pass the same kernel; only the two leading size arguments differ
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_sse2, 8), // NOTE(review): meaning of the leading pair for a "16xh" kernel is not visible here - confirm against the fixture
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_sse2, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_sse2, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, // a single parameter: the function itself, not a params row
+ ::testing::Values(aom_get_mb_ss_sse2));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest, // rows: (a, b, kernel); the kernel name's block is 2^a x 2^b; no bit-depth argument is passed
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
+ MseParams(4, 3, &aom_mse16x8_sse2),
+ MseParams(3, 4, &aom_mse8x16_sse2),
+ MseParams(3, 3, &aom_mse8x8_sse2)));
+
+const VarianceParams kArrayVariance_sse2[] = { // rows: (a, b, kernel); the kernel name's block is 2^a x 2^b, e.g. (7, 6) <-> 128x64
+ VarianceParams(7, 7, &aom_variance128x128_sse2),
+ VarianceParams(7, 6, &aom_variance128x64_sse2),
+ VarianceParams(6, 7, &aom_variance64x128_sse2),
+ VarianceParams(6, 6, &aom_variance64x64_sse2),
+ VarianceParams(6, 5, &aom_variance64x32_sse2),
+ VarianceParams(5, 6, &aom_variance32x64_sse2),
+ VarianceParams(5, 5, &aom_variance32x32_sse2),
+ VarianceParams(5, 4, &aom_variance32x16_sse2),
+ VarianceParams(4, 5, &aom_variance16x32_sse2),
+ VarianceParams(4, 4, &aom_variance16x16_sse2),
+ VarianceParams(4, 3, &aom_variance16x8_sse2),
+ VarianceParams(3, 4, &aom_variance8x16_sse2),
+ VarianceParams(3, 3, &aom_variance8x8_sse2),
+ VarianceParams(3, 2, &aom_variance8x4_sse2),
+ VarianceParams(2, 3, &aom_variance4x8_sse2),
+ VarianceParams(2, 2, &aom_variance4x4_sse2),
+#if !CONFIG_REALTIME_ONLY // the 64x16, 32x8, 16x64, 16x4, 8x32 and 4x16 rows are compiled only in non-realtime-only builds
+ VarianceParams(6, 4, &aom_variance64x16_sse2),
+ VarianceParams(5, 3, &aom_variance32x8_sse2),
+ VarianceParams(4, 6, &aom_variance16x64_sse2),
+ VarianceParams(4, 2, &aom_variance16x4_sse2),
+ VarianceParams(3, 5, &aom_variance8x32_sse2),
+ VarianceParams(2, 4, &aom_variance4x16_sse2),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxVarianceTest, // registers every row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArrayVariance_sse2));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_sse2[] = { // every row passes the same kernel and a trailing 0; only the leading pair differs
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_sse2, 0), // NOTE(review): leading pair presumably log2 of the tested region size - confirm against the fixture
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_sse2, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_sse2, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_sse2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum8x8QuadTest, // registers every row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_sse2));
+
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_sse2[] = { // every row passes the same kernel and a trailing 0; only the leading pair differs
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_sse2, 0), // NOTE(review): leading pair presumably log2 of the tested region size - confirm against the fixture
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_sse2, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_sse2, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_sse2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum16x16DualTest, // registers every row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_sse2));
+
+const SubpelVarianceParams kArraySubpelVariance_sse2[] = { // rows: (a, b, kernel, 0); the kernel name's block is 2^a x 2^b; NOTE(review): trailing 0 presumably means "not high bit depth" - confirm
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY // the 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 rows are compiled only in non-realtime-only builds
+ SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0),
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0),
+ SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0),
+ SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0),
+ SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelVarianceTest, // registers every row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArraySubpelVariance_sse2));
+
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_sse2[] = { // rows: (a, b, kernel, 0); the kernel name's block is 2^a x 2^b; NOTE(review): trailing 0 presumably means "not high bit depth" - confirm
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0),
+#if !CONFIG_REALTIME_ONLY // the 64x16, 16x64, 32x8, 8x32, 16x4 and 4x16 rows are compiled only in non-realtime-only builds
+ SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0),
+ SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0),
+ SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0),
+ SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelAvgVarianceTest, // registers every row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArraySubpelAvgVariance_sse2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_SSE2 // NOTE(review): an earlier #if HAVE_SSE2 in this file appears to be still open here, so this inner guard looks redundant - confirm
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, MseHBDWxHTest, // all four rows pass the same kernel with bit depth 10; only the two leading size arguments differ
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sse2, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sse2, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sse2,
+ 10)));
+#endif // HAVE_SSE2
+#if HAVE_SSE4_1 // only the 4x4 high-bit-depth kernels (8/10/12-bit) are registered for SSE4.1 in this block
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AvxSubpelVarianceTest, // NOTE(review): aom_highbd_* kernels registered under the non-"AvxHBD" suite name - confirm this is intended
+ ::testing::Values(
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1,
+ 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_sse4_1,
+ 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1,
+ 12)));
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AvxSubpelAvgVarianceTest, // NOTE(review): same naming question as the instantiation above - confirm
+ ::testing::Values(
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1,
+ 8),
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1,
+ 10),
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1,
+ 12)));
+#endif // HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AvxHBDMseTest, // only the 16x16 and 8x8 kernels are listed, once per bit depth (12, 10, 8)
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2, 12),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_sse2, 12),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_sse2, 10),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_sse2, 10),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_sse2, 8),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_sse2, 8)));
+
+const VarianceParams kArrayHBDVariance_sse2[] = { // rows: (a, b, kernel, bits); the kernel name's block is 2^a x 2^b; the smallest active row is 8x8
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_sse2, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_sse2, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_sse2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_sse2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_sse2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_sse2, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_sse2, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_sse2, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8),
+#if !CONFIG_REALTIME_ONLY // NOTE(review): the 16x4 and 4x16 rows below are commented out at all three bit depths; no reason is recorded here
+ VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12),
+ VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12),
+ VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12),
+ VarianceParams(3, 5, &aom_highbd_12_variance8x32_sse2, 12),
+ // VarianceParams(4, 2, &aom_highbd_12_variance16x4_sse2, 12),
+ // VarianceParams(2, 4, &aom_highbd_12_variance4x16_sse2, 12),
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_sse2, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_sse2, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_sse2, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_sse2, 10),
+ // VarianceParams(4, 2, &aom_highbd_10_variance16x4_sse2, 10),
+ // VarianceParams(2, 4, &aom_highbd_10_variance4x16_sse2, 10),
+ VarianceParams(6, 4, &aom_highbd_8_variance64x16_sse2, 8),
+ VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8),
+ VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8),
+ VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8),
+// VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8),
+// VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest, // registers every active row of the table above under the "SSE2" prefix
+ ::testing::ValuesIn(kArrayHBDVariance_sse2));
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, MseHBDWxHTest, // all four rows pass the same kernel with bit depth 10; only the two leading size arguments differ
+ ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+ MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_avx2, 10),
+ MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_avx2, 10),
+ MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_avx2,
+ 10)));
+
+const VarianceParams kArrayHBDVariance_avx2[] = { // rows: (a, b, kernel, bits); only 10-bit kernels are listed, and the smallest block is 8x8
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_avx2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_avx2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_avx2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_avx2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_avx2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_avx2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10),
+#if !CONFIG_REALTIME_ONLY // the 64x16, 32x8, 16x64 and 8x32 rows are compiled only in non-realtime-only builds
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_avx2, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_avx2, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_avx2, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_avx2, 10),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest, // registers every row of the table above under the "AVX2" prefix
+ ::testing::ValuesIn(kArrayHBDVariance_avx2));
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_avx2[] = { // rows: (a, b, kernel, bits); only 10-bit kernels from 128x128 down to 8x8, with no CONFIG_REALTIME_ONLY-guarded rows
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_avx2, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_avx2, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_avx2, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_avx2, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_avx2, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_avx2, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_avx2, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_avx2, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_avx2, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_avx2, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_avx2, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_avx2, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_avx2, 10),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDSubpelVarianceTest, // registers every row of the table above under the "AVX2" prefix
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_avx2));
+#endif // HAVE_AVX2
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { // args follow the kernel name: log2(w), log2(h), fn, bit depth
+ SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12),
+ SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12),
+ SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_sse2, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_sse2, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_sse2, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_sse2, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_sse2, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12),
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_sse2, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_sse2, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_sse2, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_sse2, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_sse2, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10),
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_sse2, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_sse2, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_sse2, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_sse2, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_sse2, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; the 4x16 entries stay commented out
+ SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12),
+ SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12),
+ SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12),
+ SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_sse2, 12),
+ SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_sse2, 12),
+ // SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_sse2, 12),
+ SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_sse2, 10),
+ SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_sse2, 10),
+ SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_sse2, 10),
+ SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_sse2, 10),
+ SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_sse2, 10),
+ // SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_sse2, 10),
+ SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_sse2, 8),
+ SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_sse2, 8),
+ SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8),
+ SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8),
+ SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8),
+// SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest, // registers every table entry under the SSE2 prefix
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = { // args follow the kernel name: log2(w), log2(h), fn, bit depth
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2,
+ 8),
+
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes; the 4x16 entries stay commented out
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_sse2,
+ 12),
+ // SubpelAvgVarianceParams(2, 4,
+ // &aom_highbd_12_sub_pixel_avg_variance4x16_sse2, 12),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_sse2,
+ 10),
+ // SubpelAvgVarianceParams(2, 4,
+ // &aom_highbd_10_sub_pixel_avg_variance4x16_sse2, 10),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2,
+ 8),
+// SubpelAvgVarianceParams(2, 4,
+// &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest, // registers every table entry under the SSE2 prefix
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
+#endif // HAVE_SSE2
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if HAVE_SSSE3
+const SubpelVarianceParams kArraySubpelVariance_ssse3[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0),
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0),
+ SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0),
+ SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0),
+ SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelVarianceTest, // registers every table entry under the SSSE3 prefix
+ ::testing::ValuesIn(kArraySubpelVariance_ssse3));
+
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_ssse3[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3, 0),
+ SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3, 0),
+ SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0),
+ SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0),
+ SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(SSSE3, AvxSubpelAvgVarianceTest, // registers every table entry under the SSSE3 prefix
+ ::testing::ValuesIn(kArraySubpelAvgVariance_ssse3));
+
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_ssse3[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P( // registers every table entry under the SSSE3 prefix
+ SSSE3, AvxDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_ssse3));
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+#if !CONFIG_REALTIME_ONLY
+// SSE4.1 OBMC sub-pixel variance kernels, held in a named table and fed to
+// the suite through ValuesIn. Entry order is unchanged, so the generated
+// parameter indices stay the same. Each entry's first two arguments follow
+// the kernel name as log2(width), log2(height); the last argument is 0.
+const ObmcSubpelVarianceParams kArrayObmcSubpelVariance_sse4_1[] = {
+ ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_sse4_1, 0),
+ ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1, 0),
+ ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1, 0),
+ ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1, 0),
+ ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1, 0),
+ ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1, 0),
+ ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1, 0),
+ ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1, 0),
+ ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1, 0),
+ ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1, 0),
+ ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1, 0),
+ ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1, 0),
+ ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1, 0),
+ ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1, 0),
+ ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_sse4_1, 0),
+ ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_sse4_1, 0),
+ ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_sse4_1, 0),
+ ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1, 0),
+};
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AvxObmcSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayObmcSubpelVariance_sse4_1));
+#endif
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P( // one AVX2 kernel registered with four (3|2, 3|2) size pairs; presumably log2 block dims - confirm
+ AVX2, MseWxHTest,
+ ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_avx2, 8),
+ MseWxHParams(3, 2, &aom_mse_wxh_16bit_avx2, 8),
+ MseWxHParams(2, 3, &aom_mse_wxh_16bit_avx2, 8),
+ MseWxHParams(2, 2, &aom_mse_wxh_16bit_avx2, 8)));
+
+INSTANTIATE_TEST_SUITE_P( // one AVX2 kernel registered with four (3|2, 3|2) size pairs; presumably log2 block dims - confirm
+ AVX2, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_avx2, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_avx2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest, // only the 16x16 kernel is registered for AVX2 here
+ ::testing::Values(MseParams(4, 4,
+ &aom_mse16x16_avx2)));
+
+const VarianceParams kArrayVariance_avx2[] = { // args follow the kernel name: log2(w), log2(h), fn
+ VarianceParams(7, 7, &aom_variance128x128_avx2),
+ VarianceParams(7, 6, &aom_variance128x64_avx2),
+ VarianceParams(6, 7, &aom_variance64x128_avx2),
+ VarianceParams(6, 6, &aom_variance64x64_avx2),
+ VarianceParams(6, 5, &aom_variance64x32_avx2),
+ VarianceParams(5, 6, &aom_variance32x64_avx2),
+ VarianceParams(5, 5, &aom_variance32x32_avx2),
+ VarianceParams(5, 4, &aom_variance32x16_avx2),
+ VarianceParams(4, 5, &aom_variance16x32_avx2),
+ VarianceParams(4, 4, &aom_variance16x16_avx2),
+ VarianceParams(4, 3, &aom_variance16x8_avx2),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes (only widths >= 16 listed)
+ VarianceParams(6, 4, &aom_variance64x16_avx2),
+ VarianceParams(4, 6, &aom_variance16x64_avx2),
+ VarianceParams(5, 3, &aom_variance32x8_avx2),
+ VarianceParams(4, 2, &aom_variance16x4_avx2),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxVarianceTest, // registers every table entry under the AVX2 prefix
+ ::testing::ValuesIn(kArrayVariance_avx2));
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_avx2[] = { // same kernel at four size pairs; presumably log2 dims of the tested region - confirm
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_avx2, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_avx2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum8x8QuadTest, // registers every table entry under the AVX2 prefix
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_avx2));
+
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_avx2[] = { // same kernel at four size pairs; presumably log2 dims of the tested region - confirm
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_avx2, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_avx2, 0)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, GetSseSum16x16DualTest, // registers every table entry under the AVX2 prefix
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_avx2));
+
+const SubpelVarianceParams kArraySubpelVariance_avx2[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+#if !CONFIG_REALTIME_ONLY // only the 16-wide 1:4 / 4:1 sizes are listed here
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxSubpelVarianceTest, // registers every table entry under the AVX2 prefix
+ ::testing::ValuesIn(kArraySubpelVariance_avx2));
+
+// AVX2 sub-pixel average-variance kernels, held in a named table and fed to
+// the suite through ValuesIn. Entry order is unchanged, so the generated
+// parameter indices stay the same. Only block widths of 32 and above are
+// listed; the last argument is 0 for every entry.
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_avx2[] = {
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2, 0),
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, AvxSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArraySubpelAvgVariance_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P( // one NEON kernel registered with four (3|2, 3|2) size pairs; presumably log2 block dims - confirm
+ NEON, MseWxHTest,
+ ::testing::Values(MseWxHParams(3, 3, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(3, 2, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(2, 3, &aom_mse_wxh_16bit_neon, 8),
+ MseWxHParams(2, 2, &aom_mse_wxh_16bit_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P( // one NEON kernel registered with four (3|2, 3|2) size pairs; presumably log2 block dims - confirm
+ NEON, Mse16xHTest,
+ ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(3, 2, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(2, 3, &aom_mse_16xh_16bit_neon, 8),
+ Mse16xHParams(2, 2, &aom_mse_16xh_16bit_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, SumOfSquaresTest, // single parameter: the NEON kernel itself
+ ::testing::Values(aom_get_mb_ss_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest, // args follow the kernel name: log2(w), log2(h), fn
+ ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon),
+ MseParams(3, 4, &aom_mse8x16_neon),
+ MseParams(4, 4, &aom_mse16x16_neon),
+ MseParams(4, 3, &aom_mse16x8_neon)));
+
+// NEON variance kernels. The first two arguments of each entry follow the
+// kernel name as log2(width), log2(height). Every block size is listed exactly
+// once: a second 64x64 entry would only run the same kernel again under
+// another parameter index.
+const VarianceParams kArrayVariance_neon[] = {
+ VarianceParams(7, 7, &aom_variance128x128_neon),
+ VarianceParams(7, 6, &aom_variance128x64_neon),
+ VarianceParams(6, 7, &aom_variance64x128_neon),
+ VarianceParams(6, 6, &aom_variance64x64_neon),
+ VarianceParams(6, 5, &aom_variance64x32_neon),
+ VarianceParams(5, 6, &aom_variance32x64_neon),
+ VarianceParams(5, 5, &aom_variance32x32_neon),
+ VarianceParams(5, 4, &aom_variance32x16_neon),
+ VarianceParams(4, 5, &aom_variance16x32_neon),
+ VarianceParams(4, 4, &aom_variance16x16_neon),
+ VarianceParams(4, 3, &aom_variance16x8_neon),
+ VarianceParams(3, 4, &aom_variance8x16_neon),
+ VarianceParams(3, 3, &aom_variance8x8_neon),
+ VarianceParams(3, 2, &aom_variance8x4_neon),
+ VarianceParams(2, 3, &aom_variance4x8_neon),
+ VarianceParams(2, 2, &aom_variance4x4_neon),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ VarianceParams(2, 4, &aom_variance4x16_neon),
+ VarianceParams(4, 2, &aom_variance16x4_neon),
+ VarianceParams(3, 5, &aom_variance8x32_neon),
+ VarianceParams(5, 3, &aom_variance32x8_neon),
+ VarianceParams(4, 6, &aom_variance16x64_neon),
+ VarianceParams(6, 4, &aom_variance64x16_neon),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxVarianceTest,
+ ::testing::ValuesIn(kArrayVariance_neon));
+
+const SubpelVarianceParams kArraySubpelVariance_neon[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon, 0),
+ SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon, 0),
+ SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon, 0),
+ SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon, 0),
+ SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon, 0),
+ SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelVarianceTest, // registers every table entry under the NEON prefix
+ ::testing::ValuesIn(kArraySubpelVariance_neon));
+
+const SubpelAvgVarianceParams kArraySubpelAvgVariance_neon[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_neon, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_neon, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_neon, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_neon, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_neon, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_neon, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_neon, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_neon, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_neon, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_neon, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_neon, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_neon, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_neon, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_neon, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_neon, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_neon, 0),
+ SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_neon, 0),
+ SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_neon, 0),
+ SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_neon, 0),
+ SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_neon, 0),
+ SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_neon, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelAvgVarianceTest, // registers every table entry under the NEON prefix
+ ::testing::ValuesIn(kArraySubpelAvgVariance_neon));
+
+const DistWtdSubpelAvgVarianceParams kArrayDistWtdSubpelAvgVariance_neon[] = { // args follow the kernel name: log2(w), log2(h), fn; no 128-wide/tall entries in this table
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_neon, 0),
+#if !CONFIG_REALTIME_ONLY // 4:1 and 1:4 block sizes
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_neon, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_neon, 0),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P( // registers every table entry under the NEON prefix
+ NEON, AvxDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayDistWtdSubpelAvgVariance_neon));
+
+#if !CONFIG_REALTIME_ONLY
+const ObmcSubpelVarianceParams kArrayObmcSubpelVariance_neon[] = { // args follow the kernel name: log2(w), log2(h), fn; last arg is 0 throughout
+ ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_neon, 0),
+ ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_neon, 0),
+ ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_neon, 0),
+ ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_neon, 0),
+ ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_neon, 0),
+ ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_neon, 0),
+ ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_neon, 0),
+ ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_neon, 0),
+ ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_neon, 0),
+ ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_neon, 0),
+ ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_neon, 0),
+ ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_neon, 0),
+ ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_neon, 0),
+ ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_neon, 0),
+ ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_neon, 0),
+ ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_neon, 0),
+ ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_neon, 0), // 4:1 and 1:4 sizes from here on
+ ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_neon, 0),
+ ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_neon, 0),
+ ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_neon, 0),
+ ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_neon, 0),
+ ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_neon, 0),
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxObmcSubpelVarianceTest, // registers every table entry under the NEON prefix
+ ::testing::ValuesIn(kArrayObmcSubpelVariance_neon));
+#endif
+
+const GetSseSumParams kArrayGetSseSum8x8Quad_neon[] = { // same kernel at four size pairs; presumably log2 dims of the tested region - confirm
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_neon, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_neon, 0)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum8x8QuadTest, // registers every table entry under the NEON prefix
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon));
+
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_neon[] = { // same kernel at four size pairs; presumably log2 dims of the tested region - confirm
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_neon, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_neon, 0)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, GetSseSum16x16DualTest, // registers every table entry under the NEON prefix
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// NEON high-bitdepth variance kernels for AvxHBDVarianceTest. In every entry
+// the first two arguments equal log2(width) and log2(height) of the block size
+// in the kernel's name, and the last argument equals the bit depth (12/10/8)
+// in its name. The 4:1 and 1:4 block sizes are listed only when
+// CONFIG_REALTIME_ONLY is off.
+const VarianceParams kArrayHBDVariance_neon[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_neon, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_neon, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_neon, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_neon, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_neon, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_neon, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_neon, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_neon, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_neon, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_neon, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_neon, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_neon, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_neon, 12),
+ VarianceParams(3, 2, &aom_highbd_12_variance8x4_neon, 12),
+ VarianceParams(2, 3, &aom_highbd_12_variance4x8_neon, 12),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_neon, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_neon, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_neon, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_neon, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_neon, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_neon, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_neon, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_neon, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_neon, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_neon, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_neon, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_neon, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_neon, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_neon, 10),
+ VarianceParams(3, 2, &aom_highbd_10_variance8x4_neon, 10),
+ VarianceParams(2, 3, &aom_highbd_10_variance4x8_neon, 10),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_neon, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_neon, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_neon, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_neon, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_neon, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_neon, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_neon, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_neon, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_neon, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_neon, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_neon, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_neon, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_neon, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_neon, 8),
+ VarianceParams(3, 2, &aom_highbd_8_variance8x4_neon, 8),
+ VarianceParams(2, 3, &aom_highbd_8_variance4x8_neon, 8),
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_neon, 8),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(6, 4, &aom_highbd_12_variance64x16_neon, 12),
+ VarianceParams(4, 6, &aom_highbd_12_variance16x64_neon, 12),
+ VarianceParams(5, 3, &aom_highbd_12_variance32x8_neon, 12),
+ VarianceParams(3, 5, &aom_highbd_12_variance8x32_neon, 12),
+ VarianceParams(4, 2, &aom_highbd_12_variance16x4_neon, 12),
+ VarianceParams(2, 4, &aom_highbd_12_variance4x16_neon, 12),
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_neon, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_neon, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_neon, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_neon, 10),
+ VarianceParams(4, 2, &aom_highbd_10_variance16x4_neon, 10),
+ VarianceParams(2, 4, &aom_highbd_10_variance4x16_neon, 10),
+ VarianceParams(6, 4, &aom_highbd_8_variance64x16_neon, 8),
+ VarianceParams(4, 6, &aom_highbd_8_variance16x64_neon, 8),
+ VarianceParams(5, 3, &aom_highbd_8_variance32x8_neon, 8),
+ VarianceParams(3, 5, &aom_highbd_8_variance8x32_neon, 8),
+ VarianceParams(4, 2, &aom_highbd_8_variance16x4_neon, 8),
+ VarianceParams(2, 4, &aom_highbd_8_variance4x16_neon, 8),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_neon));
+
+// NEON high-bitdepth sub-pixel variance kernels for AvxHBDSubpelVarianceTest.
+// In every entry the first two arguments equal log2(width) and log2(height) of
+// the block size in the kernel's name, and the last argument equals the bit
+// depth (12/10/8) in its name. The 4:1 and 1:4 block sizes are listed only
+// when CONFIG_REALTIME_ONLY is off.
+const SubpelVarianceParams kArrayHBDSubpelVariance_neon[] = {
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_neon, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_neon, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_neon, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_neon, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_neon, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_neon, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_neon, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_neon, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_neon, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_neon, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_neon, 12),
+ SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_neon, 12),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_neon, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_neon, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_neon, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_neon, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_neon, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_neon, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_neon, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_neon, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_neon, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_neon, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_neon, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_neon, 10),
+ SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_neon, 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_neon, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_neon, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_neon, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_neon, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_neon, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_neon, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_neon, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_neon, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_neon, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_neon, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_neon, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_neon, 8),
+ SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_neon, 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_neon, 8),
+#if !CONFIG_REALTIME_ONLY
+ SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_neon, 8),
+ SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_neon, 8),
+ SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_neon, 8),
+ SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_neon, 8),
+ SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_neon, 8),
+ SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_neon, 8),
+ SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_neon, 10),
+ SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_neon, 10),
+ SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_neon, 10),
+ SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_neon, 10),
+ SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_neon, 10),
+ SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_neon, 10),
+ SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_neon, 12),
+ SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_neon, 12),
+ SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_neon, 12),
+ SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_neon, 12),
+ SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_neon, 12),
+ SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_neon, 12),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_neon));
+
+// NEON high-bitdepth sub-pixel averaging variance kernels for
+// AvxHBDSubpelAvgVarianceTest. In every entry the first two arguments equal
+// log2(width) and log2(height) of the block size in the kernel's name, and the
+// last argument equals the bit depth (8/10/12) in its name. The 4:1 and 1:4
+// block sizes are listed only when CONFIG_REALTIME_ONLY is off.
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_neon[] = {
+ SubpelAvgVarianceParams(7, 7,
+ &aom_highbd_8_sub_pixel_avg_variance128x128_neon, 8),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_neon,
+ 8),
+ SubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_neon, 10),
+ SubpelAvgVarianceParams(7, 6,
+ &aom_highbd_10_sub_pixel_avg_variance128x64_neon, 10),
+ SubpelAvgVarianceParams(6, 7,
+ &aom_highbd_10_sub_pixel_avg_variance64x128_neon, 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_neon,
+ 10),
+ SubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_neon, 12),
+ SubpelAvgVarianceParams(7, 6,
+ &aom_highbd_12_sub_pixel_avg_variance128x64_neon, 12),
+ SubpelAvgVarianceParams(6, 7,
+ &aom_highbd_12_sub_pixel_avg_variance64x128_neon, 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_neon,
+ 12),
+
+#if !CONFIG_REALTIME_ONLY
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_neon,
+ 8),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_neon,
+ 10),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_neon,
+ 12),
+ SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_neon,
+ 12),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_neon));
+
+// NEON high-bitdepth distance-weighted sub-pixel averaging variance kernels
+// for AvxHBDDistWtdSubpelAvgVarianceTest. In every entry the first two
+// arguments equal log2(width) and log2(height) of the block size in the
+// kernel's name, and the last argument equals the bit depth (8/10/12) in its
+// name. The 4:1 and 1:4 block sizes are listed only when CONFIG_REALTIME_ONLY
+// is off.
+const DistWtdSubpelAvgVarianceParams
+ kArrayHBDDistWtdSubpelAvgVariance_neon[] = {
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 7, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 7, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4_neon, 12),
+
+#if !CONFIG_REALTIME_ONLY
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16_neon, 8),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16_neon, 10),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 6, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 3, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 5, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 2, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4_neon, 12),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 4, &aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16_neon, 12),
+#endif // !CONFIG_REALTIME_ONLY
+ };
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AvxHBDDistWtdSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDDistWtdSubpelAvgVariance_neon));
+
+#if !CONFIG_REALTIME_ONLY
+// NEON high-bitdepth OBMC sub-pixel variance kernels for
+// AvxHBDObmcSubpelVarianceTest. In every entry the first two arguments equal
+// log2(width) and log2(height) of the block size in the kernel's name, and the
+// last argument equals the bit depth (12/10/8) in its name. For each bit depth
+// the 4:1 and 1:4 block sizes follow the regular ones.
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_neon[] = {
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_12_obmc_sub_pixel_variance128x128_neon, 12),
+ ObmcSubpelVarianceParams(
+ 7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_neon, 12),
+ ObmcSubpelVarianceParams(4, 3,
+ &aom_highbd_12_obmc_sub_pixel_variance16x8_neon, 12),
+ ObmcSubpelVarianceParams(3, 4,
+ &aom_highbd_12_obmc_sub_pixel_variance8x16_neon, 12),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_neon,
+ 12),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_neon,
+ 12),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_neon,
+ 12),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_neon,
+ 12),
+ ObmcSubpelVarianceParams(
+ 6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 4, 6, &aom_highbd_12_obmc_sub_pixel_variance16x64_neon, 12),
+ ObmcSubpelVarianceParams(5, 3,
+ &aom_highbd_12_obmc_sub_pixel_variance32x8_neon, 12),
+ ObmcSubpelVarianceParams(3, 5,
+ &aom_highbd_12_obmc_sub_pixel_variance8x32_neon, 12),
+ ObmcSubpelVarianceParams(4, 2,
+ &aom_highbd_12_obmc_sub_pixel_variance16x4_neon, 12),
+ ObmcSubpelVarianceParams(2, 4,
+ &aom_highbd_12_obmc_sub_pixel_variance4x16_neon, 12),
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_10_obmc_sub_pixel_variance128x128_neon, 10),
+ ObmcSubpelVarianceParams(
+ 7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_neon, 10),
+ ObmcSubpelVarianceParams(4, 3,
+ &aom_highbd_10_obmc_sub_pixel_variance16x8_neon, 10),
+ ObmcSubpelVarianceParams(3, 4,
+ &aom_highbd_10_obmc_sub_pixel_variance8x16_neon, 10),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_neon,
+ 10),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_neon,
+ 10),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_neon,
+ 10),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_neon,
+ 10),
+ ObmcSubpelVarianceParams(
+ 6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_neon, 10),
+ ObmcSubpelVarianceParams(5, 3,
+ &aom_highbd_10_obmc_sub_pixel_variance32x8_neon, 10),
+ ObmcSubpelVarianceParams(3, 5,
+ &aom_highbd_10_obmc_sub_pixel_variance8x32_neon, 10),
+ ObmcSubpelVarianceParams(4, 2,
+ &aom_highbd_10_obmc_sub_pixel_variance16x4_neon, 10),
+ ObmcSubpelVarianceParams(2, 4,
+ &aom_highbd_10_obmc_sub_pixel_variance4x16_neon, 10),
+ ObmcSubpelVarianceParams(
+ 7, 7, &aom_highbd_8_obmc_sub_pixel_variance128x128_neon, 8),
+ ObmcSubpelVarianceParams(7, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance128x64_neon, 8),
+ ObmcSubpelVarianceParams(6, 7,
+ &aom_highbd_8_obmc_sub_pixel_variance64x128_neon, 8),
+ ObmcSubpelVarianceParams(6, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance64x64_neon, 8),
+ ObmcSubpelVarianceParams(6, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance64x32_neon, 8),
+ ObmcSubpelVarianceParams(5, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance32x64_neon, 8),
+ ObmcSubpelVarianceParams(5, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance32x32_neon, 8),
+ ObmcSubpelVarianceParams(5, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance32x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 5,
+ &aom_highbd_8_obmc_sub_pixel_variance16x32_neon, 8),
+ ObmcSubpelVarianceParams(4, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance16x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_8_obmc_sub_pixel_variance16x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_8_obmc_sub_pixel_variance8x16_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_8_obmc_sub_pixel_variance8x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_8_obmc_sub_pixel_variance8x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_8_obmc_sub_pixel_variance4x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_8_obmc_sub_pixel_variance4x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(6, 4,
+ &aom_highbd_8_obmc_sub_pixel_variance64x16_neon, 8),
+ ObmcSubpelVarianceParams(4, 6,
+ &aom_highbd_8_obmc_sub_pixel_variance16x64_neon, 8),
+ ObmcSubpelVarianceParams(5, 3, &aom_highbd_8_obmc_sub_pixel_variance32x8_neon,
+ 8),
+ ObmcSubpelVarianceParams(3, 5, &aom_highbd_8_obmc_sub_pixel_variance8x32_neon,
+ 8),
+ ObmcSubpelVarian­ceParams(4, 2, &aom_highbd_8_obmc_sub_pixel_variance16x4_neon,
+ 8),
+ ObmcSubpelVarianceParams(2, 4, &aom_highbd_8_obmc_sub_pixel_variance4x16_neon,
+ 8),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxHBDObmcSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_neon));
+#endif // !CONFIG_REALTIME_ONLY
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+
+// NEON dot-product variance kernels for AvxVarianceTest. In every entry the
+// two integers equal log2(width) and log2(height) of the block size in the
+// kernel's name; each block size is listed exactly once.
+const VarianceParams kArrayVariance_neon_dotprod[] = {
+ VarianceParams(7, 7, &aom_variance128x128_neon_dotprod),
+ VarianceParams(7, 6, &aom_variance128x64_neon_dotprod),
+ VarianceParams(6, 7, &aom_variance64x128_neon_dotprod),
+ VarianceParams(6, 6, &aom_variance64x64_neon_dotprod),
+ VarianceParams(6, 5, &aom_variance64x32_neon_dotprod),
+ VarianceParams(5, 6, &aom_variance32x64_neon_dotprod),
+ VarianceParams(5, 5, &aom_variance32x32_neon_dotprod),
+ VarianceParams(5, 4, &aom_variance32x16_neon_dotprod),
+ VarianceParams(4, 5, &aom_variance16x32_neon_dotprod),
+ VarianceParams(4, 4, &aom_variance16x16_neon_dotprod),
+ VarianceParams(4, 3, &aom_variance16x8_neon_dotprod),
+ VarianceParams(3, 4, &aom_variance8x16_neon_dotprod),
+ VarianceParams(3, 3, &aom_variance8x8_neon_dotprod),
+ VarianceParams(3, 2, &aom_variance8x4_neon_dotprod),
+ VarianceParams(2, 3, &aom_variance4x8_neon_dotprod),
+ VarianceParams(2, 2, &aom_variance4x4_neon_dotprod),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(2, 4, &aom_variance4x16_neon_dotprod),
+ VarianceParams(4, 2, &aom_variance16x4_neon_dotprod),
+ VarianceParams(3, 5, &aom_variance8x32_neon_dotprod),
+ VarianceParams(5, 3, &aom_variance32x8_neon_dotprod),
+ VarianceParams(4, 6, &aom_variance16x64_neon_dotprod),
+ VarianceParams(6, 4, &aom_variance64x16_neon_dotprod),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AvxVarianceTest,
+ ::testing::ValuesIn(kArrayVariance_neon_dotprod));
+
+// NEON dot-product parameter table for GetSseSum8x8QuadTest. Every entry
+// exercises the same aom_get_var_sse_sum_8x8_quad_neon_dotprod kernel; the
+// leading pair of integers presumably gives log2(width)/log2(height) of the
+// tested area and the trailing 0 the bit depth -- TODO confirm against
+// GetSseSumParams.
+const GetSseSumParams kArrayGetSseSum8x8Quad_neon_dotprod[] = {
+ GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(5, 5, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
+ GetSseSumParams(5, 4, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, GetSseSum8x8QuadTest,
+ ::testing::ValuesIn(kArrayGetSseSum8x8Quad_neon_dotprod));
+
+// NEON dot-product parameter table for GetSseSum16x16DualTest. Every entry
+// exercises the same aom_get_var_sse_sum_16x16_dual_neon_dotprod kernel; the
+// leading pair of integers presumably gives log2(width)/log2(height) of the
+// tested area and the trailing 0 the bit depth -- TODO confirm against
+// GetSseSumParamsDual.
+const GetSseSumParamsDual kArrayGetSseSum16x16Dual_neon_dotprod[] = {
+ GetSseSumParamsDual(7, 7, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(6, 6, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(5, 5, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0),
+ GetSseSumParamsDual(5, 4, &aom_get_var_sse_sum_16x16_dual_neon_dotprod, 0)
+};
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, GetSseSum16x16DualTest,
+ ::testing::ValuesIn(kArrayGetSseSum16x16Dual_neon_dotprod));
+
+// NEON dot-product MSE kernels for AvxMseTest (8x8, 8x16, 16x16, 16x8). The
+// two integers in each entry equal log2(width) and log2(height) of the block
+// size in the kernel's name.
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, AvxMseTest,
+ ::testing::Values(MseParams(3, 3, &aom_mse8x8_neon_dotprod),
+ MseParams(3, 4, &aom_mse8x16_neon_dotprod),
+ MseParams(4, 4, &aom_mse16x16_neon_dotprod),
+ MseParams(4, 3, &aom_mse16x8_neon_dotprod)));
+
+#endif // HAVE_NEON_DOTPROD
+
+#if HAVE_SVE
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// SVE high-bitdepth variance kernels for AvxHBDVarianceTest. In every entry
+// the first two arguments equal log2(width) and log2(height) of the block size
+// in the kernel's name, and the last argument equals the bit depth (12/10/8)
+// in its name. The 4:1 and 1:4 block sizes are listed only when
+// CONFIG_REALTIME_ONLY is off.
+const VarianceParams kArrayHBDVariance_sve[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_sve, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_sve, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_sve, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_sve, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_sve, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_sve, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_sve, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_sve, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_sve, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_sve, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_sve, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_sve, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_sve, 12),
+ VarianceParams(3, 2, &aom_highbd_12_variance8x4_sve, 12),
+ VarianceParams(2, 3, &aom_highbd_12_variance4x8_sve, 12),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_sve, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_sve, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_sve, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_sve, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_sve, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_sve, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_sve, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_sve, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_sve, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_sve, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_sve, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_sve, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_sve, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_sve, 10),
+ VarianceParams(3, 2, &aom_highbd_10_variance8x4_sve, 10),
+ VarianceParams(2, 3, &aom_highbd_10_variance4x8_sve, 10),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_sve, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_sve, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_sve, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_sve, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_sve, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_sve, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_sve, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_sve, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_sve, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_sve, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_sve, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_sve, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_sve, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_sve, 8),
+ VarianceParams(3, 2, &aom_highbd_8_variance8x4_sve, 8),
+ VarianceParams(2, 3, &aom_highbd_8_variance4x8_sve, 8),
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_sve, 8),
+#if !CONFIG_REALTIME_ONLY
+ VarianceParams(6, 4, &aom_highbd_12_variance64x16_sve, 12),
+ VarianceParams(4, 6, &aom_highbd_12_variance16x64_sve, 12),
+ VarianceParams(5, 3, &aom_highbd_12_variance32x8_sve, 12),
+ VarianceParams(3, 5, &aom_highbd_12_variance8x32_sve, 12),
+ VarianceParams(4, 2, &aom_highbd_12_variance16x4_sve, 12),
+ VarianceParams(2, 4, &aom_highbd_12_variance4x16_sve, 12),
+ VarianceParams(6, 4, &aom_highbd_10_variance64x16_sve, 10),
+ VarianceParams(4, 6, &aom_highbd_10_variance16x64_sve, 10),
+ VarianceParams(5, 3, &aom_highbd_10_variance32x8_sve, 10),
+ VarianceParams(3, 5, &aom_highbd_10_variance8x32_sve, 10),
+ VarianceParams(4, 2, &aom_highbd_10_variance16x4_sve, 10),
+ VarianceParams(2, 4, &aom_highbd_10_variance4x16_sve, 10),
+ VarianceParams(6, 4, &aom_highbd_8_variance64x16_sve, 8),
+ VarianceParams(4, 6, &aom_highbd_8_variance16x64_sve, 8),
+ VarianceParams(5, 3, &aom_highbd_8_variance32x8_sve, 8),
+ VarianceParams(3, 5, &aom_highbd_8_variance8x32_sve, 8),
+ VarianceParams(4, 2, &aom_highbd_8_variance16x4_sve, 8),
+ VarianceParams(2, 4, &aom_highbd_8_variance4x16_sve, 8),
+#endif // !CONFIG_REALTIME_ONLY
+};
+
+INSTANTIATE_TEST_SUITE_P(SVE, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_sve));
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SVE
+
+} // namespace
diff --git a/third_party/aom/test/video_source.h b/third_party/aom/test/video_source.h
new file mode 100644
index 0000000000..9d73d7b253
--- /dev/null
+++ b/third_party/aom/test/video_source.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_VIDEO_SOURCE_H_
+#define AOM_TEST_VIDEO_SOURCE_H_
+
+#if defined(_WIN32)
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "aom/aom_encoder.h"
+#include "test/acm_random.h"
+#if !defined(_WIN32)
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#endif
+
+namespace libaom_test {
+
+// Helper macros to ensure LIBAOM_TEST_DATA_PATH is a quoted string.
+// These are undefined right below GetDataPath
+// NOTE: LIBAOM_TEST_DATA_PATH MUST NOT be a quoted string before
+// Stringification or the GetDataPath will fail at runtime
+#define TO_STRING(S) #S
+#define STRINGIFY(S) TO_STRING(S)
+
+// A simple function to encapsulate cross platform retrieval of test data path
+// Returns the directory that holds the test vectors. Lookup order:
+//   1. the LIBAOM_TEST_DATA_PATH environment variable, when set;
+//   2. the LIBAOM_TEST_DATA_PATH preprocessor symbol (stringified), when the
+//      build defines it;
+//   3. the current directory, ".".
+static std::string GetDataPath() {
+  if (const char *const env_path = getenv("LIBAOM_TEST_DATA_PATH")) {
+    return env_path;
+  }
+#ifdef LIBAOM_TEST_DATA_PATH
+  // Some environments cannot set environment variables, so the build system
+  // may bake the path in through a preprocessor symbol instead.
+  return STRINGIFY(LIBAOM_TEST_DATA_PATH);
+#else
+  return ".";
+#endif
+}
+
+// Undefining stringification macros because they are not used elsewhere
+#undef TO_STRING
+#undef STRINGIFY
+
+// Opens `file_name`, looked up inside GetDataPath(), for binary reading.
+// Returns the fopen() result unchanged, so nullptr when the open fails.
+inline FILE *OpenTestDataFile(const std::string &file_name) {
+  std::string full_path = GetDataPath();
+  full_path += "/";
+  full_path += file_name;
+  return fopen(full_path.c_str(), "rb");
+}
+
+// Creates a uniquely named temporary file and opens it in "wb+" mode.
+// On return *file_name holds the generated path, or is empty when no file was
+// created. Returns nullptr when the temp directory or file cannot be obtained;
+// the final fopen()/fdopen() result is returned unchanged.
+static FILE *GetTempOutFile(std::string *file_name) {
+  file_name->clear();
+#if defined(_WIN32)
+  char tmppath[MAX_PATH];
+  char fname[MAX_PATH];
+  if (!GetTempPathA(MAX_PATH, tmppath)) return nullptr;
+  // Assume for now that the filename generated is unique per process
+  if (!GetTempFileNameA(tmppath, "lvx", 0, fname)) return nullptr;
+  file_name->assign(fname);
+  return fopen(fname, "wb+");
+#else
+  std::string temp_path = testing::TempDir();
+  if (temp_path.empty()) return nullptr;
+  // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may
+  // use the value of an environment variable without checking for a trailing
+  // path delimiter.
+  if (temp_path.back() != '/') temp_path += '/';
+  // mkstemp() rewrites the trailing XXXXXX in place; the string's own buffer
+  // is contiguous, writable and NUL-terminated, so it can serve as template.
+  temp_path += "libaomtest.XXXXXX";
+  const int fd = mkstemp(&temp_path[0]);
+  if (fd == -1) return nullptr;
+  *file_name = temp_path;
+  return fdopen(fd, "wb+");
+#endif
+}
+
+// Owns a temporary output file obtained from GetTempOutFile(): the file is
+// closed and removed from disk when the object is destroyed.
+// NOTE(review): the class does not disable copying; a copy would close and
+// remove the same file a second time.
+class TempOutFile {
+ public:
+ // file_ is assigned in the body (not in an initializer list) because
+ // file_name_ is declared after file_ and must be constructed first.
+ TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+ ~TempOutFile() {
+ CloseFile();
+ // An empty name means GetTempOutFile() never created a file.
+ if (!file_name_.empty()) {
+ EXPECT_EQ(0, remove(file_name_.c_str()));
+ }
+ }
+ // Open stream, or nullptr if creation failed or CloseFile() was called.
+ FILE *file() { return file_; }
+ // Path of the temporary file; empty when none was created.
+ const std::string &file_name() { return file_name_; }
+
+ protected:
+ // Closes the stream if it is open; safe to call more than once.
+ void CloseFile() {
+ if (file_) {
+ fclose(file_);
+ file_ = nullptr;
+ }
+ }
+ FILE *file_;
+ std::string file_name_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// aom_image_t images with associated timestamps and duration.
+// Pure interface: every member is pure virtual, so implementations must
+// provide all of them.
+class VideoSource {
+ public:
+ virtual ~VideoSource() = default;
+
+ // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0;
+
+ // Advance the cursor to the next frame
+ virtual void Next() = 0;
+
+ // Get the current video frame, or nullptr on End-Of-Stream.
+ virtual aom_image_t *img() const = 0;
+
+ // Get the presentation timestamp of the current frame.
+ virtual aom_codec_pts_t pts() const = 0;
+
+ // Get the current frame's duration
+ virtual unsigned long duration() const = 0;
+
+ // Get the timebase for the stream
+ virtual aom_rational_t timebase() const = 0;
+
+ // Get the current frame counter, starting at 0.
+ virtual unsigned int frame() const = 0;
+
+ // Get the current file limit.
+ // (DummyVideoSource treats this as a frame count: its img() returns nullptr
+ // once frame() reaches limit().)
+ virtual unsigned int limit() const = 0;
+};
+
+// Video source that serves up to `limit_` all-zero frames of a configurable
+// size and pixel format. Subclasses override FillFrame() to supply content.
+class DummyVideoSource : public VideoSource {
+ public:
+  // Starts as an 80x64 I420 source limited to 100 frames. Every member gets a
+  // defined value (listed in declaration order), so img(), pts() and frame()
+  // never read an indeterminate frame_ when called before Begin(), and
+  // raw_sz_ is well defined even if the image allocation fails.
+  DummyVideoSource()
+      : img_(nullptr), raw_sz_(0), limit_(100), frame_(0), width_(80),
+        height_(64), format_(AOM_IMG_FMT_I420) {
+    ReallocImage();
+  }
+
+  ~DummyVideoSource() override { aom_img_free(img_); }
+
+  // Rewinds to frame 0 and fills the image for it.
+  void Begin() override {
+    frame_ = 0;
+    FillFrame();
+  }
+
+  // Moves to the next frame and fills the image for it.
+  void Next() override {
+    ++frame_;
+    FillFrame();
+  }
+
+  // Returns the image buffer, or nullptr once `limit_` frames were served.
+  aom_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  aom_codec_pts_t pts() const override { return frame_; }
+
+  unsigned long duration() const override { return 1; }
+
+  aom_rational_t timebase() const override {
+    const aom_rational_t t = { 1, 30 };
+    return t;
+  }
+
+  unsigned int frame() const override { return frame_; }
+
+  unsigned int limit() const override { return limit_; }
+
+  void set_limit(unsigned int limit) { limit_ = limit; }
+
+  // Changes the frame dimensions; the image is reallocated only when the
+  // size actually changes.
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      width_ = width;
+      height_ = height;
+      ReallocImage();
+    }
+  }
+
+  // Changes the pixel format; the image is reallocated only when the format
+  // actually changes.
+  void SetImageFormat(aom_img_fmt_t format) {
+    if (format_ != format) {
+      format_ = format;
+      ReallocImage();
+    }
+  }
+
+ protected:
+  // Writes the content of the current frame; the default is all zeros.
+  virtual void FillFrame() {
+    if (img_) memset(img_->img_data, 0, raw_sz_);
+  }
+
+  // Frees the current image and allocates a new one for the current
+  // width/height/format, then recomputes raw_sz_.
+  void ReallocImage() {
+    aom_img_free(img_);
+    img_ = aom_img_alloc(nullptr, format_, width_, height_, 32);
+    ASSERT_NE(img_, nullptr);
+    // Width rounded up to a multiple of 32, times height, times bits per
+    // sample, converted from bits to bytes.
+    raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
+  }
+
+  aom_image_t *img_;
+  size_t raw_sz_;  // Number of bytes FillFrame() writes into img_->img_data.
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  aom_img_fmt_t format_;
+};
+
+// DummyVideoSource variant whose frames alternate between random noise and
+// all-zero content, driven by a reseedable ACMRandom generator.
+class RandomVideoSource : public DummyVideoSource {
+ public:
+  RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+      : rnd_(seed), seed_(seed) {}
+
+  // Rewinds to frame 0 and restores the generator to its initial seed so a
+  // second pass sees exactly the same stream as the first.
+  void Begin() override {
+    frame_ = 0;
+    rnd_.Reset(seed_);
+    FillFrame();
+  }
+
+ protected:
+  // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
+  // than holding previous frames to encourage keyframes to be thrown.
+  void FillFrame() override {
+    if (img_ == nullptr) return;
+
+    const bool is_noise_frame = (frame_ % 30) < 15;
+    if (!is_noise_frame) {
+      memset(img_->img_data, 0, raw_sz_);
+      return;
+    }
+    for (size_t pos = 0; pos < raw_sz_; ++pos) {
+      img_->img_data[pos] = rnd_.Rand8();
+    }
+  }
+
+  ACMRandom rnd_;
+  int seed_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// compressed frame data (see cxdata() / frame_size()) to the decoder.
+class CompressedVideoSource {
+ public:
+ virtual ~CompressedVideoSource() = default;
+
+ virtual void Init() = 0;
+
+ // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0;
+
+ // Advance the cursor to the next frame
+ virtual void Next() = 0;
+
+ // Pointer to the bytes of the current frame.
+ // NOTE(review): presumably nullptr at end of stream - confirm against the
+ // implementations.
+ virtual const uint8_t *cxdata() const = 0;
+
+ // Size of the current frame's data.
+ // NOTE(review): presumably in bytes, matching cxdata() - confirm.
+ virtual size_t frame_size() const = 0;
+
+ // Number of the current frame.
+ virtual unsigned int frame_number() const = 0;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/visual_metrics.py b/third_party/aom/test/visual_metrics.py
new file mode 100755
index 0000000000..9055feb334
--- /dev/null
+++ b/third_party/aom/test/visual_metrics.py
@@ -0,0 +1,466 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts video encoding result data from text files to visualization
+data source."""
+
+__author__ = "jzern@google.com (James Zern),"
+__author__ += "jimbankoski@google.com (Jim Bankoski)"
+
+import fnmatch
+import numpy as np
+import scipy as sp
+import scipy.interpolate
+import os
+import re
+import string
+import sys
+import math
+import warnings
+
+import gviz_api
+
+from os.path import basename
+from os.path import splitext
+
+warnings.simplefilter('ignore', np.RankWarning)
+warnings.simplefilter('ignore', RuntimeWarning)
+
+def bdsnr2(metric_set1, metric_set2):
+ """
+ BJONTEGAARD Bjontegaard metric calculation adapted
+ Bjontegaard's snr metric allows to compute the average % saving in decibels
+ between two rate-distortion curves [1]. This is an adaptation of that
+ method that fixes inconsistencies when the curve fit operation goes awry
+ by replacing the curve fit function with a Piecewise Cubic Hermite
+ Interpolating Polynomial and then integrating that by evaluating that
+ function at small intervals using the trapezoid method to calculate
+ the integral.
+
+ metric_set1 - list of tuples ( bitrate, metric ) for first graph
+ metric_set2 - list of tuples ( bitrate, metric ) for second graph
+ """
+
+ if not metric_set1 or not metric_set2:
+ return 0.0
+
+ try:
+
+ # pchip_interlopate requires keys sorted by x axis. x-axis will
+ # be our metric not the bitrate so sort by metric.
+ metric_set1.sort()
+ metric_set2.sort()
+
+ # Pull the log of the rate and clamped psnr from metric_sets.
+ log_rate1 = [math.log(x[0]) for x in metric_set1]
+ metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+ log_rate2 = [math.log(x[0]) for x in metric_set2]
+ metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+ # Integration interval. This metric only works on the area that's
+ # overlapping. Extrapolation of these things is sketchy so we avoid.
+ min_int = max([min(log_rate1), min(log_rate2)])
+ max_int = min([max(log_rate1), max(log_rate2)])
+
+ # No overlap means no sensible metric possible.
+ if max_int <= min_int:
+ return 0.0
+
+ # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+ # create 100 new samples points separated by interval.
+ lin = np.linspace(min_int, max_int, num=100, retstep=True)
+ interval = lin[1]
+ samples = lin[0]
+ v1 = scipy.interpolate.pchip_interpolate(log_rate1, metric1, samples)
+ v2 = scipy.interpolate.pchip_interpolate(log_rate2, metric2, samples)
+
+ # Calculate the integral using the trapezoid method on the samples.
+ int_v1 = np.trapz(v1, dx=interval)
+ int_v2 = np.trapz(v2, dx=interval)
+
+ # Calculate the average improvement.
+ avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+ except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+ return 0
+
+ return avg_exp_diff
+
+def bdrate2(metric_set1, metric_set2):
+ """
+ BJONTEGAARD Bjontegaard metric calculation adapted
+ Bjontegaard's metric allows to compute the average % saving in bitrate
+ between two rate-distortion curves [1]. This is an adaptation of that
+ method that fixes inconsistencies when the curve fit operation goes awry
+ by replacing the curve fit function with a Piecewise Cubic Hermite
+ Interpolating Polynomial and then integrating that by evaluating that
+ function at small intervals using the trapezoid method to calculate
+ the integral.
+
+ metric_set1 - list of tuples ( bitrate, metric ) for first graph
+ metric_set2 - list of tuples ( bitrate, metric ) for second graph
+ """
+
+ if not metric_set1 or not metric_set2:
+ return 0.0
+
+ try:
+
+ # pchip_interlopate requires keys sorted by x axis. x-axis will
+ # be our metric not the bitrate so sort by metric.
+ metric_set1.sort(key=lambda tup: tup[1])
+ metric_set2.sort(key=lambda tup: tup[1])
+
+ # Pull the log of the rate and clamped psnr from metric_sets.
+ log_rate1 = [math.log(x[0]) for x in metric_set1]
+ metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+ log_rate2 = [math.log(x[0]) for x in metric_set2]
+ metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+ # Integration interval. This metric only works on the area that's
+ # overlapping. Extrapolation of these things is sketchy so we avoid.
+ min_int = max([min(metric1), min(metric2)])
+ max_int = min([max(metric1), max(metric2)])
+
+ # No overlap means no sensible metric possible.
+ if max_int <= min_int:
+ return 0.0
+
+ # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+ # create 100 new samples points separated by interval.
+ lin = np.linspace(min_int, max_int, num=100, retstep=True)
+ interval = lin[1]
+ samples = lin[0]
+ v1 = scipy.interpolate.pchip_interpolate(metric1, log_rate1, samples)
+ v2 = scipy.interpolate.pchip_interpolate(metric2, log_rate2, samples)
+
+ # Calculate the integral using the trapezoid method on the samples.
+ int_v1 = np.trapz(v1, dx=interval)
+ int_v2 = np.trapz(v2, dx=interval)
+
+ # Calculate the average improvement.
+ avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+ except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+ return 0
+
+ # Convert to a percentage.
+ avg_diff = (math.exp(avg_exp_diff) - 1) * 100
+
+ return avg_diff
+
+
+
+def FillForm(string_for_substitution, dictionary_of_vars):
+ """
+ This function substitutes all matches of the command string //%% ... %%//
+ with the variable represented by ... .
+ """
+ return_string = string_for_substitution
+ for i in re.findall("//%%(.*)%%//", string_for_substitution):
+ return_string = re.sub("//%%" + i + "%%//", dictionary_of_vars[i],
+ return_string)
+ return return_string
+
+
+def HasMetrics(line):
+ """
+ The metrics files produced by aomenc are started with a B for headers.
+ """
+ # If the first char of the first word on the line is a digit
+ if len(line) == 0:
+ return False
+ if len(line.split()) == 0:
+ return False
+ if line.split()[0][0:1].isdigit():
+ return True
+ return False
+
+def GetMetrics(file_name):
+ metric_file = open(file_name, "r")
+ return metric_file.readline().split();
+
+def ParseMetricFile(file_name, metric_column):
+ metric_set1 = set([])
+ metric_file = open(file_name, "r")
+ for line in metric_file:
+ metrics = string.split(line)
+ if HasMetrics(line):
+ if metric_column < len(metrics):
+ try:
+ tuple = float(metrics[0]), float(metrics[metric_column])
+ except:
+ tuple = float(metrics[0]), 0
+ else:
+ tuple = float(metrics[0]), 0
+ metric_set1.add(tuple)
+ metric_set1_sorted = sorted(metric_set1)
+ return metric_set1_sorted
+
+
+def FileBetter(file_name_1, file_name_2, metric_column, method):
+ """
+ Compares two data files and returns a single number describing how file 2
+ compares to file 1 for the metric found in column metric_column.
+ method selects the comparison: 'avg' uses the GraphBetter estimate below,
+ 'dsnr' uses bdsnr2, and any other value uses bdrate2.
+ """
+ # Store and parse our two files into lists of unique tuples.
+
+ # Read the two files, parsing out lines starting with bitrate.
+ metric_set1_sorted = ParseMetricFile(file_name_1, metric_column)
+ metric_set2_sorted = ParseMetricFile(file_name_2, metric_column)
+
+
+ def GraphBetter(metric_set1_sorted, metric_set2_sorted, base_is_set_2):
+ """
+ Search through the sorted metric file for metrics on either side of
+ the metric from file 1. Since both lists are sorted we really
+ should not have to search through the entire range, but these
+ are small files."""
+ total_bitrate_difference_ratio = 0.0
+ count = 0
+ for bitrate, metric in metric_set1_sorted:
+ # Points with a zero bitrate are skipped (bitrate can be a divisor below).
+ if bitrate == 0:
+ continue
+ for i in range(len(metric_set2_sorted) - 1):
+ s2_bitrate_0, s2_metric_0 = metric_set2_sorted[i]
+ s2_bitrate_1, s2_metric_1 = metric_set2_sorted[i + 1]
+ # We have a point on either side of our metric range.
+ if metric > s2_metric_0 and metric <= s2_metric_1:
+
+ # Calculate a slope.
+ if s2_metric_1 - s2_metric_0 != 0:
+ metric_slope = ((s2_bitrate_1 - s2_bitrate_0) /
+ (s2_metric_1 - s2_metric_0))
+ else:
+ metric_slope = 0
+
+ # Linear interpolation of set 2's bitrate at set 1's metric value.
+ estimated_s2_bitrate = (s2_bitrate_0 + (metric - s2_metric_0) *
+ metric_slope)
+
+ if estimated_s2_bitrate == 0:
+ continue
+ # Calculate percentage difference as given by base.
+ if base_is_set_2 == 0:
+ bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+ bitrate)
+ else:
+ bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+ estimated_s2_bitrate)
+
+ total_bitrate_difference_ratio += bitrate_difference_ratio
+ count += 1
+ break
+
+ # Calculate the average improvement between graphs.
+ if count != 0:
+ avg = total_bitrate_difference_ratio / count
+
+ else:
+ avg = 0.0
+
+ return avg
+
+ # Be fair to both graphs by testing all the points in each.
+ # NOTE(review): the factor 50 presumably is 100 (percent) / 2 (mean of the
+ # two directions) - confirm.
+ if method == 'avg':
+ avg_improvement = 50 * (
+ GraphBetter(metric_set1_sorted, metric_set2_sorted, 1) -
+ GraphBetter(metric_set2_sorted, metric_set1_sorted, 0))
+ elif method == 'dsnr':
+ avg_improvement = bdsnr2(metric_set1_sorted, metric_set2_sorted)
+ else:
+ # Note the swapped argument order compared with the bdsnr2 call above.
+ avg_improvement = bdrate2(metric_set2_sorted, metric_set1_sorted)
+
+ return avg_improvement
+
+
+def HandleFiles(variables):
+ """
+ This script creates html for displaying metric data produced from data
+ in a video stats file, as created by the AOM project when enable_psnr
+ is turned on:
+
+ Usage: visual_metrics.py template.html pattern base_dir sub_dir [ sub_dir2 ..]
+
+ The script parses each metrics file [see below] that matches the
+ statfile_pattern in the baseline directory and looks for the file that
+ matches that same file in each of the sub_dirs, and compares the resultant
+ metrics bitrate, avg psnr, glb psnr, and ssim. "
+
+ It provides a table in which each row is a file in the line directory,
+ and a column for each subdir, with the cells representing how that clip
+ compares to baseline for that subdir. A graph is given for each which
+ compares filesize to that metric. If you click on a point in the graph it
+ zooms in on that point.
+
+ a SAMPLE metrics file:
+
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 25.911 38.242 38.104 38.258 38.121 75.790 14103
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 49.982 41.264 41.129 41.255 41.122 83.993 19817
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 74.967 42.911 42.767 42.899 42.756 87.928 17332
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 100.012 43.983 43.838 43.881 43.738 89.695 25389
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 149.980 45.338 45.203 45.184 45.043 91.591 25438
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 199.852 46.225 46.123 46.113 45.999 92.679 28302
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 249.922 46.864 46.773 46.777 46.673 93.334 27244
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 299.998 47.366 47.281 47.317 47.220 93.844 27137
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 349.769 47.746 47.677 47.722 47.648 94.178 32226
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 399.773 48.032 47.971 48.013 47.946 94.362 36203
+
+ sample use:
+ visual_metrics.py template.html "*stt" aom aom_b aom_c > metrics.html
+ """
+
+ # NOTE: the docstring above doubles as the usage text printed by the script
+ # entry point, so edits to it change the program's output.
+
+ # The template file is the html file into which we will write the
+ # data from the stats file, formatted correctly for the gviz_api.
+ template_file = open(variables[1], "r")
+ page_template = template_file.read()
+ template_file.close()
+
+ # This is the path match pattern for finding stats files amongst
+ # all the other files it could be. eg: *.stt
+ file_pattern = variables[2]
+
+ # This is the directory with files that we will use to do the comparison
+ # against.
+ baseline_dir = variables[3]
+ snrs = ''
+ filestable = {}
+
+ filestable['dsnr'] = ''
+ filestable['drate'] = ''
+ filestable['avg'] = ''
+
+ # Dirs is directories after the baseline to compare to the base.
+ dirs = variables[4:len(variables)]
+
+ # Find the metric files in the baseline directory.
+ dir_list = sorted(fnmatch.filter(os.listdir(baseline_dir), file_pattern))
+
+ # NOTE(review): dir_list[0] raises IndexError when no file in baseline_dir
+ # matches file_pattern.
+ metrics = GetMetrics(baseline_dir + "/" + dir_list[0])
+
+ metrics_js = 'metrics = ["' + '", "'.join(metrics) + '"];'
+
+ # Column 0 is the bitrate; every remaining column is treated as a metric.
+ for column in range(1, len(metrics)):
+
+ for metric in ['avg','dsnr','drate']:
+ description = {"file": ("string", "File")}
+
+ # Go through each directory and add a column header to our description.
+ countoverall = {}
+ sumoverall = {}
+
+ for directory in dirs:
+ description[directory] = ("number", directory)
+ countoverall[directory] = 0
+ sumoverall[directory] = 0
+
+ # Data holds the data for the visualization, name given comes from
+ # gviz_api sample code.
+ data = []
+ for filename in dir_list:
+ row = {'file': splitext(basename(filename))[0] }
+ baseline_file_name = baseline_dir + "/" + filename
+
+ # Read the metric file from each of the directories in our list.
+ for directory in dirs:
+ metric_file_name = directory + "/" + filename
+
+ # If there is a metric file in the current directory, open it
+ # and calculate its overall difference between it and the baseline
+ # directory's metric file.
+ if os.path.isfile(metric_file_name):
+ overall = FileBetter(baseline_file_name, metric_file_name,
+ column, metric)
+ row[directory] = overall
+
+ sumoverall[directory] += overall
+ countoverall[directory] += 1
+
+ data.append(row)
+
+ # Add the overall numbers.
+ # NOTE(review): countoverall[directory] stays 0 when a directory holds none
+ # of the baseline's files, which makes the division below raise
+ # ZeroDivisionError.
+ row = {"file": "OVERALL" }
+ for directory in dirs:
+ row[directory] = sumoverall[directory] / countoverall[directory]
+ data.append(row)
+
+ # write the tables out
+ data_table = gviz_api.DataTable(description)
+ data_table.LoadData(data)
+
+ filestable[metric] = ( filestable[metric] + "filestable_" + metric +
+ "[" + str(column) + "]=" +
+ data_table.ToJSon(columns_order=["file"]+dirs) + "\n" )
+
+ filestable_avg = filestable['avg']
+ filestable_dpsnr = filestable['dsnr']
+ filestable_drate = filestable['drate']
+
+ # Now we collect all the data for all the graphs. First the column
+ # headers which will be Datarate and then each directory.
+ columns = ("datarate",baseline_dir)
+ description = {"datarate":("number", "Datarate")}
+ for directory in dirs:
+ description[directory] = ("number", directory)
+
+ description[baseline_dir] = ("number", baseline_dir)
+
+ snrs = snrs + "snrs[" + str(column) + "] = ["
+
+ # Now collect the data for the graphs, file by file.
+ for filename in dir_list:
+
+ data = []
+
+ # Collect the file in each directory and store all of its metrics
+ # in the associated gviz metrics table.
+ all_dirs = dirs + [baseline_dir]
+ for directory in all_dirs:
+
+ metric_file_name = directory + "/" + filename
+ if not os.path.isfile(metric_file_name):
+ continue
+
+ # Read and parse the metrics file storing it to the data we'll
+ # use for the gviz_api.Datatable.
+ metrics = ParseMetricFile(metric_file_name, column)
+ for bitrate, metric in metrics:
+ data.append({"datarate": bitrate, directory: metric})
+
+ data_table = gviz_api.DataTable(description)
+ data_table.LoadData(data)
+ snrs = snrs + "'" + data_table.ToJSon(
+ columns_order=tuple(["datarate",baseline_dir]+dirs)) + "',"
+
+ snrs = snrs + "]\n"
+
+ formatters = ""
+ for i in range(len(dirs)):
+ formatters = "%s formatter.format(better, %d);" % (formatters, i+1)
+
+ # vars() hands every local of this function (snrs, metrics_js, formatters,
+ # filestable_avg, ...) to the template; a //%%name%%// placeholder selects
+ # one of them by name.
+ # NOTE(review): print statement syntax - this only runs under Python 2.
+ print FillForm(page_template, vars())
+ return
+
+if len(sys.argv) < 3:
+ print HandleFiles.__doc__
+else:
+ HandleFiles(sys.argv)
diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc
new file mode 100644
index 0000000000..f0be7d226b
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test.cc
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/warp_filter_test_util.h"
+using libaom_test::ACMRandom;
+#if CONFIG_AV1_HIGHBITDEPTH
+using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
+#endif
+using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
+using std::make_tuple;
+using std::tuple;
+
+namespace {
+
+// Both tests forward element 3 of the tuple held in test parameter 0 to the
+// fixture.
+// NOTE(review): presumably that element is the warp function under test, as
+// passed to BuildParams() in the instantiations below - confirm in
+// warp_filter_test_util.h.
+TEST_P(AV1WarpFilterTest, CheckOutput) {
+ RunCheckOutput(std::get<3>(GET_PARAM(0)));
+}
+// The DISABLED_ prefix keeps the speed test out of default runs.
+TEST_P(AV1WarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(std::get<3>(GET_PARAM(0)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
+
+// High bit-depth counterparts of the tests above; here element 4 of the
+// parameter tuple is forwarded to the fixture.
+// NOTE(review): the guard lists SSE4_1 and NEON only, while
+// AV1HighbdWarpFilterTest is also instantiated under HAVE_AVX2 further down;
+// presumably AVX2 builds always have SSE4_1 as well - confirm.
+#if CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
+TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
+ RunCheckOutput(std::get<4>(GET_PARAM(0)));
+}
+// The DISABLED_ prefix keeps the speed test out of default runs.
+TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(std::get<4>(GET_PARAM(0)));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH && (HAVE_SSE4_1 || HAVE_NEON)
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+ SSE4_1, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest,
+ libaom_test::AV1HighbdWarpFilter::BuildParams(
+ av1_highbd_warp_affine_sse4_1));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, AV1HighbdWarpFilterTest,
+ libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_avx2));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, AV1HighbdWarpFilterTest,
+ libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_neon));
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // HAVE_NEON
+
+#if HAVE_NEON_I8MM
+INSTANTIATE_TEST_SUITE_P(
+ NEON_I8MM, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon_i8mm));
+#endif // HAVE_NEON_I8MM
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve));
+#endif // HAVE_SVE
+
+} // namespace
diff --git a/third_party/aom/test/warp_filter_test_util.cc b/third_party/aom/test/warp_filter_test_util.cc
new file mode 100644
index 0000000000..470c980777
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test_util.cc
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+#include <new>
+
+#include "aom_ports/aom_timer.h"
+#include "test/warp_filter_test_util.h"
+
+using std::make_tuple;
+using std::tuple;
+
+namespace libaom_test {
+
+int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits,
+ int rnd_gen_zeros) {
+ // Zeros are only produced when rnd_gen_zeros is set. Speed tests pass 0 and
+ // select zero parameters through the is_*_zero arguments instead.
+ if (rnd_gen_zeros) {
+ // 1 in 8 chance of generating zero (arbitrarily chosen)
+ if (((rnd->Rand8()) & 7) == 0) return 0;
+ }
+ // Otherwise, generate uniform values in the range
+ // [-(1 << bits), -1] U [1, 1 << bits]
+ int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
+ if ((rnd->Rand8()) & 1) return -v;
+ return v;
+}
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+ int16_t *alpha, int16_t *beta, int16_t *gamma,
+ int16_t *delta, const int is_alpha_zero,
+ const int is_beta_zero, const int is_gamma_zero,
+ const int is_delta_zero, const int rnd_gen_zeros) {
+ while (true) {  // Retry until the derived parameters pass the check below.
+ int rnd8 = rnd->Rand8() & 3;  // 0..3: selects how mat[4]/mat[5] are built.
+ mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+ mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6, rnd_gen_zeros);
+ mat[2] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+
+ if (rnd8 <= 1) {
+ // AFFINE: mat[4] and mat[5] are drawn independently of mat[2] and mat[3].
+ mat[4] =
+ random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+ mat[5] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else if (rnd8 == 2) {  // mat[4]/mat[5] are derived from mat[3]/mat[2].
+ mat[4] = -mat[3];
+ mat[5] = mat[2];
+ } else {  // Same generation as the rnd8 <= 1 branch.
+ mat[4] =
+ random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros);
+ mat[5] =
+ (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3, rnd_gen_zeros)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ }
+
+ if (is_alpha_zero == 1) {
+ mat[2] = 1 << WARPEDMODEL_PREC_BITS;  // Makes *alpha below evaluate to 0.
+ }
+ if (is_beta_zero == 1) {
+ mat[3] = 0;  // Makes *beta below evaluate to 0.
+ }
+ if (is_gamma_zero == 1) {
+ mat[4] = 0;  // Makes *gamma below evaluate to 0.
+ }
+ if (is_delta_zero == 1) {  // Pick mat[5] so *delta below evaluates to 0.
+ mat[5] = static_cast<int32_t>(
+ ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) +
+ (1 << WARPEDMODEL_PREC_BITS));
+ }
+
+ // Calculate the derived parameters and check that they are suitable
+ // for the warp filter.
+ assert(mat[2] != 0);
+
+ *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
+ *beta = clamp(mat[3], INT16_MIN, INT16_MAX);
+ *gamma = static_cast<int16_t>(clamp64(
+ (static_cast<int64_t>(mat[4]) * (1 << WARPEDMODEL_PREC_BITS)) / mat[2],
+ INT16_MIN, INT16_MAX));
+ *delta = static_cast<int16_t>(clamp64(
+ mat[5] -
+ ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) -
+ (1 << WARPEDMODEL_PREC_BITS),
+ INT16_MIN, INT16_MAX));
+
+ if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS)))
+ continue;  // Out of range: discard this model and draw a new one.
+
+ *alpha = ROUND_POWER_OF_TWO_SIGNED(*alpha, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *beta = ROUND_POWER_OF_TWO_SIGNED(*beta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *gamma = ROUND_POWER_OF_TWO_SIGNED(*gamma, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *delta = ROUND_POWER_OF_TWO_SIGNED(*delta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+
+ // We have a valid model, so finish
+ return;
+ }
+}
+
+namespace AV1WarpFilter {
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+ warp_affine_func filter) {
+ WarpTestParam params[] = {
+ make_tuple(4, 4, 5000, filter), make_tuple(8, 8, 5000, filter),
+ make_tuple(64, 64, 100, filter), make_tuple(4, 16, 2000, filter),
+ make_tuple(32, 8, 1000, filter),
+ };
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
+}
+
+AV1WarpFilterTest::~AV1WarpFilterTest() = default;
+void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ WarpTestParam params = GET_PARAM(0);
+ const int out_w = std::get<0>(params), out_h = std::get<1>(params);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ int sub_x, sub_y;
+ const int bd = 8;
+
+ std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * stride]);
+ ASSERT_NE(input_, nullptr);
+ uint8_t *input = input_.get() + border;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+ ASSERT_NE(output, nullptr);
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dsta, nullptr);
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero, 0);
+
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+ for (int r = 0; r < h; ++r) {
+ memset(input + r * stride - border, input[r * stride], border);
+ memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+ }
+
+ sub_x = 0;
+ sub_y = 0;
+ int do_average = 0;
+
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta.get(), out_w, 1, bd);
+ conv_params.use_dist_wtd_comp_avg = 0;
+
+ const int num_loops = 1000000000 / (out_w + out_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mat, input, w, h, stride, output.get(), 32, 32, out_w, out_h,
+ out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n", out_w,
+ out_h, alpha, beta, gamma, delta, 1000.0 * elapsed_time / num_loops);
+}
+
+void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ WarpTestParam params = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = std::get<0>(params), out_h = std::get<1>(params);
+ const int num_iters = std::get<2>(params);
+ const int bd = 8;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ std::unique_ptr<uint8_t[]> input_(new (std::nothrow) uint8_t[h * stride]);
+ ASSERT_NE(input_, nullptr);
+ uint8_t *input = input_.get() + border;
+ std::unique_ptr<uint8_t[]> output(new (std::nothrow) uint8_t[output_n]);
+ ASSERT_NE(output, nullptr);
+ std::unique_ptr<uint8_t[]> output2(new (std::nothrow) uint8_t[output_n]);
+ ASSERT_NE(output2, nullptr);
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dsta, nullptr);
+ std::unique_ptr<CONV_BUF_TYPE[]> dstb(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dstb, nullptr);
+ for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
+
+ for (int i = 0; i < num_iters; ++i) {
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+ for (int r = 0; r < h; ++r) {
+ memset(input + r * stride - border, input[r * stride], border);
+ memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+ }
+ const int use_no_round = rnd_.Rand8() & 1;
+ for (int sub_x = 0; sub_x < 2; ++sub_x)
+ for (int sub_y = 0; sub_y < 2; ++sub_y) {
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero, 1);
+
+ for (int ii = 0; ii < 2; ++ii) {
+ for (int jj = 0; jj < 5; ++jj) {
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ if (use_no_round) {
+ conv_params = get_conv_params_no_round(
+ do_average, 0, dsta.get(), out_w, 1, bd);
+ } else {
+ conv_params = get_conv_params(0, 0, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_dist_wtd_comp_avg = 0;
+ } else {
+ conv_params.use_dist_wtd_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
+ }
+ av1_warp_affine_c(mat, input, w, h, stride, output.get(), 32, 32,
+ out_w, out_h, out_w, sub_x, sub_y, &conv_params,
+ alpha, beta, gamma, delta);
+ if (use_no_round) {
+ conv_params = get_conv_params_no_round(
+ do_average, 0, dstb.get(), out_w, 1, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_dist_wtd_comp_avg = 0;
+ } else {
+ conv_params.use_dist_wtd_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
+ }
+ test_impl(mat, input, w, h, stride, output2.get(), 32, 32, out_w,
+ out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta,
+ gamma, delta);
+ if (use_no_round) {
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(dsta[j], dstb[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ } else {
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+} // namespace AV1WarpFilter
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdWarpFilter {
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+ highbd_warp_affine_func filter) {
+ const HighbdWarpTestParam params[] = {
+ make_tuple(4, 4, 100, 8, filter), make_tuple(8, 8, 100, 8, filter),
+ make_tuple(64, 64, 100, 8, filter), make_tuple(4, 16, 100, 8, filter),
+ make_tuple(32, 8, 100, 8, filter), make_tuple(4, 4, 100, 10, filter),
+ make_tuple(8, 8, 100, 10, filter), make_tuple(64, 64, 100, 10, filter),
+ make_tuple(4, 16, 100, 10, filter), make_tuple(32, 8, 100, 10, filter),
+ make_tuple(4, 4, 100, 12, filter), make_tuple(8, 8, 100, 12, filter),
+ make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter),
+ make_tuple(32, 8, 100, 12, filter),
+ };
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
+}
+
+AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() = default;
+void AV1HighbdWarpFilterTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = std::get<0>(param), out_h = std::get<1>(param);
+ const int bd = std::get<3>(param);
+ const int mask = (1 << bd) - 1;  // Keeps random samples within bd bits.
+ int sub_x, sub_y;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ std::unique_ptr<uint16_t[]> input_(new (std::nothrow) uint16_t[h * stride]);
+ ASSERT_NE(input_, nullptr);
+ uint16_t *input = input_.get() + border;  // Skip the left border columns.
+ std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+ ASSERT_NE(output, nullptr);
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dsta, nullptr);
+
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero, 0);  // rnd_gen_zeros = 0 for speed runs.
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < border; ++c) {
+ input[r * stride - border + c] = input[r * stride];
+ input[r * stride + w + c] = input[r * stride + (w - 1)];
+ }
+ }
+
+ sub_x = 0;
+ sub_y = 0;
+ int do_average = 0;
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta.get(), out_w, 1, bd);
+ conv_params.use_dist_wtd_comp_avg = 0;  // Must follow the overwrite above.
+
+ const int num_loops = 1000000000 / (out_w + out_h);  // Fewer for big blocks.
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mat, input, w, h, stride, output.get(), 32, 32, out_w, out_h,
+ out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbd warp %3dx%-3d alpha=%d beta=%d gamma=%d delta=%d: %7.2f ns \n",
+ out_w, out_h, alpha, beta, gamma, delta,
+ 1000.0 * elapsed_time / num_loops);
+}
+
+void AV1HighbdWarpFilterTest::RunCheckOutput(
+ highbd_warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = std::get<0>(param), out_h = std::get<1>(param);
+ const int bd = std::get<3>(param);
+ const int num_iters = std::get<2>(param);
+ const int mask = (1 << bd) - 1;  // Keeps random samples within bd bits.
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ std::unique_ptr<uint16_t[]> input_(new (std::nothrow) uint16_t[h * stride]);
+ ASSERT_NE(input_, nullptr);
+ uint16_t *input = input_.get() + border;
+ std::unique_ptr<uint16_t[]> output(new (std::nothrow) uint16_t[output_n]);
+ ASSERT_NE(output, nullptr);
+ std::unique_ptr<uint16_t[]> output2(new (std::nothrow) uint16_t[output_n]);
+ ASSERT_NE(output2, nullptr);
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ std::unique_ptr<CONV_BUF_TYPE[]> dsta(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dsta, nullptr);
+ std::unique_ptr<CONV_BUF_TYPE[]> dstb(new (std::nothrow)
+ CONV_BUF_TYPE[output_n]);
+ ASSERT_NE(dstb, nullptr);
+ for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
+
+ for (int i = 0; i < num_iters; ++i) {
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < border; ++c) {
+ input[r * stride - border + c] = input[r * stride];
+ input[r * stride + w + c] = input[r * stride + (w - 1)];
+ }
+ }
+ const int use_no_round = rnd_.Rand8() & 1;
+ for (int sub_x = 0; sub_x < 2; ++sub_x)
+ for (int sub_y = 0; sub_y < 2; ++sub_y) {
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero, 1);
+ for (int ii = 0; ii < 2; ++ii) {
+ for (int jj = 0; jj < 5; ++jj) {
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ if (use_no_round) {
+ conv_params = get_conv_params_no_round(
+ do_average, 0, dsta.get(), out_w, 1, bd);
+ } else {
+ conv_params = get_conv_params(0, 0, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_dist_wtd_comp_avg = 0;
+ } else {
+ conv_params.use_dist_wtd_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
+ }
+
+ av1_highbd_warp_affine_c(mat, input, w, h, stride, output.get(),
+ 32, 32, out_w, out_h, out_w, sub_x,
+ sub_y, bd, &conv_params, alpha, beta,
+ gamma, delta);
+ if (use_no_round) {
+ // Rebuild the params with dstb as the destination buffer for the
+ // implementation under test; the reference call above used dsta.
+ conv_params = get_conv_params_no_round(
+ do_average, 0, dstb.get(), out_w, 1, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_dist_wtd_comp_avg = 0;
+ } else {
+ conv_params.use_dist_wtd_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+ conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
+ }
+ test_impl(mat, input, w, h, stride, output2.get(), 32, 32, out_w,
+ out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha,
+ beta, gamma, delta);
+
+ if (use_no_round) {
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(dsta[j], dstb[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ } else {
+ for (int j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+} // namespace AV1HighbdWarpFilter
+#endif // CONFIG_AV1_HIGHBITDEPTH
+} // namespace libaom_test
diff --git a/third_party/aom/test/warp_filter_test_util.h b/third_party/aom/test/warp_filter_test_util.h
new file mode 100644
index 0000000000..364368ac0c
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test_util.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+#define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/common_data.h"
+
+namespace libaom_test {
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+ int16_t *alpha, int16_t *beta, int16_t *gamma,
+ int16_t *delta, int is_alpha_zero, int is_beta_zero,
+ int is_gamma_zero, int is_delta_zero, int rnd_gen_zeros);
+
+namespace AV1WarpFilter {
+
+typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
+ int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, warp_affine_func> WarpTestParam;
+typedef std::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
+
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+ warp_affine_func filter);
+
+class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
+ public:
+ ~AV1WarpFilterTest() override;
+ void SetUp() override;
+
+ protected:
+ void RunCheckOutput(warp_affine_func test_impl);
+ void RunSpeedTest(warp_affine_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1WarpFilter
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdWarpFilter {
+typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int bd, ConvolveParams *conv_params,
+ int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, int, highbd_warp_affine_func>
+ HighbdWarpTestParam;
+typedef std::tuple<HighbdWarpTestParam, int, int, int, int>
+ HighbdWarpTestParams;
+
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+ highbd_warp_affine_func filter);
+
+class AV1HighbdWarpFilterTest
+ : public ::testing::TestWithParam<HighbdWarpTestParams> {
+ public:
+ ~AV1HighbdWarpFilterTest() override;
+ void SetUp() override;
+
+ protected:
+ void RunCheckOutput(highbd_warp_affine_func test_impl);
+ void RunSpeedTest(highbd_warp_affine_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HighbdWarpFilter
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_WARP_FILTER_TEST_UTIL_H_
diff --git a/third_party/aom/test/webm_video_source.h b/third_party/aom/test/webm_video_source.h
new file mode 100644
index 0000000000..845abd6dce
--- /dev/null
+++ b/third_party/aom/test/webm_video_source.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#define AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of WebM files,
+// so that we can do actual file decodes.
+class WebMVideoSource : public CompressedVideoSource {
+ public:
+ explicit WebMVideoSource(const std::string &file_name)
+ : file_name_(file_name), aom_ctx_(new AvxInputContext()),
+ webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0),
+ frame_sz_(0), frame_number_(0), end_of_file_(false) {}
+
+ ~WebMVideoSource() override {
+ if (aom_ctx_->file != nullptr) fclose(aom_ctx_->file);
+ webm_free(webm_ctx_);
+ delete aom_ctx_;
+ delete webm_ctx_;
+ }
+
+ void Init() override {
+ ASSERT_NE(aom_ctx_, nullptr);
+ ASSERT_NE(webm_ctx_, nullptr);
+ }
+
+ void Begin() override {
+ ASSERT_NE(aom_ctx_, nullptr);
+ ASSERT_NE(webm_ctx_, nullptr);
+ aom_ctx_->file = OpenTestDataFile(file_name_);
+ ASSERT_NE(aom_ctx_->file, nullptr)
+ << "Input file open failed. Filename: " << file_name_;
+
+ ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
+
+ FillFrame();
+ }
+
+ void Next() override {
+ ++frame_number_;
+ FillFrame();
+ }
+
+ void FillFrame() {
+ ASSERT_NE(aom_ctx_, nullptr);
+ ASSERT_NE(webm_ctx_, nullptr);
+ ASSERT_NE(aom_ctx_->file, nullptr);
+ const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+ ASSERT_GE(status, 0) << "webm_read_frame failed";
+ if (status == 1) {
+ end_of_file_ = true;
+ }
+ }
+
+ void SeekToNextKeyFrame() {
+ ASSERT_NE(aom_ctx_, nullptr);
+ ASSERT_NE(webm_ctx_, nullptr);
+ ASSERT_NE(aom_ctx_->file, nullptr);
+ do {
+ const int status =
+ webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+ ASSERT_GE(status, 0) << "webm_read_frame failed";
+ ++frame_number_;
+ if (status == 1) {
+ end_of_file_ = true;
+ }
+ } while (!webm_ctx_->is_key_frame && !end_of_file_);
+ }
+
+ const uint8_t *cxdata() const override {
+ return end_of_file_ ? nullptr : buf_;
+ }
+ size_t frame_size() const override { return frame_sz_; }
+ unsigned int frame_number() const override { return frame_number_; }
+
+ protected:
+ std::string file_name_;
+ AvxInputContext *aom_ctx_;
+ WebmInputContext *webm_ctx_;
+ uint8_t *buf_; // Owned by webm_ctx_ and freed when webm_ctx_ is freed.
+ size_t buf_sz_;
+ size_t frame_sz_;
+ unsigned int frame_number_;
+ bool end_of_file_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/webmenc_test.cc b/third_party/aom/test/webmenc_test.cc
new file mode 100644
index 0000000000..acd795f2ec
--- /dev/null
+++ b/third_party/aom/test/webmenc_test.cc
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "common/webmenc.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+#if CONFIG_WEBM_IO
+
+class WebmencTest : public ::testing::Test {};
+
+// All of these variations on output should be identical.
+TEST(WebmencTest, ExtractEncoderSettingsOutput1) {
+ const char *argv[] = { "aomenc", "-o", "output", "input",
+ "--target-bitrate=300" };
+ int argc = 5;
+ const std::string expected("version:1.2.3 --target-bitrate=300");
+ char *result = extract_encoder_settings("1.2.3", argv, argc, "input");
+ ASSERT_EQ(expected, std::string(result));
+ free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput2) {
+ const char *argv[] = { "aomenc", "--output", "bar", "foo", "--cpu-used=3" };
+ int argc = 5;
+ const std::string expected("version:abc --cpu-used=3");
+ char *result = extract_encoder_settings("abc", argv, argc, "foo");
+ ASSERT_EQ(expected, std::string(result));
+ free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsOutput3) {
+ const char *argv[] = { "aomenc", "--cq-level=63", "--end-usage=q",
+ "--output=foo", "baz" };
+ int argc = 5;
+ const std::string expected("version:23 --cq-level=63 --end-usage=q");
+ char *result = extract_encoder_settings("23", argv, argc, "baz");
+ ASSERT_EQ(expected, std::string(result));
+ free(result);
+}
+
+TEST(WebmencTest, ExtractEncoderSettingsInput) {
+ // Check that input filename is filtered regardless of position.
+ const char *argv[] = { "aomenc", "-o", "out", "input", "-p", "2" };
+ int argc = 6;
+ const char version[] = "1.0.0";
+ const std::string expected("version:1.0.0 -p 2");
+ char *result = extract_encoder_settings(version, argv, argc, "input");
+ ASSERT_EQ(expected, std::string(result));
+ free(result);
+
+ const char *argv2[] = { "aomenc", "input", "-o", "out", "-p", "2" };
+ result = extract_encoder_settings(version, argv2, argc, "input");
+ ASSERT_EQ(expected, std::string(result));
+ free(result);
+}
+
+#endif // CONFIG_WEBM_IO
+} // namespace
diff --git a/third_party/aom/test/wiener_test.cc b/third_party/aom/test/wiener_test.cc
new file mode 100644
index 0000000000..7eb6372aaa
--- /dev/null
+++ b/third_party/aom/test/wiener_test.cc
@@ -0,0 +1,1390 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+
+// 8-bit-depth tests
+namespace wiener_lowbd {
+
+// C implementation of the algorithm implemented by the SIMD code.
+// This is a little more efficient than the version in av1_compute_stats_c().
+//
+// Accumulates raw (un-centred) sums over the window
+// [h_start, h_end) x [v_start, v_end) and then applies the mean correction in
+// a final pass, writing wiener_win^2 entries to M and wiener_win^4 entries to
+// H. `d` and `s` are accepted only to match the compute_stats_Func signature
+// and are not used. When `use_downsampled_wiener_stats` is nonzero, only every
+// WIENER_STATS_DOWNSAMPLE_FACTOR-th row is visited and its contribution is
+// scaled by the number of rows it stands for.
+static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *d, int16_t *s,
+ int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride,
+ int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ (void)d;
+ (void)s;
+ int i, j, k, l, m, n;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ // 64-bit totals across all rows. Each H_int row is laid out with a fixed
+ // pitch of 8 entries per window row (indexed as m * 8 + n below).
+ std::vector<std::vector<int64_t> > M_int(wiener_win,
+ std::vector<int64_t>(wiener_win, 0));
+ std::vector<std::vector<int64_t> > H_int(
+ wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+ std::vector<std::vector<int32_t> > sumY(wiener_win,
+ std::vector<int32_t>(wiener_win, 0));
+ int32_t sumX = 0;
+ // Top-left corner of the filter window centred on dgd[0].
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ for (i = v_start; i < v_end; i = i + downsample_factor) {
+ // Near the bottom edge fewer than a full group of rows may remain; shrink
+ // the factor so this row is weighted by the rows actually left.
+ if (use_downsampled_wiener_stats &&
+ (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = v_end - i;
+ }
+ // Per-row 32-bit accumulators, folded into the 64-bit totals after the row.
+ int32_t sumX_row_i32 = 0;
+ std::vector<std::vector<int32_t> > sumY_row(
+ wiener_win, std::vector<int32_t>(wiener_win, 0));
+ std::vector<std::vector<int32_t> > M_row_i32(
+ wiener_win, std::vector<int32_t>(wiener_win, 0));
+ std::vector<std::vector<int32_t> > H_row_i32(
+ wiener_win * wiener_win, std::vector<int32_t>(wiener_win * 8, 0));
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint8_t X1 = src[i * src_stride + j];
+ const uint8_t X2 = src[i * src_stride + j + 1];
+ sumX_row_i32 += X1 + X2;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int32_t *H_int_temp = &H_row_i32[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ const uint8_t D2 = dgd_ijkl[1];
+ sumY_row[k][l] += D1 + D2;
+ M_row_i32[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ // (j still holds the value it had when the pair loop above ended).
+ if (has_odd_pixel) {
+ const uint8_t X1 = src[i * src_stride + j];
+ sumX_row_i32 += X1;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int32_t *H_int_temp = &H_row_i32[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ sumY_row[k][l] += D1;
+ M_row_i32[l][k] += D1 * X1;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m];
+ }
+ }
+ }
+ }
+ }
+
+ sumX += sumX_row_i32 * downsample_factor;
+ // Scale M matrix based on the downsampling factor
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ sumY[k][l] += sumY_row[k][l] * downsample_factor;
+ M_int[k][l] += (int64_t)M_row_i32[k][l] * downsample_factor;
+ }
+ }
+ // Scale H matrix based on the downsampling factor
+ for (k = 0; k < wiener_win * wiener_win; ++k) {
+ for (l = 0; l < wiener_win * 8; ++l) {
+ H_int[k][l] += (int64_t)H_row_i32[k][l] * downsample_factor;
+ }
+ }
+ }
+
+ // Final pass: turn the raw sums into mean-removed statistics using
+ // avg^2 * pixel_count and the per-tap sums collected above.
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ M_int[l][k] + avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ // H_int is read back as [n * 8 + m] although it was accumulated as
+ // [m * 8 + n]; the inner indices are swapped on output.
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+ (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+// Dispatches to the optimized C reference for the two supported window sizes
+// and to av1_compute_stats_c() for any other window size.
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+                         int16_t *d, int16_t *s, int h_start, int h_end,
+                         int v_start, int v_end, int dgd_stride, int src_stride,
+                         int64_t *M, int64_t *H,
+                         int use_downsampled_wiener_stats) {
+  const bool has_opt_path =
+      wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA;
+  if (!has_opt_path) {
+    av1_compute_stats_c(wiener_win, dgd, src, d, s, h_start, h_end, v_start,
+                        v_end, dgd_stride, src_stride, M, H,
+                        use_downsampled_wiener_stats);
+    return;
+  }
+  compute_stats_win_opt_c(wiener_win, dgd, src, d, s, h_start, h_end, v_start,
+                          v_end, dgd_stride, src_stride, M, H,
+                          use_downsampled_wiener_stats);
+}
+
+// Number of randomized iterations used when a test is run with run_times == 1.
+static const int kIterations = 100;
+// Signature shared by the reference and the implementations under test; the
+// test parameter is a pointer of this type.
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats);
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+// Single-element tuple carrying the function under test.
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+// Fixture for the 8-bit compute_stats tests. Owns the source and degraded
+// pixel buffers plus one scratch buffer of 16-bit values, and holds the
+// function under test taken from the test parameter.
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  // Allocates and zero-fills the buffers. Each allocation is checked with a
+  // fatal assertion, so a failure leaves the later pointers unallocated.
+  void SetUp() override {
+    src_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    ASSERT_NE(src_buf, nullptr);
+    dgd_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    ASSERT_NE(dgd_buf, nullptr);
+    const size_t buf_size =
+        sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+    buf = (int16_t *)aom_memalign(32, buf_size);
+    ASSERT_NE(buf, nullptr);
+    memset(buf, 0, buf_size);
+    target_func_ = GET_PARAM(0);
+  }
+  // Releases the buffers. GoogleTest runs TearDown() even when SetUp() hit a
+  // fatal failure, so the pointers must hold a well-defined value (nullptr)
+  // when allocation never happened.
+  // NOTE(review): relies on aom_free() accepting a null pointer - confirm.
+  void TearDown() override {
+    aom_free(src_buf);
+    src_buf = nullptr;
+    aom_free(dgd_buf);
+    dgd_buf = nullptr;
+    aom_free(buf);
+    buf = nullptr;
+  }
+  // Compares target_func_ against av1_compute_stats_c() on random pixel data.
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times);
+  // Same comparison on alternating 0/255 pixel data.
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+  compute_stats_Func target_func_ = nullptr;
+  libaom_test::ACMRandom rng_;
+  // Default-initialized so TearDown() never frees an indeterminate pointer.
+  uint8_t *src_buf = nullptr;
+  uint8_t *dgd_buf = nullptr;
+  int16_t *buf = nullptr;
+};
+
+// Runs av1_compute_stats_c() and target_func_ on the same random pixel data
+// and window, with and without downsampled statistics, and fails the test on
+// the first mismatching M or H entry.
+//
+// wiener_win: filter window size passed through to both implementations.
+// run_times:  1 for a correctness run (kIterations random fills, random window
+//             extent); any other value fixes h_end/v_end at 256 and repeats
+//             each call run_times times so the elapsed times can be printed.
+void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  // Note(rachelbarker):
+  // The SIMD code requires `h_start` to be even, but can otherwise
+  // deal with any values of `h_end`, `v_start`, `v_end`. We cover this
+  // entire range, even though (at the time of writing) `h_start` and `v_start`
+  // will always be multiples of 64 when called from non-test code.
+  // If in future any new requirements are added, these lines will
+  // need changing.
+  int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+  int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  if (h_start > h_end) std::swap(h_start, h_end);
+  // The swap can move an odd `h_end` value into `h_start`. Round it back down
+  // so the even-`h_start` requirement above still holds; this can only widen
+  // the window, so `h_start <= h_end` is preserved.
+  h_start &= ~1;
+  int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+  int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  if (v_start > v_end) std::swap(v_start, v_end);
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  const int max_value_downsample_stats = 1;
+  // The scratch buffer is split in half between the two averaged planes.
+  int16_t *dgd_avg = buf;
+  int16_t *src_avg =
+      buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
+
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand8();
+      src_buf[i] = rng_.Rand8();
+    }
+    // Offset by half a window so reads above/left of the block stay inside
+    // the allocation.
+    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+    uint8_t *src = src_buf;
+    for (int use_downsampled_stats = 0;
+         use_downsampled_stats <= max_value_downsample_stats;
+         use_downsampled_stats++) {
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < run_times; ++i) {
+        av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start,
+                            h_end, v_start, v_end, dgd_stride, src_stride,
+                            M_ref, H_ref, use_downsampled_stats);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < run_times; ++i) {
+        target_func_(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+                     v_start, v_end, dgd_stride, src_stride, M_test, H_test,
+                     use_downsampled_stats);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+               time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
+      // Report only the first mismatch in each matrix to keep the log short.
+      int failed = 0;
+      for (int i = 0; i < wiener_win2; ++i) {
+        if (M_ref[i] != M_test[i]) {
+          failed = 1;
+          printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, M_ref[i], M_test[i]);
+          break;
+        }
+      }
+      for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+        if (H_ref[i] != H_test[i]) {
+          failed = 1;
+          printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+                 wiener_win, iter, i, H_ref[i], H_test[i]);
+          break;
+        }
+      }
+      ASSERT_EQ(failed, 0);
+    }
+  }
+}
+
+// Checks target_func_ against av1_compute_stats_c() on a fixed window filled
+// with alternating 0 / 255 pixels, with and without downsampled statistics.
+void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
+  const int32_t half_win = wiener_win >> 1;
+  const int32_t num_m = wiener_win * wiener_win;
+  const int32_t num_h = num_m * num_m;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int v_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int dgd_stride = h_end;
+  const int iters = 1;
+  const int max_value_downsample_stats = 1;
+  int16_t *const dgd_avg = buf;
+  int16_t *const src_avg =
+      buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX);
+
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Alternate between the two extreme pixel values so that every sample is
+    // as far from the average as possible.
+    for (int pos = 0; pos < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++pos) {
+      const uint8_t extreme = (pos & 1) ? 255 : 0;
+      dgd_buf[pos] = extreme;
+      src_buf[pos] = extreme;
+    }
+    uint8_t *src = src_buf;
+    uint8_t *dgd = dgd_buf + half_win * MAX_DATA_BLOCK + half_win;
+    for (int downsampled = 0; downsampled <= max_value_downsample_stats;
+         downsampled++) {
+      av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start,
+                          h_end, v_start, v_end, dgd_stride, src_stride, M_ref,
+                          H_ref, downsampled);
+
+      target_func_(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+                   v_start, v_end, dgd_stride, src_stride, M_test, H_test,
+                   downsampled);
+
+      int failed = 0;
+      // Advance to the first mismatching M entry, if any, and report it.
+      int m_idx = 0;
+      while (m_idx < num_m && M_ref[m_idx] == M_test[m_idx]) ++m_idx;
+      if (m_idx < num_m) {
+        failed = 1;
+        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, m_idx, M_ref[m_idx], M_test[m_idx]);
+      }
+      // Same for H.
+      int h_idx = 0;
+      while (h_idx < num_h && H_ref[h_idx] == H_test[h_idx]) ++h_idx;
+      if (h_idx < num_h) {
+        failed = 1;
+        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, h_idx, H_ref[h_idx], H_test[h_idx]);
+      }
+      ASSERT_EQ(failed, 0);
+    }
+  }
+}
+
+// Correctness run on random data for both window sizes, in this order.
+TEST_P(WienerTest, RandomValues) {
+  for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+    RunWienerTest(win, 1);
+  }
+}
+
+// Extreme-value run for both window sizes, in this order.
+TEST_P(WienerTest, ExtremeValues) {
+  for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+    RunWienerTest_ExtremeValues(win);
+  }
+}
+
+// Timing run (200 repetitions per call) for both window sizes, in this order.
+TEST_P(WienerTest, DISABLED_Speed) {
+  const int kRunTimes = 200;
+  for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+    RunWienerTest(win, kRunTimes);
+  }
+}
+
+// Instantiates WienerTest for the optimized C reference defined in this file.
+INSTANTIATE_TEST_SUITE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+// One additional instantiation per SIMD implementation, each compiled in only
+// when the matching HAVE_* configuration macro is nonzero.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTest,
+ ::testing::Values(av1_compute_stats_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
+ ::testing::Values(av1_compute_stats_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTest,
+ ::testing::Values(av1_compute_stats_neon));
+#endif // HAVE_NEON
+
+} // namespace wiener_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth tests:
+namespace wiener_highbd {
+
+// High bit-depth counterpart of the optimized C reference: accumulates raw
+// sums over [h_start, h_end) x [v_start, v_end), then applies the mean
+// correction and a bit-depth dependent divisor, writing wiener_win^2 entries
+// to M and wiener_win^4 entries to H. `dgd8` / `src8` are converted to 16-bit
+// pointers with CONVERT_TO_SHORTPTR before use.
+static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride,
+ int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ int i, j, k, l, m, n;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ // 64-bit accumulators. Each H_int row uses a fixed pitch of 8 entries per
+ // window row (indexed as m * 8 + n below).
+ std::vector<std::vector<int64_t> > M_int(wiener_win,
+ std::vector<int64_t>(wiener_win, 0));
+ std::vector<std::vector<int64_t> > H_int(
+ wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+ std::vector<std::vector<int32_t> > sumY(wiener_win,
+ std::vector<int32_t>(wiener_win, 0));
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+ int64_t sumX = 0;
+ // Top-left corner of the filter window centred on dgd[0].
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ for (i = v_start; i < v_end; i++) {
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[i * src_stride + j];
+ const uint16_t X2 = src[i * src_stride + j + 1];
+ sumX += X1 + X2;
+
+ const uint16_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint16_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijkl[0];
+ const uint16_t D2 = dgd_ijkl[1];
+ sumY[k][l] += D1 + D2;
+ M_int[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ // (j still holds the value it had when the pair loop above ended).
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[i * src_stride + j];
+ sumX += X1;
+
+ const uint16_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint16_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijkl[0];
+ sumY[k][l] += D1;
+ M_int[l][k] += D1 * X1;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Results are divided by 16 for 12-bit input, 4 for 10-bit input and 1
+ // otherwise.
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ // Final pass: mean-correct the raw sums, then apply the divisor.
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ (M_int[l][k] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ // H_int is read back as [n * 8 + m] although it was accumulated as
+ // [m * 8 + n]; the inner indices are swapped on output.
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ (H_int[(l * wiener_win + k)][n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+// Dispatches to the optimized high bit-depth C reference for the two supported
+// window sizes and to av1_compute_stats_highbd_c() for any other window size.
+void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd,
+                                const uint8_t *src, int h_start, int h_end,
+                                int v_start, int v_end, int dgd_stride,
+                                int src_stride, int64_t *M, int64_t *H,
+                                aom_bit_depth_t bit_depth) {
+  const bool has_opt_path =
+      wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA;
+  if (!has_opt_path) {
+    av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M, H, bit_depth);
+    return;
+  }
+  compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                                 v_end, dgd_stride, src_stride, M, H,
+                                 bit_depth);
+}
+
+// Number of randomized iterations used when a test is run with run_times == 1.
+static const int kIterations = 100;
+// Signature shared by the high bit-depth reference and the implementations
+// under test; the test parameter is a pointer of this type.
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth);
+
+// Single-element tuple carrying the function under test.
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+// Fixture for the high bit-depth compute_stats tests. Owns the 16-bit source
+// and degraded pixel buffers and holds the function under test taken from the
+// test parameter.
+class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  // Allocates the buffers. Each allocation is checked with a fatal assertion,
+  // so a failure leaves the later pointer unallocated.
+  void SetUp() override {
+    src_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    ASSERT_NE(src_buf, nullptr);
+    dgd_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    ASSERT_NE(dgd_buf, nullptr);
+    target_func_ = GET_PARAM(0);
+  }
+  // Releases the buffers. GoogleTest runs TearDown() even when SetUp() hit a
+  // fatal failure, so the pointers must hold a well-defined value (nullptr)
+  // when allocation never happened.
+  // NOTE(review): relies on aom_free() accepting a null pointer - confirm.
+  void TearDown() override {
+    aom_free(src_buf);
+    src_buf = nullptr;
+    aom_free(dgd_buf);
+    dgd_buf = nullptr;
+  }
+  // Compares target_func_ against av1_compute_stats_highbd_c() on random
+  // pixel data limited to `bit_depth` bits.
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times,
+                     aom_bit_depth_t bit_depth);
+  // Same comparison on pixel data alternating between 0 and the maximum value
+  // for `bit_depth`.
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                   aom_bit_depth_t bit_depth);
+
+ private:
+  compute_stats_Func target_func_ = nullptr;
+  libaom_test::ACMRandom rng_;
+  // Default-initialized so TearDown() never frees an indeterminate pointer.
+  uint16_t *src_buf = nullptr;
+  uint16_t *dgd_buf = nullptr;
+};
+
+// Runs av1_compute_stats_highbd_c() and target_func_ on the same random pixel
+// data and window and fails the test on the first mismatching M or H entry.
+//
+// wiener_win: filter window size passed through to both implementations.
+// run_times:  1 for a correctness run (kIterations random fills, random window
+//             extent); any other value fixes h_end/v_end at 256 and repeats
+//             each call run_times times so the elapsed times can be printed.
+// bit_depth:  pixel values are drawn from [0, 1 << bit_depth).
+void WienerTestHighbd::RunWienerTest(const int32_t wiener_win,
+                                     int32_t run_times,
+                                     aom_bit_depth_t bit_depth) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  // Note(rachelbarker):
+  // The SIMD code requires `h_start` to be even, but can otherwise
+  // deal with any values of `h_end`, `v_start`, `v_end`. We cover this
+  // entire range, even though (at the time of writing) `h_start` and `v_start`
+  // will always be multiples of 64 when called from non-test code.
+  // If in future any new requirements are added, these lines will
+  // need changing.
+  int h_start = (rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & ~1;
+  int h_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  if (h_start > h_end) std::swap(h_start, h_end);
+  // The swap can move an odd `h_end` value into `h_start`. Round it back down
+  // so the even-`h_start` requirement above still holds; this can only widen
+  // the window, so `h_start <= h_end` is preserved.
+  h_start &= ~1;
+  int v_start = rng_.Rand16() % (MAX_WIENER_BLOCK / 2);
+  int v_end = run_times != 1 ? 256 : (rng_.Rand16() % MAX_WIENER_BLOCK);
+  if (v_start > v_end) std::swap(v_start, v_end);
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand16() % (1 << bit_depth);
+      src_buf[i] = rng_.Rand16() % (1 << bit_depth);
+    }
+    // Offset by half a window so reads above/left of the block stay inside
+    // the allocation.
+    const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
+        dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end,
+                                 v_start, v_end, dgd_stride, src_stride, M_ref,
+                                 H_ref, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("win %d bd %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, bit_depth,
+             h_end, v_end, time1, time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    // Report only the first mismatch in each matrix to keep the log short.
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+// Checks target_func_ against av1_compute_stats_highbd_c() on a fixed window
+// filled with pixels alternating between 0 and (1 << bit_depth) - 1.
+void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                                   aom_bit_depth_t bit_depth) {
+  const int32_t half_win = wiener_win >> 1;
+  const int32_t num_m = wiener_win * wiener_win;
+  const int32_t num_h = num_m * num_m;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int v_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int dgd_stride = h_end;
+  const int iters = 1;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    // Alternate between the two extreme pixel values so that every sample is
+    // as far from the average as possible.
+    for (int pos = 0; pos < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++pos) {
+      const uint16_t extreme =
+          (pos & 1) ? static_cast<uint16_t>(((uint16_t)1 << bit_depth) - 1) : 0;
+      dgd_buf[pos] = extreme;
+      src_buf[pos] = extreme;
+    }
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+    const uint8_t *dgd8 =
+        CONVERT_TO_BYTEPTR(dgd_buf + half_win * MAX_DATA_BLOCK + half_win);
+
+    av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M_ref, H_ref,
+                               bit_depth);
+
+    target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                 dgd_stride, src_stride, M_test, H_test, bit_depth);
+
+    int failed = 0;
+    // Advance to the first mismatching M entry, if any, and report it.
+    int m_idx = 0;
+    while (m_idx < num_m && M_ref[m_idx] == M_test[m_idx]) ++m_idx;
+    if (m_idx < num_m) {
+      failed = 1;
+      printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+             " \n",
+             wiener_win, bit_depth, iter, m_idx, M_ref[m_idx], M_test[m_idx]);
+    }
+    // Same for H.
+    int h_idx = 0;
+    while (h_idx < num_h && H_ref[h_idx] == H_test[h_idx]) ++h_idx;
+    if (h_idx < num_h) {
+      failed = 1;
+      printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+             " \n",
+             wiener_win, bit_depth, iter, h_idx, H_ref[h_idx], H_test[h_idx]);
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+// Correctness run on random data: for each bit depth (8, 10, 12) both window
+// sizes are exercised, luma-sized window first.
+TEST_P(WienerTestHighbd, RandomValues) {
+  for (const aom_bit_depth_t bd : { AOM_BITS_8, AOM_BITS_10, AOM_BITS_12 }) {
+    for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+      RunWienerTest(win, 1, bd);
+    }
+  }
+}
+
+// Extreme-value run: for each bit depth (8, 10, 12) both window sizes are
+// exercised, luma-sized window first.
+TEST_P(WienerTestHighbd, ExtremeValues) {
+  for (const aom_bit_depth_t bd : { AOM_BITS_8, AOM_BITS_10, AOM_BITS_12 }) {
+    for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+      RunWienerTest_ExtremeValues(win, bd);
+    }
+  }
+}
+
+// Timing run (200 repetitions per call): for each bit depth (8, 10, 12) both
+// window sizes are exercised, luma-sized window first.
+TEST_P(WienerTestHighbd, DISABLED_Speed) {
+  const int kRunTimes = 200;
+  for (const aom_bit_depth_t bd : { AOM_BITS_8, AOM_BITS_10, AOM_BITS_12 }) {
+    for (const int win : { WIENER_WIN, WIENER_WIN_CHROMA }) {
+      RunWienerTest(win, kRunTimes, bd);
+    }
+  }
+}
+
+// Instantiates WienerTestHighbd for the optimized C reference defined in this
+// file.
+INSTANTIATE_TEST_SUITE_P(C, WienerTestHighbd,
+ ::testing::Values(compute_stats_highbd_opt_c));
+
+// One additional instantiation per SIMD implementation, each compiled in only
+// when the matching HAVE_* configuration macro is nonzero.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTestHighbd,
+ ::testing::Values(av1_compute_stats_highbd_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTestHighbd,
+ ::testing::Values(av1_compute_stats_highbd_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerTestHighbd,
+ ::testing::Values(av1_compute_stats_highbd_neon));
+#endif // HAVE_NEON
+
+// A test that reproduces b/274668506: signed integer overflow in
+// update_a_sep_sym().
+//
+// Encodes a single 427x1 10-bit 4:4:4 frame whose samples alternate between 0
+// and 1023, using the all-intra configuration set up below, and expects exactly
+// one key-frame packet followed by an empty flush.
+TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateASepSym) {
+  constexpr int kWidth = 427;
+  constexpr int kHeight = 1;
+  std::vector<uint16_t> buffer(3 * kWidth * kHeight);
+  // The values in the buffer alternate between 0 and 1023.
+  uint16_t value = 0;
+  for (size_t i = 0; i < buffer.size(); ++i) {
+    buffer[i] = value;
+    value = 1023 - value;
+  }
+  unsigned char *img_data = reinterpret_cast<unsigned char *>(buffer.data());
+
+  // Setup failures are fatal: continuing with an image descriptor, config or
+  // codec context that was never filled in would only exercise garbage.
+  aom_image_t img;
+  ASSERT_EQ(
+      aom_img_wrap(&img, AOM_IMG_FMT_I44416, kWidth, kHeight, 1, img_data),
+      &img);
+  img.cp = AOM_CICP_CP_UNSPECIFIED;
+  img.tc = AOM_CICP_TC_UNSPECIFIED;
+  img.mc = AOM_CICP_MC_UNSPECIFIED;
+  img.range = AOM_CR_FULL_RANGE;
+
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+            AOM_CODEC_OK);
+  cfg.rc_end_usage = AOM_Q;
+  cfg.g_profile = 1;
+  cfg.g_bit_depth = AOM_BITS_10;
+  cfg.g_input_bit_depth = 10;
+  cfg.g_w = kWidth;
+  cfg.g_h = kHeight;
+  cfg.g_limit = 1;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_DISABLED;
+  cfg.kf_max_dist = 0;
+  cfg.g_threads = 61;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 20;
+  aom_codec_ctx_t enc;
+  ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 11), AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_ROW_MT, 1), AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 4), AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 3), AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SKIP_POSTPROC_FILTERING, 1),
+            AOM_CODEC_OK);
+  EXPECT_EQ(aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM),
+            AOM_CODEC_OK);
+
+  // Encode frame
+  EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+  aom_codec_iter_t iter = nullptr;
+  const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+  ASSERT_NE(pkt, nullptr);
+  EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+  // pkt->data.frame.flags is 0x1f0011.
+  EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  // Flush encoder
+  EXPECT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK);
+  iter = nullptr;
+  pkt = aom_codec_get_cx_data(&enc, &iter);
+  EXPECT_EQ(pkt, nullptr);
+
+  EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// A test that reproduces b/281219978: signed integer overflow in
+// update_b_sep_sym().
+TEST(SearchWienerTest, 12bitSignedIntegerOverflowInUpdateBSepSym) {
+ constexpr int kWidth = 311;
+ constexpr int kHeight = 3;
+ static const uint16_t buffer[3 * kWidth * kHeight] = {
+ // Y plane:
+ 0, 0, 0, 2156, 2513, 2211, 4095, 4095, 0, 2538, 0, 0, 0, 0, 4095, 0, 258,
+ 941, 4095, 907, 0, 0, 2325, 2485, 2408, 4095, 1513, 0, 3644, 2080, 4095,
+ 4095, 0, 2135, 0, 2461, 4095, 0, 4095, 4095, 0, 1987, 0, 3629, 0, 4095,
+ 3918, 4095, 0, 4095, 4095, 4095, 0, 1065, 0, 2072, 3597, 102, 0, 534, 0, 0,
+ 0, 4095, 0, 0, 4095, 0, 4095, 0, 4095, 0, 3611, 0, 1139, 4095, 0, 0, 0, 0,
+ 0, 4095, 0, 0, 0, 0, 4095, 4095, 4095, 0, 0, 0, 3070, 3224, 0, 0, 4095,
+ 4051, 4095, 0, 4095, 3712, 0, 1465, 4095, 1699, 4095, 4095, 0, 0, 0, 3885,
+ 0, 4095, 0, 0, 4095, 1686, 4095, 4095, 4095, 4095, 1330, 0, 0, 0, 4095, 0,
+ 4095, 4095, 3919, 4095, 781, 2371, 2055, 4095, 912, 3710, 0, 2045, 0, 4095,
+ 4095, 4095, 1811, 0, 1298, 1115, 0, 3327, 0, 0, 4095, 0, 253, 2386, 4095,
+ 1791, 3657, 1444, 0, 4095, 1918, 4095, 4095, 0, 4095, 305, 1587, 0, 4095, 0,
+ 3759, 0, 0, 4095, 2387, 4095, 4095, 0, 0, 4095, 4095, 0, 1015, 4095, 0, 768,
+ 2598, 1667, 130, 4095, 0, 0, 435, 4095, 3683, 4095, 0, 4095, 4095, 1888,
+ 2828, 4095, 3349, 0, 4095, 4095, 4095, 4095, 0, 4095, 0, 0, 4095, 0, 2491,
+ 1598, 0, 0, 383, 3712, 4095, 0, 0, 4095, 760, 4095, 4095, 4095, 2030, 4095,
+ 0, 0, 3236, 0, 1040, 0, 0, 4095, 0, 0, 4095, 4095, 4095, 0, 0, 1043, 3897,
+ 2446, 233, 1589, 427, 4095, 4095, 4095, 4095, 0, 1656, 3786, 4095, 0, 840,
+ 4095, 4095, 1429, 4095, 0, 4095, 2734, 4095, 0, 2431, 1801, 278, 0, 4095, 0,
+ 4095, 0, 0, 420, 0, 0, 746, 0, 0, 3281, 3006, 4095, 4095, 0, 0, 0, 3605,
+ 4095, 4095, 0, 4095, 4095, 4095, 4095, 2660, 496, 4095, 0, 0, 0, 0, 4095, 0,
+ 1317, 4095, 4095, 510, 1919, 0, 3893, 0, 4095, 4095, 4095, 4095, 4095, 2071,
+ 2006, 0, 3316, 4095, 0, 0, 4095, 852, 2982, 0, 2073, 0, 2728, 1499, 4095,
+ 852, 361, 3137, 4095, 4095, 1502, 1575, 0, 4095, 0, 0, 0, 0, 1585, 4095, 0,
+ 4095, 0, 3188, 3244, 4095, 2958, 4095, 4095, 0, 4095, 4095, 4095, 1706,
+ 2896, 4095, 1788, 730, 1146, 4095, 0, 0, 4095, 0, 0, 0, 2791, 3613, 2175,
+ 2925, 0, 0, 0, 0, 0, 1279, 4095, 4095, 0, 4095, 0, 0, 2336, 0, 3462, 4095,
+ 0, 4095, 1997, 2328, 2860, 0, 4095, 4095, 3241, 4095, 4095, 4095, 4095,
+ 4095, 4095, 118, 0, 4095, 4095, 4095, 0, 3734, 0, 0, 0, 4095, 1952, 4095,
+ 413, 4095, 1183, 4095, 0, 4095, 0, 0, 4095, 4095, 4095, 3805, 0, 1398, 0,
+ 4095, 0, 0, 0, 4095, 4095, 4095, 2802, 3658, 4095, 4095, 0, 0, 0, 4095, 0,
+ 897, 0, 4095, 2163, 0, 0, 0, 4095, 1440, 2487, 4095, 4095, 0, 4095, 4095,
+ 4095, 2808, 0, 1999, 0, 0, 4095, 4095, 4095, 1563, 124, 2179, 754, 0, 0,
+ 2407, 2798, 0, 4095, 4095, 0, 0, 1929, 0, 0, 0, 1387, 4095, 4095, 0, 0,
+ 3911, 562, 4095, 0, 4095, 2639, 2673, 4095, 4095, 0, 0, 4095, 4095, 0, 4095,
+ 4095, 901, 0, 321, 3961, 4095, 0, 4095, 4095, 4095, 0, 0, 0, 0, 3035, 3713,
+ 3441, 0, 4095, 0, 0, 854, 1544, 3963, 1968, 4095, 0, 0, 0, 0, 2897, 4095, 0,
+ 4095, 4095, 0, 235, 1011, 4095, 0, 3452, 4095, 4095, 0, 0, 4095, 4095, 4095,
+ 4095, 4095, 3312, 0, 3064, 4095, 3981, 4095, 4095, 4095, 4095, 4095, 0, 791,
+ 3243, 4095, 799, 0, 0, 0, 523, 2117, 3776, 0, 4095, 3311, 0, 543, 4095,
+ 4095, 4095, 0, 0, 4095, 4095, 4095, 4095, 0, 0, 4095, 4095, 225, 0, 1195,
+ 3070, 1210, 4095, 0, 4095, 498, 782, 0, 0, 4095, 4095, 4095, 4095, 4095,
+ 1456, 4095, 3898, 1472, 4095, 4095, 0, 4095, 4026, 0, 0, 2354, 1554, 0,
+ 4095, 0, 2986, 0, 1053, 1228, 0, 0, 4095, 4095, 0, 0, 4095, 0, 0, 4095, 0,
+ 0, 0, 606, 0, 4095, 3563, 4095, 2016, 4095, 0, 0, 4095, 0, 4095, 4095, 4095,
+ 0, 0, 0, 929, 0, 0, 4095, 0, 3069, 4095, 0, 2687, 4095, 4095, 4095, 2015,
+ 4095, 4095, 4095, 0, 4095, 0, 0, 2860, 3668, 0, 0, 4095, 2523, 2104, 0, 0,
+ 3063, 4095, 3674, 4095, 0, 2762, 0, 4095, 2582, 3473, 930, 0, 1012, 108, 38,
+ 4095, 1148, 3568, 4036, 4095, 4095, 0, 1120, 1873, 3028, 4095, 515, 1902,
+ 4095, 0, 815, 4095, 1548, 0, 1073, 3919, 4095, 2374, 0, 3126, 4095, 2268, 0,
+ 0, 0, 4095, 425, 4095, 0, 0, 4095, 4095, 2710, 4095, 2067, 4095, 4095, 2201,
+ 4095, 4095, 0, 4095, 4095, 2933, 0, 417, 2801, 4095, 4095, 3274, 0, 2870,
+ 4095, 4095, 0, 0, 973, 0, 0, 3129, 4095, 0, 0, 0, 4095, 4095, 4095, 0, 242,
+ 4095, 0, 4095, 0, 0, 0, 0, 987, 0, 2426, 4045, 2780, 0, 4095, 3762, 3361,
+ 3095, 4095, 596, 1072, 4071, 4095, 4095, 0, 0, 81, 0, 1001, 1683, 4095,
+ 4095, 3105, 2673, 0, 3300, 104, 4030, 0, 2615, 4095, 4095, 0, 4095, 1830,
+ 3917, 4095, 4095, 4095, 0, 4095, 3637, 0, 4095, 4095, 3677, 4095, 4095, 0,
+ 880, 4095, 4095, 0, 2797, 0, 0, 0, 0, 3225, 4095, 4095, 1925, 2885, 1879, 0,
+ 0, 4095, 0, 0, 0, 2974, 559, 0, 0, 0, 699, 997, 1491, 423, 4012, 0, 2315,
+ 4095, 0, 0, 4095, 0, 836, 4095, 0, 4095, 0, 1752, 0, 0, 0, 4095, 4095, 0, 0,
+ 51, 4095, 350, 0, 2143, 2588, 0, 4095, 0, 4095, 0, 2757, 2370, 4095, 668,
+ 4095, 0, 4095, 0, 3652, 3890, 0, 4095, 0, 4095, 4095, 4095, 4095, 4095,
+ // U plane:
+ 4095, 4095, 1465, 0, 588, 4095, 0, 4095, 4095, 4095, 0, 2167, 4095, 4095,
+ 918, 3223, 4095, 4095, 0, 696, 4095, 4095, 0, 0, 594, 4095, 2935, 0, 0, 0,
+ 2036, 4095, 0, 2492, 4095, 4095, 0, 0, 0, 3883, 0, 4095, 483, 4095, 4095,
+ 324, 923, 0, 3079, 0, 4095, 4095, 810, 0, 3371, 4095, 4095, 0, 4095, 2756,
+ 0, 723, 0, 3338, 1084, 0, 4095, 4095, 3764, 0, 4095, 4095, 4095, 2323, 0,
+ 3693, 682, 0, 0, 909, 4095, 2348, 4095, 4095, 4095, 1509, 4095, 0, 4095,
+ 4095, 4095, 4095, 3977, 3652, 1580, 637, 4095, 0, 593, 4095, 1199, 1773,
+ 4095, 4095, 4095, 0, 3447, 0, 0, 4095, 3873, 0, 0, 2094, 0, 1195, 0, 3892,
+ 4095, 4095, 729, 4095, 0, 0, 4095, 449, 4095, 4095, 2900, 0, 4095, 0, 2114,
+ 4095, 4095, 4095, 1174, 995, 2933, 360, 0, 1970, 0, 4095, 1208, 0, 4095, 0,
+ 4095, 0, 4095, 4095, 0, 4095, 0, 0, 0, 1976, 0, 0, 921, 4095, 4095, 192,
+ 1006, 0, 0, 2725, 4095, 0, 2813, 0, 0, 2375, 4095, 1982, 0, 2725, 4095,
+ 1225, 3566, 4095, 0, 344, 863, 2747, 0, 4095, 4095, 1928, 4095, 4095, 0,
+ 3640, 0, 1744, 3191, 4095, 4095, 0, 4095, 4095, 4095, 0, 0, 748, 4095, 0,
+ 2609, 0, 0, 0, 0, 0, 3508, 4095, 4095, 2463, 0, 4095, 0, 4095, 4095, 4095,
+ 3175, 419, 2193, 0, 0, 4095, 0, 0, 4095, 4051, 2159, 4095, 4095, 2262, 379,
+ 4095, 0, 0, 3399, 4095, 4095, 4095, 3769, 2510, 4054, 3336, 730, 3968, 0, 0,
+ 3354, 0, 1822, 0, 4095, 0, 3847, 3823, 3262, 0, 0, 2936, 0, 4095, 4095,
+ 2120, 0, 3147, 0, 2838, 3480, 474, 1194, 4095, 4095, 2820, 4095, 0, 4095,
+ 1882, 4095, 1085, 0, 4095, 2234, 3371, 4095, 0, 4095, 0, 0, 0, 2586, 4095,
+ 4095, 4095, 4095, 0, 3818, 1401, 2273, 4095, 0, 4095, 0, 3907, 4095, 4095,
+ 694, 0, 4066, 4095, 0, 0, 4095, 2116, 4095, 4095, 4095, 4095, 4095, 0, 2821,
+ 29, 0, 0, 663, 1711, 652, 1271, 4095, 4095, 2401, 3726, 4095, 3453, 1803,
+ 3614, 0, 4095, 3439, 4095, 0, 4095, 0, 816, 0, 0, 4095, 4095, 2635, 0, 1918,
+ 0, 2663, 381, 0, 0, 3670, 0, 4095, 3065, 965, 4095, 4095, 4095, 2993, 4095,
+ 4095, 0, 4095, 973, 4095, 0, 4095, 4095, 0, 3071, 0, 2777, 4095, 4095, 0,
+ 3996, 4095, 1637, 0, 4095, 67, 3784, 0, 0, 4095, 2603, 579, 4095, 4095,
+ 2854, 4095, 3016, 0, 4095, 0, 0, 4095, 4095, 4095, 4095, 3998, 3023, 4095,
+ 4095, 0, 0, 0, 4095, 4095, 4095, 4095, 0, 0, 2623, 1308, 55, 4095, 0, 0,
+ 2554, 2311, 0, 4095, 4095, 4095, 1134, 2112, 0, 4095, 4095, 0, 4095, 0, 645,
+ 0, 0, 4095, 0, 909, 0, 0, 1719, 4095, 0, 3542, 0, 575, 0, 4095, 4095, 4095,
+ 3428, 1172, 481, 1521, 4095, 3199, 1265, 4095, 3518, 4017, 4095, 760, 2042,
+ 3986, 0, 4095, 42, 4095, 0, 4095, 4095, 4095, 4095, 2235, 346, 3865, 0,
+ 4095, 4095, 4095, 4095, 4095, 4095, 845, 4095, 0, 2826, 4095, 4095, 0, 0,
+ 335, 1614, 1465, 0, 4095, 4095, 0, 2771, 4095, 0, 2810, 4095, 4095, 0, 1254,
+ 4095, 2589, 4095, 4095, 2252, 0, 0, 0, 4095, 0, 73, 4095, 4095, 0, 1341, 0,
+ 0, 0, 0, 4095, 0, 0, 2645, 1985, 492, 914, 3996, 4095, 4095, 4095, 0, 2383,
+ 2556, 433, 0, 4095, 1094, 4095, 4095, 642, 4095, 1722, 0, 3460, 4095, 4095,
+ 4095, 4095, 4095, 0, 154, 4095, 92, 4095, 0, 0, 0, 4095, 0, 4095, 4095, 444,
+ 0, 2925, 0, 0, 0, 0, 1628, 0, 4095, 1731, 2418, 697, 4095, 0, 2513, 4095, 0,
+ 4095, 4095, 4095, 4095, 4095, 0, 2510, 4095, 3850, 0, 0, 4095, 2480, 4095,
+ 4095, 2661, 4095, 0, 4095, 0, 0, 4095, 4095, 847, 4095, 4095, 3257, 443, 0,
+ 67, 0, 0, 0, 4095, 0, 0, 3073, 4095, 0, 4095, 0, 4095, 0, 4095, 1224, 4095,
+ 4095, 4095, 0, 4095, 958, 0, 4095, 0, 2327, 684, 0, 0, 0, 0, 4095, 4095, 0,
+ 3693, 795, 4095, 0, 621, 1592, 2314, 4095, 0, 928, 1897, 4095, 4095, 0,
+ 4095, 0, 0, 4095, 2619, 4095, 0, 4095, 0, 0, 4095, 2485, 4095, 4095, 0, 435,
+ 4095, 1818, 4095, 4095, 0, 0, 0, 4095, 4095, 4095, 4095, 0, 1671, 4095,
+ 4095, 0, 2617, 0, 2572, 0, 0, 4095, 3471, 0, 0, 4095, 2719, 3979, 1307, 0,
+ 0, 0, 0, 1794, 642, 447, 913, 4095, 3927, 0, 2686, 0, 0, 4095, 0, 857, 0,
+ 4095, 4095, 567, 2385, 0, 0, 4095, 893, 0, 289, 0, 0, 0, 4095, 4095, 2566,
+ 0, 1913, 0, 2350, 1033, 2764, 0, 4095, 0, 4095, 0, 0, 0, 0, 4095, 3952,
+ 3969, 0, 3476, 0, 4095, 4095, 393, 0, 2613, 0, 0, 1422, 0, 3359, 491, 3263,
+ 4095, 4095, 0, 0, 4095, 697, 3601, 4095, 0, 4095, 4095, 0, 4095, 0, 0, 4095,
+ 0, 4095, 4095, 4095, 2506, 0, 0, 1403, 0, 3836, 3976, 0, 4095, 4095, 4095,
+ 2497, 4095, 4095, 4095, 4095, 0, 4095, 3317, 4095, 4095, 4095, 0, 0, 1131,
+ 0, 0, 0, 4095, 0, 0, 4095, 0, 0, 2988, 4095, 4095, 2711, 2487, 1335, 0, 0,
+ 0, 4095, 261, 4095, 86, 0, 0, 1138, 4095, 0, 0, 4095, 4095, 0, 0, 0, 334, 0,
+ 2395, 3297, 4095, 1698, 4095, 1791, 1341, 0, 3559, 0, 4095, 0, 2056, 3238,
+ 3310, 4095, 4095, 779, 2129, 2849, 4095, 2622, 1051, 0, 0, 1282, 4095, 1246,
+ 0, 0, 3696, 4095, 556, 0, 0, 3463, 2658, 3572, 4095, 3982, 4095, 4095, 0, 0,
+ 4053, 4095, 4095, 4095, 2162, 2567, 1621, 4095, 4095, 1522, 293, 4095, 0, 0,
+ 1976, 4095, 3089, 4095, 0, 0, 0, 0, 3650,
+ // V plane:
+ 0, 1892, 4095, 1995, 0, 0, 0, 2208, 1152, 1794, 4095, 4095, 89, 3333, 4095,
+ 2478, 4095, 2505, 4095, 0, 2664, 4095, 1984, 0, 1144, 4095, 0, 4095, 0,
+ 4095, 0, 0, 0, 2404, 1727, 4095, 4095, 0, 1326, 2033, 0, 4095, 0, 4095,
+ 3022, 0, 4095, 0, 1980, 4095, 0, 2284, 4095, 0, 3422, 0, 4095, 2171, 3155,
+ 4095, 0, 4095, 0, 636, 0, 0, 4095, 3264, 3862, 0, 2164, 0, 0, 3879, 3886, 0,
+ 225, 0, 0, 4095, 0, 1956, 523, 464, 738, 0, 1545, 0, 2829, 4095, 4095, 4095,
+ 799, 4095, 358, 4095, 0, 0, 953, 0, 0, 2081, 4095, 1604, 4095, 2086, 0, 954,
+ 0, 0, 2393, 2413, 4095, 4095, 0, 3583, 4095, 4095, 2995, 4095, 0, 4095,
+ 4095, 3501, 4095, 247, 4095, 0, 0, 0, 4095, 1303, 3382, 1059, 4095, 0, 543,
+ 1276, 1801, 0, 0, 0, 2928, 0, 4095, 3931, 70, 0, 0, 3992, 4095, 1278, 1930,
+ 4095, 0, 4095, 4095, 3894, 0, 0, 0, 0, 4095, 0, 0, 0, 0, 0, 0, 4095, 4095,
+ 4095, 1098, 4095, 2059, 0, 380, 3166, 0, 4095, 2215, 0, 0, 2846, 0, 0, 2614,
+ 528, 4095, 0, 4095, 2371, 0, 4095, 0, 0, 0, 0, 4095, 3133, 4095, 4095, 0,
+ 4095, 1283, 3821, 1772, 0, 0, 4095, 4095, 4095, 890, 3475, 4095, 4095, 133,
+ 3292, 1819, 4095, 4095, 4095, 0, 0, 4095, 702, 4095, 0, 0, 0, 4095, 0, 2137,
+ 4095, 4095, 4095, 0, 0, 0, 4095, 4095, 1555, 2435, 2778, 4095, 0, 4095,
+ 3825, 0, 3736, 3054, 0, 0, 4095, 4095, 4095, 0, 0, 0, 0, 371, 4095, 4095, 0,
+ 0, 1565, 4095, 2731, 4095, 0, 756, 925, 0, 0, 0, 4095, 775, 1379, 4095,
+ 1439, 0, 0, 0, 2680, 0, 0, 4095, 1280, 4095, 0, 0, 4095, 4095, 0, 3088, 0,
+ 4095, 4095, 4095, 0, 0, 1526, 4095, 2314, 4095, 4095, 0, 4095, 288, 0, 205,
+ 4095, 4095, 4095, 0, 1247, 2014, 0, 1530, 1985, 0, 0, 4095, 3195, 0, 4095,
+ 4, 2397, 4095, 4095, 4095, 0, 4095, 4095, 4095, 0, 0, 0, 0, 0, 4031, 928,
+ 4095, 0, 0, 4095, 4095, 4095, 1966, 4095, 2299, 1215, 4095, 0, 4095, 1335,
+ 0, 4095, 1991, 4095, 0, 4095, 114, 0, 0, 0, 2123, 2639, 4095, 3323, 4095,
+ 4095, 418, 209, 0, 0, 4095, 4095, 4095, 4095, 963, 0, 0, 0, 4095, 2505, 0,
+ 3627, 0, 311, 3748, 2047, 4095, 2791, 0, 3643, 1852, 0, 0, 4095, 0, 2179, 0,
+ 4095, 2678, 0, 0, 0, 2342, 4095, 4095, 0, 0, 4095, 0, 0, 0, 0, 1076, 0, 0,
+ 4095, 0, 2370, 0, 3530, 0, 0, 0, 0, 0, 4095, 0, 0, 0, 3474, 1201, 0, 379,
+ 699, 4095, 777, 4095, 0, 4095, 4095, 0, 1213, 1762, 4095, 4095, 4095, 0,
+ 4095, 1090, 1233, 0, 4095, 0, 4095, 0, 0, 0, 2845, 3385, 2718, 0, 0, 2975,
+ 3630, 0, 4095, 4095, 4095, 4095, 3261, 243, 0, 4095, 0, 0, 3836, 4095, 4095,
+ 4095, 963, 0, 0, 2526, 0, 4095, 4000, 4095, 2069, 0, 0, 4095, 0, 4095, 1421,
+ 0, 4095, 0, 4095, 4095, 0, 4095, 0, 4095, 4095, 1537, 4095, 3201, 0, 0,
+ 4095, 2719, 4095, 0, 4095, 4095, 4095, 0, 4095, 0, 4095, 2300, 0, 2876, 0,
+ 4095, 4095, 4095, 3235, 497, 635, 0, 1480, 4095, 0, 3067, 3979, 3741, 0,
+ 3059, 1214, 4095, 4095, 2197, 0, 4095, 4095, 2734, 0, 4095, 4095, 3364,
+ 2369, 4095, 303, 4095, 0, 4095, 4095, 3472, 1733, 4095, 4095, 4095, 0, 55,
+ 0, 10, 1378, 1169, 4095, 0, 0, 688, 3613, 0, 4095, 2832, 867, 4095, 4095,
+ 3514, 4095, 0, 4095, 4095, 2458, 3506, 0, 1920, 0, 1762, 1178, 2549, 4095,
+ 3967, 4095, 0, 2975, 1282, 0, 377, 846, 3434, 97, 0, 0, 1616, 3526, 136,
+ 1888, 0, 147, 334, 4095, 0, 4095, 0, 4095, 1106, 4095, 0, 4095, 3280, 4095,
+ 4095, 0, 2849, 3528, 0, 4095, 4095, 0, 2306, 0, 3412, 0, 4095, 4095, 4095,
+ 4048, 2273, 0, 4095, 4095, 4095, 0, 4095, 3031, 4095, 4095, 4095, 0, 3382,
+ 3812, 2315, 4095, 0, 0, 0, 432, 4095, 3606, 0, 4, 2847, 4095, 0, 4095, 0, 0,
+ 2616, 4095, 4095, 0, 4095, 0, 3394, 4095, 3976, 3119, 0, 0, 0, 0, 4046,
+ 4095, 4095, 3331, 4095, 2127, 0, 4095, 0, 0, 0, 4095, 4095, 4095, 0, 4095,
+ 4095, 4095, 0, 2068, 0, 0, 3882, 2967, 0, 1745, 4095, 2112, 478, 0, 4095, 0,
+ 199, 4095, 4095, 3542, 4095, 2634, 4095, 4095, 1235, 4095, 4095, 167, 1553,
+ 0, 4095, 2649, 0, 3383, 0, 4095, 2803, 4095, 0, 4095, 0, 785, 4095, 0, 4095,
+ 1743, 4095, 0, 3945, 0, 4095, 1894, 4095, 3973, 4095, 0, 0, 4095, 0, 0,
+ 4095, 318, 4095, 4095, 4095, 0, 261, 4095, 4095, 2125, 2690, 4095, 0, 4095,
+ 3863, 1740, 4095, 0, 2899, 1509, 0, 0, 0, 2780, 4095, 1897, 2104, 4095,
+ 1708, 284, 4095, 0, 4095, 3382, 4095, 4095, 483, 0, 0, 0, 3099, 0, 4095, 0,
+ 926, 4095, 2062, 1931, 2121, 0, 4095, 0, 2485, 1535, 4095, 4095, 3662, 4095,
+ 2419, 2487, 0, 4095, 4095, 4095, 0, 0, 4095, 0, 0, 2029, 0, 3008, 2338, 0,
+ 4095, 0, 3854, 0, 4095, 0, 0, 1315, 0, 0, 0, 0, 3492, 0, 1445, 0, 11, 4095,
+ 0, 0, 873, 0, 4095, 0, 4095, 2654, 3040, 0, 0, 0, 4095, 0, 68, 4095, 0, 0,
+ 990, 0, 828, 1015, 88, 3606, 0, 2875, 4095, 0, 3117, 411, 0, 0, 2859, 0, 0,
+ 4095, 3480, 25, 4095, 4095, 4095, 0, 0, 0, 4095, 4095, 4095, 4095, 1724, 0,
+ 0, 0, 3635, 1063, 3728, 4095, 4095, 2025, 3715, 0, 0, 0, 3722, 0, 1648, 0,
+ 4095, 3579, 0, 0, 0, 4095, 4095, 0, 4095
+ };
+ unsigned char *img_data =
+ reinterpret_cast<unsigned char *>(const_cast<uint16_t *>(buffer));
+
+ aom_image_t img;
+ EXPECT_EQ(
+ aom_img_wrap(&img, AOM_IMG_FMT_I44416, kWidth, kHeight, 1, img_data),
+ &img);
+ img.cp = AOM_CICP_CP_UNSPECIFIED;
+ img.tc = AOM_CICP_TC_UNSPECIFIED;
+ img.mc = AOM_CICP_MC_UNSPECIFIED;
+ img.range = AOM_CR_FULL_RANGE;
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+ AOM_CODEC_OK);
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 2;
+ cfg.g_bit_depth = AOM_BITS_12;
+ cfg.g_input_bit_depth = 12;
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_limit = 1;
+ cfg.g_lag_in_frames = 0;
+ cfg.kf_mode = AOM_KF_DISABLED;
+ cfg.kf_max_dist = 0;
+ cfg.g_threads = 34;
+ cfg.rc_min_quantizer = 8;
+ cfg.rc_max_quantizer = 20;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 14), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_ROW_MT, 1), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_ROWS, 4), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 4), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SKIP_POSTPROC_FILTERING, 1),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM),
+ AOM_CODEC_OK);
+
+ // Encode frame
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK);
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// A test that reproduces b/272139363: signed integer overflow in
+// update_b_sep_sym().
+//
+// Encodes one 34x3 10-bit 4:4:4 frame with the all-intra defaults, AOM_Q rate
+// control, SSIM tuning and cpu-used 0. The test passes when the encoder
+// returns exactly one key-frame packet and a following flush returns none.
+// NOTE(review): the overflow itself is presumably only reported by a
+// sanitizer build -- nothing in this test checks for it directly.
+TEST(SearchWienerTest, 10bitSignedIntegerOverflowInUpdateBSepSym) {
+ constexpr int kWidth = 34;
+ constexpr int kHeight = 3;
+ // Three full-resolution planes (Y, U, V) of 16-bit samples.
+ static const uint16_t buffer[3 * kWidth * kHeight] = {
+ // Y plane:
+ 61, 765, 674, 188, 367, 944, 153, 275, 906, 433, 154, 51, 8, 855, 186, 154,
+ 392, 0, 634, 3, 690, 1023, 1023, 1023, 1023, 1023, 1023, 8, 1, 64, 426, 0,
+ 100, 344, 944, 816, 816, 33, 1023, 1023, 1023, 1023, 295, 1023, 1023, 1023,
+ 1023, 1023, 1023, 1015, 1023, 231, 1020, 254, 439, 439, 894, 439, 150, 1019,
+ 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 385, 320, 575,
+ 682, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 511, 699, 987, 3, 140,
+ 661, 120, 33, 143, 0, 0, 0, 3, 40, 625, 585, 16, 579, 160, 867,
+ // U plane:
+ 739, 646, 13, 603, 7, 328, 91, 32, 488, 870, 330, 330, 330, 330, 330, 330,
+ 109, 330, 330, 330, 3, 545, 945, 249, 35, 561, 801, 32, 931, 639, 801, 91,
+ 1023, 827, 844, 948, 631, 894, 854, 601, 432, 504, 85, 1, 0, 0, 89, 89, 0,
+ 0, 0, 0, 0, 0, 432, 801, 382, 4, 0, 0, 2, 89, 89, 89, 89, 89, 89, 384, 0, 0,
+ 0, 0, 0, 0, 0, 1023, 1019, 1, 3, 691, 575, 691, 691, 691, 691, 691, 691,
+ 691, 691, 691, 691, 691, 84, 527, 4, 485, 8, 682, 698, 340, 1015, 706,
+ // V plane:
+ 49, 10, 28, 1023, 1023, 1023, 0, 32, 32, 872, 114, 1003, 1023, 57, 477, 999,
+ 1023, 309, 309, 309, 309, 309, 309, 309, 309, 309, 309, 309, 309, 309, 309,
+ 9, 418, 418, 418, 418, 418, 418, 0, 0, 0, 1023, 4, 5, 0, 0, 1023, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 64, 0, 155, 709, 3, 331, 807, 633, 1023,
+ 1018, 646, 886, 991, 692, 915, 294, 0, 35, 2, 0, 471, 643, 770, 346, 176,
+ 32, 329, 322, 302, 61, 765, 674, 188, 367, 944, 153, 275, 906, 433, 154
+ };
+ // The table is const; constness is cast away only to obtain the
+ // unsigned char * that is handed to aom_img_wrap().
+ unsigned char *img_data =
+ reinterpret_cast<unsigned char *>(const_cast<uint16_t *>(buffer));
+
+ aom_image_t img;
+ EXPECT_EQ(
+ aom_img_wrap(&img, AOM_IMG_FMT_I44416, kWidth, kHeight, 1, img_data),
+ &img);
+ img.cp = AOM_CICP_CP_UNSPECIFIED;
+ img.tc = AOM_CICP_TC_UNSPECIFIED;
+ img.mc = AOM_CICP_MC_UNSPECIFIED;
+ img.range = AOM_CR_FULL_RANGE;
+
+ aom_codec_iface_t *iface = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA),
+ AOM_CODEC_OK);
+ cfg.rc_end_usage = AOM_Q;
+ cfg.g_profile = 1;
+ cfg.g_bit_depth = AOM_BITS_10;
+ cfg.g_input_bit_depth = 10;
+ cfg.g_w = kWidth;
+ cfg.g_h = kHeight;
+ cfg.g_limit = 1;
+ cfg.g_lag_in_frames = 0;
+ cfg.kf_mode = AOM_KF_DISABLED;
+ cfg.kf_max_dist = 0;
+ cfg.rc_min_quantizer = 3;
+ cfg.rc_max_quantizer = 54;
+ aom_codec_ctx_t enc;
+ EXPECT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_HIGHBITDEPTH),
+ AOM_CODEC_OK);
+ // The control calls are kept in their original order.
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CQ_LEVEL, 28), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_TILE_COLUMNS, 3), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 0), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AV1E_SET_SKIP_POSTPROC_FILTERING, 1),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&enc, AOME_SET_TUNING, AOM_TUNE_SSIM),
+ AOM_CODEC_OK);
+
+ // Encode frame
+ EXPECT_EQ(aom_codec_encode(&enc, &img, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t iter = nullptr;
+ const aom_codec_cx_pkt_t *pkt = aom_codec_get_cx_data(&enc, &iter);
+ // Fatal on failure: pkt is dereferenced immediately below.
+ ASSERT_NE(pkt, nullptr);
+ EXPECT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT);
+ // pkt->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ // Flush encoder
+ EXPECT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK);
+ iter = nullptr;
+ pkt = aom_codec_get_cx_data(&enc, &iter);
+ EXPECT_EQ(pkt, nullptr);
+
+ EXPECT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK);
+}
+
+// A test that reproduces b/277121724: signed integer overflow in
+// update_b_sep_sym().
+//
+// Feeds a single 198x3 8-bit 4:2:2 frame through the AV1 encoder (all-intra
+// defaults, AOM_Q rate control, SSIM tuning, cpu-used 2) and expects exactly
+// one key-frame packet, with nothing further produced by the flush.
+TEST(SearchWienerTest, 8bitSignedIntegerOverflowInUpdateBSepSym) {
+ constexpr int kWidth = 198;
+ constexpr int kHeight = 3;
+ // 8-bit YUV 4:2:2
+ static const unsigned char kSamples[2 * kWidth * kHeight] = {
+ // Y plane:
+ 35, 225, 56, 91, 8, 142, 137, 143, 224, 49, 217, 57, 202, 163, 159, 246,
+ 232, 134, 135, 14, 76, 101, 239, 88, 186, 159, 118, 23, 114, 20, 108, 41,
+ 72, 17, 58, 242, 45, 146, 230, 14, 135, 140, 34, 61, 189, 181, 222, 71, 98,
+ 221, 5, 199, 244, 85, 229, 163, 105, 87, 144, 105, 64, 150, 36, 233, 235, 1,
+ 179, 190, 50, 222, 176, 109, 166, 18, 80, 129, 45, 9, 218, 144, 234, 10,
+ 148, 117, 37, 10, 232, 139, 206, 92, 208, 247, 128, 79, 202, 79, 212, 89,
+ 185, 152, 206, 182, 83, 105, 21, 86, 150, 84, 21, 165, 34, 251, 174, 240,
+ 172, 155, 254, 85, 98, 25, 96, 78, 230, 253, 36, 19, 247, 155, 112, 216,
+ 166, 114, 229, 118, 197, 149, 186, 194, 128, 45, 219, 26, 36, 77, 110, 45,
+ 252, 238, 183, 161, 171, 96, 232, 108, 73, 61, 243, 58, 155, 38, 91, 209,
+ 187, 206, 16, 165, 236, 145, 69, 126, 102, 10, 4, 43, 191, 106, 193, 240,
+ 132, 226, 38, 78, 7, 152, 101, 255, 254, 39, 33, 86, 35, 247, 199, 179, 239,
+ 198, 165, 58, 190, 171, 226, 94, 158, 21, 190, 151, 75, 176, 11, 53, 199,
+ 87, 91, 1, 226, 20, 117, 96, 75, 192, 101, 200, 125, 106, 233, 176, 63, 204,
+ 114, 16, 31, 222, 15, 14, 71, 2, 25, 47, 100, 174, 26, 209, 138, 138, 211,
+ 147, 164, 204, 9, 104, 135, 250, 9, 201, 88, 218, 71, 251, 61, 199, 0, 34,
+ 59, 115, 228, 161, 100, 132, 50, 4, 117, 100, 191, 126, 53, 28, 193, 42,
+ 155, 206, 79, 80, 117, 11, 3, 253, 181, 181, 138, 239, 107, 142, 216, 57,
+ 202, 126, 229, 250, 60, 62, 150, 128, 95, 32, 251, 207, 236, 208, 247, 183,
+ 59, 19, 117, 40, 106, 87, 140, 57, 109, 190, 51, 105, 226, 116, 156, 3, 35,
+ 86, 255, 138, 52, 211, 245, 76, 83, 109, 113, 77, 106, 77, 18, 56, 235, 158,
+ 24, 53, 151, 104, 152, 21, 15, 46, 163, 144, 217, 168, 154, 44, 80, 25, 11,
+ 37, 100, 235, 145, 154, 113, 0, 140, 153, 80, 64, 19, 121, 185, 144, 43,
+ 206, 16, 16, 72, 189, 175, 231, 177, 40, 177, 206, 116, 4, 82, 43, 244, 237,
+ 22, 252, 71, 194, 106, 4, 112, 0, 108, 137, 126, 80, 122, 142, 43, 205, 22,
+ 209, 217, 165, 32, 208, 100, 70, 3, 120, 159, 203, 7, 233, 152, 37, 96, 212,
+ 177, 1, 133, 218, 161, 172, 202, 192, 186, 114, 150, 121, 177, 227, 175, 64,
+ 127, 153, 113, 91, 198, 0, 111, 227, 226, 218, 71, 62, 5, 43, 128, 27, 3,
+ 82, 5, 10, 68, 153, 215, 181, 138, 246, 224, 170, 1, 241, 191, 181, 151,
+ 167, 14, 80, 45, 4, 252, 29, 66, 125, 58, 225, 253, 255, 248, 224, 40, 24,
+ 236, 46, 11, 219, 154, 134, 12, 76, 72, 97, 239, 50, 39, 85, 182, 55, 219,
+ 19, 109, 81, 119, 125, 206, 159, 239, 67, 193, 180, 132, 80, 127, 2, 169,
+ 99, 53, 47, 5, 100, 174, 151, 124, 246, 202, 93, 82, 65, 53, 214, 238, 32,
+ 218, 15, 254, 153, 95, 79, 189, 67, 233, 47, 83, 48, 125, 144, 206, 82, 69,
+ 186, 112, 134, 244, 96, 21, 143, 187, 248, 8, 224, 161, 227, 185, 236, 6,
+ 175, 237, 169, 154, 89, 143, 106, 205, 26, 47, 155, 42, 28, 162, 7, 8, 45,
+ // U plane:
+ 55, 165, 203, 139, 152, 208, 36, 177, 61, 49, 129, 211, 140, 71, 253, 250,
+ 120, 167, 238, 67, 255, 223, 104, 32, 240, 179, 28, 41, 86, 84, 61, 243,
+ 169, 212, 201, 0, 9, 236, 89, 194, 204, 75, 228, 250, 27, 81, 137, 29, 255,
+ 131, 194, 241, 76, 133, 186, 135, 212, 197, 150, 145, 203, 96, 86, 231, 91,
+ 119, 197, 67, 226, 2, 118, 66, 181, 86, 219, 86, 132, 137, 156, 161, 221,
+ 18, 55, 170, 35, 206, 201, 193, 38, 63, 229, 29, 110, 96, 14, 135, 229, 99,
+ 106, 108, 167, 110, 50, 32, 144, 113, 48, 29, 57, 29, 20, 199, 145, 245, 9,
+ 183, 88, 174, 114, 237, 29, 40, 99, 117, 233, 6, 51, 227, 2, 28, 76, 149,
+ 190, 23, 240, 73, 113, 10, 73, 240, 105, 220, 129, 26, 144, 214, 34, 4, 24,
+ 219, 24, 156, 198, 214, 244, 143, 106, 255, 204, 93, 2, 88, 107, 211, 241,
+ 242, 86, 189, 219, 164, 132, 149, 32, 228, 219, 60, 202, 218, 189, 34, 250,
+ 160, 158, 36, 212, 212, 41, 233, 61, 92, 121, 170, 220, 192, 232, 255, 124,
+ 249, 231, 55, 196, 219, 196, 62, 238, 187, 76, 33, 138, 67, 82, 159, 169,
+ 196, 66, 196, 110, 194, 64, 35, 205, 64, 218, 12, 41, 188, 195, 244, 178,
+ 17, 80, 8, 149, 39, 110, 146, 164, 162, 215, 227, 107, 103, 47, 52, 95, 3,
+ 181, 90, 255, 80, 83, 206, 66, 153, 112, 72, 109, 235, 69, 105, 57, 75, 145,
+ 186, 16, 87, 73, 61, 98, 197, 237, 17, 32, 207, 220, 246, 188, 46, 73, 121,
+ 84, 252, 164, 111, 21, 98, 13, 170, 174, 170, 231, 77, 10, 113, 9, 217, 11,
+ // V plane:
+ 124, 94, 69, 212, 107, 223, 228, 96, 56, 2, 158, 49, 251, 217, 143, 107,
+ 113, 17, 84, 169, 208, 43, 28, 37, 176, 54, 235, 150, 135, 135, 221, 94, 50,
+ 131, 251, 78, 38, 254, 129, 200, 207, 55, 111, 110, 144, 109, 228, 65, 70,
+ 39, 170, 5, 208, 151, 87, 86, 255, 74, 155, 153, 250, 15, 35, 33, 201, 226,
+ 117, 119, 220, 238, 133, 229, 69, 122, 160, 114, 245, 182, 13, 65, 2, 228,
+ 205, 174, 128, 248, 4, 139, 178, 227, 204, 243, 249, 253, 119, 253, 107,
+ 234, 39, 15, 173, 47, 93, 12, 222, 238, 30, 121, 124, 167, 27, 40, 215, 84,
+ 172, 130, 66, 43, 165, 55, 225, 79, 84, 153, 59, 110, 64, 176, 54, 123, 82,
+ 128, 189, 150, 52, 202, 102, 133, 199, 197, 253, 180, 221, 127, 144, 124,
+ 255, 224, 52, 149, 88, 166, 39, 38, 78, 114, 44, 242, 233, 40, 132, 142,
+ 152, 213, 112, 244, 221, 7, 52, 206, 246, 51, 182, 160, 247, 154, 183, 209,
+ 81, 70, 56, 186, 63, 182, 2, 82, 202, 178, 233, 52, 198, 241, 175, 38, 165,
+ 9, 231, 150, 114, 43, 159, 200, 42, 173, 217, 25, 233, 214, 210, 50, 43,
+ 159, 231, 102, 241, 246, 77, 76, 115, 77, 81, 114, 194, 182, 236, 0, 236,
+ 198, 197, 180, 176, 148, 48, 177, 106, 180, 150, 158, 237, 130, 242, 109,
+ 174, 247, 57, 230, 184, 64, 245, 251, 123, 169, 122, 156, 125, 123, 104,
+ 238, 1, 235, 187, 53, 67, 38, 50, 139, 123, 149, 111, 72, 80, 17, 175, 186,
+ 98, 153, 247, 97, 218, 141, 38, 0, 171, 254, 180, 81, 233, 71, 156, 48, 14,
+ 62, 210, 161, 124, 203, 92
+ };
+ // The sample table is const; drop constness to get the unsigned char *
+ // that is passed to aom_img_wrap().
+ unsigned char *pixels = const_cast<unsigned char *>(kSamples);
+
+ aom_image_t frame;
+ EXPECT_EQ(aom_img_wrap(&frame, AOM_IMG_FMT_I422, kWidth, kHeight, 1, pixels),
+ &frame);
+ frame.range = AOM_CR_FULL_RANGE;
+ frame.cp = AOM_CICP_CP_UNSPECIFIED;
+ frame.tc = AOM_CICP_TC_UNSPECIFIED;
+ frame.mc = AOM_CICP_MC_UNSPECIFIED;
+
+ aom_codec_iface_t *codec = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t enc_cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(codec, &enc_cfg, AOM_USAGE_ALL_INTRA),
+ AOM_CODEC_OK);
+ // Stream geometry and sample format.
+ enc_cfg.g_w = kWidth;
+ enc_cfg.g_h = kHeight;
+ enc_cfg.g_profile = 2;
+ enc_cfg.g_bit_depth = AOM_BITS_8;
+ enc_cfg.g_input_bit_depth = 8;
+ // One frame, no lag, keyframe mode AOM_KF_DISABLED.
+ enc_cfg.g_limit = 1;
+ enc_cfg.g_lag_in_frames = 0;
+ enc_cfg.kf_mode = AOM_KF_DISABLED;
+ enc_cfg.kf_max_dist = 0;
+ // Rate control.
+ enc_cfg.rc_end_usage = AOM_Q;
+ enc_cfg.rc_min_quantizer = 30;
+ enc_cfg.rc_max_quantizer = 50;
+ // Threading.
+ enc_cfg.g_threads = 43;
+
+ aom_codec_ctx_t encoder;
+ EXPECT_EQ(aom_codec_enc_init(&encoder, codec, &enc_cfg, 0), AOM_CODEC_OK);
+ // Encoder controls, applied in the same order as the original reproducer.
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_CQ_LEVEL, 40), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_ROW_MT, 1), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_TILE_ROWS, 4), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_TILE_COLUMNS, 1),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_CPUUSED, 2), AOM_CODEC_OK);
+ EXPECT_EQ(
+ aom_codec_control(&encoder, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_SKIP_POSTPROC_FILTERING, 1),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_TUNING, AOM_TUNE_SSIM),
+ AOM_CODEC_OK);
+
+ // Encode the frame: exactly one frame packet, flagged as a key frame.
+ EXPECT_EQ(aom_codec_encode(&encoder, &frame, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t pkt_iter = nullptr;
+ const aom_codec_cx_pkt_t *packet =
+ aom_codec_get_cx_data(&encoder, &pkt_iter);
+ // Fatal on failure: packet is dereferenced right after.
+ ASSERT_NE(packet, nullptr);
+ EXPECT_EQ(packet->kind, AOM_CODEC_CX_FRAME_PKT);
+ // packet->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(packet->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ packet = aom_codec_get_cx_data(&encoder, &pkt_iter);
+ EXPECT_EQ(packet, nullptr);
+
+ // Flush the encoder: no further packets may appear.
+ EXPECT_EQ(aom_codec_encode(&encoder, nullptr, 0, 1, 0), AOM_CODEC_OK);
+ pkt_iter = nullptr;
+ packet = aom_codec_get_cx_data(&encoder, &pkt_iter);
+ EXPECT_EQ(packet, nullptr);
+
+ EXPECT_EQ(aom_codec_destroy(&encoder), AOM_CODEC_OK);
+}
+
+// A test that reproduces b/259173819: signed integer overflow in
+// linsolve_wiener().
+//
+// Feeds a single 3x3 10-bit 4:4:4 frame through the AV1 encoder (all-intra
+// defaults, AOM_Q rate control, SSIM tuning, cpu-used 1) and expects exactly
+// one key-frame packet, with nothing further produced by the flush.
+TEST(SearchWienerTest, 10bitSignedIntegerOverflowInLinsolveWiener) {
+ constexpr int kWidth = 3;
+ constexpr int kHeight = 3;
+ // Three full-resolution planes (Y, U, V) of 16-bit samples.
+ static const uint16_t kSamples[3 * kWidth * kHeight] = {
+ // Y plane:
+ 81, 81, 1023, 1020, 81, 1023, 81, 128, 0,
+ // U plane:
+ 273, 273, 273, 273, 273, 273, 273, 273, 273,
+ // V plane:
+ 273, 273, 273, 273, 273, 273, 516, 81, 81
+ };
+ // The sample table is const; drop constness and reinterpret it as the
+ // unsigned char * that is passed to aom_img_wrap().
+ unsigned char *pixels =
+ reinterpret_cast<unsigned char *>(const_cast<uint16_t *>(kSamples));
+
+ aom_image_t frame;
+ EXPECT_EQ(
+ aom_img_wrap(&frame, AOM_IMG_FMT_I44416, kWidth, kHeight, 1, pixels),
+ &frame);
+ frame.range = AOM_CR_FULL_RANGE;
+ frame.cp = AOM_CICP_CP_UNSPECIFIED;
+ frame.tc = AOM_CICP_TC_UNSPECIFIED;
+ frame.mc = AOM_CICP_MC_UNSPECIFIED;
+
+ aom_codec_iface_t *codec = aom_codec_av1_cx();
+ aom_codec_enc_cfg_t enc_cfg;
+ EXPECT_EQ(aom_codec_enc_config_default(codec, &enc_cfg, AOM_USAGE_ALL_INTRA),
+ AOM_CODEC_OK);
+ // Stream geometry and sample format.
+ enc_cfg.g_w = kWidth;
+ enc_cfg.g_h = kHeight;
+ enc_cfg.g_profile = 1;
+ enc_cfg.g_bit_depth = AOM_BITS_10;
+ enc_cfg.g_input_bit_depth = 10;
+ // One frame, no lag, keyframe mode AOM_KF_DISABLED.
+ enc_cfg.g_limit = 1;
+ enc_cfg.g_lag_in_frames = 0;
+ enc_cfg.kf_mode = AOM_KF_DISABLED;
+ enc_cfg.kf_max_dist = 0;
+ // Rate control.
+ enc_cfg.rc_end_usage = AOM_Q;
+ enc_cfg.rc_min_quantizer = 16;
+ enc_cfg.rc_max_quantizer = 54;
+ // Threading.
+ enc_cfg.g_threads = 21;
+
+ aom_codec_ctx_t encoder;
+ EXPECT_EQ(aom_codec_enc_init(&encoder, codec, &enc_cfg,
+ AOM_CODEC_USE_HIGHBITDEPTH),
+ AOM_CODEC_OK);
+ // Encoder controls, applied in the same order as the original reproducer.
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_CQ_LEVEL, 35), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_ROW_MT, 1), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_TILE_ROWS, 2), AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_TILE_COLUMNS, 5),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_CPUUSED, 1), AOM_CODEC_OK);
+ EXPECT_EQ(
+ aom_codec_control(&encoder, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AV1E_SET_SKIP_POSTPROC_FILTERING, 1),
+ AOM_CODEC_OK);
+ EXPECT_EQ(aom_codec_control(&encoder, AOME_SET_TUNING, AOM_TUNE_SSIM),
+ AOM_CODEC_OK);
+
+ // Encode the frame: exactly one frame packet, flagged as a key frame.
+ EXPECT_EQ(aom_codec_encode(&encoder, &frame, 0, 1, 0), AOM_CODEC_OK);
+ aom_codec_iter_t pkt_iter = nullptr;
+ const aom_codec_cx_pkt_t *packet =
+ aom_codec_get_cx_data(&encoder, &pkt_iter);
+ // Fatal on failure: packet is dereferenced right after.
+ ASSERT_NE(packet, nullptr);
+ EXPECT_EQ(packet->kind, AOM_CODEC_CX_FRAME_PKT);
+ // packet->data.frame.flags is 0x1f0011.
+ EXPECT_EQ(packet->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY);
+ packet = aom_codec_get_cx_data(&encoder, &pkt_iter);
+ EXPECT_EQ(packet, nullptr);
+
+ // Flush the encoder: no further packets may appear.
+ EXPECT_EQ(aom_codec_encode(&encoder, nullptr, 0, 1, 0), AOM_CODEC_OK);
+ pkt_iter = nullptr;
+ packet = aom_codec_get_cx_data(&encoder, &pkt_iter);
+ EXPECT_EQ(packet, nullptr);
+
+ EXPECT_EQ(aom_codec_destroy(&encoder), AOM_CODEC_OK);
+}
+
+} // namespace wiener_highbd
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/test/y4m_test.cc b/third_party/aom/test/y4m_test.cc
new file mode 100644
index 0000000000..a4ed13f7c5
--- /dev/null
+++ b/third_party/aom/test/y4m_test.cc
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/y4menc.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using std::string;
+
+static const unsigned int kWidth = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+struct Y4mTestParam {
+ const char *filename;  // y4m clip name handed to Init()
+ unsigned int bit_depth;  // bit depth HeaderChecks() expects in the header
+ aom_img_fmt format;  // image format HeaderChecks() expects
+ const char *md5raw;  // MD5 Md5Check() expects for the frames it reads
+};
+
+const Y4mTestParam kY4mTestVectors[] = {  // { filename, bit_depth, format, md5raw }
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420,
+ "e5406275b9fc6bb3436c31d4a05c1cab" },
+ { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
+ "95ef5bf6218580588be24a5271bb6a7f" },
+ { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
+ "e5406275b9fc6bb3436c31d4a05c1cab" },  // same MD5 as park_joy_90p_8_420.y4m
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
+ "284a47a47133b12884ec3a14e959a0b6" },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
+ "90517ff33843d85de712fd4fe60dbed0" },
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016,
+ "63f21f9f717d8b8631bd2288ee87137b" },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216,
+ "48ab51fb540aed07f7ff5af130c9b605" },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416,
+ "067bfd75aa85ff9bae91fa3e0edd1e3e" },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016,
+ "9e6d8f6508c6e55625f6b697bc461cef" },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216,
+ "b239c6b301c0b835485be349ca83a7e3" },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416,
+ "5a6481a550821dab6d0192f5c63845e9" },
+};
+
+static const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+
+class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
+                           public ::libaom_test::Y4mVideoSource {
+ protected:
+  // Starts with an empty file name; Init() supplies the real clip.
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  ~Y4mVideoSourceTest() override { CloseSource(); }
+
+  // Points the source at |file_name|, caps it at |limit| frames and opens it
+  // through Begin().
+  virtual void Init(const std::string &file_name, int limit) {
+    frame_ = 0;
+    start_ = 0;
+    limit_ = limit;
+    file_name_ = file_name;
+    Begin();
+  }
+
+  // Verifies the parsed y4m header and the image geometry against the fixed
+  // clip size and the expected |bit_depth| and |fmt|.
+  void HeaderChecks(unsigned int bit_depth, aom_img_fmt_t fmt) {
+    ASSERT_NE(input_file_, nullptr);
+    ASSERT_EQ(y4m_.pic_w, static_cast<int>(kWidth));
+    ASSERT_EQ(y4m_.pic_h, static_cast<int>(kHeight));
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.aom_fmt, fmt);
+
+    const int depth = static_cast<int>(y4m_.bit_depth);
+    if (fmt == AOM_IMG_FMT_I420 || fmt == AOM_IMG_FMT_I42016) {
+      // 4:2:0 - chroma shifted in both directions.
+      ASSERT_EQ(y4m_.bps, depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    } else if (fmt == AOM_IMG_FMT_I422 || fmt == AOM_IMG_FMT_I42216) {
+      // 4:2:2 - chroma shifted horizontally only.
+      ASSERT_EQ(y4m_.bps, depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    } else if (fmt == AOM_IMG_FMT_I444 || fmt == AOM_IMG_FMT_I44416) {
+      // 4:4:4 - no chroma shift.
+      ASSERT_EQ(y4m_.bps, depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Hashes every frame from |start_| up to |limit_| and compares the digest
+  // with |expected_md5|.
+  void Md5Check(const string &expected_md5) {
+    ASSERT_NE(input_file_, nullptr);
+    libaom_test::MD5 digest;
+    unsigned int frame_idx = start_;
+    while (frame_idx < limit_) {
+      digest.Add(img());
+      Next();
+      ++frame_idx;
+    }
+    ASSERT_EQ(string(digest.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  // Open the clip, validate its header, then hash the frames.
+  const Y4mTestParam &param = GetParam();
+  Init(param.filename, kFrames);
+  HeaderChecks(param.bit_depth, param.format);
+  Md5Check(param.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
+ ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : tmpfile_(nullptr) {}
+
+  ~Y4mVideoWriteTest() override {
+    delete tmpfile_;
+    // Clear the handle so the base-class CloseSource() skips its fclose();
+    // the stream presumably belongs to tmpfile_ - confirm in TempOutFile.
+    input_file_ = nullptr;
+  }
+
+  // Closes the current source and restarts reading from |input_file|.
+  void ReplaceInputFile(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes the currently opened clip to a temporary y4m file, then switches
+  // the source over to that file so later checks run on the written copy.
+  void WriteY4mAndReadBack() {
+    ASSERT_NE(input_file_, nullptr);
+    char header_buf[Y4M_BUFFER_SIZE] = { 0 };
+    const struct AvxRational frame_rate = { y4m_.fps_n, y4m_.fps_d };
+    tmpfile_ = new libaom_test::TempOutFile;
+    ASSERT_NE(tmpfile_, nullptr);
+    ASSERT_NE(tmpfile_->file(), nullptr);
+
+    // Stream header first.
+    y4m_write_file_header(header_buf, sizeof(header_buf), kWidth, kHeight,
+                          &frame_rate, img()->monochrome, img()->csp,
+                          y4m_.aom_fmt, y4m_.bit_depth, AOM_CR_STUDIO_RANGE);
+    fputs(header_buf, tmpfile_->file());
+
+    // Then one frame header plus image payload per frame.
+    unsigned int frame_idx = start_;
+    while (frame_idx < limit_) {
+      y4m_write_frame_header(header_buf, sizeof(header_buf));
+      fputs(header_buf, tmpfile_->file());
+      y4m_write_image_file(img(), PLANES_YUV, tmpfile_->file());
+      Next();
+      ++frame_idx;
+    }
+    ReplaceInputFile(tmpfile_->file());
+  }
+
+  // Opens the clip as the base fixture does, then round-trips it through the
+  // y4m writer.
+  void Init(const std::string &file_name, int limit) override {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+
+  libaom_test::TempOutFile *tmpfile_;
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  // Init() round-trips the clip through the writer; the header and MD5 checks
+  // below therefore run on the written copy.
+  const Y4mTestParam &param = GetParam();
+  Init(param.filename, kFrames);
+  HeaderChecks(param.bit_depth, param.format);
+  Md5Check(param.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
+ ::testing::ValuesIn(kY4mTestVectors));
+
+static const char kY4MRegularHeader[] =
+ "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n"
+ "FRAME\n"
+ "012345678912345601230123";
+
+// Parses a minimal header without an XCOLORRANGE tag and checks the parsed
+// fields; the color range is expected to be studio range.
+TEST(Y4MHeaderTest, RegularHeader) {
+  libaom_test::TempOutFile tmpfile;
+  FILE *f = tmpfile.file();
+  ASSERT_NE(f, nullptr);
+  // Stop early if the fixture cannot be written or rewound; every check below
+  // would otherwise fail for reasons unrelated to header parsing.
+  ASSERT_EQ(fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f),
+            sizeof(kY4MRegularHeader));
+  ASSERT_EQ(fflush(f), 0);
+  ASSERT_EQ(fseek(f, 0, SEEK_SET), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f, nullptr, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(y4m.color_range, AOM_CR_STUDIO_RANGE);
+  // EXPECT_STREQ prints both strings on mismatch, unlike strcmp() == 0.
+  EXPECT_STREQ("420jpeg", y4m.chroma_type);
+  y4m_input_close(&y4m);
+}
+
+// Testing that headers over 100 characters can be parsed.
+static const char kY4MLongHeader[] =
+ "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG "
+ "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n"
+ "FRAME\n"
+ "012345678912345601230123";
+
+// Parses a header line longer than 100 characters that carries unknown X
+// tags and XCOLORRANGE=LIMITED.
+TEST(Y4MHeaderTest, LongHeader) {
+  libaom_test::TempOutFile tmpfile;
+  FILE *f = tmpfile.file();
+  ASSERT_NE(f, nullptr);
+  // Stop early if the fixture cannot be written or rewound; every check below
+  // would otherwise fail for reasons unrelated to header parsing.
+  ASSERT_EQ(fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f),
+            sizeof(kY4MLongHeader));
+  ASSERT_EQ(fflush(f), 0);
+  ASSERT_EQ(fseek(f, 0, SEEK_SET), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f, nullptr, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(y4m.color_range, AOM_CR_STUDIO_RANGE);
+  // EXPECT_STREQ prints both strings on mismatch, unlike strcmp() == 0.
+  EXPECT_STREQ("420jpeg", y4m.chroma_type);
+  y4m_input_close(&y4m);
+}
+
+static const char kY4MFullRangeHeader[] =
+ "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG XCOLORRANGE=FULL\n"
+ "FRAME\n"
+ "012345678912345601230123";
+
+// Parses a header tagged XCOLORRANGE=FULL and checks that the full color
+// range is reported.
+TEST(Y4MHeaderTest, FullRangeHeader) {
+  libaom_test::TempOutFile tmpfile;
+  FILE *f = tmpfile.file();
+  ASSERT_NE(f, nullptr);
+  // Stop early if the fixture cannot be written or rewound; every check below
+  // would otherwise fail for reasons unrelated to header parsing.
+  ASSERT_EQ(fwrite(kY4MFullRangeHeader, 1, sizeof(kY4MFullRangeHeader), f),
+            sizeof(kY4MFullRangeHeader));
+  ASSERT_EQ(fflush(f), 0);
+  ASSERT_EQ(fseek(f, 0, SEEK_SET), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f, nullptr, 0, AOM_CSP_UNKNOWN,
+                           /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  // EXPECT_STREQ prints both strings on mismatch, unlike strcmp() == 0.
+  EXPECT_STREQ("420jpeg", y4m.chroma_type);
+  EXPECT_EQ(y4m.color_range, AOM_CR_FULL_RANGE);
+  y4m_input_close(&y4m);
+}
+
+// The writer must not emit an XCOLORRANGE tag for studio-range content.
+TEST(Y4MHeaderTest, WriteStudioColorRange) {
+  // Zero-filled so the string comparison never reads indeterminate bytes.
+  char buf[128] = { 0 };
+  struct AvxRational framerate = { /*numerator=*/30, /*denominator=*/1 };
+  // A failed write leaves nothing meaningful to compare, so stop here.
+  ASSERT_GE(y4m_write_file_header(
+                buf, /*len=*/sizeof(buf), /*width=*/4, /*height=*/5,
+                &framerate, /*monochrome=*/0, AOM_CSP_UNKNOWN,
+                AOM_IMG_FMT_I420, /*bit_depth=*/8, AOM_CR_STUDIO_RANGE),
+            0);
+  EXPECT_STREQ("YUV4MPEG2 W4 H5 F30:1 Ip C420jpeg\n", buf);
+}
+
+// The writer must append XCOLORRANGE=FULL for full-range content.
+TEST(Y4MHeaderTest, WriteFullColorRange) {
+  // Zero-filled so the string comparison never reads indeterminate bytes.
+  char buf[128] = { 0 };
+  struct AvxRational framerate = { /*numerator=*/30, /*denominator=*/1 };
+  // A failed write leaves nothing meaningful to compare, so stop here.
+  ASSERT_GE(y4m_write_file_header(
+                buf, /*len=*/sizeof(buf), /*width=*/4, /*height=*/5,
+                &framerate, /*monochrome=*/0, AOM_CSP_UNKNOWN,
+                AOM_IMG_FMT_I420, /*bit_depth=*/8, AOM_CR_FULL_RANGE),
+            0);
+  EXPECT_STREQ("YUV4MPEG2 W4 H5 F30:1 Ip C420jpeg XCOLORRANGE=FULL\n", buf);
+}
+
+} // namespace
diff --git a/third_party/aom/test/y4m_video_source.h b/third_party/aom/test/y4m_video_source.h
new file mode 100644
index 0000000000..1369e4e280
--- /dev/null
+++ b/third_party/aom/test/y4m_video_source.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#define AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "common/y4minput.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to read frames from a y4m file through the
+// y4m_input reader so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  // |start| is the number of frames ReadSourceToStart() skips; img() returns
+  // nullptr once the frame counter reaches |limit|.
+  Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(nullptr), img_(new aom_image_t()),
+        start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
+        framerate_denominator_(0), y4m_() {}
+
+  ~Y4mVideoSource() override {
+    aom_img_free(img_.get());
+    CloseSource();
+  }
+
+  // Closes any source that is already open, then opens |file_name_| with
+  // OpenTestDataFile().
+  virtual void OpenSource() {
+    CloseSource();
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
+  }
+
+  // Parses the y4m header from |input_file_|, records the frame rate, and
+  // advances |start_| frames before filling the current frame.
+  virtual void ReadSourceToStart() {
+    ASSERT_NE(input_file_, nullptr);
+    ASSERT_FALSE(
+        y4m_input_open(&y4m_, input_file_, nullptr, 0, AOM_CSP_UNKNOWN, 0));
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+      Next();
+    }
+    FillFrame();
+  }
+
+  void Begin() override {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
+  void Next() override {
+    ++frame_;
+    FillFrame();
+  }
+
+  // Returns the current image, or nullptr once |limit_| frames were produced.
+  aom_image_t *img() const override {
+    return (frame_ < limit_) ? img_.get() : nullptr;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  aom_codec_pts_t pts() const override { return frame_; }
+
+  unsigned long duration() const override { return 1; }
+
+  // The timebase is the inverse of the frame rate read from the header.
+  aom_rational_t timebase() const override {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  unsigned int frame() const override { return frame_; }
+
+  unsigned int limit() const override { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_NE(input_file_, nullptr);
+    // Read a frame from input_file.
+    // NOTE(review): the result of y4m_input_fetch_frame() is not checked.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required
+  // and not just an aom_image_t because of how the y4m reader manipulates
+  // aom_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    // Exchanges ownership of the two images without freeing either one.
+    img_.swap(other->img_);
+  }
+
+ protected:
+  // Releases the y4m reader state, resets it to a default-constructed value
+  // and closes |input_file_| if it is open.
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != nullptr) {
+      fclose(input_file_);
+      input_file_ = nullptr;
+    }
+  }
+
+  std::string file_name_;
+  FILE *input_file_;
+  std::unique_ptr<aom_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/yuv_video_source.h b/third_party/aom/test/yuv_video_source.h
new file mode 100644
index 0000000000..77d5dfa73c
--- /dev/null
+++ b/third_party/aom/test/yuv_video_source.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_YUV_VIDEO_SOURCE_H_
+#define AOM_TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "aom/aom_image.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
+class YUVVideoSource : public VideoSource {
+ public:
+  // |start| is the index of the first frame to read; img() returns nullptr
+  // once the frame counter reaches |limit| or the file runs out of frames.
+  YUVVideoSource(const std::string &file_name, aom_img_fmt format,
+                 unsigned int width, unsigned int height, int rate_numerator,
+                 int rate_denominator, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(nullptr), img_(nullptr),
+        raw_size_(0), start_(start), limit_(limit), frame_(0), width_(0),
+        height_(0), format_(AOM_IMG_FMT_NONE),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img.
+    // raw_size_ is zeroed above so it is well defined even if SetSize()
+    // returns early on a failed assertion.
+    SetSize(width, height, format);
+  }
+
+  ~YUVVideoSource() override {
+    aom_img_free(img_);
+    if (input_file_) fclose(input_file_);
+  }
+
+  // (Re)opens the input file, seeks to frame |start_| and reads it.
+  void Begin() override {
+    if (input_file_) fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
+    if (start_) {
+      // Multiply in size_t so the byte offset is not truncated to 32 bits
+      // before it reaches fseek(), and fail loudly if the seek is rejected
+      // instead of silently reading from the wrong position.
+      const size_t offset = raw_size_ * start_;
+      ASSERT_EQ(fseek(input_file_, static_cast<long>(offset), SEEK_SET), 0)
+          << "Seek to start frame failed. Filename: " << file_name_;
+    }
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  void Next() override {
+    ++frame_;
+    FillFrame();
+  }
+
+  // Returns the current image, or nullptr once |limit_| has been reached.
+  aom_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  aom_codec_pts_t pts() const override { return frame_; }
+
+  unsigned long duration() const override { return 1; }
+
+  // The timebase is the inverse of the frame rate given to the constructor.
+  aom_rational_t timebase() const override {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  unsigned int frame() const override { return frame_; }
+
+  unsigned int limit() const override { return limit_; }
+
+  // Reallocates the image and recomputes the per-frame byte count when the
+  // requested geometry or format differs from the current one.
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       aom_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      aom_img_free(img_);
+      img_ = aom_img_alloc(nullptr, format, width, height, 1);
+      ASSERT_NE(img_, nullptr);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      // Widen before multiplying so large frames cannot wrap unsigned int.
+      const size_t luma_samples = static_cast<size_t>(width) * height;
+      switch (format) {
+        case AOM_IMG_FMT_NV12:
+        case AOM_IMG_FMT_I420: raw_size_ = luma_samples * 3 / 2; break;
+        case AOM_IMG_FMT_I422: raw_size_ = luma_samples * 2; break;
+        case AOM_IMG_FMT_I444: raw_size_ = luma_samples * 3; break;
+        case AOM_IMG_FMT_I42016: raw_size_ = luma_samples * 3; break;
+        case AOM_IMG_FMT_I42216: raw_size_ = luma_samples * 4; break;
+        case AOM_IMG_FMT_I44416: raw_size_ = luma_samples * 6; break;
+        default: ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_NE(input_file_, nullptr);
+    // img_ stays null if the allocation in SetSize() failed.
+    ASSERT_NE(img_, nullptr);
+    // Read a frame from input_file. A short read marks the end of the
+    // stream by pulling limit_ down to the current frame.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  aom_image_t *img_;
+  size_t raw_size_;  // bytes per frame in the input file
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  aom_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/third_party/aom/third_party/SVT-AV1/EbMemory_AVX2.h b/third_party/aom/third_party/SVT-AV1/EbMemory_AVX2.h
new file mode 100644
index 0000000000..0d0ea10abc
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/EbMemory_AVX2.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright(c) 2019 Intel Corporation
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at https://www.aomedia.org/license/software-license. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * https://www.aomedia.org/license/patent-license.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
+#define AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif  // _mm256_set_m128i
+
+#ifndef _mm256_setr_m128i
+#define _mm256_setr_m128i(/* __m128i */ lo, /* __m128i */ hi) \
+ _mm256_set_m128i((hi), (lo))
+#endif  // _mm256_setr_m128i
+
+// Loads 4 bytes from each of two rows into the two lowest 32-bit lanes; all
+// remaining lanes of the result are zero.
+static INLINE __m256i load_u8_4x2_avx2(const uint8_t *const src,
+                                       const ptrdiff_t stride) {
+  const int32_t row0 = *(int32_t *)(src + 0 * stride);
+  const int32_t row1 = *(int32_t *)(src + 1 * stride);
+  const __m128i rows01 = _mm_insert_epi32(_mm_cvtsi32_si128(row0), row1, 1);
+  return _mm256_setr_m128i(rows01, _mm_setzero_si128());
+}
+
+// Loads 4 bytes from each of four rows: rows 0-1 go to the low 128-bit half,
+// rows 2-3 to the high half, each pair in the two lowest 32-bit lanes.
+static INLINE __m256i load_u8_4x4_avx2(const uint8_t *const src,
+                                       const ptrdiff_t stride) {
+  const int32_t row0 = *(int32_t *)(src + 0 * stride);
+  const int32_t row1 = *(int32_t *)(src + 1 * stride);
+  const __m128i rows01 = _mm_insert_epi32(_mm_cvtsi32_si128(row0), row1, 1);
+  const int32_t row2 = *(int32_t *)(src + 2 * stride);
+  const int32_t row3 = *(int32_t *)(src + 3 * stride);
+  const __m128i rows23 = _mm_insert_epi32(_mm_cvtsi32_si128(row2), row3, 1);
+  return _mm256_setr_m128i(rows01, rows23);
+}
+
+// Loads 8 bytes from each of two rows; row 0 lands in the low 64 bits of the
+// low 128-bit half and row 1 in the low 64 bits of the high half.
+static INLINE __m256i load_u8_8x2_avx2(const uint8_t *const src,
+                                       const ptrdiff_t stride) {
+  const __m128i row0 = _mm_loadl_epi64((__m128i *)(src + 0 * stride));
+  const __m128i row1 = _mm_loadl_epi64((__m128i *)(src + 1 * stride));
+  return _mm256_setr_m128i(row0, row1);
+}
+
+// Loads 8 bytes from each of four rows: rows 0-1 fill the low 128-bit half,
+// rows 2-3 the high half.
+static INLINE __m256i load_u8_8x4_avx2(const uint8_t *const src,
+                                       const ptrdiff_t stride) {
+  // Row 0 into the low 64 bits, then row 1 into the high 64 bits.
+  const __m128i row0 = _mm_loadl_epi64((__m128i *)(src + 0 * stride));
+  const __m128d rows01_pd = _mm_loadh_pd(
+      _mm_castsi128_pd(row0), (double *)(void *)(src + 1 * stride));
+  const __m128i rows01 = _mm_castpd_si128(rows01_pd);
+
+  // Same pattern for rows 2 and 3.
+  const __m128i row2 = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
+  const __m128d rows23_pd = _mm_loadh_pd(
+      _mm_castsi128_pd(row2), (double *)(void *)(src + 3 * stride));
+  const __m128i rows23 = _mm_castpd_si128(rows23_pd);
+
+  return _mm256_setr_m128i(rows01, rows23);
+}
+
+// Unaligned load of 16 bytes from |src| and 16 bytes from |src| plus
+// |strideInByte|; the first row fills the low half of the result.
+static INLINE __m256i loadu_8bit_16x2_avx2(const void *const src,
+                                           const ptrdiff_t strideInByte) {
+  uint8_t *const row0 = (uint8_t *)src;
+  uint8_t *const row1 = row0 + strideInByte;
+  const __m128i lo = _mm_loadu_si128((__m128i *)row0);
+  const __m128i hi = _mm_loadu_si128((__m128i *)row1);
+  return _mm256_setr_m128i(lo, hi);
+}
+
+// uint8_t wrapper: converts the element stride to bytes and forwards to
+// loadu_8bit_16x2_avx2().
+static INLINE __m256i loadu_u8_16x2_avx2(const uint8_t *const src,
+                                         const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*src) * stride;
+  return loadu_8bit_16x2_avx2(src, stride_bytes);
+}
+
+// uint16_t wrapper: converts the element stride to bytes and forwards to
+// loadu_8bit_16x2_avx2().
+static INLINE __m256i loadu_u16_8x2_avx2(const uint16_t *const src,
+                                         const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*src) * stride;
+  return loadu_8bit_16x2_avx2(src, stride_bytes);
+}
+
+// Unaligned store of the low 128 bits of |src| to |dst| and of the high 128
+// bits to |dst| plus |strideInByte|.
+static INLINE void storeu_8bit_16x2_avx2(const __m256i src, void *const dst,
+                                         const ptrdiff_t strideInByte) {
+  uint8_t *const row0 = (uint8_t *)dst;
+  uint8_t *const row1 = row0 + strideInByte;
+  const __m128i lo = _mm256_castsi256_si128(src);
+  const __m128i hi = _mm256_extracti128_si256(src, 1);
+  _mm_storeu_si128((__m128i *)row0, lo);
+  _mm_storeu_si128((__m128i *)row1, hi);
+}
+
+// uint8_t wrapper: converts the element stride to bytes and forwards to
+// storeu_8bit_16x2_avx2().
+static INLINE void storeu_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
+                                       const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*dst) * stride;
+  storeu_8bit_16x2_avx2(src, dst, stride_bytes);
+}
+
+// int16_t wrapper: converts the element stride to bytes and forwards to
+// storeu_8bit_16x2_avx2().
+static INLINE void storeu_s16_8x2_avx2(const __m256i src, int16_t *const dst,
+                                       const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*dst) * stride;
+  storeu_8bit_16x2_avx2(src, dst, stride_bytes);
+}
+
+// uint16_t wrapper: converts the element stride to bytes and forwards to
+// storeu_8bit_16x2_avx2().
+static INLINE void storeu_u16_8x2_avx2(const __m256i src, uint16_t *const dst,
+                                       const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*dst) * stride;
+  storeu_8bit_16x2_avx2(src, dst, stride_bytes);
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_AVX2_H_
diff --git a/third_party/aom/third_party/SVT-AV1/EbMemory_SSE4_1.h b/third_party/aom/third_party/SVT-AV1/EbMemory_SSE4_1.h
new file mode 100644
index 0000000000..d821d9a307
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/EbMemory_SSE4_1.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2019 Intel Corporation
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at https://www.aomedia.org/license/software-license. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * https://www.aomedia.org/license/patent-license.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
+#define AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Loads 4 bytes from |src| into 32-bit lane 0 and 4 bytes from |src| plus
+// |strideInByte| into lane 1; the upper two lanes are zero.
+static INLINE __m128i load8bit_4x2_sse4_1(const void *const src,
+                                          const ptrdiff_t strideInByte) {
+  uint8_t *const row0 = (uint8_t *)src;
+  const int32_t lo = *(int32_t *)row0;
+  const __m128i lane0 = _mm_cvtsi32_si128(lo);
+  return _mm_insert_epi32(lane0, *(int32_t *)(row0 + strideInByte), 1);
+}
+
+// uint8_t wrapper: converts the element stride to bytes and forwards to
+// load8bit_4x2_sse4_1().
+static INLINE __m128i load_u8_4x2_sse4_1(const uint8_t *const src,
+                                         const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*src) * stride;
+  return load8bit_4x2_sse4_1(src, stride_bytes);
+}
+
+// uint16_t wrapper: converts the element stride to bytes and forwards to
+// load8bit_4x2_sse4_1().
+static INLINE __m128i load_u16_2x2_sse4_1(const uint16_t *const src,
+                                          const ptrdiff_t stride) {
+  const ptrdiff_t stride_bytes = sizeof(*src) * stride;
+  return load8bit_4x2_sse4_1(src, stride_bytes);
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_EBMEMORY_SSE4_1_H_
diff --git a/third_party/aom/third_party/SVT-AV1/LICENSE.md b/third_party/aom/third_party/SVT-AV1/LICENSE.md
new file mode 100644
index 0000000000..aff96d15ed
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/LICENSE.md
@@ -0,0 +1,32 @@
+BSD 3-Clause Clear License
+The Clear BSD License
+
+Copyright (c) 2021, Alliance for Open Media
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the disclaimer below)
+provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the Alliance for Open Media nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/SVT-AV1/PATENTS.md b/third_party/aom/third_party/SVT-AV1/PATENTS.md
new file mode 100644
index 0000000000..1de4dd7531
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/PATENTS.md
@@ -0,0 +1,107 @@
+**Alliance for Open Media Patent License 1.0**
+
+ 1. **License Terms.**
+
+ **Patent License.** Subject to the terms and conditions of this License, each
+ Licensor, on behalf of itself and successors in interest and assigns,
+ grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+ no-charge, royalty-free, irrevocable (except as expressly stated in this
+ License) patent license to its Necessary Claims to make, use, sell, offer
+ for sale, import or distribute any Implementation.
+
+ **Conditions.**
+
+ *Availability.* As a condition to the grant of rights to Licensee to make,
+ sell, offer for sale, import or distribute an Implementation under
+ Section 1.1, Licensee must make its Necessary Claims available under
+ this License, and must reproduce this License with any Implementation
+ as follows:
+
+ a. For distribution in source code, by including this License in the
+ root directory of the source code with its Implementation.
+
+ b. For distribution in any other form (including binary, object form,
+ and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+ GDSII, etc.)), by including this License in the documentation, legal
+ notices, and/or other written materials provided with the
+ Implementation.
+
+ *Additional Conditions.* This license is directly from Licensor to
+ Licensee. Licensee acknowledges as a condition of benefiting from it
+ that no rights from Licensor are received from suppliers, distributors,
+ or otherwise in connection with this License.
+
+ **Defensive Termination**. If any Licensee, its Affiliates, or its agents
+ initiates patent litigation or files, maintains, or voluntarily
+ participates in a lawsuit against another entity or any person asserting
+ that any Implementation infringes Necessary Claims, any patent licenses
+ granted under this License directly to the Licensee are immediately
+ terminated as of the date of the initiation of action unless 1) that suit
+ was in response to a corresponding suit regarding an Implementation first
+ brought against an initiating entity, or 2) that suit was brought to
+ enforce the terms of this License (including intervention in a third-party
+ action by a Licensee).
+
+ **Disclaimers.** The Reference Implementation and Specification are provided
+ "AS IS" and without warranty. The entire risk as to implementing or
+ otherwise using the Reference Implementation or Specification is assumed
+ by the implementer and user. Licensor expressly disclaims any warranties
+ (express, implied, or otherwise), including implied warranties of
+ merchantability, non-infringement, fitness for a particular purpose, or
+ title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+ ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+ INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+ ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+ OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+ NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. **Definitions.**
+
+ **Affiliate.** "Affiliate" means an entity that directly or indirectly
+ Controls, is Controlled by, or is under common Control of that party.
+
+ **Control.** "Control" means direct or indirect control of more than 50% of
+ the voting power to elect directors of that corporation, or for any other
+ entity, the power to direct management of such entity.
+
+ **Decoder.** "Decoder" means any decoder that conforms fully with all
+ non-optional portions of the Specification.
+
+ **Encoder.** "Encoder" means any encoder that produces a bitstream that can
+ be decoded by a Decoder only to the extent it produces such a bitstream.
+
+ **Final Deliverable.** "Final Deliverable" means the final version of a
+ deliverable approved by the Alliance for Open Media as a Final
+ Deliverable.
+
+ **Implementation.** "Implementation" means any implementation, including the
+ Reference Implementation, that is an Encoder and/or a Decoder. An
+ Implementation also includes components of an Implementation only to the
+ extent they are used as part of an Implementation.
+
+ **License.** "License" means this license.
+
+ **Licensee.** "Licensee" means any person or entity who exercises patent
+ rights granted under this License.
+
+ **Licensor.** "Licensor" means (i) any Licensee that makes, sells, offers
+ for sale, imports or distributes any Implementation, or (ii) a person
+ or entity that has a licensing obligation to the Implementation as a
+ result of its membership and/or participation in the Alliance for Open
+ Media working group that developed the Specification.
+
+ **Necessary Claims.** "Necessary Claims" means all claims of patents or
+ patent applications, (a) that currently or at any time in the future,
+ are owned or controlled by the Licensor, and (b) (i) would be an
+ Essential Claim as defined by the W3C Policy as of February 5, 2004
+ (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+ as if the Specification was a W3C Recommendation; or (ii) are infringed
+ by the Reference Implementation.
+
+ **Reference Implementation.** "Reference Implementation" means an Encoder
+ and/or Decoder released by the Alliance for Open Media as a Final
+ Deliverable.
+
+ **Specification.** "Specification" means the specification designated by
+ the Alliance for Open Media as a Final Deliverable for which this
+ License was issued.
diff --git a/third_party/aom/third_party/SVT-AV1/README.libaom b/third_party/aom/third_party/SVT-AV1/README.libaom
new file mode 100644
index 0000000000..ff365057eb
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/README.libaom
@@ -0,0 +1,14 @@
+URL: https://gitlab.com/AOMediaCodec/SVT-AV1
+
+Version: 8ff99c90359330d2e807757c9425560bbc452ff3
+License: BSD-3-clause clear
+License File: LICENSE.md
+
+Description:
+Port the x86 intrinsics used for single reference convolve reconstructions.
+
+Local Changes:
+Only ported the functions pertinent to single reference convolves.
+All functions are made static inline to avoid function call overheads.
+References to some arrays are changed to libaom version when applicable.
+Some extra intrinsic functions are added to support missing block sizes.
diff --git a/third_party/aom/third_party/SVT-AV1/convolve_2d_avx2.h b/third_party/aom/third_party/SVT-AV1/convolve_2d_avx2.h
new file mode 100644
index 0000000000..64cd810f77
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/convolve_2d_avx2.h
@@ -0,0 +1,1199 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
+#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
+
+#include "convolve_avx2.h"
+
+// Horizontal stage of the 2-D single-reference convolve for a 2-tap filter.
+//
+// Filters `h` rows of width `w` starting at `src` and stores the results to
+// `im_block`, advancing the output by `w` int16_t elements per row.
+// Widths 2, 4, 8 and 16 are processed two rows per iteration and count `y`
+// down by 2 until it reaches zero, so `h` must be even and non-zero for
+// those widths. Widths 32, 64 and 128 are processed one row per iteration in
+// 32-pixel chunks. The last branch of each width group is guarded only by an
+// assert.
+static void convolve_2d_sr_hor_2tap_avx2(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ const uint8_t *src_ptr = src;
+ int32_t y = h;
+ int16_t *im = im_block;
+
+ if (w <= 8) {
+ // Narrow blocks: 128-bit coefficient register, two rows per iteration.
+ __m128i coeffs_128;
+
+ prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
+
+ if (w == 2) {
+ do {
+ const __m128i r =
+ x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
+ xy_x_round_store_2x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 2;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i r =
+ x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
+ xy_x_round_store_4x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 4;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(w == 8);
+
+ do {
+ __m128i r[2];
+
+ x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
+ xy_x_round_store_8x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ // Wide blocks: 256-bit coefficient register.
+ __m256i coeffs_256;
+
+ prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
+
+ if (w == 16) {
+ // Still two rows per iteration (2 x 16 = 32 outputs per store).
+ do {
+ __m256i r[2];
+
+ x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // From here on: one row per iteration, one helper call per 32 pixels.
+ do {
+ xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
+ xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+ }
+}
+
+// Horizontal stage of the 2-D single-reference convolve for a 4-tap filter.
+//
+// Filters `h` rows of width `w`, reading from one pixel to the left of `src`,
+// and stores the results to `im_block`, advancing the output by `w` int16_t
+// elements per row. Widths 2, 4, 8 and 16 are processed two rows per
+// iteration, so `h` must be even and non-zero for those widths; widths 32, 64
+// and 128 are processed one row per iteration in 32-pixel chunks.
+//
+// Note: despite the `_ssse3` suffix, widths of 8 and above use AVX2 helpers.
+static void convolve_2d_sr_hor_4tap_ssse3(
+    const uint8_t *const src, const int32_t src_stride, const int32_t w,
+    const int32_t h, const InterpFilterParams *const filter_params_x,
+    const int32_t subpel_x_q4, int16_t *const im_block) {
+  const uint8_t *src_ptr = src - 1;
+  int32_t y = h;
+  int16_t *im = im_block;
+
+  if (w <= 4) {
+    __m128i coeffs_128[2];
+
+    // Only widths 2 and 4 are implemented in this branch; any other value
+    // would fall through without writing anything to im_block.
+    assert(w == 2 || w == 4);
+
+    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+    if (w == 2) {
+      do {
+        const __m128i r =
+            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+        xy_x_round_store_2x2_sse2(r, im);
+        src_ptr += 2 * src_stride;
+        im += 2 * 2;
+        y -= 2;
+      } while (y);
+    } else if (w == 4) {
+      do {
+        const __m128i r =
+            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+        xy_x_round_store_4x2_sse2(r, im);
+        src_ptr += 2 * src_stride;
+        im += 2 * 4;
+        y -= 2;
+      } while (y);
+    }
+  } else {
+    // TODO(chiyotsai@google.com): Add better optimization
+    __m256i coeffs_256[2], filt_256[2];
+
+    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+    // Unaligned loads, matching the 6-tap and 8-tap horizontal kernels, so
+    // this function does not depend on the alignment of the shuffle tables.
+    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+
+    if (w == 8) {
+      do {
+        __m256i res =
+            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
+        xy_x_round_store_8x2_avx2(res, im);
+
+        src_ptr += 2 * src_stride;
+        im += 2 * 8;
+        y -= 2;
+      } while (y);
+    } else if (w == 16) {
+      do {
+        __m256i r[2];
+
+        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
+        xy_x_round_store_32_avx2(r, im);
+        src_ptr += 2 * src_stride;
+        im += 2 * 16;
+        y -= 2;
+      } while (y);
+    } else if (w == 32) {
+      // From here on: one row per iteration, one helper call per 32 pixels.
+      do {
+        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+
+        src_ptr += src_stride;
+        im += 32;
+      } while (--y);
+    } else if (w == 64) {
+      do {
+        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+        src_ptr += src_stride;
+        im += 64;
+      } while (--y);
+    } else {
+      assert(w == 128);
+
+      do {
+        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
+        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
+        src_ptr += src_stride;
+        im += 128;
+      } while (--y);
+    }
+  }
+}
+
+// Horizontal stage of the 2-D single-reference convolve for a 6-tap filter.
+//
+// Filters `h` rows of width `w`, reading from two pixels to the left of
+// `src`, and stores the results to `im_block`, advancing the output by `w`
+// int16_t elements per row. Widths 2, 4, 8 and 16 are processed two rows per
+// iteration (so `h` must be even and non-zero for those widths); widths 32,
+// 64 and 128 are processed one row per iteration in 32-pixel chunks.
+// In the `w <= 4` branch only widths 2 and 4 are handled; other values write
+// nothing.
+static void convolve_2d_sr_hor_6tap_avx2(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block) {
+ // Start 2 pixels left of `src` (presumably the left edge of the filter
+ // window; confirm against the x_convolve_6tap_* helpers).
+ const uint8_t *src_ptr = src - 2;
+ int32_t y = h;
+ int16_t *im = im_block;
+
+ if (w <= 4) {
+ // Narrow blocks: 128-bit coefficient registers, two rows per iteration.
+ __m128i coeffs_128[3];
+
+ prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+ if (w == 2) {
+ do {
+ const __m128i r =
+ x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_2x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 2;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i r =
+ x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ xy_x_round_store_4x2_sse2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 4;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ __m256i coeffs_256[3], filt_256[3];
+
+ // Shuffle-control tables handed to the AVX2 helpers below.
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+
+ prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res =
+ x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
+ xy_x_round_store_8x2_avx2(res, im);
+
+ src_ptr += 2 * src_stride;
+ im += 2 * 8;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
+ xy_x_round_store_32_avx2(r, im);
+ src_ptr += 2 * src_stride;
+ im += 2 * 16;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // From here on: one row per iteration, one helper call per 32 pixels.
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ src_ptr += src_stride;
+ im += 32;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ src_ptr += src_stride;
+ im += 64;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
+ xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
+ xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
+ xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
+ src_ptr += src_stride;
+ im += 128;
+ } while (--y);
+ }
+ }
+}
+
+// Horizontal stage of the 2-D single-reference convolve for an 8-tap filter.
+//
+// Reads `h` rows of width `w`, beginning three pixels to the left of `src`,
+// and writes the filtered rows to `im_block` with `w` int16_t elements per
+// row. Widths 8 and 16 consume two rows per iteration (`h` must be even and
+// non-zero); widths 32, 64 and 128 consume one row per iteration, one helper
+// call per 32-pixel chunk. Any width other than 8/16/32/64 is treated as 128
+// and is only checked by an assert.
+static void convolve_2d_sr_hor_8tap_avx2(
+    const uint8_t *const src, const int32_t src_stride, const int32_t w,
+    const int32_t h, const InterpFilterParams *const filter_params_x,
+    const int32_t subpel_x_q4, int16_t *const im_block) {
+  __m256i taps[4], shuffles[4];
+  const uint8_t *in = src - 3;
+  int16_t *out = im_block;
+  int32_t rows_left = h;
+
+  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, taps);
+
+  // Shuffle-control tables consumed by the helpers below.
+  shuffles[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+  shuffles[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+  shuffles[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+  shuffles[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
+
+  switch (w) {
+    case 8:
+      do {
+        const __m256i two_rows =
+            x_convolve_8tap_8x2_avx2(in, src_stride, taps, shuffles);
+        xy_x_round_store_8x2_avx2(two_rows, out);
+        in += 2 * src_stride;
+        out += 2 * 8;
+        rows_left -= 2;
+      } while (rows_left);
+      break;
+
+    case 16:
+      do {
+        __m256i two_rows[2];
+
+        x_convolve_8tap_16x2_avx2(in, src_stride, taps, shuffles, two_rows);
+        xy_x_round_store_32_avx2(two_rows, out);
+        in += 2 * src_stride;
+        out += 2 * 16;
+        rows_left -= 2;
+      } while (rows_left);
+      break;
+
+    case 32:
+      do {
+        xy_x_8tap_32_avx2(in, taps, shuffles, out);
+        in += src_stride;
+        out += 32;
+      } while (--rows_left);
+      break;
+
+    case 64:
+      do {
+        xy_x_8tap_32_avx2(in, taps, shuffles, out);
+        xy_x_8tap_32_avx2(in + 32, taps, shuffles, out + 32);
+        in += src_stride;
+        out += 64;
+      } while (--rows_left);
+      break;
+
+    default:
+      assert(w == 128);
+
+      do {
+        xy_x_8tap_32_avx2(in, taps, shuffles, out);
+        xy_x_8tap_32_avx2(in + 32, taps, shuffles, out + 32);
+        xy_x_8tap_32_avx2(in + 64, taps, shuffles, out + 64);
+        xy_x_8tap_32_avx2(in + 96, taps, shuffles, out + 96);
+        in += src_stride;
+        out += 128;
+      } while (--rows_left);
+      break;
+  }
+}
+
+// Vertical stage of the 2-D single-reference convolve for a 2-tap filter.
+//
+// Reads the int16_t intermediate block `im_block` (the pointer steps below
+// imply rows of `w` elements) and writes `h` rows of 8-bit pixels to `dst`
+// with stride `dst_stride`. Every path produces two output rows per loop
+// iteration and counts `y` down by 2 until it reaches zero, so `h` must be
+// even and non-zero.
+//
+// Each path pre-loads the first intermediate row into a small state array
+// that is passed to the helper on every call together with `im`.
+// NOTE(review): the helpers presumably load the following row(s) and keep the
+// most recent row in that array for the next iteration; confirm against
+// convolve_avx2.h.
+static void convolve_2d_sr_ver_2tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ if (w <= 4) {
+ __m128i coeffs_128;
+
+ prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
+
+ if (w == 2) {
+ __m128i s_32[2];
+
+ // First row: two int16_t values read as one 32-bit word.
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2], r[2];
+
+ assert(w == 4);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)im);
+
+ do {
+ xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
+ r[0] = xy_y_round_sse2(r[0]);
+ r[1] = xy_y_round_sse2(r[1]);
+ // Narrow the two rounded 32-bit rows to 16 bits before the final store.
+ const __m128i rr = _mm_packs_epi32(r[0], r[1]);
+ pack_store_4x2_sse2(rr, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ __m256i coeffs_256;
+
+ prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
+
+ if (w == 8) {
+ __m128i s_128[2];
+ __m256i r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)im);
+
+ do {
+ xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m256i s_256[2], r[4];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)im);
+
+ do {
+ xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // Two row buffers (two 256-bit registers per 32-wide row) that swap
+ // roles between the two helper calls of each iteration.
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
+ dst);
+ im += 2 * 32;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
+ dst + dst_stride);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ // Same scheme as w == 32, applied to two 32-wide column chunks.
+ __m256i s_256[2][4];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
+ &coeffs_256, dst);
+ xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
+ &coeffs_256, dst + 32);
+ im += 2 * 64;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ &coeffs_256, dst + dst_stride);
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
+ &coeffs_256, dst + dst_stride + 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ // Same scheme again, applied to four 32-wide column chunks.
+ __m256i s_256[2][8];
+
+ assert(w == 128);
+
+ load_16bit_8rows_avx2(im, 16, s_256[0]);
+
+ do {
+ xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
+ &coeffs_256, dst);
+ xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
+ &coeffs_256, dst + 1 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
+ &coeffs_256, dst + 2 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
+ &coeffs_256, dst + 3 * 32);
+ im += 2 * 128;
+ xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ &coeffs_256, dst + dst_stride);
+ xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
+ &coeffs_256, dst + dst_stride + 1 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
+ &coeffs_256, dst + dst_stride + 2 * 32);
+ xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
+ &coeffs_256, dst + dst_stride + 3 * 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+}
+
+// Vertical stage of the 2-D single-reference convolve for a 2-tap filter,
+// variant built on the *_half_pel_* helpers.
+//
+// Same structure as the general 2-tap vertical kernel, but no coefficients
+// are prepared: `filter_params_y` and `subpel_y_q4` are accepted only to
+// match the shared function-pointer signature and are unused.
+// Reads the int16_t intermediate block `im_block` (rows of `w` elements, as
+// implied by the pointer steps) and writes `h` rows of 8-bit pixels to `dst`.
+// Two output rows are produced per iteration, so `h` must be even and
+// non-zero.
+static void convolve_2d_sr_ver_2tap_half_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ if (w == 2) {
+ __m128i s_32[2];
+
+ // First row: two int16_t values read as one 32-bit word.
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
+ const __m128i r = xy_y_round_half_pel_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_64[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)im);
+
+ do {
+ const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
+ const __m128i r = xy_y_round_half_pel_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)im);
+
+ do {
+ const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
+ const __m256i r = xy_y_round_half_pel_avx2(res);
+ pack_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m256i s_256[2], r[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)im);
+
+ do {
+ xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
+ r[0] = xy_y_round_half_pel_avx2(r[0]);
+ r[1] = xy_y_round_half_pel_avx2(r[1]);
+ xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // Two row buffers that swap roles between the two helper calls of each
+ // iteration.
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
+ dst + dst_stride);
+ im += 2 * 32;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ // Same scheme as w == 32, applied to two 32-wide column chunks.
+ __m256i s_256[2][4];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
+ s_256[1] + 0, dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
+ s_256[1] + 2, dst + 32);
+ im += 2 * 64;
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ dst + dst_stride);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ // Same scheme again, applied to four 32-wide column chunks.
+ __m256i s_256[2][8];
+
+ assert(w == 128);
+
+ load_16bit_8rows_avx2(im, 16, s_256[0]);
+
+ do {
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
+ s_256[1] + 0, dst);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
+ s_256[1] + 2, dst + 1 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
+ s_256[1] + 4, dst + 2 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
+ s_256[1] + 6, dst + 3 * 32);
+ im += 2 * 128;
+ xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
+ dst + dst_stride);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
+ xy_y_convolve_2tap_half_pel_32_all_avx2(
+ im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+}
+
+// Vertical stage of the 2-D single-reference convolve for a 4-tap filter.
+//
+// Reads the int16_t intermediate block `im_block` (rows of `w` elements, as
+// implied by the pointer steps) and writes `h` rows of 8-bit pixels to `dst`
+// with stride `dst_stride`. Two output rows are produced per inner-loop
+// iteration, so `h` must be even and non-zero.
+//
+// Each path pre-loads the leading intermediate rows and their interleaved
+// (unpacked) forms into state arrays that are passed to the helper on every
+// call. NOTE(review): the helpers presumably load the newest rows and rotate
+// these arrays; confirm against convolve_avx2.h.
+// For widths 8 and 16, `subpel_y_q4 == 8` selects the *_half_pel_* helpers.
+static void convolve_2d_sr_ver_4tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y = h;
+
+ if (w == 2) {
+ __m128i coeffs_128[2], s_32[4], ss_128[2];
+
+ prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
+
+ // First three rows; each row is two int16_t values read as one 32-bit word.
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+
+ // Interleave vertically adjacent samples at 16-bit granularity.
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+
+ do {
+ const __m128i res =
+ xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[2];
+
+ prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[4];
+ __m256i s_256[2], ss_256[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+
+ do {
+ const __m256i res =
+ xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[4], r[2];
+
+ // Each 256-bit load covers 16 int16_t values, i.e. two 8-wide rows; the
+ // second load starts one row after the first.
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[4];
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+ do {
+ xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[5];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[4], tt_256[4], r[4];
+
+ // ss_* interleaves rows (0,1); tt_* interleaves rows (1,2).
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+
+ do {
+ xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
+ r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i r[4];
+
+ do {
+ xy_y_convolve_4tap_16x2_half_pelavx2(im, s_256, coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ /* Special case for OBMC. According to the AV1 spec the 4-tap filter is
+ not used for widths above 16, but when OBMC predicts from the above
+ block it reduces the block size to W x (h / 2); for example, an above
+ block of 32x8 gives a 32x4 block for OBMC. */
+ int32_t x = 0;
+
+ assert(!(w % 32));
+
+ __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
+ // Outer loop: one 32-wide column strip per pass, handled as two 16-wide
+ // halves. Inner loop: two output rows per iteration down the strip.
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
+ loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
+ coeffs_256, r0);
+ xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1], coeffs_256, r1);
+
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+// Vertical stage of the 2-D single-reference convolve for a 6-tap filter.
+//
+// Reads the int16_t intermediate block `im_block` (rows of `w` elements, as
+// implied by the pointer steps) and writes `h` rows of 8-bit pixels to `dst`
+// with stride `dst_stride`. Two output rows are produced per inner-loop
+// iteration, so `h` must be even and non-zero.
+//
+// Each path pre-loads the leading intermediate rows (and, for most paths,
+// their 16-bit interleaved forms) into state arrays passed to the helper on
+// every call. NOTE(review): the helpers presumably load the newest rows and
+// rotate these arrays; confirm against convolve_avx2.h.
+// For widths 8 and 16, `subpel_y_q4 == 8` selects the *_half_pel_* helpers.
+static void convolve_2d_sr_ver_6tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y;
+
+ if (w == 2) {
+ __m128i coeffs_128[3], s_32[6], ss_128[3];
+
+ prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ // First five rows; each row is two int16_t values read as one 32-bit word.
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+ s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
+ s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+
+ // Interleave vertically adjacent samples at 16-bit granularity.
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+
+ y = h;
+ do {
+ const __m128i res =
+ xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[3];
+
+ prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[6];
+ __m256i s_256[6], ss_256[3];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+ s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+
+ y = h;
+ do {
+ const __m256i res =
+ xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[6], r[2];
+
+ // Each 256-bit load covers 16 int16_t values, i.e. two 8-wide rows;
+ // consecutive loads start one row apart.
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[6];
+
+ // Slots 0-1 hold the low-half interleaves, slots 3-4 the high-half
+ // ones; slots 2 and 5 are left unset here.
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+ do {
+ xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[6];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[6], tt_256[6], r[4];
+
+ // ss_* interleaves row pairs (0,1) and (2,3); tt_* interleaves row
+ // pairs (1,2) and (3,4).
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+ tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
+ tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
+
+ do {
+ xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
+ coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i ss_256[4], r[4];
+
+ do {
+ xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
+ coeffs_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ int32_t x = 0;
+
+ assert(!(w % 32));
+
+ __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
+
+ // Outer loop: one 32-wide column strip per pass, handled as two 16-wide
+ // halves with row stride `w`. Inner loop: two output rows per iteration.
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
+ loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
+ coeffs_256, r0);
+ xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
+ tt_256[1], coeffs_256, r1);
+
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+// Vertical stage of the 2-D single-reference convolve for an 8-tap filter.
+//
+// Reads the int16_t intermediate block `im_block` (rows of `w` elements, as
+// implied by the pointer steps) and writes `h` rows of 8-bit pixels to `dst`
+// with stride `dst_stride`. Two output rows are produced per inner-loop
+// iteration, so `h` must be even and non-zero.
+//
+// Each path pre-loads the leading intermediate rows (and, for most paths,
+// their 16-bit interleaved forms) into state arrays passed to the helper on
+// every call. NOTE(review): the helpers presumably load the newest rows and
+// rotate these arrays; confirm against convolve_avx2.h.
+// For widths 8 and 16, `subpel_y_q4 == 8` selects the *_half_pel_* helpers.
+static void convolve_2d_sr_ver_8tap_avx2(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride) {
+ const int16_t *im = im_block;
+ int32_t y;
+
+ if (w == 2) {
+ __m128i coeffs_128[4], s_32[8], ss_128[4];
+
+ prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
+
+ // First seven rows; each row is two int16_t values read as one 32-bit
+ // word.
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
+ s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
+ s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
+ s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
+ s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
+ s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
+
+ // Interleave vertically adjacent samples at 16-bit granularity.
+ ss_128[0] = _mm_unpacklo_epi16(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi16(src45, src56);
+
+ y = h;
+ do {
+ const __m128i res =
+ xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
+ xy_y_round_store_2x2_sse2(res, dst, dst_stride);
+ im += 2 * 2;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i coeffs_256[4];
+
+ prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 4) {
+ __m128i s_64[8];
+ __m256i s_256[8], ss_256[4];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
+ s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
+ s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
+ s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
+ s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
+ s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
+ s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
+ s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
+
+ y = h;
+ do {
+ const __m256i res =
+ xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
+ xy_y_round_store_4x2_avx2(res, dst, dst_stride);
+ im += 2 * 4;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ __m256i s_256[8], r[2];
+
+ // Each 256-bit load covers 16 int16_t values, i.e. two 8-wide rows;
+ // consecutive loads start one row apart.
+ s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
+ s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[8];
+
+ convolve_8tap_unpack_avx2(s_256, ss_256);
+
+ do {
+ xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
+ xy_y_round_store_8x2_avx2(r, dst, dst_stride);
+ im += 2 * 8;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m256i s_256[8], r[4];
+
+ load_16bit_7rows_avx2(im, 16, s_256);
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ __m256i ss_256[8], tt_256[8];
+
+ // tt_* is built from the same rows shifted down by one (s_256 + 1).
+ convolve_8tap_unpack_avx2(s_256, ss_256);
+ convolve_8tap_unpack_avx2(s_256 + 1, tt_256);
+
+ do {
+ xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ do {
+ xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
+ xy_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ im += 2 * 16;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ int32_t x = 0;
+ __m256i s_256[2][8], r0[4], r1[4];
+
+ assert(!(w % 32));
+
+ __m256i ss_256[2][8], tt_256[2][8];
+
+ // Outer loop: one 32-wide column strip per pass, handled as two 16-wide
+ // halves with row stride `w`. Inner loop: two output rows per iteration.
+ do {
+ const int16_t *s = im + x;
+ uint8_t *d = dst + x;
+
+ load_16bit_7rows_avx2(s, w, s_256[0]);
+ convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
+ convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);
+
+ load_16bit_7rows_avx2(s + 16, w, s_256[1]);
+ convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
+ convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);
+
+ y = h;
+ do {
+ xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
+ tt_256[0], r0);
+ xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
+ ss_256[1], tt_256[1], r1);
+ xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
+ xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);
+
+ s += 2 * w;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+}
+
+// Signature shared by the horizontal-stage kernels: filter `h` rows of width
+// `w` from `src` and write the int16_t intermediate block `im_block`.
+typedef void (*Convolve2dSrHorTapFunc)(
+ const uint8_t *const src, const int32_t src_stride, const int32_t w,
+ const int32_t h, const InterpFilterParams *const filter_params_x,
+ const int32_t subpel_x_q4, int16_t *const im_block);
+
+// Signature shared by the vertical-stage kernels: filter the int16_t
+// intermediate block `im_block` and write `h` rows of width `w` to `dst`.
+typedef void (*Convolve2dSrVerTapFunc)(
+ const int16_t *const im_block, const int32_t w, const int32_t h,
+ const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
+ uint8_t *dst, const int32_t dst_stride);
+
+// Two-pass 2-D single-reference convolve.
+//
+// Looks up the horizontal and vertical kernels from the tap counts reported
+// by get_filter_tap(), runs the horizontal kernel into a stack-allocated
+// int16_t intermediate block, then runs the vertical kernel from that block
+// into `dst`. 12-tap filters are not supported (asserted). `conv_params` is
+// only inspected by asserts (round_0 == 3, round_1 == 11).
+static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
+    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
+  // Horizontal kernels, indexed directly by the horizontal tap count.
+  static const Convolve2dSrHorTapFunc hor_kernels[MAX_FILTER_TAP + 1] = {
+    NULL,                           // 0 taps
+    NULL,                           // 1
+    convolve_2d_sr_hor_2tap_avx2,   // 2
+    NULL,                           // 3
+    convolve_2d_sr_hor_4tap_ssse3,  // 4
+    NULL,                           // 5
+    convolve_2d_sr_hor_6tap_avx2,   // 6
+    NULL,                           // 7
+    convolve_2d_sr_hor_8tap_avx2    // 8
+  };
+  // Vertical kernels. The slot is the vertical tap count, minus one when
+  // subpel_y_q4 == 8; only the 2-tap filter has a distinct kernel in its
+  // odd slot.
+  static const Convolve2dSrVerTapFunc ver_kernels[MAX_FILTER_TAP + 1] = {
+    NULL,                               // 0
+    convolve_2d_sr_ver_2tap_half_avx2,  // 1
+    convolve_2d_sr_ver_2tap_avx2,       // 2
+    convolve_2d_sr_ver_4tap_avx2,       // 3
+    convolve_2d_sr_ver_4tap_avx2,       // 4
+    convolve_2d_sr_ver_6tap_avx2,       // 5
+    convolve_2d_sr_ver_6tap_avx2,       // 6
+    convolve_2d_sr_ver_8tap_avx2,       // 7
+    convolve_2d_sr_ver_8tap_avx2        // 8
+  };
+
+  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
+  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);
+
+  assert(tap_x != 12 && tap_y != 12);
+
+  // The horizontal pass starts (tap_y / 2 - 1) rows above `src`.
+  const int32_t rows_above = (tap_y >> 1) - 1;
+  const uint8_t *const first_row = src - rows_above * src_stride;
+
+  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
+  // permutation.
+  DECLARE_ALIGNED(32, int16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+
+  (void)conv_params;
+
+  assert(conv_params->round_0 == 3);
+  assert(conv_params->round_1 == 11);
+
+  // Horizontal pass over h + tap_y rows (must be an even count).
+  int32_t hh = h + tap_y;
+  assert(!(hh % 2));
+
+  const Convolve2dSrHorTapFunc hor_kernel = hor_kernels[tap_x];
+  hor_kernel(first_row, src_stride, w, hh, filter_params_x, subpel_x_q4,
+             im_block);
+
+  // Vertical pass over the h output rows.
+  const int32_t ver_slot = (subpel_y_q4 == 8) ? tap_y - 1 : tap_y;
+  const Convolve2dSrVerTapFunc ver_kernel = ver_kernels[ver_slot];
+  ver_kernel(im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
+}
+
+#endif // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
diff --git a/third_party/aom/third_party/SVT-AV1/convolve_avx2.h b/third_party/aom/third_party/SVT-AV1/convolve_avx2.h
new file mode 100644
index 0000000000..923cabee7f
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/convolve_avx2.h
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
+#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
+
+ #include <assert.h>
+ #include <string.h>
+
+ #include "EbMemory_AVX2.h"
+ #include "EbMemory_SSE4_1.h"
+ #include "synonyms.h"
+
+ #include "aom_dsp/aom_filter.h"
+ #include "aom_dsp/x86/convolve_avx2.h"
+ #include "aom_dsp/x86/mem_sse2.h"
+
+ // Spreads the taps in coeffs_128 (eight 16-bit values) into per-pair byte
+ // coefficient vectors for the 4-tap kernels. Both 128-bit lanes receive the
+ // same taps; each output repeats the low bytes of one adjacent tap pair in
+ // every 16-bit slot.
+ static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
+                                              __m256i coeffs[2]) {
+   // Shuffle controls: each 16-bit slot selects source bytes {4, 6} / {8, 10}.
+   const __m256i pick_23 = _mm256_set1_epi16(0x0604u);
+   const __m256i pick_45 = _mm256_set1_epi16(0x0a08u);
+   const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
+
+   coeffs[0] = _mm256_shuffle_epi8(taps, pick_23);  // taps 2 3 2 3 ...
+   coeffs[1] = _mm256_shuffle_epi8(taps, pick_45);  // taps 4 5 4 5 ...
+ }
+
+ // 6-tap counterpart of populate_coeffs_4tap_avx2(): emits byte pairs for
+ // taps (1,2), (3,4) and (5,6).
+ static INLINE void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
+                                              __m256i coeffs[3]) {
+   // Shuffle controls: source bytes {2, 4}, {6, 8} and {10, 12}.
+   const __m256i pick_12 = _mm256_set1_epi16(0x0402u);
+   const __m256i pick_34 = _mm256_set1_epi16(0x0806u);
+   const __m256i pick_56 = _mm256_set1_epi16(0x0C0Au);
+   const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
+
+   coeffs[0] = _mm256_shuffle_epi8(taps, pick_12);  // taps 1 2 1 2 ...
+   coeffs[1] = _mm256_shuffle_epi8(taps, pick_34);  // taps 3 4 3 4 ...
+   coeffs[2] = _mm256_shuffle_epi8(taps, pick_56);  // taps 5 6 5 6 ...
+ }
+
+ // 8-tap counterpart of populate_coeffs_4tap_avx2(): emits byte pairs for
+ // taps (0,1), (2,3), (4,5) and (6,7).
+ static INLINE void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
+                                              __m256i coeffs[4]) {
+   // Shuffle controls: source bytes {0, 2}, {4, 6}, {8, 10} and {12, 14}.
+   const __m256i pick_01 = _mm256_set1_epi16(0x0200u);
+   const __m256i pick_23 = _mm256_set1_epi16(0x0604u);
+   const __m256i pick_45 = _mm256_set1_epi16(0x0a08u);
+   const __m256i pick_67 = _mm256_set1_epi16(0x0e0cu);
+   const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
+
+   coeffs[0] = _mm256_shuffle_epi8(taps, pick_01);  // taps 0 1 0 1 ...
+   coeffs[1] = _mm256_shuffle_epi8(taps, pick_23);  // taps 2 3 2 3 ...
+   coeffs[2] = _mm256_shuffle_epi8(taps, pick_45);  // taps 4 5 4 5 ...
+   coeffs[3] = _mm256_shuffle_epi8(taps, pick_67);  // taps 6 7 6 7 ...
+ }
+
+ // Builds the 2-tap coefficient vector for the 8-bit (maddubs) kernels.
+ // Reads filter[3] and filter[4] of the kernel selected by
+ // (subpel_q4 & SUBPEL_MASK), halves them, and repeats their low bytes as a
+ // byte pair in every 16-bit slot of *coeffs.
+ static INLINE void prepare_half_coeffs_2tap_ssse3(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [1] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // filter + 3 is an int16_t address, so it is not guaranteed to be 4-byte
+   // aligned and must not be dereferenced through an int32_t pointer. Use
+   // loadu_int32(), exactly as prepare_half_coeffs_2tap_avx2() does.
+   const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+   // coeffs 3 4 3 4 3 4 3 4
+   *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
+ }
+
+ // 4-tap setup for the 8-bit (maddubs) kernels: loads the eight 16-bit taps of
+ // the kernel selected by (subpel_q4 & SUBPEL_MASK), halves them, and writes
+ // byte pairs for taps (2,3) and (4,5) to coeffs[0..1].
+ static INLINE void prepare_half_coeffs_4tap_ssse3(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [2] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
+ }
+
+ // 6-tap setup for the 8-bit (maddubs) kernels: same procedure as the 4-tap
+ // version, writing byte pairs for taps (1,2), (3,4) and (5,6) to coeffs[0..2].
+ static INLINE void prepare_half_coeffs_6tap_ssse3(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [3] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+   // coeffs 1 2 1 2 1 2 1 2
+   coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
+   // coeffs 3 4 3 4 3 4 3 4
+   coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
+   // coeffs 5 6 5 6 5 6 5 6
+   coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
+ }
+
+ // 8-tap setup for the 8-bit (maddubs) kernels: same procedure as the 4-tap
+ // version, writing byte pairs for taps (0,1), (2,3), (4,5) and (6,7) to
+ // coeffs[0..3].
+ static INLINE void prepare_half_coeffs_8tap_ssse3(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [4] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+
+   // coeffs 0 1 0 1 0 1 0 1
+   coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
+   // coeffs 6 7 6 7 6 7 6 7
+   coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
+ }
+
+ // 256-bit version of the halved 2-tap coefficient setup: reads filter[3] and
+ // filter[4] of the kernel selected by (subpel_q4 & SUBPEL_MASK), halves them,
+ // and repeats their low bytes as a byte pair in every 16-bit slot of *coeffs.
+ static INLINE void prepare_half_coeffs_2tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [1] */) {
+   const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   const __m128i taps_128 = _mm_cvtsi32_si128(loadu_int32(kernel + 3));
+
+   // The taps are halved below to reduce the bits required; the missing shift
+   // is made up when the result is rounded. That is lossless only because
+   // every tap is even, which is what this assertion checks.
+   assert(_mm_test_all_zeros(_mm_and_si128(taps_128, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+
+   const __m256i halved =
+       _mm256_srai_epi16(_mm256_broadcastsi128_si256(taps_128), 1);
+
+   // coeffs 3 4 3 4 3 4 3 4
+   *coeffs = _mm256_shuffle_epi8(halved, _mm256_set1_epi16(0x0200u));
+ }
+
+ // 4-tap setup for the 256-bit 8-bit (maddubs) kernels: loads the eight 16-bit
+ // taps of the kernel selected by (subpel_q4 & SUBPEL_MASK), halves them, and
+ // lets populate_coeffs_4tap_avx2() fill coeffs[0..1].
+ static INLINE void prepare_half_coeffs_4tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [2] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+   populate_coeffs_4tap_avx2(coeffs_1, coeffs);
+ }
+
+ // 6-tap setup for the 256-bit 8-bit (maddubs) kernels: same procedure as the
+ // 4-tap version, filling coeffs[0..2] via populate_coeffs_6tap_avx2().
+ static INLINE void prepare_half_coeffs_6tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [3] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+   populate_coeffs_6tap_avx2(coeffs_1, coeffs);
+ }
+
+ // 8-tap setup for the 256-bit 8-bit (maddubs) kernels: same procedure as the
+ // 4-tap version, filling coeffs[0..3] via populate_coeffs_8tap_avx2().
+ static INLINE void prepare_half_coeffs_8tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [4] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+
+   // right shift all filter co-efficients by 1 to reduce the bits required.
+   // This extra right shift will be taken care of at the end while rounding
+   // the result.
+   // Since all filter co-efficients are even, this change will not affect the
+   // end result
+   assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                             _mm_set1_epi16((short)0xffff)));
+   const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
+   populate_coeffs_8tap_avx2(coeffs_1, coeffs);
+ }
+
+ // Builds the 2-tap coefficient vector for the 16-bit (madd) kernels: reads
+ // filter[3] and filter[4] of the kernel selected by
+ // (subpel_q4 & SUBPEL_MASK) and repeats that 16-bit pair in every 32-bit slot
+ // of coeffs[0]. The taps are used at full precision (no halving).
+ static INLINE void prepare_coeffs_2tap_sse2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [1] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // filter + 3 is an int16_t address, so it is not guaranteed to be 4-byte
+   // aligned and must not be dereferenced through an int32_t pointer. Use
+   // loadu_int32(), exactly as prepare_half_coeffs_2tap_avx2() does.
+   const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+
+   // coeffs 3 4 3 4 3 4 3 4
+   coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+ }
+
+ // 4-tap setup for the 16-bit (madd) kernels: loads the eight 16-bit taps of
+ // the kernel selected by (subpel_q4 & SUBPEL_MASK) and broadcasts tap pairs
+ // (2,3) and (4,5) across coeffs[0] and coeffs[1].
+ static INLINE void prepare_coeffs_4tap_sse2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [2] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeff = _mm_loadu_si128((const __m128i *)filter);
+
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
+ }
+
+ // 6-tap setup for the 16-bit (madd) kernels. Tap pairs (1,2), (3,4), (5,6)
+ // straddle 32-bit boundaries, so a byte shuffle is used in place of the
+ // 32-bit shuffle of the 4- and 8-tap versions.
+ static INLINE void prepare_coeffs_6tap_ssse3(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [3] */) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeff = _mm_loadu_si128((const __m128i *)filter);
+
+   // coeffs 1 2 1 2 1 2 1 2
+   coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
+   // coeffs 3 4 3 4 3 4 3 4
+   coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
+   // coeffs 5 6 5 6 5 6 5 6
+   coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
+ }
+
+ // 8-tap setup for the 16-bit (madd) kernels: broadcasts tap pairs (0,1),
+ // (2,3), (4,5) and (6,7) across coeffs[0..3].
+ static INLINE void prepare_coeffs_8tap_sse2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m128i *const coeffs /* [4] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeff = _mm_loadu_si128((const __m128i *)filter);
+
+   // coeffs 0 1 0 1 0 1 0 1
+   coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+   // coeffs 6 7 6 7 6 7 6 7
+   coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+ }
+
+ // 256-bit version of prepare_coeffs_2tap_sse2(): reads filter[3] and
+ // filter[4] of the kernel selected by (subpel_q4 & SUBPEL_MASK) and repeats
+ // that 16-bit pair in every 32-bit slot of coeffs[0].
+ static INLINE void prepare_coeffs_2tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [1] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // filter + 3 is an int16_t address, so it is not guaranteed to be 4-byte
+   // aligned and must not be dereferenced through an int32_t pointer. Use
+   // loadu_int32(), exactly as prepare_half_coeffs_2tap_avx2() does.
+   const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
+   const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+   // coeffs 3 4 3 4 3 4 3 4
+   coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ }
+
+ // 4-tap setup for the 256-bit 16-bit (madd) kernels: loads the eight 16-bit
+ // taps of the kernel selected by (subpel_q4 & SUBPEL_MASK), copies them to
+ // both 128-bit lanes, and broadcasts tap pairs (2,3) and (4,5) across
+ // coeffs[0] and coeffs[1].
+ static INLINE void prepare_coeffs_4tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [2] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeff_8 = _mm_loadu_si128((const __m128i *)filter);
+   const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
+ }
+
+ // 6-tap setup for the 256-bit 16-bit (madd) kernels. Tap pairs (1,2), (3,4),
+ // (5,6) straddle 32-bit boundaries, so a byte shuffle is used in place of the
+ // 32-bit shuffle of the 4- and 8-tap versions.
+ static INLINE void prepare_coeffs_6tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [3]*/) {
+   const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeffs_8 = _mm_loadu_si128((const __m128i *)filter);
+   const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
+
+   // coeffs 1 2 1 2 1 2 1 2
+   coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
+   // coeffs 3 4 3 4 3 4 3 4
+   coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
+   // coeffs 5 6 5 6 5 6 5 6
+   coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
+ }
+
+ // 8-tap setup for the 256-bit 16-bit (madd) kernels: broadcasts tap pairs
+ // (0,1), (2,3), (4,5) and (6,7) across coeffs[0..3].
+ static INLINE void prepare_coeffs_8tap_avx2(
+     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
+     __m256i *const coeffs /* [4] */) {
+   const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+       filter_params, subpel_q4 & SUBPEL_MASK);
+
+   // The load only reads, so keep the const qualifier of `filter` in the cast.
+   const __m128i coeff_8 = _mm_loadu_si128((const __m128i *)filter);
+   const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+   // coeffs 0 1 0 1 0 1 0 1
+   coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+   // coeffs 2 3 2 3 2 3 2 3
+   coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+   // coeffs 4 5 4 5 4 5 4 5
+   coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+   // coeffs 6 7 6 7 6 7 6 7
+   coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+ }
+
+ // Loads five consecutive rows of sixteen int16_t values (one 256-bit vector
+ // per row) starting at src; rows are `stride` elements apart. Unaligned loads
+ // are used. The casts keep the const qualifier of `src`.
+ static INLINE void load_16bit_5rows_avx2(const int16_t *const src,
+                                          const ptrdiff_t stride,
+                                          __m256i dst[5]) {
+   dst[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
+   dst[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
+   dst[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
+   dst[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
+   dst[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
+ }
+
+ // Seven-row version of load_16bit_5rows_avx2().
+ static INLINE void load_16bit_7rows_avx2(const int16_t *const src,
+                                          const ptrdiff_t stride,
+                                          __m256i dst[7]) {
+   dst[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
+   dst[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
+   dst[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
+   dst[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
+   dst[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
+   dst[5] = _mm256_loadu_si256((const __m256i *)(src + 5 * stride));
+   dst[6] = _mm256_loadu_si256((const __m256i *)(src + 6 * stride));
+ }
+
+ // Eight-row version of load_16bit_5rows_avx2().
+ static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
+                                                    const ptrdiff_t stride,
+                                                    __m256i dst[8]) {
+   dst[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
+   dst[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
+   dst[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
+   dst[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
+   dst[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
+   dst[5] = _mm256_loadu_si256((const __m256i *)(src + 5 * stride));
+   dst[6] = _mm256_loadu_si256((const __m256i *)(src + 6 * stride));
+   dst[7] = _mm256_loadu_si256((const __m256i *)(src + 7 * stride));
+ }
+
+ // Loads five rows of sixteen int16_t values into s_256[0..4] and interleaves
+ // vertically adjacent rows at 16-bit granularity:
+ //   ss_256[0], ss_256[1] = low halves of row pairs (0,1), (2,3)
+ //   ss_256[3], ss_256[4] = high halves of the same pairs
+ //   tt_256[0], tt_256[1] = low halves of row pairs (1,2), (3,4)
+ //   tt_256[3], tt_256[4] = high halves of the same pairs
+ // ss_256[2] and tt_256[2] are NOT written here; callers must fill them.
+ // The load casts keep the const qualifier of `src`.
+ static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
+     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
+     __m256i ss_256[5], __m256i tt_256[5]) {
+   s_256[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
+   s_256[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
+   s_256[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
+   s_256[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
+   s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
+
+   ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+   ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+   ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+
+   tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+   tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
+   tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+   tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
+ }
+
+ // Three-row version: loads s_256[0..2] and interleaves row pairs (0,1) into
+ // ss_256[0] (low) / ss_256[2] (high) and (1,2) into tt_256[0] (low) /
+ // tt_256[2] (high). ss_256[1] and tt_256[1] are NOT written here.
+ // The load casts keep the const qualifier of `src`.
+ static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
+     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
+     __m256i ss_256[3], __m256i tt_256[3]) {
+   s_256[0] = _mm256_loadu_si256((const __m256i *)(src + 0 * stride));
+   s_256[1] = _mm256_loadu_si256((const __m256i *)(src + 1 * stride));
+   s_256[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * stride));
+
+   ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
+   ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
+
+   tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+   tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ }
+
+ // Interleaves the 16-bit elements of row pairs (0,1), (2,3), (4,5) of s[]:
+ // low halves go to ss[0..2], high halves to ss[4..6]. ss[3] is left
+ // untouched.
+ static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6],
+                                              __m256i ss[7]) {
+   int pair;
+
+   for (pair = 0; pair < 3; ++pair) {
+     const __m256i top = s[2 * pair];
+     const __m256i bottom = s[2 * pair + 1];
+     ss[pair] = _mm256_unpacklo_epi16(top, bottom);
+     ss[pair + 4] = _mm256_unpackhi_epi16(top, bottom);
+   }
+ }
+
+ // 8-bit multiply-accumulate helpers (128-bit). Each _mm_maddubs_epi16()
+ // multiplies unsigned bytes of ss[i] by signed bytes of coeffs[i] and sums
+ // adjacent products into 16-bit lanes; the per-pair partial sums are then
+ // added with wrapping 16-bit addition.
+ static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1],
+                                           const __m128i coeffs[1]) {
+   const __m128i sum = _mm_maddubs_epi16(ss[0], coeffs[0]);
+   return sum;
+ }
+
+ // Two tap pairs: ss[0]*coeffs[0] + ss[1]*coeffs[1].
+ static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2],
+                                           const __m128i coeffs[2]) {
+   return _mm_add_epi16(_mm_maddubs_epi16(ss[0], coeffs[0]),
+                        _mm_maddubs_epi16(ss[1], coeffs[1]));
+ }
+
+ // Three tap pairs, summed as (pair0 + pair2) + pair1.
+ static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3],
+                                           const __m128i coeffs[3]) {
+   const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
+   const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
+   const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
+   return _mm_add_epi16(_mm_add_epi16(pair0, pair2), pair1);
+ }
+
+ // Four tap pairs, summed as (pair0 + pair2) + (pair1 + pair3).
+ static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4],
+                                           const __m128i coeffs[4]) {
+   const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
+   const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
+   const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
+   const __m128i pair3 = _mm_maddubs_epi16(ss[3], coeffs[3]);
+   return _mm_add_epi16(_mm_add_epi16(pair0, pair2),
+                        _mm_add_epi16(pair1, pair3));
+ }
+
+ // 8-bit multiply-accumulate helpers (256-bit). Each _mm256_maddubs_epi16()
+ // multiplies unsigned bytes of ss[i] by signed bytes of coeffs[i] and sums
+ // adjacent products into 16-bit lanes; the per-pair partial sums are then
+ // added with wrapping 16-bit addition.
+ static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1],
+                                          const __m256i coeffs[1]) {
+   const __m256i sum = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+   return sum;
+ }
+
+ // Two tap pairs: ss[0]*coeffs[0] + ss[1]*coeffs[1].
+ static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2],
+                                          const __m256i coeffs[2]) {
+   return _mm256_add_epi16(_mm256_maddubs_epi16(ss[0], coeffs[0]),
+                           _mm256_maddubs_epi16(ss[1], coeffs[1]));
+ }
+
+ // Three tap pairs, summed as (pair0 + pair2) + pair1.
+ static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3],
+                                          const __m256i coeffs[3]) {
+   const __m256i pair0 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+   const __m256i pair1 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
+   const __m256i pair2 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
+   return _mm256_add_epi16(_mm256_add_epi16(pair0, pair2), pair1);
+ }
+
+ // Four tap pairs, summed as (pair0 + pair2) + (pair1 + pair3).
+ static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4],
+                                          const __m256i coeffs[4]) {
+   const __m256i pair0 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
+   const __m256i pair1 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
+   const __m256i pair2 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
+   const __m256i pair3 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
+   return _mm256_add_epi16(_mm256_add_epi16(pair0, pair2),
+                           _mm256_add_epi16(pair1, pair3));
+ }
+
+ // 16-bit multiply-accumulate helpers (128-bit). Each _mm_madd_epi16()
+ // multiplies signed 16-bit elements of ss[i] by coeffs[i] and sums adjacent
+ // products into 32-bit lanes; the partial sums are added as 32-bit integers.
+ static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1],
+                                            const __m128i coeffs[1]) {
+   const __m128i sum = _mm_madd_epi16(ss[0], coeffs[0]);
+   return sum;
+ }
+
+ // Two tap pairs: ss[0]*coeffs[0] + ss[1]*coeffs[1].
+ static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2],
+                                            const __m128i coeffs[2]) {
+   return _mm_add_epi32(_mm_madd_epi16(ss[0], coeffs[0]),
+                        _mm_madd_epi16(ss[1], coeffs[1]));
+ }
+
+ // Three tap pairs, summed as (pair0 + pair1) + pair2.
+ static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3],
+                                            const __m128i coeffs[3]) {
+   const __m128i pair0 = _mm_madd_epi16(ss[0], coeffs[0]);
+   const __m128i pair1 = _mm_madd_epi16(ss[1], coeffs[1]);
+   const __m128i pair2 = _mm_madd_epi16(ss[2], coeffs[2]);
+   return _mm_add_epi32(_mm_add_epi32(pair0, pair1), pair2);
+ }
+
+ // Four tap pairs, summed as (pair0 + pair1) + (pair2 + pair3).
+ static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4],
+                                            const __m128i coeffs[4]) {
+   const __m128i pair0 = _mm_madd_epi16(ss[0], coeffs[0]);
+   const __m128i pair1 = _mm_madd_epi16(ss[1], coeffs[1]);
+   const __m128i pair2 = _mm_madd_epi16(ss[2], coeffs[2]);
+   const __m128i pair3 = _mm_madd_epi16(ss[3], coeffs[3]);
+   return _mm_add_epi32(_mm_add_epi32(pair0, pair1),
+                        _mm_add_epi32(pair2, pair3));
+ }
+
+ // 16-bit multiply-accumulate helpers (256-bit). Each _mm256_madd_epi16()
+ // multiplies signed 16-bit elements of ss[i] by coeffs[i] and sums adjacent
+ // products into 32-bit lanes; the partial sums are added as 32-bit integers.
+ static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1],
+                                            const __m256i coeffs[1]) {
+   const __m256i sum = _mm256_madd_epi16(ss[0], coeffs[0]);
+   return sum;
+ }
+
+ // Two tap pairs: ss[0]*coeffs[0] + ss[1]*coeffs[1].
+ static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2],
+                                            const __m256i coeffs[2]) {
+   return _mm256_add_epi32(_mm256_madd_epi16(ss[0], coeffs[0]),
+                           _mm256_madd_epi16(ss[1], coeffs[1]));
+ }
+
+ // Three tap pairs, summed as (pair0 + pair1) + pair2.
+ static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3],
+                                            const __m256i coeffs[3]) {
+   const __m256i pair0 = _mm256_madd_epi16(ss[0], coeffs[0]);
+   const __m256i pair1 = _mm256_madd_epi16(ss[1], coeffs[1]);
+   const __m256i pair2 = _mm256_madd_epi16(ss[2], coeffs[2]);
+   return _mm256_add_epi32(_mm256_add_epi32(pair0, pair1), pair2);
+ }
+
+ // Four tap pairs, summed as (pair0 + pair1) + (pair2 + pair3).
+ static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4],
+                                            const __m256i coeffs[4]) {
+   const __m256i pair0 = _mm256_madd_epi16(ss[0], coeffs[0]);
+   const __m256i pair1 = _mm256_madd_epi16(ss[1], coeffs[1]);
+   const __m256i pair2 = _mm256_madd_epi16(ss[2], coeffs[2]);
+   const __m256i pair3 = _mm256_madd_epi16(ss[3], coeffs[3]);
+   return _mm256_add_epi32(_mm256_add_epi32(pair0, pair1),
+                           _mm256_add_epi32(pair2, pair3));
+ }
+
+ // Horizontal 4-tap step: rearranges the bytes of `data` with each shuffle
+ // control in filt[] to form the per-tap-pair operands, then runs
+ // convolve_4tap_avx2() on them.
+ static INLINE __m256i x_convolve_4tap_avx2(const __m256i data,
+                                            const __m256i coeffs[2],
+                                            const __m256i filt[2]) {
+   __m256i operands[2];
+   int i;
+
+   for (i = 0; i < 2; ++i) operands[i] = _mm256_shuffle_epi8(data, filt[i]);
+
+   return convolve_4tap_avx2(operands, coeffs);
+ }
+
+ // Horizontal 6-tap step; see x_convolve_4tap_avx2().
+ static INLINE __m256i x_convolve_6tap_avx2(const __m256i data,
+                                            const __m256i coeffs[3],
+                                            const __m256i filt[3]) {
+   __m256i operands[3];
+   int i;
+
+   for (i = 0; i < 3; ++i) operands[i] = _mm256_shuffle_epi8(data, filt[i]);
+
+   return convolve_6tap_avx2(operands, coeffs);
+ }
+
+ // Horizontal 8-tap step; see x_convolve_4tap_avx2().
+ static INLINE __m256i x_convolve_8tap_avx2(const __m256i data,
+                                            const __m256i coeffs[4],
+                                            const __m256i filt[4]) {
+   __m256i operands[4];
+   int i;
+
+   for (i = 0; i < 4; ++i) operands[i] = _mm256_shuffle_epi8(data, filt[i]);
+
+   return convolve_8tap_avx2(operands, coeffs);
+ }
+
+ // Per 16-bit lane: (src + 32) >> (FILTER_BITS - 1), arithmetic shift.
+ static INLINE __m256i sr_y_round_avx2(const __m256i src) {
+   const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(32));
+   return _mm256_srai_epi16(biased, FILTER_BITS - 1);
+ }
+
+ // Per 16-bit lane: (src + 2) >> 2, arithmetic shift (128-bit).
+ static INLINE __m128i xy_x_round_sse2(const __m128i src) {
+   const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(2));
+   return _mm_srai_epi16(biased, 2);
+ }
+
+ // Per 16-bit lane: (src + 2) >> 2, arithmetic shift (256-bit).
+ static INLINE __m256i xy_x_round_avx2(const __m256i src) {
+   const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(2));
+   return _mm256_srai_epi16(biased, 2);
+ }
+
+ // Rounds `res` with xy_x_round_sse2() and stores the low 64 bits (four
+ // int16_t values) to dst.
+ static INLINE void xy_x_round_store_2x2_sse2(const __m128i res,
+                                              int16_t *const dst) {
+   _mm_storel_epi64((__m128i *)dst, xy_x_round_sse2(res));
+ }
+
+ // Rounds `res` with xy_x_round_sse2() and stores all eight int16_t values to
+ // dst (unaligned store).
+ static INLINE void xy_x_round_store_4x2_sse2(const __m128i res,
+                                              int16_t *const dst) {
+   _mm_storeu_si128((__m128i *)dst, xy_x_round_sse2(res));
+ }
+
+ // Rounds res[0] and res[1] with xy_x_round_sse2() and stores them back to
+ // back: res[0] at dst, res[1] at dst + 8.
+ static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2],
+                                              int16_t *const dst) {
+   const __m128i first = xy_x_round_sse2(res[0]);
+   const __m128i second = xy_x_round_sse2(res[1]);
+
+   _mm_storeu_si128((__m128i *)dst, first);
+   _mm_storeu_si128((__m128i *)(dst + 8), second);
+ }
+
+ // Rounds `res` with xy_x_round_avx2() and stores sixteen int16_t values to
+ // dst (unaligned store).
+ static INLINE void xy_x_round_store_8x2_avx2(const __m256i res,
+                                              int16_t *const dst) {
+   _mm256_storeu_si256((__m256i *)dst, xy_x_round_avx2(res));
+ }
+
+ // Rounds res[0] and res[1] with xy_x_round_avx2() and stores 32 int16_t
+ // values, regrouping 128-bit halves first: dst[0..15] receives the low halves
+ // of res[0] and res[1], dst[16..31] receives their high halves.
+ static INLINE void xy_x_round_store_32_avx2(const __m256i res[2],
+                                             int16_t *const dst) {
+   const __m256i a = xy_x_round_avx2(res[0]);
+   const __m256i b = xy_x_round_avx2(res[1]);
+   // { a.lo, b.lo }
+   const __m256i lows = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 1);
+   // { a.hi, b.hi }
+   const __m256i highs =
+       _mm256_inserti128_si256(b, _mm256_extracti128_si256(a, 1), 0);
+
+   _mm256_storeu_si256((__m256i *)dst, lows);
+   _mm256_storeu_si256((__m256i *)(dst + 16), highs);
+ }
+
+ // Per 32-bit lane: (src + 1024) >> 11, arithmetic shift (128-bit).
+ static INLINE __m128i xy_y_round_sse2(const __m128i src) {
+   const __m128i biased = _mm_add_epi32(src, _mm_set1_epi32(1024));
+   return _mm_srai_epi32(biased, 11);
+ }
+
+ // Per 16-bit lane: (src + 16) >> 5, arithmetic shift (128-bit).
+ static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
+   const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(16));
+   return _mm_srai_epi16(biased, 5);
+ }
+
+ // Per 32-bit lane: (src + 1024) >> 11, arithmetic shift (256-bit).
+ static INLINE __m256i xy_y_round_avx2(const __m256i src) {
+   const __m256i biased = _mm256_add_epi32(src, _mm256_set1_epi32(1024));
+   return _mm256_srai_epi32(biased, 11);
+ }
+
+ // Rounds r[0] and r[1] with xy_y_round_avx2() and packs the 32-bit results to
+ // 16 bits with signed saturation (per 128-bit lane, as _mm256_packs_epi32
+ // does).
+ static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
+   const __m256i first = xy_y_round_avx2(r[0]);
+   const __m256i second = xy_y_round_avx2(r[1]);
+   return _mm256_packs_epi32(first, second);
+ }
+
+ // Per 16-bit lane: (src + 16) >> 5, arithmetic shift (256-bit).
+ static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
+   const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(16));
+   return _mm256_srai_epi16(biased, 5);
+ }
+
+ // Packs the 16-bit lanes of `res` to unsigned bytes (with saturation) and
+ // writes a 2x2 pixel block: packed bytes 0-1 go to dst, bytes 2-3 go to
+ // dst + stride.
+ static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
+                                        const ptrdiff_t stride) {
+   const __m128i d = _mm_packus_epi16(res, res);
+   const int16_t row0 = (int16_t)_mm_cvtsi128_si32(d);
+   const int16_t row1 = (int16_t)_mm_extract_epi16(d, 1);
+
+   // dst is a byte pointer with no 2-byte alignment guarantee, so the rows are
+   // written with memcpy instead of through a casted int16_t pointer.
+   memcpy(dst, &row0, sizeof(row0));
+   memcpy(dst + stride, &row1, sizeof(row1));
+ }
+
+ // Packs the 16-bit lanes of `res` to unsigned bytes (with saturation) and
+ // hands the result to store_u8_4x2_sse2() for writing at dst / stride.
+ static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
+                                        const ptrdiff_t stride) {
+   store_u8_4x2_sse2(_mm_packus_epi16(res, res), dst, stride);
+ }
+
+ // Packs `res` to unsigned bytes and writes the first 4 packed bytes of each
+ // 128-bit lane: low lane to dst, high lane to dst + stride.
+ static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
+                                        const ptrdiff_t stride) {
+   const __m256i packed = _mm256_packus_epi16(res, res);
+   const __m128i row0 = _mm256_castsi256_si128(packed);
+   const __m128i row1 = _mm256_extracti128_si256(packed, 1);
+
+   xx_storel_32(dst, row0);
+   xx_storel_32(dst + stride, row1);
+ }
+
+ // Packs `res` to unsigned bytes and writes the first 8 packed bytes of each
+ // 128-bit lane: low lane to dst, high lane to dst + stride.
+ static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
+                                        const ptrdiff_t stride) {
+   const __m256i packed = _mm256_packus_epi16(res, res);
+   const __m128i row0 = _mm256_castsi256_si128(packed);
+   const __m128i row1 = _mm256_extracti128_si256(packed, 1);
+
+   _mm_storel_epi64((__m128i *)dst, row0);
+   _mm_storel_epi64((__m128i *)(dst + stride), row1);
+ }
+
+ // Packs res0/res1 to unsigned bytes (per 128-bit lane) and passes the vector
+ // to storeu_u8_16x2_avx2() for writing at dst / stride.
+ static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
+                                         uint8_t *const dst,
+                                         const ptrdiff_t stride) {
+   storeu_u8_16x2_avx2(_mm256_packus_epi16(res0, res1), dst, stride);
+ }
+
+ // Like pack_store_16x2_avx2(), but reorders the four 64-bit quarters of the
+ // packed vector to 0, 2, 1, 3 (permute control 0xD8) before storing.
+ static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0,
+                                              const __m256i res1,
+                                              uint8_t *const dst,
+                                              const ptrdiff_t stride) {
+   const __m256i packed = _mm256_packus_epi16(res0, res1);
+   storeu_u8_16x2_avx2(_mm256_permute4x64_epi64(packed, 0xD8), dst, stride);
+ }
+
+ // Packs res0/res1 to unsigned bytes, reorders the four 64-bit quarters to
+ // 0, 2, 1, 3 (permute control 0xD8), and stores 32 bytes at dst.
+ static INLINE void pack_store_32_avx2(const __m256i res0, const __m256i res1,
+                                       uint8_t *const dst) {
+   const __m256i packed = _mm256_packus_epi16(res0, res1);
+   _mm256_storeu_si256((__m256i *)dst,
+                       _mm256_permute4x64_epi64(packed, 0xD8));
+ }
+
+ // Rounds the 32-bit lanes of `res` with xy_y_round_sse2(), narrows them to
+ // 16 bits with signed saturation, and writes a 2x2 block through
+ // pack_store_2x2_sse2().
+ static INLINE void xy_y_round_store_2x2_sse2(const __m128i res,
+                                              uint8_t *const dst,
+                                              const ptrdiff_t stride) {
+   const __m128i rounded = xy_y_round_sse2(res);
+   pack_store_2x2_sse2(_mm_packs_epi32(rounded, rounded), dst, stride);
+ }
+
+ // Rounds the 32-bit lanes of `res` with xy_y_round_avx2(), narrows them to
+ // 16 bits with signed saturation, and writes a 4x2 block through
+ // pack_store_4x2_avx2().
+ static INLINE void xy_y_round_store_4x2_avx2(const __m256i res,
+                                              uint8_t *const dst,
+                                              const ptrdiff_t stride) {
+   const __m256i rounded = xy_y_round_avx2(res);
+   pack_store_4x2_avx2(_mm256_packs_epi32(rounded, rounded), dst, stride);
+ }
+
+ // Packs the 16-bit lanes of res0/res1 to unsigned bytes (with saturation,
+ // per 128-bit lane) and stores the 32 resulting bytes at dst.
+ static INLINE void xy_y_pack_store_32_avx2(const __m256i res0,
+ const __m256i res1,
+ uint8_t *const dst) {
+ const __m256i d = _mm256_packus_epi16(res0, res1);
+ // Unlike pack_store_32_avx2(), no _mm256_permute4x64_epi64(d, 0xD8) is
+ // applied after packing. NOTE(review): presumably the inputs already arrive
+ // in an order that makes the lane fix-up unnecessary - confirm against the
+ // callers.
+ _mm256_storeu_si256((__m256i *)dst, d);
+ }
+
+ // Rounds and narrows r0[0..1] and r1[0..1] with xy_y_round_16_avx2(), then
+ // packs and stores 32 bytes at dst via xy_y_pack_store_32_avx2().
+ static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2],
+                                             const __m256i r1[2],
+                                             uint8_t *const dst) {
+   const __m256i first = xy_y_round_16_avx2(r0);
+   const __m256i second = xy_y_round_16_avx2(r1);
+   xy_y_pack_store_32_avx2(first, second, dst);
+ }
+
+ // Packs the 16-bit lanes of res0/res1 to unsigned bytes (with saturation,
+ // per 128-bit lane) and stores the 32 bytes at dst without any lane permute.
+ static INLINE void convolve_store_32_avx2(const __m256i res0,
+                                           const __m256i res1,
+                                           uint8_t *const dst) {
+   _mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(res0, res1));
+ }
+
+ // Per 16-bit lane: (src + 34) >> 6, arithmetic shift (128-bit).
+ // NOTE(review): 34 looks like two rounding offsets (2 and 32) folded into a
+ // single add - confirm against the reference convolve.
+ static INLINE __m128i sr_x_round_sse2(const __m128i src) {
+   const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(34));
+   return _mm_srai_epi16(biased, 6);
+ }
+
+ // Per 16-bit lane: (src + 34) >> 6, arithmetic shift (256-bit).
+ static INLINE __m256i sr_x_round_avx2(const __m256i src) {
+   const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(34));
+   return _mm256_srai_epi16(biased, 6);
+ }
+
+ // Per 16-bit lane: (src + 32) >> (FILTER_BITS - 1), arithmetic shift
+ // (128-bit).
+ static INLINE __m128i sr_y_round_sse2(const __m128i src) {
+   const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(32));
+   return _mm_srai_epi16(biased, FILTER_BITS - 1);
+ }
+
+ // Rounds `res` with sr_x_round_avx2() and writes it as an 8x2 byte block via
+ // pack_store_8x2_avx2().
+ static INLINE void sr_x_round_store_8x2_avx2(const __m256i res,
+                                              uint8_t *const dst,
+                                              const ptrdiff_t dst_stride) {
+   pack_store_8x2_avx2(sr_x_round_avx2(res), dst, dst_stride);
+ }
+
+ // Rounds res[0] and res[1] with sr_x_round_avx2() and writes them as a 16x2
+ // byte block via pack_store_16x2_avx2().
+ static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2],
+                                               uint8_t *const dst,
+                                               const ptrdiff_t dst_stride) {
+   const __m256i first = sr_x_round_avx2(res[0]);
+   const __m256i second = sr_x_round_avx2(res[1]);
+
+   pack_store_16x2_avx2(first, second, dst, dst_stride);
+ }
+
+ // Rounds res[0] and res[1] with sr_x_round_avx2() and stores 32 bytes at dst
+ // via convolve_store_32_avx2().
+ static INLINE void sr_x_round_store_32_avx2(const __m256i res[2],
+                                             uint8_t *const dst) {
+   const __m256i first = sr_x_round_avx2(res[0]);
+   const __m256i second = sr_x_round_avx2(res[1]);
+
+   convolve_store_32_avx2(first, second, dst);
+ }
+
+ // Rounds `res` with sr_y_round_avx2() and writes it as an 8x2 byte block via
+ // pack_store_8x2_avx2().
+ static INLINE void sr_y_round_store_8x2_avx2(const __m256i res,
+                                              uint8_t *const dst,
+                                              const ptrdiff_t dst_stride) {
+   pack_store_8x2_avx2(sr_y_round_avx2(res), dst, dst_stride);
+ }
+
+ // Rounds res[0] and res[1] with sr_y_round_avx2() and writes them as a 16x2
+ // byte block via pack_store_16x2_avx2().
+ static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2],
+                                               uint8_t *const dst,
+                                               const ptrdiff_t dst_stride) {
+   const __m256i first = sr_y_round_avx2(res[0]);
+   const __m256i second = sr_y_round_avx2(res[1]);
+
+   pack_store_16x2_avx2(first, second, dst, dst_stride);
+ }
+
+// Loads 32 bytes from |src| into |*s1|, averages them byte-wise with |s0|
+// using _mm256_avg_epu8() and stores the 32 averaged bytes at |dst|.
+// |*s1| is left holding the freshly loaded bytes for the caller.
+static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
+                                         const __m256i s0, __m256i *const s1,
+                                         uint8_t *const dst) {
+  const __m256i loaded = _mm256_loadu_si256((const __m256i *)src);
+  *s1 = loaded;
+  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu8(s0, loaded));
+}
+
+// Loads 32 bytes starting at |src| and 32 bytes starting at |src + 1|,
+// averages the two vectors byte-wise with _mm256_avg_epu8() and stores the
+// result at |dst|.
+static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
+                                         uint8_t *const dst) {
+  const __m256i left = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i right = _mm256_loadu_si256((const __m256i *)(src + 1));
+  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu8(left, right));
+}
+
+// Horizontal 2-tap helper for a 2x2 block.
+//
+// The rows are fetched with load_u8_4x2_sse4_1(). The shuffle mask then
+// builds the adjacent byte pairs (0,1), (1,2) and (4,5), (5,6) in the low
+// 8 bytes; the remaining selectors are 0. The paired bytes are passed to
+// convolve_2tap_ssse3() with |coeffs| and its result is returned.
+static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
+                                                 const ptrdiff_t stride,
+                                                 const __m128i coeffs[1]) {
+  const __m128i pair_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i rows = load_u8_4x2_sse4_1(src, stride);
+  const __m128i pairs = _mm_shuffle_epi8(rows, pair_mask);
+
+  return convolve_2tap_ssse3(&pairs, coeffs);
+}
+
+// Horizontal 2-tap helper for a 4x2 block.
+//
+// The rows are fetched with load_u8_8x2_sse2(). The shuffle mask builds the
+// adjacent byte pairs (0,1)..(3,4) in the low half and (8,9)..(11,12) in the
+// high half. The paired bytes are passed to convolve_2tap_ssse3() with
+// |coeffs| and its result is returned.
+static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[1]) {
+  const __m128i pair_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+  const __m128i rows = load_u8_8x2_sse2(src, stride);
+  const __m128i pairs = _mm_shuffle_epi8(rows, pair_mask);
+
+  return convolve_2tap_ssse3(&pairs, coeffs);
+}
+
+// Horizontal 2-tap helper for two rows.
+//
+// Loads 16 bytes from |src| and from |src + stride|. For each row, the low
+// 8 bytes are interleaved with the same row shifted right by one byte, so
+// every byte sits next to its right-hand neighbour. Each interleaved row is
+// passed to convolve_2tap_ssse3(); the results go to r[0] (first row) and
+// r[1] (second row).
+static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m128i coeffs[1],
+                                             __m128i r[2]) {
+  const __m128i row0 = _mm_loadu_si128((const __m128i *)src);
+  const __m128i row1 = _mm_loadu_si128((const __m128i *)(src + stride));
+  const __m128i row0_next = _mm_srli_si128(row0, 1);
+  const __m128i row1_next = _mm_srli_si128(row1, 1);
+  const __m128i pairs0 = _mm_unpacklo_epi8(row0, row0_next);
+  const __m128i pairs1 = _mm_unpacklo_epi8(row1, row1_next);
+
+  r[0] = convolve_2tap_ssse3(&pairs0, coeffs);
+  r[1] = convolve_2tap_ssse3(&pairs1, coeffs);
+}
+
+// Horizontal 2-tap helper for two rows, AVX2 variant.
+//
+// Loads 16 bytes from |src| and from |src + stride| and places them in the
+// low and high 128-bit lanes of one vector. A second vector holds the same
+// rows shifted right by one byte. Interleaving the low 8 bytes of each lane
+// pairs every byte with its right-hand neighbour; the result is passed to
+// convolve_2tap_avx2() and returned.
+static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[1]) {
+  const __m128i row0 = _mm_loadu_si128((const __m128i *)src);
+  const __m128i row1 = _mm_loadu_si128((const __m128i *)(src + stride));
+  const __m128i row0_next = _mm_srli_si128(row0, 1);
+  const __m128i row1_next = _mm_srli_si128(row1, 1);
+  const __m256i rows = _mm256_setr_m128i(row0, row1);
+  const __m256i rows_next = _mm256_setr_m128i(row0_next, row1_next);
+  const __m256i pairs = _mm256_unpacklo_epi8(rows, rows_next);
+
+  return convolve_2tap_avx2(&pairs, coeffs);
+}
+
+// Horizontal 2-tap helper for a 16x2 block.
+//
+// Fetches the two rows with loadu_8bit_16x2_avx2() once at |src| and once at
+// |src + 1|, then interleaves the two vectors byte-wise (low and high halves
+// of each lane). Each interleaved vector is passed to convolve_2tap_avx2();
+// the unpacklo result goes to r[0] and the unpackhi result to r[1].
+static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m256i coeffs[1],
+                                             __m256i r[2]) {
+  const __m256i cur = loadu_8bit_16x2_avx2(src, stride);
+  const __m256i next = loadu_8bit_16x2_avx2(src + 1, stride);
+  const __m256i pairs_lo = _mm256_unpacklo_epi8(cur, next);
+  const __m256i pairs_hi = _mm256_unpackhi_epi8(cur, next);
+
+  r[0] = convolve_2tap_avx2(&pairs_lo, coeffs);
+  r[1] = convolve_2tap_avx2(&pairs_hi, coeffs);
+}
+
+// Horizontal 2-tap helper for 32 pixels of a single row.
+//
+// Loads 32 bytes at |src| and 32 bytes at |src + 1|, interleaves them
+// byte-wise and passes the unpacklo / unpackhi vectors to
+// convolve_2tap_avx2(), writing the results to r[0] and r[1] respectively.
+static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src,
+                                           const __m256i coeffs[1],
+                                           __m256i r[2]) {
+  const __m256i cur = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i next = _mm256_loadu_si256((const __m256i *)(src + 1));
+  const __m256i pairs_lo = _mm256_unpacklo_epi8(cur, next);
+  const __m256i pairs_hi = _mm256_unpackhi_epi8(cur, next);
+
+  r[0] = convolve_2tap_avx2(&pairs_lo, coeffs);
+  r[1] = convolve_2tap_avx2(&pairs_hi, coeffs);
+}
+
+// Horizontal 4-tap helper for a 2x2 block.
+//
+// The rows are fetched with load_u8_8x2_sse2(). Two shuffles build adjacent
+// byte pairs in the low 8 bytes (the other selectors are 0): the first
+// starts at byte offsets 0 and 8, the second at offsets 2 and 10. Both
+// shuffled vectors are passed to convolve_4tap_ssse3() with |coeffs|.
+static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[2]) {
+  const __m128i mask01 =
+      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i mask23 =
+      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i rows = load_u8_8x2_sse2(src, stride);
+  __m128i pairs[2];
+
+  pairs[0] = _mm_shuffle_epi8(rows, mask01);
+  pairs[1] = _mm_shuffle_epi8(rows, mask23);
+  return convolve_4tap_ssse3(pairs, coeffs);
+}
+
+// Horizontal 4-tap helper for a 4x2 block.
+//
+// The rows are fetched with load_u8_8x2_sse2(). Two full-width shuffles
+// build adjacent byte pairs: the first starts at byte offsets 0 and 8, the
+// second at offsets 2 and 10. Both shuffled vectors are passed to
+// convolve_4tap_ssse3() with |coeffs|.
+static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[2]) {
+  const __m128i rows = load_u8_8x2_sse2(src, stride);
+  const __m128i mask01 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
+  const __m128i mask23 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
+  __m128i pairs[2];
+
+  pairs[0] = _mm_shuffle_epi8(rows, mask01);
+  pairs[1] = _mm_shuffle_epi8(rows, mask23);
+  return convolve_4tap_ssse3(pairs, coeffs);
+}
+
+// Fetches two rows with loadu_8bit_16x2_avx2() and returns the result of
+// x_convolve_4tap_avx2() applied to them with |coeffs| and |filt|.
+static INLINE __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[2],
+                                               const __m256i filt[2]) {
+  const __m256i rows = loadu_8bit_16x2_avx2(src, stride);
+
+  return x_convolve_4tap_avx2(rows, coeffs, filt);
+}
+
+// Runs x_convolve_4tap_8x2_avx2() twice: at |src| for r[0] and 8 bytes
+// further along at |src + 8| for r[1].
+static INLINE void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
+                                             const int32_t src_stride,
+                                             const __m256i coeffs[2],
+                                             const __m256i filt[2],
+                                             __m256i r[2]) {
+  const uint8_t *const src_hi = src + 8;
+
+  r[0] = x_convolve_4tap_8x2_avx2(src, src_stride, coeffs, filt);
+  r[1] = x_convolve_4tap_8x2_avx2(src_hi, src_stride, coeffs, filt);
+}
+
+// Loads 32 bytes at |src| and 32 bytes at |src + 8| (the two loads overlap)
+// and applies x_convolve_4tap_avx2() to each, writing r[0] and r[1].
+static INLINE void x_convolve_4tap_32_avx2(const uint8_t *const src,
+                                           const __m256i coeffs[2],
+                                           const __m256i filt[2],
+                                           __m256i r[2]) {
+  const __m256i at0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i at8 = _mm256_loadu_si256((const __m256i *)(src + 8));
+
+  r[0] = x_convolve_4tap_avx2(at0, coeffs, filt);
+  r[1] = x_convolve_4tap_avx2(at8, coeffs, filt);
+}
+
+// Horizontal 6-tap helper for a 2x2 block.
+//
+// The rows are fetched with load_u8_8x2_sse2(). Three shuffles build
+// adjacent byte pairs in the low 8 bytes (the other selectors are 0),
+// starting at byte offsets 0/8, 2/10 and 4/12 respectively. The three
+// shuffled vectors are passed to convolve_6tap_ssse3() with |coeffs|.
+static INLINE __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[3]) {
+  const __m128i mask01 =
+      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i mask23 =
+      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i mask45 =
+      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i rows = load_u8_8x2_sse2(src, stride);
+  __m128i pairs[3];
+
+  pairs[0] = _mm_shuffle_epi8(rows, mask01);
+  pairs[1] = _mm_shuffle_epi8(rows, mask23);
+  pairs[2] = _mm_shuffle_epi8(rows, mask45);
+  return convolve_6tap_ssse3(pairs, coeffs);
+}
+
+// Horizontal 6-tap helper named for a 4x2 block.
+//
+// The rows are fetched with load_u8_8x2_sse2(). Three shuffles build
+// adjacent byte pairs starting at byte offsets 0/8, 2/10 and 4/12, and the
+// three shuffled vectors are passed to convolve_6tap_ssse3() with |coeffs|.
+//
+// NOTE(review): the three masks are identical to those of
+// x_convolve_6tap_2x2_ssse3() and only populate the low 8 bytes (the upper
+// 8 selectors are all 0), whereas x_convolve_4tap_4x2_ssse3() uses masks
+// that fill all 16 bytes. Confirm whether this variant is reachable and
+// whether the masks are intended.
+static INLINE __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[3]) {
+  const __m128i rows = load_u8_8x2_sse2(src, stride);
+  const __m128i mask01 =
+      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i mask23 =
+      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i mask45 =
+      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
+  __m128i pairs[3];
+
+  pairs[0] = _mm_shuffle_epi8(rows, mask01);
+  pairs[1] = _mm_shuffle_epi8(rows, mask23);
+  pairs[2] = _mm_shuffle_epi8(rows, mask45);
+  return convolve_6tap_ssse3(pairs, coeffs);
+}
+
+// Fetches two rows with loadu_8bit_16x2_avx2() and returns the result of
+// x_convolve_6tap_avx2() applied to them with |coeffs| and |filt|.
+static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[3],
+                                               const __m256i filt[3]) {
+  const __m256i rows = loadu_8bit_16x2_avx2(src, stride);
+
+  return x_convolve_6tap_avx2(rows, coeffs, filt);
+}
+
+// Runs x_convolve_6tap_8x2_avx2() twice: at |src| for r[0] and 8 bytes
+// further along at |src + 8| for r[1].
+static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
+                                             const int32_t src_stride,
+                                             const __m256i coeffs[3],
+                                             const __m256i filt[3],
+                                             __m256i r[2]) {
+  const uint8_t *const src_hi = src + 8;
+
+  r[0] = x_convolve_6tap_8x2_avx2(src, src_stride, coeffs, filt);
+  r[1] = x_convolve_6tap_8x2_avx2(src_hi, src_stride, coeffs, filt);
+}
+
+// Loads 32 bytes at |src| and 32 bytes at |src + 8| (the two loads overlap)
+// and applies x_convolve_6tap_avx2() to each, writing r[0] and r[1].
+static INLINE void x_convolve_6tap_32_avx2(const uint8_t *const src,
+                                           const __m256i coeffs[3],
+                                           const __m256i filt[3],
+                                           __m256i r[2]) {
+  const __m256i at0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i at8 = _mm256_loadu_si256((const __m256i *)(src + 8));
+
+  r[0] = x_convolve_6tap_avx2(at0, coeffs, filt);
+  r[1] = x_convolve_6tap_avx2(at8, coeffs, filt);
+}
+
+// Fetches two rows with loadu_8bit_16x2_avx2() and returns the result of
+// x_convolve_8tap_avx2() applied to them with |coeffs| and |filt|.
+static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[4],
+                                               const __m256i filt[4]) {
+  const __m256i rows = loadu_8bit_16x2_avx2(src, stride);
+
+  return x_convolve_8tap_avx2(rows, coeffs, filt);
+}
+
+// Runs x_convolve_8tap_8x2_avx2() twice: at |src| for r[0] and 8 bytes
+// further along at |src + 8| for r[1].
+static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
+                                                       const int32_t src_stride,
+                                                       const __m256i coeffs[4],
+                                                       const __m256i filt[4],
+                                                       __m256i r[2]) {
+  const uint8_t *const src_hi = src + 8;
+
+  r[0] = x_convolve_8tap_8x2_avx2(src, src_stride, coeffs, filt);
+  r[1] = x_convolve_8tap_8x2_avx2(src_hi, src_stride, coeffs, filt);
+}
+
+// Loads 32 bytes at |src| and 32 bytes at |src + 8| (the two loads overlap)
+// and applies x_convolve_8tap_avx2() to each, writing r[0] and r[1].
+static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
+                                                     const __m256i coeffs[4],
+                                                     const __m256i filt[4],
+                                                     __m256i r[2]) {
+  const __m256i at0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i at8 = _mm256_loadu_si256((const __m256i *)(src + 8));
+
+  r[0] = x_convolve_8tap_avx2(at0, coeffs, filt);
+  r[1] = x_convolve_8tap_avx2(at8, coeffs, filt);
+}
+
+// Vertical 2-tap helper for a 2-pixel-wide, 2-row step.
+//
+// |s_16[0]| is read as the first row and is expected to have been filled by
+// the caller (or by the previous call). Two bytes are read from
+// |src + stride| into |s_16[1]| and two bytes from |src + 2 * stride| into
+// |s_16[0]|, so |s_16[0]| holds the newest row when the function returns.
+// The row pairs (old s_16[0], s_16[1]) and (s_16[1], new s_16[0]) are
+// interleaved byte-wise and passed to convolve_2tap_ssse3() with |coeffs|;
+// its result is returned.
+//
+// The 2-byte reads go through loadu_int16() instead of dereferencing
+// (int16_t *)src: |src| is a byte pointer with no 2-byte alignment
+// guarantee, so the cast-and-dereference form is undefined behaviour. This
+// also matches y_convolve_4tap_2x2_ssse3() and its 6/8-tap siblings.
+static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[1],
+                                                __m128i s_16[2]) {
+  __m128i s_128[2];
+
+  s_16[1] = _mm_cvtsi32_si128(loadu_int16(src + stride));
+  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+  s_16[0] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
+  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
+  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+  return convolve_2tap_ssse3(&ss, coeffs);
+}
+
+// Vertical 2-tap helper for a 4-pixel-wide, 2-row step.
+//
+// |s_32[0]| is read as the first row and is expected to have been filled by
+// the caller (or by the previous call). Four bytes are read from
+// |src + stride| into |s_32[1]| and four bytes from |src + 2 * stride| into
+// |s_32[0]|, so |s_32[0]| holds the newest row when the function returns.
+// The row pairs (old s_32[0], s_32[1]) and (s_32[1], new s_32[0]) are
+// interleaved byte-wise and passed to convolve_2tap_ssse3() with |coeffs|;
+// its result is returned.
+//
+// The 4-byte reads go through loadu_int32() instead of dereferencing
+// (int32_t *)src: |src| is a byte pointer with no 4-byte alignment
+// guarantee, so the cast-and-dereference form is undefined behaviour. This
+// also matches y_convolve_4tap_4x2_ssse3() and its 6/8-tap siblings.
+static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[1],
+                                                __m128i s_32[2]) {
+  __m128i s_128[2];
+
+  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
+  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
+  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+  return convolve_2tap_ssse3(&ss, coeffs);
+}
+
+// Vertical 2-tap helper for an 8-pixel-wide, 2-row step.
+//
+// |s_64[0]| is read as the first row (expected to be filled beforehand).
+// Eight bytes are loaded from |src + stride| into |s_64[1]| and eight from
+// |src + 2 * stride| into |s_64[0]|. The row pairs (old s_64[0], s_64[1])
+// and (s_64[1], new s_64[0]) each fill the two lanes of a 256-bit vector;
+// these are interleaved byte-wise and passed to convolve_2tap_avx2().
+static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[1],
+                                               __m128i s_64[2]) {
+  s_64[1] = _mm_loadl_epi64((const __m128i *)(src + stride));
+  const __m256i rows01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+  s_64[0] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+  const __m256i rows12 = _mm256_setr_m128i(s_64[1], s_64[0]);
+  const __m256i pairs = _mm256_unpacklo_epi8(rows01, rows12);
+
+  return convolve_2tap_avx2(&pairs, coeffs);
+}
+
+// Vertical 2-tap helper for a 16-pixel-wide, 2-row step.
+//
+// |s_128[0]| is read as the first row (expected to be filled beforehand).
+// Sixteen bytes are loaded from |src + stride| into |s_128[1]| and sixteen
+// from |src + 2 * stride| into |s_128[0]|. The row pairs are placed in the
+// lanes of two 256-bit vectors, interleaved byte-wise (unpacklo / unpackhi)
+// and passed to convolve_2tap_avx2(), producing r[0] and r[1].
+static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m256i coeffs[1],
+                                             __m128i s_128[2], __m256i r[2]) {
+  s_128[1] = _mm_loadu_si128((const __m128i *)(src + stride));
+  const __m256i rows01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+  s_128[0] = _mm_loadu_si128((const __m128i *)(src + 2 * stride));
+  const __m256i rows12 = _mm256_setr_m128i(s_128[1], s_128[0]);
+  const __m256i pairs_lo = _mm256_unpacklo_epi8(rows01, rows12);
+  const __m256i pairs_hi = _mm256_unpackhi_epi8(rows01, rows12);
+
+  r[0] = convolve_2tap_avx2(&pairs_lo, coeffs);
+  r[1] = convolve_2tap_avx2(&pairs_hi, coeffs);
+}
+
+// Vertical 2-tap helper for 32 pixels of one output row.
+//
+// Loads 32 bytes from |src| into |*s1|, interleaves |s0| with the new row
+// byte-wise (unpacklo / unpackhi) and passes both interleaved vectors to
+// convolve_2tap_avx2(), producing r[0] and r[1].
+static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src,
+                                           const __m256i coeffs[1],
+                                           const __m256i s0, __m256i *const s1,
+                                           __m256i r[2]) {
+  const __m256i loaded = _mm256_loadu_si256((const __m256i *)src);
+  *s1 = loaded;
+  const __m256i pairs_lo = _mm256_unpacklo_epi8(s0, loaded);
+  const __m256i pairs_hi = _mm256_unpackhi_epi8(s0, loaded);
+
+  r[0] = convolve_2tap_avx2(&pairs_lo, coeffs);
+  r[1] = convolve_2tap_avx2(&pairs_hi, coeffs);
+}
+
+// Vertical 4-tap helper for a 2-pixel-wide, 2-row step.
+//
+// |s_16[2]| is read as the older row (expected to be filled beforehand).
+// Two bytes are loaded from |src + stride| into |s_16[3]| and two from
+// |src + 2 * stride| into |s_16[2]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[1]|; |ss_128[0]| is not written here and is
+// presumably prepared by the caller. |ss_128| is then passed to
+// convolve_4tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[2],
+                                                __m128i s_16[4],
+                                                __m128i ss_128[2]) {
+  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
+  const __m128i older_pair = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi16(s_16[3], s_16[2]);
+
+  ss_128[1] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_4tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 4-tap helper for a 4-pixel-wide, 2-row step.
+//
+// |s_32[2]| is read as the older row (expected to be filled beforehand).
+// Four bytes are loaded from |src + stride| into |s_32[3]| and four from
+// |src + 2 * stride| into |s_32[2]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[1]|; |ss_128[0]| is not written here and is
+// presumably prepared by the caller. |ss_128| is then passed to
+// convolve_4tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[2],
+                                                __m128i s_32[4],
+                                                __m128i ss_128[2]) {
+  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
+  const __m128i older_pair = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi32(s_32[3], s_32[2]);
+
+  ss_128[1] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_4tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 4-tap helper for an 8-pixel-wide, 2-row step.
+//
+// |s_64[2]| is read as the older row (expected to be filled beforehand).
+// Eight bytes are loaded from |src + stride| into |s_64[3]| and eight from
+// |src + 2 * stride| into |s_64[2]|. The row pairs fill the lanes of two
+// 256-bit vectors, which are interleaved byte-wise into |ss_256[1]|;
+// |ss_256[0]| is not written here and is presumably prepared by the caller.
+// |ss_256| is then passed to convolve_4tap_avx2() with |coeffs|.
+static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[2],
+                                               __m128i s_64[4],
+                                               __m256i ss_256[2]) {
+  s_64[3] = _mm_loadl_epi64((const __m128i *)(src + stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_64[2], s_64[3]);
+  s_64[2] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_64[3], s_64[2]);
+
+  ss_256[1] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_4tap_avx2(ss_256, coeffs);
+}
+
+// Vertical 4-tap helper for a 16-pixel-wide, 2-row step.
+//
+// |s_128[2]| is read as the older row (expected to be filled beforehand).
+// Sixteen bytes are loaded from |src + stride| into |s_128[3]| and sixteen
+// from |src + 2 * stride| into |s_128[2]|. The byte-interleaved row pairs
+// are written to |ss_256[1]| (unpacklo) and |ss_256[3]| (unpackhi);
+// |ss_256[0]| and |ss_256[2]| are not written here and are presumably
+// prepared by the caller. convolve_4tap_avx2() is run on |ss_256| and on
+// |ss_256 + 2| to produce r[0] and r[1].
+static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m256i coeffs[2],
+                                             __m128i s_128[4],
+                                             __m256i ss_256[4], __m256i r[2]) {
+  s_128[3] = _mm_loadu_si128((const __m128i *)(src + stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_128[2], s_128[3]);
+  s_128[2] = _mm_loadu_si128((const __m128i *)(src + 2 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_128[3], s_128[2]);
+
+  ss_256[1] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  ss_256[3] = _mm256_unpackhi_epi8(older_pair, newer_pair);
+  r[0] = convolve_4tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_4tap_avx2(&ss_256[2], coeffs);
+}
+
+// Vertical 6-tap helper for a 2-pixel-wide, 2-row step.
+//
+// |s_16[4]| is read as the older row (expected to be filled beforehand).
+// Two bytes are loaded from |src + 3 * stride| into |s_16[5]| and two from
+// |src + 4 * stride| into |s_16[4]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[2]|; |ss_128[0]| and |ss_128[1]| are not written
+// here and are presumably prepared by the caller. |ss_128| is then passed
+// to convolve_6tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[3],
+                                                __m128i s_16[6],
+                                                __m128i ss_128[3]) {
+  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
+  const __m128i older_pair = _mm_unpacklo_epi16(s_16[4], s_16[5]);
+  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi16(s_16[5], s_16[4]);
+
+  ss_128[2] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_6tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 4-tap helper for a 32-pixel-wide, 2-row step.
+//
+// |s_256[2]| is read as the older row (expected to be filled beforehand).
+// The row at |src + stride| is loaded into |s_256[3]| and interleaved with
+// |s_256[2]| into |ss_256[1]| / |ss_256[3]|. The row at |src + 2 * stride|
+// then replaces |s_256[2]| and is interleaved after |s_256[3]| into
+// |tt_256[1]| / |tt_256[3]|. Entries 0 and 2 of |ss_256| and |tt_256| are
+// not written here and are presumably prepared by the caller.
+// convolve_4tap_avx2() is run on each half of |ss_256| and |tt_256| to
+// produce r[0]..r[3].
+static INLINE void y_convolve_4tap_32x2_avx2(
+    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
+    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
+  const uint8_t *const row_a = src + 1 * stride;
+  const uint8_t *const row_b = src + 2 * stride;
+
+  s_256[3] = _mm256_loadu_si256((const __m256i *)row_a);
+  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+  s_256[2] = _mm256_loadu_si256((const __m256i *)row_b);
+  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
+  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
+
+  r[0] = convolve_4tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_4tap_avx2(&ss_256[2], coeffs);
+  r[2] = convolve_4tap_avx2(&tt_256[0], coeffs);
+  r[3] = convolve_4tap_avx2(&tt_256[2], coeffs);
+}
+
+// Vertical 6-tap helper for a 4-pixel-wide, 2-row step.
+//
+// |s_32[4]| is read as the older row (expected to be filled beforehand).
+// Four bytes are loaded from |src + 3 * stride| into |s_32[5]| and four from
+// |src + 4 * stride| into |s_32[4]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[2]|; |ss_128[0]| and |ss_128[1]| are not written
+// here and are presumably prepared by the caller. |ss_128| is then passed
+// to convolve_6tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[3],
+                                                __m128i s_32[6],
+                                                __m128i ss_128[3]) {
+  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
+  const __m128i older_pair = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi32(s_32[5], s_32[4]);
+
+  ss_128[2] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_6tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 6-tap helper for an 8-pixel-wide, 2-row step.
+//
+// |s_64[4]| is read as the older row (expected to be filled beforehand).
+// Eight bytes are loaded from |src + 3 * stride| into |s_64[5]| and eight
+// from |src + 4 * stride| into |s_64[4]|. The row pairs fill the lanes of
+// two 256-bit vectors, which are interleaved byte-wise into |ss_256[2]|;
+// |ss_256[0]| and |ss_256[1]| are not written here and are presumably
+// prepared by the caller. |ss_256| is then passed to convolve_6tap_avx2().
+static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[3],
+                                               __m128i s_64[6],
+                                               __m256i ss_256[3]) {
+  s_64[5] = _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_64[4], s_64[5]);
+  s_64[4] = _mm_loadl_epi64((const __m128i *)(src + 4 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_64[5], s_64[4]);
+
+  ss_256[2] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_6tap_avx2(ss_256, coeffs);
+}
+
+// Vertical 6-tap helper for a 16-pixel-wide, 2-row step.
+//
+// |s_128[4]| is read as the older row (expected to be filled beforehand).
+// Sixteen bytes are loaded from |src + 3 * stride| into |s_128[5]| and
+// sixteen from |src + 4 * stride| into |s_128[4]|. The byte-interleaved row
+// pairs are written to |ss_256[2]| (unpacklo) and |ss_256[5]| (unpackhi);
+// the other |ss_256| entries are not written here and are presumably
+// prepared by the caller. convolve_6tap_avx2() is run on |ss_256| and on
+// |ss_256 + 3| to produce r[0] and r[1].
+static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m256i coeffs[3],
+                                             __m128i s_128[6],
+                                             __m256i ss_256[6], __m256i r[2]) {
+  s_128[5] = _mm_loadu_si128((const __m128i *)(src + 3 * stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_128[4], s_128[5]);
+  s_128[4] = _mm_loadu_si128((const __m128i *)(src + 4 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_128[5], s_128[4]);
+
+  ss_256[2] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  ss_256[5] = _mm256_unpackhi_epi8(older_pair, newer_pair);
+  r[0] = convolve_6tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_6tap_avx2(&ss_256[3], coeffs);
+}
+
+// Vertical 6-tap helper for a 32-pixel-wide, 2-row step.
+//
+// |s_256[4]| is read as the older row (expected to be filled beforehand).
+// The row at |src + 3 * stride| is loaded into |s_256[5]| and interleaved
+// with |s_256[4]| into |ss_256[2]| / |ss_256[5]|. The row at
+// |src + 4 * stride| then replaces |s_256[4]| and is interleaved after
+// |s_256[5]| into |tt_256[2]| / |tt_256[5]|. The remaining entries of
+// |ss_256| and |tt_256| are not written here and are presumably prepared by
+// the caller. convolve_6tap_avx2() is run on each half of |ss_256| and
+// |tt_256| to produce r[0]..r[3].
+static INLINE void y_convolve_6tap_32x2_avx2(
+    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
+    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
+  const uint8_t *const row_a = src + 3 * stride;
+  const uint8_t *const row_b = src + 4 * stride;
+
+  s_256[5] = _mm256_loadu_si256((const __m256i *)row_a);
+  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
+  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
+  s_256[4] = _mm256_loadu_si256((const __m256i *)row_b);
+  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
+  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
+
+  r[0] = convolve_6tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_6tap_avx2(&ss_256[3], coeffs);
+  r[2] = convolve_6tap_avx2(&tt_256[0], coeffs);
+  r[3] = convolve_6tap_avx2(&tt_256[3], coeffs);
+}
+
+// Vertical 8-tap helper for a 2-pixel-wide, 2-row step.
+//
+// |s_16[6]| is read as the older row (expected to be filled beforehand).
+// Two bytes are loaded from |src + 7 * stride| into |s_16[7]| and two from
+// |src + 8 * stride| into |s_16[6]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[3]|; |ss_128[0..2]| are not written here and are
+// presumably prepared by the caller. |ss_128| is then passed to
+// convolve_8tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[4],
+                                                __m128i s_16[8],
+                                                __m128i ss_128[4]) {
+  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
+  const __m128i older_pair = _mm_unpacklo_epi16(s_16[6], s_16[7]);
+  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi16(s_16[7], s_16[6]);
+
+  ss_128[3] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_8tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 8-tap helper for a 4-pixel-wide, 2-row step.
+//
+// |s_32[6]| is read as the older row (expected to be filled beforehand).
+// Four bytes are loaded from |src + 7 * stride| into |s_32[7]| and four from
+// |src + 8 * stride| into |s_32[6]|. The resulting row pairs are interleaved
+// byte-wise into |ss_128[3]|; |ss_128[0..2]| are not written here and are
+// presumably prepared by the caller. |ss_128| is then passed to
+// convolve_8tap_ssse3() with |coeffs|.
+static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
+                                                const ptrdiff_t stride,
+                                                const __m128i coeffs[4],
+                                                __m128i s_32[8],
+                                                __m128i ss_128[4]) {
+  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
+  const __m128i older_pair = _mm_unpacklo_epi32(s_32[6], s_32[7]);
+  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
+  const __m128i newer_pair = _mm_unpacklo_epi32(s_32[7], s_32[6]);
+
+  ss_128[3] = _mm_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_8tap_ssse3(ss_128, coeffs);
+}
+
+// Vertical 8-tap helper for an 8-pixel-wide, 2-row step.
+//
+// |s_64[6]| is read as the older row (expected to be filled beforehand).
+// Eight bytes are loaded from |src + 7 * stride| into |s_64[7]| and eight
+// from |src + 8 * stride| into |s_64[6]|. The row pairs fill the lanes of
+// two 256-bit vectors, which are interleaved byte-wise into |ss_256[3]|;
+// |ss_256[0..2]| are not written here and are presumably prepared by the
+// caller. |ss_256| is then passed to convolve_8tap_avx2() with |coeffs|.
+static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
+                                               const ptrdiff_t stride,
+                                               const __m256i coeffs[4],
+                                               __m128i s_64[8],
+                                               __m256i ss_256[4]) {
+  s_64[7] = _mm_loadl_epi64((const __m128i *)(src + 7 * stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_64[6], s_64[7]);
+  s_64[6] = _mm_loadl_epi64((const __m128i *)(src + 8 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_64[7], s_64[6]);
+
+  ss_256[3] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  return convolve_8tap_avx2(ss_256, coeffs);
+}
+
+// Vertical 8-tap helper for a 16-pixel-wide, 2-row step.
+//
+// |s_128[6]| is read as the older row (expected to be filled beforehand).
+// Sixteen bytes are loaded from |src + 7 * stride| into |s_128[7]| and
+// sixteen from |src + 8 * stride| into |s_128[6]|. The byte-interleaved row
+// pairs are written to |ss_256[3]| (unpacklo) and |ss_256[7]| (unpackhi);
+// the other |ss_256| entries are not written here and are presumably
+// prepared by the caller. convolve_8tap_avx2() is run on |ss_256| and on
+// |ss_256 + 4| to produce r[0] and r[1].
+static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
+                                             const ptrdiff_t stride,
+                                             const __m256i coeffs[4],
+                                             __m128i s_128[8],
+                                             __m256i ss_256[8], __m256i r[2]) {
+  s_128[7] = _mm_loadu_si128((const __m128i *)(src + 7 * stride));
+  const __m256i older_pair = _mm256_setr_m128i(s_128[6], s_128[7]);
+  s_128[6] = _mm_loadu_si128((const __m128i *)(src + 8 * stride));
+  const __m256i newer_pair = _mm256_setr_m128i(s_128[7], s_128[6]);
+
+  ss_256[3] = _mm256_unpacklo_epi8(older_pair, newer_pair);
+  ss_256[7] = _mm256_unpackhi_epi8(older_pair, newer_pair);
+  r[0] = convolve_8tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_8tap_avx2(&ss_256[4], coeffs);
+}
+
+// Vertical 8-tap helper for a 32-pixel-wide, 2-row step.
+//
+// |s_256[6]| is read as the older row (expected to be filled beforehand).
+// The row at |src + 7 * stride| is loaded into |s_256[7]| and interleaved
+// with |s_256[6]| into |ss_256[3]| / |ss_256[7]|. The row at
+// |src + 8 * stride| then replaces |s_256[6]| and is interleaved after
+// |s_256[7]| into |tt_256[3]| / |tt_256[7]|. The remaining entries of
+// |ss_256| and |tt_256| are not written here and are presumably prepared by
+// the caller. convolve_8tap_avx2() is run on each half of |ss_256| and
+// |tt_256| to produce r[0]..r[3].
+static INLINE void y_convolve_8tap_32x2_avx2(
+    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
+  const uint8_t *const row_a = src + 7 * stride;
+  const uint8_t *const row_b = src + 8 * stride;
+
+  s_256[7] = _mm256_loadu_si256((const __m256i *)row_a);
+  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
+  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
+  s_256[6] = _mm256_loadu_si256((const __m256i *)row_b);
+  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
+  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
+
+  r[0] = convolve_8tap_avx2(&ss_256[0], coeffs);
+  r[1] = convolve_8tap_avx2(&ss_256[4], coeffs);
+  r[2] = convolve_8tap_avx2(&tt_256[0], coeffs);
+  r[3] = convolve_8tap_avx2(&tt_256[4], coeffs);
+}
+
+// Horizontal 2-tap step of the 2D path for 32 pixels of one row.
+//
+// The work is the same as x_convolve_2tap_32_avx2() defined earlier in this
+// file (load at |src| and |src + 1|, interleave, convolve_2tap_avx2() on the
+// unpacklo / unpackhi halves into r[0] / r[1]), so it simply forwards there.
+static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
+                                              const __m256i coeffs[1],
+                                              __m256i r[2]) {
+  x_convolve_2tap_32_avx2(src, coeffs, r);
+}
+
+// Runs xy_x_convolve_2tap_32_avx2() on |src|, rounds both result vectors
+// with xy_x_round_avx2() and stores them as 16-bit values at |dst| and
+// |dst + 16|.
+static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src,
+                                     const __m256i coeffs[1],
+                                     int16_t *const dst) {
+  __m256i sums[2];
+
+  xy_x_convolve_2tap_32_avx2(src, coeffs, sums);
+  const __m256i rounded0 = xy_x_round_avx2(sums[0]);
+  const __m256i rounded1 = xy_x_round_avx2(sums[1]);
+  _mm256_storeu_si256((__m256i *)&dst[0], rounded0);
+  _mm256_storeu_si256((__m256i *)&dst[16], rounded1);
+}
+
+// Runs x_convolve_4tap_32_avx2() on |src|, rounds both result vectors with
+// xy_x_round_avx2() and stores them as 16-bit values at |dst| and
+// |dst + 16|.
+static INLINE void xy_x_4tap_32_avx2(const uint8_t *const src,
+                                     const __m256i coeffs[2],
+                                     const __m256i filt[2],
+                                     int16_t *const dst) {
+  __m256i sums[2];
+
+  x_convolve_4tap_32_avx2(src, coeffs, filt, sums);
+  const __m256i rounded0 = xy_x_round_avx2(sums[0]);
+  const __m256i rounded1 = xy_x_round_avx2(sums[1]);
+  _mm256_storeu_si256((__m256i *)&dst[0], rounded0);
+  _mm256_storeu_si256((__m256i *)&dst[16], rounded1);
+}
+
+// Runs x_convolve_6tap_32_avx2() on |src|, rounds both result vectors with
+// xy_x_round_avx2() and stores them as 16-bit values at |dst| and
+// |dst + 16|.
+static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src,
+                                     const __m256i coeffs[3],
+                                     const __m256i filt[3],
+                                     int16_t *const dst) {
+  __m256i sums[2];
+
+  x_convolve_6tap_32_avx2(src, coeffs, filt, sums);
+  const __m256i rounded0 = xy_x_round_avx2(sums[0]);
+  const __m256i rounded1 = xy_x_round_avx2(sums[1]);
+  _mm256_storeu_si256((__m256i *)&dst[0], rounded0);
+  _mm256_storeu_si256((__m256i *)&dst[16], rounded1);
+}
+
+// Runs x_convolve_8tap_32_avx2() on |src|, rounds both result vectors with
+// xy_x_round_avx2() and stores them as 16-bit values at |dst| and
+// |dst + 16|.
+static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src,
+                                     const __m256i coeffs[4],
+                                     const __m256i filt[4],
+                                     int16_t *const dst) {
+  __m256i sums[2];
+
+  x_convolve_8tap_32_avx2(src, coeffs, filt, sums);
+  const __m256i rounded0 = xy_x_round_avx2(sums[0]);
+  const __m256i rounded1 = xy_x_round_avx2(sums[1]);
+  _mm256_storeu_si256((__m256i *)&dst[0], rounded0);
+  _mm256_storeu_si256((__m256i *)&dst[16], rounded1);
+}
+
+// Vertical 2-tap step of the 2D path for a 2-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_32[0]| is read as the first row (expected to be filled beforehand).
+// 32 bits (two int16 values) are loaded from |src + 2| into |s_32[1]| and
+// from |src + 4| into |s_32[0]|. The row pairs are interleaved 16 bits at a
+// time and passed to convolve16_2tap_sse2() with |coeffs|.
+static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
+                                                  __m128i s_32[2],
+                                                  const __m128i coeffs[1]) {
+  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
+  const __m128i rows01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
+  const __m128i rows12 = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+  const __m128i pairs = _mm_unpacklo_epi16(rows01, rows12);
+
+  return convolve16_2tap_sse2(&pairs, coeffs);
+}
+
+// Half-pel variant of the vertical 2-tap 2x2 step: instead of a weighted
+// convolution, the two row pairs are simply added lane-wise (16-bit).
+//
+// |s_32[0]| is read as the first row (expected to be filled beforehand).
+// 32 bits are loaded from |src + 2| into |s_32[1]| and from |src + 4| into
+// |s_32[0]|; the sum of (old s_32[0], s_32[1]) and (s_32[1], new s_32[0])
+// is returned.
+static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
+    const int16_t *const src, __m128i s_32[2]) {
+  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
+  const __m128i rows01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
+  const __m128i rows12 = _mm_unpacklo_epi32(s_32[1], s_32[0]);
+
+  return _mm_add_epi16(rows01, rows12);
+}
+
+// Vertical 2-tap step of the 2D path for a 4-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_64[0]| is read as the first row (expected to be filled beforehand).
+// 64 bits (four int16 values) are loaded from |src + 4| into |s_64[1]| and
+// from |src + 8| into |s_64[0]|. The row pairs are interleaved 16 bits at a
+// time (unpacklo / unpackhi) and each half is passed to
+// convolve16_2tap_sse2(), producing r[0] and r[1].
+static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
+                                               __m128i s_64[2],
+                                               const __m128i coeffs[1],
+                                               __m128i r[2]) {
+  s_64[1] = _mm_loadl_epi64((const __m128i *)(src + 4));
+  const __m128i rows01 = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+  s_64[0] = _mm_loadl_epi64((const __m128i *)(src + 2 * 4));
+  const __m128i rows12 = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+  const __m128i pairs_lo = _mm_unpacklo_epi16(rows01, rows12);
+  const __m128i pairs_hi = _mm_unpackhi_epi16(rows01, rows12);
+
+  r[0] = convolve16_2tap_sse2(&pairs_lo, coeffs);
+  r[1] = convolve16_2tap_sse2(&pairs_hi, coeffs);
+}
+
+// Half-pel variant of the vertical 2-tap 4x2 step: the two row pairs are
+// added lane-wise (16-bit) instead of being convolved with coefficients.
+//
+// |s_64[0]| is read as the first row (expected to be filled beforehand).
+// 64 bits are loaded from |src + 4| into |s_64[1]| and from |src + 8| into
+// |s_64[0]|; the sum of (old s_64[0], s_64[1]) and (s_64[1], new s_64[0])
+// is returned.
+static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
+    const int16_t *const src, __m128i s_64[2]) {
+  s_64[1] = _mm_loadl_epi64((const __m128i *)(src + 4));
+  const __m128i rows01 = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+  s_64[0] = _mm_loadl_epi64((const __m128i *)(src + 2 * 4));
+  const __m128i rows12 = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+
+  return _mm_add_epi16(rows01, rows12);
+}
+
+// Interleaves |s0| and |s1| 16 bits at a time and runs
+// convolve16_2tap_avx2() on the unpacklo half (-> r[0]) and the unpackhi
+// half (-> r[1]).
+static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0,
+                                              const __m256i s1,
+                                              const __m256i coeffs[1],
+                                              __m256i r[2]) {
+  const __m256i pairs_lo = _mm256_unpacklo_epi16(s0, s1);
+  const __m256i pairs_hi = _mm256_unpackhi_epi16(s0, s1);
+
+  r[0] = convolve16_2tap_avx2(&pairs_lo, coeffs);
+  r[1] = convolve16_2tap_avx2(&pairs_hi, coeffs);
+}
+
+// Vertical 2-tap step of the 2D path for an 8-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_128[0]| is read as the first row (expected to be filled beforehand).
+// 128 bits (eight int16 values) are loaded from |src + 8| into |s_128[1]|
+// and from |src + 16| into |s_128[0]|. The row pairs fill the lanes of two
+// 256-bit vectors that are handed to xy_y_convolve_2tap_16_avx2().
+static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
+                                               __m128i s_128[2],
+                                               const __m256i coeffs[1],
+                                               __m256i r[2]) {
+  s_128[1] = _mm_loadu_si128((const __m128i *)(src + 8));
+  const __m256i rows01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+  s_128[0] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
+  const __m256i rows12 = _mm256_setr_m128i(s_128[1], s_128[0]);
+
+  xy_y_convolve_2tap_16_avx2(rows01, rows12, coeffs, r);
+}
+
+// Half-pel variant of the vertical 2-tap 8x2 step: the two row pairs are
+// added lane-wise (16-bit) instead of being convolved with coefficients.
+//
+// |s_128[0]| is read as the first row (expected to be filled beforehand).
+// 128 bits are loaded from |src + 8| into |s_128[1]| and from |src + 16|
+// into |s_128[0]|; the sum of the two lane-packed row pairs is returned.
+static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
+    const int16_t *const src, __m128i s_128[2]) {
+  s_128[1] = _mm_loadu_si128((const __m128i *)(src + 8));
+  const __m256i rows01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+  s_128[0] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
+  const __m256i rows12 = _mm256_setr_m128i(s_128[1], s_128[0]);
+
+  return _mm256_add_epi16(rows01, rows12);
+}
+
+// Half-pel variant of the vertical 2-tap 16x2 step.
+//
+// |s_256[0]| is read as the first row (expected to be filled beforehand).
+// The row at |src + 16| is loaded into |s_256[1]| and added to |s_256[0]|
+// for r[0]; the row at |src + 32| then replaces |s_256[0]| and is added to
+// |s_256[1]| for r[1]. All additions are lane-wise on 16-bit values.
+static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2(
+    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
+  const int16_t *const row1 = src + 16;
+  const int16_t *const row2 = src + 2 * 16;
+
+  s_256[1] = _mm256_loadu_si256((const __m256i *)row1);
+  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
+  s_256[0] = _mm256_loadu_si256((const __m256i *)row2);
+  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
+}
+
+// Packs the 16-bit lanes of r[0] and r[1] to unsigned bytes with
+// saturation, reorders the four 64-bit quarters with selector 0xD8 (order
+// 0, 2, 1, 3) and hands the result to storeu_u8_16x2_avx2() with |dst| and
+// |stride|.
+static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
+                                        const ptrdiff_t stride) {
+  const __m256i packed = _mm256_packus_epi16(r[0], r[1]);
+  const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xD8);
+
+  storeu_u8_16x2_avx2(ordered, dst, stride);
+}
+
+// Vertical 2-tap step of the 2D path for a 16-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s[0]| is read as the first row (expected to be filled beforehand). The
+// row at |src + 16| is loaded into |s[1]| and convolved against |s[0]| into
+// r[0..1]; the row at |src + 32| then replaces |s[0]| and is convolved
+// against |s[1]| into r[2..3].
+static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
+                                                __m256i s[2],
+                                                const __m256i coeffs[1],
+                                                __m256i r[4]) {
+  const int16_t *const row1 = src + 16;
+  const int16_t *const row2 = src + 2 * 16;
+
+  s[1] = _mm256_loadu_si256((const __m256i *)row1);
+  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, &r[0]);
+  s[0] = _mm256_loadu_si256((const __m256i *)row2);
+  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, &r[2]);
+}
+
+// Vertical 2-tap step of the 2D path for 32 values of one output row.
+//
+// Loads 2 x 16 int16 values from |src| and |src + 16| into |s1[0]| and
+// |s1[1]|, then convolves each against the matching entry of |s0| with
+// xy_y_convolve_2tap_16_avx2(), filling r[0..1] and r[2..3].
+static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
+                                              const __m256i s0[2],
+                                              __m256i s1[2],
+                                              const __m256i coeffs[1],
+                                              __m256i r[4]) {
+  s1[0] = _mm256_loadu_si256((const __m256i *)&src[0]);
+  s1[1] = _mm256_loadu_si256((const __m256i *)&src[16]);
+
+  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, &r[0]);
+  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, &r[2]);
+}
+
+// Runs xy_y_convolve_2tap_32_avx2() and passes the two result pairs
+// (r[0..1] and r[2..3]) to xy_y_round_store_32_avx2() for storing at |dst|.
+static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
+                                                  const __m256i s0[2],
+                                                  __m256i s1[2],
+                                                  const __m256i coeffs[1],
+                                                  uint8_t *const dst) {
+  __m256i sums[4];
+
+  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, sums);
+  xy_y_round_store_32_avx2(&sums[0], &sums[2], dst);
+}
+
+// Half-pel variant of the vertical 2-tap 32-wide step.
+//
+// Loads 2 x 16 int16 values from |src| and |src + 16| into |s1[0]| and
+// |s1[1]| and adds each lane-wise (16-bit) to the matching entry of |s0|,
+// writing r[0] and r[1].
+static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
+                                                       const __m256i s0[2],
+                                                       __m256i s1[2],
+                                                       __m256i r[2]) {
+  s1[0] = _mm256_loadu_si256((const __m256i *)&src[0]);
+  s1[1] = _mm256_loadu_si256((const __m256i *)&src[16]);
+
+  r[0] = _mm256_add_epi16(s0[0], s1[0]);
+  r[1] = _mm256_add_epi16(s0[1], s1[1]);
+}
+
+// Runs xy_y_convolve_2tap_half_pel_32_avx2(), rounds both sums with
+// xy_y_round_half_pel_avx2() and passes them to xy_y_pack_store_32_avx2()
+// for storing at |dst|.
+static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2(
+    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
+    uint8_t *const dst) {
+  __m256i sums[2];
+
+  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, sums);
+  const __m256i rounded0 = xy_y_round_half_pel_avx2(sums[0]);
+  const __m256i rounded1 = xy_y_round_half_pel_avx2(sums[1]);
+  xy_y_pack_store_32_avx2(rounded0, rounded1, dst);
+}
+
+// Vertical 4-tap step of the 2D path for a 2-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_32[2]| is read as the older row (expected to be filled beforehand).
+// 32 bits (two int16 values) are loaded from |src + 3 * 2| into |s_32[3]|
+// and from |src + 4 * 2| into |s_32[2]|. The row pairs are interleaved
+// 16 bits at a time into |ss_128[1]|, |ss_128| is passed to
+// convolve16_4tap_sse2(), and afterwards |ss_128[1]| is copied down to
+// |ss_128[0]| so the window is ready for the next call.
+//
+// The 32-bit reads go through loadu_int32() instead of dereferencing
+// (int32_t *)src: |src| points at int16_t data, so it is only guaranteed
+// 2-byte alignment and reading it through an int32_t lvalue is undefined
+// behaviour. This also matches xy_y_convolve_2tap_2x2_sse2().
+static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
+                                                  __m128i s_32[4],
+                                                  __m128i ss_128[2],
+                                                  const __m128i coeffs[2]) {
+  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
+  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
+  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
+  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
+  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
+  ss_128[0] = ss_128[1];  // Slide the window for the next two rows.
+  return r;
+}
+
+// Vertical 4-tap step of the 2D path for a 4-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_64[2]| is read as the older row (expected to be filled beforehand).
+// 64 bits are loaded from |src + 3 * 4| into |s_64[3]| and from
+// |src + 4 * 4| into |s_64[2]|. The lane-packed row pairs are interleaved
+// 16 bits at a time into |ss_256[1]|, |ss_256| is passed to
+// convolve16_4tap_avx2(), and afterwards |ss_256[1]| is copied down to
+// |ss_256[0]| for the next call.
+static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
+                                                  __m128i s_64[4],
+                                                  __m256i ss_256[2],
+                                                  const __m256i coeffs[2]) {
+  s_64[3] = _mm_loadl_epi64((const __m128i *)(src + 3 * 4));
+  const __m256i older_pair = _mm256_setr_m128i(s_64[2], s_64[3]);
+  s_64[2] = _mm_loadl_epi64((const __m128i *)(src + 4 * 4));
+  const __m256i newer_pair = _mm256_setr_m128i(s_64[3], s_64[2]);
+
+  ss_256[1] = _mm256_unpacklo_epi16(older_pair, newer_pair);
+  const __m256i result = convolve16_4tap_avx2(ss_256, coeffs);
+  ss_256[0] = ss_256[1];  // Slide the window.
+  return result;
+}
+
+// Runs convolve16_4tap_avx2() on ss[0..1] (-> r[0]) and on ss[2..3]
+// (-> r[1]).
+static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
+                                              const __m256i coeffs[2],
+                                              __m256i r[2]) {
+  r[0] = convolve16_4tap_avx2(&ss[0], coeffs);
+  r[1] = convolve16_4tap_avx2(&ss[2], coeffs);
+}
+
+// Vertical 4-tap step of the 2D path for an 8-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// Loads 16 int16 values from |src + 2 * 8| and from |src + 3 * 8|,
+// interleaves them 16 bits at a time into |ss_256[1]| (unpacklo) and
+// |ss_256[3]| (unpackhi), runs xy_y_convolve_4tap_16_avx2() on |ss_256| and
+// finally copies entries 1 and 3 down to 0 and 2 for the next call.
+// Entries 0 and 2 are presumably prepared by the caller on first use.
+static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
+                                               __m256i ss_256[4],
+                                               const __m256i coeffs[2],
+                                               __m256i r[2]) {
+  const __m256i rows_a = _mm256_loadu_si256((const __m256i *)(src + 2 * 8));
+  const __m256i rows_b = _mm256_loadu_si256((const __m256i *)(src + 3 * 8));
+
+  ss_256[1] = _mm256_unpacklo_epi16(rows_a, rows_b);
+  ss_256[3] = _mm256_unpackhi_epi16(rows_a, rows_b);
+  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
+
+  // Slide the window.
+  ss_256[0] = ss_256[1];
+  ss_256[2] = ss_256[3];
+}
+
+// Half-pel variant of the vertical 4-tap 8x2 step.
+//
+// Loads 16 int16 values from |src + 2 * 8| into |s_256[2]| and from
+// |src + 3 * 8| into |s_256[3]|. The outer pair (s_256[0] + s_256[3]) and
+// the inner pair (s_256[1] + s_256[2]) are summed lane-wise and passed to
+// xy_y_convolve_2tap_16_avx2() with the single-entry |coeffs|. Finally
+// entries 2 and 3 are copied down to 0 and 1 for the next call.
+static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2(
+    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
+    __m256i r[2]) {
+  s_256[2] = _mm256_loadu_si256((const __m256i *)(src + 2 * 8));
+  s_256[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * 8));
+
+  const __m256i outer_sum = _mm256_add_epi16(s_256[0], s_256[3]);
+  const __m256i inner_sum = _mm256_add_epi16(s_256[1], s_256[2]);
+  xy_y_convolve_2tap_16_avx2(outer_sum, inner_sum, coeffs, r);
+
+  // Slide the window.
+  s_256[0] = s_256[2];
+  s_256[1] = s_256[3];
+}
+
+// Vertical 4-tap step of the 2D path for a 16-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_256[2]| is read as the older row (expected to be filled beforehand).
+// The row at |src + 3 * 16| is loaded into |s_256[3]| and interleaved with
+// |s_256[2]| into |ss_256[1]| / |ss_256[3]|. The row at |src + 4 * 16| then
+// replaces |s_256[2]| and is interleaved after |s_256[3]| into
+// |tt_256[1]| / |tt_256[3]|. xy_y_convolve_4tap_16_avx2() is run on
+// |ss_256| (-> r[0..1]) and |tt_256| (-> r[2..3]); afterwards entries 1 and
+// 3 of both arrays are copied down to 0 and 2 for the next call.
+static INLINE void xy_y_convolve_4tap_16x2_avx2(
+    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
+    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
+  const int16_t *const row_a = src + 3 * 16;
+  const int16_t *const row_b = src + 4 * 16;
+
+  s_256[3] = _mm256_loadu_si256((const __m256i *)row_a);
+  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+  s_256[2] = _mm256_loadu_si256((const __m256i *)row_b);
+  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
+  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
+
+  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, &r[0]);
+  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, &r[2]);
+
+  // Slide both windows.
+  ss_256[0] = ss_256[1];
+  ss_256[2] = ss_256[3];
+  tt_256[0] = tt_256[1];
+  tt_256[2] = tt_256[3];
+}
+
+// Vertical 4-tap step of the 2D path for one 16-value column slice of a
+// wider block, with an explicit row pitch.
+//
+// Same structure as xy_y_convolve_4tap_16x2_avx2(), except that the two new
+// rows are read from |src + 3 * stride| and |src + 4 * stride| (|stride| in
+// int16 elements) instead of a fixed pitch of 16. |s_256[2]| is read as the
+// older row; results go to r[0..1] (from |ss_256|) and r[2..3] (from
+// |tt_256|); entries 1 and 3 of both arrays are then copied down to 0 and 2.
+static INLINE void xy_y_convolve_4tap_32x2_avx2(
+    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
+    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
+    __m256i r[4]) {
+  const int16_t *const row_a = src + 3 * stride;
+  const int16_t *const row_b = src + 4 * stride;
+
+  s_256[3] = _mm256_loadu_si256((const __m256i *)row_a);
+  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+  s_256[2] = _mm256_loadu_si256((const __m256i *)row_b);
+  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
+  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
+
+  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, &r[0]);
+  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, &r[2]);
+
+  // Slide both windows.
+  ss_256[0] = ss_256[1];
+  ss_256[2] = ss_256[3];
+  tt_256[0] = tt_256[1];
+  tt_256[2] = tt_256[3];
+}
+
+// Half-pel variant of the vertical 4-tap 16x2 step. (The identifier has no
+// underscore before "avx2"; it is kept as-is because callers use it.)
+//
+// Loads the rows at |src + 3 * 16| and |src + 4 * 16| into |s_256[3]| and
+// |s_256[4]|. For the first output row the outer pair (s[0] + s[3]) and
+// inner pair (s[1] + s[2]) are summed lane-wise and passed to
+// xy_y_convolve_2tap_16_avx2() into r[0..1]; for the second output row the
+// same is done with (s[1] + s[4]) and (s[2] + s[3]) into r[2..3]. Finally
+// entries 2, 3, 4 are copied down to 0, 1, 2 for the next call.
+static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2(
+    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
+    __m256i r[4]) {
+  s_256[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * 16));
+  s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * 16));
+
+  const __m256i outer0 = _mm256_add_epi16(s_256[0], s_256[3]);
+  const __m256i inner0 = _mm256_add_epi16(s_256[1], s_256[2]);
+  xy_y_convolve_2tap_16_avx2(outer0, inner0, coeffs, &r[0]);
+
+  const __m256i outer1 = _mm256_add_epi16(s_256[1], s_256[4]);
+  const __m256i inner1 = _mm256_add_epi16(s_256[2], s_256[3]);
+  xy_y_convolve_2tap_16_avx2(outer1, inner1, coeffs, &r[2]);
+
+  // Slide the window by two rows.
+  s_256[0] = s_256[2];
+  s_256[1] = s_256[3];
+  s_256[2] = s_256[4];
+}
+
+// Vertical 6-tap step of the 2D path for a 2-wide, 2-row step on 16-bit
+// intermediate data.
+//
+// |s_32[4]| is read as the older row (expected to be filled beforehand).
+// 32 bits (two int16 values) are loaded from |src + 5 * 2| into |s_32[5]|
+// and from |src + 6 * 2| into |s_32[4]|. The row pairs are interleaved
+// 16 bits at a time into |ss_128[2]|, |ss_128| is passed to
+// convolve16_6tap_sse2(), and afterwards entries 1 and 2 are copied down to
+// 0 and 1 so the window is ready for the next call.
+//
+// The 32-bit reads go through loadu_int32() instead of dereferencing
+// (int32_t *)src: |src| points at int16_t data, so it is only guaranteed
+// 2-byte alignment and reading it through an int32_t lvalue is undefined
+// behaviour. This also matches xy_y_convolve_2tap_2x2_sse2().
+static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
+                                                  __m128i s_32[6],
+                                                  __m128i ss_128[3],
+                                                  const __m128i coeffs[3]) {
+  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
+  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
+  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
+  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
+  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
+  // Slide the window for the next two rows.
+  ss_128[0] = ss_128[1];
+  ss_128[1] = ss_128[2];
+  return r;
+}
+
+// 6-tap filtering step on 64-bit (four int16) source units, two units per
+// call.
+//
+// s_64[4] and ss_256[0..1] are expected to carry state from the previous
+// call. The units at src + 5 * 4 and src + 6 * 4 are loaded; (previous, first)
+// and (first, second) are each packed into the two 128-bit halves of a
+// 256-bit vector, interleaved into ss_256[2], and ss_256[0..2] is filtered by
+// convolve16_6tap_avx2(). The interleaved history is shifted down by one slot
+// and s_64[4] ends up holding the unit from src + 6 * 4.
+//
+// Returns the vector produced by convolve16_6tap_avx2().
+static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
+                                                  __m128i s_64[6],
+                                                  __m256i ss_256[3],
+                                                  const __m256i coeffs[3]) {
+  const __m128i prev = s_64[4];
+  const __m128i cur = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
+  const __m128i next = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
+
+  s_64[5] = cur;
+  s_64[4] = next;
+
+  const __m256i pair_a = _mm256_setr_m128i(prev, cur);
+  const __m256i pair_b = _mm256_setr_m128i(cur, next);
+  ss_256[2] = _mm256_unpacklo_epi16(pair_a, pair_b);
+
+  const __m256i filtered = convolve16_6tap_avx2(ss_256, coeffs);
+
+  ss_256[0] = ss_256[1];
+  ss_256[1] = ss_256[2];
+  return filtered;
+}
+
+// Runs convolve16_6tap_avx2() on the two 3-vector groups stored back to back
+// in ss: ss[0..2] produces r[0] and ss[3..5] produces r[1].
+static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
+                                              const __m256i coeffs[3],
+                                              __m256i r[2]) {
+  const __m256i *const first_group = ss;
+  const __m256i *const second_group = ss + 3;
+
+  r[0] = convolve16_6tap_avx2(first_group, coeffs);
+  r[1] = convolve16_6tap_avx2(second_group, coeffs);
+}
+
+// 6-tap filtering step that consumes the 16-lane int16 vectors at src + 4 * 8
+// and src + 5 * 8.
+//
+// The two vectors are interleaved into ss_256[2] (low halves) and ss_256[5]
+// (high halves); ss_256[0..1] and ss_256[3..4] are expected to hold the
+// history from the previous call. xy_y_convolve_6tap_16_avx2() writes the
+// result to r[0..1], after which both histories are shifted down by one slot.
+static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
+                                               __m256i ss_256[6],
+                                               const __m256i coeffs[3],
+                                               __m256i r[2]) {
+  const __m256i first = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
+  const __m256i second = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
+
+  ss_256[2] = _mm256_unpacklo_epi16(first, second);
+  ss_256[5] = _mm256_unpackhi_epi16(first, second);
+
+  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
+
+  // Slide the low (0..2) and high (3..5) histories by one entry each.
+  int i;
+  for (i = 0; i < 2; ++i) {
+    ss_256[i] = ss_256[i + 1];
+    ss_256[i + 3] = ss_256[i + 4];
+  }
+}
+
+// 6-tap step with folded outer taps, evaluated through the 4-tap helper.
+//
+// s_256[0..3] are expected to hold the four previously loaded vectors; the
+// vectors at src + 4 * 8 and src + 5 * 8 are loaded into s_256[4] and
+// s_256[5]. s_256[0] + s_256[5] and s_256[1] + s_256[4] are interleaved as one
+// tap pair, s_256[2] and s_256[3] as the other, and the result of
+// xy_y_convolve_4tap_16_avx2() is written to r[0..1]. (Folding presumably
+// relies on a symmetric half-pel kernel - confirm against the coefficient
+// setup.) On return s_256[0..3] hold the four most recent vectors.
+static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2(
+    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
+    __m256i r[2]) {
+  __m256i folded[2], taps[4];
+
+  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
+  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
+
+  folded[0] = _mm256_add_epi16(s_256[0], s_256[5]);
+  folded[1] = _mm256_add_epi16(s_256[1], s_256[4]);
+
+  // taps[0..1] carry the low halves, taps[2..3] the high halves.
+  taps[0] = _mm256_unpacklo_epi16(folded[0], folded[1]);
+  taps[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+  taps[2] = _mm256_unpackhi_epi16(folded[0], folded[1]);
+  taps[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+  xy_y_convolve_4tap_16_avx2(taps, coeffs, r);
+
+  // Drop the two oldest vectors.
+  int i;
+  for (i = 0; i < 4; ++i) s_256[i] = s_256[i + 2];
+}
+
+// Advances the 6-tap sliding window by two source vectors and filters both
+// window positions.
+//
+// s_256[4], ss_256[0..1]/[3..4] and tt_256[0..1]/[3..4] are expected to carry
+// state from the previous call. The vectors at src + 5 * stride and
+// src + 6 * stride are loaded and interleaved with their predecessor into
+// ss_256[2]/[5] and tt_256[2]/[5]. ss_256 is filtered into r[0..1] and tt_256
+// into r[2..3] by xy_y_convolve_6tap_16_avx2(); both interleaved histories
+// are then shifted down by one slot. s_256[4] is left holding the vector
+// from src + 6 * stride.
+static INLINE void xy_y_convolve_6tap_16x2_avx2(
+    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
+    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
+    __m256i r[4]) {
+  const __m256i prev = s_256[4];
+  const __m256i cur = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+  const __m256i next = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+
+  s_256[5] = cur;
+  s_256[4] = next;
+
+  ss_256[2] = _mm256_unpacklo_epi16(prev, cur);
+  ss_256[5] = _mm256_unpackhi_epi16(prev, cur);
+  tt_256[2] = _mm256_unpacklo_epi16(cur, next);
+  tt_256[5] = _mm256_unpackhi_epi16(cur, next);
+
+  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
+  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
+
+  // Slide the low (0..2) and high (3..5) halves of each history.
+  int i;
+  for (i = 0; i < 2; ++i) {
+    ss_256[i] = ss_256[i + 1];
+    ss_256[i + 3] = ss_256[i + 4];
+  }
+  for (i = 0; i < 2; ++i) {
+    tt_256[i] = tt_256[i + 1];
+    tt_256[i + 3] = tt_256[i + 4];
+  }
+}
+
+// 6-tap filtering of two consecutive window positions with folded outer taps,
+// evaluated through xy_y_convolve_4tap_16_avx2().
+//
+// On entry s_256[0..4] are expected to hold five previously loaded vectors
+// (call them v0..v4). The function loads v5 from src + 5 * stride and v6 from
+// src + 6 * stride, writes the first result to r[0..1] and the second to
+// r[2..3], and leaves s_256[0..4] = v2..v6 for the next call. ss_256 is used
+// as scratch for the interleaved operands.
+//
+// NOTE(review): folding v0+v5 / v1+v4 presumably relies on a symmetric
+// half-pel kernel - confirm against the coefficient setup.
+static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2(
+ const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
+ __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
+ __m256i a_256[2];
+
+ // First output: window v0..v5, folded as (v0+v5, v1+v4) plus (v2, v3).
+ s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+
+ // Second output: window v1..v6. The history rotation is interleaved with
+ // the sums, so each statement below depends on the ones before it.
+ // a_256[1] = v2 + v5 (must be read before s_256[2] is overwritten).
+ a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
+ s_256[0] = s_256[2];
+ s_256[2] = s_256[4];
+ s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
+ // a_256[0] = v1 + v6 (s_256[1] still holds v1 here).
+ a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
+ s_256[1] = s_256[3];
+ s_256[3] = s_256[5];
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ // After the rotation s_256[1] = v3 and s_256[2] = v4: the middle taps.
+ ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
+}
+
+// 8-tap filtering step on 32-bit (two int16) source units, two units per
+// call.
+//
+// s_32[6] and ss_128[0..2] are expected to carry state from the previous call
+// (or the caller's priming code). The 32-bit units at src + 7 * 2 and
+// src + 8 * 2 are loaded, paired with their predecessor, interleaved into
+// ss_128[3], and ss_128[0..3] is filtered by convolve16_8tap_sse2(). The
+// interleaved history is then shifted down by one slot and s_32[6] is left
+// holding the unit loaded from src + 8 * 2.
+//
+// The 32-bit loads go through loadu_int32() instead of dereferencing an
+// int32_t* cast of the int16_t buffer: the cast read accessed the data through
+// an incompatible lvalue type, assumed 4-byte alignment, and discarded const.
+//
+// Returns the vector produced by convolve16_8tap_sse2().
+static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
+                                                  __m128i s_32[8],
+                                                  __m128i ss_128[4],
+                                                  const __m128i coeffs[4]) {
+  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
+  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
+  // s_32[6] is reused for the newest unit so it is in place for the next call.
+  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
+  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
+  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
+  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
+  // Slide the interleaved history for the next call.
+  ss_128[0] = ss_128[1];
+  ss_128[1] = ss_128[2];
+  ss_128[2] = ss_128[3];
+  return r;
+}
+
+// 8-tap filtering step on 64-bit (four int16) source units, two units per
+// call.
+//
+// s_64[6] and ss_256[0..2] are expected to carry state from the previous
+// call. The units at src + 7 * 4 and src + 8 * 4 are loaded; (previous, first)
+// and (first, second) are each packed into the two 128-bit halves of a
+// 256-bit vector, interleaved into ss_256[3], and ss_256[0..3] is filtered by
+// convolve16_8tap_avx2(). The interleaved history is shifted down by one slot
+// and s_64[6] ends up holding the unit from src + 8 * 4.
+//
+// Returns the vector produced by convolve16_8tap_avx2().
+static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
+                                                  __m128i s_64[8],
+                                                  __m256i ss_256[4],
+                                                  const __m256i coeffs[4]) {
+  const __m128i prev = s_64[6];
+  const __m128i cur = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
+  const __m128i next = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
+
+  s_64[7] = cur;
+  s_64[6] = next;
+
+  const __m256i pair_a = _mm256_setr_m128i(prev, cur);
+  const __m256i pair_b = _mm256_setr_m128i(cur, next);
+  ss_256[3] = _mm256_unpacklo_epi16(pair_a, pair_b);
+
+  const __m256i filtered = convolve16_8tap_avx2(ss_256, coeffs);
+
+  int i;
+  for (i = 0; i < 3; ++i) ss_256[i] = ss_256[i + 1];
+  return filtered;
+}
+
+// Runs convolve16_8tap_avx2() on the two 4-vector groups stored back to back
+// in ss: ss[0..3] produces r[0] and ss[4..7] produces r[1].
+static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
+                                              const __m256i coeffs[4],
+                                              __m256i r[2]) {
+  const __m256i *const first_group = ss;
+  const __m256i *const second_group = ss + 4;
+
+  r[0] = convolve16_8tap_avx2(first_group, coeffs);
+  r[1] = convolve16_8tap_avx2(second_group, coeffs);
+}
+
+// 8-tap filtering step that consumes the 16-lane int16 vectors at src + 6 * 8
+// and src + 7 * 8.
+//
+// The two vectors are interleaved into ss_256[3] (low halves) and ss_256[7]
+// (high halves); ss_256[0..2] and ss_256[4..6] are expected to hold the
+// history from the previous call. xy_y_convolve_8tap_16_avx2() writes the
+// result to r[0..1], after which both histories are shifted down by one slot.
+static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
+                                               __m256i ss_256[8],
+                                               const __m256i coeffs[4],
+                                               __m256i r[2]) {
+  const __m256i first = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
+  const __m256i second = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
+
+  ss_256[3] = _mm256_unpacklo_epi16(first, second);
+  ss_256[7] = _mm256_unpackhi_epi16(first, second);
+
+  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
+
+  // Slide the low (0..3) and high (4..7) histories by one entry each.
+  int i;
+  for (i = 0; i < 3; ++i) {
+    ss_256[i] = ss_256[i + 1];
+    ss_256[i + 4] = ss_256[i + 5];
+  }
+}
+
+// 8-tap step with all four tap pairs folded, evaluated through the 4-tap
+// helper.
+//
+// s_256[0..5] are expected to hold the six previously loaded vectors; the
+// vectors at src + 6 * 8 and src + 7 * 8 are loaded into s_256[6] and
+// s_256[7]. The sums s_256[k] + s_256[7 - k] for k = 0..3 are interleaved
+// pairwise and passed to xy_y_convolve_4tap_16_avx2(), which writes r[0..1].
+// (Folding presumably relies on a symmetric half-pel kernel - confirm against
+// the coefficient setup.) On return s_256[0..5] hold the six most recent
+// vectors.
+static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2(
+    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
+    __m256i r[2]) {
+  __m256i folded[4], taps[4];
+  int i;
+
+  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
+  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
+
+  // Fold mirrored positions: 0+7, 1+6, 2+5, 3+4.
+  for (i = 0; i < 4; ++i) {
+    folded[i] = _mm256_add_epi16(s_256[i], s_256[7 - i]);
+  }
+
+  // taps[0..1] carry the low halves, taps[2..3] the high halves.
+  taps[0] = _mm256_unpacklo_epi16(folded[0], folded[1]);
+  taps[1] = _mm256_unpacklo_epi16(folded[2], folded[3]);
+  taps[2] = _mm256_unpackhi_epi16(folded[0], folded[1]);
+  taps[3] = _mm256_unpackhi_epi16(folded[2], folded[3]);
+  xy_y_convolve_4tap_16_avx2(taps, coeffs, r);
+
+  // Drop the two oldest vectors.
+  for (i = 0; i < 6; ++i) s_256[i] = s_256[i + 2];
+}
+
+// Advances the 8-tap sliding window by two source vectors and filters both
+// window positions.
+//
+// s_256[6], ss_256[0..2]/[4..6] and tt_256[0..2]/[4..6] are expected to carry
+// state from the previous call. The vectors at src + 7 * stride and
+// src + 8 * stride are loaded and interleaved with their predecessor into
+// ss_256[3]/[7] and tt_256[3]/[7]. ss_256 is filtered into r[0..1] and tt_256
+// into r[2..3] by xy_y_convolve_8tap_16_avx2(); both interleaved histories
+// are then shifted down by one slot. s_256[6] is left holding the vector
+// from src + 8 * stride.
+static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
+    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
+  const __m256i prev = s_256[6];
+  const __m256i cur = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+  const __m256i next = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
+
+  s_256[7] = cur;
+  s_256[6] = next;
+
+  ss_256[3] = _mm256_unpacklo_epi16(prev, cur);
+  ss_256[7] = _mm256_unpackhi_epi16(prev, cur);
+  tt_256[3] = _mm256_unpacklo_epi16(cur, next);
+  tt_256[7] = _mm256_unpackhi_epi16(cur, next);
+
+  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
+  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
+
+  // Slide the low (0..3) and high (4..7) halves of each history.
+  int i;
+  for (i = 0; i < 3; ++i) {
+    ss_256[i] = ss_256[i + 1];
+    ss_256[i + 4] = ss_256[i + 5];
+  }
+  for (i = 0; i < 3; ++i) {
+    tt_256[i] = tt_256[i + 1];
+    tt_256[i + 4] = tt_256[i + 5];
+  }
+}
+
+// 8-tap filtering of two consecutive window positions with all tap pairs
+// folded, evaluated through xy_y_convolve_4tap_16_avx2().
+//
+// On entry s_256[0..6] are expected to hold seven previously loaded vectors
+// (call them v0..v6). The function loads v7 from src + 7 * stride and v8 from
+// src + 8 * stride, writes the first result to r[0..1] and the second to
+// r[2..3], and leaves s_256[0..6] = v2..v8 for the next call.
+//
+// NOTE(review): coeffs is declared with 4 elements but is only forwarded to
+// the 4-tap helper, which the 6-tap half-pel sibling feeds from a 2-element
+// array - confirm the intended size. Folding mirrored vectors presumably
+// relies on a symmetric half-pel kernel - confirm against the coefficient
+// setup.
+static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2(
+ const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
+ __m256i s_256[8], __m256i r[4]) {
+ __m256i a_256[4], ss_256[4];
+ s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
+
+ // First output: window v0..v7, folded as v0+v7, v1+v6, v2+v5, v3+v4.
+ a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
+ a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
+ a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
+ a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
+
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
+
+ // Second output: window v1..v8. These three sums (v2+v7, v3+v6, v4+v5) must
+ // be formed before the even slots are rotated below.
+ a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
+ a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
+ a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
+ // Rotate the even slots and load v8 into s_256[6].
+ s_256[0] = s_256[2];
+ s_256[2] = s_256[4];
+ s_256[4] = s_256[6];
+ s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
+
+ // a_256[0] = v1 + v8; s_256[1] still holds v1 until the odd slots rotate.
+ a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
+ s_256[1] = s_256[3];
+ s_256[3] = s_256[5];
+ s_256[5] = s_256[7];
+ ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
+ ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
+ ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
+
+ xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
+}
+
+// Rounds the two accumulator vectors in res with xy_y_round_16_avx2() and
+// hands the rounded vector to pack_store_8x2_avx2() together with dst and
+// stride.
+static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2],
+                                             uint8_t *const dst,
+                                             const ptrdiff_t stride) {
+  const __m256i rounded = xy_y_round_16_avx2(res);
+
+  pack_store_8x2_avx2(rounded, dst, stride);
+}
+
+// Rounds res[0..1] and res[2..3] separately with xy_y_round_16_avx2() and
+// passes both rounded vectors to xy_y_pack_store_16x2_avx2() together with
+// dst and stride.
+static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4],
+                                              uint8_t *const dst,
+                                              const ptrdiff_t stride) {
+  __m256i rounded[2];
+
+  rounded[0] = xy_y_round_16_avx2(res + 0);
+  rounded[1] = xy_y_round_16_avx2(res + 2);
+  xy_y_pack_store_16x2_avx2(rounded[0], rounded[1], dst, stride);
+}
+
+// Applies sr_y_round_avx2() to each of the two vectors in res and passes the
+// rounded pair to convolve_store_32_avx2() for output at dst.
+static INLINE void sr_y_round_store_32_avx2(const __m256i res[2],
+                                            uint8_t *const dst) {
+  const __m256i rounded0 = sr_y_round_avx2(res[0]);
+  const __m256i rounded1 = sr_y_round_avx2(res[1]);
+
+  convolve_store_32_avx2(rounded0, rounded1, dst);
+}
+
+// Rounds and stores two outputs: res[0..1] goes to dst and res[2..3] goes to
+// dst + dst_stride, each via sr_y_round_store_32_avx2().
+static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[4],
+                                              uint8_t *const dst,
+                                              const int32_t dst_stride) {
+  uint8_t *const dst_second = dst + dst_stride;
+
+  sr_y_round_store_32_avx2(res + 0, dst);
+  sr_y_round_store_32_avx2(res + 2, dst_second);
+}
+
+// One 2-tap step: y_convolve_2tap_32_avx2() is called with src, coeffs, the
+// caller-held vector s0 and the in/out slot s1, and its two result vectors
+// are rounded and stored at dst by sr_y_round_store_32_avx2().
+static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src,
+                                     const __m256i coeffs[1], const __m256i s0,
+                                     __m256i *const s1, uint8_t *const dst) {
+  __m256i filtered[2];
+
+  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, filtered);
+  sr_y_round_store_32_avx2(filtered, dst);
+}
+
+static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
+ const int32_t subpel_y_q4) {
+ int32_t x, y;
+ __m128i coeffs_128[4];
+ __m256i coeffs_256[4];
+
+ int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+
+ if (vert_tap == 2) {
+ // vert_filt as 2 tap
+ const uint8_t *src_ptr = src;
+
+ y = h;
+
+ if (subpel_y_q4 != 8) {
+ if (w <= 8) {
+ prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
+ coeffs_128);
+
+ if (w == 2) {
+ __m128i s_16[2];
+
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
+
+ do {
+ const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
+ coeffs_128, s_16);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
+
+ do {
+ const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
+ coeffs_128, s_32);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2], s_128[2];
+
+ assert(w == 8);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
+
+ do {
+ // Note: Faster than binding to AVX2 registers.
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
+ s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
+ const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
+ const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
+ const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
+ const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
+ const __m128i r0 = sr_y_round_sse2(res0);
+ const __m128i r1 = sr_y_round_sse2(res1);
+ const __m128i d = _mm_packus_epi16(r0, r1);
+ _mm_storel_epi64((__m128i *)dst, d);
+ _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 16) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
+
+ do {
+ __m256i r[2];
+
+ y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
+ &s_256[1], dst);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
+ &s_256[0], dst + dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
+ &s_256[1][0], dst);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
+ s_256[0][1], &s_256[1][1], dst + 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
+ s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][4];
+
+ assert(w == 128);
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
+
+ do {
+ sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
+ &s_256[1][0], dst);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
+ s_256[0][1], &s_256[1][1], dst + 1 * 32);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
+ s_256[0][2], &s_256[1][2], dst + 2 * 32);
+ sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
+ s_256[0][3], &s_256[1][3], dst + 3 * 32);
+
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
+ s_256[1][1], &s_256[0][1],
+ dst + dst_stride + 1 * 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
+ s_256[1][2], &s_256[0][2],
+ dst + dst_stride + 2 * 32);
+ sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
+ s_256[1][3], &s_256[0][3],
+ dst + dst_stride + 3 * 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+ } else {
+ // average to get half pel
+ if (w <= 8) {
+ if (w == 2) {
+ __m128i s_16[2];
+
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
+
+ do {
+ s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
+ *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
+ s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
+ *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ __m128i s_32[2];
+
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
+
+ do {
+ s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
+ xx_storel_32(dst, d0);
+ s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
+ xx_storel_32(dst + dst_stride, d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_64[2];
+
+ assert(w == 8);
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
+
+ do {
+ // Note: Faster than binding to AVX2 registers.
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
+ _mm_storel_epi64((__m128i *)dst, d0);
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else if (w == 16) {
+ __m128i s_128[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
+
+ do {
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
+ _mm_storeu_si128((__m128i *)dst, d0);
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ __m256i s_256[2];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
+ dst + dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 64) {
+ __m256i s_256[2][2];
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
+ dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
+ &s_256[1][1], dst + 32);
+
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
+ &s_256[0][1], dst + dst_stride + 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[2][4];
+
+ assert(w == 128);
+
+ s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
+ s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
+ s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
+ s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
+
+ do {
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
+ dst);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
+ &s_256[1][1], dst + 1 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
+ &s_256[1][2], dst + 2 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
+ &s_256[1][3], dst + 3 * 32);
+
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
+ &s_256[0][0], dst + dst_stride);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
+ &s_256[0][1], dst + dst_stride + 1 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
+ &s_256[0][2], dst + dst_stride + 2 * 32);
+ sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
+ &s_256[0][3], dst + dst_stride + 3 * 32);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ }
+ } else if (vert_tap == 4) {
+ // vert_filt as 4 tap
+ const uint8_t *src_ptr = src - src_stride;
+
+ y = h;
+
+ if (w <= 4) {
+ prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ if (w == 2) {
+ __m128i s_16[4], ss_128[2];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_4tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[4], ss_128[2];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_4tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[4];
+ __m256i ss_256[2];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m256i res = y_convolve_4tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[4];
+ __m256i ss_256[4], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
+
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ // AV1 standard won't have 32x4 case.
+ // This only favors some optimization feature which
+ // subsamples 32x8 to 32x4 and triggers 4-tap filter.
+
+ __m256i s_256[4], ss_256[4], tt_256[4], r[4];
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
+ ss_256, tt_256, r);
+ sr_y_round_store_32x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(!(w % 32));
+
+ __m256i s_256[4], ss_256[4], tt_256[4], r[4];
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+
+ y = h;
+ do {
+ s += 2 * src_stride;
+ y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[2] = ss_256[3];
+
+ tt_256[0] = tt_256[1];
+ tt_256[2] = tt_256[3];
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ x += 32;
+ } while (x < w);
+ }
+ }
+ } else if (vert_tap == 6) {
+ // vert_filt as 6 tap
+ const uint8_t *src_ptr = src - 2 * src_stride;
+
+ if (w <= 4) {
+ prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ y = h;
+
+ if (w == 2) {
+ __m128i s_16[6], ss_128[3];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+ const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+ const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_6tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[6], ss_128[3];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+
+ do {
+ src_ptr += 2 * src_stride;
+ const __m128i res = y_convolve_6tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[6];
+ __m256i ss_256[3];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+
+ y = h;
+ do {
+ src_ptr += 2 * src_stride;
+ const __m256i res = y_convolve_6tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[6];
+ __m256i ss_256[6], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
+ s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+
+ ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
+ ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
+
+ y = h;
+ do {
+ src_ptr += 2 * src_stride;
+ y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[6], ss_256[6], tt_256[6], r[4];
+
+ assert(!(w % 32));
+
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+ ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+ ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
+ tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+ tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
+
+ y = h;
+ do {
+ s += 2 * src_stride;
+ y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[3] = ss_256[4];
+ ss_256[4] = ss_256[5];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[3] = tt_256[4];
+ tt_256[4] = tt_256[5];
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+ } else if (vert_tap == 8) {
+ // vert_filt as 8 tap
+ const uint8_t *src_ptr = src - 3 * src_stride;
+
+ if (w <= 4) {
+ prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
+
+ y = h;
+
+ if (w == 2) {
+ __m128i s_16[8], ss_128[4];
+
+ s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
+ s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
+ s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
+ s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
+ s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
+ s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
+ s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
+ const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
+ const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
+ const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
+ const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
+ const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+
+ do {
+ const __m128i res = y_convolve_8tap_2x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_16, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ ss_128[2] = ss_128[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m128i s_32[8], ss_128[4];
+
+ assert(w == 4);
+
+ s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
+ s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
+ s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
+ s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
+ s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
+ s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
+ s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
+
+ const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
+ const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
+ const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
+ const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
+ const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
+ const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
+
+ ss_128[0] = _mm_unpacklo_epi8(src01, src12);
+ ss_128[1] = _mm_unpacklo_epi8(src23, src34);
+ ss_128[2] = _mm_unpacklo_epi8(src45, src56);
+
+ do {
+ const __m128i res = y_convolve_8tap_4x2_ssse3(
+ src_ptr, src_stride, coeffs_128, s_32, ss_128);
+ const __m128i r = sr_y_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ ss_128[0] = ss_128[1];
+ ss_128[1] = ss_128[2];
+ ss_128[2] = ss_128[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
+
+ if (w == 8) {
+ __m128i s_64[8];
+ __m256i ss_256[4];
+
+ s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
+ s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
+ s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
+ s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
+ s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
+ s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
+ s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
+ const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
+ const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+
+ y = h;
+ do {
+ const __m256i res = y_convolve_8tap_8x2_avx2(
+ src_ptr, src_stride, coeffs_256, s_64, ss_256);
+ sr_y_round_store_8x2_avx2(res, dst, dst_stride);
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ __m128i s_128[8];
+ __m256i ss_256[8], r[2];
+
+ s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
+ s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
+ s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
+ s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
+ s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
+ s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
+ s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
+ const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
+ const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
+ const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
+ const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
+ const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
+
+ ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
+ ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
+ ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
+
+ ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
+ ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
+ ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
+
+ y = h;
+ do {
+ y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
+ ss_256, r);
+ sr_y_round_store_16x2_avx2(r, dst, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ __m256i s_256[8], ss_256[8], tt_256[8], r[4];
+
+ assert(!(w % 32));
+
+ x = 0;
+ do {
+ const uint8_t *s = src_ptr + x;
+ uint8_t *d = dst + x;
+
+ s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
+ s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
+ s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
+ s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
+ s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
+ s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
+ s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
+
+ ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
+ ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
+ ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
+ ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
+ ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
+ ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
+
+ tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
+ tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
+ tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
+ tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
+ tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
+ tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
+
+ y = h;
+ do {
+ y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
+ tt_256, r);
+ sr_y_round_store_32x2_avx2(r, d, dst_stride);
+
+ ss_256[0] = ss_256[1];
+ ss_256[1] = ss_256[2];
+ ss_256[2] = ss_256[3];
+ ss_256[4] = ss_256[5];
+ ss_256[5] = ss_256[6];
+ ss_256[6] = ss_256[7];
+
+ tt_256[0] = tt_256[1];
+ tt_256[1] = tt_256[2];
+ tt_256[2] = tt_256[3];
+ tt_256[4] = tt_256[5];
+ tt_256[5] = tt_256[6];
+ tt_256[6] = tt_256[7];
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+
+ x += 32;
+ } while (x < w);
+ }
+ }
+ }
+}
+
+static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[1],
+ uint8_t *const dst) {
+ // Runs the 2-tap x convolve on src and passes the two result registers to
+ // sr_x_round_store_32_avx2(), which writes dst.
+ __m256i conv_out[2];
+ x_convolve_2tap_32_avx2(src, coeffs, conv_out);
+ sr_x_round_store_32_avx2(conv_out, dst);
+}
+
+static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[3],
+ const __m256i filt[3],
+ uint8_t *const dst) {
+ // Runs the 6-tap x convolve on src; the result goes to the round/store helper.
+ __m256i conv_out[2];
+ x_convolve_6tap_32_avx2(src, coeffs, filt, conv_out);
+ sr_x_round_store_32_avx2(conv_out, dst);
+}
+
+static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
+ const __m256i coeffs[4],
+ const __m256i filt[4],
+ uint8_t *const dst) {
+ // Runs the 8-tap x convolve on src; the result goes to the round/store helper.
+ __m256i conv_out[2];
+ x_convolve_8tap_32_avx2(src, coeffs, filt, conv_out);
+ sr_x_round_store_32_avx2(conv_out, dst);
+}
+
+static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
+ const int32_t subpel_x_q4, ConvolveParams *conv_params) {
+ int32_t y = h;  // rows left; counted down by the do/while loops below
+ __m128i coeffs_128[4];
+ __m256i coeffs_256[4];
+ // Horizontal (x-direction) filtering of a w x h block from src into dst.
+ assert(conv_params->round_0 == 3);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ (void)conv_params;
+ // conv_params is only read by the asserts above.
+ const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
+ // Dispatch on the tap count first, then on the block width w.
+ if (horz_tap == 2) {
+ // horz_filt as 2 tap
+ const uint8_t *src_ptr = src;
+
+ if (subpel_x_q4 != 8) {
+ if (w <= 8) {
+ prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
+ coeffs_128);
+ // w <= 8: 128-bit kernels, two rows per iteration.
+ if (w == 2) {
+ do {
+ const __m128i res =
+ x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i res =
+ x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else {
+ assert(w == 8);
+
+ do {
+ __m128i res[2];
+
+ x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
+ res[0] = sr_x_round_sse2(res[0]);
+ res[1] = sr_x_round_sse2(res[1]);
+ const __m128i d = _mm_packus_epi16(res[0], res[1]);
+ _mm_storel_epi64((__m128i *)dst, d);
+ _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ }
+ } else {
+ prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+ // w == 16 does two rows per pass; wider blocks do one row in 32-pixel steps.
+ if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
+ sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ } else {
+ // subpel_x_q4 == 8: average each pixel with its right neighbor (half pel)
+ if (w == 2) {
+ do {
+ __m128i s_128;
+
+ s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
+ const __m128i s1 = _mm_srli_si128(s_128, 1);
+ const __m128i d = _mm_avg_epu8(s_128, s1);
+ *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
+ *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ __m128i s_128;
+
+ s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
+ const __m128i s1 = _mm_srli_si128(s_128, 1);
+ const __m128i d = _mm_avg_epu8(s_128, s1);
+ xx_storel_32(dst, d);
+ *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ do {
+ const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
+ const __m128i s10 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i s01 = _mm_srli_si128(s00, 1);
+ const __m128i s11 = _mm_srli_si128(s10, 1);
+ const __m128i d0 = _mm_avg_epu8(s00, s01);
+ const __m128i d1 = _mm_avg_epu8(s10, s11);
+ _mm_storel_epi64((__m128i *)dst, d0);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
+ const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
+ const __m128i s10 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
+ const __m128i s11 =
+ _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
+ const __m128i d0 = _mm_avg_epu8(s00, s01);
+ const __m128i d1 = _mm_avg_epu8(s10, s11);
+ _mm_storeu_si128((__m128i *)dst, d0);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
+
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
+ sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ } else if (horz_tap == 4) {
+ // horz_filt as 4 tap
+ const uint8_t *src_ptr = src - 1;  // start 1 pixel to the left of src
+
+ prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
+
+ if (w == 2) {
+ do {
+ const __m128i res =
+ x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_2x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 4) {
+ do {
+ const __m128i res =
+ x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
+ const __m128i r = sr_x_round_sse2(res);
+ pack_store_4x2_sse2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 8) {
+ // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
+ // rewrite this for better performance later.
+ __m256i filt_256[2];
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
+
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ for (int i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+ // data: row i in the low 128-bit lane, row i + 1 in the high lane.
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
+ res_16b = sr_x_round_avx2(res_16b);
+
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ }
+ } else {
+ assert(!(w % 16));
+ // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
+ // rewrite this for better performance later.
+ __m256i filt_256[2];
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b =
+ convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
+ res_16b = sr_x_round_avx2(res_16b);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+ } else {
+ __m256i filt_256[4];
+ // filt_256[0..2] are loaded for both paths; [3] is loaded only for 8-tap.
+ filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
+ filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
+ filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
+
+ if (horz_tap == 6) {
+ // horz_filt as 6 tap
+ const uint8_t *src_ptr = src - 2;  // start 2 pixels to the left of src
+
+ prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
+ coeffs_256, filt_256);
+ sr_x_round_store_8x2_avx2(res, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
+ r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
+ dst + 1 * 32);
+ sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
+ dst + 2 * 32);
+ sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
+ dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ } else if (horz_tap == 8) {
+ // horz_filt as 8 tap
+ const uint8_t *src_ptr = src - 3;  // start 3 pixels to the left of src
+
+ filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
+
+ if (w == 8) {
+ do {
+ const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
+ coeffs_256, filt_256);
+ sr_x_round_store_8x2_avx2(res, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 16) {
+ do {
+ __m256i r[2];
+
+ x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
+ r);
+ sr_x_round_store_16x2_avx2(r, dst, dst_stride);
+ src_ptr += 2 * src_stride;
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ } else if (w == 32) {
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else if (w == 64) {
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ } else {
+ assert(w == 128);
+
+ do {
+ sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
+ sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
+ dst + 1 * 32);
+ sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
+ dst + 2 * 32);
+ sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
+ dst + 3 * 32);
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--y);
+ }
+ }
+ }
+}
+
+#endif // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/third_party/SVT-AV1/synonyms.h b/third_party/aom/third_party/SVT-AV1/synonyms.h
new file mode 100644
index 0000000000..0ded6e5cfc
--- /dev/null
+++ b/third_party/aom/third_party/SVT-AV1/synonyms.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
+#define AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
+
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE __m128i load_u8_8x2_sse2(const uint8_t *const src,
+ const ptrdiff_t stride) {  // forwards with stride as an int byte count
+ return load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
+}
+
+static AOM_FORCE_INLINE void store_u8_4x2_sse2(const __m128i src,
+ uint8_t *const dst,
+ const ptrdiff_t stride) {
+ xx_storel_32(dst, src);  // row 0: bytes 0..3 of src
+ // Row 1 is bytes 4..7: shift them down; no raw uint32_t cast on dst + stride.
+ xx_storel_32(dst + stride, _mm_srli_si128(src, 4));
+}
+
+#endif // AOM_THIRD_PARTY_SVT_AV1_SYNONYMS_H_
diff --git a/third_party/aom/third_party/fastfeat/LICENSE b/third_party/aom/third_party/fastfeat/LICENSE
new file mode 100644
index 0000000000..f347008d6e
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+
+ *Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ *Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ *Neither the name of the University of Cambridge nor the names of
+ its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/fastfeat/README.libaom b/third_party/aom/third_party/fastfeat/README.libaom
new file mode 100644
index 0000000000..556d8b6749
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/README.libaom
@@ -0,0 +1,44 @@
+URL: https://github.com/edrosten/fast-C-src
+Version: 391d5e939eb1545d24c10533d7de424db8d9c191
+License: BSD
+License File: LICENSE
+
+Description:
+Library to compute FAST features with non-maximum suppression.
+
+The files are valid C and C++ code, and have no special requirements for
+compiling, and they do not depend on any libraries. Just compile them along with
+the rest of your project.
+
+To use the functions, #include "fast.h"
+
+The corner detectors have the following prototype (where X is 9, 10, 11 or 12):
+
+xy* fastX_detect_nonmax(const unsigned char * data, int xsize, int ysize, int stride, int threshold, int* numcorners)
+
+Where xy is the following simple struct typedef:
+
+typedef struct
+{
+ int x, y;
+} xy;
+
+The image is passed in as a block of data and dimensions, and the list of
+corners is returned as an array of xy structs, and an integer (numcorners)
+with the number of corners returned. The data can be deallocated with free().
+Nonmaximal suppression is performed on the corners. Note that the stride
+is the number of bytes between rows. If your image has no padding, then this
+is the same as xsize.
+
+The detection, scoring and nonmaximal suppression are available as individual
+functions. To see how to use the individual functions, see fast.c
+
+Local Modifications:
+Add lines to turn off clang formatting for these files
+Remove Fast 10, 11 and 12
+Convert tabs to spaces
+Prefix global functions with "aom_"
+Add error checking
+Add output argument to hold the scores of the detected features
+Add assertion and rewrite comparisons to appease the scan-build static analyzer
+Set output argument *ret_num_corners to -1 to signal memory allocation failure
diff --git a/third_party/aom/third_party/fastfeat/fast.c b/third_party/aom/third_party/fastfeat/fast.c
new file mode 100644
index 0000000000..c475b4c7e9
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast.c
@@ -0,0 +1,67 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// *Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// *Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// *Neither the name of the University of Cambridge nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b,
+ int** ret_scores, int* ret_num_corners)
+{
+ int n_found;
+ int* corner_scores;
+ xy* kept;
+ xy* found = aom_fast9_detect(im, xsize, ysize, stride, b, &n_found);
+
+ // A NULL corner list is reported to the caller as an allocation failure.
+ if(found == NULL)
+ {
+ *ret_num_corners = -1;
+ return NULL;
+ }
+
+ // n_found may be zero, so a NULL score array counts as an allocation
+ // failure only when at least one corner was detected.
+ corner_scores = aom_fast9_score(im, stride, found, n_found, b);
+ if(n_found > 0 && corner_scores == NULL)
+ {
+ free(found);
+ *ret_num_corners = -1;
+ return NULL;
+ }
+
+ // Both temporary arrays are released once suppression has returned.
+ kept = aom_nonmax_suppression(found, corner_scores, n_found, ret_scores, ret_num_corners);
+ free(found);
+ free(corner_scores);
+ return kept;
+}
+// clang-format on
diff --git a/third_party/aom/third_party/fastfeat/fast.h b/third_party/aom/third_party/fastfeat/fast.h
new file mode 100644
index 0000000000..228ba85ad4
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// *Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// *Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// *Neither the name of the University of Cambridge nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// clang-format off
+#ifndef FAST_H
+#define FAST_H
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
+
+// Returns NULL on memory allocation failure.
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+// If num_corners > 0, returns NULL on memory allocation failure.
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b);
+
+// Sets *ret_num_corners to -1 (and returns NULL) on memory allocation failure.
+// Sets *ret_num_corners to 0 if nothing went wrong but no corners were found.
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b,
+ int** ret_scores, int* ret_num_corners);
+
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners,
+ int** ret_scores, int* ret_num_nonmax);
+
+
+#endif
+// clang-format on
diff --git a/third_party/aom/third_party/fastfeat/fast_9.c b/third_party/aom/third_party/fastfeat/fast_9.c
new file mode 100644
index 0000000000..de55ab51fe
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast_9.c
@@ -0,0 +1,5947 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// *Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// *Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// *Neither the name of the University of Cambridge nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// clang-format off
+/*This is mechanically generated code*/
+#include <stdlib.h>
+#include "fast.h"
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
+{
+ int bmin = bstart;
+ int bmax = 255;
+ int b = (bmax + bmin)/2;
+
+ /*Compute the score using binary search*/
+ for(;;)
+ {
+ int cb = *p + b;
+ int c_b= *p - b;
+
+
+ if( p[pixel[0]] > cb)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[15]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[0]] < c_b)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[15]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[1]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[1]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+
+is_a_corner:
+ bmin=b;
+ goto end_if;
+
+is_not_a_corner:
+ bmax=b;
+ goto end_if;
+
+end_if:
+
+ if(bmin == bmax - 1 || bmin == bmax)
+ return bmin;
+ b = (bmin + bmax) / 2;
+ }
+}
+/*
+ * make_offsets: fill pixel[0..15] with the pointer offsets of the 16 ring
+ * samples that surround a centre pixel.
+ *
+ * Entry k is dx + row_stride * dy. Walking k = 0..15 the (dx, dy) pairs are
+ * (0,3) (1,3) (2,2) (3,1) (3,0) (3,-1) (2,-2) (1,-3) (0,-3) (-1,-3) (-2,-2)
+ * (-3,-1) (-3,0) (-3,1) (-2,2) (-1,3), so consecutive entries are neighbours
+ * on the ring and no sample is more than 3 rows or 3 columns from the centre.
+ *
+ * pixel      - output array; elements 0..15 are all overwritten.
+ * row_stride - distance between vertically adjacent samples, measured in
+ *              elements of the buffer the offsets are later added to.
+ *
+ * The offsets are meant to be used as p[pixel[k]] with p pointing at the
+ * centre sample, which is how the detector and scorer in this file use them.
+ */
+static void make_offsets(int pixel[], int row_stride)
+{
+ pixel[0] = 0 + row_stride * 3; /* dx = 0, dy = +3 */
+ pixel[1] = 1 + row_stride * 3;
+ pixel[2] = 2 + row_stride * 2;
+ pixel[3] = 3 + row_stride * 1;
+ pixel[4] = 3 + row_stride * 0; /* dx = +3, dy = 0 */
+ pixel[5] = 3 + row_stride * -1;
+ pixel[6] = 2 + row_stride * -2;
+ pixel[7] = 1 + row_stride * -3;
+ pixel[8] = 0 + row_stride * -3; /* dx = 0, dy = -3 */
+ pixel[9] = -1 + row_stride * -3;
+ pixel[10] = -2 + row_stride * -2;
+ pixel[11] = -3 + row_stride * -1;
+ pixel[12] = -3 + row_stride * 0; /* dx = -3, dy = 0 */
+ pixel[13] = -3 + row_stride * 1;
+ pixel[14] = -2 + row_stride * 2;
+ pixel[15] = -1 + row_stride * 3;
+}
+
+
+/*
+ * aom_fast9_score: compute one score per listed corner by calling
+ * aom_fast9_corner_score() at each corner position.
+ *
+ * i           - base pointer of the image; corner n is evaluated at
+ *               i + corners[n].y * stride + corners[n].x.
+ * stride      - distance between the starts of consecutive rows of i, in
+ *               elements of i.
+ * corners     - array of corner coordinates; entries 0..num_corners-1 are read.
+ * num_corners - number of entries in corners and in the returned array.
+ * b           - forwarded unchanged as the third argument of
+ *               aom_fast9_corner_score() (presumably the starting threshold;
+ *               confirm against that function).
+ *
+ * Returns an array of num_corners ints obtained from malloc(), where element
+ * n is the score of corners[n], or NULL if malloc() returned NULL. The array
+ * is not freed here, so the caller is presumably responsible for free()ing it.
+ *
+ * NOTE(review): corner coordinates are not range-checked. The ring offsets
+ * reach up to 3 rows and 3 columns away from each corner, so corners are
+ * assumed to lie at least 3 samples inside the image (aom_fast9_detect only
+ * scans that interior region) - confirm for other callers.
+ */
+int* aom_fast9_score(const byte* i, int stride, const xy* corners, int num_corners, int b)
+{
+ int* scores = (int*)malloc(sizeof(int)* num_corners);
+ int n;
+
+ int pixel[16]; /* ring-sample offsets, filled by make_offsets() below */
+ if(!scores) return NULL; /* NOTE(review): malloc(0) may also return NULL, so for num_corners == 0 a NULL result need not mean out-of-memory */
+ make_offsets(pixel, stride);
+
+ for(n=0; n < num_corners; n++)
+ scores[n] = aom_fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+
+ return scores;
+}
+
+
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+ int num_corners=0;
+ xy* ret_corners;
+ int rsize=512;
+ int pixel[16];
+ int x, y;
+
+ ret_corners = (xy*)malloc(sizeof(xy)*rsize);
+ if(!ret_corners) return NULL;
+ make_offsets(pixel, stride);
+
+ for(y=3; y < ysize - 3; y++)
+ for(x=3; x < xsize - 3; x++)
+ {
+ const byte* p = im + y*stride + x;
+
+ int cb = *p + b;
+ int c_b= *p - b;
+ if(p[pixel[0]] > cb)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[15]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[0]] < c_b)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[15]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[1]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[1]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ if(num_corners == rsize)
+ {
+ rsize*=2;
+ xy* new_ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
+ if(!new_ret_corners)
+ {
+ free(ret_corners);
+ return NULL;
+ }
+ ret_corners = new_ret_corners;
+ }
+ ret_corners[num_corners].x = x;
+ ret_corners[num_corners].y = y;
+ num_corners++;
+
+ }
+
+ *ret_num_corners = num_corners;
+ return ret_corners;
+
+}
+
+// clang-format on
diff --git a/third_party/aom/third_party/fastfeat/nonmax.c b/third_party/aom/third_party/fastfeat/nonmax.c
new file mode 100644
index 0000000000..a6f7da0313
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/nonmax.c
@@ -0,0 +1,174 @@
+// Copyright (c) 2006, 2008 Edward Rosten
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// *Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// *Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// *Neither the name of the University of Cambridge nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// clang-format off
+#include <assert.h>
+#include <stdlib.h>
+#include "fast.h"
+
+
+#define Compare(X, Y) ((X)>=(Y))
+
+/* Non-maximum suppression for a list of scored corners.
+ *
+ * A corner is discarded when a corner at one of its eight neighbouring pixel
+ * positions has a score for which Compare(neighbour, own) holds; every other
+ * corner is copied to the output in input order.
+ *
+ * corners         corner positions. The row lookup table built here relies on
+ *                 them being in raster-scan order (by row, then by column).
+ * scores          scores[i] is the score of corners[i].
+ * num_corners     number of entries in corners and scores.
+ * ret_scores      receives a malloc()ed array holding the scores of the
+ *                 surviving corners, or 0 when no array is returned.
+ * ret_num_nonmax  receives the number of surviving corners: 0 when the input
+ *                 is empty or missing, -1 when an allocation fails.
+ *
+ * Returns a malloc()ed array of the surviving corners, or 0 when the input is
+ * empty/missing or an allocation fails. Both returned arrays are presumably
+ * released by the caller with free().
+ */
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners,
+                           int** ret_scores, int* ret_num_nonmax)
+{
+  const int count = (int)num_corners;
+  xy* kept_corners;
+  int* kept_scores;
+  /* first_in_row[r] is the index of the first corner on row r, or -1 when the
+     row holds no corner. */
+  int* first_in_row;
+  int kept = 0;
+  int max_row;
+  int seen_row = -1;
+  int idx;
+
+  /* Cursors into corners that (roughly) follow the pixels above and below the
+     corner being examined; their values carry over between iterations. */
+  int above = 0;
+  int below = 0;
+
+  *ret_scores = 0;
+  *ret_num_nonmax = -1;
+  if(!corners || !scores || num_corners < 1)
+  {
+    *ret_num_nonmax = 0;
+    return 0;
+  }
+
+  kept_corners = (xy*)malloc(num_corners * sizeof(xy));
+  if(!kept_corners)
+    return 0;
+
+  kept_scores = (int*)malloc(num_corners * sizeof(*kept_scores));
+  if(!kept_scores)
+  {
+    free(kept_corners);
+    return 0;
+  }
+
+  /* The last corner lies on the last occupied row, which sizes the table. */
+  max_row = corners[num_corners-1].y;
+  first_in_row = (int*)malloc((max_row+1)*sizeof(int));
+  if(!first_in_row)
+  {
+    free(kept_corners);
+    free(kept_scores);
+    return 0;
+  }
+
+  for(idx = 0; idx < max_row+1; idx++)
+    first_in_row[idx] = -1;
+
+  /* Record where each row starts: the index at which the row number changes. */
+  for(idx = 0; idx < num_corners; idx++)
+  {
+    const int row = corners[idx].y;
+    if(row == seen_row)
+      continue;
+    first_in_row[row] = idx;
+    seen_row = row;
+  }
+
+  for(idx = 0; idx < count; idx++)
+  {
+    const int score = scores[idx];
+    const xy pos = corners[idx];
+    int suppressed = 0;
+    int k;
+    assert(pos.y <= max_row);
+
+    /* Left neighbour: the previous list entry, if it is the adjacent pixel. */
+    if(idx > 0 && corners[idx-1].x == pos.x-1 && corners[idx-1].y == pos.y && Compare(scores[idx-1], score))
+      continue;
+
+    /* Right neighbour: the next list entry, if it is the adjacent pixel. */
+    if(idx < (count - 1) && corners[idx+1].x == pos.x+1 && corners[idx+1].y == pos.y && Compare(scores[idx+1], score))
+      continue;
+
+    /* Row above, when that row contains any corner. */
+    if(pos.y > 0 && first_in_row[pos.y - 1] != -1)
+    {
+      /* Bring the cursor up to the row directly above if it lags behind. */
+      if(corners[above].y < pos.y - 1)
+        above = first_in_row[pos.y-1];
+
+      /* Skip entries that lie to the left of the 3-pixel window. */
+      while(corners[above].y < pos.y && corners[above].x < pos.x - 1)
+        above++;
+
+      for(k = above; corners[k].y < pos.y && corners[k].x <= pos.x + 1; k++)
+      {
+        const int x = corners[k].x;
+        if(x >= pos.x - 1 && x <= pos.x + 1 && Compare(scores[k], score))
+        {
+          suppressed = 1;
+          break;
+        }
+      }
+      if(suppressed)
+        continue;
+    }
+
+    /* Row below, when that row exists, contains a corner, and the cursor has
+       not already run off the end of the list. */
+    if(pos.y + 1 < max_row+1 && first_in_row[pos.y + 1] != -1 && below < count)
+    {
+      if(corners[below].y < pos.y + 1)
+        below = first_in_row[pos.y+1];
+
+      /* Skip entries on the row below that lie left of the 3-pixel window. */
+      while(below < count && corners[below].y == pos.y+1 && corners[below].x < pos.x - 1)
+        below++;
+
+      for(k = below; k < count && corners[k].y == pos.y+1 && corners[k].x <= pos.x + 1; k++)
+      {
+        const int x = corners[k].x;
+        if(x >= pos.x - 1 && x <= pos.x + 1 && Compare(scores[k], score))
+        {
+          suppressed = 1;
+          break;
+        }
+      }
+      if(suppressed)
+        continue;
+    }
+
+    /* No neighbour won the comparison: keep this corner. */
+    kept_corners[kept] = pos;
+    kept_scores[kept] = score;
+    kept++;
+  }
+
+  free(first_in_row);
+  *ret_scores = kept_scores;
+  *ret_num_nonmax = kept;
+  return kept_corners;
+}
+
+// clang-format on
diff --git a/third_party/aom/third_party/googletest/README.libaom b/third_party/aom/third_party/googletest/README.libaom
new file mode 100644
index 0000000000..5e429d4dae
--- /dev/null
+++ b/third_party/aom/third_party/googletest/README.libaom
@@ -0,0 +1,38 @@
+URL: https://github.com/google/googletest
+Version: release-1.12.1
+License: BSD
+License File: LICENSE
+
+Description:
+Google's framework for writing C++ tests on a variety of platforms
+(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the
+xUnit architecture. Supports automatic test discovery, a rich set of
+assertions, user-defined assertions, death tests, fatal and non-fatal
+failures, various options for running the tests, and XML test report
+generation.
+
+Local Modifications:
+- Remove everything but:
+ .clang-format
+ CMakeLists.txt
+ CONTRIBUTORS
+ googlemock/
+ cmake
+ CMakeLists.txt
+ include
+ README.md
+ src
+ googletest/
+ cmake
+ CMakeLists.txt
+ include
+ README.md
+ src
+ LICENSE
+ README.md
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+ GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+ the mingw32 g++ compilation errors caused by the lack of std::mutex
+ and std::condition_variable in the <mutex> and <condition_variable>
+ headers if mingw32 is configured with the win32 threads option. See
+ https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/third_party/aom/third_party/googletest/src/.clang-format b/third_party/aom/third_party/googletest/src/.clang-format
new file mode 100644
index 0000000000..5b9bfe6d22
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language: Cpp
+BasedOnStyle: Google
diff --git a/third_party/aom/third_party/googletest/src/CMakeLists.txt b/third_party/aom/third_party/googletest/src/CMakeLists.txt
new file mode 100644
index 0000000000..102e28cd49
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+
+cmake_minimum_required(VERSION 3.5)
+
+if (POLICY CMP0048)
+ cmake_policy(SET CMP0048 NEW)  # CMP0048: project() manages the VERSION variables
+endif (POLICY CMP0048)
+
+if (POLICY CMP0077)
+ cmake_policy(SET CMP0077 NEW)  # CMP0077: option() honors normal variables of the same name
+endif (POLICY CMP0077)
+
+project(googletest-distribution)
+set(GOOGLETEST_VERSION 1.12.1)  # read by the googlemock subproject's project() call
+
+if(NOT CYGWIN AND NOT MSYS AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL QNX)
+ set(CMAKE_CXX_EXTENSIONS OFF)  # no compiler-specific language extensions, except on Cygwin/MSYS/QNX
+endif()
+
+enable_testing()
+
+include(CMakeDependentOption)
+include(GNUInstallDirs)
+
+# Note that the googlemock target already builds googletest, so only one of the two subdirectories is added below.
+option(BUILD_GMOCK "Builds the googlemock subproject" ON)
+option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" ON)
+
+if(BUILD_GMOCK)
+ add_subdirectory( googlemock )
+else()
+ add_subdirectory( googletest )
+endif()
diff --git a/third_party/aom/third_party/googletest/src/CONTRIBUTORS b/third_party/aom/third_party/googletest/src/CONTRIBUTORS
new file mode 100644
index 0000000000..77397a5b53
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/CONTRIBUTORS
@@ -0,0 +1,65 @@
+# This file contains a list of people who've made non-trivial
+# contribution to the Google C++ Testing Framework project. People
+# who commit code to the project are encouraged to add their names
+# here. Please keep the list sorted by first names.
+
+Ajay Joshi <jaj@google.com>
+Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
+Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
+Chandler Carruth <chandlerc@google.com>
+Chris Prince <cprince@google.com>
+Chris Taylor <taylorc@google.com>
+Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
+Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
+Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
+Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
+Jói Sigurðsson <joi@google.com>
+Keir Mierle <mierle@gmail.com>
+Keith Ray <keith.ray@gmail.com>
+Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
+Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
+Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
+Matthew Simmons <simmonmt@acm.org>
+Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
+Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
+Pasi Valminen <pasi.valminen@gmail.com>
+Patrick Hanna <phanna@google.com>
+Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
+Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
+Preston Jackson <preston.a.jackson@gmail.com>
+Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
+Russ Cox <rsc@google.com>
+Russ Rufer <russ@pentad.com>
+Sean Mcafee <eefacm@gmail.com>
+Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
+Tracy Bialik <tracy@pentad.com>
+Vadim Berman <vadimb@google.com>
+Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
+Zhanyong Wan <wan@google.com>
diff --git a/third_party/aom/third_party/googletest/src/LICENSE b/third_party/aom/third_party/googletest/src/LICENSE
new file mode 100644
index 0000000000..1941a11f8c
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/LICENSE
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/googletest/src/README.md b/third_party/aom/third_party/googletest/src/README.md
new file mode 100644
index 0000000000..30edaecf31
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/README.md
@@ -0,0 +1,141 @@
+# GoogleTest
+
+### Announcements
+
+#### Live at Head
+
+GoogleTest now follows the
+[Abseil Live at Head philosophy](https://abseil.io/about/philosophy#upgrade-support).
+We recommend
+[updating to the latest commit in the `main` branch as often as possible](https://github.com/abseil/abseil-cpp/blob/master/FAQ.md#what-is-live-at-head-and-how-do-i-do-it).
+
+#### Documentation Updates
+
+Our documentation is now live on GitHub Pages at
+https://google.github.io/googletest/. We recommend browsing the documentation on
+GitHub Pages rather than directly in the repository.
+
+#### Release 1.11.0
+
+[Release 1.11.0](https://github.com/google/googletest/releases/tag/release-1.11.0)
+is now available.
+
+#### Coming Soon
+
+* We are planning to take a dependency on
+ [Abseil](https://github.com/abseil/abseil-cpp).
+* More documentation improvements are planned.
+
+## Welcome to **GoogleTest**, Google's C++ test framework!
+
+This repository is a merger of the formerly separate GoogleTest and GoogleMock
+projects. These were so closely related that it makes sense to maintain and
+release them together.
+
+### Getting Started
+
+See the [GoogleTest User's Guide](https://google.github.io/googletest/) for
+documentation. We recommend starting with the
+[GoogleTest Primer](https://google.github.io/googletest/primer.html).
+
+More information about building GoogleTest can be found at
+[googletest/README.md](googletest/README.md).
+
+## Features
+
+* An [xUnit](https://en.wikipedia.org/wiki/XUnit) test framework.
+* Test discovery.
+* A rich set of assertions.
+* User-defined assertions.
+* Death tests.
+* Fatal and non-fatal failures.
+* Value-parameterized tests.
+* Type-parameterized tests.
+* Various options for running the tests.
+* XML test report generation.
+
+## Supported Platforms
+
+GoogleTest requires a codebase and compiler compliant with the C++11 standard or
+newer.
+
+The GoogleTest code is officially supported on the following platforms.
+Operating systems or tools not listed below are community-supported. For
+community-supported platforms, patches that do not complicate the code may be
+considered.
+
+If you notice any problems on your platform, please file an issue on the
+[GoogleTest GitHub Issue Tracker](https://github.com/google/googletest/issues).
+Pull requests containing fixes are welcome!
+
+### Operating Systems
+
+* Linux
+* macOS
+* Windows
+
+### Compilers
+
+* gcc 5.0+
+* clang 5.0+
+* MSVC 2015+
+
+**macOS users:** Xcode 9.3+ provides clang 5.0+.
+
+### Build Systems
+
+* [Bazel](https://bazel.build/)
+* [CMake](https://cmake.org/)
+
+**Note:** Bazel is the build system used by the team internally and in tests.
+CMake is supported on a best-effort basis and by the community.
+
+## Who Is Using GoogleTest?
+
+In addition to many internal projects at Google, GoogleTest is also used by the
+following notable projects:
+
+* The [Chromium projects](http://www.chromium.org/) (behind the Chrome browser
+ and Chrome OS).
+* The [LLVM](http://llvm.org/) compiler.
+* [Protocol Buffers](https://github.com/google/protobuf), Google's data
+ interchange format.
+* The [OpenCV](http://opencv.org/) computer vision library.
+
+## Related Open Source Projects
+
+[GTest Runner](https://github.com/nholthaus/gtest-runner) is a Qt5 based
+automated test-runner and Graphical User Interface with powerful features for
+Windows and Linux platforms.
+
+[GoogleTest UI](https://github.com/ospector/gtest-gbar) is a test runner that
+runs your test binary, allows you to track its progress via a progress bar, and
+displays a list of test failures. Clicking on one shows failure text. GoogleTest
+UI is written in C#.
+
+[GTest TAP Listener](https://github.com/kinow/gtest-tap-listener) is an event
+listener for GoogleTest that implements the
+[TAP protocol](https://en.wikipedia.org/wiki/Test_Anything_Protocol) for test
+result output. If your test runner understands TAP, you may find it useful.
+
+[gtest-parallel](https://github.com/google/gtest-parallel) is a test runner that
+runs tests from your binary in parallel to provide significant speed-up.
+
+[GoogleTest Adapter](https://marketplace.visualstudio.com/items?itemName=DavidSchuldenfrei.gtest-adapter)
+is a VS Code extension that lets you view GoogleTest tests in a tree view and
+run/debug them.
+
+[C++ TestMate](https://github.com/matepek/vscode-catch2-test-adapter) is a VS
+Code extension that lets you view GoogleTest tests in a tree view and run/debug
+them.
+
+[Cornichon](https://pypi.org/project/cornichon/) is a small Gherkin DSL parser
+that generates stub code for GoogleTest.
+
+## Contributing Changes
+
+Please read
+[`CONTRIBUTING.md`](https://github.com/google/googletest/blob/master/CONTRIBUTING.md)
+for details on how to contribute to this project.
+
+Happy testing!
diff --git a/third_party/aom/third_party/googletest/src/googlemock/CMakeLists.txt b/third_party/aom/third_party/googletest/src/googlemock/CMakeLists.txt
new file mode 100644
index 0000000000..5c1f0dafea
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/CMakeLists.txt
@@ -0,0 +1,218 @@
+########################################################################
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+#
+# CMake build script for Google Mock.
+#
+# To run the tests for Google Mock itself on Linux, use 'make test' or
+# ctest. You can select which tests to run using 'ctest -R regex'.
+# For more options, run 'ctest --help'.
+
+option(gmock_build_tests "Build all of Google Mock's own tests." OFF)
+
+# Locate the Google Test sources: use a bundled ./gtest directory if present, otherwise the sibling ../googletest.
+if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/gtest/CMakeLists.txt")
+ set(gtest_dir gtest)
+else()
+ set(gtest_dir ../googletest)
+endif()
+
+# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build().
+include("${gtest_dir}/cmake/hermetic_build.cmake" OPTIONAL)  # OPTIONAL: no error when the file is absent
+
+if (COMMAND pre_project_set_up_hermetic_build)
+ # Google Test also calls hermetic setup functions from add_subdirectory,
+ # although its changes will not affect things at the current scope.
+ pre_project_set_up_hermetic_build()
+endif()
+
+########################################################################
+#
+# Project-wide settings
+
+# Name of the project.
+#
+# CMake files in this project can refer to the root source directory
+# as ${gmock_SOURCE_DIR} and to the root binary directory as
+# ${gmock_BINARY_DIR}.
+# Language "C" is required for find_package(Threads).
+cmake_minimum_required(VERSION 3.5)
+cmake_policy(SET CMP0048 NEW)  # needed for the VERSION argument of project() below
+project(gmock VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
+
+if (COMMAND set_up_hermetic_build)
+ set_up_hermetic_build()
+endif()
+
+# Instructs CMake to process Google Test's CMakeLists.txt and add its
+# targets to the current scope. We are placing Google Test's binary
+# directory in a subdirectory of our own as VC compilation may break
+# if they are the same (the default).
+add_subdirectory("${gtest_dir}" "${gmock_BINARY_DIR}/${gtest_dir}")
+
+
+# These commands only run if this is the main project
+if(CMAKE_PROJECT_NAME STREQUAL "gmock" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution")
+ # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
+ # make it prominent in the GUI.
+ option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
+else()
+ mark_as_advanced(gmock_build_tests)  # embedded in another project: mark the test option as advanced
+endif()
+
+# Although Google Test's CMakeLists.txt calls this function, the
+# changes there don't affect the current scope. Therefore we have to
+# call it again here.
+config_compiler_and_linker() # from ${gtest_dir}/cmake/internal_utils.cmake
+
+# Adds Google Mock's and Google Test's header directories to the search path.
+set(gmock_build_include_dirs
+ "${gmock_SOURCE_DIR}/include"
+ "${gmock_SOURCE_DIR}"
+ "${gtest_SOURCE_DIR}/include"
+ # This directory is needed to build directly from Google Test sources.
+ "${gtest_SOURCE_DIR}")
+include_directories(${gmock_build_include_dirs})
+
+########################################################################
+#
+# Defines the gmock & gmock_main libraries. User tests should link
+# with one of them.
+
+# Google Mock libraries. We build them using more strict warnings than what
+# are used for other targets, to ensure that Google Mock can be compiled by
+# a user aggressive about warnings.
+if (MSVC)  # MSVC: compile gtest-all.cc directly into each gmock library
+ cxx_library(gmock
+ "${cxx_strict}"
+ "${gtest_dir}/src/gtest-all.cc"
+ src/gmock-all.cc)
+
+ cxx_library(gmock_main
+ "${cxx_strict}"
+ "${gtest_dir}/src/gtest-all.cc"
+ src/gmock-all.cc
+ src/gmock_main.cc)
+else()  # other toolchains: link against the separately built gtest target
+ cxx_library(gmock "${cxx_strict}" src/gmock-all.cc)
+ target_link_libraries(gmock PUBLIC gtest)
+ set_target_properties(gmock PROPERTIES VERSION ${GOOGLETEST_VERSION})
+ cxx_library(gmock_main "${cxx_strict}" src/gmock_main.cc)
+ target_link_libraries(gmock_main PUBLIC gmock)
+ set_target_properties(gmock_main PROPERTIES VERSION ${GOOGLETEST_VERSION})
+endif()
+# If the CMake version supports it, attach header directory information
+# to the targets for when we are part of a parent build (ie being pulled
+# in via add_subdirectory() rather than being a standalone build).
+if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")  # always true given the 3.5 minimum required above
+ string(REPLACE ";" "$<SEMICOLON>" dirs "${gmock_build_include_dirs}")  # escape ';' so the list can sit inside a generator expression
+ target_include_directories(gmock SYSTEM INTERFACE
+ "$<BUILD_INTERFACE:${dirs}>"
+ "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+ target_include_directories(gmock_main SYSTEM INTERFACE
+ "$<BUILD_INTERFACE:${dirs}>"
+ "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+endif()
+
+########################################################################
+#
+# Install rules
+install_project(gmock gmock_main)
+
+########################################################################
+#
+# Google Mock's own tests.
+#
+# You can skip this section if you aren't interested in testing
+# Google Mock itself.
+#
+# The tests are not built by default. To build them, set the
+# gmock_build_tests option to ON. You can do it by running ccmake
+# or specifying the -Dgmock_build_tests=ON flag when running cmake.
+
+if (gmock_build_tests)
+  # This must be set in the root directory for the tests to be run by
+  # 'make test' or ctest.
+  enable_testing()
+
+  # Pass -mbig-obj to the assembler on MinGW and Cygwin. The flag is a compile
+  # option rather than a preprocessor definition, so it is added with
+  # add_compile_options(); that command (CMake >= 2.8.12) is always available
+  # because this file requires CMake 3.5.
+  if (MINGW OR CYGWIN)
+    add_compile_options("-Wa,-mbig-obj")
+  endif()
+
+  ############################################################
+  # C++ tests built with standard compiler flags.
+
+  cxx_test(gmock-actions_test gmock_main)
+  cxx_test(gmock-cardinalities_test gmock_main)
+  cxx_test(gmock_ex_test gmock_main)
+  cxx_test(gmock-function-mocker_test gmock_main)
+  cxx_test(gmock-internal-utils_test gmock_main)
+  cxx_test(gmock-matchers-arithmetic_test gmock_main)
+  cxx_test(gmock-matchers-comparisons_test gmock_main)
+  cxx_test(gmock-matchers-containers_test gmock_main)
+  cxx_test(gmock-matchers-misc_test gmock_main)
+  cxx_test(gmock-more-actions_test gmock_main)
+  cxx_test(gmock-nice-strict_test gmock_main)
+  cxx_test(gmock-port_test gmock_main)
+  cxx_test(gmock-spec-builders_test gmock_main)
+  cxx_test(gmock_link_test gmock_main test/gmock_link2_test.cc)
+  cxx_test(gmock_test gmock_main)
+
+  # The stress test is only built when GTEST_HAS_PTHREAD is defined.
+  if (DEFINED GTEST_HAS_PTHREAD)
+    cxx_test(gmock_stress_test gmock)
+  endif()
+
+  # gmock_all_test is commented to save time building and running tests.
+  # Uncomment if necessary.
+  # cxx_test(gmock_all_test gmock_main)
+
+  ############################################################
+  # C++ tests built with non-standard compiler flags.
+
+  # As with the main libraries, MSVC builds compile the gtest sources directly
+  # into the helper libraries while other toolchains link against gmock.
+  if (MSVC)
+    cxx_library(gmock_main_no_exception "${cxx_no_exception}"
+      "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+    cxx_library(gmock_main_no_rtti "${cxx_no_rtti}"
+      "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+  else()
+    cxx_library(gmock_main_no_exception "${cxx_no_exception}" src/gmock_main.cc)
+    target_link_libraries(gmock_main_no_exception PUBLIC gmock)
+
+    cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" src/gmock_main.cc)
+    target_link_libraries(gmock_main_no_rtti PUBLIC gmock)
+  endif()
+  cxx_test_with_flags(gmock-more-actions_no_exception_test "${cxx_no_exception}"
+    gmock_main_no_exception test/gmock-more-actions_test.cc)
+
+  cxx_test_with_flags(gmock_no_rtti_test "${cxx_no_rtti}"
+    gmock_main_no_rtti test/gmock-spec-builders_test.cc)
+
+  cxx_shared_library(shared_gmock_main "${cxx_default}"
+    "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc)
+
+  # Tests that a binary can be built with Google Mock as a shared library. On
+  # some system configurations, it may not be possible to run the binary without
+  # knowing more details about the system configurations. We do not try to run
+  # this binary. To get a more robust shared library coverage, configure with
+  # -DBUILD_SHARED_LIBS=ON.
+  cxx_executable_with_flags(shared_gmock_test_ "${cxx_default}"
+    shared_gmock_main test/gmock-spec-builders_test.cc)
+  set_target_properties(shared_gmock_test_
+    PROPERTIES
+    COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+
+  ############################################################
+  # Python tests.
+
+  cxx_executable(gmock_leak_test_ test gmock_main)
+  py_test(gmock_leak_test)
+
+  cxx_executable(gmock_output_test_ test gmock)
+  py_test(gmock_output_test)
+endif()
diff --git a/third_party/aom/third_party/googletest/src/googlemock/README.md b/third_party/aom/third_party/googletest/src/googlemock/README.md
new file mode 100644
index 0000000000..7da60655db
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/README.md
@@ -0,0 +1,40 @@
+# Googletest Mocking (gMock) Framework
+
+### Overview
+
+Google's framework for writing and using C++ mock classes. It can help you
+derive better designs of your system and write better tests.
+
+It is inspired by:
+
+* [jMock](http://www.jmock.org/)
+* [EasyMock](http://www.easymock.org/)
+* [Hamcrest](http://code.google.com/p/hamcrest/)
+
+It is designed with C++'s specifics in mind.
+
+gMock:
+
+- Provides a declarative syntax for defining mocks.
+- Can define partial (hybrid) mocks, which are a cross of real and mock
+ objects.
+- Handles functions of arbitrary types and overloaded functions.
+- Comes with a rich set of matchers for validating function arguments.
+- Uses an intuitive syntax for controlling the behavior of a mock.
+- Does automatic verification of expectations (no record-and-replay needed).
+- Allows arbitrary (partial) ordering constraints on function calls to be
+ expressed.
+- Lets a user extend it by defining new matchers and actions.
+- Does not use exceptions.
+- Is easy to learn and use.
+
+Details and examples can be found here:
+
+* [gMock for Dummies](https://google.github.io/googletest/gmock_for_dummies.html)
+* [Legacy gMock FAQ](https://google.github.io/googletest/gmock_faq.html)
+* [gMock Cookbook](https://google.github.io/googletest/gmock_cook_book.html)
+* [gMock Cheat Sheet](https://google.github.io/googletest/gmock_cheat_sheet.html)
+
+GoogleMock is a part of the
+[GoogleTest C++ testing framework](http://github.com/google/googletest/) and is
+subject to the same requirements.
diff --git a/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock.pc.in b/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock.pc.in
new file mode 100644
index 0000000000..23c67b5c88
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gmock
+Description: GoogleMock (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest = @PROJECT_VERSION@
+Libs: -L${libdir} -lgmock @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in b/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in
new file mode 100644
index 0000000000..66ffea7f44
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gmock_main
+Description: GoogleMock (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gmock = @PROJECT_VERSION@
+Libs: -L${libdir} -lgmock_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
new file mode 100644
index 0000000000..c785ad8abb
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h
@@ -0,0 +1,2298 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// The ACTION* family of macros can be used in a namespace scope to
+// define custom actions easily. The syntax:
+//
+// ACTION(name) { statements; }
+//
+// will define an action with the given name that executes the
+// statements. The value returned by the statements will be used as
+// the return value of the action. Inside the statements, you can
+// refer to the K-th (0-based) argument of the mock function by
+// 'argK', and refer to its type by 'argK_type'. For example:
+//
+// ACTION(IncrementArg1) {
+// arg1_type temp = arg1;
+// return ++(*temp);
+// }
+//
+// allows you to write
+//
+// ...WillOnce(IncrementArg1());
+//
+// You can also refer to the entire argument tuple and its type by
+// 'args' and 'args_type', and refer to the mock function type and its
+// return type by 'function_type' and 'return_type'.
+//
+// Note that you don't need to specify the types of the mock function
+// arguments. However rest assured that your code is still type-safe:
+// you'll get a compiler error if *arg1 doesn't support the ++
+// operator, or if the type of ++(*arg1) isn't compatible with the
+// mock function's return type, for example.
+//
+// Sometimes you'll want to parameterize the action. For that you can use
+// another macro:
+//
+// ACTION_P(name, param_name) { statements; }
+//
+// For example:
+//
+// ACTION_P(Add, n) { return arg0 + n; }
+//
+// will allow you to write:
+//
+// ...WillOnce(Add(5));
+//
+// Note that you don't need to provide the type of the parameter
+// either. If you need to reference the type of a parameter named
+// 'foo', you can write 'foo_type'. For example, in the body of
+// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type
+// of 'n'.
+//
+// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support
+// multi-parameter actions.
+//
+// For the purpose of typing, you can view
+//
+// ACTION_Pk(Foo, p1, ..., pk) { ... }
+//
+// as shorthand for
+//
+// template <typename p1_type, ..., typename pk_type>
+// FooActionPk<p1_type, ..., pk_type> Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// In particular, you can provide the template type arguments
+// explicitly when invoking Foo(), as in Foo<long, bool>(5, false);
+// although usually you can rely on the compiler to infer the types
+// for you automatically. You can assign the result of expression
+// Foo(p1, ..., pk) to a variable of type FooActionPk<p1_type, ...,
+// pk_type>. This can be useful when composing actions.
+//
+// You can also overload actions with different numbers of parameters:
+//
+// ACTION_P(Plus, a) { ... }
+// ACTION_P2(Plus, a, b) { ... }
+//
+// While it's tempting to always use the ACTION* macros when defining
+// a new action, you should also consider implementing ActionInterface
+// or using MakePolymorphicAction() instead, especially if you need to
+// use the action a lot. While these approaches require more work,
+// they give you more control on the types of the mock function
+// arguments and the action parameters, which in general leads to
+// better compiler error messages that pay off in the long run. They
+// also allow overloading actions based on parameter types (as opposed
+// to just based on the number of parameters).
+//
+// CAVEAT:
+//
+// ACTION*() can only be used in a namespace scope as templates cannot be
+// declared inside of a local class.
+// Users can, however, define any local functors (e.g. a lambda) that
+// can be used as actions.
+//
+// MORE INFORMATION:
+//
+// To learn more about using these macros, please search for 'ACTION' on
+// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
+
+#ifndef _WIN32_WCE
+#include <errno.h>
+#endif
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gmock/internal/gmock-pp.h"
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
+namespace testing {
+
+// To implement an action Foo, define:
+// 1. a class FooAction that implements the ActionInterface interface, and
+// 2. a factory function that creates an Action object from a
+// const FooAction*.
+//
+// The two-level delegation design follows that of Matcher, providing
+// consistency for extension developers. It also eases ownership
+// management as Action objects can now be copied like plain values.
+
+namespace internal {
+
+// BuiltInDefaultValueGetter<T, true>::Get() returns a
+// default-constructed T value. BuiltInDefaultValueGetter<T,
+// false>::Get() crashes with an error.
+//
+// This primary template is used when kDefaultConstructible is true.
+template <typename T, bool kDefaultConstructible>
+struct BuiltInDefaultValueGetter {
+ static T Get() { return T(); }
+};
+template <typename T>
+struct BuiltInDefaultValueGetter<T, false> {
+ static T Get() {
+ Assert(false, __FILE__, __LINE__,
+ "Default action undefined for the function return type.");
+ return internal::Invalid<T>();
+ // The above statement will never be reached, but is required in
+ // order for this function to compile.
+ }
+};
+
+// BuiltInDefaultValue<T>::Get() returns the "built-in" default value
+// for type T, which is NULL when T is a raw pointer type, 0 when T is
+// a numeric type, false when T is bool, or "" when T is string or
+// std::string. In addition, in C++11 and above, it returns a
+// default-constructed T value if T is default constructible. For any
+// other type T, the built-in default T value is undefined, and the
+// function will abort the process.
+template <typename T>
+class BuiltInDefaultValue {
+ public:
+ // This function returns true if and only if type T has a built-in default
+ // value.
+ static bool Exists() { return ::std::is_default_constructible<T>::value; }
+
+ static T Get() {
+ return BuiltInDefaultValueGetter<
+ T, ::std::is_default_constructible<T>::value>::Get();
+ }
+};
+
+// This partial specialization says that we use the same built-in
+// default value for T and const T.
+template <typename T>
+class BuiltInDefaultValue<const T> {
+ public:
+ static bool Exists() { return BuiltInDefaultValue<T>::Exists(); }
+ static T Get() { return BuiltInDefaultValue<T>::Get(); }
+};
+
+// This partial specialization defines the default values for pointer
+// types.
+template <typename T>
+class BuiltInDefaultValue<T*> {
+ public:
+ static bool Exists() { return true; }
+ static T* Get() { return nullptr; }
+};
+
+// The following specializations define the default values for
+// specific types we care about.
+#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \
+ template <> \
+ class BuiltInDefaultValue<type> { \
+ public: \
+ static bool Exists() { return true; } \
+ static type Get() { return value; } \
+ }
+
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, "");
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0');
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0');
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0');
+
+// There's no need for a default action for signed wchar_t, as that
+// type is the same as wchar_t for gcc, and invalid for MSVC.
+//
+// There's also no need for a default action for unsigned wchar_t, as
+// that type is the same as unsigned int for gcc, and invalid for
+// MSVC.
+#if GMOCK_WCHAR_T_IS_NATIVE_
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U); // NOLINT
+#endif
+
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0);
+GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0);
+
+#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_
+
+// Partial implementations of metaprogramming types from the standard library
+// not available in C++11.
+
+template <typename P>
+struct negation
+ // NOLINTNEXTLINE
+ : std::integral_constant<bool, bool(!P::value)> {};
+
+// Base case: with zero predicates the answer is always true.
+template <typename...>
+struct conjunction : std::true_type {};
+
+// With a single predicate, the answer is that predicate.
+template <typename P1>
+struct conjunction<P1> : P1 {};
+
+// With multiple predicates the answer is the first predicate if that is false,
+// and we recurse otherwise.
+template <typename P1, typename... Ps>
+struct conjunction<P1, Ps...>
+ : std::conditional<bool(P1::value), conjunction<Ps...>, P1>::type {};
+
+template <typename...>
+struct disjunction : std::false_type {};
+
+template <typename P1>
+struct disjunction<P1> : P1 {};
+
+template <typename P1, typename... Ps>
+struct disjunction<P1, Ps...>
+ // NOLINTNEXTLINE
+ : std::conditional<!bool(P1::value), disjunction<Ps...>, P1>::type {};
+
+template <typename...>
+using void_t = void;
+
+// Detects whether an expression of type `From` can be implicitly converted to
+// `To` according to [conv]. In C++17, [conv]/3 defines this as follows:
+//
+// An expression e can be implicitly converted to a type T if and only if
+// the declaration T t=e; is well-formed, for some invented temporary
+// variable t ([dcl.init]).
+//
+// [conv]/2 implies we can use function argument passing to detect whether this
+// initialization is valid.
+//
+// Note that this is distinct from is_convertible, which requires this be valid:
+//
+// To test() {
+// return declval<From>();
+// }
+//
+// In particular, is_convertible doesn't give the correct answer when `To` and
+// `From` are the same non-moveable type since `declval<From>` will be an rvalue
+// reference, defeating the guaranteed copy elision that would otherwise make
+// this function work.
+//
+// REQUIRES: `From` is not cv void.
+template <typename From, typename To>
+struct is_implicitly_convertible {
+ private:
+ // A function that accepts a parameter of type T. This can be called with type
+ // U successfully only if U is implicitly convertible to T.
+ template <typename T>
+ static void Accept(T);
+
+ // A function that creates a value of type T.
+ template <typename T>
+ static T Make();
+
+ // An overload selected when implicit conversion from T to To is possible.
+ template <typename T, typename = decltype(Accept<To>(Make<T>()))>
+ static std::true_type TestImplicitConversion(int);
+
+ // A fallback overload selected in all other cases.
+ template <typename T>
+ static std::false_type TestImplicitConversion(...);
+
+ public:
+ using type = decltype(TestImplicitConversion<From>(0));
+ static constexpr bool value = type::value;
+};
+
+// Like std::invoke_result_t from C++17, but works only for objects with call
+// operators (not e.g. member function pointers, which we don't need specific
+// support for in OnceAction because std::function deals with them).
+template <typename F, typename... Args>
+using call_result_t = decltype(std::declval<F>()(std::declval<Args>()...));
+
+template <typename Void, typename R, typename F, typename... Args>
+struct is_callable_r_impl : std::false_type {};
+
+// Specialize the struct for those template arguments where call_result_t is
+// well-formed. When it's not, the generic template above is chosen, resulting
+// in std::false_type.
+template <typename R, typename F, typename... Args>
+struct is_callable_r_impl<void_t<call_result_t<F, Args...>>, R, F, Args...>
+ : std::conditional<
+ std::is_void<R>::value, //
+ std::true_type, //
+ is_implicitly_convertible<call_result_t<F, Args...>, R>>::type {};
+
+// Like std::is_invocable_r from C++17, but works only for objects with call
+// operators. See the note on call_result_t.
+template <typename R, typename F, typename... Args>
+using is_callable_r = is_callable_r_impl<void, R, F, Args...>;
+
+// Like std::as_const from C++17.
+template <typename T>
+typename std::add_const<T>::type& as_const(T& t) {
+ return t;
+}
+
+} // namespace internal
+
+// Specialized for function types below.
+template <typename F>
+class OnceAction;
+
+// An action that can only be used once.
+//
+// This is accepted by WillOnce, which doesn't require the underlying action to
+// be copy-constructible (only move-constructible), and promises to invoke it as
+// an rvalue reference. This allows the action to work with move-only types like
+// std::move_only_function in a type-safe manner.
+//
+// For example:
+//
+// // Assume we have some API that needs to accept a unique pointer to some
+// // non-copyable object Foo.
+// void AcceptUniquePointer(std::unique_ptr<Foo> foo);
+//
+// // We can define an action that provides a Foo to that API. Because it
+// // has to give away its unique pointer, it must not be called more than
+// // once, so its call operator is &&-qualified.
+// struct ProvideFoo {
+// std::unique_ptr<Foo> foo;
+//
+// void operator()() && {
+// AcceptUniquePointer(std::move(foo));
+// }
+// };
+//
+// // This action can be used with WillOnce.
+// EXPECT_CALL(mock, Call)
+// .WillOnce(ProvideFoo{std::make_unique<Foo>(...)});
+//
+// // But a call to WillRepeatedly will fail to compile. This is correct,
+// // since the action cannot correctly be used repeatedly.
+// EXPECT_CALL(mock, Call)
+// .WillRepeatedly(ProvideFoo{std::make_unique<Foo>(...)});
+//
+// A less-contrived example would be an action that returns an arbitrary type,
+// whose &&-qualified call operator is capable of dealing with move-only types.
+template <typename Result, typename... Args>
+class OnceAction<Result(Args...)> final {
+ private:
+ // True iff we can use the given callable type (or lvalue reference) directly
+ // via StdFunctionAdaptor.
+ template <typename Callable>
+ using IsDirectlyCompatible = internal::conjunction<
+ // It must be possible to capture the callable in StdFunctionAdaptor.
+ std::is_constructible<typename std::decay<Callable>::type, Callable>,
+ // The callable must be compatible with our signature.
+ internal::is_callable_r<Result, typename std::decay<Callable>::type,
+ Args...>>;
+
+ // True iff we can use the given callable type via StdFunctionAdaptor once we
+ // ignore incoming arguments.
+ template <typename Callable>
+ using IsCompatibleAfterIgnoringArguments = internal::conjunction<
+ // It must be possible to capture the callable in a lambda.
+ std::is_constructible<typename std::decay<Callable>::type, Callable>,
+ // The callable must be invocable with zero arguments, returning something
+ // convertible to Result.
+ internal::is_callable_r<Result, typename std::decay<Callable>::type>>;
+
+ public:
+ // Construct from a callable that is directly compatible with our mocked
+ // signature: it accepts our function type's arguments and returns something
+ // convertible to our result type.
+ template <typename Callable,
+ typename std::enable_if<
+ internal::conjunction<
+ // Teach clang on macOS that we're not talking about a
+ // copy/move constructor here. Otherwise it gets confused
+ // when checking the is_constructible requirement of our
+ // traits above.
+ internal::negation<std::is_same<
+ OnceAction, typename std::decay<Callable>::type>>,
+ IsDirectlyCompatible<Callable>> //
+ ::value,
+ int>::type = 0>
+ OnceAction(Callable&& callable) // NOLINT
+ : function_(StdFunctionAdaptor<typename std::decay<Callable>::type>(
+ {}, std::forward<Callable>(callable))) {}
+
+ // As above, but for a callable that ignores the mocked function's arguments.
+ template <typename Callable,
+ typename std::enable_if<
+ internal::conjunction<
+ // Teach clang on macOS that we're not talking about a
+ // copy/move constructor here. Otherwise it gets confused
+ // when checking the is_constructible requirement of our
+ // traits above.
+ internal::negation<std::is_same<
+ OnceAction, typename std::decay<Callable>::type>>,
+ // Exclude callables for which the overload above works.
+ // We'd rather provide the arguments if possible.
+ internal::negation<IsDirectlyCompatible<Callable>>,
+ IsCompatibleAfterIgnoringArguments<Callable>>::value,
+ int>::type = 0>
+ OnceAction(Callable&& callable) // NOLINT
+ // Call the constructor above with a callable
+ // that ignores the input arguments.
+ : OnceAction(IgnoreIncomingArguments<typename std::decay<Callable>::type>{
+ std::forward<Callable>(callable)}) {}
+
+ // We are naturally copyable because we store only an std::function, but
+ // semantically we should not be copyable.
+ OnceAction(const OnceAction&) = delete;
+ OnceAction& operator=(const OnceAction&) = delete;
+ OnceAction(OnceAction&&) = default;
+
+ // Invoke the underlying action callable with which we were constructed,
+ // handing it the supplied arguments.
+ Result Call(Args... args) && {
+ return function_(std::forward<Args>(args)...);
+ }
+
+ private:
+ // An adaptor that wraps a callable that is compatible with our signature and
+ // being invoked as an rvalue reference so that it can be used as an
+ // StdFunctionAdaptor. This throws away type safety, but that's fine because
+ // this is only used by WillOnce, which we know calls at most once.
+ //
+ // Once we have something like std::move_only_function from C++23, we can do
+ // away with this.
+ template <typename Callable>
+ class StdFunctionAdaptor final {
+ public:
+ // A tag indicating that the (otherwise universal) constructor is accepting
+ // the callable itself, instead of e.g. stealing calls for the move
+ // constructor.
+ struct CallableTag final {};
+
+ template <typename F>
+ explicit StdFunctionAdaptor(CallableTag, F&& callable)
+ : callable_(std::make_shared<Callable>(std::forward<F>(callable))) {}
+
+ // Rather than explicitly returning Result, we return whatever the wrapped
+ // callable returns. This allows for compatibility with existing uses like
+ // the following, when the mocked function returns void:
+ //
+ // EXPECT_CALL(mock_fn_, Call)
+ // .WillOnce([&] {
+ // [...]
+ // return 0;
+ // });
+ //
+ // Such a callable can be turned into std::function<void()>. If we use an
+ // explicit return type of Result here then it *doesn't* work with
+ // std::function, because we'll get a "void function should not return a
+ // value" error.
+ //
+ // We need not worry about incompatible result types because the SFINAE on
+ // OnceAction already checks this for us. std::is_invocable_r_v itself makes
+ // the same allowance for void result types.
+ template <typename... ArgRefs>
+ internal::call_result_t<Callable, ArgRefs...> operator()(
+ ArgRefs&&... args) const {
+ return std::move(*callable_)(std::forward<ArgRefs>(args)...);
+ }
+
+ private:
+ // We must put the callable on the heap so that we are copyable, which
+ // std::function needs.
+ std::shared_ptr<Callable> callable_;
+ };
+
+ // An adaptor that makes a callable that accepts zero arguments callable with
+ // our mocked arguments.
+ template <typename Callable>
+ struct IgnoreIncomingArguments {
+ internal::call_result_t<Callable> operator()(Args&&...) {
+ return std::move(callable)();
+ }
+
+ Callable callable;
+ };
+
+ std::function<Result(Args...)> function_;
+};
+
+// When an unexpected function call is encountered, Google Mock will
+// let it return a default value if the user has specified one for its
+// return type, or if the return type has a built-in default value;
+// otherwise Google Mock won't know what value to return and will have
+// to abort the process.
+//
+// The DefaultValue<T> class allows a user to specify the
+// default value for a type T that is both copyable and publicly
+// destructible (i.e. anything that can be used as a function return
+// type). The usage is:
+//
+// // Sets the default value for type T to be foo.
+// DefaultValue<T>::Set(foo);
+template <typename T>
+class DefaultValue {
+ public:
+ // Sets the default value for type T; requires T to be
+ // copy-constructible and have a public destructor.
+ static void Set(T x) {
+ delete producer_;
+ producer_ = new FixedValueProducer(x);
+ }
+
+ // Provides a factory function to be called to generate the default value.
+ // This method can be used even if T is only move-constructible, but it is not
+ // limited to that case.
+ typedef T (*FactoryFunction)();
+ static void SetFactory(FactoryFunction factory) {
+ delete producer_;
+ producer_ = new FactoryValueProducer(factory);
+ }
+
+ // Unsets the default value for type T.
+ static void Clear() {
+ delete producer_;
+ producer_ = nullptr;
+ }
+
+ // Returns true if and only if the user has set the default value for type T.
+ static bool IsSet() { return producer_ != nullptr; }
+
+ // Returns true if T has a default return value set by the user or there
+ // exists a built-in default value.
+ static bool Exists() {
+ return IsSet() || internal::BuiltInDefaultValue<T>::Exists();
+ }
+
+ // Returns the default value for type T if the user has set one;
+ // otherwise returns the built-in default value. Requires that Exists()
+ // is true, which ensures that the return value is well-defined.
+ static T Get() {
+ return producer_ == nullptr ? internal::BuiltInDefaultValue<T>::Get()
+ : producer_->Produce();
+ }
+
+ private:
+ class ValueProducer {
+ public:
+ virtual ~ValueProducer() {}
+ virtual T Produce() = 0;
+ };
+
+ class FixedValueProducer : public ValueProducer {
+ public:
+ explicit FixedValueProducer(T value) : value_(value) {}
+ T Produce() override { return value_; }
+
+ private:
+ const T value_;
+ FixedValueProducer(const FixedValueProducer&) = delete;
+ FixedValueProducer& operator=(const FixedValueProducer&) = delete;
+ };
+
+ class FactoryValueProducer : public ValueProducer {
+ public:
+ explicit FactoryValueProducer(FactoryFunction factory)
+ : factory_(factory) {}
+ T Produce() override { return factory_(); }
+
+ private:
+ const FactoryFunction factory_;
+ FactoryValueProducer(const FactoryValueProducer&) = delete;
+ FactoryValueProducer& operator=(const FactoryValueProducer&) = delete;
+ };
+
+ static ValueProducer* producer_;
+};
+
+// This partial specialization allows a user to set default values for
+// reference types.
+template <typename T>
+class DefaultValue<T&> {
+ public:
+ // Sets the default value for type T&.
+ static void Set(T& x) { // NOLINT
+ address_ = &x;
+ }
+
+ // Unsets the default value for type T&.
+ static void Clear() { address_ = nullptr; }
+
+ // Returns true if and only if the user has set the default value for type T&.
+ static bool IsSet() { return address_ != nullptr; }
+
+ // Returns true if T has a default return value set by the user or there
+ // exists a built-in default value.
+ static bool Exists() {
+ return IsSet() || internal::BuiltInDefaultValue<T&>::Exists();
+ }
+
+ // Returns the default value for type T& if the user has set one;
+ // otherwise returns the built-in default value if there is one;
+ // otherwise aborts the process.
+ static T& Get() {
+ return address_ == nullptr ? internal::BuiltInDefaultValue<T&>::Get()
+ : *address_;
+ }
+
+ private:
+ static T* address_;
+};
+
+// This specialization allows DefaultValue<void>::Get() to
+// compile.
+template <>
+class DefaultValue<void> {
+ public:
+ static bool Exists() { return true; }
+ static void Get() {}
+};
+
+// Points to the user-set default value for type T.
+template <typename T>
+typename DefaultValue<T>::ValueProducer* DefaultValue<T>::producer_ = nullptr;
+
+// Points to the user-set default value for type T&.
+template <typename T>
+T* DefaultValue<T&>::address_ = nullptr;
+
+// Implement this interface to define an action for function type F.
+template <typename F>
+class ActionInterface {
+ public:
+ typedef typename internal::Function<F>::Result Result;
+ typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+ ActionInterface() {}
+ virtual ~ActionInterface() {}
+
+ // Performs the action. This method is not const, as in general an
+ // action can have side effects and be stateful. For example, a
+ // get-the-next-element-from-the-collection action will need to
+ // remember the current element.
+ virtual Result Perform(const ArgumentTuple& args) = 0;
+
+ private:
+ ActionInterface(const ActionInterface&) = delete;
+ ActionInterface& operator=(const ActionInterface&) = delete;
+};
+
+template <typename F>
+class Action;
+
+// An Action<R(Args...)> is a copyable and IMMUTABLE (except by assignment)
+// object that represents an action to be taken when a mock function of type
+// R(Args...) is called. The implementation of Action<T> is just a
+// std::shared_ptr to const ActionInterface<T>. Don't inherit from Action! You
+// can view an object implementing ActionInterface<F> as a concrete action
+// (including its current state), and an Action<F> object as a handle to it.
+template <typename R, typename... Args>
+class Action<R(Args...)> {
+ private:
+ using F = R(Args...);
+
+ // Adapter class to allow constructing Action from a legacy ActionInterface.
+ // New code should create Actions from functors instead.
+ struct ActionAdapter {
+ // Adapter must be copyable to satisfy std::function requirements.
+ ::std::shared_ptr<ActionInterface<F>> impl_;
+
+ template <typename... InArgs>
+ typename internal::Function<F>::Result operator()(InArgs&&... args) {
+ return impl_->Perform(
+ ::std::forward_as_tuple(::std::forward<InArgs>(args)...));
+ }
+ };
+
+ template <typename G>
+ using IsCompatibleFunctor = std::is_constructible<std::function<F>, G>;
+
+ public:
+ typedef typename internal::Function<F>::Result Result;
+ typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+ // Constructs a null Action. Needed for storing Action objects in
+ // STL containers.
+ Action() {}
+
+ // Construct an Action from a specified callable.
+ // This cannot take std::function directly, because then Action would not be
+ // directly constructible from lambda (it would require two conversions).
+ template <
+ typename G,
+ typename = typename std::enable_if<internal::disjunction<
+ IsCompatibleFunctor<G>, std::is_constructible<std::function<Result()>,
+ G>>::value>::type>
+ Action(G&& fun) { // NOLINT
+ Init(::std::forward<G>(fun), IsCompatibleFunctor<G>());
+ }
+
+ // Constructs an Action from its implementation.
+ explicit Action(ActionInterface<F>* impl)
+ : fun_(ActionAdapter{::std::shared_ptr<ActionInterface<F>>(impl)}) {}
+
+ // This constructor allows us to turn an Action<Func> object into an
+ // Action<F>, as long as F's arguments can be implicitly converted
+ // to Func's and Func's return type can be implicitly converted to F's.
+ template <typename Func>
+ Action(const Action<Func>& action) // NOLINT
+ : fun_(action.fun_) {}
+
+ // Returns true if and only if this is the DoDefault() action.
+ bool IsDoDefault() const { return fun_ == nullptr; }
+
+ // Performs the action. Note that this method is const even though
+ // the corresponding method in ActionInterface is not. The reason
+ // is that a const Action<F> means that it cannot be re-bound to
+ // another concrete action, not that the concrete action it binds to
+ // cannot change state. (Think of the difference between a const
+ // pointer and a pointer to const.)
+ Result Perform(ArgumentTuple args) const {
+ if (IsDoDefault()) {
+ internal::IllegalDoDefault(__FILE__, __LINE__);
+ }
+ return internal::Apply(fun_, ::std::move(args));
+ }
+
+ // An action can be used as a OnceAction, since it's obviously safe to call it
+ // once.
+ operator OnceAction<F>() const { // NOLINT
+ // Return a OnceAction-compatible callable that calls Perform with the
+ // arguments it is provided. We could instead just return fun_, but then
+ // we'd need to handle the IsDoDefault() case separately.
+ struct OA {
+ Action<F> action;
+
+ R operator()(Args... args) && {
+ return action.Perform(
+ std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+ };
+
+ return OA{*this};
+ }
+
+ private:
+ template <typename G>
+ friend class Action;
+
+ template <typename G>
+ void Init(G&& g, ::std::true_type) {
+ fun_ = ::std::forward<G>(g);
+ }
+
+ template <typename G>
+ void Init(G&& g, ::std::false_type) {
+ fun_ = IgnoreArgs<typename ::std::decay<G>::type>{::std::forward<G>(g)};
+ }
+
+ template <typename FunctionImpl>
+ struct IgnoreArgs {
+ template <typename... InArgs>
+ Result operator()(const InArgs&...) const {
+ return function_impl();
+ }
+
+ FunctionImpl function_impl;
+ };
+
+ // fun_ is an empty function if and only if this is the DoDefault() action.
+ ::std::function<F> fun_;
+};
+
+// The PolymorphicAction class template makes it easy to implement a
+// polymorphic action (i.e. an action that can be used in mock
+// functions of more than one type, e.g. Return()).
+//
+// To define a polymorphic action, a user first provides a COPYABLE
+// implementation class that has a Perform() method template:
+//
+// class FooAction {
+// public:
+// template <typename Result, typename ArgumentTuple>
+// Result Perform(const ArgumentTuple& args) const {
+// // Processes the arguments and returns a result, using
+// // std::get<N>(args) to get the N-th (0-based) argument in the tuple.
+// }
+// ...
+// };
+//
+// Then the user creates the polymorphic action using
+// MakePolymorphicAction(object) where object has type FooAction. See
+// the definition of Return(void) and SetArgumentPointee<N>(value) for
+// complete examples.
+template <typename Impl>
+class PolymorphicAction {
+ public:
+  // Stores a copy of the user-supplied implementation object.
+  explicit PolymorphicAction(const Impl& impl) : impl_(impl) {}
+
+  // Converts to a monomorphic Action<F> for any function type F. Every
+  // conversion creates a new adapter that holds its own copy of impl_.
+  template <typename F>
+  operator Action<F>() const {
+    return Action<F>(new MonomorphicImpl<F>(impl_));
+  }
+
+ private:
+  // Adapter that binds the polymorphic implementation to one function type F.
+  template <typename F>
+  class MonomorphicImpl : public ActionInterface<F> {
+   public:
+    using Result = typename internal::Function<F>::Result;
+    using ArgumentTuple = typename internal::Function<F>::ArgumentTuple;
+
+    explicit MonomorphicImpl(const Impl& source) : impl_(source) {}
+
+    // Forwards to Impl::Perform, pinning the result type to that of F.
+    Result Perform(const ArgumentTuple& call_args) override {
+      return impl_.template Perform<Result>(call_args);
+    }
+
+   private:
+    Impl impl_;
+  };
+
+  Impl impl_;
+};
+
+// Creates an Action<F> from a pointer to its ActionInterface<F>
+// implementation and returns it. The created Action object owns the
+// implementation.
+template <typename F>
+Action<F> MakeAction(ActionInterface<F>* impl) {
+ return Action<F>(impl);
+}
+
+// Creates a polymorphic action from its implementation. This is
+// easier to use than the PolymorphicAction<Impl> constructor as it
+// doesn't require you to explicitly write the template argument, e.g.
+//
+// MakePolymorphicAction(foo);
+// vs
+// PolymorphicAction<TypeOfFoo>(foo);
+//
+// The returned PolymorphicAction stores a copy of impl.
+template <typename Impl>
+inline PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl) {
+ return PolymorphicAction<Impl>(impl);
+}
+
+namespace internal {
+
+// Helper struct to specialize ReturnAction to execute a move instead of a copy
+// on return. Useful for move-only types, but could be used on any type.
+template <typename T>
+struct ByMoveWrapper {
+ // Takes the value by value and moves it into payload.
+ explicit ByMoveWrapper(T value) : payload(std::move(value)) {}
+ // The wrapped value; ReturnAction<ByMoveWrapper<T>> moves out of it.
+ T payload;
+};
+
+// The general implementation of Return(R). Specializations follow below.
+//
+// Holds the R value given to Return() and converts to an action for a mock
+// function with result type U whenever the constraints on the conversion
+// operators below are met.
+template <typename R>
+class ReturnAction final {
+ public:
+ explicit ReturnAction(R value) : value_(std::move(value)) {}
+
+ // Conversion available on rvalues only: value_ is moved into the resulting
+ // single-use action.
+ template <typename U, typename... Args,
+ typename = typename std::enable_if<conjunction<
+ // See the requirements documented on Return.
+ negation<std::is_same<void, U>>, //
+ negation<std::is_reference<U>>, //
+ std::is_convertible<R, U>, //
+ std::is_move_constructible<U>>::value>::type>
+ operator OnceAction<U(Args...)>() && { // NOLINT
+ return Impl<U>(std::move(value_));
+ }
+
+ // Conversion available on const objects: value_ is copied into the resulting
+ // action, so this ReturnAction can be converted again later.
+ template <typename U, typename... Args,
+ typename = typename std::enable_if<conjunction<
+ // See the requirements documented on Return.
+ negation<std::is_same<void, U>>, //
+ negation<std::is_reference<U>>, //
+ std::is_convertible<const R&, U>, //
+ std::is_copy_constructible<U>>::value>::type>
+ operator Action<U(Args...)>() const { // NOLINT
+ return Impl<U>(value_);
+ }
+
+ private:
+ // Implements the Return(x) action for a mock function that returns type U.
+ template <typename U>
+ class Impl final {
+ public:
+ // The constructor used when the return value is allowed to move from the
+ // input value (i.e. we are converting to OnceAction).
+ explicit Impl(R&& input_value)
+ : state_(new State(std::move(input_value))) {}
+
+ // The constructor used when the return value is not allowed to move from
+ // the input value (i.e. we are converting to Action).
+ explicit Impl(const R& input_value) : state_(new State(input_value)) {}
+
+ // Rvalue call: moves the stored U out. Const-lvalue call: returns a copy.
+ U operator()() && { return std::move(state_->value); }
+ U operator()() const& { return state_->value; }
+
+ private:
+ // We put our state on the heap so that the compiler-generated copy/move
+ // constructors work correctly even when U is a reference-like type. This is
+ // necessary only because we eagerly create State::value (see the note on
+ // that symbol for details). If we instead had only the input value as a
+ // member then the default constructors would work fine.
+ //
+ // For example, when R is std::string and U is std::string_view, value is a
+ // reference to the string backed by input_value. The copy constructor would
+ // copy both, so that we wind up with a new input_value object (with the
+ // same contents) and a reference to the *old* input_value object rather
+ // than the new one.
+ struct State {
+ explicit State(const R& input_value_in)
+ : input_value(input_value_in),
+ // Make an implicit conversion to U before initializing the U
+ // object we store, avoiding calling any explicit constructor of U
+ // from R.
+ //
+ // This simulates the language rules: a function with return type U
+ // that does `return R()` requires R to be implicitly convertible to
+ // U, and uses that path for the conversion, even if U has an
+ // explicit constructor from R.
+ value(ImplicitCast_<U>(internal::as_const(input_value))) {}
+
+ // As above, but for the case where we're moving from the ReturnAction
+ // object because it's being used as a OnceAction.
+ explicit State(R&& input_value_in)
+ : input_value(std::move(input_value_in)),
+ // For the same reason as above we make an implicit conversion to U
+ // before initializing the value.
+ //
+ // Unlike above we provide the input value as an rvalue to the
+ // implicit conversion because this is a OnceAction: it's fine if it
+ // wants to consume the input value.
+ value(ImplicitCast_<U>(std::move(input_value))) {}
+
+ // A copy of the value originally provided by the user. We retain this in
+ // addition to the value of the mock function's result type below in case
+ // the latter is a reference-like type. See the std::string_view example
+ // in the documentation on Return.
+ R input_value;
+
+ // The value we actually return, as the type returned by the mock function
+ // itself.
+ //
+ // We eagerly initialize this here, rather than lazily doing the implicit
+ // conversion automatically each time Perform is called, for historical
+ // reasons: in 2009-11, commit a070cbd91c (Google changelist 13540126)
+ // made the Action<U()> conversion operator eagerly convert the R value to
+ // U, but without keeping the R alive. This broke the use case discussed
+ // in the documentation for Return, making reference-like types such as
+ // std::string_view not safe to use as U where the input type R is a
+ // value-like type such as std::string.
+ //
+ // The example the commit gave was not very clear, nor was the issue
+ // thread (https://github.com/google/googlemock/issues/86), but it seems
+ // the worry was about reference-like input types R that flatten to a
+ // value-like type U when being implicitly converted. An example of this
+ // is std::vector<bool>::reference, which is often a proxy type with a
+ // reference to the underlying vector:
+ //
+ // // Helper method: have the mock function return bools according
+ // // to the supplied script.
+ // void SetActions(MockFunction<bool(size_t)>& mock,
+ // const std::vector<bool>& script) {
+ // for (size_t i = 0; i < script.size(); ++i) {
+ // EXPECT_CALL(mock, Call(i)).WillOnce(Return(script[i]));
+ // }
+ // }
+ //
+ // TEST(Foo, Bar) {
+ // // Set actions using a temporary vector, whose operator[]
+ // // returns proxy objects holding references that will be
+ // // dangling once the call to SetActions finishes and the
+ // // vector is destroyed.
+ // MockFunction<bool(size_t)> mock;
+ // SetActions(mock, {false, true});
+ //
+ // EXPECT_FALSE(mock.AsStdFunction()(0));
+ // EXPECT_TRUE(mock.AsStdFunction()(1));
+ // }
+ //
+ // This eager conversion helps with a simple case like this, but doesn't
+ // fully make these types work in general. For example the following still
+ // uses a dangling reference:
+ //
+ // TEST(Foo, Baz) {
+ // MockFunction<std::vector<std::string>()> mock;
+ //
+ // // Return the same vector twice, and then the empty vector
+ // // thereafter.
+ // auto action = Return(std::initializer_list<std::string>{
+ // "taco", "burrito",
+ // });
+ //
+ // EXPECT_CALL(mock, Call)
+ // .WillOnce(action)
+ // .WillOnce(action)
+ // .WillRepeatedly(Return(std::vector<std::string>{}));
+ //
+ // EXPECT_THAT(mock.AsStdFunction()(),
+ // ElementsAre("taco", "burrito"));
+ // EXPECT_THAT(mock.AsStdFunction()(),
+ // ElementsAre("taco", "burrito"));
+ // EXPECT_THAT(mock.AsStdFunction()(), IsEmpty());
+ // }
+ //
+ U value;
+ };
+
+ // Shared between all copies of this Impl (see the comment above State).
+ const std::shared_ptr<State> state_;
+ };
+
+ // The value given to Return(); the source for every conversion above.
+ R value_;
+};
+
+// A specialization of ReturnAction<R> when R is ByMoveWrapper<T> for some T.
+//
+// This version applies the type system-defeating hack of moving from T even in
+// the const call operator, checking at runtime that it isn't called more than
+// once, since the user has declared their intent to do so by using ByMove.
+template <typename T>
+class ReturnAction<ByMoveWrapper<T>> final {
+ public:
+ // Moves the wrapped payload into heap-allocated shared state.
+ explicit ReturnAction(ByMoveWrapper<T> wrapper)
+ : state_(new State(std::move(wrapper.payload))) {}
+
+ // Returns the stored value by moving it out. The `called` flag lives in the
+ // shared state, so the at-most-once check covers all copies of this action.
+ T operator()() const {
+ GTEST_CHECK_(!state_->called)
+ << "A ByMove() action must be performed at most once.";
+
+ state_->called = true;
+ return std::move(state_->value);
+ }
+
+ private:
+ // We store our state on the heap so that we are copyable as required by
+ // Action, despite the fact that we are stateful and T may not be copyable.
+ struct State {
+ explicit State(T&& value_in) : value(std::move(value_in)) {}
+
+ // The value to hand out; moved-from after the first call.
+ T value;
+ // Set to true by the first call to operator().
+ bool called = false;
+ };
+
+ const std::shared_ptr<State> state_;
+};
+
+// Implements the ReturnNull() action.
+class ReturnNullAction {
+ public:
+ // Allows ReturnNull() to be used in any pointer-returning function. The body
+ // returns nullptr, so instantiating Perform with a Result type that cannot
+ // be initialized from nullptr fails at compile time.
+ template <typename Result, typename ArgumentTuple>
+ static Result Perform(const ArgumentTuple&) {
+ return nullptr;
+ }
+};
+
+// Implements the Return() action.
+class ReturnVoidAction {
+ public:
+ // Allows Return() to be used in any void-returning function. The arguments
+ // are ignored; the only effect is the compile-time check that Result is void.
+ template <typename Result, typename ArgumentTuple>
+ static void Perform(const ArgumentTuple&) {
+ static_assert(std::is_void<Result>::value, "Result should be void.");
+ }
+};
+
+// Implements the polymorphic ReturnRef(x) action, which can be used
+// in any function that returns a reference to the type of x,
+// regardless of the argument types.
+template <typename T>
+class ReturnRefAction {
+ public:
+  // Binds the action to the object whose reference will be returned. Only
+  // the reference is stored; no copy of the object is made.
+  explicit ReturnRefAction(T& ref) : ref_(ref) {}  // NOLINT
+
+  // Lets ReturnRef(x) serve as the action of ANY mock function whose result
+  // type is a reference to x's type.
+  template <typename F>
+  operator Action<F>() const {
+    using Result = typename Function<F>::Result;
+    // Reject non-reference result types at compile time: this catches the
+    // user error of writing ReturnRef(x) where Return(x) was intended.
+    static_assert(std::is_reference<Result>::value,
+                  "use Return instead of ReturnRef to return a value");
+    return Action<F>(new Impl<F>(ref_));
+  }
+
+ private:
+  // The ReturnRef(x) action specialized for one function type F.
+  template <typename F>
+  class Impl : public ActionInterface<F> {
+   public:
+    using Result = typename Function<F>::Result;
+    using ArgumentTuple = typename Function<F>::ArgumentTuple;
+
+    explicit Impl(T& target) : ref_(target) {}  // NOLINT
+
+    // Ignores the call arguments and returns the bound reference.
+    Result Perform(const ArgumentTuple&) override { return ref_; }
+
+   private:
+    T& ref_;
+  };
+
+  T& ref_;
+};
+
+// Implements the polymorphic ReturnRefOfCopy(x) action, which can be
+// used in any function that returns a reference to the type of x,
+// regardless of the argument types.
+template <typename T>
+class ReturnRefOfCopyAction {
+ public:
+ // Constructs a ReturnRefOfCopyAction object that stores its own copy of
+ // value.
+ explicit ReturnRefOfCopyAction(const T& value) : value_(value) {} // NOLINT
+
+ // This template type conversion operator allows ReturnRefOfCopy(x) to be
+ // used in ANY function that returns a reference to x's type.
+ template <typename F>
+ operator Action<F>() const {
+ typedef typename Function<F>::Result Result;
+ // Asserts that the function return type is a reference. This
+ // catches the user error of using ReturnRefOfCopy(x) when Return(x)
+ // should be used, and generates some helpful error message.
+ static_assert(std::is_reference<Result>::value,
+ "use Return instead of ReturnRefOfCopy to return a value");
+ return Action<F>(new Impl<F>(value_));
+ }
+
+ private:
+ // Implements the ReturnRefOfCopy(x) action for a particular function type F.
+ template <typename F>
+ class Impl : public ActionInterface<F> {
+ public:
+ typedef typename Function<F>::Result Result;
+ typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+
+ // Makes a further copy, so each converted action holds its own value.
+ explicit Impl(const T& value) : value_(value) {} // NOLINT
+
+ // Ignores the arguments and returns a reference to this Impl's copy.
+ Result Perform(const ArgumentTuple&) override { return value_; }
+
+ private:
+ T value_;
+ };
+
+ const T value_;
+};
+
+// Implements the polymorphic ReturnRoundRobin(v) action, which can be
+// used in any function that returns the element_type of v.
+template <typename T>
+class ReturnRoundRobinAction {
+ public:
+ // Checks that values is non-empty, then moves it into the shared state.
+ explicit ReturnRoundRobinAction(std::vector<T> values) {
+ GTEST_CHECK_(!values.empty())
+ << "ReturnRoundRobin requires at least one element.";
+ state_->values = std::move(values);
+ }
+
+ // Ignores the mock function arguments and returns the next value in the
+ // cycle.
+ template <typename... Args>
+ T operator()(Args&&...) const {
+ return state_->Next();
+ }
+
+ private:
+ struct State {
+ // Returns a copy of values[i], then advances i, wrapping back to 0 after
+ // the last element.
+ T Next() {
+ T ret_val = values[i++];
+ if (i == values.size()) i = 0;
+ return ret_val;
+ }
+
+ std::vector<T> values;
+ // Index of the element the next call to Next() returns.
+ size_t i = 0;
+ };
+ // Held through a shared_ptr, so copies of this action share one cursor.
+ std::shared_ptr<State> state_ = std::make_shared<State>();
+};
+
+// Implements the polymorphic DoDefault() action.
+class DoDefaultAction {
+ public:
+ // This template type conversion operator allows DoDefault() to be
+ // used in any function. It yields a default-constructed Action<F>.
+ template <typename F>
+ operator Action<F>() const {
+ return Action<F>();
+ } // NOLINT
+};
+
+// Implements the Assign action to set a given pointer referent to a
+// particular value.
+template <typename T1, typename T2>
+class AssignAction {
+ public:
+  // Remembers the destination pointer and keeps a copy of the value to store.
+  AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {}
+
+  // Writes the stored value through the stored pointer. The mock function's
+  // arguments are ignored.
+  template <typename Result, typename ArgumentTuple>
+  void Perform(const ArgumentTuple&) const {
+    T1& destination = *ptr_;
+    destination = value_;
+  }
+
+ private:
+  T1* const ptr_;
+  const T2 value_;
+};
+
+#if !GTEST_OS_WINDOWS_MOBILE
+
+// Implements the SetErrnoAndReturn action to simulate return from
+// various system calls and libc functions.
+template <typename T>
+class SetErrnoAndReturnAction {
+ public:
+ // Stores the errno value to set and the result to return.
+ SetErrnoAndReturnAction(int errno_value, T result)
+ : errno_(errno_value), result_(result) {}
+ // Ignores the arguments, assigns the stored value to errno, and returns the
+ // stored result converted to Result.
+ template <typename Result, typename ArgumentTuple>
+ Result Perform(const ArgumentTuple& /* args */) const {
+ errno = errno_;
+ return result_;
+ }
+
+ private:
+ const int errno_;
+ const T result_;
+};
+
+#endif // !GTEST_OS_WINDOWS_MOBILE
+
+// Implements the SetArgumentPointee<N>(x) action for any function
+// whose N-th argument (0-based) is a pointer to x's type.
+template <size_t N, typename A, typename = void>
+struct SetArgumentPointeeAction {
+ // The value assigned through the N-th argument on each call.
+ A value;
+
+ // Dereferences the N-th argument and assigns `value` to the result.
+ template <typename... Args>
+ void operator()(const Args&... args) const {
+ *::std::get<N>(std::tie(args...)) = value;
+ }
+};
+
+// Implements the Invoke(object_ptr, &Class::Method) action.
+template <class Class, typename MethodPtr>
+struct InvokeMethodAction {
+ // The object the method is invoked on; not owned by this action.
+ Class* const obj_ptr;
+ const MethodPtr method_ptr;
+
+ // Forwards all arguments to the method and returns whatever it returns.
+ template <typename... Args>
+ auto operator()(Args&&... args) const
+ -> decltype((obj_ptr->*method_ptr)(std::forward<Args>(args)...)) {
+ return (obj_ptr->*method_ptr)(std::forward<Args>(args)...);
+ }
+};
+
+// Implements the InvokeWithoutArgs(f) action. The template argument
+// FunctionImpl is the implementation type of f, which can be either a
+// function pointer or a functor. InvokeWithoutArgs(f) can be used as an
+// Action<F> as long as f's type is compatible with F.
+template <typename FunctionImpl>
+struct InvokeWithoutArgsAction {
+ FunctionImpl function_impl;
+
+ // Allows InvokeWithoutArgs(f) to be used as any action whose type is
+ // compatible with f. The arguments are ignored. Note that this call
+ // operator is not const-qualified.
+ template <typename... Args>
+ auto operator()(const Args&...) -> decltype(function_impl()) {
+ return function_impl();
+ }
+};
+
+// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action.
+template <class Class, typename MethodPtr>
+struct InvokeMethodWithoutArgsAction {
+ // The object the method is invoked on; not owned by this action.
+ Class* const obj_ptr;
+ const MethodPtr method_ptr;
+
+ // The type produced by calling the method with no arguments.
+ using ReturnType =
+ decltype((std::declval<Class*>()->*std::declval<MethodPtr>())());
+
+ // Ignores the mock function arguments and calls the method with none.
+ template <typename... Args>
+ ReturnType operator()(const Args&...) const {
+ return (obj_ptr->*method_ptr)();
+ }
+};
+
+// Implements the IgnoreResult(action) action: adapts an action of type A for
+// use in a void-returning mock function by discarding its result.
+template <typename A>
+class IgnoreResultAction {
+ public:
+ explicit IgnoreResultAction(const A& action) : action_(action) {}
+
+ template <typename F>
+ operator Action<F>() const {
+ // Assert statement belongs here because this is the best place to verify
+ // conditions on F. It produces the clearest error messages
+ // in most compilers.
+ // Impl really belongs in this scope as a local class but can't
+ // because MSVC produces duplicate symbols in different translation units
+ // in this case. Until MS fixes that bug we put Impl into the class scope
+ // and put the typedef both here (for use in assert statement) and
+ // in the Impl class. But both definitions must be the same.
+ typedef typename internal::Function<F>::Result Result;
+
+ // Asserts at compile time that F returns void.
+ static_assert(std::is_void<Result>::value, "Result type should be void.");
+
+ return Action<F>(new Impl<F>(action_));
+ }
+
+ private:
+ template <typename F>
+ class Impl : public ActionInterface<F> {
+ public:
+ typedef typename internal::Function<F>::Result Result;
+ typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+
+ // Converts the wrapped action to Action<OriginalFunction> once, here.
+ explicit Impl(const A& action) : action_(action) {}
+
+ void Perform(const ArgumentTuple& args) override {
+ // Performs the action and ignores its result.
+ action_.Perform(args);
+ }
+
+ private:
+ // Type OriginalFunction is the same as F except that its return
+ // type is IgnoredValue.
+ typedef
+ typename internal::Function<F>::MakeResultIgnoredValue OriginalFunction;
+
+ const Action<OriginalFunction> action_;
+ };
+
+ const A action_;
+};
+
+// Implements WithArg, WithArgs and WithoutArgs: adapts inner_action so that it
+// receives only the mock function arguments at the (0-based) indices I...,
+// in that order.
+template <typename InnerAction, size_t... I>
+struct WithArgsAction {
+ InnerAction inner_action;
+
+ // The signature of the function as seen by the inner action, given an outer
+ // action with the given result and argument types.
+ template <typename R, typename... Args>
+ using InnerSignature =
+ R(typename std::tuple_element<I, std::tuple<Args...>>::type...);
+
+ // Rather than a call operator, we must define conversion operators to
+ // particular action types. This is necessary for embedded actions like
+ // DoDefault(), which rely on action conversion operators rather than
+ // providing a call operator because even with a particular set of arguments
+ // they don't have a fixed return type.
+
+ // Rvalue conversion: moves inner_action into a single-use action.
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<
+ InnerAction,
+ // Unfortunately we can't use the InnerSignature alias here;
+ // MSVC complains about the I parameter pack not being
+ // expanded (error C3520) despite it being expanded in the
+ // type alias.
+ OnceAction<R(typename std::tuple_element<
+ I, std::tuple<Args...>>::type...)>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ struct OA {
+ OnceAction<InnerSignature<R, Args...>> inner_action;
+
+ // Picks the arguments at indices I... and passes them to the inner action.
+ R operator()(Args&&... args) && {
+ return std::move(inner_action)
+ .Call(std::get<I>(
+ std::forward_as_tuple(std::forward<Args>(args)...))...);
+ }
+ };
+
+ return OA{std::move(inner_action)};
+ }
+
+ // Const conversion: converts a copy of inner_action into a reusable action.
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<
+ const InnerAction&,
+ // Unfortunately we can't use the InnerSignature alias here;
+ // MSVC complains about the I parameter pack not being
+ // expanded (error C3520) despite it being expanded in the
+ // type alias.
+ Action<R(typename std::tuple_element<
+ I, std::tuple<Args...>>::type...)>>::value,
+ int>::type = 0>
+ operator Action<R(Args...)>() const { // NOLINT
+ Action<InnerSignature<R, Args...>> converted(inner_action);
+
+ // The lambda captures the converted action by value.
+ return [converted](Args&&... args) -> R {
+ return converted.Perform(std::forward_as_tuple(
+ std::get<I>(std::forward_as_tuple(std::forward<Args>(args)...))...));
+ };
+ }
+};
+
+template <typename... Actions>
+class DoAllAction;
+
+// Base case: only a single action. Conversions simply delegate to that
+// action's own conversion to OnceAction or Action.
+template <typename FinalAction>
+class DoAllAction<FinalAction> {
+ public:
+ // Tag taken as the first constructor argument.
+ struct UserConstructorTag {};
+
+ template <typename T>
+ explicit DoAllAction(UserConstructorTag, T&& action)
+ : final_action_(std::forward<T>(action)) {}
+
+ // Rather than a call operator, we must define conversion operators to
+ // particular action types. This is necessary for embedded actions like
+ // DoDefault(), which rely on action conversion operators rather than
+ // providing a call operator because even with a particular set of arguments
+ // they don't have a fixed return type.
+
+ // Rvalue conversion: moves the final action into the resulting OnceAction.
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<FinalAction, OnceAction<R(Args...)>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ return std::move(final_action_);
+ }
+
+ // Const conversion: converts the final action, leaving it intact.
+ template <
+ typename R, typename... Args,
+ typename std::enable_if<
+ std::is_convertible<const FinalAction&, Action<R(Args...)>>::value,
+ int>::type = 0>
+ operator Action<R(Args...)>() const { // NOLINT
+ return final_action_;
+ }
+
+ private:
+ FinalAction final_action_;
+};
+
+// Recursive case: support N actions by calling the initial action and then
+// calling through to the base class containing N-1 actions.
+template <typename InitialAction, typename... OtherActions>
+class DoAllAction<InitialAction, OtherActions...>
+ : private DoAllAction<OtherActions...> {
+ private:
+ using Base = DoAllAction<OtherActions...>;
+
+ // The type of reference that should be provided to an initial action for a
+ // mocked function parameter of type T.
+ //
+ // There are two quirks here:
+ //
+ // * Unlike most forwarding functions, we pass scalars through by value.
+ // This isn't strictly necessary because an lvalue reference would work
+ // fine too and be consistent with other non-reference types, but it's
+ // perhaps less surprising.
+ //
+ // For example if the mocked function has signature void(int), then it
+ // might seem surprising for the user's initial action to need to be
+ // convertible to Action<void(const int&)>. This is perhaps less
+ // surprising for a non-scalar type where there may be a performance
+ // impact, or it might even be impossible, to pass by value.
+ //
+ // * More surprisingly, `const T&` is often not a const reference type.
+ // By the reference collapsing rules in C++17 [dcl.ref]/6, if T refers to
+ // U& or U&& for some non-scalar type U, then InitialActionArgType<T> is
+ // U&. In other words, we may hand over a non-const reference.
+ //
+ // So for example, given some non-scalar type Obj we have the following
+ // mappings:
+ //
+ // T InitialActionArgType<T>
+ // ------- -----------------------
+ // Obj const Obj&
+ // Obj& Obj&
+ // Obj&& Obj&
+ // const Obj const Obj&
+ // const Obj& const Obj&
+ // const Obj&& const Obj&
+ //
+ // In other words, the initial actions get a mutable view of a non-scalar
+ // argument if and only if the mock function itself accepts a non-const
+ // reference type. They are never given an rvalue reference to a
+ // non-scalar type.
+ //
+ // This situation makes sense if you imagine use with an action that is
+ // designed to write through a reference. For example, if the caller wants
+ // to fill in a reference argument and then return a canned value:
+ //
+ // EXPECT_CALL(mock, Call)
+ // .WillOnce(DoAll(SetArgReferee<0>(17), Return(19)));
+ //
+ template <typename T>
+ using InitialActionArgType =
+ typename std::conditional<std::is_scalar<T>::value, T, const T&>::type;
+
+ public:
+ // Tag taken as the first constructor argument.
+ struct UserConstructorTag {};
+
+ // Stores the first action here and forwards the rest to the base class.
+ template <typename T, typename... U>
+ explicit DoAllAction(UserConstructorTag, T&& initial_action,
+ U&&... other_actions)
+ : Base({}, std::forward<U>(other_actions)...),
+ initial_action_(std::forward<T>(initial_action)) {}
+
+ template <typename R, typename... Args,
+ typename std::enable_if<
+ conjunction<
+ // Both the initial action and the rest must support
+ // conversion to OnceAction.
+ std::is_convertible<
+ InitialAction,
+ OnceAction<void(InitialActionArgType<Args>...)>>,
+ std::is_convertible<Base, OnceAction<R(Args...)>>>::value,
+ int>::type = 0>
+ operator OnceAction<R(Args...)>() && { // NOLINT
+ // Return an action that first calls the initial action with arguments
+ // filtered through InitialActionArgType, then forwards arguments directly
+ // to the base class to deal with the remaining actions.
+ struct OA {
+ OnceAction<void(InitialActionArgType<Args>...)> initial_action;
+ OnceAction<R(Args...)> remaining_actions;
+
+ R operator()(Args... args) && {
+ // The initial action's result type is void; only the remaining
+ // actions produce the returned value.
+ std::move(initial_action)
+ .Call(static_cast<InitialActionArgType<Args>>(args)...);
+
+ return std::move(remaining_actions).Call(std::forward<Args>(args)...);
+ }
+ };
+
+ return OA{
+ std::move(initial_action_),
+ std::move(static_cast<Base&>(*this)),
+ };
+ }
+
+ template <
+ typename R, typename... Args,
+ typename std::enable_if<
+ conjunction<
+ // Both the initial action and the rest must support conversion to
+ // Action.
+ std::is_convertible<const InitialAction&,
+ Action<void(InitialActionArgType<Args>...)>>,
+ std::is_convertible<const Base&, Action<R(Args...)>>>::value,
+ int>::type = 0>
+ operator Action<R(Args...)>() const { // NOLINT
+ // Return an action that first calls the initial action with arguments
+ // filtered through InitialActionArgType, then forwards arguments directly
+ // to the base class to deal with the remaining actions.
+ struct OA {
+ Action<void(InitialActionArgType<Args>...)> initial_action;
+ Action<R(Args...)> remaining_actions;
+
+ R operator()(Args... args) const {
+ // The initial action's result type is void; only the remaining
+ // actions produce the returned value.
+ initial_action.Perform(std::forward_as_tuple(
+ static_cast<InitialActionArgType<Args>>(args)...));
+
+ return remaining_actions.Perform(
+ std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+ };
+
+ return OA{
+ initial_action_,
+ static_cast<const Base&>(*this),
+ };
+ }
+
+ private:
+ InitialAction initial_action_;
+};
+
+// Action that returns a pointer to a new T constructed from the stored
+// parameters. Every call performs a fresh `new`.
+template <typename T, typename... Params>
+struct ReturnNewAction {
+ T* operator()() const {
+ // Unpack the stored tuple and pass its elements, as const references, to
+ // T's constructor.
+ return internal::Apply(
+ [](const Params&... unpacked_params) {
+ return new T(unpacked_params...);
+ },
+ params);
+ }
+ // The constructor arguments used for each new T.
+ std::tuple<Params...> params;
+};
+
+// Action that returns the k-th (0-based) argument of the call. The call
+// operator is only available when k is smaller than the number of arguments.
+template <size_t k>
+struct ReturnArgAction {
+ template <typename... Args,
+ typename = typename std::enable_if<(k < sizeof...(Args))>::type>
+ auto operator()(Args&&... args) const -> decltype(std::get<k>(
+ std::forward_as_tuple(std::forward<Args>(args)...))) {
+ return std::get<k>(std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+};
+
+// Action that copies the k-th (0-based) argument of the call into *pointer.
+template <size_t k, typename Ptr>
+struct SaveArgAction {
+  // Destination that receives the saved argument.
+  Ptr pointer;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    const auto& chosen = std::get<k>(std::tie(args...));
+    *pointer = chosen;
+  }
+};
+
+// Action that copies what the k-th (0-based) argument points to into
+// *pointer.
+template <size_t k, typename Ptr>
+struct SaveArgPointeeAction {
+  // Destination that receives the saved pointee.
+  Ptr pointer;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    const auto& source = std::get<k>(std::tie(args...));
+    *pointer = *source;
+  }
+};
+
+// Action that assigns `value` to the k-th (0-based) argument of the call,
+// which must be passed as an lvalue reference.
+template <size_t k, typename T>
+struct SetArgRefereeAction {
+ // The value assigned to the argument on each call.
+ T value;
+
+ template <typename... Args>
+ void operator()(Args&&... args) const {
+ // With forwarding references, Args is an lvalue reference type exactly
+ // when the corresponding argument was passed as an lvalue.
+ using argk_type =
+ typename ::std::tuple_element<k, std::tuple<Args...>>::type;
+ static_assert(std::is_lvalue_reference<argk_type>::value,
+ "Argument must be a reference type.");
+ std::get<k>(std::tie(args...)) = value;
+ }
+};
+
+// Action that copies the range [first, last) to the location designated by
+// the k-th (0-based) argument of the call, element by element.
+template <size_t k, typename I1, typename I2>
+struct SetArrayArgumentAction {
+  I1 first;
+  I2 last;
+
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    // Work on copies so that neither the argument nor `first` is modified.
+    auto dest = std::get<k>(std::tie(args...));
+    auto src = first;
+    while (src != last) {
+      *dest = *src;
+      ++src;
+      ++dest;
+    }
+  }
+};
+
+// Action that applies `delete` to the k-th (0-based) argument of the call.
+template <size_t k>
+struct DeleteArgAction {
+  template <typename... Args>
+  void operator()(const Args&... args) const {
+    auto victim = std::get<k>(std::tie(args...));
+    delete victim;
+  }
+};
+
+// Action that ignores the call arguments and returns *pointer, evaluated at
+// the time of each call.
+template <typename Ptr>
+struct ReturnPointeeAction {
+ Ptr pointer;
+ template <typename... Args>
+ auto operator()(const Args&...) const -> decltype(*pointer) {
+ return *pointer;
+ }
+};
+
+#if GTEST_HAS_EXCEPTIONS
+// Action that throws a copy of the stored exception object whenever it is
+// performed.
+template <typename T>
+struct ThrowAction {
+ T exception;
+ // We use a conversion operator to adapt to any return type.
+ template <typename R, typename... Args>
+ operator Action<R(Args...)>() const { // NOLINT
+ // Copy the exception so the lambda owns it independently of this object.
+ T copy = exception;
+ return [copy](Args...) -> R { throw copy; };
+ }
+};
+#endif // GTEST_HAS_EXCEPTIONS
+
+} // namespace internal
+
+// An Unused object can be implicitly constructed from ANY value.
+// This is handy when defining actions that ignore some or all of the
+// mock function arguments. For example, given
+//
+// MOCK_METHOD3(Foo, double(const string& label, double x, double y));
+// MOCK_METHOD3(Bar, double(int index, double x, double y));
+//
+// instead of
+//
+// double DistanceToOriginWithLabel(const string& label, double x, double y) {
+// return sqrt(x*x + y*y);
+// }
+// double DistanceToOriginWithIndex(int index, double x, double y) {
+// return sqrt(x*x + y*y);
+// }
+// ...
+// EXPECT_CALL(mock, Foo("abc", _, _))
+// .WillOnce(Invoke(DistanceToOriginWithLabel));
+// EXPECT_CALL(mock, Bar(5, _, _))
+// .WillOnce(Invoke(DistanceToOriginWithIndex));
+//
+// you could write
+//
+// // We can declare any uninteresting argument as Unused.
+// double DistanceToOrigin(Unused, double x, double y) {
+// return sqrt(x*x + y*y);
+// }
+// ...
+// EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin));
+// EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin));
+typedef internal::IgnoredValue Unused;
+
+// Creates an action that does actions a1, a2, ..., sequentially in
+// each invocation. The result of the last action is the result of the whole
+// action. All but the last action get a readonly view of the arguments,
+// except for parameters the mock function takes by non-const reference,
+// which are passed as mutable lvalue references (see InitialActionArgType in
+// internal::DoAllAction).
+template <typename... Action>
+internal::DoAllAction<typename std::decay<Action>::type...> DoAll(
+ Action&&... action) {
+ return internal::DoAllAction<typename std::decay<Action>::type...>(
+ {}, std::forward<Action>(action)...);
+}
+
+// WithArg<k>(an_action) creates an action that passes the k-th
+// (0-based) argument of the mock function to an_action and performs
+// it. It adapts an action accepting one argument to one that accepts
+// multiple arguments. For convenience, we also provide
+// WithArgs<k>(an_action) (defined below) as a synonym.
+//
+// The action is stored by decayed type inside the returned WithArgsAction.
+template <size_t k, typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type, k> WithArg(
+ InnerAction&& action) {
+ return {std::forward<InnerAction>(action)};
+}
+
+// WithArgs<N1, N2, ..., Nk>(an_action) creates an action that passes
+// the selected arguments of the mock function to an_action and
+// performs it. It serves as an adaptor between actions with
+// different argument lists. At least one index must be given; use
+// WithoutArgs for an action that takes no arguments.
+template <size_t k, size_t... ks, typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type, k, ks...>
+WithArgs(InnerAction&& action) {
+ return {std::forward<InnerAction>(action)};
+}
+
+// WithoutArgs(inner_action) can be used in a mock function with a
+// non-empty argument list to perform inner_action, which takes no
+// argument. In other words, it adapts an action accepting no
+// argument to one that accepts (and ignores) arguments. It is implemented
+// as a WithArgsAction with an empty index list.
+template <typename InnerAction>
+internal::WithArgsAction<typename std::decay<InnerAction>::type> WithoutArgs(
+ InnerAction&& action) {
+ return {std::forward<InnerAction>(action)};
+}
+
+// Creates an action that returns a value.
+//
+// The returned type can be used with a mock function returning a non-void,
+// non-reference type U as follows:
+//
+// * If R is convertible to U and U is move-constructible, then the action can
+// be used with WillOnce.
+//
+// * If const R& is convertible to U and U is copy-constructible, then the
+// action can be used with both WillOnce and WillRepeatedly.
+//
+// The mock expectation contains the R value from which the U return value is
+// constructed (a move/copy of the argument to Return). This means that the R
+// value will survive at least until the mock object's expectations are cleared
+// or the mock object is destroyed, meaning that U can safely be a
+// reference-like type such as std::string_view:
+//
+// // The mock function returns a view of a copy of the string fed to
+// // Return. The view is valid even after the action is performed.
+// MockFunction<std::string_view()> mock;
+// EXPECT_CALL(mock, Call).WillOnce(Return(std::string("taco")));
+// const std::string_view result = mock.AsStdFunction()();
+// EXPECT_EQ("taco", result);
+//
+template <typename R>
+internal::ReturnAction<R> Return(R value) {
+ return internal::ReturnAction<R>(std::move(value));
+}
+
+// Creates an action that returns NULL.
+inline PolymorphicAction<internal::ReturnNullAction> ReturnNull() {
+ return MakePolymorphicAction(internal::ReturnNullAction());
+}
+
+// Creates an action that returns from a void function.
+inline PolymorphicAction<internal::ReturnVoidAction> Return() {
+ return MakePolymorphicAction(internal::ReturnVoidAction());
+}
+
+// Creates an action that returns the reference to a variable.
+template <typename R>
+inline internal::ReturnRefAction<R> ReturnRef(R& x) { // NOLINT
+ return internal::ReturnRefAction<R>(x);
+}
+
+// Prevent using ReturnRef on reference to temporary.
+template <typename R, R* = nullptr>
+internal::ReturnRefAction<R> ReturnRef(R&&) = delete;
+
+// Creates an action that returns the reference to a copy of the
+// argument. The copy is created when the action is constructed and
+// lives as long as the action.
+template <typename R>
+inline internal::ReturnRefOfCopyAction<R> ReturnRefOfCopy(const R& x) {
+ return internal::ReturnRefOfCopyAction<R>(x);
+}
+
+// DEPRECATED: use Return(x) directly with WillOnce.
+//
+// Modifies the parent action (a Return() action) to perform a move of the
+// argument instead of a copy.
+// Return(ByMove()) actions can only be executed once and will assert this
+// invariant.
+template <typename R>
+internal::ByMoveWrapper<R> ByMove(R x) {
+ return internal::ByMoveWrapper<R>(std::move(x));
+}
+
+// Creates an action that returns an element of `vals`. Calling this action will
+// repeatedly return the next value from `vals` until it reaches the end and
+// will restart from the beginning.
+template <typename T>
+internal::ReturnRoundRobinAction<T> ReturnRoundRobin(std::vector<T> vals) {
+ return internal::ReturnRoundRobinAction<T>(std::move(vals));
+}
+
+// Same as ReturnRoundRobin(std::vector<T>) above, but for a braced list:
+// `vals` is copied into a std::vector<T> when the action is created. Each call
+// returns the next element and restarts from the beginning after the last one.
+template <typename T>
+internal::ReturnRoundRobinAction<T> ReturnRoundRobin(
+ std::initializer_list<T> vals) {
+ return internal::ReturnRoundRobinAction<T>(std::vector<T>(vals));
+}
+
+// Creates an action that does the default action for the given mock function.
+inline internal::DoDefaultAction DoDefault() {
+ return internal::DoDefaultAction();
+}
+
+// Creates an action that sets the variable pointed to by the N-th
+// (0-based) function argument to 'value'.
+template <size_t N, typename T>
+internal::SetArgumentPointeeAction<N, T> SetArgPointee(T value) {
+ return {std::move(value)};
+}
+
+// DEPRECATED: use SetArgPointee<N>(value) instead; the two are identical.
+template <size_t N, typename T>
+internal::SetArgumentPointeeAction<N, T> SetArgumentPointee(T value) {
+ return {std::move(value)};
+}
+
+// Creates an action that sets a pointer referent to a given value.
+template <typename T1, typename T2>
+PolymorphicAction<internal::AssignAction<T1, T2>> Assign(T1* ptr, T2 val) {
+ return MakePolymorphicAction(internal::AssignAction<T1, T2>(ptr, val));
+}
+
+#if !GTEST_OS_WINDOWS_MOBILE
+
+// Creates an action that sets errno and returns the appropriate error.
+template <typename T>
+PolymorphicAction<internal::SetErrnoAndReturnAction<T>> SetErrnoAndReturn(
+ int errval, T result) {
+ return MakePolymorphicAction(
+ internal::SetErrnoAndReturnAction<T>(errval, result));
+}
+
+#endif // !GTEST_OS_WINDOWS_MOBILE
+
+// Various overloads for Invoke().
+
+// Legacy function.
+// Actions can now be implicitly constructed from callables. No need to create
+// wrapper objects.
+// This function exists for backwards compatibility.
+template <typename FunctionImpl>
+typename std::decay<FunctionImpl>::type Invoke(FunctionImpl&& function_impl) {
+ return std::forward<FunctionImpl>(function_impl);
+}
+
+// Creates an action that invokes the given method on the given object
+// with the mock function's arguments.
+template <class Class, typename MethodPtr>
+internal::InvokeMethodAction<Class, MethodPtr> Invoke(Class* obj_ptr,
+ MethodPtr method_ptr) {
+ return {obj_ptr, method_ptr};
+}
+
+// Creates an action that invokes 'function_impl' with no argument.
+template <typename FunctionImpl>
+internal::InvokeWithoutArgsAction<typename std::decay<FunctionImpl>::type>
+InvokeWithoutArgs(FunctionImpl function_impl) {
+ return {std::move(function_impl)};
+}
+
+// Creates an action that invokes the given method on the given object
+// with no argument.
+template <class Class, typename MethodPtr>
+internal::InvokeMethodWithoutArgsAction<Class, MethodPtr> InvokeWithoutArgs(
+ Class* obj_ptr, MethodPtr method_ptr) {
+ return {obj_ptr, method_ptr};
+}
+
+// Creates an action that performs an_action and throws away its
+// result. In other words, it changes the return type of an_action to
+// void. an_action MUST NOT return void, or the code won't compile.
+template <typename A>
+inline internal::IgnoreResultAction<A> IgnoreResult(const A& an_action) {
+ return internal::IgnoreResultAction<A>(an_action);
+}
+
+// Creates a reference wrapper for the given L-value. If necessary,
+// you can explicitly specify the type of the reference. For example,
+// suppose 'derived' is an object of type Derived, ByRef(derived)
+// would wrap a Derived&. If you want to wrap a const Base& instead,
+// where Base is a base class of Derived, just write:
+//
+// ByRef<const Base>(derived)
+//
+// N.B. ByRef is redundant with std::ref, std::cref and std::reference_wrapper.
+// However, it may still be used for consistency with ByMove().
+template <typename T>
+inline ::std::reference_wrapper<T> ByRef(T& l_value) { // NOLINT
+ return ::std::reference_wrapper<T>(l_value);
+}
+
+// The ReturnNew<T>(a1, a2, ..., a_k) action returns a pointer to a new
+// instance of type T, constructed on the heap with constructor arguments
+// a1, a2, ..., and a_k. The caller assumes ownership of the returned value.
+template <typename T, typename... Params>
+internal::ReturnNewAction<T, typename std::decay<Params>::type...> ReturnNew(
+ Params&&... params) {
+ return {std::forward_as_tuple(std::forward<Params>(params)...)};
+}
+
+// Action ReturnArg<k>() returns the k-th argument of the mock function.
+template <size_t k>
+internal::ReturnArgAction<k> ReturnArg() {
+ return {};
+}
+
+// Action SaveArg<k>(pointer) saves the k-th (0-based) argument of the
+// mock function to *pointer.
+template <size_t k, typename Ptr>
+internal::SaveArgAction<k, Ptr> SaveArg(Ptr pointer) {
+ return {pointer};
+}
+
+// Action SaveArgPointee<k>(pointer) saves the value pointed to
+// by the k-th (0-based) argument of the mock function to *pointer.
+template <size_t k, typename Ptr>
+internal::SaveArgPointeeAction<k, Ptr> SaveArgPointee(Ptr pointer) {
+ return {pointer};
+}
+
+// Action SetArgReferee<k>(value) assigns 'value' to the variable
+// referenced by the k-th (0-based) argument of the mock function.
+template <size_t k, typename T>
+internal::SetArgRefereeAction<k, typename std::decay<T>::type> SetArgReferee(
+ T&& value) {
+ return {std::forward<T>(value)};
+}
+
+// Action SetArrayArgument<k>(first, last) copies the elements in
+// source range [first, last) to the array pointed to by the k-th
+// (0-based) argument, which can be either a pointer or an
+// iterator. The action does not take ownership of the elements in the
+// source range.
+template <size_t k, typename I1, typename I2>
+internal::SetArrayArgumentAction<k, I1, I2> SetArrayArgument(I1 first,
+ I2 last) {
+ return {first, last};
+}
+
+// Action DeleteArg<k>() deletes the k-th (0-based) argument of the mock
+// function.
+template <size_t k>
+internal::DeleteArgAction<k> DeleteArg() {
+ return {};
+}
+
+// This action returns the value pointed to by 'pointer'.
+template <typename Ptr>
+internal::ReturnPointeeAction<Ptr> ReturnPointee(Ptr pointer) {
+ return {pointer};
+}
+
+// Action Throw(exception) can be used in a mock function of any type
+// to throw the given exception. Any copyable value can be thrown.
+#if GTEST_HAS_EXCEPTIONS
+template <typename T>
+internal::ThrowAction<typename std::decay<T>::type> Throw(T&& exception) {
+ return {std::forward<T>(exception)};
+}
+#endif // GTEST_HAS_EXCEPTIONS
+
+namespace internal {
+
+// A macro from the ACTION* family (defined later in this file)
+// defines an action that can be used in a mock function. Typically,
+// these actions only care about a subset of the arguments of the mock
+// function. For example, if such an action only uses the second
+// argument, it can be used in any mock function that takes >= 2
+// arguments where the type of the second argument is compatible.
+//
+// Therefore, the action implementation must be prepared to take more
+// arguments than it needs. The ExcessiveArg type is used to
+// represent those excessive arguments: ActionImpl::Apply passes one
+// ExcessiveArg for every unused argument slot, up to 10 in total.
+// The type lives in testing::internal; it is an INTERNAL TYPE
+// and subject to change without notice, so a user MUST NOT USE THIS
+// TYPE DIRECTLY.
+struct ExcessiveArg {};
+
+// Builds an implementation of an Action<> for some particular signature, using
+// a class defined by an ACTION* macro.
+template <typename F, typename Impl>
+struct ActionImpl;
+
+template <typename Impl>
+struct ImplBase {
+ struct Holder {
+ // Allows each copy of the Action<> to get to the Impl.
+ explicit operator const Impl&() const { return *ptr; }
+ std::shared_ptr<Impl> ptr;
+ };
+ using type = typename std::conditional<std::is_constructible<Impl>::value,
+ Impl, Holder>::type;
+};
+
+template <typename R, typename... Args, typename Impl>
+struct ActionImpl<R(Args...), Impl> : ImplBase<Impl>::type {
+ using Base = typename ImplBase<Impl>::type;
+ using function_type = R(Args...);
+ using args_type = std::tuple<Args...>;
+
+ ActionImpl() = default; // Only defined if appropriate for Base.
+ explicit ActionImpl(std::shared_ptr<Impl> impl) : Base{std::move(impl)} {}
+
+ R operator()(Args&&... arg) const {
+ static constexpr size_t kMaxArgs =
+ sizeof...(Args) <= 10 ? sizeof...(Args) : 10;
+ return Apply(MakeIndexSequence<kMaxArgs>{},
+ MakeIndexSequence<10 - kMaxArgs>{},
+ args_type{std::forward<Args>(arg)...});
+ }
+
+ template <std::size_t... arg_id, std::size_t... excess_id>
+ R Apply(IndexSequence<arg_id...>, IndexSequence<excess_id...>,
+ const args_type& args) const {
+ // Impl need not be specific to the signature of action being implemented;
+ // only the implementing function body needs to have all of the specific
+ // types instantiated. Up to 10 of the args that are provided by the
+ // args_type get passed, followed by a dummy of unspecified type for the
+ // remainder up to 10 explicit args.
+ static constexpr ExcessiveArg kExcessArg{};
+ return static_cast<const Impl&>(*this)
+ .template gmock_PerformImpl<
+ /*function_type=*/function_type, /*return_type=*/R,
+ /*args_type=*/args_type,
+ /*argN_type=*/
+ typename std::tuple_element<arg_id, args_type>::type...>(
+ /*args=*/args, std::get<arg_id>(args)...,
+ ((void)excess_id, kExcessArg)...);
+ }
+};
+
+// Stores a default-constructed Impl as part of the Action<>'s
+// std::function<>. The Impl should be trivial to copy.
+template <typename F, typename Impl>
+::testing::Action<F> MakeAction() {
+ return ::testing::Action<F>(ActionImpl<F, Impl>());
+}
+
+// Stores just the one given instance of Impl.
+template <typename F, typename Impl>
+::testing::Action<F> MakeAction(std::shared_ptr<Impl> impl) {
+ return ::testing::Action<F>(ActionImpl<F, Impl>(std::move(impl)));
+}
+
+#define GMOCK_INTERNAL_ARG_UNUSED(i, data, el) \
+ , const arg##i##_type& arg##i GTEST_ATTRIBUTE_UNUSED_
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_ \
+ const args_type& args GTEST_ATTRIBUTE_UNUSED_ GMOCK_PP_REPEAT( \
+ GMOCK_INTERNAL_ARG_UNUSED, , 10)
+
+#define GMOCK_INTERNAL_ARG(i, data, el) , const arg##i##_type& arg##i
+#define GMOCK_ACTION_ARG_TYPES_AND_NAMES_ \
+ const args_type& args GMOCK_PP_REPEAT(GMOCK_INTERNAL_ARG, , 10)
+
+#define GMOCK_INTERNAL_TEMPLATE_ARG(i, data, el) , typename arg##i##_type
+#define GMOCK_ACTION_TEMPLATE_ARGS_NAMES_ \
+ GMOCK_PP_TAIL(GMOCK_PP_REPEAT(GMOCK_INTERNAL_TEMPLATE_ARG, , 10))
+
+#define GMOCK_INTERNAL_TYPENAME_PARAM(i, data, param) , typename param##_type
+#define GMOCK_ACTION_TYPENAME_PARAMS_(params) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPENAME_PARAM, , params))
+
+#define GMOCK_INTERNAL_TYPE_PARAM(i, data, param) , param##_type
+#define GMOCK_ACTION_TYPE_PARAMS_(params) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_PARAM, , params))
+
+#define GMOCK_INTERNAL_TYPE_GVALUE_PARAM(i, data, param) \
+ , param##_type gmock_p##i
+#define GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_TYPE_GVALUE_PARAM, , params))
+
+#define GMOCK_INTERNAL_GVALUE_PARAM(i, data, param) \
+ , std::forward<param##_type>(gmock_p##i)
+#define GMOCK_ACTION_GVALUE_PARAMS_(params) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GVALUE_PARAM, , params))
+
+#define GMOCK_INTERNAL_INIT_PARAM(i, data, param) \
+ , param(::std::forward<param##_type>(gmock_p##i))
+#define GMOCK_ACTION_INIT_PARAMS_(params) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_INIT_PARAM, , params))
+
+#define GMOCK_INTERNAL_FIELD_PARAM(i, data, param) param##_type param;
+#define GMOCK_ACTION_FIELD_PARAMS_(params) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_FIELD_PARAM, , params)
+
+#define GMOCK_INTERNAL_ACTION(name, full_name, params) \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ class full_name { \
+ public: \
+ explicit full_name(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
+ : impl_(std::make_shared<gmock_Impl>( \
+ GMOCK_ACTION_GVALUE_PARAMS_(params))) {} \
+ full_name(const full_name&) = default; \
+ full_name(full_name&&) noexcept = default; \
+ template <typename F> \
+ operator ::testing::Action<F>() const { \
+ return ::testing::internal::MakeAction<F>(impl_); \
+ } \
+ \
+ private: \
+ class gmock_Impl { \
+ public: \
+ explicit gmock_Impl(GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) \
+ : GMOCK_ACTION_INIT_PARAMS_(params) {} \
+ template <typename function_type, typename return_type, \
+ typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+ GMOCK_ACTION_FIELD_PARAMS_(params) \
+ }; \
+ std::shared_ptr<const gmock_Impl> impl_; \
+ }; \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \
+ GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) GTEST_MUST_USE_RESULT_; \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ inline full_name<GMOCK_ACTION_TYPE_PARAMS_(params)> name( \
+ GMOCK_ACTION_TYPE_GVALUE_PARAMS_(params)) { \
+ return full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>( \
+ GMOCK_ACTION_GVALUE_PARAMS_(params)); \
+ } \
+ template <GMOCK_ACTION_TYPENAME_PARAMS_(params)> \
+ template <typename function_type, typename return_type, typename args_type, \
+ GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type \
+ full_name<GMOCK_ACTION_TYPE_PARAMS_(params)>::gmock_Impl::gmock_PerformImpl( \
+ GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+
+} // namespace internal
+
+// Similar to GMOCK_INTERNAL_ACTION, but no bound parameters are stored.
+#define ACTION(name) \
+ class name##Action { \
+ public: \
+ explicit name##Action() noexcept {} \
+ name##Action(const name##Action&) noexcept {} \
+ template <typename F> \
+ operator ::testing::Action<F>() const { \
+ return ::testing::internal::MakeAction<F, gmock_Impl>(); \
+ } \
+ \
+ private: \
+ class gmock_Impl { \
+ public: \
+ template <typename function_type, typename return_type, \
+ typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+ }; \
+ }; \
+ inline name##Action name() GTEST_MUST_USE_RESULT_; \
+ inline name##Action name() { return name##Action(); } \
+ template <typename function_type, typename return_type, typename args_type, \
+ GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type name##Action::gmock_Impl::gmock_PerformImpl( \
+ GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) const
+
+#define ACTION_P(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP, (__VA_ARGS__))
+
+#define ACTION_P2(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP2, (__VA_ARGS__))
+
+#define ACTION_P3(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP3, (__VA_ARGS__))
+
+#define ACTION_P4(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP4, (__VA_ARGS__))
+
+#define ACTION_P5(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP5, (__VA_ARGS__))
+
+#define ACTION_P6(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP6, (__VA_ARGS__))
+
+#define ACTION_P7(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP7, (__VA_ARGS__))
+
+#define ACTION_P8(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP8, (__VA_ARGS__))
+
+#define ACTION_P9(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP9, (__VA_ARGS__))
+
+#define ACTION_P10(name, ...) \
+ GMOCK_INTERNAL_ACTION(name, name##ActionP10, (__VA_ARGS__))
+
+} // namespace testing
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
new file mode 100644
index 0000000000..b6ab648e50
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h
@@ -0,0 +1,159 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some commonly used cardinalities. More
+// cardinalities can be defined by the user implementing the
+// CardinalityInterface interface if necessary.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
+
+#include <limits.h>
+
+#include <memory>
+#include <ostream> // NOLINT
+
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// To implement a cardinality Foo, define:
+// 1. a class FooCardinality that implements the
+// CardinalityInterface interface, and
+// 2. a factory function that creates a Cardinality object from a
+// const FooCardinality*.
+//
+// The two-level delegation design follows that of Matcher, providing
+// consistency for extension developers. It also eases ownership
+// management as Cardinality objects can now be copied like plain values.
+
+// The implementation of a cardinality.
+class CardinalityInterface {
+ public:
+ virtual ~CardinalityInterface() {}
+
+ // Conservative estimate on the lower/upper bound of the number of
+ // calls allowed.
+ virtual int ConservativeLowerBound() const { return 0; }
+ virtual int ConservativeUpperBound() const { return INT_MAX; }
+
+ // Returns true if and only if call_count calls will satisfy this
+ // cardinality.
+ virtual bool IsSatisfiedByCallCount(int call_count) const = 0;
+
+ // Returns true if and only if call_count calls will saturate this
+ // cardinality.
+ virtual bool IsSaturatedByCallCount(int call_count) const = 0;
+
+ // Describes self to an ostream.
+ virtual void DescribeTo(::std::ostream* os) const = 0;
+};
+
+// A Cardinality is a copyable and IMMUTABLE (except by assignment)
+// object that specifies how many times a mock function is expected to
+// be called. The implementation of Cardinality is just a std::shared_ptr
+// to const CardinalityInterface. Don't inherit from Cardinality!
+class GTEST_API_ Cardinality {
+ public:
+ // Constructs a null cardinality (empty impl_), needed for storing Cardinality
+ // objects in STL containers. Non-static members below dereference impl_.
+ Cardinality() {}
+
+ // Constructs a Cardinality that takes shared ownership of impl.
+ explicit Cardinality(const CardinalityInterface* impl) : impl_(impl) {}
+
+ // Conservative estimate on the lower/upper bound of the number of
+ // calls allowed.
+ int ConservativeLowerBound() const { return impl_->ConservativeLowerBound(); }
+ int ConservativeUpperBound() const { return impl_->ConservativeUpperBound(); }
+
+ // Returns true if and only if call_count calls will satisfy this
+ // cardinality.
+ bool IsSatisfiedByCallCount(int call_count) const {
+ return impl_->IsSatisfiedByCallCount(call_count);
+ }
+
+ // Returns true if and only if call_count calls will saturate this
+ // cardinality.
+ bool IsSaturatedByCallCount(int call_count) const {
+ return impl_->IsSaturatedByCallCount(call_count);
+ }
+
+ // Returns true if and only if call_count calls will over-saturate this
+ // cardinality, i.e. exceed the maximum number of allowed calls.
+ bool IsOverSaturatedByCallCount(int call_count) const {
+ return impl_->IsSaturatedByCallCount(call_count) &&
+ !impl_->IsSatisfiedByCallCount(call_count);
+ }
+
+ // Describes self to an ostream.
+ void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); }
+
+ // Describes the given actual call count to an ostream.
+ static void DescribeActualCallCountTo(int actual_call_count,
+ ::std::ostream* os);
+
+ private:
+ std::shared_ptr<const CardinalityInterface> impl_;
+};
+
+// Creates a cardinality that allows at least n calls.
+GTEST_API_ Cardinality AtLeast(int n);
+
+// Creates a cardinality that allows at most n calls.
+GTEST_API_ Cardinality AtMost(int n);
+
+// Creates a cardinality that allows any number of calls.
+GTEST_API_ Cardinality AnyNumber();
+
+// Creates a cardinality that allows between min and max calls.
+GTEST_API_ Cardinality Between(int min, int max);
+
+// Creates a cardinality that allows exactly n calls.
+GTEST_API_ Cardinality Exactly(int n);
+
+// Creates a cardinality that takes ownership of its implementation `c`.
+inline Cardinality MakeCardinality(const CardinalityInterface* c) {
+ return Cardinality(c);
+}
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_CARDINALITIES_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
new file mode 100644
index 0000000000..f565d980c5
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h
@@ -0,0 +1,514 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements MOCK_METHOD.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_ // NOLINT
+
+#include <type_traits> // IWYU pragma: keep
+#include <utility> // IWYU pragma: keep
+
+#include "gmock/gmock-spec-builders.h"
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-pp.h"
+
+namespace testing {
+namespace internal {
+template <typename T>
+using identity_t = T;  // identity_t<T> is exactly T; GMOCK_INTERNAL_SIGNATURE wraps the return type in it.
+
+template <typename Pattern>
+struct ThisRefAdjuster {  // Casts a mock object to the const-ness and reference kind described by Pattern.
+ template <typename T>
+ using AdjustT = typename std::conditional<  // const T& / const T&& when Pattern (minus reference) is const, else T& / T&&.
+ std::is_const<typename std::remove_reference<Pattern>::type>::value,
+ typename std::conditional<std::is_lvalue_reference<Pattern>::value,
+ const T&, const T&&>::type,
+ typename std::conditional<std::is_lvalue_reference<Pattern>::value, T&,
+ T&&>::type>::type;
+
+ template <typename MockType>
+ static AdjustT<MockType> Adjust(const MockType& mock) {
+ return static_cast<AdjustT<MockType>>(const_cast<MockType&>(mock));  // strip const first, then cast to the type selected by AdjustT
+ }
+};
+
+constexpr bool PrefixOf(const char* a, const char* b) {  // True iff the NUL-terminated string a is a prefix of b.
+ return *a == 0 || (*a == *b && internal::PrefixOf(a + 1, b + 1));  // end of a => match; otherwise compare one char and recurse
+}
+
+template <int N, int M>
+constexpr bool StartsWith(const char (&prefix)[N], const char (&str)[M]) {  // True iff str begins with prefix (NUL-terminated arrays).
+ return N <= M && internal::PrefixOf(prefix, str);  // N <= M rejects a prefix array longer than str
+}
+
+template <int N, int M>
+constexpr bool EndsWith(const char (&suffix)[N], const char (&str)[M]) {  // True iff str ends with suffix (NUL-terminated arrays).
+ return N <= M && internal::PrefixOf(suffix, str + M - N);  // compare suffix against the last N array elements of str
+}
+
+template <int N, int M>
+constexpr bool Equals(const char (&a)[N], const char (&b)[M]) {  // True iff a and b have the same array size and a is a prefix of b.
+ return N == M && internal::PrefixOf(a, b);
+}
+
+template <int N>
+constexpr bool ValidateSpec(const char (&spec)[N]) {  // True iff spec is one of the specifier spellings accepted below.
+ return internal::Equals("const", spec) ||
+ internal::Equals("override", spec) ||
+ internal::Equals("final", spec) ||
+ internal::Equals("noexcept", spec) ||
+ (internal::StartsWith("noexcept(", spec) &&  // noexcept(<anything>)
+ internal::EndsWith(")", spec)) ||
+ internal::Equals("ref(&)", spec) ||
+ internal::Equals("ref(&&)", spec) ||
+ (internal::StartsWith("Calltype(", spec) &&  // Calltype(<anything>)
+ internal::EndsWith(")", spec));
+}
+
+} // namespace internal
+
+// The style guide prohibits "using" statements in a namespace scope
+// inside a header file. However, the FunctionMocker class template
+// is meant to be defined in the ::testing namespace. The following
+// line is just a trick for working around a bug in MSVC 8.0, which
+// cannot handle it if we define FunctionMocker in ::testing.
+using internal::FunctionMocker;
+} // namespace testing
+
+#define MOCK_METHOD(...) \
+ GMOCK_PP_VARIADIC_CALL(GMOCK_INTERNAL_MOCK_METHOD_ARG_, __VA_ARGS__)  // NOTE(review): presumably selects GMOCK_INTERNAL_MOCK_METHOD_ARG_<n> by argument count - confirm in gmock-pp.h
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_1(...) \
+ GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_2(...) \
+ GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_3(_Ret, _MethodName, _Args) \
+ GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, ())  // no _Spec given: forward with an empty spec tuple
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_4(_Ret, _MethodName, _Args, _Spec) \
+ GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Args); \
+ GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Spec); \
+ GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \
+ GMOCK_PP_NARG0 _Args, GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)); \
+ GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \
+ GMOCK_INTERNAL_MOCK_METHOD_IMPL( \
+ GMOCK_PP_NARG0 _Args, _MethodName, GMOCK_INTERNAL_HAS_CONST(_Spec), \
+ GMOCK_INTERNAL_HAS_OVERRIDE(_Spec), GMOCK_INTERNAL_HAS_FINAL(_Spec), \
+ GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Spec), \
+ GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Spec), \
+ GMOCK_INTERNAL_GET_REF_SPEC(_Spec), \
+ (GMOCK_INTERNAL_SIGNATURE(_Ret, _Args)))
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_5(...) \
+ GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_6(...) \
+ GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_ARG_7(...) \
+ GMOCK_INTERNAL_WRONG_ARITY(__VA_ARGS__)
+
+#define GMOCK_INTERNAL_WRONG_ARITY(...) \
+ static_assert( \
+ false, \
+ "MOCK_METHOD must be called with 3 or 4 arguments. _Ret, " \
+ "_MethodName, _Args and optionally _Spec. _Args and _Spec must be " \
+ "enclosed in parentheses. If _Ret is a type with unprotected commas, " \
+ "it must also be enclosed in parentheses.")  // unconditional compile-time error; used by the _ARG_1/2/5/6/7 forms
+
+#define GMOCK_INTERNAL_ASSERT_PARENTHESIS(_Tuple) \
+ static_assert( \
+ GMOCK_PP_IS_ENCLOSED_PARENS(_Tuple), \
+ GMOCK_PP_STRINGIZE(_Tuple) " should be enclosed in parentheses.")  // compile-time check; the message names the offending argument
+
+#define GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE(_N, ...) \
+ static_assert( \
+ std::is_function<__VA_ARGS__>::value, \
+ "Signature must be a function type, maybe return type contains " \
+ "unprotected comma."); \
+ static_assert( \
+ ::testing::tuple_size<typename ::testing::internal::Function< \
+ __VA_ARGS__>::ArgumentTuple>::value == _N, \
+ "This method does not take " GMOCK_PP_STRINGIZE( \
+ _N) " arguments. Parenthesize all types with unprotected commas.")
+
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC(_Spec) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT, ~, _Spec)
+
+#define GMOCK_INTERNAL_MOCK_METHOD_IMPL(_N, _MethodName, _Constness, \
+ _Override, _Final, _NoexceptSpec, \
+ _CallType, _RefSpec, _Signature) \
+ typename ::testing::internal::Function<GMOCK_PP_REMOVE_PARENS( \
+ _Signature)>::Result \
+ GMOCK_INTERNAL_EXPAND(_CallType) \
+ _MethodName(GMOCK_PP_REPEAT(GMOCK_INTERNAL_PARAMETER, _Signature, _N)) \
+ GMOCK_PP_IF(_Constness, const, ) _RefSpec _NoexceptSpec \
+ GMOCK_PP_IF(_Override, override, ) GMOCK_PP_IF(_Final, final, ) { \
+ GMOCK_MOCKER_(_N, _Constness, _MethodName) \
+ .SetOwnerAndName(this, #_MethodName); \
+ return GMOCK_MOCKER_(_N, _Constness, _MethodName) \
+ .Invoke(GMOCK_PP_REPEAT(GMOCK_INTERNAL_FORWARD_ARG, _Signature, _N)); \
+ } \
+ ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \
+ GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_PARAMETER, _Signature, _N)) \
+ GMOCK_PP_IF(_Constness, const, ) _RefSpec { \
+ GMOCK_MOCKER_(_N, _Constness, _MethodName).RegisterOwner(this); \
+ return GMOCK_MOCKER_(_N, _Constness, _MethodName) \
+ .With(GMOCK_PP_REPEAT(GMOCK_INTERNAL_MATCHER_ARGUMENT, , _N)); \
+ } \
+ ::testing::MockSpec<GMOCK_PP_REMOVE_PARENS(_Signature)> gmock_##_MethodName( \
+ const ::testing::internal::WithoutMatchers&, \
+ GMOCK_PP_IF(_Constness, const, )::testing::internal::Function< \
+ GMOCK_PP_REMOVE_PARENS(_Signature)>*) const _RefSpec _NoexceptSpec { \
+ return ::testing::internal::ThisRefAdjuster<GMOCK_PP_IF( \
+ _Constness, const, ) int _RefSpec>::Adjust(*this) \
+ .gmock_##_MethodName(GMOCK_PP_REPEAT( \
+ GMOCK_INTERNAL_A_MATCHER_ARGUMENT, _Signature, _N)); \
+ } \
+ mutable ::testing::FunctionMocker<GMOCK_PP_REMOVE_PARENS(_Signature)> \
+ GMOCK_MOCKER_(_N, _Constness, _MethodName)
+
+#define GMOCK_INTERNAL_EXPAND(...) __VA_ARGS__  // expands to its arguments unchanged
+
+// Valid modifiers.
+#define GMOCK_INTERNAL_HAS_CONST(_Tuple) \
+ GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_CONST, ~, _Tuple))
+
+#define GMOCK_INTERNAL_HAS_OVERRIDE(_Tuple) \
+ GMOCK_PP_HAS_COMMA( \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_OVERRIDE, ~, _Tuple))
+
+#define GMOCK_INTERNAL_HAS_FINAL(_Tuple) \
+ GMOCK_PP_HAS_COMMA(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_DETECT_FINAL, ~, _Tuple))
+
+#define GMOCK_INTERNAL_GET_NOEXCEPT_SPEC(_Tuple) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT, ~, _Tuple)
+
+#define GMOCK_INTERNAL_NOEXCEPT_SPEC_IF_NOEXCEPT(_i, _, _elem) \
+ GMOCK_PP_IF( \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)), \
+ _elem, )
+
+#define GMOCK_INTERNAL_GET_CALLTYPE_SPEC(_Tuple) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE, ~, _Tuple)
+
+#define GMOCK_INTERNAL_CALLTYPE_SPEC_IF_CALLTYPE(_i, _, _elem) \
+ GMOCK_PP_IF( \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem)), \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), )
+
+#define GMOCK_INTERNAL_GET_REF_SPEC(_Tuple) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_REF_SPEC_IF_REF, ~, _Tuple)
+
+#define GMOCK_INTERNAL_REF_SPEC_IF_REF(_i, _, _elem) \
+ GMOCK_PP_IF(GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)), \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_UNPACK_, _elem), )
+
+#ifdef GMOCK_INTERNAL_STRICT_SPEC_ASSERT
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \
+ static_assert( \
+ ::testing::internal::ValidateSpec(GMOCK_PP_STRINGIZE(_elem)), \
+ "Token \'" GMOCK_PP_STRINGIZE( \
+ _elem) "\' cannot be recognized as a valid specification " \
+ "modifier. Is a ',' missing?");
+#else
+#define GMOCK_INTERNAL_ASSERT_VALID_SPEC_ELEMENT(_i, _, _elem) \
+ static_assert( \
+ (GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_REF(_i, _, _elem)) + \
+ GMOCK_PP_HAS_COMMA(GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem))) == 1, \
+ GMOCK_PP_STRINGIZE( \
+ _elem) " cannot be recognized as a valid specification modifier.");
+#endif // GMOCK_INTERNAL_STRICT_SPEC_ASSERT
+
+// Modifiers implementation.
+#define GMOCK_INTERNAL_DETECT_CONST(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CONST_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_CONST_I_const ,
+
+#define GMOCK_INTERNAL_DETECT_OVERRIDE(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_OVERRIDE_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_OVERRIDE_I_override ,
+
+#define GMOCK_INTERNAL_DETECT_FINAL(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_FINAL_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_FINAL_I_final ,
+
+#define GMOCK_INTERNAL_DETECT_NOEXCEPT(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_NOEXCEPT_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_NOEXCEPT_I_noexcept ,
+
+#define GMOCK_INTERNAL_DETECT_REF(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_REF_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_REF_I_ref ,
+
+#define GMOCK_INTERNAL_UNPACK_ref(x) x
+
+#define GMOCK_INTERNAL_DETECT_CALLTYPE(_i, _, _elem) \
+ GMOCK_PP_CAT(GMOCK_INTERNAL_DETECT_CALLTYPE_I_, _elem)
+
+#define GMOCK_INTERNAL_DETECT_CALLTYPE_I_Calltype ,
+
+#define GMOCK_INTERNAL_UNPACK_Calltype(...) __VA_ARGS__
+
+// Note: The use of `identity_t` here allows _Ret to represent return types that
+// would normally need to be specified in a different way. For example, a method
+// returning a function pointer must be written as
+//
+// fn_ptr_return_t (*method(method_args_t...))(fn_ptr_args_t...)
+//
+// But we only support placing the return type at the beginning. To handle this,
+// we wrap all calls in identity_t, so that a declaration will be expanded to
+//
+// identity_t<fn_ptr_return_t (*)(fn_ptr_args_t...)> method(method_args_t...)
+//
+// This allows us to work around the syntactic oddities of function/method
+// types.
+#define GMOCK_INTERNAL_SIGNATURE(_Ret, _Args) \
+ ::testing::internal::identity_t<GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_Ret), \
+ GMOCK_PP_REMOVE_PARENS, \
+ GMOCK_PP_IDENTITY)(_Ret)>( \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_GET_TYPE, _, _Args))
+
+#define GMOCK_INTERNAL_GET_TYPE(_i, _, _elem) \
+ GMOCK_PP_COMMA_IF(_i) \
+ GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(_elem), GMOCK_PP_REMOVE_PARENS, \
+ GMOCK_PP_IDENTITY) \
+ (_elem)
+
+#define GMOCK_INTERNAL_PARAMETER(_i, _Signature, _) \
+ GMOCK_PP_COMMA_IF(_i) \
+ GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \
+ gmock_a##_i
+
+#define GMOCK_INTERNAL_FORWARD_ARG(_i, _Signature, _) \
+ GMOCK_PP_COMMA_IF(_i) \
+ ::std::forward<GMOCK_INTERNAL_ARG_O( \
+ _i, GMOCK_PP_REMOVE_PARENS(_Signature))>(gmock_a##_i)
+
+#define GMOCK_INTERNAL_MATCHER_PARAMETER(_i, _Signature, _) \
+ GMOCK_PP_COMMA_IF(_i) \
+ GMOCK_INTERNAL_MATCHER_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature)) \
+ gmock_a##_i
+
+#define GMOCK_INTERNAL_MATCHER_ARGUMENT(_i, _1, _2) \
+ GMOCK_PP_COMMA_IF(_i) \
+ gmock_a##_i
+
+#define GMOCK_INTERNAL_A_MATCHER_ARGUMENT(_i, _Signature, _) \
+ GMOCK_PP_COMMA_IF(_i) \
+ ::testing::A<GMOCK_INTERNAL_ARG_O(_i, GMOCK_PP_REMOVE_PARENS(_Signature))>()
+
+#define GMOCK_INTERNAL_ARG_O(_i, ...) \
+ typename ::testing::internal::Function<__VA_ARGS__>::template Arg<_i>::type
+
+#define GMOCK_INTERNAL_MATCHER_O(_i, ...) \
+ const ::testing::Matcher<typename ::testing::internal::Function< \
+ __VA_ARGS__>::template Arg<_i>::type>&
+
+#define MOCK_METHOD0(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 0, __VA_ARGS__)
+#define MOCK_METHOD1(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 1, __VA_ARGS__)
+#define MOCK_METHOD2(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 2, __VA_ARGS__)
+#define MOCK_METHOD3(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 3, __VA_ARGS__)
+#define MOCK_METHOD4(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 4, __VA_ARGS__)
+#define MOCK_METHOD5(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 5, __VA_ARGS__)
+#define MOCK_METHOD6(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 6, __VA_ARGS__)
+#define MOCK_METHOD7(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 7, __VA_ARGS__)
+#define MOCK_METHOD8(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 8, __VA_ARGS__)
+#define MOCK_METHOD9(m, ...) GMOCK_INTERNAL_MOCK_METHODN(, , m, 9, __VA_ARGS__)
+#define MOCK_METHOD10(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, , m, 10, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 0, __VA_ARGS__)
+#define MOCK_CONST_METHOD1(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 1, __VA_ARGS__)
+#define MOCK_CONST_METHOD2(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 2, __VA_ARGS__)
+#define MOCK_CONST_METHOD3(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 3, __VA_ARGS__)
+#define MOCK_CONST_METHOD4(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 4, __VA_ARGS__)
+#define MOCK_CONST_METHOD5(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 5, __VA_ARGS__)
+#define MOCK_CONST_METHOD6(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 6, __VA_ARGS__)
+#define MOCK_CONST_METHOD7(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 7, __VA_ARGS__)
+#define MOCK_CONST_METHOD8(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 8, __VA_ARGS__)
+#define MOCK_CONST_METHOD9(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 9, __VA_ARGS__)
+#define MOCK_CONST_METHOD10(m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, , m, 10, __VA_ARGS__)
+
+#define MOCK_METHOD0_T(m, ...) MOCK_METHOD0(m, __VA_ARGS__)
+#define MOCK_METHOD1_T(m, ...) MOCK_METHOD1(m, __VA_ARGS__)
+#define MOCK_METHOD2_T(m, ...) MOCK_METHOD2(m, __VA_ARGS__)
+#define MOCK_METHOD3_T(m, ...) MOCK_METHOD3(m, __VA_ARGS__)
+#define MOCK_METHOD4_T(m, ...) MOCK_METHOD4(m, __VA_ARGS__)
+#define MOCK_METHOD5_T(m, ...) MOCK_METHOD5(m, __VA_ARGS__)
+#define MOCK_METHOD6_T(m, ...) MOCK_METHOD6(m, __VA_ARGS__)
+#define MOCK_METHOD7_T(m, ...) MOCK_METHOD7(m, __VA_ARGS__)
+#define MOCK_METHOD8_T(m, ...) MOCK_METHOD8(m, __VA_ARGS__)
+#define MOCK_METHOD9_T(m, ...) MOCK_METHOD9(m, __VA_ARGS__)
+#define MOCK_METHOD10_T(m, ...) MOCK_METHOD10(m, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_T(m, ...) MOCK_CONST_METHOD0(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_T(m, ...) MOCK_CONST_METHOD1(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_T(m, ...) MOCK_CONST_METHOD2(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_T(m, ...) MOCK_CONST_METHOD3(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_T(m, ...) MOCK_CONST_METHOD4(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_T(m, ...) MOCK_CONST_METHOD5(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_T(m, ...) MOCK_CONST_METHOD6(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_T(m, ...) MOCK_CONST_METHOD7(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_T(m, ...) MOCK_CONST_METHOD8(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_T(m, ...) MOCK_CONST_METHOD9(m, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_T(m, ...) MOCK_CONST_METHOD10(m, __VA_ARGS__)
+
+#define MOCK_METHOD0_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 0, __VA_ARGS__)
+#define MOCK_METHOD1_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 1, __VA_ARGS__)
+#define MOCK_METHOD2_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 2, __VA_ARGS__)
+#define MOCK_METHOD3_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 3, __VA_ARGS__)
+#define MOCK_METHOD4_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 4, __VA_ARGS__)
+#define MOCK_METHOD5_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 5, __VA_ARGS__)
+#define MOCK_METHOD6_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 6, __VA_ARGS__)
+#define MOCK_METHOD7_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 7, __VA_ARGS__)
+#define MOCK_METHOD8_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 8, __VA_ARGS__)
+#define MOCK_METHOD9_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 9, __VA_ARGS__)
+#define MOCK_METHOD10_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(, ct, m, 10, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 0, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 1, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 2, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 3, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 4, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 5, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 6, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 7, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 8, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 9, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, ...) \
+ GMOCK_INTERNAL_MOCK_METHODN(const, ct, m, 10, __VA_ARGS__)
+
+#define MOCK_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+
+#define MOCK_CONST_METHOD0_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD0_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD1_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD1_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD2_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD2_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD3_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD3_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD4_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD4_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD5_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD5_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD6_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD6_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD7_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD7_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD8_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD8_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD9_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD9_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+#define MOCK_CONST_METHOD10_T_WITH_CALLTYPE(ct, m, ...) \
+ MOCK_CONST_METHOD10_WITH_CALLTYPE(ct, m, __VA_ARGS__)
+
+#define GMOCK_INTERNAL_MOCK_METHODN(constness, ct, Method, args_num, ...) \
+ GMOCK_INTERNAL_ASSERT_VALID_SIGNATURE( \
+ args_num, ::testing::internal::identity_t<__VA_ARGS__>); \
+ GMOCK_INTERNAL_MOCK_METHOD_IMPL( \
+ args_num, Method, GMOCK_PP_NARG0(constness), 0, 0, , ct, , \
+ (::testing::internal::identity_t<__VA_ARGS__>))  // shared body of the MOCK_[CONST_]METHODn[_WITH_CALLTYPE] macros: override=0, final=0, empty noexcept and ref specs
+
+#define GMOCK_MOCKER_(arity, constness, Method) \
+ GTEST_CONCAT_TOKEN_(gmock##constness##arity##_##Method##_, __LINE__)  // identifier of the per-method FunctionMocker member; built from constness, arity, method name and __LINE__
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_FUNCTION_MOCKER_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
new file mode 100644
index 0000000000..6282901145
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h
@@ -0,0 +1,5610 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// The MATCHER* family of macros can be used in a namespace scope to
+// define custom matchers easily.
+//
+// Basic Usage
+// ===========
+//
+// The syntax
+//
+// MATCHER(name, description_string) { statements; }
+//
+// defines a matcher with the given name that executes the statements,
+// which must return a bool to indicate if the match succeeds. Inside
+// the statements, you can refer to the value being matched by 'arg',
+// and refer to its type by 'arg_type'.
+//
+// The description string documents what the matcher does, and is used
+// to generate the failure message when the match fails. Since a
+// MATCHER() is usually defined in a header file shared by multiple
+// C++ source files, we require the description to be a C-string
+// literal to avoid possible side effects. It can be empty, in which
+// case we'll use the sequence of words in the matcher name as the
+// description.
+//
+// For example:
+//
+// MATCHER(IsEven, "") { return (arg % 2) == 0; }
+//
+// allows you to write
+//
+// // Expects mock_foo.Bar(n) to be called where n is even.
+// EXPECT_CALL(mock_foo, Bar(IsEven()));
+//
+// or,
+//
+// // Verifies that the value of some_expression is even.
+// EXPECT_THAT(some_expression, IsEven());
+//
+// If the above assertion fails, it will print something like:
+//
+// Value of: some_expression
+// Expected: is even
+// Actual: 7
+//
+// where the description "is even" is automatically calculated from the
+// matcher name IsEven.
+//
+// Argument Type
+// =============
+//
+// Note that the type of the value being matched (arg_type) is
+// determined by the context in which you use the matcher and is
+// supplied to you by the compiler, so you don't need to worry about
+// declaring it (nor can you). This allows the matcher to be
+// polymorphic. For example, IsEven() can be used to match any type
+// where the value of "(arg % 2) == 0" can be implicitly converted to
+// a bool. In the "Bar(IsEven())" example above, if method Bar()
+// takes an int, 'arg_type' will be int; if it takes an unsigned long,
+// 'arg_type' will be unsigned long; and so on.
+//
+// Parameterizing Matchers
+// =======================
+//
+// Sometimes you'll want to parameterize the matcher. For that you
+// can use another macro:
+//
+// MATCHER_P(name, param_name, description_string) { statements; }
+//
+// For example:
+//
+// MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; }
+//
+// will allow you to write:
+//
+// EXPECT_THAT(Blah("a"), HasAbsoluteValue(n));
+//
+// which may lead to this message (assuming n is 10):
+//
+// Value of: Blah("a")
+// Expected: has absolute value 10
+// Actual: -9
+//
+// Note that both the matcher description and its parameter are
+// printed, making the message human-friendly.
+//
+// In the matcher definition body, you can write 'foo_type' to
+// reference the type of a parameter named 'foo'. For example, in the
+// body of MATCHER_P(HasAbsoluteValue, value) above, you can write
+// 'value_type' to refer to the type of 'value'.
+//
+// We also provide MATCHER_P2, MATCHER_P3, ..., up to MATCHER_P$n to
+// support multi-parameter matchers.
+//
+// Describing Parameterized Matchers
+// =================================
+//
+// The last argument to MATCHER*() is a string-typed expression. The
+// expression can reference all of the matcher's parameters and a
+// special bool-typed variable named 'negation'. When 'negation' is
+// false, the expression should evaluate to the matcher's description;
+// otherwise it should evaluate to the description of the negation of
+// the matcher. For example,
+//
+// using testing::PrintToString;
+//
+// MATCHER_P2(InClosedRange, low, hi,
+// std::string(negation ? "is not" : "is") + " in range [" +
+// PrintToString(low) + ", " + PrintToString(hi) + "]") {
+// return low <= arg && arg <= hi;
+// }
+// ...
+// EXPECT_THAT(3, InClosedRange(4, 6));
+// EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+// Expected: is in range [4, 6]
+// ...
+// Expected: is not in range [2, 4]
+//
+// If you specify "" as the description, the failure message will
+// contain the sequence of words in the matcher name followed by the
+// parameter values printed as a tuple. For example,
+//
+// MATCHER_P2(InClosedRange, low, hi, "") { ... }
+// ...
+// EXPECT_THAT(3, InClosedRange(4, 6));
+// EXPECT_THAT(3, Not(InClosedRange(2, 4)));
+//
+// would generate two failures that contain the text:
+//
+// Expected: in closed range (4, 6)
+// ...
+// Expected: not (in closed range (2, 4))
+//
+// Types of Matcher Parameters
+// ===========================
+//
+// For the purpose of typing, you can view
+//
+// MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... }
+//
+// as shorthand for
+//
+// template <typename p1_type, ..., typename pk_type>
+// FooMatcherPk<p1_type, ..., pk_type>
+// Foo(p1_type p1, ..., pk_type pk) { ... }
+//
+// When you write Foo(v1, ..., vk), the compiler infers the types of
+// the parameters v1, ..., and vk for you. If you are not happy with
+// the result of the type inference, you can specify the types by
+// explicitly instantiating the template, as in Foo<long, bool>(5,
+// false). As said earlier, you don't get to (or need to) specify
+// 'arg_type' as that's determined by the context in which the matcher
+// is used. You can assign the result of expression Foo(p1, ..., pk)
+// to a variable of type FooMatcherPk<p1_type, ..., pk_type>. This
+// can be useful when composing matchers.
+//
+// While you can instantiate a matcher template with reference types,
+// passing the parameters by pointer usually makes your code more
+// readable. If, however, you still want to pass a parameter by
+// reference, be aware that in the failure message generated by the
+// matcher you will see the value of the referenced object but not its
+// address.
+//
+// Explaining Match Results
+// ========================
+//
+// Sometimes the matcher description alone isn't enough to explain why
+// the match has failed or succeeded. For example, when expecting a
+// long string, it can be very helpful to also print the diff between
+// the expected string and the actual one. To achieve that, you can
+// optionally stream additional information to a special variable
+// named result_listener, whose type is a pointer to class
+// MatchResultListener:
+//
+// MATCHER_P(EqualsLongString, str, "") {
+// if (arg == str) return true;
+//
+// *result_listener << "the difference: "
+// << DiffStrings(str, arg);
+// return false;
+// }
+//
+// Overloading Matchers
+// ====================
+//
+// You can overload matchers with different numbers of parameters:
+//
+// MATCHER_P(Blah, a, description_string1) { ... }
+// MATCHER_P2(Blah, a, b, description_string2) { ... }
+//
+// Caveats
+// =======
+//
+// When defining a new matcher, you should also consider implementing
+// MatcherInterface or using MakePolymorphicMatcher(). These
+// approaches require more work than the MATCHER* macros, but also
+// give you more control on the types of the value being matched and
+// the matcher parameters, which may lead to better compiler error
+// messages when the matcher is used incorrectly. They also allow
+// overloading matchers based on parameter types (as opposed to just
+// based on the number of parameters).
+//
+// MATCHER*() can only be used in a namespace scope as templates cannot be
+// declared inside of a local class.
+//
+// More Information
+// ================
+//
+// To learn more about using these macros, please search for 'MATCHER'
+// on
+// https://github.com/google/googletest/blob/master/docs/gmock_cook_book.md
+//
+// This file also implements some commonly used argument matchers. More
+// matchers can be defined by the user implementing the
+// MatcherInterface<T> interface if necessary.
+//
+// See googletest/include/gtest/gtest-matchers.h for the definition of class
+// Matcher, class MatcherInterface, and others.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
+
+#include <algorithm>
+#include <cmath>
+#include <initializer_list>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <ostream> // NOLINT
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gmock/internal/gmock-pp.h"
+#include "gtest/gtest.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GMOCK_MAYBE_5046_ 5046
+#else
+#define GMOCK_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4251 GMOCK_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+ clients of class B */
+ /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+// 1. a class FooMatcherImpl that implements the
+// MatcherInterface<T> interface, and
+// 2. a factory function that creates a Matcher<T> object from a
+// FooMatcherImpl*.
+//
+// The two-level delegation design makes it possible to allow a user
+// to write "v" instead of "Eq(v)" where a Matcher is expected, which
+// is impossible if we pass matchers by pointers. It also eases
+// ownership management as Matcher objects can now be copied like
+// plain values.
+
+// A match result listener that stores the explanation in a string.
+class StringMatchResultListener : public MatchResultListener {
+ public:
+  // Hands the address of the internal buffer to the MatchResultListener
+  // base, so explanation text streamed to this listener lands in that buffer.
+  StringMatchResultListener() : MatchResultListener(&buffer_) {}
+
+  // Gives back a copy of all explanation text collected up to now.
+  std::string str() const {
+    return buffer_.str();
+  }
+
+  // Discards the explanation text collected up to now.
+  void Clear() {
+    buffer_.str("");
+  }
+
+ private:
+  // In-memory sink for the explanation text.
+  ::std::stringstream buffer_;
+
+  // Copying is disabled.
+  StringMatchResultListener(const StringMatchResultListener&) = delete;
+  StringMatchResultListener& operator=(const StringMatchResultListener&) =
+      delete;
+};
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// The MatcherCastImpl class template is a helper for implementing
+// MatcherCast(). We need this helper in order to partially
+// specialize the implementation of MatcherCast() (C++ allows
+// class/struct templates to be partially specialized, but not
+// function templates.).
+
+// This general version is used when MatcherCast()'s argument is a
+// polymorphic matcher (i.e. something that can be converted to a
+// Matcher but is not one yet; for example, Eq(value)) or a value (for
+// example, "hello").
+template <typename T, typename M>
+class MatcherCastImpl {
+ public:
+ // Converts polymorphic_matcher_or_value into a Matcher<T>. The actual work
+ // is done by one of the private CastImpl overloads, selected at compile
+ // time from two std::is_convertible results passed as tag arguments.
+ static Matcher<T> Cast(const M& polymorphic_matcher_or_value) {
+ // M can be a polymorphic matcher, in which case we want to use
+ // its conversion operator to create Matcher<T>. Or it can be a value
+ // that should be passed to the Matcher<T>'s constructor.
+ //
+ // We can't call Matcher<T>(polymorphic_matcher_or_value) when M is a
+ // polymorphic matcher because it'll be ambiguous if T has an implicit
+ // constructor from M (this usually happens when T has an implicit
+ // constructor from any type).
+ //
+ // It won't work to unconditionally implicit_cast
+ // polymorphic_matcher_or_value to Matcher<T> because it won't trigger
+ // a user-defined conversion from M to T if one exists (assuming M is
+ // a value).
+ //
+ // First tag: is M convertible to Matcher<T>? Second tag: is M
+ // convertible to T?
+ return CastImpl(polymorphic_matcher_or_value,
+ std::is_convertible<M, Matcher<T>>{},
+ std::is_convertible<M, T>{});
+ }
+
+ private:
+ // Overload taken when the first tag is true_type. The second tag is
+ // accepted with either boolean value (template parameter Ignore) and is
+ // not inspected.
+ template <bool Ignore>
+ static Matcher<T> CastImpl(const M& polymorphic_matcher_or_value,
+ std::true_type /* convertible_to_matcher */,
+ std::integral_constant<bool, Ignore>) {
+ // M is implicitly convertible to Matcher<T>, which means that either
+ // M is a polymorphic matcher or Matcher<T> has an implicit constructor
+ // from M. In both cases using the implicit conversion will produce a
+ // matcher.
+ //
+ // Even if T has an implicit constructor from M, it won't be called because
+ // creating Matcher<T> would require a chain of two user-defined conversions
+ // (first to create T from M and then to create Matcher<T> from T).
+ return polymorphic_matcher_or_value;
+ }
+
+ // M can't be implicitly converted to Matcher<T>, so M isn't a polymorphic
+ // matcher. It's a value of a type implicitly convertible to T. Use direct
+ // initialization to create a matcher.
+ static Matcher<T> CastImpl(const M& value,
+ std::false_type /* convertible_to_matcher */,
+ std::true_type /* convertible_to_T */) {
+ // The value is first converted to T, then passed to Matcher<T>'s
+ // constructor.
+ return Matcher<T>(ImplicitCast_<T>(value));
+ }
+
+ // M can't be implicitly converted to either Matcher<T> or T. Attempt to use
+ // polymorphic matcher Eq(value) in this case.
+ //
+ // Note that we first attempt to perform an implicit cast on the value and
+ // only fall back to the polymorphic Eq() matcher afterwards because the
+ // latter calls bool operator==(const Lhs& lhs, const Rhs& rhs) in the end
+ // which might be undefined even when Rhs is implicitly convertible to Lhs
+ // (e.g. std::pair<const int, int> vs. std::pair<int, int>).
+ //
+ // We don't define this method inline as we need the declaration of Eq().
+ // (Declaration only here; the definition is not part of this class body.)
+ static Matcher<T> CastImpl(const M& value,
+ std::false_type /* convertible_to_matcher */,
+ std::false_type /* convertible_to_T */);
+};
+
+// This more specialized version is used when MatcherCast()'s argument
+// is already a Matcher. This only compiles when type T can be
+// statically converted to type U.
+template <typename T, typename U>
+class MatcherCastImpl<T, Matcher<U>> {
+ public:
+ // Builds a Matcher<T> around a heap-allocated Impl that stores its own copy
+ // of source_matcher.
+ static Matcher<T> Cast(const Matcher<U>& source_matcher) {
+ return Matcher<T>(new Impl(source_matcher));
+ }
+
+ private:
+ // Adapter exposing a Matcher<U> through the MatcherInterface<T> interface.
+ class Impl : public MatcherInterface<T> {
+ public:
+ explicit Impl(const Matcher<U>& source_matcher)
+ : source_matcher_(source_matcher) {}
+
+ // We delegate the matching logic to the source matcher.
+ bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+ // FromType / ToType: T and U with any reference, one level of pointer,
+ // and cv-qualifiers stripped, so the class types can be compared.
+ using FromType = typename std::remove_cv<typename std::remove_pointer<
+ typename std::remove_reference<T>::type>::type>::type;
+ using ToType = typename std::remove_cv<typename std::remove_pointer<
+ typename std::remove_reference<U>::type>::type>::type;
+ // Do not allow implicitly converting base*/& to derived*/&.
+ static_assert(
+ // Do not trigger if only one of them is a pointer. That implies a
+ // regular conversion and not a down_cast.
+ (std::is_pointer<typename std::remove_reference<T>::type>::value !=
+ std::is_pointer<typename std::remove_reference<U>::type>::value) ||
+ std::is_same<FromType, ToType>::value ||
+ !std::is_base_of<FromType, ToType>::value,
+ "Can't implicitly convert from <base> to <derived>");
+
+ // Do the cast to `U` explicitly if necessary.
+ // Otherwise, let implicit conversions do the trick.
+ // CastType is T& when T& converts implicitly to const U&, else U.
+ using CastType =
+ typename std::conditional<std::is_convertible<T&, const U&>::value,
+ T&, U>::type;
+
+ return source_matcher_.MatchAndExplain(static_cast<CastType>(x),
+ listener);
+ }
+
+ // Descriptions are forwarded unchanged to the wrapped matcher.
+ void DescribeTo(::std::ostream* os) const override {
+ source_matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ source_matcher_.DescribeNegationTo(os);
+ }
+
+ private:
+ // Private copy of the matcher being adapted.
+ const Matcher<U> source_matcher_;
+ };
+};
+
+// This even more specialized version is used for efficiently casting
+// a matcher to its own type.
+template <typename T>
+class MatcherCastImpl<T, Matcher<T>> {
+ public:
+  // Source and target matcher types coincide, so the argument is simply
+  // returned (as a copy); no adapter object is created.
+  static Matcher<T> Cast(const Matcher<T>& matcher) {
+    return matcher;
+  }
+};
+
+// Primary template, for matchers that take no parameters.
+template <typename Derived>
+class MatcherBaseImpl {
+ public:
+  MatcherBaseImpl() = default;
+
+  // Implicit conversion to Matcher<T>: allocates a default-constructed
+  // Derived::gmock_Impl<T> and wraps it.
+  template <typename T>
+  operator ::testing::Matcher<T>() const {  // NOLINT(runtime/explicit)
+    using Impl = typename Derived::template gmock_Impl<T>;
+    return ::testing::Matcher<T>(new Impl());
+  }
+};
+
+// Template specialization for Matcher with parameters.
+template <template <typename...> class Derived, typename... Ts>
+class MatcherBaseImpl<Derived<Ts...>> {
+ public:
+ // Mark the constructor explicit for single argument T to avoid implicit
+ // conversions.
+ // This overload participates only when sizeof...(Ts) == 1 (enable_if).
+ template <typename E = std::enable_if<sizeof...(Ts) == 1>,
+ typename E::type* = nullptr>
+ explicit MatcherBaseImpl(Ts... params)
+ : params_(std::forward<Ts>(params)...) {}
+ // Non-explicit counterpart, enabled for every other parameter count.
+ template <typename E = std::enable_if<sizeof...(Ts) != 1>,
+ typename = typename E::type>
+ MatcherBaseImpl(Ts... params) // NOLINT
+ : params_(std::forward<Ts>(params)...) {}
+
+ // Implicit conversion to Matcher<F>; expands the stored parameters via an
+ // index sequence covering all of Ts.
+ template <typename F>
+ operator ::testing::Matcher<F>() const { // NOLINT(runtime/explicit)
+ return Apply<F>(MakeIndexSequence<sizeof...(Ts)>{});
+ }
+
+ private:
+ // Allocates Derived<Ts...>::gmock_Impl<F>, passing each stored parameter
+ // (std::get<i>(params_)) as a constructor argument, and wraps it.
+ template <typename F, std::size_t... tuple_ids>
+ ::testing::Matcher<F> Apply(IndexSequence<tuple_ids...>) const {
+ return ::testing::Matcher<F>(
+ new typename Derived<Ts...>::template gmock_Impl<F>(
+ std::get<tuple_ids>(params_)...));
+ }
+
+ // The matcher parameters captured at construction time.
+ const std::tuple<Ts...> params_;
+};
+
+} // namespace internal
+
+// In order to be safe and clear, casting between different matcher
+// types is done explicitly via MatcherCast<T>(m), which takes a
+// matcher m and returns a Matcher<T>. It compiles only when T can be
+// statically converted to the argument type of m.
+template <typename T, typename M>
+inline Matcher<T> MatcherCast(const M& matcher) {
+  // The <T, M> pair selects which MatcherCastImpl specialization does the
+  // conversion.
+  using Caster = internal::MatcherCastImpl<T, M>;
+  return Caster::Cast(matcher);
+}
+
+// This overload handles polymorphic matchers and values only since
+// monomorphic matchers are handled by the next one.
+template <typename T, typename M>
+inline Matcher<T> SafeMatcherCast(const M& polymorphic_matcher_or_value) {
+  // Nothing extra is checked for polymorphic matchers or plain values: the
+  // argument goes straight to MatcherCast.
+  return MatcherCast<T, M>(polymorphic_matcher_or_value);
+}
+
+// This overload handles monomorphic matchers.
+//
+// In general, if type T can be implicitly converted to type U, we can
+// safely convert a Matcher<U> to a Matcher<T> (i.e. Matcher is
+// contravariant): just keep a copy of the original Matcher<U>, convert the
+// argument from type T to U, and then pass it to the underlying Matcher<U>.
+// The only exception is when U is a reference and T is not, as the
+// underlying Matcher<U> may be interested in the argument's address, which
+// is not preserved in the conversion from T to U.
+template <typename T, typename U>
+inline Matcher<T> SafeMatcherCast(const Matcher<U>& matcher) {
+  // T has to be implicitly convertible to U.
+  static_assert(std::is_convertible<const T&, const U&>::value,
+                "T must be implicitly convertible to U");
+  // A reference-typed U is allowed only together with a reference-typed T.
+  static_assert(!std::is_reference<U>::value || std::is_reference<T>::value,
+                "cannot convert non reference arg to reference");
+  // If neither underlying type is classified as kOther (i.e. both are
+  // arithmetic), the conversion between them must be lossless.
+  using RawT = GTEST_REMOVE_REFERENCE_AND_CONST_(T);
+  using RawU = GTEST_REMOVE_REFERENCE_AND_CONST_(U);
+  constexpr bool kEitherIsOther = GMOCK_KIND_OF_(RawT) == internal::kOther ||
+                                  GMOCK_KIND_OF_(RawU) == internal::kOther;
+  static_assert(
+      kEitherIsOther ||
+          (internal::LosslessArithmeticConvertible<RawT, RawU>::value),
+      "conversion of arithmetic types must be lossless");
+  return MatcherCast<T>(matcher);
+}
+
+// A<T>() returns a matcher that matches any value of type T.
+template <typename T>
+Matcher<T> A();
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// If the explanation is not empty, prints it to the ostream.
+inline void PrintIfNotEmpty(const std::string& explanation,
+                            ::std::ostream* os) {
+  // Nothing is written without text or without a destination stream.
+  if (explanation.empty() || os == nullptr) return;
+  *os << ", " << explanation;
+}
+
+// Returns true if the given type name is easy to read by a human.
+// This is used to decide whether printing the type of a value might
+// be helpful.
+inline bool IsReadableTypeName(const std::string& type_name) {
+  // Names of at most 20 characters always count as readable.
+  if (type_name.length() <= 20) return true;
+  // Longer names count only if they contain neither '<' (template) nor '('
+  // (function type).
+  return type_name.find_first_of("<(") == std::string::npos;
+}
+
+// Matches the value against the given matcher, prints the value and explains
+// the match result to the listener. Returns the match result.
+// 'listener' must not be NULL.
+// Value cannot be passed by const reference, because some matchers take a
+// non-const argument.
+template <typename Value, typename T>
+bool MatchPrintAndExplain(Value& value, const Matcher<T>& matcher,
+                          MatchResultListener* listener) {
+  // An uninterested listener needs no explanation text, so only the plain
+  // match result is computed.
+  if (!listener->IsInterested()) return matcher.Matches(value);
+
+  // Run the matcher with a private listener so that its explanation can be
+  // appended after the printed value.
+  StringMatchResultListener explanation;
+  const bool matched = matcher.MatchAndExplain(value, &explanation);
+
+  UniversalPrint(value, listener->stream());
+#if GTEST_HAS_RTTI
+  // Mention the value's type when its name is considered readable.
+  const std::string& type_name = GetTypeName<Value>();
+  if (IsReadableTypeName(type_name)) {
+    *listener->stream() << " (of type " << type_name << ")";
+  }
+#endif
+  PrintIfNotEmpty(explanation.str(), listener->stream());
+
+  return matched;
+}
+
+// An internal helper class for doing compile-time loop on a tuple's
+// fields.
+template <size_t N>
+class TuplePrefix {
+ public:
+ // TuplePrefix<N>::Matches(matcher_tuple, value_tuple) returns true
+ // if and only if the first N fields of matcher_tuple match
+ // the first N fields of value_tuple, respectively.
+ template <typename MatcherTuple, typename ValueTuple>
+ static bool Matches(const MatcherTuple& matcher_tuple,
+ const ValueTuple& value_tuple) {
+ // Recurse on the first N - 1 fields, then test field N - 1; the && stops
+ // at the first field that fails.
+ return TuplePrefix<N - 1>::Matches(matcher_tuple, value_tuple) &&
+ std::get<N - 1>(matcher_tuple).Matches(std::get<N - 1>(value_tuple));
+ }
+
+ // TuplePrefix<N>::ExplainMatchFailuresTo(matchers, values, os)
+ // describes failures in matching the first N fields of matchers
+ // against the first N fields of values. If there is no failure,
+ // nothing will be streamed to os.
+ template <typename MatcherTuple, typename ValueTuple>
+ static void ExplainMatchFailuresTo(const MatcherTuple& matchers,
+ const ValueTuple& values,
+ ::std::ostream* os) {
+ // First, describes failures in the first N - 1 fields.
+ TuplePrefix<N - 1>::ExplainMatchFailuresTo(matchers, values, os);
+
+ // Then describes the failure (if any) in the (N - 1)-th (0-based)
+ // field.
+ // 'matcher' is a local copy of the (N - 1)-th matcher.
+ typename std::tuple_element<N - 1, MatcherTuple>::type matcher =
+ std::get<N - 1>(matchers);
+ typedef typename std::tuple_element<N - 1, ValueTuple>::type Value;
+ const Value& value = std::get<N - 1>(values);
+ StringMatchResultListener listener;
+ if (!matcher.MatchAndExplain(value, &listener)) {
+ *os << " Expected arg #" << N - 1 << ": ";
+ std::get<N - 1>(matchers).DescribeTo(os);
+ *os << "\n Actual: ";
+ // We remove the reference in type Value to prevent the
+ // universal printer from printing the address of value, which
+ // isn't interesting to the user most of the time. The
+ // matcher's MatchAndExplain() method handles the case when
+ // the address is interesting.
+ internal::UniversalPrint(value, os);
+ // Append the matcher's own explanation, if it produced one.
+ PrintIfNotEmpty(listener.str(), os);
+ *os << "\n";
+ }
+ }
+};
+
+// The base case.
+template <>
+class TuplePrefix<0> {
+ public:
+ // An empty prefix always matches; this ends the recursion in
+ // TuplePrefix<N>::Matches.
+ template <typename MatcherTuple, typename ValueTuple>
+ static bool Matches(const MatcherTuple& /* matcher_tuple */,
+ const ValueTuple& /* value_tuple */) {
+ return true;
+ }
+
+ // An empty prefix has no failures to report, so nothing is written.
+ template <typename MatcherTuple, typename ValueTuple>
+ static void ExplainMatchFailuresTo(const MatcherTuple& /* matchers */,
+ const ValueTuple& /* values */,
+ ::std::ostream* /* os */) {}
+};
+
+// TupleMatches(matcher_tuple, value_tuple) returns true if and only if
+// all matchers in matcher_tuple match the corresponding fields in
+// value_tuple. It is a compiler error if matcher_tuple and
+// value_tuple have different numbers of fields or incompatible field
+// types.
+template <typename MatcherTuple, typename ValueTuple>
+bool TupleMatches(const MatcherTuple& matcher_tuple,
+                  const ValueTuple& value_tuple) {
+  constexpr size_t kMatcherCount = std::tuple_size<MatcherTuple>::value;
+  constexpr size_t kValueCount = std::tuple_size<ValueTuple>::value;
+  // The two tuples must have the same arity; a mismatch is rejected at
+  // compile time.
+  static_assert(kMatcherCount == kValueCount,
+                "matcher and value have different numbers of fields");
+  // Check every field by treating the whole tuple as its own prefix.
+  return TuplePrefix<kValueCount>::Matches(matcher_tuple, value_tuple);
+}
+
+// Describes failures in matching matchers against values. If there
+// is no failure, nothing will be streamed to os.
+template <typename MatcherTuple, typename ValueTuple>
+void ExplainMatchFailureTupleTo(const MatcherTuple& matchers,
+                                const ValueTuple& values, ::std::ostream* os) {
+  // Cover all fields: the prefix length equals the matcher tuple's size.
+  using AllFields = TuplePrefix<std::tuple_size<MatcherTuple>::value>;
+  AllFields::ExplainMatchFailuresTo(matchers, values, os);
+}
+
+// TransformTupleValues and its helper.
+//
+// TransformTupleValuesHelper hides the internal machinery that
+// TransformTupleValues uses to implement a tuple traversal.
+template <typename Tuple, typename Func, typename OutIter>
+class TransformTupleValuesHelper {
+ private:
+  using TupleSize = ::std::tuple_size<Tuple>;
+
+ public:
+  // Visits the elements of 't' from first to last, performing
+  // '*out++ = f(element)' for each one. Returns 'out' as it stands after the
+  // last write, in case the caller needs it.
+  static OutIter Run(Func f, const Tuple& t, OutIter out) {
+    IterateOverTuple<Tuple, TupleSize::value> walker;
+    return walker(f, t, out);
+  }
+
+ private:
+  // Recursive step: kRemainingSize elements are still unvisited, so the
+  // element to handle now sits at index (tuple size - kRemainingSize).
+  template <typename Tup, size_t kRemainingSize>
+  struct IterateOverTuple {
+    OutIter operator()(Func f, const Tup& t, OutIter out) const {
+      constexpr size_t kIndex = TupleSize::value - kRemainingSize;
+      *out++ = f(::std::get<kIndex>(t));
+      IterateOverTuple<Tup, kRemainingSize - 1> rest;
+      return rest(f, t, out);
+    }
+  };
+  // Terminal step: nothing left to visit, hand the iterator back.
+  template <typename Tup>
+  struct IterateOverTuple<Tup, 0> {
+    OutIter operator()(Func /* f */, const Tup& /* t */, OutIter out) const {
+      return out;
+    }
+  };
+};
+
+// Successively invokes 'f(element)' on each element of the tuple 't',
+// appending each result to the 'out' iterator. Returns the final value
+// of 'out'.
+template <typename Tuple, typename Func, typename OutIter>
+OutIter TransformTupleValues(Func f, const Tuple& t, OutIter out) {
+  // The traversal itself is implemented by TransformTupleValuesHelper.
+  using Helper = TransformTupleValuesHelper<Tuple, Func, OutIter>;
+  return Helper::Run(f, t, out);
+}
+
+// Implements _, a matcher that matches any value of any
+// type. This is a polymorphic matcher, so we need a template type
+// conversion operator to make it appear as a Matcher<T> for any
+// type T.
+class AnythingMatcher {
+ public:
+  using is_gtest_matcher = void;
+
+  // Accepts every value of every type; neither the value nor the stream is
+  // looked at.
+  template <typename T>
+  bool MatchAndExplain(const T& /* x */, std::ostream* /* listener */) const {
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "is anything";
+  }
+
+  // Present mostly for completeness: negating a match-all matcher, as in
+  // Not(A<bool>()), is rarely useful, but it cannot be ruled out and being
+  // prepared costs nothing.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "never matches";
+  }
+};
+
+// Implements the polymorphic IsNull() matcher, which matches any raw or smart
+// pointer that is NULL.
+class IsNullMatcher {
+ public:
+  // Matches when 'p' compares equal to nullptr. The listener is unused.
+  template <typename Pointer>
+  bool MatchAndExplain(const Pointer& p,
+                       MatchResultListener* /* listener */) const {
+    const bool is_null = (p == nullptr);
+    return is_null;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is NULL";
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "isn't NULL";
+  }
+};
+
+// Implements the polymorphic NotNull() matcher, which matches any raw or smart
+// pointer that is not NULL.
+class NotNullMatcher {
+ public:
+  // Matches when 'p' compares unequal to nullptr. The listener is unused.
+  template <typename Pointer>
+  bool MatchAndExplain(const Pointer& p,
+                       MatchResultListener* /* listener */) const {
+    const bool is_set = (p != nullptr);
+    return is_set;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "isn't NULL";
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is NULL";
+  }
+};
+
+// Ref(variable) matches any argument that is a reference to
+// 'variable'. This matcher is polymorphic as it can match any
+// super type of the type of 'variable'.
+//
+// The RefMatcher template class implements Ref(variable). It can
+// only be instantiated with a reference type. This prevents a user
+// from mistakenly using Ref(x) to match a non-reference function
+// argument. For example, the following will righteously cause a
+// compiler error:
+//
+// int n;
+// Matcher<int> m1 = Ref(n); // This won't compile.
+// Matcher<int&> m2 = Ref(n); // This will compile.
+// Declared but never defined for non-reference T; only the T& partial
+// specialization below has a definition.
+template <typename T>
+class RefMatcher;
+
+template <typename T>
+class RefMatcher<T&> {
+ // Google Mock is a generic framework and thus needs to support
+ // mocking any function types, including those that take non-const
+ // reference arguments. Therefore the template parameter T (and
+ // Super below) can be instantiated to either a const type or a
+ // non-const type.
+ public:
+ // RefMatcher() takes a T& instead of const T&, as we want the
+ // compiler to catch using Ref(const_value) as a matcher for a
+ // non-const reference.
+ explicit RefMatcher(T& x) : object_(x) {} // NOLINT
+
+ // Implicit conversion to a matcher for Super& arguments.
+ template <typename Super>
+ operator Matcher<Super&>() const {
+ // By passing object_ (type T&) to Impl(), which expects a Super&,
+ // we make sure that Super is a super type of T. In particular,
+ // this catches using Ref(const_value) as a matcher for a
+ // non-const reference, as you cannot implicitly convert a const
+ // reference to a non-const reference.
+ return MakeMatcher(new Impl<Super>(object_));
+ }
+
+ private:
+ // Monomorphic implementation for arguments of type Super&.
+ template <typename Super>
+ class Impl : public MatcherInterface<Super&> {
+ public:
+ explicit Impl(Super& x) : object_(x) {} // NOLINT
+
+ // MatchAndExplain() takes a Super& (as opposed to const Super&)
+ // in order to match the interface MatcherInterface<Super&>.
+ bool MatchAndExplain(Super& x,
+ MatchResultListener* listener) const override {
+ // The argument's address is always reported; the match itself is an
+ // address comparison, not a value comparison.
+ *listener << "which is located @" << static_cast<const void*>(&x);
+ return &x == &object_;
+ }
+
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "references the variable ";
+ UniversalPrinter<Super&>::Print(object_, os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "does not reference the variable ";
+ UniversalPrinter<Super&>::Print(object_, os);
+ }
+
+ private:
+ // The variable whose identity is matched; held by reference, not copied.
+ const Super& object_;
+ };
+
+ // The variable passed to the constructor; held by reference, not copied.
+ T& object_;
+};
+
+// Polymorphic helper functions for narrow and wide string matchers.
+inline bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
+  // Narrow-character overload: forwards to the String helper of the same
+  // name.
+  const bool same = String::CaseInsensitiveCStringEquals(lhs, rhs);
+  return same;
+}
+
+inline bool CaseInsensitiveCStringEquals(const wchar_t* lhs,
+                                         const wchar_t* rhs) {
+  // Wide-character overload: forwards to the wide-string String helper.
+  const bool same = String::CaseInsensitiveWideCStringEquals(lhs, rhs);
+  return same;
+}
+
+// String comparison for narrow or wide strings that can have embedded NUL
+// characters.
+template <typename StringType>
+bool CaseInsensitiveStringEquals(const StringType& s1, const StringType& s2) {
+  // Compare the leading segments first; the C-string comparison is handed
+  // c_str(), i.e. the text up to the first NUL.
+  if (!CaseInsensitiveCStringEquals(s1.c_str(), s2.c_str())) return false;
+
+  // Locate the first embedded NUL (if any) in each string.
+  const typename StringType::value_type kNul = 0;
+  const size_t end1 = s1.find(kNul);
+  const size_t end2 = s2.find(kNul);
+
+  // If either string has no embedded NUL, the strings are equal only when
+  // neither has one.
+  const bool exhausted1 = (end1 == StringType::npos);
+  const bool exhausted2 = (end2 == StringType::npos);
+  if (exhausted1 || exhausted2) return end1 == end2;
+
+  // Both continue past a NUL: compare what follows it, recursively.
+  return CaseInsensitiveStringEquals(s1.substr(end1 + 1), s2.substr(end2 + 1));
+}
+
+// String matchers.
+
+// Implements equality-based string matchers like StrEq, StrCaseNe, etc.
+template <typename StringType>
+class StrEqualityMatcher {
+ public:
+  // 'str' is the reference string. 'expect_eq' chooses between the equality
+  // and inequality flavours, 'case_sensitive' between exact and
+  // case-insensitive comparison.
+  StrEqualityMatcher(StringType str, bool expect_eq, bool case_sensitive)
+      : string_(std::move(str)),
+        expect_eq_(expect_eq),
+        case_sensitive_(case_sensitive) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  // StringView overload: converts to std::string and reuses the generic
+  // overload.
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& converted = std::string(s);
+    return MatchAndExplain(converted, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Pointer overload, in particular for:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  // A null pointer satisfies only the inequality flavour.
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    if (s != nullptr) {
+      return MatchAndExplain(StringType(s), listener);
+    }
+    return !expect_eq_;
+  }
+
+  // Generic overload for anything convertible to StringType.
+  //
+  // It is a template rather than a plain function taking const StringType&
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType candidate(s);
+    bool equal;
+    if (case_sensitive_) {
+      equal = (candidate == string_);
+    } else {
+      equal = CaseInsensitiveStringEquals(candidate, string_);
+    }
+    return equal == expect_eq_;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    DescribeToHelper(expect_eq_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    DescribeToHelper(!expect_eq_, os);
+  }
+
+ private:
+  // Writes "is equal to"/"isn't equal to", an optional "(ignoring case)"
+  // note, and then the reference string.
+  void DescribeToHelper(bool expect_eq, ::std::ostream* os) const {
+    *os << (expect_eq ? "is " : "isn't ") << "equal to ";
+    if (!case_sensitive_) *os << "(ignoring case) ";
+    UniversalPrint(string_, os);
+  }
+
+  const StringType string_;
+  const bool expect_eq_;
+  const bool case_sensitive_;
+};
+
+// Implements the polymorphic HasSubstr(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class HasSubstrMatcher {
+ public:
+  explicit HasSubstrMatcher(const StringType& substring)
+      : substring_(substring) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  // StringView overload: converts to std::string and reuses the generic
+  // overload.
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& converted = std::string(s);
+    return MatchAndExplain(converted, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Pointer overload, in particular for:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  // A null pointer never matches.
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    if (s == nullptr) return false;
+    return MatchAndExplain(StringType(s), listener);
+  }
+
+  // Generic overload for anything convertible to StringType.
+  //
+  // It is a template rather than a plain function taking const StringType&
+  // because StringView has some interfering non-explicit constructors.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType haystack(s);
+    return haystack.find(substring_) != StringType::npos;
+  }
+
+  // Describes what this matcher matches.
+  void DescribeTo(::std::ostream* os) const {
+    *os << "has substring ";
+    UniversalPrint(substring_, os);
+  }
+
+  // Describes what this matcher does not match.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "has no substring ";
+    UniversalPrint(substring_, os);
+  }
+
+ private:
+  const StringType substring_;
+};
+
+// Implements the polymorphic StartsWith(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class StartsWithMatcher {
+ public:
+  // 'prefix' is copied and kept for the lifetime of the matcher.
+  explicit StartsWithMatcher(const StringType& prefix) : prefix_(prefix) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  // StringView overload: converts to std::string and reuses the generic
+  // overload.
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  // A null pointer never matches.
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  //
+  // Returns true when the converted string is at least as long as the
+  // prefix and its first prefix_.length() characters equal the prefix.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType& s2(s);
+    // compare() checks the leading characters in place, so no temporary
+    // substring is built for each match attempt.
+    return s2.length() >= prefix_.length() &&
+           s2.compare(0, prefix_.length(), prefix_) == 0;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "starts with ";
+    UniversalPrint(prefix_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't start with ";
+    UniversalPrint(prefix_, os);
+  }
+
+ private:
+  const StringType prefix_;
+};
+
+// Implements the polymorphic EndsWith(substring) matcher, which
+// can be used as a Matcher<T> as long as T can be converted to a
+// string.
+template <typename StringType>
+class EndsWithMatcher {
+ public:
+  // 'suffix' is copied and kept for the lifetime of the matcher.
+  explicit EndsWithMatcher(const StringType& suffix) : suffix_(suffix) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  // StringView overload: converts to std::string and reuses the generic
+  // overload.
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    // This should fail to compile if StringView is used with wide
+    // strings.
+    const StringType& str = std::string(s);
+    return MatchAndExplain(str, listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  // A null pointer never matches.
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(StringType(s), listener);
+  }
+
+  // Matches anything that can convert to StringType.
+  //
+  // This is a template, not just a plain function with const StringType&,
+  // because StringView has some interfering non-explicit constructors.
+  //
+  // Returns true when the converted string is at least as long as the
+  // suffix and its last suffix_.length() characters equal the suffix.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const StringType& s2(s);
+    // The length check comes first, so the start offset below cannot
+    // underflow. compare() checks the trailing characters in place, so no
+    // temporary substring is built for each match attempt.
+    return s2.length() >= suffix_.length() &&
+           s2.compare(s2.length() - suffix_.length(), suffix_.length(),
+                      suffix_) == 0;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "ends with ";
+    UniversalPrint(suffix_, os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't end with ";
+    UniversalPrint(suffix_, os);
+  }
+
+ private:
+  const StringType suffix_;
+};
+
+// Implements the polymorphic WhenBase64Unescaped(matcher) matcher, which can be
+// used as a Matcher<T> as long as T can be converted to a string.
+class WhenBase64UnescapedMatcher {
+ public:
+  using is_gtest_matcher = void;
+
+  // 'internal_matcher' is the matcher applied to the unescaped text.
+  explicit WhenBase64UnescapedMatcher(
+      const Matcher<const std::string&>& internal_matcher)
+      : internal_matcher_(internal_matcher) {}
+
+  // Accepts anything convertible to std::string. The input is passed through
+  // Base64Unescape; on success the result goes to the inner matcher, on
+  // failure the match fails and the listener (if any) is told why.
+  template <typename MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* listener) const {
+    const std::string escaped(s);  // NOLINT (needed for working with string_view).
+    std::string unescaped;
+    const bool decoded = internal::Base64Unescape(escaped, &unescaped);
+    if (decoded) {
+      return MatchPrintAndExplain(unescaped, internal_matcher_, listener);
+    }
+    if (listener != nullptr) {
+      *listener << "is not a valid base64 escaped string";
+    }
+    return false;
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "matches after Base64Unescape ";
+    internal_matcher_.DescribeTo(os);
+  }
+
+  // The inner matcher's positive description follows the negated lead-in.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "does not match after Base64Unescape ";
+    internal_matcher_.DescribeTo(os);
+  }
+
+ private:
+  const Matcher<const std::string&> internal_matcher_;
+};
+
+// Implements a matcher that compares the two fields of a 2-tuple
+// using one of the ==, <=, <, etc, operators. The two fields being
+// compared don't have to have the same type.
+//
+// The matcher defined here is polymorphic (for example, Eq() can be
+// used to match a std::tuple<int, short>, a std::tuple<const long&, double>,
+// etc). Therefore we use a template type conversion operator in the
+// implementation.
+template <typename D, typename Op>
+class PairMatchBase {  // D supplies a static Desc(); Op is the binary comparison functor.
+ public:
+ template <typename T1, typename T2>  // Conversion to a matcher that takes the 2-tuple by value.
+ operator Matcher<::std::tuple<T1, T2>>() const {
+ return Matcher<::std::tuple<T1, T2>>(new Impl<const ::std::tuple<T1, T2>&>);
+ }
+ template <typename T1, typename T2>  // Conversion to a matcher that takes the 2-tuple by const reference.
+ operator Matcher<const ::std::tuple<T1, T2>&>() const {
+ return MakeMatcher(new Impl<const ::std::tuple<T1, T2>&>);
+ }
+
+ private:
+ static ::std::ostream& GetDesc(::std::ostream& os) { // NOLINT
+ return os << D::Desc();  // The description text comes from the D template argument.
+ }
+
+ template <typename Tuple>
+ class Impl : public MatcherInterface<Tuple> {  // Monomorphic matcher for one concrete tuple type.
+ public:
+ bool MatchAndExplain(Tuple args,
+ MatchResultListener* /* listener */) const override {
+ return Op()(::std::get<0>(args), ::std::get<1>(args));  // Apply a default-constructed Op to the two fields.
+ }
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "are " << GetDesc;  // GetDesc is streamed as a manipulator-style function.
+ }
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "aren't " << GetDesc;  // Same manipulator, negated wording.
+ }
+ };
+};
+
+class Eq2Matcher : public PairMatchBase<Eq2Matcher, AnyEq> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "an equal pair";
+  }
+};
+class Ne2Matcher : public PairMatchBase<Ne2Matcher, AnyNe> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "an unequal pair";
+  }
+};
+class Lt2Matcher : public PairMatchBase<Lt2Matcher, AnyLt> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "a pair where the first < the second";
+  }
+};
+class Gt2Matcher : public PairMatchBase<Gt2Matcher, AnyGt> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "a pair where the first > the second";
+  }
+};
+class Le2Matcher : public PairMatchBase<Le2Matcher, AnyLe> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "a pair where the first <= the second";
+  }
+};
+class Ge2Matcher : public PairMatchBase<Ge2Matcher, AnyGe> {
+ public:
+  // Text that PairMatchBase appends after "are " / "aren't ".
+  static const char* Desc() {
+    return "a pair where the first >= the second";
+  }
+};
+
+// Implements the Not(...) matcher for a particular argument type T.
+// We do not nest it inside the NotMatcher class template, as that
+// will prevent different instantiations of NotMatcher from sharing
+// the same NotMatcherImpl<T> class.
+template <typename T>
+class NotMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  // Keeps a copy of the matcher whose verdict this class inverts.
+  explicit NotMatcherImpl(const Matcher<T>& matcher) : matcher_(matcher) {}
+
+  // Succeeds exactly when the wrapped matcher fails. The wrapped matcher
+  // receives `listener` directly, so its explanation is passed through.
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    if (matcher_.MatchAndExplain(x, listener)) {
+      return false;
+    }
+    return true;
+  }
+
+  // The description of Not(m) is m's negated description ...
+  void DescribeTo(::std::ostream* os) const override {
+    matcher_.DescribeNegationTo(os);
+  }
+
+  // ... and the negated description of Not(m) is m's plain description.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    matcher_.DescribeTo(os);
+  }
+
+ private:
+  // The matcher being negated.
+  const Matcher<T> matcher_;
+};
+
+// Implements the Not(m) matcher, which matches a value that doesn't
+// match matcher m.
+template <typename InnerMatcher>
+class NotMatcher {
+ public:
+  // Takes the inner matcher by value and moves it into place, so that an
+  // rvalue argument is not copied a second time. This mirrors how
+  // ConditionalMatcher and PredicateFormatterFromMatcher store their
+  // by-value constructor arguments.
+  explicit NotMatcher(InnerMatcher matcher) : matcher_(std::move(matcher)) {}
+
+  // This template type conversion operator allows Not(m) to be used
+  // to match any type m can match. The inner matcher is first cast to
+  // Matcher<T> with SafeMatcherCast and then wrapped in NotMatcherImpl<T>.
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new NotMatcherImpl<T>(SafeMatcherCast<T>(matcher_)));
+  }
+
+ private:
+  // The (possibly polymorphic) matcher being negated.
+  InnerMatcher matcher_;
+};
+
+// Implements the AllOf(m1, m2, ...) matcher for a particular argument type
+// T. We do not nest it inside the VariadicMatcher class template, as
+// that will prevent different instantiations of VariadicMatcher from
+// sharing the same AllOfMatcherImpl<T> class.
+template <typename T>
+class AllOfMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  // Takes ownership of the list of matchers that must all succeed.
+  explicit AllOfMatcherImpl(std::vector<Matcher<T>> matchers)
+      : matchers_(std::move(matchers)) {}
+
+  // Prints "(d1) and (d2) and ..." where d_i is the i-th description.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "(";
+    bool is_first = true;
+    for (const Matcher<T>& matcher : matchers_) {
+      if (!is_first) *os << ") and (";
+      is_first = false;
+      matcher.DescribeTo(os);
+    }
+    *os << ")";
+  }
+
+  // Prints "(n1) or (n2) or ..." where n_i is the i-th negated description.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "(";
+    bool is_first = true;
+    for (const Matcher<T>& matcher : matchers_) {
+      if (!is_first) *os << ") or (";
+      is_first = false;
+      matcher.DescribeNegationTo(os);
+    }
+    *os << ")";
+  }
+
+  // Runs the matchers in order. The first one that fails ends the match and
+  // only its explanation is reported. If every matcher succeeds, the
+  // non-empty explanations are joined with ", and " and reported together.
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    std::string combined_explanation;
+
+    for (const Matcher<T>& matcher : matchers_) {
+      StringMatchResultListener slistener;
+      if (!matcher.MatchAndExplain(x, &slistener)) {
+        // One failure is enough; explain just that one.
+        *listener << slistener.str();
+        return false;
+      }
+
+      const std::string explanation = slistener.str();
+      if (combined_explanation.empty()) {
+        combined_explanation = explanation;
+      } else if (!explanation.empty()) {
+        combined_explanation += ", and ";
+        combined_explanation += explanation;
+      }
+    }
+
+    // Every matcher succeeded: explain why all of them match.
+    *listener << combined_explanation;
+    return true;
+  }
+
+ private:
+  // The matchers that must all accept the value.
+  const std::vector<Matcher<T>> matchers_;
+};
+
+// VariadicMatcher is used for the variadic implementation of
+// AllOf(m_1, m_2, ...) and AnyOf(m_1, m_2, ...).
+// CombiningMatcher<T> is used to recursively combine the provided matchers
+// (of type Args...).
+template <template <typename T> class CombiningMatcher, typename... Args>
+class VariadicMatcher {
+ public:
+ VariadicMatcher(const Args&... matchers) // NOLINT
+ : matchers_(matchers...) {  // Copies every argument into the tuple member.
+ static_assert(sizeof...(Args) > 0, "Must have at least one matcher.");
+ }
+
+ VariadicMatcher(const VariadicMatcher&) = default;  // Copyable ...
+ VariadicMatcher& operator=(const VariadicMatcher&) = delete;  // ... but not assignable.
+
+ // This template type conversion operator allows a
+ // VariadicMatcher<Matcher1, Matcher2...> object to match any type that
+ // all of the provided matchers (Matcher1, Matcher2, ...) can match.
+ template <typename T>
+ operator Matcher<T>() const {
+ std::vector<Matcher<T>> values;
+ CreateVariadicMatcher<T>(&values, std::integral_constant<size_t, 0>());  // Start the recursion at index 0.
+ return Matcher<T>(new CombiningMatcher<T>(std::move(values)));
+ }
+
+ private:
+ template <typename T, size_t I>  // Recursive step for tuple index I.
+ void CreateVariadicMatcher(std::vector<Matcher<T>>* values,
+ std::integral_constant<size_t, I>) const {
+ values->push_back(SafeMatcherCast<T>(std::get<I>(matchers_)));  // Cast the I-th stored matcher to Matcher<T> and append it.
+ CreateVariadicMatcher<T>(values, std::integral_constant<size_t, I + 1>());  // Continue with the next index.
+ }
+
+ template <typename T>  // Overload taking the index sizeof...(Args); its body is empty, so it appends nothing.
+ void CreateVariadicMatcher(
+ std::vector<Matcher<T>>*,
+ std::integral_constant<size_t, sizeof...(Args)>) const {}
+
+ std::tuple<Args...> matchers_;  // The matchers as originally supplied, one per Args type.
+};
+
+template <typename... Args>
+using AllOfMatcher = VariadicMatcher<AllOfMatcherImpl, Args...>;  // VariadicMatcher combined through AllOfMatcherImpl.
+
+// Implements the AnyOf(m1, m2, ...) matcher for a particular argument type
+// T. We do not nest it inside the VariadicMatcher class template, as
+// that will prevent different instantiations of VariadicMatcher from
+// sharing the same AnyOfMatcherImpl<T> class.
+template <typename T>
+class AnyOfMatcherImpl : public MatcherInterface<const T&> {
+ public:
+  // Takes ownership of the list of matchers, one of which must succeed.
+  explicit AnyOfMatcherImpl(std::vector<Matcher<T>> matchers)
+      : matchers_(std::move(matchers)) {}
+
+  // Prints "(d1) or (d2) or ..." where d_i is the i-th description.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "(";
+    bool is_first = true;
+    for (const Matcher<T>& matcher : matchers_) {
+      if (!is_first) *os << ") or (";
+      is_first = false;
+      matcher.DescribeTo(os);
+    }
+    *os << ")";
+  }
+
+  // Prints "(n1) and (n2) and ..." where n_i is the i-th negated description.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "(";
+    bool is_first = true;
+    for (const Matcher<T>& matcher : matchers_) {
+      if (!is_first) *os << ") and (";
+      is_first = false;
+      matcher.DescribeNegationTo(os);
+    }
+    *os << ")";
+  }
+
+  // Runs the matchers in order. The first one that succeeds ends the match
+  // and only its explanation is reported. If none succeeds, the non-empty
+  // explanations are joined with ", and " and reported together.
+  bool MatchAndExplain(const T& x,
+                       MatchResultListener* listener) const override {
+    std::string combined_explanation;
+
+    for (const Matcher<T>& matcher : matchers_) {
+      StringMatchResultListener slistener;
+      if (matcher.MatchAndExplain(x, &slistener)) {
+        // One success is enough; explain just that one.
+        *listener << slistener.str();
+        return true;
+      }
+
+      const std::string explanation = slistener.str();
+      if (combined_explanation.empty()) {
+        combined_explanation = explanation;
+      } else if (!explanation.empty()) {
+        combined_explanation += ", and ";
+        combined_explanation += explanation;
+      }
+    }
+
+    // Every matcher failed: explain why all of them fail.
+    *listener << combined_explanation;
+    return false;
+  }
+
+ private:
+  // The candidate matchers; a value matches if any one of them accepts it.
+  const std::vector<Matcher<T>> matchers_;
+};
+
+// AnyOfMatcher is used for the variadic implementation of AnyOf(m_1, m_2, ...).
+template <typename... Args>
+using AnyOfMatcher = VariadicMatcher<AnyOfMatcherImpl, Args...>;
+
+// ConditionalMatcher is the implementation of Conditional(cond, m1, m2)
+template <typename MatcherTrue, typename MatcherFalse>
+class ConditionalMatcher {
+ public:
+  // Records the condition together with both candidate matchers; which one
+  // is used is decided when the object is converted to a Matcher<T>.
+  ConditionalMatcher(bool condition, MatcherTrue matcher_true,
+                     MatcherFalse matcher_false)
+      : condition_(condition),
+        matcher_true_(std::move(matcher_true)),
+        matcher_false_(std::move(matcher_false)) {}
+
+  // Yields the "true" matcher cast to Matcher<T> when the stored condition
+  // holds, and the "false" matcher cast to Matcher<T> otherwise.
+  template <typename T>
+  operator Matcher<T>() const {  // NOLINT(runtime/explicit)
+    if (condition_) {
+      return SafeMatcherCast<T>(matcher_true_);
+    }
+    return SafeMatcherCast<T>(matcher_false_);
+  }
+
+ private:
+  bool condition_;              // Selects between the two matchers.
+  MatcherTrue matcher_true_;    // Used when condition_ is true.
+  MatcherFalse matcher_false_;  // Used when condition_ is false.
+};
+
+// Wrapper for implementation of Any/AllOfArray().
+template <template <class> class MatcherImpl, typename T>
+class SomeOfArrayMatcher {
+ public:
+  // Copies the range [first, last) of element values or element matchers
+  // into this object.
+  template <typename Iter>
+  SomeOfArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
+
+  // Casts every stored element to a matcher on the decayed form of U and
+  // hands the resulting list to MatcherImpl, wrapped as a Matcher<U>.
+  template <typename U>
+  operator Matcher<U>() const {  // NOLINT
+    using Decayed = typename std::decay<U>::type;
+    std::vector<Matcher<Decayed>> cast_matchers;
+    for (auto it = matchers_.begin(); it != matchers_.end(); ++it) {
+      cast_matchers.push_back(MatcherCast<Decayed>(*it));
+    }
+    return Matcher<U>(new MatcherImpl<Decayed>(std::move(cast_matchers)));
+  }
+
+ private:
+  // The element values or matchers supplied at construction.
+  const ::std::vector<T> matchers_;
+};
+
+// Array form that requires every element matcher to succeed.
+template <typename T>
+using AllOfArrayMatcher = SomeOfArrayMatcher<AllOfMatcherImpl, T>;
+
+// Array form that requires at least one element matcher to succeed.
+template <typename T>
+using AnyOfArrayMatcher = SomeOfArrayMatcher<AnyOfMatcherImpl, T>;
+
+// Used for implementing Truly(pred), which turns a predicate into a
+// matcher.
+template <typename Predicate>
+class TrulyMatcher {
+ public:
+  // Takes the predicate by value and moves it into place, so that an rvalue
+  // argument (for example a lambda with captured state) is not copied a
+  // second time. This mirrors how ConditionalMatcher and
+  // PredicateFormatterFromMatcher store their by-value constructor arguments.
+  explicit TrulyMatcher(Predicate pred) : predicate_(std::move(pred)) {}
+
+  // This method template allows Truly(pred) to be used as a matcher
+  // for type T where T is the argument type of predicate 'pred'. The
+  // argument is passed by reference as the predicate may be
+  // interested in the address of the argument.
+  //
+  // Returns true when the predicate accepts x; otherwise writes a fixed
+  // explanation to the listener and returns false.
+  template <typename T>
+  bool MatchAndExplain(T& x,  // NOLINT
+                       MatchResultListener* listener) const {
+    // Without the if-statement, MSVC sometimes warns about converting
+    // a value to bool (warning 4800).
+    //
+    // We cannot write 'return !!predicate_(x);' as that doesn't work
+    // when predicate_(x) returns a class convertible to bool but
+    // having no operator!().
+    if (predicate_(x)) return true;
+    *listener << "didn't satisfy the given predicate";
+    return false;
+  }
+
+  // Writes the positive description of this matcher.
+  void DescribeTo(::std::ostream* os) const {
+    *os << "satisfies the given predicate";
+  }
+
+  // Writes the negated description of this matcher.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't satisfy the given predicate";
+  }
+
+ private:
+  // The user-supplied predicate.
+  Predicate predicate_;
+};
+
+// Used for implementing Matches(matcher), which turns a matcher into
+// a predicate.
+template <typename M>
+class MatcherAsPredicate {
+ public:
+  // Takes the matcher by value and moves it into place, so that an rvalue
+  // argument is not copied a second time. This mirrors how
+  // ConditionalMatcher and PredicateFormatterFromMatcher store their
+  // by-value constructor arguments.
+  explicit MatcherAsPredicate(M matcher) : matcher_(std::move(matcher)) {}
+
+  // This template operator() allows Matches(m) to be used as a
+  // predicate on type T where m is a matcher on type T.
+  //
+  // The argument x is passed by reference instead of by value, as
+  // some matcher may be interested in its address (e.g. as in
+  // Matches(Ref(n))(x)).
+  template <typename T>
+  bool operator()(const T& x) const {
+    // We let matcher_ commit to a particular type here instead of
+    // when the MatcherAsPredicate object was constructed.  This
+    // allows us to write Matches(m) where m is a polymorphic matcher
+    // (e.g. Eq(5)).
+    //
+    // If we write Matcher<T>(matcher_).Matches(x) here, it won't
+    // compile when matcher_ has type Matcher<const T&>; if we write
+    // Matcher<const T&>(matcher_).Matches(x) here, it won't compile
+    // when matcher_ has type Matcher<T>; if we just write
+    // matcher_.Matches(x), it won't compile when matcher_ is
+    // polymorphic, e.g. Eq(5).
+    //
+    // MatcherCast<const T&>() is necessary for making the code work
+    // in all of the above situations.
+    return MatcherCast<const T&>(matcher_).Matches(x);
+  }
+
+ private:
+  // The wrapped matcher; converted to a concrete Matcher<const T&> per call.
+  M matcher_;
+};
+
+// For implementing ASSERT_THAT() and EXPECT_THAT(). The template
+// argument M must be a type that can be converted to a matcher.
+template <typename M>
+class PredicateFormatterFromMatcher {
+ public:
+ explicit PredicateFormatterFromMatcher(M m) : matcher_(std::move(m)) {}  // Takes M by value and moves it into matcher_.
+
+ // This template () operator allows a PredicateFormatterFromMatcher
+ // object to act as a predicate-formatter suitable for using with
+ // Google Test's EXPECT_PRED_FORMAT1() macro.
+ template <typename T>
+ AssertionResult operator()(const char* value_text, const T& x) const {
+ // We convert matcher_ to a Matcher<const T&> *now* instead of
+ // when the PredicateFormatterFromMatcher object was constructed,
+ // as matcher_ may be polymorphic (e.g. NotNull()) and we won't
+ // know which type to instantiate it to until we actually see the
+ // type of x here.
+ //
+ // We write SafeMatcherCast<const T&>(matcher_) instead of
+ // Matcher<const T&>(matcher_), as the latter won't compile when
+ // matcher_ has type Matcher<T> (e.g. An<int>()).
+ // We don't write MatcherCast<const T&> either, as that allows
+ // potentially unsafe downcasting of the matcher argument.
+ const Matcher<const T&> matcher = SafeMatcherCast<const T&>(matcher_);
+
+ // The expected path here is that the matcher should match (i.e. that most
+ // tests pass), so optimize for this case and return before building any text.
+ if (matcher.Matches(x)) {
+ return AssertionSuccess();
+ }
+
+ ::std::stringstream ss;  // Accumulates the failure message.
+ ss << "Value of: " << value_text << "\n"
+ << "Expected: ";
+ matcher.DescribeTo(&ss);
+
+ // Rerun the matcher to "PrintAndExplain" the failure.
+ StringMatchResultListener listener;
+ if (MatchPrintAndExplain(x, matcher, &listener)) {  // A success here contradicts the Matches() call above.
+ ss << "\n The matcher failed on the initial attempt; but passed when "
+ "rerun to generate the explanation.";
+ }
+ ss << "\n Actual: " << listener.str();  // Text collected by the rerun.
+ return AssertionFailure() << ss.str();
+ }
+
+ private:
+ const M matcher_;  // The matcher-convertible object supplied at construction.
+};
+
+// A helper function for converting a matcher to a predicate-formatter
+// without the user needing to explicitly write the type. This is
+// used for implementing ASSERT_THAT() and EXPECT_THAT().
+// Implementation detail: 'matcher' is received by-value to force decaying.
+template <typename M>
+inline PredicateFormatterFromMatcher<M> MakePredicateFormatterFromMatcher(
+    M matcher) {
+  // `matcher` arrives by value (which forces decaying) and is moved into the
+  // formatter that is returned.
+  using Formatter = PredicateFormatterFromMatcher<M>;
+  return Formatter(std::move(matcher));
+}
+
+// Implements the polymorphic IsNan() matcher, which matches any floating type
+// value that is Nan.
+class IsNanMatcher {
+ public:
+  // Returns the result of std::isnan on `f`. The listener is never used.
+  template <typename FloatType>
+  bool MatchAndExplain(const FloatType& f,
+                       MatchResultListener* /* listener */) const {
+    // The parentheses around the function name are kept from the original
+    // call form.
+    return (::std::isnan)(f);
+  }
+
+  // Writes the positive description of this matcher.
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is NaN";
+  }
+
+  // Writes the negated description of this matcher.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "isn't NaN";
+  }
+};
+
+// Implements the polymorphic floating point equality matcher, which matches
+// two float values using ULP-based approximation or, optionally, a
+// user-specified epsilon. The template is meant to be instantiated with
+// FloatType being either float or double.
+template <typename FloatType>
+class FloatingEqMatcher {
+ public:
+  // Constructor for FloatingEqMatcher.
+  // The matcher's input will be compared with expected.  The matcher treats two
+  // NANs as equal if nan_eq_nan is true.  Otherwise, under IEEE standards,
+  // equality comparisons between NANs will always return false.  We specify a
+  // negative max_abs_error_ term to indicate that ULP-based approximation will
+  // be used for comparison.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan)
+      : expected_(expected), nan_eq_nan_(nan_eq_nan), max_abs_error_(-1) {}
+
+  // Constructor that supports a user-specified max_abs_error that will be used
+  // for comparison instead of ULP-based approximation.  The max absolute
+  // error should be non-negative; this is enforced with GTEST_CHECK_.
+  FloatingEqMatcher(FloatType expected, bool nan_eq_nan,
+                    FloatType max_abs_error)
+      : expected_(expected),
+        nan_eq_nan_(nan_eq_nan),
+        max_abs_error_(max_abs_error) {
+    // The trailing space keeps the offending value separated from "is" in the
+    // failure message.
+    GTEST_CHECK_(max_abs_error >= 0)
+        << ", where max_abs_error is " << max_abs_error;
+  }
+
+  // Implements floating point equality matcher as a Matcher<T>.
+  template <typename T>
+  class Impl : public MatcherInterface<T> {
+   public:
+    // A negative max_abs_error selects ULP-based comparison.
+    Impl(FloatType expected, bool nan_eq_nan, FloatType max_abs_error)
+        : expected_(expected),
+          nan_eq_nan_(nan_eq_nan),
+          max_abs_error_(max_abs_error) {}
+
+    // Compares `value` with the expected value. NaNs are handled first, then
+    // either the absolute-error bound or the ULP-based comparison is applied.
+    bool MatchAndExplain(T value,
+                         MatchResultListener* listener) const override {
+      const FloatingPoint<FloatType> actual(value), expected(expected_);
+
+      // Compares NaNs first, if nan_eq_nan_ is true.
+      if (actual.is_nan() || expected.is_nan()) {
+        if (actual.is_nan() && expected.is_nan()) {
+          return nan_eq_nan_;
+        }
+        // One is nan; the other is not nan.
+        return false;
+      }
+      if (HasMaxAbsError()) {
+        // We perform an equality check so that inf will match inf, regardless
+        // of error bounds.  If the result of value - expected_ would result in
+        // overflow or if either value is inf, the default result is infinity,
+        // which should only match if max_abs_error_ is also infinity.
+        if (value == expected_) {
+          return true;
+        }
+
+        const FloatType diff = value - expected_;
+        if (::std::fabs(diff) <= max_abs_error_) {
+          return true;
+        }
+
+        // Only build the explanation when somebody will read it.
+        if (listener->IsInterested()) {
+          *listener << "which is " << diff << " from " << expected_;
+        }
+        return false;
+      } else {
+        return actual.AlmostEquals(expected);
+      }
+    }
+
+    // Writes the positive description, temporarily raising the stream
+    // precision so the expected value is printed with enough digits.
+    void DescribeTo(::std::ostream* os) const override {
+      // os->precision() returns the previously set precision, which we
+      // store to restore the ostream to its original configuration
+      // after outputting.
+      const ::std::streamsize old_precision =
+          os->precision(::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "is NaN";
+        } else {
+          *os << "never matches";
+        }
+      } else {
+        *os << "is approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error <= " << max_abs_error_ << ")";
+        }
+      }
+      os->precision(old_precision);
+    }
+
+    // Writes the negated description, with the same precision handling as
+    // DescribeTo().
+    void DescribeNegationTo(::std::ostream* os) const override {
+      // As before, get original precision.
+      const ::std::streamsize old_precision =
+          os->precision(::std::numeric_limits<FloatType>::digits10 + 2);
+      if (FloatingPoint<FloatType>(expected_).is_nan()) {
+        if (nan_eq_nan_) {
+          *os << "isn't NaN";
+        } else {
+          *os << "is anything";
+        }
+      } else {
+        *os << "isn't approximately " << expected_;
+        if (HasMaxAbsError()) {
+          *os << " (absolute error > " << max_abs_error_ << ")";
+        }
+      }
+      // Restore original precision.
+      os->precision(old_precision);
+    }
+
+   private:
+    // True when an absolute-error bound (rather than ULPs) is in effect.
+    bool HasMaxAbsError() const { return max_abs_error_ >= 0; }
+
+    const FloatType expected_;
+    const bool nan_eq_nan_;
+    // max_abs_error will be used for value comparison when >= 0.
+    const FloatType max_abs_error_;
+  };
+
+  // The following 3 type conversion operators allow FloatEq(expected) and
+  // NanSensitiveFloatEq(expected) to be used as a Matcher<float>, a
+  // Matcher<const float&>, or a Matcher<float&>, but nothing else.
+  operator Matcher<FloatType>() const {
+    return MakeMatcher(
+        new Impl<FloatType>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+  operator Matcher<const FloatType&>() const {
+    return MakeMatcher(
+        new Impl<const FloatType&>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+  operator Matcher<FloatType&>() const {
+    return MakeMatcher(
+        new Impl<FloatType&>(expected_, nan_eq_nan_, max_abs_error_));
+  }
+
+ private:
+  const FloatType expected_;
+  const bool nan_eq_nan_;
+  // max_abs_error will be used for value comparison when >= 0.
+  const FloatType max_abs_error_;
+};
+
+// A 2-tuple ("binary") wrapper around FloatingEqMatcher:
+// FloatingEq2Matcher() matches (x, y) by matching FloatingEqMatcher(x, false)
+// against y, and FloatingEq2Matcher(e) matches FloatingEqMatcher(x, false, e)
+// against y. The former implements "Eq", the latter "Near". The constructors
+// that take nan_eq_nan pass that flag on, selecting whether two NaNs compare equal.
+template <typename FloatType>
+class FloatingEq2Matcher {
+ public:
+  // ULP-based comparison; NaNs never compare equal.
+  FloatingEq2Matcher() : max_abs_error_(-1), nan_eq_nan_(false) {}
+
+  // ULP-based comparison with a caller-chosen NaN policy.
+  explicit FloatingEq2Matcher(bool nan_eq_nan)
+      : max_abs_error_(-1), nan_eq_nan_(nan_eq_nan) {}
+
+  // Absolute-error comparison; NaNs never compare equal.
+  explicit FloatingEq2Matcher(FloatType max_abs_error)
+      : max_abs_error_(max_abs_error), nan_eq_nan_(false) {}
+
+  // Absolute-error comparison with a caller-chosen NaN policy.
+  FloatingEq2Matcher(FloatType max_abs_error, bool nan_eq_nan)
+      : max_abs_error_(max_abs_error), nan_eq_nan_(nan_eq_nan) {}
+
+  // Conversion to a matcher that takes the 2-tuple by value.
+  template <typename T1, typename T2>
+  operator Matcher<::std::tuple<T1, T2>>() const {
+    return MakeMatcher(
+        new Impl<::std::tuple<T1, T2>>(max_abs_error_, nan_eq_nan_));
+  }
+
+  // Conversion to a matcher that takes the 2-tuple by const reference.
+  template <typename T1, typename T2>
+  operator Matcher<const ::std::tuple<T1, T2>&>() const {
+    return MakeMatcher(
+        new Impl<const ::std::tuple<T1, T2>&>(max_abs_error_, nan_eq_nan_));
+  }
+
+ private:
+  // Streamed as a manipulator-style function by the Impl descriptions.
+  static ::std::ostream& GetDesc(::std::ostream& os) {  // NOLINT
+    return os << "an almost-equal pair";
+  }
+
+  // Monomorphic matcher for one concrete tuple type.
+  template <typename Tuple>
+  class Impl : public MatcherInterface<Tuple> {
+   public:
+    Impl(FloatType max_abs_error, bool nan_eq_nan)
+        : max_abs_error_(max_abs_error), nan_eq_nan_(nan_eq_nan) {}
+
+    // Builds a FloatingEqMatcher from the first tuple element and applies it
+    // to the second. The sentinel value -1 selects the two-argument
+    // (ULP-based) FloatingEqMatcher constructor.
+    bool MatchAndExplain(Tuple args,
+                         MatchResultListener* listener) const override {
+      if (max_abs_error_ != -1) {
+        FloatingEqMatcher<FloatType> near_matcher(::std::get<0>(args),
+                                                  nan_eq_nan_, max_abs_error_);
+        return static_cast<Matcher<FloatType>>(near_matcher)
+            .MatchAndExplain(::std::get<1>(args), listener);
+      }
+      FloatingEqMatcher<FloatType> ulp_matcher(::std::get<0>(args),
+                                               nan_eq_nan_);
+      return static_cast<Matcher<FloatType>>(ulp_matcher)
+          .MatchAndExplain(::std::get<1>(args), listener);
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "are " << GetDesc;
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "aren't " << GetDesc;
+    }
+
+   private:
+    FloatType max_abs_error_;
+    const bool nan_eq_nan_;
+  };
+
+  // -1 means "use ULP-based comparison"; otherwise the absolute-error bound.
+  FloatType max_abs_error_;
+  // Whether two NaNs are treated as equal.
+  bool nan_eq_nan_;
+};
+
+// Implements the Pointee(m) matcher for matching a pointer whose
+// pointee matches matcher m. The pointer can be either raw or smart.
+template <typename InnerMatcher>
+class PointeeMatcher {
+ public:
+  // Keeps a copy of the matcher that will be applied to the pointee.
+  explicit PointeeMatcher(const InnerMatcher& matcher) : matcher_(matcher) {}
+
+  // Conversion operator template that lets Pointee(m) act as a matcher for
+  // any pointer type, raw or smart, whose pointee type is compatible with
+  // the inner matcher.
+  //
+  // MakePolymorphicMatcher() is not used here because it is not flexible
+  // enough for implementing the DescribeTo() method of Pointee().
+  template <typename Pointer>
+  operator Matcher<Pointer>() const {
+    return Matcher<Pointer>(new Impl<const Pointer&>(matcher_));
+  }
+
+ private:
+  // The monomorphic implementation that works for a particular pointer type.
+  template <typename Pointer>
+  class Impl : public MatcherInterface<Pointer> {
+   public:
+    // Element type of the pointer, after stripping reference and const.
+    using Pointee =
+        typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
+            Pointer)>::element_type;
+
+    // Casts the inner matcher to a matcher on `const Pointee&`.
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<const Pointee&>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "points to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "does not point to a value that ";
+      matcher_.DescribeTo(os);
+    }
+
+    // A null pointer never matches (and produces no explanation); otherwise
+    // the pointee is printed and handed to the inner matcher.
+    bool MatchAndExplain(Pointer pointer,
+                         MatchResultListener* listener) const override {
+      if (GetRawPointer(pointer) == nullptr) {
+        return false;
+      }
+      *listener << "which points to ";
+      return MatchPrintAndExplain(*pointer, matcher_, listener);
+    }
+
+   private:
+    // The inner matcher, bound to the pointee type.
+    const Matcher<const Pointee&> matcher_;
+  };
+
+  // The (possibly polymorphic) matcher for the pointee.
+  const InnerMatcher matcher_;
+};
+
+// Implements the Pointer(m) matcher for matching a pointer that matches matcher
+// m. The pointer can be either raw or smart, and will match `m` against the
+// raw pointer.
+template <typename InnerMatcher>
+class PointerMatcher {
+ public:
+  // Keeps a copy of the matcher that will be applied to the raw pointer.
+  explicit PointerMatcher(const InnerMatcher& matcher) : matcher_(matcher) {}
+
+  // Conversion operator template that lets Pointer(m) act as a matcher for
+  // any pointer type, raw or smart, whose raw pointer type is compatible
+  // with the inner matcher.
+  //
+  // MakePolymorphicMatcher() is not used here because it is not flexible
+  // enough for implementing the DescribeTo() method of Pointer().
+  template <typename PointerType>
+  operator Matcher<PointerType>() const {  // NOLINT
+    return Matcher<PointerType>(new Impl<const PointerType&>(matcher_));
+  }
+
+ private:
+  // The monomorphic implementation that works for a particular pointer type.
+  template <typename PointerType>
+  class Impl : public MatcherInterface<PointerType> {
+   public:
+    // Raw pointer-to-const of the element type, after stripping reference
+    // and const from PointerType.
+    using Pointer =
+        const typename std::pointer_traits<GTEST_REMOVE_REFERENCE_AND_CONST_(
+            PointerType)>::element_type*;
+
+    // Casts the inner matcher to a matcher on the raw pointer type.
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<Pointer>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "is a pointer that ";
+      matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "is not a pointer that ";
+      matcher_.DescribeTo(os);
+    }
+
+    // Extracts the raw pointer with GetRawPointer(), prints it and hands it
+    // to the inner matcher.
+    bool MatchAndExplain(PointerType pointer,
+                         MatchResultListener* listener) const override {
+      *listener << "which is a pointer that ";
+      Pointer raw = GetRawPointer(pointer);
+      return MatchPrintAndExplain(raw, matcher_, listener);
+    }
+
+   private:
+    // The inner matcher, bound to the raw pointer type.
+    Matcher<Pointer> matcher_;
+  };
+
+  // The (possibly polymorphic) matcher for the raw pointer.
+  const InnerMatcher matcher_;
+};
+
+#if GTEST_HAS_RTTI
+// Implements the WhenDynamicCastTo<T>(m) matcher that matches a pointer or
+// reference that matches inner_matcher when dynamic_cast<T> is applied.
+// The result of dynamic_cast<To> is forwarded to the inner matcher.
+// If To is a pointer and the cast fails, the inner matcher will receive NULL.
+// If To is a reference and the cast fails, this matcher returns false
+// immediately.
+template <typename To>
+class WhenDynamicCastToMatcherBase {
+ public:
+  // Keeps a copy of the matcher applied to the result of the cast.
+  explicit WhenDynamicCastToMatcherBase(const Matcher<To>& matcher)
+      : matcher_(matcher) {}
+
+  // Prefixes the inner matcher's description with the target type name.
+  void DescribeTo(::std::ostream* os) const {
+    *os << "when dynamic_cast to " << GetToName() << ", ";
+    matcher_.DescribeTo(os);
+  }
+
+  // Prefixes the inner matcher's negated description with the target type
+  // name.
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "when dynamic_cast to " << GetToName() << ", ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+ protected:
+  // Matcher applied to the cast result; used by the derived classes.
+  const Matcher<To> matcher_;
+
+  // Name of the cast target type, as reported by GetTypeName<To>().
+  static std::string GetToName() { return GetTypeName<To>(); }
+};
+
+// Primary template.
+// To is a pointer. Cast and forward the result.
+template <typename To>
+class WhenDynamicCastToMatcher : public WhenDynamicCastToMatcherBase<To> {
+ public:
+  explicit WhenDynamicCastToMatcher(const Matcher<To>& matcher)
+      : WhenDynamicCastToMatcherBase<To>(matcher) {}
+
+  // Applies dynamic_cast<To> to `from` and forwards the result, whatever it
+  // is, to the inner matcher.
+  template <typename From>
+  bool MatchAndExplain(From from, MatchResultListener* listener) const {
+    To cast_result = dynamic_cast<To>(from);
+    return MatchPrintAndExplain(cast_result, this->matcher_, listener);
+  }
+};
+
+// Specialize for references.
+// In this case we return false if the dynamic_cast fails.
+template <typename To>
+class WhenDynamicCastToMatcher<To&> : public WhenDynamicCastToMatcherBase<To&> {
+ public:
+  explicit WhenDynamicCastToMatcher(const Matcher<To&>& matcher)
+      : WhenDynamicCastToMatcherBase<To&>(matcher) {}
+
+  // Casts through pointers so that a failed cast yields nullptr instead of
+  // throwing std::bad_cast. A failed cast is reported and counts as a
+  // mismatch; otherwise the referenced object goes to the inner matcher.
+  template <typename From>
+  bool MatchAndExplain(From& from, MatchResultListener* listener) const {
+    To* const cast_ptr = dynamic_cast<To*>(&from);
+    if (cast_ptr != nullptr) {
+      return MatchPrintAndExplain(*cast_ptr, this->matcher_, listener);
+    }
+    *listener << "which cannot be dynamic_cast to " << this->GetToName();
+    return false;
+  }
+};
+#endif // GTEST_HAS_RTTI
+
+// Implements the Field() matcher for matching a field (i.e. member
+// variable) of an object.
+template <typename Class, typename FieldType>
+class FieldMatcher {
+ public:
+  // For a field whose name is not known; descriptions then refer to it as
+  // the "given field".
+  FieldMatcher(FieldType Class::*field,
+               const Matcher<const FieldType&>& matcher)
+      : field_(field), matcher_(matcher), whose_field_("whose given field ") {}
+
+  // For a field whose name is known; the name is quoted in descriptions.
+  FieldMatcher(const std::string& field_name, FieldType Class::*field,
+               const Matcher<const FieldType&>& matcher)
+      : field_(field),
+        matcher_(matcher),
+        whose_field_("whose field `" + field_name + "` ") {}
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << "is an object " << whose_field_;
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "is an object " << whose_field_;
+    matcher_.DescribeNegationTo(os);
+  }
+
+  // Accepts either an object or a pointer to one and dispatches to the
+  // matching private overload.
+  template <typename T>
+  bool MatchAndExplain(const T& value, MatchResultListener* listener) const {
+    // FIXME: The dispatch on std::is_pointer was introduced as a workaround for
+    // a compiler bug, and can now be removed.
+    using IsPointer =
+        typename std::is_pointer<typename std::remove_const<T>::type>::type;
+    return MatchAndExplainImpl(IsPointer(), value, listener);
+  }
+
+ private:
+  // Object case: print the field and hand it to the inner matcher.
+  bool MatchAndExplainImpl(std::false_type /* is_not_pointer */,
+                           const Class& obj,
+                           MatchResultListener* listener) const {
+    *listener << whose_field_ << "is ";
+    return MatchPrintAndExplain(obj.*field_, matcher_, listener);
+  }
+
+  // Pointer case: a null pointer never matches; otherwise defer to the
+  // object case on the pointee.
+  bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p,
+                           MatchResultListener* listener) const {
+    if (p != nullptr) {
+      *listener << "which points to an object ";
+      // *p has a field, so it is a class/struct/union type and cannot itself
+      // be a pointer; hence the false_type tag.
+      return MatchAndExplainImpl(std::false_type(), *p, listener);
+    }
+    return false;
+  }
+
+  // Pointer to the member being inspected.
+  const FieldType Class::*field_;
+  // Matcher applied to the field's value.
+  const Matcher<const FieldType&> matcher_;
+
+  // Contains either "whose given field " if the name of the field is unknown
+  // or "whose field `name_of_field` " if the name is known.
+  const std::string whose_field_;
+};
+
+// Implements the Property() matcher, which matches a property (i.e. the
+// return value of a getter method) of an object against an inner matcher.
+// The matched value may be the object itself or a pointer to it.
+// Property is a const-qualified member function of Class returning
+// PropertyType.
+template <typename Class, typename PropertyType, typename Property>
+class PropertyMatcher {
+ public:
+ typedef const PropertyType& RefToConstProperty;  // Type the inner matcher sees.
+
+ PropertyMatcher(Property property, const Matcher<RefToConstProperty>& matcher)
+ : property_(property),
+ matcher_(matcher),
+ whose_property_("whose given property ") {}  // Property name unknown.
+
+ PropertyMatcher(const std::string& property_name, Property property,
+ const Matcher<RefToConstProperty>& matcher)
+ : property_(property),
+ matcher_(matcher),
+ whose_property_("whose property `" + property_name + "` ") {}
+
+ void DescribeTo(::std::ostream* os) const {
+ *os << "is an object " << whose_property_;
+ matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const {
+ *os << "is an object " << whose_property_;
+ matcher_.DescribeNegationTo(os);
+ }
+
+ template <typename T>
+ bool MatchAndExplain(const T& value, MatchResultListener* listener) const {
+ return MatchAndExplainImpl(  // Tag-dispatch on whether T is a pointer type.
+ typename std::is_pointer<typename std::remove_const<T>::type>::type(),
+ value, listener);
+ }
+
+ private:
+ bool MatchAndExplainImpl(std::false_type /* is_not_pointer */,
+ const Class& obj,
+ MatchResultListener* listener) const {
+ *listener << whose_property_ << "is ";
+ // The getter's return value is bound to a named const reference first, so
+ // that MatchPrintAndExplain receives an lvalue rather than a temporary.
+ RefToConstProperty result = (obj.*property_)();
+ return MatchPrintAndExplain(result, matcher_, listener);
+ }
+
+ bool MatchAndExplainImpl(std::true_type /* is_pointer */, const Class* p,
+ MatchResultListener* listener) const {
+ if (p == nullptr) return false;  // A null pointer never matches; nothing is explained.
+
+ *listener << "which points to an object ";
+ // Since *p has a property method, it must be a class/struct/union
+ // type and thus cannot be a pointer. Therefore we pass
+ // false_type() as the first argument.
+ return MatchAndExplainImpl(std::false_type(), *p, listener);
+ }
+
+ Property property_;  // The getter to invoke on the matched object.
+ const Matcher<RefToConstProperty> matcher_;  // Applied to the getter's result.
+
+ // Contains either "whose given property " if the name of the property is
+ // unknown or "whose property `name_of_property` " if the name is known.
+ const std::string whose_property_;
+};
+
+// Traits telling ResultOf() how to store, validate and invoke a callable.
+// This primary template handles functor objects, i.e. anything that can be
+// called as functor(argument).
+template <typename Functor>
+struct CallableTraits {
+  // A functor is stored by value, exactly as it was given.
+  using StorageType = Functor;
+
+  // There is nothing to validate for a functor object.
+  static void CheckIsValid(Functor /* functor */) {}
+
+  // Calls the functor with `input` and returns whatever the call yields.
+  template <typename T>
+  static auto Invoke(Functor functor, const T& input)
+      -> decltype(functor(input)) {
+    return functor(input);
+  }
+};
+
+// Partial specialization of CallableTraits for plain unary function pointers.
+template <typename ArgType, typename ResType>
+struct CallableTraits<ResType (*)(ArgType)> {
+  using ResultType = ResType;
+  using StorageType = ResType (*)(ArgType);
+
+  // Aborts via GTEST_CHECK_ when a null function pointer is supplied.
+  static void CheckIsValid(StorageType f) {
+    GTEST_CHECK_(f != nullptr)
+        << "NULL function pointer is passed into ResultOf().";
+  }
+
+  // Calls the pointed-to function with `arg` and returns its result.
+  template <typename T>
+  static ResType Invoke(StorageType function, T arg) {
+    return function(arg);
+  }
+};
+
+// Implements the ResultOf() matcher: applies a unary callable to the matched
+// object and matches the returned value against an inner matcher.
+template <typename Callable, typename InnerMatcher>
+class ResultOfMatcher {
+ public:
+ ResultOfMatcher(Callable callable, InnerMatcher matcher)  // No description given.
+ : ResultOfMatcher(/*result_description=*/"", std::move(callable),
+ std::move(matcher)) {}
+
+ ResultOfMatcher(const std::string& result_description, Callable callable,
+ InnerMatcher matcher)
+ : result_description_(result_description),
+ callable_(std::move(callable)),
+ matcher_(std::move(matcher)) {
+ CallableTraits<Callable>::CheckIsValid(callable_);  // E.g. rejects null function pointers.
+ }
+
+ template <typename T>
+ operator Matcher<T>() const {
+ return Matcher<T>(  // The monomorphic Impl always receives its argument as const T&.
+ new Impl<const T&>(result_description_, callable_, matcher_));
+ }
+
+ private:
+ typedef typename CallableTraits<Callable>::StorageType CallableStorageType;
+
+ template <typename T>
+ class Impl : public MatcherInterface<T> {
+ using ResultType = decltype(CallableTraits<Callable>::template Invoke<T>(  // Callable's result type for a T argument.
+ std::declval<CallableStorageType>(), std::declval<T>()));
+
+ public:
+ template <typename M>
+ Impl(const std::string& result_description,
+ const CallableStorageType& callable, const M& matcher)
+ : result_description_(result_description),
+ callable_(callable),
+ matcher_(MatcherCast<ResultType>(matcher)) {}
+
+ void DescribeTo(::std::ostream* os) const override {
+ if (result_description_.empty()) {  // Generic wording when no description was supplied.
+ *os << "is mapped by the given callable to a value that ";
+ } else {
+ *os << "whose " << result_description_ << " ";
+ }
+ matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ if (result_description_.empty()) {  // Same prefix as DescribeTo; only the inner part is negated.
+ *os << "is mapped by the given callable to a value that ";
+ } else {
+ *os << "whose " << result_description_ << " ";
+ }
+ matcher_.DescribeNegationTo(os);
+ }
+
+ bool MatchAndExplain(T obj, MatchResultListener* listener) const override {
+ if (result_description_.empty()) {
+ *listener << "which is mapped by the given callable to ";
+ } else {
+ *listener << "whose " << result_description_ << " is ";
+ }
+ // The callable's result is stored in a named variable first, so that
+ // MatchPrintAndExplain receives an lvalue rather than a temporary.
+ // Also, the template argument is specified explicitly because T could
+ // be a non-const reference (e.g. Matcher<Uncopyable&>).
+ ResultType result =
+ CallableTraits<Callable>::template Invoke<T>(callable_, obj);
+ return MatchPrintAndExplain(result, matcher_, listener);
+ }
+
+ private:
+ const std::string result_description_;
+ // Functors often define operator() as a non-const method even though
+ // they are actually stateless. But we need to use them even when
+ // 'this' is a const pointer. It's the user's responsibility not to
+ // use stateful callables with ResultOf(), which doesn't guarantee
+ // how many times the callable will be invoked.
+ mutable CallableStorageType callable_;
+ const Matcher<ResultType> matcher_;
+ }; // class Impl
+
+ const std::string result_description_;  // Empty when no description was supplied.
+ const CallableStorageType callable_;
+ const InnerMatcher matcher_;
+};
+
+// Polymorphic matcher that applies a size matcher to the value returned by
+// the size() method of an STL-style container.
+template <typename SizeMatcher>
+class SizeIsMatcher {
+ public:
+  explicit SizeIsMatcher(const SizeMatcher& size_matcher)
+      : size_matcher_(size_matcher) {}
+
+  // Converts to a monomorphic matcher; the implementation always receives
+  // the container as a const reference.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(new Impl<const Container&>(size_matcher_));
+  }
+
+  // Monomorphic implementation for one particular container type.
+  template <typename Container>
+  class Impl : public MatcherInterface<Container> {
+   public:
+    // Whatever type the container's size() returns.
+    using SizeType = decltype(std::declval<Container>().size());
+
+    explicit Impl(const SizeMatcher& size_matcher)
+        : size_matcher_(MatcherCast<SizeType>(size_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "size ";
+      size_matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "size ";
+      size_matcher_.DescribeNegationTo(os);
+    }
+
+    // Reports the actual size and whether it satisfied the size matcher,
+    // followed by the size matcher's own explanation, if any.
+    bool MatchAndExplain(Container container,
+                         MatchResultListener* listener) const override {
+      SizeType actual_size = container.size();
+      StringMatchResultListener size_explanation;
+      const bool size_matches =
+          size_matcher_.MatchAndExplain(actual_size, &size_explanation);
+      const char* const verdict = size_matches ? " matches" : " doesn't match";
+      *listener << "whose size " << actual_size << verdict;
+      PrintIfNotEmpty(size_explanation.str(), listener->stream());
+      return size_matches;
+    }
+
+   private:
+    const Matcher<SizeType> size_matcher_;
+  };
+
+ private:
+  const SizeMatcher size_matcher_;
+};
+
+// Polymorphic matcher that applies a distance matcher to the distance
+// between begin() and end() of an STL-style container.
+template <typename DistanceMatcher>
+class BeginEndDistanceIsMatcher {
+ public:
+  explicit BeginEndDistanceIsMatcher(const DistanceMatcher& distance_matcher)
+      : distance_matcher_(distance_matcher) {}
+
+  // Converts to a monomorphic matcher; the implementation always receives
+  // the container as a const reference.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    return Matcher<Container>(new Impl<const Container&>(distance_matcher_));
+  }
+
+  // Monomorphic implementation for one particular container type.
+  template <typename Container>
+  class Impl : public MatcherInterface<Container> {
+   public:
+    using ContainerView = internal::StlContainerView<
+        GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>;
+    // The difference_type of the container's const_iterator.
+    using DistanceType = typename std::iterator_traits<
+        typename ContainerView::type::const_iterator>::difference_type;
+
+    explicit Impl(const DistanceMatcher& distance_matcher)
+        : distance_matcher_(MatcherCast<DistanceType>(distance_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "distance between begin() and end() ";
+      distance_matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "distance between begin() and end() ";
+      distance_matcher_.DescribeNegationTo(os);
+    }
+
+    // Reports the measured distance and whether it satisfied the distance
+    // matcher, followed by that matcher's own explanation, if any.
+    bool MatchAndExplain(Container container,
+                         MatchResultListener* listener) const override {
+      // Unqualified begin()/end() with std fallbacks, so that both std and
+      // argument-dependent overloads are considered.
+      using std::begin;
+      using std::end;
+      DistanceType actual_distance =
+          std::distance(begin(container), end(container));
+      StringMatchResultListener distance_explanation;
+      const bool distance_matches = distance_matcher_.MatchAndExplain(
+          actual_distance, &distance_explanation);
+      const char* const verdict =
+          distance_matches ? " matches" : " doesn't match";
+      *listener << "whose distance between begin() and end() "
+                << actual_distance << verdict;
+      PrintIfNotEmpty(distance_explanation.str(), listener->stream());
+      return distance_matches;
+    }
+
+   private:
+    const Matcher<DistanceType> distance_matcher_;
+  };
+
+ private:
+  const DistanceMatcher distance_matcher_;
+};
+
+// Implements an equality matcher for any STL-style container whose elements
+// support ==. This matcher is like Eq(), but its failure explanations provide
+// more detailed information that is useful when the container is used as a set.
+// The failure message reports elements that are in one of the operands but not
+// the other. The failure messages do not report duplicate or out-of-order
+// elements in the containers (which do not matter for sets, but can
+// occur if the containers are vectors or lists, for example).
+//
+// Uses the container's const_iterator, value_type, operator ==,
+// begin(), and end().
+template <typename Container>
+class ContainerEqMatcher {
+ public:
+ typedef internal::StlContainerView<Container> View;
+ typedef typename View::type StlContainer;
+ typedef typename View::const_reference StlContainerReference;
+
+ static_assert(!std::is_const<Container>::value,
+ "Container type must not be const");
+ static_assert(!std::is_reference<Container>::value,
+ "Container type must not be a reference");
+
+ // We make a copy of expected in case the elements in it are modified
+ // after this matcher is created.
+ explicit ContainerEqMatcher(const Container& expected)
+ : expected_(View::Copy(expected)) {}
+
+ void DescribeTo(::std::ostream* os) const {
+ *os << "equals ";
+ UniversalPrint(expected_, os);
+ }
+ void DescribeNegationTo(::std::ostream* os) const {
+ *os << "does not equal ";
+ UniversalPrint(expected_, os);
+ }
+
+ template <typename LhsContainer>
+ bool MatchAndExplain(const LhsContainer& lhs,
+ MatchResultListener* listener) const {
+ typedef internal::StlContainerView<
+ typename std::remove_const<LhsContainer>::type>
+ LhsView;
+ StlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+ if (lhs_stl_container == expected_) return true;  // Equal: nothing to explain.
+
+ ::std::ostream* const os = listener->stream();
+ if (os != nullptr) {  // Build the explanation only when there is a stream to write it to.
+ // Something is different. Check for extra values first.
+ bool printed_header = false;
+ for (auto it = lhs_stl_container.begin(); it != lhs_stl_container.end();
+ ++it) {
+ if (internal::ArrayAwareFind(expected_.begin(), expected_.end(), *it) ==
+ expected_.end()) {
+ if (printed_header) {
+ *os << ", ";
+ } else {
+ *os << "which has these unexpected elements: ";
+ printed_header = true;
+ }
+ UniversalPrint(*it, os);
+ }
+ }
+
+ // Now check for missing values.
+ bool printed_header2 = false;
+ for (auto it = expected_.begin(); it != expected_.end(); ++it) {
+ if (internal::ArrayAwareFind(lhs_stl_container.begin(),
+ lhs_stl_container.end(),
+ *it) == lhs_stl_container.end()) {
+ if (printed_header2) {
+ *os << ", ";
+ } else {
+ *os << (printed_header ? ",\nand" : "which")  // Join with the "unexpected" list if one was printed.
+ << " doesn't have these expected elements: ";
+ printed_header2 = true;
+ }
+ UniversalPrint(*it, os);
+ }
+ }
+ }
+
+ return false;
+ }
+
+ private:
+ const StlContainer expected_;  // Copy of the expected container, taken at construction.
+};
+
+// Comparator functor that orders two (possibly differently typed) values
+// with operator<.
+struct LessComparator {
+  // Returns true if and only if `a < b`.
+  template <typename T, typename U>
+  bool operator()(const T& a, const U& b) const {
+    const bool a_is_less = a < b;
+    return a_is_less;
+  }
+};
+
+// Implements WhenSortedBy(comparator, container_matcher): sorts a copy of the container, then matches the copy.
+template <typename Comparator, typename ContainerMatcher>
+class WhenSortedByMatcher {
+ public:
+ WhenSortedByMatcher(const Comparator& comparator,
+ const ContainerMatcher& matcher)
+ : comparator_(comparator), matcher_(matcher) {}
+
+ template <typename LhsContainer>
+ operator Matcher<LhsContainer>() const {
+ return MakeMatcher(new Impl<LhsContainer>(comparator_, matcher_));
+ }
+
+ template <typename LhsContainer>
+ class Impl : public MatcherInterface<LhsContainer> {
+ public:
+ typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
+ LhsContainer)>
+ LhsView;
+ typedef typename LhsView::type LhsStlContainer;
+ typedef typename LhsView::const_reference LhsStlContainerReference;
+ // Transforms std::pair<const Key, Value> into std::pair<Key, Value>
+ // so that we can match associative containers.
+ typedef
+ typename RemoveConstFromKey<typename LhsStlContainer::value_type>::type
+ LhsValue;
+
+ Impl(const Comparator& comparator, const ContainerMatcher& matcher)
+ : comparator_(comparator), matcher_(matcher) {}
+
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "(when sorted) ";
+ matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "(when sorted) ";
+ matcher_.DescribeNegationTo(os);
+ }
+
+ bool MatchAndExplain(LhsContainer lhs,
+ MatchResultListener* listener) const override {
+ LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+ ::std::vector<LhsValue> sorted_container(lhs_stl_container.begin(),  // Copy; the argument itself is not reordered.
+ lhs_stl_container.end());
+ ::std::sort(sorted_container.begin(), sorted_container.end(),
+ comparator_);
+
+ if (!listener->IsInterested()) {
+ // If the listener is not interested, we do not need to
+ // construct the inner explanation.
+ return matcher_.Matches(sorted_container);
+ }
+
+ *listener << "which is ";
+ UniversalPrint(sorted_container, listener->stream());
+ *listener << " when sorted";
+
+ StringMatchResultListener inner_listener;
+ const bool match =
+ matcher_.MatchAndExplain(sorted_container, &inner_listener);
+ PrintIfNotEmpty(inner_listener.str(), listener->stream());
+ return match;
+ }
+
+ private:
+ const Comparator comparator_;
+ const Matcher<const ::std::vector<LhsValue>&> matcher_;  // Always applied to the sorted vector copy.
+
+ Impl(const Impl&) = delete;  // Non-copyable.
+ Impl& operator=(const Impl&) = delete;
+ };
+
+ private:
+ const Comparator comparator_;
+ const ContainerMatcher matcher_;
+};
+
+// Implements Pointwise(tuple_matcher, rhs_container). tuple_matcher
+// must be able to be safely cast to Matcher<std::tuple<const T1&, const
+// T2&> >, where T1 and T2 are the types of elements in the LHS
+// container and the RHS container respectively.
+template <typename TupleMatcher, typename RhsContainer>
+class PointwiseMatcher {
+ static_assert(
+ !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(RhsContainer)>::value,
+ "use UnorderedPointwise with hash tables");
+
+ public:
+ typedef internal::StlContainerView<RhsContainer> RhsView;
+ typedef typename RhsView::type RhsStlContainer;
+ typedef typename RhsStlContainer::value_type RhsValue;
+
+ static_assert(!std::is_const<RhsContainer>::value,
+ "RhsContainer type must not be const");
+ static_assert(!std::is_reference<RhsContainer>::value,
+ "RhsContainer type must not be a reference");
+
+ // Like ContainerEq, we make a copy of rhs in case the elements in
+ // it are modified after this matcher is created.
+ PointwiseMatcher(const TupleMatcher& tuple_matcher, const RhsContainer& rhs)
+ : tuple_matcher_(tuple_matcher), rhs_(RhsView::Copy(rhs)) {}
+
+ template <typename LhsContainer>
+ operator Matcher<LhsContainer>() const {
+ static_assert(
+ !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(LhsContainer)>::value,
+ "use UnorderedPointwise with hash tables");
+
+ return Matcher<LhsContainer>(
+ new Impl<const LhsContainer&>(tuple_matcher_, rhs_));
+ }
+
+ template <typename LhsContainer>
+ class Impl : public MatcherInterface<LhsContainer> {
+ public:
+ typedef internal::StlContainerView<GTEST_REMOVE_REFERENCE_AND_CONST_(
+ LhsContainer)>
+ LhsView;
+ typedef typename LhsView::type LhsStlContainer;
+ typedef typename LhsView::const_reference LhsStlContainerReference;
+ typedef typename LhsStlContainer::value_type LhsValue;
+ // We pass the LHS value and the RHS value to the inner matcher by
+ // reference, as they may be expensive to copy. We must use tuple
+ // instead of pair here, as a pair cannot hold references (C++ 98,
+ // 20.2.2 [lib.pairs]).
+ typedef ::std::tuple<const LhsValue&, const RhsValue&> InnerMatcherArg;
+
+ Impl(const TupleMatcher& tuple_matcher, const RhsStlContainer& rhs)
+ // mono_tuple_matcher_ holds a monomorphic version of the tuple matcher.
+ : mono_tuple_matcher_(SafeMatcherCast<InnerMatcherArg>(tuple_matcher)),
+ rhs_(rhs) {}
+
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "contains " << rhs_.size()
+ << " values, where each value and its corresponding value in ";
+ UniversalPrinter<RhsStlContainer>::Print(rhs_, os);
+ *os << " ";
+ mono_tuple_matcher_.DescribeTo(os);
+ }
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "doesn't contain exactly " << rhs_.size()
+ << " values, or contains a value x at some index i"
+ << " where x and the i-th value of ";
+ UniversalPrint(rhs_, os);
+ *os << " ";
+ mono_tuple_matcher_.DescribeNegationTo(os);
+ }
+
+ bool MatchAndExplain(LhsContainer lhs,
+ MatchResultListener* listener) const override {
+ LhsStlContainerReference lhs_stl_container = LhsView::ConstReference(lhs);
+ const size_t actual_size = lhs_stl_container.size();
+ if (actual_size != rhs_.size()) {  // A size mismatch fails before any element is compared.
+ *listener << "which contains " << actual_size << " values";
+ return false;
+ }
+
+ auto left = lhs_stl_container.begin();
+ auto right = rhs_.begin();
+ for (size_t i = 0; i != actual_size; ++i, ++left, ++right) {  // Walk both containers in lockstep.
+ if (listener->IsInterested()) {
+ StringMatchResultListener inner_listener;
+ // Create InnerMatcherArg as a temporary object so that it does not
+ // outlive *left and *right. Dereferencing, or the conversion to
+ // `const T&`, may return temporary objects, e.g. for vector<bool>.
+ if (!mono_tuple_matcher_.MatchAndExplain(
+ InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left),
+ ImplicitCast_<const RhsValue&>(*right)),
+ &inner_listener)) {
+ *listener << "where the value pair (";
+ UniversalPrint(*left, listener->stream());
+ *listener << ", ";
+ UniversalPrint(*right, listener->stream());
+ *listener << ") at index #" << i << " don't match";
+ PrintIfNotEmpty(inner_listener.str(), listener->stream());
+ return false;
+ }
+ } else {  // Nobody reads the explanation: only the verdict is needed.
+ if (!mono_tuple_matcher_.Matches(
+ InnerMatcherArg(ImplicitCast_<const LhsValue&>(*left),
+ ImplicitCast_<const RhsValue&>(*right))))
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private:
+ const Matcher<InnerMatcherArg> mono_tuple_matcher_;
+ const RhsStlContainer rhs_;
+ };
+
+ private:
+ const TupleMatcher tuple_matcher_;
+ const RhsStlContainer rhs_;  // Copy of the RHS container, taken at construction.
+};
+
+// Holds the element-iteration logic shared by ContainsMatcherImpl, EachMatcherImpl and ContainsTimesMatcherImpl.
+template <typename Container>
+class QuantifierMatcherImpl : public MatcherInterface<Container> {
+ public:
+ typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+ typedef StlContainerView<RawContainer> View;
+ typedef typename View::type StlContainer;
+ typedef typename View::const_reference StlContainerReference;
+ typedef typename StlContainer::value_type Element;
+
+ template <typename InnerMatcher>
+ explicit QuantifierMatcherImpl(InnerMatcher inner_matcher)
+ : inner_matcher_(
+ testing::SafeMatcherCast<const Element&>(inner_matcher)) {}
+
+ // Checks whether:
+ // * All elements in the container match, if all_elements_should_match.
+ // * Any element in the container matches, if !all_elements_should_match.
+ bool MatchAndExplainImpl(bool all_elements_should_match, Container container,
+ MatchResultListener* listener) const {
+ StlContainerReference stl_container = View::ConstReference(container);
+ size_t i = 0;
+ for (auto it = stl_container.begin(); it != stl_container.end();
+ ++it, ++i) {
+ StringMatchResultListener inner_listener;
+ const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener);
+
+ if (matches != all_elements_should_match) {  // This element decides the outcome; stop here.
+ *listener << "whose element #" << i
+ << (matches ? " matches" : " doesn't match");
+ PrintIfNotEmpty(inner_listener.str(), listener->stream());
+ return !all_elements_should_match;
+ }
+ }
+ return all_elements_should_match;  // No deciding element found (also the result for an empty container).
+ }
+
+ bool MatchAndExplainImpl(const Matcher<size_t>& count_matcher,  // Counts matching elements, then matches the count.
+ Container container,
+ MatchResultListener* listener) const {
+ StlContainerReference stl_container = View::ConstReference(container);
+ size_t i = 0;
+ std::vector<size_t> match_elements;  // Indices of the elements that matched.
+ for (auto it = stl_container.begin(); it != stl_container.end();
+ ++it, ++i) {
+ StringMatchResultListener inner_listener;  // Per-element explanation; not reported by this overload.
+ const bool matches = inner_matcher_.MatchAndExplain(*it, &inner_listener);
+ if (matches) {
+ match_elements.push_back(i);
+ }
+ }
+ if (listener->IsInterested()) {
+ if (match_elements.empty()) {
+ *listener << "has no element that matches";
+ } else if (match_elements.size() == 1) {
+ *listener << "whose element #" << match_elements[0] << " matches";
+ } else {
+ *listener << "whose elements (";
+ std::string sep = "";
+ for (size_t e : match_elements) {
+ *listener << sep << e;
+ sep = ", ";
+ }
+ *listener << ") match";
+ }
+ }
+ StringMatchResultListener count_listener;
+ if (count_matcher.MatchAndExplain(match_elements.size(), &count_listener)) {
+ *listener << " and whose match quantity of " << match_elements.size()
+ << " matches";
+ PrintIfNotEmpty(count_listener.str(), listener->stream());
+ return true;
+ } else {
+ if (match_elements.empty()) {  // Pick the conjunction that reads naturally after the element summary.
+ *listener << " and";
+ } else {
+ *listener << " but";
+ }
+ *listener << " whose match quantity of " << match_elements.size()
+ << " does not match";
+ PrintIfNotEmpty(count_listener.str(), listener->stream());
+ return false;
+ }
+ }
+
+ protected:
+ const Matcher<const Element&> inner_matcher_;  // Applied to each element; read by derived classes for descriptions.
+};
+
+// Monomorphic implementation of Contains(element_matcher) for the argument
+// type Container: the match succeeds once any element satisfies the inner
+// matcher. Counterpart of EachMatcherImpl.
+template <typename Container>
+class ContainsMatcherImpl : public QuantifierMatcherImpl<Container> {
+  using Base = QuantifierMatcherImpl<Container>;
+
+ public:
+  template <typename InnerMatcher>
+  explicit ContainsMatcherImpl(InnerMatcher inner_matcher)
+      : Base(inner_matcher) {}
+
+  // Writes what a successful match means.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "contains at least one element that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  // Writes what a failed match means; the inner matcher is described
+  // positively because the negation is carried by the prefix.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "doesn't contain any element that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    // One matching element is enough, so not every element has to match.
+    const bool all_elements_should_match = false;
+    return this->MatchAndExplainImpl(all_elements_should_match, container,
+                                     listener);
+  }
+};
+
+// Monomorphic implementation of Each(element_matcher) for the argument type
+// Container: the match succeeds only if every element satisfies the inner
+// matcher. Counterpart of ContainsMatcherImpl.
+template <typename Container>
+class EachMatcherImpl : public QuantifierMatcherImpl<Container> {
+  using Base = QuantifierMatcherImpl<Container>;
+
+ public:
+  template <typename InnerMatcher>
+  explicit EachMatcherImpl(InnerMatcher inner_matcher)
+      : Base(inner_matcher) {}
+
+  // Writes what a successful match means.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "only contains elements that ";
+    this->inner_matcher_.DescribeTo(os);
+  }
+
+  // Writes what a failed match means: some element fails the inner matcher.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "contains some element that ";
+    this->inner_matcher_.DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    // Every element is required to satisfy the inner matcher.
+    const bool all_elements_should_match = true;
+    return this->MatchAndExplainImpl(all_elements_should_match, container,
+                                     listener);
+  }
+};
+
+// Monomorphic implementation of Contains(element_matcher).Times(n) for the
+// argument type Container: the number of elements satisfying the inner
+// matcher must itself satisfy the count matcher.
+template <typename Container>
+class ContainsTimesMatcherImpl : public QuantifierMatcherImpl<Container> {
+  using Base = QuantifierMatcherImpl<Container>;
+
+ public:
+  template <typename InnerMatcher>
+  explicit ContainsTimesMatcherImpl(InnerMatcher inner_matcher,
+                                    Matcher<size_t> count_matcher)
+      : Base(inner_matcher), count_matcher_(std::move(count_matcher)) {}
+
+  void DescribeTo(::std::ostream* os) const override {
+    DescribeElementPart(os);
+    count_matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const override {
+    DescribeElementPart(os);
+    count_matcher_.DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(Container container,
+                       MatchResultListener* listener) const override {
+    return this->MatchAndExplainImpl(count_matcher_, container, listener);
+  }
+
+ private:
+  // Writes the element-related prefix shared by both descriptions; only the
+  // count matcher's part differs between them.
+  void DescribeElementPart(::std::ostream* os) const {
+    *os << "quantity of elements that match ";
+    this->inner_matcher_.DescribeTo(os);
+    *os << " ";
+  }
+
+  const Matcher<size_t> count_matcher_;
+};
+
+// Polymorphic Contains(element_matcher).Times(n): holds the element matcher
+// and the count matcher until the container type is known.
+template <typename M>
+class ContainsTimesMatcher {
+ public:
+  explicit ContainsTimesMatcher(M m, Matcher<size_t> count_matcher)
+      : inner_matcher_(m), count_matcher_(std::move(count_matcher)) {}
+
+  // Builds the monomorphic matcher; the implementation always receives the
+  // container as a const reference.
+  template <typename Container>
+  operator Matcher<Container>() const {  // NOLINT
+    using Impl = ContainsTimesMatcherImpl<const Container&>;
+    return Matcher<Container>(new Impl(inner_matcher_, count_matcher_));
+  }
+
+ private:
+  const M inner_matcher_;
+  const Matcher<size_t> count_matcher_;
+};
+
+// Polymorphic Contains(element_matcher): holds the element matcher until the
+// container type is known.
+template <typename M>
+class ContainsMatcher {
+ public:
+  explicit ContainsMatcher(M m) : inner_matcher_(m) {}
+
+  // Builds the monomorphic matcher; the implementation always receives the
+  // container as a const reference.
+  template <typename Container>
+  operator Matcher<Container>() const {  // NOLINT
+    using Impl = ContainsMatcherImpl<const Container&>;
+    return Matcher<Container>(new Impl(inner_matcher_));
+  }
+
+  // Returns a matcher that constrains how many elements may satisfy the
+  // element matcher.
+  ContainsTimesMatcher<M> Times(Matcher<size_t> count_matcher) const {
+    ContainsTimesMatcher<M> times_matcher(inner_matcher_,
+                                          std::move(count_matcher));
+    return times_matcher;
+  }
+
+ private:
+  const M inner_matcher_;
+};
+
+// Polymorphic Each(element_matcher): holds the element matcher until the
+// container type is known.
+template <typename M>
+class EachMatcher {
+ public:
+  explicit EachMatcher(M m) : inner_matcher_(m) {}
+
+  // Builds the monomorphic matcher; the implementation always receives the
+  // container as a const reference.
+  template <typename Container>
+  operator Matcher<Container>() const {  // NOLINT
+    using Impl = EachMatcherImpl<const Container&>;
+    return Matcher<Container>(new Impl(inner_matcher_));
+  }
+
+ private:
+  const M inner_matcher_;
+};
+
+// Overload-ranking tags. Rank0 derives from Rank1, so a call that passes
+// Rank0() prefers an overload taking Rank0 and falls back to one taking
+// Rank1 when the Rank0 overload is not viable.
+struct Rank1 {};
+struct Rank0 : Rank1 {};
+
+// Accessors for the two halves of a pair-like object. When called with
+// Rank0(), the `.first` / `.second` members are used if they exist;
+// otherwise get<0>() / get<1>() is used (std::get or one found by
+// argument-dependent lookup).
+namespace pair_getters {
+using std::get;
+
+// First element: fallback through get<0>().
+template <typename T>
+auto First(T& pair_like, Rank1) -> decltype(get<0>(pair_like)) {  // NOLINT
+  return get<0>(pair_like);
+}
+// First element: preferred access through the `first` member.
+template <typename T>
+auto First(T& pair_like, Rank0) -> decltype((pair_like.first)) {  // NOLINT
+  return pair_like.first;
+}
+
+// Second element: fallback through get<1>().
+template <typename T>
+auto Second(T& pair_like, Rank1) -> decltype(get<1>(pair_like)) {  // NOLINT
+  return get<1>(pair_like);
+}
+// Second element: preferred access through the `second` member.
+template <typename T>
+auto Second(T& pair_like, Rank0) -> decltype((pair_like.second)) {  // NOLINT
+  return pair_like.second;
+}
+}  // namespace pair_getters
+
+// Monomorphic implementation of Key(inner_matcher) for the given pair type.
+// Key(inner_matcher) matches a pair whose first element (the key) satisfies
+// inner_matcher. For example, Contains(Key(Ge(5))) matches a std::map that
+// holds at least one element whose key is >= 5.
+template <typename PairType>
+class KeyMatcherImpl : public MatcherInterface<PairType> {
+ public:
+  using RawPairType = GTEST_REMOVE_REFERENCE_AND_CONST_(PairType);
+  using KeyType = typename RawPairType::first_type;
+
+  template <typename InnerMatcher>
+  explicit KeyMatcherImpl(InnerMatcher inner_matcher)
+      : inner_matcher_(
+            testing::SafeMatcherCast<const KeyType&>(inner_matcher)) {}
+
+  // Matches if and only if the key of `key_value` satisfies the inner
+  // matcher. An explanation is emitted only when the inner matcher gave one.
+  bool MatchAndExplain(PairType key_value,
+                       MatchResultListener* listener) const override {
+    StringMatchResultListener key_listener;
+    const bool key_matches = inner_matcher_.MatchAndExplain(
+        pair_getters::First(key_value, Rank0()), &key_listener);
+    const std::string key_explanation = key_listener.str();
+    if (!key_explanation.empty()) {
+      *listener << "whose first field is a value " << key_explanation;
+    }
+    return key_matches;
+  }
+
+  // Writes what a successful match means.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "has a key that ";
+    inner_matcher_.DescribeTo(os);
+  }
+
+  // Writes what a failed match means; the inner matcher is described
+  // positively because the negation is carried by the prefix.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "doesn't have a key that ";
+    inner_matcher_.DescribeTo(os);
+  }
+
+ private:
+  const Matcher<const KeyType&> inner_matcher_;
+};
+
+// Polymorphic Key(matcher_for_key): holds the key matcher until the pair
+// type is known.
+template <typename M>
+class KeyMatcher {
+ public:
+  explicit KeyMatcher(M m) : matcher_for_key_(m) {}
+
+  // Builds the monomorphic matcher; the implementation always receives the
+  // pair as a const reference.
+  template <typename PairType>
+  operator Matcher<PairType>() const {
+    using Impl = KeyMatcherImpl<const PairType&>;
+    return Matcher<PairType>(new Impl(matcher_for_key_));
+  }
+
+ private:
+  const M matcher_for_key_;
+};
+
+// Polymorphic Address(matcher_for_address): matches an object whose address
+// satisfies the given matcher.
+template <typename InnerMatcher>
+class AddressMatcher {
+ public:
+  explicit AddressMatcher(InnerMatcher m) : matcher_(m) {}
+
+  // Builds the monomorphic matcher; the implementation always receives the
+  // object as a const reference.
+  template <typename Type>
+  operator Matcher<Type>() const {  // NOLINT
+    return Matcher<Type>(new Impl<const Type&>(matcher_));
+  }
+
+ private:
+  // Monomorphic implementation for one particular object type.
+  template <typename Type>
+  class Impl : public MatcherInterface<Type> {
+   public:
+    // Pointer-to-const of the object type, with references and const removed.
+    using Address = const GTEST_REMOVE_REFERENCE_AND_CONST_(Type) *;
+
+    explicit Impl(const InnerMatcher& matcher)
+        : matcher_(MatcherCast<Address>(matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "has address that ";
+      matcher_.DescribeTo(os);
+    }
+
+    // The inner matcher is described positively because the negation is
+    // carried by the prefix.
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "does not have address that ";
+      matcher_.DescribeTo(os);
+    }
+
+    bool MatchAndExplain(Type object,
+                         MatchResultListener* listener) const override {
+      *listener << "which has address ";
+      // std::addressof yields the real address even if operator& is
+      // overloaded for the object's type.
+      Address object_address = std::addressof(object);
+      return MatchPrintAndExplain(object_address, matcher_, listener);
+    }
+
+   private:
+    const Matcher<Address> matcher_;
+  };
+
+  const InnerMatcher matcher_;
+};
+
+// Monomorphic implementation of Pair(first_matcher, second_matcher) for the
+// given pair type: each of the two elements is matched by its own matcher.
+template <typename PairType>
+class PairMatcherImpl : public MatcherInterface<PairType> {
+ public:
+  using RawPairType = GTEST_REMOVE_REFERENCE_AND_CONST_(PairType);
+  using FirstType = typename RawPairType::first_type;
+  using SecondType = typename RawPairType::second_type;
+
+  template <typename FirstMatcher, typename SecondMatcher>
+  PairMatcherImpl(FirstMatcher first_matcher, SecondMatcher second_matcher)
+      : first_matcher_(
+            testing::SafeMatcherCast<const FirstType&>(first_matcher)),
+        second_matcher_(
+            testing::SafeMatcherCast<const SecondType&>(second_matcher)) {}
+
+  // Writes what a successful match means.
+  void DescribeTo(::std::ostream* os) const override {
+    *os << "has a first field that ";
+    first_matcher_.DescribeTo(os);
+    *os << ", and has a second field that ";
+    second_matcher_.DescribeTo(os);
+  }
+
+  // Writes what a failed match means: either element fails its matcher.
+  void DescribeNegationTo(::std::ostream* os) const override {
+    *os << "has a first field that ";
+    first_matcher_.DescribeNegationTo(os);
+    *os << ", or has a second field that ";
+    second_matcher_.DescribeNegationTo(os);
+  }
+
+  // Matches if and only if the first element of `a_pair` satisfies
+  // first_matcher and the second element satisfies second_matcher. The
+  // second element is examined only after the first one has matched.
+  bool MatchAndExplain(PairType a_pair,
+                       MatchResultListener* listener) const override {
+    // Fast path: nobody reads the explanation, so only the verdict matters.
+    if (!listener->IsInterested()) {
+      if (!first_matcher_.Matches(pair_getters::First(a_pair, Rank0()))) {
+        return false;
+      }
+      return second_matcher_.Matches(pair_getters::Second(a_pair, Rank0()));
+    }
+
+    StringMatchResultListener first_listener;
+    const bool first_matches = first_matcher_.MatchAndExplain(
+        pair_getters::First(a_pair, Rank0()), &first_listener);
+    if (!first_matches) {
+      *listener << "whose first field does not match";
+      PrintIfNotEmpty(first_listener.str(), listener->stream());
+      return false;
+    }
+
+    StringMatchResultListener second_listener;
+    const bool second_matches = second_matcher_.MatchAndExplain(
+        pair_getters::Second(a_pair, Rank0()), &second_listener);
+    if (!second_matches) {
+      *listener << "whose second field does not match";
+      PrintIfNotEmpty(second_listener.str(), listener->stream());
+      return false;
+    }
+
+    ExplainSuccess(first_listener.str(), second_listener.str(), listener);
+    return true;
+  }
+
+ private:
+  // Writes the explanation for a successful match, appending whichever of
+  // the two per-element explanations are non-empty.
+  void ExplainSuccess(const std::string& first_explanation,
+                      const std::string& second_explanation,
+                      MatchResultListener* listener) const {
+    const bool has_first = !first_explanation.empty();
+
+    *listener << "whose both fields match";
+    if (has_first) {
+      *listener << ", where the first field is a value " << first_explanation;
+    }
+    if (second_explanation.empty()) return;
+
+    // The connective depends on whether a first-field clause precedes it.
+    const char* const connective = has_first ? "and " : "where ";
+    *listener << ", ";
+    *listener << connective;
+    *listener << "the second field is a value " << second_explanation;
+  }
+
+  const Matcher<const FirstType&> first_matcher_;
+  const Matcher<const SecondType&> second_matcher_;
+};
+
+// Implements polymorphic Pair(first_matcher, second_matcher).
+template <typename FirstMatcher, typename SecondMatcher>
+class PairMatcher {
+ public:
+  // Keeps copies of the two (possibly polymorphic) field matchers.
+  PairMatcher(FirstMatcher first_matcher, SecondMatcher second_matcher)
+      : first_matcher_(first_matcher), second_matcher_(second_matcher) {}
+
+  // Produces a monomorphic matcher for PairType, backed by a
+  // PairMatcherImpl instantiated on `const PairType&`.
+  template <typename PairType>
+  operator Matcher<PairType>() const {
+    using Impl = PairMatcherImpl<const PairType&>;
+    return Matcher<PairType>(new Impl(first_matcher_, second_matcher_));
+  }
+
+ private:
+  const FirstMatcher first_matcher_;
+  const SecondMatcher second_matcher_;
+};
+
+// Tuple-like overload: builds a std::tie over get<I>(t) for each index in the
+// sequence. The trailing decltype removes this overload from overload
+// resolution when get<I>(t) is not a valid expression for T. The final `int`
+// parameter is an exact match for the literal 0 that UnpackStruct passes, so
+// this overload outranks the `char` overloads when both are viable.
+template <typename T, size_t... I>
+auto UnpackStructImpl(const T& t, IndexSequence<I...>, int)
+ -> decltype(std::tie(get<I>(t)...)) {
+ // Reject tuple-likes whose std::tuple_size differs from the requested
+ // number of indices.
+ static_assert(std::tuple_size<T>::value == sizeof...(I),
+ "Number of arguments doesn't match the number of fields.");
+ return std::tie(get<I>(t)...);
+}
+
+// Structured-binding overloads of UnpackStructImpl, compiled only when the
+// compiler advertises structured bindings. There is one overload per field
+// count from 1 to 16; each decomposes `t` into that many bindings and returns
+// a std::tie of references to them. The trailing `char` parameter requires a
+// conversion from the literal 0 that UnpackStruct passes, so these rank below
+// the `int` (tuple-like) overload when both are viable.
+#if defined(__cpp_structured_bindings) && __cpp_structured_bindings >= 201606
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<1>, char) {
+ const auto& [a] = t;
+ return std::tie(a);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<2>, char) {
+ const auto& [a, b] = t;
+ return std::tie(a, b);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<3>, char) {
+ const auto& [a, b, c] = t;
+ return std::tie(a, b, c);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<4>, char) {
+ const auto& [a, b, c, d] = t;
+ return std::tie(a, b, c, d);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<5>, char) {
+ const auto& [a, b, c, d, e] = t;
+ return std::tie(a, b, c, d, e);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<6>, char) {
+ const auto& [a, b, c, d, e, f] = t;
+ return std::tie(a, b, c, d, e, f);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<7>, char) {
+ const auto& [a, b, c, d, e, f, g] = t;
+ return std::tie(a, b, c, d, e, f, g);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<8>, char) {
+ const auto& [a, b, c, d, e, f, g, h] = t;
+ return std::tie(a, b, c, d, e, f, g, h);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<9>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<10>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<11>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<12>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k, l] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k, l);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<13>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<14>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<15>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o);
+}
+template <typename T>
+auto UnpackStructImpl(const T& t, MakeIndexSequence<16>, char) {
+ const auto& [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p] = t;
+ return std::tie(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+}
+#endif // defined(__cpp_structured_bindings)
+
+// Returns whatever the selected UnpackStructImpl overload returns for an
+// object expected to expose I fields (a std::tie of references in every
+// overload visible here). The literal 0 steers overload resolution between
+// the `int` and `char` overloads. The function name is parenthesized, which
+// suppresses argument-dependent lookup for the call.
+template <size_t I, typename T>
+auto UnpackStruct(const T& t)
+ -> decltype((UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0)) {
+ return (UnpackStructImpl)(t, MakeIndexSequence<I>{}, 0);
+}
+
+// Helper function to do comma folding in C++11.
+// The caller passes a braced list built from a pack expansion; the elements
+// of a braced list are evaluated left to right, so the expressions run in
+// pack order. The function itself does nothing with the resulting array.
+// Usage: VariadicExpand({expr...});
+template <typename T, size_t N>
+void VariadicExpand(const T (&)[N]) {}
+
+// Primary template; only the IndexSequence specialization below is defined.
+template <typename Struct, typename StructSize>
+class FieldsAreMatcherImpl;
+
+// Matches a struct or tuple-like object field by field. The object is
+// unpacked with UnpackStruct into a tuple of references, and field #I is
+// checked against the I-th stored matcher.
+template <typename Struct, size_t... I>
+class FieldsAreMatcherImpl<Struct, IndexSequence<I...>>
+ : public MatcherInterface<Struct> {
+ // Tuple type produced by unpacking a const Struct& into sizeof...(I) fields.
+ using UnpackedType =
+ decltype(UnpackStruct<sizeof...(I)>(std::declval<const Struct&>()));
+ // One monomorphic matcher per field, each taking a const reference to the
+ // corresponding element type of UnpackedType.
+ using MatchersType = std::tuple<
+ Matcher<const typename std::tuple_element<I, UnpackedType>::type&>...>;
+
+ public:
+ // `matchers` must be usable with std::get<I> for every index; each element
+ // is converted to the field's matcher type with SafeMatcherCast.
+ template <typename Inner>
+ explicit FieldsAreMatcherImpl(const Inner& matchers)
+ : matchers_(testing::SafeMatcherCast<
+ const typename std::tuple_element<I, UnpackedType>::type&>(
+ std::get<I>(matchers))...) {}
+
+ // Describes each field's matcher in index order, joined with ", and ".
+ void DescribeTo(::std::ostream* os) const override {
+ const char* separator = "";
+ VariadicExpand(
+ {(*os << separator << "has field #" << I << " that ",
+ std::get<I>(matchers_).DescribeTo(os), separator = ", and ")...});
+ }
+
+ // Describes the negation of each field's matcher, joined with ", or ".
+ void DescribeNegationTo(::std::ostream* os) const override {
+ const char* separator = "";
+ VariadicExpand({(*os << separator << "has field #" << I << " that ",
+ std::get<I>(matchers_).DescribeNegationTo(os),
+ separator = ", or ")...});
+ }
+
+ // Unpacks `t` into a tuple of field references and delegates to
+ // MatchInternal.
+ bool MatchAndExplain(Struct t, MatchResultListener* listener) const override {
+ return MatchInternal((UnpackStruct<sizeof...(I)>)(t), listener);
+ }
+
+ private:
+ // Returns true if every field matches. When the listener is interested,
+ // explains the first failing field, or on success lists the non-empty
+ // per-field explanations.
+ bool MatchInternal(UnpackedType tuple, MatchResultListener* listener) const {
+ if (!listener->IsInterested()) {
+ // If the listener is not interested, we don't need to construct the
+ // explanation.
+ bool good = true;
+ // `good &&` short-circuits, so matchers after the first failure are not
+ // evaluated.
+ VariadicExpand({good = good && std::get<I>(matchers_).Matches(
+ std::get<I>(tuple))...});
+ return good;
+ }
+
+ // ~size_t{} (all bits set) is the sentinel for "no field has failed yet".
+ size_t failed_pos = ~size_t{};
+
+ std::vector<StringMatchResultListener> inner_listener(sizeof...(I));
+
+ // Run the matchers in index order until one fails. Once failed_pos is set,
+ // the leading comparison short-circuits and later matchers are skipped.
+ VariadicExpand(
+ {failed_pos == ~size_t{} && !std::get<I>(matchers_).MatchAndExplain(
+ std::get<I>(tuple), &inner_listener[I])
+ ? failed_pos = I
+ : 0 ...});
+ if (failed_pos != ~size_t{}) {
+ *listener << "whose field #" << failed_pos << " does not match";
+ PrintIfNotEmpty(inner_listener[failed_pos].str(), listener->stream());
+ return false;
+ }
+
+ *listener << "whose all elements match";
+ // The first explained field is introduced with ", where"; later ones with
+ // ", and". Fields with an empty explanation are omitted.
+ const char* separator = ", where";
+ for (size_t index = 0; index < sizeof...(I); ++index) {
+ const std::string str = inner_listener[index].str();
+ if (!str.empty()) {
+ *listener << separator << " field #" << index << " is a value " << str;
+ separator = ", and";
+ }
+ }
+
+ return true;
+ }
+
+ MatchersType matchers_;
+};
+
+// Polymorphic holder for per-field matchers; converts on demand to a
+// monomorphic Matcher<Struct> backed by FieldsAreMatcherImpl.
+template <typename... Inner>
+class FieldsAreMatcher {
+ public:
+  // Takes the field matchers by value and moves them into the stored tuple.
+  explicit FieldsAreMatcher(Inner... inner) : matchers_(std::move(inner)...) {}
+
+  // Builds an implementation over `const Struct&` with one index per stored
+  // matcher.
+  template <typename Struct>
+  operator Matcher<Struct>() const {  // NOLINT
+    using Impl =
+        FieldsAreMatcherImpl<const Struct&, IndexSequenceFor<Inner...>>;
+    return Matcher<Struct>(new Impl(matchers_));
+  }
+
+ private:
+  std::tuple<Inner...> matchers_;
+};
+
+// Implements ElementsAre() and ElementsAreArray().
+// Matches a container whose elements, in iteration order, match the stored
+// element matchers one-to-one and whose element count equals the number of
+// matchers.
+template <typename Container>
+class ElementsAreMatcherImpl : public MatcherInterface<Container> {
+ public:
+ typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+ typedef internal::StlContainerView<RawContainer> View;
+ typedef typename View::type StlContainer;
+ typedef typename View::const_reference StlContainerReference;
+ typedef typename StlContainer::value_type Element;
+
+ // Constructs the matcher from a sequence of element values or
+ // element matchers.
+ template <typename InputIter>
+ ElementsAreMatcherImpl(InputIter first, InputIter last) {
+ while (first != last) {
+ matchers_.push_back(MatcherCast<const Element&>(*first++));
+ }
+ }
+
+ // Describes what this matcher does.
+ void DescribeTo(::std::ostream* os) const override {
+ if (count() == 0) {
+ *os << "is empty";
+ } else if (count() == 1) {
+ *os << "has 1 element that ";
+ matchers_[0].DescribeTo(os);
+ } else {
+ // Two or more matchers: one line per element, comma-separated.
+ *os << "has " << Elements(count()) << " where\n";
+ for (size_t i = 0; i != count(); ++i) {
+ *os << "element #" << i << " ";
+ matchers_[i].DescribeTo(os);
+ if (i + 1 < count()) {
+ *os << ",\n";
+ }
+ }
+ }
+ }
+
+ // Describes what the negation of this matcher does.
+ void DescribeNegationTo(::std::ostream* os) const override {
+ if (count() == 0) {
+ *os << "isn't empty";
+ return;
+ }
+
+ // Either the size differs, or at least one element fails its matcher.
+ *os << "doesn't have " << Elements(count()) << ", or\n";
+ for (size_t i = 0; i != count(); ++i) {
+ *os << "element #" << i << " ";
+ matchers_[i].DescribeNegationTo(os);
+ if (i + 1 < count()) {
+ *os << ", or\n";
+ }
+ }
+ }
+
+ // Returns true if and only if the container has exactly count() elements
+ // and each element matches the matcher at the same index. Explanations are
+ // written to `listener` only when it is interested.
+ bool MatchAndExplain(Container container,
+ MatchResultListener* listener) const override {
+ // To work with stream-like "containers", we must only walk
+ // through the elements in one pass.
+
+ const bool listener_interested = listener->IsInterested();
+
+ // explanations[i] is the explanation of the element at index i.
+ ::std::vector<std::string> explanations(count());
+ StlContainerReference stl_container = View::ConstReference(container);
+ auto it = stl_container.begin();
+ size_t exam_pos = 0;
+ bool mismatch_found = false; // Have we found a mismatched element yet?
+
+ // Go through the elements and matchers in pairs, until we reach
+ // the end of either the elements or the matchers, or until we find a
+ // mismatch.
+ for (; it != stl_container.end() && exam_pos != count(); ++it, ++exam_pos) {
+ bool match; // Does the current element match the current matcher?
+ if (listener_interested) {
+ StringMatchResultListener s;
+ match = matchers_[exam_pos].MatchAndExplain(*it, &s);
+ explanations[exam_pos] = s.str();
+ } else {
+ match = matchers_[exam_pos].Matches(*it);
+ }
+
+ if (!match) {
+ mismatch_found = true;
+ // `break` skips the loop increment, so `it` and `exam_pos` still refer
+ // to the mismatching element.
+ break;
+ }
+ }
+ // If mismatch_found is true, 'exam_pos' is the index of the mismatch.
+
+ // Find how many elements the actual container has. We avoid
+ // calling size() s.t. this code works for stream-like "containers"
+ // that don't define size().
+ size_t actual_count = exam_pos;
+ for (; it != stl_container.end(); ++it) {
+ ++actual_count;
+ }
+
+ // A size mismatch is reported in preference to an element mismatch.
+ if (actual_count != count()) {
+ // The element count doesn't match. If the container is empty,
+ // there's no need to explain anything as Google Mock already
+ // prints the empty container. Otherwise we just need to show
+ // how many elements there actually are.
+ if (listener_interested && (actual_count != 0)) {
+ *listener << "which has " << Elements(actual_count);
+ }
+ return false;
+ }
+
+ if (mismatch_found) {
+ // The element count matches, but the exam_pos-th element doesn't match.
+ if (listener_interested) {
+ *listener << "whose element #" << exam_pos << " doesn't match";
+ PrintIfNotEmpty(explanations[exam_pos], listener->stream());
+ }
+ return false;
+ }
+
+ // Every element matches its expectation. We need to explain why
+ // (the obvious ones can be skipped).
+ if (listener_interested) {
+ bool reason_printed = false;
+ for (size_t i = 0; i != count(); ++i) {
+ const std::string& s = explanations[i];
+ if (!s.empty()) {
+ if (reason_printed) {
+ *listener << ",\nand ";
+ }
+ *listener << "whose element #" << i << " matches, " << s;
+ reason_printed = true;
+ }
+ }
+ }
+ return true;
+ }
+
+ private:
+ // Formats "<count> element" or "<count> elements" (singular only for 1).
+ static Message Elements(size_t count) {
+ return Message() << count << (count == 1 ? " element" : " elements");
+ }
+
+ // Number of element matchers, i.e. the expected container size.
+ size_t count() const { return matchers_.size(); }
+
+ ::std::vector<Matcher<const Element&>> matchers_;
+};
+
+// Connectivity matrix of (elements X matchers), in element-major order.
+// Initially, there are no edges.
+// Use NextGraph() to iterate over all possible edge configurations.
+// Use Randomize() to generate a random edge configuration.
+class GTEST_API_ MatchMatrix {
+ public:
+ // Creates a num_elements x num_matchers matrix with every cell set to 0
+ // (no edge).
+ MatchMatrix(size_t num_elements, size_t num_matchers)
+ : num_elements_(num_elements),
+ num_matchers_(num_matchers),
+ matched_(num_elements_ * num_matchers_, 0) {}
+
+ // Number of elements (left-hand side) and matchers (right-hand side).
+ size_t LhsSize() const { return num_elements_; }
+ size_t RhsSize() const { return num_matchers_; }
+ // Returns true if and only if the cell for (element ilhs, matcher irhs)
+ // holds 1. No bounds checking is performed here.
+ bool HasEdge(size_t ilhs, size_t irhs) const {
+ return matched_[SpaceIndex(ilhs, irhs)] == 1;
+ }
+ // Stores 1 in the cell for (element ilhs, matcher irhs) when `b` is true,
+ // 0 otherwise.
+ void SetEdge(size_t ilhs, size_t irhs, bool b) {
+ matched_[SpaceIndex(ilhs, irhs)] = b ? 1 : 0;
+ }
+
+ // Treating the connectivity matrix as a (LhsSize()*RhsSize())-bit number,
+ // adds 1 to that number; returns false if incrementing the graph left it
+ // empty.
+ bool NextGraph();
+
+ // Defined out of line; per the class comment, generates a random edge
+ // configuration.
+ void Randomize();
+
+ // Defined out of line; the output format is not visible here.
+ std::string DebugString() const;
+
+ private:
+ // Maps a (row, column) coordinate to its offset in the flattened storage.
+ size_t SpaceIndex(size_t ilhs, size_t irhs) const {
+ return ilhs * num_matchers_ + irhs;
+ }
+
+ size_t num_elements_;
+ size_t num_matchers_;
+
+ // Each element is a char interpreted as bool. They are stored as a
+ // flattened array in lhs-major order, use 'SpaceIndex()' to translate
+ // a (ilhs, irhs) matrix coordinate into an offset.
+ ::std::vector<char> matched_;
+};
+
+typedef ::std::pair<size_t, size_t> ElementMatcherPair;
+typedef ::std::vector<ElementMatcherPair> ElementMatcherPairs;
+
+// Returns a maximum bipartite matching for the specified graph 'g'.
+// The matching is represented as a vector of {element, matcher} pairs.
+GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g);
+
+// Bit flags selecting which unordered-matching requirement applies.
+// ExactMatch is the combination of both single-bit flags.
+// NOTE(review): the precise meaning of Superset vs. Subset (which side must
+// be fully matched) is defined by code outside this section; confirm there.
+struct UnorderedMatcherRequire {
+ enum Flags {
+ Superset = 1 << 0,
+ Subset = 1 << 1,
+ ExactMatch = Superset | Subset,
+ };
+};
+
+// Untyped base class for implementing UnorderedElementsAre. By
+// putting logic that's not specific to the element type here, we
+// reduce binary bloat and increase compilation speed.
+class GTEST_API_ UnorderedElementsAreMatcherImplBase {
+ protected:
+ // Records which matching requirement (see UnorderedMatcherRequire) this
+ // matcher enforces.
+ explicit UnorderedElementsAreMatcherImplBase(
+ UnorderedMatcherRequire::Flags matcher_flags)
+ : match_flags_(matcher_flags) {}
+
+ // A vector of matcher describers, one for each element matcher.
+ // Does not own the describers (and thus can be used only when the
+ // element matchers are alive).
+ typedef ::std::vector<const MatcherDescriberInterface*> MatcherDescriberVec;
+
+ // Describes this UnorderedElementsAre matcher.
+ void DescribeToImpl(::std::ostream* os) const;
+
+ // Describes the negation of this UnorderedElementsAre matcher.
+ void DescribeNegationToImpl(::std::ostream* os) const;
+
+ // Defined out of line. The typed subclass in this file calls it with the
+ // printed elements and the element-vs-matcher matrix before FindPairing,
+ // and fails the match if it returns false.
+ bool VerifyMatchMatrix(const ::std::vector<std::string>& element_printouts,
+ const MatchMatrix& matrix,
+ MatchResultListener* listener) const;
+
+ // Defined out of line. Called by the typed subclass after VerifyMatchMatrix
+ // succeeds; its result is the final match result.
+ bool FindPairing(const MatchMatrix& matrix,
+ MatchResultListener* listener) const;
+
+ // Mutable access so the typed subclass can register its describers.
+ MatcherDescriberVec& matcher_describers() { return matcher_describers_; }
+
+ // Formats "<n> element" or "<n> elements" (singular only for 1).
+ static Message Elements(size_t n) {
+ return Message() << n << " element" << (n == 1 ? "" : "s");
+ }
+
+ UnorderedMatcherRequire::Flags match_flags() const { return match_flags_; }
+
+ private:
+ UnorderedMatcherRequire::Flags match_flags_;
+ MatcherDescriberVec matcher_describers_;
+};
+
+// Implements UnorderedElementsAre, UnorderedElementsAreArray, IsSubsetOf, and
+// IsSupersetOf.
+template <typename Container>
+class UnorderedElementsAreMatcherImpl
+ : public MatcherInterface<Container>,
+ public UnorderedElementsAreMatcherImplBase {
+ public:
+ typedef GTEST_REMOVE_REFERENCE_AND_CONST_(Container) RawContainer;
+ typedef internal::StlContainerView<RawContainer> View;
+ typedef typename View::type StlContainer;
+ typedef typename View::const_reference StlContainerReference;
+ typedef typename StlContainer::value_type Element;
+
+ // Builds one element matcher per item in [first, last) and registers a
+ // describer for each with the base class.
+ template <typename InputIter>
+ UnorderedElementsAreMatcherImpl(UnorderedMatcherRequire::Flags matcher_flags,
+ InputIter first, InputIter last)
+ : UnorderedElementsAreMatcherImplBase(matcher_flags) {
+ for (; first != last; ++first) {
+ matchers_.push_back(MatcherCast<const Element&>(*first));
+ }
+ // Describers are collected in a second pass, after matchers_ has stopped
+ // growing. NOTE(review): presumably so the stored pointers cannot be
+ // invalidated by vector reallocation; confirm against
+ // Matcher::GetDescriber().
+ for (const auto& m : matchers_) {
+ matcher_describers().push_back(m.GetDescriber());
+ }
+ }
+
+ // Describes what this matcher does.
+ void DescribeTo(::std::ostream* os) const override {
+ return UnorderedElementsAreMatcherImplBase::DescribeToImpl(os);
+ }
+
+ // Describes what the negation of this matcher does.
+ void DescribeNegationTo(::std::ostream* os) const override {
+ return UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(os);
+ }
+
+ // Evaluates every (element, matcher) pair into a MatchMatrix, then defers
+ // to the base class to verify the matrix and find a pairing.
+ bool MatchAndExplain(Container container,
+ MatchResultListener* listener) const override {
+ StlContainerReference stl_container = View::ConstReference(container);
+ ::std::vector<std::string> element_printouts;
+ MatchMatrix matrix =
+ AnalyzeElements(stl_container.begin(), stl_container.end(),
+ &element_printouts, listener);
+
+ // No elements and no matchers: trivially a match.
+ if (matrix.LhsSize() == 0 && matrix.RhsSize() == 0) {
+ return true;
+ }
+
+ if (match_flags() == UnorderedMatcherRequire::ExactMatch) {
+ if (matrix.LhsSize() != matrix.RhsSize()) {
+ // The element count doesn't match. If the container is empty,
+ // there's no need to explain anything as Google Mock already
+ // prints the empty container. Otherwise we just need to show
+ // how many elements there actually are.
+ if (matrix.LhsSize() != 0 && listener->IsInterested()) {
+ *listener << "which has " << Elements(matrix.LhsSize());
+ }
+ return false;
+ }
+ }
+
+ return VerifyMatchMatrix(element_printouts, matrix, listener) &&
+ FindPairing(matrix, listener);
+ }
+
+ private:
+ // Walks [elem_first, elem_last) once, running every matcher against every
+ // element, and returns the resulting element-by-matcher MatchMatrix.
+ // *element_printouts is cleared and, only when the listener is interested,
+ // filled with the printed form of each element.
+ template <typename ElementIter>
+ MatchMatrix AnalyzeElements(ElementIter elem_first, ElementIter elem_last,
+ ::std::vector<std::string>* element_printouts,
+ MatchResultListener* listener) const {
+ element_printouts->clear();
+ // Match results in element-major order. They are buffered here because
+ // the element count, needed to size the matrix, is not known until the
+ // walk has finished.
+ ::std::vector<char> did_match;
+ size_t num_elements = 0;
+ DummyMatchResultListener dummy;
+ for (; elem_first != elem_last; ++num_elements, ++elem_first) {
+ if (listener->IsInterested()) {
+ element_printouts->push_back(PrintToString(*elem_first));
+ }
+ for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) {
+ did_match.push_back(
+ matchers_[irhs].MatchAndExplain(*elem_first, &dummy));
+ }
+ }
+
+ // Replay the buffered results into the matrix in the same order.
+ MatchMatrix matrix(num_elements, matchers_.size());
+ ::std::vector<char>::const_iterator did_match_iter = did_match.begin();
+ for (size_t ilhs = 0; ilhs != num_elements; ++ilhs) {
+ for (size_t irhs = 0; irhs != matchers_.size(); ++irhs) {
+ matrix.SetEdge(ilhs, irhs, *did_match_iter++ != 0);
+ }
+ }
+ return matrix;
+ }
+
+ ::std::vector<Matcher<const Element&>> matchers_;
+};
+
+// Functor for use in TransformTuple.
+// Performs MatcherCast<Target> on an input argument of any type.
+template <typename Target>
+struct CastAndAppendTransform {
+  // Converts `a`, whatever its type, into a Matcher<Target>.
+  template <typename Arg>
+  Matcher<Target> operator()(const Arg& a) const {
+    Matcher<Target> converted = MatcherCast<Target>(a);
+    return converted;
+  }
+};
+
+// Implements UnorderedElementsAre.
+template <typename MatcherTuple>
+class UnorderedElementsAreMatcher {
+ public:
+  // Stores a copy of the tuple of per-element matchers or values.
+  explicit UnorderedElementsAreMatcher(const MatcherTuple& args)
+      : matchers_(args) {}
+
+  // Converts each tuple entry to a Matcher<const Element&> and hands the
+  // resulting list to an exact-match UnorderedElementsAreMatcherImpl.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    using RawContainer = GTEST_REMOVE_REFERENCE_AND_CONST_(Container);
+    using Element =
+        typename internal::StlContainerView<RawContainer>::type::value_type;
+    using Impl = UnorderedElementsAreMatcherImpl<const Container&>;
+
+    ::std::vector<Matcher<const Element&>> element_matchers;
+    element_matchers.reserve(::std::tuple_size<MatcherTuple>::value);
+    TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
+                         ::std::back_inserter(element_matchers));
+    return Matcher<Container>(new Impl(UnorderedMatcherRequire::ExactMatch,
+                                       element_matchers.begin(),
+                                       element_matchers.end()));
+  }
+
+ private:
+  const MatcherTuple matchers_;
+};
+
+// Implements ElementsAre.
+template <typename MatcherTuple>
+class ElementsAreMatcher {
+ public:
+  // Stores a copy of the tuple of per-element matchers or values.
+  explicit ElementsAreMatcher(const MatcherTuple& args) : matchers_(args) {}
+
+  // Converts each tuple entry to a Matcher<const Element&> and hands the
+  // resulting list to an ElementsAreMatcherImpl.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    // Hash tables are rejected at compile time unless fewer than two
+    // matchers are given.
+    static_assert(
+        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value ||
+            ::std::tuple_size<MatcherTuple>::value < 2,
+        "use UnorderedElementsAre with hash tables");
+
+    using RawContainer = GTEST_REMOVE_REFERENCE_AND_CONST_(Container);
+    using Element =
+        typename internal::StlContainerView<RawContainer>::type::value_type;
+    using Impl = ElementsAreMatcherImpl<const Container&>;
+
+    ::std::vector<Matcher<const Element&>> element_matchers;
+    element_matchers.reserve(::std::tuple_size<MatcherTuple>::value);
+    TransformTupleValues(CastAndAppendTransform<const Element&>(), matchers_,
+                         ::std::back_inserter(element_matchers));
+    return Matcher<Container>(
+        new Impl(element_matchers.begin(), element_matchers.end()));
+  }
+
+ private:
+  const MatcherTuple matchers_;
+};
+
+// Implements UnorderedElementsAreArray(), IsSubsetOf(), and IsSupersetOf().
+template <typename T>
+class UnorderedElementsAreArrayMatcher {
+ public:
+  // Copies the range [first, last) and remembers the matching requirement.
+  template <typename Iter>
+  UnorderedElementsAreArrayMatcher(UnorderedMatcherRequire::Flags match_flags,
+                                   Iter first, Iter last)
+      : match_flags_(match_flags), matchers_(first, last) {}
+
+  // Builds an UnorderedElementsAreMatcherImpl over `const Container&` from
+  // the stored items and flags.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    using Impl = UnorderedElementsAreMatcherImpl<const Container&>;
+    return Matcher<Container>(
+        new Impl(match_flags_, matchers_.begin(), matchers_.end()));
+  }
+
+ private:
+  UnorderedMatcherRequire::Flags match_flags_;
+  ::std::vector<T> matchers_;
+};
+
+// Implements ElementsAreArray().
+template <typename T>
+class ElementsAreArrayMatcher {
+ public:
+  // Copies the range [first, last) of expected values or matchers.
+  template <typename Iter>
+  ElementsAreArrayMatcher(Iter first, Iter last) : matchers_(first, last) {}
+
+  // Builds an ElementsAreMatcherImpl over `const Container&` from the stored
+  // items. Hash-table containers are rejected at compile time.
+  template <typename Container>
+  operator Matcher<Container>() const {
+    static_assert(
+        !IsHashTable<GTEST_REMOVE_REFERENCE_AND_CONST_(Container)>::value,
+        "use UnorderedElementsAreArray with hash tables");
+
+    using Impl = ElementsAreMatcherImpl<const Container&>;
+    return Matcher<Container>(new Impl(matchers_.begin(), matchers_.end()));
+  }
+
+ private:
+  const ::std::vector<T> matchers_;
+};
+
+// Given a 2-tuple matcher tm of type Tuple2Matcher and a value second
+// of type Second, BoundSecondMatcher<Tuple2Matcher, Second>(tm,
+// second) is a polymorphic matcher that matches a value x if and only if
+// tm matches tuple (x, second). Useful for implementing
+// UnorderedPointwise() in terms of UnorderedElementsAreArray().
+//
+// BoundSecondMatcher is copyable and assignable, as we need to put
+// instances of this class in a vector when implementing
+// UnorderedPointwise().
+template <typename Tuple2Matcher, typename Second>
+class BoundSecondMatcher {
+ public:
+ // Stores copies of the 2-tuple matcher and of the value to bind as the
+ // tuple's second element.
+ BoundSecondMatcher(const Tuple2Matcher& tm, const Second& second)
+ : tuple2_matcher_(tm), second_value_(second) {}
+
+ BoundSecondMatcher(const BoundSecondMatcher& other) = default;
+
+ // Converts to a monomorphic matcher on T that pairs each candidate value
+ // with the bound second value.
+ template <typename T>
+ operator Matcher<T>() const {
+ return MakeMatcher(new Impl<T>(tuple2_matcher_, second_value_));
+ }
+
+ // We have to define this for UnorderedPointwise() to compile in
+ // C++98 mode, as it puts BoundSecondMatcher instances in a vector,
+ // which requires the elements to be assignable in C++98. The
+ // compiler cannot generate the operator= for us, as Tuple2Matcher
+ // and Second may not be assignable.
+ //
+ // However, this should never be called, so the implementation just
+ // needs to log a fatal error.
+ void operator=(const BoundSecondMatcher& /*rhs*/) {
+ GTEST_LOG_(FATAL) << "BoundSecondMatcher should never be assigned.";
+ }
+
+ private:
+ // Monomorphic implementation for a concrete first-argument type T.
+ template <typename T>
+ class Impl : public MatcherInterface<T> {
+ public:
+ typedef ::std::tuple<T, Second> ArgTuple;
+
+ // Converts the polymorphic 2-tuple matcher into one on `const ArgTuple&`.
+ Impl(const Tuple2Matcher& tm, const Second& second)
+ : mono_tuple2_matcher_(SafeMatcherCast<const ArgTuple&>(tm)),
+ second_value_(second) {}
+
+ // Writes "and <second value> <tuple matcher description>".
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "and ";
+ UniversalPrint(second_value_, os);
+ *os << " ";
+ mono_tuple2_matcher_.DescribeTo(os);
+ }
+
+ // Matches x by building the tuple (x, second_value_) and forwarding it,
+ // together with the listener, to the tuple matcher.
+ bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+ return mono_tuple2_matcher_.MatchAndExplain(ArgTuple(x, second_value_),
+ listener);
+ }
+
+ private:
+ const Matcher<const ArgTuple&> mono_tuple2_matcher_;
+ const Second second_value_;
+ };
+
+ const Tuple2Matcher tuple2_matcher_;
+ const Second second_value_;
+};
+
+// Given a 2-tuple matcher tm and a value second,
+// MatcherBindSecond(tm, second) returns a matcher that matches a
+// value x if and only if tm matches tuple (x, second). Useful for
+// implementing UnorderedPointwise() in terms of UnorderedElementsAreArray().
+template <typename Tuple2Matcher, typename Second>
+BoundSecondMatcher<Tuple2Matcher, Second> MatcherBindSecond(
+    const Tuple2Matcher& tm, const Second& second) {
+  // Factory wrapper so callers can rely on template argument deduction.
+  using Bound = BoundSecondMatcher<Tuple2Matcher, Second>;
+  return Bound(tm, second);
+}
+
+// Returns the description for a matcher defined using the MATCHER*()
+// macro where the user-supplied description string is "", if
+// 'negation' is false; otherwise returns the description of the
+// negation of the matcher. 'param_values' contains a list of strings
+// that are the print-out of the matcher's parameters.
+GTEST_API_ std::string FormatMatcherDescription(
+ bool negation, const char* matcher_name,
+ const std::vector<const char*>& param_names, const Strings& param_values);
+
+// Implements a matcher that checks the value of an optional<> type variable.
+template <typename ValueMatcher>
+class OptionalMatcher {
+ public:
+  // Stores a copy of the matcher to apply to the contained value.
+  explicit OptionalMatcher(const ValueMatcher& value_matcher)
+      : value_matcher_(value_matcher) {}
+
+  // Converts to a monomorphic matcher backed by Impl<const Optional&>.
+  template <typename Optional>
+  operator Matcher<Optional>() const {
+    using ImplType = Impl<const Optional&>;
+    return Matcher<Optional>(new ImplType(value_matcher_));
+  }
+
+  // Monomorphic implementation for one concrete optional-like type.
+  template <typename Optional>
+  class Impl : public MatcherInterface<Optional> {
+   public:
+    using OptionalView = GTEST_REMOVE_REFERENCE_AND_CONST_(Optional);
+    using ValueType = typename OptionalView::value_type;
+
+    explicit Impl(const ValueMatcher& value_matcher)
+        : value_matcher_(MatcherCast<ValueType>(value_matcher)) {}
+
+    void DescribeTo(::std::ostream* os) const override {
+      *os << "value ";
+      value_matcher_.DescribeTo(os);
+    }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      *os << "value ";
+      value_matcher_.DescribeNegationTo(os);
+    }
+
+    // Fails when `optional` tests false; otherwise dereferences it and
+    // applies the value matcher, reporting the value and the verdict.
+    bool MatchAndExplain(Optional optional,
+                         MatchResultListener* listener) const override {
+      if (!optional) {
+        *listener << "which is not engaged";
+        return false;
+      }
+
+      const ValueType& contained = *optional;
+      StringMatchResultListener inner_notes;
+      const bool matched =
+          value_matcher_.MatchAndExplain(contained, &inner_notes);
+      const char* const verdict = matched ? " matches" : " doesn't match";
+      *listener << "whose value " << PrintToString(contained) << verdict;
+      PrintIfNotEmpty(inner_notes.str(), listener->stream());
+      return matched;
+    }
+
+   private:
+    const Matcher<ValueType> value_matcher_;
+  };
+
+ private:
+  const ValueMatcher value_matcher_;
+};
+
+namespace variant_matcher {
+// Placeholder templates that let the unqualified holds_alternative<T>(v) and
+// get<T>(v) calls below parse as template calls and resolve through ADL.
+template <typename T>
+void holds_alternative() {}
+template <typename T>
+void get() {}
+
+// Implements a matcher that checks the value of a variant<> type variable.
+template <typename T>
+class VariantMatcher {
+ public:
+  explicit VariantMatcher(::testing::Matcher<const T&> matcher)
+      : matcher_(std::move(matcher)) {}
+
+  // Matches when `value` currently holds a T and that T satisfies matcher_.
+  template <typename Variant>
+  bool MatchAndExplain(const Variant& value,
+                       ::testing::MatchResultListener* listener) const {
+    using std::get;
+
+    // Fast path: no explanation requested.
+    if (!listener->IsInterested()) {
+      if (!holds_alternative<T>(value)) {
+        return false;
+      }
+      return matcher_.Matches(get<T>(value));
+    }
+
+    if (!holds_alternative<T>(value)) {
+      *listener << "whose value is not of type '" << GetTypeName() << "'";
+      return false;
+    }
+
+    const T& held = get<T>(value);
+    StringMatchResultListener inner_notes;
+    const bool matched = matcher_.MatchAndExplain(held, &inner_notes);
+    const char* const verdict = matched ? " matches" : " doesn't match";
+    *listener << "whose value " << PrintToString(held) << verdict;
+    PrintIfNotEmpty(inner_notes.str(), listener->stream());
+    return matched;
+  }
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "is a variant<> with value of type '" << GetTypeName()
+        << "' and the value ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is a variant<> with value of type other than '" << GetTypeName()
+        << "' or the value ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+ private:
+  // Name of T for messages; a fixed placeholder when RTTI is unavailable.
+  static std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(
+        return internal::GetTypeName<T>());
+#endif
+    return "the element type";
+  }
+
+  const ::testing::Matcher<const T&> matcher_;
+};
+
+}  // namespace variant_matcher
+
+namespace any_cast_matcher {
+
+// Placeholder template that lets the unqualified any_cast<T>(&v) call below
+// parse as a template call and resolve through ADL.
+template <typename T>
+void any_cast() {}
+
+// Implements a matcher that any_casts the value.
+template <typename T>
+class AnyCastMatcher {
+ public:
+  explicit AnyCastMatcher(const ::testing::Matcher<const T&>& matcher)
+      : matcher_(matcher) {}
+
+  // Matches when any_cast<T>(&value) yields a non-null pointer and the
+  // pointed-to T satisfies matcher_.
+  template <typename AnyType>
+  bool MatchAndExplain(const AnyType& value,
+                       ::testing::MatchResultListener* listener) const {
+    const T* const held = any_cast<T>(&value);
+
+    // Fast path: no explanation requested.
+    if (!listener->IsInterested()) {
+      if (held == nullptr) {
+        return false;
+      }
+      return matcher_.Matches(*held);
+    }
+
+    if (held == nullptr) {
+      *listener << "whose value is not of type '" << GetTypeName() << "'";
+      return false;
+    }
+
+    StringMatchResultListener inner_notes;
+    const bool matched = matcher_.MatchAndExplain(*held, &inner_notes);
+    const char* const verdict = matched ? " matches" : " doesn't match";
+    *listener << "whose value " << PrintToString(*held) << verdict;
+    PrintIfNotEmpty(inner_notes.str(), listener->stream());
+    return matched;
+  }
+
+  void DescribeTo(std::ostream* os) const {
+    *os << "is an 'any' type with value of type '" << GetTypeName()
+        << "' and the value ";
+    matcher_.DescribeTo(os);
+  }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is an 'any' type with value of type other than '" << GetTypeName()
+        << "' or the value ";
+    matcher_.DescribeNegationTo(os);
+  }
+
+ private:
+  // Name of T for messages; a fixed placeholder when RTTI is unavailable.
+  static std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(
+        return internal::GetTypeName<T>());
+#endif
+    return "the element type";
+  }
+
+  const ::testing::Matcher<const T&> matcher_;
+};
+
+}  // namespace any_cast_matcher
+
+// Implements the Args() matcher.
+// Selects the tuple fields at indices k... from an argument tuple and applies
+// a single inner matcher to the tuple of selected fields.
+template <class ArgsTuple, size_t... k>
+class ArgsMatcherImpl : public MatcherInterface<ArgsTuple> {
+ public:
+ using RawArgsTuple = typename std::decay<ArgsTuple>::type;
+ // Tuple made of the element types of RawArgsTuple at the selected indices.
+ using SelectedArgs =
+ std::tuple<typename std::tuple_element<k, RawArgsTuple>::type...>;
+ using MonomorphicInnerMatcher = Matcher<const SelectedArgs&>;
+
+ // Converts the inner matcher to one that takes `const SelectedArgs&`.
+ template <typename InnerMatcher>
+ explicit ArgsMatcherImpl(const InnerMatcher& inner_matcher)
+ : inner_matcher_(SafeMatcherCast<const SelectedArgs&>(inner_matcher)) {}
+
+ // Builds the tuple of selected fields and matches it with the inner
+ // matcher. When the listener is interested, prints the selected indices and
+ // values, followed by any explanation from the inner matcher.
+ bool MatchAndExplain(ArgsTuple args,
+ MatchResultListener* listener) const override {
+ // Workaround spurious C4100 on MSVC<=15.7 when k is empty.
+ (void)args;
+ const SelectedArgs& selected_args =
+ std::forward_as_tuple(std::get<k>(args)...);
+ if (!listener->IsInterested()) return inner_matcher_.Matches(selected_args);
+
+ PrintIndices(listener->stream());
+ *listener << "are " << PrintToString(selected_args);
+
+ StringMatchResultListener inner_listener;
+ const bool match =
+ inner_matcher_.MatchAndExplain(selected_args, &inner_listener);
+ PrintIfNotEmpty(inner_listener.str(), listener->stream());
+ return match;
+ }
+
+ // Writes "are a tuple whose fields (#i, #j, ...) " followed by the inner
+ // matcher's description.
+ void DescribeTo(::std::ostream* os) const override {
+ *os << "are a tuple ";
+ PrintIndices(os);
+ inner_matcher_.DescribeTo(os);
+ }
+
+ // Same prefix as DescribeTo, followed by the inner matcher's negated
+ // description.
+ void DescribeNegationTo(::std::ostream* os) const override {
+ *os << "are a tuple ";
+ PrintIndices(os);
+ inner_matcher_.DescribeNegationTo(os);
+ }
+
+ private:
+ // Prints the indices of the selected fields.
+ static void PrintIndices(::std::ostream* os) {
+ *os << "whose fields (";
+ const char* sep = "";
+ // Workaround spurious C4189 on MSVC<=15.7 when k is empty.
+ (void)sep;
+ // The array initializer exists only to expand the pack in order. The
+ // leading "" keeps the array non-empty when k is empty.
+ const char* dummy[] = {"", (*os << sep << "#" << k, sep = ", ")...};
+ (void)dummy;
+ *os << ") ";
+ }
+
+ MonomorphicInnerMatcher inner_matcher_;
+};
+
+// Polymorphic holder for an Args<k...>() matcher. It keeps the inner matcher
+// as given and builds an ArgsMatcherImpl for the requested tuple type when
+// converted to Matcher<ArgsTuple>.
+template <class InnerMatcher, size_t... k>
+class ArgsMatcher {
+ public:
+  explicit ArgsMatcher(InnerMatcher inner_matcher)
+      : inner_matcher_(std::move(inner_matcher)) {}
+
+  // Conversion to a monomorphic matcher on ArgsTuple.
+  template <typename ArgsTuple>
+  operator Matcher<ArgsTuple>() const {  // NOLINT
+    using Impl = ArgsMatcherImpl<ArgsTuple, k...>;
+    return MakeMatcher(new Impl(inner_matcher_));
+  }
+
+ private:
+  InnerMatcher inner_matcher_;
+};
+
+} // namespace internal
+
+// ElementsAreArray(iterator_first, iterator_last)
+// ElementsAreArray(pointer, count)
+// ElementsAreArray(array)
+// ElementsAreArray(container)
+// ElementsAreArray({ e1, e2, ..., en })
+//
+// The ElementsAreArray() functions are like ElementsAre(...), except
+// that they are given a homogeneous sequence rather than taking each
+// element as a function argument. The sequence can be specified as an
+// array, a pointer and count, a vector, an initializer list, or an
+// STL iterator range. In each of these cases, the underlying sequence
+// can be either a sequence of values or a sequence of matchers.
+//
+// All forms of ElementsAreArray() make a copy of the input matcher sequence.
+
+// Iterator-range form. The matcher's element type is the iterator's
+// value_type.
+template <typename Iter>
+inline internal::ElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+ElementsAreArray(Iter first, Iter last) {
+  using Element = typename ::std::iterator_traits<Iter>::value_type;
+  return internal::ElementsAreArrayMatcher<Element>(first, last);
+}
+
+// Pointer-and-count form; forwards to the iterator-range form.
+template <typename T>
+inline auto ElementsAreArray(const T* pointer, size_t count)
+    -> decltype(ElementsAreArray(pointer, pointer + count)) {
+  const T* const end = pointer + count;
+  return ElementsAreArray(pointer, end);
+}
+
+// Native-array form; forwards to the pointer-and-count form.
+template <typename T, size_t N>
+inline auto ElementsAreArray(const T (&array)[N])
+    -> decltype(ElementsAreArray(array, N)) {
+  return ElementsAreArray(array, N);
+}
+
+// Container form; forwards the container's [begin(), end()) range.
+template <typename Container>
+inline auto ElementsAreArray(const Container& container)
+    -> decltype(ElementsAreArray(container.begin(), container.end())) {
+  const auto first = container.begin();
+  const auto last = container.end();
+  return ElementsAreArray(first, last);
+}
+
+// Initializer-list form; forwards the list's [begin(), end()) range.
+template <typename T>
+inline auto ElementsAreArray(::std::initializer_list<T> xs)
+    -> decltype(ElementsAreArray(xs.begin(), xs.end())) {
+  const auto first = xs.begin();
+  const auto last = xs.end();
+  return ElementsAreArray(first, last);
+}
+
+// UnorderedElementsAreArray(iterator_first, iterator_last)
+// UnorderedElementsAreArray(pointer, count)
+// UnorderedElementsAreArray(array)
+// UnorderedElementsAreArray(container)
+// UnorderedElementsAreArray({ e1, e2, ..., en })
+//
+// UnorderedElementsAreArray() verifies that a bijective mapping onto a
+// collection of matchers exists.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+// Iterator-range form. Builds the matcher in ExactMatch mode over
+// [first, last).
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename ::std::iterator_traits<Iter>::value_type>
+UnorderedElementsAreArray(Iter first, Iter last) {
+  using Element = typename ::std::iterator_traits<Iter>::value_type;
+  return internal::UnorderedElementsAreArrayMatcher<Element>(
+      internal::UnorderedMatcherRequire::ExactMatch, first, last);
+}
+
+// Pointer-and-count form; forwards to the iterator-range form.
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+    const T* pointer, size_t count) {
+  const T* const end = pointer + count;
+  return UnorderedElementsAreArray(pointer, end);
+}
+
+// Native-array form; forwards to the pointer-and-count form.
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+    const T (&array)[N]) {
+  return UnorderedElementsAreArray(array, N);
+}
+
+// Container form; forwards the container's [begin(), end()) range.
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename Container::value_type>
+UnorderedElementsAreArray(const Container& container) {
+  const auto first = container.begin();
+  const auto last = container.end();
+  return UnorderedElementsAreArray(first, last);
+}
+
+// Initializer-list form; forwards the list's [begin(), end()) range.
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> UnorderedElementsAreArray(
+    ::std::initializer_list<T> xs) {
+  const auto first = xs.begin();
+  const auto last = xs.end();
+  return UnorderedElementsAreArray(first, last);
+}
+
+// _ is a matcher that matches anything of any type.
+//
+// This definition is fine as:
+//
+// 1. The C++ standard permits using the name _ in a namespace that
+// is not the global namespace or ::std.
+// 2. The AnythingMatcher class has no data member or constructor,
+// so it's OK to create global variables of this type.
+// 3. c-style has approved of using _ in this case.
+//
+// A<T>() and An<T>() below return this same object converted to Matcher<T>.
+const internal::AnythingMatcher _ = {};
+// Returns a Matcher<T> that accepts every value of type T; it is the
+// wildcard `_` converted to Matcher<T>.
+template <typename T>
+inline Matcher<T> A() {
+  Matcher<T> wildcard = _;
+  return wildcard;
+}
+
+// Same as A<T>(): returns a Matcher<T> that accepts every value of type T.
+// Provided so that call sites read naturally, e.g. An<int>().
+template <typename T>
+inline Matcher<T> An() {
+  Matcher<T> wildcard = _;
+  return wildcard;
+}
+
+// Overload selected when both tag arguments are std::false_type, i.e. (per
+// the parameter tags) M is convertible neither to a matcher nor to T. The
+// value is then wrapped in Eq().
+template <typename T, typename M>
+Matcher<T> internal::MatcherCastImpl<T, M>::CastImpl(
+    const M& value, std::false_type /* convertible_to_matcher */,
+    std::false_type /* convertible_to_T */) {
+  Matcher<T> equality = Eq(value);
+  return equality;
+}
+
+// Returns a polymorphic matcher that accepts any NULL pointer.
+inline PolymorphicMatcher<internal::IsNullMatcher> IsNull() {
+  using Impl = internal::IsNullMatcher;
+  return MakePolymorphicMatcher(Impl());
+}
+
+// Returns a polymorphic matcher that accepts any non-NULL pointer.
+// Use this instead of Not(NULL), which doesn't compile because the compiler
+// treats that expression as comparing a pointer with an integer.
+inline PolymorphicMatcher<internal::NotNullMatcher> NotNull() {
+  using Impl = internal::NotNullMatcher;
+  return MakePolymorphicMatcher(Impl());
+}
+
+// Returns a polymorphic matcher that accepts any argument that references
+// the variable x.
+template <typename T>
+inline internal::RefMatcher<T&> Ref(T& x) {  // NOLINT
+  using Impl = internal::RefMatcher<T&>;
+  return Impl(x);
+}
+
+// Returns a polymorphic matcher that accepts any NaN floating-point value.
+inline PolymorphicMatcher<internal::IsNanMatcher> IsNan() {
+  using Impl = internal::IsNanMatcher;
+  return MakePolymorphicMatcher(Impl());
+}
+
+// Matches any double approximately equal to rhs. Two NaNs are considered
+// unequal.
+inline internal::FloatingEqMatcher<double> DoubleEq(double rhs) {
+  const bool nan_eq_nan = false;
+  return internal::FloatingEqMatcher<double>(rhs, nan_eq_nan);
+}
+
+// Matches any double approximately equal to rhs. A NaN value matches when
+// rhs is NaN.
+inline internal::FloatingEqMatcher<double> NanSensitiveDoubleEq(double rhs) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEqMatcher<double>(rhs, nan_eq_nan);
+}
+
+// Matches any double within max_abs_error of rhs. Two NaNs are considered
+// unequal. max_abs_error must be non-negative.
+inline internal::FloatingEqMatcher<double> DoubleNear(double rhs,
+                                                      double max_abs_error) {
+  const bool nan_eq_nan = false;
+  return internal::FloatingEqMatcher<double>(rhs, nan_eq_nan, max_abs_error);
+}
+
+// Matches any double within max_abs_error of rhs. A NaN value matches when
+// rhs is NaN. max_abs_error must be non-negative.
+inline internal::FloatingEqMatcher<double> NanSensitiveDoubleNear(
+    double rhs, double max_abs_error) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEqMatcher<double>(rhs, nan_eq_nan, max_abs_error);
+}
+
+// Matches any float approximately equal to rhs. Two NaNs are considered
+// unequal.
+inline internal::FloatingEqMatcher<float> FloatEq(float rhs) {
+  const bool nan_eq_nan = false;
+  return internal::FloatingEqMatcher<float>(rhs, nan_eq_nan);
+}
+
+// Matches any float approximately equal to rhs. A NaN value matches when
+// rhs is NaN.
+inline internal::FloatingEqMatcher<float> NanSensitiveFloatEq(float rhs) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEqMatcher<float>(rhs, nan_eq_nan);
+}
+
+// Matches any float within max_abs_error of rhs. Two NaNs are considered
+// unequal. max_abs_error must be non-negative.
+inline internal::FloatingEqMatcher<float> FloatNear(float rhs,
+                                                    float max_abs_error) {
+  const bool nan_eq_nan = false;
+  return internal::FloatingEqMatcher<float>(rhs, nan_eq_nan, max_abs_error);
+}
+
+// Matches any float within max_abs_error of rhs. A NaN value matches when
+// rhs is NaN. max_abs_error must be non-negative.
+inline internal::FloatingEqMatcher<float> NanSensitiveFloatNear(
+    float rhs, float max_abs_error) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEqMatcher<float>(rhs, nan_eq_nan, max_abs_error);
+}
+
+// Returns a matcher for a pointer (raw or smart) whose pointee satisfies
+// inner_matcher.
+template <typename InnerMatcher>
+inline internal::PointeeMatcher<InnerMatcher> Pointee(
+    const InnerMatcher& inner_matcher) {
+  using Impl = internal::PointeeMatcher<InnerMatcher>;
+  return Impl(inner_matcher);
+}
+
+#if GTEST_HAS_RTTI
+// Returns a matcher for a pointer or reference that satisfies inner_matcher
+// after dynamic_cast<To> has been applied to it; the cast result is what the
+// inner matcher sees.
+//  - To is a pointer and the cast fails: the inner matcher receives NULL.
+//  - To is a reference and the cast fails: this matcher returns false
+//    immediately.
+template <typename To>
+inline PolymorphicMatcher<internal::WhenDynamicCastToMatcher<To>>
+WhenDynamicCastTo(const Matcher<To>& inner_matcher) {
+  using Impl = internal::WhenDynamicCastToMatcher<To>;
+  return MakePolymorphicMatcher(Impl(inner_matcher));
+}
+#endif  // GTEST_HAS_RTTI
+
+// Returns a matcher for an object whose given data member satisfies
+// 'matcher'. For example,
+//   Field(&Foo::number, Ge(5))
+// matches a Foo object x if and only if x.number >= 5.
+template <typename Class, typename FieldType, typename FieldMatcher>
+inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field(
+    FieldType Class::*field, const FieldMatcher& matcher) {
+  using Impl = internal::FieldMatcher<Class, FieldType>;
+  // MatcherCast() is needed so that inner matchers of compatible types are
+  // accepted. For example, it lets
+  //   Field(&Foo::bar, m)
+  // compile where bar is an int32 and m is a matcher for int64.
+  return MakePolymorphicMatcher(
+      Impl(field, MatcherCast<const FieldType&>(matcher)));
+}
+
+// Overload of Field() that additionally takes the field's name, which is used
+// to produce better error messages.
+template <typename Class, typename FieldType, typename FieldMatcher>
+inline PolymorphicMatcher<internal::FieldMatcher<Class, FieldType>> Field(
+    const std::string& field_name, FieldType Class::*field,
+    const FieldMatcher& matcher) {
+  using Impl = internal::FieldMatcher<Class, FieldType>;
+  return MakePolymorphicMatcher(
+      Impl(field_name, field, MatcherCast<const FieldType&>(matcher)));
+}
+
+// Returns a matcher for an object whose given property (a const,
+// zero-argument member function) yields a value satisfying 'matcher'.
+// For example,
+//   Property(&Foo::str, StartsWith("hi"))
+// matches a Foo object x if and only if x.str() starts with "hi".
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const>>
+Property(PropertyType (Class::*property)() const,
+         const PropertyMatcher& matcher) {
+  using Getter = PropertyType (Class::*)() const;
+  using Impl = internal::PropertyMatcher<Class, PropertyType, Getter>;
+  // MatcherCast() is needed so that inner matchers of compatible types are
+  // accepted. For example, it lets
+  //   Property(&Foo::bar, m)
+  // compile where bar() returns an int32 and m is a matcher for int64.
+  return MakePolymorphicMatcher(
+      Impl(property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Overload of Property() that additionally takes the property's name, which
+// is used to produce better error messages.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const>>
+Property(const std::string& property_name,
+         PropertyType (Class::*property)() const,
+         const PropertyMatcher& matcher) {
+  using Getter = PropertyType (Class::*)() const;
+  using Impl = internal::PropertyMatcher<Class, PropertyType, Getter>;
+  return MakePolymorphicMatcher(
+      Impl(property_name, property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Two-argument form for `const&`-qualified member functions.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const&>>
+Property(PropertyType (Class::*property)() const&,
+         const PropertyMatcher& matcher) {
+  using Getter = PropertyType (Class::*)() const&;
+  using Impl = internal::PropertyMatcher<Class, PropertyType, Getter>;
+  return MakePolymorphicMatcher(
+      Impl(property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Three-argument (named) form for `const&`-qualified member functions.
+template <typename Class, typename PropertyType, typename PropertyMatcher>
+inline PolymorphicMatcher<internal::PropertyMatcher<
+    Class, PropertyType, PropertyType (Class::*)() const&>>
+Property(const std::string& property_name,
+         PropertyType (Class::*property)() const&,
+         const PropertyMatcher& matcher) {
+  using Getter = PropertyType (Class::*)() const&;
+  using Impl = internal::PropertyMatcher<Class, PropertyType, Getter>;
+  return MakePolymorphicMatcher(
+      Impl(property_name, property, MatcherCast<const PropertyType&>(matcher)));
+}
+
+// Returns a matcher for an object x such that the result of applying a
+// callable to x satisfies 'matcher'. For example,
+//   ResultOf(f, StartsWith("hi"))
+// matches a Foo object x if and only if f(x) starts with "hi".
+// `callable` can be a function, function pointer, or a functor. It must not
+// keep state that affects the results of calls on it, and must not assume how
+// many calls will be made. Any state it does keep must be protected from
+// concurrent access.
+template <typename Callable, typename InnerMatcher>
+internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf(
+    Callable callable, InnerMatcher matcher) {
+  using Impl = internal::ResultOfMatcher<Callable, InnerMatcher>;
+  return Impl(std::move(callable), std::move(matcher));
+}
+
+// Overload of ResultOf() that additionally takes a description of the
+// `callable` result, which is used to produce better error messages.
+template <typename Callable, typename InnerMatcher>
+internal::ResultOfMatcher<Callable, InnerMatcher> ResultOf(
+    const std::string& result_description, Callable callable,
+    InnerMatcher matcher) {
+  using Impl = internal::ResultOfMatcher<Callable, InnerMatcher>;
+  return Impl(result_description, std::move(callable), std::move(matcher));
+}
+
+// String matchers.
+//
+// The four equality matchers below differ only in the two flags they pass to
+// StrEqualityMatcher: whether equality is expected, and whether the
+// comparison is case-sensitive.
+
+// Matches a string equal to str (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrEq(
+    const internal::StringLike<T>& str) {
+  using Impl = internal::StrEqualityMatcher<std::string>;
+  const bool expect_eq = true;
+  const bool case_sensitive = true;
+  return MakePolymorphicMatcher(
+      Impl(std::string(str), expect_eq, case_sensitive));
+}
+
+// Matches a string not equal to str (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrNe(
+    const internal::StringLike<T>& str) {
+  using Impl = internal::StrEqualityMatcher<std::string>;
+  const bool expect_eq = false;
+  const bool case_sensitive = true;
+  return MakePolymorphicMatcher(
+      Impl(std::string(str), expect_eq, case_sensitive));
+}
+
+// Matches a string equal to str, ignoring case.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseEq(
+    const internal::StringLike<T>& str) {
+  using Impl = internal::StrEqualityMatcher<std::string>;
+  const bool expect_eq = true;
+  const bool case_sensitive = false;
+  return MakePolymorphicMatcher(
+      Impl(std::string(str), expect_eq, case_sensitive));
+}
+
+// Matches a string not equal to str, ignoring case.
+template <typename T = std::string>
+PolymorphicMatcher<internal::StrEqualityMatcher<std::string>> StrCaseNe(
+    const internal::StringLike<T>& str) {
+  using Impl = internal::StrEqualityMatcher<std::string>;
+  const bool expect_eq = false;
+  const bool case_sensitive = false;
+  return MakePolymorphicMatcher(
+      Impl(std::string(str), expect_eq, case_sensitive));
+}
+
+// Matches any string, std::string, or C string that contains the given
+// substring.
+template <typename T = std::string>
+PolymorphicMatcher<internal::HasSubstrMatcher<std::string>> HasSubstr(
+    const internal::StringLike<T>& substring) {
+  using Impl = internal::HasSubstrMatcher<std::string>;
+  return MakePolymorphicMatcher(Impl(std::string(substring)));
+}
+
+// Matches a string that starts with 'prefix' (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::StartsWithMatcher<std::string>> StartsWith(
+    const internal::StringLike<T>& prefix) {
+  using Impl = internal::StartsWithMatcher<std::string>;
+  return MakePolymorphicMatcher(Impl(std::string(prefix)));
+}
+
+// Matches a string that ends with 'suffix' (case-sensitive).
+template <typename T = std::string>
+PolymorphicMatcher<internal::EndsWithMatcher<std::string>> EndsWith(
+    const internal::StringLike<T>& suffix) {
+  using Impl = internal::EndsWithMatcher<std::string>;
+  return MakePolymorphicMatcher(Impl(std::string(suffix)));
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Wide string matchers.
+//
+// These mirror the narrow-string factories of the same names, but take a
+// std::wstring and build matchers parameterized on std::wstring.
+
+// Matches a wide string equal to str (case-sensitive).
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrEq(
+    const std::wstring& str) {
+  using Impl = internal::StrEqualityMatcher<std::wstring>;
+  const bool expect_eq = true;
+  const bool case_sensitive = true;
+  return MakePolymorphicMatcher(Impl(str, expect_eq, case_sensitive));
+}
+
+// Matches a wide string not equal to str (case-sensitive).
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrNe(
+    const std::wstring& str) {
+  using Impl = internal::StrEqualityMatcher<std::wstring>;
+  const bool expect_eq = false;
+  const bool case_sensitive = true;
+  return MakePolymorphicMatcher(Impl(str, expect_eq, case_sensitive));
+}
+
+// Matches a wide string equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseEq(
+    const std::wstring& str) {
+  using Impl = internal::StrEqualityMatcher<std::wstring>;
+  const bool expect_eq = true;
+  const bool case_sensitive = false;
+  return MakePolymorphicMatcher(Impl(str, expect_eq, case_sensitive));
+}
+
+// Matches a wide string not equal to str, ignoring case.
+inline PolymorphicMatcher<internal::StrEqualityMatcher<std::wstring>> StrCaseNe(
+    const std::wstring& str) {
+  using Impl = internal::StrEqualityMatcher<std::wstring>;
+  const bool expect_eq = false;
+  const bool case_sensitive = false;
+  return MakePolymorphicMatcher(Impl(str, expect_eq, case_sensitive));
+}
+
+// Matches any std::wstring or C wide string that contains the given
+// substring.
+inline PolymorphicMatcher<internal::HasSubstrMatcher<std::wstring>> HasSubstr(
+    const std::wstring& substring) {
+  using Impl = internal::HasSubstrMatcher<std::wstring>;
+  return MakePolymorphicMatcher(Impl(substring));
+}
+
+// Matches a wide string that starts with 'prefix' (case-sensitive).
+inline PolymorphicMatcher<internal::StartsWithMatcher<std::wstring>> StartsWith(
+    const std::wstring& prefix) {
+  using Impl = internal::StartsWithMatcher<std::wstring>;
+  return MakePolymorphicMatcher(Impl(prefix));
+}
+
+// Matches a wide string that ends with 'suffix' (case-sensitive).
+inline PolymorphicMatcher<internal::EndsWithMatcher<std::wstring>> EndsWith(
+    const std::wstring& suffix) {
+  using Impl = internal::EndsWithMatcher<std::wstring>;
+  return MakePolymorphicMatcher(Impl(suffix));
+}
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Zero-argument comparison factories. Each returns a polymorphic matcher for
+// a 2-tuple that compares the tuple's first field with its second field.
+
+// first field == second field.
+inline internal::Eq2Matcher Eq() {
+  return internal::Eq2Matcher();
+}
+
+// first field >= second field.
+inline internal::Ge2Matcher Ge() {
+  return internal::Ge2Matcher();
+}
+
+// first field > second field.
+inline internal::Gt2Matcher Gt() {
+  return internal::Gt2Matcher();
+}
+
+// first field <= second field.
+inline internal::Le2Matcher Le() {
+  return internal::Le2Matcher();
+}
+
+// first field < second field.
+inline internal::Lt2Matcher Lt() {
+  return internal::Lt2Matcher();
+}
+
+// first field != second field.
+inline internal::Ne2Matcher Ne() {
+  return internal::Ne2Matcher();
+}
+
+// Floating-point factories for 2-tuples. Each returns a polymorphic matcher
+// that applies the corresponding single-value matcher, built from the tuple's
+// first field, to the tuple's second field.
+
+// FloatEq(first field) matches the second field.
+inline internal::FloatingEq2Matcher<float> FloatEq() {
+  using Impl = internal::FloatingEq2Matcher<float>;
+  return Impl();
+}
+
+// DoubleEq(first field) matches the second field.
+inline internal::FloatingEq2Matcher<double> DoubleEq() {
+  using Impl = internal::FloatingEq2Matcher<double>;
+  return Impl();
+}
+
+// FloatEq(first field) matches the second field, with NaN equality.
+inline internal::FloatingEq2Matcher<float> NanSensitiveFloatEq() {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEq2Matcher<float>(nan_eq_nan);
+}
+
+// DoubleEq(first field) matches the second field, with NaN equality.
+inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleEq() {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEq2Matcher<double>(nan_eq_nan);
+}
+
+// FloatNear(first field, max_abs_error) matches the second field.
+inline internal::FloatingEq2Matcher<float> FloatNear(float max_abs_error) {
+  using Impl = internal::FloatingEq2Matcher<float>;
+  return Impl(max_abs_error);
+}
+
+// DoubleNear(first field, max_abs_error) matches the second field.
+inline internal::FloatingEq2Matcher<double> DoubleNear(double max_abs_error) {
+  using Impl = internal::FloatingEq2Matcher<double>;
+  return Impl(max_abs_error);
+}
+
+// FloatNear(first field, max_abs_error) matches the second field, with NaN
+// equality.
+inline internal::FloatingEq2Matcher<float> NanSensitiveFloatNear(
+    float max_abs_error) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEq2Matcher<float>(max_abs_error, nan_eq_nan);
+}
+
+// DoubleNear(first field, max_abs_error) matches the second field, with NaN
+// equality.
+inline internal::FloatingEq2Matcher<double> NanSensitiveDoubleNear(
+    double max_abs_error) {
+  const bool nan_eq_nan = true;
+  return internal::FloatingEq2Matcher<double>(max_abs_error, nan_eq_nan);
+}
+
+// Returns a matcher that accepts exactly the values of type T that m
+// rejects.
+template <typename InnerMatcher>
+inline internal::NotMatcher<InnerMatcher> Not(InnerMatcher m) {
+  using Negation = internal::NotMatcher<InnerMatcher>;
+  return Negation(m);
+}
+
+// Returns a matcher that accepts anything for which the given predicate
+// holds. The predicate may be any unary function or functor whose return
+// type is implicitly convertible to bool.
+template <typename Predicate>
+inline PolymorphicMatcher<internal::TrulyMatcher<Predicate>> Truly(
+    Predicate pred) {
+  using Impl = internal::TrulyMatcher<Predicate>;
+  return MakePolymorphicMatcher(Impl(pred));
+}
+
+// Returns a matcher on the size of a container. The container must provide
+// both size() and size_type, as all STL-like containers do.
+// 'size_matcher' may be a plain value of type size_type or a matcher.
+// For instance:
+//   EXPECT_THAT(container, SizeIs(2));      // Checks container has 2 elements.
+//   EXPECT_THAT(container, SizeIs(Le(2)));  // Checks container has at most 2.
+template <typename SizeMatcher>
+inline internal::SizeIsMatcher<SizeMatcher> SizeIs(
+    const SizeMatcher& size_matcher) {
+  using Impl = internal::SizeIsMatcher<SizeMatcher>;
+  return Impl(size_matcher);
+}
+
+// Returns a matcher on the distance between a container's begin() and end()
+// iterators, i.e. the size of the container. Use it instead of SizeIs for
+// containers such as std::forward_list that do not implement size(). The
+// container must provide const_iterator (with valid iterator_traits),
+// begin() and end().
+template <typename DistanceMatcher>
+inline internal::BeginEndDistanceIsMatcher<DistanceMatcher> BeginEndDistanceIs(
+    const DistanceMatcher& distance_matcher) {
+  using Impl = internal::BeginEndDistanceIsMatcher<DistanceMatcher>;
+  return Impl(distance_matcher);
+}
+
+// Returns a matcher that accepts a container equal to rhs.
+// It behaves like Eq(), but on mismatch it lists the values that appear in
+// one container and not the other. (Duplicate values and order differences
+// are not explained.)
+template <typename Container>
+inline PolymorphicMatcher<
+    internal::ContainerEqMatcher<typename std::remove_const<Container>::type>>
+ContainerEq(const Container& rhs) {
+  using Impl = internal::ContainerEqMatcher<Container>;
+  return MakePolymorphicMatcher(Impl(rhs));
+}
+
+// Returns a matcher that accepts a container which, once sorted with the
+// given comparator, satisfies container_matcher.
+template <typename Comparator, typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<Comparator, ContainerMatcher> WhenSortedBy(
+    const Comparator& comparator, const ContainerMatcher& container_matcher) {
+  using Impl = internal::WhenSortedByMatcher<Comparator, ContainerMatcher>;
+  return Impl(comparator, container_matcher);
+}
+
+// Returns a matcher that accepts a container which, once sorted with the
+// < operator, satisfies container_matcher.
+template <typename ContainerMatcher>
+inline internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>
+WhenSorted(const ContainerMatcher& container_matcher) {
+  using Impl =
+      internal::WhenSortedByMatcher<internal::LessComparator, ContainerMatcher>;
+  return Impl(internal::LessComparator(), container_matcher);
+}
+
+// Returns a matcher for an STL-style container or native array that has the
+// same number of elements as rhs and whose i-th element, paired with rhs's
+// i-th element, satisfies the given pair matcher for every i.
+// TupleMatcher must be safely castable to
+// Matcher<std::tuple<const T1&, const T2&> >, where T1 and T2 are the element
+// types of the LHS container and the RHS container respectively.
+template <typename TupleMatcher, typename Container>
+inline internal::PointwiseMatcher<TupleMatcher,
+                                  typename std::remove_const<Container>::type>
+Pointwise(const TupleMatcher& tuple_matcher, const Container& rhs) {
+  using Impl = internal::PointwiseMatcher<TupleMatcher, Container>;
+  return Impl(tuple_matcher, rhs);
+}
+
+// Supports the Pointwise(m, {a, b, c}) syntax by copying the initializer
+// list into a std::vector and forwarding to the container overload.
+template <typename TupleMatcher, typename T>
+inline internal::PointwiseMatcher<TupleMatcher, std::vector<T>> Pointwise(
+    const TupleMatcher& tuple_matcher, std::initializer_list<T> rhs) {
+  const std::vector<T> rhs_elements(rhs);
+  return Pointwise(tuple_matcher, rhs_elements);
+}
+
+// UnorderedPointwise(pair_matcher, rhs) matches an STL-style container or a
+// native array that has the same number of elements as rhs, where for some
+// permutation of the container its i-th element and rhs's i-th element (as a
+// pair) satisfy the given pair matcher, for all i. Tuple2Matcher must be
+// safely castable to Matcher<std::tuple<const T1&, const T2&> >, where T1 and
+// T2 are the element types of the LHS container and the RHS container
+// respectively.
+//
+// In other words: Pointwise(pair_matcher, rhs) without regard to element
+// order.
+template <typename Tuple2Matcher, typename RhsContainer>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<
+        Tuple2Matcher,
+        typename internal::StlContainerView<
+            typename std::remove_const<RhsContainer>::type>::type::value_type>>
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   const RhsContainer& rhs_container) {
+  // Going through RhsView lets the same code serve both STL-style containers
+  // and native C-style arrays.
+  using RhsView = internal::StlContainerView<RhsContainer>;
+  using RhsStlContainer = typename RhsView::type;
+  using Second = typename RhsStlContainer::value_type;
+  using BoundMatcher = internal::BoundSecondMatcher<Tuple2Matcher, Second>;
+
+  const RhsStlContainer& rhs_stl_container =
+      RhsView::ConstReference(rhs_container);
+
+  // One matcher per rhs element, with that element bound as the pair's second
+  // field.
+  ::std::vector<BoundMatcher> matchers;
+  for (const auto& rhs_element : rhs_stl_container) {
+    matchers.push_back(
+        internal::MatcherBindSecond(tuple2_matcher, rhs_element));
+  }
+
+  // UnorderedElementsAreArray() does the actual unordered matching.
+  return UnorderedElementsAreArray(matchers);
+}
+
+// Supports the UnorderedPointwise(m, {a, b, c}) syntax by copying the
+// initializer list into a std::vector and forwarding to the container
+// overload.
+template <typename Tuple2Matcher, typename T>
+inline internal::UnorderedElementsAreArrayMatcher<
+    typename internal::BoundSecondMatcher<Tuple2Matcher, T>>
+UnorderedPointwise(const Tuple2Matcher& tuple2_matcher,
+                   std::initializer_list<T> rhs) {
+  const std::vector<T> rhs_elements(rhs);
+  return UnorderedPointwise(tuple2_matcher, rhs_elements);
+}
+
+// Matches an STL-style container or a native array that contains at
+// least one element matching the given value or matcher.
+//
+// Examples:
+// ::std::set<int> page_ids;
+// page_ids.insert(3);
+// page_ids.insert(1);
+// EXPECT_THAT(page_ids, Contains(1));
+// EXPECT_THAT(page_ids, Contains(Gt(2)));
+// EXPECT_THAT(page_ids, Not(Contains(4))); // See below for Times(0)
+//
+// ::std::map<int, size_t> page_lengths;
+// page_lengths[1] = 100;
+// EXPECT_THAT(page_lengths,
+// Contains(::std::pair<const int, size_t>(1, 100)));
+//
+// const char* user_ids[] = { "joe", "mike", "tom" };
+// EXPECT_THAT(user_ids, Contains(Eq(::std::string("tom"))));
+//
+// The matcher supports a modifier `Times` that allows to check for arbitrary
+// occurrences including testing for absence with Times(0).
+//
+// Examples:
+// ::std::vector<int> ids;
+// ids.push_back(1);
+// ids.push_back(1);
+// ids.push_back(3);
+// EXPECT_THAT(ids, Contains(1).Times(2)); // 1 occurs 2 times
+// EXPECT_THAT(ids, Contains(2).Times(0)); // 2 is not present
+// EXPECT_THAT(ids, Contains(3).Times(Ge(1))); // 3 occurs at least once
+
+template <typename M>
+inline internal::ContainsMatcher<M> Contains(M matcher) {
+  // Wrap the value or matcher; ContainsMatcher<M> is the returned type.
+  using Impl = internal::ContainsMatcher<M>;
+  return Impl(matcher);
+}
+
+// IsSupersetOf(iterator_first, iterator_last)
+// IsSupersetOf(pointer, count)
+// IsSupersetOf(array)
+// IsSupersetOf(container)
+// IsSupersetOf({e1, e2, ..., en})
+//
+// IsSupersetOf() verifies that a surjective partial mapping onto a collection
+// of matchers exists. In other words, a container matches
+// IsSupersetOf({e1, ..., en}) if and only if there is a permutation
+// {y1, ..., yn} of some of the container's elements where y1 matches e1,
+// ..., and yn matches en. Obviously, the size of the container must be >= n
+// in order to have a match. Examples:
+//
+// - {1, 2, 3} matches IsSupersetOf({Ge(3), Ne(0)}), as 3 matches Ge(3) and
+// 1 matches Ne(0).
+// - {1, 2} doesn't match IsSupersetOf({Eq(1), Lt(2)}), even though 1 matches
+// both Eq(1) and Lt(2). The reason is that different matchers must be used
+// for elements in different slots of the container.
+// - {1, 1, 2} matches IsSupersetOf({Eq(1), Lt(2)}), as (the first) 1 matches
+// Eq(1) and (the second) 1 matches Lt(2).
+// - {1, 2, 3} matches IsSupersetOf({Gt(1), Gt(1)}), as 2 matches (the
+// first) Gt(1) and 3 matches (the second) Gt(1).
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>
+IsSupersetOf(Iter first, Iter last) {
+ typedef typename ::std::iterator_traits<Iter>::value_type T;
+ return internal::UnorderedElementsAreArrayMatcher<T>(
+ internal::UnorderedMatcherRequire::Superset, first, last);
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+ const T* pointer, size_t count) {
+ return IsSupersetOf(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+ const T (&array)[N]) {
+ return IsSupersetOf(array, N);
+}
+
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+ typename Container::value_type>
+IsSupersetOf(const Container& container) {
+ return IsSupersetOf(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSupersetOf(
+ ::std::initializer_list<T> xs) {
+ return IsSupersetOf(xs.begin(), xs.end());
+}
+
+// IsSubsetOf(iterator_first, iterator_last)
+// IsSubsetOf(pointer, count)
+// IsSubsetOf(array)
+// IsSubsetOf(container)
+// IsSubsetOf({e1, e2, ..., en})
+//
+// IsSubsetOf() verifies that an injective mapping onto a collection of matchers
+// exists. In other words, a container matches IsSubsetOf({e1, ..., en}) if and
+// only if there is a subset of matchers {m1, ..., mk} which would match the
+// container using UnorderedElementsAre. Obviously, the size of the container
+// must be <= n in order to have a match. Examples:
+//
+// - {1} matches IsSubsetOf({Gt(0), Lt(0)}), as 1 matches Gt(0).
+// - {1, -1} matches IsSubsetOf({Lt(0), Gt(0)}), as 1 matches Gt(0) and -1
+// matches Lt(0).
+// - {1, 2} doesn't match IsSubsetOf({Gt(0), Lt(0)}), even though 1 and 2 both
+// match Gt(0). The reason is that different matchers must be used for
+// elements in different slots of the container.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::UnorderedElementsAreArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>
+IsSubsetOf(Iter first, Iter last) {
+ typedef typename ::std::iterator_traits<Iter>::value_type T;
+ return internal::UnorderedElementsAreArrayMatcher<T>(
+ internal::UnorderedMatcherRequire::Subset, first, last);
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+ const T* pointer, size_t count) {
+ return IsSubsetOf(pointer, pointer + count);
+}
+
+template <typename T, size_t N>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+ const T (&array)[N]) {
+ return IsSubsetOf(array, N);
+}
+
+template <typename Container>
+inline internal::UnorderedElementsAreArrayMatcher<
+ typename Container::value_type>
+IsSubsetOf(const Container& container) {
+ return IsSubsetOf(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::UnorderedElementsAreArrayMatcher<T> IsSubsetOf(
+ ::std::initializer_list<T> xs) {
+ return IsSubsetOf(xs.begin(), xs.end());
+}
+
+// Matches an STL-style container or a native array that contains only
+// elements matching the given value or matcher.
+//
+// Each(m) is semantically equivalent to `Not(Contains(Not(m)))`. Only
+// the messages are different.
+//
+// Examples:
+// ::std::set<int> page_ids;
+// // Each(m) matches an empty container, regardless of what m is.
+// EXPECT_THAT(page_ids, Each(Eq(1)));
+// EXPECT_THAT(page_ids, Each(Eq(77)));
+//
+// page_ids.insert(3);
+// EXPECT_THAT(page_ids, Each(Gt(0)));
+// EXPECT_THAT(page_ids, Not(Each(Gt(4))));
+// page_ids.insert(1);
+// EXPECT_THAT(page_ids, Not(Each(Lt(2))));
+//
+// ::std::map<int, size_t> page_lengths;
+// page_lengths[1] = 100;
+// page_lengths[2] = 200;
+// page_lengths[3] = 300;
+// EXPECT_THAT(page_lengths, Not(Each(Pair(1, 100))));
+// EXPECT_THAT(page_lengths, Each(Key(Le(3))));
+//
+// const char* user_ids[] = { "joe", "mike", "tom" };
+// EXPECT_THAT(user_ids, Not(Each(Eq(::std::string("tom")))));
+template <typename M>
+inline internal::EachMatcher<M> Each(M matcher) {
+ return internal::EachMatcher<M>(matcher);
+}
+
+// Key(inner_matcher) matches an std::pair whose 'first' field matches
+// inner_matcher. For example, Contains(Key(Ge(5))) can be used to match an
+// std::map that contains at least one element whose key is >= 5.
+template <typename M>
+inline internal::KeyMatcher<M> Key(M inner_matcher) {
+ return internal::KeyMatcher<M>(inner_matcher);
+}
+
+// Pair(first_matcher, second_matcher) matches a std::pair whose 'first' field
+// matches first_matcher and whose 'second' field matches second_matcher. For
+// example, EXPECT_THAT(map_type, ElementsAre(Pair(Ge(5), "foo"))) can be used
+// to match a std::map<int, string> that contains exactly one element whose key
+// is >= 5 and whose value equals "foo".
+template <typename FirstMatcher, typename SecondMatcher>
+inline internal::PairMatcher<FirstMatcher, SecondMatcher> Pair(
+ FirstMatcher first_matcher, SecondMatcher second_matcher) {
+ return internal::PairMatcher<FirstMatcher, SecondMatcher>(first_matcher,
+ second_matcher);
+}
+
+namespace no_adl {
+// Conditional() creates a matcher that conditionally uses either the first or
+// second matcher provided. For example, we could create an "equal if, and only
+// if" matcher using the Conditional wrapper as follows:
+//
+// EXPECT_THAT(result, Conditional(condition, Eq(expected), Ne(expected)));
+template <typename MatcherTrue, typename MatcherFalse>
+internal::ConditionalMatcher<MatcherTrue, MatcherFalse> Conditional(
+ bool condition, MatcherTrue matcher_true, MatcherFalse matcher_false) {
+ return internal::ConditionalMatcher<MatcherTrue, MatcherFalse>(
+ condition, std::move(matcher_true), std::move(matcher_false));
+}
+
+// FieldsAre(matchers...) matches piecewise the fields of compatible structs.
+// These include those that support `get<I>(obj)` and, when structured bindings
+// are enabled, any class that supports them.
+// In particular: `std::tuple`, `std::pair`, `std::array` and aggregate types.
+template <typename... M>
+internal::FieldsAreMatcher<typename std::decay<M>::type...> FieldsAre(
+ M&&... matchers) {
+ return internal::FieldsAreMatcher<typename std::decay<M>::type...>(
+ std::forward<M>(matchers)...);
+}
+
+// Creates a matcher that matches a pointer (raw or smart) that matches
+// inner_matcher.
+template <typename InnerMatcher>
+inline internal::PointerMatcher<InnerMatcher> Pointer(
+ const InnerMatcher& inner_matcher) {
+ return internal::PointerMatcher<InnerMatcher>(inner_matcher);
+}
+
+// Creates a matcher that matches an object that has an address that matches
+// inner_matcher.
+template <typename InnerMatcher>
+inline internal::AddressMatcher<InnerMatcher> Address(
+ const InnerMatcher& inner_matcher) {
+ return internal::AddressMatcher<InnerMatcher>(inner_matcher);
+}
+
+// Matches a base64 escaped string, when the unescaped string matches the
+// internal matcher.
+template <typename MatcherType>
+internal::WhenBase64UnescapedMatcher WhenBase64Unescaped(
+ const MatcherType& internal_matcher) {
+ return internal::WhenBase64UnescapedMatcher(internal_matcher);
+}
+} // namespace no_adl
+
+// Returns a predicate that is satisfied by anything that matches the
+// given matcher.
+template <typename M>
+inline internal::MatcherAsPredicate<M> Matches(M matcher) {
+ return internal::MatcherAsPredicate<M>(matcher);
+}
+
+// Returns true if and only if the value matches the matcher.
+template <typename T, typename M>
+inline bool Value(const T& value, M matcher) {
+ return testing::Matches(matcher)(value);
+}
+
+// Matches the value against the given matcher and explains the match
+// result to listener.
+template <typename T, typename M>
+inline bool ExplainMatchResult(M matcher, const T& value,
+ MatchResultListener* listener) {
+ return SafeMatcherCast<const T&>(matcher).MatchAndExplain(value, listener);
+}
+
+// Returns a string representation of the given matcher. Useful for description
+// strings of matchers defined using MATCHER_P* macros that accept matchers as
+// their arguments. For example:
+//
+// MATCHER_P(XAndYThat, matcher,
+// "X that " + DescribeMatcher<int>(matcher, negation) +
+// (negation ? " or" : " and") + " Y that " +
+// DescribeMatcher<double>(matcher, negation)) {
+// return ExplainMatchResult(matcher, arg.x(), result_listener) &&
+// ExplainMatchResult(matcher, arg.y(), result_listener);
+// }
+template <typename T, typename M>
+std::string DescribeMatcher(const M& matcher, bool negation = false) {
+ ::std::stringstream ss;
+ Matcher<T> monomorphic_matcher = SafeMatcherCast<T>(matcher);
+ if (negation) {
+ monomorphic_matcher.DescribeNegationTo(&ss);
+ } else {
+ monomorphic_matcher.DescribeTo(&ss);
+ }
+ return ss.str();
+}
+
+template <typename... Args>
+internal::ElementsAreMatcher<
+ std::tuple<typename std::decay<const Args&>::type...>>
+ElementsAre(const Args&... matchers) {
+ return internal::ElementsAreMatcher<
+ std::tuple<typename std::decay<const Args&>::type...>>(
+ std::make_tuple(matchers...));
+}
+
+template <typename... Args>
+internal::UnorderedElementsAreMatcher<
+ std::tuple<typename std::decay<const Args&>::type...>>
+UnorderedElementsAre(const Args&... matchers) {
+ return internal::UnorderedElementsAreMatcher<
+ std::tuple<typename std::decay<const Args&>::type...>>(
+ std::make_tuple(matchers...));
+}
+
+// Define variadic matcher versions.
+template <typename... Args>
+internal::AllOfMatcher<typename std::decay<const Args&>::type...> AllOf(
+ const Args&... matchers) {
+ return internal::AllOfMatcher<typename std::decay<const Args&>::type...>(
+ matchers...);
+}
+
+template <typename... Args>
+internal::AnyOfMatcher<typename std::decay<const Args&>::type...> AnyOf(
+ const Args&... matchers) {
+ return internal::AnyOfMatcher<typename std::decay<const Args&>::type...>(
+ matchers...);
+}
+
+// AnyOfArray(array)
+// AnyOfArray(pointer, count)
+// AnyOfArray(container)
+// AnyOfArray({ e1, e2, ..., en })
+// AnyOfArray(iterator_first, iterator_last)
+//
+// AnyOfArray() verifies whether a given value matches any member of a
+// collection of matchers.
+//
+// AllOfArray(array)
+// AllOfArray(pointer, count)
+// AllOfArray(container)
+// AllOfArray({ e1, e2, ..., en })
+// AllOfArray(iterator_first, iterator_last)
+//
+// AllOfArray() verifies whether a given value matches all members of a
+// collection of matchers.
+//
+// The matchers can be specified as an array, a pointer and count, a container,
+// an initializer list, or an STL iterator range. In each of these cases, the
+// underlying matchers can be either values or matchers.
+
+template <typename Iter>
+inline internal::AnyOfArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>
+AnyOfArray(Iter first, Iter last) {
+ return internal::AnyOfArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>(first, last);
+}
+
+template <typename Iter>
+inline internal::AllOfArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>
+AllOfArray(Iter first, Iter last) {
+ return internal::AllOfArrayMatcher<
+ typename ::std::iterator_traits<Iter>::value_type>(first, last);
+}
+
+template <typename T>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T* ptr, size_t count) {
+ return AnyOfArray(ptr, ptr + count);
+}
+
+template <typename T>
+inline internal::AllOfArrayMatcher<T> AllOfArray(const T* ptr, size_t count) {
+ return AllOfArray(ptr, ptr + count);
+}
+
+template <typename T, size_t N>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(const T (&array)[N]) {
+ return AnyOfArray(array, N);
+}
+
+template <typename T, size_t N>
+inline internal::AllOfArrayMatcher<T> AllOfArray(const T (&array)[N]) {
+ return AllOfArray(array, N);
+}
+
+template <typename Container>
+inline internal::AnyOfArrayMatcher<typename Container::value_type> AnyOfArray(
+ const Container& container) {
+ return AnyOfArray(container.begin(), container.end());
+}
+
+template <typename Container>
+inline internal::AllOfArrayMatcher<typename Container::value_type> AllOfArray(
+ const Container& container) {
+ return AllOfArray(container.begin(), container.end());
+}
+
+template <typename T>
+inline internal::AnyOfArrayMatcher<T> AnyOfArray(
+ ::std::initializer_list<T> xs) {
+ return AnyOfArray(xs.begin(), xs.end());
+}
+
+template <typename T>
+inline internal::AllOfArrayMatcher<T> AllOfArray(
+ ::std::initializer_list<T> xs) {
+ return AllOfArray(xs.begin(), xs.end());
+}
+
+// Args<N1, N2, ..., Nk>(a_matcher) matches a tuple if the selected
+// fields of it match a_matcher. The field indices are passed as a
+// variadic template parameter pack, so one template handles any k.
+template <size_t... k, typename InnerMatcher>
+internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...> Args(
+ InnerMatcher&& matcher) {
+ return internal::ArgsMatcher<typename std::decay<InnerMatcher>::type, k...>(
+ std::forward<InnerMatcher>(matcher));
+}
+
+// AllArgs(m) is a synonym of m. This is useful in
+//
+// EXPECT_CALL(foo, Bar(_, _)).With(AllArgs(Eq()));
+//
+// which is easier to read than
+//
+// EXPECT_CALL(foo, Bar(_, _)).With(Eq());
+template <typename InnerMatcher>
+inline InnerMatcher AllArgs(const InnerMatcher& matcher) {
+ return matcher;
+}
+
+// Returns a matcher that matches the value of an optional<> type variable.
+// The matcher implementation only uses '!arg' and requires that the optional<>
+// type has a 'value_type' member type and that '*arg' is of type 'value_type'
+// and is printable using 'PrintToString'. It is compatible with
+// std::optional/std::experimental::optional.
+// Note that to compare an optional type variable against nullopt you should
+// use Eq(nullopt) and not Eq(Optional(nullopt)). The latter implies that the
+// optional value contains an optional itself.
+template <typename ValueMatcher>
+inline internal::OptionalMatcher<ValueMatcher> Optional(
+ const ValueMatcher& value_matcher) {
+ return internal::OptionalMatcher<ValueMatcher>(value_matcher);
+}
+
+// Returns a matcher that matches the value of an absl::any type variable.
+template <typename T>
+PolymorphicMatcher<internal::any_cast_matcher::AnyCastMatcher<T>> AnyWith(
+ const Matcher<const T&>& matcher) {
+ return MakePolymorphicMatcher(
+ internal::any_cast_matcher::AnyCastMatcher<T>(matcher));
+}
+
+// Returns a matcher that matches the value of a variant<> type variable.
+// The matcher implementation uses ADL to find the holds_alternative and get
+// functions.
+// It is compatible with std::variant.
+template <typename T>
+PolymorphicMatcher<internal::variant_matcher::VariantMatcher<T>> VariantWith(
+ const Matcher<const T&>& matcher) {
+ return MakePolymorphicMatcher(
+ internal::variant_matcher::VariantMatcher<T>(matcher));
+}
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Anything inside the `internal` namespace is internal to the implementation
+// and must not be used in user code!
+namespace internal {
+
+class WithWhatMatcherImpl {
+ public:
+ WithWhatMatcherImpl(Matcher<std::string> matcher)
+ : matcher_(std::move(matcher)) {}
+
+ void DescribeTo(std::ostream* os) const {
+ *os << "contains .what() that ";
+ matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(std::ostream* os) const {
+ *os << "contains .what() that does not ";
+ matcher_.DescribeTo(os);
+ }
+
+ template <typename Err>
+ bool MatchAndExplain(const Err& err, MatchResultListener* listener) const {
+ *listener << "which contains .what() (of value = " << err.what()
+ << ") that ";
+ return matcher_.MatchAndExplain(err.what(), listener);
+ }
+
+ private:
+ const Matcher<std::string> matcher_;
+};
+
+inline PolymorphicMatcher<WithWhatMatcherImpl> WithWhat(
+ Matcher<std::string> m) {
+ return MakePolymorphicMatcher(WithWhatMatcherImpl(std::move(m)));
+}
+
+template <typename Err>
+class ExceptionMatcherImpl {
+ class NeverThrown {
+ public:
+ const char* what() const noexcept {
+ return "this exception should never be thrown";
+ }
+ };
+
+ // If the matchee raises an exception of a wrong type, we'd like to
+ // catch it and print its message and type. To do that, we add an additional
+ // catch clause:
+ //
+ // try { ... }
+ // catch (const Err&) { /* an expected exception */ }
+ // catch (const std::exception&) { /* exception of a wrong type */ }
+ //
+ // However, if the `Err` itself is `std::exception`, we'd end up with two
+ // identical `catch` clauses:
+ //
+ // try { ... }
+ // catch (const std::exception&) { /* an expected exception */ }
+ // catch (const std::exception&) { /* exception of a wrong type */ }
+ //
+ // This can cause a warning or an error in some compilers. To resolve
+ // the issue, we use a fake error type whenever `Err` is `std::exception`:
+ //
+ // try { ... }
+ // catch (const std::exception&) { /* an expected exception */ }
+ // catch (const NeverThrown&) { /* exception of a wrong type */ }
+ using DefaultExceptionType = typename std::conditional<
+ std::is_same<typename std::remove_cv<
+ typename std::remove_reference<Err>::type>::type,
+ std::exception>::value,
+ const NeverThrown&, const std::exception&>::type;
+
+ public:
+ ExceptionMatcherImpl(Matcher<const Err&> matcher)
+ : matcher_(std::move(matcher)) {}
+
+ void DescribeTo(std::ostream* os) const {
+ *os << "throws an exception which is a " << GetTypeName<Err>();
+ *os << " which ";
+ matcher_.DescribeTo(os);
+ }
+
+ void DescribeNegationTo(std::ostream* os) const {
+ *os << "throws an exception which is not a " << GetTypeName<Err>();
+ *os << " which ";
+ matcher_.DescribeNegationTo(os);
+ }
+
+ template <typename T>
+ bool MatchAndExplain(T&& x, MatchResultListener* listener) const {
+ try {
+ (void)(std::forward<T>(x)());
+ } catch (const Err& err) {
+ *listener << "throws an exception which is a " << GetTypeName<Err>();
+ *listener << " ";
+ return matcher_.MatchAndExplain(err, listener);
+ } catch (DefaultExceptionType err) {
+#if GTEST_HAS_RTTI
+ *listener << "throws an exception of type " << GetTypeName(typeid(err));
+ *listener << " ";
+#else
+ *listener << "throws an std::exception-derived type ";
+#endif
+ *listener << "with description \"" << err.what() << "\"";
+ return false;
+ } catch (...) {
+ *listener << "throws an exception of an unknown type";
+ return false;
+ }
+
+ *listener << "does not throw any exception";
+ return false;
+ }
+
+ private:
+ const Matcher<const Err&> matcher_;
+};
+
+} // namespace internal
+
+// Throws()
+// Throws(exceptionMatcher)
+// ThrowsMessage(messageMatcher)
+//
+// This matcher accepts a callable and verifies that when invoked, it throws
+// an exception with the given type and properties.
+//
+// Examples:
+//
+// EXPECT_THAT(
+// []() { throw std::runtime_error("message"); },
+// Throws<std::runtime_error>());
+//
+// EXPECT_THAT(
+// []() { throw std::runtime_error("message"); },
+// ThrowsMessage<std::runtime_error>(HasSubstr("message")));
+//
+// EXPECT_THAT(
+// []() { throw std::runtime_error("message"); },
+// Throws<std::runtime_error>(
+// Property(&std::runtime_error::what, HasSubstr("message"))));
+
+template <typename Err>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws() {
+ return MakePolymorphicMatcher(
+ internal::ExceptionMatcherImpl<Err>(A<const Err&>()));
+}
+
+template <typename Err, typename ExceptionMatcher>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> Throws(
+ const ExceptionMatcher& exception_matcher) {
+ // Using a matcher cast allows users to pass a matcher of a broader type.
+ // For example, a user may want to pass Matcher<std::exception>
+ // to Throws<std::runtime_error>, or Matcher<int64> to Throws<int32>.
+ return MakePolymorphicMatcher(internal::ExceptionMatcherImpl<Err>(
+ SafeMatcherCast<const Err&>(exception_matcher)));
+}
+
+template <typename Err, typename MessageMatcher>
+PolymorphicMatcher<internal::ExceptionMatcherImpl<Err>> ThrowsMessage(
+ MessageMatcher&& message_matcher) {
+ static_assert(std::is_base_of<std::exception, Err>::value,
+ "expected an std::exception-derived type");
+ return Throws<Err>(internal::WithWhat(
+ MatcherCast<std::string>(std::forward<MessageMatcher>(message_matcher))));
+}
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+// These macros allow using matchers to check values in Google Test
+// tests. ASSERT_THAT(value, matcher) and EXPECT_THAT(value, matcher)
+// succeed if and only if the value matches the matcher. If the assertion
+// fails, the value and the description of the matcher will be printed.
+#define ASSERT_THAT(value, matcher) \
+ ASSERT_PRED_FORMAT1( \
+ ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+#define EXPECT_THAT(value, matcher) \
+ EXPECT_PRED_FORMAT1( \
+ ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
+
+// The MATCHER* macros themselves are listed below.
+#define MATCHER(name, description) \
+ class name##Matcher \
+ : public ::testing::internal::MatcherBaseImpl<name##Matcher> { \
+ public: \
+ template <typename arg_type> \
+ class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> { \
+ public: \
+ gmock_Impl() {} \
+ bool MatchAndExplain( \
+ const arg_type& arg, \
+ ::testing::MatchResultListener* result_listener) const override; \
+ void DescribeTo(::std::ostream* gmock_os) const override { \
+ *gmock_os << FormatDescription(false); \
+ } \
+ void DescribeNegationTo(::std::ostream* gmock_os) const override { \
+ *gmock_os << FormatDescription(true); \
+ } \
+ \
+ private: \
+ ::std::string FormatDescription(bool negation) const { \
+ /* NOLINTNEXTLINE readability-redundant-string-init */ \
+ ::std::string gmock_description = (description); \
+ if (!gmock_description.empty()) { \
+ return gmock_description; \
+ } \
+ return ::testing::internal::FormatMatcherDescription(negation, #name, \
+ {}, {}); \
+ } \
+ }; \
+ }; \
+ GTEST_ATTRIBUTE_UNUSED_ inline name##Matcher name() { return {}; } \
+ template <typename arg_type> \
+ bool name##Matcher::gmock_Impl<arg_type>::MatchAndExplain( \
+ const arg_type& arg, \
+ ::testing::MatchResultListener* result_listener GTEST_ATTRIBUTE_UNUSED_) \
+ const
+
+#define MATCHER_P(name, p0, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP, description, (#p0), (p0))
+#define MATCHER_P2(name, p0, p1, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP2, description, (#p0, #p1), \
+ (p0, p1))
+#define MATCHER_P3(name, p0, p1, p2, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP3, description, (#p0, #p1, #p2), \
+ (p0, p1, p2))
+#define MATCHER_P4(name, p0, p1, p2, p3, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP4, description, \
+ (#p0, #p1, #p2, #p3), (p0, p1, p2, p3))
+#define MATCHER_P5(name, p0, p1, p2, p3, p4, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP5, description, \
+ (#p0, #p1, #p2, #p3, #p4), (p0, p1, p2, p3, p4))
+#define MATCHER_P6(name, p0, p1, p2, p3, p4, p5, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP6, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5), \
+ (p0, p1, p2, p3, p4, p5))
+#define MATCHER_P7(name, p0, p1, p2, p3, p4, p5, p6, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP7, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6), \
+ (p0, p1, p2, p3, p4, p5, p6))
+#define MATCHER_P8(name, p0, p1, p2, p3, p4, p5, p6, p7, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP8, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7), \
+ (p0, p1, p2, p3, p4, p5, p6, p7))
+#define MATCHER_P9(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP9, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8), \
+ (p0, p1, p2, p3, p4, p5, p6, p7, p8))
+#define MATCHER_P10(name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, description) \
+ GMOCK_INTERNAL_MATCHER(name, name##MatcherP10, description, \
+ (#p0, #p1, #p2, #p3, #p4, #p5, #p6, #p7, #p8, #p9), \
+ (p0, p1, p2, p3, p4, p5, p6, p7, p8, p9))
+
+#define GMOCK_INTERNAL_MATCHER(name, full_name, description, arg_names, args) \
+ template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \
+ class full_name : public ::testing::internal::MatcherBaseImpl< \
+ full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>> { \
+ public: \
+ using full_name::MatcherBaseImpl::MatcherBaseImpl; \
+ template <typename arg_type> \
+ class gmock_Impl : public ::testing::MatcherInterface<const arg_type&> { \
+ public: \
+ explicit gmock_Impl(GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args)) \
+ : GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) {} \
+ bool MatchAndExplain( \
+ const arg_type& arg, \
+ ::testing::MatchResultListener* result_listener) const override; \
+ void DescribeTo(::std::ostream* gmock_os) const override { \
+ *gmock_os << FormatDescription(false); \
+ } \
+ void DescribeNegationTo(::std::ostream* gmock_os) const override { \
+ *gmock_os << FormatDescription(true); \
+ } \
+ GMOCK_INTERNAL_MATCHER_MEMBERS(args) \
+ \
+ private: \
+ ::std::string FormatDescription(bool negation) const { \
+ ::std::string gmock_description = (description); \
+ if (!gmock_description.empty()) { \
+ return gmock_description; \
+ } \
+ return ::testing::internal::FormatMatcherDescription( \
+ negation, #name, {GMOCK_PP_REMOVE_PARENS(arg_names)}, \
+ ::testing::internal::UniversalTersePrintTupleFieldsToStrings( \
+ ::std::tuple<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>( \
+ GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args)))); \
+ } \
+ }; \
+ }; \
+ template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \
+ inline full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)> name( \
+ GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args)) { \
+ return full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>( \
+ GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args)); \
+ } \
+ template <GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args)> \
+ template <typename arg_type> \
+ bool full_name<GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args)>::gmock_Impl< \
+ arg_type>::MatchAndExplain(const arg_type& arg, \
+ ::testing::MatchResultListener* \
+ result_listener GTEST_ATTRIBUTE_UNUSED_) \
+ const
+
+#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAMS(args) \
+ GMOCK_PP_TAIL( \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM, , args))
+#define GMOCK_INTERNAL_MATCHER_TEMPLATE_PARAM(i_unused, data_unused, arg) \
+ , typename arg##_type
+
+#define GMOCK_INTERNAL_MATCHER_TYPE_PARAMS(args) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_TYPE_PARAM, , args))
+#define GMOCK_INTERNAL_MATCHER_TYPE_PARAM(i_unused, data_unused, arg) \
+ , arg##_type
+
+#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARGS(args) \
+ GMOCK_PP_TAIL(dummy_first GMOCK_PP_FOR_EACH( \
+ GMOCK_INTERNAL_MATCHER_FUNCTION_ARG, , args))
+#define GMOCK_INTERNAL_MATCHER_FUNCTION_ARG(i, data_unused, arg) \
+ , arg##_type gmock_p##i
+
+#define GMOCK_INTERNAL_MATCHER_FORWARD_ARGS(args) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_FORWARD_ARG, , args))
+#define GMOCK_INTERNAL_MATCHER_FORWARD_ARG(i, data_unused, arg) \
+ , arg(::std::forward<arg##_type>(gmock_p##i))
+
+#define GMOCK_INTERNAL_MATCHER_MEMBERS(args) \
+ GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER, , args)
+#define GMOCK_INTERNAL_MATCHER_MEMBER(i_unused, data_unused, arg) \
+ const arg##_type arg;
+
+#define GMOCK_INTERNAL_MATCHER_MEMBERS_USAGE(args) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_MEMBER_USAGE, , args))
+#define GMOCK_INTERNAL_MATCHER_MEMBER_USAGE(i_unused, data_unused, arg) , arg
+
+#define GMOCK_INTERNAL_MATCHER_ARGS_USAGE(args) \
+ GMOCK_PP_TAIL(GMOCK_PP_FOR_EACH(GMOCK_INTERNAL_MATCHER_ARG_USAGE, , args))
+#define GMOCK_INTERNAL_MATCHER_ARG_USAGE(i, data_unused, arg_unused) \
+ , gmock_p##i
+
+// To prevent ADL on certain functions we put them in a separate namespace.
+using namespace no_adl; // NOLINT
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046
+
+// Include any custom callback matchers added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gmock/internal/custom/gmock-matchers.h"
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MATCHERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
new file mode 100644
index 0000000000..148ac01721
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h
@@ -0,0 +1,662 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some commonly used variadic actions.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
+
+#include <memory>
+#include <utility>
+
+#include "gmock/gmock-actions.h"
+#include "gmock/internal/gmock-port.h"
+
+// Include any custom callback actions added by the local installation.
+#include "gmock/internal/custom/gmock-generated-actions.h"
+
+// Sometimes you want to give an action explicit template parameters
+// that cannot be inferred from its value parameters. ACTION() and
+// ACTION_P*() don't support that. ACTION_TEMPLATE() remedies that
+// and can be viewed as an extension to ACTION() and ACTION_P*().
+//
+// The syntax:
+//
+// ACTION_TEMPLATE(ActionName,
+// HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m),
+// AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; }
+//
+// defines an action template that takes m explicit template
+// parameters and n value parameters. name_i is the name of the i-th
+// template parameter, and kind_i specifies whether it's a typename,
+// an integral constant, or a template. p_i is the name of the i-th
+// value parameter.
+//
+// Example:
+//
+// // DuplicateArg<k, T>(output) converts the k-th argument of the mock
+// // function to type T and copies it to *output.
+// ACTION_TEMPLATE(DuplicateArg,
+// HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
+// AND_1_VALUE_PARAMS(output)) {
+// *output = T(::std::get<k>(args));
+// }
+// ...
+// int n;
+// EXPECT_CALL(mock, Foo(_, _))
+// .WillOnce(DuplicateArg<1, unsigned char>(&n));
+//
+// To create an instance of an action template, write:
+//
+// ActionName<t1, ..., t_m>(v1, ..., v_n)
+//
+// where the ts are the template arguments and the vs are the value
+// arguments. The value argument types are inferred by the compiler.
+// If you want to explicitly specify the value argument types, you can
+// provide additional template arguments:
+//
+// ActionName<t1, ..., t_m, u1, ..., u_k>(v1, ..., v_n)
+//
+// where u_i is the desired type of v_i.
+//
+// ACTION_TEMPLATE and ACTION/ACTION_P* can be overloaded on the
+// number of value parameters, but not on the number of template
+// parameters. Without the restriction, the meaning of the following
+// is unclear:
+//
+// OverloadedAction<int, bool>(x);
+//
+// Are we using a single-template-parameter action where 'bool' refers
+// to the type of x, or are we using a two-template-parameter action
+// where the compiler is asked to infer the type of x?
+//
+// Implementation notes:
+//
+// GMOCK_INTERNAL_*_HAS_m_TEMPLATE_PARAMS and
+// GMOCK_INTERNAL_*_AND_n_VALUE_PARAMS are internal macros for
+// implementing ACTION_TEMPLATE. The main trick we use is to create
+// new macro invocations when expanding a macro. For example, we have
+//
+// #define ACTION_TEMPLATE(name, template_params, value_params)
+// ... GMOCK_INTERNAL_DECL_##template_params ...
+//
+// which causes ACTION_TEMPLATE(..., HAS_1_TEMPLATE_PARAMS(typename, T), ...)
+// to expand to
+//
+// ... GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(typename, T) ...
+//
+// Since GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS is a macro, the
+// preprocessor will continue to expand it to
+//
+// ... typename T ...
+//
+// This technique conforms to the C++ standard and is portable. It
+// allows us to implement action templates using O(N) code, where N is
+// the maximum number of template/value parameters supported. Without
+// using it, we'd have to devote O(N^2) amount of code to implement all
+// combinations of m and n.
+
+// Declares the template parameters as a `kind_i name_i, ...` list.
+#define GMOCK_INTERNAL_DECL_HAS_1_TEMPLATE_PARAMS(kind0, name0) kind0 name0
+#define GMOCK_INTERNAL_DECL_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \
+ kind0 name0, kind1 name1
+#define GMOCK_INTERNAL_DECL_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2) \
+ kind0 name0, kind1 name1, kind2 name2
+#define GMOCK_INTERNAL_DECL_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2, kind3, name3) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3
+#define GMOCK_INTERNAL_DECL_HAS_5_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4
+#define GMOCK_INTERNAL_DECL_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2, kind3, name3, \
+ kind4, name4, kind5, name5) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, kind5 name5
+#define GMOCK_INTERNAL_DECL_HAS_7_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6
+#define GMOCK_INTERNAL_DECL_HAS_8_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7
+#define GMOCK_INTERNAL_DECL_HAS_9_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7, kind8 name8
+#define GMOCK_INTERNAL_DECL_HAS_10_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \
+ kind0 name0, kind1 name1, kind2 name2, kind3 name3, kind4 name4, \
+ kind5 name5, kind6 name6, kind7 name7, kind8 name8, kind9 name9
+
+// Lists the template parameter names only, dropping each kind_i.
+#define GMOCK_INTERNAL_LIST_HAS_1_TEMPLATE_PARAMS(kind0, name0) name0
+#define GMOCK_INTERNAL_LIST_HAS_2_TEMPLATE_PARAMS(kind0, name0, kind1, name1) \
+ name0, name1
+#define GMOCK_INTERNAL_LIST_HAS_3_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2) \
+ name0, name1, name2
+#define GMOCK_INTERNAL_LIST_HAS_4_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2, kind3, name3) \
+ name0, name1, name2, name3
+#define GMOCK_INTERNAL_LIST_HAS_5_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4) \
+ name0, name1, name2, name3, name4
+#define GMOCK_INTERNAL_LIST_HAS_6_TEMPLATE_PARAMS(kind0, name0, kind1, name1, \
+ kind2, name2, kind3, name3, \
+ kind4, name4, kind5, name5) \
+ name0, name1, name2, name3, name4, name5
+#define GMOCK_INTERNAL_LIST_HAS_7_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6) \
+ name0, name1, name2, name3, name4, name5, name6
+#define GMOCK_INTERNAL_LIST_HAS_8_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7) \
+ name0, name1, name2, name3, name4, name5, name6, name7
+#define GMOCK_INTERNAL_LIST_HAS_9_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8) \
+ name0, name1, name2, name3, name4, name5, name6, name7, name8
+#define GMOCK_INTERNAL_LIST_HAS_10_TEMPLATE_PARAMS( \
+ kind0, name0, kind1, name1, kind2, name2, kind3, name3, kind4, name4, \
+ kind5, name5, kind6, name6, kind7, name7, kind8, name8, kind9, name9) \
+ name0, name1, name2, name3, name4, name5, name6, name7, name8, name9
+
+// Declares `, typename p_i##_type` per value parameter (leading comma kept).
+#define GMOCK_INTERNAL_DECL_TYPE_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DECL_TYPE_AND_1_VALUE_PARAMS(p0) , typename p0##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_2_VALUE_PARAMS(p0, p1) \
+ , typename p0##_type, typename p1##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ , typename p0##_type, typename p1##_type, typename p2##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7, p8) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type, typename p8##_type
+#define GMOCK_INTERNAL_DECL_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7, p8, p9) \
+ , typename p0##_type, typename p1##_type, typename p2##_type, \
+ typename p3##_type, typename p4##_type, typename p5##_type, \
+ typename p6##_type, typename p7##_type, typename p8##_type, \
+ typename p9##_type
+
+// Constructor parameters plus initializer list: p_i(::std::move(gmock_p_i)).
+#define GMOCK_INTERNAL_INIT_AND_0_VALUE_PARAMS() ()
+#define GMOCK_INTERNAL_INIT_AND_1_VALUE_PARAMS(p0) \
+ (p0##_type gmock_p0) : p0(::std::move(gmock_p0))
+#define GMOCK_INTERNAL_INIT_AND_2_VALUE_PARAMS(p0, p1) \
+ (p0##_type gmock_p0, p1##_type gmock_p1) \
+ : p0(::std::move(gmock_p0)), p1(::std::move(gmock_p1))
+#define GMOCK_INTERNAL_INIT_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2))
+#define GMOCK_INTERNAL_INIT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3))
+#define GMOCK_INTERNAL_INIT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4))
+#define GMOCK_INTERNAL_INIT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5))
+#define GMOCK_INTERNAL_INIT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6))
+#define GMOCK_INTERNAL_INIT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
+ p7(::std::move(gmock_p7))
+#define GMOCK_INTERNAL_INIT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
+ p7(::std::move(gmock_p7)), \
+ p8(::std::move(gmock_p8))
+#define GMOCK_INTERNAL_INIT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8, p9) \
+ (p0##_type gmock_p0, p1##_type gmock_p1, p2##_type gmock_p2, \
+ p3##_type gmock_p3, p4##_type gmock_p4, p5##_type gmock_p5, \
+ p6##_type gmock_p6, p7##_type gmock_p7, p8##_type gmock_p8, \
+ p9##_type gmock_p9) \
+ : p0(::std::move(gmock_p0)), \
+ p1(::std::move(gmock_p1)), \
+ p2(::std::move(gmock_p2)), \
+ p3(::std::move(gmock_p3)), \
+ p4(::std::move(gmock_p4)), \
+ p5(::std::move(gmock_p5)), \
+ p6(::std::move(gmock_p6)), \
+ p7(::std::move(gmock_p7)), \
+ p8(::std::move(gmock_p8)), \
+ p9(::std::move(gmock_p9))
+
+// Body of the copy/move constructors: `{}` for 0 params, else `= default;`.
+#define GMOCK_INTERNAL_DEFN_COPY_AND_0_VALUE_PARAMS() \
+ {} // Avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82134
+#define GMOCK_INTERNAL_DEFN_COPY_AND_1_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_2_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_3_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_4_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_5_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_6_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_7_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_8_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_9_VALUE_PARAMS(...) = default;
+#define GMOCK_INTERNAL_DEFN_COPY_AND_10_VALUE_PARAMS(...) = default;
+
+// Declares one member field `p_i##_type p_i;` for each value parameter.
+#define GMOCK_INTERNAL_DEFN_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DEFN_AND_1_VALUE_PARAMS(p0) p0##_type p0;
+#define GMOCK_INTERNAL_DEFN_AND_2_VALUE_PARAMS(p0, p1) \
+ p0##_type p0; \
+ p1##_type p1;
+#define GMOCK_INTERNAL_DEFN_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2;
+#define GMOCK_INTERNAL_DEFN_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3;
+#define GMOCK_INTERNAL_DEFN_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4;
+#define GMOCK_INTERNAL_DEFN_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5;
+#define GMOCK_INTERNAL_DEFN_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6;
+#define GMOCK_INTERNAL_DEFN_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7;
+#define GMOCK_INTERNAL_DEFN_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7; \
+ p8##_type p8;
+#define GMOCK_INTERNAL_DEFN_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8, p9) \
+ p0##_type p0; \
+ p1##_type p1; \
+ p2##_type p2; \
+ p3##_type p3; \
+ p4##_type p4; \
+ p5##_type p5; \
+ p6##_type p6; \
+ p7##_type p7; \
+ p8##_type p8; \
+ p9##_type p9;
+
+// Lists the value parameter names: `p0, p1, ...` (nothing for 0 params).
+#define GMOCK_INTERNAL_LIST_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_LIST_AND_1_VALUE_PARAMS(p0) p0
+#define GMOCK_INTERNAL_LIST_AND_2_VALUE_PARAMS(p0, p1) p0, p1
+#define GMOCK_INTERNAL_LIST_AND_3_VALUE_PARAMS(p0, p1, p2) p0, p1, p2
+#define GMOCK_INTERNAL_LIST_AND_4_VALUE_PARAMS(p0, p1, p2, p3) p0, p1, p2, p3
+#define GMOCK_INTERNAL_LIST_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0, p1, p2, p3, p4
+#define GMOCK_INTERNAL_LIST_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0, p1, p2, p3, p4, p5
+#define GMOCK_INTERNAL_LIST_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0, p1, p2, p3, p4, p5, p6
+#define GMOCK_INTERNAL_LIST_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0, p1, p2, p3, p4, p5, p6, p7
+#define GMOCK_INTERNAL_LIST_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0, p1, p2, p3, p4, p5, p6, p7, p8
+#define GMOCK_INTERNAL_LIST_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8, p9) \
+ p0, p1, p2, p3, p4, p5, p6, p7, p8, p9
+
+// Lists the value parameter types, each preceded by a comma: `, p_i##_type`.
+#define GMOCK_INTERNAL_LIST_TYPE_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_LIST_TYPE_AND_1_VALUE_PARAMS(p0) , p0##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_2_VALUE_PARAMS(p0, p1) \
+ , p0##_type, p1##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ , p0##_type, p1##_type, p2##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ , p0##_type, p1##_type, p2##_type, p3##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, p6##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7, p8) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type, p8##_type
+#define GMOCK_INTERNAL_LIST_TYPE_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, \
+ p6, p7, p8, p9) \
+ , p0##_type, p1##_type, p2##_type, p3##_type, p4##_type, p5##_type, \
+ p6##_type, p7##_type, p8##_type, p9##_type
+
+// Declares the value parameters as function arguments: `p_i##_type p_i`.
+#define GMOCK_INTERNAL_DECL_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_DECL_AND_1_VALUE_PARAMS(p0) p0##_type p0
+#define GMOCK_INTERNAL_DECL_AND_2_VALUE_PARAMS(p0, p1) \
+ p0##_type p0, p1##_type p1
+#define GMOCK_INTERNAL_DECL_AND_3_VALUE_PARAMS(p0, p1, p2) \
+ p0##_type p0, p1##_type p1, p2##_type p2
+#define GMOCK_INTERNAL_DECL_AND_4_VALUE_PARAMS(p0, p1, p2, p3) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3
+#define GMOCK_INTERNAL_DECL_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4
+#define GMOCK_INTERNAL_DECL_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5
+#define GMOCK_INTERNAL_DECL_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6
+#define GMOCK_INTERNAL_DECL_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7
+#define GMOCK_INTERNAL_DECL_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, p7, \
+ p8) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8
+#define GMOCK_INTERNAL_DECL_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8, p9) \
+ p0##_type p0, p1##_type p1, p2##_type p2, p3##_type p3, p4##_type p4, \
+ p5##_type p5, p6##_type p6, p7##_type p7, p8##_type p8, p9##_type p9
+
+// Class-name suffix by value-parameter count: empty for 0, P for 1, Pn for n.
+#define GMOCK_INTERNAL_COUNT_AND_0_VALUE_PARAMS()
+#define GMOCK_INTERNAL_COUNT_AND_1_VALUE_PARAMS(p0) P
+#define GMOCK_INTERNAL_COUNT_AND_2_VALUE_PARAMS(p0, p1) P2
+#define GMOCK_INTERNAL_COUNT_AND_3_VALUE_PARAMS(p0, p1, p2) P3
+#define GMOCK_INTERNAL_COUNT_AND_4_VALUE_PARAMS(p0, p1, p2, p3) P4
+#define GMOCK_INTERNAL_COUNT_AND_5_VALUE_PARAMS(p0, p1, p2, p3, p4) P5
+#define GMOCK_INTERNAL_COUNT_AND_6_VALUE_PARAMS(p0, p1, p2, p3, p4, p5) P6
+#define GMOCK_INTERNAL_COUNT_AND_7_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6) P7
+#define GMOCK_INTERNAL_COUNT_AND_8_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7) \
+ P8
+#define GMOCK_INTERNAL_COUNT_AND_9_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8) \
+ P9
+#define GMOCK_INTERNAL_COUNT_AND_10_VALUE_PARAMS(p0, p1, p2, p3, p4, p5, p6, \
+ p7, p8, p9) \
+ P10
+
+// Class template name: name##Action joined with the GMOCK_INTERNAL_COUNT_ suffix.
+#define GMOCK_ACTION_CLASS_(name, value_params) \
+ GTEST_CONCAT_TOKEN_(name##Action, GMOCK_INTERNAL_COUNT_##value_params)
+// Emits the action class, the factory `name()`, and gmock_PerformImpl's header.
+#define ACTION_TEMPLATE(name, template_params, value_params) \
+ template <GMOCK_INTERNAL_DECL_##template_params \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ class GMOCK_ACTION_CLASS_(name, value_params) { \
+ public: \
+ explicit GMOCK_ACTION_CLASS_(name, value_params)( \
+ GMOCK_INTERNAL_DECL_##value_params) \
+ GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \
+ = default; \
+ , \
+ : impl_(std::make_shared<gmock_Impl>( \
+ GMOCK_INTERNAL_LIST_##value_params)){}) \
+ GMOCK_ACTION_CLASS_(name, value_params)(const GMOCK_ACTION_CLASS_( \
+ name, value_params) &) noexcept GMOCK_INTERNAL_DEFN_COPY_ \
+ ##value_params GMOCK_ACTION_CLASS_(name, value_params)( \
+ GMOCK_ACTION_CLASS_(name, value_params) &&) noexcept \
+ GMOCK_INTERNAL_DEFN_COPY_##value_params template <typename F> \
+ operator ::testing::Action<F>() const { \
+ return GMOCK_PP_IF( \
+ GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), \
+ (::testing::internal::MakeAction<F, gmock_Impl>()), \
+ (::testing::internal::MakeAction<F>(impl_))); \
+ } \
+ \
+ private: \
+ class gmock_Impl { \
+ public: \
+ explicit gmock_Impl GMOCK_INTERNAL_INIT_##value_params {} \
+ template <typename function_type, typename return_type, \
+ typename args_type, GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_) const; \
+ GMOCK_INTERNAL_DEFN_##value_params \
+ }; \
+ GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(GMOCK_INTERNAL_COUNT_##value_params), , \
+ std::shared_ptr<const gmock_Impl> impl_;) \
+ }; \
+ template <GMOCK_INTERNAL_DECL_##template_params \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params> \
+ name(GMOCK_INTERNAL_DECL_##value_params) GTEST_MUST_USE_RESULT_; \
+ template <GMOCK_INTERNAL_DECL_##template_params \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ inline GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params> \
+ name(GMOCK_INTERNAL_DECL_##value_params) { \
+ return GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params>( \
+ GMOCK_INTERNAL_LIST_##value_params); \
+ } \
+ template <GMOCK_INTERNAL_DECL_##template_params \
+ GMOCK_INTERNAL_DECL_TYPE_##value_params> \
+ template <typename function_type, typename return_type, typename args_type, \
+ GMOCK_ACTION_TEMPLATE_ARGS_NAMES_> \
+ return_type GMOCK_ACTION_CLASS_( \
+ name, value_params)<GMOCK_INTERNAL_LIST_##template_params \
+ GMOCK_INTERNAL_LIST_TYPE_##value_params>:: \
+ gmock_Impl::gmock_PerformImpl(GMOCK_ACTION_ARG_TYPES_AND_NAMES_UNUSED_) \
+ const
+
+namespace testing {
+
+// The ACTION*() macros trigger warning C4100 (unreferenced formal
+// parameter) in MSVC with -W4. Unfortunately they cannot be fixed in
+// the macro definition, as the warnings are generated when the macro
+// is expanded and macro expansion cannot contain #pragma. Therefore
+// we suppress them here.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
+namespace internal {
+
+// internal::InvokeArgument - the call primitive behind the InvokeArgument
+// action. This generic overload serves any functor; overloads for custom
+// callables live in internal/custom/gmock-generated-actions.h.
+template <typename Callable, typename... CallArgs>
+auto InvokeArgument(Callable callable, CallArgs... call_args)
+    -> decltype(callable(call_args...)) {
+  return callable(call_args...);
+}
+// Action functor: invokes its index-th call argument with the stored params.
+template <std::size_t index, typename... Params>
+struct InvokeArgumentAction {
+ template <typename... Args,
+ typename = typename std::enable_if<(index < sizeof...(Args))>::type>
+ auto operator()(Args&&... args) const -> decltype(internal::InvokeArgument(
+ std::get<index>(std::forward_as_tuple(std::forward<Args>(args)...)),
+ std::declval<const Params&>()...)) {
+ internal::FlatTuple<Args&&...> args_tuple(FlatTupleConstructTag{},
+ std::forward<Args>(args)...);
+ return params.Apply([&](const Params&... unpacked_params) {
+ auto&& callable = args_tuple.template Get<index>();
+ return internal::InvokeArgument(
+ std::forward<decltype(callable)>(callable), unpacked_params...);
+ });
+ }
+ // Values handed to the selected callable, unpacked via Apply() on each call.
+ internal::FlatTuple<Params...> params;
+};
+
+} // namespace internal
+
+// The InvokeArgument<N>(a1, a2, ..., a_k) action invokes the N-th
+// (0-based) argument of the mock function, which must be a k-ary
+// callable, with arguments a1, a2, ..., a_k.
+//
+// Notes:
+//
+// 1. The arguments are stored by value (as their std::decay'ed types)
+// in the returned action. If you need to pass an argument by
+// reference, wrap it inside std::ref(). For example,
+//
+// InvokeArgument<1>(5, string("Hello"), std::ref(foo))
+//
+// passes 5 and string("Hello") by value, and passes foo by
+// reference.
+//
+// 2. If the callable takes an argument by reference but std::ref() is
+// not used, it will receive a reference to the stored copy of the
+// value, instead of the original value. For example, when the 0-th
+// argument of the mock function takes a const string&, the action
+//
+// InvokeArgument<0>(string("Hello"))
+//
+// makes a copy of the temporary string("Hello") object and passes a
+// reference to the copy, instead of the original temporary object,
+// to the callable. This makes it easy for a user to define an
+// InvokeArgument action from temporary values and have it performed
+// later.
+template <std::size_t index, typename... Params>
+internal::InvokeArgumentAction<index, typename std::decay<Params>::type...>
+InvokeArgument(Params&&... params) {
+ return {internal::FlatTuple<typename std::decay<Params>::type...>(
+ internal::FlatTupleConstructTag{}, std::forward<Params>(params)...)};
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+} // namespace testing
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_ACTIONS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
new file mode 100644
index 0000000000..47aaf98461
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h
@@ -0,0 +1,91 @@
+// Copyright 2013, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements some matchers that depend on gmock-matchers.h.
+//
+// Note that tests are implemented in gmock-matchers_test.cc rather than
+// gmock-more-matchers-test.cc.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
+
+#include "gmock/gmock-matchers.h"
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal
+// parameter) for MSVC
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#if (_MSC_VER == 1900)
+// and silence C4800 (C4800: 'int *const ': forcing value
+// to bool 'true' or 'false') for MSVC 14
+#pragma warning(disable : 4800)
+#endif
+#endif
+
+// Matches an empty container. The container type must provide both
+// empty() and size(); for a non-empty one the size is sent to the listener.
+MATCHER(IsEmpty, negation ? "isn't empty" : "is empty") {
+  const bool container_is_empty = static_cast<bool>(arg.empty());
+  if (!container_is_empty) {
+    *result_listener << "whose size is " << arg.size();
+  }
+  return container_is_empty;
+}
+
+// Matches any value that converts to true in a boolean context.
+// Handy for types that only offer "explicit operator bool" and
+// therefore cannot be compared for equality against true and
+// false.
+MATCHER(IsTrue, negation ? "is false" : "is true") {
+  return arg ? true : false;
+}
+
+// Matches any value that converts to false in a boolean context.
+// Handy for types that only offer "explicit operator bool" and
+// therefore cannot be compared for equality against true and
+// false.
+MATCHER(IsFalse, negation ? "is true" : "is false") {
+  return arg ? false : true;
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+} // namespace testing
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_MORE_MATCHERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
new file mode 100644
index 0000000000..4f0eb35db7
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h
@@ -0,0 +1,277 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Implements class templates NiceMock, NaggyMock, and StrictMock.
+//
+// Given a mock class MockFoo that is created using Google Mock,
+// NiceMock<MockFoo> is a subclass of MockFoo that allows
+// uninteresting calls (i.e. calls to mock methods that have no
+// EXPECT_CALL specs), NaggyMock<MockFoo> is a subclass of MockFoo
+// that prints a warning when an uninteresting call occurs, and
+// StrictMock<MockFoo> is a subclass of MockFoo that treats all
+// uninteresting calls as errors.
+//
+// Currently a mock is naggy by default, so MockFoo and
+// NaggyMock<MockFoo> behave the same. However, we will soon
+// switch the default behavior of mocks to be nice, as that in general
+// leads to more maintainable tests. When that happens, MockFoo will
+// stop behaving like NaggyMock<MockFoo> and start behaving like
+// NiceMock<MockFoo>.
+//
+// NiceMock, NaggyMock, and StrictMock "inherit" the constructors of
+// their respective base class. Therefore you can write
+// NiceMock<MockFoo>(5, "a") to construct a nice mock where MockFoo
+// has a constructor that accepts (int, const char*), for example.
+//
+// A known limitation is that NiceMock<MockFoo>, NaggyMock<MockFoo>,
+// and StrictMock<MockFoo> only work for mock methods defined using
+// the MOCK_METHOD* family of macros DIRECTLY in the MockFoo class.
+// If a mock method is defined in a base class of MockFoo, the "nice"
+// or "strict" modifier may not affect it, depending on the compiler.
+// In particular, nesting NiceMock, NaggyMock, and StrictMock is NOT
+// supported.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "gmock/gmock-spec-builders.h"
+#include "gmock/internal/gmock-port.h"
+
+namespace testing {
+template <class MockClass>
+class NiceMock;
+template <class MockClass>
+class NaggyMock;
+template <class MockClass>
+class StrictMock;
+
+namespace internal {
+template <typename T>
+std::true_type StrictnessModifierProbe(const NiceMock<T>&); // declaration only here; used inside decltype below
+template <typename T>
+std::true_type StrictnessModifierProbe(const NaggyMock<T>&);
+template <typename T>
+std::true_type StrictnessModifierProbe(const StrictMock<T>&);
+std::false_type StrictnessModifierProbe(...); // fallback when none of the overloads above applies
+// True when a `const T&` argument selects one of the true_type overloads above.
+template <typename T>
+constexpr bool HasStrictnessModifier() {
+ return decltype(StrictnessModifierProbe(std::declval<const T&>()))::value;
+}
+
+// Base classes that register and deregister with testing::Mock to alter the
+// default behavior around uninteresting calls. Inheriting from one of these
+// classes first and then MockClass ensures the MockClass constructor is run
+// after registration, and that the MockClass destructor runs before
+// deregistration. This guarantees that MockClass's constructor and destructor
+// run with the same level of strictness as its instance methods.
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW && \
+ (defined(_MSC_VER) || defined(__clang__))
+// We need to mark these classes with this declspec to ensure that
+// the empty base class optimization is performed.
+#define GTEST_INTERNAL_EMPTY_BASE_CLASS __declspec(empty_bases)
+#else
+#define GTEST_INTERNAL_EMPTY_BASE_CLASS
+#endif
+
+template <typename Base>
+class NiceMockImpl {
+ public:
+ NiceMockImpl() { // passes this object's address, as an integer, to Mock
+ ::testing::Mock::AllowUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
+ // Hands the same integer key back to Mock so the call-reaction entry is removed.
+ ~NiceMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
+};
+
+template <typename Base>
+class NaggyMockImpl {
+ public:
+ NaggyMockImpl() { // passes this object's address, as an integer, to Mock
+ ::testing::Mock::WarnUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
+ // Hands the same integer key back to Mock so the call-reaction entry is removed.
+ ~NaggyMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
+};
+
+template <typename Base>
+class StrictMockImpl {
+ public:
+ StrictMockImpl() { // passes this object's address, as an integer, to Mock
+ ::testing::Mock::FailUninterestingCalls(reinterpret_cast<uintptr_t>(this));
+ }
+ // Hands the same integer key back to Mock so the call-reaction entry is removed.
+ ~StrictMockImpl() {
+ ::testing::Mock::UnregisterCallReaction(reinterpret_cast<uintptr_t>(this));
+ }
+};
+
+} // namespace internal
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS NiceMock
+ : private internal::NiceMockImpl<MockClass>, // first base: constructed before MockClass
+ public MockClass { // second base: destroyed before NiceMockImpl
+ public:
+ static_assert(!internal::HasStrictnessModifier<MockClass>(),
+ "Can't apply NiceMock to a class hierarchy that already has a "
+ "strictness modifier. See "
+ "https://google.github.io/googletest/"
+ "gmock_cook_book.html#NiceStrictNaggy");
+ NiceMock() : MockClass() {
+ static_assert(sizeof(*this) == sizeof(MockClass), // the empty impl base must add no size
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ // Ideally, we would inherit base class's constructors through a using
+ // declaration, which would preserve their visibility. However, many existing
+ // tests rely on the fact that current implementation reexports protected
+ // constructors as public. These tests would need to be cleaned up first.
+
+ // Single argument constructor is special-cased so that it can be
+ // made explicit.
+ template <typename A>
+ explicit NiceMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+ // Two or more arguments: each one is perfectly forwarded to MockClass.
+ template <typename TArg1, typename TArg2, typename... An>
+ NiceMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+ : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+ std::forward<An>(args)...) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ private:
+ NiceMock(const NiceMock&) = delete; // copying is disabled
+ NiceMock& operator=(const NiceMock&) = delete;
+};
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS NaggyMock
+ : private internal::NaggyMockImpl<MockClass>, // first base: constructed before MockClass
+ public MockClass { // second base: destroyed before NaggyMockImpl
+ static_assert(!internal::HasStrictnessModifier<MockClass>(),
+ "Can't apply NaggyMock to a class hierarchy that already has a "
+ "strictness modifier. See "
+ "https://google.github.io/googletest/"
+ "gmock_cook_book.html#NiceStrictNaggy");
+
+ public:
+ NaggyMock() : MockClass() {
+ static_assert(sizeof(*this) == sizeof(MockClass), // the empty impl base must add no size
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ // Ideally, we would inherit base class's constructors through a using
+ // declaration, which would preserve their visibility. However, many existing
+ // tests rely on the fact that current implementation reexports protected
+ // constructors as public. These tests would need to be cleaned up first.
+
+ // Single argument constructor is special-cased so that it can be
+ // made explicit.
+ template <typename A>
+ explicit NaggyMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+ // Two or more arguments: each one is perfectly forwarded to MockClass.
+ template <typename TArg1, typename TArg2, typename... An>
+ NaggyMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+ : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+ std::forward<An>(args)...) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ private:
+ NaggyMock(const NaggyMock&) = delete; // copying is disabled
+ NaggyMock& operator=(const NaggyMock&) = delete;
+};
+
+template <class MockClass>
+class GTEST_INTERNAL_EMPTY_BASE_CLASS StrictMock
+ : private internal::StrictMockImpl<MockClass>, // first base: constructed before MockClass
+ public MockClass { // second base: destroyed before StrictMockImpl
+ public:
+ static_assert(
+ !internal::HasStrictnessModifier<MockClass>(),
+ "Can't apply StrictMock to a class hierarchy that already has a "
+ "strictness modifier. See "
+ "https://google.github.io/googletest/"
+ "gmock_cook_book.html#NiceStrictNaggy");
+ StrictMock() : MockClass() {
+ static_assert(sizeof(*this) == sizeof(MockClass), // the empty impl base must add no size
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ // Ideally, we would inherit base class's constructors through a using
+ // declaration, which would preserve their visibility. However, many existing
+ // tests rely on the fact that current implementation reexports protected
+ // constructors as public. These tests would need to be cleaned up first.
+
+ // Single argument constructor is special-cased so that it can be
+ // made explicit.
+ template <typename A>
+ explicit StrictMock(A&& arg) : MockClass(std::forward<A>(arg)) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+ // Two or more arguments: each one is perfectly forwarded to MockClass.
+ template <typename TArg1, typename TArg2, typename... An>
+ StrictMock(TArg1&& arg1, TArg2&& arg2, An&&... args)
+ : MockClass(std::forward<TArg1>(arg1), std::forward<TArg2>(arg2),
+ std::forward<An>(args)...) {
+ static_assert(sizeof(*this) == sizeof(MockClass),
+ "The impl subclass shouldn't introduce any padding");
+ }
+
+ private:
+ StrictMock(const StrictMock&) = delete; // copying is disabled
+ StrictMock& operator=(const StrictMock&) = delete;
+};
+
+#undef GTEST_INTERNAL_EMPTY_BASE_CLASS
+
+} // namespace testing
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_NICE_STRICT_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
new file mode 100644
index 0000000000..45cc605183
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h
@@ -0,0 +1,2083 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements the ON_CALL() and EXPECT_CALL() macros.
+//
+// A user can use the ON_CALL() macro to specify the default action of
+// a mock method. The syntax is:
+//
+// ON_CALL(mock_object, Method(argument-matchers))
+// .With(multi-argument-matcher)
+// .WillByDefault(action);
+//
+// where the .With() clause is optional.
+//
+// A user can use the EXPECT_CALL() macro to specify an expectation on
+// a mock method. The syntax is:
+//
+// EXPECT_CALL(mock_object, Method(argument-matchers))
+// .With(multi-argument-matchers)
+// .Times(cardinality)
+// .InSequence(sequences)
+// .After(expectations)
+// .WillOnce(action)
+// .WillRepeatedly(action)
+// .RetiresOnSaturation();
+//
+// where all clauses are optional, and .InSequence()/.After()/
+// .WillOnce() can appear any number of times.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock-actions.h"
+#include "gmock/gmock-cardinalities.h"
+#include "gmock/gmock-matchers.h"
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept> // NOLINT
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// An abstract handle of an expectation.
+class Expectation;
+
+// A set of expectation handles.
+class ExpectationSet;
+
+// Anything inside the 'internal' namespace IS INTERNAL IMPLEMENTATION
+// and MUST NOT BE USED IN USER CODE!!!
+namespace internal {
+
+// Implements a mock function.
+template <typename F>
+class FunctionMocker;
+
+// Base class for expectations.
+class ExpectationBase;
+
+// Implements an expectation.
+template <typename F>
+class TypedExpectation;
+
+// Helper class for testing the Expectation class template.
+class ExpectationTester;
+
+// Helper classes for implementing NiceMock, StrictMock, and NaggyMock.
+template <typename MockClass>
+class NiceMockImpl;
+template <typename MockClass>
+class StrictMockImpl;
+template <typename MockClass>
+class NaggyMockImpl;
+
+// Protects the mock object registry (in class Mock), all function
+// mockers, and all expectations.
+//
+// The reason we don't use more fine-grained protection is: when a
+// mock function Foo() is called, it needs to consult its expectations
+// to see which one should be picked. If another thread is allowed to
+// call a mock function (either Foo() or a different one) at the same
+// time, it could affect the "retired" attributes of Foo()'s
+// expectations when InSequence() is used, and thus affect which
+// expectation gets picked. Therefore, we sequence all mock function
+// calls to ensure the integrity of the mock objects' states.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_gmock_mutex);
+
+// Abstract base class of FunctionMocker. This is the
+// type-agnostic part of the function mocker interface. Its pure
+// virtual methods are implemented by FunctionMocker.
+class GTEST_API_ UntypedFunctionMockerBase {
+ public:
+ UntypedFunctionMockerBase();
+ virtual ~UntypedFunctionMockerBase();
+
+ // Verifies that all expectations on this mock function have been
+ // satisfied. Reports one or more Google Test non-fatal failures
+ // and returns false if not.
+ bool VerifyAndClearExpectationsLocked()
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+ // Clears the ON_CALL()s set on this mock function.
+ virtual void ClearDefaultActionsLocked()
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) = 0;
+
+ // In all of the following Untyped* functions, it's the caller's
+ // responsibility to guarantee the correctness of the arguments'
+ // types.
+
+ // Writes a message that the call is uninteresting (i.e. neither
+ // explicitly expected nor explicitly unexpected) to the given
+ // ostream.
+ virtual void UntypedDescribeUninterestingCall(const void* untyped_args,
+ ::std::ostream* os) const
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+ // Returns the expectation that matches the given function arguments
+ // (or NULL if there's no match); when a match is found,
+ // untyped_action is set to point to the action that should be
+ // performed (or NULL if the action is "do default"), and
+ // is_excessive is modified to indicate whether the call exceeds the
+ // expected number.
+ virtual const ExpectationBase* UntypedFindMatchingExpectation(
+ const void* untyped_args, const void** untyped_action, bool* is_excessive,
+ ::std::ostream* what, ::std::ostream* why)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) = 0;
+
+ // Prints the given function arguments to the ostream.
+ virtual void UntypedPrintArgs(const void* untyped_args,
+ ::std::ostream* os) const = 0;
+
+ // Sets the mock object this mock method belongs to, and registers
+ // this information in the global mock registry. Will be called
+ // whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
+ // method.
+ void RegisterOwner(const void* mock_obj) GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+ // Sets the mock object this mock method belongs to, and sets the
+ // name of the mock function. Will be called upon each invocation
+ // of this mock function.
+ void SetOwnerAndName(const void* mock_obj, const char* name)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+ // Returns the mock object this mock method belongs to. Must be
+ // called after RegisterOwner() or SetOwnerAndName() has been
+ // called.
+ const void* MockObject() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+ // Returns the name of this mock method. Must be called after
+ // SetOwnerAndName() has been called.
+ const char* Name() const GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+
+ protected:
+ typedef std::vector<const void*> UntypedOnCallSpecs; // NOTE(review): presumably type-erased OnCallSpec<F> pointers -- confirm
+
+ using UntypedExpectations = std::vector<std::shared_ptr<ExpectationBase>>;
+
+ // Returns an Expectation object that references and co-owns exp,
+ // which must be an expectation on this mock function.
+ Expectation GetHandleOf(ExpectationBase* exp);
+
+ // Address of the mock object this mock method belongs to. Only
+ // valid after this mock method has been called or
+ // ON_CALL/EXPECT_CALL has been invoked on it.
+ const void* mock_obj_; // Protected by g_gmock_mutex.
+
+ // Name of the function being mocked. Only valid after this mock
+ // method has been called.
+ const char* name_; // Protected by g_gmock_mutex.
+
+ // All default action specs for this function mocker.
+ UntypedOnCallSpecs untyped_on_call_specs_;
+
+ // All expectations for this function mocker.
+ //
+ // It's undefined behavior to interleave expectations (EXPECT_CALLs
+ // or ON_CALLs) and mock function calls. Also, the order of
+ // expectations is important. Therefore it's a logic race condition
+ // to read/write untyped_expectations_ concurrently. In order for
+ // tools like tsan to catch concurrent read/write accesses to
+ // untyped_expectations_, we deliberately leave accesses to it
+ // unprotected.
+ UntypedExpectations untyped_expectations_;
+}; // class UntypedFunctionMockerBase
+
+// Untyped base class for OnCallSpec<F>.
+class UntypedOnCallSpecBase {
+ public:
+ // The arguments are the location of the ON_CALL() statement.
+ UntypedOnCallSpecBase(const char* a_file, int a_line)
+ : file_(a_file), line_(a_line), last_clause_(kNone) {}
+
+ // Where in the source file was the default action spec defined?
+ const char* file() const { return file_; }
+ int line() const { return line_; }
+
+ protected:
+ // Gives each clause in the ON_CALL() statement a name.
+ enum Clause {
+ // Do not change the order of the enum members! The run-time
+ // syntax checking relies on it.
+ kNone,
+ kWith,
+ kWillByDefault
+ };
+
+ // Asserts that the ON_CALL() statement has a certain property.
+ void AssertSpecProperty(bool property,
+ const std::string& failure_message) const {
+ Assert(property, file_, line_, failure_message); // reported against the ON_CALL() location
+ }
+
+ // Expects that the ON_CALL() statement has a certain property.
+ void ExpectSpecProperty(bool property,
+ const std::string& failure_message) const {
+ Expect(property, file_, line_, failure_message); // reported against the ON_CALL() location
+ }
+
+ const char* file_; // File of the ON_CALL() statement; the pointer is stored, not copied.
+ int line_; // Line of the ON_CALL() statement.
+
+ // The last clause in the ON_CALL() statement as seen so far.
+ // Initially kNone and changes as the statement is parsed.
+ Clause last_clause_;
+}; // class UntypedOnCallSpecBase
+
+// This template class implements an ON_CALL spec.
+template <typename F>
+class OnCallSpec : public UntypedOnCallSpecBase {
+ public:
+  using ArgumentTuple = typename Function<F>::ArgumentTuple;
+  using ArgumentMatcherTuple = typename Function<F>::ArgumentMatcherTuple;
+
+  // Builds the spec from what appears between the parentheses of an
+  // ON_CALL() statement, plus the location of that statement.
+  OnCallSpec(const char* a_file, int a_line,
+             const ArgumentMatcherTuple& matchers)
+      : UntypedOnCallSpecBase(a_file, a_line),
+        matchers_(matchers),
+        // Until .With() replaces it, extra_matcher_ accepts every tuple. It
+        // is not initialized with _ because, for some argument types, that is
+        // ambiguous between Matcher's copy and move constructors.
+        extra_matcher_(A<const ArgumentTuple&>()) {}
+
+  // Implements the .With() clause.
+  OnCallSpec& With(const Matcher<const ArgumentTuple&>& m) {
+    // .With() is only legal while no other clause has been seen.
+    const bool first_with = last_clause_ < kWith;
+    ExpectSpecProperty(first_with,
+                       ".With() cannot appear "
+                       "more than once in an ON_CALL().");
+    last_clause_ = kWith;
+    extra_matcher_ = m;
+    return *this;
+  }
+
+  // Implements the .WillByDefault() clause.
+  OnCallSpec& WillByDefault(const Action<F>& action) {
+    const bool first_default = last_clause_ < kWillByDefault;
+    ExpectSpecProperty(first_default,
+                       ".WillByDefault() must appear "
+                       "exactly once in an ON_CALL().");
+    last_clause_ = kWillByDefault;
+    ExpectSpecProperty(!action.IsDoDefault(),
+                       "DoDefault() cannot be used in ON_CALL().");
+    action_ = action;
+    return *this;
+  }
+
+  // True if and only if args satisfy both matchers_ and extra_matcher_.
+  bool Matches(const ArgumentTuple& args) const {
+    return TupleMatches(matchers_, args) && extra_matcher_.Matches(args);
+  }
+
+  // Returns the user's action; asserts that .WillByDefault() was the last clause.
+  const Action<F>& GetAction() const {
+    AssertSpecProperty(last_clause_ == kWillByDefault,
+                       ".WillByDefault() must appear exactly "
+                       "once in an ON_CALL().");
+    return action_;
+  }
+
+ private:
+  // How the pieces of
+  //
+  //   ON_CALL(mock_object, Method(matchers))
+  //       .With(multi-argument-matcher)
+  //       .WillByDefault(action);
+  //
+  // map onto the data members:
+  //
+  //   source file containing the statement => file_
+  //   line number of the statement         => line_
+  //   matchers                             => matchers_
+  //   multi-argument-matcher               => extra_matcher_
+  //   action                               => action_
+  ArgumentMatcherTuple matchers_;
+  Matcher<const ArgumentTuple&> extra_matcher_;
+  Action<F> action_;
+};  // class OnCallSpec
+
+// Possible reactions on uninteresting calls.
+enum CallReaction {
+  kAllow = 0,  // The uninteresting call is allowed.
+  kWarn = 1,   // The uninteresting call produces a warning.
+  kFail = 2,   // The uninteresting call is treated as a failure.
+};
+
+} // namespace internal
+
+// Utilities for manipulating mock objects.
+class GTEST_API_ Mock {
+ public:
+ // The following public methods can be called concurrently.
+
+ // Tells Google Mock to ignore mock_obj when checking for leaked
+ // mock objects.
+ static void AllowLeak(const void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Verifies and clears all expectations on the given mock object.
+ // If the expectations aren't satisfied, generates one or more
+ // Google Test non-fatal failures and returns false.
+ static bool VerifyAndClearExpectations(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Verifies all expectations on the given mock object and clears its
+ // default actions and expectations. Returns true if and only if the
+ // verification was successful.
+ static bool VerifyAndClear(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Returns whether the mock was created as a naggy mock (the default).
+ static bool IsNaggy(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+ // Returns whether the mock was created as a nice mock.
+ static bool IsNice(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+ // Returns whether the mock was created as a strict mock.
+ static bool IsStrict(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ private:
+ friend class internal::UntypedFunctionMockerBase;
+
+ // Needed for a function mocker to register itself (so that we know
+ // how to clear a mock object).
+ template <typename F>
+ friend class internal::FunctionMocker;
+
+ template <typename MockClass> // the *MockImpl helpers call the private *UninterestingCalls() below
+ friend class internal::NiceMockImpl;
+ template <typename MockClass>
+ friend class internal::NaggyMockImpl;
+ template <typename MockClass>
+ friend class internal::StrictMockImpl;
+
+ // Tells Google Mock to allow uninteresting calls on the given mock
+ // object.
+ static void AllowUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Tells Google Mock to warn the user about uninteresting calls on
+ // the given mock object.
+ static void WarnUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Tells Google Mock to fail uninteresting calls on the given mock
+ // object.
+ static void FailUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Tells Google Mock the given mock object is being destroyed and
+ // its entry in the call-reaction table should be removed.
+ static void UnregisterCallReaction(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Returns the reaction Google Mock will have on uninteresting calls
+ // made on the given mock object.
+ static internal::CallReaction GetReactionOnUninterestingCalls(
+ const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Verifies that all expectations on the given mock object have been
+ // satisfied. Reports one or more Google Test non-fatal failures
+ // and returns false if not.
+ static bool VerifyAndClearExpectationsLocked(void* mock_obj)
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+
+ // Clears all ON_CALL()s set on the given mock object.
+ static void ClearDefaultActionsLocked(void* mock_obj)
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+
+ // Registers a mock object and a mock method it owns.
+ static void Register(const void* mock_obj,
+ internal::UntypedFunctionMockerBase* mocker)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Tells Google Mock where in the source code mock_obj is used in an
+ // ON_CALL or EXPECT_CALL. In case mock_obj is leaked, this
+ // information helps the user identify which object it is.
+ static void RegisterUseByOnCallOrExpectCall(const void* mock_obj,
+ const char* file, int line)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex);
+
+ // Unregisters a mock method; removes the owning mock object from
+ // the registry when the last mock method associated with it has
+ // been unregistered. This is called only in the destructor of
+ // FunctionMocker.
+ static void UnregisterLocked(internal::UntypedFunctionMockerBase* mocker)
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex);
+}; // class Mock
+
+// An abstract handle of an expectation. Useful in the .After()
+// clause of EXPECT_CALL() for setting the (partial) order of
+// expectations. The syntax:
+//
+// Expectation e1 = EXPECT_CALL(...)...;
+// EXPECT_CALL(...).After(e1)...;
+//
+// sets two expectations where the latter can only be matched after
+// the former has been satisfied.
+//
+// Notes:
+// - This class is copyable and has value semantics.
+// - Constness is shallow: a const Expectation object itself cannot
+// be modified, but the mutable methods of the ExpectationBase
+// object it references can be called via expectation_base().
+
+class GTEST_API_ Expectation {
+ public:
+  // Creates a null handle, i.e. one that references no expectation.
+  Expectation();
+  Expectation(Expectation&&) = default;
+  Expectation(const Expectation&) = default;
+  Expectation& operator=(Expectation&&) = default;
+  Expectation& operator=(const Expectation&) = default;
+  ~Expectation();
+
+  // Intentionally implicit (hence the NOLINT) so that the
+  //   Expectation e = EXPECT_CALL(...);
+  // syntax works.
+  //
+  // The parameter is a *non-const* reference: a TypedExpectation
+  // keeps its pre-requisites as Expectation objects and has to
+  // call the non-const Retire() method on the ExpectationBase
+  // objects those handles reference, so a const reference would
+  // not do.
+  Expectation(internal::ExpectationBase& exp);  // NOLINT
+
+  // Copy/move construction and assignment are explicitly defaulted above;
+  // the compiler-generated versions do exactly what is needed.
+
+  // Two handles compare equal if and only if they reference the same
+  // expectation (two null handles are equal as well).
+  bool operator==(const Expectation& rhs) const {
+    return expectation_base_.get() == rhs.expectation_base_.get();
+  }
+
+  bool operator!=(const Expectation& rhs) const { return !operator==(rhs); }
+
+ private:
+  friend class ExpectationSet;
+  friend class Sequence;
+  friend class ::testing::internal::ExpectationBase;
+  friend class ::testing::internal::UntypedFunctionMockerBase;
+
+  template <typename F>
+  friend class ::testing::internal::FunctionMocker;
+
+  template <typename F>
+  friend class ::testing::internal::TypedExpectation;
+
+  // Orders handles by referenced address so they can be kept in a set.
+  class Less {
+   public:
+    bool operator()(const Expectation& first, const Expectation& second) const {
+      return first.expectation_base_.get() < second.expectation_base_.get();
+    }
+  };
+
+  using Set = ::std::set<Expectation, Less>;
+
+  Expectation(
+      const std::shared_ptr<internal::ExpectationBase>& expectation_base);
+
+  // Gives access to the (possibly null) expectation behind this handle.
+  const std::shared_ptr<internal::ExpectationBase>& expectation_base() const {
+    return expectation_base_;
+  }
+
+  // Co-owns the referenced expectation; empty for a null handle.
+  std::shared_ptr<internal::ExpectationBase> expectation_base_;
+};
+
+// A set of expectation handles. Useful in the .After() clause of
+// EXPECT_CALL() for setting the (partial) order of expectations. The
+// syntax:
+//
+// ExpectationSet es;
+// es += EXPECT_CALL(...)...;
+// es += EXPECT_CALL(...)...;
+// EXPECT_CALL(...).After(es)...;
+//
+// sets three expectations where the last one can only be matched
+// after the first two have both been satisfied.
+//
+// This class is copyable and has value semantics.
+class ExpectationSet {
+ public:
+  // Read-only bidirectional iterator over the handles in the set.
+  using const_iterator = Expectation::Set::const_iterator;
+
+  // Element type of the set; simply another name for Expectation.
+  using value_type = Expectation::Set::value_type;
+
+  // Creates a set with no expectations in it.
+  ExpectationSet() {}
+
+  // Intentionally implicit (hence the NOLINT) so that the
+  //   ExpectationSet es = EXPECT_CALL(...);
+  // syntax works.
+  ExpectationSet(internal::ExpectationBase& exp) {  // NOLINT
+    expectations_.insert(Expectation(exp));
+  }
+
+  // Intentionally implicit conversion from a single Expectation, which
+  // lets .After() accept either an Expectation or an ExpectationSet
+  // through one parameter type.
+  ExpectationSet(const Expectation& e) {  // NOLINT
+    expectations_.insert(e);
+  }
+
+  // The compiler-generated copy ctor and operator= work exactly as
+  // intended, so we don't need to define our own.
+
+  // Two sets are equal if and only if they hold the same Expectation
+  // handles.
+  bool operator==(const ExpectationSet& rhs) const {
+    return expectations_ == rhs.expectations_;
+  }
+
+  bool operator!=(const ExpectationSet& rhs) const {
+    return !(expectations_ == rhs.expectations_);
+  }
+  // Implements the syntax
+  //   expectation_set += EXPECT_CALL(...);
+  ExpectationSet& operator+=(const Expectation& e) {
+    expectations_.insert(e);
+    return *this;
+  }
+
+  int size() const { return static_cast<int>(expectations_.size()); }
+
+  const_iterator begin() const { return expectations_.cbegin(); }
+  const_iterator end() const { return expectations_.cend(); }
+
+ private:
+  Expectation::Set expectations_;
+};
+
+// Sequence objects are used by a user to specify the relative order
+// in which the expectations should match. They are copyable (we rely
+// on the compiler-defined copy constructor and assignment operator).
+// Because the only data member is a std::shared_ptr, a compiler-generated
+// copy shares the same last_expectation_ object with its source.
+class GTEST_API_ Sequence {
+ public:
+ // Constructs an empty sequence. The "last expectation" slot is allocated
+ // up front and holds a default-constructed Expectation.
+ Sequence() : last_expectation_(new Expectation) {}
+
+ // Adds an expectation to this sequence. The caller must ensure
+ // that no other thread is accessing this Sequence object.
+ // Declared const; the definition is not part of this header section.
+ void AddExpectation(const Expectation& expectation) const;
+
+ private:
+ // The last expectation in this sequence.
+ std::shared_ptr<Expectation> last_expectation_;
+}; // class Sequence
+
+// An object of this type causes all EXPECT_CALL() statements
+// encountered in its scope to be put in an anonymous sequence. The
+// work is done in the constructor and destructor. You should only
+// create an InSequence object on the stack.
+//
+// The sole purpose of this class is to support easy definition of
+// sequential expectations, e.g.
+//
+// {
+// InSequence dummy; // The name of the object doesn't matter.
+//
+// // The following expectations must match in the order they appear.
+// EXPECT_CALL(a, Bar())...;
+// EXPECT_CALL(a, Baz())...;
+// ...
+// EXPECT_CALL(b, Xyz())...;
+// }
+//
+// You can create InSequence objects in multiple threads, as long as
+// they are used to affect different mock objects. The idea is that
+// each thread can create and set up its own mocks as if it's the only
+// thread. However, for the clarity of your tests we recommend that you
+// set up mocks in the main thread unless you have a good reason not to
+// do so.
+class GTEST_API_ InSequence {
+ public:
+ InSequence();
+ ~InSequence();
+
+ private:
+ // NOTE(review): presumably records whether this object created the
+ // implicit sequence; the constructor/destructor definitions are not
+ // visible here, so confirm against the implementation file.
+ bool sequence_created_;
+
+ // Not copyable or assignable.
+ InSequence(const InSequence&) = delete;
+ InSequence& operator=(const InSequence&) = delete;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+namespace internal {
+
+// Points to the implicit sequence introduced by a living InSequence
+// object (if any) in the current thread or NULL.
+GTEST_API_ extern ThreadLocal<Sequence*> g_gmock_implicit_sequence;
+
+// Base class for implementing expectations.
+//
+// There are two reasons for having a type-agnostic base class for
+// Expectation:
+//
+// 1. We need to store collections of expectations of different
+// types (e.g. all pre-requisites of a particular expectation, all
+// expectations in a sequence). Therefore these expectation objects
+// must share a common base class.
+//
+// 2. We can avoid binary code bloat by moving methods not depending
+// on the template argument of Expectation to the base class.
+//
+// This class is internal and mustn't be used by user code directly.
+class GTEST_API_ ExpectationBase {
+ public:
+ // source_text is the EXPECT_CALL(...) source that created this Expectation.
+ ExpectationBase(const char* file, int line, const std::string& source_text);
+
+ virtual ~ExpectationBase();
+
+ // Where in the source file was the expectation spec defined?
+ const char* file() const { return file_; }
+ int line() const { return line_; }
+ // The returned pointer refers to storage owned by this object.
+ const char* source_text() const { return source_text_.c_str(); }
+ // Returns the cardinality specified in the expectation spec.
+ const Cardinality& cardinality() const { return cardinality_; }
+
+ // Describes the source file location of this expectation.
+ // Writes the formatted file/line followed by a single space.
+ void DescribeLocationTo(::std::ostream* os) const {
+ *os << FormatFileLocation(file(), line()) << " ";
+ }
+
+ // Describes how many times a function call matching this
+ // expectation has occurred.
+ void DescribeCallCountTo(::std::ostream* os) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+ // If this mock method has an extra matcher (i.e. .With(matcher)),
+ // describes it to the ostream.
+ virtual void MaybeDescribeExtraMatcherTo(::std::ostream* os) = 0;
+
+ protected:
+ friend class ::testing::Expectation;
+ friend class UntypedFunctionMockerBase;
+
+ // Identifies the most recent clause seen in an EXPECT_CALL() statement.
+ // TypedExpectation compares these values with < and <= to validate the
+ // order of clauses, which is why the enumerator order is significant.
+ enum Clause {
+ // Don't change the order of the enum members!
+ kNone,
+ kWith,
+ kTimes,
+ kInSequence,
+ kAfter,
+ kWillOnce,
+ kWillRepeatedly,
+ kRetiresOnSaturation
+ };
+
+ // Type-erased list of actions. TypedExpectation stores heap-allocated
+ // Action<F> objects here and deletes them in its destructor.
+ typedef std::vector<const void*> UntypedActions;
+
+ // Returns an Expectation object that references and co-owns this
+ // expectation.
+ virtual Expectation GetHandle() = 0;
+
+ // Asserts that the EXPECT_CALL() statement has the given property.
+ // The failure is attributed to this expectation's file and line.
+ void AssertSpecProperty(bool property,
+ const std::string& failure_message) const {
+ Assert(property, file_, line_, failure_message);
+ }
+
+ // Expects that the EXPECT_CALL() statement has the given property.
+ // The failure is attributed to this expectation's file and line.
+ void ExpectSpecProperty(bool property,
+ const std::string& failure_message) const {
+ Expect(property, file_, line_, failure_message);
+ }
+
+ // Explicitly specifies the cardinality of this expectation. Used
+ // by the subclasses to implement the .Times() clause.
+ void SpecifyCardinality(const Cardinality& cardinality);
+
+ // Returns true if and only if the user specified the cardinality
+ // explicitly using a .Times().
+ bool cardinality_specified() const { return cardinality_specified_; }
+
+ // Sets the cardinality of this expectation spec.
+ // Does not touch cardinality_specified_.
+ void set_cardinality(const Cardinality& a_cardinality) {
+ cardinality_ = a_cardinality;
+ }
+
+ // The following group of methods should only be called after the
+ // EXPECT_CALL() statement, and only when g_gmock_mutex is held by
+ // the current thread.
+
+ // Retires all pre-requisites of this expectation.
+ void RetireAllPreRequisites() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+ // Returns true if and only if this expectation is retired.
+ bool is_retired() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return retired_;
+ }
+
+ // Retires this expectation.
+ void Retire() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ retired_ = true;
+ }
+
+ // Returns true if and only if this expectation is satisfied.
+ bool IsSatisfied() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return cardinality().IsSatisfiedByCallCount(call_count_);
+ }
+
+ // Returns true if and only if this expectation is saturated.
+ bool IsSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return cardinality().IsSaturatedByCallCount(call_count_);
+ }
+
+ // Returns true if and only if this expectation is over-saturated.
+ bool IsOverSaturated() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return cardinality().IsOverSaturatedByCallCount(call_count_);
+ }
+
+ // Returns true if and only if all pre-requisites of this expectation are
+ // satisfied.
+ bool AllPrerequisitesAreSatisfied() const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+ // Adds unsatisfied pre-requisites of this expectation to 'result'.
+ void FindUnsatisfiedPrerequisites(ExpectationSet* result) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex);
+
+ // Returns the number of times this expectation has been invoked.
+ int call_count() const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return call_count_;
+ }
+
+ // Increments the number of times this expectation has been invoked.
+ void IncrementCallCount() GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ call_count_++;
+ }
+
+ // Checks the action count (i.e. the number of WillOnce() and
+ // WillRepeatedly() clauses) against the cardinality if this hasn't
+ // been done before. Prints a warning if there are too many or too
+ // few actions.
+ void CheckActionCountIfNotDone() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+ friend class ::testing::Sequence;
+ friend class ::testing::internal::ExpectationTester;
+
+ template <typename Function>
+ friend class TypedExpectation;
+
+ // Implements the .Times() clause.
+ void UntypedTimes(const Cardinality& a_cardinality);
+
+ // The fields in this group are part of the spec and won't change after
+ // an EXPECT_CALL() statement finishes.
+ const char* file_; // The file that contains the expectation.
+ int line_; // The line number of the expectation.
+ const std::string source_text_; // The EXPECT_CALL(...) source text.
+ // True if and only if the cardinality is specified explicitly.
+ bool cardinality_specified_;
+ Cardinality cardinality_; // The cardinality of the expectation.
+ // The immediate pre-requisites (i.e. expectations that must be
+ // satisfied before this expectation can be matched) of this
+ // expectation. We use std::shared_ptr in the set because we want an
+ // Expectation object to be co-owned by its FunctionMocker and its
+ // successors. This allows multiple mock objects to be deleted at
+ // different times.
+ ExpectationSet immediate_prerequisites_;
+
+ // The fields in this group are the current state of the expectation,
+ // and can change as the mock function is called.
+ int call_count_; // How many times this expectation has been invoked.
+ bool retired_; // True if and only if this expectation has retired.
+ UntypedActions untyped_actions_;
+ bool extra_matcher_specified_;
+ bool repeated_action_specified_; // True if a WillRepeatedly() was specified.
+ bool retires_on_saturation_;
+ Clause last_clause_;
+ mutable bool action_count_checked_; // Under mutex_.
+ mutable Mutex mutex_; // Protects action_count_checked_.
+}; // class ExpectationBase
+
+template <typename F>
+class TypedExpectation;
+
+// Implements an expectation for the given function type.
+// The clause methods (.With(), .Times(), .InSequence(), .After(),
+// .WillOnce(), .WillRepeatedly(), .RetiresOnSaturation()) each return
+// *this so they can be chained, and each validates its position in the
+// chain by comparing last_clause_ against the Clause enumerators.
+template <typename R, typename... Args>
+class TypedExpectation<R(Args...)> : public ExpectationBase {
+ private:
+ using F = R(Args...);
+
+ public:
+ typedef typename Function<F>::ArgumentTuple ArgumentTuple;
+ typedef typename Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+ typedef typename Function<F>::Result Result;
+
+ // owner is the function mocker this expectation belongs to; m holds the
+ // argument matchers given inside EXPECT_CALL(). The repeated action
+ // starts out as DoDefault().
+ TypedExpectation(FunctionMocker<F>* owner, const char* a_file, int a_line,
+ const std::string& a_source_text,
+ const ArgumentMatcherTuple& m)
+ : ExpectationBase(a_file, a_line, a_source_text),
+ owner_(owner),
+ matchers_(m),
+ // By default, extra_matcher_ should match anything. However,
+ // we cannot initialize it with _ as that causes ambiguity between
+ // Matcher's copy and move constructor for some argument types.
+ extra_matcher_(A<const ArgumentTuple&>()),
+ repeated_action_(DoDefault()) {}
+
+ ~TypedExpectation() override {
+ // Check the validity of the action count if it hasn't been done
+ // yet (for example, if the expectation was never used).
+ CheckActionCountIfNotDone();
+ // The stored pointers are the Action<F> objects allocated by WillOnce();
+ // cast them back to their real type before deleting.
+ for (UntypedActions::const_iterator it = untyped_actions_.begin();
+ it != untyped_actions_.end(); ++it) {
+ delete static_cast<const Action<F>*>(*it);
+ }
+ }
+
+ // Implements the .With() clause.
+ TypedExpectation& With(const Matcher<const ArgumentTuple&>& m) {
+ if (last_clause_ == kWith) {
+ ExpectSpecProperty(false,
+ ".With() cannot appear "
+ "more than once in an EXPECT_CALL().");
+ } else {
+ ExpectSpecProperty(last_clause_ < kWith,
+ ".With() must be the first "
+ "clause in an EXPECT_CALL().");
+ }
+ last_clause_ = kWith;
+
+ extra_matcher_ = m;
+ extra_matcher_specified_ = true;
+ return *this;
+ }
+
+ // Implements the .Times() clause.
+ TypedExpectation& Times(const Cardinality& a_cardinality) {
+ ExpectationBase::UntypedTimes(a_cardinality);
+ return *this;
+ }
+
+ // Implements the .Times() clause.
+ // Convenience overload: equivalent to Times(Exactly(n)).
+ TypedExpectation& Times(int n) { return Times(Exactly(n)); }
+
+ // Implements the .InSequence() clause.
+ TypedExpectation& InSequence(const Sequence& s) {
+ ExpectSpecProperty(last_clause_ <= kInSequence,
+ ".InSequence() cannot appear after .After(),"
+ " .WillOnce(), .WillRepeatedly(), or "
+ ".RetiresOnSaturation().");
+ last_clause_ = kInSequence;
+
+ s.AddExpectation(GetHandle());
+ return *this;
+ }
+ // Overloads for two to five sequences; each one chains the
+ // single-sequence version, adding this expectation to every sequence.
+ TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2) {
+ return InSequence(s1).InSequence(s2);
+ }
+ TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+ const Sequence& s3) {
+ return InSequence(s1, s2).InSequence(s3);
+ }
+ TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+ const Sequence& s3, const Sequence& s4) {
+ return InSequence(s1, s2, s3).InSequence(s4);
+ }
+ TypedExpectation& InSequence(const Sequence& s1, const Sequence& s2,
+ const Sequence& s3, const Sequence& s4,
+ const Sequence& s5) {
+ return InSequence(s1, s2, s3, s4).InSequence(s5);
+ }
+
+ // Implements the .After() clause.
+ TypedExpectation& After(const ExpectationSet& s) {
+ ExpectSpecProperty(last_clause_ <= kAfter,
+ ".After() cannot appear after .WillOnce(),"
+ " .WillRepeatedly(), or "
+ ".RetiresOnSaturation().");
+ last_clause_ = kAfter;
+
+ // Every member of s becomes an immediate pre-requisite.
+ for (ExpectationSet::const_iterator it = s.begin(); it != s.end(); ++it) {
+ immediate_prerequisites_ += *it;
+ }
+ return *this;
+ }
+ // Overloads for two to five expectation sets; each one chains the
+ // single-set version.
+ TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2) {
+ return After(s1).After(s2);
+ }
+ TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+ const ExpectationSet& s3) {
+ return After(s1, s2).After(s3);
+ }
+ TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+ const ExpectationSet& s3, const ExpectationSet& s4) {
+ return After(s1, s2, s3).After(s4);
+ }
+ TypedExpectation& After(const ExpectationSet& s1, const ExpectationSet& s2,
+ const ExpectationSet& s3, const ExpectationSet& s4,
+ const ExpectationSet& s5) {
+ return After(s1, s2, s3, s4).After(s5);
+ }
+
+ // Preferred, type-safe overload: consume anything that can be directly
+ // converted to a OnceAction, except for Action<F> objects themselves.
+ TypedExpectation& WillOnce(OnceAction<F> once_action) {
+ // Call the overload below, smuggling the OnceAction as a copyable callable.
+ // We know this is safe because a WillOnce action will not be called more
+ // than once.
+ return WillOnce(Action<F>(ActionAdaptor{
+ std::make_shared<OnceAction<F>>(std::move(once_action)),
+ }));
+ }
+
+ // Fallback overload: accept Action<F> objects and those actions that define
+ // `operator Action<F>` but not `operator OnceAction<F>`.
+ //
+ // This is templated in order to cause the overload above to be preferred
+ // when the input is convertible to either type.
+ template <int&... ExplicitArgumentBarrier, typename = void>
+ TypedExpectation& WillOnce(Action<F> action) {
+ ExpectSpecProperty(last_clause_ <= kWillOnce,
+ ".WillOnce() cannot appear after "
+ ".WillRepeatedly() or .RetiresOnSaturation().");
+ last_clause_ = kWillOnce;
+
+ // Ownership of the heap copy passes to untyped_actions_; it is released
+ // in the destructor.
+ untyped_actions_.push_back(new Action<F>(std::move(action)));
+
+ // Without an explicit .Times(), the cardinality is inferred as exactly
+ // the number of WillOnce() actions recorded so far.
+ if (!cardinality_specified()) {
+ set_cardinality(Exactly(static_cast<int>(untyped_actions_.size())));
+ }
+ return *this;
+ }
+
+ // Implements the .WillRepeatedly() clause.
+ TypedExpectation& WillRepeatedly(const Action<F>& action) {
+ if (last_clause_ == kWillRepeatedly) {
+ ExpectSpecProperty(false,
+ ".WillRepeatedly() cannot appear "
+ "more than once in an EXPECT_CALL().");
+ } else {
+ ExpectSpecProperty(last_clause_ < kWillRepeatedly,
+ ".WillRepeatedly() cannot appear "
+ "after .RetiresOnSaturation().");
+ }
+ last_clause_ = kWillRepeatedly;
+ repeated_action_specified_ = true;
+
+ repeated_action_ = action;
+ // Without an explicit .Times(), the cardinality is inferred as at least
+ // the number of WillOnce() actions recorded so far.
+ if (!cardinality_specified()) {
+ set_cardinality(AtLeast(static_cast<int>(untyped_actions_.size())));
+ }
+
+ // Now that no more action clauses can be specified, we check
+ // whether their count makes sense.
+ CheckActionCountIfNotDone();
+ return *this;
+ }
+
+ // Implements the .RetiresOnSaturation() clause.
+ TypedExpectation& RetiresOnSaturation() {
+ ExpectSpecProperty(last_clause_ < kRetiresOnSaturation,
+ ".RetiresOnSaturation() cannot appear "
+ "more than once.");
+ last_clause_ = kRetiresOnSaturation;
+ retires_on_saturation_ = true;
+
+ // Now that no more action clauses can be specified, we check
+ // whether their count makes sense.
+ CheckActionCountIfNotDone();
+ return *this;
+ }
+
+ // Returns the matchers for the arguments as specified inside the
+ // EXPECT_CALL() macro.
+ const ArgumentMatcherTuple& matchers() const { return matchers_; }
+
+ // Returns the matcher specified by the .With() clause.
+ const Matcher<const ArgumentTuple&>& extra_matcher() const {
+ return extra_matcher_;
+ }
+
+ // Returns the action specified by the .WillRepeatedly() clause.
+ const Action<F>& repeated_action() const { return repeated_action_; }
+
+ // If this mock method has an extra matcher (i.e. .With(matcher)),
+ // describes it to the ostream.
+ // Writes nothing when no .With() clause was given.
+ void MaybeDescribeExtraMatcherTo(::std::ostream* os) override {
+ if (extra_matcher_specified_) {
+ *os << " Expected args: ";
+ extra_matcher_.DescribeTo(os);
+ *os << "\n";
+ }
+ }
+
+ private:
+ template <typename Function>
+ friend class FunctionMocker;
+
+ // An adaptor that turns a OnceAction<F> into something compatible with
+ // Action<F>. Must be called at most once.
+ struct ActionAdaptor {
+ std::shared_ptr<OnceAction<R(Args...)>> once_action;
+
+ // Moves out of the shared OnceAction when invoking it.
+ R operator()(Args&&... args) const {
+ return std::move(*once_action).Call(std::forward<Args>(args)...);
+ }
+ };
+
+ // Returns an Expectation object that references and co-owns this
+ // expectation.
+ Expectation GetHandle() override { return owner_->GetHandleOf(this); }
+
+ // The following methods will be called only after the EXPECT_CALL()
+ // statement finishes and when the current thread holds
+ // g_gmock_mutex.
+
+ // Returns true if and only if this expectation matches the given arguments.
+ // Both the per-argument matchers and the .With() matcher must accept.
+ bool Matches(const ArgumentTuple& args) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ return TupleMatches(matchers_, args) && extra_matcher_.Matches(args);
+ }
+
+ // Returns true if and only if this expectation should handle the given
+ // arguments.
+ bool ShouldHandleArguments(const ArgumentTuple& args) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+
+ // In case the action count wasn't checked when the expectation
+ // was defined (e.g. if this expectation has no WillRepeatedly()
+ // or RetiresOnSaturation() clause), we check it when the
+ // expectation is used for the first time.
+ CheckActionCountIfNotDone();
+ return !is_retired() && AllPrerequisitesAreSatisfied() && Matches(args);
+ }
+
+ // Describes the result of matching the arguments against this
+ // expectation to the given ostream.
+ // Reasons are checked in this order: retired, argument mismatch,
+ // unsatisfied pre-requisites; only the first applicable one is reported.
+ void ExplainMatchResultTo(const ArgumentTuple& args, ::std::ostream* os) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+
+ if (is_retired()) {
+ *os << " Expected: the expectation is active\n"
+ << " Actual: it is retired\n";
+ } else if (!Matches(args)) {
+ if (!TupleMatches(matchers_, args)) {
+ ExplainMatchFailureTupleTo(matchers_, args, os);
+ }
+ StringMatchResultListener listener;
+ if (!extra_matcher_.MatchAndExplain(args, &listener)) {
+ *os << " Expected args: ";
+ extra_matcher_.DescribeTo(os);
+ *os << "\n Actual: don't match";
+
+ internal::PrintIfNotEmpty(listener.str(), os);
+ *os << "\n";
+ }
+ } else if (!AllPrerequisitesAreSatisfied()) {
+ *os << " Expected: all pre-requisites are satisfied\n"
+ << " Actual: the following immediate pre-requisites "
+ << "are not satisfied:\n";
+ ExpectationSet unsatisfied_prereqs;
+ FindUnsatisfiedPrerequisites(&unsatisfied_prereqs);
+ // Pre-requisites are numbered from 0 in the output.
+ int i = 0;
+ for (ExpectationSet::const_iterator it = unsatisfied_prereqs.begin();
+ it != unsatisfied_prereqs.end(); ++it) {
+ it->expectation_base()->DescribeLocationTo(os);
+ *os << "pre-requisite #" << i++ << "\n";
+ }
+ *os << " (end of pre-requisites)\n";
+ } else {
+ // This line is here just for completeness' sake. It will never
+ // be executed as currently the ExplainMatchResultTo() function
+ // is called only when the mock function call does NOT match the
+ // expectation.
+ *os << "The call matches the expectation.\n";
+ }
+ }
+
+ // Returns the action that should be taken for the current invocation.
+ // The n-th call (1-based) uses the n-th WillOnce() action; calls beyond
+ // the WillOnce() list use the repeated action.
+ const Action<F>& GetCurrentAction(const FunctionMocker<F>* mocker,
+ const ArgumentTuple& args) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ const int count = call_count();
+ Assert(count >= 1, __FILE__, __LINE__,
+ "call_count() is <= 0 when GetCurrentAction() is "
+ "called - this should never happen.");
+
+ const int action_count = static_cast<int>(untyped_actions_.size());
+ if (action_count > 0 && !repeated_action_specified_ &&
+ count > action_count) {
+ // If there is at least one WillOnce() and no WillRepeatedly(),
+ // we warn the user when the WillOnce() clauses ran out.
+ ::std::stringstream ss;
+ DescribeLocationTo(&ss);
+ ss << "Actions ran out in " << source_text() << "...\n"
+ << "Called " << count << " times, but only " << action_count
+ << " WillOnce()" << (action_count == 1 ? " is" : "s are")
+ << " specified - ";
+ mocker->DescribeDefaultActionTo(args, &ss);
+ Log(kWarning, ss.str(), 1);
+ }
+
+ // count is 1-based, so the matching WillOnce() action is at count - 1.
+ return count <= action_count
+ ? *static_cast<const Action<F>*>(
+ untyped_actions_[static_cast<size_t>(count - 1)])
+ : repeated_action();
+ }
+
+ // Given the arguments of a mock function call, if the call will
+ // over-saturate this expectation, returns the default action;
+ // otherwise, returns the next action in this expectation. Also
+ // describes *what* happened to 'what', and explains *why* Google
+ // Mock does it to 'why'. This method is not const as it calls
+ // IncrementCallCount(). A return value of NULL means the default
+ // action.
+ const Action<F>* GetActionForArguments(const FunctionMocker<F>* mocker,
+ const ArgumentTuple& args,
+ ::std::ostream* what,
+ ::std::ostream* why)
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ if (IsSaturated()) {
+ // We have an excessive call.
+ IncrementCallCount();
+ *what << "Mock function called more times than expected - ";
+ mocker->DescribeDefaultActionTo(args, what);
+ DescribeCallCountTo(why);
+
+ return nullptr;
+ }
+
+ IncrementCallCount();
+ RetireAllPreRequisites();
+
+ if (retires_on_saturation_ && IsSaturated()) {
+ Retire();
+ }
+
+ // Must be done after IncrementCallCount()!
+ *what << "Mock function call matches " << source_text() << "...\n";
+ return &(GetCurrentAction(mocker, args));
+ }
+
+ // All the fields below won't change once the EXPECT_CALL()
+ // statement finishes.
+ FunctionMocker<F>* const owner_;
+ ArgumentMatcherTuple matchers_;
+ Matcher<const ArgumentTuple&> extra_matcher_;
+ Action<F> repeated_action_;
+
+ // Not copyable or assignable.
+ TypedExpectation(const TypedExpectation&) = delete;
+ TypedExpectation& operator=(const TypedExpectation&) = delete;
+}; // class TypedExpectation
+
+// A MockSpec object is used by ON_CALL() or EXPECT_CALL() for
+// specifying the default behavior of, or expectation on, a mock
+// function.
+
+// Note: class MockSpec really belongs to the ::testing namespace.
+// However if we define it in ::testing, MSVC will complain when
+// classes in ::testing::internal declare it as a friend class
+// template. To work around this compiler bug, we define MockSpec in
+// ::testing::internal and import it into ::testing.
+
+// Logs a message including file and line number information.
+GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity,
+ const char* file, int line,
+ const std::string& message);
+
+// Pairs a function mocker with the argument matchers written in an
+// ON_CALL() or EXPECT_CALL() statement, and forwards the creation of the
+// corresponding spec to that function mocker.
+template <typename F>
+class MockSpec {
+ public:
+ typedef typename internal::Function<F>::ArgumentTuple ArgumentTuple;
+ typedef
+ typename internal::Function<F>::ArgumentMatcherTuple ArgumentMatcherTuple;
+
+ // Constructs a MockSpec object, given the function mocker object
+ // that the spec is associated with.
+ MockSpec(internal::FunctionMocker<F>* function_mocker,
+ const ArgumentMatcherTuple& matchers)
+ : function_mocker_(function_mocker), matchers_(matchers) {}
+
+ // Adds a new default action spec to the function mocker and returns
+ // the newly created spec.
+ // obj and call are the source text of the mock object and of the call;
+ // they are used only to build the info-level log message.
+ internal::OnCallSpec<F>& InternalDefaultActionSetAt(const char* file,
+ int line, const char* obj,
+ const char* call) {
+ LogWithLocation(internal::kInfo, file, line,
+ std::string("ON_CALL(") + obj + ", " + call + ") invoked");
+ return function_mocker_->AddNewOnCallSpec(file, line, matchers_);
+ }
+
+ // Adds a new expectation spec to the function mocker and returns
+ // the newly created spec.
+ // The reconstructed "EXPECT_CALL(obj, call)" text is both logged and
+ // passed on as the expectation's source text.
+ internal::TypedExpectation<F>& InternalExpectedAt(const char* file, int line,
+ const char* obj,
+ const char* call) {
+ const std::string source_text(std::string("EXPECT_CALL(") + obj + ", " +
+ call + ")");
+ LogWithLocation(internal::kInfo, file, line, source_text + " invoked");
+ return function_mocker_->AddNewExpectation(file, line, source_text,
+ matchers_);
+ }
+
+ // This operator overload is used to swallow the superfluous parameter list
+ // introduced by the ON/EXPECT_CALL macros. See the macro comments for more
+ // explanation.
+ // Both arguments are ignored; the spec itself is returned unchanged.
+ MockSpec<F>& operator()(const internal::WithoutMatchers&, void* const) {
+ return *this;
+ }
+
+ private:
+ template <typename Function>
+ friend class internal::FunctionMocker;
+
+ // The function mocker that owns this spec.
+ internal::FunctionMocker<F>* const function_mocker_;
+ // The argument matchers specified in the spec.
+ ArgumentMatcherTuple matchers_;
+}; // class MockSpec
+
+// Wrapper type for generically holding an ordinary value or lvalue reference.
+// If T is not a reference type, it must be copyable or movable.
+// ReferenceOrValueWrapper<T> is movable, and will also be copyable unless
+// T is a move-only value type (which means that it will always be copyable
+// if the current platform does not support move semantics).
+//
+// The primary template defines handling for values, but function header
+// comments describe the contract for the whole template (including
+// specializations).
+template <typename T>
+class ReferenceOrValueWrapper {
+ public:
+ // Constructs a wrapper from the given value/reference.
+ // In this primary template the argument is moved into the wrapper.
+ explicit ReferenceOrValueWrapper(T value) : value_(std::move(value)) {}
+
+ // Unwraps and returns the underlying value/reference, exactly as
+ // originally passed. The behavior of calling this more than once on
+ // the same object is unspecified.
+ // In this primary template the stored value is moved out.
+ T Unwrap() { return std::move(value_); }
+
+ // Provides nondestructive access to the underlying value/reference.
+ // Always returns a const reference (more precisely,
+ // const std::add_lvalue_reference<T>::type). The behavior of calling this
+ // after calling Unwrap on the same object is unspecified.
+ const T& Peek() const { return value_; }
+
+ private:
+ T value_;
+};
+
+// Specialization for lvalue reference types. See primary template
+// for documentation.
+// The referent is not copied: only its address is stored, so the wrapper
+// does not own the object it refers to.
+template <typename T>
+class ReferenceOrValueWrapper<T&> {
+ public:
+ // Workaround for debatable pass-by-reference lint warning (c-library-team
+ // policy precludes NOLINT in this context)
+ typedef T& reference;
+ explicit ReferenceOrValueWrapper(reference ref) : value_ptr_(&ref) {}
+ // Returns the original reference; unlike the primary template, nothing
+ // is moved out of the wrapper.
+ T& Unwrap() { return *value_ptr_; }
+ const T& Peek() const { return *value_ptr_; }
+
+ private:
+ // Address of the wrapped object.
+ T* value_ptr_;
+};
+
+// Writes `result` to `os` as the result of an action: a newline and the
+// "Returns: " label, followed by the printed value.
+template <typename T>
+void PrintAsActionResult(const T& result, std::ostream& os) {
+  std::ostream* const sink = &os;
+  *sink << "\n Returns: ";
+  // UniversalPrinter<T> is invoked directly instead of UniversalPrint()
+  // because T may be a reference type.
+  UniversalPrinter<T>::Print(result, sink);
+}
+
+// Reports an uninteresting call (whose description is in msg) in the
+// manner specified by 'reaction'.
+GTEST_API_ void ReportUninterestingCall(CallReaction reaction,
+ const std::string& msg);
+
+// A generic RAII type that runs a user-provided function in its destructor.
+// NOTE(review): the destructor invokes f_ unconditionally, so a Cleanup
+// built from an empty std::function would throw std::bad_function_call
+// from the destructor; callers are presumably expected to pass a non-empty
+// callable. The class also does not disable copying, so a copy would run
+// the function a second time; confirm no caller copies it.
+class Cleanup final {
+ public:
+ // Takes ownership of f; it is called exactly once per object, at
+ // destruction.
+ explicit Cleanup(std::function<void()> f) : f_(std::move(f)) {}
+ ~Cleanup() { f_(); }
+
+ private:
+ std::function<void()> f_;
+};
+
+template <typename F>
+class FunctionMocker;
+
+template <typename R, typename... Args>
+class FunctionMocker<R(Args...)> final : public UntypedFunctionMockerBase {
+ using F = R(Args...);
+
+ public:
+ using Result = R;
+ using ArgumentTuple = std::tuple<Args...>;
+ using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>;
+
+ FunctionMocker() {}
+
+ // There is no generally useful and implementable semantics of
+ // copying a mock object, so copying a mock is usually a user error.
+ // Thus we disallow copying function mockers. If the user really
+ // wants to copy a mock object, they should implement their own copy
+ // operation, for example:
+ //
+ // class MockFoo : public Foo {
+ // public:
+ // // Defines a copy constructor explicitly.
+ // MockFoo(const MockFoo& src) {}
+ // ...
+ // };
+ FunctionMocker(const FunctionMocker&) = delete;
+ FunctionMocker& operator=(const FunctionMocker&) = delete;
+
+ // The destructor verifies that all expectations on this mock
+ // function have been satisfied. If not, it will report Google Test
+ // non-fatal failures for the violations.
+ // It then unregisters this mocker and clears its ON_CALL() specs, all
+ // while holding g_gmock_mutex (ClearDefaultActionsLocked() releases and
+ // re-acquires the mutex internally).
+ ~FunctionMocker() override GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ MutexLock l(&g_gmock_mutex);
+ VerifyAndClearExpectationsLocked();
+ Mock::UnregisterLocked(this);
+ ClearDefaultActionsLocked();
+ }
+
+  // Looks up the ON_CALL spec that applies to a call with the given
+  // arguments. Specs are scanned from the back of the list towards the
+  // front, so the most recently added matching spec wins; nullptr is
+  // returned when no spec matches.
+  // L = *
+  const OnCallSpec<F>* FindOnCallSpec(const ArgumentTuple& args) const {
+    UntypedOnCallSpecs::const_reverse_iterator cursor =
+        untyped_on_call_specs_.rbegin();
+    while (cursor != untyped_on_call_specs_.rend()) {
+      // Entries are stored type-erased; recover the concrete spec type.
+      const OnCallSpec<F>* candidate =
+          static_cast<const OnCallSpec<F>*>(*cursor);
+      if (candidate->Matches(args)) {
+        return candidate;
+      }
+      ++cursor;
+    }
+
+    return nullptr;
+  }
+
+ // Performs the default action of this mock function on the given
+ // arguments and returns the result. Asserts (or throws if
+ // exceptions are enabled) with a helpful call description if there
+ // is no valid return value. This method doesn't depend on the
+ // mutable state of this object, and thus can be called concurrently
+ // without locking.
+ // L = *
+ Result PerformDefaultAction(ArgumentTuple&& args,
+ const std::string& call_description) const {
+ // A matching ON_CALL() takes precedence over the type's default value.
+ const OnCallSpec<F>* const spec = this->FindOnCallSpec(args);
+ if (spec != nullptr) {
+ return spec->GetAction().Perform(std::move(args));
+ }
+ const std::string message =
+ call_description +
+ "\n The mock function has no default action "
+ "set, and its return type has no default value set.";
+#if GTEST_HAS_EXCEPTIONS
+ // With exceptions enabled, a missing default value is reported by
+ // throwing std::runtime_error carrying the message above.
+ if (!DefaultValue<Result>::Exists()) {
+ throw std::runtime_error(message);
+ }
+#else
+ // Without exceptions, the same condition is reported through Assert(),
+ // with an empty file name and line -1.
+ Assert(DefaultValue<Result>::Exists(), "", -1, message);
+#endif
+ return DefaultValue<Result>::Get();
+ }
+
+ // Implements UntypedFunctionMockerBase::ClearDefaultActionsLocked():
+ // clears the ON_CALL()s set on this mock function.
+ // Must be entered with g_gmock_mutex held; the mutex is released while
+ // the specs are deleted and is held again on return.
+ void ClearDefaultActionsLocked() override
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+
+ // Deleting our default actions may trigger other mock objects to be
+ // deleted, for example if an action contains a reference counted smart
+ // pointer to that mock object, and that is the last reference. So if we
+ // delete our actions within the context of the global mutex we may deadlock
+ // when this method is called again. Instead, make a copy of the set of
+ // actions to delete, clear our set within the mutex, and then delete the
+ // actions outside of the mutex.
+ UntypedOnCallSpecs specs_to_delete;
+ // After the swap this object's list is empty and the old entries live
+ // only in the local container.
+ untyped_on_call_specs_.swap(specs_to_delete);
+
+ g_gmock_mutex.Unlock();
+ for (UntypedOnCallSpecs::const_iterator it = specs_to_delete.begin();
+ it != specs_to_delete.end(); ++it) {
+ delete static_cast<const OnCallSpec<F>*>(*it);
+ }
+
+ // Lock the mutex again, since the caller expects it to be locked when we
+ // return.
+ g_gmock_mutex.Lock();
+ }
+
+ // Returns the result of invoking this mock function with the given
+ // arguments. This function can be safely called from multiple
+ // threads concurrently.
+ Result Invoke(Args... args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ return InvokeWith(ArgumentTuple(std::forward<Args>(args)...));
+ }
+
+ MockSpec<F> With(Matcher<Args>... m) {
+ return MockSpec<F>(this, ::std::make_tuple(std::move(m)...));
+ }
+
+ protected:
+ template <typename Function>
+ friend class MockSpec;
+
+ // Adds and returns a default action spec for this mock function.
+ OnCallSpec<F>& AddNewOnCallSpec(const char* file, int line,
+ const ArgumentMatcherTuple& m)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
+ OnCallSpec<F>* const on_call_spec = new OnCallSpec<F>(file, line, m);
+ untyped_on_call_specs_.push_back(on_call_spec);
+ return *on_call_spec;
+ }
+
+ // Adds and returns an expectation spec for this mock function.
+ TypedExpectation<F>& AddNewExpectation(const char* file, int line,
+ const std::string& source_text,
+ const ArgumentMatcherTuple& m)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ Mock::RegisterUseByOnCallOrExpectCall(MockObject(), file, line);
+ TypedExpectation<F>* const expectation =
+ new TypedExpectation<F>(this, file, line, source_text, m);
+ const std::shared_ptr<ExpectationBase> untyped_expectation(expectation);
+ // See the definition of untyped_expectations_ for why access to
+ // it is unprotected here.
+ untyped_expectations_.push_back(untyped_expectation);
+
+ // Adds this expectation into the implicit sequence if there is one.
+ Sequence* const implicit_sequence = g_gmock_implicit_sequence.get();
+ if (implicit_sequence != nullptr) {
+ implicit_sequence->AddExpectation(Expectation(untyped_expectation));
+ }
+
+ return *expectation;
+ }
+
+ private:
+ template <typename Func>
+ friend class TypedExpectation;
+
+ // Some utilities needed for implementing InvokeWith().
+
+ // Describes what default action will be performed for the given
+ // arguments.
+ // L = *
+ void DescribeDefaultActionTo(const ArgumentTuple& args,
+ ::std::ostream* os) const {
+ const OnCallSpec<F>* const spec = FindOnCallSpec(args);
+
+ if (spec == nullptr) {
+ *os << (std::is_void<Result>::value ? "returning directly.\n"
+ : "returning default value.\n");
+ } else {
+ *os << "taking default action specified at:\n"
+ << FormatFileLocation(spec->file(), spec->line()) << "\n";
+ }
+ }
+
+ // Writes a message that the call is uninteresting (i.e. neither
+ // explicitly expected nor explicitly unexpected) to the given
+ // ostream.
+ void UntypedDescribeUninterestingCall(const void* untyped_args,
+ ::std::ostream* os) const override
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ const ArgumentTuple& args =
+ *static_cast<const ArgumentTuple*>(untyped_args);
+ *os << "Uninteresting mock function call - ";
+ DescribeDefaultActionTo(args, os);
+ *os << " Function call: " << Name();
+ UniversalPrint(args, os);
+ }
+
+ // Returns the expectation that matches the given function arguments
+ // (or NULL if there's no match); when a match is found,
+ // untyped_action is set to point to the action that should be
+ // performed (or NULL if the action is "do default"), and
+ // is_excessive is modified to indicate whether the call exceeds the
+ // expected number.
+ //
+ // Critical section: We must find the matching expectation and the
+ // corresponding action that needs to be taken in an ATOMIC
+ // transaction. Otherwise another thread may call this mock
+ // method in the middle and mess up the state.
+ //
+ // However, performing the action has to be left out of the critical
+ // section. The reason is that we have no control over what the
+ // action does (it can invoke an arbitrary user function or even a
+ // mock function) and excessive locking could cause a deadlock.
+ const ExpectationBase* UntypedFindMatchingExpectation(
+ const void* untyped_args, const void** untyped_action, bool* is_excessive,
+ ::std::ostream* what, ::std::ostream* why) override
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ const ArgumentTuple& args =
+ *static_cast<const ArgumentTuple*>(untyped_args);
+ MutexLock l(&g_gmock_mutex);
+ TypedExpectation<F>* exp = this->FindMatchingExpectationLocked(args);
+ if (exp == nullptr) { // A match wasn't found.
+ this->FormatUnexpectedCallMessageLocked(args, what, why);
+ return nullptr;
+ }
+
+ // This line must be done before calling GetActionForArguments(),
+ // which will increment the call count for *exp and thus affect
+ // its saturation status.
+ *is_excessive = exp->IsSaturated();
+ const Action<F>* action = exp->GetActionForArguments(this, args, what, why);
+ if (action != nullptr && action->IsDoDefault())
+ action = nullptr; // Normalize "do default" to NULL.
+ *untyped_action = action;
+ return exp;
+ }
+
+ // Prints the given function arguments to the ostream.
+ void UntypedPrintArgs(const void* untyped_args,
+ ::std::ostream* os) const override {
+ const ArgumentTuple& args =
+ *static_cast<const ArgumentTuple*>(untyped_args);
+ UniversalPrint(args, os);
+ }
+
+ // Returns the expectation that matches the arguments, or NULL if no
+ // expectation matches them.
+ TypedExpectation<F>* FindMatchingExpectationLocked(const ArgumentTuple& args)
+ const GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ // See the definition of untyped_expectations_ for why access to
+ // it is unprotected here.
+ for (typename UntypedExpectations::const_reverse_iterator it =
+ untyped_expectations_.rbegin();
+ it != untyped_expectations_.rend(); ++it) {
+ TypedExpectation<F>* const exp =
+ static_cast<TypedExpectation<F>*>(it->get());
+ if (exp->ShouldHandleArguments(args)) {
+ return exp;
+ }
+ }
+ return nullptr;
+ }
+
+ // Writes to *os that the call is unexpected and to *why what was tried.
+ void FormatUnexpectedCallMessageLocked(const ArgumentTuple& args,
+ ::std::ostream* os,
+ ::std::ostream* why) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ *os << "\nUnexpected mock function call - ";
+ DescribeDefaultActionTo(args, os);
+ PrintTriedExpectationsLocked(args, why);
+ }
+
+ // Prints a list of expectations that have been tried against the
+ // current mock function call.
+ void PrintTriedExpectationsLocked(const ArgumentTuple& args,
+ ::std::ostream* why) const
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ const size_t count = untyped_expectations_.size();
+ *why << "Google Mock tried the following " << count << " "
+ << (count == 1 ? "expectation, but it didn't match"
+ : "expectations, but none matched")
+ << ":\n";
+ for (size_t i = 0; i < count; i++) {
+ TypedExpectation<F>* const expectation =
+ static_cast<TypedExpectation<F>*>(untyped_expectations_[i].get());
+ *why << "\n";
+ expectation->DescribeLocationTo(why);
+ if (count > 1) {
+ *why << "tried expectation #" << i << ": ";
+ }
+ *why << expectation->source_text() << "...\n";
+ expectation->ExplainMatchResultTo(args, why);
+ expectation->DescribeCallCountTo(why);
+ }
+ }
+
+ // Performs the given action (or the default if it's null) with the given
+ // arguments and returns the action's result.
+ // L = *
+ R PerformAction(const void* untyped_action, ArgumentTuple&& args,
+ const std::string& call_description) const {
+ if (untyped_action == nullptr) {
+ return PerformDefaultAction(std::move(args), call_description);
+ }
+
+ // Make a copy of the action before performing it, in case the
+ // action deletes the mock object (and thus deletes itself).
+ const Action<F> action = *static_cast<const Action<F>*>(untyped_action);
+ return action.Perform(std::move(args));
+ }
+
+ // Is it possible to store an object of the supplied type in a local variable
+ // for the sake of printing it, then return it on to the caller?
+ template <typename T>
+ using can_print_result = internal::conjunction<
+ // void can't be stored as an object (and we also don't need to print it).
+ internal::negation<std::is_void<T>>,
+ // Non-moveable types can't be returned on to the user, so there's no way
+ // for us to intercept and print them.
+ std::is_move_constructible<T>>;
+
+ // Perform the supplied action, printing the result to os.
+ template <typename T = R,
+ typename std::enable_if<can_print_result<T>::value, int>::type = 0>
+ R PerformActionAndPrintResult(const void* const untyped_action,
+ ArgumentTuple&& args,
+ const std::string& call_description,
+ std::ostream& os) {
+ R result = PerformAction(untyped_action, std::move(args), call_description);
+
+ PrintAsActionResult(result, os);
+ return std::forward<R>(result);
+ }
+
+ // An overload for when it's not possible to print the result. In this case we
+ // simply perform the action.
+ template <typename T = R,
+ typename std::enable_if<
+ internal::negation<can_print_result<T>>::value, int>::type = 0>
+ R PerformActionAndPrintResult(const void* const untyped_action,
+ ArgumentTuple&& args,
+ const std::string& call_description,
+ std::ostream&) {
+ return PerformAction(untyped_action, std::move(args), call_description);
+ }
+
+ // Returns the result of invoking this mock function with the given
+ // arguments. This function can be safely called from multiple
+ // threads concurrently.
+ R InvokeWith(ArgumentTuple&& args) GTEST_LOCK_EXCLUDED_(g_gmock_mutex);
+}; // class FunctionMocker
+
+// Calculates the result of invoking this mock function with the given
+// arguments, prints it, and returns it.
+template <typename R, typename... Args>
+R FunctionMocker<R(Args...)>::InvokeWith(ArgumentTuple&& args)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ // See the definition of untyped_expectations_ for why access to it
+ // is unprotected here.
+ if (untyped_expectations_.size() == 0) {
+ // No expectation is set on this mock method - we have an
+ // uninteresting call.
+
+ // We must get Google Mock's reaction on uninteresting calls
+ // made on this mock object BEFORE performing the action,
+ // because the action may DELETE the mock object and make the
+ // following expression meaningless.
+ const CallReaction reaction =
+ Mock::GetReactionOnUninterestingCalls(MockObject());
+
+ // True if and only if we need to print this call's arguments and return
+ // value. This definition must be kept in sync with
+ // the behavior of ReportUninterestingCall().
+ const bool need_to_report_uninteresting_call =
+ // If the user allows this uninteresting call, we print it
+ // only when they want informational messages.
+ reaction == kAllow ? LogIsVisible(kInfo) :
+ // If the user wants this to be a warning, we print
+ // it only when they want to see warnings.
+ reaction == kWarn
+ ? LogIsVisible(kWarning)
+ :
+ // Otherwise, the user wants this to be an error, and we
+ // should always print detailed information in the error.
+ true;
+
+ if (!need_to_report_uninteresting_call) {
+ // Perform the action without printing the call information.
+ return this->PerformDefaultAction(
+ std::move(args), "Function call: " + std::string(Name()));
+ }
+
+ // Warns about the uninteresting call.
+ ::std::stringstream ss;
+ this->UntypedDescribeUninterestingCall(&args, &ss);
+
+ // Perform the action, print the result, and then report the uninteresting
+ // call.
+ //
+ // We use RAII to do the latter in case R is void or a non-moveable type. In
+ // either case we can't assign it to a local variable.
+ const Cleanup report_uninteresting_call(
+ [&] { ReportUninterestingCall(reaction, ss.str()); });
+
+ return PerformActionAndPrintResult(nullptr, std::move(args), ss.str(), ss);
+ }
+
+ bool is_excessive = false;
+ ::std::stringstream ss;
+ ::std::stringstream why;
+ ::std::stringstream loc;
+ const void* untyped_action = nullptr;
+
+ // The UntypedFindMatchingExpectation() function acquires and
+ // releases g_gmock_mutex.
+
+ const ExpectationBase* const untyped_expectation =
+ this->UntypedFindMatchingExpectation(&args, &untyped_action,
+ &is_excessive, &ss, &why);
+ const bool found = untyped_expectation != nullptr;
+
+ // True if and only if we need to print the call's arguments
+ // and return value.
+ // This definition must be kept in sync with the uses of Expect()
+ // and Log() in this function.
+ const bool need_to_report_call =
+ !found || is_excessive || LogIsVisible(kInfo);
+ if (!need_to_report_call) {
+ // Perform the action without printing the call information.
+ return PerformAction(untyped_action, std::move(args), "");
+ }
+
+ ss << " Function call: " << Name();
+ this->UntypedPrintArgs(&args, &ss);
+
+ // In case the action deletes a piece of the expectation, we
+ // generate the message beforehand.
+ if (found && !is_excessive) {
+ untyped_expectation->DescribeLocationTo(&loc);
+ }
+
+ // Perform the action, print the result, and then fail or log in whatever way
+ // is appropriate.
+ //
+ // We use RAII to do the latter in case R is void or a non-moveable type. In
+ // either case we can't assign it to a local variable.
+ const Cleanup handle_failures([&] {
+ ss << "\n" << why.str();
+
+ if (!found) {
+ // No expectation matches this call - reports a failure.
+ Expect(false, nullptr, -1, ss.str());
+ } else if (is_excessive) {
+ // We had an upper-bound violation and the failure message is in ss.
+ Expect(false, untyped_expectation->file(), untyped_expectation->line(),
+ ss.str());
+ } else {
+ // We had an expected call and the matching expectation is
+ // described in ss.
+ Log(kInfo, loc.str() + ss.str(), 2);
+ }
+ });
+
+ return PerformActionAndPrintResult(untyped_action, std::move(args), ss.str(),
+ ss);
+}
+
+} // namespace internal
+
+namespace internal {
+
+template <typename F>
+class MockFunction;
+
+template <typename R, typename... Args>
+class MockFunction<R(Args...)> {
+ public:
+ MockFunction(const MockFunction&) = delete;
+ MockFunction& operator=(const MockFunction&) = delete;
+
+ std::function<R(Args...)> AsStdFunction() {
+ return [this](Args... args) -> R {
+ return this->Call(std::forward<Args>(args)...);
+ };
+ }
+
+ // Implementation detail: the expansion of the MOCK_METHOD macro.
+ R Call(Args... args) {
+ mock_.SetOwnerAndName(this, "Call");
+ return mock_.Invoke(std::forward<Args>(args)...);
+ }
+
+ MockSpec<R(Args...)> gmock_Call(Matcher<Args>... m) {
+ mock_.RegisterOwner(this);
+ return mock_.With(std::move(m)...);
+ }
+
+ MockSpec<R(Args...)> gmock_Call(const WithoutMatchers&, R (*)(Args...)) {
+ return this->gmock_Call(::testing::A<Args>()...);
+ }
+
+ protected:
+ MockFunction() = default;
+ ~MockFunction() = default;
+
+ private:
+ FunctionMocker<R(Args...)> mock_;
+};
+
+/*
+The SignatureOf<F> struct is a meta-function returning the function signature
+corresponding to the provided F argument.
+
+It makes use of MockFunction easier by allowing it to accept more F arguments
+than just function signatures.
+
+Specializations provided here cover a signature type itself and any template
+that can be parameterized with a signature, including std::function and
+boost::function.
+*/
+
+template <typename F, typename = void>
+struct SignatureOf;
+
+template <typename R, typename... Args>
+struct SignatureOf<R(Args...)> {
+ using type = R(Args...);
+};
+
+template <template <typename> class C, typename F>
+struct SignatureOf<C<F>,
+ typename std::enable_if<std::is_function<F>::value>::type>
+ : SignatureOf<F> {};
+
+template <typename F>
+using SignatureOfT = typename SignatureOf<F>::type;
+
+} // namespace internal
+
+// A MockFunction<F> type has one mock method whose type is
+// internal::SignatureOfT<F>. It is useful when you just want your
+// test code to emit some messages and have Google Mock verify the
+// right messages are sent (and perhaps at the right times). For
+// example, if you are exercising code:
+//
+// Foo(1);
+// Foo(2);
+// Foo(3);
+//
+// and want to verify that Foo(1) and Foo(3) both invoke
+// mock.Bar("a"), but Foo(2) doesn't invoke anything, you can write:
+//
+// TEST(FooTest, InvokesBarCorrectly) {
+// MyMock mock;
+// MockFunction<void(string check_point_name)> check;
+// {
+// InSequence s;
+//
+// EXPECT_CALL(mock, Bar("a"));
+// EXPECT_CALL(check, Call("1"));
+// EXPECT_CALL(check, Call("2"));
+// EXPECT_CALL(mock, Bar("a"));
+// }
+// Foo(1);
+// check.Call("1");
+// Foo(2);
+// check.Call("2");
+// Foo(3);
+// }
+//
+// The expectation spec says that the first Bar("a") must happen
+// before check point "1", the second Bar("a") must happen after check
+// point "2", and nothing should happen between the two check
+// points. The explicit check points make it easy to tell which
+// Bar("a") is called by which call to Foo().
+//
+// MockFunction<F> can also be used to exercise code that accepts
+// std::function<internal::SignatureOfT<F>> callbacks. To do so, use the
+// AsStdFunction() method to create a std::function proxy forwarding to
+// the original object's Call. Example:
+//
+// TEST(FooTest, RunsCallbackWithBarArgument) {
+// MockFunction<int(string)> callback;
+// EXPECT_CALL(callback, Call("bar")).WillOnce(Return(1));
+// Foo(callback.AsStdFunction());
+// }
+//
+// The internal::SignatureOfT<F> indirection allows using types other
+// than just a function signature type. This is typically useful when
+// providing a mock for a predefined std::function type. Example:
+//
+// using FilterPredicate = std::function<bool(string)>;
+// void MyFilterAlgorithm(FilterPredicate predicate);
+//
+// TEST(FooTest, FilterPredicateAlwaysAccepts) {
+// MockFunction<FilterPredicate> predicateMock;
+// EXPECT_CALL(predicateMock, Call(_)).WillRepeatedly(Return(true));
+// MyFilterAlgorithm(predicateMock.AsStdFunction());
+// }
+template <typename F>
+class MockFunction : public internal::MockFunction<internal::SignatureOfT<F>> {
+ using Base = internal::MockFunction<internal::SignatureOfT<F>>;
+
+ public:
+ using Base::Base;
+};
+
+// The style guide prohibits "using" statements in a namespace scope
+// inside a header file. However, the MockSpec class template is
+// meant to be defined in the ::testing namespace. The following line
+// is just a trick for working around a bug in MSVC 8.0, which cannot
+// handle it if we define MockSpec in ::testing.
+using internal::MockSpec;
+
+// Const(x) is a convenient function for obtaining a const reference
+// to x. This is useful for setting expectations on an overloaded
+// const mock method, e.g.
+//
+// class MockFoo : public FooInterface {
+// public:
+// MOCK_METHOD0(Bar, int());
+// MOCK_CONST_METHOD0(Bar, int&());
+// };
+//
+// MockFoo foo;
+// // Expects a call to non-const MockFoo::Bar().
+// EXPECT_CALL(foo, Bar());
+// // Expects a call to const MockFoo::Bar().
+// EXPECT_CALL(Const(foo), Bar());
+template <typename T>
+inline const T& Const(const T& x) {
+ return x;
+}
+
+// Constructs an Expectation object that references and co-owns exp.
+inline Expectation::Expectation(internal::ExpectationBase& exp) // NOLINT
+ : expectation_base_(exp.GetHandle().expectation_base()) {}
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+// Implementation for ON_CALL and EXPECT_CALL macros. A separate macro is
+// required to avoid compile errors when the name of the method used in call is
+// a result of macro expansion. See CompilesWithMethodNameExpandedFromMacro
+// tests in internal/gmock-spec-builders_test.cc for more details.
+//
+// This macro supports statements both with and without parameter matchers. If
+// the parameter list is omitted, gMock will accept any parameters, which allows
+// tests to be written that don't need to encode the number of method
+// parameters. This technique may only be used for non-overloaded methods.
+//
+// // These are the same:
+// ON_CALL(mock, NoArgsMethod()).WillByDefault(...);
+// ON_CALL(mock, NoArgsMethod).WillByDefault(...);
+//
+// // As are these:
+// ON_CALL(mock, TwoArgsMethod(_, _)).WillByDefault(...);
+// ON_CALL(mock, TwoArgsMethod).WillByDefault(...);
+//
+// // Can also specify args if you want, of course:
+// ON_CALL(mock, TwoArgsMethod(_, 45)).WillByDefault(...);
+//
+// // Overloads work as long as you specify parameters:
+// ON_CALL(mock, OverloadedMethod(_)).WillByDefault(...);
+// ON_CALL(mock, OverloadedMethod(_, _)).WillByDefault(...);
+//
+// // Oops! Which overload did you want?
+// ON_CALL(mock, OverloadedMethod).WillByDefault(...);
+// => ERROR: call to member function 'gmock_OverloadedMethod' is ambiguous
+//
+// How this works: The mock class uses two overloads of the gmock_Method
+// expectation setter method plus an operator() overload on the MockSpec object.
+// In the matcher list form, the macro expands to:
+//
+// // This statement:
+// ON_CALL(mock, TwoArgsMethod(_, 45))...
+//
+// // ...expands to:
+// mock.gmock_TwoArgsMethod(_, 45)(WithoutMatchers(), nullptr)...
+// |-------------v---------------||------------v-------------|
+// invokes first overload swallowed by operator()
+//
+// // ...which is essentially:
+// mock.gmock_TwoArgsMethod(_, 45)...
+//
+// Whereas the form without a matcher list:
+//
+// // This statement:
+// ON_CALL(mock, TwoArgsMethod)...
+//
+// // ...expands to:
+// mock.gmock_TwoArgsMethod(WithoutMatchers(), nullptr)...
+// |-----------------------v--------------------------|
+// invokes second overload
+//
+// // ...which is essentially:
+// mock.gmock_TwoArgsMethod(_, _)...
+//
+// The WithoutMatchers() argument is used to disambiguate overloads and to
+// block the caller from accidentally invoking the second overload directly. The
+// second argument is an internal type derived from the method signature. The
+// failure to disambiguate two overloads of this method in the ON_CALL statement
+// is how we block callers from setting expectations on overloaded methods.
+#define GMOCK_ON_CALL_IMPL_(mock_expr, Setter, call) \
+ ((mock_expr).gmock_##call)(::testing::internal::GetWithoutMatchers(), \
+ nullptr) \
+ .Setter(__FILE__, __LINE__, #mock_expr, #call)
+
+#define ON_CALL(obj, call) \
+ GMOCK_ON_CALL_IMPL_(obj, InternalDefaultActionSetAt, call)
+
+#define EXPECT_CALL(obj, call) \
+ GMOCK_ON_CALL_IMPL_(obj, InternalExpectedAt, call)
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_SPEC_BUILDERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock.h
new file mode 100644
index 0000000000..568c8c71d7
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock.h
@@ -0,0 +1,96 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This is the main header file a user should include.
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
+
+// This file implements the following syntax:
+//
+// ON_CALL(mock_object, Method(...))
+// .With(...) ?
+// .WillByDefault(...);
+//
+// where With() is optional and WillByDefault() must appear exactly
+// once.
+//
+// EXPECT_CALL(mock_object, Method(...))
+// .With(...) ?
+// .Times(...) ?
+// .InSequence(...) *
+// .WillOnce(...) *
+// .WillRepeatedly(...) ?
+// .RetiresOnSaturation() ? ;
+//
+// where all clauses are optional and WillOnce() can be repeated.
+
+#include "gmock/gmock-actions.h"
+#include "gmock/gmock-cardinalities.h"
+#include "gmock/gmock-function-mocker.h"
+#include "gmock/gmock-matchers.h"
+#include "gmock/gmock-more-actions.h"
+#include "gmock/gmock-more-matchers.h"
+#include "gmock/gmock-nice-strict.h"
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gmock/internal/gmock-port.h"
+
+// Declares Google Mock flags that we want a user to use programmatically.
+GMOCK_DECLARE_bool_(catch_leaked_mocks);
+GMOCK_DECLARE_string_(verbose);
+GMOCK_DECLARE_int32_(default_mock_behavior);
+
+namespace testing {
+
+// Initializes Google Mock. This must be called before running the
+// tests. In particular, it parses the command line for the flags
+// that Google Mock recognizes. Whenever a Google Mock flag is seen,
+// it is removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags, if that hasn't
+// been done.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleMock();
+
+} // namespace testing
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
new file mode 100644
index 0000000000..9c4874fd0c
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md
@@ -0,0 +1,18 @@
+# Customization Points
+
+The custom directory is an injection point for custom user configurations.
+
+## Header `gmock-port.h`
+
+The following macros can be defined:
+
+### Flag related macros:
+
+* `GMOCK_DECLARE_bool_(name)`
+* `GMOCK_DECLARE_int32_(name)`
+* `GMOCK_DECLARE_string_(name)`
+* `GMOCK_DEFINE_bool_(name, default_val, doc)`
+* `GMOCK_DEFINE_int32_(name, default_val, doc)`
+* `GMOCK_DEFINE_string_(name, default_val, doc)`
+* `GMOCK_FLAG_GET(flag_name)`
+* `GMOCK_FLAG_SET(flag_name, value)`
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
new file mode 100644
index 0000000000..bbcad31c76
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h
@@ -0,0 +1,7 @@
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_GENERATED_ACTIONS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
new file mode 100644
index 0000000000..bb7dcbaa4c
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h
@@ -0,0 +1,37 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Injection point for custom user configurations. See README for details
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_MATCHERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
new file mode 100644
index 0000000000..f055f7506b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h
@@ -0,0 +1,40 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_CUSTOM_GMOCK_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
new file mode 100644
index 0000000000..b1343fdc82
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h
@@ -0,0 +1,476 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file defines some utilities useful for implementing Google
+// Mock. They are subject to change without notice, so please DO NOT
+// USE THEM IN USER CODE.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
+
+#include <stdio.h>
+
+#include <ostream> // NOLINT
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+
+template <typename>
+class Matcher;
+
+namespace internal {
+
+// Silence MSVC C4100 (unreferenced formal parameter) and
+// C4805('==': unsafe mix of type 'const int' and type 'const bool')
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#pragma warning(disable : 4805)
+#endif
+
+// Joins a vector of strings as if they are fields of a tuple; returns
+// the joined string.
+GTEST_API_ std::string JoinAsKeyValueTuple(
+ const std::vector<const char*>& names, const Strings& values);
+
+// Converts an identifier name to a space-separated list of lower-case
+// words. Each maximal substring of the form [A-Za-z][a-z]*|\d+ is
+// treated as one word. For example, both "FooBar123" and
+// "foo_bar_123" are converted to "foo bar 123".
+GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name);
+
+// GetRawPointer(p) returns the raw pointer underlying p when p is a
+// smart pointer, or returns p itself when p is already a raw pointer.
+// The following default implementation is for the smart pointer case.
+template <typename Pointer>
+inline const typename Pointer::element_type* GetRawPointer(const Pointer& p) {
+ return p.get();
+}
+// This overloaded version is for std::reference_wrapper, which does not work
+// with the overload above, as it does not have an `element_type`.
+template <typename Element>
+inline const Element* GetRawPointer(const std::reference_wrapper<Element>& r) {
+ return &r.get();
+}
+
+// This overloaded version is for the raw pointer case.
+template <typename Element>
+inline Element* GetRawPointer(Element* p) {
+ return p;
+}
+
+// MSVC treats wchar_t as a native type usually, but treats it as the
+// same as unsigned short when the compiler option /Zc:wchar_t- is
+// specified. It defines the _NATIVE_WCHAR_T_DEFINED symbol when wchar_t
+// is a native type.
+#if defined(_MSC_VER) && !defined(_NATIVE_WCHAR_T_DEFINED)
+// wchar_t is a typedef.
+#else
+#define GMOCK_WCHAR_T_IS_NATIVE_ 1
+#endif
+
+// In what follows, we use the term "kind" to indicate whether a type
+// is bool, an integer type (excluding bool), a floating-point type,
+// or none of them. This categorization is useful for determining
+// when a matcher argument type can be safely converted to another
+// type in the implementation of SafeMatcherCast.
+enum TypeKind { kBool, kInteger, kFloatingPoint, kOther };
+
+// KindOf<T>::value is the kind of type T.
+template <typename T>
+struct KindOf {
+ enum { value = kOther }; // The default kind.
+};
+
+// This macro declares that the kind of 'type' is 'kind'.
+#define GMOCK_DECLARE_KIND_(type, kind) \
+ template <> \
+ struct KindOf<type> { \
+ enum { value = kind }; \
+ }
+
+GMOCK_DECLARE_KIND_(bool, kBool);
+
+// All standard integer types.
+GMOCK_DECLARE_KIND_(char, kInteger);
+GMOCK_DECLARE_KIND_(signed char, kInteger);
+GMOCK_DECLARE_KIND_(unsigned char, kInteger);
+GMOCK_DECLARE_KIND_(short, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(unsigned short, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(int, kInteger);
+GMOCK_DECLARE_KIND_(unsigned int, kInteger);
+GMOCK_DECLARE_KIND_(long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(unsigned long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(long long, kInteger); // NOLINT
+GMOCK_DECLARE_KIND_(unsigned long long, kInteger); // NOLINT
+
+#if GMOCK_WCHAR_T_IS_NATIVE_
+GMOCK_DECLARE_KIND_(wchar_t, kInteger);
+#endif
+
+// All standard floating-point types.
+GMOCK_DECLARE_KIND_(float, kFloatingPoint);
+GMOCK_DECLARE_KIND_(double, kFloatingPoint);
+GMOCK_DECLARE_KIND_(long double, kFloatingPoint);
+
+#undef GMOCK_DECLARE_KIND_
+
+// Evaluates to the kind of 'type'.
+#define GMOCK_KIND_OF_(type) \
+ static_cast< ::testing::internal::TypeKind>( \
+ ::testing::internal::KindOf<type>::value)
+
+// LosslessArithmeticConvertibleImpl<kFromKind, From, kToKind, To>::value
+// is true if and only if arithmetic type From can be losslessly converted to
+// arithmetic type To.
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. have no cv-qualifiers and are neither pointers nor
+// references) built-in arithmetic types, kFromKind is the kind of
+// From, and kToKind is the kind of To; the value is
+// implementation-defined when the above pre-condition is violated.
+template <TypeKind kFromKind, typename From, TypeKind kToKind, typename To>
+using LosslessArithmeticConvertibleImpl = std::integral_constant<
+ bool,
+ // clang-format off
+ // Converting from bool is always lossless
+ (kFromKind == kBool) ? true
+ // Converting between any other type kinds will be lossy if the type
+ // kinds are not the same.
+ : (kFromKind != kToKind) ? false
+ : (kFromKind == kInteger &&
+ // Converting between integers of different widths is allowed so long
+ // as the conversion does not go from signed to unsigned.
+ (((sizeof(From) < sizeof(To)) &&
+ !(std::is_signed<From>::value && !std::is_signed<To>::value)) ||
+ // Converting between integers of the same width only requires the
+ // two types to have the same signedness.
+ ((sizeof(From) == sizeof(To)) &&
+ (std::is_signed<From>::value == std::is_signed<To>::value)))
+ ) ? true
+ // Floating point conversions are lossless if and only if `To` is at least
+ // as wide as `From`.
+ : (kFromKind == kFloatingPoint && (sizeof(From) <= sizeof(To))) ? true
+ : false
+ // clang-format on
+ >;
+
+// LosslessArithmeticConvertible<From, To>::value is true if and only if
+// arithmetic type From can be losslessly converted to arithmetic type To.
+//
+// It's the user's responsibility to ensure that both From and To are
+// raw (i.e. have no cv-qualifiers and are neither pointers nor
+// references) built-in arithmetic types; the value is
+// implementation-defined when the above pre-condition is violated.
+template <typename From, typename To>
+using LosslessArithmeticConvertible =
+ LosslessArithmeticConvertibleImpl<GMOCK_KIND_OF_(From), From,
+ GMOCK_KIND_OF_(To), To>;
+
+// This interface knows how to report a Google Mock failure (either
+// non-fatal or fatal).
+class FailureReporterInterface {
+ public:
+ // The type of a failure (either non-fatal or fatal).
+ enum FailureType { kNonfatal, kFatal };
+
+ virtual ~FailureReporterInterface() {}
+
+ // Reports a failure that occurred at the given source file location.
+ virtual void ReportFailure(FailureType type, const char* file, int line,
+ const std::string& message) = 0;
+};
+
+// Returns the failure reporter used by Google Mock.
+GTEST_API_ FailureReporterInterface* GetFailureReporter();
+
+// Asserts that condition is true; aborts the process with the given
+// message if condition is false. We cannot use LOG(FATAL) or CHECK()
+// as Google Mock might be used to mock the log sink itself. We
+// inline this function to prevent it from showing up in the stack
+// trace.
+inline void Assert(bool condition, const char* file, int line,
+ const std::string& msg) {
+ if (!condition) {
+ GetFailureReporter()->ReportFailure(FailureReporterInterface::kFatal, file,
+ line, msg);
+ }
+}
+inline void Assert(bool condition, const char* file, int line) {
+ Assert(condition, file, line, "Assertion failed.");
+}
+
+// Verifies that condition is true; generates a non-fatal failure if
+// condition is false.
+inline void Expect(bool condition, const char* file, int line,
+ const std::string& msg) {
+ if (!condition) {
+ GetFailureReporter()->ReportFailure(FailureReporterInterface::kNonfatal,
+ file, line, msg);
+ }
+}
+inline void Expect(bool condition, const char* file, int line) {
+ Expect(condition, file, line, "Expectation failed.");
+}
+
+// Severity level of a log.
+enum LogSeverity { kInfo = 0, kWarning = 1 };
+
+// Valid values for the --gmock_verbose flag.
+
+// All logs (informational and warnings) are printed.
+const char kInfoVerbosity[] = "info";
+// Only warnings are printed.
+const char kWarningVerbosity[] = "warning";
+// No logs are printed.
+const char kErrorVerbosity[] = "error";
+
+// Returns true if and only if a log with the given severity is visible
+// according to the --gmock_verbose flag.
+GTEST_API_ bool LogIsVisible(LogSeverity severity);
+
+// Prints the given message to stdout if and only if 'severity' >= the level
+// specified by the --gmock_verbose flag. If stack_frames_to_skip >=
+// 0, also prints the stack trace excluding the top
+// stack_frames_to_skip frames. In opt mode, any positive
+// stack_frames_to_skip is treated as 0, since we don't know which
+// function calls will be inlined by the compiler and need to be
+// conservative.
+GTEST_API_ void Log(LogSeverity severity, const std::string& message,
+ int stack_frames_to_skip);
+
+// A marker class that is used to resolve parameterless expectations to the
+// correct overload. This must not be instantiable, to prevent client code from
+// accidentally resolving to the overload; for example:
+//
+// ON_CALL(mock, Method({}, nullptr))...
+//
+class WithoutMatchers {
+ private:
+ WithoutMatchers() {}
+ friend GTEST_API_ WithoutMatchers GetWithoutMatchers();
+};
+
+// Internal use only: access the singleton instance of WithoutMatchers.
+GTEST_API_ WithoutMatchers GetWithoutMatchers();
+
+// Disable MSVC warnings for infinite recursion, since in this case the
+// recursion is unreachable.
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4717)
+#endif
+
+// Invalid<T>() is usable as an expression of type T, but will terminate
+// the program with an assertion failure if actually run. This is useful
+// when a value of type T is needed for compilation, but the statement
+// will not really be executed (or we don't care if the statement
+// crashes).
+template <typename T>
+inline T Invalid() {
+ Assert(false, "", -1, "Internal error: attempt to return invalid value");
+#if defined(__GNUC__) || defined(__clang__)
+ __builtin_unreachable();
+#elif defined(_MSC_VER)
+ __assume(0);
+#else
+ return Invalid<T>();
+#endif
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Given a raw type (i.e. having no top-level reference or const
+// modifier) RawContainer that's either an STL-style container or a
+// native array, class StlContainerView<RawContainer> has the
+// following members:
+//
+// - type is a type that provides an STL-style container view to
+// (i.e. implements the STL container concept for) RawContainer;
+// - const_reference is a type that provides a reference to a const
+// RawContainer;
+// - ConstReference(raw_container) returns a const reference to an STL-style
+// container view to raw_container, which is a RawContainer.
+// - Copy(raw_container) returns an STL-style container view of a
+// copy of raw_container, which is a RawContainer.
+//
+// This generic version is used when RawContainer itself is already an
+// STL-style container.
+template <class RawContainer>
+class StlContainerView {
+ public:
+ typedef RawContainer type;
+ typedef const type& const_reference;
+
+ static const_reference ConstReference(const RawContainer& container) {
+ static_assert(!std::is_const<RawContainer>::value,
+ "RawContainer type must not be const");
+ return container;
+ }
+ static type Copy(const RawContainer& container) { return container; }
+};
+
+// This specialization is used when RawContainer is a native array type.
+template <typename Element, size_t N>
+class StlContainerView<Element[N]> {
+ public:
+ typedef typename std::remove_const<Element>::type RawElement;
+ typedef internal::NativeArray<RawElement> type;
+ // NativeArray<T> can represent a native array either by value or by
+ // reference (selected by a constructor argument), so 'const type'
+ // can be used to reference a const native array. We cannot
+ // 'typedef const type& const_reference' here, as that would mean
+ // ConstReference() has to return a reference to a local variable.
+ typedef const type const_reference;
+
+ static const_reference ConstReference(const Element (&array)[N]) {
+ static_assert(std::is_same<Element, RawElement>::value,
+ "Element type must not be const");
+ return type(array, N, RelationToSourceReference());
+ }
+ static type Copy(const Element (&array)[N]) {
+ return type(array, N, RelationToSourceCopy());
+ }
+};
+
+// This specialization is used when RawContainer is a native array
+// represented as a (pointer, size) tuple.
+template <typename ElementPointer, typename Size>
+class StlContainerView< ::std::tuple<ElementPointer, Size> > {
+ public:
+ typedef typename std::remove_const<
+ typename std::pointer_traits<ElementPointer>::element_type>::type
+ RawElement;
+ typedef internal::NativeArray<RawElement> type;
+ typedef const type const_reference;
+
+ static const_reference ConstReference(
+ const ::std::tuple<ElementPointer, Size>& array) {
+ return type(std::get<0>(array), std::get<1>(array),
+ RelationToSourceReference());
+ }
+ static type Copy(const ::std::tuple<ElementPointer, Size>& array) {
+ return type(std::get<0>(array), std::get<1>(array), RelationToSourceCopy());
+ }
+};
+
+// The following specialization prevents the user from instantiating
+// StlContainerView with a reference type.
+template <typename T>
+class StlContainerView<T&>;
+
+// A type transform to remove constness from the first part of a pair.
+// Pairs like that are used as the value_type of associative containers,
+// and this transform produces a similar but assignable pair.
+template <typename T>
+struct RemoveConstFromKey {
+ typedef T type;
+};
+
+// Partially specialized to remove constness from std::pair<const K, V>.
+template <typename K, typename V>
+struct RemoveConstFromKey<std::pair<const K, V> > {
+ typedef std::pair<K, V> type;
+};
+
+// Emit an assertion failure due to incorrect DoDefault() usage. Out-of-lined to
+// reduce code size.
+GTEST_API_ void IllegalDoDefault(const char* file, int line);
+
+template <typename F, typename Tuple, size_t... Idx>
+auto ApplyImpl(F&& f, Tuple&& args, IndexSequence<Idx...>)
+ -> decltype(std::forward<F>(f)(
+ std::get<Idx>(std::forward<Tuple>(args))...)) {
+ return std::forward<F>(f)(std::get<Idx>(std::forward<Tuple>(args))...);
+}
+
+// Apply the function to a tuple of arguments.
+template <typename F, typename Tuple>
+auto Apply(F&& f, Tuple&& args) -> decltype(ApplyImpl(
+ std::forward<F>(f), std::forward<Tuple>(args),
+ MakeIndexSequence<std::tuple_size<
+ typename std::remove_reference<Tuple>::type>::value>())) {
+ return ApplyImpl(std::forward<F>(f), std::forward<Tuple>(args),
+ MakeIndexSequence<std::tuple_size<
+ typename std::remove_reference<Tuple>::type>::value>());
+}
+
+// Template struct Function<F>, where F must be a function type, contains
+// the following typedefs:
+//
+// Result: the function's return type.
+// Arg<N>: the type of the N-th argument, where N starts with 0.
+// ArgumentTuple: the tuple type consisting of all parameters of F.
+// ArgumentMatcherTuple: the tuple type consisting of Matchers for all
+// parameters of F.
+// MakeResultVoid: the function type obtained by substituting void
+// for the return type of F.
+// MakeResultIgnoredValue:
+// the function type obtained by substituting IgnoredValue
+// for the return type of F.
+template <typename T>
+struct Function;
+
+template <typename R, typename... Args>
+struct Function<R(Args...)> {
+ using Result = R;
+ static constexpr size_t ArgumentCount = sizeof...(Args);
+ template <size_t I>
+ using Arg = ElemFromList<I, Args...>;
+ using ArgumentTuple = std::tuple<Args...>;
+ using ArgumentMatcherTuple = std::tuple<Matcher<Args>...>;
+ using MakeResultVoid = void(Args...);
+ using MakeResultIgnoredValue = IgnoredValue(Args...);
+};
+
+template <typename R, typename... Args>
+constexpr size_t Function<R(Args...)>::ArgumentCount;
+
+bool Base64Unescape(const std::string& encoded, std::string* decoded);
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+} // namespace internal
+} // namespace testing
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_INTERNAL_UTILS_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
new file mode 100644
index 0000000000..bc18a25f34
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h
@@ -0,0 +1,139 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Low-level types and utilities for porting Google Mock to various
+// platforms. All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice. Code
+// outside Google Mock MUST NOT USE THEM DIRECTLY. Macros that don't
+// end with _ are part of Google Mock's public API and can be used by
+// code outside Google Mock.
+
+// IWYU pragma: private, include "gmock/gmock.h"
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
+
+#include <assert.h>
+#include <stdlib.h>
+#include <cstdint>
+#include <iostream>
+
+// Most of the utilities needed for porting Google Mock are also
+// required for Google Test and are defined in gtest-port.h.
+//
+// Note to maintainers: to reduce code duplication, prefer adding
+// portability utilities to Google Test's gtest-port.h instead of
+// here, as Google Mock depends on Google Test. Only add a utility
+// here if it's truly specific to Google Mock.
+
+#include "gmock/internal/custom/gmock-port.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#endif
+
+// For MS Visual C++, check the compiler version. At least VS 2015 is
+// required to compile Google Mock.
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#error "At least Visual C++ 2015 (14.0) is required to compile Google Mock."
+#endif
+
+// Macro for referencing flags. This is public as we want the user to
+// use this syntax to reference Google Mock flags.
+#define GMOCK_FLAG_NAME_(name) gmock_##name
+#define GMOCK_FLAG(name) FLAGS_gmock_##name
+
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GMOCK_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GMOCK_FLAG_NAME_(name), default_val, doc)
+#define GMOCK_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GMOCK_FLAG_NAME_(name), default_val, doc)
+#define GMOCK_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GMOCK_FLAG_NAME_(name), default_val, doc)
+
+// Macros for declaring flags.
+#define GMOCK_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GMOCK_FLAG_NAME_(name))
+#define GMOCK_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GMOCK_FLAG_NAME_(name))
+#define GMOCK_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GMOCK_FLAG_NAME_(name))
+
+#define GMOCK_FLAG_GET(name) ::absl::GetFlag(GMOCK_FLAG(name))
+#define GMOCK_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GMOCK_FLAG(name), value))
+
+#else // GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GMOCK_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ int32_t GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GMOCK_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+// Macros for declaring flags.
+#define GMOCK_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern int32_t GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GMOCK_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GMOCK_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GMOCK_FLAG_GET(name) ::testing::GMOCK_FLAG(name)
+#define GMOCK_FLAG_SET(name, value) (void)(::testing::GMOCK_FLAG(name) = value)
+
+#endif // GTEST_HAS_ABSL
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
new file mode 100644
index 0000000000..94d61c09c8
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h
@@ -0,0 +1,279 @@
+#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
+#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
+
+// Expands and concatenates the arguments. Constructed macros reevaluate.
+#define GMOCK_PP_CAT(_1, _2) GMOCK_PP_INTERNAL_CAT(_1, _2)
+
+// Expands and stringifies the only argument.
+#define GMOCK_PP_STRINGIZE(...) GMOCK_PP_INTERNAL_STRINGIZE(__VA_ARGS__)
+
+// Returns empty. Given a variadic number of arguments.
+#define GMOCK_PP_EMPTY(...)
+
+// Returns a comma. Given a variadic number of arguments.
+#define GMOCK_PP_COMMA(...) ,
+
+// Returns the only argument.
+#define GMOCK_PP_IDENTITY(_1) _1
+
+// Evaluates to the number of arguments after expansion.
+//
+// #define PAIR x, y
+//
+// GMOCK_PP_NARG() => 1
+// GMOCK_PP_NARG(x) => 1
+// GMOCK_PP_NARG(x, y) => 2
+// GMOCK_PP_NARG(PAIR) => 2
+//
+// Requires: the number of arguments after expansion is at most 15.
+#define GMOCK_PP_NARG(...) \
+ GMOCK_PP_INTERNAL_16TH( \
+ (__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+
+// Returns 1 if the expansion of arguments has an unprotected comma. Otherwise
+// returns 0. Requires no more than 15 unprotected commas.
+#define GMOCK_PP_HAS_COMMA(...) \
+ GMOCK_PP_INTERNAL_16TH( \
+ (__VA_ARGS__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0))
+
+// Returns the first argument.
+#define GMOCK_PP_HEAD(...) GMOCK_PP_INTERNAL_HEAD((__VA_ARGS__, unusedArg))
+
+// Returns the tail. A variadic list of all arguments minus the first. Requires
+// at least one argument.
+#define GMOCK_PP_TAIL(...) GMOCK_PP_INTERNAL_TAIL((__VA_ARGS__))
+
+// Calls CAT(_Macro, NARG(__VA_ARGS__))(__VA_ARGS__)
+#define GMOCK_PP_VARIADIC_CALL(_Macro, ...) \
+ GMOCK_PP_IDENTITY( \
+ GMOCK_PP_CAT(_Macro, GMOCK_PP_NARG(__VA_ARGS__))(__VA_ARGS__))
+
+// If the arguments after expansion have no tokens, evaluates to `1`. Otherwise
+// evaluates to `0`.
+//
+// Requires: * the number of arguments after expansion is at most 15.
+// * If the argument is a macro, it must be able to be called with one
+// argument.
+//
+// Implementation details:
+//
+// There is one case when it generates a compile error: if the argument is a
+// macro that cannot be called with one argument.
+//
+// #define M(a, b) // it doesn't matter what it expands to
+//
+// // Expected: expands to `0`.
+// // Actual: compile error.
+// GMOCK_PP_IS_EMPTY(M)
+//
+// There are 4 cases tested:
+//
+// * __VA_ARGS__ possible expansion has no unparen'd commas. Expected 0.
+// * __VA_ARGS__ possible expansion is not enclosed in parenthesis. Expected 0.
+// * __VA_ARGS__ possible expansion is not a macro that ()-evaluates to a comma.
+// Expected 0
+// * __VA_ARGS__ is empty, or has unparen'd commas, or is enclosed in
+// parenthesis, or is a macro that ()-evaluates to comma. Expected 1.
+//
+// We trigger detection on '0001', i.e. on empty.
+#define GMOCK_PP_IS_EMPTY(...) \
+ GMOCK_PP_INTERNAL_IS_EMPTY(GMOCK_PP_HAS_COMMA(__VA_ARGS__), \
+ GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__), \
+ GMOCK_PP_HAS_COMMA(__VA_ARGS__()), \
+ GMOCK_PP_HAS_COMMA(GMOCK_PP_COMMA __VA_ARGS__()))
+
+// Evaluates to _Then if _Cond is 1 and _Else if _Cond is 0.
+#define GMOCK_PP_IF(_Cond, _Then, _Else) \
+ GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IF_, _Cond)(_Then, _Else)
+
+// Similar to GMOCK_PP_IF but takes _Then and _Else in parentheses.
+//
+// GMOCK_PP_GENERIC_IF(1, (a, b, c), (d, e, f)) => a, b, c
+// GMOCK_PP_GENERIC_IF(0, (a, b, c), (d, e, f)) => d, e, f
+//
+#define GMOCK_PP_GENERIC_IF(_Cond, _Then, _Else) \
+ GMOCK_PP_REMOVE_PARENS(GMOCK_PP_IF(_Cond, _Then, _Else))
+
+// Evaluates to the number of arguments after expansion. Identifies 'empty' as
+// 0.
+//
+// #define PAIR x, y
+//
+// GMOCK_PP_NARG0() => 0
+// GMOCK_PP_NARG0(x) => 1
+// GMOCK_PP_NARG0(x, y) => 2
+// GMOCK_PP_NARG0(PAIR) => 2
+//
+// Requires: * the number of arguments after expansion is at most 15.
+// * If the argument is a macro, it must be able to be called with one
+// argument.
+#define GMOCK_PP_NARG0(...) \
+ GMOCK_PP_IF(GMOCK_PP_IS_EMPTY(__VA_ARGS__), 0, GMOCK_PP_NARG(__VA_ARGS__))
+
+// Expands to 1 if the first argument starts with something in parentheses,
+// otherwise to 0.
+#define GMOCK_PP_IS_BEGIN_PARENS(...) \
+ GMOCK_PP_HEAD(GMOCK_PP_CAT(GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_, \
+ GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C __VA_ARGS__))
+
+// Expands to 1 if there is only one argument and it is enclosed in parentheses.
+#define GMOCK_PP_IS_ENCLOSED_PARENS(...) \
+ GMOCK_PP_IF(GMOCK_PP_IS_BEGIN_PARENS(__VA_ARGS__), \
+ GMOCK_PP_IS_EMPTY(GMOCK_PP_EMPTY __VA_ARGS__), 0)
+
+// Remove the parens, requires GMOCK_PP_IS_ENCLOSED_PARENS(args) => 1.
+#define GMOCK_PP_REMOVE_PARENS(...) GMOCK_PP_INTERNAL_REMOVE_PARENS __VA_ARGS__
+
+// Expands to _Macro(0, _Data, e1) _Macro(1, _Data, e2) ... _Macro(K - 1, _Data,
+// eK), where K is GMOCK_PP_NARG0 _Tuple (the number of elements in _Tuple).
+// Requires: * |_Macro| can be called with 3 arguments.
+// * |_Tuple| expansion has no more than 15 elements.
+#define GMOCK_PP_FOR_EACH(_Macro, _Data, _Tuple) \
+ GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, GMOCK_PP_NARG0 _Tuple) \
+ (0, _Macro, _Data, _Tuple)
+
+// Expands to _Macro(0, _Data, ) _Macro(1, _Data, ) ... _Macro(_N - 1, _Data, )
+// Empty if _N = 0.
+// Requires: * |_Macro| can be called with 3 arguments.
+// * |_N| literal between 0 and 15
+#define GMOCK_PP_REPEAT(_Macro, _Data, _N) \
+ GMOCK_PP_CAT(GMOCK_PP_INTERNAL_FOR_EACH_IMPL_, _N) \
+ (0, _Macro, _Data, GMOCK_PP_INTENRAL_EMPTY_TUPLE)
+
+// Increments the argument, requires the argument to be between 0 and 15.
+#define GMOCK_PP_INC(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_INC_, _i)
+
+// Returns comma if _i != 0. Requires _i to be between 0 and 15.
+#define GMOCK_PP_COMMA_IF(_i) GMOCK_PP_CAT(GMOCK_PP_INTERNAL_COMMA_IF_, _i)
+
+// Internal details follow. Do not use any of these symbols outside of this
+// file or we will break your code.
+#define GMOCK_PP_INTENRAL_EMPTY_TUPLE (, , , , , , , , , , , , , , , )
+#define GMOCK_PP_INTERNAL_CAT(_1, _2) _1##_2
+#define GMOCK_PP_INTERNAL_STRINGIZE(...) #__VA_ARGS__
+#define GMOCK_PP_INTERNAL_CAT_5(_1, _2, _3, _4, _5) _1##_2##_3##_4##_5
+#define GMOCK_PP_INTERNAL_IS_EMPTY(_1, _2, _3, _4) \
+ GMOCK_PP_HAS_COMMA(GMOCK_PP_INTERNAL_CAT_5(GMOCK_PP_INTERNAL_IS_EMPTY_CASE_, \
+ _1, _2, _3, _4))
+#define GMOCK_PP_INTERNAL_IS_EMPTY_CASE_0001 ,
+#define GMOCK_PP_INTERNAL_IF_1(_Then, _Else) _Then
+#define GMOCK_PP_INTERNAL_IF_0(_Then, _Else) _Else
+
+// Because of MSVC treating a token with a comma in it as a single token when
+// passed to another macro, we need to force it to evaluate it as multiple
+// tokens. We do that by using a "IDENTITY(MACRO PARENTHESIZED_ARGS)" macro. We
+// define one per possible macro that relies on this behavior. Note "_Args" must
+// be parenthesized.
+#define GMOCK_PP_INTERNAL_INTERNAL_16TH(_1, _2, _3, _4, _5, _6, _7, _8, _9, \
+ _10, _11, _12, _13, _14, _15, _16, \
+ ...) \
+ _16
+#define GMOCK_PP_INTERNAL_16TH(_Args) \
+ GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_16TH _Args)
+#define GMOCK_PP_INTERNAL_INTERNAL_HEAD(_1, ...) _1
+#define GMOCK_PP_INTERNAL_HEAD(_Args) \
+ GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_HEAD _Args)
+#define GMOCK_PP_INTERNAL_INTERNAL_TAIL(_1, ...) __VA_ARGS__
+#define GMOCK_PP_INTERNAL_TAIL(_Args) \
+ GMOCK_PP_IDENTITY(GMOCK_PP_INTERNAL_INTERNAL_TAIL _Args)
+
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C(...) 1 _
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_1 1,
+#define GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_R_GMOCK_PP_INTERNAL_IBP_IS_VARIADIC_C \
+ 0,
+#define GMOCK_PP_INTERNAL_REMOVE_PARENS(...) __VA_ARGS__
+#define GMOCK_PP_INTERNAL_INC_0 1
+#define GMOCK_PP_INTERNAL_INC_1 2
+#define GMOCK_PP_INTERNAL_INC_2 3
+#define GMOCK_PP_INTERNAL_INC_3 4
+#define GMOCK_PP_INTERNAL_INC_4 5
+#define GMOCK_PP_INTERNAL_INC_5 6
+#define GMOCK_PP_INTERNAL_INC_6 7
+#define GMOCK_PP_INTERNAL_INC_7 8
+#define GMOCK_PP_INTERNAL_INC_8 9
+#define GMOCK_PP_INTERNAL_INC_9 10
+#define GMOCK_PP_INTERNAL_INC_10 11
+#define GMOCK_PP_INTERNAL_INC_11 12
+#define GMOCK_PP_INTERNAL_INC_12 13
+#define GMOCK_PP_INTERNAL_INC_13 14
+#define GMOCK_PP_INTERNAL_INC_14 15
+#define GMOCK_PP_INTERNAL_INC_15 16
+#define GMOCK_PP_INTERNAL_COMMA_IF_0
+#define GMOCK_PP_INTERNAL_COMMA_IF_1 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_2 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_3 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_4 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_5 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_6 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_7 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_8 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_9 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_10 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_11 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_12 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_13 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_14 ,
+#define GMOCK_PP_INTERNAL_COMMA_IF_15 ,
+#define GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, _element) \
+ _Macro(_i, _Data, _element)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_0(_i, _Macro, _Data, _Tuple)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple)
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_1(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_2(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_3(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_4(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_5(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_6(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_7(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_8(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_9(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_10(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_11(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_12(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_13(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+#define GMOCK_PP_INTERNAL_FOR_EACH_IMPL_15(_i, _Macro, _Data, _Tuple) \
+ GMOCK_PP_INTERNAL_CALL_MACRO(_Macro, _i, _Data, GMOCK_PP_HEAD _Tuple) \
+ GMOCK_PP_INTERNAL_FOR_EACH_IMPL_14(GMOCK_PP_INC(_i), _Macro, _Data, \
+ (GMOCK_PP_TAIL _Tuple))
+
+#endif // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_GMOCK_PP_H_
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock-all.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-all.cc
new file mode 100644
index 0000000000..e43c9b7b4c
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-all.cc
@@ -0,0 +1,46 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Google C++ Mocking Framework (Google Mock)
+//
+// This file #includes all Google Mock implementation .cc files. The
+// purpose is to allow a user to build Google Mock by compiling this
+// file alone.
+
+// This line ensures that gmock.h can be compiled on its own, even
+// when it's fused.
+#include "gmock/gmock.h"
+
+// The following lines pull in the real gmock *.cc files.
+#include "src/gmock-cardinalities.cc"
+#include "src/gmock-internal-utils.cc"
+#include "src/gmock-matchers.cc"
+#include "src/gmock-spec-builders.cc"
+#include "src/gmock.cc"
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
new file mode 100644
index 0000000000..92cde3484a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc
@@ -0,0 +1,155 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements cardinalities.
+
+#include "gmock/gmock-cardinalities.h"
+
+#include <limits.h>
+
+#include <ostream> // NOLINT
+#include <sstream>
+#include <string>
+
+#include "gmock/internal/gmock-internal-utils.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+
+namespace {
+
+// Implements the Between(m, n) cardinality: satisfied by m..n calls inclusive.
+class BetweenCardinalityImpl : public CardinalityInterface {
+ public:
+ BetweenCardinalityImpl(int min, int max)
+ : min_(min >= 0 ? min : 0), max_(max >= min_ ? max : min_) {
+ std::stringstream ss;  // Message for whichever bound check fails below.
+ if (min < 0) {
+ ss << "The invocation lower bound must be >= 0, "
+ << "but is actually " << min << ".";
+ internal::Expect(false, __FILE__, __LINE__, ss.str());
+ } else if (max < 0) {
+ ss << "The invocation upper bound must be >= 0, "
+ << "but is actually " << max << ".";
+ internal::Expect(false, __FILE__, __LINE__, ss.str());
+ } else if (min > max) {
+ ss << "The invocation upper bound (" << max
+ << ") must be >= the invocation lower bound (" << min << ").";
+ internal::Expect(false, __FILE__, __LINE__, ss.str());
+ }
+ }
+
+ // Bounds on the number of calls allowed; these are the stored values
+ // after the constructor clamped min_ to >= 0 and max_ to >= min_.
+ int ConservativeLowerBound() const override { return min_; }
+ int ConservativeUpperBound() const override { return max_; }
+
+ bool IsSatisfiedByCallCount(int call_count) const override {
+ return min_ <= call_count && call_count <= max_;  // Inclusive range.
+ }
+
+ bool IsSaturatedByCallCount(int call_count) const override {
+ return call_count >= max_;  // At max_, one more call would exceed it.
+ }
+
+ void DescribeTo(::std::ostream* os) const override;
+
+ private:
+ const int min_;  // Declared before max_: max_'s initializer reads min_.
+ const int max_;
+
+ BetweenCardinalityImpl(const BetweenCardinalityImpl&) = delete;
+ BetweenCardinalityImpl& operator=(const BetweenCardinalityImpl&) = delete;
+};
+
+// Renders a call count as English text: "once" for 1, "twice" for 2, and
+// "<n> times" for every other value of n.
+inline std::string FormatTimes(int n) {
+  // The two smallest positive counts have dedicated English words.
+  if (n == 1) return "once";
+  if (n == 2) return "twice";
+
+  // Every other value (including 0 and negatives) is printed as a number.
+  std::stringstream text;
+  text << n << " times";
+  return text.str();
+}
+
+// Describes Between(min_, max_) on *os; max_ == INT_MAX reads as "no limit".
+void BetweenCardinalityImpl::DescribeTo(::std::ostream* os) const {
+ if (min_ == 0) {  // No lower bound: only the upper bound is described.
+ if (max_ == 0) {
+ *os << "never called";
+ } else if (max_ == INT_MAX) {
+ *os << "called any number of times";
+ } else {
+ *os << "called at most " << FormatTimes(max_);
+ }
+ } else if (min_ == max_) {  // Exact count.
+ *os << "called " << FormatTimes(min_);
+ } else if (max_ == INT_MAX) {  // Lower bound only.
+ *os << "called at least " << FormatTimes(min_);
+ } else {
+ // 0 < min_ < max_ < INT_MAX
+ *os << "called between " << min_ << " and " << max_ << " times";
+ }
+}
+
+} // Unnamed namespace
+
+// Describes the call count on *os; any count <= 0 prints "never called".
+void Cardinality::DescribeActualCallCountTo(int actual_call_count,
+ ::std::ostream* os) {
+ if (actual_call_count > 0) {
+ *os << "called " << FormatTimes(actual_call_count);
+ } else {
+ *os << "never called";
+ }
+}
+
+// Creates a cardinality that allows at least n calls (upper bound INT_MAX).
+GTEST_API_ Cardinality AtLeast(int n) { return Between(n, INT_MAX); }
+
+// Creates a cardinality that allows at most n calls (lower bound 0).
+GTEST_API_ Cardinality AtMost(int n) { return Between(0, n); }
+
+// Creates a cardinality that allows any number of calls, including none.
+GTEST_API_ Cardinality AnyNumber() { return AtLeast(0); }
+
+// Creates a cardinality that allows between min and max calls, inclusive.
+GTEST_API_ Cardinality Between(int min, int max) {
+ return Cardinality(new BetweenCardinalityImpl(min, max));
+}
+
+// Creates a cardinality that allows exactly n calls.
+GTEST_API_ Cardinality Exactly(int n) { return Between(n, n); }
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
new file mode 100644
index 0000000000..0a74841f35
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc
@@ -0,0 +1,250 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file defines some utilities useful for implementing Google
+// Mock. They are subject to change without notice, so please DO NOT
+// USE THEM IN USER CODE.
+
+#include "gmock/internal/gmock-internal-utils.h"
+
+#include <ctype.h>
+
+#include <array>
+#include <cctype>
+#include <cstdint>
+#include <cstring>
+#include <ostream> // NOLINT
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gmock/internal/gmock-port.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+// Pairs names[i] with values[i] and joins them as "(n0: v0, n1: v1, ...)";
+// returns "" (no parentheses) when there are no values.
+GTEST_API_ std::string JoinAsKeyValueTuple(
+ const std::vector<const char*>& names, const Strings& values) {
+ GTEST_CHECK_(names.size() == values.size());
+ if (values.empty()) {
+ return "";
+ }
+ const auto build_one = [&](const size_t i) {
+ return std::string(names[i]) + ": " + values[i];
+ };
+ std::string result = "(" + build_one(0);
+ for (size_t i = 1; i < values.size(); i++) {
+ result += ", ";
+ result += build_one(i);
+ }
+ result += ")";
+ return result;
+}
+
+// Converts an identifier name to a space-separated list of lower-case
+// words. Each maximum substring of the form [A-Za-z][a-z]*|\d+ is
+// treated as one word. For example, both "FooBar123" and
+// "foo_bar_123" are converted to "foo bar 123".
+GTEST_API_ std::string ConvertIdentifierNameToWords(const char* id_name) {
+ std::string result;
+ char prev_char = '\0';  // Character before *p; '\0' at the start.
+ for (const char* p = id_name; *p != '\0'; prev_char = *(p++)) {
+ // We don't care about the current locale as the input is
+ // guaranteed to be a valid C++ identifier name.
+ const bool starts_new_word = IsUpper(*p) ||
+ (!IsAlpha(prev_char) && IsLower(*p)) ||
+ (!IsDigit(prev_char) && IsDigit(*p));
+
+ if (IsAlNum(*p)) {  // Non-alphanumerics (e.g. '_') are dropped.
+ if (starts_new_word && result != "") result += ' ';
+ result += ToLower(*p);
+ }
+ }
+ return result;
+}
+
+// This class reports Google Mock failures as Google Test failures. A
+// user can define another class in a similar fashion if they intend to
+// use Google Mock with a testing framework other than Google Test.
+class GoogleTestFailureReporter : public FailureReporterInterface {
+ public:
+ void ReportFailure(FailureType type, const char* file, int line,
+ const std::string& message) override {
+ AssertHelper(type == kFatal ? TestPartResult::kFatalFailure
+ : TestPartResult::kNonFatalFailure,
+ file, line, message.c_str()) = Message();
+ if (type == kFatal) {  // Fatal failures abort once reported.
+ posix::Abort();
+ }
+ }
+};
+
+// Returns the global failure reporter. Will create a
+// GoogleTestFailureReporter and return it the first time called.
+GTEST_API_ FailureReporterInterface* GetFailureReporter() {
+ // Points to the global failure reporter used by Google Mock. The
+ // function-local static is initialized on the first call; C++11 and
+ // later guarantee that this initialization is thread-safe, so no
+ // additional synchronization is needed here. The reporter is
+ // allocated once and is not deleted by this function.
+ static FailureReporterInterface* const failure_reporter =
+ new GoogleTestFailureReporter();
+ return failure_reporter;
+}
+
+// Protects global resources (stdout in particular) used by Log().
+static GTEST_DEFINE_STATIC_MUTEX_(g_log_mutex);
+
+// Returns true if and only if a log with the given severity is visible
+// according to the --gmock_verbose flag.
+GTEST_API_ bool LogIsVisible(LogSeverity severity) {
+ if (GMOCK_FLAG_GET(verbose) == kInfoVerbosity) {
+ // Always show the log if --gmock_verbose=info.
+ return true;
+ } else if (GMOCK_FLAG_GET(verbose) == kErrorVerbosity) {
+ // Always hide it if --gmock_verbose=error.
+ return false;
+ } else {
+ // If --gmock_verbose is neither "info" nor "error", we treat it
+ // as "warning" (its default value).
+ return severity == kWarning;  // Only warnings pass at this level.
+ }
+}
+
+// Prints the given message to stdout if and only if 'severity' >= the level
+// specified by the --gmock_verbose flag. If stack_frames_to_skip >=
+// 0, also prints the stack trace excluding the top
+// stack_frames_to_skip frames. In opt mode, any positive
+// stack_frames_to_skip is treated as 0, since we don't know which
+// function calls will be inlined by the compiler and need to be
+// conservative.
+GTEST_API_ void Log(LogSeverity severity, const std::string& message,
+ int stack_frames_to_skip) {
+ if (!LogIsVisible(severity)) return;
+
+ // Ensures that logs from different threads don't interleave.
+ MutexLock l(&g_log_mutex);
+
+ if (severity == kWarning) {
+ // Prints a GMOCK WARNING marker to make the warnings easily searchable.
+ std::cout << "\nGMOCK WARNING:";
+ }
+ // Pre-pends a new-line to message if it doesn't start with one.
+ if (message.empty() || message[0] != '\n') {
+ std::cout << "\n";
+ }
+ std::cout << message;
+ if (stack_frames_to_skip >= 0) {
+#ifdef NDEBUG
+ // In opt mode, we have to be conservative and skip no stack frame.
+ const int actual_to_skip = 0;
+#else
+ // In dbg mode, we can do what the caller tells us to do (plus one
+ // for skipping this function's stack frame).
+ const int actual_to_skip = stack_frames_to_skip + 1;
+#endif // NDEBUG
+
+ // Appends a new-line to message if it doesn't end with one.
+ if (!message.empty() && *message.rbegin() != '\n') {
+ std::cout << "\n";
+ }
+ std::cout << "Stack trace:\n"
+ << ::testing::internal::GetCurrentOsStackTraceExceptTop(
+ ::testing::UnitTest::GetInstance(), actual_to_skip);
+ }
+ std::cout << ::std::flush;
+}
+
+GTEST_API_ WithoutMatchers GetWithoutMatchers() { return WithoutMatchers(); }
+
+GTEST_API_ void IllegalDoDefault(const char* file, int line) {
+ internal::Assert(
+ false, file, line,  // The checked condition is hard-coded to false.
+ "You are using DoDefault() inside a composite action like "
+ "DoAll() or WithArgs(). This is not supported for technical "
+ "reasons. Please instead spell out the default action, or "
+ "assign the default action to an Action variable and use "
+ "the variable in various places.");
+}
+
+constexpr char UnBase64Impl(char c, const char* const base64, char carry) {
+ return *base64 == 0 ? static_cast<char>(65)  // End of alphabet: no match.
+ : *base64 == c ? carry  // Match: carry counts symbols skipped.
+ : UnBase64Impl(c, base64 + 1, carry + 1);  // Try the next symbol.
+}
+
+template <size_t... I>
+constexpr std::array<char, 256> UnBase64Impl(IndexSequence<I...>,
+ const char* const base64) {
+ return {{UnBase64Impl(static_cast<char>(I), base64, 0)...}};  // Per index I.
+}
+
+constexpr std::array<char, 256> UnBase64(const char* const base64) {
+ return UnBase64Impl(MakeIndexSequence<256>{}, base64);  // 256 entries.
+}
+
+static constexpr char kBase64[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static constexpr std::array<char, 256> kUnBase64 = UnBase64(kBase64);
+
+// Base64-decodes `encoded`; on invalid input clears `*decoded`, returns false.
+bool Base64Unescape(const std::string& encoded, std::string* decoded) {
+  decoded->clear();
+  size_t encoded_len = encoded.size();
+  decoded->reserve(3 * (encoded_len / 4) + (encoded_len % 4));
+  int bit_pos = 0;
+  char dst = 0;
+  for (const char ch : encoded) {
+    const int src = static_cast<unsigned char>(ch);  // Keep src in 0..255.
+    if (std::isspace(src) || src == '=') continue;  // Skip padding/blanks.
+    char src_bin = kUnBase64[static_cast<size_t>(src)];
+    if (src_bin >= 64) {
+      decoded->clear();
+      return false;
+    }
+    if (bit_pos == 0) {
+      dst |= static_cast<char>(src_bin << 2);
+      bit_pos = 6;
+    } else {
+      dst |= static_cast<char>(src_bin >> (bit_pos - 2));
+      decoded->push_back(dst);
+      dst = static_cast<char>(src_bin << (10 - bit_pos));
+      bit_pos = (bit_pos + 6) % 8;
+    }
+  }
+  return true;
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock-matchers.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-matchers.cc
new file mode 100644
index 0000000000..a8d04a6da0
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-matchers.cc
@@ -0,0 +1,462 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements Matcher<const string&>, Matcher<string>, and
+// utilities for defining matchers.
+
+#include "gmock/gmock-matchers.h"
+
+#include <string.h>
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace testing {
+namespace internal {
+
+// Builds the description of a matcher defined with a MATCHER*() macro whose
+// user-supplied description string is "". The matcher name is converted to
+// words and, when there are parameters, followed by the key/value tuple made
+// from 'param_names' and 'param_values' (the print-outs of the parameters).
+// When 'negation' is true the result is wrapped as "not (...)".
+GTEST_API_ std::string FormatMatcherDescription(
+    bool negation, const char* matcher_name,
+    const std::vector<const char*>& param_names, const Strings& param_values) {
+  std::string description = ConvertIdentifierNameToWords(matcher_name);
+  if (!param_values.empty()) {
+    description += " " + JoinAsKeyValueTuple(param_names, param_values);
+  }
+  if (negation) {
+    return "not (" + description + ")";
+  }
+  return description;
+}
+
+// FindMaxBipartiteMatching and its helper class.
+//
+// Uses the well-known Ford-Fulkerson max flow method to find a maximum
+// bipartite matching. Flow is considered to be from left to right.
+// There is an implicit source node that is connected to all of the left
+// nodes, and an implicit sink node that is connected to all of the
+// right nodes. All edges have unit capacity.
+//
+// Neither the flow graph nor the residual flow graph are represented
+// explicitly. Instead, they are implied by the information in 'graph' and
+// a vector<int> called 'left_' whose elements are initialized to the
+// value kUnused. This represents the initial state of the algorithm,
+// where the flow graph is empty, and the residual flow graph has the
+// following edges:
+// - An edge from source to each left_ node
+// - An edge from each right_ node to sink
+// - An edge from each left_ node to each right_ node, if the
+// corresponding edge exists in 'graph'.
+//
+// When the TryAugment() method adds a flow, it sets left_[l] = r for some
+// nodes l and r. This induces the following changes:
+// - The edges (source, l), (l, r), and (r, sink) are added to the
+// flow graph.
+// - The same three edges are removed from the residual flow graph.
+// - The reverse edges (l, source), (r, l), and (sink, r) are added
+// to the residual flow graph, which is a directional graph
+// representing unused flow capacity.
+//
+// When the method augments a flow (moving left_[l] from some r1 to some
+// other r2), this can be thought of as "undoing" the above steps with
+// respect to r1 and "redoing" them with respect to r2.
+//
+// It bears repeating that the flow graph and residual flow graph are
+// never represented explicitly, but can be derived by looking at the
+// information in 'graph' and in left_.
+//
+// As an optimization, there is a second vector<int> called right_ which
+// does not provide any new information. Instead, it enables more
+// efficient queries about edges entering or leaving the right-side nodes
+// of the flow or residual flow graphs. The following invariants are
+// maintained:
+//
+// left[l] == kUnused or right[left[l]] == l
+// right[r] == kUnused or left[right[r]] == r
+//
+// . [ source ] .
+// . ||| .
+// . ||| .
+// . ||\--> left[0]=1 ---\ right[0]=-1 ----\ .
+// . || | | .
+// . |\---> left[1]=-1 \--> right[1]=0 ---\| .
+// . | || .
+// . \----> left[2]=2 ------> right[2]=2 --\|| .
+// . ||| .
+// . elements matchers vvv .
+// . [ sink ] .
+//
+// See Also:
+// [1] Cormen, et al (2001). "Section 26.2: The Ford-Fulkerson method".
+// "Introduction to Algorithms (Second ed.)", pp. 651-664.
+// [2] "Ford-Fulkerson algorithm", Wikipedia,
+// 'http://en.wikipedia.org/wiki/Ford%E2%80%93Fulkerson_algorithm'
+class MaxBipartiteMatchState {
+ public:
+ // Stores a pointer to 'graph' (not owned) and marks every left and right
+ // node as unmatched.
+ explicit MaxBipartiteMatchState(const MatchMatrix& graph)
+ : graph_(&graph),
+ left_(graph_->LhsSize(), kUnused),
+ right_(graph_->RhsSize(), kUnused) {}
+
+ // Returns the edges of a maximum matching, each in the form {left, right}.
+ ElementMatcherPairs Compute() {
+ // 'seen' is used for path finding { 0: unseen, 1: seen }.
+ ::std::vector<char> seen;
+ // Searches the residual flow graph for a path from each left node to
+ // the sink in the residual flow graph, and if one is found, add flow
+ // to the graph. It's okay to search through the left nodes once. The
+ // edge from the implicit source node to each previously-visited left
+ // node will have flow if that left node has any path to the sink
+ // whatsoever. Subsequent augmentations can only add flow to the
+ // network, and cannot take away that previous flow unit from the source.
+ // Since the source-to-left edge can only carry one flow unit (or,
+ // each element can be matched to only one matcher), there is no need
+ // to visit the left nodes more than once looking for augmented paths.
+ // The flow is known to be possible or impossible by looking at the
+ // node once.
+ for (size_t ilhs = 0; ilhs < graph_->LhsSize(); ++ilhs) {
+ // Reset the path-marking vector and try to find a path from
+ // source to sink starting at the left_[ilhs] node.
+ GTEST_CHECK_(left_[ilhs] == kUnused)
+ << "ilhs: " << ilhs << ", left_[ilhs]: " << left_[ilhs];
+ // 'seen' initialized to 'graph_->RhsSize()' copies of 0.
+ seen.assign(graph_->RhsSize(), 0);
+ TryAugment(ilhs, &seen);
+ }
+ // Collect every matched left node together with its partner.
+ ElementMatcherPairs result;
+ for (size_t ilhs = 0; ilhs < left_.size(); ++ilhs) {
+ size_t irhs = left_[ilhs];
+ if (irhs == kUnused) continue;
+ result.push_back(ElementMatcherPair(ilhs, irhs));
+ }
+ return result;
+ }
+
+ private:
+ // Sentinel stored in left_/right_ for a node that is not matched.
+ static const size_t kUnused = static_cast<size_t>(-1);
+
+ // Perform a depth-first search from left node ilhs to the sink. If a
+ // path is found, flow is added to the network by linking the left and
+ // right vector elements corresponding each segment of the path.
+ // Returns true if a path to sink was found, which means that a unit of
+ // flow was added to the network. The 'seen' vector elements correspond
+ // to right nodes and are marked to eliminate cycles from the search.
+ //
+ // Left nodes will only be explored at most once because they
+ // are accessible from at most one right node in the residual flow
+ // graph.
+ //
+ // Note that left_[ilhs] is the only element of left_ that TryAugment will
+ // potentially transition from kUnused to another value. Any other
+ // left_ element holding kUnused before TryAugment will be holding it
+ // when TryAugment returns.
+ //
+ bool TryAugment(size_t ilhs, ::std::vector<char>* seen) {
+ for (size_t irhs = 0; irhs < graph_->RhsSize(); ++irhs) {
+ if ((*seen)[irhs]) continue;
+ if (!graph_->HasEdge(ilhs, irhs)) continue;
+ // There's an available edge from ilhs to irhs.
+ (*seen)[irhs] = 1;
+ // Next a search is performed to determine whether
+ // this edge is a dead end or leads to the sink.
+ //
+ // right_[irhs] == kUnused means that there is residual flow from
+ // right node irhs to the sink, so we can use that to finish this
+ // flow path and return success.
+ //
+ // Otherwise there is residual flow to some ilhs. We push flow
+ // along that path and call ourselves recursively to see if this
+ // ultimately leads to sink.
+ if (right_[irhs] == kUnused || TryAugment(right_[irhs], seen)) {
+ // Add flow from left_[ilhs] to right_[irhs].
+ left_[ilhs] = irhs;
+ right_[irhs] = ilhs;
+ return true;
+ }
+ }
+ // No unseen right node reachable from ilhs leads to the sink.
+ return false;
+ }
+
+ const MatchMatrix* graph_; // not owned
+ // Each element of the left_ vector represents a left hand side node
+ // (i.e. an element) and each element of right_ is a right hand side
+ // node (i.e. a matcher). The values in the left_ vector indicate
+ // outflow from that node to a node on the right_ side. The values
+ // in the right_ indicate inflow, and specify which left_ node is
+ // feeding that right_ node, if any. For example, left_[3] == 1 means
+ // there's a flow from element #3 to matcher #1. Such a flow would also
+ // be redundantly represented in the right_ vector as right_[1] == 3.
+ // Elements of left_ and right_ are either kUnused or mutually
+ // referent. Mutually referent means that left_[right_[i]] = i and
+ // right_[left_[i]] = i.
+ ::std::vector<size_t> left_;
+ ::std::vector<size_t> right_;
+};
+
+// Out-of-class definition of the static constant that is declared (with its
+// initializer) inside the class body.
+const size_t MaxBipartiteMatchState::kUnused;
+
+// Returns the {element, matcher} pairs of a maximum bipartite matching of
+// the graph 'g'.
+GTEST_API_ ElementMatcherPairs FindMaxBipartiteMatching(const MatchMatrix& g) {
+  MaxBipartiteMatchState state(g);
+  return state.Compute();
+}
+
+// Prints 'pairs' to '*stream' as a brace-enclosed, comma-separated list with
+// one "(element #E, matcher #M)" entry per line.
+static void LogElementMatcherPairVec(const ElementMatcherPairs& pairs,
+                                     ::std::ostream* stream) {
+  *stream << "{";
+  bool first = true;
+  for (const auto& pairing : pairs) {
+    if (!first) {
+      *stream << ",";
+    }
+    first = false;
+    *stream << "\n ("
+            << "element #" << pairing.first << ", "
+            << "matcher #" << pairing.second << ")";
+  }
+  *stream << "\n}";
+}
+
+// Advances the matrix to the next edge configuration by treating the cells,
+// visited in (ilhs, irhs) order, as the digits of a binary counter: the first
+// cell holding 0 is set to 1 and all cells visited before it are reset to 0.
+// Returns false once every cell was 1, leaving all cells reset to 0.
+bool MatchMatrix::NextGraph() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      char& cell = matched_[SpaceIndex(ilhs, irhs)];
+      if (cell) {
+        // Carry: clear this digit and move on to the next one.
+        cell = 0;
+        continue;
+      }
+      cell = 1;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Sets every cell of the matrix to the low bit of a fresh rand() value.
+void MatchMatrix::Randomize() {
+  for (size_t ilhs = 0; ilhs < LhsSize(); ++ilhs) {
+    for (size_t irhs = 0; irhs < RhsSize(); ++irhs) {
+      matched_[SpaceIndex(ilhs, irhs)] =
+          static_cast<char>(rand() & 1);  // NOLINT
+    }
+  }
+}
+
+// Renders the matrix as text: the HasEdge() value of each cell, one row per
+// left-hand node, with rows separated by ';'.
+std::string MatchMatrix::DebugString() const {
+  ::std::stringstream out;
+  for (size_t row = 0; row < LhsSize(); ++row) {
+    if (row != 0) {
+      out << ";";
+    }
+    for (size_t col = 0; col < RhsSize(); ++col) {
+      out << HasEdge(row, col);
+    }
+  }
+  return out.str();
+}
+
+// Writes a description of this matcher to '*os'. An exact-match matcher with
+// zero or one sub-matcher gets a short special-case description; every other
+// case prints a header line followed by one line per sub-matcher.
+void UnorderedElementsAreMatcherImplBase::DescribeToImpl(
+    ::std::ostream* os) const {
+  switch (match_flags()) {
+    case UnorderedMatcherRequire::ExactMatch: {
+      const auto count = matcher_describers_.size();
+      if (count == 0) {
+        *os << "is empty";
+        return;
+      }
+      if (count == 1) {
+        *os << "has " << Elements(1) << " and that element ";
+        matcher_describers_[0]->DescribeTo(os);
+        return;
+      }
+      *os << "has " << Elements(count)
+          << " and there exists some permutation of elements such that:\n";
+      break;
+    }
+    case UnorderedMatcherRequire::Superset:
+      *os << "a surjection from elements to requirements exists such that:\n";
+      break;
+    case UnorderedMatcherRequire::Subset:
+      *os << "an injection from elements to requirements exists such that:\n";
+      break;
+  }
+
+  const bool exact = match_flags() == UnorderedMatcherRequire::ExactMatch;
+  const char* const item_separator = exact ? ", and\n" : "\n";
+  for (size_t i = 0; i != matcher_describers_.size(); ++i) {
+    if (i != 0) {
+      *os << item_separator;
+    }
+    if (exact) {
+      *os << " - element #" << i << " ";
+    } else {
+      *os << " - an element ";
+    }
+    matcher_describers_[i]->DescribeTo(os);
+  }
+}
+
+// Writes a description of the negation of this matcher to '*os'. Apart from
+// the single-sub-matcher special case, only the header line is negated; the
+// per-sub-matcher lines below it use the positive descriptions.
+void UnorderedElementsAreMatcherImplBase::DescribeNegationToImpl(
+    ::std::ostream* os) const {
+  switch (match_flags()) {
+    case UnorderedMatcherRequire::ExactMatch: {
+      const auto count = matcher_describers_.size();
+      if (count == 0) {
+        *os << "isn't empty";
+        return;
+      }
+      if (count == 1) {
+        *os << "doesn't have " << Elements(1) << ", or has " << Elements(1)
+            << " that ";
+        matcher_describers_[0]->DescribeNegationTo(os);
+        return;
+      }
+      *os << "doesn't have " << Elements(count)
+          << ", or there exists no permutation of elements such that:\n";
+      break;
+    }
+    case UnorderedMatcherRequire::Superset:
+      *os << "no surjection from elements to requirements exists such that:\n";
+      break;
+    case UnorderedMatcherRequire::Subset:
+      *os << "no injection from elements to requirements exists such that:\n";
+      break;
+  }
+
+  const bool exact = match_flags() == UnorderedMatcherRequire::ExactMatch;
+  const char* const item_separator = exact ? ", and\n" : "\n";
+  for (size_t i = 0; i != matcher_describers_.size(); ++i) {
+    if (i != 0) {
+      *os << item_separator;
+    }
+    if (exact) {
+      *os << " - element #" << i << " ";
+    } else {
+      *os << " - an element ";
+    }
+    matcher_describers_[i]->DescribeTo(os);
+  }
+}
+
+// Cheap pre-check run before the full pairing search. When the Superset flag
+// is set, every matcher must match at least one element; when the Subset
+// flag is set, every element must match at least one matcher. Returns false
+// if and only if one of those requirements is violated, and in that case
+// explains the violations to 'listener' when it is interested.
+bool UnorderedElementsAreMatcherImplBase::VerifyMatchMatrix(
+    const ::std::vector<std::string>& element_printouts,
+    const MatchMatrix& matrix, MatchResultListener* listener) const {
+  const size_t num_elements = matrix.LhsSize();
+  const size_t num_matchers = matrix.RhsSize();
+  ::std::vector<char> element_has_match(num_elements, 0);
+  ::std::vector<char> matcher_has_match(num_matchers, 0);
+
+  // Record, per element and per matcher, whether it takes part in any edge.
+  for (size_t ei = 0; ei != num_elements; ++ei) {
+    for (size_t mi = 0; mi != num_matchers; ++mi) {
+      if (matrix.HasEdge(ei, mi)) {
+        element_has_match[ei] = 1;
+        matcher_has_match[mi] = 1;
+      }
+    }
+  }
+
+  bool all_ok = true;
+
+  if (match_flags() & UnorderedMatcherRequire::Superset) {
+    const char* prefix =
+        "where the following matchers don't match any elements:\n";
+    for (size_t mi = 0; mi != matcher_has_match.size(); ++mi) {
+      if (!matcher_has_match[mi]) {
+        all_ok = false;
+        if (listener->IsInterested()) {
+          *listener << prefix << "matcher #" << mi << ": ";
+          matcher_describers_[mi]->DescribeTo(listener->stream());
+          prefix = ",\n";
+        }
+      }
+    }
+  }
+
+  if (match_flags() & UnorderedMatcherRequire::Subset) {
+    const char* prefix =
+        "where the following elements don't match any matchers:\n";
+    // Joins this section to the matcher section above when that one failed.
+    const char* section_joiner = all_ok ? "" : "\nand ";
+    for (size_t ei = 0; ei != element_has_match.size(); ++ei) {
+      if (!element_has_match[ei]) {
+        all_ok = false;
+        if (listener->IsInterested()) {
+          *listener << section_joiner << prefix << "element #" << ei << ": "
+                    << element_printouts[ei];
+          prefix = ",\n";
+          section_joiner = "";
+        }
+      }
+    }
+  }
+  return all_ok;
+}
+
+// Computes a maximum matching of 'matrix' and checks it against the match
+// flags: with Superset every matcher must be paired, with Subset every
+// element must be paired. On failure, explains the closest match to
+// 'listener' (when interested) and returns false. On success, lists the
+// pairings when there is more than one and returns true.
+bool UnorderedElementsAreMatcherImplBase::FindPairing(
+    const MatchMatrix& matrix, MatchResultListener* listener) const {
+  const ElementMatcherPairs matches = FindMaxBipartiteMatching(matrix);
+  const size_t max_flow = matches.size();
+
+  const bool unmatched_matcher =
+      (match_flags() & UnorderedMatcherRequire::Superset) &&
+      max_flow < matrix.RhsSize();
+  if (unmatched_matcher) {
+    if (listener->IsInterested()) {
+      *listener << "where no permutation of the elements can satisfy all "
+                   "matchers, and the closest match is "
+                << max_flow << " of " << matrix.RhsSize()
+                << " matchers with the pairings:\n";
+      LogElementMatcherPairVec(matches, listener->stream());
+    }
+    return false;
+  }
+
+  const bool unmatched_element =
+      (match_flags() & UnorderedMatcherRequire::Subset) &&
+      max_flow < matrix.LhsSize();
+  if (unmatched_element) {
+    if (listener->IsInterested()) {
+      // NOTE(review): the condition compares against LhsSize() (elements)
+      // while the message reports RhsSize() "matchers" - confirm this is
+      // the intended wording before changing it.
+      *listener
+          << "where not all elements can be matched, and the closest match is "
+          << max_flow << " of " << matrix.RhsSize()
+          << " matchers with the pairings:\n";
+      LogElementMatcherPairVec(matches, listener->stream());
+    }
+    return false;
+  }
+
+  if (matches.size() > 1 && listener->IsInterested()) {
+    const char* sep = "where:\n";
+    for (const auto& pairing : matches) {
+      *listener << sep << " - element #" << pairing.first
+                << " is matched by matcher #" << pairing.second;
+      sep = ",\n";
+    }
+  }
+  return true;
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
new file mode 100644
index 0000000000..658ad3fa22
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc
@@ -0,0 +1,781 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Mock - a framework for writing C++ mock classes.
+//
+// This file implements the spec builder syntax (ON_CALL and
+// EXPECT_CALL).
+
+#include "gmock/gmock-spec-builders.h"
+
+#include <stdlib.h>
+
+#include <iostream> // NOLINT
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_CYGWIN || GTEST_OS_LINUX || GTEST_OS_MAC
+#include <unistd.h> // NOLINT
+#endif
+
+// Silence C4800 (C4800: 'int *const ': forcing value
+// to bool 'true' or 'false') for MSVC 14 (Visual Studio 2015, _MSC_VER 1900)
+#ifdef _MSC_VER
+#if _MSC_VER == 1900
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+#endif
+
+namespace testing {
+namespace internal {
+
+// Protects the mock object registry (in class Mock), all function
+// mockers, and all expectations.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_gmock_mutex);
+
+// Logs 'message' at the given severity, prefixed with the formatted
+// 'file'/'line' location and terminated by a newline.
+GTEST_API_ void LogWithLocation(testing::internal::LogSeverity severity,
+                                const char* file, int line,
+                                const std::string& message) {
+  ::std::ostringstream text;
+  text << internal::FormatFileLocation(file, line);
+  text << " " << message << ::std::endl;
+  Log(severity, text.str(), 0);
+}
+
+// Constructs an ExpectationBase object. 'a_file' and 'a_line' give the
+// source location of the expectation and 'a_source_text' its source text.
+// The expectation starts with no explicitly specified cardinality (the
+// stored cardinality is Exactly(1)), a call count of 0, not retired, and
+// with no clause seen yet.
+ExpectationBase::ExpectationBase(const char* a_file, int a_line,
+ const std::string& a_source_text)
+ : file_(a_file),
+ line_(a_line),
+ source_text_(a_source_text),
+ cardinality_specified_(false),
+ cardinality_(Exactly(1)),
+ call_count_(0),
+ retired_(false),
+ extra_matcher_specified_(false),
+ repeated_action_specified_(false),
+ retires_on_saturation_(false),
+ last_clause_(kNone),
+ action_count_checked_(false) {}
+
+// Destructs an ExpectationBase object.
+ExpectationBase::~ExpectationBase() {}
+
+// Records 'a_cardinality' as the explicitly specified cardinality of this
+// expectation. Subclasses use this to implement the .Times() clause.
+void ExpectationBase::SpecifyCardinality(const Cardinality& a_cardinality) {
+  cardinality_ = a_cardinality;
+  cardinality_specified_ = true;
+}
+
+// Retires every direct and indirect pre-requisite of this expectation that
+// is not retired yet. The prerequisite graph is walked iteratively with an
+// explicit stack instead of recursion.
+void ExpectationBase::RetireAllPreRequisites()
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  // Short-cut: an expectation is never retired until all of its
+  // pre-requisites have been retired.
+  if (is_retired()) return;
+
+  ::std::vector<ExpectationBase*> pending(1, this);
+  while (!pending.empty()) {
+    ExpectationBase* const current = pending.back();
+    pending.pop_back();
+
+    for (const auto& prerequisite : current->immediate_prerequisites_) {
+      ExpectationBase* const base = prerequisite.expectation_base().get();
+      if (base->is_retired()) continue;
+      base->Retire();
+      pending.push_back(base);
+    }
+  }
+}
+
+// Returns true if and only if every direct and indirect pre-requisite of
+// this expectation is satisfied. The search stops at the first unsatisfied
+// one.
+bool ExpectationBase::AllPrerequisitesAreSatisfied() const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+  ::std::vector<const ExpectationBase*> pending(1, this);
+  while (!pending.empty()) {
+    const ExpectationBase* const current = pending.back();
+    pending.pop_back();
+
+    for (const auto& prerequisite : current->immediate_prerequisites_) {
+      const ExpectationBase* const base =
+          prerequisite.expectation_base().get();
+      if (!base->IsSatisfied()) {
+        return false;
+      }
+      pending.push_back(base);
+    }
+  }
+  return true;
+}
+
+// Adds the unsatisfied pre-requisites of this expectation to '*result'.
+void ExpectationBase::FindUnsatisfiedPrerequisites(ExpectationSet* result) const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+  ::std::vector<const ExpectationBase*> pending(1, this);
+  while (!pending.empty()) {
+    const ExpectationBase* const current = pending.back();
+    pending.pop_back();
+
+    for (const auto& prerequisite : current->immediate_prerequisites_) {
+      const ExpectationBase* const base =
+          prerequisite.expectation_base().get();
+
+      if (!base->IsSatisfied()) {
+        // An unsatisfied pre-requisite is reported as-is; whether its own
+        // pre-requisites are satisfied is not interesting any more, so the
+        // search does not descend into it.
+        *result += prerequisite;
+        continue;
+      }
+      // A satisfied pre-requisite that has never been called may still have
+      // unsatisfied pre-requisites of its own.
+      if (base->call_count_ == 0) {
+        pending.push_back(base);
+      }
+    }
+  }
+}
+
+// Writes to '*os' the expected cardinality, the actual call count, and the
+// state of this expectation (its saturation level and whether it is retired
+// or still active).
+void ExpectationBase::DescribeCallCountTo(::std::ostream* os) const
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+  g_gmock_mutex.AssertHeld();
+
+  // Expected versus actual number of calls.
+  *os << " Expected: to be ";
+  cardinality().DescribeTo(os);
+  *os << "\n Actual: ";
+  Cardinality::DescribeActualCallCountTo(call_count(), os);
+
+  // State of the expectation, checked from the strongest condition down.
+  const char* saturation = "unsatisfied";
+  if (IsOverSaturated()) {
+    saturation = "over-saturated";
+  } else if (IsSaturated()) {
+    saturation = "saturated";
+  } else if (IsSatisfied()) {
+    saturation = "satisfied";
+  }
+  const char* const liveness = is_retired() ? "retired" : "active";
+  *os << " - " << saturation << " and " << liveness;
+}
+
+// Checks the action count (i.e. the number of WillOnce() and
+// WillRepeatedly() clauses) against the cardinality if this hasn't
+// been done before. Prints a warning if there are too many or too
+// few actions.
+void ExpectationBase::CheckActionCountIfNotDone() const
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ bool should_check = false;
+ {
+ // Test and set action_count_checked_ under mutex_; the check itself
+ // runs after the lock has been released.
+ MutexLock l(&mutex_);
+ if (!action_count_checked_) {
+ action_count_checked_ = true;
+ should_check = true;
+ }
+ }
+
+ if (should_check) {
+ if (!cardinality_specified_) {
+ // The cardinality was inferred - no need to check the action
+ // count against it.
+ return;
+ }
+
+ // The cardinality was explicitly specified.
+ const int action_count = static_cast<int>(untyped_actions_.size());
+ const int upper_bound = cardinality().ConservativeUpperBound();
+ const int lower_bound = cardinality().ConservativeLowerBound();
+ bool too_many; // True if there are too many actions, or false
+ // if there are too few.
+ if (action_count > upper_bound ||
+ (action_count == upper_bound && repeated_action_specified_)) {
+ too_many = true;
+ } else if (0 < action_count && action_count < lower_bound &&
+ !repeated_action_specified_) {
+ too_many = false;
+ } else {
+ // Neither too many nor too few actions: nothing to warn about.
+ return;
+ }
+
+ // Build and log the warning text.
+ ::std::stringstream ss;
+ DescribeLocationTo(&ss);
+ ss << "Too " << (too_many ? "many" : "few") << " actions specified in "
+ << source_text() << "...\n"
+ << "Expected to be ";
+ cardinality().DescribeTo(&ss);
+ ss << ", but has " << (too_many ? "" : "only ") << action_count
+ << " WillOnce()" << (action_count == 1 ? "" : "s");
+ if (repeated_action_specified_) {
+ ss << " and a WillRepeatedly()";
+ }
+ ss << ".";
+ Log(kWarning, ss.str(), -1); // -1 means "don't print stack trace".
+ }
+}
+
+// Implements the .Times() clause: checks that the clause appears at most
+// once and before the clauses that must follow it, then records
+// 'a_cardinality' as the explicitly specified cardinality.
+void ExpectationBase::UntypedTimes(const Cardinality& a_cardinality) {
+  if (last_clause_ != kTimes) {
+    ExpectSpecProperty(
+        last_clause_ < kTimes,
+        ".Times() may only appear *before* .InSequence(), .WillOnce(), "
+        ".WillRepeatedly(), or .RetiresOnSaturation(), not after.");
+  } else {
+    ExpectSpecProperty(false,
+                       ".Times() cannot appear "
+                       "more than once in an EXPECT_CALL().");
+  }
+  last_clause_ = kTimes;
+
+  SpecifyCardinality(a_cardinality);
+}
+
+// Points to the implicit sequence introduced by a living InSequence
+// object (if any) in the current thread or NULL.
+GTEST_API_ ThreadLocal<Sequence*> g_gmock_implicit_sequence;
+
+// Reports an uninteresting call, described by 'msg', according to
+// 'reaction': kAllow logs it as info, kWarn logs it as a warning with an
+// explanatory note appended, and any other value reports a failure.
+void ReportUninterestingCall(CallReaction reaction, const std::string& msg) {
+  // Include a stack trace only if --gmock_verbose=info is specified.
+  const int stack_frames_to_skip =
+      GMOCK_FLAG_GET(verbose) == kInfoVerbosity ? 3 : -1;
+  if (reaction == kAllow) {
+    Log(kInfo, msg, stack_frames_to_skip);
+  } else if (reaction == kWarn) {
+    Log(kWarning,
+        msg +
+            "\nNOTE: You can safely ignore the above warning unless this "
+            "call should not happen. Do not suppress it by blindly adding "
+            "an EXPECT_CALL() if you don't mean to enforce the call. "
+            "See "
+            "https://github.com/google/googletest/blob/master/docs/"
+            "gmock_cook_book.md#"
+            "knowing-when-to-expect for details.\n",
+        stack_frames_to_skip);
+  } else {  // FAIL
+    Expect(false, nullptr, -1, msg);
+  }
+}
+
+// Constructs a function mocker that has no owning mock object yet and an
+// empty name.
+UntypedFunctionMockerBase::UntypedFunctionMockerBase()
+ : mock_obj_(nullptr), name_("") {}
+
+// Destructs the function mocker; the body is empty.
+UntypedFunctionMockerBase::~UntypedFunctionMockerBase() {}
+
+// Sets the mock object this mock method belongs to, and registers
+// this information in the global mock registry. Will be called
+// whenever an EXPECT_CALL() or ON_CALL() is executed on this mock
+// method.
+void UntypedFunctionMockerBase::RegisterOwner(const void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ {
+ // g_gmock_mutex is held only while mock_obj_ is updated; the scope ends
+ // before Mock::Register() is called.
+ MutexLock l(&g_gmock_mutex);
+ mock_obj_ = mock_obj;
+ }
+ Mock::Register(mock_obj, this);
+}
+
+// Records both the mock object that owns this mock method and the name of
+// the mock function. Will be called upon each invocation of this mock
+// function.
+void UntypedFunctionMockerBase::SetOwnerAndName(const void* mock_obj,
+                                                const char* name)
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  // Both fields are written under g_gmock_mutex because this mock function
+  // may be called from two threads concurrently.
+  MutexLock l(&g_gmock_mutex);
+  name_ = name;
+  mock_obj_ = mock_obj;
+}
+
+// Returns the mock object this mock method belongs to. Must be called
+// after RegisterOwner() or SetOwnerAndName() has been called.
+const void* UntypedFunctionMockerBase::MockObject() const
+ GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+ const void* mock_obj;
+ {
+ // We protect mock_obj_ under g_gmock_mutex in case this mock
+ // function is called from two threads concurrently.
+ MutexLock l(&g_gmock_mutex);
+ Assert(mock_obj_ != nullptr, __FILE__, __LINE__,
+ "MockObject() must not be called before RegisterOwner() or "
+ "SetOwnerAndName() has been called.");
+ // Copy the pointer while the lock is held; it is returned after the
+ // lock has been released.
+ mock_obj = mock_obj_;
+ }
+ return mock_obj;
+}
+
+// Returns the name of this mock method. Must be called after
+// SetOwnerAndName() has been called.
+const char* UntypedFunctionMockerBase::Name() const
+    GTEST_LOCK_EXCLUDED_(g_gmock_mutex) {
+  // name_ is read under g_gmock_mutex in case this mock function is called
+  // from two threads concurrently. The lock is released when 'l' goes out of
+  // scope, after the return value has been copied.
+  MutexLock l(&g_gmock_mutex);
+  Assert(name_ != nullptr, __FILE__, __LINE__,
+         "Name() must not be called before SetOwnerAndName() has "
+         "been called.");
+  return name_;
+}
+
+// Returns an Expectation handle that references and co-owns 'exp', which
+// must be one of the expectations registered on this mock function.
+Expectation UntypedFunctionMockerBase::GetHandleOf(ExpectationBase* exp) {
+  // See the definition of untyped_expectations_ for why access to it
+  // is unprotected here.
+  for (const auto& owned : untyped_expectations_) {
+    if (owned.get() == exp) {
+      return Expectation(owned);
+    }
+  }
+
+  Assert(false, __FILE__, __LINE__, "Cannot find expectation.");
+  // Never executed; present only so that every path returns a value.
+  return Expectation();
+}
+
+// Verifies that all expectations on this mock function have been
+// satisfied. Reports one or more Google Test non-fatal failures
+// and returns false if not. All expectations are removed from this mock
+// function before returning. g_gmock_mutex must be held on entry; it is
+// released temporarily while the expectations are destroyed and is held
+// again on return.
+bool UntypedFunctionMockerBase::VerifyAndClearExpectationsLocked()
+ GTEST_EXCLUSIVE_LOCK_REQUIRED_(g_gmock_mutex) {
+ g_gmock_mutex.AssertHeld();
+ bool expectations_met = true;
+ for (UntypedExpectations::const_iterator it = untyped_expectations_.begin();
+ it != untyped_expectations_.end(); ++it) {
+ ExpectationBase* const untyped_expectation = it->get();
+ if (untyped_expectation->IsOverSaturated()) {
+ // There was an upper-bound violation. Since the error was
+ // already reported when it occurred, there is no need to do
+ // anything here.
+ expectations_met = false;
+ } else if (!untyped_expectation->IsSatisfied()) {
+ expectations_met = false;
+ ::std::stringstream ss;
+ ss << "Actual function call count doesn't match "
+ << untyped_expectation->source_text() << "...\n";
+ // No need to show the source file location of the expectation
+ // in the description, as the Expect() call that follows already
+ // takes care of it.
+ untyped_expectation->MaybeDescribeExtraMatcherTo(&ss);
+ untyped_expectation->DescribeCallCountTo(&ss);
+ Expect(false, untyped_expectation->file(), untyped_expectation->line(),
+ ss.str());
+ }
+ }
+
+ // Deleting our expectations may trigger other mock objects to be deleted, for
+ // example if an action contains a reference counted smart pointer to that
+ // mock object, and that is the last reference. So if we delete our
+ // expectations within the context of the global mutex we may deadlock when
+ // this method is called again. Instead, make a copy of the set of
+ // expectations to delete, clear our set within the mutex, and then clear the
+ // copied set outside of it.
+ UntypedExpectations expectations_to_delete;
+ untyped_expectations_.swap(expectations_to_delete);
+
+ // Destroy the expectations with the mutex released, then re-acquire it so
+ // that it is held again when this function returns.
+ g_gmock_mutex.Unlock();
+ expectations_to_delete.clear();
+ g_gmock_mutex.Lock();
+
+ return expectations_met;
+}
+
+CallReaction intToCallReaction(int mock_behavior) {
+  // Values outside [kAllow, kFail] fall back to the warning reaction.
+  const bool in_range = mock_behavior >= kAllow && mock_behavior <= kFail;
+  return in_range ? static_cast<internal::CallReaction>(mock_behavior)
+                  : kWarn;
+}
+
+} // namespace internal
+
+// Class Mock.
+
+namespace {
+
+typedef std::set<internal::UntypedFunctionMockerBase*> FunctionMockers;
+
+// Bookkeeping kept for each mock object. Google Mock uses it to detect
+// leaked mock objects and to verify a mock's expectations explicitly.
+struct MockObjectState {
+  // Source location of the first ON_CALL or EXPECT_CALL invoked on this
+  // mock object, and the test running at that time. The file stays null
+  // and the line stays -1 until such a use is recorded.
+  const char* first_used_file = nullptr;
+  int first_used_line = -1;
+  ::std::string first_used_test_suite;
+  ::std::string first_used_test;
+
+  // true if and only if it's OK to leak the object.
+  bool leakable = false;
+  // All registered methods of the object.
+  FunctionMockers function_mockers;
+};
+
+// A global registry of the mock objects that are alive, keyed by address.
+// A mock object is added to this registry the first time
+// Mock::AllowLeak(), ON_CALL(), or EXPECT_CALL() is called on it, and is
+// removed from the registry in the mock object's destructor.
+class MockObjectRegistry {
+ public:
+  // Maps a mock object (identified by its address) to its state.
+  typedef std::map<const void*, MockObjectState> StateMap;
+
+  // Called when the program exits, after all tests in it have been run.
+  // No mock object should be alive by then, so every object still in the
+  // registry is reported as an error unless the user marked it leakable.
+  // Nothing is checked when the catch_leaked_mocks flag is off.
+  ~MockObjectRegistry() {
+    if (!GMOCK_FLAG_GET(catch_leaked_mocks)) return;
+
+    int leaked_count = 0;
+    for (const StateMap::value_type& entry : states_) {
+      const MockObjectState& state = entry.second;
+      // The user said it's fine to leak this object.
+      if (state.leakable) continue;
+
+      // FIXME: Print the type of the leaked object.
+      // This can help the user identify the leaked object.
+      std::cout << "\n";
+      std::cout << internal::FormatFileLocation(state.first_used_file,
+                                                state.first_used_line);
+      std::cout << " ERROR: this mock object";
+      if (!state.first_used_test.empty()) {
+        std::cout << " (used in test " << state.first_used_test_suite << "."
+                  << state.first_used_test << ")";
+      }
+      std::cout << " should be deleted but never is. Its address is @"
+                << entry.first << ".";
+      ++leaked_count;
+    }
+    if (leaked_count == 0) return;
+
+    std::cout << "\nERROR: " << leaked_count << " leaked mock "
+              << (leaked_count == 1 ? "object" : "objects")
+              << " found at program exit. Expectations on a mock object are "
+                 "verified when the object is destructed. Leaking a mock "
+                 "means that its expectations aren't verified, which is "
+                 "usually a test bug. If you really intend to leak a mock, "
+                 "you can suppress this error using "
+                 "testing::Mock::AllowLeak(mock_object), or you may use a "
+                 "fake or stub instead of a mock.\n";
+    std::cout.flush();
+    ::std::cerr.flush();
+    // RUN_ALL_TESTS() has already returned when this destructor is
+    // called, so the normal Google Test failure reporting mechanism
+    // cannot be used. Terminate with a non-zero status instead.
+    _exit(1);  // We cannot call exit() as it is not reentrant and
+               // may already have been called.
+  }
+
+  // Gives mutable access to the underlying map.
+  StateMap& states() { return states_; }
+
+ private:
+  StateMap states_;
+};
+
+// Protected by g_gmock_mutex.
+MockObjectRegistry g_mock_object_registry;
+
+// Uninteresting-call reaction per mock object. Protected by g_gmock_mutex.
+std::unordered_map<uintptr_t, internal::CallReaction>&
+UninterestingCallReactionMap() {
+  typedef std::unordered_map<uintptr_t, internal::CallReaction> ReactionMap;
+  static ReactionMap* const reactions = new ReactionMap;
+  return *reactions;
+}
+
+// Records `reaction` as what Google Mock should do when an uninteresting
+// method of the mock object identified by `mock_obj` is called.
+void SetReactionOnUninterestingCalls(uintptr_t mock_obj,
+                                     internal::CallReaction reaction)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  UninterestingCallReactionMap()[mock_obj] = reaction;
+}
+
+} // namespace
+
+// Registers internal::kAllow as the reaction to uninteresting calls on the
+// given mock object, i.e. such calls are allowed.
+void Mock::AllowUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+ SetReactionOnUninterestingCalls(mock_obj, internal::kAllow);
+}
+
+// Registers internal::kWarn as the reaction to uninteresting calls on the
+// given mock object, i.e. the user is warned about such calls.
+void Mock::WarnUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+ SetReactionOnUninterestingCalls(mock_obj, internal::kWarn);
+}
+
+// Registers internal::kFail as the reaction to uninteresting calls on the
+// given mock object, i.e. such calls are treated as failures.
+void Mock::FailUninterestingCalls(uintptr_t mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+ SetReactionOnUninterestingCalls(mock_obj, internal::kFail);
+}
+
+// Removes the call-reaction table entry of a mock object that is being
+// destroyed. Erasing a key that has no entry leaves the table unchanged.
+void Mock::UnregisterCallReaction(uintptr_t mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  UninterestingCallReactionMap().erase(mock_obj);
+}
+
+// Returns the reaction Google Mock will have on uninteresting calls
+// made on the given mock object. An object without an entry of its own
+// gets the reaction selected by the default_mock_behavior flag.
+internal::CallReaction Mock::GetReactionOnUninterestingCalls(
+    const void* mock_obj) GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  // Single lookup; find() never inserts, unlike operator[].
+  const auto& reactions = UninterestingCallReactionMap();
+  const auto found = reactions.find(reinterpret_cast<uintptr_t>(mock_obj));
+  if (found != reactions.end()) return found->second;
+  return internal::intToCallReaction(GMOCK_FLAG_GET(default_mock_behavior));
+}
+
+// Marks mock_obj as leakable so that the leaked-mock check ignores it.
+void Mock::AllowLeak(const void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  MockObjectState& state = g_mock_object_registry.states()[mock_obj];
+  state.leakable = true;
+}
+
+// Verifies and clears all expectations on the given mock object while
+// holding the Google Mock mutex. Generates one or more Google Test
+// non-fatal failures and returns false if the expectations aren't met.
+bool Mock::VerifyAndClearExpectations(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  return VerifyAndClearExpectationsLocked(mock_obj);
+}
+
+// Clears the default actions of the given mock object, then verifies and
+// clears its expectations, all under the Google Mock mutex. Returns true
+// if and only if the verification was successful.
+bool Mock::VerifyAndClear(void* mock_obj)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  ClearDefaultActionsLocked(mock_obj);
+  return VerifyAndClearExpectationsLocked(mock_obj);
+}
+
+// Verifies and clears all expectations on the given mock object. If
+// the expectations aren't satisfied, generates one or more Google
+// Test non-fatal failures and returns false. An object that is not in
+// the registry has nothing to verify, so the result is true.
+bool Mock::VerifyAndClearExpectationsLocked(void* mock_obj)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+  MockObjectRegistry::StateMap& states = g_mock_object_registry.states();
+  const auto entry = states.find(mock_obj);
+  if (entry == states.end()) {
+    // No EXPECT_CALL() was set on the given mock object.
+    return true;
+  }
+
+  // Verifies and clears the expectations on each mock method in the
+  // given mock object. Every method is visited even after a failure so
+  // that all unsatisfied expectations are reported.
+  bool expectations_met = true;
+  for (internal::UntypedFunctionMockerBase* mocker :
+       entry->second.function_mockers) {
+    if (!mocker->VerifyAndClearExpectationsLocked()) expectations_met = false;
+  }
+
+  // We don't clear the content of mockers, as they may still be
+  // needed by ClearDefaultActionsLocked().
+  return expectations_met;
+}
+
+bool Mock::IsNaggy(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {  // naggy: reaction is kWarn
+ return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kWarn;
+}
+bool Mock::IsNice(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {  // nice: reaction is kAllow
+ return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kAllow;
+}
+bool Mock::IsStrict(void* mock_obj)
+ GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {  // strict: reaction is kFail
+ return Mock::GetReactionOnUninterestingCalls(mock_obj) == internal::kFail;
+}
+
+// Records that the mock method `mocker` belongs to the mock object mock_obj.
+void Mock::Register(const void* mock_obj,
+                    internal::UntypedFunctionMockerBase* mocker)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  g_mock_object_registry.states()[mock_obj].function_mockers.insert(mocker);
+}
+
+// Remembers where in the source code mock_obj is first used in an ON_CALL
+// or EXPECT_CALL, and which test was running. In case mock_obj is leaked,
+// this information helps the user identify which object it is.
+void Mock::RegisterUseByOnCallOrExpectCall(const void* mock_obj,
+                                           const char* file, int line)
+    GTEST_LOCK_EXCLUDED_(internal::g_gmock_mutex) {
+  internal::MutexLock lock(&internal::g_gmock_mutex);
+  MockObjectState& state = g_mock_object_registry.states()[mock_obj];
+  // Only the first use is recorded; later uses leave the state untouched.
+  if (state.first_used_file != nullptr) return;
+
+  state.first_used_file = file;
+  state.first_used_line = line;
+  const TestInfo* const current_test =
+      UnitTest::GetInstance()->current_test_info();
+  if (current_test == nullptr) return;
+  state.first_used_test_suite = current_test->test_suite_name();
+  state.first_used_test = current_test->name();
+}
+
+// Removes a mock method from the registry. The entry of the owning mock
+// object is removed as well once its last mock method has been
+// unregistered. This is called only in the destructor of
+// FunctionMockerBase.
+void Mock::UnregisterLocked(internal::UntypedFunctionMockerBase* mocker)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+  MockObjectRegistry::StateMap& states = g_mock_object_registry.states();
+  for (MockObjectRegistry::StateMap::iterator entry = states.begin();
+       entry != states.end(); ++entry) {
+    FunctionMockers& owned_mockers = entry->second.function_mockers;
+    // erase() returns 0 when this mock object does not own the mocker.
+    if (owned_mockers.erase(mocker) == 0) continue;
+
+    // The mocker has just been removed from its owner. Drop the owner
+    // too once no mock method is registered for it anymore.
+    if (owned_mockers.empty()) states.erase(entry);
+    return;
+  }
+}
+
+// Clears all ON_CALL()s set on the given mock object.
+void Mock::ClearDefaultActionsLocked(void* mock_obj)
+    GTEST_EXCLUSIVE_LOCK_REQUIRED_(internal::g_gmock_mutex) {
+  internal::g_gmock_mutex.AssertHeld();
+
+  MockObjectRegistry::StateMap& states = g_mock_object_registry.states();
+  const auto entry = states.find(mock_obj);
+  if (entry == states.end()) {
+    // No ON_CALL() was set on the given mock object.
+    return;
+  }
+
+  // Clears the default actions for each mock method in the given mock
+  // object.
+  for (internal::UntypedFunctionMockerBase* mocker :
+       entry->second.function_mockers) {
+    mocker->ClearDefaultActionsLocked();
+  }
+
+  // We don't clear the content of mockers, as they may still be
+  // needed by VerifyAndClearExpectationsLocked().
+}
+
+Expectation::Expectation() = default;
+
+Expectation::Expectation(
+    const std::shared_ptr<internal::ExpectationBase>& an_expectation_base)
+    : expectation_base_(an_expectation_base) {}  // Keeps a copy of the pointer.
+
+Expectation::~Expectation() = default;
+
+// Adds an expectation to this sequence. Unless it is the first one, the
+// previously added expectation becomes its immediate prerequisite.
+void Sequence::AddExpectation(const Expectation& expectation) const {
+  if (!(*last_expectation_ != expectation)) return;  // Already the last one.
+  if (last_expectation_->expectation_base() != nullptr) {
+    expectation.expectation_base()->immediate_prerequisites_ +=
+        *last_expectation_;
+  }
+  *last_expectation_ = expectation;
+}
+
+// Creates the implicit sequence if there isn't one, and remembers whether
+// this object did so, so that the destructor knows if it has to delete it.
+InSequence::InSequence() {
+  sequence_created_ = internal::g_gmock_implicit_sequence.get() == nullptr;
+  if (sequence_created_) {
+    // No implicit sequence exists yet, so this object creates one.
+    internal::g_gmock_implicit_sequence.set(new Sequence);
+  }
+}
+
+// Deletes the implicit sequence, but only if it was created by the
+// constructor of this object.
+InSequence::~InSequence() {
+  if (!sequence_created_) return;
+
+  delete internal::g_gmock_implicit_sequence.get();
+  internal::g_gmock_implicit_sequence.set(nullptr);
+}
+
+} // namespace testing
+
+#ifdef _MSC_VER
+#if _MSC_VER == 1900
+#pragma warning(pop)
+#endif
+#endif
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock.cc
new file mode 100644
index 0000000000..5025656a02
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock.cc
@@ -0,0 +1,223 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gmock/gmock.h"
+
+#include "gmock/internal/gmock-port.h"
+
+GMOCK_DEFINE_bool_(catch_leaked_mocks, true,
+ "true if and only if Google Mock should report leaked "
+ "mock objects as failures.");
+
+GMOCK_DEFINE_string_(verbose, testing::internal::kWarningVerbosity,
+ "Controls how verbose Google Mock's output is."
+ " Valid values:\n"
+ " info - prints all messages.\n"
+ " warning - prints warnings and errors.\n"
+ " error - prints errors only.");
+
+GMOCK_DEFINE_int32_(default_mock_behavior, 1,
+ "Controls the default behavior of mocks."
+ " Valid values:\n"
+ " 0 - by default, mocks act as NiceMocks.\n"
+ " 1 - by default, mocks act as NaggyMocks.\n"
+ " 2 - by default, mocks act as StrictMocks.");
+
+namespace testing {
+namespace internal {
+
+// Parses a string as a command line flag of the form
+// "--gmock_<flag_name>=value". When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns a pointer to the value inside str (pointing at the terminating
+// '\0' when the value was omitted), or NULL if the parsing failed.
+static const char* ParseGoogleMockFlagValue(const char* str,
+                                            const char* flag_name,
+                                            bool def_optional) {
+  // str and flag_name must not be NULL.
+  if (str == nullptr || flag_name == nullptr) return nullptr;
+
+  // The argument must start with "--gmock_" followed by the flag name.
+  const std::string expected_prefix = std::string("--gmock_") + flag_name;
+  const size_t prefix_len = expected_prefix.length();
+  if (strncmp(str, expected_prefix.c_str(), prefix_len) != 0) return nullptr;
+
+  // The first character after the flag name decides the result.
+  const char* const after_name = str + prefix_len;
+  switch (after_name[0]) {
+    case '=':
+      // Returns the string after "=".
+      return after_name + 1;
+    case '\0':
+      // The "=value" part is missing, which is OK only when it is
+      // optional.
+      return def_optional ? after_name : nullptr;
+    default:
+      // Something other than '=' follows the flag name.
+      return nullptr;
+  }
+}
+
+// Parses a string for a Google Mock bool flag, in the form of
+// "--gmock_flag=value" or just "--gmock_flag".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+                                bool* value) {
+  // The "=value" part is optional for bool flags.
+  const char* const text = ParseGoogleMockFlagValue(str, flag_name, true);
+  // Aborts if the parsing failed.
+  if (text == nullptr) return false;
+
+  // Any value is true unless it starts with '0', 'f' or 'F'.
+  const char first = text[0];
+  *value = first != '0' && first != 'f' && first != 'F';
+  return true;
+}
+
+// Parses a string for a Google Mock string flag, in the form of
+// "--gmock_flag=value". The "=value" part is mandatory.
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+template <typename String>
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+                                String* value) {
+  // Gets the text after "=", or NULL if str is not this flag.
+  const char* const text = ParseGoogleMockFlagValue(str, flag_name, false);
+  const bool parsed = text != nullptr;
+
+  // *value is only assigned when the parsing succeeded.
+  if (parsed) {
+    *value = text;
+  }
+  return parsed;
+}
+
+// Parses a string for a Google Mock int32 flag, in the form of
+// "--gmock_flag=value". Returns false if str is not the given flag;
+// otherwise returns whatever ParseInt32() returns for the value text.
+static bool ParseGoogleMockFlag(const char* str, const char* flag_name,
+                                int32_t* value) {
+  const char* const text = ParseGoogleMockFlagValue(str, flag_name, true);
+  if (text == nullptr) return false;  // Aborts if the parsing failed.
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag_name, text,
+                    value);
+}
+
+// The internal implementation of InitGoogleMock(). Initializes Google
+// Test, then removes every Google Mock flag it recognizes from argv and
+// decrements *argc for each one. The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void InitGoogleMockImpl(int* argc, CharType** argv) {
+ // Makes sure Google Test is initialized. InitGoogleTest() is
+ // idempotent, so it's fine if the user has already called it.
+ InitGoogleTest(argc, argv);
+ if (*argc <= 0) return;
+
+ for (int i = 1; i != *argc; i++) {
+ const std::string arg_string = StreamableToString(argv[i]);
+ const char* const arg = arg_string.c_str();
+
+ // Becomes true once one of the flags below has parsed this argument.
+ bool found_gmock_flag = false;
+ // Tries to parse arg as flag_name unless an earlier flag already matched.
+#define GMOCK_INTERNAL_PARSE_FLAG(flag_name) \
+ if (!found_gmock_flag) { \
+ auto value = GMOCK_FLAG_GET(flag_name); \
+ if (ParseGoogleMockFlag(arg, #flag_name, &value)) { \
+ GMOCK_FLAG_SET(flag_name, value); \
+ found_gmock_flag = true; \
+ } \
+ }
+
+ GMOCK_INTERNAL_PARSE_FLAG(catch_leaked_mocks)
+ GMOCK_INTERNAL_PARSE_FLAG(verbose)
+ GMOCK_INTERNAL_PARSE_FLAG(default_mock_behavior)
+
+ if (found_gmock_flag) {
+ // Yes. Shift the remainder of the argv list left by one. Note
+ // that argv has (*argc + 1) elements, the last one always being
+ // NULL. The following loop moves the trailing NULL element as
+ // well.
+ for (int j = i; j != *argc; j++) {
+ argv[j] = argv[j + 1];
+ }
+
+ // Decrements the argument count.
+ (*argc)--;
+
+ // We also need to decrement the iterator as we just removed
+ // an element, so that the argument shifted into slot i is examined.
+ i--;
+ }
+ }
+}
+
+} // namespace internal
+
+// Initializes Google Mock. This must be called before running the
+// tests. In particular, it parses a command line for the flags that
+// Google Mock recognizes. Whenever a Google Mock flag is seen, it is
+// removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Mock flag variables are
+// updated.
+//
+// Since Google Test is needed for Google Mock to work, this function
+// also initializes Google Test and parses its flags. Doing so is fine
+// even if Google Test has already been initialized.
+GTEST_API_ void InitGoogleMock(int* argc, char** argv) {
+ internal::InitGoogleMockImpl(argc, argv);
+}
+
+// Wide-character overload of InitGoogleMock(argc, argv); can be used in
+// Windows programs compiled in UNICODE mode.
+GTEST_API_ void InitGoogleMock(int* argc, wchar_t** argv) {
+ internal::InitGoogleMockImpl(argc, argv);
+}
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleMock() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv
+  // arguments. The fake argv is writable and NULL-terminated, like a real
+  // one, because the flag parser assumes argv has (argc + 1) elements.
+  int argc = 1;
+  char arg0[] = "dummy";
+  char* argv[] = {arg0, nullptr};
+  internal::InitGoogleMockImpl(&argc, argv);
+}
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googlemock/src/gmock_main.cc b/third_party/aom/third_party/googletest/src/googlemock/src/gmock_main.cc
new file mode 100644
index 0000000000..b411c5ecb9
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googlemock/src/gmock_main.cc
@@ -0,0 +1,72 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <iostream>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif  // GTEST_OS_ESP8266
+void setup() {
+ // Since Google Mock depends on Google Test, InitGoogleMock() is
+ // also responsible for initializing Google Test. Therefore there's
+ // no need for calling testing::InitGoogleTest() separately.
+ testing::InitGoogleMock();
+}
+void loop() { RUN_ALL_TESTS(); }  // The result of RUN_ALL_TESTS() is unused.
+#if GTEST_OS_ESP8266
+}  // extern "C"
+#endif  // GTEST_OS_ESP8266
+
+#else  // !(GTEST_OS_ESP8266 || GTEST_OS_ESP32)
+
+// MS C++ compiler/linker has a bug on Windows (not on Windows CE), which
+// causes a link error when _tmain is defined in a static library and UNICODE
+// is enabled. For this reason instead of _tmain, main function is used on
+// Windows. See the following link to track the current status of this bug:
+// https://web.archive.org/web/20170912203238/connect.microsoft.com/VisualStudio/feedback/details/394464/wmain-link-error-in-the-static-library
+// // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE
+#include <tchar.h> // NOLINT
+
+GTEST_API_ int _tmain(int argc, TCHAR** argv) {
+#else
+GTEST_API_ int main(int argc, char** argv) {
+#endif // GTEST_OS_WINDOWS_MOBILE
+ std::cout << "Running main() from gmock_main.cc\n";
+ // Since Google Mock depends on Google Test, InitGoogleMock() is
+ // also responsible for initializing Google Test. Therefore there's
+ // no need for calling testing::InitGoogleTest() separately.
+ testing::InitGoogleMock(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#endif  // GTEST_OS_ESP8266 || GTEST_OS_ESP32
diff --git a/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt b/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt
new file mode 100644
index 0000000000..aa00a5f3d2
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt
@@ -0,0 +1,322 @@
+########################################################################
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+#
+# CMake build script for Google Test.
+#
+# To run the tests for Google Test itself on Linux, use 'make test' or
+# ctest. You can select which tests to run using 'ctest -R regex'.
+# For more options, run 'ctest --help'.
+
+# When other libraries are using a shared version of runtime libraries,
+# Google Test also has to use one.
+option(
+ gtest_force_shared_crt
+ "Use shared (DLL) run-time lib even when Google Test is built as static lib."
+ OFF)
+
+option(gtest_build_tests "Build all of gtest's own tests." OFF)
+
+option(gtest_build_samples "Build gtest's sample programs." OFF)
+
+option(gtest_disable_pthreads "Disable uses of pthreads in gtest." OFF)
+
+option(
+ gtest_hide_internal_symbols
+ "Build gtest with internal symbols hidden in shared libraries."
+ OFF)
+
+# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build().
+include(cmake/hermetic_build.cmake OPTIONAL)
+
+if (COMMAND pre_project_set_up_hermetic_build)
+ pre_project_set_up_hermetic_build()
+endif()
+
+########################################################################
+#
+# Project-wide settings
+
+# Name of the project.
+#
+# CMake files in this project can refer to the root source directory
+# as ${gtest_SOURCE_DIR} and to the root binary directory as
+# ${gtest_BINARY_DIR}.
+# Language "C" is required for find_package(Threads).
+
+# Project version:
+
+cmake_minimum_required(VERSION 3.5)
+cmake_policy(SET CMP0048 NEW)
+project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C)
+
+if (POLICY CMP0063) # Visibility
+ cmake_policy(SET CMP0063 NEW)
+endif (POLICY CMP0063)
+
+if (COMMAND set_up_hermetic_build)
+ set_up_hermetic_build()
+endif()
+
+# These commands only run if this is the main project
+if(CMAKE_PROJECT_NAME STREQUAL "gtest" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution")
+
+ # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
+ # make it prominent in the GUI.
+ option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
+
+else()
+
+ mark_as_advanced(
+ gtest_force_shared_crt
+ gtest_build_tests
+ gtest_build_samples
+ gtest_disable_pthreads
+ gtest_hide_internal_symbols)
+
+endif()
+
+
+if (gtest_hide_internal_symbols)
+ set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+ set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+endif()
+
+# Define helper functions and macros used by Google Test.
+include(cmake/internal_utils.cmake)
+
+config_compiler_and_linker() # Defined in internal_utils.cmake.
+
+# Needed to set the namespace for both the export targets and the
+# alias libraries
+set(cmake_package_name GTest CACHE INTERNAL "")
+
+# Create the CMake package file descriptors.
+if (INSTALL_GTEST)
+ include(CMakePackageConfigHelpers)
+ set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "")
+ set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "")
+ set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}")
+ set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake")
+ write_basic_package_version_file(${version_file} VERSION ${GOOGLETEST_VERSION} COMPATIBILITY AnyNewerVersion)
+ install(EXPORT ${targets_export_name}
+ NAMESPACE ${cmake_package_name}::
+ DESTINATION ${cmake_files_install_dir})
+ set(config_file "${generated_dir}/${cmake_package_name}Config.cmake")
+ configure_package_config_file("${gtest_SOURCE_DIR}/cmake/Config.cmake.in"
+ "${config_file}" INSTALL_DESTINATION ${cmake_files_install_dir})
+ install(FILES ${version_file} ${config_file}
+ DESTINATION ${cmake_files_install_dir})
+endif()
+
+# Where Google Test's .h files can be found.
+set(gtest_build_include_dirs
+ "${gtest_SOURCE_DIR}/include"
+ "${gtest_SOURCE_DIR}")
+include_directories(${gtest_build_include_dirs})
+
+########################################################################
+#
+# Defines the gtest & gtest_main libraries. User tests should link
+# with one of them.
+
+# Google Test libraries. We build them using more strict warnings than what
+# are used for other targets, to ensure that gtest can be compiled by a user
+# aggressive about warnings.
+cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
+set_target_properties(gtest PROPERTIES VERSION ${GOOGLETEST_VERSION})
+cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
+set_target_properties(gtest_main PROPERTIES VERSION ${GOOGLETEST_VERSION})
+# If the CMake version supports it, attach header directory information
+# to the targets for when we are part of a parent build (i.e., being pulled
+# in via add_subdirectory() rather than being a standalone build).
+if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+ string(REPLACE ";" "$<SEMICOLON>" dirs "${gtest_build_include_dirs}")
+ target_include_directories(gtest SYSTEM INTERFACE
+ "$<BUILD_INTERFACE:${dirs}>"
+ "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+ target_include_directories(gtest_main SYSTEM INTERFACE
+ "$<BUILD_INTERFACE:${dirs}>"
+ "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
+endif()
+if(CMAKE_SYSTEM_NAME MATCHES "QNX")
+ target_link_libraries(gtest PUBLIC regex)
+endif()
+target_link_libraries(gtest_main PUBLIC gtest)
+
+########################################################################
+#
+# Install rules
+install_project(gtest gtest_main)
+
+########################################################################
+#
+# Samples on how to link user tests with gtest or gtest_main.
+#
+# They are not built by default. To build them, set the
+# gtest_build_samples option to ON. You can do it by running ccmake
+# or specifying the -Dgtest_build_samples=ON flag when running cmake.
+
+if (gtest_build_samples)
+ cxx_executable(sample1_unittest samples gtest_main samples/sample1.cc)
+ cxx_executable(sample2_unittest samples gtest_main samples/sample2.cc)
+ cxx_executable(sample3_unittest samples gtest_main)
+ cxx_executable(sample4_unittest samples gtest_main samples/sample4.cc)
+ cxx_executable(sample5_unittest samples gtest_main samples/sample1.cc)
+ cxx_executable(sample6_unittest samples gtest_main)
+ cxx_executable(sample7_unittest samples gtest_main)
+ cxx_executable(sample8_unittest samples gtest_main)
+ cxx_executable(sample9_unittest samples gtest)
+ cxx_executable(sample10_unittest samples gtest)
+endif()
+
+########################################################################
+#
+# Google Test's own tests.
+#
+# You can skip this section if you aren't interested in testing
+# Google Test itself.
+#
+# The tests are not built by default. To build them, set the
+# gtest_build_tests option to ON. You can do it by running ccmake
+# or specifying the -Dgtest_build_tests=ON flag when running cmake.
+
+if (gtest_build_tests)
+ # This must be set in the root directory for the tests to be run by
+ # 'make test' or ctest.
+ enable_testing()
+
+ ############################################################
+ # C++ tests built with standard compiler flags.
+ # cxx_test(name libs srcs...) also compiles test/<name>.cc implicitly.
+ cxx_test(googletest-death-test-test gtest_main)
+ cxx_test(gtest_environment_test gtest)
+ cxx_test(googletest-filepath-test gtest_main)
+ cxx_test(googletest-listener-test gtest_main)
+ cxx_test(gtest_main_unittest gtest_main)
+ cxx_test(googletest-message-test gtest_main)
+ cxx_test(gtest_no_test_unittest gtest)
+ cxx_test(googletest-options-test gtest_main)
+ cxx_test(googletest-param-test-test gtest
+ test/googletest-param-test2-test.cc)
+ cxx_test(googletest-port-test gtest_main)
+ cxx_test(gtest_pred_impl_unittest gtest_main)
+ cxx_test(gtest_premature_exit_test gtest
+ test/gtest_premature_exit_test.cc)
+ cxx_test(googletest-printers-test gtest_main)
+ cxx_test(gtest_prod_test gtest_main
+ test/production.cc)
+ cxx_test(gtest_repeat_test gtest)
+ cxx_test(gtest_sole_header_test gtest_main)
+ cxx_test(gtest_stress_test gtest)
+ cxx_test(googletest-test-part-test gtest_main)
+ cxx_test(gtest_throw_on_failure_ex_test gtest)
+ cxx_test(gtest-typed-test_test gtest_main
+ test/gtest-typed-test2_test.cc)
+ cxx_test(gtest_unittest gtest_main)
+ cxx_test(gtest-unittest-api_test gtest)
+ cxx_test(gtest_skip_in_environment_setup_test gtest_main)
+ cxx_test(gtest_skip_test gtest_main)
+
+ ############################################################
+ # C++ tests built with non-standard compiler flags.
+
+ # MSVC 7.1 (MSVC_VERSION 1310) does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310)
+ cxx_library(gtest_no_exception "${cxx_no_exception}"
+ src/gtest-all.cc)
+ cxx_library(gtest_main_no_exception "${cxx_no_exception}"
+ src/gtest-all.cc src/gtest_main.cc)
+ endif()
+ cxx_library(gtest_main_no_rtti "${cxx_no_rtti}"
+ src/gtest-all.cc src/gtest_main.cc)
+
+ cxx_test_with_flags(gtest-death-test_ex_nocatch_test
+ "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0"
+ gtest test/googletest-death-test_ex_test.cc)
+ cxx_test_with_flags(gtest-death-test_ex_catch_test
+ "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1"
+ gtest test/googletest-death-test_ex_test.cc)
+
+ cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}"
+ gtest_main_no_rtti test/gtest_unittest.cc)
+
+ cxx_shared_library(gtest_dll "${cxx_default}"
+ src/gtest-all.cc src/gtest_main.cc)
+
+ cxx_executable_with_flags(gtest_dll_test_ "${cxx_default}"
+ gtest_dll test/gtest_all_test.cc)
+ set_target_properties(gtest_dll_test_
+ PROPERTIES
+ COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+
+ ############################################################
+ # Python tests.
+ # py_test(name) registers test/<name>.py; it does nothing when Python is missing.
+ cxx_executable(googletest-break-on-failure-unittest_ test gtest)
+ py_test(googletest-break-on-failure-unittest)
+
+ py_test(gtest_skip_check_output_test)
+ py_test(gtest_skip_environment_check_output_test)
+
+ # Visual Studio .NET 2003 does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) # 1310 is Visual Studio .NET 2003
+ cxx_executable_with_flags(
+ googletest-catch-exceptions-no-ex-test_
+ "${cxx_no_exception}"
+ gtest_main_no_exception
+ test/googletest-catch-exceptions-test_.cc)
+ endif()
+
+ cxx_executable_with_flags(
+ googletest-catch-exceptions-ex-test_
+ "${cxx_exception}"
+ gtest_main
+ test/googletest-catch-exceptions-test_.cc)
+ py_test(googletest-catch-exceptions-test)
+
+ cxx_executable(googletest-color-test_ test gtest)
+ py_test(googletest-color-test)
+
+ cxx_executable(googletest-env-var-test_ test gtest)
+ py_test(googletest-env-var-test)
+
+ cxx_executable(googletest-filter-unittest_ test gtest)
+ py_test(googletest-filter-unittest)
+
+ cxx_executable(gtest_help_test_ test gtest_main)
+ py_test(gtest_help_test)
+
+ cxx_executable(googletest-list-tests-unittest_ test gtest)
+ py_test(googletest-list-tests-unittest)
+
+ cxx_executable(googletest-output-test_ test gtest)
+ py_test(googletest-output-test --no_stacktrace_support)
+
+ cxx_executable(googletest-shuffle-test_ test gtest)
+ py_test(googletest-shuffle-test)
+
+ # MSVC 7.1 (MSVC_VERSION 1310) does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310)
+ cxx_executable(googletest-throw-on-failure-test_ test gtest_no_exception)
+ set_target_properties(googletest-throw-on-failure-test_
+ PROPERTIES
+ COMPILE_FLAGS "${cxx_no_exception}")
+ py_test(googletest-throw-on-failure-test)
+ endif()
+
+ cxx_executable(googletest-uninitialized-test_ test gtest)
+ py_test(googletest-uninitialized-test)
+
+ cxx_executable(gtest_list_output_unittest_ test gtest)
+ py_test(gtest_list_output_unittest)
+
+ cxx_executable(gtest_xml_outfile1_test_ test gtest_main)
+ cxx_executable(gtest_xml_outfile2_test_ test gtest_main)
+ py_test(gtest_xml_outfiles_test)
+ py_test(googletest-json-outfiles-test)
+
+ cxx_executable(gtest_xml_output_unittest_ test gtest)
+ py_test(gtest_xml_output_unittest --no_stacktrace_support)
+ py_test(googletest-json-output-unittest --no_stacktrace_support)
+endif()
diff --git a/third_party/aom/third_party/googletest/src/googletest/README.md b/third_party/aom/third_party/googletest/src/googletest/README.md
new file mode 100644
index 0000000000..d26b309ed0
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/README.md
@@ -0,0 +1,217 @@
+### Generic Build Instructions
+
+#### Setup
+
+To build GoogleTest and your tests that use it, you need to tell your build
+system where to find its headers and source files. The exact way to do it
+depends on which build system you use, and is usually straightforward.
+
+### Build with CMake
+
+GoogleTest comes with a CMake build script
+([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+that can be used on a wide range of platforms ("C" stands for cross-platform).
+If you don't have CMake installed already, you can download it for free from
+<http://www.cmake.org/>.
+
+CMake works by generating native makefiles or build projects that can be used in
+the compiler environment of your choice. You can either build GoogleTest as a
+standalone project or it can be incorporated into an existing CMake build for
+another project.
+
+#### Standalone CMake Project
+
+When building GoogleTest as a standalone project, the typical workflow starts
+with
+
+```
+git clone https://github.com/google/googletest.git -b release-1.11.0
+cd googletest # Main directory of the cloned repository.
+mkdir build # Create a directory to hold the build output.
+cd build
+cmake .. # Generate native build scripts for GoogleTest.
+```
+
+The above command also includes GoogleMock by default. And so, if you want to
+build only GoogleTest, you should replace the last command with
+
+```
+cmake .. -DBUILD_GMOCK=OFF
+```
+
+If you are on a \*nix system, you should now see a Makefile in the current
+directory. Just type `make` to build GoogleTest. And then you can simply install
+GoogleTest if you are a system administrator.
+
+```
+make
+sudo make install # Install in /usr/local/ by default
+```
+
+If you use Windows and have Visual Studio installed, a `gtest.sln` file and
+several `.vcproj` files will be created. You can then build them using Visual
+Studio.
+
+On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
+
+#### Incorporating Into An Existing CMake Project
+
+If you want to use GoogleTest in a project which already uses CMake, the easiest
+way is to use the libraries and headers of an installed copy of GoogleTest.
+
+* Import GoogleTest by using `find_package` (or `pkg_check_modules`). For
+ example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the
+ libraries as `GTest::gtest`, `GTest::gmock`.
+
+And a more robust and flexible approach is to build GoogleTest as part of that
+project directly. This is done by making the GoogleTest source code available to
+the main build and adding it using CMake's `add_subdirectory()` command. This
+has the significant advantage that the same compiler and linker settings are
+used between GoogleTest and the rest of your project, so issues associated with
+using incompatible libraries (e.g. debug/release), etc. are avoided. This is
+particularly useful on Windows. Making GoogleTest's source code available to the
+main build can be done a few different ways:
+
+* Download the GoogleTest source code manually and place it at a known
+ location. This is the least flexible approach and can make it more difficult
+ to use with continuous integration systems, etc.
+* Embed the GoogleTest source code as a direct copy in the main project's
+ source tree. This is often the simplest approach, but is also the hardest to
+ keep up to date. Some organizations may not permit this method.
+* Add GoogleTest as a git submodule or equivalent. This may not always be
+ possible or appropriate. Git submodules, for example, have their own set of
+ advantages and drawbacks.
+* Use CMake to download GoogleTest as part of the build's configure step. This
+ approach doesn't have the limitations of the other methods.
+
+The last of the above methods is implemented with a small piece of CMake code
+that downloads and pulls the GoogleTest code into the main build.
+
+Just add to your `CMakeLists.txt`:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+ googletest
+ # Specify the commit you depend on and update it regularly.
+ URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# Now simply link against gtest or gtest_main as needed. Eg
+add_executable(example example.cpp)
+target_link_libraries(example gtest_main)
+add_test(NAME example_test COMMAND example)
+```
+
+Note that this approach requires CMake 3.14 or later due to its use of the
+`FetchContent_MakeAvailable()` command.
+
+##### Visual Studio Dynamic vs Static Runtimes
+
+By default, new Visual Studio projects link the C runtimes dynamically but
+GoogleTest links them statically. This will generate an error that looks
+something like the following: `gtest.lib(gtest-all.obj) : error LNK2038: mismatch
+detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
+'MDd_DynamicDebug' in main.obj`
+
+GoogleTest already has a CMake option for this: `gtest_force_shared_crt`
+
+Enabling this option will make gtest link the runtimes dynamically too, and
+match the project in which it is included.
+
+#### C++ Standard Version
+
+An environment that supports C++11 is required in order to successfully build
+GoogleTest. One way to ensure this is to specify the standard in the top-level
+project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
+is not feasible, for example in a C project using GoogleTest for validation,
+then it can be specified by adding it to the options for cmake via the
+`-DCMAKE_CXX_FLAGS` option.
+
+### Tweaking GoogleTest
+
+GoogleTest can be used in diverse environments. The default configuration may
+not work (or may not work well) out of the box in some environments. However,
+you can easily tweak GoogleTest by defining control macros on the compiler
+command line. Generally, these macros are named like `GTEST_XYZ` and you define
+them to either 1 or 0 to enable or disable a certain feature.
+
+We list the most frequently used macros below. For a complete list, see file
+[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h).
+
+### Multi-threaded Tests
+
+GoogleTest is thread-safe where the pthread library is available. After
+`#include "gtest/gtest.h"`, you can check the
+`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
+`#defined` to 1, no if it's undefined).
+
+If GoogleTest doesn't correctly detect whether pthread is available in your
+environment, you can force it with
+
+ -DGTEST_HAS_PTHREAD=1
+
+or
+
+ -DGTEST_HAS_PTHREAD=0
+
+When GoogleTest uses pthread, you may need to add flags to your compiler and/or
+linker to select the pthread library, or you'll get link errors. If you use the
+CMake script, this is taken care of for you. If you use your own build script,
+you'll need to read your compiler and linker's manual to figure out what flags
+to add.
+
+### As a Shared Library (DLL)
+
+GoogleTest is compact, so most users can build and link it as a static library
+for simplicity. You can choose to use GoogleTest as a shared library (known
+as a DLL on Windows) if you prefer.
+
+To compile *gtest* as a shared library, add
+
+ -DGTEST_CREATE_SHARED_LIBRARY=1
+
+to the compiler flags. You'll also need to tell the linker to produce a shared
+library instead - consult your linker's manual for how to do it.
+
+To compile your *tests* that use the gtest shared library, add
+
+ -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when using some
+compilers (e.g. GCC), they may become necessary in the future, if we decide to
+improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
+to always add the above flags when using GoogleTest as a shared library.
+Otherwise a future release of GoogleTest may break your build script.
+
+### Avoiding Macro Name Clashes
+
+In C++, macros don't obey namespaces. Therefore two libraries that both define a
+macro of the same name will clash if you `#include` both definitions. In case a
+GoogleTest macro clashes with another library, you can force GoogleTest to
+rename its macro to avoid the conflict.
+
+Specifically, if both GoogleTest and some other code define macro FOO, you can
+add
+
+ -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
+example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
+
+ GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+ TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/Config.cmake.in b/third_party/aom/third_party/googletest/src/googletest/cmake/Config.cmake.in
new file mode 100644
index 0000000000..12be4498b1
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/Config.cmake.in
@@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+include(CMakeFindDependencyMacro)
+if (@GTEST_HAS_PTHREAD@)
+ set(THREADS_PREFER_PTHREAD_FLAG @THREADS_PREFER_PTHREAD_FLAG@)
+ find_dependency(Threads)
+endif()
+# Load the exported targets file that sits next to this config file.
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
+check_required_components("@project_name@")
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/gtest.pc.in b/third_party/aom/third_party/googletest/src/googletest/cmake/gtest.pc.in
new file mode 100644
index 0000000000..b4148fae42
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/gtest.pc.in
@@ -0,0 +1,9 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest
+Description: GoogleTest (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in b/third_party/aom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
new file mode 100644
index 0000000000..38c88c54d5
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest_main
+Description: GoogleTest (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest = @PROJECT_VERSION@
+Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
new file mode 100644
index 0000000000..5a34c07a1b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
@@ -0,0 +1,342 @@
+# Defines functions and macros useful for building Google Test and
+# Google Mock.
+#
+# Note:
+#
+# - This file will be run twice when building Google Mock (once via
+# Google Test's CMakeLists.txt, and once via Google Mock's).
+# Therefore it shouldn't have any side effects other than defining
+# the functions and macros.
+#
+# - The functions/macros defined in this file may depend on Google
+# Test and Google Mock's option() definitions, and thus must be
+# called *after* the options have been defined.
+
+if (POLICY CMP0054)  # only CMake versions that know this policy can set it
+  cmake_policy(SET CMP0054 NEW)
+endif()
+
+# Tweaks CMake's default compiler flags (the CMAKE_C_FLAGS* and CMAKE_CXX_FLAGS*
+# variables) to suit Google Test's needs. Only MSVC builds are affected.
+# This must be a macro(), as inside a function string() can only
+# update variables in the function scope.
+macro(fix_default_compiler_settings_)
+ if (MSVC)
+ # For MSVC, CMake sets certain flags to defaults we want to override.
+ # This replacement code is taken from the sample in the CMake Wiki at
+ # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace.
+ foreach (flag_var
+ CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+ CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+ CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt)
+ # When Google Test is built as a shared library, it should also use
+ # shared runtime libraries. Otherwise, it may end up with multiple
+ # copies of runtime library data in different modules, resulting in
+ # hard-to-find crashes. When it is built as a static library, it is
+ # preferable to use CRT as static libraries, as we don't have to rely
+ # on CRT DLLs being available. CMake always defaults to using shared
+ # CRT libraries, so we override that default here.
+ string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}")
+ endif()
+
+ # We prefer stricter warning checking for building Google Test.
+ # Replaces /W3 with /W4 in defaults.
+ string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}")
+
+ # Prevent D9025 warning for targets that have exception handling
+ # turned off (/EHs-c- flag). Where required, exceptions are explicitly
+ # re-enabled using the cxx_exception_flags variable.
+ string(REPLACE "/EHsc" "" ${flag_var} "${${flag_var}}")
+ endforeach()
+ endif()
+endmacro()
+
+# Defines the flag variables (cxx_base_flags, cxx_exception, cxx_no_exception,
+# cxx_default, cxx_no_rtti, cxx_strict) used to build Google Test and Google
+# Mock. A variable's value is empty before it's explicitly assigned to.
+macro(config_compiler_and_linker)
+ # Note: pthreads on MinGW is not supported, even if available;
+ # instead, we use Windows threading primitives.
+ unset(GTEST_HAS_PTHREAD)
+ if (NOT gtest_disable_pthreads AND NOT MINGW)
+ # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT.
+ find_package(Threads)
+ if (CMAKE_USE_PTHREADS_INIT)
+ set(GTEST_HAS_PTHREAD ON)
+ endif()
+ endif()
+
+ fix_default_compiler_settings_()
+ if (MSVC)
+ # Newlines inside flags variables break CMake's NMake generator.
+ # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds.
+ set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J")
+ set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32")
+ set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN")
+ set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1")
+ set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0")
+ set(cxx_no_rtti_flags "-GR-")
+ # Suppress "unreachable code" warning
+ # http://stackoverflow.com/questions/3232669 explains the issue.
+ set(cxx_base_flags "${cxx_base_flags} -wd4702")
+ # Ensure MSVC treats source files as UTF-8 encoded.
+ set(cxx_base_flags "${cxx_base_flags} -utf-8")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ set(cxx_base_flags "-Wall -Wshadow -Wconversion")
+ set(cxx_exception_flags "-fexceptions")
+ set(cxx_no_exception_flags "-fno-exceptions")
+ set(cxx_strict_flags "-W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls")
+ set(cxx_no_rtti_flags "-fno-rtti")
+ elseif (CMAKE_COMPILER_IS_GNUCXX)
+ set(cxx_base_flags "-Wall -Wshadow")
+ if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)
+ set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else")
+ endif()
+ set(cxx_exception_flags "-fexceptions")
+ set(cxx_no_exception_flags "-fno-exceptions")
+ # Until version 4.3.2, GCC doesn't define a macro to indicate
+ # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI
+ # explicitly.
+ set(cxx_no_rtti_flags "-fno-rtti -DGTEST_HAS_RTTI=0")
+ set(cxx_strict_flags
+ "-Wextra -Wno-unused-parameter -Wno-missing-field-initializers")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro")
+ set(cxx_exception_flags "-features=except")
+ # Sun Pro doesn't provide macros to indicate whether exceptions and
+ # RTTI are enabled, so we define GTEST_HAS_* explicitly.
+ set(cxx_no_exception_flags "-features=no%except -DGTEST_HAS_EXCEPTIONS=0")
+ set(cxx_no_rtti_flags "-features=no%rtti -DGTEST_HAS_RTTI=0")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR
+ CMAKE_CXX_COMPILER_ID STREQUAL "XL")
+ # CMake 2.8 changes Visual Age's compiler ID to "XL".
+ set(cxx_exception_flags "-qeh")
+ set(cxx_no_exception_flags "-qnoeh")
+ # Until version 9.0, Visual Age doesn't define a macro to indicate
+ # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI
+ # explicitly.
+ set(cxx_no_rtti_flags "-qnortti -DGTEST_HAS_RTTI=0")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP")
+ set(cxx_base_flags "-AA -mt")
+ set(cxx_exception_flags "-DGTEST_HAS_EXCEPTIONS=1")
+ set(cxx_no_exception_flags "+noeh -DGTEST_HAS_EXCEPTIONS=0")
+ # RTTI can not be disabled in HP aCC compiler.
+ set(cxx_no_rtti_flags "")
+ endif()
+
+ # Tell the sources whether pthreads were found and allowed above.
+ if (DEFINED GTEST_HAS_PTHREAD)
+ set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=1")
+ else()
+ set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=0")
+ endif()
+ set(cxx_base_flags "${cxx_base_flags} ${GTEST_HAS_PTHREAD_MACRO}")
+
+ # For building gtest's own tests and samples.
+ set(cxx_exception "${cxx_base_flags} ${cxx_exception_flags}")
+ set(cxx_no_exception
+ "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}")
+ set(cxx_default "${cxx_exception}")
+ set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}")
+
+ # For building the gtest libraries.
+ set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
+endmacro()
+
+# cxx_library_with_type(name type cxx_flags srcs...)
+#
+# Creates library 'name' (aliased as ${cmake_package_name}::name) from srcs.
+# type is STATIC, SHARED, or "" to let CMake choose via BUILD_SHARED_LIBS.
+function(cxx_library_with_type name type cxx_flags)
+  add_library(${name} ${type} ${ARGN})
+  add_library(${cmake_package_name}::${name} ALIAS ${name})
+  # Compile flags and the output directories for build artifacts.
+  set_target_properties(${name}
+    PROPERTIES
+    COMPILE_FLAGS "${cxx_flags}"
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+    PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+  # Make PDB names match the library name. An unset DEBUG_POSTFIX is reported
+  # as "<variable>-NOTFOUND"; blank it so that text stays out of the names.
+  get_target_property(pdb_debug_postfix ${name} DEBUG_POSTFIX)
+  string(REGEX REPLACE "^.*-NOTFOUND$" "" pdb_debug_postfix "${pdb_debug_postfix}")
+  set_target_properties(${name}
+    PROPERTIES
+    PDB_NAME "${name}"
+    PDB_NAME_DEBUG "${name}${pdb_debug_postfix}"
+    COMPILE_PDB_NAME "${name}"
+    COMPILE_PDB_NAME_DEBUG "${name}${pdb_debug_postfix}")
+
+  if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
+    if (NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+      target_compile_definitions(${name} INTERFACE
+        $<INSTALL_INTERFACE:GTEST_LINKED_AS_SHARED_LIBRARY=1>)
+    endif()
+  endif()
+  if (DEFINED GTEST_HAS_PTHREAD)
+    if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+      set(threads_spec ${CMAKE_THREAD_LIBS_INIT})
+    else()
+      set(threads_spec Threads::Threads)
+    endif()
+    target_link_libraries(${name} PUBLIC ${threads_spec})
+  endif()
+
+  if (NOT "${CMAKE_VERSION}" VERSION_LESS "3.8")
+    target_compile_features(${name} PUBLIC cxx_std_11)
+  endif()
+endfunction()
+
+########################################################################
+#
+# Helper functions for creating build targets.
+
+function(cxx_shared_library name cxx_flags)  # sources follow in ARGN; type is always SHARED
+ cxx_library_with_type(${name} SHARED "${cxx_flags}" ${ARGN})
+endfunction()
+
+function(cxx_library name cxx_flags)  # sources follow in ARGN; no explicit library type is passed
+ cxx_library_with_type(${name} "" "${cxx_flags}" ${ARGN})
+endfunction()
+
+# cxx_executable_with_flags(name cxx_flags libs srcs...)
+#
+# Creates executable 'name' from srcs, compiled with cxx_flags (plus -bigobj
+# on MSVC) and linked against libs.
+function(cxx_executable_with_flags name cxx_flags libs)
+ add_executable(${name} ${ARGN})
+ if (MSVC)
+ # BigObj required for tests.
+ set(cxx_flags "${cxx_flags} -bigobj")
+ endif()
+ if (cxx_flags)
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_FLAGS "${cxx_flags}")
+ endif()
+ if (BUILD_SHARED_LIBS)
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+ endif()
+ # To support mixing linking in static and dynamic libraries, link each
+ # library in with an extra call to target_link_libraries.
+ foreach (lib "${libs}")
+ target_link_libraries(${name} ${lib})
+ endforeach()
+endfunction()
+
+# cxx_executable(name dir libs srcs...)
+#
+# Creates executable 'name' that links the given libs and is compiled with
+# ${cxx_default}. dir/name.cc is implicitly included in the source file
+# list; srcs are additional sources.
+function(cxx_executable name dir libs)
+ cxx_executable_with_flags(
+ ${name} "${cxx_default}" "${libs}" "${dir}/${name}.cc" ${ARGN})
+endfunction()
+
+# Locates a Python interpreter; sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
+if (NOT "${CMAKE_VERSION}" VERSION_LESS "3.12.0")
+  find_package(Python COMPONENTS Interpreter)  # CMake 3.12 and newer
+  set(PYTHONINTERP_FOUND ${Python_Interpreter_FOUND})
+  set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
+else()
+  find_package(PythonInterp)  # older CMake: no result variables are remapped
+endif()
+
+# cxx_test_with_flags(name cxx_flags libs srcs...)
+#
+# Builds executable 'name' from srcs with the given compiler flags and libs
+# (via cxx_executable_with_flags) and registers it as a test named 'name'.
+function(cxx_test_with_flags name cxx_flags libs)
+ cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
+ add_test(NAME ${name} COMMAND "$<TARGET_FILE:${name}>")
+endfunction()
+
+# cxx_test(name libs srcs...)
+#
+# Creates a test target 'name' that links the given libs and is compiled
+# with ${cxx_default}. Unlike cxx_test_with_flags, test/name.cc is already
+# implicitly included in the source file list; srcs are extra sources.
+function(cxx_test name libs)
+ cxx_test_with_flags("${name}" "${cxx_default}" "${libs}"
+ "test/${name}.cc" ${ARGN})
+endfunction()
+
+# py_test(name [args...])
+#
+# Registers test/name.py as a test called 'name'. The script is handed the
+# build directory through --build_dir, followed by any extra arguments.
+# It does nothing if Python is not installed.
+function(py_test name)
+  if (NOT PYTHONINTERP_FOUND)
+    return()
+  endif()
+
+  if (NOT "${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" VERSION_GREATER 3.1)
+    # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so it is
+    # bound right here. ${CTEST_CONFIGURATION_TYPE} is known only at ctest
+    # runtime (by calling ctest -c <Configuration>), so its $ is escaped to
+    # delay variable substitution.
+    set(py_build_dir "${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE}")
+  elseif (CMAKE_CONFIGURATION_TYPES)
+    # Multi-configuration build generators such as Visual Studio save
+    # output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
+    # Release etc.), so it has to be appended here.
+    set(py_build_dir "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>")
+  else()
+    # Single-configuration build generators like Makefile generators
+    # don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
+    set(py_build_dir "${CMAKE_CURRENT_BINARY_DIR}")
+  endif()
+
+  add_test(NAME ${name}
+    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+        --build_dir=${py_build_dir} ${ARGN})
+
+  # Make the Python import path consistent between Bazel and CMake.
+  set_tests_properties(${name} PROPERTIES ENVIRONMENT PYTHONPATH=${CMAKE_SOURCE_DIR})
+endfunction()
+
+# install_project(targets...)
+#
+# Installs the public headers, the given targets (and their PDBs with MSVC)
+# and one pkgconfig file per target. Does nothing unless INSTALL_GTEST is set.
+function(install_project)
+  if(NOT INSTALL_GTEST)
+    return()
+  endif()
+  install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+  # Install the project targets.
+  install(TARGETS ${ARGN}
+    EXPORT ${targets_export_name}
+    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+  if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+    # Install PDBs; the debug name is used for the Debug configuration only.
+    foreach(lib_target ${ARGN})
+      get_target_property(pdb_dir ${lib_target} PDB_OUTPUT_DIRECTORY)
+      get_target_property(pdb_name ${lib_target} COMPILE_PDB_NAME)
+      get_target_property(pdb_name_debug ${lib_target} COMPILE_PDB_NAME_DEBUG)
+      install(FILES
+        "${pdb_dir}/\${CMAKE_INSTALL_CONFIG_NAME}/$<$<CONFIG:Debug>:${pdb_name_debug}>$<$<NOT:$<CONFIG:Debug>>:${pdb_name}>.pdb"
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        OPTIONAL)
+    endforeach()
+  endif()
+  # Configure and install pkgconfig files.
+  foreach(t ${ARGN})
+    set(configured_pc "${generated_dir}/${t}.pc")
+    configure_file("${PROJECT_SOURCE_DIR}/cmake/${t}.pc.in" "${configured_pc}" @ONLY)
+    install(FILES "${configured_pc}" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+  endforeach()
+endfunction()
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/libgtest.la.in b/third_party/aom/third_party/googletest/src/googletest/cmake/libgtest.la.in
new file mode 100644
index 0000000000..840c83885f
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/libgtest.la.in
@@ -0,0 +1,21 @@
+# libgtest.la - a libtool library file
+# Generated by libtool (GNU libtool) 2.4.6
+
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# Names of this library.
+library_names='libgtest.so'
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='@CMAKE_INSTALL_FULL_LIBDIR@'
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h
new file mode 100644
index 0000000000..addbb59c64
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+// 1. Defining predicate functions to be used with Boolean test assertions
+// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+// 2. Defining predicate-format functions to be
+// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+// Value of: IsEven(Fib(5))
+// Actual: false (5 is odd)
+// Expected: true
+//
+// instead of a more opaque
+//
+// Value of: IsEven(Fib(5))
+// Actual: false
+// Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess() << n << " is even";
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+// Value of: IsEven(Fib(6))
+// Actual: true (8 is even)
+// Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+// // Verifies that Foo() returns an even number.
+// EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+// testing::AssertionResult IsEven(const char* expr, int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure()
+// << "Expected: " << expr << " is even\n Actual: it's " << n;
+// }
+//
+// If Foo() returns 5, you will see the following message:
+//
+// Expected: Foo() is even
+// Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+ // Copy constructor.
+ // Used in EXPECT_TRUE/FALSE(assertion_result).
+ AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+ // Used in the EXPECT_TRUE/FALSE(bool_expression).
+ //
+ // T must be contextually convertible to bool.
+ //
+ // The second parameter prevents this overload from being considered if
+ // the argument is implicitly convertible to AssertionResult. In that case
+ // we want AssertionResult's copy constructor to be used.
+ template <typename T>
+ explicit AssertionResult(
+ const T& success,
+ typename std::enable_if<
+ !std::is_convertible<T, AssertionResult>::value>::type*
+ /*enabler*/
+ = nullptr)
+ : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+ // Assignment operator.
+ AssertionResult& operator=(AssertionResult other) {
+ swap(other);
+ return *this;
+ }
+
+ // Returns true if and only if the assertion succeeded.
+ operator bool() const { return success_; } // NOLINT
+
+ // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+ AssertionResult operator!() const;
+
+ // Returns the text streamed into this AssertionResult. Test assertions
+ // use it when they fail (i.e., the predicate's outcome doesn't match the
+ // assertion's expectation). When nothing has been streamed into the
+ // object, returns an empty string.
+ const char* message() const {
+ return message_.get() != nullptr ? message_->c_str() : "";
+ }
+ // Deprecated; please use message() instead.
+ const char* failure_message() const { return message(); }
+
+ // Streams a custom failure message into this object.
+ template <typename T>
+ AssertionResult& operator<<(const T& value) {
+ AppendMessage(Message() << value);
+ return *this;
+ }
+
+ // Allows streaming basic output manipulators such as endl or flush into
+ // this object.
+ AssertionResult& operator<<(
+ ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+ AppendMessage(Message() << basic_manipulator);
+ return *this;
+ }
+
+ private:
+ // Appends the contents of message to message_.
+ void AppendMessage(const Message& a_message) {
+ if (message_.get() == nullptr) message_.reset(new ::std::string);
+ message_->append(a_message.GetString().c_str());
+ }
+
+ // Swap the contents of this AssertionResult with other.
+ void swap(AssertionResult& other);
+
+ // Stores result of the assertion predicate.
+ bool success_;
+ // Stores the message describing the condition in case the expectation
+ // construct is not satisfied with the predicate's outcome.
+ // Referenced via a pointer to avoid taking too much stack frame space
+ // with test assertions.
+ std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
new file mode 100644
index 0000000000..84e5a5bbd3
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -0,0 +1,345 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the public API for death tests. It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+#include "gtest/internal/gtest-death-test-internal.h"
+
+// This flag controls the style of death tests. Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+namespace testing {
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+} // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+// 1. It generates a warning if there is more than one active
+// thread. This is because it's safe to fork() or clone() only
+// when there is a single thread.
+//
+// 2. The parent process clone()s a sub-process and runs the death
+// test in it; the sub-process exits with code 0 at the end of the
+// death test, if it hasn't exited already.
+//
+// 3. The parent process waits for the sub-process to terminate.
+//
+// 4. The parent process checks the exit code and error message of
+// the sub-process.
+//
+// Examples:
+//
+// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+// for (int i = 0; i < 5; i++) {
+// EXPECT_DEATH(server.ProcessRequest(i),
+// "Invalid request .* in ProcessRequest()")
+// << "Failed to die on request " << i;
+// }
+//
+// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+// bool KilledBySIGHUP(int exit_code) {
+// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+// }
+//
+// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// The final parameter to each of these macros is a matcher applied to any data
+// the sub-process wrote to stderr. For compatibility with existing tests, a
+// bare string is interpreted as a regular expression matcher.
+//
+// On the regular expressions used in death tests:
+//
+// On POSIX-compliant systems (*nix), we use the <regex.h> library,
+// which uses the POSIX extended regex syntax.
+//
+// On other platforms (e.g. Windows or Mac), we only support a simple regex
+// syntax implemented as part of Google Test. This limited
+// implementation should be enough most of the time when writing
+// death tests; though it lacks many features you can find in PCRE
+// or POSIX extended regex syntax. For example, we don't support
+// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+// repetition count ("x{5,7}"), among others.
+//
+// Below is the syntax that we do support. We chose it to be a
+// subset of both PCRE and POSIX extended regex, so it's easy to
+// learn wherever you come from. In the following: 'A' denotes a
+// literal character, period (.), or a single \\ escape sequence;
+// 'x' and 'y' denote regular expressions; 'm' and 'n' are for
+// natural numbers.
+//
+// c matches any literal character c
+// \\d matches any decimal digit
+// \\D matches any character that's not a decimal digit
+// \\f matches \f
+// \\n matches \n
+// \\r matches \r
+// \\s matches any ASCII whitespace, including \n
+// \\S matches any character that's not a whitespace
+// \\t matches \t
+// \\v matches \v
+// \\w matches any letter, _, or decimal digit
+// \\W matches any character that \\w doesn't match
+// \\c matches any literal character c, which must be a punctuation
+// . matches any single character except \n
+// A? matches 0 or 1 occurrences of A
+// A* matches 0 or many occurrences of A
+// A+ matches 1 or many occurrences of A
+// ^ matches the beginning of a string (not that of each line)
+// $ matches the end of a string (not that of each line)
+// xy matches x followed by y
+//
+// If you accidentally use PCRE or POSIX extended regex features
+// not implemented by us, you will get a run-time failure. In that
+// case, please try to rewrite your regular expression within the
+// above syntax.
+//
+// This implementation is *not* meant to be as highly tuned or robust
+// as a compiled regex library, but should perform well enough for a
+// death test, which already incurs significant overhead by launching
+// a child process.
+//
+// Known caveats:
+//
+// A "threadsafe" style death test obtains the path to the test
+// program from argv[0] and re-executes it in the sub-process. For
+// simplicity, the current implementation doesn't search the PATH
+// when launching the sub-process. This means that the user must
+// invoke the test program via a path that contains at least one
+// path separator (e.g. path/to/foo_test and
+// /absolute/path/to/bar_test are fine, but foo_test is not). This
+// is rarely a problem as people usually don't put the test binary
+// directory in PATH.
+//
+
+// Asserts that a given `statement` causes the program to exit, with an
+// integer exit status that satisfies `predicate`, and emitting error output
+// that matches `matcher`.
+#define ASSERT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
+
+// Like `ASSERT_EXIT`, but continues on to successive tests in the
+// test suite, if any:
+#define EXPECT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given `statement` causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches `matcher`.
+#define ASSERT_DEATH(statement, matcher) \
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+
+// Like `ASSERT_DEATH`, but continues on to successive tests in the
+// test suite, if any:
+#define EXPECT_DEATH(statement, matcher) \
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+ explicit ExitedWithCode(int exit_code);
+ ExitedWithCode(const ExitedWithCode&) = default; // Copy-constructible...
+ void operator=(const ExitedWithCode& other) = delete; // ...but not assignable.
+ bool operator()(int exit_status) const; // Predicate form used by {ASSERT,EXPECT}_EXIT.
+
+ private:
+ const int exit_code_;
+};
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+ explicit KilledBySignal(int signum);
+ bool operator()(int exit_status) const;
+
+ private:
+ const int signum_;
+};
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the side effects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+// if (sideeffect) {
+// *sideeffect = 12;
+// }
+// LOG(DFATAL) << "death";
+// return 12;
+// }
+//
+// TEST(TestSuite, TestDieOr12WorksInDbgAndOpt) {
+// int sideeffect = 0;
+// // Only asserts in dbg.
+// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+// // opt-mode has sideeffect visible.
+// EXPECT_EQ(12, sideeffect);
+// #else
+// // dbg-mode no visible sideeffect.
+// EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects. A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+// // Side-effects here will have an effect after this statement in
+// // opt mode, but none in debug mode.
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+#ifdef NDEBUG
+
+#define EXPECT_DEBUG_DEATH(statement, regex) \
+ GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#define ASSERT_DEBUG_DEATH(statement, regex) \
+ GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#else
+
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
+
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
+
+#endif // NDEBUG for EXPECT_DEBUG_DEATH
+#endif // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters
+// on systems that support death tests. This allows one to write such a macro on
+// a system that does not support death tests and be sure that it will compile
+// on a death-test supporting system. It is exposed publicly so that systems
+// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST
+// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and
+// ASSERT_DEATH_IF_SUPPORTED.
+//
+// Parameters:
+// statement - A statement that a macro such as EXPECT_DEATH would test
+// for program termination. This macro has to make sure this
+// statement is compiled but not executed, to ensure that
+// EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+// parameter if and only if EXPECT_DEATH compiles with it.
+// regex - A regex that a macro such as EXPECT_DEATH would use to test
+// the output of statement. This parameter has to be
+// compiled but not evaluated by this macro, to ensure that
+// this macro only accepts expressions that a macro such as
+// EXPECT_DEATH would accept.
+// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+// and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+// This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+// compile inside functions where ASSERT_DEATH doesn't
+// compile.
+//
+// The branch that has an always false condition is used to ensure that
+// statement and regex are compiled (and thus syntactically correct) but
+// never executed. The unreachable code macro protects the terminator
+// statement from generating an 'unreachable code' warning in case
+// statement unconditionally returns or throws. The Message constructor at
+// the end allows the syntax of streaming additional messages into the
+// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+ << "Statement '" #statement "' cannot be verified."; \
+ } else if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::RE::PartialMatch(".*", (regex)); \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ terminator; \
+ } else \
+ ::testing::Message()
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning. This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ ASSERT_DEATH(statement, regex)
+#else
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#endif
+
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
new file mode 100644
index 0000000000..bffa00c533
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -0,0 +1,956 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+
+#include <atomic>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GTEST_MAYBE_5046_ 5046
+#else
+#define GTEST_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+ clients of class B */
+ /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+// 1. a class FooMatcher that implements the matcher interface:
+// using is_gtest_matcher = void;
+// bool MatchAndExplain(const T&, std::ostream*);
+// (MatchResultListener* can also be used instead of std::ostream*)
+// void DescribeTo(std::ostream*);
+// void DescribeNegationTo(std::ostream*);
+//
+// 2. a factory function that creates a Matcher<T> object from a
+// FooMatcher.
+
+class MatchResultListener {
+ public:
+ // Creates a listener object with the given underlying ostream. The
+ // listener does not own the ostream, and does not dereference it
+ // in the constructor or destructor.
+ explicit MatchResultListener(::std::ostream* os) : stream_(os) {}
+ virtual ~MatchResultListener() = 0; // Makes this class abstract.
+
+ // Streams x to the underlying ostream; does nothing if the ostream
+ // is NULL.
+ template <typename T>
+ MatchResultListener& operator<<(const T& x) {
+ if (stream_ != nullptr) *stream_ << x;
+ return *this;
+ }
+
+ // Returns the underlying ostream.
+ ::std::ostream* stream() { return stream_; }
+
+ // Returns true if and only if the listener is interested in an explanation
+ // of the match result. A matcher's MatchAndExplain() method can use
+ // this information to avoid generating the explanation when no one
+ // intends to hear it.
+ bool IsInterested() const { return stream_ != nullptr; }
+
+ private:
+ ::std::ostream* const stream_;
+
+ MatchResultListener(const MatchResultListener&) = delete;
+ MatchResultListener& operator=(const MatchResultListener&) = delete;
+};
+
+inline MatchResultListener::~MatchResultListener() {}
+
+// An instance of a subclass of this knows how to describe itself as a
+// matcher.
+class GTEST_API_ MatcherDescriberInterface {
+ public:
+ virtual ~MatcherDescriberInterface() {}
+
+ // Describes this matcher to an ostream. The function should print
+ // a verb phrase that describes the property a value matching this
+ // matcher should have. The subject of the verb phrase is the value
+ // being matched. For example, the DescribeTo() method of the Gt(7)
+ // matcher prints "is greater than 7".
+ virtual void DescribeTo(::std::ostream* os) const = 0;
+
+ // Describes the negation of this matcher to an ostream. For
+ // example, if the description of this matcher is "is greater than
+ // 7", the negated description could be "is not greater than 7".
+ // You are not required to override this when implementing
+ // MatcherInterface, but it is highly advised so that your matcher
+ // can produce good error messages.
+ virtual void DescribeNegationTo(::std::ostream* os) const {
+ *os << "not (";
+ DescribeTo(os);
+ *os << ")";
+ }
+};
+
+// The implementation of a matcher.
+template <typename T>
+class MatcherInterface : public MatcherDescriberInterface {
+ public:
+ // Returns true if and only if the matcher matches x; also explains the
+ // match result to 'listener' if necessary (see the next paragraph), in
+ // the form of a non-restrictive relative clause ("which ...",
+ // "whose ...", etc) that describes x. For example, the
+ // MatchAndExplain() method of the Pointee(...) matcher should
+ // generate an explanation like "which points to ...".
+ //
+ // Implementations of MatchAndExplain() should add an explanation of
+ // the match result *if and only if* they can provide additional
+ // information that's not already present (or not obvious) in the
+ // print-out of x and the matcher's description. Whether the match
+ // succeeds is not a factor in deciding whether an explanation is
+ // needed, as sometimes the caller needs to print a failure message
+ // when the match succeeds (e.g. when the matcher is used inside
+ // Not()).
+ //
+ // For example, a "has at least 10 elements" matcher should explain
+ // what the actual element count is, regardless of the match result,
+ // as it is useful information to the reader; on the other hand, an
+ // "is empty" matcher probably only needs to explain what the actual
+ // size is when the match fails, as it's redundant to say that the
+ // size is 0 when the value is already known to be empty.
+ //
+ // You should override this method when defining a new matcher.
+ //
+ // It's the responsibility of the caller (Google Test) to guarantee
+ // that 'listener' is not NULL. This helps to simplify a matcher's
+ // implementation when it doesn't care about the performance, as it
+ // can talk to 'listener' without checking its validity first.
+ // However, in order to implement dummy listeners efficiently,
+ // listener->stream() may be NULL.
+ virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
+
+ // Inherits these methods from MatcherDescriberInterface:
+ // virtual void DescribeTo(::std::ostream* os) const = 0;
+ // virtual void DescribeNegationTo(::std::ostream* os) const;
+};
+
+namespace internal {
+
+struct AnyEq { // Function object: a == b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a == b;
+ }
+};
+struct AnyNe { // Function object: a != b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a != b;
+ }
+};
+struct AnyLt { // Function object: a < b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a < b;
+ }
+};
+struct AnyGt { // Function object: a > b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a > b;
+ }
+};
+struct AnyLe { // Function object: a <= b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a <= b;
+ }
+};
+struct AnyGe { // Function object: a >= b, for operands of possibly different types.
+ template <typename A, typename B>
+ bool operator()(const A& a, const B& b) const {
+ return a >= b;
+ }
+};
+
+// A match result listener that ignores the explanation.
+class DummyMatchResultListener : public MatchResultListener {
+ public:
+ DummyMatchResultListener() : MatchResultListener(nullptr) {} // Null stream: operator<< is a no-op and IsInterested() is false.
+
+ private:
+ DummyMatchResultListener(const DummyMatchResultListener&) = delete; // Non-copyable.
+ DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
+};
+
+// A match result listener that forwards the explanation to a given
+// ostream. The difference between this and MatchResultListener is
+// that the former is concrete.
+class StreamMatchResultListener : public MatchResultListener {
+ public:
+ explicit StreamMatchResultListener(::std::ostream* os)
+ : MatchResultListener(os) {}
+
+ private:
+ StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+ StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+ delete;
+};
+
+struct SharedPayloadBase {
+ std::atomic<int> ref{1};
+ void Ref() { ref.fetch_add(1, std::memory_order_relaxed); }
+ bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+};
+
+template <typename T>
+struct SharedPayload : SharedPayloadBase {
+ explicit SharedPayload(const T& v) : value(v) {}
+ explicit SharedPayload(T&& v) : value(std::move(v)) {}
+
+ static void Destroy(SharedPayloadBase* shared) {
+ delete static_cast<SharedPayload*>(shared);
+ }
+
+ T value;
+};
+
+// An internal class for implementing Matcher<T>, which will derive
+// from it. We put functionalities common to all Matcher<T>
+// specializations here to avoid code duplication.
+template <typename T>
+class MatcherBase : private MatcherDescriberInterface {
+ public:
+ // Returns true if and only if the matcher matches x; also explains the
+ // match result to 'listener'.
+ bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
+ GTEST_CHECK_(vtable_ != nullptr);
+ return vtable_->match_and_explain(*this, x, listener);
+ }
+
+ // Returns true if and only if this matcher matches x.
+ bool Matches(const T& x) const {
+ DummyMatchResultListener dummy;
+ return MatchAndExplain(x, &dummy);
+ }
+
+ // Describes this matcher to an ostream.
+ void DescribeTo(::std::ostream* os) const final {
+ GTEST_CHECK_(vtable_ != nullptr);
+ vtable_->describe(*this, os, false);
+ }
+
+ // Describes the negation of this matcher to an ostream.
+ void DescribeNegationTo(::std::ostream* os) const final {
+ GTEST_CHECK_(vtable_ != nullptr);
+ vtable_->describe(*this, os, true);
+ }
+
+ // Explains why x matches, or doesn't match, the matcher.
+ void ExplainMatchResultTo(const T& x, ::std::ostream* os) const {
+ StreamMatchResultListener listener(os);
+ MatchAndExplain(x, &listener);
+ }
+
+ // Returns the describer for this matcher object; retains ownership
+ // of the describer, which is only guaranteed to be alive when
+ // this matcher object is alive.
+ const MatcherDescriberInterface* GetDescriber() const {
+ if (vtable_ == nullptr) return nullptr;
+ return vtable_->get_describer(*this);
+ }
+
+ protected:
+ MatcherBase() : vtable_(nullptr), buffer_() {}
+
+ // Constructs a matcher from its implementation.
+ template <typename U>
+ explicit MatcherBase(const MatcherInterface<U>* impl)
+ : vtable_(nullptr), buffer_() {
+ Init(impl);
+ }
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT
+ Init(std::forward<M>(m));
+ }
+
+ MatcherBase(const MatcherBase& other)
+ : vtable_(other.vtable_), buffer_(other.buffer_) {
+ if (IsShared()) buffer_.shared->Ref();
+ }
+
+ MatcherBase& operator=(const MatcherBase& other) {
+ if (this == &other) return *this;
+ Destroy();
+ vtable_ = other.vtable_;
+ buffer_ = other.buffer_;
+ if (IsShared()) buffer_.shared->Ref();
+ return *this;
+ }
+
+ MatcherBase(MatcherBase&& other)
+ : vtable_(other.vtable_), buffer_(other.buffer_) {
+ other.vtable_ = nullptr;
+ }
+
+ MatcherBase& operator=(MatcherBase&& other) {
+ if (this == &other) return *this;
+ Destroy();
+ vtable_ = other.vtable_;
+ buffer_ = other.buffer_;
+ other.vtable_ = nullptr;
+ return *this;
+ }
+
+ ~MatcherBase() override { Destroy(); }
+
+ private:
+ struct VTable {
+ bool (*match_and_explain)(const MatcherBase&, const T&,
+ MatchResultListener*);
+ void (*describe)(const MatcherBase&, std::ostream*, bool negation);
+ // Returns the captured object if it implements the interface, otherwise
+ // returns the MatcherBase itself.
+ const MatcherDescriberInterface* (*get_describer)(const MatcherBase&);
+ // Called on shared instances when the reference count reaches 0.
+ void (*shared_destroy)(SharedPayloadBase*);
+ };
+
+ bool IsShared() const {
+ return vtable_ != nullptr && vtable_->shared_destroy != nullptr;
+ }
+
+ // If the implementation takes a std::ostream*, pass the listener's stream.
+ template <typename P>
+ static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+ MatchResultListener* listener)
+ -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) {
+ return P::Get(m).MatchAndExplain(value, listener->stream());
+ }
+
+ template <typename P>
+ static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+ MatchResultListener* listener)
+ -> decltype(P::Get(m).MatchAndExplain(value, listener)) {
+ return P::Get(m).MatchAndExplain(value, listener);
+ }
+
+ template <typename P>
+ static void DescribeImpl(const MatcherBase& m, std::ostream* os,
+ bool negation) {
+ if (negation) {
+ P::Get(m).DescribeNegationTo(os);
+ } else {
+ P::Get(m).DescribeTo(os);
+ }
+ }
+
+ template <typename P>
+ static const MatcherDescriberInterface* GetDescriberImpl(
+ const MatcherBase& m) {
+ // If the impl is a MatcherDescriberInterface, then return it.
+ // Otherwise use MatcherBase itself.
+ // This allows us to implement the GetDescriber() function without support
+ // from the impl, but some users really want to get their impl back when
+ // they call GetDescriber().
+ // We use std::get on a tuple as a workaround for not having `if constexpr`.
+ return std::get<(
+ std::is_convertible<decltype(&P::Get(m)),
+ const MatcherDescriberInterface*>::value
+ ? 1
+ : 0)>(std::make_tuple(&m, &P::Get(m)));
+ }
+
+ template <typename P>
+ const VTable* GetVTable() {
+ static constexpr VTable kVTable = {&MatchAndExplainImpl<P>,
+ &DescribeImpl<P>, &GetDescriberImpl<P>,
+ P::shared_destroy};
+ return &kVTable;
+ }
+
+ union Buffer {
+ // Add some types to give Buffer some common alignment/size use cases.
+ void* ptr;
+ double d;
+ int64_t i;
+ // And add one for the out-of-line cases.
+ SharedPayloadBase* shared;
+ };
+
+ void Destroy() {
+ if (IsShared() && buffer_.shared->Unref()) {
+ vtable_->shared_destroy(buffer_.shared);
+ }
+ }
+
+ template <typename M>
+ static constexpr bool IsInlined() {
+ return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) &&
+ std::is_trivially_copy_constructible<M>::value &&
+ std::is_trivially_destructible<M>::value;
+ }
+
+ template <typename M, bool = MatcherBase::IsInlined<M>()>
+ struct ValuePolicy {
+ static const M& Get(const MatcherBase& m) {
+ // When inlined along with Init, need to be explicit to avoid violating
+ // strict aliasing rules.
+ const M* ptr =
+ static_cast<const M*>(static_cast<const void*>(&m.buffer_));
+ return *ptr;
+ }
+ static void Init(MatcherBase& m, M impl) {
+ ::new (static_cast<void*>(&m.buffer_)) M(impl);
+ }
+ static constexpr auto shared_destroy = nullptr;
+ };
+
+ template <typename M>
+ struct ValuePolicy<M, false> {
+ using Shared = SharedPayload<M>;
+ static const M& Get(const MatcherBase& m) {
+ return static_cast<Shared*>(m.buffer_.shared)->value;
+ }
+ template <typename Arg>
+ static void Init(MatcherBase& m, Arg&& arg) {
+ m.buffer_.shared = new Shared(std::forward<Arg>(arg));
+ }
+ static constexpr auto shared_destroy = &Shared::Destroy;
+ };
+
+ template <typename U, bool B>
+ struct ValuePolicy<const MatcherInterface<U>*, B> {
+ using M = const MatcherInterface<U>;
+ using Shared = SharedPayload<std::unique_ptr<M>>;
+ static const M& Get(const MatcherBase& m) {
+ return *static_cast<Shared*>(m.buffer_.shared)->value;
+ }
+ static void Init(MatcherBase& m, M* impl) {
+ m.buffer_.shared = new Shared(std::unique_ptr<M>(impl));
+ }
+
+ static constexpr auto shared_destroy = &Shared::Destroy;
+ };
+
+ template <typename M>
+ void Init(M&& m) {
+ using MM = typename std::decay<M>::type;
+ using Policy = ValuePolicy<MM>;
+ vtable_ = GetVTable<Policy>();
+ Policy::Init(*this, std::forward<M>(m));
+ }
+
+ const VTable* vtable_;
+ Buffer buffer_;
+};
+
+} // namespace internal
+
+// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
+// object that can check whether a value of type T matches. Its state
+// (a vtable pointer plus a small buffer) lives in the
+// internal::MatcherBase<T> base class. Don't inherit from Matcher!
+template <typename T>
+class Matcher : public internal::MatcherBase<T> {
+ public:
+ // Constructs a null matcher. Needed for storing Matcher objects in STL
+ // containers. A default-constructed matcher is not yet initialized. You
+ // cannot use it until a valid value has been assigned to it.
+ explicit Matcher() {} // NOLINT
+
+ // Constructs a matcher from its implementation.
+ explicit Matcher(const MatcherInterface<const T&>* impl)
+ : internal::MatcherBase<T>(impl) {}
+
+ template <typename U>
+ explicit Matcher(
+ const MatcherInterface<U>* impl,
+ typename std::enable_if<!std::is_same<U, const U&>::value>::type* =
+ nullptr)
+ : internal::MatcherBase<T>(impl) {}
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ Matcher(M&& m) : internal::MatcherBase<T>(std::forward<M>(m)) {} // NOLINT
+
+ // Implicit constructor here allows people to write
+ // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
+ Matcher(T value); // NOLINT
+};
+
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const std::string&>
+ : public internal::MatcherBase<const std::string&> {
+ public:
+ Matcher() {}
+
+ explicit Matcher(const MatcherInterface<const std::string&>* impl)
+ : internal::MatcherBase<const std::string&>(impl) {}
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ Matcher(M&& m) // NOLINT
+ : internal::MatcherBase<const std::string&>(std::forward<M>(m)) {}
+
+ // Allows the user to write str instead of Eq(str) sometimes, where
+ // str is a std::string object.
+ Matcher(const std::string& s); // NOLINT
+
+ // Allows the user to write "foo" instead of Eq("foo") sometimes.
+ Matcher(const char* s); // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<std::string>
+ : public internal::MatcherBase<std::string> {
+ public:
+ Matcher() {}
+
+ explicit Matcher(const MatcherInterface<const std::string&>* impl)
+ : internal::MatcherBase<std::string>(impl) {}
+ explicit Matcher(const MatcherInterface<std::string>* impl)
+ : internal::MatcherBase<std::string>(impl) {}
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ Matcher(M&& m) // NOLINT
+ : internal::MatcherBase<std::string>(std::forward<M>(m)) {}
+
+ // Allows the user to write str instead of Eq(str) sometimes, where
+ // str is a string object.
+ Matcher(const std::string& s); // NOLINT
+
+ // Allows the user to write "foo" instead of Eq("foo") sometimes.
+ Matcher(const char* s); // NOLINT
+};
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when an absl::string_view
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const internal::StringView&>
+ : public internal::MatcherBase<const internal::StringView&> {
+ public:
+ Matcher() {}
+
+ explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+ : internal::MatcherBase<const internal::StringView&>(impl) {}
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ Matcher(M&& m) // NOLINT
+ : internal::MatcherBase<const internal::StringView&>(std::forward<M>(m)) {
+ }
+
+ // Allows the user to write str instead of Eq(str) sometimes, where
+ // str is a std::string object.
+ Matcher(const std::string& s); // NOLINT
+
+ // Allows the user to write "foo" instead of Eq("foo") sometimes.
+ Matcher(const char* s); // NOLINT
+
+ // Allows the user to pass absl::string_views or std::string_views directly.
+ Matcher(internal::StringView s); // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<internal::StringView>
+ : public internal::MatcherBase<internal::StringView> {
+ public:
+ Matcher() {}
+
+ explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+ : internal::MatcherBase<internal::StringView>(impl) {}
+ explicit Matcher(const MatcherInterface<internal::StringView>* impl)
+ : internal::MatcherBase<internal::StringView>(impl) {}
+
+ template <typename M, typename = typename std::remove_reference<
+ M>::type::is_gtest_matcher>
+ Matcher(M&& m) // NOLINT
+ : internal::MatcherBase<internal::StringView>(std::forward<M>(m)) {}
+
+ // Allows the user to write str instead of Eq(str) sometimes, where
+ // str is a std::string object.
+ Matcher(const std::string& s); // NOLINT
+
+ // Allows the user to write "foo" instead of Eq("foo") sometimes.
+ Matcher(const char* s); // NOLINT
+
+ // Allows the user to pass absl::string_views or std::string_views directly.
+ Matcher(internal::StringView s); // NOLINT
+};
+#endif // GTEST_INTERNAL_HAS_STRING_VIEW
+
+// Prints a matcher in a human-readable format.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const Matcher<T>& matcher) {
+ matcher.DescribeTo(&os);
+ return os;
+}
+
+// The PolymorphicMatcher class template makes it easy to implement a
+// polymorphic matcher (i.e. a matcher that can match values of more
+// than one type, e.g. Eq(n) and NotNull()).
+//
+// To define a polymorphic matcher, a user should provide an Impl
+// class that has a DescribeTo() method and a DescribeNegationTo()
+// method, and define a member function (or member function template)
+//
+// bool MatchAndExplain(const Value& value,
+// MatchResultListener* listener) const;
+//
+// See the definition of NotNull() for a complete example.
+template <class Impl>
+class PolymorphicMatcher {
+ public:
+ explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {}
+
+ // Returns a mutable reference to the underlying matcher
+ // implementation object.
+ Impl& mutable_impl() { return impl_; }
+
+ // Returns an immutable reference to the underlying matcher
+ // implementation object.
+ const Impl& impl() const { return impl_; }
+
+ template <typename T>
+ operator Matcher<T>() const {
+ return Matcher<T>(new MonomorphicImpl<const T&>(impl_));
+ }
+
+ private:
+ template <typename T>
+ class MonomorphicImpl : public MatcherInterface<T> {
+ public:
+ explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
+
+ void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); }
+
+ void DescribeNegationTo(::std::ostream* os) const override {
+ impl_.DescribeNegationTo(os);
+ }
+
+ bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+ return impl_.MatchAndExplain(x, listener);
+ }
+
+ private:
+ const Impl impl_;
+ };
+
+ Impl impl_;
+};
+
+// Creates a matcher from its implementation.
+// DEPRECATED: Especially in the generic code, prefer:
+// Matcher<T>(new MyMatcherImpl<const T&>(...));
+//
+// MakeMatcher may create a Matcher that accepts its argument by value, which
+// leads to unnecessary copies & lack of support for non-copyable types.
+template <typename T>
+inline Matcher<T> MakeMatcher(const MatcherInterface<T>* impl) {
+ return Matcher<T>(impl);
+}
+
+// Creates a polymorphic matcher from its implementation. This is
+// easier to use than the PolymorphicMatcher<Impl> constructor as it
+// doesn't require you to explicitly write the template argument, e.g.
+//
+// MakePolymorphicMatcher(foo);
+// vs
+// PolymorphicMatcher<TypeOfFoo>(foo);
+template <class Impl>
+inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl& impl) {
+ return PolymorphicMatcher<Impl>(impl);
+}
+
+namespace internal {
+// Implements a matcher that compares a given value with a
+// pre-supplied value using one of the ==, <=, <, etc, operators. The
+// two values being compared don't have to have the same type.
+//
+// The matcher defined here is polymorphic (for example, Eq(5) can be
+// used to match an int, a short, a double, etc). Therefore we use
+// a template type conversion operator in the implementation.
+//
+// The following template definition assumes that the Rhs parameter is
+// a "bare" type (i.e. neither 'const T' nor 'T&').
+template <typename D, typename Rhs, typename Op>
+class ComparisonBase {
+ public:
+ explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
+
+ using is_gtest_matcher = void;
+
+ template <typename Lhs>
+ bool MatchAndExplain(const Lhs& lhs, std::ostream*) const {
+ return Op()(lhs, Unwrap(rhs_));
+ }
+ void DescribeTo(std::ostream* os) const {
+ *os << D::Desc() << " ";
+ UniversalPrint(Unwrap(rhs_), os);
+ }
+ void DescribeNegationTo(std::ostream* os) const {
+ *os << D::NegatedDesc() << " ";
+ UniversalPrint(Unwrap(rhs_), os);
+ }
+
+ private:
+ template <typename T>
+ static const T& Unwrap(const T& v) {
+ return v;
+ }
+ template <typename T>
+ static const T& Unwrap(std::reference_wrapper<T> v) {
+ return v;
+ }
+
+ Rhs rhs_;
+};
+
+template <typename Rhs>
+class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
+ public:
+ explicit EqMatcher(const Rhs& rhs)
+ : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
+ static const char* Desc() { return "is equal to"; }
+ static const char* NegatedDesc() { return "isn't equal to"; }
+};
+template <typename Rhs>
+class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
+ public:
+ explicit NeMatcher(const Rhs& rhs)
+ : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
+ static const char* Desc() { return "isn't equal to"; }
+ static const char* NegatedDesc() { return "is equal to"; }
+};
+template <typename Rhs>
+class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
+ public:
+ explicit LtMatcher(const Rhs& rhs)
+ : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
+ static const char* Desc() { return "is <"; }
+ static const char* NegatedDesc() { return "isn't <"; }
+};
+template <typename Rhs>
+class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
+ public:
+ explicit GtMatcher(const Rhs& rhs)
+ : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
+ static const char* Desc() { return "is >"; }
+ static const char* NegatedDesc() { return "isn't >"; }
+};
+template <typename Rhs>
+class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
+ public:
+ explicit LeMatcher(const Rhs& rhs)
+ : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
+ static const char* Desc() { return "is <="; }
+ static const char* NegatedDesc() { return "isn't <="; }
+};
+template <typename Rhs>
+class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
+ public:
+ explicit GeMatcher(const Rhs& rhs)
+ : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
+ static const char* Desc() { return "is >="; }
+ static const char* NegatedDesc() { return "isn't >="; }
+};
+
+template <typename T, typename = typename std::enable_if<
+ std::is_constructible<std::string, T>::value>::type>
+using StringLike = T;
+
+// Implements polymorphic matchers MatchesRegex(regex) and
+// ContainsRegex(regex), which can be used as a Matcher<T> as long as
+// T can be converted to a string.
+class MatchesRegexMatcher {
+ public:
+ MatchesRegexMatcher(const RE* regex, bool full_match)
+ : regex_(regex), full_match_(full_match) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+ bool MatchAndExplain(const internal::StringView& s,
+ MatchResultListener* listener) const {
+ return MatchAndExplain(std::string(s), listener);
+ }
+#endif // GTEST_INTERNAL_HAS_STRING_VIEW
+
+ // Accepts pointer types, particularly:
+ // const char*
+ // char*
+ // const wchar_t*
+ // wchar_t*
+ template <typename CharType>
+ bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+ return s != nullptr && MatchAndExplain(std::string(s), listener);
+ }
+
+ // Matches anything that can convert to std::string.
+ //
+ // This is a template, not just a plain function with const std::string&,
+ // because absl::string_view has some interfering non-explicit constructors.
+ template <class MatcheeStringType>
+ bool MatchAndExplain(const MatcheeStringType& s,
+ MatchResultListener* /* listener */) const {
+ const std::string& s2(s);
+ return full_match_ ? RE::FullMatch(s2, *regex_)
+ : RE::PartialMatch(s2, *regex_);
+ }
+
+ void DescribeTo(::std::ostream* os) const {
+ *os << (full_match_ ? "matches" : "contains") << " regular expression ";
+ UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+ }
+
+ void DescribeNegationTo(::std::ostream* os) const {
+ *os << "doesn't " << (full_match_ ? "match" : "contain")
+ << " regular expression ";
+ UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+ }
+
+ private:
+ const std::shared_ptr<const RE> regex_;
+ const bool full_match_;
+};
+} // namespace internal
+
+// Matches a string that fully matches regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+ const internal::RE* regex) {
+ return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
+}
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+ const internal::StringLike<T>& regex) {
+ return MatchesRegex(new internal::RE(std::string(regex)));
+}
+
+// Matches a string that contains regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+ const internal::RE* regex) {
+ return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
+}
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+ const internal::StringLike<T>& regex) {
+ return ContainsRegex(new internal::RE(std::string(regex)));
+}
+
+// Creates a polymorphic matcher that matches anything equal to x.
+// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
+// wouldn't compile.
+template <typename T>
+inline internal::EqMatcher<T> Eq(T x) {
+ return internal::EqMatcher<T>(x);
+}
+
+// Constructs a Matcher<T> from a 'value' of type T. The constructed
+// matcher matches any value that's equal to 'value'.
+template <typename T>
+Matcher<T>::Matcher(T value) {
+ *this = Eq(value);
+}
+
+// Creates a monomorphic matcher that matches anything with type Lhs
+// and equal to rhs. A user may need to use this instead of Eq(...)
+// in order to resolve an overloading ambiguity.
+//
+// TypedEq<T>(x) is just a convenient short-hand for Matcher<T>(Eq(x))
+// or Matcher<T>(x), but more readable than the latter.
+//
+// We could define similar monomorphic matchers for other comparison
+// operations (e.g. TypedLt, TypedGe, etc.), but decided not to do
+// it yet as those are used much less than Eq() in practice. A user
+// can always write Matcher<T>(Lt(5)) to be explicit about the type,
+// for example.
+template <typename Lhs, typename Rhs>
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+ return Eq(rhs);
+}
+
+// Creates a polymorphic matcher that matches anything >= x.
+template <typename Rhs>
+inline internal::GeMatcher<Rhs> Ge(Rhs x) {
+ return internal::GeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything > x.
+template <typename Rhs>
+inline internal::GtMatcher<Rhs> Gt(Rhs x) {
+ return internal::GtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything <= x.
+template <typename Rhs>
+inline internal::LeMatcher<Rhs> Le(Rhs x) {
+ return internal::LeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything < x.
+template <typename Rhs>
+inline internal::LtMatcher<Rhs> Lt(Rhs x) {
+ return internal::LtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything != x.
+template <typename Rhs>
+inline internal::NeMatcher<Rhs> Ne(Rhs x) {
+ return internal::NeMatcher<Rhs>(x);
+}
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
new file mode 100644
index 0000000000..6c8bf90009
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
@@ -0,0 +1,218 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitations of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+#include <memory>
+#include <sstream>
+
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+// 1. You stream a bunch of values to a Message object.
+// It will remember the text in a stringstream.
+// 2. Then you stream the Message object to an ostream.
+// This causes the text in the Message to be streamed
+// to the ostream.
+//
+// For example:
+//
+// testing::Message foo;
+// foo << 1 << " != " << 2;
+// std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from. In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC. You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do). The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+ // The type of basic IO manipulators (endl, ends, and flush) for
+ // narrow streams.
+ typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+ // Constructs an empty Message.
+ Message();
+
+ // Copy constructor.
+ Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT
+ *ss_ << msg.GetString();
+ }
+
+ // Constructs a Message from a C-string.
+ explicit Message(const char* str) : ss_(new ::std::stringstream) {
+ *ss_ << str;
+ }
+
+ // Streams a non-pointer value to this object.
+ template <typename T>
+ inline Message& operator<<(const T& val) {
+ // Some libraries overload << for STL containers. These
+ // overloads are defined in the global namespace instead of ::std.
+ //
+ // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+ // overloads are visible in either the std namespace or the global
+ // namespace, but not other namespaces, including the testing
+ // namespace which Google Test's Message class is in.
+ //
+ // To allow STL containers (and other types that have a << operator
+ // defined in the global namespace) to be used in Google Test
+ // assertions, testing::Message must access the custom << operator
+ // from the global namespace. With this using declaration,
+ // overloads of << defined in the global namespace and those
+ // visible via Koenig lookup are both exposed in this function.
+ using ::operator<<;
+ *ss_ << val;
+ return *this;
+ }
+
+ // Streams a pointer value to this object.
+ //
+ // This function is an overload of the previous one. When you
+ // stream a pointer to a Message, this definition will be used as it
+ // is more specialized. (The C++ Standard, section
+ // [temp.func.order].) If you stream a non-pointer, then the
+ // previous definition will be used.
+ //
+ // The reason for this overload is that streaming a NULL pointer to
+ // ostream is undefined behavior. Depending on the compiler, you
+ // may get "0", "(nil)", "(null)", or an access violation. To
+ // ensure consistent result across compilers, we always treat NULL
+ // as "(null)".
+ template <typename T>
+ inline Message& operator<<(T* const& pointer) { // NOLINT
+ if (pointer == nullptr) {
+ *ss_ << "(null)";
+ } else {
+ *ss_ << pointer;
+ }
+ return *this;
+ }
+
+ // Since the basic IO manipulators are overloaded for both narrow
+ // and wide streams, we have to provide this specialized definition
+ // of operator <<, even though its body is the same as the
+ // templatized version above. Without this definition, streaming
+ // endl or other basic IO manipulators to Message will confuse the
+ // compiler.
+ Message& operator<<(BasicNarrowIoManip val) {
+ *ss_ << val;
+ return *this;
+ }
+
+ // Instead of 1/0, we want to see true/false for bool values.
+ Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
+
+ // These two overloads allow streaming a wide C string to a Message
+ // using the UTF-8 encoding.
+ Message& operator<<(const wchar_t* wide_c_str);
+ Message& operator<<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+ // Converts the given wide string to a narrow string using the UTF-8
+ // encoding, and streams the result to this Message object.
+ Message& operator<<(const ::std::wstring& wstr);
+#endif // GTEST_HAS_STD_WSTRING
+
+ // Gets the text streamed to this object so far as an std::string.
+ // Each '\0' character in the buffer is replaced with "\\0".
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ std::string GetString() const;
+
+ private:
+ // We'll hold the text streamed to this object here.
+ const std::unique_ptr< ::std::stringstream> ss_;
+
+ // We declare (but don't implement) this to prevent the compiler
+ // from implementing the assignment operator.
+ void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
+ return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string. A NULL pointer is
+// converted to "(null)". When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+ return (Message() << streamable).GetString();
+}
+
+} // namespace internal
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
new file mode 100644
index 0000000000..b55119ac62
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
@@ -0,0 +1,510 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing and Mocking Framework (Google Test)
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+ // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+ // Inside a test, access the test parameter with the GetParam() method
+ // of the TestWithParam<T> class:
+ EXPECT_TRUE(foo.Blah(GetParam()));
+ ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+ ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+// Range(begin, end [, step]) - Yields values {begin, begin+step,
+// begin+step+step, ...}. The values do not
+// include end. step defaults to 1.
+// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}.
+// ValuesIn(container) - Yields values from a C-style array, an STL
+// ValuesIn(begin,end) container, or an iterator range [begin, end).
+// Bool() - Yields sequence {false, true}.
+// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product
+// for the math savvy) of the values generated
+// by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test suite
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_SUITE_P(InstantiationName,
+ FooTest,
+ Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the
+// actual test suite name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+// * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+// * InstantiationName/FooTest.DoesBlah/1 for "miny"
+// * InstantiationName/FooTest.DoesBlah/2 for "moe"
+// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+// * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+// * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests
+// in the given test suite, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_SUITE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user, on the one hand, to adjust generator parameters in
+// order to dynamically determine a set of tests to run and, on the other
+// hand, to inspect the generated tests with the Google Test reflection API
+// before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+ // You can inherit all the usual members for a non-parameterized test
+ // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+ // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+ // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+ // GetParam works just the same here as if you inherit from TestWithParam.
+ EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif // 0
+
+#include <iterator>
+#include <utility>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test suite is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test suite FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+// - returns a generator producing a sequence of values {start, start+1,
+// start+2, ...}.
+// Range(start, end, step)
+// - returns a generator producing a sequence of values {start, start+step,
+// start+step+step, ...}.
+// Notes:
+// * The generated sequences never include end. For example, Range(1, 5)
+// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+// returns a generator producing {1, 3, 5, 7}.
+// * start and end must have the same type. That type may be any integral or
+// floating-point type or a user defined type satisfying these conditions:
+// * It must be assignable (have operator=() defined).
+// * It must have operator+() (operator+(int-compatible type) for
+// two-operand version).
+// * It must have operator<() defined.
+// Elements in the resulting sequences will also have that type.
+// * Condition start < end must be satisfied in order for resulting sequences
+// to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+ return internal::ParamGenerator<T>(  // Receives the generator made with new.
+ new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+ return Range(start, end, 1);  // Two-argument form: the step is 1.
+}
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+// - returns a generator producing sequences with elements from
+// a C-style array.
+// ValuesIn(const Container& container)
+// - returns a generator producing sequences with elements from
+// an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+// - returns a generator producing sequences with elements from
+// a range [begin, end) defined by a pair of STL-style iterators. These
+// iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test suite StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test suite StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+// ::std::vector< ::std::string> v;
+// v.push_back("a");
+// v.push_back("b");
+// return v;
+// }
+//
+// INSTANTIATE_TEST_SUITE_P(CharSequence,
+// StlStringTest,
+// ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+// ::std::list<char> list;
+// list.push_back('a');
+// list.push_back('b');
+// return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_SUITE_P(CharSequence2,
+// CharTest,
+// ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+ typename std::iterator_traits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {  // Range [begin, end).
+ typedef typename std::iterator_traits<ForwardIterator>::value_type ParamType;
+ return internal::ParamGenerator<ParamType>(  // Parameters have value_type.
+ new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+ return ValuesIn(array, array + N);  // All N elements, as a pointer range.
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container) {
+ return ValuesIn(container.begin(), container.end());  // Whole container.
+}
+
+// Values() allows generating tests from an explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+// - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test suite BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_SUITE_P(NumSequence,
+// BarTest,
+// Values("one", "two", "three"));
+//
+// This instantiates tests from test suite BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+//
+template <typename... T>
+internal::ValueArray<T...> Values(T... v) {
+ return internal::ValueArray<T...>(std::move(v)...);  // v is by value; move on.
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+// - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test suite FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+// virtual void SetUp() {
+// external_flag = GetParam();
+// }
+// };
+// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
+// Values(false, true) is converted to ParamGenerator<bool> on return.
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
+
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+// - returns a generator producing sequences with elements coming from
+// the Cartesian product of elements from the sequences generated by
+// gen1, gen2, ..., genN. The sequence elements will have a type of
+// std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+// of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Example:
+//
+// This will instantiate tests in test suite AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+// : public testing::TestWithParam<std::tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest,
+// Combine(Values("cat", "dog"),
+// Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+// : public testing::TestWithParam<std::tuple<bool, bool> > {
+// virtual void SetUp() {
+// // Assigns external_flag_1 and external_flag_2 values from the tuple.
+// std::tie(external_flag_1, external_flag_2) = GetParam();
+// }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+// // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest,
+// Combine(Bool(), Bool()));
+//
+template <typename... Generator>
+internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
+ return internal::CartesianProductHolder<Generator...>(g...);  // All of g1..gN.
+}
+
+#define TEST_P(test_suite_name, test_name) \
+ class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ : public test_suite_name { /* the test class derives from the fixture */ \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \
+ void TestBody() override; /* defined by the block after TEST_P(...) */ \
+ \
+ private: \
+ static int AddToRegistry() { /* adds this test pattern; returns 0 */ \
+ ::testing::UnitTest::GetInstance() \
+ ->parameterized_test_registry() \
+ .GetTestSuitePatternHolder<test_suite_name>( \
+ GTEST_STRINGIFY_(test_suite_name), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
+ ->AddTestPattern( \
+ GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \
+ new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
+ test_suite_name, test_name)>(), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)); \
+ return 0; \
+ } \
+ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; /* set below */ \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
+ }; \
+ int GTEST_TEST_CLASS_NAME_(test_suite_name, /* initializer registers */ \
+ test_name)::gtest_registering_dummy_ = \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \
+ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
+
+// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify a
+// generator and an optional function or functor that generates custom test
+// name suffixes based on the test parameters. Such a function or functor
+// should accept one argument of type testing::TestParamInfo<class ParamType>,
+// and return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()).
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
+
+#define GTEST_EXPAND_(arg) arg  // Expands to its argument unchanged.
+#define GTEST_GET_FIRST_(first, ...) first  // Keeps the first argument only.
+#define GTEST_GET_SECOND_(first, second, ...) second  // Keeps the second only.
+
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
+ static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
+ gtest_##prefix##test_suite_name##_EvalGenerator_() { \
+ return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); /* the generator argument */ \
+ } \
+ static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
+ if (::testing::internal::AlwaysFalse()) { /* NOTE(review): presumably never taken - confirm */ \
+ ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))); \
+ auto t = std::make_tuple(__VA_ARGS__); \
+ static_assert(std::tuple_size<decltype(t)>::value <= 2, \
+ "Too Many Args!"); /* at most: generator, name generator */ \
+ } \
+ return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))))(info); /* 2nd argument if given, else DefaultParamName */ \
+ } \
+ static int gtest_##prefix##test_suite_name##_dummy_ /* initializer registers the instantiation */ \
+ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance() \
+ ->parameterized_test_registry() \
+ .GetTestSuitePatternHolder<test_suite_name>( \
+ GTEST_STRINGIFY_(test_suite_name), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
+ ->AddTestSuiteInstantiation( \
+ GTEST_STRINGIFY_(prefix), \
+ &gtest_##prefix##test_suite_name##_EvalGenerator_, \
+ &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
+ __FILE__, __LINE__)
+
+// Allows marking a parameterized test class as not needing to be instantiated.
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
+ namespace gtest_do_not_use_outside_namespace_scope {} /* compiles at namespace scope only */ \
+ static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
+ GTEST_STRINGIFY_(T))
+
+// Deprecated legacy spelling; expands to INSTANTIATE_TEST_SUITE_P.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TEST_CASE_P \
+ static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \
+ ""); \
+ INSTANTIATE_TEST_SUITE_P
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
new file mode 100644
index 0000000000..a91e8b8b10
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
@@ -0,0 +1,1048 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T. More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+// 1. foo::PrintTo(const T&, ostream*)
+// 2. operator<<(ostream&, const T&) defined in either foo or the
+// global namespace.
+//
+// However if T is an STL-style container then it is printed element-wise
+// unless foo::PrintTo(const T&, ostream*) is defined. Note that
+// operator<<() is ignored for container types.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+// // Prints a value to a string. For a (const or not) char
+// // pointer, the NUL-terminated string (but not the pointer) is
+// // printed.
+// std::string ::testing::PrintToString(const T& value);
+//
+// // Prints a value tersely: for a reference type, the referenced
+// // value (but not the address) is printed; for a (const or not) char
+// // pointer, the NUL-terminated string (but not the pointer) is
+// // printed.
+// void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+// // Prints value using the type inferred by the compiler. The difference
+// // from UniversalTersePrint() is that this function prints both the
+// // pointer and the NUL-terminated string for a (const or not) char pointer.
+// void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+// // Prints the fields of a tuple tersely to a string vector, one
+// // element for each field. Tuple support must be enabled in
+// // gtest-port.h.
+// std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+// const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container. When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect. In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator. We'll fix this if there's an
+// actual need for it. Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <functional>
+#include <memory>
+#include <ostream> // NOLINT
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Definitions in the internal* namespaces are subject to change without notice.
+// DO NOT USE THEM IN USER CODE!
+namespace internal {
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it. The output looks like "{ a, b, c }", or "{}" if empty.
+struct ContainerPrinter {
+ template <typename T,
+ typename = typename std::enable_if<
+ (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+ !IsRecursiveContainer<T>::value>::type>
+ static void PrintValue(const T& container, std::ostream* os) {
+ const size_t kMaxCount = 32; // The maximum number of elements to print.
+ *os << '{';
+ size_t count = 0;
+ for (auto&& elem : container) {
+ if (count > 0) {  // Not the first element: write the separator.
+ *os << ',';
+ if (count == kMaxCount) { // Enough has been printed.
+ *os << " ...";
+ break;
+ }
+ }
+ *os << ' ';
+ // We cannot call PrintTo(elem, os) here as PrintTo() doesn't
+ // handle `elem` being a native array.
+ internal::UniversalPrint(elem, os);
+ ++count;
+ }
+
+ if (count > 0) {  // Something was printed: pad before the closing brace.
+ *os << ' ';
+ }
+ *os << '}';
+ }
+};
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it. (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space. Their representation is
+// implementation-defined. Therefore they will be printed as raw
+// bytes.) This printer accepts only pointers to functions.
+struct FunctionPointerPrinter {
+ template <typename T, typename = typename std::enable_if<
+ std::is_function<T>::value>::type>
+ static void PrintValue(T* p, ::std::ostream* os) {
+ if (p == nullptr) {
+ *os << "NULL";
+ } else {
+ // T is a function type, so '*os << p' doesn't do what we want
+ // (it just prints p as bool). We want to print p as a const
+ // void*.
+ *os << reinterpret_cast<const void*>(p);
+ }
+ }
+};
+
+struct PointerPrinter {  // Prints "NULL" for a null T*, else streams it.
+ template <typename T>
+ static void PrintValue(T* p, ::std::ostream* os) {
+ if (p == nullptr) {
+ *os << "NULL";
+ } else {
+ // T is not a function type. We just call << to print p,
+ // relying on ADL to pick up user-defined << for their pointer
+ // types, if any.
+ *os << p;
+ }
+ }
+};
+
+namespace internal_stream_operator_without_lexical_name_lookup {
+
+// The presence of an operator<< here will terminate lexical scope lookup
+// straight away (even though it cannot be a match because of its argument
+// types). Thus, the two operator<< calls in StreamPrinter will find only ADL
+// candidates.
+struct LookupBlocker {};
+void operator<<(LookupBlocker, LookupBlocker);  // Declaration only.
+
+struct StreamPrinter {
+ template <typename T,
+ // Don't accept member pointers here. We'd print them via implicit
+ // conversion to bool, which isn't useful.
+ typename = typename std::enable_if<
+ !std::is_member_pointer<T>::value>::type,
+ // Only accept types for which we can find a streaming operator via
+ // ADL (possibly involving implicit conversions).
+ typename = decltype(std::declval<std::ostream&>()
+ << std::declval<const T&>())>
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ // Call streaming operator found by ADL, possibly with implicit conversions
+ // of the arguments.
+ *os << value;
+ }
+};
+
+} // namespace internal_stream_operator_without_lexical_name_lookup
+
+struct ProtobufPrinter {
+ // We print a protobuf using its ShortDebugString() when the string
+ // doesn't exceed this many characters; otherwise we print it using
+ // DebugString() for better readability.
+ static const size_t kProtobufOneLinerMaxLength = 50;
+
+ template <typename T,
+ typename = typename std::enable_if<
+ internal::HasDebugStringAndShortDebugString<T>::value>::type>
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ std::string pretty_str = value.ShortDebugString();
+ if (pretty_str.length() > kProtobufOneLinerMaxLength) {
+ pretty_str = "\n" + value.DebugString();  // Long form starts on a new line.
+ }
+ *os << ("<" + pretty_str + ">");  // The text is wrapped in angle brackets.
+ }
+};
+
+struct ConvertibleToIntegerPrinter {
+ // Since T has no << operator or PrintTo() but can be implicitly
+ // converted to BiggestInt, we print it as a BiggestInt.
+ //
+ // Most likely T is an enum type (either named or unnamed), in which
+ // case printing it as an integer is the desired behavior. In case
+ // T is not an enum, printing it as an integer is the best we can do
+ // given that it has no user-defined printer.
+ static void PrintValue(internal::BiggestInt value, ::std::ostream* os) {
+ *os << value;  // The caller's argument was converted to BiggestInt already.
+ }
+};
+
+struct ConvertibleToStringViewPrinter {
+#if GTEST_INTERNAL_HAS_STRING_VIEW  // Otherwise this struct has no PrintValue.
+ static void PrintValue(internal::StringView value, ::std::ostream* os) {
+ internal::UniversalPrint(value, os);
+ }
+#endif
+};
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+ size_t count, ::std::ostream* os);
+struct RawBytesPrinter {
+ // SFINAE on `sizeof` to make sure we have a complete type.
+ template <typename T, size_t = sizeof(T)>
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ PrintBytesInObjectTo(
+ static_cast<const unsigned char*>(
+ // Load bearing cast to void* to support iOS
+ reinterpret_cast<const void*>(std::addressof(value))),
+ sizeof(value), os);  // The byte count is the full size of the object.
+ }
+};
+
+struct FallbackPrinter {  // Accepts any T; the value itself is ignored.
+ template <typename T>
+ static void PrintValue(const T&, ::std::ostream* os) {
+ *os << "(incomplete type)";
+ }
+};
+
+// Picks the first Printer whose PrintValue(const T&, nullptr) has type E.
+template <typename T, typename E, typename Printer, typename... Printers>
+struct FindFirstPrinter : FindFirstPrinter<T, E, Printers...> {};
+
+template <typename T, typename Printer, typename... Printers>
+struct FindFirstPrinter<
+ T, decltype(Printer::PrintValue(std::declval<const T&>(), nullptr)),
+ Printer, Printers...> {
+ using type = Printer;
+};
+
+// Select the best printer in the following order:
+// - Print containers (they have begin/end/etc).
+// - Print function pointers.
+// - Print object pointers.
+// - Use the stream operator, if available.
+// - Print protocol buffers.
+// - Print types convertible to BiggestInt.
+// - Print types convertible to StringView, if available.
+// - Fall back to the raw bytes, or to "(incomplete type)" if T is incomplete.
+template <typename T>
+void PrintWithFallback(const T& value, ::std::ostream* os) {
+ using Printer = typename FindFirstPrinter<
+ T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter,
+ internal_stream_operator_without_lexical_name_lookup::StreamPrinter,
+ ProtobufPrinter, ConvertibleToIntegerPrinter,
+ ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type;
+ Printer::PrintValue(value, os);
+}
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value. In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object. If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case: the type of the other operand is ignored and the value
+// is formatted with ::testing::PrintToString().
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+ // Returns the printed form of `value` as a string.
+ static ::std::string Format(const ToPrint& value) {
+ return ::testing::PrintToString(value);
+ }
+};
+
+// Array. An operand of type ToPrint[N] is received as a pointer to its first
+// element and formatted exactly like a `const ToPrint*` operand would be, so
+// the pointer specializations below also apply to arrays.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+ // Delegates to the `const ToPrint*` formatter for the same OtherOperand.
+ static ::std::string Format(const ToPrint* value) {
+ return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+ }
+};
+
+// By default, print C string as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+// Defines the partial specialization
+// FormatForComparison<CharType*, OtherOperand>, whose Format() prints the
+// pointer as a raw address (through const void*) instead of dereferencing it
+// as a string.
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+// These specializations only name the char8_t type itself, so they are gated
+// on the language feature-test macro __cpp_char8_t, like the other char8_t
+// guards visible in this header, rather than on the library macro
+// __cpp_lib_char8_t.
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+// Defines the full specialization
+// FormatForComparison<CharType*, OtherStringType>, whose Format() passes the
+// pointer to ::testing::PrintToString() unchanged (no cast to void*).
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+ template <> \
+ class FormatForComparison<CharType*, OtherStringType> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(value); \
+ } \
+ }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string);
+
+// The wide-string variants name ::std::wstring, so they are only provided
+// when GTEST_HAS_STD_WSTRING is set.
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message. The type (but not value)
+// of the other operand may affect the format. This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// The second argument is used only for template argument deduction of T2;
+// its value is never read.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(const T1& value,
+ const T2& /* other_operand */) {
+ return FormatForComparison<T1, T2>::Format(value);
+}
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream. The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it. This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined. We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+ internal::PrintWithFallback(value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+ // When printing a plain char, we always treat it as unsigned. This
+ // way, the output won't be affected by whether the compiler thinks
+ // char is signed or not.
+ PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+// Prints a bool as the literal text "true" or "false".
+inline void PrintTo(bool x, ::std::ostream* os) {
+  if (x) {
+    *os << "true";
+  } else {
+    *os << "false";
+  }
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
+inline void PrintTo(char16_t c, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#ifdef __cpp_char8_t
+inline void PrintTo(char8_t c, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#endif
+
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif // __SIZEOF_INT128__
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+#ifdef __cpp_char8_t
+// Overloads for u8 strings.
+GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
+inline void PrintTo(char8_t* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const char8_t*>(s), os);
+}
+#endif
+// Overloads for u16 strings.
+GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os);
+inline void PrintTo(char16_t* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const char16_t*>(s), os);
+}
+// Overloads for u32 strings.
+GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os);
+inline void PrintTo(char32_t* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const char32_t*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type. When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays. Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces. Elements are printed with UniversalPrint() and
+// separated by ", ". Prints nothing when count is 0.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  // Guard the empty case: without it a[0] would be read out of bounds and
+  // the loop below (which starts at 1 and tests `i != count`) would not stop
+  // at the end of the array.
+  if (count == 0) return;
+
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::std::string.
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+ PrintStringTo(s, os);
+}
+
+// Overloads for ::std::u8string
+#ifdef __cpp_char8_t
+GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
+ PrintU8StringTo(s, os);
+}
+#endif
+
+// Overloads for ::std::u16string
+GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) {
+ PrintU16StringTo(s, os);
+}
+
+// Overloads for ::std::u32string
+GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
+ PrintU32StringTo(s, os);
+}
+
+// Overloads for ::std::wstring.
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+ PrintWideStringTo(s, os);
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Overload for internal::StringView.
+inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
+ PrintTo(::std::string(sp), os);
+}
+#endif // GTEST_INTERNAL_HAS_STRING_VIEW
+
+inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+ *os << internal::GetTypeName(info);
+}
+#endif // GTEST_HAS_RTTI
+
+template <typename T>
+void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
+ UniversalPrinter<T&>::Print(ref.get(), os);
+}
+
+inline const void* VoidifyPointer(const void* p) { return p; }
+inline const void* VoidifyPointer(volatile const void* p) {
+ return const_cast<const void*>(p);
+}
+
+// Address-only overload, selected through the `char` tag parameter. The
+// callers in this header pass the literal 0, so this overload is used when
+// the `int`-tagged overload is disabled (T is void or an array type).
+template <typename T, typename Ptr>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+    return;
+  }
+  // We can't print the value. Just print the pointer.
+  *os << "(" << (VoidifyPointer)(ptr.get()) << ")";
+}
+// Address-and-value overload, selected through the `int` tag parameter (an
+// exact match for the literal 0 the callers in this header pass). Enabled
+// only when T is neither void nor an array type, so that *ptr can be
+// printed.
+template <typename T, typename Ptr,
+          typename = typename std::enable_if<!std::is_void<T>::value &&
+                                             !std::is_array<T>::value>::type>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+    return;
+  }
+  *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = ";
+  UniversalPrinter<T>::Print(*ptr, os);
+  *os << ")";
+}
+
+// Prints a std::unique_ptr through PrintSmartPointer<T>(). The literal 0 is
+// an int, so the `int`-tagged overload is preferred when it is enabled and
+// the `char`-tagged one is used otherwise. The parentheses around the
+// function name suppress argument-dependent lookup.
+template <typename T, typename D>
+void PrintTo(const std::unique_ptr<T, D>& ptr, std::ostream* os) {
+ (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+// Prints a std::shared_ptr through PrintSmartPointer<T>(). The literal 0 is
+// an int, so the `int`-tagged overload is preferred when it is enabled and
+// the `char`-tagged one is used otherwise.
+template <typename T>
+void PrintTo(const std::shared_ptr<T>& ptr, std::ostream* os) {
+ (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+// Helper function for printing a tuple. T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T&, std::integral_constant<size_t, 0>,
+ ::std::ostream*) {}
+
+// Recursive case: first prints fields 0 .. I - 2 of `t` by recursing with
+// I - 1, then prints the field at index I - 1, preceded by ", " unless it is
+// the first field (I == 1).
+template <typename T, size_t I>
+void PrintTupleTo(const T& t, std::integral_constant<size_t, I>,
+ ::std::ostream* os) {
+ PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
+ // `I > 1` is a compile-time constant; the PUSH/POP pair brackets exactly
+ // that condition (presumably to silence constant-condition warnings).
+ GTEST_INTENTIONAL_CONST_COND_PUSH_()
+ if (I > 1) {
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ *os << ", ";
+ }
+ // Use the declared element type so that reference members are printed
+ // through the matching UniversalPrinter specialization.
+ UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
+ std::get<I - 1>(t), os);
+}
+
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+ *os << "(";
+ PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
+ *os << ")";
+}
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+ *os << '(';
+ // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+ // a reference type. The same for printing value.second.
+ UniversalPrinter<T1>::Print(value.first, os);
+ *os << ", ";
+ UniversalPrinter<T2>::Print(value.second, os);
+ *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+ // MSVC warns about adding const to a function type, so we want to
+ // disable the warning.
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+ // Note: we deliberately don't call this PrintTo(), as that name
+ // conflicts with ::testing::internal::PrintTo in the body of the
+ // function.
+ static void Print(const T& value, ::std::ostream* os) {
+ // By default, ::testing::internal::PrintTo() is used for printing
+ // the value.
+ //
+ // Thanks to Koenig look-up, if T is a class and has its own
+ // PrintTo() function defined in its namespace, that function will
+ // be visible here. Since it is more specific than the generic ones
+ // in ::testing::internal, it will be picked by the compiler in the
+ // following statement - exactly what we want.
+ PrintTo(value, os);
+ }
+
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Remove any const-qualifiers before passing a type to UniversalPrinter.
+template <typename T>
+class UniversalPrinter<const T> : public UniversalPrinter<T> {};
+
+#if GTEST_INTERNAL_HAS_ANY
+
+// Printer for std::any / absl::any
+
+template <>
+class UniversalPrinter<Any> {
+ public:
+  // Prints "no value" for an empty Any; otherwise prints "value of type "
+  // followed by the name of the held type. The held value itself is not
+  // printed.
+  static void Print(const Any& value, ::std::ostream* os) {
+    if (!value.has_value()) {
+      *os << "no value";
+      return;
+    }
+    *os << "value of type " << GetTypeName(value);
+  }
+
+ private:
+  // Returns the name of the type held by `value` when RTTI is available, or
+  // the fixed string "<unknown_type>" otherwise.
+  static std::string GetTypeName(const Any& value) {
+#if GTEST_HAS_RTTI
+    return internal::GetTypeName(value.type());
+#else
+    static_cast<void>(value);  // possibly unused
+    return "<unknown_type>";
+#endif  // GTEST_HAS_RTTI
+  }
+};
+
+#endif // GTEST_INTERNAL_HAS_ANY
+
+#if GTEST_INTERNAL_HAS_OPTIONAL
+
+// Printer for std::optional / absl::optional
+
+template <typename T>
+class UniversalPrinter<Optional<T>> {
+ public:
+  // Prints "(nullopt)" for a disengaged optional, or the contained value
+  // wrapped in parentheses otherwise.
+  static void Print(const Optional<T>& value, ::std::ostream* os) {
+    *os << '(';
+    if (value) {
+      UniversalPrint(*value, os);
+    } else {
+      *os << "nullopt";
+    }
+    *os << ')';
+  }
+};
+
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+ static void Print(decltype(Nullopt()), ::std::ostream* os) {
+ *os << "(nullopt)";
+ }
+};
+
+#endif // GTEST_INTERNAL_HAS_OPTIONAL
+
+#if GTEST_INTERNAL_HAS_VARIANT
+
+// Printer for std::variant / absl::variant
+
+template <typename... T>
+class UniversalPrinter<Variant<T...>> {
+ public:
+ // Prints the active alternative as
+ //   ('<type name>(index = <i>)' with value <value>)
+ // dispatching through absl::visit or std::visit depending on
+ // GTEST_HAS_ABSL.
+ static void Print(const Variant<T...>& value, ::std::ostream* os) {
+ *os << '(';
+#if GTEST_HAS_ABSL
+ absl::visit(Visitor{os, value.index()}, value);
+#else
+ std::visit(Visitor{os, value.index()}, value);
+#endif // GTEST_HAS_ABSL
+ *os << ')';
+ }
+
+ private:
+ // Callable handed to visit(); invoked with the active alternative `u`.
+ struct Visitor {
+ template <typename U>
+ void operator()(const U& u) const {
+ *os << "'" << GetTypeName<U>() << "(index = " << index
+ << ")' with value ";
+ UniversalPrint(u, os);
+ }
+ ::std::ostream* os; // Destination stream.
+ std::size_t index; // value.index() captured by Print().
+ };
+};
+
+#endif // GTEST_INTERNAL_HAS_VARIANT
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'. An empty array is printed as "{}";
+// otherwise the elements are wrapped in "{ " and " }".
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+    return;
+  }
+
+  const size_t kThreshold = 18;
+  const size_t kChunkSize = 8;
+  *os << "{ ";
+  if (len > kThreshold) {
+    // Too many elements: omit the middle and print only the first and the
+    // last kChunkSize elements.
+    PrintRawArrayTo(begin, kChunkSize, os);
+    *os << ", ..., ";
+    PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+  } else {
+    PrintRawArrayTo(begin, len, os);
+  }
+  *os << " }";
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+ ::std::ostream* os);
+
+#ifdef __cpp_char8_t
+// This overload prints a (const) char8_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
+ ::std::ostream* os);
+#endif
+
+// This overload prints a (const) char16_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len,
+ ::std::ostream* os);
+
+// This overload prints a (const) char32_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
+ ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+ ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+ // Prints the given array, omitting some elements when there are too
+ // many.
+ static void Print(const T (&a)[N], ::std::ostream* os) {
+ UniversalPrintArray(a, N, os);
+ }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Prints "@<address> " followed by the referenced value.
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value. std::addressof is used instead of
+    // the built-in & so that a T with an overloaded unary operator& still
+    // yields the real object address. We use reinterpret_cast here as
+    // static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(std::addressof(value)) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+ static void Print(const T& value, ::std::ostream* os) {
+ UniversalPrint(value, os);
+ }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+ static void Print(const T& value, ::std::ostream* os) {
+ UniversalPrint(value, os);
+ }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+ static void Print(const T (&value)[N], ::std::ostream* os) {
+ UniversalPrinter<T[N]>::Print(value, os);
+ }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  // Prints "NULL" for a null pointer; otherwise prints the NUL-terminated
+  // string `str` points to, formatted as a std::string.
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str != nullptr) {
+      UniversalPrint(std::string(str), os);
+      return;
+    }
+    *os << "NULL";
+  }
+};
+template <>
+class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
+};
+
+#ifdef __cpp_char8_t
+template <>
+class UniversalTersePrinter<const char8_t*> {
+ public:
+  // Prints "NULL" for a null pointer; otherwise prints the NUL-terminated
+  // string `str` points to, formatted as a std::u8string.
+  static void Print(const char8_t* str, ::std::ostream* os) {
+    if (str != nullptr) {
+      UniversalPrint(::std::u8string(str), os);
+      return;
+    }
+    *os << "NULL";
+  }
+};
+template <>
+class UniversalTersePrinter<char8_t*>
+ : public UniversalTersePrinter<const char8_t*> {};
+#endif
+
+template <>
+class UniversalTersePrinter<const char16_t*> {
+ public:
+  // Prints "NULL" for a null pointer; otherwise prints the NUL-terminated
+  // string `str` points to, formatted as a std::u16string.
+  static void Print(const char16_t* str, ::std::ostream* os) {
+    if (str != nullptr) {
+      UniversalPrint(::std::u16string(str), os);
+      return;
+    }
+    *os << "NULL";
+  }
+};
+template <>
+class UniversalTersePrinter<char16_t*>
+ : public UniversalTersePrinter<const char16_t*> {};
+
+template <>
+class UniversalTersePrinter<const char32_t*> {
+ public:
+  // Prints "NULL" for a null pointer; otherwise prints the NUL-terminated
+  // string `str` points to, formatted as a std::u32string.
+  static void Print(const char32_t* str, ::std::ostream* os) {
+    if (str != nullptr) {
+      UniversalPrint(::std::u32string(str), os);
+      return;
+    }
+    *os << "NULL";
+  }
+};
+template <>
+class UniversalTersePrinter<char32_t*>
+ : public UniversalTersePrinter<const char32_t*> {};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  // Prints "NULL" for a null pointer; otherwise prints the NUL-terminated
+  // wide string `str` points to, formatted as a std::wstring.
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str != nullptr) {
+      UniversalPrint(::std::wstring(str), os);
+      return;
+    }
+    *os << "NULL";
+  }
+};
+#endif
+
+// Terse printer for a mutable wide C string: forwards to the
+// `const wchar_t*` printer.
+// NOTE(review): unlike the `const wchar_t*` specialization above, this one is
+// not inside the GTEST_HAS_STD_WSTRING guard; when that macro is off the call
+// below resolves to the primary UniversalTersePrinter template instead.
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+ static void Print(wchar_t* str, ::std::ostream* os) {
+ UniversalTersePrinter<const wchar_t*>::Print(str, os);
+ }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+ UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler. The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly: go through a local alias instead.
+  using DeducedType = T;
+  UniversalPrinter<DeducedType>::Print(value, os);
+}
+
+typedef ::std::vector<::std::string> Strings;
+
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
+template <typename Tuple>
+void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
+ Strings*) {}
+// Recursive case: appends the terse form of fields 0 .. I - 2 of `t` by
+// recursing with I - 1, then appends the terse form of field I - 1.
+template <typename Tuple, size_t I>
+void TersePrintPrefixToStrings(const Tuple& t,
+                               std::integral_constant<size_t, I>,
+                               Strings* strings) {
+  using Previous = std::integral_constant<size_t, I - 1>;
+  TersePrintPrefixToStrings(t, Previous(), strings);
+
+  ::std::stringstream field_text;
+  UniversalTersePrint(std::get<I - 1>(t), &field_text);
+  strings->push_back(field_text.str());
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field. See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  // Tag type carrying the number of fields; selects the matching
+  // TersePrintPrefixToStrings() overload.
+  using FieldCount =
+      std::integral_constant<size_t, std::tuple_size<Tuple>::value>;
+
+  Strings fields;
+  TersePrintPrefixToStrings(value, FieldCount(), &fields);
+  return fields;
+}
+
+} // namespace internal
+
+// Returns the terse printed form of `value` (as produced by
+// internal::UniversalTersePrinter<T>) as a string.
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream printed;
+  internal::UniversalTersePrinter<T>::Print(value, &printed);
+  return printed.str();
+}
+
+} // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gtest/internal/custom/gtest-printers.h"
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
new file mode 100644
index 0000000000..bec8c4810b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -0,0 +1,248 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two arguments constructor.
+//
+// The class is non-copyable (copy constructor and copy assignment are
+// deleted below).
+class GTEST_API_ ScopedFakeTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ // The two possible mocking modes of this object.
+ enum InterceptMode {
+ INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures.
+ INTERCEPT_ALL_THREADS // Intercepts all failures.
+ };
+
+ // The c'tor sets this object as the test part result reporter used
+ // by Google Test. The 'result' parameter specifies where to report the
+ // results. This reporter will only catch failures generated in the current
+ // thread. DEPRECATED
+ explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+ // Same as above, but you can choose the interception scope of this object.
+ ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+ TestPartResultArray* result);
+
+ // The d'tor restores the previous test part result reporter.
+ ~ScopedFakeTestPartResultReporter() override;
+
+ // Appends the TestPartResult object to the TestPartResultArray
+ // received in the constructor.
+ //
+ // This method is from the TestPartResultReporterInterface
+ // interface.
+ void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+ // Defined outside this header; presumably the set-up shared by both
+ // constructors - TODO confirm against the implementation.
+ void Init();
+
+ // Presumably the interception scope chosen at construction - confirm.
+ const InterceptMode intercept_mode_;
+ // Presumably the reporter that was active before this object replaced it,
+ // kept so the destructor can restore it - confirm.
+ TestPartResultReporterInterface* old_reporter_;
+ // Presumably the array received in the constructor, to which intercepted
+ // results are appended - confirm.
+ TestPartResultArray* const result_;
+
+ ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+ delete;
+ ScopedFakeTestPartResultReporter& operator=(
+ const ScopedFakeTestPartResultReporter&) = delete;
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring. If that's not the case, a
+// non-fatal failure will be generated.
+//
+// The class is non-copyable (copy constructor and copy assignment are
+// deleted below).
+class GTEST_API_ SingleFailureChecker {
+ public:
+ // The constructor remembers the arguments.
+ SingleFailureChecker(const TestPartResultArray* results,
+ TestPartResult::Type type, const std::string& substr);
+ // Performs the verification described in the class comment.
+ ~SingleFailureChecker();
+
+ private:
+ // Presumably the remembered constructor arguments, in the same order
+ // (results, type, substr) - confirm against the implementation. Note that
+ // substr_ is held by value while results_ is a non-owning pointer type.
+ const TestPartResultArray* const results_;
+ const TestPartResult::Type type_;
+ const std::string substr_;
+
+ SingleFailureChecker(const SingleFailureChecker&) = delete;
+ SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
+};
+
+} // namespace internal
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+// - 'statement' cannot reference local non-static variables or
+// non-static members of the current object.
+// - 'statement' cannot return a value.
+// - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works. The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+// - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works. If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma. The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+// if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
new file mode 100644
index 0000000000..09cc8c34f0
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -0,0 +1,190 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+ // The possible outcomes of a test part (i.e. an assertion or an
+ // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+ enum Type {
+ kSuccess, // Succeeded.
+ kNonFatalFailure, // Failed but the test can continue.
+ kFatalFailure, // Failed and the test should be terminated.
+ kSkip // Skipped.
+ };
+
+ // C'tor. TestPartResult does NOT have a default constructor.
+ // Always use this constructor (with parameters) to create a
+ // TestPartResult object.
+ TestPartResult(Type a_type, const char* a_file_name, int a_line_number,
+ const char* a_message)
+ : type_(a_type),
+ file_name_(a_file_name == nullptr ? "" : a_file_name),
+ line_number_(a_line_number),
+ summary_(ExtractSummary(a_message)),
+ message_(a_message) {}
+
+ // Gets the outcome of the test part.
+ Type type() const { return type_; }
+
+ // Gets the name of the source file where the test part took place, or
+ // NULL if it's unknown.
+ const char* file_name() const {
+ return file_name_.empty() ? nullptr : file_name_.c_str();
+ }
+
+ // Gets the line in the source file where the test part took place,
+ // or -1 if it's unknown.
+ int line_number() const { return line_number_; }
+
+ // Gets the summary of the failure message.
+ const char* summary() const { return summary_.c_str(); }
+
+ // Gets the message associated with the test part.
+ const char* message() const { return message_.c_str(); }
+
+ // Returns true if and only if the test part was skipped.
+ bool skipped() const { return type_ == kSkip; }
+
+ // Returns true if and only if the test part passed.
+ bool passed() const { return type_ == kSuccess; }
+
+ // Returns true if and only if the test part non-fatally failed.
+ bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+ // Returns true if and only if the test part fatally failed.
+ bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ // Returns true if and only if the test part failed.
+ bool failed() const { return fatally_failed() || nonfatally_failed(); }
+
+ private:
+ Type type_;
+
+ // Gets the summary of the failure message by omitting the stack
+ // trace in it.
+ static std::string ExtractSummary(const char* message);
+
+ // The name of the source file where the test part took place, or
+ // "" if the source file is unknown.
+ std::string file_name_;
+ // The line in the source file where the test part took place, or -1
+ // if the line number is unknown.
+ int line_number_;
+ std::string summary_; // The test failure summary.
+ std::string message_; // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+ TestPartResultArray() {}
+
+ // Appends the given TestPartResult to the array.
+ void Append(const TestPartResult& result);
+
+ // Returns the TestPartResult at the given index (0-based).
+ const TestPartResult& GetTestPartResult(int index) const;
+
+ // Returns the number of TestPartResult objects in the array.
+ int size() const;
+
+ private:
+ std::vector<TestPartResult> array_;
+
+ TestPartResultArray(const TestPartResultArray&) = delete;
+ TestPartResultArray& operator=(const TestPartResultArray&) = delete;
+};
+
+// This interface knows how to report a test part result.
+class GTEST_API_ TestPartResultReporterInterface {
+ public:
+ virtual ~TestPartResultReporterInterface() {}
+
+ virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+ : public TestPartResultReporterInterface {
+ public:
+ HasNewFatalFailureHelper();
+ ~HasNewFatalFailureHelper() override;
+ void ReportTestPartResult(const TestPartResult& result) override;
+ bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
+ private:
+ bool has_new_fatal_failure_;
+ TestPartResultReporterInterface* original_reporter_;
+
+ HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+ HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
+};
+
+} // namespace internal
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
new file mode 100644
index 0000000000..bd35a32660
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
@@ -0,0 +1,331 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list. You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template. It should be parameterized
+// by a type. Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+ ...
+ typedef std::list<T> List;
+ static T shared_;
+ T value_;
+};
+
+// Next, associate a list of types with the test suite, which will be
+// repeated for each type in the list. The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_SUITE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+// TYPED_TEST_SUITE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test suite as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+ // Inside a test, refer to the special name TypeParam to get the type
+ // parameter. Since we are inside a derived class template, C++ requires
+ // us to visit the members of FooTest via 'this'.
+ TypeParam n = this->value_;
+
+ // To visit static members of the fixture, add the TestFixture::
+ // prefix.
+ n += TestFixture::shared_;
+
+ // To refer to typedefs in the fixture, add the "typename
+ // TestFixture::" prefix.
+ typename TestFixture::List values;
+ values.push_back(n);
+ ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+// TYPED_TEST_SUITE takes an optional third argument that lets you specify a
+// class that generates custom test name suffixes based on the type. This should
+// be a class which has a static template function GetName(int index) returning
+// a string for each type. The provided integer index equals the index of the
+// type in the provided type list. In many cases the index can be ignored.
+//
+// For example:
+// class MyTypeNames {
+// public:
+// template <typename T>
+// static std::string GetName(int) {
+// if (std::is_same<T, char>()) return "char";
+// if (std::is_same<T, int>()) return "int";
+// if (std::is_same<T, unsigned int>()) return "unsignedInt";
+// }
+// };
+// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames);
+
+#endif // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type. Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are. The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have. Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly. Here's an example:
+
+#if 0
+
+// First, define a fixture class template. It should be parameterized
+// by a type. Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ ...
+};
+
+// Next, declare that you will define a type-parameterized test suite
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_SUITE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test suite as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+ // Inside a test, refer to TypeParam to get the type parameter.
+ TypeParam n = 0;
+ ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them. The first argument of the macro is the
+// test suite name; the rest are the names of the tests in this test
+// suite.
+REGISTER_TYPED_TEST_SUITE_P(FooTest,
+ DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want. If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test suite name. Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int);
+//
+// Similar to the optional argument of TYPED_TEST_SUITE above,
+// INSTANTIATE_TYPED_TEST_SUITE_P takes an optional fourth argument which
+// allows generating custom names.
+// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames);
+
+#endif // 0
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Implements typed tests.
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test suite.
+#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_
+
+// Expands to the name of the typedef for the NameGenerator, responsible for
+// creating the suffixes of the name.
+#define GTEST_NAME_GENERATOR_(TestSuiteName) \
+ gtest_type_params_##TestSuiteName##_NameGenerator
+
+#define TYPED_TEST_SUITE(CaseName, Types, ...) \
+ typedef ::testing::internal::GenerateTypeList<Types>::type \
+ GTEST_TYPE_PARAMS_(CaseName); \
+ typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+ GTEST_NAME_GENERATOR_(CaseName)
+
+#define TYPED_TEST(CaseName, TestName) \
+ static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \
+ "test-name must not be empty"); \
+ template <typename gtest_TypeParam_> \
+ class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+ : public CaseName<gtest_TypeParam_> { \
+ private: \
+ typedef CaseName<gtest_TypeParam_> TestFixture; \
+ typedef gtest_TypeParam_ TypeParam; \
+ void TestBody() override; \
+ }; \
+ static bool gtest_##CaseName##_##TestName##_registered_ \
+ GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest< \
+ CaseName, \
+ ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName, \
+ TestName)>, \
+ GTEST_TYPE_PARAMS_( \
+ CaseName)>::Register("", \
+ ::testing::internal::CodeLocation( \
+ __FILE__, __LINE__), \
+ GTEST_STRINGIFY_(CaseName), \
+ GTEST_STRINGIFY_(TestName), 0, \
+ ::testing::internal::GenerateNames< \
+ GTEST_NAME_GENERATOR_(CaseName), \
+ GTEST_TYPE_PARAMS_(CaseName)>()); \
+ template <typename gtest_TypeParam_> \
+ void GTEST_TEST_CLASS_NAME_(CaseName, \
+ TestName)<gtest_TypeParam_>::TestBody()
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE \
+ static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
+ TYPED_TEST_SUITE
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Implements type-parameterized tests.
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test suite are defined in. The exact
+// name of the namespace is subject to change without notice.
+#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test suite.
+#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
+ gtest_typed_test_suite_p_state_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test suite.
+#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
+ gtest_registered_test_names_##TestSuiteName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+#define TYPED_TEST_SUITE_P(SuiteName) \
+ static ::testing::internal::TypedTestSuitePState \
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE_P \
+ static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
+ TYPED_TEST_SUITE_P
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define TYPED_TEST_P(SuiteName, TestName) \
+ namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \
+ template <typename gtest_TypeParam_> \
+ class TestName : public SuiteName<gtest_TypeParam_> { \
+ private: \
+ typedef SuiteName<gtest_TypeParam_> TestFixture; \
+ typedef gtest_TypeParam_ TypeParam; \
+ void TestBody() override; \
+ }; \
+ static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \
+ __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName), \
+ GTEST_STRINGIFY_(TestName)); \
+ } \
+ template <typename gtest_TypeParam_> \
+ void GTEST_SUITE_NAMESPACE_( \
+ SuiteName)::TestName<gtest_TypeParam_>::TestBody()
+
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \
+ namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \
+ typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_; \
+ } \
+ static const char* const GTEST_REGISTERED_TEST_NAMES_( \
+ SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+ GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define REGISTER_TYPED_TEST_CASE_P \
+ static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
+ ""); \
+ REGISTER_TYPED_TEST_SUITE_P
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
+ static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
+ "test-suite-prefix must not be empty"); \
+ static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTestSuite< \
+ SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
+ ::testing::internal::GenerateTypeList<Types>::type>:: \
+ Register(GTEST_STRINGIFY_(Prefix), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
+ GTEST_STRINGIFY_(SuiteName), \
+ GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
+ ::testing::internal::GenerateNames< \
+ ::testing::internal::NameGeneratorSelector< \
+ __VA_ARGS__>::type, \
+ ::testing::internal::GenerateTypeList<Types>::type>())
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TYPED_TEST_CASE_P \
+ static_assert( \
+ ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
+ INSTANTIATE_TYPED_TEST_SUITE_P
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h
new file mode 100644
index 0000000000..d19a587a18
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -0,0 +1,2297 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the public API for Google Test. It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+
+#include <cstddef>
+#include <limits>
+#include <memory>
+#include <ostream>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-death-test.h"
+#include "gtest/gtest-matchers.h"
+#include "gtest/gtest-message.h"
+#include "gtest/gtest-param-test.h"
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest-test-part.h"
+#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag controls whether the test runner should continue execution past
+// first failure.
+GTEST_DECLARE_bool_(fail_fast);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag controls whether Google Test installs a signal handler that dumps
+// debugging information when fatal signals are raised.
+GTEST_DECLARE_bool_(install_failure_signal_handler);
+
+// This flag causes Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints only test failures.
+GTEST_DECLARE_bool_(brief);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag controls whether Google Test prints UTF8 characters as text.
+GTEST_DECLARE_bool_(print_utf8);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1, the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise. For use with an external test framework.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DECLARE_string_(flagfile);
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and C4805
+// (unsafe mix of type 'const int' and type 'const bool').
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
+
+namespace internal {
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class FuchsiaDeathTest;
+class UnitTestImpl* GetUnitTestImpl();
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+ const std::string& message);
+std::set<std::string>* GetIgnoredParameterizedTestSuites();
+
+} // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward declare them the compiler might confuse the classes
+// in friendship clauses with same named classes on the scope.
+class Test;
+class TestSuite;
+
+// Old API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TestCase = TestSuite;
+#endif
+class TestInfo;
+class UnitTest;
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestSuites, and
+// each TestSuite contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F. For example:
+//
+// class FooTest : public testing::Test {
+// protected:
+// void SetUp() override { ... }
+// void TearDown() override { ... }
+// ...
+// };
+//
+// TEST_F(FooTest, Bar) { ... }
+// TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+ friend class TestInfo;
+
+ // The d'tor is virtual as we intend to inherit from Test.
+ virtual ~Test();
+
+ // Sets up the stuff shared by all tests in this test suite.
+ //
+ // Google Test will call Foo::SetUpTestSuite() before running the first
+ // test in test suite Foo. Hence a sub-class can define its own
+ // SetUpTestSuite() method to shadow the one defined in the super
+ // class.
+ static void SetUpTestSuite() {}
+
+ // Tears down the stuff shared by all tests in this test suite.
+ //
+ // Google Test will call Foo::TearDownTestSuite() after running the last
+ // test in test suite Foo. Hence a sub-class can define its own
+ // TearDownTestSuite() method to shadow the one defined in the super
+ // class.
+ static void TearDownTestSuite() {}
+
+ // Legacy API is deprecated but still available. Use SetUpTestSuite and
+ // TearDownTestSuite instead.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ static void TearDownTestCase() {}
+ static void SetUpTestCase() {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Returns true if and only if the current test has a fatal failure.
+ static bool HasFatalFailure();
+
+ // Returns true if and only if the current test has a non-fatal failure.
+ static bool HasNonfatalFailure();
+
+ // Returns true if and only if the current test was skipped.
+ static bool IsSkipped();
+
+ // Returns true if and only if the current test has a (either fatal or
+ // non-fatal) failure.
+ static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+ // Logs a property for the current test, test suite, or for the entire
+ // invocation of the test program when used outside of the context of a
+ // test suite. Only the last value for a given key is remembered. These
+ // are public static so they can be called from utility functions that are
+ // not members of the test fixture. Calls to RecordProperty made during
+ // lifespan of the test (from the moment its constructor starts to the
+ // moment its destructor finishes) will be output in XML as attributes of
+ // the <testcase> element. Properties recorded from fixture's
+ // SetUpTestSuite or TearDownTestSuite are logged as attributes of the
+ // corresponding <testsuite> element. Calls to RecordProperty made in the
+ // global context (before or after invocation of RUN_ALL_TESTS and from
+ // SetUp/TearDown method of Environment objects registered with Google
+ // Test) will be output as attributes of the <testsuites> element.
+ static void RecordProperty(const std::string& key, const std::string& value);
+ static void RecordProperty(const std::string& key, int value);
+
+ protected:
+ // Creates a Test object.
+ Test();
+
+ // Sets up the test fixture.
+ virtual void SetUp();
+
+ // Tears down the test fixture.
+ virtual void TearDown();
+
+ private:
+ // Returns true if and only if the current test has the same fixture class
+ // as the first test in the current test suite.
+ static bool HasSameFixtureClass();
+
+ // Runs the test after the test fixture has been set up.
+ //
+ // A sub-class must implement this to define the test logic.
+ //
+ // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+ // Instead, use the TEST or TEST_F macro.
+ virtual void TestBody() = 0;
+
+ // Sets up, executes, and tears down the test.
+ void Run();
+
+ // Deletes self. We deliberately pick an unusual name for this
+ // internal method to avoid clashing with names used in user TESTs.
+ void DeleteSelf_() { delete this; }
+
+ const std::unique_ptr<GTEST_FLAG_SAVER_> gtest_flag_saver_;
+
+ // Often a user misspells SetUp() as Setup() and spends a long time
+ // wondering why it is never called by Google Test. The declaration of
+ // the following method is solely for catching such an error at
+ // compile time:
+ //
+ // - The return type is deliberately chosen to be not void, so it
+ // will be a conflict if void Setup() is declared in the user's
+ // test fixture.
+ //
+ // - This method is private, so it will be another compiler error
+ // if the method is called from the user's test fixture.
+ //
+ // DO NOT OVERRIDE THIS FUNCTION.
+ //
+ // If you see an error about overriding the following function or
+ // about it being private, you have mis-spelled SetUp() as Setup().
+ struct Setup_should_be_spelled_SetUp {};
+ virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
+
+ // We disallow copying Tests.
+ Test(const Test&) = delete;
+ Test& operator=(const Test&) = delete;
+};
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+ // C'tor. TestProperty does NOT have a default constructor.
+ // Always use this constructor (with parameters) to create a
+ // TestProperty object.
+ TestProperty(const std::string& a_key, const std::string& a_value)
+ : key_(a_key), value_(a_value) {}
+
+ // Gets the user-supplied key as a C string owned by this object.
+ const char* key() const { return key_.c_str(); }
+
+ // Gets the user-supplied value as a C string owned by this object.
+ const char* value() const { return value_.c_str(); }
+
+ // Sets a new value, overriding the one supplied in the constructor.
+ void SetValue(const std::string& new_value) { value_ = new_value; }
+
+ private:
+ // The key supplied by the user.
+ std::string key_;
+ // The value supplied by the user.
+ std::string value_;
+};
+
+// The result of a single Test. This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+ // Creates an empty TestResult.
+ TestResult();
+
+ // D'tor. Do not inherit from TestResult.
+ ~TestResult();
+
+ // Gets the number of all test parts. This is the sum of the number
+ // of successful test parts and the number of failed test parts.
+ int total_part_count() const;
+
+ // Returns the number of the test properties.
+ int test_property_count() const;
+
+ // Returns true if and only if the test was neither skipped nor failed.
+ bool Passed() const { return !Skipped() && !Failed(); }
+
+ // Returns true if and only if the test was skipped.
+ bool Skipped() const;
+
+ // Returns true if and only if the test failed.
+ bool Failed() const;
+
+ // Returns true if and only if the test fatally failed.
+ bool HasFatalFailure() const;
+
+ // Returns true if and only if the test has a non-fatal failure.
+ bool HasNonfatalFailure() const;
+
+ // Returns the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Gets the time of the test case start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+ // Returns the i-th test part result among all the results. i can range from 0
+ // to total_part_count() - 1. If i is not in that range, aborts the program.
+ const TestPartResult& GetTestPartResult(int i) const;
+
+ // Returns the i-th test property. i can range from 0 to
+ // test_property_count() - 1. If i is not in that range, aborts the
+ // program.
+ const TestProperty& GetTestProperty(int i) const;
+
+ private:
+ friend class TestInfo;
+ friend class TestSuite;
+ friend class UnitTest;
+ friend class internal::DefaultGlobalTestPartResultReporter;
+ friend class internal::ExecDeathTest;
+ friend class internal::TestResultAccessor;
+ friend class internal::UnitTestImpl;
+ friend class internal::WindowsDeathTest;
+ friend class internal::FuchsiaDeathTest;
+
+ // Gets the vector of TestPartResults.
+ const std::vector<TestPartResult>& test_part_results() const {
+ return test_part_results_;
+ }
+
+ // Gets the vector of TestProperties.
+ const std::vector<TestProperty>& test_properties() const {
+ return test_properties_;
+ }
+
+ // Sets the start time.
+ void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
+
+ // Sets the elapsed time.
+ void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+ // Adds a test property to the list. The property is validated and may add
+ // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+ // key names). If a property is already recorded for the same key, the
+ // value will be updated, rather than storing multiple values for the same
+ // key. xml_element specifies the element for which the property is being
+ // recorded and is used for validation.
+ void RecordProperty(const std::string& xml_element,
+ const TestProperty& test_property);
+
+ // Adds a failure if the key is a reserved attribute of Google Test
+ // testsuite tags. Returns true if the property is valid.
+ // FIXME: Validate attribute names are legal and human readable.
+ static bool ValidateTestProperty(const std::string& xml_element,
+ const TestProperty& test_property);
+
+ // Adds a test part result to the list.
+ void AddTestPartResult(const TestPartResult& test_part_result);
+
+ // Returns the death test count.
+ int death_test_count() const { return death_test_count_; }
+
+ // Increments the death test count, returning the new count.
+ int increment_death_test_count() { return ++death_test_count_; }
+
+ // Clears the test part results.
+ void ClearTestPartResults();
+
+ // Clears the object.
+ void Clear();
+
+ // Protects mutable state of the property vector and of owned
+ // properties, whose values may be updated.
+ internal::Mutex test_properties_mutex_;
+
+ // The vector of TestPartResults.
+ std::vector<TestPartResult> test_part_results_;
+ // The vector of TestProperties.
+ std::vector<TestProperty> test_properties_;
+ // Running count of death tests.
+ int death_test_count_;
+ // The start time, in milliseconds since UNIX Epoch.
+ TimeInMillis start_timestamp_;
+ // The elapsed time, in milliseconds.
+ TimeInMillis elapsed_time_;
+
+ // We disallow copying TestResult.
+ TestResult(const TestResult&) = delete;
+ TestResult& operator=(const TestResult&) = delete;
+}; // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+// Test suite name
+// Test name
+// Whether the test should be run
+// A function pointer that creates the test object when invoked
+// Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+ // Destructs a TestInfo object. This function is not virtual, so
+ // don't inherit from TestInfo.
+ ~TestInfo();
+
+ // Returns the test suite name.
+ const char* test_suite_name() const { return test_suite_name_.c_str(); }
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ const char* test_case_name() const { return test_suite_name(); }
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Returns the test name.
+ const char* name() const { return name_.c_str(); }
+
+ // Returns the name of the parameter type, or NULL if this is not a typed
+ // or a type-parameterized test.
+ const char* type_param() const {
+ if (type_param_.get() != nullptr) return type_param_->c_str();
+ return nullptr;
+ }
+
+ // Returns the text representation of the value parameter, or NULL if this
+ // is not a value-parameterized test.
+ const char* value_param() const {
+ if (value_param_.get() != nullptr) return value_param_->c_str();
+ return nullptr;
+ }
+
+ // Returns the file name where this test is defined.
+ const char* file() const { return location_.file.c_str(); }
+
+ // Returns the line where this test is defined.
+ int line() const { return location_.line; }
+
+ // Returns true if this test should not be run because it's in another shard.
+ bool is_in_another_shard() const { return is_in_another_shard_; }
+
+ // Returns true if this test should run, that is if the test is not
+ // disabled (or it is disabled but the also_run_disabled_tests flag has
+ // been specified) and its full name matches the user-specified filter.
+ //
+ // Google Test allows the user to filter the tests by their full names.
+ // The full name of a test Bar in test suite Foo is defined as
+ // "Foo.Bar". Only the tests that match the filter will run.
+ //
+ // A filter is a colon-separated list of glob (not regex) patterns,
+ // optionally followed by a '-' and a colon-separated list of
+ // negative patterns (tests to exclude). A test is run if it
+ // matches one of the positive patterns and does not match any of
+ // the negative patterns.
+ //
+ // For example, *A*:Foo.* is a filter that matches any string that
+ // contains the character 'A' or starts with "Foo.".
+ bool should_run() const { return should_run_; }
+
+ // Returns true if and only if this test will appear in the XML report.
+ bool is_reportable() const {
+ // The XML report includes tests matching the filter, excluding those
+ // run in other shards.
+ return matches_filter_ && !is_in_another_shard_;
+ }
+
+ // Returns the result of the test.
+ const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+ friend class internal::DefaultDeathTestFactory;
+#endif // GTEST_HAS_DEATH_TEST
+ friend class Test;
+ friend class TestSuite;
+ friend class internal::UnitTestImpl;
+ friend class internal::StreamingListenerTest;
+ friend TestInfo* internal::MakeAndRegisterTestInfo(
+ const char* test_suite_name, const char* name, const char* type_param,
+ const char* value_param, internal::CodeLocation code_location,
+ internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
+ internal::TearDownTestSuiteFunc tear_down_tc,
+ internal::TestFactoryBase* factory);
+
+ // Constructs a TestInfo object. The newly constructed instance assumes
+ // ownership of the factory object.
+ TestInfo(const std::string& test_suite_name, const std::string& name,
+ const char* a_type_param, // NULL if not a type-parameterized test
+ const char* a_value_param, // NULL if not a value-parameterized test
+ internal::CodeLocation a_code_location,
+ internal::TypeId fixture_class_id,
+ internal::TestFactoryBase* factory);
+
+ // Increments the number of death tests encountered in this test so
+ // far.
+ int increment_death_test_count() {
+ return result_.increment_death_test_count();
+ }
+
+ // Creates the test object, runs it, records its result, and then
+ // deletes it.
+ void Run();
+
+ // Skips the test and records the test result for this object.
+ void Skip();
+
+ static void ClearTestResult(TestInfo* test_info) {
+ test_info->result_.Clear();
+ }
+
+ // These fields are immutable properties of the test.
+ const std::string test_suite_name_; // test suite name
+ const std::string name_; // Test name
+ // Name of the parameter type, or NULL if this is not a typed or a
+ // type-parameterized test.
+ const std::unique_ptr<const ::std::string> type_param_;
+ // Text representation of the value parameter, or NULL if this is not a
+ // value-parameterized test.
+ const std::unique_ptr<const ::std::string> value_param_;
+ internal::CodeLocation location_;
+ const internal::TypeId fixture_class_id_; // ID of the test fixture class
+ bool should_run_; // True if and only if this test should run
+ bool is_disabled_; // True if and only if this test is disabled
+ bool matches_filter_; // True if this test matches the
+ // user-specified filter.
+ bool is_in_another_shard_; // Will be run in another shard.
+ internal::TestFactoryBase* const factory_; // The factory that creates
+ // the test object
+
+ // This field is mutable and needs to be reset before running the
+ // test for the second time.
+ TestResult result_;
+
+ TestInfo(const TestInfo&) = delete;
+ TestInfo& operator=(const TestInfo&) = delete;
+};
+
+// A test suite, which consists of a vector of TestInfos.
+//
+// TestSuite is not copyable.
+class GTEST_API_ TestSuite {
+ public:
+ // Creates a TestSuite with the given name.
+ //
+ // TestSuite does NOT have a default constructor. Always use this
+ // constructor to create a TestSuite object.
+ //
+ // Arguments:
+ //
+ // name: name of the test suite
+ // a_type_param: the name of the test's type parameter, or NULL if
+ // this is not a type-parameterized test.
+ // set_up_tc: pointer to the function that sets up the test suite
+ // tear_down_tc: pointer to the function that tears down the test suite
+ TestSuite(const char* name, const char* a_type_param,
+ internal::SetUpTestSuiteFunc set_up_tc,
+ internal::TearDownTestSuiteFunc tear_down_tc);
+
+ // Destructor of TestSuite.
+ virtual ~TestSuite();
+
+ // Gets the name of the TestSuite.
+ const char* name() const { return name_.c_str(); }
+
+ // Returns the name of the parameter type, or NULL if this is not a
+ // type-parameterized test suite.
+ const char* type_param() const {
+ if (type_param_.get() != nullptr) return type_param_->c_str();
+ return nullptr;
+ }
+
+ // Returns true if any test in this test suite should run.
+ bool should_run() const { return should_run_; }
+
+ // Gets the number of successful tests in this test suite.
+ int successful_test_count() const;
+
+ // Gets the number of skipped tests in this test suite.
+ int skipped_test_count() const;
+
+ // Gets the number of failed tests in this test suite.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests in this test suite.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Gets the number of tests in this test suite that should run.
+ int test_to_run_count() const;
+
+ // Gets the number of all tests in this test suite.
+ int total_test_count() const;
+
+ // Returns true if and only if the test suite passed.
+ bool Passed() const { return !Failed(); }
+
+ // Returns true if and only if a test or the ad hoc test result failed.
+ bool Failed() const {
+ return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+ }
+
+ // Returns the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Gets the time of the test suite start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+ // Returns the i-th test among all the tests. i can range from 0 to
+ // total_test_count() - 1. If i is not in that range, returns NULL.
+ const TestInfo* GetTestInfo(int i) const;
+
+ // Returns the TestResult that holds test properties recorded during
+ // execution of SetUpTestSuite and TearDownTestSuite.
+ const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+ friend class Test;
+ friend class internal::UnitTestImpl;
+
+ // Gets the (mutable) vector of TestInfos in this TestSuite.
+ std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+ // Gets the (immutable) vector of TestInfos in this TestSuite.
+ const std::vector<TestInfo*>& test_info_list() const {
+ return test_info_list_;
+ }
+
+ // Returns the i-th test among all the tests. i can range from 0 to
+ // total_test_count() - 1. If i is not in that range, returns NULL.
+ TestInfo* GetMutableTestInfo(int i);
+
+ // Sets the should_run member.
+ void set_should_run(bool should) { should_run_ = should; }
+
+ // Adds a TestInfo to this test suite. Will delete the TestInfo upon
+ // destruction of the TestSuite object.
+ void AddTestInfo(TestInfo* test_info);
+
+ // Clears the results of all tests in this test suite.
+ void ClearResult();
+
+ // Clears the results of all tests in the given test suite.
+ static void ClearTestSuiteResult(TestSuite* test_suite) {
+ test_suite->ClearResult();
+ }
+
+ // Runs every test in this TestSuite.
+ void Run();
+
+ // Skips the execution of tests under this TestSuite.
+ void Skip();
+
+ // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed
+ // for catching exceptions thrown from SetUpTestSuite().
+ void RunSetUpTestSuite() {
+ if (set_up_tc_ != nullptr) {
+ (*set_up_tc_)();
+ }
+ }
+
+ // Runs TearDownTestSuite() for this TestSuite. This wrapper is
+ // needed for catching exceptions thrown from TearDownTestSuite().
+ void RunTearDownTestSuite() {
+ if (tear_down_tc_ != nullptr) {
+ (*tear_down_tc_)();
+ }
+ }
+
+ // Returns true if and only if test passed.
+ static bool TestPassed(const TestInfo* test_info) {
+ return test_info->should_run() && test_info->result()->Passed();
+ }
+
+ // Returns true if and only if test skipped.
+ static bool TestSkipped(const TestInfo* test_info) {
+ return test_info->should_run() && test_info->result()->Skipped();
+ }
+
+ // Returns true if and only if test failed.
+ static bool TestFailed(const TestInfo* test_info) {
+ return test_info->should_run() && test_info->result()->Failed();
+ }
+
+ // Returns true if and only if the test is disabled and will be reported in
+ // the XML report.
+ static bool TestReportableDisabled(const TestInfo* test_info) {
+ return test_info->is_reportable() && test_info->is_disabled_;
+ }
+
+ // Returns true if and only if test is disabled.
+ static bool TestDisabled(const TestInfo* test_info) {
+ return test_info->is_disabled_;
+ }
+
+ // Returns true if and only if this test will appear in the XML report.
+ static bool TestReportable(const TestInfo* test_info) {
+ return test_info->is_reportable();
+ }
+
+ // Returns true if the given test should run.
+ static bool ShouldRunTest(const TestInfo* test_info) {
+ return test_info->should_run();
+ }
+
+ // Shuffles the tests in this test suite.
+ void ShuffleTests(internal::Random* random);
+
+ // Restores the test order to before the first shuffle.
+ void UnshuffleTests();
+
+ // Name of the test suite.
+ std::string name_;
+ // Name of the parameter type, or NULL if this is not a typed or a
+ // type-parameterized test.
+ const std::unique_ptr<const ::std::string> type_param_;
+ // The vector of TestInfos in their original order. It owns the
+ // elements in the vector.
+ std::vector<TestInfo*> test_info_list_;
+ // Provides a level of indirection for the test list to allow easy
+ // shuffling and restoring the test order. The i-th element in this
+ // vector is the index of the i-th test in the shuffled test list.
+ std::vector<int> test_indices_;
+ // Pointer to the function that sets up the test suite.
+ internal::SetUpTestSuiteFunc set_up_tc_;
+ // Pointer to the function that tears down the test suite.
+ internal::TearDownTestSuiteFunc tear_down_tc_;
+ // True if and only if any test in this test suite should run.
+ bool should_run_;
+ // The start time, in milliseconds since UNIX Epoch.
+ TimeInMillis start_timestamp_;
+ // Elapsed time, in milliseconds.
+ TimeInMillis elapsed_time_;
+ // Holds test properties recorded during execution of SetUpTestSuite and
+ // TearDownTestSuite.
+ TestResult ad_hoc_test_result_;
+
+ // We disallow copying TestSuites.
+ TestSuite(const TestSuite&) = delete;
+ TestSuite& operator=(const TestSuite&) = delete;
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment. You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+// 1. You cannot safely throw from a destructor. This is a problem
+// as in some cases Google Test is used where exceptions are enabled, and
+// we may want to implement ASSERT_* using exceptions where they are
+// available.
+// 2. You cannot use ASSERT_* directly in a constructor or
+// destructor.
+class Environment {
+ public:
+ // The d'tor is virtual as we need to subclass Environment.
+ virtual ~Environment() {}
+
+ // Override this to define how to set up the environment.
+ virtual void SetUp() {}
+
+ // Override this to define how to tear down the environment.
+ virtual void TearDown() {}
+
+ private:
+ // If you see an error about overriding the following function or
+ // about it being private, you have misspelled SetUp() as Setup().
+ struct Setup_should_be_spelled_SetUp {};
+ virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
+};
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception type that TestEventListener::OnTestPartResult is allowed to throw.
+class GTEST_API_ AssertionException
+ : public internal::GoogleTestFailureException {
+ public:
+ explicit AssertionException(const TestPartResult& result)
+ : GoogleTestFailureException(result) {}
+};
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+ virtual ~TestEventListener() {}
+
+ // Fired before any test activity starts.
+ virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+ // Fired before each iteration of tests starts. There may be more than
+ // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
+ // index, starting from 0.
+ virtual void OnTestIterationStart(const UnitTest& unit_test,
+ int iteration) = 0;
+
+ // Fired before environment set-up for each iteration of tests starts.
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+ // Fired after environment set-up for each iteration of tests ends.
+ virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+ // Fired before the test suite starts.
+ virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {}
+
+ // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Fired before the test starts.
+ virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+ // Fired when a test is disabled.
+ virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
+ // Fired after a failed assertion or a SUCCEED() invocation.
+ // If you want to throw an exception from this function to skip to the next
+ // TEST, it must be AssertionException defined above, or inherited from it.
+ virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+ // Fired after the test ends.
+ virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+ // Fired after the test suite ends.
+ virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Fired before environment tear-down for each iteration of tests starts.
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+ // Fired after environment tear-down for each iteration of tests ends.
+ virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+ // Fired after each iteration of tests finishes.
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
+
+ // Fired after all test activities have ended.
+ virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// A convenience class for users who need to override just one or two
+// methods and who accept that a change to the signature of a method they
+// override might not be caught during the build. For comments about each
+// method, please see the definition of TestEventListener above. Every
+// handler defined here has an empty body.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+ void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationStart(const UnitTest& /*unit_test*/,
+ int /*iteration*/) override {}
+ void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+ void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+ void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
+ void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
+ void OnTestEnd(const TestInfo& /*test_info*/) override {}
+ void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+ void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+ int /*iteration*/) override {}
+ void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+ TestEventListeners();
+ ~TestEventListeners();
+
+ // Appends an event listener to the end of the list. Google Test assumes
+ // the ownership of the listener (i.e. it will delete the listener when
+ // the test program finishes).
+ void Append(TestEventListener* listener);
+
+ // Removes the given event listener from the list and returns it. It then
+ // becomes the caller's responsibility to delete the listener. Returns
+ // NULL if the listener is not found in the list.
+ TestEventListener* Release(TestEventListener* listener);
+
+ // Returns the standard listener responsible for the default console
+ // output. Can be removed from the listeners list to shut down default
+ // console output. Note that removing this object from the listener list
+ // with Release transfers its ownership to the caller and makes this
+ // function return NULL the next time.
+ TestEventListener* default_result_printer() const {
+ return default_result_printer_;
+ }
+
+ // Returns the standard listener responsible for the default XML output
+ // controlled by the --gtest_output=xml flag. Can be removed from the
+ // listeners list by users who want to shut down the default XML output
+ // controlled by this flag and substitute it with a custom one. Note that
+ // removing this object from the listener list with Release transfers its
+ // ownership to the caller and makes this function return NULL the next
+ // time.
+ TestEventListener* default_xml_generator() const {
+ return default_xml_generator_;
+ }
+
+ private:
+ friend class TestSuite;
+ friend class TestInfo;
+ friend class internal::DefaultGlobalTestPartResultReporter;
+ friend class internal::NoExecDeathTest;
+ friend class internal::TestEventListenersAccessor;
+ friend class internal::UnitTestImpl;
+
+ // Returns repeater that broadcasts the TestEventListener events to all
+ // subscribers.
+ TestEventListener* repeater();
+
+ // Sets the default_result_printer attribute to the provided listener.
+ // The listener is also added to the listener list and previous
+ // default_result_printer is removed from it and deleted. The listener can
+ // also be NULL in which case it will not be added to the list. Does
+ // nothing if the previous and the current listener objects are the same.
+ void SetDefaultResultPrinter(TestEventListener* listener);
+
+ // Sets the default_xml_generator attribute to the provided listener. The
+ // listener is also added to the listener list and previous
+ // default_xml_generator is removed from it and deleted. The listener can
+ // also be NULL in which case it will not be added to the list. Does
+ // nothing if the previous and the current listener objects are the same.
+ void SetDefaultXmlGenerator(TestEventListener* listener);
+
+ // Controls whether events will be forwarded by the repeater to the
+ // listeners in the list.
+ bool EventForwardingEnabled() const;
+ void SuppressEventForwarding();
+
+ // The actual list of listeners.
+ internal::TestEventRepeater* repeater_;
+ // Listener responsible for the standard result output.
+ TestEventListener* default_result_printer_;
+ // Listener responsible for the creation of the XML output file.
+ TestEventListener* default_xml_generator_;
+
+ // We disallow copying TestEventListeners.
+ TestEventListeners(const TestEventListeners&) = delete;
+ TestEventListeners& operator=(const TestEventListeners&) = delete;
+};
+
+// A UnitTest consists of a vector of TestSuites.
+//
+// This is a singleton class. The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called. This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+ // Gets the singleton UnitTest object. The first time this method
+ // is called, a UnitTest object is constructed and returned.
+ // Consecutive calls will return the same object.
+ static UnitTest* GetInstance();
+
+ // Runs all tests in this UnitTest object and prints the result.
+ // Returns 0 if successful, or 1 otherwise.
+ //
+ // This method can only be called from the main thread.
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ int Run() GTEST_MUST_USE_RESULT_;
+
+ // Returns the working directory at the time the first TEST() or TEST_F()
+ // was executed. The UnitTest object owns the string.
+ const char* original_working_dir() const;
+
+ // Returns the TestSuite object for the test that's currently running,
+ // or NULL if no test is running.
+ const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+// Legacy API is deprecated but still available.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+#endif
+
+ // Returns the TestInfo object for the test that's currently running,
+ // or NULL if no test is running.
+ const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Returns the random seed used at the start of the current test run.
+ int random_seed() const;
+
+ // Returns the ParameterizedTestSuiteRegistry object used to keep track of
+ // value-parameterized tests and instantiate and register them.
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ internal::ParameterizedTestSuiteRegistry& parameterized_test_registry()
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Gets the number of successful test suites.
+ int successful_test_suite_count() const;
+
+ // Gets the number of failed test suites.
+ int failed_test_suite_count() const;
+
+ // Gets the number of all test suites.
+ int total_test_suite_count() const;
+
+ // Gets the number of all test suites that contain at least one test
+ // that should run.
+ int test_suite_to_run_count() const;
+
+ // Legacy API is deprecated but still available.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ int successful_test_case_count() const;
+ int failed_test_case_count() const;
+ int total_test_case_count() const;
+ int test_case_to_run_count() const;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Gets the number of successful tests.
+ int successful_test_count() const;
+
+ // Gets the number of skipped tests.
+ int skipped_test_count() const;
+
+ // Gets the number of failed tests.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Gets the number of all tests.
+ int total_test_count() const;
+
+ // Gets the number of tests that should run.
+ int test_to_run_count() const;
+
+ // Gets the time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const;
+
+ // Gets the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const;
+
+ // Returns true if and only if the unit test passed (i.e. all test suites
+ // passed).
+ bool Passed() const;
+
+ // Returns true if and only if the unit test failed (i.e. some test suite
+ // failed or something outside of all tests failed).
+ bool Failed() const;
+
+ // Gets the i-th test suite among all the test suites. i can range from 0 to
+ // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+ const TestSuite* GetTestSuite(int i) const;
+
+// Legacy API is deprecated but still available.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ const TestCase* GetTestCase(int i) const;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Returns the TestResult containing information on test failures and
+ // properties logged outside of individual test suites.
+ const TestResult& ad_hoc_test_result() const;
+
+ // Returns the list of event listeners that can be used to track events
+ // inside Google Test.
+ TestEventListeners& listeners();
+
+ private:
+ // Registers and returns a global test environment. When a test
+ // program is run, all global test environments will be set-up in
+ // the order they were registered. After all tests in the program
+ // have finished, all global test environments will be torn-down in
+ // the *reverse* order they were registered.
+ //
+ // The UnitTest object takes ownership of the given environment.
+ //
+ // This method can only be called from the main thread.
+ Environment* AddEnvironment(Environment* env);
+
+ // Adds a TestPartResult to the current TestResult object. All
+ // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+ // eventually call this to report their results. The user code
+ // should use the assertion macros instead of calling this directly.
+ void AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name, int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Adds a TestProperty to the current TestResult object when invoked from
+ // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+ // from SetUpTestSuite or TearDownTestSuite, or to the global property set
+ // when invoked elsewhere. If the result already contains a property with
+ // the same key, the value will be updated.
+ void RecordProperty(const std::string& key, const std::string& value);
+
+ // Non-const counterpart of GetTestSuite(): gets the i-th test suite as a
+ // mutable pointer. If i is not in the valid range, returns NULL.
+ TestSuite* GetMutableTestSuite(int i);
+
+ // Accessors for the implementation object.
+ internal::UnitTestImpl* impl() { return impl_; }
+ const internal::UnitTestImpl* impl() const { return impl_; }
+
+ // These classes and functions are friends as they need to access private
+ // members of UnitTest.
+ friend class ScopedTrace;
+ friend class Test;
+ friend class internal::AssertHelper;
+ friend class internal::StreamingListenerTest;
+ friend class internal::UnitTestRecordPropertyTestHelper;
+ friend Environment* AddGlobalTestEnvironment(Environment* env);
+ friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
+ friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+ friend void internal::ReportFailureInUnknownLocation(
+ TestPartResult::Type result_type, const std::string& message);
+
+ // Creates an empty UnitTest.
+ UnitTest();
+
+ // Destructor. Private, like the constructor.
+ virtual ~UnitTest();
+
+ // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+ // Google Test trace stack.
+ void PushGTestTrace(const internal::TraceInfo& trace)
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Pops a trace from the per-thread Google Test trace stack.
+ void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Protects mutable state in *impl_. This is mutable as some const
+ // methods need to lock it too.
+ mutable internal::Mutex mutex_;
+
+ // Opaque implementation object. This field is never changed once
+ // the object is constructed. We don't mark it as const here, as
+ // doing so will cause a warning in the constructor of UnitTest.
+ // Mutable state in *impl_ is protected by mutex_.
+ internal::UnitTestImpl* impl_;
+
+ // We disallow copying UnitTest.
+ UnitTest(const UnitTest&) = delete;
+ UnitTest& operator=(const UnitTest&) = delete;
+};
+
+// Convenience wrapper that hands a global test environment to the UnitTest
+// singleton and returns whatever UnitTest::AddEnvironment() returns.
+//
+// Call this before RUN_ALL_TESTS() is called, typically from main(). If you
+// use gtest_main, the call must happen before main() starts for it to take
+// effect, for example through the initializer of a global variable:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend that you write your own main() and call
+// AddGlobalTestEnvironment() there. Relying on the initialization of global
+// variables makes the code harder to read, and it may cause problems when
+// environments registered from different translation units depend on each
+// other (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->AddEnvironment(env);
+}
+
+// Initializes Google Test. This must be called before calling
+// RUN_ALL_TESTS(). In particular, it parses a command line for the
+// flags that Google Test recognizes. Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleTest();
+
+namespace internal {
+
+// Builds the failure result for CmpHelperEQ. Keeping the message generation
+// in its own function, away from the comparison itself, reduces the stack
+// frame size of CmpHelperEQ, which helps reduce the overhead of some
+// sanitizers when EXPECT_* is called in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+                                   const char* rhs_expression, const T1& lhs,
+                                   const T2& rhs) {
+  // Each operand is formatted with the other one passed as context.
+  const auto lhs_text = FormatForComparisonFailureMessage(lhs, rhs);
+  const auto rhs_text = FormatForComparisonFailureMessage(rhs, lhs);
+  return EqFailure(lhs_expression, rhs_expression, lhs_text, rhs_text, false);
+}
+
+// Declares operator== and operator!= for a dummy type in this namespace so
+// that lexical scope lookup from the helpers in this namespace stops here.
+// It prevents using invalid operator==/!= defined at namespace scope.
+struct faketype {};
+inline bool operator==(faketype, faketype) { return true; }
+inline bool operator!=(faketype, faketype) { return false; }
+
+// The helper function for {ASSERT|EXPECT}_EQ: yields a success result when
+// lhs == rhs, and otherwise the failure result built by CmpHelperEQFailure.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression, const T1& lhs,
+                            const T2& rhs) {
+  return (lhs == rhs) ? AssertionSuccess()
+                      : CmpHelperEQFailure(lhs_expression, rhs_expression,
+                                           lhs, rhs);
+}
+
+class EqHelper {  // Overload set used by {EXPECT|GTEST_ASSERT}_EQ.
+ public:
+ // General-case overload; forwards both operands to CmpHelperEQ.
+ template <
+ typename T1, typename T2,
+ // Disable this overload for cases where one argument is a pointer
+ // and the other is the null pointer constant.
+ typename std::enable_if<!std::is_integral<T1>::value ||
+ !std::is_pointer<T2>::value>::type* = nullptr>
+ static AssertionResult Compare(const char* lhs_expression,
+ const char* rhs_expression, const T1& lhs,
+ const T2& rhs) {
+ return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+ }
+
+ // With this overloaded version, we allow anonymous enums to be used
+ // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+ // enums can be implicitly cast to BiggestInt.
+ //
+ // Even though its body looks the same as the above version, we
+ // cannot merge the two, as it will make anonymous enums unhappy.
+ static AssertionResult Compare(const char* lhs_expression,
+ const char* rhs_expression, BiggestInt lhs,
+ BiggestInt rhs) {
+ return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+ }
+
+ template <typename T>
+ static AssertionResult Compare(
+ const char* lhs_expression, const char* rhs_expression,
+ // Handle cases where '0' is used as a null pointer literal.
+ std::nullptr_t /* lhs */, T* rhs) {
+ // We already know that 'lhs' is a null pointer, so compare rhs to a typed one.
+ return CmpHelperEQ(lhs_expression, rhs_expression, static_cast<T*>(nullptr),
+ rhs);
+ }
+};
+
+// Builds the failure result for the CmpHelper<op_name> helpers generated by
+// GTEST_IMPL_CMP_HELPER_. Keeping the message generation out of the
+// comparison path reduces the stack frame size of those helpers, which helps
+// reduce the overhead of some sanitizers when EXPECT_OP is called in a tight
+// loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
+  AssertionResult failure = AssertionFailure();
+  // First the expectation as written in the source...
+  failure << "Expected: (" << expr1 << ") " << op << " (" << expr2
+          << "), actual: ";
+  // ...then the two actual values, each formatted with the other as context.
+  failure << FormatForComparisonFailureMessage(val1, val2) << " vs ";
+  failure << FormatForComparisonFailureMessage(val2, val1);
+  return failure;
+}
+
+// Defines CmpHelper<op_name>(), a helper behind ASSERT_?? and EXPECT_??: it
+// returns AssertionSuccess() when `val1 op val2` holds and the result of
+// CmpHelperOpFailure() otherwise. It is here just to avoid copy-and-paste.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op) \
+ template <typename T1, typename T2> \
+ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ const T1& val1, const T2& val2) { \
+ if (val1 op val2) { \
+ return AssertionSuccess(); \
+ } else { \
+ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \
+ } \
+ }
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE (CmpHelperNE).
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE (CmpHelperLE).
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT (CmpHelperLT).
+GTEST_IMPL_CMP_HELPER_(LT, <)
+// Implements the helper function for {ASSERT|EXPECT}_GE (CmpHelperGE).
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT (CmpHelperGT).
+GTEST_IMPL_CMP_HELPER_(GT, >)
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1, const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1, const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1, const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1, const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+ const char* s2_expression,
+ const wchar_t* s1, const wchar_t* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const wchar_t* s1, const wchar_t* s2);
+
+} // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves. They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
+#endif // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-point values with
+// FloatingPoint<RawType>::AlmostEquals().
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value, RawType rhs_value) {
+  const FloatingPoint<RawType> lhs_fp(lhs_value);
+  const FloatingPoint<RawType> rhs_fp(rhs_value);
+  if (lhs_fp.AlmostEquals(rhs_fp)) {
+    return AssertionSuccess();
+  }
+
+  // Both operands are printed with the same precision in the failure message.
+  const int precision = std::numeric_limits<RawType>::digits10 + 2;
+
+  ::std::stringstream lhs_text;
+  lhs_text << std::setprecision(precision) << lhs_value;
+
+  ::std::stringstream rhs_text;
+  rhs_text << std::setprecision(precision) << rhs_value;
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_text),
+                   StringStreamToString(&rhs_text), false);
+}
+
+// Helper function for implementing {ASSERT|EXPECT}_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+ const char* expr2,
+ const char* abs_error_expr,
+ double val1, double val2,
+ double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros.
+class GTEST_API_ AssertHelper {
+ public:
+ // Constructor taking the result type, the source location and a message.
+ AssertHelper(TestPartResult::Type type, const char* file, int line,
+ const char* message);
+ ~AssertHelper();
+
+ // Message assignment is a semantic trick to enable assertion
+ // streaming; see the GTEST_MESSAGE_ macro below.
+ void operator=(const Message& message) const;
+
+ private:
+ // We put our data in a struct so that the size of the AssertHelper class can
+ // be as small as possible. This is important because gcc is incapable of
+ // re-using stack space even for temporary variables, so every EXPECT_EQ
+ // reserves stack space for another AssertHelper.
+ struct AssertHelperData {
+ AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
+ const char* msg)
+ : type(t), file(srcfile), line(line_num), message(msg) {}
+
+ TestPartResult::Type const type;
+ const char* const file;
+ int const line;
+ std::string const message;
+
+ private:
+ AssertHelperData(const AssertHelperData&) = delete;
+ AssertHelperData& operator=(const AssertHelperData&) = delete;
+ };
+
+ AssertHelperData* const data_;  // The pointer itself never changes.
+
+ AssertHelper(const AssertHelper&) = delete;
+ AssertHelper& operator=(const AssertHelper&) = delete;
+};
+
+} // namespace internal
+
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+// protected:
+// FooTest() {
+// // Can use GetParam() here.
+// }
+// ~FooTest() override {
+// // Can use GetParam() here.
+// }
+// void SetUp() override {
+// // Can use GetParam() here.
+// }
+// void TearDown() override {
+// // Can use GetParam() here.
+// }
+// };
+// TEST_P(FooTest, DoesBar) {
+// // Can use GetParam() method here.
+// Foo foo;
+// ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+ typedef T ParamType;
+ virtual ~WithParamInterface() {}
+
+ // Returns the current parameter value. It is also available in the test
+ // fixture's constructor. Checks that a parameter has been set first.
+ static const ParamType& GetParam() {
+ GTEST_CHECK_(parameter_ != nullptr)
+ << "GetParam() can only be called inside a value-parameterized test "
+ << "-- did you intend to write TEST_P instead of TEST_F?";
+ return *parameter_;
+ }
+
+ private:
+ // Sets parameter value. The caller is responsible for making sure the value
+ // remains alive and unchanged throughout the current test.
+ static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
+
+ // Static value used for accessing parameter during a test lifetime.
+ static const ParamType* parameter_;
+
+ // TestClass must be a subclass of WithParamInterface<T> and Test.
+ template <class TestClass>
+ friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = nullptr;  // Null until SetParam().
+
+// TestWithParam<T> derives from both Test and WithParamInterface<T>. Most
+// value-parameterized classes can ignore WithParamInterface and just inherit from it.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {};
+
+// Macros for indicating success/failure in test code.
+
+// Skips the current test at runtime.
+// Skipping a test aborts the current function.
+// Skipped tests are neither successful nor failed.
+#define GTEST_SKIP() GTEST_SKIP_("")
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied. If not,
+// it behaves like ADD_FAILURE. In particular:
+//
+// EXPECT_TRUE verifies that a Boolean condition is true.
+// EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure. People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+ GTEST_MESSAGE_AT_(file, line, "Failed", \
+ ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Like GTEST_FAIL(), but at the given source file location.
+#define GTEST_FAIL_AT(file, line) \
+ GTEST_MESSAGE_AT_(file, line, "Failed", \
+ ::testing::TestPartResult::kFatalFailure)
+
+// Define GTEST_DONT_DEFINE_FAIL to 1 to omit the definition of FAIL(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+#define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define GTEST_DONT_DEFINE_SUCCEED to 1 to omit the definition of SUCCEED(),
+// which is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+#define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+// * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+// Tests that the statement throws the expected exception.
+// * {ASSERT|EXPECT}_NO_THROW(statement):
+// Tests that the statement doesn't throw any exception.
+// * {ASSERT|EXPECT}_ANY_THROW(statement):
+// Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+ GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+ GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+ GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+ GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+ GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+ GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define GTEST_EXPECT_TRUE(condition) \
+ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+ GTEST_NONFATAL_FAILURE_)
+#define GTEST_EXPECT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+ GTEST_NONFATAL_FAILURE_)
+#define GTEST_ASSERT_TRUE(condition) \
+ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+ GTEST_FATAL_FAILURE_)
+
+// Define GTEST_DONT_DEFINE_{EXPECT|ASSERT}_{TRUE|FALSE} to 1 to omit the
+// corresponding EXPECT or ASSERT macro, which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_EXPECT_TRUE
+#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_EXPECT_FALSE
+#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_TRUE
+#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_FALSE
+#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition)
+#endif
+
+// Macros for testing equalities and inequalities.
+//
+// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values. The values must be compatible built-in types,
+// or you will get a compiler error. By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+// 1. It is possible to make a user-defined type work with
+// {ASSERT|EXPECT}_??(), but that requires overloading the
+// comparison operators and is thus discouraged by the Google C++
+// Usage Guide. Therefore, you are advised to use the
+// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+// equal.
+//
+// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+// pointers (in particular, C strings). Therefore, if you use it
+// with two C strings, you are testing how their locations in memory
+// are related, not how their content is related. To compare two C
+// strings by content, use {ASSERT|EXPECT}_STR*().
+//
+// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+// what the actual value is when it fails, and similarly for the
+// other comparisons.
+//
+// 4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+// evaluate their arguments, which is undefined.
+//
+// 5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+// EXPECT_NE(Foo(), 5);
+// EXPECT_EQ(a_pointer, NULL);
+// ASSERT_LT(i, array_size);
+// ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define EXPECT_NE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons. All tests treat NULL and any non-NULL string
+// as different. Two NULLs are equal.
+//
+// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2
+// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2
+// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+// Tests that two float values are almost equal.
+// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+// Tests that two double values are almost equal.
+// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+// Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands. See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+ val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+ val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+ val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+ val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error) \
+ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error) \
+ ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Assert that val1 is less than, or almost equal to, val2; expr1 and expr2
+// are the operands' source text. Fail if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+ float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+ double val1, double val2);
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success. They are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result, with both a human-readable
+// string representation of the error (if available) and the hex
+// result code.
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define EXPECT_HRESULT_FAILED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#define ASSERT_HRESULT_FAILED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread. The ASSERT_ form reports a violation via
+// GTEST_FATAL_FAILURE_, the EXPECT_ form via GTEST_NONFATAL_FAILURE_.
+//
+// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+// EXPECT_NO_FATAL_FAILURE(Process());
+// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// While an instance of this class is alive, every test failure message
+// generated by code in the scope of its lifetime includes a trace made of
+// the given source file path, line number, and message. Destroying the
+// instance undoes the effect.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // Every constructor pushes the given source file location and message
+  // onto a trace stack maintained by Google Test.
+
+  // Generic overload: renders the value through Message. Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char* file, int line, const T& message) {
+    Message rendered;
+    rendered << message;
+    PushTrace(file, line, rendered.GetString());
+  }
+
+  // Cheaper overloads for known types; a null C string becomes "(null)".
+  ScopedTrace(const char* file, int line, const char* message) {
+    const char* const text = (message != nullptr) ? message : "(null)";
+    PushTrace(file, line, text);
+  }
+
+  ScopedTrace(const char* file, int line, const std::string& message) {
+    PushTrace(file, line, message);
+  }
+
+  // The destructor pops the info pushed by the constructor. It is
+  // deliberately non-virtual for efficiency: don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char* file, int line, std::string message);
+
+  ScopedTrace& operator=(const ScopedTrace&) = delete;
+  ScopedTrace(const ScopedTrace&) = delete;
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor. Therefore it doesn't
+                            // need to be used otherwise.
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope. The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+//
+// Each thread is assumed to maintain its own stack of traces, so a
+// SCOPED_TRACE() would (correctly) only affect the assertions in its
+// own thread.
+#define SCOPED_TRACE(message) \
+ ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+ __FILE__, __LINE__, (message))
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
+// are the same type. The value it returns is not interesting.
+//
+// StaticAssertTypeEq is a function template, not a class template, so
+// that a user cannot misuse StaticAssertTypeEq<T1, T2> by defining
+// objects of that type. It checks std::is_same with a static_assert.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated. For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated. Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+constexpr bool StaticAssertTypeEq() noexcept {
+  using SameType = std::is_same<T1, T2>;
+  static_assert(SameType::value, "T1 and T2 are not the same type");
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test suite, and the second
+// parameter is the name of the test within the test suite.
+//
+// The convention is to end the test suite name with "Test". For
+// example, a test suite for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro. Example:
+//
+// TEST(FooTest, InitializesCorrectly) {
+// Foo foo;
+// EXPECT_TRUE(foo.StatusIsOK());
+// }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test. This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X. The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code. GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_suite_name, test_name) \
+ GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
+ ::testing::internal::GetTestTypeId())
+
+// Define GTEST_DONT_DEFINE_TEST to 1 to omit the definition of TEST(),
+// which is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the test fixture class, which doubles as the test
+// suite name; the second is the name of the test within that suite.
+// Define GTEST_DONT_DEFINE_TEST_F to 1 to omit the short TEST_F() alias.
+//
+// A test fixture class must be declared earlier. The user should put
+// the test code between braces after using this macro. Example:
+//
+// class FooTest : public testing::Test {
+// protected:
+// void SetUp() override { b_.AddElement(3); }
+//
+// Foo a_;
+// Foo b_;
+// };
+//
+// TEST_F(FooTest, InitializesCorrectly) {
+// EXPECT_TRUE(a_.StatusIsOK());
+// }
+//
+// TEST_F(FooTest, ReturnsElementCountCorrectly) {
+// EXPECT_EQ(a_.size(), 0);
+// EXPECT_EQ(b_.size(), 1);
+// }
+#define GTEST_TEST_F(test_fixture, test_name) \
+ GTEST_TEST_(test_fixture, test_name, test_fixture, \
+ ::testing::internal::GetTypeId<test_fixture>())
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
+
+// Returns a path to a temporary directory, trying to determine one that
+// is appropriate for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Dynamically registers a test with the framework.
+//
+// This is an advanced API only to be used when the `TEST` macros are
+// insufficient. The macros should be preferred when possible, as they avoid
+// most of the complexity of calling this function.
+//
+// The `factory` argument is a factory callable (move-constructible) object or
+// function pointer that creates a new instance of the Test object. It
+// hands ownership to the caller. The signature of the callable is
+// `Fixture*()`, where `Fixture` is the test fixture class for the test. All
+// tests registered with the same `test_suite_name` must return the same
+// fixture type. This is checked at runtime.
+//
+// The framework will infer the fixture class from the factory and will call
+// the `SetUpTestSuite` and `TearDownTestSuite` for it.
+//
+// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
+// undefined.
+//
+// Use case example:
+//
+// class MyFixture : public ::testing::Test {
+// public:
+// // All of these optional, just like in regular macro usage.
+// static void SetUpTestSuite() { ... }
+// static void TearDownTestSuite() { ... }
+// void SetUp() override { ... }
+// void TearDown() override { ... }
+// };
+//
+// class MyTest : public MyFixture {
+// public:
+// explicit MyTest(int data) : data_(data) {}
+// void TestBody() override { ... }
+//
+// private:
+// int data_;
+// };
+//
+// void RegisterMyTests(const std::vector<int>& values) {
+// for (int v : values) {
+// ::testing::RegisterTest(
+// "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
+// std::to_string(v).c_str(),
+// __FILE__, __LINE__,
+// // Important to use the fixture type as the return type here.
+// [=]() -> MyFixture* { return new MyTest(v); });
+// }
+// }
+// ...
+// int main(int argc, char** argv) {
+// ::testing::InitGoogleTest(&argc, argv);
+// std::vector<int> values_to_test = LoadValuesFromConfig();
+// RegisterMyTests(values_to_test);
+// ...
+// return RUN_ALL_TESTS();
+// }
+//
+template <int&... ExplicitParameterBarrier, typename Factory>
+TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
+                       const char* type_param, const char* value_param,
+                       const char* file, int line, Factory factory) {
+  // The fixture type is whatever the factory's returned pointer points to.
+  using FixtureT = typename std::remove_pointer<decltype(factory())>::type;
+  // Adapter that owns the user's factory and exposes it as TestFactoryBase.
+  class OwningFactory : public internal::TestFactoryBase {
+   public:
+    explicit OwningFactory(Factory fn) : make_test_(std::move(fn)) {}
+    Test* CreateTest() override { return make_test_(); }
+   private:
+    Factory make_test_;
+  };
+  // Registration receives a heap-allocated adapter holding the moved factory.
+  return internal::MakeAndRegisterTestInfo(
+      test_suite_name, test_name, type_param, value_param,
+      internal::CodeLocation(file, line), internal::GetTypeId<FixtureT>(),
+      internal::SuiteApiResolver<FixtureT>::GetSetUpCaseOrSuite(file, line),
+      internal::SuiteApiResolver<FixtureT>::GetTearDownCaseOrSuite(file, line),
+      new OwningFactory{std::move(factory)});
+}
+
+} // namespace testing
+
+// Runs all tests; call it from main() after InitGoogleTest() has parsed the
+// command line. Returns 0 if all tests are successful, or 1 otherwise.
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  auto* const unit_test = ::testing::UnitTest::GetInstance();
+  return unit_test->Run();
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
new file mode 100644
index 0000000000..47a24aa687
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
@@ -0,0 +1,279 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements a family of generic predicate assertion macros.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+// ASSERT_PRED_FORMAT1(pred_format, v1)
+// ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+// ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult. See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+// ASSERT_PRED1(pred, v1)
+// ASSERT_PRED2(pred, v1, v2)
+// ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
+// Core of every assertion here: binds (expression) to an AssertionResult and
+// passes its failure_message() to on_failure if false. Not for direct use.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar = (expression)) \
+ ; \
+ else \
+ on_failure(gtest_ar.failure_message())
+
+// Backs {EXPECT|ASSERT}_PRED1: succeeds when pred(v1) holds, else returns a
+// failure naming the predicate and v1's printed value. Internal use only.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+                                  Pred pred, const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+  AssertionResult failure = AssertionFailure();
+  failure << pred_text << "(" << e1 << ") evaluates to false, where"
+          << "\n"
+          << e1 << " evaluates to " << ::testing::PrintToString(v1);
+  return failure;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT1: calls pred_format with
+// the source text of v1 followed by its value. Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
+
+// Internal macro behind {EXPECT|ASSERT}_PRED1: wraps the bool predicate in
+// AssertPred1Helper to build the message. Don't use this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
+
+// Unary predicate assertions; EXPECT_* is non-fatal and ASSERT_* is fatal.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+// Backs {EXPECT|ASSERT}_PRED2: succeeds when pred(v1, v2) holds, else returns
+// a failure listing each argument's printed value. Internal use only.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+                                  const char* e2, Pred pred, const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+  AssertionResult failure = AssertionFailure();
+  failure << pred_text << "(" << e1 << ", " << e2
+          << ") evaluates to false, where"
+          << "\n"
+          << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+          << e2 << " evaluates to " << ::testing::PrintToString(v2);
+  return failure;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT2: calls pred_format with
+// the arguments' source text, then their values. Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
+
+// Internal macro behind {EXPECT|ASSERT}_PRED2: wraps the bool predicate in
+// AssertPred2Helper to build the message. Don't use this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+ on_failure)
+
+// Binary predicate assertions; EXPECT_* is non-fatal and ASSERT_* is fatal.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+ GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+ GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+ GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+ GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+// Backs {EXPECT|ASSERT}_PRED3: succeeds when pred(v1, v2, v3) holds, else
+// returns a failure listing each argument's printed value. Internal use only.
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+  AssertionResult failure = AssertionFailure();
+  failure << pred_text << "(" << e1 << ", " << e2 << ", " << e3
+          << ") evaluates to false, where"
+          << "\n"
+          << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+          << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+          << e3 << " evaluates to " << ::testing::PrintToString(v3);
+  return failure;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT3: calls pred_format with
+// the arguments' source text, then their values. Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
+
+// Internal macro behind {EXPECT|ASSERT}_PRED3: wraps the bool predicate in
+// AssertPred3Helper to build the message. Don't use this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_( \
+ ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+ on_failure)
+
+// Ternary predicate assertions; EXPECT_* is non-fatal and ASSERT_* is fatal.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+ GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+ GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+ GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+ GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+// Backs {EXPECT|ASSERT}_PRED4: succeeds when pred(v1, ..., v4) holds, else
+// returns a failure listing each argument's printed value. Internal use only.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, Pred pred, const T1& v1,
+                                  const T2& v2, const T3& v3, const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+  AssertionResult failure = AssertionFailure();
+  failure << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+          << ") evaluates to false, where"
+          << "\n"
+          << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+          << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+          << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+          << e4 << " evaluates to " << ::testing::PrintToString(v4);
+  return failure;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT4: calls pred_format with
+// the arguments' source text, then their values. Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
+
+// Internal macro behind {EXPECT|ASSERT}_PRED4: wraps the bool predicate in
+// AssertPred4Helper to build the message. Don't use this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+ v1, v2, v3, v4), \
+ on_failure)
+
+// 4-ary predicate assertions; EXPECT_* is non-fatal and ASSERT_* is fatal.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+ GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+ GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+// Backs {EXPECT|ASSERT}_PRED5: succeeds when pred(v1, ..., v5) holds, else
+// returns a failure listing each argument's printed value. Internal use only.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, const char* e5, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3,
+                                  const T4& v4, const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+  AssertionResult failure = AssertionFailure();
+  failure << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+          << ", " << e5 << ") evaluates to false, where"
+          << "\n"
+          << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+          << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+          << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+          << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
+          << e5 << " evaluates to " << ::testing::PrintToString(v5);
+  return failure;
+}
+
+// Internal macro behind {EXPECT|ASSERT}_PRED_FORMAT5: calls pred_format with
+// the arguments' source text, then their values. Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+ on_failure)
+
+// Internal macro behind {EXPECT|ASSERT}_PRED5: wraps the bool predicate in
+// AssertPred5Helper to build the message. Don't use this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+ pred, v1, v2, v3, v4, v5), \
+ on_failure)
+
+// 5-ary predicate assertions; EXPECT_* is non-fatal and ASSERT_* is fatal.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+ GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+ GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
new file mode 100644
index 0000000000..1f37dc31c3
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -0,0 +1,60 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the class
+// (it befriends class <test_case_name>_<test_name>_Test). For example:
+//
+// class MyClass {
+// private:
+// void PrivateMethod();
+// FRIEND_TEST(MyClassTest, PrivateMethodWorks);
+// };
+//
+// class MyClassTest : public testing::Test {
+// // ...
+// };
+//
+// TEST_F(MyClassTest, PrivateMethodWorks) {
+// // Can call MyClass::PrivateMethod() here.
+// }
+//
+// Note: The test class must be in the same namespace as the class being tested.
+// For example, putting MyClassTest in an anonymous namespace will not work.
+
+#define FRIEND_TEST(test_case_name, test_name) \
+ friend class test_case_name##_##test_name##_Test
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
new file mode 100644
index 0000000000..cb49e2c754
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md
@@ -0,0 +1,44 @@
+# Customization Points
+
+The custom directory is an injection point for custom user configurations.
+
+## Header `gtest.h`
+
+The following macros can be defined:
+
+* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of
+ `OsStackTraceGetterInterface`.
+* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See
+ `testing::TempDir` for semantics and signature.
+
+## Header `gtest-port.h`
+
+The following macros can be defined:
+
+### Logging:
+
+* `GTEST_LOG_(severity)`
+* `GTEST_CHECK_(condition)`
+* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too.
+
+### Threading:
+
+* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided.
+* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal`
+ are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)`
+ and `GTEST_DEFINE_STATIC_MUTEX_(mutex)`
+* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)`
+* `GTEST_LOCK_EXCLUDED_(locks)`
+
+### Underlying library support features
+
+* `GTEST_HAS_CXXABI_H_`
+
+### Exporting API symbols:
+
+* `GTEST_API_` - Specifier for exported symbols.
+
+## Header `gtest-printers.h`
+
+* See documentation at `gtest/gtest-printers.h` for details on how to define a
+ custom printer.
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
new file mode 100644
index 0000000000..9b7fb4261a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
@@ -0,0 +1,68 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+ Notification() = delete;
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
+ // void Notify();  // Left undeclared on purpose: callers must not compile.
+ void WaitForNotification() {}
+};
+} // namespace internal
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
new file mode 100644
index 0000000000..b9495d8378
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
@@ -0,0 +1,42 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// This file provides an injection point for custom printers in a local
+// installation of gTest.
+// It will be included from gtest-printers.h and the overrides in this file
+// will be visible to everyone.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
new file mode 100644
index 0000000000..afaaf17ba2
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
@@ -0,0 +1,37 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
new file mode 100644
index 0000000000..45580ae805
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -0,0 +1,306 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests. They are subject to change without notice.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+#include <stdio.h>
+
+#include <memory>
+
+#include "gtest/gtest-matchers.h"
+#include "gtest/internal/gtest-internal.h"
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+namespace testing {
+namespace internal {
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status: The integer exit information in the format specified
+// by wait(2)
+// exit code: The integer code passed to exit(3), _exit(2), or
+// returned from main()
+class GTEST_API_ DeathTest {
+ public:
+ // Create returns false if there was an error determining the
+ // appropriate action to take for the current death test; for example,
+ // if the gtest_death_test_style flag is set to an invalid value.
+ // The LastMessage method will return a more detailed message in that
+ // case. Otherwise, the DeathTest pointer pointed to by the "test"
+ // argument is set. If the death test should be skipped, the pointer
+ // is set to NULL; otherwise, it is set to the address of a new concrete
+ // DeathTest object that controls the execution of the current test.
+ static bool Create(const char* statement, Matcher<const std::string&> matcher,
+ const char* file, int line, DeathTest** test);
+ DeathTest();
+ virtual ~DeathTest() {}
+
+ // A helper class whose destructor aborts the death test it was given.
+ class ReturnSentinel {
+ public:
+ explicit ReturnSentinel(DeathTest* test) : test_(test) {}
+ ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
+ private:
+ DeathTest* const test_;  // Not owned; never deleted by the sentinel.
+ ReturnSentinel(const ReturnSentinel&) = delete;
+ ReturnSentinel& operator=(const ReturnSentinel&) = delete;
+ } GTEST_ATTRIBUTE_UNUSED_;
+
+ // An enumeration of possible roles that may be taken when a death
+ // test is encountered. EXECUTE_TEST means that the death test logic should
+ // be executed immediately. OVERSEE_TEST means that the program should
+ // prepare the appropriate environment for a child process to execute the
+ // death test, then wait for it to complete.
+ enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+ // An enumeration of the three reasons that a test might be aborted.
+ enum AbortReason {
+ TEST_ENCOUNTERED_RETURN_STATEMENT,  // Passed to Abort() by ~ReturnSentinel().
+ TEST_THREW_EXCEPTION,  // Passed by GTEST_EXECUTE_DEATH_TEST_STATEMENT_.
+ TEST_DID_NOT_DIE  // Passed by GTEST_DEATH_TEST_ once the statement completes.
+ };
+
+ // Assumes one of the above roles.
+ virtual TestRole AssumeRole() = 0;
+
+ // Waits for the death test to finish and returns its status.
+ virtual int Wait() = 0;
+
+ // Returns true if the death test passed; that is, the test process
+ // exited during the test, its exit status matches a user-supplied
+ // predicate, and its stderr output matches a user-supplied regular
+ // expression.
+ // The user-supplied predicate may be a macro expression rather
+ // than a function pointer or functor, or else Wait and Passed could
+ // be combined.
+ virtual bool Passed(bool exit_status_ok) = 0;
+
+ // Signals that the death test did not die as expected.
+ virtual void Abort(AbortReason reason) = 0;
+
+ // Returns a human-readable message describing the outcome of
+ // the last death test.
+ static const char* LastMessage();
+
+ static void set_last_death_test_message(const std::string& message);
+
+ private:
+ // A string containing a description of the outcome of the last death test.
+ static std::string last_death_test_message_;
+
+ DeathTest(const DeathTest&) = delete;  // Not copyable.
+ DeathTest& operator=(const DeathTest&) = delete;  // Not assignable.
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+// Factory interface for death tests. May be mocked out for testing.
+class DeathTestFactory {
+ public:
+ virtual ~DeathTestFactory() {}
+ virtual bool Create(const char* statement,
+ Matcher<const std::string&> matcher, const char* file,
+ int line, DeathTest** test) = 0;  // Same parameters as DeathTest::Create.
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+ bool Create(const char* statement, Matcher<const std::string&> matcher,
+ const char* file, int line, DeathTest** test) override;  // No inline body.
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// A string or RE passed to EXPECT_DEATH (etc.) is caught by one of these
+// overloads and interpreted as a regex (rather than an Eq matcher) for legacy
+// compatibility.
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+ ::testing::internal::RE regex) {
+ return ContainsRegex(regex.pattern());  // Match on the RE's pattern text.
+}
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(const char* regex) {
+ return ContainsRegex(regex);
+}
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+ const ::std::string& regex) {
+ return ContainsRegex(regex);
+}
+
+// If a Matcher<const ::std::string&> is passed to EXPECT_DEATH (etc.), it's
+// used directly.
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+ Matcher<const ::std::string&> matcher) {
+ return matcher;  // Passed through unchanged.
+}
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (const ::std::exception& gtest_exception) { \
+ fprintf( \
+ stderr, \
+ "\n%s: Caught std::exception-derived exception escaping the " \
+ "death test statement. Exception message: %s\n", \
+ ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+ gtest_exception.what()); \
+ fflush(stderr); \
+ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+ } catch (...) { \
+ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+ }
+
+#else  // !GTEST_HAS_EXCEPTIONS: run the statement with no try/catch wrapper.
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// Implements ASSERT_DEATH*, EXPECT_DEATH*, ASSERT_EXIT*, and EXPECT_EXIT*.
+// On any failure, control jumps to the trailing label, which invokes `fail`.
+#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ ::testing::internal::DeathTest* gtest_dt; \
+ if (!::testing::internal::DeathTest::Create( \
+ #statement, \
+ ::testing::internal::MakeDeathTestMatcher(regex_or_matcher), \
+ __FILE__, __LINE__, &gtest_dt)) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+ } \
+ if (gtest_dt != nullptr) { \
+ std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
+ switch (gtest_dt->AssumeRole()) { \
+ case ::testing::internal::DeathTest::OVERSEE_TEST: \
+ if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+ } \
+ break; \
+ case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+ ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel( \
+ gtest_dt); \
+ GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+ gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+ break; \
+ } \
+ } \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__) \
+ : fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// Implements ASSERT/EXPECT_DEBUG_DEATH when compiled in NDEBUG mode. The
+// statement must still be executed, and the macro must accept a streamed
+// message even though that message is never printed. regex_or_matcher is
+// not evaluated, but referencing it prevents "unused" warnings and rejects
+// an expression that wouldn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } else if (!::testing::internal::AlwaysTrue()) { \
+ ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \
+ } else \
+ ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+ InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
+ int a_write_fd)
+ : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
+
+ ~InternalRunDeathTestFlag() {  // Closes write_fd_ unless it is negative.
+ if (write_fd_ >= 0) posix::Close(write_fd_);
+ }
+
+ const std::string& file() const { return file_; }
+ int line() const { return line_; }
+ int index() const { return index_; }
+ int write_fd() const { return write_fd_; }
+
+ private:
+ std::string file_;
+ int line_;
+ int index_;
+ int write_fd_;  // Closed by the destructor when non-negative.
+
+ InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+ InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#endif // GTEST_HAS_DEATH_TEST
+
+} // namespace internal
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
new file mode 100644
index 0000000000..a2a60a962b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
@@ -0,0 +1,210 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test. They are subject to change without notice.
+//
+// This file is #included in gtest/internal/gtest-internal.h.
+// Do not include this header file separately!
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+ FilePath() : pathname_("") {}
+ FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
+
+ explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+ Normalize();  // Collapses repeated separators; see Normalize() below.
+ }
+
+ FilePath& operator=(const FilePath& rhs) {
+ Set(rhs);
+ return *this;
+ }
+
+ void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }  // Plain copy.
+
+ const std::string& string() const { return pathname_; }
+ const char* c_str() const { return pathname_.c_str(); }
+
+ // Returns the current working directory, or "" if unsuccessful.
+ static FilePath GetCurrentDir();
+
+ // Given directory = "dir", base_name = "test", number = 0,
+ // extension = "xml", returns "dir/test.xml". If number is greater
+ // than zero (e.g., 12), returns "dir/test_12.xml".
+ // On Windows platform, uses \ as the separator rather than /.
+ static FilePath MakeFileName(const FilePath& directory,
+ const FilePath& base_name, int number,
+ const char* extension);
+
+ // Given directory = "dir", relative_path = "test.xml",
+ // returns "dir/test.xml".
+ // On Windows, uses \ as the separator rather than /.
+ static FilePath ConcatPaths(const FilePath& directory,
+ const FilePath& relative_path);
+
+ // Returns a pathname for a file that does not currently exist. The pathname
+ // will be directory/base_name.extension or
+ // directory/base_name_<number>.extension if directory/base_name.extension
+ // already exists. The number will be incremented until a pathname is found
+ // that does not already exist.
+ // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+ // There could be a race condition if two or more processes are calling this
+ // function at the same time -- they could both pick the same filename.
+ static FilePath GenerateUniqueFileName(const FilePath& directory,
+ const FilePath& base_name,
+ const char* extension);
+
+ // Returns true if and only if the path is "".
+ bool IsEmpty() const { return pathname_.empty(); }
+
+ // If input name has a trailing separator character, removes it and returns
+ // the name, otherwise returns the name string unmodified.
+ // On Windows platform, uses \ as the separator, other platforms use /.
+ FilePath RemoveTrailingPathSeparator() const;
+
+ // Returns a copy of the FilePath with the directory part removed.
+ // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+ // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+ // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+ // returns an empty FilePath ("").
+ // On Windows platform, '\' is the path separator, otherwise it is '/'.
+ FilePath RemoveDirectoryName() const;
+
+ // RemoveFileName returns the directory path with the filename removed.
+ // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+ // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+ // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+ // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+ // On Windows platform, '\' is the path separator, otherwise it is '/'.
+ FilePath RemoveFileName() const;
+
+ // Returns a copy of the FilePath with the case-insensitive extension removed.
+ // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+ // FilePath("dir/file"). If a case-insensitive extension is not
+ // found, returns a copy of the original FilePath.
+ FilePath RemoveExtension(const char* extension) const;
+
+ // Creates directories so that path exists. Returns true if successful or if
+ // the directories already exist; returns false if unable to create
+ // directories for any reason. Will also return false if the FilePath does
+ // not represent a directory (that is, it doesn't end with a path separator).
+ bool CreateDirectoriesRecursively() const;
+
+ // Creates the directory so that path exists. Returns true if successful or
+ // if the directory already exists; returns false if unable to create the
+ // directory for any reason, including if the parent directory does not
+ // exist. Not named "CreateDirectory" because that's a macro on Windows.
+ bool CreateFolder() const;
+
+ // Returns true if FilePath describes something in the file-system,
+ // either a file, directory, or whatever, and that something exists.
+ bool FileOrDirectoryExists() const;
+
+ // Returns true if pathname describes a directory in the file-system
+ // that exists.
+ bool DirectoryExists() const;
+
+ // Returns true if FilePath ends with a path separator, which indicates that
+ // it is intended to represent a directory. Returns false otherwise.
+ // This does NOT check that a directory (or file) actually exists.
+ bool IsDirectory() const;
+
+ // Returns true if pathname describes a root directory. (Windows has one
+ // root directory per disk drive.)
+ bool IsRootDirectory() const;
+
+ // Returns true if pathname describes an absolute path.
+ bool IsAbsolutePath() const;
+
+ private:
+ // Replaces multiple consecutive separators with a single separator.
+ // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+ // redundancies that might be in a pathname involving "." or "..".
+ //
+ // A pathname with multiple consecutive separators may occur either through
+ // user error or as a result of some scripts or APIs that generate a pathname
+ // with a trailing separator. On other platforms the same API or script
+ // may NOT generate a pathname with a trailing "/". Then elsewhere that
+ // pathname may have another "/" and pathname components added to it,
+ // without checking for the separator already being there.
+ // The script language and operating system may allow paths like "foo//bar"
+ // but some of the functions in FilePath will not handle that correctly. In
+ // particular, RemoveTrailingPathSeparator() only removes one separator, and
+ // it is called in CreateDirectoriesRecursively() assuming that it will change
+ // a pathname from directory syntax (trailing separator) to filename syntax.
+ //
+ // On Windows this method also replaces the alternate path separator '/' with
+ // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+ // "bar\\foo".
+
+ void Normalize();
+
+ // Returns a pointer to the last occurrence of a valid path separator in
+ // the FilePath. On Windows, for example, both '/' and '\' are valid path
+ // separators. Returns NULL if no path separator was found.
+ const char* FindLastPathSeparator() const;
+
+ std::string pathname_;  // The stored path text.
+}; // class FilePath
+
+} // namespace internal
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
new file mode 100644
index 0000000000..9b04e4c85f
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -0,0 +1,1570 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test. They are subject to change without notice.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_LINUX
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#endif // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__. Writing
+//
+// foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number. For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
+
+// Stringifies its argument.
+// Work around a bug in visual studio which doesn't accept code like this:
+//
+// #define GTEST_STRINGIFY_(name) #name
+// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+// MACRO(, x, y)
+//
+// Complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
+
+namespace proto2 {
+class MessageLite;
+}
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult; // Result of an assertion.
+class Message; // Represents a failure message.
+class Test; // Represents a test.
+class TestInfo; // Information about a test.
+class TestPartResult; // Result of a test part.
+class UnitTest; // A collection of test suites.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo; // Information about a trace point.
+class TestInfoImpl; // Opaque implementation of TestInfo
+class UnitTestImpl; // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// An IgnoredValue object can be implicitly constructed from ANY value.
+class IgnoredValue {
+ struct Sink {};
+
+ public:
+ // This constructor template allows any value to be implicitly
+ // converted to IgnoredValue. The object has no data member and
+ // doesn't try to remember anything about the argument. We
+ // deliberately omit the 'explicit' keyword in order to allow the
+ // conversion to be implicit.
+ // Disable the conversion if T already has a magical conversion operator.
+ // Otherwise we get ambiguity.
+ template <typename T,
+ typename std::enable_if<!std::is_convertible<T, Sink>::value,
+ int>::type = 0>
+ IgnoredValue(const T& /* ignored */) {} // NOLINT(runtime/explicit)
+};
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+ const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4275 /* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled). We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time. Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+ explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4275
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+ const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+ const std::vector<std::string>& left,
+ const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+ const std::vector<std::string>& right,
+ size_t context = 2);
+
+} // namespace edit_distance
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+// expected_expression: "foo"
+// actual_expression: "bar"
+// expected_value: "5"
+// actual_value: "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+ const char* actual_expression,
+ const std::string& expected_value,
+ const std::string& actual_value,
+ bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison. (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly. Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+// The most-significant bit being the leftmost, an IEEE
+// floating-point looks like
+//
+// sign_bit exponent_bits fraction_bits
+//
+// Here, sign_bit is a single bit that designates the sign of the
+// number.
+//
+// For float, there are 8 exponent bits and 23 fraction bits.
+//
+// For double, there are 11 exponent bits and 52 fraction bits.
+//
+// More details can be found at
+// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+// RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+ // Defines the unsigned integer type that has the same size as the
+ // floating point number.
+ typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+ // Constants.
+
+ // # of bits in a number.
+ static const size_t kBitCount = 8 * sizeof(RawType);
+
+ // # of fraction bits in a number.
+ static const size_t kFractionBitCount =
+ std::numeric_limits<RawType>::digits - 1;
+
+ // # of exponent bits in a number.
+ static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+ // The mask for the sign bit.
+ static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+ // The mask for the fraction bits.
+ static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+ (kExponentBitCount + 1);
+
+ // The mask for the exponent bits.
+ static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+ // How many ULP's (Units in the Last Place) we want to tolerate when
+ // comparing two numbers. The larger the value, the more error we
+ // allow. A 0 value means that two numbers must be exactly the same
+ // to be considered equal.
+ //
+ // The maximum error of a single floating-point operation is 0.5
+ // units in the last place. On Intel CPU's, all floating-point
+ // calculations are done with 80-bit precision, while double has 64
+ // bits. Therefore, 4 should be enough for ordinary use.
+ //
+ // See the following article for more details on ULP:
+ // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+ static const uint32_t kMaxUlps = 4;
+
+ // Constructs a FloatingPoint from a raw floating-point number.
+ //
+ // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+ // around may change its bits, although the new value is guaranteed
+ // to be also a NAN. Therefore, don't expect this constructor to
+ // preserve the bits in x when x is a NAN.
+ explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+ // Static methods
+
+ // Reinterprets a bit pattern as a floating-point number.
+ //
+ // This function is needed to test the AlmostEquals() method.
+ static RawType ReinterpretBits(const Bits bits) {
+ FloatingPoint fp(0);
+ fp.u_.bits_ = bits;
+ return fp.u_.value_;
+ }
+
+ // Returns the floating-point number that represents positive infinity.
+ static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
+
+ // Returns the maximum representable finite floating-point number.
+ static RawType Max();
+
+ // Non-static methods
+
+ // Returns the bits that represent this number.
+ const Bits& bits() const { return u_.bits_; }
+
+ // Returns the exponent bits of this number.
+ Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+ // Returns the fraction bits of this number.
+ Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+ // Returns the sign bit of this number.
+ Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+ // Returns true if and only if this is NAN (not a number).
+ bool is_nan() const {
+ // It's a NAN if the exponent bits are all ones and the fraction
+ // bits are not entirely zeros.
+ return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+ }
+
+ // Returns true if and only if this number is at most kMaxUlps ULP's away
+ // from rhs. In particular, this function:
+ //
+ // - returns false if either number is (or both are) NAN.
+ // - treats really large numbers as almost equal to infinity.
+ // - thinks +0.0 and -0.0 are 0 ULP's apart.
+ bool AlmostEquals(const FloatingPoint& rhs) const {
+ // The IEEE standard says that any comparison operation involving
+ // a NAN must return false.
+ if (is_nan() || rhs.is_nan()) return false;
+
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+ kMaxUlps;
+ }
+
+ private:
+ // The data type used to store the actual floating-point number.
+ union FloatingPointUnion {
+ RawType value_; // The raw floating-point number.
+ Bits bits_; // The bits that represent the number.
+ };
+
+ // Converts an integer from the sign-and-magnitude representation to
+ // the biased representation. More precisely, let N be 2 to the
+ // power of (kBitCount - 1), an integer x is represented by the
+ // unsigned number x + N.
+ //
+ // For instance,
+ //
+ // -N + 1 (the most negative number representable using
+ // sign-and-magnitude) is represented by 1;
+ // 0 is represented by N; and
+ // N - 1 (the biggest number representable using
+ // sign-and-magnitude) is represented by 2N - 1.
+ //
+ // Read http://en.wikipedia.org/wiki/Signed_number_representations
+ // for more details on signed number representations.
+ static Bits SignAndMagnitudeToBiased(const Bits& sam) {
+ if (kSignBitMask & sam) {
+ // sam represents a negative number.
+ return ~sam + 1;
+ } else {
+ // sam represents a positive number.
+ return kSignBitMask | sam;
+ }
+ }
+
+ // Given two numbers in the sign-and-magnitude representation,
+ // returns the distance between them as an unsigned number.
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+ const Bits& sam2) {
+ const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+ const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+ return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+ }
+
+ FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() {
+ return FLT_MAX;
+}
+template <>
+inline double FloatingPoint<double>::Max() {
+ return DBL_MAX;
+}
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test suite, we need to assign
+// unique IDs to fixture classes and compare them. The TypeId type is
+// used to hold such IDs. The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+ // dummy_ must not have a const type. Otherwise an overly eager
+ // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+ // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+ static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T. Different values will be
+// returned for different types. Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+ // The compiler is required to allocate a different
+ // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+ // the template. Therefore, the address of dummy_ is guaranteed to
+ // be unique.
+ return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test. Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+ virtual ~TestFactoryBase() {}
+
+ // Creates a test instance to run. The instance is both created and destroyed
+ // within TestInfoImpl::Run()
+ virtual Test* CreateTest() = 0;
+
+ protected:
+ TestFactoryBase() {}
+
+ private:
+ TestFactoryBase(const TestFactoryBase&) = delete;
+ TestFactoryBase& operator=(const TestFactoryBase&) = delete;
+};
+
+// This class provides implementation of TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+ Test* CreateTest() override { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+ long hr); // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+ long hr); // NOLINT
+
+#endif // GTEST_OS_WINDOWS
+
+// Types of SetUpTestSuite() and TearDownTestSuite() functions.
+using SetUpTestSuiteFunc = void (*)();
+using TearDownTestSuiteFunc = void (*)();
+
+struct CodeLocation {
+ CodeLocation(const std::string& a_file, int a_line)
+ : file(a_file), line(a_line) {}
+
+ std::string file;
+ int line;
+};
+
+// Helper to identify which setup function for TestCase / TestSuite to call.
+// Only one function is allowed, either TestCase or TestSuite but not both.
+
+// Utility functions to help SuiteApiResolver
+using SetUpTearDownSuiteFuncType = void (*)();
+
+inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
+ SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
+ return a == def ? nullptr : a;
+}
+
+template <typename T>
+// Note that SuiteApiResolver inherits from T because
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
+// SuiteApiResolver can access them.
+struct SuiteApiResolver : T {
+ // testing::Test is only forward declared at this point. So we make it a
+ // dependent class for the compiler to be OK with it.
+ using Test =
+ typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
+
+ static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
+ int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ SetUpTearDownSuiteFuncType test_case_fp =
+ GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
+ SetUpTearDownSuiteFuncType test_suite_fp =
+ GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite);
+
+ GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+ << "Test can not provide both SetUpTestSuite and SetUpTestCase, please "
+ "make sure there is only one present at "
+ << filename << ":" << line_num;
+
+ return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+ (void)(filename);
+ (void)(line_num);
+ return &T::SetUpTestSuite;
+#endif
+ }
+
+ static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
+ int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ SetUpTearDownSuiteFuncType test_case_fp =
+ GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
+ SetUpTearDownSuiteFuncType test_suite_fp =
+ GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite);
+
+ GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+ << "Test can not provide both TearDownTestSuite and TearDownTestCase,"
+ " please make sure there is only one present at "
+ << filename << ":" << line_num;
+
+ return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+ (void)(filename);
+ (void)(line_num);
+ return &T::TearDownTestSuite;
+#endif
+ }
+};
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+// test_suite_name: name of the test suite
+// name: name of the test
+// type_param: the name of the test's type parameter, or NULL if
+// this is not a typed or a type-parameterized test.
+// value_param: text representation of the test's value parameter,
+// or NULL if this is not a value-parameterized test.
+// code_location: code location where the test is defined
+// fixture_class_id: ID of the test fixture class
+// set_up_tc: pointer to the function that sets up the test suite
+// tear_down_tc: pointer to the function that tears down the test suite
+// factory: pointer to the factory that creates a test object.
+// The newly created TestInfo instance will assume
+// ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+ const char* test_suite_name, const char* name, const char* type_param,
+ const char* value_param, CodeLocation code_location,
+ TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+ TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false. None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// State of the definition of a type-parameterized test suite.
+class GTEST_API_ TypedTestSuitePState {
+ public:
+ TypedTestSuitePState() : registered_(false) {}
+
+ // Adds the given test name to registered_tests_ and returns true
+ // if the test suite hasn't been registered; otherwise aborts the
+ // program.
+ bool AddTestName(const char* file, int line, const char* case_name,
+ const char* test_name) {
+ if (registered_) {
+ fprintf(stderr,
+ "%s Test %s must be defined before "
+ "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n",
+ FormatFileLocation(file, line).c_str(), test_name, case_name);
+ fflush(stderr);
+ posix::Abort();
+ }
+ registered_tests_.insert(
+ ::std::make_pair(test_name, CodeLocation(file, line)));
+ return true;
+ }
+
+ bool TestExists(const std::string& test_name) const {
+ return registered_tests_.count(test_name) > 0;
+ }
+
+ const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+ RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
+ GTEST_CHECK_(it != registered_tests_.end());
+ return it->second;
+ }
+
+ // Verifies that registered_tests match the test names in
+ // registered_tests_; returns registered_tests if successful, or
+ // aborts the program otherwise.
+ const char* VerifyRegisteredTestNames(const char* test_suite_name,
+ const char* file, int line,
+ const char* registered_tests);
+
+ private:
+ typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
+
+ bool registered_;
+ RegisteredTestsMap registered_tests_;
+};
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TypedTestCasePState = TypedTestSuitePState;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+ const char* comma = strchr(str, ',');
+ if (comma == nullptr) {
+ return nullptr;
+ }
+ while (IsSpace(*(++comma))) {
+ }
+ return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+ const char* comma = strchr(str, ',');
+ return comma == nullptr ? str : std::string(str, comma);
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.
+void SplitString(const ::std::string& str, char delimiter,
+ ::std::vector<::std::string>* dest);
+
+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+ template <typename T>
+ static std::string GetName(int i) {
+ return StreamableToString(i);
+ }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+ typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(internal::None, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+ result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+ GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+ i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+ std::vector<std::string> result;
+ GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+ return result;
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test. The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter. It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+ // 'index' is the index of the test in the type list 'Types'
+ // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite,
+ // Types). Valid values for 'index' are [0, N - 1] where N is the
+ // length of Types.
+ static bool Register(const char* prefix, const CodeLocation& code_location,
+ const char* case_name, const char* test_names, int index,
+ const std::vector<std::string>& type_names =
+ GenerateNames<DefaultNameGenerator, Types>()) {
+ typedef typename Types::Head Type;
+ typedef Fixture<Type> FixtureClass;
+ typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+ // First, registers the first type-parameterized test in the type
+ // list.
+ MakeAndRegisterTestInfo(
+ (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+ "/" + type_names[static_cast<size_t>(index)])
+ .c_str(),
+ StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+ GetTypeName<Type>().c_str(),
+ nullptr, // No value parameter.
+ code_location, GetTypeId<FixtureClass>(),
+ SuiteApiResolver<TestClass>::GetSetUpCaseOrSuite(
+ code_location.file.c_str(), code_location.line),
+ SuiteApiResolver<TestClass>::GetTearDownCaseOrSuite(
+ code_location.file.c_str(), code_location.line),
+ new TestFactoryImpl<TestClass>);
+
+ // Next, recurses (at compile time) with the tail of the type list.
+ return TypeParameterizedTest<Fixture, TestSel,
+ typename Types::Tail>::Register(prefix,
+ code_location,
+ case_name,
+ test_names,
+ index + 1,
+ type_names);
+ }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, internal::None> {
+ public:
+ static bool Register(const char* /*prefix*/, const CodeLocation&,
+ const char* /*case_name*/, const char* /*test_names*/,
+ int /*index*/,
+ const std::vector<std::string>& =
+ std::vector<std::string>() /*type_names*/) {
+ return true;
+ }
+};
+
+GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+ CodeLocation code_location);
+GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
+ const char* case_name);
+
+// TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test. The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestSuite {
+ public:
+ static bool Register(const char* prefix, CodeLocation code_location,
+ const TypedTestSuitePState* state, const char* case_name,
+ const char* test_names,
+ const std::vector<std::string>& type_names =
+ GenerateNames<DefaultNameGenerator, Types>()) {
+ RegisterTypeParameterizedTestSuiteInstantiation(case_name);
+ std::string test_name =
+ StripTrailingSpaces(GetPrefixUntilComma(test_names));
+ if (!state->TestExists(test_name)) {
+ fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
+ case_name, test_name.c_str(),
+ FormatFileLocation(code_location.file.c_str(), code_location.line)
+ .c_str());
+ fflush(stderr);
+ posix::Abort();
+ }
+ const CodeLocation& test_location = state->GetCodeLocation(test_name);
+
+ typedef typename Tests::Head Head;
+
+ // First, register the first test in 'Tests' for each type in 'Types'.
+ TypeParameterizedTest<Fixture, Head, Types>::Register(
+ prefix, test_location, case_name, test_names, 0, type_names);
+
+ // Next, recurses (at compile time) with the tail of the test list.
+ return TypeParameterizedTestSuite<Fixture, typename Tests::Tail,
+ Types>::Register(prefix, code_location,
+ state, case_name,
+ SkipComma(test_names),
+ type_names);
+ }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
+ public:
+ static bool Register(const char* /*prefix*/, const CodeLocation&,
+ const TypedTestSuitePState* /*state*/,
+ const char* /*case_name*/, const char* /*test_names*/,
+ const std::vector<std::string>& =
+ std::vector<std::string>() /*type_names*/) {
+ return true;
+ }
+};
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag. The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+ int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+ ConstCharPtr(const char* str) : value(str) {}
+ operator bool() const { return true; }
+ const char* value;
+};
+
+// Helper for declaring std::string within 'if' statement
+// in pre C++17 build environment.
+struct TrueWithString {
+ TrueWithString() = default;
+ explicit TrueWithString(const char* str) : value(str) {}
+ explicit TrueWithString(const std::string& str) : value(str) {}
+ explicit operator bool() const { return true; }
+ std::string value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution. Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code). Unlike rand_r(), it's portable. An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+ static const uint32_t kMaxRange = 1u << 31;
+
+ explicit Random(uint32_t seed) : state_(seed) {}
+
+ void Reseed(uint32_t seed) { state_ = seed; }
+
+ // Generates a random number from [0, range). Crashes if 'range' is
+ // 0 or greater than kMaxRange.
+ uint32_t Generate(uint32_t range);
+
+ private:
+ uint32_t state_;
+ Random(const Random&) = delete;
+ Random& operator=(const Random&) = delete;
+};
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+ typename std::remove_const<typename std::remove_reference<T>::type>::type
+
+// HasDebugStringAndShortDebugString<T>::value is a compile-time bool constant
+// that's true if and only if T has methods DebugString() and ShortDebugString()
+// that return std::string.
+template <typename T>
+class HasDebugStringAndShortDebugString {
+ private:
+ template <typename C>
+ static auto CheckDebugString(C*) -> typename std::is_same<
+ std::string, decltype(std::declval<const C>().DebugString())>::type;
+ template <typename>
+ static std::false_type CheckDebugString(...);
+
+ template <typename C>
+ static auto CheckShortDebugString(C*) -> typename std::is_same<
+ std::string, decltype(std::declval<const C>().ShortDebugString())>::type;
+ template <typename>
+ static std::false_type CheckShortDebugString(...);
+
+ using HasDebugStringType = decltype(CheckDebugString<T>(nullptr));
+ using HasShortDebugStringType = decltype(CheckShortDebugString<T>(nullptr));
+
+ public:
+ static constexpr bool value =
+ HasDebugStringType::value && HasShortDebugStringType::value;
+};
+
+template <typename T>
+constexpr bool HasDebugStringAndShortDebugString<T>::value;
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them). It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0. If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked. Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11, we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator'). If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+template <class C,
+ class Iterator = decltype(::std::declval<const C&>().begin()),
+ class = decltype(::std::declval<const C&>().end()),
+ class = decltype(++::std::declval<Iterator&>()),
+ class = decltype(*::std::declval<Iterator>()),
+ class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+ return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) {
+ return '\0';
+}
+
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+ template <typename U>
+ static char test(typename U::hasher*, typename U::reverse_iterator*);
+ template <typename U>
+ static int test(typename U::hasher*, ...);
+ template <typename U>
+ static char test(...);
+
+ public:
+ static const bool value = sizeof(test<T>(nullptr, nullptr)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename C,
+ bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer)>
+struct IsRecursiveContainerImpl;
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, false> : public std::false_type {};
+
+// Since IsRecursiveContainerImpl depends on IsContainerTest, we need to
+// obey the same inconsistencies as IsContainerTest: checking whether
+// something is a container relies only on const_iterator in C++11 and
+// relies on both const_iterator and iterator otherwise.
+template <typename C>
+struct IsRecursiveContainerImpl<C, true> {
+ using value_type = decltype(*std::declval<typename C::const_iterator>());
+ using type =
+ std::is_same<typename std::remove_const<
+ typename std::remove_reference<value_type>::type>::type,
+ C>;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0. When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+ return lhs == rhs;
+}
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
+ return internal::ArrayEq(lhs, N, rhs);
+}
+
+// Pointer-and-size worker shared by every array size, so that different N
+// values do not each instantiate their own copy of the comparison loop.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+ size_t matched = 0;  // Count of leading element pairs that compared equal.
+ while (matched != size && internal::ArrayEq(lhs[matched], rhs[matched])) {
+ ++matched;
+ }
+ return matched == size;
+}
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem. Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+ for (Iter it = begin; it != end; ++it) {
+ if (internal::ArrayEq(*it, elem)) return it;
+ }
+ return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0. When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) {
+ *to = from;
+}
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
+ internal::CopyArray(from, N, *to);
+}
+
+// Pointer-and-size worker that performs the element-wise copy. Sharing it
+// across all array sizes keeps different N values from each instantiating
+// their own copy of the loop.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+ for (size_t remaining = size; remaining != 0; --remaining, ++from, ++to) {
+ internal::CopyArray(*from, to);
+ }
+}
+
+// The relation between a NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container. Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers. New members
+// should be added as needed. To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier). It's the client's responsibility to satisfy
+// this requirement. Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+ // STL-style container typedefs.
+ typedef Element value_type;
+ typedef Element* iterator;
+ typedef const Element* const_iterator;
+
+ // Constructs from a native array. References the source; nothing is copied.
+ NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+ InitRef(array, count);
+ }
+
+ // Constructs from a native array. Copies the source into a new[] buffer.
+ NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+ InitCopy(array, count);
+ }
+
+ // Copy constructor. Re-runs rhs's init routine, so rhs's mode is preserved.
+ NativeArray(const NativeArray& rhs) {
+ (this->*rhs.clone_)(rhs.array_, rhs.size_);
+ }
+
+ ~NativeArray() {
+ if (clone_ != &NativeArray::InitRef) delete[] array_; // Copies own array_.
+ }
+
+ // STL-style container methods.
+ size_t size() const { return size_; }
+ const_iterator begin() const { return array_; }
+ const_iterator end() const { return array_ + size_; }
+ bool operator==(const NativeArray& rhs) const {
+ return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
+ }
+
+ private:
+ static_assert(!std::is_const<Element>::value, "Type must not be const");
+ static_assert(!std::is_reference<Element>::value,
+ "Type must not be a reference");
+
+ // Initializes this object with a heap-allocated copy of the input.
+ void InitCopy(const Element* array, size_t a_size) {
+ Element* const copy = new Element[a_size];
+ CopyArray(array, a_size, copy);
+ array_ = copy;
+ size_ = a_size;
+ clone_ = &NativeArray::InitCopy;
+ }
+
+ // Initializes this object with a reference to the input; nothing is copied.
+ void InitRef(const Element* array, size_t a_size) {
+ array_ = array;
+ size_ = a_size;
+ clone_ = &NativeArray::InitRef;
+ }
+
+ const Element* array_;
+ size_t size_;
+ void (NativeArray::*clone_)(const Element*, size_t); // InitCopy or InitRef.
+};
+
+// Backport of std::index_sequence.
+template <size_t... Is>
+struct IndexSequence {
+ using type = IndexSequence;
+};
+
+// Doubles the IndexSequence, and appends one more index if plus_one is true.
+template <bool plus_one, typename T, size_t sizeofT>
+struct DoubleSequence;
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<true, IndexSequence<I...>, sizeofT> {
+ using type = IndexSequence<I..., (sizeofT + I)..., 2 * sizeofT>;
+};
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
+ using type = IndexSequence<I..., (sizeofT + I)...>;
+};
+
+// Backport of std::make_index_sequence.
+// It uses O(ln(N)) instantiation depth.
+template <size_t N>
+struct MakeIndexSequenceImpl
+ : DoubleSequence<N % 2 == 1, typename MakeIndexSequenceImpl<N / 2>::type,
+ N / 2>::type {};
+
+template <>
+struct MakeIndexSequenceImpl<0> : IndexSequence<> {};
+
+template <size_t N>
+using MakeIndexSequence = typename MakeIndexSequenceImpl<N>::type;
+
+template <typename... T>
+using IndexSequenceFor = typename MakeIndexSequence<sizeof...(T)>::type;
+
+template <size_t>
+struct Ignore {
+ Ignore(...); // NOLINT
+};
+
+template <typename>
+struct ElemFromListImpl;
+template <size_t... I>
+struct ElemFromListImpl<IndexSequence<I...>> {
+ // We make Ignore a template to solve a problem with MSVC.
+ // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but
+ // MSVC doesn't understand how to deal with that pack expansion.
+ // Use `0 * I` to have a single instantiation of Ignore.
+ template <typename R>
+ static R Apply(Ignore<0 * I>..., R (*)(), ...);
+};
+
+template <size_t N, typename... T>
+struct ElemFromList {
+ using type =
+ decltype(ElemFromListImpl<typename MakeIndexSequence<N>::type>::Apply(
+ static_cast<T (*)()>(nullptr)...));
+};
+
+struct FlatTupleConstructTag {};
+
+template <typename... T>
+class FlatTuple;
+
+template <typename Derived, size_t I>
+struct FlatTupleElemBase;
+
+template <typename... T, size_t I>
+struct FlatTupleElemBase<FlatTuple<T...>, I> {
+ using value_type = typename ElemFromList<I, T...>::type;
+ FlatTupleElemBase() = default;
+ template <typename Arg>
+ explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t)
+ : value(std::forward<Arg>(t)) {}
+ value_type value;
+};
+
+template <typename Derived, typename Idx>
+struct FlatTupleBase;
+
+template <size_t... Idx, typename... T>
+struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
+ : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
+ using Indices = IndexSequence<Idx...>;
+ FlatTupleBase() = default;
+ template <typename... Args>
+ explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args)
+ : FlatTupleElemBase<FlatTuple<T...>, Idx>(FlatTupleConstructTag{},
+ std::forward<Args>(args))... {}
+
+ template <size_t I>
+ const typename ElemFromList<I, T...>::type& Get() const {
+ return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+ }
+
+ template <size_t I>
+ typename ElemFromList<I, T...>::type& Get() {
+ return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+ }
+
+ template <typename F>
+ auto Apply(F&& f) -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+ return std::forward<F>(f)(Get<Idx>()...);
+ }
+
+ template <typename F>
+ auto Apply(F&& f) const -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+ return std::forward<F>(f)(Get<Idx>()...);
+ }
+};
+
+// Analog to std::tuple but with different tradeoffs.
+// This class minimizes the template instantiation depth, thus allowing more
+// elements than std::tuple would. std::tuple has been seen to require an
+// instantiation depth of more than 10x the number of elements in some
+// implementations.
+// FlatTuple and ElemFromList are not recursive and have a fixed depth
+// regardless of T...
+// MakeIndexSequence, on the other hand, is recursive, but with an
+// instantiation depth of O(ln(N)).
+template <typename... T>
+class FlatTuple
+ : private FlatTupleBase<FlatTuple<T...>,
+ typename MakeIndexSequence<sizeof...(T)>::type> {
+ using Indices = typename FlatTupleBase<
+ FlatTuple<T...>, typename MakeIndexSequence<sizeof...(T)>::type>::Indices;
+
+ public:
+ FlatTuple() = default;
+ template <typename... Args>
+ explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args)
+ : FlatTuple::FlatTupleBase(tag, std::forward<Args>(args)...) {}
+
+ using FlatTuple::FlatTupleBase::Apply;
+ using FlatTuple::FlatTupleBase::Get;
+};
+
+// Utility functions to be called with static_assert to induce deprecation
+// warnings.
+GTEST_INTERNAL_DEPRECATED(
+ "INSTANTIATE_TEST_CASE_P is deprecated, please use "
+ "INSTANTIATE_TEST_SUITE_P")
+constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+ "TYPED_TEST_CASE_P is deprecated, please use "
+ "TYPED_TEST_SUITE_P")
+constexpr bool TypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+ "TYPED_TEST_CASE is deprecated, please use "
+ "TYPED_TEST_SUITE")
+constexpr bool TypedTestCaseIsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+ "REGISTER_TYPED_TEST_CASE_P is deprecated, please use "
+ "REGISTER_TYPED_TEST_SUITE_P")
+constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+ "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use "
+ "INSTANTIATE_TYPED_TEST_SUITE_P")
+constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
+
+} // namespace internal
+} // namespace testing
+
+namespace std {
+// Some standard library implementations use `struct tuple_size` and some use
+// `class tuple_size`. Clang warns about the mismatch.
+// https://reviews.llvm.org/D55466
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <typename... Ts>
+struct tuple_size<testing::internal::FlatTuple<Ts...>>
+ : std::integral_constant<size_t, sizeof...(Ts)> {};
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+} // namespace std
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+ ::testing::internal::AssertHelper(result_type, file, line, message) = \
+ ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+ GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+ return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+ GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+ GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+#define GTEST_SKIP_(message) \
+ return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+// NOTE: It is important to keep the "else" in this expansion to prevent a
+// top-level "else" from attaching to our "if".
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } else /* NOLINT */ \
+ static_assert(true, "") // User must have a semicolon after expansion.
+
+#if GTEST_HAS_EXCEPTIONS
+
+namespace testing {
+namespace internal {
+
+class NeverThrown {
+ public:
+ const char* what() const noexcept {
+ return "this exception should never be thrown";
+ }
+};
+
+} // namespace internal
+} // namespace testing
+
+#if GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e))
+
+#else // GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) \
+ std::string { "an std::exception-derived error" }
+
+#endif // GTEST_HAS_RTTI
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \
+ catch (typename std::conditional< \
+ std::is_same<typename std::remove_cv<typename std::remove_reference< \
+ expected_exception>::type>::type, \
+ std::exception>::value, \
+ const ::testing::internal::NeverThrown&, const std::exception&>::type \
+ e) { \
+ gtest_msg.value = "Expected: " #statement \
+ " throws an exception of type " #expected_exception \
+ ".\n Actual: it throws "; \
+ gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \
+ gtest_msg.value += " with description \""; \
+ gtest_msg.value += e.what(); \
+ gtest_msg.value += "\"."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+ }
+
+#else // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::TrueWithString gtest_msg{}) { \
+ bool gtest_caught_expected = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (expected_exception const&) { \
+ gtest_caught_expected = true; \
+ } \
+ GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \
+ catch (...) { \
+ gtest_msg.value = "Expected: " #statement \
+ " throws an exception of type " #expected_exception \
+ ".\n Actual: it throws a different type."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+ } \
+ if (!gtest_caught_expected) { \
+ gtest_msg.value = "Expected: " #statement \
+ " throws an exception of type " #expected_exception \
+ ".\n Actual: it throws nothing."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+ } \
+ } else /*NOLINT*/ \
+ GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \
+ : fail(gtest_msg.value.c_str())
+
+#if GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+ catch (std::exception const& e) { \
+ gtest_msg.value = "it throws "; \
+ gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \
+ gtest_msg.value += " with description \""; \
+ gtest_msg.value += e.what(); \
+ gtest_msg.value += "\"."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ }
+
+#else // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::TrueWithString gtest_msg{}) { \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+ catch (...) { \
+ gtest_msg.value = "it throws."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \
+ : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+ " Actual: " + \
+ gtest_msg.value) \
+ .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ bool gtest_caught_any = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (...) { \
+ gtest_caught_any = true; \
+ } \
+ if (!gtest_caught_any) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \
+ : fail("Expected: " #statement \
+ " throws an exception.\n" \
+ " Actual: it doesn't.")
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar_ = \
+ ::testing::AssertionResult(expression)) \
+ ; \
+ else \
+ fail(::testing::internal::GetBoolAssertionFailureMessage( \
+ gtest_ar_, text, #actual, #expected) \
+ .c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \
+ : fail("Expected: " #statement \
+ " doesn't generate new fatal " \
+ "failures in the current thread.\n" \
+ " Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ test_suite_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
+ "test_suite_name must not be empty"); \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
+ "test_name must not be empty"); \
+ class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ : public parent_class { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
+ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &&) noexcept = delete; /* NOLINT */ \
+ \
+ private: \
+ void TestBody() override; \
+ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
+ }; \
+ \
+ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name)::test_info_ = \
+ ::testing::internal::MakeAndRegisterTestInfo( \
+ #test_suite_name, #test_name, nullptr, nullptr, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
+ new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
+ test_suite_name, test_name)>); \
+ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
new file mode 100644
index 0000000000..e7af2f904a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
@@ -0,0 +1,956 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Type and function utilities for implementing parameterized tests.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <ctype.h>
+
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest-test-part.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+// Input to a parameterized test name generator, describing a test parameter.
+// Consists of the parameter value and the integer parameter index.
+template <class ParamType>
+struct TestParamInfo {
+ TestParamInfo(const ParamType& a_param, size_t an_index)
+ : param(a_param), index(an_index) {}
+ ParamType param;
+ size_t index;
+};
+
+// A builtin parameterized test name generator which returns the result of
+// testing::PrintToString.
+struct PrintToStringParamName {
+ template <class ParamType>
+ std::string operator()(const TestParamInfo<ParamType>& info) const {
+ return PrintToString(info.param);
+ }
+};
+
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// Utility Functions
+
+// Outputs a message explaining invalid registration of a different
+// fixture class for the same test suite. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
+ CodeLocation code_location);
+
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+ virtual ~ParamIteratorInterface() {}
+ // A pointer to the base generator instance.
+ // Used only for the purposes of iterator comparison
+ // to make sure that two iterators belong to the same generator.
+ virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+ // Advances iterator to point to the next element
+ // provided by the generator. The caller is responsible
+ // for not calling Advance() on an iterator equal to
+ // BaseGenerator()->End().
+ virtual void Advance() = 0;
+ // Clones the iterator object. Used for implementing copy semantics
+ // of ParamIterator<T>.
+ virtual ParamIteratorInterface* Clone() const = 0;
+ // Dereferences the current iterator and provides (read-only) access
+ // to the pointed value. It is the caller's responsibility not to call
+ // Current() on an iterator equal to BaseGenerator()->End().
+ // Used for implementing ParamIterator<T>::operator*().
+ virtual const T* Current() const = 0;
+ // Determines whether the given iterator and other point to the same
+ // element in the sequence generated by the generator.
+ // Used for implementing ParamIterator<T>::operator==().
+ virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+ typedef T value_type;
+ typedef const T& reference;
+ typedef ptrdiff_t difference_type;
+
+ // ParamIterator assumes ownership of the impl_ pointer; copies Clone() it.
+ ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+ ParamIterator& operator=(const ParamIterator& other) {
+ if (this != &other) impl_.reset(other.impl_->Clone());
+ return *this;
+ }
+
+ const T& operator*() const { return *impl_->Current(); }
+ const T* operator->() const { return impl_->Current(); }
+ // Prefix version of operator++.
+ ParamIterator& operator++() {
+ impl_->Advance();
+ return *this;
+ }
+ // Postfix version of operator++. The clone is freed if Advance() throws.
+ ParamIterator operator++(int /*unused*/) {
+ std::unique_ptr<ParamIteratorInterface<T>> clone(impl_->Clone());
+ impl_->Advance();
+ return ParamIterator(clone.release());
+ }
+ bool operator==(const ParamIterator& other) const {
+ return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+ }
+ bool operator!=(const ParamIterator& other) const {
+ return !(*this == other);
+ }
+
+ private:
+ friend class ParamGenerator<T>;
+ explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+ std::unique_ptr<ParamIteratorInterface<T>> impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+ typedef T ParamType;
+
+ virtual ~ParamGeneratorInterface() {}
+
+ // Generator interface definition
+ virtual ParamIteratorInterface<T>* Begin() const = 0;
+ virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Copyable handle around a ParamGeneratorInterface<T> that offers generator
+// syntax compatible with the STL Container concept (begin()/end()).
+// Copying follows copy initialization semantics: the wrapped
+// ParamGeneratorInterface<T> instance is immutable, so all copies of the
+// original object share it.
+template <typename T>
+class ParamGenerator {
+ public:
+  using iterator = ParamIterator<T>;
+
+  // |impl| is stored in a std::shared_ptr and shared with every copy.
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;
+    return *this;
+  }
+
+  // Each call asks the shared generator for a new iterator implementation
+  // and wraps it in a ParamIterator.
+  iterator begin() const {
+    ParamIteratorInterface<T>* const first = impl_->Begin();
+    return iterator(first);
+  }
+  iterator end() const {
+    ParamIteratorInterface<T>* const last = impl_->End();
+    return iterator(last);
+  }
+
+ private:
+  std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
+};
+
+// Produces the sequence begin, begin + step, begin + 2 * step, ... for as
+// long as the value compares less than end. Usable with user-defined types
+// that implement operator+() and operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin),
+        end_(end),
+        step_(step),
+        end_index_(CalculateEndIndex(begin, end, step)) {}
+  ~RangeGenerator() override {}
+
+  ParamIteratorInterface<T>* Begin() const override {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  ParamIteratorInterface<T>* End() const override {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  // Iterator that tracks both the current value and its 0-based position in
+  // the sequence; positions, not values, are what Equals() compares.
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      value_ = static_cast<T>(value_ + step_);
+      ++index_;
+    }
+    ParamIteratorInterface<T>* Clone() const override {
+      return new Iterator(*this);
+    }
+    const T* Current() const override { return &value_; }
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
+      // Iterators sharing a base generator are necessarily of the same
+      // concrete type, which makes the downcast below valid.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* const typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      return index_ == typed_other->index_;
+    }
+
+   private:
+    // Copying is reserved for Clone().
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          value_(other.value_),
+          index_(other.index_),
+          step_(other.step_) {}
+
+    // Declared but never defined: assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  // Counts how many values the sequence contains by stepping from |begin|
+  // until the value is no longer less than |end|.
+  static int CalculateEndIndex(const T& begin, const T& end,
+                               const IncrementT& step) {
+    int count = 0;
+    T cursor = begin;
+    while (cursor < end) {
+      ++count;
+      cursor = static_cast<T>(cursor + step);
+    }
+    return count;
+  }
+
+  // Declared but never defined: assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // Index of the end() iterator. Every element of the generated sequence has
+  // a 0-based index, which is what iterator comparison relies on.
+  const int end_index_;
+};  // class RangeGenerator
+
+// Generator over the elements of an STL-style iterator range; used in the
+// ValuesIn() function. The elements are copied out of the source range
+// because the source may live on the stack while the generator is likely to
+// persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  ~ValuesInIteratorRangeGenerator() override {}
+
+  ParamIteratorInterface<T>* Begin() const override {
+    return new Iterator(this, container_.begin());
+  }
+  ParamIteratorInterface<T>* End() const override {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  using ContainerType = ::std::vector<T>;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      ++iterator_;
+      value_.reset();  // Drop the cached copy of the previous element.
+    }
+    ParamIteratorInterface<T>* Clone() const override {
+      return new Iterator(*this);
+    }
+    // Returns a pointer to a cached copy of *iterator_ rather than
+    // "&*iterator_", because *iterator_ can return a temporary object (and
+    // one of a type other than T).
+    // The cache is filled here and not in Advance(): Advance() may move
+    // iterator_ past the end of the range without being able to detect it.
+    // Client code is responsible for not calling Current() on an
+    // out-of-range iterator.
+    const T* Current() const override {
+      if (!value_) value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
+      // Iterators sharing a base generator are necessarily of the same
+      // concrete type, which makes the downcast below valid.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* const typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      return iterator_ == typed_other->iterator_;
+    }
+
+   private:
+    // Copies the position only; the cached value is not carried over.
+    Iterator(const Iterator& other)
+        // Calling the base constructor explicitly suppresses a false warning
+        // that gcc emits under -Wextra.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // Cached copy of *iterator_, kept so the wrapping iterator's
+    // operator->() has an object to point at. It is mutable because
+    // Current() is const, and held in a std::unique_ptr so its lifetime is
+    // bound to the lifespan of this iterator.
+    mutable std::unique_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // Declared but never defined: assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default name generator for parameterized tests: the name is simply the
+// integer index of the test parameter, rendered as a string.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
+  Message index_text;
+  index_text << info.index;
+  return index_text.GetString();
+}
+
+// Compile-time guard against empty argument lists.
+// The zero-argument overload fails to compile when instantiated: its
+// static_assert condition depends on T and is never true, so the error is
+// only reported if this overload is actually selected.
+template <typename T = int>
+void TestNotEmpty() {
+ static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
+}
+// The one-argument overload accepts any value and does nothing.
+template <typename T = int>
+void TestNotEmpty(const T&) {}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Test factory bound to a single parameter value: it keeps its own copy of
+// the value and later creates tests parameterized with it.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  using ParamType = typename TestClass::ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
+  Test* CreateTest() override {
+    // Hand the stored parameter to the test class first, then construct the
+    // test object.
+    TestClass::SetParam(&parameter_);
+    Test* const test = new TestClass();
+    return test;
+  }
+
+ private:
+  const ParamType parameter_;
+
+  ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+  ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+ virtual ~TestMetaFactoryBase() {}
+
+ // Returns a test factory for the given parameter value. The result is a
+ // raw pointer; ParameterizedTestSuiteInfo::RegisterTests() passes it
+ // straight to MakeAndRegisterTestInfo().
+ virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates the test factories that are passed into the
+// MakeAndRegisterTestInfo function. MakeAndRegisterTestInfo receives
+// ownership of the factory pointer, so one factory object cannot be passed
+// to it twice; ParameterizedTestSuiteInfo, however, calls it once per
+// Test/Parameter value combination. A meta factory that creates a new
+// factory on every request bridges that gap.
+template <class TestSuite>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
+ public:
+  using ParamType = typename TestSuite::ParamType;
+
+  TestMetaFactory() {}
+
+  // Allocates a new factory bound to |parameter|.
+  TestFactoryBase* CreateTestFactory(ParamType parameter) override {
+    TestFactoryBase* const factory =
+        new ParameterizedTestFactory<TestSuite>(parameter);
+    return factory;
+  }
+
+ private:
+  TestMetaFactory(const TestMetaFactory&) = delete;
+  TestMetaFactory& operator=(const TestMetaFactory&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfoBase is a generic interface
+// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizedTestSuiteRegistry class holds
+// a collection of pointers to the ParameterizedTestSuiteInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestSuiteInfoBase {
+ public:
+ virtual ~ParameterizedTestSuiteInfoBase() {}
+
+ // Base part of test suite name for display purposes.
+ virtual const std::string& GetTestSuiteName() const = 0;
+ // Test suite id to verify identity.
+ virtual TypeId GetTestSuiteTypeId() const = 0;
+ // UnitTest class invokes this method to register tests in this
+ // test suite right before running them in RUN_ALL_TESTS macro.
+ // This method should not be called more than once on any single
+ // instance of a ParameterizedTestSuiteInfoBase derived class.
+ virtual void RegisterTests() = 0;
+
+ protected:
+ // Only derived classes may construct the base.
+ ParameterizedTestSuiteInfoBase() {}
+
+ private:
+ // Copying is disabled.
+ ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+ delete;
+ ParameterizedTestSuiteInfoBase& operator=(
+ const ParameterizedTestSuiteInfoBase&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Reports the name of a test suite as safe to ignore, as a side effect of
+// constructing an object of this type.
+struct GTEST_API_ MarkAsIgnored {
+ explicit MarkAsIgnored(const char* test_suite);
+};
+
+// Declared here, defined elsewhere. ParameterizedTestSuiteInfo::RegisterTests()
+// calls it when no test instance was generated, passing the suite name, the
+// suite's code location and whether any TEST_P pattern was registered.
+// NOTE(review): presumably this registers a placeholder test reporting the
+// missing instantiation - confirm against the definition.
+GTEST_API_ void InsertSyntheticTestCase(const std::string& name,
+ CodeLocation location, bool has_test_p);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test suite and generators
+// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
+// test suite. It registers tests with all values generated by all
+// generators when asked.
+template <class TestSuite>
+class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
+ public:
+ // ParamType and GeneratorCreationFunc are private types but are required
+ // for declarations of public methods AddTestPattern() and
+ // AddTestSuiteInstantiation().
+ using ParamType = typename TestSuite::ParamType;
+ // A function that returns an instance of appropriate generator type.
+ typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+ // A function that maps a parameter (value plus index) to the name suffix
+ // used for the corresponding test.
+ using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType>&);
+
+ explicit ParameterizedTestSuiteInfo(const char* name,
+ CodeLocation code_location)
+ : test_suite_name_(name), code_location_(code_location) {}
+
+ // Test suite base name for display purposes.
+ const std::string& GetTestSuiteName() const override {
+ return test_suite_name_;
+ }
+ // Test suite id to verify identity.
+ TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
+ // TEST_P macro uses AddTestPattern() to record information
+ // about a single test in a TestInfo structure.
+ // test_suite_name is the base name of the test suite (without invocation
+ // prefix). test_base_name is the name of an individual test without
+ // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+ // test suite base name and DoBar is test base name.
+ // The TestInfo stores meta_factory in a std::unique_ptr, so ownership of
+ // meta_factory passes to this object.
+ void AddTestPattern(const char* test_suite_name, const char* test_base_name,
+ TestMetaFactoryBase<ParamType>* meta_factory,
+ CodeLocation code_location) {
+ tests_.push_back(std::shared_ptr<TestInfo>(new TestInfo(
+ test_suite_name, test_base_name, meta_factory, code_location)));
+ }
+ // INSTANTIATE_TEST_SUITE_P macro uses AddTestSuiteInstantiation() to record
+ // information about a generator.
+ int AddTestSuiteInstantiation(const std::string& instantiation_name,
+ GeneratorCreationFunc* func,
+ ParamNameGeneratorFunc* name_func,
+ const char* file, int line) {
+ instantiations_.push_back(
+ InstantiationInfo(instantiation_name, func, name_func, file, line));
+ return 0; // Return value used only to run this method in namespace scope.
+ }
+ // UnitTest class invokes this method to register tests in this test suite
+ // right before running tests in RUN_ALL_TESTS macro.
+ // This method should not be called more than once on any single
+ // instance of a ParameterizedTestSuiteInfoBase derived class.
+ // UnitTest has a guard to prevent from calling this method more than once.
+ //
+ // For every (test pattern, instantiation, generated parameter) combination
+ // one test is registered through MakeAndRegisterTestInfo(). If no
+ // combination produced a test, InsertSyntheticTestCase() is called instead.
+ void RegisterTests() override {
+ bool generated_instantiations = false;
+
+ for (typename TestInfoContainer::iterator test_it = tests_.begin();
+ test_it != tests_.end(); ++test_it) {
+ std::shared_ptr<TestInfo> test_info = *test_it;
+ for (typename InstantiationContainer::iterator gen_it =
+ instantiations_.begin();
+ gen_it != instantiations_.end(); ++gen_it) {
+ const std::string& instantiation_name = gen_it->name;
+ // Invoke the stored creation function to obtain the generator.
+ ParamGenerator<ParamType> generator((*gen_it->generator)());
+ ParamNameGeneratorFunc* name_func = gen_it->name_func;
+ const char* file = gen_it->file;
+ int line = gen_it->line;
+
+ // Full suite name: "<instantiation>/<suite base name>", or just the
+ // base name when the instantiation name is empty.
+ std::string test_suite_name;
+ if (!instantiation_name.empty())
+ test_suite_name = instantiation_name + "/";
+ test_suite_name += test_info->test_suite_base_name;
+
+ // i is the 0-based index of the parameter within this generator;
+ // test_param_names collects the names already used so duplicates can
+ // be detected.
+ size_t i = 0;
+ std::set<std::string> test_param_names;
+ for (typename ParamGenerator<ParamType>::iterator param_it =
+ generator.begin();
+ param_it != generator.end(); ++param_it, ++i) {
+ generated_instantiations = true;
+
+ Message test_name_stream;
+
+ std::string param_name =
+ name_func(TestParamInfo<ParamType>(*param_it, i));
+
+ // Names must be non-empty and consist of alphanumerics and '_' only
+ // (see IsValidParamName()).
+ GTEST_CHECK_(IsValidParamName(param_name))
+ << "Parameterized test name '" << param_name
+ << "' is invalid, in " << file << " line " << line << std::endl;
+
+ // Names must be unique within one instantiation of one test pattern.
+ GTEST_CHECK_(test_param_names.count(param_name) == 0)
+ << "Duplicate parameterized test name '" << param_name << "', in "
+ << file << " line " << line << std::endl;
+
+ test_param_names.insert(param_name);
+
+ // Test name: "<test base name>/<param name>", or just the param name
+ // when the base name is empty.
+ if (!test_info->test_base_name.empty()) {
+ test_name_stream << test_info->test_base_name << "/";
+ }
+ test_name_stream << param_name;
+ MakeAndRegisterTestInfo(
+ test_suite_name.c_str(), test_name_stream.GetString().c_str(),
+ nullptr, // No type parameter.
+ PrintToString(*param_it).c_str(), test_info->code_location,
+ GetTestSuiteTypeId(),
+ SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
+ SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
+ test_info->test_meta_factory->CreateTestFactory(*param_it));
+ } // for param_it
+ } // for gen_it
+ } // for test_it
+
+ if (!generated_instantiations) {
+ // There are no generators, or they all generate nothing ...
+ InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+ !tests_.empty());
+ }
+ } // RegisterTests
+
+ private:
+ // TestInfo structure keeps information about a single test registered
+ // with TEST_P macro.
+ struct TestInfo {
+ TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
+ TestMetaFactoryBase<ParamType>* a_test_meta_factory,
+ CodeLocation a_code_location)
+ : test_suite_base_name(a_test_suite_base_name),
+ test_base_name(a_test_base_name),
+ test_meta_factory(a_test_meta_factory),
+ code_location(a_code_location) {}
+
+ const std::string test_suite_base_name;
+ const std::string test_base_name;
+ // Owned: released when this TestInfo is destroyed.
+ const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
+ const CodeLocation code_location;
+ };
+ using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
+ // Records data received from INSTANTIATE_TEST_SUITE_P macros:
+ // <Instantiation name, Sequence generator creation function,
+ // Name generator function, Source file, Source line>
+ struct InstantiationInfo {
+ InstantiationInfo(const std::string& name_in,
+ GeneratorCreationFunc* generator_in,
+ ParamNameGeneratorFunc* name_func_in, const char* file_in,
+ int line_in)
+ : name(name_in),
+ generator(generator_in),
+ name_func(name_func_in),
+ file(file_in),
+ line(line_in) {}
+
+ std::string name;
+ GeneratorCreationFunc* generator;
+ ParamNameGeneratorFunc* name_func;
+ const char* file;
+ int line;
+ };
+ typedef ::std::vector<InstantiationInfo> InstantiationContainer;
+
+ // Returns true if |name| is non-empty and every character is either
+ // accepted by IsAlNum() or is an underscore.
+ static bool IsValidParamName(const std::string& name) {
+ // Check for empty string
+ if (name.empty()) return false;
+
+ // Check for invalid characters
+ for (std::string::size_type index = 0; index < name.size(); ++index) {
+ if (!IsAlNum(name[index]) && name[index] != '_') return false;
+ }
+
+ return true;
+ }
+
+ const std::string test_suite_name_;
+ CodeLocation code_location_;
+ TestInfoContainer tests_;
+ InstantiationContainer instantiations_;
+
+ ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+ ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+ delete;
+}; // class ParameterizedTestSuiteInfo
+
+// Legacy API is deprecated but still available
+// ParameterizedTestCaseInfo is the old "test case" spelling of
+// ParameterizedTestSuiteInfo; it is only declared when
+// GTEST_REMOVE_LEGACY_TEST_CASEAPI_ is not defined.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+template <class TestCase>
+using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo<TestCase>;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteRegistry keeps the ParameterizedTestSuiteInfoBase
+// objects of all parameterized test suites and looks them up by test suite
+// name. The TEST_P and INSTANTIATE_TEST_SUITE_P macros use it to locate
+// their corresponding ParameterizedTestSuiteInfo descriptors.
+class ParameterizedTestSuiteRegistry {
+ public:
+  ParameterizedTestSuiteRegistry() {}
+  ~ParameterizedTestSuiteRegistry() {
+    // Every stored info object was allocated by this registry.
+    for (ParameterizedTestSuiteInfoBase* info : test_suite_infos_) {
+      delete info;
+    }
+  }
+
+  // Returns the structure holding the tests and instantiations of the test
+  // suite named |test_suite_name|, creating it on first use.
+  template <class TestSuite>
+  ParameterizedTestSuiteInfo<TestSuite>* GetTestSuitePatternHolder(
+      const char* test_suite_name, CodeLocation code_location) {
+    using TypedInfo = ParameterizedTestSuiteInfo<TestSuite>;
+
+    TypedInfo* found = nullptr;
+    for (ParameterizedTestSuiteInfoBase* candidate : test_suite_infos_) {
+      if (candidate->GetTestSuiteName() != test_suite_name) continue;
+
+      if (candidate->GetTestSuiteTypeId() == GetTypeId<TestSuite>()) {
+        // The type ids match, so the stored object is known to be of the
+        // requested type and can be downcast without further checks.
+        found = CheckedDowncastToActualType<TypedInfo>(candidate);
+      } else {
+        // Same name but a different fixture type: complain about the
+        // incorrect usage of Google Test facilities and terminate, since
+        // correct test suite setup and tear-down cannot be guaranteed.
+        ReportInvalidTestSuiteType(test_suite_name, code_location);
+        posix::Abort();
+      }
+      break;  // Only the first entry with a matching name is considered.
+    }
+    if (found == nullptr) {
+      found = new TypedInfo(test_suite_name, code_location);
+      test_suite_infos_.push_back(found);
+    }
+    return found;
+  }
+  // Asks every known test suite to register its tests.
+  void RegisterTests() {
+    for (ParameterizedTestSuiteInfoBase* info : test_suite_infos_) {
+      info->RegisterTests();
+    }
+  }
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name, CodeLocation code_location) {
+    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
+  }
+
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ private:
+  using TestSuiteInfoContainer = ::std::vector<ParameterizedTestSuiteInfoBase*>;
+
+  TestSuiteInfoContainer test_suite_infos_;
+
+  ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+      delete;
+  ParameterizedTestSuiteRegistry& operator=(
+      const ParameterizedTestSuiteRegistry&) = delete;
+};
+
+// Keeps track of which type-parameterized test suites are defined and
+// where, as well as which are instantiated. This allows subsequently
+// identifying suites that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+ // Add a suite definition
+ void RegisterTestSuite(const char* test_suite_name,
+ CodeLocation code_location);
+
+ // Add an instantiation of a suite.
+ void RegisterInstantiation(const char* test_suite_name);
+
+ // For each suite reported as defined but not reported as instantiated,
+ // emit a test that reports that fact (configurably, as an error).
+ void CheckForInstantiations();
+
+ private:
+ // Per-suite record: where the suite was defined and whether an
+ // instantiation has been seen. New records start as not instantiated.
+ struct TypeParameterizedTestSuiteInfo {
+ explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+ : code_location(c), instantiated(false) {}
+
+ CodeLocation code_location;
+ bool instantiated;
+ };
+
+ // NOTE(review): presumably keyed by test suite name - confirm against the
+ // out-of-line member definitions.
+ std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
+};
+
+} // namespace internal
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+// Declared here because ValueArray's conversion operator calls ValuesIn()
+// with a std::vector.
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container);
+
+namespace internal {
+// Used in the Values() function to provide polymorphic capabilities.
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
+// Holds a heterogeneous list of values and converts it, on demand, into a
+// ParamGenerator<T> for whatever element type T the caller asks for.
+template <typename... Ts>
+class ValueArray {
+ public:
+ // Stores the given values in a FlatTuple.
+ explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {}
+
+ // Implicit conversion: every stored value is static_cast to T, collected
+ // into a std::vector<T>, and handed to ValuesIn().
+ template <typename T>
+ operator ParamGenerator<T>() const { // NOLINT
+ return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
+ }
+
+ private:
+ // Expands the index pack I... to read each stored value in order and
+ // convert it to T.
+ template <typename T, size_t... I>
+ std::vector<T> MakeVector(IndexSequence<I...>) const {
+ return std::vector<T>{static_cast<T>(v_.template Get<I>())...};
+ }
+
+ FlatTuple<Ts...> v_;
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Generates std::tuple<T...> values by combining the values of one
+// ParamGenerator per tuple element. The last component varies fastest: when
+// it reaches its end it wraps to its beginning and the component before it
+// is advanced (see IteratorImpl::AdvanceIfEnd()).
+template <typename... T>
+class CartesianProductGenerator
+ : public ParamGeneratorInterface<::std::tuple<T...>> {
+ public:
+ typedef ::std::tuple<T...> ParamType;
+
+ CartesianProductGenerator(const std::tuple<ParamGenerator<T>...>& g)
+ : generators_(g) {}
+ ~CartesianProductGenerator() override {}
+
+ // The bool argument selects whether the new iterator starts with every
+ // component at its begin (false) or at its end (true).
+ ParamIteratorInterface<ParamType>* Begin() const override {
+ return new Iterator(this, generators_, false);
+ }
+ ParamIteratorInterface<ParamType>* End() const override {
+ return new Iterator(this, generators_, true);
+ }
+
+ private:
+ // Primary template is only declared; the specialization below recovers
+ // the index pack I... (0 .. sizeof...(T) - 1) used to address the
+ // components of the tuples.
+ template <class I>
+ class IteratorImpl;
+ template <size_t... I>
+ class IteratorImpl<IndexSequence<I...>>
+ : public ParamIteratorInterface<ParamType> {
+ public:
+ // Captures the begin and end iterator of every component generator and
+ // positions current_ at either all-begin or all-end.
+ IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
+ const std::tuple<ParamGenerator<T>...>& generators,
+ bool is_end)
+ : base_(base),
+ begin_(std::get<I>(generators).begin()...),
+ end_(std::get<I>(generators).end()...),
+ current_(is_end ? end_ : begin_) {
+ ComputeCurrentValue();
+ }
+ ~IteratorImpl() override {}
+
+ const ParamGeneratorInterface<ParamType>* BaseGenerator() const override {
+ return base_;
+ }
+ // Advance should not be called on beyond-of-range iterators
+ // so no component iterators must be beyond end of range, either.
+ void Advance() override {
+ assert(!AtEnd());
+ // Advance the last iterator.
+ ++std::get<sizeof...(T) - 1>(current_);
+ // if that reaches end, propagate that up.
+ AdvanceIfEnd<sizeof...(T) - 1>();
+ ComputeCurrentValue();
+ }
+ ParamIteratorInterface<ParamType>* Clone() const override {
+ return new IteratorImpl(*this);
+ }
+
+ // Returns the tuple cached by ComputeCurrentValue().
+ const ParamType* Current() const override { return current_value_.get(); }
+
+ bool Equals(const ParamIteratorInterface<ParamType>& other) const override {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const IteratorImpl* typed_other =
+ CheckedDowncastToActualType<const IteratorImpl>(&other);
+
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ if (AtEnd() && typed_other->AtEnd()) return true;
+
+ // Otherwise compare component by component. The array initializer is
+ // only a vehicle for expanding the pack over I...; its contents are
+ // unused.
+ bool same = true;
+ bool dummy[] = {
+ (same = same && std::get<I>(current_) ==
+ std::get<I>(typed_other->current_))...};
+ (void)dummy;
+ return same;
+ }
+
+ private:
+ // Carry step: if component ThisI has reached its end, reset it to its
+ // beginning, advance the preceding component and repeat the check there.
+ // Component 0 is never reset, so it stays at its end once it gets there.
+ template <size_t ThisI>
+ void AdvanceIfEnd() {
+ if (std::get<ThisI>(current_) != std::get<ThisI>(end_)) return;
+
+ bool last = ThisI == 0;
+ if (last) {
+ // We are done. Nothing else to propagate.
+ return;
+ }
+
+ // The check above is a run-time one, so the code below is still
+ // compiled for ThisI == 0; the subtraction is written so that NextI
+ // stays 0 in that case instead of wrapping around.
+ constexpr size_t NextI = ThisI - (ThisI != 0);
+ std::get<ThisI>(current_) = std::get<ThisI>(begin_);
+ ++std::get<NextI>(current_);
+ AdvanceIfEnd<NextI>();
+ }
+
+ // Rebuilds the cached tuple from the component iterators. Nothing is
+ // dereferenced (and the cache is left untouched) when AtEnd() is true.
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = std::make_shared<ParamType>(*std::get<I>(current_)...);
+ }
+ // True if at least one component iterator equals its end iterator.
+ bool AtEnd() const {
+ bool at_end = false;
+ bool dummy[] = {
+ (at_end = at_end || std::get<I>(current_) == std::get<I>(end_))...};
+ (void)dummy;
+ return at_end;
+ }
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ std::tuple<typename ParamGenerator<T>::iterator...> begin_;
+ std::tuple<typename ParamGenerator<T>::iterator...> end_;
+ std::tuple<typename ParamGenerator<T>::iterator...> current_;
+ // Held in a std::shared_ptr, so copies made by Clone() share the tuple
+ // until one of them computes a new value.
+ std::shared_ptr<ParamType> current_value_;
+ };
+
+ using Iterator = IteratorImpl<typename MakeIndexSequence<sizeof...(T)>::type>;
+
+ std::tuple<ParamGenerator<T>...> generators_;
+};
+
+// Stores the component generators of a Cartesian product until the target
+// tuple type is known, then converts into the matching ParamGenerator.
+template <class... Gen>
+class CartesianProductHolder {
+ public:
+  CartesianProductHolder(const Gen&... g) : generators_(g...) {}
+  // Creates a CartesianProductGenerator<T...> from the stored generators and
+  // wraps it in a ParamGenerator producing std::tuple<T...> values.
+  template <typename... T>
+  operator ParamGenerator<::std::tuple<T...>>() const {
+    using ResultGenerator = ParamGenerator<::std::tuple<T...>>;
+    return ResultGenerator(new CartesianProductGenerator<T...>(generators_));
+  }
+
+ private:
+  std::tuple<Gen...> generators_;
+};
+
+} // namespace internal
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
new file mode 100644
index 0000000000..f025db76ad
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
@@ -0,0 +1,116 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the GTEST_OS_* macro.
+// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+
+// Determines the platform on which Google Test is compiled.
+// The tests below form one #ifdef/#elif chain, so only the first matching
+// top-level branch takes effect and the order of the tests matters. If no
+// branch matches, no GTEST_OS_* macro is defined here.
+#ifdef __CYGWIN__
+#define GTEST_OS_CYGWIN 1
+// MinGW is tested before the generic _WIN32 branch; it defines both the
+// MinGW-specific macro and GTEST_OS_WINDOWS.
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
+// Other Windows toolchains: GTEST_OS_WINDOWS plus one flavor macro chosen by
+// the nested checks below.
+#elif defined _WIN32
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+// WINAPI_FAMILY_PARTITION() comes from <winapifamily.h>; the first partition
+// that matches decides the flavor.
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+// The TV title partition sets the PHONE macro in addition to its own.
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+// Neither _WIN32_WCE nor WINAPI_FAMILY is defined: treat as desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif // _WIN32_WCE
+#elif defined __OS2__
+#define GTEST_OS_OS2 1
+#elif defined __APPLE__
+#define GTEST_OS_MAC 1
+// GTEST_OS_IOS is defined in addition to GTEST_OS_MAC when
+// <TargetConditionals.h> reports TARGET_OS_IPHONE.
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
+#elif defined __DragonFly__
+#define GTEST_OS_DRAGONFLY 1
+#elif defined __FreeBSD__
+#define GTEST_OS_FREEBSD 1
+#elif defined __Fuchsia__
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
+#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
+#define GTEST_OS_GNU_KFREEBSD 1
+#elif defined __linux__
+#define GTEST_OS_LINUX 1
+// Android is reported as Linux plus GTEST_OS_LINUX_ANDROID.
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
+#elif defined __MVS__
+#define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+#define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+#define GTEST_OS_AIX 1
+#elif defined(__hpux)
+#define GTEST_OS_HPUX 1
+#elif defined __native_client__
+#define GTEST_OS_NACL 1
+#elif defined __NetBSD__
+#define GTEST_OS_NETBSD 1
+#elif defined __OpenBSD__
+#define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+#define GTEST_OS_QNX 1
+#elif defined(__HAIKU__)
+#define GTEST_OS_HAIKU 1
+#elif defined ESP8266
+#define GTEST_OS_ESP8266 1
+#elif defined ESP32
+#define GTEST_OS_ESP32 1
+#elif defined(__XTENSA__)
+#define GTEST_OS_XTENSA 1
+#endif // __CYGWIN__
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
new file mode 100644
index 0000000000..0003d27658
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
@@ -0,0 +1,2413 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Low-level types and utilities for porting Google Test to various
+// platforms. All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice. Code
+// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't
+// end with _ are part of Google Test's public API and can be used by
+// code outside Google Test.
+//
+// This file is fundamental to Google Test. All other Google Test source
+// files are expected to #include this. Therefore, it cannot #include
+// any other Google Test header.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// Environment-describing macros
+// -----------------------------
+//
+// Google Test can be used in many different environments. Macros in
+// this section tell Google Test what kind of environment it is being
+// used in, such that Google Test can provide environment-specific
+// features and implementations.
+//
+// Google Test tries to automatically detect the properties of its
+// environment, so users usually don't need to worry about these
+// macros. However, the automatic detection is not perfect.
+// Sometimes it's necessary for a user to define some of the following
+// macros in the build script to override Google Test's decisions.
+//
+// If the user doesn't define a macro in the list, Google Test will
+// provide a default definition. After this header is #included, all
+// macros in this list will be defined to either 1 or 0.
+//
+// Notes to maintainers:
+// - Each macro here is a user-tweakable knob; do not grow the list
+// lightly.
+// - Use #if to key off these macros. Don't use #ifdef or "#if
+// defined(...)", which will not work as these macros are ALWAYS
+// defined.
+//
+// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2)
+// is/isn't available.
+// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions
+// are enabled.
+// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular
+// expressions are/aren't available.
+// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that <pthread.h>
+// is/isn't available.
+// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't
+// enabled.
+// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that
+// std::wstring does/doesn't work (Google Test can
+// be used where std::wstring is unavailable).
+// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the
+// compiler supports Microsoft's "Structured
+// Exception Handling".
+// GTEST_HAS_STREAM_REDIRECTION
+// - Define it to 1/0 to indicate whether the
+// platform supports I/O stream redirection using
+// dup() and dup2().
+// GTEST_LINKED_AS_SHARED_LIBRARY
+// - Define to 1 when compiling tests that use
+// Google Test as a shared library (known as
+// DLL on Windows).
+// GTEST_CREATE_SHARED_LIBRARY
+// - Define to 1 when compiling Google Test itself
+// as a shared library.
+// GTEST_DEFAULT_DEATH_TEST_STYLE
+// - The default value of --gtest_death_test_style.
+// The legacy default has been "fast" in the open
+// source version since 2008. The recommended value
+// is "threadsafe", and can be set in
+// custom/gtest-port.h.
+
+// Platform-indicating macros
+// --------------------------
+//
+// Macros indicating the platform on which Google Test is being used
+// (a macro is defined to 1 if compiled on the given platform;
+// otherwise UNDEFINED -- it's never defined to 0.). Google Test
+// defines these macros automatically. Code outside Google Test MUST
+// NOT define them.
+//
+// GTEST_OS_AIX - IBM AIX
+// GTEST_OS_CYGWIN - Cygwin
+// GTEST_OS_DRAGONFLY - DragonFlyBSD
+// GTEST_OS_FREEBSD - FreeBSD
+// GTEST_OS_FUCHSIA - Fuchsia
+// GTEST_OS_GNU_HURD - GNU/Hurd
+// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
+// GTEST_OS_HAIKU - Haiku
+// GTEST_OS_HPUX - HP-UX
+// GTEST_OS_LINUX - Linux
+// GTEST_OS_LINUX_ANDROID - Google Android
+// GTEST_OS_MAC - Mac OS X
+// GTEST_OS_IOS - iOS
+// GTEST_OS_NACL - Google Native Client (NaCl)
+// GTEST_OS_NETBSD - NetBSD
+// GTEST_OS_OPENBSD - OpenBSD
+// GTEST_OS_OS2 - OS/2
+// GTEST_OS_QNX - QNX
+// GTEST_OS_SOLARIS - Sun Solaris
+// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile)
+// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop
+// GTEST_OS_WINDOWS_MINGW - MinGW
+// GTEST_OS_WINDOWS_MOBILE - Windows Mobile
+// GTEST_OS_WINDOWS_PHONE - Windows Phone
+// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT
+// GTEST_OS_ZOS - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support. Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable. If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// It is possible that none of the GTEST_OS_* macros are defined.
+
+// Feature-indicating macros
+// -------------------------
+//
+// Macros indicating which Google Test features are available (a macro
+// is defined to 1 if the corresponding feature is supported;
+// otherwise UNDEFINED -- it's never defined to 0.). Google Test
+// defines these macros automatically. Code outside Google Test MUST
+// NOT define them.
+//
+// These macros are public so that portable tests can be written.
+// Such tests typically surround code using a feature with an #if
+// which controls that code. For example:
+//
+// #if GTEST_HAS_DEATH_TEST
+// EXPECT_DEATH(DoSomethingDeadly());
+// #endif
+//
+// GTEST_HAS_DEATH_TEST - death tests
+// GTEST_HAS_TYPED_TEST - typed tests
+// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+// GTEST_IS_THREADSAFE - Google Test is thread-safe.
+// GTEST_USES_RE2 - the RE2 regular expression library is used
+// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
+// GTEST_HAS_POSIX_RE (see above) which users can
+// define themselves.
+// GTEST_USES_SIMPLE_RE - our own simple regex is used;
+// the above RE\b(s) are mutually exclusive.
+
+// Misc public macros
+// ------------------
+//
+// GTEST_FLAG(flag_name) - references the variable corresponding to
+// the given Google Test flag.
+
+// Internal utilities
+// ------------------
+//
+// The following macros and utilities are for Google Test's INTERNAL
+// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY.
+//
+// Macros for basic C++ coding:
+// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
+// variable don't have to be used.
+// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
+// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
+// suppressed (constant conditional).
+// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127
+// is suppressed.
+// GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter<std::any> or
+// UniversalPrinter<absl::any> specializations.
+// GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter<std::optional>
+// or
+// UniversalPrinter<absl::optional>
+// specializations.
+// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
+// Matcher<absl::string_view>
+// specializations.
+// GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter<std::variant> or
+// UniversalPrinter<absl::variant>
+// specializations.
+//
+// Synchronization:
+// Mutex, MutexLock, ThreadLocal, GetThreadCount()
+// - synchronization primitives.
+//
+// Regular expressions:
+// RE - a simple regular expression class using
+// 1) the RE2 syntax on all platforms when built with RE2
+// and Abseil as dependencies
+// 2) the POSIX Extended Regular Expression syntax on
+// UNIX-like platforms,
+// 3) A reduced regular expression syntax on other platforms,
+// including Windows.
+// Logging:
+// GTEST_LOG_() - logs messages at the specified severity level.
+// LogToStderr() - directs all log messages to stderr.
+// FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+// CaptureStdout() - starts capturing stdout.
+// GetCapturedStdout() - stops capturing stdout and returns the captured
+// string.
+// CaptureStderr() - starts capturing stderr.
+// GetCapturedStderr() - stops capturing stderr and returns the captured
+// string.
+//
+// Integer types:
+// TypeWithSize - maps an integer to an int type.
+// TimeInMillis - integers of known sizes.
+// BiggestInt - the biggest signed integer type.
+//
+// Command-line utilities:
+// GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+// GetEnv() - gets the value of an environment variable.
+// BoolFromGTestEnv() - parses a bool environment variable.
+// Int32FromGTestEnv() - parses an int32_t environment variable.
+// StringFromGTestEnv() - parses a string environment variable.
+//
+// Deprecation warnings:
+// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as
+// deprecated; calling a marked function
+// should generate a compiler warning
+
+#include <ctype.h> // for isspace, etc
+#include <stddef.h> // for ptrdiff_t
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cerrno>
+// #include <condition_variable> // Guarded by GTEST_IS_THREADSAFE below
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <locale>
+#include <memory>
+#include <string>
+// #include <mutex> // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#ifndef _WIN32_WCE
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif // !_WIN32_WCE
+
+#if defined __APPLE__
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
+#endif
+
+#include "gtest/internal/custom/gtest-port.h"
+#include "gtest/internal/gtest-port-arch.h"
+
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
+
+#if !defined(GTEST_DEV_EMAIL_)
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#endif // !defined(GTEST_DEV_EMAIL_)
+
+#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+#define GTEST_GCC_VER_ \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif // __GNUC__
+
+// Macros for disabling Microsoft Visual C++ warnings.
+//
+// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
+// /* code that triggers warnings C4800 and C4385 */
+// GTEST_DISABLE_MSC_WARNINGS_POP_()
+#if defined(_MSC_VER)
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+ __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
+#else
+// Not all compilers are MSVC
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+// Clang on Windows does not understand MSVC's pragma warning.
+// We need clang-specific way to disable function deprecation warning.
+#ifdef __clang__
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ _Pragma("clang diagnostic push") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
+#else
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
+// In order to avoid having to include <windows.h>, use forward declaration
+#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
+// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
+// separate (equivalent) structs, instead of using typedef
+typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#else
+// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+// This assumption is verified by
+// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
+typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#endif
+#elif GTEST_OS_XTENSA
+#include <unistd.h>
+// Xtensa toolchains define strcasecmp in the string.h header instead of
+// strings.h. string.h is already included.
+#else
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+#include <strings.h>
+#include <unistd.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#include <android/api-level.h> // NOLINT
+#endif
+
+// Defines this to true if and only if Google Test can use POSIX regular
+// expressions.
+#ifndef GTEST_HAS_POSIX_RE
+#if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
+#endif
+#endif
+
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
+#elif GTEST_HAS_POSIX_RE
+#include <regex.h> // NOLINT
+#define GTEST_USES_POSIX_RE 1
+#else
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
+// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
+// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
+// there can be cleanups for ObjC exceptions which also need cleanups, even if
+// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which
+// checks for C++ exceptions starting at clang r206352, but which checked for
+// cleanups prior to that. To reliably check for C++ exception availability with
+// clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions. However, there is no compile-time way of
+// detecting whether they are enabled or not. Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by the +noeh compiler option if desired.
+#define GTEST_HAS_EXCEPTIONS 1
+#else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#define GTEST_HAS_EXCEPTIONS 0
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif // GTEST_HAS_EXCEPTIONS
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either. Android has
+// no support for it at least as recent as Froyo (2.2).
+#define GTEST_HAS_STD_WSTRING \
+ (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+ GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA))
+
+#endif // GTEST_HAS_STD_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+#ifdef _MSC_VER
+
+#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
+// enabled.
+#elif defined(__GNUC__)
+
+#ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+#elif defined(__clang__)
+
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+#else
+
+// For all other compilers, we assume RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+
+#endif // _MSC_VER
+
+#endif // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+#include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we make reasonable assumptions about
+// which platforms have pthreads support.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+#define GTEST_HAS_PTHREAD \
+ (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \
+ GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
+ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \
+ GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
+#endif // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+#include <pthread.h> // NOLINT
+
+// For timespec and nanosleep, used below.
+#include <time.h> // NOLINT
+#endif
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
+// On Android, clone() became available at different API levels for each 32-bit
+// architecture.
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+ (defined(__mips__) && __ANDROID_API__ >= 12) || \
+ (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif // !GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+ (GTEST_OS_MAC && !GTEST_OS_IOS) || \
+ (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \
+ GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
+ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
+ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \
+ GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
+#endif
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
+ defined(__IBMCPP__) || defined(__HP_aCC)
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+ (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
+ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \
+ GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding. This leads to problems with code like:
+//
+// if (gate)
+// ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ switch (0) \
+ case 0: \
+ default: // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used. This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor. Example:
+//
+// struct Foo {
+// Foo() { ... }
+// } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#elif defined(__clang__)
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
+#endif
+#ifndef GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// Use this annotation before a function that takes a printf format string.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
+#if defined(__MINGW_PRINTF_FORMAT)
+// MinGW has two different printf implementations. Ensure the format macro
+// matches the selected implementation. See
+// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__(( \
+ __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#endif
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro. The macro should be used on function declarations
+// following the argument list:
+//
+// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
+#else
+#define GTEST_MUST_USE_RESULT_
+#endif // __GNUC__ && !COMPILER_ICC
+
+// MS C++ compiler emits warning when a conditional expression is compile time
+// constant. In some contexts this warning is false positive and needs to be
+// suppressed. Use the following two macros in such cases:
+//
+// GTEST_INTENTIONAL_CONST_COND_PUSH_()
+// while (true) {
+// GTEST_INTENTIONAL_CONST_COND_POP_()
+// }
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling. This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#define GTEST_HAS_SEH 1
+#else
+// Assume no SEH.
+#define GTEST_HAS_SEH 0
+#endif
+
+#endif // GTEST_HAS_SEH
+
+#ifndef GTEST_IS_THREADSAFE
+
+#define GTEST_IS_THREADSAFE \
+ (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \
+ (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \
+ GTEST_HAS_PTHREAD)
+
+#endif // GTEST_IS_THREADSAFE
+
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading related headers.
+#include <condition_variable> // NOLINT
+#include <mutex> // NOLINT
+#endif // GTEST_IS_THREADSAFE
+
+// GTEST_API_ qualifies all symbols that must be exported. The definitions below
+// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
+// gtest/internal/custom/gtest-port.h
+#ifndef GTEST_API_
+
+#ifdef _MSC_VER
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
+#elif __GNUC__ >= 4 || defined(__clang__)
+#define GTEST_API_ __attribute__((visibility("default")))
+#endif // _MSC_VER
+
+#endif // GTEST_API_
+
+#ifndef GTEST_API_
+#define GTEST_API_
+#endif // GTEST_API_
+
+#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#endif // GTEST_DEFAULT_DEATH_TEST_STYLE
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+#define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to tail-call-optimize inside the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+ __attribute__((optimize("no-optimize-sibling-calls")))
+#endif
+#ifndef GTEST_NO_TAIL_CALL_ // Other compilers, or clang without the attribute.
+#define GTEST_NO_TAIL_CALL_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if !defined(GTEST_HAS_CXXABI_H_)
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
+#endif
+
+// A function level attribute to disable checking for use of uninitialized
+// memory when built with MemorySanitizer.
+#if defined(__clang__)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __has_feature(memory_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __clang__
+
+// A function level attribute to disable AddressSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+ __attribute__((no_sanitize_address))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __has_feature(address_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __clang__
+
+// A function level attribute to disable HWAddressSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+ __attribute__((no_sanitize("hwaddress")))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif // __clang__
+
+// A function level attribute to disable ThreadSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __has_feature(thread_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __clang__
+
+namespace testing {
+
+class Message;
+
+// Legacy imports for backwards compatibility.
+// New code should use std:: names directly.
+using std::get;
+using std::make_tuple;
+using std::tuple;
+using std::tuple_element;
+using std::tuple_size;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about. It has no
+// definition on purpose. Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// A helper for suppressing warnings on constant condition. It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines RE.
+
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+ // Converting constructors (lint suppressed via NOLINT because they are not
+ // marked explicit). The absl::string_view overload builds regex_ from the
+ // pattern text; the const char* and std::string overloads delegate to it.
+ RE(absl::string_view regex) : regex_(regex) {} // NOLINT
+ RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT
+ // Copies by building a fresh regex_ from other's pattern string.
+ RE(const RE& other) : RE(other.pattern()) {}
+
+ // Returns the pattern string reported by the wrapped RE2 object.
+ const std::string& pattern() const { return regex_.pattern(); }
+
+ // Forward to RE2::FullMatch / RE2::PartialMatch, passing str and the RE2
+ // object wrapped by re.
+ static bool FullMatch(absl::string_view str, const RE& re) {
+ return RE2::FullMatch(str, re.regex_);
+ }
+ static bool PartialMatch(absl::string_view str, const RE& re) {
+ return RE2::PartialMatch(str, re.regex_);
+ }
+
+ private:
+ // The underlying RE2 regular expression.
+ RE2 regex_;
+};
+
+#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
+
+// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+ // A copy constructor is required by the Standard to initialize object
+ // references from r-values.
+ RE(const RE& other) { Init(other.pattern()); }
+
+ // Constructs an RE from a string.
+ RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT
+
+ RE(const char* regex) { Init(regex); } // NOLINT
+ ~RE();
+
+ // Returns the string representation of the regex.
+ const char* pattern() const { return pattern_; }
+
+ // FullMatch(str, re) returns true if and only if regular expression re
+ // matches the entire str.
+ // PartialMatch(str, re) returns true if and only if regular expression re
+ // matches a substring of str (including str itself).
+ static bool FullMatch(const ::std::string& str, const RE& re) {
+ return FullMatch(str.c_str(), re);
+ }
+ static bool PartialMatch(const ::std::string& str, const RE& re) {
+ return PartialMatch(str.c_str(), re);
+ }
+
+ static bool FullMatch(const char* str, const RE& re);
+ static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+ void Init(const char* regex);
+ const char* pattern_;
+ bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+ regex_t full_regex_; // For FullMatch().
+ regex_t partial_regex_; // For PartialMatch().
+
+#else // GTEST_USES_SIMPLE_RE
+
+ const char* full_pattern_; // For FullMatch();
+
+#endif
+};
+
+#endif // ::testing::internal::RE implementation
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+ int line);
+
+// Defines logging utilities:
+// GTEST_LOG_(severity) - logs messages at the specified severity level. The
+// message itself is streamed into the macro.
+// LogToStderr() - directs all log messages to stderr.
+// FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+ // Takes the severity of the entry and the source location (file, line) it
+ // is logged from. Defined out of line.
+ GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+ // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+ ~GTestLog();
+
+ // Returns the stream the message text is written to; this is always
+ // std::cerr, regardless of severity.
+ ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+ // Severity of this entry -- presumably the value given to the constructor,
+ // whose definition is not in this header.
+ const GTestLogSeverity severity_;
+
+ // Not copyable.
+ GTestLog(const GTestLog&) = delete;
+ GTestLog& operator=(const GTestLog&) = delete;
+};
+
+#if !defined(GTEST_LOG_)
+
+#define GTEST_LOG_(severity) \
+ ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+ __FILE__, __LINE__) \
+ .GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(nullptr); }
+
+#endif // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+// Synopsis:
+// GTEST_CHECK_(boolean_condition);
+// or
+// GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+// This checks the condition and if the condition is not satisfied
+// it prints message about the condition violation, including the
+// condition itself, plus additional message streamed into it, if any,
+// and then it aborts the program. It aborts the program irrespective of
+// whether it is built in the debug mode or not.
+#define GTEST_CHECK_(condition) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::IsTrue(condition)) \
+ ; \
+ else \
+ GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success). Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+ if (const int gtest_error = (posix_call)) \
+ GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+// char ==> const char&
+// const char ==> const char&
+// char& ==> char&
+// const char& ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef {
+ typedef const T& type;
+};
+template <typename T>
+struct ConstRef<T&> {
+ typedef T& type;
+};
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+ typename ::testing::internal::ConstRef<T>::type
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*). When you use ImplicitCast_, the compiler checks that
+// the cast is safe. Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+// ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late. It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To>
+inline To ImplicitCast_(To x) {
+ return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed. When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo? It
+// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
+// when you downcast, you should use this macro. In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not). In normal mode, we do the efficient static_cast<>
+// instead. Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+// This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+// if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+// if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To, typename From> // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) { // so we only accept pointers
+ // Ensures that To is a sub-type of From *. This test is here only
+ // for compile-time type checking, and has no overhead in an
+ // optimized build at run-time, as it will be optimized away
+ // completely.
+ GTEST_INTENTIONAL_CONST_COND_PUSH_()
+ if (false) {
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ const To to = nullptr;
+ ::testing::internal::ImplicitCast_<From*>(to);
+ }
+
+#if GTEST_HAS_RTTI
+ // With RTTI available, verify that f is null or really points to a To.
+ // The default GTEST_CHECK_ is an all-mode assert, so this check is not
+ // limited to debug builds.
+ GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif
+ // The conversion itself is always a plain static_cast.
+ return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+ // Requires the dynamic type of *base to be exactly Derived; an object of a
+ // class derived from Derived fails this check.
+ // NOTE(review): base is dereferenced here, so callers presumably must not
+ // pass a null pointer -- confirm against call sites.
+ GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+ // Pick the strongest cast the build configuration offers: a global
+ // ::down_cast when GTEST_HAS_DOWNCAST_ is set, otherwise dynamic_cast when
+ // RTTI is available, otherwise an unchecked static_cast.
+#if GTEST_HAS_DOWNCAST_
+ return ::down_cast<Derived*>(base);
+#elif GTEST_HAS_RTTI
+ return dynamic_cast<Derived*>(base); // NOLINT
+#else
+ return static_cast<Derived*>(base); // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+// CaptureStdout - starts capturing stdout.
+// GetCapturedStdout - stops capturing stdout and returns the captured string.
+// CaptureStderr - starts capturing stderr.
+// GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif // GTEST_HAS_STREAM_REDIRECTION
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE* file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE* file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
+void ClearInjectableArgvs();
+
+#endif // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+
+#if GTEST_OS_WINDOWS
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+ // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
+ // avoid including <windows.h> in this header file. Including <windows.h> is
+ // undesirable because it defines a lot of symbols and macros that tend to
+ // conflict with client code. This assumption is verified by
+ // WindowsTypesTest.HANDLEIsVoidStar.
+ typedef void* Handle;
+ AutoHandle();
+ explicit AutoHandle(Handle handle);
+
+ ~AutoHandle();
+
+ Handle Get() const;
+ void Reset();
+ void Reset(Handle handle);
+
+ private:
+ // Returns true if and only if the handle is a valid handle object that can be
+ // closed.
+ bool IsCloseable() const;
+
+ Handle handle_;
+
+ AutoHandle(const AutoHandle&) = delete;
+ AutoHandle& operator=(const AutoHandle&) = delete;
+};
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified. Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
+class GTEST_API_ Notification {
+ public:
+ // Starts out in the "not yet notified" state.
+ Notification() : mu_(), cv_(), notified_(false) {}
+
+ // Releases every thread waiting on this notification, and lets any later
+ // waiter pass straight through. Must be called from the controller thread.
+ void Notify() {
+ std::lock_guard<std::mutex> guard(mu_);
+ notified_ = true;
+ cv_.notify_all();
+ }
+
+ // Blocks the calling (test) thread until Notify() has been called.
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> held(mu_);
+ // Re-check the flag after every wake-up so that spurious wake-ups are
+ // ignored.
+ while (!notified_) {
+ cv_.wait(held);
+ }
+ }
+
+ // Not copyable.
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
+
+ private:
+ std::mutex mu_; // Guards notified_.
+ std::condition_variable cv_; // Signalled by Notify().
+ bool notified_; // Set to true once Notify() has run.
+};
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+#endif // GTEST_HAS_NOTIFICATION_
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+ virtual ~ThreadWithParamBase() {}
+ virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical. Some compilers (for
+// example, SunStudio) treat them as different types. Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+ // The opaque argument is the ThreadWithParamBase* that was handed to
+ // pthread_create(); recover it and run the thread body.
+ ThreadWithParamBase* const runner = static_cast<ThreadWithParamBase*>(thread);
+ runner->Run();
+ return nullptr;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+// void ThreadFunc(int param) { /* Do things with param */ }
+// Notification thread_can_start;
+// ...
+// // The thread_can_start parameter is optional; you can supply NULL.
+// ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+// thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+ // Signature of the user function that the thread runs.
+ typedef void UserThreadFunc(T);
+
+ // Stores func, param and thread_can_start, then immediately creates the
+ // pthread with ThreadFuncWithCLinkage as its entry point and this object
+ // (as a ThreadWithParamBase*) as its argument.
+ ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+ : func_(func),
+ param_(param),
+ thread_can_start_(thread_can_start),
+ finished_(false) {
+ ThreadWithParamBase* const base = this;
+ // The thread can be created only after all fields except thread_
+ // have been initialized.
+ GTEST_CHECK_POSIX_SUCCESS_(
+ pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base));
+ }
+ // Joins the thread if Join() has not already done so.
+ ~ThreadWithParam() override { Join(); }
+
+ // Waits for the thread to finish via pthread_join(). finished_ makes
+ // repeated calls no-ops, so the thread is joined at most once.
+ void Join() {
+ if (!finished_) {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr));
+ finished_ = true;
+ }
+ }
+
+ // Thread body: waits for thread_can_start_ (when one was supplied), then
+ // calls the user function with the stored parameter.
+ void Run() override {
+ if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification();
+ func_(param_);
+ }
+
+ private:
+ UserThreadFunc* const func_; // User-supplied thread function.
+ const T param_; // User-supplied parameter to the thread function.
+ // When non-NULL, used to block execution until the controller thread
+ // notifies.
+ Notification* const thread_can_start_;
+ bool finished_; // true if and only if we know that the thread function has
+ // finished.
+ pthread_t thread_; // The native thread object.
+
+ // Not copyable.
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
+};
+#endif // GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+// Mutex and ThreadLocal have already been imported into the namespace.
+// Nothing to do here.
+
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms. It is used in conjunction
+// with class MutexLock:
+//
+// Mutex mutex;
+// ...
+// MutexLock lock(&mutex); // Acquires the mutex and releases it at the
+// // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+ enum MutexType { kStatic = 0, kDynamic = 1 };
+ // We rely on kStaticMutex being 0, because 0 is the value to which the
+ // linker initializes type_ in static mutexes. critical_section_ will be
+ // initialized lazily in ThreadSafeLazyInit().
+ enum StaticConstructorSelector { kStaticMutex = 0 };
+
+ // This constructor intentionally does nothing. It relies on type_ being
+ // statically initialized to 0 (effectively setting it to kStatic) and on
+ // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+ explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+
+ void Unlock();
+
+ // Does nothing if the current thread holds the mutex. Otherwise, crashes
+ // with high probability.
+ void AssertHeld();
+
+ private:
+ // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+ void ThreadSafeLazyInit();
+
+ // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
+ // we assume that 0 is an invalid value for thread IDs.
+ unsigned int owner_thread_id_;
+
+ // For static mutexes, we rely on these members being initialized to zeros
+ // by the linker.
+ MutexType type_;
+ long critical_section_init_phase_; // NOLINT
+ GTEST_CRITICAL_SECTION* critical_section_;
+
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
+};
+
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
+
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ // Locks *mutex for the lifetime of this object. mutex is dereferenced
+ // unconditionally, so it must not be null.
+ explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
+
+ // Unlocks the mutex that the constructor locked.
+ ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+ // The mutex being held; not owned by this object.
+ Mutex* const mutex_;
+
+ // Not copyable.
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Base class for ValueHolder<T>. Allows a caller to hold and delete a value
+// without knowing its type.
+class ThreadLocalValueHolderBase {
+ public:
+ virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Provides a way for a thread to send notifications to a ThreadLocal
+// regardless of its parameter type.
+class ThreadLocalBase {
+ public:
+ // Creates a new ValueHolder<T> object holding a default value passed to
+ // this ThreadLocal<T>'s constructor and returns it. It is the caller's
+ // responsibility not to call this when the ThreadLocal<T> instance already
+ // has a value on the current thread.
+ virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+
+ protected:
+ ThreadLocalBase() {}
+ virtual ~ThreadLocalBase() {}
+
+ private:
+ ThreadLocalBase(const ThreadLocalBase&) = delete;
+ ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
+};
+
+// Maps a thread to a set of ThreadLocals that have values instantiated on that
+// thread and notifies them when the thread exits. A ThreadLocal instance is
+// expected to persist until all threads it has values on have terminated.
+class GTEST_API_ ThreadLocalRegistry {
+ public:
+ // Registers thread_local_instance as having value on the current thread.
+ // Returns a value that can be used to identify the thread from other threads.
+ static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance);
+
+ // Invoked when a ThreadLocal instance is destroyed.
+ static void OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance);
+};
+
+class GTEST_API_ ThreadWithParamBase {
+ public:
+ void Join();
+
+ protected:
+ class Runnable {
+ public:
+ virtual ~Runnable() {}
+ virtual void Run() = 0;
+ };
+
+ ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
+ virtual ~ThreadWithParamBase();
+
+ private:
+ AutoHandle thread_;
+};
+
+// Helper class for testing Google Test's multi-threading constructs.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+ // Signature of the user function that the thread runs.
+ typedef void UserThreadFunc(T);
+
+ // Wraps func and param in a heap-allocated RunnableImpl and hands it,
+ // together with thread_can_start, to ThreadWithParamBase.
+ // NOTE(review): the base class presumably takes ownership of the Runnable
+ // and handles the thread_can_start wait -- its definition is not in this
+ // header; confirm.
+ ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+ : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
+ virtual ~ThreadWithParam() {}
+
+ private:
+ // Adapts the (func, param) pair to the Runnable interface.
+ class RunnableImpl : public Runnable {
+ public:
+ RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
+ virtual ~RunnableImpl() {}
+ // Calls the user function with the stored parameter.
+ virtual void Run() { func_(param_); }
+
+ private:
+ UserThreadFunc* const func_; // User-supplied thread function.
+ const T param_; // Copy of the user-supplied parameter.
+
+ // Not copyable.
+ RunnableImpl(const RunnableImpl&) = delete;
+ RunnableImpl& operator=(const RunnableImpl&) = delete;
+ };
+
+ // Not copyable.
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
+};
+
+// Implements thread-local storage on Windows systems.
+//
+// // Thread 1
+// ThreadLocal<int> tl(100); // 100 is the default value for each thread.
+//
+// // Thread 2
+// tl.set(150); // Changes the value for thread 2 only.
+// EXPECT_EQ(150, tl.get());
+//
+// // Thread 1
+// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value.
+// tl.set(200);
+// EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// The users of a ThreadLocal instance have to make sure that all but one of
+// the threads (including the main one) using that instance have exited before
+// destroying it. Otherwise, the per-thread objects managed for them by the
+// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
+//
+// Google Test only uses global ThreadLocal objects. That means they
+// will die after main() has returned. Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal : public ThreadLocalBase {
+ public:
+ ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
+ explicit ThreadLocal(const T& value)
+ : default_factory_(new InstanceValueHolderFactory(value)) {}
+
+ ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+
+ T* pointer() { return GetOrCreateValue(); }
+ const T* pointer() const { return GetOrCreateValue(); }
+ const T& get() const { return *pointer(); }
+ void set(const T& value) { *pointer() = value; }
+
+ private:
+ // Holds a value of T. Can be deleted via its base class without the caller
+ // knowing the type of T.
+ class ValueHolder : public ThreadLocalValueHolderBase {
+ public:
+ ValueHolder() : value_() {}
+ explicit ValueHolder(const T& value) : value_(value) {}
+
+ T* pointer() { return &value_; }
+
+ private:
+ T value_;
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
+ };
+
+ T* GetOrCreateValue() const {
+ return static_cast<ValueHolder*>(
+ ThreadLocalRegistry::GetValueOnCurrentThread(this))
+ ->pointer();
+ }
+
+ ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
+ return default_factory_->MakeNewHolder();
+ }
+
+ class ValueHolderFactory {
+ public:
+ ValueHolderFactory() {}
+ virtual ~ValueHolderFactory() {}
+ virtual ValueHolder* MakeNewHolder() const = 0;
+
+ private:
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
+ };
+
+ class DefaultValueHolderFactory : public ValueHolderFactory {
+ public:
+ DefaultValueHolderFactory() {}
+ ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
+
+ private:
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
+ };
+
+ class InstanceValueHolderFactory : public ValueHolderFactory {
+ public:
+ explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+ ValueHolder* MakeNewHolder() const override {
+ return new ValueHolder(value_);
+ }
+
+ private:
+ const T value_; // The value for each thread.
+
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
+ };
+
+ std::unique_ptr<ValueHolderFactory> default_factory_;
+
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
+};
+
+#elif GTEST_HAS_PTHREAD
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms.
+class MutexBase {
+ public:
+ // Acquires this mutex.
+ void Lock() {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+ owner_ = pthread_self();
+ has_owner_ = true;
+ }
+
+ // Releases this mutex.
+ void Unlock() {
+ // Since the lock is being released the owner_ field should no longer be
+ // considered valid. We don't protect writing to has_owner_ here, as it's
+ // the caller's responsibility to ensure that the current thread holds the
+ // mutex when this is called.
+ has_owner_ = false;
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+ }
+
+ // Does nothing if the current thread holds the mutex. Otherwise, crashes
+ // with high probability.
+ void AssertHeld() const {
+ GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+ << "The current thread is not holding the mutex @" << this;
+ }
+
+ // A static mutex may be used before main() is entered. It may even
+ // be used before the dynamic initialization stage. Therefore we
+ // must be able to initialize a static mutex object at link time.
+ // This means MutexBase has to be a POD and its member variables
+ // have to be public.
+ public:
+ pthread_mutex_t mutex_; // The underlying pthread mutex.
+ // has_owner_ indicates whether the owner_ field below contains a valid thread
+ // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+ // accesses to the owner_ field should be protected by a check of this field.
+ // An alternative might be to memset() owner_ to all zeros, but there's no
+ // guarantee that a zero'd pthread_t is necessarily invalid or even different
+ // from pthread_self().
+ bool has_owner_;
+ pthread_t owner_; // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list gives a value for each field in declaration order:
+// mutex_, has_owner_, and owner_. The 0 supplied for owner_ (a pthread_t) is
+// only a placeholder: has_owner_ starts out false, so owner_ is not inspected
+// until Lock() has stored a real thread ID in it.
+// NOTE(review): an earlier form of this comment said owner_ was left out of
+// the list so that initialization works whether pthread_t is a scalar or a
+// struct, and that -Wmissing-field-initializers must not be specified; confirm
+// whether the explicit 0 is intended on platforms where pthread_t is a struct.
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0}
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+ // Initializes the underlying pthread mutex with default attributes (a null
+ // attribute pointer) and records that no thread owns it yet.
+ Mutex() {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
+ has_owner_ = false;
+ }
+ // Destroys the underlying pthread mutex.
+ ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
+
+ private:
+ // Not copyable.
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
+};
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ // Locks *mutex for the lifetime of this object. Takes a MutexBase* so that
+ // both static mutexes (MutexBase) and runtime ones (Mutex) can be used.
+ // mutex is dereferenced unconditionally, so it must not be null.
+ explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
+
+ // Unlocks the mutex that the constructor locked.
+ ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+ // The mutex being held; not owned by this object.
+ MutexBase* const mutex_;
+
+ // Not copyable.
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage. Therefore it cannot be templatized to access
+// ThreadLocal<T>. Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+ virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+ // Recover the polymorphic base so that the virtual destructor runs the
+ // right ValueHolder<T> cleanup.
+ ThreadLocalValueHolderBase* const holder =
+ static_cast<ThreadLocalValueHolderBase*>(value_holder);
+ delete holder;
+}
+
+// Implements thread-local storage on pthreads-based systems.
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+ // Creates the pthread key for this instance. Each thread's value is
+ // value-initialized the first time that thread accesses it.
+ ThreadLocal()
+ : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
+ // Same, except that each thread's value starts out as a copy of value.
+ explicit ThreadLocal(const T& value)
+ : key_(CreateKey()),
+ default_factory_(new InstanceValueHolderFactory(value)) {}
+
+ ~ThreadLocal() {
+ // Destroys the managed object for the current thread, if any.
+ DeleteThreadLocalValue(pthread_getspecific(key_));
+
+ // Releases resources associated with the key. This will *not*
+ // delete managed objects for other threads.
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+ }
+
+ // Accessors for the calling thread's value. Each one creates the value on
+ // first use in that thread (see GetOrCreateValue()).
+ T* pointer() { return GetOrCreateValue(); }
+ const T* pointer() const { return GetOrCreateValue(); }
+ const T& get() const { return *pointer(); }
+ void set(const T& value) { *pointer() = value; }
+
+ private:
+ // Holds a value of type T.
+ class ValueHolder : public ThreadLocalValueHolderBase {
+ public:
+ ValueHolder() : value_() {}
+ explicit ValueHolder(const T& value) : value_(value) {}
+
+ // Returns the address of the held value.
+ T* pointer() { return &value_; }
+
+ private:
+ T value_;
+ // Not copyable.
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
+ };
+
+ // Allocates a new pthread key whose destructor is DeleteThreadLocalValue.
+ static pthread_key_t CreateKey() {
+ pthread_key_t key;
+ // When a thread exits, DeleteThreadLocalValue() will be called on
+ // the object managed for that thread.
+ GTEST_CHECK_POSIX_SUCCESS_(
+ pthread_key_create(&key, &DeleteThreadLocalValue));
+ return key;
+ }
+
+ // Returns a pointer to the calling thread's value, creating it on the
+ // first call from that thread.
+ T* GetOrCreateValue() const {
+ // Fast path: this thread already stored a holder under key_.
+ ThreadLocalValueHolderBase* const holder =
+ static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+ if (holder != nullptr) {
+ return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+ }
+
+ // First access from this thread: build a holder with the configured
+ // factory and register it under key_ so that later calls find it.
+ ValueHolder* const new_holder = default_factory_->MakeNewHolder();
+ ThreadLocalValueHolderBase* const holder_base = new_holder;
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+ return new_holder->pointer();
+ }
+
+ // Interface for creating the initial ValueHolder of a thread.
+ class ValueHolderFactory {
+ public:
+ ValueHolderFactory() {}
+ virtual ~ValueHolderFactory() {}
+ // Returns a newly allocated holder.
+ virtual ValueHolder* MakeNewHolder() const = 0;
+
+ private:
+ // Not copyable.
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
+ };
+
+ // Factory that produces holders with a value-initialized T.
+ class DefaultValueHolderFactory : public ValueHolderFactory {
+ public:
+ DefaultValueHolderFactory() {}
+ ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
+
+ private:
+ // Not copyable.
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
+ };
+
+ // Factory that produces holders initialized with a copy of a fixed value.
+ class InstanceValueHolderFactory : public ValueHolderFactory {
+ public:
+ explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+ ValueHolder* MakeNewHolder() const override {
+ return new ValueHolder(value_);
+ }
+
+ private:
+ const T value_; // The value for each thread.
+
+ // Not copyable.
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
+ };
+
+ // A key pthreads uses for looking up per-thread values.
+ const pthread_key_t key_;
+ // Creates the initial holder for each thread; chosen by the constructor.
+ std::unique_ptr<ValueHolderFactory> default_factory_;
+
+ // Not copyable.
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
+};
+
+#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+#else // GTEST_IS_THREADSAFE
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable). Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+ Mutex() {}
+ void Lock() {}
+ void Unlock() {}
+ void AssertHeld() const {}
+};
+
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
+
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to guard against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ explicit GTestMutexLock(Mutex*) {} // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+ ThreadLocal() : value_() {}
+ explicit ThreadLocal(const T& value) : value_(value) {}
+ T* pointer() { return &value_; }
+ const T* pointer() const { return &value_; }
+ const T& get() const { return value_; }
+ void set(const T& value) { value_ = value; }
+
+ private:
+ T value_;
+};
+
+#endif // GTEST_IS_THREADSAFE
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
+#else
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
+#endif // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF. char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+ return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+ return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+ return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+ return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+ return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+ return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+ return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#ifdef __cpp_char8_t
+inline bool IsXDigit(char8_t ch) {
+ return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#endif
+inline bool IsXDigit(char16_t ch) {
+ const unsigned char low_byte = static_cast<unsigned char>(ch);
+ return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(char32_t ch) {
+ const unsigned char low_byte = static_cast<unsigned char>(ch);
+ return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+ const unsigned char low_byte = static_cast<unsigned char>(ch);
+ return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+ return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+ return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+ std::string::iterator it = str.end();
+ while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
+ return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions. These wrappers hide the differences between
+// Windows/MSVC and POSIX systems. Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+#ifdef __BORLANDC__
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+#else // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+ GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
+inline int DoIsATTY(int /* fd */) { return 0; }
+#else
+inline int DoIsATTY(int fd) { return _isatty(fd); }
+#endif // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+#endif // __BORLANDC__
+
+#if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+#else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+#elif GTEST_OS_ESP8266
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) {
+ // stat function not implemented on ESP8266
+ return 0;
+}
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif // GTEST_OS_WINDOWS
+
+inline int IsATTY(int fd) {
+ // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
+ // to a file on Linux), which is unexpected, so save the previous value, and
+ // restore it after the call.
+ int savedErrno = errno;
+ int isAttyValue = DoIsATTY(fd);
+ errno = savedErrno;
+
+ return isAttyValue;
+}
+
+// Functions deprecated by MSVC 8.0.
+
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+ struct wchar_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t> {};
+ std::wstring_convert<wchar_codecvt> converter;
+ std::wstring wide_path = converter.from_bytes(path);
+ std::wstring wide_mode = converter.from_bytes(mode);
+ return _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+ return fopen(path, mode);
+#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
+ return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+ return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+ return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+ // We are on an embedded platform, which has no environment variables.
+ static_cast<void>(name); // To prevent 'unused argument' warning.
+ return nullptr;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+ // Environment variables which we programmatically clear will be set to the
+ // empty string rather than unset (NULL). Handle that case.
+ const char* const env = getenv(name);
+ return (env != nullptr && env[0] != '\0') ? env : nullptr;
+#else
+ return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+[[noreturn]] void Abort();
+#else
+[[noreturn]] inline void Abort() { abort(); }
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+} // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used. In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that. We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+ _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s
+#define GTEST_SNPRINTF_ _snprintf
+#else
+#define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The biggest signed integer type the compiler supports.
+//
+// long long is guaranteed to be at least 64-bits in C++11.
+using BiggestInt = long long; // NOLINT
+
+// The maximum number a BiggestInt can represent.
+constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits<BiggestInt>::max)();
+
+// This template class serves as a compile-time function from size to
+// type. It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+// TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs. Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+ // This prevents the user from using TypeWithSize<N> with incorrect
+ // values of N.
+ using UInt = void;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+ using Int = std::int32_t;
+ using UInt = std::uint32_t;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+ using Int = std::int64_t;
+ using UInt = std::uint64_t;
+};
+
+// Integer types of known sizes.
+using TimeInMillis = int64_t; // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#if !defined(GTEST_FLAG)
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+#endif // !defined(GTEST_FLAG)
+
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else // GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif // GTEST_HAS_ABSL
+
+// Thread annotations
+#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+GTEST_API_ bool ParseInt32(const Message& src_text, const char* str,
+ int32_t* value);
+
+// Parses a bool/int32_t/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val);
+std::string OutputFlagAlsoCheckEnvVar();
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+} // namespace internal
+} // namespace testing
+
+#if !defined(GTEST_INTERNAL_DEPRECATED)
+
+// Internal Macro to mark an API deprecated, for googletest usage only
+// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or
+// GTEST_INTERNAL_DEPRECATED(message) <return_type> myFunction(); Every usage of
+// a deprecated entity will trigger a warning when compiled with
+// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler).
+// For MSVC, the /W3 option will need to be used.
+// Note that for 'other' compilers this macro evaluates to nothing to prevent
+// compilation errors.
+#if defined(_MSC_VER)
+#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message))
+#elif defined(__GNUC__)
+#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message)))
+#else
+#define GTEST_INTERNAL_DEPRECATED(message)
+#endif
+
+#endif // !defined(GTEST_INTERNAL_DEPRECATED)
+
+#if GTEST_HAS_ABSL
+// Always use absl::any for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include "absl/types/any.h"
+namespace testing {
+namespace internal {
+using Any = ::absl::any;
+} // namespace internal
+} // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<any>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::any for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include <any>
+namespace testing {
+namespace internal {
+using Any = ::std::any;
+} // namespace internal
+} // namespace testing
+// The case where absl is configured NOT to alias std::any is not
+// supported.
+#endif // __has_include(<any>) && __cplusplus >= 201703L
+#endif // __has_include
+#endif // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::optional for UniversalPrinter<> specializations if
+// googletest is built with absl support.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include "absl/types/optional.h"
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
+} // namespace internal
+} // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::optional for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include <optional>
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
+} // namespace internal
+} // namespace testing
+// The case where absl is configured NOT to alias std::optional is not
+// supported.
+#endif // __has_include(<optional>) && __cplusplus >= 201703L
+#endif // __has_include
+#endif // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::string_view for Matcher<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include "absl/strings/string_view.h"
+namespace testing {
+namespace internal {
+using StringView = ::absl::string_view;
+} // namespace internal
+} // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::string_view for Matcher<>
+// specializations.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include <string_view>
+namespace testing {
+namespace internal {
+using StringView = ::std::string_view;
+} // namespace internal
+} // namespace testing
+// The case where absl is configured NOT to alias std::string_view is not
+// supported.
+#endif // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif // __has_include
+#endif // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::variant for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include "absl/types/variant.h"
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::absl::variant<T...>;
+} // namespace internal
+} // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<variant>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::variant for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include <variant>
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::std::variant<T...>;
+} // namespace internal
+} // namespace testing
+// The case where absl is configured NOT to alias std::variant is not supported.
+#endif // __has_include(<variant>) && __cplusplus >= 201703L
+#endif // __has_include
+#endif // GTEST_HAS_ABSL
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
new file mode 100644
index 0000000000..cca2e1f2ad
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
@@ -0,0 +1,177 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test. They are subject to change without notice. They should not be
+// used by code external to Google Test.
+//
+// This header file is #included by gtest-internal.h.
+// It should not be #included by other files.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+#include <mem.h>
+#endif
+
+#include <string.h>
+
+#include <cstdint>
+#include <string>
+
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+ // Static utility methods
+
+ // Clones a 0-terminated C string, allocating memory using new. The
+ // caller is responsible for deleting the return value using
+ // delete[]. Returns the cloned string, or NULL if the input is
+ // NULL.
+ //
+ // This is different from strdup() in string.h, which allocates
+ // memory using malloc().
+ static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+ // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+ // able to pass strings to Win32 APIs on CE we need to convert them
+ // to 'Unicode', UTF-16.
+
+ // Creates a UTF-16 wide string from the given ANSI string, allocating
+ // memory using new. The caller is responsible for deleting the return
+ // value using delete[]. Returns the wide string, or NULL if the
+ // input is NULL.
+ //
+ // The wide string is created using the ANSI codepage (CP_ACP) to
+ // match the behaviour of the ANSI versions of Win32 calls and the
+ // C runtime.
+ static LPCWSTR AnsiToUtf16(const char* c_str);
+
+ // Creates an ANSI string from the given wide string, allocating
+ // memory using new. The caller is responsible for deleting the return
+ // value using delete[]. Returns the ANSI string, or NULL if the
+ // input is NULL.
+ //
+ // The returned string is created using the ANSI codepage (CP_ACP) to
+ // match the behaviour of the ANSI versions of Win32 calls and the
+ // C runtime.
+ static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+ // Compares two C strings. Returns true if and only if they have the same
+ // content.
+ //
+ // Unlike strcmp(), this function can handle NULL argument(s). A
+ // NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool CStringEquals(const char* lhs, const char* rhs);
+
+ // Converts a wide C string to a String using the UTF-8 encoding.
+ // NULL will be converted to "(null)". If an error occurred during
+ // the conversion, "(failed to convert from wide string)" is
+ // returned.
+ static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+ // Compares two wide C strings. Returns true if and only if they have the
+ // same content.
+ //
+ // Unlike wcscmp(), this function can handle NULL argument(s). A
+ // NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+ // Compares two C strings, ignoring case. Returns true if and only if
+ // they have the same content.
+ //
+ // Unlike strcasecmp(), this function can handle NULL argument(s).
+ // A NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
+
+ // Compares two wide C strings, ignoring case. Returns true if and only if
+ // they have the same content.
+ //
+ // Unlike wcscasecmp(), this function can handle NULL argument(s).
+ // A NULL C string is considered different to any non-NULL wide C string,
+ // including the empty string.
+ // NB: The implementations on different platforms slightly differ.
+ // On Windows, this method uses _wcsicmp which compares according to LC_CTYPE
+ // environment variable. On GNU platform this method uses wcscasecmp
+ // which compares according to LC_CTYPE category of the current locale.
+ // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+ // current locale.
+ static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+ const wchar_t* rhs);
+
+ // Returns true if and only if the given string ends with the given suffix,
+ // ignoring case. Any string is considered to end with an empty suffix.
+ static bool EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix);
+
+ // Formats an int value as "%02d".
+ static std::string FormatIntWidth2(int value); // "%02d" for width == 2
+
+ // Formats an int value to given width with leading zeros.
+ static std::string FormatIntWidthN(int value, int width);
+
+ // Formats an int value as "%X".
+ static std::string FormatHexInt(int value);
+
+ // Formats a uint32_t value as "%X".
+ static std::string FormatHexUInt32(uint32_t value);
+
+ // Formats a byte as "%02X".
+ static std::string FormatByte(unsigned char value);
+
+ private:
+ String(); // Not meant to be instantiated.
+}; // class String
+
+// Gets the content of the stringstream's buffer as an std::string. Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+} // namespace internal
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
new file mode 100644
index 0000000000..6bc02a7de3
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
@@ -0,0 +1,186 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// Strips the standard library's inline versioning namespace from a type name.
+// A name beginning with "std::__" (for example "std::__1::vector<int>") has
+// everything between the leading "std" and the next "::" removed, giving
+// "std::vector<int>". Any other name, or a prefixed name with no further
+// "::", is returned unchanged.
+inline std::string CanonicalizeForStdLibVersioning(std::string s) {
+  static const char kVersionedPrefix[] = "std::__";
+  const size_t prefix_len = strlen(kVersionedPrefix);
+  const size_t std_len = strlen("std");
+
+  // Nothing to do unless the name starts with the versioned prefix.
+  if (s.compare(0, prefix_len, kVersionedPrefix) != 0) return s;
+
+  // Locate the scope separator that ends the inline namespace component.
+  const std::string::size_type next_scope = s.find("::", prefix_len);
+  if (next_scope == std::string::npos) return s;
+
+  // Drop "::__<version>" so that only "std" precedes the separator.
+  s.erase(std_len, next_scope - std_len);
+  return s;
+}
+
+#if GTEST_HAS_RTTI
+// Returns a human-readable name for the type described by `type`.
+// When a demangler is available (GTEST_HAS_CXXABI_H_ or HP aCC) the mangled
+// name is demangled and then passed through CanonicalizeForStdLibVersioning();
+// otherwise the raw result of type_info::name() is returned.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const raw_name = type.name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  // gcc's typeid(T).name() yields a mangled name, so it has to be demangled.
+#if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#endif  // GTEST_HAS_CXXABI_H_
+  int demangle_status = 0;
+  char* const demangled =
+      __cxa_demangle(raw_name, nullptr, nullptr, &demangle_status);
+  // Fall back to the raw name when demangling reports a failure.
+  const char* const chosen = (demangle_status == 0) ? demangled : raw_name;
+  const std::string result(chosen);
+  free(demangled);
+  return CanonicalizeForStdLibVersioning(result);
+#else
+  return raw_name;
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+}
+#endif  // GTEST_HAS_RTTI
+
+// GetTypeName<T>() yields a human-readable name for T when RTTI is enabled;
+// without RTTI it yields the placeholder "<type>".
+// NB: Google Mock uses this function as well, so it must stay outside the
+// typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+#if !GTEST_HAS_RTTI
+  return "<type>";
+#else
+  return GetTypeName(typeid(T));
+#endif  // !GTEST_HAS_RTTI
+}
+
+// A unique type indicating an empty node. It is the `Tail` of the
+// single-element specializations of Templates<> and Types<> below, i.e. it
+// marks the end of those recursive lists.
+struct None {};
+
+// Expands to `template <typename T> class`, the declarator for a template
+// template parameter that takes exactly one type argument. It is used in the
+// parameter lists of TemplateSel and Templates below.
+#define GTEST_TEMPLATE_ \
+ template <typename T> \
+ class
+
+// TemplateSel<Tmpl> wraps a one-parameter class template Tmpl in an ordinary
+// type so that it can be stored in type lists.
+// TemplateSel<Tmpl>::Bind<T>::type names Tmpl<T>, which is how the wrapped
+// template is eventually instantiated.
+//
+// The indirection simulates a typedef for class templates, something C++
+// does not offer directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    using type = Tmpl<T>;
+  };
+};
+
+// Expands to the type produced by binding T to the template selector TmplSel,
+// i.e. TmplSel::Bind<T>::type. The `template` keyword is spelled out so the
+// expansion also works when TmplSel is a dependent type.
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
+
+// A compile-time list of class templates. Head is the selector for the first
+// template and Tail is the list of the remaining ones.
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
+struct Templates {
+  typedef TemplateSel<Head_> Head;
+  typedef Templates<Tail_...> Tail;
+};
+
+// List of exactly one template: the tail is the None terminator.
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  typedef TemplateSel<Head_> Head;
+  typedef None Tail;
+};
+
+// Tuple-like type list. Head is the first type and Tail is the list holding
+// the remaining types.
+template <typename Head_, typename... Tail_>
+struct Types {
+  typedef Head_ Head;
+  typedef Types<Tail_...> Tail;
+};
+
+// List of exactly one type: the tail is the None terminator.
+template <typename Head_>
+struct Types<Head_> {
+  typedef Head_ Head;
+  typedef None Tail;
+};
+
+// Helpers that distinguish a single type from a list of types produced by
+// ::testing::Types. ProxyTypeList<Ts...> carries the pack and exposes the
+// corresponding internal list as `type`.
+template <typename... Ts>
+struct ProxyTypeList {
+  typedef Types<Ts...> type;
+};
+
+// Trait that is true only for specializations of ProxyTypeList.
+template <typename>
+struct is_proxy_type_list : std::integral_constant<bool, false> {};
+
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>>
+    : std::integral_constant<bool, true> {};
+
+// Produces the internal type list for T.
+// If T is already a ProxyTypeList its own list is used, so no list is nested
+// inside another one. Any other T is first wrapped in ProxyTypeList<T>,
+// giving a one-element list.
+template <typename T>
+struct GenerateTypeList {
+  using type =
+      typename std::conditional<is_proxy_type_list<T>::value, T,
+                                ProxyTypeList<T>>::type::type;
+};
+
+} // namespace internal
+
+// Public spelling of a type list: testing::Types<Ts...> is an alias for
+// internal::ProxyTypeList<Ts...>, whose nested `type` is the internal list.
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
+} // namespace testing
+
+#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc
new file mode 100644
index 0000000000..2a70ed88c7
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -0,0 +1,49 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Google C++ Testing and Mocking Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+#include "src/gtest-assertion-result.cc"
+#include "src/gtest-death-test.cc"
+#include "src/gtest-filepath.cc"
+#include "src/gtest-matchers.cc"
+#include "src/gtest-port.cc"
+#include "src/gtest-printers.cc"
+#include "src/gtest-test-part.cc"
+#include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-assertion-result.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-assertion-result.cc
new file mode 100644
index 0000000000..f1c0b10dc9
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// Copy constructor. Used in EXPECT_TRUE/FALSE(assertion_result).
+// The message, when present, is deep-copied so the two results never share
+// one string.
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() == nullptr
+                   ? static_cast< ::std::string*>(nullptr)
+                   : new ::std::string(*other.message_)) {}
+
+// Exchanges the success flag and the message of *this and `other`.
+void AssertionResult::swap(AssertionResult& other) {
+  using std::swap;
+  swap(message_, other.message_);
+  swap(success_, other.success_);
+}
+
+// Builds the negation of this result: the success flag is inverted and any
+// existing message is carried over. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult inverted(!success_);
+  if (message_.get() == nullptr) return inverted;
+  inverted << *message_;
+  return inverted;
+}
+
+// Creates an assertion result that reports success.
+AssertionResult AssertionSuccess() {
+  AssertionResult succeeded(true);
+  return succeeded;
+}
+
+// Creates an assertion result that reports failure, with no message attached.
+AssertionResult AssertionFailure() {
+  AssertionResult failed(false);
+  return failed;
+}
+
+// Creates a failed assertion result carrying `message`.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  AssertionResult failed = AssertionFailure();
+  failed << message;
+  return failed;
+}
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc
new file mode 100644
index 0000000000..e6abc6278a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc
@@ -0,0 +1,1620 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+
+#include <functional>
+#include <utility>
+
+#include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif // GTEST_OS_FUCHSIA
+
+#endif // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+//
+// It is passed to StringFromGTestEnv() as the second argument when the
+// death_test_style flag is defined below.
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+
+} // namespace testing
+
+// death_test_style: selects "threadsafe" or "fast" execution of death tests.
+// The initial value is StringFromGTestEnv("death_test_style",
+// kDefaultDeathTestStyle); presumably the second argument is the fallback
+// when the environment provides nothing -- confirm in gtest-port.
+GTEST_DEFINE_string_(
+ death_test_style,
+ testing::internal::StringFromGTestEnv("death_test_style",
+ testing::kDefaultDeathTestStyle),
+ "Indicates how to run a death test in a forked child process: "
+ "\"threadsafe\" (child process re-executes the test binary "
+ "from the beginning, running only the specific death test) or "
+ "\"fast\" (child process runs the death test immediately "
+ "after forking).");
+
+// death_test_use_fork: initialised from
+// BoolFromGTestEnv("death_test_use_fork", false).
+GTEST_DEFINE_bool_(
+ death_test_use_fork,
+ testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
+ "Instructs to use fork()/_exit() instead of clone() in death tests. "
+ "Ignored and always uses fork() on POSIX systems where clone() is not "
+ "implemented. Useful when running under valgrind or similar tools if "
+ "those do not support clone(). Valgrind 3.3.1 will just fail if "
+ "it sees an unsupported combination of clone() flags. "
+ "It is not recommended to use this flag w/o valgrind though it will "
+ "work in 99% of the cases. Once valgrind is fixed, this flag will "
+ "most likely be removed.");
+
+// internal_run_death_test: empty by default. InDeathTestChild() treats a
+// non-empty value as "this process is a death test child".
+GTEST_DEFINE_string_(
+ internal_run_death_test, "",
+ "Indicates the file, line number, temporal index of "
+ "the single death test to run, and a file descriptor to "
+ "which a success code may be sent, all separated by "
+ "the '|' characters. This flag is specified if and only if the "
+ "current process is a sub-process launched for running a thread-safe "
+ "death test. FOR INTERNAL USE ONLY.");
+
+namespace testing {
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+// Only compiled on platforms other than Windows and Fuchsia, where
+// InDeathTestChild() returns it whenever the death test style is not
+// "threadsafe".
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+static bool g_in_fast_death_test_child = false;
+#endif
+
+// Reports whether the calling code is running inside a death test child
+// process. Tools such as Valgrind heap checkers may need this to adjust
+// their behavior in death tests. IMPORTANT: This is an internal utility.
+// Using it may break the implementation of death tests. User code MUST NOT
+// use it.
+bool InDeathTestChild() {
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the
+  // value of the death_test_style flag, so only the internal flag matters.
+  return !GTEST_FLAG_GET(internal_run_death_test).empty();
+
+#else
+
+  const bool threadsafe_style =
+      GTEST_FLAG_GET(death_test_style) == "threadsafe";
+  // Fast style children are tracked through the global set after forking.
+  if (!threadsafe_style) return g_in_fast_death_test_child;
+  return !GTEST_FLAG_GET(internal_run_death_test).empty();
+#endif
+}
+
+} // namespace internal
+
+// ExitedWithCode constructor. Stores the exit code that operator() later
+// compares a child's exit status against.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
+
+// ExitedWithCode function-call operator.
+// Returns true when `exit_status` describes a normal exit with the expected
+// code. On Windows and Fuchsia the status is compared directly; elsewhere it
+// is decoded with the wait(2) macros first.
+bool ExitedWithCode::operator()(int exit_status) const {
+#if !(GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA)
+
+  if (!WIFEXITED(exit_status)) return false;
+  return WEXITSTATUS(exit_status) == exit_code_;
+
+#else
+
+  return exit_status == exit_code_;
+
+#endif  // !(GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA)
+}
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// KilledBySignal constructor. Stores the signal number that operator() later
+// compares a child's terminating signal against.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
+
+// KilledBySignal function-call operator.
+// Returns true when `exit_status` says the process was terminated by the
+// expected signal. If GTEST_KILLED_BY_SIGNAL_OVERRIDE_ is defined it is
+// consulted first, and its answer is used whenever it returns true.
+bool KilledBySignal::operator()(int exit_status) const {
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  {
+    bool overridden_result;
+    if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status,
+                                         &overridden_result)) {
+      return overridden_result;
+    }
+  }
+#endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  if (!WIFSIGNALED(exit_status)) return false;
+  return WTERMSIG(exit_status) == signum_;
+}
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+//
+// On Windows and Fuchsia the code is reported verbatim. Elsewhere it is
+// decoded: a normal exit reports its exit status, and a signal termination
+// reports the signal plus " (core dumped)" when a core file was produced.
+// A status that is neither yields an empty string.
+static std::string ExitSummary(int exit_code) {
+  Message m;
+
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  m << "Exited with exit status " << exit_code;
+
+#else
+
+  if (WIFEXITED(exit_code)) {
+    m << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    m << "Terminated by signal " << WTERMSIG(exit_code);
+#ifdef WCOREDUMP
+    // WCOREDUMP is only meaningful for a status for which WIFSIGNALED is
+    // true, so it is queried inside this branch only.
+    if (WCOREDUMP(exit_code)) {
+      m << " (core dumped)";
+    }
+#endif
+  }
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  return m.GetString();
+}
+
+// Returns true unless exit_status describes a normal exit with code 0, i.e.
+// when the process was terminated by a signal or exited with a nonzero code.
+bool ExitedUnsuccessfully(int exit_status) {
+  const ExitedWithCode exited_cleanly(0);
+  return !exited_cleanly(exit_status);
+}
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Builds the warning shown when a death test is about to run while more than
+// one thread exists, or while the thread count cannot be determined
+// (thread_count == 0). Callers must not pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count != 0) {
+    msg << "detected " << thread_count << " threads.";
+  } else {
+    msg << "couldn't detect the number of threads.";
+  }
+  msg << " See "
+         "https://github.com/google/googletest/blob/master/docs/"
+         "advanced.md#death-tests-and-threads"
+      << " for more explanation and suggested solutions, especially if"
+      << " this is the last message you see before your test times out.";
+  return msg.GetString();
+}
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+// Flag characters for reporting a death test that did not die. The child
+// writes one of them to its pipe and the parent decodes it in
+// DeathTestImpl::ReadAndInterpretStatusByte().
+// Written by DeathTestImpl::Abort() for TEST_DID_NOT_DIE; decoded as LIVED.
+static const char kDeathTestLived = 'L';
+// Written by DeathTestImpl::Abort() for any other reason; decoded as RETURNED.
+static const char kDeathTestReturned = 'R';
+// Written by DeathTestImpl::Abort() for TEST_THREW_EXCEPTION; decoded as THREW.
+static const char kDeathTestThrew = 'T';
+// Written by DeathTestAbort(), followed by the error text; makes the parent
+// call FailFromInternalError().
+static const char kDeathTestInternalError = 'I';
+
+#if GTEST_OS_FUCHSIA
+
+// File descriptor used for the pipe in the child process.
+static const int kFuchsiaReadPipeFd = 3;
+
+#endif
+
+// Every way in which a death test can conclude.
+enum DeathTestOutcome {
+  IN_PROGRESS,  // The test has not yet concluded.
+  DIED,         // The process died while executing the test code.
+  LIVED,        // The process lived beyond the end of the test code.
+  RETURNED,     // The test statement attempted to execute a return
+                // statement, which is not allowed.
+  THREW         // The test statement returned control by throwing an
+                // exception.
+};
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process through the pipe and the
+// child terminates with _exit(1). Otherwise, the message is printed to
+// stderr and posix::Abort() is called.
+static void DeathTestAbort(const std::string& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack. Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != nullptr) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    if (parent != nullptr) {
+      // The status byte comes first so the parent knows that error text
+      // follows.
+      fputc(kDeathTestInternalError, parent);
+      fprintf(parent, "%s", message.c_str());
+      fflush(parent);
+    } else {
+      // The pipe could not be wrapped in a stream. Passing a null FILE* to
+      // fputc/fprintf is undefined behavior, so report on stderr instead.
+      fprintf(stderr, "%s", message.c_str());
+      fflush(stderr);
+    }
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails. The abort message has the form
+// "CHECK failed: File <file>, line <line>: <expression text>".
+// The body is wrapped in do { ... } while (AlwaysFalse()) so that the macro
+// expands to a single statement.
+#define GTEST_DEATH_TEST_CHECK_(expression) \
+ do { \
+ if (!::testing::internal::IsTrue(expression)) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again. The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR. If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+// Note that the expression's result is stored in an int (gtest_retval), and
+// that the expression text is stringified into the abort message.
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+ do { \
+ int gtest_retval; \
+ do { \
+ gtest_retval = (expression); \
+ } while (gtest_retval == -1 && errno == EINTR); \
+ if (gtest_retval == -1) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression + " != -1"); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+ return errno == 0 ? "" : posix::StrError(errno);
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+//
+// fd: descriptor of the read end of the pipe shared with the child.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  // Drain the pipe at most 255 bytes at a time, which leaves room for the
+  // terminating NUL, and restart whenever a read is interrupted (EINTR).
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    // End of stream: the complete message from the child has been collected.
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    // Snapshot errno before any logging code runs, and describe that
+    // snapshot, so the text and the number always refer to the same error.
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << posix::StrError(last_error) << " [" << last_error
+                      << "]";
+  }
+}
+
+// Death test constructor. Verifies that a test is currently running and
+// calls DeathTestAbort() when there is no current test info.
+DeathTest::DeathTest() {
+  if (GetUnitTestImpl()->current_test_info() == nullptr) {
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
+  }
+}
+
+// Creates a death test by forwarding every argument to the death test
+// factory currently installed in the UnitTestImpl, and returns whatever the
+// factory's Create() returns. `matcher` is moved into the factory call.
+bool DeathTest::Create(const char* statement,
+                       Matcher<const std::string&> matcher, const char* file,
+                       int line, DeathTest** test) {
+  auto* const factory = GetUnitTestImpl()->death_test_factory();
+  return factory->Create(statement, std::move(matcher), file, line, test);
+}
+
+// Returns the most recently recorded death test message. The pointer refers
+// to the storage of the static string below, so it should be consumed before
+// that string is modified again.
+const char* DeathTest::LastMessage() {
+ return last_death_test_message_.c_str();
+}
+
+// Replaces the recorded death test message with a copy of `message`.
+void DeathTest::set_last_death_test_message(const std::string& message) {
+ last_death_test_message_ = message;
+}
+
+// Storage for the last death test message; one string shared by all
+// DeathTest objects.
+std::string DeathTest::last_death_test_message_;
+
+// Platform-independent part of the death test implementation, shared by the
+// platform-specific death test classes.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* statement_text,
+                Matcher<const std::string&> stderr_matcher)
+      : statement_(statement_text),
+        matcher_(std::move(stderr_matcher)),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // A derived class is expected to have closed read_fd_ and reset it to -1
+  // before this destructor runs.
+  ~DeathTestImpl() override {
+    GTEST_DEATH_TEST_CHECK_(read_fd_ == -1);
+  }
+
+  void Abort(AbortReason reason) override;
+  bool Passed(bool status_ok) override;
+
+  // Plain accessors for the state declared in the private section.
+  const char* statement() const { return statement_; }
+
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool value) { spawned_ = value; }
+
+  int status() const { return status_; }
+  void set_status(int value) { status_ = value; }
+
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome value) { outcome_ = value; }
+
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int value) { read_fd_ = value; }
+
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int value) { write_fd_ = value; }
+
+  // Parent process only. Reads the child's result code from the pipe, sets
+  // outcome_ accordingly and closes read_fd_. Unexpected codes produce
+  // diagnostics and terminate the program.
+  void ReadAndInterpretStatusByte();
+
+  // Returns stderr output from the child process.
+  virtual std::string GetErrorLogs();
+
+ private:
+  // Text of the code under test. Not owned by this class; it must never be
+  // deleted here.
+  const char* const statement_;
+  // Matcher that the child's stderr output is expected to satisfy.
+  Matcher<const std::string&> matcher_;
+  // Whether the death test child process has been successfully spawned.
+  bool spawned_;
+  // Exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Read end of the pipe to the child process. Always -1 in the child, which
+  // keeps its write end in write_fd_.
+  int read_fd_;
+  // The child's write end of the pipe to the parent process. Always -1 in
+  // the parent, which keeps its end in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_. Outputs diagnostics and terminates in
+// case of unexpected codes.
+//
+// An empty pipe (0 bytes read) means the child died, which is the expected
+// result. A single status byte says why the child did not die.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        // Go through unsigned char first: plain char may be signed, and a
+        // byte >= 0x80 would otherwise be sign-extended in the message.
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(
+                                 static_cast<unsigned char>(flag))
+                          << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Default implementation: returns whatever GetCapturedStderr() reports.
+// Declared virtual in the class so derived classes can override it.
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+
+// Reports that the death test code, which should have exited, did not.
+// To be called only in a death test child process. A single status byte is
+// written to the child's status file descriptor, after which _exit(1) is
+// called.
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent treats any data found in the pipe as a failure of the death
+  // test, so one flag byte describing the reason is all that is written.
+  char status_ch = kDeathTestReturned;
+  if (reason == TEST_DID_NOT_DIE) {
+    status_ch = kDeathTestLived;
+  } else if (reason == TEST_THREW_EXCEPTION) {
+    status_ch = kDeathTestThrew;
+  }
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // The descriptor is deliberately leaked. On some platforms (i.e., when
+  // built as Windows DLL), destructors of global objects still run after
+  // _exit(); write_fd_ is then closed indirectly from the destructor of
+  // UnitTestImpl, and closing it here as well would be a double close, which
+  // may assert on debug configurations. There are no in-process buffers to
+  // flush, so when the destructors do not run the OS is relied upon to close
+  // the descriptor once the process terminates.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Produces a copy of a death test's stderr output in which every line
+// carries a fixed marker prefix, so death test output is easy to tell apart
+// from regular log lines. Output that is empty, or that ends in a newline,
+// still gets one trailing marker for the (empty) final line.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  static const char kLinePrefix[] = "[ DEATH ] ";
+  ::std::string indented;
+  ::std::string::size_type cursor = 0;
+  while (true) {
+    indented += kLinePrefix;
+    const ::std::string::size_type newline = output.find('\n', cursor);
+    if (newline == ::std::string::npos) break;
+    // Copy the line together with its terminating newline.
+    indented.append(output, cursor, newline + 1 - cursor);
+    cursor = newline + 1;
+  }
+  // Whatever follows the last newline (possibly nothing).
+  indented.append(output, cursor, ::std::string::npos);
+  return indented;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+// outcome: An enumeration describing how the death test
+// concluded: DIED, LIVED, THREW, or RETURNED. The death test
+// fails in the latter three cases.
+// status: The exit status of the child process. On *nix, it is
+// in the format specified by wait(2). On Windows, this is the
+// value supplied to the ExitProcess() API or a numeric code
+// of the exception that terminated the program.
+// matcher_: A matcher that's expected to match the stderr output by the child
+// process.
+//
+// Argument:
+// status_ok: true if exit_status is acceptable in the context of
+// this particular death test, which fails if it is false
+//
+// Returns true if and only if all of the above conditions are met. Otherwise,
+// the first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string, except when no child was spawned, in which case false is returned immediately.
+bool DeathTestImpl::Passed(bool status_ok) {
+ if (!spawned()) return false;
+
+ const std::string error_message = GetErrorLogs();
+
+ bool success = false;
+ Message buffer;
+
+ buffer << "Death test: " << statement() << "\n";
+ switch (outcome()) {
+ case LIVED:
+ buffer << " Result: failed to die.\n"
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
+ break;
+ case THREW:
+ buffer << " Result: threw an exception.\n"
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
+ break;
+ case RETURNED:
+ buffer << " Result: illegal return in test statement.\n"
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
+ break;
+ case DIED:
+ if (status_ok) {
+ if (matcher_.Matches(error_message)) {
+ success = true;  // The only path on which the death test passes.
+ } else {
+ std::ostringstream stream;
+ matcher_.DescribeTo(&stream);
+ buffer << " Result: died but not with expected error.\n"
+ << " Expected: " << stream.str() << "\n"
+ << "Actual msg:\n"
+ << FormatDeathTestOutput(error_message);
+ }
+ } else {
+ buffer << " Result: died but not with expected exit code:\n"
+ << " " << ExitSummary(status()) << "\n"
+ << "Actual msg:\n"
+ << FormatDeathTestOutput(error_message);
+ }
+ break;
+ case IN_PROGRESS:
+ default:
+ GTEST_LOG_(FATAL)
+ << "DeathTest::Passed somehow called before conclusion of test";
+ }
+
+ DeathTest::set_last_death_test_message(buffer.GetString());
+ return success;
+}
+
+#if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes: Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+// ends of it.
+// 2. The parent starts the child and provides it with the information
+// necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+// using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+// this is done before step 3, the object's reference count goes down to
+// 0 and it is destroyed, preventing the child from acquiring it. The
+// parent now has to release it, or read operations on the read end of
+// the pipe will not return when the child terminates.
+// 5. The parent reads the child's output (the outcome code and any
+// error messages) from the pipe, together with its captured stderr, and
+// then determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+ WindowsDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+ const char* file, int line)
+ : DeathTestImpl(a_statement, std::move(matcher)),
+ file_(file),
+ line_(line) {}
+
+ // All of these virtual functions are inherited from DeathTest. They are
+ // marked `override` (as in the other platform classes) so that a signature
+ // mismatch with the base class is a compile error.
+ int Wait() override;
+ TestRole AssumeRole() override;
+
+ private:
+ // The name of the file in which the death test is located.
+ const char* const file_;
+ // The line number on which the death test is located.
+ const int line_;
+ // Handle to the write end of the pipe to the child process.
+ AutoHandle write_handle_;
+ // Child process handle.
+ AutoHandle child_handle_;
+ // Event the child process uses to signal the parent that it has
+ // acquired the handle to the write end of the pipe. After seeing this
+ // event the parent can release its own handles to make sure its
+ // ReadFile() calls return when the child terminates.
+ AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+ if (!spawned()) return 0;
+
+ // Wait until the child either signals that it has acquired the write end
+ // of the pipe or it dies.
+ const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+ switch (::WaitForMultipleObjects(2, wait_handles,
+ FALSE, // Waits for any of the handles.
+ INFINITE)) {
+ case WAIT_OBJECT_0:  // wait_handles[0], i.e. child_handle_, was signaled.
+ case WAIT_OBJECT_0 + 1:  // wait_handles[1], i.e. event_handle_, was signaled.
+ break;
+ default:
+ GTEST_DEATH_TEST_CHECK_(false); // Should not get here.
+ }
+
+ // The child has acquired the write end of the pipe or exited.
+ // We release the pipe and event handles on our side and continue.
+ write_handle_.Reset();
+ event_handle_.Reset();
+
+ ReadAndInterpretStatusByte();
+
+ // Waits for the child process to exit if it hasn't already. This
+ // returns immediately if the child has already exited, regardless of
+ // whether previous calls to WaitForMultipleObjects synchronized on this
+ // handle or not.
+ GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+ ::WaitForSingleObject(child_handle_.Get(), INFINITE));
+ DWORD status_code;
+ GTEST_DEATH_TEST_CHECK_(
+ ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+ child_handle_.Reset();
+ set_status(static_cast<int>(status_code));
+ return status();
+}
+
+// The AssumeRole process for a Windows death test. It creates a child
+// process with the same executable as the current process to run the
+// death test. The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+//
+// Returns EXECUTE_TEST when an internal-run-death-test flag is present
+// (this process should run the statement), and OVERSEE_TEST after the child
+// has been created.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+ const UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const TestInfo* const info = impl->current_test_info();
+ const int death_test_index = info->result()->death_test_count();
+
+ if (flag != nullptr) {
+ // ParseInternalRunDeathTestFlag() has performed all the necessary
+ // processing.
+ set_write_fd(flag->write_fd());
+ return EXECUTE_TEST;
+ }
+
+ // WindowsDeathTest uses an anonymous pipe to communicate results of
+ // a death test.
+ SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
+ nullptr, TRUE};
+ HANDLE read_handle, write_handle;
+ GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+ &handles_are_inheritable,
+ 0) // Default buffer size.
+ != FALSE);
+ set_read_fd(
+ ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
+ write_handle_.Reset(write_handle);
+ event_handle_.Reset(::CreateEvent(
+ &handles_are_inheritable,
+ TRUE, // Manual-reset event: stays signaled until explicitly reset.
+ FALSE, // The initial state is non-signalled.
+ nullptr)); // The event is unnamed.
+ GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
+ const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag =
+ std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+ "|" + StreamableToString(death_test_index) + "|" +
+ StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+ // size_t has the same width as pointers on both 32-bit and 64-bit
+ // Windows platforms.
+ // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+ "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+ StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+ char executable_path[_MAX_PATH + 1]; // NOLINT
+ // GetModuleFileNameA() returns 0 on failure and the buffer size passed in
+ // (_MAX_PATH here) when the path had to be truncated. Reject both, so that
+ // CreateProcessA() is never handed an uninitialized or truncated path.
+ const DWORD path_length =
+ ::GetModuleFileNameA(nullptr, executable_path, _MAX_PATH);
+ GTEST_DEATH_TEST_CHECK_(path_length != 0 && path_length < _MAX_PATH);
+
+ std::string command_line = std::string(::GetCommandLineA()) + " " +
+ filter_flag + " \"" + internal_flag + "\"";
+
+ DeathTest::set_last_death_test_message("");
+
+ CaptureStderr();
+ // Flush the log buffers since the log streams are shared with the child.
+ FlushInfoLog();
+
+ // The child process will share the standard handles with the parent.
+ STARTUPINFOA startup_info;
+ // Size the clear by the object itself rather than by a separate type name.
+ memset(&startup_info, 0, sizeof(startup_info));
+ startup_info.dwFlags = STARTF_USESTDHANDLES;
+ startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+ startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+ startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+ PROCESS_INFORMATION process_info;
+ GTEST_DEATH_TEST_CHECK_(
+ ::CreateProcessA(
+ executable_path, const_cast<char*>(command_line.c_str()),
+ nullptr, // Returned process handle is not inheritable.
+ nullptr, // Returned thread handle is not inheritable.
+ TRUE, // Child inherits all inheritable handles (for write_handle_).
+ 0x0, // Default creation flags.
+ nullptr, // Inherit the parent's environment.
+ UnitTest::GetInstance()->original_working_dir(), &startup_info,
+ &process_info) != FALSE);
+ child_handle_.Reset(process_info.hProcess);
+ // Only the process handle is kept; the primary thread handle is not needed.
+ ::CloseHandle(process_info.hThread);
+ set_spawned(true);
+ return OVERSEE_TEST;
+}
+
+#elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {  // Death test implementation compiled when GTEST_OS_FUCHSIA is set.
+ public:
+ FuchsiaDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+ const char* file, int line)
+ : DeathTestImpl(a_statement, std::move(matcher)),
+ file_(file),
+ line_(line) {}
+
+ // All of these virtual functions are inherited from DeathTest.
+ int Wait() override;
+ TestRole AssumeRole() override;
+ std::string GetErrorLogs() override;
+
+ private:
+ // The name of the file in which the death test is located.
+ const char* const file_;
+ // The line number on which the death test is located.
+ const int line_;
+ // The child's stderr data, accumulated by Wait() from stderr_socket_.
+ std::string captured_stderr_;
+
+ zx::process child_process_;  // Filled in by fdio_spawn_etc() in AssumeRole(); waited on and queried in Wait().
+ zx::channel exception_channel_;  // Created on the child job in AssumeRole(); watched for readability in Wait().
+ zx::socket stderr_socket_;  // One end of the socket pair created in AssumeRole(); Wait() reads it into captured_stderr_.
+};
+
+// Accumulates duplicated command-line arguments in a null-terminated
+// array that can be handed out as an argv-style pointer.
+class Arguments {
+ public:
+ // Starts out holding only the terminating null entry.
+ Arguments() { args_.push_back(nullptr); }
+
+ // Frees every stored entry (the terminating null included).
+ ~Arguments() {
+ for (char* stored : args_) free(stored);
+ }
+
+ // Inserts a duplicate of |argument| just before the terminating null.
+ void AddArgument(const char* argument) {
+ char* const duplicate = posix::StrDup(argument);
+ args_.insert(args_.end() - 1, duplicate);
+ }
+
+ // Inserts duplicates of all |arguments|, in order, before the terminator.
+ template <typename Str>
+ void AddArguments(const ::std::vector<Str>& arguments) {
+ for (const Str& argument : arguments) {
+ args_.insert(args_.end() - 1, posix::StrDup(argument.c_str()));
+ }
+ }
+
+ // Pointer to the first entry of the null-terminated array.
+ char* const* Argv() { return args_.data(); }
+
+ // Number of stored arguments, not counting the terminating null.
+ int size() {
+ const int entry_count = static_cast<int>(args_.size());
+ return entry_count - 1;
+ }
+
+ private:
+ std::vector<char*> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member and appends the child's stderr to captured_stderr_.
+int FuchsiaDeathTest::Wait() {
+ const int kProcessKey = 0;  // Port packet key: child process terminated.
+ const int kSocketKey = 1;  // Port packet key: stderr socket readable or peer closed.
+ const int kExceptionKey = 2;  // Port packet key: exception channel readable.
+
+ if (!spawned()) return 0;
+
+ // Create a port to wait for socket/task/exception events.
+ zx_status_t status_zx;
+ zx::port port;
+ status_zx = zx::port::create(0, &port);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ // Register to wait for the child process to terminate.
+ status_zx =
+ child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ // Register to wait for the socket to be readable or closed.
+ status_zx = stderr_socket_.wait_async(
+ port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ // Register to wait for an exception.
+ status_zx = exception_channel_.wait_async(port, kExceptionKey,
+ ZX_CHANNEL_READABLE, 0);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ bool process_terminated = false;
+ bool socket_closed = false;
+ do {
+ zx_port_packet_t packet = {};
+ status_zx = port.wait(zx::time::infinite(), &packet);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ if (packet.key == kExceptionKey) {
+ // Process encountered an exception. Kill it directly rather than
+ // letting other handlers process the event. We will get a kProcessKey
+ // event when the process actually terminates.
+ status_zx = child_process_.kill();
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+ } else if (packet.key == kProcessKey) {
+ // Process terminated.
+ GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+ GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED);
+ process_terminated = true;
+ } else if (packet.key == kSocketKey) {
+ GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+ if (packet.signal.observed & ZX_SOCKET_READABLE) {
+ // Read data from the socket in kBufferSize chunks until a read fails.
+ constexpr size_t kBufferSize = 1024;
+ do {
+ size_t old_length = captured_stderr_.length();
+ size_t bytes_read = 0;
+ captured_stderr_.resize(old_length + kBufferSize);  // Make room for one chunk.
+ status_zx =
+ stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+ kBufferSize, &bytes_read);
+ captured_stderr_.resize(old_length + bytes_read);  // Trim to what was actually read.
+ } while (status_zx == ZX_OK);
+ if (status_zx == ZX_ERR_PEER_CLOSED) {
+ socket_closed = true;
+ } else {
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);  // No more data for now: re-register the wait.
+ status_zx = stderr_socket_.wait_async(
+ port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+ }
+ } else {
+ GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED);
+ socket_closed = true;
+ }
+ }
+ } while (!process_terminated && !socket_closed);  // Leaves the loop as soon as either flag is set.
+
+ ReadAndInterpretStatusByte();
+
+ zx_info_process_t buffer;
+ status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+ nullptr, nullptr);
+ GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+ GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+ set_status(static_cast<int>(buffer.return_code));
+ return status();
+}
+
+// The AssumeRole process for a Fuchsia death test. It creates a child
+// process with the same executable as the current process to run the
+// death test. The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
+ const UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const TestInfo* const info = impl->current_test_info();
+ const int death_test_index = info->result()->death_test_count();
+
+ if (flag != nullptr) {
+ // ParseInternalRunDeathTestFlag() has performed all the necessary
+ // processing.
+ set_write_fd(kFuchsiaReadPipeFd);  // Same fd id that is passed to the child via PA_HND(PA_FD, ...) below.
+ return EXECUTE_TEST;
+ }
+
+ // Flush the log buffers since the log streams are shared with the child.
+ FlushInfoLog();
+
+ // Build the child process command line.
+ const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ kInternalRunDeathTestFlag + "=" + file_ +
+ "|" + StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index);
+ Arguments args;
+ args.AddArguments(GetInjectableArgvs());
+ args.AddArgument(filter_flag.c_str());
+ args.AddArgument(internal_flag.c_str());
+
+ // Build the pipe for communication with the child.
+ zx_status_t status;
+ zx_handle_t child_pipe_handle;
+ int child_pipe_fd;
+ status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle);
+ GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+ set_read_fd(child_pipe_fd);
+
+ // Set the pipe handle for the child.
+ fdio_spawn_action_t spawn_actions[2] = {};
+ fdio_spawn_action_t* add_handle_action = &spawn_actions[0];
+ add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+ add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
+ add_handle_action->h.handle = child_pipe_handle;
+
+ // Create a socket pair that will be used to receive the child process' stderr.
+ zx::socket stderr_producer_socket;
+ status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+ GTEST_DEATH_TEST_CHECK_(status >= 0);
+ int stderr_producer_fd = -1;
+ status =
+ fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
+ GTEST_DEATH_TEST_CHECK_(status >= 0);
+
+ // Sets the producer fd's file status flags to 0. NOTE(review): the original comment said "nonblocking", but F_SETFL with 0 clears O_NONBLOCK - confirm intent.
+ GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
+
+ fdio_spawn_action_t* add_stderr_action = &spawn_actions[1];
+ add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
+ add_stderr_action->fd.local_fd = stderr_producer_fd;
+ add_stderr_action->fd.target_fd = STDERR_FILENO;
+
+ // Create a child job.
+ zx_handle_t child_job = ZX_HANDLE_INVALID;
+ status = zx_job_create(zx_job_default(), 0, &child_job);
+ GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+ zx_policy_basic_t policy;
+ policy.condition = ZX_POL_NEW_ANY;
+ policy.policy = ZX_POL_ACTION_ALLOW;
+ status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+ &policy, 1);
+ GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+ // Create an exception channel attached to the |child_job|, to allow
+ // us to suppress the system default exception handler from firing.
+ status = zx_task_create_exception_channel(
+ child_job, 0, exception_channel_.reset_and_get_address());
+ GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+ // Spawn the child process, applying both entries of spawn_actions.
+ status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+ args.Argv(), nullptr, 2, spawn_actions,
+ child_process_.reset_and_get_address(), nullptr);
+ GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+ set_spawned(true);
+ return OVERSEE_TEST;
+}
+
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }  // Returns a copy of the stderr text accumulated by Wait().
+
+#else // We are neither on Windows, nor on Fuchsia.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface. Only the AssumeRole method is
+// left undefined; subclasses supply it and record the child via set_child_pid().
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+ ForkingDeathTest(const char* statement, Matcher<const std::string&> matcher);
+
+ // All of these virtual functions are inherited from DeathTest.
+ int Wait() override;
+
+ protected:
+ void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }  // Records the PID that Wait() passes to waitpid().
+
+ private:
+ // PID of child process during death test; 0 in the child process itself; -1 as set by the constructor.
+ pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest, forwarding the statement and matcher to DeathTestImpl; child_pid_ starts out as -1.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement,
+ Matcher<const std::string&> matcher)
+ : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member. The returned value is the raw status filled in by waitpid().
+int ForkingDeathTest::Wait() {
+ if (!spawned()) return 0;
+
+ ReadAndInterpretStatusByte();
+
+ int status_value;
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+ set_status(status_value);
+ return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process (no exec of a fresh program image is involved).
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+ NoExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher)
+ : ForkingDeathTest(a_statement, std::move(matcher)) {}
+ TestRole AssumeRole() override;
+};
+
+// The AssumeRole process for a fork-and-run death test. It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+ const size_t thread_count = GetThreadCount();
+ if (thread_count != 1) {
+ GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+ }
+
+ int pipe_fd[2];
+ GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+ DeathTest::set_last_death_test_message("");
+ CaptureStderr();
+ // When we fork the process below, the log file buffers are copied, but the
+ // file descriptors are shared. We flush all log files here so that closing
+ // the file descriptors in the child process doesn't throw off the
+ // synchronization between descriptors and buffers in the parent process.
+ // This is as close to the fork as possible to avoid a race condition in case
+ // there are multiple threads running before the death test, and another
+ // thread writes to the log file.
+ FlushInfoLog();
+
+ const pid_t child_pid = fork();
+ GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+ set_child_pid(child_pid);
+ if (child_pid == 0) {
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));  // Child: the read end is not used here.
+ set_write_fd(pipe_fd[1]);
+ // Redirects all logging to stderr in the child process to prevent
+ // concurrent writes to the log files. We capture stderr in the parent
+ // process and append the child process' output to a log.
+ LogToStderr();
+ // Event forwarding to the listeners of event listener API must be shut
+ // down in death test subprocesses.
+ GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+ g_in_fast_death_test_child = true;
+ return EXECUTE_TEST;
+ } else {
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));  // Parent: the write end is not used here.
+ set_read_fd(pipe_fd[0]);
+ set_spawned(true);
+ return OVERSEE_TEST;
+ }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+ ExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+ const char* file, int line)
+ : ForkingDeathTest(a_statement, std::move(matcher)),
+ file_(file),
+ line_(line) {}
+ TestRole AssumeRole() override;
+
+ private:
+ static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {  // Argument list for the child: GetInjectableArgvs() plus optional extras.
+ ::std::vector<std::string> args = GetInjectableArgvs();
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+ ::std::vector<std::string> extra_args =
+ GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+ args.insert(args.end(), extra_args.begin(), extra_args.end());  // Extras are appended after the injectable args.
+#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+ return args;
+ }
+ // The name of the file in which the death test is located.
+ const char* const file_;
+ // The line number on which the death test is located.
+ const int line_;
+};
+
+// Accumulates duplicated command-line arguments in a null-terminated
+// array that can be handed out as an argv-style pointer.
+class Arguments {
+ public:
+ // Starts out holding only the terminating null entry.
+ Arguments() { args_.push_back(nullptr); }
+
+ // Frees every stored entry (the terminating null included).
+ ~Arguments() {
+ for (char* stored : args_) free(stored);
+ }
+
+ // Inserts a duplicate of |argument| just before the terminating null.
+ void AddArgument(const char* argument) {
+ char* const duplicate = posix::StrDup(argument);
+ args_.insert(args_.end() - 1, duplicate);
+ }
+
+ // Inserts duplicates of all |arguments|, in order, before the terminator.
+ template <typename Str>
+ void AddArguments(const ::std::vector<Str>& arguments) {
+ for (const Str& argument : arguments) {
+ args_.insert(args_.end() - 1, posix::StrDup(argument.c_str()));
+ }
+ }
+
+ // Pointer to the first entry of the null-terminated array.
+ char* const* Argv() { return args_.data(); }
+
+ private:
+ std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process; it is handed to ExecDeathTestChildMain() as a void*.
+struct ExecDeathTestArgs {
+ char* const* argv; // Command-line arguments for the child's call to exec
+ int close_fd; // File descriptor to close; the read end of a pipe
+};
+
+#if GTEST_OS_QNX
+extern "C" char** environ;
+#else // GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed (or fork()-ed) process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions. NOTE(review): the failure paths below do build std::string messages.
+static int ExecDeathTestChildMain(void* child_arg) {
+ ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+ // We need to execute the test program in the same environment where
+ // it was originally invoked. Therefore we change to the original
+ // working directory first.
+ const char* const original_dir =
+ UnitTest::GetInstance()->original_working_dir();
+ // We can safely call chdir() as it's a direct system call.
+ if (chdir(original_dir) != 0) {
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
+ return EXIT_FAILURE;
+ }
+
+ // We can safely call execv() as it's almost a direct system call. We
+ // cannot use execvp() as it's a libc function and thus potentially
+ // unsafe. Since execv() doesn't search the PATH, the user must
+ // invoke the test program via a valid path that contains at least
+ // one path separator.
+ execv(args->argv[0], args->argv);
+ DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +  // Reached only if execv() returned, i.e. failed.
+ original_dir + " failed: " + GetLastErrnoDescription());
+ return EXIT_FAILURE;
+}
+#endif // GTEST_OS_QNX
+
+#if GTEST_HAS_CLONE
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// the correct answer.
+static void StackLowerThanAddress(const void* ptr,
+ bool* result) GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
+// HWAddressSanitizer adds a random tag to the MSB of the local variable address,
+// making the comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static void StackLowerThanAddress(const void* ptr, bool* result) {
+ int dummy = 0;  // Local whose address stands in for this call's stack position.
+ *result = std::less<const void*>()(&dummy, ptr);  // true if this call's local lies below |ptr|.
+}
+
+// Returns true if a callee's local is at a lower address than a caller's local. Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static bool StackGrowsDown() {
+ int dummy = 0;  // Caller-side local to compare against the callee's local.
+ bool result;
+ StackLowerThanAddress(&dummy, &result);
+ return result;
+}
+#endif // GTEST_HAS_CLONE
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test. The
+// implementation uses fork(2) + exec. On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe. On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead. The function dies with an error message if
+// anything goes wrong. Returns the PID of the spawned child.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+ ExecDeathTestArgs args = {argv, close_fd};
+ pid_t child_pid = -1;
+
+#if GTEST_OS_QNX
+ // Obtains the current directory and sets it to be closed in the child
+ // process.
+ const int cwd_fd = open(".", O_RDONLY);
+ GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+ // We need to execute the test program in the same environment where
+ // it was originally invoked. Therefore we change to the original
+ // working directory first.
+ const char* const original_dir =
+ UnitTest::GetInstance()->original_working_dir();
+ // We can safely call chdir() as it's a direct system call.
+ if (chdir(original_dir) != 0) {
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
+ return EXIT_FAILURE;  // NOTE(review): EXIT_FAILURE returned as a pid_t; presumably unreachable if DeathTestAbort() does not return - confirm.
+ }
+
+ int fd_flags;
+ // Set close_fd to be closed after spawn.
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
+ struct inheritance inherit = {0};
+ // spawn is a system call.
+ child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
+ // Restores the current working directory.
+ GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#else // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+ // When a SIGPROF signal is received while fork() or clone() are executing,
+ // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+ // it after the call to fork()/clone() is complete.
+ struct sigaction saved_sigprof_action;
+ struct sigaction ignore_sigprof_action;
+ memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+ sigemptyset(&ignore_sigprof_action.sa_mask);
+ ignore_sigprof_action.sa_handler = SIG_IGN;
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif // GTEST_OS_LINUX
+
+#if GTEST_HAS_CLONE
+ const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
+
+ if (!use_fork) {
+ static const bool stack_grows_down = StackGrowsDown();
+ const auto stack_size = static_cast<size_t>(getpagesize() * 2);
+ // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+ void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+ // Maximum stack alignment in bytes: For a downward-growing stack, this
+ // amount is subtracted from size of the stack space to get an address
+ // that is within the stack space and is aligned on all systems we care
+ // about. As far as I know there is no ABI with stack alignment greater
+ // than 64. We assume stack and stack_size already have alignment of
+ // kMaxStackAlignment.
+ const size_t kMaxStackAlignment = 64;
+ void* const stack_top =
+ static_cast<char*>(stack) +
+ (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+ GTEST_DEATH_TEST_CHECK_(
+ static_cast<size_t>(stack_size) > kMaxStackAlignment &&
+ reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+ child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+ GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);  // Unmaps the parent's mapping of the temporary child stack.
+ }
+#else
+ const bool use_fork = true;
+#endif // GTEST_HAS_CLONE
+
+ if (use_fork && (child_pid = fork()) == 0) {
+ ExecDeathTestChildMain(&args);  // In the forked child: exec the test program.
+ _exit(0);
+ }
+#endif // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &saved_sigprof_action, nullptr));  // Restores the SIGPROF disposition saved above.
+#endif // GTEST_OS_LINUX
+
+ GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+ return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test. It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+ const UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const TestInfo* const info = impl->current_test_info();
+ const int death_test_index = info->result()->death_test_count();
+
+ if (flag != nullptr) {  // Presumably set only in the re-executed child; it carries the pipe's write fd.
+ set_write_fd(flag->write_fd());
+ return EXECUTE_TEST;
+ }
+
+ int pipe_fd[2];
+ GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+ // Clear the close-on-exec flag on the write end of the pipe, lest
+ // it be closed when the child process does an exec:
+ GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+ const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" +
+ StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index) + "|" +
+ StreamableToString(pipe_fd[1]);
+ Arguments args;
+ args.AddArguments(GetArgvsForDeathTestChildProcess());
+ args.AddArgument(filter_flag.c_str());
+ args.AddArgument(internal_flag.c_str());
+
+ DeathTest::set_last_death_test_message("");
+
+ CaptureStderr();
+ // See the comment in NoExecDeathTest::AssumeRole for why the next line
+ // is necessary.
+ FlushInfoLog();
+
+ const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);  // pipe_fd[0] is the fd the child closes before exec.
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));  // Parent closes its copy of the write end.
+ set_child_pid(child_pid);
+ set_read_fd(pipe_fd[0]);
+ set_spawned(true);
+ return OVERSEE_TEST;
+}
+
+#endif // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address. If the test should be
+// skipped, sets that pointer to NULL. Returns true, unless the
+// flag is set to an invalid value.
+//
+// Also returns false, after recording a message through
+// DeathTest::set_last_death_test_message, when the running death test index
+// is greater than the index carried by the internal-run flag.
+bool DefaultDeathTestFactory::Create(const char* statement,
+ Matcher<const std::string&> matcher,
+ const char* file, int line,
+ DeathTest** test) {
+ UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ // NOTE(review): current_test_info() is dereferenced without a null check;
+ // presumably this is only reached while a test is running -- confirm.
+ const int death_test_index =
+ impl->current_test_info()->increment_death_test_count();
+
+ if (flag != nullptr) {
+ if (death_test_index > flag->index()) {
+ DeathTest::set_last_death_test_message(
+ "Death test count (" + StreamableToString(death_test_index) +
+ ") somehow exceeded expected maximum (" +
+ StreamableToString(flag->index()) + ")");
+ return false;
+ }
+
+ // Only the death test whose file, line and index all match the flag is
+ // created; any other one is skipped by handing back a null pointer.
+ if (!(flag->file() == file && flag->line() == line &&
+ flag->index() == death_test_index)) {
+ *test = nullptr;
+ return true;
+ }
+ }
+
+ // Each platform branch below opens an if / else-if chain on the style flag;
+ // the chain is completed by the shared `else` that follows the #endif.
+#if GTEST_OS_WINDOWS
+
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
+ *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
+ }
+
+#elif GTEST_OS_FUCHSIA
+
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
+ *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
+ }
+
+#else
+
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
+ *test = new ExecDeathTest(statement, std::move(matcher), file, line);
+ } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
+ *test = new NoExecDeathTest(statement, std::move(matcher));
+ }
+
+#endif // GTEST_OS_WINDOWS
+
+ else { // NOLINT - this is more readable than unbalanced brackets inside #if.
+ DeathTest::set_last_death_test_message("Unknown death test style \"" +
+ GTEST_FLAG_GET(death_test_style) +
+ "\" encountered");
+ return false;
+ }
+
+ return true;
+}
+
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+//
+// parent_process_id identifies the process that owns the two handles;
+// write_handle_as_size_t and event_handle_as_size_t are those handles'
+// values as seen by that process. Any failure is reported through
+// DeathTestAbort.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  // ::OpenProcess signals failure by returning NULL rather than
+  // INVALID_HANDLE_VALUE. Both values are rejected so that a failed open is
+  // reported here instead of showing up later as a DuplicateHandle failure.
+  if (parent_process_handle.Get() == nullptr ||
+      parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // Wrap the duplicated pipe handle in a C runtime file descriptor.
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+#endif // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+//
+// The flag value is split on '|'. fields[0] is passed through unparsed as the
+// first constructor argument; the remaining fields are parsed as natural
+// numbers, and a malformed value is reported through DeathTestAbort.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+ if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
+
+ // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+ // can use it here.
+ int line = -1;
+ int index = -1;
+ ::std::vector< ::std::string> fields;
+ SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
+ int write_fd = -1;
+
+#if GTEST_OS_WINDOWS
+
+ // Windows: six fields -- fields[0], line, index, parent process id, write
+ // handle, event handle. The descriptor is rebuilt from the two handles.
+ unsigned int parent_process_id = 0;
+ size_t write_handle_as_size_t = 0;
+ size_t event_handle_as_size_t = 0;
+
+ if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &parent_process_id) ||
+ !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+ !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
+ }
+ write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
+ event_handle_as_size_t);
+
+#elif GTEST_OS_FUCHSIA
+
+ // Fuchsia: three fields -- fields[0], line, index. write_fd keeps its
+ // initial value of -1.
+ if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
+ }
+
+#else
+
+ // Other platforms: four fields -- fields[0], line, index, write fd.
+ if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &write_fd)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
+ }
+
+#endif // GTEST_OS_WINDOWS
+
+ return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+} // namespace internal
+
+#endif // GTEST_HAS_DEATH_TEST
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc
new file mode 100644
index 0000000000..f6ee90cdb7
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -0,0 +1,367 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-filepath.h"
+
+#include <stdlib.h>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_WINDOWS_MOBILE
+#include <windows.h>
+#elif GTEST_OS_WINDOWS
+#include <direct.h>
+#include <io.h>
+#else
+#include <limits.h>
+
+#include <climits> // Some Linux distributions define PATH_MAX here.
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+#include "gtest/internal/gtest-string.h"
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+#define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif // GTEST_OS_WINDOWS
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+#else
+const char kCurrentDirectoryString[] = ".\\";
+#endif // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif // GTEST_OS_WINDOWS
+
+// Tells whether `c` is accepted as a path separator: kPathSeparator always,
+// and kAlternatePathSeparator as well when GTEST_HAS_ALT_PATH_SEP_ is set.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  if (c == kAlternatePathSeparator) {
+    return true;
+  }
+#endif
+  return c == kPathSeparator;
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+// On the platforms named in the first #if branch this returns
+// kCurrentDirectoryString instead, and under GTEST_OS_NACL a failed getcwd()
+// also yields kCurrentDirectoryString rather than "".
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+ GTEST_OS_XTENSA
+ // These platforms do not have a current directory, so we just return
+ // something reasonable.
+ return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+ // The buffer is zero-initialized and one byte larger than GTEST_PATH_MAX_.
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
+ return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
+#else
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
+ char* result = getcwd(cwd, sizeof(cwd));
+#if GTEST_OS_NACL
+ // getcwd will likely fail in NaCl due to the sandbox, so return something
+ // reasonable. The user may have provided a shim implementation for getcwd,
+ // however, so fallback only when failure is detected.
+ return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
+#endif // GTEST_OS_NACL
+ // Not reached when GTEST_OS_NACL is set, because of the return above.
+ return FilePath(result == nullptr ? "" : cwd);
+#endif // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Strips "." + extension from the end of the path, matching the extension
+// case-insensitively, and returns the result as a new FilePath.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). When the path does not end with that extension, a
+// copy of the original FilePath is returned.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  const std::string suffix = std::string(".") + extension;
+  if (!String::EndsWithCaseInsensitive(pathname_, suffix)) {
+    return *this;
+  }
+  const size_t stem_length = pathname_.length() - suffix.length();
+  return FilePath(pathname_.substr(0, stem_length));
+}
+
+// Locates the right-most path separator in the FilePath and returns a
+// pointer to it, or NULL when the path contains no separator. When
+// GTEST_HAS_ALT_PATH_SEP_ is set, kAlternatePathSeparator counts as a
+// separator too (on Windows, for example, both '/' and '\' are valid).
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const primary = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const alternate = strrchr(c_str(), kAlternatePathSeparator);
+  // Handle the NULL cases first: comparing two pointers of which only one
+  // is NULL is undefined.
+  if (alternate == nullptr) {
+    return primary;
+  }
+  if (primary == nullptr) {
+    return alternate;
+  }
+  return alternate > primary ? alternate : primary;
+#else
+  return primary;
+#endif
+}
+
+// Drops everything up to and including the last path separator.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). A path without any separator ("just_a_file") is returned
+// unmodified; a path that ends in a separator ("just_a_dir/") yields an
+// empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == nullptr) {
+    return *this;
+  }
+  return FilePath(separator + 1);
+}
+
+// RemoveFileName returns the directory part of the path: everything up to
+// and including the last path separator.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// A path without any separator, like "a_file", yields
+// kCurrentDirectoryString ("./" or, on Windows, ".\\"). A path whose only
+// separator is the leading one, like "/a_file", yields that separator ("/").
+// A path that already ends in a separator, like "just/a/dir/", is returned
+// unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == nullptr) {
+    return FilePath(kCurrentDirectoryString);
+  }
+  // Keep the separator itself, hence the +1.
+  const size_t dir_length = static_cast<size_t>(separator + 1 - c_str());
+  return FilePath(std::string(c_str(), dir_length));
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Builds "directory/base_name.extension", inserting "_<number>" before the
+// dot when number is non-zero.
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". With number = 12 it returns
+// "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name, int number,
+                                const char* extension) {
+  std::string stem = base_name.string();
+  if (number != 0) {
+    stem += "_" + StreamableToString(number);
+  }
+  return ConcatPaths(directory, FilePath(stem + "." + extension));
+}
+
+// Joins directory and relative_path with exactly one kPathSeparator between
+// them; an empty directory returns relative_path unchanged.
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty()) {
+    return relative_path;
+  }
+  std::string joined = directory.RemoveTrailingPathSeparator().string();
+  joined += kPathSeparator;
+  joined += relative_path.string();
+  return FilePath(joined);
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+ // Query the attributes through a UTF-16 copy of the path; the copy is
+ // released with delete[] once the query is done.
+ LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+ const DWORD attributes = GetFileAttributes(unicode);
+ delete[] unicode;
+ return attributes != kInvalidFileAttributes;
+#else
+ // The path exists when posix::Stat succeeds (returns 0).
+ posix::StatStruct file_stat{};
+ return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+ bool result = false;
+#if GTEST_OS_WINDOWS
+ // Don't strip off trailing separator if path is a root directory on
+ // Windows (like "C:\\").
+ const FilePath& path(IsRootDirectory() ? *this
+ : RemoveTrailingPathSeparator());
+#else
+ // On other platforms the path is queried exactly as stored.
+ const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+ // Query the attributes through a UTF-16 copy of the path; the copy is
+ // released with delete[] once the query is done.
+ LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+ const DWORD attributes = GetFileAttributes(unicode);
+ delete[] unicode;
+ if ((attributes != kInvalidFileAttributes) &&
+ (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ result = true;
+ }
+#else
+ // True only when posix::Stat succeeds and posix::IsDir accepts the result.
+ posix::StatStruct file_stat{};
+ result =
+ posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+ return result;
+}
+
+// Tells whether pathname names a root directory. On Windows that is an
+// absolute path of exactly three characters (Windows has one root directory
+// per disk drive); elsewhere it is a single path separator.
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  if (pathname_.length() != 3) {
+    return false;
+  }
+  return IsAbsolutePath();
+#else
+  if (pathname_.length() != 1) {
+    return false;
+  }
+  return IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Tells whether pathname is absolute. On Windows that means an ASCII drive
+// letter, a ':' and a path separator at the start; elsewhere it means the
+// first character is a path separator.
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  if (pathname_.length() < 3) {
+    return false;
+  }
+  const char drive = name[0];
+  const bool is_drive_letter =
+      (drive >= 'a' && drive <= 'z') || (drive >= 'A' && drive <= 'Z');
+  return is_drive_letter && name[1] == ':' && IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Picks a pathname that does not currently exist. The first candidate is
+// directory/base_name.extension; if that exists, the candidates
+// directory/base_name_<number>.extension are tried with number = 1, 2, ...
+// until one is found that does not exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  FilePath candidate;
+  for (int number = 0;; ++number) {
+    candidate.Set(MakeFileName(directory, base_name, number, extension));
+    if (!candidate.FileOrDirectoryExists()) {
+      break;
+    }
+  }
+  return candidate;
+}
+
+// Tells whether the FilePath is written as a directory, i.e. is non-empty
+// and ends with a path separator. This is a purely textual test: it does NOT
+// check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  if (pathname_.empty()) {
+    return false;
+  }
+  const char last_char = pathname_.c_str()[pathname_.length() - 1];
+  return IsPathSeparator(last_char);
+}
+
+// Creates every missing directory along the path. Returns true if successful
+// or if the directories already exist; returns false if the path is not
+// written as a directory (see IsDirectory) or if a directory cannot be
+// created for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!IsDirectory()) {
+    return false;
+  }
+  if (pathname_.empty() || DirectoryExists()) {
+    return true;
+  }
+
+  // Make sure the parent exists first, then create this directory.
+  const FilePath parent(RemoveTrailingPathSeparator().RemoveFileName());
+  if (!parent.CreateDirectoriesRecursively()) {
+    return false;
+  }
+  return CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+ // Each branch leaves `result` at -1 on failure, so the check after the
+ // #endif is shared by all of them.
+#if GTEST_OS_WINDOWS_MOBILE
+ // CreateDirectory is given a UTF-16 copy of the path with the trailing
+ // separator removed; the copy is released with delete[].
+ FilePath removed_sep(this->RemoveTrailingPathSeparator());
+ LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+ int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
+ delete[] unicode;
+#elif GTEST_OS_WINDOWS
+ int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+ // do nothing
+ int result = 0;
+#else
+ int result = mkdir(pathname_.c_str(), 0777);
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+ if (result == -1) {
+ return this->DirectoryExists(); // An error is OK if the directory exists.
+ }
+ return true; // No error.
+}
+
+// Returns the path without its final character when that character is a
+// path separator; otherwise returns the path unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  if (!IsDirectory()) {
+    return *this;
+  }
+  return FilePath(pathname_.substr(0, pathname_.length() - 1));
+}
+
+// Collapses every run of path separators into a single kPathSeparator,
+// rewriting pathname_ in place. For example, "bar///foo" becomes "bar/foo".
+// Other redundancies, such as "." or ".." components, are left untouched.
+void FilePath::Normalize() {
+  // write_pos never overtakes read_pos, so each character is read before
+  // its slot can be overwritten.
+  size_t write_pos = 0;
+  for (size_t read_pos = 0; read_pos < pathname_.length(); ++read_pos) {
+    const char current = pathname_[read_pos];
+    if (IsPathSeparator(current)) {
+      const bool follows_separator =
+          write_pos > 0 && pathname_[write_pos - 1] == kPathSeparator;
+      if (follows_separator) {
+        continue;  // Redundant separator: drop it.
+      }
+      pathname_[write_pos++] = kPathSeparator;
+    } else {
+      pathname_[write_pos++] = current;
+    }
+  }
+
+  // Discard the leftover tail behind the compacted text.
+  pathname_.erase(write_pos);
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
new file mode 100644
index 0000000000..0b9e929c68
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -0,0 +1,1212 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+// This file contains purely Google Test's internal implementation. Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+
+#ifndef _WIN32_WCE
+#include <errno.h>
+#endif // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h> // For strtoll/_strtoul64/malloc/free.
+#include <string.h> // For memmove.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+#include <windows.h> // NOLINT
+#endif // GTEST_OS_WINDOWS
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace testing {
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library. This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true if and only if Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information. N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe. Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
+
+// Maps the --gtest_random_seed flag value onto a seed in the range
+// [1, kMaxRandomSeed]. A flag value of 0 means the seed is derived from
+// GetTimeInMillis() instead of from the flag.
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  unsigned int raw_seed;
+  if (random_seed_flag == 0) {
+    raw_seed = static_cast<unsigned int>(GetTimeInMillis());
+  } else {
+    raw_seed = static_cast<unsigned int>(random_seed_flag);
+  }
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const unsigned int range = static_cast<unsigned int>(kMaxRandomSeed);
+  return static_cast<int>((raw_seed - 1U) % range) + 1;
+}
+
+// Advances to the next valid random seed, wrapping from kMaxRandomSeed back
+// to 1. 'seed' must itself be valid, i.e. in [1, kMaxRandomSeed]; this is
+// enforced with GTEST_CHECK_.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  if (seed >= kMaxRandomSeed) {
+    return 1;
+  }
+  return seed + 1;
+}
+
+// Snapshots the value of every Google Test flag listed below when
+// constructed, and writes the saved values back when destroyed.
+class GTestFlagSaver {
+ public:
+  // The c'tor. Captures each flag into its matching member; the initializers
+  // are listed in member declaration order.
+  GTestFlagSaver()
+      : also_run_disabled_tests_(GTEST_FLAG_GET(also_run_disabled_tests)),
+        break_on_failure_(GTEST_FLAG_GET(break_on_failure)),
+        catch_exceptions_(GTEST_FLAG_GET(catch_exceptions)),
+        color_(GTEST_FLAG_GET(color)),
+        death_test_style_(GTEST_FLAG_GET(death_test_style)),
+        death_test_use_fork_(GTEST_FLAG_GET(death_test_use_fork)),
+        fail_fast_(GTEST_FLAG_GET(fail_fast)),
+        filter_(GTEST_FLAG_GET(filter)),
+        internal_run_death_test_(GTEST_FLAG_GET(internal_run_death_test)),
+        list_tests_(GTEST_FLAG_GET(list_tests)),
+        output_(GTEST_FLAG_GET(output)),
+        brief_(GTEST_FLAG_GET(brief)),
+        print_time_(GTEST_FLAG_GET(print_time)),
+        print_utf8_(GTEST_FLAG_GET(print_utf8)),
+        random_seed_(GTEST_FLAG_GET(random_seed)),
+        repeat_(GTEST_FLAG_GET(repeat)),
+        recreate_environments_when_repeating_(
+            GTEST_FLAG_GET(recreate_environments_when_repeating)),
+        shuffle_(GTEST_FLAG_GET(shuffle)),
+        stack_trace_depth_(GTEST_FLAG_GET(stack_trace_depth)),
+        stream_result_to_(GTEST_FLAG_GET(stream_result_to)),
+        throw_on_failure_(GTEST_FLAG_GET(throw_on_failure)) {}
+
+  // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS.
+  // Restores every flag from the value captured by the c'tor.
+  ~GTestFlagSaver() {
+    GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+    GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+    GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+    GTEST_FLAG_SET(color, color_);
+    GTEST_FLAG_SET(death_test_style, death_test_style_);
+    GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+    GTEST_FLAG_SET(fail_fast, fail_fast_);
+    GTEST_FLAG_SET(filter, filter_);
+    GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+    GTEST_FLAG_SET(list_tests, list_tests_);
+    GTEST_FLAG_SET(output, output_);
+    GTEST_FLAG_SET(brief, brief_);
+    GTEST_FLAG_SET(print_time, print_time_);
+    GTEST_FLAG_SET(print_utf8, print_utf8_);
+    GTEST_FLAG_SET(random_seed, random_seed_);
+    GTEST_FLAG_SET(repeat, repeat_);
+    GTEST_FLAG_SET(recreate_environments_when_repeating,
+                   recreate_environments_when_repeating_);
+    GTEST_FLAG_SET(shuffle, shuffle_);
+    GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+    GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+    GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
+  }
+
+ private:
+  // One member per saved flag, named after the flag with a trailing
+  // underscore.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  bool fail_fast_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool brief_;
+  bool print_time_;
+  bool print_utf8_;
+  int32_t random_seed_;
+  int32_t repeat_;
+  bool recreate_environments_when_repeating_;
+  bool shuffle_;
+  int32_t stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+ const char* shard_index_str,
+ bool in_subprocess_for_death_test);
+
+// Parses the environment variable env_var as a 32-bit integer. If it is
+// unset, returns default_val. If it is not a 32-bit integer, prints an error
+// and aborts.
+GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard. The test id
+// is some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+ int test_id);
+
+// STL container utilities.
+
+// Counts how many elements of container `c` make `predicate` return true.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Deliberately hand-rolled instead of calling std::count_if(): the version
+  // shipped with libCstd on Solaris has a non-standard signature.
+  int matches = 0;
+  for (const auto& element : c) {
+    if (predicate(element)) matches += 1;
+  }
+  return matches;
+}
+
+// Invokes `functor` once on every element of container `c`, in iteration
+// order. Whatever `functor` returns is ignored.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  for (const auto& element : c) {
+    functor(element);
+  }
+}
+
+// Looks up element `i` of `v`. An index outside [0, v.size()) yields
+// `default_value` instead.
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  const bool out_of_range = i < 0 || i >= static_cast<int>(v.size());
+  if (out_of_range) return default_value;
+  return v[static_cast<size_t>(i)];
+}
+
+// Shuffles, in place, the elements of `*v` whose indices fall in the
+// STL-style half-open range [begin, end). Passing 'end' == v->size()
+// shuffles through the last element of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle (http://en.wikipedia.org/wiki/Fisher-Yates_shuffle):
+  // walk `last` from the back of the range towards the front. At each step
+  // the slot at `last` is swapped with the slot that lies
+  // random->Generate(remaining) positions after `begin`, where `remaining`
+  // is the number of slots in [begin, last].
+  for (int last = end - 1; last > begin; --last) {
+    const uint32_t remaining = static_cast<uint32_t>(last - begin + 1);
+    const int pick = begin + static_cast<int>(random->Generate(remaining));
+    std::swap((*v)[static_cast<size_t>(pick)],
+              (*v)[static_cast<size_t>(last)]);
+  }
+}
+
+// Shuffles every element of `*v` in place by handing the full index range
+// [0, v->size()) to ShuffleRange().
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  const int element_count = static_cast<int>(v->size());
+  ShuffleRange(random, 0, element_count, v);
+}
+
+// A function for deleting an object. Handy for being used as a
+// functor.
+//
+// Releases `x` with a plain (non-array) `delete` expression.
+template <typename T>
+static void Delete(T* x) {
+ delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+ // Constructor. Stores a copy of `key` as the key to compare against.
+ //
+ // TestPropertyKeyIs has NO default constructor.
+ explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+ // Returns true if and only if the key of test_property is equal to key_.
+ bool operator()(const TestProperty& test_property) const {
+ return test_property.key() == key_;
+ }
+
+ private:
+ // The key that candidate properties are compared against.
+ std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests. It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag. E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter. If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+ // Functions for processing the gtest_output flag.
+
+ // Returns the output format, or "" for normal printed output.
+ static std::string GetOutputFormat();
+
+ // Returns the absolute path of the requested output file, or the
+ // default (test_detail.xml in the original working directory) if
+ // none was explicitly specified.
+ static std::string GetAbsolutePathToOutputFile();
+
+ // Functions for processing the gtest_filter flag.
+
+ // Returns true if and only if the user-specified filter matches the test
+ // suite name and the test name.
+ static bool FilterMatchesTest(const std::string& test_suite_name,
+ const std::string& test_name);
+
+#if GTEST_OS_WINDOWS
+ // Function for supporting the gtest_catch_exceptions flag.
+
+ // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+ // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+ // This function is useful as an __except condition.
+ static int GTestShouldProcessSEH(DWORD exception_code);
+#endif // GTEST_OS_WINDOWS
+
+ // Returns true if "name" matches the ':' separated list of glob-style
+ // filters in "filter".
+ static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present. Used by UnitTestOptions::GetAbsolutePathToOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+ OsStackTraceGetterInterface() {}
+ virtual ~OsStackTraceGetterInterface() {}
+
+ // Returns the current OS stack trace as an std::string. Parameters:
+ //
+ // max_depth - the maximum number of stack frames to be included
+ // in the trace.
+ // skip_count - the number of top frames to be skipped; doesn't count
+ // against max_depth.
+ virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+ // UponLeavingGTest() should be called immediately before Google Test calls
+ // user code. It saves some information about the current stack that
+ // CurrentStackTrace() will use to find and hide Google Test stack frames.
+ virtual void UponLeavingGTest() = 0;
+
+ // This string is inserted in place of stack frames that are part of
+ // Google Test's implementation.
+ static const char* const kElidedFramesMarker;
+
+ private:
+ OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+ OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+ delete;
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+ OsStackTraceGetter() {}
+
+ std::string CurrentStackTrace(int max_depth, int skip_count) override;
+ void UponLeavingGTest() override;
+
+ private:
+#if GTEST_HAS_ABSL
+ Mutex mutex_; // Protects all internal state.
+
+ // We save the stack frame below the frame that calls user code.
+ // We do this because the address of the frame immediately below
+ // the user code changes between the call to UponLeavingGTest()
+ // and any calls to the stack trace code from within the user code.
+ void* caller_frame_ = nullptr;
+#endif // GTEST_HAS_ABSL
+
+ OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+ OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+ const char* file;
+ int line;
+ std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+ // Implements the TestPartResultReporterInterface. Reports the test part
+ // result in the current test.
+ void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+ UnitTestImpl* const unit_test_;
+
+ DefaultGlobalTestPartResultReporter(
+ const DefaultGlobalTestPartResultReporter&) = delete;
+ DefaultGlobalTestPartResultReporter& operator=(
+ const DefaultGlobalTestPartResultReporter&) = delete;
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+ // Implements the TestPartResultReporterInterface. The implementation just
+ // delegates to the current global test part result reporter of *unit_test_.
+ void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+ UnitTestImpl* const unit_test_;
+
+ DefaultPerThreadTestPartResultReporter(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
+ DefaultPerThreadTestPartResultReporter& operator=(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
+};
+
+// The private implementation of the UnitTest class. We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+ explicit UnitTestImpl(UnitTest* parent);
+ virtual ~UnitTestImpl();
+
+ // There are two different ways to register your own TestPartResultReporter.
+ // You can register your own reporter to listen either only for test results
+ // from the current thread or for results from all threads.
+ // By default, each per-thread test result reporter just passes a new
+ // TestPartResult to the global test result reporter, which registers the
+ // test part result for the currently running test.
+
+ // Returns the global test part result reporter.
+ TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+ // Sets the global test part result reporter.
+ void SetGlobalTestPartResultReporter(
+ TestPartResultReporterInterface* reporter);
+
+ // Returns the test part result reporter for the current thread.
+ TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+ // Sets the test part result reporter for the current thread.
+ void SetTestPartResultReporterForCurrentThread(
+ TestPartResultReporterInterface* reporter);
+
+ // Gets the number of successful test suites.
+ int successful_test_suite_count() const;
+
+ // Gets the number of failed test suites.
+ int failed_test_suite_count() const;
+
+ // Gets the number of all test suites.
+ int total_test_suite_count() const;
+
+ // Gets the number of all test suites that contain at least one test
+ // that should run.
+ int test_suite_to_run_count() const;
+
+ // Gets the number of successful tests.
+ int successful_test_count() const;
+
+ // Gets the number of skipped tests.
+ int skipped_test_count() const;
+
+ // Gets the number of failed tests.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Gets the number of all tests.
+ int total_test_count() const;
+
+ // Gets the number of tests that should run.
+ int test_to_run_count() const;
+
+ // Gets the time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+ // Gets the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Returns true if and only if the unit test passed (i.e. all test suites
+ // passed).
+ bool Passed() const { return !Failed(); }
+
+ // Returns true if and only if the unit test failed (i.e. some test suite
+ // failed or something outside of all tests failed).
+ bool Failed() const {
+ return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
+ }
+
+ // Gets the i-th test suite among all the test suites, where i is a position
+ // in the order recorded by test_suite_indices_. i can range from 0 to
+ // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+ const TestSuite* GetTestSuite(int i) const {
+   // test_suite_indices_[i] is the position inside test_suites_ of the i-th
+   // suite; -1 is the sentinel for an out-of-range i.
+   const int index = GetElementOr(test_suite_indices_, i, -1);
+   // Subscript test_suites_ with the mapped position rather than with i
+   // itself, so that this accessor honours the indirection and agrees with
+   // GetMutableSuiteCase().
+   return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
+ }
+
+ // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ const TestCase* GetTestCase(int i) const { return GetTestSuite(i); }
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Gets the i-th test suite among all the test suites. i can range from 0 to
+ // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+ TestSuite* GetMutableSuiteCase(int i) {
+ const int index = GetElementOr(test_suite_indices_, i, -1);
+ return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
+ }
+
+ // Provides access to the event listener list.
+ TestEventListeners* listeners() { return &listeners_; }
+
+ // Returns the TestResult for the test that's currently running, or
+ // the TestResult for the ad hoc test if no test is running.
+ TestResult* current_test_result();
+
+ // Returns the TestResult for the ad hoc test.
+ const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+ // Sets the OS stack trace getter.
+ //
+ // Does nothing if the input and the current OS stack trace getter
+ // are the same; otherwise, deletes the old getter and makes the
+ // input the current getter.
+ void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+ // Returns the current OS stack trace getter if it is not NULL;
+ // otherwise, creates an OsStackTraceGetter, makes it the current
+ // getter, and returns it.
+ OsStackTraceGetterInterface* os_stack_trace_getter();
+
+ // Returns the current OS stack trace as an std::string.
+ //
+ // The maximum number of stack frames to be included is specified by
+ // the gtest_stack_trace_depth flag. The skip_count parameter
+ // specifies the number of top frames to be skipped, which doesn't
+ // count against the number of frames to be included.
+ //
+ // For example, if Foo() calls Bar(), which in turn calls
+ // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+ // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+ std::string CurrentOsStackTraceExceptTop(int skip_count)
+ GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
+
+ // Finds and returns a TestSuite with the given name. If one doesn't
+ // exist, creates one and returns it.
+ //
+ // Arguments:
+ //
+ // test_suite_name: name of the test suite
+ // type_param: the name of the test's type parameter, or NULL if
+ // this is not a typed or a type-parameterized test.
+ // set_up_tc: pointer to the function that sets up the test suite
+ // tear_down_tc: pointer to the function that tears down the test suite
+ TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
+ internal::SetUpTestSuiteFunc set_up_tc,
+ internal::TearDownTestSuiteFunc tear_down_tc);
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ TestCase* GetTestCase(const char* test_case_name, const char* type_param,
+ internal::SetUpTestSuiteFunc set_up_tc,
+ internal::TearDownTestSuiteFunc tear_down_tc) {
+ return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
+ }
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Registers a TestInfo with the unit test by adding it to the test suite
+ // named by the TestInfo.
+ //
+ // Arguments:
+ //
+ //   set_up_tc:    pointer to the function that sets up the test suite
+ //   tear_down_tc: pointer to the function that tears down the test suite
+ //   test_info:    the TestInfo object
+ void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
+                  internal::TearDownTestSuiteFunc tear_down_tc,
+                  TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
+   // Thread-safe death tests need the working directory that was current
+   // when the test program was first invoked. RUN_ALL_TESTS() is too late to
+   // record it, because the user may have changed the current directory
+   // before calling it. AddTestInfo() is called to register a TEST or TEST_F
+   // before main() is reached, so the directory is captured here, once.
+   if (original_working_dir_.IsEmpty()) {
+     original_working_dir_.Set(FilePath::GetCurrentDir());
+     GTEST_CHECK_(!original_working_dir_.IsEmpty())
+         << "Failed to get the current working directory.";
+   }
+#endif  // GTEST_HAS_DEATH_TEST
+
+   // Look up the owning suite by name and type parameter, then hand the
+   // test over to it.
+   TestSuite* const owning_suite =
+       GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
+                    set_up_tc, tear_down_tc);
+   owning_suite->AddTestInfo(test_info);
+ }
+
+ // Returns ParameterizedTestSuiteRegistry object used to keep track of
+ // value-parameterized tests and instantiate and register them.
+ internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() {
+ return parameterized_test_registry_;
+ }
+
+ std::set<std::string>* ignored_parameterized_test_suites() {
+ return &ignored_parameterized_test_suites_;
+ }
+
+ // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+ // type-parameterized tests and instantiations of them.
+ internal::TypeParameterizedTestSuiteRegistry&
+ type_parameterized_test_registry() {
+ return type_parameterized_test_registry_;
+ }
+
+ // Sets the TestSuite object for the test that's currently running.
+ void set_current_test_suite(TestSuite* a_current_test_suite) {
+ current_test_suite_ = a_current_test_suite;
+ }
+
+ // Sets the TestInfo object for the test that's currently running. If
+ // current_test_info is NULL, the assertion results will be stored in
+ // ad_hoc_test_result_.
+ void set_current_test_info(TestInfo* a_current_test_info) {
+ current_test_info_ = a_current_test_info;
+ }
+
+ // Registers all parameterized tests defined using TEST_P and
+ // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
+ // combination. This method can be called more than once; it has guards
+ // protecting from registering the tests more than once. If
+ // value-parameterized tests are disabled, RegisterParameterizedTests is
+ // present but does nothing.
+ void RegisterParameterizedTests();
+
+ // Runs all tests in this UnitTest object, prints the result, and
+ // returns true if all tests are successful. If any exception is
+ // thrown during a test, this test is considered to be failed, but
+ // the rest of the tests will still be run.
+ bool RunAllTests();
+
+ // Clears the results of all tests, except the ad hoc tests.
+ void ClearNonAdHocTestResult() {
+ ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
+ }
+
+ // Clears the results of ad-hoc test assertions.
+ void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
+
+ // Adds a TestProperty to the current TestResult object when invoked in a
+ // context of a test or a test suite, or to the global property set. If the
+ // result already contains a property with the same key, the value will be
+ // updated.
+ void RecordProperty(const TestProperty& test_property);
+
+ enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
+
+ // Matches the full name of each test against the user-specified
+ // filter to decide whether the test should run, then records the
+ // result in each TestSuite and TestInfo object.
+ // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+ // based on sharding variables in the environment.
+ // Returns the number of tests that should run.
+ int FilterTests(ReactionToSharding shard_tests);
+
+ // Prints the names of the tests matching the user-specified filter flag.
+ void ListTestsMatchingFilter();
+
+ const TestSuite* current_test_suite() const { return current_test_suite_; }
+ TestInfo* current_test_info() { return current_test_info_; }
+ const TestInfo* current_test_info() const { return current_test_info_; }
+
+ // Returns the vector of environments that need to be set-up/torn-down
+ // before/after the tests are run.
+ std::vector<Environment*>& environments() { return environments_; }
+
+ // Getters for the per-thread Google Test trace stack.
+ std::vector<TraceInfo>& gtest_trace_stack() {
+ return *(gtest_trace_stack_.pointer());
+ }
+ const std::vector<TraceInfo>& gtest_trace_stack() const {
+ return gtest_trace_stack_.get();
+ }
+
+#if GTEST_HAS_DEATH_TEST
+ void InitDeathTestSubprocessControlInfo() {
+ internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+ }
+ // Returns a pointer to the parsed --gtest_internal_run_death_test
+ // flag, or NULL if that flag was not specified.
+ // This information is useful only in a death test child process.
+ // Must not be called before a call to InitGoogleTest.
+ const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+ return internal_run_death_test_flag_.get();
+ }
+
+ // Returns a pointer to the current death test factory.
+ internal::DeathTestFactory* death_test_factory() {
+ return death_test_factory_.get();
+ }
+
+ void SuppressTestEventsIfInSubprocess();
+
+ friend class ReplaceDeathTestFactory;
+#endif // GTEST_HAS_DEATH_TEST
+
+ // Initializes the event listener performing XML output as specified by
+ // UnitTestOptions. Must not be called before InitGoogleTest.
+ void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+ // Initializes the event listener for streaming test results to a socket.
+ // Must not be called before InitGoogleTest.
+ void ConfigureStreamingOutput();
+#endif
+
+ // Performs initialization dependent upon flag values obtained in
+ // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to
+ // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest
+ // this function is also called from RunAllTests. Since this function can be
+ // called more than once, it has to be idempotent.
+ void PostFlagParsingInit();
+
+ // Gets the random seed used at the start of the current test iteration.
+ int random_seed() const { return random_seed_; }
+
+ // Gets the random number generator.
+ internal::Random* random() { return &random_; }
+
+ // Shuffles all test suites, and the tests within each test suite,
+ // making sure that death tests are still run first.
+ void ShuffleTests();
+
+ // Restores the test suites and tests to their order before the first shuffle.
+ void UnshuffleTests();
+
+ // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+ // UnitTest::Run() starts.
+ bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+ friend class ::testing::UnitTest;
+
+ // Used by UnitTest::Run() to capture the state of
+ // GTEST_FLAG(catch_exceptions) at the moment it starts.
+ void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+ // The UnitTest object that owns this implementation object.
+ UnitTest* const parent_;
+
+ // The working directory when the first TEST() or TEST_F() was
+ // executed.
+ internal::FilePath original_working_dir_;
+
+ // The default test part result reporters.
+ DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+ DefaultPerThreadTestPartResultReporter
+ default_per_thread_test_part_result_reporter_;
+
+ // Points to (but doesn't own) the global test part result reporter.
+ TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+ // Protects read and write access to global_test_part_result_reporter_.
+ internal::Mutex global_test_part_result_reporter_mutex_;
+
+ // Points to (but doesn't own) the per-thread test part result reporter.
+ internal::ThreadLocal<TestPartResultReporterInterface*>
+ per_thread_test_part_result_reporter_;
+
+ // The vector of environments that need to be set-up/torn-down
+ // before/after the tests are run.
+ std::vector<Environment*> environments_;
+
+ // The vector of TestSuites in their original order. It owns the
+ // elements in the vector.
+ std::vector<TestSuite*> test_suites_;
+
+ // Provides a level of indirection for the test suite list to allow
+ // easy shuffling and restoring the test suite order. The i-th
+ // element of this vector is the index of the i-th test suite in the
+ // shuffled order.
+ std::vector<int> test_suite_indices_;
+
+ // ParameterizedTestRegistry object used to register value-parameterized
+ // tests.
+ internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+ internal::TypeParameterizedTestSuiteRegistry
+ type_parameterized_test_registry_;
+
+ // The set holding the name of parameterized
+ // test suites that may go uninstantiated.
+ std::set<std::string> ignored_parameterized_test_suites_;
+
+ // Indicates whether RegisterParameterizedTests() has been called already.
+ bool parameterized_tests_registered_;
+
+ // Index of the last death test suite registered. Initially -1.
+ int last_death_test_suite_;
+
+ // This points to the TestSuite for the currently running test. It
+ // changes as Google Test goes through one test suite after another.
+ // When no test is running, this is set to NULL and Google Test
+ // stores assertion results in ad_hoc_test_result_. Initially NULL.
+ TestSuite* current_test_suite_;
+
+ // This points to the TestInfo for the currently running test. It
+ // changes as Google Test goes through one test after another. When
+ // no test is running, this is set to NULL and Google Test stores
+ // assertion results in ad_hoc_test_result_. Initially NULL.
+ TestInfo* current_test_info_;
+
+ // Normally, a user only writes assertions inside a TEST or TEST_F,
+ // or inside a function called by a TEST or TEST_F. Since Google
+ // Test keeps track of which test is currently running, it can
+ // associate such an assertion with the test it belongs to.
+ //
+ // If an assertion is encountered when no TEST or TEST_F is running,
+ // Google Test attributes the assertion result to an imaginary "ad hoc"
+ // test, and records the result in ad_hoc_test_result_.
+ TestResult ad_hoc_test_result_;
+
+ // The list of event listeners that can be used to track events inside
+ // Google Test.
+ TestEventListeners listeners_;
+
+ // The OS stack trace getter. Will be deleted when the UnitTest
+ // object is destructed. By default, an OsStackTraceGetter is used,
+ // but the user can set this field to use a custom getter if that is
+ // desired.
+ OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+ // True if and only if PostFlagParsingInit() has been called.
+ bool post_flag_parse_init_performed_;
+
+ // The random number seed used at the beginning of the test run.
+ int random_seed_;
+
+ // Our random number generator.
+ internal::Random random_;
+
+ // The time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp_;
+
+ // How long the test took to run, in milliseconds.
+ TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+ // The decomposed components of the gtest_internal_run_death_test flag,
+ // parsed when RUN_ALL_TESTS is called.
+ std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+ std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif // GTEST_HAS_DEATH_TEST
+
+ // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+ internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+ // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+ // starts.
+ bool catch_exceptions_;
+
+ UnitTestImpl(const UnitTestImpl&) = delete;
+ UnitTestImpl& operator=(const UnitTestImpl&) = delete;
+}; // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+ return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+ char repeat, const char* regex,
+ const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a non-negative integer pointed to by the
+// number parameter. Returns true if that is possible, i.e. if the whole
+// string is a decimal number whose value fits in Integer; otherwise returns
+// false and leaves *number untouched.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+  using BiggestConvertible = unsigned long long;  // NOLINT
+
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
+  // The conversion must have stopped at a terminating NUL, and strtoull()
+  // must not have reported an error (such as overflow) through errno.
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  // The value fits in Integer only if converting it back reproduces `parsed`
+  // AND the narrowed value did not come out negative. The round trip alone is
+  // not enough for signed Integer types: a huge value whose narrowed form is
+  // negative (e.g. "18446744073709551615" stored in an int) sign-extends back
+  // to the very same bits and would otherwise be accepted as a negative
+  // "natural number". The sign test is spelled `result > 0 || parsed == 0`
+  // rather than `result >= 0` so that it is not a tautological comparison
+  // when Integer is unsigned.
+  const bool fits_in_integer =
+      static_cast<BiggestConvertible>(result) == parsed &&
+      (result > 0 || parsed == 0);
+  if (parse_success && fits_in_integer) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test users but are required for testing. This class allows our
+// tests to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+ static void RecordProperty(TestResult* test_result,
+ const std::string& xml_element,
+ const TestProperty& property) {
+ test_result->RecordProperty(xml_element, property);
+ }
+
+ static void ClearTestPartResults(TestResult* test_result) {
+ test_result->ClearTestPartResults();
+ }
+
+ static const std::vector<testing::TestPartResult>& test_part_results(
+ const TestResult& test_result) {
+ return test_result.test_part_results();
+ }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+ // Abstract base class for writing strings to a socket.
+ class AbstractSocketWriter {
+ public:
+ virtual ~AbstractSocketWriter() {}
+
+ // Sends a string to the socket.
+ virtual void Send(const std::string& message) = 0;
+
+ // Closes the socket.
+ // The default implementation does nothing.
+ virtual void CloseConnection() {}
+
+ // Sends a string and a newline to the socket.
+ // This is a single Send() call of message + "\n".
+ void SendLn(const std::string& message) { Send(message + "\n"); }
+ };
+
+ // Concrete class for actually writing strings to a socket.
+ class SocketWriter : public AbstractSocketWriter {
+ public:
+ // Stores the host and port and immediately calls MakeConnection().
+ SocketWriter(const std::string& host, const std::string& port)
+ : sockfd_(-1), host_name_(host), port_num_(port) {
+ MakeConnection();
+ }
+
+ // Closes the socket if it is still open (sockfd_ != -1).
+ ~SocketWriter() override {
+ if (sockfd_ != -1) CloseConnection();
+ }
+
+ // Sends a string to the socket.
+ // Requires an open connection (checked with GTEST_CHECK_). The message is
+ // written with a single write() call; if write() does not report exactly
+ // message.length() bytes, a warning is logged and nothing is retried.
+ void Send(const std::string& message) override {
+ GTEST_CHECK_(sockfd_ != -1)
+ << "Send() can be called only when there is a connection.";
+
+ const auto len = static_cast<size_t>(message.length());
+ if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
+ GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+ << host_name_ << ":" << port_num_;
+ }
+ }
+
+ private:
+ // Creates a client socket and connects to the server.
+ // Declared here; the definition is not part of this header.
+ void MakeConnection();
+
+ // Closes the socket.
+ // Requires an open connection; resets sockfd_ to -1 afterwards so the
+ // destructor does not close it a second time.
+ void CloseConnection() override {
+ GTEST_CHECK_(sockfd_ != -1)
+ << "CloseConnection() can be called only when there is a connection.";
+
+ close(sockfd_);
+ sockfd_ = -1;
+ }
+
+ int sockfd_; // socket file descriptor
+ const std::string host_name_;
+ const std::string port_num_;
+
+ SocketWriter(const SocketWriter&) = delete;
+ SocketWriter& operator=(const SocketWriter&) = delete;
+ }; // class SocketWriter
+
+ // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+ static std::string UrlEncode(const char* str);
+
+ // Creates a listener that streams through a new SocketWriter for host:port
+ // and immediately sends the protocol version line (see Start()).
+ StreamingListener(const std::string& host, const std::string& port)
+ : socket_writer_(new SocketWriter(host, port)) {
+ Start();
+ }
+
+ // Creates a listener that streams through the given writer. The writer is
+ // stored in a std::unique_ptr, so the listener takes ownership of it.
+ explicit StreamingListener(AbstractSocketWriter* socket_writer)
+ : socket_writer_(socket_writer) {
+ Start();
+ }
+
+ void OnTestProgramStart(const UnitTest& /* unit_test */) override {
+ SendLn("event=TestProgramStart");
+ }
+
+ void OnTestProgramEnd(const UnitTest& unit_test) override {
+ // Note that Google Test currently only reports elapsed time for each
+ // test iteration, not for the entire test program.
+ SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+ // Notify the streaming server to stop.
+ socket_writer_->CloseConnection();
+ }
+
+ void OnTestIterationStart(const UnitTest& /* unit_test */,
+ int iteration) override {
+ SendLn("event=TestIterationStart&iteration=" +
+ StreamableToString(iteration));
+ }
+
+ void OnTestIterationEnd(const UnitTest& unit_test,
+ int /* iteration */) override {
+ SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+ "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+ "ms");
+ }
+
+ // Note that "event=TestCaseStart" is a wire format and has to remain
+ // "case" for compatibility
+ // The suite name is sent as-is; it is not passed through UrlEncode().
+ void OnTestSuiteStart(const TestSuite& test_suite) override {
+ SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
+ }
+
+ // Note that "event=TestCaseEnd" is a wire format and has to remain
+ // "case" for compatibility
+ void OnTestSuiteEnd(const TestSuite& test_suite) override {
+ SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+ "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
+ "ms");
+ }
+
+ // The test name is sent as-is; it is not passed through UrlEncode().
+ void OnTestStart(const TestInfo& test_info) override {
+ SendLn(std::string("event=TestStart&name=") + test_info.name());
+ }
+
+ void OnTestEnd(const TestInfo& test_info) override {
+ SendLn("event=TestEnd&passed=" +
+ FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
+ StreamableToString((test_info.result())->elapsed_time()) + "ms");
+ }
+
+ // Streams one assertion result. The file name and the message are
+ // URL-encoded; a null file name is sent as the empty string.
+ void OnTestPartResult(const TestPartResult& test_part_result) override {
+ const char* file_name = test_part_result.file_name();
+ if (file_name == nullptr) file_name = "";
+ SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+ "&line=" + StreamableToString(test_part_result.line_number()) +
+ "&message=" + UrlEncode(test_part_result.message()));
+ }
+
+ private:
+ // Sends the given message and a newline to the socket.
+ void SendLn(const std::string& message) { socket_writer_->SendLn(message); }
+
+ // Called at the start of streaming to notify the receiver what
+ // protocol we are using.
+ void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+ // Formats a bool for the wire: "1" for true, "0" for false.
+ std::string FormatBool(bool value) { return value ? "1" : "0"; }
+
+ // Owned writer that every event is sent through.
+ const std::unique_ptr<AbstractSocketWriter> socket_writer_;
+
+ StreamingListener(const StreamingListener&) = delete;
+ StreamingListener& operator=(const StreamingListener&) = delete;
+}; // class StreamingListener
+
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+} // namespace internal
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-matchers.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-matchers.cc
new file mode 100644
index 0000000000..7e3bcc0cff
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -0,0 +1,98 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+#include "gtest/gtest-matchers.h"
+
+#include <string>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Matcher<const std::string&> from a std::string: matches any string whose
+// value equals s.
+Matcher<const std::string&>::Matcher(const std::string& s) {
+  *this = Eq(s);
+}
+
+// Matcher<const std::string&> from a C string: matches any string whose
+// value equals s.
+Matcher<const std::string&>::Matcher(const char* s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+
+// Matcher<std::string> from a std::string: matches any string whose value
+// equals s.
+Matcher<std::string>::Matcher(const std::string& s) {
+  *this = Eq(s);
+}
+
+// Matcher<std::string> from a C string: matches any string whose value
+// equals s.
+Matcher<std::string>::Matcher(const char* s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Matcher<const StringView&> from a std::string: matches any view whose
+// value equals s.
+Matcher<const internal::StringView&>::Matcher(const std::string& s) {
+  *this = Eq(s);
+}
+
+// Matcher<const StringView&> from a C string: the expected value is copied
+// into a std::string first.
+Matcher<const internal::StringView&>::Matcher(const char* s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+
+// Matcher<const StringView&> from a StringView: the expected value is copied
+// into a std::string first.
+Matcher<const internal::StringView&>::Matcher(internal::StringView s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+
+// Matcher<StringView> from a std::string: matches any view whose value
+// equals s.
+Matcher<internal::StringView>::Matcher(const std::string& s) {
+  *this = Eq(s);
+}
+
+// Matcher<StringView> from a C string: the expected value is copied into a
+// std::string first.
+Matcher<internal::StringView>::Matcher(const char* s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+
+// Matcher<StringView> from a StringView: the expected value is copied into
+// a std::string first.
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
+  const std::string expected(s);
+  *this = Eq(expected);
+}
+#endif // GTEST_INTERNAL_HAS_STRING_VIEW
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc
new file mode 100644
index 0000000000..d797fe4d58
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -0,0 +1,1394 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#if GTEST_OS_WINDOWS
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map> // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif // _MSC_VER
+#else
+#include <unistd.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
+#endif // GTEST_OS_MAC
+
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+ GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
+#endif
+
+#if GTEST_OS_QNX
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+#include <procinfo.h>
+#include <sys/types.h>
+#endif // GTEST_OS_AIX
+
+#if GTEST_OS_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+#endif // GTEST_OS_FUCHSIA
+
+#include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
+
+namespace {
+// Opens 'filename', skips the first 'field' whitespace-separated tokens and
+// returns the next token parsed as a T. The result starts out as 0, which is
+// what callers see when the file cannot be opened.
+template <typename T>
+T ReadProcFileField(const std::string& filename, int field) {
+  std::ifstream file(filename.c_str());
+  std::string skipped;
+  for (int remaining = field; remaining > 0; --remaining) {
+    file >> skipped;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+} // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+// The count is read from /proc/<pid>/stat: 19 fields are skipped and the
+// following one is parsed as the result.
+size_t GetThreadCount() {
+  Message stat_path;
+  stat_path << "/proc/" << getpid() << "/stat";
+  const std::string filename = stat_path.GetString();
+  return ReadProcFileField<size_t>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+// Returns the number of threads in the current Mach task, or 0 when
+// task_threads() does not succeed.
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  if (task_threads(task, &thread_list, &thread_count) != KERN_SUCCESS) {
+    return 0;
+  }
+  // task_threads allocates resources in thread_list and we need to free them
+  // to avoid leaks.
+  vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
+                sizeof(thread_t) * thread_count);
+  return static_cast<size_t>(thread_count);
+}
+
+#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+ GTEST_OS_NETBSD
+
+#if GTEST_OS_NETBSD
+#undef KERN_PROC
+#define KERN_PROC KERN_PROC2
+#define kinfo_proc kinfo_proc2
+#endif
+
+#if GTEST_OS_DRAGONFLY
+#define KP_NLWP(kp) (kp.kp_nthreads)
+#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#define KP_NLWP(kp) (kp.ki_numthreads)
+#elif GTEST_OS_NETBSD
+#define KP_NLWP(kp) (kp.p_nlwps)
+#endif
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  // sysctl name selecting the kinfo_proc record of the current process. On
+  // NetBSD the name additionally carries the record size and a count of 1.
+  int name[] = {
+      CTL_KERN,
+      KERN_PROC,
+      KERN_PROC_PID,
+      getpid(),
+#if GTEST_OS_NETBSD
+      sizeof(struct kinfo_proc),
+      1,
+#endif
+  };
+  u_int name_len = sizeof(name) / sizeof(name[0]);
+  struct kinfo_proc proc_info;
+  size_t proc_info_size = sizeof(proc_info);
+  const int rc = sysctl(name, name_len, &proc_info, &proc_info_size, NULL, 0);
+  if (rc != 0) {
+    return 0;
+  }
+  // KP_NLWP picks the per-OS field that holds the thread count.
+  return static_cast<size_t>(KP_NLWP(proc_info));
+}
+#elif GTEST_OS_OPENBSD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+//
+// sysctl() is called twice: once with a null buffer to learn how many bytes
+// the per-thread kinfo_proc records need, then again to fetch them. Records
+// whose p_tid is -1 are not counted.
+size_t GetThreadCount() {
+  int mib[] = {
+      CTL_KERN,
+      KERN_PROC,
+      KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+      getpid(),
+      sizeof(struct kinfo_proc),
+      0,
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+
+  // get number of structs
+  size_t size;
+  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
+    return 0;
+  }
+
+  const size_t capacity = size / static_cast<size_t>(mib[4]);
+  mib[5] = static_cast<int>(capacity);
+
+  // populate array of structs
+  // The buffer lives on the heap: a variable-length array is not standard
+  // C++, is ill-formed for a count of 0, and would put an unbounded amount
+  // of data on the stack.
+  std::unique_ptr<struct kinfo_proc[]> info(new struct kinfo_proc[capacity]);
+  if (sysctl(mib, miblen, info.get(), &size, NULL, 0)) {
+    return 0;
+  }
+
+  // exclude empty members
+  // 'size' now holds the number of bytes the kernel actually wrote.
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
+    if (info[i].p_tid != -1) nthreads++;
+  }
+  return nthreads;
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
+  // The descriptor is only needed for the query above.
+  close(fd);
+  if (status != EOK) {
+    return 0;
+  }
+  return static_cast<size_t>(process_info.num_threads);
+}
+
+#elif GTEST_OS_AIX
+
+// Returns the pi_thcount field of this process's procentry64 record, or 0
+// when getprocs64() does not return exactly one entry.
+size_t GetThreadCount() {
+  struct procentry64 entry;
+  pid_t pid = getpid();
+  const int entries_read =
+      getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
+  if (entries_read != 1) {
+    return 0;
+  }
+  return entry.pi_thcount;
+}
+
+#elif GTEST_OS_FUCHSIA
+
+// Returns the 'avail' count reported by zx_object_get_info() for the
+// ZX_INFO_PROCESS_THREADS topic of the current process, or 0 when the call
+// fails. The call is made with a buffer size of 0, so only the count is
+// requested.
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  const zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
+  if (status != ZX_OK) {
+    return 0;
+  }
+  return avail;
+}
+
+#else
+
+// Fallback for platforms without a dedicated implementation above: there's
+// no portable way to detect the number of threads, so this always returns 0
+// to indicate that the count cannot be detected.
+size_t GetThreadCount() {
+  const size_t kUnknownThreadCount = 0;
+  return kUnknownThreadCount;
+}
+
+#endif // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+// Creates a wrapper that holds INVALID_HANDLE_VALUE, i.e. no handle.
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+
+// Creates a wrapper around 'handle'; Reset() and the destructor close it when
+// it is closeable.
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
+
+// Releases the held handle via Reset().
+AutoHandle::~AutoHandle() { Reset(); }
+
+// Returns the held handle without giving it up.
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
+
+// Closes the held handle (if closeable) and holds INVALID_HANDLE_VALUE.
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+// Replaces the held handle with 'handle', closing the previous one if it is
+// closeable. Passing the handle that is already held is only tolerated when
+// that handle is not closeable.
+void AutoHandle::Reset(HANDLE handle) {
+  if (handle_ == handle) {
+    // Resetting with the same handle we already own is invalid.
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+           "and thus not allowed.";
+    return;
+  }
+  if (IsCloseable()) {
+    ::CloseHandle(handle_);
+  }
+  handle_ = handle;
+}
+
+// Returns true if the held handle is neither of the two "no handle" values.
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  const bool is_invalid =
+      handle_ == nullptr || handle_ == INVALID_HANDLE_VALUE;
+  return !is_invalid;
+}
+
+// Creates a dynamic mutex: the critical section is allocated and initialized
+// right here rather than lazily.
+Mutex::Mutex()
+    : owner_thread_id_(0),
+      type_(kDynamic),
+      critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+// Destroys the critical section of a dynamic mutex.
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  if (type_ != kDynamic) {
+    return;
+  }
+  ::DeleteCriticalSection(critical_section_);
+  delete critical_section_;
+  critical_section_ = nullptr;
+}
+
+// Acquires the mutex and records the calling thread as its owner.
+// ThreadSafeLazyInit() runs first so that a static mutex has its critical
+// section before it is entered.
+void Mutex::Lock() {
+ ThreadSafeLazyInit();
+ ::EnterCriticalSection(critical_section_);
+ // Written only after the critical section has been entered.
+ owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+// Releases the mutex. The owner id is cleared before the critical section is
+// left.
+void Mutex::Unlock() {
+ ThreadSafeLazyInit();
+ // We don't protect writing to owner_thread_id_ here, as it's the
+ // caller's responsibility to ensure that the current thread holds the
+ // mutex when this is called.
+ owner_thread_id_ = 0;
+ ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
+// The check compares the recorded owner id with the calling thread's id.
+void Mutex::AssertHeld() {
+ ThreadSafeLazyInit();
+ GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
+ << "The current thread is not holding the mutex @" << this;
+}
+
+namespace {
+
+#ifdef _MSC_VER
+// Use the RAII idiom to flag mem allocs that are intentionally never
+// deallocated. The motivation is to silence the false positive mem leaks
+// that are reported by the debug version of MS's CRT which can only detect
+// if an alloc is missing a matching deallocation.
+// Example:
+// MemoryIsNotDeallocated memory_is_not_deallocated;
+// critical_section_ = new CRITICAL_SECTION;
+//
+class MemoryIsNotDeallocated {
+ public:
+  // Saves the current CRT debug flags, then clears _CRTDBG_ALLOC_MEM_DF for
+  // the lifetime of this object.
+  MemoryIsNotDeallocated()
+      : old_crtdbg_flag_(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG)) {
+    // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
+    // doesn't report mem leak if there's no matching deallocation.
+    const int flags_without_tracking =
+        old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF;
+    (void)_CrtSetDbgFlag(flags_without_tracking);
+  }
+
+  // Puts back the CRT debug flags that were in effect at construction.
+  ~MemoryIsNotDeallocated() {
+    // Restore the original _CRTDBG_ALLOC_MEM_DF flag
+    (void)_CrtSetDbgFlag(old_crtdbg_flag_);
+  }
+
+ private:
+  // CRT debug flags captured by the constructor.
+  int old_crtdbg_flag_;
+
+  MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+  MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
+};
+#endif // _MSC_VER
+
+} // namespace
+
+// Initializes owner_thread_id_ and critical_section_ in static mutexes.
+//
+// critical_section_init_phase_ acts as a three-state flag:
+//   0 - not initialized, 1 - a thread is initializing, 2 - ready.
+// The first thread to swap 0 -> 1 performs the initialization and then
+// publishes 2; any thread that observes 1 spins until it reads 2.
+void Mutex::ThreadSafeLazyInit() {
+ // Dynamic mutexes are initialized in the constructor.
+ if (type_ == kStatic) {
+ // Atomically claim the initializer role (0 -> 1); the switch acts on the
+ // value that was stored before the exchange.
+ switch (
+ ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
+ case 0:
+ // If critical_section_init_phase_ was 0 before the exchange, we
+ // are the first to test it and need to perform the initialization.
+ owner_thread_id_ = 0;
+ {
+ // Use RAII to flag that following mem alloc is never deallocated.
+#ifdef _MSC_VER
+ MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif // _MSC_VER
+ critical_section_ = new CRITICAL_SECTION;
+ }
+ ::InitializeCriticalSection(critical_section_);
+ // Updates the critical_section_init_phase_ to 2 to signal
+ // initialization complete.
+ GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+ 2L, 1L) == 1L);
+ break;
+ case 1:
+ // Somebody else is already initializing the mutex; spin until they
+ // are done.
+ // Exchanging 2 for 2 never changes the value; it is used here only as
+ // an interlocked read of the phase.
+ while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
+ 2L) != 2L) {
+ // Possibly yields the rest of the thread's time slice to other
+ // threads.
+ ::Sleep(0);
+ }
+ break;
+
+ case 2:
+ break; // The mutex is already initialized and ready for use.
+
+ default:
+ GTEST_CHECK_(false)
+ << "Unexpected value of critical_section_init_phase_ "
+ << "while initializing a static mutex.";
+ }
+ }
+}
+
+namespace {
+
+// Static-only helper that starts the Win32 thread behind a
+// ThreadWithParamBase. It is never instantiated: the constructor is declared
+// private and copying is deleted.
+class ThreadWithParamSupport : public ThreadWithParamBase {
+ public:
+ // Starts a thread that optionally waits for 'thread_can_start' and then
+ // calls runnable->Run(). Returns the handle of the new thread.
+ // 'runnable' is stored in a std::unique_ptr inside the parameter block, so
+ // ownership of it passes to the new thread.
+ static HANDLE CreateThread(Runnable* runnable,
+ Notification* thread_can_start) {
+ ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+ DWORD thread_id;
+ HANDLE thread_handle = ::CreateThread(
+ nullptr, // Default security.
+ 0, // Default stack size.
+ &ThreadWithParamSupport::ThreadMain,
+ param, // Parameter to ThreadMain
+ 0x0, // Default creation flags.
+ &thread_id); // Need a valid pointer for the call to work under Win98.
+ GTEST_CHECK_(thread_handle != nullptr)
+ << "CreateThread failed with error " << ::GetLastError() << ".";
+ // NOTE(review): if GTEST_CHECK_ terminates the process on failure, this
+ // cleanup branch is unreachable - confirm against the macro's definition.
+ if (thread_handle == nullptr) {
+ delete param;
+ }
+ return thread_handle;
+ }
+
+ private:
+ // Arguments handed to ThreadMain through CreateThread's void* parameter.
+ struct ThreadMainParam {
+ ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+ : runnable_(runnable), thread_can_start_(thread_can_start) {}
+ // Owned: deleted together with this struct.
+ std::unique_ptr<Runnable> runnable_;
+ // Does not own.
+ Notification* thread_can_start_;
+ };
+
+ // Thread entry point. Takes ownership of the ThreadMainParam passed in
+ // 'ptr', waits for the start notification when one was supplied, runs the
+ // runnable and returns 0.
+ static DWORD WINAPI ThreadMain(void* ptr) {
+ // Transfers ownership.
+ std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
+ if (param->thread_can_start_ != nullptr)
+ param->thread_can_start_->WaitForNotification();
+ param->runnable_->Run();
+ return 0;
+ }
+
+ // Prohibit instantiation.
+ ThreadWithParamSupport();
+
+ ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+ ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
+};
+
+} // namespace
+
+// Starts the underlying thread through ThreadWithParamSupport::CreateThread()
+// and keeps its handle in thread_.
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
+                                         Notification* thread_can_start)
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
+
+// Joins the thread before the object goes away.
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
+
+// Blocks, without a timeout, until the thread has finished. Any wait result
+// other than WAIT_OBJECT_0 fails the check.
+void ThreadWithParamBase::Join() {
+  const DWORD wait_result = ::WaitForSingleObject(thread_.Get(), INFINITE);
+  GTEST_CHECK_(wait_result == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits. A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+ // Registers thread_local_instance as having value on the current thread.
+ // Returns the holder of the current thread's value for
+ // thread_local_instance, creating it with NewValueForCurrentThread() on
+ // first use. The first registration made from a thread also starts a
+ // watcher thread for it (see StartWatcherThreadFor).
+ static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance) {
+#ifdef _MSC_VER
+ MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif // _MSC_VER
+ DWORD current_thread = ::GetCurrentThreadId();
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ // Find, or create, the per-thread map entry for the calling thread.
+ ThreadIdToThreadLocals::iterator thread_local_pos =
+ thread_to_thread_locals->find(current_thread);
+ if (thread_local_pos == thread_to_thread_locals->end()) {
+ thread_local_pos =
+ thread_to_thread_locals
+ ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+ .first;
+ StartWatcherThreadFor(current_thread);
+ }
+ // Find, or create, this thread's value for the given ThreadLocal.
+ ThreadLocalValues& thread_local_values = thread_local_pos->second;
+ ThreadLocalValues::iterator value_pos =
+ thread_local_values.find(thread_local_instance);
+ if (value_pos == thread_local_values.end()) {
+ value_pos =
+ thread_local_values
+ .insert(std::make_pair(
+ thread_local_instance,
+ std::shared_ptr<ThreadLocalValueHolderBase>(
+ thread_local_instance->NewValueForCurrentThread())))
+ .first;
+ }
+ return value_pos->second.get();
+ }
+
+ // Removes the values that every thread holds for thread_local_instance.
+ // The holders are destroyed only after mutex_ has been released.
+ static void OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance) {
+ std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+ // Clean up the ThreadLocalValues data structure while holding the lock, but
+ // defer the destruction of the ThreadLocalValueHolderBases.
+ {
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ for (ThreadIdToThreadLocals::iterator it =
+ thread_to_thread_locals->begin();
+ it != thread_to_thread_locals->end(); ++it) {
+ ThreadLocalValues& thread_local_values = it->second;
+ ThreadLocalValues::iterator value_pos =
+ thread_local_values.find(thread_local_instance);
+ if (value_pos != thread_local_values.end()) {
+ value_holders.push_back(value_pos->second);
+ thread_local_values.erase(value_pos);
+ // This 'if' can only be successful at most once, so theoretically we
+ // could break out of the loop here, but we don't bother doing so.
+ }
+ }
+ }
+ // Outside the lock, let the destructor for 'value_holders' deallocate the
+ // ThreadLocalValueHolderBases.
+ }
+
+ // Removes every value registered for the thread with id thread_id.
+ // Called from WatcherThreadFunc once that thread has terminated. The
+ // holders are destroyed only after mutex_ has been released.
+ static void OnThreadExit(DWORD thread_id) {
+ GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+ std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+ // Clean up the ThreadIdToThreadLocals data structure while holding the
+ // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+ {
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ ThreadIdToThreadLocals::iterator thread_local_pos =
+ thread_to_thread_locals->find(thread_id);
+ if (thread_local_pos != thread_to_thread_locals->end()) {
+ ThreadLocalValues& thread_local_values = thread_local_pos->second;
+ for (ThreadLocalValues::iterator value_pos =
+ thread_local_values.begin();
+ value_pos != thread_local_values.end(); ++value_pos) {
+ value_holders.push_back(value_pos->second);
+ }
+ thread_to_thread_locals->erase(thread_local_pos);
+ }
+ }
+ // Outside the lock, let the destructor for 'value_holders' deallocate the
+ // ThreadLocalValueHolderBases.
+ }
+
+ private:
+ // In a particular thread, maps a ThreadLocal object to its value.
+ typedef std::map<const ThreadLocalBase*,
+ std::shared_ptr<ThreadLocalValueHolderBase> >
+ ThreadLocalValues;
+ // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+ // thread's ID.
+ typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+ // Holds the thread id and thread handle that we pass from
+ // StartWatcherThreadFor to WatcherThreadFunc.
+ typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+ // Starts a watcher thread that waits for the thread with id thread_id to
+ // terminate and then calls OnThreadExit() for it.
+ static void StartWatcherThreadFor(DWORD thread_id) {
+ // The returned handle is passed to the watcher thread inside a
+ // heap-allocated ThreadIdAndHandle and closed in WatcherThreadFunc.
+ HANDLE thread =
+ ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+ GTEST_CHECK_(thread != nullptr);
+ // We need to pass a valid thread ID pointer into CreateThread for it
+ // to work correctly under Win98.
+ DWORD watcher_thread_id;
+ HANDLE watcher_thread = ::CreateThread(
+ nullptr, // Default security.
+ 0, // Default stack size
+ &ThreadLocalRegistryImpl::WatcherThreadFunc,
+ reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+ CREATE_SUSPENDED, &watcher_thread_id);
+ GTEST_CHECK_(watcher_thread != nullptr)
+ << "CreateThread failed with error " << ::GetLastError() << ".";
+ // Give the watcher thread the same priority as ours to avoid being
+ // blocked by it.
+ ::SetThreadPriority(watcher_thread,
+ ::GetThreadPriority(::GetCurrentThread()));
+ // The watcher was created suspended so that its priority could be set
+ // before it starts running.
+ ::ResumeThread(watcher_thread);
+ // Only this handle to the watcher is closed; the watcher keeps running.
+ ::CloseHandle(watcher_thread);
+ }
+
+ // Monitors exit from a given thread and notifies those
+ // ThreadIdToThreadLocals about thread termination.
+ // 'param' is the ThreadIdAndHandle allocated by StartWatcherThreadFor; this
+ // function closes the watched thread's handle and deletes the pair.
+ static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+ const ThreadIdAndHandle* tah =
+ reinterpret_cast<const ThreadIdAndHandle*>(param);
+ GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+ OnThreadExit(tah->first);
+ ::CloseHandle(tah->second);
+ delete tah;
+ return 0;
+ }
+
+ // Returns map of thread local instances.
+ // mutex_ must be held by the caller (asserted). The map is allocated on
+ // first use and never deleted.
+ static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+ mutex_.AssertHeld();
+#ifdef _MSC_VER
+ MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif // _MSC_VER
+ static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals();
+ return map;
+ }
+
+ // Protects access to GetThreadLocalsMapLocked() and its return value.
+ static Mutex mutex_;
+ // NOTE(review): no member of this class uses thread_map_mutex_, and no
+ // GetThreadMapLocked() is declared here; this looks like a leftover.
+ static Mutex thread_map_mutex_;
+};
+
+// Definitions of the class's static mutexes. Both are constructed with
+// Mutex::kStaticMutex.
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+ Mutex::kStaticMutex); // NOLINT
+
+// Public entry point; forwards to
+// ThreadLocalRegistryImpl::GetValueOnCurrentThread().
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance) {
+ return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+ thread_local_instance);
+}
+
+// Public entry point; forwards to
+// ThreadLocalRegistryImpl::OnThreadLocalDestroyed().
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance) {
+ ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE. Currently only needed for death tests.
+
+// Releases the duplicated pattern string and, for a valid RE, both compiled
+// regexes.
+RE::~RE() {
+  // regfree'ing an invalid regex might crash because the content
+  // of the regex is undefined. Since the regex's are essentially
+  // the same, one cannot be valid (or invalid) without the other
+  // being so too.
+  const bool both_compiled = is_valid_;
+  if (both_compiled) {
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  // pattern_ is released whether or not the regexes compiled.
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+// An invalid re never matches.
+bool RE::FullMatch(const char* str, const RE& re) {
+  regmatch_t match;
+  return re.is_valid_ && regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself). An invalid re never matches.
+bool RE::PartialMatch(const char* str, const RE& re) {
+  regmatch_t match;
+  return re.is_valid_ && regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+//
+// Compiles two POSIX Extended regexes from 'regex': full_regex_, the pattern
+// wrapped as "^(...)$" for FullMatch(), and partial_regex_, the pattern as
+// given for PartialMatch(). is_valid_ ends up true only if both compiled; an
+// EXPECT_TRUE failure is reported otherwise.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // The temporary pattern is not needed once regcomp() has returned. It is
+  // freed here, before the EXPECT_TRUE below, so it is not leaked on any
+  // path out of this function.
+  delete[] full_pattern;
+
+  // partial_regex_ is compiled only when full_regex_ compiled: ~RE() frees
+  // the two together, and only when is_valid_ is true.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex. We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+    if (!is_valid_) {
+      // ~RE() skips regfree() for an invalid RE, so the already-compiled
+      // full_regex_ has to be released here or it would be leaked.
+      regfree(&full_regex_);
+    }
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Tells whether ch is one of the characters of str. The terminating
+// '\0' never counts as a member, although strchr() alone would find it.
+bool IsInSet(char ch, const char* str) {
+  return (ch == '\0') ? false : (strchr(str, ch) != nullptr);
+}
+
+// ASCII character classification helpers. Unlike the similar functions
+// in <ctype.h>, their results are not affected by the current locale.
+bool IsAsciiDigit(char ch) { return ch >= '0' && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  const bool is_lower = (ch >= 'a' && ch <= 'z');
+  const bool is_upper = (ch >= 'A' && ch <= 'Z');
+  return is_lower || is_upper || IsAsciiDigit(ch) || ch == '_';
+}
+
+// Tells whether a backslash followed by c is a supported escape sequence:
+// c is either one of the class/control letters or an ASCII punctuation.
+bool IsValidEscape(char c) {
+  if (IsInSet(c, "dDfnrsStvwW")) return true;
+  return IsAsciiPunct(c);
+}
+
+// Tells whether a single pattern atom matches the character ch. The atom
+// is pattern_char itself, or "\\pattern_char" when escaped is true. The
+// result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (!escaped) {
+    // An unescaped '.' stands for any character except a newline.
+    return pattern_char == ch || (pattern_char == '.' && ch != '\n');
+  }
+
+  // From here on the atom is "\\p" where p is pattern_char.
+  switch (pattern_char) {
+    // Character classes and their negations.
+    case 'd': return IsAsciiDigit(ch);
+    case 'D': return !IsAsciiDigit(ch);
+    case 's': return IsAsciiWhiteSpace(ch);
+    case 'S': return !IsAsciiWhiteSpace(ch);
+    case 'w': return IsAsciiWordChar(ch);
+    case 'W': return !IsAsciiWordChar(ch);
+
+    // Single control characters.
+    case 'f': return ch == '\f';
+    case 'n': return ch == '\n';
+    case 'r': return ch == '\r';
+    case 't': return ch == '\t';
+    case 'v': return ch == '\v';
+
+    default:
+      // An escaped punctuation character matches only itself.
+      return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+}
+
+// Builds the common prefix of the failure messages that ValidateRegex()
+// emits; it names the offending index and quotes the whole regex.
+static std::string FormatRegexSyntaxError(const char* regex, int index) {
+  Message prefix;
+  prefix << "Syntax error at index " << index;
+  prefix << " in simple regular expression \"" << regex << "\": ";
+  return prefix.GetString();
+}
+
+// Checks that regex is a valid simple regular expression. Every problem
+// found is reported as a non-fatal failure; returns true only when none
+// was found.
+bool ValidateRegex(const char* regex) {
+  if (regex == nullptr) {
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool valid = true;
+  // Whether ?, *, or + is allowed to follow the atom scanned last.
+  bool can_repeat = false;
+  for (int pos = 0; regex[pos] != '\0'; ++pos) {
+    const char ch = regex[pos];
+    if (ch != '\\') {  // A plain (unescaped) character.
+      if (ch == '^' && pos > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                      << "'^' can only appear at the beginning.";
+        valid = false;
+      } else if (ch == '$' && regex[pos + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, pos)
+                      << "'$' can only appear at the end.";
+        valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, pos) << "'" << ch
+                      << "' is unsupported.";
+        valid = false;
+      } else if (IsRepeat(ch) && !can_repeat) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, pos) << "'" << ch
+                      << "' can only follow a repeatable token.";
+        valid = false;
+      }
+
+      can_repeat = !IsInSet(ch, "^$?*+");
+      continue;
+    }
+
+    // ch starts an escape sequence; examine the character after it.
+    // Errors are reported at the index of the backslash (pos - 1).
+    const char escaped = regex[++pos];
+    if (escaped == '\0') {
+      // The string ends here, so scanning cannot continue.
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos - 1)
+                    << "'\\' cannot appear at the end.";
+      return false;
+    }
+    if (!IsValidEscape(escaped)) {
+      ADD_FAILURE() << FormatRegexSyntaxError(regex, pos - 1)
+                    << "invalid escape sequence \"\\" << escaped << "\".";
+      valid = false;
+    }
+    can_repeat = true;  // An escape sequence may always be repeated.
+  }
+
+  return valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression. The atom is c when escaped is false and \c otherwise;
+// repeat is the repetition meta character (?, *, or +). The behavior
+// is undefined if str contains too many characters to be indexable by
+// size_t, in which case the test will probably time out anyway. We are
+// fine with this limitation as std::string has it too.
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char* regex, const char* str) {
+  // '+' needs at least one occurrence of the atom; '?' allows at most one.
+  const size_t fewest = (repeat == '+') ? 1 : 0;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+  const size_t most = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
+
+  // Invariant: the atom matches each of the first `used` characters of str.
+  size_t used = 0;
+  while (used <= most) {
+    // Any match will do (we only care *whether* the pattern matches, not
+    // *how*), so the first repetition count whose tail matches is
+    // accepted; there is no need to find a greedy match.
+    if (used >= fewest && MatchRegexAtHead(regex, str + used)) return true;
+
+    // Otherwise try to let the atom consume one more character.
+    const char next = str[used];
+    if (next == '\0' || !AtomMatchesChar(escaped, c, next)) return false;
+    ++used;
+  }
+  return false;
+}
+
+// Tells whether regex matches a prefix of str. regex must be a valid
+// simple regular expression and must not start with "^", or the result
+// is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  // An empty regex matches a prefix of anything.
+  if (*regex == '\0') return true;
+
+  // "$" only matches the end of a string. Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$') return *str == '\0';
+
+  // Locate the first atom, stepping over the backslash of an escape.
+  const bool escaped = (*regex == '\\');
+  const char* const atom = escaped ? regex + 1 : regex;
+  const char repeat = atom[1];
+
+  if (!IsRepeat(repeat)) {
+    // A single, unrepeated atom: match it against the first character
+    // of str and recurse on what remains of both.
+    if (*str == '\0' || !AtomMatchesChar(escaped, *atom, *str)) return false;
+    return MatchRegexAtHead(atom + 1, str + 1);
+  }
+
+  // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so this
+  // is an indirect recursion. It terminates as the regex gets shorter
+  // in each recursion.
+  return MatchRepetitionAndRegexAtHead(escaped, *atom, repeat, atom + 2, str);
+}
+
+// Tells whether regex matches any substring of str. regex must be a
+// valid simple regular expression, or the result is undefined. A null
+// regex or str never matches.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally. In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == nullptr || str == nullptr) return false;
+
+  // A leading '^' pins the match to the very start of str.
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+
+  // Otherwise try every starting position, the empty suffix included.
+  for (const char* start = str;; ++start) {
+    if (MatchRegexAtHead(regex, start)) return true;
+    if (*start == '\0') return false;
+  }
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  // Both strings are set by Init(); either may still be nullptr, which
+  // free() accepts.
+  free(const_cast<char*>(full_pattern_));
+  free(const_cast<char*>(pattern_));
+}
+
+// Reports whether re matches all of str; an invalid re matches nothing.
+// full_pattern_ is the pattern anchored with '^' and '$' by Init().
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+  return MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Reports whether re matches some substring of str (possibly all of it);
+// an invalid re matches nothing.
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+  return MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation: stores a copy of
+// regex, validates it, and, if it is valid, builds the anchored pattern
+// used for full matches.
+void RE::Init(const char* regex) {
+  full_pattern_ = nullptr;
+  pattern_ = (regex == nullptr) ? nullptr : posix::StrDup(regex);
+
+  is_valid_ = ValidateRegex(regex);
+  // No need to calculate the full pattern when the regex is invalid.
+  if (!is_valid_) return;
+
+  // The full-match pattern is regex anchored at both ends. Reserve room
+  // for regex itself plus a '^' to prepend, a '$' to append, and the
+  // terminating '\0'.
+  const size_t regex_len = strlen(regex);
+  char* const anchored = static_cast<char*>(malloc(regex_len + 3));
+  char* out = anchored;
+
+  // Makes sure full_pattern_ starts with '^'.
+  if (regex[0] != '^') *out++ = '^';
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(out, regex, regex_len);
+  out += regex_len;
+
+  // Makes sure full_pattern_ ends with '$'.
+  const bool ends_with_dollar = regex_len > 0 && regex[regex_len - 1] == '$';
+  if (!ends_with_dollar) *out++ = '$';
+
+  *out = '\0';
+  full_pattern_ = anchored;
+}
+
+#endif // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Renders a file path and line number the way the compiler building this
+// code would print them in an error message, followed by a colon. A null
+// file is shown as kUnknownFile and a negative line is left out.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  std::string location(file != nullptr ? file : kUnknownFile);
+  if (line >= 0) {
+#ifdef _MSC_VER
+    location += "(" + StreamableToString(line) + ")";
+#else
+    location += ":" + StreamableToString(line);
+#endif  // _MSC_VER
+  }
+  location += ":";
+  return location;
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line) {
+  std::string location(file != nullptr ? file : kUnknownFile);
+  // A negative line number is omitted together with its ':' separator.
+  if (line >= 0) {
+    location += ":" + StreamableToString(line);
+  }
+  return location;
+}
+
+// Starts a log record: a newline, a tag for the severity, and the source
+// location formatted by FormatFileLocation().
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* marker = "[ FATAL ]";  // Used for every severity not listed.
+  if (severity == GTEST_INFO) {
+    marker = "[ INFO ]";
+  } else if (severity == GTEST_WARNING) {
+    marker = "[WARNING]";
+  } else if (severity == GTEST_ERROR) {
+    marker = "[ ERROR ]";
+  }
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
+}
+
+// Ends the log record (std::endl also flushes the stream) and, for a
+// GTEST_FATAL record, flushes stderr and aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ != GTEST_FATAL) return;
+  fflush(stderr);
+  posix::Abort();
+}
+
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr) by redirecting its file descriptor to a temporary file.
+class CapturedStream {
+ public:
+ // The ctor redirects fd to a temporary file; dup(fd) keeps the original target for GetCapturedString().
+ explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+#if GTEST_OS_WINDOWS
+ char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT
+ char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT
+
+ ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+ const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
+ 0, // Generate unique file name.
+ temp_file_path);
+ GTEST_CHECK_(success != 0)
+ << "Unable to create a temporary file in " << temp_dir_path;
+ const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+ GTEST_CHECK_(captured_fd != -1)
+ << "Unable to open temporary file " << temp_file_path;
+ filename_ = temp_file_path;
+#else
+ // There's no guarantee that a test has write access to the current
+ // directory, so we create the temporary file in a temporary directory.
+ std::string name_template;
+
+#if GTEST_OS_LINUX_ANDROID
+ // Note: Android applications are expected to call the framework's
+ // Context.getExternalStorageDirectory() method through JNI to get
+ // the location of the world-writable SD Card directory. However,
+ // this requires a Context handle, which cannot be retrieved
+ // globally from native code. Doing so also precludes running the
+ // code as part of a regular standalone executable, which doesn't
+ // run in a Dalvik process (e.g. when running it through 'adb shell').
+ //
+ // The location /data/local/tmp is directly accessible from native code.
+ // '/sdcard' and other variants cannot be relied on, as they are not
+ // guaranteed to be mounted, or may have a delay in mounting.
+ name_template = "/data/local/tmp/";
+#elif GTEST_OS_IOS
+ char user_temp_dir[PATH_MAX + 1];
+
+ // Documented alternative to NSTemporaryDirectory() (for obtaining or creating
+ // a temporary directory) at
+ // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+ //
+ // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+ // documented in the confstr() man page at
+ // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+ // but is still available, according to the WebKit patches at
+ // https://trac.webkit.org/changeset/262004/webkit
+ // https://trac.webkit.org/changeset/263705/webkit
+ //
+ // The confstr() implementation falls back to getenv("TMPDIR"). See
+ // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+ ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+ name_template = user_temp_dir;
+ if (name_template.back() != GTEST_PATH_SEP_[0])
+ name_template.push_back(GTEST_PATH_SEP_[0]);
+#else
+ name_template = "/tmp/";
+#endif
+ name_template.append("gtest_captured_stream.XXXXXX");
+
+ // mkstemp() modifies the string bytes in place, and does not go beyond the
+ // string's length. This results in well-defined behavior in C++17.
+ //
+ // The const_cast is needed below C++17. The constraints on std::string
+ // implementations in C++11 and above make the assumption behind the
+ // const_cast fairly safe.
+ const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
+ if (captured_fd == -1) {
+ GTEST_LOG_(WARNING)
+ << "Failed to create tmp file " << name_template
+ << " for test; does the test have access to the /tmp directory?";
+ }
+ filename_ = std::move(name_template);
+#endif // GTEST_OS_WINDOWS
+ fflush(nullptr);  // Flushes all open output streams before fd_ is redirected.
+ dup2(captured_fd, fd_);  // From now on fd_ refers to the temporary file.
+ close(captured_fd);
+ }
+
+ ~CapturedStream() { remove(filename_.c_str()); }  // Deletes the temporary file.
+
+ std::string GetCapturedString() {
+ if (uncaptured_fd_ != -1) {
+ // Restores the original stream; done at most once.
+ fflush(nullptr);
+ dup2(uncaptured_fd_, fd_);
+ close(uncaptured_fd_);
+ uncaptured_fd_ = -1;
+ }
+
+ FILE* const file = posix::FOpen(filename_.c_str(), "r");
+ if (file == nullptr) {
+ GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
+ << " for capturing stream.";
+ }
+ const std::string content = ReadEntireFile(file);
+ posix::FClose(file);
+ return content;
+ }
+
+ private:
+ const int fd_; // A stream to capture.
+ int uncaptured_fd_;  // Duplicate of the original fd_; -1 once it has been restored.
+ // Name of the temporary file holding the captured output.
+ ::std::string filename_;
+
+ CapturedStream(const CapturedStream&) = delete;
+ CapturedStream& operator=(const CapturedStream&) = delete;
+};
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+static CapturedStream* g_captured_stderr = nullptr;
+static CapturedStream* g_captured_stdout = nullptr;
+
+// Starts capturing the output stream behind fd (stdout/stderr) and stores
+// the new capturer in *stream. stream_name is only used in the FATAL log
+// message emitted when *stream already holds a capturer.
+static void CaptureStream(int fd, const char* stream_name,
+                          CapturedStream** stream) {
+  const bool already_capturing = (*stream != nullptr);
+  if (already_capturing) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+// The capturer in *captured_stream is destroyed and the slot is reset to
+// nullptr. Fails a GTEST_CHECK_ if no capture is in progress, instead of
+// dereferencing a null capturer.
+static std::string GetCapturedStream(CapturedStream** captured_stream) {
+  GTEST_CHECK_(*captured_stream != nullptr)
+      << "Cannot get captured output: no stream capture is in progress.";
+
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = nullptr;
+
+  return content;
+}
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
+
+// Starts capturing stdout (file descriptor kStdOutFileno); the capturer is kept in g_captured_stdout.
+void CaptureStdout() {
+ CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr (file descriptor kStdErrFileno); the capturer is kept in g_captured_stderr.
+void CaptureStderr() {
+ CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout, returns the captured string and releases g_captured_stdout.
+std::string GetCapturedStdout() {
+ return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr, returns the captured string and releases g_captured_stderr.
+std::string GetCapturedStderr() {
+ return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif // GTEST_HAS_STREAM_REDIRECTION
+
+// Returns the size of file in bytes, leaving the file position at the
+// end of the file. Returns 0 when the size cannot be determined, i.e.
+// when fseek() or ftell() fails (for instance on a non-seekable stream).
+size_t GetFileSize(FILE* file) {
+  if (fseek(file, 0, SEEK_END) != 0) return 0;
+  const long size = ftell(file);  // NOLINT
+  // ftell() reports failure as -1L; casting that to size_t would yield
+  // an enormous bogus size.
+  return size < 0 ? 0 : static_cast<size_t>(size);
+}
+
+// Returns the content of file, read from its beginning up to the size
+// reported by GetFileSize(). If reading stops early, only the bytes
+// actually read are returned.
+std::string ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  // Read straight into the result string: no raw buffer that could leak
+  // if an allocation throws.
+  std::string content(file_size, '\0');
+  size_t bytes_read = 0;  // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  while (bytes_read < file_size) {
+    const size_t bytes_last_read =
+        fread(&content[bytes_read], 1, file_size - bytes_read, file);
+    if (bytes_last_read == 0) break;
+    bytes_read += bytes_last_read;
+  }
+
+  content.resize(bytes_read);
+  return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string>* g_injected_test_argvs =
+ nullptr; // Owned.
+
+// Returns a copy of the injected argvs if some were set through
+// SetInjectableArgvs(), and the result of GetArgvs() otherwise.
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs == nullptr) return GetArgvs();
+  return *g_injected_test_argvs;
+}
+
+// Installs new_argvs as the injected argvs and takes ownership of it.
+// The previously injected vector is deleted unless it is the very same
+// object.
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs) {
+  const std::vector<std::string>* const previous = g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+  if (previous != new_argvs) delete previous;
+}
+
+// Same as above, but injects a heap-allocated copy of new_argvs.
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs) {
+  const std::vector<std::string>* const copy =
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end());
+  SetInjectableArgvs(copy);
+}
+
+// Deletes the injected argvs, if any, so that GetInjectableArgvs() falls
+// back to GetArgvs() again.
+void ClearInjectableArgvs() {
+  const std::vector<std::string>* const doomed = g_injected_test_argvs;
+  g_injected_test_argvs = nullptr;
+  delete doomed;
+}
+#endif // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {  // Definition of posix::Abort() used when GTEST_OS_WINDOWS_MOBILE is set.
+ DebugBreak();  // Raises a breakpoint exception in the current process.
+ TerminateProcess(GetCurrentProcess(), 1);  // Then ends the process with exit code 1.
+}
+} // namespace posix
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag: GTEST_FLAG_PREFIX_ followed by the flag name, converted to
+// upper case. For example, FlagToEnvVar("foo") will return "GTEST_FOO"
+// in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (const char ch : full_flag) {
+    env_var << ToUpper(ch);
+  }
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes
+// the result to *value and returns true; otherwise prints a warning
+// that mentions src_text to stdout, leaves *value unchanged and returns
+// false.
+bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
+  // Parses the text as a decimal integer.
+  char* end = nullptr;
+  const long parsed = strtol(str, &end, 10);  // NOLINT
+
+  // strtol() must have consumed all characters in the string; anything
+  // left over means an invalid character was encountered.
+  const bool fully_consumed = (*end == '\0');
+  if (!fully_consumed) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // The parsed value must be in the range of an int32_t.
+  const auto narrowed = static_cast<int32_t>(parsed);
+  // strtol() returns LONG_MAX or LONG_MIN when the input overflows a long.
+  const bool overflows_long = (parsed == LONG_MAX || parsed == LONG_MIN);
+  // The value does not survive the round trip through int32_t.
+  const bool overflows_int32 = (narrowed != parsed);
+  if (overflows_long || overflows_int32) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = narrowed;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// A value that is set counts as true unless it is exactly "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const text = posix::GetEnv(env_var.c_str());
+  if (text == nullptr) return default_value;
+  return strcmp(text, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const text = posix::GetEnv(env_var.c_str());
+  // The environment variable is not set.
+  if (text == nullptr) return default_value;
+
+  int32_t parsed = default_value;
+  const bool parsed_ok = ParseInt32(
+      Message() << "Environment variable " << env_var, text, &parsed);
+  if (parsed_ok) return parsed;
+
+  // ParseInt32() has already printed a warning; say what is used instead.
+  printf("The default value %s is used.\n",
+         (Message() << default_value).GetString().c_str());
+  fflush(stdout);
+  return default_value;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system. The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
+// Note that this is meant to be called at the call site, so it does
+// not check that the flag is 'output'.
+// In essence this checks an environment variable called XML_OUTPUT_FILE:
+// if it is set we return its value with "xml:" prepended; if it is not
+// set we return "".
+std::string OutputFlagAlsoCheckEnvVar() {
+  const char* const xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (xml_output_file_env == nullptr) return std::string();
+  return std::string("xml:") + xml_output_file_env;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  if (value != nullptr) return value;
+  return default_value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc
new file mode 100644
index 0000000000..f3976d230d
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -0,0 +1,553 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise. A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include "gtest/gtest-printers.h"
+
+#include <stdio.h>
+
+#include <cctype>
+#include <cstdint>
+#include <cwchar>
+#include <ostream> // NOLINT
+#include <string>
+#include <type_traits>
+
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints count bytes of the given object, beginning at offset start, as
+// two-digit upper-case hexadecimal numbers.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char text[5] = "";
+  const size_t end = start + count;
+  for (size_t j = start; j != end; ++j) {
+    if (j != start) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // human: a space precedes every even offset, a dash every odd one.
+      *os << ((j % 2) == 0 ? ' ' : '-');
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream, in the form
+// "<count>-byte object <...>".
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  if (count >= kThreshold) {
+    // The object is too big to be printed in full: omit some details by
+    // printing only the first and the last kChunkSize bytes.
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t tail_start = (count - kChunkSize + 1) / 2 * 2;
+    PrintByteSegmentInObjectTo(obj_bytes, tail_start, count - tail_start, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  }
+  *os << ">";
+}
+
+// Widens a character to char32_t. Since the standard does not specify
+// whether char / wchar_t is signed or unsigned, the value is first
+// converted to the unsigned type of the same width, and only then
+// widened to char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  using Unsigned = typename std::make_unsigned<CharType>::type;
+  const Unsigned same_width = static_cast<Unsigned>(in);
+  return static_cast<char32_t>(same_width);
+}
+
+} // namespace
+
+namespace internal {
+
+// Prints the count bytes at obj_bytes to *os by delegating to
+// PrintBytesInObjectToImpl(). The delegation simplifies the implementation,
+// which uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+ ostream* os) {
+ PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+// Depending on the value of a character, we print it in one
+// of three formats, and report which one was used:
+// - kAsIs: as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+// - kHexEscape: as a hexadecimal escape sequence (e.g. '\x7F'), or
+// - kSpecialEscape: as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
+
+// Tells whether c is a printable ASCII character, i.e. lies in the range
+// [0x20, 0x7E]. We test the value of c directly instead of calling
+// isprint(), which is buggy on Windows Mobile.
+inline bool IsPrintableAscii(char32_t c) {
+  const char32_t kFirstPrintable = 0x20;
+  const char32_t kLastPrintable = 0x7E;
+  return c >= kFirstPrintable && c <= kLastPrintable;
+}
+
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
+template <typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  const char32_t u_c = ToChar32(c);
+
+  // Characters that have a dedicated escape sequence.
+  const char* special = nullptr;
+  switch (u_c) {
+    case L'\0': special = "\\0"; break;
+    case L'\'': special = "\\'"; break;
+    case L'\\': special = "\\\\"; break;
+    case L'\a': special = "\\a"; break;
+    case L'\b': special = "\\b"; break;
+    case L'\f': special = "\\f"; break;
+    case L'\n': special = "\\n"; break;
+    case L'\r': special = "\\r"; break;
+    case L'\t': special = "\\t"; break;
+    case L'\v': special = "\\v"; break;
+    default: break;
+  }
+  if (special != nullptr) {
+    *os << special;
+    return kSpecialEscape;
+  }
+
+  // Printable ASCII is emitted unchanged.
+  if (IsPrintableAscii(u_c)) {
+    *os << static_cast<char>(c);
+    return kAsIs;
+  }
+
+  // Everything else becomes an upper-case hexadecimal escape. The
+  // stream's format flags are saved and restored around the output.
+  const ostream::fmtflags saved_flags = os->flags();
+  *os << "\\x" << std::hex << std::uppercase << static_cast<int>(u_c);
+  os->flags(saved_flags);
+  return kHexEscape;
+}
+
+// Prints a char32_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted. Unlike in a character literal,
+// a single quote is printed as is and a double quote is escaped.
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
+  if (c == U'\'') {
+    *os << "'";
+    return kAsIs;
+  }
+  if (c == U'"') {
+    *os << "\\\"";
+    return kSpecialEscape;
+  }
+  return PrintAsCharLiteralTo(c, os);
+}
+
+static const char* GetCharWidthPrefix(char) { return ""; }  // Literal prefix for the argument's type; none for char.
+
+static const char* GetCharWidthPrefix(signed char) { return ""; }  // No prefix.
+
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }  // No prefix.
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }  // As in u8'x'.
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }  // As in u'x'.
+
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }  // As in U'x'.
+
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }  // As in L'x'.
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted. Like the overloads below, widens c with ToChar32() and forwards to the char32_t overload.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+ return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+ return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+ return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+ return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code. '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a quoted literal in the most readable form we can
+  // find, preceded by the prefix that matches its type.
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0) return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  const bool single_digit = (1 <= c && c <= 9);
+  if (format != kHexEscape && !single_digit) {
+    *os << ", 0x" << String::FormatHexInt(static_cast<int>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }  // Literal form plus numeric code.
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }  // Literal form plus numeric code.
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code. L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }  // Shares the template used for the narrow types.
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {  // Output form: U+ followed by at least four uppercase hex digits.
+ *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)  // NOTE(review): hex, uppercase and the fill char are not restored on *os afterwards.
+ << static_cast<uint32_t>(c);
+}
+
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {  // Streams v to *os in decimal.
+ if (v == 0) {  // Handled up front: the digit loop below emits nothing for zero.
+ *os << "0";
+ return;
+ }
+
+ // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+ char buf[40];
+ char* p = buf + sizeof(buf);  // Digits are written backwards from the end of buf.
+
+ // Some configurations have a __uint128_t, but no support for built in
+ // division. Do manual long division instead.
+
+ uint64_t high = static_cast<uint64_t>(v >> 64);  // Upper 64 bits of v.
+ uint64_t low = static_cast<uint64_t>(v);  // Lower 64 bits of v.
+
+ *--p = 0;  // NUL terminator for the C string streamed at the end.
+ while (high != 0 || low != 0) {  // Each pass divides high:low by 10 and emits one digit.
+ uint64_t high_mod = high % 10;
+ high = high / 10;
+ // This is the long division algorithm specialized for a divisor of 10 and
+ // only two elements.
+ // Notable values:
+ // 2^64 / 10 == 1844674407370955161
+ // 2^64 % 10 == 6
+ const uint64_t carry = 6 * high_mod + low % 10;  // Remainder contribution of high_mod * 2^64 plus that of low.
+ low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+ char digit = static_cast<char>(carry % 10);  // Least significant decimal digit of the current value.
+ *--p = '0' + digit;
+ }
+ *os << p;  // p now points at the most significant digit.
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {  // Streams v in decimal: optional sign, then the unsigned overload.
+ __uint128_t uv = static_cast<__uint128_t>(v);
+ if (v < 0) {
+ *os << "-";
+ uv = -uv;  // Negating in the unsigned type gives the magnitude without signed overflow.
+ }
+ PrintTo(uv, os);
+}
+#endif // __SIZEOF_INT128__
+
+// Prints the given array of characters to the ostream. CharType must be either
+// char, char8_t, char16_t, char32_t, or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters
+// and may not be NUL-terminated. Returns kHexEscape if any element needed one.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+ PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
+ const char* const quote_prefix = GetCharWidthPrefix(*begin);  // NOTE(review): *begin is evaluated even when len == 0 - confirm callers pass a readable pointer.
+ *os << quote_prefix << "\"";
+ bool is_previous_hex = false;  // True when the element just printed used a hex escape.
+ CharFormat print_format = kAsIs;  // Becomes kHexEscape once any element needs a hex escape.
+ for (size_t index = 0; index < len; ++index) {
+ const CharType cur = begin[index];
+ if (is_previous_hex && IsXDigit(cur)) {
+ // Previous character is of '\x..' form and this character can be
+ // interpreted as another hexadecimal digit in its number. Break string to
+ // disambiguate.
+ *os << "\" " << quote_prefix << "\"";
+ }
+ is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+ // Remember if any characters required hex escaping.
+ if (is_previous_hex) {
+ print_format = kHexEscape;
+ }
+ }
+ *os << "\"";  // Closing quote of the last (possibly only) literal segment.
+ return print_format;
+}
+
+// Prints a (const) character array of 'len' elements, starting at address
+// 'begin'. CharType is char, char8_t, char16_t, char32_t, or wchar_t.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+ UniversalPrintCharArray(const CharType* begin, size_t len,
+ ostream* os) {
+ // The code
+ // const char kFoo[] = "foo";
+ // generates an array of 4, not 3, elements, with the last one being '\0'.
+ //
+ // Therefore when printing a char array, we don't print the last element if
+ // it's '\0', such that the output matches the string literal as it's
+ // written in the source code.
+ if (len > 0 && begin[len - 1] == '\0') {  // The len > 0 test keeps begin[len - 1] in bounds.
+ PrintCharsAsStringTo(begin, len - 1, os);
+ return;
+ }
+
+ // If, however, the last element in the array is not '\0', e.g.
+ // const char kFoo[] = { 'f', 'o', 'o' };
+ // we must print the entire array. We also print a message to indicate
+ // that the array is not NUL-terminated.
+ PrintCharsAsStringTo(begin, len, os);  // Also reached for len == 0.
+ *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);  // All overloads here forward to the same template.
+}
+
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);  // Instantiated with CharType = char8_t.
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);  // Instantiated with CharType = char16_t.
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);  // Instantiated with CharType = char32_t.
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);  // Instantiated with CharType = wchar_t.
+}
+
+namespace {
+
+// Prints a null-terminated C-style string to the ostream: "NULL" for a null
+// pointer, otherwise the address followed by the quoted, escaped contents.
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
+ if (s == nullptr) {
+ *os << "NULL";
+ } else {
+ *os << ImplicitCast_<const void*>(s) << " pointing to ";  // Streamed as const void*, i.e. as an address rather than as text.
+ PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);  // Length excludes the terminating NUL.
+ }
+}
+
+} // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }  // Address plus quoted contents, or NULL.
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }  // Same format for char8_t strings.
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }  // Same format for char16_t strings.
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }  // Same format for char32_t strings.
+
+// MSVC compiler can be configured to define wchar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts to be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif // wchar_t is native
+
+namespace {
+
+bool ContainsUnprintableControlCodes(const char* str, size_t length) {  // True if any of the first length bytes is a control code other than tab, LF or CR.
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
+
+ for (size_t i = 0; i < length; i++) {
+ unsigned char ch = *s++;  // Read as unsigned char so std::iscntrl never receives a negative value.
+ if (std::iscntrl(ch)) {
+ switch (ch) {
+ case '\t':
+ case '\n':
+ case '\r':
+ break;  // These three control codes are tolerated.
+ default:
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }  // Continuation bytes have the bit pattern 10xxxxxx.
+
+bool IsValidUTF8(const char* str, size_t length) {  // True if the first length bytes form well-formed UTF-8.
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
+
+ for (size_t i = 0; i < length;) {  // i is advanced inside the body, one whole sequence per pass.
+ unsigned char lead = s[i++];
+
+ if (lead <= 0x7f) {
+ continue; // single-byte character (ASCII) 0..7F
+ }
+ if (lead < 0xc2) {
+ return false; // trail byte or non-shortest form
+ } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {  // The bounds test precedes the s[i] read.
+ ++i; // 2-byte character
+ } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+ // check for non-shortest form and surrogate
+ (lead != 0xe0 || s[i] >= 0xa0) &&
+ (lead != 0xed || s[i] < 0xa0)) {
+ i += 2; // 3-byte character
+ } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i + 2]) &&
+ // check for non-shortest form
+ (lead != 0xf0 || s[i] >= 0x90) &&
+ (lead != 0xf4 || s[i] < 0x90)) {  // With lead F4 this caps the code point at U+10FFFF.
+ i += 3; // 4-byte character
+ } else {
+ return false;  // Truncated sequence, bad trail byte, or a lead byte above 0xf4.
+ }
+ }
+ return true;
+}
+
+void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {  // Appends the raw text only when it is control-free, valid UTF-8.
+ if (!ContainsUnprintableControlCodes(str, length) &&
+ IsValidUTF8(str, length)) {
+ *os << "\n As Text: \"" << str << "\"";  // NOTE(review): str is streamed as a C string, so this relies on a NUL at str[length] - confirm for all callers.
+ }
+}
+
+} // anonymous namespace
+
+void PrintStringTo(const ::std::string& s, ostream* os) {  // Prints s as an escaped literal, optionally followed by its raw text.
+ if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {  // The extra text is considered only if a hex escape was emitted.
+ if (GTEST_FLAG_GET(print_utf8)) {
+ ConditionalPrintAsText(s.data(), s.size(), os);
+ }
+ }
+}
+
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);  // Escaped literal only; the returned CharFormat is ignored.
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);  // Escaped literal only; the returned CharFormat is ignored.
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);  // Escaped literal only; the returned CharFormat is ignored.
+}
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);  // Escaped literal only; the returned CharFormat is ignored.
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+} // namespace internal
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc
new file mode 100644
index 0000000000..eb7c8d1cf9
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc
@@ -0,0 +1,105 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+#include "gtest/gtest-test-part.h"
+
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message: everything before the first
+// occurrence of internal::kStackTraceMarker, or the whole message if absent.
+std::string TestPartResult::ExtractSummary(const char* message) {
+ const char* const stack_trace = strstr(message, internal::kStackTraceMarker);  // nullptr when the marker does not occur.
+ return stack_trace == nullptr ? message : std::string(message, stack_trace);  // The range constructor copies [message, stack_trace).
+}
+
+// Prints a TestPartResult object: file location, result kind, then the message.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+ return os << internal::FormatFileLocation(result.file_name(),
+ result.line_number())
+ << " "
+ << (result.type() == TestPartResult::kSuccess ? "Success"
+ : result.type() == TestPartResult::kSkip ? "Skipped"
+ : result.type() == TestPartResult::kFatalFailure
+ ? "Fatal failure"
+ : "Non-fatal failure")  // Any type not matched above is labelled non-fatal.
+ << ":\n"
+ << result.message() << std::endl;
+}
+
+// Appends a copy of the given TestPartResult to the end of the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+ array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based); aborts if out of range.
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+ if (index < 0 || index >= size()) {
+ printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+ internal::posix::Abort();  // Presumably does not return - confirm in gtest-port.h.
+ }
+
+ return array_[static_cast<size_t>(index)];  // index is non-negative here, so the cast preserves its value.
+}
+
+// Returns the number of TestPartResult objects in the array, as an int.
+int TestPartResultArray::size() const {
+ return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()  // Installs this object as the current thread's result reporter.
+ : has_new_fatal_failure_(false),
+ original_reporter_(  // Saved so the destructor can put it back.
+ GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
+ GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {  // Reinstalls the reporter captured by the constructor.
+ GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+ original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(  // Records fatal failures, then passes the result on.
+ const TestPartResult& result) {
+ if (result.fatally_failed()) has_new_fatal_failure_ = true;  // The flag is only ever set here, never cleared.
+ original_reporter_->ReportTestPartResult(result);  // The previously installed reporter still sees every result.
+}
+
+} // namespace internal
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
new file mode 100644
index 0000000000..a2828b83c6
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
@@ -0,0 +1,104 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/gtest-typed-test.h"
+
+#include "gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+// Skips to the first non-space char in str. Returns a pointer to the
+// terminating NUL (an empty string) if str contains only whitespace.
+static const char* SkipSpaces(const char* str) {
+ while (IsSpace(*str)) str++;  // NOTE(review): stops at the NUL provided IsSpace('\0') is false - confirm.
+ return str;
+}
+
+static std::vector<std::string> SplitIntoTestNames(const char* src) {  // Splits a comma-separated list into trimmed names.
+ std::vector<std::string> name_vec;
+ src = SkipSpaces(src);  // Leading whitespace before the first name is dropped.
+ for (; src != nullptr; src = SkipComma(src)) {  // The loop ends once SkipComma() yields nullptr.
+ name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
+ }
+ return name_vec;
+}
+
+// Verifies that the comma-separated names in registered_tests match the
+// test names in registered_tests_; returns registered_tests if successful,
+// or prints every mismatch to stderr and aborts the program otherwise.
+const char* TypedTestSuitePState::VerifyRegisteredTestNames(
+ const char* test_suite_name, const char* file, int line,
+ const char* registered_tests) {
+ RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line));
+
+ typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
+ registered_ = true;  // Set before validation, so it is true even on the abort path.
+
+ std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
+
+ Message errors;  // Collects all problems so they are reported together.
+
+ std::set<std::string> tests;  // Listed names that are known and not duplicated.
+ for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
+ name_it != name_vec.end(); ++name_it) {
+ const std::string& name = *name_it;
+ if (tests.count(name) != 0) {
+ errors << "Test " << name << " is listed more than once.\n";
+ continue;
+ }
+
+ if (registered_tests_.count(name) != 0) {
+ tests.insert(name);
+ } else {
+ errors << "No test named " << name
+ << " can be found in this test suite.\n";
+ }
+ }
+
+ for (RegisteredTestIter it = registered_tests_.begin();  // Second pass: defined tests missing from the list.
+ it != registered_tests_.end(); ++it) {
+ if (tests.count(it->first) == 0) {
+ errors << "You forgot to list test " << it->first << ".\n";
+ }
+ }
+
+ const std::string& errors_str = errors.GetString();
+ if (errors_str != "") {
+ fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+ errors_str.c_str());
+ fflush(stderr);  // Flushed explicitly before the abort below.
+ posix::Abort();
+ }
+
+ return registered_tests;  // The input pointer is handed back unchanged.
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
new file mode 100644
index 0000000000..6f31dd2260
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
@@ -0,0 +1,6795 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+#include "gtest/gtest.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <chrono> // NOLINT
+#include <cmath>
+#include <cstdint>
+#include <initializer_list>
+#include <iomanip>
+#include <iterator>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream> // NOLINT
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_OS_LINUX
+
+#include <fcntl.h> // NOLINT
+#include <limits.h> // NOLINT
+#include <sched.h> // NOLINT
+// Declares vsnprintf(). This header is not available on Windows.
+#include <strings.h> // NOLINT
+#include <sys/mman.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
+
+#include <string>
+
+#elif GTEST_OS_ZOS
+#include <sys/time.h> // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+#include <strings.h> // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE.
+
+#include <windows.h> // NOLINT
+#undef min
+
+#elif GTEST_OS_WINDOWS // We are on Windows proper.
+
+#include <windows.h> // NOLINT
+#undef min
+
+#ifdef _MSC_VER
+#include <crtdbg.h> // NOLINT
+#endif
+
+#include <io.h> // NOLINT
+#include <sys/stat.h> // NOLINT
+#include <sys/timeb.h> // NOLINT
+#include <sys/types.h> // NOLINT
+
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h> // NOLINT
+#endif // GTEST_OS_WINDOWS_MINGW
+
+#else
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
+
+#endif // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
+#include <sys/socket.h> // NOLINT
+#include <sys/types.h> // NOLINT
+#endif
+
+#include "src/gtest-internal-inl.h"
+
+#if GTEST_OS_WINDOWS
+#define vsnprintf _vsnprintf
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+#include <crt_externs.h>
+#endif
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/debugging/failure_signal_handler.h"
+#include "absl/debugging/stacktrace.h"
+#include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#endif // GTEST_HAS_ABSL
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test suite name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test suite whose name matches this filter is considered a death
+// test suite and will be run before test suites whose name doesn't
+// match this filter.
+static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output format.
+static const char kDefaultOutputFormat[] = "xml";
+// The default output file.
+static const char kDefaultOutputFile[] = "test_detail";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+bool g_help_flag = false;
+
+// Creates the parent directories of output_file and opens it in "w" mode.
+static FILE* OpenFileForWriting(const std::string& output_file) {
+ FILE* fileout = nullptr;
+ FilePath output_file_path(output_file);
+ FilePath output_dir(output_file_path.RemoveFileName());
+
+ if (output_dir.CreateDirectoriesRecursively()) {  // The open is attempted only if the directory step succeeded.
+ fileout = posix::FOpen(output_file.c_str(), "w");
+ }
+ if (fileout == nullptr) {  // Covers both a failed directory step and a failed open.
+ GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\"";  // Presumably terminates the process - confirm in gtest-port.h.
+ }
+ return fileout;
+}
+
+} // namespace internal
+
+// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
+// environment variable; without it the default filter is kUniversalFilter.
+static const char* GetDefaultFilter() {
+ const char* const testbridge_test_only =
+ internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
+ if (testbridge_test_only != nullptr) {
+ return testbridge_test_only;  // Returned as-is, even when it is an empty string.
+ }
+ return kUniversalFilter;
+}
+
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+ const char* const testbridge_test_runner_fail_fast =
+ internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+ if (testbridge_test_runner_fail_fast != nullptr) {
+ return strcmp(testbridge_test_runner_fail_fast, "1") == 0;  // Only the exact string 1 enables fail-fast.
+ }
+ return false;  // Variable unset: fail-fast is off by default.
+}
+
+} // namespace testing
+
+GTEST_DEFINE_bool_(
+ fail_fast,
+ testing::internal::BoolFromGTestEnv("fail_fast",
+ testing::GetDefaultFailFast()),
+ "True if and only if a test failure should stop further test execution.");
+
+GTEST_DEFINE_bool_(
+ also_run_disabled_tests,
+ testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+ "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+ break_on_failure,
+ testing::internal::BoolFromGTestEnv("break_on_failure", false),
+ "True if and only if a failed assertion should be a debugger "
+ "break-point.");
+
+GTEST_DEFINE_bool_(catch_exceptions,
+ testing::internal::BoolFromGTestEnv("catch_exceptions",
+ true),
+ "True if and only if " GTEST_NAME_
+ " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+ color, testing::internal::StringFromGTestEnv("color", "auto"),
+ "Whether to use colors in the output. Valid values: yes, no, "
+ "and auto. 'auto' means to use colors if the output is "
+ "being sent to a terminal and the TERM environment variable "
+ "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+ filter,
+ testing::internal::StringFromGTestEnv("filter",
+ testing::GetDefaultFilter()),
+ "A colon-separated list of glob (not regex) patterns "
+ "for filtering the tests to run, optionally followed by a "
+ "'-' and a : separated list of negative patterns (tests to "
+ "exclude). A test is run if it matches one of the positive "
+ "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(
+ install_failure_signal_handler,
+ testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+ false),
+ "If true and supported on the current platform, " GTEST_NAME_
+ " should "
+ "install a signal handler that dumps debugging information when fatal "
+ "signals are raised.");
+
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
+
+// The net priority order after flag processing is thus:
+// --gtest_output command line flag
+// GTEST_OUTPUT environment variable
+// XML_OUTPUT_FILE environment variable
+// ''
+GTEST_DEFINE_string_(
+ output,
+ testing::internal::StringFromGTestEnv(
+ "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
+ "A format (defaults to \"xml\" but can be specified to be \"json\"), "
+ "optionally followed by a colon and an output file name or directory. "
+ "A directory is indicated by a trailing pathname separator. "
+ "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+ "If a directory is specified, output files will be created "
+ "within that directory, with file-names based on the test "
+ "executable's name and, if necessary, made unique by adding "
+ "digits.");
+
+GTEST_DEFINE_bool_(
+ brief, testing::internal::BoolFromGTestEnv("brief", false),
+ "True if only test failures should be displayed in text output.");
+
+GTEST_DEFINE_bool_(print_time,
+ testing::internal::BoolFromGTestEnv("print_time", true),
+ "True if and only if " GTEST_NAME_
+ " should display elapsed time in text output.");
+
+GTEST_DEFINE_bool_(print_utf8,
+ testing::internal::BoolFromGTestEnv("print_utf8", true),
+ "True if and only if " GTEST_NAME_
+ " prints UTF8 characters as text.");
+
+GTEST_DEFINE_int32_(
+ random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
+ "Random number seed to use when shuffling test orders. Must be in range "
+ "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+ repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
+ "How many times to repeat each test. Specify a negative number "
+ "for repeating forever. Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+ recreate_environments_when_repeating,
+ testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+ false),
+ "Controls whether global test environments are recreated for each repeat "
+ "of the tests. If set to false the global test environments are only set "
+ "up once, for the first iteration, and only torn down once, for the last. "
+ "Useful for shaking out flaky tests with stable, expensive test "
+ "environments. If --gtest_repeat is set to a negative number, meaning "
+ "there is no last run, the environments will always be recreated to avoid "
+ "leaks.");
+
+GTEST_DEFINE_bool_(show_internal_stack_frames, false,
+ "True if and only if " GTEST_NAME_
+ " should include internal stack frames when "
+ "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(shuffle,
+ testing::internal::BoolFromGTestEnv("shuffle", false),
+ "True if and only if " GTEST_NAME_
+ " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+ stack_trace_depth,
+ testing::internal::Int32FromGTestEnv("stack_trace_depth",
+ testing::kMaxStackTraceDepth),
+ "The maximum number of stack frames to print when an "
+ "assertion fails. The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+ stream_result_to,
+ testing::internal::StringFromGTestEnv("stream_result_to", ""),
+ "This flag specifies the host name and the port number on which to stream "
+ "test results. Example: \"localhost:555\". The flag is effective only on "
+ "Linux.");
+
+GTEST_DEFINE_bool_(
+ throw_on_failure,
+ testing::internal::BoolFromGTestEnv("throw_on_failure", false),
+ "When this flag is specified, a failed assertion will throw an exception "
+ "if exceptions are enabled or exit the program with a non-zero code "
+ "otherwise. For use with an external test framework.");
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DEFINE_string_(
+ flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
+ "This flag specifies the flagfile to read command-line flags from.");
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace testing {
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG). Crashes if 'range' is 0 or greater
+// than kMaxRange.
+uint32_t Random::Generate(uint32_t range) {
+ // These constants are the same as are used in glibc's rand(3).
+ // Use wider types than necessary to prevent unsigned overflow diagnostics.
+ state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;  // The state advances before range is validated below.
+
+ GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
+ GTEST_CHECK_(range <= kMaxRange)
+ << "Generation of a number in [0, " << range << ") was requested, "
+ << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+ // Converting via modulus introduces a bit of downward bias, but
+ // it's simple, and a linear congruential generator isn't too good
+ // to begin with.
+ return state_ % range;
+}
+
+// GTestIsInitialized() returns true if and only if the user has initialized
+// Google Test. Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }  // A non-empty saved argv list is the initialization signal.
+
+// Calls the given int-returning const member function on every TestSuite in
+// case_list and returns the sum of the results (0 for an empty list).
+static int SumOverTestSuiteList(const std::vector<TestSuite*>& case_list,
+                                int (TestSuite::*method)() const) {
+  int total = 0;
+  for (const TestSuite* suite : case_list) {
+    total += (suite->*method)();
+  }
+  return total;
+}
+
+// Returns true if and only if the test suite should run and passed.
+static bool TestSuitePassed(const TestSuite* test_suite) {
+  if (!test_suite->should_run()) return false;
+  return test_suite->Passed();
+}
+
+// Returns true if and only if the test suite should run and failed.
+static bool TestSuiteFailed(const TestSuite* test_suite) {
+  if (!test_suite->should_run()) return false;
+  return test_suite->Failed();
+}
+
+// Returns true if and only if test_suite contains at least one test that
+// should run.
+static bool ShouldRunTestSuite(const TestSuite* test_suite) {
+  const bool selected = test_suite->should_run();
+  return selected;
+}
+
+// AssertHelper constructor. Heap-allocates an AssertHelperData that records
+// the result type, the source location (file/line) and the message; the
+// destructor releases that allocation.
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+ int line, const char* message)
+ : data_(new AssertHelperData(type, file, line, message)) {}
+
+// Frees the AssertHelperData allocated by the constructor.
+AssertHelper::~AssertHelper() { delete data_; }
+
+// Message assignment, for assertion streaming support. Combines the stored
+// message with the streamed user message and reports the resulting test part
+// result, together with the current OS stack trace, to the UnitTest singleton.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      // The argument 1 skips the stack frame for this function itself.
+      unit_test->impl()->CurrentOsStackTraceExceptTop(1));  // NOLINT
+}
+
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to create test cases for it, a synthetic test case is
+// inserted to report either an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = true;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true;
+
+// A synthetic test that reports a given message for a given file/line
+// location: as a non-fatal failure when as_error is true, otherwise as a line
+// written to std::cout.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation& loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc),
+        error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (!as_error_) {
+      // Informational mode: only print the message.
+      std::cout << error_message_ << std::endl;
+      return;
+    }
+    // Error mode: attribute a non-fatal failure to the recorded location.
+    AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                 loc_.line, "") = Message() << error_message_;
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+} // namespace
+
+// Returns the set of ignored parameterized test suite names kept by the
+// UnitTest singleton's implementation object.
+std::set<std::string>* GetIgnoredParameterizedTestSuites() {
+  auto* const impl = UnitTest::GetInstance()->impl();
+  return impl->ignored_parameterized_test_suites();
+}
+
+// Adds the given test suite name to the set of parameterized test suites
+// that are allowed to go un-instantiated.
+MarkAsIgnored::MarkAsIgnored(const char* test_suite) {
+  std::set<std::string>* const ignored = GetIgnoredParameterizedTestSuites();
+  ignored->insert(test_suite);
+}
+
+// Registers a synthetic "GoogleTestVerification" test for a parameterized
+// test suite that is incomplete: when has_test_p is true the suite has TEST_P
+// definitions but no instantiation, otherwise it has an instantiation but no
+// TEST_P. Nothing is registered for suites found in the ignored set.
+void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
+                             bool has_test_p) {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.count(name) > 0) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P . No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depend on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  // Pick the explanation matching the kind of incompleteness detected.
+  const char* const explanation =
+      has_test_p ? kMissingInstantiation : kMissingTestCase;
+
+  std::string message = "Parameterized test suite ";
+  message += name;
+  message += explanation;
+  message +=
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(";
+  message += name;
+  message += ");";
+
+  const std::string full_name =
+      "UninstantiatedParameterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        // The factory copies the message and location into the new test.
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+// Forwards a type-parameterized test suite's name and code location to the
+// registry owned by the UnitTestImpl.
+void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+                                        CodeLocation code_location) {
+  auto& registry = GetUnitTestImpl()->type_parameterized_test_registry();
+  registry.RegisterTestSuite(test_suite_name, code_location);
+}
+
+// Forwards the name of an instantiated type-parameterized test suite to the
+// registry owned by the UnitTestImpl.
+void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
+  auto& registry = GetUnitTestImpl()->type_parameterized_test_registry();
+  registry.RegisterInstantiation(case_name);
+}
+
+// Records a type-parameterized test suite under its name together with the
+// code location where it was defined. emplace() leaves an entry that already
+// exists for that name untouched.
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char* test_suite_name, CodeLocation code_location) {
+  std::string key(test_suite_name);
+  suites_.emplace(std::move(key),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+// Marks the named type-parameterized test suite as instantiated. Logs an
+// error if no suite of that name has been registered.
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char* test_suite_name) {
+  const auto found = suites_.find(std::string(test_suite_name));
+  if (found == suites_.end()) {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '"
+                      << test_suite_name << "'";
+    return;
+  }
+  found->second.instantiated = true;
+}
+
+// Registers a synthetic "GoogleTestVerification" test for every registered
+// type-parameterized test suite that was never instantiated and is not in
+// the ignored set.
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto& entry : suites_) {
+    const std::string& suite_name = entry.first;
+    const bool needs_report =
+        !entry.second.instantiated && ignored.count(suite_name) == 0;
+    if (!needs_report) continue;
+
+    std::string message = "Type parameterized test suite ";
+    message += suite_name;
+    message +=
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is defined in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(";
+    message += suite_name;
+    message += ");";
+
+    const std::string full_name =
+        "UninstantiatedTypeParameterizedTestSuite<" + suite_name + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        entry.second.code_location.file.c_str(),
+        entry.second.code_location.line, [message, entry] {
+          // The factory owns copies of the message and the registry entry.
+          return new FailureTest(entry.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
+}
+
+// A copy of all command line arguments. Set by InitGoogleTest().
+static ::std::vector<std::string> g_argvs;
+
+// Returns the command-line arguments by value. When GTEST_CUSTOM_GET_ARGVS_
+// is defined the arguments come from that hook (converted element-wise to
+// std::string) and g_argvs is not consulted; otherwise a copy of g_argvs is
+// returned.
+::std::vector<std::string> GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+ // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+ // ::string. This code converts it to the appropriate type.
+ const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
+ return ::std::vector<std::string>(custom.begin(), custom.end());
+#else // defined(GTEST_CUSTOM_GET_ARGVS_)
+ return g_argvs;
+#endif // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present. On Windows and OS/2 a trailing "exe" extension is removed as
+// well. If no command-line arguments are available, the name is derived from
+// an empty FilePath instead of reading a non-existent argument.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+  // GetArgvs() may be empty (e.g. when InitGoogleTest() has not stored argv
+  // yet). Indexing element 0 of an empty vector is undefined behavior, so
+  // only read it when it exists.
+  const ::std::vector<std::string> args = GetArgvs();
+  if (!args.empty()) {
+#if GTEST_OS_WINDOWS || GTEST_OS_OS2
+    result.Set(FilePath(args[0]).RemoveExtension("exe"));
+#else
+    result.Set(FilePath(args[0]));
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_OS2
+  }
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output. The format is
+// the part of the gtest_output flag that precedes the first ':' (the whole
+// flag value when there is no ':').
+std::string UnitTestOptions::GetOutputFormat() {
+  const std::string flag_value = GTEST_FLAG_GET(output);
+  const char* const flag_cstr = flag_value.c_str();
+  const char* const separator = strchr(flag_cstr, ':');
+
+  if (separator == nullptr) {
+    // No ':' present: the entire flag value names the format.
+    return std::string(flag_cstr);
+  }
+  // Keep only the characters in front of the first ':'.
+  return std::string(flag_cstr, static_cast<size_t>(separator - flag_cstr));
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+//
+// The gtest_output flag is read as "<format>[:<path>]":
+//  - no ':'          -> a file name built from the original working
+//                       directory, kDefaultOutputFile and the format;
+//  - relative <path> -> resolved against the original working directory;
+//  - <path> that is not a directory -> returned as is;
+//  - <path> that is a directory     -> a unique file name inside it, based on
+//                       the current executable's name.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
+
+ // Fall back to the default format when the flag does not name one.
+ std::string format = GetOutputFormat();
+ if (format.empty()) format = std::string(kDefaultOutputFormat);
+
+ const char* const colon = strchr(gtest_output_flag, ':');
+ // No path component: use the default file name in the original working dir.
+ if (colon == nullptr)
+ return internal::FilePath::MakeFileName(
+ internal::FilePath(
+ UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+ .string();
+
+ // Everything after the first ':' is the requested path.
+ internal::FilePath output_name(colon + 1);
+ if (!output_name.IsAbsolutePath())
+ output_name = internal::FilePath::ConcatPaths(
+ internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(colon + 1));
+
+ if (!output_name.IsDirectory()) return output_name.string();
+
+ // NOTE(review): this re-queries GetOutputFormat() rather than using
+ // `format`, so the kDefaultOutputFormat fallback applied above is not used
+ // on this path -- confirm whether that is intended.
+ internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+ output_name, internal::GetCurrentExecutableName(),
+ GetOutputFormat().c_str()));
+ return result.string();
+}
+
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
+//
+// The pattern is the half-open character range [pattern, pattern_end); it
+// does not need to be NUL-terminated. The whole of name_str must be consumed
+// for a match.
+//
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
+static bool PatternMatchesString(const std::string& name_str,
+ const char* pattern, const char* pattern_end) {
+ const char* name = name_str.c_str();
+ const char* const name_begin = name;
+ const char* const name_end = name + name_str.size();
+
+ // Restart point recorded at the most recent '*': the position of that '*'
+ // in the pattern and the name position to retry from. name_next starts
+ // equal to name_begin, which the restart test below treats as "no '*' seen".
+ const char* pattern_next = pattern;
+ const char* name_next = name;
+
+ while (pattern < pattern_end || name < name_end) {
+ if (pattern < pattern_end) {
+ switch (*pattern) {
+ default: // Match an ordinary character.
+ if (name < name_end && *name == *pattern) {
+ ++pattern;
+ ++name;
+ continue;
+ }
+ break;
+ case '?': // Match any single character.
+ if (name < name_end) {
+ ++pattern;
+ ++name;
+ continue;
+ }
+ break;
+ case '*':
+ // Match zero or more characters. Start by skipping over the wildcard
+ // and matching zero characters from name. If that fails, restart and
+ // match one more character than the last attempt.
+ pattern_next = pattern;
+ name_next = name + 1;
+ ++pattern;
+ continue;
+ }
+ }
+ // Failed to match a character. Restart if possible.
+ // A restart is possible only if a '*' was seen (name_next moved past
+ // name_begin) and the retry position is still within the name.
+ if (name_begin < name_next && name_next <= name_end) {
+ pattern = pattern_next;
+ name = name_next;
+ continue;
+ }
+ return false;
+ }
+ // Both the pattern and the name were fully consumed.
+ return true;
+}
+
+namespace {
+
+// Returns true if and only if the pattern contains at least one wildcard
+// character ('?' or '*').
+bool IsGlobPattern(const std::string& pattern) {
+  return pattern.find_first_of("?*") != std::string::npos;
+}
+
+// A set of name patterns. Patterns containing a wildcard ('?' or '*') are
+// matched as globs; all other patterns are matched by exact comparison.
+class UnitTestFilter {
+ public:
+  UnitTestFilter() = default;
+
+  // Constructs a filter from a string of patterns separated by `:`.
+  explicit UnitTestFilter(const std::string& filter) {
+    // By design "" filter matches "" string.
+    std::vector<std::string> patterns;
+    SplitString(filter, ':', &patterns);
+
+    // Reorder so that glob patterns come first; `first_exact` marks where
+    // the exact-match patterns begin.
+    const auto first_exact =
+        std::partition(patterns.begin(), patterns.end(), &IsGlobPattern);
+
+    const auto glob_count = std::distance(patterns.begin(), first_exact);
+    glob_patterns_.reserve(static_cast<size_t>(glob_count));
+    std::move(patterns.begin(), first_exact,
+              std::inserter(glob_patterns_, glob_patterns_.begin()));
+    std::move(
+        first_exact, patterns.end(),
+        std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+  }
+
+  // Returns true if and only if name matches at least one of the patterns in
+  // the filter. The exact-match set is consulted before any glob is tried.
+  bool MatchesName(const std::string& name) const {
+    if (exact_match_patterns_.count(name) > 0) return true;
+
+    for (const std::string& glob : glob_patterns_) {
+      const char* const glob_begin = glob.c_str();
+      if (PatternMatchesString(name, glob_begin, glob_begin + glob.size())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  std::vector<std::string> glob_patterns_;
+  std::unordered_set<std::string> exact_match_patterns_;
+};
+
+// Pairs a positive filter with a negative filter: a name is accepted when it
+// matches the former and does not match the latter.
+class PositiveAndNegativeUnitTestFilter {
+ public:
+  // Constructs a positive and a negative filter from a string. The string
+  // contains a positive filter optionally followed by a '-' character and a
+  // negative filter. In case only a negative filter is provided the positive
+  // filter will be assumed "*".
+  // A filter is a list of patterns separated by ':'.
+  explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+    std::vector<std::string> parts;
+
+    // NOTE: `SplitString` always returns a non-empty container.
+    SplitString(filter, '-', &parts);
+    const std::string& positive_part = parts.front();
+
+    if (parts.size() <= 1) {
+      // In case we don't have a negative filter and positive filter is ""
+      // we do not use kUniversalFilter by design as opposed to when we have a
+      // negative filter.
+      positive_filter_ = UnitTestFilter(positive_part);
+      return;
+    }
+
+    positive_filter_ = UnitTestFilter(
+        positive_part.empty() ? kUniversalFilter : positive_part);
+
+    // TODO(b/214626361): Fail on multiple '-' characters
+    // For the moment to preserve old behavior we concatenate the rest of the
+    // string parts with `-` as separator to generate the negative filter.
+    std::string negative_part = parts[1];
+    for (std::size_t i = 2; i < parts.size(); ++i) {
+      negative_part += '-';
+      negative_part += parts[i];
+    }
+    negative_filter_ = UnitTestFilter(negative_part);
+  }
+
+  // Returns true if and only if the full test name (the test suite name and
+  // the test name joined by a '.' character) matches the positive filter and
+  // does not match the negative filter.
+  bool MatchesTest(const std::string& test_suite_name,
+                   const std::string& test_name) const {
+    const std::string full_name = test_suite_name + "." + test_name;
+    return MatchesName(full_name);
+  }
+
+  // Returns true if and only if name matches the positive filter and does not
+  // match the negative filter.
+  bool MatchesName(const std::string& name) const {
+    if (!positive_filter_.MatchesName(name)) return false;
+    return !negative_filter_.MatchesName(name);
+  }
+
+ private:
+  UnitTestFilter positive_filter_;
+  UnitTestFilter negative_filter_;
+};
+} // namespace
+
+// Returns true if and only if name_str matches at least one of the
+// ':'-separated patterns in filter.
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  const UnitTestFilter unit_filter(filter);
+  return unit_filter.MatchesName(name_str);
+}
+
+// Returns true if and only if the user-specified filter matches the test
+// suite name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
+                                        const std::string& test_name) {
+  // The --gtest_filter value is split at '-', if there is one, into its
+  // positive and negative filter portions.
+  const PositiveAndNegativeUnitTestFilter user_filter(GTEST_FLAG_GET(filter));
+  return user_filter.MatchesTest(test_suite_name, test_name);
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  const bool should_handle = GTEST_FLAG_GET(catch_exceptions) &&
+                             exception_code != EXCEPTION_BREAKPOINT &&
+                             exception_code != kCxxExceptionCode;
+
+  if (should_handle) return EXCEPTION_EXECUTE_HANDLER;
+  return EXCEPTION_CONTINUE_SEARCH;
+}
+#endif // GTEST_HAS_SEH
+
+} // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread
+// (INTERCEPT_ONLY_CURRENT_THREAD). The actual installation happens in Init().
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+ TestPartResultArray* result)
+ : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
+ Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results; 'intercept_mode' selects whether the reporter is installed for
+// all threads or only for the current thread (see Init()).
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+ InterceptMode intercept_mode, TestPartResultArray* result)
+ : intercept_mode_(intercept_mode), result_(result) {
+ Init();
+}
+
+// Installs this object as the active test part result reporter and remembers
+// the reporter it replaces, so that the destructor can restore it. Which
+// reporter slot is used (global or current-thread) depends on
+// intercept_mode_.
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    // Current-thread interception.
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+    return;
+  }
+  // All-threads interception: replace the global reporter.
+  old_reporter_ = impl->GetGlobalTestPartResultReporter();
+  impl->SetGlobalTestPartResultReporter(this);
+}
+
+// The d'tor restores the test part result reporter that was in use by
+// Google Test before this object was installed.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+    return;
+  }
+  impl->SetGlobalTestPartResultReporter(old_reporter_);
+}
+
+// Remembers the result by appending it to the TestPartResultArray supplied
+// at construction (which thereby grows by one entry).
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test. We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test. This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X. The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code. GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library. This is solely for testing GetTestTypeId().
+// It is initialized once, during static initialization of this file.
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains exactly one test
+// part result, that it has the given type, and that its message contains the
+// given substring. The three expression-text parameters are unused.
+static AssertionResult HasOneFailure(const char* /* results_expr */,
+                                     const char* /* type_expr */,
+                                     const char* /* substr_expr */,
+                                     const TestPartResultArray& results,
+                                     TestPartResult::Type type,
+                                     const std::string& substr) {
+  const bool want_fatal = (type == TestPartResult::kFatalFailure);
+  const std::string expected(want_fatal ? "1 fatal failure"
+                                        : "1 non-fatal failure");
+
+  // Wrong number of results: list every result that was recorded.
+  if (results.size() != 1) {
+    Message msg;
+    msg << "Expected: " << expected << "\n"
+        << " Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& only = results.GetTestPartResult(0);
+
+  // Exactly one result, but of the wrong type.
+  if (only.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << " Actual:\n"
+                              << only;
+  }
+
+  // Right type, but the message lacks the expected substring.
+  const bool has_substr = strstr(only.message(), substr.c_str()) != nullptr;
+  if (!has_substr) {
+    return AssertionFailure()
+           << "Expected: " << expected << " containing \"" << substr << "\"\n"
+           << " Actual:\n"
+           << only;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain. No checking happens here;
+// the check is performed by the destructor.
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
+ TestPartResult::Type type,
+ const std::string& substr)
+ : results_(results), type_(type), substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring. If that's not the case, a
+// non-fatal failure will be generated (via EXPECT_PRED_FORMAT3 with
+// HasOneFailure as the predicate-formatter).
+SingleFailureChecker::~SingleFailureChecker() {
+ EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+// Stores the UnitTestImpl that reported results are forwarded to.
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
+
+// Adds the result to the current test's result and then notifies the event
+// listeners through the repeater's OnTestPartResult().
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ unit_test_->current_test_result()->AddTestPartResult(result);
+ unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+// Stores the UnitTestImpl whose global reporter results are forwarded to.
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
+
+// Forwards the result to whatever global test part result reporter is
+// installed on the UnitTestImpl at the time of the call.
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter. The read is performed while
+// holding global_test_part_result_reporter_mutex_.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+ internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+ return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter. The write is performed while
+// holding global_test_part_result_reporter_mutex_.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+ TestPartResultReporterInterface* reporter) {
+ internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+ global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread, as held by
+// per_thread_test_part_result_reporter_.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+ return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+ TestPartResultReporterInterface* reporter) {
+ per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test suites, i.e. suites for which
+// TestSuitePassed() holds (should run and passed).
+int UnitTestImpl::successful_test_suite_count() const {
+ return CountIf(test_suites_, TestSuitePassed);
+}
+
+// Gets the number of failed test suites, i.e. suites for which
+// TestSuiteFailed() holds (should run and failed).
+int UnitTestImpl::failed_test_suite_count() const {
+ return CountIf(test_suites_, TestSuiteFailed);
+}
+
+// Gets the number of all test suites, whether or not they should run.
+int UnitTestImpl::total_test_suite_count() const {
+ return static_cast<int>(test_suites_.size());
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTestImpl::test_suite_to_run_count() const {
+ return CountIf(test_suites_, ShouldRunTestSuite);
+}
+
+// Each accessor below returns the sum, over all test suites, of the
+// TestSuite method of the same name.
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count);
+}
+
+// Gets the number of skipped tests.
+int UnitTestImpl::skipped_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+ return SumOverTestSuiteList(test_suites_,
+ &TestSuite::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+ return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// At most gtest_stack_trace_depth frames are included. skip_count is the
+// number of top frames to leave out; skipped frames do not count against the
+// number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  const int max_depth = static_cast<int>(GTEST_FLAG_GET(stack_trace_depth));
+  // Skip the user-specified number of frames plus this function itself.
+  const int frames_to_skip = skip_count + 1;
+  return os_stack_trace_getter()->CurrentStackTrace(max_depth, frames_to_skip);
+}
+
+// A helper class for measuring elapsed times. The start point is taken from
+// std::chrono::steady_clock when the object is constructed.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  // Return time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    const auto since_start = std::chrono::steady_clock::now() - start_;
+    const auto whole_millis =
+        std::chrono::duration_cast<std::chrono::milliseconds>(since_start);
+    return whole_millis.count();
+  }
+
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
+
+// Returns a timestamp as milliseconds since the epoch, computed from
+// std::chrono::system_clock. Note this time may jump around subject to
+// adjustments by the system; to measure elapsed time use Timer instead.
+TimeInMillis GetTimeInMillis() {
+  const auto epoch = std::chrono::system_clock::from_time_t(0);
+  const auto since_epoch = std::chrono::system_clock::now() - epoch;
+  const auto whole_millis =
+      std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch);
+  return whole_millis.count();
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+ if (!ansi) return nullptr;
+ const int length = strlen(ansi);
+ // First call, with no output buffer: its return value is used as the
+ // number of wide characters to allocate.
+ const int unicode_length =
+ MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
+ // One extra element for the terminator written below.
+ WCHAR* unicode = new WCHAR[unicode_length + 1];
+ // Second call performs the actual conversion into the new buffer.
+ MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
+ unicode[unicode_length] = 0;
+ return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+ if (!utf16_str) return nullptr;
+ // First call, with no output buffer: its return value is used as the
+ // number of bytes to allocate. The input length is passed as -1.
+ const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
+ 0, nullptr, nullptr);
+ // One extra byte for the terminator written below.
+ char* ansi = new char[ansi_length + 1];
+ // Second call performs the actual conversion into the new buffer.
+ WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr,
+ nullptr);
+ ansi[ansi_length] = 0;
+ return ansi;
+}
+
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings. Returns true if and only if they have the same
+// content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s). Two NULL
+// arguments compare equal; a NULL C string is considered different to any
+// non-NULL C string, including the empty string.
+bool String::CStringEquals(const char* lhs, const char* rhs) {
+  // With a NULL on either side, the strings are equal only if both are NULL.
+  if (lhs == nullptr || rhs == nullptr) return lhs == rhs;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+//
+// The input is processed as alternating runs: each run of non-NUL wide
+// characters is converted with WideStringToUtf8(), and each embedded L'\0'
+// is streamed as a single '\0' character.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+ Message* msg) {
+ for (size_t i = 0; i != length;) { // NOLINT
+ if (wstr[i] != L'\0') {
+ // NOTE(review): the remaining length is passed, yet i is only advanced
+ // to the next L'\0' -- presumably WideStringToUtf8 stops converting at
+ // the first NUL; confirm against its definition.
+ *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+ while (i != length && wstr[i] != L'\0') i++;
+ } else {
+ *msg << '\0';
+ i++;
+ }
+ }
+}
+
+#endif // GTEST_HAS_STD_WSTRING
+
+// Splits str at every occurrence of delimiter and stores the pieces in
+// *dest, replacing its previous contents. Empty pieces are kept, so the
+// result always holds at least one element (a single "" for an empty str).
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest) {
+  ::std::vector< ::std::string> pieces;
+  ::std::string::size_type piece_start = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type delim_pos = str.find(delimiter, piece_start);
+    if (delim_pos == ::std::string::npos) {
+      // Last piece: everything from piece_start to the end of str.
+      pieces.push_back(str.substr(piece_start));
+      break;
+    }
+    pieces.push_back(str.substr(piece_start, delim_pos - piece_start));
+    piece_start = delim_pos + 1;
+  }
+  dest->swap(pieces);
+}
+
+} // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+ // By default, we want there to be enough precision when printing
+ // a double to a Message: the stream precision is set to
+ // numeric_limits<double>::digits10 + 2 significant digits.
+ *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message
+// using the UTF-8 encoding. Both delegate to String::ShowWideCString() and
+// stream its result.
+Message& Message::operator<<(const wchar_t* wide_c_str) {
+ return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator<<(wchar_t* wide_c_str) {
+ return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object. The full
+// wstr.length() characters are passed on, not just those before a NUL.
+Message& Message::operator<<(const ::std::wstring& wstr) {
+ internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+ return *this;
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+// The conversion itself is done by internal::StringStreamToString().
+std::string Message::GetString() const {
+ return internal::StringStreamToString(ss_.get());
+}
+
+namespace internal {
+
+namespace edit_distance {
+// Computes a lowest-cost sequence of edits (kMatch, kAdd, kRemove, kReplace)
+// that turns `left` into `right`, using dynamic programming over a
+// (left.size() + 1) x (right.size() + 1) table. The returned vector lists the
+// edits in order from the start of the sequences to the end.
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+ const std::vector<size_t>& right) {
+ // costs[l][r] is the cost of editing the first l elements of left into the
+ // first r elements of right; best_move[l][r] is the last edit on that path.
+ std::vector<std::vector<double> > costs(
+ left.size() + 1, std::vector<double>(right.size() + 1));
+ std::vector<std::vector<EditType> > best_move(
+ left.size() + 1, std::vector<EditType>(right.size() + 1));
+
+ // Populate for empty right.
+ for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
+ costs[l_i][0] = static_cast<double>(l_i);
+ best_move[l_i][0] = kRemove;
+ }
+ // Populate for empty left.
+ for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
+ costs[0][r_i] = static_cast<double>(r_i);
+ best_move[0][r_i] = kAdd;
+ }
+
+ for (size_t l_i = 0; l_i < left.size(); ++l_i) {
+ for (size_t r_i = 0; r_i < right.size(); ++r_i) {
+ if (left[l_i] == right[r_i]) {
+ // Found a match. Consume it.
+ costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
+ best_move[l_i + 1][r_i + 1] = kMatch;
+ continue;
+ }
+
+ // Costs of the three neighbouring cells an edit can extend from.
+ const double add = costs[l_i + 1][r_i];
+ const double remove = costs[l_i][r_i + 1];
+ const double replace = costs[l_i][r_i];
+ // Add or remove is chosen only when strictly cheaper than both other
+ // options; every tie falls through to replace.
+ if (add < remove && add < replace) {
+ costs[l_i + 1][r_i + 1] = add + 1;
+ best_move[l_i + 1][r_i + 1] = kAdd;
+ } else if (remove < add && remove < replace) {
+ costs[l_i + 1][r_i + 1] = remove + 1;
+ best_move[l_i + 1][r_i + 1] = kRemove;
+ } else {
+ // We make replace a little more expensive than add/remove to lower
+ // their priority.
+ costs[l_i + 1][r_i + 1] = replace + 1.00001;
+ best_move[l_i + 1][r_i + 1] = kReplace;
+ }
+ }
+ }
+
+ // Reconstruct the best path. We do it in reverse order.
+ std::vector<EditType> best_path;
+ for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
+ EditType move = best_move[l_i][r_i];
+ best_path.push_back(move);
+ // kAdd consumes only a right element and kRemove only a left element;
+ // kMatch and kReplace consume one of each.
+ l_i -= move != kAdd;
+ r_i -= move != kRemove;
+ }
+ std::reverse(best_path.begin(), best_path.end());
+ return best_path;
+}
+
+namespace {
+
+// Maps strings to numeric ids; equal strings always receive the same id.
+// Ids are handed out in order of first appearance, starting at 0.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string& str) {
+    // insert() leaves an existing entry untouched, so the id stored on the
+    // first sighting of `str` is the one returned on every later call.
+    const std::pair<IdMap::iterator, bool> slot =
+        ids_.insert(IdMap::value_type(str, ids_.size()));
+    return slot.first->second;
+  }
+
+ private:
+  typedef std::map<std::string, size_t> IdMap;
+  IdMap ids_;
+};
+
+} // namespace
+
+// String overload: converts both inputs to id sequences through one shared
+// InternalStrings table (so equal strings compare equal as ids) and then
+// defers to the size_t overload.
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
+  std::vector<size_t> left_ids;
+  std::vector<size_t> right_ids;
+  {
+    InternalStrings intern_table;
+    for (const std::string& line : left) {
+      left_ids.push_back(intern_table.GetId(line));
+    }
+    for (const std::string& line : right) {
+      right_ids.push_back(intern_table.GetId(line));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Holds the state of one diff hunk and prints it to a stream.
+// Added and removed lines are buffered separately and appended to the hunk
+// (removes first, then adds) whenever a common line arrives or the hunk is
+// printed, so that removes are grouped before adds. The hunk header is
+// written before the body when printing.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start),
+        right_start_(right_start),
+        adds_(0),
+        removes_(0),
+        common_(0) {}
+
+  // Records one line. `edit` is ' ' (common), '-' (removed) or '+' (added);
+  // any other value is ignored. Only the pointer is stored, so `line` must
+  // stay valid until the hunk has been printed.
+  void PushLine(char edit, const char* line) {
+    if (edit == ' ') {
+      ++common_;
+      FlushEdits();
+      hunk_.push_back(Line(' ', line));
+    } else if (edit == '-') {
+      ++removes_;
+      hunk_removes_.push_back(Line('-', line));
+    } else if (edit == '+') {
+      ++adds_;
+      hunk_adds_.push_back(Line('+', line));
+    }
+  }
+
+  // Writes the header followed by every line of the hunk, one per row.
+  void PrintTo(std::ostream* os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (const Line& entry : hunk_) {
+      *os << entry.first << entry.second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ != 0 || removes_ != 0; }
+
+ private:
+  typedef std::pair<char, const char*> Line;
+  typedef std::list<Line> LineList;
+
+  // Moves the buffered removes, then the buffered adds, to the end of the
+  // hunk body. Both buffers are empty afterwards.
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream* ss) const {
+    const bool has_removes = removes_ != 0;
+    const bool has_adds = adds_ != 0;
+    *ss << "@@ ";
+    if (has_removes) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (has_removes && has_adds) {
+      *ss << " ";
+    }
+    if (has_adds) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  LineList hunk_, hunk_adds_, hunk_removes_;
+};
+
+} // namespace
+
+// Builds the text of a unified diff between 'left' and 'right'.
+// Each hunk has a header generated by Hunk::PrintHeader plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' is the maximum number of unchanged lines kept before and after each edit.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk. Returns an empty string when there are no edits.
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+ const std::vector<std::string>& right,
+ size_t context) {
+ const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+ size_t l_i = 0, r_i = 0, edit_i = 0; // Cursors into left, right and edits.
+ std::stringstream ss;
+ while (edit_i < edits.size()) {
+ // Skip the run of matching lines that precedes the next edit.
+ while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+ ++l_i;
+ ++r_i;
+ ++edit_i;
+ }
+
+ // Find the first line to include in the hunk: up to 'context' of the matching lines just skipped.
+ const size_t prefix_context = std::min(l_i, context);
+ Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); // Start positions are 1-based.
+ for (size_t i = prefix_context; i > 0; --i) {
+ hunk.PushLine(' ', left[l_i - i].c_str());
+ }
+
+ // Iterate the edits until we have found enough suffix for the hunk or the
+ // input is over.
+ size_t n_suffix = 0; // Consecutive matching lines seen since the last edit.
+ for (; edit_i < edits.size(); ++edit_i) {
+ if (n_suffix >= context) {
+ // Enough trailing context collected; continue only if the next edit is very close.
+ auto it = edits.begin() + static_cast<int>(edit_i);
+ while (it != edits.end() && *it == kMatch) ++it;
+ if (it == edits.end() ||
+ static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
+ // There is no next edit or it is too far away.
+ break;
+ }
+ }
+
+ EditType edit = edits[edit_i];
+ // Reset count when a non match is found.
+ n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+ // A replace is emitted as a removal of the left line plus an addition of the right line.
+ if (edit == kMatch || edit == kRemove || edit == kReplace) {
+ hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+ }
+ if (edit == kAdd || edit == kReplace) {
+ hunk.PushLine('+', right[r_i].c_str());
+ }
+
+ // Advance indices, depending on edit type.
+ l_i += edit != kAdd;
+ r_i += edit != kRemove;
+ }
+
+ if (!hunk.has_edits()) {
+ // We are done. We don't want this hunk (it contains no add or remove).
+ break;
+ }
+
+ hunk.PrintTo(&ss);
+ }
+ return ss.str();
+}
+
+} // namespace edit_distance
+
+namespace {
+
+// Splits an already-escaped value string (as passed to EqFailure()) into
+// lines at each escaped newline, i.e. the two characters '\\' 'n'. If the
+// string is longer than two characters and wrapped in double quotes, the
+// quotes are dropped first. All other escape sequences are left untouched.
+// The result always holds at least one element.
+std::vector<std::string> SplitEscapedString(const std::string& str) {
+  std::vector<std::string> lines;
+  size_t line_begin = 0;
+  size_t limit = str.size();
+  const bool quoted = limit > 2 && str[0] == '"' && str[limit - 1] == '"';
+  if (quoted) {
+    ++line_begin;
+    --limit;
+  }
+
+  bool after_backslash = false;
+  for (size_t pos = line_begin; pos + 1 < limit; ++pos) {
+    const char ch = str[pos];
+    if (!after_backslash) {
+      after_backslash = (ch == '\\');
+      continue;
+    }
+    after_backslash = false;
+    if (ch == 'n') {
+      // The line ends just before the backslash that precedes this 'n'.
+      lines.push_back(str.substr(line_begin, pos - line_begin - 1));
+      line_begin = pos + 1;
+    }
+  }
+  lines.push_back(str.substr(line_begin, limit - line_begin));
+  return lines;
+}
+
+} // namespace
+
+// Builds the failure result for an equality assertion (e.g. ASSERT_EQ,
+// EXPECT_STREQ).
+//
+// lhs_expression / rhs_expression are the source texts of the two operands
+// and lhs_value / rhs_value their printed values. For ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6:
+//
+//   lhs_expression: "foo"
+//   rhs_expression: "bar"
+//   lhs_value:      "5"
+//   rhs_value:      "6"
+//
+// A value is only printed when it differs from its expression text. When
+// ignoring_case is true (the *_STRCASEEQ* assertions), "Ignoring case" is
+// added to the message. If both values are non-empty and at least one of
+// them spans several escaped lines, a unified diff is appended.
+AssertionResult EqFailure(const char* lhs_expression,
+                          const char* rhs_expression,
+                          const std::string& lhs_value,
+                          const std::string& rhs_value, bool ignoring_case) {
+  Message msg;
+  msg << "Expected equality of these values:"
+      << "\n " << lhs_expression;
+  if (lhs_value != lhs_expression) {
+    msg << "\n Which is: " << lhs_value;
+  }
+  msg << "\n " << rhs_expression;
+  if (rhs_value != rhs_expression) {
+    msg << "\n Which is: " << rhs_value;
+  }
+
+  if (ignoring_case) {
+    msg << "\nIgnoring case";
+  }
+
+  const bool both_non_empty = !lhs_value.empty() && !rhs_value.empty();
+  if (both_non_empty) {
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
+    const bool multi_line = lhs_lines.size() > 1 || rhs_lines.size() > 1;
+    if (multi_line) {
+      msg << "\nWith diff:\n"
+          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+    }
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Builds the failure text for Boolean assertions such as EXPECT_TRUE: the
+// expression, its actual value (followed by the assertion result's own
+// message in parentheses when that message is non-empty) and the expected
+// value.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value) {
+  const char* const detail = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text;
+  msg << "\n Actual: " << actual_predicate_value;
+  if (*detail != '\0') {
+    msg << " (" << detail << ")";
+  }
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+// Succeeds when |val1 - val2| <= abs_error. Otherwise returns a failure that
+// reports both operands and the tolerance. A more detailed message is used
+// when the (positive) tolerance is smaller than the gap between adjacent
+// doubles at the operands' magnitude.
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+                                     const char* abs_error_expr, double val1,
+                                     double val2, double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) {
+    return AssertionSuccess();
+  }
+
+  // Distance from the operand closest to zero to the next larger double.
+  const double smaller_magnitude = std::min(fabs(val1), fabs(val2));
+  const double next_double =
+      nextafter(smaller_magnitude, std::numeric_limits<double>::infinity());
+  const double epsilon = next_double - smaller_magnitude;
+
+  // The detailed explanation only applies to non-NaN operands and a strictly
+  // positive tolerance; a zero tolerance implies an equality check was
+  // actually intended.
+  const bool has_nan = (std::isnan)(val1) || (std::isnan)(val2);
+  const bool tolerance_below_resolution = abs_error > 0 && abs_error < epsilon;
+  if (has_nan || !tolerance_below_resolution) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", which exceeds " << abs_error_expr << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ", and\n"
+           << abs_error_expr << " evaluates to " << abs_error << ".";
+  }
+
+  return AssertionFailure()
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+         << abs_error_expr << " evaluates to " << abs_error
+         << " which is smaller than the minimum distance between doubles for "
+            "numbers of this magnitude which is "
+         << epsilon
+         << ", thus making this EXPECT_NEAR check equivalent to "
+            "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+}
+
+// Helper template for implementing FloatLE() and DoubleLE().
+// Succeeds when val1 < val2 or when the two values are almost equal
+// according to FloatingPoint<RawType>::AlmostEquals(); otherwise returns a
+// failure that shows both values with digits10 + 2 significant digits.
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+                                RawType val1, RawType val2) {
+  typedef FloatingPoint<RawType> Float;
+
+  // Both tests are false when either value is NaN, because the IEEE
+  // floating-point standard requires any predicate involving a NaN to
+  // return false; NaN operands therefore fall through to the failure.
+  if (val1 < val2 || Float(val1).AlmostEquals(Float(val2))) {
+    return AssertionSuccess();
+  }
+
+  const int precision = std::numeric_limits<RawType>::digits10 + 2;
+
+  ::std::stringstream val1_text;
+  val1_text << std::setprecision(precision) << val1;
+
+  ::std::stringstream val2_text;
+  val2_text << std::setprecision(precision) << val2;
+
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << " Actual: " << StringStreamToString(&val1_text) << " vs "
+         << StringStreamToString(&val2_text);
+}
+
+} // namespace internal
+
+// Float flavour of the "less than or almost equal" predicate-formatter.
+// Forwards to internal::FloatingPointLE<float>, so it fails when val1 is
+// neither less than nor almost equal to val2 (which includes NaN operands).
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+                        float val2) {
+  AssertionResult result =
+      internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+  return result;
+}
+
+// Double flavour of the "less than or almost equal" predicate-formatter.
+// Forwards to internal::FloatingPointLE<double>, so it fails when val1 is
+// neither less than nor almost equal to val2 (which includes NaN operands).
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+                         double val2) {
+  AssertionResult result =
+      internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+  return result;
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+// Compares with String::CStringEquals() and reports a mismatch via
+// EqFailure() using the printed form of both strings.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression, const char* lhs,
+                               const char* rhs) {
+  if (!String::CStringEquals(lhs, rhs)) {
+    return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                     PrintToString(rhs), false);
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+// Compares with String::CaseInsensitiveCStringEquals() and reports a
+// mismatch via EqFailure() with the ignoring_case flag set.
+AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
+                                   const char* rhs_expression, const char* lhs,
+                                   const char* rhs) {
+  if (!String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+    return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                     PrintToString(rhs), true);
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+// Fails, quoting both strings, when String::CStringEquals() reports them
+// equal.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression, const char* s1,
+                               const char* s2) {
+  if (String::CStringEquals(s1, s2)) {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+// Fails, quoting both strings, when
+// String::CaseInsensitiveCStringEquals() reports them equal.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression, const char* s1,
+                                   const char* s2) {
+  if (String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+} // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubstring() and IsNotSubstring().
+
+// This group of overloaded functions return true if and only if needle
+// is a substring of haystack. NULL is considered a substring of
+// itself only.
+
+// Narrow C string overload, searching with strstr().
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  const bool any_null = (needle == nullptr) || (haystack == nullptr);
+  return any_null ? needle == haystack
+                  : strstr(haystack, needle) != nullptr;
+}
+
+// Wide C string overload, searching with wcsstr(). A null pointer is a
+// substring of a null pointer only.
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  const bool any_null = (needle == nullptr) || (haystack == nullptr);
+  return any_null ? needle == haystack
+                  : wcsstr(haystack, needle) != nullptr;
+}
+
+// String-object overload. StringType here can be either ::std::string or
+// ::std::wstring; the search uses the string's own find().
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
+  const typename StringType::size_type position = haystack.find(needle);
+  return position != StringType::npos;
+}
+
+// Shared implementation of IsSubstring() and IsNotSubstring(); which of the
+// two is selected by expected_to_be_substring.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring. In the failure message, string values are introduced
+// with L" instead of " when the character type is wider than one byte.
+template <typename StringType>
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char* needle_expr,
+                                const char* haystack_expr,
+                                const StringType& needle,
+                                const StringType& haystack) {
+  const bool is_substring = IsSubstringPred(needle, haystack);
+  if (is_substring == expected_to_be_substring) {
+    return AssertionSuccess();
+  }
+
+  const char* const open_quote = (sizeof(needle[0]) > 1) ? "L\"" : "\"";
+  const char* const negation = expected_to_be_substring ? "" : "not ";
+  return AssertionFailure()
+         << "Value of: " << needle_expr << "\n"
+         << " Actual: " << open_quote << needle << "\"\n"
+         << "Expected: " << negation << "a substring of " << haystack_expr
+         << "\n"
+         << "Which is: " << open_quote << haystack << "\"";
+}
+
+} // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+// Narrow C string version of IsSubstring().
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const char* needle, const char* haystack) {
+  const bool expect_substring = true;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// Wide C string version of IsSubstring().
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const wchar_t* needle, const wchar_t* haystack) {
+  const bool expect_substring = true;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// Narrow C string version of IsNotSubstring().
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const char* needle,
+                               const char* haystack) {
+  const bool expect_substring = false;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// Wide C string version of IsNotSubstring().
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const wchar_t* needle,
+                               const wchar_t* haystack) {
+  const bool expect_substring = false;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// ::std::string version of IsSubstring().
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::string& needle,
+                            const ::std::string& haystack) {
+  const bool expect_substring = true;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// ::std::string version of IsNotSubstring().
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::string& needle,
+                               const ::std::string& haystack) {
+  const bool expect_substring = false;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// ::std::wstring version of IsSubstring().
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::wstring& needle,
+                            const ::std::wstring& haystack) {
+  const bool expect_substring = true;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+
+// ::std::wstring version of IsNotSubstring().
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::wstring& needle,
+                               const ::std::wstring& haystack) {
+  const bool expect_substring = false;
+  return IsSubstringImpl(expect_substring, needle_expr, haystack_expr, needle,
+                         haystack);
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for the IsHRESULT{Success,Failure} predicates: builds the failure message from the expression text, the expected outcome and the HRESULT value.
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
+ long hr) { // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+ // Windows CE doesn't support FormatMessage, so no system text is reported.
+ const char error_text[] = "";
+
+#else
+
+ // Looks up the human-readable system message for the HRESULT code
+ // and since we're not passing any params to FormatMessage, we don't
+ // want inserts expanded.
+ const DWORD kFlags =
+ FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
+ const DWORD kBufSize = 4096;
+ // Gets the system's human readable message string for this HRESULT.
+ char error_text[kBufSize] = {'\0'};
+ DWORD message_length = ::FormatMessageA(kFlags,
+ 0, // no source, we're asking system
+ static_cast<DWORD>(hr), // the error
+ 0, // no line width restrictions
+ error_text, // output buffer
+ kBufSize, // buf size
+ nullptr); // no arguments for inserts
+ // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+ for (; message_length && IsSpace(error_text[message_length - 1]);
+ --message_length) {
+ error_text[message_length - 1] = '\0';
+ }
+
+#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+ const std::string error_hex("0x" + String::FormatHexInt(hr)); // HRESULT rendered as upper-case hex.
+ return ::testing::AssertionFailure()
+ << "Expected: " << expr << " " << expected << ".\n"
+ << " Actual: " << error_hex << " " << error_text << "\n";
+}
+
+} // namespace
+
+// Succeeds when SUCCEEDED(hr) holds; otherwise returns the failure built by
+// HRESULTFailureHelper() for the expectation "succeeds".
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (!SUCCEEDED(hr)) {
+    return HRESULTFailureHelper(expr, "succeeds", hr);
+  }
+  return AssertionSuccess();
+}
+
+// Succeeds when FAILED(hr) holds; otherwise returns the failure built by
+// HRESULTFailureHelper() for the expectation "fails".
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (!FAILED(hr)) {
+    return HRESULTFailureHelper(expr, "fails", hr);
+  }
+  return AssertionSuccess();
+}
+
+#endif // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length Encoding
+// 0 - 7 bits 0xxxxxxx
+// 8 - 11 bits 110xxxxx 10xxxxxx
+// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx
+// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent
+// (7 payload bits).
+constexpr uint32_t kMaxCodePoint1 = 0x7F;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent
+// (5 + 6 = 11 payload bits).
+constexpr uint32_t kMaxCodePoint2 = 0x7FF;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent
+// (4 + 2 * 6 = 16 payload bits).
+constexpr uint32_t kMaxCodePoint3 = 0xFFFF;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent
+// (3 + 3 * 6 = 21 payload bits).
+constexpr uint32_t kMaxCodePoint4 = 0x1FFFFF;
+
+// Removes the n lowest bits from *bits and returns them.
+// On return, *bits holds the original pattern shifted right by n.
+inline uint32_t ChopLowBits(uint32_t* bits, int n) {
+  const uint32_t low_mask = (static_cast<uint32_t>(1) << n) - 1;
+  const uint32_t chopped = *bits & low_mask;
+  *bits = *bits >> n;
+  return chopped;
+}
+
+// Converts a code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type uint32_t because wchar_t may not be
+// wide enough to contain a code point.
+// Values greater than kMaxCodePoint4 (the largest value a four-byte
+// sequence can hold) are converted to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(uint32_t code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
+  }
+
+  // Pick the sequence length and the marker bits of the leading byte.
+  int length;
+  uint32_t lead_marker;
+  if (code_point <= kMaxCodePoint1) {
+    length = 1;
+    lead_marker = 0x00;  // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    length = 2;
+    lead_marker = 0xC0;  // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    length = 3;
+    lead_marker = 0xE0;  // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    length = 4;
+    lead_marker = 0xF0;  // 11110xxx
+  }
+
+  char utf8[5];  // Big enough for a four-byte sequence plus the terminator.
+  utf8[length] = '\0';
+  // Continuation bytes (10xxxxxx) are filled from the last one backwards,
+  // each taking the next six low bits of the code point.
+  for (int i = length - 1; i > 0; --i) {
+    utf8[i] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));
+  }
+  utf8[0] = static_cast<char>(lead_marker | code_point);
+  return utf8;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
+
+// Tells whether (first, second) form a UTF-16 surrogate pair, i.e. should be
+// combined into a single code point with
+// CreateCodePointFromUtf16SurrogatePair. Always false unless wchar_t is two
+// bytes wide.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  if (sizeof(wchar_t) != 2) {
+    return false;
+  }
+  const bool first_is_high = (first & 0xFC00) == 0xD800;
+  const bool second_is_low = (second & 0xFC00) == 0xDC00;
+  return first_is_high && second_is_low;
+}
+
+// Combines a UTF-16 surrogate pair into one code point: the low ten bits of
+// each half are joined and 0x10000 is added.
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const uint32_t high = static_cast<uint32_t>(first);
+  const uint32_t low = static_cast<uint32_t>(second);
+  if (sizeof(wchar_t) != 2) {
+    // This function should not be called when wchar_t is not two bytes
+    // wide, but a sensible default is returned in case it is.
+    return high;
+  }
+  const uint32_t ten_bit_mask = (1 << 10) - 1;
+  return (((high & ten_bit_mask) << 10) | (low & ten_bit_mask)) + 0x10000;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number of wchar_t
+// characters processed; -1 means the entire string. Conversion also stops
+// at the first L'\0'.
+// Each code point is encoded by CodePointToUtf8(), which renders values it
+// cannot encode as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16
+// encoding and contains invalid surrogate pairs, the values in those pairs
+// are encoded as individual characters from the Basic Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  const int limit =
+      (num_chars == -1) ? static_cast<int>(wcslen(str)) : num_chars;
+
+  ::std::stringstream stream;
+  for (int i = 0; i < limit && str[i] != L'\0'; ++i) {
+    uint32_t code_point;
+    const bool is_pair =
+        i + 1 < limit && IsUtf16SurrogatePair(str[i], str[i + 1]);
+    if (is_pair) {
+      code_point = CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
+      ++i;  // The pair consumed two wchar_t units.
+    } else {
+      code_point = static_cast<uint32_t>(str[i]);
+    }
+    stream << CodePointToUtf8(code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Returns the UTF-8 form of a wide C string, or "(null)" for a null pointer.
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
+  return wide_c_str == nullptr
+             ? std::string("(null)")
+             : internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Returns true if and only if two wide C strings have the same content.
+//
+// Unlike wcscmp(), NULL arguments are accepted: NULL equals only NULL and
+// differs from every non-NULL string, including the empty string.
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
+  if (lhs == nullptr || rhs == nullptr) {
+    // Equal only when both pointers are null.
+    return lhs == rhs;
+  }
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+// Compares with String::WideCStringEquals() and reports a mismatch via
+// EqFailure() using the printed form of both strings.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression, const wchar_t* lhs,
+                               const wchar_t* rhs) {
+  if (!String::WideCStringEquals(lhs, rhs)) {
+    return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                     PrintToString(rhs), false);
+  }
+  return AssertionSuccess();
+}
+
+// Helper function for *_STRNE on wide strings.
+// Fails, showing the printed form of both strings, when
+// String::WideCStringEquals() reports them equal.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression, const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (String::WideCStringEquals(s1, s2)) {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: " << PrintToString(s1) << " vs "
+           << PrintToString(s2);
+  }
+  return AssertionSuccess();
+}
+
+// Returns true if and only if two C strings have the same content, ignoring
+// case (compared with posix::StrCaseCmp).
+//
+// Unlike strcasecmp(), NULL arguments are accepted: NULL equals only NULL
+// and differs from every non-NULL string, including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
+  if (lhs == nullptr || rhs == nullptr) {
+    // Equal only when both pointers are null.
+    return lhs == rhs;
+  }
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Returns true if and only if two wide C strings have the same content,
+// ignoring case.
+//
+// Unlike wcscasecmp(), NULL arguments are accepted: NULL equals only NULL
+// and differs from every non-NULL wide string, including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+// environment variable. On GNU platform this method uses wcscasecmp
+// which compares according to LC_CTYPE category of the current locale.
+// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+// current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == nullptr || rhs == nullptr) {
+    // Equal only when both pointers are null.
+    return lhs == rhs;
+  }
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  // Compare the lower-cased characters pairwise until they differ or both
+  // strings end.
+  for (;;) {
+    const wint_t left = towlower(static_cast<wint_t>(*lhs++));
+    const wint_t right = towlower(static_cast<wint_t>(*rhs++));
+    if (left != right) return false;
+    if (!left) return true;
+  }
+#endif  // OS selector
+}
+
+// Returns true if and only if str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(const std::string& str,
+                                     const std::string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  if (str_len < suffix_len) {
+    return false;
+  }
+  // Compare the last suffix_len characters of str against the suffix.
+  const char* const tail = str.c_str() + str_len - suffix_len;
+  return CaseInsensitiveCStringEquals(tail, suffix.c_str());
+}
+
+// Formats an int value as "%02d" by delegating to FormatIntWidthN().
+std::string String::FormatIntWidth2(int value) {
+  const int width = 2;
+  return FormatIntWidthN(value, width);
+}
+
+// Formats an int value in decimal, padded with leading '0' characters up to
+// the given minimum width.
+std::string String::FormatIntWidthN(int value, int width) {
+  std::stringstream out;
+  out << std::setfill('0');
+  out << std::setw(width);
+  out << value;
+  return out.str();
+}
+
+// Formats a uint32_t value as "%X" (upper-case hexadecimal, no padding).
+std::string String::FormatHexUInt32(uint32_t value) {
+  std::stringstream out;
+  out << std::hex;
+  out << std::uppercase;
+  out << value;
+  return out.str();
+}
+
+// Formats an int value as "%X". The value is converted to uint32_t first and
+// then formatted by FormatHexUInt32().
+std::string String::FormatHexInt(int value) {
+  const uint32_t as_unsigned = static_cast<uint32_t>(value);
+  return FormatHexUInt32(as_unsigned);
+}
+
+// Formats a byte as "%02X" (two upper-case hexadecimal digits).
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream out;
+  out << std::setfill('0') << std::setw(2);
+  out << std::hex << std::uppercase;
+  // Widen first so the byte is printed as a number, not as a character.
+  out << static_cast<unsigned int>(value);
+  return out.str();
+}
+
+// Returns the contents of a stringstream as an std::string in which every
+// NUL byte has been replaced by the two characters "\\0".
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string buffer = ss->str();
+
+  std::string result;
+  // Worst case: every byte is a NUL and doubles in size.
+  result.reserve(2 * buffer.size());
+  for (::std::string::size_type i = 0; i < buffer.size(); ++i) {
+    const char ch = buffer[i];
+    if (ch != '\0') {
+      result += ch;
+    } else {
+      result += "\\0";  // Replaces NUL with "\\0";
+    }
+  }
+
+  return result;
+}
+
+// Combines the Google-Test-generated message with the user-supplied one.
+// If either is empty the other is returned unchanged; otherwise the user
+// message follows the generated one on a new line.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  const std::string user_text = user_msg.GetString();
+  if (user_text.empty()) {
+    return gtest_msg;
+  }
+  return gtest_msg.empty() ? user_text : gtest_msg + "\n" + user_text;
+}
+
+} // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult: the death test count, start timestamp and
+// elapsed time all start at zero.
+TestResult::TestResult()
+    : death_test_count_(0),
+      start_timestamp_(0),
+      elapsed_time_(0) {}
+
+// Destructor. The body is empty; nothing is released explicitly here.
+TestResult::~TestResult() {}
+
+// Returns the i-th test part result among all the results. Valid indices
+// are 0 to total_part_count() - 1; any other index aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  const bool in_range = i >= 0 && i < total_part_count();
+  if (!in_range) {
+    internal::posix::Abort();
+  }
+  return test_part_results_.at(static_cast<size_t>(i));
+}
+
+// Returns the i-th test property. Valid indices are 0 to
+// test_property_count() - 1; any other index aborts the program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  const bool in_range = i >= 0 && i < test_property_count();
+  if (!in_range) {
+    internal::posix::Abort();
+  }
+  return test_properties_.at(static_cast<size_t>(i));
+}
+
+// Discards every recorded test part result.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Appends a copy of test_part_result to the end of the recorded results.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Records test_property for this result. The property is first validated
+// against the attributes reserved for xml_element; a property that fails
+// validation is dropped. If a property with the same key was recorded
+// before, its value is overwritten; otherwise the property is appended.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) return;
+
+  // Held until the function returns, i.e. while the list is searched and
+  // updated.
+  internal::MutexLock lock(&test_properties_mutex_);
+  const std::vector<TestProperty>::iterator existing =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (existing != test_properties_.end()) {
+    existing->SetValue(test_property.value());
+  } else {
+    test_properties_.push_back(test_property);
+  }
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+ "disabled", "errors", "failures", "name",
+ "random_seed", "tests", "time", "timestamp"};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+ "disabled", "errors", "failures", "name",
+ "tests", "time", "timestamp", "skipped"};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+ "classname", "name", "status", "time",
+ "type_param", "value_param", "file", "line"};
+
+// Use a slightly different set for allowed output to ensure existing tests can
+// still RecordProperty("result") or RecordProperty("timestamp").
+static const char* const kReservedOutputTestCaseAttributes[] = {
+ "classname", "name", "status", "time", "type_param",
+ "value_param", "file", "line", "result", "timestamp"};
+
+// Copies a fixed-size array of C strings into a vector of std::string,
+// preserving element order. kSize is deduced from the array argument.
+template <size_t kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  std::vector<std::string> items;
+  items.reserve(kSize);
+  for (size_t i = 0; i < kSize; ++i) {
+    items.push_back(array[i]);
+  }
+  return items;
+}
+
+// Returns the attribute names reserved for the XML element named
+// xml_element, which must be "testsuites", "testsuite" or "testcase".
+// Any other name trips GTEST_CHECK_.
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  }
+  if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  }
+  if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  }
+
+  GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  // This code is unreachable, but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+// Same lookup as GetReservedAttributesForElement(), except that "testcase"
+// maps to the output-specific list kReservedOutputTestCaseAttributes.
+// Any name other than "testsuites", "testsuite" or "testcase" trips
+// GTEST_CHECK_.
+//
+// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
+static std::vector<std::string> GetReservedOutputAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  }
+  if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  }
+  if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedOutputTestCaseAttributes);
+  }
+
+  GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  // This code is unreachable, but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+// Formats a list of words as a single-quoted, human-readable enumeration:
+//
+//   {}              -> ""
+//   {"a"}           -> "'a'"
+//   {"a", "b"}      -> "'a' and 'b'"
+//   {"a", "b", "c"} -> "'a', 'b', and 'c'"
+//
+// Lists of three or more words are comma-separated, with "and" before the
+// last word.
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  const size_t count = words.size();
+  std::string word_list;
+  for (size_t i = 0; i < count; ++i) {
+    if (i > 0) {
+      // Exactly two words are joined by "and" alone, so they only need a
+      // separating space; longer lists use commas.
+      word_list += (count > 2) ? ", " : " ";
+    }
+    // The conjunction goes before the final word, but only when there is
+    // more than one word to join.
+    if (count > 1 && i == count - 1) {
+      word_list += "and ";
+    }
+    word_list += "'" + words[i] + "'";
+  }
+  return word_list;
+}
+
+// Checks property_name against reserved_names. A name found in the list
+// adds a failure that names the offending key and lists the reserved
+// names, and the function returns false. Any other name returns true.
+static bool ValidateTestPropertyName(
+    const std::string& property_name,
+    const std::vector<std::string>& reserved_names) {
+  const bool is_reserved =
+      std::find(reserved_names.begin(), reserved_names.end(),
+                property_name) != reserved_names.end();
+  if (!is_reserved) return true;
+
+  ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                << " (" << FormatWordList(reserved_names)
+                << " are reserved by " << GTEST_NAME_ << ")";
+  return false;
+}
+
+// Returns true if test_property's key is not one of the attributes reserved
+// for the XML element named xml_element. A reserved key adds a failure and
+// yields false.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  const std::vector<std::string> reserved =
+      GetReservedAttributesForElement(xml_element);
+  return ValidateTestPropertyName(test_property.key(), reserved);
+}
+
+// Resets the result: zeroes the death test count and the elapsed time, and
+// empties the recorded test part results and test properties.
+void TestResult::Clear() {
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+  test_properties_.clear();
+  test_part_results_.clear();
+}
+
+// Predicate for CountIf(): returns true if and only if the test part was
+// skipped.
+static bool TestPartSkipped(const TestPartResult& result) {
+  const bool was_skipped = result.skipped();
+  return was_skipped;
+}
+
+// Returns true if and only if the test was skipped, i.e. it did not fail
+// and at least one of its test parts is a skip.
+bool TestResult::Skipped() const {
+  if (Failed()) return false;
+  return CountIf(test_part_results_, TestPartSkipped) > 0;
+}
+
+// Returns true if and only if at least one recorded test part failed.
+bool TestResult::Failed() const {
+  return std::any_of(
+      test_part_results_.begin(), test_part_results_.end(),
+      [](const TestPartResult& part) { return part.failed(); });
+}
+
+// Predicate for CountIf(): true if and only if the test part is a fatal
+// failure.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  const bool is_fatal = result.fatally_failed();
+  return is_fatal;
+}
+
+// Returns true if and only if at least one recorded test part is a fatal
+// failure.
+bool TestResult::HasFatalFailure() const {
+  const int fatal_parts = CountIf(test_part_results_, TestPartFatallyFailed);
+  return fatal_parts > 0;
+}
+
+// Predicate for CountIf(): true if and only if the test part is a
+// non-fatal failure.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  const bool is_nonfatal = result.nonfatally_failed();
+  return is_nonfatal;
+}
+
+// Returns true if and only if at least one recorded test part is a
+// non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  const int nonfatal_parts =
+      CountIf(test_part_results_, TestPartNonfatallyFailed);
+  return nonfatal_parts > 0;
+}
+
+// Returns the number of recorded test parts of every kind, as an int.
+int TestResult::total_part_count() const {
+  const size_t part_count = test_part_results_.size();
+  return static_cast<int>(part_count);
+}
+
+// Returns the number of recorded test properties, as an int.
+int TestResult::test_property_count() const {
+  const size_t property_count = test_properties_.size();
+  return static_cast<int>(property_count);
+}
+
+// class Test
+
+// Creates a Test object.
+//
+// The c'tor saves the states of all flags by allocating a new
+// GTEST_FLAG_SAVER_ and storing it in gtest_flag_saver_.
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
+
+// The d'tor restores the states of all flags. The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here; this body is intentionally empty.
+Test::~Test() {}
+
+// Sets up the test fixture. The default implementation does nothing.
+//
+// A sub-class may override this.
+void Test::SetUp() {}
+
+// Tears down the test fixture. The default implementation does nothing.
+//
+// A sub-class may override this.
+void Test::TearDown() {}
+
+// Records a user-supplied key/value pair for later output by forwarding it
+// to the UnitTest singleton.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->RecordProperty(key, value);
+}
+
+// Records a user-supplied key with an integer value for later output. The
+// value is formatted as text and handed to the string overload.
+void Test::RecordProperty(const std::string& key, int value) {
+  const std::string value_text = (Message() << value).GetString();
+  RecordProperty(key, value_text);
+}
+
+namespace internal {
+
+// Adds a test part result of the given type carrying `message`, with no
+// source location and no stack trace attached.
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // No info about the source file where the exception occurred.
+  const char* const unknown_file = nullptr;
+  // We have no info on which line caused the exception.
+  const int unknown_line = -1;
+
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(result_type, unknown_file,
+                                             unknown_line, message,
+                                             "");  // No stack trace, either.
+}
+
+} // namespace internal
+
+// Google Test requires every test in a test suite to use the same fixture
+// class. This compares the fixture class ID of the current test with that
+// of the first test in the current test suite. Returns true when they
+// match; otherwise adds a failure explaining the mismatch and returns
+// false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+
+  // The first test registered in the current test suite.
+  const TestInfo* const first_info =
+      impl->current_test_suite()->test_info_list()[0];
+  const internal::TypeId first_id = first_info->fixture_class_id_;
+  const char* const first_name = first_info->name();
+
+  // The test that is running now.
+  const TestInfo* const current_info = impl->current_test_info();
+  const internal::TypeId current_id = current_info->fixture_class_id_;
+  const char* const current_name = current_info->name();
+
+  if (current_id == first_id) return true;
+
+  // A fixture ID equal to GetTestTypeId() marks a test defined with TEST.
+  const bool first_uses_TEST = first_id == internal::GetTestTypeId();
+  const bool current_uses_TEST = current_id == internal::GetTestTypeId();
+
+  if (!first_uses_TEST && !current_uses_TEST) {
+    // Two fixture classes with the same name appear in two different
+    // namespaces, which is not allowed. Tell the user how to fix this.
+    ADD_FAILURE()
+        << "All tests in the same test suite must use the same test fixture\n"
+        << "class. However, in test suite "
+        << current_info->test_suite_name() << ",\n"
+        << "you defined test " << first_name << " and test "
+        << current_name << "\n"
+        << "using two different test fixture classes. This can happen if\n"
+        << "the two classes are from different namespaces or translation\n"
+        << "units and have the same name. You should probably rename one\n"
+        << "of the classes to put the tests into different test suites.";
+    return false;
+  }
+
+  // Both TEST and TEST_F appear in the same test suite, which is incorrect.
+  // Work out which test is which; the two flags cannot both be true because
+  // the fixture IDs differ.
+  const char* name_of_TEST = current_name;
+  const char* name_of_TEST_F = first_name;
+  if (first_uses_TEST) {
+    name_of_TEST = first_name;
+    name_of_TEST_F = current_name;
+  }
+
+  ADD_FAILURE()
+      << "All tests in the same test suite must use the same test fixture\n"
+      << "class, so mixing TEST_F and TEST in the same test suite is\n"
+      << "illegal. In test suite " << current_info->test_suite_name()
+      << ",\n"
+      << "test " << name_of_TEST_F << " is defined using TEST_F but\n"
+      << "test " << name_of_TEST << " is defined using TEST. You probably\n"
+      << "want to change the TEST to TEST_F or move it to another test\n"
+      << "case.";
+  return false;
+}
+
+#if GTEST_HAS_SEH
+
+// Builds the "SEH exception ... thrown in <location>." message, with the
+// exception code printed in hexadecimal. The message is returned as a
+// heap-allocated std::string that the caller must delete: VC++ prohibits
+// creation of objects with destructors on the stack in functions using
+// __try (see error C2712), so callers cannot hold it by value.
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message text;
+  text << "SEH exception with code 0x";
+  text << std::setbase(16) << exception_code << std::setbase(10);
+  text << " thrown in " << location << ".";
+
+  return new std::string(text.GetString());
+}
+
+#endif // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Builds the message that describes a C++ exception thrown in `location`.
+// `description` is included in quotes when it is non-null; a null
+// description produces the "Unknown C++ exception" wording instead.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message text;
+  if (description == nullptr) {
+    text << "Unknown C++ exception";
+  } else {
+    text << "C++ exception with description \"" << description << "\"";
+  }
+  text << " thrown in " << location << ".";
+
+  return text.GetString();
+}
+
+// Forward declaration; the definition appears later in this file.
+static std::string PrintTestPartResultToString(
+ const TestPartResult& test_part_result);
+
+// Constructs the exception from a failed test part. The runtime_error
+// message is the printed form of that test part result.
+GoogleTestFailureException::GoogleTestFailureException(
+ const TestPartResult& failure)
+ : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception. (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function. Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+//
+// Arguments:
+//
+// object: the instance on which to invoke the method
+// method: pointer to the zero-argument member function to run
+// location: text naming where the call happens; used only in the failure
+// message, and unused when SEH is not supported
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
+#if GTEST_HAS_SEH
+ __try {
+ return (object->*method)();
+ } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT
+ GetExceptionCode())) {
+ // We create the exception message on the heap because VC++ prohibits
+ // creation of objects with destructors on stack in functions using __try
+ // (see error C2712).
+ std::string* exception_message =
+ FormatSehExceptionMessage(GetExceptionCode(), location);
+ internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+ *exception_message);
+ // The message was allocated with new by FormatSehExceptionMessage().
+ delete exception_message;
+ return static_cast<Result>(0);
+ }
+#else
+ // Without SEH support the method is simply invoked; the cast silences the
+ // unused-parameter warning.
+ (void)location;
+ return (object->*method)();
+#endif // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result when a C++ exception was caught here or an SEH exception was
+// handled by the SEH wrapper. When catch_exceptions() is false the method
+// is invoked directly, with no exception handling at all.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
+ // NOTE: The user code can affect the way in which Google Test handles
+ // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+ // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+ // after the exception is caught and either report or re-throw the
+ // exception based on the flag's value:
+ //
+ // try {
+ // // Perform the test method.
+ // } catch (...) {
+ // if (GTEST_FLAG_GET(catch_exceptions))
+ // // Report the exception as failure.
+ // else
+ // throw; // Re-throws the original exception.
+ // }
+ //
+ // However, the purpose of this flag is to allow the program to drop into
+ // the debugger when the exception is thrown. On most platforms, once the
+ // control enters the catch block, the exception origin information is
+ // lost and the debugger will stop the program at the point of the
+ // re-throw in this function -- instead of at the point of the original
+ // throw statement in the code under test. For this reason, we perform
+ // the check early, sacrificing the ability to affect Google Test's
+ // exception handling in the method where the exception is thrown.
+ if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+ try {
+ return HandleSehExceptionsInMethodIfSupported(object, method, location);
+ } catch (const AssertionException&) { // NOLINT
+ // This failure was reported already.
+ } catch (const internal::GoogleTestFailureException&) { // NOLINT
+ // This exception type can only be thrown by a failed Google
+ // Test assertion with the intention of letting another testing
+ // framework catch it. Therefore we just re-throw it.
+ throw;
+ } catch (const std::exception& e) { // NOLINT
+ // Standard exceptions are reported with their what() text.
+ internal::ReportFailureInUnknownLocation(
+ TestPartResult::kFatalFailure,
+ FormatCxxExceptionMessage(e.what(), location));
+ } catch (...) { // NOLINT
+ // Anything else is reported as an unknown C++ exception.
+ internal::ReportFailureInUnknownLocation(
+ TestPartResult::kFatalFailure,
+ FormatCxxExceptionMessage(nullptr, location));
+ }
+ // Reached only after an exception was caught and not re-thrown.
+ return static_cast<Result>(0);
+#else
+ return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif // GTEST_HAS_EXCEPTIONS
+ } else {
+ return (object->*method)();
+ }
+}
+
+} // namespace internal
+
+// Runs the test and updates the test result.
+//
+// Returns immediately if the fixture-class check fails. Otherwise calls
+// SetUp(), then TestBody() (only if SetUp() neither failed fatally nor
+// skipped), then TearDown(), each through the exception-handling wrapper.
+void Test::Run() {
+ if (!HasSameFixtureClass()) return;
+
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ // UponLeavingGTest() is called on the stack trace getter before each
+ // hand-off to user code below.
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+ // We will run the test only if SetUp() was successful and didn't call
+ // GTEST_SKIP().
+ if (!HasFatalFailure() && !IsSkipped()) {
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+ "the test body");
+ }
+
+ // However, we want to clean up as much as possible. Hence we will
+ // always call TearDown(), even if SetUp() or the test body has
+ // failed.
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+ "TearDown()");
+}
+
+// Returns true if and only if the result of the currently running test
+// contains a fatal failure.
+bool Test::HasFatalFailure() {
+  const TestResult* const current_result =
+      internal::GetUnitTestImpl()->current_test_result();
+  return current_result->HasFatalFailure();
+}
+
+// Returns true if and only if the result of the currently running test
+// contains a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  const TestResult* const current_result =
+      internal::GetUnitTestImpl()->current_test_result();
+  return current_result->HasNonfatalFailure();
+}
+
+// Returns true if and only if the result of the currently running test
+// reports that it was skipped.
+bool Test::IsSkipped() {
+  const TestResult* const current_result =
+      internal::GetUnitTestImpl()->current_test_result();
+  return current_result->Skipped();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object (the destructor deletes it).
+//
+// a_type_param and a_value_param may be null; each is copied into a newly
+// allocated std::string only when non-null. All run-selection flags
+// (should_run_, is_disabled_, matches_filter_, is_in_another_shard_) start
+// out false.
+TestInfo::TestInfo(const std::string& a_test_suite_name,
+ const std::string& a_name, const char* a_type_param,
+ const char* a_value_param,
+ internal::CodeLocation a_code_location,
+ internal::TypeId fixture_class_id,
+ internal::TestFactoryBase* factory)
+ : test_suite_name_(a_test_suite_name),
+ name_(a_name),
+ type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+ value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
+ location_(a_code_location),
+ fixture_class_id_(fixture_class_id),
+ should_run_(false),
+ is_disabled_(false),
+ matches_filter_(false),
+ is_in_another_shard_(false),
+ factory_(factory),
+ result_() {}
+
+// Destroys the TestInfo and deletes the test factory handed to the
+// constructor.
+TestInfo::~TestInfo() {
+  delete factory_;
+}
+
+namespace internal {
+
+// Allocates a TestInfo describing one test, registers it with Google Test
+// and returns it.
+//
+// Arguments:
+//
+//   test_suite_name:  name of the test suite
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_suite_name, const char* name, const char* type_param,
+    const char* value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) {
+  TestInfo* const registered =
+      new TestInfo(test_suite_name, name, type_param, value_param,
+                   code_location, fixture_class_id, factory);
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  impl->AddTestInfo(set_up_tc, tear_down_tc, registered);
+  return registered;
+}
+
+void ReportInvalidTestSuiteType(const char* test_suite_name,
+ CodeLocation code_location) {
+ Message errors;
+ errors
+ << "Attempted redefinition of test suite " << test_suite_name << ".\n"
+ << "All tests in the same test suite must use the same test fixture\n"
+ << "class. However, in test suite " << test_suite_name << ", you tried\n"
+ << "to define a test using a fixture class different from the one\n"
+ << "used earlier. This can happen if the two fixture classes are\n"
+ << "from different namespaces and have the same name. You should\n"
+ << "probably rename one of the classes to put the tests into different\n"
+ << "test suites.";
+
+ GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+ code_location.line)
+ << " " << errors.GetString();
+}
+} // namespace internal
+
+namespace {
+
+// Predicate functor that tells whether a TestInfo carries a particular
+// test name.
+//
+// This is used for implementation of the TestSuite class only. It lives in
+// the anonymous namespace so that it does not pollute the outer namespace.
+//
+// TestNameIs is copyable and has no default constructor.
+class TestNameIs {
+ public:
+  // Remembers a copy of the name to compare against.
+  explicit TestNameIs(const char* name) : expected_name_(name) {}
+
+  // Returns true if and only if test_info is non-null and its test name
+  // equals the stored name.
+  bool operator()(const TestInfo* test_info) const {
+    if (!test_info) return false;
+    return test_info->name() == expected_name_;
+  }
+
+ private:
+  std::string expected_name_;
+};
+
+} // namespace
+
+namespace internal {
+
+// Expands all parameterized tests registered with the TEST_P and
+// INSTANTIATE_TEST_SUITE_P macros into regular tests and registers them.
+// The parameterized_tests_registered_ flag makes every call after the
+// first a no-op.
+void UnitTestImpl::RegisterParameterizedTests() {
+  if (parameterized_tests_registered_) return;
+
+  parameterized_test_registry_.RegisterTests();
+  type_parameterized_test_registry_.CheckForInstantiations();
+  parameterized_tests_registered_ = true;
+}
+
+} // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+//
+// If this test should not run, nothing is executed; a disabled test that
+// matches the filter is still announced through OnTestDisabled().
+void TestInfo::Run() {
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  if (!should_run_) {
+    if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+    return;
+  }
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object. The wrapper returns the zero value (a null
+  // pointer) when it catches an exception thrown during construction.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test if an object was created and the constructor didn't
+  // generate a fatal failure or invoke GTEST_SKIP(). The null check keeps
+  // this dereference consistent with the guarded deletion below, so a null
+  // object can never be dereferenced here.
+  if (test != nullptr && !Test::HasFatalFailure() && !Test::IsSkipped()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  if (test != nullptr) {
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_elapsed_time(timer.Elapsed());
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(nullptr);
+}
+
+// Skips this test and records a skipped test result for it. Does nothing
+// if the test is not selected to run.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  // Direct reported results to this test for the duration of the call.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* const listener =
+      UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  listener->OnTestStart(*this);
+
+  // Report a kSkip part located at this test's definition, with an empty
+  // message.
+  const TestPartResult skip_result(TestPartResult::kSkip, this->file(),
+                                   this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      skip_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  listener->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
+// class TestSuite
+
+// Returns how many tests in this test suite satisfy the TestPassed
+// predicate.
+int TestSuite::successful_test_count() const {
+  const int passed = CountIf(test_info_list_, TestPassed);
+  return passed;
+}
+
+// Returns how many tests in this test suite satisfy the TestSkipped
+// predicate, i.e. the number of skipped tests.
+int TestSuite::skipped_test_count() const {
+  const int skipped = CountIf(test_info_list_, TestSkipped);
+  return skipped;
+}
+
+// Returns how many tests in this test suite satisfy the TestFailed
+// predicate.
+int TestSuite::failed_test_count() const {
+  const int failed = CountIf(test_info_list_, TestFailed);
+  return failed;
+}
+
+// Returns the number of disabled tests in this test suite that will be
+// reported in the XML report.
+int TestSuite::reportable_disabled_test_count() const {
+  const int reportable_disabled =
+      CountIf(test_info_list_, TestReportableDisabled);
+  return reportable_disabled;
+}
+
+// Returns how many tests in this test suite satisfy the TestDisabled
+// predicate.
+int TestSuite::disabled_test_count() const {
+  const int disabled = CountIf(test_info_list_, TestDisabled);
+  return disabled;
+}
+
+// Returns the number of tests in this test suite that will be printed in
+// the XML report.
+int TestSuite::reportable_test_count() const {
+  const int reportable = CountIf(test_info_list_, TestReportable);
+  return reportable;
+}
+
+// Returns the number of tests in this test suite that should run.
+int TestSuite::test_to_run_count() const {
+  const int to_run = CountIf(test_info_list_, ShouldRunTest);
+  return to_run;
+}
+
+// Returns the number of tests registered in this test suite, as an int.
+int TestSuite::total_test_count() const {
+  const size_t registered = test_info_list_.size();
+  return static_cast<int>(registered);
+}
+
+// Creates a TestSuite with the given name.
+//
+// Arguments:
+//
+// a_name: name of the test suite
+// a_type_param: the name of the test suite's type parameter, or NULL if
+// this is not a typed or a type-parameterized test suite.
+// set_up_tc: pointer to the function that sets up the test suite
+// tear_down_tc: pointer to the function that tears down the test suite
+//
+// A non-null a_type_param is copied into a newly allocated std::string.
+// The suite starts out not selected to run, with a zero start timestamp
+// and zero elapsed time.
+TestSuite::TestSuite(const char* a_name, const char* a_type_param,
+ internal::SetUpTestSuiteFunc set_up_tc,
+ internal::TearDownTestSuiteFunc tear_down_tc)
+ : name_(a_name),
+ type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+ set_up_tc_(set_up_tc),
+ tear_down_tc_(tear_down_tc),
+ should_run_(false),
+ start_timestamp_(0),
+ elapsed_time_(0) {}
+
+// Destroys the TestSuite, deleting every TestInfo it holds in
+// test_info_list_.
+TestSuite::~TestSuite() {
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test of this suite, looked up through test_indices_.
+// i can range from 0 to total_test_count() - 1; for any index that yields
+// a negative lookup result (including out-of-range i) NULL is returned.
+const TestInfo* TestSuite::GetTestInfo(int i) const {
+  const int list_position = GetElementOr(test_indices_, i, -1);
+  if (list_position < 0) return nullptr;
+  return test_info_list_[static_cast<size_t>(list_position)];
+}
+
+// Mutable counterpart of GetTestInfo(): returns the i-th test of this
+// suite, looked up through test_indices_. i can range from 0 to
+// total_test_count() - 1; for any index that yields a negative lookup
+// result (including out-of-range i) NULL is returned.
+TestInfo* TestSuite::GetMutableTestInfo(int i) {
+  const int list_position = GetElementOr(test_indices_, i, -1);
+  if (list_position < 0) return nullptr;
+  return test_info_list_[static_cast<size_t>(list_position)];
+}
+
+// Adds a test to this test suite and appends the next sequential index to
+// test_indices_. The TestSuite deletes the test upon its own destruction.
+void TestSuite::AddTestInfo(TestInfo* test_info) {
+  const int next_index = static_cast<int>(test_indices_.size());
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(next_index);
+}
+
+// Runs every test in this TestSuite.
+//
+// Does nothing unless the suite is selected to run. Otherwise it announces
+// the suite to the listeners, runs SetUpTestSuite(), runs (or skips) each
+// test, runs TearDownTestSuite() and announces the end of the suite.
+void TestSuite::Run() {
+ if (!should_run_) return;
+
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ impl->set_current_test_suite(this);
+
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+ // Call both legacy and the new API
+ repeater->OnTestSuiteStart(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ repeater->OnTestCaseStart(*this);
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+
+ // If the ad hoc result already holds a failure at this point (checked
+ // right after SetUpTestSuite()), every test is skipped instead of run.
+ const bool skip_all = ad_hoc_test_result().Failed();
+
+ // Timing starts after suite set-up and stops before suite tear-down.
+ start_timestamp_ = internal::GetTimeInMillis();
+ internal::Timer timer;
+ for (int i = 0; i < total_test_count(); i++) {
+ if (skip_all) {
+ GetMutableTestInfo(i)->Skip();
+ } else {
+ GetMutableTestInfo(i)->Run();
+ }
+ // With the fail_fast flag set, the first failed test causes all
+ // remaining tests to be skipped and ends the loop.
+ if (GTEST_FLAG_GET(fail_fast) &&
+ GetMutableTestInfo(i)->result()->Failed()) {
+ for (int j = i + 1; j < total_test_count(); j++) {
+ GetMutableTestInfo(j)->Skip();
+ }
+ break;
+ }
+ }
+ elapsed_time_ = timer.Elapsed();
+
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
+
+ // Call both legacy and the new API
+ repeater->OnTestSuiteEnd(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ repeater->OnTestCaseEnd(*this);
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ impl->set_current_test_suite(nullptr);
+}
+
+// Skips all tests under this TestSuite.
+//
+// Does nothing unless the suite is selected to run. The suite start/end
+// events are still sent to the listeners; SetUpTestSuite() and
+// TearDownTestSuite() are not invoked here.
+void TestSuite::Skip() {
+ if (!should_run_) return;
+
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ impl->set_current_test_suite(this);
+
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+ // Call both legacy and the new API
+ repeater->OnTestSuiteStart(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ repeater->OnTestCaseStart(*this);
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ // Record a skipped result for each test in the suite.
+ for (int i = 0; i < total_test_count(); i++) {
+ GetMutableTestInfo(i)->Skip();
+ }
+
+ // Call both legacy and the new API
+ repeater->OnTestSuiteEnd(*this);
+ // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ repeater->OnTestCaseEnd(*this);
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ impl->set_current_test_suite(nullptr);
+}
+
+// Clears the result of every test in this test suite, as well as the
+// suite's ad hoc test result.
+void TestSuite::ClearResult() {
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+  ad_hoc_test_result_.Clear();
+}
+
+// Shuffles the order of the tests in this test suite by shuffling
+// test_indices_ with the supplied random generator.
+void TestSuite::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to what it was before the first shuffle by
+// resetting test_indices_ to 0, 1, 2, ...
+void TestSuite::UnshuffleTests() {
+  int position = 0;
+  for (auto& slot : test_indices_) {
+    slot = position;
+    ++position;
+  }
+}
+
+// Formats a count followed by a noun. The singular form is used when the
+// count is exactly 1 and the plural form for every other count. e.g.
+//
+//   FormatCountableNoun(1, "test", "tests") returns "1 test".
+//   FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count, const char* singular_form,
+                                       const char* plural_form) {
+  const char* const noun = (count == 1) ? singular_form : plural_form;
+  return internal::StreamableToString(count) + " " + noun;
+}
+
+// Formats a number of tests, e.g. "1 test" or "3 tests".
+static std::string FormatTestCount(int test_count) {
+  static const char kSingular[] = "test";
+  static const char kPlural[] = "tests";
+  return FormatCountableNoun(test_count, kSingular, kPlural);
+}
+
+// Formats a number of test suites, e.g. "1 test suite" or "3 test suites".
+static std::string FormatTestSuiteCount(int test_suite_count) {
+  static const char kSingular[] = "test suite";
+  static const char kPlural[] = "test suites";
+  return FormatCountableNoun(test_suite_count, kSingular, kPlural);
+}
+
+// Converts a TestPartResult::Type enum to a human-friendly string.
+// kNonFatalFailure and kFatalFailure share one string, as the user usually
+// doesn't care about the difference between the two when viewing the test
+// result: "error: " when compiled with _MSC_VER defined and "Failure\n"
+// otherwise. Values outside the known set map to "Unknown result type".
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
+  if (type == TestPartResult::kSkip) return "Skipped\n";
+  if (type == TestPartResult::kSuccess) return "Success";
+
+  if (type == TestPartResult::kNonFatalFailure ||
+      type == TestPartResult::kFatalFailure) {
+#ifdef _MSC_VER
+    return "error: ";
+#else
+    return "Failure\n";
+#endif
+  }
+
+  return "Unknown result type";
+}
+
+namespace internal {
+namespace {
+// Colors understood by the colored-output helpers in this file. kDefault
+// means "do not change the color".
+enum class GTestColor { kDefault, kRed, kGreen, kYellow };
+} // namespace
+
+// Renders a TestPartResult as "<file location> <type label><message>" and
+// returns it as an std::string.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  Message text;
+  text << internal::FormatFileLocation(test_part_result.file_name(),
+                                       test_part_result.line_number());
+  text << " ";
+  text << TestPartResultTypeToString(test_part_result.type());
+  text << test_part_result.message();
+  return text.GetString();
+}
+
+// Writes a TestPartResult, followed by a newline, to stdout and flushes.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string text = PrintTestPartResultToString(test_part_result);
+  printf("%s\n", text.c_str());
+  fflush(stdout);
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // Mirror the message to the debugger output channel so that, when running
+  // under Visual Studio or another debugger, the user can double-click the
+  // line to jump to the source location. Windows Mobile is excluded because
+  // stdout is already routed through OutputDebugString() there and the
+  // message would appear twice.
+  ::OutputDebugStringA(text.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+
+// Translates a GTestColor into the matching console foreground attribute
+// bits. Colors without a mapping (including kDefault) yield 0.
+static WORD GetColorAttribute(GTestColor color) {
+  if (color == GTestColor::kRed) {
+    return FOREGROUND_RED;
+  }
+  if (color == GTestColor::kGreen) {
+    return FOREGROUND_GREEN;
+  }
+  if (color == GTestColor::kYellow) {
+    // Yellow is produced by combining red and green.
+    return FOREGROUND_RED | FOREGROUND_GREEN;
+  }
+  return 0;
+}
+
+// Returns the position of the lowest set bit in color_mask, or 0 when the
+// mask is empty.
+static int GetBitOffset(WORD color_mask) {
+  int offset = 0;
+  if (color_mask != 0) {
+    // Shift right until the least significant bit is set, counting steps.
+    for (; (color_mask & 1) == 0; color_mask >>= 1) {
+      ++offset;
+    }
+  }
+  return offset;
+}
+
+// Computes the console attributes to use for `color`, keeping the background
+// bits of old_color_attrs. The foreground is made intense unless that would
+// make it identical to the background, in which case the intensity bit is
+// flipped so the text stays visible.
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+
+  // Keep whatever background the console currently has.
+  const WORD kept_background = old_color_attrs & background_mask;
+  WORD result = GetColorAttribute(color) | kept_background |
+                FOREGROUND_INTENSITY;
+
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  // Bring both color fields down to bit 0 so they can be compared directly.
+  const int background_bits = (result & background_mask) >> bg_bitOffset;
+  const int foreground_bits = (result & foreground_mask) >> fg_bitOffset;
+  if (background_bits == foreground_bits) {
+    result ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return result;
+}
+
+#else
+
+// Maps a GTestColor to the digit used in an ANSI color escape sequence.
+// GTestColor::kDefault is not a valid input; it (and any other unmapped
+// value) yields nullptr.
+static const char* GetAnsiColorCode(GTestColor color) {
+  if (color == GTestColor::kRed) {
+    return "1";
+  }
+  if (color == GTestColor::kGreen) {
+    return "2";
+  }
+  if (color == GTestColor::kYellow) {
+    return "3";
+  }
+  return nullptr;
+}
+
+#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Decides whether Google Test output should be colored.
+//
+// The color flag is consulted first. "yes", "true" and "t" (any letter case)
+// and "1" force colors on. "auto" defers to the environment as described
+// below. Every other value is conservatively treated as "no".
+bool ShouldUseColor(bool stdout_is_tty) {
+  const std::string flag_value = GTEST_FLAG_GET(color);
+  const char* const gtest_color = flag_value.c_str();
+
+  if (!String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+    return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+           String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+           String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+           String::CStringEquals(gtest_color, "1");
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  // On Windows the TERM variable is usually not set, but the console there
+  // does support colors, so only the tty check matters.
+  return stdout_is_tty;
+#else
+  // Elsewhere, colors are enabled only for terminals whose TERM value is on
+  // this list.
+  static const char* const kColorTerms[] = {
+      "xterm",         "xterm-color", "xterm-256color",
+      "screen",        "screen-256color",
+      "tmux",          "tmux-256color",
+      "rxvt-unicode",  "rxvt-unicode-256color",
+      "linux",         "cygwin"};
+  const char* const term = posix::GetEnv("TERM");
+  bool term_supports_color = false;
+  for (const char* const candidate : kColorTerms) {
+    if (String::CStringEquals(term, candidate)) {
+      term_supports_color = true;
+      break;
+    }
+  }
+  return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+//
+// ColoredPrintf behaves like printf(fmt, ...) but, when color output is
+// enabled and `color` is not GTestColor::kDefault, wraps the text in the
+// requested color. Whether color output is enabled is decided once, on the
+// first call, via ShouldUseColor().
+
+GTEST_ATTRIBUTE_PRINTF_(2, 3)
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color. The query fails when stdout is not attached
+  // to a console (for example when output is redirected while colors are
+  // forced on). In that case buffer_info is left unset, so print the text
+  // without touching console attributes instead of reading garbage.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  if (!GetConsoleScreenBufferInfo(stdout_handle, &buffer_info)) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+// They are emitted as "<label> = <value>".
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+// Prints ", where TypeParam = ... and GetParam() = ..." for a test, listing
+// only the parameters that are present. Prints nothing when the test has
+// neither a type parameter nor a value parameter.
+static void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+  const bool has_type = type_param != nullptr;
+  const bool has_value = value_param != nullptr;
+
+  if (!has_type && !has_value) {
+    return;
+  }
+
+  printf(", where ");
+  if (has_type) {
+    printf("%s = %s", kTypeParamLabel, type_param);
+    if (has_value) {
+      printf(" and ");
+    }
+  }
+  if (has_value) {
+    printf("%s = %s", kValueParamLabel, value_param);
+  }
+}
+
+// This class implements the TestEventListener interface.
+// It prints a human-readable progress report to stdout via printf and
+// ColoredPrintf.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+ PrettyUnitTestResultPrinter() {}
+ // Prints "<test_suite>.<test>" with no trailing newline.
+ static void PrintTestName(const char* test_suite, const char* test) {
+ printf("%s.%s", test_suite, test);
+ }
+
+ // The following methods override what's in the TestEventListener class.
+ // Events with an empty body produce no output.
+ void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+ void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+ void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseStart(const TestCase& test_case) override;
+#else
+ void OnTestSuiteStart(const TestSuite& test_suite) override;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
+
+ void OnTestPartResult(const TestPartResult& result) override;
+ void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseEnd(const TestCase& test_case) override;
+#else
+ void OnTestSuiteEnd(const TestSuite& test_suite) override;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+ void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+ void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+
+ private:
+ // Summary helpers used by OnTestIterationEnd.
+ static void PrintFailedTests(const UnitTest& unit_test);
+ static void PrintFailedTestSuites(const UnitTest& unit_test);
+ static void PrintSkippedTests(const UnitTest& unit_test);
+};
+
+// Fired before each iteration of tests starts. Prints optional notes about
+// repetition, filtering, sharding and shuffling, followed by the
+// "Running N tests from M test suites." banner.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG_GET(repeat) != 1) {
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+  }
+
+  // Show the filter whenever it is not the match-everything filter, as a
+  // reminder that some tests may be skipped.
+  const std::string filter_flag = GTEST_FLAG_GET(filter);
+  const char* const filter = filter_flag.c_str();
+  const bool filter_is_universal =
+      String::CStringEquals(filter, kUniversalFilter);
+  if (!filter_is_universal) {
+    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
+                  filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    // The stored shard index is incremented by one for display.
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    const int displayed_shard = static_cast<int>(shard_index) + 1;
+    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
+                  displayed_shard, internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG_GET(shuffle)) {
+    ColoredPrintf(GTestColor::kYellow,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  const std::string test_count =
+      FormatTestCount(unit_test.test_to_run_count());
+  const std::string suite_count =
+      FormatTestSuiteCount(unit_test.test_suite_to_run_count());
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("Running %s from %s.\n", test_count.c_str(), suite_count.c_str());
+  fflush(stdout);
+}
+
+// Prints the "Global test environment set-up." line and flushes stdout.
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+ const UnitTest& /*unit_test*/) {
+ ColoredPrintf(GTestColor::kGreen, "[----------] ");
+ printf("Global test environment set-up.\n");
+ fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+// Prints "<N tests> from <suite>" when a test case starts, appending the
+// type parameter when the test case has one.
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string counts = FormatTestCount(test_case.test_to_run_count());
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() != nullptr) {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+#else
+// Prints "<N tests> from <suite>" when a test suite starts, appending the
+// type parameter when the test suite has one.
+void PrettyUnitTestResultPrinter::OnTestSuiteStart(
+    const TestSuite& test_suite) {
+  const std::string counts = FormatTestCount(test_suite.test_to_run_count());
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_suite.name());
+  if (test_suite.type_param() != nullptr) {
+    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Announces that a test is about to run: a green tag followed by
+// "<suite>.<test>" on its own line.
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kGreen, "[ RUN ] ");
+  printf("%s.%s", test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Reports a disabled test: a yellow tag followed by "<suite>.<test>" on its
+// own line.
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
+  printf("%s.%s", test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called for each test part result. Successful parts are silent; every other
+// kind of result is printed in full (e.g. "expected this and got that").
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  if (result.type() == TestPartResult::kSuccess) {
+    return;
+  }
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+// Prints the outcome line for a finished test: a colored status tag, the
+// test name, the type/value parameters for failed tests, and the elapsed
+// time when the print_time flag is set.
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  const auto& outcome = *test_info.result();
+
+  if (outcome.Passed()) {
+    ColoredPrintf(GTestColor::kGreen, "[ OK ] ");
+  } else if (outcome.Skipped()) {
+    ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] ");
+  } else {
+    ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
+  }
+
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  if (outcome.Failed()) {
+    PrintFullTestCommentIfPresent(test_info);
+  }
+
+  if (!GTEST_FLAG_GET(print_time)) {
+    printf("\n");
+  } else {
+    const std::string elapsed =
+        internal::StreamableToString(outcome.elapsed_time());
+    printf(" (%s ms)\n", elapsed.c_str());
+  }
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+// Prints the per-test-case timing summary. Silent unless the print_time flag
+// is set.
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (GTEST_FLAG_GET(print_time)) {
+    const std::string counts = FormatTestCount(test_case.test_to_run_count());
+    const std::string elapsed =
+        internal::StreamableToString(test_case.elapsed_time());
+    ColoredPrintf(GTestColor::kGreen, "[----------] ");
+    printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
+           elapsed.c_str());
+    fflush(stdout);
+  }
+}
+#else
+// Prints the per-test-suite timing summary. Silent unless the print_time
+// flag is set.
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
+  if (GTEST_FLAG_GET(print_time)) {
+    const std::string counts = FormatTestCount(test_suite.test_to_run_count());
+    const std::string elapsed =
+        internal::StreamableToString(test_suite.elapsed_time());
+    ColoredPrintf(GTestColor::kGreen, "[----------] ");
+    printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
+           elapsed.c_str());
+    fflush(stdout);
+  }
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Prints the "Global test environment tear-down" line and flushes stdout.
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+ const UnitTest& /*unit_test*/) {
+ ColoredPrintf(GTestColor::kGreen, "[----------] ");
+ printf("Global test environment tear-down\n");
+ fflush(stdout);
+}
+
+// Internal helper that lists every failed test. A header line with the
+// failure count comes first, then one line per failed test (with its
+// type/value parameters when present), then an "N FAILED TEST(S)" footer.
+// Suites and tests that were not selected to run are ignored.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+
+  const int suite_total = unit_test.total_test_suite_count();
+  for (int suite_index = 0; suite_index < suite_total; ++suite_index) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(suite_index);
+    if (test_suite.should_run() && test_suite.failed_test_count() != 0) {
+      const int test_total = test_suite.total_test_count();
+      for (int test_index = 0; test_index < test_total; ++test_index) {
+        const TestInfo& test_info = *test_suite.GetTestInfo(test_index);
+        if (test_info.should_run() && test_info.result()->Failed()) {
+          ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
+          printf("%s.%s", test_suite.name(), test_info.name());
+          PrintFullTestCommentIfPresent(test_info);
+          printf("\n");
+        }
+      }
+    }
+  }
+
+  const char* const noun = (failed_test_count == 1) ? "TEST" : "TESTS";
+  printf("\n%2d FAILED %s\n", failed_test_count, noun);
+}
+
+// Internal helper that reports suite-level failures not covered by
+// PrintFailedTests: a suite that was run and whose ad hoc test result failed
+// gets one line, and a footer with the number of such suites follows when
+// there is at least one.
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest& unit_test) {
+  int suite_failure_count = 0;
+  const int suite_total = unit_test.total_test_suite_count();
+  for (int index = 0; index < suite_total; ++index) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(index);
+    if (test_suite.should_run() && test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+
+  if (suite_failure_count <= 0) {
+    return;
+  }
+  const char* const noun = (suite_failure_count == 1) ? "SUITE" : "SUITES";
+  printf("\n%2d FAILED TEST %s\n", suite_failure_count, noun);
+}
+
+// Internal helper that prints one line per skipped test. Does nothing when
+// no test was skipped; suites and tests that were not selected to run are
+// ignored.
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
+  if (unit_test.skipped_test_count() == 0) {
+    return;
+  }
+
+  const int suite_total = unit_test.total_test_suite_count();
+  for (int suite_index = 0; suite_index < suite_total; ++suite_index) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(suite_index);
+    if (test_suite.should_run() && test_suite.skipped_test_count() != 0) {
+      const int test_total = test_suite.total_test_count();
+      for (int test_index = 0; test_index < test_total; ++test_index) {
+        const TestInfo& test_info = *test_suite.GetTestInfo(test_index);
+        if (test_info.should_run() && test_info.result()->Skipped()) {
+          ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] ");
+          printf("%s.%s", test_suite.name(), test_info.name());
+          printf("\n");
+        }
+      }
+    }
+  }
+}
+
+// Prints the end-of-iteration summary: how many tests ran (with total time
+// when print_time is set), how many passed, the list of skipped tests, the
+// lists of failed tests and failed suites when the run did not pass, and a
+// reminder about disabled tests.
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  const std::string ran_tests = FormatTestCount(unit_test.test_to_run_count());
+  const std::string ran_suites =
+      FormatTestSuiteCount(unit_test.test_suite_to_run_count());
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.", ran_tests.c_str(), ran_suites.c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    const std::string elapsed =
+        internal::StreamableToString(unit_test.elapsed_time());
+    printf(" (%s ms total)", elapsed.c_str());
+  }
+  printf("\n");
+
+  ColoredPrintf(GTestColor::kGreen, "[ PASSED ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] ");
+    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
+    PrintSkippedTests(unit_test);
+  }
+
+  if (!unit_test.Passed()) {
+    PrintFailedTests(unit_test);
+    PrintFailedTestSuites(unit_test);
+  }
+
+  const int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled != 0 && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    const char* const noun = (num_disabled == 1) ? "TEST" : "TESTS";
+    ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, noun);
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker
+  // output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// This class implements the TestEventListener interface.
+// Only OnTestPartResult, OnTestEnd and OnTestIterationEnd are given
+// out-of-line bodies; every other event handler is an empty override and
+// therefore prints nothing.
+//
+// Class BriefUnitTestResultPrinter is copyable.
+class BriefUnitTestResultPrinter : public TestEventListener {
+ public:
+ BriefUnitTestResultPrinter() {}
+ // Prints "<test_suite>.<test>" with no trailing newline.
+ static void PrintTestName(const char* test_suite, const char* test) {
+ printf("%s.%s", test_suite, test);
+ }
+
+ // The following methods override what's in the TestEventListener class.
+ void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationStart(const UnitTest& /*unit_test*/,
+ int /*iteration*/) override {}
+ void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+ void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#else
+ void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
+
+ void OnTestPartResult(const TestPartResult& result) override;
+ void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#else
+ void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+ void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+ void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+ void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// Called for each test part result. Successful parts are silent; every other
+// kind of result is printed in full (e.g. "expected this and got that").
+void BriefUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  if (result.type() == TestPartResult::kSuccess) {
+    return;
+  }
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+// Prints an outcome line only for failed tests: the red status tag, the test
+// name, its type/value parameters, and the elapsed time when the print_time
+// flag is set. Tests that did not fail produce no output.
+void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (!test_info.result()->Failed()) {
+    return;
+  }
+
+  ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  PrintFullTestCommentIfPresent(test_info);
+
+  if (!GTEST_FLAG_GET(print_time)) {
+    printf("\n");
+  } else {
+    const std::string elapsed =
+        internal::StreamableToString(test_info.result()->elapsed_time());
+    printf(" (%s ms)\n", elapsed.c_str());
+  }
+  fflush(stdout);
+}
+
+// Prints the end-of-iteration summary: how many tests ran (with total time
+// when print_time is set), how many passed, how many were skipped, and a
+// reminder about disabled tests.
+void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                    int /*iteration*/) {
+  const std::string ran_tests = FormatTestCount(unit_test.test_to_run_count());
+  const std::string ran_suites =
+      FormatTestSuiteCount(unit_test.test_suite_to_run_count());
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.", ran_tests.c_str(), ran_suites.c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    const std::string elapsed =
+        internal::StreamableToString(unit_test.elapsed_time());
+    printf(" (%s ms total)", elapsed.c_str());
+  }
+  printf("\n");
+
+  ColoredPrintf(GTestColor::kGreen, "[ PASSED ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] ");
+    printf("%s.\n", FormatTestCount(skipped_test_count).c_str());
+  }
+
+  const int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled != 0 && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    const char* const noun = (num_disabled == 1) ? "TEST" : "TESTS";
+    ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, noun);
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker
+  // output.
+  fflush(stdout);
+}
+
+// End BriefUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+// Copying is disabled (copy constructor and copy assignment are deleted).
+class TestEventRepeater : public TestEventListener {
+ public:
+ // Forwarding starts out enabled.
+ TestEventRepeater() : forwarding_enabled_(true) {}
+ ~TestEventRepeater() override;
+ // Adds `listener` to the end of the forwarding list.
+ void Append(TestEventListener* listener);
+ // Removes `listener` from the forwarding list and returns it, or returns
+ // nullptr when it is not in the list.
+ TestEventListener* Release(TestEventListener* listener);
+
+ // Controls whether events will be forwarded to listeners_. Set to false
+ // in death test child processes.
+ bool forwarding_enabled() const { return forwarding_enabled_; }
+ void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+ // TestEventListener overrides; each one relays the event to listeners_.
+ void OnTestProgramStart(const UnitTest& unit_test) override;
+ void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+ void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+ void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseStart(const TestSuite& parameter) override;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestSuiteStart(const TestSuite& parameter) override;
+ void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
+ void OnTestPartResult(const TestPartResult& result) override;
+ void OnTestEnd(const TestInfo& test_info) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestCaseEnd(const TestCase& parameter) override;
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+ void OnTestSuiteEnd(const TestSuite& parameter) override;
+ void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+ void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override;
+ void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+ void OnTestProgramEnd(const UnitTest& unit_test) override;
+
+ private:
+ // Controls whether events will be forwarded to listeners_. Set to false
+ // in death test child processes.
+ bool forwarding_enabled_;
+ // The list of listeners that receive events.
+ std::vector<TestEventListener*> listeners_;
+
+ TestEventRepeater(const TestEventRepeater&) = delete;
+ TestEventRepeater& operator=(const TestEventRepeater&) = delete;
+};
+
+// Applies Delete<TestEventListener> to every listener still in the list.
+// NOTE(review): this implies the repeater owns appended listeners unless
+// they are handed back through Release() - confirm against Delete<>.
+TestEventRepeater::~TestEventRepeater() {
+ ForEach(listeners_, Delete<TestEventListener>);
+}
+
+// Adds `listener` to the end of listeners_. No check for duplicates or
+// nullptr is performed here.
+void TestEventRepeater::Append(TestEventListener* listener) {
+ listeners_.push_back(listener);
+}
+
+// Removes the first occurrence of `listener` from listeners_ and returns it.
+// Returns nullptr when `listener` is not in the list.
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
+  std::vector<TestEventListener*>::iterator it = listeners_.begin();
+  while (it != listeners_.end()) {
+    if (*it == listener) {
+      listeners_.erase(it);
+      return listener;
+    }
+    ++it;
+  }
+  return nullptr;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners, in the
+// order in which they appear in listeners_. Nothing is forwarded while
+// forwarding_enabled_ is false.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+ void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = 0; i < listeners_.size(); i++) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+ }
+// This defines a member that forwards the call to all listeners in reverse
+// order (last listener in listeners_ first). Nothing is forwarded while
+// forwarding_enabled_ is false.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+ void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = listeners_.size(); i != 0; i--) { \
+ listeners_[i - 1]->Name(parameter); \
+ } \
+ } \
+ }
+
+// Events forwarded in list order.
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+// Events forwarded in reverse list order.
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+// The helper macros are only needed for the definitions above.
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+// Relays OnTestIterationStart to every listener in list order. Does nothing
+// while forwarding is disabled.
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (!forwarding_enabled_) {
+    return;
+  }
+  // The size is re-read on every pass and elements are accessed by index, so
+  // the loop stays well-defined if the list changes during a callback.
+  size_t position = 0;
+  while (position < listeners_.size()) {
+    listeners_[position]->OnTestIterationStart(unit_test, iteration);
+    position++;
+  }
+}
+
+// Relays OnTestIterationEnd to every listener in reverse list order (the
+// listener appended last is notified first). Does nothing while forwarding
+// is disabled.
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (!forwarding_enabled_) {
+    return;
+  }
+  size_t remaining = listeners_.size();
+  while (remaining > 0) {
+    listeners_[remaining - 1]->OnTestIterationEnd(unit_test, iteration);
+    remaining--;
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+// Copying is disabled (copy constructor and copy assignment are deleted).
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+ // `output_file` is the path of the XML file to write; it is stored in
+ // output_file_.
+ explicit XmlUnitTestResultPrinter(const char* output_file);
+
+ // Writes the XML report produced by PrintXmlUnitTest to output_file_.
+ void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+ // Writes the XML test list produced by PrintXmlTestsList to output_file_.
+ void ListTestsMatchingFilter(const std::vector<TestSuite*>& test_suites);
+
+ // Prints an XML summary of all unit tests.
+ static void PrintXmlTestsList(std::ostream* stream,
+ const std::vector<TestSuite*>& test_suites);
+
+ private:
+ // Is c a whitespace character that is normalized to a space character
+ // when it appears in an XML attribute value?
+ // True for tab, line feed and carriage return.
+ static bool IsNormalizableWhitespace(unsigned char c) {
+ return c == '\t' || c == '\n' || c == '\r';
+ }
+
+ // May c appear in a well-formed XML document?
+ // https://www.w3.org/TR/REC-xml/#charsets
+ // Accepts the normalizable whitespace characters and every byte >= 0x20.
+ static bool IsValidXmlCharacter(unsigned char c) {
+ return IsNormalizableWhitespace(c) || c >= 0x20;
+ }
+
+ // Returns an XML-escaped copy of the input string str. If
+ // is_attribute is true, the text is meant to appear as an attribute
+ // value, and normalizable whitespace is preserved by replacing it
+ // with character references.
+ static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+ // Returns the given string with all characters invalid in XML removed.
+ static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+ // Convenience wrapper around EscapeXml when str is an attribute value.
+ static std::string EscapeXmlAttribute(const std::string& str) {
+ return EscapeXml(str, true);
+ }
+
+ // Convenience wrapper around EscapeXml when str is not an attribute value.
+ // str is converted to std::string for the call, so it must not be null.
+ static std::string EscapeXmlText(const char* str) {
+ return EscapeXml(str, false);
+ }
+
+ // Verifies that the given attribute belongs to the given element and
+ // streams the attribute as XML.
+ static void OutputXmlAttribute(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value);
+
+ // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+ static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+ // Streams a test suite XML stanza containing the given test result.
+ //
+ // Requires: result.Failed()
+ static void OutputXmlTestSuiteForTestResult(::std::ostream* stream,
+ const TestResult& result);
+
+ // Streams an XML representation of a TestResult object.
+ static void OutputXmlTestResult(::std::ostream* stream,
+ const TestResult& result);
+
+ // Streams an XML representation of a TestInfo object.
+ static void OutputXmlTestInfo(::std::ostream* stream,
+ const char* test_suite_name,
+ const TestInfo& test_info);
+
+ // Prints an XML representation of a TestSuite object
+ static void PrintXmlTestSuite(::std::ostream* stream,
+ const TestSuite& test_suite);
+
+ // Prints an XML summary of unit_test to output stream out.
+ static void PrintXmlUnitTest(::std::ostream* stream,
+ const UnitTest& unit_test);
+
+ // Produces a string representing the test properties in a result as space
+ // delimited XML attributes based on the property key="value" pairs.
+ // When the std::string is not empty, it includes a space at the beginning,
+ // to delimit this attribute from prior attributes.
+ static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+ // Streams an XML representation of the test properties of a TestResult
+ // object.
+ static void OutputXmlTestProperties(std::ostream* stream,
+ const TestResult& result);
+
+ // The output file.
+ const std::string output_file_;
+
+ XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+ XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
+};
+
+// Creates a new XmlUnitTestResultPrinter that writes to `output_file`.
+//
+// A null `output_file` is mapped to the empty string before the member is
+// constructed: building a std::string from a null pointer is undefined
+// behavior, which previously made the check below unreachable for the very
+// case its message describes. Both a null and an empty path now reach the
+// FATAL log.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file != nullptr ? output_file : "") {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
+  }
+}
+
+// Called after the unit test ends. Opens the output file, renders the XML
+// report for unit_test into memory, writes it out in one piece and closes
+// the file.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* const report_file = OpenFileForWriting(output_file_);
+  std::stringstream buffer;
+  PrintXmlUnitTest(&buffer, unit_test);
+  const std::string xml = StringStreamToString(&buffer);
+  fprintf(report_file, "%s", xml.c_str());
+  fclose(report_file);
+}
+
+// Opens the output file, renders the XML list of the given test suites into
+// memory, writes it out in one piece and closes the file.
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestSuite*>& test_suites) {
+  FILE* const report_file = OpenFileForWriting(output_file_);
+  std::stringstream buffer;
+  PrintXmlTestsList(&buffer, test_suites);
+  const std::string xml = StringStreamToString(&buffer);
+  fprintf(report_file, "%s", xml.c_str());
+  fclose(report_file);
+}
+
+// Produces a copy of str that is safe to embed in XML output.
+//
+// '<', '>' and '&' are always replaced by entities. When is_attribute is
+// true the text is destined for an attribute value, so quote characters are
+// replaced by entities as well and normalizable whitespace is written as a
+// character reference so that it is preserved.
+//
+// Characters rejected by IsValidXmlCharacter() are dropped from the output.
+// It is expected that most, if not all, of the text processed by this module
+// will consist of ordinary English text. If this module is ever modified to
+// produce version 1.1 XML output, most invalid characters can be retained
+// using character references.
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+                                                bool is_attribute) {
+  Message escaped;
+
+  for (const char ch : str) {
+    if (ch == '<') {
+      escaped << "&lt;";
+    } else if (ch == '>') {
+      escaped << "&gt;";
+    } else if (ch == '&') {
+      escaped << "&amp;";
+    } else if (ch == '\'') {
+      // Quotes only need escaping inside attribute values.
+      if (is_attribute) {
+        escaped << "&apos;";
+      } else {
+        escaped << ch;
+      }
+    } else if (ch == '"') {
+      if (is_attribute) {
+        escaped << "&quot;";
+      } else {
+        escaped << ch;
+      }
+    } else {
+      const unsigned char byte = static_cast<unsigned char>(ch);
+      if (!IsValidXmlCharacter(byte)) {
+        continue;  // Invalid characters are stripped.
+      }
+      if (is_attribute && IsNormalizableWhitespace(byte)) {
+        escaped << "&#x" << String::FormatByte(byte) << ";";
+      } else {
+        escaped << ch;
+      }
+    }
+  }
+
+  return escaped.GetString();
+}
+
+// Returns a copy of str containing only the characters accepted by
+// IsValidXmlCharacter(). Rejected characters are dropped; an alternative
+// would be to replace them with a placeholder such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string filtered;
+  filtered.reserve(str.size());
+  for (const char ch : str) {
+    if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+      filtered.push_back(ch);
+    }
+  }
+  return filtered;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests"> <-- corresponds to a UnitTest object
+// <testsuite name="testcase-name"> <-- corresponds to a TestSuite object
+// <testcase name="test-name"> <-- corresponds to a TestInfo object
+// <failure message="...">...</failure>
+// <failure message="...">...</failure>
+// <failure message="...">...</failure>
+// <-- individual assertion failures
+// </testcase>
+// </testsuite>
+// </testsuites>
+
+// Renders a duration given in milliseconds as a number of seconds, using the
+// string stream's default floating-point formatting.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  const double seconds = static_cast<double>(ms) * 1e-3;
+  ::std::stringstream formatted;
+  formatted << seconds;
+  return formatted.str();
+}
+
+// Converts seconds to broken-down local time stored in *out, using whichever
+// conversion routine the toolchain offers. Returns true on success and false
+// if the conversion failed.
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  // This localtime_s takes (tm*, time_t*) and signals success with 0.
+  return 0 == localtime_s(out, &seconds);
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  const struct tm* const converted = localtime(&seconds);  // NOLINT
+  if (converted != nullptr) {
+    *out = *converted;
+    return true;
+  }
+  return false;
+#elif defined(__STDC_LIB_EXT1__)
+  // Uses localtime_s when available as localtime_r is only available from
+  // C23 standard. This variant takes (time_t*, tm*) and returns a pointer.
+  return nullptr != localtime_s(&seconds, out);
+#else
+  return nullptr != localtime_r(&seconds, out);
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string of the form
+// YYYY-MM-DDThh:mm:ss.sss (ISO 8601 without timezone information). The fields
+// come from PortableLocaltime(); an empty string is returned if the
+// conversion fails.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm local;
+  const time_t epoch_seconds = static_cast<time_t>(ms / 1000);
+  if (!PortableLocaltime(epoch_seconds, &local)) {
+    return "";
+  }
+
+  // YYYY-MM-DD
+  const std::string date = StreamableToString(local.tm_year + 1900) + "-" +
+                           String::FormatIntWidth2(local.tm_mon + 1) + "-" +
+                           String::FormatIntWidth2(local.tm_mday);
+  // hh:mm:ss.sss
+  const std::string time_of_day =
+      String::FormatIntWidth2(local.tm_hour) + ":" +
+      String::FormatIntWidth2(local.tm_min) + ":" +
+      String::FormatIntWidth2(local.tm_sec) + "." +
+      String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+
+  return date + "T" + time_of_day;
+}
+
+// Streams data wrapped in an XML CDATA section. Every "]]>" found inside data
+// is written as "]]>]]&gt;<![CDATA[": the current section is closed, the '>'
+// is emitted as an entity, and a new section is opened for the remainder.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* cursor = data;
+  *stream << "<![CDATA[";
+
+  const char* terminator = strstr(cursor, "]]>");
+  while (terminator != nullptr) {
+    // Copy the text that precedes the terminator, then split the section.
+    stream->write(cursor, static_cast<std::streamsize>(terminator - cursor));
+    *stream << "]]>]]&gt;<![CDATA[";
+    cursor = terminator + strlen("]]>");
+    terminator = strstr(cursor, "]]>");
+  }
+
+  // No terminator is left in the remaining text.
+  *stream << cursor;
+  *stream << "]]>";
+}
+
+// Streams ` name="value"` with value escaped for use in an XML attribute.
+// GTEST_CHECK_ enforces that name is one of the names returned by
+// GetReservedOutputAttributesForElement(element_name).
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  // The checked expression is left untouched because the macro receives it
+  // as written.
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  const std::string escaped_value = EscapeXmlAttribute(value);
+  std::ostream& out = *stream;
+  out << " " << name << "=\"";
+  out << escaped_value;
+  out << "\"";
+}
+
+// Streams a synthetic <testsuite> named "NonTestSuiteFailure" that contains a
+// single unnamed <testcase> carrying the parts of result.
+void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  const std::string kSuite = "testsuite";
+  const std::string kCase = "testcase";
+
+  // Both elements report the same elapsed time and start timestamp.
+  const std::string elapsed =
+      FormatTimeInMillisAsSeconds(result.elapsed_time());
+  const std::string started =
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp());
+
+  // Boilerplate for a minimal test suite with one failing test.
+  *stream << " <testsuite";
+  OutputXmlAttribute(stream, kSuite, "name", "NonTestSuiteFailure");
+  OutputXmlAttribute(stream, kSuite, "tests", "1");
+  OutputXmlAttribute(stream, kSuite, "failures", "1");
+  OutputXmlAttribute(stream, kSuite, "disabled", "0");
+  OutputXmlAttribute(stream, kSuite, "skipped", "0");
+  OutputXmlAttribute(stream, kSuite, "errors", "0");
+  OutputXmlAttribute(stream, kSuite, "time", elapsed);
+  OutputXmlAttribute(stream, kSuite, "timestamp", started);
+  *stream << ">";
+
+  // Boilerplate for the single test case; its start tag is left open for
+  // OutputXmlTestResult() to finish.
+  *stream << " <testcase";
+  OutputXmlAttribute(stream, kCase, "name", "");
+  OutputXmlAttribute(stream, kCase, "status", "run");
+  OutputXmlAttribute(stream, kCase, "result", "completed");
+  OutputXmlAttribute(stream, kCase, "classname", "");
+  OutputXmlAttribute(stream, kCase, "time", elapsed);
+  OutputXmlAttribute(stream, kCase, "timestamp", started);
+
+  // The actual test result.
+  OutputXmlTestResult(stream, result);
+
+  // Close the test suite.
+  *stream << " </testsuite>\n";
+}
+
+// Streams the <testcase> element for test_info. Nothing is written for tests
+// that belong to another shard. When the list_tests flag is set only the
+// identifying attributes are written and the element is self-closed;
+// otherwise the run attributes follow and OutputXmlTestResult() completes the
+// element.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_suite_name,
+                                                 const TestInfo& test_info) {
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
+
+  const std::string kTestcase = "testcase";
+  const TestResult& result = *test_info.result();
+
+  *stream << " <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  // The parameter attributes are only present for parameterized tests.
+  const char* const value_param = test_info.value_param();
+  if (value_param != nullptr) {
+    OutputXmlAttribute(stream, kTestcase, "value_param", value_param);
+  }
+  const char* const type_param = test_info.type_param();
+  if (type_param != nullptr) {
+    OutputXmlAttribute(stream, kTestcase, "type_param", type_param);
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "file", test_info.file());
+  OutputXmlAttribute(stream, kTestcase, "line",
+                     StreamableToString(test_info.line()));
+  if (GTEST_FLAG_GET(list_tests)) {
+    *stream << " />\n";
+    return;
+  }
+
+  const char* status = "notrun";
+  const char* outcome = "suppressed";
+  if (test_info.should_run()) {
+    status = "run";
+    outcome = result.Skipped() ? "skipped" : "completed";
+  }
+  OutputXmlAttribute(stream, kTestcase, "status", status);
+  OutputXmlAttribute(stream, kTestcase, "result", outcome);
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestcase, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_suite_name);
+
+  OutputXmlTestResult(stream, result);
+}
+
+// Finishes a <testcase> element whose start tag is still open on stream.
+//
+// Every failed part becomes a <failure> child and every skipped part a
+// <skipped> child; other parts are ignored. If there are no such parts and no
+// test properties the element is self-closed, otherwise the properties are
+// written and an explicit end tag follows.
+void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
+                                                   const TestResult& result) {
+  // Becomes true once the ">" ending the start tag has been written, which
+  // happens right before the first <failure> or <skipped> child.
+  bool start_tag_closed = false;
+
+  for (int index = 0; index < result.total_part_count(); ++index) {
+    const TestPartResult& part = result.GetTestPartResult(index);
+    const bool is_failure = part.failed();
+    if (!is_failure && !part.skipped()) {
+      continue;
+    }
+
+    if (!start_tag_closed) {
+      *stream << ">\n";
+      start_tag_closed = true;
+    }
+
+    const std::string location =
+        internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                        part.line_number());
+    const std::string summary = location + "\n" + part.summary();
+    const std::string detail = location + "\n" + part.message();
+
+    if (is_failure) {
+      *stream << " <failure message=\"" << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    } else {
+      *stream << " <skipped message=\""
+              << EscapeXmlAttribute(summary.c_str()) << "\">";
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</skipped>\n";
+    }
+  }
+
+  if (!start_tag_closed) {
+    if (result.test_property_count() == 0) {
+      // Nothing to nest inside the element.
+      *stream << " />\n";
+      return;
+    }
+    *stream << ">\n";
+  }
+
+  OutputXmlTestProperties(stream, result);
+  *stream << " </testcase>\n";
+}
+
+// Streams the <testsuite> element for test_suite together with one
+// <testcase> per reportable test. The result counters, timing attributes and
+// ad hoc test properties are omitted when the list_tests flag is set.
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
+                                                 const TestSuite& test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const bool listing_only = GTEST_FLAG_GET(list_tests);
+
+  *stream << " <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_suite.reportable_test_count()));
+
+  if (!listing_only) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_suite.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "skipped",
+                       StreamableToString(test_suite.skipped_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp()));
+    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+
+  const int test_count = test_suite.total_test_count();
+  for (int index = 0; index < test_count; ++index) {
+    const TestInfo* const info = test_suite.GetTestInfo(index);
+    if (!info->is_reportable()) {
+      continue;
+    }
+    OutputXmlTestInfo(stream, test_suite.name(), *info);
+  }
+
+  *stream << " </" << kTestsuite << ">\n";
+}
+
+// Streams the complete XML document for unit_test: the XML declaration, the
+// root <testsuites> element with its summary attributes, one <testsuite> per
+// suite that has reportable tests, and, if the ad hoc test result failed, an
+// extra synthetic suite describing that failure.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const TestResult& ad_hoc_result = unit_test.ad_hoc_test_result();
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+
+  // The seed is only reported when the shuffle flag is set.
+  if (GTEST_FLAG_GET(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(ad_hoc_result);
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  const int suite_count = unit_test.total_test_suite_count();
+  for (int index = 0; index < suite_count; ++index) {
+    const TestSuite* const suite = unit_test.GetTestSuite(index);
+    if (suite->reportable_test_count() > 0) {
+      PrintXmlTestSuite(stream, *suite);
+    }
+  }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (ad_hoc_result.Failed()) {
+    OutputXmlTestSuiteForTestResult(stream, ad_hoc_result);
+  }
+
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Streams an XML document listing test_suites: a root <testsuites> element
+// whose "tests" attribute is the sum of total_test_count() over all suites,
+// followed by one <testsuite> element per suite.
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int test_total = 0;
+  for (size_t index = 0; index < test_suites.size(); ++index) {
+    test_total += test_suites[index]->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(test_total));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (size_t index = 0; index < test_suites.size(); ++index) {
+    PrintXmlTestSuite(stream, *test_suites[index]);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Builds a string of XML attributes, one ` key="value"` pair per test
+// property recorded in result. Each pair starts with a space so the string
+// can be appended directly after earlier attributes; values are escaped for
+// attribute use, keys are written as they are.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message rendered;
+  const int property_count = result.test_property_count();
+  for (int index = 0; index < property_count; ++index) {
+    const TestProperty& property = result.GetTestProperty(index);
+    rendered << " " << property.key() << "=\""
+             << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return rendered.GetString();
+}
+
+// Streams a <properties> element holding one <property name=".." value=".."/>
+// child per test property of result. Writes nothing when result has no
+// properties.
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream* stream, const TestResult& result) {
+  const int property_count = result.test_property_count();
+  if (property_count <= 0) {
+    return;
+  }
+
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  *stream << " <" << kProperties << ">\n";
+  for (int index = 0; index < property_count; ++index) {
+    const TestProperty& property = result.GetTestProperty(index);
+    *stream << " <" << kProperty
+            << " name=\"" << EscapeXmlAttribute(property.key()) << "\""
+            << " value=\"" << EscapeXmlAttribute(property.value()) << "\""
+            << "/>\n";
+  }
+  *stream << " </" << kProperties << ">\n";
+}
+
+// End XmlUnitTestResultPrinter
+
+// This class generates a JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+ explicit JsonUnitTestResultPrinter(const char* output_file);  // output_file: path the report is written to.
+
+ void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;  // Writes the JSON report for unit_test.
+
+ // Prints a JSON summary of the given test suites.
+ static void PrintJsonTestList(::std::ostream* stream,
+ const std::vector<TestSuite*>& test_suites);
+
+ private:
+ // Returns a JSON-escaped copy of the input string str.
+ static std::string EscapeJson(const std::string& str);
+
+ // Verifies that the given attribute belongs to the given element and
+ // streams the attribute as JSON. String values are quoted and escaped.
+ static void OutputJsonKey(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name, const std::string& value,
+ const std::string& indent, bool comma = true);  // comma: append ",\n" after the value.
+ static void OutputJsonKey(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name, int value,
+ const std::string& indent, bool comma = true);  // Integer values are written unquoted.
+
+ // Streams a test suite JSON stanza containing the given test result.
+ //
+ // Requires: result.Failed()
+ static void OutputJsonTestSuiteForTestResult(::std::ostream* stream,
+ const TestResult& result);
+
+ // Streams a JSON representation of a TestResult object.
+ static void OutputJsonTestResult(::std::ostream* stream,
+ const TestResult& result);
+
+ // Streams a JSON representation of a TestInfo object.
+ static void OutputJsonTestInfo(::std::ostream* stream,
+ const char* test_suite_name,
+ const TestInfo& test_info);
+
+ // Prints a JSON representation of a TestSuite object.
+ static void PrintJsonTestSuite(::std::ostream* stream,
+ const TestSuite& test_suite);
+
+ // Prints a JSON summary of unit_test to output stream out.
+ static void PrintJsonUnitTest(::std::ostream* stream,
+ const UnitTest& unit_test);
+
+ // Produces a string representing the test properties in a result as
+ // JSON key/value members, each preceded by ",\n" and the given indent.
+ static std::string TestPropertiesAsJson(const TestResult& result,
+ const std::string& indent);
+
+ // The output file.
+ const std::string output_file_;
+
+ JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+ JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+ delete;
+};
+
+// Creates a new JsonUnitTestResultPrinter that writes its report to
+// output_file.
+//
+// A null output_file is normalized to an empty path before output_file_ is
+// initialized: constructing a std::string from a null pointer is undefined
+// behavior, so without this the FATAL check below could never report the
+// problem it is meant to catch.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file != nullptr ? output_file : "") {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+// Renders the JSON report for unit_test in memory and writes it to
+// output_file_.
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                   int /*iteration*/) {
+  // Open the destination first; the report is then rendered into a buffer and
+  // written with a single call.
+  FILE* const report_file = OpenFileForWriting(output_file_);
+
+  std::stringstream buffer;
+  PrintJsonUnitTest(&buffer, unit_test);
+  const std::string report = StringStreamToString(&buffer);
+
+  fprintf(report_file, "%s", report.c_str());
+  fclose(report_file);
+}
+
+// Returns a JSON-escaped copy of the input string str.
+//
+// Backslash, double quote and forward slash are prefixed with a backslash;
+// backspace, tab, newline, form feed and carriage return use their short
+// escapes; every other byte below 0x20 becomes a \u00XX escape. All remaining
+// bytes, including bytes >= 0x80 such as those of UTF-8 multi-byte sequences,
+// are copied unchanged.
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/':
+        m << '\\' << ch;
+        break;
+      case '\b':
+        m << "\\b";
+        break;
+      case '\t':
+        m << "\\t";
+        break;
+      case '\n':
+        m << "\\n";
+        break;
+      case '\f':
+        m << "\\f";
+        break;
+      case '\r':
+        m << "\\r";
+        break;
+      default:
+        // Compare as unsigned: where plain char is signed, bytes >= 0x80 are
+        // negative and would otherwise be mistaken for control characters and
+        // rewritten as \u00XX, corrupting non-ASCII text.
+        if (static_cast<unsigned char>(ch) < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate a JSON representation of a UnitTest
+// object.
+
+// Renders a duration given in milliseconds as a number of seconds followed by
+// the suffix "s", using the string stream's default floating-point
+// formatting.
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  const double seconds = static_cast<double>(ms) * 1e-3;
+  ::std::stringstream formatted;
+  formatted << seconds;
+  formatted << "s";
+  return formatted.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string of the form
+// YYYY-MM-DDThh:mm:ssZ. The fields come from PortableLocaltime(), the
+// sub-second part of ms is dropped, and an empty string is returned if the
+// conversion fails.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm local;
+  const time_t epoch_seconds = static_cast<time_t>(ms / 1000);
+  if (!PortableLocaltime(epoch_seconds, &local)) {
+    return "";
+  }
+
+  // YYYY-MM-DD
+  const std::string date = StreamableToString(local.tm_year + 1900) + "-" +
+                           String::FormatIntWidth2(local.tm_mon + 1) + "-" +
+                           String::FormatIntWidth2(local.tm_mday);
+  // hh:mm:ss
+  const std::string time_of_day = String::FormatIntWidth2(local.tm_hour) +
+                                  ":" +
+                                  String::FormatIntWidth2(local.tm_min) + ":" +
+                                  String::FormatIntWidth2(local.tm_sec);
+
+  return date + "T" + time_of_day + "Z";
+}
+
+// Returns a string made of width space characters.
+static inline std::string Indent(size_t width) {
+  std::string spaces;
+  spaces.assign(width, ' ');
+  return spaces;
+}
+
+// Streams `"name": "value"` preceded by indent, with value JSON-escaped, and
+// appends ",\n" when comma is true. GTEST_CHECK_ enforces that name is one of
+// the names returned by GetReservedOutputAttributesForElement(element_name).
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+                                              const std::string& element_name,
+                                              const std::string& name,
+                                              const std::string& value,
+                                              const std::string& indent,
+                                              bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  // The checked expression is left untouched because the macro receives it
+  // as written.
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  const std::string escaped_value = EscapeJson(value);
+  std::ostream& out = *stream;
+  out << indent << "\"" << name << "\": \"" << escaped_value << "\"";
+  if (comma) {
+    out << ",\n";
+  }
+}
+
+// Integer overload: streams `"name": value` preceded by indent, with the
+// number written unquoted, and appends ",\n" when comma is true. GTEST_CHECK_
+// enforces that name is one of the names returned by
+// GetReservedOutputAttributesForElement(element_name).
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, int value, const std::string& indent, bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  // The checked expression is left untouched because the macro receives it
+  // as written.
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  const std::string number = StreamableToString(value);
+  std::ostream& out = *stream;
+  out << indent << "\"" << name << "\": " << number;
+  if (comma) {
+    out << ",\n";
+  }
+}
+
+// Streams a synthetic test suite object named "NonTestSuiteFailure" whose
+// "testsuite" array holds a single unnamed test case carrying the properties
+// and failures of result. The suite-level counters and timing keys are
+// omitted when the list_tests flag is set.
+void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  const std::string kSuite = "testsuite";
+  const std::string kCase = "testcase";
+  const std::string suite_indent = Indent(6);
+  const std::string case_indent = Indent(10);
+
+  // Boilerplate for a new test suite.
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kSuite, "name", "NonTestSuiteFailure", suite_indent);
+  OutputJsonKey(stream, kSuite, "tests", 1, suite_indent);
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputJsonKey(stream, kSuite, "failures", 1, suite_indent);
+    OutputJsonKey(stream, kSuite, "disabled", 0, suite_indent);
+    OutputJsonKey(stream, kSuite, "skipped", 0, suite_indent);
+    OutputJsonKey(stream, kSuite, "errors", 0, suite_indent);
+    OutputJsonKey(stream, kSuite, "time",
+                  FormatTimeInMillisAsDuration(result.elapsed_time()),
+                  suite_indent);
+    OutputJsonKey(stream, kSuite, "timestamp",
+                  FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                  suite_indent);
+  }
+  *stream << suite_indent << "\"testsuite\": [\n";
+
+  // Boilerplate for a new test case.
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kCase, "name", "", case_indent);
+  OutputJsonKey(stream, kCase, "status", "RUN", case_indent);
+  OutputJsonKey(stream, kCase, "result", "COMPLETED", case_indent);
+  OutputJsonKey(stream, kCase, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                case_indent);
+  OutputJsonKey(stream, kCase, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                case_indent);
+  // The last fixed key carries no comma: the property and failure writers
+  // that follow begin their own output with ",\n".
+  OutputJsonKey(stream, kCase, "classname", "", case_indent, false);
+  *stream << TestPropertiesAsJson(result, case_indent);
+
+  // The actual test result; this also closes the test case object.
+  OutputJsonTestResult(stream, result);
+
+  // Finish the test suite.
+  *stream << "\n" << suite_indent << "]\n" << Indent(4) << "}";
+}
+
+// Streams the JSON object for test_info. When the list_tests flag is set only
+// the identifying keys are written and the object is closed here; otherwise
+// the run keys and test properties follow and OutputJsonTestResult() closes
+// the object.
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
+                                                   const char* test_suite_name,
+                                                   const TestInfo& test_info) {
+  const std::string kTestcase = "testcase";
+  const std::string key_indent = Indent(10);
+  const TestResult& result = *test_info.result();
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestcase, "name", test_info.name(), key_indent);
+
+  // The parameter keys are only present for parameterized tests.
+  const char* const value_param = test_info.value_param();
+  if (value_param != nullptr) {
+    OutputJsonKey(stream, kTestcase, "value_param", value_param, key_indent);
+  }
+  const char* const type_param = test_info.type_param();
+  if (type_param != nullptr) {
+    OutputJsonKey(stream, kTestcase, "type_param", type_param, key_indent);
+  }
+
+  OutputJsonKey(stream, kTestcase, "file", test_info.file(), key_indent);
+  // "line" is written without a comma because it is the last key when only
+  // listing tests.
+  OutputJsonKey(stream, kTestcase, "line", test_info.line(), key_indent,
+                false);
+  if (GTEST_FLAG_GET(list_tests)) {
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  }
+  *stream << ",\n";
+
+  const char* status = "NOTRUN";
+  const char* outcome = "SUPPRESSED";
+  if (test_info.should_run()) {
+    status = "RUN";
+    outcome = result.Skipped() ? "SKIPPED" : "COMPLETED";
+  }
+  OutputJsonKey(stream, kTestcase, "status", status, key_indent);
+  OutputJsonKey(stream, kTestcase, "result", outcome, key_indent);
+  OutputJsonKey(stream, kTestcase, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                key_indent);
+  OutputJsonKey(stream, kTestcase, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                key_indent);
+  OutputJsonKey(stream, kTestcase, "classname", test_suite_name, key_indent,
+                false);
+  *stream << TestPropertiesAsJson(result, key_indent);
+
+  OutputJsonTestResult(stream, result);
+}
+
+// Finishes a test case object that is still open on stream. Each failed part
+// of result becomes an entry of a "failures" array; the array is only written
+// when there is at least one failure. The closing brace of the test case
+// object is always written.
+void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
+                                                     const TestResult& result) {
+  const std::string kIndent = Indent(10);
+
+  // Becomes true once the opening of the "failures" array has been written.
+  bool array_open = false;
+  for (int index = 0; index < result.total_part_count(); ++index) {
+    const TestPartResult& part = result.GetTestPartResult(index);
+    if (!part.failed()) {
+      continue;
+    }
+
+    // Separates this entry from the preceding key or array entry.
+    *stream << ",\n";
+    if (!array_open) {
+      *stream << kIndent << "\"failures\": [\n";
+      array_open = true;
+    }
+
+    const std::string location =
+        internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                        part.line_number());
+    const std::string message = EscapeJson(location + "\n" + part.message());
+    *stream << kIndent << " {\n";
+    *stream << kIndent << " \"failure\": \"" << message << "\",\n";
+    *stream << kIndent << " \"type\": \"\"\n";
+    *stream << kIndent << " }";
+  }
+
+  if (array_open) {
+    *stream << "\n" << kIndent << "]";
+  }
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Streams the JSON object for test_suite, including a "testsuite" array with
+// one entry per reportable test. The result counters, timing keys and ad hoc
+// test properties are omitted when the list_tests flag is set.
+void JsonUnitTestResultPrinter::PrintJsonTestSuite(
+    std::ostream* stream, const TestSuite& test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const std::string key_indent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), key_indent);
+  OutputJsonKey(stream, kTestsuite, "tests",
+                test_suite.reportable_test_count(), key_indent);
+
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures",
+                  test_suite.failed_test_count(), key_indent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_suite.reportable_disabled_test_count(), key_indent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, key_indent);
+    OutputJsonKey(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()),
+        key_indent);
+    // No comma after "time": the properties string starts each entry with
+    // ",\n", and the separator before the array is written explicitly.
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
+                  key_indent, false);
+    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(),
+                                    key_indent)
+            << ",\n";
+  }
+
+  *stream << key_indent << "\"" << kTestsuite << "\": [\n";
+
+  bool first_entry = true;
+  const int test_count = test_suite.total_test_count();
+  for (int index = 0; index < test_count; ++index) {
+    const TestInfo* const info = test_suite.GetTestInfo(index);
+    if (!info->is_reportable()) {
+      continue;
+    }
+    if (!first_entry) {
+      *stream << ",\n";
+    }
+    first_entry = false;
+    OutputJsonTestInfo(stream, test_suite.name(), *info);
+  }
+
+  *stream << "\n" << key_indent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+//
+// The document is a single object holding the summary keys, the ad hoc test
+// properties, the name "AllTests" and a "testsuites" array. The array has one
+// entry per test suite with reportable tests and, if the ad hoc test result
+// failed, a final synthetic entry describing that failure.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
+                                                  const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG_GET(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  // No comma after "time": the properties string starts each entry with
+  // ",\n", and the separator before "name" is written explicitly below.
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  // True once at least one array entry has been written, i.e. the next entry
+  // must be preceded by a separator.
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
+    }
+  }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    // The synthetic suite is one more array entry, so it needs the same
+    // separator as the regular suites; without it the array is invalid JSON.
+    if (comma) {
+      *stream << ",\n";
+    }
+    OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+// Streams a JSON document listing test_suites: an object whose "tests" key is
+// the sum of total_test_count() over all suites, followed by the name
+// "AllTests" and a "testsuites" array with one entry per suite.
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
+  const std::string kTestsuites = "testsuites";
+  const std::string key_indent = Indent(2);
+
+  *stream << "{\n";
+
+  int test_total = 0;
+  for (size_t index = 0; index < test_suites.size(); ++index) {
+    test_total += test_suites[index]->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", test_total, key_indent);
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", key_indent);
+
+  *stream << key_indent << "\"" << kTestsuites << "\": [\n";
+
+  bool first_entry = true;
+  for (TestSuite* const suite : test_suites) {
+    if (!first_entry) {
+      *stream << ",\n";
+    }
+    first_entry = false;
+    PrintJsonTestSuite(stream, *suite);
+  }
+
+  *stream << "\n"
+          << key_indent << "]\n"
+          << "}\n";
+}
+// Produces a string representing the test properties in a result as JSON
+// object members. Every member is written as `,\n<indent>"key": "value"`, so
+// the string can be appended directly after a preceding member that was
+// written without a trailing comma. The result is empty when there are no
+// properties.
+//
+// Both key and value are JSON-escaped: keys are caller-supplied text too, and
+// an unescaped quote or backslash in a key would produce invalid JSON.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult& result, const std::string& indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << ",\n"
+               << indent << "\"" << EscapeJson(property.key()) << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Returns a copy of str in which every '=', '&', '%' and '\n' character is
+// replaced by "%xx", where xx is the value produced by String::FormatByte()
+// for that character. For example, "=" becomes "%3D". This algorithm is
+// O(strlen(str)) in both time and space -- important as the input str may
+// contain an arbitrarily long test failure message and stack trace.
+std::string StreamingListener::UrlEncode(const char* str) {
+  std::string encoded;
+  encoded.reserve(strlen(str) + 1);
+
+  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
+    const char ch = *cursor;
+    const bool needs_escape =
+        ch == '%' || ch == '=' || ch == '&' || ch == '\n';
+    if (needs_escape) {
+      encoded.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+    } else {
+      encoded.push_back(ch);
+    }
+  }
+  return encoded;
+}
+
+// Resolves host_name_:port_num_ and connects a stream socket to the first
+// address that accepts the connection, storing the descriptor in sockfd_.
+// Must only be called while sockfd_ == -1. Lookup and connection failures are
+// logged as warnings and leave sockfd_ at -1.
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = nullptr;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  // freeaddrinfo() is only specified for lists returned by getaddrinfo().
+  // After a failed lookup servinfo may still be null, and some C libraries
+  // dereference the argument unconditionally, so skip the call in that case.
+  if (servinfo != nullptr) {
+    freeaddrinfo(servinfo);  // all done with this structure
+  }
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+// class OsStackTraceGetter
+
+// Text line that CurrentStackTrace() appends in place of the remaining
+// frames when it stops at the recorded caller frame.
+const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
+ "... " GTEST_NAME_ " internal frames ...";
+
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ // Returns the current stack trace as text, one line per frame, limited to
+ // max_depth frames and omitting the innermost skip_count frames. When
+ // GTEST_HAS_ABSL is not set the result is always the empty string.
+#if GTEST_HAS_ABSL
+ std::string result;
+
+ if (max_depth <= 0) {
+ return result;
+ }
+
+ // Clamp the request to the library-wide upper bound.
+ max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+ std::vector<void*> raw_stack(max_depth);
+ // Skips the frames requested by the caller, plus this function.
+ const int raw_stack_size =
+ absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+ // Copy caller_frame_ while holding the mutex; the loop below only uses the
+ // local copy, so the lock is released before symbolization starts.
+ void* caller_frame = nullptr;
+ {
+ MutexLock lock(&mutex_);
+ caller_frame = caller_frame_;
+ }
+
+ for (int i = 0; i < raw_stack_size; ++i) {
+ if (raw_stack[i] == caller_frame &&
+ !GTEST_FLAG_GET(show_internal_stack_frames)) {
+ // Add a marker to the trace and stop adding frames.
+ absl::StrAppend(&result, kElidedFramesMarker, "\n");
+ break;
+ }
+
+ // "(unknown)" is printed for frames that cannot be symbolized.
+ char tmp[1024];
+ const char* symbol = "(unknown)";
+ if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+ symbol = tmp;
+ }
+
+ // snprintf bounds the write to the buffer, so an overlong symbol is cut off.
+ char line[1024];
+ snprintf(line, sizeof(line), " %p: %s\n", raw_stack[i], symbol);
+ result += line;
+ }
+
+ return result;
+
+#else // !GTEST_HAS_ABSL
+ // No stack-walking backend: silence unused-parameter warnings.
+ static_cast<void>(max_depth);
+ static_cast<void>(skip_count);
+ return "";
+#endif // GTEST_HAS_ABSL
+}
+
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+ // Records one stack frame address in caller_frame_; CurrentStackTrace()
+ // compares against it to decide where to stop printing frames. Does
+ // nothing when GTEST_HAS_ABSL is not set.
+#if GTEST_HAS_ABSL
+ void* caller_frame = nullptr;
+ // Capture a single frame after skipping 3.
+ // NOTE(review): presumably the skip count is chosen to land on the first
+ // Google Test frame above user code -- confirm against the call sites.
+ if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+ caller_frame = nullptr;
+ }
+
+ // Publish the value under the mutex that CurrentStackTrace() reads it with.
+ MutexLock lock(&mutex_);
+ caller_frame_ = caller_frame;
+#endif // GTEST_HAS_ABSL
+}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  // Creates the premature-exit file at the given path. A null or empty path
+  // means "no file": nothing is created here and nothing is removed later.
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
+    // If a path to the premature-exit file is specified...
+    if (!premature_exit_filepath_.empty()) {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
+      // The open can fail (e.g. missing directory or no permission).
+      // Passing a null stream to fwrite()/fclose() is undefined behavior,
+      // so honor the "errors are ignored" contract by skipping the write.
+      if (pfile != nullptr) {
+        fwrite("0", 1, 1, pfile);
+        fclose(pfile);
+      }
+    }
+  }
+
+  // Removes the file again; a failed removal is logged, not fatal.
+  ~ScopedPrematureExitFile() {
+#if !defined GTEST_OS_ESP8266
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
+    }
+#endif
+  }
+
+ private:
+  // Path of the marker file; empty when no file is managed.
+  const std::string premature_exit_filepath_;
+
+  // Not copyable: two owners would both try to remove the same file.
+  ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+  ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
+};
+
+} // namespace internal
+
+// class TestEventListeners
+
+// Allocates the repeater; no default result printer or XML generator is
+// installed at construction time.
+TestEventListeners::TestEventListeners()
+ : repeater_(new internal::TestEventRepeater()),
+ default_result_printer_(nullptr),
+ default_xml_generator_(nullptr) {}
+
+TestEventListeners::~TestEventListeners() {
+  // The repeater was allocated in the constructor and is owned by this object.
+  delete repeater_;
+}
+
+// Appends an event listener to the list by handing it to the repeater.
+// NOTE(review): the comment previously found here described
+// default_result_printer(), not Append(). Presumably the list owns the
+// listener until it is handed back through Release() -- confirm against
+// the class declaration.
+void TestEventListeners::Append(TestEventListener* listener) {
+ repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it. It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  // Stop tracking the listener if it is one of the two remembered defaults.
+  if (listener == default_result_printer_) {
+    default_result_printer_ = nullptr;
+  } else if (listener == default_xml_generator_) {
+    default_xml_generator_ = nullptr;
+  }
+  TestEventListener* const released = repeater_->Release(listener);
+  return released;
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() {
+  // Exposed through the listener interface rather than the concrete type.
+  TestEventListener* const broadcaster = repeater_;
+  return broadcaster;
+}
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  // Re-installing the current printer is a no-op.
+  if (default_result_printer_ == listener) return;
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  delete Release(default_result_printer_);
+  default_result_printer_ = listener;
+  if (listener != nullptr) {
+    Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener. The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  // Re-installing the current generator is a no-op.
+  if (default_xml_generator_ == listener) return;
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  delete Release(default_xml_generator_);
+  default_xml_generator_ = listener;
+  if (listener != nullptr) {
+    Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  // The flag itself lives on the repeater.
+  const bool forwarding = repeater_->forwarding_enabled();
+  return forwarding;
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  // Switches the repeater's forwarding flag off.
+  const bool forward_events = false;
+  repeater_->set_forwarding_enabled(forward_events);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object. The first time this method is
+// called, a UnitTest object is constructed and returned. Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+ // CodeGear C++Builder insists on a public destructor for the
+ // default implementation. Use this implementation to keep good OO
+ // design with private destructor.
+
+#if defined(__BORLANDC__)
+ // Heap-allocated on the first call and never deleted.
+ static UnitTest* const instance = new UnitTest;
+ return instance;
+#else
+ // Function-local static: constructed on the first call; every call returns
+ // the address of the same object.
+ static UnitTest instance;
+ return &instance;
+#endif // defined(__BORLANDC__)
+}
+
+// Gets the number of successful test suites.
+int UnitTest::successful_test_suite_count() const {
+  const int count = impl()->successful_test_suite_count();
+  return count;
+}
+
+// Gets the number of failed test suites.
+int UnitTest::failed_test_suite_count() const {
+  const int count = impl()->failed_test_suite_count();
+  return count;
+}
+
+// Gets the number of all test suites.
+int UnitTest::total_test_suite_count() const {
+  const int count = impl()->total_test_suite_count();
+  return count;
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTest::test_suite_to_run_count() const {
+  const int count = impl()->test_suite_to_run_count();
+  return count;
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+int UnitTest::successful_test_case_count() const {
+  // Legacy "test case" name; reports the test-suite counter.
+  const int count = impl()->successful_test_suite_count();
+  return count;
+}
+int UnitTest::failed_test_case_count() const {
+  // Legacy "test case" name; reports the test-suite counter.
+  const int count = impl()->failed_test_suite_count();
+  return count;
+}
+int UnitTest::total_test_case_count() const {
+  // Legacy "test case" name; reports the test-suite counter.
+  const int count = impl()->total_test_suite_count();
+  return count;
+}
+int UnitTest::test_case_to_run_count() const {
+  // Legacy "test case" name; reports the test-suite counter.
+  const int count = impl()->test_suite_to_run_count();
+  return count;
+}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  const int count = impl()->successful_test_count();
+  return count;
+}
+
+// Gets the number of skipped tests.
+int UnitTest::skipped_test_count() const {
+  const int count = impl()->skipped_test_count();
+  return count;
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const {
+  const int count = impl()->failed_test_count();
+  return count;
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  const int count = impl()->reportable_disabled_test_count();
+  return count;
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  const int count = impl()->disabled_test_count();
+  return count;
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  const int count = impl()->reportable_test_count();
+  return count;
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const {
+  const int count = impl()->total_test_count();
+  return count;
+}
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const {
+  const int count = impl()->test_to_run_count();
+  return count;
+}
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  const internal::TimeInMillis started_at = impl()->start_timestamp();
+  return started_at;
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  const internal::TimeInMillis elapsed = impl()->elapsed_time();
+  return elapsed;
+}
+
+// Returns true if and only if the unit test passed (i.e. all test suites
+// passed).
+bool UnitTest::Passed() const {
+  const bool passed = impl()->Passed();
+  return passed;
+}
+
+// Returns true if and only if the unit test failed (i.e. some test suite
+// failed or something outside of all tests failed).
+bool UnitTest::Failed() const {
+  const bool failed = impl()->Failed();
+  return failed;
+}
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+const TestSuite* UnitTest::GetTestSuite(int i) const {
+  const TestSuite* const suite = impl()->GetTestSuite(i);
+  return suite;
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase* UnitTest::GetTestCase(int i) const {
+  const TestCase* const test_case = impl()->GetTestCase(i);
+  return test_case;
+}
+#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test suites.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  // The implementation hands out a pointer; callers of this API get a
+  // reference to the same object.
+  const TestResult* const result = impl()->ad_hoc_test_result();
+  return *result;
+}
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+TestSuite* UnitTest::GetMutableTestSuite(int i) {
+  TestSuite* const suite = impl()->GetMutableSuiteCase(i);
+  return suite;
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  TestEventListeners* const event_listeners = impl()->listeners();
+  return *event_listeners;
+}
+
+// Registers and returns a global test environment. When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered. After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  // A null environment is not registered; the (null) argument is returned
+  // unchanged in that case.
+  if (env != nullptr) {
+    impl_->environments().push_back(env);
+  }
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object. All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results. The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name, int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ // Builds the final message (user message + active traces + optional OS
+ // stack trace), reports it through the current thread's result reporter,
+ // then applies the break_on_failure / throw_on_failure flags.
+ Message msg;
+ msg << message;
+
+ // Everything from here to the end of the function runs with mutex_ held.
+ internal::MutexLock lock(&mutex_);
+ if (impl_->gtest_trace_stack().size() > 0) {
+ msg << "\n" << GTEST_NAME_ << " trace:";
+
+ // Walk the trace stack from the most recently pushed entry to the oldest.
+ for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
+ const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+ msg << "\n"
+ << internal::FormatFileLocation(trace.file, trace.line) << " "
+ << trace.message;
+ }
+ }
+
+ // std::string::c_str() never returns null, so the effective condition is
+ // that the stack trace is non-empty.
+ if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
+ msg << internal::kStackTraceMarker << os_stack_trace;
+ }
+
+ const TestPartResult result = TestPartResult(
+ result_type, file_name, line_number, msg.GetString().c_str());
+ impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+ result);
+
+ // Success and skip results never trigger the break/throw handling below.
+ if (result_type != TestPartResult::kSuccess &&
+ result_type != TestPartResult::kSkip) {
+ // gtest_break_on_failure takes precedence over
+ // gtest_throw_on_failure. This allows a user to set the latter
+ // in the code (perhaps in order to use Google Test assertions
+ // with another testing framework) and specify the former on the
+ // command line for debugging.
+ if (GTEST_FLAG_GET(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ // Using DebugBreak on Windows allows gtest to still break into a debugger
+ // when a failure happens and both the --gtest_break_on_failure and
+ // the --gtest_catch_exceptions flags are specified.
+ DebugBreak();
+#elif (!defined(__native_client__)) && \
+ ((defined(__clang__) || defined(__GNUC__)) && \
+ (defined(__x86_64__) || defined(__i386__)))
+ // with clang/gcc we can achieve the same effect on x86 by invoking int3
+ asm("int3");
+#else
+ // Dereference nullptr through a volatile pointer to prevent the compiler
+ // from removing. We use this rather than abort() or __builtin_trap() for
+ // portability: some debuggers don't correctly trap abort().
+ *static_cast<volatile int*>(nullptr) = 1;
+#endif // GTEST_OS_WINDOWS
+ } else if (GTEST_FLAG_GET(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+ // The exception carries the TestPartResult that was just reported.
+ throw internal::GoogleTestFailureException(result);
+#else
+ // We cannot call abort() as it generates a pop-up in debug mode
+ // that cannot be suppressed in VC 7.1 or below.
+ exit(1);
+#endif
+ }
+ }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+// from SetUpTestSuite or TearDownTestSuite, or to the global property set
+// when invoked elsewhere. If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  // Package the pair and let the implementation pick the result to attach it to.
+  const TestProperty property(key, value);
+  impl_->RecordProperty(property);
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+ // A non-empty internal_run_death_test flag marks this process as the child
+ // of a death test.
+ const bool in_death_test_child_process =
+ GTEST_FLAG_GET(internal_run_death_test).length() > 0;
+
+ // Google Test implements this protocol for catching that a test
+ // program exits before returning control to Google Test:
+ //
+ // 1. Upon start, Google Test creates a file whose absolute path
+ // is specified by the environment variable
+ // TEST_PREMATURE_EXIT_FILE.
+ // 2. When Google Test has finished its work, it deletes the file.
+ //
+ // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+ // running a Google-Test-based test program and check the existence
+ // of the file at the end of the test execution to see if it has
+ // exited prematurely.
+
+ // If we are in the child process of a death test, don't
+ // create/delete the premature exit file, as doing so is unnecessary
+ // and will confuse the parent process. Otherwise, create/delete
+ // the file upon entering/leaving this function. If the program
+ // somehow exits before this function has a chance to return, the
+ // premature-exit file will be left undeleted, causing a test runner
+ // that understands the premature-exit-file protocol to report the
+ // test as having failed.
+ // A null path (death-test child) makes the helper a no-op.
+ const internal::ScopedPrematureExitFile premature_exit_file(
+ in_death_test_child_process
+ ? nullptr
+ : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+ // Captures the value of GTEST_FLAG(catch_exceptions). This value will be
+ // used for the duration of the program.
+ impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
+
+#if GTEST_OS_WINDOWS
+ // Either the user wants Google Test to catch exceptions thrown by the
+ // tests or this is executing in the context of death test child
+ // process. In either case the user does not want to see pop-up dialogs
+ // about crashes - they are expected.
+ if (impl()->catch_exceptions() || in_death_test_child_process) {
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ // SetErrorMode doesn't exist on CE.
+ SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+ SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+#endif // !GTEST_OS_WINDOWS_MOBILE
+
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+ // Death test children can be terminated with _abort(). On Windows,
+ // _abort() can show a dialog with a warning message. This forces the
+ // abort message to go to stderr instead.
+ _set_error_mode(_OUT_TO_STDERR);
+#endif
+
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+ // In the debug version, Visual Studio pops up a separate dialog
+ // offering a choice to debug the aborted program. We need to suppress
+ // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+ // executed. Google Test will notify the user of any unexpected
+ // failure via stderr.
+ if (!GTEST_FLAG_GET(break_on_failure))
+ _set_abort_behavior(
+ 0x0, // Clear the following flags:
+ _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump.
+
+ // In debug mode, the Windows CRT can crash with an assertion over invalid
+ // input (e.g. passing an invalid file descriptor). The default handling
+ // for these assertions is to pop up a dialog and wait for user input.
+ // Instead ask the CRT to dump such assertions to stderr non-interactively.
+ if (!IsDebuggerPresent()) {
+ (void)_CrtSetReportMode(_CRT_ASSERT,
+ _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+ (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
+ }
+#endif
+ }
+#endif // GTEST_OS_WINDOWS
+
+ // Run UnitTestImpl::RunAllTests through the exception-handling wrapper and
+ // map its boolean result to the return value: true -> 0, false -> 1.
+ return internal::HandleExceptionsInMethodIfSupported(
+ impl(), &internal::UnitTestImpl::RunAllTests,
+ "auxiliary test code (environments or event listeners)")
+ ? 0
+ : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  // The pointer refers to storage held by the implementation object.
+  const char* const dir = impl_->original_working_dir_.c_str();
+  return dir;
+}
+
+// Returns the TestSuite object for the test that's currently running,
+// or NULL if no test is running.
+const TestSuite* UnitTest::current_test_suite() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  // Read the pointer while holding mutex_.
+  internal::MutexLock lock(&mutex_);
+  const TestSuite* const suite = impl_->current_test_suite();
+  return suite;
+}
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  // Legacy name; reads the current test suite while holding mutex_.
+  internal::MutexLock lock(&mutex_);
+  const TestCase* const test_case = impl_->current_test_suite();
+  return test_case;
+}
+#endif
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  // Read the pointer while holding mutex_.
+  internal::MutexLock lock(&mutex_);
+  const TestInfo* const info = impl_->current_test_info();
+  return info;
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const {
+  const int seed = impl_->random_seed();
+  return seed;
+}
+
+// Returns ParameterizedTestSuiteRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestSuiteRegistry&
+UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
+  // The registry itself is owned by the implementation object.
+  internal::ParameterizedTestSuiteRegistry& registry =
+      impl_->parameterized_test_registry();
+  return registry;
+}
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  // The implementation object receives a back-pointer to this UnitTest.
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  // Releases the implementation object allocated in the constructor.
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  // The trace stack is modified only while mutex_ is held.
+  internal::MutexLock lock(&mutex_);
+  auto& trace_stack = impl_->gtest_trace_stack();
+  trace_stack.push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+  // The trace stack is modified only while mutex_ is held.
+  internal::MutexLock lock(&mutex_);
+  auto& trace_stack = impl_->gtest_trace_stack();
+  trace_stack.pop_back();
+}
+
+namespace internal {
+
+// Initializes every member to its "nothing has run yet" value and installs
+// PrettyUnitTestResultPrinter as the default result printer.
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+ : parent_(parent),
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+ default_global_test_part_result_reporter_(this),
+ default_per_thread_test_part_result_reporter_(this),
+ GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
+ &default_global_test_part_result_reporter_),
+ per_thread_test_part_result_reporter_(
+ &default_per_thread_test_part_result_reporter_),
+ parameterized_test_registry_(),
+ parameterized_tests_registered_(false),
+ last_death_test_suite_(-1),
+ current_test_suite_(nullptr),
+ current_test_info_(nullptr),
+ ad_hoc_test_result_(),
+ os_stack_trace_getter_(nullptr),
+ post_flag_parse_init_performed_(false),
+ random_seed_(0), // Will be overridden by the flag before first use.
+ random_(0), // Will be reseeded before first use.
+ start_timestamp_(0),
+ elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+ death_test_factory_(new DefaultDeathTestFactory),
+#endif
+ // Will be overridden by the flag before first use.
+ catch_exceptions_(false) {
+ // The console printer is installed immediately; PostFlagParsingInit() may
+ // later replace it when the brief flag is set.
+ listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+ // Deletes every TestSuite.
+ ForEach(test_suites_, internal::Delete<TestSuite>);
+
+ // Deletes every Environment.
+ ForEach(environments_, internal::Delete<Environment>);
+
+ // Initialized to nullptr in the constructor, so this is a no-op if no
+ // getter was ever installed.
+ delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test suite's ad_hoc_test_result when invoke
+// from SetUpTestSuite/TearDownTestSuite, or to the global property set
+// otherwise. If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  // Pick the innermost active scope: a running test first, then a running
+  // test suite, and finally the program-wide ad hoc result.
+  if (current_test_info_ != nullptr) {
+    current_test_info_->result_.RecordProperty("testcase", test_property);
+    return;
+  }
+  if (current_test_suite_ != nullptr) {
+    current_test_suite_->ad_hoc_test_result_.RecordProperty("testsuite",
+                                                            test_property);
+    return;
+  }
+  ad_hoc_test_result_.RecordProperty("testsuites", test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  // A non-null flag object marks this process as a death test subprocess.
+  const bool in_subprocess = internal_run_death_test_flag_.get() != nullptr;
+  if (in_subprocess) {
+    listeners()->SuppressEventForwarding();
+  }
+}
+#endif // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  // No output format requested: nothing to configure and nothing to warn about.
+  if (output_format.empty()) return;
+
+  if (output_format == "xml") {
+    const auto output_path = UnitTestOptions::GetAbsolutePathToOutputFile();
+    listeners()->SetDefaultXmlGenerator(
+        new XmlUnitTestResultPrinter(output_path.c_str()));
+    return;
+  }
+  if (output_format == "json") {
+    // The JSON printer occupies the same "default XML generator" slot.
+    const auto output_path = UnitTestOptions::GetAbsolutePathToOutputFile();
+    listeners()->SetDefaultXmlGenerator(
+        new JsonUnitTestResultPrinter(output_path.c_str()));
+    return;
+  }
+
+  GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                      << output_format << "\" ignored.";
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG_GET(stream_result_to);
+  // Streaming is off unless a target was given.
+  if (target.empty()) return;
+
+  // The target must look like "host:port"; split at the first colon.
+  const size_t colon = target.find(':');
+  if (colon == std::string::npos) {
+    GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                        << "\" ignored.";
+    return;
+  }
+
+  listeners()->Append(new StreamingListener(target.substr(0, colon),
+                                            target.substr(colon + 1)));
+}
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests. Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Idempotence guard: every call after the first one returns immediately.
+  if (post_flag_parse_init_performed_) return;
+  post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+  // Register to send notifications about key process state changes.
+  listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+  InitDeathTestSubprocessControlInfo();
+  SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Registers parameterized tests. This makes parameterized tests
+  // available to the UnitTest reflection API without running
+  // RUN_ALL_TESTS.
+  RegisterParameterizedTests();
+
+  // Configures listeners for XML output. This makes it possible for users
+  // to shut down the default XML output before invoking RUN_ALL_TESTS.
+  ConfigureXmlOutput();
+
+  // The brief flag swaps the default console printer for the brief one.
+  if (GTEST_FLAG_GET(brief)) {
+    listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
+  }
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Configures listeners for streaming test results to the specified server.
+  ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+  if (GTEST_FLAG_GET(install_failure_signal_handler)) {
+    absl::FailureSignalHandlerOptions options;
+    absl::InstallFailureSignalHandler(options);
+  }
+#endif  // GTEST_HAS_ABSL
+}
+
+// A predicate that checks the name of a TestSuite against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only. It is
+// defined in namespace internal, next to UnitTestImpl, rather than in an
+// anonymous namespace.
+//
+// TestSuiteNameIs is copyable.
+class TestSuiteNameIs {
+ public:
+ // Constructor.
+ explicit TestSuiteNameIs(const std::string& name) : name_(name) {}
+
+ // Returns true if and only if the name of test_suite matches name_.
+ // A null test_suite never matches.
+ bool operator()(const TestSuite* test_suite) const {
+ return test_suite != nullptr &&
+ strcmp(test_suite->name(), name_.c_str()) == 0;
+ }
+
+ private:
+ // Name to compare against; stored by value.
+ std::string name_;
+};
+
+// Finds and returns a TestSuite with the given name. If one doesn't
+// exist, creates one and returns it. It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+// test_suite_name: name of the test suite
+// type_param: the name of the test suite's type parameter, or NULL if
+// this is not a typed or a type-parameterized test suite.
+// set_up_tc: pointer to the function that sets up the test suite
+// tear_down_tc: pointer to the function that tears down the test suite
+TestSuite* UnitTestImpl::GetTestSuite(
+    const char* test_suite_name, const char* type_param,
+    internal::SetUpTestSuiteFunc set_up_tc,
+    internal::TearDownTestSuiteFunc tear_down_tc) {
+  // Search from the back of the list for a suite already registered under
+  // this name.
+  const auto existing =
+      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
+                   TestSuiteNameIs(test_suite_name));
+  if (existing != test_suites_.rend()) {
+    return *existing;
+  }
+
+  // Not found: create the suite and decide where it goes in the list.
+  auto* const created =
+      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+
+  const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
+  if (!death_test_suite_filter.MatchesName(test_suite_name)) {
+    // Ordinary suite: appended to the end of the list.
+    test_suites_.push_back(created);
+  } else {
+    // Death test suite: inserted after the last death test suite
+    // defined so far. This only works when the test suites haven't
+    // been shuffled. Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_suite_;
+    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
+                        created);
+  }
+
+  // Each new suite gets the next sequential entry in the index table.
+  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
+  return created;
+}
+
+// Helpers for setting up / tearing down the given environment. They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) {
+  // Unary adapter so ForEach() can invoke the member function.
+  env->SetUp();
+}
+static void TearDownEnvironment(Environment* env) {
+  // Unary adapter so ForEach() can invoke the member function.
+  env->TearDown();
+}
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful. If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+ // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
+ // called.
+ const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
+
+ // Do not run any test if the --help flag was specified.
+ if (g_help_flag) return true;
+
+ // Repeats the call to the post-flag parsing initialization in case the
+ // user didn't call InitGoogleTest.
+ PostFlagParsingInit();
+
+ // Even if sharding is not on, test runners may want to use the
+ // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+ // protocol.
+ internal::WriteToShardStatusFileIfNeeded();
+
+ // True if and only if we are in a subprocess for running a thread-safe-style
+ // death test.
+ bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+ in_subprocess_for_death_test =
+ (internal_run_death_test_flag_.get() != nullptr);
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+ if (in_subprocess_for_death_test) {
+ GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+ }
+#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif // GTEST_HAS_DEATH_TEST
+
+ const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+ in_subprocess_for_death_test);
+
+ // Compares the full test names with the filter to decide which
+ // tests to run.
+ const bool has_tests_to_run =
+ FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+ : IGNORE_SHARDING_PROTOCOL) > 0;
+
+ // Lists the tests and exits if the --gtest_list_tests flag was specified.
+ if (GTEST_FLAG_GET(list_tests)) {
+ // This must be called *after* FilterTests() has been called.
+ ListTestsMatchingFilter();
+ return true;
+ }
+
+ random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
+
+ // True if and only if at least one test has failed.
+ bool failed = false;
+
+ TestEventListener* repeater = listeners()->repeater();
+
+ start_timestamp_ = GetTimeInMillis();
+ repeater->OnTestProgramStart(*parent_);
+
+ // How many times to repeat the tests? We don't want to repeat them
+ // when we are inside the subprocess of a death test.
+ const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
+ // Repeats forever if the repeat count is negative.
+ const bool gtest_repeat_forever = repeat < 0;
+
+ // Should test environments be set up and torn down for each repeat, or only
+ // set up on the first and torn down on the last iteration? If there is no
+ // "last" iteration because the tests will repeat forever, always recreate the
+ // environments to avoid leaks in case one of the environments is using
+ // resources that are external to this process. Without this check there would
+ // be no way to clean up those external resources automatically.
+ const bool recreate_environments_when_repeating =
+ GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+ gtest_repeat_forever;
+
+ for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
+ // We want to preserve failures generated by ad-hoc test
+ // assertions executed before RUN_ALL_TESTS().
+ ClearNonAdHocTestResult();
+
+ Timer timer;
+
+ // Shuffles test suites and tests if requested.
+ if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
+ random()->Reseed(static_cast<uint32_t>(random_seed_));
+ // This should be done before calling OnTestIterationStart(),
+ // such that a test event listener can see the actual test order
+ // in the event.
+ ShuffleTests();
+ }
+
+ // Tells the unit test event listeners that the tests are about to start.
+ repeater->OnTestIterationStart(*parent_, i);
+
+ // Runs each test suite if there is at least one test to run.
+ if (has_tests_to_run) {
+ // Sets up all environments beforehand. If test environments aren't
+ // recreated for each iteration, only do so on the first iteration.
+ if (i == 0 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsSetUpStart(*parent_);
+ ForEach(environments_, SetUpEnvironment);
+ repeater->OnEnvironmentsSetUpEnd(*parent_);
+ }
+
+ // Runs the tests only if there was no fatal failure or skip triggered
+ // during global set-up.
+ if (Test::IsSkipped()) {
+ // Emit diagnostics when global set-up calls skip, as it will not be
+ // emitted by default.
+ TestResult& test_result =
+ *internal::GetUnitTestImpl()->current_test_result();
+ for (int j = 0; j < test_result.total_part_count(); ++j) {
+ const TestPartResult& test_part_result =
+ test_result.GetTestPartResult(j);
+ if (test_part_result.type() == TestPartResult::kSkip) {
+ const std::string& result = test_part_result.message();
+ printf("%s\n", result.c_str());
+ }
+ }
+ fflush(stdout);
+ } else if (!Test::HasFatalFailure()) {
+ for (int test_index = 0; test_index < total_test_suite_count();
+ test_index++) {
+ GetMutableSuiteCase(test_index)->Run();
+ if (GTEST_FLAG_GET(fail_fast) &&
+ GetMutableSuiteCase(test_index)->Failed()) {
+ for (int j = test_index + 1; j < total_test_suite_count(); j++) {
+ GetMutableSuiteCase(j)->Skip();
+ }
+ break;
+ }
+ }
+ } else if (Test::HasFatalFailure()) {
+ // If there was a fatal failure during the global setup then we know we
+ // aren't going to run any tests. Explicitly mark all of the tests as
+ // skipped to make this obvious in the output.
+ for (int test_index = 0; test_index < total_test_suite_count();
+ test_index++) {
+ GetMutableSuiteCase(test_index)->Skip();
+ }
+ }
+
+ // Tears down all environments in reverse order afterwards. If test
+ // environments aren't recreated for each iteration, only do so on the
+ // last iteration.
+ if (i == repeat - 1 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsTearDownStart(*parent_);
+ std::for_each(environments_.rbegin(), environments_.rend(),
+ TearDownEnvironment);
+ repeater->OnEnvironmentsTearDownEnd(*parent_);
+ }
+ }
+
+ elapsed_time_ = timer.Elapsed();
+
+ // Tells the unit test event listener that the tests have just finished.
+ repeater->OnTestIterationEnd(*parent_, i);
+
+ // Gets the result and clears it.
+ if (!Passed()) {
+ failed = true;
+ }
+
+ // Restores the original test order after the iteration. This
+ // allows the user to quickly repro a failure that happens in the
+ // N-th iteration without repeating the first (N - 1) iterations.
+ // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+ // case the user somehow changes the value of the flag somewhere
+ // (it's always safe to unshuffle the tests).
+ UnshuffleTests();
+
+ if (GTEST_FLAG_GET(shuffle)) {
+ // Picks a new random seed for each iteration.
+ random_seed_ = GetNextRandomSeed(random_seed_);
+ }
+ }
+
+ repeater->OnTestProgramEnd(*parent_);
+
+ if (!gtest_is_initialized_before_run_all_tests) {
+ ColoredPrintf(
+ GTestColor::kRed,
+ "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+ "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+ "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+ " will start to enforce the valid usage. "
+ "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT
+#if GTEST_FOR_GOOGLE_
+ ColoredPrintf(GTestColor::kRed,
+ "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif // GTEST_FOR_GOOGLE_
+ }
+
+ return !failed;
+}
+
+// Creates (or overwrites) the shard status file named by the
+// GTEST_SHARD_STATUS_FILE environment variable. Nothing happens when the
+// variable is unset. When it is set, the file is opened in "w" mode and
+// closed again right away; if it cannot be opened, an error is printed and
+// the process exits with EXIT_FAILURE.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const status_path = posix::GetEnv(kTestShardStatusFile);
+  if (status_path == nullptr) return;
+
+  FILE* const status_file = posix::FOpen(status_path, "w");
+  if (status_file != nullptr) {
+    // Creating the file is the whole point; nothing is written to it here.
+    fclose(status_file);
+    return;
+  }
+
+  ColoredPrintf(GTestColor::kRed,
+                "Could not write to the test shard status file \"%s\" "
+                "specified by the %s environment variable.\n",
+                status_path, kTestShardStatusFile);
+  fflush(stdout);
+  exit(EXIT_FAILURE);
+}
+
+// Decides whether test sharding is in effect, based on the two environment
+// variables whose names are passed in.
+//
+// Returns false when in_subprocess_for_death_test is true (sharding must only
+// be applied to the original test process; otherwise death tests we intended
+// to execute could be filtered out) and when neither variable yields a value.
+// A variable that is unset, or explicitly set to -1, counts as "no value".
+// If exactly one variable has a value, or the pair violates
+// 0 <= shard_index < total_shards, prints an error and exits with
+// EXIT_FAILURE. Otherwise returns true if and only if total_shards > 1.
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) return false;
+
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+  const bool has_total = total_shards != -1;
+  const bool has_index = shard_index != -1;
+
+  if (!has_total && !has_index) return false;
+
+  const bool is_consistent = has_total && has_index && shard_index >= 0 &&
+                             shard_index < total_shards;
+  if (!is_consistent) {
+    Message diagnostic;
+    if (!has_total) {
+      // Only the shard index was provided.
+      diagnostic << "Invalid environment variables: you have "
+                 << kTestShardIndex << " = " << shard_index
+                 << ", but have left " << kTestTotalShards << " unset.\n";
+    } else if (!has_index) {
+      // Only the shard count was provided.
+      diagnostic << "Invalid environment variables: you have "
+                 << kTestTotalShards << " = " << total_shards
+                 << ", but have left " << kTestShardIndex << " unset.\n";
+    } else {
+      // Both were provided, but the index is out of range.
+      diagnostic << "Invalid environment variables: we require 0 <= "
+                 << kTestShardIndex << " < " << kTestTotalShards
+                 << ", but you have " << kTestShardIndex << "=" << shard_index
+                 << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    }
+    ColoredPrintf(GTestColor::kRed, "%s", diagnostic.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Reads the environment variable named var and parses it with ParseInt32().
+// Returns default_val when the variable is unset. When the variable is set
+// but ParseInt32() rejects its value, the process exits with EXIT_FAILURE.
+int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) {
+  const char* const raw_value = posix::GetEnv(var);
+  if (raw_value == nullptr) return default_val;
+
+  int32_t parsed;
+  const bool parsed_ok = ParseInt32(
+      Message() << "The value of environment variable " << var, raw_value,
+      &parsed);
+  if (!parsed_ok) exit(EXIT_FAILURE);
+
+  return parsed;
+}
+
+// Tells whether the test identified by test_id belongs to the shard
+// shard_index out of total_shards shards. Tests are dealt out round-robin:
+// a test runs on the shard whose index equals test_id modulo total_shards.
+// The test id is an arbitrary but unique non-negative integer assigned to
+// each test method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  const int owning_shard = test_id % total_shards;
+  return owning_shard == shard_index;
+}
+
+// Matches every registered test against the user-specified filter and the
+// disabled-test filter, and records the outcome (is_disabled_,
+// matches_filter_, is_in_another_shard_, should_run_) in each TestInfo and
+// the should-run state of each TestSuite.
+// If shard_tests == HONOR_SHARDING_PROTOCOL, tests are additionally
+// partitioned according to the sharding variables in the environment - see
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  int32_t total_shards = -1;
+  int32_t shard_index = -1;
+  if (shard_tests == HONOR_SHARDING_PROTOCOL) {
+    total_shards = Int32FromEnvOrDie(kTestTotalShards, -1);
+    shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+  }
+
+  const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+      GTEST_FLAG_GET(filter));
+  const UnitTestFilter disable_test_filter(kDisableTestFilter);
+
+  // runnable_count: tests that will run across all shards (i.e., match the
+  // filter and are not disabled). selected_count: tests to be run on this
+  // shard.
+  int runnable_count = 0;
+  int selected_count = 0;
+  for (auto* test_suite : test_suites_) {
+    const std::string& suite_name = test_suite->name();
+    test_suite->set_should_run(false);
+
+    for (TestInfo* const test_info : test_suite->test_info_list()) {
+      const std::string test_name(test_info->name());
+
+      // A test is disabled if either its suite name or its own name matches
+      // kDisableTestFilter.
+      const bool is_disabled = disable_test_filter.MatchesName(suite_name) ||
+                               disable_test_filter.MatchesName(test_name);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          gtest_flag_filter.MatchesTest(suite_name, test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          matches_filter &&
+          (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled);
+
+      // The shard is chosen from the count of runnable tests seen so far, so
+      // this must be evaluated before runnable_count is bumped below.
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, runnable_count);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+
+      const bool is_selected = is_runnable && !is_in_another_shard;
+      if (is_runnable) ++runnable_count;
+      if (is_selected) ++selected_count;
+
+      test_info->should_run_ = is_selected;
+      test_suite->set_should_run(test_suite->should_run() || is_selected);
+    }
+  }
+  return selected_count;
+}
+
+// Prints the C-string str to stdout on a single line: every '\n' character
+// is written as the two characters "\\n" (and counted as two). Once
+// max_length characters have been written and input remains, prints "..."
+// and stops. A null str prints nothing.
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str == nullptr) return;
+
+  int printed = 0;
+  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
+    if (printed >= max_length) {
+      printf("...");
+      return;
+    }
+    if (*cursor == '\n') {
+      printf("\\n");
+      printed += 2;
+    } else {
+      printf("%c", *cursor);
+      printed += 1;
+    }
+  }
+}
+
+// Prints to stdout the names of the tests whose matches_filter_ is set,
+// grouped under their test suite. If the output format is "xml" or "json",
+// the test list is also written to the output file in that format.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Upper bound on the characters printed for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (auto* test_suite : test_suites_) {
+    // The suite line is emitted lazily, just before its first matching test.
+    bool suite_header_printed = false;
+
+    for (const TestInfo* const test_info : test_suite->test_info_list()) {
+      if (!test_info->matches_filter_) continue;
+
+      if (!suite_header_printed) {
+        suite_header_printed = true;
+        printf("%s.", test_suite->name());
+        if (test_suite->type_param() != nullptr) {
+          printf(" # %s = ", kTypeParamLabel);
+          // Keep the type parameter on one line so that the output stays
+          // easy to parse by a program.
+          PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+
+      printf(" %s", test_info->name());
+      if (test_info->value_param() != nullptr) {
+        printf(" # %s = ", kValueParamLabel);
+        // Same single-line treatment for the value parameter.
+        PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+      }
+      printf("\n");
+    }
+  }
+  fflush(stdout);
+
+  const std::string output_format = UnitTestOptions::GetOutputFormat();
+  const bool wants_xml = output_format == "xml";
+  const bool wants_json = output_format == "json";
+  if (!wants_xml && !wants_json) return;
+
+  FILE* fileout = OpenFileForWriting(
+      UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+  std::stringstream stream;
+  if (wants_xml) {
+    XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+        .PrintXmlTestsList(&stream, test_suites_);
+  } else {
+    JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+        .PrintJsonTestList(&stream, test_suites_);
+  }
+  fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+  fclose(fileout);
+}
+
+// Installs getter as the current OS stack trace getter.
+//
+// When getter already is the current getter this is a no-op; otherwise the
+// previous getter is deleted before getter is stored in its place.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ == getter) return;
+
+  delete os_stack_trace_getter_;
+  os_stack_trace_getter_ = getter;
+}
+
+// Returns the current OS stack trace getter, creating one first if none has
+// been set: a GTEST_OS_STACK_TRACE_GETTER_ when that macro is defined, an
+// OsStackTraceGetter otherwise. The new object becomes the current getter.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ != nullptr) return os_stack_trace_getter_;
+
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+  os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+  os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  return os_stack_trace_getter_;
+}
+
+// Returns the most specific TestResult currently available: the result of
+// the current test if there is one, else the ad hoc result of the current
+// test suite if there is one, else this object's own ad hoc result.
+TestResult* UnitTestImpl::current_test_result() {
+  if (current_test_info_ != nullptr) return &current_test_info_->result_;
+
+  return current_test_suite_ != nullptr
+             ? &current_test_suite_->ad_hoc_test_result_
+             : &ad_hoc_test_result_;
+}
+
+// Shuffles the test suite order and the tests inside each suite. Death test
+// suites and non-death test suites are shuffled as two separate ranges, so
+// that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  const int first_non_death_suite = last_death_test_suite_ + 1;
+  const int suite_count = static_cast<int>(test_suites_.size());
+
+  // Death test suites occupy indices [0, first_non_death_suite).
+  ShuffleRange(random(), 0, first_non_death_suite, &test_suite_indices_);
+
+  // All remaining suites occupy [first_non_death_suite, suite_count).
+  ShuffleRange(random(), first_non_death_suite, suite_count,
+               &test_suite_indices_);
+
+  // Finally shuffle the tests within every suite.
+  for (auto* suite : test_suites_) {
+    suite->ShuffleTests(random());
+  }
+}
+
+// Undoes any shuffling: every entry of test_suite_indices_ is reset to its
+// own position, and each test suite is asked to unshuffle its tests.
+void UnitTestImpl::UnshuffleTests() {
+  const size_t suite_count = test_suites_.size();
+  for (size_t index = 0; index != suite_count; ++index) {
+    // Identity mapping: suite at position index is run index-th again.
+    test_suite_indices_[index] = static_cast<int>(index);
+    test_suites_[index]->UnshuffleTests();
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The gtest_stack_trace_depth flag caps how many stack frames are included.
+// skip_count is the number of top frames to leave out; skipped frames do not
+// count against that cap.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
+  // One extra frame is skipped to hide this wrapper function itself, on top
+  // of what the caller asked for.
+  const int frames_to_skip = skip_count + 1;
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(frames_to_skip);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+// Exception type that AlwaysTrue() nominally throws; has no members.
+class ClassUniqueToAlwaysTrue {};
+}  // namespace
+
+// Returns its argument unchanged.
+bool IsTrue(bool condition) {
+  return condition;
+}
+
+// Always returns true.
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // The throw is guarded by a condition that is always false, so it never
+  // executes; it only makes the compiler think this function may throw.
+  if (IsTrue(false)) {
+    throw ClassUniqueToAlwaysTrue();
+  }
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// Checks whether *pstr begins with prefix. If so, advances *pstr to the
+// first character after the prefix and returns true; if not, returns false
+// and leaves *pstr untouched. None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  const bool has_prefix = strncmp(*pstr, prefix, prefix_len) == 0;
+  if (has_prefix) {
+    *pstr += prefix_len;
+  }
+  return has_prefix;
+}
+
+// Extracts the value part of a command line flag. The string must start
+// with "--", then GTEST_FLAG_PREFIX_, then flag_name, followed by "=value".
+// When def_optional is true, the "=value" part can be omitted.
+//
+// Returns a pointer to the value inside str (to the terminating '\0' when
+// the value was legitimately omitted), or NULL if the parsing failed or
+// either argument is NULL.
+static const char* ParseFlagValue(const char* str, const char* flag_name,
+                                  bool def_optional) {
+  if (str == nullptr || flag_name == nullptr) return nullptr;
+
+  // Full spelling of the flag as expected on the command line.
+  std::string expected("--");
+  expected += GTEST_FLAG_PREFIX_;
+  expected += flag_name;
+  const size_t expected_len = expected.length();
+  if (strncmp(str, expected.c_str(), expected_len) != 0) return nullptr;
+
+  // First character after the flag name.
+  const char* const after_name = str + expected_len;
+
+  // Nothing follows the name: acceptable only when the value is optional.
+  if (*after_name == '\0') {
+    return def_optional ? after_name : nullptr;
+  }
+
+  // Something follows the name: it has to be '=' introducing the value.
+  return *after_name == '=' ? after_name + 1 : nullptr;
+}
+
+// Parses a bool flag given as either "--flag=value" or just "--flag".
+//
+// With an explicit value, the flag is false when the value starts with '0',
+// 'f', or 'F', and true for anything else. Without a value, it is true.
+//
+// On success, stores the result in *value and returns true. On failure,
+// returns false without changing *value.
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
+  // The "=value" part is optional for bool flags.
+  const char* const value_str = ParseFlagValue(str, flag_name, true);
+  if (value_str == nullptr) return false;
+
+  // Only the first character of the value decides the outcome.
+  const char first = *value_str;
+  *value = first != '0' && first != 'f' && first != 'F';
+  return true;
+}
+
+// Parses an int32_t flag given in the form "--flag=value".
+//
+// Returns false if str is not this flag (the "=value" part is mandatory);
+// otherwise hands the value text to ParseInt32(), which writes *value, and
+// returns ParseInt32()'s result.
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
+  const char* const value_str = ParseFlagValue(str, flag_name, false);
+
+  return value_str != nullptr &&
+         ParseInt32(Message() << "The value of flag --" << flag_name,
+                    value_str, value);
+}
+
+// Parses a string flag given in the form "--flag=value".
+//
+// On success, assigns the value text to *value and returns true. On
+// failure, returns false without changing *value.
+template <typename String>
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
+  // The "=value" part is mandatory for string flags.
+  if (const char* const value_str = ParseFlagValue(str, flag_name, false)) {
+    *value = value_str;
+    return true;
+  }
+  return false;
+}
+
+// Determines whether a string looks like a Google Test flag: a switch marker
+// ("--", "-" or "/") followed by GTEST_FLAG_PREFIX_ or
+// GTEST_FLAG_PREFIX_DASH_. If Google Test detects that a command line flag
+// has its prefix but is not recognized, it will print its help message.
+// Flags starting with GTEST_FLAG_PREFIX_ followed by "internal_" are
+// considered Google Test internal flags; this function returns false for
+// them so that they do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  // Each successful SkipPrefix() advances str, so the checks below apply to
+  // whatever follows the part already matched.
+  const bool has_switch_marker = SkipPrefix("--", &str) ||
+                                 SkipPrefix("-", &str) ||
+                                 SkipPrefix("/", &str);
+  if (!has_switch_marker) return false;
+
+  if (SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str)) return false;
+
+  return SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+         SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str);
+}
+
+// Prints a string containing color-encoded text. The following escape
+// sequences can be used in the string to control the text color:
+//
+// @@ prints a single '@' character.
+// @R changes the color to red.
+// @G changes the color to green.
+// @Y changes the color to yellow.
+// @D changes to the default terminal text color.
+//
+// An '@' followed by any other character is dropped; printing resumes with
+// that following character.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = GTestColor::kDefault;  // The current color.
+
+  // The text is consumed as a series of segments separated by escape
+  // sequences; segment always points at the start of the next one.
+  const char* segment = str;
+  for (;;) {
+    const char* const at_sign = strchr(segment, '@');
+    if (at_sign == nullptr) {
+      // No escape left: the rest of the text is the final segment.
+      ColoredPrintf(color, "%s", segment);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(segment, at_sign).c_str());
+
+    // Assume a two-character escape; the default case takes one back.
+    segment = at_sign + 2;
+    switch (at_sign[1]) {
+      case '@':
+        ColoredPrintf(color, "@");
+        break;
+      case 'D':
+        color = GTestColor::kDefault;
+        break;
+      case 'R':
+        color = GTestColor::kRed;
+        break;
+      case 'G':
+        color = GTestColor::kGreen;
+        break;
+      case 'Y':
+        color = GTestColor::kYellow;
+        break;
+      default:
+        // Not a known escape: continue right after the '@'.
+        --segment;
+        break;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =  // --help text; "@X" pairs are color escapes handled by PrintColorEncoded().
+ "This program contains tests written using " GTEST_NAME_
+ ". You can use the\n"
+ "following command line flags to control its behavior:\n"
+ "\n"
+ "Test Selection:\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "list_tests@D\n"
+ " List the names of all tests instead of running them. The name of\n"
+ " TEST(Foo, Bar) is \"Foo.Bar\".\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "filter=@YPOSITIVE_PATTERNS"
+ "[@G-@YNEGATIVE_PATTERNS]@D\n"
+ " Run only the tests whose name matches one of the positive patterns "
+ "but\n"
+ " none of the negative patterns. '?' matches any single character; "
+ "'*'\n"
+ " matches any substring; ':' separates two patterns.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "also_run_disabled_tests@D\n"
+ " Run all disabled tests too.\n"
+ "\n"
+ "Test Execution:\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "repeat=@Y[COUNT]@D\n"
+ " Run the tests repeatedly; use a negative count to repeat forever.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "shuffle@D\n"
+ " Randomize tests' orders on every iteration.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "random_seed=@Y[NUMBER]@D\n"
+ " Random number seed to use for shuffling test orders (between 1 and\n"
+ " 99999, or 0 to use a seed based on the current time).\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "recreate_environments_when_repeating@D\n"
+ " Sets up and tears down the global test environment on each repeat\n"
+ " of the test.\n"
+ "\n"
+ "Test Output:\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+ " Enable/disable colored output. The default is @Gauto@D.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "brief=1@D\n"
+ " Only print test failures.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "print_time=0@D\n"
+ " Don't print the elapsed time of each test.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+ "@Y|@G:@YFILE_PATH]@D\n"
+ " Generate a JSON or XML report in the given directory or with the "
+ "given\n"
+ " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+ " @G--" GTEST_FLAG_PREFIX_
+ "stream_result_to=@YHOST@G:@YPORT@D\n"
+ " Stream test results to the given server.\n"
+#endif // GTEST_CAN_STREAM_RESULTS_
+ "\n"
+ "Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+ " @G--" GTEST_FLAG_PREFIX_
+ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+ " Set the default death test style.\n"
+#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+ " @G--" GTEST_FLAG_PREFIX_
+ "break_on_failure@D\n"
+ " Turn assertion failures into debugger break-points.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "throw_on_failure@D\n"
+ " Turn assertion failures into C++ exceptions for use by an external\n"
+ " test framework.\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "catch_exceptions=0@D\n"
+ " Do not report exceptions as test failures. Instead, allow them\n"
+ " to crash the program or throw a pop-up (on Windows).\n"
+ "\n"
+ "Except for @G--" GTEST_FLAG_PREFIX_
+ "list_tests@D, you can alternatively set "
+ "the corresponding\n"
+ "environment variable of a flag (all letters in upper-case). For example, "
+ "to\n"
+ "disable colored text output, you can either specify "
+ "@G--" GTEST_FLAG_PREFIX_
+ "color=no@D or set\n"
+ "the @G" GTEST_FLAG_PREFIX_UPPER_
+ "COLOR@D environment variable to @Gno@D.\n"
+ "\n"
+ "For more information, please read the " GTEST_NAME_
+ " documentation at\n"
+ "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_
+ "\n"
+ "(not one in your own code or tests), please report it to\n"
+ "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Tries to interpret arg as one of the Google Test flags listed below. For
+// the first flag that parses, stores the new value with GTEST_FLAG_SET and
+// returns true. Returns false when arg is none of these flags.
+static bool ParseGoogleTestFlag(const char* const arg) {
+// Seeds a local with the flag's current value (its type, deduced by auto,
+// selects the ParseFlag overload) and commits it only if parsing succeeds.
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name)         \
+  do {                                               \
+    auto parsed_value = GTEST_FLAG_GET(flag_name);   \
+    if (ParseFlag(arg, #flag_name, &parsed_value)) { \
+      GTEST_FLAG_SET(flag_name, parsed_value);       \
+      return true;                                   \
+    }                                                \
+  } while (false)
+
+  // Each line returns true from this function as soon as its flag matches.
+  GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+  GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+  GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+  GTEST_INTERNAL_PARSE_FLAG(color);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+  GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+  GTEST_INTERNAL_PARSE_FLAG(filter);
+  GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+  GTEST_INTERNAL_PARSE_FLAG(list_tests);
+  GTEST_INTERNAL_PARSE_FLAG(output);
+  GTEST_INTERNAL_PARSE_FLAG(brief);
+  GTEST_INTERNAL_PARSE_FLAG(print_time);
+  GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+  GTEST_INTERNAL_PARSE_FLAG(random_seed);
+  GTEST_INTERNAL_PARSE_FLAG(repeat);
+  GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+  GTEST_INTERNAL_PARSE_FLAG(shuffle);
+  GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+  GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+  GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+
+  // Not a flag known to this function.
+  return false;
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+// Reads Google Test flags from the file at path, one flag per line, and
+// applies each of them through ParseGoogleTestFlag(). Empty lines are
+// skipped. A non-empty line that is not a recognized flag sets g_help_flag.
+// Failure to open the file is reported through GTEST_LOG_(FATAL).
+static void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    // Report the path that actually failed to open, rather than re-reading
+    // the flagfile flag and relying on the caller having set it first.
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << path << "\"";
+    // GTEST_LOG_(FATAL) normally terminates the program. Should a
+    // customized logger return instead, do not hand a null FILE* to
+    // ReadEntireFile() and FClose() below.
+    return;
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (const std::string& line : lines) {
+    if (line.empty()) continue;
+    // An unrecognized line is treated like an unknown command line flag:
+    // it requests the help message.
+    if (!ParseGoogleTestFlag(line.c_str())) g_help_flag = true;
+  }
+}
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Scans argv[1] .. argv[*argc - 1] for Google Test flags and applies them,
+// without initializing other parts of Google Test. The type parameter
+// CharType can be instantiated to either char or wchar_t.
+//
+// Every argument consumed as a flag is removed from argv and *argc is
+// decremented. "--help", or an unconsumed argument for which
+// HasGoogleTestFlagPrefix() is true, sets g_help_flag. If g_help_flag is set
+// once the scan is done, the help message is printed.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  std::string flagfile_value;
+  int index = 1;
+  while (index < *argc) {
+    const std::string arg_string = StreamableToString(argv[index]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseFlag;
+
+    bool consumed = false;
+    if (ParseGoogleTestFlag(arg)) {
+      consumed = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+      GTEST_FLAG_SET(flagfile, flagfile_value);
+      LoadFlagsFromFile(flagfile_value);
+      consumed = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
+      // Both the help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger the help display.
+      g_help_flag = true;
+    }
+
+    if (!consumed) {
+      ++index;
+      continue;
+    }
+
+    // Close the gap by moving every later element one slot to the left.
+    // argv has (*argc + 1) elements, the last one always being NULL, and
+    // the loop moves that trailing NULL as well.
+    for (int next = index + 1; next <= *argc; ++next) {
+      argv[next - 1] = argv[next];
+    }
+    --(*argc);
+    // index is deliberately not advanced: it now designates the argument
+    // that followed the one just removed.
+  }
+
+  if (g_help_flag) {
+    // The help is printed here instead of in RUN_ALL_TESTS(), as the latter
+    // may not be called at all if the user is using Google Test with
+    // another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test. With GTEST_HAS_ABSL the parsing is delegated
+// to the Abseil flags library; otherwise ParseGoogleTestFlagsOnlyImpl() is
+// used.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+  // absl::ParseCommandLine() requires *argc > 0.
+  if (*argc > 0) {
+    auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+        *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+        absl::flags_internal::UsageFlagsAction::kHandleUsage,
+        absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+    // Any command-line positional arguments not part of any command-line
+    // flag (or arguments to a flag) are copied back out to argv, with the
+    // program invocation name at position 0, and argc is resized. This
+    // includes positional arguments after the flag-terminating delimiter
+    // '--'. See https://abseil.io/docs/cpp/guides/flags.
+    std::copy(positional_args.begin(), positional_args.end(), argv);
+    const int remaining = static_cast<int>(positional_args.size());
+    if (remaining < *argc) {
+      argv[remaining] = nullptr;
+      *argc = remaining;
+    }
+  }
+#else
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
+
+  // Fix the value of *_NSGetArgc() on macOS, but if and only if
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC && !defined(GTEST_OS_IOS)
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+}
+
+// Wide-character overload; always goes through
+// ParseGoogleTestFlagsOnlyImpl().
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// Records the command line in g_argvs, parses the Google Test flags, and
+// runs the post-flag-parsing initialization. Returns without doing anything
+// if GTestIsInitialized() is already true or if *argc <= 0.
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice, and there is
+  // nothing to record for an empty command line.
+  if (GTestIsInitialized() || *argc <= 0) return;
+
+  // Keep a copy of every argument as passed in, before flag parsing removes
+  // any of them from argv.
+  g_argvs.clear();
+  const int arg_count = *argc;
+  for (int index = 0; index < arg_count; ++index) {
+    g_argvs.push_back(StreamableToString(argv[index]));
+  }
+
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+  // When using the Abseil Flags library, set the program usage message to
+  // the help message, but remove the color-encoding from the message first.
+  absl::SetProgramUsageMessage(absl::StrReplaceAll(
+      kColorEncodedHelpMessage,
+      {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
+#endif  // GTEST_HAS_ABSL
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+} // namespace internal
+
+// Entry point for initializing Google Test; call it before RUN_ALL_TESTS().
+// It scans the command line for the flags that Google Test recognizes. Each
+// recognized flag is removed from argv and *argc is decremented to match.
+//
+// Nothing is returned; the parsed values end up in the Google Test flag
+// variables.
+//
+// A second call has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#else  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#endif  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// Wide-character overload, intended for Windows programs built in UNICODE
+// mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+#if !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#else  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#endif  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// Argument-free overload for Arduino/embedded platforms, which have no
+// argc/argv.
+void InitGoogleTest() {
+  // No real command line exists here, so synthesize a one-entry argv.
+  const auto program_name = "dummy";
+  char* fake_argv0 = const_cast<char*>(program_name);
+  char** fake_argv = &fake_argv0;
+  int fake_argc = 1;
+
+#if !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(&fake_argc, fake_argv);
+#else  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&fake_argc, fake_argv);
+#endif  // !defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Looks up |environment_variables| in order and returns the value of the
+// first one that is set to a non-empty string; returns |fallback| when none
+// qualifies. A value taken from the environment always ends in |separator|:
+// it is appended when the variable's value does not already end with it.
+static std::string GetTempDirFromEnv(
+    std::initializer_list<const char*> environment_variables,
+    const char* fallback, char separator) {
+  for (const char* name : environment_variables) {
+    const char* raw = internal::posix::GetEnv(name);
+    // Unset and empty variables are skipped.
+    if (raw == nullptr || raw[0] == '\0') continue;
+
+    std::string dir(raw);
+    if (dir[dir.size() - 1] != separator) {
+      dir.append(1, separator);
+    }
+    return dir;
+  }
+  return std::string(fallback);
+}
+#endif
+
+// Returns the directory to use for temporary files.
+//
+// When GTEST_CUSTOM_TEMPDIR_FUNCTION_ is defined its result is returned as is.
+// Otherwise the directory comes from GetTempDirFromEnv(): TEST_TMPDIR is
+// consulted first, then TEMP (Windows) or TMPDIR (elsewhere), and finally a
+// per-platform hard-coded fallback.
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+ return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
+#elif GTEST_OS_LINUX_ANDROID
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
+#else
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
+}
+
+// Class ScopedTrace
+
+// Records |file|, |line| and |message| as a new entry on the per-thread
+// trace stack that Google Test maintains.
+void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
+  internal::TraceInfo entry;
+  entry.line = line;
+  entry.file = file;
+  // Take over the by-value argument's buffer instead of copying it.
+  message.swap(entry.message);
+
+  UnitTest::GetInstance()->PushGTestTrace(entry);
+}
+
+// Pops the trace entry that was pushed at construction time off Google
+// Test's trace stack.
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ UnitTest::GetInstance()->PopGTestTrace();
+}
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc
new file mode 100644
index 0000000000..44976375c9
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -0,0 +1,53 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdio>
+
+#include "gtest/gtest.h"
+
+// ESP8266/ESP32 builds provide setup()/loop() instead of main(). On ESP8266
+// the two functions are additionally given C linkage.
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+// Initializes Google Test through the argument-free overload.
+void setup() { testing::InitGoogleTest(); }
+
+// Runs all tests; the result of RUN_ALL_TESTS() is discarded.
+void loop() { RUN_ALL_TESTS(); }
+
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
+// Default test-program entry point: prints which file provides main(),
+// initializes Google Test from the command line, and returns the result of
+// RUN_ALL_TESTS() as the process exit code.
+GTEST_API_ int main(int argc, char **argv) {
+ printf("Running main() from %s\n", __FILE__);
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+#endif
diff --git a/third_party/aom/third_party/libwebm/AUTHORS.TXT b/third_party/aom/third_party/libwebm/AUTHORS.TXT
new file mode 100644
index 0000000000..59b648ca68
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/AUTHORS.TXT
@@ -0,0 +1,5 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
+Elijah Cirioli <eli.cirioli@gmail.com>
diff --git a/third_party/aom/third_party/libwebm/Android.mk b/third_party/aom/third_party/libwebm/Android.mk
new file mode 100644
index 0000000000..e6c17df021
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/Android.mk
@@ -0,0 +1,23 @@
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
+LOCAL_PATH:= $(call my-dir)
+
+# Module definition: libwebm, built as a static library (see the
+# BUILD_STATIC_LIBRARY include at the end of this block).
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+# C++11 plus the __STDC_*_MACROS defines.
+LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+# The directory of this makefile is both the module's own include path and
+# the include path exported to modules that depend on it.
+LOCAL_C_INCLUDES:= $(LOCAL_PATH)
+LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
+
+# Source files compiled into the module.
+LOCAL_SRC_FILES:= common/file_util.cc \
+ common/hdr_util.cc \
+ mkvparser/mkvparser.cc \
+ mkvparser/mkvreader.cc \
+ mkvmuxer/mkvmuxer.cc \
+ mkvmuxer/mkvmuxerutil.cc \
+ mkvmuxer/mkvwriter.cc
+# Licensing metadata for the module.
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT
+include $(BUILD_STATIC_LIBRARY)
+endif # NDK_ROOT
diff --git a/third_party/aom/third_party/libwebm/LICENSE.TXT b/third_party/aom/third_party/libwebm/LICENSE.TXT
new file mode 100644
index 0000000000..7a6f99547d
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/third_party/aom/third_party/libwebm/PATENTS.TXT b/third_party/aom/third_party/libwebm/PATENTS.TXT
new file mode 100644
index 0000000000..caedf607e9
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/third_party/aom/third_party/libwebm/README.libaom b/third_party/aom/third_party/libwebm/README.libaom
new file mode 100644
index 0000000000..ee350a523a
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/README.libaom
@@ -0,0 +1,20 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: 1930e3ca23b007f3ff11d98a570077be6201957e
+License: BSD
+License File: LICENSE.TXT
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+ file_util.cc/h
+ hdr_util.cc/h
+ webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/third_party/aom/third_party/libwebm/common/file_util.cc b/third_party/aom/third_party/libwebm/common/file_util.cc
new file mode 100644
index 0000000000..6eb6428b98
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/file_util.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "common/file_util.h"
+
+#include <sys/stat.h>
+#ifndef _MSC_VER
+#include <unistd.h> // close()
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <ios>
+#include <string>
+
+namespace libwebm {
+
+// Returns the name of a temporary file, or an empty string on failure.
+//
+// Non-MSVC, non-MinGW builds: the file is created with mkstemp() inside
+// $TEST_TMPDIR (or "." when that variable is unset) and its descriptor is
+// closed before returning, so the file exists on return.
+// MSVC/MinGW builds: only a name is produced, via tmpnam_s()/tmpnam().
+std::string GetTempFileName() {
+#if !defined _MSC_VER && !defined __MINGW32__
+  // Query the environment once.
+  const char* const tmp_dir = std::getenv("TEST_TMPDIR");
+  std::string temp_file_name(tmp_dir ? tmp_dir : ".");
+  temp_file_name += "/libwebm_temp.XXXXXX";
+
+  // mkstemp() rewrites the trailing XXXXXX in place. The std::string owns the
+  // (NUL-terminated) buffer, so no manual new[]/delete[] is needed and
+  // nothing can leak if an allocation throws.
+  const int fd = mkstemp(&temp_file_name[0]);
+  if (fd == -1) {
+    return std::string();
+  }
+  close(fd);
+  return temp_file_name;
+#else
+  char tmp_file_name[_MAX_PATH];
+#if defined _MSC_VER || defined MINGW_HAS_SECURE_API
+  errno_t err = tmpnam_s(tmp_file_name);
+#else
+  char* fname_pointer = tmpnam(tmp_file_name);
+  int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
+#endif
+  if (err == 0) {
+    return std::string(tmp_file_name);
+  }
+  return std::string();
+#endif
+}
+
+// Returns the size reported by stat() (or _stat() under MSVC) for
+// |file_name|, or 0 when the call fails.
+uint64_t GetFileSize(const std::string& file_name) {
+#ifndef _MSC_VER
+  struct stat info;
+  info.st_size = 0;
+  const int status = stat(file_name.c_str(), &info);
+#else
+  struct _stat info;
+  info.st_size = 0;
+  const int status = _stat(file_name.c_str(), &info);
+#endif
+  if (status != 0) {
+    return 0;
+  }
+  const uint64_t size_in_bytes = info.st_size;
+  return size_in_bytes;
+}
+
+// Reads the whole of |file_name| into |*contents|.
+//
+// |*contents| is first resized to the size reported by GetFileSize() (so it
+// becomes empty when the file is missing or empty) and then filled from the
+// file. Returns false when |contents| is null, when the file cannot be
+// opened, or when the read fails or comes up short.
+bool GetFileContents(const std::string& file_name, std::string* contents) {
+  if (contents == nullptr)
+    return false;
+
+  // Binary mode: the buffer is sized from the on-disk byte count, so the
+  // stream must not perform newline translation (text mode on Windows would
+  // deliver fewer bytes than requested and fail the read).
+  std::ifstream file(file_name.c_str(), std::ios::in | std::ios::binary);
+  contents->assign(static_cast<size_t>(GetFileSize(file_name)), '\0');
+  if (file.good() && !contents->empty()) {
+    file.read(&(*contents)[0],
+              static_cast<std::streamsize>(contents->size()));
+  }
+  return !file.fail();
+}
+
+// Default constructor: manages a file named by GetTempFileName().
+TempFileDeleter::TempFileDeleter() : file_name_(GetTempFileName()) {}
+
+// Removes the managed file, but only when it can currently be opened for
+// reading.
+TempFileDeleter::~TempFileDeleter() {
+  {
+    std::ifstream probe(file_name_.c_str());
+    if (!probe.good()) {
+      return;
+    }
+    probe.close();
+  }
+  std::remove(file_name_.c_str());
+}
+
+} // namespace libwebm
diff --git a/third_party/aom/third_party/libwebm/common/file_util.h b/third_party/aom/third_party/libwebm/common/file_util.h
new file mode 100644
index 0000000000..a873734641
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/file_util.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_FILE_UTIL_H_
+#define LIBWEBM_COMMON_FILE_UTIL_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "mkvmuxer/mkvmuxertypes.h" // LIBWEBM_DISALLOW_COPY_AND_ASSIGN()
+
+namespace libwebm {
+
+// Returns a temporary file name.
+std::string GetTempFileName();
+
+// Returns size of file specified by |file_name|, or 0 upon failure.
+uint64_t GetFileSize(const std::string& file_name);
+
+// Gets the contents of |file_name| as a string. Returns false on error.
+bool GetFileContents(const std::string& file_name, std::string* contents);
+
+// Manages life of temporary file specified at time of construction. Deletes
+// file upon destruction.
+class TempFileDeleter {
+ public:
+ // Manages a file whose name is obtained from GetTempFileName().
+ TempFileDeleter();
+ // Manages the caller-supplied |file_name|.
+ explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
+ // Removes the managed file if it can be opened for reading.
+ ~TempFileDeleter();
+ // Name of the managed file.
+ const std::string& name() const { return file_name_; }
+
+ private:
+ std::string file_name_;
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
+};
+
+} // namespace libwebm
+
+#endif // LIBWEBM_COMMON_FILE_UTIL_H_
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.cc b/third_party/aom/third_party/libwebm/common/hdr_util.cc
new file mode 100644
index 0000000000..916f7170b6
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "hdr_util.h"
+
+#include <climits>
+#include <cstddef>
+#include <new>
+
+#include "mkvparser/mkvparser.h"
+
+namespace libwebm {
+const int Vp9CodecFeatures::kValueNotPresent = INT_MAX;
+
+// Allocates a mkvmuxer::PrimaryChromaticity from the x/y members of
+// |parser_pc| and hands ownership to |*muxer_pc|. Returns false when the
+// allocation fails, in which case |*muxer_pc| is left holding null.
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc) {
+  mkvmuxer::PrimaryChromaticity* const copy = new (std::nothrow)
+      mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y);
+  muxer_pc->reset(copy);
+  return copy != nullptr;
+}
+
+// Returns true when |value| differs from the
+// mkvparser::MasteringMetadata::kValueNotPresent sentinel.
+bool MasteringMetadataValuePresent(double value) {
+ return value != mkvparser::MasteringMetadata::kValueNotPresent;
+}
+
+// Copies the mastering metadata held by |parser_mm| into |muxer_mm|.
+//
+// Luminance values are copied only when present. Each chromaticity pointer
+// that is set on |parser_mm| (r, g, b, white_point) is duplicated and passed
+// to muxer_mm->SetChromaticity(); unset ones are passed as null.
+// Returns false when |muxer_mm| is null, when a chromaticity copy cannot be
+// allocated, or when SetChromaticity() fails.
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm) {
+  // Same output-pointer guard as CopyColour().
+  if (!muxer_mm)
+    return false;
+
+  if (MasteringMetadataValuePresent(parser_mm.luminance_max))
+    muxer_mm->set_luminance_max(parser_mm.luminance_max);
+  if (MasteringMetadataValuePresent(parser_mm.luminance_min))
+    muxer_mm->set_luminance_min(parser_mm.luminance_min);
+
+  // Temporary owners for the copies; they are released when this function
+  // returns, so SetChromaticity() only sees them for the duration of the call.
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
+
+  if (parser_mm.r) {
+    if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
+      return false;
+  }
+  if (parser_mm.g) {
+    if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
+      return false;
+  }
+  if (parser_mm.b) {
+    if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
+      return false;
+  }
+  if (parser_mm.white_point) {
+    if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
+      return false;
+  }
+
+  if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
+                                 wp_ptr.get())) {
+    return false;
+  }
+
+  return true;
+}
+
+// Returns true when |value| differs from the
+// mkvparser::Colour::kValueNotPresent sentinel.
+bool ColourValuePresent(long long value) {
+ return value != mkvparser::Colour::kValueNotPresent;
+}
+
+// Copies every Colour field that is present in |parser_colour| into
+// |muxer_colour|, including the nested mastering metadata when there is one.
+// Fields equal to the "not present" sentinel are skipped, leaving the
+// corresponding |muxer_colour| value untouched.
+// Returns false when |muxer_colour| is null, or when copying or attaching the
+// mastering metadata fails.
+bool CopyColour(const mkvparser::Colour& parser_colour,
+ mkvmuxer::Colour* muxer_colour) {
+ if (!muxer_colour)
+ return false;
+
+ // Scalar fields: forward each one only when the parser saw it.
+ if (ColourValuePresent(parser_colour.matrix_coefficients))
+ muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients);
+ if (ColourValuePresent(parser_colour.bits_per_channel))
+ muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel);
+ if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) {
+ muxer_colour->set_chroma_subsampling_horz(
+ parser_colour.chroma_subsampling_horz);
+ }
+ if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) {
+ muxer_colour->set_chroma_subsampling_vert(
+ parser_colour.chroma_subsampling_vert);
+ }
+ if (ColourValuePresent(parser_colour.cb_subsampling_horz))
+ muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz);
+ if (ColourValuePresent(parser_colour.cb_subsampling_vert))
+ muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert);
+ if (ColourValuePresent(parser_colour.chroma_siting_horz))
+ muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz);
+ if (ColourValuePresent(parser_colour.chroma_siting_vert))
+ muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert);
+ if (ColourValuePresent(parser_colour.range))
+ muxer_colour->set_range(parser_colour.range);
+ if (ColourValuePresent(parser_colour.transfer_characteristics)) {
+ muxer_colour->set_transfer_characteristics(
+ parser_colour.transfer_characteristics);
+ }
+ if (ColourValuePresent(parser_colour.primaries))
+ muxer_colour->set_primaries(parser_colour.primaries);
+ if (ColourValuePresent(parser_colour.max_cll))
+ muxer_colour->set_max_cll(parser_colour.max_cll);
+ if (ColourValuePresent(parser_colour.max_fall))
+ muxer_colour->set_max_fall(parser_colour.max_fall);
+
+ // Nested element: build a muxer-side copy on the stack, then hand it to
+ // SetMasteringMetadata().
+ if (parser_colour.mastering_metadata) {
+ mkvmuxer::MasteringMetadata muxer_mm;
+ if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
+ return false;
+ if (!muxer_colour->SetMasteringMetadata(muxer_mm))
+ return false;
+ }
+ return true;
+}
+
+// Format of VPx private data:
+//
+// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+// | ID Byte | Length | |
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+// | |
+// : Bytes 1..Length of Codec Feature :
+// | |
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
+// ID Byte Format
+// ID byte is an unsigned byte.
+// 0 1 2 3 4 5 6 7
+// +-+-+-+-+-+-+-+-+
+// |X| ID |
+// +-+-+-+-+-+-+-+-+
+//
+// The X bit is reserved.
+//
+// See the following link for more information:
+// http://www.webmproject.org/vp9/profiles/
+// Parses |length| bytes of VPx CodecPrivate data from |private_data| into
+// |features|.
+//
+// The data is read as consecutive 3-byte records: ID byte, length byte (which
+// must be 1), value byte. All four fields of |features| are first reset to
+// Vp9CodecFeatures::kValueNotPresent. Returns false when an argument is null,
+// when |length| is shorter than one record, or when any record has an unknown
+// ID, a length byte other than 1, a value outside the accepted set, or a
+// value that conflicts with an earlier record of the same ID. One or two
+// trailing bytes that do not form a complete record are ignored.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+                          Vp9CodecFeatures* features) {
+  const int kVpxCodecPrivateMinLength = 3;
+  if (!private_data || !features || length < kVpxCodecPrivateMinLength)
+    return false;
+
+  const uint8_t kVp9ProfileId = 1;
+  const uint8_t kVp9LevelId = 2;
+  const uint8_t kVp9BitDepthId = 3;
+  const uint8_t kVp9ChromaSubsamplingId = 4;
+  const int kVpxFeatureLength = 1;
+  int offset = 0;
+
+  // Start with every feature marked as not present.
+  features->profile = Vp9CodecFeatures::kValueNotPresent;
+  features->level = Vp9CodecFeatures::kValueNotPresent;
+  features->bit_depth = Vp9CodecFeatures::kValueNotPresent;
+  features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent;
+  do {
+    // The entry check and the loop condition guarantee that a full 3-byte
+    // record is available at |offset|.
+    const uint8_t id_byte = private_data[offset++];
+    const uint8_t length_byte = private_data[offset++];
+    if (length_byte != kVpxFeatureLength)
+      return false;
+    if (id_byte == kVp9ProfileId) {
+      // The value comes from an unsigned byte, so only the upper bound needs
+      // checking.
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile > 3)
+        return false;
+      if (features->profile != Vp9CodecFeatures::kValueNotPresent &&
+          features->profile != priv_profile) {
+        return false;
+      }
+      features->profile = priv_profile;
+    } else if (id_byte == kVp9LevelId) {
+      const int priv_level = static_cast<int>(private_data[offset++]);
+
+      const int kNumLevels = 14;
+      const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+                                      41, 50, 51, 52, 60, 61, 62};
+
+      // Validate the value itself rather than the stored level: an unknown
+      // value must be rejected even when an earlier record already set
+      // features->level.
+      bool level_is_known = false;
+      for (int i = 0; i < kNumLevels; ++i) {
+        if (priv_level == levels[i]) {
+          level_is_known = true;
+          break;
+        }
+      }
+      if (!level_is_known)
+        return false;
+      if (features->level != Vp9CodecFeatures::kValueNotPresent &&
+          features->level != priv_level) {
+        return false;
+      }
+      features->level = priv_level;
+    } else if (id_byte == kVp9BitDepthId) {
+      const int priv_bit_depth = static_cast<int>(private_data[offset++]);
+      if (priv_bit_depth != 8 && priv_bit_depth != 10 && priv_bit_depth != 12)
+        return false;
+      if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent &&
+          features->bit_depth != priv_bit_depth) {
+        return false;
+      }
+      features->bit_depth = priv_bit_depth;
+    } else if (id_byte == kVp9ChromaSubsamplingId) {
+      const int priv_chroma = static_cast<int>(private_data[offset++]);
+      if (priv_chroma != 0 && priv_chroma != 2 && priv_chroma != 3)
+        return false;
+      if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
+          features->chroma_subsampling != priv_chroma) {
+        return false;
+      }
+      features->chroma_subsampling = priv_chroma;
+    } else {
+      // Invalid ID.
+      return false;
+    }
+  } while (offset + kVpxCodecPrivateMinLength <= length);
+
+  return true;
+}
+} // namespace libwebm
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.h b/third_party/aom/third_party/libwebm/common/hdr_util.h
new file mode 100644
index 0000000000..78e2eeb705
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
+#define LIBWEBM_COMMON_HDR_UTIL_H_
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "mkvmuxer/mkvmuxer.h"
+
+namespace mkvparser {
+struct Colour;
+struct MasteringMetadata;
+struct PrimaryChromaticity;
+} // namespace mkvparser
+
+namespace libwebm {
+// Utility types and functions for working with the Colour element and its
+// children. Copiers return true upon success. Presence functions return true
+// when the specified element is present.
+
+// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
+// required by libwebm.
+
+// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video
+// stream. A value of kValueNotPresent represents that the value was not set in
+// the CodecPrivate.
+struct Vp9CodecFeatures {
+ // Sentinel stored in a field that has not been set.
+ static const int kValueNotPresent;
+
+ // Every field starts out as kValueNotPresent.
+ Vp9CodecFeatures()
+ : profile(kValueNotPresent),
+ level(kValueNotPresent),
+ bit_depth(kValueNotPresent),
+ chroma_subsampling(kValueNotPresent) {}
+ ~Vp9CodecFeatures() {}
+
+ // Filled in by ParseVpxCodecPrivate(), one field per CodecPrivate record ID.
+ int profile;
+ int level;
+ int bit_depth;
+ int chroma_subsampling;
+};
+
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+ PrimaryChromaticityPtr* muxer_pc);
+
+bool MasteringMetadataValuePresent(double value);
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+ mkvmuxer::MasteringMetadata* muxer_mm);
+
+bool ColourValuePresent(long long value);
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+ mkvmuxer::Colour* muxer_colour);
+
+// Returns true if |features| is set to one or more valid values.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+ Vp9CodecFeatures* features);
+
+} // namespace libwebm
+
+#endif // LIBWEBM_COMMON_HDR_UTIL_H_
diff --git a/third_party/aom/third_party/libwebm/common/webmids.h b/third_party/aom/third_party/libwebm/common/webmids.h
new file mode 100644
index 0000000000..fc0c208140
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/webmids.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef COMMON_WEBMIDS_H_
+#define COMMON_WEBMIDS_H_
+
+namespace libwebm {
+
+// Numeric element IDs used by libwebm, grouped by the section comments below
+// (EBML header, segment, seek, info, cluster, tracks, video, colour,
+// projection, audio, content encodings, cues, chapters, tags).
+enum MkvId {
+ kMkvEBML = 0x1A45DFA3,
+ kMkvEBMLVersion = 0x4286,
+ kMkvEBMLReadVersion = 0x42F7,
+ kMkvEBMLMaxIDLength = 0x42F2,
+ kMkvEBMLMaxSizeLength = 0x42F3,
+ kMkvDocType = 0x4282,
+ kMkvDocTypeVersion = 0x4287,
+ kMkvDocTypeReadVersion = 0x4285,
+ kMkvVoid = 0xEC,
+ kMkvSignatureSlot = 0x1B538667,
+ kMkvSignatureAlgo = 0x7E8A,
+ kMkvSignatureHash = 0x7E9A,
+ kMkvSignaturePublicKey = 0x7EA5,
+ kMkvSignature = 0x7EB5,
+ kMkvSignatureElements = 0x7E5B,
+ kMkvSignatureElementList = 0x7E7B,
+ kMkvSignedElement = 0x6532,
+ // segment
+ kMkvSegment = 0x18538067,
+ // Meta Seek Information
+ kMkvSeekHead = 0x114D9B74,
+ kMkvSeek = 0x4DBB,
+ kMkvSeekID = 0x53AB,
+ kMkvSeekPosition = 0x53AC,
+ // Segment Information
+ kMkvInfo = 0x1549A966,
+ kMkvTimecodeScale = 0x2AD7B1,
+ kMkvDuration = 0x4489,
+ kMkvDateUTC = 0x4461,
+ kMkvTitle = 0x7BA9,
+ kMkvMuxingApp = 0x4D80,
+ kMkvWritingApp = 0x5741,
+ // Cluster
+ kMkvCluster = 0x1F43B675,
+ kMkvTimecode = 0xE7,
+ kMkvPrevSize = 0xAB,
+ kMkvBlockGroup = 0xA0,
+ kMkvBlock = 0xA1,
+ kMkvBlockDuration = 0x9B,
+ kMkvReferenceBlock = 0xFB,
+ kMkvLaceNumber = 0xCC,
+ kMkvSimpleBlock = 0xA3,
+ kMkvBlockAdditions = 0x75A1,
+ kMkvBlockMore = 0xA6,
+ kMkvBlockAddID = 0xEE,
+ kMkvBlockAdditional = 0xA5,
+ kMkvDiscardPadding = 0x75A2,
+ // Track
+ kMkvTracks = 0x1654AE6B,
+ kMkvTrackEntry = 0xAE,
+ kMkvTrackNumber = 0xD7,
+ kMkvTrackUID = 0x73C5,
+ kMkvTrackType = 0x83,
+ kMkvFlagEnabled = 0xB9,
+ kMkvFlagDefault = 0x88,
+ kMkvFlagForced = 0x55AA,
+ kMkvFlagLacing = 0x9C,
+ kMkvDefaultDuration = 0x23E383,
+ kMkvMaxBlockAdditionID = 0x55EE,
+ kMkvName = 0x536E,
+ kMkvLanguage = 0x22B59C,
+ kMkvCodecID = 0x86,
+ kMkvCodecPrivate = 0x63A2,
+ kMkvCodecName = 0x258688,
+ kMkvCodecDelay = 0x56AA,
+ kMkvSeekPreRoll = 0x56BB,
+ // video
+ kMkvVideo = 0xE0,
+ kMkvFlagInterlaced = 0x9A,
+ kMkvStereoMode = 0x53B8,
+ kMkvAlphaMode = 0x53C0,
+ kMkvPixelWidth = 0xB0,
+ kMkvPixelHeight = 0xBA,
+ kMkvPixelCropBottom = 0x54AA,
+ kMkvPixelCropTop = 0x54BB,
+ kMkvPixelCropLeft = 0x54CC,
+ kMkvPixelCropRight = 0x54DD,
+ kMkvDisplayWidth = 0x54B0,
+ kMkvDisplayHeight = 0x54BA,
+ kMkvDisplayUnit = 0x54B2,
+ kMkvAspectRatioType = 0x54B3,
+ kMkvColourSpace = 0x2EB524,
+ kMkvFrameRate = 0x2383E3,
+ // end video
+ // colour
+ kMkvColour = 0x55B0,
+ kMkvMatrixCoefficients = 0x55B1,
+ kMkvBitsPerChannel = 0x55B2,
+ kMkvChromaSubsamplingHorz = 0x55B3,
+ kMkvChromaSubsamplingVert = 0x55B4,
+ kMkvCbSubsamplingHorz = 0x55B5,
+ kMkvCbSubsamplingVert = 0x55B6,
+ kMkvChromaSitingHorz = 0x55B7,
+ kMkvChromaSitingVert = 0x55B8,
+ kMkvRange = 0x55B9,
+ kMkvTransferCharacteristics = 0x55BA,
+ kMkvPrimaries = 0x55BB,
+ kMkvMaxCLL = 0x55BC,
+ kMkvMaxFALL = 0x55BD,
+ // mastering metadata
+ kMkvMasteringMetadata = 0x55D0,
+ kMkvPrimaryRChromaticityX = 0x55D1,
+ kMkvPrimaryRChromaticityY = 0x55D2,
+ kMkvPrimaryGChromaticityX = 0x55D3,
+ kMkvPrimaryGChromaticityY = 0x55D4,
+ kMkvPrimaryBChromaticityX = 0x55D5,
+ kMkvPrimaryBChromaticityY = 0x55D6,
+ kMkvWhitePointChromaticityX = 0x55D7,
+ kMkvWhitePointChromaticityY = 0x55D8,
+ kMkvLuminanceMax = 0x55D9,
+ kMkvLuminanceMin = 0x55DA,
+ // end mastering metadata
+ // end colour
+ // projection
+ kMkvProjection = 0x7670,
+ kMkvProjectionType = 0x7671,
+ kMkvProjectionPrivate = 0x7672,
+ kMkvProjectionPoseYaw = 0x7673,
+ kMkvProjectionPosePitch = 0x7674,
+ kMkvProjectionPoseRoll = 0x7675,
+ // end projection
+ // audio
+ kMkvAudio = 0xE1,
+ kMkvSamplingFrequency = 0xB5,
+ kMkvOutputSamplingFrequency = 0x78B5,
+ kMkvChannels = 0x9F,
+ kMkvBitDepth = 0x6264,
+ // end audio
+ // ContentEncodings
+ kMkvContentEncodings = 0x6D80,
+ kMkvContentEncoding = 0x6240,
+ kMkvContentEncodingOrder = 0x5031,
+ kMkvContentEncodingScope = 0x5032,
+ kMkvContentEncodingType = 0x5033,
+ kMkvContentCompression = 0x5034,
+ kMkvContentCompAlgo = 0x4254,
+ kMkvContentCompSettings = 0x4255,
+ kMkvContentEncryption = 0x5035,
+ kMkvContentEncAlgo = 0x47E1,
+ kMkvContentEncKeyID = 0x47E2,
+ kMkvContentSignature = 0x47E3,
+ kMkvContentSigKeyID = 0x47E4,
+ kMkvContentSigAlgo = 0x47E5,
+ kMkvContentSigHashAlgo = 0x47E6,
+ kMkvContentEncAESSettings = 0x47E7,
+ kMkvAESSettingsCipherMode = 0x47E8,
+ kMkvAESSettingsCipherInitData = 0x47E9,
+ // end ContentEncodings
+ // Cueing Data
+ kMkvCues = 0x1C53BB6B,
+ kMkvCuePoint = 0xBB,
+ kMkvCueTime = 0xB3,
+ kMkvCueTrackPositions = 0xB7,
+ kMkvCueTrack = 0xF7,
+ kMkvCueClusterPosition = 0xF1,
+ kMkvCueBlockNumber = 0x5378,
+ // Chapters
+ kMkvChapters = 0x1043A770,
+ kMkvEditionEntry = 0x45B9,
+ kMkvChapterAtom = 0xB6,
+ kMkvChapterUID = 0x73C4,
+ kMkvChapterStringUID = 0x5654,
+ kMkvChapterTimeStart = 0x91,
+ kMkvChapterTimeEnd = 0x92,
+ kMkvChapterDisplay = 0x80,
+ kMkvChapString = 0x85,
+ kMkvChapLanguage = 0x437C,
+ kMkvChapCountry = 0x437E,
+ // Tags
+ kMkvTags = 0x1254C367,
+ kMkvTag = 0x7373,
+ kMkvSimpleTag = 0x67C8,
+ kMkvTagName = 0x45A3,
+ kMkvTagString = 0x4487
+};
+
+} // namespace libwebm
+
+#endif // COMMON_WEBMIDS_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
new file mode 100644
index 0000000000..faaf0165f4
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -0,0 +1,4230 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxer.h"
+
+#include <stdint.h>
+
+#include <cfloat>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxerutil.h"
+#include "mkvmuxer/mkvwriter.h"
+#include "mkvparser/mkvparser.h"
+
+namespace mkvmuxer {
+
+// Out-of-class definitions of the static class constants.
+// Inclusive bounds checked by PrimaryChromaticity::Valid().
+const float PrimaryChromaticity::kChromaticityMin = 0.0f;
+const float PrimaryChromaticity::kChromaticityMax = 1.0f;
+// Bounds checked by MasteringMetadata::Valid(): kMinLuminance is the lower
+// bound for both luminance values; the *Max constants are the upper bounds
+// for the minimum and maximum luminance respectively.
+const float MasteringMetadata::kMinLuminance = 0.0f;
+const float MasteringMetadata::kMinLuminanceMax = 999.99f;
+const float MasteringMetadata::kMaxLuminanceMax = 9999.99f;
+// Sentinels marking a field as unset; such fields are skipped when sizing and
+// writing the element.
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const uint64_t Colour::kValueNotPresent = UINT64_MAX;
+
+namespace {
+
+// DocType strings for the EBML header. kDocTypeWebm is the default used by
+// the two-argument WriteEbmlHeader() overload.
+const char kDocTypeWebm[] = "webm";
+// NOTE(review): presumably selected by callers that mux plain Matroska;
+// its use is outside this part of the file.
+const char kDocTypeMatroska[] = "matroska";
+
+// Replaces the string held in |*dst_ptr| with a freshly allocated copy of
+// |src|. Whatever |*dst_ptr| held before is released with delete[] first. A
+// NULL |src| leaves |*dst_ptr| set to NULL and counts as success. The caller
+// keeps ownership of |src| and owns the new copy (to be released with
+// delete[], or indirectly by another StrCpy call). Returns false if |dst_ptr|
+// is NULL or the allocation fails.
+bool StrCpy(const char* src, char** dst_ptr) {
+  if (!dst_ptr)
+    return false;
+
+  // Drop the destination's current contents before anything else.
+  delete[] *dst_ptr;
+  *dst_ptr = NULL;
+
+  if (!src)
+    return true;
+
+  const size_t bytes = strlen(src) + 1;  // Includes the terminating NUL.
+  char* const copy = new (std::nothrow) char[bytes];  // NOLINT
+  if (!copy)
+    return false;
+
+  memcpy(copy, src, bytes);
+  *dst_ptr = copy;
+  return true;
+}
+
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+
+// Stores a newly allocated copy of |src| in |*dst|, replacing whatever |*dst|
+// held before. Returns false with |*dst| untouched when either pointer is
+// NULL, and false with |*dst| emptied when the allocation fails.
+bool CopyChromaticity(const PrimaryChromaticity* src,
+                      PrimaryChromaticityPtr* dst) {
+  // |src| is dereferenced below, so it has to be rejected here together with
+  // |dst|.
+  if (!src || !dst)
+    return false;
+
+  dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y()));
+  return dst->get() != NULL;
+}
+
+} // namespace
+
+///////////////////////////////////////////////////////////////
+//
+// IMkvWriter Class
+
+// The writer interface keeps no state here: both special members are empty.
+IMkvWriter::IMkvWriter() {}
+
+IMkvWriter::~IMkvWriter() {}
+
+// Emits the EBML header to |writer|: an EBML master element carrying the EBML
+// version fields, the maximum ID and size lengths, |doc_type| and
+// |doc_type_version|. Returns false as soon as one of the writes fails.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+                     const char* const doc_type) {
+  // Values stored in the header's integer fields.
+  const uint64 ebml_version = 1;
+  const uint64 ebml_read_version = 1;
+  const uint64 max_id_length = 4;
+  const uint64 max_size_length = 8;
+  const uint64 type_version = static_cast<uint64>(doc_type_version);
+  const uint64 type_read_version = 2;
+
+  // Level 0: the master element's payload is the sum of its children.
+  uint64_t payload = EbmlElementSize(libwebm::kMkvEBMLVersion, ebml_version);
+  payload += EbmlElementSize(libwebm::kMkvEBMLReadVersion, ebml_read_version);
+  payload += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, max_id_length);
+  payload += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, max_size_length);
+  payload += EbmlElementSize(libwebm::kMkvDocType, doc_type);
+  payload += EbmlElementSize(libwebm::kMkvDocTypeVersion, type_version);
+  payload +=
+      EbmlElementSize(libwebm::kMkvDocTypeReadVersion, type_read_version);
+
+  // Short-circuit evaluation stops at the first failing write; the children
+  // are written in the same order as they were sized above.
+  return WriteEbmlMasterElement(writer, libwebm::kMkvEBML, payload) &&
+         WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, ebml_version) &&
+         WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion,
+                          ebml_read_version) &&
+         WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength,
+                          max_id_length) &&
+         WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength,
+                          max_size_length) &&
+         WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type) &&
+         WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, type_version) &&
+         WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion,
+                          type_read_version);
+}
+
+// Convenience overload: writes the EBML header with the "webm" DocType.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+  const char* const doc_type = kDocTypeWebm;
+  return WriteEbmlHeader(writer, doc_type_version, doc_type);
+}
+
+// Convenience overload: writes the EBML header using the segment's default
+// DocType version.
+bool WriteEbmlHeader(IMkvWriter* writer) {
+  const uint64_t version = mkvmuxer::Segment::kDefaultDocTypeVersion;
+  return WriteEbmlHeader(writer, version);
+}
+
+// Copies |size| bytes, starting at offset |start| of |source|, into |dst|,
+// moving at most kBufSize bytes per read/write round trip. Returns false if
+// the scratch buffer cannot be allocated or if a read or a write fails.
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
+                 int64_t start, int64_t size) {
+  // TODO(vigneshv): Check if this is a reasonable value.
+  const uint32_t kBufSize = 2048;
+
+  // The buffer is owned by a unique_ptr so every return path releases it; a
+  // raw pointer here leaked the buffer whenever a read failed.
+  std::unique_ptr<uint8_t[]> buf(new (std::nothrow) uint8_t[kBufSize]);
+  if (!buf)
+    return false;
+
+  int64_t offset = start;
+  while (size > 0) {
+    const int64_t read_len = (size > kBufSize) ? kBufSize : size;
+    // A non-zero status from Read() is treated as failure.
+    if (source->Read(offset, static_cast<long>(read_len), buf.get()))
+      return false;
+    // NOTE(review): IMkvWriter::Write() is documented to return 0 on success;
+    // a failed write must not be reported as a successful copy.
+    if (dst->Write(buf.get(), static_cast<uint32_t>(read_len)) != 0)
+      return false;
+    offset += read_len;
+    size -= read_len;
+  }
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Frame Class
+
+// Constructs an empty frame: no payload or additional data, every numeric
+// field zero and every flag false.
+Frame::Frame()
+ : add_id_(0),
+ additional_(NULL),
+ additional_length_(0),
+ duration_(0),
+ duration_set_(false),
+ frame_(NULL),
+ is_key_(false),
+ length_(0),
+ track_number_(0),
+ timestamp_(0),
+ discard_padding_(0),
+ reference_block_timestamp_(0),
+ reference_block_timestamp_set_(false) {}
+
+// Releases the payload and additional-data buffers owned by the frame.
+Frame::~Frame() {
+ delete[] frame_;
+ delete[] additional_;
+}
+
+// Makes this frame a deep copy of |frame|: the payload and additional data
+// are duplicated and all scalar fields are copied. Returns false if one of
+// the buffer copies fails, in which case this frame is left partially
+// updated.
+bool Frame::CopyFrom(const Frame& frame) {
+  // Copying a frame onto itself must be a no-op. The code below releases this
+  // object's buffers before reading the source's, which would otherwise
+  // discard the frame's own data.
+  if (this == &frame)
+    return true;
+
+  delete[] frame_;
+  frame_ = NULL;
+  length_ = 0;
+  if (frame.length() > 0 && frame.frame() != NULL &&
+      !Init(frame.frame(), frame.length())) {
+    return false;
+  }
+
+  add_id_ = 0;
+  delete[] additional_;
+  additional_ = NULL;
+  additional_length_ = 0;
+  if (frame.additional_length() > 0 && frame.additional() != NULL &&
+      !AddAdditionalData(frame.additional(), frame.additional_length(),
+                         frame.add_id())) {
+    return false;
+  }
+
+  duration_ = frame.duration();
+  duration_set_ = frame.duration_set();
+  is_key_ = frame.is_key();
+  track_number_ = frame.track_number();
+  timestamp_ = frame.timestamp();
+  discard_padding_ = frame.discard_padding();
+  reference_block_timestamp_ = frame.reference_block_timestamp();
+  reference_block_timestamp_set_ = frame.reference_block_timestamp_set();
+  return true;
+}
+
+// Replaces the frame payload with a copy of the |length| bytes at |frame|.
+// Returns false, leaving the current payload untouched, if |frame| is NULL
+// while |length| is non-zero or if the allocation fails.
+bool Frame::Init(const uint8_t* frame, uint64_t length) {
+  // memcpy() from a NULL source is undefined, so refuse it up front.
+  if (!frame && length > 0)
+    return false;
+
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  // Copy before releasing the old buffer so that a |frame| pointing into the
+  // current payload is still readable.
+  if (length > 0)
+    memcpy(data, frame, static_cast<size_t>(length));
+
+  delete[] frame_;
+  frame_ = data;
+  length_ = length;
+  return true;
+}
+
+// Replaces the frame's additional data with a copy of the |length| bytes at
+// |additional| and records |add_id| alongside it. Returns false, leaving the
+// current additional data untouched, if |additional| is NULL while |length|
+// is non-zero or if the allocation fails.
+bool Frame::AddAdditionalData(const uint8_t* additional, uint64_t length,
+                              uint64_t add_id) {
+  // memcpy() from a NULL source is undefined, so refuse it up front.
+  if (!additional && length > 0)
+    return false;
+
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  // Copy before releasing the old buffer so that an |additional| pointing
+  // into the current buffer is still readable.
+  if (length > 0)
+    memcpy(data, additional, static_cast<size_t>(length));
+
+  delete[] additional_;
+  additional_ = data;
+  additional_length_ = length;
+  add_id_ = add_id;
+  return true;
+}
+
+// Reports whether the frame is complete enough to be written: it has a
+// payload, its additional-data pointer and length agree, its track number is
+// within [1, kMaxTrackNumber], and a frame that cannot be a SimpleBlock is
+// either a key frame or has a reference block timestamp.
+bool Frame::IsValid() const {
+  const bool has_payload = (length_ != 0) && (frame_ != NULL);
+  if (!has_payload)
+    return false;
+
+  // The additional-data pointer and its length must be set together.
+  const bool has_additional = (additional_ != NULL);
+  const bool has_additional_length = (additional_length_ != 0);
+  if (has_additional != has_additional_length)
+    return false;
+
+  const bool track_in_range =
+      (track_number_ != 0) && !(track_number_ > kMaxTrackNumber);
+  if (!track_in_range)
+    return false;
+
+  return CanBeSimpleBlock() || is_key_ || reference_block_timestamp_set_;
+}
+
+// A frame can be a SimpleBlock only when it carries no additional data, no
+// discard padding and no duration.
+bool Frame::CanBeSimpleBlock() const {
+  const bool has_extras =
+      additional_ != NULL || discard_padding_ != 0 || duration_ != 0;
+  return !has_extras;
+}
+
+// Stores |duration| and marks the duration as explicitly set.
+void Frame::set_duration(uint64_t duration) {
+  duration_set_ = true;
+  duration_ = duration;
+}
+
+// Stores |reference_block_timestamp| and marks it as explicitly set.
+void Frame::set_reference_block_timestamp(int64_t reference_block_timestamp) {
+  reference_block_timestamp_set_ = true;
+  reference_block_timestamp_ = reference_block_timestamp;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// CuePoint Class
+
+// Constructs a cue point with zeroed time, track and cluster position, block
+// number 1, and block-number output enabled. Write() rejects the cue until
+// track and cluster position are set to values >= 1.
+CuePoint::CuePoint()
+ : time_(0),
+ track_(0),
+ cluster_pos_(0),
+ block_number_(1),
+ output_block_number_(true) {}
+
+// Nothing to release: the destructor body is empty.
+CuePoint::~CuePoint() {}
+
+// Writes this cue point to |writer| as a CuePoint master element holding the
+// CueTime and a CueTrackPositions element. Returns false if |writer| is NULL,
+// the track or cluster position is below 1, a write fails, or the number of
+// bytes written differs from the computed payload size.
+bool CuePoint::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+  if (track_ < 1 || cluster_pos_ < 1)
+    return false;
+
+  // CueBlockNumber is emitted only when enabled and greater than 1.
+  const bool emit_block_number = output_block_number_ && block_number_ > 1;
+
+  uint64_t positions_payload = EbmlElementSize(
+      libwebm::kMkvCueClusterPosition, static_cast<uint64>(cluster_pos_));
+  positions_payload +=
+      EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (emit_block_number) {
+    positions_payload += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                                         static_cast<uint64>(block_number_));
+  }
+
+  const uint64_t positions_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions,
+                            positions_payload) +
+      positions_payload;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+      positions_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
+    return false;
+
+  const int64_t payload_start = writer->Position();
+  if (payload_start < 0)
+    return false;
+
+  // Children are written in order; evaluation stops at the first failure.
+  const bool children_written =
+      WriteEbmlElement(writer, libwebm::kMkvCueTime,
+                       static_cast<uint64>(time_)) &&
+      WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions,
+                             positions_payload) &&
+      WriteEbmlElement(writer, libwebm::kMkvCueTrack,
+                       static_cast<uint64>(track_)) &&
+      WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition,
+                       static_cast<uint64>(cluster_pos_)) &&
+      (!emit_block_number ||
+       WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber,
+                        static_cast<uint64>(block_number_)));
+  if (!children_written)
+    return false;
+
+  const int64_t payload_end = writer->Position();
+  if (payload_end < 0)
+    return false;
+
+  // The bytes actually written must match the size announced in the header.
+  return payload_end - payload_start == static_cast<int64_t>(payload_size);
+}
+
+// Returns the size in bytes of the CuePoint payload: the CueTime element plus
+// the complete CueTrackPositions element (header and children).
+uint64_t CuePoint::PayloadSize() const {
+  // CueBlockNumber is counted only when enabled and greater than 1.
+  const bool emit_block_number = output_block_number_ && block_number_ > 1;
+
+  uint64_t positions_payload = EbmlElementSize(
+      libwebm::kMkvCueClusterPosition, static_cast<uint64>(cluster_pos_));
+  positions_payload +=
+      EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (emit_block_number) {
+    positions_payload += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                                         static_cast<uint64>(block_number_));
+  }
+
+  const uint64_t positions_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions,
+                            positions_payload) +
+      positions_payload;
+  const uint64_t time_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_));
+
+  return time_size + positions_size;
+}
+
+// Returns the total size of the CuePoint element: master header plus payload.
+uint64_t CuePoint::Size() const {
+  const uint64_t payload = PayloadSize();
+  const uint64_t header =
+      EbmlMasterElementSize(libwebm::kMkvCuePoint, payload);
+  return header + payload;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cues Class
+
+// Constructs an empty cue list with no storage allocated; cues added later
+// have block-number output enabled unless output_block_number_ is changed.
+Cues::Cues()
+ : cue_entries_capacity_(0),
+ cue_entries_size_(0),
+ cue_entries_(NULL),
+ output_block_number_(true) {}
+
+// Destroys every stored CuePoint and then the pointer array itself.
+Cues::~Cues() {
+  if (!cue_entries_)
+    return;
+
+  for (int32_t index = 0; index < cue_entries_size_; ++index)
+    delete cue_entries_[index];
+
+  delete[] cue_entries_;
+}
+
+// Appends |cue| to the list, growing the backing array when it is full. The
+// list's output_block_number_ setting is applied to |cue| before it is
+// stored, and the destructor deletes it afterwards. Returns false, without
+// storing |cue|, if |cue| is NULL, the capacity cannot grow any further, or
+// the allocation fails.
+bool Cues::AddCue(CuePoint* cue) {
+  if (!cue)
+    return false;
+
+  if ((cue_entries_size_ + 1) > cue_entries_capacity_) {
+    // Doubling a capacity above INT32_MAX / 2 overflows, and signed overflow
+    // is undefined, so the limit has to be checked before multiplying rather
+    // than by inspecting the product.
+    if (cue_entries_capacity_ > INT32_MAX / 2)
+      return false;
+
+    // Add more CuePoints.
+    const int32_t new_capacity =
+        (!cue_entries_capacity_) ? 2 : cue_entries_capacity_ * 2;
+
+    if (new_capacity < 1)
+      return false;
+
+    CuePoint** const cues =
+        new (std::nothrow) CuePoint*[new_capacity];  // NOLINT
+    if (!cues)
+      return false;
+
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
+      cues[i] = cue_entries_[i];
+    }
+
+    delete[] cue_entries_;
+
+    cue_entries_ = cues;
+    cue_entries_capacity_ = new_capacity;
+  }
+
+  cue->set_output_block_number(output_block_number_);
+  cue_entries_[cue_entries_size_++] = cue;
+  return true;
+}
+
+// Returns the cue stored at |index|, or NULL when the list is empty or
+// |index| is outside [0, cue_entries_size_).
+CuePoint* Cues::GetCueByIndex(int32_t index) const {
+  if (cue_entries_ == NULL)
+    return NULL;
+
+  // |index| is signed, so the lower bound must be checked as well; a negative
+  // value would otherwise read before the start of the array.
+  if (index < 0 || index >= cue_entries_size_)
+    return NULL;
+
+  return cue_entries_[index];
+}
+
+// Returns the total size of the Cues element: the sum of all cue point sizes
+// plus the master element header for that payload.
+uint64_t Cues::Size() {
+  uint64_t payload = 0;
+  for (int32_t index = 0; index < cue_entries_size_; ++index) {
+    const CuePoint* const cue = GetCueByIndex(index);
+    payload += cue->Size();
+  }
+  return payload + EbmlMasterElementSize(libwebm::kMkvCues, payload);
+}
+
+// Writes the Cues master element followed by every cue point to |writer|.
+// Returns false if |writer| is NULL, a cue entry is missing, a write fails,
+// or the number of bytes written differs from the computed payload size.
+bool Cues::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // First pass: total up the payload and make sure every entry exists.
+  uint64_t payload_size = 0;
+  for (int32_t index = 0; index < cue_entries_size_; ++index) {
+    const CuePoint* const entry = GetCueByIndex(index);
+    if (!entry)
+      return false;
+    payload_size += entry->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCues, payload_size))
+    return false;
+
+  const int64_t payload_start = writer->Position();
+  if (payload_start < 0)
+    return false;
+
+  // Second pass: emit the cue points themselves.
+  for (int32_t index = 0; index < cue_entries_size_; ++index) {
+    if (!GetCueByIndex(index)->Write(writer))
+      return false;
+  }
+
+  const int64_t payload_end = writer->Position();
+  if (payload_end < 0)
+    return false;
+
+  return payload_end - payload_start == static_cast<int64_t>(payload_size);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncAESSettings Class
+
+// The cipher mode starts out as kCTR.
+ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
+
+// Returns the total size of the ContentEncAESSettings element: master header
+// plus payload.
+uint64_t ContentEncAESSettings::Size() const {
+  const uint64_t payload_size = PayloadSize();
+  const uint64_t header_size = EbmlMasterElementSize(
+      libwebm::kMkvContentEncAESSettings, payload_size);
+  return header_size + payload_size;
+}
+
+// Writes the ContentEncAESSettings master element and its cipher mode to
+// |writer|. Returns false if a write fails or the number of bytes written
+// differs from the computed payload size.
+bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
+  const uint64_t payload_size = PayloadSize();
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncAESSettings,
+                              payload_size)) {
+    return false;
+  }
+
+  const int64_t payload_start = writer->Position();
+  if (payload_start < 0)
+    return false;
+
+  const uint64 mode = static_cast<uint64>(cipher_mode_);
+  if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode, mode))
+    return false;
+
+  const int64_t payload_end = writer->Position();
+  if (payload_end < 0)
+    return false;
+
+  return payload_end - payload_start == static_cast<int64_t>(payload_size);
+}
+
+// Returns the payload size, which consists of the cipher mode element only.
+uint64_t ContentEncAESSettings::PayloadSize() const {
+  const uint64 mode = static_cast<uint64>(cipher_mode_);
+  return EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, mode);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncoding Class
+
+// Constructs an encoding entry with no key ID, encryption algorithm 5,
+// encoding order 0, and encoding scope and type 1.
+// NOTE(review): the meaning of these numeric codes comes from the container
+// specification, not from this file -- confirm before changing them.
+ContentEncoding::ContentEncoding()
+ : enc_algo_(5),
+ enc_key_id_(NULL),
+ encoding_order_(0),
+ encoding_scope_(1),
+ encoding_type_(1),
+ enc_key_id_length_(0) {}
+
+// Releases the encryption key ID buffer.
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
+
+// Replaces the encryption key ID with a copy of the |length| bytes at |id|.
+// Returns false, keeping the current key ID, if |id| is NULL, |length| is 0,
+// or the allocation fails.
+bool ContentEncoding::SetEncryptionID(const uint8_t* id, uint64_t length) {
+  if (!id || length < 1)
+    return false;
+
+  // Allocate and fill the new buffer before touching the members. Releasing
+  // the old buffer first left a NULL key paired with its old, non-zero length
+  // whenever the allocation failed.
+  uint8_t* const key_id =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!key_id)
+    return false;
+
+  memcpy(key_id, id, static_cast<size_t>(length));
+
+  delete[] enc_key_id_;
+  enc_key_id_ = key_id;
+  enc_key_id_length_ = length;
+
+  return true;
+}
+
+// Returns the total size of the ContentEncoding element: master header plus
+// the encoding payload (which includes the encryption settings).
+uint64_t ContentEncoding::Size() const {
+  const uint64_t payload = EncodingSize(0, EncryptionSize());
+  const uint64_t header =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, payload);
+  return header + payload;
+}
+
+// Writes the ContentEncoding element to |writer|: encoding order, scope and
+// type, followed by a ContentEncryption element holding the algorithm, the
+// key ID and the AES settings. Returns false if |writer| is NULL, a write
+// fails, or the number of bytes written differs from the computed size.
+bool ContentEncoding::Write(IMkvWriter* writer) const {
+  // |writer| is dereferenced directly below, before any helper gets a chance
+  // to validate it.
+  if (!writer)
+    return false;
+
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
+
+  // Taken before the master header is written, so the final check covers the
+  // header as well as the payload.
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncoding,
+                              encoding_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
+                        static_cast<uint64>(encoding_order_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
+                        static_cast<uint64>(encoding_scope_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
+                        static_cast<uint64>(encoding_type_)))
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
+                              encryption_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo,
+                        static_cast<uint64>(enc_algo_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
+                        enc_key_id_length_))
+    return false;
+
+  if (!enc_aes_settings_.Write(writer))
+    return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+// Returns the payload size of the ContentEncoding element: the order, scope
+// and type fields plus, when |encryption_size| is non-zero, the complete
+// ContentEncryption element. Compression is not supported, so any non-zero
+// |compression_size| yields 0.
+uint64_t ContentEncoding::EncodingSize(uint64_t compression_size,
+                                       uint64_t encryption_size) const {
+  // TODO(fgalligan): Add support for compression settings.
+  if (compression_size != 0)
+    return 0;
+
+  uint64_t total = EbmlElementSize(libwebm::kMkvContentEncodingOrder,
+                                   static_cast<uint64>(encoding_order_));
+  total += EbmlElementSize(libwebm::kMkvContentEncodingScope,
+                           static_cast<uint64>(encoding_scope_));
+  total += EbmlElementSize(libwebm::kMkvContentEncodingType,
+                           static_cast<uint64>(encoding_type_));
+
+  if (encryption_size > 0) {
+    const uint64_t encryption_header = EbmlMasterElementSize(
+        libwebm::kMkvContentEncryption, encryption_size);
+    total += encryption_header + encryption_size;
+  }
+
+  return total;
+}
+
+// Returns the payload size of the ContentEncryption element: key ID,
+// algorithm and the complete AES settings element.
+uint64_t ContentEncoding::EncryptionSize() const {
+  const uint64_t aes_settings_size = enc_aes_settings_.Size();
+  const uint64_t key_id_size = EbmlElementSize(
+      libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_);
+  const uint64_t algo_size = EbmlElementSize(libwebm::kMkvContentEncAlgo,
+                                             static_cast<uint64>(enc_algo_));
+
+  return key_id_size + algo_size + aes_settings_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Track Class
+
+// Constructs a track with no codec ID, codec private data, language, name or
+// content encodings, and with all numeric fields zero except the UID, which
+// is obtained from MakeUID(|seed|).
+Track::Track(unsigned int* seed)
+ : codec_id_(NULL),
+ codec_private_(NULL),
+ language_(NULL),
+ max_block_additional_id_(0),
+ name_(NULL),
+ number_(0),
+ type_(0),
+ uid_(MakeUID(seed)),
+ codec_delay_(0),
+ seek_pre_roll_(0),
+ default_duration_(0),
+ codec_private_length_(0),
+ content_encoding_entries_(NULL),
+ content_encoding_entries_size_(0) {}
+
+// Releases the strings, the codec private data and every content encoding
+// owned by the track.
+Track::~Track() {
+  if (content_encoding_entries_ != NULL) {
+    for (uint32_t index = 0; index < content_encoding_entries_size_; ++index)
+      delete content_encoding_entries_[index];
+    delete[] content_encoding_entries_;
+  }
+
+  delete[] name_;
+  delete[] language_;
+  delete[] codec_private_;
+  delete[] codec_id_;
+}
+
+// Appends a default-constructed ContentEncoding to the track, reallocating
+// the pointer array to hold one more entry. Returns false, leaving the track
+// unchanged, if either allocation fails.
+bool Track::AddContentEncoding() {
+  const uint32_t old_count = content_encoding_entries_size_;
+  const uint32_t new_count = old_count + 1;
+
+  ContentEncoding** const grown =
+      new (std::nothrow) ContentEncoding*[new_count];  // NOLINT
+  if (!grown)
+    return false;
+
+  ContentEncoding* const added =
+      new (std::nothrow) ContentEncoding();  // NOLINT
+  if (!added) {
+    delete[] grown;
+    return false;
+  }
+
+  // Carry the existing entries over, then place the new one at the end.
+  for (uint32_t index = 0; index < old_count; ++index)
+    grown[index] = content_encoding_entries_[index];
+  grown[old_count] = added;
+
+  delete[] content_encoding_entries_;
+  content_encoding_entries_ = grown;
+  content_encoding_entries_size_ = new_count;
+  return true;
+}
+
+// Returns the content encoding stored at |index|, or NULL when the track has
+// no encodings or |index| is past the last entry.
+ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
+  const bool in_range = content_encoding_entries_ != NULL &&
+                        index < content_encoding_entries_size_;
+  return in_range ? content_encoding_entries_[index] : NULL;
+}
+
+// Returns the payload size of the TrackEntry element as written by
+// Track::Write(): the mandatory number/UID/type fields, every optional field
+// that is set, and the ContentEncodings element when encodings exist.
+uint64_t Track::PayloadSize() const {
+  // Fields that are always present.
+  uint64_t total =
+      EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+  total += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+  total += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+
+  // Optional strings and binary data count only when they have been set.
+  if (codec_id_ != NULL)
+    total += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+  if (codec_private_ != NULL) {
+    total += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+                             codec_private_length_);
+  }
+  if (language_ != NULL)
+    total += EbmlElementSize(libwebm::kMkvLanguage, language_);
+  if (name_ != NULL)
+    total += EbmlElementSize(libwebm::kMkvName, name_);
+
+  // Optional integers count only when they are non-zero.
+  if (max_block_additional_id_ != 0) {
+    total += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                             static_cast<uint64>(max_block_additional_id_));
+  }
+  if (codec_delay_ != 0) {
+    total += EbmlElementSize(libwebm::kMkvCodecDelay,
+                             static_cast<uint64>(codec_delay_));
+  }
+  if (seek_pre_roll_ != 0) {
+    total += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+                             static_cast<uint64>(seek_pre_roll_));
+  }
+  if (default_duration_ != 0) {
+    total += EbmlElementSize(libwebm::kMkvDefaultDuration,
+                             static_cast<uint64>(default_duration_));
+  }
+
+  if (content_encoding_entries_size_ == 0)
+    return total;
+
+  uint64_t encodings_payload = 0;
+  for (uint32_t index = 0; index < content_encoding_entries_size_; ++index)
+    encodings_payload += content_encoding_entries_[index]->Size();
+
+  total += EbmlMasterElementSize(libwebm::kMkvContentEncodings,
+                                 encodings_payload) +
+           encodings_payload;
+  return total;
+}
+
+// Returns the total size of the TrackEntry element: master header plus
+// payload.
+uint64_t Track::Size() const {
+  const uint64_t payload = PayloadSize();
+  const uint64_t header =
+      EbmlMasterElementSize(libwebm::kMkvTrackEntry, payload);
+  return payload + header;
+}
+
+// Writes this track's TrackEntry element to |writer|: the master element
+// header sized by PayloadSize(), the base track fields, and then any
+// ContentEncodings. Returns false if |writer| is NULL, the track type or
+// codec ID is unset, an AV1 track has no CodecPrivate, a write fails, or the
+// bytes written for the base fields differ from their computed size.
+bool Track::Write(IMkvWriter* writer) const {
+ if (!writer)
+ return false;
+
+ // mandatory elements without a default value.
+ if (!type_ || !codec_id_)
+ return false;
+
+ // AV1 tracks require a CodecPrivate. See
+ // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md
+ // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+ // point to a stable version once it is finalized, or our own WebM mappings
+ // page on webmproject.org should we decide to release them.
+ if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+ return false;
+
+ // |size| may be bigger than what is written out in this function because
+ // derived classes may write out more data in the Track element.
+ const uint64_t payload_size = PayloadSize();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
+ return false;
+
+ // Size of only the base fields written directly below (content encodings
+ // excluded); it is compared against the writer position once they are out.
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+ size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+ size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+ if (codec_id_)
+ size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+ if (codec_private_)
+ size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+ static_cast<uint64>(codec_private_length_));
+ if (language_)
+ size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+ if (name_)
+ size += EbmlElementSize(libwebm::kMkvName, name_);
+ if (max_block_additional_id_)
+ size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+ static_cast<uint64>(max_block_additional_id_));
+ if (codec_delay_)
+ size += EbmlElementSize(libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_));
+ if (seek_pre_roll_)
+ size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_));
+ if (default_duration_)
+ size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_));
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ // Mandatory fields first, then each optional field that is set.
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber,
+ static_cast<uint64>(number_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID,
+ static_cast<uint64>(uid_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackType,
+ static_cast<uint64>(type_)))
+ return false;
+ if (max_block_additional_id_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
+ static_cast<uint64>(max_block_additional_id_))) {
+ return false;
+ }
+ }
+ if (codec_delay_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_)))
+ return false;
+ }
+ if (seek_pre_roll_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_)))
+ return false;
+ }
+ if (default_duration_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_)))
+ return false;
+ }
+ if (codec_id_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecID, codec_id_))
+ return false;
+ }
+ if (codec_private_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
+ static_cast<uint64>(codec_private_length_)))
+ return false;
+ }
+ if (language_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvLanguage, language_))
+ return false;
+ }
+ if (name_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvName, name_))
+ return false;
+ }
+
+ // The base fields must have produced exactly |size| bytes.
+ int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ if (content_encoding_entries_size_ > 0) {
+ uint64_t content_encodings_size = 0;
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ content_encodings_size += encoding->Size();
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncodings,
+ content_encodings_size))
+ return false;
+
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ if (!encoding->Write(writer))
+ return false;
+ }
+ }
+
+ // Only the validity of the position is checked here, not the byte count.
+ stop_position = writer->Position();
+ if (stop_position < 0)
+ return false;
+ return true;
+}
+
+// Replaces the track's codec private data with a copy of the |length| bytes
+// at |codec_private|. Returns false, keeping the current data, if
+// |codec_private| is NULL, |length| is 0, or the allocation fails.
+bool Track::SetCodecPrivate(const uint8_t* codec_private, uint64_t length) {
+  if (!codec_private || length < 1)
+    return false;
+
+  // Allocate and fill the new buffer before touching the members. Releasing
+  // the old buffer first threw away the existing data and left a stale
+  // length behind whenever the allocation failed.
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  memcpy(data, codec_private, static_cast<size_t>(length));
+
+  delete[] codec_private_;
+  codec_private_ = data;
+  codec_private_length_ = length;
+
+  return true;
+}
+
+// Replaces the codec ID with a copy of |codec_id|. A NULL argument is
+// ignored; if the copy cannot be allocated the codec ID ends up NULL.
+void Track::set_codec_id(const char* codec_id) {
+  if (!codec_id)
+    return;
+
+  // StrCpy releases the old string, then stores a new copy (or NULL when the
+  // allocation fails).
+  StrCpy(codec_id, &codec_id_);
+}
+
+// TODO(fgalligan): Vet the language parameter.
+// Replaces the language with a copy of |language|. A NULL argument is
+// ignored; if the copy cannot be allocated the language ends up NULL.
+void Track::set_language(const char* language) {
+  if (!language)
+    return;
+
+  // StrCpy releases the old string, then stores a new copy (or NULL when the
+  // allocation fails).
+  StrCpy(language, &language_);
+}
+
+// Replaces the track name with a copy of |name|. A NULL argument is ignored;
+// if the copy cannot be allocated the name ends up NULL.
+void Track::set_name(const char* name) {
+  if (!name)
+    return;
+
+  // StrCpy releases the old string, then stores a new copy (or NULL when the
+  // allocation fails).
+  StrCpy(name, &name_);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Colour and its child elements
+
+// Returns the combined size of the x and y coordinate elements when written
+// under the IDs |x_id| and |y_id|.
+uint64_t PrimaryChromaticity::PrimaryChromaticitySize(
+    libwebm::MkvId x_id, libwebm::MkvId y_id) const {
+  const uint64_t x_size = EbmlElementSize(x_id, x_);
+  const uint64_t y_size = EbmlElementSize(y_id, y_);
+  return x_size + y_size;
+}
+
+// Writes the x coordinate under |x_id| and then the y coordinate under |y_id|
+// to |writer|. Nothing is written, and false is returned, when the
+// coordinates are not valid; false is also returned when a write fails.
+bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
+                                libwebm::MkvId y_id) const {
+  if (!Valid())
+    return false;
+
+  if (!WriteEbmlElement(writer, x_id, x_))
+    return false;
+
+  return WriteEbmlElement(writer, y_id, y_);
+}
+
+// Reports whether both coordinates lie within
+// [kChromaticityMin, kChromaticityMax].
+bool PrimaryChromaticity::Valid() const {
+  // The comparisons are kept in their positive form so that a NaN coordinate
+  // is still reported as invalid.
+  const bool x_in_range = x_ >= kChromaticityMin && x_ <= kChromaticityMax;
+  const bool y_in_range = y_ >= kChromaticityMin && y_ <= kChromaticityMax;
+  return x_in_range && y_in_range;
+}
+
+// Returns the total size of the MasteringMetadata element (master header plus
+// payload), or 0 when there is nothing to write.
+uint64_t MasteringMetadata::MasteringMetadataSize() const {
+  const uint64_t payload = PayloadSize();
+  if (payload == 0)
+    return 0;
+
+  return payload +
+         EbmlMasterElementSize(libwebm::kMkvMasteringMetadata, payload);
+}
+
+// Reports whether the metadata can be written: each luminance value that is
+// present lies within its allowed range, the minimum does not exceed the
+// maximum when both are present, and every chromaticity that is set is valid.
+bool MasteringMetadata::Valid() const {
+  const bool has_min = luminance_min_ != kValueNotPresent;
+  const bool has_max = luminance_max_ != kValueNotPresent;
+
+  if (has_min &&
+      (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax)) {
+    return false;
+  }
+  if (has_max &&
+      (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax)) {
+    return false;
+  }
+  // The ordering check only makes sense when both values are present.
+  // Comparing a present maximum against the kValueNotPresent sentinel
+  // (FLT_MAX) for the minimum rejected every maximum-only configuration.
+  if (has_min && has_max && luminance_min_ > luminance_max_)
+    return false;
+
+  if (r_ && !r_->Valid())
+    return false;
+  if (g_ && !g_->Valid())
+    return false;
+  if (b_ && !b_->Valid())
+    return false;
+  if (white_point_ && !white_point_->Valid())
+    return false;
+
+  return true;
+}
+
+// Writes the MasteringMetadata master element to |writer|, followed by each
+// luminance value and chromaticity that is set. Returns true without writing
+// anything when the payload is empty, and false when a write fails.
+bool MasteringMetadata::Write(IMkvWriter* writer) const {
+  const uint64_t payload = PayloadSize();
+
+  // Don't write an empty element.
+  if (payload == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, payload))
+    return false;
+
+  if (luminance_max_ != kValueNotPresent) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_))
+      return false;
+  }
+  if (luminance_min_ != kValueNotPresent) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_))
+      return false;
+  }
+
+  if (r_) {
+    if (!r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+                   libwebm::kMkvPrimaryRChromaticityY)) {
+      return false;
+    }
+  }
+  if (g_) {
+    if (!g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+                   libwebm::kMkvPrimaryGChromaticityY)) {
+      return false;
+    }
+  }
+  if (b_) {
+    if (!b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+                   libwebm::kMkvPrimaryBChromaticityY)) {
+      return false;
+    }
+  }
+  if (white_point_) {
+    if (!white_point_->Write(writer, libwebm::kMkvWhitePointChromaticityX,
+                             libwebm::kMkvWhitePointChromaticityY)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Replaces the red, green, blue and white-point chromaticities with copies of
+// the given values; a NULL argument clears the corresponding entry. All
+// copies are made before any member changes, so on failure (false) the
+// current values are left as they were.
+bool MasteringMetadata::SetChromaticity(
+    const PrimaryChromaticity* r, const PrimaryChromaticity* g,
+    const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) {
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  if (r && !CopyChromaticity(r, &r_ptr))
+    return false;
+
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  if (g && !CopyChromaticity(g, &g_ptr))
+    return false;
+
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  if (b && !CopyChromaticity(b, &b_ptr))
+    return false;
+
+  PrimaryChromaticityPtr wp_ptr(nullptr);
+  if (white_point && !CopyChromaticity(white_point, &wp_ptr))
+    return false;
+
+  // Free the previously held values before adopting the new ones; plain
+  // assignment leaked them on every call after the first.
+  // NOTE(review): assumes this object owns r_/g_/b_/white_point_, since it
+  // allocates them here -- confirm the destructor deletes them too.
+  delete r_;
+  r_ = r_ptr.release();
+  delete g_;
+  g_ = g_ptr.release();
+  delete b_;
+  b_ = b_ptr.release();
+  delete white_point_;
+  white_point_ = wp_ptr.release();
+  return true;
+}
+
+// Returns the payload size of the MasteringMetadata element: the luminance
+// values that are present plus the chromaticities that are set. The result is
+// 0 when nothing is set.
+uint64_t MasteringMetadata::PayloadSize() const {
+  uint64_t total = 0;
+
+  if (luminance_max_ != kValueNotPresent)
+    total += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_);
+  if (luminance_min_ != kValueNotPresent)
+    total += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_);
+
+  if (r_ != NULL) {
+    total += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX,
+                                         libwebm::kMkvPrimaryRChromaticityY);
+  }
+  if (g_ != NULL) {
+    total += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX,
+                                         libwebm::kMkvPrimaryGChromaticityY);
+  }
+  if (b_ != NULL) {
+    total += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX,
+                                         libwebm::kMkvPrimaryBChromaticityY);
+  }
+  if (white_point_ != NULL) {
+    total += white_point_->PrimaryChromaticitySize(
+        libwebm::kMkvWhitePointChromaticityX,
+        libwebm::kMkvWhitePointChromaticityY);
+  }
+
+  return total;
+}
+
+// Returns the total size of the Colour element (master header plus payload),
+// or 0 when there is nothing to write.
+uint64_t Colour::ColourSize() const {
+  const uint64_t payload = PayloadSize();
+  if (payload == 0)
+    return 0;
+
+  return payload + EbmlMasterElementSize(libwebm::kMkvColour, payload);
+}
+
+// Reports whether the colour settings can be written: the mastering metadata,
+// if any, is valid, and every enumerated field that is present passes its
+// value check. Fields equal to kValueNotPresent are not checked.
+bool Colour::Valid() const {
+  // Evaluated left to right and stopping at the first failing check.
+  return (!mastering_metadata_ || mastering_metadata_->Valid()) &&
+         (matrix_coefficients_ == kValueNotPresent ||
+          IsMatrixCoefficientsValueValid(matrix_coefficients_)) &&
+         (chroma_siting_horz_ == kValueNotPresent ||
+          IsChromaSitingHorzValueValid(chroma_siting_horz_)) &&
+         (chroma_siting_vert_ == kValueNotPresent ||
+          IsChromaSitingVertValueValid(chroma_siting_vert_)) &&
+         (range_ == kValueNotPresent || IsColourRangeValueValid(range_)) &&
+         (transfer_characteristics_ == kValueNotPresent ||
+          IsTransferCharacteristicsValueValid(transfer_characteristics_)) &&
+         (primaries_ == kValueNotPresent || IsPrimariesValueValid(primaries_));
+}
+
+// Serializes the Colour master element and each child that is set.
+// Returns true when the payload is empty (nothing is written) or when every
+// write succeeds; returns false if Valid() fails or any write fails.
+bool Colour::Write(IMkvWriter* writer) const {
+  const uint64_t payload = PayloadSize();
+
+  // An empty Colour element is omitted from the output.
+  if (payload == 0)
+    return true;
+
+  // Refuse to emit values that fail validation.
+  if (!Valid())
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, payload))
+    return false;
+
+  // Unsigned-integer children, listed in the order they are written.
+  struct UintChild {
+    uint64_t id;
+    uint64_t value;
+  };
+  const UintChild children[] = {
+      {libwebm::kMkvMatrixCoefficients, matrix_coefficients_},
+      {libwebm::kMkvBitsPerChannel, bits_per_channel_},
+      {libwebm::kMkvChromaSubsamplingHorz, chroma_subsampling_horz_},
+      {libwebm::kMkvChromaSubsamplingVert, chroma_subsampling_vert_},
+      {libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz_},
+      {libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert_},
+      {libwebm::kMkvChromaSitingHorz, chroma_siting_horz_},
+      {libwebm::kMkvChromaSitingVert, chroma_siting_vert_},
+      {libwebm::kMkvRange, range_},
+      {libwebm::kMkvTransferCharacteristics, transfer_characteristics_},
+      {libwebm::kMkvPrimaries, primaries_},
+      {libwebm::kMkvMaxCLL, max_cll_},
+      {libwebm::kMkvMaxFALL, max_fall_},
+  };
+
+  for (const UintChild& child : children) {
+    if (child.value == kValueNotPresent)
+      continue;  // Unset fields are not written.
+    if (!WriteEbmlElement(writer, child.id, static_cast<uint64>(child.value)))
+      return false;
+  }
+
+  // Mastering metadata, when attached, is written last.
+  if (mastering_metadata_ != NULL && !mastering_metadata_->Write(writer))
+    return false;
+
+  return true;
+}
+
+// Replaces this object's mastering metadata with a copy of
+// |mastering_metadata|. The previous metadata is deleted only after the copy
+// has been built; on failure the existing metadata is left untouched and
+// false is returned.
+bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
+  std::unique_ptr<MasteringMetadata> copy(new MasteringMetadata());
+  if (copy.get() == NULL)
+    return false;
+
+  copy->set_luminance_max(mastering_metadata.luminance_max());
+  copy->set_luminance_min(mastering_metadata.luminance_min());
+
+  const bool chromaticity_copied = copy->SetChromaticity(
+      mastering_metadata.r(), mastering_metadata.g(), mastering_metadata.b(),
+      mastering_metadata.white_point());
+  if (!chromaticity_copied)
+    return false;
+
+  // Swap in the new copy now that nothing else can fail.
+  delete mastering_metadata_;
+  mastering_metadata_ = copy.release();
+  return true;
+}
+
+// Returns the number of payload bytes of the Colour element: the encoded
+// size of every unsigned-integer child that is set plus the full size of the
+// mastering metadata element when one is attached.
+uint64_t Colour::PayloadSize() const {
+  // Element id / value pairs for the unsigned-integer children.
+  struct UintChild {
+    uint64_t id;
+    uint64_t value;
+  };
+  const UintChild children[] = {
+      {libwebm::kMkvMatrixCoefficients, matrix_coefficients_},
+      {libwebm::kMkvBitsPerChannel, bits_per_channel_},
+      {libwebm::kMkvChromaSubsamplingHorz, chroma_subsampling_horz_},
+      {libwebm::kMkvChromaSubsamplingVert, chroma_subsampling_vert_},
+      {libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz_},
+      {libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert_},
+      {libwebm::kMkvChromaSitingHorz, chroma_siting_horz_},
+      {libwebm::kMkvChromaSitingVert, chroma_siting_vert_},
+      {libwebm::kMkvRange, range_},
+      {libwebm::kMkvTransferCharacteristics, transfer_characteristics_},
+      {libwebm::kMkvPrimaries, primaries_},
+      {libwebm::kMkvMaxCLL, max_cll_},
+      {libwebm::kMkvMaxFALL, max_fall_},
+  };
+
+  uint64_t total = 0;
+  for (const UintChild& child : children) {
+    // kValueNotPresent marks a field that has not been set.
+    if (child.value != kValueNotPresent)
+      total += EbmlElementSize(child.id, static_cast<uint64>(child.value));
+  }
+
+  if (mastering_metadata_ != NULL)
+    total += mastering_metadata_->MasteringMetadataSize();
+
+  return total;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Projection element
+
+// Returns the full encoded size of the Projection element (master header
+// plus payload), or 0 when the payload size is 0.
+uint64_t Projection::ProjectionSize() const {
+  const uint64_t payload = PayloadSize();
+  if (payload == 0)
+    return 0;
+
+  return payload + EbmlMasterElementSize(libwebm::kMkvProjection, payload);
+}
+
+// Writes the Projection master element followed by its type, optional
+// private data, and the yaw/pitch/roll pose values, in that order.
+// Returns true on success or when the payload size is 0 (nothing written);
+// returns false as soon as any write fails.
+bool Projection::Write(IMkvWriter* writer) const {
+  const uint64_t payload = PayloadSize();
+
+  // Don't write an empty element.
+  if (payload == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, payload))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType,
+                        static_cast<uint64>(type_))) {
+    return false;
+  }
+
+  // Private data is written only when a non-empty buffer is attached.
+  const bool has_private_data =
+      private_data_length_ > 0 && private_data_ != NULL;
+  if (has_private_data &&
+      !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_,
+                        private_data_length_)) {
+    return false;
+  }
+
+  return WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_) &&
+         WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch,
+                          pose_pitch_) &&
+         WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_);
+}
+
+// Stores a private copy of |data| (|data_length| bytes) as the projection's
+// private data, replacing any previously stored buffer.
+//
+// Returns false without modifying the object when |data| is NULL,
+// |data_length| is 0, |data_length| does not fit in size_t, or the
+// allocation fails.
+bool Projection::SetProjectionPrivate(const uint8_t* data,
+                                      uint64_t data_length) {
+  if (data == NULL || data_length == 0) {
+    return false;
+  }
+
+  // Reject lengths that would be truncated by the conversion to size_t.
+  const size_t byte_count = static_cast<size_t>(data_length);
+  if (data_length != byte_count) {
+    return false;
+  }
+
+  uint8_t* const new_private_data = new (std::nothrow) uint8_t[byte_count];
+  if (new_private_data == NULL) {
+    return false;
+  }
+
+  // Copy before releasing the old buffer so that a caller passing the
+  // currently stored buffer as |data| never reads freed memory.
+  memcpy(new_private_data, data, byte_count);
+
+  delete[] private_data_;
+  private_data_ = new_private_data;
+  private_data_length_ = data_length;
+
+  return true;
+}
+
+// Returns the number of payload bytes of the Projection element: the type,
+// the private data (only when a non-empty buffer is attached) and the three
+// pose values. The set of children mirrors what Projection::Write() emits.
+uint64_t Projection::PayloadSize() const {
+  // Size the type child with the id that Write() actually emits
+  // (kMkvProjectionType), not the id of the enclosing master element.
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvProjectionType, static_cast<uint64>(type_));
+
+  if (private_data_length_ > 0 && private_data_ != NULL) {
+    size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_,
+                            private_data_length_);
+  }
+
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_);
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// VideoTrack Class
+
+// Creates a video track with every dimension, crop value, mode and the frame
+// rate set to zero, and with no colour space string, Colour element or
+// Projection element attached. |seed| is forwarded to the Track base class.
+VideoTrack::VideoTrack(unsigned int* seed)
+    : Track(seed),
+      // Display and pixel dimensions.
+      display_height_(0),
+      display_width_(0),
+      pixel_height_(0),
+      pixel_width_(0),
+      // Cropping.
+      crop_left_(0),
+      crop_right_(0),
+      crop_top_(0),
+      crop_bottom_(0),
+      frame_rate_(0.0),
+      height_(0),
+      stereo_mode_(0),
+      alpha_mode_(0),
+      width_(0),
+      // Optional members; NULL means "not set".
+      colour_space_(NULL),
+      colour_(NULL),
+      projection_(NULL) {
+}
+
+// Releases everything the track allocated: the Colour and Projection copies
+// created by SetColour() / SetProjection(), and the colour space string that
+// set_colour_space() allocates with new[].
+VideoTrack::~VideoTrack() {
+  delete colour_;
+  delete projection_;
+  // Allocated with new[] in set_colour_space(); deleting NULL is a no-op.
+  delete[] colour_space_;
+}
+
+// Stores |stereo_mode| if it is one of the stereo modes this muxer accepts.
+// Returns false and leaves the current mode unchanged otherwise.
+bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
+  switch (stereo_mode) {
+    case kMono:
+    case kSideBySideLeftIsFirst:
+    case kTopBottomRightIsFirst:
+    case kTopBottomLeftIsFirst:
+    case kSideBySideRightIsFirst:
+      stereo_mode_ = stereo_mode;
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Stores |alpha_mode| when it equals kNoAlpha or kAlpha. Any other value is
+// rejected: false is returned and the current mode is kept.
+bool VideoTrack::SetAlphaMode(uint64_t alpha_mode) {
+  const bool recognized = (alpha_mode == kNoAlpha) || (alpha_mode == kAlpha);
+  if (recognized)
+    alpha_mode_ = alpha_mode;
+
+  return recognized;
+}
+
+// Returns the payload size of the whole track entry: the base Track payload
+// plus the Video master element (header and payload).
+uint64_t VideoTrack::PayloadSize() const {
+  const uint64_t track_bytes = Track::PayloadSize();
+
+  const uint64_t video_payload = VideoPayloadSize();
+  const uint64_t video_header =
+      EbmlMasterElementSize(libwebm::kMkvVideo, video_payload);
+
+  return track_bytes + video_payload + video_header;
+}
+
+// Writes the base Track data followed by the Video master element and its
+// children. Returns false if any write fails, if the writer reports a
+// negative position, or if the number of bytes written for the Video payload
+// differs from VideoPayloadSize().
+bool VideoTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const uint64_t video_payload = VideoPayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvVideo, video_payload))
+    return false;
+
+  const int64_t begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  // Pixel dimensions are always written; the pixel_* values win when they
+  // are non-zero, otherwise width_/height_ are used.
+  const uint64 pixel_width =
+      static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_);
+  if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, pixel_width))
+    return false;
+
+  const uint64 pixel_height =
+      static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_);
+  if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, pixel_height))
+    return false;
+
+  // Every remaining child is optional and written only when set.
+  if (display_width_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvDisplayWidth,
+                        static_cast<uint64>(display_width_))) {
+    return false;
+  }
+  if (display_height_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvDisplayHeight,
+                        static_cast<uint64>(display_height_))) {
+    return false;
+  }
+  if (crop_left_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft,
+                        static_cast<uint64>(crop_left_))) {
+    return false;
+  }
+  if (crop_right_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvPixelCropRight,
+                        static_cast<uint64>(crop_right_))) {
+    return false;
+  }
+  if (crop_top_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvPixelCropTop,
+                        static_cast<uint64>(crop_top_))) {
+    return false;
+  }
+  if (crop_bottom_ > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom,
+                        static_cast<uint64>(crop_bottom_))) {
+    return false;
+  }
+  if (stereo_mode_ > kMono &&
+      !WriteEbmlElement(writer, libwebm::kMkvStereoMode,
+                        static_cast<uint64>(stereo_mode_))) {
+    return false;
+  }
+  if (alpha_mode_ > kNoAlpha &&
+      !WriteEbmlElement(writer, libwebm::kMkvAlphaMode,
+                        static_cast<uint64>(alpha_mode_))) {
+    return false;
+  }
+  if (colour_space_ &&
+      !WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_)) {
+    return false;
+  }
+  if (frame_rate_ > 0.0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvFrameRate,
+                        static_cast<float>(frame_rate_))) {
+    return false;
+  }
+  if (colour_ && !colour_->Write(writer))
+    return false;
+  if (projection_ && !projection_->Write(writer))
+    return false;
+
+  // The bytes actually written must match the size announced in the header.
+  const int64_t end = writer->Position();
+  return end >= 0 && end - begin == static_cast<int64_t>(video_payload);
+}
+
+// Replaces the stored colour space string with a copy of |colour_space|.
+// A NULL argument is ignored. If the allocation fails the stored string is
+// left as NULL.
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space == NULL)
+    return;
+
+  delete[] colour_space_;
+
+  // Room for the characters plus the terminating NUL.
+  const size_t buffer_size = strlen(colour_space) + 1;
+  colour_space_ = new (std::nothrow) char[buffer_size];  // NOLINT
+  if (colour_space_ == NULL)
+    return;
+
+#ifdef _MSC_VER
+  strcpy_s(colour_space_, buffer_size, colour_space);
+#else
+  strcpy(colour_space_, colour_space);
+#endif
+}
+
+// Replaces the track's Colour element with a copy of |colour|, including its
+// mastering metadata when present. The previous Colour is deleted only after
+// the copy is complete; on failure it is kept and false is returned.
+bool VideoTrack::SetColour(const Colour& colour) {
+  std::unique_ptr<Colour> copy(new Colour());
+  if (copy.get() == NULL)
+    return false;
+
+  // Copying the mastering metadata is the only step that can fail, so it
+  // goes first.
+  const MasteringMetadata* const metadata = colour.mastering_metadata();
+  if (metadata != NULL && !copy->SetMasteringMetadata(*metadata))
+    return false;
+
+  copy->set_matrix_coefficients(colour.matrix_coefficients());
+  copy->set_bits_per_channel(colour.bits_per_channel());
+  copy->set_chroma_subsampling_horz(colour.chroma_subsampling_horz());
+  copy->set_chroma_subsampling_vert(colour.chroma_subsampling_vert());
+  copy->set_cb_subsampling_horz(colour.cb_subsampling_horz());
+  copy->set_cb_subsampling_vert(colour.cb_subsampling_vert());
+  copy->set_chroma_siting_horz(colour.chroma_siting_horz());
+  copy->set_chroma_siting_vert(colour.chroma_siting_vert());
+  copy->set_range(colour.range());
+  copy->set_transfer_characteristics(colour.transfer_characteristics());
+  copy->set_primaries(colour.primaries());
+  copy->set_max_cll(colour.max_cll());
+  copy->set_max_fall(colour.max_fall());
+
+  delete colour_;
+  colour_ = copy.release();
+  return true;
+}
+
+// Replaces the track's Projection element with a copy of |projection|,
+// including its private data when present. The previous Projection is
+// deleted only after the copy is complete; on failure it is kept and false
+// is returned.
+bool VideoTrack::SetProjection(const Projection& projection) {
+  std::unique_ptr<Projection> copy(new Projection());
+  if (copy.get() == NULL)
+    return false;
+
+  // Copying the private data is the only step that can fail.
+  if (projection.private_data() != NULL &&
+      !copy->SetProjectionPrivate(projection.private_data(),
+                                  projection.private_data_length())) {
+    return false;
+  }
+
+  copy->set_type(projection.type());
+  copy->set_pose_yaw(projection.pose_yaw());
+  copy->set_pose_pitch(projection.pose_pitch());
+  copy->set_pose_roll(projection.pose_roll());
+
+  delete projection_;
+  projection_ = copy.release();
+  return true;
+}
+
+// Returns the number of payload bytes of the Video master element: the two
+// mandatory pixel dimensions plus every optional child that is set.
+uint64_t VideoTrack::VideoPayloadSize() const {
+  // Pixel dimensions fall back to width_/height_ when the pixel_* values
+  // are zero.
+  const uint64 pixel_width =
+      static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_);
+  const uint64 pixel_height =
+      static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_);
+
+  uint64_t total = EbmlElementSize(libwebm::kMkvPixelWidth, pixel_width);
+  total += EbmlElementSize(libwebm::kMkvPixelHeight, pixel_height);
+
+  if (display_width_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvDisplayWidth,
+                             static_cast<uint64>(display_width_));
+  }
+  if (display_height_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvDisplayHeight,
+                             static_cast<uint64>(display_height_));
+  }
+  if (crop_left_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvPixelCropLeft,
+                             static_cast<uint64>(crop_left_));
+  }
+  if (crop_right_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvPixelCropRight,
+                             static_cast<uint64>(crop_right_));
+  }
+  if (crop_top_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvPixelCropTop,
+                             static_cast<uint64>(crop_top_));
+  }
+  if (crop_bottom_ > 0) {
+    total += EbmlElementSize(libwebm::kMkvPixelCropBottom,
+                             static_cast<uint64>(crop_bottom_));
+  }
+  if (stereo_mode_ > kMono) {
+    total += EbmlElementSize(libwebm::kMkvStereoMode,
+                             static_cast<uint64>(stereo_mode_));
+  }
+  if (alpha_mode_ > kNoAlpha) {
+    total += EbmlElementSize(libwebm::kMkvAlphaMode,
+                             static_cast<uint64>(alpha_mode_));
+  }
+  if (frame_rate_ > 0.0) {
+    total += EbmlElementSize(libwebm::kMkvFrameRate,
+                             static_cast<float>(frame_rate_));
+  }
+  if (colour_space_ != NULL)
+    total += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
+  if (colour_ != NULL)
+    total += colour_->ColourSize();
+  if (projection_ != NULL)
+    total += projection_->ProjectionSize();
+
+  return total;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// AudioTrack Class
+
+// Creates an audio track with one channel, a bit depth of 0 and a sample
+// rate of 0.0. |seed| is forwarded to the Track base class.
+AudioTrack::AudioTrack(unsigned int* seed)
+    : Track(seed),
+      bit_depth_(0),
+      channels_(1),
+      sample_rate_(0.0) {
+}
+
+// Nothing to release here.
+AudioTrack::~AudioTrack() {
+}
+
+// Returns the payload size of the whole track entry: the base Track payload
+// plus the Audio master element (header and payload). Bit depth is counted
+// only when it is greater than zero.
+uint64_t AudioTrack::PayloadSize() const {
+  const uint64_t track_bytes = Track::PayloadSize();
+
+  uint64_t audio_payload = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                           static_cast<float>(sample_rate_));
+  audio_payload +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (bit_depth_ > 0) {
+    audio_payload +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+  }
+
+  const uint64_t audio_header =
+      EbmlMasterElementSize(libwebm::kMkvAudio, audio_payload);
+
+  return track_bytes + audio_payload + audio_header;
+}
+
+// Writes the base Track data followed by the Audio master element holding
+// the sampling frequency, channel count and (when non-zero) bit depth.
+// Returns false if any write fails, if the writer reports a negative
+// position, or if the bytes written differ from the computed payload size.
+bool AudioTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const bool has_bit_depth = bit_depth_ > 0;
+
+  // Size of the Audio element's children, announced in the master header.
+  uint64_t audio_payload = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                           static_cast<float>(sample_rate_));
+  audio_payload +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (has_bit_depth) {
+    audio_payload +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, audio_payload))
+    return false;
+
+  const int64_t begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
+                        static_cast<float>(sample_rate_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvChannels,
+                        static_cast<uint64>(channels_))) {
+    return false;
+  }
+  if (has_bit_depth &&
+      !WriteEbmlElement(writer, libwebm::kMkvBitDepth,
+                        static_cast<uint64>(bit_depth_))) {
+    return false;
+  }
+
+  // The bytes actually written must match the size announced in the header.
+  const int64_t end = writer->Position();
+  return end >= 0 && end - begin == static_cast<int64_t>(audio_payload);
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Tracks Class
+
+// Codec id strings exposed by Tracks.
+
+// Audio codecs.
+const char Tracks::kOpusCodecId[] = "A_OPUS";
+const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+
+// Video codecs.
+const char Tracks::kAv1CodecId[] = "V_AV1";
+const char Tracks::kVp8CodecId[] = "V_VP8";
+const char Tracks::kVp9CodecId[] = "V_VP9";
+
+// WebVTT text track kinds.
+const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
+const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
+const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
+const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES";
+
+// Starts with an empty track list that has not been written yet.
+Tracks::Tracks()
+    : track_entries_(NULL),
+      track_entries_size_(0),
+      wrote_tracks_(false) {
+}
+
+// Deletes every stored track and then the array that holds them.
+Tracks::~Tracks() {
+  if (track_entries_ == NULL)
+    return;
+
+  for (uint32_t idx = 0; idx < track_entries_size_; ++idx)
+    delete track_entries_[idx];
+
+  delete[] track_entries_;
+}
+
+// Appends |track| to the track list and assigns its track number.
+//
+// |number| selects the number: a value in [1, 126] requests exactly that
+// number; 0 asks for the first unused number at or above the new track
+// count. Returns false and leaves the list untouched when |track| is NULL,
+// |number| is negative or above 126, the requested number is already taken,
+// no number up to 126 is free, the tracks have already been written, or the
+// allocation fails. On success |track| is stored in the list, whose entries
+// are deleted by the destructor.
+bool Tracks::AddTrack(Track* track, int32_t number) {
+  if (track == NULL || number < 0 || wrote_tracks_)
+    return false;
+
+  // This muxer only supports track numbers in the range [1, 126], in
+  // order to be able (to use Matroska integer representation) to
+  // serialize the block header (of which the track number is a part)
+  // for a frame using exactly 4 bytes.
+
+  if (number > 0x7E)
+    return false;
+
+  const uint32_t count = track_entries_size_ + 1;
+  uint32_t track_num = number;
+
+  if (track_num > 0) {
+    // Check to make sure a track does not already have |track_num|.
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
+      if (track_entries_[i]->number() == track_num)
+        return false;
+    }
+  } else {
+    // Find the first available track number, starting at the new count.
+    track_num = count;
+
+    bool unique = false;
+    while (!unique) {
+      unique = true;
+      for (uint32_t i = 0; i < track_entries_size_; ++i) {
+        if (track_entries_[i]->number() == track_num) {
+          ++track_num;
+          unique = false;
+          break;
+        }
+      }
+    }
+
+    // An automatically chosen number must respect the same upper bound as
+    // an explicitly requested one.
+    if (track_num > 0x7E)
+      return false;
+  }
+
+  // All validation is done; only now touch the object's state.
+  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
+  if (!track_entries)
+    return false;
+
+  for (uint32_t i = 0; i < track_entries_size_; ++i) {
+    track_entries[i] = track_entries_[i];
+  }
+
+  delete[] track_entries_;
+
+  track->set_number(track_num);
+
+  track_entries_ = track_entries;
+  track_entries_[track_entries_size_] = track;
+  track_entries_size_ = count;
+  return true;
+}
+
+// Returns the track stored at position |index|, or NULL when the list is
+// empty or |index| is out of range.
+const Track* Tracks::GetTrackByIndex(uint32_t index) const {
+  const bool in_range = track_entries_ != NULL && index < track_entries_size_;
+  return in_range ? track_entries_[index] : NULL;
+}
+
+// Returns the first stored track whose number equals |track_number|, or NULL
+// when no track has that number.
+Track* Tracks::GetTrackByNumber(uint64_t track_number) const {
+  const int32_t total = track_entries_size();
+
+  for (int32_t idx = 0; idx < total; ++idx) {
+    Track* const candidate = track_entries_[idx];
+    if (candidate->number() == track_number)
+      return candidate;
+  }
+
+  return NULL;
+}
+
+// Returns true when a track numbered |track_number| exists and its type is
+// kAudio. Returns false when the type differs or no such track exists
+// (GetTrackByNumber() returns NULL in that case).
+bool Tracks::TrackIsAudio(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  // Guard against an unknown track number before dereferencing.
+  return track != NULL && track->type() == kAudio;
+}
+
+// Returns true when a track numbered |track_number| exists and its type is
+// kVideo. Returns false when the type differs or no such track exists
+// (GetTrackByNumber() returns NULL in that case).
+bool Tracks::TrackIsVideo(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  // Guard against an unknown track number before dereferencing.
+  return track != NULL && track->type() == kVideo;
+}
+
+// Writes the Tracks master element followed by every stored track.
+// Returns false if a track slot is NULL, any write fails, the writer reports
+// a negative position, or the bytes written differ from the summed track
+// sizes. On success the object is marked as written.
+bool Tracks::Write(IMkvWriter* writer) const {
+  const int32_t total = track_entries_size();
+
+  // First pass: add up the encoded size of every track.
+  uint64_t payload = 0;
+  for (int32_t idx = 0; idx < total; ++idx) {
+    const Track* const entry = GetTrackByIndex(idx);
+    if (entry == NULL)
+      return false;
+
+    payload += entry->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTracks, payload))
+    return false;
+
+  const int64_t begin = writer->Position();
+  if (begin < 0)
+    return false;
+
+  // Second pass: emit the tracks. The first pass already rejected NULLs.
+  for (int32_t idx = 0; idx < total; ++idx) {
+    if (!GetTrackByIndex(idx)->Write(writer))
+      return false;
+  }
+
+  const int64_t end = writer->Position();
+  const bool size_matches =
+      end >= 0 && end - begin == static_cast<int64_t>(payload);
+  if (!size_matches)
+    return false;
+
+  wrote_tracks_ = true;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapter Class
+
+// Sets the chapter's string id by handing |id| and the id_ member to
+// StrCpy(); returns whatever StrCpy() reports.
+bool Chapter::set_id(const char* id) {
+  return StrCpy(id, &id_);
+}
+
+// Converts |start_ns| and |end_ns| to timecodes by dividing each by the
+// timecode scale of |segment|'s SegmentInfo, and stores the results.
+void Chapter::set_time(const Segment& segment, uint64_t start_ns,
+                       uint64_t end_ns) {
+  const uint64_t scale = segment.GetSegmentInfo()->timecode_scale();
+
+  start_timecode_ = start_ns / scale;
+  end_timecode_ = end_ns / scale;
+}
+
+// Appends a new display entry to the chapter and fills in its title,
+// language and country, in that order. Returns false if the displays array
+// cannot grow or if any of the three setters fails. The entry stays counted
+// even when a setter fails.
+bool Chapter::add_string(const char* title, const char* language,
+                         const char* country) {
+  if (!ExpandDisplaysArray())
+    return false;
+
+  // Claim the next free slot and reset it before use.
+  Display& entry = displays_[displays_count_];
+  ++displays_count_;
+  entry.Init();
+
+  return entry.set_title(title) && entry.set_language(language) &&
+         entry.set_country(country);
+}
+
+Chapter::Chapter() {
+  // Intentionally empty: members are set up later by Init(), which
+  // Chapters::AddChapter() calls. The ctor exists only because it had to be
+  // declared private (together with the dtor) so that clients cannot create
+  // Chapter instances themselves, a privilege granted only to the Chapters
+  // class. Skipping initialization here also makes arrays of Chapter objects
+  // cheaper to create, since each element is initialized only when it
+  // becomes active in the array.
+}
+
+// Intentionally empty; owned memory is released by Clear().
+Chapter::~Chapter() {
+}
+
+// Puts the chapter into its initial state: no id, zero start and end
+// timecodes, an empty displays array, and a uid obtained from MakeUID(seed).
+void Chapter::Init(unsigned int* seed) {
+  id_ = NULL;
+
+  start_timecode_ = 0;
+  end_timecode_ = 0;
+
+  displays_ = NULL;
+  displays_size_ = 0;
+  displays_count_ = 0;
+
+  uid_ = MakeUID(seed);
+}
+
+// Copies every member of this chapter into |dst| by value. Pointer members
+// are copied as pointers, so afterwards both objects refer to the same id
+// string and displays array.
+void Chapter::ShallowCopy(Chapter* dst) const {
+  // Scalar members.
+  dst->uid_ = uid_;
+  dst->start_timecode_ = start_timecode_;
+  dst->end_timecode_ = end_timecode_;
+
+  // Pointer members and their bookkeeping (shared, not duplicated).
+  dst->id_ = id_;
+  dst->displays_ = displays_;
+  dst->displays_size_ = displays_size_;
+  dst->displays_count_ = displays_count_;
+}
+
+// Resets the id via StrCpy(NULL, ...), clears every active display entry
+// from last to first, and deletes the displays array, leaving the chapter
+// with no displays.
+void Chapter::Clear() {
+  StrCpy(NULL, &id_);
+
+  while (displays_count_ > 0) {
+    --displays_count_;
+    displays_[displays_count_].Clear();
+  }
+
+  delete[] displays_;
+  displays_ = NULL;
+  displays_size_ = 0;
+}
+
+// Makes sure the displays array has room for one more entry. When it is
+// full, a new array of capacity 1 (if currently empty) or twice the current
+// capacity is allocated and the active entries are copied across. Returns
+// false only when that allocation fails.
+bool Chapter::ExpandDisplaysArray() {
+  if (displays_count_ < displays_size_)
+    return true;  // a free slot is already available
+
+  int new_capacity = 1;
+  if (displays_size_ != 0)
+    new_capacity = 2 * displays_size_;
+
+  Display* const grown = new (std::nothrow) Display[new_capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  // Shallow copy: the new entries take over the old entries' pointers.
+  for (int i = 0; i < displays_count_; ++i)
+    grown[i] = displays_[i];
+
+  delete[] displays_;
+  displays_ = grown;
+  displays_size_ = new_capacity;
+
+  return true;
+}
+
+// Computes the encoded size of this chapter's ChapterAtom element and, when
+// |writer| is non-NULL, also writes it.
+// Returns the total atom size (header plus payload). With a writer, returns
+// 0 if any write fails or if the bytes written differ from that size.
+uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
+  // Fixed children first, then one ChapterDisplay per display entry.
+  uint64_t payload = EbmlElementSize(libwebm::kMkvChapterStringUID, id_);
+  payload += EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_));
+  payload += EbmlElementSize(libwebm::kMkvChapterTimeStart,
+                             static_cast<uint64>(start_timecode_));
+  payload += EbmlElementSize(libwebm::kMkvChapterTimeEnd,
+                             static_cast<uint64>(end_timecode_));
+
+  for (int i = 0; i < displays_count_; ++i)
+    payload += displays_[i].WriteDisplay(NULL);
+
+  const uint64_t total =
+      EbmlMasterElementSize(libwebm::kMkvChapterAtom, payload) + payload;
+
+  // A NULL writer means the caller only wants the size.
+  if (!writer)
+    return total;
+
+  const int64_t begin = writer->Position();
+
+  const bool wrote_fixed_children =
+      WriteEbmlMasterElement(writer, libwebm::kMkvChapterAtom, payload) &&
+      WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_) &&
+      WriteEbmlElement(writer, libwebm::kMkvChapterUID,
+                       static_cast<uint64>(uid_)) &&
+      WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart,
+                       static_cast<uint64>(start_timecode_)) &&
+      WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd,
+                       static_cast<uint64>(end_timecode_));
+  if (!wrote_fixed_children)
+    return 0;
+
+  for (int i = 0; i < displays_count_; ++i) {
+    // WriteDisplay() returns 0 on failure.
+    if (displays_[i].WriteDisplay(writer) == 0)
+      return 0;
+  }
+
+  const int64_t end = writer->Position();
+
+  if (end >= begin && uint64_t(end - begin) != total)
+    return 0;
+
+  return total;
+}
+
+// Marks all three strings as unset. Nothing is freed here; the pointers are
+// simply overwritten with NULL.
+void Chapter::Display::Init() {
+  country_ = NULL;
+  language_ = NULL;
+  title_ = NULL;
+}
+
+// Resets the title, language and country strings by calling StrCpy() with a
+// NULL source for each of them.
+void Chapter::Display::Clear() {
+  char** const strings[] = {&title_, &language_, &country_};
+
+  for (char** const target : strings)
+    StrCpy(NULL, target);
+}
+
+// The three setters below hand the argument and the matching member to
+// StrCpy() and return its result.
+
+// Sets the display title.
+bool Chapter::Display::set_title(const char* title) {
+  const bool copied = StrCpy(title, &title_);
+  return copied;
+}
+
+// Sets the display language.
+bool Chapter::Display::set_language(const char* language) {
+  const bool copied = StrCpy(language, &language_);
+  return copied;
+}
+
+// Sets the display country.
+bool Chapter::Display::set_country(const char* country) {
+  const bool copied = StrCpy(country, &country_);
+  return copied;
+}
+
+// Computes the encoded size of this ChapterDisplay element and, when
+// |writer| is non-NULL, also writes it. The title is always included;
+// language and country only when set.
+// Returns the total element size (header plus payload). With a writer,
+// returns 0 if any write fails or the bytes written differ from that size.
+uint64_t Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+  const bool has_language = language_ != NULL;
+  const bool has_country = country_ != NULL;
+
+  uint64_t payload = EbmlElementSize(libwebm::kMkvChapString, title_);
+  if (has_language)
+    payload += EbmlElementSize(libwebm::kMkvChapLanguage, language_);
+  if (has_country)
+    payload += EbmlElementSize(libwebm::kMkvChapCountry, country_);
+
+  const uint64_t total =
+      EbmlMasterElementSize(libwebm::kMkvChapterDisplay, payload) + payload;
+
+  // A NULL writer means the caller only wants the size.
+  if (!writer)
+    return total;
+
+  const int64_t begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterDisplay, payload))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapString, title_))
+    return 0;
+
+  if (has_language &&
+      !WriteEbmlElement(writer, libwebm::kMkvChapLanguage, language_)) {
+    return 0;
+  }
+
+  if (has_country &&
+      !WriteEbmlElement(writer, libwebm::kMkvChapCountry, country_)) {
+    return 0;
+  }
+
+  const int64_t end = writer->Position();
+
+  if (end >= begin && uint64_t(end - begin) != total)
+    return 0;
+
+  return total;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapters Class
+
+// Starts with no chapter storage allocated.
+Chapters::Chapters()
+    : chapters_size_(0),
+      chapters_count_(0),
+      chapters_(NULL) {
+}
+
+// Clears every active chapter, last to first, then deletes the array.
+Chapters::~Chapters() {
+  while (chapters_count_ > 0) {
+    --chapters_count_;
+    chapters_[chapters_count_].Clear();
+  }
+
+  delete[] chapters_;
+  chapters_ = NULL;
+}
+
+// Returns the number of chapters added so far.
+int Chapters::Count() const {
+  return chapters_count_;
+}
+
+// Claims the next slot in the chapters array, initializes it with |seed| and
+// returns a pointer to it. Returns NULL when the array cannot grow.
+Chapter* Chapters::AddChapter(unsigned int* seed) {
+  if (!ExpandChaptersArray())
+    return NULL;
+
+  Chapter* const slot = &chapters_[chapters_count_];
+  ++chapters_count_;
+  slot->Init(seed);
+
+  return slot;
+}
+
+// Writes the Chapters master element containing a single edition.
+// Returns false when |writer| is NULL, when any write fails, or when the
+// bytes written for the edition differ from its computed size.
+bool Chapters::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // Passing NULL asks WriteEdition() for the size only.
+  const uint64_t edition_bytes = WriteEdition(NULL);
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapters, edition_bytes))
+    return false;
+
+  const int64_t begin = writer->Position();
+
+  // WriteEdition() returns 0 on failure.
+  if (WriteEdition(writer) == 0)
+    return false;
+
+  const int64_t end = writer->Position();
+
+  const bool size_mismatch =
+      end >= begin && uint64_t(end - begin) != edition_bytes;
+  return !size_mismatch;
+}
+
+// Makes sure the chapters array has room for one more entry. When it is
+// full, a new array of capacity 1 (if currently empty) or twice the current
+// capacity is allocated and the active chapters are shallow-copied across.
+// Returns false only when that allocation fails.
+bool Chapters::ExpandChaptersArray() {
+  if (chapters_count_ < chapters_size_)
+    return true;  // a free slot is already available
+
+  int new_capacity = 1;
+  if (chapters_size_ != 0)
+    new_capacity = 2 * chapters_size_;
+
+  Chapter* const grown = new (std::nothrow) Chapter[new_capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  // The new elements take over the old elements' pointers.
+  for (int i = 0; i < chapters_count_; ++i)
+    chapters_[i].ShallowCopy(&grown[i]);
+
+  delete[] chapters_;
+  chapters_ = grown;
+  chapters_size_ = new_capacity;
+
+  return true;
+}
+
+// Computes the encoded size of the EditionEntry element holding all chapter
+// atoms and, when |writer| is non-NULL, also writes it.
+// Returns the total edition size (header plus payload). With a writer,
+// returns 0 if any write fails or the bytes written differ from that size.
+uint64_t Chapters::WriteEdition(IMkvWriter* writer) const {
+  // Size pass: a NULL writer makes WriteAtom() return the size only.
+  uint64_t payload = 0;
+  for (int i = 0; i < chapters_count_; ++i)
+    payload += chapters_[i].WriteAtom(NULL);
+
+  const uint64_t total =
+      EbmlMasterElementSize(libwebm::kMkvEditionEntry, payload) + payload;
+
+  if (!writer)  // return size only
+    return total;
+
+  const int64_t begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvEditionEntry, payload))
+    return 0;  // error
+
+  for (int i = 0; i < chapters_count_; ++i) {
+    // WriteAtom() returns 0 on failure.
+    if (chapters_[i].WriteAtom(writer) == 0)
+      return 0;
+  }
+
+  const int64_t end = writer->Position();
+
+  if (end >= begin && uint64_t(end - begin) != total)
+    return 0;
+
+  return total;
+}
+
+// Tag Class
+
+// Appends a new simple tag and fills in its name and string, in that order.
+// Returns false if the array cannot grow or if either setter fails. The
+// entry stays counted even when a setter fails.
+bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) {
+  if (!ExpandSimpleTagsArray())
+    return false;
+
+  // Claim the next free slot and reset it before use.
+  SimpleTag& entry = simple_tags_[simple_tags_count_];
+  ++simple_tags_count_;
+  entry.Init();
+
+  return entry.set_tag_name(tag_name) && entry.set_tag_string(tag_string);
+}
+
+// Starts with no simple-tag storage allocated.
+Tag::Tag() {
+  simple_tags_count_ = 0;
+  simple_tags_size_ = 0;
+  simple_tags_ = NULL;
+}
+
+// Intentionally empty; owned memory is released by Clear().
+Tag::~Tag() {
+}
+
+// Copies the simple-tags pointer and its bookkeeping into |dst|. The array
+// itself is not duplicated, so both objects refer to the same storage.
+void Tag::ShallowCopy(Tag* dst) const {
+  Tag& target = *dst;
+
+  target.simple_tags_ = simple_tags_;
+  target.simple_tags_size_ = simple_tags_size_;
+  target.simple_tags_count_ = simple_tags_count_;
+}
+
+// Clears every active simple tag, last to first, then deletes the array,
+// leaving the tag with no simple tags.
+void Tag::Clear() {
+  while (simple_tags_count_ > 0) {
+    --simple_tags_count_;
+    simple_tags_[simple_tags_count_].Clear();
+  }
+
+  delete[] simple_tags_;
+  simple_tags_ = NULL;
+  simple_tags_size_ = 0;
+}
+
+// Makes sure the simple-tags array has room for one more entry. When it is
+// full, a new array of capacity 1 (if currently empty) or twice the current
+// capacity is allocated and the active entries are copied across. Returns
+// false only when that allocation fails.
+bool Tag::ExpandSimpleTagsArray() {
+  if (simple_tags_count_ < simple_tags_size_)
+    return true;  // a free slot is already available
+
+  int new_capacity = 1;
+  if (simple_tags_size_ != 0)
+    new_capacity = 2 * simple_tags_size_;
+
+  SimpleTag* const grown =
+      new (std::nothrow) SimpleTag[new_capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  // Shallow copy: the new entries take over the old entries' pointers.
+  for (int i = 0; i < simple_tags_count_; ++i)
+    grown[i] = simple_tags_[i];
+
+  delete[] simple_tags_;
+  simple_tags_ = grown;
+  simple_tags_size_ = new_capacity;
+
+  return true;
+}
+
+// Computes the encoded size of this Tag element and, when |writer| is
+// non-NULL, also writes it with all of its simple tags.
+// Returns the total element size (header plus payload). With a writer,
+// returns 0 if any write fails or the bytes written differ from that size.
+uint64_t Tag::Write(IMkvWriter* writer) const {
+  // Size pass: a NULL writer makes SimpleTag::Write() return the size only.
+  uint64_t payload = 0;
+  for (int i = 0; i < simple_tags_count_; ++i)
+    payload += simple_tags_[i].Write(NULL);
+
+  const uint64_t total =
+      EbmlMasterElementSize(libwebm::kMkvTag, payload) + payload;
+
+  if (!writer)
+    return total;
+
+  const int64_t begin = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTag, payload))
+    return 0;
+
+  for (int i = 0; i < simple_tags_count_; ++i) {
+    // SimpleTag::Write() returns 0 on failure.
+    if (simple_tags_[i].Write(writer) == 0)
+      return 0;
+  }
+
+  const int64_t end = writer->Position();
+
+  if (end >= begin && uint64_t(end - begin) != total)
+    return 0;
+
+  return total;
+}
+
+// Tag::SimpleTag
+
+// Marks both strings as unset. Nothing is freed here; the pointers are
+// simply overwritten with NULL.
+void Tag::SimpleTag::Init() {
+  tag_string_ = NULL;
+  tag_name_ = NULL;
+}
+
+// Resets the tag name and tag string by calling StrCpy() with a NULL source
+// for each of them.
+void Tag::SimpleTag::Clear() {
+  char** const strings[] = {&tag_name_, &tag_string_};
+
+  for (char** const target : strings)
+    StrCpy(NULL, target);
+}
+
+// Both setters hand the argument and the matching member to StrCpy() and
+// return its result.
+
+// Sets the simple tag's name.
+bool Tag::SimpleTag::set_tag_name(const char* tag_name) {
+  const bool copied = StrCpy(tag_name, &tag_name_);
+  return copied;
+}
+
+// Sets the simple tag's string value.
+bool Tag::SimpleTag::set_tag_string(const char* tag_string) {
+  const bool copied = StrCpy(tag_string, &tag_string_);
+  return copied;
+}
+
+// Computes the encoded size of this SimpleTag element (name plus string)
+// and, when |writer| is non-NULL, also writes it.
+// Returns the total element size (header plus payload). With a writer,
+// returns 0 if any write fails or the bytes written differ from that size.
+uint64_t Tag::SimpleTag::Write(IMkvWriter* writer) const {
+  const uint64_t name_bytes = EbmlElementSize(libwebm::kMkvTagName, tag_name_);
+  const uint64_t string_bytes =
+      EbmlElementSize(libwebm::kMkvTagString, tag_string_);
+  const uint64_t payload = name_bytes + string_bytes;
+
+  const uint64_t total =
+      EbmlMasterElementSize(libwebm::kMkvSimpleTag, payload) + payload;
+
+  // A NULL writer means the caller only wants the size.
+  if (!writer)
+    return total;
+
+  const int64_t begin = writer->Position();
+
+  const bool written =
+      WriteEbmlMasterElement(writer, libwebm::kMkvSimpleTag, payload) &&
+      WriteEbmlElement(writer, libwebm::kMkvTagName, tag_name_) &&
+      WriteEbmlElement(writer, libwebm::kMkvTagString, tag_string_);
+  if (!written)
+    return 0;
+
+  const int64_t end = writer->Position();
+
+  if (end >= begin && uint64_t(end - begin) != total)
+    return 0;
+
+  return total;
+}
+
+// Tags Class
+
+Tags::Tags()
+    : tags_size_(0),
+      tags_count_(0),
+      tags_(NULL) {
+  // No storage is allocated until the first AddTag() call.
+}
+
+Tags::~Tags() {
+  // Clear the in-use tags from last to first, then free the array.
+  while (tags_count_ > 0) {
+    --tags_count_;
+    tags_[tags_count_].Clear();
+  }
+
+  delete[] tags_;
+  tags_ = NULL;
+}
+
+int Tags::Count() const {
+  // Number of tags handed out by AddTag().
+  return tags_count_;
+}
+
+Tag* Tags::AddTag() {
+  // Make room first; when that fails nothing is added and NULL is returned.
+  if (ExpandTagsArray()) {
+    Tag* const slot = &tags_[tags_count_];
+    ++tags_count_;
+    return slot;
+  }
+
+  return NULL;
+}
+
+bool Tags::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // Measuring pass: Tag::Write with a NULL writer only returns a size.
+  uint64_t children_size = 0;
+  for (int i = 0; i < tags_count_; ++i)
+    children_size += tags_[i].Write(NULL);
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTags, children_size))
+    return false;
+
+  const int64_t begin_pos = writer->Position();
+
+  for (int i = 0; i < tags_count_; ++i) {
+    // Tag::Write reports failure as a zero size.
+    if (tags_[i].Write(writer) == 0)
+      return false;
+  }
+
+  const int64_t end_pos = writer->Position();
+
+  // Fails when the bytes emitted for the children disagree with the size
+  // announced in the master element header.
+  const bool size_mismatch =
+      end_pos >= begin_pos && uint64_t(end_pos - begin_pos) != children_size;
+  return !size_mismatch;
+}
+
+bool Tags::ExpandTagsArray() {
+  // Growth is only needed once every allocated slot is in use.
+  if (tags_count_ < tags_size_)
+    return true;
+
+  // First allocation holds one entry; afterwards the capacity doubles.
+  int new_capacity = 1;
+  if (tags_size_ != 0)
+    new_capacity = 2 * tags_size_;
+
+  Tag* const grown = new (std::nothrow) Tag[new_capacity];  // NOLINT
+  if (!grown)
+    return false;
+
+  // Carry the existing tags over via Tag::ShallowCopy.
+  int i = 0;
+  while (i < tags_count_) {
+    tags_[i].ShallowCopy(grown + i);
+    ++i;
+  }
+
+  delete[] tags_;
+  tags_size_ = new_capacity;
+  tags_ = grown;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cluster class
+
+Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,  // Stores the arguments; no I/O happens here.
+ bool write_last_frame_with_duration, bool fixed_size_timecode)
+ : blocks_added_(0),  // incremented by PostWriteBlock()
+ finalized_(false),
+ fixed_size_timecode_(fixed_size_timecode),  // selects an 8-byte timecode in WriteClusterHeader()
+ header_written_(false),  // set by WriteClusterHeader()
+ payload_size_(0),
+ position_for_cues_(cues_pos),
+ size_position_(-1),  // -1 until WriteClusterHeader() records it; Finalize() fails while it is -1
+ timecode_(timecode),
+ timecode_scale_(timecode_scale),
+ write_last_frame_with_duration_(write_last_frame_with_duration),  // when set, frames are queued in stored_frames_
+ writer_(NULL) {}  // supplied later through Init()
+
+Cluster::~Cluster() {
+  // Frames can still be queued here if the Cluster was never Finalized;
+  // release them one track at a time.
+  while (!stored_frames_.empty()) {
+    std::list<Frame*>& queue = stored_frames_.begin()->second;
+    for (; !queue.empty(); queue.pop_front())
+      delete queue.front();
+    stored_frames_.erase(stored_frames_.begin());
+  }
+}
+
+bool Cluster::Init(IMkvWriter* ptr_writer) {
+  // A writer is mandatory; remember it for all later output.
+  if (ptr_writer == NULL)
+    return false;
+
+  writer_ = ptr_writer;
+  return true;
+}
+
+bool Cluster::AddFrame(const Frame* const frame) {
+  // Validation and the queue-or-write decision live in QueueOrWriteFrame.
+  const bool accepted = QueueOrWriteFrame(frame);
+  return accepted;
+}
+
+bool Cluster::AddFrame(const uint8_t* data, uint64_t length,
+                       uint64_t track_number, uint64_t abs_timecode,
+                       bool is_key) {
+  // Wrap the raw buffer in a temporary Frame and hand it on.
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(abs_timecode);
+    temp_frame.set_is_key(is_key);
+    return QueueOrWriteFrame(&temp_frame);
+  }
+  return false;
+}
+
+bool Cluster::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                                     const uint8_t* additional,
+                                     uint64_t additional_length,
+                                     uint64_t add_id, uint64_t track_number,
+                                     uint64_t abs_timecode, bool is_key) {
+  // The additional payload is mandatory for this entry point.
+  if (additional == NULL || additional_length == 0)
+    return false;
+
+  Frame temp_frame;
+  if (!temp_frame.Init(data, length))
+    return false;
+  if (!temp_frame.AddAdditionalData(additional, additional_length, add_id))
+    return false;
+
+  temp_frame.set_track_number(track_number);
+  temp_frame.set_timestamp(abs_timecode);
+  temp_frame.set_is_key(is_key);
+  return QueueOrWriteFrame(&temp_frame);
+}
+
+bool Cluster::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                         int64_t discard_padding,
+                                         uint64_t track_number,
+                                         uint64_t abs_timecode, bool is_key) {
+  // Wrap the raw buffer in a temporary Frame carrying the discard padding.
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_discard_padding(discard_padding);
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(abs_timecode);
+    temp_frame.set_is_key(is_key);
+    return QueueOrWriteFrame(&temp_frame);
+  }
+  return false;
+}
+
+bool Cluster::AddMetadata(const uint8_t* data, uint64_t length,
+                          uint64_t track_number, uint64_t abs_timecode,
+                          uint64_t duration_timecode) {
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(abs_timecode);
+    temp_frame.set_duration(duration_timecode);
+    // All metadata blocks are keyframes.
+    temp_frame.set_is_key(true);
+    return QueueOrWriteFrame(&temp_frame);
+  }
+  return false;
+}
+
+void Cluster::AddPayloadSize(uint64_t size) {
+  // Running total of the bytes written inside this cluster.
+  payload_size_ += size;
+}
+
+bool Cluster::Finalize() {
+  // This overload is rejected in last-frame-with-duration mode.
+  if (write_last_frame_with_duration_)
+    return false;
+
+  return Finalize(false, 0);
+}
+
+bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) {
+  if (finalized_ || !writer_)
+    return false;
+
+  // Drain the held-back frames. Each round picks the front frame with the
+  // smallest timestamp across all tracks, i.e. a k-way merge by timestamp.
+  while (write_last_frame_with_duration_ && !stored_frames_.empty()) {
+    FrameMapIterator track_it = stored_frames_.begin();
+    Frame* earliest = track_it->second.front();
+    for (++track_it; track_it != stored_frames_.end(); ++track_it) {
+      Frame* const candidate = track_it->second.front();
+      if (candidate->timestamp() < earliest->timestamp())
+        earliest = candidate;
+    }
+
+    const uint64_t track = earliest->track_number();
+    std::list<Frame*>& queue = stored_frames_[track];
+
+    // Only the final queued frame of a track gets the computed duration, and
+    // only when the frame does not already carry one.
+    if (set_last_frame_duration && queue.size() == 1 &&
+        !earliest->duration_set()) {
+      earliest->set_duration(duration - earliest->timestamp());
+      if (!earliest->is_key() && !earliest->reference_block_timestamp_set())
+        earliest->set_reference_block_timestamp(last_block_timestamp_[track]);
+    }
+
+    // Write, then unlink and free the frame whether or not the write worked.
+    const bool wrote = DoWriteFrame(earliest);
+    queue.pop_front();
+    if (queue.empty())
+      stored_frames_.erase(track);
+    delete earliest;
+    if (!wrote)
+      return false;
+  }
+
+  // The size field position is unknown until the cluster header is written.
+  if (size_position_ == -1)
+    return false;
+
+  if (writer_->Seekable()) {
+    // Patch the real payload size over the placeholder, then seek back.
+    const int64_t resume_pos = writer_->Position();
+    if (writer_->Position(size_position_) ||
+        WriteUIntSize(writer_, payload_size(), 8) ||
+        writer_->Position(resume_pos)) {
+      return false;
+    }
+  }
+
+  finalized_ = true;
+  return true;
+}
+
+uint64_t Cluster::Size() const {
+  // Header size is computed for the largest possible size value, then the
+  // payload written so far is added.
+  const uint64_t header_size =
+      EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL);
+  return header_size + payload_size_;
+}
+
+bool Cluster::PreWriteBlock() {
+  // No blocks may be added once the cluster is finalized.
+  if (finalized_)
+    return false;
+
+  // The cluster header is written on demand before the first block.
+  return header_written_ || WriteClusterHeader();
+}
+
+void Cluster::PostWriteBlock(uint64_t element_size) {
+  // Account for the block that was just written.
+  ++blocks_added_;
+  payload_size_ += element_size;
+}
+
+int64_t Cluster::GetRelativeTimecode(int64_t abs_timecode) const {
+  // Offset of |abs_timecode| from this cluster's timecode; -1 when it is
+  // negative or larger than kMaxBlockTimecode.
+  const int64_t base = this->Cluster::timecode();
+  const int64_t offset = abs_timecode - base;
+
+  const bool out_of_range = offset < 0 || offset > kMaxBlockTimecode;
+  if (out_of_range)
+    return -1;
+
+  return offset;
+}
+
+bool Cluster::DoWriteFrame(const Frame* const frame) {
+  const bool usable = frame && frame->IsValid();
+  if (!usable || !PreWriteBlock())
+    return false;
+
+  // WriteFrame returns the number of bytes written, 0 on failure.
+  const uint64_t written = WriteFrame(writer_, frame, this);
+  if (written == 0)
+    return false;
+
+  PostWriteBlock(written);
+  // Remember the timestamp of the newest block written for this track.
+  last_block_timestamp_[frame->track_number()] = frame->timestamp();
+  return true;
+}
+
+// Either writes |frame| immediately or, in |write_last_frame_with_duration_|
+// mode, queues a private copy and flushes every older frame of the same
+// track that no other track's queue is still ahead of.
+//
+// Returns false when |frame| is NULL or invalid, when the copy cannot be
+// allocated or made, or when writing a frame fails. A frame whose write
+// failed has already been removed from |stored_frames_| and freed.
+bool Cluster::QueueOrWriteFrame(const Frame* const frame) {
+  if (!frame || !frame->IsValid())
+    return false;
+
+  // If |write_last_frame_with_duration_| is not set, then write the frame right
+  // away.
+  if (!write_last_frame_with_duration_) {
+    return DoWriteFrame(frame);
+  }
+
+  // Queue a copy of the current frame. The copy is made before the track's
+  // list is touched so a failure never leaves an empty list in the map
+  // (Finalize() calls front() on every list it finds).
+  const uint64_t track_number = frame->track_number();
+  Frame* const frame_to_store = new (std::nothrow) Frame();  // NOLINT
+  if (!frame_to_store || !frame_to_store->CopyFrom(*frame)) {
+    delete frame_to_store;
+    return false;
+  }
+  std::list<Frame*>& track_frames = stored_frames_[track_number];
+  track_frames.push_back(frame_to_store);
+
+  // Walk the queued frames of this track, oldest first, stopping before the
+  // newest one. A frame may be written only if no other track holds back a
+  // frame with a timestamp smaller than the frame in question.
+  std::list<Frame*>::iterator current = track_frames.begin();
+  const std::list<Frame*>::iterator newest = --track_frames.end();
+  while (current != newest) {
+    const Frame* const frame_to_write = *current;
+    bool okay_to_write = true;
+    for (FrameMapIterator track_iterator = stored_frames_.begin();
+         track_iterator != stored_frames_.end(); ++track_iterator) {
+      if (track_iterator->first == track_number) {
+        continue;
+      }
+      if (track_iterator->second.front()->timestamp() <
+          frame_to_write->timestamp()) {
+        okay_to_write = false;
+        break;
+      }
+    }
+    if (!okay_to_write)
+      break;
+
+    const bool wrote_frame = DoWriteFrame(frame_to_write);
+    delete frame_to_write;
+    // Unlink the freed frame right away, including on failure, so the list
+    // never holds a dangling pointer for ~Cluster() to delete a second time.
+    // std::list::erase leaves |newest| valid.
+    current = track_frames.erase(current);
+    if (!wrote_frame)
+      return false;
+  }
+  return true;
+}
+
+bool Cluster::WriteClusterHeader() {
+  if (finalized_)
+    return false;
+
+  // WriteID reports failure with a non-zero result.
+  if (WriteID(writer_, libwebm::kMkvCluster) != 0)
+    return false;
+
+  // Remember where the size field lives so Finalize() can patch it.
+  size_position_ = writer_->Position();
+
+  // The cluster size is not known yet, so an 8-byte "unknown" (EBML coded
+  // -1) placeholder is written in its place.
+  if (SerializeInt(writer_, kEbmlUnknownValue, 8) != 0)
+    return false;
+
+  // 8 forces a fixed-width timecode; 0 is passed otherwise.
+  const int timecode_width = fixed_size_timecode_ ? 8 : 0;
+  if (!WriteEbmlElement(writer_, libwebm::kMkvTimecode, timecode(),
+                        timecode_width)) {
+    return false;
+  }
+
+  AddPayloadSize(
+      EbmlElementSize(libwebm::kMkvTimecode, timecode(), timecode_width));
+  header_written_ = true;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SeekHead Class
+
+SeekHead::SeekHead() : start_pos_(0ULL) {
+  // An id of 0 marks a seek entry slot as unused.
+  int32_t slot = 0;
+  while (slot < kSeekEntryCount) {
+    seek_entry_pos_[slot] = 0;
+    seek_entry_id_[slot] = 0;
+    ++slot;
+  }
+}
+
+SeekHead::~SeekHead() {}  // No explicit cleanup is performed.
+
+bool SeekHead::Finalize(IMkvWriter* writer) const {
+  // Nothing can be patched in place on a non-seekable writer.
+  if (!writer->Seekable())
+    return true;
+
+  if (start_pos_ == -1)
+    return false;
+
+  // Size every used entry. Slots with id 0 are unused and left unset.
+  uint64_t payload_size = 0;
+  uint64_t entry_size[kSeekEntryCount];
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    if (seek_entry_id_[i] == 0)
+      continue;
+
+    const uint64_t id_size = EbmlElementSize(
+        libwebm::kMkvSeekID, static_cast<uint64>(seek_entry_id_[i]));
+    const uint64_t pos_size = EbmlElementSize(
+        libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i]));
+    entry_size[i] = id_size + pos_size;
+
+    payload_size +=
+        EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) + entry_size[i];
+  }
+
+  // No SeekHead elements.
+  if (payload_size == 0)
+    return true;
+
+  // Jump back to the space reserved by Write() and emit the real SeekHead.
+  const int64_t resume_pos = writer->Position();
+  if (writer->Position(start_pos_))
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeekHead, payload_size))
+    return false;
+
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    if (seek_entry_id_[i] == 0)
+      continue;
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeek, entry_size[i]) ||
+        !WriteEbmlElement(writer, libwebm::kMkvSeekID,
+                          static_cast<uint64>(seek_entry_id_[i])) ||
+        !WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
+                          static_cast<uint64>(seek_entry_pos_[i]))) {
+      return false;
+    }
+  }
+
+  // Fill the rest of the reserved region with a Void element.
+  const uint64_t total_entry_size = kSeekEntryCount * MaxEntrySize();
+  const uint64_t total_size =
+      EbmlMasterElementSize(libwebm::kMkvSeekHead, total_entry_size) +
+      total_entry_size;
+  const int64_t size_left = total_size - (writer->Position() - start_pos_);
+
+  if (WriteVoidElement(writer, size_left) == 0)
+    return false;
+
+  if (writer->Position(resume_pos))
+    return false;
+
+  return true;
+}
+
+bool SeekHead::Write(IMkvWriter* writer) {
+  // Reserve room for a SeekHead holding kSeekEntryCount maximum-size
+  // entries by writing a Void element of that size.
+  const uint64_t entries_size = kSeekEntryCount * MaxEntrySize();
+  const uint64_t reserved_size =
+      EbmlMasterElementSize(libwebm::kMkvSeekHead, entries_size) + entries_size;
+
+  // Finalize() seeks back to this position.
+  start_pos_ = writer->Position();
+
+  return WriteVoidElement(writer, reserved_size) != 0;
+}
+
+bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
+  // Locate the first unused slot (id == 0).
+  int32_t slot = 0;
+  while (slot < kSeekEntryCount && seek_entry_id_[slot] != 0)
+    ++slot;
+
+  // Every slot is taken.
+  if (slot >= kSeekEntryCount)
+    return false;
+
+  seek_entry_id_[slot] = id;
+  seek_entry_pos_[slot] = pos;
+  return true;
+}
+
+uint32_t SeekHead::GetId(int index) const {
+  // UINT_MAX is returned for an out-of-range index.
+  const bool in_range = index >= 0 && index < kSeekEntryCount;
+  if (in_range)
+    return seek_entry_id_[index];
+
+  return UINT_MAX;
+}
+
+uint64_t SeekHead::GetPosition(int index) const {
+  // ULLONG_MAX is returned for an out-of-range index.
+  const bool in_range = index >= 0 && index < kSeekEntryCount;
+  if (in_range)
+    return seek_entry_pos_[index];
+
+  return ULLONG_MAX;
+}
+
+bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
+  // Overwrites the slot at |index|; an out-of-range index is rejected.
+  const bool in_range = index >= 0 && index < kSeekEntryCount;
+  if (in_range) {
+    seek_entry_id_[index] = id;
+    seek_entry_pos_[index] = position;
+  }
+  return in_range;
+}
+
+uint64_t SeekHead::MaxEntrySize() const {
+  // Size of one Seek entry when the id is 0xffffffff and the position is
+  // 0xffffffffffffffff.
+  const uint64_t id_size = EbmlElementSize(
+      libwebm::kMkvSeekID, static_cast<uint64>(UINT64_C(0xffffffff)));
+  const uint64_t position_size =
+      EbmlElementSize(libwebm::kMkvSeekPosition,
+                      static_cast<uint64>(UINT64_C(0xffffffffffffffff)));
+  const uint64_t payload = id_size + position_size;
+
+  return EbmlMasterElementSize(libwebm::kMkvSeek, payload) + payload;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SegmentInfo Class
+
+SegmentInfo::SegmentInfo()
+ : duration_(-1.0),  // Write()/Finalize() only emit a Duration when this is > 0.0
+ muxing_app_(NULL),  // allocated by Init() or set_muxing_app()
+ timecode_scale_(1000000ULL),
+ writing_app_(NULL),  // allocated by set_writing_app()
+ date_utc_(LLONG_MIN),  // LLONG_MIN means "no DateUTC": Write() skips the element
+ duration_pos_(-1) {}  // -1 until Write() records where the Duration element starts
+
+SegmentInfo::~SegmentInfo() {
+  // Both strings are heap arrays owned by this object.
+  delete[] writing_app_;
+  delete[] muxing_app_;
+}
+
+bool SegmentInfo::Init() {
+  // Build the "libwebm-<major>.<minor>.<build>.<revision>" identification
+  // string, used for both the muxing app and the writing app.
+  int32_t major;
+  int32_t minor;
+  int32_t build;
+  int32_t revision;
+  GetVersion(&major, &minor, &build, &revision);
+
+  char version_text[256];
+  const size_t version_text_len = sizeof(version_text) / sizeof(version_text[0]);
+#ifdef _MSC_VER
+  sprintf_s(version_text, version_text_len, "libwebm-%d.%d.%d.%d", major, minor,
+            build, revision);
+#else
+  snprintf(version_text, version_text_len, "libwebm-%d.%d.%d.%d", major, minor,
+           build, revision);
+#endif
+
+  // Replace any previous muxing app string with a fresh copy.
+  const size_t copy_len = strlen(version_text) + 1;
+  delete[] muxing_app_;
+  muxing_app_ = new (std::nothrow) char[copy_len];  // NOLINT
+  if (muxing_app_ == NULL)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(muxing_app_, copy_len, version_text);
+#else
+  strcpy(muxing_app_, version_text);
+#endif
+
+  // set_writing_app returns nothing, so success is judged from the member.
+  set_writing_app(version_text);
+  return writing_app_ != NULL;
+}
+
+bool SegmentInfo::Finalize(IMkvWriter* writer) const {
+  if (writer == NULL)
+    return false;
+
+  // Without a positive duration or a seekable writer there is nothing to
+  // rewrite.
+  if (!(duration_ > 0.0) || !writer->Seekable())
+    return true;
+
+  // Write() must have recorded where the Duration element lives.
+  if (duration_pos_ == -1)
+    return false;
+
+  // Overwrite the Duration element in place, then return to where we were.
+  const int64_t resume_pos = writer->Position();
+  if (writer->Position(duration_pos_))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                        static_cast<float>(duration_))) {
+    return false;
+  }
+
+  if (writer->Position(resume_pos))
+    return false;
+
+  return true;
+}
+
+// Writes the Info master element: TimecodeScale, an optional Duration, an
+// optional DateUTC, MuxingApp and WritingApp, in that order.
+//
+// Returns false when |writer| or either app string is NULL, when any element
+// fails to write, or when the number of bytes written differs from the size
+// announced in the master element header.
+bool SegmentInfo::Write(IMkvWriter* writer) {
+  if (!writer || !muxing_app_ || !writing_app_)
+    return false;
+
+  // Compute the payload size up front; it goes into the master header.
+  uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale,
+                                  static_cast<uint64>(timecode_scale_));
+  if (duration_ > 0.0)
+    size +=
+        EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
+  if (date_utc_ != LLONG_MIN)
+    size += EbmlDateElementSize(libwebm::kMkvDateUTC);
+  size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
+  size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvInfo, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale,
+                        static_cast<uint64>(timecode_scale_)))
+    return false;
+
+  if (duration_ > 0.0) {
+    // Save for later: Finalize() rewrites the Duration at this position.
+    duration_pos_ = writer->Position();
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                          static_cast<float>(duration_)))
+      return false;
+  }
+
+  // A failed DateUTC write is reported like every other element's instead
+  // of being dropped.
+  if (date_utc_ != LLONG_MIN &&
+      !WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvWritingApp, writing_app_))
+    return false;
+
+  // The bytes written must match the size announced in the header.
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+void SegmentInfo::set_muxing_app(const char* app) {
+  // A NULL argument leaves the current value untouched.
+  if (app == NULL)
+    return;
+
+  // Copy into fresh storage first; the old string is only released once the
+  // new one exists, so an allocation failure keeps the previous value.
+  const size_t buffer_len = strlen(app) + 1;
+  char* const copy = new (std::nothrow) char[buffer_len];  // NOLINT
+  if (copy == NULL)
+    return;
+
+#ifdef _MSC_VER
+  strcpy_s(copy, buffer_len, app);
+#else
+  strcpy(copy, app);
+#endif
+
+  delete[] muxing_app_;
+  muxing_app_ = copy;
+}
+
+void SegmentInfo::set_writing_app(const char* app) {
+  // A NULL argument leaves the current value untouched.
+  if (app == NULL)
+    return;
+
+  // Copy into fresh storage first; the old string is only released once the
+  // new one exists, so an allocation failure keeps the previous value.
+  const size_t buffer_len = strlen(app) + 1;
+  char* const copy = new (std::nothrow) char[buffer_len];  // NOLINT
+  if (copy == NULL)
+    return;
+
+#ifdef _MSC_VER
+  strcpy_s(copy, buffer_len, app);
+#else
+  strcpy(copy, app);
+#endif
+
+  delete[] writing_app_;
+  writing_app_ = copy;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Segment Class
+
+// Builds an empty segment in kFile mode with cue output enabled. The
+// writers stay NULL until Init() supplies them.
+Segment::Segment()
+    : chunk_count_(0),
+      chunk_name_(NULL),
+      chunk_writer_cluster_(NULL),
+      chunk_writer_cues_(NULL),
+      chunk_writer_header_(NULL),
+      chunking_(false),
+      chunking_base_name_(NULL),
+      cluster_list_(NULL),
+      cluster_list_capacity_(0),
+      cluster_list_size_(0),
+      cues_position_(kAfterClusters),
+      cues_track_(0),
+      force_new_cluster_(false),
+      frames_(NULL),
+      frames_capacity_(0),
+      frames_size_(0),
+      has_video_(false),
+      header_written_(false),
+      last_block_duration_(0),
+      last_timestamp_(0),
+      max_cluster_duration_(kDefaultMaxClusterDuration),
+      max_cluster_size_(0),
+      mode_(kFile),
+      new_cuepoint_(false),
+      output_cues_(true),
+      accurate_cluster_duration_(false),
+      fixed_size_cluster_timecode_(false),
+      estimate_file_duration_(false),
+      ebml_header_size_(0),
+      payload_pos_(0),
+      size_position_(0),
+      doc_type_version_(kDefaultDocTypeVersion),
+      doc_type_version_written_(0),
+      duration_(0.0),
+      writer_cluster_(NULL),
+      writer_cues_(NULL),
+      writer_header_(NULL) {
+  // Only the file-mode path of Finalize() sets |cluster_end_offset_|, yet
+  // CopyAndMoveCuesBeforeClusters() reads it, so give it a defined value.
+  // It is assigned here rather than in the initializer list because the
+  // member declaration order is not visible from this file.
+  cluster_end_offset_ = 0;
+
+  // The seed is taken from the current time.
+  const time_t curr_time = time(NULL);
+  seed_ = static_cast<unsigned int>(curr_time);
+#ifdef _WIN32
+  srand(seed_);
+#endif
+}
+
+Segment::~Segment() {
+  // Release every cluster, then the pointer array itself.
+  if (cluster_list_ != NULL) {
+    int32_t i = 0;
+    while (i < cluster_list_size_) {
+      delete cluster_list_[i];
+      ++i;
+    }
+    delete[] cluster_list_;
+  }
+
+  // Same for any frames still queued.
+  if (frames_ != NULL) {
+    int32_t i = 0;
+    while (i < frames_size_) {
+      delete frames_[i];
+      ++i;
+    }
+    delete[] frames_;
+  }
+
+  delete[] chunk_name_;
+  delete[] chunking_base_name_;
+
+  // Chunk writers are closed before they are destroyed.
+  if (chunk_writer_cluster_ != NULL) {
+    chunk_writer_cluster_->Close();
+    delete chunk_writer_cluster_;
+  }
+  if (chunk_writer_cues_ != NULL) {
+    chunk_writer_cues_->Close();
+    delete chunk_writer_cues_;
+  }
+  if (chunk_writer_header_ != NULL) {
+    chunk_writer_header_->Close();
+    delete chunk_writer_header_;
+  }
+}
+
+void Segment::MoveCuesBeforeClustersHelper(uint64_t diff, int32_t index,
+                                           uint64_t* cues_size) {
+  CuePoint* const cue = cues_.GetCueByIndex(index);
+  if (!cue)
+    return;
+
+  // Shift this cue's cluster position by |diff| and see how much the cue
+  // point itself grew as a result.
+  const uint64_t size_before = cue->Size();
+  const uint64_t shifted_cluster_pos = cue->cluster_pos() + diff;
+  cue->set_cluster_pos(shifted_cluster_pos);
+  const uint64_t point_growth = cue->Size() - size_before;
+
+  // A larger total payload may also need a wider coded length for the Cues
+  // element: CodedSize(old + growth) - CodedSize(old).
+  const uint64_t length_growth =
+      GetCodedUIntSize(*cues_size + point_growth) -
+      GetCodedUIntSize(*cues_size);
+  *cues_size += point_growth;
+
+  // Any growth moves the clusters again, so it is pushed through every cue
+  // point as the |diff| of a recursive call.
+  const uint64_t ripple = length_growth + point_growth;
+  if (ripple == 0)
+    return;
+
+  for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
+    MoveCuesBeforeClustersHelper(ripple, i, cues_size);
+}
+
+void Segment::MoveCuesBeforeClusters() {
+  // Every cue point starts out shifted by the current size of the Cues
+  // element; the helper handles any further growth.
+  const uint64_t initial_shift = cues_.Size();
+  const int32_t cue_count = cues_.cue_entries_size();
+
+  uint64_t cue_points_size = 0;
+  for (int32_t i = 0; i < cue_count; ++i)
+    cue_points_size += cues_.GetCueByIndex(i)->Size();
+
+  for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
+    MoveCuesBeforeClustersHelper(initial_shift, i, &cue_points_size);
+
+  // Find the seek entries for Cluster and Cues; the last match wins and
+  // index 0 is used when there is none.
+  int32_t cluster_slot = 0;
+  int32_t cues_slot = 0;
+  for (int32_t i = 0; i < SeekHead::kSeekEntryCount; ++i) {
+    const uint32_t id = seek_head_.GetId(i);
+    if (id == libwebm::kMkvCluster)
+      cluster_slot = i;
+    if (id == libwebm::kMkvCues)
+      cues_slot = i;
+  }
+
+  // Cues take over the position the clusters had; the clusters then follow
+  // the Cues element. The order of these two updates matters.
+  seek_head_.SetSeekEntry(cues_slot, libwebm::kMkvCues,
+                          seek_head_.GetPosition(cluster_slot));
+  seek_head_.SetSeekEntry(cluster_slot, libwebm::kMkvCluster,
+                          cues_.Size() + seek_head_.GetPosition(cues_slot));
+}
+
+bool Segment::Init(IMkvWriter* ptr_writer) {
+  if (ptr_writer == NULL)
+    return false;
+
+  // One writer serves the header, the clusters and the cues.
+  writer_header_ = ptr_writer;
+  writer_cluster_ = ptr_writer;
+  writer_cues_ = ptr_writer;
+
+  // Zero the per-track bookkeeping arrays.
+  memset(&track_frames_written_, 0,
+         sizeof(track_frames_written_[0]) * kMaxTrackNumber);
+  memset(&last_track_timestamp_, 0,
+         sizeof(last_track_timestamp_[0]) * kMaxTrackNumber);
+
+  return segment_info_.Init();
+}
+
+// Copies an already written segment from |reader| to |writer| with the Cues
+// element placed before the Clusters.
+//
+// Returns false when either argument is NULL, when |writer| is not seekable,
+// when chunking is enabled, when no cluster has been created, or when any
+// copy or write step fails.
+bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+                                            IMkvWriter* writer) {
+  if (!reader || !writer)
+    return false;
+  if (!writer->Seekable() || chunking_)
+    return false;
+  // The first cluster marks where the headers end, so one must exist.
+  if (!cluster_list_ || cluster_list_size_ < 1 || !cluster_list_[0])
+    return false;
+
+  // The cluster starts at its ID, which sits just before the size field.
+  const int64_t cluster_offset =
+      cluster_list_[0]->size_position() - GetUIntSize(libwebm::kMkvCluster);
+
+  // Copy the headers.
+  if (!ChunkedCopy(reader, writer, 0, cluster_offset))
+    return false;
+
+  // Recompute cue positions and seek entries.
+  MoveCuesBeforeClusters();
+
+  // Write cues and seek entries.
+  // TODO(vigneshv): As of now, it's safe to call seek_head_.Finalize() for the
+  // second time with a different writer object. But the name Finalize() doesn't
+  // indicate something we want to call more than once. So consider renaming it
+  // to write() or some such.
+  if (!cues_.Write(writer) || !seek_head_.Finalize(writer))
+    return false;
+
+  // Copy the Clusters.
+  if (!ChunkedCopy(reader, writer, cluster_offset,
+                   cluster_end_offset_ - cluster_offset))
+    return false;
+
+  // Update the Segment size in case the Cues size has changed.
+  const int64_t pos = writer->Position();
+  const int64_t segment_size = writer->Position() - payload_pos_;
+  if (writer->Position(size_position_) ||
+      WriteUIntSize(writer, segment_size, 8) || writer->Position(pos))
+    return false;
+  return true;
+}
+
+bool Segment::Finalize() {
+  // Flush whatever frames are still queued in the segment.
+  if (WriteFramesAll() < 0)
+    return false;
+
+  // In kLive mode the last cluster is only finalized when
+  // |accurate_cluster_duration_| is set; every other mode always does it.
+  const bool finalize_last_cluster =
+      (mode_ != kLive) || accurate_cluster_duration_;
+  if (finalize_last_cluster && cluster_list_size_ > 0) {
+    Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+
+    // The last frame of the last Cluster is not written as a BlockGroup with
+    // Duration unless the frame itself has its duration set explicitly.
+    if (!last_cluster || !last_cluster->Finalize(false, 0))
+      return false;
+  }
+
+  // Everything below applies to file mode only.
+  if (mode_ != kFile)
+    return true;
+
+  if (chunking_ && chunk_writer_cluster_) {
+    chunk_writer_cluster_->Close();
+    ++chunk_count_;
+  }
+
+  // Default duration: end of the last block, in timecode-scale units.
+  double duration =
+      (static_cast<double>(last_timestamp_) + last_block_duration_) /
+      segment_info_.timecode_scale();
+  if (duration_ > 0.0) {
+    // An explicitly set duration wins.
+    duration = duration_;
+  } else if (last_block_duration_ == 0 && estimate_file_duration_) {
+    const int track_count = static_cast<int>(tracks_.track_entries_size());
+    for (int t = 0; t < track_count; ++t) {
+      // At least two frames are needed to derive a per-frame interval.
+      if (track_frames_written_[t] >= 2) {
+        // Estimate the duration for the last block of a Track.
+        const double per_frame =
+            static_cast<double>(last_track_timestamp_[t]) /
+            (track_frames_written_[t] - 1);
+        const double estimate = (last_track_timestamp_[t] + per_frame) /
+                                segment_info_.timecode_scale();
+        if (estimate > duration)
+          duration = estimate;
+      }
+    }
+  }
+  segment_info_.set_duration(duration);
+  if (!segment_info_.Finalize(writer_header_))
+    return false;
+
+  if (output_cues_ && !seek_head_.AddSeekEntry(libwebm::kMkvCues, MaxOffset()))
+    return false;
+
+  if (chunking_) {
+    if (!chunk_writer_cues_)
+      return false;
+
+    char* cues_chunk_name = NULL;
+    if (!UpdateChunkName("cues", &cues_chunk_name))
+      return false;
+
+    const bool opened = chunk_writer_cues_->Open(cues_chunk_name);
+    delete[] cues_chunk_name;
+    if (!opened)
+      return false;
+  }
+
+  cluster_end_offset_ = writer_cluster_->Position();
+
+  // Write the seek headers and cues.
+  if (output_cues_ && !cues_.Write(writer_cues_))
+    return false;
+
+  if (!seek_head_.Finalize(writer_header_))
+    return false;
+
+  if (writer_header_->Seekable()) {
+    if (size_position_ == -1)
+      return false;
+
+    const int64_t segment_size = MaxOffset();
+    if (segment_size < 1)
+      return false;
+
+    const int64_t resume_pos = writer_header_->Position();
+
+    // Rewrite the EBML header when the required DocTypeVersion differs from
+    // the one already written; the new header must have the same size.
+    UpdateDocTypeVersion();
+    if (doc_type_version_ != doc_type_version_written_) {
+      if (writer_header_->Position(0))
+        return false;
+
+      const char* const doc_type =
+          DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska;
+      if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
+        return false;
+      if (writer_header_->Position() != ebml_header_size_)
+        return false;
+
+      doc_type_version_written_ = doc_type_version_;
+    }
+
+    // Patch the final segment size, then go back to where we were.
+    if (writer_header_->Position(size_position_) ||
+        WriteUIntSize(writer_header_, segment_size, 8) ||
+        writer_header_->Position(resume_pos)) {
+      return false;
+    }
+  }
+
+  if (chunking_) {
+    // Do not close any writers until the segment size has been written,
+    // otherwise the size may be off.
+    if (!chunk_writer_cues_ || !chunk_writer_header_)
+      return false;
+
+    chunk_writer_cues_->Close();
+    chunk_writer_header_->Close();
+  }
+
+  return true;
+}
+
+Track* Segment::AddTrack(int32_t number) {
+  Track* const new_track = new (std::nothrow) Track(&seed_);  // NOLINT
+
+  // A track that cannot be registered is destroyed again.
+  if (new_track && !tracks_.AddTrack(new_track, number)) {
+    delete new_track;
+    return NULL;
+  }
+
+  // NULL when the allocation itself failed.
+  return new_track;
+}
+
+Chapter* Segment::AddChapter() {
+  // Forwarded to the chapters container together with the segment's seed.
+  return chapters_.AddChapter(&seed_);
+}
+
+Tag* Segment::AddTag() {
+  // Forwarded to the tags container.
+  return tags_.AddTag();
+}
+
+uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
+  VideoTrack* const video = new (std::nothrow) VideoTrack(&seed_);  // NOLINT
+  if (video == NULL)
+    return 0;
+
+  // New video tracks default to the VP8 codec id.
+  video->set_type(Tracks::kVideo);
+  video->set_codec_id(Tracks::kVp8CodecId);
+  video->set_width(width);
+  video->set_height(height);
+
+  const bool registered = tracks_.AddTrack(video, number);
+  if (!registered) {
+    delete video;
+    return 0;
+  }
+
+  has_video_ = true;
+  return video->number();
+}
+
+bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
+  // The cue always refers to the most recent cluster.
+  if (cluster_list_size_ < 1)
+    return false;
+
+  const Cluster* const current = cluster_list_[cluster_list_size_ - 1];
+  if (current == NULL)
+    return false;
+
+  CuePoint* const cue_point = new (std::nothrow) CuePoint();  // NOLINT
+  if (cue_point == NULL)
+    return false;
+
+  // The cue time is the timestamp divided by the timecode scale.
+  cue_point->set_time(timestamp / segment_info_.timecode_scale());
+  cue_point->set_block_number(current->blocks_added());
+  cue_point->set_cluster_pos(current->position_for_cues());
+  cue_point->set_track(track);
+
+  if (cues_.AddCue(cue_point)) {
+    new_cuepoint_ = false;
+    return true;
+  }
+
+  delete cue_point;
+  return false;
+}
+
+uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
+                                int32_t number) {
+  AudioTrack* const audio = new (std::nothrow) AudioTrack(&seed_);  // NOLINT
+  if (audio == NULL)
+    return 0;
+
+  // New audio tracks default to the Vorbis codec id.
+  audio->set_type(Tracks::kAudio);
+  audio->set_codec_id(Tracks::kVorbisCodecId);
+  audio->set_sample_rate(sample_rate);
+  audio->set_channels(channels);
+
+  const bool registered = tracks_.AddTrack(audio, number);
+  if (!registered) {
+    delete audio;
+    return 0;
+  }
+
+  return audio->number();
+}
+
+bool Segment::AddFrame(const uint8_t* data, uint64_t length,
+                       uint64_t track_number, uint64_t timestamp, bool is_key) {
+  if (data == NULL)
+    return false;
+
+  // Wrap the raw buffer in a temporary Frame and use the generic path.
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(timestamp);
+    temp_frame.set_is_key(is_key);
+    return AddGenericFrame(&temp_frame);
+  }
+  return false;
+}
+
+bool Segment::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                                     const uint8_t* additional,
+                                     uint64_t additional_length,
+                                     uint64_t add_id, uint64_t track_number,
+                                     uint64_t timestamp, bool is_key) {
+  // Both the frame payload and the additional payload are required.
+  if (data == NULL || additional == NULL)
+    return false;
+
+  Frame temp_frame;
+  if (!temp_frame.Init(data, length))
+    return false;
+  if (!temp_frame.AddAdditionalData(additional, additional_length, add_id))
+    return false;
+
+  temp_frame.set_track_number(track_number);
+  temp_frame.set_timestamp(timestamp);
+  temp_frame.set_is_key(is_key);
+  return AddGenericFrame(&temp_frame);
+}
+
+bool Segment::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                         int64_t discard_padding,
+                                         uint64_t track_number,
+                                         uint64_t timestamp, bool is_key) {
+  if (data == NULL)
+    return false;
+
+  // Wrap the raw buffer in a temporary Frame carrying the discard padding.
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_discard_padding(discard_padding);
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(timestamp);
+    temp_frame.set_is_key(is_key);
+    return AddGenericFrame(&temp_frame);
+  }
+  return false;
+}
+
+bool Segment::AddMetadata(const uint8_t* data, uint64_t length,
+                          uint64_t track_number, uint64_t timestamp_ns,
+                          uint64_t duration_ns) {
+  if (data == NULL)
+    return false;
+
+  Frame temp_frame;
+  if (temp_frame.Init(data, length)) {
+    temp_frame.set_track_number(track_number);
+    temp_frame.set_timestamp(timestamp_ns);
+    temp_frame.set_duration(duration_ns);
+    // All metadata blocks are keyframes.
+    temp_frame.set_is_key(true);
+    return AddGenericFrame(&temp_frame);
+  }
+  return false;
+}
+
+// Adds |frame| to the segment: validates it, decides whether a new cluster
+// is needed, queues audio frames when a video track exists, and otherwise
+// writes the frame into the current cluster.
+//
+// Returns false when |frame| is NULL, the header check fails, the timestamp
+// is lower than the last one written, the track number is unknown, an
+// allocation or copy fails, or the cluster or cue point cannot be updated.
+// |frame| is never taken over by this function; only internal copies are
+// created and freed.
+bool Segment::AddGenericFrame(const Frame* frame) {
+  if (!frame)
+    return false;
+
+  if (!CheckHeaderInfo())
+    return false;
+
+  // Check for non-monotonically increasing timestamps.
+  if (frame->timestamp() < last_timestamp_)
+    return false;
+
+  // Check if the track number is valid.
+  if (!tracks_.GetTrackByNumber(frame->track_number()))
+    return false;
+
+  // A frame carrying discard padding sets the DocTypeVersion to 4.
+  if (frame->discard_padding() != 0)
+    doc_type_version_ = 4;
+
+  if (cluster_list_size_ > 0) {
+    const uint64_t timecode_scale = segment_info_.timecode_scale();
+    const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+    const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+    const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+    // A block's timecode is relative to its cluster; once the offset exceeds
+    // kMaxBlockTimecode a new cluster is forced.
+    const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+    if (rel_timecode > kMaxBlockTimecode) {
+      force_new_cluster_ = true;
+    }
+  }
+
+  // If the segment has a video track hold onto audio frames to make sure the
+  // audio that is associated with the start time of a video key-frame is
+  // muxed into the same cluster.
+  if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
+      !force_new_cluster_) {
+    Frame* const new_frame = new (std::nothrow) Frame();
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
+      return false;
+    }
+    if (!QueueFrame(new_frame)) {
+      delete new_frame;
+      return false;
+    }
+    track_frames_written_[frame->track_number() - 1]++;
+    return true;
+  }
+
+  if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(),
+                              frame->is_key())) {
+    return false;
+  }
+
+  if (cluster_list_size_ < 1)
+    return false;
+
+  Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+  if (!cluster)
+    return false;
+
+  // If the Frame is not a SimpleBlock, then set the reference_block_timestamp
+  // if it is not set already. The caller's frame is const, so the change is
+  // made on a private copy, which stays NULL when no copy is needed.
+  Frame* owned_copy = NULL;
+  if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
+      !frame->reference_block_timestamp_set()) {
+    owned_copy = new (std::nothrow) Frame();
+    if (!owned_copy || !owned_copy->CopyFrom(*frame)) {
+      delete owned_copy;
+      return false;
+    }
+    owned_copy->set_reference_block_timestamp(
+        last_track_timestamp_[frame->track_number() - 1]);
+    frame = owned_copy;
+  }
+
+  // Every exit below frees |owned_copy|; deleting NULL is a no-op.
+  if (!cluster->AddFrame(frame)) {
+    delete owned_copy;
+    return false;
+  }
+
+  if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+    if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+      delete owned_copy;
+      return false;
+    }
+  }
+
+  last_timestamp_ = frame->timestamp();
+  last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+  last_block_duration_ = frame->duration();
+  track_frames_written_[frame->track_number() - 1]++;
+
+  delete owned_copy;
+  return true;
+}
+
+// Stores whether cue points should be produced. MakeNewCluster() reads
+// |output_cues_| each time it starts a cluster.
+void Segment::OutputCues(bool output_cues) {
+  output_cues_ = output_cues;
+}
+
+// Stores the flag that MakeNewCluster() passes to every Cluster it creates
+// as the "accurate cluster duration" constructor argument.
+void Segment::AccurateClusterDuration(bool accurate_cluster_duration) {
+  const bool requested = accurate_cluster_duration;
+  accurate_cluster_duration_ = requested;
+}
+
+// Stores the flag that MakeNewCluster() passes to every Cluster it creates
+// as the "fixed size cluster timecode" constructor argument.
+void Segment::UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode) {
+  const bool requested = fixed_size_cluster_timecode;
+  fixed_size_cluster_timecode_ = requested;
+}
+
+// Turns chunked output on or off.
+//
+// When |chunking| is true, |filename| becomes the base name for the chunk
+// files: the cluster chunk writer is opened on the current chunk name, the
+// header writer on "<filename>.hdr", and the segment's writers are pointed
+// at the chunk writers. Returns false if |chunk_count_| is already non-zero,
+// if |filename| is NULL while enabling, or if an allocation or Open() call
+// fails.
+bool Segment::SetChunking(bool chunking, const char* filename) {
+  if (chunk_count_ > 0)
+    return false;
+
+  if (!chunking) {
+    chunking_ = false;
+    return true;
+  }
+
+  if (!filename)
+    return false;
+
+  // Nothing to do when chunking is already active with the same base name.
+  if (chunking_ && strcmp(filename, chunking_base_name_) == 0)
+    return true;
+
+  // Keep a private copy of |filename| as the new base name.
+  const size_t base_size = strlen(filename) + 1;
+  char* const base_copy = new (std::nothrow) char[base_size];  // NOLINT
+  if (!base_copy)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(base_copy, base_size, filename);
+#else
+  strcpy(base_copy, filename);
+#endif
+
+  delete[] chunking_base_name_;
+  chunking_base_name_ = base_copy;
+
+  if (!UpdateChunkName("chk", &chunk_name_))
+    return false;
+
+  // Create whichever chunk writers do not exist yet.
+  if (!chunk_writer_cluster_) {
+    chunk_writer_cluster_ = new (std::nothrow) MkvWriter();  // NOLINT
+    if (!chunk_writer_cluster_)
+      return false;
+  }
+  if (!chunk_writer_cues_) {
+    chunk_writer_cues_ = new (std::nothrow) MkvWriter();  // NOLINT
+    if (!chunk_writer_cues_)
+      return false;
+  }
+  if (!chunk_writer_header_) {
+    chunk_writer_header_ = new (std::nothrow) MkvWriter();  // NOLINT
+    if (!chunk_writer_header_)
+      return false;
+  }
+
+  if (!chunk_writer_cluster_->Open(chunk_name_))
+    return false;
+
+  // Build "<base name>.hdr" in a temporary buffer for the header writer.
+  const size_t hdr_name_size = strlen(filename) + strlen(".hdr") + 1;
+  char* const hdr_name = new (std::nothrow) char[hdr_name_size];  // NOLINT
+  if (!hdr_name)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(hdr_name, hdr_name_size - strlen(".hdr"), chunking_base_name_);
+  strcat_s(hdr_name, hdr_name_size, ".hdr");
+#else
+  strcpy(hdr_name, chunking_base_name_);
+  strcat(hdr_name, ".hdr");
+#endif
+
+  const bool header_opened = chunk_writer_header_->Open(hdr_name);
+  delete[] hdr_name;
+  if (!header_opened)
+    return false;
+
+  writer_cluster_ = chunk_writer_cluster_;
+  writer_cues_ = chunk_writer_cues_;
+  writer_header_ = chunk_writer_header_;
+
+  chunking_ = true;
+  return true;
+}
+
+// Selects |track_number| as the track that cue points are generated for.
+// Returns false, leaving the current selection untouched, when no track
+// with that number exists.
+bool Segment::CuesTrack(uint64_t track_number) {
+  if (GetTrackByNumber(track_number) == NULL)
+    return false;
+
+  cues_track_ = track_number;
+  return true;
+}
+
+// Raises |force_new_cluster_|. TestFrame() answers "new cluster" while the
+// flag is set and DoNewClusterProcessing() clears it again afterwards.
+void Segment::ForceNewClusterOnNextFrame() {
+  force_new_cluster_ = true;
+}
+
+// Forwards the lookup of |track_number| to |tracks_| and returns its result
+// unchanged.
+Track* Segment::GetTrackByNumber(uint64_t track_number) const {
+  Track* const found = tracks_.GetTrackByNumber(track_number);
+  return found;
+}
+
+// Writes everything that precedes the first cluster to |writer_header_|:
+// the EBML header, the Segment id with an "unknown" size placeholder, the
+// SeekHead (seekable file mode only), SegmentInfo, Tracks and, when present,
+// Chapters and Tags. Sets |header_written_| and returns true on success;
+// returns false as soon as any step fails.
+bool Segment::WriteSegmentHeader() {
+  UpdateDocTypeVersion();
+
+  const char* doc_type = kDocTypeMatroska;
+  if (DocTypeIsWebm())
+    doc_type = kDocTypeWebm;
+
+  if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
+    return false;
+  doc_type_version_written_ = doc_type_version_;
+  ebml_header_size_ = static_cast<int32_t>(writer_header_->Position());
+
+  // A non-zero result from WriteID() / SerializeInt() is treated as failure.
+  if (WriteID(writer_header_, libwebm::kMkvSegment) != 0)
+    return false;
+
+  // Remember where the segment size field starts.
+  size_position_ = writer_header_->Position();
+
+  // Write "unknown" (EBML coded -1) as the segment size. All 8 bytes are
+  // used because the final segment size is not known yet and the field may
+  // be overwritten later.
+  if (SerializeInt(writer_header_, kEbmlUnknownValue, 8) != 0)
+    return false;
+
+  payload_pos_ = writer_header_->Position();
+
+  const bool seekable_file = (mode_ == kFile) && writer_header_->Seekable();
+  if (seekable_file) {
+    // Set the duration > 0.0 so SegmentInfo will write out the duration. When
+    // the muxer is done writing we will set the correct duration and have
+    // SegmentInfo update it.
+    segment_info_.set_duration(1.0);
+
+    if (!seek_head_.Write(writer_header_))
+      return false;
+  }
+
+  // Each element gets a seek entry recorded at its offset before it is
+  // written.
+  if (!seek_head_.AddSeekEntry(libwebm::kMkvInfo, MaxOffset()) ||
+      !segment_info_.Write(writer_header_)) {
+    return false;
+  }
+
+  if (!seek_head_.AddSeekEntry(libwebm::kMkvTracks, MaxOffset()) ||
+      !tracks_.Write(writer_header_)) {
+    return false;
+  }
+
+  if (chapters_.Count() > 0) {
+    if (!seek_head_.AddSeekEntry(libwebm::kMkvChapters, MaxOffset()) ||
+        !chapters_.Write(writer_header_)) {
+      return false;
+    }
+  }
+
+  if (tags_.Count() > 0) {
+    if (!seek_head_.AddSeekEntry(libwebm::kMkvTags, MaxOffset()) ||
+        !tags_.Write(writer_header_)) {
+      return false;
+    }
+  }
+
+  // In chunking mode the header chunk is closed here when it will not be
+  // revisited (live mode or a non-seekable writer).
+  if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) {
+    if (!chunk_writer_header_)
+      return false;
+
+    chunk_writer_header_->Close();
+  }
+
+  header_written_ = true;
+  return true;
+}
+
+// Decides whether a frame with timestamp |frame_timestamp_ns| on track
+// |track_number| needs a new cluster. Returns:
+//   -1  the frame's timecode precedes the last cluster's timecode (error),
+//    0  the frame can be written to the current cluster,
+//    1  a new cluster must be created for the frame,
+//    2  a new cluster must be created and the frame tested again.
+int Segment::TestFrame(uint64_t track_number, uint64_t frame_timestamp_ns,
+                       bool is_key) const {
+  // A new cluster was explicitly requested, or none exists yet (the very
+  // first frame): the frame goes into a freshly created cluster.
+  if (force_new_cluster_ || cluster_list_size_ <= 0)
+    return 1;
+
+  // Otherwise compare the frame against the most recent cluster.
+  const Cluster* const newest = cluster_list_[cluster_list_size_ - 1];
+  const uint64_t scale = segment_info_.timecode_scale();
+  const uint64_t frame_tc = frame_timestamp_ns / scale;
+  const uint64_t cluster_tc = newest->timecode();
+
+  // A frame earlier than its cluster is allowed in principle, but this
+  // muxer never writes clusters like that, so it indicates a bug.
+  if (frame_tc < cluster_tc)  // should never happen
+    return -1;
+
+  // Cluster-relative timestamps are serialized as 16-bit signed integers in
+  // Matroska, so a frame too far past the cluster start cannot be stored in
+  // that cluster.
+  const int64_t delta_timecode = frame_tc - cluster_tc;
+  if (delta_timecode > kMaxBlockTimecode)
+    return 2;
+
+  // A video keyframe always starts a new cluster. This flushes queued
+  // (audio) frames and puts the keyframe first in the new cluster.
+  const bool video_keyframe = is_key && tracks_.TrackIsVideo(track_number);
+  if (video_keyframe)
+    return 1;
+
+  // Start a new cluster once the current one spans too much time ...
+  const uint64_t delta_ns = delta_timecode * scale;
+  if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_)
+    return 1;
+
+  // ... or once its payload has grown too large.
+  const uint64_t cluster_size = newest->payload_size();
+  if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_)
+    return 1;
+
+  // No new cluster is needed; the frame can be emitted now.
+  return 0;
+}
+
+// Starts a new cluster for a frame at |frame_timestamp_ns|.
+//
+// Grows |cluster_list_| if needed, writes queued frames that belong to the
+// previous cluster, finalizes that cluster, rotates the chunk file when
+// chunking, and then creates and initializes the new Cluster. The new
+// cluster's timecode is the earlier of the frame's timecode and the timecode
+// of the first queued frame. Returns true on success; on failure
+// |cluster_list_size_| is left unchanged.
+bool Segment::MakeNewCluster(uint64_t frame_timestamp_ns) {
+  const int32_t new_size = cluster_list_size_ + 1;
+
+  if (new_size > cluster_list_capacity_) {
+    // Add more clusters.
+    const int32_t new_capacity =
+        (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
+    Cluster** const clusters =
+        new (std::nothrow) Cluster*[new_capacity];  // NOLINT
+    if (!clusters)
+      return false;
+
+    for (int32_t i = 0; i < cluster_list_size_; ++i) {
+      clusters[i] = cluster_list_[i];
+    }
+
+    delete[] cluster_list_;
+
+    cluster_list_ = clusters;
+    cluster_list_capacity_ = new_capacity;
+  }
+
+  if (!WriteFramesLessThan(frame_timestamp_ns))
+    return false;
+
+  if (cluster_list_size_ > 0) {
+    // Update old cluster's size
+    Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+
+    if (!old_cluster || !old_cluster->Finalize(true, frame_timestamp_ns))
+      return false;
+  }
+
+  if (output_cues_)
+    new_cuepoint_ = true;
+
+  if (chunking_ && cluster_list_size_ > 0) {
+    // Every cluster after the first goes into its own chunk file.
+    chunk_writer_cluster_->Close();
+    chunk_count_++;
+
+    if (!UpdateChunkName("chk", &chunk_name_))
+      return false;
+    if (!chunk_writer_cluster_->Open(chunk_name_))
+      return false;
+  }
+
+  const uint64_t timecode_scale = segment_info_.timecode_scale();
+  const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale;
+
+  uint64_t cluster_timecode = frame_timecode;
+
+  if (frames_size_ > 0) {
+    const Frame* const f = frames_[0];  // earliest queued frame
+    const uint64_t ns = f->timestamp();
+    const uint64_t tc = ns / timecode_scale;
+
+    if (tc < cluster_timecode)
+      cluster_timecode = tc;
+  }
+
+  Cluster*& cluster = cluster_list_[cluster_list_size_];
+  const int64_t offset = MaxOffset();
+  cluster = new (std::nothrow)
+      Cluster(cluster_timecode, offset, segment_info_.timecode_scale(),
+              accurate_cluster_duration_, fixed_size_cluster_timecode_);
+  if (!cluster)
+    return false;
+
+  if (!cluster->Init(writer_cluster_)) {
+    // The slot is not counted in |cluster_list_size_| yet and the next call
+    // overwrites it, so the cluster must be released here or it leaks.
+    delete cluster;
+    cluster = NULL;
+    return false;
+  }
+
+  cluster_list_size_ = new_size;
+  return true;
+}
+
+// Makes sure a suitable cluster exists for the frame described by the
+// arguments and flushes the queued (audio) frames into it. Returns false if
+// TestFrame() reports an error, a cluster cannot be created, or the queued
+// frames cannot be written.
+bool Segment::DoNewClusterProcessing(uint64_t track_number,
+                                     uint64_t frame_timestamp_ns, bool is_key) {
+  int verdict = 0;
+  do {
+    // Decide, from the frame and the current cluster, whether a new cluster
+    // is required.
+    verdict = TestFrame(track_number, frame_timestamp_ns, is_key);
+    if (verdict < 0)  // error
+      return false;
+
+    // The force flag is consumed by every TestFrame() call.
+    force_new_cluster_ = false;
+
+    // Any positive verdict asks for a new cluster.
+    if (verdict > 0) {
+      if (!MakeNewCluster(frame_timestamp_ns))
+        return false;
+    }
+
+    // Write queued (audio) frames.
+    if (WriteFramesAll() < 0)  // error
+      return false;
+
+    // A verdict of 2 means the frame was far away from the old cluster, so
+    // it is tested again against the cluster that was just created.
+  } while (verdict > 1);
+
+  // The caller writes the frame to the current cluster (verdict 0) or to
+  // the newly created one (verdict 1).
+  return true;
+}
+
+// Writes the segment header on first use and, when cues are enabled but no
+// cues track was chosen, picks one: the first video track, or else the
+// track at index 0. Returns false if the header cannot be written, the
+// cluster seek entry cannot be added, or a track lookup fails.
+bool Segment::CheckHeaderInfo() {
+  if (header_written_)
+    return true;
+
+  if (!WriteSegmentHeader())
+    return false;
+
+  if (!seek_head_.AddSeekEntry(libwebm::kMkvCluster, MaxOffset()))
+    return false;
+
+  // Nothing more to do unless a cues track still has to be chosen.
+  if (!output_cues_ || cues_track_ != 0)
+    return true;
+
+  // Prefer a video track.
+  for (uint32_t index = 0; index < tracks_.track_entries_size(); ++index) {
+    const Track* const candidate = tracks_.GetTrackByIndex(index);
+    if (!candidate)
+      return false;
+
+    if (!tracks_.TrackIsVideo(candidate->number()))
+      continue;
+
+    cues_track_ = candidate->number();
+    break;
+  }
+
+  // Fall back to the first track.
+  if (cues_track_ == 0) {
+    const Track* const first = tracks_.GetTrackByIndex(0);
+    if (!first)
+      return false;
+
+    cues_track_ = first->number();
+  }
+
+  return true;
+}
+
+// Raises |doc_type_version_| to 4 when it is lower and some track reports a
+// non-zero codec delay or seek pre-roll. Scanning stops at the first such
+// track or at the first missing track entry.
+void Segment::UpdateDocTypeVersion() {
+  uint32_t position = 0;
+  while (position < tracks_.track_entries_size()) {
+    const Track* const entry = tracks_.GetTrackByIndex(position);
+    if (entry == NULL)
+      return;
+
+    const bool needs_v4 = entry->codec_delay() || entry->seek_pre_roll();
+    if (needs_v4 && doc_type_version_ < 4) {
+      doc_type_version_ = 4;
+      return;
+    }
+
+    ++position;
+  }
+}
+
+// Builds "<chunking_base_name_>_<chunk_count_ as 6 digits>.<ext>" in a newly
+// allocated buffer, frees the string |*name| pointed to before, and stores
+// the new buffer in |*name|. Returns false when an argument is NULL or the
+// allocation fails; |*name| is left untouched in that case.
+bool Segment::UpdateChunkName(const char* ext, char** name) const {
+  if (!ext || !name)
+    return false;
+
+  // Suffix carrying the chunk number and the extension.
+  char suffix[64];
+#ifdef _MSC_VER
+  sprintf_s(suffix, sizeof(suffix), "_%06d.%s", chunk_count_, ext);
+#else
+  snprintf(suffix, sizeof(suffix), "_%06d.%s", chunk_count_, ext);
+#endif
+
+  const size_t base_length = strlen(chunking_base_name_);
+  const size_t suffix_length = strlen(suffix);
+  const size_t total = base_length + suffix_length + 1;
+
+  char* const full_name = new (std::nothrow) char[total];  // NOLINT
+  if (!full_name)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(full_name, total - suffix_length, chunking_base_name_);
+  strcat_s(full_name, total, suffix);
+#else
+  strcpy(full_name, chunking_base_name_);
+  strcat(full_name, suffix);
+#endif
+
+  delete[] *name;
+  *name = full_name;
+  return true;
+}
+
+// Returns the header writer's position relative to |payload_pos_|. When
+// chunking, the sizes of all clusters and the position of the cues writer
+// (if there is one) are added on top. Returns -1 when there is no header
+// writer.
+int64_t Segment::MaxOffset() {
+  if (writer_header_ == NULL)
+    return -1;
+
+  int64_t total = writer_header_->Position() - payload_pos_;
+  if (!chunking_)
+    return total;
+
+  int32_t index = 0;
+  while (index < cluster_list_size_) {
+    Cluster* const entry = cluster_list_[index];
+    total += entry->Size();
+    ++index;
+  }
+
+  if (writer_cues_)
+    total += writer_cues_->Position();
+
+  return total;
+}
+
+// Appends |frame| to the pending-frame array, growing the array first when
+// it is full (initial capacity 2, doubled afterwards). Returns false,
+// without storing |frame|, when the array cannot be grown.
+bool Segment::QueueFrame(Frame* frame) {
+  const bool is_full = (frames_size_ + 1) > frames_capacity_;
+
+  if (is_full) {
+    int32_t grown_capacity = 2;
+    if (frames_capacity_)
+      grown_capacity = frames_capacity_ * 2;
+
+    if (grown_capacity < 1)
+      return false;
+
+    Frame** const grown = new (std::nothrow) Frame*[grown_capacity];  // NOLINT
+    if (!grown)
+      return false;
+
+    // Carry the queued pointers over to the larger array.
+    int32_t index = 0;
+    while (index < frames_size_) {
+      grown[index] = frames_[index];
+      ++index;
+    }
+
+    delete[] frames_;
+    frames_ = grown;
+    frames_capacity_ = grown_capacity;
+  }
+
+  frames_[frames_size_] = frame;
+  ++frames_size_;
+
+  return true;
+}
+
+// Writes every queued frame to the last cluster and empties the queue.
+// Returns the number of frames that were queued (frames that failed to write
+// are dropped but still counted), 0 when the queue array was never
+// allocated, or -1 when there is no cluster to write to.
+int Segment::WriteFramesAll() {
+  if (frames_ == NULL)
+    return 0;
+
+  if (cluster_list_size_ < 1)
+    return -1;
+
+  Cluster* const target = cluster_list_[cluster_list_size_ - 1];
+  if (target == NULL)
+    return -1;
+
+  for (int32_t index = 0; index < frames_size_; ++index) {
+    Frame*& slot = frames_[index];
+
+    // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the
+    // places where |doc_type_version_| needs to be updated.
+    if (slot->discard_padding() != 0)
+      doc_type_version_ = 4;
+
+    bool written = target->AddFrame(slot);
+    if (written && new_cuepoint_ && cues_track_ == slot->track_number())
+      written = AddCuePoint(slot->timestamp(), cues_track_);
+
+    if (written && slot->timestamp() > last_timestamp_) {
+      last_timestamp_ = slot->timestamp();
+      last_track_timestamp_[slot->track_number() - 1] = slot->timestamp();
+    }
+
+    // The frame is released whether or not it could be written; the slot is
+    // cleared once the frame has been written.
+    delete slot;
+    if (written)
+      slot = NULL;
+  }
+
+  const int flushed = frames_size_;
+  frames_size_ = 0;
+
+  return flushed;
+}
+
+// Writes queued frames to the last cluster, oldest first, for as long as the
+// frame that follows them in the queue has a timestamp that is not greater
+// than |timestamp|. Every frame handled this way is removed from the queue
+// and deleted, whether or not the write succeeded. Returns false when the
+// queue or the last cluster is missing.
+bool Segment::WriteFramesLessThan(uint64_t timestamp) {
+  // Check |cluster_list_size_| to see if this is the first cluster. If it is
+  // the first cluster the audio frames that are less than the first video
+  // timestamp will be written in a later step.
+  if (frames_size_ > 0 && cluster_list_size_ > 0) {
+    if (!frames_)
+      return false;
+
+    Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+    if (!cluster)
+      return false;
+
+    // Number of frames consumed from the front of the queue.
+    int32_t shift_left = 0;
+
+    // TODO(fgalligan): Change this to use the durations of frames instead of
+    // the next frame's start time if the duration is accurate.
+    for (int32_t i = 1; i < frames_size_; ++i) {
+      const Frame* const frame_curr = frames_[i];
+
+      if (frame_curr->timestamp() > timestamp)
+        break;
+
+      const Frame* const frame_prev = frames_[i - 1];
+
+      // |frame_prev| is deleted on every path below, so it must always be
+      // counted as removed. Counting only successful writes left pointers to
+      // deleted frames inside the live part of |frames_| after the shift.
+      ++shift_left;
+
+      if (frame_prev->discard_padding() != 0)
+        doc_type_version_ = 4;
+      if (!cluster->AddFrame(frame_prev)) {
+        delete frame_prev;
+        continue;
+      }
+
+      if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) {
+          delete frame_prev;
+          continue;
+        }
+      }
+
+      if (frame_prev->timestamp() > last_timestamp_) {
+        last_timestamp_ = frame_prev->timestamp();
+        last_track_timestamp_[frame_prev->track_number() - 1] =
+            frame_prev->timestamp();
+      }
+
+      delete frame_prev;
+    }
+
+    if (shift_left > 0) {
+      // The loop consumes at most |frames_size_| - 1 frames, so this is only
+      // a sanity check.
+      if (shift_left >= frames_size_)
+        return false;
+
+      // Move the surviving frames to the front of the queue.
+      const int32_t new_frames_size = frames_size_ - shift_left;
+      for (int32_t i = 0; i < new_frames_size; ++i) {
+        frames_[i] = frames_[i + shift_left];
+      }
+
+      frames_size_ = new_frames_size;
+    }
+  }
+
+  return true;
+}
+
+// Returns true when every track uses one of the codec ids listed below, in
+// which case WriteSegmentHeader() uses the WebM DocType instead of Matroska.
+// A segment without tracks yields true. A missing track entry or a track
+// whose codec id is unset yields false.
+bool Segment::DocTypeIsWebm() const {
+  const int kNumCodecIds = 9;
+
+  // TODO(vigneshv): Tweak .clang-format.
+  const char* kWebmCodecIds[kNumCodecIds] = {
+      Tracks::kOpusCodecId,          Tracks::kVorbisCodecId,
+      Tracks::kAv1CodecId,           Tracks::kVp8CodecId,
+      Tracks::kVp9CodecId,           Tracks::kWebVttCaptionsId,
+      Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId,
+      Tracks::kWebVttSubtitlesId};
+
+  const int num_tracks = static_cast<int>(tracks_.track_entries_size());
+  for (int track_index = 0; track_index < num_tracks; ++track_index) {
+    const Track* const track = tracks_.GetTrackByIndex(track_index);
+
+    // Constructing std::string from a NULL pointer is undefined behaviour,
+    // so reject a missing track or an unset codec id before converting.
+    if (!track || !track->codec_id())
+      return false;
+
+    const std::string codec_id = track->codec_id();
+
+    bool id_is_webm = false;
+    for (int id_index = 0; id_index < kNumCodecIds; ++id_index) {
+      if (codec_id == kWebmCodecIds[id_index]) {
+        id_is_webm = true;
+        break;
+      }
+    }
+
+    if (!id_is_webm)
+      return false;
+  }
+
+  return true;
+}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h
new file mode 100644
index 0000000000..8602d82325
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -0,0 +1,1924 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXER_H_
+#define MKVMUXER_MKVMUXER_H_
+
+#include <stdint.h>
+
+#include <cstddef>
+#include <list>
+#include <map>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+// For a description of the WebM elements see
+// http://www.webmproject.org/code/specs/container/.
+
+namespace mkvparser {
+class IMkvReader;
+} // namespace mkvparser
+
+namespace mkvmuxer {
+
+class MkvWriter;
+class Segment;
+
+const uint64_t kMaxTrackNumber = 126;
+
+///////////////////////////////////////////////////////////////
+// Abstract interface used by the mkvmuxer to write out the Mkv data.
+class IMkvWriter {
+ public:
+ // Writes out |len| bytes of |buf|. Returns 0 on success.
+ virtual int32 Write(const void* buf, uint32 len) = 0;
+
+ // Returns the offset of the output position from the beginning of the
+ // output.
+ virtual int64 Position() const = 0;
+
+ // Sets the current output position to |position|. Returns 0 on success.
+ virtual int32 Position(int64 position) = 0;
+
+ // Returns true if the writer is seekable.
+ virtual bool Seekable() const = 0;
+
+ // Element start notification. Called whenever an element identifier is about
+ // to be written to the stream. |element_id| is the element identifier, and
+ // |position| is the location in the WebM stream where the first octet of the
+ // element identifier will be written.
+ // Note: the |MkvId| enumeration in common/webmids.h defines element values.
+ virtual void ElementStartNotify(uint64 element_id, int64 position) = 0;
+
+ protected:
+ IMkvWriter();  // Protected: only derived classes can be constructed.
+ virtual ~IMkvWriter();  // Protected: not destroyable through IMkvWriter*.
+
+ private:
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter);
+};
+
+// Writes out the EBML header for a WebM file, but allows the caller to specify
+// the DocType. This function must be called before any other libwebm writing
+// functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+ const char* const doc_type);
+
+// Writes out the EBML header for a WebM file. This function must be called
+// before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version);
+
+// Deprecated. Writes out the EBML header with doc_type_version set to
+// kDefaultDocTypeVersion. Exists for backward compatibility.
+bool WriteEbmlHeader(IMkvWriter* writer);
+
+// Copies data from |source| to |dst| in chunks; NOTE(review): |start| and |size| presumably delimit the byte range - confirm.
+bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64_t start,
+ int64_t size);
+
+///////////////////////////////////////////////////////////////
+// Class to hold data that will be written to a block.
+class Frame {
+ public:
+ Frame();
+ ~Frame();
+
+ // Sets this frame's contents based on |frame|. Returns true on success. On
+ // failure, this frame's existing contents may be lost.
+ bool CopyFrom(const Frame& frame);
+
+ // Copies |length| bytes of |frame| data into |frame_|. Returns true on
+ // success.
+ bool Init(const uint8_t* frame, uint64_t length);
+
+ // Copies |additional| data into |additional_|. Returns true on success.
+ bool AddAdditionalData(const uint8_t* additional, uint64_t length,
+ uint64_t add_id);
+
+ // Returns true if the frame has valid parameters.
+ bool IsValid() const;
+
+ // Returns true if the frame can be written as a SimpleBlock based on current
+ // parameters.
+ bool CanBeSimpleBlock() const;
+
+ uint64_t add_id() const { return add_id_; }
+ const uint8_t* additional() const { return additional_; }
+ uint64_t additional_length() const { return additional_length_; }
+ void set_duration(uint64_t duration);  // Defined out of line.
+ uint64_t duration() const { return duration_; }
+ bool duration_set() const { return duration_set_; }
+ const uint8_t* frame() const { return frame_; }
+ void set_is_key(bool key) { is_key_ = key; }
+ bool is_key() const { return is_key_; }
+ uint64_t length() const { return length_; }
+ void set_track_number(uint64_t track_number) { track_number_ = track_number; }
+ uint64_t track_number() const { return track_number_; }
+ void set_timestamp(uint64_t timestamp) { timestamp_ = timestamp; }
+ uint64_t timestamp() const { return timestamp_; }
+ void set_discard_padding(int64_t discard_padding) {
+ discard_padding_ = discard_padding;
+ }
+ int64_t discard_padding() const { return discard_padding_; }
+ void set_reference_block_timestamp(int64_t reference_block_timestamp);  // Defined out of line.
+ int64_t reference_block_timestamp() const {
+ return reference_block_timestamp_;
+ }
+ bool reference_block_timestamp_set() const {
+ return reference_block_timestamp_set_;
+ }
+
+ private:
+ // Id of the additional data.
+ uint64_t add_id_;
+
+ // Pointer to additional data. Owned by this class.
+ uint8_t* additional_;
+
+ // Length of the additional data.
+ uint64_t additional_length_;
+
+ // Duration of the frame in nanoseconds.
+ uint64_t duration_;
+
+ // Flag indicating that |duration_| has been set. Setting duration causes the
+ // frame to be written out as a Block with BlockDuration instead of as a
+ // SimpleBlock.
+ bool duration_set_;
+
+ // Pointer to the data. Owned by this class.
+ uint8_t* frame_;
+
+ // Flag telling if the data should set the key flag of a block.
+ bool is_key_;
+
+ // Length of the data.
+ uint64_t length_;
+
+ // Mkv track number the data is associated with.
+ uint64_t track_number_;
+
+ // Timestamp of the data in nanoseconds.
+ uint64_t timestamp_;
+
+ // Discard padding for the frame.
+ int64_t discard_padding_;
+
+ // Reference block timestamp.
+ int64_t reference_block_timestamp_;
+
+ // Flag indicating if |reference_block_timestamp_| has been set.
+ bool reference_block_timestamp_set_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame);  // Use CopyFrom() to duplicate.
+};
+
+///////////////////////////////////////////////////////////////
+// Class to hold one cue point in a Cues element.
+class CuePoint {
+ public:
+ CuePoint();
+ ~CuePoint();
+
+ // Returns the size in bytes of the entire CuePoint element.
+ uint64_t Size() const;
+
+ // Outputs the CuePoint element to |writer|. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ void set_time(uint64_t time) { time_ = time; }
+ uint64_t time() const { return time_; }
+ void set_track(uint64_t track) { track_ = track; }
+ uint64_t track() const { return track_; }
+ void set_cluster_pos(uint64_t cluster_pos) { cluster_pos_ = cluster_pos; }
+ uint64_t cluster_pos() const { return cluster_pos_; }
+ void set_block_number(uint64_t block_number) { block_number_ = block_number; }
+ uint64_t block_number() const { return block_number_; }
+ void set_output_block_number(bool output_block_number) {
+ output_block_number_ = output_block_number;
+ }
+ bool output_block_number() const { return output_block_number_; }
+
+ private:
+ // Returns the size in bytes of the payload of the CuePoint element.
+ uint64_t PayloadSize() const;
+
+ // Absolute timecode according to the segment time base.
+ uint64_t time_;
+
+ // The Track element associated with the CuePoint.
+ uint64_t track_;
+
+ // The position of the Cluster containing the Block.
+ uint64_t cluster_pos_;
+
+ // Number of the Block within the Cluster, starting from 1.
+ uint64_t block_number_;
+
+ // If true, the muxer writes out the block number for the cue when the
+ // block number differs from the default of 1. Defaults to true.
+ bool output_block_number_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(CuePoint);
+};
+
+///////////////////////////////////////////////////////////////
+// Cues element.
+class Cues {
+ public:
+ Cues();
+ ~Cues();
+
+ // Adds a cue point to the Cues element. Returns true on success.
+ bool AddCue(CuePoint* cue);  // NOTE(review): ownership of |cue| is not stated here - confirm.
+
+ // Returns the cue point at |index|. Returns NULL if there is no matching
+ // cue point.
+ CuePoint* GetCueByIndex(int32_t index) const;
+
+ // Returns the total size of the Cues element.
+ uint64_t Size();
+
+ // Outputs the Cues element to |writer|. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ int32_t cue_entries_size() const { return cue_entries_size_; }
+ void set_output_block_number(bool output_block_number) {
+ output_block_number_ = output_block_number;
+ }
+ bool output_block_number() const { return output_block_number_; }
+
+ private:
+ // Number of allocated elements in |cue_entries_|.
+ int32_t cue_entries_capacity_;
+
+ // Number of CuePoints in |cue_entries_|.
+ int32_t cue_entries_size_;
+
+ // CuePoint list.
+ CuePoint** cue_entries_;
+
+ // If true, the muxer writes out the block number for the cue when the
+ // block number differs from the default of 1. Defaults to true.
+ bool output_block_number_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cues);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncAESSettings element.
+class ContentEncAESSettings {
+ public:
+ enum { kCTR = 1 };  // NOTE(review): presumably the value stored in |cipher_mode_| - confirm.
+
+ ContentEncAESSettings();
+ ~ContentEncAESSettings() {}
+
+ // Returns the size in bytes of the ContentEncAESSettings element.
+ uint64_t Size() const;
+
+ // Writes out the ContentEncAESSettings element to |writer|. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint64_t cipher_mode() const { return cipher_mode_; }
+
+ private:
+ // Returns the size in bytes of the payload of the ContentEncAESSettings
+ // element.
+ uint64_t PayloadSize() const;
+
+ // Sub elements. There is no setter; only the read accessor above.
+ uint64_t cipher_mode_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+// Currently only whole frames can be encrypted with AES. This dictates that
+// ContentEncodingOrder will be 0, ContentEncodingScope will be 1,
+// ContentEncodingType will be 1, and ContentEncAlgo will be 5.
+class ContentEncoding {
+ public:
+ ContentEncoding();
+ ~ContentEncoding();
+
+ // Sets the content encryption id. Copies |length| bytes from |id| to
+ // |enc_key_id_|. Returns true on success.
+ bool SetEncryptionID(const uint8_t* id, uint64_t length);
+
+ // Returns the size in bytes of the ContentEncoding element.
+ uint64_t Size() const;
+
+ // Writes out the ContentEncoding element to |writer|. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint64_t enc_algo() const { return enc_algo_; }
+ uint64_t encoding_order() const { return encoding_order_; }
+ uint64_t encoding_scope() const { return encoding_scope_; }
+ uint64_t encoding_type() const { return encoding_type_; }
+ ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; }  // Points at a member; never NULL.
+
+ private:
+ // Returns the size in bytes of the encoding elements.
+ uint64_t EncodingSize(uint64_t compression_size,
+ uint64_t encryption_size) const;
+
+ // Returns the size in bytes of the encryption elements.
+ uint64_t EncryptionSize() const;
+
+ // Track element names
+ uint64_t enc_algo_;
+ uint8_t* enc_key_id_;  // Filled by SetEncryptionID(); see |enc_key_id_length_|.
+ uint64_t encoding_order_;
+ uint64_t encoding_scope_;
+ uint64_t encoding_type_;
+
+ // ContentEncAESSettings element.
+ ContentEncAESSettings enc_aes_settings_;
+
+ // Size of the ContentEncKeyID data in bytes.
+ uint64_t enc_key_id_length_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+};
+
+///////////////////////////////////////////////////////////////
+// Colour element.
+class PrimaryChromaticity {
+ public:
+ static const float kChromaticityMin;
+ static const float kChromaticityMax;
+
+ PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {}
+ PrimaryChromaticity() : x_(0), y_(0) {}  // Both coordinates start at 0.
+ ~PrimaryChromaticity() {}
+
+ // Returns the sum of the |x_id| and |y_id| element id sizes and payload
+ // sizes.
+ uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const;
+ bool Valid() const;  // NOTE(review): presumably range-checks x/y against kChromaticityMin/Max - confirm.
+ bool Write(IMkvWriter* writer, libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const;
+
+ float x() const { return x_; }
+ void set_x(float new_x) { x_ = new_x; }
+ float y() const { return y_; }
+ void set_y(float new_y) { y_ = new_y; }
+
+ private:
+ float x_;
+ float y_;
+};
+
+class MasteringMetadata {
+ public:
+ static const float kValueNotPresent;
+ static const float kMinLuminance;
+ static const float kMinLuminanceMax;
+ static const float kMaxLuminanceMax;
+
+ MasteringMetadata()
+ : luminance_max_(kValueNotPresent),
+ luminance_min_(kValueNotPresent),
+ r_(NULL),
+ g_(NULL),
+ b_(NULL),
+ white_point_(NULL) {}
+ ~MasteringMetadata() {  // NOTE(review): deletes raw pointers, yet copying is not disallowed in this class - confirm copies are never made.
+ delete r_;
+ delete g_;
+ delete b_;
+ delete white_point_;
+ }
+
+ // Returns the total size of the MasteringMetadata element.
+ uint64_t MasteringMetadataSize() const;
+ bool Valid() const;
+ bool Write(IMkvWriter* writer) const;
+
+ // Copies non-null chromaticity.
+ bool SetChromaticity(const PrimaryChromaticity* r,
+ const PrimaryChromaticity* g,
+ const PrimaryChromaticity* b,
+ const PrimaryChromaticity* white_point);
+ const PrimaryChromaticity* r() const { return r_; }
+ const PrimaryChromaticity* g() const { return g_; }
+ const PrimaryChromaticity* b() const { return b_; }
+ const PrimaryChromaticity* white_point() const { return white_point_; }
+
+ float luminance_max() const { return luminance_max_; }
+ void set_luminance_max(float luminance_max) {
+ luminance_max_ = luminance_max;
+ }
+ float luminance_min() const { return luminance_min_; }
+ void set_luminance_min(float luminance_min) {
+ luminance_min_ = luminance_min;
+ }
+
+ private:
+ // Returns the size of the MasteringMetadata child elements.
+ uint64_t PayloadSize() const;
+
+ float luminance_max_;  // kValueNotPresent until set.
+ float luminance_min_;  // kValueNotPresent until set.
+ PrimaryChromaticity* r_;  // Deleted in the destructor; NULL until set.
+ PrimaryChromaticity* g_;  // Deleted in the destructor; NULL until set.
+ PrimaryChromaticity* b_;  // Deleted in the destructor; NULL until set.
+ PrimaryChromaticity* white_point_;  // Deleted in the destructor; NULL until set.
+};
+
+class Colour {  // Colour description of a video track, with optional mastering metadata.
+ public:
+ enum MatrixCoefficients {  // Named values for matrix_coefficients().
+ kGbr = 0,
+ kBt709 = 1,
+ kUnspecifiedMc = 2,
+ kReserved = 3,
+ kFcc = 4,
+ kBt470bg = 5,
+ kSmpte170MMc = 6,
+ kSmpte240MMc = 7,
+ kYcocg = 8,
+ kBt2020NonConstantLuminance = 9,
+ kBt2020ConstantLuminance = 10,
+ };
+ enum ChromaSitingHorz {  // Named values for chroma_siting_horz().
+ kUnspecifiedCsh = 0,
+ kLeftCollocated = 1,
+ kHalfCsh = 2,
+ };
+ enum ChromaSitingVert {  // Named values for chroma_siting_vert().
+ kUnspecifiedCsv = 0,
+ kTopCollocated = 1,
+ kHalfCsv = 2,
+ };
+ enum Range {  // Named values for range().
+ kUnspecifiedCr = 0,
+ kBroadcastRange = 1,
+ kFullRange = 2,
+ kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics.
+ };
+ enum TransferCharacteristics {  // Named values for transfer_characteristics(); no enumerator for 0.
+ kIturBt709Tc = 1,
+ kUnspecifiedTc = 2,
+ kReservedTc = 3,
+ kGamma22Curve = 4,
+ kGamma28Curve = 5,
+ kSmpte170MTc = 6,
+ kSmpte240MTc = 7,
+ kLinear = 8,
+ kLog = 9,
+ kLogSqrt = 10,
+ kIec6196624 = 11,
+ kIturBt1361ExtendedColourGamut = 12,
+ kIec6196621 = 13,
+ kIturBt202010bit = 14,
+ kIturBt202012bit = 15,
+ kSmpteSt2084 = 16,
+ kSmpteSt4281Tc = 17,
+ kAribStdB67Hlg = 18,
+ };
+ enum Primaries {  // Named values for primaries(); note the gap between 10 and 22.
+ kReservedP0 = 0,
+ kIturBt709P = 1,
+ kUnspecifiedP = 2,
+ kReservedP3 = 3,
+ kIturBt470M = 4,
+ kIturBt470Bg = 5,
+ kSmpte170MP = 6,
+ kSmpte240MP = 7,
+ kFilm = 8,
+ kIturBt2020 = 9,
+ kSmpteSt4281P = 10,
+ kJedecP22Phosphors = 22,
+ };
+ static const uint64_t kValueNotPresent;  // Initial value of every numeric field below.
+ Colour()
+ : matrix_coefficients_(kValueNotPresent),
+ bits_per_channel_(kValueNotPresent),
+ chroma_subsampling_horz_(kValueNotPresent),
+ chroma_subsampling_vert_(kValueNotPresent),
+ cb_subsampling_horz_(kValueNotPresent),
+ cb_subsampling_vert_(kValueNotPresent),
+ chroma_siting_horz_(kValueNotPresent),
+ chroma_siting_vert_(kValueNotPresent),
+ range_(kValueNotPresent),
+ transfer_characteristics_(kValueNotPresent),
+ primaries_(kValueNotPresent),
+ max_cll_(kValueNotPresent),
+ max_fall_(kValueNotPresent),
+ mastering_metadata_(NULL) {}
+ ~Colour() { delete mastering_metadata_; }  // Frees the owned metadata, if any.
+
+ // Returns the total size of the Colour element.
+ uint64_t ColourSize() const;
+ bool Valid() const;
+ bool Write(IMkvWriter* writer) const;
+
+ // Deep copies |mastering_metadata|.
+ bool SetMasteringMetadata(const MasteringMetadata& mastering_metadata);
+
+ const MasteringMetadata* mastering_metadata() const {  // NULL until SetMasteringMetadata() stores a copy.
+ return mastering_metadata_;
+ }
+
+ uint64_t matrix_coefficients() const { return matrix_coefficients_; }
+ void set_matrix_coefficients(uint64_t matrix_coefficients) {  // Takes a raw uint64_t; not checked against the enum here.
+ matrix_coefficients_ = matrix_coefficients;
+ }
+ uint64_t bits_per_channel() const { return bits_per_channel_; }
+ void set_bits_per_channel(uint64_t bits_per_channel) {
+ bits_per_channel_ = bits_per_channel;
+ }
+ uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; }
+ void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) {
+ chroma_subsampling_horz_ = chroma_subsampling_horz;
+ }
+ uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; }
+ void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) {
+ chroma_subsampling_vert_ = chroma_subsampling_vert;
+ }
+ uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; }
+ void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) {
+ cb_subsampling_horz_ = cb_subsampling_horz;
+ }
+ uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; }
+ void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) {
+ cb_subsampling_vert_ = cb_subsampling_vert;
+ }
+ uint64_t chroma_siting_horz() const { return chroma_siting_horz_; }
+ void set_chroma_siting_horz(uint64_t chroma_siting_horz) {
+ chroma_siting_horz_ = chroma_siting_horz;
+ }
+ uint64_t chroma_siting_vert() const { return chroma_siting_vert_; }
+ void set_chroma_siting_vert(uint64_t chroma_siting_vert) {
+ chroma_siting_vert_ = chroma_siting_vert;
+ }
+ uint64_t range() const { return range_; }
+ void set_range(uint64_t range) { range_ = range; }
+ uint64_t transfer_characteristics() const {
+ return transfer_characteristics_;
+ }
+ void set_transfer_characteristics(uint64_t transfer_characteristics) {
+ transfer_characteristics_ = transfer_characteristics;
+ }
+ uint64_t primaries() const { return primaries_; }
+ void set_primaries(uint64_t primaries) { primaries_ = primaries; }
+ uint64_t max_cll() const { return max_cll_; }
+ void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; }
+ uint64_t max_fall() const { return max_fall_; }
+ void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; }
+
+ private:
+ // Returns the size of the Colour child elements.
+ uint64_t PayloadSize() const;
+
+ uint64_t matrix_coefficients_;
+ uint64_t bits_per_channel_;
+ uint64_t chroma_subsampling_horz_;
+ uint64_t chroma_subsampling_vert_;
+ uint64_t cb_subsampling_horz_;
+ uint64_t cb_subsampling_vert_;
+ uint64_t chroma_siting_horz_;
+ uint64_t chroma_siting_vert_;
+ uint64_t range_;
+ uint64_t transfer_characteristics_;
+ uint64_t primaries_;
+ uint64_t max_cll_;
+ uint64_t max_fall_;
+
+ MasteringMetadata* mastering_metadata_;  // Owned. NOTE(review): no copy control is declared; confirm Colour is never copied by value.
+};
+
+///////////////////////////////////////////////////////////////
+// Projection element.
+class Projection {  // Video projection type, pose angles and an optional private data blob.
+ public:
+ enum ProjectionType {
+ kTypeNotPresent = -1,  // Not the constructor default; see below.
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const uint64_t kValueNotPresent;  // Not referenced by the inline code in this class.
+ Projection()
+ : type_(kRectangular),
+ pose_yaw_(0.0),
+ pose_pitch_(0.0),
+ pose_roll_(0.0),
+ private_data_(NULL),
+ private_data_length_(0) {}
+ ~Projection() { delete[] private_data_; }  // Buffer is released with array delete.
+
+ uint64_t ProjectionSize() const;
+ bool Write(IMkvWriter* writer) const;
+
+ bool SetProjectionPrivate(const uint8_t* private_data,  // Presumably copies the bytes into an owned array; confirm in the definition.
+ uint64_t private_data_length);
+
+ ProjectionType type() const { return type_; }
+ void set_type(ProjectionType type) { type_ = type; }
+ float pose_yaw() const { return pose_yaw_; }
+ void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; }
+ float pose_pitch() const { return pose_pitch_; }
+ void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; }
+ float pose_roll() const { return pose_roll_; }
+ void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; }
+ uint8_t* private_data() const { return private_data_; }  // Returns the internal buffer as a mutable pointer; NULL if unset.
+ uint64_t private_data_length() const { return private_data_length_; }
+
+ private:
+ // Returns the size of the Projection child elements.
+ uint64_t PayloadSize() const;
+
+ ProjectionType type_;
+ float pose_yaw_;
+ float pose_pitch_;
+ float pose_roll_;
+ uint8_t* private_data_;  // Freed with delete[] in the destructor.
+ uint64_t private_data_length_;  // Length of |private_data_| in bytes (assumed; confirm in SetProjectionPrivate).
+};
+
+///////////////////////////////////////////////////////////////
+// Track element.
+class Track {  // Base class for track entries; VideoTrack and AudioTrack derive from it.
+ public:
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit Track(unsigned int* seed);
+ virtual ~Track();
+
+ // Adds a ContentEncoding element to the Track. Returns true on success.
+ virtual bool AddContentEncoding();
+
+ // Returns the ContentEncoding at |index|. Returns NULL if there is no
+ // ContentEncoding match.
+ ContentEncoding* GetContentEncodingByIndex(uint32_t index) const;
+
+ // Returns the size in bytes for the payload of the Track element.
+ virtual uint64_t PayloadSize() const;
+
+ // Returns the size in bytes of the Track element.
+ virtual uint64_t Size() const;
+
+ // Output the Track element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ // Sets the CodecPrivate element of the Track element. Copies |length|
+ // bytes from |codec_private| to |codec_private_|. Returns true on success.
+ bool SetCodecPrivate(const uint8_t* codec_private, uint64_t length);
+
+ void set_codec_id(const char* codec_id);  // Defined out of line; presumably stores a copy.
+ const char* codec_id() const { return codec_id_; }
+ const uint8_t* codec_private() const { return codec_private_; }  // Length is codec_private_length().
+ void set_language(const char* language);  // Defined out of line; presumably stores a copy.
+ const char* language() const { return language_; }
+ void set_max_block_additional_id(uint64_t max_block_additional_id) {
+ max_block_additional_id_ = max_block_additional_id;
+ }
+ uint64_t max_block_additional_id() const { return max_block_additional_id_; }
+ void set_name(const char* name);  // Defined out of line; presumably stores a copy.
+ const char* name() const { return name_; }
+ void set_number(uint64_t number) { number_ = number; }
+ uint64_t number() const { return number_; }
+ void set_type(uint64_t type) { type_ = type; }
+ uint64_t type() const { return type_; }
+ void set_uid(uint64_t uid) { uid_ = uid; }  // Overrides the stored UID.
+ uint64_t uid() const { return uid_; }
+ void set_codec_delay(uint64_t codec_delay) { codec_delay_ = codec_delay; }
+ uint64_t codec_delay() const { return codec_delay_; }
+ void set_seek_pre_roll(uint64_t seek_pre_roll) {
+ seek_pre_roll_ = seek_pre_roll;
+ }
+ uint64_t seek_pre_roll() const { return seek_pre_roll_; }
+ void set_default_duration(uint64_t default_duration) {
+ default_duration_ = default_duration;
+ }
+ uint64_t default_duration() const { return default_duration_; }
+
+ uint64_t codec_private_length() const { return codec_private_length_; }
+ uint32_t content_encoding_entries_size() const {
+ return content_encoding_entries_size_;
+ }
+
+ private:
+ // Track element fields.
+ char* codec_id_;
+ uint8_t* codec_private_;
+ char* language_;
+ uint64_t max_block_additional_id_;
+ char* name_;
+ uint64_t number_;
+ uint64_t type_;
+ uint64_t uid_;
+ uint64_t codec_delay_;
+ uint64_t seek_pre_roll_;
+ uint64_t default_duration_;
+
+ // Size of the CodecPrivate data in bytes.
+ uint64_t codec_private_length_;
+
+ // ContentEncoding element list.
+ ContentEncoding** content_encoding_entries_;
+
+ // Number of ContentEncoding elements added.
+ uint32_t content_encoding_entries_size_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has video specific elements.
+class VideoTrack : public Track {
+ public:
+ // Supported modes for stereo 3D.
+ enum StereoMode {
+ kMono = 0,
+ kSideBySideLeftIsFirst = 1,
+ kTopBottomRightIsFirst = 2,
+ kTopBottomLeftIsFirst = 3,
+ kSideBySideRightIsFirst = 11
+ };
+
+ // Supported alpha modes.
+ enum AlphaMode { kNoAlpha = 0, kAlpha = 1 };
+
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit VideoTrack(unsigned int* seed);
+ virtual ~VideoTrack();
+
+ // Returns the size in bytes for the payload of the Track element plus the
+ // video specific elements.
+ virtual uint64_t PayloadSize() const;
+
+ // Output the VideoTrack element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ // Sets the video's stereo mode. Returns true on success.
+ // Takes a raw uint64_t; presumably only StereoMode values are accepted
+ // (confirm in the definition).
+ bool SetStereoMode(uint64_t stereo_mode);
+
+ // Sets the video's alpha mode. Returns true on success.
+ // Takes a raw uint64_t; presumably only AlphaMode values are accepted
+ // (confirm in the definition).
+ bool SetAlphaMode(uint64_t alpha_mode);
+
+ // Display, pixel and crop dimensions. The setters store the value as-is.
+ void set_display_height(uint64_t height) { display_height_ = height; }
+ uint64_t display_height() const { return display_height_; }
+ void set_display_width(uint64_t width) { display_width_ = width; }
+ uint64_t display_width() const { return display_width_; }
+ void set_pixel_height(uint64_t height) { pixel_height_ = height; }
+ uint64_t pixel_height() const { return pixel_height_; }
+ void set_pixel_width(uint64_t width) { pixel_width_ = width; }
+ uint64_t pixel_width() const { return pixel_width_; }
+
+ void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; }
+ uint64_t crop_left() const { return crop_left_; }
+ void set_crop_right(uint64_t crop_right) { crop_right_ = crop_right; }
+ uint64_t crop_right() const { return crop_right_; }
+ void set_crop_top(uint64_t crop_top) { crop_top_ = crop_top; }
+ uint64_t crop_top() const { return crop_top_; }
+ void set_crop_bottom(uint64_t crop_bottom) { crop_bottom_ = crop_bottom; }
+ uint64_t crop_bottom() const { return crop_bottom_; }
+
+ void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; }
+ double frame_rate() const { return frame_rate_; }
+ void set_height(uint64_t height) { height_ = height; }
+ uint64_t height() const { return height_; }
+ // Read-only accessors for the modes set through SetStereoMode() and
+ // SetAlphaMode(). Declared const, like every other value getter in this
+ // class, so they can be called through a const VideoTrack reference.
+ uint64_t stereo_mode() const { return stereo_mode_; }
+ uint64_t alpha_mode() const { return alpha_mode_; }
+ void set_width(uint64_t width) { width_ = width; }
+ uint64_t width() const { return width_; }
+ // Defined out of line; presumably stores a copy of |colour_space|.
+ void set_colour_space(const char* colour_space);
+ const char* colour_space() const { return colour_space_; }
+
+ // Returns the stored Colour as a mutable pointer; presumably NULL until
+ // SetColour() succeeds (confirm in the constructor).
+ Colour* colour() { return colour_; }
+
+ // Deep copies |colour|.
+ bool SetColour(const Colour& colour);
+
+ // Returns the stored Projection as a mutable pointer; presumably NULL until
+ // SetProjection() succeeds (confirm in the constructor).
+ Projection* projection() { return projection_; }
+
+ // Deep copies |projection|.
+ bool SetProjection(const Projection& projection);
+
+ private:
+ // Returns the size in bytes of the Video element.
+ uint64_t VideoPayloadSize() const;
+
+ // Video track element fields.
+ uint64_t display_height_;
+ uint64_t display_width_;
+ uint64_t pixel_height_;
+ uint64_t pixel_width_;
+ uint64_t crop_left_;
+ uint64_t crop_right_;
+ uint64_t crop_top_;
+ uint64_t crop_bottom_;
+ double frame_rate_;
+ uint64_t height_;
+ uint64_t stereo_mode_;
+ uint64_t alpha_mode_;
+ uint64_t width_;
+ char* colour_space_;
+
+ // Populated by SetColour() / SetProjection().
+ Colour* colour_;
+ Projection* projection_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has audio specific elements.
+class AudioTrack : public Track {  // Track plus bit depth, channel count and sample rate.
+ public:
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit AudioTrack(unsigned int* seed);
+ virtual ~AudioTrack();
+
+ // Returns the size in bytes for the payload of the Track element plus the
+ // audio specific elements.
+ virtual uint64_t PayloadSize() const;
+
+ // Output the AudioTrack element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ void set_bit_depth(uint64_t bit_depth) { bit_depth_ = bit_depth; }
+ uint64_t bit_depth() const { return bit_depth_; }
+ void set_channels(uint64_t channels) { channels_ = channels; }
+ uint64_t channels() const { return channels_; }
+ void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; }  // Presumably in Hz; confirm against the writer.
+ double sample_rate() const { return sample_rate_; }
+
+ private:
+ // Audio track element fields.
+ uint64_t bit_depth_;
+ uint64_t channels_;
+ double sample_rate_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Tracks element
+class Tracks {  // Owning collection of Track objects, serialized as one Tracks element.
+ public:
+ // Audio and video type defined by the Matroska specs.
+ enum { kVideo = 0x1, kAudio = 0x2 };
+
+ static const char kOpusCodecId[];  // Codec id strings; values are defined out of line.
+ static const char kVorbisCodecId[];
+ static const char kAv1CodecId[];
+ static const char kVp8CodecId[];
+ static const char kVp9CodecId[];
+ static const char kWebVttCaptionsId[];
+ static const char kWebVttDescriptionsId[];
+ static const char kWebVttMetadataId[];
+ static const char kWebVttSubtitlesId[];
+
+ Tracks();
+ ~Tracks();
+
+ // Adds a Track element to the Tracks object. |track| will be owned and
+ // deleted by the Tracks object. Returns true on success. |number| is the
+ // number to use for the track. |number| must be >= 0. If |number| == 0
+ // then the muxer will decide on the track number.
+ bool AddTrack(Track* track, int32_t number);
+
+ // Returns the track at index |idx|. Returns NULL if there is no track match.
+ const Track* GetTrackByIndex(uint32_t idx) const;
+
+ // Search the Tracks and return the track that matches |track_number|.
+ // Returns NULL if there is no track match.
+ Track* GetTrackByNumber(uint64_t track_number) const;
+
+ // Returns true if the track number is an audio track.
+ bool TrackIsAudio(uint64_t track_number) const;
+
+ // Returns true if the track number is a video track.
+ bool TrackIsVideo(uint64_t track_number) const;
+
+ // Output the Tracks element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint32_t track_entries_size() const { return track_entries_size_; }
+
+ private:
+ // Track element list.
+ Track** track_entries_;
+
+ // Number of Track elements added.
+ uint32_t track_entries_size_;
+
+ // Whether or not Tracks element has already been written via IMkvWriter.
+ mutable bool wrote_tracks_;  // mutable so the const Write() can update it.
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapter element
+//
+class Chapter {  // One chapter atom; created and destroyed only by Chapters (private ctor/dtor, friend).
+ public:
+ // Set the identifier for this chapter. (This corresponds to the
+ // Cue Identifier line in WebVTT.)
+ // TODO(matthewjheaney): the actual serialization of this item in
+ // MKV is pending.
+ bool set_id(const char* id);
+
+ // Converts the nanosecond start and stop times of this chapter to
+ // their corresponding timecode values, and stores them that way.
+ void set_time(const Segment& segment, uint64_t start_time_ns,
+ uint64_t end_time_ns);
+
+ // Sets the uid for this chapter. Primarily used to enable
+ // deterministic output from the muxer.
+ void set_uid(const uint64_t uid) { uid_ = uid; }
+
+ // Add a title string to this chapter, per the semantics described
+ // here:
+ // http://www.matroska.org/technical/specs/index.html
+ //
+ // The title ("chapter string") is a UTF-8 string.
+ //
+ // The language has ISO 639-2 representation, described here:
+ // http://www.loc.gov/standards/iso639-2/englangn.html
+ // http://www.loc.gov/standards/iso639-2/php/English_list.php
+ // If you specify NULL as the language value, this implies
+ // English ("eng").
+ //
+ // The country value corresponds to the codes listed here:
+ // http://www.iana.org/domains/root/db/
+ //
+ // The function returns false if the string could not be allocated.
+ bool add_string(const char* title, const char* language, const char* country);
+
+ private:
+ friend class Chapters;
+
+ // For storage of chapter titles that differ by language.
+ class Display {  // No constructor/destructor declared; lifetime is managed via Init()/Clear().
+ public:
+ // Establish representation invariant for new Display object.
+ void Init();
+
+ // Reclaim resources, in anticipation of destruction.
+ void Clear();
+
+ // Copies |title| to the |title_| member. Returns false on
+ // error.
+ bool set_title(const char* title);
+
+ // Copies |language| to the |language_| member. Returns false
+ // on error.
+ bool set_language(const char* language);
+
+ // Copies |country| to the |country_| member. Returns false on
+ // error.
+ bool set_country(const char* country);
+
+ // If |writer| is non-NULL, serialize the Display sub-element of
+ // the Atom into the stream. Returns the Display element size on
+ // success, 0 if error.
+ uint64_t WriteDisplay(IMkvWriter* writer) const;
+
+ private:
+ char* title_;
+ char* language_;
+ char* country_;
+ };
+
+ Chapter();
+ ~Chapter();
+
+ // Establish the representation invariant for a newly-created
+ // Chapter object. The |seed| parameter is used to create the UID
+ // for this chapter atom.
+ void Init(unsigned int* seed);
+
+ // Copies this Chapter object to a different one. This is used when
+ // expanding a plain array of Chapter objects (see Chapters).
+ void ShallowCopy(Chapter* dst) const;
+
+ // Reclaim resources used by this Chapter object, pending its
+ // destruction.
+ void Clear();
+
+ // If there is no storage remaining on the |displays_| array for a
+ // new display object, creates a new, longer array and copies the
+ // existing Display objects to the new array. Returns false if the
+ // array cannot be expanded.
+ bool ExpandDisplaysArray();
+
+ // If |writer| is non-NULL, serialize the Atom sub-element into the
+ // stream. Returns the total size of the element on success, 0 if
+ // error.
+ uint64_t WriteAtom(IMkvWriter* writer) const;
+
+ // The string identifier for this chapter (corresponds to WebVTT cue
+ // identifier).
+ char* id_;
+
+ // Start timecode of the chapter.
+ uint64_t start_timecode_;
+
+ // Stop timecode of the chapter.
+ uint64_t end_timecode_;
+
+ // The binary identifier for this chapter.
+ uint64_t uid_;
+
+ // The Atom element can contain multiple Display sub-elements, as
+ // the same logical title can be rendered in different languages.
+ Display* displays_;
+
+ // The physical length (total size) of the |displays_| array.
+ int displays_size_;
+
+ // The logical length (number of active elements) on the |displays_|
+ // array.
+ int displays_count_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapter);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapters element
+//
+class Chapters {  // Growable array of Chapter objects, serialized as one Chapters element.
+ public:
+ Chapters();
+ ~Chapters();
+
+ Chapter* AddChapter(unsigned int* seed);  // |seed| is presumably forwarded to Chapter::Init; NULL return presumably signals failure (confirm).
+
+ // Returns the number of chapters that have been added.
+ int Count() const;
+
+ // Output the Chapters element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ private:
+ // Expands the chapters_ array if there is not enough space to contain
+ // another chapter object. Returns true on success.
+ bool ExpandChaptersArray();
+
+ // If |writer| is non-NULL, serialize the Edition sub-element of the
+ // Chapters element into the stream. Returns the Edition element
+ // size on success, 0 if error.
+ uint64_t WriteEdition(IMkvWriter* writer) const;
+
+ // Total length of the chapters_ array.
+ int chapters_size_;
+
+ // Number of active chapters on the chapters_ array.
+ int chapters_count_;
+
+ // Array for storage of chapter objects.
+ Chapter* chapters_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapters);
+};
+
+///////////////////////////////////////////////////////////////
+// Tag element
+//
+class Tag {  // One Tag holding a growable array of SimpleTag name/string pairs.
+ public:
+ bool add_simple_tag(const char* tag_name, const char* tag_string);  // Presumably returns false if the pair cannot be stored; confirm in the definition.
+
+ private:
+ // Tags calls Clear and the destructor of Tag
+ friend class Tags;
+
+ // For storage of simple tags
+ class SimpleTag {  // No constructor/destructor declared; lifetime is managed via Init()/Clear().
+ public:
+ // Establish representation invariant for new SimpleTag object.
+ void Init();
+
+ // Reclaim resources, in anticipation of destruction.
+ void Clear();
+
+ // Copies |tag_name| to the |tag_name_| member. Returns false on
+ // error.
+ bool set_tag_name(const char* tag_name);
+
+ // Copies |tag_string| to the |tag_string_| member. Returns false
+ // on error.
+ bool set_tag_string(const char* tag_string);
+
+ // If |writer| is non-NULL, serialize the SimpleTag sub-element of
+ // the Tag into the stream. Returns the SimpleTag element size on
+ // success, 0 if error.
+ uint64_t Write(IMkvWriter* writer) const;
+
+ private:
+ char* tag_name_;
+ char* tag_string_;
+ };
+
+ Tag();
+ ~Tag();
+
+ // Copies this Tag object to a different one. This is used when
+ // expanding a plain array of Tag objects (see Tags).
+ void ShallowCopy(Tag* dst) const;
+
+ // Reclaim resources used by this Tag object, pending its
+ // destruction.
+ void Clear();
+
+ // If there is no storage remaining on the |simple_tags_| array for a
+ // new SimpleTag object, creates a new, longer array and copies the
+ // existing SimpleTag objects to the new array. Returns false if the
+ // array cannot be expanded.
+ bool ExpandSimpleTagsArray();
+
+ // If |writer| is non-NULL, serialize the Tag sub-element into the
+ // stream. Returns the total size of the element on success, 0 if
+ // error.
+ uint64_t Write(IMkvWriter* writer) const;
+
+ // The Tag element can contain multiple SimpleTag sub-elements
+ SimpleTag* simple_tags_;
+
+ // The physical length (total size) of the |simple_tags_| array.
+ int simple_tags_size_;
+
+ // The logical length (number of active elements) on the |simple_tags_|
+ // array.
+ int simple_tags_count_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+//
+class Tags {  // Growable array of Tag objects, serialized as one Tags element.
+ public:
+ Tags();
+ ~Tags();
+
+ Tag* AddTag();  // NULL return presumably signals that the array could not grow; confirm in the definition.
+
+ // Returns the number of tags that have been added.
+ int Count() const;
+
+ // Output the Tags element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ private:
+ // Expands the tags_ array if there is not enough space to contain
+ // another tag object. Returns true on success.
+ bool ExpandTagsArray();
+
+ // Total length of the tags_ array.
+ int tags_size_;
+
+ // Number of active tags on the tags_ array.
+ int tags_count_;
+
+ // Array for storage of tag objects.
+ Tag* tags_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
+
+///////////////////////////////////////////////////////////////
+// Cluster element
+//
+// Notes:
+// |Init| must be called before any other method in this class.
+class Cluster {  // Writes one cluster's header and frames through an IMkvWriter.
+ public:
+ // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
+ // position for the cluster within the segment that should be written in
+ // the cues element. |timecode_scale| is the timecode scale of the segment.
+ // Both boolean options default to false.
+ Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+ bool write_last_frame_with_duration = false,
+ bool fixed_size_timecode = false);
+ ~Cluster();
+
+ bool Init(IMkvWriter* ptr_writer);  // |ptr_writer| is not owned (see |writer_|).
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ bool AddFrame(const Frame* frame);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timecode, // timecode units (absolute)
+ bool is_key);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // additional: Pointer to the additional data
+ // additional_length: Length of the additional data
+ // add_id: Value of BlockAddID element
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // abs_timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length, uint64_t add_id,
+ uint64_t track_number, uint64_t abs_timecode,
+ bool is_key);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // discard_padding: DiscardPadding element value.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // abs_timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number, uint64_t abs_timecode,
+ bool is_key);
+
+ // Writes a frame of metadata to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // timecode: Absolute (not relative to cluster) timestamp of the
+ // metadata frame, expressed in timecode units.
+ // duration: Duration of metadata frame, in timecode units.
+ //
+ // The metadata frame is written as a block group, with a duration
+ // sub-element but no reference time sub-elements (indicating that
+ // it is considered a keyframe, per Matroska semantics).
+ bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timecode, uint64_t duration);
+
+ // Increments the size of the cluster's data in bytes.
+ void AddPayloadSize(uint64_t size);
+
+ // Closes the cluster so no more data can be written to it. Will update the
+ // cluster's size if |writer_| is seekable. Returns true on success. This
+ // variant of Finalize() fails when |write_last_frame_with_duration_| is set
+ // to true.
+ bool Finalize();
+
+ // Closes the cluster so no more data can be written to it. Will update the
+ // cluster's size if |writer_| is seekable. Returns true on success.
+ // Inputs:
+ // set_last_frame_duration: Boolean indicating whether or not the duration
+ // of the last frame should be set. If set to
+ // false, the |duration| value is ignored and
+ // |write_last_frame_with_duration_| will not be
+ // honored.
+ // duration: Duration of the Cluster in timecode scale.
+ bool Finalize(bool set_last_frame_duration, uint64_t duration);
+
+ // Returns the size in bytes for the entire Cluster element.
+ uint64_t Size() const;
+
+ // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+ // Returns -1 on failure, or a relative timecode.
+ int64_t GetRelativeTimecode(int64_t abs_timecode) const;
+
+ int64_t size_position() const { return size_position_; }
+ int32_t blocks_added() const { return blocks_added_; }
+ uint64_t payload_size() const { return payload_size_; }
+ int64_t position_for_cues() const { return position_for_cues_; }
+ uint64_t timecode() const { return timecode_; }
+ uint64_t timecode_scale() const { return timecode_scale_; }
+ void set_write_last_frame_with_duration(bool write_last_frame_with_duration) {
+ write_last_frame_with_duration_ = write_last_frame_with_duration;
+ }
+ bool write_last_frame_with_duration() const {
+ return write_last_frame_with_duration_;
+ }
+
+ private:
+ // Iterator type for the |stored_frames_| map.
+ typedef std::map<uint64_t, std::list<Frame*> >::iterator FrameMapIterator;
+
+ // Utility method that confirms that blocks can still be added, and that the
+ // cluster header has been written. Used by |DoWriteFrame*|. Returns true
+ // when successful.
+ bool PreWriteBlock();
+
+ // Utility method used by the |DoWriteFrame*| methods that handles the book
+ // keeping required after each block is written.
+ void PostWriteBlock(uint64_t element_size);
+
+ // Does some verification and calls WriteFrame.
+ bool DoWriteFrame(const Frame* const frame);
+
+ // Either holds back the given frame, or writes it out depending on whether or
+ // not |write_last_frame_with_duration_| is set.
+ bool QueueOrWriteFrame(const Frame* const frame);
+
+ // Outputs the Cluster header to |writer_|. Returns true on success.
+ bool WriteClusterHeader();
+
+ // Number of blocks added to the cluster.
+ int32_t blocks_added_;
+
+ // Flag telling if the cluster has been closed.
+ bool finalized_;
+
+ // Flag indicating whether the cluster's timecode will always be written out
+ // using 8 bytes.
+ bool fixed_size_timecode_;
+
+ // Flag telling if the cluster's header has been written.
+ bool header_written_;
+
+ // The size of the cluster elements in bytes.
+ uint64_t payload_size_;
+
+ // The file position used for cue points. Fixed at construction.
+ const int64_t position_for_cues_;
+
+ // The file position of the cluster's size element.
+ int64_t size_position_;
+
+ // The absolute timecode of the cluster. Fixed at construction.
+ const uint64_t timecode_;
+
+ // The timecode scale of the Segment containing the cluster. Fixed at
+ // construction.
+ const uint64_t timecode_scale_;
+
+ // Flag indicating whether the last frame of the cluster should be written as
+ // a Block with Duration. If set to true, then it will result in holding back
+ // of frames and the parameterized version of Finalize() must be called to
+ // finish writing the Cluster.
+ bool write_last_frame_with_duration_;
+
+ // Map used to hold back frames, if required. Track number is the key.
+ std::map<uint64_t, std::list<Frame*> > stored_frames_;
+
+ // Map from track number to the timestamp of the last block written for that
+ // track.
+ std::map<uint64_t, uint64_t> last_block_timestamp_;
+
+ // Pointer to the writer object. Not owned by this class.
+ IMkvWriter* writer_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cluster);
+};
+
+///////////////////////////////////////////////////////////////
+// SeekHead element. Holds up to |kSeekEntryCount| seek entries (id/position pairs).
+class SeekHead {
+ public:
+ SeekHead();
+ ~SeekHead();
+
+ // TODO(fgalligan): Change this to reserve a certain size. Then check how
+ // big the seek entry to be added is as not every seek entry will be the
+ // maximum size it could be.
+ // Adds a seek entry to be written out when the element is finalized. |id|
+ // must be the coded mkv element id. |pos| is the file position of the
+ // element. Returns true on success.
+ bool AddSeekEntry(uint32_t id, uint64_t pos);
+
+ // Writes out SeekHead and SeekEntry elements. Returns true on success.
+ bool Finalize(IMkvWriter* writer) const;
+
+ // Returns the id of the Seek Entry at the given index. Returns -1 (which,
+ // given the unsigned return type, reads back as the maximum value) if index is out of range.
+ uint32_t GetId(int index) const;
+
+ // Returns the position of the Seek Entry at the given index. Returns -1 (which,
+ // given the unsigned return type, reads back as the maximum value) if index is out of range.
+ uint64_t GetPosition(int index) const;
+
+ // Sets the Seek Entry id and position at given index.
+ // Returns true on success.
+ bool SetSeekEntry(int index, uint32_t id, uint64_t position);
+
+ // Reserves space by writing out a Void element which will be updated with
+ // a SeekHead element later. Returns true on success.
+ bool Write(IMkvWriter* writer);
+
+ // Cap on the number of Seek Entries; also the length of the id/pos arrays below.
+ constexpr static int32_t kSeekEntryCount = 5;
+
+ private:
+ // Returns the maximum size in bytes of one seek entry.
+ uint64_t MaxEntrySize() const;
+
+ // Seek entry id element list.
+ uint32_t seek_entry_id_[kSeekEntryCount];
+
+ // Seek entry pos element list.
+ uint64_t seek_entry_pos_[kSeekEntryCount];
+
+ // The file position of SeekHead element.
+ int64_t start_pos_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
+};
+
+///////////////////////////////////////////////////////////////
+// Segment Information element: duration, timecode scale, muxing/writing app names and DateUTC.
+class SegmentInfo {
+ public:
+ SegmentInfo();
+ ~SegmentInfo();
+
+ // Will update the duration if |duration_| is > 0.0. Returns true on success.
+ bool Finalize(IMkvWriter* writer) const;
+
+ // Sets |muxing_app_| and |writing_app_|.
+ bool Init();
+
+ // Outputs the Segment Information element to the writer. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer);
+
+ void set_duration(double duration) { duration_ = duration; }
+ double duration() const { return duration_; }
+ void set_muxing_app(const char* app);
+ const char* muxing_app() const { return muxing_app_; }
+ void set_timecode_scale(uint64_t scale) { timecode_scale_ = scale; }
+ uint64_t timecode_scale() const { return timecode_scale_; }
+ void set_writing_app(const char* app);
+ const char* writing_app() const { return writing_app_; }
+ void set_date_utc(int64_t date_utc) { date_utc_ = date_utc; }
+ int64_t date_utc() const { return date_utc_; }
+
+ private:
+ // Segment Information element values.
+ // |duration_| is initially set to -1 to signify that a duration has not been set and should
+ // not be written out.
+ double duration_;
+ // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+ char* muxing_app_;
+ uint64_t timecode_scale_;
+ // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+ char* writing_app_;
+ // LLONG_MIN when DateUTC is not set.
+ int64_t date_utc_;
+
+ // The file position of the duration element.
+ int64_t duration_pos_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
+};
+
+///////////////////////////////////////////////////////////////
+// This class represents the main segment in a WebM file. Currently only
+// supports one Segment element.
+//
+// Notes:
+// |Init| must be called before any other method in this class.
+class Segment {
+ public:
+ enum Mode { kLive = 0x1, kFile = 0x2 };
+
+ enum CuesPosition {
+ kAfterClusters = 0x0, // Position Cues after Clusters - Default
+ kBeforeClusters = 0x1 // Position Cues before Clusters
+ };
+
+ static constexpr uint32_t kDefaultDocTypeVersion = 4;
+ static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+
+ Segment();
+ ~Segment();
+
+ // Initializes |SegmentInfo| and returns result. Always returns false when
+ // |ptr_writer| is NULL.
+ bool Init(IMkvWriter* ptr_writer);
+
+ // Adds a generic track to the segment. Returns the newly-allocated
+ // track object (which is owned by the segment) on success, NULL on
+ // error. |number| is the number to use for the track. |number|
+ // must be >= 0. If |number| == 0 then the muxer will decide on the
+ // track number.
+ Track* AddTrack(int32_t number);
+
+ // Adds a Vorbis audio track to the segment. Returns the number of the track
+ // on success, 0 on error. |number| is the number to use for the audio track.
+ // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+ // the track number.
+ uint64_t AddAudioTrack(int32_t sample_rate, int32_t channels, int32_t number);
+
+ // Adds an empty chapter to the chapters of this segment. Returns
+ // non-NULL on success. After adding the chapter, the caller should
+ // populate its fields via the Chapter member functions.
+ Chapter* AddChapter();
+
+ // Adds an empty tag to the tags of this segment. Returns
+ // non-NULL on success. After adding the tag, the caller should
+ // populate its fields via the Tag member functions.
+ Tag* AddTag();
+
+ // Adds a cue point to the Cues element. |timestamp| is the time in
+ // nanoseconds of the cue's time. |track| is the Track of the Cue. This
+ // function must be called after AddFrame to calculate the correct
+ // BlockNumber for the CuePoint. Returns true on success.
+ bool AddCuePoint(uint64_t timestamp, uint64_t track);
+
+ // Adds a frame to be output in the file. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp_ns: Timestamp of the frame in nanoseconds from 0.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timestamp_ns, bool is_key);
+
+ // Writes a frame of metadata to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp_ns: Absolute timestamp of the metadata frame, expressed
+ // in nanosecond units.
+ // duration_ns: Duration of metadata frame, in nanosecond units.
+ //
+ // The metadata frame is written as a block group, with a duration
+ // sub-element but no reference time sub-elements (indicating that
+ // it is considered a keyframe, per Matroska semantics).
+ bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timestamp_ns, uint64_t duration_ns);
+
+ // Writes a frame with additional data to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // additional: Pointer to additional data.
+ // additional_length: Length of additional data.
+ // add_id: Additional ID which identifies the type of additional data.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp: Absolute timestamp of the frame, expressed in nanosecond
+ // units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length, uint64_t add_id,
+ uint64_t track_number, uint64_t timestamp,
+ bool is_key);
+
+ // Writes a frame with DiscardPadding to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // discard_padding: DiscardPadding element value.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp: Absolute timestamp of the frame, expressed in nanosecond
+ // units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number, uint64_t timestamp,
+ bool is_key);
+
+ // Writes a Frame to the output medium. Chooses the correct way of writing
+ // the frame (Block vs SimpleBlock) based on the parameters passed.
+ // Inputs:
+ // frame: frame object
+ bool AddGenericFrame(const Frame* frame);
+
+ // Adds a VP8 video track to the segment. Returns the number of the track on
+ // success, 0 on error. |number| is the number to use for the video track.
+ // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+ // the track number.
+ uint64_t AddVideoTrack(int32_t width, int32_t height, int32_t number);
+
+ // This function must be called after Finalize() if you need a copy of the
+ // output with Cues written before the Clusters. It will return false if the
+ // writer is not seekable or if chunking is set to true.
+ // Input parameters:
+ // reader - an IMkvReader object created with the same underlying file of the
+ // current writer object. Make sure to close the existing writer
+ // object before creating this so that all the data is properly
+ // flushed and available for reading.
+ // writer - an IMkvWriter object pointing to a *different* file than the one
+ // pointed by the current writer object. This file will contain the
+ // Cues element before the Clusters.
+ bool CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+ IMkvWriter* writer);
+
+ // Sets which track to use for the Cues element. Must have added the track
+ // before calling this function. Returns true on success. |track_number| is
+ // returned by the Add track functions.
+ bool CuesTrack(uint64_t track_number);
+
+ // This will force the muxer to create a new Cluster when the next frame is
+ // added.
+ void ForceNewClusterOnNextFrame();
+
+ // Writes out any frames that have not been written out. Finalizes the last
+ // cluster. May update the size and duration of the segment. May output the
+ // Cues element. May finalize the SeekHead element. Returns true on success.
+ bool Finalize();
+
+ // Returns the Cues object.
+ Cues* GetCues() { return &cues_; }
+
+ // Returns the Segment Information object.
+ const SegmentInfo* GetSegmentInfo() const { return &segment_info_; }
+ SegmentInfo* GetSegmentInfo() { return &segment_info_; }
+
+ // Search the Tracks and return the track that matches |track_number|.
+ // Returns NULL if there is no track match.
+ Track* GetTrackByNumber(uint64_t track_number) const;
+
+ // Toggles whether to output a cues element.
+ void OutputCues(bool output_cues);
+
+ // Toggles whether to write the last frame in each Cluster with Duration.
+ void AccurateClusterDuration(bool accurate_cluster_duration);
+
+ // Toggles whether to write the Cluster Timecode using exactly 8 bytes.
+ void UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode);
+
+ // Sets if the muxer will output files in chunks or not. |chunking| is a
+ // flag telling whether or not to turn on chunking. |filename| is the base
+ // filename for the chunk files. The header chunk file will be named
+ // |filename|.hdr and the data chunks will be named
+ // |filename|_XXXXXX.chk. Chunking implies that the muxer will be writing
+ // to files so the muxer will use the default MkvWriter class to control
+ // what data is written to what files. Returns true on success.
+ // TODO: Should we change the IMkvWriter Interface to add Open and Close?
+ // That will force the interface to be dependent on files.
+ bool SetChunking(bool chunking, const char* filename);
+
+ bool chunking() const { return chunking_; }
+ uint64_t cues_track() const { return cues_track_; }
+ void set_max_cluster_duration(uint64_t max_cluster_duration) {
+ max_cluster_duration_ = max_cluster_duration;
+ }
+ uint64_t max_cluster_duration() const { return max_cluster_duration_; }
+ void set_max_cluster_size(uint64_t max_cluster_size) {
+ max_cluster_size_ = max_cluster_size;
+ }
+ uint64_t max_cluster_size() const { return max_cluster_size_; }
+ void set_mode(Mode mode) { mode_ = mode; }
+ Mode mode() const { return mode_; }
+ CuesPosition cues_position() const { return cues_position_; }
+ bool output_cues() const { return output_cues_; }
+ void set_estimate_file_duration(bool estimate_duration) {
+ estimate_file_duration_ = estimate_duration;
+ }
+ bool estimate_file_duration() const { return estimate_file_duration_; }
+ const SegmentInfo* segment_info() const { return &segment_info_; }
+ void set_duration(double duration) { duration_ = duration; }
+ double duration() const { return duration_; }
+
+ // Returns true when codec IDs are valid for WebM.
+ bool DocTypeIsWebm() const;
+
+ private:
+ // Checks if header information has been output and initialized. If not it
+ // will output the Segment element and initialize the SeekHead element and
+ // Cues elements.
+ bool CheckHeaderInfo();
+
+ // Sets |doc_type_version_| based on the current element requirements.
+ void UpdateDocTypeVersion();
+
+ // Sets |name| according to how many chunks have been written. |ext| is the
+ // file extension. |name| must be deleted by the calling app. Returns true
+ // on success.
+ bool UpdateChunkName(const char* ext, char** name) const;
+
+ // Returns the maximum offset within the segment's payload. When chunking
+ // this function is needed to determine offsets of elements within the
+ // chunked files. Returns -1 on error.
+ int64_t MaxOffset();
+
+ // Adds the frame to our frame array.
+ bool QueueFrame(Frame* frame);
+
+ // Output all frames that are queued. Returns -1 on error, otherwise
+ // it returns the number of frames written.
+ int WriteFramesAll();
+
+ // Output all frames that are queued that have an end time that is less
+ // than |timestamp|. Returns true on success and if there are no frames
+ // queued.
+ bool WriteFramesLessThan(uint64_t timestamp);
+
+ // Outputs the segment header, Segment Information element, SeekHead element,
+ // and Tracks element to |writer_|.
+ bool WriteSegmentHeader();
+
+ // Given a frame with the specified timestamp (nanosecond units) and
+ // keyframe status, determine whether a new cluster should be
+ // created, before writing enqueued frames and the frame itself. The
+ // function returns one of the following values:
+ // -1 = error: an out-of-order frame was detected
+ // 0 = do not create a new cluster, and write frame to the existing cluster
+ // 1 = create a new cluster, and write frame to that new cluster
+ // 2 = create a new cluster, and re-run test
+ int TestFrame(uint64_t track_num, uint64_t timestamp_ns, bool key) const;
+
+ // Create a new cluster, using the earlier of the first enqueued
+ // frame, or the indicated time. Returns true on success.
+ bool MakeNewCluster(uint64_t timestamp_ns);
+
+ // Checks whether a new cluster needs to be created, and if so
+ // creates a new cluster. Returns false if creation of a new cluster
+ // was necessary but creation was not successful.
+ bool DoNewClusterProcessing(uint64_t track_num, uint64_t timestamp_ns,
+ bool key);
+
+ // Adjusts Cue Point values (to place Cues before Clusters) so that they
+ // reflect the correct offsets.
+ void MoveCuesBeforeClusters();
+
+ // This function recursively computes the correct cluster offsets (this is
+ // done to move the Cues before Clusters). It recursively updates the change
+ // in size (which indicates a change in cluster offset) until no sizes change.
+ // Parameters:
+ // diff - indicates the difference in size of the Cues element that needs to
+ // be accounted for.
+ // index - index in the list of Cues which is currently being adjusted.
+ // cue_size - sum of size of all the CuePoint elements.
+ void MoveCuesBeforeClustersHelper(uint64_t diff, int index,
+ uint64_t* cue_size);
+
+ // Seeds the random number generator used to make UIDs.
+ unsigned int seed_;
+
+ // WebM elements
+ Cues cues_;
+ SeekHead seek_head_;
+ SegmentInfo segment_info_;
+ Tracks tracks_;
+ Chapters chapters_;
+ Tags tags_;
+
+ // Number of chunks written.
+ int chunk_count_;
+
+ // Current chunk filename.
+ char* chunk_name_;
+
+ // Default MkvWriter object created by this class used for writing clusters
+ // out in separate files.
+ MkvWriter* chunk_writer_cluster_;
+
+ // Default MkvWriter object created by this class used for writing Cues
+ // element out to a file.
+ MkvWriter* chunk_writer_cues_;
+
+ // Default MkvWriter object created by this class used for writing the
+ // Matroska header out to a file.
+ MkvWriter* chunk_writer_header_;
+
+ // Flag telling whether or not the muxer is chunking output to multiple
+ // files.
+ bool chunking_;
+
+ // Base filename for the chunked files.
+ char* chunking_base_name_;
+
+ // File position offset where the Clusters end.
+ int64_t cluster_end_offset_;
+
+ // List of clusters.
+ Cluster** cluster_list_;
+
+ // Number of cluster pointers allocated in the cluster list.
+ int32_t cluster_list_capacity_;
+
+ // Number of clusters in the cluster list.
+ int32_t cluster_list_size_;
+
+ // Indicates whether Cues should be written before or after Clusters.
+ CuesPosition cues_position_;
+
+ // Track number that is associated with the cues element for this segment.
+ uint64_t cues_track_;
+
+ // Tells the muxer to force a new cluster on the next Block.
+ bool force_new_cluster_;
+
+ // List of stored audio frames. These variables are used to store frames so
+ // the muxer can follow the guideline "Audio blocks that contain the video
+ // key frame's timecode should be in the same cluster as the video key frame
+ // block."
+ Frame** frames_;
+
+ // Number of frame pointers allocated in the frame list.
+ int32_t frames_capacity_;
+
+ // Number of frames in the frame list.
+ int32_t frames_size_;
+
+ // Flag telling if a video track has been added to the segment.
+ bool has_video_;
+
+ // Flag telling if the segment's header has been written.
+ bool header_written_;
+
+ // Duration of the last block in nanoseconds.
+ uint64_t last_block_duration_;
+
+ // Last timestamp in nanoseconds added to a cluster.
+ uint64_t last_timestamp_;
+
+ // Last timestamp in nanoseconds by track number added to a cluster.
+ uint64_t last_track_timestamp_[kMaxTrackNumber];
+
+ // Number of frames written per track.
+ uint64_t track_frames_written_[kMaxTrackNumber];
+
+ // Maximum time in nanoseconds for a cluster duration. This variable is a
+ // guideline and some clusters may have a longer duration. Default is 30
+ // seconds.
+ uint64_t max_cluster_duration_;
+
+ // Maximum size in bytes for a cluster. This variable is a guideline and
+ // some clusters may have a larger size. Default is 0 which signifies that
+ // the muxer will decide the size.
+ uint64_t max_cluster_size_;
+
+ // The mode that segment is in. If set to |kLive| the writer must not
+ // seek backwards.
+ Mode mode_;
+
+ // Flag telling the muxer that a new cue point should be added.
+ bool new_cuepoint_;
+
+ // TODO(fgalligan): Should we add support for more than one Cues element?
+ // Flag whether or not the muxer should output a Cues element.
+ bool output_cues_;
+
+ // Flag whether or not the last frame in each Cluster will have a Duration
+ // element in it.
+ bool accurate_cluster_duration_;
+
+ // Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
+ bool fixed_size_cluster_timecode_;
+
+ // Flag whether or not to estimate the file duration.
+ bool estimate_file_duration_;
+
+ // The size of the EBML header, used to validate the header if
+ // WriteEbmlHeader() is called more than once.
+ int32_t ebml_header_size_;
+
+ // The file position of the segment's payload.
+ int64_t payload_pos_;
+
+ // The file position of the element's size.
+ int64_t size_position_;
+
+ // Current DocTypeVersion (|doc_type_version_|) and that written in
+ // WriteSegmentHeader().
+ // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
+ // differs from |doc_type_version_written_|.
+ uint32_t doc_type_version_;
+ uint32_t doc_type_version_written_;
+
+ // If |duration_| is > 0, then explicitly set the duration of the segment.
+ double duration_;
+
+ // Pointer to the writer objects. Not owned by this class.
+ IMkvWriter* writer_cluster_;
+ IMkvWriter* writer_cues_;
+ IMkvWriter* writer_header_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
+};
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVMUXER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
new file mode 100644
index 0000000000..e5db121605
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
+
+namespace mkvmuxer {
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+} // namespace mkvmuxer
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions by declaring them without a definition.
+// This should be used in the private: declarations for a class.
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName&); \
+ void operator=(const TypeName&)
+
+#endif // MKVMUXER_MKVMUXERTYPES_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
new file mode 100644
index 0000000000..300b155797
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -0,0 +1,743 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxerutil.h"
+
+#ifdef __ANDROID__
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
+
+namespace mkvmuxer {
+
+namespace {
+
+// Payload size in bytes of a Date element; Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+// Writes |frame| to |writer| as a BlockGroup element: a Block followed by the
+// optional BlockAdditions, DiscardPadding, ReferenceBlock and BlockDuration
+// children, in that order.
+//
+// |timecode| is serialized as the 2-byte timecode field of the Block.
+// |timecode_scale| divides the frame's reference-block timestamp and duration
+// before they are written; it is not checked against zero here.
+//
+// Returns the total size in bytes of the BlockGroup (element header plus
+// payload) on success, or 0 on any write failure.
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+                  uint64 timecode_scale) {
+  // Sizes of the optional BlockAdditions subtree; they stay 0 when the frame
+  // carries no additional data.
+  uint64 block_additional_elem_size = 0;
+  uint64 block_addid_elem_size = 0;
+  uint64 block_more_payload_size = 0;
+  uint64 block_more_elem_size = 0;
+  uint64 block_additions_payload_size = 0;
+  uint64 block_additions_elem_size = 0;
+  if (frame->additional()) {
+    block_additional_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
+                        frame->additional_length());
+    block_addid_elem_size = EbmlElementSize(
+        libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id()));
+
+    block_more_payload_size =
+        block_addid_elem_size + block_additional_elem_size;
+    block_more_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
+        block_more_payload_size;
+    block_additions_payload_size = block_more_elem_size;
+    block_additions_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
+                              block_additions_payload_size) +
+        block_additions_payload_size;
+  }
+
+  uint64 discard_padding_elem_size = 0;
+  if (frame->discard_padding() != 0) {
+    discard_padding_elem_size =
+        EbmlElementSize(libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()));
+  }
+
+  // A ReferenceBlock is only emitted for non-key frames.
+  const uint64 reference_block_timestamp =
+      frame->reference_block_timestamp() / timecode_scale;
+  uint64 reference_block_elem_size = 0;
+  if (!frame->is_key()) {
+    reference_block_elem_size =
+        EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
+  }
+
+  // A BlockDuration is only emitted when the scaled duration is non-zero.
+  const uint64 duration = frame->duration() / timecode_scale;
+  uint64 block_duration_elem_size = 0;
+  if (duration > 0)
+    block_duration_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockDuration, duration);
+
+  // 4 header bytes precede the frame data: the coded track number (assumed to
+  // occupy one byte), the 2-byte timecode and the 1-byte flags field.
+  const uint64 block_payload_size = 4 + frame->length();
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
+      block_payload_size;
+
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size + block_duration_elem_size +
+      discard_padding_elem_size + reference_block_elem_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
+                              block_group_payload_size)) {
+    return 0;
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
+    return 0;
+
+  // The low-level writers below report failure with a non-zero status.
+  if (WriteUInt(writer, frame->track_number()))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  // For a Block, flags is always 0.
+  if (SerializeInt(writer, 0, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  if (frame->additional()) {
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
+                                block_additions_payload_size)) {
+      return 0;
+    }
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
+                                block_more_payload_size))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID,
+                          static_cast<uint64>(frame->add_id())))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
+                          frame->additional(), frame->additional_length())) {
+      return 0;
+    }
+  }
+
+  // The failure paths below return 0 (a byte count), matching the rest of the
+  // function, rather than a bool converted to the uint64 return type.
+  if (frame->discard_padding() != 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()))) {
+    return 0;
+  }
+
+  if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+                                            reference_block_timestamp)) {
+    return 0;
+  }
+
+  if (duration > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
+    return 0;
+  }
+  return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
+                               block_group_payload_size) +
+         block_group_payload_size;
+}
+
+// Writes |frame| to |writer| as a SimpleBlock element: element ID, coded
+// payload size, coded track number, 2-byte |timecode|, 1-byte flags (0x80 for
+// key frames, 0 otherwise) and finally the frame data.
+// Returns the number of bytes the element occupies, or 0 on a write failure.
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                        int64 timecode) {
+  if (WriteID(writer, libwebm::kMkvSimpleBlock) != 0)
+    return 0;
+
+  // Payload = 4 header bytes + frame data.
+  const int32 block_size = static_cast<int32>(frame->length()) + 4;
+  if (WriteUInt(writer, block_size) != 0)
+    return 0;
+
+  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())) != 0)
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2) != 0)
+    return 0;
+
+  const uint64 block_flags = frame->is_key() ? 0x80 : 0;
+  if (SerializeInt(writer, block_flags, 1) != 0)
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())) != 0)
+    return 0;
+
+  return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(block_size) +
+         4 + frame->length();
+}
+
+} // namespace
+
+// Returns the number of bytes (1-8) needed to store |value| as an EBML coded
+// unsigned integer. An n-byte coded integer carries 7*n value bits, and the
+// all-ones pattern is excluded, so n bytes hold values below 2^(7n) - 1.
+int32 GetCodedUIntSize(uint64 value) {
+  for (int32 bytes = 1; bytes < 8; ++bytes) {
+    const uint64 limit = (1ULL << (7 * bytes)) - 1;
+    if (value < limit)
+      return bytes;
+  }
+  return 8;
+}
+
+// Returns the minimum number of whole bytes (1-8) needed to hold |value| as a
+// plain big-endian unsigned integer: n bytes hold values below 2^(8n).
+int32 GetUIntSize(uint64 value) {
+  for (int32 bytes = 1; bytes < 8; ++bytes) {
+    if (value < (1ULL << (8 * bytes)))
+      return bytes;
+  }
+  return 8;
+}
+
+// Returns the number of bytes needed to store the signed integer |value|.
+int32 GetIntSize(int64 value) {
+  // Negative values are mapped to their bitwise complement so that only the
+  // magnitude bits are measured.
+  uint64 magnitude;
+  if (value < 0)
+    magnitude = static_cast<uint64>(~value);
+  else
+    magnitude = static_cast<uint64>(value);
+
+  // Doubling reserves room for the sign bit: positive values whose top bit is
+  // set get a zero pad byte instead of being read back as negative.
+  return GetUIntSize(2 * magnitude);
+}
+
+// Returns the size in bytes of a master element's header: the EBML ID |type|
+// plus the coded size field that stores the payload size |value|.
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+  const int32 id_bytes = GetUIntSize(type);
+  const int32 size_field_bytes = GetCodedUIntSize(value);
+  return id_bytes + size_field_bytes;
+}
+
+// Returns the total size in bytes of a signed-integer element: EBML ID,
+// one-byte size field, and the integer payload.
+uint64 EbmlElementSize(uint64 type, int64 value) {
+  const int32 id_bytes = GetUIntSize(type);
+  const int32 payload_bytes = GetIntSize(value);
+  const int32 size_field_bytes = 1;
+  return id_bytes + payload_bytes + size_field_bytes;
+}
+
+// Returns the total size in bytes of an unsigned-integer element whose
+// payload uses the minimal width (no fixed size requested).
+uint64 EbmlElementSize(uint64 type, uint64 value) {
+  const uint64 no_fixed_size = 0;
+  return EbmlElementSize(type, value, no_fixed_size);
+}
+
+// Returns the total size in bytes of an unsigned-integer element: EBML ID,
+// one-byte size field, and a payload of |fixed_size| bytes when that is
+// non-zero, otherwise the minimal width of |value|.
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) {
+  uint64 payload_bytes;
+  if (fixed_size > 0)
+    payload_bytes = fixed_size;
+  else
+    payload_bytes = GetUIntSize(value);
+
+  const uint64 id_bytes = GetUIntSize(type);
+  return id_bytes + payload_bytes + 1;  // +1 for the size field.
+}
+
+// Returns the total size in bytes of a float element: EBML ID, one-byte size
+// field, and a sizeof(float) payload. The value itself does not affect the
+// size.
+uint64 EbmlElementSize(uint64 type, float /* value */) {
+  const uint64 id_bytes = GetUIntSize(type);
+  const uint64 payload_bytes = sizeof(float);
+  return id_bytes + payload_bytes + 1;  // +1 for the size field.
+}
+
+// Returns the total size in bytes of a string element: EBML ID, coded size
+// field, and the string bytes without the terminating NUL. Returns 0 when
+// |value| is NULL.
+uint64 EbmlElementSize(uint64 type, const char* value) {
+  if (value == NULL)
+    return 0;
+
+  const uint64 payload_bytes = strlen(value);
+  const uint64 id_bytes = GetUIntSize(type);
+  const uint64 size_field_bytes = GetCodedUIntSize(payload_bytes);
+  return id_bytes + payload_bytes + size_field_bytes;
+}
+
+// Returns the total size in bytes of a binary element: EBML ID, coded size
+// field, and |size| payload bytes. Returns 0 when |value| is NULL.
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+  if (value == NULL)
+    return 0;
+
+  const uint64 id_bytes = GetUIntSize(type);
+  const uint64 size_field_bytes = GetCodedUIntSize(size);
+  return id_bytes + size + size_field_bytes;
+}
+
+// Returns the total size in bytes of a Date element: EBML ID, one-byte size
+// field, and the fixed |kDateElementSize| payload.
+uint64 EbmlDateElementSize(uint64 type) {
+  const uint64 id_bytes = GetUIntSize(type);
+  const uint64 payload_bytes = kDateElementSize;
+  return id_bytes + payload_bytes + 1;  // +1 for the size field.
+}
+
+// Writes the low |size| bytes of |value| to |writer|, most-significant byte
+// first. Returns 0 on success, -1 when |writer| is NULL or |size| is outside
+// [1, 8], or the negative status reported by |writer->Write|.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+  if (!writer || size < 1 || size > 8)
+    return -1;
+
+  // Walk from the highest requested byte down to byte 0.
+  for (int32 shift = (size - 1) * 8; shift >= 0; shift -= 8) {
+    const int64 shifted = value >> shift;
+    const uint8 out = static_cast<uint8>(shifted);
+
+    const int32 result = writer->Write(&out, 1);
+    if (result < 0)
+      return result;
+  }
+
+  return 0;
+}
+
+// Writes the 32-bit object representation of |f| to |writer| as 4 bytes,
+// most-significant byte first. Returns 0 on success, -1 when |writer| is
+// NULL, or the negative status reported by |writer->Write|.
+int32 SerializeFloat(IMkvWriter* writer, float f) {
+  if (!writer)
+    return -1;
+
+  assert(sizeof(uint32) == sizeof(float));
+  // Copy the bytes with memcpy rather than reading the inactive member of a
+  // union or casting float& to uint32&: both of those are undefined behaviour
+  // in C++, whereas memcpy between same-sized trivially copyable objects is
+  // well defined and yields the same bits.
+  uint32 bits = 0;
+  memcpy(&bits, &f, sizeof(bits));
+
+  for (int32 shift = 24; shift >= 0; shift -= 8) {
+    const uint8 byte = static_cast<uint8>(bits >> shift);
+
+    const int32 status = writer->Write(&byte, 1);
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+// Writes |value| to |writer| as an EBML coded unsigned integer using the
+// width reported by GetCodedUIntSize(). Returns 0 on success and a non-zero
+// status on failure (-1 when |writer| is NULL).
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+  if (writer == NULL)
+    return -1;
+
+  return WriteUIntSize(writer, value, GetCodedUIntSize(value));
+}
+
+// Writes |value| as an EBML coded (variable-length) unsigned integer.
+//
+// If |size| is in [1, 8], exactly that many bytes are used and the call fails
+// when |value| does not fit. If |size| is 0, the smallest width able to hold
+// |value| is chosen. Returns 0 on success and a negative value on failure
+// (NULL |writer|, |size| out of range, |value| too large for the requested or
+// maximum width, or a write error reported by SerializeInt()).
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+  if (!writer || size < 0 || size > 8)
+    return -1;
+
+  const int32 kMaxCodedSize = 8;
+  uint64 length_marker = 0;
+
+  if (size > 0) {
+    length_marker = static_cast<uint64>(1) << (size * 7);
+
+    // The largest value accepted for a given width is marker - 2; marker - 1
+    // (all payload bits set) is rejected, presumably because it is reserved
+    // (cf. kEbmlUnknownValue).
+    if (value > (length_marker - 2))
+      return -1;
+  } else {
+    // Find the smallest width whose payload bits can hold |value|. The search
+    // is bounded so the shift count never reaches the width of the type.
+    for (size = 1; size <= kMaxCodedSize; ++size) {
+      length_marker = static_cast<uint64>(1) << (size * 7);
+
+      if (value <= (length_marker - 2))
+        break;
+    }
+
+    // This function returns a status code: report failure as -1. Returning
+    // `false` here would convert to 0, which callers treat as success.
+    if (size > kMaxCodedSize)
+      return -1;
+  }
+
+  value |= length_marker;
+
+  return SerializeInt(writer, value, size);
+}
+
+// Notifies |writer| that an element with ID |type| starts at the current
+// position, then writes the ID itself. Returns 0 on success, negative on
+// failure.
+int32 WriteID(IMkvWriter* writer, uint64 type) {
+  if (!writer)
+    return -1;
+
+  const int64 element_position = writer->Position();
+  writer->ElementStartNotify(type, element_position);
+
+  return SerializeInt(writer, type, GetUIntSize(type));
+}
+
+// Writes the header of a master element: the ID |type| followed by the coded
+// payload size |size|. Returns true only if both writes succeed.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+  if (!writer)
+    return false;
+
+  // Short-circuit evaluation keeps the size from being written after a
+  // failed ID write.
+  return WriteID(writer, type) == 0 && WriteUInt(writer, size) == 0;
+}
+
+// Writes |value| as an unsigned integer element with ID |type|, letting the
+// payload width be computed from |value|. Returns true on success.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+  // A fixed size of 0 asks the four-argument overload to size the payload
+  // from |value|.
+  const uint64 kNoFixedSize = 0;
+  return WriteEbmlElement(writer, type, value, kNoFixedSize);
+}
+
+// Writes |value| as an unsigned integer element with ID |type|. When
+// |fixed_size| is non-zero the payload uses exactly that many bytes, and the
+// call fails if |value| needs more; when it is zero the width is taken from
+// GetUIntSize(|value|). Returns true on success. Note that the ID has already
+// been written by the time a too-small |fixed_size| is detected.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+                      uint64 fixed_size) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  const uint64 minimum_size = GetUIntSize(value);
+  if (fixed_size > 0 && minimum_size > fixed_size)
+    return false;
+
+  const uint64 payload_size = (fixed_size > 0) ? fixed_size : minimum_size;
+
+  return WriteUInt(writer, payload_size) == 0 &&
+         SerializeInt(writer, value, static_cast<int32>(payload_size)) == 0;
+}
+
+// Writes |value| as a signed integer element with ID |type|, using the width
+// reported by GetIntSize(). Returns true on success.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  const uint64 payload_size = GetIntSize(value);
+
+  return WriteUInt(writer, payload_size) == 0 &&
+         SerializeInt(writer, value, static_cast<int32>(payload_size)) == 0;
+}
+
+// Writes |value| as a float element with ID |type| and a 4-byte payload.
+// Returns true on success.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+  if (!writer)
+    return false;
+
+  // ID, then the payload length (4), then the float bytes; stop at the first
+  // step that fails.
+  return WriteID(writer, type) == 0 && WriteUInt(writer, 4) == 0 &&
+         SerializeFloat(writer, value) == 0;
+}
+
+// Writes the NUL-terminated string |value| as an element with ID |type|. The
+// terminator itself is not written. Returns true on success.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+  if (!writer || !value)
+    return false;
+
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  const uint64 payload_size = strlen(value);
+  if (WriteUInt(writer, payload_size) != 0)
+    return false;
+
+  return writer->Write(value, static_cast<uint32>(payload_size)) == 0;
+}
+
+// Writes |size| bytes from |value| as a binary element with ID |type|.
+// Fails if any pointer is NULL or |size| is 0. Returns true on success.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size) {
+  if (!writer || !value || size < 1)
+    return false;
+
+  if (WriteID(writer, type) != 0)
+    return false;
+
+  if (WriteUInt(writer, size) != 0)
+    return false;
+
+  return writer->Write(value, static_cast<uint32>(size)) == 0;
+}
+
+// Writes |value| as a date element with ID |type| and a payload of
+// |kDateElementSize| bytes. Returns true on success.
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  // ID, payload length, payload; stop at the first step that fails.
+  return WriteID(writer, type) == 0 &&
+         WriteUInt(writer, kDateElementSize) == 0 &&
+         SerializeInt(writer, value, kDateElementSize) == 0;
+}
+
+// Writes |frame| into |cluster| through |writer|, as a SimpleBlock when the
+// frame allows it and as a Block otherwise. Returns the value produced by the
+// block writer, or 0 if an argument is invalid or the frame's
+// cluster-relative timecode is outside [0, kMaxBlockTimecode].
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster) {
+  if (!writer)
+    return 0;
+  if (!frame || !frame->IsValid())
+    return 0;
+  if (!cluster || !cluster->timecode_scale())
+    return 0;
+
+  // Technically the timecode for a block can be less than the
+  // timecode for the cluster itself (remember that block timecode
+  // is a signed, 16-bit integer). However, as a simplification we
+  // only permit non-negative cluster-relative timecodes for blocks.
+  const int64 relative_timecode = cluster->GetRelativeTimecode(
+      frame->timestamp() / cluster->timecode_scale());
+  const bool timecode_in_range =
+      relative_timecode >= 0 && relative_timecode <= kMaxBlockTimecode;
+  if (!timecode_in_range)
+    return 0;
+
+  if (frame->CanBeSimpleBlock())
+    return WriteSimpleBlock(writer, frame, relative_timecode);
+
+  return WriteBlock(writer, frame, relative_timecode,
+                    cluster->timecode_scale());
+}
+
+// Writes a void element that occupies exactly |size| bytes in total (ID,
+// coded size and zero-filled payload). Returns the number of bytes written,
+// which equals |size|, or 0 on failure: NULL |writer|, a |size| that cannot
+// be represented exactly, or any write/position error.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+  if (!writer)
+    return 0;
+
+  // A void element needs at least one byte for the ID and one for the coded
+  // size. Rejecting smaller requests up front keeps the unsigned subtraction
+  // below from wrapping around.
+  if (size < 2)
+    return 0;
+
+  // Subtract one for the void ID and the coded size.
+  const uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  const uint64 void_size =
+      EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+      void_entry_size;
+
+  // Some totals cannot be hit exactly because the coded size field changes
+  // width; refuse those rather than write a different number of bytes.
+  if (void_size != size)
+    return 0;
+
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return 0;
+
+  if (WriteID(writer, libwebm::kMkvVoid))
+    return 0;
+
+  if (WriteUInt(writer, void_entry_size))
+    return 0;
+
+  // Use a 64-bit counter: |void_entry_size| is a uint64 and would be
+  // truncated by a narrower loop variable.
+  const uint8 value = 0;
+  for (uint64 i = 0; i < void_entry_size; ++i) {
+    if (writer->Write(&value, 1))
+      return 0;
+  }
+
+  // Confirm the writer advanced by exactly the expected number of bytes.
+  const int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(void_size))
+    return 0;
+
+  return void_size;
+}
+
+// Stores the muxer version (0.3.1.0) in |major|, |minor|, |build| and
+// |revision|. All four pointers must be non-NULL.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+  const int32 kMajor = 0;
+  const int32 kMinor = 3;
+  const int32 kBuild = 1;
+  const int32 kRevision = 0;
+
+  *major = kMajor;
+  *minor = kMinor;
+  *build = kBuild;
+  *revision = kRevision;
+}
+
+// Builds a 56-bit identifier from seven random bytes and returns it.
+// The random source is chosen at compile time:
+// - _MSC_VER: rand(); |seed| is unused.
+// - __ANDROID__: four bytes read from /dev/urandom on every iteration;
+// |seed| is unused.
+// - __MINGW32__: rand(), re-seeded from |*seed| at the start of each call.
+// - otherwise: rand_r(seed).
+uint64 MakeUID(unsigned int* seed) {
+ uint64 uid = 0;
+
+#ifdef __MINGW32__
+ srand(*seed);
+#endif
+
+ for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values
+ uid <<= 8;
+
+// TODO(fgalligan): Move random number generation to platform specific code.
+#ifdef _MSC_VER
+ (void)seed;
+ const int32 nn = rand();
+#elif __ANDROID__
+ (void)seed;
+ // Falls back to the constant 1 when /dev/urandom cannot be opened. The
+ // return value of read() is not checked.
+ int32 temp_num = 1;
+ int fd = open("/dev/urandom", O_RDONLY);
+ if (fd != -1) {
+ read(fd, &temp_num, sizeof(temp_num));
+ close(fd);
+ }
+ const int32 nn = temp_num;
+#elif defined __MINGW32__
+ const int32 nn = rand();
+#else
+ const int32 nn = rand_r(seed);
+#endif
+ // Keep bits 4..11 of the random number as the next byte of the UID.
+ const int32 n = 0xFF & (nn >> 4); // throw away low-order bits
+
+ uid |= n;
+ }
+
+ return uid;
+}
+
+// Returns true if |value| equals one of the matrix-coefficients constants of
+// mkvmuxer::Colour listed below, false for any other value.
+bool IsMatrixCoefficientsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kGbr:
+    case mkvmuxer::Colour::kBt709:
+    case mkvmuxer::Colour::kUnspecifiedMc:
+    case mkvmuxer::Colour::kReserved:
+    case mkvmuxer::Colour::kFcc:
+    case mkvmuxer::Colour::kBt470bg:
+    case mkvmuxer::Colour::kSmpte170MMc:
+    case mkvmuxer::Colour::kSmpte240MMc:
+    case mkvmuxer::Colour::kYcocg:
+    case mkvmuxer::Colour::kBt2020NonConstantLuminance:
+    case mkvmuxer::Colour::kBt2020ConstantLuminance:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |value| equals one of the horizontal chroma-siting
+// constants of mkvmuxer::Colour, false for any other value.
+bool IsChromaSitingHorzValueValid(uint64_t value) {
+  return value == mkvmuxer::Colour::kUnspecifiedCsh ||
+         value == mkvmuxer::Colour::kLeftCollocated ||
+         value == mkvmuxer::Colour::kHalfCsh;
+}
+
+// Returns true if |value| equals one of the vertical chroma-siting constants
+// of mkvmuxer::Colour, false for any other value.
+bool IsChromaSitingVertValueValid(uint64_t value) {
+  return value == mkvmuxer::Colour::kUnspecifiedCsv ||
+         value == mkvmuxer::Colour::kTopCollocated ||
+         value == mkvmuxer::Colour::kHalfCsv;
+}
+
+// Returns true if |value| equals one of the colour-range constants of
+// mkvmuxer::Colour, false for any other value.
+bool IsColourRangeValueValid(uint64_t value) {
+  return value == mkvmuxer::Colour::kUnspecifiedCr ||
+         value == mkvmuxer::Colour::kBroadcastRange ||
+         value == mkvmuxer::Colour::kFullRange ||
+         value == mkvmuxer::Colour::kMcTcDefined;
+}
+
+// Returns true if |value| equals one of the transfer-characteristics
+// constants of mkvmuxer::Colour listed below, false for any other value.
+bool IsTransferCharacteristicsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kIturBt709Tc:
+    case mkvmuxer::Colour::kUnspecifiedTc:
+    case mkvmuxer::Colour::kReservedTc:
+    case mkvmuxer::Colour::kGamma22Curve:
+    case mkvmuxer::Colour::kGamma28Curve:
+    case mkvmuxer::Colour::kSmpte170MTc:
+    case mkvmuxer::Colour::kSmpte240MTc:
+    case mkvmuxer::Colour::kLinear:
+    case mkvmuxer::Colour::kLog:
+    case mkvmuxer::Colour::kLogSqrt:
+    case mkvmuxer::Colour::kIec6196624:
+    case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut:
+    case mkvmuxer::Colour::kIec6196621:
+    case mkvmuxer::Colour::kIturBt202010bit:
+    case mkvmuxer::Colour::kIturBt202012bit:
+    case mkvmuxer::Colour::kSmpteSt2084:
+    case mkvmuxer::Colour::kSmpteSt4281Tc:
+    case mkvmuxer::Colour::kAribStdB67Hlg:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |value| equals one of the colour-primaries constants of
+// mkvmuxer::Colour listed below, false for any other value.
+bool IsPrimariesValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kReservedP0:
+    case mkvmuxer::Colour::kIturBt709P:
+    case mkvmuxer::Colour::kUnspecifiedP:
+    case mkvmuxer::Colour::kReservedP3:
+    case mkvmuxer::Colour::kIturBt470M:
+    case mkvmuxer::Colour::kIturBt470Bg:
+    case mkvmuxer::Colour::kSmpte170MP:
+    case mkvmuxer::Colour::kSmpte240MP:
+    case mkvmuxer::Colour::kFilm:
+    case mkvmuxer::Colour::kIturBt2020:
+    case mkvmuxer::Colour::kSmpteSt4281P:
+    case mkvmuxer::Colour::kJedecP22Phosphors:
+      return true;
+    default:
+      return false;
+  }
+}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
new file mode 100644
index 0000000000..3355428bd1
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVMUXER_MKVMUXERUTIL_H_
+#define MKVMUXER_MKVMUXERUTIL_H_
+
+#include "mkvmuxertypes.h"
+
+#include "stdint.h"
+
+namespace mkvmuxer {
+class Cluster;
+class Frame;
+class IMkvWriter;
+
+// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because
+// changing them causes pain for downstream projects. It would be nice to find
+// a solution that allows removal of the mkvmuxer:: integer types while
+// avoiding pain for downstream users of libwebm. Considering that
+// mkvmuxerutil.{cc,h} are really, for the great majority of cases, EBML size
+// calculation and writer functions, perhaps a more EBML-focused utility would
+// be the way to go as a first step.
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
+
+// Writes out |f| in Big Endian order. Returns 0 on success.
+int32 SerializeFloat(IMkvWriter* writer, float f);
+
+// Returns the size in bytes of the element.
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
+
+// Returns the size in bytes of the element assuming that the element was
+// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
+// computes the necessary number of bytes based on |value|.
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
+
+// Output an Mkv master element. Returns true if the element was written.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32 WriteID(IMkvWriter* writer, uint64 type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+ uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
+
+// Output an Mkv non-master element using fixed size. The element will be
+// written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero
+// then it computes the necessary number of bytes based on |value|. Returns true
+// if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+ uint64 fixed_size);
+
+// Output a Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+ Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64 MakeUID(unsigned int* seed);
+
+// Colour field validation helpers. All return true when |value| is valid.
+bool IsMatrixCoefficientsValueValid(uint64_t value);
+bool IsChromaSitingHorzValueValid(uint64_t value);
+bool IsChromaSitingVertValueValid(uint64_t value);
+bool IsColourRangeValueValid(uint64_t value);
+bool IsTransferCharacteristicsValueValid(uint64_t value);
+bool IsPrimariesValueValid(uint64_t value);
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVMUXERUTIL_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
new file mode 100644
index 0000000000..d668384d85
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvwriter.h"
+
+#include <sys/types.h>
+
+#ifdef _MSC_VER
+#include <share.h> // for _SH_DENYWR
+#endif
+
+namespace mkvmuxer {
+
+// Creates a writer with no file attached. Write() fails until Open()
+// succeeds. A file opened this way is closed by Close().
+MkvWriter::MkvWriter()
+    : file_(NULL),
+      writer_owns_file_(true) {
+}
+
+// Wraps the already opened stream |fp|. The writer does not own the stream:
+// Close() forgets it without calling fclose().
+MkvWriter::MkvWriter(FILE* fp)
+    : file_(fp),
+      writer_owns_file_(false) {
+}
+
+// Releases the file through Close().
+MkvWriter::~MkvWriter() {
+  Close();
+}
+
+// Writes |length| bytes from |buffer| to the file. Returns 0 on success
+// (including a zero-length write) and -1 if no file is attached, |buffer| is
+// NULL, or fewer than |length| bytes were written.
+int32 MkvWriter::Write(const void* buffer, uint32 length) {
+  if (file_ == NULL)
+    return -1;
+
+  // A zero-length write succeeds even with a NULL buffer.
+  if (length == 0)
+    return 0;
+
+  if (buffer == NULL)
+    return -1;
+
+  return (fwrite(buffer, 1, length, file_) == length) ? 0 : -1;
+}
+
+// Opens |filename| for binary writing. Fails if |filename| is NULL, if a file
+// is already attached to this writer, or if the file cannot be opened.
+// Returns true on success.
+bool MkvWriter::Open(const char* filename) {
+  if (filename == NULL || file_ != NULL)
+    return false;
+
+#ifdef _MSC_VER
+  file_ = _fsopen(filename, "wb", _SH_DENYWR);
+#else
+  file_ = fopen(filename, "wb");
+#endif
+
+  return file_ != NULL;
+}
+
+// Detaches the file from the writer, calling fclose() on it only when the
+// writer owns it.
+void MkvWriter::Close() {
+  if (file_ != NULL && writer_owns_file_)
+    fclose(file_);
+
+  file_ = NULL;
+}
+
+// Returns the current offset in the file, 0 if no file is attached, or the
+// negative value reported by the underlying tell function on failure.
+//
+// The tell function mirrors the seek function used by Position(int64):
+// _ftelli64/_fseeki64, ftello64/fseeko64 and ftello/fseeko. Plain ftell()
+// returns a long, which cannot represent offsets beyond LONG_MAX even where
+// the matching seek function accepts them.
+int64 MkvWriter::Position() const {
+  if (!file_)
+    return 0;
+
+#ifdef _MSC_VER
+  return _ftelli64(file_);
+#elif defined(_WIN32)
+  return ftello64(file_);
+#else
+  return ftello(file_);
+#endif
+}
+
+// Moves the file position to |position|, measured from the start of the
+// file. Returns -1 if no file is attached, otherwise the result of the
+// platform seek function.
+int32 MkvWriter::Position(int64 position) {
+  if (file_ == NULL)
+    return -1;
+
+#if defined(_MSC_VER)
+  const int seek_result = _fseeki64(file_, position, SEEK_SET);
+#elif defined(_WIN32)
+  const int seek_result =
+      fseeko64(file_, static_cast<off_t>(position), SEEK_SET);
+#else
+  const int seek_result = fseeko(file_, static_cast<off_t>(position), SEEK_SET);
+#endif
+
+  return seek_result;
+}
+
+// This writer always reports itself as seekable.
+bool MkvWriter::Seekable() const {
+  return true;
+}
+
+// Element start notifications are ignored by this writer.
+void MkvWriter::ElementStartNotify(uint64 /* element_id */,
+                                   int64 /* position */) {
+}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h
new file mode 100644
index 0000000000..4227c63748
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVWRITER_H_
+#define MKVMUXER_MKVWRITER_H_
+
+#include <stdio.h>
+
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+namespace mkvmuxer {
+
+// Default implementation of the IMkvWriter interface, backed by a stdio
+// FILE*. The file is either opened by the writer itself through Open() or
+// supplied by the caller through the FILE* constructor.
+class MkvWriter : public IMkvWriter {
+ public:
+ // Creates a writer with no file attached; use Open() to attach one.
+ MkvWriter();
+ // Wraps an already opened |fp|.
+ explicit MkvWriter(FILE* fp);
+ virtual ~MkvWriter();
+
+ // IMkvWriter interface
+ virtual int64 Position() const;
+ virtual int32 Position(int64 position);
+ virtual bool Seekable() const;
+ virtual int32 Write(const void* buffer, uint32 length);
+ virtual void ElementStartNotify(uint64 element_id, int64 position);
+
+ // Creates and opens a file for writing. |filename| is the name of the file
+ // to open. This function will overwrite the contents of |filename|. Returns
+ // true on success.
+ bool Open(const char* filename);
+
+ // Closes an opened file.
+ void Close();
+
+ private:
+ // File handle to output file.
+ FILE* file_;
+ // True when the writer is responsible for closing |file_|.
+ bool writer_owns_file_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
+};
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVWRITER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
new file mode 100644
index 0000000000..868afcb3ed
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -0,0 +1,8100 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "mkvparser/mkvparser.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h> // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "common/webmids.h"
+
+namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
+
+// Wrappers that let the rest of this file call mkvparser::isnan() and
+// mkvparser::isinf() on every toolchain. When MSC_COMPAT is defined (MSVC
+// older than _MSC_VER 1800) they use _isnan()/_finite() from <float.h>;
+// otherwise they forward to the <cmath> functions.
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif // MSC_COMPAT
+
+// Allocates an array of |Type| whose length is |num_elements| *
+// |element_size|. Returns NULL if either argument is 0, if the product
+// exceeds 2 GiB or does not fit in size_t, or if the allocation fails.
+// The caller releases the result with delete[].
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size) {
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+
+  if (num_elements == 0 || element_size == 0)
+    return NULL;
+
+  // Dividing instead of multiplying detects oversized requests without
+  // relying on a product that may have wrapped around.
+  if (element_size > (kMaxAllocSize / num_elements))
+    return NULL;
+
+  const unsigned long long requested = num_elements * element_size;
+  const size_t array_length = static_cast<size_t>(requested);
+  if (requested != array_length)
+    return NULL;
+
+  return new (std::nothrow) Type[array_length];
+}
+
+// Stores the parser version (1.1.1.0) in |major|, |minor|, |build| and
+// |revision|.
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+  const int kMajor = 1;
+  const int kMinor = 1;
+  const int kBuild = 1;
+  const int kRevision = 0;
+
+  major = kMajor;
+  minor = kMinor;
+  build = kBuild;
+  revision = kRevision;
+}
+
+// Reads the EBML coded unsigned integer that starts at |pos|.
+//
+// On success returns the decoded value (length marker bit removed) and sets
+// |len| to the number of bytes the coded integer occupies. On failure returns
+// a negative value and leaves |len| at 1: E_FILE_FORMAT_INVALID for bad
+// arguments or a first byte of 0, E_BUFFER_NOT_FULL when the reader reports
+// an underflow, or the reader's own negative status.
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+  unsigned char octet;
+  int status = pReader->Read(pos, 1, &octet);
+
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (octet == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  // The position of the first set bit in the first byte gives the total
+  // length; |octet| is non-zero, so this scan terminates.
+  unsigned char marker = 0x80;
+  for (; (octet & marker) == 0; marker >>= 1)
+    ++len;
+
+  long long value = octet & (~marker);
+
+  for (long remaining = len - 1; remaining > 0; --remaining) {
+    ++pos;
+    status = pReader->Read(pos, 1, &octet);
+
+    if (status < 0) {
+      len = 1;
+      return status;
+    }
+
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    value = (value << 8) | octet;
+  }
+
+  return value;
+}
+
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4 bytes, and its value
+// must be greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID (length marker bits included) and stores its length in
+// |len|, or returns a value less than 0 to report an error while reading the
+// ID; |len| is left untouched in that case.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+  if (pReader == NULL || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  const int kMaxIdLengthInBytes = 4;
+  const int kCheckByte = 0x80;
+
+  // Read the first byte. The length in bytes of the ID is determined by
+  // finding the first set bit in the first byte of the ID.
+  unsigned char current_byte = 0;
+  int read_status = pReader->Read(pos, 1, &current_byte);
+
+  if (read_status < 0)
+    return E_FILE_FORMAT_INVALID;
+  if (read_status > 0)  // No data to read.
+    return E_BUFFER_NOT_FULL;
+
+  if (current_byte == 0)  // ID length > 8 bytes; invalid file.
+    return E_FILE_FORMAT_INVALID;
+
+  // Only the top four bits are examined; an ID whose marker bit lies lower
+  // would be longer than the maximum ID length.
+  int id_length = 0;
+  for (int candidate = 1; candidate <= kMaxIdLengthInBytes; ++candidate) {
+    if ((kCheckByte >> (candidate - 1)) & current_byte) {
+      id_length = candidate;
+      break;
+    }
+  }
+
+  if (id_length == 0) {
+    // The value is too large to be a valid ID.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Read the remaining bytes of the ID (if any).
+  long long ebml_id = current_byte;
+  for (int offset = 1; offset < id_length; ++offset) {
+    read_status = pReader->Read(pos + offset, 1, &current_byte);
+
+    if (read_status < 0)
+      return E_FILE_FORMAT_INVALID;
+    if (read_status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    ebml_id = (ebml_id << 8) | current_byte;
+  }
+
+  len = id_length;
+  return ebml_id;
+}
+
+// Determines how many bytes the EBML coded integer at |pos| occupies and
+// stores that count in |len|.
+//
+// Returns 0 on success. Returns E_FILE_FORMAT_INVALID for bad arguments, an
+// inconsistent reader length, or a first byte of 0. Returns |pos| itself when
+// |pos| is at or beyond the available data, and the reader's status when the
+// read does not succeed; |len| is 1 in those cases.
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  long long total, available;
+  int status = pReader->Length(&total, &available);
+
+  const bool length_inconsistent = (total >= 0 && available > total);
+  if (status < 0 || length_inconsistent)
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+
+  if (pos >= available)
+    return pos;  // too few bytes available
+
+  unsigned char first_byte;
+  status = pReader->Read(pos, 1, &first_byte);
+
+  if (status != 0)
+    return status;
+
+  if (first_byte == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  // Each leading zero bit in the first byte adds one byte to the length.
+  for (unsigned char marker = 0x80; (first_byte & marker) == 0; marker >>= 1)
+    ++len;
+
+  return 0;  // success
+}
+
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
+//
+// Reads |size| bytes (1 to 8) starting at |pos| and returns them combined as
+// a big-endian integer. Returns E_FILE_FORMAT_INVALID for bad arguments, or
+// the reader's negative status if a read fails.
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  if (!pReader || pos < 0 || (size <= 0) || (size > 8))
+    return E_FILE_FORMAT_INVALID;
+
+  long long value = 0;
+
+  for (long long remaining = size; remaining > 0; --remaining, ++pos) {
+    unsigned char octet;
+
+    const long status = pReader->Read(pos, 1, &octet);
+
+    if (status < 0)
+      return status;
+
+    value = (value << 8) | octet;
+  }
+
+  return value;
+}
+
+// Reads a 4- or 8-byte big-endian floating point payload at |pos| and stores
+// it in |result| as a double. Returns 0 on success, E_FILE_FORMAT_INVALID for
+// bad arguments, a |size_| other than 4 or 8, or a value that is infinite or
+// NaN, and the reader's negative status if the read fails.
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+ double& result) {
+ if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+ return E_FILE_FORMAT_INVALID;
+
+ const long size = static_cast<long>(size_);
+
+ unsigned char buf[8];
+
+ const int status = pReader->Read(pos, size, buf);
+
+ // Only negative statuses are treated as errors here.
+ if (status < 0) // error
+ return status;
+
+ if (size == 4) {
+ // The bytes are assembled into |ff| and read back through |f|.
+ // NOTE(review): |ff| is an unsigned long, which can be wider than float;
+ // confirm this overlay is correct on big-endian targets with 64-bit long.
+ union {
+ float f;
+ unsigned long ff;
+ };
+
+ ff = 0;
+
+ // Shift in the bytes most significant first; the shift is skipped after
+ // the last byte.
+ for (int i = 0;;) {
+ ff |= buf[i];
+
+ if (++i >= 4)
+ break;
+
+ ff <<= 8;
+ }
+
+ result = f;
+ } else {
+ // Same procedure for the 8-byte case, using a double overlay.
+ union {
+ double d;
+ unsigned long long dd;
+ };
+
+ dd = 0;
+
+ for (int i = 0;;) {
+ dd |= buf[i];
+
+ if (++i >= 8)
+ break;
+
+ dd <<= 8;
+ }
+
+ result = d;
+ }
+
+ // Non-finite values are rejected; |result| has already been written.
+ if (mkvparser::isinf(result) || mkvparser::isnan(result))
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
+// Reads |size| bytes (1 to 8) starting at |pos| as a big-endian signed
+// integer and stores the sign-extended value in |result_ref|. Returns 0 on
+// success, E_FILE_FORMAT_INVALID for bad arguments, or the reader's negative
+// status if a read fails (|result_ref| is left untouched on failure).
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  // Read the leading byte as signed so that the conversion below
+  // sign-extends negative values to the full 64 bits.
+  signed char leading_byte = 0;
+  const long first_status =
+      pReader->Read(pos, 1, reinterpret_cast<unsigned char*>(&leading_byte));
+
+  if (first_status < 0)
+    return first_status;
+
+  unsigned long long accumulator =
+      static_cast<unsigned long long>(leading_byte);
+
+  for (long long remaining = size - 1; remaining > 0; --remaining) {
+    ++pos;
+
+    unsigned char octet;
+    const long next_status = pReader->Read(pos, 1, &octet);
+
+    if (next_status < 0)
+      return next_status;
+
+    accumulator = (accumulator << 8) | octet;
+  }
+
+  result_ref = static_cast<long long>(accumulator);
+  return 0;
+}
+
+// Reads |size| bytes at |pos| into a newly allocated, NUL-terminated buffer
+// and stores it in |str|. Any buffer previously held by |str| is deleted
+// first. Returns 0 on success. On failure |str| is NULL and the result is
+// E_FILE_FORMAT_INVALID (|size| negative, too large, or allocation failure)
+// or the reader's non-zero status.
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
+  delete[] str;
+  str = NULL;
+
+  if (size < 0 || size >= LONG_MAX || size > kStringElementSizeLimit)
+    return E_FILE_FORMAT_INVALID;
+
+  const long payload_length = static_cast<long>(size);
+  // +1 for '\0' terminator
+  const long buffer_length = payload_length + 1;
+
+  str = SafeArrayAlloc<char>(1, buffer_length);
+  if (str == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  const long status = pReader->Read(
+      pos, payload_length, reinterpret_cast<unsigned char*>(str));
+
+  if (status != 0) {
+    delete[] str;
+    str = NULL;
+    return status;
+  }
+
+  str[payload_length] = '\0';
+  return 0;
+}
+
+// Parses an element header (ID followed by coded payload size) at |pos|.
+//
+// On success returns 0, stores the ID in |id| and the payload size in |size|,
+// and advances |pos| to the first payload byte. |stop| is the end of the
+// enclosing region; a negative |stop| disables the bounds checks. Any failure
+// returns E_FILE_FORMAT_INVALID; |pos| may already have been advanced past
+// the ID in that case.
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+                        long long& id, long long& size) {
+  const bool bounded = (stop >= 0);
+
+  if (bounded && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  long field_length = 0;
+
+  id = ReadID(pReader, pos, field_length);
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += field_length;  // consume id
+
+  if (bounded && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  size = ReadUInt(pReader, pos, field_length);
+
+  // Invalid: Negative payload size, negative or 0 length integer, or integer
+  // larger than 64 bits (libwebm cannot handle them).
+  if (size < 0 || field_length < 1 || field_length > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long payload_start =
+      static_cast<unsigned long long>(pos) + field_length;
+  if (payload_start > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += field_length;  // consume length of size
+
+  // pos now designates payload
+  if (bounded && pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+// Tries to read an unsigned integer element with ID |expected_id| at |pos|.
+// On success stores the value in |val|, advances |pos| past the element and
+// returns true. Returns false on any mismatch or read problem; |pos| and
+// |val| may have been modified in that case.
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  const long status = pReader->Length(&total, &available);
+  if (status < 0)
+    return false;
+  if (total >= 0 && available > total)
+    return false;
+
+  long field_length = 0;
+
+  const long long id = ReadID(pReader, pos, field_length);
+  if (id < 0)
+    return false;
+  if ((available - pos) > field_length)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += field_length;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, field_length);
+  if (size < 0 || size > 8)
+    return false;
+  if (field_length < 1 || field_length > 8)
+    return false;
+  if ((available - pos) > field_length)
+    return false;
+
+  pos += field_length;  // consume length of size of payload
+
+  val = UnserializeUInt(pReader, pos, size);
+  if (val < 0)
+    return false;
+
+  pos += size;  // consume size of payload
+
+  return true;
+}
+
+// Tries to read a binary element with ID |expected_id| at |pos|.
+//
+// On success allocates a buffer for the payload, stores it in |buf| and its
+// length in |buflen|, advances |pos| past the element and returns true. The
+// caller owns the buffer and releases it with delete[].
+//
+// Returns false on any mismatch, bounds problem, allocation failure or read
+// failure. |pos| may have been advanced in that case. If the payload read
+// fails, the buffer is freed and |buf| is set to NULL so that nothing is
+// leaked or handed back partially filled.
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  // Guard both additions below against rolling over LLONG_MAX.
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
+
+  // SafeArrayAlloc returns NULL for a zero-length payload as well as on
+  // allocation failure.
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
+
+  status = pReader->Read(pos, buflen_, buf);
+  if (status != 0) {
+    // Release the allocation instead of leaving it in the out-parameter of a
+    // failed call.
+    delete[] buf;
+    buf = NULL;
+    return false;
+  }
+
+  buflen = buflen_;
+
+  pos += size;  // consume size of payload
+  return true;
+}
+
+// Starts with no DocType string and the default field values set by Init().
+EBMLHeader::EBMLHeader()
+    : m_docType(NULL) {
+  Init();
+}
+
+// Frees the DocType string, if any.
+EBMLHeader::~EBMLHeader() {
+  delete[] m_docType;
+}
+
+// Resets every header field to its default value and frees the DocType
+// string.
+void EBMLHeader::Init() {
+  // delete[] on a NULL pointer is a no-op, so no guard is needed.
+  delete[] m_docType;
+  m_docType = NULL;
+
+  m_version = 1;
+  m_readVersion = 1;
+  m_maxIdLength = 4;
+  m_maxSizeLength = 8;
+  m_docTypeVersion = 1;
+  m_docTypeReadVersion = 1;
+}
+
+// Locates and parses the EBML header, filling in the members of this object.
+//
+// |pos| is an output: it is reset to 0 and, on success, left just past the
+// end of the header payload.
+//
+// Returns 0 on success. Returns a negative value on error
+// (E_FILE_FORMAT_INVALID, E_BUFFER_NOT_FULL, or a status from the reader).
+// Returns a positive value when the header is not fully available yet; that
+// value is the position up to which data is needed before trying again.
+long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
+ if (!pReader)
+ return E_FILE_FORMAT_INVALID;
+
+ long long total, available;
+
+ long status = pReader->Length(&total, &available);
+
+ if (status < 0) // error
+ return status;
+
+ pos = 0;
+
+ // Scan until we find what looks like the first byte of the EBML header.
+ // At most the first 1024 available bytes are examined.
+ const long long kMaxScanBytes = (available >= 1024) ? 1024 : available;
+ const unsigned char kEbmlByte0 = 0x1A;
+ unsigned char scan_byte = 0;
+
+ while (pos < kMaxScanBytes) {
+ status = pReader->Read(pos, 1, &scan_byte);
+
+ if (status < 0) // error
+ return status;
+ else if (status > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if (scan_byte == kEbmlByte0)
+ break;
+
+ ++pos;
+ }
+
+ // If the scan found nothing, |pos| equals kMaxScanBytes here and the ID
+ // check below decides the outcome.
+ long len = 0;
+ const long long ebml_id = ReadID(pReader, pos, len);
+
+ if (ebml_id == E_BUFFER_NOT_FULL)
+ return E_BUFFER_NOT_FULL;
+
+ if (len != 4 || ebml_id != libwebm::kMkvEBML)
+ return E_FILE_FORMAT_INVALID;
+
+ // Move read pos forward to the EBML header size field.
+ pos += 4;
+
+ // Read length of size field.
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return E_FILE_FORMAT_INVALID;
+ else if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if (len < 1 || len > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ // A negative |total| skips the checks against the total length.
+ if ((total >= 0) && ((total - pos) < len))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((available - pos) < len)
+ return pos + len; // try again later
+
+ // Read the EBML header size.
+ result = ReadUInt(pReader, pos, len);
+
+ if (result < 0) // error
+ return result;
+
+ pos += len; // consume size field
+
+ // pos now designates start of payload
+
+ if ((total >= 0) && ((total - pos) < result))
+ return E_FILE_FORMAT_INVALID;
+
+ // Not enough data yet: report the position where the payload ends.
+ if ((available - pos) < result)
+ return pos + result;
+
+ const long long end = pos + result;
+
+ // Reset all fields to their defaults before reading the child elements.
+ Init();
+
+ while (pos < end) {
+ long long id, size;
+
+ status = ParseElementHeader(pReader, pos, end, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Child elements with an ID not listed below are skipped.
+ if (id == libwebm::kMkvEBMLVersion) {
+ m_version = UnserializeUInt(pReader, pos, size);
+
+ if (m_version <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLReadVersion) {
+ m_readVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_readVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLMaxIDLength) {
+ m_maxIdLength = UnserializeUInt(pReader, pos, size);
+
+ if (m_maxIdLength <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLMaxSizeLength) {
+ m_maxSizeLength = UnserializeUInt(pReader, pos, size);
+
+ if (m_maxSizeLength <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDocType) {
+ // A second DocType element is treated as an error.
+ if (m_docType)
+ return E_FILE_FORMAT_INVALID;
+
+ status = UnserializeString(pReader, pos, size, m_docType);
+
+ if (status) // error
+ return status;
+ } else if (id == libwebm::kMkvDocTypeVersion) {
+ m_docTypeVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDocTypeReadVersion) {
+ m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_docTypeReadVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos += size;
+ }
+
+ // The children must end exactly at the end of the header payload.
+ if (pos != end)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+ if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+ if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 ||
+ m_maxSizeLength > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+ /*
+  * Records the reader and the segment's location without reading anything.
+  * elem_start becomes m_element_start, start/size become m_start/m_size,
+  * and the parse cursor m_pos begins at start. Other methods in this file
+  * treat a negative m_size as "segment end unknown". Every child-element
+  * pointer and the cluster table start out empty.
+  */
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+ // long long elem_size,
+ long long start, long long size)
+ : m_pReader(pReader),
+ m_element_start(elem_start),
+ // m_element_size(elem_size),
+ m_start(start),
+ m_size(size),
+ m_pos(start),
+ m_pUnknownSize(0),
+ m_pSeekHead(NULL),
+ m_pInfo(NULL),
+ m_pTracks(NULL),
+ m_pCues(NULL),
+ m_pChapters(NULL),
+ m_pTags(NULL),
+ m_clusters(NULL),
+ m_clusterCount(0),
+ m_clusterPreloadCount(0),
+ m_clusterSize(0) {}
+
+// Destroys every cluster held in the table (loaded and preloaded), the table
+// itself, and all level-1 child objects owned by the segment.
+Segment::~Segment() {
+  Cluster** const first = m_clusters;
+  Cluster** const last = first + (m_clusterCount + m_clusterPreloadCount);
+
+  for (Cluster** it = first; it != last; ++it)
+    delete *it;
+
+  delete[] m_clusters;
+
+  // Child elements; deleting a NULL pointer is a no-op.
+  delete m_pTracks;
+  delete m_pInfo;
+  delete m_pCues;
+  delete m_pChapters;
+  delete m_pTags;
+  delete m_pSeekHead;
+}
+ /*
+  * Walks top-level EBML elements starting at pos until it reaches a Segment
+  * element, then allocates a Segment object for it and stores it in
+  * pSegment (which is reset to NULL first).
+  *
+  * Returns 0 on success. Returns a negative value on failure:
+  * E_PARSE_FAILED for bad arguments or allocation failure,
+  * E_FILE_FORMAT_INVALID for malformed data, -1 for inconsistent reader
+  * lengths, or whatever error the reader/helpers reported. Returns a
+  * positive file offset when that many bytes must be available before the
+  * scan can continue.
+  */
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+ Segment*& pSegment) {
+ if (pReader == NULL || pos < 0)
+ return E_PARSE_FAILED;
+
+ pSegment = NULL;
+
+ long long total, available;
+
+ const long status = pReader->Length(&total, &available);
+
+ if (status < 0) // error
+ return status;
+
+ if (available < 0)
+ return -1;
+
+ if ((total >= 0) && (available > total))
+ return -1;
+
+ // I would assume that in practice this loop would execute
+ // exactly once, but we allow for other elements (e.g. Void)
+ // to immediately follow the EBML header. This is fine for
+ // the source filter case (since the entire file is available),
+ // but in the splitter case over a network we should probably
+ // just give up early. We could for example decide only to
+ // execute this loop a maximum of, say, 10 times.
+ // TODO:
+ // There is an implied "give up early" by only parsing up
+ // to the available limit. We do do that, but only if the
+ // total file size is unknown. We could decide to always
+ // use what's available as our limit (irrespective of whether
+ // we happen to know the total file length). This would have
+ // as its sense "parse this much of the file before giving up",
+ // which a slightly different sense from "try to parse up to
+ // 10 EMBL elements before giving up".
+
+ for (;;) {
+ if ((total >= 0) && (pos >= total))
+ return E_FILE_FORMAT_INVALID;
+
+ // Read ID
+ long len;
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result) // error, or too few available bytes
+ return result;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ const long long idpos = pos;
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result) // error, or too few available bytes
+ return result;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return size;
+
+ pos += len; // consume length of size of element
+
+ // Pos now points to start of payload
+
+ // Handle "unknown size" for live streaming of webm files.
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+ // A Segment whose end is unknown, or lies past the known file end, is recorded with size -1.
+ if (id == libwebm::kMkvSegment) {
+ if (size == unknown_size)
+ size = -1;
+
+ else if (total < 0)
+ size = -1;
+
+ else if ((pos + size) > total)
+ size = -1;
+
+ pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+ if (pSegment == NULL)
+ return E_PARSE_FAILED;
+
+ return 0; // success
+ }
+ // Any other top-level element must have a known size so that it can be skipped.
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + size) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + size) > available)
+ return pos + size;
+
+ pos += size; // consume payload
+ }
+}
+
+// Parses the level-1 elements of this segment that precede the first
+// Cluster, starting at m_pos and advancing m_pos past each element consumed.
+// Info, Tracks, SeekHead, Chapters and Tags are parsed immediately; Cues is
+// only instantiated. Info and Tracks may appear at most once; for the other
+// kinds only the first occurrence is kept.
+//
+// Returns 0 once both Info and Tracks have been seen and the scan stopped
+// (at a Cluster, at the segment end, or at the end of the file). Returns a
+// negative value on error. Returns a positive file offset when that many
+// bytes must be available before parsing can continue; m_pos is left at the
+// start of the incomplete element so the call can be repeated.
+long long Segment::ParseHeaders() {
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload. We need to find the
+  // inner (level 1) elements.
+  long long total, available;
+
+  const int status = m_pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  // A negative total is treated as "length unknown" throughout this file,
+  // so every non-negative total (including 0) bounds the available count.
+  if (total >= 0 && available > total)
+    return E_FILE_FORMAT_INVALID;
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+      (segment_stop >= 0 && m_pos > segment_stop)) {
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      break;
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      break;
+
+    long long pos = m_pos;
+    const long long element_start = pos;
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    unsigned long long rollover_check = pos + 1ULL;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read ID
+    long len;
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // The first Cluster ends the header section; m_pos stays on its ID.
+    if (id == libwebm::kMkvCluster)
+      break;
+
+    pos += len;  // consume ID
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error reported by ReadUInt
+      return size;
+
+    // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+    // len > 8 is true instead of checking this _everywhere_.
+    // Report a bad length explicitly: returning the non-negative size here
+    // would look like success (0) or an underflow offset (> 0) to callers.
+    if (len < 1 || len > 8)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume length of size of element
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    // Size of the whole element: ID + size field + payload.
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    if (id == libwebm::kMkvInfo) {
+      if (m_pInfo)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pInfo = new (std::nothrow)
+          SegmentInfo(this, pos, size, element_start, element_size);
+
+      if (m_pInfo == NULL)
+        return -1;
+
+      const long status = m_pInfo->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvTracks) {
+      if (m_pTracks)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pTracks = new (std::nothrow)
+          Tracks(this, pos, size, element_start, element_size);
+
+      if (m_pTracks == NULL)
+        return -1;
+
+      const long status = m_pTracks->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvCues) {
+      // Cues are only instantiated here, not parsed.
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+
+        if (m_pCues == NULL)
+          return -1;
+      }
+    } else if (id == libwebm::kMkvSeekHead) {
+      if (m_pSeekHead == NULL) {
+        m_pSeekHead = new (std::nothrow)
+            SeekHead(this, pos, size, element_start, element_size);
+
+        if (m_pSeekHead == NULL)
+          return -1;
+
+        const long status = m_pSeekHead->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvChapters) {
+      if (m_pChapters == NULL) {
+        m_pChapters = new (std::nothrow)
+            Chapters(this, pos, size, element_start, element_size);
+
+        if (m_pChapters == NULL)
+          return -1;
+
+        const long status = m_pChapters->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvTags) {
+      if (m_pTags == NULL) {
+        m_pTags = new (std::nothrow)
+            Tags(this, pos, size, element_start, element_size);
+
+        if (m_pTags == NULL)
+          return -1;
+
+        const long status = m_pTags->Parse();
+
+        if (status)
+          return status;
+      }
+    }
+
+    m_pos = pos + size;  // consume payload
+  }
+
+  if (segment_stop >= 0 && m_pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pInfo == NULL)  // TODO: liberalize this behavior
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+// Calls DoLoadCluster repeatedly until it reports something other than
+// "try again" (a result greater than 1), and returns that result.
+long Segment::LoadCluster(long long& pos, long& len) {
+  long outcome;
+
+  do {
+    outcome = DoLoadCluster(pos, len);
+  } while (outcome > 1);
+
+  return outcome;
+}
+ /*
+  * Advances the segment cursor m_pos to the next Cluster element and
+  * registers that cluster in the cluster table. A negative m_pos means an
+  * unknown-size cluster is still being parsed, and the work is delegated
+  * to DoLoadClusterUnknownSize.
+  *
+  * Return values produced here:
+  *   0  a cluster was appended, or a preloaded cluster was promoted
+  *   1  no more clusters before the end of the file or segment
+  *   2  a cluster without block entries was skipped; call again
+  *  <0  an error, or E_BUFFER_NOT_FULL when more data is needed
+  *
+  * pos and len are in/out parameters holding the position and length of
+  * the read that was in progress when the function returned.
+  */
+long Segment::DoLoadCluster(long long& pos, long& len) {
+ if (m_pos < 0)
+ return DoLoadClusterUnknownSize(pos, len);
+
+ long long total, avail;
+
+ long status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ long long cluster_off = -1; // offset relative to start of segment
+ long long cluster_size = -1; // size of cluster payload
+ // Skip level-1 elements (instantiating Cues on the way) until a Cluster is found.
+ for (;;) {
+ if ((total >= 0) && (m_pos >= total))
+ return 1; // no more clusters
+
+ if ((segment_stop >= 0) && (m_pos >= segment_stop))
+ return 1; // no more clusters
+
+ pos = m_pos;
+
+ // Read ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long idpos = pos;
+ const long long id = ReadID(m_pReader, idpos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume length of size of element
+
+ // pos now points to start of payload
+
+ if (size == 0) {
+ // Missing element payload: move on.
+ m_pos = pos;
+ continue;
+ }
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if ((segment_stop >= 0) && (size != unknown_size) &&
+ ((pos + size) > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (id == libwebm::kMkvCues) {
+ if (size == unknown_size) {
+ // Cues element of unknown size: Not supported.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (m_pCues == NULL) {
+ const long long element_size = (pos - idpos) + size;
+
+ m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+ if (m_pCues == NULL)
+ return -1;
+ }
+
+ m_pos = pos + size; // consume payload
+ continue;
+ }
+
+ if (id != libwebm::kMkvCluster) {
+ // Besides the Segment, Libwebm allows only cluster elements of unknown
+ // size. Fail the parse upon encountering a non-cluster element reporting
+ // unknown size.
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pos = pos + size; // consume payload
+ continue;
+ }
+
+ // We have a cluster.
+
+ cluster_off = idpos - m_start; // relative pos
+
+ if (size != unknown_size)
+ cluster_size = size;
+
+ break;
+ }
+
+ if (cluster_off < 0) {
+ // No cluster, die.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ long long pos_;
+ long len_;
+
+ status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+ if (status < 0) { // error, or underflow
+ pos = pos_;
+ len = len_;
+
+ return status;
+ }
+
+ // status == 0 means "no block entries found"
+ // status > 0 means "found at least one block entry"
+
+ // TODO:
+ // The issue here is that the segment increments its own
+ // pos ptr past the most recent cluster parsed, and then
+ // starts from there to parse the next cluster. If we
+ // don't know the size of the current cluster, then we
+ // must either parse its payload (as we do below), looking
+ // for the cluster (or cues) ID to terminate the parse.
+ // This isn't really what we want: rather, we really need
+ // a way to create the curr cluster object immediately.
+ // The pity is that cluster::parse can determine its own
+ // boundary, and we largely duplicate that same logic here.
+ //
+ // Maybe we need to get rid of our look-ahead preloading
+ // in source::parse???
+ //
+ // As we're parsing the blocks in the curr cluster
+ //(in cluster::parse), we should have some way to signal
+ // to the segment that we have determined the boundary,
+ // so it can adjust its own segment::m_pos member.
+ //
+ // The problem is that we're asserting in asyncreadinit,
+ // because we adjust the pos down to the curr seek pos,
+ // and the resulting adjusted len is > 2GB. I'm suspicious
+ // that this is even correct, but even if it is, we can't
+ // be loading that much data in the cache anyway.
+ // idx is the table slot that the next loaded cluster will occupy.
+ const long idx = m_clusterCount;
+
+ if (m_clusterPreloadCount > 0) {
+ if (idx >= m_clusterSize)
+ return E_FILE_FORMAT_INVALID;
+
+ Cluster* const pCluster = m_clusters[idx];
+ if (pCluster == NULL || pCluster->m_index >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long off = pCluster->GetPosition();
+ if (off < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (off == cluster_off) { // preloaded already
+ if (status == 0) // no entries found
+ return E_FILE_FORMAT_INVALID;
+
+ if (cluster_size >= 0)
+ pos += cluster_size;
+ else {
+ const long long element_size = pCluster->GetElementSize();
+
+ if (element_size <= 0)
+ return E_FILE_FORMAT_INVALID; // TODO: handle this case
+
+ pos = pCluster->m_element_start + element_size;
+ }
+
+ pCluster->m_index = idx; // move from preloaded to loaded
+ ++m_clusterCount;
+ --m_clusterPreloadCount;
+
+ m_pos = pos; // consume payload
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0; // success
+ }
+ }
+ // Not preloaded at this offset: treat it as a newly discovered cluster.
+ if (status == 0) { // no entries found
+ if (cluster_size >= 0)
+ pos += cluster_size;
+
+ if ((total >= 0) && (pos >= total)) {
+ m_pos = total;
+ return 1; // no more clusters
+ }
+
+ if ((segment_stop >= 0) && (pos >= segment_stop)) {
+ m_pos = segment_stop;
+ return 1; // no more clusters
+ }
+
+ m_pos = pos;
+ return 2; // try again
+ }
+
+ // status > 0 means we have an entry
+
+ Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
+ if (pCluster == NULL)
+ return -1;
+
+ if (!AppendCluster(pCluster)) {
+ delete pCluster;
+ return -1;
+ }
+
+ if (cluster_size >= 0) {
+ pos += cluster_size;
+
+ m_pos = pos;
+
+ if (segment_stop > 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+ }
+ // Unknown-size cluster: remember it and store the cursor negated so the next call resumes in DoLoadClusterUnknownSize.
+ m_pUnknownSize = pCluster;
+ m_pos = -pos;
+
+ return 0; // partial success, since we have a new cluster
+
+ // status == 0 means "no block entries found"
+ // pos designates start of payload
+ // m_pos has NOT been adjusted yet (in case we need to come back here)
+}
+
+// Continues parsing the unknown-size cluster recorded in m_pUnknownSize.
+// Requires m_pos to be negative and m_pUnknownSize to be set; otherwise
+// E_PARSE_FAILED is returned. A negative result from Cluster::Parse is
+// passed through. In every other case 2 is returned so that the caller keeps
+// going; once Parse reports a positive status the cursor is moved to the end
+// of the cluster and the pending-cluster marker is cleared.
+long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
+  if (m_pUnknownSize == NULL || m_pos >= 0)
+    return E_PARSE_FAILED;
+
+  const long parse_status = m_pUnknownSize->Parse(pos, len);
+
+  if (parse_status < 0)  // error or underflow
+    return parse_status;
+
+  if (parse_status > 0) {
+    // The cluster's extent is now known: step past the whole element.
+    const long long cluster_size = m_pUnknownSize->GetElementSize();
+
+    if (cluster_size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos = m_pUnknownSize->m_element_start + cluster_size;
+    m_pos = pos;
+
+    m_pUnknownSize = 0;
+  }
+
+  return 2;  // continue parsing
+}
+
+// Stores an already-indexed cluster at the end of the loaded range of
+// m_clusters. The table is grown when full (2048 slots initially, doubling
+// afterwards) and any preloaded clusters are shifted up one slot to make
+// room. Returns false on inconsistent state or allocation failure.
+bool Segment::AppendCluster(Cluster* pCluster) {
+  if (pCluster == NULL || pCluster->m_index < 0)
+    return false;
+
+  const long in_use = m_clusterCount + m_clusterPreloadCount;
+  const long slot = pCluster->m_index;
+
+  // The new cluster must land exactly at the end of the loaded range.
+  if (m_clusterSize < in_use || slot != m_clusterCount)
+    return false;
+
+  if (in_use >= m_clusterSize) {
+    const long grown_size = (m_clusterSize <= 0) ? 2048 : 2 * m_clusterSize;
+
+    Cluster** const grown = new (std::nothrow) Cluster*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    // Carry the existing entries over to the larger table.
+    Cluster** dst = grown;
+    Cluster** const src_end = m_clusters + in_use;
+    for (Cluster** src = m_clusters; src != src_end; ++src, ++dst)
+      *dst = *src;
+
+    delete[] m_clusters;
+
+    m_clusters = grown;
+    m_clusterSize = grown_size;
+  }
+
+  if (m_clusterPreloadCount > 0) {
+    Cluster** const first_preloaded = m_clusters + m_clusterCount;
+    if (*first_preloaded == NULL || (*first_preloaded)->m_index >= 0)
+      return false;
+
+    Cluster** hole = first_preloaded + m_clusterPreloadCount;
+    if (hole >= (m_clusters + m_clusterSize))
+      return false;
+
+    // Slide every preloaded cluster up by one slot, working from the back.
+    do {
+      Cluster** const prev = hole - 1;
+      if ((*prev)->m_index >= 0)
+        return false;
+
+      *hole = *prev;
+      hole = prev;
+    } while (hole != first_preloaded);
+  }
+
+  m_clusters[slot] = pCluster;
+  ++m_clusterCount;
+  return true;
+}
+
+// Inserts a not-yet-indexed cluster at slot idx of m_clusters, which must
+// not lie inside the loaded range. The table is grown when full (2048 slots
+// initially, doubling afterwards) and the entries from idx upwards are
+// shifted one slot towards the end. Returns false on inconsistent state or
+// allocation failure.
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+  if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+    return false;
+
+  const long in_use = m_clusterCount + m_clusterPreloadCount;
+
+  if (m_clusterSize < in_use)
+    return false;
+
+  if (in_use >= m_clusterSize) {
+    const long grown_size = (m_clusterSize <= 0) ? 2048 : 2 * m_clusterSize;
+
+    Cluster** const grown = new (std::nothrow) Cluster*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    // Carry the existing entries over to the larger table.
+    Cluster** dst = grown;
+    Cluster** const src_end = m_clusters + in_use;
+    for (Cluster** src = m_clusters; src != src_end; ++src, ++dst)
+      *dst = *src;
+
+    delete[] m_clusters;
+
+    m_clusters = grown;
+    m_clusterSize = grown_size;
+  }
+
+  if (m_clusters == NULL)
+    return false;
+
+  Cluster** const target = m_clusters + idx;
+  Cluster** hole = m_clusters + in_use;
+
+  if (hole < target || hole >= (m_clusters + m_clusterSize))
+    return false;
+
+  // Open a gap at the target slot; only preloaded entries may be moved.
+  for (; hole > target; --hole) {
+    Cluster** const prev = hole - 1;
+
+    if ((*prev)->m_index >= 0)
+      return false;
+
+    *hole = *prev;
+  }
+
+  m_clusters[idx] = pCluster;
+  ++m_clusterPreloadCount;
+  return true;
+}
+
+// Parses the segment headers and then loads clusters until LoadCluster
+// reports either an error or that none remain. May only be called while the
+// cluster table is still untouched. Returns 0 on success,
+// E_BUFFER_NOT_FULL when the headers are not fully available, and another
+// negative value on error.
+long Segment::Load() {
+  const bool table_untouched =
+      (m_clusters == NULL) && (m_clusterSize == 0) && (m_clusterCount == 0);
+
+  if (!table_untouched)
+    return E_PARSE_FAILED;
+
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload. We need to find the
+  // inner (level 1) elements.
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0)  // error
+    return static_cast<long>(header_status);
+
+  if (header_status > 0)  // underflow
+    return E_BUFFER_NOT_FULL;
+
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  // Keep loading while each call succeeds with a new cluster (status 0).
+  long cluster_status;
+
+  do {
+    cluster_status = LoadCluster();
+  } while (cluster_status == 0);
+
+  // A positive status means "no more clusters", which is success here.
+  return (cluster_status < 0) ? cluster_status : 0;
+}
+ // Creates an entry with its id, position and element extent all zeroed.
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+ /*
+  * Records the owning segment and the location of the SeekHead element
+  * (payload start/size and whole-element start/size). Nothing is read here;
+  * the entry and void-element arrays stay empty until Parse() fills them.
+  */
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_entries(0),
+ m_entry_count(0),
+ m_void_elements(0),
+ m_void_element_count(0) {}
+ // Releases the entry and void-element arrays allocated by Parse().
+SeekHead::~SeekHead() {
+ delete[] m_entries;
+ delete[] m_void_elements;
+}
+ /*
+  * Parses the SeekHead payload [m_start, m_start + m_size) in two passes.
+  * The first pass counts the Seek and Void children so the arrays can be
+  * sized; the second pass fills them in. Only Seek children accepted by
+  * ParseEntry are kept, so m_entry_count can be smaller than the number
+  * counted in the first pass.
+  *
+  * Returns 0 on success; otherwise the error from ParseElementHeader,
+  * E_PARSE_FAILED when a count exceeds INT_MAX, E_FILE_FORMAT_INVALID when
+  * a child overruns the payload, or -1 when an allocation fails.
+  */
+long SeekHead::Parse() {
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = m_start;
+ const long long stop = m_start + m_size;
+
+ // first count the seek head entries
+
+ long long entry_count = 0;
+ long long void_element_count = 0;
+
+ while (pos < stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvSeek) {
+ ++entry_count;
+ if (entry_count > INT_MAX)
+ return E_PARSE_FAILED;
+ } else if (id == libwebm::kMkvVoid) {
+ ++void_element_count;
+ if (void_element_count > INT_MAX)
+ return E_PARSE_FAILED;
+ }
+
+ pos += size; // consume payload
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ // Allocate only the arrays that are needed; an empty kind keeps its NULL pointer.
+ if (entry_count > 0) {
+ m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)];
+
+ if (m_entries == NULL)
+ return -1;
+ }
+
+ if (void_element_count > 0) {
+ m_void_elements =
+ new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)];
+
+ if (m_void_elements == NULL)
+ return -1;
+ }
+
+ // now parse the entries and void elements
+
+ Entry* pEntry = m_entries;
+ VoidElement* pVoidElement = m_void_elements;
+
+ pos = m_start;
+
+ while (pos < stop) {
+ const long long idpos = pos;
+
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvSeek && entry_count > 0) {
+ if (ParseEntry(pReader, pos, size, pEntry)) {
+ Entry& e = *pEntry++;
+
+ e.element_start = idpos;
+ e.element_size = (pos + size) - idpos;
+ }
+ } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
+ VoidElement& e = *pVoidElement++;
+
+ e.element_start = idpos;
+ e.element_size = (pos + size) - idpos;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ // The final counts are how far each write cursor advanced through its array.
+ ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+ assert(count_ >= 0);
+ assert(static_cast<long long>(count_) <= entry_count);
+
+ m_entry_count = static_cast<int>(count_);
+
+ count_ = ptrdiff_t(pVoidElement - m_void_elements);
+ assert(count_ >= 0);
+ assert(static_cast<long long>(count_) <= void_element_count);
+
+ m_void_element_count = static_cast<int>(count_);
+
+ return 0;
+}
+
+// Number of Seek entries that Parse() accepted and stored.
+int SeekHead::GetCount() const {
+  return m_entry_count;
+}
+
+// Returns the stored Seek entry at idx, or a null pointer when idx is
+// outside [0, m_entry_count).
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_entry_count);
+
+  if (!in_range)
+    return 0;
+
+  return &m_entries[idx];
+}
+
+// Number of Void elements that Parse() recorded.
+int SeekHead::GetVoidElementCount() const {
+  return m_void_element_count;
+}
+
+// Returns the recorded Void element at idx, or a null pointer when idx is
+// outside [0, m_void_element_count).
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_void_element_count);
+
+  if (!in_range)
+    return 0;
+
+  return &m_void_elements[idx];
+}
+
+// Creates the Cues object for a Cues element located off bytes past the
+// start of the segment payload. Only the element header is read here; the
+// cue points themselves are not parsed.
+//
+// Returns 0 when m_pCues exists on return (already present, or newly
+// created). Returns 1 when the cues are deliberately skipped: the total
+// length is unknown, the element starts or ends past the end of the file,
+// or its payload is empty. Returns E_BUFFER_NOT_FULL when more data is
+// needed, with pos set to the read position. Returns another negative value
+// on error, including E_FILE_FORMAT_INVALID when the element at that offset
+// is not a Cues element or the reader reports more available bytes than the
+// total length.
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
+
+  if (off < 0)
+    return -1;
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  // Reject inconsistent reader lengths with an error code, as the other
+  // Segment parsers do; an assert would vanish in release builds.
+  if ((total >= 0) && (avail > total))
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_start + off;
+
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
+
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Read ID
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long idpos = pos;
+
+  const long long id = ReadID(m_pReader, idpos, len);
+
+  if (id != libwebm::kMkvCues)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume ID
+  // Guaranteed by the segment_stop check just above.
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Read Size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(m_pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
+
+  pos += len;  // consume length of size of element
+  // Guaranteed by the segment_stop check just above.
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Pos now points to start of payload
+
+  const long long element_stop = pos + size;
+
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
+
+  // Report the payload length so the caller knows how much must be buffered.
+  len = static_cast<long>(size);
+
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long element_size = element_stop - element_start;
+
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  if (m_pCues == NULL)
+    return -1;
+
+  return 0;  // success
+}
+ /*
+  * Parses one Seek element payload [start, start + size_) into *pEntry.
+  * The payload must consist of exactly a SeekID element followed by a
+  * SeekPosition element, ending precisely at the payload end; anything
+  * else yields false. On success pEntry->id and pEntry->pos are filled in.
+  * pEntry may already have been partially written when false is returned.
+  */
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+ Entry* pEntry) {
+ if (size_ <= 0)
+ return false;
+
+ long long pos = start;
+ const long long stop = start + size_;
+
+ long len;
+
+ // parse the container for the level-1 element ID
+
+ const long long seekIdId = ReadID(pReader, pos, len);
+ if (seekIdId < 0)
+ return false;
+
+ if (seekIdId != libwebm::kMkvSeekID)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume SeekID id
+
+ const long long seekIdSize = ReadUInt(pReader, pos, len);
+
+ if (seekIdSize <= 0)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume size of field
+
+ if ((pos + seekIdSize) > stop)
+ return false;
+ // The SeekID payload is itself an EBML ID; its encoded length must equal the declared payload size.
+ pEntry->id = ReadID(pReader, pos, len); // payload
+
+ if (pEntry->id <= 0)
+ return false;
+
+ if (len != seekIdSize)
+ return false;
+
+ pos += seekIdSize; // consume SeekID payload
+
+ const long long seekPosId = ReadID(pReader, pos, len);
+
+ if (seekPosId != libwebm::kMkvSeekPosition)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume id
+
+ const long long seekPosSize = ReadUInt(pReader, pos, len);
+
+ if (seekPosSize <= 0)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume size
+
+ if ((pos + seekPosSize) > stop)
+ return false;
+
+ pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize);
+
+ if (pEntry->pos < 0)
+ return false;
+
+ pos += seekPosSize; // consume payload
+
+ if (pos != stop)
+ return false;
+
+ return true;
+}
+ /*
+  * Records the owning segment and the location of the Cues element
+  * (payload start/size and whole-element start/size). Nothing is read
+  * here: the cue-point table starts empty and the parse cursor m_pos
+  * begins at the start of the payload.
+  */
+Cues::Cues(Segment* pSegment, long long start_, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start_),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_cue_points(NULL),
+ m_count(0),
+ m_preload_count(0),
+ m_pos(start_) {}
+
+// Destroys every cue point in the table (loaded and preloaded) and then the
+// table itself.
+Cues::~Cues() {
+  CuePoint** const first = m_cue_points;
+  CuePoint** const last = first + (m_count + m_preload_count);
+
+  for (CuePoint** it = first; it != last; ++it) {
+    CuePoint* const pCP = *it;
+    assert(pCP);
+
+    delete pCP;
+  }
+
+  delete[] m_cue_points;
+}
+
+// Returns the number of loaded cue points, or -1 while no cue-point table
+// has been allocated.
+long Cues::GetCount() const {
+  // TODO: really ignore preload count?
+  return (m_cue_points != NULL) ? m_count : -1;
+}
+
+// True once the parse cursor has reached the end of the Cues payload.
+bool Cues::DoneParsing() const {
+  return m_pos >= (m_start + m_size);
+}
+
+// Scans the whole Cues payload once and preloads a CuePoint object for each
+// CuePoint child found. Returns true when the table already exists or the
+// scan completes; false when the counters are in an unexpected state, a
+// child header is unreadable or overruns the payload, or preloading fails.
+bool Cues::Init() const {
+  if (m_cue_points != NULL)
+    return true;
+
+  if (m_count != 0 || m_preload_count != 0)
+    return false;
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  const long long payload_end = m_start + m_size;
+  long table_capacity = 0;
+
+  long long cursor = m_start;
+
+  while (cursor < payload_end) {
+    const long long element_pos = cursor;
+
+    long field_len;
+
+    const long long id = ReadID(reader, cursor, field_len);
+    if (id < 0 || (cursor + field_len) > payload_end)
+      return false;
+
+    cursor += field_len;  // consume ID
+
+    const long long payload_size = ReadUInt(reader, cursor, field_len);
+    if (payload_size < 0 || (cursor + field_len > payload_end))
+      return false;
+
+    cursor += field_len;  // consume Size field
+    if (cursor + payload_size > payload_end)
+      return false;
+
+    if (id == libwebm::kMkvCuePoint &&
+        !PreloadCuePoint(table_capacity, element_pos)) {
+      return false;
+    }
+
+    cursor += payload_size;  // skip payload
+  }
+
+  return true;
+}
+
+// Appends a new CuePoint placeholder for the element at pos to the table,
+// growing the table when it is full (2048 slots initially, doubling
+// afterwards). cue_points_size is the caller-held table capacity and is
+// updated on growth. Fails once any cue point has been loaded, or when an
+// allocation fails.
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+  if (m_count != 0)
+    return false;
+
+  if (m_preload_count >= cue_points_size) {
+    const long grown_size =
+        (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
+
+    CuePoint** const grown = new (std::nothrow) CuePoint*[grown_size];
+    if (grown == NULL)
+      return false;
+
+    // Carry the existing entries over to the larger table.
+    CuePoint** dst = grown;
+    CuePoint** const src_end = m_cue_points + m_preload_count;
+    for (CuePoint** src = m_cue_points; src != src_end; ++src, ++dst)
+      *dst = *src;
+
+    delete[] m_cue_points;
+
+    m_cue_points = grown;
+    cue_points_size = grown_size;
+  }
+
+  CuePoint* const created = new (std::nothrow) CuePoint(m_preload_count, pos);
+  if (created == NULL)
+    return false;
+
+  m_cue_points[m_preload_count] = created;
+  ++m_preload_count;
+  return true;
+}
+ /*
+  * Loads the next cue point, advancing m_pos through the Cues payload.
+  * Init() is run first so that the preloaded table exists. Returns true
+  * when one cue point was loaded. Returns false when the payload is
+  * exhausted, Init() or CuePoint::Load fails (m_pos is then moved to the
+  * end so no further parsing is attempted), or a child header is
+  * unreadable or overruns the payload.
+  */
+bool Cues::LoadCuePoint() const {
+ const long long stop = m_start + m_size;
+
+ if (m_pos >= stop)
+ return false; // nothing else to do
+
+ if (!Init()) {
+ m_pos = stop;
+ return false;
+ }
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ while (m_pos < stop) {
+ const long long idpos = m_pos;
+
+ long len;
+
+ const long long id = ReadID(pReader, m_pos, len);
+ if (id < 0 || (m_pos + len) > stop)
+ return false;
+
+ m_pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, m_pos, len);
+ if (size < 0 || (m_pos + len) > stop)
+ return false;
+
+ m_pos += len; // consume Size field
+ if ((m_pos + size) > stop)
+ return false;
+
+ if (id != libwebm::kMkvCuePoint) {
+ m_pos += size; // consume payload
+ if (m_pos > stop)
+ return false;
+
+ continue;
+ }
+
+ if (m_preload_count < 1)
+ return false;
+ // The next preloaded entry sits at index m_count; a negative time code must be the negated position of this element.
+ CuePoint* const pCP = m_cue_points[m_count];
+ if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos)))
+ return false;
+
+ if (!pCP->Load(pReader)) {
+ m_pos = stop;
+ return false;
+ }
+ ++m_count;
+ --m_preload_count;
+
+ m_pos += size; // consume payload
+ if (m_pos > stop)
+ return false;
+
+ return true; // yes, we loaded a cue point
+ }
+
+ return false; // no, we did not load a cue point
+}
+ /*
+  * Looks up the loaded cue point that applies to time_ns and the track
+  * position within it for pTrack. When time_ns is at or before the first
+  * cue point's time, the first cue point is used; otherwise a binary
+  * search selects the last cue point whose time is <= time_ns.
+  *
+  * Returns true with pCP and pTP set when that cue point has a position
+  * for pTrack. Returns false on bad arguments, an empty table, a NULL
+  * table entry, or when the track is not found. pCP may have been
+  * modified even when false is returned.
+  */
+bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
+ const CuePoint::TrackPosition*& pTP) const {
+ if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
+ return false;
+
+ CuePoint** const ii = m_cue_points;
+ CuePoint** i = ii;
+
+ CuePoint** const jj = ii + m_count;
+ CuePoint** j = jj;
+
+ pCP = *i;
+ if (pCP == NULL)
+ return false;
+
+ if (time_ns <= pCP->GetTime(m_pSegment)) {
+ pTP = pCP->Find(pTrack);
+ return (pTP != NULL);
+ }
+
+ while (i < j) {
+ // INVARIANT:
+ //[ii, i) <= time_ns
+ //[i, j) ?
+ //[j, jj) > time_ns
+
+ CuePoint** const k = i + (j - i) / 2;
+ if (k >= jj)
+ return false;
+
+ CuePoint* const pCP = *k;
+ if (pCP == NULL)
+ return false;
+
+ const long long t = pCP->GetTime(m_pSegment);
+
+ if (t <= time_ns)
+ i = k + 1;
+ else
+ j = k;
+
+ if (i > j)
+ return false;
+ }
+
+ if (i != j || i > jj || i <= ii)
+ return false;
+ // i is now the first entry later than time_ns, so the entry just before it is the match.
+ pCP = *--i;
+
+ if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+ return false;
+
+ // TODO: here and elsewhere, it's probably not correct to search
+ // for the cue point with this time, and then search for a matching
+ // track. In principle, the matching track could be on some earlier
+ // cue point, and with our current algorithm, we'd miss it. To make
+ // this bullet-proof, we'd need to create a secondary structure,
+ // with a list of cue points that apply to a track, and then search
+ // that track-based structure for a matching cue point.
+
+ pTP = pCP->Find(pTrack);
+ return (pTP != NULL);
+}
+
+// Returns the first loaded cue point, or NULL when nothing is loaded or the
+// first slot is empty / still holds an unloaded (negative timecode) entry.
+const CuePoint* Cues::GetFirst() const {
+  const bool has_loaded_entries = (m_cue_points != NULL) && (m_count != 0);
+  if (!has_loaded_entries)
+    return NULL;
+
+  CuePoint* const first = m_cue_points[0];
+  const bool usable = (first != NULL) && (first->GetTimeCode() >= 0);
+
+  return usable ? first : NULL;
+}
+
+// Returns the last loaded cue point (index m_count - 1), or NULL when nothing
+// is loaded or that slot is empty / still holds an unloaded entry.
+const CuePoint* Cues::GetLast() const {
+  if (m_cue_points == NULL)
+    return NULL;
+
+  if (m_count <= 0)
+    return NULL;
+
+  CuePoint* const last = m_cue_points[m_count - 1];
+  const bool usable = (last != NULL) && (last->GetTimeCode() >= 0);
+
+  return usable ? last : NULL;
+}
+
+// Returns the loaded cue point that follows pCurr, or NULL when pCurr is
+// NULL or unloaded, does not sit at its recorded index in this table, is the
+// last loaded entry, or the successor is empty / unloaded.
+const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
+  if (pCurr == NULL)
+    return NULL;
+
+  if (pCurr->GetTimeCode() < 0)
+    return NULL;
+
+  if (m_cue_points == NULL || m_count < 1)
+    return NULL;
+
+  const long curr_idx = pCurr->m_index;
+  if (curr_idx >= m_count)
+    return NULL;
+
+  // pCurr must be the entry stored at its own index.
+  if (m_cue_points[curr_idx] != pCurr)
+    return NULL;
+
+  const long next_idx = curr_idx + 1;
+  if (next_idx >= m_count)
+    return NULL;
+
+  CuePoint* const successor = m_cue_points[next_idx];
+  const bool usable = (successor != NULL) && (successor->GetTimeCode() >= 0);
+
+  return usable ? successor : NULL;
+}
+
+// Forwards to Segment::GetBlock for the given cue point / track position
+// pair. Returns NULL when either argument is NULL.
+const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
+                                 const CuePoint::TrackPosition* pTP) const {
+  const bool have_both = (pCP != NULL) && (pTP != NULL);
+
+  return have_both ? m_pSegment->GetBlock(*pCP, *pTP) : NULL;
+}
+
+// Returns the block entry referenced by (cp, tp).
+//
+// The cluster at position tp.m_pos is looked up by binary search across the
+// loaded and preloaded clusters. When no cluster with that position exists
+// yet, one is created and inserted via PreloadCluster at the position the
+// search ended on. Returns NULL when the cluster cannot be created or
+// inserted.
+const BlockEntry* Segment::GetBlock(const CuePoint& cp,
+                                    const CuePoint::TrackPosition& tp) {
+  const long total = m_clusterCount + m_clusterPreloadCount;
+
+  //   [0, lo)       position <  tp.m_pos
+  //   [lo, hi)      not examined yet
+  //   [hi, total)   position >  tp.m_pos
+  long lo = 0;
+  long hi = total;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < total);
+
+    Cluster* const candidate = m_clusters[mid];
+    assert(candidate);
+
+    const long long candidate_pos = candidate->GetPosition();
+    assert(candidate_pos >= 0);
+
+    if (candidate_pos == tp.m_pos)
+      return candidate->GetEntry(cp, tp);
+
+    if (candidate_pos < tp.m_pos)
+      lo = mid + 1;
+    else
+      hi = mid;
+  }
+
+  assert(lo == hi);
+
+  // No cluster object for that position yet: create one as "preloaded".
+  Cluster* const fresh = Cluster::Create(this, -1, tp.m_pos);
+  if (fresh == NULL)
+    return NULL;
+
+  const ptrdiff_t idx = lo;  // insertion position
+
+  if (!PreloadCluster(fresh, idx)) {
+    delete fresh;
+    return NULL;
+  }
+
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == fresh);
+
+  return fresh->GetEntry(cp, tp);
+}
+
+// Returns the cluster whose position equals requested_pos.
+//
+// The loaded and preloaded clusters are binary-searched by position. When
+// none matches, a new cluster is created and inserted via PreloadCluster at
+// the position the search ended on. Returns 0/NULL for a negative
+// requested_pos or when the cluster cannot be created or inserted.
+const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) {
+  if (requested_pos < 0)
+    return 0;
+
+  const long total = m_clusterCount + m_clusterPreloadCount;
+
+  //   [0, lo)       position <  requested_pos
+  //   [lo, hi)      not examined yet
+  //   [hi, total)   position >  requested_pos
+  long lo = 0;
+  long hi = total;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < total);
+
+    Cluster* const candidate = m_clusters[mid];
+    assert(candidate);
+
+    const long long candidate_pos = candidate->GetPosition();
+    assert(candidate_pos >= 0);
+
+    if (candidate_pos == requested_pos)
+      return candidate;
+
+    if (candidate_pos < requested_pos)
+      lo = mid + 1;
+    else
+      hi = mid;
+  }
+
+  assert(lo == hi);
+
+  // No cluster object for that position yet: create one as "preloaded".
+  Cluster* const fresh = Cluster::Create(this, -1, requested_pos);
+  if (fresh == NULL)
+    return NULL;
+
+  const ptrdiff_t idx = lo;  // insertion position
+
+  if (!PreloadCluster(fresh, idx)) {
+    delete fresh;
+    return NULL;
+  }
+
+  assert(m_clusters);
+  assert(m_clusterPreloadCount > 0);
+  assert(m_clusters[idx] == fresh);
+
+  return fresh;
+}
+
+// Creates an unloaded cue point for the element located at file position
+// pos. Until Load() succeeds, m_timecode holds the negated position, which
+// is why pos must be strictly positive.
+CuePoint::CuePoint(long idx, long long pos)
+    : m_element_start(0),
+      m_element_size(0),
+      m_index(idx),
+      m_timecode(-pos),  // negative value marks "not loaded yet"
+      m_track_positions(NULL),
+      m_track_positions_count(0) {
+  assert(pos > 0);
+}
+
+// Releases the track-position array allocated by Load().
+CuePoint::~CuePoint() {
+  delete[] m_track_positions;
+}
+
+// Parses the CuePoint element whose file position was recorded (negated) in
+// m_timecode by the constructor.
+//
+// The payload is scanned twice: the first pass reads CueTime and counts the
+// CueTrackPositions children, the second pass parses each of them into the
+// freshly allocated m_track_positions array.
+//
+// Returns true when the cue point is (or already was) loaded, false on any
+// read error, malformed element, or allocation failure. Note that m_timecode
+// is overwritten as soon as a CueTime child is seen, so after a failure in a
+// later step the object may no longer look "unloaded".
+bool CuePoint::Load(IMkvReader* pReader) {
+  if (m_timecode >= 0)  // already loaded
+    return true;
+
+  assert(m_track_positions == NULL);
+  assert(m_track_positions_count == 0);
+
+  long long pos_ = -m_timecode;  // recover the element's file position
+  const long long element_start = pos_;
+
+  long long stop;
+
+  {
+    long len;
+
+    const long long id = ReadID(pReader, pos_, len);
+    if (id != libwebm::kMkvCuePoint)
+      return false;
+
+    pos_ += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos_, len);
+    if (size < 0)  // read/parse error (was only an assert before)
+      return false;
+
+    pos_ += len;  // consume Size field
+    // pos_ now points to start of payload
+
+    stop = pos_ + size;
+  }
+
+  const long long element_size = stop - element_start;
+
+  long long pos = pos_;
+
+  // First pass: read the timecode and count the track positions.
+  unsigned long long track_positions_count = 0;
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadID(pReader, pos, len);
+    if ((id < 0) || (pos + len > stop)) {
+      return false;
+    }
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    if ((size < 0) || (pos + len > stop)) {
+      return false;
+    }
+
+    pos += len;  // consume Size field
+    if ((pos + size) > stop) {
+      return false;
+    }
+
+    if (id == libwebm::kMkvCueTime) {
+      m_timecode = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvCueTrackPositions) {
+      ++track_positions_count;
+      // This function returns bool: returning a non-zero error code here
+      // would be converted to true and reported as success.
+      if (track_positions_count > UINT_MAX)
+        return false;
+    }
+
+    pos += size;  // consume payload
+  }
+
+  m_track_positions_count = static_cast<size_t>(track_positions_count);
+
+  if (m_timecode < 0 || m_track_positions_count <= 0) {
+    return false;
+  }
+
+  m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count];
+  if (m_track_positions == NULL)
+    return false;
+
+  // Second pass: parse the track positions.
+
+  TrackPosition* p = m_track_positions;
+  TrackPosition* const p_end = m_track_positions + m_track_positions_count;
+  pos = pos_;
+
+  while (pos < stop) {
+    long len;
+
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return false;
+
+    pos += len;  // consume ID
+
+    // Checked at run time rather than asserted, so release builds do not
+    // walk past the element when the reader reports an error.
+    const long long size = ReadUInt(pReader, pos, len);
+    if (size < 0 || (pos + len) > stop)
+      return false;
+
+    pos += len;  // consume Size field
+    if ((pos + size) > stop)
+      return false;
+
+    if (id == libwebm::kMkvCueTrackPositions) {
+      if (p == p_end)  // more entries than counted in the first pass
+        return false;
+
+      TrackPosition& tp = *p++;
+      if (!tp.Parse(pReader, pos, size)) {
+        return false;
+      }
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return false;
+  }
+
+  if (p != p_end)  // fewer entries than counted in the first pass
+    return false;
+
+  m_element_start = element_start;
+  m_element_size = element_size;
+
+  return true;
+}
+
+// Parses one CueTrackPositions payload spanning [start_, start_ + size_).
+//
+// Fills m_track, m_pos and m_block from the CueTrack, CueClusterPosition and
+// CueBlockNumber children (m_block defaults to 1 when absent). Returns false
+// when a child header is unreadable or overruns the payload, or when the
+// resulting values are out of range.
+bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
+                                    long long size_) {
+  const long long payload_end = start_ + size_;
+
+  m_track = -1;
+  m_pos = -1;
+  m_block = 1;  // default
+
+  for (long long cursor = start_; cursor < payload_end;) {
+    long field_len;
+
+    const long long child_id = ReadID(pReader, cursor, field_len);
+    if (child_id < 0)
+      return false;
+    if ((cursor + field_len) > payload_end)
+      return false;
+
+    cursor += field_len;  // step over the ID
+
+    const long long child_size = ReadUInt(pReader, cursor, field_len);
+    if (child_size < 0)
+      return false;
+    if ((cursor + field_len) > payload_end)
+      return false;
+
+    cursor += field_len;  // step over the size field
+    if ((cursor + child_size) > payload_end)
+      return false;
+
+    switch (child_id) {
+      case libwebm::kMkvCueTrack:
+        m_track = UnserializeUInt(pReader, cursor, child_size);
+        break;
+      case libwebm::kMkvCueClusterPosition:
+        m_pos = UnserializeUInt(pReader, cursor, child_size);
+        break;
+      case libwebm::kMkvCueBlockNumber:
+        m_block = UnserializeUInt(pReader, cursor, child_size);
+        break;
+      default:
+        break;  // other children are skipped
+    }
+
+    cursor += child_size;  // step over the payload
+  }
+
+  // Negative values here are either "never set" or an error returned by
+  // UnserializeUInt.
+  return (m_pos >= 0) && (m_track > 0) && (m_block >= 0) &&
+         (m_block <= LONG_MAX);
+}
+
+// Returns the first track position of this cue point whose track number
+// equals pTrack->GetNumber(), or NULL when pTrack is NULL or nothing matches.
+const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
+  if (pTrack == NULL)
+    return NULL;
+
+  const long long wanted = pTrack->GetNumber();
+
+  for (size_t idx = 0; idx < m_track_positions_count; ++idx) {
+    const TrackPosition& candidate = m_track_positions[idx];
+
+    if (candidate.m_track == wanted)
+      return &candidate;
+  }
+
+  return NULL;  // no entry for that track number
+}
+
+// Raw timecode slot. While the cue point is still unloaded it holds the
+// negated file position of the element instead of a timecode.
+long long CuePoint::GetTimeCode() const {
+  return m_timecode;
+}
+
+// Returns the timecode multiplied by the segment's timecode scale. Requires
+// a loaded cue point and a segment that has a SegmentInfo (both asserted).
+long long CuePoint::GetTime(const Segment* pSegment) const {
+  assert(pSegment);
+  assert(m_timecode >= 0);
+
+  const SegmentInfo* const info = pSegment->GetInfo();
+  assert(info);
+
+  const long long timecode_scale = info->GetTimeCodeScale();
+  assert(timecode_scale >= 1);
+
+  return timecode_scale * m_timecode;
+}
+
+// Reports whether the parse cursor m_pos has reached the end of the segment.
+//
+// With a known segment size the end is m_start + m_size. With an unknown
+// size (m_size < 0) the reader's total length is used instead: a reader error
+// counts as "done", an unknown total length counts as "not done".
+bool Segment::DoneParsing() const {
+  if (m_size >= 0)
+    return m_pos >= (m_start + m_size);
+
+  long long total, avail;
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)
+    return true;  // reader error: must assume done
+
+  if (total < 0)
+    return false;  // length unknown: assume live stream
+
+  return m_pos >= total;
+}
+
+// Returns the first loaded cluster, or the end-of-stream sentinel m_eos when
+// no cluster has been loaded.
+const Cluster* Segment::GetFirst() const {
+  const bool have_clusters = (m_clusters != NULL) && (m_clusterCount > 0);
+  if (!have_clusters)
+    return &m_eos;
+
+  Cluster* const first = m_clusters[0];
+  assert(first);
+
+  return first;
+}
+
+// Returns the last loaded cluster (index m_clusterCount - 1), or the
+// end-of-stream sentinel m_eos when no cluster has been loaded.
+const Cluster* Segment::GetLast() const {
+  const bool have_clusters = (m_clusters != NULL) && (m_clusterCount > 0);
+  if (!have_clusters)
+    return &m_eos;
+
+  Cluster* const last = m_clusters[m_clusterCount - 1];
+  assert(last);
+
+  return last;
+}
+
+// Number of loaded clusters (preloaded ones are not counted).
+unsigned long Segment::GetCount() const {
+  return m_clusterCount;
+}
+
+// Returns the cluster that follows pCurr.
+//
+// For a loaded cluster (m_index >= 0) this is simply the next entry of
+// m_clusters, or the m_eos sentinel when pCurr is the last loaded one.
+//
+// For a preloaded cluster (m_index < 0) the stream is re-read: pCurr's own
+// element is skipped, the following elements are scanned for the first
+// Cluster for which HasBlockEntries reports a positive status, and that
+// cluster is then looked up in (or inserted into) the preloaded range of
+// m_clusters.
+//
+// Returns NULL/0 when reading fails, no such cluster exists, or the new
+// cluster cannot be created or inserted.
+//
+// NOTE(review): most stream validation below is done with assert() only, so
+// it is compiled out when NDEBUG is defined.
+const Cluster* Segment::GetNext(const Cluster* pCurr) {
 assert(pCurr);
 assert(pCurr != &m_eos);
 assert(m_clusters);
+
 long idx = pCurr->m_index;
+
+ // Fast path: pCurr is a loaded cluster, its successor is the next slot.
 if (idx >= 0) {
 assert(m_clusterCount > 0);
 assert(idx < m_clusterCount);
 assert(pCurr == m_clusters[idx]);
+
 ++idx;
+
 if (idx >= m_clusterCount)
 return &m_eos; // caller will LoadCluster as desired
+
 Cluster* const pNext = m_clusters[idx];
 assert(pNext);
 assert(pNext->m_index >= 0);
 assert(pNext->m_index == idx);
+
 return pNext;
 }
+
+ // Slow path: pCurr is only preloaded, so the stream has to be walked.
 assert(m_clusterPreloadCount > 0);
+
 long long pos = pCurr->m_element_start;
+
 assert(m_size >= 0); // TODO
 const long long stop = m_start + m_size; // end of segment
+
+ // Step 1: re-read pCurr's ID and size to find where its element ends.
 {
 long len;
+
 long long result = GetUIntLength(m_pReader, pos, len);
 assert(result == 0);
 assert((pos + len) <= stop); // TODO
 if (result != 0)
 return NULL;
+
 const long long id = ReadID(m_pReader, pos, len);
 if (id != libwebm::kMkvCluster)
 return NULL;
+
 pos += len; // consume ID
+
 // Read Size
 result = GetUIntLength(m_pReader, pos, len);
 assert(result == 0); // TODO
 assert((pos + len) <= stop); // TODO
+
 const long long size = ReadUInt(m_pReader, pos, len);
 assert(size > 0); // TODO
 // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
 pos += len; // consume length of size of element
 assert((pos + size) <= stop); // TODO
+
 // Pos now points to start of payload
+
 pos += size; // consume payload
 }
+
+ // Step 2: scan forward for the next Cluster that has block entries.
+ // off_next stays 0 when none is found; otherwise it is the cluster's
+ // offset relative to m_start.
 long long off_next = 0;
+
 while (pos < stop) {
 long len;
+
 long long result = GetUIntLength(m_pReader, pos, len);
 assert(result == 0);
 assert((pos + len) <= stop); // TODO
 if (result != 0)
 return NULL;
+
 const long long idpos = pos; // pos of next (potential) cluster
+
 const long long id = ReadID(m_pReader, idpos, len);
 if (id < 0)
 return NULL;
+
 pos += len; // consume ID
+
 // Read Size
 result = GetUIntLength(m_pReader, pos, len);
 assert(result == 0); // TODO
 assert((pos + len) <= stop); // TODO
+
 const long long size = ReadUInt(m_pReader, pos, len);
 assert(size >= 0); // TODO
+
 pos += len; // consume length of size of element
 assert((pos + size) <= stop); // TODO
+
 // Pos now points to start of payload
+
 if (size == 0) // weird
 continue;
+
 if (id == libwebm::kMkvCluster) {
 const long long off_next_ = idpos - m_start;
+
 long long pos_;
 long len_;
+
 const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_);
+
 assert(status >= 0);
+
 if (status > 0) {
 off_next = off_next_;
 break;
 }
 }
+
 pos += size; // consume payload
 }
+
 if (off_next <= 0)
 return 0;
+
+ // Step 3: binary-search the preloaded range of m_clusters, i.e. the
+ // entries that follow the m_clusterCount loaded ones, for off_next.
 Cluster** const ii = m_clusters + m_clusterCount;
 Cluster** i = ii;
+
 Cluster** const jj = ii + m_clusterPreloadCount;
 Cluster** j = jj;
+
 while (i < j) {
 // INVARIANT:
 //[0, i) < pos_next
 //[i, j) ?
 //[j, jj) > pos_next
+
 Cluster** const k = i + (j - i) / 2;
 assert(k < jj);
+
 Cluster* const pNext = *k;
 assert(pNext);
 assert(pNext->m_index < 0);
+
 // const long long pos_ = pNext->m_pos;
 // assert(pos_);
 // pos = pos_ * ((pos_ < 0) ? -1 : 1);
+
 pos = pNext->GetPosition();
+
 if (pos < off_next)
 i = k + 1;
 else if (pos > off_next)
 j = k;
 else
 return pNext;
 }
+
 assert(i == j);
+
+ // Step 4: not preloaded yet; create the cluster and insert it where the
+ // search ended.
 Cluster* const pNext = Cluster::Create(this, -1, off_next);
 if (pNext == NULL)
 return NULL;
+
 const ptrdiff_t idx_next = i - m_clusters; // insertion position
+
 if (!PreloadCluster(pNext, idx_next)) {
 delete pNext;
 return NULL;
 }
 assert(m_clusters);
 assert(idx_next < m_clusterSize);
 assert(m_clusters[idx_next] == pNext);
+
 return pNext;
+}
+
+// Finds the cluster that follows pCurr, parsing more of the stream if needed.
+//
+// Returns 0 and sets pResult on success, 1 when there are no more clusters,
+// and a negative value on error. When E_BUFFER_NOT_FULL is returned from the
+// code below, pos/len describe the bytes that were needed.
+//
+// If pCurr is a loaded cluster the answer is either the next loaded cluster
+// or whatever LoadCluster() produces. If pCurr is only preloaded, the end of
+// its element is determined (from m_element_size, or by re-reading its
+// header) and DoParseNext() is called repeatedly from there until it reports
+// a result (status <= 1).
+long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
+                        long long& pos, long& len) {
+  assert(pCurr);
+  assert(!pCurr->EOS());
+  assert(m_clusters);
+
+  pResult = 0;
+
+  if (pCurr->m_index >= 0) {  // loaded (not merely preloaded)
+    assert(m_clusters[pCurr->m_index] == pCurr);
+
+    const long next_idx = pCurr->m_index + 1;
+
+    if (next_idx < m_clusterCount) {
+      pResult = m_clusters[next_idx];
+      return 0;  // success
+    }
+
+    // curr cluster is last among loaded
+
+    const long result = LoadCluster(pos, len);
+
+    if (result < 0)  // error or underflow
+      return result;
+
+    if (result > 0)  // no more clusters
+      return 1;
+
+    pResult = GetLast();
+    return 0;  // success
+  }
+
+  assert(m_pos > 0);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // -1 means the segment size is unknown.
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // interrogate curr cluster
+
+  pos = pCurr->m_element_start;
+
+  if (pCurr->m_element_size >= 0) {
+    pos += pCurr->m_element_size;
+  } else {
+    // Element size not recorded: re-read the cluster header to skip it.
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    // The ID must be read with ReadID, as everywhere else in this file where
+    // a value is compared against a libwebm::kMkv* constant. The original
+    // code used ReadUInt here, which (assuming it decodes an EBML size and
+    // strips the length-marker bits) can never yield kMkvCluster.
+    const long long id = ReadID(m_pReader, pos, len);
+
+    if (id != libwebm::kMkvCluster)
+      return -1;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    // All value bits set in a size field of this length.
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)  // TODO: should never happen
+      return E_FILE_FORMAT_INVALID;  // TODO: resolve this
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // Pos now points to start of payload
+
+    pos += size;  // consume payload (that is, the current cluster)
+    if (segment_stop >= 0 && pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    // By consuming the payload, we are assuming that the curr
+    // cluster isn't interesting. That is, we don't bother checking
+    // whether the payload of the curr cluster is less than what
+    // happens to be available (obtained via IMkvReader::Length).
+    // Presumably the caller has already dispensed with the current
+    // cluster, and really does want the next cluster.
+  }
+
+  // pos now points to just beyond the last fully-loaded cluster
+
+  for (;;) {
+    const long status = DoParseNext(pResult, pos, len);
+
+    // 2 means "skipped an empty cluster, try again from the new pos".
+    if (status <= 1)
+      return status;
+  }
+}
+
+// Parses forward from pos until the next Cluster element is found and
+// resolves it to a cluster object.
+//
+// Return values:
+//   0  pResult is set to the next cluster (already preloaded or just created)
+//   1  end of file / end of segment reached
+//   2  the cluster found has no block entries; pos has been moved past it and
+//      the caller should call again
+//  <0  error (E_BUFFER_NOT_FULL, E_FILE_FORMAT_INVALID, reader errors, -1)
+//
+// A Cues element met on the way is recorded in m_pCues if none exists yet;
+// other non-cluster elements are skipped.
+long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // -1 means the segment size is unknown.
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Parse next cluster. This is strictly a parsing activity.
+  // Creation of a new cluster object happens later, after the
+  // parsing is done.
+
+  long long off_next = 0;  // cluster offset relative to m_start
+  long long cluster_size = -1;  // stays -1 while the size is unknown
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return 1;  // EOF
+
+    if ((segment_stop >= 0) && (pos >= segment_stop))
+      return 1;  // EOF
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;  // absolute
+    const long long idoff = pos - m_start;  // relative
+
+    const long long id = ReadID(m_pReader, idpos, len);  // absolute
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // weird
+      return -1;  // generic error
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    // All value bits set in a size field of this length.
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == libwebm::kMkvCues) {
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_stop = pos + size;
+
+      if ((segment_stop >= 0) && (element_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_start = idpos;
+      const long long element_size = element_stop - element_start;
+
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+        // This function returns long with 0 meaning success, so an
+        // allocation failure must be a negative code (the original
+        // "return false" was 0 and left pResult unset).
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    if (id != libwebm::kMkvCluster) {  // not a Cluster ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    // We have a cluster.
+    off_next = idoff;
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  assert(off_next > 0);  // have cluster
+
+  // We have parsed the next cluster.
+  // We have not created a cluster object yet. What we need
+  // to do now is determine whether it has already been preloaded
+  // (in which case, an object for this cluster has already been
+  // created), and if not, create a new cluster object.
+
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
+
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj) > pos_next
+
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    const Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
+
+    // Use a local for the probe position. The original wrote it into pos,
+    // which destroyed the absolute payload cursor that the "no block
+    // entries" handling below still needs.
+    const long long next_pos = pNext->GetPosition();
+    assert(next_pos >= 0);
+
+    if (next_pos < off_next) {
+      i = k + 1;
+    } else if (next_pos > off_next) {
+      j = k;
+    } else {
+      pos = next_pos;  // same value the original left in pos on this path
+      pResult = pNext;
+      return 0;  // success
+    }
+  }
+
+  assert(i == j);
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+
+  if (status < 0) {  // error or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  if (status > 0) {  // means "found at least one block entry"
+    Cluster* const pNext = Cluster::Create(this,
+                                           -1,  // preloaded
+                                           off_next);
+    if (pNext == NULL)
+      return -1;
+
+    const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+    if (!PreloadCluster(pNext, idx_next)) {
+      delete pNext;
+      return -1;
+    }
+    assert(m_clusters);
+    assert(idx_next < m_clusterSize);
+    assert(m_clusters[idx_next] == pNext);
+
+    pResult = pNext;
+    return 0;  // success
+  }
+
+  // status == 0 means "no block entries found"
+  // pos still points at the start of this cluster's payload.
+
+  if (cluster_size < 0) {  // unknown size
+    const long long payload_pos = pos;  // absolute pos of cluster payload
+
+    for (;;) {  // determine cluster size
+      if ((total >= 0) && (pos >= total))
+        break;
+
+      if ((segment_stop >= 0) && (pos >= segment_stop))
+        break;  // no more clusters
+
+      // Read ID
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      long long result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long idpos = pos;
+      const long long id = ReadID(m_pReader, idpos, len);
+
+      if (id < 0)  // error (or underflow)
+        return static_cast<long>(id);
+
+      // This is the distinguished set of ID's we use to determine
+      // that we have exhausted the sub-elements inside the cluster
+      // whose ID we parsed earlier.
+
+      if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues)
+        break;
+
+      pos += len;  // consume ID (of sub-element)
+
+      // Read Size
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long size = ReadUInt(m_pReader, pos, len);
+
+      if (size < 0)  // error
+        return static_cast<long>(size);
+
+      pos += len;  // consume size field of element
+
+      // pos now points to start of sub-element's payload
+
+      if (size == 0)  // weird
+        continue;
+
+      const long long unknown_size = (1LL << (7 * len)) - 1;
+
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // not allowed for sub-elements
+
+      if ((segment_stop >= 0) && ((pos + size) > segment_stop))  // weird
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload of sub-element
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+    }  // determine cluster size
+
+    cluster_size = pos - payload_pos;
+    assert(cluster_size >= 0);  // TODO: handle cluster_size = 0
+
+    pos = payload_pos;  // reset and re-parse original cluster
+  }
+
+  pos += cluster_size;  // consume payload
+  if (segment_stop >= 0 && pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 2;  // try to find a cluster that follows next
+}
+
+// Returns the last loaded cluster whose time is <= time_ns.
+//
+// Returns the m_eos sentinel when no cluster is loaded, and the first cluster
+// when time_ns is at or before its time.
+const Cluster* Segment::FindCluster(long long time_ns) const {
+  const bool have_clusters = (m_clusters != NULL) && (m_clusterCount > 0);
+  if (!have_clusters)
+    return &m_eos;
+
+  Cluster* const first = m_clusters[0];
+  assert(first);
+  assert(first->m_index == 0);
+
+  if (time_ns <= first->GetTime())
+    return first;
+
+  // Binary search over the loaded clusters.
+  //   [0, lo)                time <= time_ns
+  //   [lo, hi)               not examined yet
+  //   [hi, m_clusterCount)   time >  time_ns
+  long lo = 0;
+  long hi = m_clusterCount;
+
+  while (lo < hi) {
+    const long mid = lo + (hi - lo) / 2;
+    assert(mid < m_clusterCount);
+
+    Cluster* const probe = m_clusters[mid];
+    assert(probe);
+    assert(probe->m_index == mid);
+
+    const long long probe_time = probe->GetTime();
+
+    if (probe_time <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > 0);
+  assert(lo <= m_clusterCount);
+
+  // The entry just before the partition point is the answer.
+  const long found_idx = lo - 1;
+
+  Cluster* const found = m_clusters[found_idx];
+  assert(found);
+  assert(found->m_index == found_idx);
+  assert(found->GetTime() <= time_ns);
+
+  return found;
+}
+
+// Read-only accessors for the segment's top-level child objects. Each one
+// returns the stored pointer as-is.
+const Tracks* Segment::GetTracks() const {
+  return m_pTracks;
+}
+
+const SegmentInfo* Segment::GetInfo() const {
+  return m_pInfo;
+}
+
+const Cues* Segment::GetCues() const {
+  return m_pCues;
+}
+
+const Chapters* Segment::GetChapters() const {
+  return m_pChapters;
+}
+
+const Tags* Segment::GetTags() const {
+  return m_pTags;
+}
+
+const SeekHead* Segment::GetSeekHead() const {
+  return m_pSeekHead;
+}
+
+// Duration as reported by the segment's SegmentInfo, which must exist
+// (asserted).
+long long Segment::GetDuration() const {
+  const SegmentInfo* const info = m_pInfo;
+  assert(info);
+
+  return info->GetDuration();
+}
+
+// Records where the Chapters element and its payload live in the stream and
+// starts with an empty editions array. Nothing is parsed here; see Parse().
+Chapters::Chapters(Segment* pSegment, long long payload_start,
+                   long long payload_size, long long element_start,
+                   long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),  // payload range
+      m_size(payload_size),
+      m_element_start(element_start),  // whole-element range
+      m_element_size(element_size),
+      m_editions(NULL),  // no editions allocated yet
+      m_editions_size(0),
+      m_editions_count(0) {
+}
+
+// Clears every edition in use (last to first) before releasing the array;
+// Edition's own destructor is empty, so Clear() is what frees its atoms.
+Chapters::~Chapters() {
+  for (; m_editions_count > 0; --m_editions_count) {
+    Edition& edition = m_editions[m_editions_count - 1];
+    edition.Clear();
+  }
+
+  delete[] m_editions;
+}
+
+// Parses the Chapters payload, handing every EditionEntry child to
+// ParseEdition() and skipping all other children.
+//
+// Returns 0 on success, a negative status from the helpers on error, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the end of
+// the payload.
+long Chapters::Parse() {
+  IMkvReader* const reader = m_pSegment->m_pReader;
+
+  const long long payload_end = m_start + m_size;
+  long long cursor = m_start;
+
+  while (cursor < payload_end) {
+    long long child_id, child_size;
+
+    long status =
+        ParseElementHeader(reader, cursor, payload_end, child_id, child_size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (child_size == 0)  // empty child: nothing to parse or skip
+      continue;
+
+    if (child_id == libwebm::kMkvEditionEntry) {
+      status = ParseEdition(cursor, child_size);
+
+      if (status < 0)  // error
+        return status;
+    }
+
+    cursor += child_size;
+    if (cursor > payload_end)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (cursor != payload_end)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Number of editions parsed so far.
+int Chapters::GetEditionCount() const {
+  return m_editions_count;
+}
+
+// Returns the edition at idx, or NULL when idx is outside
+// [0, m_editions_count).
+const Chapters::Edition* Chapters::GetEdition(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_editions_count);
+
+  return in_range ? (m_editions + idx) : NULL;
+}
+
+// Makes sure m_editions has room for one more entry.
+//
+// When the array is full its capacity is doubled (or set to 1 when empty),
+// the existing editions are shallow-copied into the new array and the old
+// array is released. Returns false only when the allocation fails.
+bool Chapters::ExpandEditionsArray() {
+  const bool has_spare_slot = (m_editions_count < m_editions_size);
+  if (has_spare_slot)
+    return true;
+
+  int new_capacity = 1;
+  if (m_editions_size != 0)
+    new_capacity = 2 * m_editions_size;
+
+  Edition* const grown = new (std::nothrow) Edition[new_capacity];
+  if (grown == NULL)
+    return false;
+
+  // Ownership of each edition's atoms moves to the copy; the old entries are
+  // released without being cleared.
+  for (int i = 0; i < m_editions_count; ++i)
+    m_editions[i].ShallowCopy(grown[i]);
+
+  delete[] m_editions;
+
+  m_editions = grown;
+  m_editions_size = new_capacity;
+
+  return true;
+}
+
+// Appends a new, initialized edition and parses the EditionEntry payload at
+// [pos, pos + size) into it. Returns -1 when the array cannot grow, otherwise
+// the status of Edition::Parse. The edition stays counted even when parsing
+// fails.
+long Chapters::ParseEdition(long long pos, long long size) {
+  if (!ExpandEditionsArray())
+    return -1;
+
+  Edition& edition = m_editions[m_editions_count];
+  ++m_editions_count;
+
+  edition.Init();
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+  return edition.Parse(reader, pos, size);
+}
+
+// The constructor and destructor deliberately do nothing: members are set up
+// by Init() and released by Clear().
+Chapters::Edition::Edition() {
+}
+
+Chapters::Edition::~Edition() {
+}
+
+// Number of atoms parsed into this edition.
+int Chapters::Edition::GetAtomCount() const {
+  return m_atoms_count;
+}
+
+// Returns the atom at index, or NULL when index is outside
+// [0, m_atoms_count).
+const Chapters::Atom* Chapters::Edition::GetAtom(int index) const {
+  const bool in_range = (index >= 0) && (index < m_atoms_count);
+
+  return in_range ? (m_atoms + index) : NULL;
+}
+
+// Puts the edition into the empty state: no atom array, zero capacity, zero
+// atoms.
+void Chapters::Edition::Init() {
+  m_atoms_count = 0;
+  m_atoms_size = 0;
+  m_atoms = NULL;
+}
+
+// Copies this edition's pointer and counters into rhs. The atom array itself
+// is shared, not duplicated.
+void Chapters::Edition::ShallowCopy(Edition& rhs) const {
+  rhs.m_atoms_count = m_atoms_count;
+  rhs.m_atoms_size = m_atoms_size;
+  rhs.m_atoms = m_atoms;
+}
+
+// Clears every atom in use (last to first), releases the atom array and
+// returns the edition to the empty state.
+void Chapters::Edition::Clear() {
+  for (; m_atoms_count > 0; --m_atoms_count) {
+    Atom& atom = m_atoms[m_atoms_count - 1];
+    atom.Clear();
+  }
+
+  delete[] m_atoms;
+
+  m_atoms = NULL;
+  m_atoms_size = 0;
+}
+
+// Parses an EditionEntry payload spanning [pos, pos + size), handing every
+// ChapterAtom child to ParseAtom() and skipping all other children.
+//
+// Returns 0 on success, a negative status from the helpers on error, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the end of
+// the payload.
+long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long payload_end = pos + size;
+
+  while (pos < payload_end) {
+    long long child_id, child_size;
+
+    long status =
+        ParseElementHeader(pReader, pos, payload_end, child_id, child_size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (child_size == 0)  // empty child: nothing to parse or skip
+      continue;
+
+    if (child_id == libwebm::kMkvChapterAtom) {
+      status = ParseAtom(pReader, pos, child_size);
+
+      if (status < 0)  // error
+        return status;
+    }
+
+    pos += child_size;
+    if (pos > payload_end)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != payload_end)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Appends a new, initialized atom and parses the ChapterAtom payload at
+// [pos, pos + size) into it. Returns -1 when the array cannot grow, otherwise
+// the status of Atom::Parse. The atom stays counted even when parsing fails.
+long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  if (!ExpandAtomsArray())
+    return -1;
+
+  Atom& atom = m_atoms[m_atoms_count];
+  ++m_atoms_count;
+
+  atom.Init();
+
+  return atom.Parse(pReader, pos, size);
+}
+
+// Makes sure m_atoms has room for one more entry.
+//
+// When the array is full its capacity is doubled (or set to 1 when empty),
+// the existing atoms are shallow-copied into the new array and the old array
+// is released. Returns false only when the allocation fails.
+bool Chapters::Edition::ExpandAtomsArray() {
+  const bool has_spare_slot = (m_atoms_count < m_atoms_size);
+  if (has_spare_slot)
+    return true;
+
+  int new_capacity = 1;
+  if (m_atoms_size != 0)
+    new_capacity = 2 * m_atoms_size;
+
+  Atom* const grown = new (std::nothrow) Atom[new_capacity];
+  if (grown == NULL)
+    return false;
+
+  // Ownership of each atom's strings and displays moves to the copy; the old
+  // entries are released without being cleared.
+  for (int i = 0; i < m_atoms_count; ++i)
+    m_atoms[i].ShallowCopy(grown[i]);
+
+  delete[] m_atoms;
+
+  m_atoms = grown;
+  m_atoms_size = new_capacity;
+
+  return true;
+}
+
+// The constructor and destructor deliberately do nothing: members are set up
+// by Init() and released by Clear().
+Chapters::Atom::Atom() {
+}
+
+Chapters::Atom::~Atom() {
+}
+
+// Value of the ChapterUID child (0 until parsed).
+unsigned long long Chapters::Atom::GetUID() const {
+  return m_uid;
+}
+
+// Value of the ChapterStringUID child, or NULL when absent.
+const char* Chapters::Atom::GetStringUID() const {
+  return m_string_uid;
+}
+
+// Raw ChapterTimeStart value (-1 until parsed).
+long long Chapters::Atom::GetStartTimecode() const {
+  return m_start_timecode;
+}
+
+// Raw ChapterTimeEnd value (-1 until parsed).
+long long Chapters::Atom::GetStopTimecode() const {
+  return m_stop_timecode;
+}
+
+// Start timecode converted by GetTime() for the given Chapters object.
+long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const {
+  const long long timecode = m_start_timecode;
+  return GetTime(pChapters, timecode);
+}
+
+// Stop timecode converted by GetTime() for the given Chapters object.
+long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const {
+  const long long timecode = m_stop_timecode;
+  return GetTime(pChapters, timecode);
+}
+
+// Number of ChapterDisplay children parsed into this atom.
+int Chapters::Atom::GetDisplayCount() const {
+  return m_displays_count;
+}
+
+// Returns the display at index, or NULL when index is outside
+// [0, m_displays_count).
+const Chapters::Display* Chapters::Atom::GetDisplay(int index) const {
+  const bool in_range = (index >= 0) && (index < m_displays_count);
+
+  return in_range ? (m_displays + index) : NULL;
+}
+
+// Puts the atom into the empty state: no string UID, UID 0, both timecodes
+// unset (-1) and no displays.
+void Chapters::Atom::Init() {
+  m_uid = 0;
+  m_string_uid = NULL;
+
+  m_start_timecode = -1;  // -1 marks "not present"
+  m_stop_timecode = -1;
+
+  m_displays_count = 0;
+  m_displays_size = 0;
+  m_displays = NULL;
+}
+
+// Copies every member of this atom into rhs. The string UID buffer and the
+// display array are shared, not duplicated.
+void Chapters::Atom::ShallowCopy(Atom& rhs) const {
+  rhs.m_uid = m_uid;
+  rhs.m_string_uid = m_string_uid;
+
+  rhs.m_start_timecode = m_start_timecode;
+  rhs.m_stop_timecode = m_stop_timecode;
+
+  rhs.m_displays_count = m_displays_count;
+  rhs.m_displays_size = m_displays_size;
+  rhs.m_displays = m_displays;
+}
+
+void Chapters::Atom::Clear() {
+ delete[] m_string_uid;
+ m_string_uid = NULL;
+
+ while (m_displays_count > 0) {
+ Display& d = m_displays[--m_displays_count];
+ d.Clear();
+ }
+
+ delete[] m_displays;
+ m_displays = NULL;
+
+ m_displays_size = 0;
+}
+
+// Parses the child elements found in [pos, pos + size) and stores the
+// recognised ones (display, string UID, UID, start/end timecode) in this
+// atom. Unrecognised IDs are skipped.
+// Returns 0 on success, a negative status from the helper that failed, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the stop
+// position.
+long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    // NOTE(review): ParseElementHeader is expected to advance pos past the
+    // element header; the zero-length "continue" below relies on that.
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (payload_size == 0)
+      continue;  // empty payload, nothing to read
+
+    if (id == libwebm::kMkvChapterDisplay) {
+      status = ParseDisplay(pReader, pos, payload_size);
+      if (status < 0)
+        return status;
+    } else if (id == libwebm::kMkvChapterStringUID) {
+      status = UnserializeString(pReader, pos, payload_size, m_string_uid);
+      if (status < 0)
+        return status;
+    } else if (id == libwebm::kMkvChapterUID) {
+      long long uid;
+      status = UnserializeInt(pReader, pos, payload_size, uid);
+      if (status < 0)
+        return status;
+
+      m_uid = static_cast<unsigned long long>(uid);
+    } else if (id == libwebm::kMkvChapterTimeStart) {
+      const long long start = UnserializeUInt(pReader, pos, payload_size);
+      if (start < 0)  // negative value is an error code
+        return static_cast<long>(start);
+
+      m_start_timecode = start;
+    } else if (id == libwebm::kMkvChapterTimeEnd) {
+      const long long end = UnserializeUInt(pReader, pos, payload_size);
+      if (end < 0)  // negative value is an error code
+        return static_cast<long>(end);
+
+      m_stop_timecode = end;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Converts a chapter timecode into a time value by multiplying it with the
+// timecode scale of the segment that owns |pChapters|.
+//
+// Returns -1 when |pChapters|, its segment or the segment info is missing,
+// when the scale is < 1, when |timecode| is negative, or when the product
+// would not fit in a long long.
+long long Chapters::Atom::GetTime(const Chapters* pChapters,
+                                  long long timecode) {
+  if (pChapters == NULL)
+    return -1;
+
+  Segment* const pSegment = pChapters->m_pSegment;
+
+  if (pSegment == NULL)  // weird
+    return -1;
+
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+
+  if (pInfo == NULL)
+    return -1;
+
+  const long long timecode_scale = pInfo->GetTimeCodeScale();
+
+  if (timecode_scale < 1)  // weird
+    return -1;
+
+  if (timecode < 0)
+    return -1;
+
+  // Both operands are non-negative here and may originate from parsed input.
+  // Signed overflow is undefined behaviour, so reject products that do not
+  // fit instead of computing them.
+  if (timecode > LLONG_MAX / timecode_scale)
+    return -1;
+
+  return timecode_scale * timecode;
+}
+
+// Claims the next free Display slot of this atom and parses the element
+// payload at [pos, pos + size) into it.
+// Returns -1 if the display array cannot be grown, otherwise whatever
+// Display::Parse reports.
+long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  const bool has_room = ExpandDisplaysArray();
+  if (!has_room)
+    return -1;
+
+  // The slot is counted before parsing, so it is kept even if Parse() fails.
+  Display* const display = m_displays + m_displays_count;
+  ++m_displays_count;
+
+  display->Init();
+  return display->Parse(pReader, pos, size);
+}
+
+// Makes sure m_displays has at least one unused slot.
+// Capacity starts at 1 and doubles on every growth; existing entries are
+// transferred with ShallowCopy(). Returns false only when the allocation
+// fails, in which case no member is modified.
+bool Chapters::Atom::ExpandDisplaysArray() {
+  if (m_displays_count < m_displays_size)
+    return true;  // spare capacity already available
+
+  int new_capacity = 1;
+  if (m_displays_size != 0)
+    new_capacity = 2 * m_displays_size;
+
+  Display* const grown = new (std::nothrow) Display[new_capacity];
+  if (!grown)
+    return false;
+
+  int i = 0;
+  while (i < m_displays_count) {
+    m_displays[i].ShallowCopy(grown[i]);
+    ++i;
+  }
+
+  delete[] m_displays;
+  m_displays = grown;
+  m_displays_size = new_capacity;
+
+  return true;
+}
+
+// Intentionally empty: Init() assigns the members and Clear() frees them.
+Chapters::Display::Display() {}
+
+Chapters::Display::~Display() {}
+
+// Plain accessors; each returns the stored pointer, which is NULL until the
+// corresponding element has been parsed.
+const char* Chapters::Display::GetString() const {
+  return m_string;
+}
+
+const char* Chapters::Display::GetLanguage() const {
+  return m_language;
+}
+
+const char* Chapters::Display::GetCountry() const {
+  return m_country;
+}
+
+// Resets all three string pointers to NULL without freeing anything.
+void Chapters::Display::Init() {
+  m_country = NULL;
+  m_language = NULL;
+  m_string = NULL;
+}
+
+// Copies the three string pointers into |rhs|; the character data itself is
+// shared, not duplicated.
+void Chapters::Display::ShallowCopy(Display& rhs) const {
+  rhs.m_country = m_country;
+  rhs.m_language = m_language;
+  rhs.m_string = m_string;
+}
+
+// Frees the three strings and resets the pointers to NULL.
+void Chapters::Display::Clear() {
+  delete[] m_country;
+  m_country = NULL;
+
+  delete[] m_language;
+  m_language = NULL;
+
+  delete[] m_string;
+  m_string = NULL;
+}
+
+// Parses the child elements found in [pos, pos + size) and stores the
+// chapter string, language and country when present. Other IDs are skipped.
+// Returns 0 on success, the non-zero status of a failed helper, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the stop
+// position.
+long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (payload_size == 0)
+      continue;  // no payload to read
+
+    if (id == libwebm::kMkvChapString)
+      status = UnserializeString(pReader, pos, payload_size, m_string);
+    else if (id == libwebm::kMkvChapLanguage)
+      status = UnserializeString(pReader, pos, payload_size, m_language);
+    else if (id == libwebm::kMkvChapCountry)
+      status = UnserializeString(pReader, pos, payload_size, m_country);
+    else
+      status = 0;  // unrecognised element: nothing was read
+
+    if (status)
+      return status;
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Records the owning segment plus the payload and element extents. The tag
+// array starts out empty; Parse() fills it.
+Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size,
+           long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(payload_start),
+      m_size(payload_size),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_tags(NULL),
+      m_tags_size(0),
+      m_tags_count(0) {
+}
+
+// Clears every stored tag (last to first) and then frees the array.
+Tags::~Tags() {
+  for (; m_tags_count > 0; --m_tags_count)
+    m_tags[m_tags_count - 1].Clear();
+
+  delete[] m_tags;
+}
+
+// Walks the payload [m_start, m_start + m_size) and hands every kMkvTag
+// child to ParseTag(); other children are skipped.
+// Returns 0 on success, a negative status from a failed helper, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the payload
+// end.
+long Tags::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  const long long stop = m_start + m_size;  // payload stop
+  long long pos = m_start;                  // payload start
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (payload_size == 0)
+      continue;  // zero-length child, move on to the next header
+
+    if (id == libwebm::kMkvTag) {
+      status = ParseTag(pos, payload_size);
+      if (status < 0)
+        return status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Number of tags stored so far.
+int Tags::GetTagCount() const {
+  return m_tags_count;
+}
+
+// Returns the tag at |idx|, or NULL when |idx| is outside [0, m_tags_count).
+const Tags::Tag* Tags::GetTag(int idx) const {
+  const bool in_range = (idx >= 0) && (idx < m_tags_count);
+  if (!in_range)
+    return NULL;
+
+  return &m_tags[idx];
+}
+
+// Makes sure m_tags has at least one unused slot.
+// Capacity starts at 1 and doubles on every growth; existing entries are
+// transferred with ShallowCopy(). Returns false only when the allocation
+// fails, in which case no member is modified.
+bool Tags::ExpandTagsArray() {
+  if (m_tags_count < m_tags_size)
+    return true;  // spare capacity already available
+
+  int new_capacity = 1;
+  if (m_tags_size != 0)
+    new_capacity = 2 * m_tags_size;
+
+  Tag* const grown = new (std::nothrow) Tag[new_capacity];
+  if (!grown)
+    return false;
+
+  int i = 0;
+  while (i < m_tags_count) {
+    m_tags[i].ShallowCopy(grown[i]);
+    ++i;
+  }
+
+  delete[] m_tags;
+  m_tags = grown;
+  m_tags_size = new_capacity;
+
+  return true;
+}
+
+// Claims the next free Tag slot and parses the payload at [pos, pos + size)
+// into it, reading through the segment's reader.
+// Returns -1 if the tag array cannot be grown, otherwise whatever Tag::Parse
+// reports.
+long Tags::ParseTag(long long pos, long long size) {
+  const bool has_room = ExpandTagsArray();
+  if (!has_room)
+    return -1;
+
+  // The slot is counted before parsing, so it is kept even if Parse() fails.
+  Tag* const tag = m_tags + m_tags_count;
+  ++m_tags_count;
+
+  tag->Init();
+  return tag->Parse(m_pSegment->m_pReader, pos, size);
+}
+
+// Intentionally empty: Init() assigns the members and Clear() frees them.
+Tags::Tag::Tag() {}
+
+Tags::Tag::~Tag() {}
+
+// Number of simple tags stored so far.
+int Tags::Tag::GetSimpleTagCount() const {
+  return m_simple_tags_count;
+}
+
+// Returns the simple tag at |index|, or NULL when |index| is outside
+// [0, m_simple_tags_count).
+const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const {
+  const bool in_range = (index >= 0) && (index < m_simple_tags_count);
+  if (!in_range)
+    return NULL;
+
+  return &m_simple_tags[index];
+}
+
+// Puts the tag into its empty state: no array, zero capacity, zero count.
+void Tags::Tag::Init() {
+  m_simple_tags_count = 0;
+  m_simple_tags_size = 0;
+  m_simple_tags = NULL;
+}
+
+// Copies the array pointer, capacity and count into |rhs|; the array itself
+// is shared, not duplicated.
+void Tags::Tag::ShallowCopy(Tag& rhs) const {
+  rhs.m_simple_tags_count = m_simple_tags_count;
+  rhs.m_simple_tags_size = m_simple_tags_size;
+  rhs.m_simple_tags = m_simple_tags;
+}
+
+// Clears every simple tag (last to first), frees the array and resets the
+// capacity to zero.
+void Tags::Tag::Clear() {
+  for (; m_simple_tags_count > 0; --m_simple_tags_count)
+    m_simple_tags[m_simple_tags_count - 1].Clear();
+
+  delete[] m_simple_tags;
+  m_simple_tags = NULL;
+  m_simple_tags_size = 0;
+}
+
+// Walks the children in [pos, pos + size) and hands every kMkvSimpleTag
+// child to ParseSimpleTag(); other children are skipped.
+// Returns 0 on success, a negative status from a failed helper, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the stop
+// position.
+long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (payload_size == 0)
+      continue;  // zero-length child, move on to the next header
+
+    if (id == libwebm::kMkvSimpleTag) {
+      status = ParseSimpleTag(pReader, pos, payload_size);
+      if (status < 0)
+        return status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Claims the next free SimpleTag slot and parses the payload at
+// [pos, pos + size) into it.
+// Returns -1 if the array cannot be grown, otherwise whatever
+// SimpleTag::Parse reports.
+long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos,
+                               long long size) {
+  const bool has_room = ExpandSimpleTagsArray();
+  if (!has_room)
+    return -1;
+
+  // The slot is counted before parsing, so it is kept even if Parse() fails.
+  SimpleTag* const simple_tag = m_simple_tags + m_simple_tags_count;
+  ++m_simple_tags_count;
+
+  simple_tag->Init();
+  return simple_tag->Parse(pReader, pos, size);
+}
+
+// Makes sure m_simple_tags has at least one unused slot.
+// Capacity starts at 1 and doubles on every growth; existing entries are
+// transferred with ShallowCopy(). Returns false only when the allocation
+// fails, in which case no member is modified.
+bool Tags::Tag::ExpandSimpleTagsArray() {
+  if (m_simple_tags_count < m_simple_tags_size)
+    return true;  // spare capacity already available
+
+  int new_capacity = 1;
+  if (m_simple_tags_size != 0)
+    new_capacity = 2 * m_simple_tags_size;
+
+  SimpleTag* const grown = new (std::nothrow) SimpleTag[new_capacity];
+  if (!grown)
+    return false;
+
+  int i = 0;
+  while (i < m_simple_tags_count) {
+    m_simple_tags[i].ShallowCopy(grown[i]);
+    ++i;
+  }
+
+  delete[] m_simple_tags;
+  m_simple_tags = grown;
+  m_simple_tags_size = new_capacity;
+
+  return true;
+}
+
+// Intentionally empty: Init() assigns the members and Clear() frees them.
+Tags::SimpleTag::SimpleTag() {}
+
+Tags::SimpleTag::~SimpleTag() {}
+
+// Plain accessors; each returns the stored pointer, which is NULL until the
+// corresponding element has been parsed.
+const char* Tags::SimpleTag::GetTagName() const {
+  return m_tag_name;
+}
+
+const char* Tags::SimpleTag::GetTagString() const {
+  return m_tag_string;
+}
+
+// Resets both string pointers to NULL without freeing anything.
+void Tags::SimpleTag::Init() {
+  m_tag_string = NULL;
+  m_tag_name = NULL;
+}
+
+// Copies both string pointers into |rhs|; the character data is shared.
+void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const {
+  rhs.m_tag_string = m_tag_string;
+  rhs.m_tag_name = m_tag_name;
+}
+
+// Frees both strings and resets the pointers to NULL.
+void Tags::SimpleTag::Clear() {
+  delete[] m_tag_string;
+  m_tag_string = NULL;
+
+  delete[] m_tag_name;
+  m_tag_name = NULL;
+}
+
+// Parses the child elements found in [pos, pos + size) and stores the tag
+// name and tag string when present. Other IDs are skipped.
+// Returns 0 on success, the non-zero status of a failed helper, or
+// E_FILE_FORMAT_INVALID when the children do not end exactly at the stop
+// position.
+long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
+                            long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (payload_size == 0)  // weird
+      continue;
+
+    if (id == libwebm::kMkvTagName)
+      status = UnserializeString(pReader, pos, payload_size, m_tag_name);
+    else if (id == libwebm::kMkvTagString)
+      status = UnserializeString(pReader, pos, payload_size, m_tag_string);
+    else
+      status = 0;  // unrecognised element: nothing was read
+
+    if (status)
+      return status;
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Records the owning segment plus the payload and element extents. The three
+// UTF-8 strings start out NULL; Parse() fills them.
+SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_,
+                         long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_pMuxingAppAsUTF8(NULL),
+      m_pWritingAppAsUTF8(NULL),
+      m_pTitleAsUTF8(NULL) {
+}
+
+// Frees the three strings and resets their pointers to NULL.
+SegmentInfo::~SegmentInfo() {
+  delete[] m_pTitleAsUTF8;
+  m_pTitleAsUTF8 = NULL;
+
+  delete[] m_pWritingAppAsUTF8;
+  m_pWritingAppAsUTF8 = NULL;
+
+  delete[] m_pMuxingAppAsUTF8;
+  m_pMuxingAppAsUTF8 = NULL;
+}
+
+// Parses the payload [m_start, m_start + m_size): timecode scale, duration,
+// muxing app, writing app and title. Other children are skipped.
+// The timecode scale defaults to 1000000 and the duration to -1 when the
+// corresponding element is absent.
+// Returns 0 on success, the status of a failed helper, or
+// E_FILE_FORMAT_INVALID for a non-positive scale, a negative duration, a
+// duration * scale product above LLONG_MAX, or children that do not end
+// exactly at the payload end.
+long SegmentInfo::Parse() {
+  assert(m_pMuxingAppAsUTF8 == NULL);
+  assert(m_pWritingAppAsUTF8 == NULL);
+  assert(m_pTitleAsUTF8 == NULL);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  // Defaults used when the elements are not present.
+  m_timecodeScale = 1000000;
+  m_duration = -1;
+
+  const long long stop = m_start + m_size;
+  long long pos = m_start;
+
+  while (pos < stop) {
+    long long id;
+    long long payload_size;
+
+    const long header_status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (header_status < 0)
+      return header_status;
+
+    if (id == libwebm::kMkvTimecodeScale) {
+      m_timecodeScale = UnserializeUInt(pReader, pos, payload_size);
+
+      if (m_timecodeScale <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDuration) {
+      const long float_status =
+          UnserializeFloat(pReader, pos, payload_size, m_duration);
+      if (float_status < 0)
+        return float_status;
+
+      if (m_duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvMuxingApp) {
+      const long string_status =
+          UnserializeString(pReader, pos, payload_size, m_pMuxingAppAsUTF8);
+      if (string_status)
+        return string_status;
+    } else if (id == libwebm::kMkvWritingApp) {
+      const long string_status =
+          UnserializeString(pReader, pos, payload_size, m_pWritingAppAsUTF8);
+      if (string_status)
+        return string_status;
+    } else if (id == libwebm::kMkvTitle) {
+      const long string_status =
+          UnserializeString(pReader, pos, payload_size, m_pTitleAsUTF8);
+      if (string_status)
+        return string_status;
+    }
+
+    pos += payload_size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  // GetDuration() converts duration * scale to long long, so reject products
+  // that exceed LLONG_MAX.
+  const double rollover_check = m_duration * m_timecodeScale;
+  if (rollover_check > static_cast<double>(LLONG_MAX))
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos == stop)
+    return 0;
+
+  return E_FILE_FORMAT_INVALID;
+}
+
+// Returns the stored timecode scale.
+long long SegmentInfo::GetTimeCodeScale() const {
+  return m_timecodeScale;
+}
+
+// Returns the duration multiplied by the timecode scale, truncated to an
+// integer, or -1 when no non-negative duration is stored.
+long long SegmentInfo::GetDuration() const {
+  if (m_duration < 0)
+    return -1;
+
+  assert(m_timecodeScale >= 1);
+
+  const double scaled =
+      static_cast<double>(m_duration) * static_cast<double>(m_timecodeScale);
+
+  return static_cast<long long>(scaled);
+}
+
+// Plain accessors; each returns the stored pointer, which is NULL until the
+// corresponding element has been parsed.
+const char* SegmentInfo::GetMuxingAppAsUTF8() const {
+  return m_pMuxingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetWritingAppAsUTF8() const {
+  return m_pWritingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetTitleAsUTF8() const {
+  return m_pTitleAsUTF8;
+}
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+
+// Starts with algorithm 0 and no settings buffer.
+ContentEncoding::ContentCompression::ContentCompression()
+    : algo(0),
+      settings(NULL),
+      settings_len(0) {
+}
+
+// Frees the settings buffer, if any.
+ContentEncoding::ContentCompression::~ContentCompression() {
+  delete[] settings;
+}
+
+// Starts with all algorithm fields 0 and no key/signature buffers.
+ContentEncoding::ContentEncryption::ContentEncryption()
+    : algo(0),
+      key_id(NULL),
+      key_id_len(0),
+      signature(NULL),
+      signature_len(0),
+      sig_key_id(NULL),
+      sig_key_id_len(0),
+      sig_algo(0),
+      sig_hash_algo(0) {
+}
+
+// Frees the three binary buffers, if any.
+ContentEncoding::ContentEncryption::~ContentEncryption() {
+  delete[] sig_key_id;
+  delete[] signature;
+  delete[] key_id;
+}
+
+// Starts with empty compression/encryption arrays, order 0, scope 1 and
+// type 0.
+ContentEncoding::ContentEncoding()
+    : compression_entries_(NULL),
+      compression_entries_end_(NULL),
+      encryption_entries_(NULL),
+      encryption_entries_end_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(0) {
+}
+
+// Deletes every entry in [begin, end) of both pointer arrays and then the
+// arrays themselves.
+ContentEncoding::~ContentEncoding() {
+  for (ContentCompression** it = compression_entries_;
+       it != compression_entries_end_; ++it) {
+    delete *it;
+  }
+
+  delete[] compression_entries_;
+
+  for (ContentEncryption** it = encryption_entries_;
+       it != encryption_entries_end_; ++it) {
+    delete *it;
+  }
+
+  delete[] encryption_entries_;
+}
+
+// Returns the compression entry at |idx|, or NULL when |idx| is not below
+// the number of stored entries.
+const ContentEncoding::ContentCompression*
+ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+  const ptrdiff_t entries = compression_entries_end_ - compression_entries_;
+  assert(entries >= 0);
+
+  const unsigned long limit = static_cast<unsigned long>(entries);
+  if (idx < limit)
+    return compression_entries_[idx];
+
+  return NULL;
+}
+
+// Number of compression entries, computed from the begin/end pointers.
+unsigned long ContentEncoding::GetCompressionCount() const {
+  const ptrdiff_t entries = compression_entries_end_ - compression_entries_;
+  assert(entries >= 0);
+
+  return static_cast<unsigned long>(entries);
+}
+
+// Returns the encryption entry at |idx|, or NULL when |idx| is not below the
+// number of stored entries.
+const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex(
+    unsigned long idx) const {
+  const ptrdiff_t entries = encryption_entries_end_ - encryption_entries_;
+  assert(entries >= 0);
+
+  const unsigned long limit = static_cast<unsigned long>(entries);
+  if (idx < limit)
+    return encryption_entries_[idx];
+
+  return NULL;
+}
+
+// Number of encryption entries, computed from the begin/end pointers.
+unsigned long ContentEncoding::GetEncryptionCount() const {
+  const ptrdiff_t entries = encryption_entries_end_ - encryption_entries_;
+  assert(entries >= 0);
+
+  return static_cast<unsigned long>(entries);
+}
+
+// Parses the children in [start, start + size) and stores the cipher mode in
+// |aes|. Only cipher mode 1 is accepted; other children are skipped.
+// Returns 0 on success, a negative header-parsing status, or
+// E_FILE_FORMAT_INVALID for any other cipher mode or an overrunning child.
+long ContentEncoding::ParseContentEncAESSettingsEntry(
+    long long start, long long size, IMkvReader* pReader,
+    ContentEncAESSettings* aes) {
+  assert(pReader);
+  assert(aes);
+
+  const long long stop = start + size;
+
+  for (long long pos = start; pos < stop;) {
+    long long id;
+    long long payload_size;
+
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)
+      return status;
+
+    if (id == libwebm::kMkvAESSettingsCipherMode) {
+      aes->cipher_mode = UnserializeUInt(pReader, pos, payload_size);
+      if (aes->cipher_mode != 1)
+        return E_FILE_FORMAT_INVALID;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;
+}
+
+// Parses one content-encoding payload located at [start, start + size).
+//
+// Pass 1 counts the ContentCompression / ContentEncryption children so the
+// pointer arrays can be allocated at their final size. Pass 2 re-reads the
+// payload, stores order/scope/type and appends one parsed entry per
+// compression/encryption child.
+//
+// Returns 0 on success. Returns -1 when neither kind of child is present, an
+// allocation fails, or the scope is < 1; E_PARSE_FAILED when a child count
+// exceeds INT_MAX; E_FILE_FORMAT_INVALID when a child overruns the payload;
+// otherwise the status of the helper that failed. On failure, entries
+// appended so far stay owned by this object and are released by the
+// destructor.
+long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
+                                                IMkvReader* pReader) {
+  assert(pReader);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  // Count ContentCompression and ContentEncryption elements.
+  long long compression_count = 0;
+  long long encryption_count = 0;
+
+  while (pos < stop) {
+    long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentCompression) {
+      ++compression_count;
+      if (compression_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
+
+    if (id == libwebm::kMkvContentEncryption) {
+      ++encryption_count;
+      if (encryption_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (compression_count <= 0 && encryption_count <= 0)
+    return -1;
+
+  if (compression_count > 0) {
+    compression_entries_ = new (std::nothrow)
+        ContentCompression*[static_cast<size_t>(compression_count)];
+    if (!compression_entries_)
+      return -1;
+    compression_entries_end_ = compression_entries_;
+  }
+
+  if (encryption_count > 0) {
+    encryption_entries_ = new (std::nothrow)
+        ContentEncryption*[static_cast<size_t>(encryption_count)];
+    if (!encryption_entries_) {
+      delete[] compression_entries_;
+      compression_entries_ = NULL;
+      // Reset the end pointer together with the begin pointer. Otherwise the
+      // destructor would iterate from NULL up to the stale end address and
+      // dereference invalid memory.
+      compression_entries_end_ = NULL;
+      return -1;
+    }
+    encryption_entries_end_ = encryption_entries_;
+  }
+
+  pos = start;
+  while (pos < stop) {
+    long long id, size;
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentEncodingOrder) {
+      encoding_order_ = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvContentEncodingScope) {
+      encoding_scope_ = UnserializeUInt(pReader, pos, size);
+      if (encoding_scope_ < 1)
+        return -1;
+    } else if (id == libwebm::kMkvContentEncodingType) {
+      encoding_type_ = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvContentCompression) {
+      ContentCompression* const compression =
+          new (std::nothrow) ContentCompression();
+      if (!compression)
+        return -1;
+
+      status = ParseCompressionEntry(pos, size, pReader, compression);
+      if (status) {
+        delete compression;
+        return status;
+      }
+      assert(compression_count > 0);
+      // Ownership moves to the array; the destructor deletes the entry.
+      *compression_entries_end_++ = compression;
+    } else if (id == libwebm::kMkvContentEncryption) {
+      ContentEncryption* const encryption =
+          new (std::nothrow) ContentEncryption();
+      if (!encryption)
+        return -1;
+
+      status = ParseEncryptionEntry(pos, size, pReader, encryption);
+      if (status) {
+        delete encryption;
+        return status;
+      }
+      assert(encryption_count > 0);
+      // Ownership moves to the array; the destructor deletes the entry.
+      *encryption_entries_end_++ = encryption;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  return 0;
+}
+
+// Parses one ContentCompression payload located at [start, start + size)
+// into |compression|.
+//
+// The algorithm element is mandatory. The settings element is optional, may
+// appear at most once and must not be empty; its bytes are copied into a
+// newly allocated buffer owned by |compression|.
+//
+// Returns 0 on success, -1 on allocation failure, the reader's status when
+// reading the settings bytes fails, a negative header-parsing status, or
+// E_FILE_FORMAT_INVALID for malformed input.
+long ContentEncoding::ParseCompressionEntry(long long start, long long size,
+                                            IMkvReader* pReader,
+                                            ContentCompression* compression) {
+  assert(pReader);
+  assert(compression);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  bool valid = false;  // set once the mandatory algorithm element is seen
+
+  while (pos < stop) {
+    long long id, payload_size;
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentCompAlgo) {
+      long long algo = UnserializeUInt(pReader, pos, payload_size);
+      if (algo < 0)
+        return E_FILE_FORMAT_INVALID;
+      compression->algo = algo;
+      valid = true;
+    } else if (id == libwebm::kMkvContentCompSettings) {
+      if (payload_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      // There should be only one settings element per content compression.
+      // Checked before allocating so a duplicate costs no allocation or read.
+      if (compression->settings != NULL)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(payload_size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        // Propagate the reader's error. |status| is the (successful) header
+        // status here, so returning it would report a failed read as success.
+        return read_status;
+      }
+
+      compression->settings = buf;
+      compression->settings_len = buflen;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  // ContentCompAlgo is mandatory
+  if (!valid)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Parses one ContentEncryption payload located at [start, start + size) into
+// |encryption|.
+//
+// Only algorithm value 5 is accepted. Key ID, signature and signature key ID
+// are binary elements: a previously stored buffer is released first, an empty
+// element is rejected, and the bytes are copied into a newly allocated buffer
+// owned by |encryption|. AES settings are delegated to
+// ParseContentEncAESSettingsEntry(). Other children are skipped.
+//
+// Returns 0 on success, -1 on allocation failure, the reader's status when
+// reading a binary element fails, a negative header-parsing status, the
+// status of the AES settings parser, or E_FILE_FORMAT_INVALID for malformed
+// input.
+long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
+                                           IMkvReader* pReader,
+                                           ContentEncryption* encryption) {
+  assert(pReader);
+  assert(encryption);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  while (pos < stop) {
+    long long id, payload_size;
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentEncAlgo) {
+      encryption->algo = UnserializeUInt(pReader, pos, payload_size);
+      if (encryption->algo != 5)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvContentEncKeyID) {
+      delete[] encryption->key_id;
+      encryption->key_id = NULL;
+      encryption->key_id_len = 0;
+
+      if (payload_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(payload_size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        // Report the read failure itself; |status| is the successful header
+        // status and would make the caller believe parsing succeeded.
+        return read_status;
+      }
+
+      encryption->key_id = buf;
+      encryption->key_id_len = buflen;
+    } else if (id == libwebm::kMkvContentSignature) {
+      delete[] encryption->signature;
+      encryption->signature = NULL;
+      encryption->signature_len = 0;
+
+      if (payload_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(payload_size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return read_status;  // see key ID branch
+      }
+
+      encryption->signature = buf;
+      encryption->signature_len = buflen;
+    } else if (id == libwebm::kMkvContentSigKeyID) {
+      delete[] encryption->sig_key_id;
+      encryption->sig_key_id = NULL;
+      encryption->sig_key_id_len = 0;
+
+      if (payload_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(payload_size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return read_status;  // see key ID branch
+      }
+
+      encryption->sig_key_id = buf;
+      encryption->sig_key_id_len = buflen;
+    } else if (id == libwebm::kMkvContentSigAlgo) {
+      encryption->sig_algo = UnserializeUInt(pReader, pos, payload_size);
+    } else if (id == libwebm::kMkvContentSigHashAlgo) {
+      encryption->sig_hash_algo = UnserializeUInt(pReader, pos, payload_size);
+    } else if (id == libwebm::kMkvContentEncAESSettings) {
+      const long aes_status = ParseContentEncAESSettingsEntry(
+          pos, payload_size, pReader, &encryption->aes_settings);
+      if (aes_status)
+        return aes_status;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;
+}
+
+// Records the owning segment and the element extents; the content-encoding
+// array starts out empty.
+Track::Track(Segment* pSegment, long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_end_(NULL) {
+}
+
+// Clears the track info, deletes every content-encoding entry in
+// [begin, end) and then the pointer array itself.
+Track::~Track() {
+  // m_info is accessed through a const_cast so that it can be cleared here.
+  const_cast<Info&>(m_info).Clear();
+
+  for (ContentEncoding** it = content_encoding_entries_;
+       it != content_encoding_entries_end_; ++it) {
+    delete *it;
+  }
+
+  delete[] content_encoding_entries_;
+}
+
+// Allocates a Track, copies |info| into it and hands it back via |pResult|.
+// |pResult| must be NULL on entry. Returns 0 on success, -1 when |pResult|
+// is already set or the allocation fails, or the status of Info::Copy when
+// copying fails (the new track is deleted in that case).
+long Track::Create(Segment* pSegment, const Info& info, long long element_start,
+                   long long element_size, Track*& pResult) {
+  if (pResult != NULL)
+    return -1;
+
+  Track* const track =
+      new (std::nothrow) Track(pSegment, element_start, element_size);
+  if (!track)
+    return -1;  // generic error
+
+  const int copy_status = info.Copy(track->m_info);
+  if (copy_status != 0) {
+    delete track;
+    return copy_status;
+  }
+
+  pResult = track;
+  return 0;  // success
+}
+
+// Starts with zeroed numeric fields, NULL strings, no codec-private data and
+// lacing disabled.
+Track::Info::Info()
+    : uid(0),
+      defaultDuration(0),
+      codecDelay(0),
+      seekPreRoll(0),
+      nameAsUTF8(NULL),
+      language(NULL),
+      codecId(NULL),
+      codecNameAsUTF8(NULL),
+      codecPrivate(NULL),
+      codecPrivateSize(0),
+      lacing(false) {
+}
+
+// Releases everything Clear() releases.
+Track::Info::~Info() {
+  Clear();
+}
+
+// Frees the four strings and the codec-private buffer, resets the pointers
+// to NULL and the codec-private size to 0.
+void Track::Info::Clear() {
+  delete[] codecNameAsUTF8;
+  codecNameAsUTF8 = NULL;
+
+  delete[] codecPrivate;
+  codecPrivate = NULL;
+  codecPrivateSize = 0;
+
+  delete[] codecId;
+  codecId = NULL;
+
+  delete[] language;
+  language = NULL;
+
+  delete[] nameAsUTF8;
+  nameAsUTF8 = NULL;
+}
+
+// Duplicates the string member selected by |str| from this Info into |dst_|.
+// The destination member must still be NULL. A NULL source is not an error
+// and leaves the destination NULL.
+// Returns 0 on success, -1 for a null member pointer, an already-set
+// destination, or an allocation failure.
+int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
+  if (str == static_cast<char * Info::*>(NULL))
+    return -1;
+
+  char*& target = dst_.*str;
+  if (target != NULL)  // should be NULL already
+    return -1;
+
+  const char* const source = this->*str;
+  if (source == NULL)
+    return 0;
+
+  // Room for the characters plus the terminating NUL.
+  const size_t bytes = strlen(source) + 1;
+
+  target = SafeArrayAlloc<char>(1, bytes);
+  if (target == NULL)
+    return -1;
+
+  memcpy(target, source, bytes);
+  return 0;
+}
+
+// Copies this Info into |dst|. Scalar fields are assigned directly; the four
+// strings and the codec-private buffer are duplicated into new allocations.
+// Copying onto itself is a no-op that returns 0.
+// Because allocation can fail part-way, a non-zero return means |dst| may be
+// only partially filled.
+int Track::Info::Copy(Info& dst) const {
+  if (this == &dst)
+    return 0;
+
+  dst.type = type;
+  dst.number = number;
+  dst.uid = uid;
+  dst.defaultDuration = defaultDuration;
+  dst.codecDelay = codecDelay;
+  dst.seekPreRoll = seekPreRoll;
+  dst.lacing = lacing;
+  dst.settings = settings;
+
+  // String members, copied in this fixed order; stop at the first failure.
+  char* Info::*const string_members[] = {
+      &Info::nameAsUTF8,
+      &Info::language,
+      &Info::codecId,
+      &Info::codecNameAsUTF8,
+  };
+  const size_t member_count =
+      sizeof(string_members) / sizeof(string_members[0]);
+
+  for (size_t i = 0; i < member_count; ++i) {
+    const int status = CopyStr(string_members[i], dst);
+    if (status)
+      return status;
+  }
+
+  if (!(codecPrivateSize > 0))
+    return 0;  // no codec-private data to duplicate
+
+  // A positive size requires a source buffer and a pristine destination.
+  if (codecPrivate == NULL)
+    return -1;
+
+  if (dst.codecPrivate)
+    return -1;
+
+  if (dst.codecPrivateSize != 0)
+    return -1;
+
+  dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
+  if (dst.codecPrivate == NULL)
+    return -1;
+
+  memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
+  dst.codecPrivateSize = codecPrivateSize;
+
+  return 0;
+}
+
+// Returns the address of this track's m_eos member, which the block-iteration
+// functions hand out as their end-of-stream entry.
+const BlockEntry* Track::GetEOS() const {
+  return &m_eos;
+}
+
+// Plain accessors forwarding to the fields of m_info.
+long Track::GetType() const {
+  return m_info.type;
+}
+
+long Track::GetNumber() const {
+  return m_info.number;
+}
+
+unsigned long long Track::GetUid() const {
+  return m_info.uid;
+}
+
+const char* Track::GetNameAsUTF8() const {
+  return m_info.nameAsUTF8;
+}
+
+const char* Track::GetLanguage() const {
+  return m_info.language;
+}
+
+const char* Track::GetCodecNameAsUTF8() const {
+  return m_info.codecNameAsUTF8;
+}
+
+const char* Track::GetCodecId() const {
+  return m_info.codecId;
+}
+
+// Returns the codec-private buffer and writes its length to |size|.
+const unsigned char* Track::GetCodecPrivate(size_t& size) const {
+  const unsigned char* const data = m_info.codecPrivate;
+  size = m_info.codecPrivateSize;
+  return data;
+}
+
+bool Track::GetLacing() const {
+  return m_info.lacing;
+}
+
+unsigned long long Track::GetDefaultDuration() const {
+  return m_info.defaultDuration;
+}
+
+unsigned long long Track::GetCodecDelay() const {
+  return m_info.codecDelay;
+}
+
+unsigned long long Track::GetSeekPreRoll() const {
+  return m_info.seekPreRoll;
+}
+
+// Finds the first block entry that belongs to this track and passes
+// VetEntry().
+//
+// Returns 0 with |pBlockEntry| set to the match. Returns 1 with the EOS
+// entry when the clusters run out, when an EOS cluster is reached after
+// parsing is done, or when 100 non-empty clusters were scanned without a
+// match. Returns E_BUFFER_NOT_FULL with a null entry when an EOS cluster is
+// reached but parsing is not done. Negative cluster statuses are passed
+// through.
+long Track::GetFirst(const BlockEntry*& pBlockEntry) const {
+  const int kMaxClustersScanned = 100;
+  int clusters_scanned = 0;  // counts non-empty clusters only
+
+  const Cluster* pCluster = m_pSegment->GetFirst();
+
+  while (true) {
+    if (pCluster == NULL) {
+      pBlockEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
+      if (m_pSegment->DoneParsing()) {
+        pBlockEntry = GetEOS();
+        return 1;
+      }
+
+      pBlockEntry = 0;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long status = pCluster->GetFirst(pBlockEntry);
+    if (status < 0)
+      return status;
+
+    if (pBlockEntry == 0) {
+      // Empty cluster: move on without charging it against the scan limit.
+      pCluster = m_pSegment->GetNext(pCluster);
+      continue;
+    }
+
+    // Walk the entries of this cluster until one matches or they run out.
+    while (true) {
+      const Block* const pBlock = pBlockEntry->GetBlock();
+      assert(pBlock);
+
+      const long long track_number = pBlock->GetTrackNumber();
+      if ((track_number == m_info.number) && VetEntry(pBlockEntry))
+        return 0;
+
+      const BlockEntry* pNextEntry;
+      status = pCluster->GetNext(pBlockEntry, pNextEntry);
+      if (status < 0)
+        return status;
+
+      if (pNextEntry == 0)
+        break;
+
+      pBlockEntry = pNextEntry;
+    }
+
+    if (++clusters_scanned >= kMaxClustersScanned)
+      break;
+
+    pCluster = m_pSegment->GetNext(pCluster);
+  }
+
+  // No block with a matching track number was found within the scan limit.
+  // Hand back the EOS entry so the caller still gets a non-NULL value.
+  pBlockEntry = GetEOS();
+  return 1;
+}
+
+// Finds the block entry of this track that follows |pCurrEntry|.
+//
+// Returns 0 with |pNextEntry| set to the match. Returns 1 with the EOS entry
+// when the clusters run out, when an EOS cluster is reached after parsing is
+// done, or when 100 non-empty clusters were visited without a match. Returns
+// E_BUFFER_NOT_FULL with a NULL entry when an EOS cluster is reached but
+// parsing is not done. Returns -1 when |pCurrEntry| has no block or its block
+// belongs to another track. Negative cluster statuses are passed through.
+long Track::GetNext(const BlockEntry* pCurrEntry,
+                    const BlockEntry*& pNextEntry) const {
+  assert(pCurrEntry);
+  assert(!pCurrEntry->EOS());  //?
+
+  const Block* const pCurrBlock = pCurrEntry->GetBlock();
+  assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number);
+  if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number)
+    return -1;
+
+  const Cluster* pCluster = pCurrEntry->GetCluster();
+  assert(pCluster);
+  assert(!pCluster->EOS());
+
+  long status = pCluster->GetNext(pCurrEntry, pNextEntry);
+  if (status < 0)
+    return status;
+
+  const int kMaxClustersVisited = 100;
+  int clusters_visited = 0;  // counts non-empty clusters only
+
+  while (true) {
+    // Scan the remaining entries of the current cluster.
+    while (pNextEntry != NULL) {
+      const Block* const pNextBlock = pNextEntry->GetBlock();
+      assert(pNextBlock);
+
+      if (pNextBlock->GetTrackNumber() == m_info.number)
+        return 0;
+
+      pCurrEntry = pNextEntry;
+
+      status = pCluster->GetNext(pCurrEntry, pNextEntry);
+      if (status < 0)
+        return status;
+    }
+
+    pCluster = m_pSegment->GetNext(pCluster);
+
+    if (pCluster == NULL) {
+      pNextEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
+      if (m_pSegment->DoneParsing()) {
+        pNextEntry = GetEOS();
+        return 1;
+      }
+
+      // TODO: potential O(n^2) behaviour. The caller is told to (pre)load
+      // another cluster and then calls GetNext again, which repeats the same
+      // search. This only bites when a long run of clusters contains no
+      // block from this track. The caller could avoid it by not calling back
+      // until it has a cluster with a block from this track, but scanning
+      // each cluster as it is loaded is not cheap either.
+
+      pNextEntry = NULL;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    status = pCluster->GetFirst(pNextEntry);
+    if (status < 0)
+      return status;
+
+    if (pNextEntry == NULL)
+      continue;  // empty cluster, not charged against the limit
+
+    if (++clusters_visited >= kMaxClustersVisited)
+      break;
+  }
+
+  // No block with a matching track number was found after lots of searching,
+  // so give up and hand back the EOS entry (a non-NULL value).
+  pNextEntry = GetEOS();
+  return 1;
+}
+
+// Decides whether |pBlockEntry| is acceptable as a seek target.
+// This default implementation accepts every entry whose block exists and
+// carries this track's number. Per the original note, the VideoTrack class
+// overrides it because only video keyframes can be used as seek targets.
+bool Track::VetEntry(const BlockEntry* pBlockEntry) const {
+  assert(pBlockEntry);
+
+  const Block* const pBlock = pBlockEntry->GetBlock();
+  assert(pBlock);
+  assert(pBlock->GetTrackNumber() == m_info.number);
+
+  const bool belongs_to_track =
+      pBlock && (pBlock->GetTrackNumber() == m_info.number);
+
+  return belongs_to_track;
+}
+
+// Finds a block entry of this track to start from for |time_ns|.
+//
+// Starts from GetFirst(). If that fails, the status is returned; if it
+// yields EOS or an entry at or after |time_ns|, that entry is the result.
+// Otherwise the loaded clusters are binary-searched for the first cluster
+// whose time is greater than |time_ns|, and the clusters before it are
+// walked backwards until one supplies a non-EOS entry for this track.
+// Returns 0 in all non-error cases; |pResult| is the EOS entry when nothing
+// suitable was found.
+long Track::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
+  if (status < 0)  // buffer underflow, etc
+    return status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;
+
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not preloaded
+  assert(count > 0);
+
+  // Search window: [first, last), starting at the cluster of the first entry.
+  Cluster** const first = clusters + pCluster->GetIndex();
+  assert(first);
+  assert(*first == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const last = clusters + count;
+
+  Cluster** lo = first;
+  Cluster** hi = last;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //   [first, lo) <= time_ns
+    //   [lo, hi)    ?
+    //   [hi, last)  > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
+    assert(pCluster);
+    assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() ==
+           static_cast<long>(mid - m_pSegment->m_clusters));
+
+    const long long cluster_time = pCluster->GetTime();
+
+    if (cluster_time > time_ns)
+      hi = mid;
+    else
+      lo = mid + 1;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > first);
+  assert(lo <= last);
+
+  // Walk backwards over the clusters at or before time_ns.
+  while (lo > first) {
+    --lo;
+    pCluster = *lo;
+    assert(pCluster);
+    assert(pCluster->GetTime() <= time_ns);
+
+    pResult = pCluster->GetEntry(this);
+
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
+
+    // landed on empty cluster (no entries)
+  }
+
+  pResult = GetEOS();  // weird
+  return 0;
+}
+
+// Returns the ContentEncoding stored at position |idx|, or NULL when |idx|
+// is not smaller than the number of stored entries.
+const ContentEncoding* Track::GetContentEncodingByIndex(
+    unsigned long idx) const {
+  const ptrdiff_t num_entries =
+      content_encoding_entries_end_ - content_encoding_entries_;
+  assert(num_entries >= 0);
+
+  const bool in_range = idx < static_cast<unsigned long>(num_entries);
+  return in_range ? content_encoding_entries_[idx] : NULL;
+}
+
+// Returns how many ContentEncoding entries lie between the begin and end
+// pointers of the entry array.
+unsigned long Track::GetContentEncodingCount() const {
+  const ptrdiff_t num_entries =
+      content_encoding_entries_end_ - content_encoding_entries_;
+  assert(num_entries >= 0);
+
+  const unsigned long result = static_cast<unsigned long>(num_entries);
+  return result;
+}
+
+// Parses the payload [start, start + size) of a ContentEncodings element.
+//
+// The payload is walked twice: the first pass counts the ContentEncoding
+// children so the pointer array can be sized, the second pass parses each
+// child and appends it to the array.
+//
+// Returns 0 on success, -1 when no ContentEncoding child exists or an
+// allocation fails, and otherwise the negative status of the failing step.
+long Track::ParseContentEncodingsEntry(long long start, long long size) {
+  IMkvReader* const reader = m_pSegment->m_pReader;
+  assert(reader);
+
+  const long long stop = start + size;
+
+  // Pass 1: count the ContentEncoding children.
+  long long num_encodings = 0;
+  long long pos = start;
+
+  while (pos < stop) {
+    long long id, payload_size;
+    const long status =
+        ParseElementHeader(reader, pos, stop, id, payload_size);
+    if (status < 0)  // error
+      return status;
+
+    // pos now designates start of element payload
+    if (id == libwebm::kMkvContentEncoding) {
+      ++num_encodings;
+      if (num_encodings > INT_MAX)
+        return E_PARSE_FAILED;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (num_encodings <= 0)
+    return -1;
+
+  content_encoding_entries_ =
+      new (std::nothrow) ContentEncoding*[static_cast<size_t>(num_encodings)];
+  if (content_encoding_entries_ == NULL)
+    return -1;
+
+  // The end pointer advances as entries are appended, so an early return
+  // below leaves only fully parsed entries inside [begin, end).
+  content_encoding_entries_end_ = content_encoding_entries_;
+
+  // Pass 2: parse every ContentEncoding child.
+  pos = start;
+
+  while (pos < stop) {
+    long long id, payload_size;
+    long status = ParseElementHeader(reader, pos, stop, id, payload_size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentEncoding) {
+      ContentEncoding* const entry = new (std::nothrow) ContentEncoding();
+      if (entry == NULL)
+        return -1;
+
+      status = entry->ParseContentEncodingEntry(pos, payload_size, reader);
+      if (status != 0) {
+        delete entry;
+        return status;
+      }
+
+      *content_encoding_entries_end_ = entry;
+      ++content_encoding_entries_end_;
+    }
+
+    pos += payload_size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+// Builds the end-of-stream entry: it has no cluster (NULL) and uses
+// LONG_MIN as its index.
+Track::EOSBlock::EOSBlock()
+    : BlockEntry(NULL, LONG_MIN) {
+}
+
+// Identifies this entry as the end-of-stream kind.
+BlockEntry::Kind Track::EOSBlock::GetKind() const {
+  return kBlockEOS;
+}
+
+// The end-of-stream entry carries no block.
+const Block* Track::EOSBlock::GetBlock() const {
+  return NULL;
+}
+
+// Reads one chromaticity coordinate from |reader| and stores it in the x
+// (|is_x| true) or y (|is_x| false) member of |*chromaticity|.
+//
+// |*chromaticity| is allocated on first use when it is NULL, which lets the
+// caller invoke this once for the X element and once for the Y element of
+// the same primary. An object allocated here stays in |*chromaticity| even
+// when false is returned.
+//
+// Returns false when |reader| or |chromaticity| is NULL, when allocation
+// fails, when the float cannot be read, or when the value is NaN, lies
+// outside [0, 1], or is a positive value too small for a normalised float.
+bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
+                                long long value_size, bool is_x,
+                                PrimaryChromaticity** chromaticity) {
+  if (!reader || !chromaticity)
+    return false;
+
+  if (!*chromaticity) {
+    // Non-throwing new so that the failure check below is reachable.
+    *chromaticity = new (std::nothrow) PrimaryChromaticity();
+
+    if (!*chromaticity)
+      return false;
+  }
+
+  PrimaryChromaticity* pc = *chromaticity;
+  float* value = is_x ? &pc->x : &pc->y;
+
+  double parser_value = 0;
+  const long long parse_status =
+      UnserializeFloat(reader, read_pos, value_size, parser_value);
+
+  if (parse_status < 0)
+    return false;
+
+  // Valid range is [0, 1]. The test is written positively and then negated
+  // because every ordered comparison with NaN is false; a NaN would pass a
+  // "< 0.0 || > 1.0" style check unnoticed.
+  if (!(parser_value >= 0.0 && parser_value <= 1.0))
+    return false;
+
+  // Make sure the double is representable as a float before casting.
+  if (parser_value > 0.0 && parser_value < FLT_MIN)
+    return false;
+
+  *value = static_cast<float>(parser_value);
+
+  return true;
+}
+
+// Parses the payload [mm_start, mm_start + mm_size) of a MasteringMetadata
+// element into a newly allocated object that is handed over through |*mm|.
+//
+// Recognised children are LuminanceMax, LuminanceMin and the X/Y
+// chromaticity elements of the R, G, B primaries and the white point; any
+// other child makes the parse fail.
+//
+// Returns false, leaving |*mm| untouched, when |reader| is NULL, |*mm| is
+// already set, allocation fails, a child cannot be read, or a value is NaN
+// or out of range.
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+                              long long mm_size, MasteringMetadata** mm) {
+  if (!reader || *mm)
+    return false;
+
+  // Non-throwing new so that the failure check below is reachable.
+  std::unique_ptr<MasteringMetadata> mm_ptr(new (std::nothrow)
+                                                MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  const long long mm_end = mm_start + mm_size;
+  long long read_pos = mm_start;
+
+  while (read_pos < mm_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvLuminanceMax) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      // Check the read result before looking at the value.
+      if (value_parse_status < 0)
+        return false;
+      // The negated form also rejects NaN, for which every ordered
+      // comparison is false.
+      if (!(value >= -FLT_MAX && value <= FLT_MAX) ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_max = static_cast<float>(value);
+      if (mm_ptr->luminance_max < 0.0 || mm_ptr->luminance_max > 9999.99) {
+        return false;
+      }
+    } else if (child_id == libwebm::kMkvLuminanceMin) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value_parse_status < 0)
+        return false;
+      if (!(value >= -FLT_MAX && value <= FLT_MAX) ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_min = static_cast<float>(value);
+      if (mm_ptr->luminance_min < 0.0 || mm_ptr->luminance_min > 999.9999) {
+        return false;
+      }
+    } else {
+      // Everything else must be one coordinate of a chromaticity pair.
+      bool is_x = false;
+      PrimaryChromaticity** chromaticity = NULL;
+      switch (child_id) {
+        case libwebm::kMkvPrimaryRChromaticityX:
+        case libwebm::kMkvPrimaryRChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+          chromaticity = &mm_ptr->r;
+          break;
+        case libwebm::kMkvPrimaryGChromaticityX:
+        case libwebm::kMkvPrimaryGChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+          chromaticity = &mm_ptr->g;
+          break;
+        case libwebm::kMkvPrimaryBChromaticityX:
+        case libwebm::kMkvPrimaryBChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+          chromaticity = &mm_ptr->b;
+          break;
+        case libwebm::kMkvWhitePointChromaticityX:
+        case libwebm::kMkvWhitePointChromaticityY:
+          is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+          chromaticity = &mm_ptr->white_point;
+          break;
+        default:
+          return false;
+      }
+      const bool value_parse_status = PrimaryChromaticity::Parse(
+          reader, read_pos, child_size, is_x, chromaticity);
+      if (!value_parse_status)
+        return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > mm_end)
+      return false;
+  }
+
+  *mm = mm_ptr.release();
+  return true;
+}
+
+// Parses the payload [colour_start, colour_start + colour_size) of a Colour
+// element into a newly allocated object handed over through |*colour|.
+//
+// Every recognised child except MasteringMetadata is an unsigned integer
+// stored in the matching member; MasteringMetadata is delegated to
+// MasteringMetadata::Parse(). An unrecognised child makes the parse fail.
+//
+// Returns false, leaving |*colour| untouched, on any failure.
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+                   long long colour_size, Colour** colour) {
+  if (!reader || *colour)
+    return false;
+
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  const long long colour_end = colour_start + colour_size;
+  long long read_pos = colour_start;
+
+  while (read_pos < colour_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long header_status =
+        ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+    if (header_status < 0)
+      return false;
+
+    // Select the member that receives an integer child; it stays NULL for
+    // the MasteringMetadata child, which is handled separately below.
+    // NOTE(review): assumes all integer members are long long, matching the
+    // return type of UnserializeUInt - confirm against the Colour struct.
+    long long* uint_field = NULL;
+
+    switch (child_id) {
+      case libwebm::kMkvMatrixCoefficients:
+        uint_field = &colour_ptr->matrix_coefficients;
+        break;
+      case libwebm::kMkvBitsPerChannel:
+        uint_field = &colour_ptr->bits_per_channel;
+        break;
+      case libwebm::kMkvChromaSubsamplingHorz:
+        uint_field = &colour_ptr->chroma_subsampling_horz;
+        break;
+      case libwebm::kMkvChromaSubsamplingVert:
+        uint_field = &colour_ptr->chroma_subsampling_vert;
+        break;
+      case libwebm::kMkvCbSubsamplingHorz:
+        uint_field = &colour_ptr->cb_subsampling_horz;
+        break;
+      case libwebm::kMkvCbSubsamplingVert:
+        uint_field = &colour_ptr->cb_subsampling_vert;
+        break;
+      case libwebm::kMkvChromaSitingHorz:
+        uint_field = &colour_ptr->chroma_siting_horz;
+        break;
+      case libwebm::kMkvChromaSitingVert:
+        uint_field = &colour_ptr->chroma_siting_vert;
+        break;
+      case libwebm::kMkvRange:
+        uint_field = &colour_ptr->range;
+        break;
+      case libwebm::kMkvTransferCharacteristics:
+        uint_field = &colour_ptr->transfer_characteristics;
+        break;
+      case libwebm::kMkvPrimaries:
+        uint_field = &colour_ptr->primaries;
+        break;
+      case libwebm::kMkvMaxCLL:
+        uint_field = &colour_ptr->max_cll;
+        break;
+      case libwebm::kMkvMaxFALL:
+        uint_field = &colour_ptr->max_fall;
+        break;
+      case libwebm::kMkvMasteringMetadata:
+        break;
+      default:
+        return false;  // unknown child
+    }
+
+    if (uint_field != NULL) {
+      *uint_field = UnserializeUInt(reader, read_pos, child_size);
+      if (*uint_field < 0)
+        return false;
+    } else if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+                                         &colour_ptr->mastering_metadata)) {
+      return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > colour_end)
+      return false;
+  }
+
+  *colour = colour_ptr.release();
+  return true;
+}
+
+// Parses the payload [start, start + size) of a Projection element into a
+// newly allocated object handed over through |*projection|.
+//
+// ProjectionType is read as an unsigned integer, ProjectionPrivate is copied
+// into a freshly allocated byte buffer (at most once), and the yaw, pitch
+// and roll pose values are read as floats. Any other child fails the parse.
+//
+// Returns false, leaving |*projection| untouched, on any failure.
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+                       Projection** projection) {
+  if (!reader || *projection)
+    return false;
+
+  std::unique_ptr<Projection> parsed(new Projection());
+  if (!parsed.get())
+    return false;
+
+  const long long end = start + size;
+  long long read_pos = start;
+
+  while (read_pos < end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long header_status =
+        ParseElementHeader(reader, read_pos, end, child_id, child_size);
+    if (header_status < 0)
+      return false;
+
+    switch (child_id) {
+      case libwebm::kMkvProjectionType: {
+        const long long type_value =
+            UnserializeUInt(reader, read_pos, child_size);
+        if (type_value < 0)
+          return false;
+
+        parsed->type = static_cast<ProjectionType>(type_value);
+        break;
+      }
+
+      case libwebm::kMkvProjectionPrivate: {
+        // A second ProjectionPrivate child is rejected.
+        if (parsed->private_data != NULL)
+          return false;
+
+        unsigned char* const payload =
+            SafeArrayAlloc<unsigned char>(1, child_size);
+        if (payload == NULL)
+          return false;
+
+        const int read_status =
+            reader->Read(read_pos, static_cast<long>(child_size), payload);
+        if (read_status != 0) {
+          delete[] payload;
+          return false;
+        }
+
+        parsed->private_data = payload;
+        parsed->private_data_length = static_cast<size_t>(child_size);
+        break;
+      }
+
+      default: {
+        // Remaining children are read as a float first; the id is only
+        // examined once the value has been validated.
+        double value = 0;
+        const long long value_parse_status =
+            UnserializeFloat(reader, read_pos, child_size, value);
+        // Make sure value is representable as a float before casting.
+        if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+            (value > 0.0 && value < FLT_MIN)) {
+          return false;
+        }
+
+        if (child_id == libwebm::kMkvProjectionPoseYaw)
+          parsed->pose_yaw = static_cast<float>(value);
+        else if (child_id == libwebm::kMkvProjectionPosePitch)
+          parsed->pose_pitch = static_cast<float>(value);
+        else if (child_id == libwebm::kMkvProjectionPoseRoll)
+          parsed->pose_roll = static_cast<float>(value);
+        else
+          return false;  // unknown child
+        break;
+      }
+    }
+
+    read_pos += child_size;
+    if (read_pos > end)
+      return false;
+  }
+
+  *projection = parsed.release();
+  return true;
+}
+
+// Creates a video track with no colour-space string, Colour or Projection
+// attached; Parse() fills those in afterwards.
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
+      m_colour(NULL),
+      m_projection(NULL) {
+}
+
+// Frees the colour-space string (an array) and the Colour and Projection
+// objects held by this track.
+VideoTrack::~VideoTrack() {
+  delete[] m_colour_space;  // array allocation, hence delete[]
+
+  delete m_colour;
+
+  delete m_projection;
+}
+
+// Builds a VideoTrack from the Video settings referenced by |info|.
+//
+// |pResult| must be NULL on entry and |info.type| must be Track::kVideo,
+// otherwise -1 is returned. The settings payload is scanned for the pixel
+// and display dimensions, display unit, stereo mode, frame rate, Colour,
+// Projection and ColourSpace children; unrecognised children are skipped.
+//
+// Returns 0 and stores the new track in |pResult| on success, or a negative
+// status on failure (|pResult| is then left unchanged).
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult || info.type != Track::kVideo)
+    return -1;
+
+  long long width = 0;
+  long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+  double rate = 0.0;
+
+  // Owners for heap objects built while scanning, so that every early
+  // return releases them.
+  std::unique_ptr<char[]> colour_space_ptr;
+  std::unique_ptr<Colour> colour_ptr;
+  std::unique_ptr<Projection> projection_ptr;
+
+  IMkvReader* const reader = pSegment->m_pReader;
+
+  const Settings& settings = info.settings;
+  assert(settings.start >= 0);
+  assert(settings.size >= 0);
+
+  long long pos = settings.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + settings.size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long header_status = ParseElementHeader(reader, pos, stop, id, size);
+
+    if (header_status < 0)  // error
+      return header_status;
+
+    switch (id) {
+      case libwebm::kMkvPixelWidth:
+        width = UnserializeUInt(reader, pos, size);
+        if (width <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvPixelHeight:
+        height = UnserializeUInt(reader, pos, size);
+        if (height <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvDisplayWidth:
+        display_width = UnserializeUInt(reader, pos, size);
+        if (display_width <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvDisplayHeight:
+        display_height = UnserializeUInt(reader, pos, size);
+        if (display_height <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvDisplayUnit:
+        display_unit = UnserializeUInt(reader, pos, size);
+        if (display_unit < 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvStereoMode:
+        stereo_mode = UnserializeUInt(reader, pos, size);
+        if (stereo_mode < 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvFrameRate: {
+        const long rate_status = UnserializeFloat(reader, pos, size, rate);
+        if (rate_status < 0)
+          return rate_status;
+        if (rate <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+      }
+
+      case libwebm::kMkvColour: {
+        Colour* colour = NULL;
+        if (!Colour::Parse(reader, pos, size, &colour))
+          return E_FILE_FORMAT_INVALID;
+        colour_ptr.reset(colour);
+        break;
+      }
+
+      case libwebm::kMkvProjection: {
+        Projection* projection = NULL;
+        if (!Projection::Parse(reader, pos, size, &projection))
+          return E_FILE_FORMAT_INVALID;
+        projection_ptr.reset(projection);
+        break;
+      }
+
+      case libwebm::kMkvColourSpace: {
+        char* colour_space = NULL;
+        const long string_status =
+            UnserializeString(reader, pos, size, colour_space);
+        if (string_status < 0)
+          return string_status;
+        colour_space_ptr.reset(colour_space);
+        break;
+      }
+
+      default:
+        break;  // child not used here; skipped below
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  VideoTrack* const track =
+      new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
+
+  if (track == NULL)
+    return -1;  // generic error
+
+  const int copy_status = info.Copy(track->m_info);
+
+  if (copy_status != 0) {  // error
+    delete track;
+    return copy_status;
+  }
+
+  track->m_width = width;
+  track->m_height = height;
+  track->m_display_width = display_width;
+  track->m_display_height = display_height;
+  track->m_display_unit = display_unit;
+  track->m_stereo_mode = stereo_mode;
+  track->m_rate = rate;
+
+  // Ownership of the parsed sub-objects moves to the track.
+  track->m_colour = colour_ptr.release();
+  track->m_colour_space = colour_space_ptr.release();
+  track->m_projection = projection_ptr.release();
+
+  pResult = track;
+  return 0;  // success
+}
+
+// A video entry is a valid seek target only if it passes the base-class
+// check and its block is a keyframe.
+bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const {
+  if (!Track::VetEntry(pBlockEntry))
+    return false;
+
+  return pBlockEntry->GetBlock()->IsKey();
+}
+
+// Locates the video entry to start from when seeking to |time_ns|.
+//
+// Works like Track::Seek(), except that each candidate cluster is queried
+// with GetEntry(this, time_ns). On success 0 is returned and |pResult| is
+// set to the entry found, or to the EOS entry when no cluster yields one.
+// A negative value is the error reported by GetFirst().
+long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long first_status = GetFirst(pResult);
+
+  if (first_status < 0)  // buffer underflow, etc
+    return first_status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;
+
+  const Cluster* cluster = pResult->GetCluster();
+  assert(cluster);
+  assert(cluster->GetIndex() >= 0);
+
+  // The very first entry is already at or past the requested time.
+  if (pResult->GetBlock()->GetTime(cluster) >= time_ns)
+    return 0;
+
+  Cluster** const loaded = m_pSegment->m_clusters;
+  assert(loaded);
+
+  const long loaded_count = m_pSegment->GetCount();  // loaded, not pre-loaded
+  assert(loaded_count > 0);
+
+  Cluster** const begin = loaded + cluster->GetIndex();
+  assert(begin);
+  assert(*begin == cluster);
+  assert(cluster->GetTime() <= time_ns);
+
+  Cluster** const end = loaded + loaded_count;
+
+  Cluster** lower = begin;
+  Cluster** upper = end;
+
+  // Binary search for the first cluster whose time is greater than time_ns.
+  // Loop invariant:
+  //   [begin, lower)  time <= time_ns
+  //   [lower, upper)  not yet examined
+  //   [upper, end)    time >  time_ns
+  while (lower < upper) {
+    Cluster** const probe = lower + (upper - lower) / 2;
+    assert(probe < upper);
+
+    cluster = *probe;
+    assert(cluster);
+    assert(cluster->GetIndex() >= 0);
+    assert(cluster->GetIndex() == long(probe - m_pSegment->m_clusters));
+
+    if (cluster->GetTime() > time_ns)
+      upper = probe;
+    else
+      lower = probe + 1;
+
+    assert(lower <= upper);
+  }
+
+  assert(lower == upper);
+  assert(lower > begin);
+  assert(lower <= end);
+
+  // Step backwards one cluster at a time. The cluster just before the
+  // boundary is always examined; earlier ones only while nothing was found.
+  do {
+    --lower;
+    cluster = *lower;
+    assert(cluster);
+    assert(cluster->GetTime() <= time_ns);
+
+    pResult = cluster->GetEntry(this, time_ns);
+
+    if (pResult != NULL && !pResult->EOS())  // found a keyframe
+      return 0;
+  } while (lower != begin);
+
+  // weird: we're on the first cluster, but no keyframe found
+  // should never happen but we must return something anyway
+
+  pResult = GetEOS();
+  return 0;
+}
+
+// Colour description parsed for this track, or NULL if there was none.
+Colour* VideoTrack::GetColour() const {
+  return m_colour;
+}
+
+// Projection description parsed for this track, or NULL if there was none.
+Projection* VideoTrack::GetProjection() const {
+  return m_projection;
+}
+
+// Stored pixel width.
+long long VideoTrack::GetWidth() const {
+  return m_width;
+}
+
+// Stored pixel height.
+long long VideoTrack::GetHeight() const {
+  return m_height;
+}
+
+// Display width; falls back to the pixel width when no positive display
+// width is stored.
+long long VideoTrack::GetDisplayWidth() const {
+  if (m_display_width > 0)
+    return m_display_width;
+
+  return GetWidth();
+}
+
+// Display height; falls back to the pixel height when no positive display
+// height is stored.
+long long VideoTrack::GetDisplayHeight() const {
+  if (m_display_height > 0)
+    return m_display_height;
+
+  return GetHeight();
+}
+
+// Stored display unit value.
+long long VideoTrack::GetDisplayUnit() const {
+  return m_display_unit;
+}
+
+// Stored stereo mode value.
+long long VideoTrack::GetStereoMode() const {
+  return m_stereo_mode;
+}
+
+// Stored frame rate.
+double VideoTrack::GetFrameRate() const {
+  return m_rate;
+}
+
+// Forwards the segment and element extent to the Track base class; the
+// audio-specific members are assigned by Parse().
+AudioTrack::AudioTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size) {
+}
+
+// Builds an AudioTrack from the Audio settings referenced by |info|.
+//
+// |pResult| must be NULL on entry and |info.type| must be Track::kAudio,
+// otherwise -1 is returned. The settings payload is scanned for the
+// SamplingFrequency, Channels and BitDepth children; other children are
+// skipped. Absent values keep their initial values (8000.0 Hz, 1 channel,
+// bit depth 0).
+//
+// Returns 0 and stores the new track in |pResult| on success, or a negative
+// status on failure.
+long AudioTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       AudioTrack*& pResult) {
+  if (pResult || info.type != Track::kAudio)
+    return -1;
+
+  IMkvReader* const reader = pSegment->m_pReader;
+
+  const Settings& settings = info.settings;
+  assert(settings.start >= 0);
+  assert(settings.size >= 0);
+
+  long long pos = settings.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + settings.size;
+
+  double rate = 8000.0;  // MKV default
+  long long channels = 1;
+  long long bit_depth = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long header_status = ParseElementHeader(reader, pos, stop, id, size);
+
+    if (header_status < 0)  // error
+      return header_status;
+
+    switch (id) {
+      case libwebm::kMkvSamplingFrequency: {
+        const long rate_status = UnserializeFloat(reader, pos, size, rate);
+        if (rate_status < 0)
+          return rate_status;
+        if (rate <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+      }
+
+      case libwebm::kMkvChannels:
+        channels = UnserializeUInt(reader, pos, size);
+        if (channels <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      case libwebm::kMkvBitDepth:
+        bit_depth = UnserializeUInt(reader, pos, size);
+        if (bit_depth <= 0)
+          return E_FILE_FORMAT_INVALID;
+        break;
+
+      default:
+        break;  // child not used here; skipped below
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  AudioTrack* const track =
+      new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
+
+  if (track == NULL)
+    return -1;  // generic error
+
+  const int copy_status = info.Copy(track->m_info);
+
+  if (copy_status != 0) {
+    delete track;
+    return copy_status;
+  }
+
+  track->m_rate = rate;
+  track->m_channels = channels;
+  track->m_bitDepth = bit_depth;
+
+  pResult = track;
+  return 0;  // success
+}
+
+// Stored sampling rate.
+double AudioTrack::GetSamplingRate() const {
+  return m_rate;
+}
+
+// Stored channel count.
+long long AudioTrack::GetChannels() const {
+  return m_channels;
+}
+
+// Stored bit depth.
+long long AudioTrack::GetBitDepth() const {
+  return m_bitDepth;
+}
+
+// Records where the Tracks element lives (payload start/size and full
+// element start/size). The track array stays empty until Parse() runs.
+Tracks::Tracks(Segment* pSegment, long long start, long long size_,
+               long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_trackEntries(NULL),
+      m_trackEntriesEnd(NULL) {
+}
+
+// Parses the Tracks payload and fills the track array.
+//
+// The payload is walked twice: the first pass counts the TrackEntry
+// children so the array can be sized, the second pass parses each
+// TrackEntry and appends the resulting track.
+//
+// Returns 0 on success (including when there is no TrackEntry at all), -1
+// when the array cannot be allocated, and otherwise the negative status of
+// the failing step.
+long Tracks::Parse() {
+  assert(m_trackEntries == NULL);
+  assert(m_trackEntriesEnd == NULL);
+
+  IMkvReader* const reader = m_pSegment->m_pReader;
+  const long long stop = m_start + m_size;
+
+  // Pass 1: count the TrackEntry children.
+  long long entry_count = 0;
+  long long pos = m_start;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(reader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // weird: empty element, nothing to skip over
+      continue;
+
+    if (id == libwebm::kMkvTrackEntry) {
+      ++entry_count;
+      if (entry_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (entry_count <= 0)
+    return 0;  // success
+
+  m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(entry_count)];
+
+  if (m_trackEntries == NULL)
+    return -1;
+
+  m_trackEntriesEnd = m_trackEntries;
+
+  // Pass 2: parse every TrackEntry.
+  pos = m_start;
+
+  while (pos < stop) {
+    const long long element_start = pos;
+
+    long long id, payload_size;
+
+    const long header_status =
+        ParseElementHeader(reader, pos, stop, id, payload_size);
+
+    if (header_status < 0)  // error
+      return header_status;
+
+    if (payload_size == 0)  // weird
+      continue;
+
+    const long long payload_stop = pos + payload_size;
+    assert(payload_stop <= stop);  // checked in ParseElement
+
+    const long long element_size = payload_stop - element_start;
+
+    if (id == libwebm::kMkvTrackEntry) {
+      // Parse straight into the next free slot; the slot only becomes part
+      // of the array when a track was actually produced.
+      Track** const slot = m_trackEntriesEnd;
+      *slot = NULL;
+
+      const long entry_status = ParseTrackEntry(
+          pos, payload_size, element_start, element_size, *slot);
+      if (entry_status != 0)
+        return entry_status;
+
+      if (*slot != NULL)
+        ++m_trackEntriesEnd;
+    }
+
+    pos = payload_stop;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+// Returns the number of tracks currently held in the track array.
+unsigned long Tracks::GetTracksCount() const {
+  const ptrdiff_t num_tracks = m_trackEntriesEnd - m_trackEntries;
+  assert(num_tracks >= 0);
+
+  const unsigned long result = static_cast<unsigned long>(num_tracks);
+  return result;
+}
+
+// Parses one TrackEntry payload [track_start, track_start + track_size) and
+// creates the matching VideoTrack, AudioTrack or generic Track.
+//
+// |element_start| and |element_size| describe the whole TrackEntry element
+// and are forwarded to the created track. |pResult| must be NULL on entry.
+//
+// Returns 0 and stores the new track in |pResult| on success. Returns -1
+// when |pResult| is already set or a buffer cannot be allocated, and a
+// negative status (typically E_FILE_FORMAT_INVALID) when the entry is
+// malformed: missing or duplicate track number, missing type, out-of-range
+// values, or settings that do not match the track type.
+long Tracks::ParseTrackEntry(long long track_start, long long track_size,
+                             long long element_start, long long element_size,
+                             Track*& pResult) const {
+  if (pResult)
+    return -1;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = track_start;
+  const long long track_stop = track_start + track_size;
+
+  Track::Info info;
+
+  info.type = 0;
+  info.number = 0;
+  info.uid = 0;
+  info.defaultDuration = 0;
+
+  // Location of the Video, Audio and ContentEncodings children; a start of
+  // -1 means the child was not seen.
+  Track::Settings v;
+  v.start = -1;
+  v.size = -1;
+
+  Track::Settings a;
+  a.start = -1;
+  a.size = -1;
+
+  Track::Settings e;  // content_encodings_settings;
+  e.start = -1;
+  e.size = -1;
+
+  long long lacing = 1;  // default is true
+
+  while (pos < track_stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, track_stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long start = pos;
+
+    if (id == libwebm::kMkvVideo) {
+      v.start = start;
+      v.size = size;
+    } else if (id == libwebm::kMkvAudio) {
+      a.start = start;
+      a.size = size;
+    } else if (id == libwebm::kMkvContentEncodings) {
+      e.start = start;
+      e.size = size;
+    } else if (id == libwebm::kMkvTrackUID) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      info.uid = 0;
+
+      long long pos_ = start;
+      const long long pos_end = start + size;
+
+      // Assemble the UID one byte at a time, most significant byte first.
+      while (pos_ != pos_end) {
+        unsigned char b;
+
+        const int status = pReader->Read(pos_, 1, &b);
+
+        if (status)
+          return status;
+
+        info.uid <<= 8;
+        info.uid |= b;
+
+        ++pos_;
+      }
+    } else if (id == libwebm::kMkvTrackNumber) {
+      const long long num = UnserializeUInt(pReader, pos, size);
+
+      if ((num <= 0) || (num > 127))
+        return E_FILE_FORMAT_INVALID;
+
+      info.number = static_cast<long>(num);
+    } else if (id == libwebm::kMkvTrackType) {
+      const long long type = UnserializeUInt(pReader, pos, size);
+
+      if ((type <= 0) || (type > 254))
+        return E_FILE_FORMAT_INVALID;
+
+      info.type = static_cast<long>(type);
+    } else if (id == libwebm::kMkvName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.nameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvLanguage) {
+      const long status = UnserializeString(pReader, pos, size, info.language);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvDefaultDuration) {
+      const long long duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.defaultDuration = static_cast<unsigned long long>(duration);
+    } else if (id == libwebm::kMkvCodecID) {
+      const long status = UnserializeString(pReader, pos, size, info.codecId);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvFlagLacing) {
+      lacing = UnserializeUInt(pReader, pos, size);
+
+      if ((lacing < 0) || (lacing > 1))
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvCodecPrivate) {
+      // A repeated CodecPrivate child replaces the previous buffer.
+      delete[] info.codecPrivate;
+      info.codecPrivate = NULL;
+      info.codecPrivateSize = 0;
+
+      const size_t buflen = static_cast<size_t>(size);
+
+      if (buflen) {
+        unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+
+        if (buf == NULL)
+          return -1;
+
+        const int status = pReader->Read(pos, static_cast<long>(buflen), buf);
+
+        if (status) {
+          delete[] buf;
+          return status;
+        }
+
+        info.codecPrivate = buf;
+        info.codecPrivateSize = buflen;
+      }
+    } else if (id == libwebm::kMkvCodecName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvCodecDelay) {
+      // A negative result is an error code, not a delay; reject it instead
+      // of storing it as the field value.
+      const long long codec_delay = UnserializeUInt(pReader, pos, size);
+
+      if (codec_delay < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.codecDelay = codec_delay;
+    } else if (id == libwebm::kMkvSeekPreRoll) {
+      // Same validation as for CodecDelay above.
+      const long long seek_pre_roll = UnserializeUInt(pReader, pos, size);
+
+      if (seek_pre_roll < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.seekPreRoll = seek_pre_roll;
+    }
+
+    pos += size;  // consume payload
+    if (pos > track_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != track_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.number <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  // Track numbers must be unique among the tracks parsed so far.
+  if (GetTrackByNumber(info.number))
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.type <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  info.lacing = (lacing > 0) ? true : false;
+
+  if (info.type == Track::kVideo) {
+    // A video track needs Video settings and must not carry Audio settings.
+    if (v.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = v;
+
+    VideoTrack* pTrack = NULL;
+
+    const long status = VideoTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+
+    // NOTE(review): the result of ParseContentEncodingsEntry is discarded
+    // here; presumably content encodings are best-effort - confirm.
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else if (info.type == Track::kAudio) {
+    // An audio track needs Audio settings and must not carry Video settings.
+    if (a.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = a;
+
+    AudioTrack* pTrack = NULL;
+
+    const long status = AudioTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+
+    // NOTE(review): result discarded, as in the video branch - confirm.
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else {
+    // neither video nor audio - probably metadata or subtitles
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (info.type == Track::kMetadata && e.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings.start = -1;
+    info.settings.size = 0;
+
+    Track* pTrack = NULL;
+
+    const long status =
+        Track::Create(m_pSegment, info, element_start, element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+  }
+
+  return 0;  // success
+}
+
+// Deletes every track held in the array and then the array itself.
+Tracks::~Tracks() {
+  Track** const end = m_trackEntriesEnd;
+
+  for (Track** it = m_trackEntries; it != end; ++it)
+    delete *it;
+
+  delete[] m_trackEntries;
+}
+
+// Linear search for the track whose number equals |tn|.
+// Returns NULL when |tn| is negative or no such track is stored.
+const Track* Tracks::GetTrackByNumber(long tn) const {
+  if (tn < 0)
+    return NULL;
+
+  Track** const end = m_trackEntriesEnd;
+
+  for (Track** it = m_trackEntries; it != end; ++it) {
+    Track* const candidate = *it;
+
+    // NULL slots are skipped.
+    if (candidate != NULL && tn == candidate->GetNumber())
+      return candidate;
+  }
+
+  return NULL;  // not found
+}
+
+// Returns the track stored at position |idx| in the array, or NULL when
+// |idx| is not smaller than the number of stored tracks.
+const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
+  const ptrdiff_t num_tracks = m_trackEntriesEnd - m_trackEntries;
+
+  const bool in_range = idx < static_cast<unsigned long>(num_tracks);
+  return in_range ? m_trackEntries[idx] : NULL;
+}
+
+// Performs the initial, partial load of this cluster.
+//
+// Verifies that a Cluster element starts at m_pos, reads its size field, and
+// then walks the child elements until a Timecode has been read and the first
+// BlockGroup/SimpleBlock (or a following Cluster/Cues ID, or the end of the
+// cluster) has been reached. On success m_pos is set just past the Timecode
+// payload, m_timecode is set, and m_element_size is set when the cluster
+// size is known.
+//
+// |pos| is updated to the stream position being examined. Where the code
+// knows how many bytes it needs at |pos|, |len| is set to that count before
+// E_BUFFER_NOT_FULL is returned.
+//
+// Returns 0 on success (or when already loaded) and a negative status on
+// error or underflow.
+long Cluster::Load(long long& pos, long& len) const {
+  if (m_pSegment == NULL)
+    return E_PARSE_FAILED;
+
+  if (m_timecode >= 0)  // at least partially loaded
+    return 0;
+
+  // An unloaded cluster still points at its own element header and has no
+  // recorded size; anything else is an inconsistent state.
+  if (m_pos != m_element_start || m_element_size >= 0)
+    return E_PARSE_FAILED;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long total, avail;
+  const int status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && (avail > total || m_pos > total))
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_pos;
+
+  long long cluster_size = -1;  // remains -1 when the size is "unknown"
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error or underflow
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long id_ = ReadID(pReader, pos, len);
+
+  if (id_ < 0)  // error
+    return static_cast<long>(id_);
+
+  if (id_ != libwebm::kMkvCluster)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  // read cluster size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(pReader, pos, len);
+
+  // Propagate the status reported by ReadUInt. (cluster_size is still -1
+  // here, so returning it would discard the actual error code.)
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size of element
+
+  // All value bits set in a size field of |len| bytes means "unknown size".
+  const long long unknown_size = (1LL << (7 * len)) - 1;
+
+  if (size != unknown_size)
+    cluster_size = size;
+
+  // pos points to start of payload
+  long long timecode = -1;
+  long long new_pos = -1;
+  bool bBlock = false;
+
+  long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
+
+    // Parse ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
+
+    if (id == libwebm::kMkvCluster)
+      break;
+
+    if (id == libwebm::kMkvCues)
+      break;
+
+    pos += len;  // consume ID field
+
+    // Parse Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long child_size = ReadUInt(pReader, pos, len);
+
+    if (child_size < 0)  // error
+      return static_cast<long>(child_size);
+
+    const long long child_unknown_size = (1LL << (7 * len)) - 1;
+
+    // Children of a cluster must carry an explicit size.
+    if (child_size == child_unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size field
+
+    if ((cluster_stop >= 0) && (pos > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // pos now points to start of payload
+
+    if (child_size == 0)
+      continue;
+
+    if ((cluster_stop >= 0) && ((pos + child_size) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == libwebm::kMkvTimecode) {
+      len = static_cast<long>(child_size);
+
+      if ((pos + child_size) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      timecode = UnserializeUInt(pReader, pos, child_size);
+
+      if (timecode < 0)  // error (or underflow)
+        return static_cast<long>(timecode);
+
+      new_pos = pos + child_size;
+
+      if (bBlock)
+        break;
+    } else if (id == libwebm::kMkvBlockGroup) {
+      bBlock = true;
+      break;
+    } else if (id == libwebm::kMkvSimpleBlock) {
+      bBlock = true;
+      break;
+    }
+
+    pos += child_size;  // consume payload
+    if (cluster_stop >= 0 && pos > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (cluster_stop >= 0 && pos > cluster_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (timecode < 0)  // no timecode found
+    return E_FILE_FORMAT_INVALID;
+
+  if (!bBlock)
+    return E_FILE_FORMAT_INVALID;
+
+  m_pos = new_pos;  // designates position just beyond timecode payload
+  m_timecode = timecode;  // m_timecode >= 0 means we're partially loaded
+
+  if (cluster_size >= 0)
+    m_element_size = cluster_stop - m_element_start;
+
+  return 0;
+}
+
+// Advances incremental parsing of this cluster by at most one block entry.
+//
+// Calls Load() first, then resumes scanning child elements at m_pos. For the
+// first BlockGroup or SimpleBlock child found, the result of
+// ParseBlockGroup()/ParseSimpleBlock() is returned directly (0 on success).
+// Returns 1 when there is nothing further to parse, and a negative status on
+// error or when more data is needed (E_BUFFER_NOT_FULL, with |pos| and,
+// where known, |len| describing the bytes required).
+long Cluster::Parse(long long& pos, long& len) const {
+ long status = Load(pos, len);
+
+ if (status < 0)
+ return status;
+
+ if (m_pos < m_element_start || m_timecode < 0)
+ return E_PARSE_FAILED;
+
+ // -1 means the end of the cluster is not known yet.
+ const long long cluster_stop =
+ (m_element_size < 0) ? -1 : m_element_start + m_element_size;
+
+ if ((cluster_stop >= 0) && (m_pos >= cluster_stop))
+ return 1; // nothing else to do
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
+
+ pos = m_pos;
+
+ for (;;) {
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ break;
+
+ // Reaching the end of the stream terminates a cluster of unknown size;
+ // record the size that was discovered.
+ if ((total >= 0) && (pos >= total)) {
+ if (m_element_size < 0)
+ m_element_size = pos - m_element_start;
+
+ break;
+ }
+
+ // Parse ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // This is the distinguished set of ID's we use to determine
+ // that we have exhausted the sub-element's inside the cluster
+ // whose ID we parsed earlier.
+
+ if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) {
+ if (m_element_size < 0)
+ m_element_size = pos - m_element_start;
+
+ break;
+ }
+
+ pos += len; // consume ID field
+
+ // Parse Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ // A size field with all value bits set denotes "unknown size", which is
+ // rejected for children of a cluster.
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume size field
+
+ if ((cluster_stop >= 0) && (pos > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // pos now points to start of payload
+
+ if (size == 0)
+ continue;
+
+ // const long long block_start = pos;
+ const long long block_stop = pos + size;
+
+ if (cluster_stop >= 0) {
+ // A child overrunning a known cluster end is an error for block elements;
+ // for any other element, parsing simply stops at the cluster end.
+ if (block_stop > cluster_stop) {
+ if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos = cluster_stop;
+ break;
+ }
+ } else if ((total >= 0) && (block_stop > total)) {
+ // Unknown-size cluster whose last child runs past the end of the stream:
+ // treat the end of the stream as the end of the cluster.
+ m_element_size = total - m_element_start;
+ pos = total;
+ break;
+ } else if (block_stop > avail) {
+ len = static_cast<long>(size);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ // Parse() is const, but the block parsers below are non-const members.
+ Cluster* const this_ = const_cast<Cluster*>(this);
+
+ if (id == libwebm::kMkvBlockGroup)
+ return this_->ParseBlockGroup(size, pos, len);
+
+ if (id == libwebm::kMkvSimpleBlock)
+ return this_->ParseSimpleBlock(size, pos, len);
+
+ pos += size; // consume payload
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (m_element_size < 1)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pos = pos;
+ if (cluster_stop >= 0 && m_pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // Sanity-check the last parsed entry against the cluster and stream bounds.
+ if (m_entries_count > 0) {
+ const long idx = m_entries_count - 1;
+
+ const BlockEntry* const pLast = m_entries[idx];
+ if (pLast == NULL)
+ return E_PARSE_FAILED;
+
+ const Block* const pBlock = pLast->GetBlock();
+ if (pBlock == NULL)
+ return E_PARSE_FAILED;
+
+ const long long start = pBlock->m_start;
+
+ if ((total >= 0) && (start > total))
+ return E_PARSE_FAILED; // defend against truncated stream
+
+ const long long size = pBlock->m_size;
+
+ const long long stop = start + size;
+ if (cluster_stop >= 0 && stop > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && (stop > total))
+ return E_PARSE_FAILED; // defend against truncated stream
+ }
+
+ return 1; // no more entries
+}
+
+// Validates the header of a SimpleBlock whose payload of |block_size| bytes
+// starts at |pos| (track number, 2-byte timecode, flags byte), then creates
+// the entry via CreateBlock() and moves m_pos to the end of the block.
+//
+// Returns 0 on success, E_BUFFER_NOT_FULL when more data is required (|pos|
+// and |len| describe what was being read), or another negative status on
+// error.
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+ long& len) {
+ const long long block_start = pos;
+ const long long block_stop = pos + block_size;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ // parse track number
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long track = ReadUInt(pReader, pos, len);
+
+ if (track < 0) // error
+ return static_cast<long>(track);
+
+ if (track == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume track number
+
+ // The 2-byte timecode is only bounds-checked and skipped here.
+ if ((pos + 2) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 2) > avail) {
+ len = 2;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos += 2; // consume timecode
+
+ if ((pos + 1) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ unsigned char flags;
+
+ status = pReader->Read(pos, 1, &flags);
+
+ if (status < 0) { // error or underflow
+ len = 1;
+ return status;
+ }
+
+ ++pos; // consume flags byte
+ assert(pos <= avail);
+
+ // A block must contain at least one byte after the flags.
+ if (pos >= block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // Bits 1-2 of the flags byte select the lacing mode (0 = no lacing).
+ const int lacing = int(flags & 0x06) >> 1;
+
+ // A laced block is only accepted once its whole payload is available.
+ if ((lacing != 0) && (block_stop > avail)) {
+ len = static_cast<long>(block_stop - pos);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size,
+ 0); // DiscardPadding
+
+ if (status != 0)
+ return status;
+
+ m_pos = block_stop;
+
+ return 0; // success
+}
+
+// Validates a BlockGroup whose payload of |payload_size| bytes starts at
+// |pos|: walks every child element, reads DiscardPadding if present, and
+// checks the header of each Block child (track number, 2-byte timecode,
+// flags byte). On success the entry is created via CreateBlock() and m_pos
+// is moved to the end of the group.
+//
+// Returns 0 on success, E_BUFFER_NOT_FULL when more data is required (|pos|
+// and |len| describe what was being read), or another negative status on
+// error.
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+ long& len) {
+ const long long payload_start = pos;
+ const long long payload_stop = pos + payload_size;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ if ((total >= 0) && (payload_stop > total))
+ return E_FILE_FORMAT_INVALID;
+
+ // The whole group payload must be available before it is examined.
+ if (payload_stop > avail) {
+ len = static_cast<long>(payload_size);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long discard_padding = 0;
+
+ while (pos < payload_stop) {
+ // parse sub-block element ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id == 0) // not a valid ID
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID field
+
+ // Parse Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field
+
+ // pos now points to start of sub-block group payload
+
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (size == 0) // weird
+ continue;
+
+ // A size field with all value bits set denotes "unknown size", which is
+ // rejected inside a block group.
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvDiscardPadding) {
+ status = UnserializeInt(pReader, pos, size, discard_padding);
+
+ if (status < 0) // error
+ return status;
+ }
+
+ // Every child other than Block (including DiscardPadding, already read
+ // above) is skipped over here.
+ if (id != libwebm::kMkvBlock) {
+ pos += size; // consume sub-part of block group
+
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ continue;
+ }
+
+ const long long block_stop = pos + size;
+
+ if (block_stop > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // parse track number
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long track = ReadUInt(pReader, pos, len);
+
+ if (track < 0) // error
+ return static_cast<long>(track);
+
+ if (track == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume track number
+
+ // The 2-byte timecode is only bounds-checked and skipped here.
+ if ((pos + 2) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 2) > avail) {
+ len = 2;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos += 2; // consume timecode
+
+ if ((pos + 1) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ unsigned char flags;
+
+ status = pReader->Read(pos, 1, &flags);
+
+ if (status < 0) { // error or underflow
+ len = 1;
+ return status;
+ }
+
+ ++pos; // consume flags byte
+ assert(pos <= avail);
+
+ // A block must contain at least one byte after the flags.
+ if (pos >= block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // Bits 1-2 of the flags byte select the lacing mode (0 = no lacing).
+ const int lacing = int(flags & 0x06) >> 1;
+
+ if ((lacing != 0) && (block_stop > avail)) {
+ len = static_cast<long>(block_stop - pos);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos = block_stop; // consume block-part of block group
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // The children must tile the group payload exactly.
+ if (pos != payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size,
+ discard_padding);
+ if (status != 0)
+ return status;
+
+ m_pos = payload_stop;
+
+ return 0; // success
+}
+
+// Looks up an already-parsed entry by position without parsing further.
+//
+// Returns 1 and sets |pEntry| when entry |index| exists, 0 when the cluster
+// has been parsed to its end and has no such entry, -1 for a negative index,
+// and E_BUFFER_NOT_FULL when more of the cluster must be parsed first.
+// |pEntry| is NULL for every result other than 1.
+long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const {
+  assert(m_pos >= m_element_start);
+
+  pEntry = NULL;
+
+  if (index < 0)
+    return -1;  // generic error
+
+  if (m_entries_count < 0)  // nothing has been parsed yet
+    return E_BUFFER_NOT_FULL;
+
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count <= m_entries_size);
+
+  if (index >= m_entries_count) {
+    // The entry has not been parsed. It can only be declared absent when the
+    // cluster end is known and the parse position has reached it.
+    const bool end_known = (m_element_size >= 0);
+
+    if (end_known && (m_pos >= m_element_start + m_element_size))
+      return 0;  // nothing left to parse
+
+    return E_BUFFER_NOT_FULL;  // underflow
+  }
+
+  pEntry = m_entries[index];
+  assert(pEntry);
+
+  return 1;  // found entry
+}
+
+// Allocates an unparsed Cluster for the element located |off| bytes after
+// the start of the segment payload. Returns NULL when |pSegment| is NULL,
+// |off| is negative, or the allocation fails.
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+  if (pSegment == NULL || off < 0)
+    return NULL;
+
+  return new (std::nothrow) Cluster(pSegment, idx, pSegment->m_start + off);
+}
+
+// Default constructor. Leaves m_pSegment NULL, which is the state that
+// Cluster::EOS() reports as true, and marks the cluster as having no entries.
+Cluster::Cluster()
+ : m_pSegment(NULL),
+ m_element_start(0),
+ m_index(0),
+ m_pos(0),
+ m_element_size(0),
+ m_timecode(0),
+ m_entries(NULL),
+ m_entries_size(0),
+ m_entries_count(0) // means "no entries"
+{}
+
+// Constructs an unparsed cluster whose element starts at |element_start|.
+// The parse position starts at the element itself; the element size and the
+// timecode are set to -1 (not known yet) and the entry count to -1 (not
+// parsed yet). These are the values Load() requires before it will run.
+Cluster::Cluster(Segment* pSegment, long idx, long long element_start
+ /* long long element_size */)
+ : m_pSegment(pSegment),
+ m_element_start(element_start),
+ m_index(idx),
+ m_pos(element_start),
+ m_element_size(-1 /* element_size */),
+ m_timecode(-1),
+ m_entries(NULL),
+ m_entries_size(0),
+ m_entries_count(-1) // means "has not been parsed yet"
+{}
+
+// Destroys every parsed entry, then the entry table itself.
+Cluster::~Cluster() {
+  // A non-positive count means there are no entries to destroy; only the
+  // table (possibly NULL) is released.
+  for (long k = 0; k < m_entries_count; ++k) {
+    BlockEntry* const entry = m_entries[k];
+    assert(entry);
+
+    delete entry;
+  }
+
+  delete[] m_entries;
+}
+
+// True for a cluster that has no owning segment.
+bool Cluster::EOS() const {
+  return m_pSegment == NULL;
+}
+
+// Index value supplied when the cluster was constructed.
+long Cluster::GetIndex() const {
+  return m_index;
+}
+
+// Offset of this cluster's element relative to the start of the segment
+// payload. Dereferences m_pSegment, so it must not be NULL.
+long long Cluster::GetPosition() const {
+  const long long offset = m_element_start - m_pSegment->m_start;
+  assert(offset >= 0);
+
+  return offset;
+}
+
+// Size of the cluster element as currently recorded; negative while the
+// size has not been determined.
+long long Cluster::GetElementSize() const {
+  return m_element_size;
+}
+
+// Determines whether the cluster located |off| bytes into the payload of
+// |pSegment| contains at least one BlockGroup or SimpleBlock, without
+// creating any objects.
+//
+// Returns 1 when a block element is found, 0 when none is found (including
+// the cases below where the cluster cannot be complete within the stream),
+// E_BUFFER_NOT_FULL when more data is needed (|pos| and, where known, |len|
+// describe the bytes required), or another negative status on error.
+long Cluster::HasBlockEntries(
+ const Segment* pSegment,
+ long long off, // relative to start of segment payload
+ long long& pos, long& len) {
+ assert(pSegment);
+ assert(off >= 0); // relative to segment
+
+ IMkvReader* const pReader = pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ pos = pSegment->m_start + off; // absolute
+
+ if ((total >= 0) && (pos >= total))
+ return 0; // we don't even have a complete cluster
+
+ // -1 means the segment size is unknown.
+ const long long segment_stop =
+ (pSegment->m_size < 0) ? -1 : pSegment->m_start + pSegment->m_size;
+
+ long long cluster_stop = -1; // interpreted later to mean "unknown size"
+
+ // First stage: read and validate the Cluster element's own ID and size.
+ {
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return 0;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id != libwebm::kMkvCluster)
+ return E_PARSE_FAILED;
+
+ pos += len; // consume Cluster ID field
+
+ // read size field
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return 0;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ if (size == 0)
+ return 0; // cluster does not have entries
+
+ pos += len; // consume size field
+
+ // pos now points to start of payload
+
+ // A size field with all value bits set denotes "unknown size"; in that
+ // case cluster_stop stays -1.
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size != unknown_size) {
+ cluster_stop = pos + size;
+ assert(cluster_stop >= 0);
+
+ if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && (cluster_stop > total))
+ // return E_FILE_FORMAT_INVALID; //too conservative
+ return 0; // cluster does not have any entries
+ }
+ }
+
+ // Second stage: walk the children until a block element, a terminating
+ // Cluster/Cues ID, or the end of the cluster is reached.
+ for (;;) {
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ return 0; // no entries detected
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ // This is the distinguished set of ID's we use to determine
+ // that we have exhausted the sub-element's inside the cluster
+ // whose ID we parsed earlier.
+
+ if (id == libwebm::kMkvCluster)
+ return 0; // no entries found
+
+ if (id == libwebm::kMkvCues)
+ return 0; // no entries found
+
+ pos += len; // consume id field
+
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // read size field
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // underflow
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field
+
+ // pos now points to start of payload
+
+ if ((cluster_stop >= 0) && (pos > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if (size == 0) // weird
+ continue;
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID; // not supported inside cluster
+
+ if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvBlockGroup)
+ return 1; // have at least one entry
+
+ if (id == libwebm::kMkvSimpleBlock)
+ return 1; // have at least one entry
+
+ pos += size; // consume payload
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+}
+
+// Returns the cluster timecode, loading the cluster first if necessary.
+// When Load() fails, its negative status is returned instead.
+long long Cluster::GetTimeCode() const {
+  long long unused_pos;
+  long unused_len;
+
+  const long load_status = Load(unused_pos, unused_len);
+
+  return (load_status < 0) ? load_status : m_timecode;
+}
+
+// Returns the cluster timecode multiplied by the segment's timecode scale.
+// A negative result from GetTimeCode() is passed through unchanged.
+long long Cluster::GetTime() const {
+  const long long timecode = GetTimeCode();
+
+  if (timecode < 0)
+    return timecode;
+
+  const SegmentInfo* const info = m_pSegment->GetInfo();
+  assert(info);
+
+  const long long timecode_scale = info->GetTimeCodeScale();
+  assert(timecode_scale >= 1);
+
+  return m_timecode * timecode_scale;
+}
+
+// Returns the time of the first block entry in this cluster. Falls back to
+// GetTime() when the cluster has no entries, and returns the negative status
+// from GetFirst() on failure.
+long long Cluster::GetFirstTime() const {
+  const BlockEntry* first;
+
+  const long status = GetFirst(first);
+
+  if (status < 0)  // error
+    return status;
+
+  if (first == NULL)  // empty cluster
+    return GetTime();
+
+  const Block* const block = first->GetBlock();
+  assert(block);
+
+  return block->GetTime(this);
+}
+
+// Returns the time of the last block entry in this cluster. Falls back to
+// GetTime() when the cluster has no entries, and returns the negative status
+// from GetLast() on failure.
+long long Cluster::GetLastTime() const {
+  const BlockEntry* last;
+
+  const long status = GetLast(last);
+
+  if (status < 0)  // error
+    return status;
+
+  if (last == NULL)  // empty cluster
+    return GetTime();
+
+  const Block* const block = last->GetBlock();
+  assert(block);
+
+  return block->GetTime(this);
+}
+
+// Makes room for one more entry in the entry table and then creates it.
+//
+// |id| must be kMkvBlockGroup or kMkvSimpleBlock; |pos| and |size| describe
+// the element payload and |discard_padding| is forwarded for block groups.
+// The table is allocated with 1024 slots on first use and doubled whenever
+// it is full.
+//
+// Returns 0 on success, E_PARSE_FAILED for any other |id|, -1 when the table
+// cannot be allocated or grown, or the status of CreateBlockGroup() /
+// CreateSimpleBlock(). Members are only updated after an allocation has
+// succeeded, so a failed call leaves the cluster unchanged.
+long Cluster::CreateBlock(long long id,
+                          long long pos,  // absolute pos of payload
+                          long long size, long long discard_padding) {
+  if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock)
+    return E_PARSE_FAILED;
+
+  if (m_entries_count < 0) {  // haven't parsed anything yet
+    assert(m_entries == NULL);
+    assert(m_entries_size == 0);
+
+    const long initial_size = 1024;
+
+    BlockEntry** const entries = new (std::nothrow) BlockEntry*[initial_size];
+    if (entries == NULL)
+      return -1;
+
+    m_entries = entries;
+    m_entries_size = initial_size;
+    m_entries_count = 0;
+  } else {
+    assert(m_entries);
+    assert(m_entries_size > 0);
+    assert(m_entries_count <= m_entries_size);
+
+    if (m_entries_count >= m_entries_size) {
+      // Doubling must not overflow the signed table size.
+      if (m_entries_size > LONG_MAX / 2)
+        return -1;
+
+      const long entries_size = 2 * m_entries_size;
+
+      BlockEntry** const entries =
+          new (std::nothrow) BlockEntry*[entries_size];
+      if (entries == NULL)
+        return -1;
+
+      BlockEntry** src = m_entries;
+      BlockEntry** const src_end = src + m_entries_count;
+
+      BlockEntry** dst = entries;
+
+      while (src != src_end)
+        *dst++ = *src++;
+
+      delete[] m_entries;
+
+      m_entries = entries;
+      m_entries_size = entries_size;
+    }
+  }
+
+  if (id == libwebm::kMkvBlockGroup)
+    return CreateBlockGroup(pos, size, discard_padding);
+  else
+    return CreateSimpleBlock(pos, size);
+}
+
+// Creates a BlockGroup entry from the group payload of |size| bytes starting
+// at |start_offset| and stores it in the next free slot of m_entries.
+//
+// Walks the children of the group, remembering the position and size of the
+// first Block, the BlockDuration, and the ReferenceBlock values, then builds
+// and parses the BlockGroup. The caller must already have ensured that a
+// free slot exists (see the asserts).
+//
+// Returns 0 on success, E_FILE_FORMAT_INVALID when the payload is malformed,
+// -1 on allocation failure or when a ReferenceBlock cannot be read, or the
+// status of BlockGroup::Parse().
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+                               long long discard_padding) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = start_offset;
+  const long long stop = start_offset + size;
+
+  // For WebM files, there is a bias towards previous reference times
+  // (in order to support alt-ref frames, which refer back to the previous
+  // keyframe). Normally a 0 value is not possible, but here we tentatively
+  // allow 0 as the value of a reference frame, with the interpretation
+  // that this is a "previous" reference time.
+
+  long long prev = 1;  // nonce
+  long long next = 0;  // nonce
+  long long duration = -1;  // really, this is unsigned
+
+  long long bpos = -1;
+  long long bsize = -1;
+
+  while (pos < stop) {
+    long len;
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Validate the size field with real checks rather than asserts, so that
+    // builds without assertions also reject a failed read or a size field
+    // that runs past the end of the group.
+    const long long child_size = ReadUInt(pReader, pos, len);
+    if (child_size < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume size
+
+    if (id == libwebm::kMkvBlock) {
+      if (bpos < 0) {  // only the first Block in the group is recorded
+        bpos = pos;
+        bsize = child_size;
+      }
+    } else if (id == libwebm::kMkvBlockDuration) {
+      if (child_size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      duration = UnserializeUInt(pReader, pos, child_size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvReferenceBlock) {
+      if (child_size > 8 || child_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+      const long size_ = static_cast<long>(child_size);
+
+      long long time;
+
+      const long status = UnserializeInt(pReader, pos, size_, time);
+      if (status != 0)
+        return -1;
+
+      if (time <= 0)  // see note above
+        prev = time;
+      else
+        next = time;
+    }
+
+    // pos <= stop holds here, so this comparison cannot overflow, unlike
+    // adding an arbitrary payload size to pos first.
+    if (child_size > (stop - pos))
+      return E_FILE_FORMAT_INVALID;
+
+    pos += child_size;  // consume payload
+  }
+  if (bpos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  assert(bsize >= 0);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow)
+      BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {  // success
+    ++m_entries_count;
+    return 0;
+  }
+
+  // Parsing failed: release the entry and leave the slot empty.
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
+// Creates a SimpleBlock entry for the payload of |sz| bytes starting at |st|
+// and stores it in the next free slot of m_entries. The caller must already
+// have ensured that a free slot exists (see the asserts).
+//
+// Returns 0 on success, -1 on allocation failure, or the status of
+// SimpleBlock::Parse(); on failure the slot is left NULL and the entry count
+// is unchanged.
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  const long index = m_entries_count;
+  BlockEntry*& slot = m_entries[index];
+
+  slot = new (std::nothrow) SimpleBlock(this, index, st, sz);
+
+  if (slot == NULL)
+    return -1;  // generic error
+
+  const long parse_status = static_cast<SimpleBlock*>(slot)->Parse();
+
+  if (parse_status != 0) {
+    delete slot;
+    slot = 0;
+
+    return parse_status;
+  }
+
+  ++m_entries_count;
+  return 0;
+}
+
+// Retrieves the first block entry of this cluster, parsing one step if no
+// entry is available yet.
+//
+// Returns 0 with |pFirst| set to the entry, 0 with |pFirst| NULL when the
+// cluster still has no entries after that step, or the negative status from
+// Parse() with |pFirst| NULL.
+long Cluster::GetFirst(const BlockEntry*& pFirst) const {
+  const bool need_parse = (m_entries_count <= 0);
+
+  if (need_parse) {
+    long long parse_pos;
+    long parse_len;
+
+    const long parse_status = Parse(parse_pos, parse_len);
+
+    if (parse_status < 0) {  // error
+      pFirst = NULL;
+      return parse_status;
+    }
+
+    if (m_entries_count <= 0) {  // empty cluster
+      pFirst = NULL;
+      return 0;
+    }
+  }
+
+  assert(m_entries);
+
+  pFirst = m_entries[0];
+  assert(pFirst);
+
+  return 0;  // success
+}
+
+// Parses the cluster to its end and retrieves its last block entry.
+//
+// Returns 0 with |pLast| set to the entry, 0 with |pLast| NULL when the
+// cluster has no entries, or the negative status from Parse() with |pLast|
+// NULL.
+long Cluster::GetLast(const BlockEntry*& pLast) const {
+  long parse_status;
+
+  // Parse() returns 0 for each new block and a positive value once nothing
+  // remains to be parsed.
+  do {
+    long long parse_pos;
+    long parse_len;
+
+    parse_status = Parse(parse_pos, parse_len);
+
+    if (parse_status < 0) {  // error
+      pLast = NULL;
+      return parse_status;
+    }
+  } while (parse_status == 0);
+
+  if (m_entries_count <= 0) {
+    pLast = NULL;
+    return 0;
+  }
+
+  assert(m_entries);
+
+  pLast = m_entries[m_entries_count - 1];
+  assert(pLast);
+
+  return 0;
+}
+
+// Retrieves the entry that follows |pCurr| in this cluster, parsing one more
+// step when |pCurr| is the last entry parsed so far.
+//
+// Returns 0 with |pNext| set to the entry, 0 with |pNext| NULL when no
+// further entries exist, or the negative status from Parse() with |pNext|
+// NULL.
+long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const {
+  assert(pCurr);
+  assert(m_entries);
+  assert(m_entries_count > 0);
+
+  const size_t curr_idx = pCurr->GetIndex();
+  assert(curr_idx < size_t(m_entries_count));
+  assert(m_entries[curr_idx] == pCurr);
+
+  const size_t next_idx = curr_idx + 1;
+
+  if (next_idx >= size_t(m_entries_count)) {
+    long long parse_pos;
+    long parse_len;
+
+    const long parse_status = Parse(parse_pos, parse_len);
+
+    if (parse_status != 0) {
+      // Negative: error. Positive: nothing remains to be parsed.
+      pNext = NULL;
+      return (parse_status < 0) ? parse_status : 0;
+    }
+
+    assert(m_entries);
+    assert(m_entries_count > 0);
+    assert(next_idx < size_t(m_entries_count));
+  }
+
+  pNext = m_entries[next_idx];
+  assert(pNext);
+
+  return 0;
+}
+
+// Number of entries parsed so far; the constructors in this file use -1 to
+// mean that the cluster has not been parsed yet.
+long Cluster::GetEntryCount() const {
+  return m_entries_count;
+}
+
+// Searches this cluster for a block entry belonging to |pTrack|.
+//
+// With a negative |time_ns| the first entry accepted by pTrack->VetEntry()
+// is returned. Otherwise the last accepted entry whose time does not exceed
+// |time_ns| is returned; the scan stops at the first entry of the track
+// whose time is greater than |time_ns|. When nothing qualifies,
+// pTrack->GetEOS() is returned. Returns 0 (null) if Parse() fails.
+const BlockEntry* Cluster::GetEntry(const Track* pTrack,
+                                    long long time_ns) const {
+  assert(pTrack);
+
+  if (m_pSegment == NULL)  // this is the special EOS cluster
+    return pTrack->GetEOS();
+
+  const BlockEntry* best = pTrack->GetEOS();
+
+  for (long i = 0;; ++i) {
+    if (i >= m_entries_count) {
+      // Entry i has not been parsed yet; try to parse one more block.
+      long long parse_pos;
+      long parse_len;
+
+      const long parse_status = Parse(parse_pos, parse_len);
+      assert(parse_status >= 0);
+
+      if (parse_status > 0)  // completely parsed, and no more entries
+        return best;
+
+      if (parse_status < 0)  // should never happen
+        return 0;
+
+      assert(m_entries);
+      assert(i < m_entries_count);
+    }
+
+    const BlockEntry* const candidate = m_entries[i];
+    assert(candidate);
+    assert(!candidate->EOS());
+
+    const Block* const block = candidate->GetBlock();
+    assert(block);
+
+    if (block->GetTrackNumber() != pTrack->GetNumber())
+      continue;
+
+    const bool vetted = pTrack->VetEntry(candidate);
+
+    if (vetted && time_ns < 0)  // just want first candidate block
+      return candidate;
+
+    if (time_ns >= 0) {
+      if (block->GetTime(this) > time_ns)
+        return best;
+
+      if (vetted)
+        best = candidate;  // have a candidate
+    }
+  }
+}
+
+// Locates the block entry referred to by cue point |cp| and track position
+// |tp|, parsing more of the cluster as needed.
+//
+// If tp.m_block is positive it is used as a 1-based entry index and that
+// entry is returned when its track number and timecode match. Otherwise, or
+// when that entry does not match, the entries are scanned in order for the
+// first one on track tp.m_track whose timecode equals the cue timecode.
+// Returns NULL when no such entry is found or when Parse() fails.
+const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
+ const CuePoint::TrackPosition& tp) const {
+ assert(m_pSegment);
+ const long long tc = cp.GetTimeCode();
+
+ if (tp.m_block > 0) {
+ const long block = static_cast<long>(tp.m_block);
+ const long index = block - 1;
+
+ // Parse until the requested entry exists or the cluster is exhausted.
+ while (index >= m_entries_count) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) // TODO: can this happen?
+ return NULL;
+
+ if (status > 0) // nothing remains to be parsed
+ return NULL;
+ }
+
+ const BlockEntry* const pEntry = m_entries[index];
+ assert(pEntry);
+ assert(!pEntry->EOS());
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ if ((pBlock->GetTrackNumber() == tp.m_track) &&
+ (pBlock->GetTimeCode(this) == tc)) {
+ return pEntry;
+ }
+ }
+
+ // Fall back to a linear scan from the first entry.
+ long index = 0;
+
+ for (;;) {
+ if (index >= m_entries_count) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) // TODO: can this happen?
+ return NULL;
+
+ if (status > 0) // nothing remains to be parsed
+ return NULL;
+
+ assert(m_entries);
+ assert(index < m_entries_count);
+ }
+
+ const BlockEntry* const pEntry = m_entries[index];
+ assert(pEntry);
+ assert(!pEntry->EOS());
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ if (pBlock->GetTrackNumber() != tp.m_track) {
+ ++index;
+ continue;
+ }
+
+ const long long tc_ = pBlock->GetTimeCode(this);
+
+ if (tc_ < tc) {
+ ++index;
+ continue;
+ }
+
+ // The first entry on this track at or past the cue timecode decides the
+ // result: a later timecode means there is no match.
+ if (tc_ > tc)
+ return NULL;
+
+ const Tracks* const pTracks = m_pSegment->GetTracks();
+ assert(pTracks);
+
+ const long tn = static_cast<long>(tp.m_track);
+ const Track* const pTrack = pTracks->GetTrackByNumber(tn);
+
+ if (pTrack == NULL)
+ return NULL;
+
+ const long long type = pTrack->GetType();
+
+ if (type == 2) // audio
+ return pEntry;
+
+ if (type != 1) // not video
+ return NULL;
+
+ // For video, only a key frame is accepted.
+ if (!pBlock->IsKey())
+ return NULL;
+
+ return pEntry;
+ }
+}
+
+// Records the owning cluster and this entry's index within it.
+BlockEntry::BlockEntry(Cluster* p, long idx)
+    : m_pCluster(p),
+      m_index(idx) {}
+
+BlockEntry::~BlockEntry() {}
+
+// Cluster passed to the constructor.
+const Cluster* BlockEntry::GetCluster() const {
+  return m_pCluster;
+}
+
+// Index passed to the constructor.
+long BlockEntry::GetIndex() const {
+  return m_index;
+}
+
+// Wraps a Block covering |size| bytes at |start|; the block is constructed
+// with a discard padding of 0.
+SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start,
+                         long long size)
+    : BlockEntry(pCluster, idx),
+      m_block(start, size, 0) {}
+
+// Parses the wrapped block against the owning cluster.
+long SimpleBlock::Parse() {
+  return m_block.Parse(m_pCluster);
+}
+
+BlockEntry::Kind SimpleBlock::GetKind() const {
+  return kBlockSimple;
+}
+
+const Block* SimpleBlock::GetBlock() const {
+  return &m_block;
+}
+
+// Constructs a block-group entry. |block_start|, |block_size| and
+// |discard_padding| are forwarded to the contained Block; |prev|, |next| and
+// |duration| are stored as given and exposed through GetPrevTimeCode(),
+// GetNextTimeCode() and GetDurationTimeCode().
+BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start,
+ long long block_size, long long prev, long long next,
+ long long duration, long long discard_padding)
+ : BlockEntry(pCluster, idx),
+ m_block(block_start, block_size, discard_padding),
+ m_prev(prev),
+ m_next(next),
+ m_duration(duration) {}
+
+// Parses the contained block and then sets its key flag from the stored
+// reference values. Returns the block's parse status if it is non-zero,
+// otherwise 0.
+long BlockGroup::Parse() {
+  const long parse_status = m_block.Parse(m_pCluster);
+
+  if (parse_status != 0)
+    return parse_status;
+
+  const bool is_key = (m_prev > 0) && (m_next <= 0);
+  m_block.SetKey(is_key);
+
+  return 0;
+}
+
+BlockEntry::Kind BlockGroup::GetKind() const {
+  return kBlockGroup;
+}
+
+// Address of the embedded block; valid for as long as this group exists.
+const Block* BlockGroup::GetBlock() const {
+  return &m_block;
+}
+
+// The three accessors below return the values given to the constructor.
+long long BlockGroup::GetPrevTimeCode() const {
+  return m_prev;
+}
+
+long long BlockGroup::GetNextTimeCode() const {
+  return m_next;
+}
+
+long long BlockGroup::GetDurationTimeCode() const {
+  return m_duration;
+}
+
+// Creates an unparsed block covering |size_| bytes at |start|. Track, flags
+// and frames are placeholders until Parse() fills them in.
+Block::Block(long long start, long long size_, long long discard_padding)
+    : m_start(start),
+      m_size(size_),
+      m_track(0),         // no track yet
+      m_timecode(-1),     // no timecode yet
+      m_flags(0),
+      m_frames(NULL),     // allocated by Parse()
+      m_frame_count(-1),  // unknown until Parse()
+      m_discard_padding(discard_padding) {
+}
+
+// Releases the frame array allocated by Parse(); deleting NULL is a no-op.
+Block::~Block() {
+  delete[] m_frames;
+}
+
+// Parses the block payload that spans [m_start, m_start + m_size).
+//
+// Consumed in order: the track number (read with ReadUInt), a 2-byte signed
+// timecode, one flags byte and, when flags bits 0x06 select a lacing mode, a
+// one-byte biased frame count followed by the per-frame size information.
+// On success m_frames holds m_frame_count entries whose [pos, pos + len)
+// ranges tile the rest of the payload exactly.
+//
+// Returns 0 on success, -1 when |pCluster| (or its segment) is NULL or the
+// frame array cannot be allocated, and E_FILE_FORMAT_INVALID when the payload
+// is truncated or its sizes are inconsistent.
+long Block::Parse(const Cluster* pCluster) {
+  if (pCluster == NULL)
+    return -1;
+
+  if (pCluster->m_pSegment == NULL)
+    return -1;
+
+  assert(m_start >= 0);
+  assert(m_size >= 0);
+  assert(m_track <= 0);
+  assert(m_frames == NULL);
+  assert(m_frame_count <= 0);
+
+  // The asserts above vanish under NDEBUG, so repeat the range checks at run
+  // time and make sure the end offset below cannot overflow.
+  if (m_start < 0 || m_size < 0 || m_size > LLONG_MAX - m_start)
+    return E_FILE_FORMAT_INVALID;
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  long len;
+
+  IMkvReader* const pReader = pCluster->m_pSegment->m_pReader;
+
+  m_track = ReadUInt(pReader, pos, len);
+
+  if (m_track <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume track number
+
+  if ((stop - pos) < 2)
+    return E_FILE_FORMAT_INVALID;
+
+  long status;
+  long long value;
+
+  status = UnserializeInt(pReader, pos, 2, value);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  if (value < SHRT_MIN)
+    return E_FILE_FORMAT_INVALID;
+
+  if (value > SHRT_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  m_timecode = static_cast<short>(value);
+
+  pos += 2;
+
+  if ((stop - pos) <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  status = pReader->Read(pos, 1, &m_flags);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  const int lacing = int(m_flags & 0x06) >> 1;
+
+  ++pos;  // consume flags byte
+
+  if (lacing == 0) {  // no lacing
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    m_frame_count = 1;
+    m_frames = new (std::nothrow) Frame[m_frame_count];
+    if (m_frames == NULL)
+      return -1;
+
+    Frame& f = m_frames[0];
+    f.pos = pos;
+
+    // The single frame is everything left in the payload.
+    const long long frame_size = stop - pos;
+
+    if (frame_size > LONG_MAX || frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    f.len = static_cast<long>(frame_size);
+
+    return 0;  // success
+  }
+
+  if (pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char biased_count;
+
+  status = pReader->Read(pos, 1, &biased_count);
+
+  if (status)
+    return E_FILE_FORMAT_INVALID;
+
+  ++pos;  // consume frame count
+  if (pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  // The stored count is biased by one, giving 1..256 frames.
+  m_frame_count = int(biased_count) + 1;
+
+  m_frames = new (std::nothrow) Frame[m_frame_count];
+  if (m_frames == NULL)
+    return -1;
+
+  if (lacing == 1) {  // Xiph
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    long long size = 0;
+    int frame_count = m_frame_count;
+
+    // Every frame but the last stores its size as a run of bytes that is
+    // summed until a byte below 255 terminates the run.
+    while (frame_count > 1) {
+      long frame_size = 0;
+
+      for (;;) {
+        unsigned char val;
+
+        if (pos >= stop)
+          return E_FILE_FORMAT_INVALID;
+
+        status = pReader->Read(pos, 1, &val);
+
+        if (status)
+          return E_FILE_FORMAT_INVALID;
+
+        ++pos;  // consume xiph size byte
+
+        // A long run of 0xFF bytes must not overflow the accumulator.
+        if (frame_size > LONG_MAX - val)
+          return E_FILE_FORMAT_INVALID;
+
+        frame_size += val;
+
+        if (val < 255)
+          break;
+      }
+
+      Frame& f = *pf++;
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = 0;  // patch later
+
+      if (frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      f.len = frame_size;
+
+      // Check if size + frame_size could overflow.
+      if (size > LLONG_MAX - frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      size += frame_size;  // contribution of this frame
+
+      --frame_count;
+    }
+
+    if (pf >= pf_end || pos > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    {
+      // The last frame has no stored size: it gets whatever remains.
+      Frame& f = *pf++;
+
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX || frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      f.len = static_cast<long>(frame_size);
+    }
+
+    // Second pass: now that all lengths are known, assign the offsets.
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      assert((pos + f.len) <= stop);
+
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
+  } else if (lacing == 2) {  // fixed-size lacing
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long total_size = stop - pos;
+
+    // All frames share one size, so the remainder must divide evenly.
+    if ((total_size % m_frame_count) != 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long frame_size = total_size / m_frame_count;
+
+    if (frame_size > LONG_MAX || frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    while (pf != pf_end) {
+      assert((pos + frame_size) <= stop);
+      if ((pos + frame_size) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& f = *pf++;
+
+      f.pos = pos;
+      f.len = static_cast<long>(frame_size);
+
+      pos += frame_size;
+    }
+
+    assert(pos == stop);
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+
+  } else {
+    assert(lacing == 3);  // EBML lacing
+
+    if (pos >= stop)
+      return E_FILE_FORMAT_INVALID;
+
+    long long size = 0;
+    int frame_count = m_frame_count;
+
+    // The first frame size is stored as a plain unsigned value.
+    long long frame_size = ReadUInt(pReader, pos, len);
+
+    if (frame_size <= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (frame_size > LONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume length of size of first frame
+
+    if ((pos + frame_size) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    Frame* pf = m_frames;
+    Frame* const pf_end = pf + m_frame_count;
+
+    {
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      curr.len = static_cast<long>(frame_size);
+      size += curr.len;  // contribution of this frame
+    }
+
+    --frame_count;
+
+    // Middle frames store a biased difference from the previous frame size.
+    while (frame_count > 1) {
+      if (pos >= stop)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      assert(pf < pf_end);
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& curr = *pf;
+
+      curr.pos = 0;  // patch later
+
+      const long long delta_size_ = ReadUInt(pReader, pos, len);
+
+      if (delta_size_ < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += len;  // consume length of (delta) size
+      if (pos > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      // Remove the bias, which depends on how many bytes encoded the value.
+      const long exp = 7 * len - 1;
+      const long long bias = (1LL << exp) - 1LL;
+      const long long delta_size = delta_size_ - bias;
+
+      frame_size += delta_size;
+
+      if (frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      if (frame_size > LONG_MAX)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+      // Check if size + curr.len could overflow.
+      if (size > LLONG_MAX - curr.len) {
+        return E_FILE_FORMAT_INVALID;
+      }
+      size += curr.len;  // contribution of this frame
+
+      --frame_count;
+    }
+
+    // parse last frame
+    if (frame_count > 0) {
+      if (pos > stop || pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      const Frame& prev = *pf++;
+      assert(prev.len == frame_size);
+      if (prev.len != frame_size)
+        return E_FILE_FORMAT_INVALID;
+
+      if (pf >= pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      Frame& curr = *pf++;
+      if (pf != pf_end)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.pos = 0;  // patch later
+
+      const long long total_size = stop - pos;
+
+      if (total_size < size)
+        return E_FILE_FORMAT_INVALID;
+
+      frame_size = total_size - size;
+
+      if (frame_size > LONG_MAX || frame_size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      curr.len = static_cast<long>(frame_size);
+    }
+
+    // Second pass: now that all lengths are known, assign the offsets.
+    pf = m_frames;
+    while (pf != pf_end) {
+      Frame& f = *pf++;
+      if ((pos + f.len) > stop)
+        return E_FILE_FORMAT_INVALID;
+
+      f.pos = pos;
+      pos += f.len;
+    }
+
+    if (pos != stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;  // success
+}
+
+// Returns the block timecode in unscaled units. With a NULL |pCluster| the
+// stored relative value is returned as is; otherwise it is added to the
+// cluster's timecode. Yields -1 when the cluster timecode is negative or the
+// sum would exceed LLONG_MAX.
+long long Block::GetTimeCode(const Cluster* pCluster) const {
+  if (pCluster == 0)
+    return m_timecode;
+
+  const long long cluster_tc = pCluster->GetTimeCode();
+  assert(cluster_tc >= 0);
+
+  if (cluster_tc < 0)
+    return -1;
+
+  // Check if cluster_tc + m_timecode would overflow.
+  if (LLONG_MAX - cluster_tc < m_timecode)
+    return -1;
+
+  return cluster_tc + m_timecode;  // unscaled timecode units
+}
+
+// Returns the block time obtained by multiplying GetTimeCode(pCluster) by the
+// segment's timecode scale. Returns -1 when |pCluster|, its segment or the
+// segment info is NULL, or when the multiplication would overflow.
+long long Block::GetTime(const Cluster* pCluster) const {
+  // The asserts document the expected state; the explicit checks keep release
+  // builds (where asserts are compiled out) from dereferencing NULL.
+  assert(pCluster);
+  if (pCluster == NULL)
+    return -1;
+
+  const long long tc = GetTimeCode(pCluster);
+
+  const Segment* const pSegment = pCluster->m_pSegment;
+  if (pSegment == NULL)
+    return -1;
+
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+  assert(pInfo);
+  if (pInfo == NULL)
+    return -1;
+
+  const long long scale = pInfo->GetTimeCodeScale();
+  assert(scale >= 1);
+
+  // Check if tc * scale could overflow.
+  if (tc != 0 && scale > LLONG_MAX / tc) {
+    return -1;
+  }
+  const long long ns = tc * scale;
+
+  return ns;
+}
+
+// Track number stored by Parse(); 0 until then.
+long long Block::GetTrackNumber() const {
+  return m_track;
+}
+
+// True when bit 7 of the flags byte is set.
+bool Block::IsKey() const {
+  const unsigned char key_bit = static_cast<unsigned char>(1 << 7);
+  return (m_flags & key_bit) != 0;
+}
+
+// Sets (bKey == true) or clears (bKey == false) bit 7 of the flags byte,
+// leaving the other seven bits untouched.
+void Block::SetKey(bool bKey) {
+  if (!bKey) {
+    m_flags &= 0x7F;
+    return;
+  }
+
+  m_flags |= static_cast<unsigned char>(1 << 7);
+}
+
+// True when bit 0x08 of the flags byte is set.
+bool Block::IsInvisible() const {
+  const int invisible_bit = int(m_flags & 0x08);
+  return invisible_bit != 0;
+}
+
+// Extracts the two lacing bits (mask 0x06) of the flags byte and maps the
+// resulting 0..3 onto the Lacing enumerators.
+Block::Lacing Block::GetLacing() const {
+  return static_cast<Lacing>(int(m_flags & 0x06) >> 1);
+}
+
+// Number of frames stored by Parse(); -1 until then.
+int Block::GetFrameCount() const {
+  return m_frame_count;
+}
+
+// Returns the frame at |idx|. The index and the frame contents are checked
+// with asserts only, so callers must pass a value in [0, GetFrameCount()).
+const Block::Frame& Block::GetFrame(int idx) const {
+  assert(idx >= 0);
+  assert(idx < m_frame_count);
+
+  const Frame& frame = m_frames[idx];
+
+  assert(frame.pos > 0);
+  assert(frame.len > 0);
+
+  return frame;
+}
+
+// Forwards this frame's (pos, len) and |buf| to pReader->Read() and returns
+// that call's status.
+long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const {
+  assert(pReader);
+  assert(buf);
+
+  return pReader->Read(pos, len, buf);
+}
+
+// Discard padding value supplied to the constructor.
+long long Block::GetDiscardPadding() const {
+  return m_discard_padding;
+}
+
+} // namespace mkvparser
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h
new file mode 100644
index 0000000000..848d01f03e
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h
@@ -0,0 +1,1147 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVPARSER_MKVPARSER_H_
+#define MKVPARSER_MKVPARSER_H_
+
+#include <cstddef>
+
+namespace mkvparser {
+
+const int E_PARSE_FAILED = -1;  // same value as the bare -1 Block::Parse returns for a NULL cluster or a failed allocation
+const int E_FILE_FORMAT_INVALID = -2;  // returned by Block::Parse when the payload is truncated or its sizes are inconsistent
+const int E_BUFFER_NOT_FULL = -3;  // NOTE(review): presumably means "more data is needed" -- confirm against the .cc
+// Abstract byte source; the parsing helpers and classes in this header read through it.
+class IMkvReader {
+ public:
+ virtual int Read(long long pos, long len, unsigned char* buf) = 0;  // Block::Parse treats any nonzero return as failure
+ virtual int Length(long long* total, long long* available) = 0;  // NOTE(review): presumably reports total/available byte counts -- confirm
+// Protected destructor: code outside the hierarchy cannot delete through an IMkvReader*.
+ protected:
+ virtual ~IMkvReader() {}
+};
+// Free helper declarations; most of them read a value through an IMkvReader at a given position.
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size);
+long long GetUIntLength(IMkvReader*, long long, long&);
+long long ReadUInt(IMkvReader*, long long, long&);  // returns the decoded value (Block::Parse treats negative as an error); the long& receives the number of bytes consumed
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
+long long UnserializeUInt(IMkvReader*, long long pos, long long size);
+// Block::Parse treats a nonzero return from UnserializeInt as failure; it reads its 2-byte timecode this way.
+long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
+long UnserializeInt(IMkvReader*, long long pos, long long size,
+ long long& result);
+
+long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
+
+long ParseElementHeader(IMkvReader* pReader,
+ long long& pos, // consume id and size fields
+ long long stop, // if you know size of element's parent
+ long long& id, long long& size);
+// NOTE(review): the two Match overloads presumably test for an element id and fetch its integer / binary payload -- confirm.
+bool Match(IMkvReader*, long long&, unsigned long, long long&);
+bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&);
+
+void GetVersion(int& major, int& minor, int& build, int& revision);  // results are delivered through the four reference parameters
+// Holds EBML header fields (versions, id/size length limits, doc type); NOTE(review): presumably filled in by Parse() -- confirm.
+struct EBMLHeader {
+ EBMLHeader();
+ ~EBMLHeader();
+ long long m_version;
+ long long m_readVersion;
+ long long m_maxIdLength;
+ long long m_maxSizeLength;
+ char* m_docType;  // raw pointer; NOTE(review): ownership presumably handled by ~EBMLHeader() -- confirm
+ long long m_docTypeVersion;
+ long long m_docTypeReadVersion;
+
+ long long Parse(IMkvReader*, long long&);  // NOTE(review): the long long& is presumably an in/out read position -- confirm
+ void Init();
+};
+
+class Segment;
+class Track;
+class Cluster;
+// A block payload of m_size bytes at offset m_start that Parse() splits into one or more frames.
+class Block {
+ Block(const Block&);  // private: copying is disallowed
+ Block& operator=(const Block&);
+
+ public:
+ const long long m_start;  // payload start offset given to the constructor
+ const long long m_size;  // payload size given to the constructor
+
+ Block(long long start, long long size, long long discard_padding);
+ ~Block();
+
+ long Parse(const Cluster*);  // 0 on success; -1 or E_FILE_FORMAT_INVALID on failure
+
+ long long GetTrackNumber() const;
+ long long GetTimeCode(const Cluster*) const; // absolute, but not scaled
+ long long GetTime(const Cluster*) const; // absolute, and scaled (ns)
+ bool IsKey() const;  // tests bit 7 (0x80) of the flags byte
+ void SetKey(bool);  // sets or clears that same bit
+ bool IsInvisible() const;  // tests bit 0x08 of the flags byte
+
+ enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml };
+ Lacing GetLacing() const;  // flags bits 0x06 shifted right by one
+
+ int GetFrameCount() const; // to index frames: [0, count)
+
+ struct Frame {
+ long long pos; // absolute offset
+ long len;  // frame length, passed to IMkvReader::Read as its len argument
+
+ long Read(IMkvReader*, unsigned char*) const;  // forwards (pos, len, buffer) to IMkvReader::Read and returns its status
+ };
+
+ const Frame& GetFrame(int frame_index) const;  // index is checked by assert only
+
+ long long GetDiscardPadding() const;
+
+ private:
+ long long m_track; // Track::Number()
+ short m_timecode; // relative to cluster
+ unsigned char m_flags;  // flags byte read by Parse(); see IsKey/IsInvisible/GetLacing
+
+ Frame* m_frames;  // array allocated with new[] in Parse(), released by ~Block()
+ int m_frame_count;  // -1 until Parse() sets it
+
+ protected:
+ const long long m_discard_padding;
+};
+// Abstract base for entries (simple block, block group, EOS marker) that remember a Cluster pointer and an index.
+class BlockEntry {
+ BlockEntry(const BlockEntry&);  // private: copying is disallowed
+ BlockEntry& operator=(const BlockEntry&);
+
+ protected:
+ BlockEntry(Cluster*, long index);
+
+ public:
+ virtual ~BlockEntry();
+
+ bool EOS() const { return (GetKind() == kBlockEOS); }
+ const Cluster* GetCluster() const;
+ long GetIndex() const;
+ virtual const Block* GetBlock() const = 0;  // SimpleBlock and BlockGroup return the address of their embedded m_block
+
+ enum Kind { kBlockEOS, kBlockSimple, kBlockGroup };
+ virtual Kind GetKind() const = 0;
+
+ protected:
+ Cluster* const m_pCluster;  // pointer given to the constructor; the pointer itself never changes
+ const long m_index;
+};
+// BlockEntry holding one Block that is constructed with a discard padding of 0.
+class SimpleBlock : public BlockEntry {
+ SimpleBlock(const SimpleBlock&);
+ SimpleBlock& operator=(const SimpleBlock&);
+
+ public:
+ SimpleBlock(Cluster*, long index, long long start, long long size);
+ long Parse();  // forwards to m_block.Parse(m_pCluster)
+
+ Kind GetKind() const;  // always kBlockSimple
+ const Block* GetBlock() const;
+
+ protected:
+ Block m_block;
+};
+// BlockEntry holding one Block plus the prev/next/duration timecodes supplied at construction.
+class BlockGroup : public BlockEntry {
+ BlockGroup(const BlockGroup&);
+ BlockGroup& operator=(const BlockGroup&);
+
+ public:
+ BlockGroup(Cluster*, long index,
+ long long block_start, // absolute pos of block's payload
+ long long block_size, // size of block's payload
+ long long prev, long long next, long long duration,
+ long long discard_padding);
+
+ long Parse();  // parses m_block, then marks it key when m_prev > 0 and m_next <= 0
+
+ Kind GetKind() const;  // always kBlockGroup
+ const Block* GetBlock() const;
+
+ long long GetPrevTimeCode() const; // relative to block's time
+ long long GetNextTimeCode() const; // as above
+ long long GetDurationTimeCode() const;
+
+ private:
+ Block m_block;
+ const long long m_prev;
+ const long long m_next;
+ const long long m_duration;
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+class ContentEncoding {
+ public:
+ enum { kCTR = 1 };  // default value of ContentEncAESSettings::cipher_mode
+
+ ContentEncoding();
+ ~ContentEncoding();
+
+ // ContentCompression element names
+ struct ContentCompression {
+ ContentCompression();
+ ~ContentCompression();
+
+ unsigned long long algo;
+ unsigned char* settings;  // NOTE(review): presumably a buffer of settings_len bytes freed by the destructor -- confirm
+ long long settings_len;
+ };
+
+ // ContentEncAESSettings element names
+ struct ContentEncAESSettings {
+ ContentEncAESSettings() : cipher_mode(kCTR) {}
+ ~ContentEncAESSettings() {}
+
+ unsigned long long cipher_mode;
+ };
+
+ // ContentEncryption element names
+ struct ContentEncryption {
+ ContentEncryption();
+ ~ContentEncryption();
+
+ unsigned long long algo;
+ unsigned char* key_id;
+ long long key_id_len;
+ unsigned char* signature;
+ long long signature_len;
+ unsigned char* sig_key_id;
+ long long sig_key_id_len;
+ unsigned long long sig_algo;
+ unsigned long long sig_hash_algo;
+
+ ContentEncAESSettings aes_settings;
+ };
+
+ // Returns ContentCompression represented by |idx|. Returns NULL if |idx|
+ // is out of bounds.
+ const ContentCompression* GetCompressionByIndex(unsigned long idx) const;
+
+ // Returns number of ContentCompression elements in this ContentEncoding
+ // element.
+ unsigned long GetCompressionCount() const;
+
+ // Parses the ContentCompression element from |pReader|. |start| is the
+ // starting offset of the ContentCompression payload. |size| is the size in
+ // bytes of the ContentCompression payload. |compression| is where the parsed
+ // values will be stored.
+ long ParseCompressionEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentCompression* compression);
+
+ // Returns ContentEncryption represented by |idx|. Returns NULL if |idx|
+ // is out of bounds.
+ const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const;
+
+ // Returns number of ContentEncryption elements in this ContentEncoding
+ // element.
+ unsigned long GetEncryptionCount() const;
+
+ // Parses the ContentEncAESSettings element from |pReader|. |start| is the
+ // starting offset of the ContentEncAESSettings payload. |size| is the
+ // size in bytes of the ContentEncAESSettings payload. |aes| is
+ // where the parsed values will be stored.
+ long ParseContentEncAESSettingsEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentEncAESSettings* aes);
+
+ // Parses the ContentEncoding element from |pReader|. |start| is the
+ // starting offset of the ContentEncoding payload. |size| is the size in
+ // bytes of the ContentEncoding payload. NOTE(review): returns long, not bool; presumably 0 on success -- confirm.
+ long ParseContentEncodingEntry(long long start, long long size,
+ IMkvReader* pReader);
+
+ // Parses the ContentEncryption element from |pReader|. |start| is the
+ // starting offset of the ContentEncryption payload. |size| is the size in
+ // bytes of the ContentEncryption payload. |encryption| is where the parsed
+ // values will be stored.
+ long ParseEncryptionEntry(long long start, long long size,
+ IMkvReader* pReader, ContentEncryption* encryption);
+
+ unsigned long long encoding_order() const { return encoding_order_; }
+ unsigned long long encoding_scope() const { return encoding_scope_; }
+ unsigned long long encoding_type() const { return encoding_type_; }
+
+ private:
+ // Member variables for list of ContentCompression elements.
+ ContentCompression** compression_entries_;  // NOTE(review): presumably a [begin, end) pair with the pointer below -- confirm
+ ContentCompression** compression_entries_end_;
+
+ // Member variables for list of ContentEncryption elements.
+ ContentEncryption** encryption_entries_;  // NOTE(review): presumably a [begin, end) pair with the pointer below -- confirm
+ ContentEncryption** encryption_entries_end_;
+
+ // ContentEncoding element names
+ unsigned long long encoding_order_;
+ unsigned long long encoding_scope_;
+ unsigned long long encoding_type_;
+
+ // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+ ContentEncoding(const ContentEncoding&);
+ ContentEncoding& operator=(const ContentEncoding&);
+};
+// Base class for a track of a Segment; the constructor is protected and the static Create() hands back a Track* through its last parameter.
+class Track {
+ Track(const Track&);  // private: copying is disallowed
+ Track& operator=(const Track&);
+
+ public:
+ class Info;
+ static long Create(Segment*, const Info&, long long element_start,
+ long long element_size, Track*&);
+
+ enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 };
+
+ Segment* const m_pSegment;
+ const long long m_element_start;
+ const long long m_element_size;
+ virtual ~Track();
+
+ long GetType() const;  // callers compare the result against the Type values (1 = video, 2 = audio)
+ long GetNumber() const;
+ unsigned long long GetUid() const;
+ const char* GetNameAsUTF8() const;
+ const char* GetLanguage() const;
+ const char* GetCodecNameAsUTF8() const;
+ const char* GetCodecId() const;
+ const unsigned char* GetCodecPrivate(size_t&) const;  // NOTE(review): the size_t& presumably receives the buffer size -- confirm
+ bool GetLacing() const;
+ unsigned long long GetDefaultDuration() const;
+ unsigned long long GetCodecDelay() const;
+ unsigned long long GetSeekPreRoll() const;
+
+ const BlockEntry* GetEOS() const;
+
+ struct Settings {
+ long long start;
+ long long size;
+ };
+
+ class Info {
+ public:
+ Info();
+ ~Info();
+ int Copy(Info&) const;  // NOTE(review): presumably copies this object into the argument -- confirm
+ void Clear();
+ long type;
+ long number;
+ unsigned long long uid;
+ unsigned long long defaultDuration;
+ unsigned long long codecDelay;
+ unsigned long long seekPreRoll;
+ char* nameAsUTF8;
+ char* language;
+ char* codecId;
+ char* codecNameAsUTF8;
+ unsigned char* codecPrivate;
+ size_t codecPrivateSize;
+ bool lacing;
+ Settings settings;
+
+ private:
+ Info(const Info&);  // private: use Copy() rather than the copy constructor
+ Info& operator=(const Info&);
+ int CopyStr(char* Info::*str, Info&) const;  // takes a pointer-to-member selecting which char* field to copy
+ };
+
+ long GetFirst(const BlockEntry*&) const;
+ long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const;
+ virtual bool VetEntry(const BlockEntry*) const;
+ virtual long Seek(long long time_ns, const BlockEntry*&) const;
+
+ const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const;
+ unsigned long GetContentEncodingCount() const;
+
+ long ParseContentEncodingsEntry(long long start, long long size);
+
+ protected:
+ Track(Segment*, long long element_start, long long element_size);
+
+ Info m_info;
+
+ class EOSBlock : public BlockEntry {  // concrete BlockEntry used for the m_eos member below
+ public:
+ EOSBlock();
+
+ Kind GetKind() const;
+ const Block* GetBlock() const;
+ };
+
+ EOSBlock m_eos;
+
+ private:
+ ContentEncoding** content_encoding_entries_;  // NOTE(review): presumably a [begin, end) pair with the pointer below -- confirm
+ ContentEncoding** content_encoding_entries_end_;
+};
+// An (x, y) pair of floats, both zero after default construction.
+struct PrimaryChromaticity {
+ PrimaryChromaticity() : x(0), y(0) {}
+ ~PrimaryChromaticity() {}
+ static bool Parse(IMkvReader* reader, long long read_pos,
+ long long value_size, bool is_x,  // NOTE(review): is_x presumably selects whether x or y is written -- confirm
+ PrimaryChromaticity** chromaticity);
+ float x;
+ float y;
+};
+// Owns four PrimaryChromaticity objects (deleted in the destructor). NOTE(review): no copy control is declared, so a copy would double-delete -- confirm nothing copies it.
+struct MasteringMetadata {
+ static const float kValueNotPresent;  // initial value of luminance_max and luminance_min
+
+ MasteringMetadata()
+ : r(NULL),
+ g(NULL),
+ b(NULL),
+ white_point(NULL),
+ luminance_max(kValueNotPresent),
+ luminance_min(kValueNotPresent) {}
+ ~MasteringMetadata() {
+ delete r;
+ delete g;
+ delete b;
+ delete white_point;
+ }
+
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size,
+ MasteringMetadata** mastering_metadata);
+
+ PrimaryChromaticity* r;  // NULL until set; deleted by the destructor (same for g, b, white_point)
+ PrimaryChromaticity* g;
+ PrimaryChromaticity* b;
+ PrimaryChromaticity* white_point;
+ float luminance_max;
+ float luminance_min;
+};
+// Colour values, each starting as kValueNotPresent; owns mastering_metadata. NOTE(review): no copy control is declared, so a copy would double-delete -- confirm.
+struct Colour {
+ static const long long kValueNotPresent;
+
+ // Unless otherwise noted all values assigned upon construction are the
+ // equivalent of unspecified/default.
+ Colour()
+ : matrix_coefficients(kValueNotPresent),
+ bits_per_channel(kValueNotPresent),
+ chroma_subsampling_horz(kValueNotPresent),
+ chroma_subsampling_vert(kValueNotPresent),
+ cb_subsampling_horz(kValueNotPresent),
+ cb_subsampling_vert(kValueNotPresent),
+ chroma_siting_horz(kValueNotPresent),
+ chroma_siting_vert(kValueNotPresent),
+ range(kValueNotPresent),
+ transfer_characteristics(kValueNotPresent),
+ primaries(kValueNotPresent),
+ max_cll(kValueNotPresent),
+ max_fall(kValueNotPresent),
+ mastering_metadata(NULL) {}
+ ~Colour() {
+ delete mastering_metadata;
+ mastering_metadata = NULL;
+ }
+
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size, Colour** colour);
+
+ long long matrix_coefficients;
+ long long bits_per_channel;
+ long long chroma_subsampling_horz;
+ long long chroma_subsampling_vert;
+ long long cb_subsampling_horz;
+ long long cb_subsampling_vert;
+ long long chroma_siting_horz;
+ long long chroma_siting_vert;
+ long long range;
+ long long transfer_characteristics;
+ long long primaries;
+ long long max_cll;
+ long long max_fall;
+
+ MasteringMetadata* mastering_metadata;  // NULL until set; deleted by the destructor
+};
+// Projection type, pose angles and a private data buffer released with delete[] in the destructor. NOTE(review): no copy control is declared -- confirm nothing copies it.
+struct Projection {
+ enum ProjectionType {
+ kTypeNotPresent = -1,
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const float kValueNotPresent;  // initial value of the three pose_* members
+ Projection()
+ : type(kTypeNotPresent),
+ private_data(NULL),
+ private_data_length(0),
+ pose_yaw(kValueNotPresent),
+ pose_pitch(kValueNotPresent),
+ pose_roll(kValueNotPresent) {}
+ ~Projection() { delete[] private_data; }
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size, Projection** projection);
+
+ ProjectionType type;
+ unsigned char* private_data;  // NULL until set; must come from new[] because the destructor uses delete[]
+ size_t private_data_length;
+ float pose_yaw;
+ float pose_pitch;
+ float pose_roll;
+};
+// Track subclass with video-specific fields; its constructor is private and the static Parse() hands back a VideoTrack* through its last parameter.
+class VideoTrack : public Track {
+ VideoTrack(const VideoTrack&);
+ VideoTrack& operator=(const VideoTrack&);
+
+ VideoTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+ virtual ~VideoTrack();
+ static long Parse(Segment*, const Info&, long long element_start,
+ long long element_size, VideoTrack*&);
+
+ long long GetWidth() const;
+ long long GetHeight() const;
+ long long GetDisplayWidth() const;
+ long long GetDisplayHeight() const;
+ long long GetDisplayUnit() const;
+ long long GetStereoMode() const;
+ double GetFrameRate() const;
+
+ bool VetEntry(const BlockEntry*) const;  // same signature as the virtual Track::VetEntry
+ long Seek(long long time_ns, const BlockEntry*&) const;  // same signature as the virtual Track::Seek
+
+ Colour* GetColour() const;
+
+ Projection* GetProjection() const;
+
+ const char* GetColourSpace() const { return m_colour_space; }  // returns the stored pointer itself, not a copy
+
+ private:
+ long long m_width;
+ long long m_height;
+ long long m_display_width;
+ long long m_display_height;
+ long long m_display_unit;
+ long long m_stereo_mode;
+ char* m_colour_space;
+ double m_rate;
+
+ Colour* m_colour;  // NOTE(review): presumably owned and released by ~VideoTrack() -- confirm
+ Projection* m_projection;  // NOTE(review): presumably owned and released by ~VideoTrack() -- confirm
+};
+// Track subclass with audio-specific fields; its constructor is private and the static Parse() hands back an AudioTrack* through its last parameter.
+class AudioTrack : public Track {
+ AudioTrack(const AudioTrack&);
+ AudioTrack& operator=(const AudioTrack&);
+
+ AudioTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+ static long Parse(Segment*, const Info&, long long element_start,
+ long long element_size, AudioTrack*&);
+
+ double GetSamplingRate() const;
+ long long GetChannels() const;
+ long long GetBitDepth() const;
+
+ private:
+ double m_rate;  // NOTE(review): presumably the value GetSamplingRate() returns -- confirm
+ long long m_channels;
+ long long m_bitDepth;
+};
+// Collection of Track objects belonging to a Segment, looked up by track number or by index.
+class Tracks {
+ Tracks(const Tracks&);  // private: copying is disallowed
+ Tracks& operator=(const Tracks&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Tracks(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+
+ ~Tracks();
+
+ long Parse();
+
+ unsigned long GetTracksCount() const;
+
+ const Track* GetTrackByNumber(long tn) const;  // may return NULL; the cluster code checks for that
+ const Track* GetTrackByIndex(unsigned long idx) const;
+
+ private:
+ Track** m_trackEntries;  // NOTE(review): presumably a [begin, end) pair with the pointer below -- confirm
+ Track** m_trackEntriesEnd;
+
+ long ParseTrackEntry(long long payload_start, long long payload_size,
+ long long element_start, long long element_size,
+ Track*&) const;
+};
+// Three-level container: Chapters holds Editions, an Edition holds Atoms, an Atom holds Displays.
+class Chapters {
+ Chapters(const Chapters&);  // private: copying is disallowed
+ Chapters& operator=(const Chapters&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Chapters(Segment*, long long payload_start, long long payload_size,
+ long long element_start, long long element_size);
+
+ ~Chapters();
+
+ long Parse();
+
+ class Atom;
+ class Edition;
+
+ class Display {
+ friend class Atom;  // constructors and destructor are private, so only Atom can create or destroy a Display
+ Display();
+ Display(const Display&);
+ ~Display();
+ Display& operator=(const Display&);
+
+ public:
+ const char* GetString() const;
+ const char* GetLanguage() const;
+ const char* GetCountry() const;
+
+ private:
+ void Init();
+ void ShallowCopy(Display&) const;  // NOTE(review): presumably copies the raw pointers into the argument without duplicating the strings -- confirm
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ char* m_string;
+ char* m_language;
+ char* m_country;
+ };
+
+ class Atom {
+ friend class Edition;  // constructors and destructor are private, so only Edition can create or destroy an Atom
+ Atom();
+ Atom(const Atom&);
+ ~Atom();
+ Atom& operator=(const Atom&);
+
+ public:
+ unsigned long long GetUID() const;
+ const char* GetStringUID() const;
+
+ long long GetStartTimecode() const;
+ long long GetStopTimecode() const;
+
+ long long GetStartTime(const Chapters*) const;
+ long long GetStopTime(const Chapters*) const;
+
+ int GetDisplayCount() const;
+ const Display* GetDisplay(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Atom&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+ static long long GetTime(const Chapters*, long long timecode);
+
+ long ParseDisplay(IMkvReader*, long long pos, long long size);
+ bool ExpandDisplaysArray();
+
+ char* m_string_uid;
+ unsigned long long m_uid;
+ long long m_start_timecode;
+ long long m_stop_timecode;
+
+ Display* m_displays;
+ int m_displays_size;  // NOTE(review): presumably array capacity, with m_displays_count the number in use -- confirm
+ int m_displays_count;
+ };
+
+ class Edition {
+ friend class Chapters;  // constructors and destructor are private, so only Chapters can create or destroy an Edition
+ Edition();
+ Edition(const Edition&);
+ ~Edition();
+ Edition& operator=(const Edition&);
+
+ public:
+ int GetAtomCount() const;
+ const Atom* GetAtom(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Edition&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ long ParseAtom(IMkvReader*, long long pos, long long size);
+ bool ExpandAtomsArray();
+
+ Atom* m_atoms;
+ int m_atoms_size;  // NOTE(review): presumably array capacity, with m_atoms_count the number in use -- confirm
+ int m_atoms_count;
+ };
+
+ int GetEditionCount() const;
+ const Edition* GetEdition(int index) const;
+
+ private:
+ long ParseEdition(long long pos, long long size);
+ bool ExpandEditionsArray();
+
+ Edition* m_editions;
+ int m_editions_size;  // NOTE(review): presumably array capacity, with m_editions_count the number in use -- confirm
+ int m_editions_count;
+};
+// Two-level container: Tags holds Tag objects, and each Tag holds SimpleTag name/string pairs.
+class Tags {
+ Tags(const Tags&);  // private: copying is disallowed
+ Tags& operator=(const Tags&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Tags(Segment*, long long payload_start, long long payload_size,
+ long long element_start, long long element_size);
+
+ ~Tags();
+
+ long Parse();
+
+ class Tag;
+ class SimpleTag;
+
+ class SimpleTag {
+ friend class Tag;  // constructors and destructor are private, so only Tag can create or destroy a SimpleTag
+ SimpleTag();
+ SimpleTag(const SimpleTag&);
+ ~SimpleTag();
+ SimpleTag& operator=(const SimpleTag&);
+
+ public:
+ const char* GetTagName() const;
+ const char* GetTagString() const;
+
+ private:
+ void Init();
+ void ShallowCopy(SimpleTag&) const;  // NOTE(review): presumably copies the raw pointers into the argument without duplicating the strings -- confirm
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ char* m_tag_name;
+ char* m_tag_string;
+ };
+
+ class Tag {
+ friend class Tags;  // constructors and destructor are private, so only Tags can create or destroy a Tag
+ Tag();
+ Tag(const Tag&);
+ ~Tag();
+ Tag& operator=(const Tag&);
+
+ public:
+ int GetSimpleTagCount() const;
+ const SimpleTag* GetSimpleTag(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Tag&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ long ParseSimpleTag(IMkvReader*, long long pos, long long size);
+ bool ExpandSimpleTagsArray();
+
+ SimpleTag* m_simple_tags;
+ int m_simple_tags_size;  // NOTE(review): presumably array capacity, with m_simple_tags_count the number in use -- confirm
+ int m_simple_tags_count;
+ };
+
+ int GetTagCount() const;
+ const Tag* GetTag(int index) const;
+
+ private:
+ long ParseTag(long long pos, long long size);
+ bool ExpandTagsArray();
+
+ Tag* m_tags;
+ int m_tags_size;  // NOTE(review): presumably array capacity, with m_tags_count the number in use -- confirm
+ int m_tags_count;
+};
+// Segment-level information: timecode scale, duration and the muxing-app / writing-app / title strings.
+class SegmentInfo {
+ SegmentInfo(const SegmentInfo&);  // private: copying is disallowed
+ SegmentInfo& operator=(const SegmentInfo&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ SegmentInfo(Segment*, long long start, long long size,
+ long long element_start, long long element_size);
+
+ ~SegmentInfo();
+
+ long Parse();
+
+ long long GetTimeCodeScale() const;  // Block::GetTime multiplies timecodes by this value and asserts it is >= 1
+ long long GetDuration() const; // scaled
+ const char* GetMuxingAppAsUTF8() const;
+ const char* GetWritingAppAsUTF8() const;
+ const char* GetTitleAsUTF8() const;
+
+ private:
+ long long m_timecodeScale;
+ double m_duration;  // stored as double although GetDuration() returns long long
+ char* m_pMuxingAppAsUTF8;
+ char* m_pWritingAppAsUTF8;
+ char* m_pTitleAsUTF8;
+};
+// Holds the seek entries and void elements found in a seek head, each accessible by index.
+class SeekHead {
+ SeekHead(const SeekHead&);  // private: copying is disallowed
+ SeekHead& operator=(const SeekHead&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ SeekHead(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+
+ ~SeekHead();
+
+ long Parse();
+
+ struct Entry {
+ Entry();
+
+ // the SeekHead entry payload
+ long long id;
+ long long pos;
+
+ // absolute pos of SeekEntry ID
+ long long element_start;
+
+ // SeekEntry ID size + size size + payload
+ long long element_size;
+ };
+
+ int GetCount() const;
+ const Entry* GetEntry(int idx) const;  // NOTE(review): presumably NULL for an out-of-range idx -- confirm
+
+ struct VoidElement {
+ // absolute pos of Void ID
+ long long element_start;
+
+ // ID size + size size + payload size
+ long long element_size;
+ };
+
+ int GetVoidElementCount() const;
+ const VoidElement* GetVoidElement(int idx) const;  // NOTE(review): presumably NULL for an out-of-range idx -- confirm
+
+ private:
+ Entry* m_entries;
+ int m_entry_count;
+
+ VoidElement* m_void_elements;
+ int m_void_element_count;
+
+ static bool ParseEntry(IMkvReader*,
+ long long pos, // payload
+ long long size, Entry*);
+};
+
+class Cues;
+class CuePoint {  // One cue: a timecode plus per-track positions.
+ friend class Cues;  // Only Cues can reach the private ctor and dtor.
+
+ CuePoint(long, long long);  // Private (declared before "public:").
+ ~CuePoint();
+
+ CuePoint(const CuePoint&);  // Private: copying is not part of the API.
+ CuePoint& operator=(const CuePoint&);
+
+ public:
+ long long m_element_start;
+ long long m_element_size;
+
+ bool Load(IMkvReader*);
+
+ long long GetTimeCode() const; // absolute but unscaled
+ long long GetTime(const Segment*) const; // absolute and scaled (ns units)
+
+ struct TrackPosition {
+ long long m_track;
+ long long m_pos; // of cluster
+ long long m_block;
+ // codec_state //defaults to 0
+ // reference = clusters containing req'd referenced blocks
+ // reftime = timecode of the referenced block
+
+ bool Parse(IMkvReader*, long long, long long);
+ };
+
+ const TrackPosition* Find(const Track*) const;
+
+ private:
+ const long m_index;
+ long long m_timecode;
+ TrackPosition* m_track_positions;
+ size_t m_track_positions_count;  // presumably length of the array above
+};
+
+class Cues {  // Cue-point collection; const methods load points lazily.
+ friend class Segment;  // Only Segment can reach the private ctor and dtor.
+
+ Cues(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+ ~Cues();
+
+ Cues(const Cues&);  // Private: copying is not part of the API.
+ Cues& operator=(const Cues&);
+
+ public:
+ Segment* const m_pSegment;  // Const pointer: fixed at construction.
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ bool Find( // lower bound of time_ns
+ long long time_ns, const Track*, const CuePoint*&,
+ const CuePoint::TrackPosition*&) const;
+
+ const CuePoint* GetFirst() const;
+ const CuePoint* GetLast() const;
+ const CuePoint* GetNext(const CuePoint*) const;
+
+ const BlockEntry* GetBlock(const CuePoint*,
+ const CuePoint::TrackPosition*) const;
+
+ bool LoadCuePoint() const;  // Const, yet it can update the mutable state.
+ long GetCount() const; // loaded only
+ // long GetTotal() const; //loaded + preloaded
+ bool DoneParsing() const;
+
+ private:
+ bool Init() const;
+ bool PreloadCuePoint(long&, long long) const;
+
+ mutable CuePoint** m_cue_points;  // Mutable: written from const methods.
+ mutable long m_count;
+ mutable long m_preload_count;
+ mutable long long m_pos;
+};
+
+class Cluster {  // One cluster of a Segment and its block entries.
+ friend class Segment;
+
+ Cluster(const Cluster&);  // Private: copying is not part of the API.
+ Cluster& operator=(const Cluster&);
+
+ public:
+ Segment* const m_pSegment;  // Const pointer: fixed at construction.
+
+ public:
+ static Cluster* Create(Segment*,  // Factory; the sized ctor is protected.
+ long index, // index in segment
+ long long off); // offset relative to segment
+ // long long element_size);
+
+ Cluster(); // EndOfStream
+ ~Cluster();
+
+ bool EOS() const;
+
+ long long GetTimeCode() const; // absolute, but not scaled
+ long long GetTime() const; // absolute, and scaled (nanosecond units)
+ long long GetFirstTime() const; // time (ns) of first (earliest) block
+ long long GetLastTime() const; // time (ns) of last (latest) block
+
+ long GetFirst(const BlockEntry*&) const;
+ long GetLast(const BlockEntry*&) const;
+ long GetNext(const BlockEntry* curr, const BlockEntry*& next) const;
+
+ const BlockEntry* GetEntry(const Track*, long long ns = -1) const;
+ const BlockEntry* GetEntry(const CuePoint&,
+ const CuePoint::TrackPosition&) const;
+ // const BlockEntry* GetMaxKey(const VideoTrack*) const;
+
+ // static bool HasBlockEntries(const Segment*, long long);
+
+ static long HasBlockEntries(const Segment*, long long idoff, long long& pos,
+ long& size);
+
+ long GetEntryCount() const;
+
+ long Load(long long& pos, long& size) const;  // Const; see mutable fields.
+
+ long Parse(long long& pos, long& size) const;  // Const; see mutable fields.
+ long GetEntry(long index, const mkvparser::BlockEntry*&) const;
+
+ protected:
+ Cluster(Segment*, long index, long long element_start);
+ // long long element_size);
+
+ public:
+ const long long m_element_start;
+ long long GetPosition() const; // offset relative to segment
+
+ long GetIndex() const;
+ long long GetElementSize() const;
+ // long long GetPayloadSize() const;
+
+ // long long Unparsed() const;
+
+ private:
+ long m_index;
+ mutable long long m_pos;  // Mutable: written from const methods.
+ // mutable long long m_size;
+ mutable long long m_element_size;
+ mutable long long m_timecode;
+ mutable BlockEntry** m_entries;
+ mutable long m_entries_size;  // presumably array capacity -- TODO confirm
+ mutable long m_entries_count;  // presumably entries in use -- TODO confirm
+
+ long ParseSimpleBlock(long long, long long&, long&);
+ long ParseBlockGroup(long long, long long&, long&);
+
+ long CreateBlock(long long id, long long pos, long long size,
+ long long discard_padding);
+ long CreateBlockGroup(long long start_offset, long long size,
+ long long discard_padding);
+ long CreateSimpleBlock(long long, long long);
+};
+
+class Segment {  // Entry point: header elements plus the cluster list.
+ friend class Cues;
+ friend class Track;
+ friend class VideoTrack;
+
+ Segment(const Segment&);  // Private: copying is not part of the API.
+ Segment& operator=(const Segment&);
+
+ private:
+ Segment(IMkvReader*, long long elem_start,  // Private constructor.
+ // long long elem_size,
+ long long pos, long long size);
+
+ public:
+ IMkvReader* const m_pReader;  // Const pointer: fixed at construction.
+ const long long m_element_start;
+ // const long long m_element_size;
+ const long long m_start; // posn of segment payload
+ const long long m_size; // size of segment payload
+ Cluster m_eos; // TODO: make private?
+
+ static long long CreateInstance(IMkvReader*, long long, Segment*&);
+ ~Segment();
+
+ long Load(); // loads headers and all clusters
+
+ // for incremental loading
+ // long long Unparsed() const;
+ bool DoneParsing() const;
+ long long ParseHeaders(); // stops when first cluster is found
+ // long FindNextCluster(long long& pos, long& size) const;
+ long LoadCluster(long long& pos, long& size); // load one cluster
+ long LoadCluster();  // Inline overload; defined after the namespace.
+
+ long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
+ long& size);
+
+ const SeekHead* GetSeekHead() const;
+ const Tracks* GetTracks() const;
+ const SegmentInfo* GetInfo() const;
+ const Cues* GetCues() const;
+ const Chapters* GetChapters() const;
+ const Tags* GetTags() const;
+
+ long long GetDuration() const;
+
+ unsigned long GetCount() const;
+ const Cluster* GetFirst() const;
+ const Cluster* GetLast() const;
+ const Cluster* GetNext(const Cluster*);  // Non-const, unlike GetFirst().
+
+ const Cluster* FindCluster(long long time_nanoseconds) const;
+ // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const;
+
+ const Cluster* FindOrPreloadCluster(long long pos);
+
+ long ParseCues(long long cues_off, // offset relative to start of segment
+ long long& parse_pos, long& parse_len);
+
+ private:
+ long long m_pos; // absolute file posn; what has been consumed so far
+ Cluster* m_pUnknownSize;
+
+ SeekHead* m_pSeekHead;
+ SegmentInfo* m_pInfo;
+ Tracks* m_pTracks;
+ Cues* m_pCues;
+ Chapters* m_pChapters;
+ Tags* m_pTags;
+ Cluster** m_clusters;
+ long m_clusterCount; // number of entries for which m_index >= 0
+ long m_clusterPreloadCount; // number of entries for which m_index < 0
+ long m_clusterSize; // array size
+
+ long DoLoadCluster(long long&, long&);
+ long DoLoadClusterUnknownSize(long long&, long&);
+ long DoParseNext(const Cluster*&, long long&, long&);
+
+ bool AppendCluster(Cluster*);
+ bool PreloadCluster(Cluster*, ptrdiff_t);
+
+ // void ParseSeekHead(long long pos, long long size);
+ // void ParseSeekEntry(long long pos, long long size);
+ // void ParseCues(long long);
+
+ const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
+};
+
+} // namespace mkvparser
+
+// Convenience overload of LoadCluster(long long&, long&): forwards to it with
+// two throwaway locals and returns its status unchanged.
+inline long mkvparser::Segment::LoadCluster() {
+  long long unused_pos;
+  long unused_size;
+  const long status = LoadCluster(unused_pos, unused_size);
+  return status;
+}
+
+#endif // MKVPARSER_MKVPARSER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc
new file mode 100644
index 0000000000..9d19c1be56
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "mkvparser/mkvreader.h"
+
+#include <sys/types.h>
+
+#include <cassert>
+
+namespace mkvparser {
+
+// Creates a reader with no file attached. m_length starts at 0 so it never
+// holds an indeterminate value. Initializers follow the member declaration
+// order (m_length, m_file, reader_owns_file_).
+MkvReader::MkvReader() : m_length(0), m_file(NULL), reader_owns_file_(true) {}
+
+// Wraps a caller-supplied stream. reader_owns_file_ is false, so the
+// destructor does not close |fp|.
+MkvReader::MkvReader(FILE* fp)
+    : m_length(0), m_file(fp), reader_owns_file_(false) {
+  // The result is deliberately ignored, as before. If the probe bails out
+  // before storing a length (e.g. |fp| is NULL), m_length keeps the 0 set
+  // above instead of an indeterminate value.
+  GetFileSize();
+}
+
+// Closes the stream only when reader_owns_file_ is set; in every case the
+// stored FILE pointer is cleared afterwards.
+MkvReader::~MkvReader() {
+  if (reader_owns_file_) {
+    Close();
+  }
+
+  m_file = NULL;
+}
+
+// Opens |fileName| for binary reading and caches its length.
+//
+// Returns 0 on success. Returns -1 if |fileName| is NULL, if a stream is
+// already attached, or if the file cannot be opened. Returns 1 (the value the
+// previous "!GetFileSize()" expression produced) if the length cannot be
+// determined; in that case the freshly opened stream is closed again, so the
+// reader is not left holding a stream it cannot serve reads from and Open()
+// can be retried.
+int MkvReader::Open(const char* fileName) {
+  if (fileName == NULL)
+    return -1;
+
+  if (m_file)
+    return -1;
+
+#ifdef _MSC_VER
+  const errno_t e = fopen_s(&m_file, fileName, "rb");
+
+  if (e) {
+    m_file = NULL;  // never keep a pointer from a failed open
+    return -1;  // error
+  }
+#else
+  m_file = fopen(fileName, "rb");
+
+  if (m_file == NULL)
+    return -1;
+#endif
+
+  // The stream was created here, so this object must close it, even if it was
+  // constructed through the non-owning FILE* constructor with a NULL stream.
+  reader_owns_file_ = true;
+
+  if (!GetFileSize()) {
+    Close();
+    return 1;
+  }
+
+  return 0;
+}
+
+// Measures the attached stream: seeks to the end, stores the reported
+// position in m_length, then seeks back to the start.
+//
+// Returns false if no stream is attached or if any seek/tell call reports an
+// error; the stream position is then unspecified. Failures are reported
+// through the return value only (no assert), so a stream whose size cannot be
+// measured does not abort debug builds.
+//
+// The non-MSVC paths use the same fseeko64/fseeko family as Read(), so the
+// length is not squeezed through a plain long.
+bool MkvReader::GetFileSize() {
+  if (m_file == NULL)
+    return false;
+
+#ifdef _MSC_VER
+  if (_fseeki64(m_file, 0L, SEEK_END))
+    return false;  // error
+
+  m_length = _ftelli64(m_file);
+#elif defined(_WIN32)
+  if (fseeko64(m_file, 0, SEEK_END))
+    return false;  // error
+
+  m_length = ftello64(m_file);
+#else
+  if (fseeko(m_file, 0, SEEK_END))
+    return false;  // error
+
+  m_length = ftello(m_file);
+#endif
+
+  // The tell functions report failure with a negative value.
+  if (m_length < 0)
+    return false;
+
+#ifdef _MSC_VER
+  if (_fseeki64(m_file, 0L, SEEK_SET))
+    return false;  // error
+#elif defined(_WIN32)
+  if (fseeko64(m_file, 0, SEEK_SET))
+    return false;  // error
+#else
+  if (fseeko(m_file, 0, SEEK_SET))
+    return false;  // error
+#endif
+
+  return true;
+}
+
+// Closes and detaches the current stream; does nothing when none is attached.
+void MkvReader::Close() {
+  if (m_file == NULL)
+    return;
+
+  fclose(m_file);
+  m_file = NULL;
+}
+
+// Reports the cached file length. Either output pointer may be NULL; every
+// non-NULL one receives m_length. Returns -1 when no stream is attached,
+// otherwise 0.
+int MkvReader::Length(long long* total, long long* available) {
+  if (m_file == NULL) {
+    return -1;
+  }
+
+  if (total != NULL) {
+    *total = m_length;
+  }
+
+  if (available != NULL) {
+    *available = m_length;
+  }
+
+  return 0;
+}
+
+// Reads exactly |len| bytes starting at absolute file position |offset| into
+// |buffer|.
+//
+// Returns 0 on success, including the trivial len == 0 case. Returns -1 if no
+// stream is attached, if |offset| or |len| is negative, if |offset| is at or
+// beyond the cached length, if the seek fails, or if fewer than |len| bytes
+// could be read.
+int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
+  if (m_file == NULL)
+    return -1;
+
+  if (offset < 0)
+    return -1;
+
+  if (len < 0)
+    return -1;
+
+  if (len == 0)
+    return 0;
+
+  if (offset >= m_length)
+    return -1;
+
+#ifdef _MSC_VER
+  const int status = _fseeki64(m_file, offset, SEEK_SET);
+#elif defined(_WIN32)
+  // Pass the 64-bit offset straight through. Casting it to off_t first could
+  // truncate it on toolchains where off_t is narrower than 64 bits.
+  const int status = fseeko64(m_file, offset, SEEK_SET);
+#else
+  const int status = fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
+#endif
+
+  // A failed seek leaves the stream at an unrelated position; reading from
+  // there would hand the caller the wrong bytes.
+  if (status)
+    return -1;  // error
+
+  const size_t size = fread(buffer, 1, len, m_file);
+
+  if (size < size_t(len))
+    return -1;  // error
+
+  return 0;  // success
+}
+
+} // namespace mkvparser
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h
new file mode 100644
index 0000000000..9831ecf645
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVPARSER_MKVREADER_H_
+#define MKVPARSER_MKVREADER_H_
+
+#include <cstdio>
+
+#include "mkvparser/mkvparser.h"
+
+namespace mkvparser {
+
+class MkvReader : public IMkvReader {  // IMkvReader backed by a stdio FILE*.
+ public:
+ MkvReader();  // No file attached; the reader owns what Open() attaches.
+ explicit MkvReader(FILE* fp);  // Wraps |fp| without taking ownership.
+ virtual ~MkvReader();  // Closes the file only if the reader owns it.
+
+ int Open(const char*);  // 0 on success, non-zero on failure.
+ void Close();  // Closes and detaches the current file, if any.
+
+ virtual int Read(long long position, long length, unsigned char* buffer);
+ virtual int Length(long long* total, long long* available);
+
+ private:
+ MkvReader(const MkvReader&);  // Private: copying is not part of the API.
+ MkvReader& operator=(const MkvReader&);
+
+ // Determines the size of the file. This is called either by the constructor
+ // or by the Open function depending on file ownership. Returns true on
+ // success.
+ bool GetFileSize();
+
+ long long m_length;  // Length cached by GetFileSize().
+ FILE* m_file;  // NULL when no file is attached.
+ bool reader_owns_file_;  // True when the destructor should call Close().
+};
+
+} // namespace mkvparser
+
+#endif // MKVPARSER_MKVREADER_H_
diff --git a/third_party/aom/third_party/libyuv/LICENSE b/third_party/aom/third_party/libyuv/LICENSE
new file mode 100644
index 0000000000..c911747a6b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/libyuv/README.libaom b/third_party/aom/third_party/libyuv/README.libaom
new file mode 100644
index 0000000000..6e66f858e2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/README.libaom
@@ -0,0 +1,37 @@
+Name: libyuv
+URL: https://chromium.googlesource.com/libyuv/libyuv/
+Version: dfaf7534e0e536f7e5ef8ddd7326797bd09b8622
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling
+functionality.
+
+The optimized scaler in libyuv is used in the multiple-resolution encoder
+example, which down-samples the original input video (e.g. 1280x720) a number
+of times in order to encode multiple-resolution bit streams.
+
+Local Modifications:
+
+diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
+index fe89452b7..72a7fb82f 100644
+--- a/third_party/libyuv/source/cpu_id.cc
++++ b/third_party/libyuv/source/cpu_id.cc
+@@ -108,7 +108,7 @@ void CpuId(int eax, int ecx, int* cpu_info) {
+ // }
+ // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+ // https://code.google.com/p/libyuv/issues/detail?id=529
+-#if defined(_M_IX86) && (_MSC_VER < 1900)
++#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+ #pragma optimize("g", off)
+ #endif
+ #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+@@ -129,7 +129,7 @@ int GetXCR0() {
+ #define GetXCR0() 0
+ #endif // defined(_M_IX86) || defined(_M_X64) ..
+ // Return optimization to previous setting.
+-#if defined(_M_IX86) && (_MSC_VER < 1900)
++#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+ #pragma optimize("g", on)
+ #endif
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h b/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 0000000000..1bea67f2f2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h> // For size_t and NULL
+
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
+#define INT_TYPES_DEFINED
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+#include <sys/types.h> // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+#else
+#include <stdint.h> // for uintptr_t and C99 types
+#endif // defined(_MSC_VER) && (_MSC_VER < 1600)
+// Types are deprecated. Enable this macro for legacy types.
+#ifdef LIBYUV_LEGACY_TYPES
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
+#endif // LIBYUV_LEGACY_TYPES
+#endif // INT_TYPES_DEFINED
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
+
+// TODO(fbarchard): Remove bool macros.
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/compare.h b/third_party/aom/third_party/libyuv/include/libyuv/compare.h
new file mode 100644
index 0000000000..3353ad71c6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/compare.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
+
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32_t ARGBDetect(const uint8_t* argb,
+ int stride_argb,
+ int width,
+ int height);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+
+LIBYUV_API
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double I420Psnr(const uint8_t* src_y_a,
+ int stride_y_a,
+ const uint8_t* src_u_a,
+ int stride_u_a,
+ const uint8_t* src_v_a,
+ int stride_v_a,
+ const uint8_t* src_y_b,
+ int stride_y_b,
+ const uint8_t* src_u_b,
+ int stride_u_b,
+ const uint8_t* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+double I420Ssim(const uint8_t* src_y_a,
+ int stride_y_a,
+ const uint8_t* src_u_a,
+ int stride_u_a,
+ const uint8_t* src_v_a,
+ int stride_v_a,
+ const uint8_t* src_y_b,
+ int stride_y_b,
+ const uint8_t* src_u_b,
+ int stride_u_b,
+ const uint8_t* src_v_b,
+ int stride_v_b,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert.h b/third_party/aom/third_party/libyuv/include/libyuv/convert.h
new file mode 100644
index 0000000000..026b153cef
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert.h
@@ -0,0 +1,526 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+// TODO(fbarchard): fix WebRTC source to include following libyuv headers:
+#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620
+#include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I444 to NV12.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert Android420 to I420.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// JPEG to NV12
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8_t* sample,
+ size_t sample_size,
+ int* width,
+ int* height);
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 on success; -1 for an invalid parameter; non-zero on failure.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 0000000000..715a3dad97
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,1611 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/rotate.h" // For enum RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Conversion matrix for YUV to RGB
+LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
+
+// Conversion matrix for YVU to BGR
+LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
+LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
+LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
+LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
+
+// Macros for endian-swapped destination Matrix conversions (e.g. ABGR, RAW).
+// They swap U and V and pass the mirrored Yvu matrix, e.g. kYvuJPEGConstants.
+// TODO(fbarchard): Add macro for each Matrix function.
+#define kYuvI601ConstantsVU kYvuI601Constants
+#define kYuvJPEGConstantsVU kYvuJPEGConstants
+#define kYuvH709ConstantsVU kYvuH709Constants
+#define kYuv2020ConstantsVU kYvu2020Constants
+#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
+ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
+#define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
+ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n)
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J420 to ABGR.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H420 to ARGB.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H420 to ABGR.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J422 to ABGR.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H422 to ARGB.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H422 to ABGR.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I210 to ARGB.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I210 to ABGR.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H210 to ARGB.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I420 with Alpha to preattenuated ABGR.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV12 to ABGR.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U010 to AB30.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert U210 to AB30.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB big endian (rgb in memory) to RGBA.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert MJPG to ARGB. src_width/height are provided by the capture;
+// dst_width/height are used for clipping and determine the final size.
+LIBYUV_API
+int MJPGToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate);
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565 with matrix.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert Android420 to ARGB with matrix.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I422 to RGBA with matrix.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGBA with matrix.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB24 with matrix.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to RGB565 with specified color matrix.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I420 to AR30 with matrix.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert I400 (grey) to ARGB with matrix. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height);
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "sample_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width * 4 (4 bytes per ARGB
+// pixel), with recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the ARGB buffer according to rotation.
+// Unlike ConvertToI420, the destination is a single packed plane: there
+// are no separate U and V planes, and hence no "dst_stride_u" or
+// "dst_stride_v" parameters; dst_stride_argb is the only destination
+// stride.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - crop_width) / 2
+// crop_y = (src_height - crop_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "fourcc" is a fourcc, e.g. 'I420', 'YUY2'.
+// Returns 0 on success; -1 for an invalid parameter; non-zero on failure.
+LIBYUV_API
+int ConvertToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int crop_x,
+ int crop_y,
+ int src_width,
+ int src_height,
+ int crop_width,
+ int crop_height,
+ enum RotationMode rotation,
+ uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 0000000000..5140ed4f3e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
+
+// The following are from convert_argb.h
+// DEPRECATED: The prototypes will be removed in future. Use convert_argb.h
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8_t* y,
+ int y_stride,
+ const uint8_t* u,
+ int u_stride,
+ const uint8_t* v,
+ int v_stride,
+ uint8_t* dst_sample,
+ int dst_sample_stride,
+ int width,
+ int height,
+ uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 0000000000..d992363ceb
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8_t(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// Convert RGBA to J400. (JPeg full range).
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ int width,
+ int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert ABGR To NV12.
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert ABGR To NV21.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 0000000000..3e27cc107d
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Internal flag indicating the CPU flags have been initialized.
+static const int kCpuInitialized = 0x1;
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100; // unused at this time.
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VBMI = 0x20000;
+static const int kCpuHasAVX512VBMI2 = 0x40000;
+static const int kCpuHasAVX512VBITALG = 0x80000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x200000;
+static const int kCpuHasMSA = 0x400000;
+static const int kCpuHasMMI = 0x800000;
+
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Report whether a CPU feature (SSE2 etc.) has been detected.
+// test_flag should be one of the kCpuHas constants above.
+// Returns non-zero if set; calls InitCpuFlags() while cached flags are zero.
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+ const int detected = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+ const int detected = cpu_info_;
+#endif
+ return test_flag & (detected != 0 ? detected : InitCpuFlags());
+}
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+LIBYUV_API
+int MipsCpuCaps(const char* cpuinfo_name);
+
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags);
+
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+// again.
+// - enabling CPU features that are not supported by the CPU will result in
+// undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+ LIBYUV_API extern int cpu_info_;
+#ifndef __ATOMIC_RELAXED
+ cpu_info_ = cpu_flags; // Plain store when __ATOMIC_RELAXED is not defined.
+#else
+ __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#endif // __ATOMIC_RELAXED
+}
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 0000000000..275f8d4c18
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct Buffer {
+ const uint8_t* data;
+ int len;
+};
+
+struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format.
+ // Returns LIBYUV_TRUE if image looks valid and format is supported.
+ // If return value is LIBYUV_TRUE, then the values for all the following
+ // getters are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ LIBYUV_BOOL UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+ // dst_width must match exactly. dst_height must be <= the image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x,
+ int* subsample_y,
+ int number_of_components);
+
+ private:
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ LIBYUV_BOOL StartDecode();
+ LIBYUV_BOOL FinishDecode();
+
+ void SetScanlinePointers(uint8_t** data);
+ LIBYUV_BOOL DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ LIBYUV_BOOL has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8_t*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8_t** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // __cplusplus
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 0000000000..8d868b9542
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,900 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int scale, // 16384 for 10 bits
+ int width,
+ int height);
+
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int scale, // 1024 for 10 bits
+ int width,
+ int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ uint32_t value);
+
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int width,
+ int height);
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy NV12 (Y plane plus interleaved UV plane). Supports inverting.
+LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y,
+ const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_uv, int dst_stride_uv, int width, int height);
+
+// Copy NV21 (Y plane plus interleaved VU plane). Supports inverting.
+LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y,
+ const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y,
+ uint8_t* dst_vu, int dst_stride_vu, int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alias
+#define RGB24ToRGB24Mirror RGB24Mirror
+
+// RGB24 mirror.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Mirror a plane of data.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height,
+ uint32_t value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4x4 signed coefficients. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const int8_t* matrix_argb,
+ int width,
+ int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_rgb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const int8_t* matrix_rgb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* luma,
+ int width,
+ int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const float* poly,
+ int width,
+ int height);
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height);
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+ int src_stride_y0,
+ const uint8_t* src_y1,
+ int src_stride_y1,
+ const uint8_t* alpha,
+ int alpha_stride,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+ int src_stride_y0,
+ const uint8_t* src_u0,
+ int src_stride_u0,
+ const uint8_t* src_v0,
+ int src_stride_v0,
+ const uint8_t* src_y1,
+ int src_stride_y1,
+ const uint8_t* src_u1,
+ int src_stride_u1,
+ const uint8_t* src_v1,
+ int src_stride_v1,
+ const uint8_t* alpha,
+ int alpha_stride,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+ int src_stride_argb,
+ int32_t* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+// 16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int32_t* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius);
+
+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32_t value);
+
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+ int src_stride0,
+ const uint8_t* src1,
+ int src_stride1,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+ int src0_stride_y,
+ const uint8_t* src0_u,
+ int src0_stride_u,
+ const uint8_t* src0_v,
+ int src0_stride_v,
+ const uint8_t* src1_y,
+ int src1_stride_y,
+ const uint8_t* src1_u,
+ int src1_stride_u,
+ const uint8_t* src1_v,
+ int src1_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation);
+
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* uv_dudv,
+ int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* uv_dudv,
+ int width);
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* shuffler,
+ int width,
+ int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate.h
new file mode 100644
index 0000000000..308882242c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+LIBYUV_API
+void RotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+LIBYUV_API
+void TransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 0000000000..20432949ab
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ enum RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 0000000000..022293eef2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available for Visual C and clangcl 32 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_TRANSPOSEWX8_MMI
+#define HAS_TRANSPOSEUVWX8_MMI
+#endif
+
+void TransposeWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height);
+
+void TransposeWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_MSA(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void TransposeWx8_Any_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx16_Any_MSA(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+
+void TransposeUVWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+void TransposeUVWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/row.h b/third_party/aom/third_party/libyuv/include/libyuv/row.h
new file mode 100644
index 0000000000..a27788c1f6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/row.h
@@ -0,0 +1,4384 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h> // For malloc.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif // clang >= 3.5
+#endif // __clang__
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// clang >= 7.0.0 required for AVX512.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+// clang in xcode follows a different versioning scheme.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if (__clang_major__ >= 7) && !defined(__APPLE__)
+#define CLANG_HAS_AVX512 1
+#endif // clang >= 7
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB24ROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB24ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTORGB24ROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB24TOYJROW_SSSE3
+#define HAS_RAWTOYJROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_BLENDPLANEROW_SSSE3
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// The following functions fail on gcc/clang 32 bit with fpic and framepointer.
+// caveat: clangcl uses row_win.cc which works.
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#endif
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_RGB24TOYJROW_AVX2
+#define HAS_RAWTOYJROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB24ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB24ROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+ defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
+#endif
+
+// The following are available for AVX2 Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are also available on x64 Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
+ (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+// Only SSE2/SSSE3 kernels are listed here: this block is not conditional on
+// the compiler's AVX2 capability macros.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
+#define HAS_I210TOAR30ROW_SSSE3
+#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
+#define HAS_RGBATOYJROW_SSSE3
+#define HAS_SPLITRGBROW_SSSE3
+#define HAS_SWAPUVROW_SSSE3
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+// Every *_AVX2 flag for gcc/clang belongs in this block so that it is only
+// defined when CLANG_HAS_AVX2 or GCC_HAS_AVX2 is set.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#define HAS_ABGRTOYROW_AVX2
+#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTORAWROW_AVX2
+#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
+#define HAS_I210TOAR30ROW_AVX2
+#define HAS_I210TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOAR30ROW_AVX2
+#define HAS_I422TOUYVYROW_AVX2
+#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
+#define HAS_RGBATOYJROW_AVX2
+#define HAS_SWAPUVROW_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
+#endif
+
+// The following are available for AVX512 clang x86 platforms:
+// TODO(fbarchard): Port to GCC and Visual C
+// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// NOTE(review): the flag is defined below whenever CLANG_HAS_AVX512 is set, so
+// the "re-enable" TODO above looks stale -- confirm against libyuv:789.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+ (defined(CLANG_HAS_AVX512))
+#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I422ALPHATOARGBROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_MIRRORSPLITUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB24ROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYJROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYJROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITRGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_SWAPUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24MIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
+#endif
+
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
+#endif
+// The following are available when the compiler targets MIPS MSA
+// (__mips_msa is defined) and LIBYUV_DISABLE_MSA is not set:
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBBLENDROW_MSA
+#define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBQUANTIZEROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_HALFFLOATROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_I422TOARGB1555ROW_MSA
+#define HAS_I422TORGB565ROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_MERGEUVROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORUVROW_MSA
+#define HAS_MIRRORSPLITUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_SETROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_SOBELYROW_MSA
+#define HAS_SPLITUVROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
+#endif
+
+// The following are available when the compiler targets Loongson 3A
+// (_MIPS_ARCH_LOONGSON3A is defined) and LIBYUV_DISABLE_MMI is not set:
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_ABGRTOUVROW_MMI
+#define HAS_ABGRTOYROW_MMI
+#define HAS_ARGB1555TOARGBROW_MMI
+#define HAS_ARGB1555TOUVROW_MMI
+#define HAS_ARGB1555TOYROW_MMI
+#define HAS_ARGB4444TOARGBROW_MMI
+#define HAS_ARGB4444TOUVROW_MMI
+#define HAS_ARGB4444TOYROW_MMI
+#define HAS_ARGBADDROW_MMI
+#define HAS_ARGBATTENUATEROW_MMI
+#define HAS_ARGBBLENDROW_MMI
+#define HAS_ARGBCOLORMATRIXROW_MMI
+#define HAS_ARGBCOPYALPHAROW_MMI
+#define HAS_ARGBCOPYYTOALPHAROW_MMI
+#define HAS_ARGBEXTRACTALPHAROW_MMI
+#define HAS_ARGBGRAYROW_MMI
+#define HAS_ARGBMIRRORROW_MMI
+#define HAS_ARGBMULTIPLYROW_MMI
+#define HAS_ARGBSEPIAROW_MMI
+#define HAS_ARGBSETROW_MMI
+#define HAS_ARGBSHADEROW_MMI
+#define HAS_ARGBSHUFFLEROW_MMI
+#define HAS_ARGBSUBTRACTROW_MMI
+#define HAS_ARGBTOARGB1555ROW_MMI
+#define HAS_ARGBTOARGB4444ROW_MMI
+#define HAS_ARGBTORAWROW_MMI
+#define HAS_ARGBTORGB24ROW_MMI
+#define HAS_ARGBTORGB565DITHERROW_MMI
+#define HAS_ARGBTORGB565ROW_MMI
+#define HAS_ARGBTOUV444ROW_MMI
+#define HAS_ARGBTOUVJROW_MMI
+#define HAS_ARGBTOUVROW_MMI
+#define HAS_ARGBTOYJROW_MMI
+#define HAS_ARGBTOYROW_MMI
+#define HAS_BGRATOUVROW_MMI
+#define HAS_BGRATOYROW_MMI
+#define HAS_BLENDPLANEROW_MMI
+#define HAS_COMPUTECUMULATIVESUMROW_MMI
+#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
+#define HAS_HALFFLOATROW_MMI
+#define HAS_I400TOARGBROW_MMI
+#define HAS_I422TOUYVYROW_MMI
+#define HAS_I422TOYUY2ROW_MMI
+#define HAS_I422TOARGBROW_MMI
+#define HAS_I444TOARGBROW_MMI
+#define HAS_INTERPOLATEROW_MMI
+#define HAS_J400TOARGBROW_MMI
+#define HAS_MERGERGBROW_MMI
+#define HAS_MERGEUVROW_MMI
+#define HAS_MIRRORROW_MMI
+#define HAS_MIRRORSPLITUVROW_MMI
+#define HAS_RAWTOARGBROW_MMI
+#define HAS_RAWTORGB24ROW_MMI
+#define HAS_RAWTOUVROW_MMI
+#define HAS_RAWTOYROW_MMI
+#define HAS_RGB24TOARGBROW_MMI
+#define HAS_RGB24TOUVROW_MMI
+#define HAS_RGB24TOYROW_MMI
+#define HAS_RGB565TOARGBROW_MMI
+#define HAS_RGB565TOUVROW_MMI
+#define HAS_RGB565TOYROW_MMI
+#define HAS_RGBATOUVROW_MMI
+#define HAS_RGBATOYROW_MMI
+#define HAS_SOBELROW_MMI
+#define HAS_SOBELTOPLANEROW_MMI
+#define HAS_SOBELXROW_MMI
+#define HAS_SOBELXYROW_MMI
+#define HAS_SOBELYROW_MMI
+#define HAS_SPLITRGBROW_MMI
+#define HAS_SPLITUVROW_MMI
+#define HAS_UYVYTOUVROW_MMI
+#define HAS_UYVYTOYROW_MMI
+#define HAS_YUY2TOUV422ROW_MMI
+#define HAS_YUY2TOUVROW_MMI
+#define HAS_YUY2TOYROW_MMI
+#define HAS_I210TOARGBROW_MMI
+#define HAS_I422TOARGB4444ROW_MMI
+#define HAS_I422TOARGB1555ROW_MMI
+#define HAS_I422TORGB565ROW_MMI
+#define HAS_NV21TORGB24ROW_MMI
+#define HAS_NV12TORGB24ROW_MMI
+#define HAS_I422ALPHATOARGBROW_MMI
+#define HAS_I422TORGB24ROW_MMI
+#define HAS_NV12TOARGBROW_MMI
+#define HAS_NV21TOARGBROW_MMI
+#define HAS_NV12TORGB565ROW_MMI
+#define HAS_YUY2TOARGBROW_MMI
+#define HAS_UYVYTOARGBROW_MMI
+#define HAS_I422TORGBAROW_MMI
+#endif
+
+// SIMD_ALIGNED(var) and the fixed-size vector typedefs.
+// Three compiler-specific variants are provided:
+//  1. Visual C (not clang, not CLR): __declspec(align(N)) arrays.
+//  2. GCC/clang (except PNaCl): real vector types via vector_size.
+//  3. Anything else: plain arrays with no alignment request.
+// In every variant vec*/uvec*/vecf32 hold 16 bytes and lvec*/ulvec* hold
+// 32 bytes.
+#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
+// Declarations wrapped in SIMD_ALIGNED get 32-byte alignment when AVX2 is
+// available to the compiler, 16-byte alignment otherwise.
+#if defined(VISUALC_HAS_AVX2)
+#define SIMD_ALIGNED(var) __declspec(align(32)) var
+#else
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#endif
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
+#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+// Same 32/16-byte choice as above, expressed as a trailing attribute.
+#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
+#else
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+// Unlike the other two variants these are vector types, not arrays.
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
+#else
+// Fallback: SIMD_ALIGNED is a no-op and the typedefs are ordinary arrays.
+#define SIMD_ALIGNED(var) var
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef float vecf32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
+#endif
+
+// struct YuvConstants has a different layout per target architecture; exactly
+// one of the three definitions below is compiled.
+#if defined(__aarch64__)
+// This struct is for Arm64 color conversion.
+struct YuvConstants {
+ uvec16 kUVToRB;
+ uvec16 kUVToRB2;
+ uvec16 kUVToG;
+ uvec16 kUVToG2;
+ vec16 kUVBiasBGR;
+ vec32 kYToRgb;
+};
+#elif defined(__arm__)
+// This struct is for ArmV7 color conversion.
+struct YuvConstants {
+ uvec8 kUVToRB;
+ uvec8 kUVToG;
+ vec16 kUVBiasBGR;
+ vec32 kYToRgb;
+};
+#else
+// This struct is for Intel color conversion.
+// NOTE(review): as the #else branch it is also what every other non-Arm
+// target gets, not only Intel.
+// Each member occupies 32 bytes (32 x int8_t or 16 x int16_t).
+struct YuvConstants {
+ int8_t kUVToB[32];
+ int8_t kUVToG[32];
+ int8_t kUVToR[32];
+ int16_t kUVBiasB[16];
+ int16_t kUVBiasG[16];
+ int16_t kUVBiasR[16];
+ int16_t kYToRgb[16];
+ int16_t kYBiasToRgb[16];
+};
+
+// Offsets into YuvConstants structure
+// These are the byte offsets of the members above, in declaration order and
+// 32 bytes apart; they must be updated together with the struct layout.
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
+#define KUVBIASB 96
+#define KUVBIASG 128
+#define KUVBIASR 160
+#define KYTORGB 192
+#define KYBIASTORGB 224
+
+#endif
+
+// IS_ALIGNED(p, a): nonzero when p, converted to uintptr_t, has none of the
+// bits of (a)-1 set, i.e. p is a multiple of a. Only meaningful when a is a
+// power of two, because (a)-1 is used as a bit mask.
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+// align_buffer_64(var, size): declares two locals.
+//   var##_mem - the raw malloc() result, sized (size) + 63 bytes so that a
+//               64-byte boundary always falls inside the allocation.
+//   var       - var##_mem rounded up to the next multiple of 64.
+// The expansion is two declarations, so it can only be used where
+// declarations are allowed. The malloc() result is not checked here; callers
+// must test var (or var##_mem) themselves before use.
+#define align_buffer_64(var, size) \
+ uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+// free_aligned_buffer_64(var): frees the raw allocation made by
+// align_buffer_64 (var##_mem, not var) and resets var to 0.
+// The expansion is two statements without enclosing braces: under an
+// unbraced if/else/loop only the free() would be conditional.
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0
+
+// OMITFP: function attribute that requests the "omit-frame-pointer"
+// optimization for the function it annotates. It expands to nothing when
+// building for Apple, for x86_64, or with an LLVM-based compiler.
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+// LABELALIGN is a string holding the assembler directive ".p2align 5"
+// (align to 2^5 = 32 bytes) under Native Client, and expands to nothing on
+// every other target. Presumably pasted in front of labels inside inline-asm
+// strings -- confirm against the users of this macro.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+
+// Intel Code Analyzer markers. Insert IACA_START IACA_END around code to be
+// measured and then run with iaca -64 libyuv_unittest.
+// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// Example of iaca:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+// GCC-style definitions, used when the compiler defines __x86_64__ or
+// __i386__.
+#if defined(__x86_64__) || defined(__i386__)
+
+// String fragments to splice into an existing inline-asm block: the
+// 0x0F,0x0B byte pair, then ebx = 111, then the 0x64,0x67,0x90 marker bytes.
+// They write ebx; the enclosing asm statement has to account for that.
+#define IACA_ASM_START \
+ ".byte 0x0F, 0x0B\n" \
+ " movl $111, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n"
+
+// Mirror image of IACA_ASM_START: ebx = 222, marker bytes, then 0x0F,0x0B.
+#define IACA_ASM_END \
+ " movl $222, %%ebx\n" \
+ ".byte 0x64, 0x67, 0x90\n" \
+ ".byte 0x0F, 0x0B\n"
+
+// Stand-alone statement form: loads MARK_ID into ebx and emits the marker
+// bytes. The expansion already ends in ';'.
+// NOTE(review): ebx is written but only "memory" is listed as a clobber --
+// confirm this is acceptable for measurement-only builds.
+#define IACA_SSC_MARK(MARK_ID) \
+ __asm__ __volatile__("\n\t movl $" #MARK_ID \
+ ", %%ebx" \
+ "\n\t .byte 0x64, 0x67, 0x90" \
+ : \
+ : \
+ : "memory");
+
+// Emits the two bytes 0x0F,0x0B as a stand-alone statement.
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+// Everything that defines neither __x86_64__ nor __i386__ lands here; the
+// definitions use Visual C "__asm" syntax. IACA_ASM_START/IACA_ASM_END are
+// not defined in this branch.
+#else /* Visual C */
+#define IACA_UD_BYTES \
+ { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+ { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+// Alternative markers built on __writegsbyte; only defined in this branch.
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
+#endif
+
+// IACA_START / IACA_END bracket the region to be analyzed. Each expands to a
+// braced block: START emits the 0x0F,0x0B bytes followed by mark 111, END
+// emits mark 222 followed by the 0x0F,0x0B bytes.
+#define IACA_START \
+ { \
+ IACA_UD_BYTES \
+ IACA_SSC_MARK(111) \
+ }
+#define IACA_END \
+ { \
+ IACA_SSC_MARK(222) \
+ IACA_UD_BYTES \
+ }
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+// NOTE(review): redundant redeclaration -- an identical I422ToARGBRow_NEON
+// prototype already appears just before I422AlphaToARGBRow_NEON above.
+// Harmless to the compiler, but one of the two can be dropped.
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_MMI(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_MMI(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width);
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+
+void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+// ARGB -> U/V row converters, `_C` variants. Each takes the source row
+// pointer and its stride, separate U and V destination pointers, and a width.
+// Each prototype is declared exactly once.
+void ARGBToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_C(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);  // NOTE(review): no MirrorRow_SSE2 prototype is declared alongside this one (only AVX2/SSSE3/NEON/MSA/MMI/C) - confirm a definition still exists
+void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void SplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+// MergeUVRow family: two source byte rows (src_u, src_v), one destination
+// row (dst_uv) and a width. Presumably interleaves U and V into dst_uv -
+// confirm against the definitions. All variants share one parameter naming.
+void MergeUVRow_C(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_MSA(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_MSA(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+
+
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width);
+
+void SplitRGBRow_C(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+
+// MergeRGBRow family: three source byte rows (src_r, src_g, src_b), one
+// destination row (dst_rgb) and a width. Presumably packs the three planes
+// into dst_rgb - confirm against the definitions.
+void MergeRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+// Parameter names match the _Any_NEON/_Any_MMI prototypes below.
+void MergeRGBRow_Any_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_Any_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_Any_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+
+void MergeUVRow_16_C(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale, /* 64 for 10 bit */
+ int width);
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale,
+ int width);
+
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+
+void Convert8To16Row_C(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+
+void Convert16To8Row_C(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ int width);
+void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ int width);
+
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+// SetRow family: destination row pointer, an 8-bit value (v8) and a width.
+// Presumably fills the row with v8 - confirm against the definitions.
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+// The value is a uint8_t, so it is named v8 here as in the prototypes above.
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v8, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v8, int width);
+
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+// AR30 source-row converters, `_C` variants (source pointer, destination
+// pointer, width). ARGBToAR30Row_C is declared with the other ARGBTo*Row_C
+// prototypes further down in this header, next to ABGRToAR30Row_C.
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width);
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width);
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width);
+
+void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width);
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width);
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width);
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width);
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+// x86 SIMD row functions for I444 -> ARGB.
+// Keep exactly one prototype per function: a repeated identical declaration
+// is legal C but redundant and is reported by -Wredundant-decls.
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+// NV21 prototypes name the interleaved chroma pointer vu_buf, matching the
+// non-Any NV21 declarations (vu_buf / src_vu); NV12 prototypes keep uv_buf.
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+// Unattenuated planar alpha blend.
+// Parameters, in order: src0, src1, alpha, dst, width. The _Any_ prototypes
+// have the same parameter types and therefore use the same names.
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+// The _Any_ prototypes have the same parameter types as the per-ISA ones and
+// use the same names (src_argb0, src_argb1, dst_argb).
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+// ARGB add images.
+// The _Any_ prototypes have the same parameter types as the per-ISA ones and
+// use the same names (src_argb0, src_argb1, dst_argb).
+void ARGBAddRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+// The _Any_ prototypes have the same parameter types as the per-ISA ones and
+// use the same names (src_argb0, src_argb1, dst_argb).
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+// The third argument is named dither4, as in the non-Any
+// ARGBToRGB565DitherRow_SSE2/AVX2 prototypes.
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+// The third argument is named dither4, as in ARGBToRGB565DitherRow_NEON.
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+// The third argument is named dither4, as in ARGBToRGB565DitherRow_MSA.
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+// The third argument is named dither4, as in ARGBToRGB565DitherRow_MMI.
+void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t dither4,
+ int width);
+
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+// NV21 prototypes name the interleaved chroma pointer vu_buf, matching the
+// non-Any NV21 declarations (vu_buf / src_vu); the NV12 prototype keeps uv_buf.
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I444ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+
+void I422ToYUY2Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width);
+void I422ToUYVYRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width);
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+
+void RGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
+
+void ARGBShadeRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
+ int width,
+ int area,
+ uint8_t* dst,
+ int count);
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+ const int32_t* bl,
+ int w,
+ int area,
+ uint8_t* dst,
+ int count);
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* uv_dudv,
+ int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* src_dudv,
+ int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction);
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction);
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction);
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelXRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelYRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelYRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelXYRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width);
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width);
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,
+ int width);
+
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_Any_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloatRow_MSA(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
+ float* dst_ptr,
+ float param,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff);
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width);
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+
+void I210ToARGBRow_MMI(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_MMI(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale.h b/third_party/aom/third_party/libyuv/include/libyuv/scale.h
new file mode 100644
index 0000000000..add5a9eb62
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales an NV12 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// kFilterBox is not supported for the UV channel and will be treated as
+// bilinear.
+// Returns 0 if successful.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int dst_stride_y,
+ int dst_stride_u,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 0000000000..7641f18e34
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8_t* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8_t* src_argb,
+ int src_stride_argb,
+ int src_width,
+ int src_height,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering);
+
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint32_t src_fourcc,
+ int src_width,
+ int src_height,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ uint32_t dst_fourcc,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h b/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 0000000000..a386d49989
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,1367 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+ _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSSE3
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
+// The following are available for gcc/clang x86 platforms, but
+// require clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_SCALEUVROWDOWN2BOX_AVX2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCl but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+ defined(GCC_HAS_AVX2))
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#define HAS_SCALEUVROWDOWNEVEN_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN34_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_FIXEDDIV1_MIPS
+#define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
+#define HAS_SCALEARGBCOLS_MMI
+#define HAS_SCALEARGBCOLSUP2_MMI
+#define HAS_SCALEARGBROWDOWN2_MMI
+#define HAS_SCALEARGBROWDOWNEVEN_MMI
+#define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
+#define HAS_SCALEROWDOWN34_MMI
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_argb,
+ uint16_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int wpp,
+ enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
+#ifdef HAS_FIXEDDIV_X86  // Map FixedDiv/FixedDiv1 to the _X86 versions.
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86  // Keyed on HAS_FIXEDDIV_X86, not ..DIV1_X86.
+#elif defined HAS_FIXEDDIV_MIPS  // Otherwise map to the _MIPS versions.
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS  // Keyed on HAS_FIXEDDIV_MIPS likewise.
+#else  // Neither macro is defined: map to the _C versions.
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif  // HAS_FIXEDDIV_X86
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering,
+ int* x,
+ int* y,
+ int* dx,
+ int* dy);
+
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* d,
+ int dst_width);
+void ScaleCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int,
+ int);
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int,
+ int);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int,
+ int);
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+ const uint8_t* src_uv,
+ int dst_width,
+ int x32,
+ int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// UV Row functions
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_uv,
+ int dst_width);
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale_uv.h b/third_party/aom/third_party/libyuv/include/libyuv/scale_uv.h
new file mode 100644
index 0000000000..1b6327aaed
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale_uv.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_UV_H_
+#define INCLUDE_LIBYUV_SCALE_UV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_UV_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/version.h b/third_party/aom/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 0000000000..efaac73e3a
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1768
+
+#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/video_common.h b/third_party/aom/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 0000000000..b9823d71d0
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code (a = lowest byte, d = highest).
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) \
+ ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+ (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \
+ (static_cast<uint32_t>(d) << 24)) /* NOLINT */
+#else
+#define FOURCC(a, b, c, d) \
+ (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \
+ ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+ // 10 Primary YUV formats: 6 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+ FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
+ FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
+
+ // 1 Secondary YUV format: row biplanar. deprecated.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+
+ // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
+ FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 18 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 =
+ FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J422 =
+ FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J444 =
+ FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_J400 =
+ FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc
+ FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc
+ FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc
+ FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc
+ FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420
+ FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420
+ FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
+ FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
+
+ // 17 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+ FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO.
+ FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
+ FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
+
+ // deprecated formats. Not supported, but defined for backward compatibility.
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+ // Match any fourcc.
+ FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+ // Bits per pixel of the canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12, // deprecated
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_AR30 = 32,
+ FOURCC_BPP_AB30 = 32,
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_J420 = 12,
+ FOURCC_BPP_J400 = 8,
+ FOURCC_BPP_H420 = 12,
+ FOURCC_BPP_H422 = 16,
+ FOURCC_BPP_H010 = 24,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1, // NOTE(review): MJPG is 0 (unknown) but JPEG is 1.
+ FOURCC_BPP_DMB1 = 1, // NOTE(review): same mismatch with MJPG; confirm.
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+ FOURCC_BPP_CM32 = 32,
+ FOURCC_BPP_CM24 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_
diff --git a/third_party/aom/third_party/libyuv/source/compare.cc b/third_party/aom/third_party/libyuv/source/compare.cc
new file mode 100644
index 0000000000..e93aba1b53
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare.cc
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Hashes count bytes starting at src. A hash seed of 5381 is recommended.
+LIBYUV_API
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
+  const int kChunkBytes = 1 << 15;  // 32768 bytes per row-function call.
+  int tail;
+  uint32_t (*hash_row)(const uint8_t* src, int count, uint32_t seed) =
+      HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    hash_row = HashDjb2_SSE41;
+  }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    hash_row = HashDjb2_AVX2;
+  }
+#endif
+
+  // Whole chunks first, so the int-sized row functions never see more.
+  for (; count >= (uint64_t)kChunkBytes; count -= kChunkBytes) {
+    seed = hash_row(src, kChunkBytes, seed);
+    src += kChunkBytes;
+  }
+  tail = (int)count & ~15;  // Multiple-of-16 part of what is left.
+  if (tail != 0) {
+    seed = hash_row(src, tail, seed);
+    src += tail;
+    count -= tail;
+  }
+  tail = (int)count & 15;  // Final 0 to 15 bytes always use the C version.
+  if (tail != 0) {
+    seed = HashDjb2_C(src, tail, seed);
+  }
+  return seed;
+}
+
+// Checks bytes 0 and 3 of every 4-byte pixel in one row for a value of 255.
+static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
+  const uint8_t* px = argb;
+  int col;
+  int ofs;
+  // Pixels are examined in pairs (8 bytes per iteration).
+  for (col = 0; col < width - 1; col += 2) {
+    for (ofs = 0; ofs < 8; ofs += 4) {
+      if (px[ofs] != 255) {  // First byte is not Alpha of 255, so not ARGB.
+        return FOURCC_BGRA;
+      }
+      if (px[ofs + 3] != 255) {  // Fourth byte is not 255, so not BGRA.
+        return FOURCC_ARGB;
+      }
+    }
+    px += 8;
+  }
+  if (width & 1) {  // Trailing pixel of an odd width.
+    if (px[0] != 255) {
+      return FOURCC_BGRA;
+    }
+    if (px[3] != 255) {
+      return FOURCC_ARGB;
+    }
+  }
+  return 0;  // Every inspected byte was 255, so the row is inconclusive.
+}
+
+// Scans an opaque ARGB image row by row and reports the fourcc implied by
+// the alpha offset: FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height) {
+  uint32_t detected = 0;
+  int row = 0;
+  // Rows that are contiguous in memory are scanned as one long row.
+  if (stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    stride_argb = 0;
+  }
+  while (row < height && detected == 0) {
+    detected = ARGBDetectRow_C(argb, width);
+    argb += stride_argb;
+    ++row;
+  }
+  return detected;
+}
+
+// Counts the bits that differ between src_a and src_b over count bytes.
+// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
+// So actual maximum is 1 less loop, which is 65536 - 32 bytes.
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ const int kBlockSize = 1 << 15; // 32768 bytes per HammingDistance call.
+ const int kSimdSize = 64;
+ // SIMD for multiple of 64, and C for remainder
+ int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
+ uint64_t diff = 0;
+ int i;
+ uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
+ int count) = HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ HammingDistance = HammingDistance_NEON;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ HammingDistance = HammingDistance_SSSE3;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+ if (TestCpuFlag(kCpuHasSSE42)) {
+ HammingDistance = HammingDistance_SSE42;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HammingDistance = HammingDistance_AVX2;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ HammingDistance = HammingDistance_MMI;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ HammingDistance = HammingDistance_MSA;
+ }
+#endif
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);  // Skip the whole blocks summed above.
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ diff += HammingDistance(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & (kSimdSize - 1);  // Last 0 to 63 bytes.
+ if (remainder) {
+ diff += HammingDistance_C(src_a, src_b, remainder);
+ }
+ return diff;
+}
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ // Sums the squared byte differences of src_a and src_b over count bytes.
+ // Each squared difference is at most 255 * 255, so a block of 65536 of
+ // them fits in a uint32_t; blocks are then accumulated into a uint64_t.
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;  // Multiple-of-32 tail.
+ uint64_t sse = 0;
+ int i;
+ uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+ int count) = SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ // Note only used for multiples of 16 so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SumSquareError = SumSquareError_MMI;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SumSquareError = SumSquareError_MSA;
+ }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : sse)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);  // Skip the whole blocks summed above.
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;  // Last 0 to 31 bytes use the C version.
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
+
+// Sum of squared differences between two planes of width x height bytes.
+LIBYUV_API
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height) {
+  uint64_t total = 0;
+  // Both planes are contiguous, so treat them as a single long row.
+  if (stride_a == width && stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (; height > 0; --height) {
+    total += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return total;
+}
+
+// Converts a sum of squared errors over count samples to a clamped PSNR.
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
+  double inv_mse;
+  double result;
+  if (sse == 0) {
+    return kMaxPsnr;  // Limit to prevent divide by 0
+  }
+  // count / sse is the reciprocal of the mean squared error.
+  inv_mse = (double)count / (double)sse;
+  result = 10.0 * log10(255.0 * 255.0 * inv_mse);
+  if (result > kMaxPsnr) {
+    return kMaxPsnr;
+  }
+  return result;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
+  const uint64_t err_sum = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+                                                      stride_b, width, height);
+  // One sample per byte of the width x height plane.
+  return SumSquareErrorToPsnr(err_sum, (uint64_t)width * (uint64_t)height);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  // U and V planes are half size in each dimension, rounded up.
+  const int chroma_w = (width + 1) >> 1;
+  const int chroma_h = (height + 1) >> 1;
+  const uint64_t luma_count = (uint64_t)width * (uint64_t)height;
+  const uint64_t chroma_count = (uint64_t)chroma_w * (uint64_t)chroma_h;
+  uint64_t err = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
+                                            stride_y_b, width, height);
+  err += ComputeSumSquareErrorPlane(src_u_a, stride_u_a, src_u_b, stride_u_b,
+                                    chroma_w, chroma_h);
+  err += ComputeSumSquareErrorPlane(src_v_a, stride_v_a, src_v_b, stride_v_b,
+                                    chroma_w, chroma_h);
+  return SumSquareErrorToPsnr(err, luma_count + 2 * chroma_count);
+}
+
+static const int64_t cc1 = 26634; // 64^2 * (.01 * 255)^2
+static const int64_t cc2 = 239708; // 64^2 * (.03 * 255)^2
+// Sums one 8x8 window; returns ssim_n / ssim_d (DBL_MAX if ssim_d is 0).
+static double Ssim8x8_C(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b) {
+ int64_t sum_a = 0;
+ int64_t sum_b = 0;
+ int64_t sum_sq_a = 0;
+ int64_t sum_sq_b = 0;
+ int64_t sum_axb = 0;
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ {
+ const int64_t count = 64;
+ // scale the constants by number of pixels
+ const int64_t c1 = (cc1 * count * count) >> 12;
+ const int64_t c2 = (cc2 * count * count) >> 12;
+
+ const int64_t sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64_t sum_a_sq = sum_a * sum_a;
+ const int64_t sum_b_sq = sum_b * sum_b;
+
+ const int64_t ssim_d =
+ (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0) {
+ return DBL_MAX;
+ }
+ return ssim_n * 1.0 / ssim_d;
+ }
+}
+
+// SSIM is averaged over 8x8 windows whose starting locations lie on a 4x4
+// pixel grid. Such arrangement allows the windows to overlap block
+// boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
+  double (*window_ssim)(const uint8_t* src_a, int stride_a,
+                        const uint8_t* src_b, int stride_b) = Ssim8x8_C;
+  const uint8_t* row_a = src_a;
+  const uint8_t* row_b = src_b;
+  double sum = 0;
+  int windows = 0;
+  int y;
+  int x;
+
+  for (y = 0; y < height - 8; y += 4) {
+    for (x = 0; x < width - 8; x += 4) {
+      sum += window_ssim(row_a + x, stride_a, row_b + x, stride_b);
+      ++windows;
+    }
+    row_a += stride_a * 4;
+    row_b += stride_b * 4;
+  }
+  // NOTE(review): windows stays 0 when width or height is 8 or less, making
+  // this 0.0 / 0; confirm callers never pass planes that small.
+  return sum / windows;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const int chroma_w = (width + 1) >> 1;  // Half size, rounded up.
+  const int chroma_h = (height + 1) >> 1;
+  const double y_score =
+      CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
+  const double u_score = CalcFrameSsim(src_u_a, stride_u_a, src_u_b,
+                                       stride_u_b, chroma_w, chroma_h);
+  const double v_score = CalcFrameSsim(src_v_a, stride_v_a, src_v_b,
+                                       stride_v_b, chroma_w, chroma_h);
+  return y_score * 0.8 + 0.1 * (u_score + v_score);  // Y 0.8, U and V 0.1.
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_common.cc b/third_party/aom/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 0000000000..d4b170ad98
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if ORIGINAL_OPT
+// Reference bit-by-bit Hamming distance, compiled only when ORIGINAL_OPT is
+// set. For each byte pair it XORs the two bytes and tallies how many of the
+// eight bit positions are set in the result.
+//   src_a, src_b: buffers to compare; count bytes are read from each.
+//   count: number of bytes to compare; a value <= 0 reads nothing and
+//          yields 0.
+// Returns the total number of differing bits.
+uint32_t HammingDistance_C1(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t total = 0u;
+  int idx = 0;
+
+  while (idx < count) {
+    const int mismatch = src_a[idx] ^ src_b[idx];
+    int mask;
+    // Walk the bit positions from least to most significant.
+    for (mask = 1; mask <= 128; mask <<= 1) {
+      if (mismatch & mask) {
+        ++total;
+      }
+    }
+    ++idx;
+  }
+
+  return total;
+}
+#endif
+
+// Hakmem popcount of src_a ^ src_b over count bytes; safe for any alignment.
+uint32_t HammingDistance_C(const uint8_t* src_a,
+                           const uint8_t* src_b,
+                           int count) {
+  uint32_t diff = 0u;
+  int i;
+
+  // 4 bytes per step. The XOR word is assembled bytewise instead of through
+  // a uint32_t* cast, which was undefined for unaligned buffers (and broke
+  // strict aliasing). Bit counts do not depend on byte order.
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = (uint32_t)(src_a[i] ^ src_b[i]) |
+                 ((uint32_t)(src_a[i + 1] ^ src_b[i + 1]) << 8) |
+                 ((uint32_t)(src_a[i + 2] ^ src_b[i + 2]) << 16) |
+                 ((uint32_t)(src_a[i + 3] ^ src_b[i + 3]) << 24);
+    uint32_t u = x - ((x >> 1) & 0x55555555);
+    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+  }
+  for (; i < count; ++i) {  // Remaining 0..3 tail bytes, one at a time.
+    uint32_t x = src_a[i] ^ src_b[i];
+    uint32_t u = x - ((x >> 1) & 0x55);
+    u = ((u >> 2) & 0x33) + (u & 0x33);
+    diff += (u + (u >> 4)) & 0x0f;
+  }
+  return diff;
+}
+
+uint32_t SumSquareError_C(const uint8_t* src_a,
+                          const uint8_t* src_b,
+                          int count) {
+  uint32_t total = 0u;  // sum of squared byte differences (uint32_t wraps)
+  int n = 0;
+  while (n < count) {
+    const int delta = src_a[n] - src_b[n];  // signed, in [-255, 255]
+    total += (uint32_t)(delta * delta);
+    ++n;
+  }
+  return total;
+}
+
+// djb2 hash of count bytes: hash = hash * 33 + byte. Seed 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+// A count <= 0 reads nothing and returns the seed unchanged.
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
+  int remaining;
+  for (remaining = count; remaining > 0; --remaining) {
+    seed = seed * 33u + *src++;  // equals seed += (seed << 5) + byte
+  }
+  return seed;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_gcc.cc b/third_party/aom/third_party/libyuv/source/compare_gcc.cc
new file mode 100644
index 0000000000..6700f9697e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_gcc.cc
@@ -0,0 +1,360 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,  // x86-64 popcnt variant
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 32 per loop pass
+ uint64_t diff = 0u;
+
+ asm volatile(
+ "xor %3,%3 \n"  // zero the four accumulators: %3, r8, r9, r10
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
+
+ // Process 32 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%rcx \n"  // first 16 bytes of src_a
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"  // xor with src_b: set bits mark differences
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"  // second 16 bytes, same pattern
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"  // advance both pointers by 32 bytes
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"  // each 8-byte lane has its own accumulator
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+
+ "add %%r8, %3 \n"  // fold the partial sums into %3
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ :
+ : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+ return static_cast<uint32_t>(diff);  // 64-bit total truncated to 32 bits
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,  // 32-bit x86 variant
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 16 per loop pass
+ uint32_t diff = 0u;  // "+r" operand: the asm accumulates straight into it
+
+ asm volatile(
+ // Process 16 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%ecx \n"  // bytes 0-7 of src_a
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"  // xor with src_b: set bits mark differences
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"  // bytes 8-15, same pattern
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"  // advance both pointers by 16 bytes
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "+r"(diff) // %3
+ :
+ : "memory", "cc", "ecx", "edx");
+
+ return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15};  // 0x0f in every byte: keeps the low nibble
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};  // popcount of 0..15
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,  // pshufb table-lookup popcount
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 32 per loop pass
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "movdqa %4,%%xmm2 \n"  // xmm2 = nibble mask
+ "movdqa %5,%%xmm3 \n"  // xmm3 = 16-entry bit-count table
+ "pxor %%xmm0,%%xmm0 \n"  // xmm0 = running total
+ "pxor %%xmm1,%%xmm1 \n"  // xmm1 = zero operand for psadbw
+ "sub %0,%1 \n"  // %1 = src_b - src_a, so (%0,%1) addresses src_b
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm4 \n"  // NOTE(review): movdqa/pxor need 16-byte aligned buffers - confirm callers
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"  // first 16 bytes of src_a ^ src_b
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"  // low nibbles
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"  // table lookup: bit count of each low nibble
+ "pand %%xmm2,%%xmm4 \n"  // high nibbles, shifted down
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"  // bit count of each high nibble
+ "paddb %%xmm7,%%xmm6 \n"  // per-byte bit counts for the first 16 bytes
+ "pxor 0x10(%0,%1),%%xmm5 \n"  // second 16 bytes, same steps
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"  // combine both halves (at most 16 per byte)
+ "psadbw %%xmm1,%%xmm6 \n"  // sum the bytes of each 8-byte half
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"  // bring the upper-half sum down
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"  // lane 0 holds the total
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+
+ return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,  // vpshufb table-lookup popcount
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 64 per loop pass
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "vbroadcastf128 %4,%%ymm2 \n"  // ymm2 = nibble mask in both 128-bit lanes
+ "vbroadcastf128 %5,%%ymm3 \n"  // ymm3 = bit-count table in both lanes
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"  // ymm0 = running total
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"  // ymm1 = zero operand for vpsadbw
+ "sub %0,%1 \n"  // %1 = src_b - src_a, so (%0,%1) addresses src_b
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqa (%0),%%ymm4 \n"  // NOTE(review): vmovdqa needs 32-byte aligned src_a - confirm callers
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"  // first 32 bytes of src_a ^ src_b
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"  // low nibbles
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"  // bit count of each low nibble
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"  // high nibbles, shifted down
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"  // bit count of each high nibble
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"  // second 32 bytes, same steps
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"  // combine both halves (at most 16 per byte)
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"  // sum the bytes of each 8-byte group
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"  // swap adjacent 64-bit sums
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"  // broadcast the upper-lane sum
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"  // lane 0 holds the total
+ "vzeroupper \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+ return diff;
+}
+#endif // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,  // sum of squared byte differences
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 16 per loop pass
+ uint32_t sse;
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"  // xmm0 = four 32-bit partial sums
+ "pxor %%xmm5,%%xmm5 \n"  // xmm5 = zero, used to widen bytes to words
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"  // unaligned loads: no alignment requirement
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"  // saturating a-b and b-a; one of them is 0
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"  // so their OR is |a - b| per byte
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"  // widen low 8 bytes to 16-bit
+ "punpckhbw %%xmm5,%%xmm2 \n"  // widen high 8 bytes to 16-bit
+ "pmaddwd %%xmm1,%%xmm1 \n"  // square, then add adjacent pairs into 32-bit
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"  // horizontal add: upper two lanes onto lower
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"  // then lane 1 onto lane 0
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+ return sse;
+}
+
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+static const uvec32 kHashMul0 = {  // multipliers for src[0-3]
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {  // multipliers for src[4-7]
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {  // multipliers for src[8-11]
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {  // multipliers for src[12-15]
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {  // 16 bytes per pass
+ uint32_t hash;
+ asm volatile(
+ "movd %2,%%xmm0 \n"  // xmm0 lane 0 = running hash, starts at seed
+ "pxor %%xmm7,%%xmm7 \n"  // xmm7 = zero, used to widen bytes
+ "movdqa %4,%%xmm6 \n"  // xmm6 = kHash16x33
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"  // src[0-15]
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"  // hash *= 33 ^ 16
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"  // src[0-7] as 16-bit
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"  // src[0-3] as 32-bit
+ "pmulld %%xmm5,%%xmm3 \n"  // times kHashMul0
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"  // src[4-7] as 32-bit
+ "pmulld %%xmm5,%%xmm4 \n"  // times kHashMul1
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"  // src[8-15] as 16-bit
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"  // src[8-11] as 32-bit
+ "pmulld %%xmm5,%%xmm2 \n"  // times kHashMul2
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"  // src[12-15] as 32-bit
+ "pmulld %%xmm5,%%xmm1 \n"  // times kHashMul3
+ "paddd %%xmm4,%%xmm3 \n"  // add the 16 weighted bytes together
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"  // horizontal add: upper two lanes
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"  // then lane 1 onto lane 0
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"  // fold this block into the running hash
+ "sub $0x10,%1 \n"
+ "jg 1b \n"  // loop while count > 0; the body always runs at least once
+ "movd %%xmm0,%3 \n"  // lane 0 is the result
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+ return hash;
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_neon.cc b/third_party/aom/third_party/libyuv/source/compare_neon.cc
new file mode 100644
index 0000000000..afdd601216
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_neon.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+
+  asm volatile(
+      "vmov.u16 q4, #0 \n"  // accumulator
+
+      "1: \n"
+      "vld1.8 {q0, q1}, [%0]! \n"  // 32 bytes of src_a, post-increment
+      "vld1.8 {q2, q3}, [%1]! \n"  // 32 bytes of src_b, post-increment
+      "veor.32 q0, q0, q2 \n"      // set bits mark differing bits
+      "veor.32 q1, q1, q3 \n"
+      "vcnt.i8 q0, q0 \n"  // per-byte popcount
+      "vcnt.i8 q1, q1 \n"
+      "subs %2, %2, #32 \n"
+      "vadd.u8 q0, q0, q1 \n"  // 16 byte counts
+      "vpadal.u8 q4, q0 \n"  // 8 shorts
+      "bgt 1b \n"  // loop while count > 0; the body always runs at least once
+
+      "vpaddl.u16 q0, q4 \n"  // 4 ints
+      "vpadd.u32 d0, d0, d1 \n"
+      "vpadd.u32 d0, d0, d0 \n"
+      "vmov.32 %3, d0[0] \n"
+
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q4");  // asm reads both buffers
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,  // sum of squared byte differences
+ const uint8_t* src_b,
+ int count) {  // bytes to compare; consumed 16 per loop pass
+ uint32_t sse;
+ asm volatile(
+ "vmov.u8 q8, #0 \n"  // q8-q11: four independent 4x32-bit accumulators
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"  // 16 bytes of src_a, post-increment
+ "vld1.8 {q1}, [%1]! \n"  // 16 bytes of src_b, post-increment
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"  // widening subtract: 16-bit differences, low half
+ "vsubl.u8 q3, d1, d3 \n"  // high half
+ "vmlal.s16 q8, d4, d4 \n"  // accumulate squared differences as 32-bit
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"  // loop while count > 0; the body always runs at least once
+
+ "vadd.u32 q8, q8, q9 \n"  // merge the four accumulators
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"  // pairwise add to two 64-bit sums
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"  // low 32 bits of the total
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_neon64.cc b/third_party/aom/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 0000000000..70fb9b9143
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+  asm volatile(
+      "movi v4.8h, #0 \n"  // v4 = eight 16-bit accumulators
+
+      "1: \n"
+      "ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // 32 bytes of src_a
+      "ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // 32 bytes of src_b
+      "eor v0.16b, v0.16b, v2.16b \n"       // set bits mark differing bits
+      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "eor v1.16b, v1.16b, v3.16b \n"
+      "cnt v0.16b, v0.16b \n"  // per-byte popcount
+      "prfm pldl1keep, [%1, 448] \n"
+      "cnt v1.16b, v1.16b \n"
+      "subs %w2, %w2, #32 \n"
+      "add v0.16b, v0.16b, v1.16b \n"  // 16 byte counts
+      "uadalp v4.8h, v0.16b \n"        // pairwise add into the 16-bit lanes
+      "b.gt 1b \n"  // loop while count > 0; the body always runs at least once
+
+      "uaddlv s4, v4.8h \n"  // sum all lanes into one 32-bit value
+      "fmov %w3, s4 \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4");  // asm reads both buffers
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,  // sum of squared byte differences
+                             const uint8_t* src_b,
+                             int count) {  // consumed 16 bytes per loop pass
+  uint32_t sse;
+  asm volatile(
+      "eor v16.16b, v16.16b, v16.16b \n"  // v16-v19: four 4x32-bit accumulators
+      "eor v18.16b, v18.16b, v18.16b \n"
+      "eor v17.16b, v17.16b, v17.16b \n"
+      "eor v19.16b, v19.16b, v19.16b \n"
+
+      "1: \n"
+      "ld1 {v0.16b}, [%0], #16 \n"  // 16 bytes of src_a
+      "ld1 {v1.16b}, [%1], #16 \n"  // 16 bytes of src_b
+      "subs %w2, %w2, #16 \n"
+      "usubl v2.8h, v0.8b, v1.8b \n"     // 16-bit differences, low half
+      "usubl2 v3.8h, v0.16b, v1.16b \n"  // 16-bit differences, high half
+      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "smlal v16.4s, v2.4h, v2.4h \n"  // accumulate squared differences
+      "smlal v17.4s, v3.4h, v3.4h \n"
+      "prfm pldl1keep, [%1, 448] \n"
+      "smlal2 v18.4s, v2.8h, v2.8h \n"
+      "smlal2 v19.4s, v3.8h, v3.8h \n"
+      "b.gt 1b \n"  // loop while count > 0; the body always runs at least once
+
+      "add v16.4s, v16.4s, v17.4s \n"  // merge the four accumulators
+      "add v18.4s, v18.4s, v19.4s \n"
+      "add v19.4s, v16.4s, v18.4s \n"
+      "addv s0, v19.4s \n"  // horizontal add to a single 32-bit sum
+      "fmov %w3, s0 \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :  // "memory" below: the asm reads both buffers through %0 and %1
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+  return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_win.cc b/third_party/aom/third_party/libyuv/source/compare_win.cc
new file mode 100644
index 0000000000..d57d3d9d1c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_win.cc
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h> // For __popcnt
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  // popcnt over 32-bit words; a 1..3 byte tail (count % 4) is not counted.
+  const uint32_t* words_a = (uint32_t*)src_a;  // NOLINT
+  const uint32_t* words_b = (uint32_t*)src_b;  // NOLINT
+  uint32_t total = 0u;
+  int done = 0;
+  while (done < count - 3) {
+    total += __popcnt(*words_a++ ^ *words_b++);
+    done += 4;
+  }
+  return total;
+}
+
+__declspec(naked) uint32_t  // naked: arguments are read from the stack by hand
+ SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0  // xmm0 = four 32-bit partial sums
+ pxor xmm5, xmm5  // xmm5 = zero, used to widen bytes to words
+
+ wloop:
+ movdqu xmm1, [eax]  // 16 bytes of src_a
+ lea eax, [eax + 16]
+ movdqu xmm2, [edx]  // 16 bytes of src_b
+ lea edx, [edx + 16]
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2  // saturating a-b and b-a; one of them is 0
+ psubusb xmm2, xmm3
+ por xmm1, xmm2  // so their OR is |a - b| per byte
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5  // widen low 8 bytes to 16-bit
+ punpckhbw xmm2, xmm5  // widen high 8 bytes to 16-bit
+ pmaddwd xmm1, xmm1  // square, then add adjacent pairs into 32-bit
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ sub ecx, 16
+ jg wloop  // loop while count > 0; the body always runs at least once
+
+ pshufd xmm1, xmm0, 0xee  // horizontal add: upper two lanes onto lower
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01  // then lane 1 onto lane 0
+ paddd xmm0, xmm1
+ movd eax, xmm0  // result is left in eax
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable : 4752)
+__declspec(naked) uint32_t  // naked: arguments are read from the stack by hand
+ SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax  // edx = src_b - src_a, so [eax + edx] addresses src_b
+
+ wloop:
+ vmovdqu ymm1, [eax]  // 32 bytes of src_a
+ vmovdqu ymm2, [eax + edx]  // 32 bytes of src_b
+ lea eax, [eax + 32]
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
+ jg wloop  // loop while count > 0; the body always runs at least once
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0  // result is left in eax
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
+static const uvec32 kHashMul0 = {  // read-only, file-local multiplier tables
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
+};
+
+__declspec(naked) uint32_t  // naked: arguments are read from the stack by hand
+ HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, xmmword ptr kHash16x33
+
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ movdqa xmm5, xmmword ptr kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld xmm3, xmm5  // weight by kHashMul0
+ movdqa xmm5, xmmword ptr kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld xmm4, xmm5  // weight by kHashMul1
+ movdqa xmm5, xmmword ptr kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld xmm2, xmm5  // weight by kHashMul2
+ movdqa xmm5, xmmword ptr kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld xmm1, xmm5  // weight by kHashMul3
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01  // then lane 1 onto lane 0
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1  // fold this 16-byte block into the running hash
+ sub ecx, 16
+ jg wloop  // loop while count > 0; the body always runs at least once
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked) uint32_t  // naked: arguments are read from the stack by hand
+ HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ vmovd xmm0, [esp + 12] // seed
+
+ wloop:
+ vpmovzxbd xmm3, [eax] // src[0-3]
+ vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, [eax + 4] // src[4-7]
+ vpmulld xmm3, xmm3, xmmword ptr kHashMul0
+ vpmovzxbd xmm2, [eax + 8] // src[8-11]
+ vpmulld xmm4, xmm4, xmmword ptr kHashMul1
+ vpmovzxbd xmm1, [eax + 12] // src[12-15]
+ vpmulld xmm2, xmm2, xmmword ptr kHashMul2
+ lea eax, [eax + 16]
+ vpmulld xmm1, xmm1, xmmword ptr kHashMul3
+ vpaddd xmm3, xmm3, xmm4 // add 16 results
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm1, xmm1, xmm3
+ vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ vpaddd xmm1, xmm1,xmm2
+ vpshufd xmm2, xmm1, 0x01  // then lane 1 onto lane 0
+ vpaddd xmm1, xmm1, xmm2
+ vpaddd xmm0, xmm0, xmm1  // fold this 16-byte block into the running hash
+ sub ecx, 16
+ jg wloop  // loop while count > 0; the body always runs at least once
+
+ vmovd eax, xmm0 // return hash
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert.cc b/third_party/aom/third_party/libyuv/source/convert.cc
new file mode 100644
index 0000000000..98258b9bc9
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert.cc
@@ -0,0 +1,2514 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// (v + a) >> s applied to |v| with the sign restored; arguments and result
+// are fully parenthesized so the macro is safe inside larger expressions.
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+static __inline int Abs(int v) { return v >= 0 ? v : -v; }  // |v|
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int src_uv_width,
+                      int src_uv_height) {
+  const int out_w = Abs(src_y_width);  // destination luma size is |source|
+  const int out_h = Abs(src_y_height);
+  const int out_uv_w = SUBSAMPLE(out_w, 1, 1);  // chroma: half, rounded up
+  const int out_uv_h = SUBSAMPLE(out_h, 1, 1);
+  if (src_uv_height == 0 || src_uv_width == 0) {
+    return -1;  // an empty source chroma plane cannot be scaled
+  }
+  if (dst_y) {  // the luma plane is optional
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, out_w, out_h, kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, out_uv_w, out_uv_h, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, out_uv_w, out_uv_h, kFilterBilinear);
+  return 0;
+}
+
+// Copy I420 with optional flipping.
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// is does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  const int half_w = (width + 1) >> 1;
+  int half_h;
+  if (width <= 0 || height == 0 || !src_u || !src_v || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height flips: start at each plane's last row, negate strides.
+    height = -height;
+    half_h = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (half_h - 1) * src_stride_u;
+    src_v += (half_h - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  } else {
+    half_h = (height + 1) >> 1;
+  }
+  if (dst_y) {  // the Y plane is optional
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, half_w, half_h);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, half_w, half_h);
+  return 0;
+}
+
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  const int half_w = (width + 1) >> 1;
+  int half_h;
+  if (width <= 0 || height == 0 || !src_u || !src_v || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height flips: start at each plane's last row, negate strides.
+    height = -height;
+    half_h = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (half_h - 1) * src_stride_u;
+    src_v += (half_h - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  } else {
+    half_h = (height + 1) >> 1;
+  }
+  if (dst_y) {  // the Y plane is optional
+    CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, half_w, half_h);
+  CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, half_w, half_h);
+  return 0;
+}
+
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  const int half_w = (width + 1) >> 1;
+  int half_h;
+  if (width <= 0 || height == 0 || !src_u || !src_v || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height flips: start at each plane's last row, negate strides.
+    height = -height;
+    half_h = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (half_h - 1) * src_stride_u;
+    src_v += (half_h - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  } else {
+    half_h = (height + 1) >> 1;
+  }
+  // Y, U, V in turn, all with scale 16384 (presumably x >> 2 - confirm).
+  Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
+                    height);
+  Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, half_w,
+                    half_h);
+  Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, half_w,
+                    half_h);
+  return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  const int chroma_w = (width < 0) ? -((1 - width) >> 1) : ((width + 1) >> 1);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, chroma_w, height);
+}
+
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // Reject missing chroma planes and empty frames, as the sibling
+  // converters do, before any scratch memory is sized from width/height.
+  if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) return -1;
+  // Negative height means invert the image. 422 chroma is full height, so
+  // the U/V start rows use height, not halfheight.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  // Allocate u and v buffers: two tightly packed half-size planes.
+  align_buffer_64(plane_u, halfwidth * halfheight * 2);
+  uint8_t* plane_v = plane_u + halfwidth * halfheight;
+  I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+             height);
+  MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+               halfwidth, halfheight);
+  free_aligned_buffer_64(plane_u);
+  return 0;
+}
+
+#ifdef I422TONV21_ROW_VERSION
+// Row-based I422 to NV21 conversion, compiled only when
+// I422TONV21_ROW_VERSION is defined.
+// NOTE(review): this version was previously marked "Unittest fails for this
+// version."  Two defects are fixed here (the Y plane was copied at half
+// width, and the inverted-image chroma offset used the 420 height); re-run
+// the unit tests before enabling it.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+// Returns 0 on success, -1 on invalid arguments and 1 if the temporary row
+// buffer could not be allocated.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  int halfwidth = (width + 1) >> 1;
+  if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    // 422 chroma planes have one row per luma row, so the last chroma row is
+    // at height - 1 (not halfheight - 1).
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  // Pick the row functions. Later blocks override earlier ones, so the order
+  // of these sections matters.
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+
+  if (dst_y) {
+    // The Y plane is full resolution: copy all `width` columns.
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  {
+    // Allocate 2 rows of vu.
+    int awidth = halfwidth * 2;
+    align_buffer_64(row_vu_0, awidth * 2);
+    uint8_t* row_vu_1 = row_vu_0 + awidth;
+    // NOTE(review): assumes align_buffer_64 leaves row_vu_0 NULL when the
+    // underlying allocation fails - confirm against its definition.
+    if (!row_vu_0) {
+      return 1;
+    }
+
+    // Two 422 chroma rows are interleaved as VU into adjacent temp rows and
+    // then combined into a single 420 row.
+    for (y = 0; y < height - 1; y += 2) {
+      MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+      MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+                 halfwidth);
+      // row_vu_1 sits awidth bytes after row_vu_0, hence the stride argument.
+      // NOTE(review): fraction 128 presumably blends both rows equally -
+      // confirm against InterpolateRow.
+      InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+      src_u += src_stride_u * 2;
+      src_v += src_stride_v * 2;
+      dst_vu += dst_stride_vu;
+    }
+    // Odd height: the last chroma row has no partner and is merged directly.
+    if (height & 1) {
+      MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+    }
+    free_aligned_buffer_64(row_vu_0);
+  }
+  return 0;
+}
+#endif  // I422TONV21_ROW_VERSION
+
+// Convert I444 to I420 by delegating to I4xxToI420.
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  // The image dimensions are passed a second time as the trailing arguments.
+  // NOTE(review): these look like the source chroma dimensions, which for
+  // 444 equal the luma dimensions - confirm against I4xxToI420.
+  const int src_chroma_width = width;
+  const int src_chroma_height = height;
+  const int status = I4xxToI420(
+      src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y,
+      dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height,
+      src_chroma_width, src_chroma_height);
+  return status;
+}
+
+// Convert I444 to NV12.
+// The Y plane is copied when dst_y is non-NULL; the U and V planes are passed
+// to HalfMergeUVPlane, which writes the interleaved dst_uv plane.
+// dst_y may be NULL to skip the Y copy, matching the other converters in this
+// file that guard the copy with `if (dst_y)`.
+// Returns 0 on success, or -1 when src_y, src_u, src_v or dst_uv is NULL,
+// width <= 0 or height == 0.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  // dst_y is deliberately not rejected here; previously a NULL dst_y was
+  // refused, which made the guard around CopyPlane below unreachable.
+  if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // 444 chroma planes are full height, so all three planes flip alike.
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+                   dst_stride_uv, width, height);
+  return 0;
+}
+
+// Convert I444 to NV21.
+// Implemented by calling I444ToNV12 with the U and V source planes (and their
+// strides) exchanged, so the interleaved output is written in VU order.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  const int status =
+      I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u,
+                 dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height);
+  return status;
+}
+
+// I400 is greyscale typically used in MJPG
+// Convert I400 to I420: the Y plane is copied (when dst_y is non-NULL) and
+// both chroma planes are filled with the constant 128.
+// Returns 0 on success, or -1 when dst_u or dst_v is NULL, when dst_y is
+// requested but src_y is NULL, or when width <= 0 or height == 0.
+LIBYUV_API
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // src_y is only read when a Y destination is supplied, so it is only
+  // required in that case.
+  if ((!src_y && dst_y) || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // Recompute: the initial value was derived from the negative height.
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+  return 0;
+}
+
+// I400 is greyscale typically used in MJPG
+// Convert I400 to NV21: the Y plane is copied (when dst_y is non-NULL) and
+// the interleaved VU plane is filled with the constant 128.
+// Returns 0 on success, or -1 when dst_vu is NULL, when dst_y is requested
+// but src_y is NULL, or when width <= 0 or height == 0.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // src_y is only read when a Y destination is supplied, so it is only
+  // required in that case.
+  if ((!src_y && dst_y) || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // Recompute: the initial value was derived from the negative height.
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Each chroma sample is a V,U byte pair, so a row is halfwidth * 2 bytes.
+  SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+  return 0;
+}
+
+// Convert NV12 to I420.
+// The Y plane is copied (when dst_y is non-NULL) and the interleaved UV plane
+// is split into separate U and V planes.
+// Returns 0 on success, or -1 when src_uv, dst_u or dst_v is NULL, when dst_y
+// is requested but src_y is NULL, or when width <= 0 or height == 0.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // src_y is only read when a Y destination is supplied, so it is only
+  // required in that case.
+  if ((!src_y && dst_y) || !src_uv || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    // Recompute: the initial value was derived from the negative height.
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+  // Coalesce rows.
+  // When source and destination Y rows are contiguous, treat the plane as a
+  // single long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  // Same for chroma. halfwidth/halfheight were computed above, before width
+  // and height could be modified by the Y coalescing.
+  if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
+      dst_stride_v == halfwidth) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Split UV plane - NV12 / NV21
+  SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
+               halfwidth, halfheight);
+
+  return 0;
+}
+
+// Convert NV21 to I420.
+// Delegates to NV12ToI420 with the destination U and V planes (and their
+// strides) exchanged, since the source plane is interleaved in VU order.
+LIBYUV_API
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  const int status =
+      NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                 dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, width,
+                 height);
+  return status;
+}
+
+// Convert YUY2 to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0 (same argument checks as the ARGBToI420 family).
+LIBYUV_API
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int y;
+  void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  // All planes are written unconditionally below, so every pointer is
+  // required.
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Pick the row functions. Later blocks override earlier ones, so the order
+  // of these sections matters.
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUVRow = YUY2ToUVRow_SSE2;
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUVRow = YUY2ToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MMI;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+    src_yuy2 += src_stride_yuy2 * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (height & 1) {
+    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert UYVY to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0 (same argument checks as the ARGBToI420 family).
+LIBYUV_API
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int y;
+  void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  // All planes are written unconditionally below, so every pointer is
+  // required.
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Pick the row functions. Later blocks override earlier ones, so the order
+  // of these sections matters.
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      UYVYToYRow = UYVYToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    UYVYToUVRow = UYVYToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUVRow = UYVYToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    UYVYToYRow = UYVYToYRow_Any_MMI;
+    UYVYToUVRow = UYVYToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_MMI;
+      UYVYToUVRow = UYVYToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+    src_uyvy += src_stride_uyvy * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (height & 1) {
+    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert AYUV to NV12.
+// Source rows are consumed in pairs: each pair yields two Y rows and one
+// interleaved UV row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0 (same argument checks as the ARGBToI420 family).
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Both planes are written unconditionally below, so every pointer is
+  // required.
+  if (!src_ayuv || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// place holders for future intel code
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToUVRow = AYUVToUVRow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToUVRow = AYUVToUVRow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToUVRow = AYUVToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToUVRow = AYUVToUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (height & 1) {
+    AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert AYUV to NV21.
+// Source rows are consumed in pairs: each pair yields two Y rows and one
+// interleaved VU row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0 (same argument checks as the ARGBToI420 family).
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_vu, int width) = AYUVToVURow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Both planes are written unconditionally below, so every pointer is
+  // required.
+  if (!src_ayuv || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// place holders for future intel code
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToVURow = AYUVToVURow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToVURow = AYUVToVURow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToVURow = AYUVToVURow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToVURow = AYUVToVURow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToVURow = AYUVToVURow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_vu += dst_stride_vu;
+  }
+  // Odd height: a stride of 0 makes the VU row function use the last row as
+  // its own second row.
+  if (height & 1) {
+    AYUVToVURow(src_ayuv, 0, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                  uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C;
+  void (*ToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last row and walk
+  // backwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Row function selection. Sections that come later take precedence over
+  // earlier ones.
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToUVRow = aligned ? ARGBToUVRow_SSSE3 : ARGBToUVRow_Any_SSSE3;
+    ToYRow = aligned ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ToUVRow = aligned ? ARGBToUVRow_AVX2 : ARGBToUVRow_Any_AVX2;
+    ToYRow = aligned ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToUVRow = IS_ALIGNED(width, 16) ? ARGBToUVRow_NEON : ARGBToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_MMI : ARGBToYRow_Any_MMI;
+    ToUVRow = IS_ALIGNED(width, 16) ? ARGBToUVRow_MMI : ARGBToUVRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ToYRow = IS_ALIGNED(width, 16) ? ARGBToYRow_MSA : ARGBToYRow_Any_MSA;
+    ToUVRow = IS_ALIGNED(width, 32) ? ARGBToUVRow_MSA : ARGBToUVRow_Any_MSA;
+  }
+#endif
+
+  // Two source rows per iteration.
+  for (rows_left = height; rows_left > 1; rows_left -= 2) {
+    ToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+    ToYRow(src_argb, dst_y, width);
+    ToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (rows_left == 1) {
+    ToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ToYRow(src_argb, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert BGRA to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+                  uint8_t* dst_u, uint8_t* dst_v, int width) = BGRAToUVRow_C;
+  void (*ToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
+      BGRAToYRow_C;
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last row and walk
+  // backwards.
+  if (height < 0) {
+    height = -height;
+    src_bgra += (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Row function selection. Sections that come later take precedence over
+  // earlier ones.
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToUVRow = aligned ? BGRAToUVRow_SSSE3 : BGRAToUVRow_Any_SSSE3;
+    ToYRow = aligned ? BGRAToYRow_SSSE3 : BGRAToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToYRow = IS_ALIGNED(width, 8) ? BGRAToYRow_NEON : BGRAToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToUVRow = IS_ALIGNED(width, 16) ? BGRAToUVRow_NEON : BGRAToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ToYRow = IS_ALIGNED(width, 8) ? BGRAToYRow_MMI : BGRAToYRow_Any_MMI;
+    ToUVRow = IS_ALIGNED(width, 16) ? BGRAToUVRow_MMI : BGRAToUVRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToYRow = aligned ? BGRAToYRow_MSA : BGRAToYRow_Any_MSA;
+    ToUVRow = aligned ? BGRAToUVRow_MSA : BGRAToUVRow_Any_MSA;
+  }
+#endif
+
+  // Two source rows per iteration.
+  for (rows_left = height; rows_left > 1; rows_left -= 2) {
+    ToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+    ToYRow(src_bgra, dst_y, width);
+    ToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+    src_bgra += src_stride_bgra * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (rows_left == 1) {
+    ToUVRow(src_bgra, 0, dst_u, dst_v, width);
+    ToYRow(src_bgra, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert ABGR to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                  uint8_t* dst_u, uint8_t* dst_v, int width) = ABGRToUVRow_C;
+  void (*ToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last row and walk
+  // backwards.
+  if (height < 0) {
+    height = -height;
+    src_abgr += (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Row function selection. Sections that come later take precedence over
+  // earlier ones.
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToUVRow = aligned ? ABGRToUVRow_SSSE3 : ABGRToUVRow_Any_SSSE3;
+    ToYRow = aligned ? ABGRToYRow_SSSE3 : ABGRToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ToUVRow = aligned ? ABGRToUVRow_AVX2 : ABGRToUVRow_Any_AVX2;
+    ToYRow = aligned ? ABGRToYRow_AVX2 : ABGRToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToYRow = IS_ALIGNED(width, 8) ? ABGRToYRow_NEON : ABGRToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToUVRow = IS_ALIGNED(width, 16) ? ABGRToUVRow_NEON : ABGRToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ToYRow = IS_ALIGNED(width, 8) ? ABGRToYRow_MMI : ABGRToYRow_Any_MMI;
+    ToUVRow = IS_ALIGNED(width, 16) ? ABGRToUVRow_MMI : ABGRToUVRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToYRow = aligned ? ABGRToYRow_MSA : ABGRToYRow_Any_MSA;
+    ToUVRow = aligned ? ABGRToUVRow_MSA : ABGRToUVRow_Any_MSA;
+  }
+#endif
+
+  // Two source rows per iteration.
+  for (rows_left = height; rows_left > 1; rows_left -= 2) {
+    ToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+    ToYRow(src_abgr, dst_y, width);
+    ToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (rows_left == 1) {
+    ToUVRow(src_abgr, 0, dst_u, dst_v, width);
+    ToYRow(src_abgr, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGBA to I420.
+// Source rows are consumed in pairs: each pair yields two Y rows and one U
+// and one V row. An odd final row is converted on its own.
+// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+                  uint8_t* dst_u, uint8_t* dst_v, int width) = RGBAToUVRow_C;
+  void (*ToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
+      RGBAToYRow_C;
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last row and walk
+  // backwards.
+  if (height < 0) {
+    height = -height;
+    src_rgba += (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+  // Row function selection. Sections that come later take precedence over
+  // earlier ones.
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToUVRow = aligned ? RGBAToUVRow_SSSE3 : RGBAToUVRow_Any_SSSE3;
+    ToYRow = aligned ? RGBAToYRow_SSSE3 : RGBAToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToYRow = IS_ALIGNED(width, 8) ? RGBAToYRow_NEON : RGBAToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToUVRow = IS_ALIGNED(width, 16) ? RGBAToUVRow_NEON : RGBAToUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ToYRow = IS_ALIGNED(width, 8) ? RGBAToYRow_MMI : RGBAToYRow_Any_MMI;
+    ToUVRow = IS_ALIGNED(width, 16) ? RGBAToUVRow_MMI : RGBAToUVRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ToYRow = aligned ? RGBAToYRow_MSA : RGBAToYRow_Any_MSA;
+    ToUVRow = aligned ? RGBAToUVRow_MSA : RGBAToUVRow_Any_MSA;
+  }
+#endif
+
+  // Two source rows per iteration.
+  for (rows_left = height; rows_left > 1; rows_left -= 2) {
+    ToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    ToYRow(src_rgba, dst_y, width);
+    ToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  // Odd height: a stride of 0 makes the UV row function use the last row as
+  // its own second row.
+  if (rows_left == 1) {
+    ToUVRow(src_rgba, 0, dst_u, dst_v, width);
+    ToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
+}
+
+// Convert RGB24 to I420.
+// Two code paths are selected at compile time:
+//  - when a NEON, MSA or MMI RGB24ToYRow exists, RGB24 rows are converted
+//    directly to Y and UV;
+//  - otherwise each RGB24 row is first expanded to ARGB in a temporary
+//    buffer and the ARGB row functions produce Y and UV.
+// Source rows are consumed in pairs; an odd final row is converted alone.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or
+// height == 0, and 1 if the temporary ARGB rows could not be allocated.
+LIBYUV_API
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
+  int y;
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
+  void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+      RGB24ToYRow_C;
+#else
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+// MMI and MSA version does direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
+#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
+    RGB24ToYRow = RGB24ToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+    RGB24ToYRow = RGB24ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYRow = RGB24ToYRow_MSA;
+      RGB24ToUVRow = RGB24ToUVRow_MSA;
+    }
+  }
+#endif
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#endif
+
+  {
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+      defined(HAS_RGB24TOYROW_MMI))
+    // Allocate 2 rows of ARGB.
+    // Each row is width * 4 bytes rounded up to a multiple of 32.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+    // NOTE(review): assumes align_buffer_64 leaves `row` NULL when the
+    // underlying allocation fails - confirm against its definition.
+    if (!row) {
+      return 1;
+    }
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
+      RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+      RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+      // Expand both rows to ARGB back to back, so the temporary buffer acts
+      // as a two-row image with stride kRowSize.
+      RGB24ToARGBRow(src_rgb24, row, width);
+      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_rgb24 += src_stride_rgb24 * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    // Odd height: a stride of 0 makes the UV row function use the last row
+    // as its own second row.
+    if (height & 1) {
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+     defined(HAS_RGB24TOYROW_MMI))
+      RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+      RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+      RGB24ToARGBRow(src_rgb24, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+      defined(HAS_RGB24TOYROW_MMI))
+    free_aligned_buffer_64(row);
+#endif
+  }
+  return 0;
+}
+
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RGB24 to J420. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,  // Added to src_rgb24 to step one source row.
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,  // Applied once per pair of source rows.
+ uint8_t* dst_v,
+ int dst_stride_v,  // Applied once per pair of source rows.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;  // Row counter; advances two source rows per loop pass.
+#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last source row and
+ if (height < 0) {  // walk backwards using a negated stride.
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV (no intermediate ARGB rows).
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+ }
+// MMI and MSA version does direct RGB24 to YUV.
+#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
+#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#endif
+#else  // Other platforms do intermediate conversion from RGB24 to ARGB.
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked last, so it overrides SSSE3.
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ // Allocate 2 rows of ARGB; each row is rounded up to a multiple of 32 bytes.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // One pair of source rows per pass.
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {  // Odd height: last row has no partner, so UV stride is 0.
+#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
+ defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RAW to I420. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
+  int y;  // Row counter; advances two source rows per loop pass.
+#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+    defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
+  void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+                     uint8_t* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+      RAWToYRow_C;
+#else
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RAWToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+#endif
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: start at the last source row and
+  if (height < 0) {  // walk backwards using a negated stride.
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToUVRow = RAWToUVRow_Any_NEON;
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+// MMI and MSA version does direct RAW to YUV.
+#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
+#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    RAWToUVRow = RAWToUVRow_Any_MMI;
+    RAWToYRow = RAWToYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_MMI;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_MMI;
+      }
+    }
+  }
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToUVRow = RAWToUVRow_Any_MSA;
+    RAWToYRow = RAWToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYRow = RAWToYRow_MSA;
+      RAWToUVRow = RAWToUVRow_MSA;
+    }
+  }
+#endif
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {  // Checked last, so it overrides SSSE3.
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#endif
+
+  {  // Guards below repeat the declaration guard so every name used exists.
+#if !((defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+      defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI))
+    // Allocate 2 rows of ARGB; each row is rounded up to a multiple of 32 bytes.
+    const int kRowSize = (width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+#endif
+
+    for (y = 0; y < height - 1; y += 2) {  // One pair of source rows per pass.
+#if ((defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+     defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI))
+      RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+      RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+      src_raw += src_stride_raw * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {  // Odd height: last row has no partner, UV stride is 0.
+#if ((defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+     defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI))
+      RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+      RAWToYRow(src_raw, dst_y, width);
+#else
+      RAWToARGBRow(src_raw, row, width);
+      ARGBToUVRow(row, 0, dst_u, dst_v, width);
+      ARGBToYRow(row, dst_y, width);
+#endif
+    }
+#if !((defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
+      defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI))
+    free_aligned_buffer_64(row);
+#endif
+  }
+  return 0;
+}
+
+// Convert RGB565 to I420. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,  // Added to src_rgb565 to step one source row.
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,  // Applied once per pair of source rows.
+ uint8_t* dst_v,
+ int dst_stride_v,  // Applied once per pair of source rows.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;  // Row counter; advances two source rows per loop pass.
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last source row and
+ if (height < 0) {  // walk backwards using a negated stride.
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+// Neon version does direct RGB565 to YUV (no intermediate ARGB rows).
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+// MMI and MSA version does direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA))
+#if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
+ RGB565ToYRow = RGB565ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+ RGB565ToYRow = RGB565ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_MSA;
+ RGB565ToUVRow = RGB565ToUVRow_MSA;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSE2, so it overrides it.
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked last, so it overrides SSSE3.
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ // Allocate 2 rows of ARGB; each row is rounded up to a multiple of 32 bytes.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ for (y = 0; y < height - 1; y += 2) {  // One pair of source rows per pass.
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {  // Odd height: last row has no partner, so UV stride is 0.
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to I420. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,  // Added to src_argb1555 to step one source row.
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,  // Applied once per pair of source rows.
+ uint8_t* dst_v,
+ int dst_stride_v,  // Applied once per pair of source rows.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;  // Row counter; advances two source rows per loop pass.
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+ int width) = ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last source row and
+ if (height < 0) {  // walk backwards using a negated stride.
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+// Neon version does direct ARGB1555 to YUV (no intermediate ARGB rows).
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+// MMI and MSA version does direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA))
+#if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+ }
+ }
+#endif
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSE2, so it overrides it.
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked last, so it overrides SSSE3.
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#endif
+ {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ // Allocate 2 rows of ARGB; each row is rounded up to a multiple of 32 bytes.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // One pair of source rows per pass.
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {  // Odd height: last row has no partner, so UV stride is 0.
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to I420. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,  // Added to src_argb4444 to step one source row.
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,  // Applied once per pair of source rows.
+ uint8_t* dst_v,
+ int dst_stride_v,  // Applied once per pair of source rows.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;  // Row counter; advances two source rows per loop pass.
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+ int width) = ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last source row and
+ if (height < 0) {  // walk backwards using a negated stride.
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+// Neon version does direct ARGB4444 to YUV (no intermediate ARGB rows).
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else  // NOTE(review): declarations above are gated on the MMI Y row only.
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSE2, so it overrides it.
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so it overrides it.
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ // Allocate 2 rows of ARGB; each row is rounded up to a multiple of 32 bytes.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // One pair of source rows per pass.
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {  // Odd height: last row has no partner, so UV stride is 0.
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+
+// Convert RGB24 to J400. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,  // Added to src_rgb24 after each row.
+ uint8_t* dst_yj,
+ int dst_stride_yj,  // Added to dst_yj after each row.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height means invert the image.
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows: when src and dst rows are contiguous, do one long row.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so it overrides it.
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // Convert one row per pass.
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert RAW to J400. Returns 0 on success or -1 on invalid arguments.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,  // Added to src_raw after each row.
+ uint8_t* dst_yj,
+ int dst_stride_yj,  // Added to dst_yj after each row.
+ int width,  // Must be > 0.
+ int height) {  // Must be non-zero; negative flips the image vertically.
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height means invert the image.
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows: when src and dst rows are contiguous, do one long row.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+#if defined(HAS_RAWTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so it overrides it.
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYJRow = RAWToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToYJRow = RAWToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYJRow = RAWToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // Convert one row per pass.
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+static void SplitPixels(const uint8_t* src_u,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_u,
+                        int width) {
+  // Gather every src_pixel_stride_uv-th source byte into a packed row.
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    *dst_u++ = *src_u;
+    src_u += src_pixel_stride_uv;
+  }
+}
+
+// Convert Android420 to I420. Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
+  int y;
+  const ptrdiff_t vu_off = src_v - src_u;  // +1: U then V; -1: V then U.
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // dst_y is optional, but src_y is required whenever dst_y is written.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Copy UV planes as is - I420
+  if (src_pixel_stride_uv == 1) {
+    CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+    CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+    return 0;
+  }
+  // Split UV planes - NV21 (V byte immediately precedes U byte)
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
+                 halfwidth, halfheight);
+    return 0;
+  }
+  // Split UV planes - NV12 (U byte immediately precedes V byte)
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+    return 0;
+  }
+  // Any other layout: gather U and V one sample at a time, row by row.
+  for (y = 0; y < halfheight; ++y) {
+    SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+    SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_argb.cc b/third_party/aom/third_party/libyuv/source/convert_argb.cc
new file mode 100644
index 0000000000..5e7225faf2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_argb.cc
@@ -0,0 +1,4125 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy an ARGB image with CopyPlane, treating each row as width * 4 bytes.
+// Returns -1 when either pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height reads the source from its last row upward.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb, int src_stride_argb,
+             uint8_t* dst_argb, int dst_stride_argb,
+             int width, int height) {
+  if (width <= 0 || height == 0 || !src_argb || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+            height);
+  return 0;
+}
+
+// Convert I420 to ARGB using the caller-supplied conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. The U and V pointers advance only after odd-numbered rows.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     const uint8_t* src_u, int src_stride_u,
+                     const uint8_t* src_v, int src_stride_v,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                     const uint8_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_SSSE3 : I422ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I422ToARGBRow_AVX2 : I422ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_NEON : I422ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ConvertRow =
+        IS_ALIGNED(width, 4) ? I422ToARGBRow_MMI : I422ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_MSA : I422ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if ((row & 1) != 0) {
+      // One chroma row serves two luma rows: advance after the odd row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to ARGB: forwards every argument to I420ToARGBMatrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I420ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert I420 to ABGR: calls I420ToARGBMatrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I420ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert J420 to ARGB: forwards every argument to I420ToARGBMatrix with
+// &kYuvJPEGConstants as the conversion matrix and returns its result.
+LIBYUV_API
+int J420ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert J420 to ABGR: calls I420ToARGBMatrix with the U and V planes
+// exchanged and &kYvuJPEGConstants (the Yvu matrix), returning its result.
+LIBYUV_API
+int J420ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuJPEGConstants;  // Yvu matrix
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert H420 to ARGB: forwards every argument to I420ToARGBMatrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H420ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert H420 to ABGR: calls I420ToARGBMatrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H420ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert U420 to ARGB: forwards every argument to I420ToARGBMatrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert U420 to ABGR: calls I420ToARGBMatrix with the U and V planes
+// exchanged and &kYvu2020Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvu2020Constants;  // Yvu matrix
+  return I420ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert I422 to ARGB using the caller-supplied conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. Y, U and V pointers all advance by their stride after every row.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     const uint8_t* src_u, int src_stride_u,
+                     const uint8_t* src_v, int src_stride_v,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                     const uint8_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when the strides match these exact values the image is
+  // handed to the row function as one row of width * height pixels.
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    src_stride_u = 0;
+    src_stride_v = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_SSSE3 : I422ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I422ToARGBRow_AVX2 : I422ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_NEON : I422ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ConvertRow =
+        IS_ALIGNED(width, 4) ? I422ToARGBRow_MMI : I422ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I422ToARGBRow_MSA : I422ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to ARGB: forwards every argument to I422ToARGBMatrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I422ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert I422 to ABGR: calls I422ToARGBMatrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert J422 to ARGB: forwards every argument to I422ToARGBMatrix with
+// &kYuvJPEGConstants as the conversion matrix and returns its result.
+LIBYUV_API
+int J422ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert J422 to ABGR: calls I422ToARGBMatrix with the U and V planes
+// exchanged and &kYvuJPEGConstants (the Yvu matrix), returning its result.
+LIBYUV_API
+int J422ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuJPEGConstants;  // Yvu matrix
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert H422 to ARGB: forwards every argument to I422ToARGBMatrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H422ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert H422 to ABGR: calls I422ToARGBMatrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H422ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert U422 to ARGB: forwards every argument to I422ToARGBMatrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert U422 to ABGR: calls I422ToARGBMatrix with the U and V planes
+// exchanged and &kYvu2020Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvu2020Constants;  // Yvu matrix
+  return I422ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert I444 to ARGB using the caller-supplied conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. Y, U and V pointers all advance by their stride after every row.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     const uint8_t* src_u, int src_stride_u,
+                     const uint8_t* src_v, int src_stride_v,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                     const uint8_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows: when the strides match these exact values the image is
+  // handed to the row function as one row of width * height pixels.
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    src_stride_u = 0;
+    src_stride_v = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I444ToARGBRow_SSSE3 : I444ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I444ToARGBRow_AVX2 : I444ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I444ToARGBRow_NEON : I444ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ConvertRow =
+        IS_ALIGNED(width, 4) ? I444ToARGBRow_MMI : I444ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I444ToARGBRow_MSA : I444ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I444 to ARGB: forwards every argument to I444ToARGBMatrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert I444 to ABGR: calls I444ToARGBMatrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert J444 to ARGB: forwards every argument to I444ToARGBMatrix with
+// &kYuvJPEGConstants as the conversion matrix and returns its result.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert J444 to ABGR: calls I444ToARGBMatrix with the U and V planes
+// exchanged and &kYvuJPEGConstants (the Yvu matrix), returning its result.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuJPEGConstants;  // Yvu matrix
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert H444 to ARGB: forwards every argument to I444ToARGBMatrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert H444 to ABGR: calls I444ToARGBMatrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert U444 to ARGB: forwards every argument to I444ToARGBMatrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert U444 to ABGR: calls I444ToARGBMatrix with the U and V planes
+// exchanged and &kYvu2020Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_u, int src_stride_u,
+               const uint8_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvu2020Constants;  // Yvu matrix
+  return I444ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert I010 (16-bit sample planes) to AR30 using the caller-supplied
+// conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. The U and V pointers advance only after odd-numbered rows.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y,
+                     const uint16_t* src_u, int src_stride_u,
+                     const uint16_t* src_v, int src_stride_v,
+                     uint8_t* dst_ar30, int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                     const uint16_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_ar30) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_ar30 += (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I210ToAR30Row_SSSE3 : I210ToAR30Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I210ToAR30Row_AVX2 : I210ToAR30Row_Any_AVX2;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if ((row & 1) != 0) {
+      // One chroma row serves two luma rows: advance after the odd row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to AR30: forwards every argument to I010ToAR30Matrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert H010 to AR30: forwards every argument to I010ToAR30Matrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert U010 to AR30: forwards every argument to I010ToAR30Matrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert I010 to AB30: calls I010ToAR30Matrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ab30, int dst_stride_ab30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I010ToAR30Matrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_ab30, dst_stride_ab30, matrix, width, height);
+}
+
+// Convert H010 to AB30: calls I010ToAR30Matrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ab30, int dst_stride_ab30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I010ToAR30Matrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_ab30, dst_stride_ab30, matrix, width, height);
+}
+
+// Convert U010 to AB30.
+// Calls I010ToAR30Matrix with the U and V planes exchanged and returns its
+// result. Because the planes are exchanged, the Yvu matrix
+// (&kYvu2020Constants) is passed, matching I010ToAB30 / H010ToAB30 and
+// U010ToABGR, which pair the same plane swap with their Yvu matrices.
+// (Previously the Yuv matrix kYuv2020Constants was passed with swapped planes.)
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I210 (16-bit sample planes) to AR30 using the caller-supplied
+// conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. Y, U and V pointers all advance by their stride after every row.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y,
+                     const uint16_t* src_u, int src_stride_u,
+                     const uint16_t* src_v, int src_stride_v,
+                     uint8_t* dst_ar30, int dst_stride_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                     const uint16_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_ar30) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_ar30 += (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I210ToAR30Row_SSSE3 : I210ToAR30Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I210ToAR30Row_AVX2 : I210ToAR30Row_Any_AVX2;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I210 to AR30: forwards every argument to I210ToAR30Matrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert H210 to AR30: forwards every argument to I210ToAR30Matrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert U210 to AR30: forwards every argument to I210ToAR30Matrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ar30, int dst_stride_ar30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                          width, height);
+}
+
+// Convert I210 to AB30: calls I210ToAR30Matrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ab30, int dst_stride_ab30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I210ToAR30Matrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_ab30, dst_stride_ab30, matrix, width, height);
+}
+
+// Convert H210 to AB30: calls I210ToAR30Matrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_ab30, int dst_stride_ab30,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I210ToAR30Matrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_ab30, dst_stride_ab30, matrix, width, height);
+}
+
+// Convert U210 to AB30.
+// Calls I210ToAR30Matrix with the U and V planes exchanged and returns its
+// result. Because the planes are exchanged, the Yvu matrix
+// (&kYvu2020Constants) is passed, matching I210ToAB30 / H210ToAB30, which
+// pair the same plane swap with their Yvu matrices.
+// (Previously the Yuv matrix kYuv2020Constants was passed with swapped planes.)
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I210ToAR30Matrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvu2020Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I010 (16-bit sample planes) to ARGB using the caller-supplied
+// conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. The U and V pointers advance only after odd-numbered rows.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y, int src_stride_y,
+                     const uint16_t* src_u, int src_stride_u,
+                     const uint16_t* src_v, int src_stride_v,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                     const uint16_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I210ToARGBRow_SSSE3 : I210ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I210ToARGBRow_AVX2 : I210ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ConvertRow =
+        IS_ALIGNED(width, 4) ? I210ToARGBRow_MMI : I210ToARGBRow_Any_MMI;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if ((row & 1) != 0) {
+      // One chroma row serves two luma rows: advance after the odd row.
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to ARGB: forwards every argument to I010ToARGBMatrix with
+// &kYuvI601Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert I010 to ABGR: calls I010ToARGBMatrix with the U and V planes
+// exchanged and &kYvuI601Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuI601Constants;  // Yvu matrix
+  return I010ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert H010 to ARGB: forwards every argument to I010ToARGBMatrix with
+// &kYuvH709Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert H010 to ABGR: calls I010ToARGBMatrix with the U and V planes
+// exchanged and &kYvuH709Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvuH709Constants;  // Yvu matrix
+  return I010ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert U010 to ARGB: forwards every argument to I010ToARGBMatrix with
+// &kYuv2020Constants as the conversion matrix and returns its result.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb, matrix,
+                          width, height);
+}
+
+// Convert U010 to ABGR: calls I010ToARGBMatrix with the U and V planes
+// exchanged and &kYvu2020Constants (the Yvu matrix), returning its result.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYvu2020Constants;  // Yvu matrix
+  return I010ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_abgr, dst_stride_abgr, matrix, width, height);
+}
+
+// Convert I210 (10 bit 422, 16-bit sample planes) to ARGB using the
+// caller-supplied conversion constants.
+// Returns -1 when a plane pointer is null, width <= 0 or height == 0, and 0
+// otherwise. A negative height writes the destination from its last row
+// upward. Y, U and V pointers all advance by their stride after every row.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y, int src_stride_y,
+                     const uint16_t* src_u, int src_stride_u,
+                     const uint16_t* src_v, int src_stride_v,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter: starts as the C version; each enabled block below may
+  // replace it, so the last block whose CPU flag tests true wins.
+  void (*ConvertRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                     const uint16_t* v_buf, uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted image: begin at the final destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? I210ToARGBRow_SSSE3 : I210ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? I210ToARGBRow_AVX2 : I210ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ConvertRow =
+        IS_ALIGNED(width, 4) ? I210ToARGBRow_MMI : I210ToARGBRow_Any_MMI;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    ConvertRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// I210 to ARGB.
+// Forwards every argument to I210ToARGBMatrix with the kYuvI601Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int I210ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// I210 to ABGR.
+// Reuses the ARGB path: the U and V planes (with their strides) are passed in
+// each other's slots and the kYvuI601Constants matrix is selected.  The result
+// of I210ToARGBMatrix is returned unchanged.
+LIBYUV_API
+int I210ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const yvu_matrix = &kYvuI601Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V plane in the U slot
+                          src_u, src_stride_u,  // U plane in the V slot
+                          dst_abgr, dst_stride_abgr,
+                          yvu_matrix, width, height);
+}
+
+// H210 to ARGB.
+// Forwards every argument to I210ToARGBMatrix with the kYuvH709Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int H210ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// H210 to ABGR.
+// Reuses the ARGB path: the U and V planes (with their strides) are passed in
+// each other's slots and the kYvuH709Constants matrix is selected.  The result
+// of I210ToARGBMatrix is returned unchanged.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const yvu_matrix = &kYvuH709Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V plane in the U slot
+                          src_u, src_stride_u,  // U plane in the V slot
+                          dst_abgr, dst_stride_abgr,
+                          yvu_matrix, width, height);
+}
+
+// U210 to ARGB.
+// Forwards every argument to I210ToARGBMatrix with the kYuv2020Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuv2020Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_u, src_stride_u,
+                          src_v, src_stride_v,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// U210 to ABGR.
+// Reuses the ARGB path: the U and V planes (with their strides) are passed in
+// each other's slots and the kYvu2020Constants matrix is selected.  The result
+// of I210ToARGBMatrix is returned unchanged.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y, int src_stride_y,
+               const uint16_t* src_u, int src_stride_u,
+               const uint16_t* src_v, int src_stride_v,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const yvu_matrix = &kYvu2020Constants;
+  return I210ToARGBMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V plane in the U slot
+                          src_u, src_stride_u,  // U plane in the V slot
+                          dst_abgr, dst_stride_abgr,
+                          yvu_matrix, width, height);
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+//
+// Each output row is produced from one Y row, one alpha row and the current
+// U/V rows.  When |attenuate| is non-zero the freshly written ARGB row is then
+// attenuated in place by ARGBAttenuateRow.
+//
+// Returns -1 when any plane pointer (including the alpha plane) is null,
+// width <= 0 or height == 0; otherwise returns 0.  A negative height writes
+// the destination bottom-up, which flips the output vertically.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                          int src_stride_y,
+                          const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          const uint8_t* src_a,
+                          int src_stride_a,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          const struct YuvConstants* yuvconstants,
+                          int width,
+                          int height,
+                          int attenuate) {
+  int y;
+  // Row functions start as the C versions; each enabled block below whose CPU
+  // flag tests true overrides them, so the last match wins.
+  void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                             const uint8_t* v_buf, const uint8_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I422AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  // src_a is read unconditionally for every row, so it must be validated
+  // together with the other planes.
+  if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      // In-place attenuation of the row that was just written.
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_a += src_stride_a;
+    src_y += src_stride_y;
+    // U and V advance only after odd rows, so each chroma row is shared by
+    // two consecutive Y/alpha rows.
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420 with Alpha to ARGB.
+// Forwards every argument, including |attenuate|, to I420AlphaToARGBMatrix
+// with the kYuvI601Constants matrix and returns its result unchanged.
+LIBYUV_API
+int I420AlphaToARGB(const uint8_t* src_y, int src_stride_y,
+                    const uint8_t* src_u, int src_stride_u,
+                    const uint8_t* src_v, int src_stride_v,
+                    const uint8_t* src_a, int src_stride_a,
+                    uint8_t* dst_argb, int dst_stride_argb,
+                    int width, int height, int attenuate) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_u, src_stride_u,
+                               src_v, src_stride_v,
+                               src_a, src_stride_a,
+                               dst_argb, dst_stride_argb,
+                               matrix, width, height, attenuate);
+}
+
+// I420 with Alpha to ABGR.
+// Reuses the ARGB path: the U and V planes (with their strides) are passed in
+// each other's slots and the kYvuI601Constants matrix is selected.  The alpha
+// plane and |attenuate| are forwarded as-is, and the result of
+// I420AlphaToARGBMatrix is returned unchanged.
+LIBYUV_API
+int I420AlphaToABGR(const uint8_t* src_y, int src_stride_y,
+                    const uint8_t* src_u, int src_stride_u,
+                    const uint8_t* src_v, int src_stride_v,
+                    const uint8_t* src_a, int src_stride_a,
+                    uint8_t* dst_abgr, int dst_stride_abgr,
+                    int width, int height, int attenuate) {
+  const struct YuvConstants* const yvu_matrix = &kYvuI601Constants;
+  return I420AlphaToARGBMatrix(src_y, src_stride_y,
+                               src_v, src_stride_v,  // V plane in the U slot
+                               src_u, src_stride_u,  // U plane in the V slot
+                               src_a, src_stride_a,
+                               dst_abgr, dst_stride_abgr,
+                               yvu_matrix, width, height, attenuate);
+}
+
+// I400 (single Y plane) to ARGB, using the caller-supplied matrix.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height writes the destination bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* y_buf, uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants, int width) =
+      I400ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_y || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When both buffers are tightly packed (1 source byte and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_I400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? I400ToARGBRow_SSE2 : I400ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? I400ToARGBRow_AVX2 : I400ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? I400ToARGBRow_NEON : I400ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? I400ToARGBRow_MMI : I400ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? I400ToARGBRow_MSA : I400ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_y, dst_argb, yuvconstants, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// I400 to ARGB.
+// Forwards every argument to I400ToARGBMatrix with the kYuvI601Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y, int src_stride_y,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I400ToARGBMatrix(src_y, src_stride_y,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// J400 (single Y plane) to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int J400ToARGB(const uint8_t* src_y, int src_stride_y,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
+      J400ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_y || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // When both buffers are tightly packed (1 source byte and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? J400ToARGBRow_SSE2 : J400ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? J400ToARGBRow_AVX2 : J400ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? J400ToARGBRow_NEON : J400ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? J400ToARGBRow_MMI : J400ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? J400ToARGBRow_MSA : J400ToARGBRow_Any_MSA;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    convert_row(src_y, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+  }
+  return 0;
+}
+
+// The three 16-entry tables below are handed to ARGBShuffle by the wrapper
+// functions that follow.  Each group of four entries covers one 4-byte pixel.
+// NOTE(review): entries are presumably source byte indices for each
+// destination byte - confirm against ARGBShuffle.
+
+// Shuffle table for converting BGRA to ARGB.
+// Reverses the byte order inside every 4-byte group.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+
+// Shuffle table for converting ABGR to ARGB.
+// Swaps bytes 0 and 2 of every 4-byte group; bytes 1 and 3 stay in place.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+
+// Shuffle table for converting RGBA to ARGB.
+// Rotates every 4-byte group by one byte (indices 1, 2, 3, 0).
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+
+// BGRA to ARGB.
+// Delegates to ARGBShuffle with the kShuffleMaskBGRAToARGB table and returns
+// its result unchanged.
+LIBYUV_API
+int BGRAToARGB(const uint8_t* src_bgra, int src_stride_bgra,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8_t* const shuffler = (const uint8_t*)(&kShuffleMaskBGRAToARGB);
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ARGB to BGRA (same as BGRAToARGB).
+// Uses the same kShuffleMaskBGRAToARGB table as BGRAToARGB; the parameter
+// names keep the bgra/argb spelling of that function.  Returns the result of
+// ARGBShuffle unchanged.
+LIBYUV_API
+int ARGBToBGRA(const uint8_t* src_bgra, int src_stride_bgra,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8_t* const shuffler = (const uint8_t*)(&kShuffleMaskBGRAToARGB);
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ABGR to ARGB.
+// Delegates to ARGBShuffle with the kShuffleMaskABGRToARGB table and returns
+// its result unchanged.
+LIBYUV_API
+int ABGRToARGB(const uint8_t* src_abgr, int src_stride_abgr,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8_t* const shuffler = (const uint8_t*)(&kShuffleMaskABGRToARGB);
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// ARGB to ABGR (same as ABGRToARGB).
+// Uses the same kShuffleMaskABGRToARGB table as ABGRToARGB; the parameter
+// names keep the abgr/argb spelling of that function.  Returns the result of
+// ARGBShuffle unchanged.
+LIBYUV_API
+int ARGBToABGR(const uint8_t* src_abgr, int src_stride_abgr,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8_t* const shuffler = (const uint8_t*)(&kShuffleMaskABGRToARGB);
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// RGBA to ARGB.
+// Delegates to ARGBShuffle with the kShuffleMaskRGBAToARGB table and returns
+// its result unchanged.
+LIBYUV_API
+int RGBAToARGB(const uint8_t* src_rgba, int src_stride_rgba,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const uint8_t* const shuffler = (const uint8_t*)(&kShuffleMaskRGBAToARGB);
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     shuffler, width, height);
+}
+
+// RGB24 to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int RGB24ToARGB(const uint8_t* src_rgb24, int src_stride_rgb24,
+                uint8_t* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_rgb24 || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_rgb24 += (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // When both buffers are tightly packed (3 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RGB24ToARGBRow_SSSE3 : RGB24ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RGB24ToARGBRow_NEON : RGB24ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? RGB24ToARGBRow_MMI : RGB24ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RGB24ToARGBRow_MSA : RGB24ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_rgb24, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_rgb24 += src_stride_rgb24;
+  }
+  return 0;
+}
+
+// RAW to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int RAWToARGB(const uint8_t* src_raw, int src_stride_raw,
+              uint8_t* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RAWToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_raw || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_raw += (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // When both buffers are tightly packed (3 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RAWToARGBRow_SSSE3 : RAWToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RAWToARGBRow_NEON : RAWToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? RAWToARGBRow_MMI : RAWToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RAWToARGBRow_MSA : RAWToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_raw, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_raw += src_stride_raw;
+  }
+  return 0;
+}
+
+// RAW to RGBA.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw, int src_stride_raw,
+              uint8_t* dst_rgba, int dst_stride_rgba,
+              int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+      RAWToRGBARow_C;
+
+  if (width <= 0 || height == 0 || !src_raw || !dst_rgba) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_raw += (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // When both buffers are tightly packed (3 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_raw = 0;
+    dst_stride_rgba = 0;
+  }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RAWToRGBARow_SSSE3 : RAWToRGBARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RAWToRGBARow_NEON : RAWToRGBARow_Any_NEON;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_raw, dst_rgba, width);
+    dst_rgba += dst_stride_rgba;
+    src_raw += src_stride_raw;
+  }
+  return 0;
+}
+
+// RGB565 to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int RGB565ToARGB(const uint8_t* src_rgb565, int src_stride_rgb565,
+                 uint8_t* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+                      int width) = RGB565ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_rgb565 || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_rgb565 += (height - 1) * src_stride_rgb565;
+    src_stride_rgb565 = -src_stride_rgb565;
+  }
+  // When both buffers are tightly packed (2 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb565 = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RGB565ToARGBRow_SSE2 : RGB565ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RGB565ToARGBRow_AVX2 : RGB565ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RGB565ToARGBRow_NEON : RGB565ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? RGB565ToARGBRow_MMI : RGB565ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RGB565ToARGBRow_MSA : RGB565ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_rgb565, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_rgb565 += src_stride_rgb565;
+  }
+  return 0;
+}
+
+// ARGB1555 to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8_t* src_argb1555, int src_stride_argb1555,
+                   uint8_t* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+                      int width) = ARGB1555ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_argb1555 || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_argb1555 += (height - 1) * src_stride_argb1555;
+    src_stride_argb1555 = -src_stride_argb1555;
+  }
+  // When both buffers are tightly packed (2 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb1555 = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    convert_row = IS_ALIGNED(width, 8) ? ARGB1555ToARGBRow_SSE2
+                                       : ARGB1555ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row = IS_ALIGNED(width, 16) ? ARGB1555ToARGBRow_AVX2
+                                        : ARGB1555ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row = IS_ALIGNED(width, 8) ? ARGB1555ToARGBRow_NEON
+                                       : ARGB1555ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row = IS_ALIGNED(width, 4) ? ARGB1555ToARGBRow_MMI
+                                       : ARGB1555ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row = IS_ALIGNED(width, 16) ? ARGB1555ToARGBRow_MSA
+                                        : ARGB1555ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_argb1555, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb1555 += src_stride_argb1555;
+  }
+  return 0;
+}
+
+// ARGB4444 to ARGB.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height reads the source bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8_t* src_argb4444, int src_stride_argb4444,
+                   uint8_t* dst_argb, int dst_stride_argb,
+                   int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+                      int width) = ARGB4444ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_argb4444 || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_argb4444 += (height - 1) * src_stride_argb4444;
+    src_stride_argb4444 = -src_stride_argb4444;
+  }
+  // When both buffers are tightly packed (2 source bytes and 4 destination
+  // bytes per pixel, no padding), process the image as one long row.
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb4444 = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    convert_row = IS_ALIGNED(width, 8) ? ARGB4444ToARGBRow_SSE2
+                                       : ARGB4444ToARGBRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row = IS_ALIGNED(width, 16) ? ARGB4444ToARGBRow_AVX2
+                                        : ARGB4444ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row = IS_ALIGNED(width, 8) ? ARGB4444ToARGBRow_NEON
+                                       : ARGB4444ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row = IS_ALIGNED(width, 4) ? ARGB4444ToARGBRow_MMI
+                                       : ARGB4444ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row = IS_ALIGNED(width, 16) ? ARGB4444ToARGBRow_MSA
+                                        : ARGB4444ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_argb4444, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_argb4444 += src_stride_argb4444;
+  }
+  return 0;
+}
+
+// AR30 to ARGB.
+// Always uses the C row function AR30ToARGBRow_C; there is no CPU-specific
+// dispatch here.  Returns -1 for a null pointer, width <= 0 or height == 0,
+// otherwise 0.  A negative height reads the source bottom-up, which flips the
+// output vertically.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30, int src_stride_ar30,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int row;
+
+  if (width <= 0 || height == 0 || !src_ar30 || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_ar30 += (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // When both buffers are tightly packed (4 bytes per pixel on each side, no
+  // padding), process the image as one long row.
+  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = 0;
+    dst_stride_argb = 0;
+  }
+  for (row = 0; row < height; ++row) {
+    AR30ToARGBRow_C(src_ar30, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_ar30 += src_stride_ar30;
+  }
+  return 0;
+}
+
+// AR30 to ABGR.
+// Always uses the C row function AR30ToABGRRow_C; there is no CPU-specific
+// dispatch here.  Returns -1 for a null pointer, width <= 0 or height == 0,
+// otherwise 0.  A negative height reads the source bottom-up, which flips the
+// output vertically.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30, int src_stride_ar30,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int row;
+
+  if (width <= 0 || height == 0 || !src_ar30 || !dst_abgr) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_ar30 += (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // When both buffers are tightly packed (4 bytes per pixel on each side, no
+  // padding), process the image as one long row.
+  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = 0;
+    dst_stride_abgr = 0;
+  }
+  for (row = 0; row < height; ++row) {
+    AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+    dst_abgr += dst_stride_abgr;
+    src_ar30 += src_stride_ar30;
+  }
+  return 0;
+}
+
+// AR30 to AB30.
+// Always uses the C row function AR30ToAB30Row_C; there is no CPU-specific
+// dispatch here.  Returns -1 for a null pointer, width <= 0 or height == 0,
+// otherwise 0.  A negative height reads the source bottom-up, which flips the
+// output vertically.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30, int src_stride_ar30,
+               uint8_t* dst_ab30, int dst_stride_ab30,
+               int width, int height) {
+  int row;
+
+  if (width <= 0 || height == 0 || !src_ar30 || !dst_ab30) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last source row and step backwards.
+    height = -height;
+    src_ar30 += (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // When both buffers are tightly packed (4 bytes per pixel on each side, no
+  // padding), process the image as one long row.
+  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = 0;
+    dst_stride_ab30 = 0;
+  }
+  for (row = 0; row < height; ++row) {
+    AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+    dst_ab30 += dst_stride_ab30;
+    src_ar30 += src_stride_ar30;
+  }
+  return 0;
+}
+
+// NV12 (Y plane plus a UV plane) to ARGB, using the caller-supplied matrix.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height writes the destination bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     const uint8_t* src_uv, int src_stride_uv,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_y || !src_uv || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV12ToARGBRow_SSSE3 : NV12ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? NV12ToARGBRow_AVX2 : NV12ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV12ToARGBRow_NEON : NV12ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? NV12ToARGBRow_MMI : NV12ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV12ToARGBRow_MSA : NV12ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_y, src_uv, dst_argb, yuvconstants, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+    // The UV plane advances only after odd rows, so each UV row is shared by
+    // two consecutive Y rows.
+    if ((row & 1) != 0) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// NV21 (Y plane plus a VU plane) to ARGB, using the caller-supplied matrix.
+// Returns -1 for a null pointer, width <= 0 or height == 0, otherwise 0.
+// A negative height writes the destination bottom-up, which flips the output
+// vertically.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y, int src_stride_y,
+                     const uint8_t* src_vu, int src_stride_vu,
+                     uint8_t* dst_argb, int dst_stride_argb,
+                     const struct YuvConstants* yuvconstants,
+                     int width, int height) {
+  int row;
+  // Row converter.  Starts as the C version; each enabled block below whose
+  // CPU flag tests true overrides it, so the last match wins.
+  void (*convert_row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+
+  if (width <= 0 || height == 0 || !src_y || !src_vu || !dst_argb) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: start on the last destination row and step backwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV21ToARGBRow_SSSE3 : NV21ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? NV21ToARGBRow_AVX2 : NV21ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV21ToARGBRow_NEON : NV21ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? NV21ToARGBRow_MMI : NV21ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? NV21ToARGBRow_MSA : NV21ToARGBRow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    convert_row(src_y, src_vu, dst_argb, yuvconstants, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+    // The VU plane advances only after odd rows, so each VU row is shared by
+    // two consecutive Y rows.
+    if ((row & 1) != 0) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// NV12 to ARGB.
+// Forwards every argument to NV12ToARGBMatrix with the kYuvI601Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_uv, int src_stride_uv,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return NV12ToARGBMatrix(src_y, src_stride_y,
+                          src_uv, src_stride_uv,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// NV21 to ARGB.
+// Forwards every argument to NV21ToARGBMatrix with the kYuvI601Constants
+// matrix and returns its result unchanged.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_vu, int src_stride_vu,
+               uint8_t* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return NV21ToARGBMatrix(src_y, src_stride_y,
+                          src_vu, src_stride_vu,
+                          dst_argb, dst_stride_argb,
+                          matrix, width, height);
+}
+
+// NV12 to ABGR.
+// To output ABGR instead of ARGB, swap the U and V and use a mirrored yuv
+// matrix.  The swap is achieved by handing the NV12 UV plane to the NV21
+// converter, together with the kYvuI601Constants matrix.  Returns the result
+// of NV21ToARGBMatrix unchanged.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_uv, int src_stride_uv,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const yvu_matrix = &kYvuI601Constants;
+  return NV21ToARGBMatrix(src_y, src_stride_y,
+                          src_uv, src_stride_uv,  // UV plane read as VU
+                          dst_abgr, dst_stride_abgr,
+                          yvu_matrix, width, height);
+}
+
+// NV21 to ABGR.
+// Mirror image of NV12ToABGR: the NV21 VU plane is handed to the NV12
+// converter together with the kYvuI601Constants matrix.  Returns the result
+// of NV12ToARGBMatrix unchanged.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y, int src_stride_y,
+               const uint8_t* src_vu, int src_stride_vu,
+               uint8_t* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  const struct YuvConstants* const yvu_matrix = &kYvuI601Constants;
+  return NV12ToARGBMatrix(src_y, src_stride_y,
+                          src_vu, src_stride_vu,  // VU plane read as UV
+                          dst_abgr, dst_stride_abgr,
+                          yvu_matrix, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// NV12 to RGB24 using the caller-supplied conversion matrix.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
+  int row;
+  // Row converter: starts as the C version, replaced below when a CPU
+  // specific variant is compiled in and reported by TestCpuFlag.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* uv_buf,
+                 uint8_t* rgb_buf, const struct YuvConstants* yuvconstants,
+                 int width) = NV12ToRGB24Row_C;
+  if (!src_y || !src_uv || !dst_rgb24) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgb24 += (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV12ToRGB24Row_NEON
+                                  : NV12ToRGB24Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? NV12ToRGB24Row_SSSE3
+                                   : NV12ToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? NV12ToRGB24Row_AVX2
+                                   : NV12ToRGB24Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV12ToRGB24Row_MMI
+                                  : NV12ToRGB24Row_Any_MMI;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_uv, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    // The UV row is shared by two Y rows: advance only after odd rows.
+    if (row & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// NV21 to RGB24 using the caller-supplied conversion matrix.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_vu,
+                      int src_stride_vu,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
+  int row;
+  // Row converter: starts as the C version, replaced below when a CPU
+  // specific variant is compiled in and reported by TestCpuFlag.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* uv_buf,
+                 uint8_t* rgb_buf, const struct YuvConstants* yuvconstants,
+                 int width) = NV21ToRGB24Row_C;
+  if (!src_y || !src_vu || !dst_rgb24) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgb24 += (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV21ToRGB24Row_NEON
+                                  : NV21ToRGB24Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? NV21ToRGB24Row_SSSE3
+                                   : NV21ToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? NV21ToRGB24Row_AVX2
+                                   : NV21ToRGB24Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV21ToRGB24Row_MMI
+                                  : NV21ToRGB24Row_Any_MMI;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_vu, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    // The VU row is shared by two Y rows: advance only after odd rows.
+    if (row & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// NV12 to RGB24.
+// Thin wrapper: forwards to NV12ToRGB24Matrix with kYuvI601Constants.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                           dst_rgb24, dst_stride_rgb24, matrix, width, height);
+}
+
+// NV21 to RGB24.
+// Thin wrapper: forwards to NV21ToRGB24Matrix with kYuvI601Constants.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+                           dst_rgb24, dst_stride_rgb24, matrix, width, height);
+}
+
+// NV12 to RAW.
+// Implemented through the RGB24 path: the NV12 UV plane is passed to the
+// NV21 converter together with kYvuI601Constants.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+                           dst_stride_raw, mirrored, width, height);
+}
+
+// NV21 to RAW.
+// Implemented through the RGB24 path: the NV21 VU plane is passed to the
+// NV12 converter together with kYvuI601Constants.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_vu,
+              int src_stride_vu,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+                           dst_stride_raw, mirrored, width, height);
+}
+
+// Convert NV21 to YUV24.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+// Marked LIBYUV_API like every other public converter in this file so the
+// definition carries the same export annotation as its siblings.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height) {
+  int y;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+                         uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+    dst_stride_yuv24 = -dst_stride_yuv24;
+  }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+    dst_yuv24 += dst_stride_yuv24;
+    src_y += src_stride_y;
+    // The VU row is shared by two Y rows: advance only after odd rows.
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// YUY2 to ARGB, always with kYuvI601Constants.
+// Returns -1 when a pointer is NULL, width is not positive or height is
+// zero; returns 0 otherwise. A negative height reads the source bottom-up.
+LIBYUV_API
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+                 const struct YuvConstants* yuvconstants, int width) =
+      YUY2ToARGBRow_C;
+  if (!src_yuy2 || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backwards.
+    height = -height;
+    src_yuy2 += (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // When both buffers are tightly packed the whole image is handled as a
+  // single long row.
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? YUY2ToARGBRow_SSSE3
+                                   : YUY2ToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? YUY2ToARGBRow_AVX2
+                                   : YUY2ToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? YUY2ToARGBRow_NEON
+                                  : YUY2ToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? YUY2ToARGBRow_MMI
+                                  : YUY2ToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? YUY2ToARGBRow_MSA
+                                  : YUY2ToARGBRow_Any_MSA;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    row_fn(src_yuy2, dst_argb, &kYuvI601Constants, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// UYVY to ARGB, always with kYuvI601Constants.
+// Returns -1 when a pointer is NULL, width is not positive or height is
+// zero; returns 0 otherwise. A negative height reads the source bottom-up.
+LIBYUV_API
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+                 const struct YuvConstants* yuvconstants, int width) =
+      UYVYToARGBRow_C;
+  if (!src_uyvy || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last source row and step backwards.
+    height = -height;
+    src_uyvy += (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // When both buffers are tightly packed the whole image is handled as a
+  // single long row.
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? UYVYToARGBRow_SSSE3
+                                   : UYVYToARGBRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? UYVYToARGBRow_AVX2
+                                   : UYVYToARGBRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? UYVYToARGBRow_NEON
+                                  : UYVYToARGBRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? UYVYToARGBRow_MMI
+                                  : UYVYToARGBRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? UYVYToARGBRow_MSA
+                                  : UYVYToARGBRow_Any_MSA;
+  }
+#endif
+  for (row = 0; row < height; ++row) {
+    row_fn(src_uyvy, dst_argb, &kYuvI601Constants, width);
+    src_uyvy += src_stride_uyvy;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+// Interleaves `width` U/V sample pairs into dst_uv (U byte first, then V).
+// Each source pointer is stepped by src_pixel_stride_uv bytes per sample.
+// Writes nothing when width is zero or negative.
+static void WeavePixels(const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_uv,
+                        int width) {
+  int remaining = width;
+  while (remaining > 0) {
+    *dst_uv++ = *src_u;
+    *dst_uv++ = *src_v;
+    src_u += src_pixel_stride_uv;
+    src_v += src_pixel_stride_uv;
+    --remaining;
+  }
+}
+
+// Convert Android420 to ARGB with matrix.
+// The chroma planes are described by separate U and V pointers plus a shared
+// pixel stride. Layouts that match I420, NV21 or NV12 are dispatched to the
+// corresponding converter; anything else is first woven into a temporary
+// NV12 UV plane.
+// Returns -1 on invalid arguments, 1 when the temporary UV plane cannot be
+// allocated, otherwise the result of the converter that was used (0 on
+// success). A negative height writes the destination bottom-up.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height) {
+  int y;
+  int result;
+  uint8_t* dst_uv;
+  // Byte distance from U to V; +1 / -1 identifies an interleaved plane.
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  // I420
+  if (src_pixel_stride_uv == 1) {
+    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  // NV21
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+  // NV12
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+
+  // General case fallback creates NV12
+  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+  // Bail out instead of writing through a NULL pointer if the allocation
+  // failed.
+  if (!plane_uv) {
+    return 1;
+  }
+  dst_uv = plane_uv;
+  for (y = 0; y < halfheight; ++y) {
+    WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += halfwidth * 2;
+  }
+  result = NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2,
+                            dst_argb, dst_stride_argb, yuvconstants, width,
+                            height);
+  free_aligned_buffer_64(plane_uv);
+  return result;
+}
+
+// Android420 to ARGB.
+// Thin wrapper: forwards to Android420ToARGBMatrix with kYuvI601Constants.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+                                src_v, src_stride_v, src_pixel_stride_uv,
+                                dst_argb, dst_stride_argb, matrix, width,
+                                height);
+}
+
+// Android420 to ABGR.
+// Forwards to Android420ToARGBMatrix with the U and V planes exchanged and
+// kYvuI601Constants as the matrix.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return Android420ToARGBMatrix(src_y, src_stride_y,
+                                src_v, src_stride_v,  // V goes in the U slot
+                                src_u, src_stride_u,  // U goes in the V slot
+                                src_pixel_stride_uv, dst_abgr, dst_stride_abgr,
+                                mirrored, width, height);
+}
+
+// I422 to RGBA using the caller-supplied conversion matrix.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgba += (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_SSSE3
+                                  : I422ToRGBARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToRGBARow_AVX2
+                                   : I422ToRGBARow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_NEON
+                                  : I422ToRGBARow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToRGBARow_MMI
+                                  : I422ToRGBARow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_MSA
+                                  : I422ToRGBARow_Any_MSA;
+  }
+#endif
+
+  // Every output row consumes one row from each of the three planes.
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// I422 to RGBA.
+// Thin wrapper: forwards to I422ToRGBAMatrix with kYuvI601Constants.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba, matrix,
+                          width, height);
+}
+
+// I422 to BGRA.
+// Forwards to I422ToRGBAMatrix with the U and V planes exchanged and
+// kYvuI601Constants as the matrix.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return I422ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_bgra, dst_stride_bgra, mirrored, width, height);
+}
+
+// NV12 to RGB565 using the caller-supplied conversion matrix.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_uv,
+                       int src_stride_uv,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* uv_buf,
+                 uint8_t* rgb_buf, const struct YuvConstants* yuvconstants,
+                 int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgb565 += (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_SSSE3
+                                  : NV12ToRGB565Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? NV12ToRGB565Row_AVX2
+                                   : NV12ToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_NEON
+                                  : NV12ToRGB565Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? NV12ToRGB565Row_MMI
+                                  : NV12ToRGB565Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? NV12ToRGB565Row_MSA
+                                  : NV12ToRGB565Row_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_uv, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    // The UV row is shared by two Y rows: advance only after odd rows.
+    if (row & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// NV12 to RGB565.
+// Thin wrapper: forwards to NV12ToRGB565Matrix with kYuvI601Constants.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_rgb565, dst_stride_rgb565, matrix, width,
+                            height);
+}
+
+// I420 to RGBA using the caller-supplied conversion matrix.
+// Uses the I422 row converters; the U and V pointers are advanced only
+// after odd rows, so each chroma row is used for two luma rows.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     uint8_t* dst_rgba,
+                     int dst_stride_rgba,
+                     const struct YuvConstants* yuvconstants,
+                     int width,
+                     int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgba += (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_SSSE3
+                                  : I422ToRGBARow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToRGBARow_AVX2
+                                   : I422ToRGBARow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_NEON
+                                  : I422ToRGBARow_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToRGBARow_MMI
+                                  : I422ToRGBARow_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGBARow_MSA
+                                  : I422ToRGBARow_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    if (row & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420 to RGBA.
+// Thin wrapper: forwards to I420ToRGBAMatrix with kYuvI601Constants.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba, matrix,
+                          width, height);
+}
+
+// I420 to BGRA.
+// Forwards to I420ToRGBAMatrix with the U and V planes exchanged and
+// kYvuI601Constants as the matrix.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return I420ToRGBAMatrix(src_y, src_stride_y,
+                          src_v, src_stride_v,  // V goes in the U slot
+                          src_u, src_stride_u,  // U goes in the V slot
+                          dst_bgra, dst_stride_bgra, mirrored, width, height);
+}
+
+// I420 to RGB24 using the caller-supplied conversion matrix.
+// Uses the I422 row converters; the U and V pointers are advanced only
+// after odd rows, so each chroma row is used for two luma rows.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_rgb24,
+                      int dst_stride_rgb24,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgb24 += (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToRGB24Row_SSSE3
+                                   : I422ToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 32) ? I422ToRGB24Row_AVX2
+                                   : I422ToRGB24Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGB24Row_NEON
+                                  : I422ToRGB24Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToRGB24Row_MMI
+                                  : I422ToRGB24Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToRGB24Row_MSA
+                                   : I422ToRGB24Row_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (row & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420 to RGB24.
+// Thin wrapper: forwards to I420ToRGB24Matrix with kYuvI601Constants.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  const struct YuvConstants* const matrix = &kYuvI601Constants;
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24, matrix,
+                           width, height);
+}
+
+// I420 to RAW.
+// Forwards to I420ToRGB24Matrix with the U and V planes exchanged and
+// kYvuI601Constants as the matrix.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  const struct YuvConstants* const mirrored = &kYvuI601Constants;
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_v, src_stride_v,  // V goes in the U slot
+                           src_u, src_stride_u,  // U goes in the V slot
+                           dst_raw, dst_stride_raw, mirrored, width, height);
+}
+
+// J420 to RGB24.
+// Thin wrapper: forwards to I420ToRGB24Matrix with kYuvJPEGConstants.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24, matrix,
+                           width, height);
+}
+
+// J420 to RAW.
+// Forwards to I420ToRGB24Matrix with the U and V planes exchanged and
+// kYvuJPEGConstants as the matrix.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  const struct YuvConstants* const mirrored = &kYvuJPEGConstants;
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_v, src_stride_v,  // V goes in the U slot
+                           src_u, src_stride_u,  // U goes in the V slot
+                           dst_raw, dst_stride_raw, mirrored, width, height);
+}
+
+// H420 to RGB24.
+// Thin wrapper: forwards to I420ToRGB24Matrix with kYuvH709Constants.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  const struct YuvConstants* const matrix = &kYuvH709Constants;
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24, matrix,
+                           width, height);
+}
+
+// H420 to RAW.
+// Forwards to I420ToRGB24Matrix with the U and V planes exchanged and
+// kYvuH709Constants as the matrix.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  const struct YuvConstants* const mirrored = &kYvuH709Constants;
+  return I420ToRGB24Matrix(src_y, src_stride_y,
+                           src_v, src_stride_v,  // V goes in the U slot
+                           src_u, src_stride_u,  // U goes in the V slot
+                           dst_raw, dst_stride_raw, mirrored, width, height);
+}
+
+// I420 to ARGB1555, always with kYuvI601Constants.
+// Uses the I422 row converters; the U and V pointers are advanced only
+// after odd rows, so each chroma row is used for two luma rows.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGB1555Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb1555) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_argb1555 += (height - 1) * dst_stride_argb1555;
+    dst_stride_argb1555 = -dst_stride_argb1555;
+  }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB1555Row_SSSE3
+                                  : I422ToARGB1555Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToARGB1555Row_AVX2
+                                   : I422ToARGB1555Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB1555Row_NEON
+                                  : I422ToARGB1555Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToARGB1555Row_MMI
+                                  : I422ToARGB1555Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB1555Row_MSA
+                                  : I422ToARGB1555Row_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, width);
+    dst_argb1555 += dst_stride_argb1555;
+    src_y += src_stride_y;
+    if (row & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420 to ARGB4444, always with kYuvI601Constants.
+// Uses the I422 row converters; the U and V pointers are advanced only
+// after odd rows, so each chroma row is used for two luma rows.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGB4444Row_C;
+  if (!src_y || !src_u || !src_v || !dst_argb4444) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_argb4444 += (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB4444Row_SSSE3
+                                  : I422ToARGB4444Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToARGB4444Row_AVX2
+                                   : I422ToARGB4444Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB4444Row_NEON
+                                  : I422ToARGB4444Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToARGB4444Row_MMI
+                                  : I422ToARGB4444Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToARGB4444Row_MSA
+                                  : I422ToARGB4444Row_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (row & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// I420 to RGB565 using the caller-supplied conversion matrix.
+// Uses the I422 row converters; the U and V pointers are advanced only
+// after odd rows, so each chroma row is used for two luma rows.
+// Returns -1 when a plane pointer is NULL, width is not positive or height
+// is zero; returns 0 otherwise. A negative height writes the destination
+// bottom-up (vertical flip).
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width,
+                       int height) {
+  int row;
+  // Row converter: C version unless a CPU specific one is selected below.
+  void (*row_fn)(const uint8_t* y_buf, const uint8_t* u_buf,
+                 const uint8_t* v_buf, uint8_t* rgb_buf,
+                 const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flip: begin at the last destination row and step backwards.
+    height = -height;
+    dst_rgb565 += (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGB565Row_SSSE3
+                                  : I422ToRGB565Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    row_fn = IS_ALIGNED(width, 16) ? I422ToRGB565Row_AVX2
+                                   : I422ToRGB565Row_Any_AVX2;
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGB565Row_NEON
+                                  : I422ToRGB565Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    row_fn = IS_ALIGNED(width, 4) ? I422ToRGB565Row_MMI
+                                  : I422ToRGB565Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    row_fn = IS_ALIGNED(width, 8) ? I422ToRGB565Row_MSA
+                                  : I422ToRGB565Row_Any_MSA;
+  }
+#endif
+
+  for (row = 0; row < height; ++row) {
+    row_fn(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (row & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565. Thin wrapper: I420ToRGB565Matrix with kYuvI601Constants.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,  // result is returned unchanged
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565. Thin wrapper: I420ToRGB565Matrix with kYuvJPEGConstants.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,  // result is returned unchanged
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565. Thin wrapper: I420ToRGB565Matrix with kYuvH709Constants.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,  // result is returned unchanged
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565 using kYuvI601Constants. Returns 0, or -1 on bad arguments.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,  // byte step added to src_y after each row
+ const uint8_t* src_u,
+ int src_stride_u,  // byte step added to src_u after each row
+ const uint8_t* src_v,
+ int src_stride_v,  // byte step added to src_v after each row
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,  // byte step added to dst_rgb565 after each row
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;  // default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;  // start at the last row
+ dst_stride_rgb565 = -dst_stride_rgb565;  // and step backwards
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)  // later blocks override earlier ones when their CPU flag is set
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)  // NOTE(review): no MMI block here, unlike I420ToRGB565Matrix
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // chroma advances on every row here
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7. Default table used by I420ToRGB565Dither when dither4x4 is NULL.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,  // 16 entries, consumed 4 bytes per image row
+};
+
+// Convert I420 to RGB565 with dithering: each row goes I420 -> ARGB scratch row -> dithered RGB565. Returns 0 or -1.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,  // byte step added to src_y after each row
+ const uint8_t* src_u,
+ int src_stride_u,  // byte step added to src_u after each odd row
+ const uint8_t* src_v,
+ int src_stride_v,  // byte step added to src_v after each odd row
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,  // byte step added to dst_rgb565 after each row
+ const uint8_t* dither4x4,  // 16 bytes read 4 per row; NULL selects kDither565_4x4
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;  // stage 1 default; the #if blocks below may replace it
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;  // stage 2 default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;  // start at the last row
+ dst_stride_rgb565 = -dst_stride_rgb565;  // and step backwards
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;  // fall back to the built-in table
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)  // stage 1 selection; later blocks override earlier ones
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)  // stage 2 selection; later blocks override earlier ones
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);  // scratch row of width * 4 bytes, released below
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),  // 4 bytes of table row y % 4 as one uint32_t; NOTE(review): type-punned load, confirm alignment of caller tables
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {  // chroma advances only after odd rows
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+// Convert I420 to AR30 with matrix. Returns 0, or -1 on bad arguments.
+LIBYUV_API
+int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,  // byte step added to src_y after each row
+ const uint8_t* src_u,
+ int src_stride_u,  // byte step added to src_u after each odd row
+ const uint8_t* src_v,
+ int src_stride_v,  // byte step added to src_v after each odd row
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,  // byte step added to dst_ar30 after each row
+ const struct YuvConstants* yuvconstants,  // forwarded untouched to the row function
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;  // default; only SSSE3 and AVX2 replacements are offered below
+
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;  // start at the last row
+ dst_stride_ar30 = -dst_stride_ar30;  // and step backwards
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)  // overrides the SSSE3 choice when AVX2 is reported
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {  // chroma advances only after odd rows, so each U/V row serves two Y rows
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to AR30. Thin wrapper: I420ToAR30Matrix with kYuvI601Constants.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,  // result is returned unchanged
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30. Thin wrapper: I420ToAR30Matrix with kYuvH709Constants; returns its result.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,  // U and V are forwarded in YUV order...
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);  // ...so the YUV-order matrix is required (was kYvuH709Constants)
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_from.cc b/third_party/aom/third_party/libyuv/source/convert_from.cc
new file mode 100644
index 0000000000..f2cfc1d8f5
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_from.cc
@@ -0,0 +1,713 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)  // (|v| + a) >> s with the sign of v restored; NOTE(review): arguments and expansion are unparenthesized
+static __inline int Abs(int v) {  // magnitude of v
+ return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring. Scales each plane with ScalePlane; returns 0, or -1 on bad sizes.
+static int I420ToI4xx(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,  // NOTE(review): the chroma pointers are not null-checked in this function
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,  // may be NULL, in which case the Y plane is skipped
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_y_width,  // must be != 0; the sign is forwarded to ScalePlane
+ int src_y_height,  // must be != 0; the sign is forwarded to ScalePlane
+ int dst_uv_width,  // must be > 0
+ int dst_uv_height) {  // must be > 0
+ const int dst_y_width = Abs(src_y_width);  // destination Y keeps the source magnitude
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);  // half the Y size, rounded up in magnitude, sign kept
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+ dst_uv_height <= 0) {
+ return -1;
+ }
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+ dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,  // chroma is resized to the requested dst_uv size
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return 0;
+}
+
+// Convert 8 bit YUV to 10 bit. Each plane goes through Convert8To16Plane; returns 0, or -1 on bad arguments.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,  // NOTE(review): src_y and dst_y are not null-checked below, unlike the chroma pointers
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative reads the source bottom-up
+ int halfwidth = (width + 1) >> 1;  // chroma size is half the Y size, rounded up
+ int halfheight = (height + 1) >> 1;  // recomputed below when height is negative
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;  // point every source plane at its last row
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;  // and step backwards
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,  // 1024 is the scale argument; presumably maps 8 bit to 10 bit, confirm against Convert8To16Plane
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height. Delegates to I420ToI4xx and returns its result.
+LIBYUV_API
+int I420ToI422(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;  // half width, rounded up
+ const int dst_uv_height = Abs(height);  // full height
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,  // width and height keep their sign here
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height. Delegates to I420ToI4xx and returns its result.
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int dst_uv_width = Abs(width);  // full width
+ const int dst_uv_height = Abs(height);  // full height
+ return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,  // width and height keep their sign here
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, dst_uv_width,
+ dst_uv_height);
+}
+
+// Copy to I400. Source can be I420,422,444,400,NV12,NV21. Only the Y plane is copied; returns 0 or -1.
+LIBYUV_API
+int I400Copy(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative reads the source bottom-up
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;  // start at the last source row
+ src_stride_y = -src_stride_y;  // and step backwards
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API  // I422ToYUY2: convert three I422 planes to YUY2 row by row; returns 0, or -1 on bad arguments
+int I422ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;  // default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;  // start at the last row
+ dst_stride_yuy2 = -dst_stride_yuy2;  // and step backwards
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&  // every plane is stored without padding...
+ src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
+ width *= height;  // ...so the image is handled as one long row; NOTE(review): int product, not checked for overflow
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)  // later blocks override earlier ones when their CPU flag is set
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)  // NOTE(review): no MMI or MSA blocks here, unlike I420ToYUY2
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // chroma advances on every row
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API  // I420ToYUY2: convert three I420 planes to YUY2, two output rows per chroma row; returns 0 or -1
+int I420ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+ I422ToYUY2Row_C;  // default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;  // start at the last row
+ dst_stride_yuy2 = -dst_stride_yuy2;  // and step backwards
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)  // later blocks override earlier ones when their CPU flag is set
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // row pairs: both Y rows reuse the same U/V row
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {  // odd height: one final row remains
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API  // I422ToUYVY: convert three I422 planes to UYVY row by row; returns 0, or -1 on bad arguments
+int I422ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;  // default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;  // start at the last row
+ dst_stride_uyvy = -dst_stride_uyvy;  // and step backwards
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u * 2 == width &&  // every plane is stored without padding...
+ src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
+ width *= height;  // ...so the image is handled as one long row; NOTE(review): int product, not checked for overflow
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)  // later blocks override earlier ones when their CPU flag is set
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // chroma advances on every row
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API  // I420ToUYVY: convert three I420 planes to UYVY, two output rows per chroma row; returns 0 or -1
+int I420ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative writes the rows bottom-up
+ int y;
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+ I422ToUYVYRow_C;  // default; the #if blocks below may replace it
+ if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;  // start at the last row
+ dst_stride_uyvy = -dst_stride_uyvy;  // and step backwards
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)  // later blocks override earlier ones when their CPU flag is set
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // row pairs: both Y rows reuse the same U/V row
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {  // odd height: one final row remains
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API  // I420ToNV12: copy the Y plane and merge the U and V planes into one UV plane; returns 0 or -1
+int I420ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,  // must be > 0
+ int height) {  // must be != 0; negative reads the source bottom-up
+ int halfwidth = (width + 1) / 2;  // chroma size is half the Y size, rounded up
+ int halfheight = (height + 1) / 2;  // recomputed below when height is negative
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;  // point every source plane at its last row
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;  // and step backwards
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ if (dst_y) {  // always true here: a null dst_y was rejected above
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
+ halfwidth, halfheight);
+ return 0;
+}
+
+LIBYUV_API  // I420ToNV21: I420ToNV12 with the U and V source planes exchanged; returns its result
+int I420ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,  // V is passed in the U slot and U in the V slot
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to specified format. Returns the converter's result, or -1 on bad arguments or an unsupported fourcc.
+LIBYUV_API
+int ConvertFromI420(const uint8_t* y,
+ int y_stride,
+ const uint8_t* u,
+ int u_stride,
+ const uint8_t* v,
+ int v_stride,
+ uint8_t* dst_sample,  // single destination buffer; planar outputs are laid out back to back in it
+ int dst_sample_stride,  // 0 selects the per-format default used in each case below
+ int width,  // must be > 0
+ int height,  // must be != 0; forwarded with its sign, plane offsets below use Abs(height)
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
+ int r = 0;
+ if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
+ return -1;
+ }
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2, width,
+ height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3, width,
+ height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_AR30:
+ r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width, width,
+ height);
+ break;
+ case FOURCC_NV12: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_uv = dst_sample + dst_y_stride * Abs(height);  // UV plane follows the Y plane, which spans stride * |height| bytes
+ r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_y_stride, dst_uv, dst_y_stride, width,
+ height);
+ break;
+ }
+ case FOURCC_NV21: {
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_vu = dst_sample + dst_y_stride * Abs(height);  // VU plane follows the Y plane, which spans stride * |height| bytes
+ r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_y_stride, dst_vu, dst_y_stride, width,
+ height);
+ break;
+ }
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YV12: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ int halfheight = (Abs(height) + 1) / 2;  // chroma rows, rounded up; sign of height must not leak into offsets
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV12) {  // YV12 stores V before U
+ dst_v = dst_sample + dst_sample_stride * Abs(height);
+ dst_u = dst_v + halfstride * halfheight;
+ } else {
+ dst_u = dst_sample + dst_sample_stride * Abs(height);
+ dst_v = dst_u + halfstride * halfheight;
+ }
+ r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ int halfstride = (dst_sample_stride + 1) / 2;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV16) {  // YV16 stores V before U
+ dst_v = dst_sample + dst_sample_stride * Abs(height);
+ dst_u = dst_v + halfstride * Abs(height);
+ } else {
+ dst_u = dst_sample + dst_sample_stride * Abs(height);
+ dst_v = dst_u + halfstride * Abs(height);
+ }
+ r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
+ if (format == FOURCC_YV24) {  // YV24 stores V before U
+ dst_v = dst_sample + dst_sample_stride * Abs(height);
+ dst_u = dst_v + dst_sample_stride * Abs(height);
+ } else {
+ dst_u = dst_sample + dst_sample_stride * Abs(height);
+ dst_v = dst_u + dst_sample_stride * Abs(height);
+ }
+ r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+ dst_sample_stride, width, height);
+ break;
+ }
+ // Formats not supported - MJPG, biplanar, some rgb formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_from_argb.cc b/third_party/aom/third_party/libyuv/source/convert_from_argb.cc
new file mode 100644
index 0000000000..4ba4bb5e0f
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,2163 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444.
+// For every source row one Y row function and one UV444 row function are
+// called; SIMD variants are selected at runtime when they are compiled in.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, or -1 for a null pointer, a non-positive width or a
+// zero height.
+LIBYUV_API
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = ARGBToUV444Row_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Begin at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When every plane is tightly packed, process the image as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_y = 0;
+    dst_stride_u = 0;
+    dst_stride_v = 0;
+  }
+  // Chroma row function: exact-width variant when aligned, else the Any one.
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 16) ? ARGBToUV444Row_SSSE3
+                                           : ARGBToUV444Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 8) ? ARGBToUV444Row_NEON
+                                          : ARGBToUV444Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 8) ? ARGBToUV444Row_MMI
+                                          : ARGBToUV444Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = IS_ALIGNED(width, 16) ? ARGBToUV444Row_MSA
+                                           : ARGBToUV444Row_Any_MSA;
+  }
+#endif
+  // Luma row function, selected the same way.
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow =
+        IS_ALIGNED(width, 16) ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32) ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_MMI : ARGBToYRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = IS_ALIGNED(width, 16) ? ARGBToYRow_MSA : ARGBToYRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422.
+// For every source row one Y row function and one UV row function are called.
+// The UV row function is passed a source stride of 0 (presumably so its
+// second input row aliases the first - confirm against the row functions).
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, or -1 for a null pointer, a non-positive width or a
+// zero height.
+LIBYUV_API
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  if (!src_argb || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When every plane is tightly packed, process the image as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_y = 0;
+    dst_stride_u = 0;
+    dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow =
+        IS_ALIGNED(width, 16) ? ARGBToUVRow_SSSE3 : ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow =
+        IS_ALIGNED(width, 16) ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow =
+        IS_ALIGNED(width, 32) ? ARGBToUVRow_AVX2 : ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = IS_ALIGNED(width, 32) ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow =
+        IS_ALIGNED(width, 16) ? ARGBToUVRow_NEON : ARGBToUVRow_Any_NEON;
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_MMI : ARGBToYRow_Any_MMI;
+    ARGBToUVRow = IS_ALIGNED(width, 16) ? ARGBToUVRow_MMI : ARGBToUVRow_Any_MMI;
+  }
+#endif
+
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = IS_ALIGNED(width, 16) ? ARGBToYRow_MSA : ARGBToYRow_Any_MSA;
+    ARGBToUVRow = IS_ALIGNED(width, 32) ? ARGBToUVRow_MSA : ARGBToUVRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert ARGB to NV12.
+// Source rows are consumed two at a time: each pair produces two rows of Y in
+// dst_y and one row in dst_uv, built by running the UV row function into
+// scratch rows and combining them with MergeUVRow_(u, v). An odd final row is
+// handled on its own with a source stride of 0.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Chroma samples per row, rounded up.
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate one scratch row each for U and V. Each row is halfwidth
+    // rounded up to a multiple of 32 bytes; V starts right after U.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    // NOTE(review): assumes align_buffer_64 leaves row_u null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_u) {
+      return 1;
+    }
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    // Odd height: the last row has no partner, so pass a stride of 0.
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+// Source rows are consumed two at a time: each pair produces two rows of Y in
+// dst_y and one row in dst_vu, built by running the UV row function into
+// scratch rows and combining them with MergeUVRow_(v, u). An odd final row is
+// handled on its own with a source stride of 0.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Chroma samples per row, rounded up.
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate one scratch row each for U and V. Each row is halfwidth
+    // rounded up to a multiple of 32 bytes; V starts right after U.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    // NOTE(review): assumes align_buffer_64 leaves row_u null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_u) {
+      return 1;
+    }
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+      // V is passed first so the merged row is written in V,U order.
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    // Odd height: the last row has no partner, so pass a stride of 0.
+    if (height & 1) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYRow(src_argb, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ABGR to NV12.
+// Source rows are consumed two at a time: each pair produces two rows of Y in
+// dst_y and one row in dst_uv, built by running the UV row function into
+// scratch rows and combining them with MergeUVRow_(u, v). An odd final row is
+// handled on its own with a source stride of 0.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Chroma samples per row, rounded up.
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ABGRToYRow = ABGRToYRow_Any_MMI;
+    ABGRToUVRow = ABGRToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate one scratch row each for U and V. Each row is halfwidth
+    // rounded up to a multiple of 32 bytes; V starts right after U.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    // NOTE(review): assumes align_buffer_64 leaves row_u null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_u) {
+      return 1;
+    }
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    // Odd height: the last row has no partner, so pass a stride of 0.
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+// Source rows are consumed two at a time: each pair produces two rows of Y in
+// dst_y and one row in dst_vu, built by running the UV row function into
+// scratch rows and combining them with MergeUVRow_(v, u). An odd final row is
+// handled on its own with a source stride of 0.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;  // Chroma samples per row, rounded up.
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ABGRToYRow = ABGRToYRow_Any_MMI;
+    ABGRToUVRow = ABGRToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow_ = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      MergeUVRow_ = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
+  {
+    // Allocate one scratch row each for U and V. Each row is halfwidth
+    // rounded up to a multiple of 32 bytes; V starts right after U.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    // NOTE(review): assumes align_buffer_64 leaves row_u null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_u) {
+      return 1;
+    }
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      // V is passed first so the merged row is written in V,U order.
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    // Odd height: the last row has no partner, so pass a stride of 0.
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+// Each source row is converted into scratch Y, U and V rows, which are then
+// packed into dst_yuy2 by I422ToYUY2Row. The UV row function is passed a
+// source stride of 0.
+// A negative height flips the image by writing destination rows bottom-up
+// (unlike the planar converters in this file, which flip the source).
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToYUY2Row = I422ToYUY2Row_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
+
+  {
+    // Allocate scratch rows of Y, U and V in one buffer. With width rounded
+    // up to a multiple of 64, Y takes one rounded width and U and V take half
+    // a rounded width each.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+    // NOTE(review): assumes align_buffer_64 leaves row_y null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_y) {
+      return 1;
+    }
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+// Each source row is converted into scratch Y, U and V rows, which are then
+// packed into dst_uyvy by I422ToUYVYRow. The UV row function is passed a
+// source stride of 0.
+// A negative height flips the image by writing destination rows bottom-up
+// (unlike the planar converters in this file, which flip the source).
+// Returns 0 on success, -1 for a null pointer, a non-positive width or a zero
+// height, and 1 if the scratch rows cannot be allocated.
+LIBYUV_API
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = ARGBToYRow_Any_MMI;
+    ARGBToUVRow = ARGBToUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_MMI;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToUYVYRow = I422ToUYVYRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
+
+  {
+    // Allocate scratch rows of Y, U and V in one buffer. With width rounded
+    // up to a multiple of 64, Y takes one rounded width and U and V take half
+    // a rounded width each.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
+    // NOTE(review): assumes align_buffer_64 leaves row_y null when its
+    // underlying allocation fails - confirm against the macro definition.
+    if (!row_y) {
+      return 1;
+    }
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+// Only a Y plane is written: one Y row function call per source row, with a
+// SIMD variant selected at runtime when one is compiled in.
+// A negative height walks the source bottom-up, which flips the image.
+// Returns 0 on success, or -1 for a null pointer, a non-positive width or a
+// zero height.
+LIBYUV_API
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Begin at the last source row and step backwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both buffers are tightly packed, process the image as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYRow =
+        IS_ALIGNED(width, 16) ? ARGBToYRow_SSSE3 : ARGBToYRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYRow = IS_ALIGNED(width, 32) ? ARGBToYRow_AVX2 : ARGBToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_NEON : ARGBToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBToYRow = IS_ALIGNED(width, 8) ? ARGBToYRow_MMI : ARGBToYRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = IS_ALIGNED(width, 16) ? ARGBToYRow_MSA : ARGBToYRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA: per 4-byte pixel the byte indices are 3, 0, 1, 2 (passed to ARGBShuffle by ARGBToRGBA).
+static const uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
+
+// Convert ARGB to RGBA by delegating to ARGBShuffle with kShuffleMaskARGBToRGBA; returns ARGBShuffle's result unchanged.
+LIBYUV_API
+int ARGBToRGBA(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+ (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
+}
+
+// Convert ARGB To RGB24. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRGB24(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+ ARGBToRGB24Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: source packed at 4 bytes per pixel and destination at 3 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB To RAW. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToRAW(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+ ARGBToRAWRow_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: source packed at 4 bytes per pixel and destination at 3 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
+ }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRAWRow = ARGBToRAWRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). NULL dither4x4 selects kDither565_4x4. Returns 0, or -1 on bad arguments.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither4x4) {  // Caller supplied no matrix: fall back to the built-in table.
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // Rows are processed one by one; the dither entry depends on y.
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),  // 4 dither bytes of matrix row (y & 3), read as one uint32_t. NOTE(review): assumes dither4x4 is 4-byte aligned - confirm.
+ width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To RGB565 (no dithering). Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToRGB565Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: source packed at 4 bytes per pixel and destination at 2 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
+ }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB1555. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToARGB1555(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToARGB1555Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: source packed at 4 bytes per pixel and destination at 2 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
+ }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB4444. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToARGB4444(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToARGB4444Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: source packed at 4 bytes per pixel and destination at 2 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
+ }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ABGR To AR30. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
+ ABGRToAR30Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows: both buffers packed at 4 bytes per pixel become one long row.
+ if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ABGRToAR30Row = ABGRToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToAR30Row = ABGRToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ABGRToAR30Row(src_abgr, dst_ar30, width);
+ src_abgr += src_stride_abgr;
+ dst_ar30 += dst_stride_ar30;
+ }
+ return 0;
+}
+
+// Convert ARGB To AR30. Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+ ARGBToAR30Row_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: both buffers packed at 4 bytes per pixel become one long row.
+ if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR30Row = ARGBToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToAR30Row(src_argb, dst_ar30, width);
+ src_argb += src_stride_argb;
+ dst_ar30 += dst_stride_ar30;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420). Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToJ420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;  // Default chroma kernel; CPU checks below may override it.
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
+ ARGBToYJRow_C;  // Default luma kernel; CPU checks below may override it.
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Only the luma kernel has an AVX2 variant here.
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {  // Two source rows per iteration: one U/V row and two Y rows are written.
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {  // Odd height: the final unpaired row passes stride 0 to the chroma kernel.
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to J422. (JPeg full range I422). Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToJ422(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;  // Default chroma kernel; CPU checks below may override it.
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
+ ARGBToYJRow_C;  // Default luma kernel; CPU checks below may override it.
+ if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: only when all four planes are packed (U and V strides are exactly width / 2).
+ if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+ dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Only the luma kernel has an AVX2 variant here.
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One U/V row and one Y row are written per source row.
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);  // Stride 0 is passed to the chroma kernel on every row.
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert ARGB to J400 (Y plane only, via ARGBToYJRow). Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBToJ400(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
+ ARGBToYJRow_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows: when both strides equal the packed row size, process the image as one long row.
+ if (src_stride_argb == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert RGBA to J400 (Y plane only, via RGBAToYJRow). Returns 0 on success, -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;  // Default row kernel; every CPU check below that passes overrides it.
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {  // Negative height: start at the last source row and step backwards.
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows: when both strides equal the packed row size, process the image as one long row.
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYJRow = RGBAToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One kernel call per (possibly coalesced) row.
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_jpeg.cc b/third_party/aom/third_party/libyuv/source/convert_jpeg.cc
new file mode 100644
index 0000000000..d7556ee91b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {  // Destination state shared with the MJPG -> I420 decode callbacks.
+ uint8_t* y;  // Current write position in the Y plane; advanced by each callback.
+ int y_stride;
+ uint8_t* u;  // Current write position in the U plane; advanced by each callback.
+ int u_stride;
+ uint8_t* v;  // Current write position in the V plane; advanced by each callback.
+ int v_stride;
+ int w;  // Width passed to the conversion routines.
+ int h;  // Rows still expected; each callback subtracts the rows it handled.
+};
+
+static void JpegCopyI420(void* opaque,  // Decode callback: I420Copy of `rows` rows into the I420Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,  // Decode callback: I422ToI420 of `rows` rows into the I420Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,  // Decode callback: I444ToI420 of `rows` rows into the I420Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+ dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,  // Decode callback: I400ToI420 of `rows` rows (only data[0] is read) into the I420Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+ dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+// Query size of MJPG in pixels. On success writes *width and *height and returns 0; returns -1 (outputs untouched) if LoadFrame fails.
+LIBYUV_API
+int MJPGSize(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ int* width,
+ int* height) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret) {  // width and height are dereferenced without a NULL check.
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();  // Called whether or not LoadFrame succeeded.
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420. Returns 0 on success, -1 if src_size_mjpg is kUnknownDataSize, 1 on any other failure.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure (positive 1, unlike the -1 returned for an unknown size)
+ }
+ if (ret) {
+ I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, dst_width, dst_height};
+ // YUV420: component 0 has sampling factors 2x2, components 1 and 2 have 1x1.
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
+ dst_height);
+ // YUV422: component 0 has vertical factor 1 and horizontal factor 2.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
+ dst_height);
+ // YUV444: all three components have sampling factors 1x1.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
+ dst_height);
+ // YUV400: grayscale, a single component with sampling factors 1x1.
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
+ dst_height);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other
+ // colorspace/subsample factors that occur in practice. ERROR: Unable to
+ // convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;  // 1 also covers a failed LoadFrame or DecodeToCallback.
+}
+
+struct NV21Buffers {  // Destination state shared with the MJPG -> NV21 decode callbacks.
+ uint8_t* y;  // Current write position in the Y plane; advanced by each callback.
+ int y_stride;
+ uint8_t* vu;  // Current write position in the VU plane; advanced by each callback.
+ int vu_stride;
+ int w;  // Width passed to the conversion routines.
+ int h;  // Rows still expected; each callback subtracts the rows it handled.
+};
+
+static void JpegI420ToNV21(void* opaque,  // Decode callback: I420ToNV21 of `rows` rows into the NV21Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV21(void* opaque,  // Decode callback: I422ToNV21 of `rows` rows into the NV21Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV21(void* opaque,  // Decode callback: I444ToNV21 of `rows` rows into the NV21Buffers behind `opaque`.
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;  // Advance the output cursors past the rows just written.
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;  // Chroma advances by rows / 2, rounded up.
+ dest->h -= rows;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback: forwards one decoded band
+// of grayscale rows (only data[0] is used) to I400ToNV21 and then advances
+// the NV21Buffers cursor that |opaque| points at.
+static void JpegI400ToNV21(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* const out = static_cast<NV21Buffers*>(opaque);
+  // The destination chroma cursor moves half as many rows as the luma one.
+  const int chroma_rows = (rows + 1) >> 1;
+  I400ToNV21(data[0], strides[0], out->y, out->y_stride, out->vu,
+             out->vu_stride, out->w, rows);
+  out->h -= rows;
+  out->vu += chroma_rows * out->vu_stride;
+  out->y += rows * out->y_stride;
+}
+
+// MJPG (Motion JPEG) to NV21.
+// Loads the compressed frame, checks that its dimensions equal
+// src_width x src_height, and decodes it band by band into the Y and VU
+// destination planes through the row callback matching the frame's sampling.
+// Returns 0 on success, -1 when the sample size is unknown, and 1 on any
+// runtime failure (load error, unexpected dimensions, unsupported sampling or
+// decode failure).
+LIBYUV_API
+int MJPGToNV21(const uint8_t* src_mjpg,
+               size_t src_size_mjpg,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (src_size_mjpg == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  if (!mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg)) {
+    return 1;  // runtime failure: frame could not be loaded
+  }
+  if (mjpeg_decoder.GetWidth() != src_width ||
+      mjpeg_decoder.GetHeight() != src_height) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+
+  // Select the row callback from the colorspace and sampling factors.
+  void (*row_callback)(void*, const uint8_t* const*, const int*, int) = 0;
+  if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+      mjpeg_decoder.GetNumComponents() == 3) {
+    const int luma_v = mjpeg_decoder.GetVertSampFactor(0);
+    const int luma_h = mjpeg_decoder.GetHorizSampFactor(0);
+    // Every supported YCbCr layout has 1x1 sampling factors on both chroma
+    // components; only the luma factors tell the layouts apart.
+    if (mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      if (luma_v == 2 && luma_h == 2) {
+        row_callback = &JpegI420ToNV21;  // YUV420
+      } else if (luma_v == 1 && luma_h == 2) {
+        row_callback = &JpegI422ToNV21;  // YUV422
+      } else if (luma_v == 1 && luma_h == 1) {
+        row_callback = &JpegI444ToNV21;  // YUV444
+      }
+    }
+  } else if (mjpeg_decoder.GetColorSpace() ==
+                 MJpegDecoder::kColorSpaceGrayscale &&
+             mjpeg_decoder.GetNumComponents() == 1 &&
+             mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+             mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+    row_callback = &JpegI400ToNV21;  // YUV400
+  }
+
+  if (!row_callback) {
+    // Unknown colorspace.
+    mjpeg_decoder.UnloadFrame();
+    return 1;
+  }
+
+  NV21Buffers bufs = {dst_y,         dst_stride_y, dst_vu,
+                      dst_stride_vu, dst_width,    dst_height};
+  const LIBYUV_BOOL decoded =
+      mjpeg_decoder.DecodeToCallback(row_callback, &bufs, dst_width,
+                                     dst_height);
+  return decoded ? 0 : 1;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback producing NV12: reuses
+// I420ToNV21 with the two source chroma planes (data[1] and data[2])
+// exchanged, then advances the NV21Buffers cursor that |opaque| points at.
+static void JpegI420ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* const out = static_cast<NV21Buffers*>(opaque);
+  // One chroma row per two luma rows, rounded up for an odd band height.
+  const int chroma_rows = (rows + 1) >> 1;
+  // Use NV21 with VU swapped.
+  I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             out->y, out->y_stride, out->vu, out->vu_stride, out->w, rows);
+  out->h -= rows;
+  out->vu += chroma_rows * out->vu_stride;
+  out->y += rows * out->y_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback producing NV12: reuses
+// I422ToNV21 with the two source chroma planes (data[1] and data[2])
+// exchanged, then advances the NV21Buffers cursor that |opaque| points at.
+static void JpegI422ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* const out = static_cast<NV21Buffers*>(opaque);
+  // The destination chroma cursor moves half as many rows as the luma one.
+  const int chroma_rows = (rows + 1) >> 1;
+  // Use NV21 with VU swapped.
+  I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             out->y, out->y_stride, out->vu, out->vu_stride, out->w, rows);
+  out->h -= rows;
+  out->vu += chroma_rows * out->vu_stride;
+  out->y += rows * out->y_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback producing NV12: reuses
+// I444ToNV21 with the two source chroma planes (data[1] and data[2])
+// exchanged, then advances the NV21Buffers cursor that |opaque| points at.
+static void JpegI444ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* const out = static_cast<NV21Buffers*>(opaque);
+  // The destination chroma cursor moves half as many rows as the luma one.
+  const int chroma_rows = (rows + 1) >> 1;
+  // Use NV21 with VU swapped.
+  I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+             out->y, out->y_stride, out->vu, out->vu_stride, out->w, rows);
+  out->h -= rows;
+  out->vu += chroma_rows * out->vu_stride;
+  out->y += rows * out->y_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback producing NV12 from
+// grayscale rows (only data[0] is used), then advancing the NV21Buffers
+// cursor that |opaque| points at.
+static void JpegI400ToNV12(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  NV21Buffers* const out = static_cast<NV21Buffers*>(opaque);
+  // The destination chroma cursor moves half as many rows as the luma one.
+  const int chroma_rows = (rows + 1) >> 1;
+  // Use NV21 since there is no UV plane.
+  I400ToNV21(data[0], strides[0], out->y, out->y_stride, out->vu,
+             out->vu_stride, out->w, rows);
+  out->h -= rows;
+  out->vu += chroma_rows * out->vu_stride;
+  out->y += rows * out->y_stride;
+}
+
+// MJPG (Motion JPEG) to NV12.
+// Loads the compressed frame, checks that its dimensions equal
+// src_width x src_height, and decodes it band by band into the Y and UV
+// destination planes through the row callback matching the frame's sampling.
+// Returns 0 on success, -1 when the sample size is unknown, and 1 on any
+// runtime failure (load error, unexpected dimensions, unsupported sampling or
+// decode failure).
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (sample_size == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  if (!mjpeg_decoder.LoadFrame(sample, sample_size)) {
+    return 1;  // runtime failure: frame could not be loaded
+  }
+  if (mjpeg_decoder.GetWidth() != src_width ||
+      mjpeg_decoder.GetHeight() != src_height) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+
+  // Select the row callback from the colorspace and sampling factors.
+  void (*row_callback)(void*, const uint8_t* const*, const int*, int) = 0;
+  if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+      mjpeg_decoder.GetNumComponents() == 3) {
+    const int luma_v = mjpeg_decoder.GetVertSampFactor(0);
+    const int luma_h = mjpeg_decoder.GetHorizSampFactor(0);
+    // Every supported YCbCr layout has 1x1 sampling factors on both chroma
+    // components; only the luma factors tell the layouts apart.
+    if (mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      if (luma_v == 2 && luma_h == 2) {
+        row_callback = &JpegI420ToNV12;  // YUV420
+      } else if (luma_v == 1 && luma_h == 2) {
+        row_callback = &JpegI422ToNV12;  // YUV422
+      } else if (luma_v == 1 && luma_h == 1) {
+        row_callback = &JpegI444ToNV12;  // YUV444
+      }
+    }
+  } else if (mjpeg_decoder.GetColorSpace() ==
+                 MJpegDecoder::kColorSpaceGrayscale &&
+             mjpeg_decoder.GetNumComponents() == 1 &&
+             mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+             mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+    row_callback = &JpegI400ToNV12;  // YUV400
+  }
+
+  if (!row_callback) {
+    // Unknown colorspace.
+    mjpeg_decoder.UnloadFrame();
+    return 1;
+  }
+
+  // Use NV21Buffers but with UV instead of VU.
+  NV21Buffers bufs = {dst_y,         dst_stride_y, dst_uv,
+                      dst_stride_uv, dst_width,    dst_height};
+  const LIBYUV_BOOL decoded =
+      mjpeg_decoder.DecodeToCallback(row_callback, &bufs, dst_width,
+                                     dst_height);
+  return decoded ? 0 : 1;
+}
+
+// Destination cursor handed (as the opaque pointer) to the JpegI4xxToARGB row
+// callbacks. The field order must not change: the structure is filled with a
+// positional aggregate initializer.
+struct ARGBBuffers {
+ // Next ARGB row to write; each callback advances it by the rows it handled.
+ uint8_t* argb;
+ // Bytes per ARGB row.
+ int argb_stride;
+ // Width in pixels forwarded to the row converters.
+ int w;
+ // Rows still to be written; each callback subtracts the rows it handled.
+ int h;
+};
+
+// Row callback for MJpegDecoder::DecodeToCallback: forwards one decoded band
+// of I420 rows to I420ToARGB and then advances the ARGBBuffers cursor that
+// |opaque| points at.
+static void JpegI420ToARGB(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* const out = static_cast<ARGBBuffers*>(opaque);
+  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             out->argb, out->argb_stride, out->w, rows);
+  out->h -= rows;
+  out->argb += rows * out->argb_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback: forwards one decoded band
+// of I422 rows to I422ToARGB and then advances the ARGBBuffers cursor that
+// |opaque| points at.
+static void JpegI422ToARGB(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* const out = static_cast<ARGBBuffers*>(opaque);
+  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             out->argb, out->argb_stride, out->w, rows);
+  out->h -= rows;
+  out->argb += rows * out->argb_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback: forwards one decoded band
+// of I444 rows to I444ToARGB and then advances the ARGBBuffers cursor that
+// |opaque| points at.
+static void JpegI444ToARGB(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* const out = static_cast<ARGBBuffers*>(opaque);
+  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             out->argb, out->argb_stride, out->w, rows);
+  out->h -= rows;
+  out->argb += rows * out->argb_stride;
+}
+
+// Row callback for MJpegDecoder::DecodeToCallback: forwards one decoded band
+// of grayscale rows (only data[0] is used) to I400ToARGB and then advances
+// the ARGBBuffers cursor that |opaque| points at.
+static void JpegI400ToARGB(void* opaque,
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
+  ARGBBuffers* const out = static_cast<ARGBBuffers*>(opaque);
+  I400ToARGB(data[0], strides[0], out->argb, out->argb_stride, out->w, rows);
+  out->h -= rows;
+  out->argb += rows * out->argb_stride;
+}
+
+// MJPG (Motion JPEG) to ARGB.
+// Loads the compressed frame, checks that its dimensions equal
+// src_width x src_height, and decodes it band by band into dst_argb through
+// the row callback matching the frame's sampling.
+// Returns 0 on success, -1 when the sample size is unknown, and 1 on any
+// runtime failure (load error, unexpected dimensions, unsupported sampling or
+// decode failure).
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8_t* src_mjpg,
+               size_t src_size_mjpg,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
+  if (src_size_mjpg == kUnknownDataSize) {
+    // ERROR: MJPEG frame size unknown
+    return -1;
+  }
+
+  // TODO(fbarchard): Port MJpeg to C.
+  MJpegDecoder mjpeg_decoder;
+  if (!mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg)) {
+    return 1;  // runtime failure: frame could not be loaded
+  }
+  if (mjpeg_decoder.GetWidth() != src_width ||
+      mjpeg_decoder.GetHeight() != src_height) {
+    // ERROR: MJPEG frame has unexpected dimensions
+    mjpeg_decoder.UnloadFrame();
+    return 1;  // runtime failure
+  }
+
+  // Select the row callback from the colorspace and sampling factors.
+  void (*row_callback)(void*, const uint8_t* const*, const int*, int) = 0;
+  if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+      mjpeg_decoder.GetNumComponents() == 3) {
+    const int luma_v = mjpeg_decoder.GetVertSampFactor(0);
+    const int luma_h = mjpeg_decoder.GetHorizSampFactor(0);
+    // Every supported YCbCr layout has 1x1 sampling factors on both chroma
+    // components; only the luma factors tell the layouts apart.
+    if (mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+      if (luma_v == 2 && luma_h == 2) {
+        row_callback = &JpegI420ToARGB;  // YUV420
+      } else if (luma_v == 1 && luma_h == 2) {
+        row_callback = &JpegI422ToARGB;  // YUV422
+      } else if (luma_v == 1 && luma_h == 1) {
+        row_callback = &JpegI444ToARGB;  // YUV444
+      }
+    }
+  } else if (mjpeg_decoder.GetColorSpace() ==
+                 MJpegDecoder::kColorSpaceGrayscale &&
+             mjpeg_decoder.GetNumComponents() == 1 &&
+             mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+             mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+    row_callback = &JpegI400ToARGB;  // YUV400
+  }
+
+  if (!row_callback) {
+    // TODO(fbarchard): Implement conversion for any other
+    // colorspace/subsample factors that occur in practice. ERROR: Unable to
+    // convert MJPEG frame because format is not supported
+    mjpeg_decoder.UnloadFrame();
+    return 1;
+  }
+
+  ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
+  const LIBYUV_BOOL decoded =
+      mjpeg_decoder.DecodeToCallback(row_callback, &bufs, dst_width,
+                                     dst_height);
+  return decoded ? 0 : 1;
+}
+
+#endif // HAVE_JPEG
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_to_argb.cc b/third_party/aom/third_party/libyuv/source/convert_to_argb.cc
new file mode 100644
index 0000000000..84df16c8c2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,382 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+//   sample, sample_size  source frame and its size in bytes; with MJPEG it is
+//                        the compressed size of the frame.
+//   dst_argb, dst_stride_argb  destination ARGB image and its row stride.
+//   crop_x, crop_y       top-left corner of the crop window in the source.
+//   src_width            used for source stride computation.
+//   src_height           used to compute location of planes; a negative
+//                        value indicates inversion (vertical flip).
+//   crop_width, crop_height  size of the crop window; the sign of
+//                        crop_height is ignored.
+//   rotation             rotation applied to the cropped image.
+//   fourcc               source format; canonicalized before dispatch.
+// Returns -1 for invalid arguments or an unknown fourcc, 1 when the temporary
+// buffer cannot be allocated, and otherwise the result of the converter
+// and/or ARGBRotate that was called.
+
+// TODO(fbarchard): Add the following:
+// H010ToARGB
+// I010ToARGB
+
+LIBYUV_API
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8_t* src;
+  const uint8_t* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if destination dst_argb is same as source sample,
+  // also enable temporary buffer.
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+  uint8_t* dest_argb = dst_argb;
+  int dest_dst_stride_argb = dst_stride_argb;
+  uint8_t* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  // A negative source height requests a vertical flip; the converters take
+  // that request as a negative height.
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * 4 * abs_crop_height;
+    rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    // From here on the converters write into the temporary buffer; the real
+    // destination is kept in dest_argb / dest_dst_stride_argb.
+    dst_argb = rotate_buffer;
+    dst_stride_argb = crop_width * 4;
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                      inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                    inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      // Without a temporary buffer a rotated ARGB source is handled after the
+      // switch by rotating straight from the sample. In every other case the
+      // cropped pixels must be copied here; in particular the in-place case
+      // (need_buf set because dst_argb == sample) has to fill the temporary
+      // buffer, otherwise ARGBRotate below would read uninitialized memory.
+      if (need_buf || !rotation) {
+        src = sample + (src_width * crop_y + crop_x) * 4;
+        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
+                       crop_width, inv_crop_height);
+      }
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AR30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AB30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_J400:
+      src = sample + src_width * crop_y + crop_x;
+      r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+
+    // Biplanar formats
+    // The Y plane is addressed with a stride of src_width, so the interleaved
+    // chroma plane starts src_width * abs_src_height bytes into the sample.
+    // Its rows are aligned_src_width bytes wide and each covers two luma
+    // rows; crop_x is rounded down to an even value so the pointer lands on
+    // the first byte of a two-byte chroma pair.
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Same plane layout as NV12; the chroma plane is handed to NV21ToARGB.
+      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    // Triplanar formats
+    // For the 4:2:0 layouts both chroma planes are halfwidth x halfheight and
+    // follow the Y plane, so the crop origin maps to chroma row crop_y / 2
+    // and chroma column crop_x / 2 in each of them.
+    case FOURCC_I420:
+    case FOURCC_YV12: {
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      // YV12 stores the V plane before the U plane.
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      }
+      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J420: {
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             halfwidth * (crop_y / 2) + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_H420: {
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             halfwidth * (crop_y / 2) + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U420: {
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             halfwidth * (crop_y / 2) + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    // 4:2:2 layouts: chroma planes are halfwidth wide and full height.
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      // YV16 stores the V plane before the U plane.
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      } else {
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_H422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U422: {
+      int halfwidth = (src_width + 1) / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      // Use the U4xx converter, like the U420 and U444 cases do.
+      r = U422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    // 4:4:4 layouts: three full-size planes stored back to back.
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      // YV24 stores the V plane before the U plane.
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_J444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_H444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+    case FOURCC_U444: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+      break;
+    }
+
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    // Second pass: rotate (or, for rotation 0, copy) the temporary ARGB image
+    // into the caller's destination. Skipped when the conversion failed.
+    if (!r) {
+      r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  } else if (rotation) {
+    // Only reachable for an ARGB source that is not converted in place:
+    // rotate straight from the sample into the destination.
+    src = sample + (src_width * crop_y + crop_x) * 4;
+    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                   inv_crop_height, rotation);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_to_i420.cc b/third_party/aom/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 0000000000..ac6eeab24e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+//   sample, sample_size  source frame and its size in bytes; with MJPEG it is
+//                        the compressed size of the frame.
+//   dst_y/u/v, dst_stride_y/u/v  destination planes and their row strides.
+//   crop_x, crop_y       top-left corner of the crop window in the source.
+//   src_width            used for source stride computation.
+//   src_height           used to compute location of planes; a negative
+//                        value indicates inversion (vertical flip).
+//   crop_width, crop_height  size of the crop window; the sign of
+//                        crop_height is ignored.
+//   rotation             rotation applied to the cropped image.
+//   fourcc               source format; canonicalized before dispatch.
+// Returns -1 for invalid arguments or an unknown fourcc, 1 when the temporary
+// buffer cannot be allocated, and otherwise the result of the converter
+// and/or I420Rotate that was called.
+LIBYUV_API
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
+                  enum RotationMode rotation,
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8_t* src;
+  const uint8_t* src_uv;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  // TODO(nisse): Why allow crop_height < 0?
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      dst_y == sample;
+  // When the conversion is staged through the temporary buffer, the final
+  // I420Rotate below performs the rotation. The one-pass rotating converters
+  // must then not rotate as well, or an in-place I420/YV12/NV12/NV21
+  // conversion would be rotated twice.
+  const enum RotationMode one_pass_rotation = need_buf ? kRotate0 : rotation;
+  // The caller's destination; dst_* are redirected to the temporary buffer
+  // when need_buf is set.
+  uint8_t* tmp_y = dst_y;
+  uint8_t* tmp_u = dst_u;
+  uint8_t* tmp_v = dst_v;
+  int tmp_y_stride = dst_stride_y;
+  int tmp_u_stride = dst_stride_u;
+  int tmp_v_stride = dst_stride_v;
+  uint8_t* rotate_buffer = NULL;
+  // A negative source height requests a vertical flip; the converters take
+  // that request as a negative height.
+  const int inv_crop_height =
+      (src_height < 0) ? -abs_crop_height : abs_crop_height;
+
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if destination dst_y is same as source sample,
+  // also enable temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+    }
+    dst_y = rotate_buffer;
+    dst_u = dst_y + y_size;
+    dst_v = dst_u + uv_size;
+    dst_stride_y = crop_width;
+    dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                       dst_stride_u, dst_v, dst_stride_v, crop_width,
+                       inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                      dst_stride_u, dst_v, dst_stride_v, crop_width,
+                      inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                    dst_stride_u, dst_v, dst_stride_v, crop_width,
+                    inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
+      break;
+    // TODO(fbarchard): Add AR30 and AB30
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
+      break;
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_u, dst_stride_u, dst_v,
+                           dst_stride_v, crop_width, inv_crop_height,
+                           one_pass_rotation);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with dst_u and dst_v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_v, dst_stride_v, dst_u,
+                           dst_stride_u, crop_width, inv_crop_height,
+                           one_pass_rotation);
+      break;
+    // Triplanar formats
+    case FOURCC_I420:
+    case FOURCC_YV12: {
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      int halfheight = (abs_src_height + 1) / 2;
+      // YV12 stores the V plane before the U plane.
+      if (format == FOURCC_YV12) {
+        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
+      } else {
+        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+                (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
+      }
+      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height,
+                     one_pass_rotation);
+      break;
+    }
+    case FOURCC_I422:
+    case FOURCC_YV16: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      int halfwidth = (src_width + 1) / 2;
+      // YV16 stores the V plane before the U plane.
+      if (format == FOURCC_YV16) {
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
+      } else {
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
+      }
+      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      // YV24 stores the V plane before the U plane.
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    // Second pass: rotate (or, for rotation 0, copy) the temporary I420 image
+    // into the caller's planes. Skipped when the conversion failed.
+    if (!r) {
+      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+                     rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/cpu_id.cc b/third_party/aom/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 0000000000..72a7fb82f6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h> // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
+ !defined(__clang__)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER)
+// Runs the x86 CPUID instruction with info_eax as the leaf and info_ecx as the
+// sub-leaf, and stores the resulting eax, ebx, ecx and edx register values in
+// cpu_info[0], cpu_info[1], cpu_info[2] and cpu_info[3] respectively.
+LIBYUV_API
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
+#if defined(_MSC_VER)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ __cpuidex(cpu_info, info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
+#else // Visual C but not x86
+ // __cpuid() takes no sub-leaf argument, so only sub-leaf 0 is queried here;
+ // any other sub-leaf reports all zeros.
+ if (info_ecx == 0) {
+ __cpuid(cpu_info, info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
+ }
+#endif
+// GCC version uses inline x86 assembly.
+#else // defined(_MSC_VER)
+ // info_eax and info_ecx are read-write asm operands ("+a", "+c"); ebx and edx
+ // are captured as outputs only.
+ int info_ebx, info_edx;
+ asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ // ebx is parked in edi across cpuid; the xchg restores ebx and leaves
+ // cpuid's ebx result in edi, which is the "=D" output.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D"(info_ebx),
+#else
+ "cpuid \n"
+ : "=b"(info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
+}
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+// Fallback for targets without a usable cpuid: the leaf and sub-leaf are
+// ignored and all four result slots are reported as zero.
+LIBYUV_API
+void CpuId(int eax, int ecx, int* cpu_info) {
+  (void)eax;
+  (void)ecx;
+  for (int reg = 0; reg < 4; ++reg) {
+    cpu_info[reg] = 0;
+  }
+}
+#endif
+
+// For VS2010 and earlier emit can be used:
+// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+// __asm {
+// xor ecx, ecx // xcr 0
+// xgetbv
+// mov xcr0, eax
+// }
+// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
+// https://code.google.com/p/libyuv/issues/detail?id=529
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", off)
+#endif
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+ defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+// X86 CPUs have xgetbv to detect whether the OS saves the high parts of ymm
+// registers.
+// Returns the value xgetbv leaves in eax for extended control register 0, or 0
+// when neither compiler branch below is available.
+int GetXCR0() {
+ int xcr0 = 0;
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
+#elif defined(__i386__) || defined(__x86_64__)
+ // 0x0f 0x01 0xd0 is the xgetbv opcode emitted as raw bytes. ecx = 0 is the
+ // input, eax is the output and edx is declared clobbered.
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+ return xcr0;
+}
+#else
+// xgetbv unavailable to query for OSSave support. Return 0.
+#define GetXCR0() 0
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+// Return optimization to previous setting.
+#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
+#pragma optimize("g", on)
+#endif
+
+// based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+// Scans the cpuinfo-style text file |cpuinfo_name| and returns kCpuHasNEON if
+// a "Features" line advertises " neon" (followed by a space or newline) or
+// " asimd"; returns 0 otherwise. If the file cannot be opened, Neon is
+// assumed and kCpuHasNEON is returned.
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
+  FILE* cpuinfo = fopen(cpuinfo_name, "r");
+  if (cpuinfo == NULL) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
+  }
+
+  int caps = 0;
+  char line[512];
+  while (fgets(line, sizeof(line) - 1, cpuinfo)) {
+    // Only lines that begin with "Features" are of interest.
+    if (memcmp(line, "Features", 8) != 0) {
+      continue;
+    }
+    const char* neon = strstr(line, " neon");
+    const bool has_neon = neon && (neon[5] == ' ' || neon[5] == '\n');
+    // aarch64 uses asimd for Neon.
+    if (has_neon || strstr(line, " asimd")) {
+      caps = kCpuHasNEON;
+      break;
+    }
+  }
+  fclose(cpuinfo);
+  return caps;
+}
+
+// TODO(fbarchard): Consider read_msa_ir().
+// Scans the cpuinfo-style text file |cpuinfo_name| and returns a combination
+// of kCpuHasMMI and kCpuHasMSA derived from its "cpu model" and
+// "ASEs implemented" lines. Returns 0 if the file cannot be opened.
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
+  FILE* cpuinfo = fopen(cpuinfo_name, "r");
+  if (cpuinfo == NULL) {
+    // Assume nothing if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return 0;
+  }
+
+  int caps = 0x0;
+  char line[512];
+  while (fgets(line, sizeof(line) - 1, cpuinfo)) {
+    const bool is_model_line = memcmp(line, "cpu model", 9) == 0;
+    if (is_model_line) {
+      // Workaround early kernel without mmi in ASEs line.
+      if (strstr(line, "Loongson-3")) {
+        caps |= kCpuHasMMI;
+      } else if (strstr(line, "Loongson-2K")) {
+        caps |= kCpuHasMMI | kCpuHasMSA;
+      }
+    }
+
+    if (memcmp(line, "ASEs implemented", 16) != 0) {
+      continue;
+    }
+    const bool has_mmi =
+        strstr(line, "loongson-mmi") && strstr(line, "loongson-ext");
+    if (has_mmi) {
+      caps |= kCpuHasMMI;
+    }
+    if (strstr(line, "msa")) {
+      caps |= kCpuHasMSA;
+    }
+    // ASEs is the last line, so we can stop reading here.
+    break;
+  }
+  fclose(cpuinfo);
+  return caps;
+}
+
+// Builds the kCpuHas* bitmask for the CPU this code is running on. Which
+// detection path runs is selected at compile time by the target architecture
+// macros; kCpuInitialized is always set in the returned value.
+static SAFEBUFFERS int GetCpuFlags(void) {
+ int cpu_info = 0;
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86))
+ int cpu_info0[4] = {0, 0, 0, 0};
+ int cpu_info1[4] = {0, 0, 0, 0};
+ int cpu_info7[4] = {0, 0, 0, 0};
+ CpuId(0, 0, cpu_info0);
+ CpuId(1, 0, cpu_info1);
+ // Leaf 7 is only queried when leaf 0 reports a value of at least 7 in eax;
+ // otherwise cpu_info7 stays all zeros and no leaf-7 feature is reported.
+ if (cpu_info0[0] >= 7) {
+ CpuId(7, 0, cpu_info7);
+ }
+ // Index 1, 2 and 3 of each cpu_info array hold ebx, ecx and edx (see CpuId).
+ cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
+
+ // AVX requires OS saves YMM registers.
+ // All of bits 26-28 of leaf 1 ecx must be set, and bits 1-2 of XCR0.
+ if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
+ ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
+ cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
+
+ // Detect the AVX512 feature family and GFNI. These are only reported when
+ // bits 5-7 of XCR0 are all set (presumably the OS-enabled AVX-512 state
+ // components - confirm against the CPU vendor manual).
+ if ((GetXCR0() & 0xe0) == 0xe0) {
+ cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+ cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+ cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+ cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
+ }
+ }
+#endif
+#if defined(__mips__) && defined(__linux__)
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
+ cpu_info |= kCpuHasMIPS;
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+ cpu_info = kCpuHasNEON;
+// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
+// flag in it.
+// So for aarch64, neon enabling is hard coded here.
+#endif
+// NOTE(review): the #if/#else below is not chained to the #if above, so the
+// assignment above is always overwritten: by the same value on aarch64, and
+// by the ArmCpuCaps() result on 32-bit ARM. Confirm this is intended.
+#if defined(__aarch64__)
+ cpu_info = kCpuHasNEON;
+#else
+ // Linux arm parse text file for neon detect.
+ cpu_info = ArmCpuCaps("/proc/cpuinfo");
+#endif
+ cpu_info |= kCpuHasARM;
+#endif // __arm__
+ cpu_info |= kCpuInitialized;
+ return cpu_info;
+}
+
+// Note that use of this function is not thread safe.
+// Detects the CPU flags, keeps only those also set in |enable_flags|, stores
+// the result through SetCpuFlags() and returns it.
+LIBYUV_API
+int MaskCpuFlags(int enable_flags) {
+  const int detected_flags = GetCpuFlags();
+  const int enabled_flags = detected_flags & enable_flags;
+  SetCpuFlags(enabled_flags);
+  return enabled_flags;
+}
+
+// Initializes the CPU flags with every detected feature left enabled (a mask
+// of all ones) and returns the resulting flags.
+LIBYUV_API
+int InitCpuFlags(void) {
+  const int kEnableAllFlags = -1;
+  return MaskCpuFlags(kEnableAllFlags);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 0000000000..adba832f53
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,585 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable : 4324)
+#endif
+
+#endif
+
+#include <stdio.h> // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+// libjpeg error manager extended with the jmp_buf that ErrorHandler() jumps
+// to. ErrorHandler() reinterprets the jpeg_error_mgr pointer held by libjpeg
+// as a SetJmpErrorMgr pointer, which is why |base| has to be the first member.
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+void OutputHandler(jpeg_common_struct* cinfo);
+
+// Allocates the libjpeg decompress, source-manager and (with HAVE_SETJMP)
+// error-manager objects, installs this file's callbacks on them and creates
+// the decompressor. Output buffers are not allocated here; they start out NULL
+// with num_outbufs_ == 0.
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(LIBYUV_FALSE),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ // NOTE(review): decompress_struct_->err is only assigned inside this
+ // HAVE_SETJMP block - confirm builds without HAVE_SETJMP are not expected to
+ // create a decoder.
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+ error_mgr_->base.output_message = &OutputHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ // The buffer vector always describes exactly one input buffer, buf_.
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+// Releases the output buffers, tears down the libjpeg decompressor and frees
+// the libjpeg objects that the constructor allocated.
+MJpegDecoder::~MJpegDecoder() {
+  DestroyOutputBuffers();
+
+  // The decompressor is destroyed while the objects it points at still exist.
+  jpeg_destroy_decompress(decompress_struct_);
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  delete source_mgr_;
+  delete decompress_struct_;
+}
+
+// Validates |src| as a JPEG, parses its header and (re)sizes the per-component
+// scanline pointer arrays and data buffers for the frame.
+//
+// src:     start of the compressed frame; it is referenced, not copied.
+// src_len: size of the compressed frame in bytes.
+// Returns LIBYUV_TRUE on success, LIBYUV_FALSE if validation or header parsing
+// fails (including errors libjpeg reports through the setjmp error handler).
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = static_cast<int>(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    // Remember whether the row count changed: the data buffer below is sized
+    // from it and must then be reallocated as well.
+    const bool scanlines_changed = scanlines_sizes_[i] != scanlines_size;
+    if (scanlines_changed) {
+      // Allocated with new[], so it must be released with delete[]
+      // (delete[] on a null pointer is a no-op).
+      delete[] scanlines_[i];
+      scanlines_[i] = new uint8_t*[scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+    // the preceding scanlines, the padding is not needed/wanted because the
+    // following addresses will already be valid (they are the initial bytes of
+    // the next scanline) and will be overwritten when jpeglib writes out that
+    // next scanline.
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;
+    // The buffer holds scanlines_size rows of databuf_stride bytes, so it has
+    // to be reallocated when either value changes. Checking only the stride
+    // would leave the buffer too small when a later frame has the same stride
+    // but more scanlines per iMCU row.
+    if (scanlines_changed || databuf_strides_[i] != databuf_stride) {
+      delete[] databuf_[i];
+      databuf_[i] = new uint8_t[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+// Integer division that rounds the quotient up; written for non-negative
+// numerators and positive denominators.
+static int DivideAndRoundUp(int numerator, int denominator) {
+  const int biased_numerator = numerator + denominator - 1;
+  return biased_numerator / denominator;
+}
+
+// Plain integer division (the quotient is truncated).
+static int DivideAndRoundDown(int numerator, int denominator) {
+  const int quotient = numerator / denominator;
+  return quotient;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  const int width = decompress_struct_->image_width;
+  return width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  const int height = decompress_struct_->image_height;
+  return height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+  const int color_space = decompress_struct_->jpeg_color_space;
+  return color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+  const int component_count = decompress_struct_->num_components;
+  return component_count;
+}
+
+// Horizontal sample factor of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+  const int factor = decompress_struct_->comp_info[component].h_samp_factor;
+  return factor;
+}
+
+// Vertical sample factor of the n-th component.
+int MJpegDecoder::GetVertSampFactor(int component) {
+  const int factor = decompress_struct_->comp_info[component].v_samp_factor;
+  return factor;
+}
+
+// Horizontal subsampling of the component relative to the largest horizontal
+// sample factor in the frame.
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_h_samp_factor;
+  const int component_factor = GetHorizSampFactor(component);
+  return max_factor / component_factor;
+}
+
+// Vertical subsampling of the component relative to the largest vertical
+// sample factor in the frame.
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+  const int max_factor = decompress_struct_->max_v_samp_factor;
+  const int component_factor = GetVertSampFactor(component);
+  return max_factor / component_factor;
+}
+
+// Number of image scanlines covered by one iMCU row.
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+  const int max_factor = decompress_struct_->max_v_samp_factor;
+  return max_factor * DCTSIZE;
+}
+
+// Number of scanlines of the given component in one iMCU row.
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+  const int vertical_subsampling = GetVertSubSampFactor(component);
+  const int image_scanlines = GetImageScanlinesPerImcuRow();
+  return DivideAndRoundUp(image_scanlines, vertical_subsampling);
+}
+
+// Width of the component: the frame width divided by the component's
+// horizontal subsampling, rounded up.
+int MJpegDecoder::GetComponentWidth(int component) {
+  const int horizontal_subsampling = GetHorizSubSampFactor(component);
+  const int frame_width = GetWidth();
+  return DivideAndRoundUp(frame_width, horizontal_subsampling);
+}
+
+// Height of the component: the frame height divided by the component's
+// vertical subsampling, rounded up.
+int MJpegDecoder::GetComponentHeight(int component) {
+  const int vertical_subsampling = GetVertSubSampFactor(component);
+  const int frame_height = GetHeight();
+  return DivideAndRoundUp(frame_height, vertical_subsampling);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+  const int component_width = GetComponentWidth(component);
+  return (component_width + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+// Number of samples in the component (width times height, without padding).
+int MJpegDecoder::GetComponentSize(int component) {
+  const int component_width = GetComponentWidth(component);
+  const int component_height = GetComponentHeight(component);
+  return component_width * component_height;
+}
+
+// Aborts the current decompression on the decoder object.
+// Returns LIBYUV_TRUE on success, or LIBYUV_FALSE if libjpeg reported an error
+// through the setjmp-based error handler.
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+// Decodes the loaded frame into the caller's plane buffers, one per component.
+//
+// planes:     array of destination pointers, one per component. Each plane is
+//             written tightly packed (row pitch == component width) and each
+//             pointer in the array is advanced past the rows written to it.
+// dst_width:  must equal GetWidth().
+// dst_height: must not exceed GetHeight(); when smaller, (GetHeight() -
+//             dst_height) / 2 image rows are skipped at the top.
+// Returns LIBYUV_FALSE on bad dimensions or on any decode error.
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // Compute amount of lines to skip to implement vertical crop.
+ // TODO(fbarchard): Ensure skip is a multiple of maximum component
+ // subsample. ie 2
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ // Convert the image-row skip into rows of this component.
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy =
+ GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i),
+ scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ // The strict '>' leaves a final, exactly full iMCU row to the partial-row
+ // branch below.
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+ GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+// Decodes the loaded frame one iMCU row at a time and hands each decoded batch
+// of rows to |fn| instead of copying it into caller-owned planes.
+//
+// fn:         called as fn(opaque, databuf_, databuf_strides_, rows), where
+//             rows is the number of image rows available in this batch.
+// opaque:     passed through to |fn| unchanged.
+// dst_width:  must equal GetWidth().
+// dst_height: must not exceed GetHeight(); when smaller, (GetHeight() -
+//             dst_height) / 2 image rows are skipped at the top.
+// Returns LIBYUV_FALSE on bad dimensions or on any decode error.
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+ void* opaque,
+ int dst_width,
+ int dst_height) {
+ if (dst_width != GetWidth() || dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // Whole iMCU rows that fall inside the skipped region are decoded and
+ // discarded.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
+// libjpeg source-manager init_source callback: loads the first input buffer by
+// delegating to fill_input_buffer(); its return value is ignored.
+void init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+// libjpeg source-manager fill_input_buffer callback. Points the source manager
+// at the next buffer of the BufferVector stored in cinfo->client_data and
+// advances the vector position. Returns TRUE when a buffer was supplied and
+// FALSE when the vector is already exhausted.
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* input = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  if (input->pos < input->len) {
+    jpeg_source_mgr* source = cinfo->src;
+    source->next_input_byte = input->buffers[input->pos].data;
+    source->bytes_in_buffer = input->buffers[input->pos].len;
+    ++input->pos;
+    return TRUE;
+  }
+
+  // Don't assert-fail when fuzzing.
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  assert(0 && "No more data");
+#endif
+  // ERROR: No more data
+  return FALSE;
+}
+
+// libjpeg source-manager skip_input_data callback: discards |num_bytes| bytes
+// of input from the current buffer.
+//
+// A zero or negative count is a no-op, as libjpeg specifies for this callback.
+// Without that guard a negative count would be converted to a huge size_t and
+// throw away the whole buffer. A count larger than what is left empties the
+// buffer (next_input_byte is set to nullptr and bytes_in_buffer to 0).
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
+  if (num_bytes <= 0) {
+    return;
+  }
+  jpeg_source_mgr* src = cinfo->src;
+  size_t bytes = static_cast<size_t>(num_bytes);
+  if (bytes > src->bytes_in_buffer) {
+    src->next_input_byte = nullptr;
+    src->bytes_in_buffer = 0;
+  } else {
+    src->next_input_byte += bytes;
+    src->bytes_in_buffer -= bytes;
+  }
+}
+
+// libjpeg source-manager term_source callback. There is nothing to release
+// here, so it is intentionally empty.
+void term_source(j_decompress_ptr cinfo) {
+ (void)cinfo; // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+// Installed as libjpeg's error_exit callback. It never returns to libjpeg:
+// control goes back to the setjmp() call that last filled setjmp_buffer.
+void ErrorHandler(j_common_ptr cinfo) {
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+ // The message is formatted into buf but not printed or stored anywhere.
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+// ERROR: Error in jpeglib: buf
+#endif
+
+ // cinfo->err is reinterpreted as the enclosing SetJmpErrorMgr, which relies
+ // on the jpeg_error_mgr being that struct's first member.
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+
+// Suppress fprintf warnings.
+// Installed as libjpeg's output_message callback; it deliberately does nothing
+// with the message.
+void OutputHandler(j_common_ptr cinfo) {
+ (void)cinfo;
+}
+
+#endif // HAVE_SETJMP
+
+// Makes sure there are |num_outbufs| output slots. If the count differs from
+// the current one, every existing buffer is destroyed and the four per-slot
+// arrays are recreated with empty entries (NULL pointers, zero sizes). The
+// per-slot buffers themselves are not allocated here.
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs == num_outbufs_) {
+    return;  // Already the right number of slots.
+  }
+
+  // We could perhaps optimize this case to resize the output buffers without
+  // necessarily having to delete and recreate each one, but it's not worth
+  // it.
+  DestroyOutputBuffers();
+
+  scanlines_ = new uint8_t**[num_outbufs];
+  scanlines_sizes_ = new int[num_outbufs];
+  databuf_ = new uint8_t*[num_outbufs];
+  databuf_strides_ = new int[num_outbufs];
+
+  for (int slot = 0; slot < num_outbufs; ++slot) {
+    scanlines_[slot] = NULL;
+    databuf_[slot] = NULL;
+    scanlines_sizes_[slot] = 0;
+    databuf_strides_[slot] = 0;
+  }
+
+  num_outbufs_ = num_outbufs;
+}
+
+// Frees every per-slot buffer and the four per-slot arrays, then resets the
+// members to the empty state (NULL pointers, zero slots).
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int slot = 0; slot < num_outbufs_; ++slot) {
+    delete[] scanlines_[slot];
+    delete[] databuf_[slot];
+  }
+
+  delete[] scanlines_;
+  scanlines_ = NULL;
+
+  delete[] databuf_;
+  databuf_ = NULL;
+
+  delete[] scanlines_sizes_;
+  scanlines_sizes_ = NULL;
+
+  delete[] databuf_strides_;
+  databuf_strides_ = NULL;
+
+  num_outbufs_ = 0;
+}
+
+// Configures the decompressor for raw output and starts decompression.
+// Using JDCT_IFAST and turning do_block_smoothing off improve performance
+// substantially.
+// Returns LIBYUV_TRUE if libjpeg started the decompressor, else LIBYUV_FALSE.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  jpeg_decompress_struct* cinfo = decompress_struct_;
+
+  cinfo->raw_data_out = TRUE;
+  cinfo->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  cinfo->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  cinfo->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  cinfo->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  cinfo->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  // A false result means: ERROR: Couldn't start JPEG decompressor
+  return jpeg_start_decompress(cinfo) ? LIBYUV_TRUE : LIBYUV_FALSE;
+}
+
+// Ends the current decode by aborting the decompressor. Always returns
+// LIBYUV_TRUE.
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+// Fills the scanline pointer table of every output slot so that row j of slot
+// i points at data[i] + j * GetComponentStride(i).
+void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
+  for (int plane = 0; plane < num_outbufs_; ++plane) {
+    uint8_t** rows = scanlines_[plane];
+    const int row_count = scanlines_sizes_[plane];
+    const int stride = GetComponentStride(plane);
+    uint8_t* row_start = data[plane];
+    for (int row = 0; row < row_count; ++row, row_start += stride) {
+      rows[row] = row_start;
+    }
+  }
+}
+
+// Decodes one iMCU row of raw data into the scanline buffers. The result is
+// true only when libjpeg reports exactly one full iMCU row of scanlines read.
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  const int rows_wanted = GetImageScanlinesPerImcuRow();
+  const unsigned int rows_read =
+      jpeg_read_raw_data(decompress_struct_, scanlines_, rows_wanted);
+  return (unsigned int)(rows_wanted) == rows_read;
+}
+
+// The helper function which recognizes the jpeg sub-sampling type.
+// subsample_x / subsample_y hold the per-component subsampling factors and
+// number_of_components says how many entries are valid. Recognized layouts:
+// a single full-resolution component (kJpegYuv400), or three components with a
+// full-resolution first component and both remaining components subsampled
+// 2x2 (kJpegYuv420), 2x1 (kJpegYuv422) or 1x1 (kJpegYuv444). Anything else
+// yields kJpegUnknown.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x,
+    int* subsample_y,
+    int number_of_components) {
+  if (number_of_components != 1 && number_of_components != 3) {
+    return kJpegUnknown;
+  }
+  // In every recognized layout the first component is not subsampled.
+  if (subsample_x[0] != 1 || subsample_y[0] != 1) {
+    return kJpegUnknown;
+  }
+  if (number_of_components == 1) {  // Grey-scale images.
+    return kJpegYuv400;
+  }
+
+  // Color images: both remaining components must share one subsampling.
+  const int chroma_x = subsample_x[1];
+  const int chroma_y = subsample_y[1];
+  if (subsample_x[2] != chroma_x || subsample_y[2] != chroma_y) {
+    return kJpegUnknown;
+  }
+  if (chroma_x == 2 && chroma_y == 2) {
+    return kJpegYuv420;
+  }
+  if (chroma_x == 2 && chroma_y == 1) {
+    return kJpegYuv422;
+  }
+  if (chroma_x == 1 && chroma_y == 1) {
+    return kJpegYuv444;
+  }
+  return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
diff --git a/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc b/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 0000000000..ba0a03ab9e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h> // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to scan for EOI marker (0xff 0xd9).
+// Returns LIBYUV_TRUE if the two-byte marker occurs anywhere within the
+// src_size_mjpg bytes starting at src_mjpg, LIBYUV_FALSE otherwise (including
+// when fewer than two bytes are given).
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  if (src_size_mjpg < 2) {
+    // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
+    return LIBYUV_FALSE;
+  }
+
+  // |last| is the final byte; a marker can start no later than the byte
+  // before it, so only [src_mjpg, last) is searched for 0xff.
+  const uint8_t* const last = src_mjpg + src_size_mjpg - 1;
+  for (const uint8_t* cursor = src_mjpg; cursor < last; ++cursor) {
+    // TODO(fbarchard): scan for 0xd9 instead.
+    cursor = (const uint8_t*)(memchr(cursor, 0xff, last - cursor));
+    if (cursor == NULL) {
+      break;
+    }
+    if (cursor[1] == 0xd9) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Otherwise the loop increment steps over the current 0xff.
+  }
+  // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+// Requires a non-NULL buffer of 64 to 0x7fffffff bytes that begins with
+// 0xff 0xd8 0xff and contains an EOI marker. The last 1024 bytes are searched
+// for the EOI first; the rest of the buffer is only scanned if that fails.
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+  // Maximum size that ValidateJpeg will consider valid.
+  const size_t kMaxJpegSize = 0x7fffffffull;
+  const size_t kBackSearchSize = 1024;
+
+  if (src_size_mjpg < 64) {
+    // ERROR: Invalid jpeg size: src_size_mjpg
+    return LIBYUV_FALSE;
+  }
+  if (src_size_mjpg > kMaxJpegSize) {
+    // ERROR: Invalid jpeg size: src_size_mjpg
+    return LIBYUV_FALSE;
+  }
+  if (!src_mjpg) {
+    return LIBYUV_FALSE;
+  }
+
+  // SOI marker
+  const bool has_start_code =
+      src_mjpg[0] == 0xff && src_mjpg[1] == 0xd8 && src_mjpg[2] == 0xff;
+  if (!has_start_code) {
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+
+  // Look for the End Of Image (EOI) marker near the end of the buffer.
+  size_t forward_size = src_size_mjpg;
+  if (src_size_mjpg > kBackSearchSize) {
+    const uint8_t* tail = src_mjpg + src_size_mjpg - kBackSearchSize;
+    if (ScanEOI(tail, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search. The extra byte keeps a marker
+    // that straddles the start of the tail region findable.
+    forward_size = src_size_mjpg - kBackSearchSize + 1;
+  }
+  // Step over SOI marker and scan for EOI.
+  return ScanEOI(src_mjpg + 2, forward_size - 2);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/planar_functions.cc b/third_party/aom/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 0000000000..4e8908c2eb
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,4107 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h> // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // for ScaleRowDown2
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of 8 bit data.
+//   src_y, src_stride_y: source plane and the byte offset between its rows.
+//   dst_y, dst_stride_y: destination plane and the byte offset between rows.
+//   width, height:       plane size. A negative height writes the destination
+//                        rows bottom-up, which vertically inverts the image.
+// Does nothing when the plane is empty or when source and destination are the
+// same buffer with the same stride.
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int y;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows: with no padding between rows the plane is one long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copy a plane of 16 bit data.
+//   src_y, src_stride_y: source plane and the offset between its rows,
+//                        measured in uint16_t elements.
+//   dst_y, dst_stride_y: destination plane and its row offset, in elements.
+//   width, height:       plane size in elements and rows.
+// Negative heights are not supported; a plane with a non-positive width or
+// height is treated as empty and nothing is copied.
+// TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
+  int y;
+  void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
+  // Empty or unsupported (negative height) plane. Without this check the
+  // coalescing below would multiply the width by a zero or negative height
+  // and still call the row function once.
+  if (width <= 0 || height <= 0) {
+    return;
+  }
+  // Coalesce rows: with no padding between rows the plane is one long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_COPYROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert a plane of 16 bit data to 8 bit
+//   src_y, src_stride_y: 16 bit source plane and its row offset in elements.
+//   dst_y, dst_stride_y: 8 bit destination plane and its row offset in bytes.
+//   scale:               factor passed unchanged to the row function.
+//   width, height:       plane size. A negative height writes the destination
+//                        rows bottom-up, which vertically inverts the image.
+// Does nothing when the plane is empty.
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+                          int width) = Convert16To8Row_C;
+
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT16TO8ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert16To8Row = Convert16To8Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert16To8Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert a plane of 8 bit data to 16 bit
+//   src_y, src_stride_y: 8 bit source plane and its row offset in bytes.
+//   dst_y, dst_stride_y: 16 bit destination plane and its row offset in
+//                        uint16_t elements.
+//   scale:               factor passed unchanged to the row function.
+//   width, height:       plane size. A negative height writes the destination
+//                        rows bottom-up, which vertically inverts the image.
+// Does nothing when the plane is empty.
+// NOTE(review): the "16384 for 10 bits" remark on |scale| is identical to the
+// one on Convert16To8Plane; confirm the value against Convert8To16Row_C.
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
+                          int width) = Convert8To16Row_C;
+
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_CONVERT8TO16ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    Convert8To16Row = Convert8To16Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      Convert8To16Row = Convert8To16Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT8TO16ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert8To16Row = Convert8To16Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert8To16Row = Convert8To16Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert8To16Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copy I422.
+// Copies the Y plane at full width and the U and V planes at half width
+// (rounded up); all three planes have |height| rows.
+//   dst_y may be NULL, in which case the Y plane is skipped. When dst_y is
+//   given, src_y must be non-NULL as well.
+//   A negative height reads the source rows bottom-up, which vertically
+//   inverts the image.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  int halfwidth = (width + 1) >> 1;
+  // A Y destination without a Y source would make CopyPlane read from NULL.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+  return 0;
+}
+
+// Copy I444.
+// Copies the Y, U and V planes, all at full width and |height| rows.
+//   dst_y may be NULL, in which case the Y plane is skipped. When dst_y is
+//   given, src_y must be non-NULL as well.
+//   A negative height reads the source rows bottom-up, which vertically
+//   inverts the image.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  // A Y destination without a Y source would make CopyPlane read from NULL.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copy I400.
+// Copies a single Y plane. A negative height reads the source rows
+// bottom-up, which vertically inverts the image.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (width <= 0 || height == 0 || !src_y || !dst_y) {
+    return -1;
+  }
+  if (height < 0) {
+    // Inverted copy: start at the last source row and walk backwards.
+    const int rows = -height;
+    CopyPlane(src_y + (rows - 1) * src_stride_y, -src_stride_y, dst_y,
+              dst_stride_y, width, rows);
+    return 0;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Convert I420 to I400.
+// Only the Y plane is copied; the U and V arguments are accepted for
+// signature compatibility and ignored. A negative height reads the source
+// rows bottom-up, which vertically inverts the image.
+// Returns 0 on success, -1 when a Y pointer is NULL or the size is empty.
+LIBYUV_API
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  const uint8_t* first_row = src_y;
+  int row_step = src_stride_y;
+  int rows = height;
+  // Chroma is dropped on the way to I400.
+  (void)src_u;
+  (void)src_stride_u;
+  (void)src_v;
+  (void)src_stride_v;
+  if (width <= 0 || height == 0 || !src_y || !dst_y) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (rows < 0) {
+    rows = -rows;
+    first_row = src_y + (rows - 1) * src_stride_y;
+    row_step = -src_stride_y;
+  }
+
+  CopyPlane(first_row, row_step, dst_y, dst_stride_y, width, rows);
+  return 0;
+}
+
+// Copy NV12. Supports inverting.
+// Copies the Y plane and the interleaved UV plane. The UV plane is copied as
+// 2 * ((width + 1) / 2) bytes per row for (height + 1) / 2 rows.
+// A negative height reads the source rows bottom-up, which vertically
+// inverts the image.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_uv,
+             int src_stride_uv,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_uv,
+             int dst_stride_uv,
+             int width,
+             int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  // Each chroma sample is a U,V byte pair, hence halfwidth * 2 bytes per row.
+  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+            halfheight);
+  return 0;
+}
+
+// Copy NV21. Supports inverting.
+// Forwards to NV12Copy with the VU plane in place of the UV plane; the copy
+// does not interpret the chroma bytes, so their order does not matter.
+// Returns the result of NV12Copy.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_vu,
+             int src_stride_vu,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_vu,
+             int dst_stride_vu,
+             int width,
+             int height) {
+  return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+                  dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
+// Support function for NV12 etc UV channels.
+// Splits an interleaved UV plane into separate U and V planes.
+// Width and height are plane sizes (typically half pixel width).
+// A negative height writes the destination rows bottom-up, which vertically
+// inverts the image. Does nothing when the plane is empty.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height) {
+  int y;
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+                     int width) = SplitUVRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == width * 2 && dst_stride_u == width &&
+      dst_stride_v == width) {
+    width *= height;
+    height = 1;
+    src_stride_uv = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitUVRow = SplitUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      SplitUVRow = SplitUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Copy a row of UV.
+    SplitUVRow(src_uv, dst_u, dst_v, width);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
+  }
+}
+
+// Merge separate U and V planes into one interleaved UV plane.
+// Width and height are the size of the U and V planes; each destination row
+// holds width * 2 bytes.
+// A negative height writes the destination rows bottom-up, which vertically
+// inverts the image. Does nothing when the plane is empty.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
+  int y;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_u == width && src_stride_v == width &&
+      dst_stride_uv == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeUVRow = MergeUVRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MergeUVRow = MergeUVRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow(src_u, src_v, dst_uv, width);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+}
+
+// Swap U and V channels in interleaved UV plane.
+// Width and height are counted in UV pairs and rows; each row holds
+// width * 2 bytes.
+// A negative height reads the source rows bottom-up, which vertically
+// inverts the image. Does nothing when the plane is empty.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_vu,
+                 int dst_stride_vu,
+                 int width,
+                 int height) {
+  int y;
+  void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+      SwapUVRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_uv = dst_stride_vu = 0;
+  }
+
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_SWAPUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SwapUVRow = SwapUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SwapUVRow = SwapUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SwapUVRow = SwapUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SwapUVRow = SwapUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SwapUVRow = SwapUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SwapUVRow = SwapUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    SwapUVRow(src_uv, dst_vu, width);
+    src_uv += src_stride_uv;
+    dst_vu += dst_stride_vu;
+  }
+}
+
+// Convert NV21 to NV12.
+// Copies the Y plane (when dst_y is given) and swaps the bytes of every
+// chroma pair from VU to UV order.
+//   dst_y may be NULL, in which case the Y plane is skipped. When dst_y is
+//   given, src_y must be non-NULL as well.
+//   A negative height vertically inverts the image.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  // A Y destination without a Y source would make CopyPlane read from NULL.
+  if ((!src_y && dst_y) || !src_vu || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // The Y copy receives the still-signed height: CopyPlane performs the
+  // inversion itself when the height is negative.
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+    src_stride_vu = -src_stride_vu;
+  }
+
+  SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+              halfheight);
+  return 0;
+}
+
+// Split an interleaved plane of 3 bytes per pixel into three separate
+// planes dst_r, dst_g and dst_b.
+// Width and height are in pixels and rows; each source row holds width * 3
+// bytes.
+// A negative height writes the destination rows bottom-up, which vertically
+// inverts the image. Does nothing when the plane is empty.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height) {
+  int y;
+  void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                      uint8_t* dst_b, int width) = SplitRGBRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
+    dst_stride_r = -dst_stride_r;
+    dst_stride_g = -dst_stride_g;
+    dst_stride_b = -dst_stride_b;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb == width * 3 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_SPLITRGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SplitRGBRow = SplitRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    SplitRGBRow = SplitRGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      SplitRGBRow = SplitRGBRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitRGBRow = SplitRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Split a row of RGB.
+    SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
+    dst_r += dst_stride_r;
+    dst_g += dst_stride_g;
+    dst_b += dst_stride_b;
+    src_rgb += src_stride_rgb;
+  }
+}
+
+// Merge three separate planes src_r, src_g and src_b into one interleaved
+// plane of 3 bytes per pixel.
+// Width and height are in pixels and rows; each destination row holds
+// width * 3 bytes.
+// A negative height writes the destination rows bottom-up, which vertically
+// inverts the image. Does nothing when the plane is empty.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height) {
+  int y;
+  void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+                      const uint8_t* src_b, uint8_t* dst_rgb, int width) =
+      MergeRGBRow_C;
+  // Empty plane. Without this check the row coalescing below would turn a
+  // zero height into a single row of width 0 and still call the row function.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  // Coalesce rows.
+  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+      dst_stride_rgb == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_MERGERGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MergeRGBRow = MergeRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeRGBRow = MergeRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MergeRGBRow = MergeRGBRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MergeRGBRow = MergeRGBRow_MMI;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of R, G and B into a row of RGB.
+    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+    src_r += src_stride_r;
+    src_g += src_stride_g;
+    src_b += src_stride_b;
+    dst_rgb += dst_stride_rgb;
+  }
+}
+
+// Convert YUY2 to I422.
+// Extracts the Y samples into dst_y and the chroma samples into dst_u and
+// dst_v, one source row per destination row.
+// A negative height reads the source rows bottom-up, which vertically
+// inverts the image.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ExtractY)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  void (*ExtractUV)(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v,
+                    int width) = YUY2ToUV422Row_C;
+  if (width <= 0 || height == 0 || !src_yuy2 || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 += (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows. Only done for images of at most 32768 pixels.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick row functions. Later blocks override earlier ones.
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ExtractUV = aligned ? YUY2ToUV422Row_SSE2 : YUY2ToUV422Row_Any_SSE2;
+    ExtractY = aligned ? YUY2ToYRow_SSE2 : YUY2ToYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ExtractUV = aligned ? YUY2ToUV422Row_AVX2 : YUY2ToUV422Row_Any_AVX2;
+    ExtractY = aligned ? YUY2ToYRow_AVX2 : YUY2ToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ExtractY = aligned ? YUY2ToYRow_NEON : YUY2ToYRow_Any_NEON;
+    ExtractUV = aligned ? YUY2ToUV422Row_NEON : YUY2ToUV422Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    const int aligned = IS_ALIGNED(width, 8);
+    ExtractY = aligned ? YUY2ToYRow_MMI : YUY2ToYRow_Any_MMI;
+    ExtractUV = aligned ? YUY2ToUV422Row_MMI : YUY2ToUV422Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ExtractY = aligned ? YUY2ToYRow_MSA : YUY2ToYRow_Any_MSA;
+    ExtractUV = aligned ? YUY2ToUV422Row_MSA : YUY2ToUV422Row_Any_MSA;
+  }
+#endif
+
+  // Chroma first, then luma, for every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ExtractUV(src_yuy2, dst_u, dst_v, width);
+    ExtractY(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert UYVY to I422.
+// Extracts the Y samples into dst_y and the chroma samples into dst_u and
+// dst_v, one source row per destination row.
+// A negative height reads the source rows bottom-up, which vertically
+// inverts the image.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int rows_left;
+  void (*ExtractY)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  void (*ExtractUV)(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v,
+                    int width) = UYVYToUV422Row_C;
+  if (width <= 0 || height == 0 || !src_uyvy || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy += (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Coalesce rows. Only done for images of at most 32768 pixels.
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+  // Pick row functions. Later blocks override earlier ones.
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ExtractUV = aligned ? UYVYToUV422Row_SSE2 : UYVYToUV422Row_Any_SSE2;
+    ExtractY = aligned ? UYVYToYRow_SSE2 : UYVYToYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ExtractUV = aligned ? UYVYToUV422Row_AVX2 : UYVYToUV422Row_Any_AVX2;
+    ExtractY = aligned ? UYVYToYRow_AVX2 : UYVYToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ExtractY = aligned ? UYVYToYRow_NEON : UYVYToYRow_Any_NEON;
+    ExtractUV = aligned ? UYVYToUV422Row_NEON : UYVYToUV422Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    const int aligned = IS_ALIGNED(width, 16);
+    ExtractY = aligned ? UYVYToYRow_MMI : UYVYToYRow_Any_MMI;
+    ExtractUV = aligned ? UYVYToUV422Row_MMI : UYVYToUV422Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    const int aligned = IS_ALIGNED(width, 32);
+    ExtractY = aligned ? UYVYToYRow_MSA : UYVYToYRow_Any_MSA;
+    ExtractUV = aligned ? UYVYToUV422Row_MSA : UYVYToUV422Row_Any_MSA;
+  }
+#endif
+
+  // Chroma first, then luma, for every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ExtractUV(src_uyvy, dst_u, dst_v, width);
+    ExtractY(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert YUY2 to Y.
+// Extracts only the Y samples of a YUY2 image into dst_y.
+// A negative height reads the source rows bottom-up, which vertically
+// inverts the image.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height) {
+  int rows_left;
+  void (*ExtractY)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  if (width <= 0 || height == 0 || !src_yuy2 || !dst_y) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 += (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = 0;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ExtractY = IS_ALIGNED(width, 16) ? YUY2ToYRow_SSE2 : YUY2ToYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ExtractY = IS_ALIGNED(width, 32) ? YUY2ToYRow_AVX2 : YUY2ToYRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ExtractY = IS_ALIGNED(width, 16) ? YUY2ToYRow_NEON : YUY2ToYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ExtractY = IS_ALIGNED(width, 8) ? YUY2ToYRow_MMI : YUY2ToYRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ExtractY = IS_ALIGNED(width, 32) ? YUY2ToYRow_MSA : YUY2ToYRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ExtractY(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Mirror a plane of data.
+// Each source row is passed to a MirrorRow function that writes the matching
+// destination row.
+// A negative height reads the source rows bottom-up, which additionally
+// inverts the image vertically. Does nothing when the plane is empty.
+// See Also I400Mirror
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
+  int y;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  // Empty plane: never call a row function with a non-positive width.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRow = MirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorRow = MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorRow = MirrorRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRow = MirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorRow = MirrorRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MirrorRow = MirrorRow_Any_MMI;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorRow = MirrorRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Mirror a plane of UV data.
+// Each source row is passed to a MirrorUVRow function that writes the
+// matching destination row.
+// A negative height reads the source rows bottom-up, which additionally
+// inverts the image vertically. Does nothing when the plane is empty.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  int y;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  // Empty plane: never call a row function with a non-positive width.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorUVRow = MirrorUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorUVRow = MirrorUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorUVRow = MirrorUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorUVRow = MirrorUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorUVRow = MirrorUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_MSA;
+    }
+  }
+#endif
+
+  // MirrorUV plane
+  for (y = 0; y < height; ++y) {
+    MirrorUVRow(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
+// Mirror I400 with optional flipping
+// Mirrors a single Y plane via MirrorPlane. A negative height reads the
+// source rows bottom-up, which additionally flips the image vertically.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (width <= 0 || height == 0 || !src_y || !dst_y) {
+    return -1;
+  }
+  if (height < 0) {
+    // Flipped: start at the last source row and walk backwards.
+    const int rows = -height;
+    MirrorPlane(src_y + (rows - 1) * src_stride_y, -src_stride_y, dst_y,
+                dst_stride_y, width, rows);
+    return 0;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror I420 with optional flipping
+// Mirrors the Y plane at full size and the U and V planes at half width and
+// half height (both rounded up).
+//   dst_y may be NULL, in which case the Y plane is skipped; src_y must be
+//   non-NULL regardless.
+//   A negative height reads the source rows bottom-up, which additionally
+//   flips the image vertically.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int chroma_width;
+  int chroma_height;
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_u ||
+      !dst_v) {
+    return -1;
+  }
+  chroma_width = (width + 1) >> 1;
+  if (height > 0) {
+    chroma_height = (height + 1) >> 1;
+  } else {
+    // Negative height means invert the image.
+    height = -height;
+    chroma_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (chroma_height - 1) * src_stride_u;
+    src_v += (chroma_height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, chroma_width,
+              chroma_height);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, chroma_width,
+              chroma_height);
+  return 0;
+}
+
+// NV12 mirror.
+// Mirrors the Y plane with MirrorPlane and the interleaved UV plane with
+// MirrorUVPlane; the UV plane is half width and half height (rounded up).
+//   dst_y may be NULL, in which case the Y plane is skipped; src_y must be
+//   non-NULL regardless.
+//   A negative height reads the source rows bottom-up, which additionally
+//   flips the image vertically.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int chroma_width;
+  int chroma_height;
+  if (width <= 0 || height == 0 || !src_y || !src_uv || !dst_uv) {
+    return -1;
+  }
+  chroma_width = (width + 1) >> 1;
+  if (height > 0) {
+    chroma_height = (height + 1) >> 1;
+  } else {
+    // Negative height means invert the image.
+    height = -height;
+    chroma_height = (height + 1) >> 1;
+    src_y += (height - 1) * src_stride_y;
+    src_uv += (chroma_height - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, chroma_width,
+                chroma_height);
+  return 0;
+}
+
+// ARGB mirror.
+// Passes every source row to an ARGBMirrorRow function that writes the
+// matching destination row.
+// A negative height reads the source rows bottom-up, which additionally
+// flips the image vertically.
+// Returns 0 on success, -1 when a pointer is NULL or the size is empty.
+LIBYUV_API
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int rows_left;
+  void (*MirrorRowFn)(const uint8_t* src, uint8_t* dst, int width) =
+      ARGBMirrorRow_C;
+  if (width <= 0 || height == 0 || !src_argb || !dst_argb) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Pick a row function. Later blocks override earlier ones.
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorRowFn =
+        IS_ALIGNED(width, 8) ? ARGBMirrorRow_NEON : ARGBMirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MirrorRowFn =
+        IS_ALIGNED(width, 4) ? ARGBMirrorRow_SSE2 : ARGBMirrorRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorRowFn =
+        IS_ALIGNED(width, 8) ? ARGBMirrorRow_AVX2 : ARGBMirrorRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    MirrorRowFn =
+        IS_ALIGNED(width, 2) ? ARGBMirrorRow_MMI : ARGBMirrorRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRowFn =
+        IS_ALIGNED(width, 16) ? ARGBMirrorRow_MSA : ARGBMirrorRow_Any_MSA;
+  }
+#endif
+
+  // Mirror plane
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    MirrorRowFn(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Mirror an RGB24 image horizontally, one row at a time.
+// A negative height makes the source be read bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  int rows_left;
+  void (*mirror_row)(const uint8_t* src, uint8_t* dst, int width) =
+      RGB24MirrorRow_C;
+  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_rgb24 += (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_RGB24MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    mirror_row =
+        IS_ALIGNED(width, 16) ? RGB24MirrorRow_NEON : RGB24MirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    mirror_row = IS_ALIGNED(width, 16) ? RGB24MirrorRow_SSSE3
+                                       : RGB24MirrorRow_Any_SSSE3;
+  }
+#endif
+
+  // Mirror every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    mirror_row(src_rgb24, dst_rgb24, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Return the ARGB blend row function selected for the current CPU.
+// The C implementation is the fallback. When SSSE3 is available it is
+// returned immediately; otherwise the last matching of NEON, MMI and MSA
+// wins.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*blend_row)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                    uint8_t* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    // SSSE3 short-circuits the remaining checks.
+    return ARGBBlendRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    blend_row = ARGBBlendRow_NEON;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    blend_row = ARGBBlendRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    blend_row = ARGBBlendRow_MSA;
+  }
+#endif
+  return blend_row;
+}
+
+// Alpha blend two ARGB images and store the result in dst_argb.
+// The row function comes from GetARGBBlend(). A negative height makes the
+// destination be written bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
+  int rows_left;
+  void (*blend_row)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                    uint8_t* dst_argb, int width) = GetARGBBlend();
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: write the destination from its last row upwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    blend_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Blend two 8-bit planes using a per-pixel alpha plane and store to dst_y.
+// A negative height makes the destination be written bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  int rows_left;
+  void (*blend_row)(const uint8_t* src0, const uint8_t* src1,
+                    const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: write the destination from its last row upwards.
+    height = -height;
+    dst_y += (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // When every plane is tightly packed, treat the image as one long row.
+  if (src_stride_y0 == width && src_stride_y1 == width &&
+      alpha_stride == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y0 = 0;
+    src_stride_y1 = 0;
+    alpha_stride = 0;
+    dst_stride_y = 0;
+  }
+
+  // Pick the row kernel after coalescing, since the choice depends on width.
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    blend_row =
+        IS_ALIGNED(width, 8) ? BlendPlaneRow_SSSE3 : BlendPlaneRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    blend_row =
+        IS_ALIGNED(width, 32) ? BlendPlaneRow_AVX2 : BlendPlaneRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    blend_row =
+        IS_ALIGNED(width, 8) ? BlendPlaneRow_MMI : BlendPlaneRow_Any_MMI;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    blend_row(src_y0, src_y1, alpha, dst_y, width);
+    src_y0 += src_stride_y0;
+    src_y1 += src_stride_y1;
+    alpha += alpha_stride;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+#define MAXTWIDTH 2048
+// Alpha blend two I420 images and store the result to the destination.
+// The Y plane is blended with the full-resolution alpha plane via
+// BlendPlane. For U and V, each pair of alpha rows is box-filtered down to
+// half width into a temporary row, which is then used to blend one chroma
+// row.
+// A negative height writes the destination bottom-up; this applies to all
+// three destination planes so luma and chroma stay in step.
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary alpha
+// row could not be allocated.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
+  int y;
+  // Half width for UV, rounded up.
+  int halfwidth = (width + 1) >> 1;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    int halfheight;
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+    // Flip the chroma planes as well; inverting only luma would leave the
+    // output chroma vertically out of step with the output luma.
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
+
+  // Blend Y plane. height is positive here; any inversion is already folded
+  // into dst_y / dst_stride_y.
+  BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+             dst_y, dst_stride_y, width, height);
+
+  // Chroma blend kernel; alignment is tested against the half width.
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    BlendPlaneRow = BlendPlaneRow_Any_MMI;
+    if (IS_ALIGNED(halfwidth, 8)) {
+      BlendPlaneRow = BlendPlaneRow_MMI;
+    }
+  }
+#endif
+  // Alpha downscale kernel: odd widths need the "Odd" variants.
+  if (!IS_ALIGNED(width, 2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
+  }
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
+      if (IS_ALIGNED(halfwidth, 16)) {
+        ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+      if (IS_ALIGNED(halfwidth, 32)) {
+        ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
+    if (IS_ALIGNED(width, 2)) {
+      ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
+      if (IS_ALIGNED(halfwidth, 8)) {
+        ScaleRowDown2 = ScaleRowDown2Box_MMI;
+      }
+    }
+  }
+#endif
+
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, halfwidth);
+  // NOTE(review): assumes align_buffer_64 leaves halfalpha NULL when the
+  // underlying allocation fails - confirm against the macro definition.
+  if (!halfalpha) {
+    return 1;
+  }
+  for (y = 0; y < height; y += 2) {
+    // last row of odd height image use 1 row of alpha instead of 2.
+    if (y == (height - 1)) {
+      alpha_stride = 0;
+    }
+    // Subsample 2 rows of alpha to half width and half height.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
+  }
+  free_aligned_buffer_64(halfalpha);
+  return 0;
+}
+
+// Multiply two ARGB images and store the result in dst_argb.
+// A negative height makes the destination be written bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
+  int rows_left;
+  void (*multiply_row)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
+                       int width) = ARGBMultiplyRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: write the destination from its last row upwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    multiply_row = IS_ALIGNED(width, 4) ? ARGBMultiplyRow_SSE2
+                                        : ARGBMultiplyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    multiply_row = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_AVX2
+                                        : ARGBMultiplyRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    multiply_row = IS_ALIGNED(width, 8) ? ARGBMultiplyRow_NEON
+                                        : ARGBMultiplyRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    multiply_row =
+        IS_ALIGNED(width, 2) ? ARGBMultiplyRow_MMI : ARGBMultiplyRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    multiply_row =
+        IS_ALIGNED(width, 4) ? ARGBMultiplyRow_MSA : ARGBMultiplyRow_Any_MSA;
+  }
+#endif
+
+  // Multiply every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    multiply_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Add two ARGB images and store the result in dst_argb.
+// A negative height makes the destination be written bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height) {
+  int rows_left;
+  void (*add_row)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
+                  int width) = ARGBAddRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: write the destination from its last row upwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // With MSVC (non-clang) the SSE2 kernel is used for any width.
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    add_row = ARGBAddRow_SSE2;
+  }
+#endif
+  // Elsewhere the "Any" variant handles widths that are not a multiple of 4.
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    add_row = IS_ALIGNED(width, 4) ? ARGBAddRow_SSE2 : ARGBAddRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    add_row = IS_ALIGNED(width, 8) ? ARGBAddRow_AVX2 : ARGBAddRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    add_row = IS_ALIGNED(width, 8) ? ARGBAddRow_NEON : ARGBAddRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    add_row = IS_ALIGNED(width, 2) ? ARGBAddRow_MMI : ARGBAddRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    add_row = IS_ALIGNED(width, 8) ? ARGBAddRow_MSA : ARGBAddRow_Any_MSA;
+  }
+#endif
+
+  // Add every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    add_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Subtract two ARGB images and store the result in dst_argb.
+// A negative height makes the destination be written bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
+  int rows_left;
+  void (*subtract_row)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
+                       int width) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: write the destination from its last row upwards.
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    subtract_row = IS_ALIGNED(width, 4) ? ARGBSubtractRow_SSE2
+                                        : ARGBSubtractRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    subtract_row = IS_ALIGNED(width, 8) ? ARGBSubtractRow_AVX2
+                                        : ARGBSubtractRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    subtract_row = IS_ALIGNED(width, 8) ? ARGBSubtractRow_NEON
+                                        : ARGBSubtractRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    subtract_row =
+        IS_ALIGNED(width, 2) ? ARGBSubtractRow_MMI : ARGBSubtractRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    subtract_row =
+        IS_ALIGNED(width, 8) ? ARGBSubtractRow_MSA : ARGBSubtractRow_Any_MSA;
+  }
+#endif
+
+  // Subtract every row.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    subtract_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert a RAW image to RGB24, row by row (3 bytes per pixel on both sides).
+// A negative height makes the source be read bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height) {
+  int rows_left;
+  void (*convert_row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
+      RAWToRGB24Row_C;
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_raw += (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_raw = 0;
+    dst_stride_rgb24 = 0;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RAWToRGB24Row_SSSE3 : RAWToRGB24Row_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    convert_row =
+        IS_ALIGNED(width, 8) ? RAWToRGB24Row_NEON : RAWToRGB24Row_Any_NEON;
+  }
+#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    convert_row =
+        IS_ALIGNED(width, 4) ? RAWToRGB24Row_MMI : RAWToRGB24Row_Any_MMI;
+  }
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    convert_row =
+        IS_ALIGNED(width, 16) ? RAWToRGB24Row_MSA : RAWToRGB24Row_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    convert_row(src_raw, dst_rgb24, width);
+    src_raw += src_stride_raw;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Fill a width x |height| region of an 8-bit plane with a constant value.
+// value is narrowed to uint8_t when handed to the row function.
+// A negative height fills the same rows, walking from the last one upwards.
+// Does nothing when width <= 0 or height == 0.
+LIBYUV_API
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value) {
+  int y;
+  void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
+  // Reject empty regions up front: a non-positive width must never reach the
+  // optimized row kernels, and there is nothing to fill for height == 0.
+  if (width <= 0 || height == 0) {
+    return;
+  }
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
+  // Pick the row kernel; later blocks override earlier ones.
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SetRow = SetRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SetRow = SetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    SetRow = SetRow_Any_X86;
+    if (IS_ALIGNED(width, 4)) {
+      SetRow = SetRow_X86;
+    }
+  }
+#endif
+#if defined(HAS_SETROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    SetRow = SetRow_ERMS;
+  }
+#endif
+#if defined(HAS_SETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+    SetRow = SetRow_MSA;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    SetRow(dst_y, value, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draw a filled rectangle into an I420 image.
+// (x, y) is the top-left luma sample and width x |height| the luma size.
+// The chroma rectangle starts at (x / 2, y / 2) and covers half the luma
+// size, rounded up. value_y, value_u and value_v must each be in 0..255.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v) {
+  int halfwidth;
+  int halfheight;
+  uint8_t* start_y;
+  uint8_t* start_u;
+  uint8_t* start_v;
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // A constant fill touches the same rows whichever direction they are
+  // walked in, so use the magnitude of height. This keeps the chroma row
+  // count rounding up: (height + 1) >> 1 on a negative odd height would
+  // come out one row short.
+  if (height < 0) {
+    height = -height;
+  }
+  halfwidth = (width + 1) >> 1;
+  halfheight = (height + 1) >> 1;
+  // Compute start pointers only after the arguments are known to be valid.
+  start_y = dst_y + y * dst_stride_y + x;
+  start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  SetPlane(start_y, dst_stride_y, width, height, value_y);
+  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
+
+// Draw a filled rectangle into an ARGB image.
+// (dst_x, dst_y) is the top-left pixel and width x |height| the size; every
+// pixel is set to the 32-bit value.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value) {
+  int y;
+  void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+      ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Locate the rectangle's first row using the caller's stride. Applying the
+  // dst_y offset after negating the stride would walk away from the
+  // rectangle, and for dst_y > 0 could write before the start of the buffer.
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // A constant fill covers the same rows for either sign of height, so only
+  // the magnitude matters.
+  if (height < 0) {
+    height = -height;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+  // Pick the row kernel; later blocks override earlier ones.
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+#if defined(HAS_ARGBSETROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ARGBSetRow = ARGBSetRow_Any_MMI;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+
+// Run the ARGB attenuate row function over every row of the image.
+// A negative height makes the source be read bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
+  int rows_left;
+  void (*attenuate_row)(const uint8_t* src_argb, uint8_t* dst_argb,
+                        int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    attenuate_row = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                         : ARGBAttenuateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    attenuate_row = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                         : ARGBAttenuateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    attenuate_row = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                         : ARGBAttenuateRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    attenuate_row = IS_ALIGNED(width, 2) ? ARGBAttenuateRow_MMI
+                                         : ARGBAttenuateRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    attenuate_row = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_MSA
+                                         : ARGBAttenuateRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    attenuate_row(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+// A negative height makes the source be read bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
+  int rows_left;
+  void (*unattenuate_row)(const uint8_t* src_argb, uint8_t* dst_argb,
+                          int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+  // Pick the row kernel; the "Any" variant handles unaligned widths.
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    unattenuate_row = IS_ALIGNED(width, 4) ? ARGBUnattenuateRow_SSE2
+                                           : ARGBUnattenuateRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    unattenuate_row = IS_ALIGNED(width, 8) ? ARGBUnattenuateRow_AVX2
+                                           : ARGBUnattenuateRow_Any_AVX2;
+  }
+#endif
+  // TODO(fbarchard): Neon version.
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    unattenuate_row(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert an ARGB image to gray ARGB, writing the result to dst_argb.
+// A negative height makes the source be read bottom-up.
+// SIMD kernels are only selected when width is suitably aligned; otherwise
+// the C row function is used.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  int rows_left;
+  void (*gray_row)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (IS_ALIGNED(width, 2)) {
+      gray_row = ARGBGrayRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_MSA;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    gray_row(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert a rectangle of an ARGB image to gray scale in place.
+// (dst_x, dst_y) is the top-left pixel of the rectangle. Unlike the
+// two-buffer variants, height must be positive.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height) {
+  int rows_left;
+  uint8_t* row_ptr;
+  void (*gray_row)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // First pixel of the rectangle (4 bytes per pixel).
+  row_ptr = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // When the rectangle spans full rows, treat it as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (IS_ALIGNED(width, 2)) {
+      gray_row = ARGBGrayRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (IS_ALIGNED(width, 8)) {
+      gray_row = ARGBGrayRow_MSA;
+    }
+  }
+#endif
+
+  // Source and destination are the same row: in-place conversion.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    gray_row(row_ptr, row_ptr, width);
+    row_ptr += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a sepia tone to a rectangle of an ARGB image in place.
+// (dst_x, dst_y) is the top-left pixel of the rectangle; height must be
+// positive.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height) {
+  int rows_left;
+  uint8_t* row_ptr;
+  void (*sepia_row)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // First pixel of the rectangle (4 bytes per pixel).
+  row_ptr = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // When the rectangle spans full rows, treat it as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      sepia_row = ARGBSepiaRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      sepia_row = ARGBSepiaRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (IS_ALIGNED(width, 2)) {
+      sepia_row = ARGBSepiaRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (IS_ALIGNED(width, 8)) {
+      sepia_row = ARGBSepiaRow_MSA;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    sepia_row(row_ptr, width);
+    row_ptr += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+// A negative height makes the source be read bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height) {
+  int rows_left;
+  void (*matrix_row)(const uint8_t* src_argb, uint8_t* dst_argb,
+                     const int8_t* matrix_argb, int width) =
+      ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    // Negative height: start at the last source row and walk upwards.
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // When both images are tightly packed, treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+  // SIMD kernels are only used when width is suitably aligned.
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    if (IS_ALIGNED(width, 8)) {
+      matrix_row = ARGBColorMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      matrix_row = ARGBColorMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (IS_ALIGNED(width, 2)) {
+      matrix_row = ARGBColorMatrixRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (IS_ALIGNED(width, 8)) {
+      matrix_row = ARGBColorMatrixRow_MSA;
+    }
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    matrix_row(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel of a rectangle, in place.
+// Deprecated.
+// The 12 input coefficients are halved and extended with a fourth row of
+// {0, 0, 0, 64}, then the work is delegated to ARGBColorMatrix with the
+// rectangle as both source and destination.
+// Returns -1 on invalid arguments, otherwise ARGBColorMatrix's result.
+LIBYUV_API
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  SIMD_ALIGNED(int8_t matrix_argb[16]);
+  uint8_t* rect;
+  int i;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
+    return -1;
+  }
+  // First pixel of the rectangle (4 bytes per pixel).
+  rect = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  for (i = 0; i < 12; ++i) {
+    matrix_argb[i] = matrix_rgb[i] / 2;
+  }
+  matrix_argb[12] = 0;
+  matrix_argb[13] = 0;
+  matrix_argb[14] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8_t*)(rect), dst_stride_argb, rect,
+                         dst_stride_argb, &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values.
+// (dst_x, dst_y) is the top-left pixel of the rectangle; height must be
+// positive.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  int rows_left;
+  uint8_t* row_ptr;
+  void (*table_row)(uint8_t * dst_argb, const uint8_t* table_argb,
+                    int width) = ARGBColorTableRow_C;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
+    return -1;
+  }
+  // First pixel of the rectangle (4 bytes per pixel).
+  row_ptr = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // When the rectangle spans full rows, treat it as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    table_row = ARGBColorTableRow_X86;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    table_row(row_ptr, table_argb, width);
+    row_ptr += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel of a destination rectangle, in
+// place, but preserve destination alpha.
+// Table contains 256 ARGB values.
+// Returns -1 on a null pointer, a non-positive size or a negative offset.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
+  void (*TableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) =
+      RGBColorTableRow_C;
+  // Top-left pixel of the rectangle; 4 bytes per ARGB pixel.
+  uint8_t* row_ptr = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  int rows_left;
+  if (!dst_argb || !table_argb) {
+    return -1;
+  }
+  if (width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Rows are contiguous: treat the whole rectangle as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    TableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    TableRow(row_ptr, table_argb, width);
+    row_ptr += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// The low level row functions take 3 parameters so they can also serve other
+// high level operations:
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a multiply by a fixed point reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+// Operates in place on a rectangle of dst_argb. Returns -1 on a null pointer,
+// a non-positive size, a negative offset or an interval_size outside 1..255.
+LIBYUV_API
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height) {
+  void (*QuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
+                      int interval_offset, int width) = ARGBQuantizeRow_C;
+  // Top-left pixel of the rectangle; 4 bytes per ARGB pixel.
+  uint8_t* row_ptr = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  int rows_left;
+  if (!dst_argb || width <= 0 || height <= 0) {
+    return -1;
+  }
+  if (dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  if (interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // Rows are contiguous: treat the whole rectangle as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    QuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    QuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    QuantizeRow = ARGBQuantizeRow_MSA;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    QuantizeRow(row_ptr, scale, interval_size, interval_offset, width);
+    row_ptr += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+// dst_stride32_cumsum is applied to an int32_t pointer, so it is counted in
+// int32_t elements. Returns -1 on a null pointer or a non-positive size.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height) {
+  void (*CumSumRow)(const uint8_t* row, int32_t* cumsum,
+                    const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  // Row of sums directly above the one being written.
+  const int32_t* row_above = dst_cumsum;
+  int rows_left;
+  if (!dst_cumsum || !src_argb) {
+    return -1;
+  }
+  if (width <= 0 || height <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CumSumRow = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CumSumRow = ComputeCumulativeSumRow_MMI;
+  }
+#endif
+
+  // The first output row uses itself as the "previous" row, so it must start
+  // out as zeros.
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    CumSumRow(src_argb, dst_cumsum, row_above, width);
+    row_above = dst_cumsum;
+    dst_cumsum += dst_stride32_cumsum;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Blur ARGB image.
+// Caller should allocate CumulativeSum table of width * height * 16 bytes
+// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
+// as the buffer is treated as circular.
+// A negative height reads the source bottom-up. radius is clamped to height
+// and to width / 2 - 1.
+// Returns -1 on a null pointer (including dst_cumsum), a non-positive width,
+// a zero height, or a radius that is not positive after clamping; 0 otherwise.
+LIBYUV_API
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(
+      const int32_t* topleft, const int32_t* botleft, int width, int area,
+      uint8_t* dst, int count) = CumulativeSumToAverageRow_C;
+  int32_t* cumsum_bot_row;
+  int32_t* max_cumsum_bot_row;
+  int32_t* cumsum_top_row;
+
+  // dst_cumsum is validated here as well: the table is indexed directly
+  // below, and the status of ARGBComputeCumulativeSum is not inspected.
+  if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means read the source from its last row upwards.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (radius > height) {
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+  }
+#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+  }
+#endif
+  // Compute enough CumulativeSum for first row to be blurred. After this
+  // one row of CumulativeSum is updated at a time.
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+                           dst_stride32_cumsum, width, radius);
+
+  src_argb = src_argb + radius * src_stride_argb;
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+  // One past the last row of the circular table (radius * 2 + 2 rows).
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
+
+  for (y = 0; y < height; ++y) {
+    // Vertical extent of the box for this output row, clipped to the image.
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;  // In int32_t units: 4 per pixel.
+    int x;
+    int n;
+
+    // Increment cumsum_top_row pointer with circular buffer wrap around.
+    if (top_y) {
+      cumsum_top_row += dst_stride32_cumsum;
+      if (cumsum_top_row >= max_cumsum_bot_row) {
+        cumsum_top_row = dst_cumsum;
+      }
+    }
+    // Increment cumsum_bot_row pointer with circular buffer wrap around and
+    // then fill in a row of CumulativeSum.
+    if ((y + radius) < height) {
+      const int32_t* prev_cumsum_bot_row = cumsum_bot_row;
+      cumsum_bot_row += dst_stride32_cumsum;
+      if (cumsum_bot_row >= max_cumsum_bot_row) {
+        cumsum_bot_row = dst_cumsum;
+      }
+      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+                              width);
+      src_argb += src_stride_argb;
+    }
+
+    // Left clipped: the box grows by one column per output pixel.
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                                &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped: full-width box, n pixels handled in one call.
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                              &dst_argb[x * 4], n);
+
+    // Right clipped: the box shrinks by one column per output pixel.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+                                area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+// A negative height reads the source bottom-up. Returns -1 on a null
+// pointer, a non-positive width, a zero height or a zero value.
+LIBYUV_API
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value) {
+  void (*ShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+                   uint32_t value) = ARGBShadeRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0 || value == 0u) {
+    return -1;
+  }
+  // Negative height: start at the last source row and walk upwards.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images are contiguous: treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ShadeRow = ARGBShadeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ShadeRow = ARGBShadeRow_NEON;
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+    ShadeRow = ARGBShadeRow_MMI;
+  }
+#endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+    ShadeRow = ARGBShadeRow_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ShadeRow(src_argb, dst_argb, width, value);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Interpolate 2 planes by specified amount (0 to 255).
+// A negative height writes the destination bottom-up. Returns -1 on a null
+// pointer, a non-positive width or a zero height.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation) {
+  void (*BlendRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                   ptrdiff_t src_stride, int dst_width,
+                   int source_y_fraction) = InterpolateRow_C;
+  int rows_left;
+  if (!src0 || !src1 || !dst) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image: here the destination is flipped.
+  if (height < 0) {
+    height = -height;
+    dst = dst + (height - 1) * dst_stride;
+    dst_stride = -dst_stride;
+  }
+  // All three planes are contiguous: treat them as one long row.
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+    width *= height;
+    height = 1;
+    src_stride0 = 0;
+    src_stride1 = 0;
+    dst_stride = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BlendRow =
+        IS_ALIGNED(width, 16) ? InterpolateRow_SSSE3 : InterpolateRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BlendRow =
+        IS_ALIGNED(width, 32) ? InterpolateRow_AVX2 : InterpolateRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BlendRow =
+        IS_ALIGNED(width, 16) ? InterpolateRow_NEON : InterpolateRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    BlendRow =
+        IS_ALIGNED(width, 8) ? InterpolateRow_MMI : InterpolateRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BlendRow =
+        IS_ALIGNED(width, 32) ? InterpolateRow_MSA : InterpolateRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // The row function takes the second source as an offset from the first.
+    BlendRow(dst, src0, src1 - src0, width, interpolation);
+    src0 += src_stride0;
+    src1 += src_stride1;
+    dst += dst_stride;
+  }
+  return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+// Implemented as a byte plane interpolation over rows of width * 4 bytes;
+// the result of InterpolatePlane is returned unchanged.
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation) {
+  // 4 bytes per ARGB pixel.
+  const int row_bytes = width * 4;
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
+                          row_bytes, height, interpolation);
+}
+
+// Interpolate 2 YUV images by specified amount (0 to 255).
+// The Y plane is processed at width x height; U and V are processed at half
+// size, rounded up. A negative height is forwarded to InterpolatePlane for
+// all three planes with its sign intact.
+// Returns -1 on a null pointer, a non-positive width or a zero height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight;
+  if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Halve the magnitude (rounding up) and keep the sign. A plain
+  // (height + 1) >> 1 yields 0 for height == -1 and -1 for height == -3,
+  // which would skip or truncate the chroma planes.
+  if (height < 0) {
+    halfheight = -((-height + 1) >> 1);
+  } else {
+    halfheight = (height + 1) >> 1;
+  }
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+                   dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+                   dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+                   dst_stride_v, halfwidth, halfheight, interpolation);
+  return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is passed through to the row function untouched.
+// A negative height reads the source bottom-up. Returns -1 on a null image
+// pointer, a non-positive width or a zero height.
+LIBYUV_API
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height) {
+  void (*ShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb,
+                     const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
+  int rows_left;
+  if (!src_bgra || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Both images are contiguous: treat them as one long row.
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_bgra = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ShuffleRow =
+        IS_ALIGNED(width, 8) ? ARGBShuffleRow_SSSE3 : ARGBShuffleRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ShuffleRow =
+        IS_ALIGNED(width, 16) ? ARGBShuffleRow_AVX2 : ARGBShuffleRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ShuffleRow =
+        IS_ALIGNED(width, 4) ? ARGBShuffleRow_NEON : ARGBShuffleRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    ShuffleRow =
+        IS_ALIGNED(width, 2) ? ARGBShuffleRow_MMI : ARGBShuffleRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ShuffleRow =
+        IS_ALIGNED(width, 8) ? ARGBShuffleRow_MSA : ARGBShuffleRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ShuffleRow(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Gauss blur a float plane using Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and interior is multiple of 4.
+// src_stride and dst_stride are added to float pointers, so they are counted
+// in floats. A negative height reads the source bottom-up.
+// Returns -1 on a null pointer, a non-positive width or a zero height;
+// 0 otherwise.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+ int src_stride,
+ float* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int y;
+ // Vertical pass: combines 5 source rows into one row of floats.
+ void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+ const float* src3, const float* src4, float* dst,
+ int width) = GaussCol_F32_C;
+ // Horizontal pass: filters the intermediate row into the destination.
+ void (*GaussRow_F32)(const float* src, float* dst, int width) =
+ GaussRow_F32_C;
+ if (!src || !dst || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussCol_F32 = GaussCol_F32_NEON;
+ }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ GaussRow_F32 = GaussRow_F32_NEON;
+ }
+#endif
+ {
+ // 2 pixels on each side, but aligned out to 16 bytes.
+ // Layout in floats: 4 of padding, width of data, 4 of padding.
+ align_buffer_64(rowbuf, (4 + width + 4) * 4);
+ // Zero both 16 byte paddings.
+ memset(rowbuf, 0, 16);
+ memset(rowbuf + (4 + width) * 4, 0, 16);
+ // row[0] is the first data float, just past the leading padding.
+ float* row = (float*)(rowbuf + 16);
+ // Sliding window of 5 source rows. The rows above the first one are
+ // clamped to it, so src0..src2 all start on the first row.
+ const float* src0 = src;
+ const float* src1 = src;
+ const float* src2 = src;
+ // The rows below only advance when the image is tall enough.
+ const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+ const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+ for (y = 0; y < height; ++y) {
+ GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+ // Extrude edge by 2 floats
+ row[-2] = row[-1] = row[0];
+ row[width + 1] = row[width] = row[width - 1];
+
+ GaussRow_F32(row - 2, dst, width);
+
+ // Slide the window down by one row.
+ src0 = src1;
+ src1 = src2;
+ src2 = src3;
+ src3 = src4;
+ // Stop advancing once src4 is on the last row (bottom edge clamp).
+ if ((y + 2) < (height - 1)) {
+ src4 += src_stride;
+ }
+ dst += dst_stride;
+ }
+ free_aligned_buffer_64(rowbuf);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+// Shared driver for the public Sobel entry points: converts each ARGB row
+// with ARGBToYJRow, runs SobelXRow and SobelYRow over a sliding window of 3
+// converted rows, and hands both results to the caller-supplied SobelRow,
+// which writes the destination row.
+// A negative height reads the source bottom-up. Returns -1 on a null
+// pointer, a non-positive width or a zero height; 0 otherwise.
+static int ARGBSobelize(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ void (*SobelRow)(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst,
+ int width)) {
+ int y;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
+ ARGBToYJRow_C;
+ void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+ uint8_t* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+ const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
+ SobelXRow_C;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ // Pick the ARGB to Y row converter for this CPU and width.
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_MSA;
+ }
+ }
+#endif
+
+ // Pick the Sobel Y and Sobel X row functions; these have no width
+ // alignment requirement here.
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelYRow = SobelYRow_MMI;
+ }
+#endif
+#if defined(HAS_SOBELYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelYRow = SobelYRow_MSA;
+ }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXRow = SobelXRow_MMI;
+ }
+#endif
+#if defined(HAS_SOBELXROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXRow = SobelXRow_MSA;
+ }
+#endif
+ {
+ // 3 rows with edges before/after.
+ // kRowSize is width + kEdge rounded up to a multiple of 32.
+ const int kRowSize = (width + kEdge + 31) & ~31;
+ // One allocation: Sobel X row, Sobel Y row, then the 3 Y rows with kEdge
+ // bytes of slack before and after.
+ align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ uint8_t* row_sobelx = rows;
+ uint8_t* row_sobely = rows + kRowSize;
+ uint8_t* row_y = rows + kRowSize * 2;
+
+ // Convert first row.
+ // The first source row fills both row_y0 and row_y1, so the row above
+ // the image is a copy of its first row.
+ uint8_t* row_y0 = row_y + kEdge;
+ uint8_t* row_y1 = row_y0 + kRowSize;
+ uint8_t* row_y2 = row_y1 + kRowSize;
+ ARGBToYJRow(src_argb, row_y0, width);
+ // Replicate the first pixel into the byte before the row.
+ row_y0[-1] = row_y0[0];
+ memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
+ ARGBToYJRow(src_argb, row_y1, width);
+ row_y1[-1] = row_y1[0];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
+
+ for (y = 0; y < height; ++y) {
+ // Convert next row of ARGB to G.
+ // On the last row the source pointer is not advanced, so the final row
+ // is converted again and serves as the row below the image.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToYJRow(src_argb, row_y2, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ // The row pointers are passed one byte early so the replicated left
+ // edge pixel is included.
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ {
+ uint8_t* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+ }
+
+ dst_argb += dst_stride_argb;
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+// Selects the row function that combines the Sobel X and Sobel Y rows into an
+// ARGB row, then delegates to ARGBSobelize and returns its result.
+LIBYUV_API
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
+  void (*CombineRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelRow_SSE2 : SobelRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CombineRow = IS_ALIGNED(width, 8) ? SobelRow_NEON : SobelRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_SOBELROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CombineRow = IS_ALIGNED(width, 8) ? SobelRow_MMI : SobelRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelRow_MSA : SobelRow_Any_MSA;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, CombineRow);
+}
+
+// Sobel ARGB effect with planar output.
+// Selects the row function that combines the Sobel X and Sobel Y rows into a
+// single plane row, then delegates to ARGBSobelize and returns its result.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height) {
+  void (*CombineRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelToPlaneRow_SSE2
+                                       : SobelToPlaneRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelToPlaneRow_NEON
+                                       : SobelToPlaneRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CombineRow = IS_ALIGNED(width, 8) ? SobelToPlaneRow_MMI
+                                      : SobelToPlaneRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    CombineRow = IS_ALIGNED(width, 32) ? SobelToPlaneRow_MSA
+                                       : SobelToPlaneRow_Any_MSA;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+                      height, CombineRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+// Selects the combining row function, then delegates to ARGBSobelize and
+// returns its result.
+LIBYUV_API
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
+  void (*CombineRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelXYRow_SSE2 : SobelXYRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    CombineRow = IS_ALIGNED(width, 8) ? SobelXYRow_NEON : SobelXYRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CombineRow = IS_ALIGNED(width, 8) ? SobelXYRow_MMI : SobelXYRow_Any_MMI;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    CombineRow = IS_ALIGNED(width, 16) ? SobelXYRow_MSA : SobelXYRow_Any_MSA;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, CombineRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+// poly is passed through to the row function untouched.
+// A negative height reads the source bottom-up. Returns -1 on a null
+// pointer, a non-positive width or a zero height.
+LIBYUV_API
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const float* poly,
+                   int width,
+                   int height) {
+  void (*PolyRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                  const float* poly, int width) = ARGBPolynomialRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || !poly) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images are contiguous: treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+    PolyRow = ARGBPolynomialRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  // The AVX2 row function is only used when FMA3 is available too.
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+      IS_ALIGNED(width, 2)) {
+    PolyRow = ARGBPolynomialRow_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    PolyRow(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+// Both strides are halved up front because they are applied to uint16_t
+// pointers. A negative height reads the source bottom-up.
+// Returns -1 on a null pointer, a non-positive width or a zero height.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height) {
+  void (*ConvertRow)(const uint16_t* src, uint16_t* dst, float scale,
+                     int width) = HalfFloatRow_C;
+  int rows_left;
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  src_stride_y >>= 1;
+  dst_stride_y >>= 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Both planes are contiguous: treat them as one long row.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = 0;
+    dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 8) ? HalfFloatRow_SSE2 : HalfFloatRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ConvertRow =
+        IS_ALIGNED(width, 16) ? HalfFloatRow_AVX2 : HalfFloatRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  // A scale of exactly 1.0 selects the HalfFloat1Row variants.
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    if (IS_ALIGNED(width, 16)) {
+      ConvertRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+    } else {
+      ConvertRow =
+          (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 8)) {
+      ConvertRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+    } else {
+      ConvertRow =
+          (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ConvertRow =
+        IS_ALIGNED(width, 32) ? HalfFloatRow_MSA : HalfFloatRow_Any_MSA;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ConvertRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+// Processes a single run of width elements. Returns -1 on a null pointer or
+// a non-positive width; 0 otherwise.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ToFloatRow)(const uint8_t* src, float* dst, float scale, int width) =
+      ByteToFloatRow_C;
+  if (!src_y || !dst_y) {
+    return -1;
+  }
+  if (width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ToFloatRow =
+        IS_ALIGNED(width, 8) ? ByteToFloatRow_NEON : ByteToFloatRow_Any_NEON;
+  }
+#endif
+
+  ToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+// The row function is always called with the fixed coefficient 0x00264b0f.
+// A negative height reads the source bottom-up. Returns -1 on a null
+// pointer, a non-positive width or a zero height.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height) {
+  void (*LumaRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+                  const uint8_t* luma, const uint32_t lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  const uint32_t kLumaCoeff = 0x00264b0f;
+  int rows_left;
+  if (!src_argb || !dst_argb || !luma) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images are contiguous: treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+    LumaRow = ARGBLumaColorTableRow_SSSE3;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    LumaRow(src_argb, dst_argb, width, luma, kLumaCoeff);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+// A negative height reads the source bottom-up. Returns -1 on a null
+// pointer, a non-positive width or a zero height.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
+  void (*CopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images are contiguous: treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyAlphaRow = IS_ALIGNED(width, 8) ? ARGBCopyAlphaRow_SSE2
+                                        : ARGBCopyAlphaRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyAlphaRow = IS_ALIGNED(width, 16) ? ARGBCopyAlphaRow_AVX2
+                                         : ARGBCopyAlphaRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    CopyAlphaRow = IS_ALIGNED(width, 2) ? ARGBCopyAlphaRow_MMI
+                                        : ARGBCopyAlphaRow_Any_MMI;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    CopyAlphaRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Extract just the alpha channel from ARGB.
+// Writes one byte per pixel into dst_a. A negative height reads the source
+// bottom-up. Returns -1 on a null pointer, a non-positive width or a zero
+// height.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height) {
+  void (*ExtractRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) =
+      ARGBExtractAlphaRow_C;
+  int rows_left;
+  if (!src_argb || !dst_a) {
+    return -1;
+  }
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both buffers are contiguous: treat them as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_a == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = 0;
+    dst_stride_a = 0;
+  }
+#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (IS_ALIGNED(width, 8)) {
+      ExtractRow = ARGBExtractAlphaRow_SSE2;
+    } else {
+      ExtractRow = ARGBExtractAlphaRow_Any_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    if (IS_ALIGNED(width, 32)) {
+      ExtractRow = ARGBExtractAlphaRow_AVX2;
+    } else {
+      ExtractRow = ARGBExtractAlphaRow_Any_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (IS_ALIGNED(width, 16)) {
+      ExtractRow = ARGBExtractAlphaRow_NEON;
+    } else {
+      ExtractRow = ARGBExtractAlphaRow_Any_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    if (IS_ALIGNED(width, 8)) {
+      ExtractRow = ARGBExtractAlphaRow_MMI;
+    } else {
+      ExtractRow = ARGBExtractAlphaRow_Any_MMI;
+    }
+  }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (IS_ALIGNED(width, 16)) {
+      ExtractRow = ARGBExtractAlphaRow_MSA;
+    } else {
+      ExtractRow = ARGBExtractAlphaRow_Any_MSA;
+    }
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ExtractRow(src_argb, dst_a, width);
+    src_argb += src_stride_argb;
+    dst_a += dst_stride_a;
+  }
+  return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
+ int width) = ARGBCopyYToAlphaRow_C; // Default row function; may be replaced below.
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y; // Start at the last source row.
+ src_stride_y = -src_stride_y; // Walk the source upwards.
+ }
+ // Coalesce rows: when neither plane has row padding the image is handled as one long row.
+ if (src_stride_y == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) { // Later checks override earlier ones.
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) { // Non-Any variant only when width passes the alignment check.
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) { // One call per (possibly coalesced) row.
+ ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// TODO(fbarchard): Consider that when width is even, the Y channel can be
+// split directly. A SplitUVRow_Odd function could copy the remaining chroma.
+
+LIBYUV_API // Convert YUY2 to a Y plane plus one UV row per two source rows. Returns 0, or -1 on invalid arguments.
+int YUY2ToNV12(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1; // Half of width, rounded up.
+ void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = SplitUVRow_C; // Used here to separate Y bytes from UV bytes.
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C; // Used to blend two UV rows.
+ if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; // Start at the last source row.
+ src_stride_yuy2 = -src_stride_yuy2; // Walk the source upwards.
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) { // Later checks override earlier ones.
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) { // Tested on width; the row call below passes awidth.
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2; // width rounded up to an even value.
+ // row of y and 2 rows of uv: [0, awidth) = Y, [awidth, 2*awidth) = UV row 0, [2*awidth, 3*awidth) = UV row 1.
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) { // Two source rows per iteration.
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); // Blend the two UV rows; 128 is presumably a 50/50 mix - confirm.
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) { // Odd height: the last row's UV is written without blending.
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, rows, dst_uv, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+LIBYUV_API // Convert UYVY to a Y plane plus one UV row per two source rows. Returns 0, or -1 on invalid arguments.
+int UYVYToNV12(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1; // Half of width, rounded up.
+ void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = SplitUVRow_C; // Used here to separate UV bytes from Y bytes.
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C; // Used to blend two UV rows.
+ if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; // Start at the last source row.
+ src_stride_uyvy = -src_stride_uyvy; // Walk the source upwards.
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) { // Later checks override earlier ones.
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) { // Tested on width; the row call below passes awidth.
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2; // width rounded up to an even value.
+ // row of y and 2 rows of uv: [0, awidth) = Y, [awidth, 2*awidth) = UV row 0, [2*awidth, 3*awidth) = UV row 1.
+ align_buffer_64(rows, awidth * 3);
+
+ for (y = 0; y < height - 1; y += 2) { // Two source rows per iteration.
+ // Split Y from UV. Output pointers are swapped relative to YUY2ToNV12.
+ SplitUVRow(src_uyvy, rows + awidth, rows, awidth);
+ memcpy(dst_y, rows, width);
+ SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth);
+ memcpy(dst_y + dst_stride_y, rows, width);
+ InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); // Blend the two UV rows; 128 is presumably a 50/50 mix - confirm.
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) { // Odd height: the last row's UV is written without blending.
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, dst_uv, rows, awidth);
+ memcpy(dst_y, rows, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// width and height are src size allowing odd size handling. No argument validation is done here.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C; // Default; may be replaced below.
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { // No _Any_ variant: SIMD only for aligned widths.
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+ for (y = 0; y < height - 1; y += 2) { // Two source rows produce one destination row.
+ // Merge a row of U and V into a row of UV.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) { // Odd height: stride 0 is passed, presumably pairing the last row with itself.
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate.cc b/third_party/aom/third_party/libyuv/source/rotate.cc
new file mode 100644
index 0000000000..32904e4731
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,609 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API // Transpose a plane: tiles of 8 (or 16 with MSA) source rows, then a C fallback for leftover rows.
+void TransposePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int i = height; // Source rows still to be transposed.
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx16_C;
+#else
+ void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+ int dst_stride, int width) = TransposeWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeWx16 = TransposeWx16_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_MSA;
+ }
+ }
+#else
+#if defined(HAS_TRANSPOSEWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) { // Selected without a width check.
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) { // Selected without a width check.
+ TransposeWx8 = TransposeWx8_MMI;
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) { // Overrides the plain SSSE3 choice above when both are defined.
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
+
+#if defined(HAS_TRANSPOSEWX16_MSA)
+ // Work across the source in 16x16 tiles
+ while (i >= 16) {
+ TransposeWx16(src, src_stride, dst, dst_stride, width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) { // Leftover rows that do not fill a whole tile.
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
+LIBYUV_API // Rotate a plane by 90 degrees via TransposePlane. No argument validation is done here.
+void RotatePlane90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1); // First byte of the last source row.
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API // Rotate a plane by 270 degrees via TransposePlane. No argument validation is done here.
+void RotatePlane270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1); // Destination row index width - 1, used as the starting row.
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API // Rotate a plane by 180 degrees by mirroring rows while swapping top and bottom.
+void RotatePlane180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ const uint8_t* src_bot = src + src_stride * (height - 1); // Last source row.
+ uint8_t* dst_bot = dst + dst_stride * (height - 1); // Last destination row.
+ int half_height = (height + 1) >> 1; // Row pairs to process, rounded up.
+ int y;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) { // Later checks override earlier ones.
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MirrorRow = MirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) { // Selected without a width check.
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) { // Top and bottom pointers move towards the middle.
+ CopyRow(src, row, width); // Copy first row into buffer
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into last row
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API // Transpose an interleaved source into two separate destination planes (dst_a and dst_b).
+void TransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i = height; // Source rows still to be transposed.
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
+#else
+ void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#else
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) { // Selected without a width check.
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MMI;
+ }
+ }
+#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ // Work through the source in 16x16 tiles.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 16 columns.
+ dst_b += 16; // Move over 16 columns.
+ i -= 16;
+ }
+#else
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+#endif
+
+ if (i > 0) { // Leftover rows that do not fill a whole tile.
+ TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, i);
+ }
+}
+
+LIBYUV_API // Rotate by 90 degrees into two planes: TransposeUV with the source read bottom to top.
+void RotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ src += src_stride * (height - 1); // First byte of the last source row.
+ src_stride = -src_stride; // Walk the source upwards.
+
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
+}
+
+LIBYUV_API // Rotate by 270 degrees into two planes: TransposeUV with both destinations written bottom to top.
+void RotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ dst_a += dst_stride_a * (width - 1); // Destination row index width - 1, used as the starting row.
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a; // Walk both destinations upwards.
+ dst_stride_b = -dst_stride_b;
+
+ TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+ height);
+}
+
+// Rotate 180 is a horizontal and vertical flip. The source is also split into dst_a and dst_b.
+LIBYUV_API
+void RotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int i;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C; // Default; may be replaced below.
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { // No _Any_ variant: SIMD only for aligned widths.
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MMI;
+ }
+#endif
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1); // Start writing at the last destination row.
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) { // Source walks down while destinations walk up.
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+LIBYUV_API // Rotate a single plane by mode. Returns 0 on success, -1 on invalid arguments or unknown mode.
+int RotatePlane(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride; // Start at the last source row.
+ src_stride = -src_stride; // Walk the source upwards.
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride, dst, dst_stride, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1; // Unrecognized rotation mode.
+}
+
+LIBYUV_API // Rotate an I420 image plane by plane. Returns -1 on invalid arguments or unknown mode.
+int I420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1; // U/V plane width, rounded up.
+ int halfheight = (height + 1) >> 1; // U/V plane height; recomputed below if height is negative.
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1; // Recompute now that height is positive.
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1; // Unrecognized rotation mode.
+}
+
+LIBYUV_API // Rotate an I444 image; all three planes use the full width and height. Returns 0, or -1 on invalid arguments or unknown mode.
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y; // Start each plane at its last row.
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y; // Walk each plane upwards.
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) { // Unqualified enumerators, matching I420Rotate and NV12ToI420Rotate.
+ case kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1; // Unrecognized rotation mode.
+}
+
+LIBYUV_API // Convert NV12 to I420 with rotation. Returns -1 on invalid arguments or unknown mode.
+int NV12ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1; // Chroma width, rounded up.
+ int halfheight = (height + 1) >> 1; // Chroma height; recomputed below if height is negative.
+ if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+ !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1; // Recompute now that height is positive.
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+ width, height);
+ case kRotate90: // Y is rotated as a plane; the UV source is rotated and split into dst_u and dst_v.
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1; // Unrecognized rotation mode.
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_any.cc b/third_party/aom/third_party/libyuv/source/rotate_any.cc
new file mode 100644
index 0000000000..b3baf084d0
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_any.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, MASK) /* Defines NAMEANY: TPOS_SIMD on the aligned columns, TransposeWx8_C on the rest. */ \
+ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \
+ int dst_stride, int width) { \
+ int r = width & MASK; /* leftover columns */ \
+ int n = width - r; /* columns passed to TPOS_SIMD */ \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
+ } \
+ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+ }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_MMI
+TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) // NOTE(review): leftover columns go through TransposeWx8_C; confirm it covers all 16 rows.
+#endif
+#undef TANY
+
+#define TUVANY(NAMEANY, TPOS_SIMD, MASK) /* Defines NAMEANY: TPOS_SIMD on the aligned columns, TransposeUVWx8_C on the rest. */ \
+ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \
+ int width) { \
+ int r = width & MASK; /* leftover columns */ \
+ int n = width - r; /* columns passed to TPOS_SIMD */ \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+ } \
+ TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \
+ dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+ }
+
+#ifdef HAS_TRANSPOSEUVWX8_NEON
+TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_SSE2
+TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX8_MMI
+TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
+#endif
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) // NOTE(review): leftover columns go through TransposeUVWx8_C; confirm it covers all 16 rows.
+#endif
+#undef TUVANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_argb.cc b/third_party/aom/third_party/libyuv/source/rotate_argb.cc
new file mode 100644
index 0000000000..ae65388601
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_argb.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Transposes a width x height ARGB image: each source column is written out
+// as one destination row holding `height` pixels.
+// Returns 0 on success, or -1 when src_stride_argb is not a multiple of 4.
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int column = 0;
+ // Source stride expressed in 4-byte units; passed to the row function as
+ // its step argument.
+ int src_pixel_step = src_stride_argb >> 2;
+ void (*ScaleARGBRowDownEven)(
+ const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+ uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Reject strides that are not a whole number of 4-byte pixels.
+ if ((src_stride_argb & 3) != 0) {
+ return -1;
+ }
+ // `height` is the width of every destination row, so it is what selects
+ // between the exact kernel and the any-width wrapper below.
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = IS_ALIGNED(height, 4)
+ ? ScaleARGBRowDownEven_SSE2
+ : ScaleARGBRowDownEven_Any_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = IS_ALIGNED(height, 4)
+ ? ScaleARGBRowDownEven_NEON
+ : ScaleARGBRowDownEven_Any_NEON;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = IS_ALIGNED(height, 4)
+ ? ScaleARGBRowDownEven_MMI
+ : ScaleARGBRowDownEven_Any_MMI;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = IS_ALIGNED(height, 4)
+ ? ScaleARGBRowDownEven_MSA
+ : ScaleARGBRowDownEven_Any_MSA;
+ }
+#endif
+
+ // One source column per pass: step the source one pixel (4 bytes) to the
+ // right and the destination one row down.
+ while (column < width) {
+ ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+ dst_argb += dst_stride_argb;
+ src_argb += 4;
+ ++column;
+ }
+ return 0;
+}
+
+// Rotates by 90 degrees. This is an ARGBTranspose whose source is walked from
+// the bottom row upwards. Returns whatever ARGBTranspose returns.
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ // Begin at the last source row and negate the stride so that the transpose
+ // visits the rows in reverse order.
+ const uint8_t* src_bottom = src_argb + src_stride_argb * (height - 1);
+ const int reversed_stride = -src_stride_argb;
+ return ARGBTranspose(src_bottom, reversed_stride, dst_argb, dst_stride_argb,
+ width, height);
+}
+
+// Rotates by 270 degrees. This is an ARGBTranspose whose destination is
+// filled from the bottom row upwards. Returns whatever ARGBTranspose returns.
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ // The transposed image has `width` rows; begin at the last of them and
+ // negate the stride so that rows are written in reverse order.
+ uint8_t* dst_bottom = dst_argb + dst_stride_argb * (width - 1);
+ const int reversed_stride = -dst_stride_argb;
+ return ARGBTranspose(src_argb, src_stride_argb, dst_bottom, reversed_stride,
+ width, height);
+}
+
+// Rotates an ARGB image by 180 degrees.
+// Rows are processed in pairs from the outside in: the top row is mirrored
+// into a temporary buffer, the bottom row is mirrored into the top
+// destination row, and the buffered row is copied into the bottom destination
+// row.
+// Returns 0 on success, or -1 if the temporary row could not be allocated.
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 4);
+ const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+ uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+ ARGBMirrorRow_C;
+ void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+ CopyRow_C;
+ // Do not let the row functions write through a failed allocation.
+ // NOTE(review): relies on align_buffer_64 leaving `row` null when the
+ // underlying allocation fails - confirm against its definition in row.h.
+ if (!row) {
+ return -1;
+ }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_MSA;
+ }
+ }
+#endif
+ // CopyRow is given a byte count (width * 4), so its alignment tests are on
+ // bytes rather than pixels.
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ src_bot -= src_stride_argb;
+ dst_bot -= dst_stride_argb;
+ }
+ free_aligned_buffer_64(row);
+ return 0;
+}
+
+// Rotates an ARGB image by the angle selected with `mode`.
+// A negative height requests a vertically inverted source: the source is then
+// read from its last row upwards.
+// Returns -1 for a null src/dst pointer, a non-positive width, a zero height
+// or an unrecognised mode; otherwise returns the result of the selected
+// copy/rotate routine.
+LIBYUV_API
+int ARGBRotate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ if (mode == kRotate0) {
+ // No rotation: plain frame copy.
+ return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
+ }
+ if (mode == kRotate90) {
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
+ }
+ if (mode == kRotate270) {
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
+ }
+ if (mode == kRotate180) {
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
+ }
+ // Unknown rotation mode.
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_common.cc b/third_party/aom/third_party/libyuv/source/rotate_common.cc
new file mode 100644
index 0000000000..ff212adebc
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_common.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Portable transpose of an 8-row strip: for each of the `width` source
+// columns, the 8 bytes of that column are written as 8 consecutive bytes of
+// one destination row.
+void TransposeWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int column = 0;
+ while (column < width) {
+ int line;
+ // Gather one source column, top to bottom, into the current dst row.
+ for (line = 0; line < 8; ++line) {
+ dst[line] = src[line * src_stride];
+ }
+ src += 1; // next source column
+ dst += dst_stride; // next destination row
+ ++column;
+ }
+}
+
+// Portable transpose of an 8-row strip of interleaved byte pairs. For each of
+// the `width` pairs in a source row, the first byte of the pair goes to dst_a
+// and the second to dst_b; each source column of pairs becomes one 8-byte row
+// in both destinations.
+void TransposeUVWx8_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int pair = 0;
+ while (pair < width) {
+ int line;
+ // Walk down the 8 source rows, splitting the pair at this column.
+ for (line = 0; line < 8; ++line) {
+ dst_a[line] = src[line * src_stride + 0];
+ dst_b[line] = src[line * src_stride + 1];
+ }
+ src += 2; // next interleaved pair
+ dst_a += dst_stride_a; // next row of each destination plane
+ dst_b += dst_stride_b;
+ ++pair;
+ }
+}
+
+// Portable transpose of a width x height byte plane: the byte at source
+// (row y, column x) is stored at destination (row x, column y).
+void TransposeWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height) {
+ int x = 0;
+ while (x < width) {
+ int y = 0;
+ while (y < height) {
+ const int src_index = y * src_stride + x;
+ const int dst_index = x * dst_stride + y;
+ dst[dst_index] = src[src_index];
+ ++y;
+ }
+ ++x;
+ }
+}
+
+// Portable transpose of a width x height plane of interleaved byte pairs.
+// For the pair at source (row y, pair x), the first byte is stored in dst_a
+// and the second in dst_b, both at (row x, column y).
+void TransposeUVWxH_C(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ const int pair_offset = x * 2; // byte offset of this pair within a row
+ int y;
+ for (y = 0; y < height; ++y) {
+ const int src_index = pair_offset + (y * src_stride);
+ dst_a[y + (x * dst_stride_a)] = src[src_index];
+ dst_b[y + (x * dst_stride_b)] = src[src_index + 1];
+ }
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_gcc.cc b/third_party/aom/third_party/libyuv/source/rotate_gcc.cc
new file mode 100644
index 0000000000..fd359d4ae6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+// Transposes 8 source rows into destination rows of 8 bytes each, one 8x8
+// byte tile (8 source columns) per loop iteration.
+// Asm operands: %0 = src, %1 = dst, %2 = width (reduced by 8 per iteration),
+// %3 = src_stride, %4 = dst_stride.
+// NOTE(review): the width test sits at the bottom of the loop, so one tile is
+// always processed; width is presumably a positive multiple of 8 - confirm
+// against callers.
+void TransposeWx8_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ // src is now 8 rows down. The stride in %3 is negated so the lea can
+ // step back 8 rows and forward 8 columns; the second neg restores it.
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // The flags from this sub feed the jg below; the movq store and lea in
+ // between leave the flags untouched.
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
+
+// Transpose 16x8. 64 bit
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+// Transposes 8 source rows into destination rows of 8 bytes each, handling
+// 16 source columns (16 destination rows) per loop iteration. The low 8
+// columns are kept in xmm0-xmm7 and the high 8 columns in xmm8-xmm15.
+// Asm operands: %0 = src, %1 = dst, %2 = width (reduced by 16 per iteration),
+// %3 = src_stride, %4 = dst_stride.
+// NOTE(review): the width test sits at the bottom of the loop, so one 16-wide
+// tile is always processed; width is presumably a positive multiple of 16 -
+// confirm against callers.
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ // src is now 8 rows down. The stride in %3 is negated so the lea can
+ // step back 8 rows and forward 16 columns; the second neg restores it.
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ // Destination rows 8..15 come from the high-column registers.
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // The flags from this sub feed the jg below; the movq store and lea in
+ // between leave the flags untouched.
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "xmm15");
+}
+#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+
+// Transpose UV 8x8. 64 bit.
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+// Transposes 8 source rows of interleaved byte pairs and splits them into two
+// planes. Each iteration loads 16 bytes (8 pairs) from each of the 8 rows and
+// writes 8 rows of 8 bytes to dst_a (low half of each result register, via
+// movlpd) and 8 rows of 8 bytes to dst_b (high half, via movhpd).
+// Asm operands: %0 = src, %1 = dst_a, %2 = dst_b, %3 = width (reduced by 8
+// per iteration), %4 = src_stride, %5 = dst_stride_a, %6 = dst_stride_b.
+// NOTE(review): the width test sits at the bottom of the loop, so one tile is
+// always processed; width is presumably a positive multiple of 8 - confirm
+// against callers.
+void TransposeUVWx8_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ asm volatile(
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ // src is now 8 rows down. The stride in %4 is negated so the lea can
+ // step back 8 rows and forward 16 bytes; the second neg restores it.
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ // The flags from this sub feed the jg below; the stores and lea
+ // instructions in between leave the flags untouched.
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7", "xmm8", "xmm9");
+}
+#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_mips.cc b/third_party/aom/third_party/libyuv/source/rotate_mips.cc
new file mode 100644
index 0000000000..efe6bd909e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+// Transposes 8 source rows into `width` destination rows of 8 bytes each,
+// one source column per loop iteration.
+// Loop 1 uses word stores and runs when dst and dst_stride are both multiples
+// of 4; loop 11 uses swr/swl pairs otherwise.
+// $t2..$t7 hold src_stride * 2..7, i.e. the byte offsets of rows 2..7.
+// NOTE(review): both loops test width only after decrementing it, so width is
+// presumably required to be >= 1 - confirm against callers.
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+ // Explicit delay-slot nop: under .set noreorder the branch would otherwise
+ // execute the first load of the unaligned loop below.
+ " nop \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n" // branch delay slot
+ "2: \n"
+ ".set pop \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ // "memory": the asm reads through src and stores through dst, which are
+ // passed only as pointer values, so the compiler must not cache or reorder
+ // memory accesses across this statement.
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "memory"
+ );
+}
+
+// Transposes 8 source rows into destination rows of 8 bytes each, four source
+// columns (four destination rows) per loop iteration. $AT holds width >> 2 as
+// the iteration counter, which is why the assembler's use of $at is disabled.
+// Returns immediately when width is 0.
+// Loop 1 uses word stores and runs when dst and dst_stride are both multiples
+// of 4; loop 11 uses swr/swl pairs otherwise.
+// NOTE(review): the counter is tested only after being decremented, so a
+// non-zero width below 4 is presumably never passed - confirm against callers.
+void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+ // Explicit delay-slot nop: under .set noreorder the branch would otherwise
+ // execute the first word load of the unaligned loop below.
+ " nop \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ // "memory": the asm reads through src and stores through dst, which are
+ // passed only as pointer values, so the compiler must not cache or reorder
+ // memory accesses across this statement.
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "memory"
+ );
+}
+
+// Transposes 8 source rows of interleaved byte pairs and splits them into two
+// planes: the first byte of each pair goes to dst_a, the second to dst_b.
+// Each iteration consumes 4 source bytes (2 pairs) from every row and writes
+// two 8-byte rows to each destination; $t1 holds width >> 1 as the counter.
+// Returns immediately when width is 0.
+// Loop 1 uses word stores and runs when dst_a, dst_b and both destination
+// strides are multiples of 4; loop 11 uses swr/swl pairs otherwise.
+// NOTE(review): the counter is tested only after being decremented, so a
+// width of 1 is presumably never passed - confirm against callers.
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B7|A7|B6|A6|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B7|A7|B6|A6|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r" (src),
+ [dst_a] "+r" (dst_a),
+ [dst_b] "+r" (dst_b),
+ [width] "+r" (width),
+ [src_stride] "+r" (src_stride)
+ : [dst_stride_a] "r" (dst_stride_a),
+ [dst_stride_b] "r" (dst_stride_b)
+ // "memory": the asm reads through src and stores through dst_a/dst_b, which
+ // are passed only as pointer values, so the compiler must not cache or
+ // reorder memory accesses across this statement.
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6", "memory"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_neon.cc b/third_party/aom/third_party/libyuv/source/rotate_neon.cc
new file mode 100644
index 0000000000..844df2bf30
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_neon.cc
@@ -0,0 +1,418 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15}; // vtbl byte indices that transpose a 4x4 byte matrix
+
+void TransposeWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src_temp;
+ asm volatile(
+ // Transposes 8 source rows of `width` bytes into `width` dst rows of 8
+ // bytes. The main loop handles 8 columns per pass; the counter starts at
+ // w-8 so it drops below 0 once fewer than 8 columns remain.
+ "sub %5, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %5, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 columns are left to transpose
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ "cmp %5, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ "vld1.32 {d3[1]}, [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "vld1.8 {q3}, [%6] \n" // q3 (d6/d7) = kVTbl4x4Transpose shuffle indices
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ "vst1.32 {d5[1]}, [%0] \n"
+
+ "add %0, %3, #4 \n"
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ "vst1.32 {d1[1]}, [%0] \n"
+
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ "vld1.16 {d1[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d1}, [%0] \n"
+
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ "vld1.8 {d0[7]}, [%1] \n"
+
+ "vst1.64 {d0}, [%3] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15}; // vtbl indices that interleave the two 8-byte halves of a 16-byte table
+
+void TransposeUVWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ const uint8_t* src_temp;
+ asm volatile(
+ // Transposes 8 rows of interleaved byte pairs: the first byte of each
+ // pair goes to dst_a, the second to dst_b. The main loop handles 8 pairs
+ // per pass; the counter starts at w-8 so it drops below 0 when done.
+ "sub %7, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %7, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 pairs are left to transpose
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ "cmp %7, #4 \n"
+ "blt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.64 {d0}, [%0], %2 \n"
+ "vld1.64 {d1}, [%0], %2 \n"
+ "vld1.64 {d2}, [%0], %2 \n"
+ "vld1.64 {d3}, [%0], %2 \n"
+ "vld1.64 {d4}, [%0], %2 \n"
+ "vld1.64 {d5}, [%0], %2 \n"
+ "vld1.64 {d6}, [%0], %2 \n"
+ "vld1.64 {d7}, [%0] \n"
+
+ "vld1.8 {q15}, [%8] \n" // q15 (d30/d31) = shuffle table; listed in the clobbers below
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ "vst1.32 {d17[1]}, [%0], %4 \n"
+
+ "add %0, %3, #4 \n"
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ "vst1.32 {d21[1]}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ "vst1.32 {d19[1]}, [%0], %6 \n"
+
+ "add %0, %5, #4 \n"
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ "vst1.32 {d23[1]}, [%0] \n"
+
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
+ // dst_stride_b
+ "subs %7, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d2}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.64 {d1}, [%0], %6 \n"
+ "vst1.64 {d3}, [%0] \n"
+
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
+ // dst_stride_b
+ "subs %7, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
+
+ "vst1.64 {d0}, [%3] \n"
+ "vst1.64 {d1}, [%5] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15");
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_neon64.cc b/third_party/aom/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 0000000000..43c1581731
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15}; // tbl byte indices that transpose a 4x4 byte matrix
+
+void TransposeWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ const uint8_t* src_temp;
+ asm volatile(
+ // Transposes 8 source rows of `width` bytes into `width` dst rows of 8
+ // bytes. The main loop handles 8 columns per pass; the counter starts at
+ // w-8 so it drops below 0 once fewer than 8 columns remain.
+ "sub %w3, %w3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
+ "mov %0, %1 \n" // rewind src_temp to row 0 for the prefetches below
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead in row 0
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %w3, %w3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 columns are left to transpose
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ "ld1 {v2.16b}, [%4] \n" // v2 = kVTbl4x4Transpose shuffle indices
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "st1 {v3.s}[0], [%0], %6 \n"
+ "st1 {v3.s}[1], [%0], %6 \n"
+ "st1 {v3.s}[2], [%0], %6 \n"
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ "st1 {v0.s}[0], [%0], %6 \n"
+ "st1 {v0.s}[1], [%0], %6 \n"
+ "st1 {v0.s}[2], [%0], %6 \n"
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %w3, %w3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v3.8b}, [%0], %6 \n"
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %w3, %w3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ "ld1 {v0.b}[7], [%1] \n"
+
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+}
+
+static const uint8_t kVTbl4x4TransposeDi[32] = {
+ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, // even byte offsets of a 4-register tbl table
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; // odd byte offsets of the same table
+
+void TransposeUVWx8_NEON(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ const uint8_t* src_temp;
+ asm volatile(
+ // Transposes 8 rows of interleaved byte pairs: the first byte of each
+ // pair goes to dst_a, the second to dst_b. The main loop handles 8 pairs
+ // per pass; the counter starts at w-8 so it drops below 0 when done.
+ "sub %w4, %w4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 pairs are left to transpose
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
+
+ // Fetch both 16-byte index vectors with one load and no writeback:
+ "ld1 {v30.16b, v31.16b}, [%8] \n" // %8 is input-only, so it must not be post-incremented
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v16.s}[0], [%0], %6 \n"
+ "st1 {v16.s}[1], [%0], %6 \n"
+ "st1 {v16.s}[2], [%0], %6 \n"
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ "st1 {v18.s}[0], [%0], %6 \n"
+ "st1 {v18.s}[1], [%0], %6 \n"
+ "st1 {v18.s}[2], [%0], %6 \n"
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v17.s}[0], [%0], %7 \n"
+ "st1 {v17.s}[1], [%0], %7 \n"
+ "st1 {v17.s}[2], [%0], %7 \n"
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ "st1 {v19.s}[0], [%0], %7 \n"
+ "st1 {v19.s}[1], [%0], %7 \n"
+ "st1 {v19.s}[2], [%0], %7 \n"
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
+ // dst_stride_b
+ "subs %w4, %w4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ "st1 {v4.d}[0], [%0], %6 \n"
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v5.d}[0], [%0], %7 \n"
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
+ // dst_stride_b
+ "subs %w4, %w4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ "st1 {v0.d}[0], [%2] \n"
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_win.cc b/third_party/aom/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 0000000000..e887dd525c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in 8 rows of 8 bytes from the source pointer.
+ // First round: interleave the bytes of adjacent row pairs.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8] // ebp = src + 8, start of the next 8 columns
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp // src = start of the next 8 columns
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round: interleave 16-bit words.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round: interleave 32-bit dwords.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8 // width -= 8; the movq/lea below leave the flags intact for jg
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp // remember esp before aligning it
+ sub esp, 4 + 16 // 16-byte scratch slot + 4 bytes for the saved esp
+ and esp, ~15 // 16-byte align the scratch slot
+ mov [esp + 16], ecx // saved esp sits just above the scratch slot
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 4
+ // Read in 8 rows of 16 bytes from the source pointer.
+ // First round: interleave the bytes of adjacent row pairs.
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqu [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16] // edi is negated here: rewind 8 rows, advance 16 bytes
+ neg edi
+ // Second round: interleave 16-bit words.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+
+ // Third round: interleave 32-bit dwords.
+ // Write the low qword to dst_a (edx) and the high qword to dst_b (ebx).
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqu xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8 // w -= 8; the stores and lea below leave the flags intact for jg
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16] // restore the esp saved before alignment
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_any.cc b/third_party/aom/third_party/libyuv/source/row_any.cc
new file mode 100644
index 0000000000..7216373bcd
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,1562 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// memset for temp is meant to clear the source buffer (not dest) so that
+// SIMD that reads full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by examining
+// the source code for how much the source pointers are advanced.
+
+// Subsampled source size needs to be increased by 1 if the width is not even.
+// SS(width, shift) is width divided by (1 << shift), rounded up, so a trailing
+// partial sample is still counted (e.g. SS(5, 1) == 3, SS(5, 0) == 5).
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 4 planes to 1 with yuvconstants
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+//  - n = width & ~MASK is handed directly to ANY_SIMD (skipped when n == 0);
+//  - the remaining r = width & MASK elements are copied into temp, ANY_SIMD
+//    is run there on one full block of MASK + 1, and only the valid part of
+//    the result is copied back to dst_ptr.
+// temp layout (64 bytes per slot): y at 0, u at 64, v at 128, a at 192,
+// output at 256. Only the four source slots are zeroed by the memset.
+// UVSHIFT subsamples the u/v source offset and length (length rounded up via
+// SS); DUVSHIFT and BPP scale the destination offset and length.
+// The remainder pass also runs when r == 0; its copies are then zero-length.
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 5]); \
+ memset(temp, 0, 64 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 192, a_buf + n, r); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_NEON
+ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422ALPHATOARGBROW_MMI
+ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
+#endif
+#undef ANY41C
+
+// Any 3 planes to 1.
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the valid output bytes are copied back.
+// temp layout (64 bytes per slot): y at 0, u at 64, v at 128, output at 192.
+// Only the three source slots are zeroed by the memset.
+// UVSHIFT subsamples the u/v source offset and length; DUVSHIFT and BPP scale
+// the destination offset and length (lengths rounded up via SS).
+// The output slot must hold everything ANY_SIMD writes for MASK + 1 elements,
+// so a new instantiation needs its kernel's block output to fit in 64 bytes.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_MMI
+ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#endif
+#ifdef HAS_I422TOYUY2ROW_SSE2
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOYUY2ROW_MMI
+ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
+#ifdef HAS_I422TOUYVYROW_MMI
+ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
+#endif
+#ifdef HAS_BLENDPLANEROW_AVX2
+ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
+#endif
+#ifdef HAS_BLENDPLANEROW_SSSE3
+ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
+#endif
+#ifdef HAS_BLENDPLANEROW_MMI
+ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
+#endif
+#undef ANY31
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
+// Any 3 planes to 1 with yuvconstants
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the valid output bytes are copied back.
+// temp layout (128 bytes per slot): y at 0, u at 128, v at 256, output at 384.
+// Only the three source slots are zeroed by the memset.
+// When width is odd, the last copied u and v bytes are duplicated into the
+// next byte of their slots before the kernel runs, so the sample following
+// the final valid one repeats it instead of being zero.
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 4]); \
+ memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
+ MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
+#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_MMI
+ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
+ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
+#endif
+#undef ANY31C
+
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+// Same scheme as the 8 bit wrappers, but the source planes have element type
+// T: n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged, processed as one full block
+// of MASK + 1, and only the valid output bytes are copied to dst_ptr.
+// temp holds 16 elements of T per plane (y at 0, u at 16, v at 32, counted in
+// elements, not bytes); the result goes to a separate 64 byte buffer, out.
+// SBPP is the byte size used for the staging copies and the memset, so it is
+// expected to equal sizeof(T) (it is 2 for uint16_t in the uses below).
+// The staging sizes cap MASK at 15 and (MASK + 1) * BPP at 64 bytes.
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T temp[16 * 3]); \
+ SIMD_ALIGNED(uint8_t out[64]); \
+ memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r * SBPP); \
+ memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOARGBROW_MMI
+ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#undef ANY31CT
+
+// Any 2 planes to 1.
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the r valid results are copied back.
+// temp layout: y at 0 (64 bytes), uv at 64 (64 bytes), output at 128
+// (128 bytes). Only the two source slots are zeroed by the memset.
+// SBPP / SBPP2 are the bytes per element of the y / uv sources and BPP the
+// bytes per element of the destination; UVSHIFT subsamples the uv source.
+// The output slot is 128 bytes because the remainder pass writes
+// (MASK + 1) * BPP bytes, which is 96 for the MASK 31 / BPP 3 instantiation
+// (NV21ToYUV24Row_Any_AVX2) and would overrun a 64 byte slot.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 4]); /* 64 y + 64 uv + 128 dst */ \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_MMI
+ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_MMI
+ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_MMI
+ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MMI
+ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_MMI
+ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_MMI
+ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_MMI
+ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
+
+// Any 2 planes to 1 with yuvconstants
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the r valid results are copied back.
+// temp layout (128 bytes per slot): y at 0, uv at 128, output at 256.
+// Only the two source slots are zeroed by the memset.
+// SBPP / SBPP2 are the bytes per element of the y / uv sources and BPP the
+// bytes per element of the destination; UVSHIFT subsamples the uv source
+// offset and length (length rounded up via SS).
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_MMI
+ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_SSSE3
+ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_AVX2
+ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV21TOARGBROW_NEON
+ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_MMI
+ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_MMI
+ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_MMI
+ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_MMI
+ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7)
+#endif
+#undef ANY21C
+
+// Any 1 to 1.
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the r valid results are copied back.
+// temp layout (128 bytes per slot): source at 0, output at 128. Only the
+// source slot is zeroed by the memset.
+// SBPP and BPP are the bytes per source and destination element. With
+// UVSHIFT == 1 the source is addressed in pairs: the offset is (n >> 1) * SBPP
+// and the copied length is SS(r, 1) * SBPP, so SBPP then covers two elements
+// (as in the YUY2/UYVY uses below, which pass UVSHIFT 1 and SBPP 4).
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+#endif
+// NOTE(review): the ARGB1555 wrapper below is gated on the ARGB4444 feature
+// macro rather than on a macro of its own - confirm that the header defining
+// these HAS_* macros always enables the two AVX2 kernels together.
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_SSSE3)
+ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
+ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
+#endif
+#if defined(HAS_RAWTORGB24ROW_NEON)
+ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_MMI
+ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_MMI
+ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_MMI
+ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+// NOTE(review): MASK is 7 here, while the ARGB, BGRA and RGBA ToY MSA wrappers
+// in this list use 15 - confirm against the number of elements that
+// ABGRToYRow_MSA actually processes per iteration.
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_MMI
+ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYROW_MMI
+ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_MMI
+ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_MMI
+ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_MMI
+ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_MMI
+ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_MMI
+ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_MMI
+ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MMI
+ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_MMI
+ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_MMI
+ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_MMI
+ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_MMI
+ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_MMI
+ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_MMI
+ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
+ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
+ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#endif
+#undef ANY11
+
+// Any 1 to 1 blended. Destination is read, modify, write.
+// Generates NAMEANY, a wrapper that lets ANY_SIMD be used for any width:
+// n = width & ~MASK goes straight to ANY_SIMD (skipped when n == 0); the
+// remaining r = width & MASK elements are staged in temp, processed as one
+// full block of MASK + 1, and only the r valid results are copied back.
+// temp layout (64 bytes per slot): source at 0, destination at 64. Both slots
+// are zeroed, and the existing r destination elements are copied into the
+// destination slot before the kernel runs because the kernel reads them.
+// SBPP and BPP are the bytes per source and destination element; UVSHIFT
+// subsamples the source offset and length.
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYALPHAROW_MMI
+ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
+#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
+ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
+#endif
+#undef ANY11B
+
+// Any 1 to 1 with parameter.
+// Expands to NAMEANY(src_ptr, dst_ptr, param, width), a wrapper that accepts
+// any width. param (of type T) is forwarded unchanged to ANY_SIMD.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels are copied to temp[0], converted as a
+// full batch of MASK + 1 pixels into temp[64], and only r * BPP result
+// bytes are copied to the destination.
+// SBPP and BPP are the source and destination bytes per pixel.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, param, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ANY11P(I400ToARGBRow_Any_MMI,
+ I400ToARGBRow_MMI,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+ ARGBToRGB565DitherRow_SSE2,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+ ARGBToRGB565DitherRow_AVX2,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+ ARGBToRGB565DitherRow_NEON,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+ ARGBToRGB565DitherRow_MSA,
+ const uint32_t,
+ 4,
+ 2,
+ 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ANY11P(ARGBToRGB565DitherRow_Any_MMI,
+ ARGBToRGB565DitherRow_MMI,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MMI
+ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+// Expands to NAMEANY(src_ptr, dst_ptr, scale, width), a wrapper that accepts
+// any width. The pointers are typed (STYPE / DTYPE), so "+ n" advances by
+// elements, while SBPP and BPP are used as the byte size of one source and
+// one destination element in the memset / memcpy lengths.
+// - n = width & ~MASK elements are handed straight to ANY_SIMD.
+// - r = width & MASK leftover elements are copied into temp, converted as a
+// full batch of MASK + 1 elements into out, and r * BPP bytes are copied
+// to the destination. temp and out hold 32 elements each, which covers the
+// largest MASK used below (31).
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+ SIMD_ALIGNED(STYPE temp[32]); \
+ SIMD_ALIGNED(DTYPE out[32]); \
+ memset(temp, 0, 32 * SBPP); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+ } \
+ memcpy(temp, src_ptr + n, r * SBPP); \
+ ANY_SIMD(temp, out, scale, MASK + 1); \
+ memcpy(dst_ptr + n, out, r * BPP); \
+ }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+ Convert16To8Row_SSSE3,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+ Convert16To8Row_AVX2,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+ Convert8To16Row_SSE2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+ Convert8To16Row_AVX2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with a float parameter and typed source / destination elements.
+// Expands to NAMEANY(src_ptr, dst_ptr, param, width), a wrapper that accepts
+// any width. The pointers are typed (ST / T), so "+ n" advances by elements;
+// SBPP and BPP are the size in BYTES of one source and one destination
+// element and must equal sizeof(ST) and sizeof(T), because they size the
+// memset / memcpy of the remainder.
+// - n = width & ~MASK elements are handed straight to ANY_SIMD.
+// - r = width & MASK leftover elements are copied into temp, converted as a
+// full batch of MASK + 1 elements into out, and r * BPP bytes are copied
+// to the destination.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
+ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+ SIMD_ALIGNED(ST temp[32]); \
+ SIMD_ALIGNED(T out[32]); \
+ memset(temp, 0, SBPP * 32); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n, r * SBPP); \
+ ANY_SIMD(temp, out, param, MASK + 1); \
+ memcpy(dst_ptr + n, out, r * BPP); \
+ }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+ HalfFloat1Row_F16C,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+ HalfFloat1Row_NEON,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+// The destination element is a 4-byte float, so BPP is 4: the remainder
+// copy-back must move r * sizeof(float) bytes or the trailing floats are
+// truncated.
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 4, 7)
+#endif
+#undef ANY11P16
+
+// Any 1 to 1 with yuvconstants
+// Expands to NAMEANY(src_ptr, dst_ptr, yuvconstants, width), a wrapper that
+// accepts any width. yuvconstants is forwarded unchanged to ANY_SIMD.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels are staged at temp[0], converted as a
+// full batch of MASK + 1 pixels into temp[128], and r * BPP bytes are
+// copied to the destination.
+// The source position is (n >> UVSHIFT) * SBPP and the staged length is
+// SS(r, UVSHIFT) * SBPP; every use below passes UVSHIFT = 1 with SBPP = 4.
+// NOTE(review): SS is defined outside this section - presumably a round-up
+// shift; confirm.
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_MMI)
+ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+// Expands to NAMEANY(dst_ptr, src_ptr, src_stride_ptr, width,
+// source_y_fraction), a wrapper that accepts any width. Note the destination
+// comes first in this signature.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels of the first row are staged at temp[0]
+// and those of the second row (src_ptr + src_stride_ptr) at temp[64].
+// ANY_SIMD is then called with a stride of 64 to match that layout, writes
+// MASK + 1 pixels to temp[128], and r * BPP bytes are copied out.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
+ ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_MMI
+ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+// Expands to NAMEANY(src_ptr, dst_ptr, width), a wrapper that accepts any
+// width. Because the row is reversed, the split is the other way round from
+// the other wrappers in this file:
+// - ANY_SIMD processes the LAST n = width & ~MASK source pixels (starting at
+// src_ptr + r * BPP) and writes them to the start of dst_ptr.
+// - the FIRST r = width & MASK source pixels are staged at temp[0] and
+// mirrored as a full batch of MASK + 1 pixels into temp[64]. After the
+// reversal they occupy the last r pixels of that batch, i.e. they start at
+// temp + 64 + (MASK + 1 - r) * BPP, and are copied to dst_ptr + n * BPP.
+// BPP is the bytes per pixel of both source and destination.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr, r* BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+ }
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
+#ifdef HAS_MIRRORROW_MMI
+ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_MMI
+ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+// Expands to NAMEANY(dst_ptr, v32, width), a fill wrapper that accepts any
+// width. v32 (of type T) is forwarded unchanged to ANY_SIMD.
+// - n = width & ~MASK elements are filled directly in dst_ptr.
+// - for the r = width & MASK leftover elements, ANY_SIMD fills a full batch
+// of MASK + 1 elements in temp and r * BPP bytes are copied out.
+// BPP is the bytes per destination element.
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+ }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MMI
+ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+// Expands to NAMEANY(src_ptr, dst_u, dst_v, width), a wrapper that accepts
+// any width.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels are staged at temp[0]; ANY_SIMD runs on
+// a full batch of MASK + 1 pixels, writing U to temp[128] and V to
+// temp[256], and SS(r, DUVSHIFT) bytes of each are copied out.
+// BPP is the source bytes per pixel. UVSHIFT right-shifts the source pixel
+// position and DUVSHIFT the output position (1 for the 4:2:2 extractors
+// below, 0 otherwise).
+// NOTE(review): SS is defined outside this section - presumably a round-up
+// shift; confirm.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_MMI
+ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_MMI
+ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
+ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
+ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 3. Outputs RGB planes.
+// Expands to NAMEANY(src_ptr, dst_r, dst_g, dst_b, width), a wrapper that
+// accepts any width.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels are staged at temp[0] (up to 16 * 3
+// bytes); ANY_SIMD runs on a full batch of MASK + 1 pixels, writing the
+// three planes to temp + 16 * 3, temp + 16 * 4 and temp + 16 * 5, and r
+// bytes of each plane are copied out.
+// BPP is the source bytes per pixel. The buffer layout assumes MASK <= 15.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, int width) { \
+ SIMD_ALIGNED(uint8_t temp[16 * 6]); \
+ memset(temp, 0, 16 * 3); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
+ } \
+ memcpy(temp, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
+ memcpy(dst_r + n, temp + 16 * 3, r); \
+ memcpy(dst_g + n, temp + 16 * 4, r); \
+ memcpy(dst_b + n, temp + 16 * 5, r); \
+ }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_MMI
+ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#endif
+#undef ANY13
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// 128 byte row allows for 32 avx ARGB pixels.
+// Expands to NAMEANY(src_ptr, src_stride_ptr, dst_u, dst_v, width), a wrapper
+// that accepts any width.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels of the first row are staged at temp[0]
+// and those of the second row (src_ptr + src_stride_ptr) at temp[128].
+// ANY_SIMD is then called with a stride of 128 to match that layout and
+// writes U to temp[256] and V to temp[384].
+// - when width is odd and UVSHIFT is 0, the last staged pixel of each row is
+// duplicated so the final output sample has a complete pixel pair.
+// Output is horizontally subsampled by 2: results land at dst + (n >> 1) and
+// SS(r, 1) bytes are copied per plane. BPP is the source bytes per pixel.
+// NOTE(review): SS is defined outside this section - presumably a round-up
+// shift; confirm.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
+ uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 4]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
+ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_AVX2
+ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_MMI
+ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVJROW_MMI
+ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_MMI
+ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_MMI
+ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_MMI
+ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_MMI
+ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_MMI
+ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_MMI
+ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_MMI
+ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_MMI
+ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_MMI
+ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MMI
+ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#endif
+#undef ANY12S
+
+// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+// Expands to NAMEANY(src_ptr, src_stride_ptr, dst_vu, width), a wrapper that
+// accepts any width.
+// - n = width & ~MASK pixels are handed straight to ANY_SIMD.
+// - r = width & MASK leftover pixels of the first row are staged at temp[0]
+// and those of the second row (src_ptr + src_stride_ptr) at temp[128].
+// ANY_SIMD is then called with a stride of 128 to match that layout and
+// writes its single output row to temp[256].
+// - when width is odd and UVSHIFT is 0, the last staged pixel of each row is
+// duplicated so the final output sample has a complete pixel pair.
+// The output holds 2 bytes per pair of source pixels: results land at
+// dst_vu + (n >> 1) * 2 and SS(r, 1) * 2 bytes are copied out.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_common.cc b/third_party/aom/third_party/libyuv/source/row_common.cc
new file mode 100644
index 0000000000..79aed5c787
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_common.cc
@@ -0,0 +1,3849 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <stdio.h>
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following ifdef from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point.
+// Only set for MSVC x86 builds (32-bit, or 64-bit when not using clang) with
+// x86 optimizations enabled.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+#define LIBYUV_RGB7 1
+#endif
+
+// Set on every 32- and 64-bit x86 target, for any compiler.
+// NOTE(review): the code consuming these two switches is outside this
+// section; the names suggest PAVGB-style averaging and truncating rounding
+// in the C UV paths - confirm at the use sites.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86)
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+// Returns v when v >= 0, otherwise 0. -(v >= 0) is an all-ones mask for
+// non-negative v and zero for negative v.
+static __inline int32_t clamp0(int32_t v) {
+ return -(v >= 0) & v;
+}
+// TODO(fbarchard): make clamp255 preserve negative values.
+// Returns 255 when v >= 255, otherwise v & 255. Negative inputs are therefore
+// wrapped into 0..255 instead of being passed through as the ternary variant
+// below does, so apply clamp0 first when v may be negative.
+static __inline int32_t clamp255(int32_t v) {
+ return (-(v >= 255) | v) & 255;
+}
+
+// Returns 1023 when v >= 1023, otherwise v & 1023 (same masking caveat for
+// negative inputs as clamp255).
+static __inline int32_t clamp1023(int32_t v) {
+ return (-(v >= 1023) | v) & 1023;
+}
+
+// Absolute value without a branch: m is -1 for negative v and 0 otherwise,
+// so (v + m) ^ m evaluates to -v for negative v and to v otherwise.
+static __inline uint32_t Abs(int32_t v) {
+ int m = -(v < 0);
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+// Ternary variants of the helpers above. They only bound one side: clamp255
+// and clamp1023 return negative inputs unchanged.
+static __inline int32_t clamp0(int32_t v) {
+ return (v < 0) ? 0 : v;
+}
+
+static __inline int32_t clamp255(int32_t v) {
+ return (v > 255) ? 255 : v;
+}
+
+static __inline int32_t clamp1023(int32_t v) {
+ return (v > 1023) ? 1023 : v;
+}
+
+static __inline uint32_t Abs(int32_t v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
+// Bounds val to 0..255 by applying clamp0 and then clamp255.
+static __inline uint32_t Clamp(int32_t val) {
+  const int non_negative = clamp0(val);
+  const int32_t bounded = clamp255(non_negative);
+  return (uint32_t)bounded;
+}
+
+// Bounds val to 0..1023 by applying clamp0 and then clamp1023.
+static __inline uint32_t Clamp10(int32_t val) {
+  const int non_negative = clamp0(val);
+  const int32_t bounded = clamp1023(non_negative);
+  return (uint32_t)bounded;
+}
+
+// Little Endian
+// WRITEWORD(p, v) stores the 32-bit value v at p with the least significant
+// byte at the lowest address. On the targets matched below (x86, ARM, or any
+// compiler reporting little-endian byte order) this is a direct store through
+// a uint32_t* cast; on every other target the four bytes are written one at
+// a time so the byte order stays the same.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define WRITEWORD(p, v) *(uint32_t*)(p) = v
+#else
+static inline void WRITEWORD(uint8_t* p, uint32_t v) {
+ p[0] = (uint8_t)(v & 255);
+ p[1] = (uint8_t)((v >> 8) & 255);
+ p[2] = (uint8_t)((v >> 16) & 255);
+ p[3] = (uint8_t)((v >> 24) & 255);
+}
+#endif
+
+// Converts a row of 3-byte RGB24 pixels to 4-byte ARGB pixels. The three
+// color bytes keep their order and an alpha byte of 255 is appended.
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t blue = src_rgb24[0];
+    const uint8_t green = src_rgb24[1];
+    const uint8_t red = src_rgb24[2];
+    src_rgb24 += 3;
+    dst_argb[0] = blue;
+    dst_argb[1] = green;
+    dst_argb[2] = red;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of 3-byte RAW pixels to 4-byte ARGB pixels. The first and
+// third source bytes swap places and an alpha byte of 255 is appended.
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t red = src_raw[0];
+    const uint8_t green = src_raw[1];
+    const uint8_t blue = src_raw[2];
+    src_raw += 3;
+    dst_argb[0] = blue;
+    dst_argb[1] = green;
+    dst_argb[2] = red;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of 3-byte RAW pixels to 4-byte RGBA pixels. Each output
+// pixel is an alpha byte of 255 followed by the three source bytes in
+// reverse order.
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t red = src_raw[0];
+    const uint8_t green = src_raw[1];
+    const uint8_t blue = src_raw[2];
+    src_raw += 3;
+    dst_rgba[0] = 255u;
+    dst_rgba[1] = blue;
+    dst_rgba[2] = green;
+    dst_rgba[3] = red;
+    dst_rgba += 4;
+  }
+}
+
+// Converts a row of 3-byte RAW pixels to 3-byte RGB24 pixels by swapping the
+// first and third byte of every pixel.
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t red = src_raw[0];
+    const uint8_t green = src_raw[1];
+    const uint8_t blue = src_raw[2];
+    src_raw += 3;
+    dst_rgb24[0] = blue;
+    dst_rgb24[1] = green;
+    dst_rgb24[2] = red;
+    dst_rgb24 += 3;
+  }
+}
+
+// Converts a row of RGB565 pixels (2 bytes each, low byte first) to ARGB.
+// The 5/6/5-bit channels are widened to 8 bits by replicating their top bits
+// into the vacated low bits; alpha is set to 255.
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
+                       uint8_t* dst_argb,
+                       int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const unsigned packed = src_rgb565[0] | ((unsigned)src_rgb565[1] << 8);
+    const unsigned blue5 = packed & 0x1f;
+    const unsigned green6 = (packed >> 5) & 0x3f;
+    const unsigned red5 = packed >> 11;
+    src_rgb565 += 2;
+    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
+    dst_argb[1] = (uint8_t)((green6 << 2) | (green6 >> 4));
+    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of ARGB1555 pixels (2 bytes each, low byte first) to ARGB.
+// Each 5-bit channel is widened to 8 bits by replicating its top bits; the
+// single alpha bit becomes 0 or 255.
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
+                         int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const unsigned packed = src_argb1555[0] | ((unsigned)src_argb1555[1] << 8);
+    const unsigned blue5 = packed & 0x1f;
+    const unsigned green5 = (packed >> 5) & 0x1f;
+    const unsigned red5 = (packed >> 10) & 0x1f;
+    const unsigned alpha1 = packed >> 15;
+    src_argb1555 += 2;
+    dst_argb[0] = (uint8_t)((blue5 << 3) | (blue5 >> 2));
+    dst_argb[1] = (uint8_t)((green5 << 3) | (green5 >> 2));
+    dst_argb[2] = (uint8_t)((red5 << 3) | (red5 >> 2));
+    dst_argb[3] = alpha1 ? 255u : 0u;
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of ARGB4444 pixels (2 bytes each) to ARGB. Every 4-bit
+// channel is widened to 8 bits by repeating the nibble (x * 0x11).
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
+                         int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const unsigned low = src_argb4444[0];
+    const unsigned high = src_argb4444[1];
+    src_argb4444 += 2;
+    dst_argb[0] = (uint8_t)((low & 0x0f) * 0x11);
+    dst_argb[1] = (uint8_t)((low >> 4) * 0x11);
+    dst_argb[2] = (uint8_t)((high & 0x0f) * 0x11);
+    dst_argb[3] = (uint8_t)((high >> 4) * 0x11);
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of AR30 pixels (one 32-bit word each: three 10-bit channels
+// plus 2 bits of alpha) to 8-bit ARGB. The top 8 bits of each 10-bit channel
+// are kept and the 2 alpha bits are replicated to 8 bits. Words are read and
+// written in host byte order.
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    uint32_t packed;
+    uint32_t argb;
+    memcpy(&packed, src_ar30, sizeof packed);
+    argb = (packed >> 2) & 0xff;
+    argb |= ((packed >> 12) & 0xff) << 8;
+    argb |= ((packed >> 22) & 0xff) << 16;
+    argb |= ((packed >> 30) * 0x55) << 24;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = argb;
+    src_ar30 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Converts a row of AR30 pixels (one 32-bit word each: three 10-bit channels
+// plus 2 bits of alpha) to 8-bit ABGR. Same as the ARGB conversion except
+// that the lowest and highest color channels trade places in the output
+// word. Words are read and written in host byte order.
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    uint32_t packed;
+    uint32_t abgr;
+    memcpy(&packed, src_ar30, sizeof packed);
+    abgr = (packed >> 22) & 0xff;
+    abgr |= ((packed >> 12) & 0xff) << 8;
+    abgr |= ((packed >> 2) & 0xff) << 16;
+    abgr |= ((packed >> 30) * 0x55) << 24;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = abgr;
+    src_ar30 += 4;
+    dst_abgr += 4;
+  }
+}
+
+// Converts a row of AR30 pixels to AB30 by exchanging the 10-bit fields at
+// bits 0..9 and 20..29 of every 32-bit word. The middle field and the 2
+// alpha bits (mask 0xc00ffc00) stay in place. Words are read and written in
+// host byte order.
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    uint32_t packed;
+    uint32_t swapped;
+    memcpy(&packed, src_ar30, sizeof packed);
+    swapped = packed & 0xc00ffc00;
+    swapped |= (packed >> 20) & 0x3ff;
+    swapped |= (packed & 0x3ff) << 20;
+    *(uint32_t*)(dst_ab30) = swapped;
+    src_ar30 += 4;
+    dst_ab30 += 4;
+  }
+}
+
+// Converts a row of 4-byte ARGB pixels to 3-byte RGB24 pixels by copying the
+// first three bytes of each pixel and dropping the fourth (alpha).
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t blue = src_argb[0];
+    const uint8_t green = src_argb[1];
+    const uint8_t red = src_argb[2];
+    src_argb += 4;
+    dst_rgb[0] = blue;
+    dst_rgb[1] = green;
+    dst_rgb[2] = red;
+    dst_rgb += 3;
+  }
+}
+
+// Converts a row of 4-byte ARGB pixels to 3-byte RAW pixels: the first three
+// bytes of each pixel are written in reverse order and the fourth (alpha)
+// is dropped.
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t blue = src_argb[0];
+    const uint8_t green = src_argb[1];
+    const uint8_t red = src_argb[2];
+    src_argb += 4;
+    dst_rgb[0] = red;
+    dst_rgb[1] = green;
+    dst_rgb[2] = blue;
+    dst_rgb += 3;
+  }
+}
+
+// Converts a row of ARGB pixels to RGB565 (2 bytes per pixel, alpha dropped).
+// Each output pixel is b5 | g6 << 5 | r5 << 11 and is stored low byte first,
+// which is the layout RGB565ToARGBRow_C reads back.
+//
+// Every pixel is written as two explicit bytes. This keeps the byte order of
+// an odd-width row's last pixel identical to all others on big-endian hosts,
+// avoids 16/32-bit stores through a possibly misaligned uint8_t pointer, and
+// does the packing in unsigned arithmetic so no shift reaches the sign bit
+// of an int. Output bytes on little-endian hosts are unchanged.
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t b = src_argb[0] >> 3;
+    const uint32_t g = src_argb[1] >> 2;
+    const uint32_t r = src_argb[2] >> 3;
+    const uint32_t rgb565 = b | (g << 5) | (r << 11);
+    dst_rgb[0] = (uint8_t)(rgb565 & 255);
+    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
+    dst_rgb += 2;
+    src_argb += 4;
+  }
+}
+
+// dither4 is a row of 4 values from 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The order in the 4x4 matrix in first byte is upper left.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect order of the original matrix. But the dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
+//
+// Pixel x adds byte (x & 3) of dither4 to its B, G and R values, saturates
+// each sum at 255 and packs the result as b5 | g6 << 5 | r5 << 11, stored
+// low byte first. Every pixel is written as two explicit bytes so that the
+// last pixel of an odd-width row has the same byte order as the others on
+// big-endian hosts, no store goes through a possibly misaligned 16/32-bit
+// pointer, and the packing stays in unsigned arithmetic. Output bytes on
+// little-endian hosts are unchanged.
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width) {
+  const unsigned char* dither = (const unsigned char*)(&dither4);
+  int x;
+  for (x = 0; x < width; ++x) {
+    const int d = dither[x & 3];
+    const uint32_t b = (uint32_t)clamp255(src_argb[0] + d) >> 3;
+    const uint32_t g = (uint32_t)clamp255(src_argb[1] + d) >> 2;
+    const uint32_t r = (uint32_t)clamp255(src_argb[2] + d) >> 3;
+    const uint32_t rgb565 = b | (g << 5) | (r << 11);
+    dst_rgb[0] = (uint8_t)(rgb565 & 255);
+    dst_rgb[1] = (uint8_t)(rgb565 >> 8);
+    dst_rgb += 2;
+    src_argb += 4;
+  }
+}
+
+// Converts a row of ARGB pixels to ARGB1555 (2 bytes per pixel). Each output
+// pixel is b5 | g5 << 5 | r5 << 10 | a1 << 15, built from the top bits of
+// every channel, and is stored low byte first - the layout
+// ARGB1555ToARGBRow_C reads back.
+//
+// Every pixel is written as two explicit bytes rather than through
+// uint32_t* / uint16_t* casts. That keeps the byte and pixel order correct
+// on big-endian hosts, avoids misaligned stores, and does the packing in
+// unsigned arithmetic so no shift reaches the sign bit of an int. Output
+// bytes on little-endian hosts are unchanged.
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t b = src_argb[0] >> 3;
+    const uint32_t g = src_argb[1] >> 3;
+    const uint32_t r = src_argb[2] >> 3;
+    const uint32_t a = src_argb[3] >> 7;
+    const uint32_t argb1555 = b | (g << 5) | (r << 10) | (a << 15);
+    dst_rgb[0] = (uint8_t)(argb1555 & 255);
+    dst_rgb[1] = (uint8_t)(argb1555 >> 8);
+    dst_rgb += 2;
+    src_argb += 4;
+  }
+}
+
+// Convert a row of ARGB (bytes B, G, R, A) to ARGB4444.
+// Each 16 bit pixel keeps the top 4 bits of every channel: B in bits 0..3,
+// G in bits 4..7, R in bits 8..11 and A in bits 12..15. Pixels are written
+// two at a time as a native 32 bit word; an odd trailing pixel is written as
+// a 16 bit value.
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    // uint32_t keeps the packing unsigned: (a1 << 28) on an int would
+    // overflow for a1 >= 8.
+    uint32_t b0 = src_argb[0] >> 4;
+    uint32_t g0 = src_argb[1] >> 4;
+    uint32_t r0 = src_argb[2] >> 4;
+    uint32_t a0 = src_argb[3] >> 4;
+    uint32_t b1 = src_argb[4] >> 4;
+    uint32_t g1 = src_argb[5] >> 4;
+    uint32_t r1 = src_argb[6] >> 4;
+    uint32_t a1 = src_argb[7] >> 4;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+                            (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    uint32_t b0 = src_argb[0] >> 4;
+    uint32_t g0 = src_argb[1] >> 4;
+    uint32_t r0 = src_argb[2] >> 4;
+    uint32_t a0 = src_argb[3] >> 4;
+    *(uint16_t*)(dst_rgb) = (uint16_t)(b0 | (g0 << 4) | (r0 << 8) | (a0 << 12));
+  }
+}
+
+// Convert a row of 8 bit ABGR to 2:10:10:10 AR30.
+// Colour channels are widened from 8 to 10 bits by replicating their top two
+// bits into the low end; alpha keeps only its top two bits. Source byte 2
+// lands in bits 0..9, byte 1 in bits 10..19, byte 0 in bits 20..29 and alpha
+// in bits 30..31 of each native 32 bit output word.
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint32_t c0 = src_abgr[0];
+    const uint32_t c1 = src_abgr[1];
+    const uint32_t c2 = src_abgr[2];
+    const uint32_t hi10 = (c0 >> 6) | (c0 << 2);
+    const uint32_t mid10 = (c1 >> 6) | (c1 << 2);
+    const uint32_t lo10 = (c2 >> 6) | (c2 << 2);
+    const uint32_t alpha2 = (uint32_t)(src_abgr[3] >> 6);
+    *(uint32_t*)(dst_ar30) =
+        lo10 | (mid10 << 10) | (hi10 << 20) | (alpha2 << 30);
+    src_abgr += 4;
+    dst_ar30 += 4;
+  }
+}
+
+// Convert a row of 8 bit ARGB to 2:10:10:10 AR30.
+// Colour channels are widened from 8 to 10 bits by replicating their top two
+// bits into the low end; alpha keeps only its top two bits. Source byte 0
+// lands in bits 0..9, byte 1 in bits 10..19, byte 2 in bits 20..29 and alpha
+// in bits 30..31 of each native 32 bit output word.
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint32_t c0 = src_argb[0];
+    const uint32_t c1 = src_argb[1];
+    const uint32_t c2 = src_argb[2];
+    const uint32_t lo10 = (c0 >> 6) | (c0 << 2);
+    const uint32_t mid10 = (c1 >> 6) | (c1 << 2);
+    const uint32_t hi10 = (c2 >> 6) | (c2 << 2);
+    const uint32_t alpha2 = (uint32_t)(src_argb[3] >> 6);
+    *(uint32_t*)(dst_ar30) =
+        lo10 | (mid10 << 10) | (hi10 << 20) | (alpha2 << 30);
+    src_argb += 4;
+    dst_ar30 += 4;
+  }
+}
+
+#ifdef LIBYUV_RGB7
+// Legacy 7 bit fixed point luma, kept for compatibility on unsupported
+// platforms. The +16 offset is added after the shift.
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  const int weighted = 33 * r + 65 * g + 13 * b;
+  return (weighted >> 7) + 16;
+}
+#else
+// 8 bit fixed point luma. 0x1080 is the +16 offset (0x1000) plus 0x0080 for
+// rounding.
+// Intel SSE/AVX uses the equivalent formula
+//   (66 * (r - 128) + 129 * (g - 128) + 25 * (b - 128) + 0x7e80) >> 8
+// where 0x7e80 = (66 + 129 + 25) * 128 + 0x1000 + 0x0080.
+
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  const int weighted = 66 * r + 129 * g + 25 * b;
+  return (weighted + 0x1080) >> 8;
+}
+#endif
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)  // Average of a and b, halves rounded up.
+
+#ifdef LIBYUV_RGBTOU_TRUNCATE
+// Truncating chroma: the bias 0x8000 is the +128 centre with no rounding term.
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 112 * b - 74 * g - 38 * r;
+  return (chroma + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 112 * r - 94 * g - 18 * b;
+  return (chroma + 0x8000) >> 8;
+}
+#else
+// Rounding chroma: the bias 0x8080 is the +128 centre plus 0x80 for rounding.
+// TODO(fbarchard): Add rounding to SIMD and use this
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 112 * b - 74 * g - 38 * r;
+  return (chroma + 0x8080) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 112 * r - 94 * g - 18 * b;
+  return (chroma + 0x8080) >> 8;
+}
+#endif
+
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+// Chroma from channel values that are twice the averaged value (callers pass
+// a sum of two pixels, or a sum of four halved), so the coefficients are half
+// of those used by RGBToU / RGBToV.
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+  const int chroma = (112 / 2) * b - (74 / 2) * g - (38 / 2) * r;
+  return (chroma + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+  const int chroma = (112 / 2) * r - (94 / 2) * g - (18 / 2) * b;
+  return (chroma + 0x8080) >> 8;
+}
+#endif
+
+// ARGBToY_C and ARGBToUV_C: MAKEROWY(NAME, R, G, B, BPP) defines NAME##ToYRow_C and NAME##ToUVRow_C, where R/G/B are channel byte offsets within a pixel and BPP is bytes per pixel.
+// Intel version mimic SSE/AVX which does 2 pavgb: each UV sample averages a 2x2 block (this row and the row src_stride_rgb bytes later) using nested AVGB.
+#if LIBYUV_ARGBTOUV_PAVGB  // nested rounded averages, 8 bit channels
+
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+ }
+#else  // sum based averaging with half-size coefficients
+// ARM version does sum / 2 then multiply by 2x smaller coefficients: the 2x2 block is summed and halved with rounding (odd tail: the two vertical samples are summed) and passed to RGB2xToU / RGB2xToV.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
+ }
+#endif  // LIBYUV_ARGBTOUV_PAVGB
+
+MAKEROWY(ARGB, 2, 1, 0, 4)  // ARGBToYRow_C, ARGBToUVRow_C: R at byte 2, G at 1, B at 0, 4 bytes per pixel
+MAKEROWY(BGRA, 1, 2, 3, 4)  // BGRAToYRow_C, BGRAToUVRow_C
+MAKEROWY(ABGR, 0, 1, 2, 4)  // ABGRToYRow_C, ABGRToUVRow_C
+MAKEROWY(RGBA, 3, 2, 1, 4)  // RGBAToYRow_C, RGBAToUVRow_C
+MAKEROWY(RGB24, 2, 1, 0, 3)  // RGB24ToYRow_C, RGB24ToUVRow_C: 3 bytes per pixel
+MAKEROWY(RAW, 0, 1, 2, 3)  // RAWToYRow_C, RAWToUVRow_C
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 7 bit Y (deprecated)
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+#ifdef LIBYUV_RGB7
+// Legacy 7 bit fixed point full range luma, kept for compatibility on
+// unsupported platforms. 64 is the rounding term.
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  const int weighted = 38 * r + 75 * g + 15 * b;
+  return (weighted + 64) >> 7;
+}
+#else
+// 8 bit fixed point full range luma. 128 is the rounding term.
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  const int weighted = 77 * r + 150 * g + 29 * b;
+  return (weighted + 128) >> 8;
+}
+#endif
+
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
+// Full range chroma from 8 bit channel values. The bias 0x8080 is the +128
+// centre plus 0x80 for rounding.
+static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 127 * b - 84 * g - 43 * r;
+  return (chroma + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
+  const int chroma = 127 * r - 107 * g - 20 * b;
+  return (chroma + 0x8080) >> 8;
+}
+#else
+// Full range chroma from channel values that are twice the averaged value,
+// using half-size coefficients.
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+  const int chroma = (127 / 2) * b - (84 / 2) * g - (43 / 2) * r;
+  return (chroma + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+  const int chroma = (127 / 2) * r - (107 / 2) * g - (20 / 2) * b;
+  return (chroma + 0x8080) >> 8;
+}
+#endif
+
+// ARGBToYJ_C and ARGBToUVJ_C: MAKEROWYJ(NAME, R, G, B, BPP) defines NAME##ToYJRow_C and NAME##ToUVJRow_C, where R/G/B are channel byte offsets within a pixel and BPP is bytes per pixel.
+// Intel version mimic SSE/AVX which does 2 pavgb: each UV sample averages a 2x2 block (this row and the row src_stride_rgb bytes later) using nested AVGB.
+#if LIBYUV_ARGBTOUV_PAVGB  // nested rounded averages, 8 bit channels
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+ }
+#else  // sum based averaging with half-size coefficients
+// ARM version does sum / 2 then multiply by 2x smaller coefficients: the 2x2 block is summed and halved with rounding (odd tail: the two vertical samples are summed) and passed to RGB2xToUJ / RGB2xToVJ.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif  // LIBYUV_ARGBTOUV_PAVGB
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)  // ARGBToYJRow_C, ARGBToUVJRow_C: R at byte 2, G at 1, B at 0, 4 bytes per pixel
+MAKEROWYJ(RGBA, 3, 2, 1, 4)  // RGBAToYJRow_C, RGBAToUVJRow_C
+MAKEROWYJ(RGB24, 2, 1, 0, 3)  // RGB24ToYJRow_C, RGB24ToUVJRow_C: 3 bytes per pixel
+MAKEROWYJ(RAW, 0, 1, 2, 3)  // RAWToYJRow_C, RAWToUVJRow_C
+#undef MAKEROWYJ
+
+// Compute Y for a row of RGB565 pixels (2 bytes each, low byte first).
+// The 5/6/5 bit channels are widened to 8 bits by replicating their top bits
+// before RGBToY is applied.
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint8_t lo = src_rgb565[0];
+    const uint8_t hi = src_rgb565[1];
+    const uint8_t b5 = lo & 0x1f;
+    const uint8_t g6 = (lo >> 5) | ((hi & 0x07) << 3);
+    const uint8_t r5 = hi >> 3;
+    const uint8_t b8 = (b5 << 3) | (b5 >> 2);
+    const uint8_t g8 = (g6 << 2) | (g6 >> 4);
+    const uint8_t r8 = (r5 << 3) | (r5 >> 2);
+    *dst_y++ = RGBToY(r8, g8, b8);
+    src_rgb565 += 2;
+  }
+}
+
+// Compute Y for a row of ARGB1555 pixels (2 bytes each, low byte first).
+// The 5 bit colour channels are widened to 8 bits by replicating their top
+// bits before RGBToY is applied; the alpha bit is ignored.
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint8_t lo = src_argb1555[0];
+    const uint8_t hi = src_argb1555[1];
+    const uint8_t b5 = lo & 0x1f;
+    const uint8_t g5 = (lo >> 5) | ((hi & 0x03) << 3);
+    const uint8_t r5 = (hi & 0x7c) >> 2;
+    const uint8_t b8 = (b5 << 3) | (b5 >> 2);
+    const uint8_t g8 = (g5 << 3) | (g5 >> 2);
+    const uint8_t r8 = (r5 << 3) | (r5 >> 2);
+    *dst_y++ = RGBToY(r8, g8, b8);
+    src_argb1555 += 2;
+  }
+}
+
+// Compute Y for a row of ARGB4444 pixels (2 bytes each, low byte first).
+// Each 4 bit colour channel is widened to 8 bits by duplicating the nibble
+// before RGBToY is applied; the alpha nibble is ignored.
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint8_t b4 = src_argb4444[0] & 0x0f;
+    const uint8_t g4 = src_argb4444[0] >> 4;
+    const uint8_t r4 = src_argb4444[1] & 0x0f;
+    const uint8_t b8 = (b4 << 4) | b4;
+    const uint8_t g8 = (g4 << 4) | g4;
+    const uint8_t r8 = (r4 << 4) | r4;
+    *dst_y++ = RGBToY(r8, g8, b8);
+    src_argb4444 += 2;
+  }
+}
+
+// Compute one U and one V sample per 2x2 block of RGB565 pixels.
+// The second source row starts src_stride_rgb565 bytes after src_rgb565.
+// Channels are widened to 8 bits by bit replication, combined across the
+// block and converted. For odd widths the last column uses only the two
+// vertically adjacent pixels.
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
+  // 8 bit channels of the block: [0], [1] = this row, [2], [3] = next row.
+  uint8_t b[4];
+  uint8_t g[4];
+  uint8_t r[4];
+  int x;
+  int i;
+  for (x = 0; x < width - 1; x += 2) {
+    for (i = 0; i < 4; ++i) {
+      const uint8_t* p = ((i < 2) ? src_rgb565 : next_rgb565) + (i & 1) * 2;
+      const uint8_t b5 = p[0] & 0x1f;
+      const uint8_t g6 = (p[0] >> 5) | ((p[1] & 0x07) << 3);
+      const uint8_t r5 = p[1] >> 3;
+      b[i] = (b5 << 3) | (b5 >> 2);
+      g[i] = (g6 << 2) | (g6 >> 4);
+      r[i] = (r5 << 3) | (r5 >> 2);
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
+      uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
+      uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
+      uint16_t sg = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
+      uint16_t sr = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+
+    src_rgb565 += 4;
+    next_rgb565 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    // Only slots [0] (this row) and [2] (next row) are filled and used.
+    for (i = 0; i < 4; i += 2) {
+      const uint8_t* p = (i < 2) ? src_rgb565 : next_rgb565;
+      const uint8_t b5 = p[0] & 0x1f;
+      const uint8_t g6 = (p[0] >> 5) | ((p[1] & 0x07) << 3);
+      const uint8_t r5 = p[1] >> 3;
+      b[i] = (b5 << 3) | (b5 >> 2);
+      g[i] = (g6 << 2) | (g6 >> 4);
+      r[i] = (r5 << 3) | (r5 >> 2);
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(b[0], b[2]);
+      uint8_t ag = AVGB(g[0], g[2]);
+      uint8_t ar = AVGB(r[0], r[2]);
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = b[0] + b[2];
+      uint16_t sg = g[0] + g[2];
+      uint16_t sr = r[0] + r[2];
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+  }
+}
+
+// Compute one U and one V sample per 2x2 block of ARGB1555 pixels.
+// The second source row starts src_stride_argb1555 bytes after src_argb1555.
+// The 5 bit colour channels are widened to 8 bits by bit replication,
+// combined across the block and converted; the alpha bit (bit 7 of the high
+// byte) is always masked out. For odd widths the last column uses only the
+// two vertically adjacent pixels.
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
+  // 8 bit channels of the block: [0], [1] = this row, [2], [3] = next row.
+  uint8_t b[4];
+  uint8_t g[4];
+  uint8_t r[4];
+  int x;
+  int i;
+  for (x = 0; x < width - 1; x += 2) {
+    for (i = 0; i < 4; ++i) {
+      const uint8_t* p =
+          ((i < 2) ? src_argb1555 : next_argb1555) + (i & 1) * 2;
+      const uint8_t b5 = p[0] & 0x1f;
+      const uint8_t g5 = (p[0] >> 5) | ((p[1] & 0x03) << 3);
+      const uint8_t r5 = (p[1] & 0x7c) >> 2;
+      b[i] = (b5 << 3) | (b5 >> 2);
+      g[i] = (g5 << 3) | (g5 >> 2);
+      r[i] = (r5 << 3) | (r5 >> 2);
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
+      uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
+      uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
+      uint16_t sg = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
+      uint16_t sr = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+
+    src_argb1555 += 4;
+    next_argb1555 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    // Only slots [0] (this row) and [2] (next row) are filled and used.
+    // Red of the next row is decoded with the same (x & 0x7c) >> 2 as every
+    // other pixel; a plain >> 3 would pull the alpha bit into red and drop
+    // red's low bit.
+    for (i = 0; i < 4; i += 2) {
+      const uint8_t* p = (i < 2) ? src_argb1555 : next_argb1555;
+      const uint8_t b5 = p[0] & 0x1f;
+      const uint8_t g5 = (p[0] >> 5) | ((p[1] & 0x03) << 3);
+      const uint8_t r5 = (p[1] & 0x7c) >> 2;
+      b[i] = (b5 << 3) | (b5 >> 2);
+      g[i] = (g5 << 3) | (g5 >> 2);
+      r[i] = (r5 << 3) | (r5 >> 2);
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(b[0], b[2]);
+      uint8_t ag = AVGB(g[0], g[2]);
+      uint8_t ar = AVGB(r[0], r[2]);
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = b[0] + b[2];
+      uint16_t sg = g[0] + g[2];
+      uint16_t sr = r[0] + r[2];
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+  }
+}
+
+// Compute one U and one V sample per 2x2 block of ARGB4444 pixels.
+// The second source row starts src_stride_argb4444 bytes after src_argb4444.
+// Each 4 bit colour channel is widened to 8 bits by duplicating the nibble,
+// combined across the block and converted; the alpha nibble is ignored. For
+// odd widths the last column uses only the two vertically adjacent pixels.
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
+  // 8 bit channels of the block: [0], [1] = this row, [2], [3] = next row.
+  uint8_t b[4];
+  uint8_t g[4];
+  uint8_t r[4];
+  int x;
+  int i;
+  for (x = 0; x < width - 1; x += 2) {
+    for (i = 0; i < 4; ++i) {
+      const uint8_t* p =
+          ((i < 2) ? src_argb4444 : next_argb4444) + (i & 1) * 2;
+      const uint8_t b4 = p[0] & 0x0f;
+      const uint8_t g4 = p[0] >> 4;
+      const uint8_t r4 = p[1] & 0x0f;
+      b[i] = (b4 << 4) | b4;
+      g[i] = (g4 << 4) | g4;
+      r[i] = (r4 << 4) | r4;
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(AVGB(b[0], b[2]), AVGB(b[1], b[3]));
+      uint8_t ag = AVGB(AVGB(g[0], g[2]), AVGB(g[1], g[3]));
+      uint8_t ar = AVGB(AVGB(r[0], r[2]), AVGB(r[1], r[3]));
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;
+      uint16_t sg = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
+      uint16_t sr = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+
+    src_argb4444 += 4;
+    next_argb4444 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    // Only slots [0] (this row) and [2] (next row) are filled and used.
+    for (i = 0; i < 4; i += 2) {
+      const uint8_t* p = (i < 2) ? src_argb4444 : next_argb4444;
+      const uint8_t b4 = p[0] & 0x0f;
+      const uint8_t g4 = p[0] >> 4;
+      const uint8_t r4 = p[1] & 0x0f;
+      b[i] = (b4 << 4) | b4;
+      g[i] = (g4 << 4) | g4;
+      r[i] = (r4 << 4) | r4;
+    }
+
+#if LIBYUV_ARGBTOUV_PAVGB
+    {
+      uint8_t ab = AVGB(b[0], b[2]);
+      uint8_t ag = AVGB(g[0], g[2]);
+      uint8_t ar = AVGB(r[0], r[2]);
+      dst_u[0] = RGBToU(ar, ag, ab);
+      dst_v[0] = RGBToV(ar, ag, ab);
+    }
+#else
+    {
+      uint16_t sb = b[0] + b[2];
+      uint16_t sg = g[0] + g[2];
+      uint16_t sr = r[0] + r[2];
+      dst_u[0] = RGB2xToU(sr, sg, sb);
+      dst_v[0] = RGB2xToV(sr, sg, sb);
+    }
+#endif
+  }
+}
+
+// Compute one U and one V sample for every ARGB pixel (no subsampling).
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t blue = src_argb[0];
+    const uint8_t green = src_argb[1];
+    const uint8_t red = src_argb[2];
+    *dst_u++ = RGBToU(red, green, blue);
+    *dst_v++ = RGBToV(red, green, blue);
+    src_argb += 4;
+  }
+}
+
+// Convert a row of ARGB to gray: B, G and R are all replaced by the RGBToYJ
+// luma of the pixel, alpha is copied from the source.
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint8_t luma = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    dst_argb[0] = luma;
+    dst_argb[1] = luma;
+    dst_argb[2] = luma;
+    dst_argb[3] = src_argb[3];
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Apply a sepia tone to a row of ARGB pixels in place.
+// Each output channel is a weighted sum of the input B, G, R scaled by 1/128.
+// Alpha is left untouched.
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
+  uint8_t* px = dst_argb;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining, px += 4) {
+    const int blue = px[0];
+    const int green = px[1];
+    const int red = px[2];
+    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
+    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
+    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
+    // The blue weights sum to less than 128, so blue cannot overflow and is
+    // stored without clamping.
+    px[0] = sepia_b;
+    px[1] = clamp255(sepia_g);
+    px[2] = clamp255(sepia_r);
+  }
+}
+
+// Apply a signed 4x4 color matrix to a row of ARGB pixels.
+// matrix_argb holds 16 coefficients, four per output channel in the order
+// B, G, R, A; each group is multiplied with the input B, G, R, A. The sum is
+// shifted right by 6 and passed through Clamp before being stored.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const int in_b = src_argb[0];
+    const int in_g = src_argb[1];
+    const int in_r = src_argb[2];
+    const int in_a = src_argb[3];
+    int out[4];
+    int c;
+    // Evaluate all four output channels before writing any of them.
+    for (c = 0; c < 4; ++c) {
+      const int8_t* m = matrix_argb + c * 4;
+      out[c] = (in_b * m[0] + in_g * m[1] + in_r * m[2] + in_a * m[3]) >> 6;
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = Clamp(out[c]);
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Remap every channel of a row of ARGB pixels in place through a lookup
+// table. table_argb is interleaved: channel c of table entry v is stored at
+// table_argb[v * 4 + c].
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width) {
+  uint8_t* px = dst_argb;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining, px += 4) {
+    // Read the whole pixel before writing any channel back.
+    const int old_b = px[0];
+    const int old_g = px[1];
+    const int old_r = px[2];
+    const int old_a = px[3];
+    px[0] = table_argb[old_b * 4 + 0];
+    px[1] = table_argb[old_g * 4 + 1];
+    px[2] = table_argb[old_r * 4 + 2];
+    px[3] = table_argb[old_a * 4 + 3];
+  }
+}
+
+// Remap the B, G and R channels of a row of ARGB pixels in place through a
+// lookup table, leaving alpha unchanged. table_argb is interleaved: channel
+// c of table entry v is stored at table_argb[v * 4 + c].
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width) {
+  uint8_t* px = dst_argb;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining, px += 4) {
+    // Read the colour channels before writing any of them back.
+    const int old_b = px[0];
+    const int old_g = px[1];
+    const int old_r = px[2];
+    px[0] = table_argb[old_b * 4 + 0];
+    px[1] = table_argb[old_g * 4 + 1];
+    px[2] = table_argb[old_r * 4 + 2];
+  }
+}
+
+// Quantize the B, G and R channels of a row of ARGB pixels in place.
+// Each channel v becomes (v * scale >> 16) * interval_size + interval_offset.
+// Alpha is left untouched.
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width) {
+  uint8_t* px = dst_argb;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining, px += 4) {
+    const int old_b = px[0];
+    const int old_g = px[1];
+    const int old_r = px[2];
+    px[0] = (old_b * scale >> 16) * interval_size + interval_offset;
+    px[1] = (old_g * scale >> 16) * interval_size + interval_offset;
+    px[2] = (old_r * scale >> 16) * interval_size + interval_offset;
+  }
+}
+
+// Scale every channel of a row of ARGB pixels by the matching byte of value
+// (byte 0 scales B, byte 1 G, byte 2 R, byte 3 A).
+// Both the 8 bit channel and the 8 bit factor are widened to 16 bits by
+// repeating the byte (v | v << 8, i.e. v * 0x0101); the top 8 bits of their
+// 32 bit product are stored.
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value) {
+  const uint32_t scale_b = (value & 0xff) * 0x0101u;
+  const uint32_t scale_g = ((value >> 8) & 0xff) * 0x0101u;
+  const uint32_t scale_r = ((value >> 16) & 0xff) * 0x0101u;
+  const uint32_t scale_a = (value >> 24) * 0x0101u;
+
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint32_t wide_b = src_argb[0] * 0x0101u;
+    const uint32_t wide_g = src_argb[1] * 0x0101u;
+    const uint32_t wide_r = src_argb[2] * 0x0101u;
+    const uint32_t wide_a = src_argb[3] * 0x0101u;
+    dst_argb[0] = (scale_b * wide_b) >> 24;
+    dst_argb[1] = (scale_g * wide_g) >> 24;
+    dst_argb[2] = (scale_r * wide_r) >> 24;
+    dst_argb[3] = (scale_a * wide_a) >> 24;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Multiply two rows of ARGB pixels channel by channel.
+// The channel from src_argb0 is widened to 16 bits by repeating the byte
+// (v | v << 8, i.e. v * 0x0101), multiplied by the 8 bit channel from
+// src_argb1, and bits 16..23 of the product are stored.
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const uint32_t wide_b = src_argb0[0] * 0x0101u;
+    const uint32_t wide_g = src_argb0[1] * 0x0101u;
+    const uint32_t wide_r = src_argb0[2] * 0x0101u;
+    const uint32_t wide_a = src_argb0[3] * 0x0101u;
+    const uint32_t mul_b = src_argb1[0];
+    const uint32_t mul_g = src_argb1[1];
+    const uint32_t mul_r = src_argb1[2];
+    const uint32_t mul_a = src_argb1[3];
+    dst_argb[0] = (mul_b * wide_b) >> 16;
+    dst_argb[1] = (mul_g * wide_g) >> 16;
+    dst_argb[2] = (mul_r * wide_r) >> 16;
+    dst_argb[3] = (mul_a * wide_a) >> 16;
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Add two rows of ARGB pixels channel by channel; each sum is passed through
+// clamp255 before being stored.
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int sum[4];
+    int c;
+    // Read both source pixels completely before writing the result.
+    for (c = 0; c < 4; ++c) {
+      sum[c] = src_argb1[c] + src_argb0[c];
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = clamp255(sum[c]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Subtract src_argb1 from src_argb0 channel by channel; each difference is
+// passed through clamp0 before being stored.
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int diff[4];
+    int c;
+    // Read both source pixels completely before writing the result.
+    for (c = 0; c < 4; ++c) {
+      diff[c] = src_argb0[c] - src_argb1[c];
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = clamp0(diff[c]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Sobel functions which mimics SSSE3.
+// Horizontal Sobel: for each x, the differences between column x and column
+// x + 2 of three rows are combined with weights 1, 2, 1. The absolute value
+// is passed through clamp255. Each row is read up to index width + 1.
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int top = src_y0[i] - src_y0[i + 2];
+    const int mid = src_y1[i] - src_y1[i + 2];
+    const int bot = src_y2[i] - src_y2[i + 2];
+    const int magnitude = Abs(top + mid * 2 + bot);
+    dst_sobelx[i] = (uint8_t)(clamp255(magnitude));
+  }
+}
+
+// Vertical Sobel: for each x, the differences between the two rows at columns
+// x, x + 1 and x + 2 are combined with weights 1, 2, 1. The absolute value is
+// passed through clamp255. Each row is read up to index width + 1.
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int left = src_y0[i + 0] - src_y1[i + 0];
+    const int centre = src_y0[i + 1] - src_y1[i + 1];
+    const int right = src_y0[i + 2] - src_y1[i + 2];
+    const int magnitude = Abs(left + centre * 2 + right);
+    dst_sobely[i] = (uint8_t)(clamp255(magnitude));
+  }
+}
+
+// Combine Sobel X and Sobel Y into a gray ARGB row: B, G and R all receive
+// clamp255(sobelx + sobely) and alpha is set to 255.
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int gx = src_sobelx[i];
+    const int gy = src_sobely[i];
+    const uint8_t gray = (uint8_t)(clamp255(gx + gy));
+    dst_argb[0] = gray;
+    dst_argb[1] = gray;
+    dst_argb[2] = gray;
+    dst_argb[3] = (uint8_t)(255u);
+    dst_argb += 4;
+  }
+}
+
+// Combine Sobel X and Sobel Y into a single plane:
+// dst_y[i] = clamp255(sobelx[i] + sobely[i]).
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int gx = src_sobelx[i];
+    const int gy = src_sobely[i];
+    dst_y[i] = (uint8_t)(clamp255(gx + gy));
+  }
+}
+
+// Mix Sobel X and Sobel Y into an ARGB row: Sobel Y goes to B, Sobel X to R,
+// clamp255(sobelx + sobely) to G, and alpha is set to 255.
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int gx = src_sobelx[i];
+    const int gy = src_sobely[i];
+    const int both = clamp255(gx + gy);
+    dst_argb[0] = (uint8_t)(gy);
+    dst_argb[1] = (uint8_t)(both);
+    dst_argb[2] = (uint8_t)(gx);
+    dst_argb[3] = (uint8_t)(255u);
+    dst_argb += 4;
+  }
+}
+
+// Expand a row of Y samples to ARGB: Y is copied unchanged to B, G and R,
+// and alpha is set to 255.
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint8_t luma = src_y[x];
+    dst_argb[0] = luma;
+    dst_argb[1] = luma;
+    dst_argb[2] = luma;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+  }
+}
+
+// TODO(fbarchard): Unify these structures to be platform independent.
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias. The constants below are fixed point, scaled by 64.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__) // 64 bit arm: U/V coefficients interleaved, UB and VR stored negated
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // same values with the U and V roles swapped
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#elif defined(__arm__) // 32 bit arm: U coefficients grouped before V coefficients, UB and VR stored negated
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // same values with the U and V roles swapped
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},
+ {0x0101 * YG, YG, 0, 0}};
+#else  // all other targets: one row per coefficient, signed values repeated across the row
+const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {  // same values with the U and V roles swapped
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};
+#endif  // __aarch64__ / __arm__ / other
+
+#undef BB  // release the BT.601 names so the next colorspace can redefine them
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGB 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -113 /* round(-1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {  // AArch64 build: U and V coefficients interleaved, U first
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [1] back)
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},  // G coefficients (presumably kUVToG)
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // AArch64 build: same layout as kYuvJPEGConstants with U/V and B/R swapped
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},  // negated R/B coefficients, V first
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {  // 32-bit ARM build: four U entries followed by four V entries
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [4] back)
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients (presumably kUVToG; YuvPixel reads [0] and [4])
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // 32-bit ARM build: same layout as kYuvJPEGConstants with U/V and B/R swapped
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},  // negated R/B coefficients: four V entries, then four U entries
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients in the same V-then-U grouping
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {  // #else branch (non-ARM builds): one row per coefficient or bias
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},  // U->B, interleaved with 0 for V (presumably kUVToB)
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},  // U->G and V->G pairs (presumably kUVToG)
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},  // V->R, interleaved with 0 for U (presumably kUVToR)
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {  // #else branch (non-ARM builds): same layout as kYuvJPEGConstants with U/V and B/R swapped
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},  // V->R in the slot that holds U->B in the Yuv table
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},  // U->B in the slot that holds V->R in the Yuv table
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias (first bias row)
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias (third bias row)
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+#endif
+
+#undef BB  // Drop the JPEG coefficient macros so they can be redefined for BT.709 below.
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.709 YUV to RGB reference (Y is offset by 16 and scaled by 1.164)
+// R = (Y - 16) * 1.164 - V * -1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
+
+// Y contribution to R,G,B: scale (YG) and bias (YGB), in units of 1/64.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0 (UB is limited to -128, i.e. -2.0 * 64).
+// U and V contributions to R,G,B, scaled by 64.
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR -115 /* round(-1.793 * 64) */
+
+// Per-channel bias: folds in the Y bias (YGB) and the subtraction of 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {  // AArch64 build: U and V coefficients interleaved, U first
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [1] back)
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},  // G coefficients (presumably kUVToG)
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // AArch64 build: same layout as kYuvH709Constants with U/V and B/R swapped
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},  // negated R/B coefficients, V first
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {  // 32-bit ARM build: four U entries followed by four V entries
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [4] back)
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients (presumably kUVToG; YuvPixel reads [0] and [4])
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // 32-bit ARM build: same layout as kYuvH709Constants with U/V and B/R swapped
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},  // negated R/B coefficients: four V entries, then four U entries
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients in the same V-then-U grouping
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {  // #else branch (non-ARM builds): one row per coefficient or bias
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},  // U->B, interleaved with 0 for V (presumably kUVToB)
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},  // U->G and V->G pairs (presumably kUVToG)
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},  // V->R, interleaved with 0 for U (presumably kUVToR)
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {  // #else branch (non-ARM builds): same layout as kYuvH709Constants with U/V and B/R swapped
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},  // V->R in the slot that holds U->B in the Yuv table
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},  // U->B in the slot that holds V->R in the Yuv table
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias (first bias row)
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias (third bias row)
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+#endif
+
+#undef BB  // Drop the BT.709 coefficient macros so they can be redefined for BT.2020 below.
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// BT.2020 YUV to RGB reference (Y is offset by 16 and scaled by 1.164384)
+// R = (Y - 16) * 1.164384 - V * -1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 - U * -2.14177
+
+// Y contribution to R,G,B: scale (YG) and bias (YGB), in units of 1/64.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+// TODO(fbarchard): Improve accuracy; the B channel is off by 7% (UB is limited to -128, i.e. -2.0 * 64).
+// U and V contributions to R,G,B, scaled by 64.
+#define UB -128 /* max(-128, round(-2.142 * 64)) */
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR -107 /* round(-1.67867 * 64) */
+
+// Per-channel bias: folds in the Y bias (YGB) and the subtraction of 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+#if defined(__aarch64__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {  // AArch64 build: U and V coefficients interleaved, U first
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [1] back)
+ {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+ {UG, VG, UG, VG, UG, VG, UG, VG},  // G coefficients (presumably kUVToG)
+ {UG, VG, UG, VG, UG, VG, UG, VG},
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {  // AArch64 build: same layout as kYuv2020Constants with U/V and B/R swapped
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},  // negated R/B coefficients, V first
+ {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+ {VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {VG, UG, VG, UG, VG, UG, VG, UG},
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#elif defined(__arm__)
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {  // 32-bit ARM build: four U entries followed by four V entries
+ {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},  // B/R coefficients stored negated (presumably kUVToRB; YuvPixel negates [0] and [4] back)
+ {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients (presumably kUVToG; YuvPixel reads [0] and [4])
+ {BB, BG, BR, YGB, 0, 0, 0, 0},  // biases in B, G, R order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {  // 32-bit ARM build: same layout as kYuv2020Constants with U/V and B/R swapped
+ {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},  // negated R/B coefficients: four V entries, then four U entries
+ {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},  // G coefficients in the same V-then-U grouping
+ {BR, BG, BB, YGB, 0, 0, 0, 0},  // biases in R, G, B order, then the Y bias
+ {0x0101 * YG, YG, 0, 0}};  // Y scale; the C reference reads element [1]
+#else
+const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = {  // #else branch (non-ARM builds): one row per coefficient or bias
+ {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},  // U->B, interleaved with 0 for V (presumably kUVToB)
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},  // U->G and V->G pairs (presumably kUVToG)
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},  // V->R, interleaved with 0 for U (presumably kUVToR)
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = {  // #else branch (non-ARM builds): same layout as kYuv2020Constants with U/V and B/R swapped
+ {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},  // V->R in the slot that holds U->B in the Yuv table
+ {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},  // G coefficients, V first
+ {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},  // U->B in the slot that holds V->R in the Yuv table
+ {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},  // R bias (first bias row)
+ {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},  // G bias
+ {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},  // B bias (third bias row)
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG},  // Y scale
+ {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB,
+ YGB}};  // Y bias
+#endif
+
+#undef BB  // Drop the BT.2020 coefficient macros; the code below reads coefficients from YuvConstants tables instead.
+#undef BG
+#undef BR
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef YG
+
+// Declares the scalar coefficients shared by the C reference pixel functions
+// below (ub, ug, vg, vr, bb, bg, br, yg), read from the `yuvconstants`
+// pointer that must be in scope at the point of use. The YuvConstants layout
+// differs per architecture, so the lookup is written once here instead of
+// being repeated in every function. The ARM layouts are read with the B and
+// R coefficients negated.
+#if defined(__aarch64__)
+#define LOAD_YUV_CONSTANTS              \
+  int ub = -yuvconstants->kUVToRB[0];   \
+  int ug = yuvconstants->kUVToG[0];     \
+  int vg = yuvconstants->kUVToG[1];     \
+  int vr = -yuvconstants->kUVToRB[1];   \
+  int bb = yuvconstants->kUVBiasBGR[0]; \
+  int bg = yuvconstants->kUVBiasBGR[1]; \
+  int br = yuvconstants->kUVBiasBGR[2]; \
+  int yg = yuvconstants->kYToRgb[1]
+#elif defined(__arm__)
+#define LOAD_YUV_CONSTANTS              \
+  int ub = -yuvconstants->kUVToRB[0];   \
+  int ug = yuvconstants->kUVToG[0];     \
+  int vg = yuvconstants->kUVToG[4];     \
+  int vr = -yuvconstants->kUVToRB[4];   \
+  int bb = yuvconstants->kUVBiasBGR[0]; \
+  int bg = yuvconstants->kUVBiasBGR[1]; \
+  int br = yuvconstants->kUVBiasBGR[2]; \
+  int yg = yuvconstants->kYToRgb[1]
+#else
+#define LOAD_YUV_CONSTANTS            \
+  int ub = yuvconstants->kUVToB[0];   \
+  int ug = yuvconstants->kUVToG[0];   \
+  int vg = yuvconstants->kUVToG[1];   \
+  int vr = yuvconstants->kUVToR[1];   \
+  int bb = yuvconstants->kUVBiasB[0]; \
+  int bg = yuvconstants->kUVBiasG[0]; \
+  int br = yuvconstants->kUVBiasR[0]; \
+  int yg = yuvconstants->kYToRgb[0]
+#endif
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit YUV and leaves the result as 16 bit: B, G and R are returned
+// before the final ">> 6" and clamp, so callers choose the output precision.
+static __inline void YuvPixel8_16(uint8_t y,
+                                  uint8_t u,
+                                  uint8_t v,
+                                  int* b,
+                                  int* g,
+                                  int* r,
+                                  const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+
+  // y * 0x0101 replicates the 8 bit Y into 16 bits before scaling by yg.
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit YUV and writes 8 bit B, G and R: the fixed point result of
+// YuvPixel8_16 is shifted right by 6 and clamped, the same way YuvPixel10
+// wraps YuvPixel16.
+static __inline void YuvPixel(uint8_t y,
+                              uint8_t u,
+                              uint8_t v,
+                              uint8_t* b,
+                              uint8_t* g,
+                              uint8_t* r,
+                              const struct YuvConstants* yuvconstants) {
+  int b16;
+  int g16;
+  int r16;
+  YuvPixel8_16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  *b = Clamp(b16 >> 6);
+  *g = Clamp(g16 >> 6);
+  *r = Clamp(r16 >> 6);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 10 bit YUV and leaves result as 16 bit. U and V are reduced to 8 bits
+// (>> 2, then clamp255) so the 8 bit coefficients and biases apply unchanged.
+static __inline void YuvPixel16(int16_t y,
+                                int16_t u,
+                                int16_t v,
+                                int* b,
+                                int* g,
+                                int* r,
+                                const struct YuvConstants* yuvconstants) {
+  LOAD_YUV_CONSTANTS;
+
+  // y << 6 moves the 10 bit Y to the top of a 16 bit value before scaling.
+  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+#undef LOAD_YUV_CONSTANTS
+
+// C reference for the 10 bit YUV assembly.
+// Converts one 10 bit YUV sample to 8 bit B, G and R: YuvPixel16 produces the
+// fixed point values, which are shifted right by 6 and clamped here.
+static __inline void YuvPixel10(uint16_t y,
+                                uint16_t u,
+                                uint16_t v,
+                                uint8_t* b,
+                                uint8_t* g,
+                                uint8_t* r,
+                                const struct YuvConstants* yuvconstants) {
+  int bgr16[3];  // Fixed point B, G, R in that order.
+  YuvPixel16(y, u, v, &bgr16[0], &bgr16[1], &bgr16[2], yuvconstants);
+  *b = Clamp(bgr16[0] >> 6);
+  *g = Clamp(bgr16[1] >> 6);
+  *r = Clamp(bgr16[2] >> 6);
+}
+
+// C reference code that mimics the Y-only (grey) assembly.
+// Scales and biases one 8 bit Y sample, clamps it to 8 bits and stores the
+// same value in B, G and R.
+static __inline void YPixel(uint8_t y,
+                            uint8_t* b,
+                            uint8_t* g,
+                            uint8_t* r,
+                            const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+  const int bias = yuvconstants->kUVBiasBGR[3];
+  const int scale = yuvconstants->kYToRgb[1];
+#else
+  const int bias = yuvconstants->kYBiasToRgb[0];
+  const int scale = yuvconstants->kYToRgb[0];
+#endif
+  // y * 0x0101 replicates the 8 bit Y into 16 bits before scaling.
+  const uint32_t luma = (uint32_t)(y * 0x0101 * scale) >> 16;
+  const uint8_t grey = Clamp(((int32_t)(luma) + bias) >> 6);
+  *b = grey;
+  *g = grey;
+  *r = grey;
+}
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// NEON builds: mimic the assembly by averaging each horizontal pair of U and V
+// samples (rounding up) and using that average for both pixels of the pair.
+// Output bytes are B, G, R and a constant alpha of 255.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u_avg = (src_u[0] + src_u[1] + 1) >> 1;
+    const uint8_t v_avg = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u_avg, v_avg, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u_avg, v_avg, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel uses its own U and V unaveraged.
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+#else
+// Other builds: every pixel has its own U and V sample.
+// Output bytes are B, G, R and a constant alpha of 255.
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining = width;
+  while (remaining-- > 0) {
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    ++src_y;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 4;  // One ARGB pixel.
+  }
+}
+#endif
+
+// Converts a row of I422 to ARGB; also used for 420.
+// Each U,V sample is shared by two horizontally adjacent Y samples. Output
+// bytes are B, G, R and a constant alpha of 255.
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = *src_u++;
+    const uint8_t v = *src_v++;
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// 10 bit YUV to 8 bit ARGB.
+// Each U,V sample is shared by two horizontally adjacent Y samples. Output
+// bytes are B, G, R and a constant alpha of 255.
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint16_t u = *src_u++;
+    const uint16_t v = *src_v++;
+    YuvPixel10(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+               yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel10(src_y[1], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+               yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel10(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+               yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Packs fixed point B, G and R into one 32 bit AR30 word at rgb_buf.
+// Each channel is shifted right by 4 (10.6 fixed point to 10 bit) and clamped
+// with Clamp10; B lands in bits 0-9, G in bits 10-19, R in bits 20-29 and the
+// top two bits are set.
+static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
+  const int b10 = Clamp10(b >> 4);
+  const int g10 = Clamp10(g >> 4);
+  const int r10 = Clamp10(r >> 4);
+  const uint32_t packed =
+      0xc0000000 | ((uint32_t)r10 << 20) | ((uint32_t)g10 << 10) | b10;
+  (*(uint32_t*)rgb_buf) = packed;
+}
+
+// 10 bit YUV to 10 bit AR30.
+// Each U,V sample is shared by two horizontally adjacent Y samples; every
+// pixel is written as one 4 byte AR30 word by StoreAR30.
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int blue;
+  int green;
+  int red;
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint16_t u = *src_u++;
+    const uint16_t v = *src_v++;
+    YuvPixel16(src_y[0], u, v, &blue, &green, &red, yuvconstants);
+    StoreAR30(&rgb_buf[0], blue, green, red);
+    YuvPixel16(src_y[1], u, v, &blue, &green, &red, yuvconstants);
+    StoreAR30(&rgb_buf[4], blue, green, red);
+    src_y += 2;
+    rgb_buf += 8;  // Two AR30 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
+    StoreAR30(rgb_buf, blue, green, red);
+  }
+}
+
+// 8 bit YUV to 10 bit AR30.
+// Follows the same path as the 10 bit converter: YuvPixel8_16 returns the
+// unclamped fixed point result, which StoreAR30 reduces to 10 bits per
+// channel. Each U,V sample is shared by two adjacent Y samples.
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int blue;
+  int green;
+  int red;
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = *src_u++;
+    const uint8_t v = *src_v++;
+    YuvPixel8_16(src_y[0], u, v, &blue, &green, &red, yuvconstants);
+    StoreAR30(&rgb_buf[0], blue, green, red);
+    YuvPixel8_16(src_y[1], u, v, &blue, &green, &red, yuvconstants);
+    StoreAR30(&rgb_buf[4], blue, green, red);
+    src_y += 2;
+    rgb_buf += 8;  // Two AR30 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel8_16(*src_y, *src_u, *src_v, &blue, &green, &red, yuvconstants);
+    StoreAR30(rgb_buf, blue, green, red);
+  }
+}
+
+// Converts a row of I422 plus a separate alpha plane to ARGB.
+// Each U,V sample is shared by two adjacent Y samples; alpha is copied from
+// src_a, one value per pixel. Output bytes are B, G, R, A.
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = *src_u++;
+    const uint8_t v = *src_v++;
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = src_a[0];
+    YuvPixel(src_y[1], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = src_a[1];
+    src_y += 2;
+    src_a += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = *src_a;
+  }
+}
+
+// Converts a row of I422 to RGB24 (3 bytes per pixel, written as B, G, R).
+// Each U,V sample is shared by two horizontally adjacent Y samples.
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = *src_u++;
+    const uint8_t v = *src_v++;
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    YuvPixel(src_y[1], u, v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5],
+             yuvconstants);
+    src_y += 2;
+    rgb_buf += 6;  // Two RGB24 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+  }
+}
+
+// Converts a row of I422 to ARGB4444 (16 bits per pixel).
+// Each channel keeps its top 4 bits: B in bits 0-3, G in bits 4-7, R in bits
+// 8-11, and the 4 alpha bits are all set. Two pixels are stored per 32 bit
+// write; an odd trailing pixel is stored with a 16 bit write.
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
+  uint32_t px0;
+  uint32_t px1;
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    px0 = (b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) | 0xf000;
+    px1 = (b1 >> 4) | ((g1 >> 4) << 4) | ((r1 >> 4) << 8) | 0xf000;
+    *(uint32_t*)(dst_argb4444) = px0 | (px1 << 16);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    dst_argb4444 += 4;  // Two 16 bit pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    px0 = (b0 >> 4) | ((g0 >> 4) << 4) | ((r0 >> 4) << 8) | 0xf000;
+    *(uint16_t*)(dst_argb4444) = px0;
+  }
+}
+
+// Converts a row of I422 to ARGB1555 (16 bits per pixel).
+// Each channel keeps its top 5 bits: B in bits 0-4, G in bits 5-9, R in bits
+// 10-14, and the alpha bit (bit 15) is set. Two pixels are stored per 32 bit
+// write; an odd trailing pixel is stored with a 16 bit write.
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
+  uint32_t px0;
+  uint32_t px1;
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    px0 = (b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) | 0x8000;
+    px1 = (b1 >> 3) | ((g1 >> 3) << 5) | ((r1 >> 3) << 10) | 0x8000;
+    *(uint32_t*)(dst_argb1555) = px0 | (px1 << 16);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    dst_argb1555 += 4;  // Two 16 bit pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    px0 = (b0 >> 3) | ((g0 >> 3) << 5) | ((r0 >> 3) << 10) | 0x8000;
+    *(uint16_t*)(dst_argb1555) = px0;
+  }
+}
+
+// Converts a row of I422 to RGB565 (16 bits per pixel).
+// B keeps its top 5 bits (bits 0-4), G its top 6 bits (bits 5-10) and R its
+// top 5 bits (bits 11-15). Two pixels are stored per 32 bit write; an odd
+// trailing pixel is stored with a 16 bit write.
+//
+// Packing is done in uint32_t: the second pixel's red field reaches bit 31,
+// and shifting a promoted (signed) int that far is undefined behaviour in C.
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
+  uint32_t px0;
+  uint32_t px1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
+    px0 = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
+          ((uint32_t)(r0 >> 3) << 11);
+    px1 = (uint32_t)(b1 >> 3) | ((uint32_t)(g1 >> 2) << 5) |
+          ((uint32_t)(r1 >> 3) << 11);
+    *(uint32_t*)(dst_rgb565) = px0 | (px1 << 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
+    px0 = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
+          ((uint32_t)(r0 >> 3) << 11);
+    *(uint16_t*)(dst_rgb565) = (uint16_t)px0;
+  }
+}
+
+// Converts a row of NV12 (Y plane plus interleaved U,V pairs) to ARGB.
+// Each U,V pair is shared by two horizontally adjacent Y samples. Output
+// bytes are B, G, R and a constant alpha of 255.
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_uv[0];
+    const uint8_t v = src_uv[1];
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_uv += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of NV21 (Y plane plus interleaved V,U pairs) to ARGB.
+// Same as the NV12 converter except that V is the first byte of each chroma
+// pair. Output bytes are B, G, R and a constant alpha of 255.
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_vu[1];
+    const uint8_t v = src_vu[0];
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of NV12 (Y plane plus interleaved U,V pairs) to RGB24
+// (3 bytes per pixel, written as B, G, R). Each U,V pair is shared by two
+// horizontally adjacent Y samples.
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_uv[0];
+    const uint8_t v = src_uv[1];
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    YuvPixel(src_y[1], u, v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5],
+             yuvconstants);
+    src_y += 2;
+    src_uv += 2;
+    rgb_buf += 6;  // Two RGB24 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+  }
+}
+
+// Converts a row of NV21 (Y plane plus interleaved V,U pairs) to RGB24
+// (3 bytes per pixel, written as B, G, R). Same as the NV12 converter except
+// that V is the first byte of each chroma pair.
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_vu[1];
+    const uint8_t v = src_vu[0];
+    YuvPixel(src_y[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    YuvPixel(src_y[1], u, v, &rgb_buf[3], &rgb_buf[4], &rgb_buf[5],
+             yuvconstants);
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 6;  // Two RGB24 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+  }
+}
+
+// Converts a row of NV12 (Y plane plus interleaved U,V pairs) to RGB565.
+// B keeps its top 5 bits (bits 0-4), G its top 6 bits (bits 5-10) and R its
+// top 5 bits (bits 11-15). Two pixels are stored per 32 bit write; an odd
+// trailing pixel is stored with a 16 bit write.
+//
+// Packing is done in uint32_t: the second pixel's red field reaches bit 31,
+// and shifting a promoted (signed) int that far is undefined behaviour in C.
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
+  uint32_t px0;
+  uint32_t px1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
+    px0 = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
+          ((uint32_t)(r0 >> 3) << 11);
+    px1 = (uint32_t)(b1 >> 3) | ((uint32_t)(g1 >> 2) << 5) |
+          ((uint32_t)(r1 >> 3) << 11);
+    *(uint32_t*)(dst_rgb565) = px0 | (px1 << 16);
+    src_y += 2;
+    src_uv += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
+    px0 = (uint32_t)(b0 >> 3) | ((uint32_t)(g0 >> 2) << 5) |
+          ((uint32_t)(r0 >> 3) << 11);
+    *(uint16_t*)(dst_rgb565) = (uint16_t)px0;
+  }
+}
+
+// Converts a row of packed YUY2 (bytes Y0, U, Y1, V per pixel pair) to ARGB.
+// Output bytes are B, G, R and a constant alpha of 255. The odd trailing
+// pixel still reads U and V from bytes 1 and 3 of its 4 byte group.
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_yuy2[1];
+    const uint8_t v = src_yuy2[3];
+    YuvPixel(src_yuy2[0], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_yuy2[2], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_yuy2 += 4;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of packed UYVY (bytes U, Y0, V, Y1 per pixel pair) to ARGB.
+// Output bytes are B, G, R and a constant alpha of 255. The odd trailing
+// pixel reads U, Y and V from bytes 0, 1 and 2 of its group.
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = src_uyvy[0];
+    const uint8_t v = src_uyvy[2];
+    YuvPixel(src_uyvy[1], u, v, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2],
+             yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel(src_uyvy[3], u, v, &rgb_buf[4], &rgb_buf[5], &rgb_buf[6],
+             yuvconstants);
+    rgb_buf[7] = 255;
+    src_uyvy += 4;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], &rgb_buf[0], &rgb_buf[1],
+             &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of I422 to RGBA: alpha (255) is the first byte of each
+// pixel, followed by B, G and R. Each U,V sample is shared by two
+// horizontally adjacent Y samples.
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    const uint8_t u = *src_u++;
+    const uint8_t v = *src_v++;
+    YuvPixel(src_y[0], u, v, &rgb_buf[1], &rgb_buf[2], &rgb_buf[3],
+             yuvconstants);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], u, v, &rgb_buf[5], &rgb_buf[6], &rgb_buf[7],
+             yuvconstants);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Two RGBA pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(*src_y, *src_u, *src_v, &rgb_buf[1], &rgb_buf[2], &rgb_buf[3],
+             yuvconstants);
+    rgb_buf[0] = 255;
+  }
+}
+
+// Converts a row of Y-only (I400) samples to grey ARGB: YPixel writes the
+// same value to B, G and R, and alpha is set to 255.
+void I400ToARGBRow_C(const uint8_t* src_y,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    YPixel(src_y[0], &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+    YPixel(src_y[1], &rgb_buf[4], &rgb_buf[5], &rgb_buf[6], yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YPixel(*src_y, &rgb_buf[0], &rgb_buf[1], &rgb_buf[2], yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Copies `width` bytes from src to dst in reverse order.
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  const uint8_t* rd = src + width - 1;  // Last source byte.
+  uint8_t* wr = dst;
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    wr[0] = rd[0];
+    wr[1] = rd[-1];
+    wr += 2;
+    rd -= 2;
+  }
+  if (width & 1) {
+    // Odd width: rd has walked back to the first source byte.
+    dst[width - 1] = rd[0];
+  }
+}
+
+// Copies `width` interleaved U,V byte pairs from src_uv to dst_uv with the
+// pair order reversed; the U-then-V order inside each pair is kept.
+//
+// A width of zero or less is a no-op. The source is walked back from one past
+// the last pair, so no negative value is shifted and no pointer before the
+// start of the buffer is formed.
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  int x;
+  if (width <= 0) {
+    return;
+  }
+  src_uv += width * 2;  // One past the last pair.
+  for (x = 0; x < width; ++x) {
+    src_uv -= 2;
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    dst_uv += 2;
+  }
+}
+
+// Reverses a row of `width` interleaved U,V byte pairs and splits it into
+// separate planes: dst_u receives the U bytes and dst_v the V bytes, both in
+// reverse pair order.
+//
+// A width of zero or less is a no-op. The source is walked back from one past
+// the last pair, so no negative value is shifted and no pointer before the
+// start of the buffer is formed.
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  int x;
+  if (width <= 0) {
+    return;
+  }
+  src_uv += width * 2;  // One past the last pair.
+  for (x = 0; x < width; ++x) {
+    src_uv -= 2;
+    dst_u[x] = src_uv[0];
+    dst_v[x] = src_uv[1];
+  }
+}
+
+// Copies `width` 4 byte pixels from src to dst in reverse pixel order. Pixels
+// are moved as whole 32 bit words, so the byte order inside each pixel is
+// unchanged.
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  const uint32_t* rd = (const uint32_t*)(src) + width - 1;  // Last pixel.
+  uint32_t* wr = (uint32_t*)(dst);
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    wr[0] = rd[0];
+    wr[1] = rd[-1];
+    wr += 2;
+    rd -= 2;
+  }
+  if (width & 1) {
+    // Odd width: rd has walked back to the first source pixel.
+    ((uint32_t*)(dst))[width - 1] = rd[0];
+  }
+}
+
+// Copies `width` 3 byte pixels from src_rgb24 to dst_rgb24 in reverse pixel
+// order; the byte order inside each pixel is unchanged.
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {
+  const uint8_t* rd = src_rgb24 + width * 3 - 3;  // Last source pixel.
+  int remaining = width;
+  while (remaining-- > 0) {
+    // Read the whole pixel before writing any of it.
+    const uint8_t c0 = rd[0];
+    const uint8_t c1 = rd[1];
+    const uint8_t c2 = rd[2];
+    dst_rgb24[0] = c0;
+    dst_rgb24[1] = c1;
+    dst_rgb24[2] = c2;
+    rd -= 3;
+    dst_rgb24 += 3;
+  }
+}
+
+// De-interleaves a row of UV pairs: the first byte of each pair goes to dst_u
+// and the second to dst_v. width is the number of pairs.
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width) {
+  int i;
+  // Two pairs per pass; an odd trailing pair is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    const uint8_t* in = src_uv + i * 2;
+    dst_u[i] = in[0];
+    dst_u[i + 1] = in[2];
+    dst_v[i] = in[1];
+    dst_v[i + 1] = in[3];
+  }
+  if (width & 1) {
+    // i is the index of the first pair the loop did not consume.
+    dst_u[width - 1] = src_uv[i * 2 + 0];
+    dst_v[width - 1] = src_uv[i * 2 + 1];
+  }
+}
+
+// Interleaves a row of U bytes and a row of V bytes into UV pairs.
+// width is the number of pairs written.
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
+                  int width) {
+  int i;
+  // Two pairs per pass; an odd trailing pair is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    uint8_t* out = dst_uv + i * 2;
+    out[0] = src_u[i];
+    out[1] = src_v[i];
+    out[2] = src_u[i + 1];
+    out[3] = src_v[i + 1];
+  }
+  if (width & 1) {
+    // i is the index of the first pair the loop did not produce.
+    dst_uv[i * 2 + 0] = src_u[width - 1];
+    dst_uv[i * 2 + 1] = src_v[width - 1];
+  }
+}
+
+// De-interleaves a row of 3-byte pixels into three planes: byte 0 of each
+// pixel goes to dst_r, byte 1 to dst_g and byte 2 to dst_b.
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8_t* px = src_rgb + i * 3;
+    dst_r[i] = px[0];
+    dst_g[i] = px[1];
+    dst_b[i] = px[2];
+  }
+}
+
+// Interleaves three planes into a row of 3-byte pixels: src_r supplies byte 0
+// of each pixel, src_g byte 1 and src_b byte 2.
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint8_t* px = dst_rgb + i * 3;
+    px[0] = src_r[i];
+    px[1] = src_g[i];
+    px[2] = src_b[i];
+  }
+}
+
+// Interleaves a row of 16-bit U and V samples into UV pairs, multiplying each
+// sample by scale. Use scale to convert lsb formats to msb, depending how many
+// bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale,
+                     int width) {
+  int i;
+  // Two pairs per pass; an odd trailing pair is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    uint16_t* out = dst_uv + i * 2;
+    out[0] = src_u[i] * scale;
+    out[1] = src_v[i] * scale;
+    out[2] = src_u[i + 1] * scale;
+    out[3] = src_v[i + 1] * scale;
+  }
+  if (width & 1) {
+    // i is the index of the first pair the loop did not produce.
+    dst_uv[i * 2 + 0] = src_u[width - 1] * scale;
+    dst_uv[i * 2 + 1] = src_v[width - 1] * scale;
+  }
+}
+
+// Multiplies every 16-bit sample of a row by scale. The product is stored
+// back into 16 bits.
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width) {
+  const uint16_t* in = src_y;
+  uint16_t* out = dst_y;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    *out = *in * scale;
+    ++in;
+    ++out;
+  }
+}
+
+// Converts a row of 16-bit samples to 8 bits: each sample is multiplied by
+// scale, shifted right by 16 and passed through clamp255(). Pick scale from
+// how many bits the source samples use:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width) {
+  const uint16_t* in = src_y;
+  uint8_t* out = dst_y;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    *out = clamp255((*in * scale) >> 16);
+    ++in;
+    ++out;
+  }
+}
+
+// Converts a row of 8-bit samples to 16-bit storage. Each byte is replicated
+// into both halves of a 16-bit value (multiply by 0x0101), multiplied by
+// scale and shifted right by 16. Pick scale from the destination bit depth:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width) {
+  const int replicated_scale = scale * 0x0101;  // replicates the byte.
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_y[i] = (src_y[i] * replicated_scale) >> 16;
+  }
+}
+
+// Copies count bytes from src to dst with memcpy().
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
+  const size_t num_bytes = count;
+  memcpy(dst, src, num_bytes);
+}
+
+// Copies count 16-bit samples from src to dst with memcpy().
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
+  // Compute the byte count in size_t; "count * 2" in int can overflow for
+  // very large rows before it is converted to memcpy's size_t argument.
+  memcpy(dst, src, (size_t)count * sizeof(*dst));
+}
+
+// Fills width bytes at dst with the value v8 using memset().
+void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
+  const size_t num_bytes = width;
+  memset(dst, v8, num_bytes);
+}
+
+// Fills a row with width copies of the 32-bit value v32. Each copy is stored
+// with memcpy(), so the bytes land in the host's in-memory order of v32.
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
+  uint8_t* out = dst_argb;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    memcpy(out, &v32, sizeof v32);
+    out += sizeof v32;
+  }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+// For every 2 pixels (4 source bytes) the U byte (offset 1) and the V byte
+// (offset 3) are each averaged, with rounding, with the same byte of the row
+// src_stride_yuy2 bytes further on.
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    const uint8_t* row0 = src_yuy2 + x * 2;
+    const uint8_t* row1 = row0 + src_stride_yuy2;
+    const int out = x >> 1;
+    dst_u[out] = (row0[1] + row1[1] + 1) >> 1;
+    dst_v[out] = (row0[3] + row1[3] + 1) >> 1;
+  }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+// For every 2 pixels (4 source bytes) byte 1 goes to dst_u and byte 3 to
+// dst_v.
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    const uint8_t* in = src_yuy2 + x * 2;
+    const int out = x >> 1;
+    dst_u[out] = in[1];
+    dst_v[out] = in[3];
+  }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+// The Y bytes sit at the even offsets (0 and 2) of every 4-byte group.
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  int x;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (x = 0; x < width - 1; x += 2) {
+    const uint8_t* in = src_yuy2 + x * 2;
+    dst_y[x] = in[0];
+    dst_y[x + 1] = in[2];
+  }
+  if (width & 1) {
+    // x is the index of the first pixel the loop did not consume.
+    dst_y[width - 1] = src_yuy2[x * 2];
+  }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+// For every 2 pixels (4 source bytes) the U byte (offset 0) and the V byte
+// (offset 2) are each averaged, with rounding, with the same byte of the row
+// src_stride_uyvy bytes further on.
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    const uint8_t* row0 = src_uyvy + x * 2;
+    const uint8_t* row1 = row0 + src_stride_uyvy;
+    const int out = x >> 1;
+    dst_u[out] = (row0[0] + row1[0] + 1) >> 1;
+    dst_v[out] = (row0[2] + row1[2] + 1) >> 1;
+  }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+// For every 2 pixels (4 source bytes) byte 0 goes to dst_u and byte 2 to
+// dst_v.
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    const uint8_t* in = src_uyvy + x * 2;
+    const int out = x >> 1;
+    dst_u[out] = in[0];
+    dst_v[out] = in[2];
+  }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+// The Y bytes sit at the odd offsets (1 and 3) of every 4-byte group.
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  int x;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (x = 0; x < width - 1; x += 2) {
+    const uint8_t* in = src_uyvy + x * 2;
+    dst_y[x] = in[1];
+    dst_y[x + 1] = in[3];
+  }
+  if (width & 1) {
+    // x is the index of the first pixel the loop did not consume.
+    dst_y[width - 1] = src_uyvy[x * 2 + 1];
+  }
+}
+
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
+
+// Blends one 4-byte pixel: each of the first three bytes of fg is combined
+// with the matching byte of bg using the fourth byte of fg as alpha, and the
+// fourth output byte is set to 255. All inputs are loaded before dst is
+// written, so dst may be the same pixel as fg or bg.
+static void ARGBBlendPixel_C(const uint8_t* fg,
+                             const uint8_t* bg,
+                             uint8_t* dst) {
+  const uint32_t fb = fg[0];
+  const uint32_t fg_g = fg[1];
+  const uint32_t fr = fg[2];
+  const uint32_t a = fg[3];
+  const uint32_t bb = bg[0];
+  const uint32_t bg_g = bg[1];
+  const uint32_t br = bg[2];
+  dst[0] = BLEND(fb, bb, a);
+  dst[1] = BLEND(fg_g, bg_g, a);
+  dst[2] = BLEND(fr, br, a);
+  dst[3] = 255u;
+}
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
+  int x;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (x = 0; x < width - 1; x += 2) {
+    ARGBBlendPixel_C(src_argb0, src_argb1, dst_argb);
+    ARGBBlendPixel_C(src_argb0 + 4, src_argb1 + 4, dst_argb + 4);
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    ARGBBlendPixel_C(src_argb0, src_argb1, dst_argb);
+  }
+}
+#undef BLEND
+
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+// Blends two rows of bytes with a per-byte alpha row:
+// dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8.
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width) {
+  int i;
+  // Two bytes per pass; an odd trailing byte is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    dst[i] = UBLEND(src0[i], src1[i], alpha[i]);
+    dst[i + 1] = UBLEND(src0[i + 1], src1[i + 1], alpha[i + 1]);
+  }
+  if (width & 1) {
+    // i is the index of the first byte the loop did not produce.
+    dst[i] = UBLEND(src0[i], src1[i], alpha[i]);
+  }
+}
+#undef UBLEND
+
+#if defined(__aarch64__) || defined(__arm__)
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#else
+// This code mimics the SSSE3 version for better testability.
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#endif
+
+// Attenuates one 4-byte pixel: the first three bytes are scaled by the fourth
+// (alpha) byte with ATTENUATE and the alpha byte is copied through. The whole
+// source pixel is loaded before dst is written.
+static void ARGBAttenuatePixel_C(const uint8_t* src, uint8_t* dst) {
+  const uint32_t b = src[0];
+  const uint32_t g = src[1];
+  const uint32_t r = src[2];
+  const uint32_t a = src[3];
+  dst[0] = ATTENUATE(b, a);
+  dst[1] = ATTENUATE(g, a);
+  dst[2] = ATTENUATE(r, a);
+  dst[3] = a;
+}
+
+// Multiply source RGB by alpha and store to destination.
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int i;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    ARGBAttenuatePixel_C(src_argb, dst_argb);
+    ARGBAttenuatePixel_C(src_argb + 4, dst_argb + 4);
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    ARGBAttenuatePixel_C(src_argb, dst_argb);
+  }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// Reciprocal method is off by 1 on some values. ie 125
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+// T(a) builds one table entry: 0x0100 in the upper 16 bits and 0x10000 / a
+// in the lower 16 bits.
+#define T(a) 0x01000000 + (0x10000 / a)
+// Indexed by alpha (0..255). Entries 0, 1 and 255 are written out by hand
+// instead of using T():
+//   [0]   lower half is 0 (T(0) would divide by zero),
+//   [1]   lower half is 0xffff (0x10000 / 1 does not fit in 16 bits),
+//   [255] lower half is 0x0100 (T(0xff) would give 0x0101).
+const uint32_t fixed_invtbl8[256] = {
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06),
+ T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d),
+ T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14),
+ T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b),
+ T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22),
+ T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29),
+ T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30),
+ T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e),
+ T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45),
+ T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c),
+ T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53),
+ T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a),
+ T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61),
+ T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68),
+ T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76),
+ T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d),
+ T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84),
+ T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b),
+ T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92),
+ T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99),
+ T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0),
+ T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae),
+ T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5),
+ T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc),
+ T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3),
+ T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca),
+ T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1),
+ T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8),
+ T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6),
+ T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed),
+ T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4),
+ T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb),
+ T(0xfc), T(0xfd), T(0xfe), 0x01000100};
+#undef T
+
+// Undoes alpha premultiplication for a row of 4-byte pixels. The low 16 bits
+// of fixed_invtbl8[alpha] are used as an 8.8 fixed point reciprocal; each of
+// the first three bytes is multiplied by it, shifted right by 8 and clamped.
+// The alpha byte is copied through.
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8_t* in = src_argb + i * 4;
+    uint8_t* out = dst_argb + i * 4;
+    const uint32_t alpha = in[3];
+    const uint32_t inv_alpha = fixed_invtbl8[alpha] & 0xffff;  // 8.8 fixed point
+    // Scale all three channels before storing anything.
+    const uint32_t b = (in[0] * inv_alpha) >> 8;
+    const uint32_t g = (in[1] * inv_alpha) >> 8;
+    const uint32_t r = (in[2] * inv_alpha) >> 8;
+    // Clamping should not be necessary but is free in assembly.
+    out[0] = clamp255(b);
+    out[1] = clamp255(g);
+    out[2] = clamp255(r);
+    out[3] = alpha;
+  }
+}
+
+// Builds one row of a 4-channel cumulative sum. For each pixel and channel
+// the output is the running sum of that channel along this row plus the
+// value at the same position in previous_cumsum.
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width) {
+  int32_t running[4] = {0, 0, 0, 0};
+  int x;
+  for (x = 0; x < width; ++x) {
+    const int base = x * 4;
+    int c;
+    for (c = 0; c < 4; ++c) {
+      running[c] += row[base + c];
+    }
+    for (c = 0; c < 4; ++c) {
+      cumsum[base + c] = running[c] + previous_cumsum[base + c];
+    }
+  }
+}
+
+// Converts cumulative sums to per-channel averages. For each of count pixels
+// and each of 4 channels, the sum bl[w] + tl[0] - bl[0] - tl[w] is multiplied
+// by 1 / area and truncated to a byte. w is the offset, in int32_t elements,
+// between the two columns read from tl and bl.
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count) {
+  const float inv_area = 1.0f / area;
+  int i;
+  for (i = 0; i < count; ++i) {
+    const int32_t* top = tl + i * 4;
+    const int32_t* bottom = bl + i * 4;
+    uint8_t* out = dst + i * 4;
+    int c;
+    for (c = 0; c < 4; ++c) {
+      out[c] = (uint8_t)((bottom[w + c] + top[c] - bottom[c] - top[w + c]) *
+                         inv_area);
+    }
+  }
+}
+
+// Copy pixels from rotated source to destination row with a slope.
+// uv_dudv[0] and uv_dudv[1] are the source x and y of the first pixel;
+// uv_dudv[2] and uv_dudv[3] are added to them after every pixel. The
+// coordinates are converted to int with a cast (truncation) and the 4-byte
+// pixel at row y, column x of src_argb is copied to the next destination
+// pixel.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width) {
+  int i;
+  // Render a row of pixels from source into a buffer.
+  float uv[2];
+  uv[0] = uv_dudv[0];
+  uv[1] = uv_dudv[1];
+  for (i = 0; i < width; ++i) {
+    int x = (int)(uv[0]);
+    int y = (int)(uv[1]);
+    // Move the pixel with memcpy() through a local instead of casting the
+    // byte pointers to uint32_t*: the buffers are only known to be byte
+    // aligned, and the cast also breaks the strict aliasing rules.
+    uint32_t pixel;
+    memcpy(&pixel, src_argb + y * src_argb_stride + x * 4, sizeof pixel);
+    memcpy(dst_argb, &pixel, sizeof pixel);
+    dst_argb += 4;
+    uv[0] += uv_dudv[2];
+    uv[1] += uv_dudv[3];
+  }
+}
+
+// Blend 2 rows into 1.
+// Each output byte is the rounded average of src_uv[x] and the byte
+// src_uv_stride bytes after it.
+static void HalfRow_C(const uint8_t* src_uv,
+                      ptrdiff_t src_uv_stride,
+                      uint8_t* dst_uv,
+                      int width) {
+  const uint8_t* row0 = src_uv;
+  const uint8_t* row1 = src_uv + src_uv_stride;
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_uv[i] = (row0[i] + row1[i] + 1) >> 1;
+  }
+}
+
+// 16-bit version of the 2-rows-into-1 blend: each output sample is the
+// rounded average of src_uv[x] and the sample src_uv_stride elements after
+// it.
+static void HalfRow_16_C(const uint16_t* src_uv,
+                         ptrdiff_t src_uv_stride,
+                         uint16_t* dst_uv,
+                         int width) {
+  const uint16_t* row0 = src_uv;
+  const uint16_t* row1 = src_uv + src_uv_stride;
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_uv[i] = (row0[i] + row1[i] + 1) >> 1;
+  }
+}
+
+// C version 2x2 -> 2x1.
+// Blends the row at src_ptr with the row src_stride bytes after it:
+// dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
+// f == 0 is a plain copy and f == 128 uses HalfRow_C().
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width,
+                      int source_y_fraction) {
+  const int f1 = source_y_fraction;
+  const int f0 = 256 - f1;
+  const uint8_t* row0 = src_ptr;
+  const uint8_t* row1 = src_ptr + src_stride;
+  int i;
+  switch (f1) {
+    case 0:
+      memcpy(dst_ptr, src_ptr, width);
+      return;
+    case 128:
+      HalfRow_C(src_ptr, src_stride, dst_ptr, width);
+      return;
+    default:
+      break;
+  }
+  // Two bytes per pass; an odd trailing byte is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    dst_ptr[i] = (row0[i] * f0 + row1[i] * f1 + 128) >> 8;
+    dst_ptr[i + 1] = (row0[i + 1] * f0 + row1[i + 1] * f1 + 128) >> 8;
+  }
+  if (width & 1) {
+    // i is the index of the first byte the loop did not produce.
+    dst_ptr[i] = (row0[i] * f0 + row1[i] * f1 + 128) >> 8;
+  }
+}
+
+// 16-bit version of the 2x2 -> 2x1 row blend.
+// Blends the row at src_ptr with the row src_stride elements after it:
+// dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
+// f == 0 is a plain copy and f == 128 uses HalfRow_16_C().
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
+  int x;
+  if (source_y_fraction == 0) {
+    // Byte count is computed in size_t so it cannot overflow int.
+    memcpy(dst_ptr, src_ptr, (size_t)width * sizeof(*dst_ptr));
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
+    return;
+  }
+  // The + 128 rounds to nearest, matching the 8-bit InterpolateRow_C and the
+  // rounded average HalfRow_16_C produces for a fraction of 128.
+  for (x = 0; x < width - 1; x += 2) {
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+    dst_ptr[1] =
+        (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
+    src_ptr += 2;
+    src_ptr1 += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] =
+        (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+  }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+// Output byte k of every pixel is input byte shuffler[k] of the same pixel.
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width) {
+  const int pick0 = shuffler[0];
+  const int pick1 = shuffler[1];
+  const int pick2 = shuffler[2];
+  const int pick3 = shuffler[3];
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8_t* in = src_argb + i * 4;
+    uint8_t* out = dst_argb + i * 4;
+    // Load all four bytes before storing, to support in-place conversion.
+    const uint8_t c0 = in[pick0];
+    const uint8_t c1 = in[pick1];
+    const uint8_t c2 = in[pick2];
+    const uint8_t c3 = in[pick3];
+    out[0] = c0;
+    out[1] = c1;
+    out[2] = c2;
+    out[3] = c3;
+  }
+}
+
+// Packs planar Y, U and V rows into 4-byte groups laid out Y0 U Y1 V. One U
+// and one V sample are consumed for every two Y samples. For an odd width the
+// missing second Y of the last group is written as 0.
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
+  int x;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (x = 0; x < width - 1; x += 2) {
+    uint8_t* out = dst_frame + x * 2;
+    const int c = x >> 1;  // Chroma sample index.
+    out[0] = src_y[x];
+    out[1] = src_u[c];
+    out[2] = src_y[x + 1];
+    out[3] = src_v[c];
+  }
+  if (width & 1) {
+    // x is the index of the first pixel the loop did not consume.
+    uint8_t* out = dst_frame + x * 2;
+    const int c = x >> 1;
+    out[0] = src_y[x];
+    out[1] = src_u[c];
+    out[2] = 0;
+    out[3] = src_v[c];
+  }
+}
+
+// Packs planar Y, U and V rows into 4-byte groups laid out U Y0 V Y1. One U
+// and one V sample are consumed for every two Y samples. For an odd width the
+// missing second Y of the last group is written as 0.
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
+  int x;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (x = 0; x < width - 1; x += 2) {
+    uint8_t* out = dst_frame + x * 2;
+    const int c = x >> 1;  // Chroma sample index.
+    out[0] = src_u[c];
+    out[1] = src_y[x];
+    out[2] = src_v[c];
+    out[3] = src_y[x + 1];
+  }
+  if (width & 1) {
+    // x is the index of the first pixel the loop did not consume.
+    uint8_t* out = dst_frame + x * 2;
+    const int c = x >> 1;
+    out[0] = src_u[c];
+    out[1] = src_y[x];
+    out[2] = src_v[c];
+    out[3] = 0;
+  }
+}
+
+// Applies a cubic polynomial to every channel of a row of 4-byte pixels.
+// For channel c (0..3) with input value v the result is
+//   poly[c] + poly[c + 4] * v + poly[c + 8] * v^2 + poly[c + 12] * v^3,
+// converted to int32_t and passed through Clamp().
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const float* poly,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float result[4];
+    int c;
+    // Evaluate all four channels before storing any of them.
+    for (c = 0; c < 4; ++c) {
+      const float v = (float)(src_argb[c]);
+      const float v2 = v * v;
+      const float v3 = v2 * v;
+      float acc = poly[c] + poly[c + 4] * v;
+      acc += poly[c + 8] * v2;
+      acc += poly[c + 12] * v3;
+      result[c] = acc;
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = Clamp((int32_t)(result[c]));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Samples are assumed to be unsigned in the low 9, 10 or 12 bits. The scale
+// factor adjusts the source integer range to the desired half float range.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+// Work around GCC 7 punning warning -Wstrict-aliasing
+// uint32_alias_t is a uint32_t that, when __GNUC__ is defined, carries the
+// __may_alias__ attribute; otherwise it is a plain uint32_t.
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+// Converts a row of 16-bit integer samples to half-float bit patterns.
+// Each sample is multiplied by scale * 2^-112 as a float; the float's bits
+// shifted right by 13 are then stored as the 16-bit result.
+void HalfFloatRow_C(const uint16_t* src,
+                    uint16_t* dst,
+                    float scale,
+                    int width) {
+  const float mult = 1.9259299444e-34f * scale;
+  int i;
+  for (i = 0; i < width; ++i) {
+    const float value = src[i] * mult;
+    uint32_t bits;
+    memcpy(&bits, &value, sizeof bits);  // Read the float's raw bits.
+    dst[i] = (uint16_t)(bits >> 13);
+  }
+}
+
+// Converts a row of bytes to floats, multiplying each one by scale.
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  const uint8_t* in = src;
+  float* out = dst;
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    const float value = *in * scale;
+    *out = value;
+    ++in;
+    ++out;
+  }
+}
+
+// Remaps the first three bytes of every 4-byte pixel through a table selected
+// by a weighted sum of those bytes. The weights are the low three bytes of
+// lumacoeff; the sum masked with 0x7F00 is the byte offset of a 256-entry
+// sub-table inside luma. The fourth byte is copied through.
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff) {
+  const uint32_t bc = lumacoeff & 0xff;
+  const uint32_t gc = (lumacoeff >> 8) & 0xff;
+  const uint32_t rc = (lumacoeff >> 16) & 0xff;
+
+  int i;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    int p;
+    for (p = 0; p < 2; ++p) {
+      // Luminance in rows, color values in columns.
+      const uint8_t* table =
+          ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
+           0x7F00u) +
+          luma;
+      dst_argb[0] = table[src_argb[0]];
+      dst_argb[1] = table[src_argb[1]];
+      dst_argb[2] = table[src_argb[2]];
+      dst_argb[3] = src_argb[3];
+      src_argb += 4;
+      dst_argb += 4;
+    }
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8_t* table =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
+    dst_argb[0] = table[src_argb[0]];
+    dst_argb[1] = table[src_argb[1]];
+    dst_argb[2] = table[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+// Copies byte 3 of every 4-byte source pixel into byte 3 of the matching
+// destination pixel; the other destination bytes are left untouched.
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  int i;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    dst[i * 4 + 3] = src[i * 4 + 3];
+    dst[i * 4 + 7] = src[i * 4 + 7];
+  }
+  if (width & 1) {
+    // i is the index of the first pixel the loop did not handle.
+    dst[i * 4 + 3] = src[i * 4 + 3];
+  }
+}
+
+// Extracts byte 3 of every 4-byte source pixel into a packed row of bytes.
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
+  int i;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    dst_a[i] = src_argb[i * 4 + 3];
+    dst_a[i + 1] = src_argb[i * 4 + 7];
+  }
+  if (width & 1) {
+    // i is the index of the first pixel the loop did not handle.
+    dst_a[i] = src_argb[i * 4 + 3];
+  }
+}
+
+// Writes a packed row of bytes into byte 3 of every 4-byte destination pixel;
+// the other destination bytes are left untouched.
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
+  int i;
+  // Two pixels per pass; an odd trailing pixel is handled after the loop.
+  for (i = 0; i < width - 1; i += 2) {
+    dst[i * 4 + 3] = src[i];
+    dst[i * 4 + 7] = src[i + 1];
+  }
+  if (width & 1) {
+    // i is the index of the first pixel the loop did not handle.
+    dst[i * 4 + 3] = src[i];
+  }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+    defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+// Step 1 is I422ToARGBRow_SSSE3() into a temporary row, step 2 is
+// ARGBToRGB565Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+// 2 step wrapper: I422ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToARGB1555Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+// 2 step wrapper: I422ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToARGB4444Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, chunk);
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, chunk);
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+// 2 step wrapper: NV12ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToRGB565Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, chunk);
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+// 2 step wrapper: NV12ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToRGB24Row_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, chunk);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+// 2 step wrapper: NV21ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToRGB24Row_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, chunk);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
+    src_y += chunk;
+    src_vu += chunk;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+// 2 step wrapper: NV12ToARGBRow_AVX2() into a temporary row, then
+// ARGBToRGB24Row_AVX2() when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
+// ARGBToRGB24Row_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, chunk);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
+#endif
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+// 2 step wrapper: NV21ToARGBRow_AVX2() into a temporary row, then
+// ARGBToRGB24Row_AVX2() when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
+// ARGBToRGB24Row_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, chunk);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
+#endif
+    src_y += chunk;
+    src_vu += chunk;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+// 2 step wrapper: I422ToARGBRow_AVX2() into a temporary row, then
+// ARGBToRGB565Row_AVX2() when HAS_ARGBTORGB565ROW_AVX2 is defined, otherwise
+// ARGBToRGB565Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, chunk);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+// 2 step wrapper: I422ToARGBRow_AVX2() into a temporary row, then
+// ARGBToARGB1555Row_AVX2() when HAS_ARGBTOARGB1555ROW_AVX2 is defined,
+// otherwise ARGBToARGB1555Row_SSE2(). At most MAXTWIDTH pixels are handled
+// per pass.
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+    ARGBToARGB1555Row_AVX2(row, dst_argb1555, chunk);
+#else
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_argb1555 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+// 2 step wrapper: I422ToARGBRow_AVX2() into a temporary row, then
+// ARGBToARGB4444Row_AVX2() when HAS_ARGBTOARGB4444ROW_AVX2 is defined,
+// otherwise ARGBToARGB4444Row_SSE2(). At most MAXTWIDTH pixels are handled
+// per pass.
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+    ARGBToARGB4444Row_AVX2(row, dst_argb4444, chunk);
+#else
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_argb4444 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+// 2 step wrapper: I422ToARGBRow_AVX2() into a temporary row, then
+// ARGBToRGB24Row_AVX2() when HAS_ARGBTORGB24ROW_AVX2 is defined, otherwise
+// ARGBToRGB24Row_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, chunk);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, chunk);
+#endif
+    src_y += chunk;
+    src_u += chunk / 2;  // U and V advance half as far as Y.
+    src_v += chunk / 2;
+    dst_rgb24 += chunk * 3;  // 3 output bytes per pixel.
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+// 2 step wrapper: NV12ToARGBRow_AVX2() into a temporary row, then
+// ARGBToRGB565Row_AVX2() when HAS_ARGBTORGB565ROW_AVX2 is defined, otherwise
+// ARGBToRGB565Row_SSE2(). At most MAXTWIDTH pixels are handled per pass.
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, chunk);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+    ARGBToRGB565Row_AVX2(row, dst_rgb565, chunk);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, chunk);
+#endif
+    src_y += chunk;
+    src_uv += chunk;
+    dst_rgb565 += chunk * 2;  // 2 output bytes per pixel.
+  }
+}
+#endif
+
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert a row of RGB24 pixels (3 bytes each) to YJ values (1 byte each).
+// 2 step wrapper: RGB24ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToYJRow_AVX2(). At most MAXTWIDTH pixels are handled per pass.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    RGB24ToARGBRow_SSSE3(src_rgb24, row, chunk);
+    ARGBToYJRow_AVX2(row, dst_yj, chunk);
+    src_rgb24 += chunk * 3;
+    dst_yj += chunk;
+  }
+}
+#endif  // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert a row of RAW pixels (3 bytes each) to YJ values (1 byte each).
+// 2 step wrapper: RAWToARGBRow_SSSE3() into a temporary row, then
+// ARGBToYJRow_AVX2(). At most MAXTWIDTH pixels are handled per pass.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    RAWToARGBRow_SSSE3(src_raw, row, chunk);
+    ARGBToYJRow_AVX2(row, dst_yj, chunk);
+    src_raw += chunk * 3;
+    dst_yj += chunk;
+  }
+}
+#endif  // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert a row of RGB24 pixels (3 bytes each) to YJ values (1 byte each).
+// 2 step wrapper: RGB24ToARGBRow_SSSE3() into a temporary row, then
+// ARGBToYJRow_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    RGB24ToARGBRow_SSSE3(src_rgb24, row, chunk);
+    ARGBToYJRow_SSSE3(row, dst_yj, chunk);
+    src_rgb24 += chunk * 3;
+    dst_yj += chunk;
+  }
+}
+#endif  // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert a row of RAW pixels (3 bytes each) to YJ values (1 byte each).
+// 2 step wrapper: RAWToARGBRow_SSSE3() into a temporary row, then
+// ARGBToYJRow_SSSE3(). At most MAXTWIDTH pixels are handled per pass.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  int remaining;
+  int chunk;
+  for (remaining = width; remaining > 0; remaining -= chunk) {
+    chunk = remaining < MAXTWIDTH ? remaining : MAXTWIDTH;
+    RAWToARGBRow_SSSE3(src_raw, row, chunk);
+    ARGBToYJRow_SSSE3(row, dst_yj, chunk);
+    src_raw += chunk * 3;
+    dst_yj += chunk;
+  }
+}
+#endif  // HAS_RAWTOYJROW_SSSE3
+
+// Writes src[i] * scale to dst[i] for i in [0, width) and returns the sum of
+// squares of the unscaled source samples.
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+  float sum_sq = 0.f;
+  int n = 0;
+  while (n < width) {
+    const float sample = src[n];
+    sum_sq += sample * sample;
+    dst[n] = sample * scale;
+    ++n;
+  }
+  return sum_sq;
+}
+
+// Writes src[i] * scale to dst[i] for i in [0, width) and returns the largest
+// unscaled source sample. The running maximum starts at 0, so the result is
+// never negative.
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+  float peak = 0.f;
+  int n;
+  for (n = 0; n < width; ++n) {
+    const float sample = src[n];
+    dst[n] = sample * scale;
+    if (sample > peak) {
+      peak = sample;
+    }
+  }
+  return peak;
+}
+
+// Writes src[i] * scale to dst[i] for i in [0, width).
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+  int n;
+  for (n = 0; n < width; ++n) {
+    dst[n] = src[n] * scale;
+  }
+}
+
+// Horizontal 5-tap filter with coefficients 1, 4, 6, 4, 1. Each output is the
+// weighted sum of 5 consecutive inputs plus 128, shifted right by 8.
+// Reads src[0 .. width + 3].
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t* tap = src + x;
+    const uint32_t sum =
+        tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4] + 128;
+    dst[x] = (uint16_t)(sum >> 8);
+  }
+}
+
+// Vertical 5-tap filter: combines 5 rows with coefficients 1, 4, 6, 4, 1 into
+// one row of unnormalized 32-bit sums.
+void GaussCol_C(const uint16_t* src0,
+                const uint16_t* src1,
+                const uint16_t* src2,
+                const uint16_t* src3,
+                const uint16_t* src4,
+                uint32_t* dst,
+                int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
+  }
+}
+
+// Horizontal 5-tap filter with coefficients 1, 4, 6, 4, 1 on floats. Each
+// output is the weighted sum of 5 consecutive inputs multiplied by 1/256.
+// Reads src[0 .. width + 3].
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const float* tap = src + x;
+    dst[x] = (tap[0] + tap[1] * 4 + tap[2] * 6 + tap[3] * 4 + tap[4]) *
+             (1.0f / 256.0f);
+  }
+}
+
+// Vertical 5-tap filter on floats: combines 5 rows with coefficients
+// 1, 4, 6, 4, 1 into one row of unnormalized sums.
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = src0[x] + src1[x] * 4 + src2[x] * 6 + src3[x] * 4 + src4[x];
+  }
+}
+
+// Convert biplanar NV21 to packed YUV24.
+// Each output pixel is 3 bytes in the order V, U, Y. One V/U sample from
+// src_vu is shared by each horizontal pair of pixels.
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    int p;
+    // Emit both pixels of the pair with the same V/U sample.
+    for (p = 0; p < 2; ++p) {
+      dst_yuv24[0] = src_vu[0];  // V
+      dst_yuv24[1] = src_vu[1];  // U
+      dst_yuv24[2] = src_y[p];   // Y0, then Y1
+      dst_yuv24 += 3;
+    }
+    src_y += 2;
+    src_vu += 2;
+  }
+  // Odd trailing pixel: it has its own V/U sample.
+  if (width & 1) {
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+  }
+}
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+// Each AYUV pixel is 4 bytes in memory order V, U, Y, A. The second row is
+// read at src_ayuv + src_stride_ayuv. Each output pair is U then V, averaged
+// with rounding over a 2x2 block of pixels. An odd trailing column is
+// averaged over its 2 rows only. Writes (width + 1) / 2 output pairs.
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_uv,
+                   int width) {
+  int x;
+  // Stop at width - 1 so that an odd trailing pixel is not treated as a full
+  // pair; that would read one pixel past the row and leave the tail below to
+  // read and write one more pair out of bounds.
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 5] + 2) >>
+                2;  // U
+    dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 4] + 2) >>
+                2;  // V
+    src_ayuv += 8;
+    dst_uv += 2;
+  }
+  if (width & 1) {
+    // Same U, V output order as the main loop (U is byte 1, V is byte 0).
+    dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;  // U
+    dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;  // V
+  }
+}
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+// Each AYUV pixel is 4 bytes in memory order V, U, Y, A. The second row is
+// read at src_ayuv + src_stride_ayuv. Each output pair is V then U, averaged
+// with rounding over a 2x2 block of pixels. An odd trailing column is
+// averaged over its 2 rows only. Writes (width + 1) / 2 output pairs.
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width) {
+  int x;
+  // Stop at width - 1 so that an odd trailing pixel is not treated as a full
+  // pair; that would read one pixel past the row and leave the tail below to
+  // read and write one more pair out of bounds.
+  for (x = 0; x < width - 1; x += 2) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+                 src_ayuv[src_stride_ayuv + 4] + 2) >>
+                2;  // V
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+                 src_ayuv[src_stride_ayuv + 5] + 2) >>
+                2;  // U
+    src_ayuv += 8;
+    dst_vu += 2;
+  }
+  if (width & 1) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;  // V
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;  // U
+  }
+}
+
+// Copy row of AYUV Y's into Y.
+// Each AYUV pixel is 4 bytes in memory order V, U, Y, A; byte 2 is copied.
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x, src_ayuv += 4) {
+    *dst_y++ = src_ayuv[2];  // v,u,y,a
+  }
+}
+
+// Convert UV plane of NV12 to VU of NV21.
+// Swaps the two bytes of each of the `width` UV pairs.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  int x;
+  for (x = 0; x < width; ++x, src_uv += 2, dst_vu += 2) {
+    // Both bytes are read before either is written.
+    const uint8_t first = src_uv[0];
+    const uint8_t second = src_uv[1];
+    dst_vu[0] = second;
+    dst_vu[1] = first;
+  }
+}
+
+// Downsample separate U and V planes by 2 in both directions and interleave
+// the result as U, V pairs. Each output is the rounded average of a 2x2
+// block; the second row is read at src_u + src_stride_u and
+// src_v + src_stride_v. An odd trailing column is averaged over its 2 rows.
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  int pairs;
+  for (pairs = width / 2; pairs > 0; --pairs) {
+    const int sum_u = src_u[0] + src_u[1] + src_u[src_stride_u] +
+                      src_u[src_stride_u + 1];
+    dst_uv[0] = (uint8_t)((sum_u + 2) >> 2);
+    const int sum_v = src_v[0] + src_v[1] + src_v[src_stride_v] +
+                      src_v[src_stride_v + 1];
+    dst_uv[1] = (uint8_t)((sum_v + 2) >> 2);
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+  if (width & 1) {
+    dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+    dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_gcc.cc b/third_party/aom/third_party/libyuv/source/row_gcc.cc
new file mode 100644
index 0000000000..a107c30e76
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_gcc.cc
@@ -0,0 +1,7175 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+// Each table below repeats one 4-byte weight pattern four times, i.e. one
+// weight per byte of four 4-byte pixels in a 16-byte vector.
+// NOTE(review): for ARGB the byte order is presumably B, G, R, A - confirm.
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
+
+// JPeg full range.
+// The three non-zero weights sum to 256.
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+// Same weights as kARGBToYJ with the zero weight on the first byte of each
+// pixel instead of the last.
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+// Signed per-byte chroma weights. As with the Y tables, each table repeats one
+// 4-byte pattern four times (one weight per byte of four 4-byte pixels), with
+// a zero weight on one byte of every pixel.
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0};
+
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+// Constants for BGRA
+// Same weight values as the ARGB tables, in reversed byte order.
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR
+// Same weight values as the ARGB tables with the first and third byte swapped.
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+// Same weight values as the ARGB tables, shifted so the zero weight is on the
+// first byte of each pixel.
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+// 0x7e80 in each of the eight 16-bit lanes.
+// NOTE(review): presumably the bias/rounding term added to the Y sum; the
+// users are outside this section - confirm.
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
+
+// 128 in each of the sixteen bytes.
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// 0x8080 in each 16-bit lane, i.e. 128 in every byte.
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+// Expands four 3-byte pixels (source bytes 0..11) to four 4-byte pixels. The
+// fourth byte of each output pixel is taken from source bytes 12..15; the
+// conversion routines overwrite it by OR-ing in 0xff.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+// Same as above, but the 3 bytes of each pixel are reversed.
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGBA.
+// The filler byte (from source bytes 12..15) comes first in each output pixel,
+// followed by the 3 reversed colour bytes.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+// In this and the following tables an index of 128 (bit 7 set) makes pshufb
+// write a zero byte.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+// Indices are relative to a load that starts 4 bytes into the source group.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+// Indices are relative to a load that starts 8 bytes into the source group.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+// Drops the 4th byte of each pixel: 12 packed bytes followed by 4 zero bytes.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+// As above, with the 3 kept bytes of each pixel reversed.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// The tables below are 32 bytes: the same 16-byte pattern in both halves.
+
+// YUY2 shuf 16 Y to 32 Y.
+// Selects the even source bytes, each one twice.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+// Selects the odd source bytes in pairs, each pair twice.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+// Selects the odd source bytes, each one twice.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+// Selects the even source bytes in pairs, each pair twice.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+// Swaps the two bytes of each pair and emits each swapped pair twice.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_J400TOARGBROW_SSE2
+// Expand 8-bit gray samples to 4-byte pixels: each source byte is replicated
+// into all 4 bytes of its output pixel and byte 3 is then forced to 0xff.
+// Each iteration reads 8 source bytes and stores 32 output bytes. The loop
+// test is at the bottom ("sub $0x8" / "jg"), so the body runs at least once
+// and always processes a full group of 8 pixels.
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones
+ "pslld $0x18,%%xmm5 \n" // xmm5 = 0xff000000 per dword
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // load 8 gray bytes
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // each byte duplicated (x2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n" // first 4 pixels, byte x4
+ "punpckhwd %%xmm1,%%xmm1 \n" // last 4 pixels, byte x4
+ "por %%xmm5,%%xmm0 \n" // set byte 3 to 0xff
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+// Expand 3-byte RGB24 pixels to 4-byte pixels, setting byte 3 of every output
+// pixel to 0xff. Each iteration reads 48 source bytes (16 pixels) and stores
+// 64 output bytes. palignr builds a 16-byte window that starts at each group
+// of 4 source pixels; pshufb with kShuffleMaskRGB24ToARGB then spreads the 12
+// pixel bytes over 16 bytes. The loop test is at the bottom, so the body runs
+// at least once and always processes a full group of 16 pixels.
+// The shuffle mask is loaded with movdqa, which requires it to be 16-byte
+// aligned.
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n" // source bytes 24..39 (pixels 8..11)
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // source bytes 12..27 (pixels 4..7)
+ "pshufb %%xmm4,%%xmm0 \n" // pixels 0..3
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n" // rotate: bytes 36..47 first (pixels 12..15)
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Expand 3-byte RAW pixels to 4-byte pixels with the 3 colour bytes reversed
+// (kShuffleMaskRAWToARGB) and byte 3 of every output pixel set to 0xff.
+// The instruction sequence matches RGB24ToARGBRow_SSSE3; only the shuffle
+// table differs. Each iteration reads 48 source bytes (16 pixels) and stores
+// 64 output bytes. The loop test is at the bottom, so the body runs at least
+// once and always processes a full group of 16 pixels.
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n" // source bytes 24..39 (pixels 8..11)
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // source bytes 12..27 (pixels 4..7)
+ "pshufb %%xmm4,%%xmm0 \n" // pixels 0..3
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n" // rotate: bytes 36..47 first (pixels 12..15)
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Same code as RAWToARGB with different shuffler and A in low bits
+// Here the constant OR-ed into every pixel is 0x000000ff (psrld instead of
+// pslld), so byte 0 of each output pixel becomes 0xff. Each iteration reads
+// 48 source bytes (16 pixels) and stores 64 output bytes. The loop test is at
+// the bottom, so the body runs at least once and always processes a full
+// group of 16 pixels.
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n" // source bytes 24..39 (pixels 8..11)
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // source bytes 12..27 (pixels 4..7)
+ "pshufb %%xmm4,%%xmm0 \n" // pixels 0..3
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n" // rotate: bytes 36..47 first (pixels 12..15)
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Reverse the byte order inside every 3-byte pixel (RAW -> RGB24).
+// Each iteration handles 8 pixels (24 bytes in, 24 bytes out) using three
+// overlapping 16-byte loads at source offsets 0, 4 and 8. Each load is
+// shuffled with its own table so that its low 8 bytes hold output bytes
+// 0..7, 8..15 and 16..23 respectively, and only those low 8 bytes are stored
+// (movq). The loop test is at the bottom, so the body runs at least once and
+// always processes a full group of 8 pixels.
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // source bytes 0..15
+ "movdqu 0x4(%0),%%xmm1 \n" // source bytes 4..19
+ "movdqu 0x8(%0),%%xmm2 \n" // source bytes 8..23
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Expand 16-bit 5:6:5 pixels to 4-byte pixels. For each source word the output
+// bytes are: byte 0 = bits 0..4, byte 1 = bits 5..10, byte 2 = bits 11..15
+// (each widened to 8 bits by replicating its top bits into the low bits) and
+// byte 3 = 0xff. Each iteration reads 16 source bytes (8 pixels) and stores
+// 32 output bytes.
+// The widening is done with pmulhuw: a 5-bit field in the top of a word times
+// 0x0108, or the 6-bit field at bits 5..10 times 0x2080, leaves the 8-bit
+// value in the low byte of the high half of the product.
+// dst is turned into (dst - 2 * src) up front so that the store address
+// (%1,%0,2) advances 32 bytes for every 16 bytes src advances.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 8 pixels.
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n" // xmm5 = 0x0108 per word (5-bit scale)
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n" // xmm6 = 0x2080 per word (6-bit scale)
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n" // xmm3 = 0xf800 per word
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n" // xmm4 = 0x07e0 per word
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n" // xmm7 = 0xff00 per word
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n" // bits 11..15
+ "psllw $0xb,%%xmm2 \n" // bits 0..4 moved to the top
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n" // word = bytes 2 and 0 of the output pixel
+ "pand %%xmm4,%%xmm0 \n" // bits 5..10
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n" // word = bytes 3 (0xff) and 1 of the pixel
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n" // interleave to 4-byte pixels
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+// Expand 16-bit 1:5:5:5 pixels to 4-byte pixels. For each source word the
+// output bytes are: byte 0 = bits 0..4, byte 1 = bits 5..9, byte 2 = bits
+// 10..14 (each widened to 8 bits via pmulhuw) and byte 3 = 0xff when bit 15 is
+// set, otherwise 0x00 (psraw replicates bit 15 across the high byte).
+// Each iteration reads 16 source bytes (8 pixels) and stores 32 output bytes.
+// dst is turned into (dst - 2 * src) up front so that the store address
+// (%1,%0,2) advances 32 bytes for every 16 bytes src advances.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 8 pixels.
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n" // xmm5 = 0x0108 per word
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n" // xmm6 = 0x4200 per word
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n" // xmm3 = 0xf800 per word
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n" // xmm4 = 0x03e0 per word
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n" // xmm7 = 0xff00 per word
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n" // bits 10..14 moved to 11..15
+ "psllw $0xb,%%xmm2 \n" // bits 0..4 moved to the top
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n" // word = bytes 2 and 0 of the output pixel
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n" // bits 5..9
+ "psraw $0x8,%%xmm2 \n" // bit 15 replicated over the high byte
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n" // word = bytes 3 and 1 of the output pixel
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n" // interleave to 4-byte pixels
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+// Expand 16-bit 4:4:4:4 pixels to 4-byte pixels. Every 4-bit nibble n becomes
+// the byte (n << 4) | n, and the nibbles of each source word are emitted from
+// least significant to most significant. Each iteration reads 16 source bytes
+// (8 pixels) and stores 32 output bytes.
+// dst is turned into (dst - 2 * src) up front so that the store address
+// (%1,%0,2) advances 32 bytes for every 16 bytes src advances.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 8 pixels.
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n" // xmm4 = 0x0f per byte
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n" // xmm5 = 0xf0 per byte
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n" // low nibble of every byte
+ "pand %%xmm5,%%xmm2 \n" // high nibble of every byte
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n" // low nibbles as (n << 4) | n
+ "por %%xmm3,%%xmm2 \n" // high nibbles as (n << 4) | n
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // interleave low/high nibble bytes
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Pack 4-byte pixels to 3-byte pixels by dropping byte 3 of each pixel.
+// Each iteration reads 64 source bytes (16 pixels) and stores 48 output bytes.
+// pshufb with kShuffleMaskARGBToRGB24 turns each 16-byte register into 12
+// packed bytes followed by 4 zero bytes; the byte shifts and ORs then join the
+// four 12-byte pieces into three full 16-byte stores.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 16 pixels.
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n" // 12 bytes of piece 0 + 4 of piece 1
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n" // 8 bytes of piece 1 + 8 of piece 2
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n" // 4 bytes of piece 2 + 12 of piece 3
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Pack 4-byte pixels to 3-byte pixels, dropping byte 3 of each pixel and
+// reversing the remaining 3 bytes (kShuffleMaskARGBToRAW). The instruction
+// sequence matches ARGBToRGB24Row_SSSE3; only the shuffle table differs.
+// Each iteration reads 64 source bytes (16 pixels) and stores 48 output bytes.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 16 pixels.
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n" // 12 bytes of piece 0 + 4 of piece 1
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n" // 8 bytes of piece 1 + 8 of piece 2
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n" // 4 bytes of piece 2 + 12 of piece 3
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd for 12+12 to 24
+// Moves dwords 0,1,2 and 4,5,6 (the 12 packed bytes of each 128-bit lane) to
+// the low 24 bytes and dwords 3 and 7 (the zero padding) to the top 8 bytes.
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+// Pack 4-byte pixels to 3-byte pixels by dropping byte 3 of each pixel.
+// Each iteration reads 128 source bytes (32 pixels) and stores 96 output
+// bytes. The 16-byte shuffle table is broadcast to both lanes; after vpshufb
+// and vpermd each register holds 24 packed bytes followed by 8 zero bytes,
+// and the vpermq/vpor steps join four such pieces into three 32-byte stores.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 32 pixels.
+// kPermdRGB24_AVX is loaded with vmovdqa, which requires 32-byte alignment.
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+// Byte indices for the two-source permute vpermt2b: 0..31 select from the
+// first 32-byte source register and 32..63 from the second. Each table skips
+// every index that is 3 mod 4 (byte 3 of each 4-byte pixel) and produces one
+// 32-byte block of the packed output.
+static const ulvec8 kPermARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
+ 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+ 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+ 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+ 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+ 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+ 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+ 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+ 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
+
+// Pack 4-byte pixels to 3-byte pixels by dropping byte 3 of each pixel.
+// Each iteration reads 128 source bytes (32 pixels) and stores 96 output
+// bytes; every output block is produced by one vpermt2b over two adjacent
+// source registers. vpermt2b overwrites its destination operand, which is why
+// ymm0, ymm1 and ymm2 are permuted in that order.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 32 pixels.
+// The permute tables are loaded with vmovdqa, which requires 32-byte alignment.
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" // output bytes 0..31
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" // output bytes 32..63
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" // output bytes 64..95
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kPermARGBToRGB24_0), // %3
+ "m"(kPermARGBToRGB24_1), // %4
+ "m"(kPermARGBToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+// Pack 4-byte pixels to 3-byte pixels, dropping byte 3 of each pixel and
+// reversing the remaining 3 bytes (kShuffleMaskARGBToRAW). The instruction
+// sequence matches ARGBToRGB24Row_AVX2; only the shuffle table differs.
+// Each iteration reads 128 source bytes (32 pixels) and stores 96 output
+// bytes. The loop test is at the bottom, so the body runs at least once and
+// always processes a full group of 32 pixels.
+// kPermdRGB24_AVX is loaded with vmovdqa, which requires 32-byte alignment.
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+// Pack 4-byte pixels to 16-bit 5:6:5 pixels. For source bytes b0..b3 of a
+// pixel the output word is (b2 >> 3) << 11 | (b1 >> 2) << 5 | (b0 >> 3);
+// b3 is discarded. Each iteration reads 16 source bytes (4 pixels) and stores
+// 8 output bytes.
+// The top field is produced with "pslld 8; psrad 16" so that each dword holds
+// a sign-extended 16-bit value, which lets the signed-saturating packssdw
+// narrow it without changing the low 16 bits.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 4 pixels.
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n" // xmm3 = 0x0000001f per dword
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n" // xmm4 = 0x000007e0 per dword
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n" // xmm5 = 0xfffff800 per dword
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n" // b0 >> 3 in bits 0..4
+ "pand %%xmm4,%%xmm2 \n" // b1 >> 2 in bits 5..10
+ "pand %%xmm5,%%xmm0 \n" // b2 >> 3 in bits 11..15 (sign-extended)
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Pack 4-byte pixels to 16-bit 5:6:5 pixels after adding a dither value.
+// dither4 holds 4 dither bytes; byte k (in memory order) is expanded to fill
+// all 4 bytes of pixel k of each group and added with unsigned saturation
+// (paddusb) before packing. The packing itself is the same as in
+// ARGBToRGB565Row_SSE2: (b2 >> 3) << 11 | (b1 >> 2) << 5 | (b0 >> 3).
+// Each iteration reads 16 source bytes (4 pixels) and stores 8 output bytes.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 4 pixels.
+// NOTE(review): xmm7 is set up from the dither value but is not read inside
+// the loop - it appears to be unused.
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width) {
+ asm volatile(
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n" // each dither byte x2
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n" // each dither byte x4 (one per pixel)
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n" // xmm3 = 0x0000001f per dword
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n" // xmm4 = 0x000007e0 per dword
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n" // xmm5 = 0xfffff800 per dword
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n" // add dither, saturating at 255
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+// AVX2 version of the dithered 4-byte to 16-bit 5:6:5 packing.
+// dither4 holds 4 dither bytes; they are expanded so that byte k fills all 4
+// bytes of pixels k and k + 4 of each group, then added with unsigned
+// saturation (vpaddusb). The packed word is
+// (b2 >> 3) << 11 | (b1 >> 2) << 5 | (b0 >> 3) for source bytes b0..b3.
+// Each iteration reads 32 source bytes (8 pixels) and stores 16 output bytes.
+// After masking every dword fits in 16 bits, so the unsigned-saturating
+// vpackusdw narrows it unchanged; vpermq $0xd8 then gathers the packed words
+// of both lanes into the low 128 bits for the store.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 8 pixels.
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3,%%xmm6 \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" // each dither byte x4 (one per pixel)
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n" // ymm3 = 0x0000001f per dword
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n" // ymm4 = 0x000007e0 per dword
+ "vpslld $0xb,%%ymm3,%%ymm5 \n" // ymm5 = 0x0000f800 per dword
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" // add dither, saturating at 255
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(dither4) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// Pack 4-byte pixels to 16-bit 1:5:5:5 pixels. For source bytes b0..b3 of a
+// pixel the output word is
+// (b3 >> 7) << 15 | (b2 >> 3) << 10 | (b1 >> 3) << 5 | (b0 >> 3).
+// Each iteration reads 16 source bytes (4 pixels) and stores 8 output bytes.
+// The top bit is produced with "psrad 16" and a 0xffff8000 mask so that each
+// dword holds a sign-extended 16-bit value, which lets the signed-saturating
+// packssdw narrow it without changing the low 16 bits.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 4 pixels.
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n" // xmm4 = 0x0000001f per dword
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n" // xmm5 = 0x000003e0 per dword
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n" // xmm6 = 0x00007c00 per dword
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n" // xmm7 = 0xffff8000 per dword
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n" // top bit of b3 in bit 15 (sign-extended)
+ "pand %%xmm4,%%xmm1 \n" // b0 >> 3 in bits 0..4
+ "pand %%xmm5,%%xmm2 \n" // b1 >> 3 in bits 5..9
+ "pand %%xmm6,%%xmm3 \n" // b2 >> 3 in bits 10..14
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+}
+
+// Pack 4-byte pixels to 16-bit 4:4:4:4 pixels by keeping the high nibble of
+// every byte. For source bytes b0..b3 of a pixel the two output bytes are
+// (b1 & 0xf0) | (b0 >> 4) and (b3 & 0xf0) | (b2 >> 4).
+// Each iteration reads 16 source bytes (4 pixels) and stores 8 output bytes.
+// The loop test is at the bottom, so the body runs at least once and always
+// processes a full group of 4 pixels.
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n" // xmm4 = 0xf000 per word
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n" // xmm3 = 0x00f0 per word
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n" // high nibble of the even bytes
+ "pand %%xmm4,%%xmm1 \n" // high nibble of the odd bytes
+ "psrlq $0x4,%%xmm0 \n" // to bits 0..3 of each word
+ "psrlq $0x8,%%xmm1 \n" // to bits 4..7 of each word
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // every word is <= 0xff, so no saturation
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is what
+is wanted for the blue channel. The red needs to be shifted 4 left, so
+multiply by (1024+4)*16 for red.
+
+Alpha Green
+Alpha and Green are already in the high bits so vpand can zero out the other
+bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
+could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha
+would be a simple multiplier to shift it into position. It wants a gap of 10
+above the green. Green is 10 bits, so there are 6 bits in the low short. 4
+more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
+and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
+result left 10 to position the A and G channels.
+*/
+
+// Shuffle table for the AR30 conversions: moves source bytes 0 and 2 of each
+// 4-byte pixel into the high byte of the low and high 16-bit word
+// respectively, and zeroes the low byte of each word (index 128).
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
+ 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+// Same as kShuffleRB30 with source bytes 0 and 2 of each pixel swapped.
+// NOTE(review): presumably used for ABGR input - confirm against its users.
+static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
+ 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+// pmulhuw multipliers and masks, one 32-bit value per pixel (low word, high
+// word): 1028 for the low word and 1028 * 16 for the high word.
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+// Keeps bits 0..9 and bits 20..29.
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+// Keeps byte 1 and the top 2 bits of byte 3 of each source pixel.
+static const uint32_t kMaskAG10 = 0xc000ff00;
+// 1028 for the low word and 64 for the high word.
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+// Converts ARGB pixels to AR30 (A2 R10 G10 B10), 4 pixels (16 bytes in, 16
+// bytes out) per iteration. The loop tests width only after the body, so one
+// iteration always runs and width is consumed in steps of 4.
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n" // broadcast each constant to all 4 lanes
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n" // %1 = dst - src, so only %0 advances
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Converts ABGR pixels to AR30, 4 pixels per iteration. Identical to
+// ARGBToAR30Row_SSSE3 except that the shuffle table (kShuffleBR30) swaps
+// source bytes 0 and 2 of every pixel before the multiply.
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n" // broadcast each constant to all 4 lanes
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n" // %1 = dst - src, so only %0 advances
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+// AVX2 version of ARGBToAR30Row: converts 8 ARGB pixels (32 bytes in, 32 bytes
+// out) per iteration. The loop tests width only after the body, so one
+// iteration always runs and width is consumed in steps of 8.
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n" // %1 = dst - src, so only %0 advances
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n" // clear upper ymm halves before returning
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+// AVX2 version of ABGRToAR30Row: converts 8 ABGR pixels per iteration.
+// Identical to ARGBToAR30Row_AVX2 except that the shuffle table
+// (kShuffleBR30) swaps source bytes 0 and 2 of every pixel.
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n" // %1 = dst - src, so only %0 advances
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n" // clear upper ymm halves before returning
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+// clang-format off
+
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// Loop body shared by the SSSE3 *ToY row functions. Per iteration it reads 64
+// bytes (16 four-byte pixels) from %0, writes 16 bytes to %1 and subtracts 16
+// from %2, looping while the result is > 0.
+// Register contract set up by the caller before the macro:
+// xmm4 = per-byte coefficients (the unsigned operand of pmaddubsw)
+// xmm5 = byte vector subtracted from every input byte first
+// round = name of the register added to the 16-bit sums before the >> 8.
+// Uses xmm0-xmm3 and xmm6 as scratch.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+// AVX2 loop body shared by the *ToYRow_AVX2 functions. Per iteration it reads
+// 128 bytes (32 four-byte pixels) from %0, writes 32 bytes to %1 and subtracts
+// 32 from %2, looping while the result is > 0. The macro ends with vzeroupper,
+// so callers do not need to issue their own.
+// Register contract set up by the caller before the macro:
+// ymm4 = per-byte coefficients (the unsigned operand of vpmaddubsw)
+// ymm5 = byte vector subtracted from every input byte first
+// ymm6 = dword indices for vpermd, restoring pixel order after the
+// lane-wise vphaddw/vpackuswb
+// round = name of the register added to the 16-bit sums before the >> 8.
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
+// Loads the coefficients (kARGBToY) into xmm4, the input bias (kSub128) into
+// xmm5 and the value added before the final shift (kAddY16) into xmm7, then
+// runs the shared RGBTOY loop.
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias
+ "movdqa %5,%%xmm7 \n" // rounding / offset term
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
+// Same as ARGBToYRow but different coefficients, no add 16: the bias register
+// xmm5 (kSub128) is reused as the term added before the final shift.
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias, also the rounding term
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values per loop iteration.
+// Same as RGBAToYRow but different coefficients, no add 16: the bias register
+// xmm5 (kSub128) is reused as the term added before the final shift.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias, also the rounding term
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd index table that restores pixel order after the lane-wise
+// vphaddw + vpackuswb in RGBTOY_AVX2.
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop iteration.
+// Loads coefficients into ymm4, the input bias into ymm5, the term added
+// before the final shift into ymm7 and the vpermd indices into ymm6, then runs
+// the shared RGBTOY_AVX2 loop (which also issues vzeroupper).
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n" // coefficients
+ "vbroadcastf128 %4,%%ymm5 \n" // input bias
+ "vbroadcastf128 %5,%%ymm7 \n" // rounding / offset term
+ "vmovdqu %6,%%ymm6 \n" // vpermd indices
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values per loop iteration.
+// Same structure as ARGBToYRow_AVX2; only the coefficient table (kABGRToY)
+// differs.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n" // coefficients
+ "vbroadcastf128 %4,%%ymm5 \n" // input bias
+ "vbroadcastf128 %5,%%ymm7 \n" // rounding / offset term
+ "vmovdqu %6,%%ymm6 \n" // vpermd indices
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
+// Uses the kARGBToYJ coefficients; the bias register ymm5 (kSub128) doubles as
+// the term added before the final shift, so no separate offset is loaded.
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n" // coefficients
+ "vbroadcastf128 %4,%%ymm5 \n" // input bias, also the rounding term
+ "vmovdqu %5,%%ymm6 \n" // vpermd indices
+
+ LABELALIGN RGBTOY_AVX2(ymm5)
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 YJ values per loop iteration.
+// Uses the kRGBAToYJ coefficients; the bias register ymm5 (kSub128) doubles as
+// the term added before the final shift. RGBTOY_AVX2 already ends with
+// vzeroupper, so none is issued here.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n" // coefficients
+ "vbroadcastf128 %4,%%ymm5 \n" // input bias, also the rounding term
+ "vmovdqu %5,%%ymm6 \n" // vpermd indices
+
+ LABELALIGN RGBTOY_AVX2(ymm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+// Computes subsampled U and V from two rows of ARGB. Each iteration reads 16
+// pixels (64 bytes) from src_argb0 and from the row src_stride_argb bytes
+// further on, averages the two rows, averages horizontally adjacent pixel
+// pairs, and writes 8 U bytes to dst_u and 8 V bytes to dst_v. The loop tests
+// width only after the body, so one iteration always runs and width is
+// consumed in steps of 16.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n" // V coefficients
+ "movdqa %6,%%xmm4 \n" // U coefficients
+ "movdqa %7,%%xmm5 \n" // bias added to the packed result
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n" // average the two rows
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd pixels
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontal pairs
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // U products
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // V products
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n" // store 8 U
+ "movhps %%xmm0,0x00(%1,%2,1) \n" // store 8 V
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb table applied after vphaddw + vpacksswb + vpermq in the AVX2 UV
+// functions; within each 128-bit lane it interleaves the two 8-byte halves in
+// 2-byte units.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+// Computes subsampled U and V from two rows of ARGB. Each iteration reads 32
+// pixels (128 bytes) from src_argb0 and from the row src_stride_argb bytes
+// further on, averages the two rows and horizontally adjacent pixel pairs, and
+// writes 16 U bytes to dst_u and 16 V bytes to dst_v. Width is consumed in
+// steps of 32; the loop body always runs at least once.
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n" // bias added to the packed result
+ "vbroadcastf128 %6,%%ymm6 \n" // V coefficients
+ "vbroadcastf128 %7,%%ymm7 \n" // U coefficients
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" // average the two rows
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" // even pixels
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" // odd pixels
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" // average horizontal pairs
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" // U products
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" // V products
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // gather U in low lane, V in high lane
+ "vpshufb %8,%%ymm0,%%ymm0 \n" // restore pixel order within each lane
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n" // store 16 U
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" // store 16 V
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ABGRTOUVROW_AVX2
+// Computes subsampled U and V from two rows of ABGR, 32 pixels per iteration.
+// Same instruction sequence as ARGBToUVRow_AVX2; only the coefficient tables
+// (kABGRToU / kABGRToV) differ. Writes 16 U bytes and 16 V bytes per
+// iteration.
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n" // bias added to the packed result
+ "vbroadcastf128 %6,%%ymm6 \n" // V coefficients
+ "vbroadcastf128 %7,%%ymm7 \n" // U coefficients
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" // average the two rows
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" // even pixels
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" // odd pixels
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" // average horizontal pairs
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" // U products
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" // V products
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // gather U in low lane, V in high lane
+ "vpshufb %8,%%ymm0,%%ymm0 \n" // restore pixel order within each lane
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n" // store 16 U
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" // store 16 V
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+// Computes subsampled UJ and VJ from two rows of ARGB, 32 pixels per
+// iteration. Differs from ARGBToUVRow_AVX2 in the coefficient tables
+// (kARGBToUJ / kARGBToVJ) and in the bias: kSub128 is added to the 16-bit sums
+// before the shift (vpaddw) instead of a byte bias being added after packing.
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n" // term added before the shift
+ "vbroadcastf128 %6,%%ymm6 \n" // V coefficients
+ "vbroadcastf128 %7,%%ymm7 \n" // U coefficients
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" // average the two rows
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" // even pixels
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" // odd pixels
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" // average horizontal pairs
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" // U products
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" // V products
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // gather U in low lane, V in high lane
+ "vpshufb %8,%%ymm0,%%ymm0 \n" // restore pixel order within each lane
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n" // store 16 U
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" // store 16 V
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kSub128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+// Computes subsampled UJ and VJ from two rows of ARGB. Each iteration reads 16
+// pixels (64 bytes) from src_argb0 and from the row src_stride_argb bytes
+// further on, averages the two rows and horizontally adjacent pixel pairs, and
+// writes 8 U bytes to dst_u and 8 V bytes to dst_v. Unlike ARGBToUVRow_SSSE3,
+// kSub128 is added to the 16-bit sums before the shift and no byte bias is
+// added after packing.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n" // V coefficients
+ "movdqa %6,%%xmm4 \n" // U coefficients
+ "movdqa %7,%%xmm5 \n" // term added before the shift
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n" // average the two rows
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd pixels
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontal pairs
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // U products
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // V products
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "movlps %%xmm0,(%1) \n" // store 8 U
+ "movhps %%xmm0,0x00(%1,%2,1) \n" // store 8 V
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kSub128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+// Computes full-resolution U and V from one row of ARGB. Each iteration reads
+// 16 pixels (64 bytes) twice - once for U, once for V - and writes 16 U bytes
+// to dst_u and 16 V bytes to dst_v. The loop tests width only after the body,
+// so one iteration always runs and width is consumed in steps of 16.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %4,%%xmm3 \n" // V coefficients
+ "movdqa %5,%%xmm4 \n" // U coefficients
+ "movdqa %6,%%xmm5 \n" // bias added to the packed result
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first pass: U
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n" // store 16 U
+ "movdqu (%0),%%xmm0 \n" // second pass: V, same source pixels
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n" // store 16 V
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+// Convert 16 BGRA pixels (64 bytes) to 16 Y values per loop iteration.
+// Same structure as ARGBToYRow_SSSE3; only the coefficient table (kBGRAToY)
+// differs.
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias
+ "movdqa %5,%%xmm7 \n" // rounding / offset term
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Computes subsampled U and V from two rows of BGRA. Each iteration reads 16
+// pixels (64 bytes) from src_bgra0 and from the row src_stride_bgra bytes
+// further on, averages the two rows and horizontally adjacent pixel pairs, and
+// writes 8 U bytes to dst_u and 8 V bytes to dst_v. Width is consumed in steps
+// of 16; the loop body always runs at least once.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n" // V coefficients
+ "movdqa %6,%%xmm4 \n" // U coefficients
+ "movdqa %7,%%xmm5 \n" // bias added to the packed result
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n" // average the two rows
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd pixels
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontal pairs
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // U products
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // V products
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n" // store 8 U
+ "movhps %%xmm0,0x00(%1,%2,1) \n" // store 8 V
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Convert 16 ABGR pixels (64 bytes) to 16 Y values per loop iteration.
+// Same structure as ARGBToYRow_SSSE3; only the coefficient table (kABGRToY)
+// differs.
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias
+ "movdqa %5,%%xmm7 \n" // rounding / offset term
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Convert 16 RGBA pixels (64 bytes) to 16 Y values per loop iteration.
+// Same structure as ARGBToYRow_SSSE3; only the coefficient table (kRGBAToY)
+// differs.
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // coefficients
+ "movdqa %4,%%xmm5 \n" // input bias
+ "movdqa %5,%%xmm7 \n" // rounding / offset term
+
+ LABELALIGN RGBTOY(xmm7)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Computes subsampled U and V from two rows of ABGR. Each iteration reads 16
+// pixels (64 bytes) from src_abgr0 and from the row src_stride_abgr bytes
+// further on, averages the two rows and horizontally adjacent pixel pairs, and
+// writes 8 U bytes to dst_u and 8 V bytes to dst_v. Width is consumed in steps
+// of 16; the loop body always runs at least once.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n" // V coefficients
+ "movdqa %6,%%xmm4 \n" // U coefficients
+ "movdqa %7,%%xmm5 \n" // bias added to the packed result
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n" // average the two rows
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd pixels
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontal pairs
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // U products
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // V products
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n" // store 8 U
+ "movhps %%xmm0,0x00(%1,%2,1) \n" // store 8 V
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Computes subsampled U and V from two rows of RGBA. Each iteration reads 16
+// pixels (64 bytes) from src_rgba0 and from the row src_stride_rgba bytes
+// further on, averages the two rows and horizontally adjacent pixel pairs, and
+// writes 8 U bytes to dst_u and 8 V bytes to dst_v. Width is consumed in steps
+// of 16; the loop body always runs at least once.
+// xmm3, xmm4 and xmm5 hold the constants for the whole loop and are therefore
+// listed as clobbers.
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "movdqa %5,%%xmm3 \n" // V coefficients
+ "movdqa %6,%%xmm4 \n" // U coefficients
+ "movdqa %7,%%xmm5 \n" // bias added to the packed result
+ "sub %1,%2 \n" // %2 = dst_v - dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n" // average the two rows
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd pixels
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontal pairs
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // U products
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // V products
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n" // store 8 U
+ "movhps %%xmm0,0x00(%1,%2,1) \n" // store 8 V
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// Read 8 UV from 444.
+// Loads 8 U bytes from u_buf and 8 V bytes from u_buf + v_buf (v_buf is used
+// as an index register; presumably callers first turn it into the offset
+// v_buf - u_buf - TODO confirm), interleaves them into xmm0, and loads 8 Y
+// bytes into xmm4 with each byte duplicated into both halves of a word.
+// Advances u_buf and y_buf by 8. Uses xmm1 as scratch.
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV.
+// Loads 4 U bytes and 4 V bytes, interleaves them and duplicates each UV pair
+// (punpcklwd) so xmm0 holds 8 pairs; loads 8 Y bytes into xmm4 with each byte
+// duplicated into both halves of a word. Advances u_buf by 4 and y_buf by 8.
+// Uses xmm1 as scratch.
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV.
+// Loads 4 16-bit U and 4 16-bit V samples, shifts them right by 2 and packs
+// them to bytes, then duplicates each UV pair into xmm0. Loads 8 16-bit Y
+// samples into xmm4 and shifts them left by 6. Advances u_buf by 8 bytes and
+// y_buf by 16 bytes. Uses xmm1 as scratch.
+// TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
+#define READYUV210 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm0 \n" \
+ "psraw $0x2,%%xmm0 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $0x6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+// Same loads as READYUV422 (UV pairs in xmm0, duplicated Y bytes in xmm4),
+// plus 8 alpha bytes from a_buf into xmm5. Advances u_buf by 4 and both y_buf
+// and a_buf by 8. Uses xmm1 as scratch.
+#define READYUVA422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from NV12, upsample to 8 UV.
+// Loads 8 bytes of already-interleaved UV from uv_buf and duplicates each
+// 2-byte pair into xmm0; loads 8 Y bytes into xmm4 with each byte duplicated
+// into both halves of a word. Advances uv_buf and y_buf by 8.
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 VU from NV21, upsample to 8 UV.
+// Loads 8 bytes of interleaved VU from vu_buf and rearranges them with the
+// kShuffleNV21 pshufb table (defined outside this macro; presumably it swaps
+// each pair to UV order and duplicates it - TODO confirm). Loads 8 Y bytes
+// into xmm4 with each byte duplicated. Advances vu_buf and y_buf by 8.
+#define READNV21 \
+ "movq (%[vu_buf]),%%xmm0 \n" \
+ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+// Loads the same 16 bytes from yuy2_buf twice: once shuffled with
+// kShuffleYUY2Y into xmm4 (Y) and once with kShuffleYUY2UV into xmm0 (UV).
+// Both tables are defined outside this macro. Advances yuy2_buf by 16.
+#define READYUY2 \
+ "movdqu (%[yuy2_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu (%[yuy2_buf]),%%xmm0 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
+ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
+
+// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+// Loads the same 16 bytes from uyvy_buf twice: once shuffled with
+// kShuffleUYVYY into xmm4 (Y) and once with kShuffleUYVYUV into xmm0 (UV).
+// Both tables are defined outside this macro. Advances uyvy_buf by 16.
+#define READUYVY \
+ "movdqu (%[uyvy_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu (%[uyvy_buf]),%%xmm0 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
+ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+
+// Two implementations of the YUV to RGB core. On x86_64 the seven constant
+// vectors are preloaded into xmm8-xmm14 by YUVTORGB_SETUP and named in
+// YUVTORGB_REGS so callers can append them to their clobber list (note the
+// trailing comma in that macro). Otherwise the constants are read from memory
+// inside YUVTORGB16 and YUVTORGB_SETUP / YUVTORGB_REGS expand to nothing.
+// Both variants read the same byte offsets of yuvconstants: 0, 32 and 64 are
+// the pmaddubsw operands for the UV bytes, 96, 128 and 160 are the values the
+// products are subtracted from, and 192 is the pmulhuw multiplier for Y.
+#if defined(__x86_64__)
+#define YUVTORGB_SETUP(yuvconstants) \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
+ "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
+ "movdqa 192(%[yuvconstants]),%%xmm14 \n"
+// Convert 8 pixels: 8 UV and 8 Y
+// In: xmm0 = UV bytes, xmm4 = Y words. Out: three vectors of 16-bit results in
+// xmm0, xmm1, xmm2 (not yet shifted or packed). Clobbers xmm3 and xmm4.
+#define YUVTORGB16(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa %%xmm11,%%xmm0 \n" \
+ "pmaddubsw %%xmm8,%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa %%xmm12,%%xmm1 \n" \
+ "pmaddubsw %%xmm9,%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa %%xmm13,%%xmm2 \n" \
+ "pmaddubsw %%xmm10,%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw %%xmm14,%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n"
+#define YUVTORGB_REGS \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
+#else
+#define YUVTORGB_SETUP(yuvconstants)
+// Convert 8 pixels: 8 UV and 8 Y
+// Same contract as the x86_64 variant, but every constant is a memory operand
+// relative to yuvconstants, so no extra registers are needed.
+#define YUVTORGB16(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
+ "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
+ "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
+ "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n"
+#define YUVTORGB_REGS
+#endif
+
+#define YUVTORGB(yuvconstants) /* 8-bit result: YUVTORGB16, then shift right by 6 and pack to bytes */ \
+ YUVTORGB16(yuvconstants) \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" // packuswb saturates each word to the range 0..255
+
+// Store 8 ARGB values (32 bytes): per pixel the bytes come from xmm0, xmm1, xmm2, xmm5.
+#define STOREARGB /* xmm5 supplies the 4th byte of every pixel and is left unchanged */ \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0,(%[dst_argb]) \n" \
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
+ "lea 0x20(%[dst_argb]), %[dst_argb] \n" // advance the destination by 32 bytes
+
+// Store 8 RGBA values (32 bytes): per pixel the bytes are 0xff, then xmm0, xmm1, xmm2.
+#define STORERGBA /* xmm5 is refilled with all ones on every use and then overwritten */ \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5,(%[dst_rgba]) \n" \
+ "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" // advance the destination by 32 bytes
+
+// Store 8 AR30 values: each dword = xmm0 | xmm1 << 10 | xmm2 << 20 | (xmm5 word) << 26.
+#define STOREAR30 /* caller presets xmm5 = alpha word, xmm6 = lower clamp, xmm7 = upper clamp */ \
+ "psraw $0x4,%%xmm0 \n" \
+ "psraw $0x4,%%xmm1 \n" \
+ "psraw $0x4,%%xmm2 \n" \
+ "pminsw %%xmm7,%%xmm0 \n" \
+ "pminsw %%xmm7,%%xmm1 \n" \
+ "pminsw %%xmm7,%%xmm2 \n" \
+ "pmaxsw %%xmm6,%%xmm0 \n" \
+ "pmaxsw %%xmm6,%%xmm1 \n" \
+ "pmaxsw %%xmm6,%%xmm2 \n" \
+ "psllw $0x4,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm3 \n" \
+ "movdqa %%xmm1,%%xmm2 \n" \
+ "punpcklwd %%xmm5,%%xmm1 \n" \
+ "punpckhwd %%xmm5,%%xmm2 \n" \
+ "pslld $0xa,%%xmm1 \n" \
+ "pslld $0xa,%%xmm2 \n" \
+ "por %%xmm1,%%xmm0 \n" \
+ "por %%xmm2,%%xmm3 \n" \
+ "movdqu %%xmm0,(%[dst_ar30]) \n" \
+ "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
+ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" // advance the destination by 32 bytes
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile ( // I444 row to ARGB, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READYUV444 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile ( // I422 row to RGB24, 8 pixels (24 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" // shuffle applied to pixels 0-3
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" // shuffle applied to pixels 4-7
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUV422 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ "punpcklbw %%xmm1,%%xmm0 \n" // interleave xmm0 and xmm1 bytes
+ "punpcklbw %%xmm2,%%xmm2 \n" // xmm2 bytes doubled; the copy is only filler
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n" // pixels 0-3 as 4-byte groups
+ "punpckhwd %%xmm2,%%xmm1 \n" // pixels 4-7 as 4-byte groups
+ "pshufb %%xmm5,%%xmm0 \n" // presumably drops the filler byte of each group
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = top 4 bytes of xmm0, then low 12 of xmm1
+ "movq %%xmm0,(%[dst_rgb24]) \n" // output bytes 0-7
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" // output bytes 8-23
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+#if defined(__i386__) // width stays in memory here; subl above updates it in place
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile ( // I422 row to ARGB, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READYUV422 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile ( // I422 row to AR30, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n" // all ones, shifted next
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV422 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV to ARGB, 8 pixels (32 output bytes) per loop pass.
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READYUV210 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 10 bit YUV to AR30, 8 pixels (32 output bytes) per loop pass.
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n" // all ones, shifted next
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV210 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile ( // I422 plus alpha plane to ARGB, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422 // defined outside this view; presumably also loads alpha into xmm5
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) // width stays in memory here; subl above updates it in place
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_SSSE3
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile ( // NV12 row to ARGB, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READNV12 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile ( // NV21 row to ARGB, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READNV21 // reads 8 bytes of VU and 8 Y per pass; VU is reordered through kShuffleNV21
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile ( // YUY2 row to ARGB: 16 source bytes in, 8 pixels (32 bytes) out per pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READYUY2 // splits the packed source into xmm4 and xmm0 with the two shuffle tables
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile ( // UYVY row to ARGB: 16 source bytes in, 8 pixels (32 bytes) out per pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones: the 4th (alpha) byte in STOREARGB
+
+ LABELALIGN
+ "1: \n"
+ READUYVY // splits the packed source into xmm4 and xmm0 with the two shuffle tables
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile ( // I422 row to RGBA, 8 pixels (32 output bytes) per loop pass.
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // STORERGBA sets xmm5 again on every pass
+
+ LABELALIGN
+ "1: \n"
+ READYUV422 // defined outside this view; presumably fills xmm0 (UV) and xmm4 (Y)
+ YUVTORGB(yuvconstants)
+ STORERGBA
+ "sub $0x8,%[width] \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 16 UV from 444 into ymm0 (interleaved bytes) and 16 Y into ymm4 (each byte doubled).
+#define READYUV444_AVX2 /* expects v_buf to hold the byte offset from u_buf */ \
+ "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" // advance Y by 16 bytes
+
+// Read 8 UV from 422, upsample to 16 UV in ymm0; 16 Y go to ymm4 (each byte doubled).
+#define READYUV422_AVX2 /* expects v_buf to hold the byte offset from u_buf */ \
+ "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" // advance Y by 16 bytes
+
+// Read 8 UV from 210 10 bit (shifted right by 2 and packed to bytes), upsample to 16 UV.
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2 /* 16 Y words are loaded into ymm4 and shifted left by 6 */ \
+ "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" // advance Y by 32 bytes (16 words)
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha loaded into ymm5.
+#define READYUVA422_AVX2 /* expects v_buf to hold the byte offset from u_buf */ \
+ "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n" // advance alpha by 16 bytes
+
+// Read 8 UV from NV12 (16 interleaved bytes), upsample to 16 UV in ymm0; 16 Y go to ymm4.
+#define READNV12_AVX2 /* Y bytes are doubled into words by vpunpcklbw */ \
+ "vmovdqu (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" // advance Y by 16 bytes
+
+// Read 8 VU from NV21, upsample to 16 UV (reordered through the kShuffleNV21 table).
+#define READNV21_AVX2 /* vpshufb takes kShuffleNV21 as a 32-byte memory operand */ \
+ "vmovdqu (%[vu_buf]),%%xmm0 \n" \
+ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" // advance Y by 16 bytes
+
+// Read 32 bytes of YUY2: 16 Y into ymm4, and 8 UV upsampled to 16 UV into ymm0.
+#define READYUY2_AVX2 /* the same 32 source bytes are loaded twice, once per shuffle */ \
+ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // advance the source by 32 bytes
+
+// Read 32 bytes of UYVY: 16 Y into ymm4, and 8 UV upsampled to 16 UV into ymm0.
+#define READUYVY_AVX2 /* the same 32 source bytes are loaded twice, once per shuffle */ \
+ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
+ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" // advance the source by 32 bytes
+
+#if defined(__x86_64__) // 64-bit: keep the seven constant vectors in ymm8-ymm14.
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
+ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
+ "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
+ "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" // vmovdqa ymm: yuvconstants must be 32-byte aligned
+
+#define YUVTORGB16_AVX2(yuvconstants) /* 16 UV (ymm0) and 16 Y (ymm4) -> 16-bit channels in ymm0-ymm2 */ \
+ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
+ "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
+ "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" // each channel = bias - UV product + scaled Y (saturating add)
+
+#define YUVTORGB_REGS_AVX2 \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", // trailing comma: more clobbers follow at the use site
+
+#else // 32-bit. Convert 16 pixels: 16 UV and 16 Y, reading the constants from memory.
+
+#define YUVTORGB_SETUP_AVX2(yuvconstants) // nothing to preload
+#define YUVTORGB16_AVX2(yuvconstants) /* ymm3 is the scratch register for the bias loads */ \
+ "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
+ "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" // same arithmetic as the 64-bit variant above
+#define YUVTORGB_REGS_AVX2 // no extra clobbers
+#endif // defined(__x86_64__)
+
+#define YUVTORGB_AVX2(yuvconstants) /* 8-bit result: YUVTORGB16_AVX2, then shift right by 6 and pack */ \
+ YUVTORGB16_AVX2(yuvconstants) \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" // vpackuswb saturates each word to the range 0..255
+
+// Store 16 ARGB values (64 bytes): per pixel the bytes come from ymm0, ymm1, ymm2, ymm5.
+#define STOREARGB_AVX2 /* ymm5 supplies the 4th byte of every pixel and is left unchanged */ \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1,(%[dst_argb]) \n" \
+ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
+ "lea 0x40(%[dst_argb]), %[dst_argb] \n" // advance the destination by 64 bytes
+
+// Store 16 AR30 values: each dword = ymm0 | ymm1 << 10 | ymm2 << 20 | (ymm5 word) << 26.
+#define STOREAR30_AVX2 /* caller presets ymm5 = alpha word, ymm6 = lower clamp, ymm7 = upper clamp */ \
+ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+ "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+ "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+ "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+ "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+ "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
+ "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm2,%%ymm2 \n" \
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
+ "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
+ "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" // advance the destination by 64 bytes
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READYUV444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n" // each word = 3
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n" // all-ones words shifted down to 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0
+
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n" // each word = 3
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n" // all-ones words shifted down to 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0
+
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" // ymm6/ymm7 hold the clamp bounds
+ );
+}
+#endif // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUVA422_AVX2 // also loads 16 alpha bytes into ymm5 for STOREARGB_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) // width stays in memory here; subl above updates it in place
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf becomes the byte offset from u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the leading 0xff byte of each pixel
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+
+ // Step 3: Weave into RGBA (per pixel: 0xff, then bytes from ymm0, ymm1, ymm2)
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" // ymm5 is only read, so it survives the loop
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%[dst_argb]) \n"
+ "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
+ "lea 0x40(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_NV12TOARGBROW_AVX2)
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READNV12_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#if defined(HAS_NV21TOARGBROW_AVX2)
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READNV21_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [vu_buf]"+r"(vu_buf), // %[vu_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleNV21]"m"(kShuffleNV21)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READYUY2_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
+ [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones: the 4th (alpha) byte in STOREARGB_AVX2
+
+ LABELALIGN
+ "1: \n"
+ READUYVY_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0; a partial group still runs a full pass
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [kShuffleUYVYY]"m"(kShuffleUYVYY),
+ [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 224(%3),%%xmm3 \n" // ygb: Y bias added below; presumably -1160 = 1.164 * -16 (TODO confirm sign)
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // each Y byte doubled into a word
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // saturate to bytes
+
+ // Step 2: Weave into ARGB (the grey byte is replicated, then alpha is ORed in)
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+
+ "sub $0x8,%2 \n" // 8 pixels done
+ "jg 1b \n" // repeat while width > 0
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" // each Y byte doubled into a word
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" // saturate to bytes
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" // weave: the grey byte is replicated
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n" // OR in the 0xff alpha byte
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n" // 16 pixels done
+ "jg 1b \n" // repeat while width > 0
+ "vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {  // Writes src's bytes to dst in reverse order, 16 bytes per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"  // movdqa: assumes kShuffleMirror is 16-byte aligned
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"  // 16 bytes ending at src + remaining width
+ "pshufb %%xmm5,%%xmm0 \n"  // reverse them
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"  // remaining width doubles as the read offset; loop ends once it is <= 0
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {  // Writes src's bytes to dst in reverse order, 32 bytes per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy so it can serve as an index register
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"  // same 16-byte reversal table in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"  // 32 bytes ending at src + remaining width
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // reverse bytes within each 128-bit lane
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"  // swap the two lanes to finish the 32-byte reversal
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3  (table is declared under the HAS_MIRRORROW_SSSE3 guard)
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};  // reverses the order of 2-byte pairs; byte order inside each pair is kept
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {  // Reverses a row of 2-byte pairs, 8 pairs (16 bytes) per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // counts pairs; used as a scaled-by-2 index below
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"  // movdqa: assumes kShuffleMirrorUV is 16-byte aligned
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"  // 16 bytes ending at src_uv + 2 * remaining width
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {  // Reverses a row of 2-byte pairs, 16 pairs (32 bytes) per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // counts pairs; used as a scaled-by-2 index below
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"  // same 16-byte pair-reversal table in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"  // 32 bytes ending at src_uv + 2 * remaining width
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // reverse pairs within each 128-bit lane
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"  // swap the two lanes
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3  (table is declared under the HAS_MIRRORUVROW_SSSE3 guard)
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};  // even bytes reversed into the low 8, odd bytes reversed into the high 8
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,  // interleaved 2-byte pairs; read back to front, 16 bytes per loop pass
+ uint8_t* dst_u,  // receives the even-indexed source bytes, reversed
+ uint8_t* dst_v,  // receives the odd-indexed source bytes, reversed
+ int width) {  // pair count; reduced by 8 per pass, loop ends once it is <= 0
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile(
+ "movdqa %4,%%xmm1 \n"  // movdqa: assumes the table is 16-byte aligned
+ "lea -0x10(%0,%3,2),%0 \n"  // point src at the last 16 bytes of the row
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u so one pointer advances both
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"  // walk the source backwards
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"  // low 8 bytes -> dst_u
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"  // high 8 bytes -> dst_v
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored. first byte zero
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle last 5 pixels to first 5 mirrored. last byte zero
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Shuffle 5 pixels at a time (15 bytes)
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,  // Reverses the order of 3-byte pixels, 16 pixels (48 bytes) per loop pass.
+ uint8_t* dst_rgb24,
+ int width) {  // pixel count; reduced by 16 per pass, loop ends once it is <= 0
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;  // start at the last 48 bytes of the source row
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"  // movdqa: assumes both tables are 16-byte aligned
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"  // walk the source backwards
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5  (overlaps: overwrites the zero pad byte written at offset 32)
+ "movdqu %%xmm2,2(%1) \n" // next 5  (overwrites the zero pad byte written at offset 17)
+ "movlpd %%xmm3,0(%1) \n" // first 1  (8-byte store; covers the zero pad byte at offset 2)
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {  // Reverses the order of 4-byte pixels, 4 pixels (16 bytes) per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // pixel count; used as a scaled-by-4 index below
+ asm volatile(
+
+ "lea -0x10(%0,%2,4),%0 \n"  // point src at the last 16 bytes of the row
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"  // reverse the four 32-bit elements
+ "lea -0x10(%0),%0 \n"  // walk the source backwards
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Index table for vpermd: reverses the order of eight 32-bit elements.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {  // Reverses the order of 4-byte pixels, 8 pixels (32 bytes) per loop pass.
+ intptr_t temp_width = (intptr_t)(width);  // pixel count; used as a scaled-by-4 index below
+ asm volatile(
+
+ "vmovdqu %3,%%ymm5 \n"  // load the index table
+
+ LABELALIGN
+ "1: \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"  // permute the 32 bytes ending at src + 4 * remaining width
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8_t* src_uv,  // De-interleaves byte pairs; 64 source bytes are read per loop pass.
+ uint8_t* dst_u,  // receives the even-indexed source bytes (32 per pass)
+ uint8_t* dst_v,  // receives the odd-indexed source bytes (32 per pass)
+ int width) {  // pair count; reduced by 32 per pass, loop ends once it is <= 0
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0x00ff in every 16-bit lane
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u so one pointer advances both
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"  // odd bytes moved down
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // keep even bytes
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // packs per 128-bit lane, so qwords come out interleaved
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // qword order 0,2,1,3 restores sequential order
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"  // address is dst_u + offset, i.e. dst_v
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8_t* src_uv,  // De-interleaves byte pairs; 32 source bytes are read per loop pass.
+ uint8_t* dst_u,  // receives the even-indexed source bytes (16 per pass)
+ uint8_t* dst_v,  // receives the odd-indexed source bytes (16 per pass)
+ int width) {  // pair count; reduced by 16 per pass, loop ends once it is <= 0
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16-bit lane
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u so one pointer advances both
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"  // keep copies for the odd-byte path
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"  // keep even bytes
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"  // odd bytes moved down
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"  // address is dst_u + offset, i.e. dst_v
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8_t* src_u,  // Interleaves two byte rows; 32 bytes are read from each per loop pass.
+ const uint8_t* src_v,  // supplies the odd-indexed output bytes
+ uint8_t* dst_uv,  // 64 bytes are written per loop pass
+ int width) {  // reduced by 32 per pass, loop ends once it is <= 0
+ asm volatile(
+
+ "sub %0,%1 \n"  // src_v becomes an offset from src_u so one pointer advances both
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"  // address is src_u + offset, i.e. src_v
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"  // interleave works per 128-bit lane
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"  // the four lane stores below put the output back in sequential order
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8_t* src_u,  // Interleaves two byte rows; 16 bytes are read from each per loop pass.
+ const uint8_t* src_v,  // supplies the odd-indexed output bytes
+ uint8_t* dst_uv,  // 32 bytes are written per loop pass
+ int width) {  // reduced by 16 per pass, loop ends once it is <= 0
+ asm volatile(
+
+ "sub %0,%1 \n"  // src_v becomes an offset from src_u so one pointer advances both
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"  // address is src_u + offset, i.e. src_v
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"  // first 8 interleaved pairs
+ "punpckhbw %%xmm1,%%xmm2 \n"  // last 8 interleaved pairs
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,  // Multiplies two rows of 16-bit values by scale and interleaves them.
+ const uint16_t* src_v,  // supplies the odd-indexed output elements
+ uint16_t* dst_uv,  // 64 bytes are written per loop pass
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 16 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"  // low word duplicated into a dword
+ "vbroadcastss %%xmm3,%%ymm3 \n"  // multiplier in all 16 word lanes
+ "sub %0,%1 \n"  // src_v becomes an offset from src_u
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"  // low 16 bits of each product
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"  // the four lane stores below put the output back in sequential order
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(scale) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,  // Multiplies a row of 16-bit values by scale, keeping the low 16 bits of each product.
+ uint16_t* dst_y,
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 32 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"  // low word duplicated into a dword
+ "vbroadcastss %%xmm3,%%ymm3 \n"  // multiplier in all 16 word lanes
+ "sub %0,%1 \n"  // dst_y becomes an offset from src_y
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"  // address is src_y + offset, i.e. dst_y
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,  // Narrows 16-bit values to bytes: (value * scale) >> 16, saturated to 0..255.
+ uint8_t* dst_y,  // 16 bytes are written per loop pass
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 16 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"  // low word duplicated into a dword
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"  // multiplier in all 8 word lanes
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"  // high 16 bits of the unsigned product
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"  // saturate to unsigned bytes
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,  // Narrows 16-bit values to bytes: (value * scale) >> 16, saturated to 0..255.
+ uint8_t* dst_y,  // 32 bytes are written per loop pass
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 32 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"  // low word duplicated into a dword
+ "vbroadcastss %%xmm2,%%ymm2 \n"  // multiplier in all 16 word lanes
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"  // high 16 bits of the unsigned product
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // qword order 0,2,1,3 undoes the per-lane pack ordering
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT16TO8ROW_AVX2
+
+// Use scale to convert to lsb formats depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8_t* src_y,  // Widens bytes to 16 bits: ((b * 0x0101) * scale) >> 16.
+ uint16_t* dst_y,  // 32 bytes are written per loop pass
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 16 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"  // low word duplicated into a dword
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"  // multiplier in all 8 word lanes
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"  // each byte in both halves of a word (b * 0x0101)
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"  // high 16 bits of the unsigned product
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,  // Widens bytes to 16 bits: ((b * 0x0101) * scale) >> 16.
+ uint16_t* dst_y,  // 64 bytes are written per loop pass
+ int scale,  // only its low 16 bits are used as the per-element multiplier
+ int width) {  // reduced by 32 per pass, loop ends once it is <= 0
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"  // low word duplicated into a dword
+ "vbroadcastss %%xmm2,%%ymm2 \n"  // multiplier in all 16 word lanes
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // pre-arrange qwords (0,2,1,3) so the per-lane unpacks below yield sequential output
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"  // each byte in both halves of a word (b * 0x0101)
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"  // high 16 bits of the unsigned product
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT8TO16ROW_AVX2
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+// Each group of three tables gathers one channel from three consecutive
+// 16-byte source chunks; 128 selects zero so the three results can be OR-ed.
+static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 2u, 5u, 8u, 11u, 14u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 1u,
+ 4u, 7u, 10u, 13u};
+
+static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
+ 3u, 6u, 9u, 12u, 15u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 2u,
+ 5u, 8u, 11u, 14u};
+
+static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
+ 4u, 7u, 10u, 13u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 0u, 3u,
+ 6u, 9u, 12u, 15u};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,  // Splits packed 3-byte pixels into three planes, 16 pixels (48 source bytes) per loop pass.
+ uint8_t* dst_r,  // receives byte 0 of every 3-byte pixel
+ uint8_t* dst_g,  // receives byte 1 of every 3-byte pixel
+ uint8_t* dst_b,  // receives byte 2 of every 3-byte pixel
+ int width) {  // pixel count; reduced by 16 per pass, loop ends once it is <= 0
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"  // first plane: load the 48 source bytes
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"  // pshufb with a memory operand: assumes the tables are 16-byte aligned
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"  // merge the three partial gathers
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"  // second plane: same 48 bytes reloaded
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"  // third plane: same 48 bytes reloaded
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"  // source advances only after all three planes are written
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskRGBToR0), // %5
+ "m"(kShuffleMaskRGBToR1), // %6
+ "m"(kShuffleMaskRGBToR2), // %7
+ "m"(kShuffleMaskRGBToG0), // %8
+ "m"(kShuffleMaskRGBToG1), // %9
+ "m"(kShuffleMaskRGBToG2), // %10
+ "m"(kShuffleMaskRGBToB0), // %11
+ "m"(kShuffleMaskRGBToB1), // %12
+ "m"(kShuffleMaskRGBToB2) // %13
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITRGBROW_SSSE3
+
+#ifdef HAS_MERGERGBROW_SSSE3
+
+// Shuffle tables for converting Planar to RGB.
+// Suffix 0/1/2 selects which 16-byte output chunk the table builds; 128
+// selects zero so the three per-plane results can be OR-ed together.
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
+ 2u, 128u, 128u, 3u, 128u, 128u,
+ 4u, 128u, 128u, 5u};
+static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
+ 128u, 2u, 128u, 128u, 3u, 128u,
+ 128u, 4u, 128u, 128u};
+static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
+ 128u, 128u, 2u, 128u, 128u, 3u,
+ 128u, 128u, 4u, 128u};
+
+static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
+ 7u, 128u, 128u, 8u, 128u, 128u,
+ 9u, 128u, 128u, 10u};
+static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
+ 128u, 7u, 128u, 128u, 8u, 128u,
+ 128u, 9u, 128u, 128u};
+static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
+ 128u, 128u, 8u, 128u, 128u, 9u,
+ 128u, 128u, 10u, 128u};
+
+static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
+ 12u, 128u, 128u, 13u, 128u, 128u,
+ 14u, 128u, 128u, 15u};
+static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
+ 128u, 13u, 128u, 128u, 14u, 128u,
+ 128u, 15u, 128u, 128u};
+static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
+ 128u, 128u, 13u, 128u, 128u, 14u,
+ 128u, 128u, 15u, 128u};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,  // Packs three planes into 3-byte pixels, 16 pixels (48 output bytes) per loop pass.
+ const uint8_t* src_g,  // becomes byte 1 of every output pixel
+ const uint8_t* src_b,  // becomes byte 2 of every output pixel
+ uint8_t* dst_rgb,
+ int width) {  // pixel count; reduced by 16 per pass, loop ends once it is <= 0
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"  // output bytes 0-15: load 16 bytes from each plane
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"  // pshufb with a memory operand: assumes the tables are 16-byte aligned
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"  // merge the three scattered planes
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"  // output bytes 16-31: same plane data reloaded
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"  // output bytes 32-47: same plane data reloaded
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"  // sources advance only after all 48 bytes are written
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskRToRGB0), // %5
+ "m"(kShuffleMaskGToRGB0), // %6
+ "m"(kShuffleMaskBToRGB0), // %7
+ "m"(kShuffleMaskRToRGB1), // %8
+ "m"(kShuffleMaskGToRGB1), // %9
+ "m"(kShuffleMaskBToRGB1), // %10
+ "m"(kShuffleMaskRToRGB2), // %11
+ "m"(kShuffleMaskGToRGB2), // %12
+ "m"(kShuffleMaskBToRGB2) // %13
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {  // Copies bytes 32 per loop pass; width is reduced by 32 until it is <= 0.
+ asm volatile(
+ "test $0xf,%0 \n"  // src not 16-byte aligned -> unaligned loop
+ "jne 2f \n"
+ "test $0xf,%1 \n"  // dst not 16-byte aligned -> unaligned loop
+ "jne 2f \n"
+
+ LABELALIGN
+ "1: \n"  // aligned loop (movdqa)
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"  // skip the unaligned loop
+
+ LABELALIGN
+ "2: \n"  // unaligned loop (movdqu)
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"  // common exit
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {  // Copies bytes 64 per loop pass; width is reduced by 64 until it is <= 0.
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"  // unaligned 32-byte loads, so no alignment is required
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"  // clear upper ymm halves, as the other ymm kernels in this file do, to avoid AVX/SSE transition stalls
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Multiple of 1.
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {  // Copies width bytes with a single "rep movsb".
+ size_t width_tmp = (size_t)(width);  // register-sized count for the "c" (rcx/ecx) constraint
+ asm volatile(
+
+ "rep movsb \n"  // copies the count register's number of bytes from [S] to [D]
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {  // Copies the top byte of each 4-byte src pixel into dst, keeping dst's low 3 bytes; 8 pixels per loop pass.
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"  // xmm0 = 0xff000000 per pixel (top byte)
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"  // xmm1 = 0x00ffffff per pixel (low 3 bytes)
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"  // current dst pixels
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"  // src: keep top byte
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"  // dst: keep low 3 bytes
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {  // Copies the top byte of each 4-byte src pixel into dst, keeping dst's low 3 bytes; 16 pixels per loop pass.
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // ymm0 = 0x00ffffff per pixel: blend mask selecting the low 3 bytes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"  // masked bytes come from dst memory, the top byte stays from src
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// width in pixels
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,  // Extracts the top byte of each 4-byte pixel; 8 pixels (32 bytes) are read per loop pass.
+ uint8_t* dst_a,  // 8 bytes are written per loop pass
+ int width) {  // reduced by 8 per pass, loop ends once it is <= 0
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"  // top byte of each pixel moved to the low byte
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"  // 8 dwords -> 8 words (values are 0..255, so no saturation occurs)
+ "packuswb %%xmm0, %%xmm0 \n"  // 8 words -> 8 bytes
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {  // moves byte 3 of each dword to its low byte and zeroes the rest
+ 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
+ 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,  // Extracts the top byte of each 4-byte pixel; 32 pixels (128 bytes) are read per loop pass.
+ uint8_t* dst_a,  // 32 bytes are written per loop pass
+ int width) {  // reduced by 32 per pass, loop ends once it is <= 0
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"  // vpermd index table defined outside this block; vmovdqa assumes 32-byte alignment
+ "vbroadcastf128 %4,%%ymm5 \n"  // same 16-byte shuffle table in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {  // Writes each src byte into the top byte of a 4-byte dst pixel, keeping dst's low 3 bytes; 8 pixels per loop pass.
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"  // xmm0 = 0xff000000 per pixel (top byte)
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"  // xmm1 = 0x00ffffff per pixel (low 3 bytes)
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm2 \n"  // load 8 source bytes
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"  // each byte twice -> 8 words
+ "punpckhwd %%xmm2,%%xmm3 \n"  // pixels 4-7: source word lands in the high half; xmm3's stale low halves are masked off below
+ "punpcklwd %%xmm2,%%xmm2 \n"  // pixels 0-3
+ "movdqu (%1),%%xmm4 \n"  // current dst pixels
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"  // keep only the top byte (the source value)
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"  // dst: keep low 3 bytes
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {  // Writes each src byte into the top byte of a 4-byte dst pixel, keeping dst's low 3 bytes; 16 pixels per loop pass.
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // ymm0 = 0x00ffffff per pixel: blend mask selecting the low 3 bytes
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm1 \n"  // 8 source bytes zero-extended to 8 dwords
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"  // move the value into the top byte
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"  // masked bytes come from dst memory, the top byte stays from src
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {  // Fills dst with v8 using 4-byte stores; writes (width >> 2) * 4 bytes, so any width % 4 remainder is not written.
+ size_t width_tmp = (size_t)(width >> 2);  // number of 32-bit stores
+ const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile(
+
+ "rep stosl \n"  // store eax to [D], count-register times
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {  // Fills width bytes of dst with v8 using a single "rep stosb".
+ size_t width_tmp = (size_t)(width);  // register-sized count for the "c" constraint
+ asm volatile(
+
+ "rep stosb \n"  // store al to [D], count-register times
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {  // Stores the 32-bit value v32 width times (4 * width bytes) starting at dst_argb.
+ size_t width_tmp = (size_t)(width);  // register-sized count for the "c" constraint
+ asm volatile(
+
+ "rep stosl \n"  // store eax to [D], count-register times
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {  // Copies the even-indexed source bytes to dst_y; 32 bytes in, 16 out per loop pass.
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16-bit lane
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"  // keep even bytes
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,  // Averages two source rows, then splits the odd-indexed bytes alternately into dst_u and dst_v.
+ int stride_yuy2,  // byte offset from src_yuy2 to the second row being averaged
+ uint8_t* dst_u,  // 8 bytes are written per loop pass
+ uint8_t* dst_v,  // 8 bytes are written per loop pass
+ int width) {  // reduced by 16 per pass (32 source bytes per row), loop ends once it is <= 0
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16-bit lane
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // same columns from the row at src + stride
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"  // rounded byte-wise average of the two rows
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"  // keep odd bytes
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"  // even bytes of the packed result -> dst_u
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"  // odd bytes of the packed result -> dst_v
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,  // Splits the odd-indexed bytes of one source row alternately into dst_u and dst_v (no row averaging).
+ uint8_t* dst_u,  // 8 bytes are written per loop pass
+ uint8_t* dst_v,  // 8 bytes are written per loop pass
+ int width) {  // reduced by 16 per pass (32 source bytes), loop ends once it is <= 0
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16-bit lane
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"  // keep odd bytes
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"  // even bytes of the packed result -> dst_u
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"  // odd bytes of the packed result -> dst_v
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {  // Copies the odd-indexed source bytes to dst_y; 32 bytes in, 16 out per loop pass.
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"  // keep odd bytes
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"  // loop ends once remaining width is <= 0
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,  // Averages two source rows, then splits the even-indexed bytes alternately into dst_u and dst_v.
+ int stride_uyvy,  // byte offset from src_uyvy to the second row being averaged
+ uint8_t* dst_u,  // 8 bytes are written per loop pass
+ uint8_t* dst_v,  // 8 bytes are written per loop pass
+ int width) {  // reduced by 16 per pass (32 source bytes per row), loop ends once it is <= 0
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16-bit lane
+ "sub %1,%2 \n"  // dst_v becomes an offset from dst_u
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // same columns from the row at src + stride
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"  // rounded byte-wise average of the two rows
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"  // keep even bytes
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"  // even bytes of the packed result -> dst_u
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"  // odd bytes of the packed result -> dst_v
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+// De-interleaves the even-numbered bytes of a UYVY row into dst_u and dst_v
+// (per the function name these are the U and V samples). No vertical
+// averaging is done. Each iteration reads 32 source bytes, writes 8 bytes to
+// each destination and subtracts 16 from width; the loop body always executes
+// at least once.
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // xmm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ // %2 = dst_v - dst_u, so the V store below can use (%1,%2,1).
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ // Keep the low byte of every word and pack to 16 bytes.
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ // Even bytes of the packed data go to dst_u, odd bytes to dst_v.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+// Copies the even-numbered bytes of a YUY2 row to dst_y (per the function name
+// these are the Y samples). Each iteration reads 64 source bytes, writes 32
+// bytes and subtracts 32 from width; the loop body always executes at least
+// once.
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ // ymm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ // vpackuswb packs within each 128-bit lane; vpermq 0xd8 (quadword order
+ // 0,2,1,3) puts the result back into source order.
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+// Averages (vpavgb, rounding up) a YUY2 row with the row stride_yuy2 bytes
+// after it, then de-interleaves the odd-numbered bytes of the result into
+// dst_u and dst_v (per the function name these are the U and V samples).
+// Each iteration reads 64 bytes from each of the two rows, writes 16 bytes to
+// each destination and subtracts 32 from width; the loop body always executes
+// at least once.
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // ymm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ // %2 = dst_v - dst_u, so the V store below can use (%1,%2,1).
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ // Average with the row at src_yuy2 + stride_yuy2.
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Keep the high byte of every word; pack is per 128-bit lane, so vpermq
+ // 0xd8 (quadword order 0,2,1,3) restores source order.
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // ymm1 = even bytes (stored to dst_u), ymm0 = odd bytes (stored to dst_v).
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // Store the low 128 bits (16 bytes) of each result.
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+// De-interleaves the odd-numbered bytes of a YUY2 row into dst_u and dst_v
+// (per the function name these are the U and V samples). No vertical
+// averaging is done. Each iteration reads 64 source bytes, writes 16 bytes to
+// each destination and subtracts 32 from width; the loop body always executes
+// at least once.
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // ymm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ // %2 = dst_v - dst_u, so the V store below can use (%1,%2,1).
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Keep the high byte of every word; pack is per 128-bit lane, so vpermq
+ // 0xd8 (quadword order 0,2,1,3) restores source order.
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // ymm1 = even bytes (stored to dst_u), ymm0 = odd bytes (stored to dst_v).
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // Store the low 128 bits (16 bytes) of each result.
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+// Copies the odd-numbered bytes of a UYVY row to dst_y (per the function name
+// these are the Y samples). Each iteration reads 64 source bytes, writes 32
+// bytes and subtracts 32 from width; the loop body always executes at least
+// once.
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Shift the high byte of every word down; pack is per 128-bit lane, so
+ // vpermq 0xd8 (quadword order 0,2,1,3) restores source order.
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ // NOTE(review): xmm5 is listed as clobbered but no instruction above
+ // touches it; harmless over-declaration.
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+// Averages (vpavgb, rounding up) a UYVY row with the row stride_uyvy bytes
+// after it, then de-interleaves the even-numbered bytes of the result into
+// dst_u and dst_v (per the function name these are the U and V samples).
+// Each iteration reads 64 bytes from each of the two rows, writes 16 bytes to
+// each destination and subtracts 32 from width; the loop body always executes
+// at least once.
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // ymm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ // %2 = dst_v - dst_u, so the V store below can use (%1,%2,1).
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ // Average with the row at src_uyvy + stride_uyvy.
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Keep the low byte of every word; pack is per 128-bit lane, so vpermq
+ // 0xd8 (quadword order 0,2,1,3) restores source order.
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // ymm1 = even bytes (stored to dst_u), ymm0 = odd bytes (stored to dst_v).
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // Store the low 128 bits (16 bytes) of each result.
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+
+// De-interleaves the even-numbered bytes of a UYVY row into dst_u and dst_v
+// (per the function name these are the U and V samples). No vertical
+// averaging is done. Each iteration reads 64 source bytes, writes 16 bytes to
+// each destination and subtracts 32 from width; the loop body always executes
+// at least once.
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // ymm5 = 0x00ff in every 16-bit lane (low-byte mask).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ // %2 = dst_v - dst_u, so the V store below can use (%1,%2,1).
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Keep the low byte of every word; pack is per 128-bit lane, so vpermq
+ // 0xd8 (quadword order 0,2,1,3) restores source order.
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // ymm1 = even bytes (stored to dst_u), ymm0 = odd bytes (stored to dst_v).
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // Store the low 128 bits (16 bytes) of each result.
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies byte 3 of each 4-byte pixel into
+// the low byte of both 16-bit words of that pixel and zeroes the high bytes
+// (0x80 selects zero in pshufb).
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend 4 pixels at a time, then finish the remainder 1 pixel at a time.
+// Per byte: dst = sat(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8)), where
+// alpha0 is byte 3 of the src_argb0 pixel; byte 3 of the src_argb0 term is
+// forced to 0xff before the add. Unlike most rows here, a width of 0 (or
+// less) writes nothing.
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // xmm7 = 0x0001 per word, xmm6 = 0x00ff per word,
+ // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword.
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ // Skip the 4 pixel loop when fewer than 4 pixels remain.
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ // Invert byte 3 (255 - alpha) and spread it to both words of the pixel.
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ // xmm2 = bytes 0 and 2 of each src_argb1 pixel as words.
+ "pand %%xmm6,%%xmm2 \n"
+ // xmm3 = 256 - alpha.
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ // xmm1 = bytes 1 and 3 of each src_argb1 pixel as words.
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ // Force byte 3 of the src_argb0 copy to 0xff.
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ // Take the high byte of each product and add with unsigned saturation.
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ // %3 is now remaining - 4; adding 3 gives remaining - 1, negative when
+ // nothing is left.
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop. Same arithmetic as above using 4-byte loads and stores.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+// Here A2 is a src0 byte, B2 the matching src1 byte and C2 the matching alpha
+// byte. Each iteration reads 8 bytes from src0, src1 and alpha, writes 8
+// bytes to dst and subtracts 8 from width; the loop body always executes at
+// least once.
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ // xmm5 = 0xff00 per word, xmm6 = 0x80 per byte, xmm7 = 0x807f per word
+ // (32768 + 127, the rounding/bias term of the signed formula).
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ // Turn src0, src1 and dst into offsets from alpha so that advancing the
+ // alpha pointer alone advances all four streams.
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // xmm0 = byte pairs (alpha, 255 - alpha).
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ // xmm1 = byte pairs (src0 - 128, src1 - 128).
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ // Unsigned weights times signed samples, summed per pair.
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+// Here A2 is a src0 byte, B2 the matching src1 byte and C2 the matching alpha
+// byte. Each iteration reads 32 bytes from src0, src1 and alpha, writes 32
+// bytes to dst and subtracts 32 from width; the loop body always executes at
+// least once.
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ // ymm5 = 0xff00 per word, ymm6 = 0x80 per byte, ymm7 = 0x807f per word
+ // (32768 + 127, the rounding/bias term of the signed formula).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
+ "vbroadcastss %%xmm6,%%ymm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
+ "vbroadcastss %%xmm7,%%ymm7 \n"
+ // Turn src0, src1 and dst into offsets from alpha so that advancing the
+ // alpha pointer alone advances all four streams.
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
+
+ // 32 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // ymm3/ymm0 = byte pairs (alpha, 255 - alpha) for the high/low half of
+ // each 128-bit lane.
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ // ymm4/ymm1 = byte pairs (src0 - 128, src1 - 128), split the same way.
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ // Unsigned weights times signed samples, summed per pair.
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ // The per-lane pack undoes the per-lane unpack, so no vpermq is needed.
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(alpha), // %2
+ "+r"(dst), // %3
+ "+rm"(width) // %4
+ ::"memory",
+ "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha: for each pixel, byte 3 is copied into both
+// bytes of the first three 16-bit words and the fourth word is zeroed (128
+// selects zero in pshufb). kShuffleAlpha0 covers pixels 0-1 (source bytes 3
+// and 7), kShuffleAlpha1 covers pixels 2-3 (source bytes 11 and 15).
+static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+ 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
+// Attenuate 4 pixels at a time.
+// Bytes 0-2 of each pixel are multiplied by that pixel's byte 3 (alpha); byte
+// 3 itself is copied through unchanged. Each iteration reads and writes 16
+// bytes and subtracts 4 from width; the loop body always executes at least
+// once.
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // xmm3 = 0xff000000 per dword (mask for byte 3 of each pixel).
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // Pixels 0-1: xmm0 = alpha words, xmm1 = each byte duplicated to a word;
+ // pmulhuw keeps the high 16 bits of the product.
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ // Pixels 2-3, same computation on the upper 8 bytes.
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ // xmm2 = original byte 3 of each pixel, all other bytes cleared.
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ // The zeroed fourth shuffle word left byte 3 at 0, so OR restores it.
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha. It is applied to data whose bytes have
+// already been duplicated into 16-bit words: for each pixel it copies the
+// fourth word (bytes 6-7 / 14-15) into the first three words and zeroes the
+// fourth (128 selects zero in vpshufb).
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
+// Attenuate 8 pixels at a time.
+// Bytes 0-2 of each pixel are multiplied by that pixel's byte 3 (alpha); byte
+// 3 itself is copied through unchanged. Each iteration reads and writes 32
+// bytes and subtracts 8 from width; the loop body always executes at least
+// once.
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // ymm4 = shuffle table in both lanes, ymm5 = 0xff000000 per dword.
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ // %1 = dst_argb - src_argb, so the store can use (%0,%1,1).
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ // Duplicate every byte into a word (low and high half of each lane).
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ // ymm2/ymm3 = per-pixel alpha words; multiply and keep the high 16 bits.
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ // ymm6 = original byte 3 of each pixel, all other bytes cleared.
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// For each pixel, byte 3 (alpha) indexes the 4-byte-per-entry table
+// fixed_invtbl8 (defined elsewhere; presumably fixed-point reciprocals -
+// confirm against its definition). The pixel's bytes, each duplicated into a
+// 16-bit word, are multiplied by words taken from that entry and the high 16
+// bits are packed back to bytes with unsigned saturation. Each iteration
+// reads and writes 16 bytes and subtracts 4 from width; the loop body always
+// executes at least once.
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ // Scratch register for the zero-extended alpha byte used as table index.
+ uintptr_t alpha;
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // Pixels 0-1: byte offsets 3 and 7 are their alpha bytes.
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ // pshuflw 0x40: words 0-2 = low word of the entry, word 3 = its high
+ // word.
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ // Pixels 2-3: byte offsets 0x0b and 0x0f are their alpha bytes.
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ // NOTE(review): xmm4 and xmm5 are listed as clobbered but are not used
+ // above; harmless over-declaration.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table applied to the word-duplicated table entries: per 8-byte
+// group it selects word 0 three times followed by word 3.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// Unattenuate 8 pixels at a time.
+// For each pixel, byte 3 (alpha) indexes the 4-byte-per-entry table
+// fixed_invtbl8 (defined elsewhere; presumably fixed-point reciprocals -
+// confirm against its definition). The pixel's bytes, each duplicated into a
+// 16-bit word, are multiplied by words taken from that entry and the high 16
+// bits are packed back to bytes with unsigned saturation. Each iteration
+// reads and writes 32 bytes and subtracts 8 from width; the loop body always
+// executes at least once.
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ // Scratch register for the zero-extended alpha byte used as table index.
+ uintptr_t alpha;
+ asm volatile(
+ // %1 = dst_argb - src_argb, so the store can use (%0,%1,1).
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ // Scalar loads of the 8 table entries selected by the alpha bytes at
+ // offsets 3, 7, ..., 0x1f; assembled so that ymm3 holds entries 0-3 in
+ // its low 128 bits and entries 4-7 in its high 128 bits.
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu (%0),%%ymm6 \n"
+ // Duplicate every pixel byte into a word.
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ // Spread each table entry across the four words of its pixel.
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+// Bytes 0-2 of each output pixel receive the same gray value, computed as a
+// weighted sum of the input pixel using the kARGBToYJ coefficients with the
+// kSub128 bias (both defined elsewhere in the file); byte 3 is copied from
+// the input pixel. Each iteration reads and writes 32 bytes and subtracts 8
+// from width; the loop body always executes at least once.
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // xmm4 = kARGBToYJ coefficients, xmm5 = kSub128 bias.
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ // Bias the pixel bytes so they can be the signed operand of pmaddubsw.
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ // One 16-bit sum per pixel, then undo the bias and scale down to a byte.
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ // xmm2 = byte 3 (alpha) of each of the 8 pixels, packed to 8 bytes.
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ // Interleave to (gray, gray, gray, alpha) per pixel.
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+// Each table holds the weights for pixel bytes 0, 1, 2, 3 in that order; the
+// weight for byte 3 is 0.
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// Works in place on dst_argb: each iteration reads 32 bytes, writes the
+// result back to the same address and subtracts 8 from width. Results above
+// 255 saturate in packuswb; byte 3 of each pixel is preserved. The loop body
+// always executes at least once.
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ asm volatile(
+ // xmm2/xmm3/xmm4 = weights for output bytes 0/1/2.
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // Output byte 0 of the 8 pixels -> low 8 bytes of xmm0.
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ // Output byte 1, then interleave with byte 0.
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ // Output byte 2.
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ // Original byte 3, then interleave with byte 2.
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ // Combine the (0,1) and (2,3) byte pairs into whole pixels and store.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// matrix_argb supplies 16 signed bytes: four groups of four weights, one
+// group per output byte of a pixel. Each output byte is the weighted sum of
+// the four input bytes, shifted right arithmetically by 6 and packed with
+// unsigned saturation. Each iteration reads and writes 32 bytes and subtracts
+// 8 from width; the loop body always executes at least once.
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile(
+ // Broadcast each 4-byte weight group: xmm2..xmm5 = weights for output
+ // bytes 0..3.
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // Output bytes 0 (xmm0) and 1 (xmm6), then interleave them.
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ // Output bytes 2 (xmm1) and 3 (xmm6), then interleave them.
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ // Combine the (0,1) and (2,3) byte pairs into whole pixels and store.
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Works in place on dst_argb. For bytes 0-2 of each pixel the result is
+// ((v * scale) >> 16) * interval_size + interval_offset using 16-bit lanes,
+// packed with unsigned saturation; byte 3 of the original pixel is ORed back
+// in. Each iteration processes 16 bytes and subtracts 4 from width; the loop
+// body always executes at least once.
+// NOTE(review): the fourth word of each parameter vector is the upper 16 bits
+// of the 32-bit argument, so byte 3 is only left untouched when those bits
+// are zero - presumably callers pass values below 65536; confirm.
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ // pshuflw 0x40 + pshufd 0x44: words = (lo, lo, lo, hi) repeated twice,
+ // where lo/hi are the low/high 16 bits of the argument.
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ // xmm5 = 0 (for zero-extension), xmm6 = 0xff000000 per dword.
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // Zero-extend bytes to words and take the high half of v * scale.
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ // xmm7 = original byte 3 of each pixel, all other bytes cleared.
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// The four bytes of value scale the four bytes of every pixel position-wise:
+// both operands are widened by duplicating each byte into a 16-bit word, the
+// high 16 bits of the product are kept and then shifted right by 8. Each
+// iteration reads and writes 16 bytes and subtracts 4 from width; the loop
+// body always executes at least once.
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ // xmm2 = value with each byte duplicated into a word, repeated in both
+ // 64-bit halves.
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ // Duplicate every pixel byte into a word (low half, high half).
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+// Per byte: dst = ((src_argb0 * 257) * src_argb1) >> 16. Each iteration reads
+// 16 bytes from each source, writes 16 bytes and subtracts 4 from width; the
+// loop body always executes at least once.
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ // xmm5 = 0, used to zero-extend src_argb1 bytes to words.
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ // src_argb0 bytes duplicated into words (v * 257).
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ // src_argb1 bytes zero-extended to words.
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+// Per byte: dst = ((src_argb0 * 257) * src_argb1) >> 16. Each iteration reads
+// 32 bytes from each source, writes 32 bytes and subtracts 8 from width; the
+// loop body always executes at least once.
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ // ymm5 = 0, used to zero-extend src_argb1 bytes to words.
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ // src_argb0 bytes duplicated into words (v * 257).
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ // src_argb1 bytes zero-extended to words.
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ // Unpack and pack both work per 128-bit lane, so byte order is kept.
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ // The registers are written regardless of how this file is compiled, so
+ // the clobbers are declared unconditionally (as the other AVX2 rows do)
+ // rather than only when __AVX2__ is defined.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// Per byte: dst = src_argb0 + src_argb1 with unsigned saturation (paddusb).
+// Each iteration reads 16 bytes from each source, writes 16 bytes and
+// subtracts 4 from width; the loop body always executes at least once.
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+// Per byte: dst = src_argb0 + src_argb1 with unsigned saturation (vpaddusb).
+// Each iteration reads 32 bytes from each source, writes 32 bytes and
+// subtracts 8 from width; the loop body always executes at least once.
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+// Per byte: dst = src_argb0 - src_argb1 with unsigned saturation (psubusb),
+// so results below 0 become 0. Each iteration reads 16 bytes from each
+// source, writes 16 bytes and subtracts 4 from width; the loop body always
+// executes at least once.
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+// Per byte: dst = src_argb0 - src_argb1 with unsigned saturation (vpsubusb),
+// so results below 0 become 0. Each iteration reads 32 bytes from each
+// source, writes 32 bytes and subtracts 8 from width; the loop body always
+// executes at least once.
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+// For each output pixel i the code computes, in 16-bit lanes,
+// (y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2]),
+// takes the absolute value and packs it to a byte with unsigned saturation.
+// Each iteration writes 8 bytes (reading 8 bytes at offsets 0 and 2 of every
+// source row) and subtracts 8 from width; the loop body always executes at
+// least once.
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ // Turn src_y1, src_y2 and dst_sobelx into offsets from src_y0 so that
+ // only %0 needs advancing.
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ // xmm5 = 0, used to zero-extend bytes to words.
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // xmm0 = y0[i] - y0[i+2]
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ // xmm1 = y1[i] - y1[i+2]
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ // xmm2 = y2[i] - y2[i+2]
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ // Sum with the middle row counted twice.
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ // Absolute value: max(x, 0 - x).
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+// For each output pixel i the code computes, in 16-bit lanes,
+// (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2]),
+// takes the absolute value and packs it to a byte with unsigned saturation.
+// Each iteration writes 8 bytes (reading 8 bytes at offsets 0, 1 and 2 of
+// both source rows) and subtracts 8 from width; the loop body always executes
+// at least once.
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ // Turn src_y1 and dst_sobely into offsets from src_y0 so that only %0
+ // needs advancing.
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ // xmm5 = 0, used to zero-extend bytes to words.
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // xmm0 = y0[i] - y1[i]
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ // xmm1 = y0[i+1] - y1[i+1]
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ // xmm2 = y0[i+2] - y1[i+2]
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ // Sum with the middle column counted twice.
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ // Absolute value: max(x, 0 - x).
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+// The add saturates (paddusb). Each iteration reads 16 bytes from each
+// source, writes 64 bytes and subtracts 16 from width; the loop body always
+// executes at least once.
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // %1 = src_sobely - src_sobelx, so both rows are addressed from %0.
+ "sub %0,%1 \n"
+ // xmm5 = 0xff000000 per dword (sets byte 3 of each pixel to 255).
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ // Replicate every byte four times (byte -> word -> dword), then force
+ // byte 3 of each dword to 255.
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ // Pixels 0-3, 4-7, 8-11, 12-15.
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+// dst_y[i] = src_sobelx[i] + src_sobely[i] with unsigned 8-bit saturation.
+// The loop consumes 16 pixels per iteration and always runs at least once.
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n" // %1 = src_sobely - src_sobelx, so (%0,%1) is sobely
+ // No alpha mask is built here: this variant writes a single plane, and
+ // writing xmm5 without listing it as a clobber would be unsafe.
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n" // saturating byte add
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,  // receives 4 bytes per input pixel
+ int width) {  // pixel count; consumed 16 per loop iteration
+ asm volatile(
+ "sub %0,%1 \n"  // %1 = src_sobely - src_sobelx, so (%0,%1) is sobely
+ "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all 0xff bytes (source of A = 255)
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"  // sobelx
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"  // sobely
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"  // xmm2 = sobelx + sobely, saturated
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"  // (sobelx, 255) byte pairs, low half
+ "punpckhbw %%xmm5,%%xmm0 \n"  // (sobelx, 255) byte pairs, high half
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"  // (sobely, sobel) byte pairs, low half
+ "punpckhbw %%xmm2,%%xmm1 \n"  // (sobely, sobel) byte pairs, high half
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"  // bytes: sobely, sobel, sobelx, 255
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"  // pixels 0-3
+ "movdqu %%xmm4,0x10(%2) \n"  // pixels 4-7
+ "movdqu %%xmm7,0x20(%2) \n"  // pixels 8-11
+ "movdqu %%xmm1,0x30(%2) \n"  // pixels 12-15
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,  // 4 bytes per pixel
+ int32_t* cumsum,  // output: 4 int32 per pixel
+ const int32_t* previous_cumsum,  // input: 4 int32 per pixel, added to the running sum
+ int width) {  // pixel count
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"  // xmm0 = running sum of this row (4 dwords)
+ "pxor %%xmm1,%%xmm1 \n"  // xmm1 = 0, used for zero extension
+ "sub $0x4,%3 \n"
+ "jl 49f \n"  // fewer than 4 pixels: go to the 1 pixel loop
+ "test $0xf,%1 \n"
+ "jne 49f \n"  // cumsum not 16 byte aligned: use the 1 pixel loop only
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"  // 16 bytes = 4 pixels
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"  // xmm2 = pixel 0 as 4 dwords
+ "punpckhwd %%xmm1,%%xmm3 \n"  // xmm3 = pixel 1
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"  // xmm4 = pixel 2
+ "punpckhwd %%xmm1,%%xmm5 \n"  // xmm5 = pixel 3
+ "paddd %%xmm2,%%xmm0 \n"  // running sum through pixel 0
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"  // + previous_cumsum entry
+ "paddd %%xmm3,%%xmm0 \n"  // running sum through pixel 1
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"  // running sum through pixel 2
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"  // running sum through pixel 3
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"  // %3 = remaining pixels - 1
+ "jl 19f \n"  // nothing left
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd (%0),%%xmm2 \n"  // one 4 byte pixel
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,  // cumulative sums, 4 int32 per pixel
+ const int32_t* botleft,  // cumulative sums, 4 int32 per pixel
+ int width,  // offset, in int32 elements, to the right-hand entries
+ int area,  // divisor; its approximate reciprocal scales each sum
+ uint8_t* dst,  // output, 4 bytes per pixel
+ int count) {  // number of output pixels
+ asm volatile(
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"  // xmm5 = (float)area
+ "rcpss %%xmm5,%%xmm4 \n"  // approximate 1.0f / area
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"  // broadcast the reciprocal to all 4 lanes
+ "sub $0x4,%3 \n"
+ "jl 49f \n"  // fewer than 4 pixels: 1 pixel loop only
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"  // area > 128 (unsigned): use the float loop
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"  // 0x0000ffff per dword
+ "cvtdq2ps %%xmm6,%%xmm6 \n"  // 65535.0f
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"  // (area + 65535) * (1 / area)
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"  // 16 bit multiplier consumed by pmulhuw below
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu (%0),%%xmm0 \n"  // topleft[0]
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"  // - topleft[width]
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"  // - botleft[0]
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"  // + botleft[width]
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"  // high 16 bits of sum * multiplier
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"  // 4 output pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"  // sum * (1 / area)
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"  // %3 = remaining pixels - 1
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"  // one 4 byte output pixel
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* src_dudv,  // 4 floats: start x, start y, then per-pixel step x, step y
+ int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;  // %1: stride first, then reused as an offset scratch register
+ intptr_t temp;  // %5: second offset scratch register
+ asm volatile(
+ "movq (%3),%%xmm2 \n"  // xmm2 = src_dudv[0..1]: current x, y
+ "movq 0x08(%3),%%xmm7 \n"  // xmm7 = src_dudv[2..3]: per-pixel step
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"  // %1 = (stride << 16) | 4: word pair (4, stride)
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"  // fewer than 4 pixels: 1 pixel loop only
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"  // step duplicated into both halves
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // (4, stride) multiplier in every dword
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"  // xmm2 = coordinates of pixels 0 and 1
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"  // 2 * step
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"  // xmm3 = coordinates of pixels 2 and 3
+ "addps %%xmm4,%%xmm4 \n"  // 4 * step: advance per loop iteration
+
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"  // offset of pixel 0
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"  // rotate to the next offset
+ "movd %%xmm0,%k5 \n"  // offset of pixel 1
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"  // store pixels 0 and 1
+ "movd %%xmm0,%k1 \n"  // offset of pixel 2
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"  // offset of pixel 3
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"  // store pixels 2 and 3
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"  // %4 = remaining pixels - 1
+ "jl 19f \n"
+
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"  // advance by one step
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,  // first row; the second row is at src_ptr + src_stride
+ ptrdiff_t src_stride,
+ int dst_width,  // bytes to produce; consumed 16 per loop iteration
+ int source_y_fraction) {  // weight of the second row out of 256
+ asm volatile(
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0) addresses dst
+ "cmp $0x0,%3 \n"
+ "je 100f \n"  // fraction 0: plain copy of the first row
+ "cmp $0x80,%3 \n"
+ "je 50f \n"  // fraction 128: rounded average of both rows
+
+ "movd %3,%%xmm0 \n"  // f
+ "neg %3 \n"
+ "add $0x100,%3 \n"  // 256 - f
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"  // byte pair (256 - f, f)
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // weight pair broadcast to all 8 words
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"  // xmm4 = 0x80 in every byte
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"  // row 0
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"  // row 1
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"  // (row0, row1) byte pairs
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"  // bias pixels by -128: pmaddubsw treats them as signed
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"  // weights (unsigned) * biased pixels (signed)
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"  // + 0x8080: undo the bias (128 * 256) and round (+128)
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,  // first row; the second row is at src_ptr + src_stride
+ ptrdiff_t src_stride,
+ int dst_width,  // bytes to produce; the blend loops consume 32 per iteration
+ int source_y_fraction) {  // weight of the second row out of 256
+ asm volatile(
+ "cmp $0x0,%3 \n"
+ "je 100f \n"  // fraction 0: copy, taken before %0 is turned into an offset
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0) addresses dst
+ "cmp $0x80,%3 \n"
+ "je 50f \n"  // fraction 128: rounded average of both rows
+
+ "vmovd %3,%%xmm0 \n"  // f
+ "neg %3 \n"
+ "add $0x100,%3 \n"  // 256 - f
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"  // byte pair (256 - f, f)
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"  // weight pair in every word
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"  // ymm4 = 0x80 in every byte
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%1),%%ymm0 \n"  // row 0
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"  // row 1
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"  // (row0, row1) byte pairs
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"  // bias pixels by -128 for the signed multiply
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"  // + 0x8080: undo the bias and round
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "rep movsb \n"  // copies from (S) to (D) with the count in the C register
+ "jmp 999f \n"  // no ymm register was touched on this path, so skip vzeroupper
+
+ "99: \n"
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+cm"(dst_width), // %2 NOTE(review): "m" lets the compiler keep this in memory, but rep movsb needs the count in rcx - confirm
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,  // 16 byte pshufb control, loaded once
+ int width) {  // pixel count; consumed 8 (32 bytes) per loop iteration
+ asm volatile(
+
+ "movdqu (%3),%%xmm5 \n"  // xmm5 = shuffle control
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"  // reorder bytes within each 16 byte group
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,  // 16 byte vpshufb control, loaded once
+ int width) {  // pixel count; consumed 16 (64 bytes) per loop iteration
+ asm volatile(
+
+ "vbroadcastf128 (%3),%%ymm5 \n"  // same 16 byte control in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // reorder bytes within each 128 bit lane
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,  // 16 bytes read per loop iteration
+ const uint8_t* src_u,  // 8 bytes read per loop iteration
+ const uint8_t* src_v,  // 8 bytes read per loop iteration
+ uint8_t* dst_yuy2,  // 32 bytes written per loop iteration, as Y U Y V
+ int width) {  // pixel count; consumed 16 per loop iteration
+ asm volatile(
+
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2) addresses src_v
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"  // 8 U
+ "movq 0x00(%1,%2,1),%%xmm1 \n"  // 8 V
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"  // xmm2 = U V pairs
+ "movdqu (%0),%%xmm0 \n"  // 16 Y
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"  // Y U Y V for the low 8 pixels
+ "punpckhbw %%xmm2,%%xmm1 \n"  // Y U Y V for the high 8 pixels
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,  // 16 bytes read per loop iteration
+ const uint8_t* src_u,  // 8 bytes read per loop iteration
+ const uint8_t* src_v,  // 8 bytes read per loop iteration
+ uint8_t* dst_uyvy,  // 32 bytes written per loop iteration, as U Y V Y
+ int width) {  // pixel count; consumed 16 per loop iteration
+ asm volatile(
+
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2) addresses src_v
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"  // 8 U
+ "movq 0x00(%1,%2,1),%%xmm1 \n"  // 8 V
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"  // xmm2 = U V pairs
+ "movdqu (%0),%%xmm0 \n"  // 16 Y
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"  // U Y V Y for the low 8 pixels
+ "punpckhbw %%xmm0,%%xmm2 \n"  // U Y V Y for the high 8 pixels
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,  // 32 bytes read per loop iteration
+ const uint8_t* src_u,  // 16 bytes read per loop iteration
+ const uint8_t* src_v,  // 16 bytes read per loop iteration
+ uint8_t* dst_yuy2,  // 64 bytes written per loop iteration, as Y U Y V
+ int width) {  // pixel count; consumed 32 per loop iteration
+ asm volatile(
+
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2) addresses src_v
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"  // 16 U widened to words
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"  // 16 V widened to words
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"  // V into the high byte of each word
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"  // ymm2 = U V byte pairs
+ "vmovdqu (%0),%%ymm0 \n"  // 32 Y
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"  // Y U Y V, low 8 bytes of each lane
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"  // Y U Y V, high 8 bytes of each lane
+ "vextractf128 $0x0,%%ymm1,(%3) \n"  // the four stores put the lanes back in pixel order
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,  // 32 bytes read per loop iteration
+ const uint8_t* src_u,  // 16 bytes read per loop iteration
+ const uint8_t* src_v,  // 16 bytes read per loop iteration
+ uint8_t* dst_uyvy,  // 64 bytes written per loop iteration, as U Y V Y
+ int width) {  // pixel count; consumed 32 per loop iteration
+ asm volatile(
+
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2) addresses src_v
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"  // 16 U widened to words
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"  // 16 V widened to words
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"  // V into the high byte of each word
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"  // ymm2 = U V byte pairs
+ "vmovdqu (%0),%%ymm0 \n"  // 32 Y
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"  // U Y V Y, low 8 bytes of each lane
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"  // U Y V Y, high 8 bytes of each lane
+ "vextractf128 $0x0,%%ymm1,(%3) \n"  // the four stores put the lanes back in pixel order
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_AVX2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,  // 16 floats: C0[4], C1[4], C2[4], C3[4]; used as mulps/addps memory operands, which require 16 byte alignment
+ int width) {  // pixel count; consumed 2 per loop iteration
+ asm volatile(
+
+ "pxor %%xmm3,%%xmm3 \n"  // xmm3 = 0, used for zero extension
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"  // 8 bytes = 2 pixels
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"  // pixel 0 as 4 dwords
+ "punpckhwd %%xmm3,%%xmm4 \n"  // pixel 1 as 4 dwords
+ "cvtdq2ps %%xmm0,%%xmm0 \n"  // X as floats
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"  // C1 * X
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"  // C0 + C1 * X
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"  // X * X
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"  // X * X * X
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"  // C2 * X^2
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"  // C3 * X^3
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"  // result = C0 + C1*X + C2*X^2 + C3*X^3
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"  // truncate to integers
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"  // two packs narrow the 8 dwords to 8 bytes
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const float* poly,  // 16 floats: C0[4], C1[4], C2[4], C3[4]
+ int width) {  // pixel count; consumed 2 per loop iteration
+ asm volatile(
+ "vbroadcastf128 (%3),%%ymm4 \n"  // C0 in both lanes
+ "vbroadcastf128 0x10(%3),%%ymm5 \n"  // C1
+ "vbroadcastf128 0x20(%3),%%ymm6 \n"  // C2
+ "vbroadcastf128 0x30(%3),%%ymm7 \n"  // C3
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
+ "lea 0x8(%0),%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
+ // X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"  // truncate to integers
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"  // dwords -> words, per lane
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // gather both lanes' results into the low 128 bits
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"  // words -> bytes
+ "vmovq %%xmm0,(%1) \n"  // store 8 bytes = 2 pixels
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;  // approximately 2^-112; folded into scale so that float bits >> 13 form the 16 bit result
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,  // multiplier applied to each value before conversion
+ int width) {  // element count; consumed 8 per loop iteration
+ scale *= kScaleBias;
+ asm volatile(
+ "movd %3,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"  // xmm4 = biased scale in all 4 lanes
+ "pxor %%xmm5,%%xmm5 \n"  // xmm5 = 0, used for zero extension
+ "sub %0,%1 \n"  // %1 = dst - src, so (%0,%1) addresses dst
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // low 4 shorts -> ints in xmm2
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 4 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"  // high 4 shorts -> ints in xmm3
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"  // shift the float bit pattern right by 13
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"  // 8 x 16 bit results
+ "movdqu %%xmm2,-0x10(%0,%1,1) \n"  // -0x10 because %0 was already advanced
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(scale) // %3
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,  // multiplier applied to each value before conversion
+ int width) {  // element count; consumed 16 per loop iteration
+ scale *= kScaleBias;
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"  // ymm4 = biased scale in all 8 lanes
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = 0, used for zero extension
+ "sub %0,%1 \n"  // %1 = dst - src, so (%0,%1) addresses dst
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"  // shift the float bit pattern right by 13
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"  // -0x20 because %0 was already advanced
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,  // multiplier applied to each value before conversion (no bias needed here)
+ int width) {  // element count; consumed 16 per loop iteration
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"  // ymm4 = scale in all 8 lanes
+ "sub %0,%1 \n"  // %1 = dst - src, so (%0,%1) addresses dst
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"  // float -> half; imm 3 selects round toward zero
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {  // same as HalfFloatRow_F16C without the multiply; the float argument is ignored
+ asm volatile(
+ "sub %0,%1 \n"  // %1 = dst - src, so (%0,%1) addresses dst
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"  // float -> half; imm 3 selects round toward zero
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm2", "xmm3");
+}
+#endif // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels in place with a color table: byte c of each pixel becomes table_argb[value * 4 + c].
+void ARGBColorTableRow_X86(uint8_t* dst_argb,  // read and overwritten, 4 bytes per pixel
+ const uint8_t* table_argb,  // indexed up to 255 * 4 + 3
+ int width) {  // pixel count; the loop body runs at least once
+ uintptr_t pixel_temp;  // scratch register for the byte being looked up
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"  // byte 0 of the pixel
+ "lea 0x4(%0),%0 \n"  // advance now; later accesses use negative offsets
+ "movzb 0x00(%3,%1,4),%1 \n"  // table_argb[value * 4 + 0]
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"  // byte 1
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"  // byte 2
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"  // byte 3
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels in place with a color table: bytes 0-2 of each 4 byte pixel are looked up, byte 3 is left untouched.
+void RGBColorTableRow_X86(uint8_t* dst_argb,  // read and overwritten, 4 bytes per pixel
+ const uint8_t* table_argb,  // indexed up to 255 * 4 + 2
+ int width) {  // pixel count; the loop body runs at least once
+ uintptr_t pixel_temp;  // scratch register for the byte being looked up
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"  // byte 0 of the pixel
+ "lea 0x4(%0),%0 \n"  // advance now; later accesses use negative offsets
+ "movzb 0x00(%3,%1,4),%1 \n"  // table_argb[value * 4 + 0]
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"  // byte 1
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"  // byte 2
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table: a per-pixel weighted sum picks a 256 byte aligned row of the table, bytes 0-2 are looked up in it, byte 3 is copied.
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,  // pixel count; consumed 4 per loop iteration
+ const uint8_t* luma,  // base of the lookup table
+ uint32_t lumacoeff) {  // 4 byte weights, one per pixel byte
+ uintptr_t pixel_temp;  // %0: scratch for the byte being looked up
+ uintptr_t table_temp;  // %1: pointer to the table row selected for the current pixel
+ asm volatile(
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"  // weights repeated for 4 pixels
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"  // xmm4 = 0xff00 per word
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"  // pixel bytes * weights, summed in pairs
+ "phaddw %%xmm0,%%xmm0 \n"  // one weighted sum per pixel
+ "pand %%xmm4,%%xmm0 \n"  // keep the high byte: offset is a multiple of 256
+ "punpcklwd %%xmm5,%%xmm0 \n"  // widen the 4 offsets to dwords
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // %1 = luma + offset for pixel 0
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"  // rotate to the next pixel's offset
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"  // byte 3 is copied without a lookup
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // table row for pixel 1
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // table row for pixel 2
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // table row for pixel 3
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// Begin NV21ToYUV24Row_AVX2 constants: kBLEND* are vpblendvb masks, kSHUF* are vpshufb controls.
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,  // 0x80 (high bit set) makes vpblendvb take that byte from its second source
+ 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,  // upper lane uses a different pattern from the lower lane
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,  // 0x80 makes vpshufb write a zero byte; other values index within the 128 bit lane
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+ 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,  // kSHUF* repeat the same 16 byte control in both lanes
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+ 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+ 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+ 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+ 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+ 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+// Interleaves one row of Y with one row of VU into 3 bytes per pixel using
+// the kBLEND*/kSHUF* tables. Each loop iteration consumes 32 pixels: it reads
+// 32 bytes of src_y and bytes [0, 32] of src_vu at the running offset, and
+// writes 96 bytes to dst_yuv24. The loop body always runs at least once.
+// NOTE(review): the src_vu+1 load reads one byte past the 32 byte block;
+// callers presumably guarantee that byte is readable - confirm.
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ uint64_t src_offset = 0; // byte offset shared by the src_y and src_vu loads
+ uint64_t width64;
+
+ width64 = width;
+
+ asm volatile(
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+
+ LABELALIGN
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ // Index with %4 like the two loads above; a bare (%1) would re-read the
+ // first 32 bytes of src_vu on every iteration.
+ "vmovdqu (%1,%4), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n" // sse-avx2
+ // transitions
+
+ : "+r"(src_y), //%0
+ "+r"(src_vu), //%1
+ "+r"(dst_yuv24), //%2
+ "+r"(width64), //%3
+ "+r"(src_offset) //%4
+ : "m"(kBLEND0), //%5
+ "m"(kBLEND1), //%6
+ "m"(kBLEND2), //%7
+ "m"(kSHUF0), //%8
+ "m"(kSHUF1), //%9
+ "m"(kSHUF2), //%10
+ "m"(kSHUF3), //%11
+ "m"(kSHUF4), //%12
+ "m"(kSHUF5) //%13
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
+ "xmm13", "xmm14", "xmm15");
+}
+#endif // HAS_NV21TOYUV24ROW_AVX2
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table that exchanges the two bytes of every adjacent pair
+// (indices 1,0,3,2,...); used as the pshufb/vpshufb control by the SwapUVRow
+// functions to turn UV byte pairs into VU.
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Produces the VU plane of NV21 from the UV plane of NV12 by exchanging the
+// two bytes of every pair. Each pass of the loop moves 32 bytes (two 16-byte
+// vectors) and lowers width by 16; it repeats while width stays positive.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+      // xmm5 = byte-pair swap control for pshufb.
+      "movdqu %[swap_mask],%%xmm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqu (%[uv_in]),%%xmm0 \n"
+      "movdqu 0x10(%[uv_in]),%%xmm1 \n"
+      "lea 0x20(%[uv_in]),%[uv_in] \n"
+      "pshufb %%xmm5,%%xmm0 \n"
+      "pshufb %%xmm5,%%xmm1 \n"
+      "movdqu %%xmm0,(%[vu_out]) \n"
+      "movdqu %%xmm1,0x10(%[vu_out]) \n"
+      "lea 0x20(%[vu_out]),%[vu_out] \n"
+      "sub $0x10,%[remaining] \n"
+      "jg 1b \n"
+      : [uv_in] "+r"(src_uv),             // source pointer, advanced
+        [vu_out] "+r"(dst_vu),            // destination pointer, advanced
+        [remaining] "+r"(width)           // loop counter
+      : [swap_mask] "m"(kShuffleUVToVU)   // shuffle control table
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+// AVX2 variant of the UV -> VU byte-pair swap. Each pass of the loop moves
+// 64 bytes (two 32-byte vectors) and lowers width by 32; it repeats while
+// width stays positive, then clears the upper YMM state with vzeroupper.
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+  asm volatile(
+      // Replicate the 16-byte swap control into both lanes of ymm5.
+      "vbroadcastf128 %[swap_mask],%%ymm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%[uv_in]),%%ymm0 \n"
+      "vmovdqu 0x20(%[uv_in]),%%ymm1 \n"
+      "lea 0x40(%[uv_in]),%[uv_in] \n"
+      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+      "vmovdqu %%ymm0,(%[vu_out]) \n"
+      "vmovdqu %%ymm1,0x20(%[vu_out]) \n"
+      "lea 0x40(%[vu_out]),%[vu_out] \n"
+      "sub $0x20,%[remaining] \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : [uv_in] "+r"(src_uv),             // source pointer, advanced
+        [vu_out] "+r"(dst_vu),            // destination pointer, advanced
+        [remaining] "+r"(width)           // loop counter
+      : [swap_mask] "m"(kShuffleUVToVU)   // shuffle control table
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+// Downsamples the U and V planes by two in both directions and interleaves
+// the results into dst_uv. For every 2x2 group the four samples are summed
+// and reduced as ((sum >> 1) + 1) >> 1. One pass consumes 16 samples from
+// each of the two U rows and the two V rows, stores 16 bytes (8 UV pairs) and
+// lowers width by 16; it repeats while width stays positive.
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width) {
+  asm volatile(
+      // xmm4 = sixteen bytes of 0x01 (pmaddubsw weights), xmm5 = zero.
+      "pcmpeqb %%xmm4,%%xmm4 \n"
+      "psrlw $0xf,%%xmm4 \n"
+      "packuswb %%xmm4,%%xmm4 \n"
+      "pxor %%xmm5,%%xmm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqu (%[u_row]),%%xmm0 \n"                   // load 16 U values
+      "movdqu (%[v_row]),%%xmm1 \n"                   // load 16 V values
+      "movdqu 0(%[u_row],%[u_pitch],1),%%xmm2 \n"     // 16 from next row
+      "movdqu 0(%[v_row],%[v_pitch],1),%%xmm3 \n"
+      "lea 0x10(%[u_row]),%[u_row] \n"
+      "pmaddubsw %%xmm4,%%xmm0 \n"                    // half size
+      "pmaddubsw %%xmm4,%%xmm1 \n"
+      "pmaddubsw %%xmm4,%%xmm2 \n"
+      "pmaddubsw %%xmm4,%%xmm3 \n"
+      "lea 0x10(%[v_row]),%[v_row] \n"
+      "paddw %%xmm2,%%xmm0 \n"
+      "paddw %%xmm3,%%xmm1 \n"
+      "psrlw $0x1,%%xmm0 \n"
+      "psrlw $0x1,%%xmm1 \n"
+      "pavgw %%xmm5,%%xmm0 \n"
+      "pavgw %%xmm5,%%xmm1 \n"
+      "packuswb %%xmm0,%%xmm0 \n"
+      "packuswb %%xmm1,%%xmm1 \n"
+      "punpcklbw %%xmm1,%%xmm0 \n"
+      "movdqu %%xmm0,(%[uv_out]) \n"                  // store 8 UV pixels
+      "lea 0x10(%[uv_out]),%[uv_out] \n"
+      "sub $0x10,%[remaining] \n"                     // 16 src pixels per loop
+      "jg 1b \n"
+      : [u_row] "+r"(src_u),                          // U source, advanced
+        [v_row] "+r"(src_v),                          // V source, advanced
+        [uv_out] "+r"(dst_uv),                        // destination, advanced
+        [remaining] "+r"(width)                       // loop counter
+      : [u_pitch] "r"((intptr_t)(src_stride_u)),      // offset to next U row
+        [v_pitch] "r"((intptr_t)(src_stride_v))       // offset to next V row
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// AVX2 variant of the half-size U/V merge. For every 2x2 group the four
+// samples are summed and reduced as ((sum >> 1) + 1) >> 1, and the U and V
+// results are interleaved. One pass consumes 32 samples from each of the two
+// U rows and the two V rows, stores 32 bytes (16 UV pairs) and lowers width
+// by 32; it repeats while width stays positive.
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  asm volatile(
+      // ymm4 = bytes of 0x01 (vpmaddubsw weights), ymm5 = zero.
+      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%[u_row]),%%ymm0 \n"                  // load 32 U values
+      "vmovdqu (%[v_row]),%%ymm1 \n"                  // load 32 V values
+      "vmovdqu 0(%[u_row],%[u_pitch],1),%%ymm2 \n"    // 32 from next row
+      "vmovdqu 0(%[v_row],%[v_pitch],1),%%ymm3 \n"
+      "lea 0x20(%[u_row]),%[u_row] \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"            // half size
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+      "lea 0x20(%[v_row]),%[v_row] \n"
+      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%[uv_out]) \n"                 // store 16 UV pixels
+      "lea 0x20(%[uv_out]),%[uv_out] \n"
+      "sub $0x20,%[remaining] \n"                     // 32 src pixels per loop
+      "jg 1b \n"
+      "vzeroupper \n"
+      : [u_row] "+r"(src_u),                          // U source, advanced
+        [v_row] "+r"(src_v),                          // V source, advanced
+        [uv_out] "+r"(dst_uv),                        // destination, advanced
+        [remaining] "+r"(width)                       // loop counter
+      : [u_pitch] "r"((intptr_t)(src_stride_u)),      // offset to next U row
+        [v_pitch] "r"((intptr_t)(src_stride_v))       // offset to next V row
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Copies floats from src_x to dst_y, replacing each value by
+// max(value, 0.0f). One float is handled per loop iteration.
+//
+// The pointer increments use the immediate form "$4". Without the '$' prefix
+// the AT&T assembler treats the operand as a memory reference to absolute
+// address 4 rather than the constant 4.
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+  asm volatile(
+      "pxor %%xmm1,%%xmm1 \n"  // xmm1 = 0.0f, the lower bound
+
+      LABELALIGN
+      "1: \n"
+      "movd (%0),%%xmm0 \n"       // load float
+      "maxss %%xmm1, %%xmm0 \n"   // clamp to zero
+      "add $0x4, %0 \n"           // src_x += sizeof(float)
+      "movd %%xmm0, (%1) \n"      // store float
+      "add $0x4, %1 \n"           // dst_y += sizeof(float)
+      // NOTE(review): the counter drops by 4 for every float processed,
+      // although the original comment said "1 float per loop"; width is
+      // presumably a byte count here - confirm against callers.
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+      : "+r"(src_x),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_mips.cc b/third_party/aom/third_party/libyuv/source/row_mips.cc
new file mode 100644
index 0000000000..cfc9ffe036
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_mips.cc
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+// Copies count bytes from src to dst with hand-scheduled MIPS32 assembly.
+// Structure of the code below:
+//   - fewer than 8 bytes: plain byte loop at $last8;
+//   - src and dst have the same offset within a word: dst is brought to a
+//     word boundary, then 64-byte blocks, at most one 32-byte block, whole
+//     words and finally the trailing bytes are copied;
+//   - otherwise ("unaligned"): same structure, but every source word is
+//     assembled with an lwr/lwl pair.
+// count is declared as an in/out operand because the assembly overwrites it,
+// and "memory" is clobbered because the copy goes through the pointers.
+// NOTE(review): the assembly returns with "j $ra" from inside the asm
+// statement, so control never reaches the compiler-generated epilogue -
+// confirm this is safe for the frame layout the compiler produces here.
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+  __asm__ __volatile__ (
+    ".set noreorder \n"
+    ".set noat \n"
+    "slti $at, %[count], 8 \n"
+    "bne $at ,$zero, $last8 \n"
+    "xor $t8, %[src], %[dst] \n"
+    "andi $t8, $t8, 0x3 \n"
+
+    "bne $t8, $zero, unaligned \n"
+    "negu $a3, %[dst] \n"
+    // make dst/src aligned
+    "andi $a3, $a3, 0x3 \n"
+    "beq $a3, $zero, $chk16w \n"
+    // word-aligned now count is the remaining bytes count
+    "subu %[count], %[count], $a3 \n"
+
+    "lwr $t8, 0(%[src]) \n"
+    "addu %[src], %[src], $a3 \n"
+    "swr $t8, 0(%[dst]) \n"
+    "addu %[dst], %[dst], $a3 \n"
+
+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w: \n"
+    "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq %[count], $t8, chk8w \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu $a3, %[count], $t8 \n" // the remainder
+    // Here a3 counts bytes in 16w chunks
+    "addu $a3, %[dst], $a3 \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu $t0, %[dst], %[count] \n"
+    // t0 is the "past the end" address
+
+    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+    // the "t0-32" address
+    // This means: for x=128 the last "safe" a1 address is "t0-160"
+    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+    // we will use "pref 30,128(a1)", so "t0-160" is the limit
+    "subu $t9, $t0, 160 \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref 0, 0(%[src]) \n" // first line of src
+    "pref 0, 32(%[src]) \n" // second line of src
+    "pref 0, 64(%[src]) \n"
+    "pref 30, 32(%[dst]) \n"
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu $v1, %[dst], $t9 \n"
+    "bgtz $v1, $loop16w \n"
+    "nop \n"
+    // otherwise, start with using pref30
+    "pref 30, 64(%[dst]) \n"
+    "$loop16w: \n"
+    "pref 0, 96(%[src]) \n"
+    "lw $t0, 0(%[src]) \n"
+    "bgtz $v1, $skip_pref30_96 \n" // skip
+    "lw $t1, 4(%[src]) \n"
+    "pref 30, 96(%[dst]) \n" // continue
+    "$skip_pref30_96: \n"
+    "lw $t2, 8(%[src]) \n"
+    "lw $t3, 12(%[src]) \n"
+    "lw $t4, 16(%[src]) \n"
+    "lw $t5, 20(%[src]) \n"
+    "lw $t6, 24(%[src]) \n"
+    "lw $t7, 28(%[src]) \n"
+    "pref 0, 128(%[src]) \n"
+    // bring the next lines of src, addr 128
+    "sw $t0, 0(%[dst]) \n"
+    "sw $t1, 4(%[dst]) \n"
+    "sw $t2, 8(%[dst]) \n"
+    "sw $t3, 12(%[dst]) \n"
+    "sw $t4, 16(%[dst]) \n"
+    "sw $t5, 20(%[dst]) \n"
+    "sw $t6, 24(%[dst]) \n"
+    "sw $t7, 28(%[dst]) \n"
+    "lw $t0, 32(%[src]) \n"
+    "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+    "lw $t1, 36(%[src]) \n"
+    "pref 30, 128(%[dst]) \n" // set dest, addr 128
+    "$skip_pref30_128: \n"
+    "lw $t2, 40(%[src]) \n"
+    "lw $t3, 44(%[src]) \n"
+    "lw $t4, 48(%[src]) \n"
+    "lw $t5, 52(%[src]) \n"
+    "lw $t6, 56(%[src]) \n"
+    "lw $t7, 60(%[src]) \n"
+    "pref 0, 160(%[src]) \n"
+    // bring the next lines of src, addr 160
+    "sw $t0, 32(%[dst]) \n"
+    "sw $t1, 36(%[dst]) \n"
+    "sw $t2, 40(%[dst]) \n"
+    "sw $t3, 44(%[dst]) \n"
+    "sw $t4, 48(%[dst]) \n"
+    "sw $t5, 52(%[dst]) \n"
+    "sw $t6, 56(%[dst]) \n"
+    "sw $t7, 60(%[dst]) \n"
+
+    "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+    "sgtu $v1, %[dst], $t9 \n"
+    "bne %[dst], $a3, $loop16w \n"
+    " addiu %[src], %[src], 64 \n" // adding 64 to src
+    "move %[count], $t8 \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "chk8w: \n"
+    "pref 0, 0x0(%[src]) \n"
+    "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+    // the t8 is the remainder count past 32-bytes
+    "beq %[count], $t8, chk1w \n"
+    // count=t8,no 32-byte chunk
+    " nop \n"
+
+    "lw $t0, 0(%[src]) \n"
+    "lw $t1, 4(%[src]) \n"
+    "lw $t2, 8(%[src]) \n"
+    "lw $t3, 12(%[src]) \n"
+    "lw $t4, 16(%[src]) \n"
+    "lw $t5, 20(%[src]) \n"
+    "lw $t6, 24(%[src]) \n"
+    "lw $t7, 28(%[src]) \n"
+    "addiu %[src], %[src], 32 \n"
+
+    "sw $t0, 0(%[dst]) \n"
+    "sw $t1, 4(%[dst]) \n"
+    "sw $t2, 8(%[dst]) \n"
+    "sw $t3, 12(%[dst]) \n"
+    "sw $t4, 16(%[dst]) \n"
+    "sw $t5, 20(%[dst]) \n"
+    "sw $t6, 24(%[dst]) \n"
+    "sw $t7, 28(%[dst]) \n"
+    "addiu %[dst], %[dst], 32 \n"
+
+    "chk1w: \n"
+    "andi %[count], $t8, 0x3 \n"
+    // now count is the remainder past 1w chunks
+    "beq %[count], $t8, $last8 \n"
+    " subu $a3, $t8, %[count] \n"
+    // a3 is count of bytes in 1w chunks
+    "addu $a3, %[dst], $a3 \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop: \n"
+    "lw $t3, 0(%[src]) \n"
+    // the first t3 may be equal t0 ... optimize?
+    "addiu %[src], %[src],4 \n"
+    "addiu %[dst], %[dst],4 \n"
+    "bne %[dst], $a3,$wordCopy_loop \n"
+    " sw $t3, -4(%[dst]) \n"
+
+    // For the last (<8) bytes
+    "$last8: \n"
+    "blez %[count], leave \n"
+    " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+    "$last8loop: \n"
+    "lb $v1, 0(%[src]) \n"
+    "addiu %[src], %[src], 1 \n"
+    "addiu %[dst], %[dst], 1 \n"
+    "bne %[dst], $a3, $last8loop \n"
+    " sb $v1, -1(%[dst]) \n"
+
+    "leave: \n"
+    " j $ra \n"
+    " nop \n"
+
+    //
+    // UNALIGNED case
+    //
+
+    "unaligned: \n"
+    // got here with a3="negu a1"
+    "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+    "beqz $a3, $ua_chk16w \n"
+    " subu %[count], %[count], $a3 \n"
+    // bytes left after initial a3 bytes
+    "lwr $v1, 0(%[src]) \n"
+    "lwl $v1, 3(%[src]) \n"
+    "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+    "swr $v1, 0(%[dst]) \n"
+    "addu %[dst], %[dst], $a3 \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w: \n"
+    "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq %[count], $t8, ua_chk8w \n"
+    // if a2==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu $a3, %[count], $t8 \n" // the remainder
+    // Here a3 counts bytes in 16w chunks
+    "addu $a3, %[dst], $a3 \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+    "subu $t9, $t0, 160 \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref 0, 0(%[src]) \n" // first line of src
+    "pref 0, 32(%[src]) \n" // second line addr 32
+    "pref 0, 64(%[src]) \n"
+    "pref 30, 32(%[dst]) \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu $v1, %[dst], $t9 \n"
+    "bgtz $v1, $ua_loop16w \n"
+    // skip "pref 30,64(a1)" for too short arrays
+    " nop \n"
+    // otherwise, start with using pref30
+    "pref 30, 64(%[dst]) \n"
+    "$ua_loop16w: \n"
+    "pref 0, 96(%[src]) \n"
+    "lwr $t0, 0(%[src]) \n"
+    "lwl $t0, 3(%[src]) \n"
+    "lwr $t1, 4(%[src]) \n"
+    "bgtz $v1, $ua_skip_pref30_96 \n"
+    " lwl $t1, 7(%[src]) \n"
+    "pref 30, 96(%[dst]) \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96: \n"
+    "lwr $t2, 8(%[src]) \n"
+    "lwl $t2, 11(%[src]) \n"
+    "lwr $t3, 12(%[src]) \n"
+    "lwl $t3, 15(%[src]) \n"
+    "lwr $t4, 16(%[src]) \n"
+    "lwl $t4, 19(%[src]) \n"
+    "lwr $t5, 20(%[src]) \n"
+    "lwl $t5, 23(%[src]) \n"
+    "lwr $t6, 24(%[src]) \n"
+    "lwl $t6, 27(%[src]) \n"
+    "lwr $t7, 28(%[src]) \n"
+    "lwl $t7, 31(%[src]) \n"
+    "pref 0, 128(%[src]) \n"
+    // bring the next lines of src, addr 128
+    "sw $t0, 0(%[dst]) \n"
+    "sw $t1, 4(%[dst]) \n"
+    "sw $t2, 8(%[dst]) \n"
+    "sw $t3, 12(%[dst]) \n"
+    "sw $t4, 16(%[dst]) \n"
+    "sw $t5, 20(%[dst]) \n"
+    "sw $t6, 24(%[dst]) \n"
+    "sw $t7, 28(%[dst]) \n"
+    "lwr $t0, 32(%[src]) \n"
+    "lwl $t0, 35(%[src]) \n"
+    "lwr $t1, 36(%[src]) \n"
+    "bgtz $v1, ua_skip_pref30_128 \n"
+    " lwl $t1, 39(%[src]) \n"
+    "pref 30, 128(%[dst]) \n"
+    // continue setting up the dest, addr 128
+    "ua_skip_pref30_128: \n"
+
+    "lwr $t2, 40(%[src]) \n"
+    "lwl $t2, 43(%[src]) \n"
+    "lwr $t3, 44(%[src]) \n"
+    "lwl $t3, 47(%[src]) \n"
+    "lwr $t4, 48(%[src]) \n"
+    "lwl $t4, 51(%[src]) \n"
+    "lwr $t5, 52(%[src]) \n"
+    "lwl $t5, 55(%[src]) \n"
+    "lwr $t6, 56(%[src]) \n"
+    "lwl $t6, 59(%[src]) \n"
+    "lwr $t7, 60(%[src]) \n"
+    "lwl $t7, 63(%[src]) \n"
+    "pref 0, 160(%[src]) \n"
+    // bring the next lines of src, addr 160
+    "sw $t0, 32(%[dst]) \n"
+    "sw $t1, 36(%[dst]) \n"
+    "sw $t2, 40(%[dst]) \n"
+    "sw $t3, 44(%[dst]) \n"
+    "sw $t4, 48(%[dst]) \n"
+    "sw $t5, 52(%[dst]) \n"
+    "sw $t6, 56(%[dst]) \n"
+    "sw $t7, 60(%[dst]) \n"
+
+    "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+    "sgtu $v1,%[dst],$t9 \n"
+    "bne %[dst],$a3,$ua_loop16w \n"
+    " addiu %[src],%[src],64 \n" // adding 64 to src
+    "move %[count],$t8 \n"
+
+    // Here dst is word-aligned but less than 64-bytes to go
+
+    "ua_chk8w: \n"
+    "pref 0, 0x0(%[src]) \n"
+    "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+    // the t8 is the remainder count
+    "beq %[count], $t8, $ua_chk1w \n"
+    // when count==t8, no 32-byte chunk
+    // (the lwr below sits in the branch delay slot)
+
+    "lwr $t0, 0(%[src]) \n"
+    "lwl $t0, 3(%[src]) \n"
+    "lwr $t1, 4(%[src]) \n"
+    "lwl $t1, 7(%[src]) \n"
+    "lwr $t2, 8(%[src]) \n"
+    "lwl $t2, 11(%[src]) \n"
+    "lwr $t3, 12(%[src]) \n"
+    "lwl $t3, 15(%[src]) \n"
+    "lwr $t4, 16(%[src]) \n"
+    "lwl $t4, 19(%[src]) \n"
+    "lwr $t5, 20(%[src]) \n"
+    "lwl $t5, 23(%[src]) \n"
+    "lwr $t6, 24(%[src]) \n"
+    "lwl $t6, 27(%[src]) \n"
+    "lwr $t7, 28(%[src]) \n"
+    "lwl $t7, 31(%[src]) \n"
+    "addiu %[src], %[src], 32 \n"
+
+    "sw $t0, 0(%[dst]) \n"
+    "sw $t1, 4(%[dst]) \n"
+    "sw $t2, 8(%[dst]) \n"
+    "sw $t3, 12(%[dst]) \n"
+    "sw $t4, 16(%[dst]) \n"
+    "sw $t5, 20(%[dst]) \n"
+    "sw $t6, 24(%[dst]) \n"
+    "sw $t7, 28(%[dst]) \n"
+    "addiu %[dst], %[dst], 32 \n"
+
+    "$ua_chk1w: \n"
+    "andi %[count], $t8, 0x3 \n"
+    // now count is the remainder past 1w chunks
+    "beq %[count], $t8, ua_smallCopy \n"
+    "subu $a3, $t8, %[count] \n"
+    // a3 is count of bytes in 1w chunks
+    "addu $a3, %[dst], $a3 \n"
+    // now a3 is the dst address past the 1w chunks
+
+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop: \n"
+    "lwr $v1, 0(%[src]) \n"
+    "lwl $v1, 3(%[src]) \n"
+    "addiu %[src], %[src], 4 \n"
+    "addiu %[dst], %[dst], 4 \n"
+    // note: dst=a1 is word aligned here, see NOTE1
+    "bne %[dst], $a3, $ua_wordCopy_loop \n"
+    " sw $v1,-4(%[dst]) \n"
+
+    // Now less than 4 bytes (value in count) left to copy
+    "ua_smallCopy: \n"
+    "beqz %[count], leave \n"
+    " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+    "$ua_smallCopy_loop: \n"
+    "lb $v1, 0(%[src]) \n"
+    "addiu %[src], %[src], 1 \n"
+    "addiu %[dst], %[dst], 1 \n"
+    "bne %[dst],$a3,$ua_smallCopy_loop \n"
+    " sb $v1, -1(%[dst]) \n"
+
+    "j $ra \n"
+    " nop \n"
+    ".set at \n"
+    ".set reorder \n"
+    : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+    :
+    : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+      "t8", "t9", "a3", "v1", "at", "memory"
+  );
+}
+#endif // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+// De-interleaves width UV byte pairs from src_uv into dst_u and dst_v.
+// The main loop handles 16 pairs (32 source bytes) per iteration using word
+// loads and DSPR2 precr/precrq packing; the remaining width & 0xf pairs are
+// copied one at a time. A width of zero stores nothing: rows shorter than 16
+// pairs enter through the residual check at label 4 instead of jumping
+// straight into the byte loop.
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                           int width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+    "srl $t4, %[width], 4 \n" // multiplies of 16
+    "blez $t4, 4f \n"
+    " andi %[width], %[width], 0xf \n" // residual
+
+    ".p2align 2 \n"
+    "1: \n"
+    "addiu $t4, $t4, -1 \n"
+    "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+    "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+    "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+    "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+    "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+    "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+    "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+    "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+    "addiu %[src_uv], %[src_uv], 32 \n"
+    "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+    "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+    "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+    "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+    "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+    "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+    "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+    "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+    "sw $t9, 0(%[dst_v]) \n"
+    "sw $t0, 0(%[dst_u]) \n"
+    "sw $t1, 4(%[dst_v]) \n"
+    "sw $t2, 4(%[dst_u]) \n"
+    "sw $t3, 8(%[dst_v]) \n"
+    "sw $t5, 8(%[dst_u]) \n"
+    "sw $t6, 12(%[dst_v]) \n"
+    "sw $t7, 12(%[dst_u]) \n"
+    "addiu %[dst_v], %[dst_v], 16 \n"
+    "bgtz $t4, 1b \n"
+    " addiu %[dst_u], %[dst_u], 16 \n"
+
+    // Residual check, shared by the fall-through from the main loop and by
+    // rows shorter than 16 pairs.
+    "4: \n"
+    "beqz %[width], 3f \n"
+    " nop \n"
+
+    "2: \n"
+    "lbu $t0, 0(%[src_uv]) \n"
+    "lbu $t1, 1(%[src_uv]) \n"
+    "addiu %[src_uv], %[src_uv], 2 \n"
+    "addiu %[width], %[width], -1 \n"
+    "sb $t0, 0(%[dst_u]) \n"
+    "sb $t1, 0(%[dst_v]) \n"
+    "addiu %[dst_u], %[dst_u], 1 \n"
+    "bgtz %[width], 2b \n"
+    " addiu %[dst_v], %[dst_v], 1 \n"
+
+    "3: \n"
+    ".set pop \n"
+    : [src_uv] "+r" (src_uv),
+      [width] "+r" (width),
+      [dst_u] "+r" (dst_u),
+      [dst_v] "+r" (dst_v)
+    :
+    // "memory": the stores go through dst_u/dst_v, which the compiler cannot
+    // see from the operand list alone.
+    : "t0", "t1", "t2", "t3",
+      "t4", "t5", "t6", "t7", "t8", "t9", "memory"
+  );
+}
+
+// Writes the width bytes of src to dst in reverse order.
+// src is first advanced to one past the end of the row. The main loop then
+// reverses 16 bytes per iteration (wsbh + rotr 16 reverse the bytes of each
+// word, and the four words are stored in opposite order). The remaining
+// width & 0xf bytes are copied one at a time.
+//
+// The tail loop exits with bgtz so it runs exactly (width & 0xf) times; the
+// former bgez ran one extra iteration, reading src[-1] and writing
+// dst[width]. Rows shorter than 16 bytes enter through the residual check at
+// label 4, so a width of zero stores nothing.
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    "srl $t4, %[width], 4 \n" // multiplies of 16
+    "andi $t5, %[width], 0xf \n"
+    "blez $t4, 4f \n"
+    " addu %[src], %[src], %[width] \n" // src += width
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+    "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+    "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+    "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+    "wsbh $t0, $t0 \n" // |2|3|0|1|
+    "wsbh $t1, $t1 \n" // |6|7|4|5|
+    "wsbh $t2, $t2 \n" // |10|11|8|9|
+    "wsbh $t3, $t3 \n" // |14|15|12|13|
+    "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+    "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+    "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+    "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+    "addiu %[src], %[src], -16 \n"
+    "addiu $t4, $t4, -1 \n"
+    "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+    "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+    "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+    "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+    "bgtz $t4, 1b \n"
+    " addiu %[dst], %[dst], 16 \n"
+
+    // Residual check, shared by the fall-through from the main loop and by
+    // rows shorter than 16 bytes.
+    "4: \n"
+    "beqz $t5, 3f \n"
+    " nop \n"
+
+    "2: \n"
+    "lbu $t0, -1(%[src]) \n"
+    "addiu $t5, $t5, -1 \n"
+    "addiu %[src], %[src], -1 \n"
+    "sb $t0, 0(%[dst]) \n"
+    "bgtz $t5, 2b \n" // loop while residual bytes remain
+    " addiu %[dst], %[dst], 1 \n"
+
+    "3: \n"
+    ".set pop \n"
+    : [src] "+r" (src), [dst] "+r" (dst)
+    : [width] "r" (width)
+    // "memory": the stores go through dst, which the compiler cannot see
+    // from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4", "t5", "memory"
+  );
+}
+
+// Reverses a row of width interleaved UV byte pairs and splits it into
+// separate dst_u and dst_v rows.
+// src_uv is first advanced by 2 * width bytes to one past the end of the row.
+// The main loop consumes 32 bytes (16 pairs) per iteration and stores 16
+// bytes to each destination with swr/swl pairs. The remaining width & 0xf
+// pairs are handled one at a time.
+//
+// $t6 is written by the main loop and is therefore listed as clobbered. Rows
+// shorter than 16 pairs enter through the residual check at label 4, so a
+// width of zero stores nothing.
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width) {
+  int x = 0;  // number of 16-pair blocks left (set inside the asm)
+  int y = 0;  // number of residual pairs left (set inside the asm)
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    "addu $t4, %[width], %[width] \n"
+    "srl %[x], %[width], 4 \n"
+    "andi %[y], %[width], 0xf \n"
+    "blez %[x], 4f \n"
+    " addu %[src_uv], %[src_uv], $t4 \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+    "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+    "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+    "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+    "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+    "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+    "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+    "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+    "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+    "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+    "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+    "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+    "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+    "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+    "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+    "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+    "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+    "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+    "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+    "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+    "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+    "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+    "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+    "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+    "addiu %[src_uv], %[src_uv], -32 \n"
+    "addiu %[x], %[x], -1 \n"
+    "swr $t4, 0(%[dst_u]) \n"
+    "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+    "swr $t6, 0(%[dst_v]) \n"
+    "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+    "swr $t2, 4(%[dst_u]) \n"
+    "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+    "swr $t3, 4(%[dst_v]) \n"
+    "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+    "swr $t0, 8(%[dst_u]) \n"
+    "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+    "swr $t1, 8(%[dst_v]) \n"
+    "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+    "swr $t9, 12(%[dst_u]) \n"
+    "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+    "swr $t5, 12(%[dst_v]) \n"
+    "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+    "addiu %[dst_v], %[dst_v], 16 \n"
+    "bgtz %[x], 1b \n"
+    " addiu %[dst_u], %[dst_u], 16 \n"
+
+    // Residual check, shared by the fall-through from the main loop and by
+    // rows shorter than 16 pairs.
+    "4: \n"
+    "beqz %[y], 3f \n"
+    " nop \n"
+
+    "2: \n"
+    "lbu $t0, -2(%[src_uv]) \n"
+    "lbu $t1, -1(%[src_uv]) \n"
+    "addiu %[src_uv], %[src_uv], -2 \n"
+    "addiu %[y], %[y], -1 \n"
+    "sb $t0, 0(%[dst_u]) \n"
+    "sb $t1, 0(%[dst_v]) \n"
+    "addiu %[dst_u], %[dst_u], 1 \n"
+    "bgtz %[y], 2b \n"
+    " addiu %[dst_v], %[dst_v], 1 \n"
+
+    "3: \n"
+    ".set pop \n"
+    : [src_uv] "+r" (src_uv),
+      [dst_u] "+r" (dst_u),
+      [dst_v] "+r" (dst_v),
+      [x] "=&r" (x),
+      [y] "+r" (y)
+    : [width] "r" (width)
+    // "memory": the stores go through dst_u/dst_v, which the compiler cannot
+    // see from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t6", "t7", "t8", "t9", "memory"
+  );
+}
+
+// Assembly fragment shared by the I422To*Row_MIPS_DSPR2 functions.
+// Convert (4 Y and 2 VU) I422 and arrange RGB values into
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+//
+// Expects the operands %[y_buf], %[u_buf] and %[v_buf], and the constants
+// that the callers load into $s0..$s5 before the loop ($s0..$s3 are the
+// multipliers, $s4 is subtracted from Y, $s5 is subtracted from U/V and is
+// also the bias of the final clamp).
+// Reads 4 bytes from y_buf and 2 bytes each from u_buf and v_buf; advances
+// u_buf and v_buf by 2 but leaves y_buf untouched (callers advance it).
+// Writes $t0..$t6, $t8 and $t9.
+// The closing subu.ph / shll_s.ph 8 / shra.ph 8 / addu.ph sequence limits
+// every halfword to the 0..255 range.
+#define I422ToTransientMipsRGB \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
+ "addu.ph $t1, $t1, $s5 \n"
+
+// Converts a row of I422 (planar Y, U, V) to 32-bit pixels, four pixels per
+// loop iteration: 4 Y bytes, 2 U bytes and 2 V bytes are read and 16 bytes
+// are stored to rgb_buf. The top byte of every stored word is forced to 0xff
+// through the $s6 mask.
+//
+// A width of zero is skipped up front. The loop exits with bgtz, so it also
+// terminates when width is not a multiple of 4 (the last group of four is
+// then still written in full); for multiples of 4 the iteration count is the
+// same as with the former bnez.
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+    "beqz %[width], 2f \n"
+    " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+    "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+    "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+    "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+    "repl.ph $s4, 16 \n" // |0|16|0|16|
+    "repl.ph $s5, 128 \n" // |128|128| // clipping
+    "lui $s6, 0xff00 \n"
+    "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+    ".p2align 2 \n"
+    "1: \n"
+    I422ToTransientMipsRGB
+// Arranging into argb format
+    "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
+    "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
+    "addiu %[width], -4 \n"
+    "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
+    "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
+    "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+
+    "addiu %[y_buf], 4 \n"
+    "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+    "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+    "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
+    "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
+    "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
+    "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
+    "sll $t9, $t9, 16 \n"
+    "sll $t8, $t8, 16 \n"
+    "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
+    "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
+// Store results.
+    "sw $t2, 0(%[rgb_buf]) \n"
+    "sw $t0, 4(%[rgb_buf]) \n"
+    "sw $t1, 8(%[rgb_buf]) \n"
+    "sw $t3, 12(%[rgb_buf]) \n"
+    "bgtz %[width], 1b \n" // loop while pixels remain
+    " addiu %[rgb_buf], 16 \n"
+    "2: \n"
+    ".set pop \n"
+    :[y_buf] "+r" (y_buf),
+     [u_buf] "+r" (u_buf),
+     [v_buf] "+r" (v_buf),
+     [width] "+r" (width),
+     [rgb_buf] "+r" (rgb_buf)
+    :
+    // "memory": the stores go through rgb_buf, which the compiler cannot see
+    // from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6", "memory"
+  );
+}
+
+// Converts a row of I422 (planar Y, U, V) to 32-bit pixels with the red and
+// blue bytes in the opposite positions to I422ToARGBRow_MIPS_DSPR2. Four
+// pixels are produced per loop iteration: 4 Y bytes, 2 U bytes and 2 V bytes
+// are read and 16 bytes are stored to rgb_buf. The top byte of every stored
+// word is forced to 0xff through the $s6 mask.
+//
+// A width of zero is skipped up front. The loop exits with bgtz, so it also
+// terminates when width is not a multiple of 4 (the last group of four is
+// then still written in full); for multiples of 4 the iteration count is the
+// same as with the former bnez.
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+    "beqz %[width], 2f \n"
+    " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+    "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+    "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+    "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+    "repl.ph $s4, 16 \n" // |0|16|0|16|
+    "repl.ph $s5, 128 \n" // |128|128|
+    "lui $s6, 0xff00 \n"
+    "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+    ".p2align 2 \n"
+    "1: \n"
+    I422ToTransientMipsRGB
+// Arranging into abgr format
+    "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
+    "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
+    "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
+    "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
+
+    "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
+    "addiu %[width], -4 \n"
+    "addiu %[y_buf], 4 \n"
+    "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
+    "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
+    "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
+    "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
+    "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
+    "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
+    "sll $t9, $t9, 16 \n"
+    "sll $t8, $t8, 16 \n"
+    "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
+    "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
+// Store results.
+    "sw $t2, 0(%[rgb_buf]) \n"
+    "sw $t0, 4(%[rgb_buf]) \n"
+    "sw $t1, 8(%[rgb_buf]) \n"
+    "sw $t3, 12(%[rgb_buf]) \n"
+    "bgtz %[width], 1b \n" // loop while pixels remain
+    " addiu %[rgb_buf], 16 \n"
+    "2: \n"
+    ".set pop \n"
+    :[y_buf] "+r" (y_buf),
+     [u_buf] "+r" (u_buf),
+     [v_buf] "+r" (v_buf),
+     [width] "+r" (width),
+     [rgb_buf] "+r" (rgb_buf)
+    :
+    // "memory": the stores go through rgb_buf, which the compiler cannot see
+    // from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6", "memory"
+  );
+}
+
+// Converts a row of I422 (planar Y, U, V) to 32-bit pixels laid out as
+// |B|G|R|ff| within each stored word. Four pixels are produced per loop
+// iteration: 4 Y bytes, 2 U bytes and 2 V bytes are read and 16 bytes are
+// stored to rgb_buf. The low byte of every stored word is forced to 0xff
+// through the $s6 mask (0x00ff00ff).
+//
+// A width of zero is skipped up front. The loop exits with bgtz, so it also
+// terminates when width is not a multiple of 4 (the last group of four is
+// then still written in full); for multiples of 4 the iteration count is the
+// same as with the former bnez.
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+    "beqz %[width], 2f \n"
+    " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
+    "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+    "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+    "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+    "repl.ph $s4, 16 \n" // |0|16|0|16|
+    "repl.ph $s5, 128 \n" // |128|128|
+    "lui $s6, 0xff \n"
+    "ori $s6, 0xff \n" // |00|ff|00|ff|
+
+    ".p2align 2 \n"
+    "1: \n"
+    I422ToTransientMipsRGB
+    // Arranging into bgra format
+    "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+    "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+    "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+    "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+
+    "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+    "addiu %[width], -4 \n"
+    "addiu %[y_buf], 4 \n"
+    "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+    "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+    "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
+    "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
+    "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
+    "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
+    "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
+    "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
+    "sll $t1, $t1, 16 \n"
+    "sll $t2, $t2, 16 \n"
+    "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
+    "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
+// Store results.
+    "sw $t2, 0(%[rgb_buf]) \n"
+    "sw $t0, 4(%[rgb_buf]) \n"
+    "sw $t1, 8(%[rgb_buf]) \n"
+    "sw $t3, 12(%[rgb_buf]) \n"
+    "bgtz %[width], 1b \n" // loop while pixels remain
+    " addiu %[rgb_buf], 16 \n"
+    "2: \n"
+    ".set pop \n"
+    :[y_buf] "+r" (y_buf),
+     [u_buf] "+r" (u_buf),
+     [v_buf] "+r" (v_buf),
+     [width] "+r" (width),
+     [rgb_buf] "+r" (rgb_buf)
+    :
+    // "memory": the stores go through rgb_buf, which the compiler cannot see
+    // from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6", "memory"
+  );
+}
+
+// Bilinear filter 8x2 -> 8x1
+// Blends the row at src_ptr with the row at src_ptr + src_stride. Every
+// output byte is built from row0 * (256 - source_y_fraction) plus
+// row1 * source_y_fraction, shifted right by 8 and packed back to a byte.
+// Eight bytes are produced per iteration and the loop repeats while
+// dst_width is still positive, so at least one group of eight is always
+// written.
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                               ptrdiff_t src_stride, int dst_width,
+                               int source_y_fraction) {
+  int y0_fraction = 256 - source_y_fraction;  // weight of the first row
+  const uint8* src_ptr1 = src_ptr + src_stride;  // second source row
+
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    // Replicate both weights into the two halfwords of $t0 / $t1.
+    "replv.ph $t0, %[y0_fraction] \n"
+    "replv.ph $t1, %[source_y_fraction] \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t2, 0(%[src_ptr]) \n"
+    "lw $t3, 0(%[src_ptr1]) \n"
+    "lw $t4, 4(%[src_ptr]) \n"
+    "lw $t5, 4(%[src_ptr1]) \n"
+    "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+    "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+    "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+    "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+    "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+    "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+    "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+    "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+    "addq.ph $t6, $t6, $t8 \n"
+    "addq.ph $t7, $t7, $t9 \n"
+    "addq.ph $t2, $t2, $t4 \n"
+    "addq.ph $t3, $t3, $t5 \n"
+    "shra.ph $t6, $t6, 8 \n"
+    "shra.ph $t7, $t7, 8 \n"
+    "shra.ph $t2, $t2, 8 \n"
+    "shra.ph $t3, $t3, 8 \n"
+    "precr.qb.ph $t6, $t6, $t7 \n"
+    "precr.qb.ph $t2, $t2, $t3 \n"
+    "addiu %[src_ptr], %[src_ptr], 8 \n"
+    "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+    "addiu %[dst_width], %[dst_width], -8 \n"
+    "sw $t6, 0(%[dst_ptr]) \n"
+    "sw $t2, 4(%[dst_ptr]) \n"
+    "bgtz %[dst_width], 1b \n"
+    " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+    ".set pop \n"
+    : [dst_ptr] "+r" (dst_ptr),
+      [src_ptr1] "+r" (src_ptr1),
+      [src_ptr] "+r" (src_ptr),
+      [dst_width] "+r" (dst_width)
+    : [source_y_fraction] "r" (source_y_fraction),
+      [y0_fraction] "r" (y0_fraction)
+    // "memory": the stores go through dst_ptr, which the compiler cannot see
+    // from the operand list alone.
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9", "memory"
+  );
+}
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_neon.cc b/third_party/aom/third_party/libyuv/source/row_neon.cc
new file mode 100644
index 0000000000..a5aeaabfbd
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_neon.cc
@@ -0,0 +1,3039 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+// d0 = 8 Y bytes from [%0]; d2 low half = 4 U bytes from [%1]; d2 high half =
+// 4 V bytes from [%2]. All three pointers are post-incremented.
+#define READYUV422 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+// After loading, adjacent U samples and adjacent V samples are summed pairwise
+// (vpaddl) and halved with rounding (vrshrn #1), so d2 ends up holding 4
+// averaged U followed by 4 averaged V - the same layout READYUV422 produces.
+#define READYUV444 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+// d0 = 8 Y bytes from [%0] (post-incremented); every byte of d2 is set to 128.
+#define READYUV400 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+// d0 = 8 Y from [%0]; 8 interleaved UV bytes are loaded from [%1], then
+// de-interleaved (vuzp) and regrouped (vtrn) so d2 = 4 U followed by 4 V.
+// d3 is used as scratch.
+#define READNV12 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+// Same as READNV12 except the vuzp operands are swapped, so the odd source
+// bytes (U in VU order) land in the low half of d2 and the even bytes (V) in
+// the high half: d2 = 4 U followed by 4 V. d3 is used as scratch.
+#define READNV21 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+// vld2.8 de-interleaves 16 bytes from [%0]: even bytes (Y) go to d0, odd bytes
+// (alternating U, V) go to d2. The chroma bytes are then split and regrouped
+// so d2 = 4 U followed by 4 V. d3 is used as scratch.
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+// vld2.8 de-interleaves 16 bytes from [%0]: even bytes (alternating U, V) go
+// to d2, odd bytes (Y) go to d3. Y is moved to d0, then the chroma bytes are
+// split and regrouped so d2 = 4 U followed by 4 V.
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Loads the conversion constants consumed by YUVTORGB:
+//   d24 = kUVToRB coefficients, d25 = kUVToG coefficients,
+//   q13 = bias[0] (B), q4 = bias[1] (G), q14 = bias[2] (R), each replicated
+//   across all eight 16-bit lanes, and q15 = kYToRgb replicated across all
+//   four 32-bit lanes.
+// The three bias halfwords are fetched with one load and broadcast with vdup,
+// so the %[kUVBiasBGR] register is never written: callers pass it as an
+// input-only "r" operand, which inline asm must not modify.
+// NOTE(review): this reads 8 bytes (four int16) from kUVBiasBGR where the
+// previous code read 6 - assumes the array has at least four elements; confirm
+// against the YuvConstants definition.
+#define YUVTORGB_SETUP \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ "vld1.16 {d8}, [%[kUVBiasBGR]] \n" /* bias[0..3], no writeback */ \
+ "vdup.16 q13, d8[0] \n" /* B bias */ \
+ "vdup.16 q14, d8[2] \n" /* R bias */ \
+ "vdup.16 q4, d8[1] \n" /* G bias; must be last, overwrites d8 */ \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
+// Converts 8 pixels from YUV to 8-bit B, G, R planes.
+// In:  d0 = 8 Y, d2 = chroma as loaded by one of the READ* macros, plus the
+//      constants placed in d24, d25, q4, q13, q14 and q15 by YUVTORGB_SETUP.
+// Out: d20 = B, d21 = G, d22 = R (saturated to unsigned 8 bit by vqshrun).
+// Scratch: q0, q1, q3, q8, q9, q10.
+#define YUVTORGB \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
+ "vmovl.u8 q0, d0 \n" /* Y */ \
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
+ "vadd.s16 d18, d19 \n" \
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
+
+// Converts one row of planar Y/U/V (one U and V sample per pixel) to ARGB,
+// 8 pixels per iteration. READYUV444 averages adjacent chroma samples in
+// pairs before conversion. The loop tests width after the first pass, so at
+// least 8 pixels are always processed.
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"  // constant alpha
+ "1: \n" READYUV444 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"  // interleave B, G, R, A
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of planar Y/U/V with half-width chroma (4 U and 4 V per
+// 8 Y) to ARGB with alpha fixed at 255, 8 pixels per iteration. The loop
+// tests width after the first pass, so at least 8 pixels are always written.
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"  // constant alpha
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"  // interleave B, G, R, A
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Like I422ToARGBRow_NEON, but the alpha byte of each output pixel is read
+// from the separate src_a plane (8 bytes per iteration) instead of being
+// fixed at 255.
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"  // 8 alpha bytes
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"  // interleave B, G, R, A
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of I422 to 4-byte pixels stored in memory order
+// A, B, G, R (alpha = 255), 8 pixels per iteration. Alpha lives in d19, which
+// YUVTORGB uses as scratch, so it is reloaded on every pass.
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of I422 to packed 3-byte pixels stored in memory order
+// B, G, R, 8 pixels (24 output bytes) per iteration.
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"  // interleave B, G, R
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Packs 8 pixels held as planes d20 = B, d21 = G, d22 = R into eight 16-bit
+// RGB565 values in q0. Each channel is widened into the top byte of a 16-bit
+// lane, then G and B are shifted in below R with vsri. Scratch: q8, q9.
+#define ARGBTORGB565 \
+ "vshll.u8 q0, d22, #8 \n" /* R */ \
+ "vshll.u8 q8, d21, #8 \n" /* G */ \
+ "vshll.u8 q9, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #5 \n" /* RG */ \
+ "vsri.16 q0, q9, #11 \n" /* RGB */
+
+// Converts one row of I422 to 16-bit RGB565, 8 pixels (16 output bytes) per
+// iteration: YUVTORGB produces B/G/R planes and ARGBTORGB565 packs them.
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Packs 8 pixels held as planes d20 = B, d21 = G, d22 = R, d23 = A into eight
+// 16-bit ARGB1555 values in q0. Scratch: q8, q9, q10 - note that writing q10
+// overwrites the d20/d21 inputs.
+#define ARGBTOARGB1555 \
+ "vshll.u8 q0, d23, #8 \n" /* A */ \
+ "vshll.u8 q8, d22, #8 \n" /* R */ \
+ "vshll.u8 q9, d21, #8 \n" /* G */ \
+ "vshll.u8 q10, d20, #8 \n" /* B */ \
+ "vsri.16 q0, q8, #1 \n" /* AR */ \
+ "vsri.16 q0, q9, #6 \n" /* ARG */ \
+ "vsri.16 q0, q10, #11 \n" /* ARGB */
+
+// Converts one row of I422 to 16-bit ARGB1555 with the alpha bit set,
+// 8 pixels (16 output bytes) per iteration.
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Packs 8 pixels held as planes d20 = B, d21 = G, d22 = R, d23 = A into eight
+// 16-bit ARGB4444 values in q0 (d0, d1). Expects d4 to hold the bit mask that
+// vbic clears from G and A (callers in this file load 0x0f). B and R are
+// shifted down to their high nibble; the inputs d20-d23 are modified.
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+// Converts one row of I422 to 16-bit ARGB4444 with alpha 0xf, 8 pixels
+// (16 output bytes) per iteration. d4 holds the low-nibble mask required by
+// ARGBTOARGB4444 and is loaded once before the loop.
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "1: \n"
+
+ READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of luma-only data to ARGB through the regular YUV->RGB
+// path: READYUV400 substitutes 128 for every chroma sample. Alpha is 255;
+// 8 pixels are produced per iteration.
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"  // constant alpha
+ "1: \n" READYUV400 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Expands grey samples to ARGB without any colour conversion: each Y byte is
+// copied into B, G and R and alpha is set to 255. Handles 8 pixels per pass;
+// the loop condition is checked after the first pass.
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8 d23, #255 \n"              // alpha plane
+      "1: \n"
+      "vld1.8 {d20}, [%[src_y]]! \n"      // B = 8 Y
+      "vmov d21, d20 \n"                  // G = Y
+      "vmov d22, d20 \n"                  // R = Y
+      "subs %[width], %[width], #8 \n"
+      "vst4.8 {d20, d21, d22, d23}, [%[dst_argb]]! \n"
+      "bgt 1b \n"
+      : [src_y] "+r"(src_y),
+        [dst_argb] "+r"(dst_argb),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d20", "d21", "d22", "d23");
+}
+
+// Converts one row of NV12 (Y plane plus interleaved UV plane) to ARGB with
+// alpha 255, 8 pixels per iteration.
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of NV21 (Y plane plus interleaved VU plane) to ARGB with
+// alpha 255, 8 pixels per iteration.
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV21 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of NV12 to packed 3-byte pixels stored in memory order
+// B, G, R, 8 pixels (24 output bytes) per iteration.
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+
+ YUVTORGB_SETUP
+
+ "1: \n"
+
+ READNV12 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of NV21 to packed 3-byte pixels stored in memory order
+// B, G, R, 8 pixels (24 output bytes) per iteration.
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+
+ YUVTORGB_SETUP
+
+ "1: \n"
+
+ READNV21 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of NV12 to 16-bit RGB565, 8 pixels (16 output bytes) per
+// iteration.
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of packed YUY2 to ARGB with alpha 255. Each iteration
+// consumes 16 source bytes (8 pixels) and writes 32 bytes.
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUY2 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Converts one row of packed UYVY to ARGB with alpha 255. Each iteration
+// consumes 16 source bytes (8 pixels) and writes 32 bytes.
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READUYVY YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// De-interleaves a packed UV row: even source bytes are written to dst_u and
+// odd source bytes to dst_v. Each pass consumes 16 UV pairs (32 bytes); the
+// loop condition is checked after the first pass.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  asm volatile(
+      "1: \n"
+      "vld2.8 {q0, q1}, [%[src_uv]]! \n"   // q0 = 16 U, q1 = 16 V
+      "subs %[width], %[width], #16 \n"    // 16 pairs consumed
+      "vst1.8 {q0}, [%[dst_u]]! \n"
+      "vst1.8 {q1}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_uv] "+r"(src_uv),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+// Interleaves separate U and V rows into one packed UV row. Each pass reads
+// 16 bytes from each plane and writes 32 bytes; the loop condition is checked
+// after the first pass.
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q0}, [%[src_u]]! \n"        // 16 U
+      "vld1.8 {q1}, [%[src_v]]! \n"        // 16 V
+      "subs %[width], %[width], #16 \n"
+      "vst2.8 {q0, q1}, [%[dst_uv]]! \n"   // U0 V0 U1 V1 ...
+      "bgt 1b \n"
+      : [src_u] "+r"(src_u),
+        [src_v] "+r"(src_v),
+        [dst_uv] "+r"(dst_uv),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+// Reads 16 packed 3-byte pixels per iteration and writes the first, second
+// and third byte of each pixel to dst_r, dst_g and dst_b respectively.
+// The loop condition is checked after the first pass, so at least 16 pixels
+// (48 source bytes) are always processed.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ // The loads fill d0-d5, i.e. all of q0, q1 and q2, so all three Q
+ // registers must be declared clobbered.
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Interleaves three planar rows into packed 3-byte pixels (src_r byte first,
+// then src_g, then src_b). Each pass handles 16 pixels as two 8-pixel
+// interleaving stores; the loop condition is checked after the first pass.
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q0}, [%[src_r]]! \n"              // 16 R
+      "vld1.8 {q1}, [%[src_g]]! \n"              // 16 G
+      "vld1.8 {q2}, [%[src_b]]! \n"              // 16 B
+      "subs %[width], %[width], #16 \n"
+      "vst3.8 {d0, d2, d4}, [%[dst_rgb]]! \n"    // pixels 0..7
+      "vst3.8 {d1, d3, d5}, [%[dst_rgb]]! \n"    // pixels 8..15
+      "bgt 1b \n"
+      : [src_r] "+r"(src_r),
+        [src_g] "+r"(src_g),
+        [src_b] "+r"(src_b),
+        [dst_rgb] "+r"(dst_rgb),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1", "q2");
+}
+
+// Copies a row in 32-byte chunks using byte-element vld1.8/vst1.8 with no
+// alignment qualifier. The loop condition is checked after the first pass,
+// so 32 bytes are always copied.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1: \n"
+      "vld1.8 {d0, d1, d2, d3}, [%[src]]! \n"   // fetch 32 bytes
+      "subs %[width], %[width], #32 \n"
+      "vst1.8 {d0, d1, d2, d3}, [%[dst]]! \n"   // write 32 bytes
+      "bgt 1b \n"
+      : [src] "+r"(src),
+        [dst] "+r"(dst),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+// Fills a row with the byte v8, 16 bytes per pass. The loop condition is
+// checked after the first store, so 16 bytes are always written.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "vdup.8 q0, %[v8] \n"                 // broadcast the byte to 16 lanes
+      "1: \n"
+      "subs %[width], %[width], #16 \n"
+      "vst1.8 {q0}, [%[dst]]! \n"
+      "bgt 1b \n"
+      : [dst] "+r"(dst),
+        [width] "+r"(width)
+      : [v8] "r"(v8)
+      : "memory", "cc", "q0");
+}
+
+// Fills a row with the 32-bit value v32, 4 pixels (16 bytes) per pass. The
+// loop condition is checked after the first store, so 4 pixels are always
+// written.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "vdup.u32 q0, %[v32] \n"              // broadcast the pixel to 4 lanes
+      "1: \n"
+      "subs %[width], %[width], #4 \n"
+      "vst1.8 {q0}, [%[dst]]! \n"
+      "bgt 1b \n"
+      : [dst] "+r"(dst),
+        [width] "+r"(width)
+      : [v32] "r"(v32)
+      : "memory", "cc", "q0");
+}
+
+// Writes the bytes of src to dst in reverse order. The source pointer starts
+// at the last 32-byte chunk and walks backwards while the destination walks
+// forwards; 32 bytes are handled per pass and the loop condition is checked
+// after the first pass.
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "add %[src], %[src], %[width] \n"          // one past the row end
+      "sub %[src], %[src], #32 \n"               // back up to the last chunk
+
+      "1: \n"
+      "vld1.8 {q1, q2}, [%[src]], %[step] \n"    // load, then src -= 32
+      "subs %[width], #32 \n"
+      "vrev64.8 q0, q2 \n"                       // reverse inside each half
+      "vrev64.8 q1, q1 \n"
+      "vswp d0, d1 \n"                           // then swap the halves
+      "vswp d2, d3 \n"
+      "vst1.8 {q0, q1}, [%[dst]]! \n"            // dst += 32
+      "bgt 1b \n"
+      : [src] "+r"(src),
+        [dst] "+r"(dst),
+        [width] "+r"(width)
+      : [step] "r"(-32)
+      : "memory", "cc", "q0", "q1", "q2");
+}
+
+// Reverses the order of the 2-byte UV pairs in a row while keeping the byte
+// order inside each pair. width counts pairs; 8 pairs (16 bytes) are handled
+// per pass and the loop condition is checked after the first pass.
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      "mov r12, #-16 \n"                              // backwards step
+      "add %[src_uv], %[src_uv], %[width], lsl #1 \n" // one past the row end
+      "sub %[src_uv], #16 \n"                         // last 8 pairs
+
+      "1: \n"
+      "vld2.8 {d0, d1}, [%[src_uv]], r12 \n"          // d0 = U, d1 = V
+      "subs %[width], #8 \n"
+      "vrev64.8 q0, q0 \n"                            // reverse each plane
+      "vst2.8 {d0, d1}, [%[dst_uv]]! \n"              // re-interleave
+      "bgt 1b \n"
+      : [src_uv] "+r"(src_uv),
+        [dst_uv] "+r"(dst_uv),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "r12", "q0");
+}
+
+// Reverses a packed UV row and splits it at the same time: the even source
+// bytes end up reversed in dst_u and the odd source bytes reversed in dst_v.
+// width counts pairs; 8 pairs are handled per pass and the loop condition is
+// checked after the first pass.
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  asm volatile(
+      "mov r12, #-16 \n"                              // backwards step
+      "add %[src_uv], %[src_uv], %[width], lsl #1 \n" // one past the row end
+      "sub %[src_uv], #16 \n"                         // last 8 pairs
+
+      "1: \n"
+      "vld2.8 {d0, d1}, [%[src_uv]], r12 \n"          // d0 = U, d1 = V
+      "subs %[width], #8 \n"
+      "vrev64.8 q0, q0 \n"                            // reverse each plane
+      "vst1.8 {d0}, [%[dst_u]]! \n"
+      "vst1.8 {d1}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_uv] "+r"(src_uv),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "r12", "q0");
+}
+
+// Reverses the order of the 4-byte pixels in a row, leaving the bytes inside
+// each pixel untouched. 8 pixels (32 bytes) are handled per pass and the loop
+// condition is checked after the first pass.
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "add %[src_argb], %[src_argb], %[width], lsl #2 \n"  // one past the end
+      "sub %[src_argb], #32 \n"                            // last 8 pixels
+
+      "1: \n"
+      "vld4.8 {d0, d1, d2, d3}, [%[src_argb]], %[step] \n" // split channels
+      "subs %[width], #8 \n"
+      "vrev64.8 d0, d0 \n"                                 // reverse each
+      "vrev64.8 d1, d1 \n"                                 // channel plane
+      "vrev64.8 d2, d2 \n"
+      "vrev64.8 d3, d3 \n"
+      "vst4.8 {d0, d1, d2, d3}, [%[dst_argb]]! \n"         // re-interleave
+      "bgt 1b \n"
+      : [src_argb] "+r"(src_argb),
+        [dst_argb] "+r"(dst_argb),
+        [width] "+r"(width)
+      : [step] "r"(-32)
+      : "memory", "cc", "d0", "d1", "d2", "d3");
+}
+
+// Reverses the order of the 3-byte pixels in a row, leaving the bytes inside
+// each pixel untouched. 8 pixels (24 bytes) are handled per pass and the loop
+// condition is checked after the first pass.
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  src_rgb24 += width * 3 - 24;  // start at the last 8-pixel group
+  asm volatile(
+      "1: \n"
+      "vld3.8 {d0, d1, d2}, [%[src_rgb24]], %[step] \n"  // split, src -= 24
+      "subs %[width], #8 \n"
+      "vrev64.8 d0, d0 \n"                               // reverse each
+      "vrev64.8 d1, d1 \n"                               // channel plane
+      "vrev64.8 d2, d2 \n"
+      "vst3.8 {d0, d1, d2}, [%[dst_rgb24]]! \n"          // dst += 24
+      "bgt 1b \n"
+      : [src_rgb24] "+r"(src_rgb24),
+        [dst_rgb24] "+r"(dst_rgb24),
+        [width] "+r"(width)
+      : [step] "r"(-24)
+      : "memory", "cc", "d0", "d1", "d2");
+}
+
+// Expands 3-byte pixels to 4-byte pixels by appending a fourth byte of 255;
+// the three source bytes keep their order. 8 pixels are handled per pass and
+// the loop condition is checked after the first pass.
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "vmov.u8 d4, #255 \n"                            // alpha plane
+      "1: \n"
+      "vld3.8 {d1, d2, d3}, [%[src_rgb24]]! \n"        // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vst4.8 {d1, d2, d3, d4}, [%[dst_argb]]! \n"     // 8 output pixels
+      "bgt 1b \n"
+      : [src_rgb24] "+r"(src_rgb24),
+        [dst_argb] "+r"(dst_argb),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d1", "d2", "d3", "d4");
+}
+
+// Expands 3-byte RAW pixels to 4-byte pixels: the first and third source
+// bytes are exchanged and a fourth byte of 255 is appended. 8 pixels are
+// handled per pass and the loop condition is checked after the first pass.
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8 d4, #255 \n"                            // alpha plane
+      "1: \n"
+      "vld3.8 {d1, d2, d3}, [%[src_raw]]! \n"          // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vswp.u8 d1, d3 \n"                              // exchange R and B
+      "vst4.8 {d1, d2, d3, d4}, [%[dst_argb]]! \n"     // 8 output pixels
+      "bgt 1b \n"
+      : [src_raw] "+r"(src_raw),
+        [dst_argb] "+r"(dst_argb),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d1", "d2", "d3", "d4");
+}
+
+// Expands 3-byte RAW pixels to 4-byte pixels with a leading byte of 255
+// followed by the three source bytes in reversed order. 8 pixels are handled
+// per pass and the loop condition is checked after the first pass.
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  asm volatile(
+      "vmov.u8 d0, #255 \n"                            // alpha plane
+      "1: \n"
+      "vld3.8 {d1, d2, d3}, [%[src_raw]]! \n"          // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vswp.u8 d1, d3 \n"                              // exchange R and B
+      "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n"     // 8 output pixels
+      "bgt 1b \n"
+      : [src_raw] "+r"(src_raw),
+        [dst_rgba] "+r"(dst_rgba),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d0", "d1", "d2", "d3");
+}
+// Converts 3-byte RAW pixels to RGB24 by exchanging the first and third byte
+// of every pixel. 8 pixels are handled per pass and the loop condition is
+// checked after the first pass.
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1: \n"
+      "vld3.8 {d1, d2, d3}, [%[src_raw]]! \n"      // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vswp.u8 d1, d3 \n"                          // exchange R and B
+      "vst3.8 {d1, d2, d3}, [%[dst_rgb24]]! \n"    // 8 output pixels
+      "bgt 1b \n"
+      : [src_raw] "+r"(src_raw),
+        [dst_rgb24] "+r"(dst_rgb24),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d1", "d2", "d3");
+}
+
+// Unpacks eight 16-bit RGB565 pixels held in q0 into 8-bit planes
+// d0 = B, d1 = G, d2 = R. Each channel's top bits are replicated into its low
+// bits to fill the full 8-bit range. Scratch: d4, d5, d6.
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+// Converts a row of 16-bit RGB565 pixels to ARGB with alpha 255. Each
+// iteration reads 16 bytes (8 pixels) and writes 32 bytes; the loop condition
+// is checked after the first pass.
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Unpacks eight 16-bit ARGB1555 pixels held in q0 into 8-bit planes
+// d0 = B, d1 = G, d2 = R, d3 = A. The single alpha bit is expanded to 0x00 or
+// 0xff (shift down to bit 0, then negate). Scratch: q2, q3.
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+// In: q0 = eight 16-bit pixels. Out: d0 = B, d1 = G, d2 = R; d3 is not
+// written. Scratch: d4, d5, d6.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+// Converts a row of 16-bit ARGB1555 pixels to 8-bit-per-channel ARGB. Each
+// iteration reads 16 bytes (8 pixels) and writes 32 bytes; the loop condition
+// is checked after the first pass.
+// NOTE: ARGB1555TOARGB writes all of q1 (d2, d3), so the alpha stored comes
+// from the source alpha bit and the initial d3 = 255 is overwritten.
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Unpacks eight 16-bit ARGB4444 pixels held in q0 into 8-bit planes
+// d0 = B, d1 = G, d2 = R, d3 = A. Every 4-bit channel is duplicated into both
+// nibbles of its output byte. Scratch: q2.
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+// Converts a row of 16-bit ARGB4444 pixels to 8-bit-per-channel ARGB. Each
+// iteration reads 16 bytes (8 pixels) and writes 32 bytes; the loop condition
+// is checked after the first pass.
+// NOTE: ARGB4444TOARGB writes all of q1 (d2, d3), so the alpha stored comes
+// from the source alpha nibble and the initial d3 = 255 is overwritten.
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// Drops the fourth byte of every 4-byte pixel, keeping the first three in
+// order. 8 pixels are handled per pass and the loop condition is checked
+// after the first pass.
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "1: \n"
+      "vld4.8 {d1, d2, d3, d4}, [%[src_argb]]! \n"   // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vst3.8 {d1, d2, d3}, [%[dst_rgb24]]! \n"      // alpha (d4) discarded
+      "bgt 1b \n"
+      : [src_argb] "+r"(src_argb),
+        [dst_rgb24] "+r"(dst_rgb24),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d1", "d2", "d3", "d4");
+}
+
+// Converts 4-byte pixels to 3-byte RAW pixels: the fourth byte is dropped and
+// the first and third bytes are exchanged. 8 pixels are handled per pass and
+// the loop condition is checked after the first pass.
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1: \n"
+      "vld4.8 {d1, d2, d3, d4}, [%[src_argb]]! \n"   // 8 source pixels
+      "subs %[width], %[width], #8 \n"
+      "vswp.u8 d1, d3 \n"                            // exchange R and B
+      "vst3.8 {d1, d2, d3}, [%[dst_raw]]! \n"        // alpha (d4) discarded
+      "bgt 1b \n"
+      : [src_argb] "+r"(src_argb),
+        [dst_raw] "+r"(dst_raw),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d1", "d2", "d3", "d4");
+}
+
+// Extracts the luma bytes (even byte positions) from a packed YUY2 row.
+// 16 pixels (32 source bytes) are handled per pass and the loop condition is
+// checked after the first pass.
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1: \n"
+      "vld2.8 {q0, q1}, [%[src_yuy2]]! \n"   // q0 = even bytes, q1 = odd bytes
+      "subs %[width], %[width], #16 \n"
+      "vst1.8 {q0}, [%[dst_y]]! \n"          // keep the even bytes (Y)
+      "bgt 1b \n"
+      : [src_yuy2] "+r"(src_yuy2),
+        [dst_y] "+r"(dst_y),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+// Extracts the luma bytes (odd byte positions) from a packed UYVY row.
+// 16 pixels (32 source bytes) are handled per pass and the loop condition is
+// checked after the first pass.
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1: \n"
+      "vld2.8 {q0, q1}, [%[src_uyvy]]! \n"   // q0 = even bytes, q1 = odd bytes
+      "subs %[width], %[width], #16 \n"
+      "vst1.8 {q1}, [%[dst_y]]! \n"          // keep the odd bytes (Y)
+      "bgt 1b \n"
+      : [src_uyvy] "+r"(src_uyvy),
+        [dst_y] "+r"(dst_y),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+// Extracts chroma from one packed YUY2 row: byte 1 of every 4-byte group goes
+// to dst_u and byte 3 to dst_v. 16 pixels (8 U and 8 V) are handled per pass
+// and the loop condition is checked after the first pass.
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1: \n"
+      "vld4.8 {d0, d1, d2, d3}, [%[src_yuy2]]! \n"   // d1 = U, d3 = V
+      "subs %[width], %[width], #16 \n"
+      "vst1.8 {d1}, [%[dst_u]]! \n"
+      "vst1.8 {d3}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_yuy2] "+r"(src_yuy2),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d0", "d1", "d2", "d3");
+}
+
+// Extracts chroma from one packed UYVY row: byte 0 of every 4-byte group goes
+// to dst_u and byte 2 to dst_v. 16 pixels (8 U and 8 V) are handled per pass
+// and the loop condition is checked after the first pass.
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1: \n"
+      "vld4.8 {d0, d1, d2, d3}, [%[src_uyvy]]! \n"   // d0 = U, d2 = V
+      "subs %[width], %[width], #16 \n"
+      "vst1.8 {d0}, [%[dst_u]]! \n"
+      "vst1.8 {d2}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_uyvy] "+r"(src_uyvy),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d0", "d1", "d2", "d3");
+}
+
+// Extracts chroma from two vertically adjacent YUY2 rows (src_yuy2 and
+// src_yuy2 + stride_yuy2) and writes the rounded average of the two rows'
+// U and V bytes. 16 pixels (8 U and 8 V) are handled per pass and the loop
+// condition is checked after the first pass.
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      // Turn the stride register into a pointer to the second row.
+      "add %[stride_yuy2], %[src_yuy2], %[stride_yuy2] \n"
+      "1: \n"
+      "vld4.8 {d0, d1, d2, d3}, [%[src_yuy2]]! \n"      // row 0
+      "subs %[width], %[width], #16 \n"
+      "vld4.8 {d4, d5, d6, d7}, [%[stride_yuy2]]! \n"   // row 1
+      "vrhadd.u8 d1, d1, d5 \n"                         // U = (u0 + u1 + 1) / 2
+      "vrhadd.u8 d3, d3, d7 \n"                         // V = (v0 + v1 + 1) / 2
+      "vst1.8 {d1}, [%[dst_u]]! \n"
+      "vst1.8 {d3}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_yuy2] "+r"(src_yuy2),
+        [stride_yuy2] "+r"(stride_yuy2),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
+}
+
+// Extracts chroma from two vertically adjacent UYVY rows (src_uyvy and
+// src_uyvy + stride_uyvy) and writes the rounded average of the two rows'
+// U and V bytes. 16 pixels (8 U and 8 V) are handled per pass and the loop
+// condition is checked after the first pass.
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      // Turn the stride register into a pointer to the second row.
+      "add %[stride_uyvy], %[src_uyvy], %[stride_uyvy] \n"
+      "1: \n"
+      "vld4.8 {d0, d1, d2, d3}, [%[src_uyvy]]! \n"      // row 0
+      "subs %[width], %[width], #16 \n"
+      "vld4.8 {d4, d5, d6, d7}, [%[stride_uyvy]]! \n"   // row 1
+      "vrhadd.u8 d0, d0, d4 \n"                         // U = (u0 + u1 + 1) / 2
+      "vrhadd.u8 d2, d2, d6 \n"                         // V = (v0 + v1 + 1) / 2
+      "vst1.8 {d0}, [%[dst_u]]! \n"
+      "vst1.8 {d2}, [%[dst_v]]! \n"
+      "bgt 1b \n"
+      : [src_uyvy] "+r"(src_uyvy),
+        [stride_uyvy] "+r"(stride_uyvy),
+        [dst_u] "+r"(dst_u),
+        [dst_v] "+r"(dst_v),
+        [width] "+r"(width)
+      :
+      : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7");
+}
+
+// Reorders the bytes of each 16-byte group (4 pixels) through a table lookup:
+// output byte i is the source byte at index shuffler[i] within the group.
+// Used for BGRAToARGB, ABGRToARGB, RGBAToARGB and ARGBToRGBA. 4 pixels are
+// handled per pass and the loop condition is checked after the first pass.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "vld1.8 {q2}, [%[shuffler]] \n"      // 16 lookup indices
+      "1: \n"
+      "vld1.8 {q0}, [%[src_argb]]! \n"     // 4 source pixels
+      "subs %[width], %[width], #4 \n"
+      "vtbl.8 d2, {d0, d1}, d4 \n"         // output pixels 0 and 1
+      "vtbl.8 d3, {d0, d1}, d5 \n"         // output pixels 2 and 3
+      "vst1.8 {q1}, [%[dst_argb]]! \n"
+      "bgt 1b \n"
+      : [src_argb] "+r"(src_argb),
+        [dst_argb] "+r"(dst_argb),
+        [width] "+r"(width)
+      : [shuffler] "r"(shuffler)
+      : "memory", "cc", "q0", "q1", "q2");
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y, // Interleaves planar Y, U and V into packed YUY2, 16 pixels per pass.
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys, de-interleaved: even Ys in d0, odd Ys in d2
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels, interleaved as d0, d1, d2, d3 = Y, U, Y, V.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y, // Interleaves planar Y, U and V into packed UYVY, 16 pixels per pass.
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys, de-interleaved: even Ys in d1, odd Ys in d3
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels, interleaved as d0, d1, d2, d3 = U, Y, V, Y.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb, // Converts ARGB to RGB565, 8 pixels per pass.
+ uint8_t* dst_rgb565,
+ int width) { // pixels; reduced by 8 per pass, loop repeats while > 0
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB, one channel per d register.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565 // macro defined elsewhere; presumably packs d20-d22 into q0 using q8/q9 as scratch - confirm
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+// Converts ARGB to RGB565, first adding a dither value (saturating) to the
+// bytes loaded into d20, d21 and d22. Processes 8 pixels per pass.
+//   src_argb  source ARGB pixels; advanced by the asm
+//   dst_rgb   destination RGB565 pixels; advanced by the asm
+//   dither4   32-bit dither pattern, replicated across d2 by vdup.32
+//   width     pixel count; reduced by 8 per pass, loop repeats while > 0
+// src_argb and width are modified inside the asm (post-increment and subs),
+// so they must be read-write ("+r") operands; only dither4 is a pure input.
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ asm volatile(
+ "vdup.32 d2, %3 \n" // dither4
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
+ ARGBTORGB565
+ "vst1.8 {q0}, [%1]! \n" // store 8 RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb), // %1
+ "+r"(width) // %2
+ : "r"(dither4) // %3
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, // Converts ARGB to ARGB1555, 8 pixels per pass.
+ uint8_t* dst_argb1555,
+ int width) { // pixels; reduced by 8 per pass, loop repeats while > 0
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB, one channel per d register.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555 // macro defined elsewhere; presumably packs d20-d23 into q0 - confirm
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+// Converts ARGB to ARGB4444, 8 pixels per pass.
+//   src_argb      source ARGB pixels; advanced by the asm
+//   dst_argb4444  destination ARGB4444 pixels; advanced by the asm
+//   width         pixel count; reduced by 8 per pass, loop repeats while > 0
+// d4 is loaded with a 0x0f mask before the loop, so q2 (d4/d5) is declared
+// as clobbered; otherwise the compiler may keep a live value in it.
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width) {
+ asm volatile(
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16), 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d27 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, // Copies the 4th byte of every 4-byte pixel to dst_a, 16 pixels per pass.
+ uint8_t* dst_a,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels; 4th bytes land in d6
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels; 4th bytes land in d7
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's (q3 = d6:d7).
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Y = (29*B + 150*G + 77*R) rounded >> 8, no offset added; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Same formula as the ARGB variant but reads B, G, R from bytes 1, 2, 3 of each pixel; src_argb holds RGBA data per the load comment.
+ asm volatile(
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels; byte 0 (d0) is not used in the sum.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d1, d24 \n" // B
+ "vmlal.u8 q2, d2, d25 \n" // G
+ "vmlal.u8 q2, d3, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// 8x1 pixels: one U and one V byte per ARGB pixel, with no subsampling.
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 8 per pass, loop repeats while > 0
+ asm volatile(
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5 in 8.8 fixed point (128 bias plus 0.5 for rounding)
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", // NOTE(review): q4 is listed although no instruction in this block writes it.
+ "q15");
+}
+
+// clang-format off
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. Takes 8 lanes of 16-bit B/G/R in QB/QG/QR, reads coefficients from q10-q14 and the bias from q15, overwrites q8/q9, and leaves 8 U bytes in d0 and 8 V bytes in d1.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+// clang-format on
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8_t* src_argb, // Produces 8 U and 8 V bytes from a 16x2 block of ARGB pixels per pass.
+ int src_stride_argb, // added to src_argb to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts, accumulated onto row 1.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8_t* src_argb, // Same structure as the non-J ARGB variant but with the 127/84/43/20/107 coefficient set; 16x2 pixels per pass.
+ int src_stride_argb, // added to src_argb to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts, accumulated onto row 1.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra, // Produces 8 U and 8 V bytes from a 16x2 block of BGRA pixels per pass.
+ int src_stride_bgra, // added to src_bgra to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts (byte 3 of each pixel).
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts (byte 2 of each pixel).
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts (byte 1 of each pixel).
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr, // Produces 8 U and 8 V bytes from a 16x2 block of ABGR pixels per pass.
+ int src_stride_abgr, // added to src_abgr to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts (byte 2 of each pixel).
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts (byte 1 of each pixel).
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts (byte 0 of each pixel).
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba, // Produces 8 U and 8 V bytes from a 16x2 block of RGBA pixels per pass.
+ int src_stride_rgba, // added to src_rgba to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts (byte 1 of each pixel, written to q0).
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts (byte 2 of each pixel, written to q1).
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts (byte 3 of each pixel, written to q2).
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, // Produces 8 U and 8 V bytes from a 16x2 block of 3-byte RGB24 pixels per pass.
+ int src_stride_rgb24, // added to src_rgb24 to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts (byte 0 of each pixel).
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts (byte 1 of each pixel).
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts (byte 2 of each pixel).
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw, // Produces 8 U and 8 V bytes from a 16x2 block of 3-byte RAW pixels per pass.
+ int src_stride_raw, // added to src_raw to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient (halved: inputs are 2x averages)
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts (byte 2 of each pixel).
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts (byte 1 of each pixel).
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts (byte 0 of each pixel).
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of RGB565 pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+ int src_stride_rgb565, // added to src_rgb565 to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_rgb565
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels of the second row.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is number of ARGB1555 pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+ int src_stride_argb1555, // added to src_argb1555 to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb1555
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels of the second row.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average (sum of 4 pixels, rounded, / 2)
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// 16x2 pixels -> 8x1. width is number of ARGB4444 pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+ int src_stride_argb4444, // added to src_argb4444 to locate the second row
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // pixels; reduced by 16 per pass, loop repeats while > 0
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb4444
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels of the second row.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average, moved into q0-q2 for RGBTOUV
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16), 8 RGB565 pixels per pass.
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d27 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16), 8 ARGB1555 pixels per pass.
+ uint8_t* dst_y,
+ int width) { // pixels; reduced by 8 per pass, loop repeats while > 0
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d27 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16), 8 ARGB4444 pixels per pass.
+ uint8_t* dst_y,
+ int width) { // pixels; reduced by 8 per pass, loop repeats while > 0
+ asm volatile(
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB // macro defined elsewhere; presumably expands q0 into B/G/R bytes in d0/d1/d2 - confirm
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d27 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { // Y = sat(((66*R + 129*G + 25*B) rounded >> 8) + 16) with R, G, B taken from bytes 1, 2, 3; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA; byte 0 (d0) is not used in the sum.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d7 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Y = sat(((66*R + 129*G + 25*B) rounded >> 8) + 16) with R, G, B taken from bytes 0, 1, 2; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR; byte 3 (d3) is not used in the sum.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d7 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16) with B, G, R taken from bytes 1, 2, 3; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA; byte 0 (d0) is not used in the sum.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d7 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { // Y = sat(((25*B + 129*G + 66*R) rounded >> 8) + 16) with B, G, R taken from bytes 0, 1, 2 of each 3-byte pixel; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d7 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { // Y = sat(((66*byte0 + 129*byte1 + 25*byte2) rounded >> 8) + 16) per 3-byte pixel; 8 pixels per pass.
+ asm volatile(
+ "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R (byte 0 times the R coefficient in d4)
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B (byte 2 times the B coefficient in d6)
+ "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d7 \n" // saturating add of the 16 offset
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile( // Per pixel: YJ = (29 * B + 150 * G + 77 * R + 128) >> 8; no +16 offset is applied.
+ "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // do-while loop: the body always runs at least once.
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // B
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // R
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y (rounding, saturating narrow)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile( // Same math as the RGB24 YJ variant, but the B and R coefficient registers are swapped.
+ "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // do-while loop: the body always runs at least once.
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q4, d0, d4 \n" // R (byte 0 of each pixel, times 77)
+ "vmlal.u8 q4, d1, d5 \n" // G
+ "vmlal.u8 q4, d2, d6 \n" // B (byte 2 of each pixel, times 29)
+ "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y (rounding, saturating narrow)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
+}
+
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, 16 bytes per pass.
+void InterpolateRow_NEON(uint8_t* dst_ptr, // output row
+ const uint8_t* src_ptr, // first source row
+ ptrdiff_t src_stride, // byte offset from the first to the second source row
+ int dst_width, // bytes to produce; every loop consumes 16 per pass and runs at least once
+ int source_y_fraction) { // weight of the second row out of 256; 0 = copy first row, 128 = average
+ int y1_fraction = source_y_fraction; // local copy, because the asm modifies operand %4
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n" // fraction 0: plain copy of the first row
+ "add %2, %1 \n" // %2 becomes the pointer to the second row
+ "cmp %4, #128 \n"
+ "beq 50f \n" // fraction 128: rounding average of both rows
+
+ "vdup.8 d5, %4 \n" // d5 = fraction (weight of second row)
+ "rsb %4, #256 \n" // %4 = 256 - fraction
+ "vdup.8 d4, %4 \n" // d4 = 256 - fraction (weight of first row)
+ // General purpose row blend: (row0 * (256 - f) + row1 * f + 128) >> 8.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n" // 16 bytes of the first row
+ "vld1.8 {q1}, [%2]! \n" // 16 bytes of the second row
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n" // rounding narrow back to 8 bit
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n" // rounding halving add: (a + b + 1) >> 1
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n" // common exit
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr (s = src_argb0, d = src_argb1; output alpha is forced to 255)
+void ARGBBlendRow_NEON(const uint8_t* src_argb0, // pixels whose alpha drives the blend
+ const uint8_t* src_argb1, // pixels scaled by (256 - alpha) / 256
+ uint8_t* dst_argb, // blended output
+ int width) { // pixel count; groups of 8 first, then the remainder one pixel at a time
+ asm volatile(
+ "subs %3, #8 \n"
+ "blt 89f \n" // fewer than 8 pixels: skip the 8-pixel loop
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n" // loop while at least 8 pixels remain
+
+ "89: \n"
+ "adds %3, #8-1 \n" // undo the -8 bias, minus 1 for the bge loop below
+ "blt 99f \n" // no pixels left
+
+ // Blend 1 pixels.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n" // done
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
+}
+
+// Attenuate 8 pixels at a time: B, G and R become (c * a + 128) >> 8; alpha is stored unchanged.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) { // pixel count; the loop handles 8 per pass and runs at least once
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
+}
+
+// Quantize 8 ARGB pixels (32 bytes) in place; B, G and R are quantized, alpha is written back unchanged.
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // pixels, read and overwritten in place
+ int scale, // only the low 16 bits are used (vdup.u16)
+ int interval_size, // only the low 16 bits are used
+ int interval_offset, // only the low 16 bits are used
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB (no writeback; the store below advances %0).
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale (doubling multiply-high undoes the >> 1 above)
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n" // saturating narrow back to 8 bit
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
+}
+
+// Shade 8 pixels at a time by specified value (one 8-bit multiplier per channel, packed in a 32-bit word).
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in the low bank (d0 to d7 for 16-bit scalars).
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width, // pixel count; 8 per pass, the loop runs at least once
+ uint32_t value) { // packed per-channel multipliers, channels in the same byte order as the pixels
+ asm volatile(
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n" // saturating narrow back to 8 bit
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB (gray replicated into B, G and R; alpha unchanged).
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile( // 8 pixels per pass; the loop body always runs at least once.
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place; alpha is written back unchanged.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile( // 8 pixels per pass; the loop body always runs at least once.
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // RB coefficient
+ "vmov.u8 d29, #98 \n" // RG coefficient
+ "vmov.u8 d30, #50 \n" // RR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels (no writeback; the store below advances %0).
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B (truncating shift, saturating narrow)
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q10", "q11", "q12", // q8 holds the Sepia R accumulator
+ "q13", "q14", "q15");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix: each output channel is a signed weighted sum of b, g, r, a, shifted right by 6.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb, // 16 signed coefficients: 4 per output channel, in B, G, R, A output order
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // load 16 int8 matrix coefficients (4 rows of 4).
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time: each channel becomes (c0 * c1 + 128) >> 8.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time, saturating each channel at 255.
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1), 8 pixels at a time, saturating each channel at 0.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n" // replicate into G
+ "vmov.u8 d2, d0 \n" // replicate into R
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) { // pixel count; 16 per pass, the loop runs at least once
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel (their saturating sum) into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
+}
+
+// SobelX as a matrix is (output is the saturated absolute value of the sum)
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8_t* src_y0, // top row
+ const uint8_t* src_y1, // center row
+ const uint8_t* src_y2, // bottom row
+ uint8_t* dst_sobelx,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile( // Each pass loads 8 bytes at offset 0 and 8 bytes at offset 2 of every row, i.e. it reads 2 bytes past the 8 produced.
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n" // top, 2 columns to the right; 2 + 6 advances the pointer by 8
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n" // center difference is added twice for the weight of 2
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n" // saturating narrow to 8 bit
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is (output is the saturated absolute value of the sum)
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8_t* src_y0, // first row of the pair being differenced
+ const uint8_t* src_y1, // second row of the pair being differenced
+ uint8_t* dst_sobely,
+ int width) { // pixel count; 8 per pass, the loop runs at least once
+ asm volatile( // Each pass loads 8 bytes at offsets 0, 1 and 2 of both rows, i.e. it reads 2 bytes past the 8 produced.
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n" // center difference is added twice for the weight of 2
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right; 1 + 1 + 6 advances each pointer by 8
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n" // saturating narrow to 8 bit
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// The %y operand modifier passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and is indexed with [0] or [1] to access
+// the first or second float of the d-reg.
+
+void HalfFloat1Row_NEON(const uint16_t* src, // 16-bit unsigned input samples
+ uint16_t* dst, // 16-bit outputs taken from the shifted float bit patterns
+ float /*unused*/, // scale is ignored; this variant multiplies by the bare constant only
+ int width) { // sample count; 8 per pass, the loop runs at least once
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(1.9259299444e-34f) // %3 (approximately 2^-112)
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src, // 16-bit unsigned input samples
+ uint16_t* dst, // 16-bit outputs taken from the shifted float bit patterns
+ float scale, // multiplier applied to each sample before conversion
+ int width) { // sample count; 8 per pass, the loop runs at least once
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3 (scale times approximately 2^-112)
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src, // unsigned byte inputs
+ float* dst, // one float written per input byte
+ float scale, // multiplier applied to each converted value
+ int width) { // sample count; 8 per pass, the loop runs at least once
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row of unnormalized 32-bit sums.
+void GaussCol_NEON(const uint16_t* src0, // weight 1
+ const uint16_t* src1, // weight 4
+ const uint16_t* src2, // weight 6
+ const uint16_t* src3, // weight 4
+ const uint16_t* src4, // weight 1
+ uint32_t* dst, // one 32-bit sum per column
+ int width) { // sample count; 8 per pass, the loop runs at least once
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients, then shift right by 8, to produce each output sample.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1; // tap at offset 1 (weight 4)
+ const uint32_t* src2 = src + 2; // tap at offset 2 (weight 6)
+ const uint32_t* src3 = src + 3; // tap at offset 3 (weight 4)
+ asm volatile( // 8 outputs per pass; the loop body always runs at least once.
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n" // samples 8..11, no writeback
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // shift right by 8 and pack to 16 bit (vqshrn saturates but does not round)
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Convert biplanar NV21 to packed YUV24: each VU pair is duplicated for 2 pixels, output bytes are V, U, Y per pixel.
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) { // pixel count; 16 per pass, the loop runs at least once
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels (first 8 here, next 8 below)
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv, // first of two source rows
+ int src_stride_ayuv, // byte offset to the second source row
+ uint8_t* dst_uv, // interleaved output, U first: one pair per 2x2 block
+ int width) { // source pixel count; 16 per pass, the loop runs at least once
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // V: accumulate second row, 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // U: accumulate second row, 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv, // first of two source rows
+ int src_stride_ayuv, // byte offset to the second source row
+ uint8_t* dst_vu, // interleaved output, V first: one pair per 2x2 block
+ int width) { // source pixel count; 16 per pass, the loop runs at least once
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // V: accumulate second row, 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // U: accumulate second row, 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+// Copy row of AYUV Y's into Y (byte 2 of every 4-byte pixel), 16 pixels per pass.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile( // The loop body always runs at least once.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert UV plane of NV12 to VU of NV21 by swapping the two bytes of every pair, 16 pairs per pass.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile( // The loop body always runs at least once.
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u, // first U row
+ int src_stride_u, // byte offset to the second U row
+ const uint8_t* src_v, // first V row
+ int src_stride_v, // byte offset to the second V row
+ uint8_t* dst_uv, // interleaved U, V output: one pair per 2x2 source block
+ int width) { // source sample count; 16 per pass, the loop runs at least once
+ const uint8_t* src_u_1 = src_u + src_stride_u; // second U row
+ const uint8_t* src_v_1 = src_v + src_stride_v; // second V row
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n" // 16 U values from the second row
+ "vld1.8 {q3}, [%3]! \n" // 16 V values from the second row
+ "vpaddl.u8 q0, q0 \n" // half size
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // add the second row's horizontal pairs
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n" // rounded 2x2 average: (sum + 2) >> 2
+ "vqrshrn.u16 d1, q1, #2 \n"
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_neon64.cc b/third_party/aom/third_party/libyuv/source/row_neon64.cc
new file mode 100644
index 0000000000..d5258a3aef
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_neon64.cc
@@ -0,0 +1,3387 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+// Result: v0.8b = Y, v1.s[0] = 4 U bytes, v1.s[1] = 4 V bytes.
+// Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer (post-incremented).
+#define READYUV422 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ "ld1 {v1.s}[1], [%2], #4 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+// Adjacent U bytes (and adjacent V bytes) are summed pairwise and halved with
+// rounding, leaving 4 U in the low half and 4 V in the high half of v1.8b.
+#define READYUV444 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+// Only %0 (Y pointer) is read; every byte of v1.8b is set to 128.
+#define READYUV400 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+// %1 points at interleaved U,V bytes; they are de-interleaved so that
+// v1.s[0] = 4 U and v1.s[1] = 4 V. Uses v2 and v3 as scratch.
+#define READNV12 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 Y and 4 VU from NV21
+// Same as READNV12 with the byte order of each chroma pair swapped: the even
+// bytes go to v1.s[1] and the odd bytes to v1.s[0]. Uses v2 and v3 as scratch.
+#define READNV21 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 YUY2
+// Reads 16 bytes from %0: even bytes (Y) land in v0.8b, odd bytes (chroma) are
+// split so that v1.s[0] holds their even members and v1.s[1] their odd
+// members. Uses v3 as scratch.
+#define READYUY2 \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 UYVY
+// Reads 16 bytes from %0: odd bytes (Y) are moved to v0.8b, even bytes
+// (chroma) are split into v1.s[0] / v1.s[1]. Uses v2 and v3 as scratch.
+#define READUYVY \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Loads the conversion constants used by YUVTORGB:
+//   v24, v25, v26 <- kUVBiasBGR (three 16-bit values, each replicated)
+//   v27, v28      <- kUVToRB (de-interleaved 16-bit lanes)
+//   v29, v30      <- kUVToG  (de-interleaved 16-bit lanes)
+//   v31           <- kYToRgb (one 32-bit value replicated)
+// An asm statement that expands this macro writes v24-v31, so all eight
+// registers belong in its clobber list.
+#define YUVTORGB_SETUP \
+ "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+ "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+
+// clang-format off
+
+// Converts 8 pixels. Inputs: v0.8b = 8 Y, v1.8b = 4 U (low half) + 4 V (high
+// half), constants in v24-v31 as loaded by YUVTORGB_SETUP. Outputs: 8-bit
+// R, G, B in the low halves of the registers named by vR, vG, vB.
+// Scratch registers written: v0, v1, v2, v3, v5, v6, v7.
+#define YUVTORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v27.8h, v1.8h \n" \
+ "mul v5.8h, v29.8h, v1.8h \n" \
+ "mul v6.8h, v30.8h, v2.8h \n" \
+ "mul v7.8h, v28.8h, v2.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+
+// clang-format on
+
+// Converts I444 (separate Y, U, V planes, 8 bytes read from each per
+// iteration) to 8 ARGB pixels per iteration, stored as B, G, R, A with
+// A = 255. The loop body runs before the count is tested, so at least 8
+// pixels are always written.
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV444
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts I422 (8 Y, 4 U, 4 V read per iteration) to 8 ARGB pixels per
+// iteration, stored as B, G, R, A with A = 255. The loop body runs before the
+// count is tested, so at least 8 pixels are always written.
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" /* A */
+
+ "1: \n"
+ READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts I422 plus a separate alpha plane to 8 ARGB pixels per iteration,
+// stored as B, G, R, A. Alpha is copied unchanged, 8 bytes per iteration,
+// from src_a. The loop body runs before the count is tested.
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "ld1 {v23.8b}, [%3], #8 \n" // load 8 alpha bytes
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "subs %w5, %w5, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts I422 to 8 RGBA pixels per iteration. Bytes are stored in the order
+// A, B, G, R (v20..v23) with A = 255. The loop body runs before the count is
+// tested, so at least 8 pixels are always written.
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v20.8b, #255 \n" /* A */
+ "1: \n"
+ READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v23, v22, v21)
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts I422 to 8 RGB24 pixels (24 bytes, stored B, G, R) per iteration.
+// The loop body runs before the count is tested, so at least 8 pixels are
+// always written.
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "subs %w4, %w4, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Packs B = v20.8b, G = v21.8b, R = v22.8b into 8 RGB565 pixels in v0.8h
+// (top 5 bits R, middle 6 bits G, low 5 bits B). Overwrites v20 and v21 with
+// their widened copies.
+#define ARGBTORGB565 \
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
+
+// clang-format off
+
+// Converts I422 to 8 RGB565 pixels (16 bytes) per iteration. The loop body
+// runs before the count is tested, so at least 8 pixels are always written.
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31");
+}
+
+// Packs B = v20.8b, G = v21.8b, R = v22.8b, A = v23.8b into 8 ARGB1555 pixels
+// in v0.8h (1 bit A, then 5 bits each of R, G, B). Overwrites v20, v21 and
+// v22 with their widened copies; v23 is only read.
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+// Converts I422 to 8 ARGB1555 pixels (16 bytes) per iteration with the alpha
+// bit set (v23 = 255). The loop body runs before the count is tested, so at
+// least 8 pixels are always written.
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ ARGBTOARGB1555
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31");
+}
+// clang-format on
+
+// Packs four 8-bit channels into 8 ARGB4444 pixels in v0 by keeping the top
+// 4 bits of each channel. Overwrites v1, v20, v21, v22 and v23; v4 is only
+// read.
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+// Converts I422 to 8 ARGB4444 pixels (16 bytes) per iteration with alpha set
+// to its maximum. The loop body runs before the count is tested, so at least
+// 8 pixels are always written.
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ READYUV422
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #8 \n"
+ // v23 is reloaded every iteration because ARGBTOARGB4444 overwrites it.
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB4444
+ "prfm pldl1keep, [%1, 128] \n"
+ "prfm pldl1keep, [%2, 128] \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts a Y-only row to 8 ARGB pixels per iteration (stored B, G, R, A
+// with A = 255). U and V are fixed at 128 by READYUV400 and the result goes
+// through the same YUVTORGB math as the colour paths. The loop body runs
+// before the count is tested.
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READYUV400
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Expands 8 grey bytes per iteration to 8 ARGB pixels by copying each source
+// byte into the B, G and R channels and setting A = 255. No colour-matrix
+// math is applied. The loop body runs before the count is tested.
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ "ld1 {v20.8b}, [%0], #8 \n" // load 8 Y -> B
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n" // G = Y
+ "orr v22.8b, v20.8b, v20.8b \n" // R = Y
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23");
+}
+
+// Converts NV12 (Y plane plus interleaved UV plane; 8 Y and 8 UV bytes read
+// per iteration) to 8 ARGB pixels per iteration, stored B, G, R, A with
+// A = 255. The loop body runs before the count is tested.
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READNV12
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts NV21 (Y plane plus interleaved VU plane; 8 Y and 8 VU bytes read
+// per iteration) to 8 ARGB pixels per iteration, stored B, G, R, A with
+// A = 255. The loop body runs before the count is tested.
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READNV21
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts NV12 to 8 RGB24 pixels (24 bytes, stored B, G, R) per iteration.
+// The loop body runs before the count is tested, so at least 8 pixels are
+// always written.
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READNV12
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts NV21 to 8 RGB24 pixels (24 bytes, stored B, G, R) per iteration.
+// The loop body runs before the count is tested, so at least 8 pixels are
+// always written.
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READNV21
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts NV12 to 8 RGB565 pixels (16 bytes) per iteration. The loop body
+// runs before the count is tested, so at least 8 pixels are always written.
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ READNV12
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ ARGBTORGB565
+ "prfm pldl1keep, [%1, 256] \n"
+ "subs %w3, %w3, #8 \n"
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31");
+}
+
+// Converts packed YUY2 (16 bytes read per iteration) to 8 ARGB pixels per
+// iteration, stored B, G, R, A with A = 255. The loop body runs before the
+// count is tested.
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READYUY2
+ "prfm pldl1keep, [%0, 448] \n"
+ YUVTORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Converts packed UYVY (16 bytes read per iteration) to 8 ARGB pixels per
+// iteration, stored B, G, R, A with A = 255. The loop body runs before the
+// count is tested.
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n" // A
+ "1: \n"
+ READUYVY
+ YUVTORGB(v22, v21, v20)
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ // v31 is written by YUVTORGB_SETUP (kYToRgb), so it must be listed.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
+ "v31"
+ );
+}
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+// 32 source bytes are consumed and 16 bytes written to each destination per
+// iteration. The loop body runs before the count is tested, so at least one
+// full iteration is always performed.
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+// 16 bytes are consumed from each source and 32 bytes written to dst_uv per
+// iteration. The loop body runs before the count is tested, so at least one
+// full iteration is always performed.
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+// 48 source bytes are de-interleaved per iteration: byte 0 of each triple
+// goes to dst_r, byte 1 to dst_g, byte 2 to dst_b. The loop body runs before
+// the count is tested.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+// 16 bytes are consumed from each source plane and 48 interleaved bytes are
+// written to dst_rgb per iteration. The loop body runs before the count is
+// tested, so at least one full iteration is always performed.
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ // One prefetch hint per source plane; a second hint for [%0, 448] after
+ // the store would target the same address and is not issued.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// Copy multiple of 32.
+// Copies 32 bytes per iteration from src to dst. The loop body runs before
+// the count is tested and repeats while the remaining count is > 0, so the
+// number of bytes copied is `width` rounded up to a multiple of 32 (at least
+// 32).
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+// Stores happen in 16-byte units and the loop body runs before the count is
+// tested, so the number of bytes actually written is `width` rounded up to a
+// multiple of 16 (at least 16).
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0");
+}
+
+// Fills dst with the 32-bit value v32, 4 values (16 bytes) per iteration.
+// `width` counts 32-bit values. The loop body runs before the count is
+// tested, so at least 16 bytes are always written.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0");
+}
+
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// Writes the bytes of src to dst in reverse order, 32 bytes per iteration.
+// The source pointer starts at src + width - 32 and walks backwards while
+// dst walks forwards. The loop body runs before the count is tested.
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n" // upper 16 bytes of the 32-byte group
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n" // reversed upper half goes first
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for reversing the UV.
+// Reverses the order of the eight 2-byte pairs while keeping the byte order
+// inside each pair.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+// Reverses a row of 2-byte UV pairs, 16 pairs (32 bytes) per iteration.
+// `width` counts pairs: the source pointer starts at src_uv + 2 * width - 32
+// and walks backwards. The loop body runs before the count is tested.
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Reverses a row of interleaved UV pairs and splits it into separate planes:
+// per iteration 16 pairs (32 bytes) are read from the end of src_uv moving
+// backwards, and 16 bytes are written to each of dst_u and dst_v. `width`
+// counts pairs. The loop body runs before the count is tested.
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n" // reverse pair order
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+// Reverses the order of the four 4-byte pixels while keeping the byte order
+// inside each pixel.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+// Reverses a row of 4-byte pixels, 8 pixels (32 bytes) per iteration.
+// The source pointer starts at src_argb + 4 * width - 32 and walks backwards.
+// The loop body runs before the count is tested.
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Reverses a row of 3-byte pixels, 16 pixels (48 bytes) per iteration.
+// The source pointer starts at src_rgb24 + 3 * width - 48 and moves back by
+// 48 bytes per iteration (register post-index %3 = -48). Each de-interleaved
+// channel is byte-reversed with kShuffleMirror and re-interleaved on store.
+// The loop body runs before the count is tested.
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #48 \n"
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "tbl v2.16b, {v2.16b}, v3.16b \n"
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Expands 8 RGB24 pixels (24 bytes) per iteration to 8 ARGB pixels (32 bytes)
+// by keeping the three source bytes in order and appending a fourth byte of
+// 255. The loop body runs before the count is tested.
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+// Converts 8 RAW pixels (3 bytes each, read as r, g, b) per iteration to 8
+// ARGB pixels stored as b, g, r, a with a = 255. The loop body runs before
+// the count is tested.
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+// Converts 8 RAW pixels (3 bytes each, read as r, g, b) per iteration to 8
+// RGBA pixels stored as a, b, g, r with a = 255. The loop body runs before
+// the count is tested.
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "orr v1.8b, v5.8b, v5.8b \n" // move b (third source byte)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+// Swaps the first and third byte of every 3-byte pixel, 8 pixels (24 bytes)
+// per iteration: source order r, g, b becomes b, g, r. The loop body runs
+// before the count is tested.
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+// Expands 8 RGB565 pixels in v0.8h to 8-bit channels: B in v0.8b, G in
+// v1.8b, R in v2.8b. Each channel's high bits are replicated into its low
+// bits. Uses v4 and v6 as scratch.
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
+// Converts 8 RGB565 pixels (16 bytes) per iteration to 8 ARGB pixels stored
+// as B, G, R, A with A = 255. The loop body runs before the count is tested.
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+
+// Expands 8 ARGB1555 pixels in v0.8h to 8-bit channels: B in v0.8b, G in
+// v1.8b, R in v2.8b, A in v3.8b. The alpha bit is sign-extended to 0x00 or
+// 0xff; colour channels replicate their high bits into the low bits.
+// v3 is written before it is read, so its previous contents are ignored.
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+// Outputs B in v0.8b, G in v1.8b, R in v2.8b; v3 is used as scratch and does
+// not hold alpha afterwards.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */
+
+// Converts 8 ARGB1555 pixels (16 bytes) per iteration to 8 ARGB pixels
+// stored as B, G, R, A. Alpha comes from bit 15 of each source pixel
+// (0x00 or 0xff); ARGB1555TOARGB produces it in v3, so no constant alpha is
+// preloaded. The loop body runs before the count is tested.
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b a = v3.8b
+// v3 is overwritten: after the macro its low half holds the expanded alpha
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" /* B: copy upper half of v2 */ \
+ "dup v1.2D, v3.D[1] \n" /* G: copy upper half of v3 */
+
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, // in: 2 bytes per pixel
+ uint8_t* dst_argb, // out: 4 bytes per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB // expand each 4-bit channel to 8 bits: v0=B v1=G v2=R v3=A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List (v4 is listed although nothing here or in ARGB4444TOARGB writes it)
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_rgb24, // out: 3 bytes per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24; the 4th plane (v4) is dropped
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { // 4-byte pixels in, 3-byte pixels out with byte 0 and byte 2 swapped
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g (overwrites the loaded alpha plane)
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // copies the even source bytes (Y) to dst_y
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2; even bytes -> v0, odd bytes -> v1
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // copies the odd source bytes (Y) to dst_y
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY; even bytes -> v0, odd bytes -> v1
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, // in: 4 bytes per 2 pixels
+ uint8_t* dst_u, // out: byte 1 of every 4-byte group
+ uint8_t* dst_v, // out: byte 3 of every 4-byte group
+ int width) { // pixel count, consumed 16 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, // in: 4 bytes per 2 pixels
+ uint8_t* dst_u, // out: byte 0 of every 4-byte group
+ uint8_t* dst_v, // out: byte 2 of every 4-byte group
+ int width) { // pixel count, consumed 16 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, // in: first of two source rows
+ int stride_yuy2, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: row-averaged U
+ uint8_t* dst_v, // out: row-averaged V
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; // second source row
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U (rounding halving add)
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V (rounding halving add)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy, // in: first of two source rows
+ int stride_uyvy, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: row-averaged U
+ uint8_t* dst_v, // out: row-averaged V
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; // second source row
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U (rounding halving add)
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V (rounding halving add)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Reorders bytes within each 16-byte group using a 16-entry index table.
+void ARGBShuffleRow_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_argb, // out: 4 bytes per pixel
+ const uint8_t* shuffler, // 16 byte indices, read once before the loop
+ int width) { // pixel count, consumed 4 per iteration
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8_t* src_y, // in: 16 bytes read per iteration
+ const uint8_t* src_u, // in: 8 bytes read per iteration
+ const uint8_t* src_v, // in: 8 bytes read per iteration
+ uint8_t* dst_yuy2, // out: 32 bytes written per iteration
+ int width) { // pixel count, consumed 16 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys (even -> v0, odd -> v1)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch Y 448 bytes ahead
+ "orr v2.8b, v1.8b, v1.8b \n" // move odd Ys to v2 so v1 can hold U
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void I422ToUYVYRow_NEON(const uint8_t* src_y, // in: 16 bytes read per iteration
+ const uint8_t* src_u, // in: 8 bytes read per iteration
+ const uint8_t* src_v, // in: 8 bytes read per iteration
+ uint8_t* dst_uyvy, // out: 32 bytes written per iteration
+ int width) { // pixel count, consumed 16 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys (even -> v1, odd -> v2)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch Y 448 bytes ahead
+ "orr v3.8b, v2.8b, v2.8b \n" // move odd Ys to v3 so v2 can hold V
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_rgb565, // out: 2 bytes per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565 // macro defined elsewhere in the file; presumably packs v20-v22 into v0 - confirm
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+// Converts a row of 4-byte ARGB pixels to RGB565 with ordered dithering.
+// dither4 holds four dither bytes; they are broadcast across v1 so the pixel
+// at column x gets byte (x & 3) added (saturating) to each of its first three
+// channels before packing. Processes 8 pixels per iteration; the loop body
+// always runs at least once and repeats while the remaining width is > 0.
+//
+// src_argb and width are advanced/decremented by the asm, so they must be
+// read-write ("+r") operands; declaring them as plain inputs lets the
+// compiler assume the registers are unchanged. Operand numbering matches
+// ARGBToRGB565Row_NEON (%0 src, %1 dst, %2 width).
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ asm volatile(
+ "dup v1.4s, %w3 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n" // saturating add of dither
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb), // %1
+ "+r"(width) // %2
+ : "r"(dither4) // %3
+ : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_argb1555, // out: 2 bytes per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555 // macro defined elsewhere in the file; presumably packs v20-v23 into v0 - confirm
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_argb4444, // out: 2 bytes per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "movi v4.16b, #0x0f \n" // bits to clear with
+ // vbic (low nibble mask, set once before the loop).
+ "1: \n" // loop body always runs at least once
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444 // macro defined elsewhere in the file; presumably packs v20-v23 into v0 using v4 - confirm
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Y = sat8(round((25*B + 129*G + 66*R) / 256)) + 16 (saturating), 8 pixels per iteration
+ asm volatile(
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B (v3 alpha plane is reused as the 16-bit accumulator)
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_a, // out: byte 3 of every pixel
+ int width) { // pixel count, consumed 16 per iteration
+ asm volatile(
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Y = sat8(round((29*B + 150*G + 77*R) / 256)), no offset added, 8 pixels per iteration
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B (v3 alpha plane is reused as the 16-bit accumulator)
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { // same weights as ARGBToYJRow_NEON but applied to bytes 1..3 of each pixel (byte 0 is skipped)
+ asm volatile(
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v1.8b, v4.8b \n" // B (v0 is reused as the 16-bit accumulator)
+ "umlal v0.8h, v2.8b, v5.8b \n" // G
+ "umlal v0.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
+ "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// 8x1 pixels. One U and one V per pixel: U = (112*B - 74*G - 38*R + 0x8080) >> 8, V = (112*R - 94*G - 18*B + 0x8080) >> 8.
+void ARGBToUV444Row_NEON(const uint8_t* src_argb, // in: 4 bytes per pixel
+ uint8_t* dst_u, // out: 1 byte per pixel
+ uint8_t* dst_v, // out: 1 byte per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "movi v24.8b, #112 \n" // UB / VR 0.875
+ // coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5 (0x8080 when read as 16-bit lanes)
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+ "v27", "v28", "v29");
+}
+
+#define RGBTOUV_SETUP_REG /* loads the constants in v20-v25 that RGBTOUV reads */ \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. QB/QG/QR are 8h registers; reads v20-v25, writes v3/v4, leaves U in v0.8b and V in v1.8b.
+// clang-format off
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+// clang-format on
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+void ARGBToUVRow_NEON(const uint8_t* src_argb, // in: first of two source rows
+ int src_stride_argb, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ARGBToUVJRow_NEON(const uint8_t* src_argb, // in: first of two source rows
+ int src_stride_argb, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb; // second source row
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // uses the coefficients loaded above; U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8_t* src_bgra, // in: first of two source rows
+ int src_stride_bgra, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n" // G sum moves from v3 to v1 here
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr, // in: first of two source rows
+ int src_stride_abgr, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average (B sum moves from v3 to v0 here)
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h) // B = v0, G = v2, R = v1; U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba, // in: first of two source rows
+ int src_stride_rgba, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, // in: first of two source rows, 3 bytes per pixel
+ int src_stride_rgb24, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8_t* src_raw, // in: first of two source rows, 3 bytes per pixel
+ int src_stride_raw, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw; // second source row
+ asm volatile (
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h) // B = v2, G = v1, R = v0; U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. Each row is read as two 8-pixel halves whose pair sums are merged before RGBTOUV.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, // in: first of two source rows, 2 bytes per pixel
+ int src_stride_rgb565, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; // second source row
+ asm volatile(
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ RGB565TOARGB // macro defined earlier in the file; the B/G/R planes are read from v0/v1/v2 below
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n" // join the two 4-short halves into 8 shorts (B)
+ "ins v17.D[1], v27.D[0] \n" // (G)
+ "ins v18.D[1], v28.D[0] \n" // (R)
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. Each row is read as two 8-pixel halves whose pair sums are merged before RGBTOUV.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, // in: first of two source rows, 2 bytes per pixel
+ int src_stride_argb1555, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; // second source row
+ asm volatile(
+ RGBTOUV_SETUP_REG // constants in v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ RGB555TOARGB // v0=B v1=G v2=R, alpha bit ignored
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n" // join the two 4-short halves into 8 shorts (B)
+ "ins v17.D[1], v27.D[0] \n" // (G)
+ "ins v18.D[1], v28.D[0] \n" // (R)
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
+}
+
+// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. Each row is read as two 8-pixel halves whose pair sums are merged before RGBTOUV.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, // in: first of two source rows, 2 bytes per pixel
+ int src_stride_argb4444, // byte offset from the first row to the second
+ uint8_t* dst_u, // out: one U per 2x2 block
+ uint8_t* dst_v, // out: one V per 2x2 block
+ int width) { // pixel count, consumed 16 per iteration
+ const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; // second source row
+ asm volatile(
+ RGBTOUV_SETUP_REG // sets v20-v25
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ ARGB4444TOARGB // v0=B v1=G v2=R (v3=A is not used here)
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n" // join the two 4-short halves into 8 shorts (B)
+ "ins v17.D[1], v27.D[0] \n" // (G)
+ "ins v18.D[1], v28.D[0] \n" // (R)
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // U -> v0.8b, V -> v1.8b
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28"
+
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { // unpack 2-byte pixels, then Y = sat8(round((25*B + 129*G + 66*R) / 256)) + 16 (saturating)
+ asm volatile(
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB // macro defined earlier in the file; the B/G/R planes are read from v0/v1/v2 below
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+ "v27");
+}
+
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, // in: 2 bytes per pixel
+ uint8_t* dst_y, // out: 1 byte per pixel
+ int width) { // pixel count, consumed 8 per iteration
+ asm volatile(
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop body always runs at least once
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB // macro defined earlier in the file; the B/G/R planes are read from v0/v1/v2 below
+ "umull v3.8h, v0.8b, v4.8b \n" // B (overwrites v3 from the macro)
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while remaining width > 0
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile( // ARGB4444 -> Y: ((25*B + 129*G + 66*R + 128) >> 8) + 16, 8 pixels per pass.
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels (16 bytes), advance source.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB // macro defined outside this chunk; presumably unpacks to B=v0, G=v1, R=v2 - confirm.
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v27.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile( // Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16; R/G/B are bytes 1/2/3 of each pixel.
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile( // Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16; R/G/B are bytes 0/1/2 of each pixel.
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile( // Y = ((25*B + 129*G + 66*R + 128) >> 8) + 16; B/G/R are bytes 1/2/3 of each pixel.
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ asm volatile( // Y = ((25*B + 129*G + 66*R + 128) >> 8) + 16; B/G/R are bytes 0/1/2 of each 3-byte pixel.
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ asm volatile( // Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16; R/G/B are bytes 0/1/2 of each 3-byte pixel.
+ "movi v6.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v4.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n" // loop head; the body always executes at least once.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R (byte 0 times the R coefficient in v4)
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B (byte 2 times the B coefficient in v6)
+ "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ asm volatile( // YJ = (29*B + 150*G + 77*R + 128) >> 8, no +16 offset; 8 pixels per pass.
+ "movi v4.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v6.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // loop head; the body always executes at least once.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // B (v0 is widened in place)
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ asm volatile( // YJ = (77*R + 150*G + 29*B + 128) >> 8, no +16 offset; R/G/B are bytes 0/1/2.
+ "movi v6.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v5.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v4.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // loop head; the body always executes at least once.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels, de-interleaved by byte.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // R (byte 0 times the R coefficient in v4)
+ "umlal v0.8h, v1.8b, v5.8b \n" // G
+ "umlal v0.8h, v2.8b, v6.8b \n" // B (byte 2 times the B coefficient in v6)
+ "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_raw), // %0
+ "+r"(dst_yj), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Bilinear filter 16x2 -> 16x1
+// Blends the row at src_ptr with the row at src_ptr + src_stride:
+//   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, with f = source_y_fraction.
+// f == 0 copies row0 unchanged and f == 128 uses a rounding average; both
+// are taken as fast paths. 16 bytes are produced per pass and every loop
+// body runs at least once, so a dst_width that is not a multiple of 16
+// writes a full final group of 16 bytes.
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ asm volatile(
+ "cmp %w4, #0 \n" // fraction 0: copy path
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n" // fraction 128: 50/50 path
+ "b.eq 50f \n"
+
+ "dup v5.16b, %w4 \n" // weight of the second row
+ "dup v4.16b, %w5 \n" // weight of the first row (256 - fraction)
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n" // low 8: row0 * y0
+ "umull2 v3.8h, v0.16b, v4.16b \n" // high 8: row0 * y0
+ "umlal v2.8h, v1.8b, v5.8b \n" // low 8: += row1 * y1
+ "umlal2 v3.8h, v1.16b, v5.16b \n" // high 8: += row1 * y1
+ "rshrn v0.8b, v2.8h, #8 \n" // rounding >> 8, narrow
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n" // (row0 + row1 + 1) >> 1
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ // v2 is written by the general blend path, so it must be listed here.
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr (s = src_argb0, d = src_argb1; output alpha is forced to 255).
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile( // 8-pixel loop first, then a 1-pixel loop for the remainder; width 0 stores nothing.
+ "subs %w3, %w3, #8 \n" // fewer than 8 pixels: skip the 8-pixel loop.
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n" // at least 8 more pixels remain.
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n" // width = leftover pixels - 1.
+ "b.lt 99f \n" // nothing left over.
+
+ // Blend 1 pixels.
+ "1: \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18");
+}
+
+// Attenuate 8 pixels at a time: each of b, g, r becomes (c * a + 128) >> 8; alpha is stored unchanged.
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n" // loop head; the body always executes at least once.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the source pointer.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Quantize 8 ARGB pixels (32 bytes) in place; alpha is stored back unchanged.
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ asm volatile(
+ "dup v4.8h, %w2 \n" // low 16 bits of scale in every lane
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB (no advance; the store advances).
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale (doubling high-half multiply undoes the >> 1)
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n" // saturate back to 8 bits
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. (sqrdmulh below is the A64 spelling.)
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ asm volatile(
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n" // saturate back to 8 bits
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB. Alpha is stored unchanged.
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n" // loop head; the body always executes at least once.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G (copy of the gray value)
+ "orr v2.8b, v0.8b, v0.8b \n" // R (copy of the gray value)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place. Alpha is stored unchanged.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // RB coefficient
+ "movi v29.8b, #98 \n" // RG coefficient
+ "movi v30.8b, #50 \n" // RR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels (no advance; the store advances).
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B (truncating, saturating)
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ asm volatile( // each output channel is a 4-term signed dot product of (b, g, r, a), shifted right by 6.
+ "ld1 {v2.16b}, [%3] \n" // load 16 signed 8-bit matrix coefficients.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B (>> 6, clamped to 0..255)
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v22", "v23", "v24", "v25");
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time: each channel is (a * b + 128) >> 8.
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time, saturating each channel at 255.
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n" // channel from byte 0 of each pixel
+ "uqadd v1.8b, v1.8b, v5.8b \n" // byte 1
+ "uqadd v2.8b, v2.8b, v6.8b \n" // byte 2
+ "uqadd v3.8b, v3.8b, v7.8b \n" // byte 3
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1), 8 pixels at a time, saturating each channel at 0.
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n" // channel from byte 0 of each pixel
+ "uqsub v1.8b, v1.8b, v5.8b \n" // byte 1
+ "uqsub v2.8b, v2.8b, v6.8b \n" // byte 2
+ "uqsub v3.8b, v3.8b, v7.8b \n" // byte 3
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n" // copy the sum into the G lane
+ "orr v2.8b, v0.8b, v0.8b \n" // copy the sum into the R lane
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into plane, 16 bytes per pass.
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1");
+}
+
+// Mixes Sobel X, Sobel Y and Sobel (their saturating sum) into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+// The code computes column x minus column x + 2 and stores the saturated absolute value.
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n" // same row, 2 bytes on; 2 + 6 advances 8 in total.
+ "prfm pldl1keep, [%0, 448] \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n" // added twice for the weight of 2
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n" // magnitude
+ "uqxtn v0.8b, v0.8h \n" // saturate to 8 bits
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+// The code computes src_y0 minus src_y1 at columns x, x + 1 (twice) and x + 2, and stores the saturated absolute value.
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n" // added twice for the weight of 2
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right; 1 + 1 + 6 advances 8 in total.
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n" // magnitude
+ "uqxtn v0.8b, v0.8h \n" // saturate to 8 bits
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+// Caveat - rounds float to half float whereas scaling version truncates. The float parameter is ignored (no scaling).
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+ "1: \n" // loop head; the body always executes at least once.
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 ints (zero-extended)
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile( // scales 8 uint16 samples per pass and packs float bit patterns down to 16 bits.
+ "1: \n" // loop head; the body always executes at least once.
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 ints (zero-extended)
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat (shift the float bits right by 13, truncating)
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3; the constant is approximately 2^-112, presumably the float-to-half exponent rebias - confirm.
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile( // dst[i] = (float)src[i] * scale, 8 samples per pass.
+ "1: \n" // loop head; the body always executes at least once.
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fmax; // receives the maximum of the unscaled inputs; the accumulators start at 0.
+ asm volatile( // writes src * scale to dst, 8 samples per pass, while tracking the maximum input.
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max accumulator
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fmax) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fsum; // receives the sum of squares of the unscaled inputs.
+ asm volatile( // writes src * scale to dst, 8 samples per pass, while accumulating src * src.
+ "movi v5.4s, #0 \n" // sum of squares accumulator
+ "movi v6.4s, #0 \n" // sum of squares accumulator
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ "faddp v5.4s, v5.4s, v6.4s \n" // pairwise add: 8 partial sums to 4
+ "faddp v5.4s, v5.4s, v5.4s \n" // 4 to 2
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fsum) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+ asm volatile( // dst[i] = src[i] * scale, 8 samples per pass.
+ "1: \n" // loop head; the body always executes at least once.
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row of unnormalised 32-bit sums.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile( // dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4, 8 samples per pass.
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// Horizontal pass: filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients, then round, shift right by 8 and saturate to 16 bits.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1; // tap 1 (* 4)
+ const uint32_t* src2 = src + 2; // tap 2, center (* 6)
+ const uint32_t* src3 = src + 3; // tap 3 (* 4)
+ asm volatile(
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n" // 8 samples starting at src + 2
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n" // 8 samples starting at src + 1
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n" // 8 samples starting at src + 3
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ : "r"(32LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// Vertical pass: filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. The sum is stored unscaled.
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n" // src1
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n" // src2: center row
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n" // src3
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n" // src4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n"
+ "fadd v1.4s, v1.4s, v5.4s \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Horizontal pass: filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients and scale the sum by 1/256.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples at src,
+ // then advance to src + 2
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n" // 8 samples at src + 2, then back to src + 1
+ "fadd v1.4s, v1.4s, v2.4s \n"
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n" // 8 samples at src + 1, then on to src + 3
+ "fmla v1.4s, v5.4s, v7.4s \n"
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n" // 8 samples at src + 3, then on to src + 8
+ "fadd v2.4s, v2.4s, v4.4s \n" // sum the two * 4 taps first
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4
+ "r"(-4LL), // %5
+ "r"(20LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
+// Convert biplanar NV21 to packed YUV24. Each output pixel is written as V, U, Y bytes; each VU pair is used for 2 pixels.
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,  // 2x2 average of U and V from two AYUV rows, stored interleaved as UV
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; // second source row
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n" // U goes to v2 so st2 writes U before V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,  // 2x2 average of V and U from two AYUV rows, stored interleaved as VU
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; // second source row
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n" // V stays in v0, U in v1, so st2 writes V first
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Copy row of AYUV Y's into Y. Y is the third byte of each 4-byte AYUV group as loaded here.
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21 by swapping each pair of bytes. 16 UV pairs (32 bytes) per loop.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n" // second 16 bytes of the 32 consumed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n" // swap adjacent bytes
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,  // 2x2 average of U and V planes, stored interleaved as UV
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_u_1 = src_u + src_stride_u; // second U row
+ const uint8_t* src_v_1 = src_v + src_stride_v; // second V row
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n" // 16 U values from the second row
+ "ld1 {v3.16b}, [%3], #16 \n" // 16 V values from the second row
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // accumulate second row pairs
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n" // rounded sum of 4 / 4
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n" // body always runs once; repeats while width > 0
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_win.cc b/third_party/aom/third_party/libyuv/source/row_win.cc
new file mode 100644
index 0000000000..9afcf060a4
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_win.cc
@@ -0,0 +1,6237 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+// This module is for Visual C 32/64 bit and clangcl 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+
+#if defined(_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h> // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// 64 bit
+#if defined(_M_X64)
+
+// Read 4 U and 4 V (V read at u_buf + offset), interleave and duplicate to 8 UV in xmm0; read 8 Y, each byte doubled, into xmm4. Advances u_buf, y_buf.
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8;
+
+// Read 4 UV from 422, upsample to 8 UV in xmm0, 8 Y (bytes doubled) in xmm4. With 8 Alpha in xmm5. Advances u_buf, y_buf, a_buf.
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+ a_buf += 8;
+
+// Convert 8 pixels: 8 UV (xmm0) and 8 Y (xmm4). Leaves 8-bit B in xmm0, G in xmm1, R in xmm2 (6-bit fraction shifted out, saturated).
+#define YUVTORGB(yuvconstants) \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm2 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+ xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
+ xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
+ xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm0 = _mm_adds_epi16(xmm0, xmm4); \
+ xmm1 = _mm_adds_epi16(xmm1, xmm4); \
+ xmm2 = _mm_adds_epi16(xmm2, xmm4); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+
+// Store 8 ARGB values: interleave xmm0, xmm1, xmm2, xmm5 bytes per pixel, write 32 bytes and advance dst_argb.
+#define STOREARGB \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
+ xmm1 = _mm_loadu_si128(&xmm0); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
+ _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
+ _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+ dst_argb += 32;
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Local names are fixed: READYUV422, YUVTORGB and STOREARGB refer to them.
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;  // V relative to U
+ const __m128i xmm5 = _mm_set1_epi8(-1);  // 0xff in every alpha byte
+ __m128i xmm0, xmm1, xmm2, xmm4;
+ for (; width > 0; width -= 8) {  // 8 pixels per pass
+ READYUV422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ }
+}
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Local names are fixed: READYUVA422, YUVTORGB and STOREARGB refer to them.
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;  // V relative to U
+ __m128i xmm0, xmm1, xmm2, xmm4, xmm5;  // xmm5 is loaded from a_buf each pass
+ for (; width > 0; width -= 8) {  // 8 pixels per pass
+ READYUVA422
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ }
+}
+#endif
+
+// 32 bit
+#else // defined(_M_X64)
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
+
+// JPeg full range.
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
+
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
+
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
+
+static const vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
+
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
+
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
+
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
+
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
+
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
+
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
+
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
+
+// 7 bit fixed point 0.5.
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+
+// Shuffle table for converting RAW to RGB24. First 8.
+static const uvec8 kShuffleMaskRAWToRGB24_0 = {
+ 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Middle 8.
+static const uvec8 kShuffleMaskRAWToRGB24_1 = {
+ 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting RAW to RGB24. Last 8.
+static const uvec8 kShuffleMaskRAWToRGB24_2 = {
+ 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
+
+// YUY2 shuf 16 Y to 32 Y.
+static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
+ 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
+ 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+
+// YUY2 shuf 8 UV to 16 UV.
+static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
+ 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
+ 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+
+// UYVY shuf 16 Y to 32 Y.
+static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
+ 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
+ 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+
+// UYVY shuf 8 UV to 16 UV.
+static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
+ 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
+ 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+
+// NV21 shuf 8 VU to 16 UV.
+static const lvec8 kShuffleNV21 = {
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+ 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque. 8 pixels per loop.
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // load 8 Y
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // each Y twice
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // first 4 pixels, Y in all 4 bytes
+ punpckhwd xmm1, xmm1 // next 4 pixels
+ por xmm0, xmm5 // force alpha byte to 0xff
+ por xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque. 16 pixels per loop.
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu xmm0, [eax] // load 16 Y
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // qword order 0,2,1,3 ahead of the per-lane unpack
+ vpunpcklbw ymm0, ymm0, ymm0 // each Y twice
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhwd ymm1, ymm0, ymm0 // next 8 pixels
+ vpunpcklwd ymm0, ymm0, ymm0 // first 8 pixels
+ vpor ymm0, ymm0, ymm5 // force alpha byte to 0xff
+ vpor ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J400TOARGBROW_AVX2
+
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,  // 16 pixels per loop: 48 bytes in, 64 bytes out
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[0:3] xmm3[4:15]} (rotate right 4 bytes)
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,  // 16 pixels per loop: 48 bytes in, 64 bytes out
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[0:3] xmm3[4:15]} (rotate right 4 bytes)
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,  // 8 pixels (24 bytes) per loop
+ uint8_t* dst_rgb24,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_rgb24
+ mov ecx, [esp + 12] // width
+ movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
+ movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
+ movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
+
+ convertloop:
+ movdqu xmm0, [eax] // three overlapping 16 byte loads
+ movdqu xmm1, [eax + 4]
+ movdqu xmm2, [eax + 8]
+ lea eax, [eax + 24]
+ pshufb xmm0, xmm3 // each shuffle leaves 8 output bytes in the low half
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G multiplier 0x2080 is (1 << 13) | (1 << 7), so shift is 5 + 8 and 5 + 2
+// 20 instructions.
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G multiplier 0x2080 is (1 << 13) | (1 << 7), so shift is 5 + 8 and 5 + 2
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,  // 16 pixels per loop: 32 bytes in, 64 bytes out
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpand ymm1, ymm1, ymm3
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpand ymm2, ymm2, ymm7
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,  // each nibble is copied into both halves of its output byte
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vpsrlw ymm3, ymm2, 4
+ vpsllw ymm1, ymm0, 4
+ vpor ymm2, ymm2, ymm3 // high nibble in both halves of the byte
+ vpor ymm0, ymm0, ymm1 // low nibble in both halves of the byte
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm2, ymm2, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm2
+ vpunpcklbw ymm0, ymm0, ymm2
+ vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB4444TOARGBROW_AVX2
+
+// 24 instructions. 8 pixels per loop: 16 bytes in, 32 bytes out.
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions. Each nibble is copied into both halves of its output byte; 8 pixels per loop.
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks dst
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1 // low nibble in both halves of the byte
+ por xmm2, xmm3 // high nibble in both halves of the byte
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,  // 16 pixels per loop: 64 bytes in, 48 bytes out
+ uint8_t* dst_rgb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Packs 16 ARGB pixels (64 bytes) into 48 output bytes per loop using kShuffleMaskARGBToRAW.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW // aligned load of the byte-shuffle table
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48] // 3 output vectors = 48 bytes
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 4 ARGB pixels (16 bytes) to 4 RGB565 pixels (8 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R: shift alpha out of the top byte
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R: arithmetic shift sign-extends so packssdw does not saturate
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0 // narrow 4 dwords to 4 words (duplicated in both halves)
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4 // 4 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ // Converts 4 ARGB pixels (16 bytes) to 4 RGB565 pixels (8 bytes) per loop.
+ // Byte i of dither4 is added (with unsigned saturation) to every channel of
+ // pixel i before the channels are truncated to 5/6/5 bits.
+ __asm {
+
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ movd xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ punpcklbw xmm6, xmm6 // d0 d0 d1 d1 d2 d2 d3 d3
+ punpcklwd xmm6, xmm6 // make dither 16 bytes: each dither byte x4, one group per pixel
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R: shift alpha out of the top byte
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R: arithmetic shift sign-extends so packssdw does not saturate
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0 // narrow 4 dwords to 4 words
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4 // 4 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) { // Adds dither, then converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels (16 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ vbroadcastss xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // width
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ vpermq ymm6, ymm6, 0xd8 // move the second qword into the upper 128-bit lane
+ vpunpcklwd ymm6, ymm6, ymm6 // each dither byte x4, same pattern in both lanes
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0 // narrow dwords to words within each 128-bit lane
+ vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 128 bits
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8 // 8 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 4 ARGB pixels (16 bytes) to 4 ARGB1555 pixels (8 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A: arithmetic shift keeps the top alpha bit as the sign
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0 // narrow 4 sign-extended dwords to 4 words
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4 // 4 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 4 ARGB pixels (16 bytes) to 4 ARGB4444 pixels (8 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble: top 4 bits of bytes 0 and 2 of each pixel
+ pand xmm1, xmm4 // high nibble: top 4 bits of bytes 1 and 3 of each pixel
+ psrld xmm0, 4
+ psrld xmm1, 8
+ por xmm0, xmm1 // each word now holds one packed output byte
+ packuswb xmm0, xmm0 // narrow words to bytes
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4 // 4 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels (16 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0 // narrow dwords to words within each 128-bit lane
+ vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 128 bits
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8 // 8 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 8 ARGB pixels (32 bytes) to 8 ARGB1555 pixels (16 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A: arithmetic shift keeps the top alpha bit as the sign
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0 // narrow sign-extended dwords to words within each lane
+ vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 128 bits
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8 // 8 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) { // Converts 8 ARGB pixels (32 bytes) to 8 ARGB4444 pixels (16 bytes) per loop.
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1 // each word now holds one packed output byte
+ vpackuswb ymm0, ymm0, ymm0 // narrow words to bytes within each 128-bit lane
+ vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 128 bits
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8 // 8 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kARGBToY, scales by >> 7 and adds kAddY16.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToY // per-byte coefficients (aligned load)
+ movdqa xmm5, xmmword ptr kAddY16 // bias added to every output byte
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels (64 bytes)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // multiply bytes by coefficients, add adjacent pairs -> 2 words per pixel
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the 2 partial sums -> 1 word per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7 // scale the weighted sum down
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2 // 16 words -> 16 bytes
+ paddb xmm0, xmm5 // add the kAddY16 bias
+ movdqu [edx], xmm0 // store 16 Y values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kARGBToYJ, adds kAddYJ64 for rounding, scales by >> 7.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToYJ // per-byte coefficients (aligned load)
+ movdqa xmm5, xmmword ptr kAddYJ64 // rounding term added before the shift
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels (64 bytes)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // multiply bytes by coefficients, add adjacent pairs -> 2 words per pixel
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the 2 partial sums -> 1 word per pixel
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
+ psrlw xmm0, 7 // scale the weighted sum down
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2 // 16 words -> 16 bytes
+ movdqu [edx], xmm0 // store 16 YJ values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd dword index table that restores pixel order after the per-lane vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kARGBToY, scales by >> 7 and adds kAddY16; 32 pixels per loop.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToY // same 16-byte coefficient table in both lanes
+ vbroadcastf128 ymm5, xmmword ptr kAddY16
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX // dword permutation used to fix lane order
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 32 pixels (128 bytes)
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4 // multiply bytes by coefficients, add adjacent pairs
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7 // scale the weighted sum down
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
+ vmovdqu [edx], ymm0 // store 32 Y values
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kARGBToYJ, adds kAddYJ64 for rounding, scales by >> 7; 32 pixels per loop.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ // same 16-byte coefficient table in both lanes
+ vbroadcastf128 ymm5, xmmword ptr kAddYJ64
+ vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX // dword permutation used to fix lane order
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 32 pixels (128 bytes)
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4 // multiply bytes by coefficients, add adjacent pairs
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7 // scale the weighted sum down
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vmovdqu [edx], ymm0 // store 32 YJ values
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kBGRAToY, scales by >> 7 and adds kAddY16; 16 pixels per loop.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kBGRAToY // per-byte coefficients (aligned load)
+ movdqa xmm5, xmmword ptr kAddY16 // bias added to every output byte
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels (64 bytes)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // multiply bytes by coefficients, add adjacent pairs -> 2 words per pixel
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the 2 partial sums -> 1 word per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7 // scale the weighted sum down
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2 // 16 words -> 16 bytes
+ paddb xmm0, xmm5 // add the kAddY16 bias
+ movdqu [edx], xmm0 // store 16 Y values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kABGRToY, scales by >> 7 and adds kAddY16; 16 pixels per loop.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kABGRToY // per-byte coefficients (aligned load)
+ movdqa xmm5, xmmword ptr kAddY16 // bias added to every output byte
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels (64 bytes)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // multiply bytes by coefficients, add adjacent pairs -> 2 words per pixel
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the 2 partial sums -> 1 word per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7 // scale the weighted sum down
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2 // 16 words -> 16 bytes
+ paddb xmm0, xmm5 // add the kAddY16 bias
+ movdqu [edx], xmm0 // store 16 Y values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width) { // Weights each pixel's 4 bytes with kRGBAToY, scales by >> 7 and adds kAddY16; 16 pixels per loop.
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kRGBAToY // per-byte coefficients (aligned load)
+ movdqa xmm5, xmmword ptr kAddY16 // bias added to every output byte
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels (64 bytes)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // multiply bytes by coefficients, add adjacent pairs -> 2 words per pixel
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the 2 partial sums -> 1 word per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7 // scale the weighted sum down
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2 // 16 words -> 16 bytes
+ paddb xmm0, xmm5 // add the kAddY16 bias
+ movdqu [edx], xmm0 // store 16 Y values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 16x2 ARGB pixels down to 8 and writes 8 U and 8 V bytes per loop.
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi] // same pixels, src_stride_argb bytes further on
+ pavgb xmm0, xmm4 // vertical average of the two rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even-indexed pixels
+ shufps xmm4, xmm1, 0xdd // odd-indexed pixels
+ pavgb xmm0, xmm4 // horizontal average of neighbouring pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8 // signed scale down
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 16x2 ARGB pixels down to 8 and writes 8 U and 8 V bytes per loop (kARGBToUJ/kARGBToVJ).
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm6, xmmword ptr kARGBToVJ
+ movdqa xmm7, xmmword ptr kARGBToUJ
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi] // same pixels, src_stride_argb bytes further on
+ pavgb xmm0, xmm4 // vertical average of the two rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even-indexed pixels
+ shufps xmm4, xmm1, 0xdd // odd-indexed pixels
+ pavgb xmm0, xmm4 // horizontal average of neighbouring pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8 // signed scale down
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 32x2 ARGB pixels down to 16 and writes 16 U and 16 V bytes per loop.
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToV
+ vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi] // vertical average with the row src_stride_argb bytes further on
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88 // even-indexed pixels (per 128-bit lane)
+ vshufps ymm0, ymm0, ymm1, 0xdd // odd-indexed pixels (per 128-bit lane)
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8 // signed scale down
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_AVX2
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 32x2 ARGB pixels down to 16 and writes 16 U and 16 V bytes per loop (kARGBToUJ/kARGBToVJ).
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi] // vertical average with the row src_stride_argb bytes further on
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88 // even-indexed pixels (per 128-bit lane)
+ vshufps ymm0, ymm0, ymm1, 0xdd // odd-indexed pixels (per 128-bit lane)
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
+ vpaddw ymm0, ymm0, ymm5
+ vpsraw ymm1, ymm1, 8 // signed scale down
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Writes one U and one V byte per source pixel (no subsampling), 16 pixels per loop.
+ __asm {
+ push edi // edi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb (+4 skips the pushed register)
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kARGBToV
+ movdqa xmm7, xmmword ptr kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7 // multiply bytes by U coefficients, add adjacent pairs
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1 // one word per pixel
+ phaddw xmm2, xmm3
+ psraw xmm0, 8 // signed scale down
+ psraw xmm2, 8
+ packsswb xmm0, xmm2 // 16 words -> 16 signed bytes
+ paddb xmm0, xmm5 // add kAddUV128 bias
+ movdqu [edx], xmm0 // store 16 U values
+
+ movdqu xmm0, [eax] // V (same 16 pixels are loaded again)
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6 // multiply bytes by V coefficients, add adjacent pairs
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqu [edx + edi], xmm0 // store 16 V values
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 16x2 source pixels down to 8 and writes 8 U and 8 V bytes per loop (kBGRAToU/kBGRAToV).
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kBGRAToV
+ movdqa xmm7, xmmword ptr kBGRAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi] // same pixels, src_stride_argb bytes further on
+ pavgb xmm0, xmm4 // vertical average of the two rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even-indexed pixels
+ shufps xmm4, xmm1, 0xdd // odd-indexed pixels
+ pavgb xmm0, xmm4 // horizontal average of neighbouring pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8 // signed scale down
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 16x2 source pixels down to 8 and writes 8 U and 8 V bytes per loop (kABGRToU/kABGRToV).
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kABGRToV
+ movdqa xmm7, xmmword ptr kABGRToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi] // same pixels, src_stride_argb bytes further on
+ pavgb xmm0, xmm4 // vertical average of the two rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even-indexed pixels
+ shufps xmm4, xmm1, 0xdd // odd-indexed pixels
+ pavgb xmm0, xmm4 // horizontal average of neighbouring pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8 // signed scale down
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // Averages 16x2 source pixels down to 8 and writes 8 U and 8 V bytes per loop (kRGBAToU/kRGBAToV).
+ __asm {
+ push esi // esi/edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb (+8 skips the two pushed registers)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm6, xmmword ptr kRGBAToV
+ movdqa xmm7, xmmword ptr kRGBAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi] // same pixels, src_stride_argb bytes further on
+ pavgb xmm0, xmm4 // vertical average of the two rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even-indexed pixels
+ shufps xmm4, xmm1, 0xdd // odd-indexed pixels
+ pavgb xmm0, xmm4 // horizontal average of neighbouring pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8 // signed scale down
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 source pixels consumed per iteration
+ jg convertloop // loop while pixels remain (body runs at least once)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 \
+ __asm { /* in: eax = Y, esi = U, edi = offset from U to V; out: ymm0 = UV, ymm4 = Y */ \
+ __asm vmovdqu xmm0, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 /* put one qword in each 128-bit lane */ \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 /* each Y byte duplicated into a word */ \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ __asm { /* in: eax = Y, esi = U, edi = offset from U to V; out: ymm0 = UV, ymm4 = Y */ \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 /* put one qword in each 128-bit lane */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 /* each Y byte duplicated into a word */ \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
+#define READYUVA422_AVX2 \
+ __asm { /* in: eax = Y, esi = U, edi = offset from U to V, ebp = A; out: ymm0 = UV, ymm4 = Y, ymm5 = A */ \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 /* put one qword in each 128-bit lane */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 /* each Y byte duplicated into a word */ \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 /* put one qword of alpha in each 128-bit lane */ \
+ __asm lea ebp, [ebp + 16]}
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 \
+ __asm { /* in: eax = Y, esi = interleaved UV; out: ymm0 = UV, ymm4 = Y */ \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 /* put one qword in each 128-bit lane */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 /* each Y byte duplicated into a word */ \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 UV from NV21, upsample to 16 UV.
+#define READNV21_AVX2 \
+ __asm { /* in: eax = Y, esi = interleaved chroma; out: ymm0 = UV, ymm4 = Y */ \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 /* put one qword in each 128-bit lane */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 /* NOTE(review): presumably swaps VU to UV and upsamples - confirm against kShuffleNV21 */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 /* each Y byte duplicated into a word */ \
+ __asm lea eax, [eax + 16]}
+
+// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+#define READYUY2_AVX2 \
+ __asm { /* in: eax = packed YUY2; out: ymm0 = UV, ymm4 = Y; the same 32 bytes are loaded twice */ \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 32]}
+
+// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+#define READUYVY_AVX2 \
+ __asm { /* in: eax = packed UYVY; out: ymm0 = UV, ymm4 = Y; the same 32 bytes are loaded twice */ \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
+ __asm vmovdqu ymm0, [eax] /* UV */ \
+ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 32]}
+
+// Convert 16 pixels: 16 UV and 16 Y.
+// YuvConstants names the register holding the base address of the constant
+// block; KUVTOR/KUVTOG/KUVTOB, KUVBIASR/G/B and KYTORGB are offsets into it
+// (defined outside this macro).
+// In:  ymm0 = UV bytes, ymm4 = Y words.
+// Out: ymm0 = B, ymm1 = G, ymm2 = R, each saturated to unsigned bytes by
+//      packing the register with itself. ymm3 and ymm4 are clobbered.
+// Per channel: (bias - UV dot coefficients + scaled Y) >> 6.
+#define YUVTORGB_AVX2(YuvConstants) \
+ __asm { \
+ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
+ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
+ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
+ __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
+ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
+
+// Store 16 ARGB values.
+// In:  ymm0 = B, ymm1 = G, ymm2 = R, ymm5 = A (bytes), edx = destination.
+// Writes 64 bytes in memory order B, G, R, A and advances edx by 64.
+// ymm0, ymm1 and ymm2 are overwritten; ymm5 is only read.
+#define STOREARGB_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
+ __asm lea edx, [edx + 64]}
+
+// Store 16 RGBA values.
+// In:  ymm0 = B, ymm1 = G, ymm2 = R, ymm5 = A (bytes), edx = destination.
+// Writes 64 bytes in memory order A, B, G, R and advances edx by 64.
+// ymm0, ymm1 and ymm2 are overwritten; ymm5 is only read.
+#define STORERGBA_AVX2 \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vmovdqu [edx], ymm0 \
+ __asm vmovdqu [edx + 32], ymm1 \
+ __asm lea edx, [edx + 64]}
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. The loop body runs before width is
+// tested, so at least 16 pixels are always read and written.
+__declspec(naked) void I422ToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422ALPHATOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+// Arguments are read from the stack; the "16 +" in each offset skips the
+// four registers pushed on entry. ebp is used as the alpha pointer, so it is
+// saved and restored here. Alpha (ymm5) is reloaded by READYUVA422_AVX2 on
+// every iteration instead of being set to 0xff once before the loop.
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+
+ convertloop:
+ READYUVA422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422ALPHATOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. The loop body runs before width is
+// tested, so at least 16 pixels are always read and written.
+__declspec(naked) void I444ToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ convertloop:
+ READYUV444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "8 +" in each offset skips the two
+// registers pushed on entry. The loop body runs before width is tested, so
+// at least 16 pixels are always read and written.
+__declspec(naked) void NV12ToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "8 +" in each offset skips the two
+// registers pushed on entry. The loop body runs before width is tested, so
+// at least 16 pixels are always read and written.
+__declspec(naked) void NV21ToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV21_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_YUY2TOARGBROW_AVX2
+// 16 pixels.
+// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "4 +" in each offset skips the one
+// register pushed on entry. The loop body runs before width is tested, so at
+// least 16 pixels (32 source bytes) are always read and 64 bytes written.
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUY2_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOARGBROW_AVX2
+
+#ifdef HAS_UYVYTOARGBROW_AVX2
+// 16 pixels.
+// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
+// Arguments are read from the stack; the "4 +" in each offset skips the one
+// register pushed on entry. The loop body runs before width is tested, so at
+// least 16 pixels (32 source bytes) are always read and 64 bytes written.
+__declspec(naked) void UYVYToARGBRow_AVX2(
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READUYVY_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. Despite its name, dst_argb receives the
+// byte order produced by STORERGBA_AVX2 (A, B, G, R in memory).
+__declspec(naked) void I422ToRGBARow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // rgba
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(ebx)
+ STORERGBA_AVX2
+
+ sub ecx, 16 // 16 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+// Allows a conversion with half size scaling.
+
+// Read 8 UV from 444.
+// In:  eax = Y pointer, esi = U pointer, edi = offset from esi to V.
+// Out: xmm0 = 8 interleaved UV byte pairs; xmm4 = each of 8 Y bytes
+//      duplicated into a word. xmm1 is clobbered.
+//      eax and esi both advance by 8.
+#define READYUV444 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 UV from 422, upsample to 8 UV.
+// In:  eax = Y pointer, esi = U pointer, edi = offset from esi to V.
+// Out: xmm0 = UV byte pairs, each pair duplicated; xmm4 = each of 8 Y bytes
+//      duplicated into a word. xmm1 is clobbered.
+//      esi advances by 4, eax by 8.
+#define READYUV422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
+// In:  eax = Y pointer, esi = U pointer, edi = offset from esi to V,
+//      ebp = alpha pointer.
+// Out: xmm0 = UV byte pairs, each pair duplicated; xmm4 = each of 8 Y bytes
+//      duplicated into a word; xmm5 = 8 alpha bytes in the low qword.
+//      xmm1 is clobbered. esi advances by 4; eax and ebp advance by 8.
+#define READYUVA422 \
+ __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
+// Read 4 UV from NV12, upsample to 8 UV.
+// In:  eax = Y pointer, esi = pointer to interleaved UV bytes.
+// Out: xmm0 = UV byte pairs, each pair duplicated; xmm4 = each of 8 Y bytes
+//      duplicated into a word. eax and esi both advance by 8.
+#define READNV12 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 VU from NV21, upsample to 8 UV.
+// In:  eax = Y pointer, esi = pointer to interleaved VU bytes.
+// Out: xmm0 = chroma bytes reordered by the kShuffleNV21 table (defined
+//      outside this macro; presumably swaps VU to UV and duplicates each
+//      pair - confirm against the table); xmm4 = each of 8 Y bytes
+//      duplicated into a word. eax and esi both advance by 8.
+#define READNV21 \
+ __asm { \
+ __asm movq xmm0, qword ptr [esi] /* VU */ \
+ __asm lea esi, [esi + 8] \
+ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm movq xmm4, qword ptr [eax] \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8]}
+
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+// In:  eax = pointer to packed YUY2 data (16 bytes are loaded twice).
+// Out: xmm4 = source shuffled by kShuffleYUY2Y (luma), xmm0 = source
+//      shuffled by kShuffleYUY2UV (chroma); both tables are defined outside
+//      this macro. eax advances by 16.
+#define READYUY2 \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm lea eax, [eax + 16]}
+
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+// In:  eax = pointer to packed UYVY data (16 bytes are loaded twice).
+// Out: xmm4 = source shuffled by kShuffleUYVYY (luma), xmm0 = source
+//      shuffled by kShuffleUYVYUV (chroma); both tables are defined outside
+//      this macro. eax advances by 16.
+#define READUYVY \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
+ __asm movdqu xmm0, [eax] /* UV */ \
+ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm lea eax, [eax + 16]}
+
+// Convert 8 pixels: 8 UV and 8 Y.
+// YuvConstants names the register holding the base address of the constant
+// block; KUVTOB/KUVTOG/KUVTOR, KUVBIASB/G/R and KYTORGB are offsets into it
+// (defined outside this macro). The constants are loaded with movdqa or used
+// as direct memory operands, so the block must be 16-byte aligned.
+// In:  xmm0 = UV bytes, xmm4 = Y words.
+// Out: xmm0 = B, xmm1 = G, xmm2 = R, each saturated to unsigned bytes by
+//      packing the register with itself. xmm3 and xmm4 are clobbered.
+// Per channel: (bias - UV dot coefficients + scaled Y) >> 6.
+#define YUVTORGB(YuvConstants) \
+ __asm { \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
+ __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
+ __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
+ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm psubw xmm2, xmm3 \
+ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
+ __asm paddsw xmm0, xmm4 /* B += Y */ \
+ __asm paddsw xmm1, xmm4 /* G += Y */ \
+ __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Store 8 ARGB values.
+// In:  xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = A (bytes), edx = destination.
+// Writes 32 bytes in memory order B, G, R, A and advances edx by 32.
+// xmm0, xmm1 and xmm2 are overwritten; xmm5 is only read.
+#define STOREARGB \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm0 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 BGRA values.
+// In:  xmm0 = B, xmm1 = G, xmm2 = R (bytes), edx = destination.
+// Alpha is generated here as 0xff, so any value the caller left in xmm5 is
+// discarded. Writes 32 bytes in memory order A, R, G, B and advances edx by
+// 32. xmm0, xmm1 and xmm5 are overwritten.
+#define STOREBGRA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 RGBA values.
+// In:  xmm0 = B, xmm1 = G, xmm2 = R (bytes), edx = destination.
+// Alpha is generated here as 0xff, so any value the caller left in xmm5 is
+// discarded. Writes 32 bytes in memory order A, B, G, R and advances edx by
+// 32. xmm0, xmm1 and xmm5 are overwritten.
+#define STORERGBA \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32]}
+
+// Store 8 RGB24 values.
+// In:  xmm0 = B, xmm1 = G, xmm2 = R (bytes), edx = destination;
+//      xmm5 and xmm6 = pshufb masks that the caller loads before the loop.
+// Pixels are first woven into 4-byte B,G,R,R groups, then the two shuffles
+// and palignr squeeze them into 24 contiguous bytes.
+// Writes 24 bytes (8 + 16) and advances edx by 24. xmm0-xmm2 are overwritten.
+#define STORERGB24 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24]}
+
+// Store 8 RGB565 values.
+// In:  xmm0 = B, xmm1 = G, xmm2 = R (bytes), edx = destination;
+//      xmm5, xmm6, xmm7 = per-dword masks for the B, G and R fields, set up
+//      by the caller before the loop.
+// Pixels are woven into 4-byte B,G,R,R groups; each dword is then shifted
+// and masked into a 5/6/5 value and the two halves are packed to 16-bit
+// words with signed saturation (the psrad keeps the sign that packssdw
+// needs to preserve the top red bit).
+// Writes 16 bytes and advances edx by 16. xmm0-xmm3 are overwritten.
+#define STORERGB565 \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16]}
+
+// 8 pixels.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. The loop body runs before width is
+// tested, so at least 8 pixels are always read and written.
+__declspec(naked) void I444ToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV444
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. xmm5 and xmm6 hold the two shuffle masks
+// that STORERGB24 consumes. Both masks are loaded with movdqa, so the tables
+// must be 16-byte aligned.
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // rgb24
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB24
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. xmm5, xmm6 and xmm7 are built once as the
+// blue, green and red field masks that STORERGB565 consumes.
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb565_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // rgb565
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGB565
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. The loop body runs before width is
+// tested, so at least 8 pixels are always read and written.
+__declspec(naked) void I422ToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
+// Arguments are read from the stack; the "16 +" in each offset skips the
+// four registers pushed on entry. ebp is used as the alpha pointer, so it is
+// saved and restored here. Alpha (xmm5) is reloaded by READYUVA422 on every
+// iteration instead of being set to 0xff once before the loop.
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+
+ convertloop:
+ READYUVA422
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "8 +" in each offset skips the two
+// registers pushed on entry. The loop body runs before width is tested, so
+// at least 8 pixels are always read and written.
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // UV
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV12
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "8 +" in each offset skips the two
+// registers pushed on entry. The loop body runs before width is tested, so
+// at least 8 pixels are always read and written.
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push ebx
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // VU
+ mov edx, [esp + 8 + 12] // argb
+ mov ebx, [esp + 8 + 16] // yuvconstants
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV21
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "4 +" in each offset skips the one
+// register pushed on entry. The loop body runs before width is tested, so at
+// least 8 pixels (16 source bytes) are always read and 32 bytes written.
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // yuy2
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUY2
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
+// Arguments are read from the stack; the "4 +" in each offset skips the one
+// register pushed on entry. The loop body runs before width is tested, so at
+// least 8 pixels (16 source bytes) are always read and 32 bytes written.
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push ebx
+ mov eax, [esp + 4 + 4] // uyvy
+ mov edx, [esp + 4 + 8] // argb
+ mov ebx, [esp + 4 + 12] // yuvconstants
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READUYVY
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGBA (32 bytes).
+// Arguments are read from the stack; the "12 +" in each offset skips the
+// three registers pushed on entry. No alpha register is prepared before the
+// loop because STORERGBA generates 0xff alpha itself on every iteration.
+__declspec(naked) void I422ToRGBARow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // rgba
+ mov ebx, [esp + 12 + 20] // yuvconstants
+ mov ecx, [esp + 12 + 24] // width
+ sub edi, esi // edi = V - U so [esi + edi] addresses V
+
+ convertloop:
+ READYUV422
+ YUVTORGB(ebx)
+ STORERGBA
+
+ sub ecx, 8 // 8 pixels consumed per iteration
+ jg convertloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
+// The YuvConstants argument is accepted for signature compatibility but is
+// not read: the scale (0x4a35) and bias (0x0488) below are hard-coded.
+// Nothing is pushed, so the four stack arguments sit at [esp + 4],
+// [esp + 8], [esp + 12] (yuvconstants, unused) and [esp + 16] (width).
+// The loop body runs before width is tested, so at least 8 pixels are always
+// read and written.
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
+ const struct YuvConstants*,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ // [esp + 12] holds the unused yuvconstants pointer; width is the 4th arg.
+ mov ecx, [esp + 16] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y.Y
+ pmulhuw xmm0, xmm2
+ psubusw xmm0, xmm3
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // GGGG first 4 pixels
+ punpckhwd xmm1, xmm1 // GGGG next 4 pixels
+ por xmm0, xmm4 // force alpha byte to 0xff
+ por xmm1, xmm4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+// The YuvConstants argument is accepted for signature compatibility but is
+// not read: the scale (0x4a35) and bias (0x0488) below are hard-coded.
+// Nothing is pushed, so the four stack arguments sit at [esp + 4],
+// [esp + 8], [esp + 12] (yuvconstants, unused) and [esp + 16] (width).
+// The loop body runs before width is tested, so at least 16 pixels are
+// always read and written.
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
+ const struct YuvConstants*,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ // [esp + 12] holds the unused yuvconstants pointer; width is the 4th arg.
+ mov ecx, [esp + 16] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4 // force alpha byte to 0xff
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// Copies a row of bytes in reverse order, 16 bytes per iteration.
+// ecx doubles as the remaining count and as the read index: each pass loads
+// the 16 bytes that end at src + ecx, reverses them and stores them at the
+// advancing dst pointer. The loop body runs before width is tested.
+// kShuffleMirror is loaded with movdqa, so it must be 16-byte aligned.
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, xmmword ptr kShuffleMirror
+
+ convertloop:
+ movdqu xmm0, [eax - 16 + ecx] // last 16 unread source bytes
+ pshufb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+// Copies a row of bytes in reverse order, 32 bytes per iteration.
+// vpshufb only reverses bytes within each 128-bit lane, so vpermq 0x4e then
+// swaps the two lanes to complete the 32-byte reversal. ecx doubles as the
+// remaining count and as the read index from the end of the row.
+// Uses the kShuffleMirror table, broadcast to both lanes.
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
+
+ convertloop:
+ vmovdqu ymm0, [eax - 32 + ecx] // last 32 unread source bytes
+ vpshufb ymm0, ymm0, ymm5
+ vpermq ymm0, ymm0, 0x4e // swap high and low halfs
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+// Even source bytes land reversed in the low 8 bytes, odd source bytes
+// reversed in the high 8 bytes.
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
+// Reverses a row of interleaved byte pairs and splits it into two planes.
+// width counts pairs: the source is walked backwards from
+// src + width * 2 - 16, and each iteration writes 8 bytes to dst_u and
+// 8 bytes to dst_v. The loop body runs before width is tested.
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, xmmword ptr kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16] // last 16 source bytes
+ sub edi, edx // edi = dst_v - dst_u so [edx + edi] addresses dst_v
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+// Reverses the order of 4-byte pixels in a row, 4 pixels per iteration.
+// width counts pixels. The source is walked backwards from the last
+// 16 bytes; pshufd 0x1b reverses the four dwords of each load, leaving the
+// bytes inside every pixel untouched. The loop body runs before width is
+// tested.
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+// Used as the vpermd index vector below, so it reverses the order of the
+// eight dwords rather than individual bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
+// Reverses the order of 4-byte pixels in a row, 8 pixels per iteration.
+// width counts pixels; ecx doubles as the remaining count and as the read
+// index, so each pass loads the 32 bytes that end at src + ecx * 4.
+// The loop body runs before width is tested.
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
+
+ convertloop:
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+// De-interleaves a row of byte pairs: even bytes go to dst_u, odd bytes to
+// dst_v. width counts pairs; each iteration reads 32 source bytes and writes
+// 16 bytes to each destination. The loop body runs before width is tested.
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u so [edx + edi] addresses dst_v
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+// De-interleaves a row of byte pairs: even bytes go to dst_u, odd bytes to
+// dst_v. width counts pairs; each iteration reads 64 source bytes and writes
+// 32 bytes to each destination. vpackuswb packs per 128-bit lane, so
+// vpermq 0xd8 restores linear order afterwards. The loop body runs before
+// width is tested.
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u so [edx + edi] addresses dst_v
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + edi], ymm2
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+// Interleaves two byte planes into one row of U,V pairs.
+// width counts pairs; each iteration reads 16 bytes from each source and
+// writes 32 bytes. The loop body runs before width is tested.
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax // edx = src_v - src_u so [eax + edx] addresses src_v
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+// Interleaves two byte planes into one row of U,V pairs.
+// width counts pairs; each iteration reads 32 bytes from each source and
+// writes 64 bytes. The unpack instructions work per 128-bit lane, so the
+// four 16-byte halves are stored individually in the order that restores
+// linear output. The loop body runs before width is tested.
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax // edx = src_v - src_u so [eax + edx] addresses src_v
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+ vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
+ lea edi, [edi + 64]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
+// If both src and dst are 16-byte aligned the aligned (movdqa) loop is used;
+// otherwise the unaligned (movdqu) loop is used. Either loop runs its body
+// before testing width, so at least 32 bytes are always copied.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ test eax, 15 // src not 16-byte aligned -> unaligned loop
+ jne convertloopu
+ test edx, 15 // dst not 16-byte aligned -> unaligned loop
+ jne convertloopu
+
+ convertloopa:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloopa
+ ret
+
+ convertloopu:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloopu
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'width' bytes using 32 byte unaligned loads/stores, 64 bytes at a time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64 // 64 bytes copied
+ jg convertloop
+
+ vzeroupper // clear upper ymm halves before returning
+ ret
+ }
+}
+#endif // HAS_COPYROW_AVX
+
+// Copies 'width' bytes from src to dst with a single rep movsb. Width may be any multiple of 1.
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, esi // stash esi in eax; rep movsb needs esi as source pointer
+ mov edx, edi // stash edi in edx; rep movsb needs edi as destination pointer
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ rep movsb // copy ecx bytes from [esi] to [edi]
+ mov edi, edx // restore caller's edi
+ mov esi, eax // restore caller's esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// Copies the alpha byte of each src ARGB pixel into dst, keeping the low 3 bytes of dst. width in pixels; 8 pixels per iteration.
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movdqu xmm2, [eax] // 4 src pixels
+ movdqu xmm3, [eax + 16] // next 4 src pixels
+ lea eax, [eax + 32]
+ movdqu xmm4, [edx] // 4 dst pixels
+ movdqu xmm5, [edx + 16] // next 4 dst pixels
+ pand xmm2, xmm0 // keep src alpha only
+ pand xmm3, xmm0
+ pand xmm4, xmm1 // keep low 3 bytes of dst
+ pand xmm5, xmm1
+ por xmm2, xmm4 // src alpha | dst low 3 bytes
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// Copies the alpha byte of each src ARGB pixel into dst, keeping the low 3 bytes of dst. width in pixels; 16 pixels per iteration.
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vmovdqu ymm1, [eax] // 8 src pixels
+ vmovdqu ymm2, [eax + 32] // next 8 src pixels
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0 // masked bytes (low 3) come from dst, alpha byte stays from src
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
+// Extracts the alpha byte of each ARGB pixel into a packed row of bytes. width in pixels; 8 pixels per iteration.
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+
+ extractloop:
+ movdqu xmm0, [eax] // 4 pixels
+ movdqu xmm1, [eax + 16] // next 4 pixels
+ lea eax, [eax + 32]
+ psrld xmm0, 24 // alpha moved to low byte of each dword
+ psrld xmm1, 24
+ packssdw xmm0, xmm1 // 8 dwords -> 8 words
+ packuswb xmm0, xmm0 // 8 words -> 8 bytes (in low qword)
+ movq qword ptr [edx], xmm0 // store 8 alpha bytes
+ lea edx, [edx + 8]
+ sub ecx, 8 // 8 pixels done
+ jg extractloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
+
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// Extracts the alpha byte of each ARGB pixel into a packed row of bytes. width in pixels; 32 pixels (128 source bytes) per iteration.
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_a
+ mov ecx, [esp + 12] // width
+ vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX // dword permute table defined elsewhere; used below to undo lane-wise pack order
+
+ extractloop:
+ vmovdqu ymm0, [eax] // pixels 0..7
+ vmovdqu ymm1, [eax + 32] // pixels 8..15
+ vpsrld ymm0, ymm0, 24 // alpha moved to low byte of each dword
+ vpsrld ymm1, ymm1, 24
+ vmovdqu ymm2, [eax + 64] // pixels 16..23
+ vmovdqu ymm3, [eax + 96] // pixels 24..31
+ lea eax, [eax + 128]
+ vpackssdw ymm0, ymm0, ymm1 // mutates
+ vpsrld ymm2, ymm2, 24
+ vpsrld ymm3, ymm3, 24
+ vpackssdw ymm2, ymm2, ymm3 // mutates
+ vpackuswb ymm0, ymm0, ymm2 // mutates
+ vpermd ymm0, ymm4, ymm0 // unmutate
+ vmovdqu [edx], ymm0 // store 32 alpha bytes
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pixels done
+ jg extractloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// Writes each src byte (Y) into the alpha byte of the matching dst ARGB pixel, keeping the low 3 bytes of dst. width in pixels; 8 pixels per iteration.
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2 // each Y duplicated into a word
+ punpckhwd xmm3, xmm2 // Y's 4..7 into high word of each dword; low words are stale xmm3 data, masked off below
+ punpcklwd xmm2, xmm2 // Y's 0..3 replicated across each dword
+ movdqu xmm4, [edx] // 4 dst pixels
+ movdqu xmm5, [edx + 16] // next 4 dst pixels
+ pand xmm2, xmm0 // keep Y in the alpha byte only
+ pand xmm3, xmm0
+ pand xmm4, xmm1 // keep low 3 bytes of dst
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// Writes each src byte (Y) into the alpha byte of the matching dst ARGB pixel, keeping the low 3 bytes of dst. width in pixels; 16 pixels per iteration.
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax] // 8 Y bytes zero-extended to 8 dwords
+ vpmovzxbd ymm2, qword ptr [eax + 8] // next 8 Y bytes
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24 // move Y into the alpha byte position
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0 // masked bytes (low 3) come from dst, alpha byte stays Y
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// Write 'width' bytes using an 8 bit value repeated, stored as width / 4 dwords with rep stosd.
+// width should be multiple of 4 (any remainder after the shift by 2 is not written).
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
+ __asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
+ mov edx, edi // stash edi in edx; rep stosd needs edi as destination pointer
+ mov edi, [esp + 4] // dst
+ mov ecx, [esp + 12] // width
+ shr ecx, 2 // byte count -> dword count
+ rep stosd // store eax ecx times
+ mov edi, edx // restore caller's edi
+ ret
+ }
+}
+
+// Write 'width' bytes using an 8 bit value repeated, with a single rep stosb.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
+ __asm {
+ mov edx, edi // stash edi in edx; rep stosb needs edi as destination pointer
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8 (only the low byte, al, is stored by stosb)
+ mov ecx, [esp + 12] // width
+ rep stosb // store al ecx times
+ mov edi, edx // restore caller's edi
+ ret
+ }
+}
+
+// Write 'width' 32 bit values (v32 repeated) to dst_argb with a single rep stosd.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+ uint32_t v32,
+ int width) {
+ __asm {
+ mov edx, edi // stash edi in edx; rep stosd needs edi as destination pointer
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // width
+ rep stosd // store eax ecx times
+ mov edi, edx // restore caller's edi
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
+ int width) {
+ __asm { // Extracts the Y bytes (even bytes) of a YUY2 row into dst_y; 32 pixels (64 source bytes) per iteration.
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the lane-wise pack
+ vmovdqu [edx], ymm0 // store 32 Y bytes
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Averages two YUY2 rows (src and src + stride) and splits the chroma into dst_u / dst_v; 32 pixels in, 16 U and 16 V out per iteration.
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi] // average with the row at src + stride
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // undo lane-wise pack order
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8 // gather the 16 valid bytes into the low 128 bits
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Splits the chroma of a single YUY2 row into dst_u / dst_v (no row averaging); 32 pixels in, 16 U and 16 V out per iteration.
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // undo lane-wise pack order
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8 // gather the 16 valid bytes into the low 128 bits
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
+ int width) {
+ __asm { // Extracts the Y bytes (odd bytes) of a UYVY row into dst_y; 32 pixels (64 source bytes) per iteration.
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the lane-wise pack
+ vmovdqu [edx], ymm0 // store 32 Y bytes
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Averages two UYVY rows (src and src + stride) and splits the chroma into dst_u / dst_v; 32 pixels in, 16 U and 16 V out per iteration.
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi] // average with the row at src + stride
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // undo lane-wise pack order
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8 // gather the 16 valid bytes into the low 128 bits
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Splits the chroma of a single UYVY row into dst_u / dst_v (no row averaging); 32 pixels in, 16 U and 16 V out per iteration.
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // undo lane-wise pack order
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8 // gather the 16 valid bytes into the low 128 bits
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
+ int width) {
+ __asm { // Extracts the Y bytes (even bytes) of a YUY2 row into dst_y; 16 pixels (32 source bytes) per iteration.
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+ int stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Averages two YUY2 rows (src and src + stride) and splits the chroma into dst_u / dst_v; 16 pixels in, 8 U and 8 V out per iteration.
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi] // same span from the row at src + stride
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0 // copy so U and V can be separated independently
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U
+ movq qword ptr [edx + edi], xmm1 // store 8 V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Splits the chroma of a single YUY2 row into dst_u / dst_v (no row averaging); 16 pixels in, 8 U and 8 V out per iteration.
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0 // copy so U and V can be separated independently
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U
+ movq qword ptr [edx + edi], xmm1 // store 8 V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
+ int width) {
+ __asm { // Extracts the Y bytes (odd bytes) of a UYVY row into dst_y; 16 pixels (32 source bytes) per iteration.
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+ int stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Averages two UYVY rows (src and src + stride) and splits the chroma into dst_u / dst_v; 16 pixels in, 8 U and 8 V out per iteration.
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi] // same span from the row at src + stride
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0 // copy so U and V can be separated independently
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U
+ movq qword ptr [edx + edi], xmm1 // store 8 V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ __asm { // Splits the chroma of a single UYVY row into dst_u / dst_v (no row averaging); 16 pixels in, 8 U and 8 V out per iteration.
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0 // copy so U and V can be separated independently
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U
+ movq qword ptr [edx + edi], xmm1 // store 8 V
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time: dst = blend of src0 and src1 planes weighted by the per-pixel alpha plane.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math (the one implemented below, so pmaddubsw can be used)
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0x00 // broadcast to all 4 dwords
+
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ movd xmm7, eax
+ pshufd xmm7, xmm7, 0x00 // broadcast to all 4 dwords
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi // make src0, src1 and dst relative to alpha so only esi advances
+ sub edx, esi
+ sub edi, esi
+
+ // 8 pixel loop.
+ convertloop8:
+ movq xmm0, qword ptr [esi] // alpha
+ punpcklbw xmm0, xmm0 // duplicate each alpha byte into a byte pair
+ pxor xmm0, xmm5 // a, 255-a
+ movq xmm1, qword ptr [eax + esi] // src0
+ movq xmm2, qword ptr [edx + esi] // src1
+ punpcklbw xmm1, xmm2 // interleave src0, src1 bytes to line up with a, 255-a
+ psubb xmm1, xmm6 // bias src0/1 - 128
+ pmaddubsw xmm0, xmm1 // a * (src0 - 128) + (255 - a) * (src1 - 128) per word
+ paddw xmm0, xmm7 // unbias result - 32768 and round.
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edi + esi], xmm0 // store 8 blended bytes
+ lea esi, [esi + 8]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop8
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_SSSE3
+
+#ifdef HAS_BLENDPLANEROW_AVX2
+// Blend 32 pixels at a time: dst = blend of src0 and src1 planes weighted by the per-pixel alpha plane.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math (the one implemented below, so vpmaddubsw can be used)
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
+ vpsllw ymm5, ymm5, 8
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ vmovd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ mov eax, 0x807f807f // 32768 + 127 for unbias and round.
+ vmovd xmm7, eax
+ vbroadcastss ymm7, xmm7
+ mov eax, [esp + 8 + 4] // src0
+ mov edx, [esp + 8 + 8] // src1
+ mov esi, [esp + 8 + 12] // alpha
+ mov edi, [esp + 8 + 16] // dst
+ mov ecx, [esp + 8 + 20] // width
+ sub eax, esi // make src0, src1 and dst relative to alpha so only esi advances
+ sub edx, esi
+ sub edi, esi
+
+ // 32 pixel loop.
+ convertloop32:
+ vmovdqu ymm0, [esi] // alpha
+ vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
+ vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
+ vpxor ymm3, ymm3, ymm5 // a, 255-a
+ vpxor ymm0, ymm0, ymm5 // a, 255-a
+ vmovdqu ymm1, [eax + esi] // src0
+ vmovdqu ymm2, [edx + esi] // src1
+ vpunpckhbw ymm4, ymm1, ymm2 // interleave src0, src1 bytes (same lane split as alpha)
+ vpunpcklbw ymm1, ymm1, ymm2
+ vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
+ vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
+ vpmaddubsw ymm3, ymm3, ymm4 // a * (src0 - 128) + (255 - a) * (src1 - 128) per word
+ vpmaddubsw ymm0, ymm0, ymm1
+ vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
+ vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
+ vpsrlw ymm3, ymm3, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm3 // lane-wise pack restores the original byte order
+ vmovdqu [edi + esi], ymm0 // store 32 blended bytes
+ lea esi, [esi + 32]
+ sub ecx, 32 // 32 pixels done
+ jg convertloop32
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_BLENDPLANEROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies each pixel's byte 3 into the low byte of two words; 0x80 entries produce zero.
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+
+// Blend src_argb0 over src_argb1 using src_argb0's alpha, 4 pixels at a time, then 1 pixel at a time for the remainder. Output alpha is set to 255.
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ sub ecx, 4 // pre-decrement so the 4 pixel loop can test with jge
+ jl convertloop4b // less than 4 pixels?
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 pixels done
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1 // undo the pre-decrement; ecx = remaining pixels - 1
+ jl convertloop1b // no pixels left
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1 // 1 pixel done
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha: each pixel's alpha byte fills both bytes of its 3 colour words; 128u entries zero the alpha word. Table 0 covers pixels 0..1, table 1 covers pixels 2..3.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm { // Multiplies the 3 colour bytes of each ARGB pixel by its alpha and keeps the original alpha; 4 pixels per iteration.
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, xmmword ptr kShuffleAlpha0
+ movdqa xmm5, xmmword ptr kShuffleAlpha1
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqu xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqu xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
+ pand xmm2, xmm3
+ psrlw xmm0, 8 // back to 8 bit range
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 pixels done
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: after byte unpacking, copies each pixel's alpha word (bytes 6,7 / 14,15) over its 3 colour words; 128u entries zero the alpha word.
+static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
+ 128u, 128u, 14u, 15u, 14u, 15u,
+ 14u, 15u, 128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm { // Multiplies the 3 colour bytes of each ARGB pixel by its alpha and keeps the original alpha; 8 pixels per iteration.
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst_argb - src_argb, so [eax + edx] addresses dst
+ vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8 // back to 8 bit range
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time: scales each pixel's channels by a per-alpha entry looked up in fixed_invtbl8 (table defined elsewhere in the file).
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ lea ebx, fixed_invtbl8 // table base; indexed by alpha * 4 below
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3 // combine the two pixels' multipliers into one register
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr [ebx + esi * 4]
+ movd xmm3, dword ptr [ebx + edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3 // combine the two pixels' multipliers into one register
+ pmulhuw xmm1, xmm2 // rgb * a
+ lea eax, [eax + 16]
+ packuswb xmm0, xmm1 // saturate back to bytes
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: per pixel, word 0 is copied over words 0..2 and word 3 is kept (bytes 6,7 / 14,15), giving the "1, a, a, a" multiplier layout noted in the row functions.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm { // USE_GATHER variant: unattenuates 8 pixels per iteration, fetching the per-alpha multipliers from fixed_invtbl8 with vpgatherdd.
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst_argb - src_argb, so [eax + edx] addresses dst
+ vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm { // Non-gather variant: unattenuates 8 pixels per iteration, loading the 8 per-alpha multipliers from fixed_invtbl8 with scalar lookups.
+
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov edx, [esp + 12 + 8] // dst_argb
+ mov ecx, [esp + 12 + 12] // width
+ sub edx, eax // edx = dst_argb - src_argb, so [eax + edx] addresses dst
+ lea ebx, fixed_invtbl8 // table base; indexed by alpha * 4 below
+ vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. The gray value replaces the 3 colour bytes; alpha is carried over from the source.
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, xmmword ptr kARGBToYJ // per-channel weights, defined elsewhere in the file
+ movdqa xmm5, xmmword ptr kAddYJ64 // rounding constant, defined elsewhere in the file
+
+ convertloop:
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4 // weighted sums of adjacent channel pairs
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1 // one weighted sum per pixel
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24 // alpha moved to low byte of each dword
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone; each table holds the B, G, R, A weights (alpha weight 0) repeated for 4 pixels.
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
+
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
+
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place in dst_argb. Alpha is carried over unchanged.
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, xmmword ptr kARGBToSepiaB
+ movdqa xmm3, xmmword ptr kARGBToSepiaG
+ movdqa xmm4, xmmword ptr kARGBToSepiaR
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6 // one weighted sum per pixel
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
+ psrld xmm6, 24 // alpha moved to low byte of each dword
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ movdqu [eax], xmm0 // write back over the source pixels
+ movdqu [eax + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) per iteration with a color matrix, reading src_argb and writing dst_argb.
+// Same as Sepia except matrix is provided: matrix_argb holds 16 signed bytes, 4 coefficients for each of the B, G, R and A outputs, and each weighted sum is shifted right by 6.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx]  // all 16 coefficients
+ pshufd xmm2, xmm5, 0x00  // broadcast coefficients 0..3 (B output)
+ pshufd xmm3, xmm5, 0x55  // broadcast coefficients 4..7 (G output)
+ pshufd xmm4, xmm5, 0xaa  // broadcast coefficients 8..11 (R output)
+ pshufd xmm5, xmm5, 0xff  // broadcast coefficients 12..15 (A output)
+ mov ecx, [esp + 16] /* width */
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm0, xmm2  // unsigned pixel bytes * signed coefficients, pairs summed
+ pmaddubsw xmm7, xmm2
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm6, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
+ lea eax, [eax + 32]  // advance source by 8 pixels
+ lea edx, [edx + 32]  // advance destination by 8 pixels
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) per iteration, in place: each B, G, R byte becomes ((v * scale) >> 16) * interval_size + interval_offset, saturated to 8 bits; the original alpha byte is kept.
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h  // words {w0, w0, w0, w1}: low 16 bits for B, G, R; the alpha lane gets the upper 16 bits of the argument (0 when it fits in 16 bits)
+ pshufd xmm2, xmm2, 044h  // duplicate the low 4 words into the high 4 words
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqu xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqu xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_offset
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1  // back to bytes with unsigned saturation
+ por xmm0, xmm7  // merge the original alpha back in
+ movdqu [eax], xmm0  // store over the source pixels
+ lea eax, [eax + 16]
+ sub ecx, 4
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 4 pixels
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value: every byte of each pixel is scaled by the matching byte of value (a packed 4 byte ARGB constant).
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ punpcklbw xmm2, xmm2  // each value byte v -> word v * 0x0101
+ punpcklqdq xmm2, xmm2  // repeat the 4 words for the second pixel of a pair
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8  // keep the high byte of each 16 bit product
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 4 pixels
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. Per byte: dst = ((a * 0x0101) * b) >> 16, with a from src_argb0 and b from src_argb1.
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0  // copies for the upper 2 pixels
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 4 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together with unsigned byte saturation, 4 pixels at a time, followed by a 1 pixel loop for the remainder, so any width is handled without touching pixels past width.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ sub ecx, 4
+ jl convertloop49  // fewer than 4 pixels: skip the 4 pixel loop
+
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4  // loop while at least 4 pixels remain
+
+ convertloop49:
+ add ecx, 4 - 1  // ecx = remaining pixels - 1
+ jl convertloop19  // nothing left
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixels from src_argb0
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixels from src_argb1
+ lea esi, [esi + 4]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation, 4 pixels at a time.
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 4 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. Per byte: dst = ((a * 0x0101) * b) >> 16, with a from src_argb0 and b from src_argb1.
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1  // unpack and pack both work per 128 bit lane, so pixel order is restored
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+
+ pop esi
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together with unsigned byte saturation, 8 pixels at a time.
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+
+ pop esi
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation, 8 pixels at a time.
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+
+ pop esi
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is (applied to rows src_y0, src_y1, src_y2; output is the absolute value saturated to a byte, 8 pixels per iteration)
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax  // turn the other pointers into offsets from src_y0 so only eax advances
+ sub edi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5  // zero extend bytes to words
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1  // row 0: y0[x] - y0[x + 2]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2  // row 1: y1[x] - y1[x + 2]
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3  // row 2: y2[x] - y2[x + 2]
+ paddw xmm0, xmm2  // row 0 + row 2
+ paddw xmm0, xmm1  // + 2 * row 1 (added twice)
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0  // saturate to 8 bytes
+ movq qword ptr [eax + edx], xmm0  // eax + edx is the current dst_sobelx position
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is (top row taken from src_y0, bottom row from src_y1; output is the absolute value saturated to a byte, 8 pixels per iteration)
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax  // turn the other pointers into offsets from src_y0 so only eax advances
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5  // zero extend bytes to words
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1  // column 0: y0[x] - y1[x]
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2  // column 1: y0[x + 1] - y1[x + 1]
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3  // column 2: y0[x + 2] - y1[x + 2]
+ paddw xmm0, xmm2  // column 0 + column 2
+ paddw xmm0, xmm1  // + 2 * column 1 (added twice)
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0  // saturate to 8 bytes
+ movq qword ptr [eax + edx], xmm0  // eax + edx is the current dst_sobely position
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels (64 output bytes) per iteration.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax  // src_sobely as an offset from src_sobelx
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ movdqu [edx], xmm1  // pixels 0..3
+ movdqu [edx + 16], xmm2  // pixels 4..7
+ movdqu [edx + 32], xmm3  // pixels 8..11
+ movdqu [edx + 48], xmm0  // pixels 12..15
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into a plane, one byte per pixel, 16 pixels per iteration.
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_y
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax  // src_sobely as an offset from src_sobelx
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel (saturating X + Y) into ARGB, 16 pixels (64 output bytes) per iteration.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax  // src_sobely as an offset from src_sobelx
+ pcmpeqb xmm5, xmm5 // alpha 255
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5  // X, A pairs for pixels 0..7
+ punpckhbw xmm0, xmm5  // X, A pairs for pixels 8..15
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2  // Y, S pairs for pixels 0..7
+ punpckhbw xmm1, xmm2  // Y, S pairs for pixels 8..15
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ movdqu [edx], xmm6  // pixels 0..3
+ movdqu [edx + 16], xmm4  // pixels 4..7
+ movdqu [edx + 32], xmm7  // pixels 8..11
+ movdqu [edx + 48], xmm1  // pixels 12..15
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel: sum = topleft - topright - botleft + botright, average = sum / area, per channel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+// in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, then 1 pixel at a time for the remainder. Each pixel is 4 int32 sums (16 bytes) in the buffer and 4 bytes in dst.
+// This function requires alignment on accumulation buffer pointers (psubd/paddd are used with memory operands). Not naked: arguments are referenced by name, and esi/edi are left to the compiler to preserve (assumed MSVC inline asm behavior, confirm).
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
+ int width,
+ int area,
+ uint8_t* dst,
+ int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width  // edx * 4 is the byte offset from the left to the right column
+ movd xmm5, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm5, xmm5  // area as float
+ rcpss xmm4, xmm5 // 1.0f / area
+ pshufd xmm4, xmm4, 0  // broadcast the (approximate) reciprocal to all 4 lanes
+ sub ecx, 4
+ jl l4b  // fewer than 4 pixels: go straight to the 1 pixel loop
+
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4  // larger areas use the float loop
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts. NOTE(review): signed saturation caps this at 32767, which looks too small for area 1 or 2; confirm callers never pass those.
+
+ // 4 pixel loop small blocks.
+ s4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5  // sum * (0.16 fixed point reciprocal) >> 16
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2  // 16 bytes = 4 output pixels
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4  // loop while at least 4 pixels remain
+
+ jmp l4b  // handle the remainder
+
+ // 4 pixel loop
+ l4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0  // back to int32
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1  // int32 -> int16
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2  // int16 -> uint8, 4 output pixels
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4  // loop while at least 4 pixels remain
+
+ l4b:
+ add ecx, 4 - 1  // ecx = remaining pixels - 1
+ jl l1b  // nothing left
+
+ // 1 pixel loop
+ l1:
+ movdqu xmm0, [eax]  // top left
+ psubd xmm0, [eax + edx * 4]  // - top right
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]  // - bottom left
+ paddd xmm0, [esi + edx * 4]  // + bottom right
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4  // * 1 / area
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0  // store 1 pixel
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value. For each 4 byte pixel of row, writes 4 int32 to cumsum: running sum of this row so far (per channel) + previous_cumsum entry above. Not naked: arguments are referenced by name.
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ pxor xmm0, xmm0  // xmm0 = running per-channel sum for this row
+ pxor xmm1, xmm1  // xmm1 = constant 0 for zero extension
+
+ sub ecx, 4
+ jl l4b  // fewer than 4 pixels: use the 1 pixel loop
+ test edx, 15
+ jne l4b  // cumsum not 16 byte aligned: use the 1 pixel loop for the whole row
+
+ // 4 pixel loop
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1  // pixels 0, 1 as words
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1  // pixel 0 as 4 int32
+ punpckhwd xmm3, xmm1  // pixel 1 as 4 int32
+
+ punpckhbw xmm4, xmm1  // pixels 2, 3 as words
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1  // pixel 2 as 4 int32
+ punpckhwd xmm5, xmm1  // pixel 3 as 4 int32
+
+ paddd xmm0, xmm2  // running sum including pixel 0
+ movdqu xmm2, [esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3  // running sum including pixel 1
+ movdqu xmm3, [esi + 16]
+ paddd xmm3, xmm0
+
+ paddd xmm0, xmm4  // running sum including pixel 2
+ movdqu xmm4, [esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5  // running sum including pixel 3
+ movdqu xmm5, [esi + 48]
+ lea esi, [esi + 64]
+ paddd xmm5, xmm0
+
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4  // loop while at least 4 pixels remain
+
+ l4b:
+ add ecx, 4 - 1  // ecx = remaining pixels - 1
+ jl l1b  // nothing left
+
+ // 1 pixel loop
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1  // bytes -> words
+ punpcklwd xmm2, xmm1  // words -> int32
+ paddd xmm0, xmm2  // add to the running row sum
+ movdqu xmm2, [esi]  // previous row above
+ lea esi, [esi + 16]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination. uv_dudv holds 4 floats {u, v, du, dv}: for each output pixel the source pixel at byte offset (int)u * 4 + (int)v * stride is copied, then (u, v) += (du, dv). Coordinates are truncated toward zero and packed to 16 bits, and stride is used as a 16 bit value.
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+ int src_argb_stride,
+ uint8_t* dst_argb,
+ const float* uv_dudv,
+ int width) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4  // esi = word pair {4, stride} used as pmaddwd multipliers
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b  // fewer than 4 pixels: use the 1 pixel loop
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0  // xmm2 = x0, y0, x1, y1
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0  // offset of pixel 0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0  // offset of pixel 1
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0  // offset of pixel 2
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0  // offset of pixel 3
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ movq qword ptr 8[edx], xmm6  // store pixels 2 and 3 at dst + 8
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge l4  // loop while at least 4 pixels remain
+
+ l4b:
+ add ecx, 4 - 1  // ecx = remaining pixels - 1
+ jl l1b  // nothing left
+
+ // 1 pixel loop
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1: blends the row at src_ptr with the row at src_ptr + src_stride, 32 bytes per iteration. dst = (src * (256 - f) + src_next * f + 128) >> 8 with f = source_y_fraction; f == 0 copies the row and f == 128 uses a byte average.
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ sub edi, esi  // from here on edi is dst - src, so [esi + edi] addresses dst (done after the jump above because rep movsb needs the real dst pointer)
+ cmp eax, 128
+ je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
+
+ vmovd xmm0, eax // high fraction 0..255
+ neg eax
+ add eax, 256
+ vmovd xmm5, eax // low fraction 255..1 (256 - fraction; fraction 0 was dispatched above)
+ vpunpcklbw xmm5, xmm5, xmm0  // byte pair {low fraction, high fraction}
+ vpunpcklwd xmm5, xmm5, xmm5
+ vbroadcastss ymm5, xmm5  // the pair repeated across the register
+
+ mov eax, 0x80808080 // 128b for bias and rounding.
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+
+ xloop:
+ vmovdqu ymm0, [esi]  // 32 bytes of the current row
+ vmovdqu ymm2, [esi + edx]  // 32 bytes of the next row
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2  // interleave {current, next} byte pairs
+ vpsubb ymm1, ymm1, ymm4 // bias to signed image
+ vpsubb ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm5, ymm1  // unsigned fractions * signed biased pixels, pairs summed
+ vpmaddubsw ymm0, ymm5, ymm0
+ vpaddw ymm1, ymm1, ymm4 // unbias and round
+ vpaddw ymm0, ymm0, ymm4
+ vpsrlw ymm1, ymm1, 8
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop  // loop while bytes remain; the last iteration is always a full 32 bytes
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]  // rounded average of the two rows
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb  // copy ecx (dst_width) bytes from [esi] to [edi]
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, 16 bytes per iteration. dst = (src * (256 - f) + src_next * f + 128) >> 8 with f = source_y_fraction; f == 0 copies the row and f == 128 uses a byte average.
+// TODO(fbarchard): Consider allowing 256 using memcpy.
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi  // edi = dst - src, so [esi + edi] addresses dst
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 /256. Blend 100 / 0.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+
+ movd xmm0, eax // high fraction 0..255
+ neg eax
+ add eax, 256
+ movd xmm5, eax // low fraction 255..1
+ punpcklbw xmm5, xmm0  // byte pair {low fraction, high fraction}
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0  // the pair repeated across the register
+ mov eax, 0x80808080 // 128 for biasing image to signed.
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0x00
+
+ xloop:
+ movdqu xmm0, [esi]  // 16 bytes of the current row
+ movdqu xmm2, [esi + edx]  // 16 bytes of the next row
+ movdqu xmm1, xmm0
+ punpcklbw xmm0, xmm2  // interleave {current, next} byte pairs
+ punpckhbw xmm1, xmm2
+ psubb xmm0, xmm4 // bias image by -128
+ psubb xmm1, xmm4
+ movdqa xmm2, xmm5  // pmaddubsw overwrites its first operand, so copy the fractions
+ movdqa xmm3, xmm5
+ pmaddubsw xmm2, xmm0  // unsigned fractions * signed biased pixels, pairs summed
+ pmaddubsw xmm3, xmm1
+ paddw xmm2, xmm4  // add 0x8080 per word: undo the bias and round
+ paddw xmm3, xmm4
+ psrlw xmm2, 8
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [esi + edi], xmm2
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop  // loop while bytes remain; the last iteration is always a full 16 bytes
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1  // rounded average of the two rows
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Reorders bytes with pshufb using the 16 byte control mask at shuffler, 8 pixels (32 bytes) per iteration.
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqu xmm5, [ecx]  // 16 byte shuffle control
+ mov ecx, [esp + 16] // width
+
+ wloop:
+ movdqu xmm0, [eax]  // pixels 0..3
+ movdqu xmm1, [eax + 16]  // pixels 4..7
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop  // loop while pixels remain; the last iteration is always a full 8 pixels
+ ret
+ }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,  // Reorders bytes with vpshufb using the 16 byte control mask at shuffler, 16 pixels (64 bytes) per iteration.
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // width
+
+ wloop:
+ vmovdqu ymm0, [eax]  // pixels 0..7
+ vmovdqu ymm1, [eax + 32]  // pixels 8..15
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg wloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,  // Packs separate Y, U, V rows into Y U Y V byte order in dst_frame: 16 pixels (16 Y, 8 U, 8 V -> 32 bytes) per iteration.
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi  // src_v as an offset from src_u
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0  // first 8 pixels
+ movdqu [edi + 16], xmm1  // next 8 pixels
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,  // Packs separate Y, U, V rows into U Y V Y byte order in dst_frame: 16 pixels (16 Y, 8 U, 8 V -> 32 bytes) per iteration.
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ __asm {
+ push esi  // esi and edi are used below and restored before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi  // src_v as an offset from src_u
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1  // first 8 pixels
+ movdqu [edi + 16], xmm2  // next 8 pixels
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 16 pixels
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,  // Applies a cubic polynomial C0 + C1*X + C2*X^2 + C3*X^3 to every channel, 2 pixels per iteration. poly holds 4 groups of 4 floats (C0, C1, C2, C3), one float per channel in each group; it is used as an SSE memory operand, so it must be 16 byte aligned.
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ __asm {
+ push esi  // esi is used below and restored before ret
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ convertloop:
+ // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+ // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3  // bytes -> words
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0  // float -> int32, truncating
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4  // two packuswb steps narrow the 8 int32 results to 8 bytes
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0  // store 2 pixels
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 2 pixels
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,  // Applies a cubic polynomial C0 + C1*X + C2*X^2 + C3*X^3 to every channel, 2 pixels per iteration, using fused multiply-add. poly holds 4 groups of 4 floats (C0, C1, C2, C3), one float per channel in each group.
+ uint8_t* dst_argb,
+ const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0  // float -> int32, truncating
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0  // store 2 pixels
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop  // loop while pixels remain; the last iteration is always a full 2 pixels
+ vzeroupper  // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+ #ifdef HAS_HALFFLOATROW_SSE2
+ // Multiplier folded into scale before the conversion loop. The value is
+ // approximately 2^-112; presumably it rebiases the float exponent so that a
+ // plain 13-bit right shift of the float bits yields a half float - TODO
+ // confirm.
+ static float kExpBias = 1.9259299444e-34f;
+ // Converts 16-bit unsigned values from src into 16-bit results in dst:
+ // each value is zero-extended, converted to float, multiplied by
+ // scale * kExpBias, then the float bit pattern is shifted right by 13 and
+ // packed to 16 bits. Eight values (16 bytes) are processed per iteration;
+ // the loop repeats while the width counter is still above zero after
+ // subtracting 8.
+ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ mulss xmm4, kExpBias
+ pshufd xmm4, xmm4, 0 // replicate the multiplier to all 4 lanes
+ pxor xmm5, xmm5 // zero, used to widen shorts to ints
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+
+ // 8 pixel loop.
+ convertloop:
+ movdqu xmm2, xmmword ptr [eax] // 8 shorts
+ add eax, 16
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm5
+ cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
+ punpckhwd xmm3, xmm5
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ psrld xmm2, 13
+ psrld xmm3, 13
+ packssdw xmm2, xmm3
+ // eax was already advanced by 16, hence the "- 16" to reach this block's
+ // destination.
+ movdqu [eax + edx - 16], xmm2
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+ }
+ #endif // HAS_HALFFLOATROW_SSE2
+
+ #ifdef HAS_HALFFLOATROW_AVX2
+ // AVX2 version of the 16-bit integer to half float row conversion: values
+ // are zero-extended, converted to float, multiplied by scale * kExpBias,
+ // shifted right by 13 bits and packed to 16 bits. Sixteen values (32 bytes)
+ // are processed per iteration; the loop repeats while the width counter is
+ // still above zero after subtracting 16.
+ // NOTE(review): kExpBias is declared under the HAS_HALFFLOATROW_SSE2 guard,
+ // so this function relies on that macro being defined as well.
+ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ movd xmm4, dword ptr [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+
+ vmulss xmm4, xmm4, kExpBias
+ vbroadcastss ymm4, xmm4 // replicate the multiplier to all 8 lanes
+ vpxor ymm5, ymm5, ymm5 // zero, used to widen shorts to ints
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+
+ // 16 pixel loop.
+ convertloop:
+ vmovdqu ymm2, [eax] // 16 shorts
+ add eax, 32
+ vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
+ vpunpcklwd ymm2, ymm2, ymm5
+ vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
+ vcvtdq2ps ymm2, ymm2
+ vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
+ vmulps ymm2, ymm2, ymm4
+ vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
+ vpsrld ymm2, ymm2, 13
+ vpackssdw ymm2, ymm2, ymm3
+ // eax was already advanced by 32, hence the "- 32" to reach this block's
+ // destination.
+ vmovdqu [eax + edx - 32], ymm2
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+ }
+ #endif // HAS_HALFFLOATROW_AVX2
+
+ #ifdef HAS_HALFFLOATROW_F16C
+ // Converts 16-bit unsigned values from src into half floats in dst using the
+ // F16C vcvtps2ph instruction: each value is zero-extended, converted to
+ // float, multiplied by scale and then narrowed to half precision with
+ // truncation (immediate 3). Sixteen values (32 bytes in, 32 bytes out) are
+ // processed per iteration; the loop repeats while the width counter is still
+ // above zero after subtracting 16.
+ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src */
+ mov edx, [esp + 8] /* dst */
+ vbroadcastss ymm4, [esp + 12] /* scale */
+ mov ecx, [esp + 16] /* width */
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+
+ // 16 pixel loop.
+ convertloop:
+ vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
+ vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
+ add eax, 32
+ vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
+ vcvtdq2ps ymm3, ymm3
+ vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
+ vmulps ymm3, ymm3, ymm4
+ vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
+ vcvtps2ph xmm3, ymm3, 3
+ // eax has already been advanced past the 32 bytes just read, so the
+ // matching destination is 32 bytes back from eax + edx (same scheme as the
+ // "- 16" / "- 32" stores in HalfFloatRow_SSE2 / HalfFloatRow_AVX2).
+ vmovdqu [eax + edx - 32], xmm2
+ vmovdqu [eax + edx - 32 + 16], xmm3
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+ }
+ #endif // HAS_HALFFLOATROW_F16C
+
+ #ifdef HAS_ARGBCOLORTABLEROW_X86
+ // Transform ARGB pixels with color table.
+ // Works in place on dst_argb: for each of the 4 bytes of a pixel, the byte
+ // value v at channel offset c (0..3) is replaced by table_argb[v * 4 + c].
+ // One pixel (4 bytes) is handled per iteration; the loop repeats while the
+ // width counter is still above zero after decrementing.
+ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ __asm {
+ // esi is saved here and restored before ret; the extra "+ 4" in the
+ // argument offsets below accounts for this push.
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4] // advance early; the pixel is now at [eax - 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+ }
+ #endif // HAS_ARGBCOLORTABLEROW_X86
+
+ #ifdef HAS_RGBCOLORTABLEROW_X86
+ // Transform RGB pixels with color table.
+ // Works in place on dst_argb: for the first 3 bytes of each 4-byte pixel,
+ // the byte value v at channel offset c (0..2) is replaced by
+ // table_argb[v * 4 + c]. The fourth byte of each pixel is not read or
+ // written. One pixel is handled per iteration; the loop repeats while the
+ // width counter is still above zero after decrementing.
+ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
+ __asm {
+ // esi is saved here and restored before ret; the extra "+ 4" in the
+ // argument offsets below accounts for this push.
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4] // advance early; the pixel is now at [eax - 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+ }
+ #endif // HAS_RGBCOLORTABLEROW_X86
+
+ #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+ // Transform RGB pixels with luma table.
+ // For each pixel a table pointer is derived from the pixel itself: the four
+ // bytes are multiplied by the four bytes of lumacoeff and summed
+ // (pmaddubsw + phaddw), the low 8 bits of that sum are cleared, and the
+ // result is added to the luma base pointer. The first three bytes of the
+ // pixel are then each replaced by the byte found at that pointer plus the
+ // original byte value; the fourth byte (alpha) is copied unchanged.
+ // Four pixels (16 bytes) are processed per iteration; the loop repeats while
+ // the width counter is still above zero after subtracting 4.
+ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
+ __asm {
+ // esi and edi are saved here and restored before ret; the "8 +" in the
+ // argument offsets below accounts for these two pushes.
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0 // replicate table base to all 4 lanes
+ pshufd xmm3, xmm3, 0 // replicate coefficients to all 4 lanes
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, xmmword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5 // widen the 4 per-pixel words to dwords
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0 // esi = table pointer for pixel 0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0 // esi = table pointer for pixel 1
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0 // esi = table pointer for pixel 2
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0 // esi = table pointer for pixel 3
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ lea eax, [eax + 16]
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+ }
+ #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(_M_X64)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
diff --git a/third_party/aom/third_party/libyuv/source/scale.cc b/third_party/aom/third_party/libyuv/source/scale.cc
new file mode 100644
index 0000000000..cf3c033257
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1935 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+ // Returns the magnitude of v: v itself when it is non-negative, -v otherwise.
+ static __inline int Abs(int v) {
+   if (v < 0) {
+     return -v;
+   }
+   return v;
+ }
+
+ // Computes (|v| + a) >> s and gives the result the sign of v, so negative
+ // sizes shrink symmetrically with positive ones. Every argument and the
+ // whole expansion are parenthesized so the macro is safe to use with
+ // compound arguments and inside larger expressions. Note that v is
+ // evaluated more than once.
+ #define SUBSAMPLE(v, a, s) \
+   (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+
+ // Scale plane, 1/2
+ // Fast path for shrinking a plane to half of its original size. A row
+ // kernel matching the filter mode is selected (the C version first, then
+ // overridden by every SIMD variant that is compiled in and reported by
+ // TestCpuFlag, later ones winning), and it is called once per destination
+ // row while the source pointer advances two rows.
+
+ static void ScalePlaneDown2(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             enum FilterMode filtering) {
+   int row;
+   // Captured before src_stride is possibly zeroed below.
+   const int src_step = src_stride << 1;
+   void (*RowKernel)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                     uint8_t* dst_ptr, int dst_width);
+   (void)src_width;
+   (void)src_height;
+
+   if (filtering == kFilterNone) {
+     RowKernel = ScaleRowDown2_C;
+   } else if (filtering == kFilterLinear) {
+     RowKernel = ScaleRowDown2Linear_C;
+   } else {
+     RowKernel = ScaleRowDown2Box_C;
+   }
+
+   if (!filtering) {
+     src_ptr += src_stride;  // Point to odd rows.
+     src_stride = 0;
+   }
+
+ #if defined(HAS_SCALEROWDOWN2_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     const int aligned = IS_ALIGNED(dst_width, 16);
+     if (filtering == kFilterNone) {
+       RowKernel = aligned ? ScaleRowDown2_NEON : ScaleRowDown2_Any_NEON;
+     } else if (filtering == kFilterLinear) {
+       RowKernel =
+           aligned ? ScaleRowDown2Linear_NEON : ScaleRowDown2Linear_Any_NEON;
+     } else {
+       RowKernel = aligned ? ScaleRowDown2Box_NEON : ScaleRowDown2Box_Any_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     const int aligned = IS_ALIGNED(dst_width, 16);
+     if (filtering == kFilterNone) {
+       RowKernel = aligned ? ScaleRowDown2_SSSE3 : ScaleRowDown2_Any_SSSE3;
+     } else if (filtering == kFilterLinear) {
+       RowKernel =
+           aligned ? ScaleRowDown2Linear_SSSE3 : ScaleRowDown2Linear_Any_SSSE3;
+     } else {
+       RowKernel =
+           aligned ? ScaleRowDown2Box_SSSE3 : ScaleRowDown2Box_Any_SSSE3;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     const int aligned = IS_ALIGNED(dst_width, 32);
+     if (filtering == kFilterNone) {
+       RowKernel = aligned ? ScaleRowDown2_AVX2 : ScaleRowDown2_Any_AVX2;
+     } else if (filtering == kFilterLinear) {
+       RowKernel =
+           aligned ? ScaleRowDown2Linear_AVX2 : ScaleRowDown2Linear_Any_AVX2;
+     } else {
+       RowKernel = aligned ? ScaleRowDown2Box_AVX2 : ScaleRowDown2Box_Any_AVX2;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_MMI)
+   if (TestCpuFlag(kCpuHasMMI)) {
+     const int aligned = IS_ALIGNED(dst_width, 8);
+     if (filtering == kFilterNone) {
+       RowKernel = aligned ? ScaleRowDown2_MMI : ScaleRowDown2_Any_MMI;
+     } else if (filtering == kFilterLinear) {
+       RowKernel =
+           aligned ? ScaleRowDown2Linear_MMI : ScaleRowDown2Linear_Any_MMI;
+     } else {
+       RowKernel = aligned ? ScaleRowDown2Box_MMI : ScaleRowDown2Box_Any_MMI;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     const int aligned = IS_ALIGNED(dst_width, 32);
+     if (filtering == kFilterNone) {
+       RowKernel = aligned ? ScaleRowDown2_MSA : ScaleRowDown2_Any_MSA;
+     } else if (filtering == kFilterLinear) {
+       RowKernel =
+           aligned ? ScaleRowDown2Linear_MSA : ScaleRowDown2Linear_Any_MSA;
+     } else {
+       RowKernel = aligned ? ScaleRowDown2Box_MSA : ScaleRowDown2Box_Any_MSA;
+     }
+   }
+ #endif
+
+   // Linear filtering must not blend with the next row, so the kernel is
+   // handed a zero stride.
+   if (filtering == kFilterLinear) {
+     src_stride = 0;
+   }
+   // TODO(fbarchard): Loop through source height to allow odd height.
+   for (row = 0; row < dst_height; ++row) {
+     RowKernel(src_ptr, src_stride, dst_ptr, dst_width);
+     src_ptr += src_step;
+     dst_ptr += dst_stride;
+   }
+ }
+
+ // 16-bit sample variant of the 1/2 plane scaler. Selects a row kernel for
+ // the filter mode (C first, then any SIMD variant that is compiled in,
+ // enabled and whose width alignment requirement is met) and calls it once
+ // per destination row while the source pointer advances two rows.
+ static void ScalePlaneDown2_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                enum FilterMode filtering) {
+   int row;
+   // Captured before src_stride is possibly zeroed below.
+   const int src_step = src_stride << 1;
+   void (*RowKernel)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                     uint16_t* dst_ptr, int dst_width);
+   (void)src_width;
+   (void)src_height;
+
+   if (filtering == kFilterNone) {
+     RowKernel = ScaleRowDown2_16_C;
+   } else if (filtering == kFilterLinear) {
+     RowKernel = ScaleRowDown2Linear_16_C;
+   } else {
+     RowKernel = ScaleRowDown2Box_16_C;
+   }
+
+   if (!filtering) {
+     src_ptr += src_stride;  // Point to odd rows.
+     src_stride = 0;
+   }
+
+ #if defined(HAS_SCALEROWDOWN2_16_NEON)
+   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+     if (filtering) {
+       RowKernel = ScaleRowDown2Box_16_NEON;
+     } else {
+       RowKernel = ScaleRowDown2_16_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_16_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+     if (filtering == kFilterNone) {
+       RowKernel = ScaleRowDown2_16_SSE2;
+     } else if (filtering == kFilterLinear) {
+       RowKernel = ScaleRowDown2Linear_16_SSE2;
+     } else {
+       RowKernel = ScaleRowDown2Box_16_SSE2;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN2_16_MMI)
+   if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+     if (filtering == kFilterNone) {
+       RowKernel = ScaleRowDown2_16_MMI;
+     } else if (filtering == kFilterLinear) {
+       RowKernel = ScaleRowDown2Linear_16_MMI;
+     } else {
+       RowKernel = ScaleRowDown2Box_16_MMI;
+     }
+   }
+ #endif
+
+   // Linear filtering must not blend with the next row, so the kernel is
+   // handed a zero stride.
+   if (filtering == kFilterLinear) {
+     src_stride = 0;
+   }
+   // TODO(fbarchard): Loop through source height to allow odd height.
+   for (row = 0; row < dst_height; ++row) {
+     RowKernel(src_ptr, src_stride, dst_ptr, dst_width);
+     src_ptr += src_step;
+     dst_ptr += dst_stride;
+   }
+ }
+
+ // Scale plane, 1/4
+ // Fast path for shrinking a plane to a quarter of its original size. A row
+ // kernel is selected (box when any filtering is requested, point sampling
+ // otherwise; the C version first, then overridden by every SIMD variant
+ // that is compiled in and reported by TestCpuFlag) and is called once per
+ // destination row while the source pointer advances four rows.
+
+ static void ScalePlaneDown4(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             enum FilterMode filtering) {
+   int row;
+   // Captured before src_stride is possibly zeroed below.
+   const int src_step = src_stride << 2;
+   void (*RowKernel)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                     uint8_t* dst_ptr, int dst_width);
+   (void)src_width;
+   (void)src_height;
+
+   if (filtering) {
+     RowKernel = ScaleRowDown4Box_C;
+   } else {
+     RowKernel = ScaleRowDown4_C;
+   }
+
+   if (!filtering) {
+     src_ptr += src_stride * 2;  // Point to row 2.
+     src_stride = 0;
+   }
+ #if defined(HAS_SCALEROWDOWN4_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     if (IS_ALIGNED(dst_width, 8)) {
+       RowKernel = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+     } else {
+       RowKernel =
+           filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     if (IS_ALIGNED(dst_width, 8)) {
+       RowKernel = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
+     } else {
+       RowKernel =
+           filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     if (IS_ALIGNED(dst_width, 16)) {
+       RowKernel = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+     } else {
+       RowKernel =
+           filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_MMI)
+   if (TestCpuFlag(kCpuHasMMI)) {
+     if (IS_ALIGNED(dst_width, 8)) {
+       RowKernel = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+     } else {
+       RowKernel =
+           filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     if (IS_ALIGNED(dst_width, 16)) {
+       RowKernel = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+     } else {
+       RowKernel =
+           filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+     }
+   }
+ #endif
+
+   // Linear filtering must not blend across rows, so the kernel is handed a
+   // zero stride.
+   if (filtering == kFilterLinear) {
+     src_stride = 0;
+   }
+   for (row = 0; row < dst_height; ++row) {
+     RowKernel(src_ptr, src_stride, dst_ptr, dst_width);
+     src_ptr += src_step;
+     dst_ptr += dst_stride;
+   }
+ }
+
+ // 16-bit sample variant of the 1/4 plane scaler. Uses the box kernel when
+ // any filtering is requested and point sampling otherwise; a SIMD kernel
+ // replaces the C one only when it is compiled in, enabled and dst_width is
+ // a multiple of 8. The source pointer advances four rows per output row.
+ static void ScalePlaneDown4_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
+                                enum FilterMode filtering) {
+   int row;
+   // Captured before src_stride is possibly zeroed below.
+   const int src_step = src_stride << 2;
+   void (*RowKernel)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                     uint16_t* dst_ptr, int dst_width);
+   (void)src_width;
+   (void)src_height;
+
+   if (filtering) {
+     RowKernel = ScaleRowDown4Box_16_C;
+   } else {
+     RowKernel = ScaleRowDown4_16_C;
+   }
+
+   if (!filtering) {
+     src_ptr += src_stride * 2;  // Point to row 2.
+     src_stride = 0;
+   }
+ #if defined(HAS_SCALEROWDOWN4_16_NEON)
+   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+     if (filtering) {
+       RowKernel = ScaleRowDown4Box_16_NEON;
+     } else {
+       RowKernel = ScaleRowDown4_16_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_16_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+     if (filtering) {
+       RowKernel = ScaleRowDown4Box_16_SSE2;
+     } else {
+       RowKernel = ScaleRowDown4_16_SSE2;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN4_16_MMI)
+   if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+     if (filtering) {
+       RowKernel = ScaleRowDown4Box_16_MMI;
+     } else {
+       RowKernel = ScaleRowDown4_16_MMI;
+     }
+   }
+ #endif
+
+   // Linear filtering must not blend across rows, so the kernel is handed a
+   // zero stride.
+   if (filtering == kFilterLinear) {
+     src_stride = 0;
+   }
+   for (row = 0; row < dst_height; ++row) {
+     RowKernel(src_ptr, src_stride, dst_ptr, dst_width);
+     src_ptr += src_step;
+     dst_ptr += dst_stride;
+   }
+ }
+
+ // Scale plane down, 3/4
+ // Each pass of the main loop consumes 4 source rows and emits 3 destination
+ // rows, alternating between two row kernels. The third row of a group is
+ // produced by the first kernel applied from the following source row with a
+ // negated stride. Kernels start as the C versions and are overridden by
+ // every SIMD variant that is compiled in and reported by TestCpuFlag.
+ static void ScalePlaneDown34(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              enum FilterMode filtering) {
+   int rows_done = 0;
+   void (*Row0)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr,
+                int dst_width);
+   void (*Row1)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr,
+                int dst_width);
+   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+   (void)src_width;
+   (void)src_height;
+   assert(dst_width % 3 == 0);
+
+   Row0 = filtering ? ScaleRowDown34_0_Box_C : ScaleRowDown34_C;
+   Row1 = filtering ? ScaleRowDown34_1_Box_C : ScaleRowDown34_C;
+ #if defined(HAS_SCALEROWDOWN34_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     const int exact = (dst_width % 24 == 0);
+     if (!filtering) {
+       Row0 = Row1 = exact ? ScaleRowDown34_NEON : ScaleRowDown34_Any_NEON;
+     } else {
+       Row0 = exact ? ScaleRowDown34_0_Box_NEON : ScaleRowDown34_0_Box_Any_NEON;
+       Row1 = exact ? ScaleRowDown34_1_Box_NEON : ScaleRowDown34_1_Box_Any_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN34_MMI)
+   // MMI only provides the unfiltered kernel.
+   if (TestCpuFlag(kCpuHasMMI) && !filtering) {
+     Row0 = Row1 =
+         (dst_width % 24 == 0) ? ScaleRowDown34_MMI : ScaleRowDown34_Any_MMI;
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN34_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     const int exact = (dst_width % 48 == 0);
+     if (!filtering) {
+       Row0 = Row1 = exact ? ScaleRowDown34_MSA : ScaleRowDown34_Any_MSA;
+     } else {
+       Row0 = exact ? ScaleRowDown34_0_Box_MSA : ScaleRowDown34_0_Box_Any_MSA;
+       Row1 = exact ? ScaleRowDown34_1_Box_MSA : ScaleRowDown34_1_Box_Any_MSA;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN34_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     const int exact = (dst_width % 24 == 0);
+     if (!filtering) {
+       Row0 = Row1 = exact ? ScaleRowDown34_SSSE3 : ScaleRowDown34_Any_SSSE3;
+     } else {
+       Row0 =
+           exact ? ScaleRowDown34_0_Box_SSSE3 : ScaleRowDown34_0_Box_Any_SSSE3;
+       Row1 =
+           exact ? ScaleRowDown34_1_Box_SSSE3 : ScaleRowDown34_1_Box_Any_SSSE3;
+     }
+   }
+ #endif
+
+   // Full groups: 4 source rows in, 3 destination rows out.
+   while (rows_done < dst_height - 2) {
+     Row0(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride;
+     dst_ptr += dst_stride;
+     Row1(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride;
+     dst_ptr += dst_stride;
+     Row0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 2;
+     dst_ptr += dst_stride;
+     rows_done += 3;
+   }
+
+   // Remainder 1 or 2 rows with last row vertically unfiltered
+   switch (dst_height % 3) {
+     case 2:
+       Row0(src_ptr, filter_stride, dst_ptr, dst_width);
+       src_ptr += src_stride;
+       dst_ptr += dst_stride;
+       Row1(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     case 1:
+       Row0(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     default:
+       break;
+   }
+ }
+
+ // 16-bit sample variant of the 3/4 plane scaler. Each pass of the main loop
+ // consumes 4 source rows and emits 3 destination rows; SIMD kernels replace
+ // the C ones only when compiled in, enabled and dst_width is a multiple of
+ // 24.
+ static void ScalePlaneDown34_16(int src_width,
+                                 int src_height,
+                                 int dst_width,
+                                 int dst_height,
+                                 int src_stride,
+                                 int dst_stride,
+                                 const uint16_t* src_ptr,
+                                 uint16_t* dst_ptr,
+                                 enum FilterMode filtering) {
+   int rows_done = 0;
+   void (*Row0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                uint16_t* dst_ptr, int dst_width);
+   void (*Row1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                uint16_t* dst_ptr, int dst_width);
+   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+   (void)src_width;
+   (void)src_height;
+   assert(dst_width % 3 == 0);
+
+   Row0 = filtering ? ScaleRowDown34_0_Box_16_C : ScaleRowDown34_16_C;
+   Row1 = filtering ? ScaleRowDown34_1_Box_16_C : ScaleRowDown34_16_C;
+ #if defined(HAS_SCALEROWDOWN34_16_NEON)
+   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+     Row0 = filtering ? ScaleRowDown34_0_Box_16_NEON : ScaleRowDown34_16_NEON;
+     Row1 = filtering ? ScaleRowDown34_1_Box_16_NEON : ScaleRowDown34_16_NEON;
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+     Row0 = filtering ? ScaleRowDown34_0_Box_16_SSSE3 : ScaleRowDown34_16_SSSE3;
+     Row1 = filtering ? ScaleRowDown34_1_Box_16_SSSE3 : ScaleRowDown34_16_SSSE3;
+   }
+ #endif
+
+   // Full groups: 4 source rows in, 3 destination rows out.
+   while (rows_done < dst_height - 2) {
+     Row0(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride;
+     dst_ptr += dst_stride;
+     Row1(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride;
+     dst_ptr += dst_stride;
+     Row0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 2;
+     dst_ptr += dst_stride;
+     rows_done += 3;
+   }
+
+   // Remainder 1 or 2 rows with last row vertically unfiltered
+   switch (dst_height % 3) {
+     case 2:
+       Row0(src_ptr, filter_stride, dst_ptr, dst_width);
+       src_ptr += src_stride;
+       dst_ptr += dst_stride;
+       Row1(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     case 1:
+       Row0(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     default:
+       break;
+   }
+ }
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+ // Uses box filters arranged like this
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+ // Scales a plane to 3/8 of its size. Each pass of the main loop consumes 8
+ // source rows (3 + 3 + 2) and emits 3 destination rows: the "3" kernel is
+ // used for the first two and the "2" kernel for the last. Kernels start as
+ // the C versions and are overridden by every SIMD variant that is compiled
+ // in and reported by TestCpuFlag.
+ static void ScalePlaneDown38(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              enum FilterMode filtering) {
+   int rows_done = 0;
+   void (*Row3)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr,
+                int dst_width);
+   void (*Row2)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr,
+                int dst_width);
+   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+   assert(dst_width % 3 == 0);
+   (void)src_width;
+   (void)src_height;
+
+   Row3 = filtering ? ScaleRowDown38_3_Box_C : ScaleRowDown38_C;
+   Row2 = filtering ? ScaleRowDown38_2_Box_C : ScaleRowDown38_C;
+
+ #if defined(HAS_SCALEROWDOWN38_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     const int exact = (dst_width % 12 == 0);
+     if (!filtering) {
+       Row3 = Row2 = exact ? ScaleRowDown38_NEON : ScaleRowDown38_Any_NEON;
+     } else {
+       Row3 = exact ? ScaleRowDown38_3_Box_NEON : ScaleRowDown38_3_Box_Any_NEON;
+       Row2 = exact ? ScaleRowDown38_2_Box_NEON : ScaleRowDown38_2_Box_Any_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN38_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     // The unfiltered kernel needs a multiple of 12, the box kernels a
+     // multiple of 6; otherwise the "Any" wrappers are used.
+     if (!filtering) {
+       Row3 = Row2 =
+           (dst_width % 12 == 0) ? ScaleRowDown38_SSSE3 : ScaleRowDown38_Any_SSSE3;
+     } else if (dst_width % 6 == 0) {
+       Row3 = ScaleRowDown38_3_Box_SSSE3;
+       Row2 = ScaleRowDown38_2_Box_SSSE3;
+     } else {
+       Row3 = ScaleRowDown38_3_Box_Any_SSSE3;
+       Row2 = ScaleRowDown38_2_Box_Any_SSSE3;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN38_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     const int exact = (dst_width % 12 == 0);
+     if (!filtering) {
+       Row3 = Row2 = exact ? ScaleRowDown38_MSA : ScaleRowDown38_Any_MSA;
+     } else {
+       Row3 = exact ? ScaleRowDown38_3_Box_MSA : ScaleRowDown38_3_Box_Any_MSA;
+       Row2 = exact ? ScaleRowDown38_2_Box_MSA : ScaleRowDown38_2_Box_Any_MSA;
+     }
+   }
+ #endif
+
+   // Full groups: 8 source rows in, 3 destination rows out.
+   while (rows_done < dst_height - 2) {
+     Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 3;
+     dst_ptr += dst_stride;
+     Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 3;
+     dst_ptr += dst_stride;
+     Row2(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 2;
+     dst_ptr += dst_stride;
+     rows_done += 3;
+   }
+
+   // Remainder 1 or 2 rows with last row vertically unfiltered
+   switch (dst_height % 3) {
+     case 2:
+       Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+       src_ptr += src_stride * 3;
+       dst_ptr += dst_stride;
+       Row3(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     case 1:
+       Row3(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     default:
+       break;
+   }
+ }
+
+ // 16-bit sample variant of the 3/8 plane scaler. Each pass of the main loop
+ // consumes 8 source rows (3 + 3 + 2) and emits 3 destination rows. The NEON
+ // kernels are used when dst_width is a multiple of 12, the SSSE3 ones when
+ // it is a multiple of 24; otherwise the C kernels stay selected.
+ static void ScalePlaneDown38_16(int src_width,
+                                 int src_height,
+                                 int dst_width,
+                                 int dst_height,
+                                 int src_stride,
+                                 int dst_stride,
+                                 const uint16_t* src_ptr,
+                                 uint16_t* dst_ptr,
+                                 enum FilterMode filtering) {
+   int rows_done = 0;
+   void (*Row3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                uint16_t* dst_ptr, int dst_width);
+   void (*Row2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                uint16_t* dst_ptr, int dst_width);
+   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+   (void)src_width;
+   (void)src_height;
+   assert(dst_width % 3 == 0);
+
+   Row3 = filtering ? ScaleRowDown38_3_Box_16_C : ScaleRowDown38_16_C;
+   Row2 = filtering ? ScaleRowDown38_2_Box_16_C : ScaleRowDown38_16_C;
+ #if defined(HAS_SCALEROWDOWN38_16_NEON)
+   if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+     Row3 = filtering ? ScaleRowDown38_3_Box_16_NEON : ScaleRowDown38_16_NEON;
+     Row2 = filtering ? ScaleRowDown38_2_Box_16_NEON : ScaleRowDown38_16_NEON;
+   }
+ #endif
+ #if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+     Row3 = filtering ? ScaleRowDown38_3_Box_16_SSSE3 : ScaleRowDown38_16_SSSE3;
+     Row2 = filtering ? ScaleRowDown38_2_Box_16_SSSE3 : ScaleRowDown38_16_SSSE3;
+   }
+ #endif
+
+   // Full groups: 8 source rows in, 3 destination rows out.
+   while (rows_done < dst_height - 2) {
+     Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 3;
+     dst_ptr += dst_stride;
+     Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 3;
+     dst_ptr += dst_stride;
+     Row2(src_ptr, filter_stride, dst_ptr, dst_width);
+     src_ptr += src_stride * 2;
+     dst_ptr += dst_stride;
+     rows_done += 3;
+   }
+
+   // Remainder 1 or 2 rows with last row vertically unfiltered
+   switch (dst_height % 3) {
+     case 2:
+       Row3(src_ptr, filter_stride, dst_ptr, dst_width);
+       src_ptr += src_stride * 3;
+       dst_ptr += dst_stride;
+       Row3(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     case 1:
+       Row3(src_ptr, 0, dst_ptr, dst_width);
+       break;
+     default:
+       break;
+   }
+ }
+
+ // Clamps x to a minimum of 1 (x is evaluated twice). Used below to keep box
+ // widths positive before they are used as divisors or loop counts.
+ #define MIN1(x) ((x) < 1 ? 1 : (x))
+
+ // Returns the sum of the first iboxwidth 16-bit values at src_ptr,
+ // accumulated in 32 bits. iboxwidth is asserted to be positive.
+ static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
+   uint32_t total = 0u;
+   int remaining = iboxwidth;
+   assert(iboxwidth > 0);
+   while (remaining > 0) {
+     total += *src_ptr++;
+     --remaining;
+   }
+   return total;
+ }
+
+ // Returns the sum of the first iboxwidth 32-bit values at src_ptr,
+ // accumulated in 32 bits (wrapping on overflow). iboxwidth is asserted to be
+ // positive.
+ static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
+   uint32_t total = 0u;
+   int remaining = iboxwidth;
+   assert(iboxwidth > 0);
+   while (remaining > 0) {
+     total += *src_ptr++;
+     --remaining;
+   }
+   return total;
+ }
+
+ // Horizontal box averaging for a fractional step. x and dx are 16.16 fixed
+ // point; each output pixel sums the source columns between the integer
+ // parts of consecutive x positions, so the box width is either the integer
+ // part of dx or one more. A reciprocal for each of those two widths
+ // (combined with boxheight) is precomputed and used as a 16-bit fixed point
+ // multiplier in place of a per-pixel division.
+ static void ScaleAddCols2_C(int dst_width,
+                             int boxheight,
+                             int x,
+                             int dx,
+                             const uint16_t* src_ptr,
+                             uint8_t* dst_ptr) {
+   const int minboxwidth = dx >> 16;
+   int scaletbl[2];
+   int i;
+   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+   for (i = 0; i < dst_width; ++i) {
+     const int ix = x >> 16;
+     int boxwidth;
+     x += dx;
+     boxwidth = MIN1((x >> 16) - ix);
+     dst_ptr[i] =
+         SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+         16;
+   }
+ }
+
+ // 16-bit output variant of the fractional-step horizontal box average. x
+ // and dx are 16.16 fixed point; each output sums the 32-bit column totals
+ // between the integer parts of consecutive x positions and multiplies by a
+ // precomputed 16-bit fixed point reciprocal of (box width * boxheight).
+ static void ScaleAddCols2_16_C(int dst_width,
+                                int boxheight,
+                                int x,
+                                int dx,
+                                const uint32_t* src_ptr,
+                                uint16_t* dst_ptr) {
+   const int minboxwidth = dx >> 16;
+   int scaletbl[2];
+   int i;
+   scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+   scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+   for (i = 0; i < dst_width; ++i) {
+     const int ix = x >> 16;
+     int boxwidth;
+     x += dx;
+     boxwidth = MIN1((x >> 16) - ix);
+     dst_ptr[i] = SumPixels_16(boxwidth, src_ptr + ix) *
+                      scaletbl[boxwidth - minboxwidth] >>
+                  16;
+   }
+ }
+
+ // Horizontal pass for the case where no horizontal averaging is needed:
+ // each output pixel is a single column total divided by boxheight, done as
+ // a multiply by 65536 / boxheight followed by a 16-bit right shift. x is
+ // 16.16 fixed point and only its integer part is used as the start column;
+ // dx is ignored.
+ static void ScaleAddCols0_C(int dst_width,
+                             int boxheight,
+                             int x,
+                             int dx,
+                             const uint16_t* src_ptr,
+                             uint8_t* dst_ptr) {
+   const int scaleval = 65536 / boxheight;
+   const uint16_t* column = src_ptr + (x >> 16);
+   int remaining = dst_width;
+   (void)dx;
+   while (remaining > 0) {
+     *dst_ptr++ = *column++ * scaleval >> 16;
+     --remaining;
+   }
+ }
+
+ // Horizontal box averaging for a whole-number step: every output pixel sums
+ // the same number of columns (the integer part of dx, at least 1) and is
+ // scaled by a 16-bit fixed point reciprocal of (box width * boxheight). x
+ // is 16.16 fixed point; its integer part selects the first column.
+ static void ScaleAddCols1_C(int dst_width,
+                             int boxheight,
+                             int x,
+                             int dx,
+                             const uint16_t* src_ptr,
+                             uint8_t* dst_ptr) {
+   const int boxwidth = MIN1(dx >> 16);
+   const int scaleval = 65536 / (boxwidth * boxheight);
+   int column = x >> 16;
+   int i;
+   for (i = 0; i < dst_width; ++i) {
+     dst_ptr[i] = SumPixels(boxwidth, src_ptr + column) * scaleval >> 16;
+     column += boxwidth;
+   }
+ }
+
+ // 16-bit output variant of the whole-number-step horizontal box average:
+ // every output sums the same number of 32-bit column totals (the integer
+ // part of dx, at least 1) and is scaled by a 16-bit fixed point reciprocal
+ // of (box width * boxheight).
+ //
+ // x arrives as 16.16 fixed point, like in the other ScaleAddCols helpers,
+ // and must be reduced to its integer part before being used as a column
+ // index; without that step any nonzero start position would index far
+ // outside the row buffer.
+ static void ScaleAddCols1_16_C(int dst_width,
+                                int boxheight,
+                                int x,
+                                int dx,
+                                const uint32_t* src_ptr,
+                                uint16_t* dst_ptr) {
+   int boxwidth = MIN1(dx >> 16);
+   int scaleval = 65536 / (boxwidth * boxheight);
+   int i;
+   x >>= 16;  // Fixed point -> integer column, matching ScaleAddCols1_C.
+   for (i = 0; i < dst_width; ++i) {
+     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+     x += boxwidth;
+   }
+ }
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixels with simple
+// averaging.
+//
+// For every destination row the uint16_t row buffer is zeroed, ScaleAddRow is
+// called once per source row in the box, and ScaleAddCols then produces the
+// 8-bit output row from that buffer.
+static void ScalePlaneBox(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // Upper bound for y (16.16): one past the last source row.
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
+ // From here on src_width is used as a size, so drop its sign.
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint16_t.
+ align_buffer_64(row16, src_width * 2);
+ // Column filter selection:
+ //   dx has a fractional part -> ScaleAddCols2_C
+ //   dx is a whole number other than 1.0 -> ScaleAddCols1_C
+ //   dx is exactly 1.0 (0x10000) -> ScaleAddCols0_C
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint16_t* src_ptr, uint8_t* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_C
+ : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+ // Row accumulator: starts as the C version; each enabled CPU block below
+ // may override it, using the non-_Any_ kernel only when src_width is a
+ // multiple of the alignment tested there.
+ void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
+ int src_width) = ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleAddRow = ScaleAddRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleAddRow = ScaleAddRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ ScaleAddRow = ScaleAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleAddRow = ScaleAddRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleAddRow = ScaleAddRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 8)) {
+ ScaleAddRow = ScaleAddRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleAddRow = ScaleAddRow_Any_MSA;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_MSA;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ // First source row of this box.
+ int iy = y >> 16;
+ const uint8_t* src = src_ptr + iy * src_stride;
+ y += dy;
+ // Clamp so the box never extends past the last source row.
+ if (y > max_y) {
+ y = max_y;
+ }
+ // Rows in this box; MIN1 keeps it at least 1.
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row16, 0, src_width * 2);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint16_t*)(row16), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row16);
+ }
+}
+
+// 16-bit variant of ScalePlaneBox: uint16_t pixels, uint32_t row buffer.
+// For every destination row the row buffer is zeroed, ScaleAddRow is called
+// once per source row in the box, and ScaleAddCols then produces the output
+// row from that buffer.
+static void ScalePlaneBox_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // Upper bound for y (16.16): one past the last source row.
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+ &dx, &dy);
+ // From here on src_width is used as a size, so drop its sign.
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint32_t.
+ align_buffer_64(row32, src_width * 4);
+ // Fractional dx -> ScaleAddCols2_16_C, whole-number dx -> ScaleAddCols1_16_C.
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint32_t* src_ptr, uint16_t* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+ // Row accumulator: C version unless an enabled CPU block below overrides
+ // it; those are used only when src_width meets the alignment tested.
+ void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
+ int src_width) = ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_16_SSE2;
+ }
+#endif
+
+#if defined(HAS_SCALEADDROW_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
+ ScaleAddRow = ScaleAddRow_16_MMI;
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ // First source row of this box.
+ int iy = y >> 16;
+ const uint16_t* src = src_ptr + iy * src_stride;
+ y += dy;
+ // Clamp so the box never extends past the last source row.
+ if (y > max_y) {
+ y = max_y;
+ }
+ // Rows in this box; MIN1 keeps it at least 1.
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row32, 0, src_width * 4);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint32_t*)(row32), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row32);
+ }
+}
+
+// Scale plane down with bilinear interpolation.
+//
+// For each destination row, y (16.16 fixed point) selects a source row.
+// With kFilterLinear the column scaler reads that source row directly;
+// otherwise InterpolateRow first writes a vertically filtered row into a
+// temporary buffer using the fractional part of y, and the column scaler
+// reads from the buffer.
+//
+// src_width is handed to ScaleSlope with its sign intact, but every use of it
+// as a size goes through Abs(): the row buffer and the 64-bit column scaler
+// selection must not see a negative value.
+void ScalePlaneBilinearDown(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer. src_width is only normalized further down, so take
+  // its magnitude here; a negative value would be used as the allocation
+  // size.
+  align_buffer_64(row, Abs(src_width));
+
+  // Largest y (16.16) allowed: the last source row.
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+
+  // InterpolateRow runs over src_width pixels, so the non-_Any_ kernels are
+  // chosen on src_width alignment.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+
+  // The SIMD column scalers are only selected for src_width < 32768; wider
+  // sources keep the 64-bit C version chosen above.
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint8_t* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      // Horizontal filtering only.
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// 16-bit variant of ScalePlaneBilinearDown (uint16_t pixels).
+//
+// For each destination row, y (16.16 fixed point) selects a source row.
+// With kFilterLinear the column scaler reads that source row directly;
+// otherwise InterpolateRow first writes a vertically filtered row into a
+// temporary buffer using the fractional part of y, and the column scaler
+// reads from the buffer.
+//
+// src_width is handed to ScaleSlope with its sign intact, but every use of it
+// as a size goes through Abs(): the row buffer and the 64-bit column scaler
+// selection must not see a negative value.
+void ScalePlaneBilinearDown_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer (2 bytes per pixel). src_width is only normalized
+  // further down, so take its magnitude here; a negative value would be used
+  // as the allocation size.
+  align_buffer_64(row, Abs(src_width) * 2);
+
+  // Largest y (16.16) allowed: the last source row.
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
+      (Abs(src_width) >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+
+  // InterpolateRow runs over src_width pixels, so the non-_Any_ kernels are
+  // chosen on src_width alignment.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+
+  // The SIMD column scaler is only selected for src_width < 32768; wider
+  // sources keep the 64-bit C version chosen above.
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint16_t* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      // Horizontal filtering only.
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y.
+      int yf = (y >> 8) & 255;
+      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+//
+// Source rows are first scaled horizontally into two row buffers of
+// kRowSize bytes each; InterpolateRow then produces every destination row
+// from those buffers. When the integer part of y moves to a new source row,
+// one buffer is refilled and the roles of the two buffers are swapped by
+// negating rowstride.
+void ScalePlaneBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // Largest y (16.16) allowed: the last source row.
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ // Column scaler: filtered when filtering is non-zero, else ScaleCols_C.
+ void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+ // InterpolateRow runs over dst_width pixels here (the row buffers are
+ // already horizontally scaled), so alignment is tested on dst_width.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+
+ // Wide sources use the 64-bit column scaler; the SIMD versions below are
+ // only selected for src_width < 32768.
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_MSA;
+ }
+ }
+#endif
+ // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint8_t* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ // kRowSize is dst_width rounded up to a multiple of 32.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // Prime both buffers. With a single source row the same row is scaled
+ // into both.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ // Clamp to the last source row and recompute the source pointer.
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ // Refill the buffer at rowptr with the next source row, then swap
+ // buffer roles: rowptr moves to the other buffer and rowstride
+ // changes sign so rowptr + rowstride is the one just written.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ // Stride 0 and fraction 0: no vertical blend is requested.
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ // Top 8 bits of the fractional part of y.
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// 16-bit variant of ScalePlaneBilinearUp (uint16_t pixels).
+//
+// Source rows are first scaled horizontally into two row buffers of
+// kRowSize pixels each; InterpolateRow then produces every destination row
+// from those buffers. When the integer part of y moves to a new source row,
+// one buffer is refilled and the roles of the two buffers are swapped by
+// negating rowstride.
+void ScalePlaneBilinearUp_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // Largest y (16.16) allowed: the last source row.
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_16_C;
+ // Column scaler: filtered when filtering is non-zero, else ScaleCols_16_C.
+ void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+
+ // InterpolateRow runs over dst_width pixels here (the row buffers are
+ // already horizontally scaled), so alignment is tested on dst_width.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+
+ // Wide sources use the 64-bit column scaler; the SIMD version below is
+ // only selected for src_width < 32768.
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_16_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint16_t* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ // kRowSize is dst_width rounded up to a multiple of 32 pixels; the
+ // allocation is kRowSize * 4 bytes, i.e. two rows of 2-byte pixels.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 4);
+
+ uint16_t* rowptr = (uint16_t*)row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // Prime both buffers. With a single source row the same row is scaled
+ // into both.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ // Clamp to the last source row and recompute the source pointer.
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ // Refill the buffer at rowptr with the next source row, then swap
+ // buffer roles: rowptr moves to the other buffer and rowstride
+ // changes sign so rowptr + rowstride is the one just written.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ // Stride 0 and fraction 0: no vertical blend is requested.
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ // Top 8 bits of the fractional part of y.
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale a plane to or from any dimensions without interpolation.
+// Positions are 16.16 fixed point: the upper 16 bits of x/dx (and y/dy) are
+// the integer source position, the lower 16 bits the fraction. Each
+// destination row is produced by one column-scaler call on the source row
+// selected by the integer part of y.
+
+static void ScalePlaneSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr) {
+  // Start position and per-pixel step, filled in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  int row;
+  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Exact 2x horizontal upscale has a dedicated column routine.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_SSE2;
+    }
+#endif
+#if defined(HAS_SCALECOLS_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_MMI;
+    }
+#endif
+  }
+
+  for (row = 0; row < dst_height; ++row) {
+    const uint8_t* src_row = src_ptr + (y >> 16) * src_stride;
+    ScaleCols(dst_ptr, src_row, dst_width, x, dx);
+    y += dy;
+    dst_ptr += dst_stride;
+  }
+}
+
+// 16-bit variant of ScalePlaneSimple: scales a uint16_t plane without
+// interpolation. Positions are 16.16 fixed point; each destination row is
+// produced by one column-scaler call on the source row selected by the
+// integer part of y.
+static void ScalePlaneSimple_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr) {
+  // Start position and per-pixel step, filled in by ScaleSlope.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  int row;
+  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Exact 2x horizontal upscale has a dedicated column routine.
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+      ScaleCols = ScaleColsUp2_16_MMI;
+    }
+#endif
+  }
+
+  for (row = 0; row < dst_height; ++row) {
+    const uint16_t* src_row = src_ptr + (y >> 16) * src_stride;
+    ScaleCols(dst_ptr, src_row, dst_width, x, dx);
+    y += dy;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale a plane.
+// This function dispatches to a specialized scaler based on scale factor.
+//
+// Checks are made in order and the first match wins: straight copy,
+// vertical-only scaling, the fixed-ratio downscalers (3/4, 1/2, 3/8, 1/4),
+// box filter, bilinear up, bilinear down, and finally the unfiltered
+// ScalePlaneSimple.
+
+LIBYUV_API
+void ScalePlane(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ // Start at the last row and walk upwards with a negated stride.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width && filtering != kFilterBox) {
+ // Vertical step in 16.16 fixed point.
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ // The 1/4 path is only taken for box or no filtering.
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ // General box filter: only when shrinking vertically by more than 2x.
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+ return;
+ }
+ // Any remaining filtered case: choose up or down by the vertical direction.
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // No filtering requested (or it was reduced away): point sampling.
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+}
+
+// Scale a 16-bit plane.
+// Dispatches to the _16 scalers. Checks are made in order and the first
+// match wins: straight copy, vertical-only scaling, the fixed-ratio
+// downscalers (3/4, 1/2, 3/8, 1/4), box filter, bilinear up, bilinear down,
+// and finally the unfiltered ScalePlaneSimple_16.
+LIBYUV_API
+void ScalePlane_16(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ // Start at the last row and walk upwards with a negated stride.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width && filtering != kFilterBox) {
+ // Vertical step in 16.16 fixed point.
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
+ // optimized, 3/8
+ ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // The 1/4 path is only taken for box or no filtering.
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ // General box filter: only when shrinking vertically by more than 2x.
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+ return;
+ }
+ // Any remaining filtered case: choose up or down by the vertical direction.
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // No filtering requested (or it was reduced away): point sampling.
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+ dst_stride, src, dst);
+}
+
+// Scale an I420 image.
+// The Y, U and V planes are each scaled by a separate ScalePlane call; the
+// U and V planes use dimensions computed with SUBSAMPLE(dim, 1, 1).
+// Returns 0 on success, or -1 when an argument is rejected.
+
+LIBYUV_API
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  int chroma_src_width;
+  int chroma_src_height;
+  int chroma_dst_width;
+  int chroma_dst_height;
+
+  // Every plane pointer must be present.
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Source must be non-empty and at most 32768 in each dimension;
+  // destination must be strictly positive.
+  if (src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  chroma_src_width = SUBSAMPLE(src_width, 1, 1);
+  chroma_src_height = SUBSAMPLE(src_height, 1, 1);
+  chroma_dst_width = SUBSAMPLE(dst_width, 1, 1);
+  chroma_dst_height = SUBSAMPLE(dst_height, 1, 1);
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, chroma_src_width, chroma_src_height, dst_u,
+             dst_stride_u, chroma_dst_width, chroma_dst_height, filtering);
+  ScalePlane(src_v, src_stride_v, chroma_src_width, chroma_src_height, dst_v,
+             dst_stride_v, chroma_dst_width, chroma_dst_height, filtering);
+  return 0;
+}
+
+// Scale a 16-bit I420 image.
+// The Y, U and V planes are each scaled by a separate ScalePlane_16 call; the
+// U and V planes use dimensions computed with SUBSAMPLE(dim, 1, 1).
+// Returns 0 on success, or -1 when an argument is rejected.
+LIBYUV_API
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering) {
+  int chroma_src_width;
+  int chroma_src_height;
+  int chroma_dst_width;
+  int chroma_dst_height;
+
+  // Every plane pointer must be present.
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Source must be non-empty and at most 32768 in each dimension;
+  // destination must be strictly positive.
+  if (src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  chroma_src_width = SUBSAMPLE(src_width, 1, 1);
+  chroma_src_height = SUBSAMPLE(src_height, 1, 1);
+  chroma_dst_width = SUBSAMPLE(dst_width, 1, 1);
+  chroma_dst_height = SUBSAMPLE(dst_height, 1, 1);
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, chroma_src_width, chroma_src_height, dst_u,
+                dst_stride_u, chroma_dst_width, chroma_dst_height, filtering);
+  ScalePlane_16(src_v, src_stride_v, chroma_src_width, chroma_src_height, dst_v,
+                dst_stride_v, chroma_dst_width, chroma_dst_height, filtering);
+  return 0;
+}
+
+// Scale an I444 image.
+// The Y, U and V planes are each scaled by a separate ScalePlane call, all
+// three with the same source and destination dimensions.
+// Returns 0 on success, or -1 when an argument is rejected.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  // Every plane pointer must be present.
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Source must be non-empty and at most 32768 in each dimension;
+  // destination must be strictly positive.
+  if (src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+             dst_width, dst_height, filtering);
+  return 0;
+}
+
+// Scale a 16-bit I444 image.
+// The Y, U and V planes are each scaled by a separate ScalePlane_16 call, all
+// three with the same source and destination dimensions.
+// Returns 0 on success, or -1 when an argument is rejected.
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
+                 enum FilterMode filtering) {
+  // Every plane pointer must be present.
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v) {
+    return -1;
+  }
+  // Source must be non-empty and at most 32768 in each dimension;
+  // destination must be strictly positive.
+  if (src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+                dst_width, dst_height, filtering);
+  return 0;
+}
+
+// Scale an NV12 image.
+// The Y plane is scaled with ScalePlane and the UV plane with UVScale, the
+// latter using dimensions computed with SUBSAMPLE(dim, 1, 1).
+// Returns 0 on success, or -1 when an argument is rejected.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_uv,
+              int src_stride_uv,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_uv,
+              int dst_stride_uv,
+              int dst_width,
+              int dst_height,
+              enum FilterMode filtering) {
+  int chroma_src_width;
+  int chroma_src_height;
+  int chroma_dst_width;
+  int chroma_dst_height;
+
+  // Both planes must be present on each side.
+  if (!src_y || !src_uv || !dst_y || !dst_uv) {
+    return -1;
+  }
+  // Source must be non-empty and at most 32768 in each dimension;
+  // destination must be strictly positive.
+  if (src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  chroma_src_width = SUBSAMPLE(src_width, 1, 1);
+  chroma_src_height = SUBSAMPLE(src_height, 1, 1);
+  chroma_dst_width = SUBSAMPLE(dst_width, 1, 1);
+  chroma_dst_height = SUBSAMPLE(dst_height, 1, 1);
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  UVScale(src_uv, src_stride_uv, chroma_src_width, chroma_src_height, dst_uv,
+          dst_stride_uv, chroma_dst_width, chroma_dst_height, filtering);
+  return 0;
+}
+
+// Deprecated api
+// Forwards to I420Scale with the arguments reordered into its
+// pointer/stride pairing. A non-zero interpolate selects kFilterBox,
+// zero selects kFilterNone. Returns whatever I420Scale returns.
+LIBYUV_API
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
+          LIBYUV_BOOL interpolate) {
+  if (interpolate) {
+    return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                     src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                     dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                     dst_height, kFilterBox);
+  }
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, kFilterNone);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_any.cc b/third_party/aom/third_party/libyuv/source/scale_any.cc
new file mode 100644
index 0000000000..c93d70c5fc
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_any.cc
@@ -0,0 +1,615 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h> // For memset/memcpy
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Fixed scale down.
+// Defines NAMEANY, a row function that accepts any dst_width. It computes
+// r = dst_width % (MASK + 1) and n = dst_width - r, passes the first n output
+// pixels to SCALEROWDOWN_SIMD (only when n > 0) and the remaining r output
+// pixels to SCALEROWDOWN_C. SCALEROWDOWN_C is called even when r is 0.
+// The C call starts n * FACTOR * BPP bytes into the source row and n * BPP
+// bytes into the destination row. FACTOR is pasted textually, so a FACTOR
+// written as `4 / 3` expands to `n * 4 / 3`, not `n * 1`.
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
+
+// Fixed scale down for odd source width. Used by I420Blend subsampling.
+// Since dst_width is (width + 1) / 2, this function scales one less pixel
+// and copies the last pixel.
+// Same split as SDANY, but n and r are computed from dst_width - 1, and
+// SCALEROWDOWN_C is given r + 1 pixels so that it also produces the final
+// output pixel. The SIMD function is therefore never asked for the last
+// pixel of the row.
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
+ int n = (dst_width - 1) - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r + 1); \
+ }
+
+// 1/2 scale-down wrappers for SSSE3 (FACTOR 2, 1 byte per pixel).
+// MASK 15: the SIMD functions receive multiples of 16 output pixels and the
+// C functions the remainder. The _Odd wrapper is built with SDODD and uses
+// ScaleRowDown2Box_Odd_C for the tail.
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+ ScaleRowDown2Linear_SSSE3,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+ ScaleRowDown2Box_SSSE3,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+// UV (2 bytes per pixel) 1/2 box-filter wrappers for SSSE3 and AVX2.
+// SDANY splits the row with `dst_width % (MASK + 1)`, so MASK must be one
+// less than the number of output pixels the SIMD function consumes per step.
+// MASK was previously 4 and 8, which split rows in multiples of 5 and 9
+// pixels and handed the SIMD functions widths that are not a multiple of
+// their step. It is now 3 and 7.
+// NOTE(review): assumes the SSSE3 function produces 4 and the AVX2 function
+// 8 UV pixels per iteration - confirm against the row implementations.
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+      ScaleUVRowDown2Box_SSSE3,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      3)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+      ScaleUVRowDown2Box_AVX2,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      7)
+#endif
+// 1/2 scale-down wrappers for AVX2 (MASK 31: SIMD handles multiples of 32
+// output pixels) and NEON (MASK 15: multiples of 16). The _Odd wrappers are
+// built with SDODD.
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+ ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+ ScaleRowDown2Box_AVX2,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+ ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+ ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 15)
+#endif
+// UV (2 bytes per pixel) 1/2 box-filter wrapper for NEON.
+// SDANY splits the row with `dst_width % (MASK + 1)`, so MASK must be one
+// less than the SIMD step. MASK was previously 8, which split rows in
+// multiples of 9 pixels; it is now 7.
+// NOTE(review): assumes the NEON function produces 8 UV pixels per
+// iteration - confirm against the row implementation.
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+      ScaleUVRowDown2Box_NEON,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      7)
+#endif
+
+// 1/2 scale-down wrappers for MSA (MASK 31) and MMI (MASK 7).
+// Only MMI gets an _Odd (SDODD) wrapper in this group.
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+ ScaleRowDown2Linear_MSA,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+ ScaleRowDown2Box_MSA,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MMI
+SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
+SDANY(ScaleRowDown2Linear_Any_MMI,
+ ScaleRowDown2Linear_MMI,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 7)
+SDANY(ScaleRowDown2Box_Any_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 7)
+SDODD(ScaleRowDown2Box_Odd_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 7)
+#endif
+// 1/4 scale-down wrappers (FACTOR 4, 1 byte per pixel).
+// MASK is 7 for SSSE3, NEON and MMI, and 15 for AVX2 and MSA.
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+ ScaleRowDown4Box_SSSE3,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+ ScaleRowDown4Box_AVX2,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+ ScaleRowDown4Box_NEON,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+ ScaleRowDown4Box_MSA,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MMI
+SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_MMI,
+ ScaleRowDown4Box_MMI,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
+// 3/4 scale-down wrappers.
+// FACTOR is deliberately the unparenthesized expression `4 / 3`: inside SDANY
+// it expands to `n * 4 / 3`, giving the source offset for n output pixels.
+// Parenthesizing it would evaluate 4 / 3 first and truncate the factor to 1.
+// MASK is 23 (steps of 24 output pixels) except for MSA, which uses 47.
+// MMI only provides the non-box variant here.
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3,
+ ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+ ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+ ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON,
+ ScaleRowDown34_NEON,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+ ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+ ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+ ScaleRowDown34_MSA,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+ ScaleRowDown34_0_Box_MSA,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+ ScaleRowDown34_1_Box_MSA,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MMI
+SDANY(ScaleRowDown34_Any_MMI,
+ ScaleRowDown34_MMI,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 23)
+#endif
+// 3/8 scale-down wrappers.
+// As with the 3/4 wrappers, FACTOR is the unparenthesized `8 / 3` so that
+// SDANY computes `n * 8 / 3` for the source offset.
+// MASK is 11 (steps of 12 output pixels), except for the two SSSE3 box
+// variants, which use 5 (steps of 6).
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3,
+ ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+ ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+ ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON,
+ ScaleRowDown38_NEON,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+ ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+ ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+ ScaleRowDown38_MSA,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+ ScaleRowDown38_3_Box_MSA,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+ ScaleRowDown38_2_Box_MSA,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
+
+// ARGB (4 bytes per pixel) 1/2 scale-down wrappers.
+// MASK is 3 for SSE2 and MSA, 7 for NEON and 1 for MMI; these match the
+// IS_ALIGNED(dst_width, ...) checks of 4, 8 and 2 used by ScaleARGBDown2 in
+// scale_argb.cc when it selects the non-Any variants.
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+ ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+ ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+ ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON,
+ ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+ ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+ ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+ ScaleARGBRowDown2_MSA,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+ ScaleARGBRowDown2Linear_MSA,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+ ScaleARGBRowDown2Box_MSA,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MMI
+SDANY(ScaleARGBRowDown2_Any_MMI,
+ ScaleARGBRowDown2_MMI,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Linear_Any_MMI,
+ ScaleARGBRowDown2Linear_MMI,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Box_Any_MMI,
+ ScaleARGBRowDown2Box_MMI,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 1)
+#endif
+// No SDANY wrappers are defined past this point.
+#undef SDANY
+
+// Scale down by even scale factor.
+// Defines NAMEANY for row functions that take a per-call source step
+// (src_stepx, in pixels). Unlike SDANY, the split uses bit masking
+// (r = dst_width & MASK, n = dst_width & ~MASK), so MASK + 1 must be a power
+// of two. SCALEROWDOWN_SIMD handles the first n output pixels when n > 0;
+// SCALEROWDOWN_C handles the remaining r, starting n * src_stepx * BPP bytes
+// into the source and n * BPP bytes into the destination. SCALEROWDOWN_C is
+// called even when r is 0.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8_t* dst_ptr, int dst_width) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
+ }
+
+// ARGB (BPP 4) wrappers: MASK 3 for SSE2, NEON and MSA; MASK 1 for MMI.
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+ ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+ ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+ ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+ ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+ ScaleARGBRowDownEven_MSA,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+ ScaleARGBRowDownEvenBox_MSA,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
+SDAANY(ScaleARGBRowDownEven_Any_MMI,
+ ScaleARGBRowDownEven_MMI,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 1)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
+ ScaleARGBRowDownEvenBox_MMI,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 1)
+#endif
+// UV (BPP 2) wrapper, NEON only, MASK 3.
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
+#endif
+
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down. Using macro from row_any
+// Defines NAMEANY(src_ptr, dst_ptr, width) for any width. ANY_SIMD processes
+// the first n pixels (n = width & ~MASK) directly. The last r pixels
+// (r = width & MASK) are copied into aligned temporaries, run through
+// ANY_SIMD at a full step of MASK + 1 pixels, and the r valid results are
+// copied back.
+// SBPP and BPP are the source and destination bytes per pixel.
+// dst_ptr is a uint16_t*, so the remainder offset is applied to a byte
+// pointer: adding n * BPP to dst_ptr itself would advance by n * BPP
+// elements (twice the intended distance for BPP == 2).
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint16_t dst_temp[32]); \
+    SIMD_ALIGNED(uint8_t src_temp[32]); \
+    memset(dst_temp, 0, 32 * 2); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    uint8_t* dst_tail = (uint8_t*)dst_ptr + n * BPP; /* byte offset */ \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, n); \
+    } \
+    memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+    memcpy(dst_temp, dst_tail, r * BPP); \
+    ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+    memcpy(dst_tail, dst_temp, r * BPP); \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+// Undefine the macro this section actually defined (was `#undef SAANY`).
+#undef SAROW
+
+#else
+
+// Add rows box filter scale down.
+// Defines NAMEANY(src_ptr, dst_ptr, src_width) for any src_width: the leading
+// src_width & ~MASK pixels are passed to SCALEADDROW_SIMD (only when that
+// count is positive) and the trailing src_width & MASK pixels to
+// SCALEADDROW_C, which is called even when the tail is empty.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+    const int simd_width = src_width & ~MASK; \
+    const int tail_width = src_width & MASK; \
+    if (simd_width > 0) { \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, simd_width); \
+    } \
+    SCALEADDROW_C(src_ptr + simd_width, dst_ptr + simd_width, tail_width); \
+  }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
+#endif
+#undef SAANY
+
+#endif // SASIMDONLY
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+// Defines NAMEANY(dst_ptr, src_ptr, dst_width, x, dx) for any dst_width.
+// TERP_SIMD produces the first n = dst_width & ~MASK output pixels (when
+// n > 0). TERP_C produces the remaining r = dst_width & MASK pixels, writing
+// n * BPP bytes into the destination and starting at source position
+// x + n * dx. The source pointer itself is not advanced. TERP_C is called
+// even when r is 0.
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+// With MASK 0, r is always 0 and n is always dst_width, so the MMI function
+// handles the whole row and the C call receives a width of 0.
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_argb.cc b/third_party/aom/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 0000000000..451d4ec4d1
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,1091 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Absolute value of v: negative inputs are negated, all others are returned
+// unchanged.
+static __inline int Abs(int v) {
+  if (v < 0) {
+    return -v;
+  }
+  return v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+// Selects a ScaleARGBRowDown2 row function according to |filtering| (the
+// plain variant for kFilterNone, the Linear variant for kFilterLinear, the
+// Box variant otherwise), upgrades it to a SIMD variant when the CPU reports
+// support, and applies it to dst_height rows. The source advances by
+// (dy >> 16) rows per output row. x, y, dx and dy are 16.16 fixed point.
+// src_width and src_height are unused; dx is only checked by assert.
+static void ScaleARGBDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ // Byte distance between the source rows used for consecutive output rows.
+ // Computed before src_stride is possibly zeroed below.
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ uint8_t* dst_argb, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_C
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+ : ScaleARGBRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ // Every mode except kFilterBilinear starts one pixel (4 bytes) to the left
+ // of column x >> 16.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+ // CPU dispatch. Each block below overwrites the selection made by earlier
+ // ones. Within a block the _Any_ wrapper is the default and the direct SIMD
+ // function is used only when dst_width is a multiple of its step.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+ : ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_SSE2
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+ : ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+ : ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+ : ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
+ : ScaleARGBRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
+ : ScaleARGBRowDown2Box_MMI);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+ : ScaleARGBRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+ : ScaleARGBRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ // The row function is given a stride of 0 for linear filtering, presumably
+ // so that it does not reference a second source row.
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+// Implemented as two 1/2 box passes per output row: the source at src_argb
+// and at src_argb + 2 * src_stride is halved into two scratch rows of
+// dst_width * 2 pixels, and those two scratch rows are halved again into
+// dst_width output pixels.
+// src_width and src_height are unused; dx is only checked by assert.
+static void ScaleARGBDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ // kRowSize is the byte size of dst_width * 2 ARGB pixels rounded up to a
+ // multiple of 32; it is also used as the stride between the scratch rows.
+ const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ uint8_t* dst_argb, int dst_width) =
+ ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+ // Only SSE2 and NEON variants are considered here. The alignment test uses
+ // dst_width although the first two calls below pass dst_width * 2.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ // First pass: two intermediate rows at half the source width.
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);
+ // Second pass: combine the two intermediate rows into one output row.
+ ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image by an even
+// multiple of its original size.
+// The horizontal step (dx >> 16 source pixels per output pixel) is passed to
+// the row function as src_step; the source advances by (dy >> 16) rows per
+// output row. Any non-zero |filtering| selects the Box row function.
+// src_width and src_height are only checked by assert.
+static void ScaleARGBDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ // Computed before src_stride is possibly zeroed below.
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ // CPU dispatch. Each block below overwrites the selection made by earlier
+ // ones; the direct SIMD function is used only when dst_width is a multiple
+ // of its step, otherwise the _Any_ wrapper is used.
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+ : ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+ : ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
+ : ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+ : ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
+ }
+ }
+#endif
+
+ // The row function is given a stride of 0 for linear filtering, presumably
+ // so that it does not reference a second source row.
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// Scale ARGB down with bilinear interpolation.
+// For each output row, two adjacent source rows are blended vertically by
+// InterpolateRow into a scratch row (skipped for kFilterLinear, which reads
+// the source row directly), then ScaleARGBFilterCols resamples that row
+// horizontally into dst_argb. x/dx and y/dy are 16.16 fixed point.
+// Only the horizontal span of the source that the output actually touches
+// is interpolated: the source pointer and x are rebased to its left edge.
+static void ScaleARGBBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ // Source x of the last output pixel, in 64 bits to avoid overflow. xl and
+ // xr are the left-most and right-most source positions for either sign of
+ // dx.
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ // Note that clip_src_width is a byte count (pixels * 4), not a pixel count.
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ // Rebase the source pointer and x so that column xl becomes column 0.
+ src_argb += xl * 4;
+ x -= (int)(xl << 16);
+ // CPU dispatch for the vertical blend. Each block overwrites the selection
+ // made by earlier ones; alignment is tested on the byte width.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ // CPU dispatch for the horizontal resample. Only the SSSE3 path is gated on
+ // src_width < 32768; the NEON and MSA paths replace the 64-bit C function
+ // regardless of src_width.
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ {
+ // NOTE(review): clip_src_width is already in bytes, so this requests four
+ // times the bytes InterpolateRow writes below. Confirm nothing relies on
+ // the slack before shrinking it.
+ align_buffer_64(row, clip_src_width * 4);
+
+ // y is clamped to the last source row before every use.
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8_t* src = src_argb + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ // Top 8 bits of the fractional part of y.
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+// Source rows are first scaled horizontally to dst_width pixels by
+// ScaleARGBFilterCols into a two-row scratch buffer (the current source row
+// and the one below it). Each output row is then produced from those two
+// rows by InterpolateRow using the top 8 fractional bits of y. x/dx and y/dy
+// are 16.16 fixed point. For kFilterLinear and kFilterNone only the current
+// row is used (stride 0, fraction 0).
+// Row loads are kept within source rows [0, src_height - 1]: the initial
+// look-ahead row is not advanced when y already starts on the last row, and
+// the in-loop clamp also fires when y lands exactly on the last row.
+static void ScaleARGBBilinearUp(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8_t* src_argb,
+                                uint8_t* dst_argb,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
+                                enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  // Fixed-point y of the last source row.
+  const int max_y = (src_height - 1) << 16;
+  // CPU dispatch for the vertical blend. Each block overwrites the selection
+  // made by earlier ones.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+  if (TestCpuFlag(kCpuHasMMI)) {
+    InterpolateRow = InterpolateRow_Any_MMI;
+    if (IS_ALIGNED(dst_width, 2)) {
+      InterpolateRow = InterpolateRow_MMI;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
+  // Column scaler selection. Wide sources use the 64-bit C variants; the
+  // filtering and non-filtering SIMD variants are chosen separately below.
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+  if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+    if (IS_ALIGNED(dst_width, 1)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MMI;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
+  // Unfiltered exact 2x horizontal upscale has dedicated functions.
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  {
+    int yi = y >> 16;
+    const uint8_t* src = src_argb + yi * src_stride;
+
+    // Allocate 2 rows of ARGB.
+    const int kRowSize = (dst_width * 4 + 31) & ~31;
+    align_buffer_64(row, kRowSize * 2);
+
+    // rowptr is the scratch row for source row lasty; rowptr + rowstride is
+    // the row below it. The two halves swap roles by negating rowstride.
+    uint8_t* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+    // Step to the look-ahead row only if one exists. When y starts on the
+    // last source row the second scratch row duplicates the first instead of
+    // being read from beyond the image.
+    if (src_height > 1 && yi < src_height - 1) {
+      src += src_stride;
+    }
+    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        // At or beyond the last row there is no further look-ahead row, and
+        // src may already point past the image, so re-anchor it on the last
+        // row. Using >= (not >) covers y landing exactly on max_y, where the
+        // vertical fraction is 0 and the reloaded row is not blended in.
+        if (y >= max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_argb + yi * src_stride;
+        }
+        if (yi != lasty) {
+          // Overwrite the older scratch row with the new look-ahead row and
+          // swap the roles of the two scratch rows.
+          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+      }
+      dst_argb += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+#ifdef YUVSCALEUP
+ // Scale YUV to ARGB up with bilinear interpolation.
+ //
+ // Each source row that is needed is converted from planar YUV to ARGB with
+ // I422ToARGBRow, scaled horizontally into one of two cached rows, and the two
+ // cached rows are then blended vertically with InterpolateRow.
+ // x/dx and y/dy are 16.16 fixed point source positions and steps.
+ // The chroma row pointers advance once for every two luma rows (kYShift).
+ static void ScaleYUVToARGBBilinearUp(int src_width,
+                                      int src_height,
+                                      int dst_width,
+                                      int dst_height,
+                                      int src_stride_y,
+                                      int src_stride_u,
+                                      int src_stride_v,
+                                      int dst_stride_argb,
+                                      const uint8_t* src_y,
+                                      const uint8_t* src_u,
+                                      const uint8_t* src_v,
+                                      uint8_t* dst_argb,
+                                      int x,
+                                      int dx,
+                                      int y,
+                                      int dy,
+                                      enum FilterMode filtering) {
+   int j;
+   // Row converter: YUV -> ARGB, picked by CPU feature and source width.
+   void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+       I422ToARGBRow_C;
+ #if defined(HAS_I422TOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+     if (IS_ALIGNED(src_width, 8)) {
+       I422ToARGBRow = I422ToARGBRow_SSSE3;
+     }
+   }
+ #endif
+ #if defined(HAS_I422TOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+     if (IS_ALIGNED(src_width, 16)) {
+       I422ToARGBRow = I422ToARGBRow_AVX2;
+     }
+   }
+ #endif
+ #if defined(HAS_I422TOARGBROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     I422ToARGBRow = I422ToARGBRow_Any_NEON;
+     if (IS_ALIGNED(src_width, 8)) {
+       I422ToARGBRow = I422ToARGBRow_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_I422TOARGBROW_MMI)
+   if (TestCpuFlag(kCpuHasMMI)) {
+     I422ToARGBRow = I422ToARGBRow_Any_MMI;
+     if (IS_ALIGNED(src_width, 4)) {
+       I422ToARGBRow = I422ToARGBRow_MMI;
+     }
+   }
+ #endif
+ #if defined(HAS_I422TOARGBROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     I422ToARGBRow = I422ToARGBRow_Any_MSA;
+     if (IS_ALIGNED(src_width, 8)) {
+       I422ToARGBRow = I422ToARGBRow_MSA;
+     }
+   }
+ #endif
+
+   // Vertical blend of the two cached ARGB rows.
+   void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) = InterpolateRow_C;
+ #if defined(HAS_INTERPOLATEROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     InterpolateRow = InterpolateRow_Any_SSSE3;
+     if (IS_ALIGNED(dst_width, 4)) {
+       InterpolateRow = InterpolateRow_SSSE3;
+     }
+   }
+ #endif
+ #if defined(HAS_INTERPOLATEROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     InterpolateRow = InterpolateRow_Any_AVX2;
+     if (IS_ALIGNED(dst_width, 8)) {
+       InterpolateRow = InterpolateRow_AVX2;
+     }
+   }
+ #endif
+ #if defined(HAS_INTERPOLATEROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     InterpolateRow = InterpolateRow_Any_NEON;
+     if (IS_ALIGNED(dst_width, 4)) {
+       InterpolateRow = InterpolateRow_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_INTERPOLATEROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     InterpolateRow = InterpolateRow_Any_MSA;
+     if (IS_ALIGNED(dst_width, 8)) {
+       InterpolateRow = InterpolateRow_MSA;
+     }
+   }
+ #endif
+
+   // Horizontal scaler: filtered or point sampled; the 64 bit position
+   // variants are used for very wide sources.
+   void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                               int dst_width, int x, int dx) =
+       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+   if (src_width >= 32768) {
+     ScaleARGBFilterCols =
+         filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+   }
+ #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+   if (filtering && TestCpuFlag(kCpuHasNEON)) {
+     ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+     if (IS_ALIGNED(dst_width, 4)) {
+       ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+   if (filtering && TestCpuFlag(kCpuHasMSA)) {
+     ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+     if (IS_ALIGNED(dst_width, 8)) {
+       ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_SSE2)
+   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_NEON)
+   if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+     ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+     if (IS_ALIGNED(dst_width, 8)) {
+       ScaleARGBFilterCols = ScaleARGBCols_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_MMI)
+   if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+     ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+     if (IS_ALIGNED(dst_width, 1)) {
+       ScaleARGBFilterCols = ScaleARGBCols_MMI;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_MSA)
+   if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+     ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+     if (IS_ALIGNED(dst_width, 4)) {
+       ScaleARGBFilterCols = ScaleARGBCols_MSA;
+     }
+   }
+ #endif
+   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+ #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+       ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+     }
+ #endif
+ #if defined(HAS_SCALEARGBCOLSUP2_MMI)
+     if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+       ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+     }
+ #endif
+   }
+
+   // Clamp the starting row to the last source row.
+   const int max_y = (src_height - 1) << 16;
+   if (y > max_y) {
+     y = max_y;
+   }
+   const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+   int yi = y >> 16;
+   int uv_yi = yi >> kYShift;
+   // Row offsets are computed in pointer width so large images cannot
+   // overflow int.
+   const uint8_t* src_row_y = src_y + yi * (ptrdiff_t)src_stride_y;
+   const uint8_t* src_row_u = src_u + uv_yi * (ptrdiff_t)src_stride_u;
+   const uint8_t* src_row_v = src_v + uv_yi * (ptrdiff_t)src_stride_v;
+
+   // Allocate 2 rows of ARGB.
+   const int kRowSize = (dst_width * 4 + 31) & ~31;
+   align_buffer_64(row, kRowSize * 2);
+
+   // Allocate 1 row of ARGB for source conversion.
+   align_buffer_64(argb_row, src_width * 4);
+
+   uint8_t* rowptr = row;
+   int rowstride = kRowSize;
+   int lasty = yi;
+
+   // Prime the two cached rows. Each source row is converted to ARGB first,
+   // because the column scaler consumes ARGB pixels, not raw Y samples.
+   I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+   ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+   if (src_height > 1) {
+     src_row_y += src_stride_y;
+     if (yi & 1) {
+       src_row_u += src_stride_u;
+       src_row_v += src_stride_v;
+     }
+   }
+   I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+   ScaleARGBFilterCols(rowptr + rowstride, argb_row, dst_width, x, dx);
+   if (src_height > 2) {
+     src_row_y += src_stride_y;
+     if (!(yi & 1)) {
+       src_row_u += src_stride_u;
+       src_row_v += src_stride_v;
+     }
+   }
+
+   for (j = 0; j < dst_height; ++j) {
+     yi = y >> 16;
+     if (yi != lasty) {
+       if (y > max_y) {
+         // Clamp to the last source row and re-derive the row pointers.
+         y = max_y;
+         yi = y >> 16;
+         uv_yi = yi >> kYShift;
+         src_row_y = src_y + yi * (ptrdiff_t)src_stride_y;
+         src_row_u = src_u + uv_yi * (ptrdiff_t)src_stride_u;
+         src_row_v = src_v + uv_yi * (ptrdiff_t)src_stride_v;
+       }
+       if (yi != lasty) {
+         // TODO(fbarchard): Convert the clipped region of row.
+         I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+         ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+         // Swap which cached row is "current" by flipping the stride sign.
+         rowptr += rowstride;
+         rowstride = -rowstride;
+         lasty = yi;
+         src_row_y += src_stride_y;
+         if (yi & 1) {
+           src_row_u += src_stride_u;
+           src_row_v += src_stride_v;
+         }
+       }
+     }
+     if (filtering == kFilterLinear) {
+       InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+     } else {
+       int yf = (y >> 8) & 255;
+       InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+     }
+     dst_argb += dst_stride_argb;
+     y += dy;
+   }
+   free_aligned_buffer_64(row);
+   // Release the conversion row under the name it was allocated with.
+   free_aligned_buffer_64(argb_row);
+ }
+#endif
+
+ // Scale ARGB to/from any dimensions, without interpolation.
+ // Fixed point math is used for performance: the upper 16 bits
+ // of x and dx are the integer part of the source position and
+ // the lower 16 bits are the fixed decimal part.
+ //
+ // For every destination row the source row (y >> 16) is point sampled
+ // horizontally by the selected ScaleARGBCols implementation.
+ // src_height is unused.
+ static void ScaleARGBSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int x,
+                             int dx,
+                             int y,
+                             int dy) {
+   int j;
+   // Sources 32768 pixels or wider need the 64 bit position variant.
+   void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         int dst_width, int x, int dx) =
+       (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+   (void)src_height;
+ #if defined(HAS_SCALEARGBCOLS_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+     ScaleARGBCols = ScaleARGBCols_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ScaleARGBCols = ScaleARGBCols_Any_NEON;
+     if (IS_ALIGNED(dst_width, 8)) {
+       ScaleARGBCols = ScaleARGBCols_NEON;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_MMI)
+   if (TestCpuFlag(kCpuHasMMI)) {
+     ScaleARGBCols = ScaleARGBCols_Any_MMI;
+     if (IS_ALIGNED(dst_width, 1)) {
+       ScaleARGBCols = ScaleARGBCols_MMI;
+     }
+   }
+ #endif
+ #if defined(HAS_SCALEARGBCOLS_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     ScaleARGBCols = ScaleARGBCols_Any_MSA;
+     if (IS_ALIGNED(dst_width, 4)) {
+       ScaleARGBCols = ScaleARGBCols_MSA;
+     }
+   }
+ #endif
+   // Exact 2x horizontal upscale has a dedicated pixel-doubling path.
+   if (src_width * 2 == dst_width && x < 0x8000) {
+     ScaleARGBCols = ScaleARGBColsUp2_C;
+ #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+       ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+     }
+ #endif
+ #if defined(HAS_SCALEARGBCOLSUP2_MMI)
+     if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+       ScaleARGBCols = ScaleARGBColsUp2_MMI;
+     }
+ #endif
+   }
+
+   for (j = 0; j < dst_height; ++j) {
+     // The row offset is computed in pointer width: row * stride can exceed
+     // the range of int for large images.
+     ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (ptrdiff_t)src_stride,
+                   dst_width, x, dx);
+     dst_argb += dst_stride;
+     y += dy;
+   }
+ }
+
+ // Scale an ARGB image.
+ // This function in turn calls a scaling function
+ // suitable for handling the desired resolutions.
+ //
+ // clip_x/clip_y/clip_width/clip_height select the destination rectangle that
+ // is actually written. A negative src_height means the source is read bottom
+ // up (vertical flip).
+ static void ScaleARGB(const uint8_t* src,
+                       int src_stride,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
+                       enum FilterMode filtering) {
+   // Initial source x/y coordinate and step values as 16.16 fixed point.
+   int x = 0;
+   int y = 0;
+   int dx = 0;
+   int dy = 0;
+   // ARGB does not support box filter yet, but allow the user to pass it.
+   // Simplify filtering when possible.
+   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                 filtering);
+
+   // Negative src_height means invert the image.
+   if (src_height < 0) {
+     src_height = -src_height;
+     // Pointer-width multiply: rows * stride can overflow int.
+     src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+     src_stride = -src_stride;
+   }
+   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+              &dx, &dy);
+   src_width = Abs(src_width);
+   // Advance the source position and destination pointer to the clip origin.
+   if (clip_x) {
+     int64_t clipf = (int64_t)(clip_x)*dx;
+     x += (clipf & 0xffff);
+     src += (clipf >> 16) * 4;
+     dst += clip_x * 4;
+   }
+   if (clip_y) {
+     int64_t clipf = (int64_t)(clip_y)*dy;
+     y += (clipf & 0xffff);
+     src += (clipf >> 16) * src_stride;
+     dst += clip_y * (ptrdiff_t)dst_stride;
+   }
+
+   // Special case for integer step values.
+   if (((dx | dy) & 0xffff) == 0) {
+     if (!dx || !dy) {  // 1 pixel wide and/or tall.
+       filtering = kFilterNone;
+     } else {
+       // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+       if (!(dx & 0x10000) && !(dy & 0x10000)) {
+         if (dx == 0x20000) {
+           // Optimized 1/2 downsample.
+           ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
+           return;
+         }
+         if (dx == 0x40000 && filtering == kFilterBox) {
+           // Optimized 1/4 box downsample.
+           ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                             src_stride, dst_stride, src, dst, x, dx, y, dy);
+           return;
+         }
+         ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                           src_stride, dst_stride, src, dst, x, dx, y, dy,
+                           filtering);
+         return;
+       }
+       // Optimized odd scale down. ie 3, 5, 7, 9x.
+       if ((dx & 0x10000) && (dy & 0x10000)) {
+         filtering = kFilterNone;
+         if (dx == 0x10000 && dy == 0x10000) {
+           // Straight copy.
+           ARGBCopy(src + (y >> 16) * (ptrdiff_t)src_stride + (x >> 16) * 4,
+                    src_stride, dst, dst_stride, clip_width, clip_height);
+           return;
+         }
+       }
+     }
+   }
+   if (dx == 0x10000 && (x & 0xffff) == 0) {
+     // Arbitrary scale vertically, but unscaled horizontally.
+     ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                        dst_stride, src, dst, x, y, dy, 4, filtering);
+     return;
+   }
+   // Filtered paths: a vertical step below 1.0 means scaling up vertically.
+   if (filtering && dy < 65536) {
+     ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                         src_stride, dst_stride, src, dst, x, dx, y, dy,
+                         filtering);
+     return;
+   }
+   if (filtering) {
+     ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                           src_stride, dst_stride, src, dst, x, dx, y, dy,
+                           filtering);
+     return;
+   }
+   // No filtering: point sample.
+   ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                   dst_stride, src, dst, x, dx, y, dy);
+ }
+
+ // Scale an ARGB image, writing only the clip rectangle of the destination.
+ // Returns 0 on success or -1 when an argument is rejected.
+ LIBYUV_API
+ int ARGBScaleClip(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   int src_width,
+                   int src_height,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int dst_width,
+                   int dst_height,
+                   int clip_x,
+                   int clip_y,
+                   int clip_width,
+                   int clip_height,
+                   enum FilterMode filtering) {
+   // Source: buffer must exist and neither dimension may be zero.
+   if (!src_argb || src_width == 0 || src_height == 0) {
+     return -1;
+   }
+   // Destination: buffer must exist and be at least 1x1.
+   if (!dst_argb || dst_width <= 0 || dst_height <= 0) {
+     return -1;
+   }
+   // Clip origin must be non-negative and the clip size is capped at 32768.
+   if (clip_x < 0 || clip_y < 0 || clip_width > 32768 || clip_height > 32768) {
+     return -1;
+   }
+   // Clip rectangle must lie inside the destination.
+   if ((clip_x + clip_width) > dst_width) {
+     return -1;
+   }
+   if ((clip_y + clip_height) > dst_height) {
+     return -1;
+   }
+   ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+             dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+             clip_height, filtering);
+   return 0;
+ }
+
+ // Scale a whole ARGB image; the clip rectangle is the full destination.
+ // Returns 0 on success or -1 when an argument is rejected.
+ LIBYUV_API
+ int ARGBScale(const uint8_t* src_argb,
+               int src_stride_argb,
+               int src_width,
+               int src_height,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int dst_width,
+               int dst_height,
+               enum FilterMode filtering) {
+   // Source: buffer must exist, dimensions non-zero and at most 32768.
+   if (!src_argb || src_width == 0 || src_height == 0) {
+     return -1;
+   }
+   if (src_width > 32768 || src_height > 32768) {
+     return -1;
+   }
+   // Destination: buffer must exist and be at least 1x1.
+   if (!dst_argb || dst_width <= 0 || dst_height <= 0) {
+     return -1;
+   }
+   ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+             dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+             filtering);
+   return 0;
+ }
+
+ // Scale with YUV conversion to ARGB and clipping.
+ //
+ // The source planes are converted with I420ToARGB into a temporary ARGB image
+ // of src_width x src_height pixels, which is then scaled by ARGBScaleClip.
+ // src_fourcc and dst_fourcc are currently ignored.
+ // Returns the ARGBScaleClip result, -1 when the source dimensions are not
+ // positive, the temporary image size is not representable, or the temporary
+ // buffer cannot be allocated, or the non-zero conversion result.
+ LIBYUV_API
+ int YUVToARGBScaleClip(const uint8_t* src_y,
+                        int src_stride_y,
+                        const uint8_t* src_u,
+                        int src_stride_u,
+                        const uint8_t* src_v,
+                        int src_stride_v,
+                        uint32_t src_fourcc,
+                        int src_width,
+                        int src_height,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        uint32_t dst_fourcc,
+                        int dst_width,
+                        int dst_height,
+                        int clip_x,
+                        int clip_y,
+                        int clip_width,
+                        int clip_height,
+                        enum FilterMode filtering) {
+   uint8_t* argb_buffer;
+   uint64_t argb_size;
+   int r;
+   (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
+   (void)dst_fourcc;
+
+   // The temporary image needs positive dimensions, and its row stride
+   // (src_width * 4) must fit in int.
+   if (src_width <= 0 || src_height <= 0 || src_width > (0x7fffffff / 4)) {
+     return -1;
+   }
+   // Size the buffer in 64 bits so width * height * 4 cannot overflow, then
+   // make sure the result is representable as size_t on this platform.
+   argb_size = (uint64_t)src_width * (uint64_t)src_height * 4;
+   if (argb_size != (uint64_t)(size_t)argb_size) {
+     return -1;
+   }
+   argb_buffer = (uint8_t*)malloc((size_t)argb_size);
+   if (!argb_buffer) {
+     return -1;
+   }
+
+   // NOTE(review): assumes I420ToARGB returns 0 on success, like the other
+   // entry points in this file - confirm against its declaration.
+   r = I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+                  argb_buffer, src_width * 4, src_width, src_height);
+   if (r == 0) {
+     r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height,
+                       dst_argb, dst_stride_argb, dst_width, dst_height, clip_x,
+                       clip_y, clip_width, clip_height, filtering);
+   }
+   free(argb_buffer);
+   return r;
+ }
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_common.cc b/third_party/aom/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 0000000000..fd4cbd0386
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1564 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+ // Returns the magnitude of v.
+ static __inline int Abs(int v) {
+   if (v < 0) {
+     return -v;
+   }
+   return v;
+ }
+
+// CPU agnostic row functions
+ // Halves a row by point sampling: each output is the second sample of a
+ // source pair. src_stride is unused.
+ void ScaleRowDown2_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
+   const uint8_t* s = src_ptr;
+   int i;
+   (void)src_stride;
+   // Two outputs per pass, consuming four source samples.
+   for (i = 0; i < dst_width - 1; i += 2, s += 4) {
+     dst[i] = s[1];
+     dst[i + 1] = s[3];
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = s[1];
+   }
+ }
+
+ // 16 bit variant of the half-width point sampler: each output is the second
+ // sample of a source pair. src_stride is unused.
+ void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
+   const uint16_t* s = src_ptr;
+   int i;
+   (void)src_stride;
+   for (i = 0; i < dst_width - 1; i += 2, s += 4) {
+     dst[i] = s[1];
+     dst[i + 1] = s[3];
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = s[1];
+   }
+ }
+
+ // Halves a row by averaging each horizontal pair of samples, rounding up on
+ // ties. src_stride is unused.
+ void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+   const uint8_t* pair = src_ptr;
+   int i;
+   (void)src_stride;
+   for (i = 0; i < dst_width - 1; i += 2, pair += 4) {
+     dst[i] = (pair[0] + pair[1] + 1) >> 1;
+     dst[i + 1] = (pair[2] + pair[3] + 1) >> 1;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (pair[0] + pair[1] + 1) >> 1;
+   }
+ }
+
+ // 16 bit variant: halves a row by averaging each horizontal pair of samples,
+ // rounding up on ties. src_stride is unused.
+ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst,
+                               int dst_width) {
+   const uint16_t* pair = src_ptr;
+   int i;
+   (void)src_stride;
+   for (i = 0; i < dst_width - 1; i += 2, pair += 4) {
+     dst[i] = (pair[0] + pair[1] + 1) >> 1;
+     dst[i + 1] = (pair[2] + pair[3] + 1) >> 1;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (pair[0] + pair[1] + 1) >> 1;
+   }
+ }
+
+ // Halves a row with a 2x2 box filter: each output is the rounded average of
+ // two samples from this row and two from the row src_stride bytes below.
+ void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst,
+                         int dst_width) {
+   const uint8_t* top = src_ptr;
+   const uint8_t* bot = src_ptr + src_stride;
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2, top += 4, bot += 4) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+     dst[i + 1] = (top[2] + top[3] + bot[2] + bot[3] + 2) >> 2;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+   }
+ }
+
+ // 2x2 box filter for an odd source width: the first dst_width - 1 outputs
+ // are full 2x2 averages; the last output averages only the final column of
+ // the two rows.
+ void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+   const uint8_t* top = src_ptr;
+   const uint8_t* bot = src_ptr + src_stride;
+   const int full = dst_width - 1;  // Outputs that have a full 2x2 box.
+   int i;
+   for (i = 0; i < full - 1; i += 2, top += 4, bot += 4) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+     dst[i + 1] = (top[2] + top[3] + bot[2] + bot[3] + 2) >> 2;
+   }
+   if (full & 1) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+     i += 1;
+     top += 2;
+     bot += 2;
+   }
+   // Final output: only one source column remains.
+   dst[i] = (top[0] + bot[0] + 1) >> 1;
+ }
+
+ // 16 bit variant of the 2x2 box filter. The second row starts src_stride
+ // elements after src_ptr.
+ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint16_t* dst,
+                            int dst_width) {
+   const uint16_t* top = src_ptr;
+   const uint16_t* bot = src_ptr + src_stride;
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2, top += 4, bot += 4) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+     dst[i + 1] = (top[2] + top[3] + bot[2] + bot[3] + 2) >> 2;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (top[0] + top[1] + bot[0] + bot[1] + 2) >> 2;
+   }
+ }
+
+ // Quarters a row by point sampling: each output is the third sample of a
+ // group of four. src_stride is unused.
+ void ScaleRowDown4_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
+   const uint8_t* s = src_ptr;
+   int i;
+   (void)src_stride;
+   for (i = 0; i < dst_width - 1; i += 2, s += 8) {
+     dst[i] = s[2];
+     dst[i + 1] = s[6];
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = s[2];
+   }
+ }
+
+ // 16 bit variant of the quarter-width point sampler: each output is the
+ // third sample of a group of four. src_stride is unused.
+ void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
+   const uint16_t* s = src_ptr;
+   int i;
+   (void)src_stride;
+   for (i = 0; i < dst_width - 1; i += 2, s += 8) {
+     dst[i] = s[2];
+     dst[i + 1] = s[6];
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = s[2];
+   }
+ }
+
+ // Sums the 4x4 block of samples whose top-left corner is p; consecutive rows
+ // are `stride` bytes apart.
+ static int ScaleRowDown4Box_Sum4x4(const uint8_t* p, intptr_t stride) {
+   int total = 0;
+   int row;
+   for (row = 0; row < 4; ++row) {
+     const uint8_t* r = p + stride * row;
+     total += r[0] + r[1] + r[2] + r[3];
+   }
+   return total;
+ }
+
+ // Quarters a row with a 4x4 box filter: each output is the rounded average
+ // of a 4x4 block spanning this row and the three rows below it.
+ void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst,
+                         int dst_width) {
+   const intptr_t stride = src_stride;
+   const uint8_t* s = src_ptr;
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2, s += 8) {
+     dst[i] = (ScaleRowDown4Box_Sum4x4(s, stride) + 8) >> 4;
+     dst[i + 1] = (ScaleRowDown4Box_Sum4x4(s + 4, stride) + 8) >> 4;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (ScaleRowDown4Box_Sum4x4(s, stride) + 8) >> 4;
+   }
+ }
+
+ // Sums the 4x4 block of 16 bit samples whose top-left corner is p;
+ // consecutive rows are `stride` elements apart.
+ static int ScaleRowDown4Box_16_Sum4x4(const uint16_t* p, intptr_t stride) {
+   int total = 0;
+   int row;
+   for (row = 0; row < 4; ++row) {
+     const uint16_t* r = p + stride * row;
+     total += r[0] + r[1] + r[2] + r[3];
+   }
+   return total;
+ }
+
+ // 16 bit variant of the 4x4 box filter: each output is the rounded average
+ // of a 4x4 block spanning this row and the three rows below it.
+ void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint16_t* dst,
+                            int dst_width) {
+   const intptr_t stride = src_stride;
+   const uint16_t* s = src_ptr;
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2, s += 8) {
+     dst[i] = (ScaleRowDown4Box_16_Sum4x4(s, stride) + 8) >> 4;
+     dst[i + 1] = (ScaleRowDown4Box_16_Sum4x4(s + 4, stride) + 8) >> 4;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst[i] = (ScaleRowDown4Box_16_Sum4x4(s, stride) + 8) >> 4;
+   }
+ }
+
+ // Scales a row to 3/4 width by point sampling: of every four source samples
+ // the first, second and fourth are kept. dst_width must be a positive
+ // multiple of 3. src_stride is unused.
+ void ScaleRowDown34_C(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+   const uint8_t* s = src_ptr;
+   int out;
+   (void)src_stride;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, s += 4) {
+     dst[out] = s[0];
+     dst[out + 1] = s[1];
+     dst[out + 2] = s[3];
+   }
+ }
+
+ // 16 bit variant of the 3/4 point sampler: of every four source samples the
+ // first, second and fourth are kept. dst_width must be a positive multiple
+ // of 3. src_stride is unused.
+ void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint16_t* dst,
+                          int dst_width) {
+   const uint16_t* s = src_ptr;
+   int out;
+   (void)src_stride;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, s += 4) {
+     dst[out] = s[0];
+     dst[out + 1] = s[1];
+     dst[out + 2] = s[3];
+   }
+ }
+
+ // 3/4 width scale that also blends two rows with weights 3 : 1
+ // (this row : the row src_stride bytes below).
+ // Horizontally, each group of four samples becomes three outputs using
+ // weights 3:1, 1:1 and 1:3. dst_width must be a positive multiple of 3.
+ void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* d,
+                             int dst_width) {
+   const uint8_t* top = src_ptr;
+   const uint8_t* bot = src_ptr + src_stride;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, top += 4, bot += 4) {
+     // Horizontal filter of the upper row.
+     const uint8_t top0 = (top[0] * 3 + top[1] * 1 + 2) >> 2;
+     const uint8_t top1 = (top[1] * 1 + top[2] * 1 + 1) >> 1;
+     const uint8_t top2 = (top[2] * 1 + top[3] * 3 + 2) >> 2;
+     // Horizontal filter of the lower row.
+     const uint8_t bot0 = (bot[0] * 3 + bot[1] * 1 + 2) >> 2;
+     const uint8_t bot1 = (bot[1] * 1 + bot[2] * 1 + 1) >> 1;
+     const uint8_t bot2 = (bot[2] * 1 + bot[3] * 3 + 2) >> 2;
+     // Vertical blend, 3 parts upper to 1 part lower.
+     d[out] = (top0 * 3 + bot0 + 2) >> 2;
+     d[out + 1] = (top1 * 3 + bot1 + 2) >> 2;
+     d[out + 2] = (top2 * 3 + bot2 + 2) >> 2;
+   }
+ }
+
+ // 16 bit variant of the 3/4 scale that blends two rows with weights 3 : 1.
+ // The lower row starts src_stride elements after src_ptr.
+ // dst_width must be a positive multiple of 3.
+ void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint16_t* d,
+                                int dst_width) {
+   const uint16_t* top = src_ptr;
+   const uint16_t* bot = src_ptr + src_stride;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, top += 4, bot += 4) {
+     // Horizontal filter of the upper row.
+     const uint16_t top0 = (top[0] * 3 + top[1] * 1 + 2) >> 2;
+     const uint16_t top1 = (top[1] * 1 + top[2] * 1 + 1) >> 1;
+     const uint16_t top2 = (top[2] * 1 + top[3] * 3 + 2) >> 2;
+     // Horizontal filter of the lower row.
+     const uint16_t bot0 = (bot[0] * 3 + bot[1] * 1 + 2) >> 2;
+     const uint16_t bot1 = (bot[1] * 1 + bot[2] * 1 + 1) >> 1;
+     const uint16_t bot2 = (bot[2] * 1 + bot[3] * 3 + 2) >> 2;
+     // Vertical blend, 3 parts upper to 1 part lower.
+     d[out] = (top0 * 3 + bot0 + 2) >> 2;
+     d[out + 1] = (top1 * 3 + bot1 + 2) >> 2;
+     d[out + 2] = (top2 * 3 + bot2 + 2) >> 2;
+   }
+ }
+
+ // 3/4 width scale that also blends two rows with equal weights 1 : 1
+ // (this row and the row src_stride bytes below).
+ // Horizontally, each group of four samples becomes three outputs using
+ // weights 3:1, 1:1 and 1:3. dst_width must be a positive multiple of 3.
+ void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* d,
+                             int dst_width) {
+   const uint8_t* top = src_ptr;
+   const uint8_t* bot = src_ptr + src_stride;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, top += 4, bot += 4) {
+     // Horizontal filter of the upper row.
+     const uint8_t top0 = (top[0] * 3 + top[1] * 1 + 2) >> 2;
+     const uint8_t top1 = (top[1] * 1 + top[2] * 1 + 1) >> 1;
+     const uint8_t top2 = (top[2] * 1 + top[3] * 3 + 2) >> 2;
+     // Horizontal filter of the lower row.
+     const uint8_t bot0 = (bot[0] * 3 + bot[1] * 1 + 2) >> 2;
+     const uint8_t bot1 = (bot[1] * 1 + bot[2] * 1 + 1) >> 1;
+     const uint8_t bot2 = (bot[2] * 1 + bot[3] * 3 + 2) >> 2;
+     // Vertical blend, equal weights.
+     d[out] = (top0 + bot0 + 1) >> 1;
+     d[out + 1] = (top1 + bot1 + 1) >> 1;
+     d[out + 2] = (top2 + bot2 + 1) >> 1;
+   }
+ }
+
+ // 16 bit variant of the 3/4 scale that blends two rows with equal weights.
+ // The lower row starts src_stride elements after src_ptr.
+ // dst_width must be a positive multiple of 3.
+ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint16_t* d,
+                                int dst_width) {
+   const uint16_t* top = src_ptr;
+   const uint16_t* bot = src_ptr + src_stride;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, top += 4, bot += 4) {
+     // Horizontal filter of the upper row.
+     const uint16_t top0 = (top[0] * 3 + top[1] * 1 + 2) >> 2;
+     const uint16_t top1 = (top[1] * 1 + top[2] * 1 + 1) >> 1;
+     const uint16_t top2 = (top[2] * 1 + top[3] * 3 + 2) >> 2;
+     // Horizontal filter of the lower row.
+     const uint16_t bot0 = (bot[0] * 3 + bot[1] * 1 + 2) >> 2;
+     const uint16_t bot1 = (bot[1] * 1 + bot[2] * 1 + 1) >> 1;
+     const uint16_t bot2 = (bot[2] * 1 + bot[3] * 3 + 2) >> 2;
+     // Vertical blend, equal weights.
+     d[out] = (top0 + bot0 + 1) >> 1;
+     d[out + 1] = (top1 + bot1 + 1) >> 1;
+     d[out + 2] = (top2 + bot2 + 1) >> 1;
+   }
+ }
+
+ // Point samples one row horizontally. x is the 16.16 fixed point source
+ // position of the first output and dx the step per output; the integer part
+ // (x >> 16) selects the source sample.
+ void ScaleCols_C(uint8_t* dst_ptr,
+                  const uint8_t* src_ptr,
+                  int dst_width,
+                  int x,
+                  int dx) {
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     dst_ptr[i] = src_ptr[x >> 16];
+     x += dx;
+     dst_ptr[i + 1] = src_ptr[x >> 16];
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst_ptr[i] = src_ptr[x >> 16];
+   }
+ }
+
+ // 16 bit variant of the horizontal point sampler. x is the 16.16 fixed point
+ // source position of the first output and dx the step per output.
+ void ScaleCols_16_C(uint16_t* dst_ptr,
+                     const uint16_t* src_ptr,
+                     int dst_width,
+                     int x,
+                     int dx) {
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     dst_ptr[i] = src_ptr[x >> 16];
+     x += dx;
+     dst_ptr[i + 1] = src_ptr[x >> 16];
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     dst_ptr[i] = src_ptr[x >> 16];
+   }
+ }
+
+ // Doubles a row horizontally by writing every source sample twice.
+ // x and dx are accepted for signature compatibility and ignored.
+ void ScaleColsUp2_C(uint8_t* dst_ptr,
+                     const uint8_t* src_ptr,
+                     int dst_width,
+                     int x,
+                     int dx) {
+   const uint8_t* s = src_ptr;
+   int i;
+   (void)x;
+   (void)dx;
+   for (i = 0; i < dst_width - 1; i += 2, ++s) {
+     dst_ptr[i] = s[0];
+     dst_ptr[i + 1] = dst_ptr[i];
+   }
+   // Odd width: the last source sample is written once.
+   if (dst_width & 1) {
+     dst_ptr[i] = s[0];
+   }
+ }
+
+ // 16 bit variant: doubles a row horizontally by writing every source sample
+ // twice. x and dx are accepted for signature compatibility and ignored.
+ void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                        const uint16_t* src_ptr,
+                        int dst_width,
+                        int x,
+                        int dx) {
+   const uint16_t* s = src_ptr;
+   int i;
+   (void)x;
+   (void)dx;
+   for (i = 0; i < dst_width - 1; i += 2, ++s) {
+     dst_ptr[i] = s[0];
+     dst_ptr[i + 1] = dst_ptr[i];
+   }
+   // Odd width: the last source sample is written once.
+   if (dst_width & 1) {
+     dst_ptr[i] = s[0];
+   }
+ }
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#if defined(__arm__) || defined(__aarch64__)
+#define BLENDER(a, b, f) \
+ (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#else
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+ (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+#endif
+
+ // Scales one row horizontally with linear filtering. For each output the
+ // samples at (x >> 16) and (x >> 16) + 1 are blended by BLENDER using the
+ // low 16 bits of x as the fraction; x then advances by dx.
+ void ScaleFilterCols_C(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        int dst_width,
+                        int x,
+                        int dx) {
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     int pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+     pos = x >> 16;
+     left = src_ptr[pos];
+     right = src_ptr[pos + 1];
+     dst_ptr[i + 1] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     int pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+   }
+ }
+
+ // Linear filtered horizontal scale that tracks the source position in
+ // 64 bits. x32 is the 16.16 fixed point start position, dx the step.
+ void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x32,
+                          int dx) {
+   int64_t x = (int64_t)(x32);
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     int64_t pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+     pos = x >> 16;
+     left = src_ptr[pos];
+     right = src_ptr[pos + 1];
+     dst_ptr[i + 1] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     int64_t pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+   }
+ }
+#undef BLENDER
+
+// Same as 8 bit arm blender but return is cast to uint16_t
+#define BLENDER(a, b, f) \
+ (uint16_t)( \
+ (int)(a) + \
+ (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
+
+ // 16 bit variant of the linear filtered horizontal scale. For each output
+ // the samples at (x >> 16) and (x >> 16) + 1 are blended by BLENDER using
+ // the low 16 bits of x as the fraction; x then advances by dx.
+ void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                           const uint16_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     int pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+     pos = x >> 16;
+     left = src_ptr[pos];
+     right = src_ptr[pos + 1];
+     dst_ptr[i + 1] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     int pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+   }
+ }
+
+ // 16 bit linear filtered horizontal scale that tracks the source position
+ // in 64 bits. x32 is the 16.16 fixed point start position, dx the step.
+ void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                             const uint16_t* src_ptr,
+                             int dst_width,
+                             int x32,
+                             int dx) {
+   int64_t x = (int64_t)(x32);
+   int i;
+   for (i = 0; i < dst_width - 1; i += 2) {
+     int64_t pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+     pos = x >> 16;
+     left = src_ptr[pos];
+     right = src_ptr[pos + 1];
+     dst_ptr[i + 1] = BLENDER(left, right, x & 0xffff);
+     x += dx;
+   }
+   // Trailing output for odd widths.
+   if (dst_width & 1) {
+     int64_t pos = x >> 16;
+     int left = src_ptr[pos];
+     int right = src_ptr[pos + 1];
+     dst_ptr[i] = BLENDER(left, right, x & 0xffff);
+   }
+ }
+#undef BLENDER
+
+ // Scales a row to 3/8 width by point sampling: of every eight source
+ // samples, those at offsets 0, 3 and 6 are kept. dst_width must be a
+ // multiple of 3. src_stride is unused.
+ void ScaleRowDown38_C(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+   const uint8_t* s = src_ptr;
+   int out;
+   (void)src_stride;
+   assert(dst_width % 3 == 0);
+   for (out = 0; out < dst_width; out += 3, s += 8) {
+     dst[out] = s[0];
+     dst[out + 1] = s[3];
+     dst[out + 2] = s[6];
+   }
+ }
+
+ // 16 bit variant of the 3/8 point sampler: of every eight source samples,
+ // those at offsets 0, 3 and 6 are kept. dst_width must be a multiple of 3.
+ // src_stride is unused.
+ void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint16_t* dst,
+                          int dst_width) {
+   const uint16_t* s = src_ptr;
+   int out;
+   (void)src_stride;
+   assert(dst_width % 3 == 0);
+   for (out = 0; out < dst_width; out += 3, s += 8) {
+     dst[out] = s[0];
+     dst[out + 1] = s[3];
+     dst[out + 2] = s[6];
+   }
+ }
+
+ // 8x3 -> 3x1
+ // Box filters three rows of eight samples down to three outputs: two 3x3
+ // boxes followed by one 2x3 box. Division is done by multiplying with
+ // 65536 / n and shifting right by 16. dst_width must be a positive
+ // multiple of 3.
+ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+   const intptr_t stride = src_stride;
+   const uint8_t* r0 = src_ptr;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, r0 += 8) {
+     const uint8_t* r1 = r0 + stride;
+     const uint8_t* r2 = r0 + stride * 2;
+     // Column sums of the three boxes.
+     const int box0 = r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2] + r2[0] +
+                      r2[1] + r2[2];
+     const int box1 = r0[3] + r0[4] + r0[5] + r1[3] + r1[4] + r1[5] + r2[3] +
+                      r2[4] + r2[5];
+     const int box2 = r0[6] + r0[7] + r1[6] + r1[7] + r2[6] + r2[7];
+     dst_ptr[out] = box0 * (65536 / 9) >> 16;
+     dst_ptr[out + 1] = box1 * (65536 / 9) >> 16;
+     dst_ptr[out + 2] = box2 * (65536 / 6) >> 16;
+   }
+ }
+
+ // 16 bit 8x3 -> 3x1 box filter: two 3x3 boxes followed by one 2x3 box.
+ // Division is done by multiplying with 65536 / n and shifting right by 16.
+ // The products are computed in unsigned 32 bit arithmetic: with full-range
+ // 16 bit samples, sum * (65536 / n) exceeds INT_MAX but still fits in
+ // 32 unsigned bits, so signed overflow is avoided.
+ // Rows are src_stride elements apart. dst_width must be a positive
+ // multiple of 3.
+ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint16_t* dst_ptr,
+                                int dst_width) {
+   intptr_t stride = src_stride;
+   int i;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (i = 0; i < dst_width; i += 3) {
+     const uint32_t box0 =
+         (uint32_t)src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2];
+     const uint32_t box1 =
+         (uint32_t)src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5];
+     const uint32_t box2 =
+         (uint32_t)src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] +
+         src_ptr[stride + 7] + src_ptr[stride * 2 + 6] +
+         src_ptr[stride * 2 + 7];
+     dst_ptr[0] = (uint16_t)(box0 * (65536u / 9u) >> 16);
+     dst_ptr[1] = (uint16_t)(box1 * (65536u / 9u) >> 16);
+     dst_ptr[2] = (uint16_t)(box2 * (65536u / 6u) >> 16);
+     src_ptr += 8;
+     dst_ptr += 3;
+   }
+ }
+
+ // 8x2 -> 3x1
+ // Box filters two rows of eight samples down to three outputs: two 3x2
+ // boxes followed by one 2x2 box. Division is done by multiplying with
+ // 65536 / n and shifting right by 16. dst_width must be a positive
+ // multiple of 3.
+ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width) {
+   const intptr_t stride = src_stride;
+   const uint8_t* r0 = src_ptr;
+   int out;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (out = 0; out < dst_width; out += 3, r0 += 8) {
+     const uint8_t* r1 = r0 + stride;
+     // Column sums of the three boxes.
+     const int box0 = r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + r1[2];
+     const int box1 = r0[3] + r0[4] + r0[5] + r1[3] + r1[4] + r1[5];
+     const int box2 = r0[6] + r0[7] + r1[6] + r1[7];
+     dst_ptr[out] = box0 * (65536 / 6) >> 16;
+     dst_ptr[out + 1] = box1 * (65536 / 6) >> 16;
+     dst_ptr[out + 2] = box2 * (65536 / 4) >> 16;
+   }
+ }
+
+ // 16 bit 8x2 -> 3x1 box filter: two 3x2 boxes followed by one 2x2 box.
+ // Division is done by multiplying with 65536 / n and shifting right by 16.
+ // The products are computed in unsigned 32 bit arithmetic: with full-range
+ // 16 bit samples, sum * (65536 / n) exceeds INT_MAX but still fits in
+ // 32 unsigned bits, so signed overflow is avoided.
+ // Rows are src_stride elements apart. dst_width must be a positive
+ // multiple of 3.
+ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint16_t* dst_ptr,
+                                int dst_width) {
+   intptr_t stride = src_stride;
+   int i;
+   assert((dst_width % 3 == 0) && (dst_width > 0));
+   for (i = 0; i < dst_width; i += 3) {
+     const uint32_t box0 = (uint32_t)src_ptr[0] + src_ptr[1] + src_ptr[2] +
+                           src_ptr[stride + 0] + src_ptr[stride + 1] +
+                           src_ptr[stride + 2];
+     const uint32_t box1 = (uint32_t)src_ptr[3] + src_ptr[4] + src_ptr[5] +
+                           src_ptr[stride + 3] + src_ptr[stride + 4] +
+                           src_ptr[stride + 5];
+     const uint32_t box2 = (uint32_t)src_ptr[6] + src_ptr[7] +
+                           src_ptr[stride + 6] + src_ptr[stride + 7];
+     dst_ptr[0] = (uint16_t)(box0 * (65536u / 6u) >> 16);
+     dst_ptr[1] = (uint16_t)(box1 * (65536u / 6u) >> 16);
+     dst_ptr[2] = (uint16_t)(box2 * (65536u / 4u) >> 16);
+     src_ptr += 8;
+     dst_ptr += 3;
+   }
+ }
+
+// Accumulates one row of 8-bit samples into a row of 16-bit sums.
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int col = 0;
+  assert(src_width > 0);
+  // Two columns per pass.
+  while (col < src_width - 1) {
+    dst_ptr[col] += src_ptr[col];
+    dst_ptr[col + 1] += src_ptr[col + 1];
+    col += 2;
+  }
+  // Odd trailing column.
+  if (src_width & 1) {
+    dst_ptr[col] += src_ptr[col];
+  }
+}
+
+// Accumulates one row of 16-bit samples into a row of 32-bit sums.
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width) {
+  int col = 0;
+  assert(src_width > 0);
+  // Two columns per pass.
+  while (col < src_width - 1) {
+    dst_ptr[col] += src_ptr[col];
+    dst_ptr[col + 1] += src_ptr[col + 1];
+    col += 2;
+  }
+  // Odd trailing column.
+  if (src_width & 1) {
+    dst_ptr[col] += src_ptr[col];
+  }
+}
+
+// ARGB scale row functions
+
+// 2:1 point-sampled ARGB downscale: keeps the second pixel of each source
+// pair. Pixels are moved as whole 32-bit words; src_stride is unused.
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_argb,
+                         int dst_width) {
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int i = 0;
+  (void)src_stride;
+  while (i < dst_width - 1) {
+    out[i] = in[1];
+    out[i + 1] = in[3];
+    in += 4;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[1];
+  }
+}
+
+// 2:1 ARGB downscale averaging each horizontal pixel pair per channel, with
+// rounding. src_stride is unused.
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  int px;
+  (void)src_stride;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] + 1) >> 1;
+    }
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+// 2:1 ARGB downscale averaging each 2x2 pixel box per channel, with rounding.
+// The second row is read at byte offset src_stride from src_argb.
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  int px;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] +
+                      src_argb[src_stride + ch] +
+                      src_argb[src_stride + ch + 4] + 2) >>
+                     2;
+    }
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+// Point-sampled ARGB downscale taking one pixel every src_stepx source
+// pixels. Pixels are moved as 32-bit words; src_stride is unused.
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int i = 0;
+  (void)src_stride;
+  while (i < dst_width - 1) {
+    out[i] = in[0];
+    out[i + 1] = in[src_stepx];
+    in += src_stepx * 2;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[0];
+  }
+}
+
+// ARGB downscale that averages a 2x2 pixel box per channel (with rounding)
+// and then steps src_stepx pixels to the next box.
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  int px;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 4; ++ch) {
+      dst_argb[ch] = (src_argb[ch] + src_argb[ch + 4] +
+                      src_argb[src_stride + ch] +
+                      src_argb[src_stride + ch + 4] + 2) >>
+                     2;
+    }
+    src_argb += src_stepx * 4;
+    dst_argb += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx) {
+  // x is a 16.16 fixed-point source position advanced by dx per output
+  // pixel; the integer part (x >> 16) selects the 32-bit source pixel.
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int i = 0;
+  while (i < dst_width - 1) {
+    out[i] = in[x >> 16];
+    x += dx;
+    out[i + 1] = in[x >> 16];
+    x += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[x >> 16];
+  }
+}
+
+// Same point sampling as the 32-bit column scaler, but the 16.16 position
+// is tracked in a 64-bit accumulator.
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx) {
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int64_t pos = (int64_t)(x32);
+  int i = 0;
+  while (i < dst_width - 1) {
+    out[i] = in[pos >> 16];
+    pos += dx;
+    out[i + 1] = in[pos >> 16];
+    pos += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[pos >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  // Each source pixel is written twice; x and dx are accepted only to match
+  // the column-scaler signature and are ignored.
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int i = 0;
+  (void)x;
+  (void)dx;
+  while (i < dst_width - 1) {
+    out[i] = in[0];
+    out[i + 1] = out[i];
+    ++in;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[0];
+  }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+// BLENDER1: weighted sum of two 8-bit values, a * (0x7f ^ f) + b * f,
+// scaled down by >> 7. The expansion is not wrapped in parentheses and f is
+// not parenthesized, so pass simple expressions only.
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+// BLENDERC: blends the 8-bit channel found at bit offset s of a and b and
+// shifts the result back to that offset.
+#define BLENDERC(a, b, f, s) \
+ (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+// BLENDER: blends all four 8-bit channels of two packed 32-bit pixels.
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+ BLENDERC(a, b, f, 0)
+
+// Horizontally scales one ARGB row with linear filtering. x is a 16.16
+// source position advanced by dx per output pixel; bits 9..15 of x supply
+// the 7-bit blend fraction between pixel x>>16 and its right neighbour.
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int i = 0;
+  while (i < dst_width - 1) {
+    int idx = x >> 16;
+    int frac = (x >> 9) & 0x7f;
+    uint32_t left = in[idx];
+    uint32_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+    x += dx;
+    idx = x >> 16;
+    frac = (x >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[i + 1] = BLENDER(left, right, frac);
+    x += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    int idx = x >> 16;
+    int frac = (x >> 9) & 0x7f;
+    uint32_t left = in[idx];
+    uint32_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+  }
+}
+
+// Linear-filtered ARGB column scaler whose 16.16 source position is kept in
+// a 64-bit accumulator seeded from x32.
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx) {
+  const uint32_t* in = (const uint32_t*)(src_argb);
+  uint32_t* out = (uint32_t*)(dst_argb);
+  int64_t pos = (int64_t)(x32);
+  int i = 0;
+  while (i < dst_width - 1) {
+    int64_t idx = pos >> 16;
+    int frac = (pos >> 9) & 0x7f;
+    uint32_t left = in[idx];
+    uint32_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+    pos += dx;
+    idx = pos >> 16;
+    frac = (pos >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[i + 1] = BLENDER(left, right, frac);
+    pos += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    int64_t idx = pos >> 16;
+    int frac = (pos >> 9) & 0x7f;
+    uint32_t left = in[idx];
+    uint32_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// UV scale row functions
+// same as ARGB but 2 channels
+
+// 2:1 point-sampled downscale of an interleaved UV row: keeps the second UV
+// pair of every two source pairs.
+//
+// src_uv     - source row of interleaved U,V bytes.
+// src_stride - unused (single-row kernel).
+// dst_uv     - receives dst_width UV pairs.
+// dst_width  - number of output UV pairs.
+//
+// UV pairs are moved as 16-bit words. Each unrolled pass writes two output
+// pairs taken from source pairs 1 and 3, so the source must advance by four
+// pairs per pass. It used to advance by only two, which made every pass
+// after the first re-read source pairs that had already been consumed.
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_uv,
+                       int dst_width) {
+  const uint16_t* src = (const uint16_t*)(src_uv);
+  uint16_t* dst = (uint16_t*)(dst_uv);
+  int x;
+  (void)src_stride;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[1];
+    dst[1] = src[3];
+    src += 4;  // two output pairs consume four source pairs
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[1];
+  }
+}
+
+// 2:1 UV downscale averaging each horizontal pair of UV samples per channel,
+// with rounding. src_stride is unused.
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int px;
+  (void)src_stride;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      dst_uv[ch] = (src_uv[ch] + src_uv[ch + 2] + 1) >> 1;
+    }
+    src_uv += 4;
+    dst_uv += 2;
+  }
+}
+
+// 2:1 UV downscale averaging each 2x2 box per channel, with rounding. The
+// second row is read at byte offset src_stride from src_uv.
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  int px;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      dst_uv[ch] = (src_uv[ch] + src_uv[ch + 2] + src_uv[src_stride + ch] +
+                    src_uv[src_stride + ch + 2] + 2) >>
+                   2;
+    }
+    src_uv += 4;
+    dst_uv += 2;
+  }
+}
+
+// Point-sampled UV downscale taking one UV pair every src_stepx source
+// pairs. Pairs are moved as 16-bit words; src_stride is unused.
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          int src_stepx,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int i = 0;
+  (void)src_stride;
+  while (i < dst_width - 1) {
+    out[i] = in[0];
+    out[i + 1] = in[src_stepx];
+    in += src_stepx * 2;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[0];
+  }
+}
+
+// UV downscale that averages a 2x2 box per channel (with rounding) and then
+// steps src_stepx UV pairs to the next box.
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int px;
+  for (px = 0; px < dst_width; ++px) {
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      dst_uv[ch] = (src_uv[ch] + src_uv[ch + 2] + src_uv[src_stride + ch] +
+                    src_uv[src_stride + ch + 2] + 2) >>
+                   2;
+    }
+    src_uv += src_stepx * 2;
+    dst_uv += 2;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleUVCols_C(uint8_t* dst_uv,
+                   const uint8_t* src_uv,
+                   int dst_width,
+                   int x,
+                   int dx) {
+  // x is a 16.16 fixed-point source position advanced by dx per output
+  // pair; the integer part (x >> 16) selects the 16-bit source UV pair.
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int i = 0;
+  while (i < dst_width - 1) {
+    out[i] = in[x >> 16];
+    x += dx;
+    out[i + 1] = in[x >> 16];
+    x += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[x >> 16];
+  }
+}
+
+// Same point sampling as the 32-bit UV column scaler, but the 16.16
+// position is tracked in a 64-bit accumulator.
+void ScaleUVCols64_C(uint8_t* dst_uv,
+                     const uint8_t* src_uv,
+                     int dst_width,
+                     int x32,
+                     int dx) {
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int64_t pos = (int64_t)(x32);
+  int i = 0;
+  while (i < dst_width - 1) {
+    out[i] = in[pos >> 16];
+    pos += dx;
+    out[i + 1] = in[pos >> 16];
+    pos += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[pos >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+                      const uint8_t* src_uv,
+                      int dst_width,
+                      int x,
+                      int dx) {
+  // Each source UV pair is written twice; x and dx are accepted only to
+  // match the column-scaler signature and are ignored.
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int i = 0;
+  (void)x;
+  (void)dx;
+  while (i < dst_width - 1) {
+    out[i] = in[0];
+    out[i + 1] = out[i];
+    ++in;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    out[i] = in[0];
+  }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+// BLENDER1: weighted sum of two 8-bit values, a * (0x7f ^ f) + b * f,
+// scaled down by >> 7. The expansion is not wrapped in parentheses and f is
+// not parenthesized, so pass simple expressions only.
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+// BLENDERC: blends the 8-bit channel found at bit offset s of a and b and
+// shifts the result back to that offset (16-bit result for UV pairs).
+#define BLENDERC(a, b, f, s) \
+ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+// BLENDER: blends both 8-bit channels of two packed 16-bit UV pairs.
+#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+// Horizontally scales one interleaved UV row with linear filtering. x is a
+// 16.16 source position advanced by dx per output pair; bits 9..15 of x
+// supply the 7-bit blend fraction between pair x>>16 and its right neighbour.
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+                         const uint8_t* src_uv,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int i = 0;
+  while (i < dst_width - 1) {
+    int idx = x >> 16;
+    int frac = (x >> 9) & 0x7f;
+    uint16_t left = in[idx];
+    uint16_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+    x += dx;
+    idx = x >> 16;
+    frac = (x >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[i + 1] = BLENDER(left, right, frac);
+    x += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    int idx = x >> 16;
+    int frac = (x >> 9) & 0x7f;
+    uint16_t left = in[idx];
+    uint16_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+  }
+}
+
+// Linear-filtered UV column scaler whose 16.16 source position is kept in a
+// 64-bit accumulator seeded from x32.
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+                           const uint8_t* src_uv,
+                           int dst_width,
+                           int x32,
+                           int dx) {
+  const uint16_t* in = (const uint16_t*)(src_uv);
+  uint16_t* out = (uint16_t*)(dst_uv);
+  int64_t pos = (int64_t)(x32);
+  int i = 0;
+  while (i < dst_width - 1) {
+    int64_t idx = pos >> 16;
+    int frac = (pos >> 9) & 0x7f;
+    uint16_t left = in[idx];
+    uint16_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+    pos += dx;
+    idx = pos >> 16;
+    frac = (pos >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[i + 1] = BLENDER(left, right, frac);
+    pos += dx;
+    i += 2;
+  }
+  if (dst_width & 1) {
+    int64_t idx = pos >> 16;
+    int frac = (pos >> 9) & 0x7f;
+    uint16_t left = in[idx];
+    uint16_t right = in[idx + 1];
+    out[i] = BLENDER(left, right, frac);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+// Produces dst_height output rows by calling an InterpolateRow kernel once
+// per row. y is a 16.16 source row position stepped by dy; it is clamped to
+// max_y each iteration. bpp is the number of bytes per pixel (asserted
+// 1..4); x (16.16) only selects the starting source column. When filtering
+// is zero the fraction passed to the kernel is 0.
+void ScalePlaneVertical(int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int x,
+ int y,
+ int dy,
+ int bpp,
+ enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ int dst_width_bytes = dst_width * bpp;
+ // Row kernel; starts as the C version and may be replaced below by a SIMD
+ // variant selected from the runtime CPU flags.
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ // Largest 16.16 row position allowed: just below the last source row, or 0
+ // for a single-row source.
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Skip to the starting source column.
+ src_argb += (x >> 16) * bpp;
+ // Each section below picks the "Any" kernel first and upgrades to the
+ // plain kernel when the row byte count is a multiple of the listed size.
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width_bytes, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ // Integer source row and 8-bit fraction between it and the next row.
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+ dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+// 16-bit counterpart of the vertical plane scaler: produces dst_height rows
+// by calling an InterpolateRow_16 kernel once per output row.
+//
+// src_height          - source rows (asserted non-zero).
+// dst_width           - output pixels per row.
+// dst_height          - output rows.
+// src_stride/dst_stride - row pitch in uint16_t elements (they are added to
+//                       uint16_t pointers).
+// x, y, dy            - 16.16 fixed-point start column, start row and row
+//                       step; y is clamped to just below the last row.
+// wpp                 - 16-bit words per pixel (asserted 1..2).
+// filtering           - zero disables interpolation (fraction forced to 0).
+//
+// The SIMD dispatch sections used to test an undeclared dst_width_bytes,
+// which fails to compile as soon as any HAS_INTERPOLATEROW_16_* macro is
+// defined. They now test dst_width_words with the equivalent word counts
+// (16 bytes == 8 words, 32 bytes == 16 words).
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  // Skip to the starting source column.
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 8)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 8)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 8)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    // Integer source row and 8-bit fraction between it and the next row.
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  enum FilterMode filtering) {
+  // Negative source dimensions are treated by magnitude.
+  if (src_width < 0) {
+    src_width = -src_width;
+  }
+  if (src_height < 0) {
+    src_height = -src_height;
+  }
+
+  // Box -> Bilinear when neither axis shrinks below half size.
+  if (filtering == kFilterBox && dst_width * 2 >= src_width &&
+      dst_height * 2 >= src_height) {
+    filtering = kFilterBilinear;
+  }
+
+  if (filtering == kFilterBilinear) {
+    // No vertical blending needed: single row, same height, or exact 1/3.
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (src_height == 1 || dst_height == src_height ||
+        dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+
+  if (filtering == kFilterLinear) {
+    // No horizontal blending needed: single column, same width, or exact 1/3.
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (src_width == 1 || dst_width == src_width ||
+        dst_width * 3 == src_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+  // Promote to 64 bits before shifting so the 16.16 numerator cannot
+  // overflow, then truncate the quotient back to int.
+  const int64_t scaled_num = (int64_t)(num) << 16;
+  return (int)(scaled_num / div);
+}
+
+// Compute ((num << 16) - 0x00010001) / (div - 1) as a 16.16 fixed point
+// result. div must not be 1.
+int FixedDiv1_C(int num, int div) {
+  // 16.16 numerator reduced by 0x00010001, divided by (div - 1); callers
+  // must pass div != 1.
+  const int64_t scaled_num = ((int64_t)(num) << 16) - 0x00010001;
+  const int steps = div - 1;
+  return (int)(scaled_num / steps);
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+// Computes the 16.16 fixed-point start position (*x, *y) and per-output
+// step (*dx, *dy) used to walk the source while scaling. All four output
+// pointers must be non-NULL; source sizes must be non-zero and destination
+// sizes positive (asserted). A negative src_width requests a horizontal
+// mirror: *x is moved to the far end and *dx is negated.
+void ScaleSlope(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering,
+ int* x,
+ int* y,
+ int* dx,
+ int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Check for 1 pixel and avoid FixedDiv overflow.
+ // (src / 1 with src >= 32768 does not fit in a 16.16 int, so the divisor
+ // is replaced by the source size, giving a step of 1.0.)
+ if (dst_width == 1 && src_width >= 32768) {
+ dst_width = src_width;
+ }
+ if (dst_height == 1 && src_height >= 32768) {
+ dst_height = src_height;
+ }
+ if (filtering == kFilterBox) {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ // Downscale (or same size): plain ratio with a centered start.
+ // Upscale: FixedDiv1 ratio starting at 0.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) {
+ *dy = FixedDiv1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ // Vertical axis is point sampled: start half a step in.
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ // src_width = -src_width; // Caller must do this.
+ }
+}
+#undef CENTERSTART
+
+// Read 8x2 upsample with filtering and write 16x1.
+// actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16_t* dst,
+                      int dst_width) {
+  // Two output samples per source column, each a 9:3:3:1 weighted blend of
+  // a 2x2 neighbourhood (current/next column, this row and the row at
+  // src_stride), rounded and divided by 16.
+  const uint16_t* row0 = src_ptr;
+  const uint16_t* row1 = src_ptr + src_stride;
+  int out = 0;
+  int col = 0;
+  while (out < dst_width - 1) {
+    const uint16_t a = row0[col];
+    const uint16_t b = row0[col + 1];
+    const uint16_t c = row1[col];
+    const uint16_t d = row1[col + 1];
+    dst[out] = (a * 9 + b * 3 + c * 3 + d + 8) >> 4;
+    dst[out + 1] = (a * 3 + b * 9 + c + d * 3 + 8) >> 4;
+    ++col;
+    out += 2;
+  }
+  if (dst_width & 1) {
+    const uint16_t a = row0[col];
+    const uint16_t b = row0[col + 1];
+    const uint16_t c = row1[col];
+    const uint16_t d = row1[col + 1];
+    dst[out] = (a * 9 + b * 3 + c * 3 + d + 8) >> 4;
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_gcc.cc b/third_party/aom/third_party/libyuv/source/scale_gcc.cc
new file mode 100644
index 0000000000..e575ee18bc
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1464 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+
+// Offsets for source bytes 0 to 9
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Rounding constant (2 in each 16-bit lane) for the 3/4 scalers.
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+// 2:1 point-sampled row downscale. Each pass loads 32 source bytes, keeps
+// the high (odd-indexed) byte of every 16-bit pair and stores 16 bytes.
+// The loop subtracts 16 from dst_width per pass and repeats while the
+// result is > 0, so a full 16-byte store is always issued.
+// src_stride is unused.
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ // Shift each word right by 8: leaves the odd byte in the low half.
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// 2:1 row downscale that averages each horizontal byte pair with rounding.
+// Processes 32 source bytes -> 16 output bytes per pass; the loop repeats
+// while dst_width - 16 stays > 0. src_stride is unused.
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // xmm4 = 0x01 in every byte (all-ones >> 15 per word, then packed);
+ // xmm5 = 0.
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ // Multiply-add against the byte 1s: each word = sum of a byte pair.
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ // Average with zero: (sum + 1) >> 1.
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+// 2:1 row downscale averaging each 2x2 byte box.
+//
+// src_ptr    - first source row.
+// src_stride - byte offset from src_ptr to the second source row.
+// dst_ptr    - destination; 16 bytes are stored per pass.
+// dst_width  - output pixels; the loop repeats while dst_width - 16 > 0.
+//
+// Per pass: horizontal pair sums of both rows are added, halved with a
+// plain shift, then averaged with zero (adds 1 and shifts once more).
+//
+// xmm4 is written by this asm (it holds the 0x01 byte multipliers), so it
+// is listed as clobbered; it was missing from the clobber list, which let
+// the compiler assume a live value in xmm4 survived the statement.
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  asm volatile(
+      // xmm4 = 0x01 in every byte; xmm5 = 0.
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
+      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
+      "lea         0x20(%0),%0                   \n"
+      // Horizontal pair sums for both rows.
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"
+      "pmaddubsw   %%xmm4,%%xmm1                 \n"
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm3                 \n"
+      // Vertical add -> 2x2 box sums.
+      "paddw       %%xmm2,%%xmm0                 \n"
+      "paddw       %%xmm3,%%xmm1                 \n"
+      "psrlw       $0x1,%%xmm0                   \n"
+      "psrlw       $0x1,%%xmm1                   \n"
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pavgw       %%xmm5,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// AVX2 2:1 point-sampled row downscale: 64 source bytes -> 32 output bytes
+// per pass, keeping the high (odd-indexed) byte of each 16-bit pair. The
+// loop repeats while dst_width - 32 stays > 0. src_stride is unused.
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ // Reorder 64-bit quarters (0,2,1,3) after the per-lane pack.
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// AVX2 2:1 row downscale averaging each horizontal byte pair with rounding:
+// 64 source bytes -> 32 output bytes per pass. The loop repeats while
+// dst_width - 32 stays > 0. src_stride is unused.
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // ymm4 = 0x01 in every byte; ymm5 = 0.
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ // Pair sums, then average with zero: (sum + 1) >> 1.
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ // Reorder 64-bit quarters (0,2,1,3) after the per-lane pack.
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+// AVX2 2:1 row downscale averaging each 2x2 byte box.
+//
+// src_ptr    - first source row.
+// src_stride - byte offset from src_ptr to the second source row.
+// dst_ptr    - destination; 32 bytes are stored per pass.
+// dst_width  - output pixels; the loop repeats while dst_width - 32 > 0.
+//
+// ymm4 is written by this asm (it holds the 0x01 byte multipliers), so
+// "xmm4" is listed as clobbered; it was missing from the clobber list.
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      // ymm4 = 0x01 in every byte; ymm5 = 0.
+      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
+      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
+      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
+      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
+      "lea         0x40(%0),%0                   \n"
+      // Horizontal pair sums for both rows.
+      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
+      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
+      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
+      // Vertical add -> 2x2 box sums.
+      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
+      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
+      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      // Reorder 64-bit quarters (0,2,1,3) after the per-lane pack.
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+// 4:1 point-sampled row downscale: keeps the byte at index 2 of every
+// 4-byte group. 32 source bytes -> 8 output bytes per pass; the loop repeats
+// while dst_width - 8 stays > 0. src_stride is unused.
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // xmm5 = 0x00FF0000 in every dword (mask for the third byte).
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ // Two pack steps with a shift in between squeeze out the zero bytes.
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+// 4:1 row downscale averaging each 4x4 byte box: sums four rows, adds 8
+// and shifts right by 4. 32 source bytes per row -> 8 output bytes per
+// pass; the loop repeats while dst_width - 8 stays > 0. Rows are read at
+// src_stride, 2 * src_stride and 3 * src_stride bytes from src_ptr.
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // Scratch register that the asm fills with 3 * src_stride.
+ intptr_t stridex3;
+ asm volatile(
+ // xmm4 = 0x01 in every byte (pair-sum multiplier);
+ // xmm5 = 8 in every word (rounding term).
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ // %3 = stride + stride * 2.
+ "lea 0x00(%4,%4,2),%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Rows 0 and 1.
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ // Row 2.
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ // Row 3.
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ // Add adjacent words: 4x4 box sums; then round and divide by 16.
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// AVX2 4:1 point-sampled row downscale: keeps the byte at index 2 of every
+// 4-byte group. 64 source bytes -> 16 output bytes per pass; the loop
+// repeats while dst_width - 16 stays > 0. src_stride is unused.
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ // ymm5 = 0x00FF0000 in every dword (mask for the third byte).
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ // Each per-lane pack is followed by a vpermq that reorders the 64-bit
+ // quarters (0,2,1,3).
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ // Only the low 16 bytes are stored.
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
+}
+
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, // 4x4 box average to 1 byte
+ ptrdiff_t src_stride, // byte distance between the 4 rows read
+ uint8_t* dst_ptr, // receives 16 bytes per pass
+ int dst_width) { // counted down by 16; body runs before the test
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n" // 0x0001 per word
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n" // 0x0008 per word: rounding for >>4
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" // 0x01 per byte: pmaddubsw weights
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // row 0
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // row 1
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // sum horizontal byte pairs
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // accumulate rows
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" // row 2
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" // row 3 (%4 = 3 * stride)
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // add word pairs: 16-byte box sums
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // restore order after per-lane hadd
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // + 8
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n" // / 16
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+// 3/4 horizontal scale of one row. Each pass reads 32 source bytes and writes
+// 24 destination bytes selected by the kShuf0/kShuf1/kShuf2 byte shuffles.
+// src_stride is unused. dst_width is counted down by 24; the loop body runs
+// once before the first test.
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ // The shuffle tables are loaded in the same asm statement as the loop: the
+ // compiler gives no guarantee that xmm registers written by one asm
+ // statement still hold their values when a following statement runs.
+ asm volatile(
+ "movdqa %3,%%xmm3 \n" // kShuf0
+ "movdqa %4,%%xmm4 \n" // kShuf1
+ "movdqa %5,%%xmm5 \n" // kShuf2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // source bytes 0..15
+ "movdqu 0x10(%0),%%xmm2 \n" // source bytes 16..31
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n" // source bytes 8..23
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf0), // %3
+ "m"(kShuf1), // %4
+ "m"(kShuf2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// 3/4 horizontal scale with an equal-weight blend of two rows. Each pass reads
+// 32 bytes from src_ptr and from src_ptr + src_stride, averages the rows with
+// one pavgb, filters horizontally with the kShuf*/kMadd* tables plus kRound34
+// (>> 2), and writes 24 bytes. dst_width is counted down by 24; the loop body
+// runs once before the first test.
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // All constants are loaded in the same asm statement as the loop: register
+ // contents are not guaranteed to survive between separate asm statements.
+ asm volatile(
+ "movdqa %5,%%xmm2 \n" // kShuf01
+ "movdqa %6,%%xmm3 \n" // kShuf11
+ "movdqa %7,%%xmm4 \n" // kShuf21
+ "movdqa %8,%%xmm5 \n" // kMadd01
+ "movdqa %9,%%xmm0 \n" // kMadd11
+ "movdqa %10,%%xmm1 \n" // kRound34
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n" // (row0 + row1 + 1) / 2
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n" // output bytes 0..7
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n" // output bytes 8..15
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n" // kMadd21 straight from memory
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n" // output bytes 16..23
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21), // %4
+ "m"(kShuf01), // %5
+ "m"(kShuf11), // %6
+ "m"(kShuf21), // %7
+ "m"(kMadd01), // %8
+ "m"(kMadd11), // %9
+ "m"(kRound34) // %10
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// 3/4 horizontal scale with an uneven blend of two rows. Each pass reads 32
+// bytes from src_ptr and from src_ptr + src_stride. The rows are combined with
+// two chained pavgb operations (avg(row0, avg(row0, row1))), which weights
+// row 0 roughly 3:1 over row 1. The result is filtered horizontally with the
+// kShuf*/kMadd* tables plus kRound34 (>> 2) and 24 bytes are written.
+// dst_width is counted down by 24; the loop body runs once before the first
+// test.
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // All constants are loaded in the same asm statement as the loop: register
+ // contents are not guaranteed to survive between separate asm statements.
+ asm volatile(
+ "movdqa %5,%%xmm2 \n" // kShuf01
+ "movdqa %6,%%xmm3 \n" // kShuf11
+ "movdqa %7,%%xmm4 \n" // kShuf21
+ "movdqa %8,%%xmm5 \n" // kMadd01
+ "movdqa %9,%%xmm0 \n" // kMadd11
+ "movdqa %10,%%xmm1 \n" // kRound34
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n" // avg(row0, row1)
+ "pavgb %%xmm7,%%xmm6 \n" // avg(row0, avg(row0, row1))
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n" // output bytes 0..7
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n" // output bytes 8..15
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n" // kMadd21 straight from memory
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n" // output bytes 16..23
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21), // %4
+ "m"(kShuf01), // %5
+ "m"(kShuf11), // %6
+ "m"(kShuf21), // %7
+ "m"(kMadd01), // %8
+ "m"(kMadd11), // %9
+ "m"(kRound34) // %10
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, // 32 bytes in, 12 out per pass
+ ptrdiff_t src_stride, // unused: only one row is read
+ uint8_t* dst_ptr,
+ int dst_width) { // counted down by 12; body runs before the test
+ (void)src_stride;
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // kShuf38a
+ "movdqa %4,%%xmm5 \n" // kShuf38b
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n" // pick bytes from the first 16
+ "pshufb %%xmm5,%%xmm1 \n" // pick bytes from the second 16
+ "paddusb %%xmm1,%%xmm0 \n" // merge the two shuffled halves
+ "movq %%xmm0,(%1) \n" // output bytes 0..7
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n" // output bytes 8..11
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
+}
+
+// 3/8 horizontal scale blending two rows. Each pass reads 16 bytes from
+// src_ptr and from src_ptr + src_stride, averages the rows (pavgb), gathers
+// and sums neighbouring columns through the kShufAb0/1/2 shuffles, scales the
+// sums with kScaleAb2 (pmulhuw) and produces 6 output bytes. dst_width is
+// counted down by 6; the loop body runs once before the first test.
+// The two 4-byte stores together cover exactly output bytes 0..5.
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // Constants are loaded in the same asm statement as the loop: register
+ // contents are not guaranteed to survive between separate asm statements.
+ asm volatile(
+ "movdqa %4,%%xmm2 \n" // kShufAb0
+ "movdqa %5,%%xmm3 \n" // kShufAb1
+ "movdqa %6,%%xmm4 \n" // kShufAb2
+ "movdqa %7,%%xmm5 \n" // kScaleAb2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n" // vertical average of the 2 rows
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n" // horizontal sums as words
+ "pmulhuw %%xmm5,%%xmm1 \n" // scale sums down to byte range
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n" // output bytes 0..3
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n" // output bytes 2..5
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShufAb0), // %4
+ "m"(kShufAb1), // %5
+ "m"(kShufAb2), // %6
+ "m"(kScaleAb2) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// 3/8 horizontal scale blending three rows. Each pass reads 16 bytes from
+// src_ptr, src_ptr + src_stride and src_ptr + 2 * src_stride, widens them to
+// 16 bit, sums the rows, sums three neighbouring columns, rearranges with
+// kShufAc/kShufAc3, scales with kScaleAc33 (pmulhuw) and produces 6 output
+// bytes. dst_width is counted down by 6; the loop body runs once before the
+// first test.
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // Constants are loaded in the same asm statement as the loop: register
+ // contents are not guaranteed to survive between separate asm statements.
+ asm volatile(
+ "movdqa %4,%%xmm2 \n" // kShufAc
+ "movdqa %5,%%xmm3 \n" // kShufAc3
+ "movdqa %6,%%xmm4 \n" // kScaleAc33
+ "pxor %%xmm5,%%xmm5 \n" // zero, for byte -> word unpack
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // row 0
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n" // row 1
+ "movhlps %%xmm0,%%xmm1 \n" // upper 8 bytes of row 0
+ "movhlps %%xmm6,%%xmm7 \n" // upper 8 bytes of row 1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n" // row0 + row1 (low half)
+ "paddusw %%xmm7,%%xmm1 \n" // row0 + row1 (high half)
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n" // row 2
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n" // + row2
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n" // word i = cols i + i+1 + i+2
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n" // scale sums down to byte range
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n" // output bytes 0..3
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n" // output bytes 2..5
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShufAc), // %4
+ "m"(kShufAc3), // %5
+ "m"(kScaleAc33) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Reads 16 bytes and accumulates them into 16 shorts at a time (saturating).
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr, // read-modify-write accumulator row
+ int src_width) { // counted down by 16; body runs before the test
+ asm volatile(
+
+ "pxor %%xmm5,%%xmm5 \n" // zero, for byte -> word unpack
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n" // current sums 0..7
+ "movdqu 0x10(%1),%%xmm1 \n" // current sums 8..15
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n" // low 8 bytes -> words
+ "punpckhbw %%xmm5,%%xmm3 \n" // high 8 bytes -> words
+ "paddusw %%xmm2,%%xmm0 \n" // unsigned saturating add
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n" // dst_ptr += 16 shorts
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time (saturating add).
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr, // read-modify-write accumulator row
+ int src_width) { // counted down by 32; body runs before the test
+ asm volatile(
+
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero, for byte -> word unpack
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" // pre-swizzle for per-lane unpack
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" // bytes 0..15 -> words
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" // bytes 16..31 -> words
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n" // add existing sums, saturating
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n" // dst_ptr += 32 shorts
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Bias subtracted from every pixel byte (psubb) so the pixels are signed
+// before pmaddubsw, which avoids saturation of its 16-bit sums.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// 0x4000 undoes the 0x80 bias (weights sum to 128); 0x40 rounds before >> 7.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version. x and dx are 16.16 fixed point.
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x, // starting source position
+ int dx) { // source step per destination pixel
+ intptr_t x0, x1, temp_pixel;
+ asm volatile(
+ "movd %6,%%xmm2 \n" // x
+ "movd %7,%%xmm3 \n" // dx
+ "movl $0x04040000,%k2 \n" // pshufb control: bytes 0,0,4,4
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n" // x0 = integer part of x
+ "subl $0x2,%5 \n"
+ "jl 29f \n" // fewer than 2 pixels: go to tail
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n" // x + dx
+ "punpckldq %%xmm0,%%xmm2 \n" // xmm2 = {x, x + dx}
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n" // xmm3 = {2 * dx, 2 * dx}
+ "pextrw $0x3,%%xmm2,%k4 \n" // x1 = integer part of x + dx
+
+ LABELALIGN
+ "2: \n" // 2 output pixels per pass
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n" // advance both positions
+ "movzwl 0x00(%1,%3,1),%k2 \n" // 2 adjacent source bytes at x0
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n" // 7-bit fractions
+ "movzwl 0x00(%1,%4,1),%k2 \n" // 2 adjacent source bytes at x1
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n" // {f0, f0, f1, f1}
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ // 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n" // (128 - f) * a + f * b per pixel
+ "pextrw $0x1,%%xmm2,%k3 \n" // next x0
+ "pextrw $0x3,%%xmm2,%k4 \n" // next x1
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n" // store 2 output bytes
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n" // tail: at most one pixel left
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n" // store 1 output byte
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
+#if defined(__x86_64__)
+ "+rm"(dst_width) // %5
+#else
+ "+m"(dst_width) // %5
+#endif
+ : "rm"(x), // %6
+ "rm"(dx), // %7
+#if defined(__x86_64__)
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
+#else
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+
+// Reads 16 bytes, duplicates each one and writes 32 bytes per pass.
+// Loads and stores use movdqu, so no 16 byte alignment is required.
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width, // counted down by 32; body runs before the test
+ int x,
+ int dx) {
+ (void)x; // unused
+ (void)dx; // unused
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // bytes 0..7, each doubled
+ "punpckhbw %%xmm1,%%xmm1 \n" // bytes 8..15, each doubled
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, // keeps odd 32-bit pixels
+ ptrdiff_t src_stride, // unused: only one row is read
+ uint8_t* dst_argb,
+ int dst_width) { // counted down by 4; body runs before the test
+ (void)src_stride;
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // pixels 0..3
+ "movdqu 0x10(%0),%%xmm1 \n" // pixels 4..7
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n" // pixels 1, 3, 5, 7
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// Halves one row of 32-bit pixels by averaging each horizontal pixel pair.
+// Each pass reads 8 pixels (32 bytes) and writes 4 pixels (16 bytes).
+// src_stride is unused. dst_width is counted down by 4; the loop body runs
+// once before the first test.
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // pixels 0..3
+ "movdqu 0x10(%0),%%xmm1 \n" // pixels 4..7
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels 0, 2, 4, 6
+ "shufps $0xdd,%%xmm1,%%xmm2 \n" // odd pixels 1, 3, 5, 7
+ "pavgb %%xmm2,%%xmm0 \n" // rounded per-byte average
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2"); // xmm2 is written above and must be listed
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, // 2x2 average of pixels
+ ptrdiff_t src_stride, // byte offset of the second row
+ uint8_t* dst_argb,
+ int dst_width) { // counted down by 4; body runs before the test
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // row 0, pixels 0..3
+ "movdqu 0x10(%0),%%xmm1 \n" // row 0, pixels 4..7
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // row 1, pixels 0..3
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n" // row 1, pixels 4..7
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // vertical average
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels
+ "shufps $0xdd,%%xmm1,%%xmm2 \n" // odd pixels
+ "pavgb %%xmm2,%%xmm0 \n" // horizontal average
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Gathers 4 32-bit pixels spaced src_stepx pixels apart per pass.
+// The store uses movdqu, so dst_argb does not need to be 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride, // unused: only one row is read
+ int src_stepx, // step in pixels; scaled to bytes below
+ uint8_t* dst_argb,
+ int dst_width) { // counted down by 4; body runs before the test
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ (void)src_stride;
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n" // %1 = step in bytes (stepx * 4)
+ "lea 0x00(%1,%1,2),%4 \n" // %4 = 3 * byte step
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // pixel at 0
+ "movd 0x00(%0,%1,1),%%xmm1 \n" // pixel at 1 * step
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n" // pixel at 2 * step
+ "movd 0x00(%0,%4,1),%%xmm3 \n" // pixel at 3 * step
+ "lea 0x00(%0,%1,4),%0 \n" // src_argb += 4 * step
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n" // 4 gathered pixels
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Blends four 2x2 pixel groups, spaced src_stepx pixels apart, to 4x1.
+// The store uses movdqu, so dst_argb does not need to be 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride, // byte offset of the second row
+ int src_stepx, // step in pixels; scaled to bytes below
+ uint8_t* dst_argb,
+ int dst_width) { // counted down by 4; body runs before the test
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n" // %1 = step in bytes (stepx * 4)
+ "lea 0x00(%1,%1,2),%4 \n" // %4 = 3 * byte step
+ "lea 0x00(%0,%5,1),%5 \n" // %5 = address of the second row
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // row 0: 2 pixels at 0
+ "movhps 0x00(%0,%1,1),%%xmm0 \n" // row 0: 2 pixels at 1 * step
+ "movq 0x00(%0,%1,2),%%xmm1 \n" // row 0: 2 pixels at 2 * step
+ "movhps 0x00(%0,%4,1),%%xmm1 \n" // row 0: 2 pixels at 3 * step
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n" // row 1, same four positions
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // vertical average
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // left pixel of each pair
+ "shufps $0xdd,%%xmm1,%%xmm2 \n" // right pixel of each pair
+ "pavgb %%xmm2,%%xmm0 \n" // horizontal average
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+void ScaleARGBCols_SSE2(uint8_t* dst_argb, // nearest-pixel column sampling
+ const uint8_t* src_argb,
+ int dst_width,
+ int x, // 16.16 fixed point start position
+ int dx) { // 16.16 fixed point step per output pixel
+ intptr_t x0, x1;
+ asm volatile(
+ "movd %5,%%xmm2 \n" // x
+ "movd %6,%%xmm3 \n" // dx
+ "pshufd $0x0,%%xmm2,%%xmm2 \n" // {x, x, x, x}
+ "pshufd $0x11,%%xmm3,%%xmm0 \n" // {0, dx, 0, dx}
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n" // 2 * dx
+ "pshufd $0x5,%%xmm3,%%xmm0 \n" // {0, 0, 2dx, 2dx}
+ "paddd %%xmm0,%%xmm2 \n" // {x, x+dx, x+2dx, x+3dx}
+ "paddd %%xmm3,%%xmm3 \n" // 4 * dx
+ "pshufd $0x0,%%xmm3,%%xmm3 \n" // step for all 4 lanes
+ "pextrw $0x1,%%xmm2,%k0 \n" // x0 = integer part of lane 0
+ "pextrw $0x3,%%xmm2,%k1 \n" // x1 = integer part of lane 1
+ "cmp $0x0,%4 \n"
+ "jl 99f \n" // negative width: nothing to do
+ "sub $0x4,%4 \n"
+ "jl 49f \n" // fewer than 4 pixels: go to tail
+
+ LABELALIGN
+ "40: \n" // 4 output pixels per pass
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n" // integer part of lane 2
+ "pextrw $0x7,%%xmm2,%k1 \n" // integer part of lane 3
+ "paddd %%xmm3,%%xmm2 \n" // advance all lanes by 4 * dx
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n" // indices for the next pass
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n" // tail: low 2 bits of %4 = pixels left
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n" // index for a possible last pixel
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n" // store 2 pixels
+ "lea 0x8(%2),%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n" // store 1 pixel
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+
+// Reads 4 32-bit pixels, duplicates each and writes 8 pixels per pass.
+// Loads and stores use movdqu, so no 16 byte alignment is required.
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width, // counted down by 8; body runs before the test
+ int x,
+ int dx) {
+ (void)x; // unused
+ (void)dx; // unused
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n" // pixels 0, 0, 1, 1
+ "punpckhdq %%xmm1,%%xmm1 \n" // pixels 2, 2, 3, 3
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
+}
+
+// Shuffle table interleaving the channel bytes of 2 adjacent pixels for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table copying byte 0 to bytes 0..7 and byte 4 to bytes 8..15
+static const uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear column filtering of 32-bit pixels. SSSE3 version.
+// x and dx are 16.16 fixed point. For every output pixel the two horizontally
+// adjacent source pixels at the integer part of the position are blended with
+// weights (127 - f) and f, where f is the top 7 bits of the fraction, and the
+// result is shifted right by 7. Two pixels are produced per pass, with a
+// one-pixel tail.
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ intptr_t x0, x1;
+ // The shuffle tables are loaded in the same asm statement as the loop:
+ // register contents are not guaranteed to survive between separate asm
+ // statements.
+ asm volatile(
+ "movdqa %7,%%xmm4 \n" // kShuffleColARGB
+ "movdqa %8,%%xmm5 \n" // kShuffleFractions
+ "movd %5,%%xmm2 \n" // x
+ "movd %6,%%xmm3 \n" // dx
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pextrw $0x1,%%xmm2,%k3 \n" // x0 = integer part of x
+ "sub $0x2,%2 \n"
+ "jl 29f \n" // fewer than 2 pixels: go to tail
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n" // x + dx
+ "punpckldq %%xmm0,%%xmm2 \n" // xmm2 = {x, x + dx}
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n" // xmm3 = {2 * dx, 2 * dx}
+ "pextrw $0x3,%%xmm2,%k4 \n" // x1 = integer part of x + dx
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n" // advance both positions
+ "movq 0x00(%1,%3,4),%%xmm0 \n" // 2 adjacent pixels at x0
+ "psrlw $0x9,%%xmm1 \n" // 7-bit fractions
+ "movhps 0x00(%1,%4,4),%%xmm0 \n" // 2 adjacent pixels at x1
+ "pshufb %%xmm5,%%xmm1 \n" // spread each fraction over 8 bytes
+ "pshufb %%xmm4,%%xmm0 \n" // interleave channels of each pair
+ "pxor %%xmm6,%%xmm1 \n" // even bytes become 127 - f
+ "pmaddubsw %%xmm1,%%xmm0 \n" // a * (127 - f) + b * f
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n" // next x0
+ "pextrw $0x3,%%xmm2,%k4 \n" // next x1
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n" // store 2 pixels
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n" // tail: at most one pixel left
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n" // store 1 pixel
+
+ LABELALIGN
+ "99: \n" // clang-format error.
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx), // %6
+ "m"(kShuffleColARGB), // %7
+ "m"(kShuffleFractions) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Divide num by div and return as 16.16 fixed point result: (num << 16) / div.
+int FixedDiv_X86(int num, int div) {
+ asm volatile(
+ "cdq \n" // sign-extend eax into edx:eax
+ "shld $0x10,%%eax,%%edx \n" // edx:eax = num << 16 (64 bit)
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n" // eax = quotient, edx = remainder
+ "mov %0, %%eax \n" // %0 is eax already ("+a" constraint)
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx");
+ return num;
+}
+
+// Returns ((num << 16) - 0x10001) / (div - 1), i.e. (num - 1) in 16.16 fixed
+// point, minus one unit, divided by (div - 1). The dividend is formed in 64
+// bits (edx:eax) and the quotient is the 32-bit result of idiv.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile(
+ "cdq \n" // sign-extend eax into edx:eax
+ "shld $0x10,%%eax,%%edx \n" // edx:eax = num << 16 (64 bit)
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n" // 64-bit subtract of 0x10001 ...
+ "sbb $0x0,%%edx \n" // ... with borrow into the high half
+ "sub $0x1,%1 \n" // div - 1 (modifies the operand)
+ "idiv %1 \n" // eax = quotient, edx = remainder
+ "mov %0, %%eax \n" // %0 is eax already ("+a" constraint)
+ : "+a"(num), // %0
+ "+c"(div) // %1 read-write: the asm decrements it
+ :
+ : "memory", "cc", "edx");
+ return num;
+}
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle tables: split moves even bytes low and odd bytes high; merge undoes it.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, // low bytes of
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80, // words 0,4,1,5,..
+ 0x80, 0x80, 0x80, 0x80}; // 0x80 selects zero
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, // 2x2 box on UV pairs
+ ptrdiff_t src_stride, // byte offset of the second row
+ uint8_t* dst_ptr,
+ int dst_width) { // in UV pairs; counted down by 4, body runs first
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n" // same split for row 1
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
+ "pmaddubsw %%xmm4,%%xmm2 \n" // horizontal add, row 1
+ "paddw %%xmm2,%%xmm0 \n" // vertical add
+ "psrlw $0x1,%%xmm0 \n" // round
+ "pavgw %%xmm5,%%xmm0 \n" // (v + 1) >> 1 via average with zero
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, // 2x2 box on UV pairs
+ ptrdiff_t src_stride, // byte offset of the second row
+ uint8_t* dst_ptr,
+ int dst_width) { // in UV pairs; counted down by 8, body runs first
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" // same split for row 1
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" // horizontal add, row 1
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" // (v + 1) >> 1 via average with zero
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_mips.cc b/third_party/aom/third_party/libyuv/source/scale_mips.cc
new file mode 100644
index 0000000000..3eb4f27c45
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_mips.cc
@@ -0,0 +1,654 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+// 2:1 point sampling of one row (src_stride is not used). The main loop turns
+// 32 source bytes into 16 destination bytes with word loads and stores; the
+// remaining dst_width % 16 pixels are copied one byte at a time.
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
+ "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
+ "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
+ "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9", "memory" // asm loads src and stores to dst
+ );
+}
+
+// 2x2 box filter: every output byte is the rounded average of a 2x2 block
+// taken from the row at src_ptr and the row at src_ptr + src_stride.
+// The main loop produces 8 output bytes per pass; the residue loop produces
+// 2 output bytes per pass for the remaining dst_width % 8 pixels.
+// NOTE(review): with an odd residue the last residue pass still stores two
+// bytes, i.e. one byte past dst_width - confirm callers allow for that.
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ // srl never yields a negative value, so the skip test must be "== 0";
+ // otherwise the 8-pixel loop runs once even when dst_width < 8.
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |23|22|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t6+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n" // unaligned 4-byte load, row 0
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n" // unaligned 4-byte load, row 1
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst), [t] "+r" (t)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9", "memory" // asm loads src and stores to dst
+ );
+}
+
+// 4:1 point sampling of one row (src_stride is not used). The main loop turns
+// 32 source bytes into 8 destination bytes (source bytes 0, 4, 8, ...); the
+// remaining dst_width % 8 pixels are copied one byte at a time.
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9", "memory" // asm loads src and stores to dst
+ );
+}
+
+// 4x4 box filter: each output byte is the rounded average ((sum + 8) >> 4)
+// of a 4-pixel-wide block taken from four consecutive source rows
+// (src_ptr, src_ptr + stride, + 2 * stride, + 3 * stride).
+// The main loop produces two outputs per iteration; an odd trailing output
+// is handled after the loop. The pair loop is skipped when dst_width < 2 so
+// that no source bytes are read and no output is written past dst_width.
+// NOTE(review): rows are read with lw, which presumably requires word-aligned
+// row pointers - confirm against the callers.
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n" // t9 = number of output pairs
+ "andi $t8, %[dst_width], 1 \n" // t8 = 1 if one output is left over
+ "beqz $t9, 2f \n" // no pairs: go to the odd tail
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n" // sum of first 4x4 block
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n" // sum of second 4x4 block
+ "shra_r.w $t0, $t0, 4 \n" // (sum + 8) >> 4
+ "shra_r.w $t4, $t4, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n" // branch delay slot
+
+ "2: \n"
+ "beqz $t8, 3f \n" // even width: done
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [s3] "+r" (s3)
+ : [dst_width] "r" (dst_width)
+ // "memory": the asm reads the source rows and writes dst[].
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9", "memory"
+ );
+}
+
+// Point-sample 4:3 horizontal downscale of one row: out of every four source
+// bytes, bytes 0, 1 and 3 are kept. Each iteration consumes 32 source bytes
+// and stores 24 output bytes, so dst_width is expected to be a multiple
+// of 24; the loop always runs at least once.
+// src_stride is unused because only a single source row is read.
+// NOTE(review): lw/sw presumably require word-aligned src_ptr and dst -
+// confirm against the callers.
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+ "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|29|
+ "addiu %[dst_width], %[dst_width], -24 \n"
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ // bgtz (rather than bnez) so the loop also terminates when dst_width is
+ // not an exact multiple of 24; identical behavior for exact multiples.
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst], %[dst], 24 \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ // "memory": the asm reads src_ptr[] and writes dst[].
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9", "memory"
+ );
+}
+
+// 4:3 horizontal downscale with a 2-row box filter in which the first row
+// (S, at src_ptr) is weighted 3 and the second row (T, at
+// src_ptr + src_stride) is weighted 1. Each iteration reads four pixels from
+// both rows and writes three output bytes:
+// d[0] from pixels 0,1 (3:1), d[1] from pixels 1,2 (1:1), d[2] from
+// pixels 2,3 (1:3); every step uses rounding shifts.
+// The loop always runs at least once and continues while dst_width > 0.
+// NOTE(review): lw/lwx presumably require word-aligned rows - confirm.
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n" // S1 + S2
+ "raddu.w.qb $t1, $t1 \n" // T1 + T2
+ "shra_r.w $t0, $t0, 1 \n" // (S1 + S2 + 1) >> 1
+ "shra_r.w $t1, $t1, 1 \n" // (T1 + T2 + 1) >> 1
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n" // |S1+3*S0|S2+3*S3|
+ "addu.ph $t6, $t6, $t5 \n" // |T1+3*T0|T2+3*T3|
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n" // 3 * middle pixel of row S
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n" // 3 * outer pixels of row S
+ "addu $t0, $t0, $t1 \n" // 3 * S middle + T middle
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n" // 3 * S outer + T outer
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n" // upper halfword -> d[0]
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n" // branch delay slot
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ // "memory": the asm reads both source rows and writes d[].
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "memory"
+ );
+}
+
+// 4:3 horizontal downscale with a 2-row box filter in which both rows
+// (S at src_ptr, T at src_ptr + src_stride) are weighted equally.
+// Each iteration reads four pixels from both rows and writes three output
+// bytes: d[0] from pixels 0,1 (3:1), d[1] from pixels 1,2 (1:1), d[2] from
+// pixels 2,3 (1:3); the two rows are then averaged with rounding.
+// The loop always runs at least once and continues while dst_width > 0.
+// NOTE(review): lw/lwx presumably require word-aligned rows - confirm.
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n" // S1 + S2
+ "raddu.w.qb $t1, $t1 \n" // T1 + T2
+ "shra_r.w $t0, $t0, 1 \n" // (S1 + S2 + 1) >> 1
+ "shra_r.w $t1, $t1, 1 \n" // (T1 + T2 + 1) >> 1
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n" // |S1+3*S0|S2+3*S3|
+ "addu.ph $t6, $t6, $t5 \n" // |T1+3*T0|T2+3*T3|
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n" // row S + row T (outer pixels)
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n" // rounded average of the rows
+ "addu $t0, $t0, $t1 \n" // row S + row T (middle pixel)
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n" // upper halfword -> d[0]
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n" // branch delay slot
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ // "memory": the asm reads both source rows and writes d[].
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "memory"
+ );
+}
+
+// Point-sample 8:3 horizontal downscale of one row: each iteration reads 32
+// source bytes and stores the 12 bytes at offsets
+// 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30.
+// The loop always runs at least once and repeats while at least 12 more
+// output pixels remain. src_stride is unused.
+// NOTE(review): lw/sw presumably require word-aligned src_ptr and dst -
+// confirm against the callers.
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n" // t8 >= 0 if another 12 fit
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ // "memory": the asm reads src_ptr[] and writes dst[].
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8", "memory"
+ );
+}
+
+// 8:3 horizontal downscale with a 2-row box filter. Each iteration reads
+// eight pixels from row S (src_ptr) and row T (src_ptr + src_stride) and
+// writes three output bytes:
+// dst[0] = (S0+S1+S2 + T0+T1+T2) / 6
+// dst[1] = (S3+S4+S5 + T3+T4+T5) / 6
+// dst[2] = (S6+S7 + T6+T7) / 4
+// Division by 6 is done as (sum * 0x2AAA) >> 16.
+// The loop always runs at least once and continues while dst_width > 0.
+// NOTE(review): lw presumably requires word-aligned rows - confirm.
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
+ const int c = 0x2AAA;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n"
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n"
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [t] "+r" (t),
+ [dst_width] "+r" (dst_width)
+ : [c] "r" (c)
+ // "hi"/"lo": mul leaves the HI/LO registers unpredictable.
+ // "memory": the asm reads both source rows and writes dst_ptr[].
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "hi", "lo", "memory"
+ );
+}
+
+// 8:3 horizontal downscale with a 3-row box filter. Each iteration reads
+// eight pixels from rows S (src_ptr), T (src_ptr + src_stride) and
+// R (src_ptr + 2 * src_stride) and writes three output bytes:
+// dst[0] = sum of pixels 0..2 of the three rows / 9
+// dst[1] = sum of pixels 3..5 of the three rows / 9
+// dst[2] = sum of pixels 6..7 of the three rows / 6
+// Division is done as (sum * c) >> 16 with c1 = 0x1C71 (1/9) and
+// c2 = 0x2AAA (1/6).
+// The loop always runs at least once and continues while dst_width > 0.
+// NOTE(review): lw presumably requires word-aligned rows - confirm.
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71;
+ const int c2 = 0x2AAA;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n"
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n"
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0
+ "raddu.w.qb $t4, $t4 \n" // R2+R1+R0
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n"
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [dst_width] "+r" (dst_width)
+ : [c1] "r" (c1), [c2] "r" (c2)
+ // "hi"/"lo": mul leaves the HI/LO registers unpredictable.
+ // "memory": the asm reads the three source rows and writes dst_ptr[].
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8", "hi", "lo", "memory"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/aom/third_party/libyuv/source/scale_neon.cc b/third_party/aom/third_party/libyuv/source/scale_neon.cc
new file mode 100644
index 0000000000..572b4bfa9b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_neon.cc
@@ -0,0 +1,1016 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away the even pixels, and write 16x1 (the odd pixels).
+// Processes 16 output pixels per iteration; the loop always runs at least
+// once and repeats while the remaining count is > 0. src_stride is unused.
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ // "memory": vst1 writes dst[]; "cc": subs updates the flags.
+ : "q0", "q1", "memory", "cc" // Clobber List
+ );
+}
+
+// Read 32x1, average each horizontal pixel pair with rounding, and write
+// 16x1. Processes 16 output pixels per iteration; the loop always runs at
+// least once and repeats while the remaining count is > 0.
+// src_stride is unused.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels (even/odd)
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ // "memory": vst1 writes dst[]; "cc": subs updates the flags.
+ : "q0", "q1", "memory", "cc" // Clobber List
+ );
+}
+
+// Read 32x2, average each 2x2 block with rounding ((sum + 2) >> 2), and
+// write 16x1. The second row is read from src_ptr + src_stride.
+// Processes 16 output pixels per iteration; the loop always runs at least
+// once and repeats while the remaining count is > 0.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ // row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ // "memory": vst1 writes dst[]; "cc": subs updates the flags.
+ : "q0", "q1", "q2", "q3", "memory", "cc" // Clobber List
+ );
+}
+
+// Point-sample 4:1 horizontal downscale of one row: reads 32 source bytes,
+// de-interleaves them by 4 and stores the 8 bytes found at offset 2 of each
+// group of four. Processes 8 output pixels per iteration; the loop always
+// runs at least once and repeats while the remaining count is > 0.
+// src_stride is unused.
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n" // keep 3rd byte of each 4
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc");
+}
+
+// 4x4 box filter: reads 16 bytes from each of four consecutive rows
+// (src_ptr + k * src_stride, k = 0..3) and writes 4 output bytes, each the
+// rounded average ((sum + 8) >> 4) of one 4x4 block.
+// Processes 4 output pixels per iteration; the loop always runs at least
+// once and repeats while the remaining count is > 0.
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 0: add adjacent pairs
+ "vpadal.u8 q0, q1 \n" // accumulate rows 1..3
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n" // 4 sums of 16 pixels each
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n" // narrow to bytes
+ "vst1.32 {d0[0]}, [%1]! \n" // store 4 output bytes
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc");
+}
+
+// Down scale from 4 to 3 pixels. Uses the NEON multi-lane load to split
+// every group of 4 pixels across 4 different registers, then stores lanes
+// 0, 1 and 3 interleaved (lane 2 is dropped).
+// Point samples 32 pixels to 24 pixels per iteration; the loop always runs
+// at least once and repeats while the remaining count is > 0.
+// src_stride is unused.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n" // 24 processed per loop
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc");
+}
+
+// 4:3 horizontal downscale with a 2-row filter in which the first row is
+// weighted 3 and the second row (src_ptr + src_stride) is weighted 1.
+// Converts 32 source pixels per row into 24 output pixels per iteration;
+// the loop always runs at least once and repeats while the remaining count
+// is > 0.
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n" // d24 = weight 3 in every lane
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n" // 24 processed per loop
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // store a0, a1, a2 interleaved
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+ "cc");
+}
+
+// 4:3 horizontal downscale with a 2-row filter in which both rows
+// (src_ptr and src_ptr + src_stride) are averaged with equal weight.
+// Converts 32 source pixels per row into 24 output pixels per iteration;
+// the loop always runs at least once and repeats while the remaining count
+// is > 0.
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vmov.u8 d24, #3 \n" // d24 = weight 3 in every lane
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n" // 24 processed per loop
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // store a0, a1, a2 interleaved
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ // r4 is not touched by this asm, so it is not listed as clobbered.
+ : "q0", "q1", "q2", "q3", "d24", "memory", "cc");
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
+
+// 32 -> 12
+// Point-sample 8:3 horizontal downscale of one row: each iteration loads 32
+// source bytes and uses the kShuf38 byte table to pick 12 of them, stored as
+// 8 bytes followed by 4 bytes. The loop always runs at least once and
+// repeats while the remaining count is > 0. src_stride is unused.
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "vld1.8 {q3}, [%3] \n" // q3 (d6, d7) = shuffle table
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 source bytes
+ "subs %2, %2, #12 \n" // 12 processed per loop
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" // first 8 outputs
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" // last 4 outputs (+ padding)
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ // q3 holds the shuffle table for the whole loop and must be declared.
+ : "d0", "d1", "d2", "d3", "d4", "d5", "q3", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile(
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+ "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to get align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time: dst_ptr[i] +=
+// src_ptr[i], with dst_ptr updated in place. The loop always runs at least
+// once and repeats while the remaining count is > 0.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add (widening u8 -> u16)
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_ptr;
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
+}
+
+// Point-sample 2:1 horizontal downscale of one row of 32-bit ARGB pixels:
+// reads 16 pixels and stores the 8 odd-numbered ones (1, 3, 5, ...).
+// Processes 8 output pixels per iteration; the loop always runs at least
+// once and repeats while the remaining count is > 0. src_stride is unused.
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // q2 = pixels 1, 5, 9, 13
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
+// 4a: 3e04 subs r6, #4
+// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
+// 50: ef64 21f4 vorr q9, q10, q10
+// 54: f942 038d vst2.32 {d16-d19}, [r2]!
+// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, // averages pixel pairs
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n" // store 8 averaged pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, // averages 2x2 ARGB blocks
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // re-interleave and store 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "mov r12, %3, lsl #2 \n" // r12 = src_stepx * 4 (byte step)
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n" // one 32-bit pixel per load
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0");
+}
+
+// Writes 4 pixels per loop; each is the rounded average of a 2x2 source block.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "mov r12, %4, lsl #2 \n" // r12 = src_stepx * 4 (byte step)
+ "add %1, %1, %0 \n" // %1 = address of the second row
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n" // add the two rows, widening to 16 bit
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
+}
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping. Loads the pixel at index (x >> 16), then x += dx.
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld1.32 {" #dn "[" #n "]}, [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int tmp; // scratch operand %5: integer part of x
+ const uint8_t* src_tmp = src_argb; // scratch operand %6: current pixel address
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+ // clang-format on
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1");
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping. Loads pixels (x >> 16) and (x >> 16) + 1, then x += dx.
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset; // %5: table pointer first, then scratch in the macro
+ const uint8_t* src_tmp = src_argb; // %6: scratch pixel address
+ asm volatile (
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n" // 0, dx, 2 * dx, 3 * dx
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n" // x >> 9, narrowed to 16 bits
+ "vand.16 d22, d22, d30 \n" // f = (x >> 9) & 0x7f
+ "vdup.8 d24, d22[0] \n" // broadcast each pixel's f to bytes
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n" // a * (0x7f ^ f)
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n" // b * f
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n" // divide by 128 and pack to bytes
+ "vshrn.i16 d1, q12, #7 \n"
+
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n" // advance the four x values by 4 * dx
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, // averages 2x2 UV blocks
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
+ "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
+ "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
+ "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst2.8 {d0, d1}, [%2]! \n" // re-interleave and store 8 UV
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Reads 4 pixels (2 bytes each) at a time, one every src_stepx pixels.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2; // 1 step ahead
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4; // 2 steps ahead
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6; // 3 steps ahead
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n" // each pointer advances 4 steps
+ "vld1.16 {d0[1]}, [%1], %6 \n"
+ "vld1.16 {d0[2]}, [%2], %6 \n"
+ "vld1.16 {d0[3]}, [%3], %6 \n"
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6
+ : "memory", "cc", "d0");
+}
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_neon64.cc b/third_party/aom/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000000..185591cb55
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1152 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away even pixels, and write 16x1 (16 output pixels per loop).
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ // "memory": st1 writes through dst; "cc": subs updates the flags.
+ : "memory", "cc", "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1, average each horizontal pixel pair with rounding, and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 averaged pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ // "memory": st1 writes through dst; "cc": subs updates the flags.
+ : "memory", "cc", "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2, average each 2x2 block with rounding, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 averaged pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ : // "memory": st1 writes through dst; "cc": subs updates the flags.
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8_t* src_ptr, // keeps pixel 2 of every 4
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n" // store the third of the 4 lanes
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, // averages 4x4 blocks
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride; // second row
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2; // third row
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3; // fourth row
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n" // 4 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // sum adjacent pairs of row 1
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n" // accumulate pairs of rows 2..4
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n" // combine pairs into 4-wide sums
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n" // 24 processed per loop
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" // pixels 0, 1 and 3 of each 4
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, // rows weighted 3:1
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n" // constant weight 3
+ "add %3, %3, %0 \n" // %3 = address of the second row
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n" // 24 processed per loop
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // (3 * line_0 + line_1) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" // store a0, a1, a2 interleaved
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "memory", "cc");
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, // rows weighted 1:1
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "movi v20.8b, #3 \n" // constant weight 3
+ "add %3, %3, %0 \n" // %3 = address of the second row
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n" // 24 processed per loop
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" // store a0, a1, a2 interleaved
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
+}
+
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0}; // 12 byte picks; last 4 never stored
+static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0}; // indexes a 3-register tbl
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12}; // used with sqrdmulh, which doubles
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18}; // used with sqrdmulh, which doubles
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "ld1 {v3.16b}, [%3] \n" // v3 = kShuf38
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load 32 source bytes
+ "subs %w2, %w2, #12 \n" // 12 processed per loop
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" // pick 12 of the 32 bytes
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n" // store first 8 bytes
+ "st1 {v2.s}[2], [%1], #4 \n" // store remaining 4 bytes
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc");
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2; // third row
+ ptrdiff_t tmp_src_stride = src_stride; // becomes the second-row pointer (%2)
+
+ asm volatile(
+ "ld1 {v29.8h}, [%5] \n" // v29 = kMult38_Div6
+ "ld1 {v30.16b}, [%6] \n" // v30 = kShuf38_2
+ "ld1 {v31.8h}, [%7] \n" // v31 = kMult38_Div9
+ "add %2, %2, %0 \n" // %2 = address of the second row
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n" // 12 processed per loop
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // Align for table lookup, tbl requires registers to be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n" // store first 8 bytes
+ "st1 {v3.s}[2], [%1], #4 \n" // store remaining 4 bytes
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+ "memory", "cc");
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride; // becomes the second-row pointer (%2)
+ asm volatile(
+ "ld1 {v30.8h}, [%4] \n" // v30 = kMult38_Div6
+ "ld1 {v31.16b}, [%5] \n" // v31 = kShuf38_2
+ "add %2, %2, %0 \n" // %2 = address of the second row
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n" // 12 processed per loop
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, tbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ "st1 {v3.8b}, [%1], #8 \n" // store first 8 bytes
+ "st1 {v3.s}[2], [%1], #4 \n" // store remaining 4 bytes
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v30", "v31", "memory", "cc");
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n" // add the low 8 bytes
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate using fewer load instructions for
+// the x/dx stepping. Loads bytes (x >> 16) and (x >> 16) + 1, then x += dx.
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
+
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset; // %5: table pointer first, then scratch in the macro
+ const uint8_t* src_tmp = src_ptr; // %6: scratch source address
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n" // 0, dx, 2 * dx, 3 * dx
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n" // f = low 16 bits of each x
+ "ushll v4.8h, v4.8b, #0 \n" // a widened to 16 bits
+ "ushll v5.8h, v5.8b, #0 \n" // b widened to 16 bits
+ "ssubl v16.4s, v5.4h, v4.4h \n" // b - a
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n" // f widened to 32 bits
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n" // f * (b - a)
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n" // rounding shift right by 16
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n" // a + blended difference
+ "xtn v4.8b, v4.8h \n" // narrow back to bytes
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n" // advance the x vectors by 8 * dx
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride.
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction; // weight of the first row
+ asm volatile(
+ "cmp %w4, #0 \n" // fraction 0: copy the first row only
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n" // %2 = address of the second row
+ "cmp %w4, #64 \n" // fractions 64/128/192 use the urhadd fast paths
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n" // v5 = weight of second row
+ "dup v4.8b, %w5 \n" // v4 = weight of first row
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n" // 16 bytes of first row
+ "ld1 {v1.16b}, [%2], #16 \n" // 16 bytes of second row
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n" // round and divide by 256
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average of both rows
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average again with the second row
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "ld1 {v1.16b}, [%1], #16 \n" // rows swapped: v1 = first row
+ "ld1 {v0.16b}, [%2], #16 \n" // v0 = second row
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average of both rows
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average again with the first row
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ "st1 {v0.b}[15], [%0] \n" // repeat last byte of v0 one past the stored data
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction), // %4
+ "+r"(y_fraction) // %5
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
+}
+
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, // keeps the odd ARGB pixels
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n" // copy v3 next to v1 for st2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, // averages pixel pairs
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n" // rounding half add
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, // averages 2x2 ARGB blocks
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 16 ARGB of row 2
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // re-interleave and store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+}
+
+// Point-samples one ARGB pixel every src_stepx source pixels; reads and writes 4 pixels per loop.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride; // only one source row is read
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.s}[0], [%0], %3 \n" // one 32-bit pixel per lane, then step by %3 bytes
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n" // store the 4 gathered pixels contiguously
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64_t)(src_stepx * 4)) // %3 = step in bytes (4 bytes per ARGB pixel)
+ : "memory", "cc", "v0");
+}
+
+// Averages a 2x2 ARGB block taken every src_stepx source pixels; produces 4 output pixels per loop.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ asm volatile(
+ "add %1, %1, %0 \n" // %1 = src_argb + src_stride (row 2 pointer)
+ "1: \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n" // each load is 2 adjacent pixels; even regs row 1, odd regs row 2
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n" // vertical sums, widened to 16 bits
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch row 2 as well
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n" // store 4 averaged ARGB pixels
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64_t)(src_stepx * 4)) // %4 = step in bytes (4 bytes per ARGB pixel)
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping. The macro loads the 32-bit pixel at index (x >> 16) into lane n of vn, then adds dx to x.
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint8_t* src_tmp = src_argb; // scratch register (%6): address of the pixel being fetched
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ int64_t tmp64; // scratch register (%5): integer part of x
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ // clang-format on
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1");
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping. The macro loads pixels (x >> 16) and (x >> 16) + 1 into lane n of vn1 and vn2, then adds dx to x.
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ int dx_offset[4] = {0, 1, 2, 3}; // per-lane multipliers used to build x + n * dx
+ int* tmp = dx_offset;
+ const uint8_t* src_tmp = src_argb; // scratch register (%6): address of the pixel pair being fetched
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n" // 0, dx, 2 * dx, 3 * dx
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // v0: a, the pixel at (x >> 16) for each of the 4 outputs
+ // v1: b, the pixel that follows a in the source row
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n" // x >> 9 per pixel, narrowed to 16 bits
+ "and v2.8b, v2.8b, v4.8b \n" // f = (x >> 9) & 0x7f, a 7-bit fraction
+ "dup v16.8b, v2.b[0] \n" // broadcast each pixel's f across a register
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n" // f0 x4 bytes, f1 x4 bytes
+ "ext v17.8b, v18.8b, v19.8b, #4 \n" // f2 x4 bytes, f3 x4 bytes
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n" // a * (0x7f ^ f)
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n" // b * f
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "add v16.8h, v16.8h, v18.8h \n" // a * (0x7f ^ f) + b * f
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n" // >> 7 back to 8 bits per channel
+ "shrn2 v0.16b, v17.8h, #7 \n"
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n" // advance the 4 vector x positions by 4 * dx
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+// 2x2 box filter for 16-bit samples: reads 16x2, writes 8x1 rounded averages. src_stride is in uint16_t units.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
+ "1: \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n" // store 8 averaged samples
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // st1 writes memory, subs sets flags
+ );
+}
+
+// Read 8x2 upsample with filtering and write 16x1 (weights 9:3:3:1, sum / 16 with rounding).
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
+
+ "1: \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n" // 9 * TL (low half)
+ "umull2 v7.4s, v3.8h, v0.8h \n" // 9 * TL (high half)
+ "umull v18.4s, v4.4h, v0.4h \n" // 9 * TR (low half)
+ "umull2 v17.4s, v4.8h, v0.8h \n" // 9 * TR (high half)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n" // + BR
+ "uaddl2 v19.4s, v6.8h, v3.8h \n" // BR + TL (high), before v3/v6 are reused
+ "uaddl v3.4s, v6.4h, v3.4h \n" // BR + TL (low)
+ "uaddw2 v6.4s, v7.4s, v6.8h \n" // 9 * TL + BR (high)
+ "uaddl2 v7.4s, v5.8h, v4.8h \n" // BL + TR (high), before v4 is reused
+ "uaddl v4.4s, v5.4h, v4.4h \n" // BL + TR (low)
+ "uaddw v18.4s, v18.4s, v5.4h \n" // 9 * TR + BL (low)
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n" // + 3 * (BL + TR)
+ "mla v18.4s, v3.4s, v1.4s \n" // + 3 * (BR + TL)
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n" // 9 * TR + BL (high)
+ "uqrshrn v16.4h, v16.4s, #4 \n" // rounded / 16, saturating narrow
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n" // interleave the two result sets: 16 outputs
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ : "r"(2LL), // %4
+ "r"(14LL) // %5
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19" // st2 writes memory, subs sets flags
+ );
+}
+// Halves an interleaved UV row pair with a 2x2 box filter: reads 16 UV pixels from each row, writes 8 per loop.
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n" // %1 = src_ptr + src_stride
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 UV from row 2
+ "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "prfm pldl1keep, [%1, 448] \n" // prefetch row 2 as well
+ "rshrn v1.8b, v1.8h, #2 \n" // (sum of 4 + 2) >> 2
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // re-interleave and store 8 UV
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Point-samples one UV pixel every src_stepx source pixels; reads and writes 4 pixels per loop.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2; // 1 step ahead (2 bytes per UV pixel)
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4; // 2 steps ahead
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6; // 3 steps ahead
+ (void)src_stride; // only one source row is read
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n" // one 16-bit UV pixel per pointer, then skip 4 steps
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" // store the 4 pixels contiguously
+ "b.gt 1b \n" // loop while dst_width > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6 = 4 steps in bytes
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_uv.cc b/third_party/aom/third_party/libyuv/source/scale_uv.cc
new file mode 100644
index 0000000000..b0469f09b8
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_uv.cc
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers. Each defaults to 1 unless it was already defined before this point.
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {
+ return (v < 0) ? -v : v; // magnitude of v
+}
+
+// ScaleUV, 1/2
+// This is an optimized version for scaling down a UV to 1/2 of
+// its original size. dx must be exactly 2.0 in 16.16 fixed point and dy a multiple of 2.0 (asserted below).
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16); // source bytes to advance per output row
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; // NOTE(review): starts one UV pixel earlier - confirm against the row kernels
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
+ : ScaleUVRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
+ : ScaleUVRowDown2Box_MMI);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0; // horizontal-only filter: a 2-row kernel given stride 0 rereads the same row
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// This is an optimized version for scaling down a UV to 1/4 of
+// its original size. It applies the 1/2 box kernel twice, going through a 2-row temporary buffer.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; // bytes for dst_width * 2 UV pixels, rounded up to 16
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16); // source bytes to advance per output row
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); // source rows 0-1 -> first temp row
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2); // source rows 2-3 -> second temp row
+ ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); // box the two temp rows into the output row
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+#endif // HAS_SCALEUVDOWN4BOX
+
+// ScaleUV Even
+// This is an optimized version for scaling down a UV to even
+// multiple of its original size. Uses the integer parts of dx and dy as the pixel and row steps.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16; // source pixels to advance per output pixel
+ int row_stride = (dy >> 16) * src_stride; // source bytes to advance per output row
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * src_stride + (x >> 16) * 2; // move to the first sampled pixel (2 bytes per UV pixel)
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; // NOTE(review): _SSE2 name inside an SSSE3 block - confirm the symbol
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif// TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0; // horizontal-only filter: the row kernel is handed a zero stride
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWNEVEN
+
+// Scale UV down with bilinear interpolation: blend two source rows into a temp row, then filter it horizontally.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx; // 16.16 position of the last output pixel
+ int64_t xl = (dx >= 0) ? x : xlast; // leftmost position sampled (dx may be negative)
+ int64_t xr = (dx >= 0) ? xlast : x; // rightmost position sampled
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Clipped span in bytes (2 per UV pixel).
+ src_uv += xl * 2; // start reading at the clipped left edge
+ x -= (int)(xl << 16); // make x relative to the clipped left edge
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ align_buffer_64(row, clip_src_width * 2);
+
+ const int max_y = (src_height - 1) << 16; // last valid source row in 16.16
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16; // integer source row
+ const uint8_t* src = src_uv + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); // horizontal only: no row blend
+ } else {
+ int yf = (y >> 8) & 255; // top 8 bits of the vertical fraction
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARDOWN
+
+// Scale UV up with bilinear interpolation: column-scale two source rows into a ping-pong buffer, then blend them.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
+ const int max_y = (src_height - 1) << 16; // last valid source row in 16.16
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width * 2, 16)) { // InterpolateRow gets dst_width * 2 bytes; test that byte width
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width * 2, 32)) { // byte width, same constant as ScaleUVBilinearDown
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width * 2, 16)) { // byte width, same constant as ScaleUVBilinearDown
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width * 2, 8)) { // byte width; stricter than before, so at worst the _Any_ path runs
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width * 2, 32)) { // byte width, same constant as ScaleUVBilinearDown
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVFilterCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVFilterCols = ScaleUVColsUp2_C; // exact 2x point-sampled upscale
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16; // integer source row
+ const uint8_t* src = src_uv + yi * src_stride;
+
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 + 15) & ~15; // bytes per scaled row, rounded up to 16
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row; // the older of the two buffered rows
+ int rowstride = kRowSize; // signed offset from rowptr to the newer row
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx); // prime the buffer with the first row
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); // and the row after it (same row if only one)
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx); // overwrite the older row with the next source row
+ rowptr += rowstride; // the other buffer half is now the older row
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); // fraction 0: no vertical blend
+ } else {
+ int yf = (y >> 8) & 255; // top 8 bits of the vertical fraction
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARUP
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; // the 64 variant is chosen for very wide sources
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleUVCols = ScaleUVCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleUVCols = ScaleUVCols_MMI;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleUVCols = ScaleUVColsUp2_C; // exact 2x upscale: overrides the generic column scalers chosen above
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVColsUp2_MMI;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx); // nearest source row: integer part of y
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
+
+// Copies a width x height plane of interleaved UV pixels; a negative height flips the image vertically.
+#if HAS_UVCOPY
+static int UVCopy(const uint8_t* src_UV,
+ int src_stride_UV,
+ uint8_t* dst_UV,
+ int dst_stride_UV,
+ int width,
+ int height) {
+ const int kBytesPerPixel = 2; // one U byte plus one V byte
+ if (width <= 0 || height == 0 || !src_UV || !dst_UV) {
+ return -1; // missing buffer or empty image
+ }
+ if (height < 0) { // inverted image: start at the last row and walk upwards
+ height = -height;
+ src_UV += (height - 1) * src_stride_UV;
+ src_stride_UV = -src_stride_UV;
+ }
+ CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV,
+ width * kBytesPerPixel, height);
+ return 0;
+}
+#endif // HAS_UVCOPY
+
+// Scale a UV plane (from NV12)
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions. A UV pixel is 2 bytes, so byte offsets are pixel counts * 2.
+static void ScaleUV(const uint8_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint8_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ int clip_x,
+ int clip_y,
+ int clip_width,
+ int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // UV does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+ &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64_t clipf = (int64_t)(clip_x)*dx; // source offset of the clip origin, 16.16
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 2;
+ dst += clip_x * 2;
+ }
+ if (clip_y) {
+ int64_t clipf = (int64_t)(clip_y)*dy; // source offset of the clip origin, 16.16
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy);
+ return;
+ }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+ ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+#endif
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+#if HAS_UVCOPY
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst,
+ dst_stride, clip_width, clip_height);
+ return;
+ }
+#endif
+ }
+ }
+ }
+ // HAS_SCALEPLANEVERTICAL
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+ return;
+ }
+
+#if HAS_SCALEUVBILINEARUP
+ if (filtering && dy < 65536) {
+ ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+ if (filtering) {
+ ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy,
+ filtering);
+ return;
+ }
+#endif
+ ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+ dst_stride, src, dst, x, dx, y, dy);
+}
+
+// Scales an interleaved UV image to dst_width x dst_height; returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_uv || !dst_uv || dst_width <= 0 || dst_height <= 0) return -1;
+ if (src_width == 0 || src_height == 0) return -1;
+ if (src_width > 32768 || src_height > 32768) return -1;
+ // No clipping: the clip rectangle is the whole destination image.
+ ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+ dst_width, dst_height, 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_win.cc b/third_party/aom/third_party/libyuv/source/scale_win.cc
new file mode 100644
index 0000000000..c5fc86f3e9
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_win.cc
@@ -0,0 +1,1391 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for 32 bit Visual C x86 and clangcl
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+// Offsets for source bytes 0 to 9 (3/4 point sampler). Index 128 has its high bit set, so pshufb writes a zero byte.
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Offsets for source bytes 0 to 10 (3/4 box filters: adjacent bytes paired up for pmaddubsw).
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
+
+// Coefficients for source bytes 0 to 10 (each weight pair sums to 4).
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+
+// Coefficients for source bytes 10 to 21
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+
+// Coefficients for source bytes 21 to 31
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+
+// Rounding term (+2) added to each word before the >> 2 in the 3/4 box filters.
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, // 3/8 sampler: picks 6 bytes into output bytes 0..5
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, // same picks, placed in output bytes 6..11
+ 6, 8, 11, 14, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 0,1,2
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
+
+// Arrange words 0,3,6 into 3,4,5
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x3 and 2x3 (used with pmulhuw, i.e. sum * value >> 16).
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
+
+// Arrange third value for pixels 0,1,2,3,4,5 (pixels 2 and 5 have only two inputs, hence the 128s).
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
+
+// Scaling values for boxes of 3x2 and 2x2 (used with pmulhuw on row-averaged sums).
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
+
+// Reads 32 pixels, throws half away and writes 16 pixels (the odd-indexed source bytes are kept).
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // narrow the 16 words back to 16 bytes.
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 output pixels per iteration.
+ jg wloop // repeats while pixels remain, so a partial tail still processes a full 16.
+
+ ret
+ }
+}
+
+// Blends 32x1 rectangle to 16x1 (rounded average of each horizontal pixel pair).
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored (only one source row is read)
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101 (every byte = 1)
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 output pixels per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1 (rounded average of each 2x2 box; second row is at src_ptr + src_stride).
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret; shifts the argument offsets by 4.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ pcmpeqb xmm4, xmm4 // constant 0x0101 (every byte = 1)
+ psrlw xmm4, 15
+ packuswb xmm4, xmm4
+ pxor xmm5, xmm5 // constant 0
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add
+ paddw xmm1, xmm3
+ psrlw xmm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
+ psrlw xmm1, 1
+ pavgw xmm0, xmm5 // (x + 1) / 2
+ pavgw xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 output pixels per iteration.
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels (the odd-indexed source bytes are kept).
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // packs per 128-bit lane, leaving the quadwords interleaved.
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb (quadword order 0,2,1,3)
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 output pixels per iteration.
+ jg wloop
+
+ vzeroupper // clear upper ymm halves before returning.
+ ret
+ }
+}
+
+// Blends 64x1 rectangle to 32x1 (rounded average of each horizontal pixel pair).
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored (only one source row is read)
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 output pixels per iteration.
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
+// Blends 64x2 rectangle to 32x1 (second row is at src_ptr + src_stride).
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret; shifts the argument offsets by 4.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add
+ vpaddw ymm1, ymm1, ymm3
+ vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
+ vpsrlw ymm1, ymm1, 1
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 output pixels per iteration.
+ jg wloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels (keeps byte 2 of every group of 4 source bytes).
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // keep only byte 2 of each dword.
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // kept byte now sits in the high byte of each word.
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8 // 8 output pixels per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1 (rounded average of each 4x4 box over four consecutive rows).
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // esi and edi are saved here and restored before ret.
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm4, xmm4 // constant 0x0101
+ psrlw xmm4, 15
+ movdqa xmm5, xmm4 // copy the word constant 0x0001 before it is packed to bytes.
+ packuswb xmm4, xmm4
+ psllw xmm5, 3 // constant 0x0008
+
+ wloop:
+ movdqu xmm0, [eax] // average rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ pmaddubsw xmm0, xmm4 // horizontal add
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // vertical add rows 0, 1
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 2
+ paddw xmm1, xmm3
+ movdqu xmm2, [eax + edi]
+ movdqu xmm3, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ paddw xmm0, xmm2 // add row 3
+ paddw xmm1, xmm3
+ phaddw xmm0, xmm1 // add adjacent column-pair sums: each word is now a 4x4 total.
+ paddw xmm0, xmm5 // + 8 for round
+ psrlw xmm0, 4 // /16 for average of 4 * 4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8 // 8 output pixels per iteration.
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels (keeps byte 2 of every group of 4 source bytes).
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ vpsrld ymm5, ymm5, 24
+ vpslld ymm5, ymm5, 16
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // keep only byte 2 of each dword.
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0 // only the low 16 bytes hold results.
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 output pixels per iteration.
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x4 rectangle to 16x1 (rounded average of each 4x4 box over four consecutive rows).
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // esi and edi are saved here and restored before ret.
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+ vpsrlw ymm4, ymm4, 15
+ vpsllw ymm5, ymm4, 3 // constant 0x0008
+ vpackuswb ymm4, ymm4, ymm4
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + esi]
+ vmovdqu ymm3, [eax + esi + 32]
+ vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + esi * 2]
+ vmovdqu ymm3, [eax + esi * 2 + 32]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 2
+ vpaddw ymm1, ymm1, ymm3
+ vmovdqu ymm2, [eax + edi]
+ vmovdqu ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ vpaddw ymm0, ymm0, ymm2 // add row 3
+ vpaddw ymm1, ymm1, ymm3
+ vphaddw ymm0, ymm0, ymm1 // mutates
+ vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+ vpaddw ymm0, ymm0, ymm5 // + 8 for round
+ vpsrlw ymm0, ymm0, 4 // /16 for average of 4 * 4
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0 // only the low 16 bytes hold results.
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 output pixels per iteration.
+ jg wloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling (kShuf0/kShuf1/kShuf2 keep 3 of every 4 bytes).
+
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, xmmword ptr kShuf0 // aligned loads: assumes the tables are 16-byte aligned (uvec8 is declared elsewhere).
+ movdqa xmm4, xmmword ptr kShuf1
+ movdqa xmm5, xmmword ptr kShuf2
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8 // xmm1 = source bytes 8..23
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24 // 24 output pixels per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1 (both source rows weighted equally via pavgb).
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm0, xmm1 // rounded average of the two rows.
+ pshufb xmm0, xmm2 // pair up neighbouring bytes for the weighted sum.
+ pmaddubsw xmm0, xmm5 // weights from kMadd01 (each pair sums to 4).
+ paddsw xmm0, xmm7 // + 2 for round
+ psrlw xmm0, 2 // / 4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21 // reloaded every iteration: xmm0..xmm7 are all in use.
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ sub ecx, 24 // 24 output pixels per iteration.
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Blends 32x2 to 24x1 with row 0 weighted about 3:1 over row 1 (two chained pavgb). Note that movdqa+palign may be better than movdqu.
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShuf01
+ movdqa xmm3, xmmword ptr kShuf11
+ movdqa xmm4, xmmword ptr kShuf21
+ movdqa xmm5, xmmword ptr kMadd01
+ movdqa xmm6, xmmword ptr kMadd11
+ movdqa xmm7, xmmword ptr kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm1, xmm0 // xmm1 = avg(row 0, row 1)
+ pavgb xmm0, xmm1 // xmm0 = avg(row 0, xmm1)
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7 // + 2 for round
+ psrlw xmm0, 2 // / 4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, xmmword ptr kMadd21 // reloaded every iteration: xmm0..xmm7 are all in use.
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ sub ecx, 24 // 24 output pixels per iteration.
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12 (6 bytes picked from each 16-byte half via kShuf38a/kShuf38b).
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, xmmword ptr kShuf38a
+ movdqa xmm5, xmmword ptr kShuf38b
+
+ xloop:
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1 // merge: the two shuffles fill disjoint bytes, zero elsewhere.
+
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0 // bring bytes 8..15 down to the low half.
+ movd [edx + 8], xmm1 // bytes 8..11
+ lea edx, [edx + 12]
+ sub ecx, 12 // 12 output pixels per iteration.
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation (box sums divided by 9 or 6 through kScaleAc33).
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAc
+ movdqa xmm3, xmmword ptr kShufAc3
+ movdqa xmm4, xmmword ptr kScaleAc33
+ pxor xmm5, xmm5 // zero, used to widen bytes to words.
+
+ xloop:
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqu xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2 // shift by one word so neighbouring columns line up.
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6 // bytes 2..5; overlaps the previous 4-byte store.
+ lea edx, [edx + 6]
+ sub ecx, 6 // 6 output pixels per iteration.
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation (rows averaged first, then 3- or 2-wide column sums scaled by kScaleAb2).
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret.
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, xmmword ptr kShufAb0
+ movdqa xmm3, xmmword ptr kShufAb1
+ movdqa xmm4, xmmword ptr kShufAb2
+ movdqa xmm5, xmmword ptr kScaleAb2
+
+ xloop:
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
+ lea eax, [eax + 16]
+ pavgb xmm0, xmm1
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2 // first column of each box, widened to words.
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3 // second column.
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4 // third column (zero for the 2-wide boxes).
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1 // bytes 2..5; overlaps the previous 4-byte store.
+ lea edx, [edx + 6]
+ sub ecx, 6 // 6 output pixels per iteration.
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time (dst_ptr[i] += src_ptr[i], saturating at 16 bits).
+__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ pxor xmm5, xmm5 // zero, used to widen bytes to words.
+
+ // sum rows
+ xloop:
+ movdqu xmm3, [eax] // read 16 bytes
+ lea eax, [eax + 16]
+ movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm1, [edx + 16]
+ movdqa xmm2, xmm3
+ punpcklbw xmm2, xmm5 // low 8 bytes -> words
+ punpckhbw xmm3, xmm5 // high 8 bytes -> words
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 16 // 16 source pixels per iteration.
+ jg xloop
+ ret
+ }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time (dst_ptr[i] += src_ptr[i], saturating at 16 bits).
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ vpxor ymm5, ymm5, ymm5 // zero, used to widen bytes to words.
+
+ // sum rows
+ xloop:
+ vmovdqu ymm3, [eax] // read 32 bytes
+ lea eax, [eax + 32]
+ vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
+ vpunpcklbw ymm2, ymm3, ymm5
+ vpunpckhbw ymm3, ymm3, ymm5
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm1, ymm3, [edx + 32]
+ vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 32 // 32 source pixels per iteration.
+ jg xloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Constant for making pixels signed to avoid pmaddubsw
+// saturation.
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+// Constant for making pixels unsigned and adding .5 for rounding.
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
+
+// Bilinear column filtering. SSSE3 version. x and dx are 16.16 fixed point; two pixels per loop plus a 1 pixel tail.
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push ebx // ebx, esi and edi are saved here and restored before ret.
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pcmpeqb xmm7, xmm7 // generate 0x0001
+ psrlw xmm7, 15
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29 // fewer than 2 pixels requested: go straight to the tail.
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm1, xmm1 // 8 bits, 2 pixels.
+ movd ebx, xmm1
+ mov [edi], bx
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+ add ecx, 2 - 1 // ecx >= 0 here means exactly one pixel is left.
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
+ pmaddubsw xmm2, xmm0 // 16 bit
+ paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
+ psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm2, xmm2 // 8 bits
+ movd ebx, xmm2
+ mov [edi], bl
+
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels. The x and dx arguments are never read.
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // interleave a register with itself: each byte is doubled.
+ punpckhbw xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 output pixels per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7), as selected by shufps 0xdd.
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd // odd pixels: dwords 1 and 3 of each register.
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 output pixels (16 bytes) per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1. Each output is the per-byte rounded average of an even/odd pixel pair.
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 output pixels (16 bytes) per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1. Rows are averaged first, then columns; each pavgb rounds up.
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ __asm {
+ push esi // saved here and restored before ret.
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 output pixels (16 bytes) per iteration.
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time, taking one 4-byte pixel every src_stepx pixels and writing them contiguously.
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ __asm {
+ push ebx // ebx and edi are saved here and restored before ret.
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
+ lea edi, [ebx + ebx * 2] // step * 3, offset of the fourth sample
+
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4] // advance past the 4 samples just read.
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 output pixels per iteration.
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1. The 2x2 boxes are src_stepx pixels apart; rows are averaged first, then columns.
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ __asm {
+ push ebx // ebx, esi and edi are saved here and restored before ret.
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
+ lea edi, [ebx + ebx * 2] // step * 3, offset of the fourth box
+
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4 // 4 output pixels per iteration.
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version. x and dx are 16.16 fixed point; the integer part of x indexes 4-byte pixels.
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push edi // edi and esi are saved here and restored before ret.
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99 // nothing to write when dst_width <= 0.
+ sub ecx, 4
+ jl xloop49 // fewer than 4 pixels: skip the main loop.
+
+ // 4 Pixel loop.
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
+ jge xloop4
+
+ xloop49:
+ test ecx, 2 // ecx is in [-4, -1] here; its low two bits equal the pixels still to write.
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1
+ je xloop99
+
+ // 1 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels (x0 when the 2 pixel step was skipped)
+ movd dword ptr [edi], xmm0
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear filtering along a row: combines 2x1 -> 1x1 (two horizontally adjacent pixels per output). SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ push esi // esi and edi are saved here and restored before ret.
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, xmmword ptr kShuffleColARGB
+ movdqa xmm5, xmmword ptr kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29 // fewer than 2 pixels requested: go straight to the tail.
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+
+ add ecx, 2 - 1 // ecx >= 0 here means exactly one pixel is left.
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels. The x and dx arguments are never read.
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0 // interleave a register with itself: each 4-byte pixel is doubled.
+ punpckhdq xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 output pixels (32 bytes) per iteration.
+ jg wloop
+
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result, i.e. (num << 16) / div with a 64-bit dividend.
+__declspec(naked) int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ idiv dword ptr [esp + 8] // signed edx:eax / div; the quotient in eax is the return value.
+ ret
+ }
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1) as a 16.16 fixed point result, using a 64-bit dividend.
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ sub eax, 0x00010001 // 64-bit subtract: low dword here,
+ sbb edx, 0 // borrow propagated into the high dword.
+ sub ecx, 1 // divisor is div - 1.
+ idiv ecx // signed edx:eax / ecx; the quotient in eax is the return value.
+ ret
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/video_common.cc b/third_party/aom/third_party/libyuv/source/video_common.cc
new file mode 100644
index 0000000000..92384c050c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/video_common.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// One row of the alias table: an alternative FourCC code and the canonical
+// FourCC code it is translated to by CanonicalFourCC().
+struct FourCCAliasEntry {
+ uint32_t alias;
+ uint32_t canonical;
+};
+
+// Number of rows in kFourCCAliases; must match the initializer below.
+#define NUM_ALIASES 18
+// Alias -> canonical lookup table, searched linearly by CanonicalFourCC().
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU12, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
+ {FOURCC_RGB3, FOURCC_RAW},
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32_t CanonicalFourCC(uint32_t fourcc) {
+  // Walk the alias table front to back; the first row whose alias matches
+  // decides the result.
+  const struct FourCCAliasEntry* entry = kFourCCAliases;
+  const struct FourCCAliasEntry* const table_end = kFourCCAliases + NUM_ALIASES;
+  while (entry != table_end) {
+    if (entry->alias == fourcc) {
+      return entry->canonical;
+    }
+    ++entry;
+  }
+  // No alias matched: the code is already canonical (or unknown), pass it on.
+  return fourcc;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/vector/LICENSE b/third_party/aom/third_party/vector/LICENSE
new file mode 100644
index 0000000000..afcb9f00a5
--- /dev/null
+++ b/third_party/aom/third_party/vector/LICENSE
@@ -0,0 +1,19 @@
+The MIT License (MIT)
+Copyright (c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/third_party/aom/third_party/vector/README.libaom b/third_party/aom/third_party/vector/README.libaom
new file mode 100644
index 0000000000..729446dbcb
--- /dev/null
+++ b/third_party/aom/third_party/vector/README.libaom
@@ -0,0 +1,16 @@
+Name: vector
+URL: https://github.com/goldsborough/vector
+Version: commit-id: 40efe82
+License: MIT
+License File: LICENSE
+
+Description:
+A feature-complete, generic and customizable resizable
+array implementation in pure C that supports almost
+the entire C++ std::vector API, including iterators.
+
+Local Modifications:
+1. Renamed some functions to fit in with the AOMedia
+naming convention.
+2. Removed non-global functions from vector.h.
+3. Made all non-global functions in vector.c static.
diff --git a/third_party/aom/third_party/vector/vector.c b/third_party/aom/third_party/vector/vector.c
new file mode 100644
index 0000000000..2295b8f080
--- /dev/null
+++ b/third_party/aom/third_party/vector/vector.c
@@ -0,0 +1,540 @@
+/*
+The MIT License(MIT)
+Copyright(c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions :
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#define __STDC_WANT_LIB_EXT1__ 1
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/vector/vector.h"
+
+/***** PRIVATE *****/
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/* True when every allocated slot is in use, so the next insertion needs a
+ * larger buffer. */
+static bool _vector_should_grow(Vector *vector) {
+  const bool is_full = (vector->capacity == vector->size);
+  assert(vector->size <= vector->capacity);
+  return is_full;
+}
+
+/* True when the vector has drained to exactly one quarter of its capacity,
+ * which is the point at which memory is handed back.
+ *
+ * The quarter is computed as capacity / 4 rather than
+ * capacity * VECTOR_SHRINK_THRESHOLD: the macro expands to (1 / 4), which
+ * integer division turns into 0, so the multiplication form degenerates into
+ * "size == 0". aom_vector_erase() uses the same capacity / 4 test. */
+static bool _vector_should_shrink(Vector *vector) {
+  assert(vector->size <= vector->capacity);
+  return vector->size == vector->capacity / 4;
+}
+
+/* Address of the slot for `index`: the data pointer advanced by
+ * index * element_size bytes. Performs no bounds checking. */
+static void *_vector_offset(Vector *vector, size_t index) {
+  unsigned char *const base = (unsigned char *)vector->data;
+  const size_t byte_offset = index * vector->element_size;
+  return base + byte_offset;
+}
+
+/* Read-only counterpart of _vector_offset(): address of slot `index` in a
+ * const vector. Performs no bounds checking. */
+static const void *_vector_const_offset(const Vector *vector, size_t index) {
+  const unsigned char *const base = (const unsigned char *)vector->data;
+  const size_t byte_offset = index * vector->element_size;
+  return base + byte_offset;
+}
+
+/* Overwrites slot `index` with element_size bytes read from `element`. */
+static void _vector_assign(Vector *vector, size_t index, void *element) {
+  memcpy(_vector_offset(vector, index), element, vector->element_size);
+}
+
+/* Shifts the elements [index, size) one slot towards the end to open a gap
+ * at `index`. The caller must already have ensured spare capacity.
+ * Returns VECTOR_SUCCESS, or VECTOR_ERROR if memmove_s reports a failure. */
+static int _vector_move_right(Vector *vector, size_t index) {
+  assert(vector->size < vector->capacity);
+
+  /* Byte pointers are used throughout: arithmetic on void * is a compiler
+   * extension and is rejected by strict C compilers. */
+  unsigned char *from = (unsigned char *)_vector_offset(vector, index);
+  unsigned char *to = from + vector->element_size;
+
+  /* How many bytes to move to the right. */
+  size_t elements_in_bytes = (vector->size - index) * vector->element_size;
+
+#ifdef __STDC_LIB_EXT1__
+  /* Room left in the buffer starting at the destination slot (index + 1). */
+  size_t right_capacity_in_bytes =
+      (vector->capacity - (index + 1)) * vector->element_size;
+
+  int return_code =
+      memmove_s(to, right_capacity_in_bytes, from, elements_in_bytes);
+
+  return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR;
+
+#else
+  memmove(to, from, elements_in_bytes);
+  return VECTOR_SUCCESS;
+#endif
+}
+
+/* Closes the gap at `index` by sliding every element after it one slot
+ * towards the front; the element at `index` is overwritten. The size field
+ * is not touched here. */
+static void _vector_move_left(Vector *vector, size_t index) {
+  unsigned char *const hole = (unsigned char *)_vector_offset(vector, index);
+  const size_t tail_bytes =
+      (vector->size - index - 1) * vector->element_size;
+
+  memmove(hole, hole + vector->element_size, tail_bytes);
+}
+
+/* Moves the vector's contents into a freshly allocated buffer holding
+ * `new_capacity` elements.
+ *
+ * Requests below VECTOR_MINIMUM_CAPACITY are clamped to the minimum, or are
+ * a no-op when the vector is already at or below the minimum.
+ *
+ * Returns VECTOR_SUCCESS or VECTOR_ERROR. On error the vector is left
+ * exactly as it was: the old buffer stays owned by the vector and nothing is
+ * leaked. */
+static int _vector_reallocate(Vector *vector, size_t new_capacity) {
+  size_t new_capacity_in_bytes;
+  void *fresh;
+  assert(vector != NULL);
+
+  if (new_capacity < VECTOR_MINIMUM_CAPACITY) {
+    if (vector->capacity > VECTOR_MINIMUM_CAPACITY) {
+      new_capacity = VECTOR_MINIMUM_CAPACITY;
+    } else {
+      /* NO-OP */
+      return VECTOR_SUCCESS;
+    }
+  }
+
+  /* Refuse sizes whose byte count would wrap around size_t. */
+  if (vector->element_size != 0 &&
+      new_capacity > (size_t)-1 / vector->element_size) {
+    return VECTOR_ERROR;
+  }
+
+  new_capacity_in_bytes = new_capacity * vector->element_size;
+
+  /* Allocate into a temporary so a failed malloc cannot clobber (and leak)
+   * the buffer the vector currently owns. */
+  fresh = malloc(new_capacity_in_bytes);
+  if (fresh == NULL) {
+    return VECTOR_ERROR;
+  }
+
+#ifdef __STDC_LIB_EXT1__
+  if (memcpy_s(fresh, new_capacity_in_bytes, vector->data,
+               aom_vector_byte_size(vector)) != 0) {
+    free(fresh);
+    return VECTOR_ERROR;
+  }
+#else
+  memcpy(fresh, vector->data, aom_vector_byte_size(vector));
+#endif
+
+  free(vector->data);
+  vector->data = fresh;
+  vector->capacity = new_capacity;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Re-sizes the buffer to size * VECTOR_GROWTH_FACTOR elements, but never
+ * asks for fewer than one. Returns what _vector_reallocate() returns. */
+static int _vector_adjust_capacity(Vector *vector) {
+  const size_t target = vector->size * VECTOR_GROWTH_FACTOR;
+  return _vector_reallocate(vector, MAX(1, target));
+}
+
+/* Exchanges the two size_t values pointed to by `first` and `second`. */
+static void _vector_swap(size_t *first, size_t *second) {
+  const size_t held = *first;
+
+  *first = *second;
+  *second = held;
+}
+
+/* Initializes `vector` as empty with room for at least `capacity` elements
+ * of `element_size` bytes each (never fewer than VECTOR_MINIMUM_CAPACITY).
+ * Returns VECTOR_ERROR for a NULL vector or a failed allocation. */
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) {
+  assert(vector != NULL);
+
+  if (vector == NULL) {
+    return VECTOR_ERROR;
+  }
+
+  vector->size = 0;
+  vector->capacity = MAX(VECTOR_MINIMUM_CAPACITY, capacity);
+  vector->element_size = element_size;
+  vector->data = malloc(vector->capacity * element_size);
+
+  if (vector->data == NULL) {
+    return VECTOR_ERROR;
+  }
+  return VECTOR_SUCCESS;
+}
+
+/* Copy constructor: fills the uninitialized `destination` with a deep copy
+ * of `source`.
+ *
+ * The copy gets twice the source's size as capacity, but never less than
+ * VECTOR_MINIMUM_CAPACITY. Without that floor an empty source would yield a
+ * zero-capacity vector, and a later push would write past the buffer because
+ * _vector_reallocate() treats growth below the minimum as a no-op.
+ *
+ * Returns VECTOR_ERROR on NULL arguments, an already initialized
+ * destination, an uninitialized source, or allocation failure; in all of
+ * those cases `destination` is left unmodified. */
+int aom_vector_copy(Vector *destination, Vector *source) {
+  size_t capacity;
+  void *data;
+
+  assert(destination != NULL);
+  assert(source != NULL);
+  assert(aom_vector_is_initialized(source));
+  assert(!aom_vector_is_initialized(destination));
+
+  if (destination == NULL) return VECTOR_ERROR;
+  if (source == NULL) return VECTOR_ERROR;
+  if (aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+  if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
+
+  /* Note that we are not necessarily allocating the same capacity */
+  capacity = MAX(VECTOR_MINIMUM_CAPACITY, source->size * 2);
+
+  data = malloc(capacity * source->element_size);
+  if (data == NULL) return VECTOR_ERROR;
+
+  memcpy(data, source->data, aom_vector_byte_size(source));
+
+  /* Publish the fields only once the copy is complete. */
+  destination->size = source->size;
+  destination->capacity = capacity;
+  destination->element_size = source->element_size;
+  destination->data = data;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Copy assignment: releases whatever `destination` holds, then deep-copies
+ * `source` into it. Both vectors must already be initialized. */
+int aom_vector_copy_assign(Vector *destination, Vector *source) {
+  assert(destination != NULL);
+  assert(source != NULL);
+  assert(aom_vector_is_initialized(source));
+  assert(aom_vector_is_initialized(destination));
+
+  if (destination == NULL || source == NULL) {
+    return VECTOR_ERROR;
+  }
+  if (!aom_vector_is_initialized(destination) ||
+      !aom_vector_is_initialized(source)) {
+    return VECTOR_ERROR;
+  }
+
+  aom_vector_destroy(destination);
+
+  return aom_vector_copy(destination, source);
+}
+
+/* Move constructor: `destination` takes over all of `source`'s fields,
+ * including its buffer, and `source` is left with a NULL data pointer. */
+int aom_vector_move(Vector *destination, Vector *source) {
+  assert(destination != NULL);
+  assert(source != NULL);
+
+  if (destination == NULL || source == NULL) {
+    return VECTOR_ERROR;
+  }
+
+  *destination = *source;
+  source->data = NULL;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Move assignment: swaps the two vectors, then destroys `source` (which by
+ * then holds the destination's former contents).
+ *
+ * If the swap is rejected, nothing has been exchanged, so `source` must not
+ * be destroyed: doing so would discard its data while `destination` stays
+ * unchanged. The error is returned instead. */
+int aom_vector_move_assign(Vector *destination, Vector *source) {
+  if (aom_vector_swap(destination, source) == VECTOR_ERROR) {
+    return VECTOR_ERROR;
+  }
+  return aom_vector_destroy(source);
+}
+
+/* Exchanges the complete state (size, capacity, element size and buffer) of
+ * two initialized vectors. */
+int aom_vector_swap(Vector *destination, Vector *source) {
+  Vector held;
+
+  assert(destination != NULL);
+  assert(source != NULL);
+  assert(aom_vector_is_initialized(source));
+  assert(aom_vector_is_initialized(destination));
+
+  if (destination == NULL || source == NULL) {
+    return VECTOR_ERROR;
+  }
+  if (!aom_vector_is_initialized(destination) ||
+      !aom_vector_is_initialized(source)) {
+    return VECTOR_ERROR;
+  }
+
+  /* All four fields are exchanged, so swapping whole structs is enough. */
+  held = *destination;
+  *destination = *source;
+  *source = held;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Destructor: frees the buffer and resets the data pointer to NULL so the
+ * vector reads as uninitialized afterwards. */
+int aom_vector_destroy(Vector *vector) {
+  assert(vector != NULL);
+
+  if (vector != NULL) {
+    free(vector->data);
+    vector->data = NULL;
+    return VECTOR_SUCCESS;
+  }
+
+  return VECTOR_ERROR;
+}
+
+/* Insertion */
+/* Appends a copy of `*element` to the end of the vector, growing the buffer
+ * first when it is full.
+ * Returns VECTOR_ERROR for NULL arguments or when growing fails. */
+int aom_vector_push_back(Vector *vector, void *element) {
+  assert(vector != NULL);
+  assert(element != NULL);
+
+  /* Same release-build guards as aom_vector_insert(): asserts compile away
+   * under NDEBUG and must not be the only protection. */
+  if (vector == NULL) return VECTOR_ERROR;
+  if (element == NULL) return VECTOR_ERROR;
+
+  if (_vector_should_grow(vector)) {
+    if (_vector_adjust_capacity(vector) == VECTOR_ERROR) {
+      return VECTOR_ERROR;
+    }
+  }
+
+  _vector_assign(vector, vector->size, element);
+
+  ++vector->size;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Inserts a copy of `*element` at the front; shorthand for inserting at
+ * index 0. */
+int aom_vector_push_front(Vector *vector, void *element) {
+  const size_t front_index = 0;
+  return aom_vector_insert(vector, front_index, element);
+}
+
+/* Inserts a copy of `*element` before position `index` (index == size
+ * appends). Grows the buffer when full and shifts later elements right.
+ * Returns VECTOR_ERROR on bad arguments or when growing/moving fails. */
+int aom_vector_insert(Vector *vector, size_t index, void *element) {
+  assert(vector != NULL);
+  assert(element != NULL);
+  assert(index <= vector->size);
+
+  if (vector == NULL || element == NULL) {
+    return VECTOR_ERROR;
+  }
+  if (vector->element_size == 0 || index > vector->size) {
+    return VECTOR_ERROR;
+  }
+
+  if (_vector_should_grow(vector) &&
+      _vector_adjust_capacity(vector) == VECTOR_ERROR) {
+    return VECTOR_ERROR;
+  }
+
+  /* Open a gap at `index` ... */
+  if (_vector_move_right(vector, index) == VECTOR_ERROR) {
+    return VECTOR_ERROR;
+  }
+
+  /* ... and drop the new element into it. */
+  _vector_assign(vector, index, element);
+  ++vector->size;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Overwrites the existing element at `index` with a copy of `*element`.
+ * Returns VECTOR_ERROR for NULL arguments, a zero element size, or an index
+ * outside [0, size). */
+int aom_vector_assign(Vector *vector, size_t index, void *element) {
+  assert(vector != NULL);
+  assert(element != NULL);
+  assert(index < vector->size);
+
+  if (vector == NULL || element == NULL) {
+    return VECTOR_ERROR;
+  }
+  if (vector->element_size == 0 || index >= vector->size) {
+    return VECTOR_ERROR;
+  }
+
+  _vector_assign(vector, index, element);
+
+  return VECTOR_SUCCESS;
+}
+
+/* Deletion */
+/* Removes the last element. Unless VECTOR_NO_SHRINK is defined, the buffer
+ * may be reallocated to a smaller size afterwards.
+ * Returns VECTOR_ERROR for a NULL vector, a zero element size, or an empty
+ * vector. */
+int aom_vector_pop_back(Vector *vector) {
+  assert(vector != NULL);
+  assert(vector->size > 0);
+
+  if (vector == NULL) return VECTOR_ERROR;
+  if (vector->element_size == 0) return VECTOR_ERROR;
+  /* With asserts compiled out, decrementing an empty vector would wrap
+   * size around to SIZE_MAX. */
+  if (vector->size == 0) return VECTOR_ERROR;
+
+  --vector->size;
+
+#ifndef VECTOR_NO_SHRINK
+  if (_vector_should_shrink(vector)) {
+    _vector_adjust_capacity(vector);
+  }
+#endif
+
+  return VECTOR_SUCCESS;
+}
+
+/* Removes the first element; shorthand for erasing index 0. */
+int aom_vector_pop_front(Vector *vector) {
+  const size_t front_index = 0;
+  return aom_vector_erase(vector, front_index);
+}
+
+/* Removes the element at `index`, shifting later elements one slot left.
+ * Unless VECTOR_NO_SHRINK is defined, the buffer is reallocated when the
+ * size drops to a quarter of the capacity.
+ * Returns VECTOR_ERROR for a NULL vector, a zero element size, or an index
+ * outside [0, size). */
+int aom_vector_erase(Vector *vector, size_t index) {
+  assert(vector != NULL);
+  assert(index < vector->size);
+
+  if (vector == NULL) return VECTOR_ERROR;
+  if (vector->element_size == 0) return VECTOR_ERROR;
+  if (index >= vector->size) return VECTOR_ERROR;
+
+  /* Just overwrite */
+  _vector_move_left(vector, index);
+
+  /* The element count must drop regardless of the shrink policy; keeping
+   * the decrement inside the #ifndef would leave size unchanged whenever
+   * VECTOR_NO_SHRINK is defined. */
+  --vector->size;
+
+#ifndef VECTOR_NO_SHRINK
+  if (vector->size == vector->capacity / 4) {
+    _vector_adjust_capacity(vector);
+  }
+#endif
+
+  return VECTOR_SUCCESS;
+}
+
+/* Removes all elements; shorthand for resizing to zero. */
+int aom_vector_clear(Vector *vector) {
+  const size_t empty_size = 0;
+  return aom_vector_resize(vector, empty_size);
+}
+
+/* Lookup */
+/* Returns a pointer to the element at `index`, or NULL for a NULL vector,
+ * a zero element size, or an index outside [0, size). */
+void *aom_vector_get(Vector *vector, size_t index) {
+  assert(vector != NULL);
+  assert(index < vector->size);
+
+  if (vector == NULL || vector->element_size == 0 ||
+      index >= vector->size) {
+    return NULL;
+  }
+
+  return _vector_offset(vector, index);
+}
+
+/* Read-only lookup: pointer to the element at `index`, or NULL for a NULL
+ * vector, a zero element size, or an index outside [0, size). */
+const void *aom_vector_const_get(const Vector *vector, size_t index) {
+  assert(vector != NULL);
+  assert(index < vector->size);
+
+  if (vector == NULL || vector->element_size == 0 ||
+      index >= vector->size) {
+    return NULL;
+  }
+
+  return _vector_const_offset(vector, index);
+}
+
+/* Pointer to the first element; same result as aom_vector_get(vector, 0). */
+void *aom_vector_front(Vector *vector) {
+  const size_t first_index = 0;
+  return aom_vector_get(vector, first_index);
+}
+
+/* Pointer to the last element, looked up via aom_vector_get() at index
+ * size - 1. */
+void *aom_vector_back(Vector *vector) {
+  const size_t last_index = vector->size - 1;
+  return aom_vector_get(vector, last_index);
+}
+
+/* Information */
+
+/* A vector counts as initialized when it owns a buffer (non-NULL data). */
+bool aom_vector_is_initialized(const Vector *vector) {
+  const bool has_buffer = (vector->data != NULL);
+  return has_buffer;
+}
+
+/* Number of bytes occupied by the stored elements (size * element_size). */
+size_t aom_vector_byte_size(const Vector *vector) {
+  const size_t bytes_in_use = vector->size * vector->element_size;
+  return bytes_in_use;
+}
+
+/* Number of additional elements that fit before the buffer has to grow. */
+size_t aom_vector_free_space(const Vector *vector) {
+  const size_t unused_slots = vector->capacity - vector->size;
+  return unused_slots;
+}
+
+/* True when the vector holds no elements. */
+bool aom_vector_is_empty(const Vector *vector) {
+  return 0 == vector->size;
+}
+
+/* Memory management */
+/* Sets the element count to `new_size`.
+ *
+ * - new_size at or below a quarter of the capacity: the buffer is
+ *   reallocated down to new_size * VECTOR_GROWTH_FACTOR elements.
+ * - new_size above the capacity: the buffer is reallocated up to
+ *   new_size * VECTOR_GROWTH_FACTOR elements; the added elements are not
+ *   initialized.
+ * - otherwise only the size field changes.
+ *
+ * The quarter is computed as capacity / 4 because
+ * capacity * VECTOR_SHRINK_THRESHOLD is always 0 (the macro is the integer
+ * expression (1 / 4)), which would restrict shrinking to new_size == 0.
+ *
+ * Returns VECTOR_ERROR when reallocation fails. */
+int aom_vector_resize(Vector *vector, size_t new_size) {
+  if (new_size <= vector->capacity / 4) {
+    /* Shrink the logical size first so that only the surviving elements
+     * are copied into the smaller buffer. */
+    vector->size = new_size;
+    if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) ==
+        VECTOR_ERROR) {
+      return VECTOR_ERROR;
+    }
+  } else if (new_size > vector->capacity) {
+    if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) ==
+        VECTOR_ERROR) {
+      return VECTOR_ERROR;
+    }
+  }
+
+  vector->size = new_size;
+
+  return VECTOR_SUCCESS;
+}
+
+/* Ensures room for at least `minimum_capacity` elements. Never shrinks: a
+ * request at or below the current capacity succeeds without reallocating. */
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity) {
+  if (minimum_capacity <= vector->capacity) {
+    return VECTOR_SUCCESS;
+  }
+
+  if (_vector_reallocate(vector, minimum_capacity) == VECTOR_ERROR) {
+    return VECTOR_ERROR;
+  }
+  return VECTOR_SUCCESS;
+}
+
+/* Requests a buffer sized for exactly the current element count; the
+ * minimum-capacity rules of _vector_reallocate() still apply. */
+int aom_vector_shrink_to_fit(Vector *vector) {
+  const size_t wanted_capacity = vector->size;
+  return _vector_reallocate(vector, wanted_capacity);
+}
+
+/* Iterators */
+/* Iterator positioned on the first element (index 0). */
+Iterator aom_vector_begin(Vector *vector) {
+  const size_t first_index = 0;
+  return aom_vector_iterator(vector, first_index);
+}
+
+/* Iterator positioned one past the last element (index == size). */
+Iterator aom_vector_end(Vector *vector) {
+  const size_t past_last_index = vector->size;
+  return aom_vector_iterator(vector, past_last_index);
+}
+
+/* Builds an iterator pointing at slot `index` (index == size is allowed
+ * and denotes the end position). For a NULL vector, an out-of-range index
+ * or a zero element size the returned iterator is { NULL, 0 }. */
+Iterator aom_vector_iterator(Vector *vector, size_t index) {
+  Iterator result = { NULL, 0 };
+
+  assert(vector != NULL);
+  assert(index <= vector->size);
+
+  if (vector != NULL && index <= vector->size &&
+      vector->element_size != 0) {
+    result.pointer = _vector_offset(vector, index);
+    result.element_size = vector->element_size;
+  }
+
+  return result;
+}
+
+/* Pointer to the element the iterator currently refers to. */
+void *aom_iterator_get(Iterator *iterator) {
+  void *const current = iterator->pointer;
+  return current;
+}
+
+/* Erases the element the iterator points at, then rebuilds the iterator
+ * for the same index (the erase may have reallocated the buffer).
+ * Returns VECTOR_ERROR if the erase is rejected; the iterator is then left
+ * untouched. */
+int aom_iterator_erase(Vector *vector, Iterator *iterator) {
+  const size_t position = aom_iterator_index(vector, iterator);
+  const int status = aom_vector_erase(vector, position);
+
+  if (status == VECTOR_ERROR) {
+    return VECTOR_ERROR;
+  }
+
+  *iterator = aom_vector_iterator(vector, position);
+
+  return VECTOR_SUCCESS;
+}
+
+/* Advances the iterator by one element (element_size bytes). */
+void aom_iterator_increment(Iterator *iterator) {
+  unsigned char *position;
+
+  assert(iterator != NULL);
+
+  /* Byte pointer: arithmetic on void * is not standard C. */
+  position = (unsigned char *)iterator->pointer;
+  position += iterator->element_size;
+  iterator->pointer = position;
+}
+
+/* Moves the iterator back by one element (element_size bytes). */
+void aom_iterator_decrement(Iterator *iterator) {
+  unsigned char *position;
+
+  assert(iterator != NULL);
+
+  /* Byte pointer: arithmetic on void * is not standard C. */
+  position = (unsigned char *)iterator->pointer;
+  position -= iterator->element_size;
+  iterator->pointer = position;
+}
+
+/* Post-increment: returns the element pointer from before the call and
+ * leaves the iterator one element further on. */
+void *aom_iterator_next(Iterator *iterator) {
+  void *const before_step = iterator->pointer;
+
+  aom_iterator_increment(iterator);
+  return before_step;
+}
+
+/* Post-decrement: returns the element pointer from before the call and
+ * leaves the iterator one element further back. */
+void *aom_iterator_previous(Iterator *iterator) {
+  void *const before_step = iterator->pointer;
+
+  aom_iterator_decrement(iterator);
+  return before_step;
+}
+
+/* True when both iterators point at the same address. */
+bool aom_iterator_equals(Iterator *first, Iterator *second) {
+  assert(first->element_size == second->element_size);
+  return second->pointer == first->pointer;
+}
+
+/* True when `first` points at a lower address than `second`. */
+bool aom_iterator_is_before(Iterator *first, Iterator *second) {
+  assert(first->element_size == second->element_size);
+  return second->pointer > first->pointer;
+}
+
+/* True when `first` points at a higher address than `second`. */
+bool aom_iterator_is_after(Iterator *first, Iterator *second) {
+  assert(first->element_size == second->element_size);
+  return second->pointer < first->pointer;
+}
+
+/* Converts an iterator back into an element index: the byte distance from
+ * the start of the buffer divided by the vector's element size. */
+size_t aom_iterator_index(Vector *vector, Iterator *iterator) {
+  const unsigned char *buffer_start;
+  const unsigned char *position;
+
+  assert(vector != NULL);
+  assert(iterator != NULL);
+
+  buffer_start = (unsigned char *)vector->data;
+  position = (unsigned char *)iterator->pointer;
+
+  return (position - buffer_start) / vector->element_size;
+}
diff --git a/third_party/aom/third_party/vector/vector.h b/third_party/aom/third_party/vector/vector.h
new file mode 100644
index 0000000000..acc70fe099
--- /dev/null
+++ b/third_party/aom/third_party/vector/vector.h
@@ -0,0 +1,138 @@
+/*
+The MIT License(MIT)
+Copyright(c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions :
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef VECTOR_H
+#define VECTOR_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/***** DEFINITIONS *****/
+
+/* Smallest capacity a vector is set up with. */
+#define VECTOR_MINIMUM_CAPACITY 2
+/* Multiplier applied to the element count when a new capacity is chosen. */
+#define VECTOR_GROWTH_FACTOR 2
+/* NOTE(review): (1 / 4) is an integer expression and evaluates to 0, so any
+ * product of the form capacity * VECTOR_SHRINK_THRESHOLD is 0 as well. */
+#define VECTOR_SHRINK_THRESHOLD (1 / 4)
+
+/* Status codes returned by the int-returning functions below. */
+#define VECTOR_ERROR -1
+#define VECTOR_SUCCESS 0
+
+#define VECTOR_UNINITIALIZED NULL
+#define VECTOR_INITIALIZER \
+ { 0, 0, 0, VECTOR_UNINITIALIZED }
+
+/***** STRUCTURES *****/
+
+typedef struct Vector {
+ size_t size;
+ size_t capacity;
+ size_t element_size;
+
+ void *data;
+} Vector;
+
+typedef struct Iterator {
+ void *pointer;
+ size_t element_size;
+} Iterator;
+
+/***** METHODS *****/
+
+/* Constructor */
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size);
+
+/* Copy Constructor */
+int aom_vector_copy(Vector *destination, Vector *source);
+
+/* Copy Assignment */
+int aom_vector_copy_assign(Vector *destination, Vector *source);
+
+/* Move Constructor */
+int aom_vector_move(Vector *destination, Vector *source);
+
+/* Move Assignment */
+int aom_vector_move_assign(Vector *destination, Vector *source);
+
+int aom_vector_swap(Vector *destination, Vector *source);
+
+/* Destructor */
+int aom_vector_destroy(Vector *vector);
+
+/* Insertion */
+int aom_vector_push_back(Vector *vector, void *element);
+int aom_vector_push_front(Vector *vector, void *element);
+int aom_vector_insert(Vector *vector, size_t index, void *element);
+int aom_vector_assign(Vector *vector, size_t index, void *element);
+
+/* Deletion */
+int aom_vector_pop_back(Vector *vector);
+int aom_vector_pop_front(Vector *vector);
+int aom_vector_erase(Vector *vector, size_t index);
+int aom_vector_clear(Vector *vector);
+
+/* Lookup */
+void *aom_vector_get(Vector *vector, size_t index);
+const void *aom_vector_const_get(const Vector *vector, size_t index);
+void *aom_vector_front(Vector *vector);
+void *aom_vector_back(Vector *vector);
+#define VECTOR_GET_AS(type, aom_vector_pointer, index) \
+ *((type *)aom_vector_get((aom_vector_pointer), (index)))
+
+/* Information */
+bool aom_vector_is_initialized(const Vector *vector);
+size_t aom_vector_byte_size(const Vector *vector);
+size_t aom_vector_free_space(const Vector *vector);
+bool aom_vector_is_empty(const Vector *vector);
+
+/* Memory management */
+int aom_vector_resize(Vector *vector, size_t new_size);
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity);
+int aom_vector_shrink_to_fit(Vector *vector);
+
+/* Iterators */
+Iterator aom_vector_begin(Vector *vector);
+Iterator aom_vector_end(Vector *vector);
+Iterator aom_vector_iterator(Vector *vector, size_t index);
+
+void *aom_iterator_get(Iterator *iterator);
+#define ITERATOR_GET_AS(type, iterator) *((type *)aom_iterator_get((iterator)))
+
+int aom_iterator_erase(Vector *vector, Iterator *iterator);
+
+void aom_iterator_increment(Iterator *iterator);
+void aom_iterator_decrement(Iterator *iterator);
+
+void *aom_iterator_next(Iterator *iterator);
+void *aom_iterator_previous(Iterator *iterator);
+
+bool aom_iterator_equals(Iterator *first, Iterator *second);
+bool aom_iterator_is_before(Iterator *first, Iterator *second);
+bool aom_iterator_is_after(Iterator *first, Iterator *second);
+
+size_t aom_iterator_index(Vector *vector, Iterator *iterator);
+
+/* Expands to a for-loop header that visits every element from
+ * aom_vector_begin() up to (not including) aom_vector_end().
+ * The loop declares two Iterator variables: one with the given name and a
+ * second one literally named `end`, which hides any outer variable called
+ * `end` inside the loop. The end iterator is computed once, before the first
+ * pass. */
+#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name) \
+ for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \
+ end = aom_vector_end((aom_vector_pointer)); \
+ !aom_iterator_equals(&(iterator_name), &end); \
+ aom_iterator_increment(&(iterator_name)))
+
+#endif /* VECTOR_H */
diff --git a/third_party/aom/third_party/x86inc/LICENSE b/third_party/aom/third_party/x86inc/LICENSE
new file mode 100644
index 0000000000..7d07645a17
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/LICENSE
@@ -0,0 +1,18 @@
+Copyright (C) 2005-2012 x264 project
+
+Authors: Loren Merritt <lorenm@u.washington.edu>
+ Anton Mitrofanov <BugMaster@narod.ru>
+ Jason Garrett-Glaser <darkshikari@gmail.com>
+ Henrik Gramner <hengar-6@student.ltu.se>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/third_party/aom/third_party/x86inc/README.libaom b/third_party/aom/third_party/x86inc/README.libaom
new file mode 100644
index 0000000000..6b9235818b
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/README.libaom
@@ -0,0 +1,19 @@
+URL: https://git.videolan.org/git/x264.git
+Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720
+License: ISC
+License File: LICENSE
+
+Description:
+x264/libav's framework for x86 assembly. Contains a variety of macros and
+defines that help automatically allow assembly to work cross-platform.
+
+Local Modifications:
+Get configuration from config/aom_config.asm.
+Prefix functions with aom by default.
+Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
+ exist in libaom.
+Copy PIC 'GLOBAL' macros from x86_abi_support.asm
+Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
+Use .text with no alignment for aout.
+Only use 'hidden' visibility with Chromium.
+Prefix ARCH_* with AOM_.
diff --git a/third_party/aom/third_party/x86inc/x86inc.asm b/third_party/aom/third_party/x86inc/x86inc.asm
new file mode 100644
index 0000000000..b0421f51fa
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/x86inc.asm
@@ -0,0 +1,1923 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2019 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <henrik@gramner.com>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Fiona Glaser <fiona@x264.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+%include "config/aom_config.asm"
+
+%ifndef private_prefix
+ %define private_prefix aom
+%endif
+
+%ifndef public_prefix
+ %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+ %if AOM_ARCH_X86_64
+ %define STACK_ALIGNMENT 16
+ %else
+ %define STACK_ALIGNMENT 4
+ %endif
+%endif
+
+%define WIN64 0
+%define UNIX64 0
+%if AOM_ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,x64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%define FORMAT_ELF 0
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,macho
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libaom builds.
+%if FORMAT_ELF
+ %undef PREFIX
+%elif WIN64
+ %undef PREFIX
+%else
+ %define PREFIX
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; In some instances macho32 tables get misaligned when using .rodata.
+; When looking at the disassembly it appears that the offset is either
+; correct or consistently off by 90. Placing them in the .text section
+; works around the issue. It appears to be specific to the way libaom
+; handles the tables.
+; SECTION_RODATA [alignment]
+; Switches to the section used for read-only tables. Takes zero or one
+; argument; the alignment defaults to 16 when omitted.
+;   win32 / WIN64 : .rdata
+;   macho32       : .text, followed by a "fakegot" label
+;   aout          : .text, no alignment attribute
+;   anything else : .rodata
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,win32
+ SECTION .rdata align=%1
+ %elif WIN64
+ SECTION .rdata align=%1
+ %elifidn __OUTPUT_FORMAT__,macho32
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; PIC macros from aom_ports/x86_abi_support.asm.
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+ %if CONFIG_PIC=1
+ %ifidn __OUTPUT_FORMAT__,elf32
+ %define GET_GOT_DEFINED 1
+ %define WRT_PLT wrt ..plt
+ %macro GET_GOT 1
+ extern _GLOBAL_OFFSET_TABLE_
+ push %1
+ call %%get_got
+ %%sub_offset:
+ jmp %%exitGG
+ %%get_got:
+ mov %1, [esp]
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+ ret
+ %%exitGG:
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 wrt ..gotoff
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %elifidn __OUTPUT_FORMAT__,macho32
+ %define GET_GOT_DEFINED 1
+ %macro GET_GOT 1
+ push %1
+ call %%get_got
+ %%get_got:
+ pop %1
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 - %%get_got
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %else
+ %define GET_GOT_DEFINED 0
+ %endif
+ %endif
+
+ %if AOM_ARCH_X86_64 == 0
+ %undef PIC
+ %endif
+
+%else
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) rel x
+ %define WRT_PLT wrt ..plt
+
+ %if WIN64
+ %define PIC
+ %elifidn __OUTPUT_FORMAT__,macho64
+ %define PIC
+ %elif CONFIG_PIC
+ %define PIC
+ %endif
+%endif
+
+%ifnmacro GET_GOT ; nothing above defined GET_GOT: make it empty and let GLOBAL(x) be plain x
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT ; default: expands to nothing
+ %define RESTORE_GOT
+%endif
+%ifndef WRT_PLT ; default: expands to nothing
+ %define WRT_PLT
+%endif
+
+%ifdef PIC ; PIC is tested for being defined, not for a value
+ default rel
+%endif
+
+%ifndef GET_GOT_DEFINED ; make sure the symbol always has a numeric value
+ %define GET_GOT_DEFINED 0
+%endif
+; End PIC macros from aom_ports/x86_abi_support.asm.
+
+; libaom explicitly sets visibility in shared object builds. Avoid setting
+; visibility to hidden as it may break builds that split sources on e.g.,
+; directory boundaries.
+%ifdef CHROMIUM ; only builds that define CHROMIUM get hidden visibility / private_extern
+ %define VISIBILITY hidden
+ %define HAVE_PRIVATE_EXTERN 1
+%else ; default: VISIBILITY expands to nothing
+ %define VISIBILITY
+ %define HAVE_PRIVATE_EXTERN 0
+%endif
+
+%ifdef __NASM_VER__
+ %use smartalign
+ %if __NASM_VERSION_ID__ < 0x020e0000 ; older than NASM 2.14
+ %define HAVE_PRIVATE_EXTERN 0
+ %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+; allocating the specified stack size. If the required stack alignment is
+; larger than the known stack alignment the stack will be manually aligned
+; and an extra register will be allocated to hold the original stack
+; pointer (to not invalidate r0m etc.). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define as aliases for the argument registers
+; (%5+ when a numeric stack size is given as %4, otherwise %4+)
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3 ; %1 = argument index N, %2 = register name, %3 = optional offset used to build rNm
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %define %2q %2
+ %if %0 == 2 ; no offset given: rNm and rNmp name the register itself
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif AOM_ARCH_X86_64 ; memory operand; rNmp is qword-sized
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
+ %else ; memory operand; rNmp is dword-sized
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3 ; %1 = 16-bit register name, %2 = its low-byte name, %3 = its high-byte name
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+ %if AOM_ARCH_X86_64 == 0 ; on 32-bit builds the r-prefixed name maps to the e-prefixed register
+ %define r%1 e%1
+ %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-* ; defines t0, t1, ... as r<arg>, one per argument, in argument order
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-* ; for each index N given, defines tNq/tNd/tNw/tNh/tNb as tN with the size suffix pasted on
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if AOM_ARCH_X86_64 ; gprsize = size in bytes of a general-purpose register (used by PUSH/POP bookkeeping)
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+; LEA dst, addr: load the address given by %2 into register %1.
+;   %1 = destination register
+;   %2 = address expression (typically a symbol)
+; x86-64 builds use a plain lea. Otherwise, when PIC is defined, the current
+; address is obtained with call/pop and the distance to %2 is added to it.
+; All remaining builds use an absolute mov.
+;
+; PIC is only ever %define'd without a value or %undef'd in this file, so it
+; has to be tested with %elifdef. Evaluating it as a numeric expression
+; ("%elif PIC") is an error whenever the non-x86-64 path is expanded.
+%macro LEA 2
+%if AOM_ARCH_X86_64
+    lea %1, [%2]
+%elifdef PIC
+    call $+5 ; special-cased to not affect the RSB on most CPU:s
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
+%macro PUSH 1 ; push %1 and keep stack_offset in sync
+ push %1
+ %ifidn rstk, rsp ; stack_offset is only adjusted while rstk is still rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
+%endmacro
+
+%macro POP 1 ; pop %1 and keep stack_offset in sync
+ pop %1
+ %ifidn rstk, rsp ; stack_offset is only adjusted while rstk is still rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-* ; for each register index given, PUSH rN when N < regs_used
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-* ; for each register index given, pop rN when N < regs_used
+ %rep %0
+ %if %1 < regs_used
+ pop r%1 ; uses the plain instruction, so stack_offset is left unchanged (unlike the POP macro)
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-* ; for each argument index given, load rN from rNmp when N < num_args
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2 ; sub %1, %2; when %1 is rstk, stack_offset grows by %2
+ sub %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2 ; add %1, %2; when %1 is rstk, stack_offset shrinks by %2
+ add %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+; movifnidn dst, src
+; Emits "mov dst, src" unless the two operands are textually identical after
+; macro expansion, in which case nothing is emitted.
+%macro movifnidn 2
+    %ifidn %1, %2
+        ; same operand on both sides: the move would be a no-op, so skip it
+    %else
+        mov %1, %2
+    %endif
+%endmacro
+
+%if AOM_ARCH_X86_64 == 0 ; on 32-bit builds, movsxd is replaced by the movifnidn macro
+ %define movsxd movifnidn
+%endif
+
+; movsxdifnidn dst, src
+; Emits "movsxd dst, src" unless the two operands are textually identical
+; after macro expansion, in which case nothing is emitted.
+%macro movsxdifnidn 2
+    %ifidn %1, %2
+        ; same operand on both sides: nothing to emit
+    %else
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1 ; assembly-time check: raises %error when expression %1 evaluates to 0
+ %if (%1) == 0
+ %error assertion ``%1'' failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-* ; gives the names passed as arguments to r0, r1, ... (with all size/memory suffixes)
+ %ifdef n_arg_names ; undefine the names set by the previous DEFINE_ARGS call first
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0 ; the name in position i becomes an alias for r<i>
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15) ; mmsize rounded up to a multiple of 16
+%define vzeroupper_required (mmsize > 16 && (AOM_ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) ; tested by RET before emitting vzeroupper
+%define high_mm_regs (16*cpuflag(avx512)) ; 16 when avx512 is enabled, otherwise 0
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1 ; nothing is allocated unless %1 is a number
+ %if %1 != 0
+ %assign %%pad 0
+ %assign stack_size %1
+ %if stack_size < 0 ; a negative request is stored as its magnitude; the sign is tested again below
+ %assign stack_size -stack_size
+ %endif
+ %if WIN64
+ %assign %%pad %%pad + 32 ; shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+ %endif
+ %endif
+ %endif
+ %if required_stack_alignment <= STACK_ALIGNMENT
+ ; maintain the current stack alignment
+ %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1) ; the last register in use holds the original stack pointer
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ %if %1 < 0 ; need to store rsp on stack
+ %xdefine rstkm [rsp + stack_size + %%pad]
+ %assign %%pad %%pad + gprsize
+ %else ; can keep rsp in rstk during whole function
+ %xdefine rstkm rstk
+ %endif
+ %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+ mov rstk, rsp
+ and rsp, ~(required_stack_alignment-1)
+ sub rsp, stack_size_padded
+ movifnidn rstkm, rstk ; emits nothing when rstkm is rstk itself
+ %endif
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1 ; %1 = requested stack size; may raise regs_used when the stack must be realigned
+ %ifnum %1
+ %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+ %if %1 > 0
+ ; Reserve an additional register for storing the original stack pointer, but avoid using
+ ; eax/rax for this purpose since it can potentially get overwritten as a return value.
+ %assign regs_used (regs_used + 1)
+ %if AOM_ARCH_X86_64 && regs_used == 7 ; skip the count whose last register would be r6
+ %assign regs_used 8
+ %elif AOM_ARCH_X86_64 == 0 && regs_used == 1 ; skip the count whose last register would be r0
+ %assign regs_used 2
+ %endif
+ %endif
+ %if AOM_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+ ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+ %assign regs_used 5 + UNIX64 * 3
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+ ; %1 = PROLOGUE's argument count, %2 = PROLOGUE's 4th argument, %3 = the rest
+ %ifnum %2 ; 4th argument is a stack size: the names are everything after it
+ DEFINE_ARGS %3
+ %elif %1 == 4 ; 4th argument is the only name
+ DEFINE_ARGS %2
+ %elif %1 > 4 ; 4th argument is the first of several names
+ DEFINE_ARGS %2, %3
+ %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0 ; ALLOC_STACK allocated nothing, so the xmm spill is handled here
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0 ; stores xmm6 and up according to xmm_regs_used; allocates no stack itself
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i 8
+ %rep %%xmm_regs_on_stack ; xmm8 and up are stored above the stack_size area and 32 further bytes
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1 ; %1 = number of xmm registers used; allocates space if needed, then stores them
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 0 ; reloads the saved xmm registers and releases the stack; leaves the bookkeeping symbols unchanged
+ %assign %%pad_size 0
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack ; reload from the highest register downwards
+ %assign %%i %%i-1
+ movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
+ %endrep
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded ; plain add: stack_offset is not updated, %%pad_size compensates below
+ %assign %%pad_size stack_size_padded
+ %endif
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 0 ; restore via WIN64_RESTORE_XMM_INTERNAL, then reset the bookkeeping symbols
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign stack_offset (stack_offset-stack_size_padded)
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs ; note: the expansion is not parenthesised
+
+%macro RET 0 ; WIN64 epilogue: restore xmm registers and stack, pop saved GPRs, optional vzeroupper, return
+ WIN64_RESTORE_XMM_INTERNAL
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%elif AOM_ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %assign xmm_regs_used %3
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required ; note: the expansion is not parenthesised
+
+%macro RET 0 ; epilogue: release the stack, pop saved GPRs, optional vzeroupper, return
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm ; restore the stack pointer saved by ALLOC_STACK
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-* ; for each index N given, defines rNm/rNmp as stack operands only (no register)
+ %rep %0
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7 ; clamp to the 7 registers declared for this ABI (r0-r6)
+ %assign num_args 7
+ %endif
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required ; note: the expansion is not parenthesised
+
+%macro RET 0 ; epilogue: release the stack, pop saved GPRs, optional vzeroupper, return
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm ; restore the stack pointer saved by ALLOC_STACK
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 6, 5, 4, 3
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0 ; on every other target the WIN64_* helpers are empty macros
+ %macro WIN64_SPILL_XMM 1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 0
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0 ; return; uses the two-byte "rep ret" only when there is no epilogue and ssse3 is not enabled
+ %if has_epilogue || cpuflag(ssse3)
+ RET
+ %else
+ rep ret
+ %endif
+ annotate_function_size
+%endmacro
+
+%define last_branch_adr $$ ; initial value; redefined by the BRANCH_INSTR-generated macros
+%macro AUTO_REP_RET 0 ; emits ret, prefixed with rep when it directly follows a tracked branch (non-ssse3 only)
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+ %endif
+ ret
+ annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-* ; for each mnemonic given, defines a same-named macro that emits it and records the following address
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %if notcpuflag(ssse3) ; tracking is only done when ssse3 is not enabled
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
+ %if has_epilogue ; an epilogue is required, so do a real call followed by RET
+ call %1
+ RET
+ %elif %2 ; no epilogue: jump to the callee; nothing is emitted when %2 is 0
+ jmp %1
+ %endif
+ annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+ ; %1 = 1 for private_prefix / 0 for public_prefix, %2 = suffixed name, %3 = PROLOGUE args
+ annotate_function_size
+ %ifndef cglobaled_%2 ; first declaration of this name: set up the mangled alias
+ %if %1
+ %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+ %else
+ %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+ %endif
+ %xdefine %2.skip_prologue %2 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %2, 1
+ %endif
+ %xdefine current_function %2
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
+ %if %1
+ global %2:function VISIBILITY
+ %else
+ global %2:function
+ %endif
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+ global %2:private_extern
+ %else
+ global %2
+ %endif
+ align function_align
+ %2:
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
+ %ifnidn %3, "" ; PROLOGUE is only invoked when PROLOGUE args were supplied
+ PROLOGUE %3
+ %endif
+%endmacro
+
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1 ; %1 = local label; exported as current_function with %1 appended
+ %if FORMAT_ELF
+ global current_function %+ %1:function VISIBILITY
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global current_function %+ %1:private_extern
+ %else
+ global current_function %+ %1
+ %endif
+ %1:
+%endmacro
+
+%macro cextern 1 ; declares an external symbol under its private_prefix-mangled name
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1 ; declares an external symbol without the private prefix
+ %ifdef PREFIX ; mangle() is only applied when PREFIX is defined
+ %xdefine %1 mangle(%1)
+ %endif
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 1-2+ ; %1 = name (exported under its private_prefix-mangled form), %2 = data definition placed after the label
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ %if FORMAT_ELF
+ global %1:data VISIBILITY
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global %1:private_extern
+ %else
+ global %1
+ %endif
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0 ; emits a size directive for current_function; only under Yasm with ELF output
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni (1<<12)| cpuflags_sse42
+%assign cpuflags_gfni (1<<13)| cpuflags_sse42
+%assign cpuflags_avx (1<<14)| cpuflags_sse42
+%assign cpuflags_xop (1<<15)| cpuflags_avx
+%assign cpuflags_fma4 (1<<16)| cpuflags_avx
+%assign cpuflags_fma3 (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32 (1<<22)
+%assign cpuflags_cache64 (1<<23)
+%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<25)
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) ; 1 iff every bit of cpuflags_x is set in cpuflags
+%define notcpuflag(x) (cpuflag(x) ^ 1) ; logical inverse of cpuflag(x)
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-* ; arguments: cpuflag names; sets cpuflags, cpuname and SUFFIX, and adjusts the mov aliases
+ %xdefine SUFFIX
+ %undef cpuname
+ %assign cpuflags 0
+
+ %if %0 >= 1
+ %rep %0 ; OR the flag bits together and join the names with underscores
+ %ifdef cpuname
+ %xdefine cpuname cpuname %+ _%1
+ %else
+ %xdefine cpuname %1
+ %endif
+ %assign cpuflags cpuflags | cpuflags_%1
+ %rotate 1
+ %endrep
+ %xdefine SUFFIX _ %+ cpuname
+
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) ; switch the mov aliases to the ps forms
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elif cpuflag(sse3) && notcpuflag(ssse3)
+ %define movu lddqu
+ %endif
+ %endif
+
+ %if AOM_ARCH_X86_64 || cpuflag(sse2) ; choose the assembler's nop style (NASM: ALIGNMODE, otherwise: CPU)
+ %ifdef __NASM_VER__
+ ALIGNMODE p6
+ %else
+ CPU amdnop
+ %endif
+ %else
+ %ifdef __NASM_VER__
+ ALIGNMODE nop
+ %else
+ CPU basicnop
+ %endif
+ %endif
+%endmacro
+
+; Merge mmx, sse*, and avx*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3 ; %xdefine the name formed by pasting %1 and %2 together to the value %3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2 ; %undef the name formed by pasting %1 and %2 together
+ %undef %1%2
+%endmacro
+
+%macro DEFINE_MMREGS 1 ; mmtype
+ %assign %%prev_mmregs 0
+ %ifdef num_mmregs ; remember the previous register count so stale names can be removed below
+ %assign %%prev_mmregs num_mmregs
+ %endif
+
+ %assign num_mmregs 8
+ %if AOM_ARCH_X86_64 && mmsize >= 16
+ %assign num_mmregs 16
+ %if cpuflag(avx512) || mmsize == 64
+ %assign num_mmregs 32
+ %endif
+ %endif
+
+ %assign %%i 0
+ %rep num_mmregs ; m<i> -> <mmtype><i>, and nn<mmtype><i> -> i
+ CAT_XDEFINE m, %%i, %1 %+ %%i
+ CAT_XDEFINE nn%1, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %if %%prev_mmregs > num_mmregs ; fewer registers than before: undefine the leftover names
+ %rep %%prev_mmregs - num_mmregs
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nn %+ mmtype, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+ %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if AOM_ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1 ; swap m<i> with m<i+16> for i = start_reg .. 15
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro INIT_MMX 0-1+ ; optional cpuflags; selects 8-byte mm registers
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS mm
+%endmacro
+
+%macro INIT_XMM 0-1+ ; optional cpuflags; selects 16-byte xmm registers
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS xmm
+ %if WIN64
+ AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+ %endif
+%endmacro
+
+%macro INIT_YMM 0-1+ ; optional cpuflags; selects 32-byte ymm registers
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS ymm
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+ ; optional cpuflags; selects 64-byte zmm registers
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS zmm
+ AVX512_MM_PERMUTATION
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1 ; %1 = register number; defines <typeA><typeB>N as the narrower of the two register types
+ %define mmmm%1 mm%1
+ %define mmxmm%1 mm%1
+ %define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
+ %define xmmmm%1 mm%1
+ %define xmmxmm%1 xmm%1
+ %define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
+ %define ymmmm%1 mm%1
+ %define ymmxmm%1 xmm%1
+ %define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
+ %define xm%1 xmm %+ m%1
+ %define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 32 ; declare the casts for register numbers 0..31
+ DECLARE_MMCAST i
+ %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+ %rep %0/2 ; first pass: snapshot the current value of the second register of every pair
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2 ; second pass: assign the snapshots, so pairs that overlap do not interfere
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
+%endmacro
+
+; SWAP a, b, ... : swaps a single chain of registers (sometimes more concise
+; than listing pairs). Dispatches on the form of the first argument:
+; register names (m0, m1, ...) go to SWAP_INTERNAL_NAME, and plain numbers
+; (0, 1, ...) go to SWAP_INTERNAL_NUM.
+%macro SWAP 2+
+    %ifnnum %1 ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %else ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-* ; arguments are register numbers; swaps each adjacent pair along the chain
+ %rep %0-1
+ %xdefine %%tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 %%tmp
+ CAT_XDEFINE nn, m%1, %1
+ CAT_XDEFINE nn, m%2, %2
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-* ; arguments are register names; maps each to its number via nn<name>, then calls SWAP_INTERNAL_NUM
+ %xdefine %%args nn %+ %1
+ %rep %0-1
+ %xdefine %%args %%args, nn %+ %2
+ %rotate 1
+ %endrep
+ SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1 ; optional %1 = name to save under (default: current_function)
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs ; <name>_m<i> = number of the physical register currently bound to m<i>
+ %xdefine %%tmp m %+ %%i
+ CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %xdefine %%tmp %%f %+ 0
+ %ifnum %%tmp ; only proceed when <name>_m0 expands to a number, i.e. a permutation was saved
+ RESET_MM_PERMUTATION
+ %assign %%i 0
+ %rep num_mmregs ; first pass: resolve every saved index into a temporary
+ %xdefine %%tmp %%f %+ %%i
+ CAT_XDEFINE %%m, %%i, m %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+ %rep num_mmregs ; second pass: install the temporaries as the new m<i> / nn<reg> mapping
+ %assign %%i %%i-1
+ CAT_XDEFINE m, %%i, %%m %+ %%i
+ CAT_XDEFINE nn, m %+ %%i, %%i
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1 ; wraps the call instruction: identifier targets go through call_internal, anything else is emitted as-is
+ %ifid %1
+ call_internal %1 %+ SUFFIX, %1
+ %else
+ call %1
+ %endif
+%endmacro
+%macro call_internal 2 ; %1 = name with SUFFIX appended, %2 = plain name
+ %xdefine %%i %2
+ %ifndef cglobaled_%2 ; use the suffixed name only when it is known and the plain one is not
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+; add dst, src
+; Size-reducing substitution: when src is the literal number 128, the
+; equivalent "sub dst, -128" is emitted instead. Every other operand,
+; including non-numeric ones, produces an ordinary add.
+%macro add 2
+    %assign %%use_sub 0
+    %ifnum %2
+        %if %2==128
+            %assign %%use_sub 1
+        %endif
+    %endif
+    %if %%use_sub
+        sub %1, -128
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+; sub dst, src
+; Size-reducing substitution: when src is the literal number 128, the
+; equivalent "add dst, -128" is emitted instead. Every other operand,
+; including non-numeric ones, produces an ordinary sub.
+%macro sub 2
+    %assign %%use_add 0
+    %ifnum %2
+        %if %2==128
+            %assign %%use_add 1
+        %endif
+    %endif
+    %if %%use_add
+        add %1, -128
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 32 ; build sizeof<reg> (bytes) and regnumof<reg> (index) lookups for every register name
+ %if i < 8 ; mm entries are only created for indices 0-7
+ CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
+ %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-* ; %1 = instruction text for the message, %2 = dst, %3+ = operands that must differ from dst
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2 ; raise an error if any listed operand is textually identical to dst
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+ ; parameters %1-%5 are described above; %6+ are the instruction operands
+ %ifnum sizeof%7 ; operand size: taken from %7, else from %6, else from mmsize
+ %assign __sizeofreg sizeof%7
+ %elifnum sizeof%6
+ %assign __sizeofreg sizeof%6
+ %else
+ %assign __sizeofreg mmsize
+ %endif
+ %assign __emulate_avx 0
+ %if avx_enabled && __sizeofreg >= 16 ; use the v-prefixed form of the instruction
+ %xdefine __instr v%1
+ %else ; legacy form; emulate when more operands were given than it accepts
+ %xdefine __instr %1
+ %if %0 >= 8+%4
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %ifnidn %2, fnord ; a minimal instruction set was declared: check it against the current cpuflags
+ %ifdef cpuname
+ %if notcpuflag(%2)
+ %error use of ``%1'' %2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
+ %error use of ``%1'' sse2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+ %error use of ``%1'' avx2 instruction in cpuname function: current_function
+ %elif __sizeofreg == 16 && notcpuflag(sse)
+ %error use of ``%1'' sse instruction in cpuname function: current_function
+ %elif __sizeofreg == 32 && notcpuflag(avx)
+ %error use of ``%1'' avx instruction in cpuname function: current_function
+ %elif __sizeofreg == 64 && notcpuflag(avx512)
+ %error use of ``%1'' avx512 instruction in cpuname function: current_function
+ %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+ %ifnid %6 ; but sse4 is required for memory operands
+ %if notcpuflag(sse4)
+ %error use of ``%1'' sse4 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %endif
+ %endif
+ %endif
+
+ %if __emulate_avx ; emit "mov dst, src1" (unless dst is already src1) followed by the legacy instruction
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5 && %4 == 0
+ %ifnidn %6, %7
+ %ifidn %6, %8
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %elifnnum sizeof%8
+ ; 3-operand AVX instructions with a memory arg can only have it in src2,
+ ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+ ; So, if the instruction is commutative with a memory arg, swap them.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %endif
+ %ifnidn %6, __src1
+ %if %0 >= 9
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+ %endif
+ %if __sizeofreg == 8 ; pick the copy instruction by register size and the float/int flag
+ MOVQ %6, __src1
+ %elif %3
+ MOVAPS %6, __src1
+ %else
+ MOVDQA %6, __src1
+ %endif
+ %endif
+ %if %0 >= 9
+ %1 %6, __src2, %9
+ %else
+ %1 %6, __src2
+ %endif
+ %elif %0 >= 9 ; no emulation: pass the operands through, dispatching on how many were given
+ __instr %6, %7, %8, %9
+ %elif %0 == 8
+ %if avx_enabled && %5
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %ifnum regnumof%7
+ %ifnum regnumof%8
+ %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+ ; Most VEX-encoded instructions require an additional byte to encode when
+ ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+ ; we can swap src1 and src2 when doing so reduces the instruction length.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %endif
+ __instr %6, __src1, __src2
+ %else
+ __instr %6, %7, %8
+ %endif
+ %elif %0 == 7
+ %if avx_enabled && %5 ; two operands given for a commutative op: dst doubles as the first source
+ %xdefine __src1 %6
+ %xdefine __src2 %7
+ %ifnum regnumof%6
+ %ifnum regnumof%7
+ %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+ %xdefine __src1 %7
+ %xdefine __src2 %6
+ %endif
+ %endif
+ %endif
+ __instr %6, __src1, __src2
+ %else
+ __instr %6, %7
+ %endif
+ %else
+ __instr %6
+ %endif
+%endmacro
+
+; AVX_INSTR defines a wrapper macro named after an instruction. The wrapper
+; forwards its operands, together with the properties listed below, to
+; RUN_AVX_INSTR, which decides how the instruction is actually emitted.
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
+ ; The wrapper takes up to five operands; operands that are not supplied
+ ; default to fnord. Its last five parameters default to the AVX_INSTR
+ ; arguments (instruction, instruction set, float, emulation, commutative)
+ ; and are presumably meant to be left at those defaults by callers.
+ %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+ ; Forward only the operands that were supplied: the first operand still
+ ; equal to fnord marks the end of the operand list.
+ %ifidn %2, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+ %elifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+; Instructions with both VEX/EVEX and legacy encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse, 1
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3, 1
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse, 1
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; GPR_INSTR defines a wrapper macro named after a general-purpose-register
+; instruction. The wrapper checks the required cpuflag and then emits the
+; instruction unchanged.
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+ ; The wrapper takes two or three operands (a missing third one defaults to
+ ; fnord); its last two parameters default to the instruction name and the
+ ; instruction set given to GPR_INSTR.
+ %macro %1 2-5 fnord, %1, %2
+ ; The cpuflag check is only performed when cpuname is defined.
+ %ifdef cpuname
+ %if notcpuflag(%5)
+ %error use of ``%4'' %5 instruction in cpuname function: current_function
+ %endif
+ %endif
+ ; Emit the 2-operand or the 3-operand form depending on whether a third
+ ; operand was supplied.
+ %ifidn %3, fnord
+ %4 %1, %2
+ %else
+ %4 %1, %2, %3
+ %endif
+ %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
+; base-4 constants for shuffles
+; For every 8-bit value i, j is the decimal number whose four digits are the
+; four 2-bit fields of i, most significant field first (e.g. i = 0xE4 gives
+; j = 3210). The name q<j>, zero-padded to four digits, is passed together with
+; i to CAT_XDEFINE. CAT_XDEFINE is defined outside this section; presumably it
+; defines the concatenated name as i, so that q3210 == 0xE4.
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ ; Choose the literal prefix so that the resulting name always has four digits.
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+ %assign i i+1
+%endrep
+; The loop counters are temporaries; remove them once the table is built.
+%undef i
+%undef j
+
+; FMA_INSTR defines a 4-operand wrapper macro that is emitted as a single
+; v-prefixed instruction when xop is enabled and as a two-instruction sequence
+; otherwise.
+;%1 == name of the wrapper (the xop instruction name without the v prefix)
+;%2 == first instruction of the non-xop sequence
+;%3 == second instruction of the non-xop sequence
+%macro FMA_INSTR 3
+ ; The wrapper takes four operands; its last three parameters default to the
+ ; FMA_INSTR arguments.
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %elifnidn %1, %4
+ ; The first instruction writes operand 1 before the second one reads
+ ; operand 4, so this sequence is only used when the two operands differ.
+ %6 %1, %2, %3
+ %7 %1, %4
+ %else
+ %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+;%1 == instruction name prefix (e.g. fmadd)
+;%2 and up == type suffixes (e.g. pd, ps); one wrapper macro is defined per suffix
+%macro FMA4_INSTR 2-*
+ ; The prefix is saved in a context-local define because the parameter list
+ ; is rotated on every iteration of the loop below.
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ ; The wrapper takes four operands; its last two parameters default to the
+ ; prefix and the current suffix.
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifnum sizeof%3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ ; Move on to the next suffix.
+ %rotate 1
+ %endrep
+ %pop
+%endmacro
+
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+;%1 == VEX instruction (also the name of the wrapper macro that is defined)
+;%2 == equivalent EVEX instruction
+;%3 == 1 to use the EVEX instruction whenever avx512 is enabled, 0 (default)
+;      to use it only when an operand requires it
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ ; The wrapper takes two to four operands (missing ones default to fnord); its
+ ; last three parameters default to the EVEX_INSTR arguments.
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ ; Collect the operands that were actually supplied.
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ ; EVEX is selected when it is preferred and avx512 is enabled, or when any
+ ; of the first three operands has a register number of 16 or above or a
+ ; sizeof greater than 32.
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%3
+ %if regnumof%3 >= 16 || sizeof%3 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
diff --git a/third_party/aom/tools/aggregate_entropy_stats.py b/third_party/aom/tools/aggregate_entropy_stats.py
new file mode 100644
index 0000000000..0311681f2d
--- /dev/null
+++ b/third_party/aom/tools/aggregate_entropy_stats.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Aggregate multiple entropy stats output which is written in 32-bit int.
+
+python ./aggregate_entropy_stats.py [dir of stats files] [keyword of filenames]
+ [filename of final stats]
+"""
+
+__author__ = "yuec@google.com"
+
+import os
+import sys
+import numpy as np
+
+def main():
+ dir = sys.argv[1]
+ sum = []
+ for fn in os.listdir(dir):
+ if sys.argv[2] in fn:
+ stats = np.fromfile(dir + fn, dtype=np.int32)
+ if len(sum) == 0:
+ sum = stats
+ else:
+ sum = np.add(sum, stats)
+ if len(sum) == 0:
+ print("No stats file is found. Double-check directory and keyword?")
+ else:
+ sum.tofile(dir+sys.argv[3])
+
+if __name__ == '__main__':
+ main()
diff --git a/third_party/aom/tools/aom_entropy_optimizer.c b/third_party/aom/tools/aom_entropy_optimizer.c
new file mode 100644
index 0000000000..fa7bf7ea9e
--- /dev/null
+++ b/third_party/aom/tools/aom_entropy_optimizer.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// This tool is a gadget for offline probability training.
+// A binary executable aom_entropy_optimizer will be generated in tools/. It
+// parses a binary file consisting of counts written in the format of
+// FRAME_COUNTS in entropymode.h, and computes optimized probability tables
+// and CDF tables, which will be written to a new c file optimized_probs.c
+// according to format in the codebase.
+//
+// Command line: ./aom_entropy_optimizer [directory of the count file]
+//
+// The input file can either be generated by encoding a single clip with the
+// entropy_stats experiment turned on, or be collected at a larger scale, in
+// which case the python script tools/aggregate_entropy_stats.py can be used
+// to aggregate multiple stats outputs.
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encoder.h"
+
+#define SPACES_PER_TAB 2
+#define CDF_MAX_SIZE 16
+
+typedef unsigned int aom_count_type;
+// A log file recording parsed counts
+static FILE *logfile; // TODO(yuec): make it a command line option
+
+// Converts `modes` symbol counts into a cumulative distribution table.
+//
+// counts: input counts, one per symbol; also written to the log file.
+// cdf:    output array receiving `modes` entries.
+// modes:  number of symbols; must be in [1, CDF_MAX_SIZE].
+//
+// Every count is incremented by one before accumulation, the cumulative sums
+// are scaled to CDF_PROB_TOP with rounding, and each entry is then clamped so
+// that it is at least 4 and at least 4 above the previous entry.
+static void counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf,
+                          int modes) {
+  int64_t csum[CDF_MAX_SIZE];
+  assert(modes > 0 && modes <= CDF_MAX_SIZE);
+
+  // Widen to 64 bits before adding so that a very large count cannot wrap
+  // around in unsigned int arithmetic.
+  csum[0] = (int64_t)counts[0] + 1;
+  for (int i = 1; i < modes; ++i) {
+    csum[i] = (int64_t)counts[i] + 1 + csum[i - 1];
+  }
+
+  // aom_count_type is unsigned int, so it is logged with %u.
+  for (int i = 0; i < modes; ++i) fprintf(logfile, "%u ", counts[i]);
+  fprintf(logfile, "\n");
+
+  const int64_t sum = csum[modes - 1];
+  const int64_t round_shift = sum >> 1;
+  for (int i = 0; i < modes; ++i) {
+    cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum;
+    cdf[i] = AOMMIN(cdf[i], CDF_PROB_TOP - (modes - 1 + i) * 4);
+    cdf[i] = (i == 0) ? AOMMAX(cdf[i], 4) : AOMMAX(cdf[i], cdf[i - 1] + 4);
+  }
+}
+
+// Recursively walks a multi-dimensional counts array and writes the matching
+// nested CDF initializer to `probsfile`.
+//
+// ct_ptr:       cursor into the flat counts array; advanced past every count
+//               that is consumed.
+// tabs:         indentation level of the current nesting depth.
+// dim_of_cts:   number of remaining dimensions; must be at least 1.
+// cts_each_dim: sizes of the remaining dimensions, outermost first.
+//
+// Returns 0 on success and 1 if dim_of_cts is invalid.
+static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr,
+                                    FILE *const probsfile, int tabs,
+                                    int dim_of_cts, int *cts_each_dim) {
+  if (dim_of_cts < 1) {
+    fprintf(stderr, "The dimension of a counts vector should be at least 1!\n");
+    return 1;
+  }
+
+  const int total_modes = cts_each_dim[0];
+
+  if (dim_of_cts > 1) {
+    // Rows of a 2-D table are printed on a single line each; deeper levels
+    // open a brace on its own line and indent their children.
+    const int rows_are_leaves = (dim_of_cts == 2);
+    const int indent = tabs * SPACES_PER_TAB;
+    const int child_tabs = rows_are_leaves ? 0 : tabs + 1;
+
+    for (int k = 0; k < total_modes; ++k) {
+      const int is_last = (k == total_modes - 1);
+
+      fprintf(probsfile, rows_are_leaves ? "%*c{ " : "%*c{\n", indent, ' ');
+
+      if (parse_counts_for_cdf_opt(ct_ptr, probsfile, child_tabs,
+                                   dim_of_cts - 1, cts_each_dim + 1)) {
+        return 1;
+      }
+
+      if (rows_are_leaves) {
+        fprintf(probsfile, is_last ? " }\n" : " },\n");
+      } else {
+        fprintf(probsfile, is_last ? "%*c}\n" : "%*c},\n", indent, ' ');
+      }
+    }
+    return 0;
+  }
+
+  // Innermost dimension: convert one row of counts and print it.
+  assert(total_modes <= CDF_MAX_SIZE);
+  aom_cdf_prob cdfs[CDF_MAX_SIZE];
+
+  counts_to_cdf(*ct_ptr, cdfs, total_modes);
+  *ct_ptr += total_modes;
+
+  if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' ');
+  fprintf(probsfile, "AOM_CDF%d(", total_modes);
+  // The final CDF entry is not printed.
+  for (int k = 0; k + 1 < total_modes; ++k) {
+    if (k > 0) fprintf(probsfile, ", ");
+    fprintf(probsfile, "%d", cdfs[k]);
+  }
+  fprintf(probsfile, ")");
+  return 0;
+}
+
+// Writes one complete CDF table definition to `probsfile`: the declaration
+// given in `prefix`, the nested initializer built from `counts`, and the
+// closing brace. The prefix and a separator line are also written to the log.
+static void optimize_cdf_table(aom_count_type *counts, FILE *const probsfile,
+                               int dim_of_cts, int *cts_each_dim,
+                               char *prefix) {
+  aom_count_type *cursor = counts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  const int failed = parse_counts_for_cdf_opt(&cursor, probsfile, 1,
+                                              dim_of_cts, cts_each_dim);
+  if (failed) {
+    fprintf(probsfile, "Optimizer failed!\n");
+  }
+
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+// Writes the chroma intra mode CDF table. The table has two halves: the first
+// is built with UV_INTRA_MODES - 1 symbols per row (one trailing count per row
+// is skipped), the second with the full UV_INTRA_MODES symbols.
+// Note that cts_each_dim[2] is overwritten by this function.
+static void optimize_uv_mode(aom_count_type *counts, FILE *const probsfile,
+                             int dim_of_cts, int *cts_each_dim, char *prefix) {
+  aom_count_type *cursor = counts;
+  const int num_rows = cts_each_dim[1];
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+  fprintf(logfile, "%s\n", prefix);
+
+  // First half: rows with one symbol fewer.
+  cts_each_dim[2] = UV_INTRA_MODES - 1;
+  for (int row = 0; row < num_rows; ++row) {
+    const int is_last = (row + 1 == num_rows);
+
+    fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+    parse_counts_for_cdf_opt(&cursor, probsfile, 0, dim_of_cts - 2,
+                             cts_each_dim + 2);
+    fprintf(probsfile, is_last ? " }\n" : " },\n");
+    // Step over the count of the symbol that is not part of this half.
+    ++cursor;
+  }
+  fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+
+  // Second half: rows with the full symbol set.
+  fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+  cts_each_dim[2] = UV_INTRA_MODES;
+  parse_counts_for_cdf_opt(&cursor, probsfile, 2, dim_of_cts - 1,
+                           cts_each_dim + 1);
+  fprintf(probsfile, "%*c}\n", SPACES_PER_TAB, ' ');
+
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+// Writes a 2-D CDF table in which the number of symbols used by a row depends
+// on its first-dimension index (modes_each_ctx). Rows with no symbols are
+// written as a "{ 0 }" placeholder. Every row occupies cts_each_dim[1] counts
+// in the input regardless of how many of them are used.
+static void optimize_cdf_table_var_modes_2d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  assert(dim_of_cts == 2);
+  (void)dim_of_cts;
+
+  aom_count_type *cursor = counts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int ctx = 0; ctx < cts_each_dim[0]; ++ctx) {
+    int used_modes = modes_each_ctx[ctx];
+
+    if (used_modes <= 0) {
+      fprintf(probsfile, "%*c{ 0 },\n", SPACES_PER_TAB, ' ');
+      fprintf(logfile, "dummy cdf, no need to optimize\n");
+      cursor += cts_each_dim[1];
+      continue;
+    }
+
+    fprintf(probsfile, "%*c{ ", SPACES_PER_TAB, ' ');
+    parse_counts_for_cdf_opt(&cursor, probsfile, 0, 1, &used_modes);
+    // Skip the unused tail of the row.
+    cursor += cts_each_dim[1] - used_modes;
+    fprintf(probsfile, " },\n");
+  }
+
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+// Writes a 3-D CDF table in which the number of symbols used by a row depends
+// on its first-dimension index (modes_each_ctx). Rows with no symbols are
+// written as a "{ 0 }" placeholder. Every row occupies cts_each_dim[2] counts
+// in the input regardless of how many of them are used.
+static void optimize_cdf_table_var_modes_3d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  assert(dim_of_cts == 3);
+  (void)dim_of_cts;
+
+  aom_count_type *cursor = counts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int outer = 0; outer < cts_each_dim[0]; ++outer) {
+    fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+
+    for (int mid = 0; mid < cts_each_dim[1]; ++mid) {
+      int used_modes = modes_each_ctx[outer];
+
+      if (used_modes <= 0) {
+        fprintf(probsfile, "%*c{ 0 },\n", 2 * SPACES_PER_TAB, ' ');
+        fprintf(logfile, "dummy cdf, no need to optimize\n");
+        cursor += cts_each_dim[2];
+        continue;
+      }
+
+      fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+      parse_counts_for_cdf_opt(&cursor, probsfile, 0, 1, &used_modes);
+      // Skip the unused tail of the row.
+      cursor += cts_each_dim[2] - used_modes;
+      fprintf(probsfile, " },\n");
+    }
+
+    fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+  }
+
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+// Writes a 4-D CDF table in which the number of symbols used by a row depends
+// on its first-dimension index (modes_each_ctx). Rows with no symbols are
+// written as a "{ 0 }" placeholder. Every row occupies cts_each_dim[3] counts
+// in the input regardless of how many of them are used.
+static void optimize_cdf_table_var_modes_4d(aom_count_type *counts,
+                                            FILE *const probsfile,
+                                            int dim_of_cts, int *cts_each_dim,
+                                            int *modes_each_ctx, char *prefix) {
+  assert(dim_of_cts == 4);
+  (void)dim_of_cts;
+
+  aom_count_type *cursor = counts;
+
+  fprintf(probsfile, "%s = {\n", prefix);
+  fprintf(logfile, "%s\n", prefix);
+
+  for (int outer = 0; outer < cts_each_dim[0]; ++outer) {
+    fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+
+    for (int mid = 0; mid < cts_each_dim[1]; ++mid) {
+      fprintf(probsfile, "%*c{\n", 2 * SPACES_PER_TAB, ' ');
+
+      for (int inner = 0; inner < cts_each_dim[2]; ++inner) {
+        int used_modes = modes_each_ctx[outer];
+
+        if (used_modes <= 0) {
+          fprintf(probsfile, "%*c{ 0 },\n", 3 * SPACES_PER_TAB, ' ');
+          fprintf(logfile, "dummy cdf, no need to optimize\n");
+          cursor += cts_each_dim[3];
+          continue;
+        }
+
+        fprintf(probsfile, "%*c{ ", 3 * SPACES_PER_TAB, ' ');
+        parse_counts_for_cdf_opt(&cursor, probsfile, 0, 1, &used_modes);
+        // Skip the unused tail of the row.
+        cursor += cts_each_dim[3] - used_modes;
+        fprintf(probsfile, " },\n");
+      }
+
+      fprintf(probsfile, "%*c},\n", 2 * SPACES_PER_TAB, ' ');
+    }
+
+    fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+  }
+
+  fprintf(probsfile, "};\n\n");
+  fprintf(logfile, "============================\n");
+}
+
+int main(int argc, const char **argv) {
+ if (argc < 2) {
+ fprintf(stderr, "Please specify the input stats file!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ FILE *const statsfile = fopen(argv[1], "rb");
+ if (statsfile == NULL) {
+ fprintf(stderr, "Failed to open input file!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ FRAME_COUNTS fc;
+ const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile);
+ if (!bytes) {
+ fclose(statsfile);
+ return 1;
+ }
+
+ FILE *const probsfile = fopen("optimized_probs.c", "w");
+ if (probsfile == NULL) {
+ fprintf(stderr,
+ "Failed to create output file for optimized entropy tables!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ logfile = fopen("aom_entropy_optimizer_parsed_counts.log", "w");
+ if (logfile == NULL) {
+ fprintf(stderr, "Failed to create log file for parsed counts!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int cts_each_dim[10];
+
+ /* Intra mode (keyframe luma) */
+ cts_each_dim[0] = KF_MODE_CONTEXTS;
+ cts_each_dim[1] = KF_MODE_CONTEXTS;
+ cts_each_dim[2] = INTRA_MODES;
+ optimize_cdf_table(&fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "const aom_cdf_prob\n"
+ "default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]"
+ "[CDF_SIZE(INTRA_MODES)]");
+
+ cts_each_dim[0] = DIRECTIONAL_MODES;
+ cts_each_dim[1] = 2 * MAX_ANGLE_DELTA + 1;
+ optimize_cdf_table(&fc.angle_delta[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_angle_delta_cdf"
+ "[DIRECTIONAL_MODES][CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]");
+
+ /* Intra mode (non-keyframe luma) */
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = INTRA_MODES;
+ optimize_cdf_table(
+ &fc.y_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]");
+
+ /* Intra mode (chroma) */
+ cts_each_dim[0] = CFL_ALLOWED_TYPES;
+ cts_each_dim[1] = INTRA_MODES;
+ cts_each_dim[2] = UV_INTRA_MODES;
+ optimize_uv_mode(&fc.uv_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]"
+ "[CDF_SIZE(UV_INTRA_MODES)]");
+
+ /* block partition */
+ cts_each_dim[0] = PARTITION_CONTEXTS;
+ cts_each_dim[1] = EXT_PARTITION_TYPES;
+ int part_types_each_ctx[PARTITION_CONTEXTS] = { 4, 4, 4, 4, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 8, 8, 8, 8 };
+ optimize_cdf_table_var_modes_2d(
+ &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx,
+ "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]"
+ "[CDF_SIZE(EXT_PARTITION_TYPES)]");
+
+ /* tx type */
+ cts_each_dim[0] = EXT_TX_SETS_INTRA;
+ cts_each_dim[1] = EXT_TX_SIZES;
+ cts_each_dim[2] = INTRA_MODES;
+ cts_each_dim[3] = TX_TYPES;
+ int intra_ext_tx_types_each_ctx[EXT_TX_SETS_INTRA] = { 0, 7, 5 };
+ optimize_cdf_table_var_modes_4d(
+ &fc.intra_ext_tx[0][0][0][0], probsfile, 4, cts_each_dim,
+ intra_ext_tx_types_each_ctx,
+ "static const aom_cdf_prob default_intra_ext_tx_cdf[EXT_TX_SETS_INTRA]"
+ "[EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)]");
+
+ cts_each_dim[0] = EXT_TX_SETS_INTER;
+ cts_each_dim[1] = EXT_TX_SIZES;
+ cts_each_dim[2] = TX_TYPES;
+ int inter_ext_tx_types_each_ctx[EXT_TX_SETS_INTER] = { 0, 16, 12, 2 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.inter_ext_tx[0][0][0], probsfile, 3, cts_each_dim,
+ inter_ext_tx_types_each_ctx,
+ "static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER]"
+ "[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]");
+
+ /* Chroma from Luma */
+ cts_each_dim[0] = CFL_JOINT_SIGNS;
+ optimize_cdf_table(&fc.cfl_sign[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]");
+ cts_each_dim[0] = CFL_ALPHA_CONTEXTS;
+ cts_each_dim[1] = CFL_ALPHABET_SIZE;
+ optimize_cdf_table(&fc.cfl_alpha[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS]"
+ "[CDF_SIZE(CFL_ALPHABET_SIZE)]");
+
+ /* Interpolation filter */
+ cts_each_dim[0] = SWITCHABLE_FILTER_CONTEXTS;
+ cts_each_dim[1] = SWITCHABLE_FILTERS;
+ optimize_cdf_table(&fc.switchable_interp[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]"
+ "[CDF_SIZE(SWITCHABLE_FILTERS)]");
+
+ /* Motion vector referencing */
+ cts_each_dim[0] = NEWMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.newmv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = GLOBALMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = REFMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.refmv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = DRL_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.drl_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ /* ext_inter experiment */
+ /* New compound mode */
+ cts_each_dim[0] = INTER_MODE_CONTEXTS;
+ cts_each_dim[1] = INTER_COMPOUND_MODES;
+ optimize_cdf_table(&fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_"
+ "SIZE(INTER_COMPOUND_MODES)]");
+
+ /* Interintra */
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.interintra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = INTERINTRA_MODES;
+ optimize_cdf_table(&fc.interintra_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE("
+ "INTERINTRA_MODES)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* Compound type */
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = COMPOUND_TYPES - 1;
+ optimize_cdf_table(&fc.compound_type[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_compound_type_cdf"
+ "[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 16;
+ optimize_cdf_table(&fc.wedge_idx[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]");
+
+ /* motion_var and warped_motion experiments */
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = MOTION_MODES;
+ optimize_cdf_table(
+ &fc.motion_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]");
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.obmc[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* Intra/inter flag */
+ cts_each_dim[0] = INTRA_INTER_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.intra_inter[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Single/comp ref flag */
+ cts_each_dim[0] = COMP_INTER_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.comp_inter[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]");
+
+ /* ext_comp_refs experiment */
+ cts_each_dim[0] = COMP_REF_TYPE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = UNI_COMP_REF_CONTEXTS;
+ cts_each_dim[1] = UNIDIR_COMP_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(&fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_"
+ "COMP_REFS - 1][CDF_SIZE(2)]");
+
+ /* Reference frame (single ref) */
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = SINGLE_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]");
+
+ /* ext_refs experiment */
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = FWD_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = BWD_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]");
+
+ /* palette */
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_SIZES;
+ optimize_cdf_table(&fc.palette_y_size[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_y_size_cdf"
+ "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_SIZES;
+ optimize_cdf_table(&fc.palette_uv_size[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_uv_size_cdf"
+ "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_Y_MODE_CONTEXTS;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(&fc.palette_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "const aom_cdf_prob default_palette_y_mode_cdf"
+ "[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = PALETTE_UV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.palette_uv_mode[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_uv_mode_cdf"
+ "[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = PALETTE_SIZES;
+ cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+ cts_each_dim[2] = PALETTE_COLORS;
+ int palette_color_indexes_each_ctx[PALETTE_SIZES] = { 2, 3, 4, 5, 6, 7, 8 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.palette_y_color_index[0][0][0], probsfile, 3, cts_each_dim,
+ palette_color_indexes_each_ctx,
+ "const aom_cdf_prob default_palette_y_color_index_cdf[PALETTE_SIZES]"
+ "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+ cts_each_dim[0] = PALETTE_SIZES;
+ cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+ cts_each_dim[2] = PALETTE_COLORS;
+ optimize_cdf_table_var_modes_3d(
+ &fc.palette_uv_color_index[0][0][0], probsfile, 3, cts_each_dim,
+ palette_color_indexes_each_ctx,
+ "const aom_cdf_prob default_palette_uv_color_index_cdf[PALETTE_SIZES]"
+ "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+ /* Transform size */
+ cts_each_dim[0] = TXFM_PARTITION_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Skip flag */
+ cts_each_dim[0] = SKIP_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.skip_txfm[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Skip mode flag */
+ cts_each_dim[0] = SKIP_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.skip_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ /* joint compound flag */
+ cts_each_dim[0] = COMP_INDEX_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.compound_index[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_compound_idx_cdfs"
+ "[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = COMP_GROUP_IDX_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.comp_group_idx[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_comp_group_idx_cdfs"
+ "[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]");
+
+ /* intrabc */
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(
+ &fc.intrabc[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)]");
+
+ /* filter_intra experiment */
+ cts_each_dim[0] = FILTER_INTRA_MODES;
+ optimize_cdf_table(
+ &fc.filter_intra_mode[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* restoration type */
+ cts_each_dim[0] = RESTORE_SWITCHABLE_TYPES;
+ optimize_cdf_table(&fc.switchable_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_switchable_restore_cdf"
+ "[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]");
+
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(&fc.wiener_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_wiener_restore_cdf"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(&fc.sgrproj_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_sgrproj_restore_cdf"
+ "[CDF_SIZE(2)]");
+
+ /* intra tx size */
+ cts_each_dim[0] = MAX_TX_CATS;
+ cts_each_dim[1] = TX_SIZE_CONTEXTS;
+ cts_each_dim[2] = MAX_TX_DEPTH + 1;
+ int intra_tx_sizes_each_ctx[MAX_TX_CATS] = { 2, 3, 3, 3 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.intra_tx_size[0][0][0], probsfile, 3, cts_each_dim,
+ intra_tx_sizes_each_ctx,
+ "static const aom_cdf_prob default_tx_size_cdf"
+ "[MAX_TX_CATS][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + 1)]");
+
+ /* transform coding */
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = TXB_SKIP_CONTEXTS;
+ cts_each_dim[3] = 2;
+ optimize_cdf_table(&fc.txb_skip[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob "
+ "av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES]"
+ "[TXB_SKIP_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = EOB_COEF_CONTEXTS;
+ cts_each_dim[4] = 2;
+ optimize_cdf_table(
+ &fc.eob_extra[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_extra_cdfs "
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 5;
+ optimize_cdf_table(&fc.eob_multi16[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi16_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(5)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 6;
+ optimize_cdf_table(&fc.eob_multi32[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi32_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(6)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 7;
+ optimize_cdf_table(&fc.eob_multi64[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi64_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(7)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 8;
+ optimize_cdf_table(&fc.eob_multi128[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi128_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(8)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 9;
+ optimize_cdf_table(&fc.eob_multi256[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi256_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(9)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 10;
+ optimize_cdf_table(&fc.eob_multi512[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi512_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(10)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 11;
+ optimize_cdf_table(&fc.eob_multi1024[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi1024_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(11)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = LEVEL_CONTEXTS;
+ cts_each_dim[4] = BR_CDF_SIZE;
+ optimize_cdf_table(&fc.coeff_lps_multi[0][0][0][0][0], probsfile, 5,
+ cts_each_dim,
+ "static const aom_cdf_prob "
+ "av1_default_coeff_lps_multi_cdfs[TOKEN_CDF_Q_CTXS]"
+ "[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]"
+ "[CDF_SIZE(BR_CDF_SIZE)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = SIG_COEF_CONTEXTS;
+ cts_each_dim[4] = NUM_BASE_LEVELS + 2;
+ optimize_cdf_table(
+ &fc.coeff_base_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_coeff_base_multi_cdfs"
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]"
+ "[CDF_SIZE(NUM_BASE_LEVELS + 2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = SIG_COEF_CONTEXTS_EOB;
+ cts_each_dim[4] = NUM_BASE_LEVELS + 1;
+ optimize_cdf_table(
+ &fc.coeff_base_eob_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs"
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]"
+ "[CDF_SIZE(NUM_BASE_LEVELS + 1)]");
+
+ fclose(statsfile);
+ fclose(logfile);
+ fclose(probsfile);
+
+ return 0;
+}
diff --git a/third_party/aom/tools/auto_refactor/auto_refactor.py b/third_party/aom/tools/auto_refactor/auto_refactor.py
new file mode 100644
index 0000000000..dd0d4415f9
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/auto_refactor.py
@@ -0,0 +1,919 @@
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+from __future__ import print_function
+import sys
+import os
+import operator
+from pycparser import c_parser, c_ast, parse_file
+from math import *
+
+from inspect import currentframe, getframeinfo
+from collections import deque
+
+
def debug_print(frameinfo):
  """Print an error banner naming the source location held by `frameinfo`.

  Args:
    frameinfo: object exposing `filename` and `lineno` attributes (callers in
      this file pass the result of inspect.getframeinfo()).
  """
  source_file = frameinfo.filename
  line_number = frameinfo.lineno
  print('******** ERROR:', source_file, line_number, '********')
+
+
class StructItem():
  """Record describing one struct or union definition.

  Attributes:
    typedef_name: typedef alias of the struct, or None.
    struct_name: struct/union tag, or None.
    struct_node: AST node of the definition, or None.
    is_union: True when the record describes a union.
    child_decl_map: member name -> parsed declaration status; None until
      compute_child_decl_map() has been called.
  """

  def __init__(self,
               typedef_name=None,
               struct_name=None,
               struct_node=None,
               is_union=False):
    self.typedef_name = typedef_name
    self.struct_name = struct_name
    self.struct_node = struct_node
    self.is_union = is_union
    self.child_decl_map = None

  def __str__(self):
    fields = (self.typedef_name, self.struct_name, self.is_union)
    return ' '.join(str(field) for field in fields)

  def compute_child_decl_map(self, struct_info):
    """Rebuild child_decl_map from the member declarations of struct_node.

    A member without a name is treated as an anonymous aggregate: each of its
    own members is registered directly in this item's map.
    """
    members = {}
    self.child_decl_map = members
    node = self.struct_node
    if node is None or node.decls is None:
      return
    for member in node.decls:
      if member.name is not None:
        status = parse_decl_node(struct_info, member)
        members[status.name] = status
        continue
      for inner in member.type.decls:
        inner_status = parse_decl_node(struct_info, inner)
        members[inner.name] = inner_status

  def get_child_decl_status(self, decl_name):
    """Return the stored status for member `decl_name`.

    Prints a diagnostic and returns None when the map has not been computed
    or the member is not present.
    """
    members = self.child_decl_map
    if members is None:
      debug_print(getframeinfo(currentframe()))
      print('child_decl_map is None')
      return None
    if decl_name in members:
      return members[decl_name]
    debug_print(getframeinfo(currentframe()))
    print(decl_name, 'does not exist ')
    return None
+
+
class StructInfo():
  """Registry of the struct, union and enum definitions found in an AST.

  Struct/union records (StructItem) can be looked up by tag name or by
  typedef name; enum nodes can be looked up by enum name or by the name of
  any of their enumerators.
  """

  def __init__(self):
    self.struct_name_dic = {}  # struct/union tag -> StructItem
    self.typedef_name_dic = {}  # typedef name -> StructItem
    self.enum_value_dic = {}  # enum value -> enum_node
    self.enum_name_dic = {}  # enum name -> enum_node
    self.struct_item_list = []

  def get_struct_by_typedef_name(self, typedef_name):
    """Return the StructItem registered under `typedef_name`, or None."""
    return self.typedef_name_dic.get(typedef_name)

  def get_struct_by_struct_name(self, struct_name):
    """Return the StructItem registered under tag `struct_name`.

    Prints a diagnostic and returns None when the tag is unknown.
    """
    if struct_name in self.struct_name_dic:
      return self.struct_name_dic[struct_name]
    debug_print(getframeinfo(currentframe()))
    print('Cant find', struct_name)
    return None

  def update_struct_item_list(self):
    """Collect every known StructItem and compute its child_decl_map."""
    # Collect all struct_items from struct_name_dic and typedef_name_dic
    # Compute child_decl_map for each struct item.
    for struct_item in self.struct_name_dic.values():
      struct_item.compute_child_decl_map(self)
      self.struct_item_list.append(struct_item)

    for struct_item in self.typedef_name_dic.values():
      # Items that also have a registered tag were handled by the loop above.
      if struct_item.struct_name not in self.struct_name_dic:
        struct_item.compute_child_decl_map(self)
        self.struct_item_list.append(struct_item)

  def update_enum(self, enum_node):
    """Register `enum_node` under its name and under each enumerator name."""
    if enum_node.name is not None:
      self.enum_name_dic[enum_node.name] = enum_node

    if enum_node.values is not None:
      for enumerator in enum_node.values.enumerators:
        self.enum_value_dic[enumerator.name] = enum_node

  def update(self,
             typedef_name=None,
             struct_name=None,
             struct_node=None,
             is_union=False):
    """Register or refresh a struct/union record.

    T: typedef_name S: struct_name N: struct_node

            T S N
    case 1: o o o
    typedef struct P {
      int u;
    } K;

            T S N
    case 2: o o x
    typedef struct P K;

            T S N
    case 3: x o o
    struct P {
      int u;
    };

            T S N
    case 4: o x o
    typedef struct {
      int u;
    } K;

    An existing record found under either name is reused (a typedef match
    takes precedence over a tag match); otherwise a new StructItem is
    created. A node that carries a member list replaces the record's node.
    """
    struct_item = None

    # Check whether struct_name or typedef_name is already in the dictionary
    if struct_name in self.struct_name_dic:
      struct_item = self.struct_name_dic[struct_name]

    if typedef_name in self.typedef_name_dic:
      struct_item = self.typedef_name_dic[typedef_name]

    if struct_item is None:
      struct_item = StructItem(typedef_name, struct_name, struct_node, is_union)

    # struct_node defaults to None, so it must be checked before its member
    # list is inspected; previously a call without a node raised
    # AttributeError.
    if struct_node is not None and struct_node.decls is not None:
      struct_item.struct_node = struct_node

    if struct_name is not None:
      self.struct_name_dic[struct_name] = struct_item

    if typedef_name is not None:
      self.typedef_name_dic[typedef_name] = struct_item
+
+
+ # AST visitor that records the struct, union, enum and typedef definitions
+ # it encounters into a StructInfo instance (self.struct_info).
+ class StructDefVisitor(c_ast.NodeVisitor):
+
+ def __init__(self):
+ self.struct_info = StructInfo()
+
+ # A struct node that carries a member list (node.decls) is registered under
+ # its tag name; no typedef name is passed.
+ def visit_Struct(self, node):
+ if node.decls != None:
+ self.struct_info.update(None, node.name, node)
+ self.generic_visit(node)
+
+ # Same as visit_Struct, with the fourth argument (is_union) set to True.
+ def visit_Union(self, node):
+ if node.decls != None:
+ self.struct_info.update(None, node.name, node, True)
+ self.generic_visit(node)
+
+ # Every enum node is handed to StructInfo.update_enum().
+ def visit_Enum(self, node):
+ self.struct_info.update_enum(node)
+ self.generic_visit(node)
+
+ # Handles typedefs whose type is a TypeDecl wrapping a Struct or a Union:
+ # the typedef name, the tag name and the struct/union node are registered
+ # together. Other wrapped types are not registered here.
+ def visit_Typedef(self, node):
+ if node.type.__class__.__name__ == 'TypeDecl':
+ typedecl = node.type
+ if typedecl.type.__class__.__name__ == 'Struct':
+ struct_node = typedecl.type
+ typedef_name = node.name
+ struct_name = struct_node.name
+ self.struct_info.update(typedef_name, struct_name, struct_node)
+ elif typedecl.type.__class__.__name__ == 'Union':
+ union_node = typedecl.type
+ typedef_name = node.name
+ union_name = union_node.name
+ self.struct_info.update(typedef_name, union_name, union_node, True)
+ # TODO(angiebird): Do we need to deal with enum here?
+ self.generic_visit(node)
+
+
def build_struct_info(ast):
  """Walk `ast` with a StructDefVisitor and return the resulting StructInfo.

  The returned object has had update_struct_item_list() called on it.
  """
  visitor = StructDefVisitor()
  visitor.visit(ast)
  visitor.struct_info.update_struct_item_list()
  return visitor.struct_info
+
+
class DeclStatus():
  """Parsed summary of a single declaration.

  Attributes:
    name: declared identifier.
    struct_item: record of the struct/union type of the declaration, or None.
    is_ptr_decl: flag supplied by the caller (parse_decl_node passes the
      result of peel_ptr_decl()).
  """

  def __init__(self, name, struct_item=None, is_ptr_decl=False):
    self.name = name
    self.struct_item = struct_item
    self.is_ptr_decl = is_ptr_decl

  def get_child_decl_status(self, decl_name):
    """Delegate the member lookup to struct_item; None when there is none."""
    owner = self.struct_item
    if owner is None:
      #TODO(angiebird): 2. Investigate the situation when a struct's definition can't be found.
      return None
    return owner.get_child_decl_status(decl_name)

  def __str__(self):
    fields = (self.struct_item, self.name, self.is_ptr_decl)
    return ' '.join(str(field) for field in fields)
+
+
def peel_ptr_decl(decl_type_node):
  """Strip leading PtrDecl / ArrayDecl layers from a declaration type.

  Returns:
    (is_ptr_decl, node): whether at least one layer was removed, and the
    first node whose class is neither PtrDecl nor ArrayDecl.
  """
  wrapper_kinds = ('PtrDecl', 'ArrayDecl')
  inner = decl_type_node
  peeled_any = False
  while inner.__class__.__name__ in wrapper_kinds:
    peeled_any = True
    inner = inner.type
  return peeled_any, inner
+
+
def parse_peeled_decl_type_node(struct_info, node):
  """Resolve the struct/union record for an already-peeled type node.

  Returns None when `node` is not a TypeDecl, when its inner type is an enum
  or an unrecognized kind, or when the lookup in `struct_info` finds nothing.
  An unnamed struct/union gets a fresh StructItem whose member map is
  computed immediately.
  """
  if node.__class__.__name__ != 'TypeDecl':
    # debug_print(getframeinfo(currentframe()))
    # print(node.__class__.__name__)
    #TODO(angiebird): Do we need to take care of this part?
    return None

  inner = node.type
  kind = inner.__class__.__name__

  if kind == 'IdentifierType':
    return struct_info.get_struct_by_typedef_name(inner.names[0])

  if kind in ('Struct', 'Union'):
    # TODO(angiebird): Special treatment for Union?
    if inner.name is not None:
      return struct_info.get_struct_by_struct_name(inner.name)
    anonymous_item = StructItem(None, None, inner, kind == 'Union')
    anonymous_item.compute_child_decl_map(struct_info)
    return anonymous_item

  if kind == 'Enum':
    # TODO(angiebird): Special treatment for Enum?
    return None

  print('Unrecognized peeled_decl_type_node.type', kind)
  return None
+
+
def parse_decl_node(struct_info, decl_node):
  """Build a DeclStatus for `decl_node`.

  The status carries the declared name, the struct/union record of the
  peeled type (None if this decl_node is not a struct_item) and the
  pointer/array flag returned by peel_ptr_decl().
  """
  is_ptr_decl, peeled_type = peel_ptr_decl(decl_node.type)
  struct_item = parse_peeled_decl_type_node(struct_info, peeled_type)
  return DeclStatus(decl_node.name, struct_item, is_ptr_decl)
+
+
def get_lvalue_lead(lvalue_node):
  """Return '&' or '*' when `lvalue_node` is a UnaryOp with that operator.

  Any other node, or a UnaryOp with a different operator, yields None.
  """
  if lvalue_node.__class__.__name__ != 'UnaryOp':
    return None
  for lead in ('&', '*'):
    if lvalue_node.op == lead:
      return lead
  return None
+
+
def parse_lvalue(lvalue_node):
  """Return the id_chain of `lvalue_node`, starting from an empty chain.

  The result is whatever parse_lvalue_recursive() produces, including None.
  """
  return parse_lvalue_recursive(lvalue_node, [])
+
+
def parse_lvalue_recursive(lvalue_node, id_chain):
  """Collect the identifier chain of an lvalue, e.g. cpi->rd->u -> (cpi->rd)->u.

  Field names are appended to `id_chain` (which is modified in place) while
  walking from the outermost expression towards the base identifier; the
  chain is reversed once the base ID is reached so it reads base-first.
  ArrayRef and unary '&' / '*' nodes are stepped through without adding a
  name. Returns None when any other node kind is encountered.
  """
  current = lvalue_node
  while True:
    kind = current.__class__.__name__
    if kind == 'ID':
      id_chain.append(current.name)
      id_chain.reverse()
      return id_chain
    if kind == 'StructRef':
      id_chain.append(current.field.name)
      current = current.name
    elif kind == 'ArrayRef':
      current = current.name
    elif kind == 'UnaryOp' and (current.op == '&' or current.op == '*'):
      current = current.expr
    else:
      return None
+
+
class FuncDefVisitor(c_ast.NodeVisitor):
  """AST visitor that maps each function name to its FuncDef node.

  Attributes:
    func_dictionary: function name -> FuncDef node collected by this
      instance.
  """
  # Retained so code that reads the attribute off the class keeps working;
  # every instance shadows it with its own dict in __init__.
  func_dictionary = {}

  def __init__(self):
    super(FuncDefVisitor, self).__init__()
    # A class-level dict is shared by all instances, so functions collected
    # from one AST used to leak into the dictionary built for the next one.
    self.func_dictionary = {}

  def visit_FuncDef(self, node):
    """Record `node` under the name of the function it defines."""
    self.func_dictionary[node.decl.name] = node
+
+
def build_func_dictionary(ast):
  """Visit `ast` with a FuncDefVisitor and return its func_dictionary."""
  visitor = FuncDefVisitor()
  visitor.visit(ast)
  collected = visitor.func_dictionary
  return collected
+
+
def get_func_start_coord(func_node):
  """Return the `coord` attribute of `func_node` as the function's start."""
  start_coord = func_node.coord
  return start_coord
+
+
def find_end_node(node):
  """Follow the last child repeatedly and return the first childless node.

  `node` and its descendants must be iterable; iterating a node yields its
  children.
  """
  current = node
  while True:
    children = list(current)
    if not children:
      return current
    current = children[-1]
+
+
def get_func_end_coord(func_node):
  """Return the `coord` of the node that find_end_node() picks for `func_node`."""
  end_node = find_end_node(func_node)
  return end_node.coord
+
+
def get_func_size(func_node):
  """Return the number of lines spanned by `func_node`.

  The span is measured from the start coord to the end coord, inclusive.
  Returns None when the two coords refer to different files.
  """
  first = get_func_start_coord(func_node)
  last = get_func_end_coord(func_node)
  if not first.file == last.file:
    return None
  return last.line - first.line + 1
+
+
def save_object(obj, filename):
  """Pickle `obj` into `filename` using the highest available protocol.

  Args:
    obj: any picklable object.
    filename: path of the file to create or overwrite.
  """
  # The file never imports pickle at module level, so the original body
  # raised NameError; import it here to keep this function self-contained.
  import pickle
  with open(filename, 'wb') as obj_fp:
    pickle.dump(obj, obj_fp, protocol=-1)
+
+
def load_object(filename):
  """Load and return the object pickled in `filename`.

  Args:
    filename: path of a file written with pickle (e.g. by save_object()).

  Returns:
    The unpickled object.
  """
  # The file never imports pickle at module level, so the original body
  # raised NameError; import it here to keep this function self-contained.
  import pickle
  # NOTE(review): pickle.load can execute arbitrary code; only use this on
  # files produced by this tool, never on untrusted input.
  with open(filename, 'rb') as obj_fp:
    return pickle.load(obj_fp)
+
+
def get_av1_ast(gen_ast=False):
  """Parse './av1_pp.c' with parse_file() and return the resulting AST.

  Progress messages are printed before and after parsing. `gen_ast` is
  accepted but not used by the body.
  """
  # TODO(angiebird): Generalize this path
  print('generate ast')
  parsed_ast = parse_file('./av1_pp.c')
  #save_object(ast, ast_file)
  print('finished generate ast')
  return parsed_ast
+
+
def get_func_param_id_map(func_def_node):
  """Map each parameter name of a function definition to its declaration.

  Args:
    func_def_node: function definition node; its declaration type must
      expose `args`, which is either None or an object with a `params` list.

  Returns:
    dict of parameter name -> parameter declaration node; empty when the
    function is declared with an empty parameter list.
  """
  param_list = func_def_node.decl.type.args
  # pycparser leaves `args` as None for a declarator such as `f()`; the
  # original dereferenced it unconditionally and raised AttributeError.
  if param_list is None:
    return {}
  param_id_map = {}
  for decl in param_list.params:
    param_id_map[decl.name] = decl
  return param_id_map
+
+
class IDTreeStack():
  """Stack of id trees with a separate global tree as final fallback.

  Lookups of a leading identifier ("seed") search the stacked trees from the
  most recently pushed one outwards, then the global tree.

  Attributes:
    stack: deque of id trees; the right end is the innermost one.
    global_id_tree: tree consulted last, or None when there is none.
  """

  def __init__(self, global_id_tree):
    self.stack = deque()
    self.global_id_tree = global_id_tree

  def add_link_node(self, node, link_id_chain):
    """Point `node` at the node addressed by `link_id_chain`.

    The target is created via add_id_node(), so it may be None when the seed
    of the chain is unknown.
    """
    link_node = self.add_id_node(link_id_chain)
    node.link_node = link_node
    node.link_id_chain = link_id_chain

  def push_id_tree(self, id_tree=None):
    """Push `id_tree` (a fresh IDStatusNode when None) and return it."""
    if id_tree is None:
      id_tree = IDStatusNode()
    self.stack.append(id_tree)
    return id_tree

  def pop_id_tree(self):
    """Remove and return the innermost id tree."""
    return self.stack.pop()

  def add_id_seed_node(self, id_seed, decl_status):
    """Add `id_seed` as a child of the innermost tree and return its node."""
    return self.stack[-1].add_child(id_seed, decl_status)

  def get_id_seed_node(self, id_seed):
    """Find `id_seed` in the stacked trees, innermost first, then globally.

    Returns None when the identifier is not found anywhere.
    """
    for id_tree in reversed(self.stack):
      id_node = id_tree.get_child(id_seed)
      if id_node is not None:
        return id_node

    # FuncInOutVisitor passes global_id_tree=None by default; the original
    # called get_child() on it unconditionally and raised AttributeError for
    # any identifier missing from the stacked trees.
    if self.global_id_tree is None:
      return None
    return self.global_id_tree.get_child(id_seed)

  def add_id_node(self, id_chain):
    """Return the node for `id_chain`, creating descendants as needed.

    Returns None when the first element of the chain is unknown.
    """
    id_seed_node = self.get_id_seed_node(id_chain[0])
    if id_seed_node is None:
      return None
    if len(id_chain) == 1:
      return id_seed_node
    return id_seed_node.add_descendant(id_chain[1:])

  def get_id_node(self, id_chain):
    """Return the existing node for `id_chain`, or None if any part is missing."""
    id_seed_node = self.get_id_seed_node(id_chain[0])
    if id_seed_node is None:
      return None
    if len(id_chain) == 1:
      return id_seed_node
    return id_seed_node.get_descendant(id_chain[1:])

  def top(self):
    """Return the innermost id tree without removing it."""
    return self.stack[-1]
+
+
class IDStatusNode():
  """Node of an identifier tree that tracks assign/refer state.

  A node may be linked to another node (link_node). Most accessors first
  resolve the chain of links to a "concrete" node via get_concrete_node()
  and then read or write that node's state.
  """

  def __init__(self, name=None, root=None):
    self.root = self if root is None else root
    self.name = name

    self.parent = None
    self.children = {}

    self.assign = False
    self.last_assign_coord = None
    self.refer = False
    self.last_refer_coord = None

    self.decl_status = None

    self.link_id_chain = None
    self.link_node = None

    # Re-entrancy marker used by get_concrete_node() to detect link loops.
    self.visit = False

  def set_link_id_chain(self, link_id_chain):
    """Clear the assign state, then link to root's descendant at `link_id_chain`."""
    self.set_assign(False)
    self.link_id_chain = link_id_chain
    self.link_node = self.root.get_descendant(link_id_chain)

  def set_link_node(self, link_node):
    """Clear the assign state, then link directly to `link_node`."""
    self.set_assign(False)
    self.link_id_chain = ['*']
    self.link_node = link_node

  def get_link_id_chain(self):
    return self.link_id_chain

  def get_concrete_node(self):
    """Follow link_node references and return the node they end at.

    Returns None when called on a node that is already being resolved (a
    loop); the caller one level up then resolves to itself.
    """
    if self.visit == True:
      # return None when there is a loop
      return None
    target = self.link_node
    if target is None:
      return self
    self.visit = True
    resolved = target.get_concrete_node()
    self.visit = False
    return self if resolved is None else resolved

  def set_assign(self, assign, coord=None):
    target = self.get_concrete_node()
    target.assign = assign
    target.last_assign_coord = coord

  def get_assign(self):
    return self.get_concrete_node().assign

  def set_refer(self, refer, coord=None):
    target = self.get_concrete_node()
    target.refer = refer
    target.last_refer_coord = coord

  def get_refer(self):
    return self.get_concrete_node().refer

  def set_parent(self, parent):
    self.get_concrete_node().parent = parent

  def add_child(self, name, decl_status=None):
    """Return the child `name` of the concrete node, creating it if absent.

    For a newly created child without an explicit `decl_status`, the status
    is looked up through this node's own decl_status when it has one.
    An existing child is returned unchanged.
    """
    owner = self.get_concrete_node()
    siblings = owner.children
    if name in siblings:
      return siblings[name]
    child = IDStatusNode(name, owner.root)
    siblings[name] = child
    # Check if the child decl_status can be inferred from its parent's
    # decl_status
    if decl_status is None and self.decl_status is not None:
      decl_status = self.decl_status.get_child_decl_status(name)
    child.set_decl_status(decl_status)
    return child

  def get_child(self, name):
    return self.get_concrete_node().children.get(name)

  def add_descendant(self, id_chain):
    """Walk `id_chain` from the concrete node, creating missing nodes.

    Each node on the path gets its parent set; the last node is returned.
    """
    node = self.get_concrete_node()
    for name in id_chain:
      node.add_child(name)
      child = node.get_child(name)
      child.set_parent(node)
      node = child
    return node

  def get_descendant(self, id_chain):
    """Walk `id_chain` from the concrete node; None if any step is missing."""
    node = self.get_concrete_node()
    for name in id_chain:
      node = node.get_child(name)
      if node is None:
        return None
    return node

  def get_children(self):
    return self.get_concrete_node().children

  def set_decl_status(self, decl_status):
    self.get_concrete_node().decl_status = decl_status

  def get_decl_status(self):
    return self.get_concrete_node().decl_status

  def __str__(self):
    if self.link_id_chain is not None:
      return str(self.name) + ' -> ' + ' '.join(self.link_id_chain)
    return str(self.name) + ' a: ' + str(int(self.assign)) + ' r: ' + str(
        int(self.refer))

  def collect_assign_refer_status(self,
                                  id_chain=None,
                                  assign_ls=None,
                                  refer_ls=None):
    """Gather one summary line per assigned / referred node in this subtree.

    Reads this node's own assign/refer fields (links are not followed). The
    path printed for a node omits the first element of `id_chain`.

    Returns:
      (assign_ls, refer_ls): the two lists of summary strings.
    """
    if id_chain is None:
      id_chain = []
    if assign_ls is None:
      assign_ls = []
    if refer_ls is None:
      refer_ls = []

    id_chain.append(self.name)

    def describe(coord):
      return ' '.join([
          ' '.join(id_chain[1:]), 'a:',
          str(int(self.assign)), 'r:',
          str(int(self.refer)),
          str(coord)
      ])

    if self.assign:
      assign_ls.append(describe(self.last_assign_coord))
    if self.refer:
      refer_ls.append(describe(self.last_refer_coord))
    for child in self.children.values():
      child.collect_assign_refer_status(id_chain, assign_ls, refer_ls)
    id_chain.pop()
    return assign_ls, refer_ls

  def show(self):
    """Print the collected assign and refer summaries."""
    assign_ls, refer_ls = self.collect_assign_refer_status()
    for title, lines in (('---- assign ----', assign_ls),
                         ('---- refer ----', refer_ls)):
      print(title)
      for line in lines:
        print(line)
+
+
+class FuncInOutVisitor(c_ast.NodeVisitor):
+
def __init__(self,
             func_def_node,
             struct_info,
             func_dictionary,
             keep_body_id_tree=True,
             call_param_map=None,
             global_id_tree=None,
             func_history=None,
             unknown=None):
  """Set up the visitor state for one function definition.

  A fresh id tree is pushed and every parameter of `func_def_node` is added
  to it as a seed node. Parameters that appear in `call_param_map` are
  linked to the node given there. `func_history` and `unknown` are reused
  when provided and replaced by an empty dict / list when None.
  """
  self.func_dictionary = func_dictionary
  self.struct_info = struct_info
  self.param_id_map = get_func_param_id_map(func_def_node)
  self.parent_node = None
  self.global_id_tree = global_id_tree
  self.body_id_tree = None
  self.keep_body_id_tree = keep_body_id_tree
  self.func_history = {} if func_history is None else func_history
  self.unknown = [] if unknown is None else unknown

  self.id_tree_stack = IDTreeStack(global_id_tree)
  self.id_tree_stack.push_id_tree()

  #TODO move this part into a function
  for param, decl_node in self.param_id_map.items():
    decl_status = parse_decl_node(self.struct_info, decl_node)
    seed_node = self.id_tree_stack.add_id_seed_node(decl_status.name,
                                                    decl_status)
    if call_param_map is None or param not in call_param_map:
      continue
    # This is a function call.
    # Map the input parameter to the caller's nodes
    # TODO(angiebird): Can we use add_link_node here?
    seed_node.set_link_node(call_param_map[param])
+
def get_id_tree_stack(self):
  """Return the IDTreeStack held in self.id_tree_stack."""
  tree_stack = self.id_tree_stack
  return tree_stack
+
def generic_visit(self, node):
  """Visit every child of `node` with self.parent_node set to `node`.

  The previous parent_node value is restored after the children are done.
  """
  saved_parent, self.parent_node = self.parent_node, node
  for child in node:
    self.visit(child)
  self.parent_node = saved_parent
+
# TODO rename
def add_new_id_tree(self, node):
  """Visit the children of `node` inside a freshly pushed id tree.

  The tree is popped afterwards. When no parent node is set and
  keep_body_id_tree is True, the popped tree is stored in body_id_tree.
  """
  self.id_tree_stack.push_id_tree()
  self.generic_visit(node)
  scope_tree = self.id_tree_stack.pop_id_tree()
  at_function_body = self.parent_node is None
  if at_function_body and self.keep_body_id_tree == True:
    # this is function body
    self.body_id_tree = scope_tree
+
def visit_For(self, node):
  """Handle a For node by delegating to add_new_id_tree()."""
  for_node = node
  self.add_new_id_tree(for_node)
+
def visit_Compound(self, node):
  """Handle a Compound node by delegating to add_new_id_tree()."""
  compound_node = node
  self.add_new_id_tree(compound_node)
+
+ # Record a declaration in the innermost id tree. Declarations whose type is
+ # a FuncDecl are not registered as identifiers.
+ def visit_Decl(self, node):
+ if node.type.__class__.__name__ != 'FuncDecl':
+ decl_status = parse_decl_node(self.struct_info, node)
+ descendant = self.id_tree_stack.add_id_seed_node(decl_status.name,
+ decl_status)
+ if node.init is not None:
+ # process_lvalue() returns None when the initializer cannot be reduced to
+ # an identifier chain or when the chain starts with an enum value.
+ init_id_chain = self.process_lvalue(node.init)
+ if init_id_chain != None:
+ if decl_status.struct_item is None:
+ # Declaration without a struct/union record: the initializer's node is
+ # marked as referred; an unresolved initializer is kept in self.unknown.
+ init_descendant = self.id_tree_stack.add_id_node(init_id_chain)
+ if init_descendant != None:
+ init_descendant.set_refer(True, node.coord)
+ else:
+ self.unknown.append(node)
+ descendant.set_assign(True, node.coord)
+ else:
+ # Declaration with a struct/union record: link the new identifier to the
+ # initializer's chain.
+ self.id_tree_stack.add_link_node(descendant, init_id_chain)
+ else:
+ self.unknown.append(node)
+ else:
+ descendant.set_assign(True, node.coord)
+ self.generic_visit(node)
+
def is_lvalue(self, node):
  """Decide from self.parent_node whether `node` is handled as an lvalue.

  Returns False when there is no parent, when the parent is a StructRef,
  when `node` is the `name` operand of an ArrayRef parent, or when the
  parent is a unary '&' or '*'. Returns True otherwise.
  """
  parent = self.parent_node
  if parent is None:
    # TODO(angiebird): Do every lvalue has parent_node != None?
    return False
  parent_kind = parent.__class__.__name__
  if parent_kind == 'StructRef':
    return False
  if parent_kind == 'ArrayRef' and node == parent.name:
    # if node == self.parent_node.subscript, the node could be lvalue
    return False
  if parent_kind == 'UnaryOp' and (parent.op == '&' or parent.op == '*'):
    return False
  return True
+
def process_lvalue(self, node):
  """Return the id_chain of `node`, or None when it should be ignored.

  None is returned when parse_lvalue() yields None or when the first
  identifier of the chain is a key of struct_info.enum_value_dic.
  """
  id_chain = parse_lvalue(node)
  if id_chain is None:
    return None
  if id_chain[0] in self.struct_info.enum_value_dic:
    return None
  return id_chain
+
+ def process_possible_lvalue(self, node):
+ """Update assign/refer status for `node` when is_lvalue() accepts it.
+
+ The node is resolved to an id chain and then to an id-tree node. Anything
+ that cannot be resolved (no id chain, no tree node, or no declaration
+ status) is appended to self.unknown. The resolved node is then marked via
+ set_assign() / set_refer(), or linked to another id chain, depending on
+ the class of the parent node: 'Assignment', 'UnaryOp', 'Decl',
+ 'ExprList', or anything else.
+ """
+ if self.is_lvalue(node):
+ id_chain = self.process_lvalue(node)
+ # NOTE(review): presumably the leading '&' / '*' operator of the
+ # expression, if any - confirm against get_lvalue_lead().
+ lead_char = get_lvalue_lead(node)
+ # make sure the id is not an enum value
+ if id_chain == None:
+ self.unknown.append(node)
+ return
+ descendant = self.id_tree_stack.add_id_node(id_chain)
+ if descendant == None:
+ self.unknown.append(node)
+ return
+ decl_status = descendant.get_decl_status()
+ if decl_status == None:
+ # No declaration status: mark the id as both assigned and referred and
+ # record the node as unknown.
+ descendant.set_assign(True, node.coord)
+ descendant.set_refer(True, node.coord)
+ self.unknown.append(node)
+ return
+ if self.parent_node.__class__.__name__ == 'Assignment':
+ # node is the target of the assignment
+ if node is self.parent_node.lvalue:
+ if decl_status.struct_item != None:
+ if len(id_chain) > 1:
+ descendant.set_assign(True, node.coord)
+ elif len(id_chain) == 1:
+ if lead_char == '*':
+ descendant.set_assign(True, node.coord)
+ else:
+ # A single id with a struct item on the left: link it to the id chain
+ # of the right-hand side when that side can be resolved.
+ right_id_chain = self.process_lvalue(self.parent_node.rvalue)
+ if right_id_chain != None:
+ self.id_tree_stack.add_link_node(descendant, right_id_chain)
+ else:
+ #TODO(angiebird): 1.Find a better way to deal with this case.
+ descendant.set_assign(True, node.coord)
+ else:
+ debug_print(getframeinfo(currentframe()))
+ else:
+ descendant.set_assign(True, node.coord)
+ # node is the value being assigned
+ elif node is self.parent_node.rvalue:
+ if decl_status.struct_item is None:
+ descendant.set_refer(True, node.coord)
+ if lead_char == '&':
+ descendant.set_assign(True, node.coord)
+ else:
+ left_id_chain = self.process_lvalue(self.parent_node.lvalue)
+ left_lead_char = get_lvalue_lead(self.parent_node.lvalue)
+ if left_id_chain != None:
+ if len(left_id_chain) > 1:
+ descendant.set_refer(True, node.coord)
+ elif len(left_id_chain) == 1:
+ if left_lead_char == '*':
+ descendant.set_refer(True, node.coord)
+ else:
+ #TODO(angiebird): Check whether the other node is linked to this node.
+ pass
+ else:
+ self.unknown.append(self.parent_node.lvalue)
+ debug_print(getframeinfo(currentframe()))
+ else:
+ self.unknown.append(self.parent_node.lvalue)
+ debug_print(getframeinfo(currentframe()))
+ else:
+ debug_print(getframeinfo(currentframe()))
+ elif self.parent_node.__class__.__name__ == 'UnaryOp':
+ # TODO(angiebird): Consider +=, *=, -=, /= etc
+ # Pre/post increment and decrement both read and write the operand.
+ if self.parent_node.op == '--' or self.parent_node.op == '++' or\
+ self.parent_node.op == 'p--' or self.parent_node.op == 'p++':
+ descendant.set_assign(True, node.coord)
+ descendant.set_refer(True, node.coord)
+ else:
+ descendant.set_refer(True, node.coord)
+ elif self.parent_node.__class__.__name__ == 'Decl':
+ #The logic is at visit_Decl
+ pass
+ elif self.parent_node.__class__.__name__ == 'ExprList':
+ #The logic is at visit_FuncCall
+ pass
+ else:
+ descendant.set_refer(True, node.coord)
+
+ def visit_ID(self, node):
+ """Process an identifier as a possible lvalue unless it names a callee."""
+ # If the parent is a FuncCall, this ID is a function name.
+ if self.parent_node.__class__.__name__ != 'FuncCall':
+ self.process_possible_lvalue(node)
+ self.generic_visit(node)
+
+ def visit_StructRef(self, node):
+ """Process a 'StructRef' node as a possible lvalue, then visit its children."""
+ self.process_possible_lvalue(node)
+ self.generic_visit(node)
+
+ def visit_ArrayRef(self, node):
+ """Process an 'ArrayRef' node as a possible lvalue, then visit its children."""
+ self.process_possible_lvalue(node)
+ self.generic_visit(node)
+
+ def visit_UnaryOp(self, node):
+ """Process unary '&' and '*' expressions as possible lvalues."""
+ if node.op == '&' or node.op == '*':
+ self.process_possible_lvalue(node)
+ self.generic_visit(node)
+
+ def visit_FuncCall(self, node):
+ """Follow a call into the callee's body when the callee is known.
+
+ Only calls whose callee is a plain 'ID' are considered. The first time a
+ callee name found in func_dictionary is seen, it is recorded in
+ func_history, its arguments are mapped with process_func_call(), and a
+ nested FuncInOutVisitor is run over the callee's body. Names already in
+ func_history are not analysed again.
+ """
+ if node.name.__class__.__name__ == 'ID':
+ if node.name.name in self.func_dictionary:
+ if node.name.name not in self.func_history:
+ self.func_history[node.name.name] = True
+ func_def_node = self.func_dictionary[node.name.name]
+ call_param_map = self.process_func_call(node, func_def_node)
+
+ # The nested visitor shares global_id_tree, func_history and unknown with
+ # this one; the fourth argument (False) is the one FuncAnalyzer passes as
+ # True for the top-level function.
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary, False,
+ call_param_map, self.global_id_tree,
+ self.func_history, self.unknown)
+ visitor.visit(func_def_node.body)
+ else:
+ self.unknown.append(node)
+ self.generic_visit(node)
+
+ def process_func_call(self, func_call_node, func_def_node):
+ # set up a refer/assign for func parameters
+ # return call_param_map
+ call_param_ls = func_call_node.args.exprs
+ call_param_map = {}
+
+ func_decl = func_def_node.decl.type
+ decl_param_ls = func_decl.args.params
+ for param_node, decl_node in zip(call_param_ls, decl_param_ls):
+ id_chain = self.process_lvalue(param_node)
+ if id_chain != None:
+ descendant = self.id_tree_stack.add_id_node(id_chain)
+ if descendant == None:
+ self.unknown.append(param_node)
+ else:
+ decl_status = descendant.get_decl_status()
+ if decl_status != None:
+ if decl_status.struct_item == None:
+ if decl_status.is_ptr_decl == True:
+ descendant.set_assign(True, param_node.coord)
+ descendant.set_refer(True, param_node.coord)
+ else:
+ descendant.set_refer(True, param_node.coord)
+ else:
+ call_param_map[decl_node.name] = descendant
+ else:
+ self.unknown.append(param_node)
+ else:
+ self.unknown.append(param_node)
+ return call_param_map
+
+
+def build_global_id_tree(ast, struct_info):
+ global_id_tree = IDStatusNode()
+ for node in ast.ext:
+ if node.__class__.__name__ == 'Decl':
+ # id tree is for tracking assign/refer status
+ # we don't care about function id because they can't be changed
+ if node.type.__class__.__name__ != 'FuncDecl':
+ decl_status = parse_decl_node(struct_info, node)
+ descendant = global_id_tree.add_child(decl_status.name, decl_status)
+ return global_id_tree
+
+
+class FuncAnalyzer():
+
+ def __init__(self):
+ self.ast = get_av1_ast()
+ self.struct_info = build_struct_info(self.ast)
+ self.func_dictionary = build_func_dictionary(self.ast)
+ self.global_id_tree = build_global_id_tree(self.ast, self.struct_info)
+
+ def analyze(self, func_name):
+ if func_name in self.func_dictionary:
+ func_def_node = self.func_dictionary[func_name]
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary, True, None,
+ self.global_id_tree)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ root.top().show()
+ else:
+ print(func_name, "doesn't exist")
+
+
+if __name__ == '__main__':
+ fa = FuncAnalyzer()
+ fa.analyze('tpl_get_satd_cost')
+ pass
diff --git a/third_party/aom/tools/auto_refactor/av1_preprocess.py b/third_party/aom/tools/auto_refactor/av1_preprocess.py
new file mode 100644
index 0000000000..ea76912cf1
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/av1_preprocess.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+import os
+import sys
+
+
+def is_code_file(filename):
+ return filename.endswith(".c") or filename.endswith(".h")
+
+
+def is_simd_file(filename):
+ simd_keywords = [
+ "avx2", "sse2", "sse3", "ssse3", "sse4", "dspr2", "neon", "msa", "simd",
+ "x86"
+ ]
+ for keyword in simd_keywords:
+ if filename.find(keyword) >= 0:
+ return True
+ return False
+
+
+def get_code_file_list(path, exclude_file_set):
+ code_file_list = []
+ for cur_dir, sub_dir, file_list in os.walk(path):
+ for filename in file_list:
+ if is_code_file(filename) and not is_simd_file(
+ filename) and filename not in exclude_file_set:
+ file_path = os.path.join(cur_dir, filename)
+ code_file_list.append(file_path)
+ return code_file_list
+
+
+def av1_exclude_file_set():
+ exclude_file_set = {
+ "cfl_ppc.c",
+ "ppc_cpudetect.c",
+ }
+ return exclude_file_set
+
+
+def get_av1_pp_command(fake_header_dir, code_file_list):
+ pre_command = "gcc -w -nostdinc -E -I./ -I../ -I" + fake_header_dir + (" "
+ "-D'ATTRIBUTE_PACKED='"
+ " "
+ "-D'__attribute__(x)='"
+ " "
+ "-D'__inline__='"
+ " "
+ "-D'float_t=float'"
+ " "
+ "-D'DECLARE_ALIGNED(n,"
+ " typ,"
+ " "
+ "val)=typ"
+ " val'"
+ " "
+ "-D'volatile='"
+ " "
+ "-D'AV1_K_MEANS_DIM=2'"
+ " "
+ "-D'INLINE='"
+ " "
+ "-D'AOM_INLINE='"
+ " "
+ "-D'AOM_FORCE_INLINE='"
+ " "
+ "-D'inline='"
+ )
+ return pre_command + " " + " ".join(code_file_list)
+
+
+def modify_av1_rtcd(build_dir):
+ av1_rtcd = os.path.join(build_dir, "config/av1_rtcd.h")
+ fp = open(av1_rtcd)
+ string = fp.read()
+ fp.close()
+ new_string = string.replace("#ifdef RTCD_C", "#if 0")
+ fp = open(av1_rtcd, "w")
+ fp.write(new_string)
+ fp.close()
+
+
+def preprocess_av1(aom_dir, build_dir, fake_header_dir):
+ cur_dir = os.getcwd()
+ output = os.path.join(cur_dir, "av1_pp.c")
+ path_list = [
+ os.path.join(aom_dir, "av1/encoder"),
+ os.path.join(aom_dir, "av1/common")
+ ]
+ code_file_list = []
+ for path in path_list:
+ path = os.path.realpath(path)
+ code_file_list.extend(get_code_file_list(path, av1_exclude_file_set()))
+ modify_av1_rtcd(build_dir)
+ cmd = get_av1_pp_command(fake_header_dir, code_file_list) + " >" + output
+ os.chdir(build_dir)
+ os.system(cmd)
+ os.chdir(cur_dir)
+
+
+if __name__ == "__main__":
+ aom_dir = sys.argv[1]
+ build_dir = sys.argv[2]
+ fake_header_dir = sys.argv[3]
+ preprocess_av1(aom_dir, build_dir, fake_header_dir)
diff --git a/third_party/aom/tools/auto_refactor/c_files/decl_status_code.c b/third_party/aom/tools/auto_refactor/c_files/decl_status_code.c
new file mode 100644
index 0000000000..a444553bb1
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/decl_status_code.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct S1 {
+ int x;
+} T1;
+
+int parse_decl_node_2(void) { int arr[3]; }
+
+int parse_decl_node_3(void) { int *a; }
+
+int parse_decl_node_4(void) { T1 t1[3]; }
+
+int parse_decl_node_5(void) { T1 *t2[3]; }
+
+int parse_decl_node_6(void) { T1 t3[3][3]; }
+
+int main(void) {
+ int a;
+ T1 t1;
+ struct S1 s1;
+ T1 *t2;
+}
diff --git a/third_party/aom/tools/auto_refactor/c_files/func_in_out.c b/third_party/aom/tools/auto_refactor/c_files/func_in_out.c
new file mode 100644
index 0000000000..7f37bbae7e
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/func_in_out.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct XD {
+ int u;
+ int v;
+} XD;
+
+typedef struct RD {
+ XD *xd;
+ int u;
+ int v;
+} RD;
+
+typedef struct VP9_COMP {
+ int y;
+ RD *rd;
+ RD rd2;
+ int arr[3];
+ union {
+ int z;
+ };
+ struct {
+ int w;
+ };
+} VP9_COMP;
+
+int sub_func(VP9_COMP *cpi, int b) {
+ int d;
+ cpi->y += 1;
+ cpi->y -= b;
+ d = cpi->y * 2;
+ return d;
+}
+
+int func_id_forrest_show(VP9_COMP *cpi, int b) {
+ int c = 2;
+ int x = cpi->y + c * 2 + 1;
+ int y;
+ RD *rd = cpi->rd;
+ y = cpi->rd->u;
+ return x + y;
+}
+
+int func_link_id_chain_1(VP9_COMP *cpi) {
+ RD *rd = cpi->rd;
+ rd->u = 0;
+}
+
+int func_link_id_chain_2(VP9_COMP *cpi) {
+ RD *rd = cpi->rd;
+ XD *xd = rd->xd;
+ xd->u = 0;
+}
+
+int func_assign_refer_status_1(VP9_COMP *cpi) { RD *rd = cpi->rd; }
+
+int func_assign_refer_status_2(VP9_COMP *cpi) {
+ RD *rd2;
+ rd2 = cpi->rd;
+}
+
+int func_assign_refer_status_3(VP9_COMP *cpi) {
+ int a;
+ a = cpi->y;
+}
+
+int func_assign_refer_status_4(VP9_COMP *cpi) {
+ int *b;
+ b = &cpi->y;
+}
+
+int func_assign_refer_status_5(VP9_COMP *cpi) {
+ RD *rd5;
+ rd5 = &cpi->rd2;
+}
+
+int func_assign_refer_status_6(VP9_COMP *cpi, VP9_COMP *cpi2) {
+ cpi->rd = cpi2->rd;
+}
+
+int func_assign_refer_status_7(VP9_COMP *cpi, VP9_COMP *cpi2) {
+ cpi->arr[3] = 0;
+}
+
+int func_assign_refer_status_8(VP9_COMP *cpi, VP9_COMP *cpi2) {
+ int x = cpi->arr[3];
+}
+
+int func_assign_refer_status_9(VP9_COMP *cpi) {
+ {
+ RD *rd = cpi->rd;
+ { rd->u = 0; }
+ }
+}
+
+int func_assign_refer_status_10(VP9_COMP *cpi) { cpi->arr[cpi->rd->u] = 0; }
+
+int func_assign_refer_status_11(VP9_COMP *cpi) {
+ RD *rd11 = &cpi->rd2;
+ rd11->v = 1;
+}
+
+int func_assign_refer_status_12(VP9_COMP *cpi, VP9_COMP *cpi2) {
+ *cpi->rd = *cpi2->rd;
+}
+
+int func_assign_refer_status_13(VP9_COMP *cpi) {
+ cpi->z = 0;
+ cpi->w = 0;
+}
+
+int func(VP9_COMP *cpi, int x) {
+ int a;
+ cpi->y = 4;
+ a = 3 + cpi->y;
+ a = a * x;
+ cpi->y *= 4;
+ RD *ref_rd = cpi->rd;
+ ref_rd->u = 0;
+ cpi->rd2.v = 1;
+ cpi->rd->v = 1;
+ RD *ref_rd2 = &cpi->rd2;
+ RD **ref_rd3 = &(&cpi->rd2);
+ int b = sub_func(cpi, a);
+ cpi->rd->v++;
+ return b;
+}
+
+int func_sub_call_1(VP9_COMP *cpi2, int x) { cpi2->y = 4; }
+
+int func_call_1(VP9_COMP *cpi, int y) { func_sub_call_1(cpi, y); }
+
+int func_sub_call_2(VP9_COMP *cpi2, RD *rd, int x) { rd->u = 0; }
+
+int func_call_2(VP9_COMP *cpi, int y) { func_sub_call_2(cpi, &cpi->rd, y); }
+
+int func_sub_call_3(VP9_COMP *cpi2, int x) {}
+
+int func_call_3(VP9_COMP *cpi, int y) { func_sub_call_3(cpi, ++cpi->y); }
+
+/* Innermost callee of the func_call_4 -> func_sub_call_4 chain.
+ * NOTE(review): `rd` is declared as `RD *` in VP9_COMP, so `cpi3->rd.u`
+ * would not compile; presumably acceptable because this file is only a
+ * parser fixture - confirm before "fixing" it. */
+int func_sub_sub_call_4(VP9_COMP *cpi3, XD *xd) {
+ cpi3->rd.u = 0;
+ xd->u = 0;
+}
+
+int func_sub_call_4(VP9_COMP *cpi2, RD *rd) {
+ func_sub_sub_call_4(cpi2, rd->xd);
+}
+
+int func_call_4(VP9_COMP *cpi, int y) { func_sub_call_4(cpi, &cpi->rd); }
+
+/* Mutually recursive with func_call_5, which is defined further below:
+ * each one calls the other. */
+int func_sub_call_5(VP9_COMP *cpi) {
+ cpi->y = 2;
+ func_call_5(cpi);
+}
+
+int func_call_5(VP9_COMP *cpi) { func_sub_call_5(cpi); }
+
+int func_compound_1(VP9_COMP *cpi) {
+ for (int i = 0; i < 10; ++i) {
+ cpi->y++;
+ }
+}
+
+int func_compound_2(VP9_COMP *cpi) {
+ for (int i = 0; i < cpi->y; ++i) {
+ cpi->rd->u = i;
+ }
+}
+
+int func_compound_3(VP9_COMP *cpi) {
+ int i = 3;
+ while (i > 0) {
+ cpi->rd->u = i;
+ i--;
+ }
+}
+
+int func_compound_4(VP9_COMP *cpi) {
+ while (cpi->y-- >= 0) {
+ }
+}
+
+int func_compound_5(VP9_COMP *cpi) {
+ do {
+ } while (cpi->y-- >= 0);
+}
+
+int func_compound_6(VP9_COMP *cpi) {
+ for (int i = 0; i < 10; ++i) cpi->y--;
+}
+
+/* NOTE(review): `cpi` is a VP9_COMP object (not a pointer) and `rd` is an
+ * RD object, so `cpi->rd = rd` would not compile; presumably acceptable
+ * because this file is only a parser fixture - confirm. */
+int main(void) {
+ int x;
+ VP9_COMP cpi;
+ RD rd;
+ cpi->rd = rd;
+ func(&cpi, x);
+}
diff --git a/third_party/aom/tools/auto_refactor/c_files/global_variable.c b/third_party/aom/tools/auto_refactor/c_files/global_variable.c
new file mode 100644
index 0000000000..26d5385e97
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/global_variable.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+extern const int global_a[13];
+
+const int global_b = 0;
+
+typedef struct S1 {
+ int x;
+} T1;
+
+struct S3 {
+ int x;
+} s3;
+
+int func_global_1(int *a) {
+ *a = global_a[3];
+ return 0;
+}
diff --git a/third_party/aom/tools/auto_refactor/c_files/parse_lvalue.c b/third_party/aom/tools/auto_refactor/c_files/parse_lvalue.c
new file mode 100644
index 0000000000..fa44d72381
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/parse_lvalue.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct RD {
+ int u;
+ int v;
+ int arr[3];
+} RD;
+
+typedef struct VP9_COMP {
+ int y;
+ RD *rd;
+ RD rd2;
+ RD rd3[2];
+} VP9_COMP;
+
+int parse_lvalue_2(VP9_COMP *cpi) { RD *rd2 = &cpi->rd2; }
+
+/* test_parse_lvalue in test_auto_refactor.py reads the statements of this
+ * body by position (block items 0, 2, 3, 4 and 5), so keep their order and
+ * count unchanged when editing. */
+int func(VP9_COMP *cpi, int x) {
+ cpi->rd->u = 0;
+
+ int y;
+ y = 0;
+
+ cpi->rd2.v = 0;
+
+ cpi->rd->arr[2] = 0;
+
+ cpi->rd3[1]->arr[2] = 0;
+
+ return 0;
+}
+
+int main(void) {
+ int x = 0;
+ VP9_COMP cpi;
+ func(&cpi, x);
+}
diff --git a/third_party/aom/tools/auto_refactor/c_files/simple_code.c b/third_party/aom/tools/auto_refactor/c_files/simple_code.c
new file mode 100644
index 0000000000..902cd1d826
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/simple_code.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct S {
+ int x;
+ int y;
+ int z;
+} S;
+
+typedef struct T {
+ S s;
+} T;
+
+int d(S *s) {
+ ++s->x;
+ s->x--;
+ s->y = s->y + 1;
+ int *c = &s->x;
+ S ss;
+ ss.x = 1;
+ ss.x += 2;
+ ss.z *= 2;
+ return 0;
+}
+int b(S *s) {
+ d(s);
+ return 0;
+}
+int c(int x) {
+ if (x) {
+ c(x - 1);
+ } else {
+ S s;
+ d(&s);
+ }
+ return 0;
+}
+int a(S *s) {
+ b(s);
+ c(1);
+ return 0;
+}
+int e(void) {
+ c(0);
+ return 0;
+}
+int main(void) {
+ int p = 3;
+ S s;
+ s.x = p + 1;
+ s.y = 2;
+ s.z = 3;
+ a(&s);
+ T t;
+ t.s.x = 3;
+}
diff --git a/third_party/aom/tools/auto_refactor/c_files/struct_code.c b/third_party/aom/tools/auto_refactor/c_files/struct_code.c
new file mode 100644
index 0000000000..7f24d41075
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/c_files/struct_code.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+typedef struct S1 {
+ int x;
+} T1;
+
+struct S3 {
+ int x;
+};
+
+typedef struct {
+ int x;
+ struct S3 s3;
+} T4;
+
+typedef union U5 {
+ int x;
+ double y;
+} T5;
+
+typedef struct S6 {
+ struct {
+ int x;
+ };
+ union {
+ int y;
+ int z;
+ };
+} T6;
+
+typedef struct S7 {
+ struct {
+ int x;
+ } y;
+ union {
+ int w;
+ } z;
+} T7;
+
+int main(void) {}
diff --git a/third_party/aom/tools/auto_refactor/test_auto_refactor.py b/third_party/aom/tools/auto_refactor/test_auto_refactor.py
new file mode 100644
index 0000000000..6b1e269efa
--- /dev/null
+++ b/third_party/aom/tools/auto_refactor/test_auto_refactor.py
@@ -0,0 +1,675 @@
+#!/usr/bin/env python
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+import pprint
+import re
+import os, sys
+import io
+import unittest as googletest
+
+sys.path[0:0] = ['.', '..']
+
+from pycparser import c_parser, parse_file
+from pycparser.c_ast import *
+from pycparser.c_parser import CParser, Coord, ParseError
+
+from auto_refactor import *
+
+
+def get_c_file_path(filename):
+ """Return the path of fixture `filename` inside the relative c_files directory."""
+ return os.path.join('c_files', filename)
+
+
+class TestStructInfo(googletest.TestCase):
+
+ def setUp(self):
+ filename = get_c_file_path('struct_code.c')
+ self.ast = parse_file(filename)
+
+ def test_build_struct_info(self):
+ struct_info = build_struct_info(self.ast)
+ typedef_name_dic = struct_info.typedef_name_dic
+ self.assertEqual('T1' in typedef_name_dic, True)
+ self.assertEqual('T4' in typedef_name_dic, True)
+ self.assertEqual('T5' in typedef_name_dic, True)
+
+ struct_name_dic = struct_info.struct_name_dic
+ struct_name = 'S1'
+ self.assertEqual(struct_name in struct_name_dic, True)
+ struct_item = struct_name_dic[struct_name]
+ self.assertEqual(struct_item.is_union, False)
+
+ struct_name = 'S3'
+ self.assertEqual(struct_name in struct_name_dic, True)
+ struct_item = struct_name_dic[struct_name]
+ self.assertEqual(struct_item.is_union, False)
+
+ struct_name = 'U5'
+ self.assertEqual(struct_name in struct_name_dic, True)
+ struct_item = struct_name_dic[struct_name]
+ self.assertEqual(struct_item.is_union, True)
+
+ self.assertEqual(len(struct_info.struct_item_list), 6)
+
+ def test_get_child_decl_status(self):
+ struct_info = build_struct_info(self.ast)
+ struct_item = struct_info.typedef_name_dic['T4']
+
+ decl_status = struct_item.child_decl_map['x']
+ self.assertEqual(decl_status.struct_item, None)
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = struct_item.child_decl_map['s3']
+ self.assertEqual(decl_status.struct_item.struct_name, 'S3')
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ struct_item = struct_info.typedef_name_dic['T6']
+ decl_status = struct_item.child_decl_map['x']
+ self.assertEqual(decl_status.struct_item, None)
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = struct_item.child_decl_map['y']
+ self.assertEqual(decl_status.struct_item, None)
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = struct_item.child_decl_map['z']
+ self.assertEqual(decl_status.struct_item, None)
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ struct_item = struct_info.typedef_name_dic['T7']
+ decl_status = struct_item.child_decl_map['y']
+ self.assertEqual('x' in decl_status.struct_item.child_decl_map, True)
+
+ struct_item = struct_info.typedef_name_dic['T7']
+ decl_status = struct_item.child_decl_map['z']
+ self.assertEqual('w' in decl_status.struct_item.child_decl_map, True)
+
+
+class TestParseLvalue(googletest.TestCase):
+
+ def setUp(self):
+ filename = get_c_file_path('parse_lvalue.c')
+ self.ast = parse_file(filename)
+ self.func_dictionary = build_func_dictionary(self.ast)
+
+ def test_parse_lvalue(self):
+ func_node = self.func_dictionary['func']
+ func_body_items = func_node.body.block_items
+ id_list = parse_lvalue(func_body_items[0].lvalue)
+ ref_id_list = ['cpi', 'rd', 'u']
+ self.assertEqual(id_list, ref_id_list)
+
+ id_list = parse_lvalue(func_body_items[2].lvalue)
+ ref_id_list = ['y']
+ self.assertEqual(id_list, ref_id_list)
+
+ id_list = parse_lvalue(func_body_items[3].lvalue)
+ ref_id_list = ['cpi', 'rd2', 'v']
+ self.assertEqual(id_list, ref_id_list)
+
+ id_list = parse_lvalue(func_body_items[4].lvalue)
+ ref_id_list = ['cpi', 'rd', 'arr']
+ self.assertEqual(id_list, ref_id_list)
+
+ id_list = parse_lvalue(func_body_items[5].lvalue)
+ ref_id_list = ['cpi', 'rd3', 'arr']
+ self.assertEqual(id_list, ref_id_list)
+
+ def test_parse_lvalue_2(self):
+ func_node = self.func_dictionary['parse_lvalue_2']
+ func_body_items = func_node.body.block_items
+ id_list = parse_lvalue(func_body_items[0].init)
+ ref_id_list = ['cpi', 'rd2']
+ self.assertEqual(id_list, ref_id_list)
+
+
+class TestIDStatusNode(googletest.TestCase):
+
+ def test_add_descendant(self):
+ root = IDStatusNode('root')
+ id_chain1 = ['cpi', 'rd', 'u']
+ id_chain2 = ['cpi', 'rd', 'v']
+ root.add_descendant(id_chain1)
+ root.add_descendant(id_chain2)
+
+ ref_children_list1 = ['cpi']
+ children_list1 = list(root.children.keys())
+ self.assertEqual(children_list1, ref_children_list1)
+
+ ref_children_list2 = ['rd']
+ children_list2 = list(root.children['cpi'].children.keys())
+ self.assertEqual(children_list2, ref_children_list2)
+
+ ref_children_list3 = ['u', 'v']
+ children_list3 = list(root.children['cpi'].children['rd'].children.keys())
+ self.assertEqual(children_list3, ref_children_list3)
+
+ def test_get_descendant(self):
+ root = IDStatusNode('root')
+ id_chain1 = ['cpi', 'rd', 'u']
+ id_chain2 = ['cpi', 'rd', 'v']
+ ref_descendant_1 = root.add_descendant(id_chain1)
+ ref_descendant_2 = root.add_descendant(id_chain2)
+
+ descendant_1 = root.get_descendant(id_chain1)
+ self.assertEqual(descendant_1 is ref_descendant_1, True)
+
+ descendant_2 = root.get_descendant(id_chain2)
+ self.assertEqual(descendant_2 is ref_descendant_2, True)
+
+ id_chain3 = ['cpi', 'rd', 'h']
+ descendant_3 = root.get_descendant(id_chain3)
+ self.assertEqual(descendant_3, None)
+
+
+class TestFuncInOut(googletest.TestCase):
+
+ def setUp(self):
+ c_filename = get_c_file_path('func_in_out.c')
+ self.ast = parse_file(c_filename)
+ self.func_dictionary = build_func_dictionary(self.ast)
+ self.struct_info = build_struct_info(self.ast)
+
+ def test_get_func_param_id_map(self):
+ """The parameter map of `func` has the keys 'cpi' and 'x', in that order."""
+ func_def_node = self.func_dictionary['func']
+ param_id_map = get_func_param_id_map(func_def_node)
+ ref_param_id_map_keys = ['cpi', 'x']
+ self.assertEqual(list(param_id_map.keys()), ref_param_id_map_keys)
+
+ def test_assign_refer_status_1(self):
+ """`RD *rd = cpi->rd;` links rd to cpi->rd without marking either side."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_1']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ body_id_tree = visitor.body_id_tree
+
+ id_chain = ['rd']
+ descendant = body_id_tree.get_descendant(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+ ref_link_id_chain = ['cpi', 'rd']
+ self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_2(self):
+ """`RD *rd2; rd2 = cpi->rd;` links rd2 to cpi->rd without marking either."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_2']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ body_id_tree = visitor.body_id_tree
+
+ id_chain = ['rd2']
+ descendant = body_id_tree.get_descendant(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+
+ ref_link_id_chain = ['cpi', 'rd']
+ self.assertEqual(ref_link_id_chain, descendant.get_link_id_chain())
+
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_3(self):
+ """`int a; a = cpi->y;` marks a as assigned and cpi->y as referred."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_3']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ body_id_tree = visitor.body_id_tree
+
+ id_chain = ['a']
+ descendant = body_id_tree.get_descendant(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_4(self):
+ """`int *b; b = &cpi->y;` marks b assigned and cpi->y assigned and referred."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_4']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ body_id_tree = visitor.body_id_tree
+
+ id_chain = ['b']
+ descendant = body_id_tree.get_descendant(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_5(self):
+ """`RD *rd5; rd5 = &cpi->rd2;` marks neither rd5 nor cpi->rd2."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_5']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ body_id_tree = visitor.body_id_tree
+
+ id_chain = ['rd5']
+ descendant = body_id_tree.get_descendant(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['cpi', 'rd2']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_6(self):
+ """`cpi->rd = cpi2->rd;` marks cpi->rd assigned and cpi2->rd referred."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_6']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ id_chain = ['cpi2', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+ self.assertEqual(None, descendant.get_link_id_chain())
+
+ def test_assign_refer_status_7(self):
+ """`cpi->arr[3] = 0;` marks cpi->arr as assigned only."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_7']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'arr']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_assign_refer_status_8(self):
+ """`int x = cpi->arr[3];` marks cpi->arr as referred only."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_8']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'arr']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_assign_refer_status_9(self):
+ """A write through `rd = cpi->rd` in a nested block marks cpi->rd->u assigned."""
+ func_def_node = self.func_dictionary['func_assign_refer_status_9']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_assign_refer_status_10(self):
+ func_def_node = self.func_dictionary['func_assign_refer_status_10']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+
+ id_chain = ['cpi', 'arr']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_assign_refer_status_11(self):
+ func_def_node = self.func_dictionary['func_assign_refer_status_11']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd2', 'v']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_assign_refer_status_12(self):
+ func_def_node = self.func_dictionary['func_assign_refer_status_12']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['cpi2', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_assign_refer_status_13(self):
+ func_def_node = self.func_dictionary['func_assign_refer_status_13']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'z']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['cpi', 'w']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_id_status_forrest_1(self):
+ func_def_node = self.func_dictionary['func']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack().top()
+ children_names = set(root.get_children().keys())
+ ref_children_names = set(['cpi', 'x'])
+ self.assertEqual(children_names, ref_children_names)
+
+ root = visitor.body_id_tree
+ children_names = set(root.get_children().keys())
+ ref_children_names = set(['a', 'ref_rd', 'ref_rd2', 'ref_rd3', 'b'])
+ self.assertEqual(children_names, ref_children_names)
+
+ def test_id_status_forrest_show(self):
+ func_def_node = self.func_dictionary['func_id_forrest_show']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ visitor.get_id_tree_stack().top().show()
+
+ def test_id_status_forrest_2(self):
+ func_def_node = self.func_dictionary['func_id_forrest_show']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack().top()
+ self.assertEqual(root, root.root)
+
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_descendant(id_chain)
+ self.assertEqual(root, descendant.root)
+
+ id_chain = ['b']
+ descendant = root.get_descendant(id_chain)
+ self.assertEqual(root, descendant.root)
+
+ def test_link_id_chain_1(self):
+ func_def_node = self.func_dictionary['func_link_id_chain_1']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+
+ def test_link_id_chain_2(self):
+ func_def_node = self.func_dictionary['func_link_id_chain_2']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd', 'xd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+
+ def test_func_call_1(self):
+ func_def_node = self.func_dictionary['func_call_1']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_func_call_2(self):
+ func_def_node = self.func_dictionary['func_call_2']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['cpi', 'rd']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_func_call_3(self):
+ func_def_node = self.func_dictionary['func_call_3']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_func_call_4(self):
+ func_def_node = self.func_dictionary['func_call_4']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ id_chain = ['cpi', 'rd', 'xd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_func_call_5(self):
+ func_def_node = self.func_dictionary['func_call_5']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_func_compound_1(self):
+ func_def_node = self.func_dictionary['func_compound_1']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_func_compound_2(self):
+ func_def_node = self.func_dictionary['func_compound_2']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), False)
+ self.assertEqual(descendant.get_refer(), True)
+
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_func_compound_3(self):
+ func_def_node = self.func_dictionary['func_compound_3']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+
+ id_chain = ['cpi', 'rd', 'u']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), False)
+
+ def test_func_compound_4(self):
+ func_def_node = self.func_dictionary['func_compound_4']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_func_compound_5(self):
+ func_def_node = self.func_dictionary['func_compound_5']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+
+ def test_func_compound_6(self):
+ func_def_node = self.func_dictionary['func_compound_6']
+ visitor = FuncInOutVisitor(func_def_node, self.struct_info,
+ self.func_dictionary)
+ visitor.visit(func_def_node.body)
+ root = visitor.get_id_tree_stack()
+ id_chain = ['cpi', 'y']
+ descendant = root.get_id_node(id_chain)
+ self.assertEqual(descendant.get_assign(), True)
+ self.assertEqual(descendant.get_refer(), True)
+
+
+class TestDeclStatus(googletest.TestCase):
+
+ def setUp(self):
+ filename = get_c_file_path('decl_status_code.c')
+ self.ast = parse_file(filename)
+ self.func_dictionary = build_func_dictionary(self.ast)
+ self.struct_info = build_struct_info(self.ast)
+
+ def test_parse_decl_node(self):
+ func_def_node = self.func_dictionary['main']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 'a')
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = parse_decl_node(self.struct_info, decl_list[1])
+ self.assertEqual(decl_status.name, 't1')
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = parse_decl_node(self.struct_info, decl_list[2])
+ self.assertEqual(decl_status.name, 's1')
+ self.assertEqual(decl_status.is_ptr_decl, False)
+
+ decl_status = parse_decl_node(self.struct_info, decl_list[3])
+ self.assertEqual(decl_status.name, 't2')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+
+ def test_parse_decl_node_2(self):
+ func_def_node = self.func_dictionary['parse_decl_node_2']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 'arr')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+ self.assertEqual(decl_status.struct_item, None)
+
+ def test_parse_decl_node_3(self):
+ func_def_node = self.func_dictionary['parse_decl_node_3']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 'a')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+ self.assertEqual(decl_status.struct_item, None)
+
+ def test_parse_decl_node_4(self):
+ func_def_node = self.func_dictionary['parse_decl_node_4']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 't1')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+ self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+ self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+ def test_parse_decl_node_5(self):
+ func_def_node = self.func_dictionary['parse_decl_node_5']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 't2')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+ self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+ self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+ def test_parse_decl_node_6(self):
+ func_def_node = self.func_dictionary['parse_decl_node_6']
+ decl_list = func_def_node.body.block_items
+ decl_status = parse_decl_node(self.struct_info, decl_list[0])
+ self.assertEqual(decl_status.name, 't3')
+ self.assertEqual(decl_status.is_ptr_decl, True)
+ self.assertEqual(decl_status.struct_item.typedef_name, 'T1')
+ self.assertEqual(decl_status.struct_item.struct_name, 'S1')
+
+
+# Run the googletest runner when this file is executed as a script.
+if __name__ == '__main__':
+ googletest.main()
diff --git a/third_party/aom/tools/cpplint.py b/third_party/aom/tools/cpplint.py
new file mode 100755
index 0000000000..e3ebde2f5a
--- /dev/null
+++ b/third_party/aom/tools/cpplint.py
@@ -0,0 +1,6244 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style. It does not attempt to fix
+up these problems -- the point is to educate. It does also not
+attempt to find all problems, or to ensure that everything it does
+find is legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+import sysconfig
+
+try:
+ xrange # Python 2
+except NameError:
+ xrange = range # Python 3
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+ [--counting=total|toplevel|detailed] [--root=subdir]
+ [--linelength=digits] [--headers=x,y,...]
+ [--quiet]
+ <file> [file] ...
+
+ The style guidelines this tries to follow are those in
+ https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+ Every problem is given a confidence score from 1-5, with 5 meaning we are
+ certain of the problem, and 1 meaning it could be a legitimate construct.
+ This will miss some errors, and is not a substitute for a code review.
+
+ To suppress false-positive errors of a certain category, add a
+ 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*)
+ suppresses errors of all categories on that line.
+
+ The files passed in will be linted; at least one file must be provided.
+ Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the
+ extensions with the --extensions flag.
+
+ Flags:
+
+ output=vs7
+ By default, the output is formatted to ease emacs parsing. Visual Studio
+ compatible output (vs7) may also be used. Other formats are unsupported.
+
+ verbose=#
+ Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+ quiet
+ Don't print anything if no errors are found.
+
+ filter=-x,+y,...
+ Specify a comma-separated list of category-filters to apply: only
+ error messages whose category names pass the filters will be printed.
+ (Category names are printed with the message and look like
+ "[whitespace/indent]".) Filters are evaluated left to right.
+ "-FOO" and "FOO" means "do not print categories that start with FOO".
+ "+FOO" means "do print categories that start with FOO".
+
+ Examples: --filter=-whitespace,+whitespace/braces
+ --filter=whitespace,runtime/printf,+runtime/printf_format
+ --filter=-,+build/include_what_you_use
+
+ To see a list of all the categories used in cpplint, pass no arg:
+ --filter=
+
+ counting=total|toplevel|detailed
+ The total number of errors found is always printed. If
+ 'toplevel' is provided, then the count of errors in each of
+ the top-level categories like 'build' and 'whitespace' will
+ also be printed. If 'detailed' is provided, then a count
+ is provided for each category like 'build/class'.
+
+ root=subdir
+ The root directory used for deriving header guard CPP variable.
+ By default, the header guard CPP variable is calculated as the relative
+ path to the directory that contains .git, .hg, or .svn. When this flag
+ is specified, the relative path is calculated from the specified
+ directory. If the specified directory does not exist, this flag is
+ ignored.
+
+ Examples:
+ Assuming that top/src/.git exists (and cwd=top/src), the header guard
+ CPP variables for top/src/chrome/browser/ui/browser.h are:
+
+ No flag => CHROME_BROWSER_UI_BROWSER_H_
+ --root=chrome => BROWSER_UI_BROWSER_H_
+ --root=chrome/browser => UI_BROWSER_H_
+ --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_
+
+ linelength=digits
+ This is the allowed line length for the project. The default value is
+ 80 characters.
+
+ Examples:
+ --linelength=120
+
+ extensions=extension,extension,...
+ The allowed file extensions that cpplint will check
+
+ Examples:
+ --extensions=hpp,cpp
+
+ headers=x,y,...
+ The header extensions that cpplint will treat as .h in checks. Values are
+ automatically added to --extensions list.
+
+ Examples:
+ --headers=hpp,hxx
+ --headers=hpp
+
+ cpplint.py supports per-directory configurations specified in CPPLINT.cfg
+ files. CPPLINT.cfg file can contain a number of key=value pairs.
+ Currently the following options are supported:
+
+ set noparent
+ filter=+filter1,-filter2,...
+ exclude_files=regex
+ linelength=80
+ root=subdir
+ headers=x,y,...
+
+ "set noparent" option prevents cpplint from traversing directory tree
+ upwards looking for more .cfg files in parent directories. This option
+ is usually placed in the top-level project directory.
+
+ The "filter" option is similar in function to --filter flag. It specifies
+ message filters in addition to the |_DEFAULT_FILTERS| and those specified
+ through --filter command-line flag.
+
+ "exclude_files" allows to specify a regular expression to be matched against
+ a file name. If the expression matches, the file is skipped and not run
+ through liner.
+
+ "linelength" allows to specify the allowed line length for the project.
+
+ The "root" option is similar in function to the --root flag (see example
+ above). Paths are relative to the directory of the CPPLINT.cfg.
+
+ The "headers" option is similar in function to the --headers flag
+ (see example above).
+
+ CPPLINT.cfg has an effect on files in the same directory and all
+ sub-directories, unless overridden by a nested configuration file.
+
+ Example file:
+ filter=-build/include_order,+build/include_alpha
+ exclude_files=.*\.cc
+
+ The above example disables build/include_order warning and enables
+ build/include_alpha as well as excludes all .cc from being
+ processed by linter, in the current directory (where the .cfg
+ file is located) and all sub-directories.
+"""
+
+# We categorize each error message we print. Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here! cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+ 'build/class',
+ 'build/c++11',
+ 'build/c++14',
+ 'build/c++tr1',
+ 'build/deprecated',
+ 'build/endif_comment',
+ 'build/explicit_make_pair',
+ 'build/forward_decl',
+ 'build/header_guard',
+ 'build/include',
+ 'build/include_alpha',
+ 'build/include_order',
+ 'build/include_what_you_use',
+ 'build/namespaces',
+ 'build/printf_format',
+ 'build/storage_class',
+ 'legal/copyright',
+ 'readability/alt_tokens',
+ 'readability/braces',
+ 'readability/casting',
+ 'readability/check',
+ 'readability/constructors',
+ 'readability/fn_size',
+ 'readability/inheritance',
+ 'readability/multiline_comment',
+ 'readability/multiline_string',
+ 'readability/namespace',
+ 'readability/nolint',
+ 'readability/nul',
+ 'readability/strings',
+ 'readability/todo',
+ 'readability/utf8',
+ 'runtime/arrays',
+ 'runtime/casting',
+ 'runtime/explicit',
+ 'runtime/int',
+ 'runtime/init',
+ 'runtime/invalid_increment',
+ 'runtime/member_string_references',
+ 'runtime/memset',
+ 'runtime/indentation_namespace',
+ 'runtime/operator',
+ 'runtime/printf',
+ 'runtime/printf_format',
+ 'runtime/references',
+ 'runtime/string',
+ 'runtime/threadsafe_fn',
+ 'runtime/vlog',
+ 'whitespace/blank_line',
+ 'whitespace/braces',
+ 'whitespace/comma',
+ 'whitespace/comments',
+ 'whitespace/empty_conditional_body',
+ 'whitespace/empty_if_body',
+ 'whitespace/empty_loop_body',
+ 'whitespace/end_of_line',
+ 'whitespace/ending_newline',
+ 'whitespace/forcolon',
+ 'whitespace/indent',
+ 'whitespace/line_length',
+ 'whitespace/newline',
+ 'whitespace/operators',
+ 'whitespace/parens',
+ 'whitespace/semicolon',
+ 'whitespace/tab',
+ 'whitespace/todo',
+ ]
+
+# These error categories are no longer enforced by cpplint, but for backwards-
+# compatibility they may still appear in NOLINT comments.
+_LEGACY_ERROR_CATEGORIES = [
+ 'readability/streams',
+ 'readability/function',
+ ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flags).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha']
+
+# The default list of categories suppressed for C (not C++) files.
+_DEFAULT_C_SUPPRESSED_CATEGORIES = [
+ 'readability/casting',
+ ]
+
+# The default list of categories suppressed for Linux Kernel files.
+_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [
+ 'whitespace/tab',
+ ]
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
+
+# C++ headers
+_CPP_HEADERS = frozenset([
+ # Legacy
+ 'algobase.h',
+ 'algo.h',
+ 'alloc.h',
+ 'builtinbuf.h',
+ 'bvector.h',
+ 'complex.h',
+ 'defalloc.h',
+ 'deque.h',
+ 'editbuf.h',
+ 'fstream.h',
+ 'function.h',
+ 'hash_map',
+ 'hash_map.h',
+ 'hash_set',
+ 'hash_set.h',
+ 'hashtable.h',
+ 'heap.h',
+ 'indstream.h',
+ 'iomanip.h',
+ 'iostream.h',
+ 'istream.h',
+ 'iterator.h',
+ 'list.h',
+ 'map.h',
+ 'multimap.h',
+ 'multiset.h',
+ 'ostream.h',
+ 'pair.h',
+ 'parsestream.h',
+ 'pfstream.h',
+ 'procbuf.h',
+ 'pthread_alloc',
+ 'pthread_alloc.h',
+ 'rope',
+ 'rope.h',
+ 'ropeimpl.h',
+ 'set.h',
+ 'slist',
+ 'slist.h',
+ 'stack.h',
+ 'stdiostream.h',
+ 'stl_alloc.h',
+ 'stl_relops.h',
+ 'streambuf.h',
+ 'stream.h',
+ 'strfile.h',
+ 'strstream.h',
+ 'tempbuf.h',
+ 'tree.h',
+ 'type_traits.h',
+ 'vector.h',
+ # 17.6.1.2 C++ library headers
+ 'algorithm',
+ 'array',
+ 'atomic',
+ 'bitset',
+ 'chrono',
+ 'codecvt',
+ 'complex',
+ 'condition_variable',
+ 'deque',
+ 'exception',
+ 'forward_list',
+ 'fstream',
+ 'functional',
+ 'future',
+ 'initializer_list',
+ 'iomanip',
+ 'ios',
+ 'iosfwd',
+ 'iostream',
+ 'istream',
+ 'iterator',
+ 'limits',
+ 'list',
+ 'locale',
+ 'map',
+ 'memory',
+ 'mutex',
+ 'new',
+ 'numeric',
+ 'ostream',
+ 'queue',
+ 'random',
+ 'ratio',
+ 'regex',
+ 'scoped_allocator',
+ 'set',
+ 'sstream',
+ 'stack',
+ 'stdexcept',
+ 'streambuf',
+ 'string',
+ 'strstream',
+ 'system_error',
+ 'thread',
+ 'tuple',
+ 'typeindex',
+ 'typeinfo',
+ 'type_traits',
+ 'unordered_map',
+ 'unordered_set',
+ 'utility',
+ 'valarray',
+ 'vector',
+ # 17.6.1.2 C++ headers for C library facilities
+ 'cassert',
+ 'ccomplex',
+ 'cctype',
+ 'cerrno',
+ 'cfenv',
+ 'cfloat',
+ 'cinttypes',
+ 'ciso646',
+ 'climits',
+ 'clocale',
+ 'cmath',
+ 'csetjmp',
+ 'csignal',
+ 'cstdalign',
+ 'cstdarg',
+ 'cstdbool',
+ 'cstddef',
+ 'cstdint',
+ 'cstdio',
+ 'cstdlib',
+ 'cstring',
+ 'ctgmath',
+ 'ctime',
+ 'cuchar',
+ 'cwchar',
+ 'cwctype',
+ ])
+
+# Type names
+# Anchored pattern (^...$) that matches a whole token naming a builtin,
+# support-library or stdint type.
+# NOTE(review): the final alternative before ')$' is empty (the preceding
+# fragment ends in '|'), so this pattern also matches the empty string --
+# confirm that callers never pass an empty token, or that this is intended.
+_TYPES = re.compile(
+ r'^(?:'
+ # [dcl.type.simple]
+ r'(char(16_t|32_t)?)|wchar_t|'
+ r'bool|short|int|long|signed|unsigned|float|double|'
+ # [support.types]
+ r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|'
+ # [cstdint.syn]
+ r'(u?int(_fast|_least)?(8|16|32|64)_t)|'
+ r'(u?int(max|ptr)_t)|'
+ r')$')
+
+
+# These headers are excluded from [build/include] and [build/include_order]
+# checks:
+# - Anything not following google file name conventions (containing an
+# uppercase character, such as Python.h or nsStringAPI.h, for example).
+# - Lua headers.
+_THIRD_PARTY_HEADERS_PATTERN = re.compile(
+ r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
+
+# Pattern for matching FileInfo.BaseName() against test file name
+_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'
+
+# Pattern that matches only complete whitespace, possibly across multiple lines.
+_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL)
+
+# Assertion macros. These are defined in base/logging.h and
+# testing/base/public/gunit.h.
+_CHECK_MACROS = [
+ 'DCHECK', 'CHECK',
+ 'EXPECT_TRUE', 'ASSERT_TRUE',
+ 'EXPECT_FALSE', 'ASSERT_FALSE',
+ ]
+
+# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
+_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
+
+for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
+ ('>=', 'GE'), ('>', 'GT'),
+ ('<=', 'LE'), ('<', 'LT')]:
+ _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
+ _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
+ _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
+ _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
+
+for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
+ ('>=', 'LT'), ('>', 'LE'),
+ ('<=', 'GT'), ('<', 'GE')]:
+ _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
+ _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
+
+# Alternative tokens and their replacements. For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+ 'and': '&&',
+ 'bitor': '|',
+ 'or': '||',
+ 'xor': '^',
+ 'compl': '~',
+ 'bitand': '&',
+ 'and_eq': '&=',
+ 'or_eq': '|=',
+ 'xor_eq': '^=',
+ 'not': '!',
+ 'not_eq': '!='
+ }
+
+# Compile regular expression that matches all the above keywords. The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments and multi-line strings
+# but those have always been troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+ r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
+# These constants define types of headers for use with
+# _IncludeState.CheckNextIncludeOrder().
+_C_SYS_HEADER = 1
+_CPP_SYS_HEADER = 2
+_LIKELY_MY_HEADER = 3
+_POSSIBLE_MY_HEADER = 4
+_OTHER_HEADER = 5
+
+# These constants define the current inline assembly state
+_NO_ASM = 0 # Outside of inline assembly block
+_INSIDE_ASM = 1 # Inside inline assembly block
+_END_ASM = 2 # Last line of inline assembly block
+_BLOCK_ASM = 3 # The whole block is an inline assembly block
+
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+ r'(?:\s+(volatile|__volatile__))?'
+ r'\s*[{(]')
+
+# Match strings that indicate we're working on a C (not C++) file.
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|'
+ r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))')
+
+# Match string that indicates we're working on a Linux Kernel file.
+_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)')
+
+_regexp_compile_cache = {}
+
+# {str, set(int)}: a map from error categories to sets of linenumbers
+# on which those errors are expected and should be suppressed.
+_error_suppressions = {}
+
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+_root_debug = False
+
+# The allowed line length of files.
+# This is set by --linelength flag.
+_line_length = 80
+
+# The allowed extensions for file names
+# This is set by --extensions flag.
+_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
+
+# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc.
+# This is set by --headers flag.
+_hpp_headers = set(['h'])
+
+# {str, bool}: a map from error categories to booleans which indicate if the
+# category should be suppressed for every line.
+_global_error_suppressions = {}
+
+def ProcessHppHeadersOption(val):
+ global _hpp_headers
+ try:
+ _hpp_headers = set(val.split(','))
+ # Automatically append to extensions list so it does not have to be set 2 times
+ _valid_extensions.update(_hpp_headers)
+ except ValueError:
+ PrintUsage('Header extensions must be comma separated list.')
+
+def IsHeaderExtension(file_extension):
+ return file_extension in _hpp_headers
+
+def ParseNolintSuppressions(filename, raw_line, linenum, error):
+  """Records any NOLINT / NOLINTNEXTLINE suppression found on raw_line.
+
+  A bare NOLINT or NOLINT(*) suppresses every category on the target line;
+  NOLINT(category) suppresses only that category. A category that is in
+  neither the current nor the legacy category list is reported through error.
+
+  Args:
+    filename: str, the name of the input file.
+    raw_line: str, the line of input text, with comments.
+    linenum: int, the number of the current line.
+    error: function, an error handler.
+  """
+  nolint = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
+  if not nolint:
+    return
+
+  # NOLINTNEXTLINE applies to the following line, plain NOLINT to this one.
+  target_line = linenum + 1 if nolint.group(1) else linenum
+
+  category = nolint.group(2)
+  if category in (None, '(*)'):  # => "suppress all"
+    _error_suppressions.setdefault(None, set()).add(target_line)
+    return
+
+  # Drop the surrounding parentheses captured by the pattern.
+  if category.startswith('(') and category.endswith(')'):
+    category = category[1:-1]
+
+  if category in _ERROR_CATEGORIES:
+    _error_suppressions.setdefault(category, set()).add(target_line)
+  elif category not in _LEGACY_ERROR_CATEGORIES:
+    error(filename, linenum, 'readability/nolint', 5,
+          'Unknown NOLINT error category: %s' % category)
+
+
def ProcessGlobalSuppresions(lines):
  """Scans a file for markers that suppress whole categories file-wide.

  Every line is tested against the C-file and kernel-file patterns; each
  pattern that matches turns on its list of default suppressed categories in
  _global_error_suppressions.

  Args:
    lines: An array of strings, each representing a line of the file, with the
           last element being empty if the file is terminated with a newline.
  """
  marker_rules = (
      (_SEARCH_C_FILE, _DEFAULT_C_SUPPRESSED_CATEGORIES),
      (_SEARCH_KERNEL_FILE, _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES),
  )
  for text in lines:
    for marker, categories in marker_rules:
      if marker.search(text):
        for category in categories:
          _global_error_suppressions[category] = True
+
+
def ResetNolintSuppressions():
  """Empties both the per-line and the global suppression stores."""
  for store in (_error_suppressions, _global_error_suppressions):
    store.clear()
+
+
def IsErrorSuppressedByNolint(category, linenum):
  """Tells whether an error of this category is suppressed on this line.

  Looks at the stores filled in by ParseNolintSuppressions and
  ProcessGlobalSuppresions (and emptied by ResetNolintSuppressions).

  Args:
    category: str, the category of the error.
    linenum: int, the current line number.
  Returns:
    A true value iff the category is suppressed globally, suppressed on this
    line, or every category is suppressed on this line.
  """
  globally_suppressed = _global_error_suppressions.get(category, False)
  if globally_suppressed:
    return globally_suppressed

  no_lines = set()
  if linenum in _error_suppressions.get(category, no_lines):
    return True
  # The None key holds the lines on which every category is suppressed.
  return linenum in _error_suppressions.get(None, no_lines)
+
+
def Match(pattern, s):
  """Matches the string with the pattern, caching the compiled regexp.

  Args:
    pattern: regex pattern string.
    s: string to match; matching is anchored at the start of s.

  Returns:
    A match object, or None if the pattern does not match.
  """
  # The regexp compilation caching is inlined in both Match and Search for
  # performance reasons; factoring it out into a separate function turns out
  # to be noticeably expensive.
  if pattern not in _regexp_compile_cache:
    # Compile through the public re API rather than the internal sre_compile
    # module, which is deprecated in newer Python releases.
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].match(s)
+
+
def ReplaceAll(pattern, rep, s):
  """Replaces instances of pattern in a string with a replacement.

  The compiled regex is kept in a cache shared by Match and Search.

  Args:
    pattern: regex pattern
    rep: replacement text
    s: search string

  Returns:
    string with replacements made (or original string if no replacements)
  """
  if pattern not in _regexp_compile_cache:
    # Compile through the public re API rather than the internal sre_compile
    # module, which is deprecated in newer Python releases.
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].sub(rep, s)
+
+
def Search(pattern, s):
  """Searches the string for the pattern, caching the compiled regexp.

  Args:
    pattern: regex pattern string.
    s: string to scan; the pattern may match anywhere in s.

  Returns:
    A match object for the first match, or None if there is none.
  """
  if pattern not in _regexp_compile_cache:
    # Compile through the public re API rather than the internal sre_compile
    # module, which is deprecated in newer Python releases.
    _regexp_compile_cache[pattern] = re.compile(pattern)
  return _regexp_compile_cache[pattern].search(s)
+
+
+def _IsSourceExtension(s):
+ """File extension (excluding dot) matches a source file extension."""
+ return s in ('c', 'cc', 'cpp', 'cxx')
+
+
class _IncludeState(object):
  """Remembers the includes seen so far and checks the order they arrive in.

  include_list is a list of lists of (header, line number) pairs.  It is kept
  as nested lists, rather than one flat list, so that it is easy to update
  across preprocessor boundaries.

  Call CheckNextIncludeOrder() once for each header in the file, passing in
  one of the header type constants.  It returns a non-empty error message
  when the header arrives in an illegal order.
  """
  # self._section only ever advances through these values.  A header that
  # would force it backwards makes CheckNextIncludeOrder report an error.
  _INITIAL_SECTION = 0
  _MY_H_SECTION = 1
  _C_SECTION = 2
  _CPP_SECTION = 3
  _OTHER_H_SECTION = 4

  # Human readable names used when building the error message.
  _TYPE_NAMES = {
      _C_SYS_HEADER: 'C system header',
      _CPP_SYS_HEADER: 'C++ system header',
      _LIKELY_MY_HEADER: 'header this file implements',
      _POSSIBLE_MY_HEADER: 'header this file may implement',
      _OTHER_HEADER: 'other header',
  }
  _SECTION_NAMES = {
      _INITIAL_SECTION: "... nothing. (This can't be an error.)",
      _MY_H_SECTION: 'a header this file implements',
      _C_SECTION: 'C system header',
      _CPP_SECTION: 'C++ system header',
      _OTHER_H_SECTION: 'other header',
  }

  def __init__(self):
    self.include_list = [[]]
    self.ResetSection('')

  def FindHeader(self, header):
    """Looks up an earlier inclusion of a header.

    Args:
      header: header to check.
    Returns:
      Line number of previous occurrence, or -1 if the header has not
      been seen before.
    """
    recorded = (entry
                for section_list in self.include_list
                for entry in section_list)
    for entry in recorded:
      if entry[0] == header:
        return entry[1]
    return -1

  def ResetSection(self, directive):
    """Restarts section tracking when a preprocessor directive is met.

    Args:
      directive: preprocessor directive (e.g. "if", "else").
    """
    # Entries are never popped from include_list: a conditional opens a new
    # inner list, and an alternative branch replaces the newest one.
    if directive in ('if', 'ifdef', 'ifndef'):
      self.include_list.append([])
    elif directive in ('else', 'elif'):
      self.include_list[-1] = []

    # Section currently being filled.
    self._section = self._INITIAL_SECTION
    # Path of the most recently seen header.
    self._last_header = ''

  def SetLastHeader(self, header_path):
    """Remembers header_path as the most recently seen header."""
    self._last_header = header_path

  def CanonicalizeAlphabeticalOrder(self, header_path):
    """Returns a path canonicalized for alphabetical comparison.

    - removes '-inl' since we don't require them to be after the main header.
    - replaces "-" with "_" so they both cmp the same.
    - lowercase everything, just in case.

    Args:
      header_path: Path to be canonicalized.

    Returns:
      Canonicalized path.
    """
    without_inl = header_path.replace('-inl.h', '.h')
    underscored = without_inl.replace('-', '_')
    return underscored.lower()

  def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
    """Checks a header against the previous one for alphabetical order.

    Args:
      clean_lines: A CleansedLines instance containing the file.
      linenum: The number of the line to check.
      header_path: Canonicalized header to be checked.

    Returns:
      Returns true if the header is in alphabetical order.
    """
    # When the section changes, _last_header is reset to the empty string, so
    # it always compares less than the current header.
    #
    # The order is only enforced when the previous line is itself an
    # #include; otherwise the author is assumed to have grouped the headers
    # deliberately.
    out_of_order = (
        self._last_header > header_path and
        Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1]))
    return not out_of_order

  def CheckNextIncludeOrder(self, header_type):
    """Advances the section state for the next header and validates it.

    Args:
      header_type: One of the _XXX_HEADER constants.

    Returns:
      The empty string if the header is in the right order, or an
      error message describing what's wrong.
    """
    previous_section = self._section
    complaint = 'Found %s after %s' % (
        self._TYPE_NAMES[header_type],
        self._SECTION_NAMES[previous_section])

    if header_type in (_C_SYS_HEADER, _CPP_SYS_HEADER):
      if header_type == _C_SYS_HEADER:
        target_section = self._C_SECTION
      else:
        target_section = self._CPP_SECTION
      if previous_section > target_section:
        # A system header arrived after a later section had already begun.
        self._last_header = ''
        return complaint
      self._section = target_section
    elif header_type in (_LIKELY_MY_HEADER, _POSSIBLE_MY_HEADER):
      # Once past the "my header" section, such a header counts as an
      # ordinary one: we are not sure enough that it belongs to this file.
      if previous_section <= self._MY_H_SECTION:
        self._section = self._MY_H_SECTION
      else:
        self._section = self._OTHER_H_SECTION
    else:
      assert header_type == _OTHER_HEADER
      self._section = self._OTHER_H_SECTION

    # A new section starts its alphabetical ordering from scratch.
    if self._section != previous_section:
      self._last_header = ''

    return ''
+
+
+class _CppLintState(object):
+ """Maintains module-wide state.."""
+
+ def __init__(self):
+ self.verbose_level = 1 # global setting.
+ self.error_count = 0 # global count of reported errors
+ # filters to apply when emitting error messages
+ self.filters = _DEFAULT_FILTERS[:]
+ # backup of filter list. Used to restore the state after each file.
+ self._filters_backup = self.filters[:]
+ self.counting = 'total' # In what way are we counting errors?
+ self.errors_by_category = {} # string to int dict storing error counts
+ self.quiet = False # Suppress non-error messagess?
+
+ # output format:
+ # "emacs" - format that emacs can parse (default)
+ # "vs7" - format that Microsoft Visual Studio 7 can parse
+ self.output_format = 'emacs'
+
+ def SetOutputFormat(self, output_format):
+ """Sets the output format for errors."""
+ self.output_format = output_format
+
+ def SetQuiet(self, quiet):
+ """Sets the module's quiet settings, and returns the previous setting."""
+ last_quiet = self.quiet
+ self.quiet = quiet
+ return last_quiet
+
+ def SetVerboseLevel(self, level):
+ """Sets the module's verbosity, and returns the previous setting."""
+ last_verbose_level = self.verbose_level
+ self.verbose_level = level
+ return last_verbose_level
+
+ def SetCountingStyle(self, counting_style):
+ """Sets the module's counting options."""
+ self.counting = counting_style
+
+ def SetFilters(self, filters):
+ """Sets the error-message filters.
+
+ These filters are applied when deciding whether to emit a given
+ error message.
+
+ Args:
+ filters: A string of comma-separated filters (eg "+whitespace/indent").
+ Each filter should start with + or -; else we die.
+
+ Raises:
+ ValueError: The comma-separated filters did not all start with '+' or '-'.
+ E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
+ """
+ # Default filters always have less priority than the flag ones.
+ self.filters = _DEFAULT_FILTERS[:]
+ self.AddFilters(filters)
+
+ def AddFilters(self, filters):
+ """ Adds more filters to the existing list of error-message filters. """
+ for filt in filters.split(','):
+ clean_filt = filt.strip()
+ if clean_filt:
+ self.filters.append(clean_filt)
+ for filt in self.filters:
+ if not (filt.startswith('+') or filt.startswith('-')):
+ raise ValueError('Every filter in --filters must start with + or -'
+ ' (%s does not)' % filt)
+
+ def BackupFilters(self):
+ """ Saves the current filter list to backup storage."""
+ self._filters_backup = self.filters[:]
+
+ def RestoreFilters(self):
+ """ Restores filters previously backed up."""
+ self.filters = self._filters_backup[:]
+
+ def ResetErrorCounts(self):
+ """Sets the module's error statistic back to zero."""
+ self.error_count = 0
+ self.errors_by_category = {}
+
+ def IncrementErrorCount(self, category):
+ """Bumps the module's error statistic."""
+ self.error_count += 1
+ if self.counting in ('toplevel', 'detailed'):
+ if self.counting != 'detailed':
+ category = category.split('/')[0]
+ if category not in self.errors_by_category:
+ self.errors_by_category[category] = 0
+ self.errors_by_category[category] += 1
+
+ def PrintErrorCounts(self):
+ """Print a summary of errors by category, and the total."""
+ for category, count in self.errors_by_category.iteritems():
+ sys.stderr.write('Category \'%s\' errors found: %d\n' %
+ (category, count))
+ sys.stdout.write('Total errors found: %d\n' % self.error_count)
+
# Module-level singleton holding cpplint's state; the helper functions that
# follow read and update this instance.
_cpplint_state = _CppLintState()
+
+
def _OutputFormat():
  """Returns the output format stored in the module-wide state."""
  current_format = _cpplint_state.output_format
  return current_format
+
+
def _SetOutputFormat(output_format):
  """Stores output_format as the module-wide output format."""
  state = _cpplint_state
  state.SetOutputFormat(output_format)
+
def _Quiet():
  """Returns the module's quiet setting."""
  is_quiet = _cpplint_state.quiet
  return is_quiet
+
def _SetQuiet(quiet):
  """Updates the module's quiet setting and hands back the previous one."""
  previous_quiet = _cpplint_state.SetQuiet(quiet)
  return previous_quiet
+
+
def _VerboseLevel():
  """Returns the verbosity level stored in the module-wide state."""
  level = _cpplint_state.verbose_level
  return level
+
+
def _SetVerboseLevel(level):
  """Updates the module's verbosity and hands back the previous level."""
  previous_level = _cpplint_state.SetVerboseLevel(level)
  return previous_level
+
+
def _SetCountingStyle(level):
  """Stores level as the module's error counting style."""
  state = _cpplint_state
  state.SetCountingStyle(level)
+
+
def _Filters():
  """Returns the list of output filters held by the module-wide state."""
  active_filters = _cpplint_state.filters
  return active_filters
+
+
def _SetFilters(filters):
  """Replaces the module's error-message filters.

  The filters decide whether a given error message is emitted.

  Args:
    filters: A string of comma-separated filters (eg "+whitespace/indent").
             Each filter should start with + or -; else we die.
  """
  state = _cpplint_state
  state.SetFilters(filters)
+
def _AddFilters(filters):
  """Appends filter overrides to the module's current filters.

  In contrast to _SetFilters, the filters already in effect are kept.

  Args:
    filters: A string of comma-separated filters (eg "+whitespace/indent").
             Each filter should start with + or -; else we die.
  """
  state = _cpplint_state
  state.AddFilters(filters)
+
def _BackupFilters():
  """Asks the module-wide state to save a copy of its filter list."""
  state = _cpplint_state
  state.BackupFilters()
+
def _RestoreFilters():
  """Asks the module-wide state to restore its backed-up filter list."""
  state = _cpplint_state
  state.RestoreFilters()
+
+class _FunctionState(object):
+ """Tracks current function name and the number of lines in its body."""
+
+ _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc.
+ _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER.
+
+ def __init__(self):
+ self.in_a_function = False
+ self.lines_in_function = 0
+ self.current_function = ''
+
+ def Begin(self, function_name):
+ """Start analyzing function body.
+
+ Args:
+ function_name: The name of the function being tracked.
+ """
+ self.in_a_function = True
+ self.lines_in_function = 0
+ self.current_function = function_name
+
+ def Count(self):
+ """Count line in current function body."""
+ if self.in_a_function:
+ self.lines_in_function += 1
+
+ def Check(self, error, filename, linenum):
+ """Report if too many lines in function body.
+
+ Args:
+ error: The function to call with any errors found.
+ filename: The name of the current file.
+ linenum: The number of the line to check.
+ """
+ if not self.in_a_function:
+ return
+
+ if Match(r'T(EST|est)', self.current_function):
+ base_trigger = self._TEST_TRIGGER
+ else:
+ base_trigger = self._NORMAL_TRIGGER
+ trigger = base_trigger * 2**_VerboseLevel()
+
+ if self.lines_in_function > trigger:
+ error_level = int(math.log(self.lines_in_function / base_trigger, 2))
+ # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
+ if error_level > 5:
+ error_level = 5
+ error(filename, linenum, 'readability/fn_size', error_level,
+ 'Small and focused functions are preferred:'
+ ' %s has %d non-comment lines'
+ ' (error triggered by exceeding %d lines).' % (
+ self.current_function, self.lines_in_function, trigger))
+
+ def End(self):
+ """Stop analyzing function body."""
+ self.in_a_function = False
+
+
+class _IncludeError(Exception):
+ """Indicates a problem with the include order in a file."""
+ pass
+
+
class FileInfo(object):
  """Provides utility functions for filenames.

  FileInfo provides easy access to the components of a file's path
  relative to the project root.
  """

  def __init__(self, filename):
    self._filename = filename

  def FullName(self):
    """Returns the absolute path with backslashes turned into slashes."""
    return os.path.abspath(self._filename).replace('\\', '/')

  def RepositoryName(self):
    r"""FullName after removing the local path to the repository.

    If we have a real absolute path name here we can try to do something smart:
    detecting the root of the checkout and truncating /path/to/checkout from
    the name so that we get header guards that don't include things like
    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
    people on different computers who have checked the source out to different
    locations won't see bogus errors.

    Returns:
      The path relative to the detected checkout root, or FullName() when the
      file does not exist or no checkout root is found.
    """
    def _HasVcsMarker(directory):
      # True if directory contains a .git, .hg or .svn entry.
      return any(os.path.exists(os.path.join(directory, marker))
                 for marker in ('.git', '.hg', '.svn'))

    fullname = self.FullName()

    if os.path.exists(fullname):
      project_dir = os.path.dirname(fullname)

      if os.path.exists(os.path.join(project_dir, ".svn")):
        # If there's a .svn file in the current directory, we recursively look
        # up the directory tree for the top of the SVN checkout
        root_dir = project_dir
        one_up_dir = os.path.dirname(root_dir)
        # Stop at the filesystem root, where dirname() returns its argument;
        # otherwise a .svn entry there would keep this loop spinning forever.
        while (one_up_dir != root_dir and
               os.path.exists(os.path.join(one_up_dir, ".svn"))):
          root_dir = os.path.dirname(root_dir)
          one_up_dir = os.path.dirname(one_up_dir)

        prefix = os.path.commonprefix([root_dir, project_dir])
        return fullname[len(prefix) + 1:]

      # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
      # searching up from the current path.  The walk goes all the way up, so
      # the outermost directory carrying a marker wins.
      root_dir = current_dir = os.path.dirname(fullname)
      while current_dir != os.path.dirname(current_dir):
        if _HasVcsMarker(current_dir):
          root_dir = current_dir
        current_dir = os.path.dirname(current_dir)

      if _HasVcsMarker(root_dir):
        prefix = os.path.commonprefix([root_dir, project_dir])
        return fullname[len(prefix) + 1:]

    # Don't know what to do; header guard warnings may be wrong...
    return fullname

  def Split(self):
    """Splits the file into the directory, basename, and extension.

    For 'chrome/browser/browser.cc', Split() would
    return ('chrome/browser', 'browser', '.cc')

    Returns:
      A tuple of (directory, basename, extension).
    """

    googlename = self.RepositoryName()
    project, rest = os.path.split(googlename)
    return (project,) + os.path.splitext(rest)

  def BaseName(self):
    """File base name - text after the final slash, before the final period."""
    return self.Split()[1]

  def Extension(self):
    """File extension - the final period and the text following it."""
    return self.Split()[2]

  def NoExtension(self):
    """File path (directory and base name) with the extension removed."""
    return '/'.join(self.Split()[0:2])

  def IsSource(self):
    """File has a source file extension."""
    return _IsSourceExtension(self.Extension()[1:])
+
+
def _ShouldPrintError(category, confidence, linenum):
  """Decides whether an error of this category should be reported.

  An error is dropped when it is suppressed for this line, when its
  confidence is below the verbosity level, or when the filters reject its
  category.

  Args:
    category: str, the category of the error.
    confidence: the confidence score attached to the error.
    linenum: int, the line the error was found on.

  Returns:
    True if the error should be printed, False otherwise.
  """
  if IsErrorSuppressedByNolint(category, linenum):
    return False

  if confidence < _cpplint_state.verbose_level:
    return False

  # Filters are applied in order, so the last one matching the category wins.
  filtered_out = False
  for one_filter in _Filters():
    sign, prefix = one_filter[:1], one_filter[1:]
    if sign == '-' or sign == '+':
      if category.startswith(prefix):
        filtered_out = (sign == '-')
    else:
      assert False  # should have been checked for in SetFilter.

  return not filtered_out
+
+
def Error(filename, linenum, category, confidence, message):
  """Reports a lint error on stderr unless it is filtered or suppressed.

  Each reported error is also counted in the module-wide state.  The layout
  of the message depends on the configured output format: 'vs7', 'eclipse',
  or the default layout for any other value.

  Args:
    filename: The name of the file containing the error.
    linenum: The number of the line containing the error.
    category: A string used to describe the "category" this bug
      falls under: "whitespace", say, or "runtime".  Categories
      may have a hierarchy separated by slashes: "whitespace/indent".
    confidence: A number from 1-5 representing a confidence score for
      the error, with 5 meaning that we are certain of the problem,
      and 1 meaning that it could be a legitimate construct.
    message: The error message.
  """
  if not _ShouldPrintError(category, confidence, linenum):
    return

  _cpplint_state.IncrementErrorCount(category)

  output_format = _cpplint_state.output_format
  if output_format == 'vs7':
    report = '%s(%s): error cpplint: [%s] %s [%d]\n' % (
        filename, linenum, category, message, confidence)
  elif output_format == 'eclipse':
    report = '%s:%s: warning: %s [%s] [%d]\n' % (
        filename, linenum, message, category, confidence)
  else:
    report = '%s:%s: %s [%s] [%d]\n' % (
        filename, linenum, message, category, confidence)
  sys.stderr.write(report)
+
+
+# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
+_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
+ r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
+# Match a single C style comment on the same line.
+_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
+# Matches multi-line C style comments.
+# This RE is a little bit more complicated than one might expect, because we
+# have to take care of space removals tools so we can handle comments inside
+# statements better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line. Otherwise, we try to remove spaces from the right side,
+# if this doesn't work we try on left side but only if there's a non-character
+# on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+ r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+ _RE_PATTERN_C_COMMENTS + r'\s+|' +
+ r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+ _RE_PATTERN_C_COMMENTS + r')')
+
+
def IsCppString(line):
  """Tells whether the text right after `line` would be inside a string.

  Comments, single-line or multi-line, are not taken into account.

  Args:
    line: is a partial line of code starting from the 0..n.

  Returns:
    True, if next character appended to 'line' is inside a
    string constant.
  """
  # Neutralize escaped backslashes first, so that \\" is not read as \".
  text = line.replace(r'\\', 'XX')
  all_quotes = text.count('"')
  escaped_quotes = text.count(r'\"')
  char_literal_quotes = text.count("'\"'")
  # An odd number of real string delimiters means a string is still open.
  open_quotes = all_quotes - escaped_quotes - char_literal_quotes
  return open_quotes % 2 == 1
+
+
def CleanseRawStrings(raw_lines):
  """Replaces C++11 raw strings in the given lines with empty strings.

  Before:
    static const char kData[] = R"(
        multi-line string
        )";

  After:
    static const char kData[] = ""
        (replaced by blank line)
        "";

  Args:
    raw_lines: list of raw lines.

  Returns:
    list of lines with C++11 raw strings replaced by empty strings.
  """

  # Closing sequence of the raw string we are inside of, or None.
  delimiter = None
  cleansed = []
  for line in raw_lines:
    if delimiter:
      # Inside a raw string, look for the end
      close_at = line.find(delimiter)
      if close_at < 0:
        # Haven't found the end yet, append a blank line.
        line = '""'
      else:
        # Found the end of the string: keep the leading space of this line,
        # insert a "" and resume copying what follows the delimiter.
        indent = Match(r'^(\s*)\S', line).group(1)
        line = indent + '""' + line[close_at + len(delimiter):]
        delimiter = None

    # Replace every raw string that begins on this line.  Looping handles
    # several raw strings on the same line.
    while delimiter is None:
      # Look for beginning of a raw string.
      # See 2.14.15 [lex.string] for syntax.
      opener = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
      if not opener:
        break
      prefix, tag, remainder = opener.groups()

      # Raw strings are removed before comments are, because some checks need
      # the comments preserved.  So make sure here that the candidate is not
      # simply part of a // comment.
      if Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//', prefix):
        break

      delimiter = ')' + tag + '"'
      close_at = remainder.find(delimiter)
      if close_at < 0:
        # Start of a multi-line raw string
        line = prefix + '""'
      else:
        # Raw string ended on same line
        line = prefix + '""' + remainder[close_at + len(delimiter):]
        delimiter = None

    cleansed.append(line)

  # TODO(unknown): if delimiter is not None here, we might want to
  # emit a warning for unterminated string.
  return cleansed
+
+
def FindNextMultiLineCommentStart(lines, lineix):
  """Returns the index of the next line that opens a multi-line comment.

  Args:
    lines: list of lines to scan.
    lineix: index of the first line to look at.

  Returns:
    The index of the first line at or after lineix that starts with '/*' and
    does not close the comment on the same line, or len(lines) if none does.
  """
  for index in range(lineix, len(lines)):
    stripped = lines[index].strip()
    # Only a comment that goes beyond this line counts as a start marker.
    if stripped.startswith('/*') and stripped.find('*/', 2) < 0:
      return index
  return len(lines)
+
+
def FindNextMultiLineCommentEnd(lines, lineix):
  """Returns the index of the line that closes the current comment.

  Args:
    lines: list of lines to scan.
    lineix: index of the first line to look at; assumed to be in a comment.

  Returns:
    The index of the first line at or after lineix that ends with '*/', or
    len(lines) if there is none.
  """
  for index in range(lineix, len(lines)):
    if lines[index].strip().endswith('*/'):
      return index
  return len(lines)
+
+
def RemoveMultiLineCommentsFromRange(lines, begin, end):
  """Overwrites lines[begin:end] in place with empty comment placeholders.

  Args:
    lines: list of lines, modified in place.
    begin: index of the first line to clear.
    end: index one past the last line to clear.
  """
  # An empty comment, unlike an empty string, keeps the line non-empty, so
  # no unnecessary blank line warnings are raised later on.
  placeholder = '/**/'
  for index in range(begin, end):
    lines[index] = placeholder
+
+
def RemoveMultiLineComments(filename, lines, error):
  """Blanks out every multiline (c-style) comment in lines, in place.

  Args:
    filename: The name of the current file, used when reporting errors.
    lines: list of lines, modified in place.
    error: The function to call when a comment is never closed.
  """
  cursor = 0
  while cursor < len(lines):
    comment_start = FindNextMultiLineCommentStart(lines, cursor)
    if comment_start >= len(lines):
      return
    comment_end = FindNextMultiLineCommentEnd(lines, comment_start)
    if comment_end >= len(lines):
      # Error positions are 1-based, list indexes 0-based.
      error(filename, comment_start + 1, 'readability/multiline_comment', 5,
            'Could not find end of multi-line comment')
      return
    cursor = comment_end + 1
    RemoveMultiLineCommentsFromRange(lines, comment_start, cursor)
+
+
def CleanseComments(line):
  """Strips //-comments and single-line C-style /* */ comments from a line.

  Args:
    line: A line of C++ source.

  Returns:
    The line with single-line comments removed.
  """
  slash_pos = line.find('//')
  # Only the first '//' is examined; it starts a comment unless it falls
  # inside a string constant.
  starts_comment = slash_pos >= 0 and not IsCppString(line[:slash_pos])
  if starts_comment:
    line = line[:slash_pos].rstrip()
  # get rid of /* ... */
  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
class CleansedLines(object):
  """Keeps 4 versions of every line, each preprocessed differently.

  1) elided member contains lines without strings and comments.
  2) lines member contains lines without comments.
  3) raw_lines member contains all the lines without processing.
  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
     strings removed.
  All these members are of <type 'list'>, and of the same length.
  """

  def __init__(self, lines):
    self.elided = []
    self.lines = []
    self.raw_lines = lines
    self.num_lines = len(lines)
    self.lines_without_raw_strings = CleanseRawStrings(lines)
    for raw_free_line in self.lines_without_raw_strings:
      self.lines.append(CleanseComments(raw_free_line))
      collapsed = self._CollapseStrings(raw_free_line)
      self.elided.append(CleanseComments(collapsed))

  def NumLines(self):
    """Returns the number of lines represented."""
    return self.num_lines

  @staticmethod
  def _CollapseStrings(elided):
    """Reduces string and char literals on a line to plain "" or '' blocks.

    Strings are nixed first so that text like '"http://"' cannot fool us.
    Lines matching the include pattern are returned unchanged.

    Args:
      elided: The line being processed.

    Returns:
      The line with collapsed strings.
    """
    if _RE_PATTERN_INCLUDE.match(elided):
      return elided

    # Dropping escaped characters up front keeps the quote matching below
    # basic.  Things that look like escaped characters shouldn't occur
    # outside of strings and chars.
    remaining = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)

    # Single and double quotes are handled in one loop, otherwise nested
    # quotes wouldn't work.  Digit separators are handled here as well.
    pieces = []
    while True:
      # Find the first quote character
      first_quote = Match(r'^([^\'"]*)([\'"])(.*)$', remaining)
      if not first_quote:
        pieces.append(remaining)
        break
      head, quote, tail = first_quote.groups()

      # A single quote preceded by the start of a numeric literal is a digit
      # separator, not the start of a char literal.
      #
      # There is no special handling for floating point here, because
      # the integer/fractional/exponent parts would all be parsed
      # correctly as long as there are digits on both sides of the
      # separator.  So we are fine as long as we don't see something
      # like "0.'3" (gcc 4.9.0 will not allow this literal).
      is_digit_separator = (
          quote == "'" and
          Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head))
      if is_digit_separator:
        literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail)
        pieces.append(head + literal.group(1).replace("'", ''))
        remaining = literal.group(2)
        continue

      closing = tail.find(quote)
      if closing < 0:
        # Unmatched quote, don't bother processing the rest of the line;
        # for a double quote this is probably a multiline string.
        pieces.append(remaining)
        break
      # Collapse the literal to an empty pair of the same quote character.
      pieces.append(head + quote * 2)
      remaining = tail[closing + 1:]

    return ''.join(pieces)
+
+
def FindEndOfExpressionInLine(line, startpos, stack):
  """Find the position just after the end of current parenthesized expression.

  The stack passed in is modified in place while scanning.

  Args:
    line: a CleansedLines line.
    startpos: start searching at this position.
    stack: nesting stack at startpos.

  Returns:
    On finding matching end: (index just after matching end, None)
    On finding an unclosed expression: (-1, None)
    Otherwise: (-1, new stack at end of this line)
  """
  # range() rather than xrange(): the latter does not exist on Python 3.
  for i in range(startpos, len(line)):
    char = line[i]
    if char in '([{':
      # Found start of parenthesized expression, push to expression stack
      stack.append(char)
    elif char == '<':
      # Found potential start of template argument list
      if i > 0 and line[i - 1] == '<':
        # Left shift operator
        if stack and stack[-1] == '<':
          stack.pop()
          if not stack:
            return (-1, None)
      elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
        # operator<, don't add to stack
        continue
      else:
        # Tentative start of template argument list
        stack.append('<')
    elif char in ')]}':
      # Found end of parenthesized expression.
      #
      # If we are currently expecting a matching '>', the pending '<'
      # must have been an operator.  Remove them from expression stack.
      while stack and stack[-1] == '<':
        stack.pop()
      if not stack:
        return (-1, None)
      if ((stack[-1] == '(' and char == ')') or
          (stack[-1] == '[' and char == ']') or
          (stack[-1] == '{' and char == '}')):
        stack.pop()
        if not stack:
          return (i + 1, None)
      else:
        # Mismatched parentheses
        return (-1, None)
    elif char == '>':
      # Found potential end of template argument list.

      # Ignore "->" and operator functions
      if (i > 0 and
          (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))):
        continue

      # Pop the stack if there is a matching '<'.  Otherwise, ignore
      # this '>' since it must be an operator.
      if stack:
        if stack[-1] == '<':
          stack.pop()
          if not stack:
            return (i + 1, None)
    elif char == ';':
      # Found something that look like end of statements.  If we are currently
      # expecting a '>', the matching '<' must have been an operator, since
      # template argument list should not contain statements.
      while stack and stack[-1] == '<':
        stack.pop()
      if not stack:
        return (-1, None)

  # Did not find end of expression or unbalanced parentheses on this line
  return (-1, stack)
+
+
def CloseExpression(clean_lines, linenum, pos):
  """Locates the end of the expression opened at a ( or { or [ or <.

  If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
  linenum/pos that correspond to the closing of the expression.

  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
  Ideally we would want to index all opening and closing parentheses once
  and have CloseExpression be just a simple lookup, but due to preprocessor
  tricks, this is not so easy.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    pos: A position on the line.

  Returns:
    A tuple (line, linenum, pos) pointer *past* the closing brace, or
    (line, len(lines), -1) if we never find a close.  Note we ignore
    strings and comments when matching; and the line we return is the
    'cleansed' line at linenum.
  """

  line = clean_lines.elided[linenum]
  # The position must hold an opening character, and a '<' must not be the
  # start of '<<' or '<='.
  starts_expression = (
      line[pos] in '({[<' and not Match(r'<[<=]', line[pos:]))
  if not starts_expression:
    return (line, clean_lines.NumLines(), -1)

  # Scan the first line, then keep scanning forward for as long as the
  # expression is still open and there are lines left.
  close_at, pending = FindEndOfExpressionInLine(line, pos, [])
  while (close_at <= -1 and pending and
         linenum < clean_lines.NumLines() - 1):
    linenum += 1
    line = clean_lines.elided[linenum]
    close_at, pending = FindEndOfExpressionInLine(line, 0, pending)

  if close_at > -1:
    return (line, linenum, close_at)

  # Did not find end of expression before end of file, give up
  return (line, clean_lines.NumLines(), -1)
+
+
def FindStartOfExpressionInLine(line, endpos, stack):
  """Scans a line right-to-left for the opener of the current expression.

  This mirrors FindEndOfExpressionInLine, except that the position handed
  in and the position handed back differ by one: the scan starts *at*
  endpos and a successful result points *at* the opening bracket.

  The stack argument is modified in place.

  Args:
    line: a CleansedLines line.
    endpos: index at which the backward scan begins.
    stack: nesting stack (list of pending closing brackets) at endpos.

  Returns:
    (index of the matching opener, None) when the expression start is found.
    (-1, None) when the brackets cannot be balanced.
    (-1, stack at the beginning of this line) otherwise.
  """
  # Which closing bracket each opening bracket has to meet on the stack.
  closer_for = {'(': ')', '[': ']', '{': '}'}

  idx = endpos
  while idx >= 0:
    ch = line[idx]

    if ch in ')]}':
      # A closing bracket opens a new nesting level when read backwards.
      stack.append(ch)

    elif ch == '>':
      # Possibly the end of a template argument list, unless it is part
      # of "->", ">=" or "operator>".
      is_operator = idx > 0 and (
          line[idx - 1] == '-' or
          Match(r'\s>=\s', line[idx - 1:]) or
          Search(r'\boperator\s*$', line[0:idx]))
      if is_operator:
        # Skip one extra character on top of the step at the loop bottom.
        idx -= 1
      else:
        stack.append('>')

    elif ch == '<':
      if idx > 0 and line[idx - 1] == '<':
        # "<<" is a shift operator; step over both characters.
        idx -= 1
      elif stack and stack[-1] == '>':
        # Matches a pending '>'.  A '<' with no pending '>' is taken to
        # be a comparison operator and is ignored.
        stack.pop()
        if not stack:
          return (idx, None)

    elif ch in closer_for:
      # Any '>' still pending here must have been an operator.
      while stack and stack[-1] == '>':
        stack.pop()
      if not stack:
        return (-1, None)
      if stack[-1] != closer_for[ch]:
        # Mismatched brackets.
        return (-1, None)
      stack.pop()
      if not stack:
        return (idx, None)

    elif ch == ';':
      # Looks like a statement boundary.  Template argument lists do not
      # contain statements, so pending '>' entries were operators.
      while stack and stack[-1] == '>':
        stack.pop()
      if not stack:
        return (-1, None)

    idx -= 1

  # Reached the beginning of the line with brackets still pending.
  return (-1, stack)
+
+
def ReverseCloseExpression(clean_lines, linenum, pos):
  """Finds where the bracket closed at (linenum, pos) was opened.

  The character at clean_lines.elided[linenum][pos] must be one of
  ')', '}', ']' or '>'.  The search runs over the 'elided' lines, so
  strings and comments are ignored, and it may continue onto preceding
  lines.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line holding the closing bracket.
    pos: The position of the closing bracket on that line.

  Returns:
    A tuple (line, linenum, pos).  When the opening bracket is found, pos
    points *at* it and line is the elided line it sits on.  When it is not
    found, the tuple is (line, 0, -1), where line is the last elided line
    that was examined.
  """
  line = clean_lines.elided[linenum]
  closing = line[pos]
  if closing not in ')}]>':
    return (line, 0, -1)

  # Try to find the opener on the line holding the closer.
  start_pos, stack = FindStartOfExpressionInLine(line, pos, [])
  if start_pos > -1:
    return (line, linenum, start_pos)

  # Walk upwards, carrying the pending closers with us.  A stack of None
  # (or an empty one) means the per-line scan has given up.
  current = linenum
  while stack and current > 0:
    current -= 1
    line = clean_lines.elided[current]
    start_pos, stack = FindStartOfExpressionInLine(line, len(line) - 1, stack)
    if start_pos > -1:
      return (line, current, start_pos)

  # Reached the top of the file (or gave up) without finding the opener.
  return (line, 0, -1)
+
+
def CheckForCopyright(filename, lines, error):
  """Logs an error if no Copyright message appears at the top of the file.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """
  # Index 0 is a placeholder line, so the first ten real lines are
  # lines[1] through lines[10].
  top_of_file = lines[1:11]
  has_copyright = any(re.search(r'Copyright', text, re.I)
                      for text in top_of_file)
  if not has_copyright:
    error(filename, 0, 'legal/copyright', 5,
          'No copyright message found. '
          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
def GetIndentLevel(line):
  """Counts the spaces that precede the first non-whitespace character.

  Args:
    line: A string to check.

  Returns:
    The number of leading space characters when they are directly followed
    by a non-whitespace character; zero in every other case (for example a
    line that is blank or whose indentation contains a tab).
  """
  leading = Match(r'^( *)\S', line)
  return len(leading.group(1)) if leading else 0
+
def PathSplitToList(path):
  """Breaks a path into its components using os.path.split repeatedly.

  Args:
    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')

  Returns:
    A list of path components, outermost first (e.g. 'a/b/c' gives
    ['a', 'b', 'c']).  For an absolute path the first element is the root
    that os.path.split cannot reduce any further; a trailing separator
    yields a final empty component.
  """
  components = []
  remaining = path
  while True:
    parent, leaf = os.path.split(remaining)
    if parent == remaining:
      # Nothing more can be split off: root of an absolute path.
      return [parent] + components
    if leaf == remaining:
      # Nothing more can be split off: first piece of a relative path.
      return [leaf] + components
    components = [leaf] + components
    remaining = parent
+
def GetHeaderGuardCPPVariable(filename):
  """Computes the preprocessor symbol expected as a file's header guard.

  The symbol is derived from the file's repository-relative path, adjusted
  by the module-level _root setting when one is given, with every
  character outside [a-zA-Z0-9] turned into '_', upper-cased, and a
  trailing '_' appended.

  Args:
    filename: The name of a C++ header file.

  Returns:
    The CPP variable that should be used as a header guard in the
    named file.
  """
  # Undo the renaming done when cpplint is invoked from Emacs's flymake.
  original_name = re.sub(r'_flymake\.h$', '.h', filename)
  original_name = re.sub(r'/\.flymake/([^/]*)$', r'/\1', original_name)
  # Spell 'c++' as 'cpp'.
  original_name = original_name.replace('C++', 'cpp').replace('c++', 'cpp')

  fileinfo = FileInfo(original_name)
  repo_relative_path = fileinfo.RepositoryName()

  def _WithoutPrefix(components, prefix):
    """Returns components with prefix removed, or None if it does not lead.

    _WithoutPrefix(['a', 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd']
    _WithoutPrefix(['x', 'y'], ['w', 'z']) -> None
    """
    if components[:len(prefix)] == prefix:
      return components[len(prefix):]
    return None

  def _ApplyRoot():
    """Returns the path the guard is built from, honoring _root."""
    if _root_debug:
      sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n"
                       % (_root, fileinfo.RepositoryName()))

    # Without a --root setting the repository-relative path is used as is.
    if not _root:
      if _root_debug:
        sys.stderr.write("_root unspecified\n")
      return repo_relative_path

    # --root=subdir: strip subdir from the front of the path.
    stripped = _WithoutPrefix(PathSplitToList(repo_relative_path),
                              PathSplitToList(_root))
    if _root_debug:
      sys.stderr.write(
          "_root lstrip (maybe_path=%s, file_path_from_root=%s, _root=%s)\n"
          % (stripped, repo_relative_path, _root))
    if stripped:
      return os.path.join(*stripped)

    # --root=..: the guard gains the outer directory, obtained by making
    # the file's full path relative to the absolute root.
    full_path = fileinfo.FullName()
    root_abspath = os.path.abspath(_root)
    extended = _WithoutPrefix(PathSplitToList(full_path),
                              PathSplitToList(root_abspath))
    if _root_debug:
      sys.stderr.write(
          "_root prepend (maybe_path=%s, full_path=%s, root_abspath=%s)\n"
          % (extended, full_path, root_abspath))
    if extended:
      return os.path.join(*extended)

    # A root that matches neither way (e.g. --root=FAKE_DIR) is ignored.
    if _root_debug:
      sys.stderr.write("_root ignore, returning %s\n" % repo_relative_path)
    return repo_relative_path

  guard_path = _ApplyRoot()
  return re.sub(r'[^a-zA-Z0-9]', '_', guard_path).upper() + '_'
+
+
def CheckForHeaderGuard(filename, clean_lines, error):
  """Verifies the #ifndef / #define / #endif header guard of a header.

  Reports a missing guard, a guard whose name differs from the one
  GetHeaderGuardCPPVariable suggests, and an #endif line that is not
  followed by a comment naming the guard.

  Args:
    filename: The name of the C++ header file.
    clean_lines: A CleansedLines instance containing the file.
    error: The function to call with any errors found.
  """
  raw_lines = clean_lines.lines_without_raw_strings

  # A NOLINT(build/header_guard) comment anywhere in the file disables the
  # whole check.  Because the warning may refer to a line that does not
  # exist, only this exact form is honored, not bare NOLINT or NOLINT(*).
  if any(Search(r'//\s*NOLINT\(build/header_guard\)', text)
         for text in raw_lines):
    return

  cppvar = GetHeaderGuardCPPVariable(filename)

  # Collect the first #ifndef, the first #define and the last #endif.
  ifndef = ''
  ifndef_linenum = 0
  define = ''
  endif = ''
  endif_linenum = 0
  for linenum, text in enumerate(raw_lines):
    words = text.split()
    if len(words) >= 2:
      directive, argument = words[0], words[1]
      if directive == '#ifndef' and not ifndef:
        ifndef = argument
        ifndef_linenum = linenum
      if directive == '#define' and not define:
        define = argument
    # The whole #endif line is kept so its trailing comment can be checked.
    if text.startswith('#endif'):
      endif = text
      endif_linenum = linenum

  has_guard = ifndef and define and ifndef == define
  if not has_guard:
    error(filename, 0, 'build/header_guard', 5,
          'No #ifndef header guard found, suggested CPP variable is: %s' %
          cppvar)
    return

  # The guard should be PATH_FILE_H_; PATH_FILE_H__ is tolerated for
  # backward compatibility and only reported at the lowest severity.
  if ifndef != cppvar:
    error_level = 0 if ifndef == cppvar + '_' else 5
    ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum,
                            error)
    error(filename, ifndef_linenum, 'build/header_guard', error_level,
          '#ifndef header guard has wrong style, please use: %s' % cppvar)

  # Look for a "//" comment naming the guard on the #endif line.
  ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
                          error)
  slash_comment = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
  if slash_comment:
    if slash_comment.group(1) == '_':
      # Deprecated double trailing underscore: low severity warning.
      error(filename, endif_linenum, 'build/header_guard', 0,
            '#endif line should be "#endif // %s"' % cppvar)
    return

  # No "//" comment on the #endif.  If the file uses no "//" comments at
  # all, the compiler may only accept "/**/" comments, so accept those.
  # The first and last entries of raw_lines are not examined.
  uses_slash_comments = any(
      Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', text)
      for text in raw_lines[1:-1])

  if not uses_slash_comments:
    block_comment = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
    if block_comment:
      if block_comment.group(1) == '_':
        # Deprecated double trailing underscore: low severity warning.
        error(filename, endif_linenum, 'build/header_guard', 0,
              '#endif line should be "#endif /* %s */"' % cppvar)
      return

  # Neither comment style names the guard.
  error(filename, endif_linenum, 'build/header_guard', 5,
        '#endif line should be "#endif // %s"' % cppvar)
+
+
def CheckHeaderFileIncluded(filename, include_state, error):
  """Logs an error if a source file does not include its own header.

  The header is the file with the same name but a '.h' extension; nothing
  is reported when that file does not exist on disk or when the source
  file is a test file.

  Args:
    filename: The name of the current file.
    include_state: An object whose include_list holds, per section, the
      (include path, line number) entries seen so far.
    error: The function to call with any errors found.
  """
  fileinfo = FileInfo(filename)

  # Test files are exempt.
  if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()):
    return

  stem_length = len(filename) - len(fileinfo.Extension())
  headerfile = filename[0:stem_length] + '.h'
  if not os.path.exists(headerfile):
    return

  headername = FileInfo(headerfile).RepositoryName()
  first_include = 0
  for section_list in include_state.include_list:
    for entry in section_list:
      included_path = entry[0]
      # Either path containing the other counts as "included".
      if headername in included_path or included_path in headername:
        return
      # Remember where the first include is, to attach the error there.
      if not first_include:
        first_include = entry[1]

  error(filename, first_include, 'build/include', 5,
        '%s should include its header file %s' % (fileinfo.RepositoryName(),
                                                  headername))
+
+
def CheckForBadCharacters(filename, lines, error):
  """Reports every line that contains a problematic character.

  Two kinds of characters are reported:

  1. Unicode replacement characters (U+FFFD).  These indicate that the
     file either contained invalid UTF-8 (likely) or actual replacement
     characters (which it shouldn't).  Invalid UTF-8 next to a newline may
     throw off line numbering.

  2. NUL bytes, which are problematic for some tools.

  A line containing both is reported twice.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """
  replacement_char = u'\ufffd'
  nul_byte = '\0'
  for linenum, text in enumerate(lines):
    if replacement_char in text:
      error(filename, linenum, 'readability/utf8', 5,
            'Line contains invalid UTF-8 (or Unicode replacement character).')
    if nul_byte in text:
      error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.')
+
+
def CheckForNewlineAtEOF(filename, lines, error):
  """Logs an error if the file does not end with a newline character.

  Args:
    filename: The name of the current file.
    lines: An array of strings, each representing a line of the file.
    error: The function to call with any errors found.
  """
  # lines was built by appending two newlines to the original file and
  # splitting on \n.  The file therefore ended in \n exactly when the
  # last-but-one element of lines exists and is the empty string.
  ends_with_newline = len(lines) >= 3 and not lines[-2]
  if not ends_with_newline:
    error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
          'Could not find a newline character at the end of the file.')
+
+
def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
  """Warns about /* ... */ comments or "..." strings left open on a line.

  A line is reported when it opens more /* comments than it closes, or
  when it contains an odd number of unescaped double quotes.  Neither
  construct is handled well by this lint program, so both are flagged
  because they may cause bogus warnings elsewhere.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  # Drop escaped backslashes first: they are harmless, and the second
  # backslash of such a pair could otherwise be mistaken for the start of
  # an escaped quote (\") below.
  text = clean_lines.elided[linenum].replace('\\\\', '')

  opened_comments = text.count('/*')
  closed_comments = text.count('*/')
  if opened_comments > closed_comments:
    error(filename, linenum, 'readability/multiline_comment', 5,
          'Complex multi-line /*...*/-style comment found. '
          'Lint may give bogus warnings. '
          'Consider replacing these with //-style comments, '
          'with #if 0...#endif, '
          'or with more clearly structured multi-line comments.')

  unescaped_quotes = text.count('"') - text.count('\\"')
  if unescaped_quotes % 2:
    error(filename, linenum, 'readability/multiline_string', 5,
          'Multi-line string ("...") found. This lint script doesn\'t '
          'do well with such strings, and may give bogus warnings. '
          'Use C++11 raw strings or concatenation instead.')
+
+
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+# _rand(); // false positive due to substring match.
+# ->rand(); // some member function rand().
+# ACMRandom rand(seed); // some variable named rand.
+# ISAACRandom rand(); // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name. This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+ ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+ ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+ ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+ ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+ ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+ ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+ ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+ ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+ ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+ ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+ ('strtok(', 'strtok_r(',
+ _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+ ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
+ )
+
+
def CheckPosixThreading(filename, clean_lines, linenum, error):
  """Suggests thread-safe replacements for thread-unsafe posix calls.

  Every entry of _THREADING_LIST whose validation pattern is found on the
  line produces one 'runtime/threadsafe_fn' message naming the unsafe
  function and its thread-safe alternative.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  for unsafe_name, safe_name, validation_pattern in _THREADING_LIST:
    # The pattern confirms that the call really is the function in
    # question and not merely something with a similar name.
    if not Search(validation_pattern, text):
      continue
    error(filename, linenum, 'runtime/threadsafe_fn', 2,
          'Consider using %s...) instead of %s...) for improved thread safety.'
          % (safe_name, unsafe_name))
+
+
def CheckVlogArguments(filename, clean_lines, linenum, error):
  """Flags VLOG() calls that are given a symbolic severity.

  VLOG(2) is fine.  VLOG(INFO), VLOG(ERROR), VLOG(WARNING), VLOG(DFATAL)
  and VLOG(FATAL) are reported.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  symbolic_level = Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', text)
  if not symbolic_level:
    return
  error(filename, linenum, 'runtime/vlog', 5,
        'VLOG() should be used with numeric verbosity level. '
        'Use LOG() if you want symbolic severity levels.')
+
# Recognizes a statement of the form "*name++;" or "*name--;" at the start
# of a line (after optional whitespace).  Such a statement moves the
# pointer instead of changing the value it points to.
_RE_PATTERN_INVALID_INCREMENT = re.compile(
    r'^\s*\*\w+(\+\+|--);')


def CheckInvalidIncrement(filename, clean_lines, linenum, error):
  """Reports statements such as *count++ that step the pointer itself.

  In a function like
      void increment_counter(int* count) {
        *count++;
      }
  the statement effectively performs count++, moving the pointer.  It
  should be written as ++*count, (*count)++ or *count += 1 instead.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  if _RE_PATTERN_INVALID_INCREMENT.match(text) is None:
    return
  error(filename, linenum, 'runtime/invalid_increment', 5,
        'Changing pointer instead of value (or unused value of operator*).')
+
+
def IsMacroDefinition(clean_lines, linenum):
  """Tells whether a line belongs to a macro definition.

  Args:
    clean_lines: An indexable sequence of lines.
    linenum: The number of the line to check.

  Returns:
    True if the line starts with #define, or if the preceding line ends
    with a backslash (line continuation); False otherwise.
  """
  starts_define = Search(r'^#define', clean_lines[linenum])
  if starts_define:
    return True

  continues_previous = (linenum > 0 and
                        Search(r'\\$', clean_lines[linenum - 1]))
  return True if continues_previous else False
+
+
def IsForwardClassDeclaration(clean_lines, linenum):
  """Matches a line of the form "class Name;", optionally templated.

  Args:
    clean_lines: An indexable sequence of lines.
    linenum: The number of the line to check.

  Returns:
    The result of the pattern match on the line: truthy when the line is
    a forward class declaration, None otherwise.
  """
  text = clean_lines[linenum]
  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', text)
+
+
+class _BlockInfo(object):
+ """Stores information about a generic block of code."""
+
+ def __init__(self, linenum, seen_open_brace):
+ self.starting_linenum = linenum
+ self.seen_open_brace = seen_open_brace
+ self.open_parentheses = 0
+ self.inline_asm = _NO_ASM
+ self.check_namespace_indentation = False
+
+ def CheckBegin(self, filename, clean_lines, linenum, error):
+ """Run checks that applies to text up to the opening brace.
+
+ This is mostly for checking the text after the class identifier
+ and the "{", usually where the base class is specified. For other
+ blocks, there isn't much to check, so we always pass.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ pass
+
+ def CheckEnd(self, filename, clean_lines, linenum, error):
+ """Run checks that applies to text after the closing brace.
+
+ This is mostly used for checking end of namespace comments.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ pass
+
+ def IsBlockInfo(self):
+ """Returns true if this block is a _BlockInfo.
+
+ This is convenient for verifying that an object is an instance of
+ a _BlockInfo, but not an instance of any of the derived classes.
+
+ Returns:
+ True for this class, False for derived classes.
+ """
+ return self.__class__ == _BlockInfo
+
+
class _ExternCInfo(_BlockInfo):
  """Bookkeeping for an 'extern "C"' block."""

  def __init__(self, linenum):
    # Such a block is registered with its opening brace already seen.
    _BlockInfo.__init__(self, linenum, seen_open_brace=True)
+
+
class _ClassInfo(_BlockInfo):
  """Bookkeeping for a class or struct."""

  def __init__(self, name, class_or_struct, clean_lines, linenum):
    _BlockInfo.__init__(self, linenum, False)
    self.name = name
    self.is_derived = False
    self.check_namespace_indentation = True

    # Members are public by default in a struct, private in a class.
    self.is_struct = class_or_struct == 'struct'
    self.access = 'public' if self.is_struct else 'private'

    # Indentation of the line that starts the class.  The raw line is
    # used instead of the elided one to account for leading comments.
    self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])

    # Guess where the class ends: the first line, from the starting line
    # on, at which '{' and '}' balance out.  Code such as
    #   class A {
    #   } *x = { ...
    # confuses this, but it is good enough for CheckSectionSpacing.
    self.last_line = 0
    brace_depth = 0
    for index in range(linenum, clean_lines.NumLines()):
      text = clean_lines.elided[index]
      brace_depth += text.count('{') - text.count('}')
      if brace_depth == 0:
        self.last_line = index
        break

  def CheckBegin(self, filename, clean_lines, linenum, error):
    # A lone ':' (one that is not part of '::') marks the class as derived.
    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
      self.is_derived = True

  def CheckEnd(self, filename, clean_lines, linenum, error):
    # A DISALLOW macro, if present, should be the last thing in the
    # class.  Walk upwards from the closing brace; any non-blank line met
    # before the macro means something follows it.
    something_follows = False
    for index in range(linenum - 1, self.starting_linenum, -1):
      text = clean_lines.elided[index]
      macro = Search(
          r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' +
          self.name + r'\)',
          text)
      if macro:
        if something_follows:
          error(filename, index, 'readability/constructors', 3,
                macro.group(1) + ' should be the last thing in the class')
        break

      if not Match(r'^\s*$', text):
        something_follows = True

    # The closing brace should line up with the start of the class.  Only
    # a brace preceded by nothing but spaces is checked, which excludes
    # single-line class definitions.
    brace_indent = Match(r'^( *)\}', clean_lines.elided[linenum])
    if brace_indent and len(brace_indent.group(1)) != self.class_indent:
      keyword = 'struct ' if self.is_struct else 'class '
      parent = keyword + self.name
      error(filename, linenum, 'whitespace/indent', 3,
            'Closing brace should be aligned with beginning of %s' % parent)
+
+
class _NamespaceInfo(_BlockInfo):
  """Bookkeeping for a namespace."""

  def __init__(self, name, linenum):
    _BlockInfo.__init__(self, linenum, False)
    # Anonymous namespaces are stored with an empty name.
    self.name = name or ''
    self.check_namespace_indentation = True

  def CheckEnd(self, filename, clean_lines, linenum, error):
    """Check end of namespace comments."""
    closing_line = clean_lines.raw_lines[linenum]

    # Namespaces spanning fewer than 10 lines are not required to carry a
    # terminating comment.  If such a namespace has one anyway, it is
    # still verified below.
    #
    # TODO(unknown): Ideally the check would also apply to short
    # namespaces holding nontrivial things (anything other than forward
    # declarations).  There is no logic to recognize those yet, so size
    # alone decides, which works most of the time.
    is_short = linenum - self.starting_linenum < 10
    if is_short and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b',
                              closing_line):
      return

    # Verify the terminating comment.
    #
    # C style "/* */" comments are accepted too, so that namespaces
    # closed inside preprocessor macros can be cpplint clean, and so is
    # "// end of namespace <name>." with a period at the end.  Nothing
    # else is accepted; otherwise a comment that is merely a substring of
    # the expected one could slip through.
    if self.name:
      # Named namespace.
      expected = (r'^\s*};*\s*(//|/\*).*\bnamespace\s+' +
                  re.escape(self.name) + r'[\*/\.\\\s]*$')
      if not Match(expected, closing_line):
        error(filename, linenum, 'readability/namespace', 5,
              'Namespace should be terminated with "// namespace %s"' %
              self.name)
      return

    # Anonymous namespace.
    if Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', closing_line):
      return

    # For "// namespace anonymous" or "// anonymous namespace (more
    # text)", also mention "// anonymous namespace" as an acceptable form.
    if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b',
             closing_line):
      error(filename, linenum, 'readability/namespace', 5,
            'Anonymous namespace should be terminated with "// namespace"'
            ' or "// anonymous namespace"')
    else:
      error(filename, linenum, 'readability/namespace', 5,
            'Anonymous namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+ """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+ def __init__(self, stack_before_if):
+ # The entire nesting stack before #if
+ self.stack_before_if = stack_before_if
+
+ # The entire nesting stack up to #else
+ self.stack_before_else = []
+
+ # Whether we have already seen #else or #elif
+ self.seen_else = False
+
+
+class NestingState(object):
+ """Holds states related to parsing braces."""
+
+ def __init__(self):
+ # Stack for tracking all braces. An object is pushed whenever we
+ # see a "{", and popped when we see a "}". Only 3 types of
+ # objects are possible:
+ # - _ClassInfo: a class or struct.
+ # - _NamespaceInfo: a namespace.
+ # - _BlockInfo: some other type of block.
+ self.stack = []
+
+ # Top of the previous stack before each Update().
+ #
+ # Because the nesting_stack is updated at the end of each line, we
+ # had to do some convoluted checks to find out what is the current
+ # scope at the beginning of the line. This check is simplified by
+ # saving the previous top of nesting stack.
+ #
+ # We could save the full stack, but we only need the top. Copying
+ # the full nesting stack would slow down cpplint by ~10%.
+ self.previous_stack_top = []
+
+ # Stack of _PreprocessorInfo objects.
+ self.pp_stack = []
+
+ def SeenOpenBrace(self):
+ """Check if we have seen the opening brace for the innermost block.
+
+ Returns:
+ True if we have seen the opening brace, False if the innermost
+ block is still expecting an opening brace.
+ """
+ return (not self.stack) or self.stack[-1].seen_open_brace
+
+ def InNamespaceBody(self):
+ """Check if we are currently one level inside a namespace body.
+
+ Returns:
+ True if top of the stack is a namespace block, False otherwise.
+ """
+ return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+ def InExternC(self):
+ """Check if we are currently one level inside an 'extern "C"' block.
+
+ Returns:
+ True if top of the stack is an extern block, False otherwise.
+ """
+ return self.stack and isinstance(self.stack[-1], _ExternCInfo)
+
+ def InClassDeclaration(self):
+ """Check if we are currently one level inside a class or struct declaration.
+
+ Returns:
+ True if top of the stack is a class/struct, False otherwise.
+ """
+ return self.stack and isinstance(self.stack[-1], _ClassInfo)
+
+ def InAsmBlock(self):
+ """Check if we are currently one level inside an inline ASM block.
+
+ Returns:
+ True if the top of the stack is a block containing inline ASM.
+ """
+ return self.stack and self.stack[-1].inline_asm != _NO_ASM
+
+ def InTemplateArgumentList(self, clean_lines, linenum, pos):
+ """Check if current position is inside template argument list.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ pos: position just after the suspected template argument.
+ Returns:
+ True if (linenum, pos) is inside template arguments.
+ """
+ while linenum < clean_lines.NumLines():
+ # Find the earliest character that might indicate a template argument
+ line = clean_lines.elided[linenum]
+ match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+ if not match:
+ linenum += 1
+ pos = 0
+ continue
+ token = match.group(1)
+ pos += len(match.group(0))
+
+ # These things do not look like template argument list:
+ # class Suspect {
+ # class Suspect x; }
+ if token in ('{', '}', ';'): return False
+
+ # These things look like template argument list:
+ # template <class Suspect>
+ # template <class Suspect = default_value>
+ # template <class Suspect[]>
+ # template <class Suspect...>
+ if token in ('>', '=', '[', ']', '.'): return True
+
+ # Check if token is an unmatched '<'.
+ # If not, move on to the next character.
+ if token != '<':
+ pos += 1
+ if pos >= len(line):
+ linenum += 1
+ pos = 0
+ continue
+
+ # We can't be sure if we just find a single '<', and need to
+ # find the matching '>'.
+ (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+ if end_pos < 0:
+ # Not sure if template argument list or syntax error in file
+ return False
+ linenum = end_line
+ pos = end_pos
+ return False
+
def UpdatePreprocessor(self, line):
  """Track #if/#else/#endif so conditional code does not corrupt nesting.

  Classes are sometimes declared with alternative heads, for example:
    #ifdef SWIG
    struct ResultDetailsPageElementExtensionPoint {
    #else
    struct ResultDetailsPageElementExtensionPoint : public Extension {
    #endif

  Simplifying assumptions (good enough for most files):
  - Everything from #if up to the first #else/#elif/#endif is treated as
    the branch that is taken.
  - Everything from #else/#elif up to #endif is treated as not taken.
    Those lines are still linted, but whatever they do to the nesting
    stack is thrown away at #endif.

  Args:
    line: current line to check.
  """
  if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
    # Opening a conditional: snapshot the nesting stack so the #else
    # branch can later start from the same parsing state.
    snapshot = copy.deepcopy(self.stack)
    self.pp_stack.append(_PreprocessorInfo(snapshot))
    return

  if Match(r'^\s*#\s*(else|elif)\b', line):
    if not self.pp_stack:
      # TODO(unknown): unexpected #else, issue warning?
      return
    conditional = self.pp_stack[-1]
    if not conditional.seen_else:
      # First #else/#elif of this conditional.  The stack as it stands
      # now is the one we keep once the matching #endif is reached.
      conditional.seen_else = True
      conditional.stack_before_else = copy.deepcopy(self.stack)
    # Rewind to the state saved when the #if was opened.
    self.stack = copy.deepcopy(conditional.stack_before_if)
    return

  if Match(r'^\s*#\s*endif\b', line):
    if not self.pp_stack:
      # TODO(unknown): unexpected #endif, issue warning?
      return
    # Drop the record for the conditional that is being closed.
    finished = self.pp_stack.pop()
    if finished.seen_else:
      # Resume from the state recorded at the first #else.  No copy is
      # needed because the popped record held the last reference.
      self.stack = finished.stack_before_else
+
# TODO(unknown): Update() is too long, but we will refactor later.
def Update(self, filename, clean_lines, linenum, error):
  """Advance the nesting state by one source line.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Record what was on top of the stack before this line is processed.
  #
  # Entries are only ever pushed or popped, never edited in place, so
  # keeping a plain reference is enough; copy.deepcopy here would slow
  # cpplint down by ~28%.
  self.previous_stack_top = self.stack[-1] if self.stack else None

  # Keep the preprocessor stack in step.
  self.UpdatePreprocessor(line)

  # Track parenthesis depth so that "struct" keywords appearing inside
  # argument lists are not pushed as new blocks.
  if self.stack:
    top = self.stack[-1]
    paren_delta = line.count('(') - line.count(')')
    top.open_parentheses += paren_delta

    # Inline assembly state machine for the innermost block.
    if top.inline_asm in (_NO_ASM, _END_ASM):
      entering_asm = (paren_delta != 0 and
                      top.open_parentheses == 1 and
                      _MATCH_ASM.match(line))
      # If we are not entering an assembly block, a previous _END_ASM
      # decays to _NO_ASM here.
      top.inline_asm = _INSIDE_ASM if entering_asm else _NO_ASM
    elif top.inline_asm == _INSIDE_ASM and top.open_parentheses == 0:
      # Parentheses balanced again: the assembly block is over.
      top.inline_asm = _END_ASM

  # Peel namespace declarations off the front of the line.  Looping
  # handles several declarations on one line, such as:
  #   namespace proto2 { namespace bridge { class MessageSet; } }
  #
  # The "\b\s*" in the pattern accepts a namespace keyword that is not
  # followed by whitespace, so the namespace checker is not confused;
  # the missing space itself is reported by CheckSpacing.
  namespace_head = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
  while namespace_head:
    namespace_info = _NamespaceInfo(namespace_head.group(1), linenum)
    self.stack.append(namespace_info)

    line = namespace_head.group(2)
    if '{' in line:
      namespace_info.seen_open_brace = True
      line = line.partition('{')[2]
    namespace_head = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)

  # Look for a class declaration in whatever remains.  The pattern
  # tolerates decorated classes such as:
  #   class LOCKABLE API Object {
  #   };
  class_head = Match(
      r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
      r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
      r'(.*)$', line)
  if (class_head and
      (not self.stack or self.stack[-1].open_parentheses == 0)):
    # A "class" keyword may really be a template parameter:
    #   template <class Ignore1,
    #             class Ignore2 = Default<Args>,
    #             template <Args> class Ignore3>
    #   void Function() {};
    #
    # To rule that out, scan forward from the end of the declaration
    # for an unmatched '>'; finding one means we are inside a template
    # argument list.
    declaration_end = len(class_head.group(1))
    if not self.InTemplateArgumentList(clean_lines, linenum,
                                       declaration_end):
      self.stack.append(_ClassInfo(
          class_head.group(3), class_head.group(2),
          clean_lines, linenum))
      line = class_head.group(4)

  # While the innermost block is still waiting for its opening brace,
  # let it run its begin-of-block checks.
  if not self.SeenOpenBrace():
    self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)

  # Inside a class/struct, track the current access specifier.
  if self.stack and isinstance(self.stack[-1], _ClassInfo):
    classinfo = self.stack[-1]
    access_spec = Match(
        r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
        r':(?:[^:]|$)',
        line)
    if access_spec:
      classinfo.access = access_spec.group(2)

      # Access keywords must sit one space deeper than the class.  The
      # check only applies when nothing but whitespace precedes them.
      leading = access_spec.group(1)
      if (len(leading) != classinfo.class_indent + 1 and
          Match(r'^\s*$', leading)):
        keyword = 'struct ' if classinfo.is_struct else 'class '
        parent = keyword + classinfo.name
        slots = access_spec.group(3) or ''
        error(filename, linenum, 'whitespace/indent', 3,
              '%s%s: should be indented +1 space inside %s' % (
                  access_spec.group(2), slots, parent))

  # Walk through the remaining braces, semicolons and closing
  # parentheses on the line, one token at a time.
  while True:
    found = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
    if not found:
      break

    token = found.group(1)
    if token == '{':
      # A pending namespace/class head is completed by this brace;
      # otherwise the brace opens a new block.
      if not self.SeenOpenBrace():
        self.stack[-1].seen_open_brace = True
      elif Match(r'^extern\s*"[^"]*"\s*\{', line):
        self.stack.append(_ExternCInfo(linenum))
      else:
        self.stack.append(_BlockInfo(linenum, True))
        if _MATCH_ASM.match(line):
          self.stack[-1].inline_asm = _BLOCK_ASM

    elif token in (';', ')'):
      # A semicolon before any opening brace most likely ends a
      # forward declaration.  A closing parenthesis before any opening
      # brace most likely ends a function argument list that used an
      # extra "class"/"struct" keyword.  Either way the pending entry
      # is discarded.
      if not self.SeenOpenBrace():
        self.stack.pop()
    else:  # token == '}'
      # Run the end-of-block checks and leave the block.
      if self.stack:
        self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
        self.stack.pop()
    line = found.group(2)
+
def InnermostClass(self):
  """Find the closest enclosing class on the nesting stack.

  Returns:
    The _ClassInfo nearest the top of the stack, or None when no entry
    on the stack is a class.
  """
  # Search from the innermost block outwards.
  for block in reversed(self.stack):
    if isinstance(block, _ClassInfo):
      return block
  return None
+
def CheckCompletedBlocks(self, filename, error):
  """Reports classes and namespaces that were never closed.

  Intended to be called once every line of the file has been processed.

  Args:
    filename: The name of the current file.
    error: The function to call with any errors found.
  """
  # Note: #ifdef constructs that interfere with brace matching can make
  # this report false positives.  See the testBuildClass test in
  # cpplint_unittest.py for an example.
  for unfinished in self.stack:
    if isinstance(unfinished, _ClassInfo):
      message = ('Failed to find complete declaration of class %s' %
                 unfinished.name)
      error(filename, unfinished.starting_linenum, 'build/class', 5,
            message)
    elif isinstance(unfinished, _NamespaceInfo):
      message = ('Failed to find complete declaration of namespace %s' %
                 unfinished.name)
      error(filename, unfinished.starting_linenum, 'build/namespaces', 5,
            message)
+
+
def CheckForNonStandardConstructs(filename, clean_lines, linenum,
                                  nesting_state, error):
  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.

  Complain about several constructs which gcc-2 accepts, but which are
  not standard C++.  Warning about these in lint is one way to ease the
  transition to new compilers.
  - put storage class first (e.g. "static const" instead of "const static").
  - "%lld" instead of "%qd" in printf-type functions.
  - "%1$d" is non-standard in printf-type functions.
  - "\%" is an undefined character escape sequence.
  - text after #endif is not allowed.
  - invalid inner-style forward declaration.
  - >? and <? operators, and their >?= and <?= cousins.

  Additionally, check for constructor/destructor style violations and reference
  members, as it is very convenient to do so while checking for
  gcc-2 compliance.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: A callable to which errors are reported, which takes 5 arguments:
           filename, line number, category, confidence level, and message
  """

  # Remove comments from the line, but leave in strings for now.
  line = clean_lines.lines[linenum]

  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
    error(filename, linenum, 'runtime/printf_format', 3,
          '%q in format strings is deprecated. Use %ll instead.')

  if Search(r'printf\s*\(.*".*%\d+\$', line):
    error(filename, linenum, 'runtime/printf_format', 2,
          '%N$ formats are unconventional. Try rewriting to avoid them.')

  # Remove escaped backslashes before looking for undefined escapes.
  line = line.replace('\\\\', '')

  if Search(r'("|\').*\\(%|\[|\(|{)', line):
    error(filename, linenum, 'build/printf_format', 3,
          '%, [, (, and { are undefined character escapes. Unescape them.')

  # For the rest, work with both comments and strings removed.
  line = clean_lines.elided[linenum]

  if Search(r'\b(const|volatile|void|char|short|int|long'
            r'|float|double|signed|unsigned'
            r'|schar|u?int8|u?int16|u?int32|u?int64)'
            r'\s+(register|static|extern|typedef)\b',
            line):
    error(filename, linenum, 'build/storage_class', 5,
          'Storage-class specifier (static, extern, typedef, etc) should be '
          'at the beginning of the declaration.')

  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
    error(filename, linenum, 'build/endif_comment', 5,
          'Uncommented text after #endif is non-standard. Use a comment.')

  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
    error(filename, linenum, 'build/forward_decl', 5,
          'Inner-style forward declarations are invalid. Remove this line.')

  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
            line):
    error(filename, linenum, 'build/deprecated', 3,
          '>? and <? (max and min) operators are non-standard and deprecated.')

  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
    # TODO(unknown): Could it be expanded safely to arbitrary references,
    # without triggering too many false positives? The first
    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
    # the restriction.
    # Here's the original regexp, for the reference:
    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
    error(filename, linenum, 'runtime/member_string_references', 2,
          'const string& members are dangerous. It is much better to use '
          'alternatives, such as pointers or simple constants.')

  # Everything else in this function operates on class declarations.
  # Return early if the top of the nesting stack is not a class, or if
  # the class head is not completed yet.
  classinfo = nesting_state.InnermostClass()
  if not classinfo or not classinfo.seen_open_brace:
    return

  # The class may have been declared with namespace or classname qualifiers.
  # The constructor and destructor will not have those qualifiers.
  base_classname = classinfo.name.split('::')[-1]

  # Look for single-argument constructors that aren't marked explicit.
  # Technically a valid construct, but against style.
  explicit_constructor_match = Match(
      r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
      r'(?:(?:inline|constexpr)\s+)*%s\s*'
      r'\(((?:[^()]|\([^()]*\))*)\)'
      % re.escape(base_classname),
      line)

  if not explicit_constructor_match:
    return

  is_marked_explicit = explicit_constructor_match.group(1)

  if not explicit_constructor_match.group(2):
    constructor_args = []
  else:
    constructor_args = explicit_constructor_match.group(2).split(',')

  # Collapse arguments so that commas in template parameter lists and
  # function argument parameter lists don't split arguments in two.
  i = 0
  while i < len(constructor_args):
    constructor_arg = constructor_args[i]
    # Keep merging while brackets are unbalanced, but only while there is a
    # following piece to merge.  An unmatched '<' may simply be a less-than
    # operator in a default value (e.g. "bool x = a < b"); without the
    # bounds check that case indexed past the end of the list.
    while (i + 1 < len(constructor_args) and
           (constructor_arg.count('<') > constructor_arg.count('>') or
            constructor_arg.count('(') > constructor_arg.count(')'))):
      constructor_arg += ',' + constructor_args[i + 1]
      del constructor_args[i + 1]
    constructor_args[i] = constructor_arg
    i += 1

  defaulted_args = [arg for arg in constructor_args if '=' in arg]
  noarg_constructor = (not constructor_args or  # empty arg list
                       # 'void' arg specifier
                       (len(constructor_args) == 1 and
                        constructor_args[0].strip() == 'void'))
  onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
                         not noarg_constructor) or
                        # all but at most one arg defaulted
                        (len(constructor_args) >= 1 and
                         not noarg_constructor and
                         len(defaulted_args) >= len(constructor_args) - 1))
  initializer_list_constructor = bool(
      onearg_constructor and
      Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
  copy_constructor = bool(
      onearg_constructor and
      Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
            % re.escape(base_classname), constructor_args[0].strip()))

  if (not is_marked_explicit and
      onearg_constructor and
      not initializer_list_constructor and
      not copy_constructor):
    if defaulted_args:
      error(filename, linenum, 'runtime/explicit', 5,
            'Constructors callable with one argument '
            'should be marked explicit.')
    else:
      error(filename, linenum, 'runtime/explicit', 5,
            'Single-parameter constructors should be marked explicit.')
  elif is_marked_explicit and not onearg_constructor:
    if noarg_constructor:
      error(filename, linenum, 'runtime/explicit', 5,
            'Zero-parameter constructors should not be marked explicit.')
+
+
def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
  """Checks spacing around the parentheses of function calls.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Function calls often sit inside if/for/while/switch conditions, which
  # follow their own, more liberal conventions.  When the line holds such
  # a construct, only the text inside its parentheses is examined, and
  # the stricter function-call rules are applied to that.  Otherwise the
  # whole line is examined.
  control_flow_patterns = (r'\bif\s*\((.*)\)\s*{',
                           r'\bfor\s*\((.*)\)\s*{',
                           r'\bwhile\s*\((.*)\)\s*[{;]',
                           r'\bswitch\s*\((.*)\)\s*{')
  fncall = line
  for pattern in control_flow_patterns:
    condition = Search(pattern, line)
    if condition:
      fncall = condition.group(1)
      break

  # Except in if/for/while/switch, there should never be a space
  # immediately inside parens (eg "f( 3, 4 )"), with an exception for
  # nested parens ( (a+b) + c ).  Likewise there should never be a space
  # before a ( that starts a function argument list.  A ( is taken to
  # start an argument list when the character before the whitespace is
  # legal in a function name (alnum + _) and we're not starting a macro.
  # Pointers and references to arrays and functions are too tricky, so
  # they are recognized with a very simple test and skipped:
  #   " (something)(maybe-something)" or
  #   " (something)(maybe-something," or
  #   " (something)[something]"
  # The contents of [] are assumed short enough never to wrap.
  skip_patterns = (
      # Control structures.
      r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
      # Pointers/references to functions.
      r' \([^)]+\)\([^)]*(\)|,$)',
      # Pointers/references to arrays.
      r' \([^)]+\)\[[^\]]+\]')
  if any(Search(skipped, fncall) for skipped in skip_patterns):
    return

  if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):  # a ( used for a fn call
    error(filename, linenum, 'whitespace/parens', 4,
          'Extra space after ( in function call')
  elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
    error(filename, linenum, 'whitespace/parens', 2,
          'Extra space after (')

  if Search(r'\w\s+\(', fncall):
    tolerated_forms = (
        r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(',
        r'#\s*define|typedef|using\s+\w+\s*=',
        r'\w\s+\((\w+::)*\*\w+\)\(',
        r'\bcase\s+\(')
    if not any(Search(tolerated, fncall) for tolerated in tolerated_forms):
      # TODO(unknown): Space after an operator function seem to be a common
      # error, silence those for now by restricting them to highest
      # verbosity.
      confidence = 0 if Search(r'\boperator_*\b', line) else 4
      error(filename, linenum, 'whitespace/parens', confidence,
            'Extra space before ( in function call')

  # If the ) is followed only by a newline or a { + newline, assume it's
  # part of a control statement (if/while/etc), and don't complain.
  if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
    # When nothing but whitespace precedes the closing parenthesis, a
    # more descriptive message can be given.
    if Search(r'^\s+\)', fncall):
      message = 'Closing ) should be moved to the previous line'
    else:
      message = 'Extra space before )'
    error(filename, linenum, 'whitespace/parens', 2, message)
+
+
def IsBlankLine(line):
  """Tells whether a line has no visible content.

  A line counts as blank when it is empty or made up solely of
  whitespace characters.

  Args:
    line: The text of a single line.

  Returns:
    True, if the given line is blank.
  """
  if not line:
    return True
  return line.isspace()
+
+
def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
                                 error):
  """Runs the namespace indentation check when the current item qualifies.

  Args:
    filename: The name of the current file.
    nesting_state: The nesting state whose stack and previous_stack_top
                   are inspected.
    clean_lines: A CleansedLines instance containing the file.
    line: Forwarded unchanged to the helper checks together with
          clean_lines.elided (presumably the line number; confirm against
          the callers).
    error: The function to call with any errors found.
  """
  stack = nesting_state.stack
  previous_top = nesting_state.previous_stack_top
  elided_lines = clean_lines.elided

  # The item is a candidate when it is nested inside at least one other
  # block, its own block asks for the check, and the block that was on
  # top before this line is the namespace directly enclosing it.
  is_namespace_indent_item = (
      len(stack) > 1 and
      stack[-1].check_namespace_indentation and
      isinstance(previous_top, _NamespaceInfo) and
      previous_top == stack[-2])

  if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
                                     elided_lines, line):
    CheckItemIndentationInNamespace(filename, elided_lines, line, error)
+
+
def CheckForFunctionLengths(filename, clean_lines, linenum,
                            function_state, error):
  """Reports for long function bodies.

  For an overview why this is done, see:
  https://google.github.io/styleguide/cppguide.html#Write_Short_Functions

  Uses a simplistic algorithm assuming other style guidelines
  (especially spacing) are followed.
  Only checks unindented functions, so class members are unchecked.
  Trivial bodies are unchecked, so constructors with huge initializer lists
  may be missed.
  Blank/comment lines are not counted so as to avoid encouraging the removal
  of vertical space and comments just to get through a lint check.
  NOLINT *on the last line of a function* disables this check.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    function_state: Current function name and lines in body so far.
    error: The function to call with any errors found.
  """
  lines = clean_lines.lines
  line = lines[linenum]
  joined_line = ''

  starting_func = False
  regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
  match_result = Match(regexp, line)
  if match_result:
    # If the name is all caps and underscores, figure it's a macro and
    # ignore it, unless it's TEST or TEST_F.
    function_name = match_result.group(1).split()[-1]
    if function_name == 'TEST' or function_name == 'TEST_F' or (
        not Match(r'[A-Z_]+$', function_name)):
      starting_func = True

  if starting_func:
    body_found = False
    # range() rather than the Python-2-only xrange(), so this function does
    # not depend on a compatibility alias being defined elsewhere.
    for start_linenum in range(linenum, clean_lines.NumLines()):
      start_line = lines[start_linenum]
      joined_line += ' ' + start_line.lstrip()
      if Search(r'(;|})', start_line):  # Declarations and trivial functions
        body_found = True
        break                              # ... ignore
      elif Search(r'{', start_line):
        body_found = True
        # The Match above guarantees that the line contains a '(' preceded
        # by name characters, so this Search cannot return None.
        function = Search(r'((\w|:)*)\(', line).group(1)
        if Match(r'TEST', function):    # Handle TEST... macros
          parameter_regexp = Search(r'(\(.*\))', joined_line)
          if parameter_regexp:             # Ignore bad syntax
            function += parameter_regexp.group(1)
        else:
          function += '()'
        function_state.Begin(function)
        break
    if not body_found:
      # No body for the function (or evidence of a non-function) was found.
      error(filename, linenum, 'readability/fn_size', 5,
            'Lint failed to find start of function body.')
  elif Match(r'^\}\s*$', line):  # function end
    function_state.Check(error, filename, linenum)
    function_state.End()
  elif not Match(r'^\s*$', line):
    function_state.Count()  # Count non-blank/non-comment lines.
+
+
# Matches the start of a TODO comment.  Group 1 is the whitespace between
# "//" and "TODO", group 2 the optional "(username)", and group 3 the
# optional single whitespace character (or end of string) that follows.
_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')


def CheckComment(line, filename, linenum, next_line_start, error):
  """Checks a line's // comment for common mistakes.

  Args:
    line: The line in question.
    filename: The name of the current file.
    linenum: The number of the line to check.
    next_line_start: The first non-whitespace column of the next line.
    error: The function to call with any errors found.
  """
  commentpos = line.find('//')
  if commentpos == -1:
    return

  # An odd number of unescaped quotes before the // means it may lie
  # inside a string literal, in which case it is ignored.
  code_before_comment = re.sub(r'\\.', '', line[0:commentpos])
  if code_before_comment.count('"') % 2 != 0:
    return

  # Allow one space for new scopes, two spaces otherwise:
  opens_aligned_scope = (Match(r'^.*{ *//', line) and
                         next_line_start == commentpos)
  if not opens_aligned_scope:
    crowded = ((commentpos >= 1 and
                line[commentpos-1] not in string.whitespace) or
               (commentpos >= 2 and
                line[commentpos-2] not in string.whitespace))
    if crowded:
      error(filename, linenum, 'whitespace/comments', 2,
            'At least two spaces is best between code and comments')

  # Checks for common mistakes in TODO comments.
  comment = line[commentpos:]
  todo = _RE_PATTERN_TODO.match(comment)
  if todo:
    # One whitespace is correct; zero whitespace is handled elsewhere.
    if len(todo.group(1)) > 1:
      error(filename, linenum, 'whitespace/todo', 2,
            'Too many spaces before TODO')

    if not todo.group(2):
      error(filename, linenum, 'readability/todo', 2,
            'Missing username in TODO; it should look like '
            '"// TODO(my_username): Stuff."')

    # Group 3 is None when text follows the TODO directly; only a single
    # space or end-of-comment is accepted.
    if todo.group(3) not in (' ', ''):
      error(filename, linenum, 'whitespace/todo', 2,
            'TODO(my_username) should be followed by a space')

  # If the comment contains an alphanumeric character, there
  # should be a space somewhere between it and the // unless
  # it's a /// or //! Doxygen comment.
  if (Match(r'//[^ ]*\w', comment) and
      not Match(r'(///|//\!)(\s+|$)', comment)):
    error(filename, linenum, 'whitespace/comments', 4,
          'Should have a space between // and comment')
+
+
def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
  """Checks for the correctness of various spacing issues in the code.

  Things checked in this function: redundant blank lines at the start or
  end of a code block, blank lines after public/protected/private, comment
  spacing (delegated to CheckComment), spaces before '[', and spaces around
  the colon in a range-based for loop.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: The function to call with any errors found.
  """

  # Don't use "elided" lines here, otherwise we can't check commented lines.
  # Don't want to use "raw" either, because we don't want to check inside C++11
  # raw strings,
  raw = clean_lines.lines_without_raw_strings
  line = raw[linenum]

  # Before nixing comments, check if the line is blank for no good
  # reason.  This includes the first line after a block is opened, and
  # blank lines at the end of a function (ie, right before a line like '}').
  #
  # Skip all the blank line checks if we are immediately inside a
  # namespace body.  In other words, don't issue blank line warnings
  # for this block:
  #   namespace {
  #
  #   }
  #
  # A warning about missing end of namespace comments will be issued instead.
  #
  # Also skip blank line checks for 'extern "C"' blocks, which are formatted
  # like namespaces.
  if (IsBlankLine(line) and
      not nesting_state.InNamespaceBody() and
      not nesting_state.InExternC()):
    elided = clean_lines.elided
    prev_line = elided[linenum - 1]
    prevbrace = prev_line.rfind('{')
    # TODO(unknown): Don't complain if line before blank line, and line after,
    #                both start with alnums and are indented the same amount.
    #                This ignores whitespace at the start of a namespace block
    #                because those are not usually indented.
    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
      # OK, we have a blank line at the start of a code block.  Before we
      # complain, we check if it is an exception to the rule: The previous
      # non-empty line has the parameters of a function header that are indented
      # 4 spaces (because they did not fit in a 80 column line when placed on
      # the same line as the function name).  We also check for the case where
      # the previous line is indented 6 spaces, which may happen when the
      # initializers of a constructor do not fit into a 80 column line.
      exception = False
      if Match(r' {6}\w', prev_line):  # Initializer list?
        # We are looking for the opening column of initializer list, which
        # should be indented 4 spaces to cause 6 space indentation afterwards.
        search_position = linenum-2
        while (search_position >= 0
               and Match(r' {6}\w', elided[search_position])):
          search_position -= 1
        # The 5-character prefix must be exactly four spaces and a colon.
        exception = (search_position >= 0
                     and elided[search_position][:5] == '    :')
      else:
        # Search for the function arguments or an initializer list.  We use a
        # simple heuristic here: If the line is indented 4 spaces; and we have a
        # closing paren, without the opening paren, followed by an opening brace
        # or colon (for initializer lists) we assume that it is the last line of
        # a function header.  If we have a colon indented 4 spaces, it is an
        # initializer list.
        exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
                           prev_line)
                     or Match(r' {4}:', prev_line))

      if not exception:
        error(filename, linenum, 'whitespace/blank_line', 2,
              'Redundant blank line at the start of a code block '
              'should be deleted.')
    # Ignore blank lines at the end of a block in a long if-else
    # chain, like this:
    #   if (condition1) {
    #     // Something followed by a blank line
    #
    #   } else if (condition2) {
    #     // Something else
    #   }
    if linenum + 1 < clean_lines.NumLines():
      next_line = raw[linenum + 1]
      if (next_line
          and Match(r'\s*}', next_line)
          and next_line.find('} else ') == -1):
        error(filename, linenum, 'whitespace/blank_line', 3,
              'Redundant blank line at the end of a code block '
              'should be deleted.')

    matched = Match(r'\s*(public|protected|private):', prev_line)
    if matched:
      error(filename, linenum, 'whitespace/blank_line', 3,
            'Do not leave a blank line after "%s:"' % matched.group(1))

  # Next, check comments
  next_line_start = 0
  if linenum + 1 < clean_lines.NumLines():
    next_line = raw[linenum + 1]
    next_line_start = len(next_line) - len(next_line.lstrip())
  CheckComment(line, filename, linenum, next_line_start, error)

  # get rid of comments and strings
  line = clean_lines.elided[linenum]

  # You shouldn't have spaces before your brackets, except maybe after
  # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'.
  if (Search(r'\w\s+\[', line) and
      not Search(r'(?:auto&?|delete|return)\s+\[', line)):
    error(filename, linenum, 'whitespace/braces', 5,
          'Extra space before [')

  # In range-based for, we wanted spaces before and after the colon, but
  # not around "::" tokens that might appear.
  if (Search(r'for *\(.*[^:]:[^: ]', line) or
      Search(r'for *\(.*[^: ]:[^:]', line)):
    error(filename, linenum, 'whitespace/forcolon', 2,
          'Missing space around colon in range-based for loop')
+
+
def CheckOperatorSpacing(filename, clean_lines, linenum, error):
  """Checks horizontal whitespace around operators.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Operator methods must not trigger spacing checks.  Their operator
  # symbols are overwritten with underscores, which keeps every other
  # character in its original column.
  #
  # This repeats until nothing is left to mask, to avoid false positives
  # from operators that call operators.
  operator_pattern = r'^(.*\boperator\b)(\S+)(\s*\(.*)$'
  masked = Match(operator_pattern, line)
  while masked:
    line = masked.group(1) + ('_' * len(masked.group(2))) + masked.group(3)
    masked = Match(operator_pattern, line)

  # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
  # Otherwise not.  Only a missing space on one side is looked for here;
  # people sometimes omit it deliberately when aligning ='s across many
  # lines (not that this is behavior that I approve of...)
  tight_assignment = Search(r'[\w.]=', line) or Search(r'=[\w.]', line)
  if tight_assignment:
    excused_by = (
        r'\b(if|while|for) ',
        # Operators taken from [lex.operators] in C++11 standard.
        r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)',
        r'operator=')
    if not any(Search(excuse, line) for excuse in excused_by):
      error(filename, linenum, 'whitespace/operators', 4,
            'Missing spaces around =')

  # It's ok not to have spaces around binary operators like + - * /, but if
  # there's too little whitespace, we get concerned.  It's hard to tell,
  # though, so we punt on this one for now.  TODO.

  # You should always have whitespace around binary operators.
  #
  # Check <= and >= first to avoid false positives with < and >, then
  # check non-include lines for spacing around < and >.
  #
  # If the operator is followed by a comma, assume it's being used in a
  # macro context and don't do any checks.  This avoids false
  # positives.
  #
  # Note that && is not included here.  This is because there are too
  # many false positives due to RValue references.
  comparison = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
  if comparison:
    error(filename, linenum, 'whitespace/operators', 3,
          'Missing spaces around %s' % comparison.group(1))
  elif not Match(r'#.*include', line):
    # Look for < that is not surrounded by spaces.  This is only
    # triggered if both sides are missing spaces, even though
    # technically it should flag if at least one side is missing a
    # space.  This is done to avoid some false positives with shifts.
    less_than = Match(r'^(.*[^\s<])<[^\s=<,]', line)
    if less_than:
      closing = CloseExpression(clean_lines, linenum,
                                len(less_than.group(1)))
      # A negative position means no matching '>' was found, so the '<'
      # is treated as an operator rather than an opening angle bracket.
      if closing[2] <= -1:
        error(filename, linenum, 'whitespace/operators', 3,
              'Missing spaces around <')

    # Look for > that is not surrounded by spaces.  Similar to the
    # above, we only trigger if both sides are missing spaces to avoid
    # false positives with shifts.
    greater_than = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
    if greater_than:
      opening = ReverseCloseExpression(clean_lines, linenum,
                                       len(greater_than.group(1)))
      if opening[2] <= -1:
        error(filename, linenum, 'whitespace/operators', 3,
              'Missing spaces around >')

  # We allow no-spaces around << when used like this: 10<<20, but
  # not otherwise (particularly, not when used as streams)
  #
  # We also allow operators following an opening parenthesis, since
  # those tend to be macros that deal with operators.
  left_shift = Search(
      r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line)
  if left_shift:
    before, after = left_shift.group(1), left_shift.group(2)
    numeric_shift = before.isdigit() and after.isdigit()
    operator_declaration = before == 'operator' and after == ';'
    if not numeric_shift and not operator_declaration:
      error(filename, linenum, 'whitespace/operators', 3,
            'Missing spaces around <<')

  # We allow no-spaces around >> for almost anything.  This is because
  # C++11 allows ">>" to close nested templates, which accounts for
  # most cases when ">>" is not followed by a space.
  #
  # We still warn on ">>" followed by alpha character, because that is
  # likely due to ">>" being used for right shifts, e.g.:
  #   value >> alpha
  #
  # When ">>" is used to close templates, the alphanumeric letter that
  # follows would be part of an identifier, and there should still be
  # a space separating the template type and the identifier.
  #   type<type<type>> alpha
  if Search(r'>>[a-zA-Z_]', line):
    error(filename, linenum, 'whitespace/operators', 3,
          'Missing spaces around >>')

  # There shouldn't be space around unary operators
  unary = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
  if unary:
    error(filename, linenum, 'whitespace/operators', 4,
          'Extra space for operator %s' % unary.group(1))
+
+
def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
  """Reports spacing problems around the parentheses of control statements.

  Two things are checked on the elided line: a control keyword written
  directly against its opening parenthesis (e.g. "if("), and uneven or
  excessive padding just inside the parentheses of an if/for/while/switch
  header whose line ends with "{".

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]

  # "if(", "for(", "while(" and "switch(" need a space before the "(".
  glued = Search(r' (if\(|for\(|while\(|switch\()', text)
  if glued:
    error(filename, linenum, 'whitespace/parens', 5,
          'Missing space before ( in %s' % glued.group(1))

  # Capture the padding on the inner side of each parenthesis:
  #   group(2) = spaces after "(", group(3) = the character following that
  #   padding, group(4) = spaces before ")".
  # We don't want "if ( foo)" or "if ( foo   )".
  header = Search(r'\b(if|for|while|switch)\s*'
                  r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
                  text)
  if not header:
    return

  keyword = header.group(1)
  left_pad = header.group(2)
  right_pad = header.group(4)

  if len(left_pad) != len(right_pad):
    # Empty for-clauses are exempt: "for ( ; foo; bar)" has one extra space
    # on the left, and "for (foo; bar; )" has one extra space on the right.
    exempt = (header.group(3) == ';' and
              len(left_pad) == 1 + len(right_pad))
    if not exempt:
      exempt = not left_pad and Search(r'\bfor\s*\(.*; \)', text)
    if not exempt:
      error(filename, linenum, 'whitespace/parens', 5,
            'Mismatching spaces inside () in %s' % keyword)

  # Independently of symmetry, at most one space is allowed inside.
  if len(left_pad) not in [0, 1]:
    error(filename, linenum, 'whitespace/parens', 5,
          'Should have zero or one spaces inside ( and ) in %s' %
          keyword)
+
+
def CheckCommaSpacing(filename, clean_lines, linenum, error):
  """Reports a missing space after a comma or a semicolon.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  unelided = clean_lines.lines_without_raw_strings
  code = clean_lines.elided[linenum]

  # A comma (function argument separator or comma operator) must be followed
  # by whitespace.  A comma directly followed by another comma is accepted,
  # since that only happens for empty macro arguments.
  #
  # The check runs in two passes.  The first pass looks at the elided line,
  # with "operator,(" rewritten to "F(" so the overloaded comma operator is
  # not flagged.  The second pass looks at the line before comments were
  # elided, to confirm the missing whitespace is not merely the result of a
  # comment having been removed.
  without_comma_operator = ReplaceAll(r'\boperator\s*,\s*\(', 'F(', code)
  if Search(r',[^,\s]', without_comma_operator):
    if Search(r',[^,\s]', unelided[linenum]):
      error(filename, linenum, 'whitespace/comma', 3,
            'Missing space after ,')

  # A semicolon must be followed by whitespace, except in a few corner cases
  # (followed by "}", ";", a backslash, ")" or "/").
  # TODO(unknown): clarify whether 'if (1) { return 1;}' requires one more
  # space after the ;
  if Search(r';[^\s};\\)/]', code):
    error(filename, linenum, 'whitespace/semicolon', 3,
          'Missing space after ;')
+
+
def _IsType(clean_lines, nesting_state, expr):
  """Check if expression looks like a type name, returns true if so.

  Only the last token of `expr` is examined.  It is considered a type when
  it matches the module-level _TYPES pattern, or when an enclosing block is
  preceded by a "template" declaration that introduces the token through
  "typename", "class" or "struct".

  Args:
    clean_lines: A CleansedLines instance containing the file.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    expr: The expression to check.
  Returns:
    True, if token looks like a type.
  """
  # Keep only the last token in the expression
  last_word = Match(r'^.*(\b\S+)$', expr)
  if last_word:
    token = last_word.group(1)
  else:
    token = expr

  # Match native types and stdint types
  if _TYPES.match(token):
    return True

  # Try a bit harder to match templated types.  Walk up the nesting
  # stack until we find something that resembles a typename
  # declaration for what we are looking for.
  typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) +
                      r'\b')
  block_index = len(nesting_state.stack) - 1
  while block_index >= 0:
    # Template parameters do not reach across a namespace boundary, so stop
    # as soon as the walk hits one.
    if isinstance(nesting_state.stack[block_index], _NamespaceInfo):
      return False

    # Found where the opening brace is.  We want to scan from this
    # line up to the beginning of the function, minus a few lines.
    #   template <typename Type1,  // stop scanning here
    #             ...>
    #   class C
    #     : public ... {  // start scanning here
    last_line = nesting_state.stack[block_index].starting_linenum

    # Never scan further up than the start of the enclosing block.
    next_block_start = 0
    if block_index > 0:
      next_block_start = nesting_state.stack[block_index - 1].starting_linenum
    first_line = last_line
    while first_line >= next_block_start:
      if clean_lines.elided[first_line].find('template') >= 0:
        break
      first_line -= 1
    if first_line < next_block_start:
      # Didn't find any "template" keyword before reaching the next block,
      # there are probably no template things to check for this block
      block_index -= 1
      continue

    # Look for typename in the specified range.  range() is used rather than
    # the Python-2-only xrange(); the span is a handful of lines, so
    # materializing a list on Python 2 costs nothing.
    for i in range(first_line, last_line + 1, 1):
      if Search(typename_pattern, clean_lines.elided[i]):
        return True
    block_index -= 1

  return False
+
+
def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error):
  """Checks for horizontal spacing near braces and trailing semicolons.

  Reports a missing space before "{", a missing space in "}else", and
  suspicious semicolons at the end of a line (an empty statement, or a
  semicolon preceded by whitespace).

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Except after an opening paren, or after another opening brace (in case of
  # an initializer list, for instance), you should have spaces before your
  # braces when they are delimiting blocks, classes, namespaces etc.
  # And since you should never have braces at the beginning of a line,
  # this is an easy test.  Except that braces used for initialization don't
  # follow the same rule; we often don't want spaces before those.
  match = Match(r'^(.*[^ ({>]){', line)

  if match:
    # Try a bit harder to check for brace initialization.  This
    # happens in one of the following forms:
    #   Constructor() : initializer_list_{} { ... }
    #   Constructor{}.MemberFunction()
    #   Type variable{};
    #   FunctionCall(type{}, ...);
    #   LastArgument(..., type{});
    #   LOG(INFO) << type{} << " ...";
    #   map_of_type[{...}] = ...;
    #   ternary = expr ? new type{} : nullptr;
    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
    #
    # We check for the character following the closing brace, and
    # silence the warning if it's one of those listed above, i.e.
    # "{.;,)<>]:".
    #
    # To account for nested initializer list, we allow any number of
    # closing braces up to "{;,)<".  We can't simply silence the
    # warning on first sight of closing brace, because that would
    # cause false negatives for things that are not initializer lists.
    #   Silence this:         But not this:
    #     Outer{                if (...) {
    #       Inner{...}            if (...){  // Missing space before {
    #     };                    }
    #
    # There is a false negative with this approach if people inserted
    # spurious semicolons, e.g. "if (cond){};", but we will catch the
    # spurious semicolon with a separate check.
    leading_text = match.group(1)
    (endline, endlinenum, endpos) = CloseExpression(
        clean_lines, linenum, len(leading_text))
    trailing_text = ''
    if endpos > -1:
      trailing_text = endline[endpos:]
    # Append up to two following lines so that a character that decides the
    # case can be found even when it sits on a later line.  range() is used
    # instead of the Python-2-only xrange().
    for offset in range(endlinenum + 1,
                        min(endlinenum + 3, clean_lines.NumLines() - 1)):
      trailing_text += clean_lines.elided[offset]
    # We also suppress warnings for `uint64_t{expression}` etc., as the style
    # guide recommends brace initialization for integral types to avoid
    # overflow/truncation.
    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
        and not _IsType(clean_lines, nesting_state, leading_text)):
      error(filename, linenum, 'whitespace/braces', 5,
            'Missing space before {')

  # Make sure '} else {' has spaces.
  if Search(r'}else', line):
    error(filename, linenum, 'whitespace/braces', 5,
          'Missing space before else')

  # You shouldn't have a space before a semicolon at the end of the line.
  # There's a special case for "for" since the style guide allows space before
  # the semicolon there.
  if Search(r':\s*;\s*$', line):
    error(filename, linenum, 'whitespace/semicolon', 5,
          'Semicolon defining empty statement. Use {} instead.')
  elif Search(r'^\s*;\s*$', line):
    error(filename, linenum, 'whitespace/semicolon', 5,
          'Line contains only semicolon. If this should be an empty statement, '
          'use {} instead.')
  elif (Search(r'\s+;\s*$', line) and
        not Search(r'\bfor\b', line)):
    error(filename, linenum, 'whitespace/semicolon', 5,
          'Extra space before last semicolon. If this should be an empty '
          'statement, use {} instead.')
+
+
def IsDecltype(clean_lines, linenum, column):
  """Tells whether the parenthesized token ending at a position is decltype().

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: the number of the line to check.
    column: end column of the token to check.
  Returns:
    True if this token is decltype() expression, False otherwise.
  """
  (opening_text, _, opening_col) = ReverseCloseExpression(
      clean_lines, linenum, column)
  # A negative column means no matching opening bracket was found.
  if opening_col < 0:
    return False
  # The text just before the opening bracket must end with "decltype".
  return bool(Search(r'\bdecltype\s*$', opening_text[0:opening_col]))
+
+
def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
  """Checks that access specifiers inside a class follow a blank line.

  The only thing checked here is that a "public:", "protected:" or
  "private:" label is preceded by a blank line.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    class_info: A _ClassInfo objects.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  class_start = class_info.starting_linenum

  # Small classes (25 lines or less, roughly one terminal screen) are not
  # checked.  If the end of the class was never found, last_line is zero and
  # this test also skips the check.
  if class_info.last_line - class_start <= 24:
    return
  # Nothing to do on the first line of the class; this covers one-liners
  # such as:  class Foo { public: ... };
  if linenum <= class_start:
    return

  label = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
  if not label:
    return

  # No warning when the preceding line is blank, mentions "class"/"struct"
  # (we are at the start of the class, or right after a forward declaration
  # of an inner class), or ends with a backslash (class defined in a macro).
  above = clean_lines.lines[linenum - 1]
  if (IsBlankLine(above) or
      Search(r'\b(class|struct)\b', above) or
      Search(r'\\$', above)):
    return

  # Locate the line that ends the class head, to allow for multi-line
  # base-specifier lists such as:
  #   class Derived
  #       : public Base {
  head_end = class_start
  for idx in range(class_start, linenum):
    if Search(r'\{\s*$', clean_lines.lines[idx]):
      head_end = idx
      break

  # A label directly after the class head is fine; anything later is not.
  if head_end < linenum - 1:
    error(filename, linenum, 'whitespace/blank_line', 3,
          '"%s:" should be preceded by a blank line' % label.group(1))
+
+
def GetPreviousNonBlankLine(clean_lines, linenum):
  """Finds the closest non-blank elided line above the given line.

  Args:
    clean_lines: A CleansedLines instance containing the file contents.
    linenum: The number of the line to check.

  Returns:
    A (contents, line number) tuple describing the last non-blank line
    before the current line.  If every earlier line is blank (or there is
    no earlier line), the tuple is ('', -1).
  """
  # Walk upwards from the line just above linenum to line 0.
  for candidate in range(linenum - 1, -1, -1):
    contents = clean_lines.elided[candidate]
    if IsBlankLine(contents):
      continue
    return (contents, candidate)
  return ('', -1)
+
+
def CheckBraces(filename, clean_lines, linenum, error):
  """Reports misplaced braces and brace-less if/else bodies that look wrong.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  elided = clean_lines.elided
  code = elided[linenum]  # comments and strings are already stripped

  # --- "{" alone on its own line -------------------------------------------
  if Match(r'\s*{\s*$', code):
    # A lone opening brace is tolerated when it plausibly opens an explicit
    # scope or a brace initializer.  This is not detected perfectly: we just
    # don't complain if the previous non-blank line ends with one of
    # , ; : ( { } or is a preprocessor line.  A brace on the following line
    # is also accepted for an array initialization ("[]" on the previous
    # line) that would not fit within the line-length limit.
    before = GetPreviousNonBlankLine(clean_lines, linenum)[0]
    tolerated = (Search(r'[,;:}{(]\s*$', before) or
                 Match(r'\s*#', before) or
                 (GetLineWidth(before) > _line_length - 2 and '[]' in before))
    if not tolerated:
      error(filename, linenum, 'whitespace/braces', 4,
            '{ should almost always be at the end of the previous line')

  # --- "else" starting a line right after a line holding only "}" ----------
  if Match(r'\s*else\b\s*(?:if\b|\{|$)', code):
    before = GetPreviousNonBlankLine(clean_lines, linenum)[0]
    if Match(r'\s*}\s*$', before):
      error(filename, linenum, 'whitespace/newline', 4,
            'An else should appear on the same line as the preceding }')

  # --- braces on one side of an else must be matched on the other ----------
  if Search(r'else if\s*\(', code):
    # The condition of an "else if" may span several lines, so locate its
    # closing parenthesis before looking for the brace on the right.
    left_brace = bool(Search(r'}\s*else if\s*\(', code))
    paren_at = code.find('(', code.find('else if'))
    if paren_at > 0:
      (cond_line, _, cond_end) = CloseExpression(clean_lines, linenum,
                                                 paren_at)
      right_brace = '{' in cond_line[cond_end:]
      if left_brace != right_brace:
        error(filename, linenum, 'readability/braces', 5,
              'If an else has a brace on one side, it should have it on both')
  elif Search(r'}\s*else[^{]*$', code) or Match(r'[^}]*else\s*{', code):
    error(filename, linenum, 'readability/braces', 5,
          'If an else has a brace on one side, it should have it on both')

  # --- the statement of an else must not share the line with the else ------
  if Search(r'\belse [^\s{]', code) and not Search(r'\belse if\b', code):
    error(filename, linenum, 'whitespace/newline', 4,
          'Else clause should never be on same line as else (use 2 lines)')

  # --- same for do/while ---------------------------------------------------
  if Match(r'\s*do [^\s{]', code):
    error(filename, linenum, 'whitespace/newline', 4,
          'do/while clauses should not be on a single line')

  # --- brace-less if/else bodies -------------------------------------------
  # Braces are optional for single statements, including a single statement
  # wrapped over several lines, but anything with more than one semicolon is
  # rejected.  So the first semicolon after the if must end its line, and
  # the line after that must not be indented deeper than the if.  Ambiguous
  # if/else nesting without braces is reported as well.
  branch = Search(r'\b(if\s*\(|else\b)', code)
  if not branch or Match(r'\s*#', code):
    return

  if_indent = GetIndentLevel(code)
  tail_line, tail_linenum, tail_pos = code, linenum, branch.end()
  if_match = Search(r'\bif\s*\(', code)
  if if_match:
    # The condition could be multi-line, so find where it ends first.
    (tail_line, tail_linenum, tail_pos) = CloseExpression(
        clean_lines, linenum, if_match.end() - 1)

  # An opening brace directly after the condition, or at the start of the
  # next line, means this is not a single-statement conditional.
  if Match(r'\s*{', tail_line[tail_pos:]):
    return
  if (Match(r'\s*$', tail_line[tail_pos:])
      and tail_linenum < (len(elided) - 1)
      and Match(r'\s*{', elided[tail_linenum + 1])):
    return

  # Advance to the first line that contains a semicolon.
  while (tail_linenum < len(elided)
         and ';' not in elided[tail_linenum][tail_pos:]):
    tail_linenum += 1
    tail_pos = 0
  if tail_linenum >= len(elided):
    return

  tail_line = elided[tail_linenum]
  tail_pos = tail_line.find(';')
  # After the semicolon we accept whitespace and closing braces (one-liner
  # methods) plus a single trailing backslash (macros).
  if not Match(r';[\s}]*(\\?)$', tail_line[tail_pos:]):
    # Something else trails the semicolon.  Warn unless the semicolon is
    # contained inside a lambda expression.
    if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
                 tail_line):
      error(filename, linenum, 'readability/braces', 4,
            'If/else bodies with multiple statements require braces')
    return

  if tail_linenum >= len(elided) - 1:
    return

  # The line after the statement must be dedented.
  following = elided[tail_linenum + 1]
  following_indent = GetIndentLevel(following)
  # With ambiguous nested if statements, this reports the if that *doesn't*
  # match the else, regardless of whether it's the inner or the outer one.
  if (if_match and Match(r'\s*else\b', following)
      and following_indent != if_indent):
    error(filename, linenum, 'readability/braces', 4,
          'Else clause should be indented at the same level as if. '
          'Ambiguous nested if/else chains require braces.')
  elif following_indent > if_indent:
    error(filename, linenum, 'readability/braces', 4,
          'If/else bodies with multiple statements require braces')
+
+
def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
  """Looks for redundant trailing semicolon.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """

  line = clean_lines.elided[linenum]

  # Block bodies should not be followed by a semicolon.  Due to C++11
  # brace initialization, there are more places where semicolons are
  # required than not, so we explicitly list the allowed rules rather
  # than listing the disallowed ones.  These are the places where "};"
  # should be replaced by just "}":
  # 1. Some flavor of block following closing parenthesis:
  #    for (;;) {};
  #    while (...) {};
  #    switch (...) {};
  #    Function(...) {};
  #    if (...) {};
  #    if (...) else if (...) {};
  #
  # 2. else block:
  #    if (...) else {};
  #
  # 3. const member function:
  #    Function(...) const {};
  #
  # 4. Block following some statement:
  #    x = 42;
  #    {};
  #
  # 5. Block at the beginning of a function:
  #    Function(...) {
  #      {};
  #    }
  #
  #    Note that naively checking for the preceding "{" will also match
  #    braces inside multi-dimensional arrays, but this is fine since
  #    that expression will not contain semicolons.
  #
  # 6. Block following another block:
  #    while (true) {}
  #    {};
  #
  # 7. End of namespaces:
  #    namespace {};
  #
  #    These semicolons seems far more common than other kinds of
  #    redundant semicolons, possibly due to people converting classes
  #    to namespaces.  For now we do not warn for this case.
  #
  # Try matching case 1 first.
  match = Match(r'^(.*\)\s*)\{', line)
  if match:
    # Matched closing parenthesis (case 1).  Check the token before the
    # matching opening parenthesis, and don't warn if it looks like a
    # macro.  This avoids these false positives:
    #  - macro that defines a base class
    #  - multi-line macro that defines a base class
    #  - macro that defines the whole class-head
    #
    # But we still issue warnings for macros that we know are safe to
    # warn, specifically:
    #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
    #  - TYPED_TEST
    #  - INTERFACE_DEF
    #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
    #
    # We implement a list of safe macros instead of a list of
    # unsafe macros, even though the latter appears less frequently in
    # google code and would have been easier to implement.  This is because
    # the downside for getting the allowed checks wrong means some extra
    # semicolons, while the downside for getting disallowed checks wrong
    # would result in compile errors.
    #
    # In addition to macros, we also don't want to warn on
    #  - Compound literals
    #  - Lambdas
    #  - alignas specifier with anonymous structs
    #  - decltype
    closing_brace_pos = match.group(1).rfind(')')
    opening_parenthesis = ReverseCloseExpression(
        clean_lines, linenum, closing_brace_pos)
    if opening_parenthesis[2] > -1:
      # Text that precedes the opening parenthesis on its own line.
      line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
      macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix)
      func = Match(r'^(.*\])\s*$', line_prefix)
      # 'TEST_P' is included so that the tuple agrees with the list of
      # safe-to-warn macros documented above; it was previously missing, which
      # silently exempted "TEST_P(...) {};" from the check.
      if ((macro and
           macro.group(1) not in (
               'TEST', 'TEST_F', 'TEST_P', 'MATCHER', 'MATCHER_P',
               'TYPED_TEST',
               'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
               'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
          (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
          Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
          Search(r'\bdecltype$', line_prefix) or
          Search(r'\s+=\s*$', line_prefix)):
        match = None
    if (match and
        opening_parenthesis[1] > 1 and
        Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
      # Multi-line lambda-expression
      match = None

  else:
    # Try matching cases 2-3.
    match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
    if not match:
      # Try matching cases 4-6.  These are always matched on separate lines.
      #
      # Note that we can't simply concatenate the previous line to the
      # current line and do a single match, otherwise we may output
      # duplicate warnings for the blank line case:
      #   if (cond) {
      #     // blank line
      #   }
      prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
      if prevline and Search(r'[;{}]\s*$', prevline):
        match = Match(r'^(\s*)\{', line)

  # Check matching closing brace
  if match:
    (endline, endlinenum, endpos) = CloseExpression(
        clean_lines, linenum, len(match.group(1)))
    if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
      # Current {} pair is eligible for semicolon check, and we have found
      # the redundant semicolon, output warning here.
      #
      # Note: because we are scanning forward for opening braces, and
      # outputting warnings for the matching closing brace, if there are
      # nested blocks with trailing semicolons, we will get the error
      # messages in reversed order.

      # The warning is attached to a later line than the one being checked,
      # so parse NOLINT suppressions for that line (and the one before it)
      # before reporting.
      raw_lines = clean_lines.raw_lines
      ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1,
                              error)
      ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum,
                              error)

      error(filename, endlinenum, 'readability/braces', 4,
            "You don't need a ; after a }")
+
+
def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
  """Reports loops/conditionals whose body is a lone semicolon or is empty.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  # Only keywords at the beginning of the line (after optional whitespace)
  # are considered.  This also ignores most do-while loops, since those lines
  # should start with a closing brace.  "if" is handled too, because an empty
  # conditional block is likely an error.
  line = clean_lines.elided[linenum]
  matched = Match(r'\s*(for|while|if)\s*\(', line)
  if not matched:
    return
  keyword = matched.group(1)

  # Locate the end of the parenthesized condition.
  (cond_line, cond_linenum, cond_end) = CloseExpression(
      clean_lines, linenum, line.find('('))
  if cond_end < 0:
    # No matching ")": neither of the checks below applies.
    return

  # A semicolon immediately after the condition is an empty body.  Whitespace
  # or a newline before the semicolon is left to a separate check.
  if Match(r';', cond_line[cond_end:]):
    if keyword == 'if':
      error(filename, cond_linenum, 'whitespace/empty_conditional_body', 5,
            'Empty conditional bodies should use {}')
    else:
      error(filename, cond_linenum, 'whitespace/empty_loop_body', 5,
            'Empty loop bodies should use {} or continue')

  # The remaining check (a completely empty "{}" body with no comments and
  # no else clause) only concerns if statements.
  if keyword != 'if':
    return

  # Find the opening "{" of the if.  Stop quietly when anything other than
  # whitespace precedes it (no braces) or when the file ends first.
  brace_linenum = cond_linenum
  fragment = cond_line[cond_end:]
  while not Search(r'^\s*\{', fragment):
    if Search(r'^(?!\s*$)', fragment):
      # Conditional has no braces.
      return
    brace_linenum += 1
    if brace_linenum == len(clean_lines.elided):
      # Reached EOF without finding "{" or any other code.
      return
    fragment = clean_lines.elided[brace_linenum]
  # fragment may be only the tail of the line; keep the whole line too.
  brace_line = clean_lines.elided[brace_linenum]

  # Find the matching "}".
  brace_pos = fragment.find('{')
  if brace_linenum == cond_linenum:
    # Make the position relative to the start of the entire line.
    brace_pos += cond_end
  (closing_line, closing_linenum, closing_pos) = CloseExpression(
      clean_lines, brace_linenum, brace_pos)
  if closing_pos < 0:
    return

  # A comment on the opening line means the body is not considered empty.
  if (clean_lines.raw_lines[brace_linenum] !=
      CleanseComments(clean_lines.raw_lines[brace_linenum])):
    return

  # Assemble the body text: the rest of the opening line after "{", every
  # raw line (comments included) strictly between the braces, and the part
  # of the closing line before "}".
  if closing_linenum > brace_linenum:
    # Note: list() on a string yields its individual characters, so the
    # remainder of the opening line ends up joined with "\n" between each
    # character.  NOTE(review): presumably harmless if the pattern below only
    # tests for non-whitespace -- confirm before simplifying.
    pieces = list(brace_line[brace_pos+1:])
    pieces.extend(clean_lines.raw_lines[brace_linenum+1:closing_linenum])
    pieces.append(clean_lines.elided[closing_linenum][:closing_pos-1])
    body = '\n'.join(pieces)
  else:
    # Braces open and close on the same line.
    body = brace_line[brace_pos+1:closing_pos-1]

  if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body):
    return

  # The body is empty.  Skip blank lines after "}" and give up without a
  # warning if an else clause follows.
  probe_linenum = closing_linenum
  probe = closing_line[closing_pos:]
  while Search(r'^\s*$|^(?=\s*else)', probe):
    if Search(r'^(?=\s*else)', probe):
      return
    probe_linenum += 1
    if probe_linenum == len(clean_lines.elided):
      break
    probe = clean_lines.elided[probe_linenum]

  # Empty body, and no else clause before EOF or other code.
  error(filename, cond_linenum, 'whitespace/empty_if_body', 4,
        ('If statement had no body and no else clause'))
+
+
def FindCheckMacro(line):
  """Locates the first replaceable CHECK-like macro call on a line.

  Args:
    line: line to search on.
  Returns:
    (macro name, start position), or (None, -1) if no replaceable
    macro is found.  The start position is the index of the macro's
    opening parenthesis.
  """
  for candidate in _CHECK_MACROS:
    # Cheap substring test before the regular expression.
    if candidate not in line:
      continue
    # Require a word boundary before the name and "(" after it, so that some
    # other macro which merely contains the CHECK substring is not picked up.
    call = Match(r'^(.*\b' + candidate + r'\s*)\(', line)
    if call:
      return (candidate, len(call.group(1)))
  return (None, -1)
+
+
def CheckCheck(filename, clean_lines, linenum, error):
  """Suggests dedicated comparison macros for simple CHECK/EXPECT calls.

  If the line holds a check macro wrapping a single relational comparison
  in which at least one operand is a literal constant, for example
  CHECK(x == 42), a 'readability/check' error is reported that names the
  replacement macro taken from _CHECK_REPLACEMENT.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  elided = clean_lines.elided

  # Nothing to do unless a check macro starts on this line.
  check_macro, start_pos = FindCheckMacro(elided[linenum])
  if not check_macro:
    return

  # Locate the parenthesis closing the macro's argument list.
  last_line, end_line, end_pos = CloseExpression(
      clean_lines, linenum, start_pos)
  if end_pos < 0:
    return

  # Anything other than ';' right after the call is assumed to be a custom
  # error message supplied by the author; no replacement is suggested then.
  if not Match(r'\s*;', last_line[end_pos:]):
    return

  # Gather the text between the outer parentheses, joining continuation
  # lines when the call spans several of them.
  if end_line == linenum:
    expression = elided[linenum][start_pos + 1:end_pos - 1]
  else:
    pieces = [elided[linenum][start_pos + 1:]]
    pieces.extend(elided[linenum + 1:end_line])
    pieces.append(last_line[0:end_pos - 1])
    expression = ''.join(pieces)

  # Tokenize the expression so parentheses are honoured.  This avoids false
  # positives for inputs like "CHECK((a < 4) == b)", which is not
  # replaceable by CHECK_LE.
  operator_pattern = (r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
                      r'==|!=|>=|>|<=|<|\()(.*)$')
  lhs = ''
  rhs = ''
  operator = None
  while expression:
    found = Match(operator_pattern, expression)
    if not found:
      # Plain operand text.  Consume a whole run of non-operator characters
      # at once when possible, otherwise fall back to a single character.
      found = (Match(r'^([^-=!<>()&|]+)(.*)$', expression) or
               Match(r'^(\s*\S)(.*)$', expression))
      if not found:
        break
      lhs += found.group(1)
      expression = found.group(2)
      continue

    token = found.group(1)
    remainder = found.group(2)
    if token == '(':
      # Parenthesized operand: copy it to lhs verbatim.
      end, _ = FindEndOfExpressionInLine(remainder, 0, ['('])
      if end < 0:
        return  # Unmatched parenthesis
      lhs += '(' + remainder[0:end]
      expression = remainder[end:]
    elif token in ('&&', '||'):
      # More than one term, e.g. CHECK(42 < a && a < b); such expressions
      # cannot be expressed with a single comparison macro.
      return
    elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
      # Non-relational operator: part of the left operand.
      lhs += token
      expression = remainder
    else:
      # Relational operator: everything after it is the right operand.
      operator = token
      rhs = remainder
      break

  # All three parts of the comparison are required.
  if not (lhs and operator and rhs):
    return

  # lhs cannot contain && or || (the loop bails out on them); rhs was taken
  # wholesale, so it is inspected here.
  if '&&' in rhs or '||' in rhs:
    return

  # At least one operand must be a constant literal, to avoid suggesting
  # replacements for unprintable things like CHECK(variable != iterator).
  # The pattern matches decimal and hex integers, strings, and characters.
  lhs = lhs.strip()
  rhs = rhs.strip()
  match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
  if Match(match_constant, lhs) or Match(match_constant, rhs):
    # The message deliberately uses the generic "a op b" form rather than
    # the real operands, which could make it unreadably long.
    error(filename, linenum, 'readability/check', 2,
          'Consider using %s instead of %s(a %s b)' % (
              _CHECK_REPLACEMENT[check_macro][operator],
              check_macro, operator))
+
+
def CheckAltTokens(filename, clean_lines, linenum, error):
  """Reports alternative operator keywords found in boolean expressions.

  Every match of _ALT_TOKEN_REPLACEMENT_PATTERN on the line produces a
  'readability/alt_tokens' error naming the operator that
  _ALT_TOKEN_REPLACEMENT maps the token to.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Preprocessor lines are not checked.
  if Match(r'^\s*#', line):
    return

  # Best-effort guard against multi-line comments: skip any line that opens
  # or closes one.  Comments that began earlier and end later are not
  # caught, but this removes most false positives.
  #
  # TODO(unknown): remove this once cpplint has better support for
  # multi-line comments.
  if '/*' in line or '*/' in line:
    return

  for found in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
    alt_token = found.group(1)
    error(filename, linenum, 'readability/alt_tokens', 2,
          'Use operator %s instead of %s' % (
              _ALT_TOKEN_REPLACEMENT[alt_token], alt_token))
+
+
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.  For a non-Unicode string
    (a Python 2 byte str, or bytes) this is simply len(line).
  """
  # Resolve the Unicode text type here instead of relying on the Python 2
  # builtin: on Python 3 the name `unicode` does not exist unless the module
  # aliases it, and a bare reference would raise NameError.
  try:
    text_type = unicode  # Python 2 builtin, or a module-level alias.
  except NameError:
    text_type = str  # Python 3: every str is a Unicode string.

  if not isinstance(line, text_type):
    return len(line)

  # Issue 337
  # https://mail.python.org/pipermail/python-list/2012-August/628809.html
  # Interpreters up to 3.2 built with a 2-byte Py_UNICODE expose astral
  # characters as surrogate pairs; the low half must not add to the width.
  # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81
  # The check is loop-invariant, so it is evaluated once.  A missing config
  # value (None) is treated as "not a wide build".
  is_narrow_build = (
      (sys.version_info.major, sys.version_info.minor) <= (3, 2) and
      (sysconfig.get_config_var("Py_UNICODE_SIZE") or 0) < 4)

  width = 0
  for uc in unicodedata.normalize('NFC', line):
    if unicodedata.east_asian_width(uc) in ('W', 'F'):
      # Wide and fullwidth characters occupy two columns.
      width += 2
    elif not unicodedata.combining(uc):
      # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564
      is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF
      if not (is_narrow_build and is_low_surrogate):
        width += 1
  return width
+
+
def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
               error):
  """Checks rules from the 'C++ style rules' section of cppguide.html.

  Most of these rules are hard to test (naming, comment style), but we
  do what we can.  In particular we check for 2-space indents, line lengths,
  tab usage, spaces inside code, etc.  After the checks implemented here,
  the line is handed to the more specific Check* helpers.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    file_extension: The extension (without the dot) of the filename.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: The function to call with any errors found.
  """

  # Don't use "elided" lines here, otherwise we can't check commented lines.
  # Don't want to use "raw" either, because we don't want to check inside
  # C++11 raw strings.
  raw_lines = clean_lines.lines_without_raw_strings
  line = raw_lines[linenum]
  prev = raw_lines[linenum - 1] if linenum > 0 else ''

  if line.find('\t') != -1:
    error(filename, linenum, 'whitespace/tab', 1,
          'Tab found; better to use spaces')

  # One or three blank spaces at the beginning of the line is weird; it's
  # hard to reconcile that with 2-space indents.
  #
  # One space is tolerated for section labels and for lines containing
  # multi-line raw strings.  Lines that look like continuations (the
  # previous line ends in a double quote, comma, equals sign or angle
  # bracket) are skipped because their indentation rules are non-trivial.
  cleansed_line = clean_lines.elided[linenum]
  scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
  initial_spaces = len(line) - len(line.lstrip(' '))
  if (initial_spaces in (1, 3) and
      not Search(r'[",=><] *$', prev) and
      not Match(scope_or_label_pattern, cleansed_line) and
      not (clean_lines.raw_lines[linenum] != line and
           Match(r'^\s*""', line))):
    error(filename, linenum, 'whitespace/indent', 3,
          'Weird number of spaces at line-start. '
          'Are you using a 2-space indent?')

  if line and line[-1].isspace():
    error(filename, linenum, 'whitespace/end_of_line', 4,
          'Line ends in whitespace. Consider deleting these extra spaces.')

  # Check if the line is a header guard.
  is_header_guard = False
  if IsHeaderExtension(file_extension):
    cppvar = GetHeaderGuardCPPVariable(filename)
    is_header_guard = line.startswith(('#ifndef %s' % cppvar,
                                       '#define %s' % cppvar,
                                       '#endif // %s' % cppvar))

  # #include lines and header guards can be long, since there's no clean way
  # to split them.
  #
  # URLs can be long too.  It's possible to split these, but it makes them
  # harder to cut&paste.
  #
  # The "$Id:...$" comment may also get very long without it being the
  # developer's fault.
  if (not line.startswith('#include') and not is_header_guard and
      not Match(r'^\s*//.*http(s?)://\S*$', line) and
      not Match(r'^\s*//\s*[^\s]*$', line) and
      not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
    if GetLineWidth(line) > _line_length:
      error(filename, linenum, 'whitespace/line_length', 2,
            'Lines should be <= %i characters long' % _line_length)

  # More than one ';' usually means several statements share the line.
  # for loops are allowed two ;'s (and may run over two lines).
  if cleansed_line.count(';') > 1 and cleansed_line.find('for') == -1:
    # Look the previous code line up once; the result is used twice.
    prev_code_line = GetPreviousNonBlankLine(clean_lines, linenum)[0]
    continues_for_header = (prev_code_line.find('for') != -1 and
                            prev_code_line.find(';') == -1)
    # It's ok to have many commands in a switch case that fits in 1 line.
    one_line_case = ((cleansed_line.find('case ') != -1 or
                      cleansed_line.find('default:') != -1) and
                     cleansed_line.find('break;') != -1)
    if not continues_for_header and not one_line_case:
      error(filename, linenum, 'whitespace/newline', 0,
            'More than one command on the same line')

  # Some more style checks
  CheckBraces(filename, clean_lines, linenum, error)
  CheckTrailingSemicolon(filename, clean_lines, linenum, error)
  CheckEmptyBlockBody(filename, clean_lines, linenum, error)
  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
  CheckOperatorSpacing(filename, clean_lines, linenum, error)
  CheckParenthesisSpacing(filename, clean_lines, linenum, error)
  CheckCommaSpacing(filename, clean_lines, linenum, error)
  CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error)
  CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
  CheckCheck(filename, clean_lines, linenum, error)
  CheckAltTokens(filename, clean_lines, linenum, error)
  classinfo = nesting_state.InnermostClass()
  if classinfo:
    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
+
+
+_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
+# Matches the first component of a filename delimited by -s and _s. That is:
+# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
+_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
+
+
+def _DropCommonSuffixes(filename):
+ """Drops common suffixes like _test.cc or -inl.h from filename.
+
+ For example:
+ >>> _DropCommonSuffixes('foo/foo-inl.h')
+ 'foo/foo'
+ >>> _DropCommonSuffixes('foo/bar/foo.cc')
+ 'foo/bar/foo'
+ >>> _DropCommonSuffixes('foo/foo_internal.h')
+ 'foo/foo'
+ >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
+ 'foo/foo_unusualinternal'
+
+ Args:
+ filename: The input filename.
+
+ Returns:
+ The filename with the common suffix removed.
+ """
+ for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
+ 'inl.h', 'impl.h', 'internal.h'):
+ if (filename.endswith(suffix) and len(filename) > len(suffix) and
+ filename[-len(suffix) - 1] in ('-', '_')):
+ return filename[:-len(suffix) - 1]
+ return os.path.splitext(filename)[0]
+
+
def _ClassifyInclude(fileinfo, include, is_system):
  """Decides which header category an #include belongs to.

  Args:
    fileinfo: The current file cpplint is running over. A FileInfo instance.
    include: The path to a #included file.
    is_system: True if the #include used <> rather than "".

  Returns:
    One of the _XXX_HEADER constants.

  For example:
    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
    _C_SYS_HEADER
    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
    _CPP_SYS_HEADER
    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
    _LIKELY_MY_HEADER
    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
    ...                  'bar/foo_other_ext.h', False)
    _POSSIBLE_MY_HEADER
    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
    _OTHER_HEADER
  """
  # Angle-bracket includes are system headers; _CPP_HEADERS tells the C++
  # ones apart from the C ones.
  is_cpp_h = include in _CPP_HEADERS
  if is_system:
    return _CPP_SYS_HEADER if is_cpp_h else _C_SYS_HEADER

  # Same basename once common suffixes are dropped, and located either in
  # the target's own directory or in its sibling 'public' directory: the
  # header most likely belongs to the target file.
  target_path = _DropCommonSuffixes(fileinfo.RepositoryName())
  target_dir, target_base = os.path.split(target_path)
  include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
  if target_base == include_base:
    owner_dirs = (target_dir, os.path.normpath(target_dir + '/../public'))
    if include_dir in owner_dirs:
      return _LIKELY_MY_HEADER

  # Sharing only the first basename component means the target may be
  # implementing the include.  It is then allowed to come first, but its
  # absence is never reported.
  target_first = _RE_FIRST_COMPONENT.match(target_base)
  include_first = _RE_FIRST_COMPONENT.match(include_base)
  if (target_first and include_first and
      target_first.group(0) == include_first.group(0)):
    return _POSSIBLE_MY_HEADER

  return _OTHER_HEADER
+
+
+
def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
  """Check rules that are applicable to #include lines.

  Strings on #include lines are NOT removed from elided line, to make
  certain tasks easier. However, to prevent false positives, checks
  applicable to #include lines in CheckLanguage must be put here.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    include_state: An _IncludeState instance in which the headers are inserted.
    error: The function to call with any errors found.
  """
  fileinfo = FileInfo(filename)
  line = clean_lines.lines[linenum]

  # "include" should use the new style "foo/bar.h" instead of just "bar.h".
  # Headers matched by _THIRD_PARTY_HEADERS_PATTERN are exempt, since a
  # third-party API may require its own include conventions.
  bare_header = Match(r'#include\s*"([^/]+\.h)"', line)
  if (bare_header and
      not _THIRD_PARTY_HEADERS_PATTERN.match(bare_header.group(1))):
    error(filename, linenum, 'build/include', 4,
          'Include the directory when naming .h files')

  include_match = _RE_PATTERN_INCLUDE.search(line)
  if not include_match:
    return
  include = include_match.group(2)
  is_system = (include_match.group(1) == '<')

  # We shouldn't include a file more than once.  There are a handful of
  # instances where doing so is okay, but in general it's not.
  duplicate_line = include_state.FindHeader(include)
  if duplicate_line >= 0:
    error(filename, linenum, 'build/include', 4,
          '"%s" already included at %s:%s' %
          (include, filename, duplicate_line))
    return

  if (include.endswith('.cc') and
      os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)):
    error(filename, linenum, 'build/include', 4,
          'Do not include .cc files from other packages')
    return

  if _THIRD_PARTY_HEADERS_PATTERN.match(include):
    return

  # Record the header in the current include section.
  include_state.include_list[-1].append((include, linenum))

  # We want to ensure that headers appear in the right order:
  # 1) for foo.cc, foo.h  (preferred location)
  # 2) c system files
  # 3) cpp system files
  # 4) for foo.cc, foo.h  (deprecated location)
  # 5) other google headers
  #
  # Each include is classified as one of those 5 types.  The include_state
  # object keeps track of the highest type seen, and complains if a lower
  # type shows up after that.
  header_type = _ClassifyInclude(fileinfo, include, is_system)
  error_message = include_state.CheckNextIncludeOrder(header_type)
  if error_message:
    error(filename, linenum, 'build/include_order', 4,
          '%s. Should be: %s.h, c system, c++ system, other.' %
          (error_message, fileinfo.BaseName()))

  canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
  in_order = include_state.IsInAlphabeticalOrder(
      clean_lines, linenum, canonical_include)
  if not in_order:
    error(filename, linenum, 'build/include_alpha', 4,
          'Include "%s" not in alphabetical order' % include)
  include_state.SetLastHeader(canonical_include)
+
+
+
+def _GetTextInside(text, start_pattern):
+ r"""Retrieves all the text between matching open and close parentheses.
+
+ Given a string of lines and a regular expression string, retrieve all the text
+ following the expression and between opening punctuation symbols like
+ (, [, or {, and the matching close-punctuation symbol. This properly nested
+ occurrences of the punctuations, so for the text like
+ printf(a(), b(c()));
+ a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+ start_pattern must match string having an open punctuation symbol at the end.
+
+ Args:
+ text: The lines to extract text. Its comments and strings must be elided.
+ It can be single line and can span multiple lines.
+ start_pattern: The regexp string indicating where to start extracting
+ the text.
+ Returns:
+ The extracted text.
+ None if either the opening string or ending punctuation could not be found.
+ """
+ # TODO(unknown): Audit cpplint.py to see what places could be profitably
+ # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+ # Give opening punctuations to get the matching close-punctuations.
+ matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+ closing_punctuation = set(matching_punctuation.itervalues())
+
+ # Find the position to start extracting text.
+ match = re.search(start_pattern, text, re.M)
+ if not match: # start_pattern not found in text.
+ return None
+ start_position = match.end(0)
+
+ assert start_position > 0, (
+ 'start_pattern must ends with an opening punctuation.')
+ assert text[start_position - 1] in matching_punctuation, (
+ 'start_pattern must ends with an opening punctuation.')
+ # Stack of closing punctuations we expect to have in text after position.
+ punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+ position = start_position
+ while punctuation_stack and position < len(text):
+ if text[position] == punctuation_stack[-1]:
+ punctuation_stack.pop()
+ elif text[position] in closing_punctuation:
+ # A closing punctuation without matching opening punctuations.
+ return None
+ elif text[position] in matching_punctuation:
+ punctuation_stack.append(matching_punctuation[text[position]])
+ position += 1
+ if punctuation_stack:
+ # Opening punctuations left without matching close-punctuations.
+ return None
+ # punctuations match.
+ return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+# < (?: < (?: < [^<>]*
+# >
+# | [^<>] )*
+# >
+# | [^<>] )*
+# >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+ r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+ r'(?:\w|'
+ r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+ r'::)+')
+# A call-by-reference parameter ends with '& identifier'.
+_RE_PATTERN_REF_PARAM = re.compile(
+ r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+ r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier'
+# or looks like 'const type& identifier' when 'type' is atomic.
+_RE_PATTERN_CONST_REF_PARAM = (
+ r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+ r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+# Stream types.
+_RE_PATTERN_REF_STREAM_PARAM = (
+ r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')')
+
+
def CheckLanguage(filename, clean_lines, linenum, file_extension,
                  include_state, nesting_state, error):
  """Checks rules from the 'C++ language rules' section of cppguide.html.

  Some of these rules are hard to test (function overloading, using
  uint32 inappropriately), but we do the best we can.

  #include lines are delegated to CheckIncludeLine and receive no further
  checks here.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    file_extension: The extension (without the dot) of the filename.
    include_state: An _IncludeState instance in which the headers are inserted.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: The function to call with any errors found.
  """
  # If the line is empty or consists of entirely a comment, no need to
  # check it.
  line = clean_lines.elided[linenum]
  if not line:
    return

  if _RE_PATTERN_INCLUDE.search(line):
    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
    return

  # Reset include state across preprocessor directives.  This is meant
  # to silence warnings for conditional includes.
  match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
  if match:
    include_state.ResetSection(match.group(1))

  # Perform other checks now that we are sure that this is not an include
  # line.
  CheckCasts(filename, clean_lines, linenum, error)
  CheckGlobalStatic(filename, clean_lines, linenum, error)
  CheckPrintf(filename, clean_lines, linenum, error)

  if IsHeaderExtension(file_extension):
    # TODO(unknown): check that 1-arg constructors are explicit.
    #                How to tell it's a constructor?
    #                (handled in CheckForNonStandardConstructs for now)
    # TODO(unknown): check that classes declare or disable copy/assign
    #                (level 1 error)
    pass

  # Check if people are using the verboten C basic types.  The only exception
  # we regularly allow is "unsigned short port" for port.
  if Search(r'\bshort port\b', line):
    if not Search(r'\bunsigned short port\b', line):
      error(filename, linenum, 'runtime/int', 4,
            'Use "unsigned short" for ports, not "short"')
  else:
    match = Search(r'\b(short|long(?! +double)|long long)\b', line)
    if match:
      error(filename, linenum, 'runtime/int', 4,
            'Use int16/int64/etc, rather than the C type %s' % match.group(1))

  # Check if some verboten operator overloading is going on
  # TODO(unknown): catch out-of-line unary operator&:
  #   class X {};
  #   int operator&(const X& x) { return 42; }  // unary operator&
  # The trick is it's hard to tell apart from binary operator&:
  #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
  if Search(r'\boperator\s*&\s*\(\s*\)', line):
    error(filename, linenum, 'runtime/operator', 4,
          'Unary operator& is dangerous. Do not use it.')

  # Check for suspicious usage of "if" like
  # } if (a == b) {
  if Search(r'\}\s*if\s*\(', line):
    error(filename, linenum, 'readability/braces', 4,
          'Did you mean "else if"? If not, start a new line for "if".')

  # Check for potential format string bugs like printf(foo).
  # We constrain the pattern not to pick things like DocidForPrintf(foo).
  # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
  # TODO(unknown): Catch the following case. Need to change the calling
  # convention of the whole function to process multiple line to handle it.
  #   printf(
  #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
  printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
  if printf_args:
    match = Match(r'([\w.\->()]+)$', printf_args)
    if match and match.group(1) != '__VA_ARGS__':
      function_name = re.search(r'\b((?:string)?printf)\s*\(',
                                line, re.I).group(1)
      error(filename, linenum, 'runtime/printf', 4,
            'Potential format string bug. Do %s("%%s", %s) instead.'
            % (function_name, match.group(1)))

  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
  # The warning is suppressed only when the second argument is, in its
  # entirety, an (elided) character literal, a decimal integer or a hex
  # integer.  The alternatives are grouped so that both anchors apply to
  # every one of them; without the group, any argument merely starting
  # with a digit (e.g. "4 * n") was treated as a literal.
  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
  if match and not Match(r"^(''|-?[0-9]+|0x[0-9A-Fa-f]+)\s*$",
                         match.group(2)):
    error(filename, linenum, 'runtime/memset', 4,
          'Did you mean "memset(%s, 0, %s)"?'
          % (match.group(1), match.group(2)))

  if Search(r'\busing namespace\b', line):
    error(filename, linenum, 'build/namespaces', 5,
          'Do not use namespace using-directives. '
          'Use using-declarations instead.')

  # Detect variable-length arrays.
  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
      match.group(3).find(']') == -1):
    # Split the size using space and arithmetic operators as delimiters.
    # If any of the resulting tokens are not compile time constants then
    # report the error.  ('>>' is a plain alternative here; a stray ']'
    # after it previously kept it from ever acting as a delimiter.)
    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>', match.group(3))
    is_const = True
    skip_next = False
    for tok in tokens:
      if skip_next:
        skip_next = False
        continue

      if Search(r'sizeof\(.+\)', tok): continue
      if Search(r'arraysize\(\w+\)', tok): continue

      tok = tok.lstrip('(')
      tok = tok.rstrip(')')
      if not tok: continue
      if Match(r'\d+', tok): continue
      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
      if Match(r'k[A-Z0-9]\w*', tok): continue
      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
      # A catch all for tricky sizeof cases, including 'sizeof expression',
      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
      # requires skipping the next token because we split on ' ' and '*'.
      if tok.startswith('sizeof'):
        skip_next = True
        continue
      is_const = False
      break
    if not is_const:
      error(filename, linenum, 'runtime/arrays', 1,
            'Do not use variable-length arrays. Use an appropriately named '
            "('k' followed by CamelCase) compile-time constant for the size.")

  # Check for use of unnamed namespaces in header files.  Registration
  # macros are typically OK, so we allow use of "namespace {" on lines
  # that end with backslashes.
  if (IsHeaderExtension(file_extension)
      and Search(r'\bnamespace\s*{', line)
      and line[-1] != '\\'):
    error(filename, linenum, 'build/namespaces', 4,
          'Do not use unnamed namespaces in header files. See '
          'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
          ' for more information.')
+
+
def CheckGlobalStatic(filename, clean_lines, linenum, error):
  """Check for unsafe global or static objects.

  Reports static/global string objects ('runtime/string') and members that
  appear to be initialized with themselves ('runtime/init').

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # A declaration may continue on the next line; when this line has no
  # ';', '(' or '{', append the following line before matching.
  has_next_line = linenum + 1 < clean_lines.NumLines()
  if has_next_line and not Search(r'[;({]', line):
    line += clean_lines.elided[linenum + 1].strip()

  # Check for people declaring static/global STL strings at the top level.
  # This is dangerous because the C++ language does not guarantee that
  # globals with constructors are initialized before the first access, and
  # also because globals can be destroyed when some threads are still
  # running.
  # TODO(unknown): Generalize this to also find static unique_ptr instances.
  # TODO(unknown): File bugs for clang-tidy to find these.
  declaration = Match(
      r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +'
      r'([a-zA-Z0-9_:]+)\b(.*)',
      line)

  # Remove false positives:
  # - String pointers (as opposed to values).
  #    string *pointer
  #    const string *pointer
  #    string const *pointer
  #    string *const pointer
  #
  # - Functions and template specializations.
  #    string Function<Type>(...
  #    string Class<Type>::Method(...
  #
  # - Operators.  These are matched separately because operator names
  #   cross non-word boundaries, and trying to match both operators
  #   and functions at the same time would decrease accuracy of
  #   matching identifiers.
  #    string Class::operator*()
  is_string_object = (
      declaration and
      not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and
      not Search(r'\boperator\W', line) and
      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)',
                declaration.group(4)))
  if is_string_object:
    if Search(r'\bconst\b', line):
      error(filename, linenum, 'runtime/string', 4,
            'For a static/global string constant, use a C style string '
            'instead: "%schar%s %s[]".' %
            (declaration.group(1), declaration.group(2) or '',
             declaration.group(3)))
    else:
      error(filename, linenum, 'runtime/string', 4,
            'Static/global string variables are not permitted.')

  # foo_(foo_) or foo_(CHECK_NOTNULL(foo_)): a name ending in '_' passed
  # as its own initializer.
  self_init = (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or
               Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line))
  if self_init:
    error(filename, linenum, 'runtime/init', 4,
          'You seem to be initializing a member variable with itself.')
+
+
def CheckPrintf(filename, clean_lines, linenum, error):
  """Check for printf related issues.

  Reports literal buffer sizes passed to snprintf, any use of sprintf, and
  uses of strcpy/strcat, all under the 'runtime/printf' category.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # When snprintf is used, the second argument shouldn't be a literal.
  # A literal 0 is fine: snprintf is then being used to calculate a size.
  sized_call = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
  if sized_call:
    buffer_arg, size_arg = sized_call.group(1), sized_call.group(2)
    if size_arg != '0':
      error(filename, linenum, 'runtime/printf', 3,
            'If you can, use sizeof(%s) instead of %s as the 2nd arg '
            'to snprintf.' % (buffer_arg, size_arg))

  # Check if some verboten C functions are being used.
  if Search(r'\bsprintf\s*\(', line):
    error(filename, linenum, 'runtime/printf', 5,
          'Never use sprintf. Use snprintf instead.')

  unsafe_copy = Search(r'\b(strcpy|strcat)\s*\(', line)
  if unsafe_copy:
    error(filename, linenum, 'runtime/printf', 4,
          'Almost always, snprintf is better than %s' % unsafe_copy.group(1))
+
+
def IsDerivedFunction(clean_lines, linenum):
  """Check if current line contains an inherited function.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
  Returns:
    A true value if current line contains a function with "override"
    virt-specifier, a false value otherwise.
  """
  # Look at the current line and up to nine lines before it for the line on
  # which the function's parameter list opens.
  stop_before = max(-1, linenum - 10)
  for i in xrange(linenum, stop_before, -1):
    opening = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
    if not opening:
      continue
    # Look for "override" after the matching closing parenthesis.
    text, _, close_pos = CloseExpression(
        clean_lines, i, len(opening.group(1)))
    if close_pos < 0:
      return False
    return Search(r'\boverride\b', text[close_pos:])
  return False
+
+
def IsOutOfLineMethodDefinition(clean_lines, linenum):
  """Check if current line contains an out-of-line method definition.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
  Returns:
    True if current line contains an out-of-line method definition.
  """
  # Look at the current line and up to nine lines before it for the line on
  # which the function's parameter list opens; only that line decides.
  stop_before = max(-1, linenum - 10)
  for i in xrange(linenum, stop_before, -1):
    candidate = clean_lines.elided[i]
    if Match(r'^([^()]*\w+)\(', candidate):
      # A 'Scope::name(' form marks a definition outside the class body.
      return bool(Match(r'^[^()]*\w+::\w+\(', candidate))
  return False
+
+
def IsInitializerList(clean_lines, linenum):
  """Check if current line is inside constructor initializer list.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
  Returns:
    True if current line appears to be inside constructor initializer
    list, False otherwise.
  """
  for i in xrange(linenum, 1, -1):
    line = clean_lines.elided[i]
    if i == linenum:
      # On the starting line, ignore a trailing '{' that opens the body.
      body_open = Match(r'^(.*)\{\s*$', line)
      if body_open:
        line = body_open.group(1)

    # A lone colon tends to indicate the start of a constructor initializer
    # list.  It could also be a ternary operator, which also tends to
    # appear in constructor initializer lists as opposed to parameter
    # lists.  A closing brace followed by a comma is probably the end of a
    # brace-initialized member in a constructor initializer list.
    if Search(r'\s:\s*\w+[({]', line) or Search(r'\}\s*,\s*$', line):
      return True

    # Found one of the following:
    # - A closing brace or semicolon, probably the end of the previous
    #   function.
    # - An opening brace, probably the start of current class or namespace.
    #
    # Current line is probably not inside an initializer list since
    # we saw one of those things without seeing the starting colon.
    if Search(r'[{};]\s*$', line):
      return False

  # The scan ran out of lines without seeing the start of a constructor
  # initializer list.
  return False
+
+
def CheckForNonConstReference(filename, clean_lines, linenum,
                              nesting_state, error):
  """Check for non-const references.

  Separate from CheckLanguage since it scans backwards from current
  line, instead of scanning forward.

  Reports 'runtime/references' for every reference parameter on the line
  that is neither a const reference nor a stream reference, unless the line
  is recognised as one of the exempt contexts handled below.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: The function to call with any errors found.
  """
  # Do nothing if there is no '&' on current line.
  line = clean_lines.elided[linenum]
  if '&' not in line:
    return

  # If a function is inherited, current function doesn't have much of
  # a choice, so any non-const references should not be blamed on
  # derived function.
  if IsDerivedFunction(clean_lines, linenum):
    return

  # Don't warn on out-of-line method definitions, as we would warn on the
  # in-line declaration, if it isn't marked with 'override'.
  if IsOutOfLineMethodDefinition(clean_lines, linenum):
    return

  # Long type names may be broken across multiple lines, usually in one
  # of these forms:
  #   LongType
  #       ::LongTypeContinued &identifier
  #   LongType::
  #       LongTypeContinued &identifier
  #   LongType<
  #       ...>::LongTypeContinued &identifier
  #
  # If we detected a type split across two lines, join the previous
  # line to current line so that we can match const references
  # accordingly.
  #
  # Note that this only scans back one line, since scanning back
  # arbitrary number of lines would be expensive.  If you have a type
  # that spans more than 2 lines, please use a typedef.
  if linenum > 1:
    previous = None
    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
      # previous_line\n + ::current_line
      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
                        clean_lines.elided[linenum - 1])
    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
      # previous_line::\n + current_line
      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
                        clean_lines.elided[linenum - 1])
    if previous:
      line = previous.group(1) + line.lstrip()
    else:
      # Check for templated parameter that is split across multiple lines
      endpos = line.rfind('>')
      if endpos > -1:
        (_, startline, startpos) = ReverseCloseExpression(
            clean_lines, linenum, endpos)
        if startpos > -1 and startline < linenum:
          # Found the matching < on an earlier line, collect all
          # pieces up to current line.
          line = ''
          for i in xrange(startline, linenum + 1):
            line += clean_lines.elided[i].strip()

  # Check for non-const references in function parameters.  A single '&' may
  # be found in the following places:
  #   inside expression: binary & for bitwise AND
  #   inside expression: unary & for taking the address of something
  #   inside declarators: reference parameter
  # We will exclude the first two cases by checking that we are not inside a
  # function body, including one that was just introduced by a trailing '{'.
  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
  if (nesting_state.previous_stack_top and
      not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
           isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
    # Not at toplevel, not within a class, and not within a namespace
    return

  # Avoid initializer lists.  We only need to scan back from the
  # current line for something that starts with ':'.
  #
  # We don't need to check the current line, since the '&' would
  # appear inside the second set of parentheses on the current line as
  # opposed to the first set.
  if linenum > 0:
    for i in xrange(linenum - 1, max(0, linenum - 10), -1):
      previous_line = clean_lines.elided[i]
      if not Search(r'[),]\s*$', previous_line):
        break
      if Match(r'^\s*:\s+\S', previous_line):
        return

  # Avoid preprocessors
  if Search(r'\\\s*$', line):
    return

  # Avoid constructor initializer lists
  if IsInitializerList(clean_lines, linenum):
    return

  # We allow non-const references in a few standard places, like functions
  # called "swap()" or iostream operators like "<<" or ">>".  Do not check
  # those function parameters.
  #
  # We also accept & in static_assert, which looks like a function but
  # it's actually a declaration expression.
  #
  # The optional template-argument group uses the character class [\w:]+ so
  # that 'swap<T>(' and 'swap<ns::T>(' are exempted.  The previous spelling
  # '<\w:+>' only matched a single word character followed by colons, so
  # explicit-template swaps were never recognised.
  allowed_functions = (r'(?:[sS]wap(?:<[\w:]+>)?|'
                       r'operator\s*[<>][<>]|'
                       r'static_assert|COMPILE_ASSERT'
                       r')\s*\(')
  if Search(allowed_functions, line):
    return
  elif not Search(r'\S+\([^)]*$', line):
    # Don't see an allowed function on this line.  Actually we
    # didn't see any function name on this line, so this is likely a
    # multi-line parameter list.  Try a bit harder to catch this case.
    for i in xrange(2):
      if (linenum > i and
          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
        return

  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
      error(filename, linenum, 'runtime/references', 2,
            'Is this a non-const reference? '
            'If so, make const or use a pointer: ' +
            ReplaceAll(' *<', '<', parameter))
+
+
def CheckCasts(filename, clean_lines, linenum, error):
  """Run the cast-related checks on one line.

  Covers function-style casts of basic types, C-style casts (delegated to
  CheckCStyleCast) and taking the address of a cast expression.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  line = clean_lines.elided[linenum]

  # Look for a conversion-function cast.  Only the most common basic types
  # are captured, though there are more.  Parameterless conversion
  # functions, such as bool(), are allowed as they are probably a member
  # operator declaration or default constructor.
  functional_cast = Search(
      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
      r'(\([^)].*)', line)
  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
  if functional_cast and not expecting_function:
    # new_or_template silences two false positives:
    # - New operators
    # - Template arguments with function types
    #
    # For template arguments, we match on types immediately following
    # an opening bracket without any spaces.  This is a fast way to
    # silence the common case where the function type is the first
    # template argument.  False negative with less-than comparison is
    # avoided because those operators are usually followed by a space.
    #
    #   function<double(double)>   // bracket + no space = false positive
    #   value < double(42)         // bracket + space = true positive
    new_or_template, cast_type, tail = functional_cast.groups()

    # Avoid arrays by looking for brackets that come after the closing
    # parenthesis.
    if Match(r'\([^()]+\)\s*\[', tail):
      return

    if new_or_template is None:
      # Other things to ignore:
      # - Function pointers
      # - Casts to pointer types
      # - Placement new
      # - Alias declarations
      if tail and (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', tail) or
                   tail.startswith('(*)')):
        pass
      elif Match(r'\s*using\s+\S+\s*=\s*' + cast_type, line):
        pass
      elif Search(r'new\(\S+\)\s*' + cast_type, line):
        pass
      else:
        error(filename, linenum, 'readability/casting', 4,
              'Using deprecated casting style. '
              'Use static_cast<%s>(...) instead' %
              cast_type)

  if not expecting_function:
    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)

  # This doesn't catch all cases.  Consider (const char * const)"hello".
  #
  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
  # compile).  Pointer casts of anything other than string constants are
  # only examined when the const_cast check did not fire.
  if not CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
                         r'\((char\s?\*+\s?)\)\s*"', error):
    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
                    r'\((\w+\s?\*+\s?)\)', error)

  # In addition, we look for people taking the address of a cast.  This
  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
  # point where you think.
  #
  # Some non-identifier character is required before the '&' for the
  # expression to be recognized as a cast.  These are casts:
  #   expression = &static_cast<int*>(temporary());
  #   function(&(int*)(temporary()));
  #
  # This is not a cast:
  #   reference_type&(int* function_param);
  address_of_cast = Search(
      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
  if not address_of_cast:
    return

  # Try a better error message when the & is bound to something
  # dereferenced by the casted pointer, as opposed to the casted
  # pointer itself.
  binds_to_dereference = False
  named_cast = Match(
      r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
  if named_cast:
    # Skip over the <...> template argument of the cast ...
    _, paren_line, paren_col = CloseExpression(
        clean_lines, linenum, len(named_cast.group(1)))
    if paren_col >= 0 and clean_lines.elided[paren_line][paren_col] == '(':
      # ... then over its parenthesised operand.
      _, after_line, after_col = CloseExpression(
          clean_lines, paren_line, paren_col)
      if after_col >= 0:
        following = clean_lines.elided[after_line][after_col:]
        if after_line < clean_lines.NumLines() - 1:
          following += clean_lines.elided[after_line + 1]
        if Match(r'\s*(?:->|\[)', following):
          binds_to_dereference = True

  if binds_to_dereference:
    error(filename, linenum, 'readability/casting', 4,
          ('Are you taking an address of something dereferenced '
           'from a cast? Wrapping the dereferenced expression in '
           'parentheses will make the binding more obvious'))
  else:
    error(filename, linenum, 'runtime/casting', 4,
          ('Are you taking an address of a cast? '
           'This is dangerous: could be a temp var. '
           'Take the address before doing the cast, rather than after'))
+
+
def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
  """Report a C-style cast matching `pattern` on the given line.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    cast_type: The string for the C++ cast to recommend.  This is either
      reinterpret_cast, static_cast, or const_cast, depending.
    pattern: The regular expression used to find C-style casts.  Its first
      group is the type named in the cast.
    error: The function to call with any errors found.

  Returns:
    True if an error was emitted.
    False otherwise.
  """
  line = clean_lines.elided[linenum]
  cast = Search(pattern, line)
  if not cast:
    return False

  # Text before the cast's opening parenthesis.  Keywords and all-caps
  # (macro-like) names directly in front of it tend to look like casts
  # without being one.
  leading = line[0:cast.start(1) - 1]
  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', leading):
    return False

  # Prepend up to four earlier lines to see whether we are one level of
  # parentheses inside a macro invocation.
  if linenum > 0:
    for earlier in xrange(linenum - 1, max(0, linenum - 5), -1):
      leading = clean_lines.elided[earlier] + leading
  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', leading):
    return False

  # operator++(int) and operator--(int)
  for postfix_operator in (' operator++', ' operator--'):
    if leading.endswith(postfix_operator):
      return False

  # A single unnamed argument for a function tends to look like old style
  # cast.  If what follows looks like the rest of a declaration, don't
  # issue warnings for deprecated casts.
  trailing = line[cast.end(0):]
  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
           trailing):
    return False

  # At this point, all that should be left is actual casts.
  error(filename, linenum, 'readability/casting', 4,
        'Using C-style cast. Use %s<%s>(...) instead' %
        (cast_type, cast.group(1)))

  return True
+
+
def ExpectingFunctionArgs(clean_lines, linenum):
  """Check whether function-type arguments are expected at this line.

  Args:
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.

  Returns:
    A true value if the line at 'linenum' is inside something that expects
    arguments of function types (a MOCK_METHOD-style macro started on this
    line or on one of the two lines above, or a std::function / std::mfunction
    template opened at the end of the previous line); a false value
    otherwise.
  """
  on_this_line = Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(',
                       clean_lines.elided[linenum])
  if on_this_line:
    return on_this_line

  # The remaining forms look at the two preceding lines, so they need at
  # least two lines above the current one.
  if not linenum >= 2:
    return False

  one_up = clean_lines.elided[linenum - 1]
  return (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
                one_up) or
          Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
                clean_lines.elided[linenum - 2]) or
          Search(r'\bstd::m?function\s*\<\s*$', one_up))
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+ ('<deque>', ('deque',)),
+ ('<functional>', ('unary_function', 'binary_function',
+ 'plus', 'minus', 'multiplies', 'divides', 'modulus',
+ 'negate',
+ 'equal_to', 'not_equal_to', 'greater', 'less',
+ 'greater_equal', 'less_equal',
+ 'logical_and', 'logical_or', 'logical_not',
+ 'unary_negate', 'not1', 'binary_negate', 'not2',
+ 'bind1st', 'bind2nd',
+ 'pointer_to_unary_function',
+ 'pointer_to_binary_function',
+ 'ptr_fun',
+ 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+ 'mem_fun_ref_t',
+ 'const_mem_fun_t', 'const_mem_fun1_t',
+ 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+ 'mem_fun_ref',
+ )),
+ ('<limits>', ('numeric_limits',)),
+ ('<list>', ('list',)),
+ ('<map>', ('map', 'multimap',)),
+ ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+ 'unique_ptr', 'weak_ptr')),
+ ('<queue>', ('queue', 'priority_queue',)),
+ ('<set>', ('set', 'multiset',)),
+ ('<stack>', ('stack',)),
+ ('<string>', ('char_traits', 'basic_string',)),
+ ('<tuple>', ('tuple',)),
+ ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+ ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
+ ('<utility>', ('pair',)),
+ ('<vector>', ('vector',)),
+
+ # gcc extensions.
+ # Note: std::hash is their hash, ::hash is our hash
+ ('<hash_map>', ('hash_map', 'hash_multimap',)),
+ ('<hash_set>', ('hash_set', 'hash_multiset',)),
+ ('<slist>', ('slist',)),
+ )
+
+_HEADERS_MAYBE_TEMPLATES = (
+ ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+ 'transform',
+ )),
+ ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+ )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+ for _template in _templates:
+ # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+ # type::max().
+ _re_pattern_headers_maybe_templates.append(
+ (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+ _template,
+ _header))
+
+# Other scripts may reach in and modify this pattern.
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+ for _template in _templates:
+ _re_pattern_templates.append(
+ (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+ _template + '<>',
+ _header))
+
+
def FilesBelongToSameModule(filename_cc, filename_h):
  """Check if these two filenames belong to the same module.

  The concept of a 'module' here is as follows:
  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
  same 'module' if they are in the same directory.
  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
  to belong to the same module here.

  If the filename_cc contains a longer path than the filename_h, for example,
  '/absolute/path/to/base/sysinfo.cc', and this file would include
  'base/sysinfo.h', this function also produces the prefix needed to open the
  header. This is used by the caller of this function to more robustly open the
  header file. We don't have access to the real include paths in this context,
  so we need this guesswork here.

  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
  according to this implementation. Because of this, this function gives
  some false positives. This should be sufficiently rare in practice.

  Args:
    filename_cc: is the path for the .cc file
    filename_h: is the path for the header path

  Returns:
    Tuple with a bool and a string:
    bool: True if filename_cc and filename_h belong to the same module.
    string: the additional prefix needed to open the header file.
  """

  def _DropVisibilityDirs(path):
    # public/ and internal/ directories count as the same module.
    return path.replace('/public/', '/').replace('/internal/', '/')

  source_info = FileInfo(filename_cc)
  if not source_info.IsSource():
    return (False, '')

  # Reduce the source path to its module stem: no extension, no test suffix.
  source_stem = filename_cc[:-len(source_info.Extension())]
  test_suffix = Search(_TEST_FILE_SUFFIX, source_info.BaseName())
  if test_suffix:
    source_stem = source_stem[:-len(test_suffix.group(1))]
  source_stem = _DropVisibilityDirs(source_stem)

  if not filename_h.endswith('.h'):
    return (False, '')

  # Same reduction for the header: no '.h', no '-inl'.
  header_stem = filename_h[:-len('.h')]
  if header_stem.endswith('-inl'):
    header_stem = header_stem[:-len('-inl')]
  header_stem = _DropVisibilityDirs(header_stem)

  if not source_stem.endswith(header_stem):
    return False, ''
  # Whatever precedes the shared stem is the prefix needed to open the header.
  return True, source_stem[:-len(header_stem)]
+
+
def UpdateIncludeState(filename, include_dict, io=codecs):
  """Fill up the include_dict with new includes found from the file.

  Each #include found is recorded with the (1-based) line number of its
  first occurrence; entries already present in include_dict are left alone.

  Args:
    filename: the name of the header to read.
    include_dict: a dictionary in which the headers are inserted.
    io: The io factory to use to read the file. Provided for testability.

  Returns:
    True if the header could be opened and was scanned.  False if opening
    it raised IOError.
  """
  try:
    headerfile = io.open(filename, 'r', 'utf8', 'replace')
  except IOError:
    return False

  try:
    for linenum, line in enumerate(headerfile, 1):
      match = _RE_PATTERN_INCLUDE.search(CleanseComments(line))
      if match:
        include_dict.setdefault(match.group(2), linenum)
  finally:
    # Release the file handle instead of leaving it to the garbage
    # collector.  An injected io factory may hand back a plain iterable of
    # lines with no close(), so only call it when it exists.
    close = getattr(headerfile, 'close', None)
    if close is not None:
      close()
  return True
+
+
def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
                              io=codecs):
  """Reports for missing stl includes.

  This function will output warnings to make sure you are including the headers
  necessary for the stl containers and functions that you use. We only give one
  reason to include a header. For example, if you use both equal_to<> and
  less<> in a .h file, only one (the latter in the file) of these will be
  reported as a reason to include the <functional>.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    include_state: An _IncludeState instance.
    error: The function to call with any errors found.
    io: The IO factory to use to read the header file. Provided for unittest
        injection.
  """
  required = {}  # A map of header name to linenumber and the template entity.
                 # Example of required: { '<functional>': (1219, 'less<>') }

  for linenum in xrange(clean_lines.NumLines()):
    line = clean_lines.elided[linenum]
    if not line or line[0] == '#':
      continue

    # String is special -- it is a non-templatized type in STL.
    matched = _RE_PATTERN_STRING.search(line)
    if matched:
      # Don't warn about strings in non-STL namespaces:
      # (We check only the first match per line; good enough.)
      prefix = line[:matched.start()]
      if prefix.endswith('std::') or not prefix.endswith('::'):
        required['<string>'] = (linenum, 'string')

    for pattern, template, header in _re_pattern_headers_maybe_templates:
      if pattern.search(line):
        required[header] = (linenum, template)

    # The following check is just a speed up, no semantics are changed:
    # every pattern below needs a '<', so lines without one can be skipped.
    if '<' not in line:
      continue

    for pattern, template, header in _re_pattern_templates:
      matched = pattern.search(line)
      if matched:
        # Don't warn about IWYU in non-STL namespaces:
        # (We check only the first match per line; good enough.)
        prefix = line[:matched.start()]
        if prefix.endswith('std::') or not prefix.endswith('::'):
          required[header] = (linenum, template)

  # The policy is that if you #include something in foo.h you don't need to
  # include it again in foo.cc. Here, we will look at possible includes.
  # Let's flatten the include_state include_list and copy it into a dictionary.
  include_dict = dict([item for sublist in include_state.include_list
                       for item in sublist])

  # Did we find the header for this file (if any) and successfully load it?
  header_found = False

  # Use the absolute path so that matching works properly.
  abs_filename = FileInfo(filename).FullName()

  # For Emacs's flymake.
  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
  # by flymake and that file name might end with '_flymake.cc'. In that case,
  # restore original file name here so that the corresponding header file can be
  # found.
  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
  # instead of 'foo_flymake.h'
  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)

  # include_dict is modified during iteration (UpdateIncludeState inserts
  # into it), so we iterate over a real copy of the keys.  list() is needed
  # because dict.keys() is a live view on Python 3, where growing the dict
  # mid-iteration raises RuntimeError.
  header_keys = list(include_dict.keys())
  for header in header_keys:
    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
    fullpath = common_path + header
    if same_module and UpdateIncludeState(fullpath, include_dict, io):
      header_found = True

  # If we can't find the header file for a .cc, assume it's because we don't
  # know where to look. In that case we'll give up as we're not sure they
  # didn't include it in the .h file.
  # TODO(unknown): Do a better job of finding .h files so we are confident that
  # not having the .h file means there isn't one.
  if filename.endswith('.cc') and not header_found:
    return

  # All the lines have been processed, report the errors found.
  for required_header_unstripped in required:
    template = required[required_header_unstripped][1]
    if required_header_unstripped.strip('<>"') not in include_dict:
      error(filename, required[required_header_unstripped][0],
            'build/include_what_you_use', 4,
            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
# make_pair followed (after optional whitespace) by an explicit template
# argument list.
_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')


def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
  """Report make_pair calls that spell out their template arguments.

  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
  specified explicitly, and such use isn't intended in any case.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  text = clean_lines.elided[linenum]
  if _RE_PATTERN_EXPLICIT_MAKEPAIR.search(text) is None:
    return
  confidence = 4  # 4 = high confidence
  error(filename, linenum, 'build/explicit_make_pair', confidence,
        'For C++11-compatibility, omit template arguments from make_pair'
        ' OR use pair directly OR if appropriate, construct a pair directly')
+
+
def CheckRedundantVirtual(filename, clean_lines, linenum, error):
  """Check if line contains a redundant "virtual" function-specifier.

  Reports 'readability/inheritance' when a declaration that starts with
  "virtual" on this line is also marked "override" or "final" after its
  parameter list.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  # Look for "virtual" on current line.
  line = clean_lines.elided[linenum]
  virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
  if not virtual: return

  # Ignore "virtual" keywords that are near access-specifiers.  These
  # are only used in class base-specifier and do not apply to member
  # functions.
  if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
      Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
    return

  # Ignore the "virtual" keyword from virtual base classes.  Usually
  # there is a colon on the same line in these cases (virtual base
  # classes are rare in google3 because multiple inheritance is rare).
  if Match(r'^.*[^:]:[^:].*$', line): return

  # Look for the next opening parenthesis.  This is the start of the
  # parameter list (possibly on the next line shortly after virtual).
  # TODO(unknown): doesn't work if there are virtual functions with
  # decltype() or other things that use parentheses, but csearch suggests
  # that this is rare.
  end_col = -1
  end_line = -1
  # Start searching at the column just past the "virtual" keyword.  (The
  # length of the keyword itself is always 7 and says nothing about where
  # the keyword sits on the line.)
  start_col = virtual.end(2)
  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
    line = clean_lines.elided[start_line][start_col:]
    parameter_list = Match(r'^([^(]*)\(', line)
    if parameter_list:
      # Match parentheses to find the end of the parameter list
      (_, end_line, end_col) = CloseExpression(
          clean_lines, start_line, start_col + len(parameter_list.group(1)))
      break
    # Following lines are searched from their first column.
    start_col = 0

  if end_col < 0:
    return  # Couldn't find end of parameter list, give up

  # Look for "override" or "final" after the parameter list
  # (possibly on the next few lines).
  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
    line = clean_lines.elided[i][end_col:]
    match = Search(r'\b(override|final)\b', line)
    if match:
      error(filename, linenum, 'readability/inheritance', 4,
            ('"virtual" is redundant since function is '
             'already declared as "%s"' % match.group(1)))

    # Set end_col to check whole lines after we are done with the
    # first line.
    end_col = 0
    # A line ending in a non-word character is taken as the end of the
    # declaration; stop looking further down.
    if Search(r'[^\w]\s*$', line):
      break
+
+
def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
  """Report a declaration that carries both "override" and "final".

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  # A closing parenthesis is needed to confirm where the declarator ends
  # and where the virt-specifiers start; without one nearby we would risk
  # false positives.
  line = clean_lines.elided[linenum]
  last_paren = line.rfind(')')
  if last_paren >= 0:
    # Declarator ends on this line: inspect only what follows it.
    specifiers = line[last_paren:]
  elif linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
    # Declarator ended on the previous line: inspect this whole line.
    specifiers = line
  else:
    return

  # At most one of "override" or "final" should be present, not both.
  if not Search(r'\boverride\b', specifiers):
    return
  if not Search(r'\bfinal\b', specifiers):
    return
  error(filename, linenum, 'readability/inheritance', 4,
        ('"override" is redundant since function is '
         'already declared as "final"'))
+
+
+
+
# Answers whether the block that was just opened sits directly inside
# a namespace.
def IsBlockInNameSpace(nesting_state, is_forward_declaration):
  """Checks that the new block is directly in a namespace.

  Args:
    nesting_state: The _NestingState object that contains info about our state.
    is_forward_declaration: If the class is a forward declared class.
  Returns:
    Whether or not the new block is directly in a namespace.
  """
  stack = nesting_state.stack

  if is_forward_declaration:
    # A forward declaration pushes nothing, so the namespace (if any) is
    # the innermost entry.
    return len(stack) >= 1 and isinstance(stack[-1], _NamespaceInfo)

  # Otherwise the new block is the innermost entry and its parent must be
  # the namespace; the block also has to opt in to the indentation check.
  return (len(stack) > 1 and
          stack[-1].check_namespace_indentation and
          isinstance(stack[-2], _NamespaceInfo))
+
+
def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
                                    raw_lines_no_comments, linenum):
  """Decide whether the namespace indentation check applies to this line.

  Args:
    nesting_state: The current nesting state.
    is_namespace_indent_item: If we just put a new class on the stack, True.
      If the top of the stack is not a class, or we did not recently
      add the class, False.
    raw_lines_no_comments: The lines without the comments.
    linenum: The current line number we are processing.

  Returns:
    True if we should apply our namespace indentation check. Currently, it
    only works for classes and namespaces inside of a namespace.
  """
  forward_declared = IsForwardClassDeclaration(raw_lines_no_comments, linenum)

  # Only freshly opened items and forward declarations are candidates.
  if not is_namespace_indent_item and not forward_declared:
    return False

  # Lines belonging to a macro definition are exempt from the check.
  if IsMacroDefinition(raw_lines_no_comments, linenum):
    return False

  return IsBlockInNameSpace(nesting_state, forward_declared)
+
+
# Call this method if the line is directly inside of a namespace.
# If the line above is blank (excluding comments) or the start of
# an inner namespace, it cannot be indented.
def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
                                    error):
  """Report a line that begins with whitespace.

  Args:
    filename: The name of the current file.
    raw_lines_no_comments: The lines without the comments.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  if not Match(r'^\s+', raw_lines_no_comments[linenum]):
    return
  error(filename, linenum, 'runtime/indentation_namespace', 4,
        'Do not indent within a namespace')
+
+
def ProcessLine(filename, file_extension, clean_lines, line,
                include_state, function_state, nesting_state, error,
                extra_check_functions=[]):
  """Runs every per-line check on one line of the file.

  Args:
    filename: Filename of the file that is being processed.
    file_extension: The extension (dot not included) of the file.
    clean_lines: An array of strings, each representing a line of the file,
                 with comments stripped.
    line: Number of line being processed.
    include_state: An _IncludeState instance in which the headers are inserted.
    function_state: A _FunctionState instance which counts function lines, etc.
    nesting_state: A NestingState instance which maintains information about
                   the current stack of nested blocks being parsed.
    error: A callable to which errors are reported.  The checks in this file
           call it with filename, line number, category, confidence level,
           and message.
    extra_check_functions: An array of additional check functions that will be
                           run on each source line. Each function takes 4
                           arguments: filename, clean_lines, line, error
  """
  # Bookkeeping that must run for every line, including lines in asm blocks.
  ParseNolintSuppressions(filename, clean_lines.raw_lines[line], line, error)
  nesting_state.Update(filename, clean_lines, line, error)
  CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
                               error)

  # Nothing below is applied to the contents of an asm block.
  if nesting_state.InAsmBlock():
    return

  CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
  CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
  CheckLanguage(filename, clean_lines, line, file_extension, include_state,
                nesting_state, error)
  CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
  CheckForNonStandardConstructs(filename, clean_lines, line,
                                nesting_state, error)

  # The remaining built-in checks share the four-argument signature that
  # extra_check_functions use; run them in their fixed order first.
  simple_checks = (
      CheckVlogArguments,
      CheckPosixThreading,
      CheckInvalidIncrement,
      CheckMakePairUsesDeduction,
      CheckRedundantVirtual,
      CheckRedundantOverrideOrFinal,
  )
  for check_fn in simple_checks:
    check_fn(filename, clean_lines, line, error)
  for check_fn in extra_check_functions:
    check_fn(filename, clean_lines, line, error)
+
def FlagCxx11Features(filename, clean_lines, linenum, error):
  """Flag those c++11 features that we only allow in certain places.

  Args:
    filename: The name of the current file.
    clean_lines: A CleansedLines instance containing the file.
    linenum: The number of the line to check.
    error: The function to call with any errors found.
  """
  unapproved_headers = (
      'cfenv',
      'condition_variable',
      'fenv.h',
      'future',
      'mutex',
      'thread',
      'chrono',
      'ratio',
      'regex',
      'system_error',
  )
  # These are classes and free functions.  The classes are always
  # mentioned as std::*, but we only catch the free functions if
  # they're not found by ADL.  They're alphabetical by header.
  unapproved_names = (
      # type_traits
      'alignment_of',
      'aligned_union',
  )

  line = clean_lines.elided[linenum]

  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
  if include:
    header = include.group(1)

    # Flag unapproved C++ TR1 headers.
    if header.startswith('tr1/'):
      error(filename, linenum, 'build/c++tr1', 5,
            ('C++ TR1 headers such as <%s> are unapproved.') % header)

    # Flag unapproved C++11 headers.
    if header in unapproved_headers:
      error(filename, linenum, 'build/c++11', 5,
            ('<%s> is an unapproved C++11 header.') % header)

  # The only place where we need to worry about C++11 keywords and library
  # features in preprocessor directives is in macro definitions.
  if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line):
    return

  for top_name in unapproved_names:
    if Search(r'\bstd::%s\b' % top_name, line):
      error(filename, linenum, 'build/c++11', 5,
            ('std::%s is an unapproved C++11 class or function. Send c-style '
             'an example of where it would make your code more readable, and '
             'they may let you use it.') % top_name)
+
+
+def FlagCxx14Features(filename, clean_lines, linenum, error):
+  """Reports #include lines that pull in a restricted C++14 header.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  text = clean_lines.elided[linenum]
+
+  include_match = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', text)
+  if not include_match:
+    return
+
+  # Only these two C++14 headers are restricted.
+  header = include_match.group(1)
+  if header in ('scoped_allocator', 'shared_mutex'):
+    error(filename, linenum, 'build/c++14', 5,
+          '<%s> is an unapproved C++14 header.' % header)
+
+
+def ProcessFileData(filename, file_extension, lines, error,
+                    extra_check_functions=[]):
+  """Runs every lint check over one file's contents and reports the errors.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  # Bracket the file with marker lines; the caller's list is left untouched
+  # because concatenation builds a new list.
+  leading_marker = '// marker so line numbers and indices both start at 1'
+  trailing_marker = '// marker so line numbers end in a known way'
+  lines = [leading_marker] + lines + [trailing_marker]
+
+  include_state = _IncludeState()
+  function_state = _FunctionState()
+  nesting_state = NestingState()
+
+  ResetNolintSuppressions()
+
+  # Whole-file passes that run on the raw lines.
+  CheckForCopyright(filename, lines, error)
+  ProcessGlobalSuppresions(lines)
+  RemoveMultiLineComments(filename, lines, error)
+  clean_lines = CleansedLines(lines)
+
+  if IsHeaderExtension(file_extension):
+    CheckForHeaderGuard(filename, clean_lines, error)
+
+  # Per-line passes.
+  for linenum in xrange(clean_lines.NumLines()):
+    ProcessLine(filename, file_extension, clean_lines, linenum,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions)
+    FlagCxx11Features(filename, clean_lines, linenum, error)
+  nesting_state.CheckCompletedBlocks(filename, error)
+
+  CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
+
+  # Check that the .cc file has included its header if it exists.
+  if _IsSourceExtension(file_extension):
+    CheckHeaderFileIncluded(filename, include_state, error)
+
+  # Done here rather than inside ProcessLine so that the check sees the raw
+  # lines instead of the "cleaned" ones.
+  CheckForBadCharacters(filename, lines, error)
+
+  CheckForNewlineAtEOF(filename, lines, error)
+
+
+def ProcessConfigOverrides(filename):
+  """ Loads the configuration files and processes the config overrides.
+
+  Walks from the directory containing |filename| up towards the filesystem
+  root, reading every CPPLINT.cfg found on the way.  The walk stops early at
+  a config file that says "set noparent" or that cannot be opened.
+
+  Args:
+    filename: The name of the file being processed by the linter.
+
+  Returns:
+    False if the current |filename| should not be processed further.
+  """
+  global _line_length
+  global _root
+
+  abs_filename = os.path.abspath(filename)
+  cfg_filters = []
+  keep_looking = True
+  while keep_looking:
+    abs_path, base_name = os.path.split(abs_filename)
+    if not base_name:
+      break  # Reached the root directory.
+
+    cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
+    abs_filename = abs_path
+    if not os.path.isfile(cfg_file):
+      continue
+
+    try:
+      with open(cfg_file) as file_handle:
+        for line in file_handle:
+          line, _, _ = line.partition('#')  # Remove comments.
+          if not line.strip():
+            continue
+
+          name, _, val = line.partition('=')
+          name = name.strip()
+          val = val.strip()
+          if name == 'set noparent':
+            keep_looking = False
+          elif name == 'filter':
+            cfg_filters.append(val)
+          elif name == 'exclude_files':
+            # When matching exclude_files pattern, use the base_name of
+            # the current file name or the directory name we are processing.
+            # For example, if we are checking for lint errors in /foo/bar/baz.cc
+            # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
+            # file's "exclude_files" filter is meant to be checked against "bar"
+            # and not "baz" nor "bar/baz.cc".
+            if base_name:
+              pattern = re.compile(val)
+              if pattern.match(base_name):
+                if _cpplint_state.quiet:
+                  # Suppress "Ignoring file" warning when using --quiet.
+                  return False
+                sys.stderr.write('Ignoring "%s": file excluded by "%s". '
+                                 'File path component "%s" matches '
+                                 'pattern "%s"\n' %
+                                 (filename, cfg_file, base_name, val))
+                return False
+          elif name == 'linelength':
+            try:
+              _line_length = int(val)
+            except ValueError:
+              # Newline-terminated like every other diagnostic written here,
+              # so the next message does not run on from this one.
+              sys.stderr.write('Line length must be numeric.\n')
+          elif name == 'root':
+            # root directories are specified relative to CPPLINT.cfg dir.
+            _root = os.path.join(os.path.dirname(cfg_file), val)
+          elif name == 'headers':
+            ProcessHppHeadersOption(val)
+          else:
+            sys.stderr.write(
+                'Invalid configuration option (%s) in file %s\n' %
+                (name, cfg_file))
+
+    except IOError:
+      sys.stderr.write(
+          "Skipping config file '%s': Can't open for reading\n" % cfg_file)
+      keep_looking = False
+
+  # Apply all the accumulated filters in reverse order (top-level directory
+  # config options having the least priority).
+  for cfg_filter in reversed(cfg_filters):
+    _AddFilters(cfg_filter)
+
+  return True
+
+
+
+
+def ProcessFile(filename, vlevel, extra_check_functions=[]):
+  """Does google-lint on a single file.
+
+  The input is read in full up front; when it comes from a named file, the
+  file handle is closed before any linting starts.
+
+  Args:
+    filename: The name of the file to parse.
+
+    vlevel: The level of errors to report.  Every error of confidence
+    >= verbose_level will be reported.  0 is a good default.
+
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+
+  _SetVerboseLevel(vlevel)
+  _BackupFilters()
+  old_errors = _cpplint_state.error_count
+
+  if not ProcessConfigOverrides(filename):
+    _RestoreFilters()
+    return
+
+  lf_lines = []
+  crlf_lines = []
+  try:
+    # Support the UNIX convention of using "-" for stdin.  Note that
+    # we are not opening the file with universal newline support
+    # (which codecs doesn't support anyway), so the resulting lines do
+    # contain trailing '\r' characters if we are reading a file that
+    # has CRLF endings.
+    # If after the split a trailing '\r' is present, it is removed
+    # below.
+    if filename == '-':
+      lines = codecs.StreamReaderWriter(sys.stdin,
+                                        codecs.getreader('utf8'),
+                                        codecs.getwriter('utf8'),
+                                        'replace').read().split('\n')
+    else:
+      # The context manager closes the handle even if read() raises.
+      with codecs.open(filename, 'r', 'utf8', 'replace') as source_file:
+        lines = source_file.read().split('\n')
+
+    # Remove trailing '\r'.
+    # The -1 accounts for the extra trailing blank line we get from split()
+    for linenum in range(len(lines) - 1):
+      if lines[linenum].endswith('\r'):
+        lines[linenum] = lines[linenum].rstrip('\r')
+        crlf_lines.append(linenum + 1)
+      else:
+        lf_lines.append(linenum + 1)
+
+  except IOError:
+    sys.stderr.write(
+        "Skipping input '%s': Can't open for reading\n" % filename)
+    _RestoreFilters()
+    return
+
+  # Note, if no dot is found, this will give the entire filename as the ext.
+  file_extension = filename[filename.rfind('.') + 1:]
+
+  # When reading from stdin, the extension is unknown, so no cpplint tests
+  # should rely on the extension.
+  if filename != '-' and file_extension not in _valid_extensions:
+    sys.stderr.write('Ignoring %s; not a valid file name '
+                     '(%s)\n' % (filename, ', '.join(_valid_extensions)))
+  else:
+    ProcessFileData(filename, file_extension, lines, Error,
+                    extra_check_functions)
+
+    # If end-of-line sequences are a mix of LF and CR-LF, issue
+    # warnings on the lines with CR.
+    #
+    # Don't issue any warnings if all lines are uniformly LF or CR-LF,
+    # since critique can handle these just fine, and the style guide
+    # doesn't dictate a particular end of line sequence.
+    #
+    # We can't depend on os.linesep to determine what the desired
+    # end-of-line sequence should be, since that will return the
+    # server-side end-of-line sequence.
+    if lf_lines and crlf_lines:
+      # Warn on every line with CR.  An alternative approach might be to
+      # check whether the file is mostly CRLF or just LF, and warn on the
+      # minority, we bias toward LF here since most tools prefer LF.
+      for linenum in crlf_lines:
+        Error(filename, linenum, 'whitespace/newline', 1,
+              'Unexpected \\r (^M) found; better to use only \\n')
+
+  # Suppress printing anything if --quiet was passed unless the error
+  # count has increased after processing this file.
+  if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count:
+    sys.stdout.write('Done processing %s\n' % filename)
+  _RestoreFilters()
+
+
+
+
+def PrintUsage(message):
+  """Writes the usage text to stderr and terminates the process.
+
+  Args:
+    message: The optional error message.  When given, it becomes the exit
+             message prefixed with "FATAL ERROR"; otherwise the exit status
+             is 1.
+  """
+  sys.stderr.write(_USAGE)
+  if not message:
+    sys.exit(1)
+  sys.exit('\nFATAL ERROR: ' + message)
+
+
+
+
+def PrintCategories():
+  """Writes every known error category to stderr, then exits with status 0.
+
+  These are the categories used to filter messages via --filter.
+  """
+  listing = ''.join('  %s\n' % category for category in _ERROR_CATEGORIES)
+  sys.stderr.write(listing)
+  sys.exit(0)
+
+
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+  Malformed option values (including a non-integer --verbose or
+  --linelength) are reported through PrintUsage, which exits the process.
+
+  Args:
+    args: The command line arguments:
+
+  Returns:
+    The list of filenames to lint.
+  """
+  global _root
+  global _line_length
+  global _valid_extensions
+
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions=',
+                                                 'headers=',
+                                                 'quiet'])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  quiet = _Quiet()
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--quiet':
+      quiet = True
+    elif opt == '--verbose':
+      # Report a bad value the same way --linelength does instead of
+      # letting the ValueError escape as a traceback.
+      try:
+        verbosity = int(val)
+      except ValueError:
+        PrintUsage('Verbosity must be an integer.')
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      _root = val
+    elif opt == '--linelength':
+      try:
+        _line_length = int(val)
+      except ValueError:
+        PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      try:
+        _valid_extensions = set(val.split(','))
+      except ValueError:
+        PrintUsage('Extensions must be comma separated list.')
+    elif opt == '--headers':
+      ProcessHppHeadersOption(val)
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  # Publish the parsed settings only after the whole command line validated.
+  _SetOutputFormat(output_format)
+  _SetQuiet(quiet)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+
+
+def main():
+  """Lints every file named on the command line; exit status 1 on any error."""
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Re-wrap stderr so that printing non-ASCII text substitutes replacement
+  # characters instead of raising.
+  utf8_reader = codecs.getreader('utf8')
+  utf8_writer = codecs.getwriter('utf8')
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr, utf8_reader, utf8_writer,
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+
+  # With --quiet the error summary is only printed when there are errors.
+  has_errors = _cpplint_state.error_count > 0
+  if has_errors or not _cpplint_state.quiet:
+    _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/third_party/aom/tools/diff.py b/third_party/aom/tools/diff.py
new file mode 100644
index 0000000000..7bb6b7fcb4
--- /dev/null
+++ b/third_party/aom/tools/diff.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Classes for representing diff pieces."""
+
+__author__ = "jkoleszar@google.com"
+
+import re
+
+
+class DiffLines(object):
+    """One side (old or new file) of a diff hunk.
+
+    Attributes are the file name, the first line number covered (offset),
+    the declared number of lines (length), the collected line texts, and the
+    line numbers of the lines that are not unchanged context.
+    """
+
+    def __init__(self, filename, offset, length):
+        self.filename = filename
+        self.offset = offset
+        self.length = length
+        self.lines = []
+        self.delta_line_nums = []
+
+    def Append(self, line):
+        """Stores one raw diff line, stripping its leading marker character."""
+        marker, text = line[0], line[1:]
+        if marker != " ":
+            # Any non-context line counts as a change at its position.
+            self.delta_line_nums.append(self.offset + len(self.lines))
+        self.lines.append(text)
+        assert len(self.lines) <= self.length
+
+    def Complete(self):
+        """Returns True once the declared number of lines has been stored."""
+        return self.length == len(self.lines)
+
+    def __contains__(self, item):
+        last_line = self.offset + self.length - 1
+        return self.offset <= item <= last_line
+
+
+class DiffHunk(object):
+    """One diff hunk: a left (old) and a right (new) DiffLines plus raw lines."""
+
+    def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b):
+        self.header = header
+        self.left = DiffLines(file_a, start_a, len_a)
+        self.right = DiffLines(file_b, start_b, len_b)
+        self.lines = []
+
+    def Append(self, line):
+        """Routes a raw diff line to the side(s) it belongs to and records it."""
+        marker = line[0]
+        if marker == "-":
+            self.left.Append(line)
+        elif marker == "+":
+            self.right.Append(line)
+        elif marker == " ":
+            # Context lines exist on both sides.
+            self.left.Append(line)
+            self.right.Append(line)
+        elif marker != "\\":
+            # A leading backslash is a newline message from git diff and is
+            # ignored; any other marker is malformed input.
+            assert False, ("Unrecognized character at start of diff line "
+                           "%r" % line[0])
+        self.lines.append(line)
+
+    def Complete(self):
+        """Returns True when both sides hold their declared number of lines."""
+        if not self.left.Complete():
+            return False
+        return self.right.Complete()
+
+    def __repr__(self):
+        longest = max(self.left.length, self.right.length)
+        return "DiffHunk(%s, %s, len %d)" % (
+            self.left.filename, self.right.filename, longest)
+
+
+def ParseDiffHunks(stream):
+    """Walk a file-like object, yielding DiffHunks as they're parsed.
+
+    Args:
+        stream: Object with a readline() method producing unified-diff text.
+
+    Yields:
+        A DiffHunk each time both of its sides have received the number of
+        lines announced by the "@@" range line.
+    """
+
+    file_regex = re.compile(r"(\+\+\+|---) (\S+)")
+    range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?")
+    hunk = None
+    while True:
+        line = stream.readline()
+        if not line:
+            break
+
+        if hunk is None:
+            # Parse file names
+            diff_file = file_regex.match(line)
+            if diff_file:
+                if line.startswith("---"):
+                    a_line = line
+                    a = diff_file.group(2)
+                    continue
+                if line.startswith("+++"):
+                    b_line = line
+                    b = diff_file.group(2)
+                    continue
+
+            # Parse offset/lengths
+            diffrange = range_regex.match(line)
+            if diffrange:
+                # In unified format the first number is always the start
+                # line.  The ",count" part is omitted when that side of the
+                # hunk is exactly one line long, so a missing count means 1.
+                start_a = int(diffrange.group(1))
+                len_a = int(diffrange.group(3)) if diffrange.group(2) else 1
+
+                start_b = int(diffrange.group(4))
+                len_b = int(diffrange.group(6)) if diffrange.group(5) else 1
+
+                header = [a_line, b_line, line]
+                hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b)
+        else:
+            # Add the current line to the hunk
+            hunk.Append(line)
+
+            # See if the whole hunk has been parsed. If so, yield it and prepare
+            # for the next hunk.
+            if hunk.Complete():
+                yield hunk
+                hunk = None
+
+    # Partial hunks are a parse error
+    assert hunk is None
diff --git a/third_party/aom/tools/dump_obu.cc b/third_party/aom/tools/dump_obu.cc
new file mode 100644
index 0000000000..b9ff985c44
--- /dev/null
+++ b/third_party/aom/tools/dump_obu.cc
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "tools/obu_parser.h"
+
+namespace {
+
+const size_t kInitialBufferSize = 100 * 1024;
+
+// Bundles the reader contexts and the buffer that receives each unit read
+// from the input. The context pointers are assigned by the caller and are
+// not freed here; only unit_buffer is released, with free(), on destruction.
+struct InputContext {
+  AvxInputContext *avx_ctx = nullptr;
+  ObuDecInputContext *obu_ctx = nullptr;
+#if CONFIG_WEBM_IO
+  WebmInputContext *webm_ctx = nullptr;
+#endif
+  uint8_t *unit_buffer = nullptr;
+  size_t unit_buffer_size = 0;
+
+  InputContext() = default;
+  ~InputContext() { free(unit_buffer); }
+
+  // Zeroes every pointed-to context and links the OBU context to the AVX
+  // context. All context pointers must already be set when this is called.
+  void Init() {
+    memset(avx_ctx, 0, sizeof(*avx_ctx));
+    memset(obu_ctx, 0, sizeof(*obu_ctx));
+    obu_ctx->avx_ctx = avx_ctx;
+#if CONFIG_WEBM_IO
+    memset(webm_ctx, 0, sizeof(*webm_ctx));
+#endif
+  }
+};
+
+// Prints the tool banner and its one-line usage summary to stdout.
+void PrintUsage() {
+  printf(
+      "Libaom OBU dump.\n"
+      "Usage: dump_obu <input_file>\n");
+}
+
+// Probes the input in a fixed order (WebM when enabled, then IVF, then OBU)
+// and returns the first matching type, or FILE_TYPE_RAW when none matches.
+VideoFileType GetFileType(InputContext *ctx) {
+  // TODO(https://crbug.com/aomedia/1706): webm type does not support reading
+  // from stdin yet, and file_is_webm is not using the detect buffer when
+  // determining the type. Therefore it should only be checked when using a file
+  // and needs to be checked prior to other types.
+#if CONFIG_WEBM_IO
+  if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) {
+    return FILE_TYPE_WEBM;
+  }
+#endif
+  if (file_is_ivf(ctx->avx_ctx)) {
+    return FILE_TYPE_IVF;
+  }
+  if (file_is_obu(ctx->obu_ctx)) {
+    return FILE_TYPE_OBU;
+  }
+  return FILE_TYPE_RAW;
+}
+
+// Reads the next unit from the input into ctx->unit_buffer using the reader
+// that matches the detected file type. Returns true when the reader returns
+// zero; returns false when it returns nonzero or the type is unsupported.
+bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
+  switch (ctx->avx_ctx->file_type) {
+    case FILE_TYPE_IVF:
+      return !ivf_read_frame(ctx->avx_ctx, &ctx->unit_buffer, unit_size,
+                             &ctx->unit_buffer_size, NULL);
+    case FILE_TYPE_OBU:
+      return !obudec_read_temporal_unit(ctx->obu_ctx, &ctx->unit_buffer,
+                                        unit_size, &ctx->unit_buffer_size);
+#if CONFIG_WEBM_IO
+    case FILE_TYPE_WEBM:
+      return !webm_read_frame(ctx->webm_ctx, &ctx->unit_buffer, unit_size,
+                              &ctx->unit_buffer_size);
+#endif
+    default:
+      // TODO(tomfinegan): Abuse FILE_TYPE_RAW for AV1/OBU elementary streams?
+      fprintf(stderr, "Error: Unsupported file type.\n");
+      return false;
+  }
+}
+
+} // namespace
+
+// Entry point: dumps the OBUs of every unit read from the file named by
+// argv[1] and prints the per-unit and total OBU overhead.
+int main(int argc, const char *argv[]) {
+  // TODO(tomfinegan): Could do with some params for verbosity.
+  if (argc < 2) {
+    PrintUsage();
+    return EXIT_SUCCESS;
+  }
+
+  const std::string filename(argv[1]);
+
+  using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>;
+  FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose);
+  if (!input_file) {
+    input_file.release();
+    fprintf(stderr, "Error: Cannot open input file.\n");
+    return EXIT_FAILURE;
+  }
+
+  // The reader contexts live on this stack frame; InputContext only refers
+  // to them.
+  AvxInputContext avx_ctx;
+  ObuDecInputContext obu_ctx;
+#if CONFIG_WEBM_IO
+  WebmInputContext webm_ctx;
+#endif
+
+  InputContext input_ctx;
+  input_ctx.avx_ctx = &avx_ctx;
+  input_ctx.obu_ctx = &obu_ctx;
+#if CONFIG_WEBM_IO
+  input_ctx.webm_ctx = &webm_ctx;
+#endif
+
+  input_ctx.Init();
+  avx_ctx.file = input_file.get();
+  avx_ctx.file_type = GetFileType(&input_ctx);
+
+  // Note: the reader utilities will realloc the buffer using realloc() etc.
+  // Can't have nice things like unique_ptr wrappers with that type of
+  // behavior underneath the function calls.
+  input_ctx.unit_buffer =
+      reinterpret_cast<uint8_t *>(calloc(kInitialBufferSize, 1));
+  if (input_ctx.unit_buffer == nullptr) {
+    fprintf(stderr, "Error: No memory, can't alloc input buffer.\n");
+    return EXIT_FAILURE;
+  }
+  input_ctx.unit_buffer_size = kInitialBufferSize;
+
+  size_t unit_size = 0;
+  int64_t total_overhead_bytes = 0;
+  for (int unit_index = 0; ReadTemporalUnit(&input_ctx, &unit_size);
+       ++unit_index) {
+    printf("Temporal unit %d\n", unit_index);
+
+    int unit_overhead_bytes = 0;
+    const bool parsed =
+        aom_tools::DumpObu(input_ctx.unit_buffer, static_cast<int>(unit_size),
+                           &unit_overhead_bytes);
+    if (!parsed) {
+      fprintf(stderr, "Error: Temporal Unit parse failed on unit number %d.\n",
+              unit_index);
+      return EXIT_FAILURE;
+    }
+    printf("  OBU overhead:    %d\n", unit_overhead_bytes);
+    total_overhead_bytes += unit_overhead_bytes;
+  }
+
+  printf("File total OBU overhead: %" PRId64 "\n", total_overhead_bytes);
+  return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/tools/frame_size_variation_analyzer.py b/third_party/aom/tools/frame_size_variation_analyzer.py
new file mode 100644
index 0000000000..5c02319df1
--- /dev/null
+++ b/third_party/aom/tools/frame_size_variation_analyzer.py
@@ -0,0 +1,74 @@
+# RTC frame size variation analyzer
+# Usage:
+# 1. Config with "-DCONFIG_OUTPUT_FRAME_SIZE=1".
+# 2. Build aomenc. Encode a file, and generate output file: frame_sizes.csv
+# 3. Run: python ./frame_size_variation_analyzer.py frame_sizes.csv target-bitrate fps
+# Where target-bitrate is the bitrate (kbps), and fps is frames per second.
+# Example: python ../aom/tools/frame_size_variation_analyzer.py frame_sizes.csv
+# 1000 30
+
+import numpy as np
+import csv
+import sys
+import matplotlib.pyplot as plt
+
+# Moving average of x over a window of w samples; only windows that fit
+# entirely inside x are produced ('valid' convolution).
+def moving_average(x, w):
+    window = np.ones(w)
+    return np.convolve(x, window, 'valid') / w
+
+# Reads a CSV whose first column is the frame size and second column the
+# qindex, prints statistics against the per-frame target derived from
+# target_br (kbps) and fps, then plots both columns and saves the figure to
+# "<filename>.png".
+def frame_size_analysis(filename, target_br, fps):
+    # Per-frame target: kbps * 1000 spread over fps frames.
+    tbr = target_br * 1000 / fps
+
+    with open(filename, 'r') as infile:
+        rows = list(csv.reader(infile, delimiter=','))
+
+    table = np.array(rows).astype(float)
+    fsize = table[:, 0].astype(float)  # frame size
+    qindex = table[:, 1].astype(float)  # qindex
+
+    # Absolute distance of every frame from the per-frame target.
+    mismatch = np.absolute(fsize - tbr)
+
+    # Number of frames larger than 2.5x the per-frame target.
+    tbr_thr = tbr * 2.5
+    cnt = int(np.count_nonzero(fsize > tbr_thr))
+
+    # Same mismatch measure on a 15-frame moving average.
+    win = 15
+    avg_fsize = moving_average(fsize, win)
+    win_mismatch = np.absolute(avg_fsize - tbr)
+
+    print('[Target frame rate (bit)]:', "%.2f"%tbr)
+    print('[Average frame rate (bit)]:', "%.2f"%np.average(fsize))
+    print('[Frame rate standard deviation]:', "%.2f"%np.std(fsize))
+    print('[Max/min frame rate (bit)]:', "%.2f"%np.max(fsize), '/', "%.2f"%np.min(fsize))
+    print('[Average frame rate mismatch (bit)]:', "%.2f"%np.average(mismatch))
+    print('[Number of frames (frame rate > 2.5x of target frame rate)]:', cnt)
+    print(' Moving window size:', win)
+    print('[Moving average frame rate mismatch (bit)]:', "%.2f"%np.average(win_mismatch))
+    print('------------------------------')
+
+    # Two stacked plots sharing the frame index as x axis.
+    frame_index = np.arange(fsize.size)
+    figure, (size_axis, qindex_axis) = plt.subplots(2)
+    size_axis.plot(frame_index, fsize, color='blue')
+    size_axis.set_title("frame sizes")
+    qindex_axis.plot(frame_index, qindex, color='blue')
+    qindex_axis.set_title("frame qindex")
+    plt.tight_layout()
+
+    # Save the plot
+    plt.savefig(filename + '.png')
+    plt.show()
+
+if __name__ == '__main__':
+    # Expects three arguments: input file, target bitrate, fps.
+    if len(sys.argv) < 4:
+        print(sys.argv[0], 'input_file, target_bitrate, fps')
+        sys.exit()
+    input_file = sys.argv[1]
+    target_br = int(sys.argv[2])
+    fps = int(sys.argv[3])
+    frame_size_analysis(input_file, target_br, fps)
diff --git a/third_party/aom/tools/gen_authors.sh b/third_party/aom/tools/gen_authors.sh
new file mode 100755
index 0000000000..5def8bc898
--- /dev/null
+++ b/third_party/aom/tools/gen_authors.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Prints an AUTHORS file body built from the git commit history.
+# Add organization names manually.
+
+# Unique "Name <email>" pairs, minus corp.google addresses and the
+# clang-format pseudo-author.
+authors=$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
+
+cat <<EOF
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+${authors}
+EOF
diff --git a/third_party/aom/tools/gen_constrained_tokenset.py b/third_party/aom/tools/gen_constrained_tokenset.py
new file mode 100755
index 0000000000..f5b0816dbf
--- /dev/null
+++ b/third_party/aom/tools/gen_constrained_tokenset.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+"""
+
+import heapq
+import sys
+import numpy as np
+import scipy.optimize
+import scipy.stats
+
+
+def cdf_spareto(x, xm, beta):
+  """CDF at x of the two-sided zero-centered Pareto with scale xm and shape beta."""
+  tail = (xm / (np.abs(x) + xm))**beta
+  return 0.5 + 0.5 * np.sign(x) * (1 - tail)
+
+
+def get_spareto(p, beta):
+  """Returns the 11 bin probabilities of the model for 1-node probability p.
+
+  alpha is solved numerically so that the conditional probability of the
+  second bin, given that the first bin was not chosen, equals p.
+  """
+  cdf = cdf_spareto
+
+  def squared_error(x):
+    conditional = ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) /
+                   (1 - cdf(0.5, x, beta)))
+    return (conditional - p)**2
+
+  alpha = scipy.optimize.fminbound(squared_error, 1e-12, 10000, xtol=1e-12)
+
+  # Upper edges of the first ten bins; the last bin is the remaining tail.
+  edges = (0.5, 1.5, 2.5, 3.5, 4.5, 6.5, 10.5, 18.5, 34.5, 66.5)
+  parray = np.zeros(11)
+  parray[0] = 2 * (cdf(edges[0], alpha, beta) - 0.5)
+  for k in range(1, len(edges)):
+    parray[k] = (2 * (cdf(edges[k], alpha, beta) -
+                      cdf(edges[k - 1], alpha, beta)))
+  parray[10] = 2 * (1. - cdf(edges[-1], alpha, beta))
+  return parray
+
+
+def quantize_probs(p, save_first_bin, bits):
+  """Quantize probability precisely.
+
+  Rounds p * 2**bits to integers, then nudges single entries by one step
+  at a time (always the currently cheapest one) until the integers sum to
+  2**bits. The cost being minimized is dH (Kullback-Leibler divergence)
+  approximated by: sum (p_i-q_i)^2/p_i. With save_first_bin set, entry 0
+  is never adjusted.
+  References:
+  https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+  https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit
+  """
+  num_sym = p.size
+  p = np.clip(p, 1e-16, 1)
+  total = 2**bits
+  scaled = p * total
+  inv_p = 1. / p  # inverse probability
+  q = np.clip(np.round(scaled), 1, total + 1 - num_sym)
+  quant_err = (scaled - q)**2 * inv_p
+  step = np.sign(total - q.sum())  # direction of correction
+  if step == 0:  # already sums to the target
+    return q
+
+  heap = []  # (error increase, index) of one more step for each symbol
+
+  def push_candidate(i):
+    # Queue the cost of moving symbol i one more step, if it stays in range.
+    q_adj = q[i] + step
+    if q_adj > 0 and q_adj < total:
+      adj_err = (scaled[i] - q_adj)**2 * inv_p[i] - quant_err[i]
+      heapq.heappush(heap, (adj_err, i))
+
+  first = 1 if save_first_bin else 0
+  for i in range(first, num_sym):
+    push_candidate(i)
+  while q.sum() != total:
+    # apply lowest error adjustment
+    (adj_err, i) = heapq.heappop(heap)
+    quant_err[i] += adj_err
+    q[i] += step
+    # calculate the cost of adjusting this symbol again
+    push_candidate(i)
+  return q
+
+
+def get_quantized_spareto(p, beta, bits, first_token):
+  """Returns the model's bin probabilities quantized to integers.
+
+  The first bin is dropped and the rest renormalized; when first_token > 1
+  this is done a second time. The result is quantized with quantize_probs
+  (keeping its first entry fixed only when first_token == 1) and returned
+  as an integer array.
+  """
+  parray = get_spareto(p, beta)
+  parray = parray[1:] / (1 - parray[0])
+  # CONFIG_NEW_TOKENSET
+  if first_token > 1:
+    parray = parray[1:] / (1 - parray[0])
+  qarray = quantize_probs(parray, first_token == 1, bits)
+  # np.int was only an alias of the builtin int and was removed in
+  # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
+  return qarray.astype(int)
+
+
+def main(bits=15, first_token=1):
+  """Prints one C initializer row of quantized probabilities per q in 1..255."""
+  beta = 8
+  for q in range(1, 256):
+    row = get_quantized_spareto(q / 256., beta, bits, first_token)
+    assert row.sum() == 2**bits
+    cells = ', '.join('%d' % value for value in row)
+    print('{', cells, '},')
+
+
+if __name__ == '__main__':
+  # Up to two optional integer arguments: bits, then first_token.
+  main(*[int(arg) for arg in sys.argv[1:3]])
diff --git a/third_party/aom/tools/gop_bitrate/analyze_data.py b/third_party/aom/tools/gop_bitrate/analyze_data.py
new file mode 100644
index 0000000000..4e006b9220
--- /dev/null
+++ b/third_party/aom/tools/gop_bitrate/analyze_data.py
@@ -0,0 +1,18 @@
+# Summarizes experiment.txt: for each "****"-delimited record prints the
+# input name, the estimated value and the sum of the per-frame sizes.
+with open('experiment.txt', 'r') as log:
+    records = log.readlines()
+    curr_filename = ''
+    keyframe = 0
+    actual_value = 0
+    estimate_value = 0
+    print('filename, estimated value (b), actual value (b)')
+    for record in records:
+        if record.startswith('input:'):
+            curr_filename = record[13:].strip()
+        elif record.startswith('estimated'):
+            estimate_value = float(record[19:].strip())
+        elif record.startswith('frame:'):
+            # The frame size sits between the "size" and "total" fields.
+            size_start = record.find('size') + 6
+            size_end = record.find('total') - 2
+            actual_value += float(record[size_start:size_end])
+        elif record.startswith('****'):
+            # Record separator: report the running totals, then reset them.
+            print(f'{curr_filename}, {estimate_value}, {actual_value}')
+            estimate_value = 0
+            actual_value = 0
diff --git a/third_party/aom/tools/gop_bitrate/encode_all_script.sh b/third_party/aom/tools/gop_bitrate/encode_all_script.sh
new file mode 100755
index 0000000000..0689b33138
--- /dev/null
+++ b/third_party/aom/tools/gop_bitrate/encode_all_script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Encodes every file under media/ with aomenc and appends the encoder output
+# to experiment.txt, one "****"-delimited record per input.
+#INPUT=media/cheer_sif.y4m
+OUTPUT=test.webm
+LIMIT=17
+CPU_USED=3
+CQ_LEVEL=36
+
+for input in media/*
+do
+  echo "****" >> experiment.txt
+  echo "input: $input" >> experiment.txt
+  # Variables are quoted so that paths containing spaces or glob characters
+  # reach aomenc as a single argument.
+  ./aomenc --limit="$LIMIT" --codec=av1 --cpu-used="$CPU_USED" --end-usage=q --cq-level="$CQ_LEVEL" --psnr --threads=0 --profile=0 --lag-in-frames=35 --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=160 --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --minsection-pct=0 --maxsection-pct=2000 --arnr-maxframes=7 --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 --frame-parallel=0 --tile-columns=0 -o "$OUTPUT" "$input" >> experiment.txt
+done
diff --git a/third_party/aom/tools/gop_bitrate/python/bitrate_accuracy.py b/third_party/aom/tools/gop_bitrate/python/bitrate_accuracy.py
new file mode 100644
index 0000000000..2a5da6a794
--- /dev/null
+++ b/third_party/aom/tools/gop_bitrate/python/bitrate_accuracy.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+# Model A only.
+# Uses least squares regression to find the solution
+# when there is one unknown variable.
+# Multiplies the pseudoinverse of A by B and returns element [0][0] of the
+# product, so the product has to be two-dimensional.
+def lstsq_solution(A, B):
+ A_inv = np.linalg.pinv(A)
+ x = np.matmul(A_inv, B)
+ return x[0][0]
+
+# Model B only.
+# Uses the pseudoinverse matrix to find the solution
+# when there are two unknown variables.
+# mv is appended to A as extra column(s) (axis=1) before the pseudoinverse is
+# taken. The two coefficients are printed and returned as a tuple.
+def pinv_solution(A, mv, B):
+ new_A = np.concatenate((A, mv), axis=1)
+ new_A_inv = np.linalg.pinv(new_A)
+ new_x = np.matmul(new_A_inv, B)
+ print("pinv solution:", new_x[0][0], new_x[1][0])
+ return (new_x[0][0], new_x[1][0])
+
+# Model A only.
+# Finds the coefficient to multiply A by to minimize
+# the percentage error between A and B.
+def minimize_percentage_error_model_a(A, B):
+ R = np.divide(A, B)
+ num = 0
+ den = 0
+ best_x = 0
+ best_error = 100
+ for r_i in R:
+ num += r_i
+ den += r_i**2
+ if den == 0:
+ return 0
+ return (num/den)[0]
+
+# Model B only.
+# Finds the coefficients to multiply to the frame bitrate
+# and the motion vector bitrate to minimize the percent error.
+# r_e and r_m are divided element-wise by r_f, and the sums of those ratios,
+# their squares and their product fill a 2x2 system that is solved with the
+# pseudoinverse. Returns the solution vector x with two entries.
+def minimize_percentage_error_model_b(r_e, r_m, r_f):
+ r_ef = np.divide(r_e, r_f)
+ r_mf = np.divide(r_m, r_f)
+ sum_ef = np.sum(r_ef)
+ sum_ef_sq = np.sum(np.square(r_ef))
+ sum_mf = np.sum(r_mf)
+ sum_mf_sq = np.sum(np.square(r_mf))
+ sum_ef_mf = np.sum(np.multiply(r_ef, r_mf))
+ # Divides x by y. If y is zero, returns 0.
+ divide = lambda x, y : 0 if y == 0 else x / y
+ # Set up and solve the matrix equation
+ A = np.array([[1, divide(sum_ef_mf, sum_ef_sq)],[divide(sum_ef_mf, sum_mf_sq), 1]])
+ B = np.array([divide(sum_ef, sum_ef_sq), divide(sum_mf, sum_mf_sq)])
+ A_inv = np.linalg.pinv(A)
+ x = np.matmul(A_inv, B)
+ return x
+
+# Model A only.
+# Calculates the least squares error between A and B
+# using coefficients in X.
+# The result is the mean of (b - x*a)**2 over the samples whose B entry is
+# non-zero. Samples with b == 0 are skipped, and None is returned when no
+# sample is counted.
+def average_lstsq_error(A, B, x):
+ error = 0
+ n = 0
+ for i, a in enumerate(A):
+ a = a[0]
+ b = B[i][0]
+ if b == 0:
+ continue
+ n += 1
+ error += (b - x*a)**2
+ if n == 0:
+ return None
+ error /= n
+ return error
+
+# Model A only.
+# Calculates the average percentage error between A and B.
+# Each counted sample contributes abs(x*a - b) / b * 100; samples with b == 0
+# are skipped and the sum is divided by the number of counted samples.
+# NOTE(review): unlike average_lstsq_error there is no guard for n == 0, so
+# the final division raises ZeroDivisionError when no sample is counted.
+def average_percent_error_model_a(A, B, x):
+ error = 0
+ n = 0
+ for i, a in enumerate(A):
+ a = a[0]
+ b = B[i][0]
+ if b == 0:
+ continue
+ n += 1
+ error_i = (abs(x*a-b)/b)*100
+ error += error_i
+ error /= n
+ return error
+
+# Model B only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_b(A, M, B, x):
+ error = 0
+ for i, a in enumerate(A):
+ a = a[0]
+ mv = M[i]
+ b = B[i][0]
+ if b == 0:
+ continue
+ estimate = x[0]*a
+ estimate += x[1]*mv
+ error += abs(estimate - b) / b
+ error *= 100
+ error /= A.shape[0]
+ return error
+
+# Model A only.
+# Returns, as a percentage, the root of the mean of (1 - x*a/b)**2 over the
+# samples whose B entry is non-zero.
+# NOTE(review): n stays 0 when every sample is skipped, in which case the
+# division by n raises ZeroDivisionError.
+def average_squared_error_model_a(A, B, x):
+ error = 0
+ n = 0
+ for i, a in enumerate(A):
+ a = a[0]
+ b = B[i][0]
+ if b == 0:
+ continue
+ n += 1
+ error_i = (1 - x*(a/b))**2
+ error += error_i
+ error /= n
+ error = error**0.5
+ return error * 100
+
+# Model B only.
+# Returns, as a percentage, the root of the mean of
+# (1 - (x[0]*a + x[1]*mv)/b)**2 over the samples whose B entry is non-zero.
+# mv is a whole row of M, so the result keeps that row's shape.
+# NOTE(review): n stays 0 when every sample is skipped, in which case the
+# division by n raises ZeroDivisionError.
+def average_squared_error_model_b(A, M, B, x):
+ error = 0
+ n = 0
+ for i, a in enumerate(A):
+ a = a[0]
+ b = B[i][0]
+ mv = M[i]
+ if b == 0:
+ continue
+ n += 1
+ error_i = 1 - ((x[0]*a + x[1]*mv)/b)
+ error_i = error_i**2
+ error += error_i
+ error /= n
+ error = error**0.5
+ return error * 100
+
+# Traverses the data and prints out one value for
+# each update type.
+# file_path is read as tab-separated values with four columns, unpacked below
+# as A, mv, B and update. Rows are split into groups wherever column 3
+# changes, and each group is fitted separately.
+def print_solutions(file_path):
+ data = np.genfromtxt(file_path, delimiter="\t")
+ prev_update = 0
+ split_list_indices = list()
+ # Record the row index of every change in the update-type column.
+ for i, val in enumerate(data):
+ if prev_update != val[3]:
+ split_list_indices.append(i)
+ prev_update = val[3]
+ split = np.split(data, split_list_indices)
+ for array in split:
+ A, mv, B, update = np.hsplit(array, 4)
+ # Drop the rows whose B value is 0 before fitting.
+ z = np.where(B == 0)[0]
+ r_e = np.delete(A, z, axis=0)
+ r_m = np.delete(mv, z, axis=0)
+ r_f = np.delete(B, z, axis=0)
+ A = r_e
+ mv = r_m
+ B = r_f
+ # Skip groups in which A has no non-zero entry (this includes empty groups).
+ all_zeros = not A.any()
+ if all_zeros:
+ continue
+ print("update type:", update[0][0])
+ # NOTE(review): x_ls is computed but not used in the output below.
+ x_ls = lstsq_solution(A, B)
+ x_a = minimize_percentage_error_model_a(A, B)
+ x_b = minimize_percentage_error_model_b(A, mv, B)
+ percent_error_a = average_percent_error_model_a(A, B, x_a)
+ percent_error_b = average_percent_error_model_b(A, mv, B, x_b)[0]
+ # Baselines use a coefficient of 1 for every term.
+ baseline_percent_error_a = average_percent_error_model_a(A, B, 1)
+ baseline_percent_error_b = average_percent_error_model_b(A, mv, B, [1, 1])[0]
+
+ squared_error_a = average_squared_error_model_a(A, B, x_a)
+ squared_error_b = average_squared_error_model_b(A, mv, B, x_b)[0]
+ baseline_squared_error_a = average_squared_error_model_a(A, B, 1)
+ baseline_squared_error_b = average_squared_error_model_b(A, mv, B, [1, 1])[0]
+
+ print("model,\tframe_coeff,\tmv_coeff,\terror,\tbaseline_error")
+ print("Model A %_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(percent_error_a) + ",\t" + str(baseline_percent_error_a))
+ print("Model A sq_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(squared_error_a) + ",\t" + str(baseline_squared_error_a))
+ print("Model B %_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(percent_error_b) + ",\t" + str(baseline_percent_error_b))
+ print("Model B sq_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(squared_error_b) + ",\t" + str(baseline_squared_error_b))
+ print()
+
+# Runs the analysis on a fixed data file when executed as a script.
+if __name__ == "__main__":
+ print_solutions("data2/all_lowres_target_lt600_data.txt")
diff --git a/third_party/aom/tools/inspect-cli.js b/third_party/aom/tools/inspect-cli.js
new file mode 100644
index 0000000000..a14c08111a
--- /dev/null
+++ b/third_party/aom/tools/inspect-cli.js
@@ -0,0 +1,39 @@
+/**
+ * This tool lets you test if the compiled Javascript decoder is functioning properly. You'll
+ * need to download a SpiderMonkey js-shell to run this script.
+ * https://archive.mozilla.org/pub/firefox/nightly/latest-mozilla-central/
+ *
+ * Example:
+ * js-shell inspect-cli.js video.ivf
+ */
+load("inspect.js");
+// Read the file named by the first script argument as binary data.
+var buffer = read(scriptArgs[0], "binary");
+// Configuration object handed to DecoderModule() below.
+var Module = {
+ noExitRuntime: true,
+ noInitialRun: true,
+ preInit: [],
+ preRun: [],
+ postRun: [function () {
+ printErr(`Loaded Javascript Decoder OK`);
+ }],
+ memoryInitializerPrefixURL: "bin/",
+ arguments: ['input.ivf', 'output.raw'],
+ // Parses the string behind jsonString and prints the `frame` field of every
+ // non-null entry. The text is wrapped in "[" ... "null]" before parsing,
+ // presumably because each emitted object ends with a comma (TODO confirm).
+ on_frame_decoded_json: function (jsonString) {
+ let json = JSON.parse("[" + Module.UTF8ToString(jsonString) + "null]");
+ json.forEach(frame => {
+ if (frame) {
+ print(frame.frame);
+ }
+ });
+ }
+};
+DecoderModule(Module);
+// Copy the input into the module's file system, then open it.
+Module.FS.writeFile("/tmp/input.ivf", buffer, { encoding: "binary" });
+Module._open_file();
+Module._set_layers(0xFFFFFFFF); // Set this to zero if you want to benchmark decoding.
+// Decode frames until _read_frame() returns a truthy value.
+while(true) {
+ printErr("Decoding Frame ...");
+ if (Module._read_frame()) {
+ break;
+ }
+}
diff --git a/third_party/aom/tools/inspect-post.js b/third_party/aom/tools/inspect-post.js
new file mode 100644
index 0000000000..31c40bb82c
--- /dev/null
+++ b/third_party/aom/tools/inspect-post.js
@@ -0,0 +1 @@
+// Expose FS as Module.FS so scripts holding the Module object can reach it
+// (inspect-cli.js calls Module.FS.writeFile).
+Module["FS"] = FS;
diff --git a/third_party/aom/tools/intersect-diffs.py b/third_party/aom/tools/intersect-diffs.py
new file mode 100755
index 0000000000..960183675d
--- /dev/null
+++ b/third_party/aom/tools/intersect-diffs.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Calculates the "intersection" of two unified diffs.
+
+Given two diffs, A and B, it finds all hunks in B that had non-context lines
+in A and prints them to stdout. This is useful to determine the hunks in B that
+are relevant to A. The resulting file can be applied with patch(1) on top of A.
+"""
+
+__author__ = "jkoleszar@google.com"
+
+import sys
+
+import diff
+
+
+def FormatDiffHunks(hunks):
+ """Re-serialize a list of DiffHunks."""
+ r = []
+ last_header = None
+ for hunk in hunks:
+ # The first two header entries are compared with those of the previous hunk
+ # (presumably the ---/+++ file lines); the full header is emitted only when
+ # they change.
+ this_header = hunk.header[0:2]
+ if last_header != this_header:
+ r.extend(hunk.header)
+ last_header = this_header
+ else:
+ # If header[2] is a str, extend() adds it one character at a time; the
+ # final join makes that equivalent to appending the whole string.
+ r.extend(hunk.header[2])
+ r.extend(hunk.lines)
+ r.append("\n")
+ return "".join(r)
+
+
+def ZipHunks(rhs_hunks, lhs_hunks):
+ """Join two hunk lists on filename."""
+ # Yields every (rhs_hunk, lhs_hunk) pair whose filenames match once the
+ # first path component has been dropped (presumably the a/ or b/ prefix).
+ # lhs_hunks is traversed once per rhs hunk, so it must be re-iterable.
+ for rhs_hunk in rhs_hunks:
+ rhs_file = rhs_hunk.right.filename.split("/")[1:]
+
+ for lhs_hunk in lhs_hunks:
+ lhs_file = lhs_hunk.left.filename.split("/")[1:]
+ if lhs_file != rhs_file:
+ continue
+ yield (rhs_hunk, lhs_hunk)
+
+
+def main():
+ old_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[1], "r"))]
+ new_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[2], "r"))]
+ out_hunks = []
+
+ # Join the right hand side of the older diff with the left hand side of the
+ # newer diff.
+ for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks):
+ if new_hunk in out_hunks:
+ continue
+ old_lines = old_hunk.right
+ new_lines = new_hunk.left
+
+ # Determine if this hunk overlaps any non-context line from the other
+ for i in old_lines.delta_line_nums:
+ if i in new_lines:
+ out_hunks.append(new_hunk)
+ break
+
+ if out_hunks:
+ print(FormatDiffHunks(out_hunks))
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()
diff --git a/third_party/aom/tools/lint-hunks.py b/third_party/aom/tools/lint-hunks.py
new file mode 100755
index 0000000000..8b3af972fc
--- /dev/null
+++ b/third_party/aom/tools/lint-hunks.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Performs style checking on each diff hunk."""
+import getopt
+import os
+import io
+import subprocess
+import sys
+
+import diff
+
+
+SHORT_OPTIONS = "h"
+LONG_OPTIONS = ["help"]
+
+TOPLEVEL_CMD = ["git", "rev-parse", "--show-toplevel"]
+DIFF_CMD = ["git", "diff"]
+DIFF_INDEX_CMD = ["git", "diff-index", "-u", "HEAD", "--"]
+SHOW_CMD = ["git", "show"]
+CPPLINT_FILTERS = ["-readability/casting"]
+
+
+class Usage(Exception):
+ """Raised for command-line option errors; main() reports it and returns 2."""
+ pass
+
+
+class SubprocessException(Exception):
+ """Raised by Subprocess.communicate() on an unexpected return code."""
+
+ def __init__(self, args):
+ # args is the command's argument list; it is joined with spaces for the message.
+ msg = "Failed to execute '%s'"%(" ".join(args))
+ super(SubprocessException, self).__init__(msg)
+
+
+class Subprocess(subprocess.Popen):
+ """Adds the notion of an expected returncode to Popen."""
+
+ # expected_returncode may be a single value, a container of accepted values,
+ # or None to disable the check.
+ def __init__(self, args, expected_returncode=0, **kwargs):
+ self._args = args
+ self._expected_returncode = expected_returncode
+ super(Subprocess, self).__init__(args, **kwargs)
+
+ # Same as Popen.communicate(), but raises SubprocessException when the
+ # return code is not an expected one.
+ def communicate(self, *args, **kwargs):
+ result = super(Subprocess, self).communicate(*args, **kwargs)
+ if self._expected_returncode is not None:
+ try:
+ ok = self.returncode in self._expected_returncode
+ except TypeError:
+ # A non-container expectation cannot be used with `in`; compare directly.
+ ok = self.returncode == self._expected_returncode
+ if not ok:
+ raise SubprocessException(self._args)
+ return result
+
+
+def main(argv=None):
+ """Runs cpplint on changed files and reports warnings on changed lines.
+
+ With one positional argument the diff of that commit is linted; with none,
+ the diff against HEAD is used. Returns 1 when a relevant warning is
+ printed, 2 on a usage error, and None otherwise.
+ """
+ if argv is None:
+ argv = sys.argv
+ try:
+ try:
+ opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
+ except getopt.error as msg:
+ raise Usage(msg)
+
+ # process options
+ for o, _ in opts:
+ if o in ("-h", "--help"):
+ print(__doc__)
+ sys.exit(0)
+
+ # More than one positional argument also just prints the help text.
+ if args and len(args) > 1:
+ print(__doc__)
+ sys.exit(0)
+
+ # Find the fully qualified path to the root of the tree
+ tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True)
+ tl = tl.communicate()[0].strip()
+
+ # See if we're working on the index or not.
+ if args:
+ diff_cmd = DIFF_CMD + [args[0] + "^!"]
+ else:
+ diff_cmd = DIFF_INDEX_CMD
+
+ # Build the command line to execute cpplint
+ cpplint_cmd = [os.path.join(tl, "tools", "cpplint.py"),
+ "--filter=" + ",".join(CPPLINT_FILTERS),
+ "-"]
+
+ # Get a list of all affected lines
+ file_affected_line_map = {}
+ p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True)
+ stdout = p.communicate()[0]
+ for hunk in diff.ParseDiffHunks(io.StringIO(stdout)):
+ # Drop the first two characters of the name (presumably the "b/" prefix).
+ filename = hunk.right.filename[2:]
+ if filename not in file_affected_line_map:
+ file_affected_line_map[filename] = set()
+ file_affected_line_map[filename].update(hunk.right.delta_line_nums)
+
+ # Run each affected file through cpplint
+ lint_failed = False
+ for filename, affected_lines in file_affected_line_map.items():
+ if filename.split(".")[-1] not in ("c", "h", "cc"):
+ continue
+ if filename.startswith("third_party"):
+ continue
+
+ if args:
+ # File contents come from git
+ # NOTE(review): communicate() is never called on `show`, so its return
+ # code is not checked.
+ show_cmd = SHOW_CMD + [args[0] + ":" + filename]
+ show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True)
+ lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+ stdin=show.stdout, stderr=subprocess.PIPE,
+ text=True)
+ lint_out = lint.communicate()[1]
+ else:
+ # File contents come from the working tree
+ # NOTE(review): the file opened below is never explicitly closed.
+ lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+ stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+ text=True)
+ stdin = open(os.path.join(tl, filename)).read()
+ lint_out = lint.communicate(stdin)[1]
+
+ # Only lines whose first ":"-separated field is "-" (the stdin pseudo
+ # filename given to cpplint) are considered; field 1 is the line number.
+ for line in lint_out.split("\n"):
+ fields = line.split(":")
+ if fields[0] != "-":
+ continue
+ warning_line_num = int(fields[1])
+ if warning_line_num in affected_lines:
+ print("%s:%d:%s"%(filename, warning_line_num,
+ ":".join(fields[2:])))
+ lint_failed = True
+
+ # Set exit code if any relevant lint errors seen
+ if lint_failed:
+ return 1
+
+ except Usage as err:
+ print(err, file=sys.stderr)
+ print("for help use --help", file=sys.stderr)
+ return 2
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/third_party/aom/tools/obu_parser.cc b/third_party/aom/tools/obu_parser.cc
new file mode 100644
index 0000000000..5716b46218
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.cc
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <string.h>
+
+#include <cstdio>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/obu_util.h"
+#include "tools/obu_parser.h"
+
+namespace aom_tools {
+
+// Basic OBU syntax
+// 8 bits: Header
+// 7
+// forbidden bit
+// 6,5,4,3
+// type bits
+// 2
+// extension flag bit
+// 1
+// has size field bit
+// 0
+// reserved bit
+const uint32_t kObuForbiddenBitMask = 0x1;
+const uint32_t kObuForbiddenBitShift = 7;
+const uint32_t kObuTypeBitsMask = 0xF;
+const uint32_t kObuTypeBitsShift = 3;
+const uint32_t kObuExtensionFlagBitMask = 0x1;
+const uint32_t kObuExtensionFlagBitShift = 2;
+const uint32_t kObuHasSizeFieldBitMask = 0x1;
+const uint32_t kObuHasSizeFieldBitShift = 1;
+
+// When extension flag bit is set:
+// 8 bits: extension header
+// 7,6,5
+// temporal ID
+// 4,3
+// spatial ID
+// 2,1,0
+// reserved bits
+const uint32_t kObuExtTemporalIdBitsMask = 0x7;
+const uint32_t kObuExtTemporalIdBitsShift = 5;
+const uint32_t kObuExtSpatialIdBitsMask = 0x3;
+const uint32_t kObuExtSpatialIdBitsShift = 3;
+
+// Returns true when obu_type equals one of the OBU types listed below.
+bool ValidObuType(int obu_type) {
+  static const int kKnownObuTypes[] = {
+    OBU_SEQUENCE_HEADER, OBU_TEMPORAL_DELIMITER,     OBU_FRAME_HEADER,
+    OBU_TILE_GROUP,      OBU_METADATA,               OBU_FRAME,
+    OBU_REDUNDANT_FRAME_HEADER, OBU_TILE_LIST,       OBU_PADDING
+  };
+  for (const int known_type : kKnownObuTypes) {
+    if (obu_type == known_type) return true;
+  }
+  return false;
+}
+
+// Decodes the first OBU header byte into obu_header (type, extension flag and
+// size-field flag). Returns false, after reporting on stderr, when the
+// forbidden bit is set or the type is not accepted by ValidObuType().
+bool ParseObuHeader(uint8_t obu_header_byte, ObuHeader *obu_header) {
+  if ((obu_header_byte >> kObuForbiddenBitShift) & kObuForbiddenBitMask) {
+    fprintf(stderr, "Invalid OBU, forbidden bit set.\n");
+    return false;
+  }
+
+  const uint32_t type_bits =
+      (obu_header_byte >> kObuTypeBitsShift) & kObuTypeBitsMask;
+  obu_header->type = static_cast<OBU_TYPE>(type_bits);
+  if (!ValidObuType(obu_header->type)) {
+    fprintf(stderr, "Invalid OBU type: %d.\n", obu_header->type);
+    return false;
+  }
+
+  const uint32_t extension_flag =
+      (obu_header_byte >> kObuExtensionFlagBitShift) & kObuExtensionFlagBitMask;
+  const uint32_t size_field_flag =
+      (obu_header_byte >> kObuHasSizeFieldBitShift) & kObuHasSizeFieldBitMask;
+  obu_header->has_extension = extension_flag;
+  obu_header->has_size_field = size_field_flag;
+  return true;
+}
+
+// Extracts the temporal and spatial layer ids from the OBU extension header
+// byte into obu_header. Always returns true.
+bool ParseObuExtensionHeader(uint8_t ext_header_byte, ObuHeader *obu_header) {
+  const uint32_t temporal_id =
+      (ext_header_byte >> kObuExtTemporalIdBitsShift) &
+      kObuExtTemporalIdBitsMask;
+  const uint32_t spatial_id =
+      (ext_header_byte >> kObuExtSpatialIdBitsShift) & kObuExtSpatialIdBitsMask;
+
+  obu_header->temporal_layer_id = temporal_id;
+  obu_header->spatial_layer_id = spatial_id;
+  return true;
+}
+
+// Prints the OBU type name and whether an extension is present; the layer
+// ids follow only when the header has an extension.
+void PrintObuHeader(const ObuHeader *header) {
+  const char *const type_name =
+      aom_obu_type_to_string(static_cast<OBU_TYPE>(header->type));
+  const char *const extension_text = header->has_extension ? "yes" : "no";
+  printf(
+      " OBU type: %s\n"
+      " extension: %s\n",
+      type_name, extension_text);
+
+  if (!header->has_extension) return;
+
+  printf(
+      " temporal_id: %d\n"
+      " spatial_id: %d\n",
+      header->temporal_layer_id, header->spatial_layer_id);
+}
+
+// Walks the OBUs stored in data[0, length), printing each header and the
+// number of bytes each OBU occupies, then the total size consumed.
+//
+// Returns false, after reporting on stderr, as soon as a header, extension
+// or size field cannot be parsed or an OBU claims more bytes than remain.
+// On success, the number of header bytes seen (one per OBU header plus one
+// per extension header) is stored in *obu_overhead_bytes when that pointer is
+// non-null.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) {
+  const int kObuHeaderSizeBytes = 1;
+  const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;
+  int consumed = 0;
+  int obu_overhead = 0;
+  ObuHeader obu_header;
+  while (consumed < length) {
+    const int remaining = length - consumed;
+    if (remaining < kMinimumBytesRequired) {
+      fprintf(stderr,
+              "OBU parse error. Did not consume all data, %d bytes remain.\n",
+              remaining);
+      return false;
+    }
+
+    int obu_header_size = 0;
+
+    memset(&obu_header, 0, sizeof(obu_header));
+    const uint8_t obu_header_byte = *(data + consumed);
+    if (!ParseObuHeader(obu_header_byte, &obu_header)) {
+      fprintf(stderr, "OBU parsing failed at offset %d.\n", consumed);
+      return false;
+    }
+
+    ++obu_overhead;
+    ++obu_header_size;
+
+    if (obu_header.has_extension) {
+      // kMinimumBytesRequired guarantees this second byte is within range.
+      const uint8_t obu_ext_header_byte =
+          *(data + consumed + kObuHeaderSizeBytes);
+      if (!ParseObuExtensionHeader(obu_ext_header_byte, &obu_header)) {
+        fprintf(stderr, "OBU extension parsing failed at offset %d.\n",
+                consumed + kObuHeaderSizeBytes);
+        return false;
+      }
+
+      ++obu_overhead;
+      ++obu_header_size;
+    }
+
+    PrintObuHeader(&obu_header);
+
+    // NOTE(review): obu_header.has_size_field is not consulted; a size field
+    // is always decoded here.
+    uint64_t obu_size = 0;
+    size_t length_field_size = 0;
+    if (aom_uleb_decode(data + consumed + obu_header_size,
+                        remaining - obu_header_size, &obu_size,
+                        &length_field_size) != 0) {
+      fprintf(stderr, "OBU size parsing failed at offset %d.\n",
+              consumed + obu_header_size);
+      return false;
+    }
+
+    // Validate in 64 bits before narrowing: casting an oversized obu_size to
+    // int first could wrap to a small or negative value, pass the check and
+    // move `consumed` outside the buffer.
+    const uint64_t bytes_remaining = static_cast<uint64_t>(remaining);
+    const uint64_t obu_prefix_size =
+        static_cast<uint64_t>(obu_header_size) + length_field_size;
+    if (obu_prefix_size > bytes_remaining ||
+        obu_size > bytes_remaining - obu_prefix_size) {
+      fprintf(stderr, "OBU parsing failed: not enough OBU data.\n");
+      return false;
+    }
+
+    // Bounded by `remaining`, so the narrowing cast is safe.
+    const int total_obu_size = static_cast<int>(obu_prefix_size + obu_size);
+    consumed += total_obu_size;
+    printf(" length: %d\n", total_obu_size);
+  }
+
+  if (obu_overhead_bytes != nullptr) *obu_overhead_bytes = obu_overhead;
+  printf(" TU size: %d\n", consumed);
+
+  return true;
+}
+
+} // namespace aom_tools
diff --git a/third_party/aom/tools/obu_parser.h b/third_party/aom/tools/obu_parser.h
new file mode 100644
index 0000000000..1d7d2d794b
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_OBU_PARSER_H_
+#define AOM_TOOLS_OBU_PARSER_H_
+
+#include <cstdint>
+
+namespace aom_tools {
+
+// Print information obtained from OBU(s) in data until data is exhausted or an
+// error occurs. Returns true when all data is consumed successfully, and
+// optionally reports OBU storage overhead via obu_overhead_bytes when the
+// pointer is non-null.
+// length is the number of bytes available at data. obu_overhead_bytes is
+// written only when the function returns true.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
+
+} // namespace aom_tools
+
+#endif // AOM_TOOLS_OBU_PARSER_H_
diff --git a/third_party/aom/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py b/third_party/aom/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py
new file mode 100644
index 0000000000..9afb78cbf5
--- /dev/null
+++ b/third_party/aom/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python3
+##
+## Copyright (c) 2022, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+""" Analyze the log generated by experimental flag CONFIG_RATECTRL_LOG."""
+
+import matplotlib.pyplot as plt
+import os
+
+
+def get_file_basename(filename):
+ return filename.split(".")[0]
+
+
+def parse_log(log_file):
+ data_list = []
+ with open(log_file) as fp:
+ for line in fp:
+ dic = {}
+ word_ls = line.split()
+ i = 0
+ while i < len(word_ls):
+ dic[word_ls[i]] = float(word_ls[i + 1])
+ i += 2
+ data_list.append(dic)
+ fp.close()
+ return data_list
+
+
+def extract_data(data_list, name):
+ arr = []
+ for data in data_list:
+ arr.append(data[name])
+ return arr
+
+
+def visualize_q_indices(exp_summary, exp_list, fig_path=None):
+ """Plots the "q" value of every frame for each experiment in exp_list.
+
+ Each experiment is a dict with a "log" path (read with parse_log) and a
+ "label" for the legend. exp_summary is the plot title. The figure is saved
+ to fig_path when given and shown interactively otherwise; the current
+ figure is cleared afterwards.
+ """
+ for exp in exp_list:
+ data = parse_log(exp["log"])
+ q_indices = extract_data(data, "q")
+ plt.title(exp_summary)
+ plt.xlabel("frame_coding_idx")
+ plt.ylabel("q_index")
+ plt.plot(q_indices, marker=".", label=exp["label"])
+ plt.legend()
+ if fig_path:
+ plt.savefig(fig_path)
+ else:
+ plt.show()
+ plt.clf()
+
+
+def get_rc_type_from_exp_type(exp_type):
+ if exp_type == "Q_3P":
+ return "q"
+ return "vbr"
+
+
+def test_video(exe_name, input, exp_type, level, log=None, limit=150):
+ """Builds an encoder command line and runs it through the shell.
+
+ exe_name is the encoder executable and input a path relative to ~/data/.
+ exp_type selects the rate-control mode, which decides whether level is
+ passed as --cq-level or --target-bitrate. limit is passed as --limit. When
+ log is given, standard output is redirected to that file.
+ """
+ basic_cmd = ("--test-decode=warn --threads=0 --profile=0 --min-q=0 --max-q=63"
+ " --auto-alt-ref=1 --kf-max-dist=160 --kf-min-dist=0 "
+ "--drop-frame=0 --static-thresh=0 --minsection-pct=0 "
+ "--maxsection-pct=2000 --arnr-maxframes=7 --arnr-strength=5 "
+ "--sharpness=0 --undershoot-pct=100 --overshoot-pct=100 "
+ "--frame-parallel=0 --tile-columns=0 --cpu-used=3 "
+ "--lag-in-frames=48 --psnr")
+ rc_type = get_rc_type_from_exp_type(exp_type)
+ rc_cmd = "--end-usage=" + rc_type
+ level_cmd = ""
+ if rc_type == "q":
+ level_cmd += "--cq-level=" + str(level)
+ elif rc_type == "vbr":
+ level_cmd += "--target-bitrate=" + str(level)
+ limit_cmd = "--limit=" + str(limit)
+ passes_cmd = "--passes=3 --second-pass-log=second_pass_log"
+ output_cmd = "-o test.webm"
+ input_cmd = "~/data/" + input
+ log_cmd = ""
+ if log != None:
+ log_cmd = ">" + log
+ cmd_ls = [
+ exe_name, basic_cmd, rc_cmd, level_cmd, limit_cmd, passes_cmd, output_cmd,
+ input_cmd, log_cmd
+ ]
+ # NOTE(review): the pieces are joined without quoting and run by the shell,
+ # which also performs the "~" expansion and the ">" redirection; paths with
+ # spaces or shell metacharacters are therefore not handled.
+ cmd = " ".join(cmd_ls)
+ os.system(cmd)
+
+
+def gen_ratectrl_log(test_case):
+ exe = test_case["exe"]
+ video = test_case["video"]
+ exp_type = test_case["exp_type"]
+ level = test_case["level"]
+ log = test_case["log"]
+ test_video(exe, video, exp_type, level, log=log, limit=150)
+ return log
+
+
+def gen_test_case(exp_type, dataset, videoname, level, log_dir=None):
+ test_case = {}
+ exe = "./aomenc_bl"
+ if exp_type == "BA_3P":
+ exe = "./aomenc_ba"
+ test_case["exe"] = exe
+
+ video = os.path.join(dataset, videoname)
+ test_case["video"] = video
+ test_case["exp_type"] = exp_type
+ test_case["level"] = level
+
+ video_basename = get_file_basename(videoname)
+ log = ".".join([dataset, video_basename, exp_type, str(level)])
+ if log_dir != None:
+ log = os.path.join(log_dir, log)
+ test_case["log"] = log
+ return test_case
+
+
+def run_ratectrl_exp(exp_config):
+ fp = open(exp_config)
+ log_dir = "./lowres_rc_log"
+ fig_dir = "./lowres_rc_fig"
+ dataset = "lowres"
+ for line in fp:
+ word_ls = line.split()
+ dataset = word_ls[0]
+ videoname = word_ls[1]
+ exp_type_ls = ["VBR_3P", "BA_3P"]
+ level_ls = [int(v) for v in word_ls[2:4]]
+ exp_ls = []
+ for i in range(len(exp_type_ls)):
+ exp_type = exp_type_ls[i]
+ test_case = gen_test_case(exp_type, dataset, videoname, level_ls[i],
+ log_dir)
+ log = gen_ratectrl_log(test_case)
+ exp = {}
+ exp["log"] = log
+ exp["label"] = exp_type
+ exp_ls.append(exp)
+ video_basename = get_file_basename(videoname)
+ fig_path = os.path.join(fig_dir, video_basename + ".png")
+ visualize_q_indices(video_basename, exp_ls, fig_path)
+ fp.close()
+
+
+if __name__ == "__main__":
+ run_ratectrl_exp("exp_rc_config")
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
new file mode 100644
index 0000000000..7c5400b91a
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+// Kinds of generated code, named after plain C and the SSE2 / SSE4.1
+// instruction sets. NOTE(review): presumably selects which generator runs;
+// the use site is outside this section.
+typedef enum CODE_TYPE {
+ CODE_TYPE_C,
+ CODE_TYPE_SSE2,
+ CODE_TYPE_SSE4_1
+} CODE_TYPE;
+
+// Maps |value|, taken as a cosine, to an index: the angle acos(|value|) is
+// expressed as a fraction of PI, scaled by mod and rounded.
+int get_cos_idx(double value, int mod) {
+  const double angle = acos(fabs(value));
+  return round(angle / PI * mod);
+}
+
+// Writes the cospi[] array reference for value into text (capacity size),
+// prefixed with '-' for negative values and ' ' otherwise, and returns text.
+// A diagnostic line is printed whenever the computed index is 0.
+char *cos_text_arr(double value, int mod, char *text, int size) {
+  const int num = get_cos_idx(value, mod);
+  const char *const format = (value < 0) ? "-cospi[%2d]" : " cospi[%2d]";
+  snprintf(text, size, format, num);
+
+  if (num == 0) {
+    printf("v: %f -> %d/%d v==-1 is %d\n", value, num, mod, value == -1);
+  }
+
+  return text;
+}
+
+// Writes the name of the SSE2 constant that pairs weights w0 and w1 into text
+// (capacity size) and returns text. Each weight contributes a sign letter
+// ('m' for negative, 'p' otherwise) and its two-digit cosine index.
+char *cos_text_sse2(double w0, double w1, int mod, char *text, int size) {
+  const char *const sign0 = (w0 < 0) ? "m" : "p";
+  const char *const sign1 = (w1 < 0) ? "m" : "p";
+  const int first_idx = get_cos_idx(w0, mod);
+  const int second_idx = get_cos_idx(w1, mod);
+  snprintf(text, size, "cospi_%s%02d_%s%02d", sign0, first_idx, sign1,
+           second_idx);
+  return text;
+}
+
+// Writes the name of the SSE4.1 constant for weight w into text (capacity
+// size) and returns text: a sign letter ('m' for negative, 'p' otherwise)
+// followed by the two-digit cosine index.
+char *cos_text_sse4_1(double w, int mod, char *text, int size) {
+  const char *const sign = (w < 0) ? "m" : "p";
+  const int cos_idx = get_cos_idx(w, mod);
+  snprintf(text, size, "cospi_%s%02d", sign, cos_idx);
+  return text;
+}
+
+// Prints one C statement that computes entry node->nodeIdx of buf1 from
+// entries of buf0.
+// When both input weights have magnitude 0 or 1 the statement is a plain sum
+// or difference of the inputs whose weight is +/-1, wrapped in
+// apply_value(..., stage_range[stage]) when both weights are +/-1.
+// Otherwise the statement is a half_btf() call with cospi[] weights.
+void node_to_code_c(Node *node, const char *buf0, const char *buf1) {
+ // cnt counts the inputs whose weight magnitude is 0 or 1.
+ int cnt = 0;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+ }
+ if (cnt == 2) {
+ // cnt2 counts the inputs whose weight magnitude is exactly 1.
+ int cnt2 = 0;
+ printf(" %s[%d] =", buf1, node->nodeIdx);
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1) {
+ cnt2++;
+ }
+ }
+ if (cnt2 == 2) {
+ printf(" apply_value(");
+ }
+ // cnt1 counts the terms printed so far, which decides whether the next
+ // term needs a binary operator in front of it.
+ int cnt1 = 0;
+ for (int i = 0; i < 2; i++) {
+ if (node->inWeight[i] == 1) {
+ if (cnt1 > 0)
+ printf(" + %s[%d]", buf0, node->inNodeIdx[i]);
+ else
+ printf(" %s[%d]", buf0, node->inNodeIdx[i]);
+ cnt1++;
+ } else if (node->inWeight[i] == -1) {
+ if (cnt1 > 0)
+ printf(" - %s[%d]", buf0, node->inNodeIdx[i]);
+ else
+ printf("-%s[%d]", buf0, node->inNodeIdx[i]);
+ cnt1++;
+ }
+ }
+ if (cnt2 == 2) {
+ printf(", stage_range[stage])");
+ }
+ printf(";\n");
+ } else {
+ // Scratch buffers for the two cospi[] weight strings.
+ char w0[100];
+ char w1[100];
+ printf(
+ " %s[%d] = half_btf(%s, %s[%d], %s, %s[%d], "
+ "cos_bit);\n",
+ buf1, node->nodeIdx, cos_text_arr(node->inWeight[0], COS_MOD, w0, 100),
+ buf0, node->inNodeIdx[0],
+ cos_text_arr(node->inWeight[1], COS_MOD, w1, 100), buf0,
+ node->inNodeIdx[1]);
+ }
+}
+
+// Prints the C source of a transform function av1_<name> for the graph in
+// node, which holds stage_num stages of node_num nodes each. The function
+// name comes from get_fun_name(type, node_num).
+//
+// The generated code alternates between the `output` and `step` buffers from
+// stage to stage, chosen by parity so that the final stage writes to
+// `output`.
+void gen_code_c(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+  // Stack storage: the name is needed only while printing the signature, and
+  // a heap allocation here was never released.
+  const int kFunNameSize = 100;
+  char fun_name[kFunNameSize];
+  get_fun_name(fun_name, kFunNameSize, type, node_num);
+
+  printf("\n");
+  printf(
+      "void av1_%s(const int32_t *input, int32_t *output, int8_t cos_bit, "
+      "const int8_t* stage_range) "
+      "{\n",
+      fun_name);
+  printf(" assert(output != input);\n");
+  printf(" const int32_t size = %d;\n", node_num);
+  printf(" const int32_t *cospi = cospi_arr(cos_bit);\n");
+  printf("\n");
+
+  printf(" int32_t stage = 0;\n");
+  printf(" int32_t *bf0, *bf1;\n");
+  printf(" int32_t step[%d];\n", node_num);
+
+  const char *buf0 = "bf0";
+  const char *buf1 = "bf1";
+  const char *input = "input";
+
+  // Stage 0 only range-checks the input.
+  int si = 0;
+  printf("\n");
+  printf(" // stage %d;\n", si);
+  printf(" apply_range(stage, input, %s, size, stage_range[stage]);\n", input);
+
+  // Stage 1 reads directly from `input`.
+  si = 1;
+  printf("\n");
+  printf(" // stage %d;\n", si);
+  printf(" stage++;\n");
+  if (si % 2 == (stage_num - 1) % 2) {
+    printf(" %s = output;\n", buf1);
+  } else {
+    printf(" %s = step;\n", buf1);
+  }
+
+  for (int ni = 0; ni < node_num; ni++) {
+    int idx = get_idx(si, ni, node_num);
+    node_to_code_c(node + idx, input, buf1);
+  }
+
+  printf(" range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+
+  // Remaining stages read from the buffer the previous stage wrote.
+  for (int si = 2; si < stage_num; si++) {
+    printf("\n");
+    printf(" // stage %d\n", si);
+    printf(" stage++;\n");
+    if (si % 2 == (stage_num - 1) % 2) {
+      printf(" %s = step;\n", buf0);
+      printf(" %s = output;\n", buf1);
+    } else {
+      printf(" %s = output;\n", buf0);
+      printf(" %s = step;\n", buf1);
+    }
+
+    // computation code
+    for (int ni = 0; ni < node_num; ni++) {
+      int idx = get_idx(si, ni, node_num);
+      node_to_code_c(node + idx, buf0, buf1);
+    }
+
+    // The last stage is followed by apply_range() instead.
+    if (si != stage_num - 1) {
+      printf(
+          " range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+    }
+  }
+  printf(" apply_range(stage, input, output, size, stage_range[stage]);\n");
+  printf("}\n");
+}
+
+// Prints one SSE2 statement assigning entry node->nodeIdx of buf1 for a node
+// whose input weights are 0 or +/-1: a saturating add or subtract, a plain
+// copy, or a negation computed as __zero minus the input.
+// Weight combinations not listed below (for example -1/-1) print only the
+// left-hand side followed by ';'.
+void single_node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+ printf(" %s[%2d] =", buf1, node->nodeIdx);
+ if (node->inWeight[0] == 1 && node->inWeight[1] == 1) {
+ printf(" _mm_adds_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) {
+ printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) {
+ // Operands are swapped so the positively weighted input comes first.
+ printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0,
+ node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) {
+ printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) {
+ printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[1]);
+ }
+ printf(";\n");
+}
+
+// Prints one btf_16_sse2(...) call that produces both node and partnerNode in
+// buf1 from node's two inputs in buf0. The two branches differ only in the
+// order in which the partner's weights are passed: they are swapped when the
+// partner's first input is not node's first input.
+void pair_node_to_code_sse2(Node *node, Node *partnerNode, const char *buf0,
+ const char *buf1) {
+ // Scratch buffers for the two constant names.
+ char temp0[100];
+ char temp1[100];
+ // Emitted call: btf_16_sse2(w0, w1, in0, in1, out0, out1)
+ if (node->inNodeIdx[0] != partnerNode->inNodeIdx[0])
+ printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+ cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+ 100),
+ cos_text_sse2(partnerNode->inWeight[1], partnerNode->inWeight[0],
+ COS_MOD, temp1, 100),
+ buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+ node->nodeIdx, buf1, partnerNode->nodeIdx);
+ else
+ printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+ cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+ 100),
+ cos_text_sse2(partnerNode->inWeight[0], partnerNode->inWeight[1],
+ COS_MOD, temp1, 100),
+ buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+ node->nodeIdx, buf1, partnerNode->nodeIdx);
+}
+
+ // Returns the node of the same stage whose index equals the index of
+ // `node`'s second input, found by pointer offset from `node`.
+ Node *get_partner_node(Node *node) {
+   const int partner_idx = node->inNode[1]->nodeIdx;
+   return node + (partner_idx - node->nodeIdx);
+ }
+
+ // Emits SSE2 code for `node` unless it was already visited. Nodes whose
+ // weights are all in {-1, 0, 1} are emitted as single statements; anything
+ // else is emitted together with its partner as one butterfly call.
+ void node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+   if (node->visited != 0) return;
+   node->visited = 1;
+
+   int trivial = 0;  // weights with magnitude 0 or 1
+   int unit = 0;     // weights with magnitude exactly 1
+   for (int k = 0; k < 2; k++) {
+     const double mag = fabs(node->inWeight[k]);
+     if (mag == 1) {
+       trivial++;
+       unit++;
+     } else if (mag == 0) {
+       trivial++;
+     }
+   }
+
+   if (trivial == 2 && unit != 2) {
+     // plain copy or negation: no partner involved
+     single_node_to_code_sse2(node, buf0, buf1);
+     return;
+   }
+
+   Node *partner = get_partner_node(node);
+   partner->visited = 1;
+   if (trivial == 2) {
+     // add/sub pair: both halves are emitted back to back
+     single_node_to_code_sse2(node, buf0, buf1);
+     single_node_to_code_sse2(partner, buf0, buf1);
+   } else {
+     pair_node_to_code_sse2(node, partner, buf0, buf1);
+   }
+ }
+
+ // Prints the `__m128i ... = pair_set_epi16(...)` constant declarations needed
+ // by the SSE2 kernel. Every non-trivial node contributes its own weight pair
+ // and its partner's weight pair; each distinct (index, index, sign, sign)
+ // combination is printed once. Marks nodes as visited while walking.
+ void gen_cospi_list_sse2(Node *node, int stage_num, int node_num) {
+   int emitted[65][65][2][2];
+   memset(emitted, 0, sizeof(emitted));
+   char name_buf[100];
+   char lhs_buf[100];
+   char rhs_buf[100];
+   const int buf_size = 100;
+   printf("\n");
+   for (int si = 1; si < stage_num; si++) {
+     for (int ni = 0; ni < node_num; ni++) {
+       Node *cur = node + get_idx(si, ni, node_num);
+       if (cur->visited != 0) continue;
+       cur->visited = 1;
+
+       int trivial = 0;
+       for (int k = 0; k < 2; k++) {
+         const double mag = fabs(cur->inWeight[k]);
+         if (mag == 1 || mag == 0) trivial++;
+       }
+       if (trivial == 2) continue;
+
+       Node *partner = get_partner_node(cur);
+       partner->visited = 1;
+
+       // pair 0: this node's weights; pair 1: the partner's weights, swapped
+       // when the partner lists its inputs in the opposite order.
+       double pairs[2][2];
+       pairs[0][0] = cur->inWeight[0];
+       pairs[0][1] = cur->inWeight[1];
+       if (partner->inNode[0]->nodeIdx != cur->inNode[0]->nodeIdx) {
+         pairs[1][0] = partner->inWeight[1];
+         pairs[1][1] = partner->inWeight[0];
+       } else {
+         pairs[1][0] = partner->inWeight[0];
+         pairs[1][1] = partner->inWeight[1];
+       }
+
+       for (int p = 0; p < 2; p++) {
+         const double wa = pairs[p][0];
+         const double wb = pairs[p][1];
+         const int ia = get_cos_idx(wa, COS_MOD);
+         const int ib = get_cos_idx(wb, COS_MOD);
+         const int na = wa < 0 ? 1 : 0;
+         const int nb = wb < 0 ? 1 : 0;
+         if (emitted[ia][ib][na][nb]) continue;
+         emitted[ia][ib][na][nb] = 1;
+         printf(" __m128i %s = pair_set_epi16(%s, %s);\n",
+                cos_text_sse2(wa, wb, COS_MOD, name_buf, buf_size),
+                cos_text_arr(wa, COS_MOD, lhs_buf, buf_size),
+                cos_text_arr(wb, COS_MOD, rhs_buf, buf_size));
+       }
+     }
+   }
+ }
+
+ // Prints a complete SSE2 kernel `<name>_sse2(input, output, cos_bit)` for the
+ // transform graph in `node` (stage_num stages of node_num nodes each):
+ // prologue, constant declarations, then one block of statements per stage.
+ // Intermediate stages write to freshly declared arrays x1, x2, ...; the last
+ // stage writes to `output`.
+ void gen_code_sse2(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+   // Stack buffer instead of `new char[100]`: the heap buffer was never
+   // released. Zero-initialised so an unrecognised `type` prints an empty name
+   // rather than uninitialised memory.
+   char fun_name[100] = "";
+   get_fun_name(fun_name, 100, type, node_num);
+
+   printf("\n");
+   printf(
+       "void %s_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) "
+       "{\n",
+       fun_name);
+
+   printf(" const int32_t* cospi = cospi_arr(cos_bit);\n");
+   printf(" const __m128i __zero = _mm_setzero_si128();\n");
+   printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+   // The constant pass and the code pass both use the `visited` flags, so the
+   // flags are cleared before each.
+   graph_reset_visited(node, stage_num, node_num);
+   gen_cospi_list_sse2(node, stage_num, node_num);
+   graph_reset_visited(node, stage_num, node_num);
+   for (int si = 1; si < stage_num; si++) {
+     char in[100];
+     char out[100];
+     printf("\n");
+     printf(" // stage %d\n", si);
+     if (si == 1)
+       snprintf(in, 100, "%s", "input");
+     else
+       snprintf(in, 100, "x%d", si - 1);
+     if (si == stage_num - 1) {
+       snprintf(out, 100, "%s", "output");
+     } else {
+       snprintf(out, 100, "x%d", si);
+       printf(" __m128i %s[%d];\n", out, node_num);
+     }
+     // computation code
+     for (int ni = 0; ni < node_num; ni++) {
+       int idx = get_idx(si, ni, node_num);
+       node_to_code_sse2(node + idx, in, out);
+     }
+   }
+
+   printf("}\n");
+ }
+ // Prints the `__m128i ... = _mm_set1_epi32(...)` constant declarations needed
+ // by the SSE4.1 kernel: one per distinct (cosine index, sign) among the
+ // weights of non-trivial nodes. Marks nodes and their partners as visited.
+ void gen_cospi_list_sse4_1(Node *node, int stage_num, int node_num) {
+   int emitted[65][2];
+   memset(emitted, 0, sizeof(emitted));
+   char name_buf[100];
+   char value_buf[100];
+   const int buf_size = 100;
+   printf("\n");
+   for (int si = 1; si < stage_num; si++) {
+     for (int ni = 0; ni < node_num; ni++) {
+       Node *cur = node + get_idx(si, ni, node_num);
+       if (cur->visited != 0) continue;
+       cur->visited = 1;
+
+       int trivial = 0;
+       for (int k = 0; k < 2; k++) {
+         const double mag = fabs(cur->inWeight[k]);
+         if (mag == 1 || mag == 0) trivial++;
+       }
+       if (trivial == 2) continue;
+
+       for (int k = 0; k < 2; k++) {
+         const double w = cur->inWeight[k];
+         const double mag = fabs(w);
+         if (mag == 1 || mag == 0) continue;
+         const int cos_idx = get_cos_idx(w, COS_MOD);
+         const int neg = w < 0 ? 1 : 0;
+         if (emitted[cos_idx][neg]) continue;
+         emitted[cos_idx][neg] = 1;
+         printf(" __m128i %s = _mm_set1_epi32(%s);\n",
+                cos_text_sse4_1(w, COS_MOD, name_buf, buf_size),
+                cos_text_arr(w, COS_MOD, value_buf, buf_size));
+       }
+       get_partner_node(cur)->visited = 1;
+     }
+   }
+ }
+
+ // Prints one generated SSE4.1 (32-bit lane) statement assigning
+ // buf1[node->nodeIdx] from the node's inputs in buf0. Only weight pairs drawn
+ // from {-1, 0, 1} are handled; any other pair leaves the right-hand side
+ // empty.
+ void single_node_to_code_sse4_1(Node *node, const char *buf0,
+                                 const char *buf1) {
+   const double w0 = node->inWeight[0];
+   const double w1 = node->inWeight[1];
+   const int src0 = node->inNodeIdx[0];
+   const int src1 = node->inNodeIdx[1];
+
+   printf(" %s[%2d] =", buf1, node->nodeIdx);
+   if (w0 == 1) {
+     if (w1 == 1) {
+       printf(" _mm_add_epi32(%s[%d], %s[%d])", buf0, src0, buf0, src1);
+     } else if (w1 == -1) {
+       printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, src0, buf0, src1);
+     } else if (w1 == 0) {
+       printf(" %s[%d]", buf0, src0);
+     }
+   } else if (w0 == -1) {
+     if (w1 == 1) {
+       // in1 - in0
+       printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, src1, buf0, src0);
+     } else if (w1 == 0) {
+       // negation is emitted as 0 - in0
+       printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, src0);
+     }
+   } else if (w0 == 0) {
+     if (w1 == 1) {
+       printf(" %s[%d]", buf0, src1);
+     } else if (w1 == -1) {
+       printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, src1);
+     }
+   }
+   printf(";\n");
+ }
+
+ // Prints one 32-bit butterfly call computing `node` and `partnerNode`
+ // together. The helper variant is picked from the sign of the product of the
+ // two nodes' first weights.
+ void pair_node_to_code_sse4_1(Node *node, Node *partnerNode, const char *buf0,
+                               const char *buf1) {
+   char temp0[100];
+   char temp1[100];
+   const bool opposite_sign = node->inWeight[0] * partnerNode->inWeight[0] < 0;
+   const char *w0_name = cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100);
+   const char *w1_name = cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100);
+   const int in0 = node->inNodeIdx[0];
+   const int in1 = node->inNodeIdx[1];
+   const int out0 = node->nodeIdx;
+   const int out1 = partnerNode->nodeIdx;
+
+   if (opposite_sign) {
+     // type0 matrix:
+     //   cos  sin
+     //   sin -cos
+     // out0 = w0*in0 + w1*in1
+     // out1 = -w0*in1 + w1*in0
+     printf(
+         " btf_32_type0_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+         "__rounding, cos_bit);\n",
+         w0_name, w1_name, buf0, in0, buf0, in1, buf1, out0, buf1, out1);
+   } else {
+     // type1 matrix:
+     //    cos sin
+     //   -sin cos
+     // out0 = w0*in0 + w1*in1
+     // out1 = w0*in1 - w1*in0
+     printf(
+         " btf_32_type1_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+         "__rounding, cos_bit);\n",
+         w0_name, w1_name, buf0, in0, buf0, in1, buf1, out0, buf1, out1);
+   }
+ }
+
+ // Emits SSE4.1 code for `node` unless it was already visited.
+ //  - both weights in {-1, 0, 1} and both non-zero: the node and its partner
+ //    are emitted as two single statements;
+ //  - both weights in {-1, 0, 1} with a zero: the node alone is emitted;
+ //  - otherwise: the node and its partner are emitted as one butterfly call.
+ void node_to_code_sse4_1(Node *node, const char *buf0, const char *buf1) {
+   int cnt = 0;   // weights with magnitude 0 or 1
+   int cnt1 = 0;  // weights with magnitude exactly 1
+   if (node->visited == 0) {
+     node->visited = 1;
+     for (int i = 0; i < 2; i++) {
+       if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+       if (fabs(node->inWeight[i]) == 1) cnt1++;
+     }
+     if (cnt == 2) {
+       if (cnt1 == 2) {
+         // has a partner
+         Node *partnerNode = get_partner_node(node);
+         partnerNode->visited = 1;
+         single_node_to_code_sse4_1(node, buf0, buf1);
+         single_node_to_code_sse4_1(partnerNode, buf0, buf1);
+       } else {
+         // Fixed: this used to call the SSE2 emitter, which prints 16-bit
+         // saturating intrinsics (_mm_subs_epi16) into a kernel that operates
+         // on 32-bit lanes.
+         single_node_to_code_sse4_1(node, buf0, buf1);
+       }
+     } else {
+       Node *partnerNode = get_partner_node(node);
+       partnerNode->visited = 1;
+       pair_node_to_code_sse4_1(node, partnerNode, buf0, buf1);
+     }
+   }
+ }
+
+ // Prints a complete SSE4.1 kernel `<name>_sse4_1(input, output, cos_bit)` for
+ // the transform graph in `node` (stage_num stages of node_num nodes each):
+ // prologue, constant declarations, then one block of statements per stage.
+ // Intermediate stages write to freshly declared arrays x1, x2, ...; the last
+ // stage writes to `output`.
+ void gen_code_sse4_1(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+   // Stack buffer instead of `new char[100]`: the heap buffer was never
+   // released. Zero-initialised so an unrecognised `type` prints an empty name
+   // rather than uninitialised memory.
+   char fun_name[100] = "";
+   get_fun_name(fun_name, 100, type, node_num);
+
+   printf("\n");
+   printf(
+       "void %s_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit) "
+       "{\n",
+       fun_name);
+
+   printf(" const int32_t* cospi = cospi_arr(cos_bit);\n");
+   printf(" const __m128i __zero = _mm_setzero_si128();\n");
+   printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+   // The constant pass and the code pass both use the `visited` flags, so the
+   // flags are cleared before each.
+   graph_reset_visited(node, stage_num, node_num);
+   gen_cospi_list_sse4_1(node, stage_num, node_num);
+   graph_reset_visited(node, stage_num, node_num);
+   for (int si = 1; si < stage_num; si++) {
+     char in[100];
+     char out[100];
+     printf("\n");
+     printf(" // stage %d\n", si);
+     if (si == 1)
+       snprintf(in, 100, "%s", "input");
+     else
+       snprintf(in, 100, "x%d", si - 1);
+     if (si == stage_num - 1) {
+       snprintf(out, 100, "%s", "output");
+     } else {
+       snprintf(out, 100, "x%d", si);
+       printf(" __m128i %s[%d];\n", out, node_num);
+     }
+     // computation code
+     for (int ni = 0; ni < node_num; ni++) {
+       int idx = get_idx(si, ni, node_num);
+       node_to_code_sse4_1(node + idx, in, out);
+     }
+   }
+
+   printf("}\n");
+ }
+
+ // Builds the 1-D transform graph for (txfm_type, node_num) and prints the
+ // kernel for the requested back end. An unrecognised code_type prints
+ // nothing.
+ void gen_hybrid_code(CODE_TYPE code_type, TYPE_TXFM txfm_type, int node_num) {
+   const int stage_num = get_hybrid_stage_num(txfm_type, node_num);
+   Node *graph = new Node[node_num * stage_num];
+
+   init_graph(graph, stage_num, node_num);
+   gen_hybrid_graph_1d(graph, stage_num, node_num, 0, 0, node_num, txfm_type);
+
+   if (code_type == CODE_TYPE_C) {
+     gen_code_c(graph, stage_num, node_num, txfm_type);
+   } else if (code_type == CODE_TYPE_SSE2) {
+     gen_code_sse2(graph, stage_num, node_num, txfm_type);
+   } else if (code_type == CODE_TYPE_SSE4_1) {
+     gen_code_sse4_1(graph, stage_num, node_num, txfm_type);
+   }
+
+   delete[] graph;
+ }
+
+ // Prints SSE4.1 kernels for every transform type in [TYPE_DCT, TYPE_LAST) at
+ // sizes 4, 8, 16, 32 and 64. Command-line arguments are ignored.
+ int main(int argc, char **argv) {
+   (void)argc;
+   (void)argv;
+   const CODE_TYPE backend = CODE_TYPE_SSE4_1;
+   for (int txfm_type = TYPE_DCT; txfm_type < TYPE_LAST; txfm_type++) {
+     // sizes are the powers of two 2^2 .. 2^6
+     for (int log2_size = 2; log2_size <= 6; log2_size++) {
+       gen_hybrid_code(backend, (TYPE_TXFM)txfm_type, 1 << log2_size);
+     }
+   }
+   return 0;
+ }
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.cc b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
new file mode 100644
index 0000000000..a249061008
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct Node Node;
+
+ // Writes the generated function's base name ("fdct8_new", "iadst16_new", ...)
+ // into str_fun_name. The buffer is left untouched for any other type.
+ void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+                   const int txfm_size) {
+   switch (type) {
+     case TYPE_DCT:
+       snprintf(str_fun_name, str_buf_size, "fdct%d_new", txfm_size);
+       break;
+     case TYPE_ADST:
+       snprintf(str_fun_name, str_buf_size, "fadst%d_new", txfm_size);
+       break;
+     case TYPE_IDCT:
+       snprintf(str_fun_name, str_buf_size, "idct%d_new", txfm_size);
+       break;
+     case TYPE_IADST:
+       snprintf(str_fun_name, str_buf_size, "iadst%d_new", txfm_size);
+       break;
+     default: break;
+   }
+ }
+
+ // Writes "TXFM_TYPE_DCT<size>" or "TXFM_TYPE_ADST<size>" into str_fun_name.
+ // Forward and inverse variants map to the same name; the buffer is left
+ // untouched for any other type.
+ void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+                         const TYPE_TXFM type, const int txfm_size) {
+   switch (type) {
+     case TYPE_DCT:
+       snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+       break;
+     case TYPE_ADST:
+       snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+       break;
+     case TYPE_IDCT:
+       snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+       break;
+     case TYPE_IADST:
+       snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+       break;
+     default: break;
+   }
+ }
+
+ // Writes a suffix of the form "_<type0>_<type1>_<size1>x<size0>" into buf for
+ // the four forward DCT/ADST combinations. Other combinations leave buf
+ // untouched.
+ void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+                              const TYPE_TXFM type1, const int txfm_size0,
+                              const int txfm_size1) {
+   if (type0 == TYPE_DCT) {
+     if (type1 == TYPE_DCT) {
+       snprintf(buf, buf_size, "_dct_dct_%dx%d", txfm_size1, txfm_size0);
+     } else if (type1 == TYPE_ADST) {
+       snprintf(buf, buf_size, "_dct_adst_%dx%d", txfm_size1, txfm_size0);
+     }
+   } else if (type0 == TYPE_ADST) {
+     if (type1 == TYPE_ADST) {
+       snprintf(buf, buf_size, "_adst_adst_%dx%d", txfm_size1, txfm_size0);
+     } else if (type1 == TYPE_DCT) {
+       snprintf(buf, buf_size, "_adst_dct_%dx%d", txfm_size1, txfm_size0);
+     }
+   }
+ }
+
+ // Maps a transform type to its opposite direction (DCT <-> IDCT,
+ // ADST <-> IADST). Any other value maps to TYPE_LAST.
+ TYPE_TXFM get_inv_type(TYPE_TXFM type) {
+   switch (type) {
+     case TYPE_DCT: return TYPE_IDCT;
+     case TYPE_ADST: return TYPE_IADST;
+     case TYPE_IDCT: return TYPE_DCT;
+     case TYPE_IADST: return TYPE_ADST;
+     default: return TYPE_LAST;
+   }
+ }
+
+ // Direct O(size^2) evaluation of a 1-D DCT:
+ //   out[k] = sum_n in[n] * cos(PI * (2n + 1) * k / (2 * size))
+ // with the k == 0 term additionally scaled by 1/sqrt(2).
+ void reference_dct_1d(double *in, double *out, int size) {
+   const double kInvSqrt2 = 0.707106781186547524400844362104;
+   for (int freq = 0; freq < size; freq++) {
+     out[freq] = 0;
+     for (int pos = 0; pos < size; pos++) {
+       out[freq] += in[pos] * cos(PI * (2 * pos + 1) * freq / (2 * size));
+     }
+     if (freq == 0) out[freq] *= kInvSqrt2;
+   }
+ }
+
+ // 2-D reference DCT on a size x size block stored row-major: transform each
+ // row of `in` into `out`, transpose into a scratch buffer, then transform
+ // each scratch row back into `out`.
+ void reference_dct_2d(double *in, double *out, int size) {
+   double *scratch = new double[size * size];
+
+   // first pass: rows of in -> rows of out
+   for (int row = 0; row < size; row++) {
+     reference_dct_1d(in + row * size, out + row * size, size);
+   }
+
+   // transpose out -> scratch
+   for (int row = 0; row < size; row++) {
+     for (int col = 0; col < size; col++) {
+       scratch[row * size + col] = out[col * size + row];
+     }
+   }
+
+   // second pass: rows of scratch -> rows of out
+   for (int row = 0; row < size; row++) {
+     reference_dct_1d(scratch + row * size, out + row * size, size);
+   }
+   delete[] scratch;
+ }
+
+ // Direct O(size^2) evaluation of a 1-D ADST:
+ //   out[k] = sum_n in[n] * sin(PI * (2n + 1) * (2k + 1) / (4 * size))
+ void reference_adst_1d(double *in, double *out, int size) {
+   for (int freq = 0; freq < size; freq++) {
+     out[freq] = 0;
+     for (int pos = 0; pos < size; pos++) {
+       out[freq] +=
+           in[pos] * sin(PI * (2 * pos + 1) * (2 * freq + 1) / (4 * size));
+     }
+   }
+ }
+
+ // 2-D reference transform on a size x size row-major block. The first pass
+ // applies the type0 transform to each row, the result is transposed, and the
+ // second pass applies the type1 transform. TYPE_DCT selects the DCT; any
+ // other value selects the ADST.
+ void reference_hybrid_2d(double *in, double *out, int size, int type0,
+                          int type1) {
+   typedef void (*Txfm1d)(double *, double *, int);
+   const Txfm1d first_pass =
+       (type0 == TYPE_DCT) ? reference_dct_1d : reference_adst_1d;
+   const Txfm1d second_pass =
+       (type1 == TYPE_DCT) ? reference_dct_1d : reference_adst_1d;
+
+   double *scratch = new double[size * size];
+
+   for (int row = 0; row < size; row++) {
+     first_pass(in + row * size, out + row * size, size);
+   }
+
+   // transpose out -> scratch
+   for (int row = 0; row < size; row++) {
+     for (int col = 0; col < size; col++) {
+       scratch[row * size + col] = out[col * size + row];
+     }
+   }
+
+   for (int row = 0; row < size; row++) {
+     second_pass(scratch + row * size, out + row * size, size);
+   }
+   delete[] scratch;
+ }
+
+ // Rectangular 2-D reference transform. The input holds size1 rows of size0
+ // values. The first pass applies the type0 transform of length size0 to each
+ // row, the result is transposed into size0 rows of size1 values, and the
+ // second pass applies the type1 transform of length size1. TYPE_DCT selects
+ // the DCT; any other value selects the ADST.
+ void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+                              int type0, int type1) {
+   typedef void (*Txfm1d)(double *, double *, int);
+   const Txfm1d first_pass =
+       (type0 == TYPE_DCT) ? reference_dct_1d : reference_adst_1d;
+   const Txfm1d second_pass =
+       (type1 == TYPE_DCT) ? reference_dct_1d : reference_adst_1d;
+
+   double *scratch = new double[size0 * size1];
+
+   for (int row = 0; row < size1; row++) {
+     first_pass(in + row * size0, out + row * size0, size0);
+   }
+
+   // transpose out (size1 x size0) -> scratch (size0 x size1)
+   for (int row = 0; row < size1; row++) {
+     for (int col = 0; col < size0; col++) {
+       scratch[col * size1 + row] = out[row * size0 + col];
+     }
+   }
+
+   for (int row = 0; row < size0; row++) {
+     second_pass(scratch + row * size1, out + row * size1, size1);
+   }
+   delete[] scratch;
+ }
+
+ // Returns the position of the highest set bit of x (0 for x == 1, 3 for
+ // x == 8). For x == 0 the internal -1 is returned converted to unsigned.
+ unsigned int get_max_bit(unsigned int x) {
+   int highest = -1;
+   for (; x != 0; x >>= 1) {
+     highest++;
+   }
+   return highest;
+ }
+
+ // Reverses the low (max_bit + 1) bits of x: all 32 bits are mirrored and the
+ // result is shifted right by (31 - max_bit).
+ unsigned int bitwise_reverse(unsigned int x, int max_bit) {
+   unsigned int mirrored = 0;
+   for (int b = 0; b < 32; b++) {
+     mirrored = (mirrored << 1) | (x & 1u);
+     x >>= 1;
+   }
+   return mirrored >> (31 - max_bit);
+ }
+
+ // Flattens (row ri, column ci) into a row-major offset for rows of width
+ // cSize.
+ int get_idx(int ri, int ci, int cSize) {
+   const int row_start = ri * cSize;
+   return row_start + ci;
+ }
+
+ // Appends one weighted input (node `in` of the previous stage, weight w) to
+ // node (stage_idx, node_idx). A node holds at most two inputs; a third
+ // attempt prints an error and changes nothing. stage_num is not used.
+ void add_node(Node *node, int stage_num, int node_num, int stage_idx,
+               int node_idx, int in, double w) {
+   Node *dst = &node[get_idx(stage_idx, node_idx, node_num)];
+   Node *src = &node[get_idx(stage_idx - 1, in, node_num)];
+   const int slot = dst->inNodeNum;
+   if (slot >= 2) {
+     printf("Error: inNode is full");
+     return;
+   }
+   dst->inNode[slot] = src;
+   dst->inNodeIdx[slot] = in;
+   dst->inWeight[slot] = w;
+   dst->inNodeNum = slot + 1;
+ }
+
+ // Overwrites both inputs of node (stage_idx, node_idx): slot 0 becomes
+ // previous-stage node in0 with weight w0, slot 1 becomes previous-stage node
+ // in1 with weight w1. Both slots are always filled, including zero weights,
+ // so inNodeNum ends up as 2. stage_num is not used.
+ void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int in0, double w0, int in1, double w1) {
+   Node *dst = &node[get_idx(stage_idx, node_idx, node_num)];
+   Node *src0 = &node[get_idx(stage_idx - 1, in0, node_num)];
+   Node *src1 = &node[get_idx(stage_idx - 1, in1, node_num)];
+
+   dst->inNode[0] = src0;
+   dst->inNodeIdx[0] = in0;
+   dst->inWeight[0] = w0;
+
+   dst->inNode[1] = src1;
+   dst->inNodeIdx[1] = in1;
+   dst->inWeight[1] = w1;
+
+   dst->inNodeNum = 2;
+ }
+
+ // Recomputes the value of every node in stage stage_idx as the weighted sum
+ // of its inputs' current values. stage_num is not used.
+ void propagate(Node *node, int stage_num, int node_num, int stage_idx) {
+   for (int ni = 0; ni < node_num; ni++) {
+     Node *dst = &node[get_idx(stage_idx, ni, node_num)];
+     dst->value = 0;
+     for (int k = 0; k < dst->inNodeNum; k++) {
+       dst->value += dst->inNode[k]->value * dst->inWeight[k];
+     }
+   }
+ }
+
+ // Shifts `value` right by `bit` with rounding to nearest (ties away from
+ // zero) when bit > 0, and left by -bit when bit <= 0.
+ // Expected range: -63 <= bit <= 63.
+ int64_t round_shift(int64_t value, int bit) {
+   if (bit > 0) {
+     if (value < 0) {
+       // Round the magnitude and restore the sign so that negative values
+       // round symmetrically with positive ones.
+       return -round_shift(-value, bit);
+     }
+     // The rounding offset is built as a 64-bit value: the former
+     // `1 << (bit - 1)` was an int shift and overflowed for bit > 31.
+     return (value + ((int64_t)1 << (bit - 1))) >> bit;
+   }
+   // Shift through uint64_t: left-shifting a negative signed value is
+   // undefined before C++20.
+   return (int64_t)((uint64_t)value << (-bit));
+ }
+
+ // Applies round_shift(., bit) to each of the `size` elements of arr in
+ // place. A zero shift leaves the array untouched.
+ void round_shift_array(int32_t *arr, int size, int bit) {
+   if (bit == 0) return;
+   for (int32_t *p = arr; p < arr + size; p++) {
+     *p = round_shift(*p, bit);
+   }
+ }
+
+ // Clears the `visited` flag of every node in the stage_num x node_num graph.
+ void graph_reset_visited(Node *node, int stage_num, int node_num) {
+   for (int si = 0; si < stage_num; si++) {
+     Node *stage = node + get_idx(si, 0, node_num);
+     for (int ni = 0; ni < node_num; ni++) {
+       stage[ni].visited = 0;
+     }
+   }
+ }
+
+ // Fixed-point re-evaluation of node (stage_idx, node_idx): each weight is
+ // quantised to estimate_bit fractional bits, each input value is rounded to
+ // an integer, the products are summed in 64 bits and the sum is
+ // round-shifted back by estimate_bit. Stage 0 nodes are left unchanged.
+ // stage_num is not used.
+ void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+                     int node_idx, int estimate_bit) {
+   if (stage_idx <= 0) return;
+
+   Node *dst = &node[get_idx(stage_idx, node_idx, node_num)];
+   int64_t acc = 0;
+   dst->value = 0;
+   for (int k = 0; k < dst->inNodeNum; k++) {
+     const int64_t weight_q = round(dst->inWeight[k] * (1 << estimate_bit));
+     const int64_t input_q = round(dst->inNode[k]->value);
+     acc += input_q * weight_q;
+   }
+   dst->value = round_shift(acc, estimate_bit);
+ }
+
+ // Rounds the value of node (stage_idx, node_idx) to an integer and scales it
+ // by 2^amplify_bit, via round_shift with a negated shift amount.
+ // stage_num is not used.
+ void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int amplify_bit) {
+   Node *target = &node[get_idx(stage_idx, node_idx, node_num)];
+   target->value = round_shift(round(target->value), -amplify_bit);
+ }
+
+ // For every node of stage stage_idx, in index order: recompute its value in
+ // fixed point (estimate_value), then scale it (amplify_value).
+ void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+                                int stage_idx, int amplify_bit,
+                                int estimate_bit) {
+   int ni = 0;
+   while (ni < node_num) {
+     estimate_value(node, stage_num, node_num, stage_idx, ni, estimate_bit);
+     amplify_value(node, stage_num, node_num, stage_idx, ni, amplify_bit);
+     ni++;
+   }
+ }
+
+ // Initialises every node's stage/node index, zero value and empty input
+ // list. Every node past stage 0 is then wired as an identity: weight 1 from
+ // the same index in the previous stage, with a zero-weight second input.
+ void init_graph(Node *node, int stage_num, int node_num) {
+   for (int si = 0; si < stage_num; si++) {
+     for (int ni = 0; ni < node_num; ni++) {
+       Node *cur = &node[get_idx(si, ni, node_num)];
+       cur->stageIdx = si;
+       cur->nodeIdx = ni;
+       cur->value = 0;
+       cur->inNodeNum = 0;
+       if (si < 1) continue;
+       connect_node(node, stage_num, node_num, si, ni, ni, 1, ni, 0);
+     }
+   }
+ }
+
+ // Wires an N-point butterfly into stage (stage_idx + 1) starting at
+ // node_idx. Output i combines input i with its mirror input (N - 1 - i);
+ // the mirror weight is always +1. With star != 1 the own-input weight is +1
+ // in the lower half and -1 in the upper half; star == 1 flips both signs.
+ void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                  int node_idx, int N, int star) {
+   const bool flipped = (star == 1);
+   for (int i = 0; i < N; i++) {
+     const int out = node_idx + i;
+     const int mirror = node_idx + N - 1 - i;
+     const bool lower_half = i < N / 2;
+     const double self_weight = (lower_half == flipped) ? -1 : 1;
+     connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                  self_weight, mirror, 1);
+   }
+ }
+
+ // Wires a bit-reversal permutation of N nodes into stage (stage_idx + 1):
+ // input i is copied to output bitwise_reverse(i), both offset by node_idx.
+ void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                  int node_idx, int N) {
+   const int top_bit = get_max_bit(N - 1);
+   for (int i = 0; i < N; i++) {
+     const int src = node_idx + i;
+     const int dst = node_idx + bitwise_reverse(i, top_bit);
+     connect_node(node, stage_num, node_num, stage_idx + 1, dst, src, 1, src,
+                  0);
+   }
+ }
+
+ // Wires N rotation outputs into stage (stage_idx + 1) starting at node_idx.
+ // Output ni combines input ni with its mirror input (N - 1 - ni) at angle
+ // PI * ai / (4N), ai = bitwise_reverse(N + ni). Lower half uses weights
+ // (sin, cos); upper half uses (cos, -sin).
+ void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int N) {
+   const int max_bit = get_max_bit(N);
+   for (int ni = 0; ni < N; ni++) {
+     const int ai = bitwise_reverse(N + ni, max_bit);
+     const int out = node_idx + ni;
+     const int mirror = node_idx + N - ni - 1;
+     if (ni < N / 2) {
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                    sin(PI * ai / (2 * 2 * N)), mirror,
+                    cos(PI * ai / (2 * 2 * N)));
+     } else {
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                    cos(PI * ai / (2 * 2 * N)), mirror,
+                    -sin(PI * ai / (2 * 2 * N)));
+     }
+   }
+ }
+
+ // Wires N outputs into stage (stage_idx + 1) starting at node_idx. The first
+ // and last quarters are passed through unchanged; the two middle quarters
+ // combine each input with its mirror input (N - 1 - ni) using +/-cos(PI/4)
+ // weights.
+ void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int N) {
+   for (int ni = 0; ni < N; ni++) {
+     const int out = node_idx + ni;
+     const int mirror = node_idx + N - ni - 1;
+     if (ni < N / 4) {
+       // first quarter: identity
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out,
+                    0);
+     } else if (ni < N / 2) {
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                    -cos(PI / 4), mirror, cos(-PI / 4));
+     } else if (ni < N * 3 / 4) {
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+                    cos(-PI / 4), mirror, cos(PI / 4));
+     } else {
+       // last quarter: identity
+       connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out,
+                    0);
+     }
+   }
+ }
+
+ // Wires N outputs into stage (stage_idx + 1) starting at node_idx.
+ // The N nodes are processed in groups of N_over_i = 2 << (idx / 2) nodes.
+ // Each group is split into four quarters: the outer two are passed through
+ // unchanged and the inner two combine each input with its mirror input
+ // (N - 1 - position) using cos/sin weights at angle kj * PI / i, where kj is
+ // a bit-reversed per-group index. Groups in the lower half of the N nodes
+ // and groups in the upper half use different weight signs.
+ void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N) {
+ // TODO(angiebird): Simplify and clarify this function
+
+ int i = 2 * N / (1 << (idx / 2));
+ int max_bit =
+ get_max_bit(i / 2) - 1; // the max_bit counts on i/2 instead of N here
+ int N_over_i = 2 << (idx / 2);
+
+ // groups in the lower half [0, N/2)
+ for (int nj = 0; nj < N / 2; nj += N_over_i) {
+ int j = nj / (N_over_i);
+ int kj = bitwise_reverse(i / 4 + j, max_bit);
+ // printf("kj = %d\n", kj);
+
+ // I_N/2i --- 0
+ int offset = nj;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+
+ // -C_Kj/i --- S_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = -cos(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = sin(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // -S_kj/i --- -C_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = -sin(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = -cos(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // I_N/2i --- 0
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+ }
+
+ // groups in the upper half [N/2, N): same layout, different weight signs
+ for (int nj = N / 2; nj < N; nj += N_over_i) {
+ int j = nj / N_over_i;
+ int kj = bitwise_reverse(i / 4 + j, max_bit);
+
+ // I_N/2i --- 0
+ int offset = nj;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+
+ // C_kj/i --- -S_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = cos(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = -sin(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // S_kj/i --- C_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = sin(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = cos(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // I_N/2i --- 0
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+ }
+ }
+
+ // Tiles the N nodes starting at node_idx with butterflies of width
+ // 2^((idx + 1) / 2). Consecutive blocks alternate between the plain
+ // (star = 0) and sign-flipped (star = 1) butterfly, starting with plain.
+ void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int idx, int N) {
+   const int block_size = 1 << ((idx + 1) / 2);
+   int block = 0;
+   for (int start = 0; start < N; start += block_size, block++) {
+     gen_B_graph(node, stage_num, node_num, stage_idx, node_idx + start,
+                 block_size, block % 2);
+   }
+ }
+
+ // Wires the multi-stage R block for N nodes starting at node_idx. It spans
+ // max_idx = 2 * (get_max_bit(N) + 1) - 3 stages; sub-stage idx is placed at
+ // stage (stage_idx + max_idx - idx - 1), so idx 0 lands on the last stage.
+ //   idx == 0           -> type 1
+ //   idx == max_idx - 1 -> type 2
+ //   other odd idx      -> type 4
+ //   other even idx     -> type 3
+ void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                  int node_idx, int N) {
+   int max_idx = 2 * (get_max_bit(N) + 1) - 3;
+   for (int idx = 0; idx < max_idx; idx++) {
+     int s = stage_idx + max_idx - idx - 1;
+     if (idx == 0) {
+       // type 1
+       gen_type1_graph(node, stage_num, node_num, s, node_idx, N);
+     } else if (idx == max_idx - 1) {
+       // type 2
+       gen_type2_graph(node, stage_num, node_num, s, node_idx, N);
+     } else if ((idx + 1) % 2 == 0) {
+       // type 4
+       gen_type4_graph(node, stage_num, node_num, s, node_idx, idx, N);
+     } else {
+       // type 3. (idx + 1) % 2 is 1 here because idx >= 0, so the former
+       // trailing "check gen_R_graph()" branch could never run and was
+       // removed.
+       gen_type3_graph(node, stage_num, node_num, s, node_idx, idx, N);
+     }
+   }
+ }
+
+ // Recursively wires an N-point DCT starting at (stage_idx, node_idx).
+ // For N <= 2 a 2-point stage with +/-cos(PI/4) weights is emitted.
+ // Otherwise a butterfly is followed by an N/2-point DCT on the lower half and
+ // an R block on the upper half, both starting one stage later.
+ void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int N) {
+   if (N <= 2) {
+     // base case: dct_2
+     const int first = node_idx;
+     const int second = node_idx + 1;
+     connect_node(node, stage_num, node_num, stage_idx + 1, first, first,
+                  cos(PI / 4), second, cos(PI / 4));
+     connect_node(node, stage_num, node_num, stage_idx + 1, second, second,
+                  -cos(PI / 4), first, cos(PI / 4));
+     return;
+   }
+
+   const int half = N / 2;
+   gen_B_graph(node, stage_num, node_num, stage_idx, node_idx, N, 0);
+   gen_DCT_graph(node, stage_num, node_num, stage_idx + 1, node_idx, half);
+   gen_R_graph(node, stage_num, node_num, stage_idx + 1, node_idx + half,
+               half);
+ }
+
+ // Returns twice the highest-set-bit position of `size` (6 for size 8).
+ int get_dct_stage_num(int size) {
+   const int top_bit = get_max_bit(size);
+   return 2 * top_bit;
+ }
+
+ // Wires a full 1-D DCT of dct_node_num points: the recursive DCT network
+ // followed by a bit-reversal permutation, which is written to stage
+ // (stage_idx + dct_stage_num - 1).
+ void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int dct_node_num) {
+   gen_DCT_graph(node, stage_num, node_num, stage_idx, node_idx, dct_node_num);
+
+   const int permute_from =
+       stage_idx + get_dct_stage_num(dct_node_num) - 2;
+   gen_P_graph(node, stage_num, node_num, permute_from, node_idx,
+               dct_node_num);
+ }
+
+ // Wires one ADST butterfly of width 2^(adst_idx + 1) into stage
+ // (stage_idx + 1) starting at node_idx. Lower-half outputs add the input
+ // half a block above (weights +1, +1); upper-half outputs use the input half
+ // a block below with weights (-1 own, +1 other).
+ void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_idx) {
+   const int size = 1 << (adst_idx + 1);
+   const int half = size / 2;
+   for (int ni = 0; ni < size; ni++) {
+     const int nOut = node_idx + ni;
+     if (ni < half) {
+       connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, 1,
+                    nOut + half, 1);
+     } else {
+       connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, -1,
+                    nOut - half, 1);
+     }
+   }
+ }
+
+ // Tiles adst_node_num nodes starting at node_idx with ADST butterflies of
+ // width 2^(adst_idx + 1).
+ void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_idx, int adst_node_num) {
+   const int block = 1 << (adst_idx + 1);
+   int start = 0;
+   while (start < adst_node_num) {
+     gen_adst_B_graph(node, stage_num, node_num, stage_idx, node_idx + start,
+                      adst_idx);
+     start += block;
+   }
+ }
+
+ // Wires one 2-point rotation at angle freq * PI into stage (stage_idx + 1):
+ //   out[node_idx]     =  cos * in[node_idx]     + sin * in[node_idx + 1]
+ //   out[node_idx + 1] = -cos * in[node_idx + 1] + sin * in[node_idx]
+ void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, double freq) {
+   const int lo = node_idx;
+   const int hi = node_idx + 1;
+   connect_node(node, stage_num, node_num, stage_idx + 1, lo, lo,
+                cos(freq * PI), hi, sin(freq * PI));
+   connect_node(node, stage_num, node_num, stage_idx + 1, hi, hi,
+                -cos(freq * PI), lo, sin(freq * PI));
+ }
+
+ // Wires 2^adst_idx / 2 consecutive 2-point rotations starting at node_idx.
+ // Rotation i uses frequency (1 + 4i) / 2^(adst_idx + 1).
+ void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_idx) {
+   const int pair_count = (1 << (adst_idx)) / 2;
+   for (int i = 0; i < pair_count; i++) {
+     const double fi = (1 + 4 * i) * 1.0 / (1 << (adst_idx + 1));
+     gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + i * 2,
+                      fi);
+   }
+ }
+
+ // Splits adst_node_num nodes into blocks of 2^adst_idx and applies the E
+ // rotations to every odd-numbered block; even blocks are not touched here.
+ void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_idx, int adst_node_num) {
+   const int block = 1 << (adst_idx);
+   const int block_count = adst_node_num / block;
+   for (int i = 1; i < block_count; i += 2) {
+     gen_adst_E_graph(node, stage_num, node_num, stage_idx,
+                      node_idx + i * block, adst_idx);
+   }
+ }
+ // Wires adst_node_num / 2 consecutive 2-point rotations starting at
+ // node_idx. Rotation i uses frequency (1 + 4i) / (4 * adst_node_num).
+ void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                        int node_idx, int adst_node_num) {
+   const int pair_count = adst_node_num / 2;
+   for (int i = 0; i < pair_count; i++) {
+     const double fi = (1 + 4 * i) * 1.0 / (4 * adst_node_num);
+     gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + i * 2,
+                      fi);
+   }
+ }
+ // Wires a permutation into stage (stage_idx + 1): even positions keep their
+ // input, odd position ni takes input (adst_node_num - ni).
+ // Example for adst_node_num = 8:
+ //   0 1 2 3 4 5 6 7  -->  0 7 2 5 4 3 6 1
+ void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num) {
+   for (int ni = 0; ni < adst_node_num; ni++) {
+     const int out = node_idx + ni;
+     const int in = (ni % 2 == 0) ? out : node_idx + adst_node_num - ni;
+     connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+   }
+ }
+ // Wires an order reversal into stage (stage_idx + 1):
+ //   0 1 2 3 --> 3 2 1 0
+ void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                          int node_idx, int adst_node_num) {
+   const int last = node_idx + adst_node_num - 1;
+   for (int ni = 0; ni < adst_node_num; ni++) {
+     const int in = last - ni;
+     connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + ni, in,
+                  1, in, 0);
+   }
+ }
+
+ // Input index feeding output `out` of the Q permutation: even outputs map to
+ // themselves, all others to (adst_node_num - out).
+ int get_Q_out2in(int adst_node_num, int out) {
+   return (out % 2 == 0) ? out : adst_node_num - out;
+ }
+
+ // Input index feeding output `out` of the order-reversal permutation.
+ int get_Ibar_out2in(int adst_node_num, int out) {
+   const int last = adst_node_num - 1;
+   return last - out;
+ }
+
+ // Wires the composition in -> Ibar -> Q -> out as a single permutation stage:
+ // each output's Q source index is mapped back through the order reversal.
+ void gen_adst_IbarQ_graph(Node *node, int stage_num, int node_num,
+                           int stage_idx, int node_idx, int adst_node_num) {
+   for (int ni = 0; ni < adst_node_num; ni++) {
+     const int q_src = get_Q_out2in(adst_node_num, ni);
+     const int in = node_idx + get_Ibar_out2in(adst_node_num, q_src);
+     connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + ni, in,
+                  1, in, 0);
+   }
+ }
+
+// D stage: every node is passed straight through to the same position, with
+// weight +1 for even ni and -1 for odd ni. No reordering takes place.
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int adst_node_num) {
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    const int pos = node_idx + ni;
+    const double weight = (ni % 2 == 0) ? 1 : -1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, pos, pos, weight,
+                 pos, 0);
+  }
+}
+
+// Computes the source index used by the Ht / HtD stages: x is bit-reversed
+// over max_bit bits, then re-coded so that bit 0 is copied and every higher
+// bit i (up to max_bit) becomes the XOR of bits i and i - 1 of the reversed
+// value (the original code labels this step "gray code").
+int get_hadamard_idx(int x, int adst_node_num) {
+  int max_bit = get_max_bit(adst_node_num - 1);
+  x = bitwise_reverse(x, max_bit);
+
+  int prev_bit = x & 1;
+  int y = prev_bit;
+  for (int bit = 1; bit <= max_bit; bit++) {
+    const int cur_bit = (x >> bit) & 1;
+    y += (cur_bit ^ prev_bit) << bit;
+    prev_bit = cur_bit;
+  }
+  return y;
+}
+
+// Ht stage: a pure permutation in which output ni reads, with weight 1, the
+// input selected by get_hadamard_idx(ni, adst_node_num).
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                       int node_idx, int adst_node_num) {
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    const int src = node_idx + get_hadamard_idx(ni, adst_node_num);
+    connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + ni, src,
+                 1, src, 0);
+  }
+}
+
+// Combined Ht + D stage: output ni reads the input selected by
+// get_hadamard_idx(ni, adst_node_num), with weight +1 for even ni and -1 for
+// odd ni.
+void gen_adst_HtD_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                        int node_idx, int adst_node_num) {
+  for (int ni = 0; ni < adst_node_num; ni++) {
+    const int src = node_idx + get_hadamard_idx(ni, adst_node_num);
+    const double weight = (ni % 2 == 0) ? 1 : -1;
+    connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + ni, src,
+                 weight, src, 0);
+  }
+}
+
+// Number of stages in an ADST graph of adst_node_num nodes:
+// 2 * get_max_bit(adst_node_num) + 2, which is also the value returned by
+// gen_iadst_graph().
+int get_adst_stage_num(int adst_node_num) {
+  const unsigned int bits = get_max_bit(adst_node_num);
+  return 2 * bits + 2;
+}
+
+// Builds the inverse-ADST graph starting at (stage_idx, node_idx).
+// The stages are laid out in this order:
+//   IbarQ, VJ, then (U, V) for adst_idx = max_bit - 1 down to 1, then HtD.
+// Returns the number of generator calls made plus one.
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                    int node_idx, int adst_node_num) {
+  const int max_bit = get_max_bit(adst_node_num);
+  int cur_stage = stage_idx;  // stage handed to the next generator call
+
+  gen_adst_IbarQ_graph(node, stage_num, node_num, cur_stage++, node_idx,
+                       adst_node_num);
+  gen_adst_VJ_graph(node, stage_num, node_num, cur_stage++, node_idx,
+                    adst_node_num);
+  for (int adst_idx = max_bit - 1; adst_idx > 0; --adst_idx) {
+    gen_adst_U_graph(node, stage_num, node_num, cur_stage++, node_idx,
+                     adst_idx, adst_node_num);
+    gen_adst_V_graph(node, stage_num, node_num, cur_stage++, node_idx,
+                     adst_idx, adst_node_num);
+  }
+  gen_adst_HtD_graph(node, stage_num, node_num, cur_stage++, node_idx,
+                     adst_node_num);
+
+  return (cur_stage - stage_idx) + 1;
+}
+
+// Builds the forward ADST graph at (stage_idx, node_idx) of `node`.
+// The inverse ADST is first generated into a scratch graph, and gen_inv_graph
+// then writes its inverse into `node`. Returns the value reported by
+// gen_iadst_graph() for the scratch graph.
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+                   int node_idx, int adst_node_num) {
+  const int scratch_stage_num = get_hybrid_stage_num(TYPE_ADST, adst_node_num);
+
+  // Scratch graph holding the inverse ADST, anchored at stage 0 / node 0.
+  Node *scratch = new Node[scratch_stage_num * adst_node_num];
+  init_graph(scratch, scratch_stage_num, adst_node_num);
+  const int result = gen_iadst_graph(scratch, scratch_stage_num, adst_node_num,
+                                     0, 0, adst_node_num);
+
+  // Write the inverse of the scratch graph into node[stage_idx][node_idx].
+  gen_inv_graph(scratch, scratch_stage_num, adst_node_num, node, stage_num,
+                node_num, stage_idx, node_idx);
+  delete[] scratch;
+  return result;
+}
+
+// Connects stage_idx to stage_idx + 1 with a transpose of a square
+// dct_node_num x dct_node_num block: the node at (row, col) of the input
+// stage feeds the node at (col, row) of the output stage with weight 1.
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num) {
+  const int out_stage = stage_idx + 1;
+  for (int row = 0; row < dct_node_num; row++) {
+    for (int col = 0; col < dct_node_num; col++) {
+      const int src = node_idx + row * dct_node_num + col;
+      const int dst = node_idx + col * dct_node_num + row;
+      connect_node(node, stage_num, node_num, out_stage, dst, src, 1, src, 0);
+    }
+  }
+}
+
+// Rectangular variant of connect_layer_2d. The input stage is indexed as
+// dct_node_num1 rows of dct_node_num0 nodes; the node at (row, col) feeds the
+// output node at index col * dct_node_num1 + row with weight 1.
+void connect_layer_2d_new(Node *node, int stage_num, int node_num,
+                          int stage_idx, int node_idx, int dct_node_num0,
+                          int dct_node_num1) {
+  const int out_stage = stage_idx + 1;
+  for (int row = 0; row < dct_node_num1; row++) {
+    for (int col = 0; col < dct_node_num0; col++) {
+      const int src = node_idx + row * dct_node_num0 + col;
+      const int dst = node_idx + col * dct_node_num1 + row;
+      connect_node(node, stage_num, node_num, out_stage, dst, src, 1, src, 0);
+    }
+  }
+}
+
+// Builds a 2-D DCT graph from two layers of dct_node_num 1-D DCTs. The second
+// layer starts dct_stage_num stages after the first, and connect_layer_2d
+// links the last stage of the first layer to the next stage.
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                      int node_idx, int dct_node_num) {
+  const int dct_stage_num = get_dct_stage_num(dct_node_num);
+  const int second_layer_stage = stage_idx + dct_stage_num;
+
+  for (int row = 0; row < dct_node_num; row++) {
+    const int row_base = node_idx + row * dct_node_num;
+    gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, row_base,
+                     dct_node_num);
+    gen_DCT_graph_1d(node, stage_num, node_num, second_layer_stage, row_base,
+                     dct_node_num);
+  }
+
+  // Join the last stage of layer one to the following stage.
+  connect_layer_2d(node, stage_num, node_num, second_layer_stage - 1, node_idx,
+                   dct_node_num);
+}
+
+// Returns the stage count of a 1-D transform of the given type and size.
+// Forward and inverse variants share a count; unknown types yield 0.
+int get_hybrid_stage_num(int type, int hybrid_node_num) {
+  switch (type) {
+    case TYPE_DCT:
+    case TYPE_IDCT: return get_dct_stage_num(hybrid_node_num);
+    case TYPE_ADST:
+    case TYPE_IADST: return get_adst_stage_num(hybrid_node_num);
+    default: return 0;
+  }
+}
+
+// Stage count of a square 2-D hybrid transform: the sum of the stage counts
+// of the two 1-D transforms (type0 and type1), both of size hybrid_node_num.
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num) {
+  const int first = get_hybrid_stage_num(type0, hybrid_node_num);
+  const int second = get_hybrid_stage_num(type1, hybrid_node_num);
+  return first + second;
+}
+
+// Stage count of a rectangular 2-D hybrid transform: the stage count of the
+// type0 transform of size hybrid_node_num0 plus that of the type1 transform
+// of size hybrid_node_num1.
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+                                int hybrid_node_num1) {
+  const int first = get_hybrid_stage_num(type0, hybrid_node_num0);
+  const int second = get_hybrid_stage_num(type1, hybrid_node_num1);
+  return first + second;
+}
+
+// Returns get_max_bit(hybrid_node_num) - 1. The `type` argument is accepted
+// but does not influence the result.
+int get_hybrid_amplify_factor(int type, int hybrid_node_num) {
+  const unsigned int bits = get_max_bit(hybrid_node_num);
+  return bits - 1;
+}
+
+// Generates a 1-D transform graph of the requested type at
+// (stage_idx, node_idx). Forward types are generated in place. Inverse types
+// are produced by building the matching forward graph in a scratch buffer and
+// writing its inverse into `node` via gen_inv_graph. Unknown types are
+// ignored.
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type) {
+  switch (type) {
+    case TYPE_DCT:
+      gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, node_idx,
+                       hybrid_node_num);
+      break;
+    case TYPE_ADST:
+      gen_adst_graph(node, stage_num, node_num, stage_idx, node_idx,
+                     hybrid_node_num);
+      break;
+    case TYPE_IDCT: {
+      const int fwd_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+      // Scratch forward DCT anchored at stage 0 / node 0.
+      Node *fwd = new Node[fwd_stage_num * hybrid_node_num];
+      init_graph(fwd, fwd_stage_num, hybrid_node_num);
+      gen_DCT_graph_1d(fwd, fwd_stage_num, hybrid_node_num, 0, 0,
+                       hybrid_node_num);
+      // Invert the scratch graph into node[stage_idx][node_idx].
+      gen_inv_graph(fwd, fwd_stage_num, hybrid_node_num, node, stage_num,
+                    node_num, stage_idx, node_idx);
+      delete[] fwd;
+      break;
+    }
+    case TYPE_IADST: {
+      const int fwd_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+      // Scratch forward ADST anchored at stage 0 / node 0.
+      Node *fwd = new Node[fwd_stage_num * hybrid_node_num];
+      init_graph(fwd, fwd_stage_num, hybrid_node_num);
+      gen_adst_graph(fwd, fwd_stage_num, hybrid_node_num, 0, 0,
+                     hybrid_node_num);
+      // Invert the scratch graph into node[stage_idx][node_idx].
+      gen_inv_graph(fwd, fwd_stage_num, hybrid_node_num, node, stage_num,
+                    node_num, stage_idx, node_idx);
+      delete[] fwd;
+      break;
+    }
+    default: break;
+  }
+}
+
+// Builds a square 2-D hybrid transform: hybrid_node_num 1-D transforms of
+// type0 form the first layer, and hybrid_node_num 1-D transforms of type1
+// form the second layer, which starts one type0 stage count later.
+// connect_layer_2d links the last first-layer stage to the next stage.
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+                         int node_idx, int hybrid_node_num, int type0,
+                         int type1) {
+  const int first_layer_stages = get_hybrid_stage_num(type0, hybrid_node_num);
+  const int second_layer_stage = stage_idx + first_layer_stages;
+
+  for (int row = 0; row < hybrid_node_num; row++) {
+    const int row_base = node_idx + row * hybrid_node_num;
+    gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx, row_base,
+                        hybrid_node_num, type0);
+    gen_hybrid_graph_1d(node, stage_num, node_num, second_layer_stage,
+                        row_base, hybrid_node_num, type1);
+  }
+
+  // Join the last stage of layer one to the following stage.
+  connect_layer_2d(node, stage_num, node_num, second_layer_stage - 1, node_idx,
+                   hybrid_node_num);
+}
+
+// Rectangular 2-D hybrid transform. The first layer consists of
+// hybrid_node_num1 transforms of type0, each hybrid_node_num0 nodes wide. The
+// second layer consists of hybrid_node_num0 transforms of type1, each
+// hybrid_node_num1 nodes wide, starting one type0 stage count later.
+// connect_layer_2d_new links the last first-layer stage to the next stage.
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+                             int stage_idx, int node_idx, int hybrid_node_num0,
+                             int hybrid_node_num1, int type0, int type1) {
+  const int first_layer_stages = get_hybrid_stage_num(type0, hybrid_node_num0);
+  const int second_layer_stage = stage_idx + first_layer_stages;
+
+  // First layer: one type0 transform per group of hybrid_node_num0 nodes.
+  for (int group = 0; group < hybrid_node_num1; group++) {
+    gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx,
+                        node_idx + group * hybrid_node_num0, hybrid_node_num0,
+                        type0);
+  }
+  // Second layer: one type1 transform per group of hybrid_node_num1 nodes.
+  for (int group = 0; group < hybrid_node_num0; group++) {
+    gen_hybrid_graph_1d(node, stage_num, node_num, second_layer_stage,
+                        node_idx + group * hybrid_node_num1, hybrid_node_num1,
+                        type1);
+  }
+
+  // Join the last stage of layer one to the following stage.
+  connect_layer_2d_new(node, stage_num, node_num, second_layer_stage - 1,
+                       node_idx, hybrid_node_num0, hybrid_node_num1);
+}
+
+// Writes the inverse of `node` (stage_num x node_num) into `invNode`
+// (inv_stage_num x inv_node_num) at offset (inv_stage_idx, inv_node_idx).
+// Every connection feeding stage si of `node` is re-added with its direction
+// swapped, on stage (stage_num - si) of the inverse region, keeping its
+// weight.
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+                   int inv_stage_num, int inv_node_num, int inv_stage_idx,
+                   int inv_node_idx) {
+  // add_node appends inputs, so clear the input counts of the target region
+  // (every stage after the first) before rebuilding it.
+  const int stage_end = inv_stage_idx + stage_num;
+  const int node_end = inv_node_idx + node_num;
+  for (int s = inv_stage_idx + 1; s < stage_end; s++) {
+    for (int n = inv_node_idx; n < node_end; n++) {
+      invNode[get_idx(s, n, inv_node_num)].inNodeNum = 0;
+    }
+  }
+
+  // Walk the source graph and add each connection with its direction swapped.
+  for (int si = 1; si < stage_num; si++) {
+    const int inv_stage = (stage_num - si) + inv_stage_idx;
+    for (int ni = 0; ni < node_num; ni++) {
+      Node &src = node[get_idx(si, ni, node_num)];
+      for (int k = 0; k < src.inNodeNum; k++) {
+        add_node(invNode, inv_stage_num, inv_node_num, inv_stage,
+                 src.inNodeIdx[k] + inv_node_idx, ni + inv_node_idx,
+                 src.inWeight[k]);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.h b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
new file mode 100644
index 0000000000..8dc36146dd
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
+#define AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
+
+// One vertex of a transform graph. A node records up to two incoming
+// connections, each with a source index and a weight.
+struct Node {
+ // Pointers to the (at most two) input nodes.
+ Node *inNode[2];
+ // Number of valid entries in the in* arrays; gen_inv_graph() iterates
+ // k < inNodeNum when reading inNodeIdx[k] and inWeight[k].
+ int inNodeNum;
+ // Node index of each input.
+ int inNodeIdx[2];
+ // Weight applied to each input.
+ double inWeight[2];
+ // NOTE(review): presumably the value computed for this node when the graph
+ // is propagated -- confirm against propagate().
+ double value;
+ // NOTE(review): presumably this node's own (stage, node) coordinates in the
+ // graph -- confirm against init_graph().
+ int nodeIdx;
+ int stageIdx;
+ // NOTE(review): presumably a traversal marker cleared by
+ // graph_reset_visited() -- confirm.
+ int visited;
+};
+
+// NOTE(review): the users of these constants are not visible from this
+// header. The names suggest default graph dimensions (stages x nodes) and a
+// cosine table modulus -- confirm against the callers before relying on that.
+#define STAGENUM (10)
+#define NODENUM (32)
+#define COS_MOD (128)
+
+// 1-D transform kinds understood by the graph generators. The "I" variants
+// are the inverses: gen_hybrid_graph_1d() builds them by generating the
+// corresponding forward graph and inverting it.
+typedef enum {
+ TYPE_DCT = 0,
+ TYPE_ADST,
+ TYPE_IDCT,
+ TYPE_IADST,
+ // NOTE(review): appears to be an end marker / count of the types above
+ // rather than a real transform type -- confirm.
+ TYPE_LAST
+} TYPE_TXFM;
+
+TYPE_TXFM get_inv_type(TYPE_TXFM type);
+void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+ const int txfm_size);
+
+void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+ const TYPE_TXFM type, const int txfm_size);
+void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+ const TYPE_TXFM type1, const int txfm_size0,
+ const int txfm_size1);
+unsigned int get_max_bit(unsigned int x);
+unsigned int bitwise_reverse(unsigned int x, int max_bit);
+int get_idx(int ri, int ci, int cSize);
+
+int get_dct_stage_num(int size);
+void reference_dct_1d(double *in, double *out, int size);
+void reference_dct_2d(double *in, double *out, int size);
+void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int in0, double w0, int in1, double w1);
+void propagate(Node *node, int stage_num, int node_num, int stage);
+void init_graph(Node *node, int stage_num, int node_num);
+void graph_reset_visited(Node *node, int stage_num, int node_num);
+void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N, int star);
+void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N);
+void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N);
+
+void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+
+void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx);
+
+void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num);
+void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, double freq);
+
+void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx);
+
+void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num);
+
+void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+int get_hadamard_idx(int x, int adst_node_num);
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void reference_adst_1d(double *in, double *out, int size);
+
+int get_adst_stage_num(int adst_node_num);
+int get_hybrid_stage_num(int type, int hybrid_node_num);
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num);
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+ int hybrid_node_num1);
+int get_hybrid_amplify_factor(int type, int hybrid_node_num);
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type);
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type0,
+ int type1);
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+ int stage_idx, int node_idx, int hybrid_node_num0,
+ int hybrid_node_num1, int type0, int type1);
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+ int type1);
+
+void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+ int type0, int type1);
+void reference_adst_dct_2d(double *in, double *out, int size);
+
+void gen_code(Node *node, int stage_num, int node_num, TYPE_TXFM type);
+
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+ int inv_stage_num, int inv_node_num, int inv_stage_idx,
+ int inv_node_idx);
+
+TYPE_TXFM hybrid_char_to_int(char ctype);
+
+// NOTE(review): int64_t / int32_t are used below, but this header includes
+// nothing itself, so it relies on the including file having pulled in
+// <stdint.h> (or <cstdint>) first. Consider including it here so the header
+// is self-contained.
+int64_t round_shift(int64_t value, int bit);
+void round_shift_array(int32_t *arr, int size, int bit);
+void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int estimate_bit);
+void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int estimate_bit);
+// NOTE(review): "amlify" looks like a misspelling of "amplify". Renaming would
+// have to be done together with the definition and every caller.
+void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+ int stage_idx, int amplify_bit,
+ int estimate_bit);
+#endif // AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
diff --git a/third_party/aom/tools/wrap-commit-msg.py b/third_party/aom/tools/wrap-commit-msg.py
new file mode 100755
index 0000000000..c51ed093d3
--- /dev/null
+++ b/third_party/aom/tools/wrap-commit-msg.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Wraps paragraphs of text, preserving manual formatting
+
+This is like fold(1), but has the special convention of not modifying lines
+that start with whitespace. This allows you to intersperse blocks with
+special formatting, like code blocks, with written prose. The prose will
+be wordwrapped, and the manual formatting will be preserved.
+
+ * This won't handle the case of a bulleted (or ordered) list specially, so
+ manual wrapping must be done.
+
+Occasionally it's useful to put something with explicit formatting that
+doesn't look at all like a block of text inline.
+
+ indicator = has_leading_whitespace(line);
+ if (indicator)
+ preserve_formatting(line);
+
+The intent is that this docstring would make it through the transform
+and still be legible and presented as it is in the source. If additional
+cases are handled, update this doc to describe the effect.
+"""
+
+__author__ = "jkoleszar@google.com"
+import textwrap
+import sys
+
+def wrap(text):
+ if text:
+ return textwrap.fill(text, break_long_words=False) + '\n'
+ return ""
+
+
+def main(fileobj):
+ text = ""
+ output = ""
+ while True:
+ line = fileobj.readline()
+ if not line:
+ break
+
+ if line.lstrip() == line:
+ text += line
+ else:
+ output += wrap(text)
+ text=""
+ output += line
+ output += wrap(text)
+
+ # Replace the file or write to stdout.
+ if fileobj == sys.stdin:
+ fileobj = sys.stdout
+ else:
+ fileobj.seek(0)
+ fileobj.truncate(0)
+ fileobj.write(output)
+
+if __name__ == "__main__":
+ if len(sys.argv) > 1:
+ main(open(sys.argv[1], "r+"))
+ else:
+ main(sys.stdin)
diff --git a/third_party/aom/usage.dox b/third_party/aom/usage.dox
new file mode 100644
index 0000000000..4004f4af5f
--- /dev/null
+++ b/third_party/aom/usage.dox
@@ -0,0 +1,109 @@
+/*!\page usage Usage
+
+ The aom multi-format codec SDK provides a unified interface amongst its
+ supported codecs. This abstraction allows applications using this SDK to
+ easily support multiple video formats with minimal code duplication or
+ "special casing." This section describes the interface common to all codecs.
+ For codec-specific details, see the \ref codecs page.
+
+ The following sections are common to all codecs:
+ - \ref usage_types
+ - \ref usage_features
+ - \ref usage_init
+ - \ref usage_errors
+
+ For more information on decoder and encoder specific usage, see the
+ following pages:
+ \if decoder
+ \li \subpage usage_decode
+ \endif
+ \if encoder
+ \li \subpage usage_encode
+ \endif
+
+ \section usage_types Important Data Types
+ There are two important data structures to consider in this interface.
+
+ \subsection usage_ctxs Contexts
+ A context is a storage area allocated by the calling application that the
+ codec may write into to store details about a single instance of that codec.
+ Most of the context is implementation specific, and thus opaque to the
+ application. The context structure as seen by the application is of fixed
+ size, and thus can be allocated with automatic storage or dynamically
+ on the heap.
+
+ Most operations require an initialized codec context. Codec context
+ instances are codec specific. That is, the codec to be used for the encoded
+ video must be known at initialization time. See #aom_codec_ctx_t for further
+ information.
+
+ \subsection usage_ifaces Interfaces
+ A codec interface is an opaque structure that controls how function calls
+ into the generic interface are dispatched to their codec-specific
+ implementations. Applications \ref MUSTNOT attempt to examine or override
+ this storage, as it contains internal implementation details likely to
+ change from release to release.
+
+ Each supported codec will expose an interface structure to the application
+ as an <code>extern</code> reference to a structure of the incomplete type
+ #aom_codec_iface_t.
+
+ \section usage_features Features
+ Several "features" are defined that are optionally implemented by codec
+ algorithms. Indeed, the same algorithm may support different features on
+ different platforms. The purpose of defining these features is that when
+ they are implemented, they conform to a common interface. The features, or
+ capabilities, of an algorithm can be queried from its interface by using
+ the aom_codec_get_caps() method. Attempts to invoke features not supported
+ by an algorithm will generally result in #AOM_CODEC_INCAPABLE.
+
+ \if decoder
+ Currently defined decoder features include:
+ \endif
+
+ \section usage_init Initialization
+ To initialize a codec instance, the address of the codec context
+ and interface structures are passed to an initialization function. Depending
+ on the \ref usage_features that the codec supports, the codec could be
+ initialized in different modes.
+
+ To prevent cases of confusion where the ABI of the library changes,
+ the ABI is versioned. The ABI version number must be passed at
+ initialization time to ensure the application is using a header file that
+ matches the library. The current ABI version number is stored in the
+ preprocessor macros #AOM_CODEC_ABI_VERSION, #AOM_ENCODER_ABI_VERSION, and
+ #AOM_DECODER_ABI_VERSION. For convenience, each initialization function has
+ a wrapper macro that inserts the correct version number. These macros are
+ named like the initialization methods, but without the _ver suffix.
+
+
+ The available initialization methods are:
+ \if encoder
+ \li #aom_codec_enc_init (calls aom_codec_enc_init_ver())
+ \endif
+ \if decoder
+ \li #aom_codec_dec_init (calls aom_codec_dec_init_ver())
+ \endif
+
+
+ \section usage_errors Error Handling
+ Almost all codec functions return an error status of type #aom_codec_err_t.
+ The semantics of how each error condition should be processed is clearly
+ defined in the definitions of each enumerated value. Error values can be
+ converted into ASCII strings with the aom_codec_error() and
+ aom_codec_err_to_string() methods. The difference between these two methods is
+ that aom_codec_error() returns the error state from an initialized context,
+ whereas aom_codec_err_to_string() can be used in cases where an error occurs
+ outside any context. The enumerated value returned from the last call can be
+ retrieved from the <code>err</code> member of the decoder context as well.
+ Finally, more detailed error information may be obtainable by using
+ the aom_codec_error_detail() method. Not all errors produce detailed error
+ information.
+
+ In addition to error information, the codec library's build configuration
+ is available at runtime on some platforms. This information can be returned
+ by calling aom_codec_build_config(), and is formatted as a base64 coded string
+ (composed of characters in the set [a-zA-Z0-9+/]). This information is not
+ useful to an application at runtime, but may be of use to aom for support.
+
+*/
diff --git a/third_party/aom/usage_cx.dox b/third_party/aom/usage_cx.dox
new file mode 100644
index 0000000000..51b4e8e3e2
--- /dev/null
+++ b/third_party/aom/usage_cx.dox
@@ -0,0 +1,9 @@
+/*! \page usage_encode Encoding
+
+ The aom_codec_encode() function is at the core of the encode loop. It
+ processes raw images passed by the application, producing packets of
+ compressed data.
+
+ \ref samples
+
+*/
diff --git a/third_party/aom/usage_dx.dox b/third_party/aom/usage_dx.dox
new file mode 100644
index 0000000000..76dc213bf0
--- /dev/null
+++ b/third_party/aom/usage_dx.dox
@@ -0,0 +1,22 @@
+/*! \page usage_decode Decoding
+
+ The aom_codec_decode() function is at the core of the decode loop. It
+ processes packets of compressed data passed by the application, producing
+ decoded images. The decoder expects packets to comprise exactly one image
+ frame of data. Packets \ref MUST be passed in decode order. If the
+ application wishes to associate some data with the frame, the
+ <code>user_priv</code> member may be set.
+
+ \ref samples
+
+
+ \section usage_frame_iter Frame Iterator Based Decoding
+ Decoded frames are made available to the application
+ through the aom_codec_get_frame() iterator. The application initializes the
+ iterator storage (of type #aom_codec_iter_t) to NULL, then calls
+ aom_codec_get_frame() repeatedly until it returns NULL, indicating that all
+ images have been returned. This process may result in zero, one, or many
+ frames that are ready for display, depending on the codec.
+
+
+*/